From d679c2e1e8d96f71f85e2ef3877407d264212cc3 Mon Sep 17 00:00:00 2001
From: Eugen Hristev
Date: Fri, 17 Jan 2025 10:42:28 +0200
Subject: [PATCH 0001/3206] pstore/zone: rewrite some comments for better
 understanding

Rewrite some comments to make them clearer and easier to understand;
fix typos.

Signed-off-by: Eugen Hristev
Link: https://lore.kernel.org/r/20241224154405.295840-1-eugen.hristev@linaro.org
Link: https://lore.kernel.org/r/20250117084228.3218024-1-eugen.hristev@linaro.org
Signed-off-by: Kees Cook
---
 fs/pstore/zone.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index ceb5639a062960..eb61ba5bb96422 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -43,7 +43,7 @@ struct psz_buffer {
  *
  * @magic: magic num for kmsg dump header
  * @time: kmsg dump trigger time
- * @compressed: whether conpressed
+ * @compressed: whether compressed
  * @counter: kmsg dump counter
  * @reason: the kmsg dump reason (e.g. oops, panic, etc)
  * @data: pointer to log data
@@ -214,7 +214,7 @@ static int psz_zone_write(struct pstore_zone *zone,
 		atomic_set(&zone->buffer->datalen, wlen + off);
 	}
 
-	/* avoid to damage old records */
+	/* avoid damaging old records */
 	if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered))
 		goto dirty;
 
@@ -249,7 +249,7 @@ static int psz_zone_write(struct pstore_zone *zone,
 	return 0;
 dirty:
-	/* no need to mark dirty if going to try next zone */
+	/* no need to mark it dirty if going to try next zone */
 	if (wcnt == -ENOMSG)
 		return -ENOMSG;
 	atomic_set(&zone->dirty, true);
@@ -378,7 +378,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt)
 	struct timespec64 time = { };
 	unsigned long i;
 	/*
-	 * Recover may on panic, we can't allocate any memory by kmalloc.
+	 * Recovery may happen on panic, when we can't allocate any memory by kmalloc.
 	 * So, we use local array instead.
 	 */
 	char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0};
@@ -856,11 +856,11 @@ static int notrace psz_record_write(struct pstore_zone *zone,
 	/**
 	 * psz_zone_write will set datalen as start + cnt.
-	 * It work if actual data length lesser than buffer size.
-	 * If data length greater than buffer size, pmsg will rewrite to
-	 * beginning of zone, which make buffer->datalen wrongly.
+	 * It works if the actual data length is less than the buffer size.
+	 * If the data length is greater than the buffer size, pmsg will rewrite to
+	 * the beginning of the zone, which makes buffer->datalen wrong.
 	 * So we should reset datalen as buffer size once actual data length
-	 * greater than buffer size.
+	 * is greater than buffer size.
 	 */
 	if (is_full_data) {
 		atomic_set(&zone->buffer->datalen, zone->buffer_size);
@@ -878,8 +878,9 @@ static int notrace psz_pstore_write(struct pstore_record *record)
 		atomic_set(&cxt->on_panic, 1);
 
 	/*
-	 * if on panic, do not write except panic records
-	 * Fix case that panic_write prints log which wakes up console backend.
+	 * If on panic, do not write anything except panic records.
+	 * Fix the case where panic_write prints a log that wakes up
+	 * the console backend.
 	 */
 	if (is_on_panic() && record->type != PSTORE_TYPE_DMESG)
 		return -EBUSY;

From cce436aafc2abad691fdd37de63ec8a4490b42ce Mon Sep 17 00:00:00 2001
From: Johannes Nixdorf
Date: Fri, 25 Jul 2025 18:31:18 +0200
Subject: [PATCH 0002/3206] seccomp: Fix a race with WAIT_KILLABLE_RECV if the
 tracer replies too fast

Normally the tracee starts in SECCOMP_NOTIFY_INIT, sends an event to
the tracer, and starts to wait interruptibly.
With SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV, if the tracer receives the message (SECCOMP_NOTIFY_SENT is reached) while the tracee was waiting and is subsequently interrupted, the tracee begins to wait again uninterruptibly (but killable). This fails if SECCOMP_NOTIFY_REPLIED is reached before the tracee is interrupted, as the check only considered SECCOMP_NOTIFY_SENT as a condition to begin waiting again. In this case the tracee is interrupted even though the tracer already acted on its behalf. This breaks the assumption SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV wanted to ensure, namely that the tracer can be sure the syscall is not interrupted or restarted on the tracee after it is received on the tracer. Fix this by also considering SECCOMP_NOTIFY_REPLIED when evaluating whether to switch to uninterruptible waiting. With the condition changed the loop in seccomp_do_user_notification() would exit immediately after deciding that noninterruptible waiting is required if the operation already reached SECCOMP_NOTIFY_REPLIED, skipping the code that processes pending addfd commands first. Prevent this by executing the remaining loop body one last time in this case. Fixes: c2aa2dfef243 ("seccomp: Add wait_killable semantic to seccomp user notifier") Reported-by: Ali Polatel Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220291 Signed-off-by: Johannes Nixdorf Link: https://lore.kernel.org/r/20250725-seccomp-races-v2-1-cf8b9d139596@nixdorf.dev Signed-off-by: Kees Cook --- kernel/seccomp.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 41aa761c7738ce..3bbfba30a777a1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1139,7 +1139,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn static bool should_sleep_killable(struct seccomp_filter *match, struct seccomp_knotif *n) { - return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT; + return match->wait_killable_recv && n->state >= SECCOMP_NOTIFY_SENT; } static int seccomp_do_user_notification(int this_syscall, @@ -1186,13 +1186,11 @@ static int seccomp_do_user_notification(int this_syscall, if (err != 0) { /* - * Check to see if the notifcation got picked up and - * whether we should switch to wait killable. + * Check to see whether we should switch to wait + * killable. Only return the interrupted error if not. */ - if (!wait_killable && should_sleep_killable(match, &n)) - continue; - - goto interrupted; + if (!(!wait_killable && should_sleep_killable(match, &n))) + goto interrupted; } addfd = list_first_entry_or_null(&n.addfd, From b0c9bfbab925ac6385d4d06a134fd89cadf771fe Mon Sep 17 00:00:00 2001 From: Johannes Nixdorf Date: Fri, 25 Jul 2025 18:31:19 +0200 Subject: [PATCH 0003/3206] selftests/seccomp: Add a test for the WAIT_KILLABLE_RECV fast reply race If WAIT_KILLABLE_RECV was specified, and an event is received, the tracee's syscall is not supposed to be interruptible. This was not properly ensured if the reply was sent too fast, and an interrupting signal was received before the reply was processed on the tracee side. Add a test for this, that consists of: - a tracee with a timer that keeps sending it signals while repeatedly running a traced syscall in a loop, - a tracer that repeatedly handles all syscalls from the tracee in a loop, and - a shared pipe between both, on which the tracee sends one byte per syscall attempted and the tracer reads one byte per syscall handled. 
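Roughly, the failing interleaving the test is after looks like this
(illustrative timeline, not literal code from the test):

    tracee                              tracer
    ------                              ------
    write one token to the pipe
    dup() -> seccomp notification
                                        SECCOMP_IOCTL_NOTIF_RECV
                                        reply (ADDFD with SEND)
    SIGALRM interrupts the wait
    dup() restarted (the bug)
    no second token is written
                                        next token read blocks, poll()
                                        times out -> test failure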
If the syscall for the tracee is restarted after the tracer received
the event for it due to this bug, the tracee will not have sent a
second token on the pipe, which the tracer will notice and fail the
test.

The test also uses SECCOMP_IOCTL_NOTIF_ADDFD with
SECCOMP_ADDFD_FLAG_SEND for the reply, as the fix for the bug has an
additional code path change for handling addfd, which would not be
exercised by a simple SECCOMP_IOCTL_NOTIF_SEND, and it is possible to
fix the bug while leaving the same race intact for the addfd case.

This test is not guaranteed to reproduce the bug on every run, but the
parameters (signal frequency and number of repeated syscalls) have been
chosen so that on my machine this test:
- takes ~0.8s in the good case (+1s in the failure case), and
- detects the bug in 999 of 1000 runs.

Signed-off-by: Johannes Nixdorf
Link: https://lore.kernel.org/r/20250725-seccomp-races-v2-2-cf8b9d139596@nixdorf.dev
Signed-off-by: Kees Cook
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 131 ++++++++++++++++++
 1 file changed, 131 insertions(+)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 61acbd45ffaaf8..fc4910d35342a1 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -3547,6 +3548,10 @@ static void signal_handler(int signal)
 		perror("write from signal");
 }
 
+static void signal_handler_nop(int signal)
+{
+}
+
 TEST(user_notification_signal)
 {
 	pid_t pid;
@@ -4819,6 +4824,132 @@ TEST(user_notification_wait_killable_fatal)
 	EXPECT_EQ(SIGTERM, WTERMSIG(status));
 }
 
+/* Ensure signals after the reply do not interrupt */
+TEST(user_notification_wait_killable_after_reply)
+{
+	int i, max_iter = 100000;
+	int listener, status;
+	int pipe_fds[2];
+	pid_t pid;
+	long ret;
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret)
+	{
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	listener = user_notif_syscall(
+		__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER |
+				  SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
+	ASSERT_GE(listener, 0);
+
+	/*
+	 * Used to count invocations. One token is transferred from the child
+	 * to the parent per syscall invocation, the parent tries to take
+	 * one token per successful RECV. If the syscall is restarted after
+	 * RECV the parent will try to get two tokens while the child only
+	 * provided one.
+	 */
+	ASSERT_EQ(pipe(pipe_fds), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct sigaction new_action = {
+			.sa_handler = signal_handler_nop,
+			.sa_flags = SA_RESTART,
+		};
+		struct itimerval timer = {
+			.it_value = { .tv_usec = 1000 },
+			.it_interval = { .tv_usec = 1000 },
+		};
+		char c = 'a';
+
+		close(pipe_fds[0]);
+
+		/* Set up the sigaction with SA_RESTART */
+		if (sigaction(SIGALRM, &new_action, NULL)) {
+			perror("sigaction");
+			exit(1);
+		}
+
+		/*
+		 * Kill with SIGALRM repeatedly, to try to hit the race when
+		 * handling the syscall.
+		 */
+		if (setitimer(ITIMER_REAL, &timer, NULL) < 0)
+			perror("setitimer");
+
+		for (i = 0; i < max_iter; ++i) {
+			int fd;
+
+			/* Send one token per iteration to catch repeats. */
+			if (write(pipe_fds[1], &c, sizeof(c)) != 1) {
+				perror("write");
+				exit(1);
+			}
+
+			fd = syscall(__NR_dup, 0);
+			if (fd < 0) {
+				perror("dup");
+				exit(1);
+			}
+			close(fd);
+		}
+
+		exit(0);
+	}
+
+	close(pipe_fds[1]);
+
+	for (i = 0; i < max_iter; ++i) {
+		struct seccomp_notif req = {};
+		struct seccomp_notif_addfd addfd = {};
+		struct pollfd pfd = {
+			.fd = pipe_fds[0],
+			.events = POLLIN,
+		};
+		char c;
+
+		/*
+		 * Try to receive one token. If it failed, one child syscall
+		 * was restarted after RECV and needed to be handled twice.
+		 */
+		ASSERT_EQ(poll(&pfd, 1, 1000), 1)
+			kill(pid, SIGKILL);
+
+		ASSERT_EQ(read(pipe_fds[0], &c, sizeof(c)), 1)
+			kill(pid, SIGKILL);
+
+		/*
+		 * Get the notification, reply to it as fast as possible to test
+		 * whether the child wrongly skips going into the non-preemptible
+		 * (TASK_KILLABLE) state.
+		 */
+		do
+			ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
+		while (ret < 0 && errno == ENOENT); /* Accept interruptions before RECV */
+		ASSERT_EQ(ret, 0)
+			kill(pid, SIGKILL);
+
+		addfd.id = req.id;
+		addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
+		addfd.srcfd = 0;
+		ASSERT_GE(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), 0)
+			kill(pid, SIGKILL);
+	}
+
+	/*
+	 * Wait for the process to exit, and make sure the process terminated
+	 * with a zero exit code.
+	 */
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
 struct tsync_vs_thread_leader_args {
 	pthread_t leader;
 };

From 86de56487e5f0017ffd5930b0dbd9dda43048849 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Wed, 30 Jul 2025 11:58:52 -0700
Subject: [PATCH 0004/3206] bpf: Allow syscall bpf programs to call non-recur
 helpers

Allow syscall programs to call non-recur helpers too, since syscall bpf
programs run in process context through the bpf syscall
(BPF_PROG_TEST_RUN) and cannot run recursively.

bpf_task_storage_{get,set} have "_recur" versions that call trylock
instead of taking the lock directly to avoid deadlock when called by
bpf programs that run recursively. Currently, only bpf_lsm, bpf_iter,
and struct_ops without private stack are allowed to call the non-recur
helpers, since they cannot be recursively called in another bpf
program.

Signed-off-by: Amery Hung
Reviewed-by: Emil Tsalapatis
Link: https://lore.kernel.org/r/20250730185903.3574598-2-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 94defa405c85e3..c823f8efe3edd2 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -962,6 +962,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog)
 	case BPF_PROG_TYPE_STRUCT_OPS:
 		return prog->aux->jits_use_priv_stack;
 	case BPF_PROG_TYPE_LSM:
+	case BPF_PROG_TYPE_SYSCALL:
 		return false;
 	default:
 		return true;

From 31e838e1cdf4c4088fee8154ce8c12713ebfb2da Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Wed, 30 Jul 2025 11:58:53 -0700
Subject: [PATCH 0005/3206] selftests/bpf: Introduce task local data

Task local data defines an abstract storage type for storing task-
specific data (TLD). This patch provides user space and bpf
implementations as header-only libraries for accessing task local data.

Task local data is a bpf task local storage map with two UPTRs:

- tld_meta_u, shared by all tasks of a process, consists of the total
  count and size of TLDs and an array of metadata of TLDs. A TLD's
  metadata contains its size and name.
  The name is used to identify a specific TLD in bpf programs.

- tld_data_u points to task-specific memory. It stores TLD data and
  the starting offset of that data in a page.

The task local data design decouples user space and bpf programs. Since
a bpf program does not know the size of the TLDs at compile time,
tld_data_u is declared as a page to accommodate TLDs up to a page. As a
result, while user space will likely allocate memory smaller than a
page for the actual TLDs, it needs to pin a page to the kernel. It will
pin the page that contains enough memory if the allocated memory spans
across the page boundary.

The library also creates another task local storage map, tld_key_map,
to cache keys for bpf programs to speed up the access.

Below are the core task local data API:

                   User space                          BPF
  Define TLD       TLD_DEFINE_KEY(), tld_create_key()  -
  Init TLD object  -                                   tld_object_init()
  Get TLD data     tld_get_data()                      tld_get_data()

- TLD_DEFINE_KEY(), tld_create_key()

A TLD is first defined by the user space with TLD_DEFINE_KEY() or
tld_create_key(). TLD_DEFINE_KEY() defines a TLD statically and
allocates just enough memory during initialization. tld_create_key()
allows creating TLDs on the fly, but has a fixed memory budget,
TLD_DYN_DATA_SIZE.

Internally, both call __tld_create_key(), which iterates
tld_meta_u->metadata to check if a TLD can be added. The total TLD size
needs to fit into a page (a limit of UPTR), and no two TLDs can have
the same name. If a TLD can be added, tld_meta_u->cnt is increased
using cmpxchg, as there may be other concurrent __tld_create_key()
calls. After a successful cmpxchg, the last available
tld_meta_u->metadata slot now belongs to the calling thread. To prevent
other threads from reading incomplete metadata while it is being
updated, tld_meta_u->metadata->size is used to signal completion.
Finally, the offset, derived by adding up prior TLD sizes, is
encapsulated as an opaque object key to prevent user misuse. The offset
is guaranteed to be 8-byte aligned to prevent load/store tearing and to
allow atomic operations on it.

- tld_get_data()

User space programs can pass the key to tld_get_data() to get a pointer
to the associated TLD. The pointer will remain valid for the lifetime
of the thread. tld_data_u is lazily allocated on the first call to
tld_get_data(). Before that, trying to read task local data from bpf
will result in -ENODATA during tld_object_init(). The task-specific
memory needs to be freed manually by calling tld_free() on thread exit
to prevent a memory leak, or TLD_FREE_DATA_ON_THREAD_EXIT can be used.

- tld_object_init() (BPF)

BPF programs need to call tld_object_init() before calling
tld_get_data(). This is to avoid redundant map lookups in
tld_get_data() by storing pointers to the map values on the stack. The
pointers are encapsulated as a tld_object. tld_key_map is also created
the first time tld_object_init() is called, to cache TLD keys
successfully fetched by tld_get_data(). bpf_task_storage_get(..,
F_CREATE) needs to be retried since it may fail when another thread has
already taken the percpu counter lock for the task local storage.

- tld_get_data() (BPF)

BPF programs can also get a pointer to a TLD with tld_get_data(). It
uses the cached key in tld_key_map to locate the data in
tld_data_u->data. If the cached key is not set yet (<= 0),
__tld_fetch_key() will be called to iterate tld_meta_u->metadata and
find the TLD by name. To prevent redundant string comparisons in the
future when the search fails, the tld_meta_u->cnt is stored in the
non-positive range of the key.
Next time, __tld_fetch_key() will be called only if there are new TLDs,
and the search will start from the newly added tld_meta_u->metadata
using the old tld_meta_u->cnt.

Signed-off-by: Amery Hung
Reviewed-by: Emil Tsalapatis
Link: https://lore.kernel.org/r/20250730185903.3574598-3-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov
---
 .../bpf/prog_tests/task_local_data.h          | 386 ++++++++++++++++++
 .../selftests/bpf/progs/task_local_data.bpf.h | 237 +++++++++++
 2 files changed, 623 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/task_local_data.h
 create mode 100644 tools/testing/selftests/bpf/progs/task_local_data.bpf.h

diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h
new file mode 100644
index 00000000000000..a408d10c368821
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h
@@ -0,0 +1,386 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TASK_LOCAL_DATA_H
+#define __TASK_LOCAL_DATA_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef TLD_FREE_DATA_ON_THREAD_EXIT
+#include
+#endif
+
+#include
+
+/*
+ * OPTIONS
+ *
+ * Define the option before including the header
+ *
+ * TLD_FREE_DATA_ON_THREAD_EXIT - Frees memory on thread exit automatically
+ *
+ * Thread-specific memory for storing TLDs is allocated lazily on the first call to
+ * tld_get_data(). The thread that calls it must also call tld_free() on thread exit
+ * to prevent a memory leak. Pthread will be included if the option is defined. A pthread
+ * key will be registered with a destructor that calls tld_free().
+ *
+ *
+ * TLD_DYN_DATA_SIZE - The maximum size of memory allocated for TLDs created dynamically
+ * (default: 64 bytes)
+ *
+ * A TLD can be defined statically using TLD_DEFINE_KEY() or created on the fly using
+ * tld_create_key(). As the total size of TLDs created with tld_create_key() cannot
+ * possibly be known statically, a memory area of size TLD_DYN_DATA_SIZE will be
+ * allocated for these TLDs. This additional memory is allocated for every thread that
+ * calls tld_get_data(), even if tld_create_key() is never actually called, so be mindful
+ * of potential memory wastage. Use TLD_DEFINE_KEY() whenever possible, as just enough
+ * memory will be allocated for TLDs created with it.
+ *
+ *
+ * TLD_NAME_LEN - The maximum length of the name of a TLD (default: 62)
+ *
+ * Setting TLD_NAME_LEN will affect the maximum number of TLDs a process can store,
+ * TLD_MAX_DATA_CNT.
+ *
+ *
+ * TLD_DATA_USE_ALIGNED_ALLOC - Always use aligned_alloc() instead of malloc()
+ *
+ * When allocating the memory for storing TLDs, we need to make sure there is a memory
+ * region of X bytes within a page. This is due to the limit posed by UPTR: memory
+ * pinned to the kernel cannot exceed a page nor can it cross the page boundary. The
+ * library normally calls malloc(2*X) given X bytes of total TLDs, and only uses
+ * aligned_alloc(PAGE_SIZE, X) when X >= PAGE_SIZE / 2. This is to reduce memory wastage,
+ * as not all memory allocators can use the exact amount of memory requested to fulfill
+ * aligned_alloc(). For example, some may round the size up to the alignment. Enable the
+ * option to always use aligned_alloc() if the implementation has low memory overhead.
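+ *
+ * A rough usage sketch of the user-space API (illustrative only; "priority" is a
+ * made-up TLD, and map_fd is the fd of tld_data_map from the bpf side, e.g. obtained
+ * via bpf_map__fd()):
+ *
+ *	TLD_DEFINE_KEY(prio_key, "priority", sizeof(int));
+ *
+ *	void thread_fn(int map_fd)
+ *	{
+ *		int *prio;
+ *
+ *		if (tld_key_is_err(prio_key))
+ *			return;
+ *		prio = tld_get_data(map_fd, prio_key);
+ *		if (prio)
+ *			*prio = 10;
+ *	}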
+ */ + +#define TLD_PAGE_SIZE getpagesize() +#define TLD_PAGE_MASK (~(TLD_PAGE_SIZE - 1)) + +#define TLD_ROUND_MASK(x, y) ((__typeof__(x))((y) - 1)) +#define TLD_ROUND_UP(x, y) ((((x) - 1) | TLD_ROUND_MASK(x, y)) + 1) + +#define TLD_READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + +#ifndef TLD_DYN_DATA_SIZE +#define TLD_DYN_DATA_SIZE 64 +#endif + +#define TLD_MAX_DATA_CNT (TLD_PAGE_SIZE / sizeof(struct tld_metadata) - 1) + +#ifndef TLD_NAME_LEN +#define TLD_NAME_LEN 62 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + __s16 off; +} tld_key_t; + +struct tld_metadata { + char name[TLD_NAME_LEN]; + _Atomic __u16 size; +}; + +struct tld_meta_u { + _Atomic __u8 cnt; + __u16 size; + struct tld_metadata metadata[]; +}; + +struct tld_data_u { + __u64 start; /* offset of tld_data_u->data in a page */ + char data[]; +}; + +struct tld_map_value { + void *data; + struct tld_meta_u *meta; +}; + +struct tld_meta_u * _Atomic tld_meta_p __attribute__((weak)); +__thread struct tld_data_u *tld_data_p __attribute__((weak)); +__thread void *tld_data_alloc_p __attribute__((weak)); + +#ifdef TLD_FREE_DATA_ON_THREAD_EXIT +pthread_key_t tld_pthread_key __attribute__((weak)); + +static void tld_free(void); + +static void __tld_thread_exit_handler(void *unused) +{ + tld_free(); +} +#endif + +static int __tld_init_meta_p(void) +{ + struct tld_meta_u *meta, *uninit = NULL; + int err = 0; + + meta = (struct tld_meta_u *)aligned_alloc(TLD_PAGE_SIZE, TLD_PAGE_SIZE); + if (!meta) { + err = -ENOMEM; + goto out; + } + + memset(meta, 0, TLD_PAGE_SIZE); + meta->size = TLD_DYN_DATA_SIZE; + + if (!atomic_compare_exchange_strong(&tld_meta_p, &uninit, meta)) { + free(meta); + goto out; + } + +#ifdef TLD_FREE_DATA_ON_THREAD_EXIT + pthread_key_create(&tld_pthread_key, __tld_thread_exit_handler); +#endif +out: + return err; +} + +static int __tld_init_data_p(int map_fd) +{ + bool use_aligned_alloc = false; + struct tld_map_value map_val; + struct tld_data_u *data; + void *data_alloc = NULL; + int err, tid_fd = -1; + + tid_fd = syscall(SYS_pidfd_open, gettid(), O_EXCL); + if (tid_fd < 0) { + err = -errno; + goto out; + } + +#ifdef TLD_DATA_USE_ALIGNED_ALLOC + use_aligned_alloc = true; +#endif + + /* + * tld_meta_p->size = TLD_DYN_DATA_SIZE + + * total size of TLDs defined via TLD_DEFINE_KEY() + */ + data_alloc = (use_aligned_alloc || tld_meta_p->size * 2 >= TLD_PAGE_SIZE) ? + aligned_alloc(TLD_PAGE_SIZE, tld_meta_p->size) : + malloc(tld_meta_p->size * 2); + if (!data_alloc) { + err = -ENOMEM; + goto out; + } + + /* + * Always pass a page-aligned address to UPTR since the size of tld_map_value::data + * is a page in BTF. If data_alloc spans across two pages, use the page that contains large + * enough memory. 
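+	 * (Recall from the options comment above: UPTR-pinned memory cannot
+	 * exceed a page or cross a page boundary, which is why the two
+	 * branches below pick the page differently.)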
+	 */
+	if (TLD_PAGE_SIZE - (~TLD_PAGE_MASK & (intptr_t)data_alloc) >= tld_meta_p->size) {
+		map_val.data = (void *)(TLD_PAGE_MASK & (intptr_t)data_alloc);
+		data = data_alloc;
+		data->start = (~TLD_PAGE_MASK & (intptr_t)data_alloc) +
+			      offsetof(struct tld_data_u, data);
+	} else {
+		map_val.data = (void *)(TLD_ROUND_UP((intptr_t)data_alloc, TLD_PAGE_SIZE));
+		data = (void *)(TLD_ROUND_UP((intptr_t)data_alloc, TLD_PAGE_SIZE));
+		data->start = offsetof(struct tld_data_u, data);
+	}
+	map_val.meta = TLD_READ_ONCE(tld_meta_p);
+
+	err = bpf_map_update_elem(map_fd, &tid_fd, &map_val, 0);
+	if (err) {
+		free(data_alloc);
+		goto out;
+	}
+
+	tld_data_p = data;
+	tld_data_alloc_p = data_alloc;
+#ifdef TLD_FREE_DATA_ON_THREAD_EXIT
+	pthread_setspecific(tld_pthread_key, (void *)1);
+#endif
+out:
+	if (tid_fd >= 0)
+		close(tid_fd);
+	return err;
+}
+
+static tld_key_t __tld_create_key(const char *name, size_t size, bool dyn_data)
+{
+	int err, i, sz, off = 0;
+	__u8 cnt;
+
+	if (!TLD_READ_ONCE(tld_meta_p)) {
+		err = __tld_init_meta_p();
+		if (err)
+			return (tld_key_t){err};
+	}
+
+	for (i = 0; i < TLD_MAX_DATA_CNT; i++) {
+retry:
+		cnt = atomic_load(&tld_meta_p->cnt);
+		if (i < cnt) {
+			/* A metadata entry is not ready until its size is updated with a non-zero value */
+			while (!(sz = atomic_load(&tld_meta_p->metadata[i].size)))
+				sched_yield();
+
+			if (!strncmp(tld_meta_p->metadata[i].name, name, TLD_NAME_LEN))
+				return (tld_key_t){-EEXIST};
+
+			off += TLD_ROUND_UP(sz, 8);
+			continue;
+		}
+
+		/*
+		 * TLD_DEFINE_KEY() is given memory up to a page, while at most
+		 * TLD_DYN_DATA_SIZE is allocated for tld_create_key()
+		 */
+		if (dyn_data) {
+			if (off + TLD_ROUND_UP(size, 8) > tld_meta_p->size)
+				return (tld_key_t){-E2BIG};
+		} else {
+			if (off + TLD_ROUND_UP(size, 8) > TLD_PAGE_SIZE - sizeof(struct tld_data_u))
+				return (tld_key_t){-E2BIG};
+			tld_meta_p->size += TLD_ROUND_UP(size, 8);
+		}
+
+		/*
+		 * Only one tld_create_key() can increase the current cnt by one and
+		 * take the latest available slot. Other threads will check again whether
+		 * a new TLD can still be added, and then compete for the new slot after
+		 * the succeeding thread updates the size.
+		 */
+		if (!atomic_compare_exchange_strong(&tld_meta_p->cnt, &cnt, cnt + 1))
+			goto retry;
+
+		strncpy(tld_meta_p->metadata[i].name, name, TLD_NAME_LEN);
+		atomic_store(&tld_meta_p->metadata[i].size, size);
+		return (tld_key_t){(__s16)off};
+	}
+
+	return (tld_key_t){-ENOSPC};
+}
+
+/**
+ * TLD_DEFINE_KEY() - Define a TLD and a global variable key associated with the TLD.
+ *
+ * @name: The name of the TLD
+ * @size: The size of the TLD
+ * @key: The variable name of the key. Cannot exceed TLD_NAME_LEN
+ *
+ * The macro can only be used in file scope.
+ *
+ * A global variable key of opaque type, tld_key_t, will be declared and initialized before
+ * main() starts. Use tld_key_is_err() or tld_key_err_or_zero() later to check if the key
+ * creation succeeded. Pass the key to tld_get_data() to get a pointer to the TLD.
+ * bpf programs can also fetch the same key by name.
+ *
+ * The total size of TLDs created using TLD_DEFINE_KEY() cannot exceed a page. Just
+ * enough memory will be allocated for each thread on the first call to tld_get_data().
+ */
+#define TLD_DEFINE_KEY(key, name, size)			\
+tld_key_t key;						\
+							\
+__attribute__((constructor))				\
+void __tld_define_key_##key(void)			\
+{							\
+	key = __tld_create_key(name, size, false);	\
+}
+
+/**
+ * tld_create_key() - Create a TLD and return a key associated with the TLD.
+ *
+ * @name: The name of the TLD
+ * @size: The size of the TLD
+ *
+ * Return an opaque object key. Use tld_key_is_err() or tld_key_err_or_zero() to check
+ * if the key creation succeeded. Pass the key to tld_get_data() to get a pointer to
+ * the TLD. bpf programs can also fetch the same key by name.
+ *
+ * Use tld_create_key() only when a TLD needs to be created dynamically (e.g., @name is
+ * not known statically, or a TLD needs to be created conditionally).
+ *
+ * An additional TLD_DYN_DATA_SIZE bytes are allocated per-thread to accommodate TLDs
+ * created dynamically with tld_create_key(). Since only a user page is pinned to the
+ * kernel, when TLDs created with TLD_DEFINE_KEY() use more than TLD_PAGE_SIZE -
+ * TLD_DYN_DATA_SIZE, the buffer size will be limited to the rest of the page.
+ */
+__attribute__((unused))
+static tld_key_t tld_create_key(const char *name, size_t size)
+{
+	return __tld_create_key(name, size, true);
+}
+
+__attribute__((unused))
+static inline bool tld_key_is_err(tld_key_t key)
+{
+	return key.off < 0;
+}
+
+__attribute__((unused))
+static inline int tld_key_err_or_zero(tld_key_t key)
+{
+	return tld_key_is_err(key) ? key.off : 0;
+}
+
+/**
+ * tld_get_data() - Get a pointer to the TLD associated with the given key of the
+ * calling thread.
+ *
+ * @map_fd: A file descriptor of tld_data_map, the underlying BPF task local storage map
+ * of task local data.
+ * @key: A key object created by TLD_DEFINE_KEY() or tld_create_key().
+ *
+ * Return a pointer to the TLD if the key is valid; NULL if there is not enough memory
+ * for TLDs for this thread, or the key is invalid. The returned pointer is guaranteed
+ * to be 8-byte aligned.
+ *
+ * Threads that call tld_get_data() must call tld_free() on exit to prevent
+ * a memory leak if TLD_FREE_DATA_ON_THREAD_EXIT is not defined.
+ */
+__attribute__((unused))
+static void *tld_get_data(int map_fd, tld_key_t key)
+{
+	if (!TLD_READ_ONCE(tld_meta_p))
+		return NULL;
+
+	/* tld_data_p is allocated on the first invocation of tld_get_data() */
+	if (!tld_data_p && __tld_init_data_p(map_fd))
+		return NULL;
+
+	return tld_data_p->data + key.off;
+}
+
+/**
+ * tld_free() - Free the task local data memory of the calling thread
+ *
+ * For the calling thread, all pointers to TLDs acquired before will become invalid.
+ *
+ * Users must call tld_free() on thread exit to prevent a memory leak. Alternatively,
+ * define TLD_FREE_DATA_ON_THREAD_EXIT and a thread exit handler will be registered
+ * to free the memory automatically.
+ */
+__attribute__((unused))
+static void tld_free(void)
+{
+	if (tld_data_alloc_p) {
+		free(tld_data_alloc_p);
+		tld_data_alloc_p = NULL;
+		tld_data_p = NULL;
+	}
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* __TASK_LOCAL_DATA_H */
diff --git a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h
new file mode 100644
index 00000000000000..432fff2af8441f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h
@@ -0,0 +1,237 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TASK_LOCAL_DATA_BPF_H
+#define __TASK_LOCAL_DATA_BPF_H
+
+/*
+ * Task local data is a library that facilitates sharing per-task data
+ * between user space and bpf programs.
+ *
+ *
+ * USAGE
+ *
+ * A TLD, an entry of data in task local data, first needs to be created by the
+ * user space. This is done by calling the user space API, TLD_DEFINE_KEY() or
+ * tld_create_key(), with the name of the TLD and the size.
+ *
+ *     TLD_DEFINE_KEY(prio, "priority", sizeof(int));
+ *
+ * or
+ *
+ *     void func_call(...) {
+ *         tld_key_t prio, in_cs;
+ *
+ *         prio = tld_create_key("priority", sizeof(int));
+ *         in_cs = tld_create_key("in_critical_section", sizeof(bool));
+ *         ...
+ *
+ * A key associated with the TLD, which has an opaque type tld_key_t, will be
+ * initialized or returned. It can be used to get a pointer to the TLD in the
+ * user space by calling tld_get_data().
+ *
+ * In a bpf program, tld_object_init() first needs to be called to initialize a
+ * tld_object on the stack. Then, TLDs can be accessed by calling tld_get_data().
+ * The API will try to fetch the key by the name and use it to locate the data.
+ * A pointer to the TLD will be returned. It also caches the key in a task local
+ * storage map, tld_key_map, whose value type, struct tld_keys, must be defined
+ * by the developer.
+ *
+ *     struct tld_keys {
+ *         tld_key_t prio;
+ *         tld_key_t in_cs;
+ *     };
+ *
+ *     SEC("struct_ops")
+ *     void prog(struct task_struct *task, ...)
+ *     {
+ *         struct tld_object tld_obj;
+ *         int err, *p;
+ *
+ *         err = tld_object_init(task, &tld_obj);
+ *         if (err)
+ *             return;
+ *
+ *         p = tld_get_data(&tld_obj, prio, "priority", sizeof(int));
+ *         if (p)
+ *             // do something depending on *p
+ */
+#include
+#include
+
+#define TLD_ROUND_MASK(x, y) ((__typeof__(x))((y) - 1))
+#define TLD_ROUND_UP(x, y) ((((x) - 1) | TLD_ROUND_MASK(x, y)) + 1)
+
+#define TLD_MAX_DATA_CNT (__PAGE_SIZE / sizeof(struct tld_metadata) - 1)
+
+#ifndef TLD_NAME_LEN
+#define TLD_NAME_LEN 62
+#endif
+
+#ifndef TLD_KEY_MAP_CREATE_RETRY
+#define TLD_KEY_MAP_CREATE_RETRY 10
+#endif
+
+typedef struct {
+	__s16 off;
+} tld_key_t;
+
+struct tld_metadata {
+	char name[TLD_NAME_LEN];
+	__u16 size;
+};
+
+struct tld_meta_u {
+	__u8 cnt;
+	__u16 size;
+	struct tld_metadata metadata[TLD_MAX_DATA_CNT];
+};
+
+struct tld_data_u {
+	__u64 start; /* offset of tld_data_u->data in a page */
+	char data[__PAGE_SIZE - sizeof(__u64)];
+};
+
+struct tld_map_value {
+	struct tld_data_u __uptr *data;
+	struct tld_meta_u __uptr *meta;
+};
+
+typedef struct tld_uptr_dummy {
+	struct tld_data_u data[0];
+	struct tld_meta_u meta[0];
+} *tld_uptr_dummy_t;
+
+struct tld_object {
+	struct tld_map_value *data_map;
+	struct tld_keys *key_map;
+	/*
+	 * Force the compiler to generate the actual definition of tld_meta_u
+	 * and tld_data_u in BTF. Without it, tld_meta_u and tld_data_u will
+	 * be BTF_KIND_FWD.
+	 */
+	tld_uptr_dummy_t dummy[0];
+};
+
+/*
+ * Map value of tld_key_map for caching keys. Must be defined by the developer.
+ * Members should be of type tld_key_t and are passed as the second argument of
+ * tld_get_data().
+ */
+struct tld_keys;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct tld_map_value);
+} tld_data_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct tld_keys);
+} tld_key_map SEC(".maps");
+
+/**
+ * tld_object_init() - Initialize a tld_object.
+ *
+ * @task: The task_struct of the target task
+ * @tld_obj: A pointer to a tld_object to be initialized
+ *
+ * Return 0 on success; -ENODATA if the user space did not initialize task local data
+ * for the current task through tld_get_data(); -ENOMEM if the creation of tld_key_map
+ * fails
+ */
+__attribute__((unused))
+static int tld_object_init(struct task_struct *task, struct tld_object *tld_obj)
+{
+	int i;
+
+	tld_obj->data_map = bpf_task_storage_get(&tld_data_map, task, 0, 0);
+	if (!tld_obj->data_map)
+		return -ENODATA;
+
+	bpf_for(i, 0, TLD_KEY_MAP_CREATE_RETRY) {
+		tld_obj->key_map = bpf_task_storage_get(&tld_key_map, task, 0,
+							BPF_LOCAL_STORAGE_GET_F_CREATE);
+		if (likely(tld_obj->key_map))
+			break;
+	}
+	if (!tld_obj->key_map)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Return the offset of the TLD if @name is found. Otherwise, return the current TLD
+ * count using the non-positive range so that the next tld_get_data() can skip fetching
+ * the key if no new TLD is added, or start comparing names from the first newly added
+ * TLD.
+ */
+__attribute__((unused))
+static int __tld_fetch_key(struct tld_object *tld_obj, const char *name, int i_start)
+{
+	struct tld_metadata *metadata;
+	int i, cnt, start, off = 0;
+
+	if (!tld_obj->data_map || !tld_obj->data_map->data || !tld_obj->data_map->meta)
+		return 0;
+
+	start = tld_obj->data_map->data->start;
+	cnt = tld_obj->data_map->meta->cnt;
+	metadata = tld_obj->data_map->meta->metadata;
+
+	bpf_for(i, 0, cnt) {
+		if (i >= TLD_MAX_DATA_CNT)
+			break;
+
+		if (i >= i_start && !bpf_strncmp(metadata[i].name, TLD_NAME_LEN, name))
+			return start + off;
+
+		off += TLD_ROUND_UP(metadata[i].size, 8);
+	}
+
+	return -cnt;
+}
+
+/**
+ * tld_get_data() - Retrieve a pointer to the TLD associated with the name.
+ *
+ * @tld_obj: A pointer to a valid tld_object initialized by tld_object_init()
+ * @key: The cached key of the TLD in tld_key_map
+ * @name: The name of the key associated with a TLD
+ * @size: The size of the TLD. Must be a known constant value
+ *
+ * Return a pointer to the TLD associated with @name; NULL if not found or @size is too
+ * big. @key is used to cache the key if the TLD is found, to speed up subsequent calls.
+ * It should be defined as a member of tld_keys, of type tld_key_t, by the developer.
+ */
+#define tld_get_data(tld_obj, key, name, size)					\
+	({									\
+		void *data = NULL, *_data = (tld_obj)->data_map->data;		\
+		long off = (tld_obj)->key_map->key.off;				\
+		int cnt;							\
+										\
+		if (likely(_data)) {						\
+			if (likely(off > 0)) {					\
+				barrier_var(off);				\
+				if (likely(off < __PAGE_SIZE - size))		\
+					data = _data + off;			\
+			} else {						\
+				cnt = -(off);					\
+				if (likely((tld_obj)->data_map->meta) &&	\
+				    cnt < (tld_obj)->data_map->meta->cnt) {	\
+					off = __tld_fetch_key(tld_obj, name, cnt); \
+					(tld_obj)->key_map->key.off = off;	\
+										\
+					if (likely(off < __PAGE_SIZE - size)) {	\
+						barrier_var(off);		\
+						if (off > 0)			\
+							data = _data + off;	\
+					}					\
+				}						\
+			}							\
+		}								\
+		data;								\
+	})
+
+#endif
From 120f1a950e495d9751fdb5c8b7852d94546dcd03 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Wed, 30 Jul 2025 11:58:54 -0700
Subject: [PATCH 0006/3206] selftests/bpf: Test basic task local data
 operations

Test basic operations of task local data with valid and invalid
tld_create_key(). For invalid calls, make sure they return the right
error code, and check that the TLDs are not inserted by running
tld_get_data("value_not_exist") on the bpf side. The call should
return a null pointer.
For valid calls, first make sure the TLDs are created by calling
tld_get_data() on the bpf side. The call should return a valid pointer.
Finally, verify that the TLDs are indeed task-specific (i.e., their
addresses do not overlap) with multiple user threads. This is done by
writing values unique to each thread, reading them from both user space
and bpf, and checking that the value read back matches the value
written.

Signed-off-by: Amery Hung
Reviewed-by: Emil Tsalapatis
Link: https://lore.kernel.org/r/20250730185903.3574598-4-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov
---
 .../bpf/prog_tests/test_task_local_data.c     | 192 ++++++++++++++++++
 .../bpf/progs/test_task_local_data.c          |  65 ++++++
 2 files changed, 257 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/test_task_local_data.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_task_local_data.c

diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c
new file mode 100644
index 00000000000000..2e77d3fa2534b0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+
+#define TLD_FREE_DATA_ON_THREAD_EXIT
+#define TLD_DYN_DATA_SIZE 4096
+#include "task_local_data.h"
+
+struct test_tld_struct {
+	__u64 a;
+	__u64 b;
+	__u64 c;
+	__u64 d;
+};
+
+#include "test_task_local_data.skel.h"
+
+TLD_DEFINE_KEY(value0_key, "value0", sizeof(int));
+
+/*
+ * Reset task local data between subtests by clearing metadata other
+ * than the statically defined value0. This is safe as subtests run
+ * sequentially. Users of the task local data library should not touch
+ * library internals.
+ */
+static void reset_tld(void)
+{
+	if (TLD_READ_ONCE(tld_meta_p)) {
+		/* Remove TLDs created by tld_create_key() */
+		tld_meta_p->cnt = 1;
+		tld_meta_p->size = TLD_DYN_DATA_SIZE;
+		memset(&tld_meta_p->metadata[1], 0,
+		       (TLD_MAX_DATA_CNT - 1) * sizeof(struct tld_metadata));
+	}
+}
+
+/* Serialize access to the bpf program's global variables */
+static pthread_mutex_t global_mutex;
+
+static tld_key_t *tld_keys;
+
+#define TEST_BASIC_THREAD_NUM 32
+
+void *test_task_local_data_basic_thread(void *arg)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct test_task_local_data *skel = (struct test_task_local_data *)arg;
+	int fd, err, tid, *value0, *value1;
+	struct test_tld_struct *value2;
+
+	fd = bpf_map__fd(skel->maps.tld_data_map);
+
+	value0 = tld_get_data(fd, value0_key);
+	if (!ASSERT_OK_PTR(value0, "tld_get_data"))
+		goto out;
+
+	value1 = tld_get_data(fd, tld_keys[1]);
+	if (!ASSERT_OK_PTR(value1, "tld_get_data"))
+		goto out;
+
+	value2 = tld_get_data(fd, tld_keys[2]);
+	if (!ASSERT_OK_PTR(value2, "tld_get_data"))
+		goto out;
+
+	tid = gettid();
+
+	*value0 = tid + 0;
+	*value1 = tid + 1;
+	value2->a = tid + 2;
+	value2->b = tid + 3;
+	value2->c = tid + 4;
+	value2->d = tid + 5;
+
+	pthread_mutex_lock(&global_mutex);
+	/* Run task_main, which reads task local data and saves it to global variables */
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.task_main), &opts);
+	ASSERT_OK(err, "run task_main");
+	ASSERT_OK(opts.retval, "task_main retval");
+
+	ASSERT_EQ(skel->bss->test_value0, tid + 0, "tld_get_data value0");
+	ASSERT_EQ(skel->bss->test_value1, tid + 1, "tld_get_data value1");
+	ASSERT_EQ(skel->bss->test_value2.a, tid + 2, "tld_get_data value2.a");
+	ASSERT_EQ(skel->bss->test_value2.b, tid + 3, "tld_get_data value2.b");
+	ASSERT_EQ(skel->bss->test_value2.c, tid + 4, "tld_get_data value2.c");
+	ASSERT_EQ(skel->bss->test_value2.d, tid + 5, "tld_get_data value2.d");
+	pthread_mutex_unlock(&global_mutex);
+
+	/* Make sure the valueX TLDs are indeed local to each thread */
+	ASSERT_EQ(*value0, tid + 0, "value0");
+	ASSERT_EQ(*value1, tid + 1, "value1");
+	ASSERT_EQ(value2->a, tid + 2, "value2.a");
+	ASSERT_EQ(value2->b, tid + 3, "value2.b");
+	ASSERT_EQ(value2->c, tid + 4, "value2.c");
+	ASSERT_EQ(value2->d, tid + 5, "value2.d");
+
+	*value0 = tid + 5;
+	*value1 = tid + 4;
+	value2->a = tid + 3;
+	value2->b = tid + 2;
+	value2->c = tid + 1;
+	value2->d = tid + 0;
+
+	/* Run task_main again */
+	pthread_mutex_lock(&global_mutex);
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.task_main), &opts);
+	ASSERT_OK(err, "run task_main");
+	ASSERT_OK(opts.retval, "task_main retval");
+
+	ASSERT_EQ(skel->bss->test_value0, tid + 5, "tld_get_data value0");
+	ASSERT_EQ(skel->bss->test_value1, tid + 4, "tld_get_data value1");
+	ASSERT_EQ(skel->bss->test_value2.a, tid + 3, "tld_get_data value2.a");
+	ASSERT_EQ(skel->bss->test_value2.b, tid + 2, "tld_get_data value2.b");
+	ASSERT_EQ(skel->bss->test_value2.c, tid + 1, "tld_get_data value2.c");
+	ASSERT_EQ(skel->bss->test_value2.d, tid + 0, "tld_get_data value2.d");
+	pthread_mutex_unlock(&global_mutex);
+
+out:
+	pthread_exit(NULL);
+}
+
+static void test_task_local_data_basic(void)
+{
+	struct test_task_local_data *skel;
+	pthread_t thread[TEST_BASIC_THREAD_NUM];
+	char dummy_key_name[TLD_NAME_LEN];
+	tld_key_t key;
+	int i, err;
+
+	reset_tld();
+
+	ASSERT_OK(pthread_mutex_init(&global_mutex, NULL), "pthread_mutex_init");
+
+	skel = test_task_local_data__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+		return;
+
+	tld_keys = calloc(TLD_MAX_DATA_CNT, sizeof(tld_key_t));
+	if (!ASSERT_OK_PTR(tld_keys, "calloc tld_keys"))
+		goto out;
+
+	ASSERT_FALSE(tld_key_is_err(value0_key), "TLD_DEFINE_KEY");
+	tld_keys[1] = tld_create_key("value1", sizeof(int));
+	ASSERT_FALSE(tld_key_is_err(tld_keys[1]), "tld_create_key");
+	tld_keys[2] = tld_create_key("value2", sizeof(struct test_tld_struct));
+	ASSERT_FALSE(tld_key_is_err(tld_keys[2]), "tld_create_key");
+
+	/*
+	 * Shouldn't be able to store data exceeding a page. Create a TLD just
+	 * big enough to exceed a page. The TLDs already created are int value0,
+	 * int value1, and struct test_tld_struct value2.
+	 */
+	key = tld_create_key("value_not_exist",
+			     TLD_PAGE_SIZE - 2 * sizeof(int) - sizeof(struct test_tld_struct) + 1);
+	ASSERT_EQ(tld_key_err_or_zero(key), -E2BIG, "tld_create_key");
+
+	key = tld_create_key("value2", sizeof(struct test_tld_struct));
+	ASSERT_EQ(tld_key_err_or_zero(key), -EEXIST, "tld_create_key");
+
+	/* Shouldn't be able to create the (TLD_MAX_DATA_CNT+1)-th TLD */
+	for (i = 3; i < TLD_MAX_DATA_CNT; i++) {
+		snprintf(dummy_key_name, TLD_NAME_LEN, "dummy_value%d", i);
+		tld_keys[i] = tld_create_key(dummy_key_name, sizeof(int));
+		ASSERT_FALSE(tld_key_is_err(tld_keys[i]), "tld_create_key");
+	}
+	key = tld_create_key("value_not_exist", sizeof(struct test_tld_struct));
+	ASSERT_EQ(tld_key_err_or_zero(key), -ENOSPC, "tld_create_key");
+
+	/* Access TLDs from multiple threads and check that they are thread-specific */
+	for (i = 0; i < TEST_BASIC_THREAD_NUM; i++) {
+		err = pthread_create(&thread[i], NULL, test_task_local_data_basic_thread, skel);
+		if (!ASSERT_OK(err, "pthread_create"))
+			goto out;
+	}
+
+out:
+	for (i = 0; i < TEST_BASIC_THREAD_NUM; i++)
+		pthread_join(thread[i], NULL);
+
+	if (tld_keys) {
+		free(tld_keys);
+		tld_keys = NULL;
+	}
+	tld_free();
+	test_task_local_data__destroy(skel);
+}
+
+void test_task_local_data(void)
+{
+	if (test__start_subtest("task_local_data_basic"))
+		test_task_local_data_basic();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_task_local_data.c b/tools/testing/selftests/bpf/progs/test_task_local_data.c
new file mode 100644
index 00000000000000..fffafc0130449e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_task_local_data.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include
+#include
+#include
+
+#include "task_local_data.bpf.h"
+
+struct tld_keys {
+	tld_key_t value0;
+	tld_key_t value1;
+	tld_key_t value2;
+	tld_key_t value_not_exist;
+};
+
+struct test_tld_struct {
+	__u64 a;
+	__u64 b;
+	__u64 c;
+	__u64 d;
+};
+
+int test_value0;
+int test_value1;
+struct test_tld_struct test_value2;
+
+SEC("syscall")
+int task_main(void *ctx)
+{
+	struct tld_object tld_obj;
+	struct test_tld_struct *struct_p;
+	struct task_struct *task;
+	int err, *int_p;
+
+	task = bpf_get_current_task_btf();
+	err = tld_object_init(task, &tld_obj);
+	if (err)
+		return 1;
+
+	int_p = tld_get_data(&tld_obj, value0, "value0", sizeof(int));
+	if (int_p)
+		test_value0 = *int_p;
+	else
+		return 2;
+
+	int_p = tld_get_data(&tld_obj, value1, "value1", sizeof(int));
+	if (int_p)
+		test_value1 = *int_p;
+	else
+		return 3;
+
+	struct_p = tld_get_data(&tld_obj, value2, "value2", sizeof(struct test_tld_struct));
+	if (struct_p)
+		test_value2 = *struct_p;
+	else
+		return 4;
+
+	int_p = tld_get_data(&tld_obj, value_not_exist, "value_not_exist", sizeof(int));
+	if (int_p)
+		return 5;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
From 784181141782204f6dbbeadf01780e81da5fcb29 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Wed, 30 Jul 2025 11:58:55 -0700
Subject: [PATCH 0007/3206] selftests/bpf: Test concurrent task local data key
 creation

Test the thread-safety of tld_create_key(). Since tld_create_key() does
not rely on locks, but on memory barriers and atomic operations, to
protect the shared metadata, the thread-safety of the function is
non-trivial.

Make sure concurrent tld_create_key() calls, both valid and invalid,
cannot race and corrupt the metadata, which may lead to TLDs not being
thread-specific or to duplicate TLDs with the same name.
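The publish protocol under test, condensed from __tld_create_key() in
task_local_data.h (simplified excerpt of the existing code, error
handling omitted):

	cnt = atomic_load(&tld_meta_p->cnt);
	...
	/* claim the next slot; on failure another thread won, rescan */
	if (!atomic_compare_exchange_strong(&tld_meta_p->cnt, &cnt, cnt + 1))
		goto retry;

	strncpy(tld_meta_p->metadata[i].name, name, TLD_NAME_LEN);
	/* a non-zero size publishes the entry; readers spin until it is set */
	atomic_store(&tld_meta_p->metadata[i].size, size);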
Signed-off-by: Amery Hung Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20250730185903.3574598-5-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/test_task_local_data.c | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c index 2e77d3fa2534b0..3b5cd2cd89c7c2 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c @@ -185,8 +185,113 @@ static void test_task_local_data_basic(void) test_task_local_data__destroy(skel); } +#define TEST_RACE_THREAD_NUM (TLD_MAX_DATA_CNT - 3) + +void *test_task_local_data_race_thread(void *arg) +{ + int err = 0, id = (intptr_t)arg; + char key_name[32]; + tld_key_t key; + + key = tld_create_key("value_not_exist", TLD_PAGE_SIZE + 1); + if (tld_key_err_or_zero(key) != -E2BIG) { + err = 1; + goto out; + } + + /* Only one thread will succeed in creating value1 */ + key = tld_create_key("value1", sizeof(int)); + if (!tld_key_is_err(key)) + tld_keys[1] = key; + + /* Only one thread will succeed in creating value2 */ + key = tld_create_key("value2", sizeof(struct test_tld_struct)); + if (!tld_key_is_err(key)) + tld_keys[2] = key; + + snprintf(key_name, 32, "thread_%d", id); + tld_keys[id] = tld_create_key(key_name, sizeof(int)); + if (tld_key_is_err(tld_keys[id])) + err = 2; +out: + return (void *)(intptr_t)err; +} + +static void test_task_local_data_race(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + pthread_t thread[TEST_RACE_THREAD_NUM]; + struct test_task_local_data *skel; + int fd, i, j, err, *data; + void *ret = NULL; + + skel = test_task_local_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + tld_keys = calloc(TLD_MAX_DATA_CNT, sizeof(tld_key_t)); + if (!ASSERT_OK_PTR(tld_keys, "calloc tld_keys")) + goto out; + + fd = bpf_map__fd(skel->maps.tld_data_map); + + ASSERT_FALSE(tld_key_is_err(value0_key), "TLD_DEFINE_KEY"); + tld_keys[0] = value0_key; + + for (j = 0; j < 100; j++) { + reset_tld(); + + for (i = 0; i < TEST_RACE_THREAD_NUM; i++) { + /* + * Try to make tld_create_key() race with each other. Call + * tld_create_key(), both valid and invalid, from different threads. 
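+			 * A correct run ends with exactly one slot per name and
+			 * no overlapping offsets, which the unique-value writes
+			 * and reads below verify.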
+ */ + err = pthread_create(&thread[i], NULL, test_task_local_data_race_thread, + (void *)(intptr_t)(i + 3)); + if (CHECK_FAIL(err)) + break; + } + + /* Wait for all tld_create_key() to return */ + for (i = 0; i < TEST_RACE_THREAD_NUM; i++) { + pthread_join(thread[i], &ret); + if (CHECK_FAIL(ret)) + break; + } + + /* Write a unique number to each TLD */ + for (i = 0; i < TLD_MAX_DATA_CNT; i++) { + data = tld_get_data(fd, tld_keys[i]); + if (CHECK_FAIL(!data)) + break; + *data = i; + } + + /* Read TLDs and check the value to see if any address collides with another */ + for (i = 0; i < TLD_MAX_DATA_CNT; i++) { + data = tld_get_data(fd, tld_keys[i]); + if (CHECK_FAIL(*data != i)) + break; + } + + /* Run task_main to make sure no invalid TLDs are added */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.task_main), &opts); + ASSERT_OK(err, "run task_main"); + ASSERT_OK(opts.retval, "task_main retval"); + } +out: + if (tld_keys) { + free(tld_keys); + tld_keys = NULL; + } + tld_free(); + test_task_local_data__destroy(skel); +} + void test_task_local_data(void) { if (test__start_subtest("task_local_data_basic")) test_task_local_data_basic(); + if (test__start_subtest("task_local_data_race")) + test_task_local_data_race(); } From 2d812311c2b28cc9096e29862a957aeb32bfdb76 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Wed, 23 Jul 2025 22:44:40 +0800 Subject: [PATCH 0008/3206] bpftool: Add bpf_token show Add `bpftool token show` command to get token info from bpffs in /proc/mounts. Example plain output for `token show`: token_info /sys/fs/bpf/token allowed_cmds: map_create prog_load allowed_maps: allowed_progs: kprobe allowed_attachs: xdp token_info /sys/fs/bpf/token2 allowed_cmds: map_create prog_load allowed_maps: allowed_progs: kprobe allowed_attachs: xdp Example json output for `token show`: [{ "token_info": "/sys/fs/bpf/token", "allowed_cmds": ["map_create", "prog_load"], "allowed_maps": [], "allowed_progs": ["kprobe"], "allowed_attachs": ["xdp"] }, { "token_info": "/sys/fs/bpf/token2", "allowed_cmds": ["map_create", "prog_load"], "allowed_maps": [], "allowed_progs": ["kprobe"], "allowed_attachs": ["xdp"] }] Reviewed-by: Quentin Monnet Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/20250723144442.1427943-1-chen.dylane@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/main.c | 3 +- tools/bpf/bpftool/main.h | 1 + tools/bpf/bpftool/token.c | 225 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 tools/bpf/bpftool/token.c diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 2b7f2bd3a7dbc7..0f1183b2ed0a07 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -61,7 +61,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n" + " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter | token }\n" " " HELP_SPEC_OPTIONS " |\n" " {-V|--version} }\n" "", @@ -87,6 +87,7 @@ static const struct cmd commands[] = { { "gen", do_gen }, { "struct_ops", do_struct_ops }, { "iter", do_iter }, + { "token", do_token }, { "version", do_version }, { 0 } }; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 6db704fda5c00f..a2bb0714b3d6cf 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -166,6 +166,7 @@ int do_tracelog(int argc, char **arg) __weak; int do_feature(int argc, char 
**argv) __weak; int do_struct_ops(int argc, char **argv) __weak; int do_iter(int argc, char **argv) __weak; +int do_token(int argc, char **argv) __weak; int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); diff --git a/tools/bpf/bpftool/token.c b/tools/bpf/bpftool/token.c new file mode 100644 index 00000000000000..6312e662a1299a --- /dev/null +++ b/tools/bpf/bpftool/token.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2025 Didi Technology Co., Tao Chen */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "json_writer.h" +#include "main.h" + +#define MOUNTS_FILE "/proc/mounts" + +static bool has_delegate_options(const char *mnt_ops) +{ + return strstr(mnt_ops, "delegate_cmds") || + strstr(mnt_ops, "delegate_maps") || + strstr(mnt_ops, "delegate_progs") || + strstr(mnt_ops, "delegate_attachs"); +} + +static char *get_delegate_value(const char *opts, const char *key) +{ + char *token, *rest, *ret = NULL; + char *opts_copy = strdup(opts); + + if (!opts_copy) + return NULL; + + for (token = strtok_r(opts_copy, ",", &rest); token; + token = strtok_r(NULL, ",", &rest)) { + if (strncmp(token, key, strlen(key)) == 0 && + token[strlen(key)] == '=') { + ret = token + strlen(key) + 1; + break; + } + } + free(opts_copy); + + return ret; +} + +static void print_items_per_line(const char *input, int items_per_line) +{ + char *str, *rest, *strs; + int cnt = 0; + + if (!input) + return; + + strs = strdup(input); + if (!strs) + return; + + for (str = strtok_r(strs, ":", &rest); str; + str = strtok_r(NULL, ":", &rest)) { + if (cnt % items_per_line == 0) + printf("\n\t "); + + printf("%-20s", str); + cnt++; + } + + free(strs); +} + +#define ITEMS_PER_LINE 4 +static void show_token_info_plain(struct mntent *mntent) +{ + char *value; + + printf("token_info %s", mntent->mnt_dir); + + printf("\n\tallowed_cmds:"); + value = get_delegate_value(mntent->mnt_opts, "delegate_cmds"); + print_items_per_line(value, ITEMS_PER_LINE); + + printf("\n\tallowed_maps:"); + value = get_delegate_value(mntent->mnt_opts, "delegate_maps"); + print_items_per_line(value, ITEMS_PER_LINE); + + printf("\n\tallowed_progs:"); + value = get_delegate_value(mntent->mnt_opts, "delegate_progs"); + print_items_per_line(value, ITEMS_PER_LINE); + + printf("\n\tallowed_attachs:"); + value = get_delegate_value(mntent->mnt_opts, "delegate_attachs"); + print_items_per_line(value, ITEMS_PER_LINE); + printf("\n"); +} + +static void split_json_array_str(const char *input) +{ + char *str, *rest, *strs; + + if (!input) { + jsonw_start_array(json_wtr); + jsonw_end_array(json_wtr); + return; + } + + strs = strdup(input); + if (!strs) + return; + + jsonw_start_array(json_wtr); + for (str = strtok_r(strs, ":", &rest); str; + str = strtok_r(NULL, ":", &rest)) { + jsonw_string(json_wtr, str); + } + jsonw_end_array(json_wtr); + + free(strs); +} + +static void show_token_info_json(struct mntent *mntent) +{ + char *value; + + jsonw_start_object(json_wtr); + + jsonw_string_field(json_wtr, "token_info", mntent->mnt_dir); + + jsonw_name(json_wtr, "allowed_cmds"); + value = get_delegate_value(mntent->mnt_opts, "delegate_cmds"); + split_json_array_str(value); + + jsonw_name(json_wtr, "allowed_maps"); + value = get_delegate_value(mntent->mnt_opts, "delegate_maps"); + split_json_array_str(value); + + jsonw_name(json_wtr, "allowed_progs"); + value 
= get_delegate_value(mntent->mnt_opts, "delegate_progs");
+	split_json_array_str(value);
+
+	jsonw_name(json_wtr, "allowed_attachs");
+	value = get_delegate_value(mntent->mnt_opts, "delegate_attachs");
+	split_json_array_str(value);
+
+	jsonw_end_object(json_wtr);
+}
+
+static int __show_token_info(struct mntent *mntent)
+{
+	if (json_output)
+		show_token_info_json(mntent);
+	else
+		show_token_info_plain(mntent);
+
+	return 0;
+}
+
+static int show_token_info(void)
+{
+	FILE *fp;
+	struct mntent *ent;
+
+	fp = setmntent(MOUNTS_FILE, "r");
+	if (!fp) {
+		p_err("Failed to open: %s", MOUNTS_FILE);
+		return -1;
+	}
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+
+	while ((ent = getmntent(fp)) != NULL) {
+		if (strncmp(ent->mnt_type, "bpf", 3) == 0) {
+			if (has_delegate_options(ent->mnt_opts))
+				__show_token_info(ent);
+		}
+	}
+
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	endmntent(fp);
+
+	return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+	if (argc)
+		return BAD_ARG();
+
+	return show_token_info();
+}
+
+static int do_help(int argc, char **argv)
+{
+	if (json_output) {
+		jsonw_null(json_wtr);
+		return 0;
+	}
+
+	fprintf(stderr,
+		"Usage: %1$s %2$s { show | list }\n"
+		"       %1$s %2$s help\n"
+		"\n"
+		"",
+		bin_name, argv[-2]);
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "show",	do_show },
+	{ "list",	do_show },
+	{ "help",	do_help },
+	{ 0 }
+};
+
+int do_token(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
From b7f6400849162b918020c5d10d5b7f378afbf470 Mon Sep 17 00:00:00 2001
From: Tao Chen
Date: Wed, 23 Jul 2025 22:44:41 +0800
Subject: [PATCH 0009/3206] bpftool: Add bpftool-token manpage

Add a bpftool-token manpage with information and examples of
token-related commands.

Suggested-by: Quentin Monnet
Reviewed-by: Quentin Monnet
Signed-off-by: Tao Chen
Link: https://lore.kernel.org/r/20250723144442.1427943-2-chen.dylane@linux.dev
Signed-off-by: Alexei Starovoitov
---
 .../bpftool/Documentation/bpftool-token.rst   | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-token.rst

diff --git a/tools/bpf/bpftool/Documentation/bpftool-token.rst b/tools/bpf/bpftool/Documentation/bpftool-token.rst
new file mode 100644
index 00000000000000..d082c499cfe370
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-token.rst
@@ -0,0 +1,64 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+================
+bpftool-token
+================
+-------------------------------------------------------------------------------
+tool for inspection and simple manipulation of eBPF tokens
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+.. include:: substitutions.rst
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **token** *COMMAND*
+
+	*OPTIONS* := { |COMMON_OPTIONS| }
+
+	*COMMANDS* := { **show** | **list** | **help** }
+
+TOKEN COMMANDS
+===============
+
+|	**bpftool** **token** { **show** | **list** }
+|	**bpftool** **token help**
+|
+
+DESCRIPTION
+===========
+bpftool token { show | list }
+	List BPF token information for each *bpffs* mount point containing
+	token information on the system. The information includes the mount
+	point path and the allowed **bpf**\ () system call commands, maps,
+	programs, and attach types for the token.
+
+bpftool token help
+	Print short help message.
+
+OPTIONS
+========
+..
include:: common_options.rst + +EXAMPLES +======== +| +| **# mkdir -p /sys/fs/bpf/token** +| **# mount -t bpf bpffs /sys/fs/bpf/token** \ +| **-o delegate_cmds=prog_load:map_create** \ +| **-o delegate_progs=kprobe** \ +| **-o delegate_attachs=xdp** +| **# bpftool token list** + +:: + + token_info /sys/fs/bpf/token + allowed_cmds: + map_create prog_load + allowed_maps: + allowed_progs: + kprobe + allowed_attachs: + xdp From f3af62b6cee8af9f07012051874af2d2a451f0e5 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Wed, 23 Jul 2025 22:44:42 +0800 Subject: [PATCH 0010/3206] bpftool: Add bash completion for token argument This commit updates the bash completion script with the new token argument. $ bpftool token help list show Reviewed-by: Quentin Monnet Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/20250723144442.1427943-3-chen.dylane@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/bash-completion/bpftool | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index a759ba24471d67..527bb47ac4629d 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1215,6 +1215,17 @@ _bpftool() ;; esac ;; + token) + case $command in + show|list) + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help show list' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool From d87a513d093726d121dd5c816e26803111a259d0 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 6 Aug 2025 09:25:38 -0700 Subject: [PATCH 0011/3206] bpf: Allow struct_ops to get map id by kdata Add bpf_struct_ops_id() to enable struct_ops implementors to use struct_ops map id as the unique id of a struct_ops in their subsystem. A subsystem that wishes to create a mapping between id and struct_ops instance pointer can update the mapping accordingly during bpf_struct_ops::reg(), unreg(), and update(). 
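As an illustration of the intended use (a sketch only, not part of this patch: the my_ops naming and the list bookkeeping are hypothetical), a subsystem could key an id -> instance mapping on the returned map id in its reg() and unreg() callbacks:

    /* Hypothetical subsystem-private id -> instance mapping, keyed by
     * bpf_struct_ops_id(). Names here are illustrative placeholders.
     */
    struct my_ops {
            struct hlist_node node;
            u32 id;
            /* ... subsystem callbacks ... */
    };

    static HLIST_HEAD(my_ops_list);
    static DEFINE_SPINLOCK(my_ops_lock);

    static int my_ops_reg(void *kdata, struct bpf_link *link)
    {
            struct my_ops *ops = kdata;

            /* The struct_ops map id uniquely identifies this instance. */
            ops->id = bpf_struct_ops_id(kdata);
            spin_lock(&my_ops_lock);
            hlist_add_head(&ops->node, &my_ops_list);
            spin_unlock(&my_ops_lock);
            return 0;
    }

    static void my_ops_unreg(void *kdata, struct bpf_link *link)
    {
            struct my_ops *ops = kdata;

            spin_lock(&my_ops_lock);
            hlist_del(&ops->node);  /* drop the id -> instance mapping */
            spin_unlock(&my_ops_lock);
    }

The selftest changes in the following two patches implement exactly this pattern in bpf_testmod.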
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250806162540.681679-2-ameryhung@gmail.com --- include/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc700925b802fe..e7ee089e8a31a5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1985,6 +1985,7 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 687a3e9c76f527..a41e6730edcf33 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1174,6 +1174,18 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } +u32 bpf_struct_ops_id(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + + return st_map->map.id; +} +EXPORT_SYMBOL_GPL(bpf_struct_ops_id); + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; From eeb52b6279cfebae07703f772621fb9d5972e541 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 6 Aug 2025 09:25:39 -0700 Subject: [PATCH 0012/3206] selftests/bpf: Add multi_st_ops that supports multiple instances The current struct_ops in bpf_testmod only supports attaching a single instance. Add multi_st_ops, which supports multiple instances. The struct_ops uses the map id as the struct_ops id and will reject attachment with an existing id.
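For reference, user space discovers the id to pass to such kfuncs by querying the struct_ops map, as the selftest added in the next patch does (a minimal sketch, assuming a loaded skeleton skel with a struct_ops map st_ops_map):

    struct bpf_map_info info = {};
    __u32 len = sizeof(info);
    int err;

    /* The struct_ops map id doubles as the struct_ops instance id. */
    err = bpf_map_get_info_by_fd(bpf_map__fd(skel->maps.st_ops_map), &info, &len);
    if (!err)
            skel->bss->st_ops_id = info.id; /* hand the id to the BPF side */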
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250806162540.681679-3-ameryhung@gmail.com --- .../selftests/bpf/test_kmods/bpf_testmod.c | 112 ++++++++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod.h | 6 + .../bpf/test_kmods/bpf_testmod_kfunc.h | 2 + 3 files changed, 120 insertions(+) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index e9e918cdf31ff2..2beb9b2fcbd876 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1057,6 +1057,8 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) return args->a; } +__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1097,6 +1099,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABL BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1528,6 +1531,114 @@ static struct bpf_struct_ops testmod_st_ops = { .owner = THIS_MODULE, }; +struct hlist_head multi_st_ops_list; +static DEFINE_SPINLOCK(multi_st_ops_lock); + +static int multi_st_ops_init(struct btf *btf) +{ + spin_lock_init(&multi_st_ops_lock); + INIT_HLIST_HEAD(&multi_st_ops_list); + + return 0; +} + +static int multi_st_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static struct bpf_testmod_multi_st_ops *multi_st_ops_find_nolock(u32 id) +{ + struct bpf_testmod_multi_st_ops *st_ops; + + hlist_for_each_entry(st_ops, &multi_st_ops_list, node) { + if (st_ops->id == id) + return st_ops; + } + + return NULL; +} + +int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) +{ + struct bpf_testmod_multi_st_ops *st_ops; + unsigned long flags; + int ret = -1; + + spin_lock_irqsave(&multi_st_ops_lock, flags); + st_ops = multi_st_ops_find_nolock(id); + if (st_ops) + ret = st_ops->test_1(args); + spin_unlock_irqrestore(&multi_st_ops_lock, flags); + + return ret; +} + +static int multi_st_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_testmod_multi_st_ops *st_ops = + (struct bpf_testmod_multi_st_ops *)kdata; + unsigned long flags; + int err = 0; + u32 id; + + if (!st_ops->test_1) + return -EINVAL; + + id = bpf_struct_ops_id(kdata); + + spin_lock_irqsave(&multi_st_ops_lock, flags); + if (multi_st_ops_find_nolock(id)) { + pr_err("multi_st_ops(id:%d) has already been registered\n", id); + err = -EEXIST; + goto unlock; + } + + st_ops->id = id; + hlist_add_head(&st_ops->node, &multi_st_ops_list); +unlock: + spin_unlock_irqrestore(&multi_st_ops_lock, flags); + + return err; +} + +static void multi_st_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_testmod_multi_st_ops *st_ops; + unsigned long flags; + u32 id; + + id = bpf_struct_ops_id(kdata); + + spin_lock_irqsave(&multi_st_ops_lock, flags); + st_ops = multi_st_ops_find_nolock(id); + if (st_ops) + hlist_del(&st_ops->node); + spin_unlock_irqrestore(&multi_st_ops_lock, flags); +} + +static int 
bpf_testmod_multi_st_ops__test_1(struct st_ops_args *args) +{ + return 0; +} + +static struct bpf_testmod_multi_st_ops multi_st_ops_cfi_stubs = { + .test_1 = bpf_testmod_multi_st_ops__test_1, +}; + +struct bpf_struct_ops testmod_multi_st_ops = { + .verifier_ops = &bpf_testmod_verifier_ops, + .init = multi_st_ops_init, + .init_member = multi_st_ops_init_member, + .reg = multi_st_ops_reg, + .unreg = multi_st_ops_unreg, + .cfi_stubs = &multi_st_ops_cfi_stubs, + .name = "bpf_testmod_multi_st_ops", + .owner = THIS_MODULE, +}; + extern int bpf_fentry_test1(int a); static int bpf_testmod_init(void) @@ -1550,6 +1661,7 @@ static int bpf_testmod_init(void) ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2); ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops3, bpf_testmod_ops3); ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops); + ret = ret ?: register_bpf_struct_ops(&testmod_multi_st_ops, bpf_testmod_multi_st_ops); ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors, ARRAY_SIZE(bpf_testmod_dtors), THIS_MODULE); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h index c9fab51f16e2b7..f6e492f9d04268 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h @@ -116,4 +116,10 @@ struct bpf_testmod_st_ops { struct module *owner; }; +struct bpf_testmod_multi_st_ops { + int (*test_1)(struct st_ops_args *args); + struct hlist_node node; + int id; +}; + #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index b58817938debd8..286e7faa47544e 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -159,4 +159,6 @@ void bpf_kfunc_trusted_task_test(struct task_struct *ptr) __ksym; void bpf_kfunc_trusted_num_test(int *ptr) __ksym; void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; +int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ From ba7000f1c360f34286f48bd5e670cefbab77ce8f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 6 Aug 2025 09:25:40 -0700 Subject: [PATCH 0013/3206] selftests/bpf: Test multi_st_ops and calling kfuncs from different programs Test multi_st_ops and demonstrate how different bpf programs can call a kfunc that refers to the struct_ops instance in the same source file by id. The id is defined as a global variable and initialized before attaching the skeleton. Kfuncs that take the id can hide the argument with a macro to make it almost transparent to bpf program developers. The test involves two struct_ops returning different values from .test_1. In syscall and tracing programs, check that the correct value is returned by a kfunc that calls .test_1.
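The macro trick mentioned above is visible at the top of both programs below: the kfunc is shadowed by a macro of the same name that appends the per-object id, so call sites stay unchanged:

    int st_ops_id;  /* set by user space before attaching the skeleton */

    /* Make the id argument transparent at each call site. */
    #define bpf_kfunc_multi_st_ops_test_1(args) \
            bpf_kfunc_multi_st_ops_test_1(args, st_ops_id)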
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250806162540.681679-4-ameryhung@gmail.com --- .../test_struct_ops_id_ops_mapping.c | 74 +++++++++++++++++++ .../bpf/progs/struct_ops_id_ops_mapping1.c | 59 +++++++++++++++ .../bpf/progs/struct_ops_id_ops_mapping2.c | 59 +++++++++++++++ 3 files changed, 192 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_id_ops_mapping.c create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping1.c create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping2.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_id_ops_mapping.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_id_ops_mapping.c new file mode 100644 index 00000000000000..fd8762ba4b6732 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_id_ops_mapping.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "struct_ops_id_ops_mapping1.skel.h" +#include "struct_ops_id_ops_mapping2.skel.h" + +static void test_st_ops_id_ops_mapping(void) +{ + struct struct_ops_id_ops_mapping1 *skel1 = NULL; + struct struct_ops_id_ops_mapping2 *skel2 = NULL; + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int err, pid, prog1_fd, prog2_fd; + + skel1 = struct_ops_id_ops_mapping1__open_and_load(); + if (!ASSERT_OK_PTR(skel1, "struct_ops_id_ops_mapping1__open")) + goto out; + + skel2 = struct_ops_id_ops_mapping2__open_and_load(); + if (!ASSERT_OK_PTR(skel2, "struct_ops_id_ops_mapping2__open")) + goto out; + + err = bpf_map_get_info_by_fd(bpf_map__fd(skel1->maps.st_ops_map), + &info, &len); + if (!ASSERT_OK(err, "bpf_map_get_info_by_fd")) + goto out; + + skel1->bss->st_ops_id = info.id; + + err = bpf_map_get_info_by_fd(bpf_map__fd(skel2->maps.st_ops_map), + &info, &len); + if (!ASSERT_OK(err, "bpf_map_get_info_by_fd")) + goto out; + + skel2->bss->st_ops_id = info.id; + + err = struct_ops_id_ops_mapping1__attach(skel1); + if (!ASSERT_OK(err, "struct_ops_id_ops_mapping1__attach")) + goto out; + + err = struct_ops_id_ops_mapping2__attach(skel2); + if (!ASSERT_OK(err, "struct_ops_id_ops_mapping2__attach")) + goto out; + + /* run tracing prog that calls .test_1 and checks return */ + pid = getpid(); + skel1->bss->test_pid = pid; + skel2->bss->test_pid = pid; + sys_gettid(); + skel1->bss->test_pid = 0; + skel2->bss->test_pid = 0; + + /* run syscall_prog that calls .test_1 and checks return */ + prog1_fd = bpf_program__fd(skel1->progs.syscall_prog); + err = bpf_prog_test_run_opts(prog1_fd, NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + prog2_fd = bpf_program__fd(skel2->progs.syscall_prog); + err = bpf_prog_test_run_opts(prog2_fd, NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel1->bss->test_err, 0, "skel1->bss->test_err"); + ASSERT_EQ(skel2->bss->test_err, 0, "skel2->bss->test_err"); + +out: + struct_ops_id_ops_mapping1__destroy(skel1); + struct_ops_id_ops_mapping2__destroy(skel2); +} + +void test_struct_ops_id_ops_mapping(void) +{ + if (test__start_subtest("st_ops_id_ops_mapping")) + test_st_ops_id_ops_mapping(); +} diff --git a/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping1.c b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping1.c new file mode 100644 index 00000000000000..ad8bb546c9bfff --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping1.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include 
"../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +#define bpf_kfunc_multi_st_ops_test_1(args) bpf_kfunc_multi_st_ops_test_1(args, st_ops_id) +int st_ops_id; + +int test_pid; +int test_err; + +#define MAP1_MAGIC 1234 + +SEC("struct_ops") +int BPF_PROG(test_1, struct st_ops_args *args) +{ + return MAP1_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP1_MAGIC) + test_err++; + + return 0; +} + +SEC("syscall") +int syscall_prog(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP1_MAGIC) + test_err++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map = { + .test_1 = (void *)test_1, +}; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping2.c b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping2.c new file mode 100644 index 00000000000000..cea1a2f4b62f6e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping2.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +#define bpf_kfunc_multi_st_ops_test_1(args) bpf_kfunc_multi_st_ops_test_1(args, st_ops_id) +int st_ops_id; + +int test_pid; +int test_err; + +#define MAP2_MAGIC 4567 + +SEC("struct_ops") +int BPF_PROG(test_1, struct st_ops_args *args) +{ + return MAP2_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP2_MAGIC) + test_err++; + + return 0; +} + +SEC("syscall") +int syscall_prog(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP2_MAGIC) + test_err++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map = { + .test_1 = (void *)test_1, +}; From cb070a8156c16383cad5e4d9f678b7273f0208cd Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 6 Aug 2025 18:02:04 -0700 Subject: [PATCH 0014/3206] bpf: removed unused 'env' parameter from is_reg64 and insn_has_def32 Parameter 'env' is not used by is_reg64() and insn_has_def32() functions. Remove the parameter to make it clear that neither function depends on 'env' state, e.g. env->insn_aux_data. Signed-off-by: Eduard Zingerman Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250807010205.3210608-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0806295945e400..69eb2b5c2218b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3663,7 +3663,7 @@ static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. 
*/ -static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, +static bool is_reg64(struct bpf_insn *insn, u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) { u8 code, class, op; @@ -3774,14 +3774,14 @@ static int insn_def_regno(const struct bpf_insn *insn) } /* Return TRUE if INSN has defined any 32-bit value explicitly. */ -static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) +static bool insn_has_def32(struct bpf_insn *insn) { int dst_reg = insn_def_regno(insn); if (dst_reg == -1) return false; - return !is_reg64(env, insn, dst_reg, NULL, DST_OP); + return !is_reg64(insn, dst_reg, NULL, DST_OP); } static void mark_insn_zext(struct bpf_verifier_env *env, @@ -3812,7 +3812,7 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r mark_reg_scratched(env, regno); reg = &regs[regno]; - rw64 = is_reg64(env, insn, regno, reg, t); + rw64 = is_reg64(insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) { @@ -20712,7 +20712,7 @@ static void adjust_insn_aux_data(struct bpf_verifier_env *env, * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the * original insn at old prog. */ - old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); + old_data[off].zext_dst = insn_has_def32(insn + off + cnt - 1); if (cnt == 1) return; @@ -20724,7 +20724,7 @@ static void adjust_insn_aux_data(struct bpf_verifier_env *env, for (i = off; i < off + cnt - 1; i++) { /* Expand insni[off]'s seen count to the patched range. */ new_data[i].seen = old_seen; - new_data[i].zext_dst = insn_has_def32(env, insn + i); + new_data[i].zext_dst = insn_has_def32(insn + i); } env->insn_aux_data = new_data; vfree(old_data); @@ -21131,7 +21131,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, * BPF_STX + SRC_OP, so it is safe to pass NULL * here. */ - if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) { + if (is_reg64(&insn, load_reg, NULL, DST_OP)) { if (class == BPF_LD && BPF_MODE(code) == BPF_IMM) i++; From 77620d1267392b1a34bfc437d2adea3006f95865 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 6 Aug 2025 18:02:05 -0700 Subject: [PATCH 0015/3206] bpf: use realloc in bpf_patch_insn_data Avoid excessive vzalloc/vfree calls when patching instructions in do_misc_fixups(). bpf_patch_insn_data() uses vzalloc to allocate new memory for env->insn_aux_data for each patch as follows: struct bpf_prog *bpf_patch_insn_data(env, ...) { ... new_data = vzalloc(... O(program size) ...); ... adjust_insn_aux_data(env, new_data, ...); ... } void adjust_insn_aux_data(env, new_data, ...) { ... memcpy(new_data, env->insn_aux_data); vfree(env->insn_aux_data); env->insn_aux_data = new_data; ... } The vzalloc/vfree pair is hot in the perf report collected for e.g. the pyperf180 test case. It can be replaced with a call to vrealloc in order to reduce the number of actual memory allocations. This is a stop-gap solution, as bpf_patch_insn_data() is still hot in the profile. More comprehensive solutions have been discussed before, e.g. in [1].
[1] https://lore.kernel.org/bpf/CAEf4BzY_E8MSL4mD0UPuuiDcbJhh9e2xQo2=5w+ppRWWiYSGvQ@mail.gmail.com/ Suggested-by: Alexei Starovoitov Signed-off-by: Eduard Zingerman Acked-by: Kumar Kartikeya Dwivedi Tested-by: Anton Protopopov Link: https://lore.kernel.org/r/20250807010205.3210608-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 69eb2b5c2218b2..a61d5799669233 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20699,12 +20699,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * [0, off) and [off, end) to new locations, so the patched range stays zero */ static void adjust_insn_aux_data(struct bpf_verifier_env *env, - struct bpf_insn_aux_data *new_data, struct bpf_prog *new_prog, u32 off, u32 cnt) { - struct bpf_insn_aux_data *old_data = env->insn_aux_data; + struct bpf_insn_aux_data *data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; - u32 old_seen = old_data[off].seen; + u32 old_seen = data[off].seen; u32 prog_len; int i; @@ -20712,22 +20711,20 @@ static void adjust_insn_aux_data(struct bpf_verifier_env *env, * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the * original insn at old prog. */ - old_data[off].zext_dst = insn_has_def32(insn + off + cnt - 1); + data[off].zext_dst = insn_has_def32(insn + off + cnt - 1); if (cnt == 1) return; prog_len = new_prog->len; - memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); - memcpy(new_data + off + cnt - 1, old_data + off, - sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + memmove(data + off + cnt - 1, data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1)); for (i = off; i < off + cnt - 1; i++) { /* Expand insni[off]'s seen count to the patched range. 
*/ - new_data[i].seen = old_seen; - new_data[i].zext_dst = insn_has_def32(insn + i); + data[i].seen = old_seen; + data[i].zext_dst = insn_has_def32(insn + i); } - env->insn_aux_data = new_data; - vfree(old_data); } static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) @@ -20765,10 +20762,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of struct bpf_insn_aux_data *new_data = NULL; if (len > 1) { - new_data = vzalloc(array_size(env->prog->len + len - 1, - sizeof(struct bpf_insn_aux_data))); + new_data = vrealloc(env->insn_aux_data, + array_size(env->prog->len + len - 1, + sizeof(struct bpf_insn_aux_data)), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!new_data) return NULL; + + env->insn_aux_data = new_data; } new_prog = bpf_patch_insn_single(env->prog, off, patch, len); @@ -20780,7 +20781,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of vfree(new_data); return NULL; } - adjust_insn_aux_data(env, new_data, new_prog, off, len); + adjust_insn_aux_data(env, new_prog, off, len); adjust_subprog_starts(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; From fa479132845e94b60068fad01c2a9979b3efe2dc Mon Sep 17 00:00:00 2001 From: Li Jun Date: Wed, 30 Jul 2025 18:50:19 +0800 Subject: [PATCH 0016/3206] bpf: Standardize function declaration style 'noinline' placed after 'int' causes checkpatch.pl to report "ERROR: inline keyword should sit between storage class and type". - Standardize function declaration style by moving the 'noinline' modifier - Fix asm volatile statement formatting Signed-off-by: Li Jun Link: https://lore.kernel.org/r/20250730105019.436235-1-lijun01@kylinos.cn Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 9728dbd4c66c51..4a862d60538618 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -524,27 +524,27 @@ __bpf_kfunc int bpf_fentry_test1(int a) } EXPORT_SYMBOL_GPL(bpf_fentry_test1); -int noinline bpf_fentry_test2(int a, u64 b) +noinline int bpf_fentry_test2(int a, u64 b) { return a + b; } -int noinline bpf_fentry_test3(char a, int b, u64 c) +noinline int bpf_fentry_test3(char a, int b, u64 c) { return a + b + c; } -int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) +noinline int bpf_fentry_test4(void *a, char b, int c, u64 d) { return (long)a + b + c + d; } -int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) +noinline int bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) { return a + (long)b + c + d + e; } -int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) +noinline int bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) { return a + (long)b + c + d + (long)e + f; } @@ -553,13 +553,13 @@ struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; -int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) +noinline int bpf_fentry_test7(struct bpf_fentry_test_t *arg) { - asm volatile ("": "+r"(arg)); + asm volatile ("" : "+r"(arg)); return (long)arg; } -int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg) +noinline int bpf_fentry_test8(struct bpf_fentry_test_t *arg) { return (long)arg->a; } @@ -569,12 +569,12 @@ __bpf_kfunc u32 bpf_fentry_test9(u32 *a) { return *a; } -int noinline bpf_fentry_test10(const void *a) +noinline int bpf_fentry_test10(const void *a) { return (long)a; } -void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) +noinline
void bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) { } @@ -598,7 +598,7 @@ __bpf_kfunc int bpf_modify_return_test_tp(int nonce) return nonce; } -int noinline bpf_fentry_shadow_test(int a) +noinline int bpf_fentry_shadow_test(int a) { return a + 1; } From f7fbf3091f4cc4133574852f655593e1613d1af0 Mon Sep 17 00:00:00 2001 From: Daniel Almeida Date: Tue, 29 Jul 2025 14:31:40 -0300 Subject: [PATCH 0017/3206] rust: regulator: remove needless &mut from member functions Regulator functions like "regulator_enable()" and "regulator_disable()" already provide their own locking through "regulator_lock_dependent()", so we can safely call the Rust API with a shared reference. This was already the case with Regulator::set_voltage() on the Rust side, but it was forgotten for Regulator::enable() and Regulator::disable(). Signed-off-by: Daniel Almeida Link: https://patch.msgid.link/20250729-regulator-send-sync-v1-1-8bcbd546b940@collabora.com Reviewed-by: Alexandre Courbot Reviewed-by: Alice Ryhl Signed-off-by: Mark Brown --- rust/kernel/regulator.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/rust/kernel/regulator.rs b/rust/kernel/regulator.rs index 65f3a125348f2d..d56aa229e838c4 100644 --- a/rust/kernel/regulator.rs +++ b/rust/kernel/regulator.rs @@ -203,20 +203,20 @@ pub struct Error { /// // A fictictious probe function that obtains a regulator and sets it up. /// fn probe(dev: &Device) -> Result { /// // Obtain a reference to a (fictitious) regulator. -/// let mut regulator = Regulator::::get(dev, c_str!("vcc"))?; +/// let regulator = Regulator::::get(dev, c_str!("vcc"))?; /// /// Ok(PrivateData { regulator }) /// } /// /// // A fictictious function that indicates that the device is going to be used. -/// fn open(dev: &Device, data: &mut PrivateData) -> Result { +/// fn open(dev: &Device, data: &PrivateData) -> Result { /// // Increase the `enabled` reference count. /// data.regulator.enable()?; /// /// Ok(()) /// } /// -/// fn close(dev: &Device, data: &mut PrivateData) -> Result { +/// fn close(dev: &Device, data: &PrivateData) -> Result { /// // Decrease the `enabled` reference count. /// data.regulator.disable()?; /// @@ -289,12 +289,12 @@ impl Regulator { }) } - fn enable_internal(&mut self) -> Result { + fn enable_internal(&self) -> Result { // SAFETY: Safe as per the type invariants of `Regulator`. to_result(unsafe { bindings::regulator_enable(self.inner.as_ptr()) }) } - fn disable_internal(&mut self) -> Result { + fn disable_internal(&self) -> Result { // SAFETY: Safe as per the type invariants of `Regulator`. to_result(unsafe { bindings::regulator_disable(self.inner.as_ptr()) }) } @@ -310,7 +310,7 @@ impl Regulator { pub fn try_into_enabled(self) -> Result, Error> { // We will be transferring the ownership of our `regulator_get()` count to // `Regulator`. - let mut regulator = ManuallyDrop::new(self); + let regulator = ManuallyDrop::new(self); regulator .enable_internal() @@ -339,7 +339,7 @@ impl Regulator { pub fn try_into_disabled(self) -> Result, Error> { // We will be transferring the ownership of our `regulator_get()` count // to `Regulator`. - let mut regulator = ManuallyDrop::new(self); + let regulator = ManuallyDrop::new(self); regulator .disable_internal() @@ -366,12 +366,12 @@ impl Regulator { } /// Increases the `enabled` reference count. - pub fn enable(&mut self) -> Result { + pub fn enable(&self) -> Result { self.enable_internal() } /// Decreases the `enabled` reference count. 
- pub fn disable(&mut self) -> Result { + pub fn disable(&self) -> Result { self.disable_internal() } } From 9a200cbdb54349909a42b45379e792e4b39dd223 Mon Sep 17 00:00:00 2001 From: Daniel Almeida Date: Tue, 29 Jul 2025 14:31:41 -0300 Subject: [PATCH 0018/3206] rust: regulator: implement Send and Sync for Regulator Sending a &Regulator to another thread is safe, as the regulator core will properly handle the locking for us. Additionally, there are no restrictions that prevent sending a Regulator to another thread. Given these two facts, implement Send and Sync. Signed-off-by: Daniel Almeida Link: https://patch.msgid.link/20250729-regulator-send-sync-v1-2-8bcbd546b940@collabora.com Reviewed-by: Alexandre Courbot Reviewed-by: Alice Ryhl Signed-off-by: Mark Brown --- rust/kernel/regulator.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/rust/kernel/regulator.rs b/rust/kernel/regulator.rs index d56aa229e838c4..704147e18bfc9c 100644 --- a/rust/kernel/regulator.rs +++ b/rust/kernel/regulator.rs @@ -398,6 +398,14 @@ impl Drop for Regulator { } } +// SAFETY: It is safe to send a `Regulator` across threads. In particular, a +// Regulator can be dropped from any thread. +unsafe impl Send for Regulator {} + +// SAFETY: It is safe to send a &Regulator across threads because the C side +// handles its own locking. +unsafe impl Sync for Regulator {} + /// A voltage. /// /// This type represents a voltage value in microvolts. From 82f0907931f016c04bcb992f764bd65992c7008e Mon Sep 17 00:00:00 2001 From: Joy Zou Date: Thu, 31 Jul 2025 18:44:41 +0800 Subject: [PATCH 0019/3206] dt-bindings: regulator: add PF0900 regulator yaml Add a device binding doc for the PF0900 PMIC driver. The I2C_CRC_EN configuration cannot be determined by reading a register, so if the PMIC's OTP_I2C_CRC_EN fuse is enabled, the nxp,i2c-crc-enable property needs to be added. Signed-off-by: Joy Zou Link: https://patch.msgid.link/20250731-b4-pf09-v2-v3-1-4c2659516582@nxp.com Reviewed-by: Rob Herring (Arm) Signed-off-by: Mark Brown --- .../bindings/regulator/nxp,pf0900.yaml | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 Documentation/devicetree/bindings/regulator/nxp,pf0900.yaml diff --git a/Documentation/devicetree/bindings/regulator/nxp,pf0900.yaml b/Documentation/devicetree/bindings/regulator/nxp,pf0900.yaml new file mode 100644 index 00000000000000..8c8fc2cd4cedb0 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/nxp,pf0900.yaml @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/nxp,pf0900.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NXP PF0900 Power Management Integrated Circuit regulators + +maintainers: + - Joy Zou + +description: + The PF0900 is a power management integrated circuit (PMIC) optimized + for high performance i.MX9x based applications. It features five high + efficiency buck converters, three linear and one vaon regulators. It + provides low quiescent current in Standby and low power off Modes.
+ +properties: + compatible: + enum: + - nxp,pf0900 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + regulators: + type: object + additionalProperties: false + + properties: + vaon: + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + + patternProperties: + "^ldo[1-3]$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + + "^sw[1-5]$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + + nxp,i2c-crc-enable: + type: boolean + description: + The CRC enabled during register read/write. Controlled by customer + unviewable fuse bits OTP_I2C_CRC_EN. Check chip part number. + +required: + - compatible + - reg + - interrupts + - regulators + +additionalProperties: false + +examples: + - | + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + pmic@8 { + compatible = "nxp,pf0900"; + reg = <0x08>; + interrupt-parent = <&pcal6524>; + interrupts = <89 IRQ_TYPE_LEVEL_LOW>; + nxp,i2c-crc-enable; + + regulators { + vaon { + regulator-name = "VAON"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; + + sw1 { + regulator-name = "SW1"; + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <1950>; + regulator-state-mem { + regulator-on-in-suspend; + regulator-suspend-max-microvolt = <650000>; + regulator-suspend-min-microvolt = <650000>; + }; + }; + + sw2 { + regulator-name = "SW2"; + regulator-min-microvolt = <300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <1950>; + }; + + sw3 { + regulator-name = "SW3"; + regulator-min-microvolt = <300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <1950>; + }; + + sw4 { + regulator-name = "SW4"; + regulator-min-microvolt = <300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <1950>; + }; + + sw5 { + regulator-name = "SW5"; + regulator-min-microvolt = <300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <1950>; + }; + + ldo1 { + regulator-name = "LDO1"; + regulator-min-microvolt = <750000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; + + ldo2 { + regulator-name = "LDO2"; + regulator-min-microvolt = <650000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; + + ldo3 { + regulator-name = "LDO3"; + regulator-min-microvolt = <650000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; + }; + }; + }; From 162e23657e5379f07c6404dbfbf4367cb438ea7d Mon Sep 17 00:00:00 2001 From: Joy Zou Date: Thu, 31 Jul 2025 18:44:42 +0800 Subject: [PATCH 0020/3206] regulator: pf0900: Add PMIC PF0900 support The PF0900 is a power management integrated circuit (PMIC) optimized for high performance i.MX9x based applications. It features five high efficiency buck converters, three linear and one vaon regulators. It provides low quiescent current in Standby and low power off Modes. 
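One notable detail of this driver is the optional I2C CRC mode (gated by the nxp,i2c-crc-enable property from the binding above): every register transfer is protected by a CRC-8/SAE-J1850 checksum over the device address, the register address and the data byte. For reference, a standalone sketch of the computation that the driver's crc8_j1850() performs (polynomial 0x1D, initial value 0xFF); the function name here is illustrative:

    #include <stdint.h>

    /* CRC-8/SAE-J1850 over the 24-bit packet: DEVICE_ADDR (7-bit address
     * shifted left by one plus the R/W bit), REGISTER_ADDR and DATA.
     * Standalone sketch mirroring the driver's crc8_j1850().
     */
    static uint8_t pf0900_crc(uint8_t dev_addr, uint8_t reg, uint8_t val)
    {
            uint8_t buf[3] = { dev_addr, reg, val };
            uint8_t crc = 0xFF;
            int i, j;

            for (i = 0; i < 3; i++) {
                    crc ^= buf[i];
                    for (j = 0; j < 8; j++) {
                            if (crc & 0x80)
                                    crc = (uint8_t)(crc << 1) ^ 0x1D;
                            else
                                    crc <<= 1;
                    }
            }
            return crc;
    }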
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507221720.jxsGRfcE-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202506181134.0Hkvy7CK-lkp@intel.com/ Signed-off-by: Joy Zou Link: https://patch.msgid.link/20250731-b4-pf09-v2-v3-2-4c2659516582@nxp.com Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 8 + drivers/regulator/Makefile | 1 + drivers/regulator/pf0900-regulator.c | 975 +++++++++++++++++++++++++++ 3 files changed, 984 insertions(+) create mode 100644 drivers/regulator/pf0900-regulator.c diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index eaa6df1c9f8066..b474b4bf63b37d 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1006,6 +1006,14 @@ config REGULATOR_PCAP This driver provides support for the voltage regulators of the PCAP2 PMIC. +config REGULATOR_PF0900 + tristate "NXP PF0900/PF0901/PF09XX regulator driver" + depends on I2C + select REGMAP_I2C + help + Say y here to support the NXP PF0900/PF0901/PF09XX PMIC + regulator driver. + config REGULATOR_PF8X00 tristate "NXP PF8100/PF8121A/PF8200 regulator driver" depends on I2C && OF diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index be98b29d6675d8..74b029d29a8b7f 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -124,6 +124,7 @@ obj-$(CONFIG_REGULATOR_QCOM_SPMI) += qcom_spmi-regulator.o obj-$(CONFIG_REGULATOR_QCOM_USB_VBUS) += qcom_usb_vbus-regulator.o obj-$(CONFIG_REGULATOR_PALMAS) += palmas-regulator.o obj-$(CONFIG_REGULATOR_PCA9450) += pca9450-regulator.o +obj-$(CONFIG_REGULATOR_PF0900) += pf0900-regulator.o obj-$(CONFIG_REGULATOR_PF9453) += pf9453-regulator.o obj-$(CONFIG_REGULATOR_PF8X00) += pf8x00-regulator.o obj-$(CONFIG_REGULATOR_PFUZE100) += pfuze100-regulator.o diff --git a/drivers/regulator/pf0900-regulator.c b/drivers/regulator/pf0900-regulator.c new file mode 100644 index 00000000000000..b5effee3291724 --- /dev/null +++ b/drivers/regulator/pf0900-regulator.c @@ -0,0 +1,975 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright 2025 NXP. 
+// NXP PF0900 pmic driver + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum pf0900_regulators { + PF0900_SW1 = 0, + PF0900_SW2, + PF0900_SW3, + PF0900_SW4, + PF0900_SW5, + PF0900_LDO1, + PF0900_LDO2, + PF0900_LDO3, + PF0900_VAON, + PF0900_REGULATOR_CNT, +}; + +enum { + PF0900_DVS_LEVEL_RUN = 0, + PF0900_DVS_LEVEL_STANDBY, + PF0900_DVS_LEVEL_MAX, +}; + + +#define PF0900_VAON_VOLTAGE_NUM 0x03 +#define PF0900_SW_VOLTAGE_NUM 0x100 +#define PF0900_LDO_VOLTAGE_NUM 0x20 + +#define REGU_SW_CNT 0x5 +#define REGU_LDO_VAON_CNT 0x4 + +enum { + PF0900_REG_DEV_ID = 0x00, + PF0900_REG_DEV_FAM = 0x01, + PF0900_REG_REV_ID = 0x02, + PF0900_REG_PROG_ID1 = 0x03, + PF0900_REG_PROG_ID2 = 0x04, + PF0900_REG_SYSTEM_INT = 0x05, + PF0900_REG_STATUS1_INT = 0x06, + PF0900_REG_STATUS1_MSK = 0x07, + PF0900_REG_STATUS1_SNS = 0x08, + PF0900_REG_STATUS2_INT = 0x09, + PF0900_REG_STATUS2_MSK = 0x0A, + PF0900_REG_STATUS2_SNS = 0x0B, + PF0900_REG_STATUS3_INT = 0x0C, + PF0900_REG_STATUS3_MSK = 0x0D, + PF0900_REG_SW_MODE_INT = 0x0E, + PF0900_REG_SW_MODE_MSK = 0x0F, + PF0900_REG_SW_ILIM_INT = 0x10, + PF0900_REG_SW_ILIM_MSK = 0x11, + PF0900_REG_SW_ILIM_SNS = 0x12, + PF0900_REG_LDO_ILIM_INT = 0x13, + PF0900_REG_LDO_ILIM_MSK = 0x14, + PF0900_REG_LDO_ILIM_SNS = 0x15, + PF0900_REG_SW_UV_INT = 0x16, + PF0900_REG_SW_UV_MSK = 0x17, + PF0900_REG_SW_UV_SNS = 0x18, + PF0900_REG_SW_OV_INT = 0x19, + PF0900_REG_SW_OV_MSK = 0x1A, + PF0900_REG_SW_OV_SNS = 0x1B, + PF0900_REG_LDO_UV_INT = 0x1C, + PF0900_REG_LDO_UV_MSK = 0x1D, + PF0900_REG_LDO_UV_SNS = 0x1E, + PF0900_REG_LDO_OV_INT = 0x1F, + PF0900_REG_LDO_OV_MSK = 0x20, + PF0900_REG_LDO_OV_SNS = 0x21, + PF0900_REG_PWRON_INT = 0x22, + PF0900_REG_IO_INT = 0x24, + PF0900_REG_IO_MSK = 0x25, + PF0900_REG_IO_SNS = 0x26, + PF0900_REG_IOSHORT_SNS = 0x27, + PF0900_REG_ABIST_OV1 = 0x28, + PF0900_REG_ABIST_OV2 = 0x29, + PF0900_REG_ABIST_UV1 = 0x2A, + PF0900_REG_ABIST_UV2 = 0x2B, + PF0900_REG_ABIST_IO = 0x2C, + PF0900_REG_TEST_FLAGS = 0x2D, + PF0900_REG_HFAULT_FLAGS = 0x2E, + PF0900_REG_FAULT_FLAGS = 0x2F, + PF0900_REG_FS0B_CFG = 0x30, + PF0900_REG_FCCU_CFG = 0x31, + PF0900_REG_RSTB_CFG1 = 0x32, + PF0900_REG_SYSTEM_CMD = 0x33, + PF0900_REG_FS0B_CMD = 0x34, + PF0900_REG_SECURE_WR1 = 0x35, + PF0900_REG_SECURE_WR2 = 0x36, + PF0900_REG_VMON_CFG1 = 0x37, + PF0900_REG_SYS_CFG1 = 0x38, + PF0900_REG_GPO_CFG = 0x39, + PF0900_REG_GPO_CTRL = 0x3A, + PF0900_REG_PWRUP_CFG = 0x3B, + PF0900_REG_RSTB_PWRUP = 0x3C, + PF0900_REG_GPIO1_PWRUP = 0x3D, + PF0900_REG_GPIO2_PWRUP = 0x3E, + PF0900_REG_GPIO3_PWRUP = 0x3F, + PF0900_REG_GPIO4_PWRUP = 0x40, + PF0900_REG_VMON1_PWRUP = 0x41, + PF0900_REG_VMON2_PWRUP = 0x42, + PF0900_REG_SW1_PWRUP = 0x43, + PF0900_REG_SW2_PWRUP = 0x44, + PF0900_REG_SW3_PWRUP = 0x45, + PF0900_REG_SW4_PWRUP = 0x46, + PF0900_REG_SW5_PWRUP = 0x47, + PF0900_REG_LDO1_PWRUP = 0x48, + PF0900_REG_LDO2_PWRUP = 0x49, + PF0900_REG_LDO3_PWRUP = 0x4A, + PF0900_REG_VAON_PWRUP = 0x4B, + PF0900_REG_FREQ_CTRL = 0x4C, + PF0900_REG_PWRON_CFG = 0x4D, + PF0900_REG_WD_CTRL1 = 0x4E, + PF0900_REG_WD_CTRL2 = 0x4F, + PF0900_REG_WD_CFG1 = 0x50, + PF0900_REG_WD_CFG2 = 0x51, + PF0900_REG_WD_CNT1 = 0x52, + PF0900_REG_WD_CNT2 = 0x53, + PF0900_REG_FAULT_CFG = 0x54, + PF0900_REG_FAULT_CNT = 0x55, + PF0900_REG_DFS_CNT = 0x56, + PF0900_REG_AMUX_CFG = 0x57, + PF0900_REG_VMON1_RUN_CFG = 0x58, + PF0900_REG_VMON1_STBY_CFG = 0x59, + PF0900_REG_VMON1_CTRL = 0x5A, + PF0900_REG_VMON2_RUN_CFG = 0x5B, + PF0900_REG_VMON2_STBY_CFG = 0x5C, + 
PF0900_REG_VMON2_CTRL = 0x5D, + PF0900_REG_SW1_VRUN = 0x5E, + PF0900_REG_SW1_VSTBY = 0x5F, + PF0900_REG_SW1_MODE = 0x60, + PF0900_REG_SW1_CFG1 = 0x61, + PF0900_REG_SW1_CFG2 = 0x62, + PF0900_REG_SW2_VRUN = 0x63, + PF0900_REG_SW2_VSTBY = 0x64, + PF0900_REG_SW2_MODE = 0x65, + PF0900_REG_SW2_CFG1 = 0x66, + PF0900_REG_SW2_CFG2 = 0x67, + PF0900_REG_SW3_VRUN = 0x68, + PF0900_REG_SW3_VSTBY = 0x69, + PF0900_REG_SW3_MODE = 0x6A, + PF0900_REG_SW3_CFG1 = 0x6B, + PF0900_REG_SW3_CFG2 = 0x6C, + PF0900_REG_SW4_VRUN = 0x6D, + PF0900_REG_SW4_VSTBY = 0x6E, + PF0900_REG_SW4_MODE = 0x6F, + PF0900_REG_SW4_CFG1 = 0x70, + PF0900_REG_SW4_CFG2 = 0x71, + PF0900_REG_SW5_VRUN = 0x72, + PF0900_REG_SW5_VSTBY = 0x73, + PF0900_REG_SW5_MODE = 0x74, + PF0900_REG_SW5_CFG1 = 0x75, + PF0900_REG_SW5_CFG2 = 0x76, + PF0900_REG_LDO1_RUN = 0x77, + PF0900_REG_LDO1_STBY = 0x78, + PF0900_REG_LDO1_CFG2 = 0x79, + PF0900_REG_LDO2_RUN = 0x7A, + PF0900_REG_LDO2_STBY = 0x7B, + PF0900_REG_LDO2_CFG2 = 0x7C, + PF0900_REG_LDO3_RUN = 0x7D, + PF0900_REG_LDO3_STBY = 0x7E, + PF0900_REG_LDO3_CFG2 = 0x7F, + PF0900_REG_VAON_CFG1 = 0x80, + PF0900_REG_VAON_CFG2 = 0x81, + PF0900_REG_SYS_DIAG = 0x82, + PF0900_MAX_REGISTER, +}; + +/* PF0900 SW MODE */ +#define SW_RUN_MODE_OFF 0x00 +#define SW_RUN_MODE_PWM 0x01 +#define SW_RUN_MODE_PFM 0x02 +#define SW_STBY_MODE_OFF 0x00 +#define SW_STBY_MODE_PWM 0x04 +#define SW_STBY_MODE_PFM 0x08 + +/* PF0900 SW MODE MASK */ +#define SW_RUN_MODE_MASK GENMASK(1, 0) +#define SW_STBY_MODE_MASK GENMASK(3, 2) + +/* PF0900 SW VRUN/VSTBY MASK */ +#define PF0900_SW_VOL_MASK GENMASK(7, 0) + +/* PF0900_REG_VAON_CFG1 bits */ +#define PF0900_VAON_1P8V 0x01 + +#define PF0900_VAON_MASK GENMASK(1, 0) + +/* PF0900_REG_SWX_CFG1 MASK */ +#define PF0900_SW_DVS_MASK GENMASK(4, 3) + +/* PF0900_REG_LDO_RUN MASK */ +#define VLDO_RUN_MASK GENMASK(4, 0) +#define LDO_RUN_EN_MASK BIT(5) + +/* PF0900_REG_STATUS1_INT bits */ +#define PF0900_IRQ_PWRUP BIT(3) + +/* PF0900_REG_ILIM_INT bits */ +#define PF0900_IRQ_SW1_IL BIT(0) +#define PF0900_IRQ_SW2_IL BIT(1) +#define PF0900_IRQ_SW3_IL BIT(2) +#define PF0900_IRQ_SW4_IL BIT(3) +#define PF0900_IRQ_SW5_IL BIT(4) + +#define PF0900_IRQ_LDO1_IL BIT(0) +#define PF0900_IRQ_LDO2_IL BIT(1) +#define PF0900_IRQ_LDO3_IL BIT(2) + +/* PF0900_REG_UV_INT bits */ +#define PF0900_IRQ_SW1_UV BIT(0) +#define PF0900_IRQ_SW2_UV BIT(1) +#define PF0900_IRQ_SW3_UV BIT(2) +#define PF0900_IRQ_SW4_UV BIT(3) +#define PF0900_IRQ_SW5_UV BIT(4) + +#define PF0900_IRQ_LDO1_UV BIT(0) +#define PF0900_IRQ_LDO2_UV BIT(1) +#define PF0900_IRQ_LDO3_UV BIT(2) +#define PF0900_IRQ_VAON_UV BIT(3) + +/* PF0900_REG_OV_INT bits */ +#define PF0900_IRQ_SW1_OV BIT(0) +#define PF0900_IRQ_SW2_OV BIT(1) +#define PF0900_IRQ_SW3_OV BIT(2) +#define PF0900_IRQ_SW4_OV BIT(3) +#define PF0900_IRQ_SW5_OV BIT(4) + +#define PF0900_IRQ_LDO1_OV BIT(0) +#define PF0900_IRQ_LDO2_OV BIT(1) +#define PF0900_IRQ_LDO3_OV BIT(2) +#define PF0900_IRQ_VAON_OV BIT(3) + +struct pf0900_regulator_desc { + struct regulator_desc desc; + unsigned int suspend_enable_mask; + unsigned int suspend_voltage_reg; + unsigned int suspend_voltage_cache; +}; + +struct pf0900_drvdata { + const struct pf0900_regulator_desc *desc; + unsigned int rcnt; +}; + +struct pf0900 { + struct device *dev; + struct regmap *regmap; + const struct pf0900_drvdata *drvdata; + struct regulator_dev *rdevs[PF0900_REGULATOR_CNT]; + int irq; + unsigned short addr; + bool crc_en; +}; + +enum pf0900_regulator_type { + PF0900_SW = 0, + PF0900_LDO, +}; + +#define PF0900_REGU_IRQ(_reg, _type, _event) \ + { \ + .reg = _reg, \ 
+ .type = _type, \ + .event = _event, \ + } + +struct pf0900_regulator_irq { + unsigned int reg; + unsigned int type; + unsigned int event; +}; + +static const struct regmap_range pf0900_range = { + .range_min = PF0900_REG_DEV_ID, + .range_max = PF0900_REG_SYS_DIAG, +}; + +static const struct regmap_access_table pf0900_volatile_regs = { + .yes_ranges = &pf0900_range, + .n_yes_ranges = 1, +}; + +static const struct regmap_config pf0900_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .volatile_table = &pf0900_volatile_regs, + .max_register = PF0900_MAX_REGISTER - 1, + .cache_type = REGCACHE_MAPLE, +}; + +static uint8_t crc8_j1850(unsigned short addr, unsigned int reg, + unsigned int val) +{ + uint8_t crcBuf[3]; + uint8_t t_crc; + uint8_t i, j; + + crcBuf[0] = addr; + crcBuf[1] = reg; + crcBuf[2] = val; + t_crc = 0xFF; + + /* + * The CRC calculation is based on the standard CRC-8-SAE as + * defined in the SAE-J1850 specification with the following + * characteristics. + * Polynomial = 0x1D + * Initial Value = 0xFF + * The CRC byte is calculated by shifting 24-bit data through + * the CRC polynomial.The 24-bits package is built as follows: + * DEVICE_ADDR[b8] + REGISTER_ADDR [b8] +DATA[b8] + * The DEVICE_ADDR is calculated as the 7-bit slave address + * shifted left one space plus the corresponding read/write bit. + * (7Bit Address [b7] << 1 ) + R/W = DEVICE_ADDR[b8] + */ + for (i = 0; i < sizeof(crcBuf); i++) { + t_crc ^= crcBuf[i]; + for (j = 0; j < 8; j++) { + if ((t_crc & 0x80) != 0) { + t_crc <<= 1; + t_crc ^= 0x1D; + } else { + t_crc <<= 1; + } + } + } + + return t_crc; +} + +static int pf0900_regmap_read(void *context, unsigned int reg, + unsigned int *val) +{ + struct device *dev = context; + struct i2c_client *i2c = to_i2c_client(dev); + struct pf0900 *pf0900 = dev_get_drvdata(dev); + int ret; + u8 crc; + + if (!pf0900 || !pf0900->dev) + return -EINVAL; + + if (reg >= PF0900_MAX_REGISTER) { + dev_err(pf0900->dev, "Invalid register address: 0x%x\n", reg); + return -EINVAL; + } + + if (pf0900->crc_en) { + ret = i2c_smbus_read_word_data(i2c, reg); + if (ret < 0) { + dev_err(pf0900->dev, "Read error at reg=0x%x: %d\n", reg, ret); + return ret; + } + + *val = (u16)ret; + crc = crc8_j1850(pf0900->addr << 1 | 0x1, reg, FIELD_GET(GENMASK(7, 0), *val)); + if (crc != FIELD_GET(GENMASK(15, 8), *val)) { + dev_err(pf0900->dev, "Crc check error!\n"); + return -EINVAL; + } + *val = FIELD_GET(GENMASK(7, 0), *val); + } else { + ret = i2c_smbus_read_byte_data(i2c, reg); + if (ret < 0) { + dev_err(pf0900->dev, "Read error at reg=0x%x: %d\n", reg, ret); + return ret; + } + *val = ret; + } + + return 0; +} + +static int pf0900_regmap_write(void *context, unsigned int reg, + unsigned int val) +{ + struct device *dev = context; + struct i2c_client *i2c = to_i2c_client(dev); + struct pf0900 *pf0900 = dev_get_drvdata(dev); + uint8_t data[2]; + int ret; + + if (!pf0900 || !pf0900->dev) + return -EINVAL; + + if (reg >= PF0900_MAX_REGISTER) { + dev_err(pf0900->dev, "Invalid register address: 0x%x\n", reg); + return -EINVAL; + } + + data[0] = val; + if (pf0900->crc_en) { + /* Get CRC */ + data[1] = crc8_j1850(pf0900->addr << 1, reg, data[0]); + val = FIELD_PREP(GENMASK(15, 8), data[1]) | data[0]; + ret = i2c_smbus_write_word_data(i2c, reg, val); + } else { + ret = i2c_smbus_write_byte_data(i2c, reg, data[0]); + } + + if (ret) { + dev_err(pf0900->dev, "Write reg=0x%x error!\n", reg); + return ret; + } + + return 0; +} + +static int pf0900_suspend_enable(struct regulator_dev *rdev) +{ + struct 
pf0900_regulator_desc *rdata = rdev_get_drvdata(rdev); + struct regmap *rmap = rdev_get_regmap(rdev); + + return regmap_update_bits(rmap, rdata->desc.enable_reg, + rdata->suspend_enable_mask, SW_STBY_MODE_PFM); +} + +static int pf0900_suspend_disable(struct regulator_dev *rdev) +{ + struct pf0900_regulator_desc *rdata = rdev_get_drvdata(rdev); + struct regmap *rmap = rdev_get_regmap(rdev); + + return regmap_update_bits(rmap, rdata->desc.enable_reg, + rdata->suspend_enable_mask, SW_STBY_MODE_OFF); +} + +static int pf0900_set_suspend_voltage(struct regulator_dev *rdev, int uV) +{ + struct pf0900_regulator_desc *rdata = rdev_get_drvdata(rdev); + struct regmap *rmap = rdev_get_regmap(rdev); + int ret; + + if (rdata->suspend_voltage_cache == uV) + return 0; + + ret = regulator_map_voltage_iterate(rdev, uV, uV); + if (ret < 0) { + dev_err(rdev_get_dev(rdev), "failed to map %i uV\n", uV); + return ret; + } + + dev_dbg(rdev_get_dev(rdev), "uV: %i, reg: 0x%x, msk: 0x%x, val: 0x%x\n", + uV, rdata->suspend_voltage_reg, rdata->desc.vsel_mask, ret); + ret = regmap_update_bits(rmap, rdata->suspend_voltage_reg, + rdata->desc.vsel_mask, ret); + if (ret < 0) { + dev_err(rdev_get_dev(rdev), "failed to set %i uV\n", uV); + return ret; + } + + rdata->suspend_voltage_cache = uV; + + return 0; +} + +static const struct regmap_bus pf0900_regmap_bus = { + .reg_read = pf0900_regmap_read, + .reg_write = pf0900_regmap_write, +}; + +static const struct regulator_ops pf0900_avon_regulator_ops = { + .list_voltage = regulator_list_voltage_table, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, +}; + +static const struct regulator_ops pf0900_dvs_sw_regulator_ops = { + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .list_voltage = regulator_list_voltage_linear_range, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .set_ramp_delay = regulator_set_ramp_delay_regmap, + .set_suspend_enable = pf0900_suspend_enable, + .set_suspend_disable = pf0900_suspend_disable, + .set_suspend_voltage = pf0900_set_suspend_voltage, +}; + +static const struct regulator_ops pf0900_ldo_regulator_ops = { + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .list_voltage = regulator_list_voltage_linear_range, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, +}; + +/* + * SW1/2/3/4/5 + * SW1_DVS[1:0] SW1 DVS ramp rate setting + * 00: 15.6mV/8usec + * 01: 15.6mV/4usec + * 10: 15.6mV/2usec + * 11: 15.6mV/1usec + */ +static const unsigned int pf0900_dvs_sw_ramp_table[] = { + 1950, 3900, 7800, 15600 +}; + +/* VAON 1.8V, 3.0V, or 3.3V */ +static const int pf0900_vaon_voltages[] = { + 0, 1800000, 3000000, 3300000, +}; + +/* + * SW1 0.5V to 3.3V + * 0.5V to 1.35V (6.25mV step) + * 1.8V to 2.5V (125mV step) + * 2.8V to 3.3V (250mV step) + */ +static const struct linear_range pf0900_dvs_sw1_volts[] = { + REGULATOR_LINEAR_RANGE(0, 0x00, 0x08, 0), + REGULATOR_LINEAR_RANGE(500000, 0x09, 0x91, 6250), + REGULATOR_LINEAR_RANGE(0, 0x92, 0x9E, 0), + REGULATOR_LINEAR_RANGE(1500000, 0x9F, 0x9F, 0), + REGULATOR_LINEAR_RANGE(1800000, 0xA0, 0xD8, 12500), + REGULATOR_LINEAR_RANGE(0, 0xD9, 0xDF, 0), + REGULATOR_LINEAR_RANGE(2800000, 0xE0, 0xF4, 25000), + REGULATOR_LINEAR_RANGE(0, 0xF5, 0xFF, 
0), +}; + +/* + * SW2/3/4/5 0.3V to 3.3V + * 0.45V to 1.35V (6.25mV step) + * 1.8V to 2.5V (125mV step) + * 2.8V to 3.3V (250mV step) + */ +static const struct linear_range pf0900_dvs_sw2345_volts[] = { + REGULATOR_LINEAR_RANGE(300000, 0x00, 0x00, 0), + REGULATOR_LINEAR_RANGE(450000, 0x01, 0x91, 6250), + REGULATOR_LINEAR_RANGE(0, 0x92, 0x9E, 0), + REGULATOR_LINEAR_RANGE(1500000, 0x9F, 0x9F, 0), + REGULATOR_LINEAR_RANGE(1800000, 0xA0, 0xD8, 12500), + REGULATOR_LINEAR_RANGE(0, 0xD9, 0xDF, 0), + REGULATOR_LINEAR_RANGE(2800000, 0xE0, 0xF4, 25000), + REGULATOR_LINEAR_RANGE(0, 0xF5, 0xFF, 0), +}; + +/* + * LDO1 + * 0.75V to 3.3V + */ +static const struct linear_range pf0900_ldo1_volts[] = { + REGULATOR_LINEAR_RANGE(750000, 0x00, 0x0F, 50000), + REGULATOR_LINEAR_RANGE(1800000, 0x10, 0x1F, 100000), +}; + +/* + * LDO2/3 + * 0.65V to 3.3V (50mV step) + */ +static const struct linear_range pf0900_ldo23_volts[] = { + REGULATOR_LINEAR_RANGE(650000, 0x00, 0x0D, 50000), + REGULATOR_LINEAR_RANGE(1400000, 0x0E, 0x0F, 100000), + REGULATOR_LINEAR_RANGE(1800000, 0x10, 0x1F, 100000), +}; + +static const struct pf0900_regulator_desc pf0900_regulators[] = { + { + .desc = { + .name = "sw1", + .of_match = of_match_ptr("sw1"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_SW1, + .ops = &pf0900_dvs_sw_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_SW_VOLTAGE_NUM, + .linear_ranges = pf0900_dvs_sw1_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_dvs_sw1_volts), + .vsel_reg = PF0900_REG_SW1_VRUN, + .vsel_mask = PF0900_SW_VOL_MASK, + .enable_reg = PF0900_REG_SW1_MODE, + .enable_mask = SW_RUN_MODE_MASK, + .enable_val = SW_RUN_MODE_PWM, + .ramp_reg = PF0900_REG_SW1_CFG1, + .ramp_mask = PF0900_SW_DVS_MASK, + .ramp_delay_table = pf0900_dvs_sw_ramp_table, + .n_ramp_values = ARRAY_SIZE(pf0900_dvs_sw_ramp_table), + .owner = THIS_MODULE, + }, + .suspend_enable_mask = SW_STBY_MODE_MASK, + .suspend_voltage_reg = PF0900_REG_SW1_VSTBY, + }, + { + .desc = { + .name = "sw2", + .of_match = of_match_ptr("sw2"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_SW2, + .ops = &pf0900_dvs_sw_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_SW_VOLTAGE_NUM, + .linear_ranges = pf0900_dvs_sw2345_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_dvs_sw2345_volts), + .vsel_reg = PF0900_REG_SW2_VRUN, + .vsel_mask = PF0900_SW_VOL_MASK, + .enable_reg = PF0900_REG_SW2_MODE, + .enable_mask = SW_RUN_MODE_MASK, + .enable_val = SW_RUN_MODE_PWM, + .ramp_reg = PF0900_REG_SW2_CFG1, + .ramp_mask = PF0900_SW_DVS_MASK, + .ramp_delay_table = pf0900_dvs_sw_ramp_table, + .n_ramp_values = ARRAY_SIZE(pf0900_dvs_sw_ramp_table), + .owner = THIS_MODULE, + }, + .suspend_enable_mask = SW_STBY_MODE_MASK, + .suspend_voltage_reg = PF0900_REG_SW2_VSTBY, + }, + { + .desc = { + .name = "sw3", + .of_match = of_match_ptr("sw3"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_SW3, + .ops = &pf0900_dvs_sw_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_SW_VOLTAGE_NUM, + .linear_ranges = pf0900_dvs_sw2345_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_dvs_sw2345_volts), + .vsel_reg = PF0900_REG_SW3_VRUN, + .vsel_mask = PF0900_SW_VOL_MASK, + .enable_reg = PF0900_REG_SW3_MODE, + .enable_mask = SW_RUN_MODE_MASK, + .enable_val = SW_RUN_MODE_PWM, + .ramp_reg = PF0900_REG_SW3_CFG1, + .ramp_mask = PF0900_SW_DVS_MASK, + .ramp_delay_table = pf0900_dvs_sw_ramp_table, + .n_ramp_values = ARRAY_SIZE(pf0900_dvs_sw_ramp_table), + .owner = THIS_MODULE, + }, + .suspend_enable_mask = 
SW_STBY_MODE_MASK, + .suspend_voltage_reg = PF0900_REG_SW3_VSTBY, + }, + { + .desc = { + .name = "sw4", + .of_match = of_match_ptr("sw4"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_SW4, + .ops = &pf0900_dvs_sw_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_SW_VOLTAGE_NUM, + .linear_ranges = pf0900_dvs_sw2345_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_dvs_sw2345_volts), + .vsel_reg = PF0900_REG_SW4_VRUN, + .vsel_mask = PF0900_SW_VOL_MASK, + .enable_reg = PF0900_REG_SW4_MODE, + .enable_mask = SW_RUN_MODE_MASK, + .enable_val = SW_RUN_MODE_PWM, + .ramp_reg = PF0900_REG_SW4_CFG1, + .ramp_mask = PF0900_SW_DVS_MASK, + .ramp_delay_table = pf0900_dvs_sw_ramp_table, + .n_ramp_values = ARRAY_SIZE(pf0900_dvs_sw_ramp_table), + .owner = THIS_MODULE, + }, + .suspend_enable_mask = SW_STBY_MODE_MASK, + .suspend_voltage_reg = PF0900_REG_SW4_VSTBY, + }, + { + .desc = { + .name = "sw5", + .of_match = of_match_ptr("sw5"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_SW5, + .ops = &pf0900_dvs_sw_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_SW_VOLTAGE_NUM, + .linear_ranges = pf0900_dvs_sw2345_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_dvs_sw2345_volts), + .vsel_reg = PF0900_REG_SW5_VRUN, + .vsel_mask = PF0900_SW_VOL_MASK, + .enable_reg = PF0900_REG_SW5_MODE, + .enable_mask = SW_RUN_MODE_MASK, + .enable_val = SW_RUN_MODE_PWM, + .ramp_reg = PF0900_REG_SW5_CFG1, + .ramp_mask = PF0900_SW_DVS_MASK, + .ramp_delay_table = pf0900_dvs_sw_ramp_table, + .n_ramp_values = ARRAY_SIZE(pf0900_dvs_sw_ramp_table), + .owner = THIS_MODULE, + }, + .suspend_enable_mask = SW_STBY_MODE_MASK, + .suspend_voltage_reg = PF0900_REG_SW5_VSTBY, + }, + { + .desc = { + .name = "ldo1", + .of_match = of_match_ptr("ldo1"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_LDO1, + .ops = &pf0900_ldo_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_LDO_VOLTAGE_NUM, + .linear_ranges = pf0900_ldo1_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_ldo1_volts), + .vsel_reg = PF0900_REG_LDO1_RUN, + .vsel_mask = VLDO_RUN_MASK, + .enable_reg = PF0900_REG_LDO1_RUN, + .enable_mask = LDO_RUN_EN_MASK, + .owner = THIS_MODULE, + }, + }, + { + .desc = { + .name = "ldo2", + .of_match = of_match_ptr("ldo2"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_LDO2, + .ops = &pf0900_ldo_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_LDO_VOLTAGE_NUM, + .linear_ranges = pf0900_ldo23_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_ldo23_volts), + .vsel_reg = PF0900_REG_LDO2_RUN, + .vsel_mask = VLDO_RUN_MASK, + .enable_reg = PF0900_REG_LDO2_RUN, + .enable_mask = LDO_RUN_EN_MASK, + .owner = THIS_MODULE, + }, + }, + { + .desc = { + .name = "ldo3", + .of_match = of_match_ptr("ldo3"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_LDO3, + .ops = &pf0900_ldo_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_LDO_VOLTAGE_NUM, + .linear_ranges = pf0900_ldo23_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_ldo23_volts), + .vsel_reg = PF0900_REG_LDO3_RUN, + .vsel_mask = VLDO_RUN_MASK, + .enable_reg = PF0900_REG_LDO3_RUN, + .enable_mask = LDO_RUN_EN_MASK, + .owner = THIS_MODULE, + }, + }, + { + .desc = { + .name = "vaon", + .of_match = of_match_ptr("vaon"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_VAON, + .ops = &pf0900_avon_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_VAON_VOLTAGE_NUM, + .volt_table = pf0900_vaon_voltages, + .enable_reg = PF0900_REG_VAON_CFG1, 
+ .enable_mask = PF0900_VAON_MASK, + .enable_val = PF0900_VAON_1P8V, + .vsel_reg = PF0900_REG_VAON_CFG1, + .vsel_mask = PF0900_VAON_MASK, + .owner = THIS_MODULE, + }, + }, +}; + +struct pf0900_regulator_irq regu_irqs[] = { + PF0900_REGU_IRQ(PF0900_REG_SW_ILIM_INT, PF0900_SW, REGULATOR_ERROR_OVER_CURRENT_WARN), + PF0900_REGU_IRQ(PF0900_REG_LDO_ILIM_INT, PF0900_LDO, REGULATOR_ERROR_OVER_CURRENT_WARN), + PF0900_REGU_IRQ(PF0900_REG_SW_UV_INT, PF0900_SW, REGULATOR_ERROR_UNDER_VOLTAGE_WARN), + PF0900_REGU_IRQ(PF0900_REG_LDO_UV_INT, PF0900_LDO, REGULATOR_ERROR_UNDER_VOLTAGE_WARN), + PF0900_REGU_IRQ(PF0900_REG_SW_OV_INT, PF0900_SW, REGULATOR_ERROR_OVER_VOLTAGE_WARN), + PF0900_REGU_IRQ(PF0900_REG_LDO_OV_INT, PF0900_LDO, REGULATOR_ERROR_OVER_VOLTAGE_WARN), +}; + +static irqreturn_t pf0900_irq_handler(int irq, void *data) +{ + unsigned int val, regu, i, index; + struct pf0900 *pf0900 = data; + int ret; + + for (i = 0; i < ARRAY_SIZE(regu_irqs); i++) { + ret = regmap_read(pf0900->regmap, regu_irqs[i].reg, &val); + if (ret < 0) { + dev_err(pf0900->dev, "Failed to read %d\n", ret); + return IRQ_NONE; + } + if (val) { + ret = regmap_write_bits(pf0900->regmap, regu_irqs[i].reg, val, val); + if (ret < 0) { + dev_err(pf0900->dev, "Failed to update %d\n", ret); + return IRQ_NONE; + } + + if (regu_irqs[i].type == PF0900_SW) { + for (index = 0; index < REGU_SW_CNT; index++) { + if (val & BIT(index)) { + regu = (enum pf0900_regulators)index; + regulator_notifier_call_chain(pf0900->rdevs[regu], + regu_irqs[i].event, + NULL); + } + } + } else if (regu_irqs[i].type == PF0900_LDO) { + for (index = 0; index < REGU_LDO_VAON_CNT; index++) { + if (val & BIT(index)) { + regu = (enum pf0900_regulators)index + PF0900_LDO1; + regulator_notifier_call_chain(pf0900->rdevs[regu], + regu_irqs[i].event, + NULL); + } + } + } + } + } + + return IRQ_HANDLED; +} + +static int pf0900_i2c_probe(struct i2c_client *i2c) +{ + const struct pf0900_regulator_desc *regulator_desc; + const struct pf0900_drvdata *drvdata = NULL; + struct device_node *np = i2c->dev.of_node; + unsigned int device_id, device_fam, i; + struct regulator_config config = { }; + struct pf0900 *pf0900; + int ret; + + if (!i2c->irq) + return dev_err_probe(&i2c->dev, -EINVAL, "No IRQ configured?\n"); + + pf0900 = devm_kzalloc(&i2c->dev, sizeof(struct pf0900), GFP_KERNEL); + if (!pf0900) + return -ENOMEM; + + drvdata = device_get_match_data(&i2c->dev); + if (!drvdata) + return dev_err_probe(&i2c->dev, -EINVAL, "unable to find driver data\n"); + + regulator_desc = drvdata->desc; + pf0900->drvdata = drvdata; + pf0900->crc_en = of_property_read_bool(np, "nxp,i2c-crc-enable"); + pf0900->irq = i2c->irq; + pf0900->dev = &i2c->dev; + pf0900->addr = i2c->addr; + + dev_set_drvdata(&i2c->dev, pf0900); + + pf0900->regmap = devm_regmap_init(&i2c->dev, &pf0900_regmap_bus, &i2c->dev, + &pf0900_regmap_config); + if (IS_ERR(pf0900->regmap)) + return dev_err_probe(&i2c->dev, PTR_ERR(pf0900->regmap), + "regmap initialization failed\n"); + ret = regmap_read(pf0900->regmap, PF0900_REG_DEV_ID, &device_id); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Read device id error\n"); + + ret = regmap_read(pf0900->regmap, PF0900_REG_DEV_FAM, &device_fam); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Read device fam error\n"); + + /* Check that your board and dts match the right PMIC */ + if (device_fam == 0x09 && (device_id & 0x1F) != 0x0) + return dev_err_probe(&i2c->dev, -EINVAL, "Device id(%x) mismatched\n", + device_id >> 4); + + for (i = 0; i < drvdata->rcnt; i++) { + const struct
regulator_desc *desc; + const struct pf0900_regulator_desc *r; + + r = &regulator_desc[i]; + desc = &r->desc; + config.regmap = pf0900->regmap; + config.driver_data = (void *)r; + config.dev = pf0900->dev; + + pf0900->rdevs[i] = devm_regulator_register(pf0900->dev, desc, &config); + if (IS_ERR(pf0900->rdevs[i])) + return dev_err_probe(pf0900->dev, PTR_ERR(pf0900->rdevs[i]), + "Failed to register regulator(%s)\n", desc->name); + } + + ret = devm_request_threaded_irq(pf0900->dev, pf0900->irq, NULL, + pf0900_irq_handler, + (IRQF_TRIGGER_FALLING | IRQF_ONESHOT), + "pf0900-irq", pf0900); + + if (ret != 0) + return dev_err_probe(pf0900->dev, ret, "Failed to request IRQ: %d\n", + pf0900->irq); + /* + * The PWRUP_M is unmasked by default. When the device enters the RUN state, + * it will assert the PWRUP_I interrupt and assert the INTB pin to inform + * the MCU that it has finished the power up sequence properly. + */ + ret = regmap_write_bits(pf0900->regmap, PF0900_REG_STATUS1_INT, PF0900_IRQ_PWRUP, + PF0900_IRQ_PWRUP); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Clean PWRUP_I error\n"); + + /* mask interrupt PWRUP */ + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_STATUS1_MSK, PF0900_IRQ_PWRUP, + PF0900_IRQ_PWRUP); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Mask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_SW_ILIM_MSK, PF0900_IRQ_SW1_IL | + PF0900_IRQ_SW2_IL | PF0900_IRQ_SW3_IL | PF0900_IRQ_SW4_IL | + PF0900_IRQ_SW5_IL, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_SW_UV_MSK, PF0900_IRQ_SW1_UV | + PF0900_IRQ_SW2_UV | PF0900_IRQ_SW3_UV | PF0900_IRQ_SW4_UV | + PF0900_IRQ_SW5_UV, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_SW_OV_MSK, PF0900_IRQ_SW1_OV | + PF0900_IRQ_SW2_OV | PF0900_IRQ_SW3_OV | PF0900_IRQ_SW4_OV | + PF0900_IRQ_SW5_OV, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_LDO_ILIM_MSK, PF0900_IRQ_LDO1_IL | + PF0900_IRQ_LDO2_IL | PF0900_IRQ_LDO3_IL, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_LDO_UV_MSK, PF0900_IRQ_LDO1_UV | + PF0900_IRQ_LDO2_UV | PF0900_IRQ_LDO3_UV | PF0900_IRQ_VAON_UV, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_LDO_OV_MSK, PF0900_IRQ_LDO1_OV | + PF0900_IRQ_LDO2_OV | PF0900_IRQ_LDO3_OV | PF0900_IRQ_VAON_OV, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + return 0; +} + +static struct pf0900_drvdata pf0900_drvdata = { + .desc = pf0900_regulators, + .rcnt = ARRAY_SIZE(pf0900_regulators), +}; + +static const struct of_device_id pf0900_of_match[] = { + { .compatible = "nxp,pf0900", .data = &pf0900_drvdata}, + { } +}; + +MODULE_DEVICE_TABLE(of, pf0900_of_match); + +static struct i2c_driver pf0900_i2c_driver = { + .driver = { + .name = "nxp-pf0900", + .of_match_table = pf0900_of_match, + }, + .probe = pf0900_i2c_probe, +}; + +module_i2c_driver(pf0900_i2c_driver); + +MODULE_AUTHOR("Joy Zou "); +MODULE_DESCRIPTION("NXP PF0900 Power Management IC driver"); +MODULE_LICENSE("GPL"); From 692abf80ce8443a4ded8cade25a1acf772436293 Mon Sep 17 00:00:00 2001 From: Waqar Hameed Date: Tue, 5 Aug 2025 11:33:37 +0200 Subject: [PATCH 0021/3206] spi: nxp-fspi: Remove error print for
devm_add_action_or_reset() When `devm_add_action_or_reset()` fails, it is due to a failed memory allocation and will thus return `-ENOMEM`. `dev_err_probe()` doesn't do anything when error is `-ENOMEM`. Therefore, remove the useless call to `dev_err_probe()` when `devm_add_action_or_reset()` fails, and just return the value instead. Signed-off-by: Waqar Hameed Link: https://patch.msgid.link/pndqzxqkt8u.a.out@axis.com Signed-off-by: Mark Brown --- drivers/spi/spi-nxp-fspi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index b92bfef47371fa..848fa9319e36d6 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -1284,7 +1284,7 @@ static int nxp_fspi_probe(struct platform_device *pdev) ret = devm_add_action_or_reset(dev, nxp_fspi_cleanup, f); if (ret) - return dev_err_probe(dev, ret, "Failed to register nxp_fspi_cleanup\n"); + return ret; return devm_spi_register_controller(&pdev->dev, ctlr); } From f1ac30a862cc5d53392d59f42b513e79948acb18 Mon Sep 17 00:00:00 2001 From: Darshan R Date: Mon, 28 Jul 2025 12:41:04 +0000 Subject: [PATCH 0022/3206] spi: sunplus: sp7021: Clean up coding style This patch tidies up minor coding style deviations within the Sunplus SP7021 SPI driver, ensuring closer adherence to established kernel coding guidelines. Specifically, it addresses: - Correction of a whitespace inconsistency before a comma in `writel()` calls. - Alignment of function parameter indentation for `struct spi_transfer *xfer` in `sp7021_spi_host_transfer_one()` and `sp7021_spi_target_transfer_one()`. While purely cosmetic, these adjustments contribute to improved code readability and maintainability, making future development and review easier. Signed-off-by: Darshan R. 
Link: https://patch.msgid.link/20250728124104.6370-1-rathod.darshan.0896@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-sunplus-sp7021.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-sunplus-sp7021.c b/drivers/spi/spi-sunplus-sp7021.c index 7fd4cc6f74c25e..256ae07db6becc 100644 --- a/drivers/spi/spi-sunplus-sp7021.c +++ b/drivers/spi/spi-sunplus-sp7021.c @@ -103,7 +103,7 @@ static irqreturn_t sp7021_spi_target_irq(int irq, void *dev) data_status = readl(pspim->s_base + SP7021_DATA_RDY_REG); data_status |= SP7021_SLAVE_CLR_INT; - writel(data_status , pspim->s_base + SP7021_DATA_RDY_REG); + writel(data_status, pspim->s_base + SP7021_DATA_RDY_REG); complete(&pspim->target_isr); return IRQ_HANDLED; } @@ -296,7 +296,7 @@ static void sp7021_spi_setup_clk(struct spi_controller *ctlr, struct spi_transfe } static int sp7021_spi_host_transfer_one(struct spi_controller *ctlr, struct spi_device *spi, - struct spi_transfer *xfer) + struct spi_transfer *xfer) { struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); unsigned long timeout = msecs_to_jiffies(1000); @@ -360,7 +360,7 @@ static int sp7021_spi_host_transfer_one(struct spi_controller *ctlr, struct spi_ } static int sp7021_spi_target_transfer_one(struct spi_controller *ctlr, struct spi_device *spi, - struct spi_transfer *xfer) + struct spi_transfer *xfer) { struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); struct device *dev = pspim->dev; From 379f819733f28b5aa1802f3ede442f08b7275f4e Mon Sep 17 00:00:00 2001 From: Manikandan Muralidharan Date: Wed, 30 Jul 2025 15:40:13 +0530 Subject: [PATCH 0023/3206] spi: atmel: simplify MR register update in cs_activate() Simplify the MR register configuration by updating only the PCS field using SPI_BFINS() instead of rewriting the entire register. This avoids code duplication. Signed-off-by: Manikandan Muralidharan Link: https://patch.msgid.link/20250730101015.323964-1-manikandan.m@microchip.com Signed-off-by: Mark Brown --- drivers/spi/spi-atmel.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/spi/spi-atmel.c b/drivers/spi/spi-atmel.c index 89a6b46cd3191a..409f544d898375 100644 --- a/drivers/spi/spi-atmel.c +++ b/drivers/spi/spi-atmel.c @@ -397,20 +397,10 @@ static void cs_activate(struct atmel_spi *as, struct spi_device *spi) * on CS1,2,3 needs SPI_CSR0.BITS config as SPI_CSR1,2,3.BITS */ spi_writel(as, CSR0, asd->csr); - if (as->caps.has_wdrbt) { - spi_writel(as, MR, - SPI_BF(PCS, ~(0x01 << chip_select)) - | SPI_BIT(WDRBT) - | SPI_BIT(MODFDIS) - | SPI_BIT(MSTR)); - } else { - spi_writel(as, MR, - SPI_BF(PCS, ~(0x01 << chip_select)) - | SPI_BIT(MODFDIS) - | SPI_BIT(MSTR)); - } mr = spi_readl(as, MR); + mr = SPI_BFINS(PCS, ~(0x01 << chip_select), mr); + spi_writel(as, MR, mr); /* * Ensures the clock polarity is valid before we actually From a673ebd0a2a587f3d6b923d59d363fc9e8a9f920 Mon Sep 17 00:00:00 2001 From: Manikandan Muralidharan Date: Wed, 30 Jul 2025 15:40:14 +0530 Subject: [PATCH 0024/3206] spi: dt-bindings: atmel,at91rm9200-spi: Add support for optional 'spi_gclk' clock Update the Atmel SPI DT binding to support an optional programmable SPI generic clock 'spi_gclk', in addition to the required 'spi_clk'.
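For reference, a consumer node using both clocks could look roughly like the sketch below (illustrative only: the unit address, register range and PMC clock specifiers are placeholders, not taken from a real board):

	spi@f8008000 {
		compatible = "atmel,at91rm9200-spi";
		reg = <0xf8008000 0x200>;
		clocks = <&pmc PMC_TYPE_PERIPHERAL 13>, <&pmc PMC_TYPE_GCK 13>;
		clock-names = "spi_clk", "spi_gclk";
	};

Nodes that provide only "spi_clk" stay valid, since minItems is 1.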
Signed-off-by: Manikandan Muralidharan Link: https://patch.msgid.link/20250730101015.323964-2-manikandan.m@microchip.com Signed-off-by: Mark Brown --- .../devicetree/bindings/spi/atmel,at91rm9200-spi.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/spi/atmel,at91rm9200-spi.yaml b/Documentation/devicetree/bindings/spi/atmel,at91rm9200-spi.yaml index d29772994cf5f1..11885d0cc2099f 100644 --- a/Documentation/devicetree/bindings/spi/atmel,at91rm9200-spi.yaml +++ b/Documentation/devicetree/bindings/spi/atmel,at91rm9200-spi.yaml @@ -31,11 +31,16 @@ properties: maxItems: 1 clock-names: - contains: - const: spi_clk + items: + - const: spi_clk + - const: spi_gclk + minItems: 1 clocks: - maxItems: 1 + items: + - description: Peripheral Bus clock + - description: Programmable Generic clock + minItems: 1 dmas: items: From 91e5722baaea06c1f1b105e9e3fd6645dbdd938b Mon Sep 17 00:00:00 2001 From: Manikandan Muralidharan Date: Wed, 30 Jul 2025 15:40:15 +0530 Subject: [PATCH 0025/3206] spi: atmel: Add support for handling GCLK as a clock source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SPI peripheral clock is typically used to derive the serial clock (SPCK) via the FLEX_SPI_CSRx.SCBR field. However, on platforms like the SAM9X7 SoC, where the peripheral clock can reach up to 266 MHz, this may exceed the SCBR limit, causing SPI transfers to fail. This patch adds support for using the SPI Generic Clock (GCLK) as an alternative and more flexible clock source for SPCK generation. The FLEX_SPI_MR.BRSRCCLK bit is updated accordingly to select between the peripheral clock and GCLK. Signed-off-by: Manikandan Muralidharan Link: https://patch.msgid.link/20250730101015.323964-3-manikandan.m@microchip.com Signed-off-by: Mark Brown --- drivers/spi/spi-atmel.c | 64 +++++++++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/drivers/spi/spi-atmel.c b/drivers/spi/spi-atmel.c index 409f544d898375..89977bff76d270 100644 --- a/drivers/spi/spi-atmel.c +++ b/drivers/spi/spi-atmel.c @@ -256,6 +256,7 @@ struct atmel_spi { void __iomem *regs; int irq; struct clk *clk; + struct clk *gclk; struct platform_device *pdev; unsigned long spi_clk; @@ -1480,6 +1481,8 @@ static void atmel_get_caps(struct atmel_spi *as) static void atmel_spi_init(struct atmel_spi *as) { + u32 mr = 0; + spi_writel(as, CR, SPI_BIT(SWRST)); spi_writel(as, CR, SPI_BIT(SWRST)); /* AT91SAM9263 Rev B workaround */ @@ -1487,12 +1490,17 @@ static void atmel_spi_init(struct atmel_spi *as) if (as->fifo_size) spi_writel(as, CR, SPI_BIT(FIFOEN)); - if (as->caps.has_wdrbt) { - spi_writel(as, MR, SPI_BIT(WDRBT) | SPI_BIT(MODFDIS) - | SPI_BIT(MSTR)); - } else { - spi_writel(as, MR, SPI_BIT(MSTR) | SPI_BIT(MODFDIS)); - } + /* + * If GCLK is selected as the source clock for the bit rate generation, + * enable the BRSRCCLK/FDIV/DIV32 bit. + */ + if (as->gclk) + mr |= SPI_BIT(FDIV); + + if (as->caps.has_wdrbt) + mr |= SPI_BIT(WDRBT); + + spi_writel(as, MR, mr | SPI_BIT(MODFDIS) | SPI_BIT(MSTR)); if (as->use_pdc) spi_writel(as, PTCR, SPI_BIT(RXTDIS) | SPI_BIT(TXTDIS)); @@ -1555,6 +1563,11 @@ static int atmel_spi_probe(struct platform_device *pdev) as->phybase = regs->start; as->irq = irq; as->clk = clk; + as->gclk = devm_clk_get_optional(&pdev->dev, "spi_gclk"); + if (IS_ERR(as->gclk)) { + ret = PTR_ERR(as->gclk); + goto out_unmap_regs; + } init_completion(&as->xfer_completion); @@ -1615,7 +1628,19 @@ static int
atmel_spi_probe(struct platform_device *pdev) if (ret) goto out_free_irq; - as->spi_clk = clk_get_rate(clk); + /* + * In cases where the peripheral clock is higher and the FLEX_SPI_CSRx.SCBR + * would exceed the threshold (SCBR ≤ 255), the GCLK is used as the source clock + * for the SPCK (SPI Serial Clock) bit rate generation. + */ + if (as->gclk) { + ret = clk_prepare_enable(as->gclk); + if (ret) + goto out_disable_clk; + as->spi_clk = clk_get_rate(as->gclk); + } else { + as->spi_clk = clk_get_rate(clk); + } as->fifo_size = 0; if (!of_property_read_u32(pdev->dev.of_node, "atmel,fifo-size", @@ -1650,6 +1675,8 @@ static int atmel_spi_probe(struct platform_device *pdev) spi_writel(as, CR, SPI_BIT(SWRST)); spi_writel(as, CR, SPI_BIT(SWRST)); /* AT91SAM9263 Rev B workaround */ + clk_disable_unprepare(as->gclk); +out_disable_clk: clk_disable_unprepare(clk); out_free_irq: out_unmap_regs: @@ -1685,6 +1712,8 @@ static void atmel_spi_remove(struct platform_device *pdev) spin_unlock_irq(&as->lock); clk_disable_unprepare(as->clk); + if (as->gclk) + clk_disable_unprepare(as->gclk); pm_runtime_put_noidle(&pdev->dev); pm_runtime_disable(&pdev->dev); @@ -1696,6 +1725,8 @@ static int atmel_spi_runtime_suspend(struct device *dev) struct atmel_spi *as = spi_controller_get_devdata(host); clk_disable_unprepare(as->clk); + if (as->gclk) + clk_disable_unprepare(as->gclk); pinctrl_pm_select_sleep_state(dev); return 0; @@ -1705,10 +1736,20 @@ static int atmel_spi_runtime_resume(struct device *dev) { struct spi_controller *host = dev_get_drvdata(dev); struct atmel_spi *as = spi_controller_get_devdata(host); + int ret; pinctrl_pm_select_default_state(dev); - return clk_prepare_enable(as->clk); + ret = clk_prepare_enable(as->clk); + if (ret) + return ret; + if (as->gclk) { + ret = clk_prepare_enable(as->gclk); + if (ret) + return ret; + } + + return 0; } static int atmel_spi_suspend(struct device *dev) @@ -1736,10 +1777,17 @@ static int atmel_spi_resume(struct device *dev) ret = clk_prepare_enable(as->clk); if (ret) return ret; + if (as->gclk) { + ret = clk_prepare_enable(as->gclk); + if (ret) + return ret; + } atmel_spi_init(as); clk_disable_unprepare(as->clk); + if (as->gclk) + clk_disable_unprepare(as->gclk); if (!pm_runtime_suspended(dev)) { ret = atmel_spi_runtime_resume(dev); From a1d0b0ae65ae3f32597edfbb547f16c75601cd87 Mon Sep 17 00:00:00 2001 From: Gabor Juhos Date: Fri, 8 Aug 2025 15:15:32 +0200 Subject: [PATCH 0026/3206] spi: spi-qpic-snand: avoid double assignment in qcom_spi_probe() The snandc->dev pointer is being assigned twice in the qcom_spi_probe() function. Remove the second assignment as that uses the same pointer value as the first one. No functional changes.
Signed-off-by: Gabor Juhos Link: https://patch.msgid.link/20250808-qpic-snand-double-assign-fix-v2-1-1a3d0ed0d404@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-qpic-snand.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/spi/spi-qpic-snand.c b/drivers/spi/spi-qpic-snand.c index 0ceaad7dba3cb5..5c72d8ed74bb4e 100644 --- a/drivers/spi/spi-qpic-snand.c +++ b/drivers/spi/spi-qpic-snand.c @@ -1549,7 +1549,6 @@ static int qcom_spi_probe(struct platform_device *pdev) } snandc->props = dev_data; - snandc->dev = &pdev->dev; snandc->core_clk = devm_clk_get(dev, "core"); if (IS_ERR(snandc->core_clk)) From 22571172257a55c443f1a9306e963da4c6187e83 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Wed, 12 Feb 2025 18:03:53 +0100 Subject: [PATCH 0027/3206] dt-bindings: dma: qcom: bam-dma: Add missing required properties num-channels and qcom,num-ees are required when there are no clocks specified in the device tree, because we have no reliable way to read them from the hardware registers if we cannot ensure the BAM hardware is up when the device is being probed. This has often been forgotten when adding new SoC device trees, so make this clear by describing this requirement in the schema. Signed-off-by: Stephan Gerhold Link: https://lore.kernel.org/r/20250212-bam-dma-fixes-v1-7-f560889e65d8@linaro.org Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml b/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml index f2f87f0f545bc5..6493a6968bb4b9 100644 --- a/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml +++ b/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml @@ -92,8 +92,12 @@ required: anyOf: - required: - qcom,powered-remotely + - num-channels + - qcom,num-ees - required: - qcom,controlled-remotely + - num-channels + - qcom,num-ees - required: - clocks - clock-names From 5068b5254812433e841a40886e695633148d362d Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Wed, 12 Feb 2025 18:03:54 +0100 Subject: [PATCH 0028/3206] dmaengine: qcom: bam_dma: Fix DT error handling for num-channels/ees When we don't have a clock specified in the device tree, we have no way to ensure the BAM is on. This is often the case for remotely-controlled or remotely-powered BAM instances. In this case, we need to read num-channels from the DT to have all the necessary information to complete probing. However, at the moment invalid device trees without clock and without num-channels still continue probing, because the error handling is missing return statements. The driver will then later try to read the number of channels from the registers. This is unsafe, because it relies on boot firmware and lucky timing to succeed. Unfortunately, the lack of proper error handling here has been abused for several Qualcomm SoCs upstream, causing early boot crashes in several situations [1, 2]. Avoid these early crashes by erroring out when any of the required DT properties are missing. Note that this will break some of the existing DTs upstream (mainly BAM instances related to the crypto engine). However, clearly these DTs have never been tested properly, since the error in the kernel log was just ignored. It's safer to disable the crypto engine for these broken DTBs. 
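As an illustration, a remotely-controlled BAM node must now spell out both properties; a minimal sketch follows (all addresses and values are placeholders, not taken from a real SoC):

	dma-controller@4044000 {
		compatible = "qcom,bam-v1.7.0";
		reg = <0x04044000 0x19000>;
		interrupts = <GIC_SPI 239 IRQ_TYPE_LEVEL_HIGH>;
		#dma-cells = <1>;
		qcom,ee = <0>;
		qcom,controlled-remotely;
		num-channels = <24>;
		qcom,num-ees = <4>;
	};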
[1]: https://lore.kernel.org/r/CY01EKQVWE36.B9X5TDXAREPF@fairphone.com/ [2]: https://lore.kernel.org/r/20230626145959.646747-1-krzysztof.kozlowski@linaro.org/ Cc: stable@vger.kernel.org Fixes: 48d163b1aa6e ("dmaengine: qcom: bam_dma: get num-channels and num-ees from dt") Signed-off-by: Stephan Gerhold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20250212-bam-dma-fixes-v1-8-f560889e65d8@linaro.org Signed-off-by: Vinod Koul --- drivers/dma/qcom/bam_dma.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c index bbc3276992bb01..2cf060174795fe 100644 --- a/drivers/dma/qcom/bam_dma.c +++ b/drivers/dma/qcom/bam_dma.c @@ -1283,13 +1283,17 @@ static int bam_dma_probe(struct platform_device *pdev) if (!bdev->bamclk) { ret = of_property_read_u32(pdev->dev.of_node, "num-channels", &bdev->num_channels); - if (ret) + if (ret) { dev_err(bdev->dev, "num-channels unspecified in dt\n"); + return ret; + } ret = of_property_read_u32(pdev->dev.of_node, "qcom,num-ees", &bdev->num_ees); - if (ret) + if (ret) { dev_err(bdev->dev, "num-ees unspecified in dt\n"); + return ret; + } } ret = clk_prepare_enable(bdev->bamclk); From 432418bf27ababad360d4c07d2efcd235107a971 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Mon, 21 Jul 2025 04:44:49 +0000 Subject: [PATCH 0029/3206] platform/chrome: cros_ec_chardev: Remove redundant struct field `ec_dev` in the `struct chardev_data` is unused. Remove the field and the whole struct as well. Link: https://lore.kernel.org/r/20250721044456.2736300-2-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_chardev.c | 28 +++++++++-------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_chardev.c b/drivers/platform/chrome/cros_ec_chardev.c index 21a484385fc5db..5c858d30dd524a 100644 --- a/drivers/platform/chrome/cros_ec_chardev.c +++ b/drivers/platform/chrome/cros_ec_chardev.c @@ -31,11 +31,6 @@ /* Arbitrary bounded size for the event queue */ #define CROS_MAX_EVENT_LEN PAGE_SIZE -struct chardev_data { - struct cros_ec_dev *ec_dev; - struct miscdevice misc; -}; - struct chardev_priv { struct cros_ec_dev *ec_dev; struct notifier_block notifier; @@ -379,29 +374,28 @@ static int cros_ec_chardev_probe(struct platform_device *pdev) { struct cros_ec_dev *ec_dev = dev_get_drvdata(pdev->dev.parent); struct cros_ec_platform *ec_platform = dev_get_platdata(ec_dev->dev); - struct chardev_data *data; + struct miscdevice *misc; /* Create a char device: we want to create it anew */ - data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); - if (!data) + misc = devm_kzalloc(&pdev->dev, sizeof(*misc), GFP_KERNEL); + if (!misc) return -ENOMEM; - data->ec_dev = ec_dev; - data->misc.minor = MISC_DYNAMIC_MINOR; - data->misc.fops = &chardev_fops; - data->misc.name = ec_platform->ec_name; - data->misc.parent = pdev->dev.parent; + misc->minor = MISC_DYNAMIC_MINOR; + misc->fops = &chardev_fops; + misc->name = ec_platform->ec_name; + misc->parent = pdev->dev.parent; - dev_set_drvdata(&pdev->dev, data); + dev_set_drvdata(&pdev->dev, misc); - return misc_register(&data->misc); + return misc_register(misc); } static void cros_ec_chardev_remove(struct platform_device *pdev) { - struct chardev_data *data = dev_get_drvdata(&pdev->dev); + struct miscdevice *misc = dev_get_drvdata(&pdev->dev); - misc_deregister(&data->misc); + misc_deregister(misc); } static const struct platform_device_id cros_ec_chardev_id[] = { From 
032c59c1d3cd456053339fb95f6ee8e68c5ad1e2 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Mon, 21 Jul 2025 04:44:51 +0000 Subject: [PATCH 0030/3206] platform/chrome: cros_ec_chardev: Decouple fops from struct cros_ec_dev The fops doesn't really need to hold a reference to struct cros_ec_dev. Remove the references from the fops. No functional changes. Link: https://lore.kernel.org/r/20250721044456.2736300-4-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_chardev.c | 44 +++++++++++------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_chardev.c b/drivers/platform/chrome/cros_ec_chardev.c index 5c858d30dd524a..c9d80ad5b57e18 100644 --- a/drivers/platform/chrome/cros_ec_chardev.c +++ b/drivers/platform/chrome/cros_ec_chardev.c @@ -32,12 +32,13 @@ #define CROS_MAX_EVENT_LEN PAGE_SIZE struct chardev_priv { - struct cros_ec_dev *ec_dev; + struct cros_ec_device *ec_dev; struct notifier_block notifier; wait_queue_head_t wait_event; unsigned long event_mask; struct list_head events; size_t event_len; + u16 cmd_offset; }; struct ec_event { @@ -47,7 +48,7 @@ struct ec_event { u8 data[]; }; -static int ec_get_version(struct cros_ec_dev *ec, char *str, int maxlen) +static int ec_get_version(struct chardev_priv *priv, char *str, int maxlen) { static const char * const current_image_name[] = { "unknown", "read-only", "read-write", "invalid", @@ -60,10 +61,10 @@ static int ec_get_version(struct cros_ec_dev *ec, char *str, int maxlen) if (!msg) return -ENOMEM; - msg->command = EC_CMD_GET_VERSION + ec->cmd_offset; + msg->command = EC_CMD_GET_VERSION + priv->cmd_offset; msg->insize = sizeof(*resp); - ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg); + ret = cros_ec_cmd_xfer_status(priv->ec_dev, msg); if (ret < 0) { snprintf(str, maxlen, "Unknown EC version, returned error: %d\n", @@ -91,7 +92,7 @@ static int cros_ec_chardev_mkbp_event(struct notifier_block *nb, { struct chardev_priv *priv = container_of(nb, struct chardev_priv, notifier); - struct cros_ec_device *ec_dev = priv->ec_dev->ec_dev; + struct cros_ec_device *ec_dev = priv->ec_dev; struct ec_event *event; unsigned long event_bit = 1 << ec_dev->event_data.event_type; int total_size = sizeof(*event) + ec_dev->event_size; @@ -156,7 +157,8 @@ static struct ec_event *cros_ec_chardev_fetch_event(struct chardev_priv *priv, static int cros_ec_chardev_open(struct inode *inode, struct file *filp) { struct miscdevice *mdev = filp->private_data; - struct cros_ec_dev *ec_dev = dev_get_drvdata(mdev->parent); + struct cros_ec_dev *ec = dev_get_drvdata(mdev->parent); + struct cros_ec_device *ec_dev = ec->ec_dev; struct chardev_priv *priv; int ret; @@ -165,13 +167,14 @@ static int cros_ec_chardev_open(struct inode *inode, struct file *filp) return -ENOMEM; priv->ec_dev = ec_dev; + priv->cmd_offset = ec->cmd_offset; filp->private_data = priv; INIT_LIST_HEAD(&priv->events); init_waitqueue_head(&priv->wait_event); nonseekable_open(inode, filp); priv->notifier.notifier_call = cros_ec_chardev_mkbp_event; - ret = blocking_notifier_chain_register(&ec_dev->ec_dev->event_notifier, + ret = blocking_notifier_chain_register(&ec_dev->event_notifier, &priv->notifier); if (ret) { dev_err(ec_dev->dev, "failed to register event notifier\n"); @@ -199,7 +202,6 @@ static ssize_t cros_ec_chardev_read(struct file *filp, char __user *buffer, char msg[sizeof(struct ec_response_get_version) + sizeof(CROS_EC_DEV_VERSION)]; struct chardev_priv *priv = filp->private_data; - struct cros_ec_dev *ec_dev = 
priv->ec_dev; size_t count; int ret; @@ -233,7 +235,7 @@ static ssize_t cros_ec_chardev_read(struct file *filp, char __user *buffer, if (*offset != 0) return 0; - ret = ec_get_version(ec_dev, msg, sizeof(msg)); + ret = ec_get_version(priv, msg, sizeof(msg)); if (ret) return ret; @@ -249,10 +251,10 @@ static ssize_t cros_ec_chardev_read(struct file *filp, char __user *buffer, static int cros_ec_chardev_release(struct inode *inode, struct file *filp) { struct chardev_priv *priv = filp->private_data; - struct cros_ec_dev *ec_dev = priv->ec_dev; + struct cros_ec_device *ec_dev = priv->ec_dev; struct ec_event *event, *e; - blocking_notifier_chain_unregister(&ec_dev->ec_dev->event_notifier, + blocking_notifier_chain_unregister(&ec_dev->event_notifier, &priv->notifier); list_for_each_entry_safe(event, e, &priv->events, node) { @@ -267,7 +269,7 @@ static int cros_ec_chardev_release(struct inode *inode, struct file *filp) /* * Ioctls */ -static long cros_ec_chardev_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg) +static long cros_ec_chardev_ioctl_xcmd(struct chardev_priv *priv, void __user *arg) { struct cros_ec_command *s_cmd; struct cros_ec_command u_cmd; @@ -296,8 +298,8 @@ static long cros_ec_chardev_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg) goto exit; } - s_cmd->command += ec->cmd_offset; - ret = cros_ec_cmd_xfer(ec->ec_dev, s_cmd); + s_cmd->command += priv->cmd_offset; + ret = cros_ec_cmd_xfer(priv->ec_dev, s_cmd); /* Only copy data to userland if data was received. */ if (ret < 0) goto exit; @@ -309,10 +311,9 @@ static long cros_ec_chardev_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg) return ret; } -static long cros_ec_chardev_ioctl_readmem(struct cros_ec_dev *ec, - void __user *arg) +static long cros_ec_chardev_ioctl_readmem(struct chardev_priv *priv, void __user *arg) { - struct cros_ec_device *ec_dev = ec->ec_dev; + struct cros_ec_device *ec_dev = priv->ec_dev; struct cros_ec_readmem s_mem = { }; long num; @@ -341,16 +342,15 @@ static long cros_ec_chardev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct chardev_priv *priv = filp->private_data; - struct cros_ec_dev *ec = priv->ec_dev; if (_IOC_TYPE(cmd) != CROS_EC_DEV_IOC) return -ENOTTY; switch (cmd) { case CROS_EC_DEV_IOCXCMD: - return cros_ec_chardev_ioctl_xcmd(ec, (void __user *)arg); + return cros_ec_chardev_ioctl_xcmd(priv, (void __user *)arg); case CROS_EC_DEV_IOCRDMEM: - return cros_ec_chardev_ioctl_readmem(ec, (void __user *)arg); + return cros_ec_chardev_ioctl_readmem(priv, (void __user *)arg); case CROS_EC_DEV_IOCEVENTMASK: priv->event_mask = arg; return 0; @@ -372,8 +372,8 @@ static const struct file_operations chardev_fops = { static int cros_ec_chardev_probe(struct platform_device *pdev) { - struct cros_ec_dev *ec_dev = dev_get_drvdata(pdev->dev.parent); - struct cros_ec_platform *ec_platform = dev_get_platdata(ec_dev->dev); + struct cros_ec_dev *ec = dev_get_drvdata(pdev->dev.parent); + struct cros_ec_platform *ec_platform = dev_get_platdata(ec->dev); struct miscdevice *misc; /* Create a char device: we want to create it anew */ From 03db20aaa3ba3998b5a025e243b04e33b5bdefa5 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Mon, 28 Jul 2025 08:05:43 +0200 Subject: [PATCH 0031/3206] gpio: stmpe: Allow to compile as a module Add the necessary boilerplate to also make this driver modular. Keep the subsys_initcall to not change registration order for built-in. Also add OF match table for module autoloading. 
Signed-off-by: Alexander Stein Link: https://lore.kernel.org/r/20250728060544.18169-1-alexander.stein@ew.tq-group.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 2 +- drivers/gpio/gpio-stmpe.c | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index e43abb322fa6e1..a437fe652dbc63 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1559,7 +1559,7 @@ config GPIO_SL28CPLD called gpio-sl28cpld. config GPIO_STMPE - bool "STMPE GPIOs" + tristate "STMPE GPIOs" depends on MFD_STMPE depends on OF_GPIO select GPIOLIB_IRQCHIP diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c index 5dd4c21a8e601b..7bf270af07fe49 100644 --- a/drivers/gpio/gpio-stmpe.c +++ b/drivers/gpio/gpio-stmpe.c @@ -534,10 +534,16 @@ static int stmpe_gpio_probe(struct platform_device *pdev) return devm_gpiochip_add_data(dev, &stmpe_gpio->chip, stmpe_gpio); } +static const struct of_device_id stmpe_gpio_of_matches[] = { + { .compatible = "st,stmpe-gpio", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, stmpe_gpio_of_matches); + static struct platform_driver stmpe_gpio_driver = { .driver = { - .suppress_bind_attrs = true, - .name = "stmpe-gpio", + .name = "stmpe-gpio", + .of_match_table = stmpe_gpio_of_matches, }, .probe = stmpe_gpio_probe, }; @@ -547,3 +553,13 @@ static int __init stmpe_gpio_init(void) return platform_driver_register(&stmpe_gpio_driver); } subsys_initcall(stmpe_gpio_init); + +static void __exit stmpe_gpio_exit(void) +{ + platform_driver_unregister(&stmpe_gpio_driver); +} +module_exit(stmpe_gpio_exit); + +MODULE_DESCRIPTION("STMPE expander GPIO"); +MODULE_AUTHOR("Rabin Vincent "); +MODULE_LICENSE("GPL"); From 7ce73ee6dc5977fd9dbe8194e7e8073a2f9a50fb Mon Sep 17 00:00:00 2001 From: Li Jun Date: Thu, 31 Jul 2025 17:16:37 +0800 Subject: [PATCH 0032/3206] gpio: aggregator: fix macros coding style error These changes just fix Linux kernel coding style; there is no functional change. - Macros with complex values should be enclosed in parentheses Signed-off-by: Li Jun Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250731091637.595136-1-lijun01@kylinos.cn Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index af9d8b3a711dad..f83aa60d8945a9 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -247,8 +247,8 @@ struct gpiochip_fwd { unsigned long tmp[]; /* values and descs for multiple ops */ }; -#define fwd_tmp_values(fwd) &(fwd)->tmp[0] -#define fwd_tmp_descs(fwd) (void *)&(fwd)->tmp[BITS_TO_LONGS((fwd)->chip.ngpio)] +#define fwd_tmp_values(fwd) (&(fwd)->tmp[0]) +#define fwd_tmp_descs(fwd) ((void *)&(fwd)->tmp[BITS_TO_LONGS((fwd)->chip.ngpio)]) #define fwd_tmp_size(ngpios) (BITS_TO_LONGS((ngpios)) + (ngpios)) From 0d1e68fb1efd2fe951d68cf05460e3672d4a846d Mon Sep 17 00:00:00 2001 From: Waqar Hameed Date: Tue, 5 Aug 2025 11:33:33 +0200 Subject: [PATCH 0033/3206] gpio: twl4030: Remove error print for devm_add_action_or_reset() When `devm_add_action_or_reset()` fails, it is due to a failed memory allocation and will thus return `-ENOMEM`. `dev_err_probe()` doesn't do anything when error is `-ENOMEM`. Therefore, remove the useless call to `dev_err_probe()` when `devm_add_action_or_reset()` fails, and just return the value instead.
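For context, the behaviour relied on here can be sketched as follows (a simplified model, not the actual implementation of dev_err_probe()):

	int dev_err_probe(const struct device *dev, int err, const char *fmt, ...)
	{
		/* Memory-allocation failures and probe deferral produce no error print. */
		if (err != -ENOMEM && err != -EPROBE_DEFER)
			dev_err(dev, "error %pe: ...", ERR_PTR(err));
		return err;
	}

so dropping the wrapper on an -ENOMEM-only path loses no diagnostics.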
Signed-off-by: Waqar Hameed Link: https://lore.kernel.org/r/pndjz3im7te.a.out@axis.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-twl4030.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-twl4030.c b/drivers/gpio/gpio-twl4030.c index a33dc7c7e7a08d..a851702befdea7 100644 --- a/drivers/gpio/gpio-twl4030.c +++ b/drivers/gpio/gpio-twl4030.c @@ -597,9 +597,7 @@ static int gpio_twl4030_probe(struct platform_device *pdev) ret = devm_add_action_or_reset(&pdev->dev, gpio_twl4030_power_off_action, d); if (ret) - return dev_err_probe(&pdev->dev, ret, - "failed to install power off handler\n"); - + return ret; } return 0; From 8912b2862b9b74a0dc4e3ea1aacdec2f8abd7e1d Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 9 Jul 2025 17:08:13 +0100 Subject: [PATCH 0034/3206] pinctrl: renesas: rzg2l: Fix invalid unsigned return in rzg3s_oen_read() rzg3s_oen_read() returns a u32 value, but previously propagated a negative error code from rzg3s_pin_to_oen_bit(), resulting in an unintended large positive value due to unsigned conversion. This caused incorrect output-enable reporting for certain pins. Without this patch, pins P1_0-P1_4 and P7_0-P7_4 are incorrectly reported as "output enabled" in the pinconf-pins debugfs file. With this fix, only P1_0-P1_1 and P7_0-P7_1 are shown as "output enabled", which matches the hardware manual. Fix this by returning 0 when the OEN bit lookup fails, treating the pin as output-disabled by default. Fixes: a9024a323af2 ("pinctrl: renesas: rzg2l: Clean up and refactor OEN read/write functions") Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250709160819.306875-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index c52263c2a7b093..22bc5b8f65fdee 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -1124,7 +1124,7 @@ static u32 rzg3s_oen_read(struct rzg2l_pinctrl *pctrl, unsigned int _pin) bit = rzg3s_pin_to_oen_bit(pctrl, _pin); if (bit < 0) - return bit; + return 0; return !(readb(pctrl->base + ETH_MODE) & BIT(bit)); } From 27c76cc93c6c2ee6c0d7fec6dedc1214a977e66f Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 6 Aug 2025 20:55:50 +0100 Subject: [PATCH 0035/3206] pinctrl: renesas: rzg2l: Parameterize OEN register offset Prepare for supporting SoCs with varying OEN register locations by parameterizing the OEN offset in the rzg2l driver. Introduce an `oen` field in the rzg2l_register_offsets structure and update rzg2l_read_oen(), rzg2l_write_oen(), suspend/resume caching, and SoC hwcfg entries to use this offset instead of the hard-coded ETH_MODE value. As part of this change, rename the field `eth_mode` in the register cache to `oen` to better reflect its general purpose and decouple the naming from a specific register. 
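With the offset in place, the generic accessors boil down to reads and writes of the per-SoC register, along these lines (a sketch of the pattern used in the diff below):

	u16 oen_offset = pctrl->data->hwcfg->regs.oen;
	u8 val = readb(pctrl->base + oen_offset);

instead of hard-coding the ETH_MODE address.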
Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250806195555.1372317-3-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 29 +++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index 22bc5b8f65fdee..55f40103b7da43 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -146,7 +146,6 @@ #define SD_CH(off, ch) ((off) + (ch) * 4) #define ETH_POC(off, ch) ((off) + (ch) * 4) #define QSPI (0x3008) -#define ETH_MODE (0x3018) #define PFC_OEN (0x3C40) /* known on RZ/V2H(P) only */ #define PVDD_2500 2 /* I/O domain voltage 2.5V */ @@ -221,11 +220,13 @@ static const struct pin_config_item renesas_rzv2h_conf_items[] = { * @pwpr: PWPR register offset * @sd_ch: SD_CH register offset * @eth_poc: ETH_POC register offset + * @oen: OEN register offset */ struct rzg2l_register_offsets { u16 pwpr; u16 sd_ch; u16 eth_poc; + u16 oen; }; /** @@ -322,7 +323,7 @@ struct rzg2l_pinctrl_pin_settings { * @ien: IEN registers cache * @sd_ch: SD_CH registers cache * @eth_poc: ET_POC registers cache - * @eth_mode: ETH_MODE register cache + * @oen: Output Enable register cache * @qspi: QSPI registers cache */ struct rzg2l_pinctrl_reg_cache { @@ -335,7 +336,7 @@ struct rzg2l_pinctrl_reg_cache { u32 *pupd[2]; u8 sd_ch[2]; u8 eth_poc[2]; - u8 eth_mode; + u8 oen; u8 qspi; }; @@ -1073,11 +1074,12 @@ static u32 rzg2l_read_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin) if (bit < 0) return 0; - return !(readb(pctrl->base + ETH_MODE) & BIT(bit)); + return !(readb(pctrl->base + pctrl->data->hwcfg->regs.oen) & BIT(bit)); } static int rzg2l_write_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen) { + u16 oen_offset = pctrl->data->hwcfg->regs.oen; unsigned long flags; int bit; u8 val; @@ -1087,12 +1089,12 @@ static int rzg2l_write_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oe return bit; spin_lock_irqsave(&pctrl->lock, flags); - val = readb(pctrl->base + ETH_MODE); + val = readb(pctrl->base + oen_offset); if (oen) val &= ~BIT(bit); else val |= BIT(bit); - writeb(val, pctrl->base + ETH_MODE); + writeb(val, pctrl->base + oen_offset); spin_unlock_irqrestore(&pctrl->lock, flags); return 0; @@ -1126,11 +1128,12 @@ static u32 rzg3s_oen_read(struct rzg2l_pinctrl *pctrl, unsigned int _pin) if (bit < 0) return 0; - return !(readb(pctrl->base + ETH_MODE) & BIT(bit)); + return !(readb(pctrl->base + pctrl->data->hwcfg->regs.oen) & BIT(bit)); } static int rzg3s_oen_write(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen) { + u16 oen_offset = pctrl->data->hwcfg->regs.oen; unsigned long flags; int bit; u8 val; @@ -1140,12 +1143,12 @@ static int rzg3s_oen_write(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oe return bit; spin_lock_irqsave(&pctrl->lock, flags); - val = readb(pctrl->base + ETH_MODE); + val = readb(pctrl->base + oen_offset); if (oen) val &= ~BIT(bit); else val |= BIT(bit); - writeb(val, pctrl->base + ETH_MODE); + writeb(val, pctrl->base + oen_offset); spin_unlock_irqrestore(&pctrl->lock, flags); return 0; @@ -3164,7 +3167,8 @@ static int rzg2l_pinctrl_suspend_noirq(struct device *dev) } cache->qspi = readb(pctrl->base + QSPI); - cache->eth_mode = readb(pctrl->base + ETH_MODE); + if (pctrl->data->hwcfg->regs.oen) + cache->oen = readb(pctrl->base + pctrl->data->hwcfg->regs.oen); if (!atomic_read(&pctrl->wakeup_path)) 
clk_disable_unprepare(pctrl->clk); @@ -3189,7 +3193,8 @@ static int rzg2l_pinctrl_resume_noirq(struct device *dev) } writeb(cache->qspi, pctrl->base + QSPI); - writeb(cache->eth_mode, pctrl->base + ETH_MODE); + if (pctrl->data->hwcfg->regs.oen) + writeb(cache->oen, pctrl->base + pctrl->data->hwcfg->regs.oen); for (u8 i = 0; i < 2; i++) { if (regs->sd_ch) writeb(cache->sd_ch[i], pctrl->base + SD_CH(regs->sd_ch, i)); @@ -3241,6 +3246,7 @@ static const struct rzg2l_hwcfg rzg2l_hwcfg = { .pwpr = 0x3014, .sd_ch = 0x3000, .eth_poc = 0x300c, + .oen = 0x3018, }, .iolh_groupa_ua = { /* 3v3 power source */ @@ -3256,6 +3262,7 @@ static const struct rzg2l_hwcfg rzg3s_hwcfg = { .pwpr = 0x3000, .sd_ch = 0x3004, .eth_poc = 0x3010, + .oen = 0x3018, }, .iolh_groupa_ua = { /* 1v8 power source */ From a3455a0c24834eaa20f032b214d53b1607d60073 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 6 Aug 2025 20:55:51 +0100 Subject: [PATCH 0036/3206] pinctrl: renesas: rzg2l: Unify OEN access by making pin-to-bit mapping configurable Refactor the RZG2L pinctrl driver to support reuse of the common rzg2l_read_oen() and rzg2l_write_oen() helpers across SoCs with different output-enable (OEN) bit mappings. Introduce a new `pin_to_oen_bit` callback in `struct rzg2l_pinctrl_data` to allow SoCs to provide custom logic for mapping a pin to its OEN bit. Update the generic OEN read/write paths to use this callback when present. With this change, SoCs like RZ/G3S can reuse the common OEN handling code by simply supplying their own `pin_to_oen_bit` implementation. The previously duplicated `rzg3s_oen_read()` and `rzg3s_oen_write()` functions are now removed. This improves maintainability and prepares the driver for supporting future SoCs with minimal duplication. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250806195555.1372317-4-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 54 +++++++------------------ 1 file changed, 15 insertions(+), 39 deletions(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index 55f40103b7da43..0245c657a3ad12 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -296,6 +296,7 @@ struct rzg2l_pinctrl_data { #endif void (*pwpr_pfc_lock_unlock)(struct rzg2l_pinctrl *pctrl, bool lock); void (*pmc_writeb)(struct rzg2l_pinctrl *pctrl, u8 val, u16 offset); + int (*pin_to_oen_bit)(struct rzg2l_pinctrl *pctrl, unsigned int _pin); u32 (*oen_read)(struct rzg2l_pinctrl *pctrl, unsigned int _pin); int (*oen_write)(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen); int (*hw_to_bias_param)(unsigned int val); @@ -1070,7 +1071,10 @@ static u32 rzg2l_read_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin) { int bit; - bit = rzg2l_pin_to_oen_bit(pctrl, _pin); + if (!pctrl->data->pin_to_oen_bit) + return 0; + + bit = pctrl->data->pin_to_oen_bit(pctrl, _pin); if (bit < 0) return 0; @@ -1084,9 +1088,12 @@ static int rzg2l_write_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oe int bit; u8 val; - bit = rzg2l_pin_to_oen_bit(pctrl, _pin); + if (!pctrl->data->pin_to_oen_bit) + return -EINVAL; + + bit = pctrl->data->pin_to_oen_bit(pctrl, _pin); if (bit < 0) - return bit; + return -EINVAL; spin_lock_irqsave(&pctrl->lock, flags); val = readb(pctrl->base + oen_offset); @@ -1120,40 +1127,6 @@ static int rzg3s_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) return bit; } -static u32 
rzg3s_oen_read(struct rzg2l_pinctrl *pctrl, unsigned int _pin) -{ - int bit; - - bit = rzg3s_pin_to_oen_bit(pctrl, _pin); - if (bit < 0) - return 0; - - return !(readb(pctrl->base + pctrl->data->hwcfg->regs.oen) & BIT(bit)); -} - -static int rzg3s_oen_write(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen) -{ - u16 oen_offset = pctrl->data->hwcfg->regs.oen; - unsigned long flags; - int bit; - u8 val; - - bit = rzg3s_pin_to_oen_bit(pctrl, _pin); - if (bit < 0) - return bit; - - spin_lock_irqsave(&pctrl->lock, flags); - val = readb(pctrl->base + oen_offset); - if (oen) - val &= ~BIT(bit); - else - val |= BIT(bit); - writeb(val, pctrl->base + oen_offset); - spin_unlock_irqrestore(&pctrl->lock, flags); - - return 0; -} - static int rzg2l_hw_to_bias_param(unsigned int bias) { switch (bias) { @@ -3312,6 +3285,7 @@ static struct rzg2l_pinctrl_data r9a07g043_data = { #endif .pwpr_pfc_lock_unlock = &rzg2l_pwpr_pfc_lock_unlock, .pmc_writeb = &rzg2l_pmc_writeb, + .pin_to_oen_bit = &rzg2l_pin_to_oen_bit, .oen_read = &rzg2l_read_oen, .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzg2l_hw_to_bias_param, @@ -3329,6 +3303,7 @@ static struct rzg2l_pinctrl_data r9a07g044_data = { .hwcfg = &rzg2l_hwcfg, .pwpr_pfc_lock_unlock = &rzg2l_pwpr_pfc_lock_unlock, .pmc_writeb = &rzg2l_pmc_writeb, + .pin_to_oen_bit = &rzg2l_pin_to_oen_bit, .oen_read = &rzg2l_read_oen, .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzg2l_hw_to_bias_param, @@ -3345,8 +3320,9 @@ static struct rzg2l_pinctrl_data r9a08g045_data = { .hwcfg = &rzg3s_hwcfg, .pwpr_pfc_lock_unlock = &rzg2l_pwpr_pfc_lock_unlock, .pmc_writeb = &rzg2l_pmc_writeb, - .oen_read = &rzg3s_oen_read, - .oen_write = &rzg3s_oen_write, + .pin_to_oen_bit = &rzg3s_pin_to_oen_bit, + .oen_read = &rzg2l_read_oen, + .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzg2l_hw_to_bias_param, .bias_param_to_hw = &rzg2l_bias_param_to_hw, }; From dd0d40d8f4ac8c2cb5967fdb2e1efa70cfc0c9ee Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 6 Aug 2025 20:55:52 +0100 Subject: [PATCH 0037/3206] pinctrl: renesas: rzg2l: Remove OEN ops for RZ/G3E The RZ/G3E pin controller does not advertise PIN_CFG_OEN capability, so there is no valid mapping for output-enable bits on this SoC. Remove the oen_read and oen_write callbacks from the RZ/G3E driver data to defer OEN support until PIN_CFG_OEN support is added. This is a preparatory change for future unification of OEN handling across the driver. 
Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250806195555.1372317-5-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index 0245c657a3ad12..3aa552e1d7abb4 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -3344,8 +3344,6 @@ static struct rzg2l_pinctrl_data r9a09g047_data = { #endif .pwpr_pfc_lock_unlock = &rzv2h_pwpr_pfc_lock_unlock, .pmc_writeb = &rzv2h_pmc_writeb, - .oen_read = &rzv2h_oen_read, - .oen_write = &rzv2h_oen_write, .hw_to_bias_param = &rzv2h_hw_to_bias_param, .bias_param_to_hw = &rzv2h_bias_param_to_hw, }; From cd39805be85b8ff45b0ad2715d8de48dbe404cee Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 6 Aug 2025 20:55:53 +0100 Subject: [PATCH 0038/3206] pinctrl: renesas: rzg2l: Unify OEN handling across RZ/{G2L,V2H,V2N} Unify the OEN handling on RZ/V2H(P) and RZ/V2N SoCs by reusing the existing rzg2l_read_oen and rzg2l_write_oen functions from RZ/G2L. Add a pin_to_oen_bit callback in rzg2l_pinctrl_data to look up per-pin OEN bit positions, and introduce an oen_pwpr_lock flag in the hwcfg to manage PWPR locking on SoCs that require it (RZ/V2H(P) family). Remove the hardcoded PFC_OEN define and obsolete per-SoC OEN helpers. Also drop redundant checks for the OEN offset in the suspend/resume paths, as all supported SoCs now provide a valid offset through the `regs.oen` field. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250806195555.1372317-6-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 68 ++++++++----------------- 1 file changed, 22 insertions(+), 46 deletions(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index 3aa552e1d7abb4..bb79d08482e105 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -146,7 +146,6 @@ #define SD_CH(off, ch) ((off) + (ch) * 4) #define ETH_POC(off, ch) ((off) + (ch) * 4) #define QSPI (0x3008) -#define PFC_OEN (0x3C40) /* known on RZ/V2H(P) only */ #define PVDD_2500 2 /* I/O domain voltage 2.5V */ #define PVDD_1800 1 /* I/O domain voltage <= 1.8V */ @@ -255,6 +254,7 @@ enum rzg2l_iolh_index { * @iolh_groupb_oi: IOLH group B output impedance specific values * @tint_start_index: the start index for the TINT interrupts * @drive_strength_ua: drive strength in uA is supported (otherwise mA is supported) + * @oen_pwpr_lock: flag indicating if the OEN register is locked by PWPR * @func_base: base number for port function (see register PFC) * @oen_max_pin: the maximum pin number supporting output enable * @oen_max_port: the maximum port number supporting output enable @@ -267,6 +267,7 @@ struct rzg2l_hwcfg { u16 iolh_groupb_oi[4]; u16 tint_start_index; bool drive_strength_ua; + bool oen_pwpr_lock; u8 func_base; u8 oen_max_pin; u8 oen_max_port; @@ -1083,10 +1084,11 @@ static u32 rzg2l_read_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin) static int rzg2l_write_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen) { + const struct rzg2l_register_offsets *regs = &pctrl->data->hwcfg->regs; u16 oen_offset = pctrl->data->hwcfg->regs.oen; unsigned long flags; + u8 val, pwpr; int bit; - u8 val; if (!pctrl->data->pin_to_oen_bit) return 
-EINVAL; @@ -1101,7 +1103,13 @@ static int rzg2l_write_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oe val &= ~BIT(bit); else val |= BIT(bit); + if (pctrl->data->hwcfg->oen_pwpr_lock) { + pwpr = readb(pctrl->base + regs->pwpr); + writeb(pwpr | PWPR_REGWE_B, pctrl->base + regs->pwpr); + } writeb(val, pctrl->base + oen_offset); + if (pctrl->data->hwcfg->oen_pwpr_lock) + writeb(pwpr & ~PWPR_REGWE_B, pctrl->base + regs->pwpr); spin_unlock_irqrestore(&pctrl->lock, flags); return 0; @@ -1192,7 +1200,7 @@ static int rzv2h_bias_param_to_hw(enum pin_config_param param) return -EINVAL; } -static u8 rzv2h_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) +static int rzv2h_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) { static const char * const pin_names[] = { "ET0_TXC_TXCLK", "ET1_TXC_TXCLK", "XSPI0_RESET0N", "XSPI0_CS0N", @@ -1206,41 +1214,7 @@ static u8 rzv2h_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) } /* Should not happen. */ - return 0; -} - -static u32 rzv2h_oen_read(struct rzg2l_pinctrl *pctrl, unsigned int _pin) -{ - u8 bit; - - bit = rzv2h_pin_to_oen_bit(pctrl, _pin); - - return !(readb(pctrl->base + PFC_OEN) & BIT(bit)); -} - -static int rzv2h_oen_write(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen) -{ - const struct rzg2l_hwcfg *hwcfg = pctrl->data->hwcfg; - const struct rzg2l_register_offsets *regs = &hwcfg->regs; - unsigned long flags; - u8 val, bit; - u8 pwpr; - - bit = rzv2h_pin_to_oen_bit(pctrl, _pin); - spin_lock_irqsave(&pctrl->lock, flags); - val = readb(pctrl->base + PFC_OEN); - if (oen) - val &= ~BIT(bit); - else - val |= BIT(bit); - - pwpr = readb(pctrl->base + regs->pwpr); - writeb(pwpr | PWPR_REGWE_B, pctrl->base + regs->pwpr); - writeb(val, pctrl->base + PFC_OEN); - writeb(pwpr & ~PWPR_REGWE_B, pctrl->base + regs->pwpr); - spin_unlock_irqrestore(&pctrl->lock, flags); - - return 0; + return -EINVAL; } static int rzg2l_pinctrl_pinconf_get(struct pinctrl_dev *pctldev, @@ -3140,8 +3114,7 @@ static int rzg2l_pinctrl_suspend_noirq(struct device *dev) } cache->qspi = readb(pctrl->base + QSPI); - if (pctrl->data->hwcfg->regs.oen) - cache->oen = readb(pctrl->base + pctrl->data->hwcfg->regs.oen); + cache->oen = readb(pctrl->base + pctrl->data->hwcfg->regs.oen); if (!atomic_read(&pctrl->wakeup_path)) clk_disable_unprepare(pctrl->clk); @@ -3166,8 +3139,7 @@ static int rzg2l_pinctrl_resume_noirq(struct device *dev) } writeb(cache->qspi, pctrl->base + QSPI); - if (pctrl->data->hwcfg->regs.oen) - writeb(cache->oen, pctrl->base + pctrl->data->hwcfg->regs.oen); + writeb(cache->oen, pctrl->base + pctrl->data->hwcfg->regs.oen); for (u8 i = 0; i < 2; i++) { if (regs->sd_ch) writeb(cache->sd_ch[i], pctrl->base + SD_CH(regs->sd_ch, i)); @@ -3267,8 +3239,10 @@ static const struct rzg2l_hwcfg rzg3s_hwcfg = { static const struct rzg2l_hwcfg rzv2h_hwcfg = { .regs = { .pwpr = 0x3c04, + .oen = 0x3c40, }, .tint_start_index = 17, + .oen_pwpr_lock = true, }; static struct rzg2l_pinctrl_data r9a07g043_data = { @@ -3365,8 +3339,9 @@ static struct rzg2l_pinctrl_data r9a09g056_data = { #endif .pwpr_pfc_lock_unlock = &rzv2h_pwpr_pfc_lock_unlock, .pmc_writeb = &rzv2h_pmc_writeb, - .oen_read = &rzv2h_oen_read, - .oen_write = &rzv2h_oen_write, + .pin_to_oen_bit = &rzv2h_pin_to_oen_bit, + .oen_read = &rzg2l_read_oen, + .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzv2h_hw_to_bias_param, .bias_param_to_hw = &rzv2h_bias_param_to_hw, }; @@ -3389,8 +3364,9 @@ static struct rzg2l_pinctrl_data r9a09g057_data = { #endif .pwpr_pfc_lock_unlock = 
&rzv2h_pwpr_pfc_lock_unlock, .pmc_writeb = &rzv2h_pmc_writeb, - .oen_read = &rzv2h_oen_read, - .oen_write = &rzv2h_oen_write, + .pin_to_oen_bit = &rzv2h_pin_to_oen_bit, + .oen_read = &rzg2l_read_oen, + .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzv2h_hw_to_bias_param, .bias_param_to_hw = &rzv2h_bias_param_to_hw, }; From 54ac76e13ace31cf732bf5261811ef5af67022b7 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 6 Aug 2025 20:55:54 +0100 Subject: [PATCH 0039/3206] pinctrl: renesas: rzg2l: Add PFC_OEN support for RZ/G3E SoC Add support for configuring the PFC_OEN register on the RZ/G3E SoC to enable output-enable control for specific pins. On this SoC, certain pins such as TXC_TXCLK need to support switching between input and output modes depending on the PHY interface mode (e.g., MII vs RGMII). This functionality maps to the 'output-enable' property in the device tree and requires explicit control via the PFC_OEN register. This change updates the r9a09g047_variable_pin_cfg array to mark PB1, PE1, PL0, PL1, PL2, and PL4 with the PIN_CFG_OEN flag to indicate output-enable support. A new helper, rzg3e_pin_to_oen_bit(), is introduced to map these pin names to their respective OEN bit positions, and the corresponding callbacks are wired into the RZ/G3E SoC configuration using the generic rzg2l_read_oen() and rzg2l_write_oen() accessors. Additionally, the GPIO configuration for the PB, PE, and PL ports is updated to use the variable port pack macro, enabling per-pin configuration necessary for OEN handling. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250806195555.1372317-7-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 61 +++++++++++++++++++++---- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index bb79d08482e105..9ac9f3019c9287 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -397,6 +397,14 @@ static const u64 r9a09g047_variable_pin_cfg[] = { RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PA, 5, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PA, 6, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PA, 7, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 0, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 1, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 2, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 3, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 4, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 5, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 6, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 7, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 0, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_IEN), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 1, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 2, RZV2H_MPXED_PIN_FUNCS), @@ -405,6 +413,14 @@ static const u64 r9a09g047_variable_pin_cfg[] = { RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 5, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 6, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 7, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 0, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 1, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + 
RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 2, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 3, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 4, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 5, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 6, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 7, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PG, 0, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PG, 1, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_IEN), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PG, 2, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_IEN), @@ -424,6 +440,14 @@ static const u64 r9a09g047_variable_pin_cfg[] = { RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PJ, 2, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PJ, 3, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PJ, 4, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 0, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 1, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 2, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 3, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 4, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 5, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 6, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 7, RZV2H_MPXED_PIN_FUNCS), }; static const u64 r9a09g057_variable_pin_cfg[] = { @@ -1200,23 +1224,39 @@ static int rzv2h_bias_param_to_hw(enum pin_config_param param) return -EINVAL; } -static int rzv2h_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) +static int rzg2l_pin_names_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin, + const char * const pin_names[], unsigned int count) { - static const char * const pin_names[] = { "ET0_TXC_TXCLK", "ET1_TXC_TXCLK", - "XSPI0_RESET0N", "XSPI0_CS0N", - "XSPI0_CKN", "XSPI0_CKP" }; const struct pinctrl_pin_desc *pin_desc = &pctrl->desc.pins[_pin]; unsigned int i; - for (i = 0; i < ARRAY_SIZE(pin_names); i++) { + for (i = 0; i < count; i++) { if (!strcmp(pin_desc->name, pin_names[i])) return i; } - /* Should not happen. 
*/ return -EINVAL; } +static int rzv2h_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) +{ + static const char * const pin_names[] = { + "ET0_TXC_TXCLK", "ET1_TXC_TXCLK", "XSPI0_RESET0N", + "XSPI0_CS0N", "XSPI0_CKN", "XSPI0_CKP" + }; + + return rzg2l_pin_names_to_oen_bit(pctrl, _pin, pin_names, ARRAY_SIZE(pin_names)); +} + +static int rzg3e_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) +{ + static const char * const pin_names[] = { + "PB1", "PE1", "PL4", "PL1", "PL2", "PL0" + }; + + return rzg2l_pin_names_to_oen_bit(pctrl, _pin, pin_names, ARRAY_SIZE(pin_names)); +} + static int rzg2l_pinctrl_pinconf_get(struct pinctrl_dev *pctldev, unsigned int _pin, unsigned long *config) @@ -2008,17 +2048,17 @@ static const u64 r9a09g047_gpio_configs[] = { RZG2L_GPIO_PORT_PACK(6, 0x28, RZV2H_MPXED_PIN_FUNCS), /* P8 */ 0x0, RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x2a), /* PA */ - RZG2L_GPIO_PORT_PACK(8, 0x2b, RZV2H_MPXED_PIN_FUNCS), /* PB */ + RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x2b), /* PB */ RZG2L_GPIO_PORT_PACK(3, 0x2c, RZV2H_MPXED_PIN_FUNCS), /* PC */ RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x2d), /* PD */ - RZG2L_GPIO_PORT_PACK(8, 0x2e, RZV2H_MPXED_PIN_FUNCS), /* PE */ + RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x2e), /* PE */ RZG2L_GPIO_PORT_PACK(3, 0x2f, RZV2H_MPXED_PIN_FUNCS), /* PF */ RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x30), /* PG */ RZG2L_GPIO_PORT_PACK_VARIABLE(6, 0x31), /* PH */ 0x0, RZG2L_GPIO_PORT_PACK_VARIABLE(5, 0x33), /* PJ */ RZG2L_GPIO_PORT_PACK(4, 0x34, RZV2H_MPXED_PIN_FUNCS), /* PK */ - RZG2L_GPIO_PORT_PACK(8, 0x35, RZV2H_MPXED_PIN_FUNCS), /* PL */ + RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x35), /* PL */ RZG2L_GPIO_PORT_PACK(8, 0x36, RZV2H_MPXED_PIN_FUNCS), /* PM */ 0x0, 0x0, @@ -3318,6 +3358,9 @@ static struct rzg2l_pinctrl_data r9a09g047_data = { #endif .pwpr_pfc_lock_unlock = &rzv2h_pwpr_pfc_lock_unlock, .pmc_writeb = &rzv2h_pmc_writeb, + .pin_to_oen_bit = &rzg3e_pin_to_oen_bit, + .oen_read = &rzg2l_read_oen, + .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzv2h_hw_to_bias_param, .bias_param_to_hw = &rzv2h_bias_param_to_hw, }; From 560c633d378a0cf34afcd66c99321d6306a7e937 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 6 Aug 2025 20:55:55 +0100 Subject: [PATCH 0040/3206] pinctrl: renesas: rzg2l: Drop oen_read and oen_write callbacks Remove oen_read and oen_write callbacks from rzg2l_pinctrl_data as all SoCs now use the same rzg2l_read_oen() and rzg2l_write_oen() functions directly. Change rzg2l_read_oen() return type to int for proper error reporting and update callers to handle errors consistently. This simplifies the code by removing redundant callbacks and ensures uniform OEN handling across all supported SoCs. 
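The consolidated call sites follow the usual negative-errno-or-value idiom. A sketch mirroring the rzg2l_pinctrl_pinconf_get() hunk below:

	ret = rzg2l_read_oen(pctrl, _pin);	/* -EOPNOTSUPP/-EINVAL on error */
	if (ret < 0)
		return ret;
	arg = ret;				/* otherwise 0 or 1: OEN state */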
Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250806195555.1372317-8-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 35 +++++++------------------ 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index 9ac9f3019c9287..b182b3b8a542a2 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -298,8 +298,6 @@ struct rzg2l_pinctrl_data { void (*pwpr_pfc_lock_unlock)(struct rzg2l_pinctrl *pctrl, bool lock); void (*pmc_writeb)(struct rzg2l_pinctrl *pctrl, u8 val, u16 offset); int (*pin_to_oen_bit)(struct rzg2l_pinctrl *pctrl, unsigned int _pin); - u32 (*oen_read)(struct rzg2l_pinctrl *pctrl, unsigned int _pin); - int (*oen_write)(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen); int (*hw_to_bias_param)(unsigned int val); int (*bias_param_to_hw)(enum pin_config_param param); }; @@ -1092,16 +1090,16 @@ static int rzg2l_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) return -EINVAL; } -static u32 rzg2l_read_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin) +static int rzg2l_read_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin) { int bit; if (!pctrl->data->pin_to_oen_bit) - return 0; + return -EOPNOTSUPP; bit = pctrl->data->pin_to_oen_bit(pctrl, _pin); if (bit < 0) - return 0; + return -EINVAL; return !(readb(pctrl->base + pctrl->data->hwcfg->regs.oen) & BIT(bit)); } @@ -1115,7 +1113,7 @@ static int rzg2l_write_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oe int bit; if (!pctrl->data->pin_to_oen_bit) - return -EINVAL; + return -EOPNOTSUPP; bit = pctrl->data->pin_to_oen_bit(pctrl, _pin); if (bit < 0) @@ -1298,11 +1296,10 @@ static int rzg2l_pinctrl_pinconf_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_OUTPUT_ENABLE: if (!(cfg & PIN_CFG_OEN)) return -EINVAL; - if (!pctrl->data->oen_read) - return -EOPNOTSUPP; - arg = pctrl->data->oen_read(pctrl, _pin); - if (!arg) - return -EINVAL; + ret = rzg2l_read_oen(pctrl, _pin); + if (ret < 0) + return ret; + arg = ret; break; case PIN_CONFIG_POWER_SOURCE: @@ -1461,9 +1458,7 @@ static int rzg2l_pinctrl_pinconf_set(struct pinctrl_dev *pctldev, case PIN_CONFIG_OUTPUT_ENABLE: if (!(cfg & PIN_CFG_OEN)) return -EINVAL; - if (!pctrl->data->oen_write) - return -EOPNOTSUPP; - ret = pctrl->data->oen_write(pctrl, _pin, !!arg); + ret = rzg2l_write_oen(pctrl, _pin, !!arg); if (ret) return ret; break; @@ -3300,8 +3295,6 @@ static struct rzg2l_pinctrl_data r9a07g043_data = { .pwpr_pfc_lock_unlock = &rzg2l_pwpr_pfc_lock_unlock, .pmc_writeb = &rzg2l_pmc_writeb, .pin_to_oen_bit = &rzg2l_pin_to_oen_bit, - .oen_read = &rzg2l_read_oen, - .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzg2l_hw_to_bias_param, .bias_param_to_hw = &rzg2l_bias_param_to_hw, }; @@ -3318,8 +3311,6 @@ static struct rzg2l_pinctrl_data r9a07g044_data = { .pwpr_pfc_lock_unlock = &rzg2l_pwpr_pfc_lock_unlock, .pmc_writeb = &rzg2l_pmc_writeb, .pin_to_oen_bit = &rzg2l_pin_to_oen_bit, - .oen_read = &rzg2l_read_oen, - .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzg2l_hw_to_bias_param, .bias_param_to_hw = &rzg2l_bias_param_to_hw, }; @@ -3335,8 +3326,6 @@ static struct rzg2l_pinctrl_data r9a08g045_data = { .pwpr_pfc_lock_unlock = &rzg2l_pwpr_pfc_lock_unlock, .pmc_writeb = &rzg2l_pmc_writeb, .pin_to_oen_bit = &rzg3s_pin_to_oen_bit, - .oen_read = &rzg2l_read_oen, - .oen_write = &rzg2l_write_oen, .hw_to_bias_param = 
&rzg2l_hw_to_bias_param, .bias_param_to_hw = &rzg2l_bias_param_to_hw, }; @@ -3359,8 +3348,6 @@ static struct rzg2l_pinctrl_data r9a09g047_data = { .pwpr_pfc_lock_unlock = &rzv2h_pwpr_pfc_lock_unlock, .pmc_writeb = &rzv2h_pmc_writeb, .pin_to_oen_bit = &rzg3e_pin_to_oen_bit, - .oen_read = &rzg2l_read_oen, - .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzv2h_hw_to_bias_param, .bias_param_to_hw = &rzv2h_bias_param_to_hw, }; @@ -3383,8 +3370,6 @@ static struct rzg2l_pinctrl_data r9a09g056_data = { .pwpr_pfc_lock_unlock = &rzv2h_pwpr_pfc_lock_unlock, .pmc_writeb = &rzv2h_pmc_writeb, .pin_to_oen_bit = &rzv2h_pin_to_oen_bit, - .oen_read = &rzg2l_read_oen, - .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzv2h_hw_to_bias_param, .bias_param_to_hw = &rzv2h_bias_param_to_hw, }; @@ -3408,8 +3393,6 @@ static struct rzg2l_pinctrl_data r9a09g057_data = { .pwpr_pfc_lock_unlock = &rzv2h_pwpr_pfc_lock_unlock, .pmc_writeb = &rzv2h_pmc_writeb, .pin_to_oen_bit = &rzv2h_pin_to_oen_bit, - .oen_read = &rzg2l_read_oen, - .oen_write = &rzg2l_write_oen, .hw_to_bias_param = &rzv2h_hw_to_bias_param, .bias_param_to_hw = &rzv2h_bias_param_to_hw, }; From 1071d560afb4c245c2076494226df47db5a35708 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 11 Aug 2025 13:17:32 +0200 Subject: [PATCH 0041/3206] dm-stripe: fix a possible integer overflow There's a possible integer overflow in stripe_io_hints if we have too large chunk size. Test if the overflow happened, and if it did, don't set limits->io_min and limits->io_opt; Signed-off-by: Mikulas Patocka Reviewed-by: John Garry Suggested-by: Dongsheng Yang Cc: stable@vger.kernel.org --- drivers/md/dm-stripe.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 58902091bf79b9..1461dc740dae6c 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -456,11 +456,15 @@ static void stripe_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct stripe_c *sc = ti->private; - unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT; + unsigned int io_min, io_opt; limits->chunk_sectors = sc->chunk_size; - limits->io_min = chunk_size; - limits->io_opt = chunk_size * sc->stripes; + + if (!check_shl_overflow(sc->chunk_size, SECTOR_SHIFT, &io_min) && + !check_mul_overflow(io_min, sc->stripes, &io_opt)) { + limits->io_min = io_min; + limits->io_opt = io_opt; + } } static struct target_type stripe_target = { From 15a04f94f42973d85afa2df66f1d18e0bca3385c Mon Sep 17 00:00:00 2001 From: Thiago Becker Date: Thu, 24 Jul 2025 17:35:16 -0300 Subject: [PATCH 0042/3206] locks: Remove the last reference to EXPORT_OP_ASYNC_LOCK. Commit b875bd5b381e ("exportfs: Remove EXPORT_OP_ASYNC_LOCK") removed all references to EXPORT_OP_ASYNC_LOCK, but one lasted in the comments for fs/locks.c. Remove it. Signed-off-by: Thiago Becker Link: https://lore.kernel.org/20250724203516.153616-1-tbecker@redhat.com Signed-off-by: Christian Brauner --- fs/locks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 559f02aa417221..04a3f0e2072461 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2328,8 +2328,8 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) - * lm_grant is set. 
Additionally EXPORT_OP_ASYNC_LOCK in export_operations - * flags need to be set. + * lm_grant is set. Additionally FOP_ASYNC_LOCK in file_operations fop_flags + * need to be set. * * Callers expecting ->lock() to return asynchronously will only use F_SETLK, * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a From 17e8b7e08fa8bf7a936f70444a42a88750410251 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jul 2025 09:48:54 +0200 Subject: [PATCH 0043/3206] fs: mark file_remove_privs_flags static file_remove_privs_flags is only used inside of inode.c, mark it static. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250724074854.3316911-1-hch@lst.de Signed-off-by: Christian Brauner --- fs/inode.c | 3 +-- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 01ebdc40021e2d..3cbb78412a3ec3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2189,7 +2189,7 @@ static int __remove_privs(struct mnt_idmap *idmap, return notify_change(idmap, dentry, &newattrs, NULL); } -int file_remove_privs_flags(struct file *file, unsigned int flags) +static int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); @@ -2214,7 +2214,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags) inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d7051f..796319914b0a06 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3393,7 +3393,6 @@ static inline struct inode *new_inode_pseudo(struct super_block *sb) extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); -extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); From 4e021920812d164bb02c30cc40e08a3681b1c755 Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Wed, 30 Jul 2025 20:18:53 +0000 Subject: [PATCH 0044/3206] fs: document 'name' parameter for name_contains_dotdot() The kernel-doc for name_contains_dotdot() was missing the @name parameter description, leading to a warning during make htmldocs. Add the missing documentation to resolve this warning. Signed-off-by: Kriish Sharma Link: https://lore.kernel.org/20250730201853.8436-1-kriish.sharma2006@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 796319914b0a06..780e9c774c5496 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3281,7 +3281,7 @@ static inline bool is_dot_dotdot(const char *name, size_t len) /** * name_contains_dotdot - check if a file name contains ".." path components - * + * @name: File path string to check * Search for ".." surrounded by either '/' or start/end of string. */ static inline bool name_contains_dotdot(const char *name) From f7d812357e40ef3ce1c0dcdd929590f1dbc6c344 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Fri, 8 Aug 2025 16:37:58 +0800 Subject: [PATCH 0045/3206] fs: fix "writen"->"written" Trivial fix to spelling mistake in comment text. 
Signed-off-by: Xichao Zhao Link: https://lore.kernel.org/20250808083758.229563-1-zhao.xichao@vivo.com Signed-off-by: Christian Brauner --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index cd43ff89fbaa38..138a693c234614 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4828,7 +4828,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to - * be writen back improperly if their true value is unknown to + * be written back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) From 708c04a5c2b78e22f56e2350de41feba74dfccd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 5 Aug 2025 14:38:08 +0200 Subject: [PATCH 0046/3206] fs: always return zero on success from replace_fd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit replace_fd() returns the number of the new file descriptor through the return value of do_dup2(). However its callers never care about the specific returned number. In fact the caller in receive_fd_replace() treats any non-zero return value as an error and therefore never calls __receive_sock() for most file descriptors, which is a bug. To fix the bug in receive_fd_replace() and to avoid the same issue happening in future callers, signal success through a plain zero. Suggested-by: Al Viro Link: https://lore.kernel.org/lkml/20250801220215.GS222315@ZenIV/ Fixes: 173817151b15 ("fs: Expand __receive_fd() to accept existing fd") Fixes: 42eb0d54c08a ("fs: split receive_fd_replace from __receive_fd") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/20250805-fix-receive_fd_replace-v3-1-b72ba8b34bac@linutronix.de Signed-off-by: Christian Brauner --- fs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/file.c b/fs/file.c index 6d2275c3be9c69..28743b742e3cf6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1330,7 +1330,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; - return do_dup2(files, file, fd, flags); + err = do_dup2(files, file, fd, flags); + if (err < 0) + return err; + return 0; out_unlock: spin_unlock(&files->file_lock); From 56ecfd9175b999dfc303ac6a0f9ea4bd1bee49d7 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 23 Jul 2025 14:21:54 +0100 Subject: [PATCH 0047/3206] fs: Remove mount_nodev mount_nodev has had no in-tree users since cc0876f817d6 ("vfs: Convert devpts to use the new mount API"). Remove it. Signed-off-by: Pedro Falcato Link: https://lore.kernel.org/20250723132156.225410-2-pfalcato@suse.de Signed-off-by: Christian Brauner --- fs/super.c | 20 -------------------- include/linux/fs.h | 3 --- 2 files changed, 23 deletions(-) diff --git a/fs/super.c b/fs/super.c index 7f876f32343ad4..7daa20737f2ed4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1773,26 +1773,6 @@ void kill_block_super(struct super_block *sb) EXPORT_SYMBOL(kill_block_super); #endif -struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); - - if (IS_ERR(s)) - return ERR_CAST(s); - - error = fill_super(s, data, flags & SB_SILENT ? 
1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_nodev); - /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d7051f..204328ed7ebb3f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2716,9 +2716,6 @@ static inline bool is_mgtime(const struct inode *inode) extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); From f7d161c2804f3ad36bdc3222cb93c8fee67be98c Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 23 Jul 2025 14:21:55 +0100 Subject: [PATCH 0048/3206] fs: Remove mount_bdev mount_bdev has no in-tree users ever since f2fs adopted the new mount API. Remove it. Signed-off-by: Pedro Falcato Link: https://lore.kernel.org/20250723132156.225410-3-pfalcato@suse.de Signed-off-by: Christian Brauner --- fs/super.c | 43 ------------------------------------------- include/linux/fs.h | 3 --- 2 files changed, 46 deletions(-) diff --git a/fs/super.c b/fs/super.c index 7daa20737f2ed4..a038848e8d1f75 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1716,49 +1716,6 @@ int get_tree_bdev(struct fs_context *fc, } EXPORT_SYMBOL(get_tree_bdev); -static int test_bdev_super(struct super_block *s, void *data) -{ - return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; -} - -struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *s; - int error; - dev_t dev; - - error = lookup_bdev(dev_name, &dev); - if (error) - return ERR_PTR(error); - - flags |= SB_NOSEC; - s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (s->s_root) { - if ((flags ^ s->s_flags) & SB_RDONLY) { - deactivate_locked_super(s); - return ERR_PTR(-EBUSY); - } - } else { - error = setup_bdev_super(s, flags, NULL); - if (!error) - error = fill_super(s, data, flags & SB_SILENT ? 
1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - s->s_flags |= SB_ACTIVE; - } - - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_bdev); - void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; diff --git a/include/linux/fs.h b/include/linux/fs.h index 204328ed7ebb3f..98afcf455b28d0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2713,9 +2713,6 @@ static inline bool is_mgtime(const struct inode *inode) return inode->i_opflags & IOP_MGTIME; } -extern struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); From ad7fe23b4b0dc0c26187df92a5649948ef7049fa Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 6 Aug 2025 16:07:05 +1000 Subject: [PATCH 0049/3206] fscontext: add custom-prefix log helpers Sometimes, errors associated with an fscontext come from the VFS or otherwise outside of the filesystem driver itself. However, the default logging of errorfc will always prefix the message with the filesystem name. So, add some *fcp() wrappers that allow for custom prefixes to be used when emitting information to the fscontext log. Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250806-errorfc-mount-too-revealing-v2-1-534b9b4d45bb@cyphar.com Signed-off-by: Christian Brauner --- include/linux/fs_context.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 7773eb870039c4..671f031be173b7 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -186,10 +186,12 @@ struct fc_log { extern __attribute__((format(printf, 4, 5))) void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...); -#define __logfc(fc, l, fmt, ...) logfc((fc)->log.log, NULL, \ - l, fmt, ## __VA_ARGS__) -#define __plog(p, l, fmt, ...) logfc((p)->log, (p)->prefix, \ - l, fmt, ## __VA_ARGS__) +#define __logfc(fc, l, fmt, ...) \ + logfc((fc)->log.log, NULL, (l), (fmt), ## __VA_ARGS__) +#define __plogp(p, prefix, l, fmt, ...) \ + logfc((p)->log, (prefix), (l), (fmt), ## __VA_ARGS__) +#define __plog(p, l, fmt, ...) __plogp(p, (p)->prefix, l, fmt, ## __VA_ARGS__) + /** * infof - Store supplementary informational message * @fc: The context in which to log the informational message @@ -201,6 +203,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__) #define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__) #define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__) +#define infofcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'i', fmt, ## __VA_ARGS__) /** * warnf - Store supplementary warning message @@ -213,6 +217,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define warnf(fc, fmt, ...) __logfc(fc, 'w', fmt, ## __VA_ARGS__) #define warn_plog(p, fmt, ...) __plog(p, 'w', fmt, ## __VA_ARGS__) #define warnfc(fc, fmt, ...) __plog((&(fc)->log), 'w', fmt, ## __VA_ARGS__) +#define warnfcp(fc, prefix, fmt, ...) 
\ + __plogp((&(fc)->log), prefix, 'w', fmt, ## __VA_ARGS__) /** * errorf - Store supplementary error message @@ -225,6 +231,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define errorf(fc, fmt, ...) __logfc(fc, 'e', fmt, ## __VA_ARGS__) #define error_plog(p, fmt, ...) __plog(p, 'e', fmt, ## __VA_ARGS__) #define errorfc(fc, fmt, ...) __plog((&(fc)->log), 'e', fmt, ## __VA_ARGS__) +#define errorfcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'e', fmt, ## __VA_ARGS__) /** * invalf - Store supplementary invalid argument error message @@ -237,5 +245,7 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define invalf(fc, fmt, ...) (errorf(fc, fmt, ## __VA_ARGS__), -EINVAL) #define inval_plog(p, fmt, ...) (error_plog(p, fmt, ## __VA_ARGS__), -EINVAL) #define invalfc(fc, fmt, ...) (errorfc(fc, fmt, ## __VA_ARGS__), -EINVAL) +#define invalfcp(fc, prefix, fmt, ...) \ + (errorfcp(fc, prefix, fmt, ## __VA_ARGS__), -EINVAL) #endif /* _LINUX_FS_CONTEXT_H */ From 92becd1701a835d0807fa1f5c03b43762af27444 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 23 Jul 2025 14:21:56 +0100 Subject: [PATCH 0050/3206] docs/vfs: Remove mentions to the old mount API helpers Now that mount_bdev(), mount_nodev() and mount_single() have all been removed, remove mentions to them in vfs.rst. While we're at it, redirect people looking for mount API docs to mount_api.rst (which documents the newer API). Signed-off-by: Pedro Falcato Link: https://lore.kernel.org/20250723132156.225410-4-pfalcato@suse.de Signed-off-by: Christian Brauner --- Documentation/filesystems/vfs.rst | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 486a9163347478..229eb90c96f2a9 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -209,31 +209,8 @@ method fills in is the "s_op" field. This is a pointer to a "struct super_operations" which describes the next level of the filesystem implementation. -Usually, a filesystem uses one of the generic mount() implementations -and provides a fill_super() callback instead. The generic variants are: - -``mount_bdev`` - mount a filesystem residing on a block device - -``mount_nodev`` - mount a filesystem that is not backed by a device - -``mount_single`` - mount a filesystem which shares the instance between all mounts - -A fill_super() callback implementation has the following arguments: - -``struct super_block *sb`` - the superblock structure. The callback must initialize this - properly. - -``void *data`` - arbitrary mount options, usually comes as an ASCII string (see - "Mount Options" section) - -``int silent`` - whether or not to be silent on error - +For more information on mounting (and the new mount API), see +Documentation/filesystems/mount_api.rst. The Superblock Object ===================== From 807602d8cfc8b63e2c645fbbc66c8e82479ee311 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 6 Aug 2025 16:07:06 +1000 Subject: [PATCH 0051/3206] vfs: output mount_too_revealing() errors to fscontext It makes little sense for fsmount() to output the warning message when mount_too_revealing() is violated to kmsg. Instead, the warning should be output (with a "VFS" prefix) to the fscontext log. In addition, include the same log message for mount_too_revealing() when doing a regular mount for consistency. 
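Because errorfcp() targets the fscontext log rather than kmsg, userspace can read the message straight back from the fsopen() file descriptor. A minimal sketch of an error path (hypothetical snippet; assumes fsopen()/fsmount() syscall wrappers are available and the "e <prefix>: <msg>" log format used by fs/fsopen.c):

	char buf[128];

	if (fsmount(fsfd, FSMOUNT_CLOEXEC, 0) < 0) {
		ssize_t n = read(fsfd, buf, sizeof(buf));

		if (n > 0)	/* e.g. "e VFS: Mount too revealing\n" */
			fprintf(stderr, "%.*s", (int)n, buf);
	}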
With the newest fsopen()-based mount(8) from util-linux, the error messages now look like # mount -t proc proc /tmp mount: /tmp: fsmount() failed: VFS: Mount too revealing. dmesg(1) may have more information after failed mount system call. which could finally result in mount_too_revealing() errors being easier for users to detect and understand. Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250806-errorfc-mount-too-revealing-v2-2-534b9b4d45bb@cyphar.com Signed-off-by: Christian Brauner --- fs/namespace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d33837..86d12f88b68852 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3724,8 +3724,10 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, int error; error = security_sb_kern_mount(sb); - if (!error && mount_too_revealing(sb, &mnt_flags)) + if (!error && mount_too_revealing(sb, &mnt_flags)) { + errorfcp(fc, "VFS", "Mount too revealing"); error = -EPERM; + } if (unlikely(error)) { fc_drop_locked(fc); @@ -4441,7 +4443,7 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { - pr_warn("VFS: Mount too revealing\n"); + errorfcp(fc, "VFS", "Mount too revealing"); goto err_unlock; } From 72d271a7baa7062cb27e774ac37c5459c6d20e22 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 7 Aug 2025 03:55:23 +1000 Subject: [PATCH 0052/3206] fscontext: do not consume log entries when returning -EMSGSIZE Userspace generally expects APIs that return -EMSGSIZE to allow for them to adjust their buffer size and retry the operation. However, the fscontext log would previously clear the message even in the -EMSGSIZE case. Given that it is very cheap for us to check whether the buffer is too small before we remove the message from the ring buffer, let's just do that instead. While we're at it, refactor some fscontext_read() into a separate helper to make the ring buffer logic a bit easier to read. Fixes: 007ec26cdc9f ("vfs: Implement logging through fs_context") Cc: David Howells Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250807-fscontext-log-cleanups-v3-1-8d91d6242dc3@cyphar.com Signed-off-by: Christian Brauner --- fs/fsopen.c | 70 +++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/fs/fsopen.c b/fs/fsopen.c index 1aaf4cb2afb29e..f645c99204eb06 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -18,50 +18,56 @@ #include "internal.h" #include "mount.h" +static inline const char *fetch_message_locked(struct fc_log *log, size_t len, + bool *need_free) +{ + const char *p; + int index; + + if (unlikely(log->head == log->tail)) + return ERR_PTR(-ENODATA); + + index = log->tail & (ARRAY_SIZE(log->buffer) - 1); + p = log->buffer[index]; + if (unlikely(strlen(p) > len)) + return ERR_PTR(-EMSGSIZE); + + log->buffer[index] = NULL; + *need_free = log->need_free & (1 << index); + log->need_free &= ~(1 << index); + log->tail++; + + return p; +} + /* * Allow the user to read back any error, warning or informational messages. + * Only one message is returned for each read(2) call. 
*/ static ssize_t fscontext_read(struct file *file, char __user *_buf, size_t len, loff_t *pos) { struct fs_context *fc = file->private_data; - struct fc_log *log = fc->log.log; - unsigned int logsize = ARRAY_SIZE(log->buffer); - ssize_t ret; - char *p; + ssize_t err; + const char *p __free(kfree) = NULL, *message; bool need_free; - int index, n; + int n; - ret = mutex_lock_interruptible(&fc->uapi_mutex); - if (ret < 0) - return ret; - - if (log->head == log->tail) { - mutex_unlock(&fc->uapi_mutex); - return -ENODATA; - } - - index = log->tail & (logsize - 1); - p = log->buffer[index]; - need_free = log->need_free & (1 << index); - log->buffer[index] = NULL; - log->need_free &= ~(1 << index); - log->tail++; + err = mutex_lock_interruptible(&fc->uapi_mutex); + if (err < 0) + return err; + message = fetch_message_locked(fc->log.log, len, &need_free); mutex_unlock(&fc->uapi_mutex); + if (IS_ERR(message)) + return PTR_ERR(message); - ret = -EMSGSIZE; - n = strlen(p); - if (n > len) - goto err_free; - ret = -EFAULT; - if (copy_to_user(_buf, p, n) != 0) - goto err_free; - ret = n; - -err_free: if (need_free) - kfree(p); - return ret; + p = message; + + n = strlen(message); + if (copy_to_user(_buf, message, n)) + return -EFAULT; + return n; } static int fscontext_release(struct inode *inode, struct file *file) From df579e471111b9f8691d75c980f59cc085fa97a3 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 7 Aug 2025 03:55:24 +1000 Subject: [PATCH 0053/3206] selftests/filesystems: add basic fscontext log tests Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250807-fscontext-log-cleanups-v3-2-8d91d6242dc3@cyphar.com Signed-off-by: Christian Brauner --- .../testing/selftests/filesystems/.gitignore | 1 + tools/testing/selftests/filesystems/Makefile | 2 +- tools/testing/selftests/filesystems/fclog.c | 130 ++++++++++++++++++ 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/filesystems/fclog.c diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore index fcbdb1297e24e8..64ac0dfa46b7ef 100644 --- a/tools/testing/selftests/filesystems/.gitignore +++ b/tools/testing/selftests/filesystems/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only dnotify_test devpts_pts +fclog file_stressor anon_inode_test kernfs_test diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile index 73d4650af1a517..85427d7f19b9b5 100644 --- a/tools/testing/selftests/filesystems/Makefile +++ b/tools/testing/selftests/filesystems/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test +TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog TEST_GEN_PROGS_EXTENDED := dnotify_test include ../lib.mk diff --git a/tools/testing/selftests/filesystems/fclog.c b/tools/testing/selftests/filesystems/fclog.c new file mode 100644 index 00000000000000..912a8b755c3b63 --- /dev/null +++ b/tools/testing/selftests/filesystems/fclog.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai + * Copyright (C) 2025 SUSE LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#define ASSERT_ERRNO(expected, _t, seen) \ + __EXPECT(expected, #expected, \ + ({__typeof__(seen) _tmp_seen = (seen); \ + _tmp_seen >= 0 ? 
_tmp_seen : -errno; }), #seen, _t, 1) + +#define ASSERT_ERRNO_EQ(expected, seen) \ + ASSERT_ERRNO(expected, ==, seen) + +#define ASSERT_SUCCESS(seen) \ + ASSERT_ERRNO(0, <=, seen) + +FIXTURE(ns) +{ + int host_mntns; +}; + +FIXTURE_SETUP(ns) +{ + /* Stash the old mntns. */ + self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_mntns); + + /* Create a new mount namespace and make it private. */ + ASSERT_SUCCESS(unshare(CLONE_NEWNS)); + ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL)); +} + +FIXTURE_TEARDOWN(ns) +{ + ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS)); + ASSERT_SUCCESS(close(self->host_mntns)); +} + +TEST_F(ns, fscontext_log_enodata) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + /* A brand new fscontext has no log entries. */ + char buf[128] = {}; + for (int i = 0; i < 16; i++) + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_errorfc) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + char buf[128] = {}; + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. */ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_errorfc_after_fsmount) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + int mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOEXEC | MOUNT_ATTR_NOSUID); + ASSERT_SUCCESS(mfd); + ASSERT_SUCCESS(move_mount(mfd, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)); + + /* + * The fscontext log should still contain data even after + * FSCONFIG_CMD_CREATE and fsmount(). + */ + char buf[128] = {}; + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. */ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_emsgsize) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + char buf[128] = {}; + /* + * Attempting to read a message with too small a buffer should not + * result in the message getting consumed. + */ + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 0)); + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 1)); + for (int i = 0; i < 16; i++) + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 16)); + + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. */ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_HARNESS_MAIN From 6ec4b94e8e959b4201ca0bfc43fa50dc946d10cb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Aug 2025 15:00:01 +0200 Subject: [PATCH 0054/3206] gpio: TODO: remove the task for converting to the new line setters The task is complete, but this was not reflected in the TODO file. 
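For reference, the completed conversion means struct gpio_chip's set() callback now returns an int, so drivers can report a failed line write back to the GPIO core. A minimal sketch (hypothetical driver; my_chip and MY_OUT_REG are made up for illustration):

	static int my_gpio_set(struct gpio_chip *gc, unsigned int offset, int value)
	{
		struct my_chip *chip = gpiochip_get_data(gc);

		/* A failed register write is now propagated instead of dropped. */
		return regmap_update_bits(chip->regmap, MY_OUT_REG,
					  BIT(offset), value ? BIT(offset) : 0);
	}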
Fixes: d9d87d90cc0b10cd ("treewide: rename GPIO set callbacks back to their original names") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/417af7e5a110c527eb759289bd5d2fd6885f4e01.1754917104.git.geert+renesas@glider.be Signed-off-by: Bartosz Golaszewski --- drivers/gpio/TODO | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index 7a09a4f58551b5..b797499e627ee9 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -176,18 +176,6 @@ cannot be converted yet, but watch this space! ------------------------------------------------------------------------------- -Convert all GPIO chips to using the new, value returning line setters - -struct gpio_chip's set() and set_multiple() callbacks are now deprecated. They -return void and thus do not allow drivers to indicate failure to set the line -value back to the caller. - -We've now added new variants - set_rv() and set_multiple_rv() that return an -integer. Let's convert all GPIO drivers treewide to use the new callbacks, -remove the old ones and finally rename the new ones back to the old names. - -------------------------------------------------------------------------------- - Remove legacy sysfs features We have two parallel per-chip class devices and per-exported-line attribute From c6871d56b52ec177b8b8bd891fa0cbbd004b646d Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 29 Jul 2025 12:00:44 +0800 Subject: [PATCH 0055/3206] regulator: tps6594-regulator: Remove unneeded semicolon Remove unnecessary semicolons reported by Coccinelle/coccicheck and the semantic patch at scripts/coccinelle/misc/semicolon.cocci. Signed-off-by: Chen Ni Link: https://patch.msgid.link/20250729040044.1851988-1-nichen@iscas.ac.cn Signed-off-by: Mark Brown --- drivers/regulator/tps6594-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/tps6594-regulator.c b/drivers/regulator/tps6594-regulator.c index ab882daec7c5fd..645e83462c645e 100644 --- a/drivers/regulator/tps6594-regulator.c +++ b/drivers/regulator/tps6594-regulator.c @@ -647,7 +647,7 @@ static int tps6594_regulator_probe(struct platform_device *pdev) default: dev_err(tps->dev, "unknown chip_id %lu\n", tps->chip_id); return -EINVAL; - }; + } enum { MULTI_BUCK12, From f6cc4140e161831e5796f099f5abc3af953ae2b8 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Thu, 7 Aug 2025 20:39:49 +0800 Subject: [PATCH 0056/3206] regulator: tps6524x: Remove unnecessary memset devm_kzalloc() has already been initialized to full 0 space, there is no need to use memset() to initialize again. Signed-off-by: Liao Yuanhong Link: https://patch.msgid.link/20250807123949.495193-1-liaoyuanhong@vivo.com Signed-off-by: Mark Brown --- drivers/regulator/tps6524x-regulator.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/regulator/tps6524x-regulator.c b/drivers/regulator/tps6524x-regulator.c index 3fee7e38c68bc9..6beb51293e8e1d 100644 --- a/drivers/regulator/tps6524x-regulator.c +++ b/drivers/regulator/tps6524x-regulator.c @@ -598,7 +598,6 @@ static int pmic_probe(struct spi_device *spi) spi_set_drvdata(spi, hw); - memset(hw, 0, sizeof(struct tps6524x)); hw->dev = dev; hw->spi = spi; mutex_init(&hw->lock); From bb2441402392ef1f49563be68e8f0dcb127ac965 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Tue, 5 Aug 2025 22:40:56 +0300 Subject: [PATCH 0057/3206] regulator: add s2dos05 regulator support S2DOS05 has 1 buck and 4 LDO regulators, used for powering panel/touchscreen. 
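For readability: the LDO_DESC()/BUCK_DESC() macros in the driver below assemble struct regulator_desc entries from the register and mask constants in the new header. The first LDO entry expands approximately to:

	/* approximate expansion of LDO_DESC("ldo1", ...) */
	{
		.name		= "ldo1",
		.id		= S2DOS05_LDO1,
		.ops		= &s2dos05_ops,
		.min_uV		= S2DOS05_LDO_MIN1,	/* 1500000 uV */
		.uV_step	= S2DOS05_LDO_STEP1,	/* 25000 uV */
		.n_voltages	= S2DOS05_LDO_N_VOLTAGES,
		.vsel_reg	= S2DOS05_REG_LDO1_CFG,
		.vsel_mask	= S2DOS05_LDO_VSEL_MASK,
		.enable_reg	= S2DOS05_REG_EN,
		.enable_mask	= S2DOS05_ENABLE_MASK_L1,
		.enable_time	= S2DOS05_ENABLE_TIME_LDO,	/* 50 us */
		.active_discharge_reg	= S2DOS05_REG_LDO1_CFG,
		.active_discharge_mask	= S2DOS05_LDO_FD_MASK,
		/* plus the type/owner/of_match boilerplate */
	},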
Signed-off-by: Dzmitry Sankouski Link: https://patch.msgid.link/20250805-starqltechn_integration_upstream-v8-1-09d8a321fafe@gmail.com Signed-off-by: Mark Brown --- MAINTAINERS | 2 +- drivers/regulator/Kconfig | 8 ++ drivers/regulator/Makefile | 1 + drivers/regulator/s2dos05-regulator.c | 165 ++++++++++++++++++++++++++ include/linux/regulator/s2dos05.h | 73 ++++++++++++ 5 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 drivers/regulator/s2dos05-regulator.c create mode 100644 include/linux/regulator/s2dos05.h diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..f35f9b503956dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22387,7 +22387,7 @@ F: Documentation/devicetree/bindings/regulator/samsung,s2m*.yaml F: Documentation/devicetree/bindings/regulator/samsung,s5m*.yaml F: drivers/clk/clk-s2mps11.c F: drivers/mfd/sec*.[ch] -F: drivers/regulator/s2m*.c +F: drivers/regulator/s2*.c F: drivers/regulator/s5m*.c F: drivers/rtc/rtc-s5m.c F: include/linux/mfd/samsung/ diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index eaa6df1c9f8066..2399c8a9a1c5d7 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1344,6 +1344,14 @@ config REGULATOR_RTQ2208 and two ldos. It features wide output voltage range from 0.4V to 2.05V and the capability to configure the corresponding power stages. +config REGULATOR_S2DOS05 + tristate "Samsung S2DOS05 voltage regulator" + depends on MFD_SEC_CORE || COMPILE_TEST + help + This driver provides support for the voltage regulators of the S2DOS05. + The S2DOS05 is a companion power management IC for the smart phones. + The S2DOS05 has 4 LDOs and 1 BUCK outputs. + config REGULATOR_S2MPA01 tristate "Samsung S2MPA01 voltage regulator" depends on MFD_SEC_CORE || COMPILE_TEST diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index be98b29d6675d8..78605b0fa0b28c 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -156,6 +156,7 @@ obj-$(CONFIG_REGULATOR_RTMV20) += rtmv20-regulator.o obj-$(CONFIG_REGULATOR_RTQ2134) += rtq2134-regulator.o obj-$(CONFIG_REGULATOR_RTQ6752) += rtq6752-regulator.o obj-$(CONFIG_REGULATOR_RTQ2208) += rtq2208-regulator.o +obj-$(CONFIG_REGULATOR_S2DOS05) += s2dos05-regulator.o obj-$(CONFIG_REGULATOR_S2MPA01) += s2mpa01.o obj-$(CONFIG_REGULATOR_S2MPS11) += s2mps11.o obj-$(CONFIG_REGULATOR_S5M8767) += s5m8767.o diff --git a/drivers/regulator/s2dos05-regulator.c b/drivers/regulator/s2dos05-regulator.c new file mode 100644 index 00000000000000..1463585c456520 --- /dev/null +++ b/drivers/regulator/s2dos05-regulator.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// s2dos05.c - Regulator driver for the Samsung s2dos05 +// +// Copyright (C) 2025 Dzmitry Sankouski + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct s2dos05_data { + struct regmap *regmap; + struct device *dev; +}; + +#define _BUCK(macro) S2DOS05_BUCK##macro +#define _buck_ops(num) s2dos05_ops##num +#define _LDO(macro) S2DOS05_LDO##macro +#define _REG(ctrl) S2DOS05_REG##ctrl +#define _ldo_ops(num) s2dos05_ops##num +#define _MASK(macro) S2DOS05_ENABLE_MASK##macro +#define _TIME(macro) S2DOS05_ENABLE_TIME##macro + +#define BUCK_DESC(_name, _id, _ops, m, s, v, e, em, t, a) { \ + .name = _name, \ + .id = _id, \ + .ops = _ops, \ + .of_match = of_match_ptr(_name), \ + .of_match_full_name = true, \ + .regulators_node = of_match_ptr("regulators"), \ + .type = REGULATOR_VOLTAGE, \ + .owner = 
THIS_MODULE, \ + .min_uV = m, \ + .uV_step = s, \ + .n_voltages = S2DOS05_BUCK_N_VOLTAGES, \ + .vsel_reg = v, \ + .vsel_mask = S2DOS05_BUCK_VSEL_MASK, \ + .enable_reg = e, \ + .enable_mask = em, \ + .enable_time = t, \ + .active_discharge_off = 0, \ + .active_discharge_on = S2DOS05_BUCK_FD_MASK, \ + .active_discharge_reg = a, \ + .active_discharge_mask = S2DOS05_BUCK_FD_MASK \ +} + +#define LDO_DESC(_name, _id, _ops, m, s, v, e, em, t, a) { \ + .name = _name, \ + .id = _id, \ + .ops = _ops, \ + .of_match = of_match_ptr(_name), \ + .of_match_full_name = true, \ + .regulators_node = of_match_ptr("regulators"), \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = m, \ + .uV_step = s, \ + .n_voltages = S2DOS05_LDO_N_VOLTAGES, \ + .vsel_reg = v, \ + .vsel_mask = S2DOS05_LDO_VSEL_MASK, \ + .enable_reg = e, \ + .enable_mask = em, \ + .enable_time = t, \ + .active_discharge_off = 0, \ + .active_discharge_on = S2DOS05_LDO_FD_MASK, \ + .active_discharge_reg = a, \ + .active_discharge_mask = S2DOS05_LDO_FD_MASK \ +} + +static const struct regulator_ops s2dos05_ops = { + .list_voltage = regulator_list_voltage_linear, + .map_voltage = regulator_map_voltage_linear, + .is_enabled = regulator_is_enabled_regmap, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .set_active_discharge = regulator_set_active_discharge_regmap, +}; + +static const struct regulator_desc regulators[S2DOS05_REGULATOR_MAX] = { + // name, id, ops, min_uv, uV_step, vsel_reg, enable_reg + LDO_DESC("ldo1", _LDO(1), &_ldo_ops(), _LDO(_MIN1), + _LDO(_STEP1), _REG(_LDO1_CFG), + _REG(_EN), _MASK(_L1), _TIME(_LDO), _REG(_LDO1_CFG)), + LDO_DESC("ldo2", _LDO(2), &_ldo_ops(), _LDO(_MIN1), + _LDO(_STEP1), _REG(_LDO2_CFG), + _REG(_EN), _MASK(_L2), _TIME(_LDO), _REG(_LDO2_CFG)), + LDO_DESC("ldo3", _LDO(3), &_ldo_ops(), _LDO(_MIN2), + _LDO(_STEP1), _REG(_LDO3_CFG), + _REG(_EN), _MASK(_L3), _TIME(_LDO), _REG(_LDO3_CFG)), + LDO_DESC("ldo4", _LDO(4), &_ldo_ops(), _LDO(_MIN2), + _LDO(_STEP1), _REG(_LDO4_CFG), + _REG(_EN), _MASK(_L4), _TIME(_LDO), _REG(_LDO4_CFG)), + BUCK_DESC("buck", _BUCK(1), &_buck_ops(), _BUCK(_MIN1), + _BUCK(_STEP1), _REG(_BUCK_VOUT), + _REG(_EN), _MASK(_B1), _TIME(_BUCK), _REG(_BUCK_CFG)), +}; + +static int s2dos05_pmic_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct sec_pmic_dev *iodev = dev_get_drvdata(pdev->dev.parent); + struct s2dos05_data *s2dos05; + struct regulator_config config = { }; + unsigned int rdev_num = ARRAY_SIZE(regulators); + + s2dos05 = devm_kzalloc(dev, sizeof(*s2dos05), GFP_KERNEL); + if (!s2dos05) + return -ENOMEM; + + platform_set_drvdata(pdev, s2dos05); + + s2dos05->regmap = iodev->regmap_pmic; + s2dos05->dev = dev; + if (!dev->of_node) + dev->of_node = dev->parent->of_node; + + config.dev = dev; + config.driver_data = s2dos05; + + for (int i = 0; i < rdev_num; i++) { + struct regulator_dev *regulator; + + regulator = devm_regulator_register(&pdev->dev, + ®ulators[i], &config); + if (IS_ERR(regulator)) { + return dev_err_probe(&pdev->dev, PTR_ERR(regulator), + "regulator init failed for %d\n", i); + } + } + + return 0; +} + +static const struct platform_device_id s2dos05_pmic_id[] = { + { "s2dos05-regulator" }, + { }, +}; +MODULE_DEVICE_TABLE(platform, s2dos05_pmic_id); + +static struct platform_driver s2dos05_platform_driver = { + .driver = { + .name = 
"s2dos05", + }, + .probe = s2dos05_pmic_probe, + .id_table = s2dos05_pmic_id, +}; +module_platform_driver(s2dos05_platform_driver); + +MODULE_AUTHOR("Dzmitry Sankouski "); +MODULE_DESCRIPTION("Samsung S2DOS05 Regulator Driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/regulator/s2dos05.h b/include/linux/regulator/s2dos05.h new file mode 100644 index 00000000000000..2e89fcbce76958 --- /dev/null +++ b/include/linux/regulator/s2dos05.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +// s2dos05.h +// +// Copyright (c) 2016 Samsung Electronics Co., Ltd +// http://www.samsung.com +// Copyright (C) 2024 Dzmitry Sankouski + +#ifndef __LINUX_S2DOS05_H +#define __LINUX_S2DOS05_H + +// S2DOS05 registers +// Slave Addr : 0xC0 +enum S2DOS05_reg { + S2DOS05_REG_DEV_ID, + S2DOS05_REG_TOPSYS_STAT, + S2DOS05_REG_STAT, + S2DOS05_REG_EN, + S2DOS05_REG_LDO1_CFG, + S2DOS05_REG_LDO2_CFG, + S2DOS05_REG_LDO3_CFG, + S2DOS05_REG_LDO4_CFG, + S2DOS05_REG_BUCK_CFG, + S2DOS05_REG_BUCK_VOUT, + S2DOS05_REG_IRQ_MASK = 0x0D, + S2DOS05_REG_SSD_TSD = 0x0E, + S2DOS05_REG_OCL = 0x10, + S2DOS05_REG_IRQ = 0x11 +}; + +// S2DOS05 regulator ids +enum S2DOS05_regulators { + S2DOS05_LDO1, + S2DOS05_LDO2, + S2DOS05_LDO3, + S2DOS05_LDO4, + S2DOS05_BUCK1, + S2DOS05_REG_MAX, +}; + +#define S2DOS05_IRQ_PWRMT_MASK BIT(5) +#define S2DOS05_IRQ_TSD_MASK BIT(4) +#define S2DOS05_IRQ_SSD_MASK BIT(3) +#define S2DOS05_IRQ_SCP_MASK BIT(2) +#define S2DOS05_IRQ_UVLO_MASK BIT(1) +#define S2DOS05_IRQ_OCD_MASK BIT(0) + +#define S2DOS05_BUCK_MIN1 506250 +#define S2DOS05_LDO_MIN1 1500000 +#define S2DOS05_LDO_MIN2 2700000 +#define S2DOS05_BUCK_STEP1 6250 +#define S2DOS05_LDO_STEP1 25000 +#define S2DOS05_LDO_VSEL_MASK 0x7F +#define S2DOS05_LDO_FD_MASK 0x80 +#define S2DOS05_BUCK_VSEL_MASK 0xFF +#define S2DOS05_BUCK_FD_MASK 0x08 + +#define S2DOS05_ENABLE_MASK_L1 BIT(0) +#define S2DOS05_ENABLE_MASK_L2 BIT(1) +#define S2DOS05_ENABLE_MASK_L3 BIT(2) +#define S2DOS05_ENABLE_MASK_L4 BIT(3) +#define S2DOS05_ENABLE_MASK_B1 BIT(4) + +#define S2DOS05_RAMP_DELAY 12000 + +#define S2DOS05_ENABLE_TIME_LDO 50 +#define S2DOS05_ENABLE_TIME_BUCK 350 + +#define S2DOS05_LDO_N_VOLTAGES (S2DOS05_LDO_VSEL_MASK + 1) +#define S2DOS05_BUCK_N_VOLTAGES (S2DOS05_BUCK_VSEL_MASK + 1) + +#define S2DOS05_REGULATOR_MAX (S2DOS05_REG_MAX) + +#endif // __LINUX_S2DOS05_H From a54ef14188519a0994d0264f701f5771815fa11e Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 7 Aug 2025 16:44:57 -0500 Subject: [PATCH 0058/3206] regulator: dt-bindings: Clean-up active-semi,act8945a duplication The active-semi,act8945a binding is documented in multiple places. The charger child node is documented in regulator/active-semi,act8945a.yaml and power/supply/active-semi,act8945a-charger.yaml. An old text binding is in mfd/act8945a.txt. Update the regulator/active-semi,act8945a.yaml with the additional descriptions and constraints from power/supply/active-semi,act8945a-charger.yaml, and then remove it and mfd/act8945a.txt. 
Signed-off-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250807214459.4173892-1-robh@kernel.org Signed-off-by: Mark Brown --- .../devicetree/bindings/mfd/act8945a.txt | 82 ------------------- .../supply/active-semi,act8945a-charger.yaml | 76 ----------------- .../regulator/active-semi,act8945a.yaml | 25 ++++-- 3 files changed, 19 insertions(+), 164 deletions(-) delete mode 100644 Documentation/devicetree/bindings/mfd/act8945a.txt delete mode 100644 Documentation/devicetree/bindings/power/supply/active-semi,act8945a-charger.yaml diff --git a/Documentation/devicetree/bindings/mfd/act8945a.txt b/Documentation/devicetree/bindings/mfd/act8945a.txt deleted file mode 100644 index 5ca75d888b4a5b..00000000000000 --- a/Documentation/devicetree/bindings/mfd/act8945a.txt +++ /dev/null @@ -1,82 +0,0 @@ -Device-Tree bindings for Active-semi ACT8945A MFD driver - -Required properties: - - compatible: "active-semi,act8945a". - - reg: the I2C slave address for the ACT8945A chip - -The chip exposes two subdevices: - - a regulators: see ../regulator/act8945a-regulator.txt - - a charger: see ../power/act8945a-charger.txt - -Example: - pmic@5b { - compatible = "active-semi,act8945a"; - reg = <0x5b>; - - active-semi,vsel-high; - - regulators { - vdd_1v35_reg: REG_DCDC1 { - regulator-name = "VDD_1V35"; - regulator-min-microvolt = <1350000>; - regulator-max-microvolt = <1350000>; - regulator-always-on; - }; - - vdd_1v2_reg: REG_DCDC2 { - regulator-name = "VDD_1V2"; - regulator-min-microvolt = <1100000>; - regulator-max-microvolt = <1300000>; - regulator-always-on; - }; - - vdd_3v3_reg: REG_DCDC3 { - regulator-name = "VDD_3V3"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - regulator-always-on; - }; - - vdd_fuse_reg: REG_LDO1 { - regulator-name = "VDD_FUSE"; - regulator-min-microvolt = <2500000>; - regulator-max-microvolt = <2500000>; - regulator-always-on; - }; - - vdd_3v3_lp_reg: REG_LDO2 { - regulator-name = "VDD_3V3_LP"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - regulator-always-on; - }; - - vdd_led_reg: REG_LDO3 { - regulator-name = "VDD_LED"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - regulator-always-on; - }; - - vdd_sdhc_1v8_reg: REG_LDO4 { - regulator-name = "VDD_SDHC_1V8"; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <1800000>; - regulator-always-on; - }; - }; - - charger { - compatible = "active-semi,act8945a-charger"; - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_charger_chglev &pinctrl_charger_lbo &pinctrl_charger_irq>; - interrupt-parent = <&pioA>; - interrupts = <45 IRQ_TYPE_LEVEL_LOW>; - - active-semi,chglev-gpios = <&pioA 12 GPIO_ACTIVE_HIGH>; - active-semi,lbo-gpios = <&pioA 72 GPIO_ACTIVE_LOW>; - active-semi,input-voltage-threshold-microvolt = <6600>; - active-semi,precondition-timeout = <40>; - active-semi,total-timeout = <3>; - }; - }; diff --git a/Documentation/devicetree/bindings/power/supply/active-semi,act8945a-charger.yaml b/Documentation/devicetree/bindings/power/supply/active-semi,act8945a-charger.yaml deleted file mode 100644 index 5220d9cb16d880..00000000000000 --- a/Documentation/devicetree/bindings/power/supply/active-semi,act8945a-charger.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -%YAML 1.2 ---- -$id: http://devicetree.org/schemas/power/supply/active-semi,act8945a-charger.yaml# -$schema: http://devicetree.org/meta-schemas/core.yaml# - -title: Active-semi ACT8945A Charger Function - -maintainers: - - Sebastian 
Reichel - -allOf: - - $ref: power-supply.yaml# - -properties: - compatible: - const: active-semi,act8945a-charger - - interrupts: - maxItems: 1 - - active-semi,chglev-gpios: - maxItems: 1 - description: charge current level GPIO - - active-semi,lbo-gpios: - maxItems: 1 - description: low battery voltage detect GPIO - - active-semi,input-voltage-threshold-microvolt: - description: | - Specifies the charger's input over-voltage threshold value. - Despite the name, specified values are in millivolt (mV). - Defaults to 6.6 V - enum: [ 6600, 7000, 7500, 8000 ] - - active-semi,precondition-timeout: - $ref: /schemas/types.yaml#/definitions/uint32 - description: | - Specifies the charger's PRECONDITION safety timer setting value in minutes. - If 0, it means to disable this timer. - Defaults to 40 minutes. - enum: [ 0, 40, 60, 80 ] - - active-semi,total-timeout: - $ref: /schemas/types.yaml#/definitions/uint32 - description: | - Specifies the charger's total safety timer setting value in hours; - If 0, it means to disable this timer; - Defaults to 3 hours. - enum: [ 0, 3, 4, 5 ] - -required: - - compatible - - interrupts - - active-semi,chglev-gpios - - active-semi,lbo-gpios - -additionalProperties: false - -examples: - - | - #include - #include - pmic { - charger { - compatible = "active-semi,act8945a-charger"; - interrupt-parent = <&pioA>; - interrupts = <45 IRQ_TYPE_LEVEL_LOW>; - active-semi,chglev-gpios = <&pioA 12 GPIO_ACTIVE_HIGH>; - active-semi,lbo-gpios = <&pioA 72 GPIO_ACTIVE_LOW>; - active-semi,input-voltage-threshold-microvolt = <6600>; - active-semi,precondition-timeout = <40>; - active-semi,total-timeout = <3>; - }; - }; diff --git a/Documentation/devicetree/bindings/regulator/active-semi,act8945a.yaml b/Documentation/devicetree/bindings/regulator/active-semi,act8945a.yaml index bdf3f7d34ef51b..a8d579844dc7bc 100644 --- a/Documentation/devicetree/bindings/regulator/active-semi,act8945a.yaml +++ b/Documentation/devicetree/bindings/regulator/active-semi,act8945a.yaml @@ -91,28 +91,41 @@ properties: maxItems: 1 active-semi,chglev-gpios: - description: CGHLEV GPIO + description: charge current level GPIO maxItems: 1 active-semi,lbo-gpios: - description: LBO GPIO + description: low battery voltage detect GPIO maxItems: 1 active-semi,input-voltage-threshold-microvolt: - description: Input voltage threshold - maxItems: 1 + description: + Specifies the charger's input over-voltage threshold value. Despite + the name, specified values are in millivolt (mV). + enum: [ 6600, 7000, 7500, 8000 ] + default: 6600 active-semi,precondition-timeout: - description: Precondition timeout + description: + Specifies the charger's PRECONDITION safety timer setting value in + minutes. If 0, it means to disable this timer. + enum: [ 0, 40, 60, 80 ] + default: 40 $ref: /schemas/types.yaml#/definitions/uint32 active-semi,total-timeout: - description: Total timeout + description: + Specifies the charger's total safety timer setting value in hours; If + 0, it means to disable this timer; + enum: [ 0, 3, 4, 5 ] + default: 3 $ref: /schemas/types.yaml#/definitions/uint32 required: - compatible - interrupts + - active-semi,chglev-gpios + - active-semi,lbo-gpios additionalProperties: false From 181fe022ecf8a8e85def0e94852c631c59a8b3f6 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:44 +0200 Subject: [PATCH 0059/3206] gpiolib: add support to register sparse pin range Add support to register for GPIO<->pin mapping using a list of non consecutive pins. 
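For illustration, a hypothetical caller of the new gpiochip_add_sparse_pin_range() helper introduced below could map GPIOs 0..2 to three scattered pins:

	/* hypothetical driver code; pin numbers made up for illustration */
	static const unsigned int fwd_pins[] = { 2, 5, 11 };

	ret = gpiochip_add_sparse_pin_range(gc, dev_name(pctldev->dev),
					    0, fwd_pins, ARRAY_SIZE(fwd_pins));
	if (ret)
		return ret;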
The core already supports sparse pin ranges (the pins member of struct pinctrl_gpio_range), but it was not possible to register one. If pins is not NULL the core uses it; otherwise it assumes that a consecutive pin range was registered and uses pin_base. The function gpiochip_add_pin_range(), which allocates and fills the struct pinctrl_gpio_range, was renamed to gpiochip_add_pin_range_with_pins() and the pins parameter was added. Two new functions were added, gpiochip_add_pin_range() and gpiochip_add_sparse_pin_range(), to register a consecutive or a sparse pin range. Both use gpiochip_add_pin_range_with_pins(). Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-1-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 29 ++++++++++++++------- include/linux/gpio/driver.h | 51 ++++++++++++++++++++++++++++++++++--- 2 files changed, 68 insertions(+), 12 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 0d2b470a252eeb..98d2fa60249056 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2349,11 +2349,13 @@ int gpiochip_add_pingroup_range(struct gpio_chip *gc, EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); /** - * gpiochip_add_pin_range() - add a range for GPIO <-> pin mapping + * gpiochip_add_pin_range_with_pins() - add a range for GPIO <-> pin mapping * @gc: the gpiochip to add the range for * @pinctl_name: the dev_name() of the pin controller to map to * @gpio_offset: the start offset in the current gpio_chip number space * @pin_offset: the start offset in the pin controller number space + * @pins: the list of non-consecutive pins to accumulate in this range (if not + * NULL, pin_offset is ignored by the pinctrl core) * @npins: the number of pins from the offset of each pin space (GPIO and * pin controller) to accumulate in this range * @@ -2365,9 +2367,12 @@ EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); * Returns: * 0 on success, or a negative errno on failure. */ -int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, - unsigned int gpio_offset, unsigned int pin_offset, - unsigned int npins) +int gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int const *pins, + unsigned int npins) { struct gpio_pin_range *pin_range; struct gpio_device *gdev = gc->gpiodev; @@ -2385,6 +2390,7 @@ int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, pin_range->range.name = gc->label; pin_range->range.base = gdev->base + gpio_offset; pin_range->range.pin_base = pin_offset; + pin_range->range.pins = pins; pin_range->range.npins = npins; pin_range->pctldev = pinctrl_find_and_add_gpio_range(pinctl_name, &pin_range->range); @@ -2394,16 +2400,21 @@ int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, kfree(pin_range); return ret; } - chip_dbg(gc, "created GPIO range %d->%d ==> %s PIN %d->%d\n", - gpio_offset, gpio_offset + npins - 1, - pinctl_name, - pin_offset, pin_offset + npins - 1); + if (pin_range->range.pins) + chip_dbg(gc, "created GPIO range %d->%d ==> %s %d sparse PIN range { %d, ... 
}", + gpio_offset, gpio_offset + npins - 1, + pinctl_name, npins, pins[0]); + else + chip_dbg(gc, "created GPIO range %d->%d ==> %s PIN %d->%d\n", + gpio_offset, gpio_offset + npins - 1, + pinctl_name, + pin_offset, pin_offset + npins - 1); list_add_tail(&pin_range->node, &gdev->pin_ranges); return 0; } -EXPORT_SYMBOL_GPL(gpiochip_add_pin_range); +EXPORT_SYMBOL_GPL(gpiochip_add_pin_range_with_pins); /** * gpiochip_remove_pin_ranges() - remove all the GPIO <-> pin mappings diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 667f8fd58a793f..9fcd4a988081f7 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -772,16 +772,50 @@ struct gpio_pin_range { #ifdef CONFIG_PINCTRL -int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, - unsigned int gpio_offset, unsigned int pin_offset, - unsigned int npins); +int gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int const *pins, + unsigned int npins); int gpiochip_add_pingroup_range(struct gpio_chip *gc, struct pinctrl_dev *pctldev, unsigned int gpio_offset, const char *pin_group); void gpiochip_remove_pin_ranges(struct gpio_chip *gc); +static inline int +gpiochip_add_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int npins) +{ + return gpiochip_add_pin_range_with_pins(gc, pinctl_name, gpio_offset, + pin_offset, NULL, npins); +} + +static inline int +gpiochip_add_sparse_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int const *pins, + unsigned int npins) +{ + return gpiochip_add_pin_range_with_pins(gc, pinctl_name, gpio_offset, 0, + pins, npins); +} #else /* ! CONFIG_PINCTRL */ +static inline int +gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int npins) +{ + return 0; +} + static inline int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, @@ -789,6 +823,17 @@ gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, { return 0; } + +static inline int +gpiochip_add_sparse_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int const *pins, + unsigned int npins) +{ + return 0; +} + static inline int gpiochip_add_pingroup_range(struct gpio_chip *gc, struct pinctrl_dev *pctldev, From 871c7cd54830c0bda15513238ea9d46fc1cae991 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:45 +0200 Subject: [PATCH 0060/3206] gpio: aggregator: move GPIO forwarder allocation in a dedicated function Move the GPIO forwarder allocation and static initialization in a dedicated function. 
Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-2-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 50 ++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index af9d8b3a711dad..75c8d0deef2fe2 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -475,6 +475,35 @@ static int gpiochip_fwd_setup_delay_line(struct device *dev, struct gpio_chip *c } #endif /* !CONFIG_OF_GPIO */ +static struct gpiochip_fwd * +devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios) +{ + struct gpiochip_fwd *fwd; + struct gpio_chip *chip; + + fwd = devm_kzalloc(dev, struct_size(fwd, tmp, fwd_tmp_size(ngpios)), GFP_KERNEL); + if (!fwd) + return ERR_PTR(-ENOMEM); + + chip = &fwd->chip; + + chip->label = dev_name(dev); + chip->parent = dev; + chip->owner = THIS_MODULE; + chip->get_direction = gpio_fwd_get_direction; + chip->direction_input = gpio_fwd_direction_input; + chip->direction_output = gpio_fwd_direction_output; + chip->get = gpio_fwd_get; + chip->get_multiple = gpio_fwd_get_multiple_locked; + chip->set = gpio_fwd_set; + chip->set_multiple = gpio_fwd_set_multiple_locked; + chip->to_irq = gpio_fwd_to_irq; + chip->base = -1; + chip->ngpio = ngpios; + + return fwd; +} + /** * gpiochip_fwd_create() - Create a new GPIO forwarder * @dev: Parent device pointer @@ -495,16 +524,14 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, struct gpio_desc *descs[], unsigned long features) { - const char *label = dev_name(dev); struct gpiochip_fwd *fwd; struct gpio_chip *chip; unsigned int i; int error; - fwd = devm_kzalloc(dev, struct_size(fwd, tmp, fwd_tmp_size(ngpios)), - GFP_KERNEL); - if (!fwd) - return ERR_PTR(-ENOMEM); + fwd = devm_gpiochip_fwd_alloc(dev, ngpios); + if (IS_ERR(fwd)) + return fwd; chip = &fwd->chip; @@ -526,19 +553,6 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, chip->set_config = gpio_fwd_set_config; } - chip->label = label; - chip->parent = dev; - chip->owner = THIS_MODULE; - chip->get_direction = gpio_fwd_get_direction; - chip->direction_input = gpio_fwd_direction_input; - chip->direction_output = gpio_fwd_direction_output; - chip->get = gpio_fwd_get; - chip->get_multiple = gpio_fwd_get_multiple_locked; - chip->set = gpio_fwd_set; - chip->set_multiple = gpio_fwd_set_multiple_locked; - chip->to_irq = gpio_fwd_to_irq; - chip->base = -1; - chip->ngpio = ngpios; fwd->descs = descs; if (chip->can_sleep) From c44ce91b8ada680074aa976e61fcef5633e6f086 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:46 +0200 Subject: [PATCH 0061/3206] gpio: aggregator: refactor the code to add a GPIO desc to the forwarder Create a dedicated function to add a GPIO desc to the forwarder. Instead of saving a pointer to the GPIO descs array, the GPIO descs are now passed one by one to the forwarder, which registers them in its own array. So after the call to gpiochip_fwd_create(), the passed array can be freed. 
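Schematically, the caller-side lifetime after this change becomes (a simplified, hypothetical snippet; error paths trimmed):

	struct gpio_desc **descs;
	struct gpiochip_fwd *fwd;

	descs = kcalloc(ngpios, sizeof(*descs), GFP_KERNEL);
	if (!descs)
		return -ENOMEM;
	/* ... populate descs[0..ngpios-1] ... */

	fwd = gpiochip_fwd_create(dev, ngpios, descs, features);
	kfree(descs);	/* now safe: the forwarder keeps its own copy */
	if (IS_ERR(fwd))
		return PTR_ERR(fwd);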
Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-3-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 59 +++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index 75c8d0deef2fe2..43e488ee838344 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -485,6 +485,10 @@ devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios) if (!fwd) return ERR_PTR(-ENOMEM); + fwd->descs = devm_kcalloc(dev, ngpios, sizeof(*fwd->descs), GFP_KERNEL); + if (!fwd->descs) + return ERR_PTR(-ENOMEM); + chip = &fwd->chip; chip->label = dev_name(dev); @@ -504,13 +508,43 @@ devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios) return fwd; } +static int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, + struct gpio_desc *desc, + unsigned int offset) +{ + struct gpio_chip *parent = gpiod_to_chip(desc); + struct gpio_chip *chip = &fwd->chip; + + if (offset > chip->ngpio) + return -EINVAL; + + /* + * If any of the GPIO lines are sleeping, then the entire forwarder + * will be sleeping. + * If any of the chips support .set_config(), then the forwarder will + * support setting configs. + */ + if (gpiod_cansleep(desc)) + chip->can_sleep = true; + + if (parent && parent->set_config) + chip->set_config = gpio_fwd_set_config; + + fwd->descs[offset] = desc; + + dev_dbg(chip->parent, "%u => gpio %d irq %d\n", offset, + desc_to_gpio(desc), gpiod_to_irq(desc)); + + return 0; +} + /** * gpiochip_fwd_create() - Create a new GPIO forwarder * @dev: Parent device pointer * @ngpios: Number of GPIOs in the forwarder. * @descs: Array containing the GPIO descriptors to forward to. - * This array must contain @ngpios entries, and must not be deallocated - * before the forwarder has been destroyed again. + * This array must contain @ngpios entries, and can be deallocated + * as the forwarder has its own array. * @features: Bitwise ORed features as defined with FWD_FEATURE_*. * * This function creates a new gpiochip, which forwards all GPIO operations to @@ -535,26 +569,12 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, chip = &fwd->chip; - /* - * If any of the GPIO lines are sleeping, then the entire forwarder - * will be sleeping. - * If any of the chips support .set_config(), then the forwarder will - * support setting configs. 
- */ for (i = 0; i < ngpios; i++) { - struct gpio_chip *parent = gpiod_to_chip(descs[i]); - - dev_dbg(dev, "%u => gpio %d irq %d\n", i, - desc_to_gpio(descs[i]), gpiod_to_irq(descs[i])); - - if (gpiod_cansleep(descs[i])) - chip->can_sleep = true; - if (parent && parent->set_config) - chip->set_config = gpio_fwd_set_config; + error = gpiochip_fwd_desc_add(fwd, descs[i], i); + if (error) + return ERR_PTR(error); } - fwd->descs = descs; - if (chip->can_sleep) mutex_init(&fwd->mlock); else @@ -1348,6 +1368,7 @@ static int gpio_aggregator_probe(struct platform_device *pdev) return PTR_ERR(fwd); platform_set_drvdata(pdev, fwd); + devm_kfree(dev, descs); return 0; } From 10d022efe2c4bb2020a07b2e4d94b658ce30aca4 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:47 +0200 Subject: [PATCH 0062/3206] gpio: aggregator: refactor the forwarder registration part Add a new function gpiochip_fwd_register(), which finalizes the initialization of the forwarder and registers the corresponding gpiochip. Reviewed-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-4-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index 43e488ee838344..95415f92843606 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -538,6 +538,18 @@ static int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, return 0; } +static int gpiochip_fwd_register(struct gpiochip_fwd *fwd) +{ + struct gpio_chip *chip = &fwd->chip; + + if (chip->can_sleep) + mutex_init(&fwd->mlock); + else + spin_lock_init(&fwd->slock); + + return devm_gpiochip_add_data(chip->parent, chip, fwd); +} + /** * gpiochip_fwd_create() - Create a new GPIO forwarder * @dev: Parent device pointer @@ -575,18 +587,13 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, return ERR_PTR(error); } - if (chip->can_sleep) - mutex_init(&fwd->mlock); - else - spin_lock_init(&fwd->slock); - if (features & FWD_FEATURE_DELAY) { error = gpiochip_fwd_setup_delay_line(dev, chip, fwd); if (error) return ERR_PTR(error); } - error = devm_gpiochip_add_data(dev, chip, fwd); + error = gpiochip_fwd_register(fwd); if (error) return ERR_PTR(error); From b94cf35db606453c64e5da13c56cc6f8cabc6a33 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:48 +0200 Subject: [PATCH 0063/3206] gpio: aggregator: update gpiochip_fwd_setup_delay_line() parameters Remove useless parameters of gpiochip_fwd_setup_delay_line(). 
Reviewed-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-5-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index 95415f92843606..cc54d19f2dd113 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -453,10 +453,11 @@ static int gpiochip_fwd_delay_of_xlate(struct gpio_chip *chip, return line; } -static int gpiochip_fwd_setup_delay_line(struct device *dev, struct gpio_chip *chip, - struct gpiochip_fwd *fwd) +static int gpiochip_fwd_setup_delay_line(struct gpiochip_fwd *fwd) { - fwd->delay_timings = devm_kcalloc(dev, chip->ngpio, + struct gpio_chip *chip = &fwd->chip; + + fwd->delay_timings = devm_kcalloc(chip->parent, chip->ngpio, sizeof(*fwd->delay_timings), GFP_KERNEL); if (!fwd->delay_timings) @@ -468,8 +469,7 @@ static int gpiochip_fwd_setup_delay_line(struct device *dev, struct gpio_chip *c return 0; } #else -static int gpiochip_fwd_setup_delay_line(struct device *dev, struct gpio_chip *chip, - struct gpiochip_fwd *fwd) +static int gpiochip_fwd_setup_delay_line(struct gpiochip_fwd *fwd) { return 0; } @@ -571,7 +571,6 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, unsigned long features) { struct gpiochip_fwd *fwd; - struct gpio_chip *chip; unsigned int i; int error; @@ -579,8 +578,6 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, if (IS_ERR(fwd)) return fwd; - chip = &fwd->chip; - for (i = 0; i < ngpios; i++) { error = gpiochip_fwd_desc_add(fwd, descs[i], i); if (error) @@ -588,7 +585,7 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, } if (features & FWD_FEATURE_DELAY) { - error = gpiochip_fwd_setup_delay_line(dev, chip, fwd); + error = gpiochip_fwd_setup_delay_line(fwd); if (error) return ERR_PTR(error); } From 6e986f8852f56cf9214ea2ec02b4b432e201d02c Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:49 +0200 Subject: [PATCH 0064/3206] gpio: aggregator: export symbols of the GPIO forwarder library Export all symbols and create header file for the GPIO forwarder library. It will be used in the next changes. 
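Because the symbols are exported with EXPORT_SYMBOL_NS_GPL() in the "GPIO_FORWARDER" namespace, a consumer module must import that namespace. A rough usage sketch (hypothetical consumer; names invented):

	#include <linux/gpio/forwarder.h>

	MODULE_IMPORT_NS("GPIO_FORWARDER");

	static int foo_fwd_setup(struct device *dev, struct gpio_desc **descs,
				 unsigned int ngpios)
	{
		struct gpiochip_fwd *fwd;
		unsigned int i;
		int ret;

		fwd = devm_gpiochip_fwd_alloc(dev, ngpios);
		if (IS_ERR(fwd))
			return PTR_ERR(fwd);

		for (i = 0; i < ngpios; i++) {
			ret = gpiochip_fwd_desc_add(fwd, descs[i], i);
			if (ret)
				return ret;
		}

		return gpiochip_fwd_register(fwd);
	}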
Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-6-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 202 ++++++++++++++++++++++++++++++++- include/linux/gpio/forwarder.h | 37 ++++++ 2 files changed, 233 insertions(+), 6 deletions(-) create mode 100644 include/linux/gpio/forwarder.h diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index cc54d19f2dd113..c1952b28b7b7f9 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include +#include #include #include "dev-sync-probe.h" @@ -475,8 +477,180 @@ static int gpiochip_fwd_setup_delay_line(struct gpiochip_fwd *fwd) } #endif /* !CONFIG_OF_GPIO */ -static struct gpiochip_fwd * -devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios) +/** + * gpiochip_fwd_get_gpiochip - Get the GPIO chip for the GPIO forwarder + * @fwd: GPIO forwarder + * + * Returns: The GPIO chip for the GPIO forwarder + */ +struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd) +{ + return &fwd->chip; +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_gpiochip, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get_direction - Return the current direction of a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: 0 for output, 1 for input, or an error code in case of error. + */ +int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get_direction(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get_direction, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_direction_output - Set a GPIO forwarder line direction to + * output + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @value: value to set + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_direction_output(struct gpiochip_fwd *fwd, unsigned int offset, + int value) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_direction_output(gc, offset, value); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_direction_output, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_direction_input - Set a GPIO forwarder line direction to input + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_direction_input(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_direction_input, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get - Return a GPIO forwarder line's value + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: The GPIO's logical value, i.e. taking the ACTIVE_LOW status into + * account, or negative errno on failure. 
+ */ +int gpiochip_fwd_gpio_get(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get_multiple - Get values for multiple GPIO forwarder lines + * @fwd: GPIO forwarder + * @mask: bit mask array; one bit per line; BITS_PER_LONG bits per word defines + * which lines are to be read + * @bits: bit value array; one bit per line; BITS_PER_LONG bits per word will + * contains the read values for the lines specified by mask + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_get_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, + unsigned long *bits) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get_multiple_locked(gc, mask, bits); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get_multiple, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set - Assign value to a GPIO forwarder line. + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @value: value to set + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_set(struct gpiochip_fwd *fwd, unsigned int offset, int value) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set(gc, offset, value); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set_multiple - Assign values to multiple GPIO forwarder lines + * @fwd: GPIO forwarder + * @mask: bit mask array; one bit per output; BITS_PER_LONG bits per word + * defines which outputs are to be changed + * @bits: bit value array; one bit per output; BITS_PER_LONG bits per word + * defines the values the outputs specified by mask are to be set to + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_set_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, + unsigned long *bits) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set_multiple_locked(gc, mask, bits); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set_multiple, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set_config - Set @config for a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @config: Same packed config format as generic pinconf + * + * Returns: 0 on success, %-ENOTSUPP if the controller doesn't support setting + * the configuration. + */ +int gpiochip_fwd_gpio_set_config(struct gpiochip_fwd *fwd, unsigned int offset, + unsigned long config) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set_config(gc, offset, config); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set_config, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_to_irq - Return the IRQ corresponding to a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: The Linux IRQ corresponding to the passed line, or an error code in + * case of error. 
+ */ +int gpiochip_fwd_gpio_to_irq(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_to_irq(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_to_irq, "GPIO_FORWARDER"); + +/** + * devm_gpiochip_fwd_alloc - Allocate and initialize a new GPIO forwarder + * @dev: Parent device pointer + * @ngpios: Number of GPIOs in the forwarder + * + * Returns: An opaque object pointer, or an ERR_PTR()-encoded negative error + * code on failure. + */ +struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, + unsigned int ngpios) { struct gpiochip_fwd *fwd; struct gpio_chip *chip; @@ -507,10 +681,18 @@ devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios) return fwd; } +EXPORT_SYMBOL_NS_GPL(devm_gpiochip_fwd_alloc, "GPIO_FORWARDER"); -static int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, - struct gpio_desc *desc, - unsigned int offset) +/** + * gpiochip_fwd_desc_add - Add a GPIO desc in the forwarder + * @fwd: GPIO forwarder + * @desc: GPIO descriptor to register + * @offset: offset for the GPIO in the forwarder + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, + unsigned int offset) { struct gpio_chip *parent = gpiod_to_chip(desc); struct gpio_chip *chip = &fwd->chip; @@ -537,8 +719,15 @@ static int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, return 0; } +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_add, "GPIO_FORWARDER"); -static int gpiochip_fwd_register(struct gpiochip_fwd *fwd) +/** + * gpiochip_fwd_register - Register a GPIO forwarder + * @fwd: GPIO forwarder + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_register(struct gpiochip_fwd *fwd) { struct gpio_chip *chip = &fwd->chip; @@ -549,6 +738,7 @@ static int gpiochip_fwd_register(struct gpiochip_fwd *fwd) return devm_gpiochip_add_data(chip->parent, chip, fwd); } +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_register, "GPIO_FORWARDER"); /** * gpiochip_fwd_create() - Create a new GPIO forwarder diff --git a/include/linux/gpio/forwarder.h b/include/linux/gpio/forwarder.h new file mode 100644 index 00000000000000..e21a1b7b190525 --- /dev/null +++ b/include/linux/gpio/forwarder.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_GPIO_FORWARDER_H +#define __LINUX_GPIO_FORWARDER_H + +struct gpio_desc; +struct gpio_chip; +struct gpiochip_fwd; + +struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, + unsigned int ngpios); +int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, + struct gpio_desc *desc, unsigned int offset); +int gpiochip_fwd_register(struct gpiochip_fwd *fwd); + +struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd); + +int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, + unsigned int offset); +int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, + unsigned int offset); +int gpiochip_fwd_gpio_direction_output(struct gpiochip_fwd *fwd, + unsigned int offset, + int value); +int gpiochip_fwd_gpio_get(struct gpiochip_fwd *fwd, unsigned int offset); +int gpiochip_fwd_gpio_get_multiple(struct gpiochip_fwd *fwd, + unsigned long *mask, + unsigned long *bits); +int gpiochip_fwd_gpio_set(struct gpiochip_fwd *fwd, unsigned int offset, + int value); +int gpiochip_fwd_gpio_set_multiple(struct gpiochip_fwd *fwd, + unsigned long *mask, + unsigned long *bits); +int gpiochip_fwd_gpio_set_config(struct gpiochip_fwd *fwd, unsigned int offset, + unsigned long config); +int 
gpiochip_fwd_gpio_to_irq(struct gpiochip_fwd *fwd, unsigned int offset); + +#endif From b31c68fd851e74526ad963362ea205eb97b9a710 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:50 +0200 Subject: [PATCH 0065/3206] gpio: aggregator: handle runtime registration of gpio_desc in gpiochip_fwd Add a request() callback to check that the GPIO descriptor was registered in the gpiochip_fwd before using it. This is done to handle the case where a GPIO descriptor is added to the forwarder at runtime. If at least one GPIO descriptor was not added before the forwarder registration, we assume the forwarder can sleep, since a GPIO added at runtime may sleep. Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-7-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 63 ++++++++++++++++++++++++++++++---- include/linux/gpio/forwarder.h | 2 ++ 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index c1952b28b7b7f9..f0d38d76cf73fc 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -246,6 +246,7 @@ struct gpiochip_fwd { spinlock_t slock; /* protects tmp[] if !can_sleep */ }; struct gpiochip_fwd_timing *delay_timings; + unsigned long *valid_mask; unsigned long tmp[]; /* values and descs for multiple ops */ }; @@ -254,10 +255,24 @@ struct gpiochip_fwd { #define fwd_tmp_size(ngpios) (BITS_TO_LONGS((ngpios)) + (ngpios)) +static int gpio_fwd_request(struct gpio_chip *chip, unsigned int offset) +{ + struct gpiochip_fwd *fwd = gpiochip_get_data(chip); + + return test_bit(offset, fwd->valid_mask) ? 0 : -ENODEV; +} + static int gpio_fwd_get_direction(struct gpio_chip *chip, unsigned int offset) { struct gpiochip_fwd *fwd = gpiochip_get_data(chip); + /* + * get_direction() is called during gpiochip registration; return + * -ENODEV if there is no GPIO desc for the line. + */ + if (!test_bit(offset, fwd->valid_mask)) + return -ENODEV; + return gpiod_get_direction(fwd->descs[offset]); } @@ -489,6 +504,21 @@ struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd) } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_gpiochip, "GPIO_FORWARDER"); +/** + * gpiochip_fwd_gpio_request - Request a line of the GPIO forwarder + * @fwd: GPIO forwarder + * @offset: the offset of the line to request + * + * Returns: 0 on success, or negative errno on failure. 
+ */ +int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_request(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_request, "GPIO_FORWARDER"); + /** * gpiochip_fwd_gpio_get_direction - Return the current direction of a GPIO forwarder line * @fwd: GPIO forwarder @@ -663,11 +693,16 @@ struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, if (!fwd->descs) return ERR_PTR(-ENOMEM); + fwd->valid_mask = devm_bitmap_zalloc(dev, ngpios, GFP_KERNEL); + if (!fwd->valid_mask) + return ERR_PTR(-ENOMEM); + chip = &fwd->chip; chip->label = dev_name(dev); chip->parent = dev; chip->owner = THIS_MODULE; + chip->request = gpio_fwd_request; chip->get_direction = gpio_fwd_get_direction; chip->direction_input = gpio_fwd_direction_input; chip->direction_output = gpio_fwd_direction_output; @@ -694,24 +729,21 @@ EXPORT_SYMBOL_NS_GPL(devm_gpiochip_fwd_alloc, "GPIO_FORWARDER"); int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, unsigned int offset) { - struct gpio_chip *parent = gpiod_to_chip(desc); struct gpio_chip *chip = &fwd->chip; if (offset > chip->ngpio) return -EINVAL; + if (test_and_set_bit(offset, fwd->valid_mask)) + return -EEXIST; + /* * If any of the GPIO lines are sleeping, then the entire forwarder * will be sleeping. - * If any of the chips support .set_config(), then the forwarder will - * support setting configs. */ if (gpiod_cansleep(desc)) chip->can_sleep = true; - if (parent && parent->set_config) - chip->set_config = gpio_fwd_set_config; - fwd->descs[offset] = desc; dev_dbg(chip->parent, "%u => gpio %d irq %d\n", offset, @@ -721,6 +753,18 @@ int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_add, "GPIO_FORWARDER"); +/** + * gpiochip_fwd_desc_free - Remove a GPIO desc from the forwarder + * @fwd: GPIO forwarder + * @offset: offset of GPIO desc to remove + */ +void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset) +{ + if (test_and_clear_bit(offset, fwd->valid_mask)) + gpiod_put(fwd->descs[offset]); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_free, "GPIO_FORWARDER"); + /** * gpiochip_fwd_register - Register a GPIO forwarder * @fwd: GPIO forwarder @@ -731,6 +775,13 @@ int gpiochip_fwd_register(struct gpiochip_fwd *fwd) { struct gpio_chip *chip = &fwd->chip; + /* + * Some gpio_desc were not registered. They will be registered at runtime + * but we have to suppose they can sleep. 
+ */ + if (!bitmap_full(fwd->valid_mask, chip->ngpio)) + chip->can_sleep = true; + if (chip->can_sleep) mutex_init(&fwd->mlock); else diff --git a/include/linux/gpio/forwarder.h b/include/linux/gpio/forwarder.h index e21a1b7b190525..45e0190308f09f 100644 --- a/include/linux/gpio/forwarder.h +++ b/include/linux/gpio/forwarder.h @@ -10,10 +10,12 @@ struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios); int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, unsigned int offset); +void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_register(struct gpiochip_fwd *fwd); struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd); +int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, From 60e92c1009c7c6abd4a9d0caf33a8cba5d09f67c Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:51 +0200 Subject: [PATCH 0066/3206] gpio: aggregator: add possibility to attach data to the forwarder Add a data pointer to store private data in the forwarder. Reviewed-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-8-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 20 ++++++++++++++++++-- include/linux/gpio/forwarder.h | 4 +++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index f0d38d76cf73fc..fb3694d581d1ed 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -246,6 +246,7 @@ struct gpiochip_fwd { spinlock_t slock; /* protects tmp[] if !can_sleep */ }; struct gpiochip_fwd_timing *delay_timings; + void *data; unsigned long *valid_mask; unsigned long tmp[]; /* values and descs for multiple ops */ }; @@ -504,6 +505,18 @@ struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd) } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_gpiochip, "GPIO_FORWARDER"); +/** + * gpiochip_fwd_get_data - Get driver-private data for the GPIO forwarder + * @fwd: GPIO forwarder + * + * Returns: The driver-private data for the GPIO forwarder + */ +void *gpiochip_fwd_get_data(struct gpiochip_fwd *fwd) +{ + return fwd->data; +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_data, "GPIO_FORWARDER"); + /** * gpiochip_fwd_gpio_request - Request a line of the GPIO forwarder * @fwd: GPIO forwarder @@ -768,10 +781,11 @@ EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_free, "GPIO_FORWARDER"); /** * gpiochip_fwd_register - Register a GPIO forwarder * @fwd: GPIO forwarder + * @data: driver-private data associated with this forwarder * * Returns: 0 on success, or negative errno on failure. 
*/ -int gpiochip_fwd_register(struct gpiochip_fwd *fwd) +int gpiochip_fwd_register(struct gpiochip_fwd *fwd, void *data) { struct gpio_chip *chip = &fwd->chip; @@ -787,6 +801,8 @@ int gpiochip_fwd_register(struct gpiochip_fwd *fwd) else spin_lock_init(&fwd->slock); + fwd->data = data; + return devm_gpiochip_add_data(chip->parent, chip, fwd); } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_register, "GPIO_FORWARDER"); @@ -831,7 +847,7 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, return ERR_PTR(error); } - error = gpiochip_fwd_register(fwd); + error = gpiochip_fwd_register(fwd, NULL); if (error) return ERR_PTR(error); diff --git a/include/linux/gpio/forwarder.h b/include/linux/gpio/forwarder.h index 45e0190308f09f..ee5d8355f73557 100644 --- a/include/linux/gpio/forwarder.h +++ b/include/linux/gpio/forwarder.h @@ -11,10 +11,12 @@ struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios); int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, unsigned int offset); void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset); -int gpiochip_fwd_register(struct gpiochip_fwd *fwd); +int gpiochip_fwd_register(struct gpiochip_fwd *fwd, void *data); struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd); +void *gpiochip_fwd_get_data(struct gpiochip_fwd *fwd); + int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, From 53ec9169db1345f04174febb90f88a871fc28d9e Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:52 +0200 Subject: [PATCH 0067/3206] lib/string_choices: Add str_input_output() helper Add the str_input_output() helper to return the 'input' or 'output' string literal. Also add the inverse variant, str_output_input(). Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-9-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- include/linux/string_choices.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index f3ba4f52ff2601..a27c87c954ae0d 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -41,6 +41,12 @@ static inline const char *str_high_low(bool v) } #define str_low_high(v) str_high_low(!(v)) +static inline const char *str_input_output(bool v) +{ + return v ? "input" : "output"; +} +#define str_output_input(v) str_input_output(!(v)) + static inline const char *str_on_off(bool v) { return v ? "on" : "off"; From 8a5a0294f40a50e5be83e9b7ebbc15b546f64e41 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Mon, 4 Aug 2025 21:26:42 +0100 Subject: [PATCH 0068/3206] dt-bindings: clock: renesas,r9a09g077/87: Add USB_CLK clock ID Add the USB clock (USB_CLK) definition for the Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs. USB_CLK is used as the reference clock for the USB PHY layer. 
Signed-off-by: Lad Prabhakar Acked-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250804202643.3967484-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h | 1 + include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h | 1 + 2 files changed, 2 insertions(+) diff --git a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h index 7ecc4f0b235aac..0c2ce81a874487 100644 --- a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h +++ b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h @@ -25,5 +25,6 @@ #define R9A09G077_CLK_PCLKM 13 #define R9A09G077_CLK_PCLKL 14 #define R9A09G077_SDHI_CLKHS 15 +#define R9A09G077_USB_CLK 16 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G077_CPG_H__ */ diff --git a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h index 925e57703925dd..70ee883f2386b4 100644 --- a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h +++ b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h @@ -25,5 +25,6 @@ #define R9A09G087_CLK_PCLKM 13 #define R9A09G087_CLK_PCLKL 14 #define R9A09G087_SDHI_CLKHS 15 +#define R9A09G087_USB_CLK 16 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G087_CPG_H__ */ From 5293e8f2a854344ef9aba2391b44c7a437889ebb Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Fri, 8 Aug 2025 14:30:15 +0100 Subject: [PATCH 0069/3206] dt-bindings: pinctrl: renesas: Document RZ/T2H and RZ/N2H SoCs Document the pin and GPIO controller IP for the Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs, and add the shared DTSI header file used by both the bindings and the driver. The RZ/T2H SoC supports 729 pins, while RZ/N2H supports 576 pins. Both share the same controller architecture; separate compatible strings are added for each SoC to distinguish them. Co-developed-by: Thierry Bultel Signed-off-by: Thierry Bultel Signed-off-by: Lad Prabhakar Reviewed-by: "Rob Herring (Arm)" Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250808133017.2053637-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- .../pinctrl/renesas,r9a09g077-pinctrl.yaml | 172 ++++++++++++++++++ .../pinctrl/renesas,r9a09g077-pinctrl.h | 22 +++ 2 files changed, 194 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml create mode 100644 include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h diff --git a/Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml new file mode 100644 index 00000000000000..36d66597148424 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/renesas,r9a09g077-pinctrl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas RZ/T2H and RZ/N2H Pin and GPIO controller + +maintainers: + - Lad Prabhakar + +description: + The Renesas RZ/T2H and RZ/N2H SoCs feature a combined Pin and GPIO controller. + Pin multiplexing and GPIO configuration are performed on a per-pin basis. + Each port supports up to 8 pins, each configurable for either GPIO (port mode) + or alternate function mode. 
Each pin supports function mode values ranging from + 0x0 to 0x2A, allowing selection from up to 43 different functions. + +properties: + compatible: + enum: + - renesas,r9a09g077-pinctrl # RZ/T2H + - renesas,r9a09g087-pinctrl # RZ/N2H + + reg: + minItems: 1 + items: + - description: Non-safety I/O Port base + - description: Safety I/O Port safety region base + - description: Safety I/O Port Non-safety region base + + reg-names: + minItems: 1 + items: + - const: nsr + - const: srs + - const: srn + + gpio-controller: true + + '#gpio-cells': + const: 2 + description: + The first cell contains the global GPIO port index, constructed using the + RZT2H_GPIO() helper macro from + (e.g. "RZT2H_GPIO(3, 0)" for P03_0). The second cell represents the consumer + flag. Use the macros defined in include/dt-bindings/gpio/gpio.h. + + gpio-ranges: + maxItems: 1 + + clocks: + maxItems: 1 + + power-domains: + maxItems: 1 + +definitions: + renesas-rzt2h-n2h-pins-node: + type: object + allOf: + - $ref: pincfg-node.yaml# + - $ref: pinmux-node.yaml# + properties: + pinmux: + description: + Values are constructed from I/O port number, pin number, and + alternate function configuration number using the RZT2H_PORT_PINMUX() + helper macro from . + pins: true + phandle: true + input: true + input-enable: true + output-enable: true + oneOf: + - required: [pinmux] + - required: [pins] + additionalProperties: false + +patternProperties: + # Grouping nodes: allow multiple "-pins" subnodes within a "-group" + '.*-group$': + type: object + description: + Pin controller client devices can organize pin configuration entries into + grouping nodes ending in "-group". These group nodes may contain multiple + child nodes each ending in "-pins" to configure distinct sets of pins. + additionalProperties: false + patternProperties: + '-pins$': + $ref: '#/definitions/renesas-rzt2h-n2h-pins-node' + + # Standalone "-pins" nodes under client devices or groups + '-pins$': + $ref: '#/definitions/renesas-rzt2h-n2h-pins-node' + + '-hog$': + type: object + description: GPIO hog node + properties: + gpio-hog: true + gpios: true + input: true + output-high: true + output-low: true + line-name: true + required: + - gpio-hog + - gpios + additionalProperties: false + +allOf: + - $ref: pinctrl.yaml# + +required: + - compatible + - reg + - reg-names + - gpio-controller + - '#gpio-cells' + - gpio-ranges + - clocks + - power-domains + +unevaluatedProperties: false + +examples: + - | + #include + #include + + pinctrl@802c0000 { + compatible = "renesas,r9a09g077-pinctrl"; + reg = <0x802c0000 0x2000>, + <0x812c0000 0x2000>, + <0x802b0000 0x2000>; + reg-names = "nsr", "srs", "srn"; + clocks = <&cpg CPG_CORE R9A09G077_CLK_PCLKM>; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&pinctrl 0 0 288>; + power-domains = <&cpg>; + + serial0-pins { + pinmux = , /* Tx */ + ; /* Rx */ + }; + + sd1-pwr-en-hog { + gpio-hog; + gpios = ; + output-high; + line-name = "sd1_pwr_en"; + }; + + i2c0-pins { + pins = "RIIC0_SDA", "RIIC0_SCL"; + input-enable; + }; + + sd0-sd-group { + ctrl-pins { + pinmux = , /* SD0_CLK */ + ; /* SD0_CMD */ + }; + + data-pins { + pinmux = , /* SD0_CLK */ + ; /* SD0_CMD */ + }; + }; + }; diff --git a/include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h b/include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h new file mode 100644 index 00000000000000..f088793f23eedc --- /dev/null +++ b/include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * This 
header provides constants for Renesas RZ/T2H family pinctrl bindings. + * + * Copyright (C) 2025 Renesas Electronics Corp. + */ + +#ifndef __DT_BINDINGS_PINCTRL_RENESAS_R9A09G077_PINCTRL_H__ +#define __DT_BINDINGS_PINCTRL_RENESAS_R9A09G077_PINCTRL_H__ + +#define RZT2H_PINS_PER_PORT 8 + +/* + * Create the pin index from its bank and position numbers and store in + * the upper 16 bits the alternate function identifier + */ +#define RZT2H_PORT_PINMUX(b, p, f) ((b) * RZT2H_PINS_PER_PORT + (p) | ((f) << 16)) + +/* Convert a port and pin label to its global pin index */ +#define RZT2H_GPIO(port, pin) ((port) * RZT2H_PINS_PER_PORT + (pin)) + +#endif /* __DT_BINDINGS_PINCTRL_RENESAS_R9A09G077_PINCTRL_H__ */ From ecb0605364465c051ff4744f3a848ef084b44769 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 11 Aug 2025 15:50:28 +0900 Subject: [PATCH 0070/3206] vfs: show filesystem name at dump_inode() Commit 8b17e540969a ("vfs: add initial support for CONFIG_DEBUG_VFS") added dump_inode(), but dump_inode() currently reports only raw pointer address. Comment says that adding a proper inode dumping routine is a TODO. However, syzkaller concurrently tests multiple filesystems, and several filesystems started calling dump_inode() due to hitting VFS_BUG_ON_INODE() added by commit af153bb63a33 ("vfs: catch invalid modes in may_open()") before a proper inode dumping routine is implemented. Show filesystem name at dump_inode() so that we can find which filesystem has passed an invalid mode to may_open() from syzkaller's crash reports. Link: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d Signed-off-by: Tetsuo Handa Link: https://lore.kernel.org/ceaf4021-65cc-422e-9d0e-6afa18dd8276@I-love.SAKURA.ne.jp Signed-off-by: Christian Brauner --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 3cbb78412a3ec3..cc0f717a140d3e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2913,7 +2913,7 @@ EXPORT_SYMBOL(mode_strip_sgid); */ void dump_inode(struct inode *inode, const char *reason) { - pr_warn("%s encountered for inode %px", reason, inode); + pr_warn("%s encountered for inode %px (%s)\n", reason, inode, inode->i_sb->s_type->name); } EXPORT_SYMBOL(dump_inode); From 1e5f0fb41fccf5ecbb5506551790335c9578e320 Mon Sep 17 00:00:00 2001 From: Askar Safin Date: Mon, 11 Aug 2025 04:52:23 +0000 Subject: [PATCH 0071/3206] vfs: fs/namespace.c: remove ms_flags argument from do_remount ...because it is not used Signed-off-by: Askar Safin Link: https://lore.kernel.org/20250811045444.1813009-1-safinaskar@zohomail.com Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 86d12f88b68852..9f732e03c2e56c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3279,7 +3279,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. 
*/ -static int do_remount(struct path *path, int ms_flags, int sb_flags, +static int do_remount(struct path *path, int sb_flags, int mnt_flags, void *data) { int err; @@ -4111,7 +4111,7 @@ int path_mount(const char *dev_name, struct path *path, if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) return do_reconfigure_mnt(path, mnt_flags); if (flags & MS_REMOUNT) - return do_remount(path, flags, sb_flags, mnt_flags, data_page); + return do_remount(path, sb_flags, mnt_flags, data_page); if (flags & MS_BIND) return do_loopback(path, dev_name, flags & MS_REC); if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) From c5055d0c8eddfb89ed895ae0642e2a2a0804143d Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 16 Jul 2025 14:27:30 -0700 Subject: [PATCH 0072/3206] audit: fix indentation in audit_log_exit() Fix two indentation errors in audit_log_exit(). Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- kernel/auditsc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index eb98cd6fe91fb5..1c29541c8fb645 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1778,15 +1778,16 @@ static void audit_log_exit(void) axs->target_sessionid[i], &axs->target_ref[i], axs->target_comm[i])) - call_panic = 1; + call_panic = 1; } if (context->target_pid && audit_log_pid_context(context, context->target_pid, context->target_auid, context->target_uid, context->target_sessionid, - &context->target_ref, context->target_comm)) - call_panic = 1; + &context->target_ref, + context->target_comm)) + call_panic = 1; if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); From d8c09d7b55da39a10c8fd7f2b3a3f88f5f55764c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 18 Jul 2025 22:37:34 +0200 Subject: [PATCH 0073/3206] audit: Replace deprecated strcpy() with strscpy() strcpy() is deprecated; use strscpy() instead. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Thorsten Blum Signed-off-by: Paul Moore --- kernel/audit_tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b0eae2a3c895d8..1605df0a171ea8 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -93,8 +93,10 @@ static struct kmem_cache *audit_tree_mark_cachep __ro_after_init; static struct audit_tree *alloc_tree(const char *s) { struct audit_tree *tree; + size_t sz; - tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL); + sz = strlen(s) + 1; + tree = kmalloc(struct_size(tree, pathname, sz), GFP_KERNEL); if (tree) { refcount_set(&tree->count, 1); tree->goner = 0; @@ -103,7 +105,7 @@ static struct audit_tree *alloc_tree(const char *s) INIT_LIST_HEAD(&tree->list); INIT_LIST_HEAD(&tree->same_root); tree->root = NULL; - strcpy(tree->pathname, s); + strscpy(tree->pathname, s, sz); } return tree; } From df1145b56c6f92696acec7730694a19fb4c8a174 Mon Sep 17 00:00:00 2001 From: Kieran Moy Date: Sat, 5 Jul 2025 15:48:10 +0800 Subject: [PATCH 0074/3206] audit: fix typo in auditfilter.c comment Correct the misspelling of "searching" (was "serarching") in the function documentation for audit_update_lsm_rules. Found via code inspection, no functional impact. 
Signed-off-by: Kieran Moy Signed-off-by: Paul Moore --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e3f42018ed46fa..4ed0e10e20b5ec 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1440,7 +1440,7 @@ static int update_lsm_rule(struct audit_krule *r) } /* This function will re-initialize the lsm_rule field of all applicable rules. - * It will traverse the filter lists serarching for rules that contain LSM + * It will traverse the filter lists searching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the * LSM field is re-initialized, and the old rule is replaced with the * updated rule. */ From ce8370e2e62a903e18be7dd0e0be2eee079501e1 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 6 Aug 2025 17:04:07 -0400 Subject: [PATCH 0075/3206] audit: record fanotify event regardless of presence of rules When no audit rules are in place, fanotify event results are unconditionally dropped due to an explicit check for the existence of any audit rules. Given this is a report from another security sub-system, allow it to be recorded regardless of the existence of any audit rules. To test, install and run the fapolicyd daemon with default config. Then as an unprivileged user, create and run a very simple binary that should be denied. Then check for an event with ausearch -m FANOTIFY -ts recent Link: https://issues.redhat.com/browse/RHEL-9065 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index a394614ccd0b81..e3f06eba9c6e6e 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -527,7 +527,7 @@ static inline void audit_log_kern_module(const char *name) static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { - if (!audit_dummy_context()) + if (audit_enabled) __audit_fanotify(response, friar); } From 5f9383bd4168eaf88a673d3eac5d8935a2202846 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 23 Jul 2025 20:23:04 +0800 Subject: [PATCH 0076/3206] selinux: Remove unused function selinux_policycap_netif_wildcard() This is unused since commit a3d3043ef24a ("selinux: get netif_wildcard policycap from policy instead of cache"). Signed-off-by: Yue Haibing Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/include/security.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 8201e6a3ac0fc8..7f19972f7922e9 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -203,12 +203,6 @@ static inline bool selinux_policycap_netlink_xperm(void) selinux_state.policycap[POLICYDB_CAP_NETLINK_XPERM]); } -static inline bool selinux_policycap_netif_wildcard(void) -{ - return READ_ONCE( - selinux_state.policycap[POLICYDB_CAP_NETIF_WILDCARD]); -} - struct selinux_policy_convert_data; struct selinux_load_state { From d4e8dc8e8b34771b0a3d474d243bcfcbddef8612 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 29 Jul 2025 17:10:44 +0800 Subject: [PATCH 0077/3206] selinux: use a consistent method to get full socket from skb In order to maintain code consistency and readability, skb_to_full_sk() is used to get full socket from skb. 
Signed-off-by: Tianjia Zhang Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d40..e474cd7398effd 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5885,7 +5885,7 @@ static unsigned int selinux_ip_output(void *priv, struct sk_buff *skb, /* we do this in the LOCAL_OUT path and not the POST_ROUTING path * because we want to make sure we apply the necessary labeling * before IPsec is applied so we can leverage AH protection */ - sk = sk_to_full_sk(skb->sk); + sk = skb_to_full_sk(skb); if (sk) { struct sk_security_struct *sksec; From fbec18dc9940a89e094bae890b8dc0af00ddc4ff Mon Sep 17 00:00:00 2001 From: Gabor Juhos Date: Sun, 10 Aug 2025 16:38:50 +0200 Subject: [PATCH 0078/3206] spi: spi-qpic-snand: remove unused 'dev' member of struct 'qpic_ecc' The 'dev' member of the 'qpic_ecc' structure is never used in the code, so remove it. No functional changes. Signed-off-by: Gabor Juhos Reviewed-by: Konrad Dybcio Link: https://patch.msgid.link/20250810-qpic-snand-qpic_ecc-cleanup-v1-1-33a6b2bcbc67@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-qpic-snand.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/spi/spi-qpic-snand.c b/drivers/spi/spi-qpic-snand.c index 0ceaad7dba3cb5..53fe248a4b9a9e 100644 --- a/drivers/spi/spi-qpic-snand.c +++ b/drivers/spi/spi-qpic-snand.c @@ -78,7 +78,6 @@ struct qcom_ecc_stats { }; struct qpic_ecc { - struct device *dev; int ecc_bytes_hw; int spare_bytes; int bbm_size; From 6b7e2aa50bdaf88cd4c2a5e2059a7bf32d85a8b1 Mon Sep 17 00:00:00 2001 From: Gabor Juhos Date: Sun, 10 Aug 2025 16:38:51 +0200 Subject: [PATCH 0079/3206] spi: spi-qpic-snand: remove 'clr*status' members of struct 'qpic_ecc' In the qcom_spi_ecc_init_ctx_pipelined() function, the 'clrflashstatus' and the 'clrreadstatus' members of the ECC context get initialized with constant values. These values are then used by several functions to set the corresponding members in the register cache. Because the values are never modified, change the code so that qcom_spi_ecc_init_ctx_pipelined() sets them directly in the register cache, and remove the repetitive code from the other functions to reduce code duplication. Also, remove the respective members from the 'qpic_ecc' structure as those became unused due to the change. No functional changes intended. 
Signed-off-by: Gabor Juhos Link: https://patch.msgid.link/20250810-qpic-snand-qpic_ecc-cleanup-v1-2-33a6b2bcbc67@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-qpic-snand.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/spi/spi-qpic-snand.c b/drivers/spi/spi-qpic-snand.c index 53fe248a4b9a9e..22fb59dbe2a0c7 100644 --- a/drivers/spi/spi-qpic-snand.c +++ b/drivers/spi/spi-qpic-snand.c @@ -94,8 +94,6 @@ struct qpic_ecc { u32 cfg1_raw; u32 ecc_buf_cfg; u32 ecc_bch_cfg; - u32 clrflashstatus; - u32 clrreadstatus; bool bch_enabled; }; @@ -381,12 +379,12 @@ static int qcom_spi_ecc_init_ctx_pipelined(struct nand_device *nand) FIELD_PREP(ECC_PARITY_SIZE_BYTES_BCH_MASK, ecc_cfg->ecc_bytes_hw); ecc_cfg->ecc_buf_cfg = FIELD_PREP(NUM_STEPS_MASK, 0x203); - ecc_cfg->clrflashstatus = FS_READY_BSY_N; - ecc_cfg->clrreadstatus = 0xc0; conf->step_size = ecc_cfg->step_size; conf->strength = ecc_cfg->strength; + snandc->regs->clrflashstatus = cpu_to_le32(FS_READY_BSY_N); + snandc->regs->clrreadstatus = cpu_to_le32(0xc0); snandc->regs->erased_cw_detect_cfg_clr = cpu_to_le32(CLR_ERASED_PAGE_DET); snandc->regs->erased_cw_detect_cfg_set = cpu_to_le32(SET_ERASED_PAGE_DET); @@ -598,8 +596,6 @@ static int qcom_spi_read_last_cw(struct qcom_nand_controller *snandc, snandc->regs->cfg0 = cpu_to_le32(cfg0); snandc->regs->cfg1 = cpu_to_le32(cfg1); snandc->regs->ecc_bch_cfg = cpu_to_le32(ecc_bch_cfg); - snandc->regs->clrflashstatus = cpu_to_le32(ecc_cfg->clrflashstatus); - snandc->regs->clrreadstatus = cpu_to_le32(ecc_cfg->clrreadstatus); snandc->regs->exec = cpu_to_le32(1); qcom_spi_set_read_loc(snandc, num_cw - 1, 0, 0, ecc_cfg->cw_size, 1); @@ -733,8 +729,6 @@ static int qcom_spi_read_cw_raw(struct qcom_nand_controller *snandc, u8 *data_bu snandc->regs->cfg0 = cpu_to_le32(cfg0); snandc->regs->cfg1 = cpu_to_le32(cfg1); snandc->regs->ecc_bch_cfg = cpu_to_le32(ecc_bch_cfg); - snandc->regs->clrflashstatus = cpu_to_le32(ecc_cfg->clrflashstatus); - snandc->regs->clrreadstatus = cpu_to_le32(ecc_cfg->clrreadstatus); snandc->regs->exec = cpu_to_le32(1); qcom_spi_set_read_loc(snandc, raw_cw, 0, 0, ecc_cfg->cw_size, 1); @@ -849,8 +843,6 @@ static int qcom_spi_read_page_ecc(struct qcom_nand_controller *snandc, snandc->regs->cfg0 = cpu_to_le32(cfg0); snandc->regs->cfg1 = cpu_to_le32(cfg1); snandc->regs->ecc_bch_cfg = cpu_to_le32(ecc_bch_cfg); - snandc->regs->clrflashstatus = cpu_to_le32(ecc_cfg->clrflashstatus); - snandc->regs->clrreadstatus = cpu_to_le32(ecc_cfg->clrreadstatus); snandc->regs->exec = cpu_to_le32(1); qcom_spi_set_read_loc(snandc, 0, 0, 0, ecc_cfg->cw_data, 1); @@ -942,8 +934,6 @@ static int qcom_spi_read_page_oob(struct qcom_nand_controller *snandc, snandc->regs->cfg0 = cpu_to_le32(cfg0); snandc->regs->cfg1 = cpu_to_le32(cfg1); snandc->regs->ecc_bch_cfg = cpu_to_le32(ecc_bch_cfg); - snandc->regs->clrflashstatus = cpu_to_le32(ecc_cfg->clrflashstatus); - snandc->regs->clrreadstatus = cpu_to_le32(ecc_cfg->clrreadstatus); snandc->regs->exec = cpu_to_le32(1); qcom_spi_set_read_loc(snandc, 0, 0, 0, ecc_cfg->cw_data, 1); @@ -1063,8 +1053,6 @@ static int qcom_spi_program_raw(struct qcom_nand_controller *snandc, snandc->regs->cfg0 = cpu_to_le32(cfg0); snandc->regs->cfg1 = cpu_to_le32(cfg1); snandc->regs->ecc_bch_cfg = cpu_to_le32(ecc_bch_cfg); - snandc->regs->clrflashstatus = cpu_to_le32(ecc_cfg->clrflashstatus); - snandc->regs->clrreadstatus = cpu_to_le32(ecc_cfg->clrreadstatus); snandc->regs->exec = cpu_to_le32(1); qcom_spi_config_page_write(snandc); From 
0e6608d4938eb209616e8673c95364bb2a7d55bd Mon Sep 17 00:00:00 2001
From: Qianfeng Rong
Date: Sun, 3 Aug 2025 18:22:40 +0800
Subject: [PATCH 0080/3206] fscrypt: Remove redundant __GFP_NOWARN

GFP_NOWAIT already includes __GFP_NOWARN, so let's remove the redundant
__GFP_NOWARN.

Signed-off-by: Qianfeng Rong
Link: https://lore.kernel.org/r/20250803102243.623705-3-rongqianfeng@vivo.com
Signed-off-by: Eric Biggers
---
 fs/crypto/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 486fcb2ecf13eb..e92967e20e2ab9 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -148,7 +148,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 	 */
 	for (i = 0; i < nr_pages; i++) {
 		pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS :
-						     GFP_NOWAIT | __GFP_NOWARN);
+						     GFP_NOWAIT);
 		if (!pages[i])
 			break;
 	}
From 690de2902dca98aec96de004428c020ca902f047 Mon Sep 17 00:00:00 2001
From: Wojciech Siudy
Date: Tue, 3 Jun 2025 14:47:20 +0200
Subject: [PATCH 0081/3206] i2c: muxes: pca954x: Use reset controller only

The fallback to legacy reset-gpios is no longer needed, because the
framework can use the reset-gpios property now as well.

Signed-off-by: Wojciech Siudy
[wsa: fixed whitespace issue]
Signed-off-by: Wolfram Sang
---
 drivers/i2c/muxes/i2c-mux-pca954x.c | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c
index b9f370c9f018c2..a9ddd548781ccf 100644
--- a/drivers/i2c/muxes/i2c-mux-pca954x.c
+++ b/drivers/i2c/muxes/i2c-mux-pca954x.c
@@ -118,7 +118,6 @@ struct pca954x {
 	raw_spinlock_t lock;
 
 	struct regulator *supply;
-	struct gpio_desc *reset_gpio;
 	struct reset_control *reset_cont;
 };
 
@@ -527,17 +526,6 @@ static int pca954x_get_reset(struct device *dev, struct pca954x *data)
 	if (IS_ERR(data->reset_cont))
 		return dev_err_probe(dev, PTR_ERR(data->reset_cont),
 				     "Failed to get reset\n");
-	else if (data->reset_cont)
-		return 0;
-
-	/*
-	 * fallback to legacy reset-gpios
-	 */
-	data->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
-	if (IS_ERR(data->reset_gpio)) {
-		return dev_err_probe(dev, PTR_ERR(data->reset_gpio),
-				     "Failed to get reset gpio");
-	}
 
 	return 0;
 }
@@ -546,8 +534,6 @@ static void pca954x_reset_deassert(struct pca954x *data)
 {
 	if (data->reset_cont)
 		reset_control_deassert(data->reset_cont);
-	else
-		gpiod_set_value_cansleep(data->reset_gpio, 0);
 }
 
 /*
@@ -589,7 +575,7 @@ static int pca954x_probe(struct i2c_client *client)
 	if (ret)
 		goto fail_cleanup;
 
-	if (data->reset_cont || data->reset_gpio) {
+	if (data->reset_cont) {
 		udelay(1);
 		pca954x_reset_deassert(data);
 		/* Give the chip some time to recover. */
From 94c29677640312fcfab261bddc1006bc43efe4ba Mon Sep 17 00:00:00 2001
From: Wojciech Siudy
Date: Tue, 3 Jun 2025 14:47:21 +0200
Subject: [PATCH 0082/3206] i2c: muxes: pca954x: Reset if (de)select fails

If the channel selection or deselection times out, it indicates a
failure in the mux's I2C subsystem. Without sending a reset pulse, a
power-on-reset of the entire device would be required to restore
communication.

The datasheet specifies a minimum hold time of 4 ns for the reset pulse,
but due to the path's capacitance and the mux having its own clock, it
is recommended to extend this to approximately 1 us. Since a reset
controller is used, there is no need to take care of other devices
sharing the same reset line.
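The recovery pattern, condensed from the hunks below (surrounding error handling elided):

	/* Pulse the reset line; ~1 us hold, well above the 4 ns minimum. */
	static void pca954x_reset_mux(struct pca954x *data)
	{
		pca954x_reset_assert(data);
		udelay(1);
		pca954x_reset_deassert(data);
	}

	/* In the select/deselect paths: a timeout means the mux's I2C
	 * engine is wedged, so pulse reset instead of requiring a
	 * power-on-reset of the whole device.
	 */
	if (ret == -ETIMEDOUT && data->reset_cont)
		pca954x_reset_mux(data);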
Signed-off-by: Wojciech Siudy
[wsa: removed superfluous printout, unneeded braces and fixed indent]
Signed-off-by: Wolfram Sang
---
 drivers/i2c/muxes/i2c-mux-pca954x.c | 34 ++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c
index a9ddd548781ccf..75c8d08fa24e56 100644
--- a/drivers/i2c/muxes/i2c-mux-pca954x.c
+++ b/drivers/i2c/muxes/i2c-mux-pca954x.c
@@ -315,6 +315,25 @@ static u8 pca954x_regval(struct pca954x *data, u8 chan)
 		return 1 << chan;
 }
 
+static void pca954x_reset_assert(struct pca954x *data)
+{
+	if (data->reset_cont)
+		reset_control_assert(data->reset_cont);
+}
+
+static void pca954x_reset_deassert(struct pca954x *data)
+{
+	if (data->reset_cont)
+		reset_control_deassert(data->reset_cont);
+}
+
+static void pca954x_reset_mux(struct pca954x *data)
+{
+	pca954x_reset_assert(data);
+	udelay(1);
+	pca954x_reset_deassert(data);
+}
+
 static int pca954x_select_chan(struct i2c_mux_core *muxc, u32 chan)
 {
 	struct pca954x *data = i2c_mux_priv(muxc);
@@ -328,6 +347,8 @@ static int pca954x_select_chan(struct i2c_mux_core *muxc, u32 chan)
 		ret = pca954x_reg_write(muxc->parent, client, regval);
 		data->last_chan = ret < 0 ? 0 : regval;
 	}
+	if (ret == -ETIMEDOUT && data->reset_cont)
+		pca954x_reset_mux(data);
 
 	return ret;
 }
@@ -337,6 +358,7 @@ static int pca954x_deselect_mux(struct i2c_mux_core *muxc, u32 chan)
 	struct pca954x *data = i2c_mux_priv(muxc);
 	struct i2c_client *client = data->client;
 	s32 idle_state;
+	int ret = 0;
 
 	idle_state = READ_ONCE(data->idle_state);
 	if (idle_state >= 0)
@@ -346,8 +368,10 @@ static int pca954x_deselect_mux(struct i2c_mux_core *muxc, u32 chan)
 	if (idle_state == MUX_IDLE_DISCONNECT) {
 		/* Deselect active channel */
 		data->last_chan = 0;
-		return pca954x_reg_write(muxc->parent, client,
-					 data->last_chan);
+		ret = pca954x_reg_write(muxc->parent, client,
+					data->last_chan);
+		if (ret == -ETIMEDOUT && data->reset_cont)
+			pca954x_reset_mux(data);
 	}
 
 	/* otherwise leave as-is */
@@ -530,12 +554,6 @@ static int pca954x_get_reset(struct device *dev, struct pca954x *data)
 	return 0;
 }
 
-static void pca954x_reset_deassert(struct pca954x *data)
-{
-	if (data->reset_cont)
-		reset_control_deassert(data->reset_cont);
-}
-
 /*
  * I2C init/probing/exit functions
 */
From de68c05189cc4508c3ac4e1e44da1ddb16b1bceb Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Mon, 4 Aug 2025 13:04:49 +0200
Subject: [PATCH 0083/3206] tools/sched_ext: Receive updates from SCX repo

Receive tools/sched_ext updates from https://github.com/sched-ext/scx
to sync userspace bits:

- basic BPF arena allocator abstractions,

- additional process flags definitions,

- fixed is_migration_disabled() helper,

- separate out user_exit_info BPF and user space code.
This also fixes the following warning when building the selftests: tools/sched_ext/include/scx/common.bpf.h:550:9: warning: 'likely' macro redefined [-Wmacro-redefined] 550 | #define likely(x) __builtin_expect(!!(x), 1) | ^ Co-developed-by: Cheng-Yang Chou Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- .../include/scx/bpf_arena_common.bpf.h | 175 ++++++++++++++++++ .../sched_ext/include/scx/bpf_arena_common.h | 33 ++++ tools/sched_ext/include/scx/common.bpf.h | 102 +++++++++- tools/sched_ext/include/scx/common.h | 5 +- tools/sched_ext/include/scx/compat.bpf.h | 5 + .../include/scx/user_exit_info.bpf.h | 40 ++++ tools/sched_ext/include/scx/user_exit_info.h | 49 +---- .../include/scx/user_exit_info_common.h | 30 +++ tools/sched_ext/scx_central.bpf.c | 2 +- tools/sched_ext/scx_central.c | 1 + tools/sched_ext/scx_flatcg.bpf.c | 2 +- tools/sched_ext/scx_flatcg.c | 2 + tools/sched_ext/scx_simple.c | 2 + 13 files changed, 388 insertions(+), 60 deletions(-) create mode 100644 tools/sched_ext/include/scx/bpf_arena_common.bpf.h create mode 100644 tools/sched_ext/include/scx/bpf_arena_common.h create mode 100644 tools/sched_ext/include/scx/user_exit_info.bpf.h create mode 100644 tools/sched_ext/include/scx/user_exit_info_common.h diff --git a/tools/sched_ext/include/scx/bpf_arena_common.bpf.h b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h new file mode 100644 index 00000000000000..4366fb3c91ce8a --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef PAGE_SIZE +#define PAGE_SIZE __PAGE_SIZE +/* + * for older kernels try sizeof(struct genradix_node) + * or flexible: + * static inline long __bpf_page_size(void) { + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); + * } + * but generated code is not great. + */ +#endif + +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#define __arena __attribute__((address_space(1))) +#define __arena_global __attribute__((address_space(1))) +#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ +#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ +#else + +/* emit instruction: + * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as + * + * This is a workaround for LLVM compiler versions without + * __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena + * pointers and native kernel/userspace ones. In this case we explicitly do so + * with cast_kern() and cast_user(). E.g., in the Linux kernel tree, + * tools/testing/selftests/bpf includes tests that use these macros to implement + * linked lists and hashtables backed by arena memory. In sched_ext, we use + * cast_kern() and cast_user() for compatibility with older LLVM toolchains. 
+ */ +#ifndef bpf_addr_space_cast +#define bpf_addr_space_cast(var, dst_as, src_as)\ + asm volatile(".byte 0xBF; \ + .ifc %[reg], r0; \ + .byte 0x00; \ + .endif; \ + .ifc %[reg], r1; \ + .byte 0x11; \ + .endif; \ + .ifc %[reg], r2; \ + .byte 0x22; \ + .endif; \ + .ifc %[reg], r3; \ + .byte 0x33; \ + .endif; \ + .ifc %[reg], r4; \ + .byte 0x44; \ + .endif; \ + .ifc %[reg], r5; \ + .byte 0x55; \ + .endif; \ + .ifc %[reg], r6; \ + .byte 0x66; \ + .endif; \ + .ifc %[reg], r7; \ + .byte 0x77; \ + .endif; \ + .ifc %[reg], r8; \ + .byte 0x88; \ + .endif; \ + .ifc %[reg], r9; \ + .byte 0x99; \ + .endif; \ + .short %[off]; \ + .long %[as]" \ + : [reg]"+r"(var) \ + : [off]"i"(BPF_ADDR_SPACE_CAST) \ + , [as]"i"((dst_as << 16) | src_as)); +#endif + +#define __arena +#define __arena_global SEC(".addr_space.1") +#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) +#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) +#endif + +void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, + int node_id, __u64 flags) __ksym __weak; +void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; + +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. + */ +#ifdef TEST +#define can_loop true +#define __cond_break(expr) expr +#else +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ +#endif /* __BPF_FEATURE_MAY_GOTO */ +#endif /* TEST */ + +#define cond_break __cond_break(break) +#define cond_break_label(label) __cond_break(goto label) + + +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; diff --git a/tools/sched_ext/include/scx/bpf_arena_common.h b/tools/sched_ext/include/scx/bpf_arena_common.h new file mode 100644 index 00000000000000..10141db0b59d02 --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.h @@ -0,0 +1,33 @@ 
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef arena_container_of +#define arena_container_of(ptr, type, member) \ + ({ \ + void __arena *__mptr = (void __arena *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +/* Provide the definition of PAGE_SIZE. */ +#include + +#define __arena +#define __arg_arena +#define cast_kern(ptr) /* nop for user space */ +#define cast_user(ptr) /* nop for user space */ +char __attribute__((weak)) arena[1]; + +#ifndef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) +#endif + +static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, + int node_id, __u64 flags) +{ + return NULL; +} +static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) +{ +} diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index d4e21558e98269..86abdb3c3142ac 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -24,14 +24,26 @@ #include #include #include -#include "user_exit_info.h" +#include "user_exit_info.bpf.h" #include "enum_defs.autogen.h" +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ +#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_EXITING 0x00000004 #define CLOCK_MONOTONIC 1 +#ifndef NR_CPUS +#define NR_CPUS 1024 +#endif + +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE (-1) +#endif + extern int LINUX_KERNEL_VERSION __kconfig; extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak; extern const char CONFIG_LOCALVERSION[64] __kconfig __weak; @@ -107,6 +119,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __ static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} +#define SCX_STRINGIFY(x) #x +#define SCX_TOSTRING(x) SCX_STRINGIFY(x) + /* * Helper macro for initializing the fmt and variadic argument inputs to both * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to @@ -141,13 +156,15 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments * instead of an array of u64. Invoking this macro will cause the scheduler to * exit in an erroneous state, with diagnostic information being passed to the - * user. + * user. It appends the file and line number to aid debugging. */ #define scx_bpf_error(fmt, args...) \ ({ \ - scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_bstr_preamble( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args) \ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ - ___scx_bpf_bstr_format_checker(fmt, ##args); \ + ___scx_bpf_bstr_format_checker( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args); \ }) /* @@ -229,6 +246,7 @@ BPF_PROG(name, ##args) * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of * `MEMBER_VPTR(ptr, ->member)`. 
*/ +#ifndef MEMBER_VPTR #define MEMBER_VPTR(base, member) (typeof((base) member) *) \ ({ \ u64 __base = (u64)&(base); \ @@ -245,6 +263,7 @@ BPF_PROG(name, ##args) [max]"i"(sizeof(base) - sizeof((base) member))); \ __addr; \ }) +#endif /* MEMBER_VPTR */ /** * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element @@ -260,6 +279,7 @@ BPF_PROG(name, ##args) * size of the array to compute the max, which will result in rejection by * the verifier. */ +#ifndef ARRAY_ELEM_PTR #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ ({ \ u64 __base = (u64)arr; \ @@ -274,7 +294,7 @@ BPF_PROG(name, ##args) [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ __addr; \ }) - +#endif /* ARRAY_ELEM_PTR */ /* * BPF declarations and helpers @@ -438,8 +458,27 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) */ static inline bool is_migration_disabled(const struct task_struct *p) { - if (bpf_core_field_exists(p->migration_disabled)) - return p->migration_disabled; + /* + * Testing p->migration_disabled in a BPF code is tricky because the + * migration is _always_ disabled while running the BPF code. + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) for BPF + * code execution disable and re-enable the migration of the current + * task, respectively. So, the _current_ task of the sched_ext ops is + * always migration-disabled. Moreover, p->migration_disabled could be + * two or greater when a sched_ext ops BPF code (e.g., ops.tick) is + * executed in the middle of the other BPF code execution. + * + * Therefore, we should decide that the _current_ task is + * migration-disabled only when its migration_disabled count is greater + * than one. In other words, when p->migration_disabled == 1, there is + * an ambiguity, so we should check if @p is the current task or not. + */ + if (bpf_core_field_exists(p->migration_disabled)) { + if (p->migration_disabled == 1) + return bpf_get_current_task_btf() != p; + else + return p->migration_disabled; + } return false; } @@ -476,7 +515,7 @@ static inline s64 time_delta(u64 after, u64 before) */ static inline bool time_after(u64 a, u64 b) { - return (s64)(b - a) < 0; + return (s64)(b - a) < 0; } /** @@ -500,7 +539,7 @@ static inline bool time_before(u64 a, u64 b) */ static inline bool time_after_eq(u64 a, u64 b) { - return (s64)(a - b) >= 0; + return (s64)(a - b) >= 0; } /** @@ -547,9 +586,15 @@ static inline bool time_in_range_open(u64 a, u64 b, u64 c) */ /* useful compiler attributes */ +#ifndef likely #define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely #define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#ifndef __maybe_unused #define __maybe_unused __attribute__((__unused__)) +#endif /* * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They @@ -632,6 +677,26 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __u.__val; \ }) +/* + * __calc_avg - Calculate exponential weighted moving average (EWMA) with + * @old and @new values. @decay represents how large the @old value remains. + * With a larger @decay value, the moving average changes slowly, exhibiting + * fewer fluctuations. 
+ */ +#define __calc_avg(old, new, decay) ({ \ + typeof(decay) thr = 1 << (decay); \ + typeof(old) ret; \ + if (((old) < thr) || ((new) < thr)) { \ + if (((old) == 1) && ((new) == 0)) \ + ret = 0; \ + else \ + ret = ((old) - ((old) >> 1)) + ((new) >> 1); \ + } else { \ + ret = ((old) - ((old) >> (decay))) + ((new) >> (decay)); \ + } \ + ret; \ +}) + /* * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. * @v: The value for which we're computing the base 2 logarithm. @@ -662,6 +727,25 @@ static inline u32 log2_u64(u64 v) return log2_u32(v) + 1; } +/* + * sqrt_u64 - Calculate the square root of value @x using Newton's method. + */ +static inline u64 __sqrt_u64(u64 x) +{ + if (x == 0 || x == 1) + return x; + + u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32); + + for (int i = 0; i < 8; ++i) { + u64 q = x / r; + if (r <= q) + break; + r = (r + q) >> 1; + } + return r; +} + /* * Return a value proportionally scaled to the task's weight. */ diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h index 1dc76bd8429660..b3c6372bcf810b 100644 --- a/tools/sched_ext/include/scx/common.h +++ b/tools/sched_ext/include/scx/common.h @@ -75,8 +75,9 @@ typedef int64_t s64; #include "enums.h" /* not available when building kernel tools/sched_ext */ -#if __has_include() -#include +#if __has_include() +#include "bpf_arena_common.h" +#include #endif #endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 9252e1a00556f5..36e0cd2fd4edaf 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -38,6 +38,7 @@ void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__i void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ @@ -82,6 +83,10 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ false)) +#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ + (bpf_ksym_exists(bpf_cpumask_populate) ? \ + (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) + #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h new file mode 100644 index 00000000000000..e7ac6611a99014 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ + +#ifndef __USER_EXIT_INFO_BPF_H +#define __USER_EXIT_INFO_BPF_H + +#ifndef LSP +#include "vmlinux.h" +#endif +#include + +#include "user_exit_info_common.h" + +#define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#endif /* __USER_EXIT_INFO_BPF_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 66f856640ee7e2..399697fa372fb1 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -10,55 +10,11 @@ #ifndef __USER_EXIT_INFO_H #define __USER_EXIT_INFO_H -#ifdef LSP -#define __bpf__ -#include "../vmlinux.h" -#endif - -enum uei_sizes { - UEI_REASON_LEN = 128, - UEI_MSG_LEN = 1024, - UEI_DUMP_DFL_LEN = 32768, -}; - -struct user_exit_info { - int kind; - s64 exit_code; - char reason[UEI_REASON_LEN]; - char msg[UEI_MSG_LEN]; -}; - -#ifdef __bpf__ - -#ifndef LSP -#include "vmlinux.h" -#endif -#include - -#define UEI_DEFINE(__name) \ - char RESIZABLE_ARRAY(data, __name##_dump); \ - const volatile u32 __name##_dump_len; \ - struct user_exit_info __name SEC(".data") - -#define UEI_RECORD(__uei_name, __ei) ({ \ - bpf_probe_read_kernel_str(__uei_name.reason, \ - sizeof(__uei_name.reason), (__ei)->reason); \ - bpf_probe_read_kernel_str(__uei_name.msg, \ - sizeof(__uei_name.msg), (__ei)->msg); \ - bpf_probe_read_kernel_str(__uei_name##_dump, \ - __uei_name##_dump_len, (__ei)->dump); \ - if (bpf_core_field_exists((__ei)->exit_code)) \ - __uei_name.exit_code = (__ei)->exit_code; \ - /* use __sync to force memory barrier */ \ - __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ - (__ei)->kind); \ -}) - -#else /* !__bpf__ */ - #include #include +#include "user_exit_info_common.h" + /* no need to call the following explicitly if SCX_OPS_LOAD() is used */ #define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ @@ -114,5 +70,4 @@ enum uei_ecode_mask { #define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) -#endif /* __bpf__ */ #endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h new file mode 100644 index 00000000000000..2d0981aedd8981 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info_common.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __USER_EXIT_INFO_COMMON_H +#define __USER_EXIT_INFO_COMMON_H + +#ifdef LSP +#include "../vmlinux.h" +#endif + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#endif /* __USER_EXIT_INFO_COMMON_H */ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 50bc1737c167a1..55df8b7988657b 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * A central FIFO sched_ext scheduler which demonstrates the followings: + * A central FIFO sched_ext scheduler which demonstrates the following: * * a. Making all scheduling decisions from one CPU: * diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 6ba6e610eeaa03..55931a4cd71c7c 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -61,6 +61,7 @@ int main(int argc, char **argv) skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + assert(skel->rodata->nr_cpu_ids > 0); assert(skel->rodata->nr_cpu_ids <= INT32_MAX); while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index fdc7170639e604..2c720e3ecad593 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -950,5 +950,5 @@ SCX_OPS_DEFINE(flatcg_ops, .cgroup_move = (void *)fcg_cgroup_move, .init = (void *)fcg_init, .exit = (void *)fcg_exit, - .flags = SCX_OPS_ENQ_EXITING, + .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, .name = "flatcg"); diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 6dd423eeb4ff98..cd85eb4011793c 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include @@ -137,6 +138,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->nr_cpus > 0); skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 76d83199545cb2..06d4b13bf76bcc 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ static void sigint_handler(int simple) static void read_stats(struct scx_simple *skel, __u64 *stats) { int nr_cpus = libbpf_num_possible_cpus(); + assert(nr_cpus > 0); __u64 cnts[2][nr_cpus]; __u32 idx; From b41dc83f0790fd3488a45b31de0b0c3af7d441fe Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 Aug 2025 11:26:29 -0700 Subject: [PATCH 0084/3206] kunit, lib/crypto: Move run_irq_test() to common header Rename run_irq_test() to kunit_run_irq_test() and move it to a public header so that it can be reused by crc_kunit. 
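A hedged usage sketch of the relocated helper; the state struct and test body here are illustrative, not taken from crc_kunit:

	#include <kunit/run-in-irq-context.h>

	struct demo_irq_state {
		atomic_t calls;
	};

	/* Must be safe to run concurrently in task, softirq, and hardirq
	 * context; returning false reports a failure to KUnit.
	 */
	static bool demo_irq_func(void *state_)
	{
		struct demo_irq_state *state = state_;

		atomic_inc(&state->calls);
		return true;
	}

	static void demo_irq_test(struct kunit *test)
	{
		struct demo_irq_state state = {};

		/* Up to 100000 task-context iterations, capped at ~1 second. */
		kunit_run_irq_test(test, demo_irq_func, 100000, &state);
	}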
Link: https://lore.kernel.org/r/20250811182631.376302-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/kunit/run-in-irq-context.h | 129 ++++++++++++++++++++++++++ lib/crypto/tests/hash-test-template.h | 123 +----------------------- 2 files changed, 133 insertions(+), 119 deletions(-) create mode 100644 include/kunit/run-in-irq-context.h diff --git a/include/kunit/run-in-irq-context.h b/include/kunit/run-in-irq-context.h new file mode 100644 index 00000000000000..108e96433ea45b --- /dev/null +++ b/include/kunit/run-in-irq-context.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Helper function for testing code in interrupt contexts + * + * Copyright 2025 Google LLC + */ +#ifndef _KUNIT_RUN_IN_IRQ_CONTEXT_H +#define _KUNIT_RUN_IN_IRQ_CONTEXT_H + +#include +#include +#include +#include + +#define KUNIT_IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5) + +struct kunit_irq_test_state { + bool (*func)(void *test_specific_state); + void *test_specific_state; + bool task_func_reported_failure; + bool hardirq_func_reported_failure; + bool softirq_func_reported_failure; + unsigned long hardirq_func_calls; + unsigned long softirq_func_calls; + struct hrtimer timer; + struct work_struct bh_work; +}; + +static enum hrtimer_restart kunit_irq_test_timer_func(struct hrtimer *timer) +{ + struct kunit_irq_test_state *state = + container_of(timer, typeof(*state), timer); + + WARN_ON_ONCE(!in_hardirq()); + state->hardirq_func_calls++; + + if (!state->func(state->test_specific_state)) + state->hardirq_func_reported_failure = true; + + hrtimer_forward_now(&state->timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL); + queue_work(system_bh_wq, &state->bh_work); + return HRTIMER_RESTART; +} + +static void kunit_irq_test_bh_work_func(struct work_struct *work) +{ + struct kunit_irq_test_state *state = + container_of(work, typeof(*state), bh_work); + + WARN_ON_ONCE(!in_serving_softirq()); + state->softirq_func_calls++; + + if (!state->func(state->test_specific_state)) + state->softirq_func_reported_failure = true; +} + +/* + * Helper function which repeatedly runs the given @func in task, softirq, and + * hardirq context concurrently, and reports a failure to KUnit if any + * invocation of @func in any context returns false. @func is passed + * @test_specific_state as its argument. At most 3 invocations of @func will + * run concurrently: one in each of task, softirq, and hardirq context. + * + * The main purpose of this interrupt context testing is to validate fallback + * code paths that run in contexts where the normal code path cannot be used, + * typically due to the FPU or vector registers already being in-use in kernel + * mode. These code paths aren't covered when the test code is executed only by + * the KUnit test runner thread in task context. The reason for the concurrency + * is because merely using hardirq context is not sufficient to reach a fallback + * code path on some architectures; the hardirq actually has to occur while the + * FPU or vector unit was already in-use in kernel mode. + * + * Another purpose of this testing is to detect issues with the architecture's + * irq_fpu_usable() and kernel_fpu_begin/end() or equivalent functions, + * especially in softirq context when the softirq may have interrupted a task + * already using kernel-mode FPU or vector (if the arch didn't prevent that). + * Crypto functions are often executed in softirqs, so this is important. 
+ */ +static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *), + int max_iterations, + void *test_specific_state) +{ + struct kunit_irq_test_state state = { + .func = func, + .test_specific_state = test_specific_state, + }; + unsigned long end_jiffies; + + /* + * Set up a hrtimer (the way we access hardirq context) and a work + * struct for the BH workqueue (the way we access softirq context). + */ + hrtimer_setup_on_stack(&state.timer, kunit_irq_test_timer_func, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + INIT_WORK_ONSTACK(&state.bh_work, kunit_irq_test_bh_work_func); + + /* Run for up to max_iterations or 1 second, whichever comes first. */ + end_jiffies = jiffies + HZ; + hrtimer_start(&state.timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL, + HRTIMER_MODE_REL_HARD); + for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies); + i++) { + if (!func(test_specific_state)) + state.task_func_reported_failure = true; + } + + /* Cancel the timer and work. */ + hrtimer_cancel(&state.timer); + flush_work(&state.bh_work); + + /* Sanity check: the timer and BH functions should have been run. */ + KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0, + "Timer function was not called"); + KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0, + "BH work function was not called"); + + /* Check for incorrect hash values reported from any context. */ + KUNIT_EXPECT_FALSE_MSG( + test, state.task_func_reported_failure, + "Incorrect hash values reported from task context"); + KUNIT_EXPECT_FALSE_MSG( + test, state.hardirq_func_reported_failure, + "Incorrect hash values reported from hardirq context"); + KUNIT_EXPECT_FALSE_MSG( + test, state.softirq_func_reported_failure, + "Incorrect hash values reported from softirq context"); +} + +#endif /* _KUNIT_RUN_IN_IRQ_CONTEXT_H */ diff --git a/lib/crypto/tests/hash-test-template.h b/lib/crypto/tests/hash-test-template.h index f437a0a9ac6cd1..61b43e62779fbf 100644 --- a/lib/crypto/tests/hash-test-template.h +++ b/lib/crypto/tests/hash-test-template.h @@ -5,11 +5,9 @@ * * Copyright 2025 Google LLC */ +#include #include -#include -#include #include -#include /* test_buf is a guarded buffer, i.e. &test_buf[TEST_BUF_LEN] is not mapped. 
*/ #define TEST_BUF_LEN 16384 @@ -319,119 +317,6 @@ static void test_hash_ctx_zeroization(struct kunit *test) "Hash context was not zeroized by finalization"); } -#define IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5) - -struct hash_irq_test_state { - bool (*func)(void *test_specific_state); - void *test_specific_state; - bool task_func_reported_failure; - bool hardirq_func_reported_failure; - bool softirq_func_reported_failure; - unsigned long hardirq_func_calls; - unsigned long softirq_func_calls; - struct hrtimer timer; - struct work_struct bh_work; -}; - -static enum hrtimer_restart hash_irq_test_timer_func(struct hrtimer *timer) -{ - struct hash_irq_test_state *state = - container_of(timer, typeof(*state), timer); - - WARN_ON_ONCE(!in_hardirq()); - state->hardirq_func_calls++; - - if (!state->func(state->test_specific_state)) - state->hardirq_func_reported_failure = true; - - hrtimer_forward_now(&state->timer, IRQ_TEST_HRTIMER_INTERVAL); - queue_work(system_bh_wq, &state->bh_work); - return HRTIMER_RESTART; -} - -static void hash_irq_test_bh_work_func(struct work_struct *work) -{ - struct hash_irq_test_state *state = - container_of(work, typeof(*state), bh_work); - - WARN_ON_ONCE(!in_serving_softirq()); - state->softirq_func_calls++; - - if (!state->func(state->test_specific_state)) - state->softirq_func_reported_failure = true; -} - -/* - * Helper function which repeatedly runs the given @func in task, softirq, and - * hardirq context concurrently, and reports a failure to KUnit if any - * invocation of @func in any context returns false. @func is passed - * @test_specific_state as its argument. At most 3 invocations of @func will - * run concurrently: one in each of task, softirq, and hardirq context. - * - * The main purpose of this interrupt context testing is to validate fallback - * code paths that run in contexts where the normal code path cannot be used, - * typically due to the FPU or vector registers already being in-use in kernel - * mode. These code paths aren't covered when the test code is executed only by - * the KUnit test runner thread in task context. The reason for the concurrency - * is because merely using hardirq context is not sufficient to reach a fallback - * code path on some architectures; the hardirq actually has to occur while the - * FPU or vector unit was already in-use in kernel mode. - * - * Another purpose of this testing is to detect issues with the architecture's - * irq_fpu_usable() and kernel_fpu_begin/end() or equivalent functions, - * especially in softirq context when the softirq may have interrupted a task - * already using kernel-mode FPU or vector (if the arch didn't prevent that). - * Crypto functions are often executed in softirqs, so this is important. - */ -static void run_irq_test(struct kunit *test, bool (*func)(void *), - int max_iterations, void *test_specific_state) -{ - struct hash_irq_test_state state = { - .func = func, - .test_specific_state = test_specific_state, - }; - unsigned long end_jiffies; - - /* - * Set up a hrtimer (the way we access hardirq context) and a work - * struct for the BH workqueue (the way we access softirq context). - */ - hrtimer_setup_on_stack(&state.timer, hash_irq_test_timer_func, - CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - INIT_WORK_ONSTACK(&state.bh_work, hash_irq_test_bh_work_func); - - /* Run for up to max_iterations or 1 second, whichever comes first. 
*/ - end_jiffies = jiffies + HZ; - hrtimer_start(&state.timer, IRQ_TEST_HRTIMER_INTERVAL, - HRTIMER_MODE_REL_HARD); - for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies); - i++) { - if (!func(test_specific_state)) - state.task_func_reported_failure = true; - } - - /* Cancel the timer and work. */ - hrtimer_cancel(&state.timer); - flush_work(&state.bh_work); - - /* Sanity check: the timer and BH functions should have been run. */ - KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0, - "Timer function was not called"); - KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0, - "BH work function was not called"); - - /* Check for incorrect hash values reported from any context. */ - KUNIT_EXPECT_FALSE_MSG( - test, state.task_func_reported_failure, - "Incorrect hash values reported from task context"); - KUNIT_EXPECT_FALSE_MSG( - test, state.hardirq_func_reported_failure, - "Incorrect hash values reported from hardirq context"); - KUNIT_EXPECT_FALSE_MSG( - test, state.softirq_func_reported_failure, - "Incorrect hash values reported from softirq context"); -} - #define IRQ_TEST_DATA_LEN 256 #define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */ @@ -469,7 +354,7 @@ static void test_hash_interrupt_context_1(struct kunit *test) HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN, state.expected_hashes[i]); - run_irq_test(test, hash_irq_test1_func, 100000, &state); + kunit_run_irq_test(test, hash_irq_test1_func, 100000, &state); } struct hash_irq_test2_hash_ctx { @@ -500,7 +385,7 @@ static bool hash_irq_test2_func(void *state_) if (WARN_ON_ONCE(ctx == &state->ctxs[ARRAY_SIZE(state->ctxs)])) { /* * This should never happen, as the number of contexts is equal - * to the maximum concurrency level of run_irq_test(). + * to the maximum concurrency level of kunit_run_irq_test(). */ return false; } @@ -566,7 +451,7 @@ static void test_hash_interrupt_context_2(struct kunit *test) state->update_lens[state->num_steps++] = remaining; state->num_steps += 2; /* for init and final */ - run_irq_test(test, hash_irq_test2_func, 250000, state); + kunit_run_irq_test(test, hash_irq_test2_func, 250000, state); } #define UNKEYED_HASH_KUNIT_CASES \ From 842ec21357f15ff722695dd87daf99823f297185 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 Aug 2025 11:26:30 -0700 Subject: [PATCH 0085/3206] lib/crc: crc_kunit: Test CRC computation in interrupt contexts Test that if CRCs are computed in task, softirq, and hardirq context concurrently, then all results are as expected. Implement this using kunit_run_irq_test() which is also used by the lib/crypto/ tests. As with the corresponding lib/crypto/ tests, the purpose of this is to test fallback code paths and to exercise edge cases in the architecture's support for in-kernel FPU/SIMD/vector. Remove the code from crc_test() that sometimes disabled interrupts, as that was just an incomplete attempt to achieve similar test coverage. 
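Condensed, the new test precomputes one reference CRC per concurrency slot in task context, then lets the shared helper re-check them from all three contexts (names from the hunk below):

	for (int i = 0; i < IRQ_TEST_NUM_BUFFERS; i++)
		state.expected_crcs[i] = crc_ref(v, state.initial_crc,
						 &test_buffer[i * IRQ_TEST_DATA_LEN],
						 IRQ_TEST_DATA_LEN);

	kunit_run_irq_test(test, crc_irq_test_func, 100000, &state);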
Link: https://lore.kernel.org/r/20250811182631.376302-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/tests/crc_kunit.c | 62 +++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/lib/crc/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c index f08d985d8860e8..9a450e25ac8116 100644 --- a/lib/crc/tests/crc_kunit.c +++ b/lib/crc/tests/crc_kunit.c @@ -6,6 +6,7 @@ * * Author: Eric Biggers */ +#include #include #include #include @@ -141,6 +142,54 @@ static size_t generate_random_length(size_t max_length) return len % (max_length + 1); } +#define IRQ_TEST_DATA_LEN 512 +#define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */ + +struct crc_irq_test_state { + const struct crc_variant *v; + u64 initial_crc; + u64 expected_crcs[IRQ_TEST_NUM_BUFFERS]; + atomic_t seqno; +}; + +/* + * Compute the CRC of one of the test messages and verify that it matches the + * expected CRC from @state->expected_crcs. To increase the chance of detecting + * problems, cycle through multiple messages. + */ +static bool crc_irq_test_func(void *state_) +{ + struct crc_irq_test_state *state = state_; + const struct crc_variant *v = state->v; + u32 i = (u32)atomic_inc_return(&state->seqno) % IRQ_TEST_NUM_BUFFERS; + u64 actual_crc = v->func(state->initial_crc, + &test_buffer[i * IRQ_TEST_DATA_LEN], + IRQ_TEST_DATA_LEN); + + return actual_crc == state->expected_crcs[i]; +} + +/* + * Test that if CRCs are computed in task, softirq, and hardirq context + * concurrently, then all results are as expected. + */ +static void crc_interrupt_context_test(struct kunit *test, + const struct crc_variant *v) +{ + struct crc_irq_test_state state = { + .v = v, + .initial_crc = generate_random_initial_crc(v), + }; + + for (int i = 0; i < IRQ_TEST_NUM_BUFFERS; i++) { + state.expected_crcs[i] = crc_ref( + v, state.initial_crc, + &test_buffer[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN); + } + + kunit_run_irq_test(test, crc_irq_test_func, 100000, &state); +} + /* Test that v->func gives the same CRCs as a reference implementation. */ static void crc_test(struct kunit *test, const struct crc_variant *v) { @@ -149,7 +198,6 @@ static void crc_test(struct kunit *test, const struct crc_variant *v) for (i = 0; i < CRC_KUNIT_NUM_TEST_ITERS; i++) { u64 init_crc, expected_crc, actual_crc; size_t len, offset; - bool nosimd; init_crc = generate_random_initial_crc(v); len = generate_random_length(CRC_KUNIT_MAX_LEN); @@ -168,22 +216,18 @@ static void crc_test(struct kunit *test, const struct crc_variant *v) /* Refresh the data occasionally. */ prandom_bytes_state(&rng, &test_buffer[offset], len); - nosimd = rand32() % 8 == 0; - /* * Compute the CRC, and verify that it equals the CRC computed * by a simple bit-at-a-time reference implementation. 
*/ expected_crc = crc_ref(v, init_crc, &test_buffer[offset], len); - if (nosimd) - local_irq_disable(); actual_crc = v->func(init_crc, &test_buffer[offset], len); - if (nosimd) - local_irq_enable(); KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc, - "Wrong result with len=%zu offset=%zu nosimd=%d", - len, offset, nosimd); + "Wrong result with len=%zu offset=%zu", + len, offset); } + + crc_interrupt_context_test(test, v); } static __always_inline void From c2a0c5156a40c40edb0cce80ce11c97ab39c67e3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 Aug 2025 11:26:31 -0700 Subject: [PATCH 0086/3206] lib/crc: Use underlying functions instead of crypto_simd_usable() Since crc_kunit now tests the fallback code paths without using crypto_simd_disabled_for_test, make the CRC code just use the underlying may_use_simd() and irq_fpu_usable() functions directly instead of crypto_simd_usable(). This eliminates an unnecessary layer. Take the opportunity to add likely() and unlikely() annotations as well. Link: https://lore.kernel.org/r/20250811182631.376302-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/arm/crc-t10dif.h | 6 ++---- lib/crc/arm/crc32.h | 6 ++---- lib/crc/arm64/crc-t10dif.h | 6 ++---- lib/crc/arm64/crc32.h | 11 ++++++----- lib/crc/powerpc/crc-t10dif.h | 5 +++-- lib/crc/powerpc/crc32.h | 5 +++-- lib/crc/x86/crc-pclmul-template.h | 3 +-- lib/crc/x86/crc32.h | 2 +- 8 files changed, 20 insertions(+), 24 deletions(-) diff --git a/lib/crc/arm/crc-t10dif.h b/lib/crc/arm/crc-t10dif.h index 2edf7e9681d05a..1a969f4024d479 100644 --- a/lib/crc/arm/crc-t10dif.h +++ b/lib/crc/arm/crc-t10dif.h @@ -5,8 +5,6 @@ * Copyright (C) 2016 Linaro Ltd */ -#include - #include #include @@ -23,7 +21,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) { if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { if (static_branch_likely(&have_pmull)) { - if (crypto_simd_usable()) { + if (likely(may_use_simd())) { kernel_neon_begin(); crc = crc_t10dif_pmull64(crc, data, length); kernel_neon_end(); @@ -31,7 +29,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) } } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && static_branch_likely(&have_neon) && - crypto_simd_usable()) { + likely(may_use_simd())) { u8 buf[16] __aligned(16); kernel_neon_begin(); diff --git a/lib/crc/arm/crc32.h b/lib/crc/arm/crc32.h index 018007e162a2b6..ae71aa60b7a74e 100644 --- a/lib/crc/arm/crc32.h +++ b/lib/crc/arm/crc32.h @@ -7,8 +7,6 @@ #include -#include - #include #include #include @@ -34,7 +32,7 @@ static inline u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len) static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) { if (len >= PMULL_MIN_LEN + 15 && - static_branch_likely(&have_pmull) && crypto_simd_usable()) { + static_branch_likely(&have_pmull) && likely(may_use_simd())) { size_t n = -(uintptr_t)p & 15; /* align p to 16-byte boundary */ @@ -63,7 +61,7 @@ static inline u32 crc32c_scalar(u32 crc, const u8 *p, size_t len) static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) { if (len >= PMULL_MIN_LEN + 15 && - static_branch_likely(&have_pmull) && crypto_simd_usable()) { + static_branch_likely(&have_pmull) && likely(may_use_simd())) { size_t n = -(uintptr_t)p & 15; /* align p to 16-byte boundary */ diff --git a/lib/crc/arm64/crc-t10dif.h b/lib/crc/arm64/crc-t10dif.h index c4521a7f1ee9bf..435a990ec43c27 100644 --- a/lib/crc/arm64/crc-t10dif.h +++ b/lib/crc/arm64/crc-t10dif.h @@ -7,8 +7,6 @@ #include -#include - #include #include @@ -25,7 +23,7 @@ static 
inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) { if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { if (static_branch_likely(&have_pmull)) { - if (crypto_simd_usable()) { + if (likely(may_use_simd())) { kernel_neon_begin(); crc = crc_t10dif_pmull_p64(crc, data, length); kernel_neon_end(); @@ -33,7 +31,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) } } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && static_branch_likely(&have_asimd) && - crypto_simd_usable()) { + likely(may_use_simd())) { u8 buf[16]; kernel_neon_begin(); diff --git a/lib/crc/arm64/crc32.h b/lib/crc/arm64/crc32.h index 6e5dec45f05d21..31e649cd40a2fd 100644 --- a/lib/crc/arm64/crc32.h +++ b/lib/crc/arm64/crc32.h @@ -5,8 +5,6 @@ #include #include -#include - // The minimum input length to consider the 4-way interleaved code path static const size_t min_len = 1024; @@ -23,7 +21,8 @@ static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_le_base(crc, p, len); - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + if (len >= min_len && cpu_have_named_feature(PMULL) && + likely(may_use_simd())) { kernel_neon_begin(); crc = crc32_le_arm64_4way(crc, p, len); kernel_neon_end(); @@ -43,7 +42,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32c_base(crc, p, len); - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + if (len >= min_len && cpu_have_named_feature(PMULL) && + likely(may_use_simd())) { kernel_neon_begin(); crc = crc32c_le_arm64_4way(crc, p, len); kernel_neon_end(); @@ -63,7 +63,8 @@ static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_be_base(crc, p, len); - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + if (len >= min_len && cpu_have_named_feature(PMULL) && + likely(may_use_simd())) { kernel_neon_begin(); crc = crc32_be_arm64_4way(crc, p, len); kernel_neon_end(); diff --git a/lib/crc/powerpc/crc-t10dif.h b/lib/crc/powerpc/crc-t10dif.h index 59e16804a6eae9..e033d5f57bae22 100644 --- a/lib/crc/powerpc/crc-t10dif.h +++ b/lib/crc/powerpc/crc-t10dif.h @@ -6,8 +6,8 @@ * [based on crc32c-vpmsum_glue.c] */ +#include #include -#include #include #include #include @@ -29,7 +29,8 @@ static inline u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len) u32 crc = crci; if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || - !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) + !static_branch_likely(&have_vec_crypto) || + unlikely(!may_use_simd())) return crc_t10dif_generic(crc, p, len); if ((unsigned long)p & VMX_ALIGN_MASK) { diff --git a/lib/crc/powerpc/crc32.h b/lib/crc/powerpc/crc32.h index 811cc2e6ed24d4..cc8fa3913d4e01 100644 --- a/lib/crc/powerpc/crc32.h +++ b/lib/crc/powerpc/crc32.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include -#include #include #include #include @@ -24,7 +24,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) unsigned int tail; if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || - !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) + !static_branch_likely(&have_vec_crypto) || + unlikely(!may_use_simd())) return crc32c_base(crc, p, len); if ((unsigned long)p & VMX_ALIGN_MASK) { diff --git a/lib/crc/x86/crc-pclmul-template.h b/lib/crc/x86/crc-pclmul-template.h index 
35c950d7010c28..02744831c6fac0 100644 --- a/lib/crc/x86/crc-pclmul-template.h +++ b/lib/crc/x86/crc-pclmul-template.h @@ -12,7 +12,6 @@ #include #include -#include #include #include "crc-pclmul-consts.h" @@ -57,7 +56,7 @@ static inline bool have_avx512(void) #define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \ do { \ if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \ - crypto_simd_usable()) { \ + likely(irq_fpu_usable())) { \ const void *consts_ptr; \ \ consts_ptr = (consts).fold_across_128_bits_consts; \ diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h index cea2c96d08d09e..2c4a5976654ad7 100644 --- a/lib/crc/x86/crc32.h +++ b/lib/crc/x86/crc32.h @@ -44,7 +44,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) return crc32c_base(crc, p, len); if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && - static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { + static_branch_likely(&have_pclmulqdq) && likely(irq_fpu_usable())) { /* * Long length, the vector registers are usable, and the CPU is * 64-bit and supports both CRC32 and PCLMULQDQ instructions. From e5bc887413e513a2cb658370dd09fa9e1702fb3b Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 6 Aug 2025 17:17:07 -0400 Subject: [PATCH 0087/3206] lsm: use lsm_blob_alloc() in lsm_bdev_alloc() Convert the lsm_bdev_alloc() function to use the lsm_blob_alloc() helper like all of the other LSM security blob allocators. Reviewed-by: Casey Schaufler Acked-by: Serge Hallyn Signed-off-by: Paul Moore --- security/security.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/security/security.c b/security/security.c index ad163f06bf7abc..a88ebfca322420 100644 --- a/security/security.c +++ b/security/security.c @@ -823,16 +823,8 @@ static int lsm_msg_msg_alloc(struct msg_msg *mp) */ static int lsm_bdev_alloc(struct block_device *bdev) { - if (blob_sizes.lbs_bdev == 0) { - bdev->bd_security = NULL; - return 0; - } - - bdev->bd_security = kzalloc(blob_sizes.lbs_bdev, GFP_KERNEL); - if (!bdev->bd_security) - return -ENOMEM; - - return 0; + return lsm_blob_alloc(&bdev->bd_security, blob_sizes.lbs_bdev, + GFP_KERNEL); } /** From 5816bf4273edb32716a88c796e0b04f0e12962eb Mon Sep 17 00:00:00 2001 From: Blaise Boscaccy Date: Tue, 22 Jul 2025 14:21:34 -0700 Subject: [PATCH 0088/3206] lsm,selinux: Add LSM blob support for BPF objects This patch introduces LSM blob support for BPF maps, programs, and tokens to enable LSM stacking and multiplexing of LSM modules that govern BPF objects. Additionally, the existing BPF hooks used by SELinux have been updated to utilize the new blob infrastructure, removing the assumption of exclusive ownership of the security pointer. 
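The objsec.h hunk is not quoted below; as a sketch, the new SELinux accessors follow the usual LSM blob pattern, indexing into the shared allocation at the module's registered offset (the offset field name is assumed to mirror the other selinux_*_security() helpers):

	static inline struct bpf_security_struct *
	selinux_bpf_map_security(struct bpf_map *map)
	{
		/* SELinux's slice of the shared blob starts at its offset */
		return map->security + selinux_blob_sizes.lbs_bpf_map;
	}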
Signed-off-by: Blaise Boscaccy [PM: dropped local variable init, style fixes] Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 3 ++ security/security.c | 86 +++++++++++++++++++++++++++++-- security/selinux/hooks.c | 56 ++++---------------- security/selinux/include/objsec.h | 20 +++++++ 4 files changed, 116 insertions(+), 49 deletions(-) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 090d1d3e19fed6..79ec5a2bdcca7a 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -116,6 +116,9 @@ struct lsm_blob_sizes { int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ int lbs_tun_dev; int lbs_bdev; + int lbs_bpf_map; + int lbs_bpf_prog; + int lbs_bpf_token; }; /* diff --git a/security/security.c b/security/security.c index a88ebfca322420..ca126b02d2feed 100644 --- a/security/security.c +++ b/security/security.c @@ -283,6 +283,9 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) lsm_set_blob_size(&needed->lbs_xattr_count, &blob_sizes.lbs_xattr_count); lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev); + lsm_set_blob_size(&needed->lbs_bpf_map, &blob_sizes.lbs_bpf_map); + lsm_set_blob_size(&needed->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog); + lsm_set_blob_size(&needed->lbs_bpf_token, &blob_sizes.lbs_bpf_token); } /* Prepare LSM for initialization. */ @@ -480,6 +483,9 @@ static void __init ordered_lsm_init(void) init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev); init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); init_debug("bdev blob size = %d\n", blob_sizes.lbs_bdev); + init_debug("bpf map blob size = %d\n", blob_sizes.lbs_bpf_map); + init_debug("bpf prog blob size = %d\n", blob_sizes.lbs_bpf_prog); + init_debug("bpf token blob size = %d\n", blob_sizes.lbs_bpf_token); /* * Create any kmem_caches needed for blobs @@ -827,6 +833,47 @@ static int lsm_bdev_alloc(struct block_device *bdev) GFP_KERNEL); } +#ifdef CONFIG_BPF_SYSCALL +/** + * lsm_bpf_map_alloc - allocate a composite bpf_map blob + * @map: the bpf_map that needs a blob + * + * Allocate the bpf_map blob for all the modules + * + * Returns 0, or -ENOMEM if memory can't be allocated. + */ +static int lsm_bpf_map_alloc(struct bpf_map *map) +{ + return lsm_blob_alloc(&map->security, blob_sizes.lbs_bpf_map, GFP_KERNEL); +} + +/** + * lsm_bpf_prog_alloc - allocate a composite bpf_prog blob + * @prog: the bpf_prog that needs a blob + * + * Allocate the bpf_prog blob for all the modules + * + * Returns 0, or -ENOMEM if memory can't be allocated. + */ +static int lsm_bpf_prog_alloc(struct bpf_prog *prog) +{ + return lsm_blob_alloc(&prog->aux->security, blob_sizes.lbs_bpf_prog, GFP_KERNEL); +} + +/** + * lsm_bpf_token_alloc - allocate a composite bpf_token blob + * @token: the bpf_token that needs a blob + * + * Allocate the bpf_token blob for all the modules + * + * Returns 0, or -ENOMEM if memory can't be allocated. 
+ */ +static int lsm_bpf_token_alloc(struct bpf_token *token) +{ + return lsm_blob_alloc(&token->security, blob_sizes.lbs_bpf_token, GFP_KERNEL); +} +#endif /* CONFIG_BPF_SYSCALL */ + /** * lsm_early_task - during initialization allocate a composite task blob * @task: the task that needs a blob @@ -5706,7 +5753,16 @@ int security_bpf_prog(struct bpf_prog *prog) int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_map_create, map, attr, token, kernel); + int rc; + + rc = lsm_bpf_map_alloc(map); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_map_create, map, attr, token, kernel); + if (unlikely(rc)) + security_bpf_map_free(map); + return rc; } /** @@ -5725,7 +5781,16 @@ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_prog_load, prog, attr, token, kernel); + int rc; + + rc = lsm_bpf_prog_alloc(prog); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_prog_load, prog, attr, token, kernel); + if (unlikely(rc)) + security_bpf_prog_free(prog); + return rc; } /** @@ -5742,7 +5807,16 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { - return call_int_hook(bpf_token_create, token, attr, path); + int rc; + + rc = lsm_bpf_token_alloc(token); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_token_create, token, attr, path); + if (unlikely(rc)) + security_bpf_token_free(token); + return rc; } /** @@ -5786,6 +5860,8 @@ int security_bpf_token_capable(const struct bpf_token *token, int cap) void security_bpf_map_free(struct bpf_map *map) { call_void_hook(bpf_map_free, map); + kfree(map->security); + map->security = NULL; } /** @@ -5797,6 +5873,8 @@ void security_bpf_map_free(struct bpf_map *map) void security_bpf_prog_free(struct bpf_prog *prog) { call_void_hook(bpf_prog_free, prog); + kfree(prog->aux->security); + prog->aux->security = NULL; } /** @@ -5808,6 +5886,8 @@ void security_bpf_prog_free(struct bpf_prog *prog) void security_bpf_token_free(struct bpf_token *token) { call_void_hook(bpf_token_free, token); + kfree(token->security); + token->security = NULL; } #endif /* CONFIG_BPF_SYSCALL */ diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d40..4da5e792b42e26 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7062,14 +7062,14 @@ static int bpf_fd_pass(const struct file *file, u32 sid) if (file->f_op == &bpf_map_fops) { map = file->private_data; - bpfsec = map->security; + bpfsec = selinux_bpf_map_security(map); ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(file->f_mode), NULL); if (ret) return ret; } else if (file->f_op == &bpf_prog_fops) { prog = file->private_data; - bpfsec = prog->aux->security; + bpfsec = selinux_bpf_prog_security(prog); ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); if (ret) @@ -7083,7 +7083,7 @@ static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode) u32 sid = current_sid(); struct bpf_security_struct *bpfsec; - bpfsec = map->security; + bpfsec = selinux_bpf_map_security(map); return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(fmode), NULL); } @@ -7093,7 +7093,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog) u32 sid = current_sid(); 
struct bpf_security_struct *bpfsec; - bpfsec = prog->aux->security; + bpfsec = selinux_bpf_prog_security(prog); return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); } @@ -7103,69 +7103,33 @@ static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_map_security(map); bpfsec->sid = current_sid(); - map->security = bpfsec; return 0; } -static void selinux_bpf_map_free(struct bpf_map *map) -{ - struct bpf_security_struct *bpfsec = map->security; - - map->security = NULL; - kfree(bpfsec); -} - static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_prog_security(prog); bpfsec->sid = current_sid(); - prog->aux->security = bpfsec; return 0; } -static void selinux_bpf_prog_free(struct bpf_prog *prog) -{ - struct bpf_security_struct *bpfsec = prog->aux->security; - - prog->aux->security = NULL; - kfree(bpfsec); -} - static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_token_security(token); bpfsec->sid = current_sid(); - token->security = bpfsec; return 0; } - -static void selinux_bpf_token_free(struct bpf_token *token) -{ - struct bpf_security_struct *bpfsec = token->security; - - token->security = NULL; - kfree(bpfsec); -} #endif struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { @@ -7183,6 +7147,9 @@ struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { .lbs_xattr_count = SELINUX_INODE_INIT_XATTRS, .lbs_tun_dev = sizeof(struct tun_security_struct), .lbs_ib = sizeof(struct ib_security_struct), + .lbs_bpf_map = sizeof(struct bpf_security_struct), + .lbs_bpf_prog = sizeof(struct bpf_security_struct), + .lbs_bpf_token = sizeof(struct bpf_security_struct), }; #ifdef CONFIG_PERF_EVENTS @@ -7536,9 +7503,6 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(bpf, selinux_bpf), LSM_HOOK_INIT(bpf_map, selinux_bpf_map), LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog), - LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free), - LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free), - LSM_HOOK_INIT(bpf_token_free, selinux_bpf_token_free), #endif #ifdef CONFIG_PERF_EVENTS diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 1d7ac59015a12d..2d5139c6d45b30 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -26,6 +26,7 @@ #include #include #include +#include #include "flask.h" #include "avc.h" @@ -245,4 +246,23 @@ selinux_perf_event(void *perf_event) return perf_event + selinux_blob_sizes.lbs_perf_event; } +#ifdef CONFIG_BPF_SYSCALL +static inline struct bpf_security_struct * +selinux_bpf_map_security(struct bpf_map *map) +{ + return map->security + selinux_blob_sizes.lbs_bpf_map; +} + +static inline struct bpf_security_struct * +selinux_bpf_prog_security(struct bpf_prog *prog) +{ + return prog->aux->security + selinux_blob_sizes.lbs_bpf_prog; +} + +static inline struct bpf_security_struct * +selinux_bpf_token_security(struct bpf_token *token) +{ + return token->security + selinux_blob_sizes.lbs_bpf_token; +} +#endif /* 
CONFIG_BPF_SYSCALL */ #endif /* _SELINUX_OBJSEC_H_ */ From 942e47ab228c7dd27c2ae043c17e7aab2028082c Mon Sep 17 00:00:00 2001 From: Pengyu Luo Date: Tue, 12 Aug 2025 17:39:56 +0800 Subject: [PATCH 0089/3206] phy: qualcomm: phy-qcom-eusb2-repeater: fix override properties property "qcom,tune-usb2-preem" is for EUSB2_TUNE_USB2_PREEM property "qcom,tune-usb2-amplitude" is for EUSB2_TUNE_IUSB2 The downstream correspondence is as follows: EUSB2_TUNE_USB2_PREEM: Tx pre-emphasis tuning EUSB2_TUNE_IUSB2: HS transmit amplitude EUSB2_TUNE_SQUELCH_U: Squelch detection threshold EUSB2_TUNE_HSDISC: HS disconnect threshold EUSB2_TUNE_EUSB_SLEW: slew rate Fixes: 31bc94de7602 ("phy: qualcomm: phy-qcom-eusb2-repeater: Don't zero-out registers") Signed-off-by: Pengyu Luo Reviewed-by: Konrad Dybcio Reviewed-by: Luca Weiss Link: https://lore.kernel.org/r/20250812093957.32235-1-mitltlatltl@gmail.com Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c b/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c index e0f2acc8109c10..8fcbc312fd616a 100644 --- a/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c +++ b/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c @@ -127,13 +127,13 @@ static int eusb2_repeater_init(struct phy *phy) rptr->cfg->init_tbl[i].value); /* Override registers from devicetree values */ - if (!of_property_read_u8(np, "qcom,tune-usb2-amplitude", &val)) + if (!of_property_read_u8(np, "qcom,tune-usb2-preem", &val)) regmap_write(regmap, base + EUSB2_TUNE_USB2_PREEM, val); if (!of_property_read_u8(np, "qcom,tune-usb2-disc-thres", &val)) regmap_write(regmap, base + EUSB2_TUNE_HSDISC, val); - if (!of_property_read_u8(np, "qcom,tune-usb2-preem", &val)) + if (!of_property_read_u8(np, "qcom,tune-usb2-amplitude", &val)) regmap_write(regmap, base + EUSB2_TUNE_IUSB2, val); /* Wait for status OK */ From 0ecc0e17f05bbb953b6ea3812e78bab224f08595 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Tue, 12 Aug 2025 11:38:16 +0800 Subject: [PATCH 0090/3206] spi: bcm2835: Remove redundant semicolons Remove unnecessary semicolons after comments. Fixes: 3ecd37edaa2a6 ("spi: bcm2835: enable dma modes for transfers meeting certain conditions") Signed-off-by: Liao Yuanhong Link: https://patch.msgid.link/20250812033817.487565-2-liaoyuanhong@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-bcm2835.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c index 77de5a07639afb..192cc5ef65fb7b 100644 --- a/drivers/spi/spi-bcm2835.c +++ b/drivers/spi/spi-bcm2835.c @@ -622,7 +622,7 @@ static void bcm2835_spi_dma_rx_done(void *data) /* reset fifo and HW */ bcm2835_spi_reset_hw(bs); - /* and mark as completed */; + /* and mark as completed */ spi_finalize_current_transfer(ctlr); } From 528a813a5d98702cc6fd51983484cf7eec9ebcac Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Tue, 12 Aug 2025 11:38:17 +0800 Subject: [PATCH 0091/3206] spi: mtk-snfi: Remove redundant semicolons Remove unnecessary semicolons.
Fixes: 764f1b7481645 ("spi: add driver for MTK SPI NAND Flash Interface") Signed-off-by: Liao Yuanhong Link: https://patch.msgid.link/20250812033817.487565-3-liaoyuanhong@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-mtk-snfi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/spi/spi-mtk-snfi.c b/drivers/spi/spi-mtk-snfi.c index e82ee6dcf4986e..ae38c244e25811 100644 --- a/drivers/spi/spi-mtk-snfi.c +++ b/drivers/spi/spi-mtk-snfi.c @@ -1139,7 +1139,6 @@ static int mtk_snand_write_page_cache(struct mtk_snand *snf, // Prepare for custom write interrupt nfi_write32(snf, NFI_INTR_EN, NFI_IRQ_INTR_EN | NFI_IRQ_CUS_PG); reinit_completion(&snf->op_done); - ; // Trigger NFI into custom mode nfi_write16(snf, NFI_CMD, NFI_CMD_DUMMY_WRITE); From b832b19318534bb4f1673b24d78037fee339c679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 11 Aug 2025 14:10:21 +0200 Subject: [PATCH 0092/3206] spi: loopback-test: Don't use %pK through printk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the past %pK was preferable to %p as it would not leak raw pointer values into the kernel log. Since commit ad67b74d2469 ("printk: hash addresses printed with %p") the regular %p has been improved to avoid this issue. Furthermore, restricted pointers ("%pK") were never meant to be used through printk(). They can still unintentionally leak raw pointers or acquire sleeping locks in atomic contexts. Switch to the regular pointer formatting which is safer and easier to reason about. There are still a few users of %pK left, but these use it through seq_file, for which its usage is safe. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20250811-restricted-pointers-spi-v1-1-32c47f954e4d@linutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-loopback-test.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-loopback-test.c b/drivers/spi/spi-loopback-test.c index 7dd92deffe3fb1..e0b131aa29b62e 100644 --- a/drivers/spi/spi-loopback-test.c +++ b/drivers/spi/spi-loopback-test.c @@ -446,7 +446,7 @@ static void spi_test_dump_message(struct spi_device *spi, int i; u8 b; - dev_info(&spi->dev, " spi_msg@%pK\n", msg); + dev_info(&spi->dev, " spi_msg@%p\n", msg); if (msg->status) dev_info(&spi->dev, " status: %i\n", msg->status); @@ -456,15 +456,15 @@ static void spi_test_dump_message(struct spi_device *spi, msg->actual_length); list_for_each_entry(xfer, &msg->transfers, transfer_list) { - dev_info(&spi->dev, " spi_transfer@%pK\n", xfer); + dev_info(&spi->dev, " spi_transfer@%p\n", xfer); dev_info(&spi->dev, " len: %i\n", xfer->len); - dev_info(&spi->dev, " tx_buf: %pK\n", xfer->tx_buf); + dev_info(&spi->dev, " tx_buf: %p\n", xfer->tx_buf); if (dump_data && xfer->tx_buf) spi_test_print_hex_dump(" TX: ", xfer->tx_buf, xfer->len); - dev_info(&spi->dev, " rx_buf: %pK\n", xfer->rx_buf); + dev_info(&spi->dev, " rx_buf: %p\n", xfer->rx_buf); if (dump_data && xfer->rx_buf) spi_test_print_hex_dump(" RX: ", xfer->rx_buf, @@ -558,7 +558,7 @@ static int spi_check_rx_ranges(struct spi_device *spi, /* if still not found then something has modified too much */ /* we could list the "closest" transfer here... 
*/ dev_err(&spi->dev, - "loopback strangeness - rx changed outside of allowed range at: %pK\n", + "loopback strangeness - rx changed outside of allowed range at: %p\n", addr); /* do not return, only set ret, * so that we list all addresses */ @@ -696,7 +696,7 @@ static int spi_test_translate(struct spi_device *spi, } dev_err(&spi->dev, - "PointerRange [%pK:%pK[ not in range [%pK:%pK[ or [%pK:%pK[\n", + "PointerRange [%p:%p[ not in range [%p:%p[ or [%p:%p[\n", *ptr, *ptr + len, RX(0), RX(SPI_TEST_MAX_SIZE), TX(0), TX(SPI_TEST_MAX_SIZE)); From 5cfdfc623835d40664f642b75a56d9934f24c5ff Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 6 Aug 2025 15:01:37 -0500 Subject: [PATCH 0093/3206] dt-bindings: phy: marvell,comphy-cp110: Fix clock and child node constraints In converting marvell,comphy-cp110 to schema, the constraints for clocks on marvell,comphy-a3700 are wrong, the maximum number of child nodes is wrong, and the phy nodes may have a 'connector' child node: phy@18300 (marvell,comphy-a3700): clock-names: False schema does not allow ['xtal'] phy@120000 (marvell,comphy-cp110): 'phy@3', 'phy@4', 'phy@5' do not match any of the regexes: '^phy@[0-2]$', '^pinctrl-[0-9]+$' phy@120000 (marvell,comphy-cp110): phy@2: 'connector' does not match any of the regexes: '^pinctrl-[0-9]+$' Fixes: 50355ac70d4f ("dt-bindings: phy: Convert marvell,comphy-cp110 to DT schema") Signed-off-by: Rob Herring (Arm) Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20250806200138.1366189-1-robh@kernel.org Signed-off-by: Vinod Koul --- .../bindings/phy/marvell,comphy-cp110.yaml | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml b/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml index d9501df4288692..c35d3164280538 100644 --- a/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml +++ b/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml @@ -47,21 +47,19 @@ properties: const: 0 clocks: + minItems: 1 maxItems: 3 - description: Reference clocks for CP110; MG clock, MG Core clock, AXI clock clock-names: - items: - - const: mg_clk - - const: mg_core_clk - - const: axi_clk + minItems: 1 + maxItems: 3 marvell,system-controller: description: Phandle to the Marvell system controller (CP110 only) $ref: /schemas/types.yaml#/definitions/phandle patternProperties: - '^phy@[0-2]$': + '^phy@[0-5]$': description: A COMPHY lane child node type: object additionalProperties: false @@ -69,10 +67,14 @@ patternProperties: properties: reg: description: COMPHY lane number + maximum: 5 '#phy-cells': const: 1 + connector: + type: object + required: - reg - '#phy-cells' @@ -91,13 +93,24 @@ allOf: then: properties: - clocks: false - clock-names: false + clocks: + maxItems: 1 + clock-names: + const: xtal required: - reg-names else: + properties: + clocks: + minItems: 3 + clock-names: + items: + - const: mg_clk + - const: mg_core_clk + - const: axi_clk + required: - marvell,system-controller From bca065733afd1e3a89a02f05ffe14e966cd5f78e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 24 Jul 2025 15:12:04 +0200 Subject: [PATCH 0094/3206] phy: tegra: xusb: fix device and OF node leak at probe Make sure to drop the references taken to the PMC OF node and device by of_parse_phandle() and of_find_device_by_node() during probe. Note that holding a reference to the PMC device does not prevent the PMC regmap from going away (e.g.
if the PMC driver is unbound) so there is no need to keep the reference. Fixes: 2d1021487273 ("phy: tegra: xusb: Add wake/sleepwalk for Tegra210") Cc: stable@vger.kernel.org # 5.14 Cc: JC Kuo Signed-off-by: Johan Hovold Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250724131206.2211-2-johan@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/tegra/xusb-tegra210.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/phy/tegra/xusb-tegra210.c b/drivers/phy/tegra/xusb-tegra210.c index ebc8a7e21a3181..3409924498e9cf 100644 --- a/drivers/phy/tegra/xusb-tegra210.c +++ b/drivers/phy/tegra/xusb-tegra210.c @@ -3164,18 +3164,22 @@ tegra210_xusb_padctl_probe(struct device *dev, } pdev = of_find_device_by_node(np); + of_node_put(np); if (!pdev) { dev_warn(dev, "PMC device is not available\n"); goto out; } - if (!platform_get_drvdata(pdev)) + if (!platform_get_drvdata(pdev)) { + put_device(&pdev->dev); return ERR_PTR(-EPROBE_DEFER); + } padctl->regmap = dev_get_regmap(&pdev->dev, "usb_sleepwalk"); if (!padctl->regmap) dev_info(dev, "failed to find PMC regmap\n"); + put_device(&pdev->dev); out: return &padctl->base; } From 64961557efa1b98f375c0579779e7eeda1a02c42 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 24 Jul 2025 15:12:05 +0200 Subject: [PATCH 0095/3206] phy: ti: omap-usb2: fix device leak at unbind Make sure to drop the reference to the control device taken by of_find_device_by_node() during probe when the driver is unbound. Fixes: 478b6c7436c2 ("usb: phy: omap-usb2: Don't use omap_get_control_dev()") Cc: stable@vger.kernel.org # 3.13 Cc: Roger Quadros Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20250724131206.2211-3-johan@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/ti/phy-omap-usb2.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/phy/ti/phy-omap-usb2.c b/drivers/phy/ti/phy-omap-usb2.c index c1a0ef979142ce..c444bb2530ca29 100644 --- a/drivers/phy/ti/phy-omap-usb2.c +++ b/drivers/phy/ti/phy-omap-usb2.c @@ -363,6 +363,13 @@ static void omap_usb2_init_errata(struct omap_usb *phy) phy->flags |= OMAP_USB2_DISABLE_CHRG_DET; } +static void omap_usb2_put_device(void *_dev) +{ + struct device *dev = _dev; + + put_device(dev); +} + static int omap_usb2_probe(struct platform_device *pdev) { struct omap_usb *phy; @@ -373,6 +380,7 @@ static int omap_usb2_probe(struct platform_device *pdev) struct device_node *control_node; struct platform_device *control_pdev; const struct usb_phy_data *phy_data; + int ret; phy_data = device_get_match_data(&pdev->dev); if (!phy_data) @@ -423,6 +431,11 @@ static int omap_usb2_probe(struct platform_device *pdev) return -EINVAL; } phy->control_dev = &control_pdev->dev; + + ret = devm_add_action_or_reset(&pdev->dev, omap_usb2_put_device, + phy->control_dev); + if (ret) + return ret; } else { if (of_property_read_u32_index(node, "syscon-phy-power", 1, From e19bcea99749ce8e8f1d359f68ae03210694ad56 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 24 Jul 2025 15:12:06 +0200 Subject: [PATCH 0096/3206] phy: ti-pipe3: fix device leak at unbind Make sure to drop the reference to the control device taken by of_find_device_by_node() during probe when the driver is unbound. 
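The shape of both this fix and the omap-usb2 one above can be sketched in isolation. This is illustrative only: the example_* names and the "ctrl-module" property are placeholders, and error handling is trimmed. of_find_device_by_node() returns the device with a reference held, so a devm action is registered to drop it automatically on unbind or on a later probe failure:

  #include <linux/device.h>
  #include <linux/of.h>
  #include <linux/of_platform.h>
  #include <linux/platform_device.h>

  static void example_put_device(void *data)
  {
          put_device(data);       /* drop the of_find_device_by_node() ref */
  }

  static int example_get_control_dev(struct platform_device *pdev)
  {
          struct device_node *np;
          struct platform_device *control;
          int ret;

          np = of_parse_phandle(pdev->dev.of_node, "ctrl-module", 0);
          if (!np)
                  return -EINVAL;

          control = of_find_device_by_node(np);   /* takes a device ref */
          of_node_put(np);        /* the OF node ref is no longer needed */
          if (!control)
                  return -EPROBE_DEFER;

          /* Tie the device ref to this binding's lifetime. */
          ret = devm_add_action_or_reset(&pdev->dev, example_put_device,
                                         &control->dev);
          if (ret)
                  return ret;

          return 0;
  }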
Fixes: 918ee0d21ba4 ("usb: phy: omap-usb3: Don't use omap_get_control_dev()") Cc: stable@vger.kernel.org # 3.13 Cc: Roger Quadros Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20250724131206.2211-4-johan@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/ti/phy-ti-pipe3.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/phy/ti/phy-ti-pipe3.c b/drivers/phy/ti/phy-ti-pipe3.c index da2cbacb982c6b..ae764d6524c99a 100644 --- a/drivers/phy/ti/phy-ti-pipe3.c +++ b/drivers/phy/ti/phy-ti-pipe3.c @@ -667,12 +667,20 @@ static int ti_pipe3_get_clk(struct ti_pipe3 *phy) return 0; } +static void ti_pipe3_put_device(void *_dev) +{ + struct device *dev = _dev; + + put_device(dev); +} + static int ti_pipe3_get_sysctrl(struct ti_pipe3 *phy) { struct device *dev = phy->dev; struct device_node *node = dev->of_node; struct device_node *control_node; struct platform_device *control_pdev; + int ret; phy->phy_power_syscon = syscon_regmap_lookup_by_phandle(node, "syscon-phy-power"); @@ -704,6 +712,11 @@ static int ti_pipe3_get_sysctrl(struct ti_pipe3 *phy) } phy->control_dev = &control_pdev->dev; + + ret = devm_add_action_or_reset(dev, ti_pipe3_put_device, + phy->control_dev); + if (ret) + return ret; } if (phy->mode == PIPE3_MODE_PCIE) { From aac1256a41cfbbaca12d6c0a5753d1e3b8d2d8bf Mon Sep 17 00:00:00 2001 From: Ziyue Zhang Date: Fri, 25 Jul 2025 18:22:29 +0800 Subject: [PATCH 0097/3206] dt-bindings: phy: qcom,sc8280xp-qmp-pcie-phy: Update pcie phy bindings The gcc_aux_clk is required by the PCIe controller but not by the PCIe PHY. In PCIe PHY, the source of aux_clk used in low-power mode should be gcc_phy_aux_clk. Hence, remove gcc_aux_clk and replace it with gcc_phy_aux_clk. Fixes: fd2d4e4c1986 ("dt-bindings: phy: qcom,qmp: Add sa8775p QMP PCIe PHY") Signed-off-by: Ziyue Zhang Acked-by: Rob Herring (Arm) Reviewed-by: Johan Hovold Link: https://lore.kernel.org/r/20250725102231.3608298-2-ziyue.zhang@oss.qualcomm.com Signed-off-by: Vinod Koul --- .../devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml index a1ae8c7988c891..b6f140bf5b3b2f 100644 --- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml @@ -176,6 +176,8 @@ allOf: compatible: contains: enum: + - qcom,sa8775p-qmp-gen4x2-pcie-phy + - qcom,sa8775p-qmp-gen4x4-pcie-phy - qcom,sc8280xp-qmp-gen3x1-pcie-phy - qcom,sc8280xp-qmp-gen3x2-pcie-phy - qcom,sc8280xp-qmp-gen3x4-pcie-phy @@ -197,8 +199,6 @@ allOf: contains: enum: - qcom,qcs8300-qmp-gen4x2-pcie-phy - - qcom,sa8775p-qmp-gen4x2-pcie-phy - - qcom,sa8775p-qmp-gen4x4-pcie-phy then: properties: clocks: From 6af515c9f3ccec3eb8a262ca86bef2c499d07951 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 23 Jul 2025 11:21:52 -0400 Subject: [PATCH 0098/3206] dlm: check for defined force value in dlm_lockspace_release Force values over 3 are undefined, so don't treat them as 3. 
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 1929327ffbe1cf..ee11a70def92d5 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -730,7 +730,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_device_deregister(ls); - if (force < 3 && dlm_user_daemon_available()) + if (force != 3 && dlm_user_daemon_available()) do_uevent(ls, 0); dlm_recoverd_stop(ls); From bea90085dcb0f9a75748e73d723bde557a5ebf1a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 23 Jul 2025 11:21:53 -0400 Subject: [PATCH 0099/3206] dlm: use defines for force values in dlm_release_lockspace Clarify the use of the force parameter by renaming it to "release_option" and adding defines (with descriptions) for each of the accepted values. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- drivers/md/md-cluster.c | 4 ++-- fs/dlm/lockspace.c | 20 +++++++++----------- fs/dlm/user.c | 6 +++--- fs/gfs2/lock_dlm.c | 4 ++-- fs/ocfs2/stack_user.c | 2 +- include/linux/dlm.h | 26 +++++++++++++++++++++++++- 6 files changed, 42 insertions(+), 20 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 5497eaee96e7d3..6e9a0045f0ffca 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -979,7 +979,7 @@ static int join(struct mddev *mddev, int nodes) lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); mddev->cluster_info = NULL; kfree(cinfo); return ret; @@ -1042,7 +1042,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); kfree(cinfo); return 0; } diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index ee11a70def92d5..be8dbf48222953 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -671,19 +671,20 @@ int dlm_new_user_lockspace(const char *name, const char *cluster, This is because there may be LKBs queued as ASTs that have been unlinked from their RSBs and are pending deletion once the AST has been delivered */ -static int lockspace_busy(struct dlm_ls *ls, int force) +static int lockspace_busy(struct dlm_ls *ls, int release_option) { struct dlm_lkb *lkb; unsigned long id; int rv = 0; read_lock_bh(&ls->ls_lkbxa_lock); - if (force == 0) { + if (release_option == DLM_RELEASE_NO_LOCKS) { xa_for_each(&ls->ls_lkbxa, id, lkb) { rv = 1; break; } - } else if (force == 1) { + } else if (release_option == DLM_RELEASE_UNUSED) { + /* TODO: handle this UNUSED option as NO_LOCKS in later patch */ xa_for_each(&ls->ls_lkbxa, id, lkb) { if (lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV) { @@ -698,11 +699,11 @@ static int lockspace_busy(struct dlm_ls *ls, int force) return rv; } -static int release_lockspace(struct dlm_ls *ls, int force) +static int release_lockspace(struct dlm_ls *ls, int release_option) { int busy, rv; - busy = lockspace_busy(ls, force); + busy = lockspace_busy(ls, release_option); spin_lock_bh(&lslist_lock); if (ls->ls_create_count == 1) { @@ -730,7 +731,8 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_device_deregister(ls); - if (force != 3 && dlm_user_daemon_available()) + if (release_option != DLM_RELEASE_NO_EVENT && + 
dlm_user_daemon_available()) do_uevent(ls, 0); dlm_recoverd_stop(ls); @@ -782,11 +784,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) * lockspace must continue to function as usual, participating in recoveries, * until this returns. * - * Force has 4 possible values: - * 0 - don't destroy lockspace if it has any LKBs - * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs - * 2 - destroy lockspace regardless of LKBs - * 3 - destroy lockspace as part of a forced shutdown + * See DLM_RELEASE defines for release_option values and their meaning. */ int dlm_release_lockspace(void *lockspace, int force) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 5cb3896be8260f..51daf4acbe318b 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -425,7 +425,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params) dlm_put_lockspace(ls); if (error) - dlm_release_lockspace(lockspace, 0); + dlm_release_lockspace(lockspace, DLM_RELEASE_NO_LOCKS); else error = ls->ls_device.minor; @@ -436,7 +436,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) { dlm_lockspace_t *lockspace; struct dlm_ls *ls; - int error, force = 0; + int error, force = DLM_RELEASE_NO_LOCKS; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -446,7 +446,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) return -ENOENT; if (params->flags & DLM_USER_LSFLG_FORCEFREE) - force = 2; + force = DLM_RELEASE_NORMAL; lockspace = ls; dlm_put_lockspace(ls); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index cee5d199d2d870..aac4dd6d0381f7 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1400,7 +1400,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) return 0; fail_release: - dlm_release_lockspace(ls->ls_dlm, 2); + dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL); fail_free: free_recover_size(ls); fail: @@ -1437,7 +1437,7 @@ static void gdlm_unmount(struct gfs2_sbd *sdp) /* mounted_lock and control_lock will be purged in dlm recovery */ release: if (ls->ls_dlm) { - dlm_release_lockspace(ls->ls_dlm, 2); + dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL); ls->ls_dlm = NULL; } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 0f045e45fa0c3e..765105f1ff8a2c 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -952,7 +952,7 @@ static const struct dlm_lockspace_ops ocfs2_ls_ops = { static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) { version_unlock(conn); - dlm_release_lockspace(conn->cc_lockspace, 2); + dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL); conn->cc_lockspace = NULL; ocfs2_live_connection_drop(conn->cc_private); conn->cc_private = NULL; diff --git a/include/linux/dlm.h b/include/linux/dlm.h index bacda9898f2b6c..cc7a36244893d0 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -87,13 +87,37 @@ int dlm_new_lockspace(const char *name, const char *cluster, const struct dlm_lockspace_ops *ops, void *ops_arg, int *ops_result, dlm_lockspace_t **lockspace); +/* + * dlm_release_lockspace() release_option values: + * + * DLM_RELEASE_NO_LOCKS returns -EBUSY if any locks (lkb's) + * exist in the local lockspace. + * + * DLM_RELEASE_UNUSED previous value that is no longer used. + * + * DLM_RELEASE_NORMAL releases the lockspace regardless of any + * locks managed in the local lockspace. 
+ * + * DLM_RELEASE_NO_EVENT release the lockspace regardless of any + * locks managed in the local lockspace, and does not submit + * a leave event to the cluster manager, so other nodes will + * not be notified that the node should be removed from the + * list of lockspace members. + */ +#define DLM_RELEASE_NO_LOCKS 0 +#define DLM_RELEASE_UNUSED 1 +#define DLM_RELEASE_NORMAL 2 +#define DLM_RELEASE_NO_EVENT 3 + /* * dlm_release_lockspace * * Stop a lockspace. + * + * release_option: see DLM_RELEASE values above. */ -int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force); +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int release_option); /* * dlm_lock From 5665374c7246159dbca1a80a0b54ad27e3379bcb Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 23 Jul 2025 11:21:54 -0400 Subject: [PATCH 0100/3206] dlm: add new RELEASE_RECOVER uevent attribute for release_lockspace RELEASE_RECOVER=0|1 is passed to user space as part of the OFFLINE uevent for leaving a lockspace, and used by subsequent patches. RELEASE_RECOVER=1 tells user space that the release_lockspace/leave should be handled by the remaining lockspace members as if the leaving node had failed. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index be8dbf48222953..6ff666a511c7ae 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -186,12 +186,17 @@ static struct kobj_type dlm_ktype = { static struct kset *dlm_kset; -static int do_uevent(struct dlm_ls *ls, int in) +static int do_uevent(struct dlm_ls *ls, int in, unsigned int release_recover) { - if (in) + char message[512] = {}; + char *envp[] = { message, NULL }; + + if (in) { kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE); - else - kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); + } else { + snprintf(message, 511, "RELEASE_RECOVER=%u", release_recover); + kobject_uevent_env(&ls->ls_kobj, KOBJ_OFFLINE, envp); + } log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving"); @@ -575,7 +580,7 @@ static int new_lockspace(const char *name, const char *cluster, current lockspace members are (via configfs) and then tells the lockspace to start running (via sysfs) in dlm_ls_start(). */ - error = do_uevent(ls, 1); + error = do_uevent(ls, 1, 0); if (error < 0) goto out_recoverd; @@ -592,7 +597,7 @@ static int new_lockspace(const char *name, const char *cluster, return 0; out_members: - do_uevent(ls, 0); + do_uevent(ls, 0, 0); dlm_clear_members(ls); kfree(ls->ls_node_array); out_recoverd: @@ -733,7 +738,7 @@ static int release_lockspace(struct dlm_ls *ls, int release_option) if (release_option != DLM_RELEASE_NO_EVENT && dlm_user_daemon_available()) - do_uevent(ls, 0); + do_uevent(ls, 0, 0); dlm_recoverd_stop(ls); From de7b4869b4ecf5790b0e7875c5522d43d7a61d79 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 23 Jul 2025 11:21:55 -0400 Subject: [PATCH 0101/3206] dlm: add new configfs entry release_recover for lockspace members A new configfs entry is added for a lockspace member: /config/dlm//spaces//nodes//release_recover release_recover can be set to 1 by userspace (dlm_controld process) prior to removing the lockspace member (rmdir of the ). This tells the kernel to handle the removed member as if it had failed, i.e. recovery steps for a failed node should be performed, as opposed to the recovery steps for a node doing a controlled leave.
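For illustration, the user-space side could drive this roughly as follows. This is a hypothetical sketch of what dlm_controld might do, not its actual code: the cluster name "mycluster", lockspace "ls1", and node id "2" are invented, and configfs is assumed to be mounted at /sys/kernel/config:

  #include <fcntl.h>
  #include <unistd.h>

  static int remove_member_as_failed(void)
  {
          const char *attr = "/sys/kernel/config/dlm/mycluster"
                             "/spaces/ls1/nodes/2/release_recover";
          const char *node = "/sys/kernel/config/dlm/mycluster"
                             "/spaces/ls1/nodes/2";
          int fd;

          /* Tell the kernel to treat the upcoming removal as a failure. */
          fd = open(attr, O_WRONLY);
          if (fd < 0)
                  return -1;
          if (write(fd, "1", 1) != 1) {
                  close(fd);
                  return -1;
          }
          close(fd);

          /* Removing the node directory invokes drop_node() in the kernel. */
          return rmdir(node);
  }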
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/dlm/config.h | 2 ++ fs/dlm/member.c | 7 ++++-- 3 files changed, 69 insertions(+), 4 deletions(-) diff --git a/fs/dlm/config.c b/fs/dlm/config.c index a23fd524a6ee35..a0d75b5c83c632 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -26,6 +26,7 @@ /* * /config/dlm//spaces//nodes//nodeid (refers to ) * /config/dlm//spaces//nodes//weight + * /config/dlm//spaces//nodes//release_recover * /config/dlm//comms//nodeid (refers to ) * /config/dlm//comms//local * /config/dlm//comms//addr (write only) @@ -267,6 +268,7 @@ enum { enum { NODE_ATTR_NODEID = 0, NODE_ATTR_WEIGHT, + NODE_ATTR_RELEASE_RECOVER, }; struct dlm_clusters { @@ -280,6 +282,8 @@ struct dlm_spaces { struct dlm_space { struct config_group group; struct list_head members; + struct list_head members_gone; + int members_gone_count; struct mutex members_lock; int members_count; struct dlm_nodes *nds; @@ -310,6 +314,14 @@ struct dlm_node { int weight; int new; int comm_seq; /* copy of cm->seq when nd->nodeid is set */ + unsigned int release_recover; +}; + +struct dlm_member_gone { + int nodeid; + unsigned int release_recover; + + struct list_head list; /* space->members_gone */ }; static struct configfs_group_operations clusters_ops = { @@ -480,6 +492,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) configfs_add_default_group(&nds->ns_group, &sp->group); INIT_LIST_HEAD(&sp->members); + INIT_LIST_HEAD(&sp->members_gone); mutex_init(&sp->members_lock); sp->members_count = 0; sp->nds = nds; @@ -587,10 +600,20 @@ static void drop_node(struct config_group *g, struct config_item *i) { struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); struct dlm_node *nd = config_item_to_node(i); + struct dlm_member_gone *mb_gone; + + mb_gone = kzalloc(sizeof(*mb_gone), GFP_KERNEL); + if (!mb_gone) + return; mutex_lock(&sp->members_lock); list_del(&nd->list); sp->members_count--; + + mb_gone->nodeid = nd->nodeid; + mb_gone->release_recover = nd->release_recover; + list_add(&mb_gone->list, &sp->members_gone); + sp->members_gone_count++; mutex_unlock(&sp->members_lock); config_item_put(i); @@ -815,12 +838,34 @@ static ssize_t node_weight_store(struct config_item *item, const char *buf, return len; } +static ssize_t node_release_recover_show(struct config_item *item, char *buf) +{ + struct dlm_node *n = config_item_to_node(item); + + return sprintf(buf, "%u\n", n->release_recover); +} + +static ssize_t node_release_recover_store(struct config_item *item, + const char *buf, size_t len) +{ + struct dlm_node *n = config_item_to_node(item); + int rc; + + rc = kstrtouint(buf, 0, &n->release_recover); + if (rc) + return rc; + + return len; +} + CONFIGFS_ATTR(node_, nodeid); CONFIGFS_ATTR(node_, weight); +CONFIGFS_ATTR(node_, release_recover); static struct configfs_attribute *node_attrs[] = { [NODE_ATTR_NODEID] = &node_attr_nodeid, [NODE_ATTR_WEIGHT] = &node_attr_weight, + [NODE_ATTR_RELEASE_RECOVER] = &node_attr_release_recover, NULL, }; @@ -882,9 +927,10 @@ static void put_comm(struct dlm_comm *cm) int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out) { + struct dlm_member_gone *mb_gone, *mb_safe; + struct dlm_config_node *nodes, *node; struct dlm_space *sp; struct dlm_node *nd; - struct dlm_config_node *nodes, *node; int rv, count; sp = get_space(lsname); @@ -898,7 +944,7 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, 
goto out; } - count = sp->members_count; + count = sp->members_count + sp->members_gone_count; nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); if (!nodes) { @@ -917,6 +963,20 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, nd->new = 0; } + /* we delay removing nodes until here as configfs does + * not support additional attributes for rmdir(). + */ + list_for_each_entry_safe(mb_gone, mb_safe, &sp->members_gone, list) { + node->nodeid = mb_gone->nodeid; + node->release_recover = mb_gone->release_recover; + node->gone = true; + node++; + + list_del(&mb_gone->list); + sp->members_gone_count--; + kfree(mb_gone); + } + *count_out = count; *nodes_out = nodes; rv = 0; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 13a3d0b2619425..4ebd45f752762c 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -17,8 +17,10 @@ struct dlm_config_node { int nodeid; int weight; + bool gone; int new; uint32_t comm_seq; + unsigned int release_recover; }; extern const struct rhashtable_params dlm_rhash_rsb_params; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index b0864c93230f53..152d2cb16f5914 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -569,10 +569,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { node = find_config_node(rv, memb->nodeid); - if (node && !node->new) + if (node && !node->new && !node->gone) continue; - if (!node) { + if (node->gone) { log_rinfo(ls, "remove member %d", memb->nodeid); } else { /* removed and re-added */ log_rinfo(ls, "remove member %d comm_seq %u %u", @@ -591,6 +591,9 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) for (i = 0; i < rv->nodes_count; i++) { node = &rv->nodes[i]; + if (node->gone) + continue; + if (dlm_is_member(ls, node->nodeid)) continue; error = dlm_add_member(ls, node); From 6f4f4ca5caf73de5e86329547d4527b3e0c08488 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 23 Jul 2025 11:21:56 -0400 Subject: [PATCH 0102/3206] dlm: add new flag DLM_RELEASE_RECOVER for dlm_lockspace_release When dlm_lockspace_release() is passed DLM_RELEASE_RECOVER, it tells the dlm to handle the release/leave as if the node had failed, i.e. perform recovery steps for a failed node, like recover_slot(). When DLM_RELEASE_RECOVER is set: - dlm_release_lockspace() includes RELEASE_RECOVER=1 in the OFFLINE uevent sent to userspace. - userspace/dlm_controld sends a message to all lockspace members indicating that the subsequent node removal should be handled as if the node had failed. - when dlm_controld on all nodes receives the new message, it sets the release_recover configfs entry to 1 for the node. - when the dlm/kernel next performs recovery and removes the node, it will see that release_recover has been set, and will perform recovery steps for the node as if it had failed, e.g. the recover_slot() callback is called to notify the fs.
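From a kernel lockspace user's perspective, the flag slots into the existing API as sketched below. The my_* names are invented and only the callback relevant here is shown; the ops wiring follows the dlm_lockspace_ops pattern used by existing callers such as gfs2:

  #include <linux/dlm.h>
  #include <linux/printk.h>

  /* Runs on the remaining members when a node is removed as if it had
   * failed -- which now includes a DLM_RELEASE_RECOVER style leave. */
  static void my_recover_slot(void *ops_arg, struct dlm_slot *slot)
  {
          pr_info("recovering slot %d for nodeid %d\n",
                  slot->slot, slot->nodeid);
  }

  static const struct dlm_lockspace_ops my_ops = {
          .recover_slot = my_recover_slot,
  };

  /* Leave so that the remaining nodes run failure recovery for us. */
  static void my_leave_as_failed(dlm_lockspace_t *ls)
  {
          dlm_release_lockspace(ls, DLM_RELEASE_RECOVER);
  }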
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 2 +- fs/dlm/member.c | 14 ++++++++++---- include/linux/dlm.h | 5 +++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 6ff666a511c7ae..d986b7ef153dcf 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -738,7 +738,7 @@ static int release_lockspace(struct dlm_ls *ls, int release_option) if (release_option != DLM_RELEASE_NO_EVENT && dlm_user_daemon_available()) - do_uevent(ls, 0, 0); + do_uevent(ls, 0, (release_option == DLM_RELEASE_RECOVER)); dlm_recoverd_stop(ls); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 152d2cb16f5914..356337102015ec 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -478,7 +478,8 @@ static void dlm_lsop_recover_prep(struct dlm_ls *ls) ls->ls_ops->recover_prep(ls->ls_ops_arg); } -static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) +static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb, + unsigned int release_recover) { struct dlm_slot slot; uint32_t seq; @@ -495,7 +496,7 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) error = dlm_comm_seq(memb->nodeid, &seq, false); - if (!error && seq == memb->comm_seq) + if (!release_recover && !error && seq == memb->comm_seq) return; slot.nodeid = memb->nodeid; @@ -552,6 +553,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) struct dlm_member *memb, *safe; struct dlm_config_node *node; int i, error, neg = 0, low = -1; + unsigned int release_recover; /* previously removed members that we've not finished removing need to * count as a negative change so the "neg" recovery steps will happen @@ -572,8 +574,12 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) if (node && !node->new && !node->gone) continue; + release_recover = 0; + if (node->gone) { - log_rinfo(ls, "remove member %d", memb->nodeid); + release_recover = node->release_recover; + log_rinfo(ls, "remove member %d%s", memb->nodeid, + release_recover ? " (release_recover)" : ""); } else { /* removed and re-added */ log_rinfo(ls, "remove member %d comm_seq %u %u", @@ -584,7 +590,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) list_move(&memb->list, &ls->ls_nodes_gone); remove_remote_member(memb->nodeid); ls->ls_num_nodes--; - dlm_lsop_recover_slot(ls, memb); + dlm_lsop_recover_slot(ls, memb, release_recover); } /* add new members to ls_nodes */ diff --git a/include/linux/dlm.h b/include/linux/dlm.h index cc7a36244893d0..108eb953eb1811 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -103,11 +103,16 @@ int dlm_new_lockspace(const char *name, const char *cluster, * a leave event to the cluster manager, so other nodes will * not be notified that the node should be removed from the * list of lockspace members. + * + * DLM_RELEASE_RECOVER like DLM_RELEASE_NORMAL, but the remaining + * nodes will handle the removal of the node as if the node + * had failed, e.g. the recover_slot() callback would be used. 
*/ #define DLM_RELEASE_NO_LOCKS 0 #define DLM_RELEASE_UNUSED 1 #define DLM_RELEASE_NORMAL 2 #define DLM_RELEASE_NO_EVENT 3 +#define DLM_RELEASE_RECOVER 4 /* * dlm_release_lockspace From f20e70a341dd67ac4aca23f93b6acdca6779e69d Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Thu, 7 Aug 2025 22:05:45 +0800 Subject: [PATCH 0103/3206] selinux: Remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes. Signed-off-by: Qianfeng Rong Acked-by: Stephen Smalley [PM: fixed horizontal spacing / alignment, line wraps] Signed-off-by: Paul Moore --- security/selinux/avc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 4b4837a20225bc..430b0e23ee00dc 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -292,27 +292,26 @@ static struct avc_xperms_decision_node struct avc_xperms_decision_node *xpd_node; struct extended_perms_decision *xpd; - xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, - GFP_NOWAIT | __GFP_NOWARN); + xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT); if (!xpd_node) return NULL; xpd = &xpd_node->xpd; if (which & XPERMS_ALLOWED) { xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!xpd->allowed) goto error; } if (which & XPERMS_AUDITALLOW) { xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!xpd->auditallow) goto error; } if (which & XPERMS_DONTAUDIT) { xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!xpd->dontaudit) goto error; } @@ -340,7 +339,7 @@ static struct avc_xperms_node *avc_xperms_alloc(void) { struct avc_xperms_node *xp_node; - xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT | __GFP_NOWARN); + xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT); if (!xp_node) return xp_node; INIT_LIST_HEAD(&xp_node->xpd_head); @@ -495,7 +494,7 @@ static struct avc_node *avc_alloc_node(void) { struct avc_node *node; - node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT | __GFP_NOWARN); + node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT); if (!node) goto out; From 1f54d5e5cd2a03cb6dd8326db576f8763f9b6785 Mon Sep 17 00:00:00 2001 From: Daniel Almeida Date: Mon, 11 Aug 2025 13:03:39 -0300 Subject: [PATCH 0104/3206] rust: irq: add irq module Add the IRQ module. Future patches will then introduce support for IRQ registrations and handlers. Reviewed-by: Alice Ryhl Tested-by: Joel Fernandes Tested-by: Dirk Behme Signed-off-by: Daniel Almeida Link: https://lore.kernel.org/r/20250811-topics-tyr-request_irq2-v9-1-0485dcd9bcbf@collabora.com Signed-off-by: Danilo Krummrich --- rust/kernel/irq.rs | 11 +++++++++++ rust/kernel/lib.rs | 1 + 2 files changed, 12 insertions(+) create mode 100644 rust/kernel/irq.rs diff --git a/rust/kernel/irq.rs b/rust/kernel/irq.rs new file mode 100644 index 00000000000000..fae7b15effc80c --- /dev/null +++ b/rust/kernel/irq.rs @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! IRQ abstractions. +//! +//! An IRQ is an interrupt request from a device. It is used to get the CPU's +//! attention so it can service a hardware event in a timely manner. +//! +//! 
The current abstractions handle IRQ requests and handlers, i.e.: they allow +//! drivers to register a handler for a given IRQ line. +//! +//! C header: [`include/linux/interrupt.h`](srctree/include/linux/interrupt.h) diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index ed53169e795c0b..f8db761c5c95fc 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -92,6 +92,7 @@ pub mod fs; pub mod init; pub mod io; pub mod ioctl; +pub mod irq; pub mod jump_label; #[cfg(CONFIG_KUNIT)] pub mod kunit; From 746680ec6696585e30db3e18c93a63df9cbec39c Mon Sep 17 00:00:00 2001 From: Daniel Almeida Date: Mon, 11 Aug 2025 13:03:40 -0300 Subject: [PATCH 0105/3206] rust: irq: add flags module Manipulating IRQ flags (i.e.: IRQF_*) will soon be necessary, especially to register IRQ handlers through bindings::request_irq(). Add a kernel::irq::Flags for that purpose. Reviewed-by: Alice Ryhl Tested-by: Joel Fernandes Tested-by: Dirk Behme Signed-off-by: Daniel Almeida Link: https://lore.kernel.org/r/20250811-topics-tyr-request_irq2-v9-2-0485dcd9bcbf@collabora.com [ Use expect(dead_code) for into_inner(), fix broken intra-doc link and typo. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/irq.rs | 5 ++ rust/kernel/irq/flags.rs | 125 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 rust/kernel/irq/flags.rs diff --git a/rust/kernel/irq.rs b/rust/kernel/irq.rs index fae7b15effc80c..068df2fea31de5 100644 --- a/rust/kernel/irq.rs +++ b/rust/kernel/irq.rs @@ -9,3 +9,8 @@ //! drivers to register a handler for a given IRQ line. //! //! C header: [`include/linux/interrupt.h`](srctree/include/linux/interrupt.h) + +/// Flags to be used when registering IRQ handlers. +mod flags; + +pub use flags::Flags; diff --git a/rust/kernel/irq/flags.rs b/rust/kernel/irq/flags.rs new file mode 100644 index 00000000000000..d769d68e8edcb4 --- /dev/null +++ b/rust/kernel/irq/flags.rs @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +// SPDX-FileCopyrightText: Copyright 2025 Collabora ltd. + +use crate::bindings; +use crate::prelude::*; + +/// Flags to be used when registering IRQ handlers. +/// +/// Flags can be used to request specific behaviors when registering an IRQ +/// handler, and can be combined using the `|`, `&`, and `!` operators to +/// further control the system's behavior. +/// +/// A common use case is to register a shared interrupt, as sharing the line +/// between devices is increasingly common in modern systems and is even +/// required for some buses. This requires setting [`Flags::SHARED`] when +/// requesting the interrupt. Other use cases include setting the trigger type +/// through `Flags::TRIGGER_*`, which determines when the interrupt fires, or +/// controlling whether the interrupt is masked after the handler runs by using +/// [`Flags::ONESHOT`]. +/// +/// If an invalid combination of flags is provided, the system will refuse to +/// register the handler, and lower layers will enforce certain flags when +/// necessary. This means, for example, that all the +/// `crate::irq::Registration` for a shared interrupt have to agree on +/// [`Flags::SHARED`] and on the same trigger type, if set. +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Flags(c_ulong); + +impl Flags { + /// Use the interrupt line as already configured. + pub const TRIGGER_NONE: Flags = Flags::new(bindings::IRQF_TRIGGER_NONE); + + /// The interrupt is triggered when the signal goes from low to high.
+ pub const TRIGGER_RISING: Flags = Flags::new(bindings::IRQF_TRIGGER_RISING); + + /// The interrupt is triggered when the signal goes from high to low. + pub const TRIGGER_FALLING: Flags = Flags::new(bindings::IRQF_TRIGGER_FALLING); + + /// The interrupt is triggered while the signal is held high. + pub const TRIGGER_HIGH: Flags = Flags::new(bindings::IRQF_TRIGGER_HIGH); + + /// The interrupt is triggered while the signal is held low. + pub const TRIGGER_LOW: Flags = Flags::new(bindings::IRQF_TRIGGER_LOW); + + /// Allow sharing the IRQ among several devices. + pub const SHARED: Flags = Flags::new(bindings::IRQF_SHARED); + + /// Set by callers when they expect sharing mismatches to occur. + pub const PROBE_SHARED: Flags = Flags::new(bindings::IRQF_PROBE_SHARED); + + /// Flag to mark this interrupt as a timer interrupt. + pub const TIMER: Flags = Flags::new(bindings::IRQF_TIMER); + + /// Interrupt is per CPU. + pub const PERCPU: Flags = Flags::new(bindings::IRQF_PERCPU); + + /// Flag to exclude this interrupt from irq balancing. + pub const NOBALANCING: Flags = Flags::new(bindings::IRQF_NOBALANCING); + + /// Interrupt is used for polling (only the interrupt that is registered + /// first in a shared interrupt is considered for performance reasons). + pub const IRQPOLL: Flags = Flags::new(bindings::IRQF_IRQPOLL); + + /// Interrupt is not re-enabled after the hardirq handler has finished. Used by + /// threaded interrupts which need to keep the irq line disabled until the + /// threaded handler has been run. + pub const ONESHOT: Flags = Flags::new(bindings::IRQF_ONESHOT); + + /// Do not disable this IRQ during suspend. Does not guarantee that this + /// interrupt will wake the system from a suspended state. + pub const NO_SUSPEND: Flags = Flags::new(bindings::IRQF_NO_SUSPEND); + + /// Force enable it on resume even if [`Flags::NO_SUSPEND`] is set. + pub const FORCE_RESUME: Flags = Flags::new(bindings::IRQF_FORCE_RESUME); + + /// Interrupt cannot be threaded. + pub const NO_THREAD: Flags = Flags::new(bindings::IRQF_NO_THREAD); + + /// Resume IRQ early during syscore instead of at device resume time. + pub const EARLY_RESUME: Flags = Flags::new(bindings::IRQF_EARLY_RESUME); + + /// If the IRQ is shared with a [`Flags::NO_SUSPEND`] user, execute this + /// interrupt handler after suspending interrupts. For system wakeup devices + /// users need to implement wakeup detection in their interrupt handlers. + pub const COND_SUSPEND: Flags = Flags::new(bindings::IRQF_COND_SUSPEND); + + /// Don't enable IRQ or NMI automatically when users request it. Users will + /// enable it explicitly by `enable_irq` or `enable_nmi` later. + pub const NO_AUTOEN: Flags = Flags::new(bindings::IRQF_NO_AUTOEN); + + /// Exclude from runaway detection for IPI and similar handlers, depends on + /// `PERCPU`.
+ pub const NO_DEBUG: Flags = Flags::new(bindings::IRQF_NO_DEBUG); + + #[expect(dead_code)] + pub(crate) fn into_inner(self) -> c_ulong { + self.0 + } + + const fn new(value: u32) -> Self { + build_assert!(value as u64 <= c_ulong::MAX as u64); + Self(value as c_ulong) + } +} + +impl core::ops::BitOr for Flags { + type Output = Self; + fn bitor(self, rhs: Self) -> Self::Output { + Self(self.0 | rhs.0) + } +} + +impl core::ops::BitAnd for Flags { + type Output = Self; + fn bitand(self, rhs: Self) -> Self::Output { + Self(self.0 & rhs.0) + } +} + +impl core::ops::Not for Flags { + type Output = Self; + fn not(self) -> Self::Output { + Self(!self.0) + } +} From 0851d34a8cc3a0a43acd79a5c4980d45c6471aab Mon Sep 17 00:00:00 2001 From: Daniel Almeida Date: Mon, 11 Aug 2025 13:03:41 -0300 Subject: [PATCH 0106/3206] rust: irq: add support for non-threaded IRQs and handlers This patch adds support for non-threaded IRQs and handlers through irq::Registration and the irq::Handler trait. Registering an irq is dependent upon having an IrqRequest that was previously allocated by a given device. This will be introduced in subsequent patches. Tested-by: Joel Fernandes Tested-by: Dirk Behme Reviewed-by: Alice Ryhl Signed-off-by: Daniel Almeida Link: https://lore.kernel.org/r/20250811-topics-tyr-request_irq2-v9-3-0485dcd9bcbf@collabora.com [ Remove expect(dead_code) from Flags::into_inner(), add expect(dead_code) to IrqRequest::new(), fix intra-doc links. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/bindings/bindings_helper.h | 1 + rust/helpers/helpers.c | 1 + rust/helpers/irq.c | 9 ++ rust/kernel/irq.rs | 5 + rust/kernel/irq/flags.rs | 3 +- rust/kernel/irq/request.rs | 265 ++++++++++++++++++++++++++++++++ 6 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 rust/helpers/irq.c create mode 100644 rust/kernel/irq/request.rs diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 84d60635e8a9ba..69a975da829f0c 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 7cf7fe95e41dd5..44b2005d50140d 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -22,6 +22,7 @@ #include "dma.c" #include "drm.c" #include "err.c" +#include "irq.c" #include "fs.c" #include "io.c" #include "jump_label.c" diff --git a/rust/helpers/irq.c b/rust/helpers/irq.c new file mode 100644 index 00000000000000..1faca428e2c047 --- /dev/null +++ b/rust/helpers/irq.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +int rust_helper_request_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *name, void *dev) +{ + return request_irq(irq, handler, flags, name, dev); +} diff --git a/rust/kernel/irq.rs b/rust/kernel/irq.rs index 068df2fea31de5..c1019bc36ad1e7 100644 --- a/rust/kernel/irq.rs +++ b/rust/kernel/irq.rs @@ -13,4 +13,9 @@ /// Flags to be used when registering IRQ handlers. mod flags; +/// IRQ allocation and handling.
+mod request;
+
 pub use flags::Flags;
+
+pub use request::{Handler, IrqRequest, IrqReturn, Registration};
diff --git a/rust/kernel/irq/flags.rs b/rust/kernel/irq/flags.rs
index d769d68e8edcb4..adfde96ec47cf4 100644
--- a/rust/kernel/irq/flags.rs
+++ b/rust/kernel/irq/flags.rs
@@ -21,7 +21,7 @@ use crate::prelude::*;
 /// If an invalid combination of flags is provided, the system will refuse to
 /// register the handler, and lower layers will enforce certain flags when
 /// necessary. This means, for example, that all the
-/// `crate::irq::Registration` for a shared interrupt have to agree on
+/// [`crate::irq::Registration`] for a shared interrupt have to agree on
 /// [`Flags::SHARED`] and on the same trigger type, if set.
 #[derive(Clone, Copy, PartialEq, Eq)]
 pub struct Flags(c_ulong);
@@ -92,7 +92,6 @@ impl Flags {
     /// `PERCPU`.
     pub const NO_DEBUG: Flags = Flags::new(bindings::IRQF_NO_DEBUG);
 
-    #[expect(dead_code)]
     pub(crate) fn into_inner(self) -> c_ulong {
         self.0
     }
diff --git a/rust/kernel/irq/request.rs b/rust/kernel/irq/request.rs
new file mode 100644
index 00000000000000..c585811cfdfbf7
--- /dev/null
+++ b/rust/kernel/irq/request.rs
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0
+// SPDX-FileCopyrightText: Copyright 2025 Collabora ltd.
+
+//! This module provides types like [`Registration`] which allow users to
+//! register handlers for a given IRQ line.
+
+use core::marker::PhantomPinned;
+
+use crate::alloc::Allocator;
+use crate::device::{Bound, Device};
+use crate::devres::Devres;
+use crate::error::to_result;
+use crate::irq::flags::Flags;
+use crate::prelude::*;
+use crate::str::CStr;
+use crate::sync::Arc;
+
+/// The value that can be returned from a [`Handler`] or a `ThreadedHandler`.
+#[repr(u32)]
+pub enum IrqReturn {
+    /// The interrupt was not from this device or was not handled.
+    None = bindings::irqreturn_IRQ_NONE,
+
+    /// The interrupt was handled by this device.
+    Handled = bindings::irqreturn_IRQ_HANDLED,
+}
+
+/// Callbacks for an IRQ handler.
+pub trait Handler: Sync {
+    /// The hard IRQ handler.
+    ///
+    /// This is executed in interrupt context, hence all corresponding
+    /// limitations do apply.
+    ///
+    /// All work that does not necessarily need to be executed from
+    /// interrupt context, should be deferred to a threaded handler.
+    /// See also `ThreadedRegistration`.
+    fn handle(&self) -> IrqReturn;
+}
+
+impl<T: ?Sized + Handler> Handler for Arc<T> {
+    fn handle(&self) -> IrqReturn {
+        T::handle(self)
+    }
+}
+
+impl<T: ?Sized + Handler, A: Allocator> Handler for Box<T, A> {
+    fn handle(&self) -> IrqReturn {
+        T::handle(self)
+    }
+}
+
+/// # Invariants
+///
+/// - `self.irq` is the same as the one passed to `request_{threaded}_irq`.
+/// - `cookie` was passed to `request_{threaded}_irq` as the cookie. It is guaranteed to be unique
+///   by the type system, since each call to `new` will return a different instance of
+///   `Registration`.
+#[pin_data(PinnedDrop)]
+struct RegistrationInner {
+    irq: u32,
+    cookie: *mut c_void,
+}
+
+impl RegistrationInner {
+    fn synchronize(&self) {
+        // SAFETY: safe as per the invariants of `RegistrationInner`
+        unsafe { bindings::synchronize_irq(self.irq) };
+    }
+}
+
+#[pinned_drop]
+impl PinnedDrop for RegistrationInner {
+    fn drop(self: Pin<&mut Self>) {
+        // SAFETY:
+        //
+        // Safe as per the invariants of `RegistrationInner` and:
+        //
+        // - The containing struct is `!Unpin` and was initialized using
+        //   pin-init, so it occupied the same memory location for the entirety of
+        //   its lifetime.
+        //
+        // Notice that this will block until all handlers finish executing,
+        // i.e.: at no point will &self be invalid while the handler is running.
+        unsafe { bindings::free_irq(self.irq, self.cookie) };
+    }
+}
+
+// SAFETY: We only use `inner` on drop, which is called at most once with no
+// concurrent access.
+unsafe impl Sync for RegistrationInner {}
+
+// SAFETY: It is safe to send `RegistrationInner` across threads.
+unsafe impl Send for RegistrationInner {}
+
+/// A request for an IRQ line for a given device.
+///
+/// # Invariants
+///
+/// - `irq` is the number of an interrupt source of `dev`.
+/// - `irq` has not been registered yet.
+pub struct IrqRequest<'a> {
+    dev: &'a Device,
+    irq: u32,
+}
+
+impl<'a> IrqRequest<'a> {
+    /// Creates a new IRQ request for the given device and IRQ number.
+    ///
+    /// # Safety
+    ///
+    /// - `irq` should be a valid IRQ number for `dev`.
+    #[expect(dead_code)]
+    pub(crate) unsafe fn new(dev: &'a Device, irq: u32) -> Self {
+        // INVARIANT: `irq` is a valid IRQ number for `dev`.
+        IrqRequest { dev, irq }
+    }
+
+    /// Returns the IRQ number of an [`IrqRequest`].
+    pub fn irq(&self) -> u32 {
+        self.irq
+    }
+}
+
+/// A registration of an IRQ handler for a given IRQ line.
+///
+/// # Examples
+///
+/// The following is an example of using `Registration`. It uses a
+/// [`Completion`] to coordinate between the IRQ
+/// handler and process context. [`Completion`] uses interior mutability, so the
+/// handler can signal with [`Completion::complete_all()`] and the process
+/// context can wait with [`Completion::wait_for_completion()`] even though
+/// there is no way to get a mutable reference to any of the fields in
+/// `Data`.
+///
+/// [`Completion`]: kernel::sync::Completion
+/// [`Completion::complete_all()`]: kernel::sync::Completion::complete_all
+/// [`Completion::wait_for_completion()`]: kernel::sync::Completion::wait_for_completion
+///
+/// ```
+/// use kernel::c_str;
+/// use kernel::device::Bound;
+/// use kernel::irq::{self, Flags, IrqRequest, IrqReturn, Registration};
+/// use kernel::prelude::*;
+/// use kernel::sync::{Arc, Completion};
+///
+/// // Data shared between process and IRQ context.
+/// #[pin_data]
+/// struct Data {
+///     #[pin]
+///     completion: Completion,
+/// }
+///
+/// impl irq::Handler for Data {
+///     // Executed in IRQ context.
+///     fn handle(&self) -> IrqReturn {
+///         self.completion.complete_all();
+///         IrqReturn::Handled
+///     }
+/// }
+///
+/// // Registers an IRQ handler for the given IrqRequest.
+/// //
+/// // This runs in process context and assumes `request` was previously acquired from a device.
+/// fn register_irq(
+///     handler: impl PinInit<Data, Error>,
+///     request: IrqRequest<'_>,
+/// ) -> Result<Arc<Registration<Data>>> {
+///     let registration = Registration::new(request, Flags::SHARED, c_str!("my_device"), handler);
+///
+///     let registration = Arc::pin_init(registration, GFP_KERNEL)?;
+///
+///     registration.handler().completion.wait_for_completion();
+///
+///     Ok(registration)
+/// }
+/// # Ok::<(), Error>(())
+/// ```
+///
+/// # Invariants
+///
+/// * We own an irq handler using `&self.handler` as its private data.
+#[pin_data]
+pub struct Registration<T: Handler + 'static> {
+    #[pin]
+    inner: Devres<RegistrationInner>,
+
+    #[pin]
+    handler: T,
+
+    /// Pinned because we need address stability so that we can pass a pointer
+    /// to the callback.
+    #[pin]
+    _pin: PhantomPinned,
+}
+
+impl<T: Handler + 'static> Registration<T> {
+    /// Registers the IRQ handler with the system for the given IRQ number.
+    pub fn new<'a>(
+        request: IrqRequest<'a>,
+        flags: Flags,
+        name: &'static CStr,
+        handler: impl PinInit<T, Error> + 'a,
+    ) -> impl PinInit<Self, Error> + 'a {
+        try_pin_init!(&this in Self {
+            handler <- handler,
+            inner <- Devres::new(
+                request.dev,
+                try_pin_init!(RegistrationInner {
+                    // SAFETY: `this` is a valid pointer to the `Registration` instance
+                    cookie: unsafe { &raw mut (*this.as_ptr()).handler }.cast(),
+                    irq: {
+                        // SAFETY:
+                        // - The callbacks are valid for use with request_irq.
+                        // - If this succeeds, the slot is guaranteed to be valid until the
+                        //   destructor of Self runs, which will deregister the callbacks
+                        //   before the memory location becomes invalid.
+                        to_result(unsafe {
+                            bindings::request_irq(
+                                request.irq,
+                                Some(handle_irq_callback::<T>),
+                                flags.into_inner(),
+                                name.as_char_ptr(),
+                                (&raw mut (*this.as_ptr()).handler).cast(),
+                            )
+                        })?;
+                        request.irq
+                    }
+                })
+            ),
+            _pin: PhantomPinned,
+        })
+    }
+
+    /// Returns a reference to the handler that was registered with the system.
+    pub fn handler(&self) -> &T {
+        &self.handler
+    }
+
+    /// Wait for pending IRQ handlers on other CPUs.
+    ///
+    /// This will attempt to access the inner [`Devres`] container.
+    pub fn try_synchronize(&self) -> Result {
+        let inner = self.inner.try_access().ok_or(ENODEV)?;
+        inner.synchronize();
+        Ok(())
+    }
+
+    /// Wait for pending IRQ handlers on other CPUs.
+    pub fn synchronize(&self, dev: &Device<Bound>) -> Result {
+        let inner = self.inner.access(dev)?;
+        inner.synchronize();
+        Ok(())
+    }
+}
+
+/// # Safety
+///
+/// This function should be only used as the callback in `request_irq`.
+unsafe extern "C" fn handle_irq_callback<T: Handler>(_irq: i32, ptr: *mut c_void) -> c_uint {
+    // SAFETY: `ptr` is a pointer to T set in `Registration::new`
+    let handler = unsafe { &*(ptr as *const T) };
+    T::handle(handler) as c_uint
+}

From 135d40523244dcad3c64eb2ce131cf018db5cff4 Mon Sep 17 00:00:00 2001
From: Daniel Almeida
Date: Mon, 11 Aug 2025 13:03:42 -0300
Subject: [PATCH 0107/3206] rust: irq: add support for threaded IRQs and
 handlers

This patch adds support for threaded IRQs and handlers through
irq::ThreadedRegistration and the irq::ThreadedHandler trait.

Threaded interrupts are more permissive in the sense that further
processing is possible in a kthread. This means that said execution
takes place outside of interrupt context, which is rather restrictive
in many ways.

Registering a threaded irq is dependent upon having an IrqRequest that
was previously allocated by a given device. This will be introduced in
subsequent patches.

Tested-by: Joel Fernandes
Tested-by: Dirk Behme
Reviewed-by: Alice Ryhl
Signed-off-by: Daniel Almeida
Link: https://lore.kernel.org/r/20250811-topics-tyr-request_irq2-v9-4-0485dcd9bcbf@collabora.com
[ Add now available intra-doc links back in.
- Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/irq.rs | 5 +- rust/kernel/irq/request.rs | 232 ++++++++++++++++++++++++++++++++++++- 2 files changed, 232 insertions(+), 5 deletions(-) diff --git a/rust/kernel/irq.rs b/rust/kernel/irq.rs index c1019bc36ad1e7..20abd405665596 100644 --- a/rust/kernel/irq.rs +++ b/rust/kernel/irq.rs @@ -18,4 +18,7 @@ mod request; pub use flags::Flags; -pub use request::{Handler, IrqRequest, IrqReturn, Registration}; +pub use request::{ + Handler, IrqRequest, IrqReturn, Registration, ThreadedHandler, ThreadedIrqReturn, + ThreadedRegistration, +}; diff --git a/rust/kernel/irq/request.rs b/rust/kernel/irq/request.rs index c585811cfdfbf7..7a956566f503f5 100644 --- a/rust/kernel/irq/request.rs +++ b/rust/kernel/irq/request.rs @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 // SPDX-FileCopyrightText: Copyright 2025 Collabora ltd. -//! This module provides types like [`Registration`] which allow users to -//! register handlers for a given IRQ line. +//! This module provides types like [`Registration`] and +//! [`ThreadedRegistration`], which allow users to register handlers for a given +//! IRQ line. use core::marker::PhantomPinned; @@ -15,7 +16,7 @@ use crate::prelude::*; use crate::str::CStr; use crate::sync::Arc; -/// The value that can be returned from a [`Handler`] or a `ThreadedHandler`. +/// The value that can be returned from a [`Handler`] or a [`ThreadedHandler`]. #[repr(u32)] pub enum IrqReturn { /// The interrupt was not from this device or was not handled. @@ -34,7 +35,7 @@ pub trait Handler: Sync { /// /// All work that does not necessarily need to be executed from /// interrupt context, should be deferred to a threaded handler. - /// See also `ThreadedRegistration`. + /// See also [`ThreadedRegistration`]. fn handle(&self) -> IrqReturn; } @@ -263,3 +264,226 @@ unsafe extern "C" fn handle_irq_callback(_irq: i32, ptr: *mut c_void let handler = unsafe { &*(ptr as *const T) }; T::handle(handler) as c_uint } + +/// The value that can be returned from [`ThreadedHandler::handle`]. +#[repr(u32)] +pub enum ThreadedIrqReturn { + /// The interrupt was not from this device or was not handled. + None = bindings::irqreturn_IRQ_NONE, + + /// The interrupt was handled by this device. + Handled = bindings::irqreturn_IRQ_HANDLED, + + /// The handler wants the handler thread to wake up. + WakeThread = bindings::irqreturn_IRQ_WAKE_THREAD, +} + +/// Callbacks for a threaded IRQ handler. +pub trait ThreadedHandler: Sync { + /// The hard IRQ handler. + /// + /// This is executed in interrupt context, hence all corresponding + /// limitations do apply. All work that does not necessarily need to be + /// executed from interrupt context, should be deferred to the threaded + /// handler, i.e. [`ThreadedHandler::handle_threaded`]. + /// + /// The default implementation returns [`ThreadedIrqReturn::WakeThread`]. + fn handle(&self) -> ThreadedIrqReturn { + ThreadedIrqReturn::WakeThread + } + + /// The threaded IRQ handler. + /// + /// This is executed in process context. The kernel creates a dedicated + /// `kthread` for this purpose. 
+ fn handle_threaded(&self) -> IrqReturn; +} + +impl ThreadedHandler for Arc { + fn handle(&self) -> ThreadedIrqReturn { + T::handle(self) + } + + fn handle_threaded(&self) -> IrqReturn { + T::handle_threaded(self) + } +} + +impl ThreadedHandler for Box { + fn handle(&self) -> ThreadedIrqReturn { + T::handle(self) + } + + fn handle_threaded(&self) -> IrqReturn { + T::handle_threaded(self) + } +} + +/// A registration of a threaded IRQ handler for a given IRQ line. +/// +/// Two callbacks are required: one to handle the IRQ, and one to handle any +/// other work in a separate thread. +/// +/// The thread handler is only called if the IRQ handler returns +/// [`ThreadedIrqReturn::WakeThread`]. +/// +/// # Examples +/// +/// The following is an example of using [`ThreadedRegistration`]. It uses a +/// [`Mutex`](kernel::sync::Mutex) to provide interior mutability. +/// +/// ``` +/// use kernel::c_str; +/// use kernel::device::Bound; +/// use kernel::irq::{ +/// self, Flags, IrqRequest, IrqReturn, ThreadedHandler, ThreadedIrqReturn, +/// ThreadedRegistration, +/// }; +/// use kernel::prelude::*; +/// use kernel::sync::{Arc, Mutex}; +/// +/// // Declare a struct that will be passed in when the interrupt fires. The u32 +/// // merely serves as an example of some internal data. +/// // +/// // [`irq::ThreadedHandler::handle`] takes `&self`. This example +/// // illustrates how interior mutability can be used when sharing the data +/// // between process context and IRQ context. +/// #[pin_data] +/// struct Data { +/// #[pin] +/// value: Mutex, +/// } +/// +/// impl ThreadedHandler for Data { +/// // This will run (in a separate kthread) if and only if +/// // [`ThreadedHandler::handle`] returns [`WakeThread`], which it does by +/// // default. +/// fn handle_threaded(&self) -> IrqReturn { +/// let mut data = self.value.lock(); +/// *data += 1; +/// IrqReturn::Handled +/// } +/// } +/// +/// // Registers a threaded IRQ handler for the given [`IrqRequest`]. +/// // +/// // This is executing in process context and assumes that `request` was +/// // previously acquired from a device. +/// fn register_threaded_irq( +/// handler: impl PinInit, +/// request: IrqRequest<'_>, +/// ) -> Result>> { +/// let registration = +/// ThreadedRegistration::new(request, Flags::SHARED, c_str!("my_device"), handler); +/// +/// let registration = Arc::pin_init(registration, GFP_KERNEL)?; +/// +/// { +/// // The data can be accessed from process context too. +/// let mut data = registration.handler().value.lock(); +/// *data += 1; +/// } +/// +/// Ok(registration) +/// } +/// # Ok::<(), Error>(()) +/// ``` +/// +/// # Invariants +/// +/// * We own an irq handler using `&T` as its private data. +#[pin_data] +pub struct ThreadedRegistration { + #[pin] + inner: Devres, + + #[pin] + handler: T, + + /// Pinned because we need address stability so that we can pass a pointer + /// to the callback. + #[pin] + _pin: PhantomPinned, +} + +impl ThreadedRegistration { + /// Registers the IRQ handler with the system for the given IRQ number. + pub fn new<'a>( + request: IrqRequest<'a>, + flags: Flags, + name: &'static CStr, + handler: impl PinInit + 'a, + ) -> impl PinInit + 'a { + try_pin_init!(&this in Self { + handler <- handler, + inner <- Devres::new( + request.dev, + try_pin_init!(RegistrationInner { + // SAFETY: `this` is a valid pointer to the `ThreadedRegistration` instance. + cookie: unsafe { &raw mut (*this.as_ptr()).handler }.cast(), + irq: { + // SAFETY: + // - The callbacks are valid for use with request_threaded_irq. 
+                        // - If this succeeds, the slot is guaranteed to be valid until the
+                        //   destructor of Self runs, which will deregister the callbacks
+                        //   before the memory location becomes invalid.
+                        to_result(unsafe {
+                            bindings::request_threaded_irq(
+                                request.irq,
+                                Some(handle_threaded_irq_callback::<T>),
+                                Some(thread_fn_callback::<T>),
+                                flags.into_inner(),
+                                name.as_char_ptr(),
+                                (&raw mut (*this.as_ptr()).handler).cast(),
+                            )
+                        })?;
+                        request.irq
+                    }
+                })
+            ),
+            _pin: PhantomPinned,
+        })
+    }
+
+    /// Returns a reference to the handler that was registered with the system.
+    pub fn handler(&self) -> &T {
+        &self.handler
+    }
+
+    /// Wait for pending IRQ handlers on other CPUs.
+    ///
+    /// This will attempt to access the inner [`Devres`] container.
+    pub fn try_synchronize(&self) -> Result {
+        let inner = self.inner.try_access().ok_or(ENODEV)?;
+        inner.synchronize();
+        Ok(())
+    }
+
+    /// Wait for pending IRQ handlers on other CPUs.
+    pub fn synchronize(&self, dev: &Device<Bound>) -> Result {
+        let inner = self.inner.access(dev)?;
+        inner.synchronize();
+        Ok(())
+    }
+}
+
+/// # Safety
+///
+/// This function should be only used as the callback in `request_threaded_irq`.
+unsafe extern "C" fn handle_threaded_irq_callback<T: ThreadedHandler>(
+    _irq: i32,
+    ptr: *mut c_void,
+) -> c_uint {
+    // SAFETY: `ptr` is a pointer to T set in `ThreadedRegistration::new`
+    let handler = unsafe { &*(ptr as *const T) };
+    T::handle(handler) as c_uint
+}
+
+/// # Safety
+///
+/// This function should be only used as the callback in `request_threaded_irq`.
+unsafe extern "C" fn thread_fn_callback<T: ThreadedHandler>(_irq: i32, ptr: *mut c_void) -> c_uint {
+    // SAFETY: `ptr` is a pointer to T set in `ThreadedRegistration::new`
+    let handler = unsafe { &*(ptr as *const T) };
+    T::handle_threaded(handler) as c_uint
+}

From 17e70f0c549f4a19da7d681d60b552901833f8f3 Mon Sep 17 00:00:00 2001
From: Daniel Almeida
Date: Mon, 11 Aug 2025 13:03:43 -0300
Subject: [PATCH 0108/3206] rust: platform: add irq accessors

These accessors can be used to retrieve an irq::Registration and an
irq::ThreadedRegistration from a platform device by index or name.
Alternatively, drivers can retrieve an IrqRequest from a bound platform
device for later use.

These accessors ensure that only valid IRQ lines can ever be registered.

Reviewed-by: Alice Ryhl
Tested-by: Joel Fernandes
Tested-by: Dirk Behme
Signed-off-by: Daniel Almeida
Link: https://lore.kernel.org/r/20250811-topics-tyr-request_irq2-v9-5-0485dcd9bcbf@collabora.com
[ Remove expect(dead_code) from IrqRequest::new(), re-format macros and
  macro invocations to not exceed 100 characters line length. - Danilo ]
Signed-off-by: Danilo Krummrich
---
 rust/kernel/irq/request.rs |   1 -
 rust/kernel/platform.rs    | 176 +++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+), 1 deletion(-)

diff --git a/rust/kernel/irq/request.rs b/rust/kernel/irq/request.rs
index 7a956566f503f5..4033df7d0dce1d 100644
--- a/rust/kernel/irq/request.rs
+++ b/rust/kernel/irq/request.rs
@@ -111,7 +111,6 @@ impl<'a> IrqRequest<'a> {
     /// # Safety
     ///
     /// - `irq` should be a valid IRQ number for `dev`.
-    #[expect(dead_code)]
     pub(crate) unsafe fn new(dev: &'a Device, irq: u32) -> Self {
         // INVARIANT: `irq` is a valid IRQ number for `dev`.
IrqRequest { dev, irq } diff --git a/rust/kernel/platform.rs b/rust/kernel/platform.rs index 8f028c76f9fa61..ce2bb4d4e88289 100644 --- a/rust/kernel/platform.rs +++ b/rust/kernel/platform.rs @@ -10,6 +10,7 @@ use crate::{ driver, error::{from_result, to_result, Result}, io::{mem::IoRequest, Resource}, + irq::{self, IrqRequest}, of, prelude::*, types::Opaque, @@ -284,6 +285,181 @@ impl Device { } } +macro_rules! define_irq_accessor_by_index { + ( + $(#[$meta:meta])* $fn_name:ident, + $request_fn:ident, + $reg_type:ident, + $handler_trait:ident + ) => { + $(#[$meta])* + pub fn $fn_name<'a, T: irq::$handler_trait + 'static>( + &'a self, + flags: irq::Flags, + index: u32, + name: &'static CStr, + handler: impl PinInit + 'a, + ) -> Result, Error> + 'a> { + let request = self.$request_fn(index)?; + + Ok(irq::$reg_type::::new( + request, + flags, + name, + handler, + )) + } + }; +} + +macro_rules! define_irq_accessor_by_name { + ( + $(#[$meta:meta])* $fn_name:ident, + $request_fn:ident, + $reg_type:ident, + $handler_trait:ident + ) => { + $(#[$meta])* + pub fn $fn_name<'a, T: irq::$handler_trait + 'static>( + &'a self, + flags: irq::Flags, + irq_name: &CStr, + name: &'static CStr, + handler: impl PinInit + 'a, + ) -> Result, Error> + 'a> { + let request = self.$request_fn(irq_name)?; + + Ok(irq::$reg_type::::new( + request, + flags, + name, + handler, + )) + } + }; +} + +impl Device { + /// Returns an [`IrqRequest`] for the IRQ at the given index, if any. + pub fn irq_by_index(&self, index: u32) -> Result> { + // SAFETY: `self.as_raw` returns a valid pointer to a `struct platform_device`. + let irq = unsafe { bindings::platform_get_irq(self.as_raw(), index) }; + + if irq < 0 { + return Err(Error::from_errno(irq)); + } + + // SAFETY: `irq` is guaranteed to be a valid IRQ number for `&self`. + Ok(unsafe { IrqRequest::new(self.as_ref(), irq as u32) }) + } + + /// Returns an [`IrqRequest`] for the IRQ at the given index, but does not + /// print an error if the IRQ cannot be obtained. + pub fn optional_irq_by_index(&self, index: u32) -> Result> { + // SAFETY: `self.as_raw` returns a valid pointer to a `struct platform_device`. + let irq = unsafe { bindings::platform_get_irq_optional(self.as_raw(), index) }; + + if irq < 0 { + return Err(Error::from_errno(irq)); + } + + // SAFETY: `irq` is guaranteed to be a valid IRQ number for `&self`. + Ok(unsafe { IrqRequest::new(self.as_ref(), irq as u32) }) + } + + /// Returns an [`IrqRequest`] for the IRQ with the given name, if any. + pub fn irq_by_name(&self, name: &CStr) -> Result> { + // SAFETY: `self.as_raw` returns a valid pointer to a `struct platform_device`. + let irq = unsafe { bindings::platform_get_irq_byname(self.as_raw(), name.as_char_ptr()) }; + + if irq < 0 { + return Err(Error::from_errno(irq)); + } + + // SAFETY: `irq` is guaranteed to be a valid IRQ number for `&self`. + Ok(unsafe { IrqRequest::new(self.as_ref(), irq as u32) }) + } + + /// Returns an [`IrqRequest`] for the IRQ with the given name, but does not + /// print an error if the IRQ cannot be obtained. + pub fn optional_irq_by_name(&self, name: &CStr) -> Result> { + // SAFETY: `self.as_raw` returns a valid pointer to a `struct platform_device`. + let irq = unsafe { + bindings::platform_get_irq_byname_optional(self.as_raw(), name.as_char_ptr()) + }; + + if irq < 0 { + return Err(Error::from_errno(irq)); + } + + // SAFETY: `irq` is guaranteed to be a valid IRQ number for `&self`. 
+        Ok(unsafe { IrqRequest::new(self.as_ref(), irq as u32) })
+    }
+
+    define_irq_accessor_by_index!(
+        /// Returns an [`irq::Registration`] for the IRQ at the given index.
+        request_irq_by_index,
+        irq_by_index,
+        Registration,
+        Handler
+    );
+    define_irq_accessor_by_name!(
+        /// Returns an [`irq::Registration`] for the IRQ with the given name.
+        request_irq_by_name,
+        irq_by_name,
+        Registration,
+        Handler
+    );
+    define_irq_accessor_by_index!(
+        /// Does the same as [`Self::request_irq_by_index`], except that it does
+        /// not print an error message if the IRQ cannot be obtained.
+        request_optional_irq_by_index,
+        optional_irq_by_index,
+        Registration,
+        Handler
+    );
+    define_irq_accessor_by_name!(
+        /// Does the same as [`Self::request_irq_by_name`], except that it does
+        /// not print an error message if the IRQ cannot be obtained.
+        request_optional_irq_by_name,
+        optional_irq_by_name,
+        Registration,
+        Handler
+    );
+
+    define_irq_accessor_by_index!(
+        /// Returns an [`irq::ThreadedRegistration`] for the IRQ at the given index.
+        request_threaded_irq_by_index,
+        irq_by_index,
+        ThreadedRegistration,
+        ThreadedHandler
+    );
+    define_irq_accessor_by_name!(
+        /// Returns an [`irq::ThreadedRegistration`] for the IRQ with the given name.
+        request_threaded_irq_by_name,
+        irq_by_name,
+        ThreadedRegistration,
+        ThreadedHandler
+    );
+    define_irq_accessor_by_index!(
+        /// Does the same as [`Self::request_threaded_irq_by_index`], except
+        /// that it does not print an error message if the IRQ cannot be
+        /// obtained.
+        request_optional_threaded_irq_by_index,
+        optional_irq_by_index,
+        ThreadedRegistration,
+        ThreadedHandler
+    );
+    define_irq_accessor_by_name!(
+        /// Does the same as [`Self::request_threaded_irq_by_name`], except that
+        /// it does not print an error message if the IRQ cannot be obtained.
+        request_optional_threaded_irq_by_name,
+        optional_irq_by_name,
+        ThreadedRegistration,
+        ThreadedHandler
+    );
+}
+
 // SAFETY: `Device` is a transparent wrapper of a type that doesn't depend on `Device`'s generic
 // argument.
 kernel::impl_device_context_deref!(unsafe { Device });

From 9b6d4fb9804febb1ae75e7259bb475cea58e28a7 Mon Sep 17 00:00:00 2001
From: Daniel Almeida
Date: Mon, 11 Aug 2025 13:03:44 -0300
Subject: [PATCH 0109/3206] rust: pci: add irq accessors

These accessors can be used to retrieve an irq::Registration or an
irq::ThreadedRegistration from a PCI device. Alternatively, drivers can
retrieve an IrqRequest from a bound PCI device for later use.

These accessors ensure that only valid IRQ lines can ever be registered.
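For illustration, a driver might use the new accessor from probe context
roughly like this (a minimal sketch, not part of the patch: `MyHandler`,
`setup()`, the vector index and the names are hypothetical, and the handler
signature is the pre-&Device one introduced in the previous patches):

  use kernel::c_str;
  use kernel::device::Bound;
  use kernel::irq::{self, IrqReturn};
  use kernel::pci;
  use kernel::prelude::*;
  use kernel::sync::Arc;

  struct MyHandler; // hypothetical driver state shared with the handler

  impl irq::Handler for MyHandler {
      fn handle(&self) -> IrqReturn {
          // Runs in IRQ context: acknowledge/handle the device interrupt.
          IrqReturn::Handled
      }
  }

  // Assumed to be called with a PCI device that is still bound.
  fn setup(pdev: &pci::Device<Bound>) -> Result<Arc<irq::Registration<MyHandler>>> {
      // The vector index is validated via pci_irq_vector() before the
      // handler is registered; the registration is Devres-managed, so it
      // is torn down automatically when the device is unbound.
      let init = pdev.request_irq(0, irq::Flags::SHARED, c_str!("my_pci_drv"), MyHandler)?;
      Arc::pin_init(init, GFP_KERNEL)
  }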
Reviewed-by: Alice Ryhl Tested-by: Joel Fernandes Tested-by: Dirk Behme Signed-off-by: Daniel Almeida Link: https://lore.kernel.org/r/20250811-topics-tyr-request_irq2-v9-6-0485dcd9bcbf@collabora.com Signed-off-by: Danilo Krummrich --- rust/helpers/pci.c | 8 ++++++++ rust/kernel/pci.rs | 45 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/rust/helpers/pci.c b/rust/helpers/pci.c index ef9cb38c81a6a5..5bf56004478c69 100644 --- a/rust/helpers/pci.c +++ b/rust/helpers/pci.c @@ -11,3 +11,11 @@ bool rust_helper_dev_is_pci(const struct device *dev) { return dev_is_pci(dev); } + +#ifndef CONFIG_PCI_MSI +int rust_helper_pci_irq_vector(struct pci_dev *pdev, unsigned int nvec) +{ + return pci_irq_vector(pdev, nvec); +} + +#endif diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 887ee611b55310..3bf1737635a9e2 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -10,8 +10,8 @@ use crate::{ devres::Devres, driver, error::{from_result, to_result, Result}, - io::Io, - io::IoRaw, + io::{Io, IoRaw}, + irq::{self, IrqRequest}, str::CStr, types::{ARef, Opaque}, ThisModule, @@ -431,6 +431,47 @@ impl Device { ) -> impl PinInit, Error> + 'a { self.iomap_region_sized::<0>(bar, name) } + + /// Returns an [`IrqRequest`] for the IRQ vector at the given index, if any. + pub fn irq_vector(&self, index: u32) -> Result> { + // SAFETY: `self.as_raw` returns a valid pointer to a `struct pci_dev`. + let irq = unsafe { crate::bindings::pci_irq_vector(self.as_raw(), index) }; + if irq < 0 { + return Err(crate::error::Error::from_errno(irq)); + } + // SAFETY: `irq` is guaranteed to be a valid IRQ number for `&self`. + Ok(unsafe { IrqRequest::new(self.as_ref(), irq as u32) }) + } + + /// Returns a [`kernel::irq::Registration`] for the IRQ vector at the given + /// index. + pub fn request_irq<'a, T: crate::irq::Handler + 'static>( + &'a self, + index: u32, + flags: irq::Flags, + name: &'static CStr, + handler: impl PinInit + 'a, + ) -> Result, Error> + 'a> { + let request = self.irq_vector(index)?; + + Ok(irq::Registration::::new(request, flags, name, handler)) + } + + /// Returns a [`kernel::irq::ThreadedRegistration`] for the IRQ vector at + /// the given index. + pub fn request_threaded_irq<'a, T: crate::irq::ThreadedHandler + 'static>( + &'a self, + index: u32, + flags: irq::Flags, + name: &'static CStr, + handler: impl PinInit + 'a, + ) -> Result, Error> + 'a> { + let request = self.irq_vector(index)?; + + Ok(irq::ThreadedRegistration::::new( + request, flags, name, handler, + )) + } } impl Device { From 29e16fcd67ee5b1d0417a657294cf96fdf2f8df9 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Mon, 11 Aug 2025 13:03:45 -0300 Subject: [PATCH 0110/3206] rust: irq: add &Device argument to irq callbacks When working with a bus device, many operations are only possible while the device is still bound. The &Device type represents a proof in the type system that you are in a scope where the device is guaranteed to still be bound. Since we deregister irq callbacks when unbinding a device, if an irq callback is running, that implies that the device has not yet been unbound. To allow drivers to take advantage of that, add an additional argument to irq callbacks. 
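As a sketch of the resulting handler shape (the handler type is
hypothetical; the `&Device<Bound>` parameter is the proof-of-bound
reference described above):

  use kernel::device::{Bound, Device};
  use kernel::irq::{self, IrqReturn};

  struct MyData; // hypothetical state shared with the handler

  impl irq::Handler for MyData {
      // `dev` proves the device is still bound while the handler runs:
      // the irq callbacks are deregistered before unbind completes, so
      // bound-only operations (e.g. Devres accesses) are safe from here.
      fn handle(&self, dev: &Device<Bound>) -> IrqReturn {
          let _ = dev;
          IrqReturn::Handled
      }
  }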
Signed-off-by: Alice Ryhl Reviewed-by: Boqun Feng Signed-off-by: Daniel Almeida Link: https://lore.kernel.org/r/20250811-topics-tyr-request_irq2-v9-7-0485dcd9bcbf@collabora.com Signed-off-by: Danilo Krummrich --- rust/kernel/irq/request.rs | 95 +++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 38 deletions(-) diff --git a/rust/kernel/irq/request.rs b/rust/kernel/irq/request.rs index 4033df7d0dce1d..b150563fdef809 100644 --- a/rust/kernel/irq/request.rs +++ b/rust/kernel/irq/request.rs @@ -36,18 +36,18 @@ pub trait Handler: Sync { /// All work that does not necessarily need to be executed from /// interrupt context, should be deferred to a threaded handler. /// See also [`ThreadedRegistration`]. - fn handle(&self) -> IrqReturn; + fn handle(&self, device: &Device) -> IrqReturn; } impl Handler for Arc { - fn handle(&self) -> IrqReturn { - T::handle(self) + fn handle(&self, device: &Device) -> IrqReturn { + T::handle(self, device) } } impl Handler for Box { - fn handle(&self) -> IrqReturn { - T::handle(self) + fn handle(&self, device: &Device) -> IrqReturn { + T::handle(self, device) } } @@ -140,7 +140,7 @@ impl<'a> IrqRequest<'a> { /// /// ``` /// use kernel::c_str; -/// use kernel::device::Bound; +/// use kernel::device::{Bound, Device}; /// use kernel::irq::{self, Flags, IrqRequest, IrqReturn, Registration}; /// use kernel::prelude::*; /// use kernel::sync::{Arc, Completion}; @@ -154,7 +154,7 @@ impl<'a> IrqRequest<'a> { /// /// impl irq::Handler for Data { /// // Executed in IRQ context. -/// fn handle(&self) -> IrqReturn { +/// fn handle(&self, _dev: &Device) -> IrqReturn { /// self.completion.complete_all(); /// IrqReturn::Handled /// } @@ -180,7 +180,7 @@ impl<'a> IrqRequest<'a> { /// /// # Invariants /// -/// * We own an irq handler using `&self.handler` as its private data. +/// * We own an irq handler whose cookie is a pointer to `Self`. #[pin_data] pub struct Registration { #[pin] @@ -208,21 +208,24 @@ impl Registration { inner <- Devres::new( request.dev, try_pin_init!(RegistrationInner { - // SAFETY: `this` is a valid pointer to the `Registration` instance - cookie: unsafe { &raw mut (*this.as_ptr()).handler }.cast(), + // INVARIANT: `this` is a valid pointer to the `Registration` instance + cookie: this.as_ptr().cast::(), irq: { // SAFETY: // - The callbacks are valid for use with request_irq. // - If this succeeds, the slot is guaranteed to be valid until the // destructor of Self runs, which will deregister the callbacks // before the memory location becomes invalid. + // - When request_irq is called, everything that handle_irq_callback will + // touch has already been initialized, so it's safe for the callback to + // be called immediately. to_result(unsafe { bindings::request_irq( request.irq, Some(handle_irq_callback::), flags.into_inner(), name.as_char_ptr(), - (&raw mut (*this.as_ptr()).handler).cast(), + this.as_ptr().cast::(), ) })?; request.irq @@ -259,9 +262,13 @@ impl Registration { /// /// This function should be only used as the callback in `request_irq`. 
unsafe extern "C" fn handle_irq_callback(_irq: i32, ptr: *mut c_void) -> c_uint { - // SAFETY: `ptr` is a pointer to T set in `Registration::new` - let handler = unsafe { &*(ptr as *const T) }; - T::handle(handler) as c_uint + // SAFETY: `ptr` is a pointer to `Registration` set in `Registration::new` + let registration = unsafe { &*(ptr as *const Registration) }; + // SAFETY: The irq callback is removed before the device is unbound, so the fact that the irq + // callback is running implies that the device has not yet been unbound. + let device = unsafe { registration.inner.device().as_bound() }; + + T::handle(®istration.handler, device) as c_uint } /// The value that can be returned from [`ThreadedHandler::handle`]. @@ -287,7 +294,8 @@ pub trait ThreadedHandler: Sync { /// handler, i.e. [`ThreadedHandler::handle_threaded`]. /// /// The default implementation returns [`ThreadedIrqReturn::WakeThread`]. - fn handle(&self) -> ThreadedIrqReturn { + #[expect(unused_variables)] + fn handle(&self, device: &Device) -> ThreadedIrqReturn { ThreadedIrqReturn::WakeThread } @@ -295,26 +303,26 @@ pub trait ThreadedHandler: Sync { /// /// This is executed in process context. The kernel creates a dedicated /// `kthread` for this purpose. - fn handle_threaded(&self) -> IrqReturn; + fn handle_threaded(&self, device: &Device) -> IrqReturn; } impl ThreadedHandler for Arc { - fn handle(&self) -> ThreadedIrqReturn { - T::handle(self) + fn handle(&self, device: &Device) -> ThreadedIrqReturn { + T::handle(self, device) } - fn handle_threaded(&self) -> IrqReturn { - T::handle_threaded(self) + fn handle_threaded(&self, device: &Device) -> IrqReturn { + T::handle_threaded(self, device) } } impl ThreadedHandler for Box { - fn handle(&self) -> ThreadedIrqReturn { - T::handle(self) + fn handle(&self, device: &Device) -> ThreadedIrqReturn { + T::handle(self, device) } - fn handle_threaded(&self) -> IrqReturn { - T::handle_threaded(self) + fn handle_threaded(&self, device: &Device) -> IrqReturn { + T::handle_threaded(self, device) } } @@ -333,7 +341,7 @@ impl ThreadedHandler for Box { /// /// ``` /// use kernel::c_str; -/// use kernel::device::Bound; +/// use kernel::device::{Bound, Device}; /// use kernel::irq::{ /// self, Flags, IrqRequest, IrqReturn, ThreadedHandler, ThreadedIrqReturn, /// ThreadedRegistration, @@ -357,7 +365,7 @@ impl ThreadedHandler for Box { /// // This will run (in a separate kthread) if and only if /// // [`ThreadedHandler::handle`] returns [`WakeThread`], which it does by /// // default. -/// fn handle_threaded(&self) -> IrqReturn { +/// fn handle_threaded(&self, _dev: &Device) -> IrqReturn { /// let mut data = self.value.lock(); /// *data += 1; /// IrqReturn::Handled @@ -390,7 +398,7 @@ impl ThreadedHandler for Box { /// /// # Invariants /// -/// * We own an irq handler using `&T` as its private data. +/// * We own an irq handler whose cookie is a pointer to `Self`. #[pin_data] pub struct ThreadedRegistration { #[pin] @@ -418,14 +426,17 @@ impl ThreadedRegistration { inner <- Devres::new( request.dev, try_pin_init!(RegistrationInner { - // SAFETY: `this` is a valid pointer to the `ThreadedRegistration` instance. - cookie: unsafe { &raw mut (*this.as_ptr()).handler }.cast(), + // INVARIANT: `this` is a valid pointer to the `ThreadedRegistration` instance. + cookie: this.as_ptr().cast::(), irq: { // SAFETY: // - The callbacks are valid for use with request_threaded_irq. 
// - If this succeeds, the slot is guaranteed to be valid until the - // destructor of Self runs, which will deregister the callbacks - // before the memory location becomes invalid. + // destructor of Self runs, which will deregister the callbacks + // before the memory location becomes invalid. + // - When request_threaded_irq is called, everything that the two callbacks + // will touch has already been initialized, so it's safe for the + // callbacks to be called immediately. to_result(unsafe { bindings::request_threaded_irq( request.irq, @@ -433,7 +444,7 @@ impl ThreadedRegistration { Some(thread_fn_callback::), flags.into_inner(), name.as_char_ptr(), - (&raw mut (*this.as_ptr()).handler).cast(), + this.as_ptr().cast::(), ) })?; request.irq @@ -473,16 +484,24 @@ unsafe extern "C" fn handle_threaded_irq_callback( _irq: i32, ptr: *mut c_void, ) -> c_uint { - // SAFETY: `ptr` is a pointer to T set in `ThreadedRegistration::new` - let handler = unsafe { &*(ptr as *const T) }; - T::handle(handler) as c_uint + // SAFETY: `ptr` is a pointer to `ThreadedRegistration` set in `ThreadedRegistration::new` + let registration = unsafe { &*(ptr as *const ThreadedRegistration) }; + // SAFETY: The irq callback is removed before the device is unbound, so the fact that the irq + // callback is running implies that the device has not yet been unbound. + let device = unsafe { registration.inner.device().as_bound() }; + + T::handle(®istration.handler, device) as c_uint } /// # Safety /// /// This function should be only used as the callback in `request_threaded_irq`. unsafe extern "C" fn thread_fn_callback(_irq: i32, ptr: *mut c_void) -> c_uint { - // SAFETY: `ptr` is a pointer to T set in `ThreadedRegistration::new` - let handler = unsafe { &*(ptr as *const T) }; - T::handle_threaded(handler) as c_uint + // SAFETY: `ptr` is a pointer to `ThreadedRegistration` set in `ThreadedRegistration::new` + let registration = unsafe { &*(ptr as *const ThreadedRegistration) }; + // SAFETY: The irq callback is removed before the device is unbound, so the fact that the irq + // callback is running implies that the device has not yet been unbound. + let device = unsafe { registration.inner.device().as_bound() }; + + T::handle_threaded(®istration.handler, device) as c_uint } From 377c2b3c46c6ff33cb6559b98c44e22f1ff9fefc Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 25 Jul 2025 22:28:40 +0200 Subject: [PATCH 0111/3206] MAINTAINERS: add "DEVICE I/O & IRQ [RUST]" entry This entry will handle device I/O patches and abstractions (such as memory-mapped IO and system resources series [1]), as well as IRQ ones (such as the `request_irq` series [2]). Patches will flow through driver-core, at least for the time being. Danilo, Alice and Daniel will maintain it. 
Cc: Danilo Krummrich Cc: Alice Ryhl Cc: Daniel Almeida Link: https://lore.kernel.org/rust-for-linux/20250717-topics-tyr-platform_iomem-v15-0-beca780b77e3@collabora.com/ [1] Link: https://lore.kernel.org/rust-for-linux/20250715-topics-tyr-request_irq2-v7-0-d469c0f37c07@collabora.com/ [2] Signed-off-by: Miguel Ojeda Acked-by: Daniel Almeida Acked-by: Alice Ryhl Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250725202840.2251768-1-ojeda@kernel.org Signed-off-by: Danilo Krummrich --- MAINTAINERS | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..38b90f3a04dd90 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7021,6 +7021,21 @@ F: drivers/devfreq/event/ F: include/dt-bindings/pmu/exynos_ppmu.h F: include/linux/devfreq-event.h +DEVICE I/O & IRQ [RUST] +M: Danilo Krummrich +M: Alice Ryhl +M: Daniel Almeida +L: rust-for-linux@vger.kernel.org +S: Supported +W: https://rust-for-linux.com +B: https://github.com/Rust-for-Linux/linux/issues +C: https://rust-for-linux.zulipchat.com +T: git git://git.kernel.org/pub/scm/linux/kernel/git/driver-core/driver-core.git +F: rust/kernel/io.rs +F: rust/kernel/io/ +F: rust/kernel/irq.rs +F: rust/kernel/irq/ + DEVICE RESOURCE MANAGEMENT HELPERS M: Hans de Goede R: Matti Vaittinen From 8a013ec9cb7af9921656c0e78c73510c9e4a0cc1 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 12 Aug 2025 13:50:35 +0200 Subject: [PATCH 0112/3206] cgroup: Replace deprecated strcpy() with strscpy() strcpy() is deprecated; use strscpy() instead. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Thorsten Blum Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 2a4a387f867abc..763343fbd5a174 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc) if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, ctx->release_agent); + strscpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } From 3e2b799008a78c21c648328bed9f566335f0394e Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Mon, 4 Aug 2025 20:27:30 +0800 Subject: [PATCH 0113/3206] bpf: Remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes. 
Signed-off-by: Qianfeng Rong Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250804122731.460158-1-rongqianfeng@vivo.com --- kernel/bpf/devmap.c | 2 +- kernel/bpf/local_storage.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 482d284a155386..2625601de76e95 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -865,7 +865,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), - GFP_NOWAIT | __GFP_NOWARN, + GFP_NOWAIT, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 632d51b05fe983..c93a756e035c02 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -165,7 +165,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key, } new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), - __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN, + __GFP_ZERO | GFP_NOWAIT, map->numa_node); if (!new) return -ENOMEM; From c93c59baa5ab57e94b874000cec56e26611b7a23 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 11 Aug 2025 20:58:20 +0200 Subject: [PATCH 0114/3206] bpf: Tidy verifier bug message Yonghong noticed that error messages for potential verifier bugs often have a '(1)' at the end. This is happening because verifier_bug_if(cond, env, fmt, args...) prints "(" #cond ")\n" as part of the message and verifier_bug() is defined as: #define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args) Hence, verifier_bug() always ends up displaying '(1)'. This small patch fixes it by having verifier_bug_if conditionally call verifier_bug instead of the other way around. Fixes: 1cb0f56d9618 ("bpf: WARN_ONCE on verifier bugs") Reported-by: Yonghong Song Signed-off-by: Paul Chaignon Signed-off-by: Andrii Nakryiko Tested-by: Eduard Zingerman Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/aJo9THBrzo8jFXsh@mail.gmail.com --- include/linux/bpf_verifier.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c823f8efe3edd2..020de62bd09cdd 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -875,13 +875,15 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, #define verifier_bug_if(cond, env, fmt, args...) \ ({ \ bool __cond = (cond); \ - if (unlikely(__cond)) { \ - BPF_WARN_ONCE(1, "verifier bug: " fmt "(" #cond ")\n", ##args); \ - bpf_log(&env->log, "verifier bug: " fmt "(" #cond ")\n", ##args); \ - } \ + if (unlikely(__cond)) \ + verifier_bug(env, fmt " (" #cond ")", ##args); \ (__cond); \ }) -#define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args) +#define verifier_bug(env, fmt, args...) \ + ({ \ + BPF_WARN_ONCE(1, "verifier bug: " fmt "\n", ##args); \ + bpf_log(&env->log, "verifier bug: " fmt "\n", ##args); \ + }) static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { From bf0c2a84df9fb0f7779eb24c30198ef93f292e66 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Mon, 11 Aug 2025 20:39:49 +0800 Subject: [PATCH 0115/3206] bpf: Replace kvfree with kfree for kzalloc memory The 'backedge' pointer is allocated with kzalloc(), which returns physically contiguous memory. Using kvfree() to deallocate such memory is functionally safe but semantically incorrect. 
Replace kvfree() with kfree() to avoid unnecessary is_vmalloc_addr() check in kvfree(). Signed-off-by: Qianfeng Rong Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20250811123949.552885-1-rongqianfeng@vivo.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c2e7f36a09c91..3a3982fe20d48a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19553,7 +19553,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) err = err ?: add_scc_backedge(env, &sl->state, backedge); if (err) { free_verifier_state(&backedge->state, false); - kvfree(backedge); + kfree(backedge); return err; } } From 148547000cfc1ba8cec02857268333d08724b9cc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 13 Aug 2025 08:38:27 +0300 Subject: [PATCH 0116/3206] gpio: aggregator: Fix off by one in gpiochip_fwd_desc_add() The "> chip->ngpio" comparison here needs to be ">= chip->ngpio", otherwise it leads to an out of bounds access. The fwd->valid_mask bitmap only has chip->ngpio bits and the fwd->descs[] array has that same number of elements. These values are set in devm_gpiochip_fwd_alloc(). Fixes: c44ce91b8ada ("gpio: aggregator: refactor the code to add GPIO desc in the forwarder") Signed-off-by: Dan Carpenter Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/aJwk0yBSCccGCjX3@stanley.mountain Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index 0ef6556f98b131..37600faf4a4b72 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -744,7 +744,7 @@ int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, { struct gpio_chip *chip = &fwd->chip; - if (offset > chip->ngpio) + if (offset >= chip->ngpio) return -EINVAL; if (test_and_set_bit(offset, fwd->valid_mask)) From 8abbbbb588f1f1bf95ae56c1531a17520ce487e2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 11 Aug 2025 22:27:22 +0900 Subject: [PATCH 0117/3206] platform/chrome: cros_ec: Avoid -Wflex-array-member-not-at-end warning -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Use the new TRAILING_OVERLAP() helper to fix the following warning: drivers/platform/chrome/cros_ec.c:106:40: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] This helper creates a union between a flexible-array member (FAM) and a set of members that would otherwise follow it. This overlays the trailing members onto the FAM while preserving the original memory layout. Signed-off-by: Gustavo A. R. 
Silva Link: https://lore.kernel.org/r/aJnvuv334M7TljoB@kspp Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c index fd58781a2fb7da..b1730362e399c7 100644 --- a/drivers/platform/chrome/cros_ec.c +++ b/drivers/platform/chrome/cros_ec.c @@ -102,14 +102,13 @@ EXPORT_SYMBOL(cros_ec_irq_thread); static int cros_ec_sleep_event(struct cros_ec_device *ec_dev, u8 sleep_event) { int ret; - struct { - struct cros_ec_command msg; + TRAILING_OVERLAP(struct cros_ec_command, msg, data, union { struct ec_params_host_sleep_event req0; struct ec_params_host_sleep_event_v1 req1; struct ec_response_host_sleep_event_v1 resp1; } u; - } __packed buf; + ) __packed buf; memset(&buf, 0, sizeof(buf)); From f7439a723e5aa5b35c76355e1b9b2cd1108f656e Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Wed, 13 Aug 2025 17:48:55 +0800 Subject: [PATCH 0118/3206] platform/chrome: wilco_ec: Remove redundant semicolons Remove unnecessary semicolons. Signed-off-by: Liao Yuanhong Link: https://lore.kernel.org/r/20250813094858.557742-1-liaoyuanhong@vivo.com Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/wilco_ec/telemetry.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/chrome/wilco_ec/telemetry.c b/drivers/platform/chrome/wilco_ec/telemetry.c index 7d8ae2cbf72f9e..b18043e31ae4fa 100644 --- a/drivers/platform/chrome/wilco_ec/telemetry.c +++ b/drivers/platform/chrome/wilco_ec/telemetry.c @@ -388,7 +388,7 @@ static int telem_device_probe(struct platform_device *pdev) dev_set_name(&dev_data->dev, TELEM_DEV_NAME_FMT, minor); device_initialize(&dev_data->dev); - /* Initialize the character device and add it to userspace */; + /* Initialize the character device and add it to userspace */ cdev_init(&dev_data->cdev, &telem_fops); error = cdev_device_add(&dev_data->cdev, &dev_data->dev); if (error) { From d0de6895095356f1cc2d5f9825df220f10d34cee Mon Sep 17 00:00:00 2001 From: Jeff Chang Date: Wed, 13 Aug 2025 10:08:50 +0800 Subject: [PATCH 0119/3206] regulator: dt-bindings: Add Richtek RT5133 Support Add bindings for Richtek RT5133 IC Controlled PMIC Signed-off-by: Jeff Chang Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250813020910.2977555-1-jeff_chang@richtek.com Signed-off-by: Mark Brown --- .../bindings/regulator/richtek,rt5133.yaml | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 Documentation/devicetree/bindings/regulator/richtek,rt5133.yaml diff --git a/Documentation/devicetree/bindings/regulator/richtek,rt5133.yaml b/Documentation/devicetree/bindings/regulator/richtek,rt5133.yaml new file mode 100644 index 00000000000000..d2e007fee6ba1f --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/richtek,rt5133.yaml @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/richtek,rt5133.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Richtek RT5133 PMIC Regulator + +maintainers: + - ShihChia Chang + +description: + The RT5133 is an integrated Power Management IC for portable devices, + featuring 8 LDOs and 3 GPOs. It allows programmable output voltages, + soft-start times, and protections via I2C. GPO operation depends on LDO1 + voltage. 
+ +properties: + compatible: + enum: + - richtek,rt5133 + + reg: + maxItems: 1 + + enable-gpios: + maxItems: 1 + + wakeup-source: true + + interrupts: + maxItems: 1 + + gpio-controller: true + + "#gpio-cells": + const: 2 + + richtek,oc-shutdown-all: + type: boolean + description: + Controls the behavior when any LDO (Low Dropout Regulator) enters an + Over Current state. + If set to true, all LDO channels will be shut down. + If set to false, only the affected LDO channel will shut down itself. + + richtek,pgb-shutdown-all: + type: boolean + description: + Controls the behavior when any LDO enters a Power Good Bad state. + If set to true, all LDO channels will be shut down. + If set to false, only the affected LDO channel will shut down itself. + + regulators: + type: object + additionalProperties: false + + properties: + base: + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + description: + Properties for the base regulator, which is the top-level supply for + LDO1 to LDO6. It functions merely as an on/off switch rather than + regulating voltages. If none of LDO1 to LDO6 are in use, switching + off the base will reduce the quiescent current. + + required: + - regulator-name + + patternProperties: + "^ldo([1-6])$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + description: + Properties for single LDO regulator + + required: + - regulator-name + + "^ldo([7-8])$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + description: + Properties for single LDO regulator + + properties: + vin-supply: true + + required: + - regulator-name + - vin-supply + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + #include + #include + i2c { + #address-cells = <1>; + #size-cells = <0>; + + pmic@18 { + compatible = "richtek,rt5133"; + reg = <0x18>; + wakeup-source; + interrupts-extended = <&gpio 0 IRQ_TYPE_EDGE_FALLING>; + enable-gpios = <&gpio 2 GPIO_ACTIVE_HIGH>; + gpio-controller; + #gpio-cells = <2>; + richtek,oc-shutdown-all; + richtek,pgb-shutdown-all; + regulators { + base { + regulator-name = "base"; + }; + pvin78: ldo1 { + regulator-name = "ldo1"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3199998>; + regulator-active-discharge = <1>; + }; + ldo2 { + regulator-name = "ldo2"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3200000>; + regulator-active-discharge = <1>; + }; + ldo3 { + regulator-name = "ldo3"; + regulator-min-microvolt = <1700000>; + regulator-max-microvolt = <3000000>; + regulator-active-discharge = <1>; + }; + ldo4 { + regulator-name = "ldo4"; + regulator-min-microvolt = <1700000>; + regulator-max-microvolt = <3000000>; + regulator-active-discharge = <1>; + }; + ldo5 { + regulator-name = "ldo5"; + regulator-min-microvolt = <1700000>; + regulator-max-microvolt = <3000000>; + regulator-active-discharge = <1>; + }; + ldo6 { + regulator-name = "ldo6"; + regulator-min-microvolt = <1700000>; + regulator-max-microvolt = <3000000>; + regulator-active-discharge = <1>; + }; + ldo7 { + regulator-name = "ldo7"; + regulator-min-microvolt = <900000>; + regulator-max-microvolt = <1200000>; + regulator-active-discharge = <1>; + vin-supply = <&pvin78>; + }; + ldo8 { + regulator-name = "ldo8"; + regulator-min-microvolt = <855000>; + regulator-max-microvolt = <1200000>; + regulator-active-discharge = <1>; + vin-supply = <&pvin78>; + }; + }; + }; + }; From 714165e1c4b0d5b8c6d095fe07f65e6e7047aaeb Mon Sep 17 00:00:00 2001 From: Jeff Chang Date: 
Wed, 13 Aug 2025 10:08:51 +0800 Subject: [PATCH 0120/3206] regulator: rt5133: Add RT5133 PMIC regulator Support RT5133 is a highly-integrated chip. It includes 8 LDOs and 3 GPOs that can be used to drive output high/low purpose. The dependency of the GPO block is internally LDO1 Voltage. Signed-off-by: Jeff Chang Link: https://patch.msgid.link/20250813020910.2977555-2-jeff_chang@richtek.com Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 12 + drivers/regulator/Makefile | 1 + drivers/regulator/rt5133-regulator.c | 642 +++++++++++++++++++++++++++ 3 files changed, 655 insertions(+) create mode 100644 drivers/regulator/rt5133-regulator.c diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index e511174be7bf54..223a500370d093 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1248,6 +1248,18 @@ config REGULATOR_RT5120 600mV to 1395mV, per step 6.250mV. The others are all fixed voltage by external hardware circuit. +config REGULATOR_RT5133 + tristate "Richtek RT5133 PMIC Regulators" + depends on I2C && GPIOLIB && OF + select REGMAP + select CRC8 + select OF_GPIO + help + This driver adds support for RT5133 PMIC regulators. + RT5133 is an integrated chip. It includes 8 LDOs and 3 GPOs that + can be used to drive output high/low purpose. The dependency of the + GPO block is internally LDO1 Voltage. + config REGULATOR_RT5190A tristate "Richtek RT5190A PMIC" depends on I2C diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 80f820ed22a323..54e67e1c0c676a 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -147,6 +147,7 @@ obj-$(CONFIG_REGULATOR_RT4803) += rt4803.o obj-$(CONFIG_REGULATOR_RT4831) += rt4831-regulator.o obj-$(CONFIG_REGULATOR_RT5033) += rt5033-regulator.o obj-$(CONFIG_REGULATOR_RT5120) += rt5120-regulator.o +obj-$(CONFIG_REGULATOR_RT5133) += rt5133-regulator.o obj-$(CONFIG_REGULATOR_RT5190A) += rt5190a-regulator.o obj-$(CONFIG_REGULATOR_RT5739) += rt5739.o obj-$(CONFIG_REGULATOR_RT5759) += rt5759-regulator.o diff --git a/drivers/regulator/rt5133-regulator.c b/drivers/regulator/rt5133-regulator.c new file mode 100644 index 00000000000000..d0f367381fbbde --- /dev/null +++ b/drivers/regulator/rt5133-regulator.c @@ -0,0 +1,642 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Richtek Technology Corp. 
+// Author: ChiYuan Huang +// Author: ShihChia Chang + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RT5133_REG_CHIP_INFO 0x00 +#define RT5133_REG_RST_CTRL 0x06 +#define RT5133_REG_BASE_CTRL 0x09 +#define RT5133_REG_GPIO_CTRL 0x0B +#define RT5133_REG_BASE_EVT 0x10 +#define RT5133_REG_LDO_PGB_STAT 0x15 +#define RT5133_REG_BASE_MASK 0x16 +#define RT5133_REG_LDO_SHDN 0x19 +#define RT5133_REG_LDO_ON 0x1A +#define RT5133_REG_LDO_OFF 0x1B +#define RT5133_REG_LDO1_CTRL1 0x20 +#define RT5133_REG_LDO1_CTRL2 0x21 +#define RT5133_REG_LDO1_CTRL3 0x22 +#define RT5133_REG_LDO2_CTRL1 0x24 +#define RT5133_REG_LDO2_CTRL2 0x25 +#define RT5133_REG_LDO2_CTRL3 0x26 +#define RT5133_REG_LDO3_CTRL1 0x28 +#define RT5133_REG_LDO3_CTRL2 0x29 +#define RT5133_REG_LDO3_CTRL3 0x2A +#define RT5133_REG_LDO4_CTRL1 0x2C +#define RT5133_REG_LDO4_CTRL2 0x2D +#define RT5133_REG_LDO4_CTRL3 0x2E +#define RT5133_REG_LDO5_CTRL1 0x30 +#define RT5133_REG_LDO5_CTRL2 0x31 +#define RT5133_REG_LDO5_CTRL3 0x32 +#define RT5133_REG_LDO6_CTRL1 0x34 +#define RT5133_REG_LDO6_CTRL2 0x35 +#define RT5133_REG_LDO6_CTRL3 0x36 +#define RT5133_REG_LDO7_CTRL1 0x38 +#define RT5133_REG_LDO7_CTRL2 0x39 +#define RT5133_REG_LDO7_CTRL3 0x3A +#define RT5133_REG_LDO8_CTRL1 0x3C +#define RT5133_REG_LDO8_CTRL2 0x3D +#define RT5133_REG_LDO8_CTRL3 0x3E +#define RT5133_REG_LDO8_CTRL4 0x3F + +#define RT5133_LDO_REG_BASE(_id) (0x20 + ((_id) - 1) * 4) + +#define RT5133_VENDOR_ID_MASK GENMASK(7, 4) +#define RT5133_RESET_CODE 0xB1 + +#define RT5133_FOFF_BASE_MASK BIT(1) +#define RT5133_OCSHDN_ALL_MASK BIT(7) +#define RT5133_OCSHDN_ALL_SHIFT (7) +#define RT5133_PGBSHDN_ALL_MASK BIT(6) +#define RT5133_PGBSHDN_ALL_SHIFT (6) + +#define RT5133_OCPTSEL_MASK BIT(5) +#define RT5133_PGBPTSEL_MASK BIT(4) +#define RT5133_STBTDSEL_MASK GENMASK(1, 0) + +#define RT5133_LDO_ENABLE_MASK BIT(7) +#define RT5133_LDO_VSEL_MASK GENMASK(7, 5) +#define RT5133_LDO_AD_MASK BIT(2) +#define RT5133_LDO_SOFT_START_MASK GENMASK(1, 0) + +#define RT5133_GPIO_NR 3 + +#define RT5133_LDO_PGB_EVT_MASK GENMASK(23, 16) +#define RT5133_LDO_PGB_EVT_SHIFT 16 +#define RT5133_LDO_OC_EVT_MASK GENMASK(15, 8) +#define RT5133_LDO_OC_EVT_SHIFT 8 +#define RT5133_VREF_EVT_MASK BIT(6) +#define RT5133_BASE_EVT_MASK GENMASK(7, 0) +#define RT5133_INTR_CLR_MASK GENMASK(23, 0) +#define RT5133_INTR_BYTE_NR 3 + +#define RT5133_MAX_I2C_BLOCK_SIZE 1 + +#define RT5133_CRC8_POLYNOMIAL 0x7 + +#define RT5133_I2C_ADDR_LEN 1 +#define RT5133_PREDATA_LEN 2 +#define RT5133_I2C_CRC_LEN 1 +#define RT5133_REG_ADDR_LEN 1 +#define RT5133_I2C_DUMMY_LEN 1 + +#define I2C_ADDR_XLATE_8BIT(_addr, _rw) ((((_addr) & 0x7F) << 1) | (_rw)) + +enum { + RT5133_REGULATOR_BASE = 0, + RT5133_REGULATOR_LDO1, + RT5133_REGULATOR_LDO2, + RT5133_REGULATOR_LDO3, + RT5133_REGULATOR_LDO4, + RT5133_REGULATOR_LDO5, + RT5133_REGULATOR_LDO6, + RT5133_REGULATOR_LDO7, + RT5133_REGULATOR_LDO8, + RT5133_REGULATOR_MAX +}; + +struct chip_data { + const struct regulator_desc *regulators; + const u8 vendor_id; +}; + +struct rt5133_priv { + struct device *dev; + struct regmap *regmap; + struct gpio_desc *enable_gpio; + struct regulator_dev *rdev[RT5133_REGULATOR_MAX]; + struct gpio_chip gc; + const struct chip_data *cdata; + unsigned int gpio_output_flag; + u8 crc8_tbls[CRC8_TABLE_SIZE]; +}; + +static const unsigned int vout_type1_tables[] = { + 1800000, 2500000, 2700000, 2800000, 2900000, 3000000, 3100000, 3200000 +}; + +static const unsigned int vout_type2_tables[] = { + 1700000, 1800000, 1900000, 
2500000, 2700000, 2800000, 2900000, 3000000 +}; + +static const unsigned int vout_type3_tables[] = { + 900000, 950000, 1000000, 1050000, 1100000, 1150000, 1200000, 1800000 +}; + +static const unsigned int vout_type4_tables[] = { + 855000, 900000, 950000, 1000000, 1040000, 1090000, 1140000, 1710000 +}; + +static const struct regulator_ops rt5133_regulator_ops = { + .list_voltage = regulator_list_voltage_table, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .set_active_discharge = regulator_set_active_discharge_regmap, +}; + +static const struct regulator_ops rt5133_base_regulator_ops = { + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, +}; + +#define RT5133_REGULATOR_DESC(_name, _node_name, vtables, _supply) \ +{\ + .name = #_name,\ + .id = RT5133_REGULATOR_##_name,\ + .of_match = of_match_ptr(#_node_name),\ + .regulators_node = of_match_ptr("regulators"),\ + .supply_name = _supply,\ + .type = REGULATOR_VOLTAGE,\ + .owner = THIS_MODULE,\ + .ops = &rt5133_regulator_ops,\ + .n_voltages = ARRAY_SIZE(vtables),\ + .volt_table = vtables,\ + .enable_reg = RT5133_REG_##_name##_CTRL1,\ + .enable_mask = RT5133_LDO_ENABLE_MASK,\ + .vsel_reg = RT5133_REG_##_name##_CTRL2,\ + .vsel_mask = RT5133_LDO_VSEL_MASK,\ + .active_discharge_reg = RT5133_REG_##_name##_CTRL3,\ + .active_discharge_mask = RT5133_LDO_AD_MASK,\ +} + +static const struct regulator_desc rt5133_regulators[] = { + /* For digital part, base current control */ + { + .name = "base", + .id = RT5133_REGULATOR_BASE, + .of_match = of_match_ptr("base"), + .regulators_node = of_match_ptr("regulators"), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + .ops = &rt5133_base_regulator_ops, + .enable_reg = RT5133_REG_BASE_CTRL, + .enable_mask = RT5133_FOFF_BASE_MASK, + .enable_is_inverted = true, + }, + RT5133_REGULATOR_DESC(LDO1, ldo1, vout_type1_tables, "base"), + RT5133_REGULATOR_DESC(LDO2, ldo2, vout_type1_tables, "base"), + RT5133_REGULATOR_DESC(LDO3, ldo3, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO4, ldo4, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO5, ldo5, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO6, ldo6, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO7, ldo7, vout_type3_tables, "vin"), + RT5133_REGULATOR_DESC(LDO8, ldo8, vout_type3_tables, "vin"), +}; + +static const struct regulator_desc rt5133a_regulators[] = { + /* For digital part, base current control */ + { + .name = "base", + .id = RT5133_REGULATOR_BASE, + .of_match = of_match_ptr("base"), + .regulators_node = of_match_ptr("regulators"), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + .ops = &rt5133_base_regulator_ops, + .enable_reg = RT5133_REG_BASE_CTRL, + .enable_mask = RT5133_FOFF_BASE_MASK, + .enable_is_inverted = true, + }, + RT5133_REGULATOR_DESC(LDO1, ldo1, vout_type1_tables, "base"), + RT5133_REGULATOR_DESC(LDO2, ldo2, vout_type1_tables, "base"), + RT5133_REGULATOR_DESC(LDO3, ldo3, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO4, ldo4, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO5, ldo5, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO6, ldo6, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO7, ldo7, vout_type3_tables, "vin"), + RT5133_REGULATOR_DESC(LDO8, ldo8, vout_type4_tables, "vin"), +}; + +static const struct chip_data 
regulator_data[] = {
+	{ rt5133_regulators, 0x70},
+	{ rt5133a_regulators, 0x80},
+};
+
+static int rt5133_gpio_direction_output(struct gpio_chip *gpio,
+					unsigned int offset, int value)
+{
+	struct rt5133_priv *priv = gpiochip_get_data(gpio);
+
+	if (offset >= RT5133_GPIO_NR)
+		return -EINVAL;
+
+	return regmap_update_bits(priv->regmap, RT5133_REG_GPIO_CTRL,
+				  BIT(7 - offset) | BIT(3 - offset),
+				  value ? BIT(7 - offset) | BIT(3 - offset) : 0);
+}
+
+static int rt5133_gpio_get(struct gpio_chip *chip, unsigned int offset)
+{
+	struct rt5133_priv *priv = gpiochip_get_data(chip);
+
+	return !!(priv->gpio_output_flag & BIT(offset));
+}
+
+static int rt5133_get_gpioen_mask(unsigned int offset, unsigned int *mask)
+{
+	if (offset >= RT5133_GPIO_NR)
+		return -EINVAL;
+
+	*mask = (BIT(7 - offset) | BIT(3 - offset));
+
+	return 0;
+}
+
+static int rt5133_gpio_set(struct gpio_chip *chip, unsigned int offset, int set_val)
+{
+	struct rt5133_priv *priv = gpiochip_get_data(chip);
+	unsigned int mask = 0, val = 0, next_flag = priv->gpio_output_flag;
+	int ret = 0;
+
+	ret = rt5133_get_gpioen_mask(offset, &mask);
+	if (ret) {
+		dev_err(priv->dev, "%s get gpio en mask failed, offset(%d)\n", __func__, offset);
+		return ret;
+	}
+
+	val = set_val ? mask : 0;
+
+	if (set_val)
+		next_flag |= BIT(offset);
+	else
+		next_flag &= ~BIT(offset);
+
+	ret = regmap_update_bits(priv->regmap, RT5133_REG_GPIO_CTRL, mask, val);
+	if (ret) {
+		dev_err(priv->dev, "Failed to set gpio [%d] val %d\n", offset,
+			set_val);
+		return ret;
+	}
+
+	priv->gpio_output_flag = next_flag;
+	return 0;
+}
+
+static irqreturn_t rt5133_intr_handler(int irq_number, void *data)
+{
+	struct rt5133_priv *priv = data;
+	u32 intr_evts = 0, handle_evts;
+	int i, ret;
+
+	ret = regmap_bulk_read(priv->regmap, RT5133_REG_BASE_EVT, &intr_evts,
+			       RT5133_INTR_BYTE_NR);
+	if (ret) {
+		dev_err(priv->dev, "%s, read event failed\n", __func__);
+		return IRQ_NONE;
+	}
+
+	handle_evts = intr_evts & RT5133_BASE_EVT_MASK;
+	/*
+	 * VREF_EVT is a special case: if the base is off,
+	 * this event will also be triggered.
Skip it + */ + if (handle_evts & ~RT5133_VREF_EVT_MASK) + dev_dbg(priv->dev, "base event occurred [0x%02x]\n", + handle_evts); + + handle_evts = (intr_evts & RT5133_LDO_OC_EVT_MASK) >> + RT5133_LDO_OC_EVT_SHIFT; + + for (i = RT5133_REGULATOR_LDO1; i < RT5133_REGULATOR_MAX && handle_evts; i++) { + if (!(handle_evts & BIT(i - 1))) + continue; + regulator_notifier_call_chain(priv->rdev[i], + REGULATOR_EVENT_OVER_CURRENT, + &i); + } + + handle_evts = (intr_evts & RT5133_LDO_PGB_EVT_MASK) >> + RT5133_LDO_PGB_EVT_SHIFT; + for (i = RT5133_REGULATOR_LDO1; i < RT5133_REGULATOR_MAX && handle_evts; i++) { + if (!(handle_evts & BIT(i - 1))) + continue; + regulator_notifier_call_chain(priv->rdev[i], + REGULATOR_EVENT_FAIL, &i); + } + + ret = regmap_bulk_write(priv->regmap, RT5133_REG_BASE_EVT, &intr_evts, + RT5133_INTR_BYTE_NR); + if (ret) + dev_err(priv->dev, "%s, clear event failed\n", __func__); + + return IRQ_HANDLED; +} + +static int rt5133_enable_interrupts(int irq_no, struct rt5133_priv *priv) +{ + u32 mask = RT5133_INTR_CLR_MASK; + int ret; + + /* Force to write clear all events */ + ret = regmap_bulk_write(priv->regmap, RT5133_REG_BASE_EVT, &mask, + RT5133_INTR_BYTE_NR); + if (ret) { + dev_err(priv->dev, "Failed to clear all interrupts\n"); + return ret; + } + + /* Unmask all interrupts */ + mask = 0; + ret = regmap_bulk_write(priv->regmap, RT5133_REG_BASE_MASK, &mask, + RT5133_INTR_BYTE_NR); + if (ret) { + dev_err(priv->dev, "Failed to unmask all interrupts\n"); + return ret; + } + + return devm_request_threaded_irq(priv->dev, irq_no, NULL, + rt5133_intr_handler, IRQF_ONESHOT, + dev_name(priv->dev), priv); +} + +static int rt5133_regmap_hw_read(void *context, const void *reg_buf, + size_t reg_size, void *val_buf, + size_t val_size) +{ + struct rt5133_priv *priv = context; + struct i2c_client *client = to_i2c_client(priv->dev); + u8 reg = *(u8 *)reg_buf, crc; + u8 *buf; + int buf_len = RT5133_PREDATA_LEN + val_size + RT5133_I2C_CRC_LEN; + int read_len, ret; + + buf = kzalloc(buf_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + buf[0] = I2C_ADDR_XLATE_8BIT(client->addr, I2C_SMBUS_READ); + buf[1] = reg; + + read_len = val_size + RT5133_I2C_CRC_LEN; + ret = i2c_smbus_read_i2c_block_data(client, reg, read_len, + buf + RT5133_PREDATA_LEN); + + if (ret < 0) + goto out_read_err; + + if (ret != read_len) { + ret = -EIO; + goto out_read_err; + } + + crc = crc8(priv->crc8_tbls, buf, RT5133_PREDATA_LEN + val_size, 0); + if (crc != buf[RT5133_PREDATA_LEN + val_size]) { + ret = -EIO; + goto out_read_err; + } + + memcpy(val_buf, buf + RT5133_PREDATA_LEN, val_size); + dev_dbg(priv->dev, "%s, reg = 0x%02x, data = 0x%02x\n", __func__, reg, *(u8 *)val_buf); + +out_read_err: + kfree(buf); + return (ret < 0) ? 
ret : 0; +} + +static int rt5133_regmap_hw_write(void *context, const void *data, size_t count) +{ + struct rt5133_priv *priv = context; + struct i2c_client *client = to_i2c_client(priv->dev); + u8 reg = *(u8 *)data, crc; + u8 *buf; + int buf_len = RT5133_I2C_ADDR_LEN + count + RT5133_I2C_CRC_LEN + + RT5133_I2C_DUMMY_LEN; + int write_len, ret; + + buf = kzalloc(buf_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + buf[0] = I2C_ADDR_XLATE_8BIT(client->addr, I2C_SMBUS_WRITE); + buf[1] = reg; + memcpy(buf + RT5133_PREDATA_LEN, data + RT5133_REG_ADDR_LEN, + count - RT5133_REG_ADDR_LEN); + + crc = crc8(priv->crc8_tbls, buf, RT5133_I2C_ADDR_LEN + count, 0); + buf[RT5133_I2C_ADDR_LEN + count] = crc; + + write_len = count - RT5133_REG_ADDR_LEN + RT5133_I2C_CRC_LEN + + RT5133_I2C_DUMMY_LEN; + ret = i2c_smbus_write_i2c_block_data(client, reg, write_len, + buf + RT5133_PREDATA_LEN); + + dev_dbg(priv->dev, "%s, reg = 0x%02x, data = 0x%02x\n", __func__, reg, + *(u8 *)(buf + RT5133_PREDATA_LEN)); + kfree(buf); + return ret; +} + +static const struct regmap_bus rt5133_regmap_bus = { + .read = rt5133_regmap_hw_read, + .write = rt5133_regmap_hw_write, + /* Due to crc, the block read/write length has the limit */ + .max_raw_read = RT5133_MAX_I2C_BLOCK_SIZE, + .max_raw_write = RT5133_MAX_I2C_BLOCK_SIZE, +}; + +static bool rt5133_is_volatile_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case RT5133_REG_CHIP_INFO: + case RT5133_REG_BASE_EVT...RT5133_REG_LDO_PGB_STAT: + case RT5133_REG_LDO_ON...RT5133_REG_LDO_OFF: + case RT5133_REG_LDO1_CTRL1: + case RT5133_REG_LDO2_CTRL1: + case RT5133_REG_LDO3_CTRL1: + case RT5133_REG_LDO4_CTRL1: + case RT5133_REG_LDO5_CTRL1: + case RT5133_REG_LDO6_CTRL1: + case RT5133_REG_LDO7_CTRL1: + case RT5133_REG_LDO8_CTRL1: + return true; + default: + return false; + }; +} + +static const struct regmap_config rt5133_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = RT5133_REG_LDO8_CTRL4, + .cache_type = REGCACHE_FLAT, + .num_reg_defaults_raw = RT5133_REG_LDO8_CTRL4 + 1, + .volatile_reg = rt5133_is_volatile_reg, +}; + +static int rt5133_chip_reset(struct rt5133_priv *priv) +{ + int ret; + + ret = regmap_write(priv->regmap, RT5133_REG_RST_CTRL, + RT5133_RESET_CODE); + if (ret) + return ret; + + /* Wait for register reset to take effect */ + udelay(2); + + return 0; +} + +static int rt5133_validate_vendor_info(struct rt5133_priv *priv) +{ + unsigned int val = 0; + int i, ret; + + ret = regmap_read(priv->regmap, RT5133_REG_CHIP_INFO, &val); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(regulator_data); i++) { + if ((val & RT5133_VENDOR_ID_MASK) == + regulator_data[i].vendor_id){ + priv->cdata = ®ulator_data[i]; + break; + } + } + if (IS_ERR(priv->cdata)) { + dev_err(priv->dev, "Failed to find regualtor match version\n"); + return -ENODEV; + } + + return 0; +} + +static int rt5133_parse_dt(struct rt5133_priv *priv) +{ + unsigned int val = 0; + int ret = 0; + + if (!device_property_read_bool(priv->dev, "richtek,oc-shutdown-all")) + val = 0; + else + val = 1 << RT5133_OCSHDN_ALL_SHIFT; + ret = regmap_update_bits(priv->regmap, RT5133_REG_LDO_SHDN, + RT5133_OCSHDN_ALL_MASK, val); + if (ret) + return ret; + + if (!device_property_read_bool(priv->dev, "richtek,pgb-shutdown-all")) + val = 0; + else + val = 1 << RT5133_PGBSHDN_ALL_SHIFT; + return regmap_update_bits(priv->regmap, RT5133_REG_LDO_SHDN, + RT5133_PGBSHDN_ALL_MASK, val); +} + +static int rt5133_probe(struct i2c_client *i2c) +{ + struct rt5133_priv *priv; + struct regulator_config config = 
{0}; + int i, ret; + + priv = devm_kzalloc(&i2c->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = &i2c->dev; + crc8_populate_msb(priv->crc8_tbls, RT5133_CRC8_POLYNOMIAL); + + priv->enable_gpio = devm_gpiod_get_optional(&i2c->dev, "enable", + GPIOD_OUT_HIGH); + if (IS_ERR(priv->enable_gpio)) + dev_err(&i2c->dev, "Failed to request HWEN gpio, check if default en=high\n"); + + priv->regmap = devm_regmap_init(&i2c->dev, &rt5133_regmap_bus, priv, + &rt5133_regmap_config); + if (IS_ERR(priv->regmap)) { + dev_err(&i2c->dev, "Failed to register regmap\n"); + return PTR_ERR(priv->regmap); + } + + ret = rt5133_validate_vendor_info(priv); + if (ret) { + dev_err(&i2c->dev, "Failed to check vendor info [%d]\n", ret); + return ret; + } + + ret = rt5133_chip_reset(priv); + if (ret) { + dev_err(&i2c->dev, "Failed to execute sw reset\n"); + return ret; + } + + config.dev = &i2c->dev; + config.driver_data = priv; + config.regmap = priv->regmap; + + for (i = 0; i < RT5133_REGULATOR_MAX; i++) { + priv->rdev[i] = devm_regulator_register(&i2c->dev, + priv->cdata->regulators + i, + &config); + if (IS_ERR(priv->rdev[i])) { + dev_err(&i2c->dev, + "Failed to register [%d] regulator\n", i); + return PTR_ERR(priv->rdev[i]); + } + } + + ret = rt5133_parse_dt(priv); + if (ret) { + dev_err(&i2c->dev, "%s, Failed to parse dt\n", __func__); + return ret; + } + + priv->gc.label = dev_name(&i2c->dev); + priv->gc.parent = &i2c->dev; + priv->gc.base = -1; + priv->gc.ngpio = RT5133_GPIO_NR; + priv->gc.set = rt5133_gpio_set; + priv->gc.get = rt5133_gpio_get; + priv->gc.direction_output = rt5133_gpio_direction_output; + priv->gc.can_sleep = true; + + ret = devm_gpiochip_add_data(&i2c->dev, &priv->gc, priv); + if (ret) + return ret; + + ret = rt5133_enable_interrupts(i2c->irq, priv); + if (ret) { + dev_err(&i2c->dev, "enable interrupt failed\n"); + return ret; + } + + i2c_set_clientdata(i2c, priv); + + return ret; +} + +static const struct of_device_id __maybe_unused rt5133_of_match_table[] = { + { .compatible = "richtek,rt5133", }, + { } +}; +MODULE_DEVICE_TABLE(of, rt5133_of_match_table); + +static struct i2c_driver rt5133_driver = { + .driver = { + .name = "rt5133", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .of_match_table = rt5133_of_match_table, + }, + .probe = rt5133_probe, +}; +module_i2c_driver(rt5133_driver); + +MODULE_DESCRIPTION("RT5133 Regulator Driver"); +MODULE_LICENSE("GPL v2"); From 9c45f95222beecd6a284fd1284d54dd7a772cf59 Mon Sep 17 00:00:00 2001 From: Gabor Juhos Date: Fri, 8 Aug 2025 19:15:01 +0200 Subject: [PATCH 0121/3206] spi: spi-qpic-snand: handle 'use_ecc' parameter of qcom_spi_config_cw_read() During raw read, neither the status of the ECC correction nor the erased state of the codeword gets checked by the qcom_spi_read_cw_raw() function, so in case of raw access reading the corresponding registers via DMA is superfluous. Extend the qcom_spi_config_cw_read() function to evaluate the existing (but actually unused) 'use_ecc' parameter, and configure reading only the flash status register when ECC is not used. With the change, the code gets in line with the corresponding part of the config_nand_cw_read() function in the qcom_nandc driver. 
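
In outline, the resulting control flow looks like this (a condensed
sketch using the helpers named above, not the literal hunk, which
follows below):

  if (use_ecc) {
          /* ECC path: flash status (2 regs) plus erased-CW detect status */
          qcom_read_reg_dma(snandc, NAND_FLASH_STATUS, 2, 0);
          qcom_read_reg_dma(snandc, NAND_ERASED_CW_DETECT_STATUS, 1,
                            NAND_BAM_NEXT_SGL);
  } else {
          /* raw path: only the flash status register needs to be read */
          qcom_read_reg_dma(snandc, NAND_FLASH_STATUS, 1, NAND_BAM_NEXT_SGL);
  }

This saves the superfluous DMA descriptors for the raw-read case, since
neither ECC correction status nor the erased-codeword state is checked
there anyway.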
Signed-off-by: Gabor Juhos Reviewed-by: Konrad Dybcio Link: https://patch.msgid.link/20250808-qpic-snand-handle-use_ecc-v1-1-67289fbb5e2f@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-qpic-snand.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-qpic-snand.c b/drivers/spi/spi-qpic-snand.c index 1f065d9907103f..2812ffc421d27a 100644 --- a/drivers/spi/spi-qpic-snand.c +++ b/drivers/spi/spi-qpic-snand.c @@ -491,9 +491,14 @@ qcom_spi_config_cw_read(struct qcom_nand_controller *snandc, bool use_ecc, int c qcom_write_reg_dma(snandc, &snandc->regs->cmd, NAND_FLASH_CMD, 1, NAND_BAM_NEXT_SGL); qcom_write_reg_dma(snandc, &snandc->regs->exec, NAND_EXEC_CMD, 1, NAND_BAM_NEXT_SGL); - qcom_read_reg_dma(snandc, NAND_FLASH_STATUS, 2, 0); - qcom_read_reg_dma(snandc, NAND_ERASED_CW_DETECT_STATUS, 1, - NAND_BAM_NEXT_SGL); + if (use_ecc) { + qcom_read_reg_dma(snandc, NAND_FLASH_STATUS, 2, 0); + qcom_read_reg_dma(snandc, NAND_ERASED_CW_DETECT_STATUS, 1, + NAND_BAM_NEXT_SGL); + } else { + qcom_read_reg_dma(snandc, NAND_FLASH_STATUS, 1, + NAND_BAM_NEXT_SGL); + } } static int qcom_spi_block_erase(struct qcom_nand_controller *snandc) From 70d00858645c0fb72ac63cf61c784b600aa9ea50 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 13 Aug 2025 08:38:08 +0300 Subject: [PATCH 0122/3206] audit: add a missing tab Someone got a bit carried away deleting tabs. Add it back. Signed-off-by: Dan Carpenter Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c29541c8fb645..497bda0043fb0b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1778,7 +1778,7 @@ static void audit_log_exit(void) axs->target_sessionid[i], &axs->target_ref[i], axs->target_comm[i])) - call_panic = 1; + call_panic = 1; } if (context->target_pid && From 7cedf7345cce7a12c28cce8df0bdc8a62d4ca438 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Thu, 31 Jul 2025 10:50:06 +0800 Subject: [PATCH 0123/3206] rust: alloc: kvec: add doc example for as_slice method Add a practical usage example to the documentation of KVec::as_slice() showing how to: Create a new KVec. Push elements into it. Convert to a slice via as_slice(). Co-developed-by: Geliang Tang Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu Reviewed-by: Alice Ryhl Reviewed-by: Kunwu Chan Reviewed-by: David Gow Link: https://lore.kernel.org/r/4e7f396f38ed8a780f863384bfc3d7de135ef3ea.1753929369.git.zhuhui@kylinos.cn Signed-off-by: Danilo Krummrich --- rust/kernel/alloc/kvec.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rust/kernel/alloc/kvec.rs b/rust/kernel/alloc/kvec.rs index 3c72e0bdddb871..fa04cc0987d6bc 100644 --- a/rust/kernel/alloc/kvec.rs +++ b/rust/kernel/alloc/kvec.rs @@ -224,6 +224,16 @@ where } /// Returns a slice of the entire vector. + /// + /// # Examples + /// + /// ``` + /// let mut v = KVec::new(); + /// v.push(1, GFP_KERNEL)?; + /// v.push(2, GFP_KERNEL)?; + /// assert_eq!(v.as_slice(), &[1, 2]); + /// # Ok::<(), Error>(()) + /// ``` #[inline] pub fn as_slice(&self) -> &[T] { self From a55498c7d8a651ca5c7b8169e84d48af77e60723 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Thu, 31 Jul 2025 10:50:07 +0800 Subject: [PATCH 0124/3206] rust: alloc: kvec: simplify KUnit test module name to "rust_kvec" Remove redundant "_kunit" suffix from test module name. The naming is now consistent with other Rust components as the test context is already implied by the #[kunit_tests] macro and test module location. 
Co-developed-by: Geliang Tang Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu Reviewed-by: David Gow Link: https://lore.kernel.org/r/4eb554c3bf03dd4f9e6dea659497938baab61dba.1753929369.git.zhuhui@kylinos.cn Signed-off-by: Danilo Krummrich --- rust/kernel/alloc/kvec.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/alloc/kvec.rs b/rust/kernel/alloc/kvec.rs index fa04cc0987d6bc..7316f48aaa90ab 100644 --- a/rust/kernel/alloc/kvec.rs +++ b/rust/kernel/alloc/kvec.rs @@ -1304,7 +1304,7 @@ impl<'vec, T> Drop for DrainAll<'vec, T> { } } -#[macros::kunit_tests(rust_kvec_kunit)] +#[macros::kunit_tests(rust_kvec)] mod tests { use super::*; use crate::prelude::*; From 37533933bfe92cd5a99ef4743f31dac62ccc8de0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Aug 2025 18:15:01 +0200 Subject: [PATCH 0125/3206] regulator: remove unneeded 'fast_io' parameter in regmap_config When using MMIO with regmap, fast_io is implied. No need to set it again. Signed-off-by: Wolfram Sang Link: https://patch.msgid.link/20250813161517.4746-16-wsa+renesas@sang-engineering.com Signed-off-by: Mark Brown --- drivers/regulator/qcom-refgen-regulator.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/regulator/qcom-refgen-regulator.c b/drivers/regulator/qcom-refgen-regulator.c index cfa72ce85bc898..299ac3c8c3bc3d 100644 --- a/drivers/regulator/qcom-refgen-regulator.c +++ b/drivers/regulator/qcom-refgen-regulator.c @@ -94,7 +94,6 @@ static const struct regmap_config qcom_refgen_regmap_config = { .reg_bits = 32, .reg_stride = 4, .val_bits = 32, - .fast_io = true, }; static int qcom_refgen_probe(struct platform_device *pdev) From 4c70fb2624ab1588faa58dcd407d4c61d64b288d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 13 Aug 2025 08:29:01 +0000 Subject: [PATCH 0126/3206] cpuset: remove redundant CS_ONLINE flag The CS_ONLINE flag was introduced prior to the CSS_ONLINE flag in the cpuset subsystem. Currently, the flag setting sequence is as follows: 1. cpuset_css_online() sets CS_ONLINE 2. css->flags gets CSS_ONLINE set ... 3. cgroup->kill_css sets CSS_DYING 4. cpuset_css_offline() clears CS_ONLINE 5. css->flags clears CSS_ONLINE The is_cpuset_online() check currently occurs between steps 1 and 3. However, it would be equally safe to perform this check between steps 2 and 3, as CSS_ONLINE provides the same synchronization guarantee as CS_ONLINE. Since CS_ONLINE is redundant with CSS_ONLINE and provides no additional synchronization benefits, we can safely remove it to simplify the code. 
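
A hypothetical caller illustrates the resulting idiom;
update_if_online() below is not from the patch, only is_cpuset_online()
and css_is_online() are:

  /* Sketch: a typical guarded update under cpuset_mutex. */
  static void update_if_online(struct cpuset *cs)
  {
          lockdep_assert_held(&cpuset_mutex);

          /* true from step 2 until CSS_DYING is set in step 3 */
          if (!is_cpuset_online(cs))
                  return;

          /* ... safe to modify cs here ... */
  }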
Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 +++++ kernel/cgroup/cpuset-internal.h | 3 +-- kernel/cgroup/cpuset.c | 4 +--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e23..ae73dbb19165c8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -354,6 +354,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return css->flags & CSS_DYING; } +static inline bool css_is_online(struct cgroup_subsys_state *css) +{ + return css->flags & CSS_ONLINE; +} + static inline bool css_is_self(struct cgroup_subsys_state *css) { if (css == &css->cgroup->self) { diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 383963e28ac69c..75b3aef39231dd 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -38,7 +38,6 @@ enum prs_errcode { /* bits in struct cpuset flags field */ typedef enum { - CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); + return css_is_online(&cs->css) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 27adb04df675d4..3466ebbf10164a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -207,7 +207,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ static struct cpuset top_cpuset = { - .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, @@ -3496,7 +3496,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpus_read_lock(); mutex_lock(&cpuset_mutex); - set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -3571,7 +3570,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); From 48124569bbc6bfda1df3e9ee17b19d559f4b1aa3 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Aug 2025 18:15:05 +0200 Subject: [PATCH 0127/3206] spi: remove unneeded 'fast_io' parameter in regmap_config When using MMIO with regmap, fast_io is implied. No need to set it again. 
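
For reference, a minimal MMIO regmap setup after this kind of cleanup
might look as follows (hypothetical device; the MMIO bus selects
spinlock-based fast I/O by itself):

  #include <linux/platform_device.h>
  #include <linux/regmap.h>

  static const struct regmap_config example_mmio_config = {
          .reg_bits   = 32,
          .reg_stride = 4,
          .val_bits   = 32,
          /* no .fast_io = true: implied by devm_regmap_init_mmio() */
  };

  static int example_probe(struct platform_device *pdev)
  {
          void __iomem *base = devm_platform_ioremap_resource(pdev, 0);
          struct regmap *map;

          if (IS_ERR(base))
                  return PTR_ERR(base);

          map = devm_regmap_init_mmio(&pdev->dev, base, &example_mmio_config);
          return PTR_ERR_OR_ZERO(map);
  }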
Signed-off-by: Wolfram Sang
Link: https://patch.msgid.link/20250813161517.4746-20-wsa+renesas@sang-engineering.com
Signed-off-by: Mark Brown
---
 drivers/spi/spi-altera-platform.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/spi/spi-altera-platform.c b/drivers/spi/spi-altera-platform.c
index 585393802e9f9f..e163774fd65b49 100644
--- a/drivers/spi/spi-altera-platform.c
+++ b/drivers/spi/spi-altera-platform.c
@@ -30,7 +30,6 @@ static const struct regmap_config spi_altera_config = {
	.reg_bits = 32,
	.reg_stride = 4,
	.val_bits = 32,
-	.fast_io = true,
 };

 static int altera_spi_probe(struct platform_device *pdev)
From 2caa6b88e0ba0231fb4ff0ba8e73cedd5fb81fc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?=
Date: Mon, 11 Aug 2025 14:08:04 +0200
Subject: [PATCH 0128/3206] bpf: Don't use %pK through printk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the past %pK was preferable to %p as it would not leak raw pointer
values into the kernel log. Since commit ad67b74d2469 ("printk: hash
addresses printed with %p") the regular %p has been improved to avoid
this issue.

Furthermore, restricted pointers ("%pK") were never meant to be used
through printk(). They can still unintentionally leak raw pointers or
acquire sleeping locks in atomic contexts.

Switch to the regular pointer formatting which is safer and easier to
reason about.

Signed-off-by: Thomas Weißschuh
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20250811-restricted-pointers-bpf-v1-1-a1d7cc3cb9e7@linutronix.de
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1e7fd3ee759e07..52fecb7a1fe36d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1296,7 +1296,7 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other);
 static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
				u32 pass, void *image)
 {
-	pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen,
+	pr_err("flen=%u proglen=%u pass=%u image=%p from=%s pid=%d\n", flen,
	       proglen, pass, image, current->comm, task_pid_nr(current));

	if (image)
From 07866544e410e4c895a729971e4164861b41fad5 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Tue, 12 Aug 2025 10:50:39 -0700
Subject: [PATCH 0129/3206] selftests/bpf: Copy test_kmods when installing
 selftest

Commit d6212d82bf26 ("selftests/bpf: Consolidate kernel modules into
common directory") consolidated the Makefile of test_kmods. However,
since it removed test_kmods from TEST_GEN_PROGS_EXTENDED, the kernel
modules required by bpf selftests are now missing from kselftest_install
when "make install". Fix it by adding test_kmods to TEST_GEN_FILES.
Fixes: d6212d82bf26 ("selftests/bpf: Consolidate kernel modules into common directory") Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250812175039.2323570-1-ameryhung@gmail.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4863106034dfbc..77794efc020ea6 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -137,7 +137,7 @@ TEST_GEN_PROGS_EXTENDED = \ xdping \ xskxceiver -TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi +TEST_GEN_FILES += $(TEST_KMODS) liburandom_read.so urandom_read sign-file uprobe_multi ifneq ($(V),1) submake_extras := feature_display=0 From 5893165a27e7d1f318aa91f0cd84e795dab7460d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Aug 2025 18:14:49 +0200 Subject: [PATCH 0130/3206] gpio: remove unneeded 'fast_io' parameter in regmap_config When using MMIO with regmap, fast_io is implied. No need to set it again. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250813161517.4746-4-wsa+renesas@sang-engineering.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mvebu.c | 1 - drivers/gpio/gpio-sifive.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c index 5e3f54cb8bc463..261ffd0c614b71 100644 --- a/drivers/gpio/gpio-mvebu.c +++ b/drivers/gpio/gpio-mvebu.c @@ -602,7 +602,6 @@ static const struct regmap_config mvebu_gpio_regmap_config = { .reg_bits = 32, .reg_stride = 4, .val_bits = 32, - .fast_io = true, }; /* diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 067c8edb62e205..98ef975c44d9a6 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -174,7 +174,6 @@ static const struct regmap_config sifive_gpio_regmap_config = { .reg_bits = 32, .reg_stride = 4, .val_bits = 32, - .fast_io = true, .disable_locking = true, }; From 535fd4c98452c87537a40610abba45daf5761ec6 Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Thu, 31 Jul 2025 08:44:50 -0400 Subject: [PATCH 0131/3206] serial: sc16is7xx: fix bug in flow control levels init When trying to set MCR[2], XON1 is incorrectly accessed instead. And when writing to the TCR register to configure flow control levels, we are incorrectly writing to the MSR register. The default value of $00 is then used for TCR, which means that selectable trigger levels in FCR are used in place of TCR. TCR/TLR access requires EFR[4] (enable enhanced functions) and MCR[2] to be set. EFR[4] is already set in probe(). MCR access requires LCR[7] to be zero. Since LCR is set to $BF when trying to set MCR[2], XON1 is incorrectly accessed instead because MCR shares the same address space as XON1. Since MCR[2] is unmodified and still zero, when writing to TCR we are in fact writing to MSR because TCR/TLR registers share the same address space as MSR/SPR. Fix by first removing useless reconfiguration of EFR[4] (enable enhanced functions), as it is already enabled in sc16is7xx_probe() since commit 43c51bb573aa ("sc16is7xx: make sure device is in suspend once probed"). Now LCR is $00, which means that MCR access is enabled. Also remove regcache_cache_bypass() calls since we no longer access the enhanced registers set, and TCR is already declared as volatile (in fact by declaring MSR as volatile, which shares the same address). 
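
Summarizing the gating rules as a sketch (register and bit names from
the driver; this is a simplification of the access sequence, not the
fix itself):

  /* MCR is only visible while LCR != 0xBF (otherwise XON1 decodes here) */
  sc16is7xx_port_write(port, SC16IS7XX_LCR_REG, 0x00);

  /* TCR/TLR are only visible while MCR[2] is set (they share the
   * MSR/SPR addresses); EFR[4] was already enabled once in probe() */
  sc16is7xx_port_update(port, SC16IS7XX_MCR_REG,
                        SC16IS7XX_MCR_TCRTLR_BIT, SC16IS7XX_MCR_TCRTLR_BIT);

  /* ... program the TCR flow-control levels here ... */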
Finally disable access to TCR/TLR registers after modifying them by
clearing MCR[2].

Note: the comment about "... and internal clock div" is wrong and can be
ignored/removed, as access to the internal clock div registers (DLL/DLH)
is permitted only when LCR[7] is logic 1, not when enhanced features are
enabled. And DLL/DLH access is not needed in sc16is7xx_startup().

Fixes: dfeae619d781 ("serial: sc16is7xx")
Cc: stable@vger.kernel.org
Signed-off-by: Hugo Villeneuve
Link: https://lore.kernel.org/r/20250731124451.1108864-1-hugo@hugovil.com
Signed-off-by: Greg Kroah-Hartman
---
 drivers/tty/serial/sc16is7xx.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index 3f38fba8f6ea5d..a668e0bb26b397 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -1177,17 +1177,6 @@ static int sc16is7xx_startup(struct uart_port *port)
	sc16is7xx_port_write(port, SC16IS7XX_FCR_REG,
			     SC16IS7XX_FCR_FIFO_BIT);

-	/* Enable EFR */
-	sc16is7xx_port_write(port, SC16IS7XX_LCR_REG,
-			     SC16IS7XX_LCR_CONF_MODE_B);
-
-	regcache_cache_bypass(one->regmap, true);
-
-	/* Enable write access to enhanced features and internal clock div */
-	sc16is7xx_port_update(port, SC16IS7XX_EFR_REG,
-			      SC16IS7XX_EFR_ENABLE_BIT,
-			      SC16IS7XX_EFR_ENABLE_BIT);
-
	/* Enable TCR/TLR */
	sc16is7xx_port_update(port, SC16IS7XX_MCR_REG,
			      SC16IS7XX_MCR_TCRTLR_BIT,
@@ -1199,7 +1188,8 @@ static int sc16is7xx_startup(struct uart_port *port)
			     SC16IS7XX_TCR_RX_RESUME(24) |
			     SC16IS7XX_TCR_RX_HALT(48));

-	regcache_cache_bypass(one->regmap, false);
+	/* Disable TCR/TLR access */
+	sc16is7xx_port_update(port, SC16IS7XX_MCR_REG, SC16IS7XX_MCR_TCRTLR_BIT, 0);

	/* Now, initialize the UART */
	sc16is7xx_port_write(port, SC16IS7XX_LCR_REG, SC16IS7XX_LCR_WORD_LEN_8);
From ee047e1d85d73496541c54bd4f432c9464e13e65 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski
Date: Tue, 12 Aug 2025 14:16:31 +0200
Subject: [PATCH 0132/3206] dt-bindings: serial: brcm,bcm7271-uart: Constrain
 clocks

Lists should have fixed constraints, because a binding must be specific
with respect to the hardware, so add the missing constraint on the
number of clocks.

Cc: stable
Fixes: 88a499cd70d4 ("dt-bindings: Add support for the Broadcom UART driver")
Signed-off-by: Krzysztof Kozlowski
Link: https://lore.kernel.org/r/20250812121630.67072-2-krzysztof.kozlowski@linaro.org
Signed-off-by: Greg Kroah-Hartman
---
 Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml b/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml
index 89c462653e2d33..8cc848ae11cb73 100644
--- a/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml
+++ b/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml
@@ -41,7 +41,7 @@ properties:
      - const: dma_intr2

  clocks:
-    minItems: 1
+    maxItems: 1

  clock-names:
    const: sw_baud
From a1b51534b532dd4f0499907865553ee9251bebc3 Mon Sep 17 00:00:00 2001
From: Alex Elder
Date: Tue, 12 Aug 2025 22:21:50 -0500
Subject: [PATCH 0133/3206] dt-bindings: serial: 8250: move a constraint

A block that required a "spacemit,k1-uart" compatible node to specify
two clocks was placed in the wrong spot in the binding. Conor Dooley
pointed out that it belongs earlier in the file, as part of the initial
"allOf".
Fixes: 2c0594f9f0629 ("dt-bindings: serial: 8250: support an optional second clock") Cc: stable Reported-by: Conor Dooley Closes: https://lore.kernel.org/lkml/20250729-reshuffle-contented-e6def76b540b@spud/ Signed-off-by: Alex Elder Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250813032151.2330616-1-elder@riscstar.com Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/serial/8250.yaml | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/Documentation/devicetree/bindings/serial/8250.yaml b/Documentation/devicetree/bindings/serial/8250.yaml index e46bee8d25bf02..f59c0b37e8ebb2 100644 --- a/Documentation/devicetree/bindings/serial/8250.yaml +++ b/Documentation/devicetree/bindings/serial/8250.yaml @@ -48,7 +48,6 @@ allOf: oneOf: - required: [ clock-frequency ] - required: [ clocks ] - - if: properties: compatible: @@ -66,6 +65,28 @@ allOf: items: - const: core - const: bus + - if: + properties: + compatible: + contains: + enum: + - spacemit,k1-uart + - nxp,lpc1850-uart + then: + required: + - clocks + - clock-names + properties: + clocks: + minItems: 2 + clock-names: + minItems: 2 + else: + properties: + clocks: + maxItems: 1 + clock-names: + maxItems: 1 properties: compatible: @@ -264,29 +285,6 @@ required: - reg - interrupts -if: - properties: - compatible: - contains: - enum: - - spacemit,k1-uart - - nxp,lpc1850-uart -then: - required: - - clocks - - clock-names - properties: - clocks: - minItems: 2 - clock-names: - minItems: 2 -else: - properties: - clocks: - maxItems: 1 - clock-names: - maxItems: 1 - unevaluatedProperties: false examples: From a1b51534b532dd4f0499907865553ee9251bebc3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 12 Aug 2025 22:13:35 -0500 Subject: [PATCH 0134/3206] dt-bindings: serial: 8250: allow "main" and "uart" as clock names There are two compatible strings defined in "8250.yaml" that require two clocks to be specified, along with their names: - "spacemit,k1-uart", used in "spacemit/k1.dtsi" - "nxp,lpc1850-uart", used in "lpc/lpc18xx.dtsi" When only one clock is used, the name is not required. However there are two places that do specify a name: - In "mediatek/mt7623.dtsi", the clock for the "mediatek,mtk-btif" compatible serial device is named "main" - In "qca/ar9132.dtsi", the clock for the "ns8250" compatible serial device is named "uart" In commit d2db0d7815444 ("dt-bindings: serial: 8250: allow clock 'uartclk' and 'reg' for nxp,lpc1850-uart"), Frank Li added the restriction that two named clocks be used for the NXP platform mentioned above. Change that logic, so that an additional condition for (only) the SpacemiT platform similarly restricts the two clocks to have the names "core" and "bus". Finally, add "main" and "uart" as allowed names when a single clock is specified. 
Fixes: 2c0594f9f0629 ("dt-bindings: serial: 8250: support an optional second clock") Cc: stable Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507160314.wrC51lXX-lkp@intel.com/ Signed-off-by: Alex Elder Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250813031338.2328392-1-elder@riscstar.com Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/serial/8250.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/serial/8250.yaml b/Documentation/devicetree/bindings/serial/8250.yaml index f59c0b37e8ebb2..b243afa69a1aeb 100644 --- a/Documentation/devicetree/bindings/serial/8250.yaml +++ b/Documentation/devicetree/bindings/serial/8250.yaml @@ -59,7 +59,12 @@ allOf: items: - const: uartclk - const: reg - else: + - if: + properties: + compatible: + contains: + const: spacemit,k1-uart + then: properties: clock-names: items: @@ -183,6 +188,9 @@ properties: minItems: 1 maxItems: 2 oneOf: + - enum: + - main + - uart - items: - const: core - const: bus From 6d068f1ae2a2f713d7f21a9a602e65b3d6b6fc6d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 14 Aug 2025 08:33:26 +0100 Subject: [PATCH 0135/3206] regulator: rt5133: Fix spelling mistake "regualtor" -> "regulator" There is a spelling mistake in a dev_err message. Fix it. Signed-off-by: Colin Ian King Link: https://patch.msgid.link/20250814073326.17644-1-colin.i.king@gmail.com Signed-off-by: Mark Brown --- drivers/regulator/rt5133-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/rt5133-regulator.c b/drivers/regulator/rt5133-regulator.c index d0f367381fbbde..39532618e73d49 100644 --- a/drivers/regulator/rt5133-regulator.c +++ b/drivers/regulator/rt5133-regulator.c @@ -511,7 +511,7 @@ static int rt5133_validate_vendor_info(struct rt5133_priv *priv) } } if (IS_ERR(priv->cdata)) { - dev_err(priv->dev, "Failed to find regualtor match version\n"); + dev_err(priv->dev, "Failed to find regulator match version\n"); return -ENODEV; } From 9969779d0803f5dcd4460ae7aca2bc3fd91bff12 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:42 -0700 Subject: [PATCH 0136/3206] Documentation/hw-vuln: Add VMSCAPE documentation VMSCAPE is a vulnerability that may allow a guest to influence the branch prediction in host userspace, particularly affecting hypervisors like QEMU. Add the documentation. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Dave Hansen --- Documentation/admin-guide/hw-vuln/index.rst | 1 + Documentation/admin-guide/hw-vuln/vmscape.rst | 110 ++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 Documentation/admin-guide/hw-vuln/vmscape.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 89ca636081b71e..55d747511f833b 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -26,3 +26,4 @@ are configurable at compile, boot or run time. rsb old_microcode indirect-target-selection + vmscape diff --git a/Documentation/admin-guide/hw-vuln/vmscape.rst b/Documentation/admin-guide/hw-vuln/vmscape.rst new file mode 100644 index 00000000000000..d9b9a2b6c114c0 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/vmscape.rst @@ -0,0 +1,110 @@ +.. 
SPDX-License-Identifier: GPL-2.0
+
+VMSCAPE
+=======
+
+VMSCAPE is a vulnerability that may allow a guest to influence the branch
+prediction in host userspace. It particularly affects hypervisors like QEMU.
+
+Even if a hypervisor has no sensitive data of its own, such as disk
+encryption keys, guest userspace may still be able to attack the guest
+kernel using the hypervisor as a confused deputy.
+
+Affected processors
+-------------------
+
+The following CPU families are affected by VMSCAPE:
+
+**Intel processors:**
+  - Skylake generation (Parts without Enhanced-IBRS)
+  - Cascade Lake generation - (Parts affected by ITS guest/host separation)
+  - Alder Lake and newer (Parts affected by BHI)
+
+Note that BHI-affected parts that use the BHB-clearing software mitigation,
+e.g. Icelake, are not vulnerable to VMSCAPE.
+
+**AMD processors:**
+  - Zen series (families 0x17, 0x19, 0x1a)
+
+**Hygon processors:**
+  - Family 0x18
+
+Mitigation
+----------
+
+Conditional IBPB
+----------------
+
+The kernel tracks when a CPU has run a potentially malicious guest and
+issues an IBPB before the first exit to userspace after VM-exit. If
+userspace did not run between VM-exit and the next VM-entry, no IBPB is
+issued.
+
+Note that the existing userspace mitigations against Spectre-v2 are
+effective in protecting userspace processes from each other. They are
+insufficient to protect a userspace VMM from a malicious guest. This is
+because Spectre-v2 mitigations are applied at context switch time, while
+the userspace VMM can run after a VM-exit without a context switch.
+
+Vulnerability enumeration and mitigation are not applied inside a guest.
+This is because nested hypervisors should already be deploying IBPB to
+isolate themselves from nested guests.
+
+SMT considerations
+------------------
+
+When Simultaneous Multi-Threading (SMT) is enabled, hypervisors can be
+vulnerable to cross-thread attacks. For complete protection against VMSCAPE
+attacks in SMT environments, STIBP should be enabled.
+
+The kernel will issue a warning if SMT is enabled without adequate STIBP
+protection. No warning is issued when:
+
+- SMT is disabled
+- STIBP is enabled system-wide
+- Intel eIBRS is enabled (which implies STIBP protection)
+
+System information and options
+------------------------------
+
+The sysfs file showing VMSCAPE mitigation status is:
+
+  /sys/devices/system/cpu/vulnerabilities/vmscape
+
+The possible values in this file are:
+
+ * 'Not affected':
+
+   The processor is not vulnerable to VMSCAPE attacks.
+
+ * 'Vulnerable':
+
+   The processor is vulnerable and no mitigation has been applied.
+
+ * 'Mitigation: IBPB before exit to userspace':
+
+   Conditional IBPB mitigation is enabled. The kernel tracks when a CPU has
+   run a potentially malicious guest and issues an IBPB before the first
+   exit to userspace after VM-exit.
+
+ * 'Mitigation: IBPB on VMEXIT':
+
+   IBPB is issued on every VM-exit. This occurs when other mitigations like
+   RETBLEED or SRSO are already issuing IBPB on VM-exit.
+
+Mitigation control on the kernel command line
+----------------------------------------------
+
+The mitigation can be controlled via the ``vmscape=`` command line parameter:
+
+ * ``vmscape=off``:
+
+   Disable the VMSCAPE mitigation.
+
+ * ``vmscape=ibpb``:
+
+   Enable conditional IBPB mitigation (default when CONFIG_MITIGATION_VMSCAPE=y).
+
+ * ``vmscape=force``:
+
+   Force vulnerability detection and mitigation even on processors that are
+   not known to be affected.
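
As a usage example, the mitigation status can be read from sysfs like
any other vulnerability file (illustrative userspace snippet, not part
of the patch):

  #include <stdio.h>

  int main(void)
  {
          char buf[128];
          FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/vmscape", "r");

          if (!f) {
                  perror("vmscape");      /* file absent on kernels without this patch */
                  return 1;
          }
          if (fgets(buf, sizeof(buf), f))
                  printf("VMSCAPE: %s", buf);
          fclose(f);
          return 0;
  }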
From a508cec6e5215a3fbc7e73ae86a5c5602187934d Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:42 -0700 Subject: [PATCH 0137/3206] x86/vmscape: Enumerate VMSCAPE bug The VMSCAPE vulnerability may allow a guest to cause Branch Target Injection (BTI) in userspace hypervisors. Kernels (both host and guest) have existing defenses against direct BTI attacks from guests. There are also inter-process BTI mitigations which prevent processes from attacking each other. However, the threat in this case is to a userspace hypervisor within the same process as the attacker. Userspace hypervisors have access to their own sensitive data like disk encryption keys and also typically have access to all guest data. This means guest userspace may use the hypervisor as a confused deputy to attack sensitive guest kernel data. There are no existing mitigations for these attacks. Introduce X86_BUG_VMSCAPE for this vulnerability and set it on affected Intel and AMD CPUs. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/common.c | 65 ++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 602957dd2609ce..b6fa5c33c85d85 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -550,4 +550,5 @@ #define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */ #define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ #define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */ +#define X86_BUG_VMSCAPE X86_BUG( 1*32+10) /* "vmscape" CPU is affected by VMSCAPE attacks from guests */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 34a054181c4dc4..2b87c93e660963 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1236,6 +1236,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define ITS_NATIVE_ONLY BIT(9) /* CPU is affected by Transient Scheduler Attacks */ #define TSA BIT(10) +/* CPU is affected by VMSCAPE */ +#define VMSCAPE BIT(11) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), @@ -1247,44 +1249,55 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), - VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | VMSCAPE), + 
VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), - VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), - VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), - VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), - VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS), - VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS), - VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_METEORLAKE_L, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_H, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_U, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_LUNARLAKE_M, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SAPPHIRERAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_GRANITERAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_EMERALDRAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), 
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_CRESTMONT_X, X86_STEP_MAX, VMSCAPE), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), - VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), - VULNBL_AMD(0x19, SRSO | TSA), - VULNBL_AMD(0x1a, SRSO), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO | VMSCAPE), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO | VMSCAPE), + VULNBL_AMD(0x19, SRSO | TSA | VMSCAPE), + VULNBL_AMD(0x1a, SRSO | VMSCAPE), {} }; @@ -1543,6 +1556,14 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } } + /* + * Set the bug only on bare-metal. A nested hypervisor should already be + * deploying IBPB to isolate itself from nested guests. + */ + if (cpu_matches(cpu_vuln_blacklist, VMSCAPE) && + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) + setup_force_cpu_bug(X86_BUG_VMSCAPE); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; From 2f8f173413f1cbf52660d04df92d0069c4306d25 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:42 -0700 Subject: [PATCH 0138/3206] x86/vmscape: Add conditional IBPB mitigation VMSCAPE is a vulnerability that exploits insufficient branch predictor isolation between a guest and a userspace hypervisor (like QEMU). Existing mitigations already protect kernel/KVM from a malicious guest. Userspace can additionally be protected by flushing the branch predictors after a VMexit. Since it is the userspace that consumes the poisoned branch predictors, conditionally issue an IBPB after a VMexit and before returning to userspace. Workloads that frequently switch between hypervisor and userspace will incur the most overhead from the new IBPB. This new IBPB is not integrated with the existing IBPB sites. For instance, a task can use the existing speculation control prctl() to get an IBPB at context switch time. With this implementation, the IBPB is doubled up: one at context switch and another before running userspace. The intent is to integrate and optimize these cases post-embargo. 
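
Condensed, the two halves of the mechanism pair up as below (a sketch of
the hunks that follow; see the diff for the real code):

  /* KVM, after VM-exit and before preemption is re-enabled, so the
   * flag is set on the CPU that actually ran the guest: */
  if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER))
          this_cpu_write(x86_ibpb_exit_to_user, true);

  /* exit-to-userspace path: flush once, then clear the flag */
  if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER) &&
      this_cpu_read(x86_ibpb_exit_to_user)) {
          indirect_branch_prediction_barrier();
          this_cpu_write(x86_ibpb_exit_to_user, false);
  }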
[ dhansen: elaborate on suboptimal IBPB solution ]

Suggested-by: Dave Hansen
Signed-off-by: Pawan Gupta
Signed-off-by: Dave Hansen
Reviewed-by: Dave Hansen
Reviewed-by: Borislav Petkov (AMD)
Acked-by: Sean Christopherson
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 arch/x86/include/asm/entry-common.h | 7 +++++++
 arch/x86/include/asm/nospec-branch.h | 2 ++
 arch/x86/kernel/cpu/bugs.c | 8 ++++++++
 arch/x86/kvm/x86.c | 9 +++++++++
 5 files changed, 27 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index b6fa5c33c85d85..c8e177016cc4be 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -494,6 +494,7 @@
 #define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */
 #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */
 #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */
+#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */

 /*
  * BUG word(s)
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index d535a97c728422..ce3eb6d5fdf9f2 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -93,6 +93,13 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
	 * 8 (ia32) bits.
	 */
	choose_random_kstack_offset(rdtsc());
+
+	/* Avoid unnecessary reads of 'x86_ibpb_exit_to_user' */
+	if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER) &&
+	    this_cpu_read(x86_ibpb_exit_to_user)) {
+		indirect_branch_prediction_barrier();
+		this_cpu_write(x86_ibpb_exit_to_user, false);
+	}
 }
 #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 10f261678749a7..e29f82466f4323 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -530,6 +530,8 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
		: "memory");
 }

+DECLARE_PER_CPU(bool, x86_ibpb_exit_to_user);
+
 static inline void indirect_branch_prediction_barrier(void)
 {
	asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_IBPB)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index b74bf937cd9fbc..410f8df8b77a1a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -105,6 +105,14 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
 DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
 EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);

+/*
+ * Set when the CPU has run a potentially malicious guest. An IBPB will
+ * be needed before running userspace. That IBPB will flush the branch
+ * predictor content.
+ */
+DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user);
+
 u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;

 static u64 __ro_after_init x86_arch_cap_msr;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1c49bc681c469..58d19443c9a368 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11007,6 +11007,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
	if (vcpu->arch.guest_fpu.xfd_err)
		wrmsrq(MSR_IA32_XFD_ERR, 0);

+	/*
+	 * Mark this CPU as needing a branch predictor flush before running
+	 * userspace. Must be done before enabling preemption to ensure it gets
+	 * set for the CPU that actually ran the guest, and not the CPU that it
+	 * may migrate to.
+ */ + if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER)) + this_cpu_write(x86_ibpb_exit_to_user, true); + /* * Consume any pending interrupts, including the possible source of * VM-Exit on SVM and any ticks that occur between VM-Exit and now. From 556c1ad666ad90c50ec8fccb930dd5046cfbecfb Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:42 -0700 Subject: [PATCH 0139/3206] x86/vmscape: Enable the mitigation Enable the previously added mitigation for VMscape. Add the cmdline vmscape={off|ibpb|force} and sysfs reporting. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Dave Hansen --- .../ABI/testing/sysfs-devices-system-cpu | 1 + .../admin-guide/kernel-parameters.txt | 11 +++ arch/x86/Kconfig | 9 ++ arch/x86/kernel/cpu/bugs.c | 90 +++++++++++++++++++ drivers/base/cpu.c | 3 + include/linux/cpu.h | 1 + 6 files changed, 115 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index ab8cd337f43aad..8aed6d94c4cd0d 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -586,6 +586,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/srbds /sys/devices/system/cpu/vulnerabilities/tsa /sys/devices/system/cpu/vulnerabilities/tsx_async_abort + /sys/devices/system/cpu/vulnerabilities/vmscape Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 747a55abf4946b..5a7a83c411e9c5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3829,6 +3829,7 @@ srbds=off [X86,INTEL] ssbd=force-off [ARM64] tsx_async_abort=off [X86] + vmscape=off [X86] Exceptions: This does not have any effect on @@ -8041,6 +8042,16 @@ vmpoff= [KNL,S390] Perform z/VM CP command after power off. Format: + vmscape= [X86] Controls mitigation for VMscape attacks. + VMscape attacks can leak information from a userspace + hypervisor to a guest via speculative side-channels. + + off - disable the mitigation + ibpb - use Indirect Branch Prediction Barrier + (IBPB) mitigation (default) + force - force vulnerability detection even on + unaffected processors + vsyscall= [X86-64,EARLY] Controls the behavior of vsyscalls (i.e. calls to fixed addresses of 0xffffffffff600x00 from legacy diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100eb..52c8910ba2efdd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2701,6 +2701,15 @@ config MITIGATION_TSA security vulnerability on AMD CPUs which can lead to forwarding of invalid info to subsequent instructions and thus can affect their timing and thereby cause a leakage. + +config MITIGATION_VMSCAPE + bool "Mitigate VMSCAPE" + depends on KVM + default y + help + Enable mitigation for VMSCAPE attacks. VMSCAPE is a hardware security + vulnerability on Intel and AMD CPUs that may allow a guest to do + Spectre v2 style attacks on userspace hypervisor. 
endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 410f8df8b77a1a..c81024dfc4c868 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -96,6 +96,9 @@ static void __init its_update_mitigation(void); static void __init its_apply_mitigation(void); static void __init tsa_select_mitigation(void); static void __init tsa_apply_mitigation(void); +static void __init vmscape_select_mitigation(void); +static void __init vmscape_update_mitigation(void); +static void __init vmscape_apply_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -270,6 +273,7 @@ void __init cpu_select_mitigations(void) its_select_mitigation(); bhi_select_mitigation(); tsa_select_mitigation(); + vmscape_select_mitigation(); /* * After mitigations are selected, some may need to update their @@ -301,6 +305,7 @@ void __init cpu_select_mitigations(void) bhi_update_mitigation(); /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ srso_update_mitigation(); + vmscape_update_mitigation(); spectre_v1_apply_mitigation(); spectre_v2_apply_mitigation(); @@ -318,6 +323,7 @@ void __init cpu_select_mitigations(void) its_apply_mitigation(); bhi_apply_mitigation(); tsa_apply_mitigation(); + vmscape_apply_mitigation(); } /* @@ -3322,6 +3328,77 @@ static void __init srso_apply_mitigation(void) } } +#undef pr_fmt +#define pr_fmt(fmt) "VMSCAPE: " fmt + +enum vmscape_mitigations { + VMSCAPE_MITIGATION_NONE, + VMSCAPE_MITIGATION_AUTO, + VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER, + VMSCAPE_MITIGATION_IBPB_ON_VMEXIT, +}; + +static const char * const vmscape_strings[] = { + [VMSCAPE_MITIGATION_NONE] = "Vulnerable", + /* [VMSCAPE_MITIGATION_AUTO] */ + [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace", + [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT", +}; + +static enum vmscape_mitigations vmscape_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? 
VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE; + +static int __init vmscape_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } else if (!strcmp(str, "ibpb")) { + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_VMSCAPE); + vmscape_mitigation = VMSCAPE_MITIGATION_AUTO; + } else { + pr_err("Ignoring unknown vmscape=%s option.\n", str); + } + + return 0; +} +early_param("vmscape", vmscape_parse_cmdline); + +static void __init vmscape_select_mitigation(void) +{ + if (cpu_mitigations_off() || + !boot_cpu_has_bug(X86_BUG_VMSCAPE) || + !boot_cpu_has(X86_FEATURE_IBPB)) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + return; + } + + if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; +} + +static void __init vmscape_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE)) + return; + + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB || + srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT; + + pr_info("%s\n", vmscape_strings[vmscape_mitigation]); +} + +static void __init vmscape_apply_mitigation(void) +{ + if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER) + setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER); +} + #undef pr_fmt #define pr_fmt(fmt) fmt @@ -3570,6 +3647,11 @@ static ssize_t tsa_show_state(char *buf) return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]); } +static ssize_t vmscape_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -3636,6 +3718,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_TSA: return tsa_show_state(buf); + case X86_BUG_VMSCAPE: + return vmscape_show_state(buf); + default: break; } @@ -3727,6 +3812,11 @@ ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_TSA); } + +ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE); +} #endif void __warn_thunk(void) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index efc575a00edda9..008da0354fba35 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -603,6 +603,7 @@ CPU_SHOW_VULN_FALLBACK(ghostwrite); CPU_SHOW_VULN_FALLBACK(old_microcode); CPU_SHOW_VULN_FALLBACK(indirect_target_selection); CPU_SHOW_VULN_FALLBACK(tsa); +CPU_SHOW_VULN_FALLBACK(vmscape); static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); @@ -622,6 +623,7 @@ static DEVICE_ATTR(ghostwrite, 0444, cpu_show_ghostwrite, NULL); static DEVICE_ATTR(old_microcode, 0444, cpu_show_old_microcode, NULL); static DEVICE_ATTR(indirect_target_selection, 0444, cpu_show_indirect_target_selection, NULL); static DEVICE_ATTR(tsa, 0444, cpu_show_tsa, NULL); +static DEVICE_ATTR(vmscape, 0444, cpu_show_vmscape, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -642,6 +644,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_old_microcode.attr, &dev_attr_indirect_target_selection.attr, &dev_attr_tsa.attr, + &dev_attr_vmscape.attr, 
NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b91b993f58ee7f..487b3bf2e1eaf7 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -83,6 +83,7 @@ extern ssize_t cpu_show_old_microcode(struct device *dev, extern ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata,
From 6449f5baf9c78a7a442d64f4a61378a21c5db113 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:43 -0700 Subject: [PATCH 0140/3206] x86/bugs: Move cpu_bugs_smt_update() down
cpu_bugs_smt_update() uses global variables from different mitigations. For SMT updates it can't currently use vmscape_mitigation, which is defined after it. Since cpu_bugs_smt_update() depends on many other mitigations, move it after all mitigations are defined. With that, it can use vmscape_mitigation in the next patch. No functional change.
Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen --- arch/x86/kernel/cpu/bugs.c | 165 +++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 82 deletions(-)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c81024dfc4c868..1f8c1c51d0577c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2551,88 +2551,6 @@ static void update_mds_branch_idle(void) } } -#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" -#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" -#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible.
See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" - -void cpu_bugs_smt_update(void) -{ - mutex_lock(&spec_ctrl_mutex); - - if (sched_smt_active() && unprivileged_ebpf_enabled() && - spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) - pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); - - switch (spectre_v2_user_stibp) { - case SPECTRE_V2_USER_NONE: - break; - case SPECTRE_V2_USER_STRICT: - case SPECTRE_V2_USER_STRICT_PREFERRED: - update_stibp_strict(); - break; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: - update_indir_branch_cond(); - break; - } - - switch (mds_mitigation) { - case MDS_MITIGATION_FULL: - case MDS_MITIGATION_AUTO: - case MDS_MITIGATION_VMWERV: - if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) - pr_warn_once(MDS_MSG_SMT); - update_mds_branch_idle(); - break; - case MDS_MITIGATION_OFF: - break; - } - - switch (taa_mitigation) { - case TAA_MITIGATION_VERW: - case TAA_MITIGATION_AUTO: - case TAA_MITIGATION_UCODE_NEEDED: - if (sched_smt_active()) - pr_warn_once(TAA_MSG_SMT); - break; - case TAA_MITIGATION_TSX_DISABLED: - case TAA_MITIGATION_OFF: - break; - } - - switch (mmio_mitigation) { - case MMIO_MITIGATION_VERW: - case MMIO_MITIGATION_AUTO: - case MMIO_MITIGATION_UCODE_NEEDED: - if (sched_smt_active()) - pr_warn_once(MMIO_MSG_SMT); - break; - case MMIO_MITIGATION_OFF: - break; - } - - switch (tsa_mitigation) { - case TSA_MITIGATION_USER_KERNEL: - case TSA_MITIGATION_VM: - case TSA_MITIGATION_AUTO: - case TSA_MITIGATION_FULL: - /* - * TSA-SQ can potentially lead to info leakage between - * SMT threads. - */ - if (sched_smt_active()) - static_branch_enable(&cpu_buf_idle_clear); - else - static_branch_disable(&cpu_buf_idle_clear); - break; - case TSA_MITIGATION_NONE: - case TSA_MITIGATION_UCODE_NEEDED: - break; - } - - mutex_unlock(&spec_ctrl_mutex); -} - #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt @@ -3402,6 +3320,89 @@ static void __init vmscape_apply_mitigation(void) #undef pr_fmt #define pr_fmt(fmt) fmt +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" +#define VMSCAPE_MSG_SMT "VMSCAPE: SMT on, STIBP is required for full protection. 
See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/vmscape.html for more details.\n" + +void cpu_bugs_smt_update(void) +{ + mutex_lock(&spec_ctrl_mutex); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + + switch (spectre_v2_user_stibp) { + case SPECTRE_V2_USER_NONE: + break; + case SPECTRE_V2_USER_STRICT: + case SPECTRE_V2_USER_STRICT_PREFERRED: + update_stibp_strict(); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + update_indir_branch_cond(); + break; + } + + switch (mds_mitigation) { + case MDS_MITIGATION_FULL: + case MDS_MITIGATION_AUTO: + case MDS_MITIGATION_VMWERV: + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + pr_warn_once(MDS_MSG_SMT); + update_mds_branch_idle(); + break; + case MDS_MITIGATION_OFF: + break; + } + + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_AUTO: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_AUTO: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + + switch (tsa_mitigation) { + case TSA_MITIGATION_USER_KERNEL: + case TSA_MITIGATION_VM: + case TSA_MITIGATION_AUTO: + case TSA_MITIGATION_FULL: + /* + * TSA-SQ can potentially lead to info leakage between + * SMT threads. + */ + if (sched_smt_active()) + static_branch_enable(&cpu_buf_idle_clear); + else + static_branch_disable(&cpu_buf_idle_clear); + break; + case TSA_MITIGATION_NONE: + case TSA_MITIGATION_UCODE_NEEDED: + break; + } + + mutex_unlock(&spec_ctrl_mutex); +} + #ifdef CONFIG_SYSFS #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
From b7cc9887231526ca4fa89f3fa4119e47c2dc7b1e Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:43 -0700 Subject: [PATCH 0141/3206] x86/vmscape: Warn when STIBP is disabled with SMT
Cross-thread attacks are generally harder as they require the victim to be co-located on a core. However, with VMSCAPE the adversary's targets belong to the same guest execution and are therefore more likely to be co-located. In particular, a thread that is currently executing the userspace hypervisor (after the IBPB) may still be targeted by guest execution on a sibling thread.
Issue a warning about the potential risk, except when: - SMT is disabled - STIBP is enabled system-wide - Intel eIBRS is enabled (which implies STIBP protection)
Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen --- arch/x86/kernel/cpu/bugs.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1f8c1c51d0577c..fa32615db71d40 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -3400,6 +3400,28 @@ void cpu_bugs_smt_update(void) break; } + switch (vmscape_mitigation) { + case VMSCAPE_MITIGATION_NONE: + case VMSCAPE_MITIGATION_AUTO: + break; + case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT: + case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER: + /* + * Hypervisors can be attacked across threads; warn for SMT when + * STIBP is not already enabled system-wide. + * + * Intel eIBRS (!AUTOIBRS) implies STIBP on.
+ */ + if (!sched_smt_active() || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + break; + pr_warn_once(VMSCAPE_MSG_SMT); + break; + } + mutex_unlock(&spec_ctrl_mutex); }
From 452d90def2dce8513d75981a4dc48e94e65ff54b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 14 Aug 2025 11:22:11 -0400 Subject: [PATCH 0142/3206] dlm: handle invalid lockspace member remove
Since commit de7b4869b4ecf ("dlm: add new configfs entry release_recover for lockspace members") we move lockspace members onto a gone list before removing them, in order to pick up additional removal attributes from configfs. There is still a very unlikely case in which find_config_node() returns NULL: the node was removed, but for some reason was never marked as gone. Handle this case and log an error so we can observe whether it ever happens.
Reported-by: Dan Carpenter Closes: https://lore.kernel.org/gfs2/aJ2Ssuh8xlsTutrA@stanley.mountain/T/#u Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/member.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 356337102015ec..c0f557a80a7542 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -571,7 +571,13 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { node = find_config_node(rv, memb->nodeid); - if (node && !node->new && !node->gone) + if (!node) { + log_error(ls, "remove member %d invalid", + memb->nodeid); + return -EFAULT; + } + + if (!node->new && !node->gone) continue; release_recover = 0;
From a8abcff174f7f9ce4587c6451b1a2450d01f52c9 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 14 Aug 2025 11:22:12 -0400 Subject: [PATCH 0143/3206] dlm: move to rinfo for all middle conversion cases
Since commit f74dacb4c8116 ("dlm: fix recovery of middle conversions") we print additional debugging information via log_limit() when we hit a middle conversion. The DLM log_limit() functionality requires a DLM debug option to be enabled. As this case occurs so rarely, and to catch any potential new issue introduced in the recovery code, switch it to log_rinfo(), which is ratelimited under the normal DLM loglevel.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 2 +- fs/dlm/recover.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 6dd3a524cd3529..be938fdf17d967 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5576,7 +5576,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, if (rl->rl_status == DLM_LKSTS_CONVERT && middle_conversion(lkb)) { /* We may need to adjust grmode depending on other granted locks.
*/ - log_limit(ls, "%s %x middle convert gr %d rq %d remote %d %x", + log_rinfo(ls, "%s %x middle convert gr %d rq %d remote %d %x", __func__, lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid); rsb_set_flag(r, RSB_RECOVER_CONVERT);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index be4240f09abd42..3ac020fb8139ea 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -842,7 +842,7 @@ static void recover_conversion(struct dlm_rsb *r) */ if (((lkb->lkb_grmode == DLM_LOCK_PR) && (other_grmode == DLM_LOCK_CW)) || ((lkb->lkb_grmode == DLM_LOCK_CW) && (other_grmode == DLM_LOCK_PR))) { - log_limit(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL", + log_rinfo(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL", __func__, lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid, other_lkid, other_grmode);
From 8d90041a0d285044b89629f539ca0685e156848b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 14 Aug 2025 11:22:13 -0400 Subject: [PATCH 0144/3206] dlm: handle release_option as unsigned
Future patches will introduce an invalid-argument check for undefined values. All release_option values are positive integers, so change the parameter to unsigned int to avoid also having to check for negative values.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 8 ++++---- include/linux/dlm.h | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d986b7ef153dcf..8eadbf0fdf179d 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -676,7 +676,7 @@ int dlm_new_user_lockspace(const char *name, const char *cluster, This is because there may be LKBs queued as ASTs that have been unlinked from their RSBs and are pending deletion once the AST has been delivered */ -static int lockspace_busy(struct dlm_ls *ls, int release_option) +static int lockspace_busy(struct dlm_ls *ls, unsigned int release_option) { struct dlm_lkb *lkb; unsigned long id; @@ -704,7 +704,7 @@ static int lockspace_busy(struct dlm_ls *ls, int release_option) return rv; } -static int release_lockspace(struct dlm_ls *ls, int release_option) +static int release_lockspace(struct dlm_ls *ls, unsigned int release_option) { int busy, rv; @@ -792,7 +792,7 @@ static int release_lockspace(struct dlm_ls *ls, int release_option) * See DLM_RELEASE defines for release_option values and their meaning. */ -int dlm_release_lockspace(void *lockspace, int force) +int dlm_release_lockspace(void *lockspace, unsigned int release_option) { struct dlm_ls *ls; int error; @@ -803,7 +803,7 @@ dlm_put_lockspace(ls); mutex_lock(&ls_lock); - error = release_lockspace(ls, force); + error = release_lockspace(ls, release_option); if (!error) ls_count--; if (!ls_count)
diff --git a/include/linux/dlm.h b/include/linux/dlm.h index 108eb953eb1811..34015a008b801c 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -122,7 +122,8 @@ int dlm_new_lockspace(const char *name, const char *cluster, * release_option: see DLM_RELEASE values above.
*/ -int dlm_release_lockspace(dlm_lockspace_t *lockspace, int release_option); +int dlm_release_lockspace(dlm_lockspace_t *lockspace, + unsigned int release_option); /* * dlm_lock
From 8e40210788636619404871df07445fa4590138b4 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 14 Aug 2025 11:22:14 -0400 Subject: [PATCH 0145/3206] dlm: check for undefined release_option values
Check all undefined release_option values and return -EINVAL in case a user provides one to dlm_release_lockspace().
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 3 +++ include/linux/dlm.h | 1 + 2 files changed, 4 insertions(+)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 8eadbf0fdf179d..ddaa765587068f 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -797,6 +797,9 @@ int dlm_release_lockspace(void *lockspace, unsigned int release_option) struct dlm_ls *ls; int error; + if (release_option > __DLM_RELEASE_MAX) + return -EINVAL; + ls = dlm_find_lockspace_local(lockspace); if (!ls) return -EINVAL;
diff --git a/include/linux/dlm.h b/include/linux/dlm.h index 34015a008b801c..7e7b45b0d09709 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -113,6 +113,7 @@ int dlm_new_lockspace(const char *name, const char *cluster, #define DLM_RELEASE_NORMAL 2 #define DLM_RELEASE_NO_EVENT 3 #define DLM_RELEASE_RECOVER 4 +#define __DLM_RELEASE_MAX DLM_RELEASE_RECOVER /* * dlm_release_lockspace
From 02fc01adec1c4b107fed08c30ac979b62136998f Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 19 Jul 2025 09:17:21 +0000 Subject: [PATCH 0146/3206] riscv, bpf: Extract emit_stx() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
There's a lot of redundant code related to store from register operations, let's extract emit_stx() to make code more compact.
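For context, each of the removed emit_store_{8,16,32,64}() helpers repeated the same two-path shape, which emit_stx() now expresses once. A minimal sketch of that shape, using the is_12b_int()/RV_REG_T1 names from this JIT (the actual hunk follows):

    if (is_12b_int(off)) {
        /* offset fits the 12-bit signed store immediate: one instruction */
        emit_stx_insn(rd, off, rs, size, ctx);
    } else {
        /* out-of-range offset: materialize it, add the base, store at 0 */
        emit_imm(RV_REG_T1, off, ctx);
        emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
        emit_stx_insn(RV_REG_T1, 0, rs, size, ctx);
    }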
Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20250719091730.2660197-2-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit_comp64.c | 172 ++++++++------------------------ 1 file changed, 41 insertions(+), 131 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 10e01ff06312d9..ba75ba179b260d 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -559,52 +559,39 @@ static int emit_load_64(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_cont return ctx->ninsns - insns_start; } -static void emit_store_8(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_stx_insn(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context *ctx) { - if (is_12b_int(off)) { + switch (size) { + case BPF_B: emit(rv_sb(rd, off, rs), ctx); - return; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit(rv_sb(RV_REG_T1, 0, rs), ctx); -} - -static void emit_store_16(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - if (is_12b_int(off)) { + break; + case BPF_H: emit(rv_sh(rd, off, rs), ctx); - return; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit(rv_sh(RV_REG_T1, 0, rs), ctx); -} - -static void emit_store_32(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - if (is_12b_int(off)) { + break; + case BPF_W: emit_sw(rd, off, rs, ctx); - return; + break; + case BPF_DW: + emit_sd(rd, off, rs, ctx); + break; } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit_sw(RV_REG_T1, 0, rs, ctx); } -static void emit_store_64(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static int emit_stx(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context *ctx) { + int insns_start; + if (is_12b_int(off)) { - emit_sd(rd, off, rs, ctx); - return; + insns_start = ctx->ninsns; + emit_stx_insn(rd, off, rs, size, ctx); + return ctx->ninsns - insns_start; } emit_imm(RV_REG_T1, off, ctx); emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit_sd(RV_REG_T1, 0, rs, ctx); + insns_start = ctx->ninsns; + emit_stx_insn(RV_REG_T1, 0, rs, size, ctx); + return ctx->ninsns - insns_start; } static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, @@ -642,20 +629,7 @@ static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, /* store_release(dst_reg + off16, src_reg) */ case BPF_STORE_REL: emit_fence_rw_w(ctx); - switch (BPF_SIZE(code)) { - case BPF_B: - emit_store_8(rd, off, rs, ctx); - break; - case BPF_H: - emit_store_16(rd, off, rs, ctx); - break; - case BPF_W: - emit_store_32(rd, off, rs, ctx); - break; - case BPF_DW: - emit_store_64(rd, off, rs, ctx); - break; - } + emit_stx(rd, off, rs, BPF_SIZE(code), ctx); break; default: pr_err_once("bpf-jit: invalid atomic load/store opcode %02x\n", imm); @@ -2023,106 +1997,42 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_B: - emit_store_8(rd, off, rs, ctx); - break; case BPF_STX | BPF_MEM | BPF_H: - emit_store_16(rd, off, rs, ctx); - break; case BPF_STX | BPF_MEM | BPF_W: - emit_store_32(rd, off, rs, ctx); - break; case BPF_STX | BPF_MEM | BPF_DW: - emit_store_64(rd, off, rs, ctx); - break; - case BPF_STX | BPF_ATOMIC | BPF_B: - case BPF_STX | BPF_ATOMIC | BPF_H: - case BPF_STX | BPF_ATOMIC | BPF_W: - case BPF_STX | BPF_ATOMIC | BPF_DW: - if (bpf_atomic_is_load_store(insn)) - 
ret = emit_atomic_ld_st(rd, rs, insn, ctx); - else - ret = emit_atomic_rmw(rd, rs, insn, ctx); - if (ret) - return ret; - break; - + /* STX | PROBE_MEM32: *(size *)(dst + RV_REG_ARENA + off) = src */ case BPF_STX | BPF_PROBE_MEM32 | BPF_B: case BPF_STX | BPF_PROBE_MEM32 | BPF_H: case BPF_STX | BPF_PROBE_MEM32 | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: { - int insn_len, insns_start; - - emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); - rd = RV_REG_T2; - - switch (BPF_SIZE(code)) { - case BPF_B: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sb(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sb(RV_REG_T1, 0, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_H: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sh(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sh(RV_REG_T1, 0, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_W: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sw(rd, off, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit_sw(RV_REG_T1, 0, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_DW: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sd(rd, off, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } + int insn_len; - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit_sd(RV_REG_T1, 0, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T2; } - ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, - insn_len); + insn_len = emit_stx(rd, off, rs, BPF_SIZE(code), ctx); + + ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, insn_len); if (ret) return ret; - break; } + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (bpf_atomic_is_load_store(insn)) + ret = emit_atomic_ld_st(rd, rs, insn, ctx); + else + ret = emit_atomic_rmw(rd, rs, insn, ctx); + if (ret) + return ret; + break; + default: pr_err("bpf-jit: unknown opcode %02x\n", code); return -EINVAL; From d92c11a6b55b16a679c8533119ee77baf098cb4e Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 19 Jul 2025 09:17:22 +0000 Subject: [PATCH 0147/3206] riscv, bpf: Extract emit_st() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a lot of redundant code related to store from immediate operations, let's extract emit_st() to make code more compact. 
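Note that RISC-V has no store-immediate instruction, so every BPF_ST case must first stage the immediate in a temporary register. A rough sketch of what the helper emits in the general case, where the offset also exceeds the 12-bit range (register names as used by this JIT; the assembly in the comments is illustrative):

    emit_imm(RV_REG_T1, imm, ctx);                     /* li  t1, imm     */
    emit_imm(RV_REG_T2, off, ctx);                     /* li  t2, off     */
    emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);           /* add t2, t2, dst */
    emit_stx_insn(RV_REG_T2, 0, RV_REG_T1, size, ctx); /* s{b,h,w,d} t1, 0(t2) */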
Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20250719091730.2660197-3-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit_comp64.c | 135 ++++++-------------------------- 1 file changed, 26 insertions(+), 109 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index ba75ba179b260d..5e354f686ea3b3 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -577,6 +577,24 @@ static void emit_stx_insn(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context } } +static int emit_st(u8 rd, s16 off, s32 imm, u8 size, struct rv_jit_context *ctx) +{ + int insns_start; + + emit_imm(RV_REG_T1, imm, ctx); + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_stx_insn(rd, off, RV_REG_T1, size, ctx); + return ctx->ninsns - insns_start; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit_stx_insn(RV_REG_T2, 0, RV_REG_T1, size, ctx); + return ctx->ninsns - insns_start; +} + static int emit_stx(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context *ctx) { int insns_start; @@ -1870,128 +1888,27 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit(rv_sb(rd, off, RV_REG_T1), ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx); - break; - case BPF_ST | BPF_MEM | BPF_H: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit(rv_sh(rd, off, RV_REG_T1), ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx); - break; case BPF_ST | BPF_MEM | BPF_W: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit_sw(rd, off, RV_REG_T1, ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx); - break; case BPF_ST | BPF_MEM | BPF_DW: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit_sd(rd, off, RV_REG_T1, ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); - break; - + /* ST | PROBE_MEM32: *(size *)(dst + RV_REG_ARENA + off) = imm */ case BPF_ST | BPF_PROBE_MEM32 | BPF_B: case BPF_ST | BPF_PROBE_MEM32 | BPF_H: case BPF_ST | BPF_PROBE_MEM32 | BPF_W: case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: { - int insn_len, insns_start; - - emit_add(RV_REG_T3, rd, RV_REG_ARENA, ctx); - rd = RV_REG_T3; - - /* Load imm to a register then store it */ - emit_imm(RV_REG_T1, imm, ctx); - - switch (BPF_SIZE(code)) { - case BPF_B: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sb(rd, off, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_H: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sh(rd, off, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx); - 
insn_len = ctx->ninsns - insns_start; - break; - case BPF_W: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sw(rd, off, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_DW: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sd(rd, off, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } + int insn_len; - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit_add(RV_REG_T3, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T3; } - ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, - insn_len); + insn_len = emit_st(rd, off, imm, BPF_SIZE(code), ctx); + + ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, insn_len); if (ret) return ret; - break; } From 01422a4f2c78c69febeafb9110d013f2bb46c58f Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 19 Jul 2025 09:17:23 +0000 Subject: [PATCH 0148/3206] riscv, bpf: Extract emit_ldx() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a lot of redundant code related to load into register operations, let's extract emit_ldx() to make code more compact. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20250719091730.2660197-4-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit_comp64.c | 143 ++++++++------------------------ 1 file changed, 35 insertions(+), 108 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 5e354f686ea3b3..a6a9fd9193e5e4 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -473,90 +473,24 @@ static inline void emit_kcfi(u32 hash, struct rv_jit_context *ctx) emit(hash, ctx); } -static int emit_load_8(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_ldx_insn(u8 rd, s16 off, u8 rs, u8 size, bool sign_ext, + struct rv_jit_context *ctx) { - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lb(rd, off, rs), ctx); - else - emit(rv_lbu(rd, off, rs), ctx); - return ctx->ninsns - insns_start; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lb(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lbu(rd, 0, RV_REG_T1), ctx); - return ctx->ninsns - insns_start; -} - -static int emit_load_16(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lh(rd, off, rs), ctx); - else - emit(rv_lhu(rd, off, rs), ctx); - return ctx->ninsns - insns_start; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lh(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lhu(rd, 0, RV_REG_T1), ctx); - return ctx->ninsns - insns_start; -} - -static int emit_load_32(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) 
- emit(rv_lw(rd, off, rs), ctx); - else - emit(rv_lwu(rd, off, rs), ctx); - return ctx->ninsns - insns_start; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lw(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lwu(rd, 0, RV_REG_T1), ctx); - return ctx->ninsns - insns_start; -} - -static int emit_load_64(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; + switch (size) { + case BPF_B: + emit(sign_ext ? rv_lb(rd, off, rs) : rv_lbu(rd, off, rs), ctx); + break; + case BPF_H: + emit(sign_ext ? rv_lh(rd, off, rs) : rv_lhu(rd, off, rs), ctx); + break; + case BPF_W: + emit(sign_ext ? rv_lw(rd, off, rs) : rv_lwu(rd, off, rs), ctx); + break; + case BPF_DW: emit_ld(rd, off, rs, ctx); - return ctx->ninsns - insns_start; + break; } - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - emit_ld(rd, 0, RV_REG_T1, ctx); - return ctx->ninsns - insns_start; } static void emit_stx_insn(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context *ctx) @@ -577,6 +511,24 @@ static void emit_stx_insn(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context } } +static int emit_ldx(u8 rd, s16 off, u8 rs, u8 size, bool sign_ext, + struct rv_jit_context *ctx) +{ + int insns_start; + + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_ldx_insn(rd, off, rs, size, sign_ext, ctx); + return ctx->ninsns - insns_start; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + insns_start = ctx->ninsns; + emit_ldx_insn(rd, 0, RV_REG_T1, size, sign_ext, ctx); + return ctx->ninsns - insns_start; +} + static int emit_st(u8 rd, s16 off, s32 imm, u8 size, struct rv_jit_context *ctx) { int insns_start; @@ -622,20 +574,7 @@ static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, switch (imm) { /* dst_reg = load_acquire(src_reg + off16) */ case BPF_LOAD_ACQ: - switch (BPF_SIZE(code)) { - case BPF_B: - emit_load_8(false, rd, off, rs, ctx); - break; - case BPF_H: - emit_load_16(false, rd, off, rs, ctx); - break; - case BPF_W: - emit_load_32(false, rd, off, rs, ctx); - break; - case BPF_DW: - emit_load_64(false, rd, off, rs, ctx); - break; - } + emit_ldx(rd, off, rs, BPF_SIZE(code), false, ctx); emit_fence_r_rw(ctx); /* If our next insn is a redundant zext, return 1 to tell @@ -1859,20 +1798,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, rs = RV_REG_T2; } - switch (BPF_SIZE(code)) { - case BPF_B: - insn_len = emit_load_8(sign_ext, rd, off, rs, ctx); - break; - case BPF_H: - insn_len = emit_load_16(sign_ext, rd, off, rs, ctx); - break; - case BPF_W: - insn_len = emit_load_32(sign_ext, rd, off, rs, ctx); - break; - case BPF_DW: - insn_len = emit_load_64(sign_ext, rd, off, rs, ctx); - break; - } + insn_len = emit_ldx(rd, off, rs, BPF_SIZE(code), sign_ext, ctx); ret = add_exception_handler(insn, ctx, rd, insn_len); if (ret) @@ -1882,6 +1808,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return 1; break; } + /* speculation barrier */ case BPF_ST | BPF_NOSPEC: break; From ec74ae56626bad5707d677d3bf05a1e3350a971e Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 19 Jul 2025 09:17:24 +0000 Subject: [PATCH 0149/3206] riscv: Separate toolchain support dependency from RISCV_ISA_ZACAS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RV64 bpf is going to support ZACAS 
instructions. Let's separate the toolchain support dependency from RISCV_ISA_ZACAS.
Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20250719091730.2660197-5-pulehui@huaweicloud.com --- arch/riscv/Kconfig | 1 - arch/riscv/include/asm/cmpxchg.h | 6 ++++-- arch/riscv/kernel/setup.c | 1 + 3 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659ed8..451eb23d86c963 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -714,7 +714,6 @@ config TOOLCHAIN_HAS_ZACAS config RISCV_ISA_ZACAS bool "Zacas extension support for atomic CAS" - depends on TOOLCHAIN_HAS_ZACAS depends on RISCV_ALTERNATIVE default y help
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index 0b749e71021624..4f4f389282b8c3 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -133,6 +133,7 @@ ({ \ if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && \ IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && \ + IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZACAS) && \ riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA) && \ riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS)) { \ r = o; \ @@ -180,6 +181,7 @@ r, p, co, o, n) \ ({ \ if (IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && \ + IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZACAS) && \ riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS)) { \ r = o; \ \ @@ -315,7 +317,7 @@ arch_cmpxchg_release((ptr), (o), (n)); \ }) -#if defined(CONFIG_64BIT) && defined(CONFIG_RISCV_ISA_ZACAS) +#if defined(CONFIG_64BIT) && defined(CONFIG_RISCV_ISA_ZACAS) && defined(CONFIG_TOOLCHAIN_HAS_ZACAS) #define system_has_cmpxchg128() riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS) @@ -351,7 +353,7 @@ union __u128_halves { #define arch_cmpxchg128_local(ptr, o, n) \ __arch_cmpxchg128((ptr), (o), (n), "") -#endif /* CONFIG_64BIT && CONFIG_RISCV_ISA_ZACAS */ +#endif /* CONFIG_64BIT && CONFIG_RISCV_ISA_ZACAS && CONFIG_TOOLCHAIN_HAS_ZACAS */ #ifdef CONFIG_RISCV_ISA_ZAWRS /*
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index f90cce7a3acea8..14235e58c539cd 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -290,6 +290,7 @@ static void __init riscv_spinlock_init(void) if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && + IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZACAS) && riscv_isa_extension_available(NULL, ZABHA) && riscv_isa_extension_available(NULL, ZACAS)) { using_ext = "using Zabha";
From 5090b339eeb3417baa940d690923e630e077d8f8 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 19 Jul 2025 09:17:25 +0000 Subject: [PATCH 0150/3206] riscv, bpf: Add rv_ext_enabled macro for runtime extension detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
Add an rv_ext_enabled() macro to check whether a runtime-detected extension is enabled.
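The macro folds the compile-time Kconfig test and the runtime probe into a single expression, so call sites shrink from an open-coded pair of conditions to one. Definition and a typical call site, both as they appear in the hunks below:

    #define rv_ext_enabled(ext) \
        (IS_ENABLED(CONFIG_RISCV_ISA_##ext) && riscv_has_extension_likely(RISCV_ISA_EXT_##ext))

    if (rv_ext_enabled(ZBA)) {
        emit(rvzba_sh2add(rd, rs1, rs2), ctx);
        return;
    }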
Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20250719091730.2660197-6-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit.h | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index e7b032dfd17f0f..0964df48c25eaf 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -13,21 +13,15 @@ #include #include +/* verify runtime detection extension status */ +#define rv_ext_enabled(ext) \ + (IS_ENABLED(CONFIG_RISCV_ISA_##ext) && riscv_has_extension_likely(RISCV_ISA_EXT_##ext)) + static inline bool rvc_enabled(void) { return IS_ENABLED(CONFIG_RISCV_ISA_C); } -static inline bool rvzba_enabled(void) -{ - return IS_ENABLED(CONFIG_RISCV_ISA_ZBA) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBA); -} - -static inline bool rvzbb_enabled(void) -{ - return IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBB); -} - enum { RV_REG_ZERO = 0, /* The constant value 0 */ RV_REG_RA = 1, /* Return address */ @@ -1123,7 +1117,7 @@ static inline void emit_sw(u8 rs1, s32 off, u8 rs2, struct rv_jit_context *ctx) static inline void emit_sh2add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) { - if (rvzba_enabled()) { + if (rv_ext_enabled(ZBA)) { emit(rvzba_sh2add(rd, rs1, rs2), ctx); return; } @@ -1134,7 +1128,7 @@ static inline void emit_sh2add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx static inline void emit_sh3add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) { - if (rvzba_enabled()) { + if (rv_ext_enabled(ZBA)) { emit(rvzba_sh3add(rd, rs1, rs2), ctx); return; } @@ -1184,7 +1178,7 @@ static inline void emit_subw(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) static inline void emit_sextb(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { emit(rvzbb_sextb(rd, rs), ctx); return; } @@ -1195,7 +1189,7 @@ static inline void emit_sextb(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_sexth(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { emit(rvzbb_sexth(rd, rs), ctx); return; } @@ -1211,7 +1205,7 @@ static inline void emit_sextw(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { emit(rvzbb_zexth(rd, rs), ctx); return; } @@ -1222,7 +1216,7 @@ static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzba_enabled()) { + if (rv_ext_enabled(ZBA)) { emit(rvzba_zextw(rd, rs), ctx); return; } @@ -1233,7 +1227,7 @@ static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_bswap(u8 rd, s32 imm, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { int bits = 64 - imm; emit(rvzbb_rev8(rd, rd), ctx); From de39d2c4cdb68ece47ff9609dd5c9e87b918d1bd Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 19 Jul 2025 09:17:26 +0000 Subject: [PATCH 0151/3206] riscv, bpf: Add Zacas instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Zacas instructions introduced by [0] to reduce code size and improve performance of RV64 JIT. 
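For reference, amocas.w/amocas.d implement compare-and-swap in one instruction: rd supplies the expected value and receives the old memory value, rs2 supplies the replacement. As C-style pseudocode (a sketch of the semantics in the spec linked below, not kernel code):

    /* amocas.d rd, rs2, (rs1) -- the whole block executes atomically */
    u64 old = *(u64 *)rs1;
    if (old == rd)
        *(u64 *)rs1 = rs2;
    rd = old;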
Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://github.com/riscvarchive/riscv-zacas/releases/download/v1.0/riscv-zacas.pdf [0] Link: https://lore.kernel.org/bpf/20250719091730.2660197-7-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index 0964df48c25eaf..2351fba5d3e7c0 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -751,6 +751,17 @@ static inline u16 rvc_swsp(u32 imm8, u8 rs2) return rv_css_insn(0x6, imm, rs2, 0x2); } +/* RVZACAS instructions. */ +static inline u32 rvzacas_amocas_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x5, aq, rl, rs2, rs1, 2, rd, 0x2f); +} + +static inline u32 rvzacas_amocas_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x5, aq, rl, rs2, rs1, 3, rd, 0x2f); +} + /* RVZBA instructions. */ static inline u32 rvzba_sh2add(u8 rd, u8 rs1, u8 rs2) { From 1c0196b878a69927449d649163388ad63cafcacb Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 19 Jul 2025 09:17:27 +0000 Subject: [PATCH 0152/3206] riscv, bpf: Optimize cmpxchg insn with Zacas support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Optimize cmpxchg instruction with amocas.w and amocas.d Zacas instructions. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20250719091730.2660197-8-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit.h | 27 +++++++++++++++++++++++++++ arch/riscv/net/bpf_jit_comp64.c | 18 ++---------------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index 2351fba5d3e7c0..0790f40b7e9dbf 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -1294,6 +1294,33 @@ static inline void emit_bswap(u8 rd, s32 imm, struct rv_jit_context *ctx) emit_mv(rd, RV_REG_T2, ctx); } +static inline void emit_cmpxchg(u8 rd, u8 rs, u8 r0, bool is64, struct rv_jit_context *ctx) +{ + int jmp_offset; + + if (rv_ext_enabled(ZACAS)) { + emit(is64 ? rvzacas_amocas_d(r0, rs, rd, 1, 1) : + rvzacas_amocas_w(r0, rs, rd, 1, 1), ctx); + if (!is64) + emit_zextw(r0, r0, ctx); + return; + } + + if (is64) + emit_mv(RV_REG_T2, r0, ctx); + else + emit_addiw(RV_REG_T2, r0, 0, ctx); + emit(is64 ? rv_lr_d(r0, 0, rd, 0, 0) : + rv_lr_w(r0, 0, rd, 0, 0), ctx); + jmp_offset = ninsns_rvoff(8); + emit(rv_bne(RV_REG_T2, r0, jmp_offset >> 1), ctx); + emit(is64 ? 
rv_sc_d(RV_REG_T3, rs, rd, 0, 1) : + rv_sc_w(RV_REG_T3, rs, rd, 0, 1), ctx); + jmp_offset = ninsns_rvoff(-6); + emit(rv_bne(RV_REG_T3, 0, jmp_offset >> 1), ctx); + emit_fence_rw_rw(ctx); +} + #endif /* __riscv_xlen == 64 */ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index a6a9fd9193e5e4..8e813809d3054e 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -599,10 +599,9 @@ static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, struct rv_jit_context *ctx) { - u8 r0, code = insn->code; + u8 code = insn->code; s16 off = insn->off; s32 imm = insn->imm; - int jmp_offset; bool is64; if (BPF_SIZE(code) != BPF_W && BPF_SIZE(code) != BPF_DW) { @@ -673,20 +672,7 @@ static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, break; /* r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg); */ case BPF_CMPXCHG: - r0 = bpf_to_rv_reg(BPF_REG_0, ctx); - if (is64) - emit_mv(RV_REG_T2, r0, ctx); - else - emit_addiw(RV_REG_T2, r0, 0, ctx); - emit(is64 ? rv_lr_d(r0, 0, rd, 0, 0) : - rv_lr_w(r0, 0, rd, 0, 0), ctx); - jmp_offset = ninsns_rvoff(8); - emit(rv_bne(RV_REG_T2, r0, jmp_offset >> 1), ctx); - emit(is64 ? rv_sc_d(RV_REG_T3, rs, rd, 0, 1) : - rv_sc_w(RV_REG_T3, rs, rd, 0, 1), ctx); - jmp_offset = ninsns_rvoff(-6); - emit(rv_bne(RV_REG_T3, 0, jmp_offset >> 1), ctx); - emit_fence_rw_rw(ctx); + emit_cmpxchg(rd, rs, regmap[BPF_REG_0], is64, ctx); break; default: pr_err_once("bpf-jit: invalid atomic RMW opcode %02x\n", imm); From b18f4aae6a5db5ada5aba02b43dbdd3909e5f87c Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 19 Jul 2025 09:17:28 +0000 Subject: [PATCH 0153/3206] riscv, bpf: Add ex_insn_off and ex_jmp_off for exception table handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ex_insn_off and ex_jmp_off fields to struct rv_jit_context so that add_exception_handler() does not need to be immediately followed by the instruction to add the exception table. ex_insn_off indicates the offset of the instruction to add the exception table, and ex_jmp_off indicates the offset to jump over the faulting instruction. This is to prepare for adding the exception table to atomic instructions later, because some atomic instructions need to perform zext or other operations. 
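With the two fields, the emitter records the faulting window itself, and the extable entry can be filled in later even when fixup-related instructions follow. A sketch of the pattern the next patch uses for a fetching atomic (names from this JIT):

    ctx->ex_insn_off = ctx->ninsns;  /* offset of the instruction that may fault */
    emit(rv_amoadd_w(rs, rs, rd, 1, 1), ctx);
    ctx->ex_jmp_off = ctx->ninsns;   /* offset where execution resumes after the fixup */
    emit_zextw(rs, rs, ctx);         /* trailing insns no longer break the bookkeeping */
    /* ... later, once per BPF instruction: */
    ret = add_exception_handler(insn, REG_DONT_CLEAR_MARKER, ctx);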
Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20250719091730.2660197-9-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit.h | 2 + arch/riscv/net/bpf_jit_comp64.c | 84 +++++++++++++++------------------ 2 files changed, 39 insertions(+), 47 deletions(-) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index 0790f40b7e9dbf..be2915444ce509 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -78,6 +78,8 @@ struct rv_jit_context { int epilogue_offset; int *offset; /* BPF to RV */ int nexentries; + int ex_insn_off; + int ex_jmp_off; unsigned long flags; int stack_size; u64 arena_vm_start; diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 8e813809d3054e..56b592af53a64d 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -511,57 +511,54 @@ static void emit_stx_insn(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context } } -static int emit_ldx(u8 rd, s16 off, u8 rs, u8 size, bool sign_ext, +static void emit_ldx(u8 rd, s16 off, u8 rs, u8 size, bool sign_ext, struct rv_jit_context *ctx) { - int insns_start; - if (is_12b_int(off)) { - insns_start = ctx->ninsns; + ctx->ex_insn_off = ctx->ninsns; emit_ldx_insn(rd, off, rs, size, sign_ext, ctx); - return ctx->ninsns - insns_start; + ctx->ex_jmp_off = ctx->ninsns; + return; } emit_imm(RV_REG_T1, off, ctx); emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; + ctx->ex_insn_off = ctx->ninsns; emit_ldx_insn(rd, 0, RV_REG_T1, size, sign_ext, ctx); - return ctx->ninsns - insns_start; + ctx->ex_jmp_off = ctx->ninsns; } -static int emit_st(u8 rd, s16 off, s32 imm, u8 size, struct rv_jit_context *ctx) +static void emit_st(u8 rd, s16 off, s32 imm, u8 size, struct rv_jit_context *ctx) { - int insns_start; - emit_imm(RV_REG_T1, imm, ctx); if (is_12b_int(off)) { - insns_start = ctx->ninsns; + ctx->ex_insn_off = ctx->ninsns; emit_stx_insn(rd, off, RV_REG_T1, size, ctx); - return ctx->ninsns - insns_start; + ctx->ex_jmp_off = ctx->ninsns; + return; } emit_imm(RV_REG_T2, off, ctx); emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; + ctx->ex_insn_off = ctx->ninsns; emit_stx_insn(RV_REG_T2, 0, RV_REG_T1, size, ctx); - return ctx->ninsns - insns_start; + ctx->ex_jmp_off = ctx->ninsns; } -static int emit_stx(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context *ctx) +static void emit_stx(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context *ctx) { - int insns_start; - if (is_12b_int(off)) { - insns_start = ctx->ninsns; + ctx->ex_insn_off = ctx->ninsns; emit_stx_insn(rd, off, rs, size, ctx); - return ctx->ninsns - insns_start; + ctx->ex_jmp_off = ctx->ninsns; + return; } emit_imm(RV_REG_T1, off, ctx); emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; + ctx->ex_insn_off = ctx->ninsns; emit_stx_insn(RV_REG_T1, 0, rs, size, ctx); - return ctx->ninsns - insns_start; + ctx->ex_jmp_off = ctx->ninsns; } static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, @@ -700,9 +697,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, } /* For accesses to BTF pointers, add an entry to the exception table */ -static int add_exception_handler(const struct bpf_insn *insn, - struct rv_jit_context *ctx, - int dst_reg, int insn_len) +static int add_exception_handler(const struct bpf_insn *insn, int dst_reg, + struct rv_jit_context *ctx) { struct exception_table_entry *ex; unsigned long pc; @@ 
-710,21 +706,22 @@ static int add_exception_handler(const struct bpf_insn *insn, off_t fixup_offset; if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable || - (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX && - BPF_MODE(insn->code) != BPF_PROBE_MEM32)) + ctx->ex_insn_off <= 0 || ctx->ex_jmp_off <= 0) return 0; - if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries)) - return -EINVAL; + if (BPF_MODE(insn->code) != BPF_PROBE_MEM && + BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32) + return 0; - if (WARN_ON_ONCE(insn_len > ctx->ninsns)) + if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries)) return -EINVAL; - if (WARN_ON_ONCE(!rvc_enabled() && insn_len == 1)) + if (WARN_ON_ONCE(ctx->ex_insn_off > ctx->ninsns || ctx->ex_jmp_off > ctx->ninsns)) return -EINVAL; ex = &ctx->prog->aux->extable[ctx->nexentries]; - pc = (unsigned long)&ctx->ro_insns[ctx->ninsns - insn_len]; + pc = (unsigned long)&ctx->ro_insns[ctx->ex_insn_off]; /* * This is the relative offset of the instruction that may fault from @@ -748,7 +745,7 @@ static int add_exception_handler(const struct bpf_insn *insn, * that may fault. The execution will jump to this after handling the * fault. */ - fixup_offset = (long)&ex->fixup - (pc + insn_len * sizeof(u16)); + fixup_offset = (long)&ex->fixup - (long)&ctx->ro_insns[ctx->ex_jmp_off]; if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset)) return -ERANGE; @@ -765,6 +762,8 @@ static int add_exception_handler(const struct bpf_insn *insn, FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); ex->type = EX_TYPE_BPF; + ctx->ex_insn_off = 0; + ctx->ex_jmp_off = 0; ctx->nexentries++; return 0; } @@ -1774,7 +1773,6 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: { bool sign_ext; - int insn_len; sign_ext = BPF_MODE(insn->code) == BPF_MEMSX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX; @@ -1784,9 +1782,9 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, rs = RV_REG_T2; } - insn_len = emit_ldx(rd, off, rs, BPF_SIZE(code), sign_ext, ctx); + emit_ldx(rd, off, rs, BPF_SIZE(code), sign_ext, ctx); - ret = add_exception_handler(insn, ctx, rd, insn_len); + ret = add_exception_handler(insn, rd, ctx); if (ret) return ret; @@ -1809,21 +1807,17 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_ST | BPF_PROBE_MEM32 | BPF_H: case BPF_ST | BPF_PROBE_MEM32 | BPF_W: case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: - { - int insn_len; - if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { emit_add(RV_REG_T3, rd, RV_REG_ARENA, ctx); rd = RV_REG_T3; } - insn_len = emit_st(rd, off, imm, BPF_SIZE(code), ctx); + emit_st(rd, off, imm, BPF_SIZE(code), ctx); - ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, insn_len); + ret = add_exception_handler(insn, REG_DONT_CLEAR_MARKER, ctx); if (ret) return ret; break; - } /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_B: @@ -1835,21 +1829,17 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_STX | BPF_PROBE_MEM32 | BPF_H: case BPF_STX | BPF_PROBE_MEM32 | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: - { - int insn_len; - if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); rd = RV_REG_T2; } - insn_len = emit_stx(rd, off, rs, BPF_SIZE(code), ctx); + emit_stx(rd, off, rs, BPF_SIZE(code), ctx); - ret = add_exception_handler(insn, ctx, 
REG_DONT_CLEAR_MARKER, insn_len); + ret = add_exception_handler(insn, REG_DONT_CLEAR_MARKER, ctx); if (ret) return ret; break; - } case BPF_STX | BPF_ATOMIC | BPF_B: case BPF_STX | BPF_ATOMIC | BPF_H: From fb7cefabae8117c203155ef169a386bec43bbba9 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 19 Jul 2025 09:17:29 +0000 Subject: [PATCH 0154/3206] riscv, bpf: Add arena atomics support for RV64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add arena atomics support for RMW atomics and for load-acquire and store-release instructions. Non-Zacas cmpxchg would need to be implemented via a loop, which is not currently supported because it requires more complex extable and loop logic. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20250719091730.2660197-10-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit.h | 2 ++ arch/riscv/net/bpf_jit_comp64.c | 60 +++++++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index be2915444ce509..632ced07bca442 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -1301,8 +1301,10 @@ static inline void emit_cmpxchg(u8 rd, u8 rs, u8 r0, bool is64, struct rv_jit_co int jmp_offset; if (rv_ext_enabled(ZACAS)) { + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rvzacas_amocas_d(r0, rs, rd, 1, 1) : rvzacas_amocas_w(r0, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(r0, r0, ctx); return; diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 56b592af53a64d..549c3063c7f115 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -571,6 +571,11 @@ static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, switch (imm) { /* dst_reg = load_acquire(src_reg + off16) */ case BPF_LOAD_ACQ: + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + emit_add(RV_REG_T2, rs, RV_REG_ARENA, ctx); + rs = RV_REG_T2; + } + emit_ldx(rd, off, rs, BPF_SIZE(code), false, ctx); emit_fence_r_rw(ctx); @@ -582,6 +587,11 @@ static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, break; /* store_release(dst_reg + off16, src_reg) */ case BPF_STORE_REL: + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T2; + } + emit_fence_rw_w(ctx); emit_stx(rd, off, rs, BPF_SIZE(code), ctx); break; @@ -599,13 +609,12 @@ static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, u8 code = insn->code; s16 off = insn->off; s32 imm = insn->imm; - bool is64; + bool is64 = BPF_SIZE(code) == BPF_DW; if (BPF_SIZE(code) != BPF_W && BPF_SIZE(code) != BPF_DW) { pr_err_once("bpf-jit: 1- and 2-byte RMW atomics are not supported\n"); return -EINVAL; } - is64 = BPF_SIZE(code) == BPF_DW; if (off) { if (is_12b_int(off)) { @@ -617,53 +626,76 @@ static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, rd = RV_REG_T1; } + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + emit_add(RV_REG_T1, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T1; + } + switch (imm) { /* lock *(u32/u64 *)(dst_reg + off16) = src_reg */ case BPF_ADD: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoadd_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoadd_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; case BPF_AND: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ?
rv_amoand_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoand_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; case BPF_OR: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoor_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoor_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; case BPF_XOR: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoxor_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoxor_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; /* src_reg = atomic_fetch_(dst_reg + off16, src_reg) */ case BPF_ADD | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoadd_d(rs, rs, rd, 1, 1) : rv_amoadd_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_AND | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoand_d(rs, rs, rd, 1, 1) : rv_amoand_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_OR | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoor_d(rs, rs, rd, 1, 1) : rv_amoor_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_XOR | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoxor_d(rs, rs, rd, 1, 1) : rv_amoxor_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; /* src_reg = atomic_xchg(dst_reg + off16, src_reg); */ case BPF_XCHG: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoswap_d(rs, rs, rd, 1, 1) : rv_amoswap_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; @@ -711,7 +743,8 @@ static int add_exception_handler(const struct bpf_insn *insn, int dst_reg, if (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX && - BPF_MODE(insn->code) != BPF_PROBE_MEM32) + BPF_MODE(insn->code) != BPF_PROBE_MEM32 && + BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries)) @@ -1841,14 +1874,21 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return ret; break; + /* Atomics */ case BPF_STX | BPF_ATOMIC | BPF_B: case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_B: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_H: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: if (bpf_atomic_is_load_store(insn)) ret = emit_atomic_ld_st(rd, rs, insn, ctx); else ret = emit_atomic_rmw(rd, rs, insn, ctx); + + ret = ret ?: add_exception_handler(insn, REG_DONT_CLEAR_MARKER, ctx); if (ret) return ret; break; @@ -1979,6 +2019,20 @@ bool bpf_jit_supports_arena(void) return true; } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (in_arena) { + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == BPF_CMPXCHG) + return rv_ext_enabled(ZACAS); + } + } + + return true; +} + bool bpf_jit_supports_percpu_insn(void) { return true; From dc0fe956144d9ba906bc6b2fe35b3cd54e1bfdb0 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 19 Jul 2025 09:17:30 +0000 Subject: [PATCH 0155/3206] selftests/bpf: Enable arena atomics tests for RV64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable arena atomics tests for RV64. 
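For reference, the operations these tests exercise boil down to compiler atomic builtins applied to arena memory; a minimal sketch in the style of arena_atomics.c (the __arena_global attribute, section name and variable name are assumed from that file, details abridged):

  __u64 __arena_global add64_value = 1;

  SEC("raw_tp/sys_enter")
  int add(const void *ctx)
  {
          /* With the JIT change above, this arena access is emitted
           * as amoadd.d plus an exception-table entry on RV64. */
          __sync_fetch_and_add(&add64_value, 2);
          return 0;
  }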
Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20250719091730.2660197-11-pulehui@huaweicloud.com --- tools/testing/selftests/bpf/progs/arena_atomics.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c index a52feff9811264..d1841aac94a22f 100644 --- a/tools/testing/selftests/bpf/progs/arena_atomics.c +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -28,7 +28,8 @@ bool skip_all_tests = true; #if defined(ENABLE_ATOMICS_TESTS) && \ defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ - (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) bool skip_lacq_srel_tests __attribute((__section__(".data"))) = false; #else bool skip_lacq_srel_tests = true; @@ -314,7 +315,8 @@ int load_acquire(const void *ctx) { #if defined(ENABLE_ATOMICS_TESTS) && \ defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ - (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) #define LOAD_ACQUIRE_ARENA(SIZEOP, SIZE, SRC, DST) \ { asm volatile ( \ @@ -365,7 +367,8 @@ int store_release(const void *ctx) { #if defined(ENABLE_ATOMICS_TESTS) && \ defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ - (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) #define STORE_RELEASE_ARENA(SIZEOP, DST, VAL) \ { asm volatile ( \ From 2b986b9e917bc88f81aa1ed386af63b26c983f1d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 Aug 2025 20:24:37 +0200 Subject: [PATCH 0156/3206] bpf, cpumap: Disabling page_pool direct xdp_return needs larger scope When running an XDP bpf_prog on the remote CPU in the cpumap code, we must disable the direct return optimization that xdp_return can perform for mem_type page_pool. This optimization assumes code is still executing under RX-NAPI of the original receiving CPU, which isn't true on this remote CPU. The cpumap code already disabled this via the helpers xdp_set_return_frame_no_direct() and xdp_clear_return_frame_no_direct(), but the scope didn't include xdp_do_flush(). When doing XDP_REDIRECT towards e.g. devmap, this causes the function bq_xmit_all() to run with the direct return optimization enabled. This can lead to hard-to-find bugs. The issue only happens when bq_xmit_all() cannot ndo_xdp_xmit all frames and then frees them via xdp_return_frame_rx_napi(). Fix this by expanding the scope to include xdp_do_flush(). This was found by Dragos Tatulea.
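As the hunks below show, the no-direct window now brackets the entire remote-CPU run, including the flush:

  xdp_set_return_frame_no_direct();
  ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
  /* ... run skb programs ... */
  if (stats->redirect)
          xdp_do_flush();  /* may free frames, e.g. when bq_xmit_all() fails */
  xdp_clear_return_frame_no_direct();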
Fixes: 11941f8a8536 ("bpf: cpumap: Implement generic cpumap") Reported-by: Dragos Tatulea Reported-by: Chris Arges Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Tested-by: Chris Arges Link: https://patch.msgid.link/175519587755.3008742.1088294435150406835.stgit@firesoul --- kernel/bpf/cpumap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b2b7b8ec2c2a1c..c46360b278712b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, struct xdp_buff xdp; int i, nframes = 0; - xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; for (i = 0; i < n; i++) { @@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } - xdp_clear_return_frame_no_direct(); stats->pass += nframes; return nframes; @@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + xdp_set_return_frame_no_direct(); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); if (unlikely(ret->skb_n)) @@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (stats->redirect) xdp_do_flush(); + xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); From e4414b01c1cd9887bbde92f946c1ba94e40d6d64 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Aug 2025 22:06:55 +0200 Subject: [PATCH 0157/3206] bpf: Check that the helper function is valid in get_helper_proto The kernel test robot reported a verifier bug [1] where the helper func pointer could be NULL due to a disabled config option. As Alexei suggested, we can check for that in get_helper_proto() directly. Mark the tail_call helper's func pointer with BPF_PTR_POISON, because it is unused by design. [1] https://lore.kernel.org/oe-lkp/202507160818.68358831-lkp@intel.com Reported-by: kernel test robot Reported-by: syzbot+a9ed3d9132939852d0df@syzkaller.appspotmail.com Suggested-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Paul Chaignon Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250814200655.945632-1-jolsa@kernel.org Closes: https://lore.kernel.org/oe-lkp/202507160818.68358831-lkp@intel.com --- kernel/bpf/core.c | 5 ++++- kernel/bpf/verifier.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5d1650af899d04..f8ac77d08ca72b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3024,7 +3024,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { - .func = NULL, + /* func is unused for tail_call, we set it to pass the + * get_helper_proto check + */ + .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af690..c89e2b1bc644b7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11354,7 +11354,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return -EINVAL; *ptr = env->ops->get_func_proto(func_id, env->prog); - return *ptr ? 0 : -EINVAL; + return *ptr && (*ptr)->func ?
0 : -EINVAL; } static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, From 5d6d30eca4dd1c9e8515a8d4b13106205d5c0ec4 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 14 Aug 2025 18:31:37 -0700 Subject: [PATCH 0158/3206] x86/build: Remove cc-option for GCC retpoline flags The minimum supported version of GCC to build the x86 kernel was bumped to GCC 8.1 in a3e8fe814ad1 ("x86/build: Raise the minimum GCC version to 8.1"). '-mindirect-branch' and '-mindirect-branch-register' were first supported in GCC 8.1, so there is no need to call cc-option to inquire if it is supported. Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov (AMD) Link: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=da99fd4a3ca06b43b08ba8d96dab84e83ac90aa7 Link: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=d543c04b795f8af4ebe5b3d5f38156ef4e5734f1 Link: https://lore.kernel.org/20250814-x86-min-ver-cleanups-v1-1-ff7f19457523@kernel.org --- arch/x86/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1913d342969ba2..ed5657395d6aca 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -13,8 +13,8 @@ else endif ifdef CONFIG_CC_IS_GCC -RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) -RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register) +RETPOLINE_CFLAGS := -mindirect-branch=thunk-extern -mindirect-branch-register +RETPOLINE_VDSO_CFLAGS := -mindirect-branch=thunk-inline -mindirect-branch-register endif ifdef CONFIG_CC_IS_CLANG RETPOLINE_CFLAGS := -mretpoline-external-thunk From e8c4f6ee8eeed8e02800bed6afb9aa22fc3476a1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Aug 2025 12:38:59 +0200 Subject: [PATCH 0159/3206] perf: Remove redundant condition for AUX buffer size It is already checked whether the VMA size is the same as nr_pages * PAGE_SIZE, so later checking both: aux_size == vma_size && aux_size == nr_pages * PAGE_SIZE is redundant. Remove the vma_size check as nr_pages is what is actually used in the allocation function. That prepares for splitting out the buffer allocation into separate functions, so that only nr_pages needs to be handed in. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104018.424519320@infradead.org --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 8060c2857bb2b3..eea3a7d6c61d70 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7043,7 +7043,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) goto aux_unlock; - if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) + if (aux_size != nr_pages * PAGE_SIZE) goto aux_unlock; /* already mapped with a different size */ From 81e026ca47b386e4213c1beff069038a3ba8bb76 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Aug 2025 12:39:00 +0200 Subject: [PATCH 0160/3206] perf: Split out mlock limit handling To prepare for splitting the buffer allocation out into separate functions for the ring buffer and the AUX buffer, split out mlock limit handling into a helper function, which can be called from both. No functional change intended. 
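With the helper in place (full definition in the diff below), each caller is left with a single check:

  if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
          ret = -EPERM;
          goto unlock;
  }

The helper returns true when the request fits the limits or the caller has CAP_IPC_LOCK, and adjusts *user_extra/*extra to describe how the pages are charged against locked_vm and pinned_vm.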
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104018.541975109@infradead.org --- kernel/events/core.c | 75 ++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index eea3a7d6c61d70..f6299012ed734f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6927,17 +6927,49 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) return err; } +static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra) +{ + unsigned long user_locked, user_lock_limit, locked, lock_limit; + struct user_struct *user = current_user(); + + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + /* Increase the limit linearly with more CPUs */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm); + + /* + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit + */ + if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += *user_extra; + + if (user_locked > user_lock_limit) { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ + *extra = user_locked - user_lock_limit; + *user_extra -= *extra; + } + + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra; + + return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); +} + static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; - unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); + unsigned long vma_size, nr_pages; + long user_extra = 0, extra = 0; struct mutex *aux_mutex = NULL; struct perf_buffer *rb = NULL; - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra = 0, extra = 0; int ret, flags = 0; mapped_f mapped; @@ -7063,38 +7095,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } } - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm); - - /* - * sysctl_perf_event_mlock may have changed, so that - * user->locked_vm > user_lock_limit - */ - if (user_locked > user_lock_limit) - user_locked = user_lock_limit; - user_locked += user_extra; - - if (user_locked > user_lock_limit) { - /* - * charge locked_vm until it hits user_lock_limit; - * charge the rest from pinned_vm - */ - extra = user_locked - user_lock_limit; - user_extra -= extra; - } - - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; - - if ((locked > lock_limit) && perf_is_paranoid() && - !capable(CAP_IPC_LOCK)) { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { ret = -EPERM; goto unlock; } From 1ea3e3b0dadc06c5e6c1bdf5312e70ee861b1ba0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Aug 2025 12:39:01 +0200 Subject: [PATCH 0161/3206] perf: Split out VM accounting Similarly to the mlock limit calculation the VM accounting is required for both the ringbuffer and the AUX buffer allocations. To prepare for splitting them out into separate functions, move the accounting into a helper function. 
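perf_mmap_account() pairs with perf_mmap_calc_limits(): the limits are computed up front and the charge is applied only once the mapping can no longer fail, the pattern the rest of this series converges on (schematic):

  if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
          return -EPERM;
  /* ... allocate and attach the buffer ... */
  perf_mmap_account(vma, user_extra, extra);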
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104018.660347811@infradead.org --- kernel/events/core.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f6299012ed734f..f90847101ade97 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6962,10 +6962,17 @@ static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); } +static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra) +{ + struct user_struct *user = current_user(); + + atomic_long_add(user_extra, &user->locked_vm); + atomic64_add(extra, &vma->vm_mm->pinned_vm); +} + static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; - struct user_struct *user = current_user(); unsigned long vma_size, nr_pages; long user_extra = 0, extra = 0; struct mutex *aux_mutex = NULL; @@ -7136,9 +7143,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unlock: if (!ret) { - atomic_long_add(user_extra, &user->locked_vm); - atomic64_add(extra, &vma->vm_mm->pinned_vm); - + perf_mmap_account(vma, user_extra, extra); atomic_inc(&event->mmap_count); } else if (rb) { /* AUX allocation failed */ From 86a0a7c59845e7093c9c73a7115c9d86349499d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:02 +0200 Subject: [PATCH 0162/3206] perf: Move perf_mmap_calc_limits() into both rb and aux branches if (cond) { A; } else { B; } C; into if (cond) { A; C; } else { B; C; } Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104018.781244099@infradead.org --- kernel/events/core.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f90847101ade97..9f19c612c80f7d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7054,6 +7054,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ring_buffer_attach(event, NULL); } + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(!rb && event->rb); + + if (vma->vm_flags & VM_WRITE) + flags |= RING_BUFFER_WRITABLE; + } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -7100,17 +7110,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; goto unlock; } - } - if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - ret = -EPERM; - goto unlock; - } + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + ret = -EPERM; + goto unlock; + } - WARN_ON(!rb && event->rb); + WARN_ON(!rb && event->rb); - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; + if (vma->vm_flags & VM_WRITE) + flags |= RING_BUFFER_WRITABLE; + } if (!rb) { rb = rb_alloc(nr_pages, From 3821f258686691cf12bbfc636ab22fa2b049dc86 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:03 +0200 Subject: [PATCH 0163/3206] perf: Merge consecutive conditionals in perf_mmap() if (cond) { A; } else { B; } if (cond) { C; } else { D; } into: if (cond) { A; C; } else { B; D; } Notably the conditions are not identical in form, but are equivalent. 
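Concretely, the first conditional tests vma->vm_pgoff == 0 (ring-buffer vs. AUX mapping) while the second tested !rb; they are equivalent because rb is still NULL at that point exactly in the vm_pgoff == 0 case, every path that found an existing buffer having already jumped to unlock. The merged shape is:

  if (vma->vm_pgoff == 0) {      /* equivalently: if (!rb) */
          /* A + C: allocate and attach the ring buffer */
  } else {
          /* B + D: rb = event->rb, set up the AUX area */
  }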
Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104018.900078502@infradead.org --- kernel/events/core.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 9f19c612c80f7d..085f36f6113754 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7064,6 +7064,25 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, flags); + + if (!rb) { + ret = -ENOMEM; + goto unlock; + } + + atomic_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; + + ring_buffer_attach(event, rb); + + perf_event_update_time(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + ret = 0; } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -7120,29 +7139,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; - } - if (!rb) { - rb = rb_alloc(nr_pages, - event->attr.watermark ? event->attr.wakeup_watermark : 0, - event->cpu, flags); - - if (!rb) { - ret = -ENOMEM; - goto unlock; - } - - atomic_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); - rb->mmap_locked = extra; - - ring_buffer_attach(event, rb); - - perf_event_update_time(event); - perf_event_init_userpage(event); - perf_event_update_userpage(event); - ret = 0; - } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); if (!ret) { From 4118994b33bb628dd9aeb941c5af6f950f1dea90 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:04 +0200 Subject: [PATCH 0164/3206] perf: Move common code into both rb and aux branches if (cond) { A; } else { B; } C; into if (cond) { A; C; } else { B; C; } Notably C has a success branch and both A and B have two places for success. For A (rb case), duplicate the success case because later patches will result in them no longer being identical. For B (aux case), share using goto (cleaned up later). Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.016252852@infradead.org --- kernel/events/core.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 085f36f6113754..dfe09b0332739d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7043,6 +7043,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; /* We need the rb to map pages. 
*/ rb = event->rb; + perf_mmap_account(vma, user_extra, extra); + atomic_inc(&event->mmap_count); goto unlock; } @@ -7083,6 +7085,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) perf_event_init_userpage(event); perf_event_update_userpage(event); ret = 0; + + perf_mmap_account(vma, user_extra, extra); + atomic_inc(&event->mmap_count); } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -7127,11 +7132,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); ret = 0; - goto unlock; + goto aux_success; } if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { ret = -EPERM; + atomic_dec(&rb->mmap_count); goto unlock; } @@ -7142,20 +7148,19 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); - if (!ret) { - atomic_set(&rb->aux_mmap_count, 1); - rb->aux_mmap_locked = extra; + if (ret) { + atomic_dec(&rb->mmap_count); + goto unlock; } - } -unlock: - if (!ret) { + atomic_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; +aux_success: perf_mmap_account(vma, user_extra, extra); atomic_inc(&event->mmap_count); - } else if (rb) { - /* AUX allocation failed */ - atomic_dec(&rb->mmap_count); } + +unlock: aux_unlock: if (aux_mutex) mutex_unlock(aux_mutex); From 41b80e1d74bdef5e48ea63d186244b9f6f82a4da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:05 +0200 Subject: [PATCH 0165/3206] perf: Remove redundant aux_unlock label unlock and aux_unlock are now identical, remove the aux_unlock one. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.131293512@infradead.org --- kernel/events/core.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index dfe09b0332739d..89fb069913d075 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7098,7 +7098,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) rb = event->rb; if (!rb) - goto aux_unlock; + goto unlock; aux_mutex = &rb->aux_mutex; mutex_lock(aux_mutex); @@ -7107,27 +7107,27 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) - goto aux_unlock; + goto unlock; if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) - goto aux_unlock; + goto unlock; /* already mapped with a different offset */ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) - goto aux_unlock; + goto unlock; if (aux_size != nr_pages * PAGE_SIZE) - goto aux_unlock; + goto unlock; /* already mapped with a different size */ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) - goto aux_unlock; + goto unlock; if (!is_power_of_2(nr_pages)) - goto aux_unlock; + goto unlock; if (!atomic_inc_not_zero(&rb->mmap_count)) - goto aux_unlock; + goto unlock; if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); @@ -7161,7 +7161,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } unlock: -aux_unlock: if (aux_mutex) mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); From b33a51564e3eb6c468979f9f08d9b4ad8451bed7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:06 +0200 Subject: [PATCH 0166/3206] perf: Use guard() for aux_mutex in perf_mmap() After duplicating the common code into the rb/aux branches it is possible to
use a simple guard() for the aux_mutex. Making the aux branch self-contained. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.246250452@infradead.org --- kernel/events/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 89fb069913d075..236c60adde8839 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6975,7 +6975,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) struct perf_event *event = file->private_data; unsigned long vma_size, nr_pages; long user_extra = 0, extra = 0; - struct mutex *aux_mutex = NULL; struct perf_buffer *rb = NULL; int ret, flags = 0; mapped_f mapped; @@ -7100,8 +7099,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!rb) goto unlock; - aux_mutex = &rb->aux_mutex; - mutex_lock(aux_mutex); + guard(mutex)(&rb->aux_mutex); aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); @@ -7161,8 +7159,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } unlock: - if (aux_mutex) - mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); if (ret) From 8558dca9fbdf825edf30b5fb74fbbbf3e6ba5dce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:07 +0200 Subject: [PATCH 0167/3206] perf: Reflow to get rid of aux_success label Mostly re-indent noise needed to get rid of that label. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.362581570@infradead.org --- kernel/events/core.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 236c60adde8839..5bbea8127bb6c8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7130,30 +7130,29 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); ret = 0; - goto aux_success; - } - if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - ret = -EPERM; - atomic_dec(&rb->mmap_count); - goto unlock; - } + } else { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + ret = -EPERM; + atomic_dec(&rb->mmap_count); + goto unlock; + } - WARN_ON(!rb && event->rb); + WARN_ON(!rb && event->rb); - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; + if (vma->vm_flags & VM_WRITE) + flags |= RING_BUFFER_WRITABLE; - ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, - event->attr.aux_watermark, flags); - if (ret) { - atomic_dec(&rb->mmap_count); - goto unlock; - } + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, + event->attr.aux_watermark, flags); + if (ret) { + atomic_dec(&rb->mmap_count); + goto unlock; + } - atomic_set(&rb->aux_mmap_count, 1); - rb->aux_mmap_locked = extra; -aux_success: + atomic_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; + } perf_mmap_account(vma, user_extra, extra); atomic_inc(&event->mmap_count); } From 2aee37682391332d26c01e703170e0d9358c7252 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:08 +0200 Subject: [PATCH 0168/3206] perf: Split out the AUX buffer allocation Move the AUX buffer allocation branch into its own function. 
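With the branch in its own function, the goto unlock error paths become plain returns, for example:

  if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
          return -EINVAL;          /* was: goto unlock */

and the guard(mutex)(&rb->aux_mutex) retained from the earlier conversion drops the mutex on every such return.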
Originally-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.494205648@infradead.org --- kernel/events/core.c | 144 +++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5bbea8127bb6c8..e76afd9c17592a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6970,6 +6970,82 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long atomic64_add(extra, &vma->vm_mm->pinned_vm); } +static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + u64 aux_offset, aux_size; + struct perf_buffer *rb; + int ret, rb_flags = 0; + + rb = event->rb; + if (!rb) + return -EINVAL; + + guard(mutex)(&rb->aux_mutex); + + /* + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. + */ + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); + + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + return -EINVAL; + + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + return -EINVAL; + + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + return -EINVAL; + + if (aux_size != nr_pages * PAGE_SIZE) + return -EINVAL; + + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + return -EINVAL; + + if (!is_power_of_2(nr_pages)) + return -EINVAL; + + if (!atomic_inc_not_zero(&rb->mmap_count)) + return -EINVAL; + + if (rb_has_aux(rb)) { + atomic_inc(&rb->aux_mmap_count); + + } else { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + atomic_dec(&rb->mmap_count); + return -EPERM; + } + + WARN_ON(!rb && event->rb); + + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; + + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, + event->attr.aux_watermark, rb_flags); + if (ret) { + atomic_dec(&rb->mmap_count); + return ret; + } + + atomic_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; + } + + perf_mmap_account(vma, user_extra, extra); + atomic_inc(&event->mmap_count); + + return 0; +} + static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; @@ -7088,73 +7164,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) perf_mmap_account(vma, user_extra, extra); atomic_inc(&event->mmap_count); } else { - /* - * AUX area mapping: if rb->aux_nr_pages != 0, it's already - * mapped, all subsequent mappings should have the same size - * and offset. Must be above the normal perf buffer. 
- */ - u64 aux_offset, aux_size; - - rb = event->rb; - if (!rb) - goto unlock; - - guard(mutex)(&rb->aux_mutex); - - aux_offset = READ_ONCE(rb->user_page->aux_offset); - aux_size = READ_ONCE(rb->user_page->aux_size); - - if (aux_offset < perf_data_size(rb) + PAGE_SIZE) - goto unlock; - - if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) - goto unlock; - - /* already mapped with a different offset */ - if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) - goto unlock; - - if (aux_size != nr_pages * PAGE_SIZE) - goto unlock; - - /* already mapped with a different size */ - if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) - goto unlock; - - if (!is_power_of_2(nr_pages)) - goto unlock; - - if (!atomic_inc_not_zero(&rb->mmap_count)) - goto unlock; - - if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); - ret = 0; - - } else { - if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - ret = -EPERM; - atomic_dec(&rb->mmap_count); - goto unlock; - } - - WARN_ON(!rb && event->rb); - - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; - - ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, - event->attr.aux_watermark, flags); - if (ret) { - atomic_dec(&rb->mmap_count); - goto unlock; - } - - atomic_set(&rb->aux_mmap_count, 1); - rb->aux_mmap_locked = extra; - } - perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + ret = perf_mmap_aux(vma, event, nr_pages); } unlock: From 191759e5ea9f6995171ed2ffcc41a2377f946a3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:09 +0200 Subject: [PATCH 0169/3206] perf: Make RB allocation branch self sufficient Ensure @rb usage doesn't extend out of the branch block. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.605285302@infradead.org --- kernel/events/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index e76afd9c17592a..875c27b28e9b9a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7116,8 +7116,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * multiple times. */ ret = 0; - /* We need the rb to map pages. */ - rb = event->rb; perf_mmap_account(vma, user_extra, extra); atomic_inc(&event->mmap_count); goto unlock; @@ -7136,8 +7134,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(!rb && event->rb); - if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; @@ -7190,7 +7186,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * full cleanup in this case and therefore does not invoke * vmops::close(). */ - ret = map_range(rb, vma); + ret = map_range(event->rb, vma); if (ret) perf_mmap_close(vma); From 5d299897f1e36025400ca84fd36c15925a383b03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:10 +0200 Subject: [PATCH 0170/3206] perf: Split out the RB allocation Move the RB buffer allocation branch into its own function. 
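With both allocation paths split out (this patch and the AUX split before it), perf_mmap() reduces to a dispatch on the mapping offset, as the diff below shows:

  if (vma->vm_pgoff == 0)
          ret = perf_mmap_rb(vma, event, nr_pages);
  else
          ret = perf_mmap_aux(vma, event, nr_pages);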
Originally-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.722214699@infradead.org --- kernel/events/core.c | 145 ++++++++++++++++++++++--------------------- 1 file changed, 73 insertions(+), 72 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 875c27b28e9b9a..3a5fd2b802e4f7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6970,6 +6970,75 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long atomic64_add(extra, &vma->vm_mm->pinned_vm); } +static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + struct perf_buffer *rb; + int rb_flags = 0; + + nr_pages -= 1; + + /* + * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + return -EINVAL; + + if (atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Success -- managed to mmap() the same buffer + * multiple times. + */ + perf_mmap_account(vma, user_extra, extra); + atomic_inc(&event->mmap_count); + return 0; + } + + /* + * Raced against perf_mmap_close()'s + * atomic_dec_and_mutex_lock() remove the + * event and continue as if !event->rb + */ + ring_buffer_attach(event, NULL); + } + + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) + return -EPERM; + + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; + + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, rb_flags); + + if (!rb) + return -ENOMEM; + + atomic_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; + + ring_buffer_attach(event, rb); + + perf_event_update_time(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + + perf_mmap_account(vma, user_extra, extra); + atomic_inc(&event->mmap_count); + + return 0; +} + static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, unsigned long nr_pages) { @@ -7050,10 +7119,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; unsigned long vma_size, nr_pages; - long user_extra = 0, extra = 0; - struct perf_buffer *rb = NULL; - int ret, flags = 0; mapped_f mapped; + int ret; /* * Don't allow mmap() of inherited per-task counters. This would @@ -7079,8 +7146,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_size != PAGE_SIZE * nr_pages) return -EINVAL; - user_extra = nr_pages; - mutex_lock(&event->mmap_mutex); ret = -EINVAL; @@ -7094,74 +7159,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - if (vma->vm_pgoff == 0) { - nr_pages -= 1; - - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - goto unlock; - - WARN_ON_ONCE(event->ctx->parent_ctx); - - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) - goto unlock; - - if (atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Success -- managed to mmap() the same buffer - * multiple times. 
- */ - ret = 0; - perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); - goto unlock; - } - - /* - * Raced against perf_mmap_close()'s - * atomic_dec_and_mutex_lock() remove the - * event and continue as if !event->rb - */ - ring_buffer_attach(event, NULL); - } - - if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - ret = -EPERM; - goto unlock; - } - - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; - - rb = rb_alloc(nr_pages, - event->attr.watermark ? event->attr.wakeup_watermark : 0, - event->cpu, flags); - - if (!rb) { - ret = -ENOMEM; - goto unlock; - } - - atomic_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); - rb->mmap_locked = extra; - - ring_buffer_attach(event, rb); - - perf_event_update_time(event); - perf_event_init_userpage(event); - perf_event_update_userpage(event); - ret = 0; - - perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); - } else { + if (vma->vm_pgoff == 0) + ret = perf_mmap_rb(vma, event, nr_pages); + else ret = perf_mmap_aux(vma, event, nr_pages); - } unlock: mutex_unlock(&event->mmap_mutex); From d23a6dbc0a71741eb7b141fdc04e31360fba46ef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:11 +0200 Subject: [PATCH 0171/3206] perf: Use scoped_guard() for mmap_mutex in perf_mmap() Mostly just re-indent noise. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.838047976@infradead.org --- kernel/events/core.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 3a5fd2b802e4f7..41941dfadfcbc7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7146,30 +7146,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_size != PAGE_SIZE * nr_pages) return -EINVAL; - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; + scoped_guard (mutex, &event->mmap_mutex) { + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. + */ + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; - /* - * This relies on __pmu_detach_event() taking mmap_mutex after marking - * the event REVOKED. Either we observe the state, or __pmu_detach_event() - * will detach the rb created here. - */ - if (event->state <= PERF_EVENT_STATE_REVOKED) { - ret = -ENODEV; - goto unlock; + if (vma->vm_pgoff == 0) + ret = perf_mmap_rb(vma, event, nr_pages); + else + ret = perf_mmap_aux(vma, event, nr_pages); + if (ret) + return ret; } - if (vma->vm_pgoff == 0) - ret = perf_mmap_rb(vma, event, nr_pages); - else - ret = perf_mmap_aux(vma, event, nr_pages); - -unlock: - mutex_unlock(&event->mmap_mutex); - - if (ret) - return ret; - /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. From 59741451b49ce9964a9758c19d6f7df2a1255c75 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:12 +0200 Subject: [PATCH 0172/3206] perf: Identify the 0->1 transition for event::mmap_count Needed because refcount_inc() doesn't allow the 0->1 transition. Specifically, this is the case where we've just created the RB: there was no RB before, and as such there could not have been an mmap. Additionally, we hold mmap_mutex to serialize everything, so this must be the first mmap.
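This distinction is what the refcount_t conversion in the next patch relies on; the two cases become different operations:

  /* first mapping of a freshly allocated rb: the 0->1 transition */
  refcount_set(&event->mmap_count, 1);

  /* additional mapping of an existing rb */
  refcount_inc(&event->mmap_count);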
Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250812104019.956479989@infradead.org --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 41941dfadfcbc7..f6211ab1850364 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7034,7 +7034,7 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, perf_event_update_userpage(event); perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + atomic_set(&event->mmap_count, 1); return 0; } From 448f97fba9013ffa13f5dd82febd18836b189499 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Aug 2025 12:39:13 +0200 Subject: [PATCH 0173/3206] perf: Convert mmap() refcounts to refcount_t The recently fixed reference count leaks could have been detected by using refcount_t and refcount_t would have mitigated the potential overflow at least. Now that the code is properly structured, convert the mmap() related mmap_count variants over to refcount_t. No functional change intended. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104020.071507932@infradead.org --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 40 ++++++++++++++++++------------------- kernel/events/internal.h | 4 ++-- kernel/events/ring_buffer.c | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ec9d9602568395..bfbf9ea53f259f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,7 +859,7 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; - atomic_t mmap_count; + refcount_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; diff --git a/kernel/events/core.c b/kernel/events/core.c index f6211ab1850364..ea357044d78062 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3968,7 +3968,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, */ static inline bool event_update_userpage(struct perf_event *event) { - if (likely(!atomic_read(&event->mmap_count))) + if (likely(!refcount_read(&event->mmap_count))) return false; perf_event_update_time(event); @@ -6704,11 +6704,11 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; mapped_f mapped = get_mapped(event, event_mapped); - atomic_inc(&event->mmap_count); - atomic_inc(&event->rb->mmap_count); + refcount_inc(&event->mmap_count); + refcount_inc(&event->rb->mmap_count); if (vma->vm_pgoff) - atomic_inc(&event->rb->aux_mmap_count); + refcount_inc(&event->rb->aux_mmap_count); if (mapped) mapped(event, vma->vm_mm); @@ -6743,7 +6743,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) * to avoid complications. 
*/ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { + refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -6763,10 +6763,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&rb->aux_mutex); } - if (atomic_dec_and_test(&rb->mmap_count)) + if (refcount_dec_and_test(&rb->mmap_count)) detach_rest = true; - if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); @@ -6992,19 +6992,19 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (data_page_nr(event->rb) != nr_pages) return -EINVAL; - if (atomic_inc_not_zero(&event->rb->mmap_count)) { + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* * Success -- managed to mmap() the same buffer * multiple times. */ perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + refcount_inc(&event->mmap_count); return 0; } /* * Raced against perf_mmap_close()'s - * atomic_dec_and_mutex_lock() remove the + * refcount_dec_and_mutex_lock() remove the * event and continue as if !event->rb */ ring_buffer_attach(event, NULL); @@ -7023,7 +7023,7 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (!rb) return -ENOMEM; - atomic_set(&rb->mmap_count, 1); + refcount_set(&rb->mmap_count, 1); rb->mmap_user = get_current_user(); rb->mmap_locked = extra; @@ -7034,7 +7034,7 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, perf_event_update_userpage(event); perf_mmap_account(vma, user_extra, extra); - atomic_set(&event->mmap_count, 1); + refcount_set(&event->mmap_count, 1); return 0; } @@ -7081,15 +7081,15 @@ static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, if (!is_power_of_2(nr_pages)) return -EINVAL; - if (!atomic_inc_not_zero(&rb->mmap_count)) + if (!refcount_inc_not_zero(&rb->mmap_count)) return -EINVAL; if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); + refcount_inc(&rb->aux_mmap_count); } else { if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - atomic_dec(&rb->mmap_count); + refcount_dec(&rb->mmap_count); return -EPERM; } @@ -7101,16 +7101,16 @@ static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, rb_flags); if (ret) { - atomic_dec(&rb->mmap_count); + refcount_dec(&rb->mmap_count); return ret; } - atomic_set(&rb->aux_mmap_count, 1); + refcount_set(&rb->aux_mmap_count, 1); rb->aux_mmap_locked = extra; } perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + refcount_inc(&event->mmap_count); return 0; } @@ -13254,7 +13254,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) + if (refcount_read(&event->mmap_count)) goto unlock; if (output_event) { @@ -13267,7 +13267,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto unlock; /* did we race against perf_mmap_close() */ - if (!atomic_read(&rb->mmap_count)) { + if (!refcount_read(&rb->mmap_count)) { ring_buffer_put(rb); goto 
unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 249288d82b8dcf..d9cc5708309184 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,7 +35,7 @@ struct perf_buffer { spinlock_t event_lock; struct list_head event_list; - atomic_t mmap_count; + refcount_t mmap_count; unsigned long mmap_locked; struct user_struct *mmap_user; @@ -47,7 +47,7 @@ struct perf_buffer { unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; - atomic_t aux_mmap_count; + refcount_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index aa9a759e824fee..20a90502373620 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). */ - if (!atomic_read(&rb->aux_mmap_count)) + if (!refcount_read(&rb->aux_mmap_count)) goto err; if (!refcount_inc_not_zero(&rb->aux_refcount)) From 15769d9478bdb3e09184c5adbab6e2d2dfcbf1b1 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sun, 3 Aug 2025 18:22:42 +0800 Subject: [PATCH 0174/3206] fs-writeback: Remove redundant __GFP_NOWARN GFP_NOWAIT already includes __GFP_NOWARN, so let's remove the redundant __GFP_NOWARN. Signed-off-by: Qianfeng Rong Link: https://lore.kernel.org/20250803102243.623705-5-rongqianfeng@vivo.com Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641d7..a3ee79bfb2a403 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1123,7 +1123,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, dirty = dirty * 10 / 8; /* issue the writeback work */ - work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); + work = kzalloc(sizeof(*work), GFP_NOWAIT); if (work) { work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; From f8f59a2c05dc16d19432e3154a9ac7bc385f4b92 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 13 Aug 2025 17:11:05 +0200 Subject: [PATCH 0175/3206] copy_file_range: limit size if in compat mode If the process runs in 32-bit compat mode, copy_file_range results can be in the in-band error range. In this case limit copy length to MAX_RW_COUNT to prevent a signed overflow. Reported-by: Florian Weimer Closes: https://lore.kernel.org/all/lhuh5ynl8z5.fsf@oldenburg.str.redhat.com/ Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/20250813151107.99856-1-mszeredi@redhat.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/read_write.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index c5b6265d984bae..833bae068770a4 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1576,6 +1576,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (len == 0) return 0; + /* + * Make sure return value doesn't overflow in 32bit compat mode. Also + * limit the size for all cases except when calling ->copy_file_range(). 
+ */ + if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) + len = min_t(size_t, MAX_RW_COUNT, len); + file_start_write(file_out); /* @@ -1589,9 +1596,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, - min_t(loff_t, MAX_RW_COUNT, len), - REMAP_FILE_CAN_SHORTEN); + file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; @@ -1624,8 +1629,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * to splicing from input file, while file_start_write() is held on * the output file on a different sb. */ - ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, - min_t(size_t, len, MAX_RW_COUNT), 0); + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); done: if (ret > 0) { fsnotify_access(file_in); From 8d3e8057eeadab134a71d6a495d6f1b6818144fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Onur=20=C3=96zkan?= Date: Sun, 20 Jul 2025 12:48:37 +0300 Subject: [PATCH 0176/3206] rust: make `ArrayLayout::new_unchecked` a `const fn` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes `ArrayLayout::new_unchecked` a `const fn` to allow compile-time evaluation. Signed-off-by: Onur Özkan Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20250720094838.29530-3-work@onurozkan.dev Signed-off-by: Danilo Krummrich --- rust/kernel/alloc/layout.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/alloc/layout.rs b/rust/kernel/alloc/layout.rs index 93ed514f7cc7ed..52cbf61c4539ef 100644 --- a/rust/kernel/alloc/layout.rs +++ b/rust/kernel/alloc/layout.rs @@ -80,7 +80,7 @@ impl ArrayLayout { /// # Safety /// /// `len` must be a value, for which `len * size_of::() <= isize::MAX` is true. - pub unsafe fn new_unchecked(len: usize) -> Self { + pub const unsafe fn new_unchecked(len: usize) -> Self { // INVARIANT: By the safety requirements of this function // `len * size_of::() <= isize::MAX`. Self { From f87de6919dc126f22c7b5bf92e378f0346f190a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Onur=20=C3=96zkan?= Date: Sun, 20 Jul 2025 12:48:38 +0300 Subject: [PATCH 0177/3206] rust: make `kvec::Vec` functions `const fn` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes various `kvec::Vec` functions `const fn` to allow compile-time evaluation. Signed-off-by: Onur Özkan Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20250720094838.29530-4-work@onurozkan.dev Signed-off-by: Danilo Krummrich --- rust/kernel/alloc/kvec.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rust/kernel/alloc/kvec.rs b/rust/kernel/alloc/kvec.rs index 7316f48aaa90ab..d42dbdc44f0fd5 100644 --- a/rust/kernel/alloc/kvec.rs +++ b/rust/kernel/alloc/kvec.rs @@ -175,7 +175,7 @@ where /// Returns the number of elements that can be stored within the vector without allocating /// additional memory. - pub fn capacity(&self) -> usize { + pub const fn capacity(&self) -> usize { if const { Self::is_zst() } { usize::MAX } else { @@ -185,7 +185,7 @@ where /// Returns the number of elements stored within the vector. 
#[inline] - pub fn len(&self) -> usize { + pub const fn len(&self) -> usize { self.len } @@ -196,7 +196,7 @@ where /// - `additional` must be less than or equal to `self.capacity - self.len`. /// - All elements within the interval [`self.len`,`self.len + additional`) must be initialized. #[inline] - pub unsafe fn inc_len(&mut self, additional: usize) { + pub const unsafe fn inc_len(&mut self, additional: usize) { // Guaranteed by the type invariant to never underflow. debug_assert!(additional <= self.capacity() - self.len()); // INVARIANT: By the safety requirements of this method this represents the exact number of @@ -255,7 +255,7 @@ where /// Returns a raw pointer to the vector's backing buffer, or, if `T` is a ZST, a dangling raw /// pointer. #[inline] - pub fn as_ptr(&self) -> *const T { + pub const fn as_ptr(&self) -> *const T { self.ptr.as_ptr() } @@ -271,7 +271,7 @@ where /// assert!(!v.is_empty()); /// ``` #[inline] - pub fn is_empty(&self) -> bool { + pub const fn is_empty(&self) -> bool { self.len() == 0 } From 4005dac6573128d6e7b46c4ed43df1b23dab7765 Mon Sep 17 00:00:00 2001 From: Abhinav Ananthu Date: Tue, 12 Aug 2025 13:21:10 +0530 Subject: [PATCH 0178/3206] rust: auxiliary: Use `c_` types from prelude instead of Update auxiliary FFI callback signatures to reference the `c_` types provided by the kernel prelude, rather than accessing them via `kernel::ffi::`. Signed-off-by: Abhinav Ananthu Link: https://lore.kernel.org/r/20250812075109.4099-1-abhinav.ogl@gmail.com Signed-off-by: Danilo Krummrich --- rust/kernel/auxiliary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/auxiliary.rs b/rust/kernel/auxiliary.rs index 4749fb6bffef34..f73c460665ec19 100644 --- a/rust/kernel/auxiliary.rs +++ b/rust/kernel/auxiliary.rs @@ -55,7 +55,7 @@ impl Adapter { extern "C" fn probe_callback( adev: *mut bindings::auxiliary_device, id: *const bindings::auxiliary_device_id, - ) -> kernel::ffi::c_int { + ) -> c_int { // SAFETY: The auxiliary bus only ever calls the probe callback with a valid pointer to a // `struct auxiliary_device`. // From b0d73ad126957df989c26d78bd4747a270244dd0 Mon Sep 17 00:00:00 2001 From: Abhinav Ananthu Date: Tue, 12 Aug 2025 09:01:02 +0530 Subject: [PATCH 0179/3206] rust: pci: use c_* types via kernel prelude Update PCI FFI callback signatures to use `c_` from the prelude, instead of accessing it via `kernel::ffi::`. Signed-off-by: Abhinav Ananthu Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20250812033101.5257-1-abhinav.ogl@gmail.com Signed-off-by: Danilo Krummrich --- rust/kernel/pci.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 3bf1737635a9e2..71d25dbcb392bb 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -60,7 +60,7 @@ impl Adapter { extern "C" fn probe_callback( pdev: *mut bindings::pci_dev, id: *const bindings::pci_device_id, - ) -> kernel::ffi::c_int { + ) -> c_int { // SAFETY: The PCI bus only ever calls the probe callback with a valid pointer to a // `struct pci_dev`. // @@ -347,7 +347,7 @@ impl Bar { // `ioptr` is valid by the safety requirements. // `num` is valid by the safety requirements. 
unsafe { - bindings::pci_iounmap(pdev.as_raw(), ioptr as *mut kernel::ffi::c_void); + bindings::pci_iounmap(pdev.as_raw(), ioptr as *mut c_void); bindings::pci_release_region(pdev.as_raw(), num); } } From cd58b0b11d2160b174f82b71eb3847457aedaeca Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 30 Jul 2025 11:34:16 +1000 Subject: [PATCH 0180/3206] rust: Update PCI binding safety comments and add inline compiler hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the safety comments to be consistent with other safety comments in the PCI bindings. Also add an inline compiler hint. Suggested-by: Danilo Krummrich Cc: Danilo Krummrich Cc: Bjorn Helgaas Cc: Krzysztof Wilczyński Cc: Miguel Ojeda Cc: Alex Gaynor Cc: Boqun Feng Cc: Gary Guo Cc: Björn Roy Baron Cc: Benno Lossin Cc: Andreas Hindborg Cc: Alice Ryhl Cc: Trevor Gross Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: John Hubbard Cc: Alexandre Courbot Cc: linux-pci@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Alistair Popple Link: https://lore.kernel.org/r/20250730013417.640593-1-apopple@nvidia.com Signed-off-by: Danilo Krummrich --- rust/kernel/pci.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 71d25dbcb392bb..5fa74d62773041 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -388,14 +388,18 @@ impl Device { impl Device { /// Returns the PCI vendor ID. + #[inline] pub fn vendor_id(&self) -> u16 { - // SAFETY: `self.as_raw` is a valid pointer to a `struct pci_dev`. + // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a + // `struct pci_dev`. unsafe { (*self.as_raw()).vendor } } /// Returns the PCI device ID. + #[inline] pub fn device_id(&self) -> u16 { - // SAFETY: `self.as_raw` is a valid pointer to a `struct pci_dev`. + // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a + // `struct pci_dev`. unsafe { (*self.as_raw()).device } } From b6a37d1d4694111895248b771513153ccace606a Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 30 Jul 2025 11:34:17 +1000 Subject: [PATCH 0181/3206] rust: Add several miscellaneous PCI helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add bindings to obtain a PCI device's resource start address, bus/ device function, revision ID and subsystem device and vendor IDs. These will be used by the nova-core GPU driver which is currently in development. Reviewed-by: Alexandre Courbot Cc: Danilo Krummrich Cc: Bjorn Helgaas Cc: Krzysztof Wilczyński Cc: Miguel Ojeda Cc: Alex Gaynor Cc: Boqun Feng Cc: Gary Guo Cc: Björn Roy Baron Cc: Benno Lossin Cc: Andreas Hindborg Cc: Alice Ryhl Cc: Trevor Gross Cc: Greg Kroah-Hartman Cc: Rafael J. 
Wysocki
Cc: John Hubbard
Cc: Alexandre Courbot
Cc: linux-pci@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Alistair Popple
Link: https://lore.kernel.org/r/20250730013417.640593-2-apopple@nvidia.com
Signed-off-by: Danilo Krummrich
---
 rust/helpers/pci.c | 10 ++++++++++
 rust/kernel/pci.rs | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/rust/helpers/pci.c b/rust/helpers/pci.c
index 5bf56004478c69..fb814572b23631 100644
--- a/rust/helpers/pci.c
+++ b/rust/helpers/pci.c
@@ -2,6 +2,16 @@

 #include <linux/pci.h>

+u16 rust_helper_pci_dev_id(struct pci_dev *dev)
+{
+	return PCI_DEVID(dev->bus->number, dev->devfn);
+}
+
+resource_size_t rust_helper_pci_resource_start(struct pci_dev *pdev, int bar)
+{
+	return pci_resource_start(pdev, bar);
+}
+
 resource_size_t rust_helper_pci_resource_len(struct pci_dev *pdev, int bar)
 {
 	return pci_resource_len(pdev, bar);

diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs
index 5fa74d62773041..08fb69e5eb9751 100644
--- a/rust/kernel/pci.rs
+++ b/rust/kernel/pci.rs
@@ -403,6 +403,50 @@ impl Device {
         unsafe { (*self.as_raw()).device }
     }

+    /// Returns the PCI revision ID.
+    #[inline]
+    pub fn revision_id(&self) -> u8 {
+        // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a
+        // `struct pci_dev`.
+        unsafe { (*self.as_raw()).revision }
+    }
+
+    /// Returns the PCI bus device/function.
+    #[inline]
+    pub fn dev_id(&self) -> u16 {
+        // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a
+        // `struct pci_dev`.
+        unsafe { bindings::pci_dev_id(self.as_raw()) }
+    }
+
+    /// Returns the PCI subsystem vendor ID.
+    #[inline]
+    pub fn subsystem_vendor_id(&self) -> u16 {
+        // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a
+        // `struct pci_dev`.
+        unsafe { (*self.as_raw()).subsystem_vendor }
+    }
+
+    /// Returns the PCI subsystem device ID.
+    #[inline]
+    pub fn subsystem_device_id(&self) -> u16 {
+        // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a
+        // `struct pci_dev`.
+        unsafe { (*self.as_raw()).subsystem_device }
+    }
+
+    /// Returns the start of the given PCI bar resource.
+    pub fn resource_start(&self, bar: u32) -> Result<bindings::resource_size_t> {
+        if !Bar::index_is_valid(bar) {
+            return Err(EINVAL);
+        }
+
+        // SAFETY:
+        // - `bar` is a valid bar number, as guaranteed by the above call to `Bar::index_is_valid`,
+        // - by its type invariant `self.as_raw` is always a valid pointer to a `struct pci_dev`.
+        Ok(unsafe { bindings::pci_resource_start(self.as_raw(), bar.try_into()?) })
+    }
+
     /// Returns the size of the given PCI bar resource.
     pub fn resource_len(&self, bar: u32) -> Result<bindings::resource_size_t> {
         if !Bar::index_is_valid(bar) {

From 1b1a946dc2b535785663742f9e4f15fd64bece60 Mon Sep 17 00:00:00 2001
From: Alice Ryhl
Date: Mon, 11 Aug 2025 12:31:50 +0000
Subject: [PATCH 0182/3206] rust: alloc: specify the minimum alignment of each allocator

The kernel's allocators sometimes provide a higher alignment than the
end-user requested, so add a new constant on the Allocator trait to let
the allocator specify what its minimum guaranteed alignment is.

This allows the ForeignOwnable trait to provide a more accurate value
of FOREIGN_ALIGN when using a pointer type such as Box, which will be
useful with certain collections such as XArray that store their own
data in the low bits of pointers.

Reviewed-by: Benno Lossin
Signed-off-by: Alice Ryhl
Acked-by: Liam R. Howlett
Reviewed-by: Andreas Hindborg
Link: https://lore.kernel.org/r/20250811-align-min-allocator-v2-1-3386cc94f4fc@google.com
[ Add helper for ARCH_KMALLOC_MINALIGN; remove cast to usize. - Danilo ]
Signed-off-by: Danilo Krummrich
---
 rust/bindings/bindings_helper.h | 1 +
 rust/kernel/alloc.rs | 8 ++++++++
 rust/kernel/alloc/allocator.rs | 8 ++++++++
 3 files changed, 17 insertions(+)

diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index 84d60635e8a9ba..4ad9add117ea0f 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -84,6 +84,7 @@
 /* `bindgen` gets confused at certain things. */
 const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN;
+const size_t RUST_CONST_HELPER_ARCH_KMALLOC_MINALIGN = ARCH_KMALLOC_MINALIGN;
 const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE;
 const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC;
 const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL;

diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs
index a2c49e5494d334..907301334d8c11 100644
--- a/rust/kernel/alloc.rs
+++ b/rust/kernel/alloc.rs
@@ -137,6 +137,14 @@ pub mod flags {
 /// - Implementers must ensure that all trait functions abide by the guarantees documented in the
 ///   `# Guarantees` sections.
 pub unsafe trait Allocator {
+    /// The minimum alignment satisfied by all allocations from this allocator.
+    ///
+    /// # Guarantees
+    ///
+    /// Any pointer allocated by this allocator is guaranteed to be aligned to `MIN_ALIGN` even if
+    /// the requested layout has a smaller alignment.
+    const MIN_ALIGN: usize;
+
     /// Allocate memory based on `layout` and `flags`.
     ///
     /// On success, returns a buffer represented as `NonNull<[u8]>` that satisfies the layout

diff --git a/rust/kernel/alloc/allocator.rs b/rust/kernel/alloc/allocator.rs
index aa2dfa9dca4c30..5003907f024075 100644
--- a/rust/kernel/alloc/allocator.rs
+++ b/rust/kernel/alloc/allocator.rs
@@ -17,6 +17,8 @@ use crate::alloc::{AllocError, Allocator};
 use crate::bindings;
 use crate::pr_warn;

+const ARCH_KMALLOC_MINALIGN: usize = bindings::ARCH_KMALLOC_MINALIGN;
+
 /// The contiguous kernel allocator.
 ///
 /// `Kmalloc` is typically used for physically contiguous allocations up to page size, but also
@@ -128,6 +130,8 @@ impl ReallocFunc {
 // - passing a pointer to a valid memory allocation is OK,
 // - `realloc` satisfies the guarantees, since `ReallocFunc::call` has the same.
 unsafe impl Allocator for Kmalloc {
+    const MIN_ALIGN: usize = ARCH_KMALLOC_MINALIGN;
+
     #[inline]
     unsafe fn realloc(
         ptr: Option<NonNull<u8>>,
@@ -145,6 +149,8 @@
 // - passing a pointer to a valid memory allocation is OK,
 // - `realloc` satisfies the guarantees, since `ReallocFunc::call` has the same.
 unsafe impl Allocator for Vmalloc {
+    const MIN_ALIGN: usize = kernel::page::PAGE_SIZE;
+
     #[inline]
     unsafe fn realloc(
         ptr: Option<NonNull<u8>>,
@@ -169,6 +175,8 @@
 // - passing a pointer to a valid memory allocation is OK,
 // - `realloc` satisfies the guarantees, since `ReallocFunc::call` has the same.
 unsafe impl Allocator for KVmalloc {
+    const MIN_ALIGN: usize = ARCH_KMALLOC_MINALIGN;
+
     #[inline]
     unsafe fn realloc(
         ptr: Option<NonNull<u8>>,

From bb9749f32ad38cf343e6650a34125be6aacd65d1 Mon Sep 17 00:00:00 2001
From: Alice Ryhl
Date: Mon, 11 Aug 2025 12:31:51 +0000
Subject: [PATCH 0183/3206] rust: alloc: take the allocator into account for FOREIGN_ALIGN

When converting a Box into a void pointer, the allocator might
guarantee a higher alignment than the type itself does, and in that
case it is guaranteed that the void pointer has that higher alignment.

This is quite useful when combined with the XArray, which you can only
create using a ForeignOwnable whose FOREIGN_ALIGN is at least 4. This
means that you can now always use a Box with the XArray no matter the
alignment of T.

Reviewed-by: Benno Lossin
Signed-off-by: Alice Ryhl
Acked-by: Liam R. Howlett
Reviewed-by: Andreas Hindborg
Link: https://lore.kernel.org/r/20250811-align-min-allocator-v2-2-3386cc94f4fc@google.com
Signed-off-by: Danilo Krummrich
---
 rust/kernel/alloc/kbox.rs | 13 +++++++++----
 rust/kernel/sync/arc.rs | 6 +++---
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/rust/kernel/alloc/kbox.rs b/rust/kernel/alloc/kbox.rs
index 856d05aa60f134..eedab0be1eff78 100644
--- a/rust/kernel/alloc/kbox.rs
+++ b/rust/kernel/alloc/kbox.rs
@@ -401,12 +401,17 @@ where
 }

 // SAFETY: The pointer returned by `into_foreign` comes from a well aligned
-// pointer to `T`.
+// pointer to `T` allocated by `A`.
 unsafe impl<T: 'static, A> ForeignOwnable for Box<T, A>
 where
     A: Allocator,
 {
-    const FOREIGN_ALIGN: usize = core::mem::align_of::<T>();
+    const FOREIGN_ALIGN: usize = if core::mem::align_of::<T>() < A::MIN_ALIGN {
+        A::MIN_ALIGN
+    } else {
+        core::mem::align_of::<T>()
+    };
+
     type Borrowed<'a> = &'a T;
     type BorrowedMut<'a> = &'a mut T;

@@ -435,12 +440,12 @@ where
 }

 // SAFETY: The pointer returned by `into_foreign` comes from a well aligned
-// pointer to `T`.
+// pointer to `T` allocated by `A`.
 unsafe impl<T: 'static, A> ForeignOwnable for Pin<Box<T, A>>
 where
     A: Allocator,
 {
-    const FOREIGN_ALIGN: usize = core::mem::align_of::<T>();
+    const FOREIGN_ALIGN: usize = <Box<T, A> as ForeignOwnable>::FOREIGN_ALIGN;

     type Borrowed<'a> = Pin<&'a T>;
     type BorrowedMut<'a> = Pin<&'a mut T>;

diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs
index 63a66761d0c7d7..74121cf935f364 100644
--- a/rust/kernel/sync/arc.rs
+++ b/rust/kernel/sync/arc.rs
@@ -373,10 +373,10 @@ impl Arc {
     }
 }

-// SAFETY: The pointer returned by `into_foreign` comes from a well aligned
-// pointer to `ArcInner<T>`.
+// SAFETY: The pointer returned by `into_foreign` was originally allocated as a
+// `KBox<ArcInner<T>>`, so that type is what determines the alignment.
 unsafe impl<T: 'static> ForeignOwnable for Arc<T> {
-    const FOREIGN_ALIGN: usize = core::mem::align_of::<ArcInner<T>>();
+    const FOREIGN_ALIGN: usize = <KBox<ArcInner<T>> as ForeignOwnable>::FOREIGN_ALIGN;

     type Borrowed<'a> = ArcBorrow<'a, T>;
     type BorrowedMut<'a> = Self::Borrowed<'a>;

From c80d79720647ed77ebc0198abd5a0807efdaff0b Mon Sep 17 00:00:00 2001
From: Matt Bobrowski
Date: Fri, 15 Aug 2025 12:12:14 +0000
Subject: [PATCH 0184/3206] bpf/selftests: Fix test_tcpnotify_user

Based on a bisect, it appears that commit 7ee988770326 ("timers:
Implement the hierarchical pull model") has somehow inadvertently
broken BPF selftest test_tcpnotify_user. The error that is being
generated by this test is as follows:

  FAILED: Wrong stats Expected 10 calls, got 8

It looks like the change allows timer functions to be run on CPUs
different from the one they are armed on.
The test had pinned itself to CPU 0, and in the past the retransmit
attempts also occurred on CPU 0. The test had set the max_entries
attribute for BPF_MAP_TYPE_PERF_EVENT_ARRAY to 2 and was calling
bpf_perf_event_output() with BPF_F_CURRENT_CPU, so the entry was likely
to be in range.

With the change to allow timers to run on other CPUs, the current CPU
tasked with performing the retransmit might be bumped and in turn fall
out of range, as the event will be filtered out via
__bpf_perf_event_output() using:

  if (unlikely(index >= array->map.max_entries))
          return -E2BIG;

A possible change would be to explicitly set the max_entries attribute
for perf_event_map in test_tcpnotify_kern.c to a value that's at least
as large as the number of CPUs. As it turns out however, if the field
is left unset, then libbpf will determine the number of CPUs available
on the underlying system and update the max_entries attribute
accordingly in map_set_def_max_entries().

A further problem with the test is that it has a thread that continues
running up until the program exits. The main thread cleans up some
libbpf data structures, while the other thread continues to use them,
which will inevitably trigger a SIGSEGV. This can be dealt with by
telling the thread to run for as long as necessary and doing a
pthread_join on it before exiting the program.

Finally, I don't think binding the process to CPU 0 is meaningful for
this test any more, so get rid of that.

Fixes: 435f90a338ae ("selftests/bpf: add a test case for sock_ops perf-event notification")
Signed-off-by: Matt Bobrowski
Signed-off-by: Martin KaFai Lau
Acked-by: Stanislav Fomichev
Link: https://patch.msgid.link/aJ8kHhwgATmA3rLf@google.com
---
 .../selftests/bpf/progs/test_tcpnotify_kern.c | 1 -
 .../selftests/bpf/test_tcpnotify_user.c | 20 +++++++++----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c
index 540181c115a85a..ef00d38b0a8d24 100644
--- a/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c
@@ -23,7 +23,6 @@ struct {

 struct {
 	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
-	__uint(max_entries, 2);
 	__type(key, int);
 	__type(value, __u32);
 } perf_event_map SEC(".maps");

diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c
index 595194453ff8f8..35b4893ccdf8ae 100644
--- a/tools/testing/selftests/bpf/test_tcpnotify_user.c
+++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c
@@ -15,20 +15,18 @@
 #include
 #include
 #include
-#include
 #include
-#include
-#include "bpf_util.h"
 #include "cgroup_helpers.h"
 #include "test_tcpnotify.h"
-#include "trace_helpers.h"
 #include "testing_helpers.h"

 #define SOCKET_BUFFER_SIZE (getpagesize() < 8192L ?
getpagesize() : 8192L) pthread_t tid; +static bool exit_thread; + int rx_callbacks; static void dummyfn(void *ctx, int cpu, void *data, __u32 size) @@ -45,7 +43,7 @@ void tcp_notifier_poller(struct perf_buffer *pb) { int err; - while (1) { + while (!exit_thread) { err = perf_buffer__poll(pb, 100); if (err < 0 && err != -EINTR) { printf("failed perf_buffer__poll: %d\n", err); @@ -78,15 +76,10 @@ int main(int argc, char **argv) int error = EXIT_FAILURE; struct bpf_object *obj; char test_script[80]; - cpu_set_t cpuset; __u32 key = 0; libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - CPU_ZERO(&cpuset); - CPU_SET(0, &cpuset); - pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); - cg_fd = cgroup_setup_and_join(cg_path); if (cg_fd < 0) goto err; @@ -151,6 +144,13 @@ int main(int argc, char **argv) sleep(10); + exit_thread = true; + int ret = pthread_join(tid, NULL); + if (ret) { + printf("FAILED: pthread_join\n"); + goto err; + } + if (verify_result(&g)) { printf("FAILED: Wrong stats Expected %d calls, got %d\n", g.ncalls, rx_callbacks); From 1e180614b3608e1cb0f81753b2172af253d58a52 Mon Sep 17 00:00:00 2001 From: Shankari Anand Date: Thu, 14 Aug 2025 16:16:15 +0530 Subject: [PATCH 0185/3206] rust: driver-core: Update ARef and AlwaysRefCounted imports from sync::aref Update call sites in the driver-core files and its related samples to import `ARef` and `AlwaysRefCounted` from `sync::aref` instead of `types`. This aligns with the ongoing effort to move `ARef` and `AlwaysRefCounted` to sync. Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/linux/issues/1173 Signed-off-by: Shankari Anand Link: https://lore.kernel.org/r/20250814104615.355106-1-shankari.ak0208@gmail.com Signed-off-by: Danilo Krummrich --- rust/kernel/auxiliary.rs | 2 +- rust/kernel/device.rs | 7 ++++--- rust/kernel/devres.rs | 4 ++-- rust/kernel/pci.rs | 5 +++-- rust/kernel/platform.rs | 2 +- samples/rust/rust_driver_pci.rs | 2 +- samples/rust/rust_driver_platform.rs | 2 +- 7 files changed, 13 insertions(+), 11 deletions(-) diff --git a/rust/kernel/auxiliary.rs b/rust/kernel/auxiliary.rs index f73c460665ec19..8dc7490a79a4a9 100644 --- a/rust/kernel/auxiliary.rs +++ b/rust/kernel/auxiliary.rs @@ -245,7 +245,7 @@ kernel::impl_device_context_deref!(unsafe { Device }); kernel::impl_device_context_into_aref!(Device); // SAFETY: Instances of `Device` are always reference-counted. -unsafe impl crate::types::AlwaysRefCounted for Device { +unsafe impl crate::sync::aref::AlwaysRefCounted for Device { fn inc_ref(&self) { // SAFETY: The existence of a shared reference guarantees that the refcount is non-zero. unsafe { bindings::get_device(self.as_ref().as_raw()) }; diff --git a/rust/kernel/device.rs b/rust/kernel/device.rs index b8613289de8e7c..b860ab389f1812 100644 --- a/rust/kernel/device.rs +++ b/rust/kernel/device.rs @@ -6,7 +6,8 @@ use crate::{ bindings, - types::{ARef, ForeignOwnable, Opaque}, + sync::aref::ARef, + types::{ForeignOwnable, Opaque}, }; use core::{fmt, marker::PhantomData, ptr}; @@ -292,7 +293,7 @@ kernel::impl_device_context_deref!(unsafe { Device }); kernel::impl_device_context_into_aref!(Device); // SAFETY: Instances of `Device` are always reference-counted. -unsafe impl crate::types::AlwaysRefCounted for Device { +unsafe impl crate::sync::aref::AlwaysRefCounted for Device { fn inc_ref(&self) { // SAFETY: The existence of a shared reference guarantees that the refcount is non-zero. unsafe { bindings::get_device(self.as_raw()) }; @@ -411,7 +412,7 @@ macro_rules! 
impl_device_context_deref { #[macro_export] macro_rules! __impl_device_context_into_aref { ($src:ty, $device:tt) => { - impl ::core::convert::From<&$device<$src>> for $crate::types::ARef<$device> { + impl ::core::convert::From<&$device<$src>> for $crate::sync::aref::ARef<$device> { fn from(dev: &$device<$src>) -> Self { (&**dev).into() } diff --git a/rust/kernel/devres.rs b/rust/kernel/devres.rs index da18091143a67f..99b7520019f051 100644 --- a/rust/kernel/devres.rs +++ b/rust/kernel/devres.rs @@ -13,8 +13,8 @@ use crate::{ ffi::c_void, prelude::*, revocable::{Revocable, RevocableGuard}, - sync::{rcu, Completion}, - types::{ARef, ForeignOwnable, Opaque, ScopeGuard}, + sync::{aref::ARef, rcu, Completion}, + types::{ForeignOwnable, Opaque, ScopeGuard}, }; use pin_init::Wrapper; diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 08fb69e5eb9751..cae4e274f7766f 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -13,7 +13,8 @@ use crate::{ io::{Io, IoRaw}, irq::{self, IrqRequest}, str::CStr, - types::{ARef, Opaque}, + sync::aref::ARef, + types::Opaque, ThisModule, }; use core::{ @@ -544,7 +545,7 @@ kernel::impl_device_context_into_aref!(Device); impl crate::dma::Device for Device {} // SAFETY: Instances of `Device` are always reference-counted. -unsafe impl crate::types::AlwaysRefCounted for Device { +unsafe impl crate::sync::aref::AlwaysRefCounted for Device { fn inc_ref(&self) { // SAFETY: The existence of a shared reference guarantees that the refcount is non-zero. unsafe { bindings::pci_dev_get(self.as_raw()) }; diff --git a/rust/kernel/platform.rs b/rust/kernel/platform.rs index ce2bb4d4e88289..7205fe3416d36c 100644 --- a/rust/kernel/platform.rs +++ b/rust/kernel/platform.rs @@ -468,7 +468,7 @@ kernel::impl_device_context_into_aref!(Device); impl crate::dma::Device for Device {} // SAFETY: Instances of `Device` are always reference-counted. -unsafe impl crate::types::AlwaysRefCounted for Device { +unsafe impl crate::sync::aref::AlwaysRefCounted for Device { fn inc_ref(&self) { // SAFETY: The existence of a shared reference guarantees that the refcount is non-zero. unsafe { bindings::get_device(self.as_ref().as_raw()) }; diff --git a/samples/rust/rust_driver_pci.rs b/samples/rust/rust_driver_pci.rs index 606946ff4d7fd9..0798019014cd95 100644 --- a/samples/rust/rust_driver_pci.rs +++ b/samples/rust/rust_driver_pci.rs @@ -4,7 +4,7 @@ //! //! To make this driver probe, QEMU must be run with `-device pci-testdev`. -use kernel::{bindings, c_str, device::Core, devres::Devres, pci, prelude::*, types::ARef}; +use kernel::{bindings, c_str, device::Core, devres::Devres, pci, prelude::*, sync::aref::ARef}; struct Regs; diff --git a/samples/rust/rust_driver_platform.rs b/samples/rust/rust_driver_platform.rs index 69ed55b7b0faad..6473baf4f1206a 100644 --- a/samples/rust/rust_driver_platform.rs +++ b/samples/rust/rust_driver_platform.rs @@ -72,7 +72,7 @@ use kernel::{ of, platform, prelude::*, str::CString, - types::ARef, + sync::aref::ARef, }; struct SampleDriver { From 7e25d84f460c59322bf01dfeb1fb4745b488f714 Mon Sep 17 00:00:00 2001 From: Shankari Anand Date: Thu, 14 Aug 2025 16:11:33 +0530 Subject: [PATCH 0186/3206] rust: dma: Update ARef and AlwaysRefCounted imports from sync::aref Update call sites in the dma subsystem to import `ARef` and `AlwaysRefCounted` from `sync::aref` instead of `types`. This aligns with the ongoing effort to move `ARef` and `AlwaysRefCounted` to sync. 
Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/linux/issues/1173 Signed-off-by: Shankari Anand Link: https://lore.kernel.org/r/20250814104133.350093-1-shankari.ak0208@gmail.com Signed-off-by: Danilo Krummrich --- rust/kernel/dma.rs | 2 +- samples/rust/rust_dma.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/kernel/dma.rs b/rust/kernel/dma.rs index 2bc8ab51ec280f..68fe6762442438 100644 --- a/rust/kernel/dma.rs +++ b/rust/kernel/dma.rs @@ -9,8 +9,8 @@ use crate::{ device::{Bound, Core}, error::{to_result, Result}, prelude::*, + sync::aref::ARef, transmute::{AsBytes, FromBytes}, - types::ARef, }; /// Trait to be implemented by DMA capable bus devices. diff --git a/samples/rust/rust_dma.rs b/samples/rust/rust_dma.rs index c5e7cce6865402..997a9c4cf2b3d9 100644 --- a/samples/rust/rust_dma.rs +++ b/samples/rust/rust_dma.rs @@ -10,7 +10,7 @@ use kernel::{ dma::{CoherentAllocation, Device, DmaMask}, pci, prelude::*, - types::ARef, + sync::aref::ARef, }; struct DmaSampleDriver { From d87208128a3330c0eab18301ab39bdb419647730 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 14 Aug 2025 18:31:38 -0700 Subject: [PATCH 0187/3206] x86/build: Remove cc-option from stack alignment flags '-mpreferred-stack-boundary' (the GCC option) and '-mstack-alignment' (the clang option) have been supported in their respective compilers for some time, so it is unnecessary to check for support for them via cc-option. '-mpreferred-stack-boundary=3' had a restriction on '-mno-sse' until GCC 7.1 but that is irrelevant for most of the kernel, which includes '-mno-sse'. Move to simple Kconfig checks to avoid querying the compiler for the flags that it supports. Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250814-x86-min-ver-cleanups-v1-2-ff7f19457523@kernel.org --- arch/x86/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index ed5657395d6aca..f0aa58d2ec17e8 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -37,10 +37,11 @@ export RETPOLINE_VDSO_CFLAGS # For gcc stack alignment is specified with -mpreferred-stack-boundary, # clang has the option -mstack-alignment for that purpose. -ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) +ifdef CONFIG_CC_IS_GCC cc_stack_align4 := -mpreferred-stack-boundary=2 cc_stack_align8 := -mpreferred-stack-boundary=3 -else ifneq ($(call cc-option, -mstack-alignment=16),) +endif +ifdef CONFIG_CC_IS_CLANG cc_stack_align4 := -mstack-alignment=4 cc_stack_align8 := -mstack-alignment=8 endif From fb13ae067ad7f07df2bdc374aa911ff58dfd19ce Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Wed, 16 Jul 2025 18:06:29 -0700 Subject: [PATCH 0188/3206] EDAC: Add EDAC driver for ARM Cortex A72 cores The driver is designed to support error detection and reporting for Cortex A72 cores, specifically within their L1 and L2 cache systems. The errors are detected by reading CPU/L2 memory error syndrome registers. Unfortunately there is no robust way to inject errors into the caches, so this driver doesn't contain any code to actually test it. It has been tested though with code taken from an older version [1] of this driver. For reasons stated in thread [1], the error injection code is not suitable for mainline, so it is removed from the driver. [1] https://lore.kernel.org/all/1521073067-24348-1-git-send-email-york.sun@nxp.com/#t [ bp: minor touchups. 
] Signed-off-by: Sascha Hauer Co-developed-by: Vijay Balakrishna Signed-off-by: Vijay Balakrishna Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/1752714390-27389-2-git-send-email-vijayb@linux.microsoft.com --- MAINTAINERS | 7 ++ drivers/edac/Kconfig | 8 ++ drivers/edac/Makefile | 1 + drivers/edac/a72_edac.c | 225 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 241 insertions(+) create mode 100644 drivers/edac/a72_edac.c diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..ac20587a3e987a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8736,6 +8736,13 @@ F: Documentation/driver-api/edac.rst F: drivers/edac/ F: include/linux/edac.h +EDAC-A72 +M: Vijay Balakrishna +M: Tyler Hicks +L: linux-edac@vger.kernel.org +S: Supported +F: drivers/edac/a72_edac.c + EDAC-DMC520 M: Lei Wang L: linux-edac@vger.kernel.org diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 19ad3c3b675ddb..b824472208c48a 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -576,4 +576,12 @@ config EDAC_LOONGSON errors (CE) only. Loongson-3A5000/3C5000/3D5000/3A6000/3C6000 are compatible. +config EDAC_CORTEX_A72 + tristate "ARM Cortex A72" + depends on ARM64 + help + Support for L1/L2 cache error detection for ARM Cortex A72 processor. + The detected and reported errors are from reading CPU/L2 memory error + syndrome registers. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index a8f2d8f6c894a9..c1736f264320cc 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -88,3 +88,4 @@ obj-$(CONFIG_EDAC_NPCM) += npcm_edac.o obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o obj-$(CONFIG_EDAC_VERSAL) += versal_edac.o obj-$(CONFIG_EDAC_LOONGSON) += loongson_edac.o +obj-$(CONFIG_EDAC_CORTEX_A72) += a72_edac.o diff --git a/drivers/edac/a72_edac.c b/drivers/edac/a72_edac.c new file mode 100644 index 00000000000000..9262d75c385599 --- /dev/null +++ b/drivers/edac/a72_edac.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cortex A72 EDAC L1 and L2 cache error detection + * + * Copyright (c) 2020 Pengutronix, Sascha Hauer + * Copyright (c) 2025 Microsoft Corporation, + * + * Based on Code from: + * Copyright (c) 2018, NXP Semiconductor + * Author: York Sun + */ + +#include +#include +#include +#include + +#include "edac_module.h" + +#define DRVNAME "a72-edac" + +#define SYS_CPUMERRSR_EL1 sys_reg(3, 1, 15, 2, 2) +#define SYS_L2MERRSR_EL1 sys_reg(3, 1, 15, 2, 3) + +#define CPUMERRSR_EL1_RAMID GENMASK(30, 24) +#define L2MERRSR_EL1_CPUID_WAY GENMASK(21, 18) + +#define CPUMERRSR_EL1_VALID BIT(31) +#define CPUMERRSR_EL1_FATAL BIT(63) +#define L2MERRSR_EL1_VALID BIT(31) +#define L2MERRSR_EL1_FATAL BIT(63) + +#define L1_I_TAG_RAM 0x00 +#define L1_I_DATA_RAM 0x01 +#define L1_D_TAG_RAM 0x08 +#define L1_D_DATA_RAM 0x09 +#define TLB_RAM 0x18 + +#define MESSAGE_SIZE 64 + +struct mem_err_synd_reg { + u64 cpu_mesr; + u64 l2_mesr; +}; + +static struct cpumask compat_mask; + +static void report_errors(struct edac_device_ctl_info *edac_ctl, int cpu, + struct mem_err_synd_reg *mesr) +{ + u64 cpu_mesr = mesr->cpu_mesr; + u64 l2_mesr = mesr->l2_mesr; + char msg[MESSAGE_SIZE]; + + if (cpu_mesr & CPUMERRSR_EL1_VALID) { + const char *str; + bool fatal = cpu_mesr & CPUMERRSR_EL1_FATAL; + + switch (FIELD_GET(CPUMERRSR_EL1_RAMID, cpu_mesr)) { + case L1_I_TAG_RAM: + str = "L1-I Tag RAM"; + break; + case L1_I_DATA_RAM: + str = "L1-I Data RAM"; + break; + case L1_D_TAG_RAM: + str = "L1-D Tag RAM"; + break; + case L1_D_DATA_RAM: + str = "L1-D Data RAM"; + break; + case 
TLB_RAM: + str = "TLB RAM"; + break; + default: + str = "Unspecified"; + break; + } + + snprintf(msg, MESSAGE_SIZE, "%s %s error(s) on CPU %d", + str, fatal ? "fatal" : "correctable", cpu); + + if (fatal) + edac_device_handle_ue(edac_ctl, cpu, 0, msg); + else + edac_device_handle_ce(edac_ctl, cpu, 0, msg); + } + + if (l2_mesr & L2MERRSR_EL1_VALID) { + bool fatal = l2_mesr & L2MERRSR_EL1_FATAL; + + snprintf(msg, MESSAGE_SIZE, "L2 %s error(s) on CPU %d CPUID/WAY 0x%lx", + fatal ? "fatal" : "correctable", cpu, + FIELD_GET(L2MERRSR_EL1_CPUID_WAY, l2_mesr)); + if (fatal) + edac_device_handle_ue(edac_ctl, cpu, 1, msg); + else + edac_device_handle_ce(edac_ctl, cpu, 1, msg); + } +} + +static void read_errors(void *data) +{ + struct mem_err_synd_reg *mesr = data; + + mesr->cpu_mesr = read_sysreg_s(SYS_CPUMERRSR_EL1); + if (mesr->cpu_mesr & CPUMERRSR_EL1_VALID) { + write_sysreg_s(0, SYS_CPUMERRSR_EL1); + isb(); + } + mesr->l2_mesr = read_sysreg_s(SYS_L2MERRSR_EL1); + if (mesr->l2_mesr & L2MERRSR_EL1_VALID) { + write_sysreg_s(0, SYS_L2MERRSR_EL1); + isb(); + } +} + +static void a72_edac_check(struct edac_device_ctl_info *edac_ctl) +{ + struct mem_err_synd_reg mesr; + int cpu; + + cpus_read_lock(); + for_each_cpu_and(cpu, cpu_online_mask, &compat_mask) { + smp_call_function_single(cpu, read_errors, &mesr, true); + report_errors(edac_ctl, cpu, &mesr); + } + cpus_read_unlock(); +} + +static int a72_edac_probe(struct platform_device *pdev) +{ + struct edac_device_ctl_info *edac_ctl; + struct device *dev = &pdev->dev; + int rc; + + edac_ctl = edac_device_alloc_ctl_info(0, "cpu", + num_possible_cpus(), "L", 2, 1, + edac_device_alloc_index()); + if (!edac_ctl) + return -ENOMEM; + + edac_ctl->edac_check = a72_edac_check; + edac_ctl->dev = dev; + edac_ctl->mod_name = dev_name(dev); + edac_ctl->dev_name = dev_name(dev); + edac_ctl->ctl_name = DRVNAME; + dev_set_drvdata(dev, edac_ctl); + + rc = edac_device_add_device(edac_ctl); + if (rc) + goto out_dev; + + return 0; + +out_dev: + edac_device_free_ctl_info(edac_ctl); + + return rc; +} + +static void a72_edac_remove(struct platform_device *pdev) +{ + struct edac_device_ctl_info *edac_ctl = dev_get_drvdata(&pdev->dev); + + edac_device_del_device(edac_ctl->dev); + edac_device_free_ctl_info(edac_ctl); +} + +static const struct of_device_id cortex_arm64_edac_of_match[] = { + { .compatible = "arm,cortex-a72" }, + {} +}; +MODULE_DEVICE_TABLE(of, cortex_arm64_edac_of_match); + +static struct platform_driver a72_edac_driver = { + .probe = a72_edac_probe, + .remove = a72_edac_remove, + .driver = { + .name = DRVNAME, + }, +}; + +static struct platform_device *a72_pdev; + +static int __init a72_edac_driver_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device_node *np __free(device_node) = of_cpu_device_node_get(cpu); + if (np) { + if (of_match_node(cortex_arm64_edac_of_match, np) && + of_property_read_bool(np, "edac-enabled")) { + cpumask_set_cpu(cpu, &compat_mask); + } + } else { + pr_warn("failed to find device node for CPU %d\n", cpu); + } + } + + if (cpumask_empty(&compat_mask)) + return 0; + + a72_pdev = platform_device_register_simple(DRVNAME, -1, NULL, 0); + if (IS_ERR(a72_pdev)) { + pr_err("failed to register A72 EDAC device\n"); + return PTR_ERR(a72_pdev); + } + + return platform_driver_register(&a72_edac_driver); +} + +static void __exit a72_edac_driver_exit(void) +{ + platform_device_unregister(a72_pdev); + platform_driver_unregister(&a72_edac_driver); +} + +module_init(a72_edac_driver_init); +module_exit(a72_edac_driver_exit); + 
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sascha Hauer ");
+MODULE_DESCRIPTION("Cortex A72 L1 and L2 cache EDAC driver");

From eb0e3f301d6ee7c813556fa6ca714e436f5235b0 Mon Sep 17 00:00:00 2001
From: Sascha Hauer
Date: Wed, 16 Jul 2025 18:06:30 -0700
Subject: [PATCH 0189/3206] dt-bindings: arm: cpus: Add edac-enabled property

Some ARM Cortex CPUs including A72 have Error Detection And Correction
(EDAC) support on their L1 and L2 caches. That functionality is in
implementation defined registers, so using it is not safe in
virtualized environments or when EL3 already uses these registers.

Add an edac-enabled flag which can be explicitly set when EDAC can be
used.

[ bp: Massage commit message. ]

Signed-off-by: Sascha Hauer
Signed-off-by: Vijay Balakrishna
Signed-off-by: Borislav Petkov (AMD)
Reviewed-by: Krzysztof Kozlowski
Link: https://lore.kernel.org/1752714390-27389-3-git-send-email-vijayb@linux.microsoft.com
---
 Documentation/devicetree/bindings/arm/cpus.yaml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml
index 5bd517befb6805..4accf4cbc6c710 100644
--- a/Documentation/devicetree/bindings/arm/cpus.yaml
+++ b/Documentation/devicetree/bindings/arm/cpus.yaml
@@ -353,6 +353,12 @@ properties:
     $ref: /schemas/types.yaml#/definitions/phandle
     description: Link to Mediatek Cache Coherent Interconnect

+  edac-enabled:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      A72 CPUs support Error Detection And Correction (EDAC) on their L1 and
+      L2 caches. This flag marks this function as usable.
+
   qcom,saw:
     $ref: /schemas/types.yaml#/definitions/phandle
     description:
@@ -399,6 +405,17 @@ properties:
 allOf:
   - $ref: /schemas/cpu.yaml#
   - $ref: /schemas/opp/opp-v1.yaml#
+  - if:
+      not:
+        properties:
+          compatible:
+            contains:
+              const: arm,cortex-a72
+    then:
+      # Allow edac-enabled only for Cortex A72
+      properties:
+        edac-enabled: false
+
   - if:
       # If the enable-method property contains one of those values
       properties:

From abdaf49be5424db74e19d167c10d7dad79a0efc2 Mon Sep 17 00:00:00 2001
From: Tao Chen
Date: Thu, 14 Aug 2025 20:14:29 +0800
Subject: [PATCH 0190/3206] bpf: Remove migrate_disable in kprobe_multi_link_prog_run

The graph tracer framework ensures we won't migrate:
kprobe_multi_link_prog_run() is called all the way from the graph
tracer, which disables preemption in function_graph_enter_regs(). As
Jiri and Yonghong suggested, there is no need to use migrate_disable.
As a result, some overhead will be reduced. Also add a cant_sleep()
check for __this_cpu_inc_return.

Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
Signed-off-by: Tao Chen
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20250814121430.2347454-1-chen.dylane@linux.dev
---
 kernel/trace/bpf_trace.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3ae52978cae61a..606007c387c52f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2728,20 +2728,25 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 	struct pt_regs *regs;
 	int err;

+	/*
+	 * graph tracer framework ensures we won't migrate, so there is no need
+	 * to use migrate_disable for bpf_prog_run again. The check here is just for
+	 * __this_cpu_inc_return.
+ */ + cant_sleep(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { bpf_prog_inc_misses_counter(link->link.prog); err = 1; goto out; } - migrate_disable(); rcu_read_lock(); regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr()); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); - migrate_enable(); out: __this_cpu_dec(bpf_prog_active); From 6c6b4146deb12d20f42490d5013f2043df942161 Mon Sep 17 00:00:00 2001 From: Yureka Lilian Date: Thu, 14 Aug 2025 20:01:12 +0200 Subject: [PATCH 0191/3206] libbpf: Fix reuse of DEVMAP Previously, re-using pinned DEVMAP maps would always fail, because get_map_info on a DEVMAP always returns flags with BPF_F_RDONLY_PROG set, but BPF_F_RDONLY_PROG being set on a map during creation is invalid. Thus, ignore the BPF_F_RDONLY_PROG flag in the flags returned from get_map_info when checking for compatibility with an existing DEVMAP. The same problem is handled in a third-party ebpf library: - https://github.com/cilium/ebpf/issues/925 - https://github.com/cilium/ebpf/pull/930 Fixes: 0cdbb4b09a06 ("devmap: Allow map lookups from eBPF") Signed-off-by: Yureka Lilian Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250814180113.1245565-3-yuka@yuka.dev --- tools/lib/bpf/libbpf.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8f5a81b672e1b8..fe4fc5438678c5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5093,6 +5093,16 @@ static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) return false; } + /* + * bpf_get_map_info_by_fd() for DEVMAP will always return flags with + * BPF_F_RDONLY_PROG set, but it generally is not set at map creation time. + * Thus, ignore the BPF_F_RDONLY_PROG flag in the flags returned from + * bpf_get_map_info_by_fd() when checking for compatibility with an + * existing DEVMAP. + */ + if (map->def.type == BPF_MAP_TYPE_DEVMAP || map->def.type == BPF_MAP_TYPE_DEVMAP_HASH) + map_info.map_flags &= ~BPF_F_RDONLY_PROG; + return (map_info.type == map->def.type && map_info.key_size == map->def.key_size && map_info.value_size == map->def.value_size && From 7f8fa9d370c11ba2fb296598267e14d3bfe4ea11 Mon Sep 17 00:00:00 2001 From: Yureka Lilian Date: Thu, 14 Aug 2025 20:01:13 +0200 Subject: [PATCH 0192/3206] selftests/bpf: Add test for DEVMAP reuse The test covers basic re-use of a pinned DEVMAP map, with both matching and mismatching parameters. 
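
To make the swap step of the test concrete: RENAME_EXCHANGE atomically exchanges two paths, so after the swap each pin name refers to a map whose parameters no longer match its definition in the object file. A minimal userspace sketch of that step (an illustration only, not part of the patch; it assumes glibc 2.28+, which exposes renameat2() and the RENAME_EXCHANGE flag via <stdio.h> under _GNU_SOURCE, and it reuses the pin paths from the test below):

#define _GNU_SOURCE
#include <fcntl.h>	/* AT_FDCWD */
#include <stdio.h>	/* renameat2(), RENAME_EXCHANGE, perror() */

int main(void)
{
	/* Atomically swap the two bpffs pins: either both paths move or
	 * neither does, so there is no window where one pin is missing. */
	if (renameat2(AT_FDCWD, "/sys/fs/bpf/pinmap1",
		      AT_FDCWD, "/sys/fs/bpf/pinmap2", RENAME_EXCHANGE) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}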
Signed-off-by: Yureka Lilian Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20250814180113.1245565-4-yuka@yuka.dev --- .../bpf/prog_tests/pinning_devmap_reuse.c | 50 +++++++++++++++++++ .../selftests/bpf/progs/test_pinning_devmap.c | 20 ++++++++ 2 files changed, 70 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/pinning_devmap_reuse.c create mode 100644 tools/testing/selftests/bpf/progs/test_pinning_devmap.c diff --git a/tools/testing/selftests/bpf/prog_tests/pinning_devmap_reuse.c b/tools/testing/selftests/bpf/prog_tests/pinning_devmap_reuse.c new file mode 100644 index 00000000000000..9ae49b587f3e4c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/pinning_devmap_reuse.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + + +#include "test_pinning_devmap.skel.h" + +void test_pinning_devmap_reuse(void) +{ + const char *pinpath1 = "/sys/fs/bpf/pinmap1"; + const char *pinpath2 = "/sys/fs/bpf/pinmap2"; + struct test_pinning_devmap *skel1 = NULL, *skel2 = NULL; + int err; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + + /* load the object a first time */ + skel1 = test_pinning_devmap__open_and_load(); + if (!ASSERT_OK_PTR(skel1, "skel_load1")) + goto out; + + /* load the object a second time, re-using the pinned map */ + skel2 = test_pinning_devmap__open_and_load(); + if (!ASSERT_OK_PTR(skel2, "skel_load2")) + goto out; + + /* we can close the reference safely without + * the map's refcount falling to 0 + */ + test_pinning_devmap__destroy(skel1); + skel1 = NULL; + + /* now, swap the pins */ + err = renameat2(0, pinpath1, 0, pinpath2, RENAME_EXCHANGE); + if (!ASSERT_OK(err, "swap pins")) + goto out; + + /* load the object again, this time the re-use should fail */ + skel1 = test_pinning_devmap__open_and_load(); + if (!ASSERT_ERR_PTR(skel1, "skel_load3")) + goto out; + +out: + unlink(pinpath1); + unlink(pinpath2); + test_pinning_devmap__destroy(skel1); + test_pinning_devmap__destroy(skel2); +} diff --git a/tools/testing/selftests/bpf/progs/test_pinning_devmap.c b/tools/testing/selftests/bpf/progs/test_pinning_devmap.c new file mode 100644 index 00000000000000..c855f8f87effcb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_pinning_devmap.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} pinmap1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u32); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} pinmap2 SEC(".maps"); From 5ff74f5f71f83cce3c920cd17940df0fe0401865 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Aug 2025 19:02:40 -0700 Subject: [PATCH 0193/3206] lib/crc: Drop inline from all *_mod_init_arch() functions Drop 'inline' from all the *_mod_init_arch() functions so that the compiler will warn about any bugs where they are unused due to not being wired up properly. (There are no such bugs currently, so this just establishes a more robust convention for the future. Of course, these functions also tend to get inlined anyway, regardless of the keyword.) 
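
To spell out the mechanism the new convention relies on (a hypothetical file, not from this series): with the kernel's usual -Wall, an unreferenced plain 'static' function triggers -Wunused-function, while an unreferenced 'static inline' function is silently discarded.

/* build with: gcc -Wall -c example.c */

/* Never called anywhere: the compiler reports
 * "'crc_mod_init_arch' defined but not used [-Wunused-function]",
 * which is exactly what catches a forgotten wire-up. */
static void crc_mod_init_arch(void)
{
}

/* Also never called, but the 'inline' keyword suppresses the
 * diagnostic, so a missing caller would go unnoticed. */
static inline void crc_mod_init_arch_inline(void)
{
}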
Link: https://lore.kernel.org/r/20250816020240.431545-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/arm/crc-t10dif.h | 2 +- lib/crc/arm/crc32.h | 2 +- lib/crc/arm64/crc-t10dif.h | 2 +- lib/crc/loongarch/crc32.h | 2 +- lib/crc/mips/crc32.h | 2 +- lib/crc/powerpc/crc-t10dif.h | 2 +- lib/crc/powerpc/crc32.h | 2 +- lib/crc/sparc/crc32.h | 2 +- lib/crc/x86/crc-t10dif.h | 2 +- lib/crc/x86/crc32.h | 2 +- lib/crc/x86/crc64.h | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/crc/arm/crc-t10dif.h b/lib/crc/arm/crc-t10dif.h index 1a969f4024d479..63441de5e3f161 100644 --- a/lib/crc/arm/crc-t10dif.h +++ b/lib/crc/arm/crc-t10dif.h @@ -43,7 +43,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) } #define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch -static inline void crc_t10dif_mod_init_arch(void) +static void crc_t10dif_mod_init_arch(void) { if (elf_hwcap & HWCAP_NEON) { static_branch_enable(&have_neon); diff --git a/lib/crc/arm/crc32.h b/lib/crc/arm/crc32.h index ae71aa60b7a74e..7b76f52f6907db 100644 --- a/lib/crc/arm/crc32.h +++ b/lib/crc/arm/crc32.h @@ -83,7 +83,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) #define crc32_be_arch crc32_be_base /* not implemented on this arch */ #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { if (elf_hwcap2 & HWCAP2_CRC32) static_branch_enable(&have_crc32); diff --git a/lib/crc/arm64/crc-t10dif.h b/lib/crc/arm64/crc-t10dif.h index 435a990ec43c27..f88db297180516 100644 --- a/lib/crc/arm64/crc-t10dif.h +++ b/lib/crc/arm64/crc-t10dif.h @@ -45,7 +45,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) } #define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch -static inline void crc_t10dif_mod_init_arch(void) +static void crc_t10dif_mod_init_arch(void) { if (cpu_have_named_feature(ASIMD)) { static_branch_enable(&have_asimd); diff --git a/lib/crc/loongarch/crc32.h b/lib/crc/loongarch/crc32.h index 6de5c96594afc8..d34fa4c6863252 100644 --- a/lib/crc/loongarch/crc32.h +++ b/lib/crc/loongarch/crc32.h @@ -101,7 +101,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) #define crc32_be_arch crc32_be_base /* not implemented on this arch */ #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { if (cpu_has_crc32) static_branch_enable(&have_crc32); diff --git a/lib/crc/mips/crc32.h b/lib/crc/mips/crc32.h index 11cb272c63a691..3100354a049eb5 100644 --- a/lib/crc/mips/crc32.h +++ b/lib/crc/mips/crc32.h @@ -148,7 +148,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) #define crc32_be_arch crc32_be_base /* not implemented on this arch */ #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { if (cpu_have_feature(cpu_feature(MIPS_CRC32))) static_branch_enable(&have_crc32); diff --git a/lib/crc/powerpc/crc-t10dif.h b/lib/crc/powerpc/crc-t10dif.h index e033d5f57bae22..8f4592a5323d62 100644 --- a/lib/crc/powerpc/crc-t10dif.h +++ b/lib/crc/powerpc/crc-t10dif.h @@ -62,7 +62,7 @@ static inline u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len) } #define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch -static inline void crc_t10dif_mod_init_arch(void) +static void crc_t10dif_mod_init_arch(void) { if (cpu_has_feature(CPU_FTR_ARCH_207S) && (cur_cpu_spec->cpu_user_features2 & 
PPC_FEATURE2_VEC_CRYPTO)) diff --git a/lib/crc/powerpc/crc32.h b/lib/crc/powerpc/crc32.h index cc8fa3913d4e01..0c852272a3828b 100644 --- a/lib/crc/powerpc/crc32.h +++ b/lib/crc/powerpc/crc32.h @@ -55,7 +55,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) } #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { if (cpu_has_feature(CPU_FTR_ARCH_207S) && (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) diff --git a/lib/crc/sparc/crc32.h b/lib/crc/sparc/crc32.h index 60f2765ac01573..df7c350acd7b5b 100644 --- a/lib/crc/sparc/crc32.h +++ b/lib/crc/sparc/crc32.h @@ -44,7 +44,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *data, size_t len) } #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { unsigned long cfr; diff --git a/lib/crc/x86/crc-t10dif.h b/lib/crc/x86/crc-t10dif.h index 2a02a3026f3f88..8ee8824da551c5 100644 --- a/lib/crc/x86/crc-t10dif.h +++ b/lib/crc/x86/crc-t10dif.h @@ -19,7 +19,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) } #define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch -static inline void crc_t10dif_mod_init_arch(void) +static void crc_t10dif_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { static_branch_enable(&have_pclmulqdq); diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h index 2c4a5976654ad7..19a5e3c6c73bb4 100644 --- a/lib/crc/x86/crc32.h +++ b/lib/crc/x86/crc32.h @@ -106,7 +106,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) #define crc32_be_arch crc32_be_base /* not implemented on this arch */ #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_XMM4_2)) static_branch_enable(&have_crc32); diff --git a/lib/crc/x86/crc64.h b/lib/crc/x86/crc64.h index fde1222c4c584c..7d459931934365 100644 --- a/lib/crc/x86/crc64.h +++ b/lib/crc/x86/crc64.h @@ -27,7 +27,7 @@ static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) } #define crc64_mod_init_arch crc64_mod_init_arch -static inline void crc64_mod_init_arch(void) +static void crc64_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { static_branch_enable(&have_pclmulqdq); From 487fe3a936b0b84ef09fa324b4c01d059886f951 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 14 Aug 2025 18:31:39 -0700 Subject: [PATCH 0194/3206] x86/build: Clean up stack alignment flags in CC_FLAGS_FPU The minimum supported version of GCC to build the x86 kernel was bumped to GCC 8.1 in a3e8fe814ad1 ("x86/build: Raise the minimum GCC version to 8.1"). Prior to GCC 7.1, '-mpreferred-stack-boundary=3' was not allowed with '-msse', so areas of the kernel that needed floating point had a different alignment. Now that GCC > 7.1 is mandatory, there is no need to have a different value of '-mpreferred-stack-boundary' from the rest of the kernel, so remove this handling from CC_FLAGS_FPU. 
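
For reference, a sketch of the historical failure mode the dropped cc-option was guarding against (the compiler invocations are illustrative; the error text is the one quoted in the Makefile comment removed below):

/* fpu_example.c -- any SSE-enabled unit built with 8-byte stack alignment */
double average(double a, double b)
{
	return (a + b) / 2.0;
}

/*
 * gcc-6 -msse -mpreferred-stack-boundary=3 -c fpu_example.c
 *   error: -mpreferred-stack-boundary=3 is not between 4 and 12
 *
 * gcc-8 -msse -mpreferred-stack-boundary=3 -c fpu_example.c
 *   accepted, so the cc-option fallback to -mpreferred-stack-boundary=4
 *   is no longer needed.
 */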
Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov (AMD) Link: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=34fac449e121be97dd073c5428cc855367b2872c Link: https://lore.kernel.org/20250814-x86-min-ver-cleanups-v1-3-ff7f19457523@kernel.org --- arch/x86/Makefile | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index f0aa58d2ec17e8..0c82a610fb5470 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -84,19 +84,7 @@ KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-av # CC_FLAGS_FPU := -msse -msse2 ifdef CONFIG_CC_IS_GCC -# Stack alignment mismatch, proceed with caution. -# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3 -# (8B stack alignment). -# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 -# -# The "-msse" in the first argument is there so that the -# -mpreferred-stack-boundary=3 build error: -# -# -mpreferred-stack-boundary=3 is not between 4 and 12 -# -# can be triggered. Otherwise gcc doesn't complain. CC_FLAGS_FPU += -mhard-float -CC_FLAGS_FPU += $(call cc-option,-msse -mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4) endif ifeq ($(CONFIG_X86_KERNEL_IBT),y) From 0a42d732c136d3466cd19fafa7317d3004430318 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 14 Aug 2025 18:31:40 -0700 Subject: [PATCH 0195/3206] x86/build: Remove cc-option from -mno-fp-ret-in-387 This has been supported in GCC for forever and clang gained support for it as an alias of '-mno-x87' in LLVM 14. Now that x86 requires LLVM 15 or newer since 7861640aac52 ("x86/build: Raise the minimum LLVM version to 15.0.0"), this flag can be unconditionally added, saving a compiler invocation. Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov (AMD) Link: https://github.com/llvm/llvm-project/commit/a9fba2be35db674971382e38b99a31403444d9bf Link: https://lore.kernel.org/20250814-x86-min-ver-cleanups-v1-4-ff7f19457523@kernel.org --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 0c82a610fb5470..1bbf943fe9e1ba 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -148,7 +148,7 @@ else # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += -mno-80387 - KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) + KBUILD_CFLAGS += -mno-fp-ret-in-387 # By default gcc and clang use a stack alignment of 16 bytes for x86. # However the standard kernel entry on x86-64 leaves the stack on an From 1201f6fb5bfdbd10985ac3c8f49ef8f4f88b5c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 31 Jul 2025 10:00:31 +0200 Subject: [PATCH 0196/3206] tools/nolibc: fix error return value of clock_nanosleep() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clock_nanosleep() returns a positive error value. Unlike other libc functions it *does not* return -1 nor set errno. Fix the return value and also adapt nanosleep(). 
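
A short caller-side sketch of the two conventions after this change (a minimal example, assuming only the POSIX-style interfaces named above):

#include <errno.h>
#include <time.h>

int sleep_100ms(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
	int err;

	/* clock_nanosleep() hands the error back directly: 0 on success,
	 * a positive errno value such as EINTR or EINVAL on failure;
	 * errno itself is not touched. */
	err = clock_nanosleep(CLOCK_REALTIME, 0, &ts, NULL);
	if (err)
		return err;

	/* nanosleep() keeps the classic convention: -1 plus errno. */
	if (nanosleep(&ts, NULL) < 0)
		return errno;

	return 0;
}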
Fixes: 7c02bc4088af ("tools/nolibc: add support for clock_nanosleep() and nanosleep()") Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250731-nolibc-clock_nanosleep-ret-v1-1-9e4af7855e61@linutronix.de Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/time.h | 5 +++-- tools/testing/selftests/nolibc/nolibc-test.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h index d02bc44d2643a5..e9c1b976791a65 100644 --- a/tools/include/nolibc/time.h +++ b/tools/include/nolibc/time.h @@ -133,7 +133,8 @@ static __attribute__((unused)) int clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqtp, struct timespec *rmtp) { - return __sysret(sys_clock_nanosleep(clockid, flags, rqtp, rmtp)); + /* Directly return a positive error number */ + return -sys_clock_nanosleep(clockid, flags, rqtp, rmtp); } static __inline__ @@ -145,7 +146,7 @@ double difftime(time_t time1, time_t time2) static __inline__ int nanosleep(const struct timespec *rqtp, struct timespec *rmtp) { - return clock_nanosleep(CLOCK_REALTIME, 0, rqtp, rmtp); + return __sysret(sys_clock_nanosleep(CLOCK_REALTIME, 0, rqtp, rmtp)); } diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index a297ee0d6d0754..cc4d730ac4656f 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -1334,6 +1334,7 @@ int run_syscall(int min, int max) CASE_TEST(chroot_root); EXPECT_SYSZR(euid0, chroot("/")); break; CASE_TEST(chroot_blah); EXPECT_SYSER(1, chroot("/proc/self/blah"), -1, ENOENT); break; CASE_TEST(chroot_exe); EXPECT_SYSER(1, chroot(argv0), -1, ENOTDIR); break; + CASE_TEST(clock_nanosleep); ts.tv_nsec = -1; EXPECT_EQ(1, EINVAL, clock_nanosleep(CLOCK_REALTIME, 0, &ts, NULL)); break; CASE_TEST(close_m1); EXPECT_SYSER(1, close(-1), -1, EBADF); break; CASE_TEST(close_dup); EXPECT_SYSZR(1, close(dup(0))); break; CASE_TEST(dup_0); tmp = dup(0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; From 337927d9895a800a63ffe852616ab05f5d304971 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 14 Aug 2025 18:31:41 -0700 Subject: [PATCH 0197/3206] x86/build: Remove cc-option from -mskip-rax-setup This has been supported in GCC since 5.1 and clang since 14.0. Now that x86 requires LLVM 15 or newer since 7861640aac52 ("x86/build: Raise the minimum LLVM version to 15.0.0"), this flag can be unconditionally added, saving a compiler invocation. Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov (AMD) Link: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=fbe575b652f5bdcc459f447a0e6f0e059996d4ef Link: https://github.com/llvm/llvm-project/commit/a9fba2be35db674971382e38b99a31403444d9bf Link: https://lore.kernel.org/20250814-x86-min-ver-cleanups-v1-5-ff7f19457523@kernel.org --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1bbf943fe9e1ba..4b4e2a3ac6df77 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -160,7 +160,7 @@ else KBUILD_CFLAGS += $(cc_stack_align8) # Use -mskip-rax-setup if supported. 
- KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + KBUILD_CFLAGS += -mskip-rax-setup ifdef CONFIG_X86_NATIVE_CPU KBUILD_CFLAGS += -march=native From cfd956dcb101aa3d25bac321fae923323a47c607 Mon Sep 17 00:00:00 2001 From: Fabian Vogt Date: Fri, 15 Aug 2025 13:33:28 +0200 Subject: [PATCH 0198/3206] tty: hvc_console: Call hvc_kick in hvc_write unconditionally After hvc_write completes, call hvc_kick also in the case where the output buffer has been drained, to ensure tty_wakeup gets called. This fixes an issue where functions that wait for a drained buffer would occasionally get stuck. Cc: stable Closes: https://bugzilla.opensuse.org/show_bug.cgi?id=1230062 Signed-off-by: Fabian Vogt Link: https://lore.kernel.org/r/2011735.PYKUYFuaPT@fvogt-thinkpad Signed-off-by: Greg Kroah-Hartman --- drivers/tty/hvc/hvc_console.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c index cd1f657f782df2..13c663a154c4e8 100644 --- a/drivers/tty/hvc/hvc_console.c +++ b/drivers/tty/hvc/hvc_console.c @@ -543,10 +543,10 @@ static ssize_t hvc_write(struct tty_struct *tty, const u8 *buf, size_t count) } /* - * Racy, but harmless, kick thread if there is still pending data. + * Kick thread to flush if there's still pending data + * or to wakeup the write queue. */ - if (hp->n_outbuf) - hvc_kick(); + hvc_kick(); return written; } From 2c6a28f3ef729ed2d5b174b4e0f33172fb286bab Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 14 Aug 2025 18:31:42 -0700 Subject: [PATCH 0199/3206] x86/Kconfig: Clean up LLVM version checks in IBT configurations The minimum supported version of LLVM for building the x86 kernel was bumped to 15.0.0 in 7861640aac52 ("x86/build: Raise the minimum LLVM version to 15.0.0"), so the checks for Clang 14.0.0 and ld.lld 14.0.0 or newer will always be true. Clean them up.
Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250814-x86-min-ver-cleanups-v1-6-ff7f19457523@kernel.org --- arch/x86/Kconfig | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100eb..85b91267c01ce7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1753,11 +1753,7 @@ config X86_UMIP config CC_HAS_IBT # GCC >= 9 and binutils >= 2.29 # Retpoline check to work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93654 - # Clang/LLVM >= 14 - # https://github.com/llvm/llvm-project/commit/e0b89df2e0f0130881bf6c39bf31d7f6aac00e0f - # https://github.com/llvm/llvm-project/commit/dfcf69770bc522b9e411c66454934a37c1f35332 - def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || \ - (CC_IS_CLANG && CLANG_VERSION >= 140000)) && \ + def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || CC_IS_CLANG) && \ $(as-instr,endbr64) config X86_CET @@ -1769,8 +1765,6 @@ config X86_KERNEL_IBT prompt "Indirect Branch Tracking" def_bool y depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL - # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f - depends on !LD_IS_LLD || LLD_VERSION >= 140000 select OBJTOOL select X86_CET help From 292cb391479d50f4379a0abab34324de92c82a92 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 13 Aug 2025 23:03:52 -0700 Subject: [PATCH 0200/3206] software node: Constify node_group in registration functions The software_node_register_node_group() and software_node_unregister_node_group() functions take in essence an array of pointers to software_node structs. Since the functions do not modify the array declare the argument as constant, so that static arrays can be declared as const and annotated as __initconst. Signed-off-by: Dmitry Torokhov Link: https://lore.kernel.org/r/2zny5grbgtwbplynxffxg6dkgjgqf45aigwmgxio5stesdr3wi@gf2zamk5amic Signed-off-by: Greg Kroah-Hartman --- drivers/base/swnode.c | 5 ++--- include/linux/property.h | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index deda7f35a05987..be1e9e61a7bf4d 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -844,7 +844,7 @@ swnode_register(const struct software_node *node, struct swnode *parent, * of this function or by ordering the array such that parent comes before * child. */ -int software_node_register_node_group(const struct software_node **node_group) +int software_node_register_node_group(const struct software_node * const *node_group) { unsigned int i; int ret; @@ -877,8 +877,7 @@ EXPORT_SYMBOL_GPL(software_node_register_node_group); * remove the nodes individually, in the correct order (child before * parent). 
*/ -void software_node_unregister_node_group( - const struct software_node **node_group) +void software_node_unregister_node_group(const struct software_node * const *node_group) { unsigned int i = 0; diff --git a/include/linux/property.h b/include/linux/property.h index 82f0cb3abd1e22..d1e80b3c9918b3 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -574,8 +574,8 @@ const struct software_node * software_node_find_by_name(const struct software_node *parent, const char *name); -int software_node_register_node_group(const struct software_node **node_group); -void software_node_unregister_node_group(const struct software_node **node_group); +int software_node_register_node_group(const struct software_node * const *node_group); +void software_node_unregister_node_group(const struct software_node * const *node_group); int software_node_register(const struct software_node *node); void software_node_unregister(const struct software_node *node); From f63aaf6e71de897954fbde4e4a17a9dcdbe5e7e1 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 13 Aug 2025 10:20:22 +0200 Subject: [PATCH 0201/3206] clk: renesas: mstp: Add genpd OF provider at postcore_initcall() Genpd OF providers must now be registered after genpd bus registration. However, cpg_mstp_add_clk_domain() is only called from CLK_OF_DECLARE(), which is too early. Hence on R-Car M1A, R-Car H1, and RZ/A1, the CPG/MSTP Clock Domain fails to register, and any devices residing in that clock domain fail to probe. Fix this by splitting initialization into two steps: - The first part keeps on registering the PM domain with genpd at CLK_OF_DECLARE(), - The second and new part moves the registration of the genpd OF provider to a postcore_initcall(). See also commit c5ae5a0c61120d0c ("pmdomain: renesas: rcar-sysc: Add genpd OF provider at postcore_initcall"). 
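The split-registration pattern, distilled into a generic sketch (hypothetical names, error handling trimmed; the actual driver change is in the diff below):

    static struct generic_pm_domain *my_pd __initdata;
    static struct device_node *my_np __initdata;

    /* Step 1: runs from CLK_OF_DECLARE(), before any initcall level */
    static void __init my_domain_early_init(struct device_node *np)
    {
            my_pd = kzalloc(sizeof(*my_pd), GFP_KERNEL);
            if (!my_pd)
                    return;
            pm_genpd_init(my_pd, &pm_domain_always_on_gov, false);
            my_np = of_node_get(np);        /* keep the node for step 2 */
    }

    /* Step 2: postcore_initcall() runs once the genpd bus is registered */
    static int __init my_domain_add_provider(void)
    {
            int error;

            if (!my_np)
                    return -ENODEV;
            error = of_genpd_add_provider_simple(my_np, my_pd);
            of_node_put(my_np);
            return error;
    }
    postcore_initcall(my_domain_add_provider);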
Fixes: 18a3a510ecfd0e50 ("pmdomain: core: Add the genpd->dev to the genpd provider bus") Signed-off-by: Geert Uytterhoeven Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/81ef5f8d5d31374b7852b05453c52d2f735062a2.1755073087.git.geert+renesas@glider.be --- drivers/clk/renesas/clk-mstp.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/clk/renesas/clk-mstp.c b/drivers/clk/renesas/clk-mstp.c index 5bc473c2adb33a..2f65fe2c6bdf07 100644 --- a/drivers/clk/renesas/clk-mstp.c +++ b/drivers/clk/renesas/clk-mstp.c @@ -303,6 +303,9 @@ void cpg_mstp_detach_dev(struct generic_pm_domain *unused, struct device *dev) pm_clk_destroy(dev); } +static struct device_node *cpg_mstp_pd_np __initdata = NULL; +static struct generic_pm_domain *cpg_mstp_pd_genpd __initdata = NULL; + void __init cpg_mstp_add_clk_domain(struct device_node *np) { struct generic_pm_domain *pd; @@ -324,5 +327,20 @@ void __init cpg_mstp_add_clk_domain(struct device_node *np) pd->detach_dev = cpg_mstp_detach_dev; pm_genpd_init(pd, &pm_domain_always_on_gov, false); - of_genpd_add_provider_simple(np, pd); + cpg_mstp_pd_np = of_node_get(np); + cpg_mstp_pd_genpd = pd; +} + +static int __init cpg_mstp_pd_init_provider(void) +{ + int error; + + if (!cpg_mstp_pd_np) + return -ENODEV; + + error = of_genpd_add_provider_simple(cpg_mstp_pd_np, cpg_mstp_pd_genpd); + + of_node_put(cpg_mstp_pd_np); + return error; } +postcore_initcall(cpg_mstp_pd_init_provider); From 758e743362cdc0cc45d8717431b2eabf29084ef1 Mon Sep 17 00:00:00 2001 From: Pawel Zalewski Date: Wed, 23 Jul 2025 11:02:08 +0100 Subject: [PATCH 0202/3206] leds: leds-is31fl32xx: Add support for is31fl3236a Also add support for an optional control register that sets the output PWM frequency to 22kHz. The default is 3kHz, and this option puts the operational frequency outside the audible range.
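For context on the frequency choice (a hedged aside, not text from the patch): the audible band extends to roughly 20kHz, so a 3kHz PWM carrier can make ceramic capacitors and inductors on the LED rails whine audibly, while 22kHz sits just above it. The new register is only written on chip variants that have it and when the board opts in via the "issi,22khz-pwm" property, as the diff below shows:

    /* Optional per-variant register: write only when present and requested */
    if (cdef->output_frequency_setting_reg != IS31FL32XX_REG_NONE &&
        of_property_read_bool(dev_of_node(dev), "issi,22khz-pwm"))
            ret = is31fl32xx_write(priv, cdef->output_frequency_setting_reg,
                                   IS31FL32XX_PWM_FREQUENCY_22KHZ);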
Signed-off-by: Pawel Zalewski Link: https://lore.kernel.org/r/20250723-leds-is31fl3236a-v6-3-210328058625@thegoodpenguin.co.uk Signed-off-by: Lee Jones --- drivers/leds/leds-is31fl32xx.c | 47 +++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/drivers/leds/leds-is31fl32xx.c b/drivers/leds/leds-is31fl32xx.c index 8793330dd4142f..dc9349f9d3501b 100644 --- a/drivers/leds/leds-is31fl32xx.c +++ b/drivers/leds/leds-is31fl32xx.c @@ -32,6 +32,8 @@ #define IS31FL3216_CONFIG_SSD_ENABLE BIT(7) #define IS31FL3216_CONFIG_SSD_DISABLE 0 +#define IS31FL32XX_PWM_FREQUENCY_22KHZ 0x01 + struct is31fl32xx_priv; struct is31fl32xx_led_data { struct led_classdev cdev; @@ -53,6 +55,7 @@ struct is31fl32xx_priv { * @pwm_update_reg : address of PWM Update register * @global_control_reg : address of Global Control register (optional) * @reset_reg : address of Reset register (optional) + * @output_frequency_setting_reg: address of output frequency register (optional) * @pwm_register_base : address of first PWM register * @pwm_registers_reversed: : true if PWM registers count down instead of up * @led_control_register_base : address of first LED control register (optional) @@ -76,6 +79,7 @@ struct is31fl32xx_chipdef { u8 pwm_update_reg; u8 global_control_reg; u8 reset_reg; + u8 output_frequency_setting_reg; u8 pwm_register_base; bool pwm_registers_reversed; u8 led_control_register_base; @@ -90,6 +94,19 @@ static const struct is31fl32xx_chipdef is31fl3236_cdef = { .pwm_update_reg = 0x25, .global_control_reg = 0x4a, .reset_reg = 0x4f, + .output_frequency_setting_reg = IS31FL32XX_REG_NONE, + .pwm_register_base = 0x01, + .led_control_register_base = 0x26, + .enable_bits_per_led_control_register = 1, +}; + +static const struct is31fl32xx_chipdef is31fl3236a_cdef = { + .channels = 36, + .shutdown_reg = 0x00, + .pwm_update_reg = 0x25, + .global_control_reg = 0x4a, + .reset_reg = 0x4f, + .output_frequency_setting_reg = 0x4b, .pwm_register_base = 0x01, .led_control_register_base = 0x26, .enable_bits_per_led_control_register = 1, @@ -101,6 +118,7 @@ static const struct is31fl32xx_chipdef is31fl3235_cdef = { .pwm_update_reg = 0x25, .global_control_reg = 0x4a, .reset_reg = 0x4f, + .output_frequency_setting_reg = IS31FL32XX_REG_NONE, .pwm_register_base = 0x05, .led_control_register_base = 0x2a, .enable_bits_per_led_control_register = 1, @@ -112,6 +130,7 @@ static const struct is31fl32xx_chipdef is31fl3218_cdef = { .pwm_update_reg = 0x16, .global_control_reg = IS31FL32XX_REG_NONE, .reset_reg = 0x17, + .output_frequency_setting_reg = IS31FL32XX_REG_NONE, .pwm_register_base = 0x01, .led_control_register_base = 0x13, .enable_bits_per_led_control_register = 6, @@ -126,6 +145,7 @@ static const struct is31fl32xx_chipdef is31fl3216_cdef = { .pwm_update_reg = 0xB0, .global_control_reg = IS31FL32XX_REG_NONE, .reset_reg = IS31FL32XX_REG_NONE, + .output_frequency_setting_reg = IS31FL32XX_REG_NONE, .pwm_register_base = 0x10, .pwm_registers_reversed = true, .led_control_register_base = 0x01, @@ -363,8 +383,21 @@ static struct is31fl32xx_led_data *is31fl32xx_find_led_data( static int is31fl32xx_parse_dt(struct device *dev, struct is31fl32xx_priv *priv) { + const struct is31fl32xx_chipdef *cdef = priv->cdef; int ret = 0; + if ((cdef->output_frequency_setting_reg != IS31FL32XX_REG_NONE) && + of_property_read_bool(dev_of_node(dev), "issi,22khz-pwm")) { + + ret = is31fl32xx_write(priv, cdef->output_frequency_setting_reg, + IS31FL32XX_PWM_FREQUENCY_22KHZ); + + if (ret) { + dev_err(dev, "Failed to write 
output PWM frequency register\n"); + return ret; + } + } + for_each_available_child_of_node_scoped(dev_of_node(dev), child) { struct led_init_data init_data = {}; struct is31fl32xx_led_data *led_data = @@ -404,12 +437,13 @@ static int is31fl32xx_parse_dt(struct device *dev, } static const struct of_device_id of_is31fl32xx_match[] = { - { .compatible = "issi,is31fl3236", .data = &is31fl3236_cdef, }, - { .compatible = "issi,is31fl3235", .data = &is31fl3235_cdef, }, - { .compatible = "issi,is31fl3218", .data = &is31fl3218_cdef, }, - { .compatible = "si-en,sn3218", .data = &is31fl3218_cdef, }, - { .compatible = "issi,is31fl3216", .data = &is31fl3216_cdef, }, - { .compatible = "si-en,sn3216", .data = &is31fl3216_cdef, }, + { .compatible = "issi,is31fl3236", .data = &is31fl3236_cdef, }, + { .compatible = "issi,is31fl3236a", .data = &is31fl3236a_cdef, }, + { .compatible = "issi,is31fl3235", .data = &is31fl3235_cdef, }, + { .compatible = "issi,is31fl3218", .data = &is31fl3218_cdef, }, + { .compatible = "si-en,sn3218", .data = &is31fl3218_cdef, }, + { .compatible = "issi,is31fl3216", .data = &is31fl3216_cdef, }, + { .compatible = "si-en,sn3216", .data = &is31fl3216_cdef, }, {}, }; @@ -466,6 +500,7 @@ static void is31fl32xx_remove(struct i2c_client *client) */ static const struct i2c_device_id is31fl32xx_id[] = { { "is31fl3236" }, + { "is31fl3236a" }, { "is31fl3235" }, { "is31fl3218" }, { "sn3218" }, From f4fc2d87aa167e3da5333c3db6cf702db1f98c05 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 24 Jul 2025 12:20:30 +0100 Subject: [PATCH 0203/3206] leds: Kconfig: Fix spelling mistake "limitiation" -> "limitation" There is a spelling mistake in the LEDS_BD2606MVV config. Fix it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20250724112030.142121-1-colin.i.king@gmail.com Signed-off-by: Lee Jones --- drivers/leds/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 6e3dce7e35a490..06e6291be11b2f 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -674,7 +674,7 @@ config LEDS_BD2606MVV help This option enables support for BD2606MVV LED driver chips accessed via the I2C bus. It supports setting brightness, with - the limitiation that there are groups of two channels sharing + the limitation that there are groups of two channels sharing a brightness setting, but not the on/off setting. To compile this driver as a module, choose M here: the module will From 6e3779e3c6f9dcc9267bf98bef70773a0b13dcbb Mon Sep 17 00:00:00 2001 From: Len Bao Date: Sun, 27 Jul 2025 07:56:45 +0000 Subject: [PATCH 0204/3206] leds: max77705: Function return instead of variable assignment Coverity noticed that assigning the value -EINVAL to 'ret' in the if statement is useless because 'ret' is overwritten a few lines later. However, after inspecting the code, this warning reveals that we need to return -EINVAL instead of assigning it. So, fix it.
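The pattern Coverity flagged, distilled (a sketch around the code in question; only the return statement changes in the fix):

    ret = fwnode_property_read_u32(np, "reg", &reg);
    if (ret || reg >= MAX77705_LED_NUM_LEDS)
            ret = -EINVAL;          /* dead store: 'ret' is reassigned below */

    info = devm_kcalloc(dev, num_channels, sizeof(*info), GFP_KERNEL);
    if (!info)
            return -ENOMEM;         /* ...and the -EINVAL is silently lost */

Returning -EINVAL immediately makes the validation take effect instead of letting an out-of-range 'reg' be used later.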
Coverity-id: 1646104 Fixes: aebb5fc9a0d8 ("leds: max77705: Add LEDs support") Signed-off-by: Len Bao Link: https://lore.kernel.org/r/20250727075649.34496-1-len.bao@gmx.us Signed-off-by: Lee Jones --- drivers/leds/leds-max77705.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/leds-max77705.c b/drivers/leds/leds-max77705.c index 933cb4f19be9bc..b7403b3fcf5e72 100644 --- a/drivers/leds/leds-max77705.c +++ b/drivers/leds/leds-max77705.c @@ -180,7 +180,7 @@ static int max77705_add_led(struct device *dev, struct regmap *regmap, struct fw ret = fwnode_property_read_u32(np, "reg", ®); if (ret || reg >= MAX77705_LED_NUM_LEDS) - ret = -EINVAL; + return -EINVAL; info = devm_kcalloc(dev, num_channels, sizeof(*info), GFP_KERNEL); if (!info) From 5974e8f6c3e47ab097c3dd8ece7324d1f88fe739 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Tue, 29 Jul 2025 12:51:22 +0800 Subject: [PATCH 0205/3206] leds: flash: leds-qcom-flash: Update torch current clamp setting There is a register to clamp the flash current per LED channel when safety timer is disabled. It needs to be updated according to the maximum torch LED current setting to ensure the torch current won't be clamped unexpectedly. Fixes: 96a2e242a5dc ("leds: flash: Add driver to support flash LED module in QCOM PMICs") Signed-off-by: Fenglin Wu Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20250729-fix-torch-clamp-issue-v2-1-9b83816437a3@oss.qualcomm.com Signed-off-by: Lee Jones --- drivers/leds/flash/leds-qcom-flash.c | 62 ++++++++++++++++------------ 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/drivers/leds/flash/leds-qcom-flash.c b/drivers/leds/flash/leds-qcom-flash.c index 89cf5120f5d55b..db7c2c743adc75 100644 --- a/drivers/leds/flash/leds-qcom-flash.c +++ b/drivers/leds/flash/leds-qcom-flash.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2022, 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * Copyright (c) 2022, 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved. 
*/ #include @@ -114,36 +114,39 @@ enum { REG_THERM_THRSH1, REG_THERM_THRSH2, REG_THERM_THRSH3, + REG_TORCH_CLAMP, REG_MAX_COUNT, }; static const struct reg_field mvflash_3ch_regs[REG_MAX_COUNT] = { - REG_FIELD(0x08, 0, 7), /* status1 */ - REG_FIELD(0x09, 0, 7), /* status2 */ - REG_FIELD(0x0a, 0, 7), /* status3 */ - REG_FIELD_ID(0x40, 0, 7, 3, 1), /* chan_timer */ - REG_FIELD_ID(0x43, 0, 6, 3, 1), /* itarget */ - REG_FIELD(0x46, 7, 7), /* module_en */ - REG_FIELD(0x47, 0, 5), /* iresolution */ - REG_FIELD_ID(0x49, 0, 2, 3, 1), /* chan_strobe */ - REG_FIELD(0x4c, 0, 2), /* chan_en */ - REG_FIELD(0x56, 0, 2), /* therm_thrsh1 */ - REG_FIELD(0x57, 0, 2), /* therm_thrsh2 */ - REG_FIELD(0x58, 0, 2), /* therm_thrsh3 */ + [REG_STATUS1] = REG_FIELD(0x08, 0, 7), + [REG_STATUS2] = REG_FIELD(0x09, 0, 7), + [REG_STATUS3] = REG_FIELD(0x0a, 0, 7), + [REG_CHAN_TIMER] = REG_FIELD_ID(0x40, 0, 7, 3, 1), + [REG_ITARGET] = REG_FIELD_ID(0x43, 0, 6, 3, 1), + [REG_MODULE_EN] = REG_FIELD(0x46, 7, 7), + [REG_IRESOLUTION] = REG_FIELD(0x47, 0, 5), + [REG_CHAN_STROBE] = REG_FIELD_ID(0x49, 0, 2, 3, 1), + [REG_CHAN_EN] = REG_FIELD(0x4c, 0, 2), + [REG_THERM_THRSH1] = REG_FIELD(0x56, 0, 2), + [REG_THERM_THRSH2] = REG_FIELD(0x57, 0, 2), + [REG_THERM_THRSH3] = REG_FIELD(0x58, 0, 2), + [REG_TORCH_CLAMP] = REG_FIELD(0xec, 0, 6), }; static const struct reg_field mvflash_4ch_regs[REG_MAX_COUNT] = { - REG_FIELD(0x06, 0, 7), /* status1 */ - REG_FIELD(0x07, 0, 6), /* status2 */ - REG_FIELD(0x09, 0, 7), /* status3 */ - REG_FIELD_ID(0x3e, 0, 7, 4, 1), /* chan_timer */ - REG_FIELD_ID(0x42, 0, 6, 4, 1), /* itarget */ - REG_FIELD(0x46, 7, 7), /* module_en */ - REG_FIELD(0x49, 0, 3), /* iresolution */ - REG_FIELD_ID(0x4a, 0, 6, 4, 1), /* chan_strobe */ - REG_FIELD(0x4e, 0, 3), /* chan_en */ - REG_FIELD(0x7a, 0, 2), /* therm_thrsh1 */ - REG_FIELD(0x78, 0, 2), /* therm_thrsh2 */ + [REG_STATUS1] = REG_FIELD(0x06, 0, 7), + [REG_STATUS2] = REG_FIELD(0x07, 0, 6), + [REG_STATUS3] = REG_FIELD(0x09, 0, 7), + [REG_CHAN_TIMER] = REG_FIELD_ID(0x3e, 0, 7, 4, 1), + [REG_ITARGET] = REG_FIELD_ID(0x42, 0, 6, 4, 1), + [REG_MODULE_EN] = REG_FIELD(0x46, 7, 7), + [REG_IRESOLUTION] = REG_FIELD(0x49, 0, 3), + [REG_CHAN_STROBE] = REG_FIELD_ID(0x4a, 0, 6, 4, 1), + [REG_CHAN_EN] = REG_FIELD(0x4e, 0, 3), + [REG_THERM_THRSH1] = REG_FIELD(0x7a, 0, 2), + [REG_THERM_THRSH2] = REG_FIELD(0x78, 0, 2), + [REG_TORCH_CLAMP] = REG_FIELD(0xed, 0, 6), }; struct qcom_flash_data { @@ -156,6 +159,7 @@ struct qcom_flash_data { u8 max_channels; u8 chan_en_bits; u8 revision; + u8 torch_clamp; }; struct qcom_flash_led { @@ -702,6 +706,7 @@ static int qcom_flash_register_led_device(struct device *dev, u32 current_ua, timeout_us; u32 channels[4]; int i, rc, count; + u8 torch_clamp; count = fwnode_property_count_u32(node, "led-sources"); if (count <= 0) { @@ -751,6 +756,12 @@ static int qcom_flash_register_led_device(struct device *dev, current_ua = min_t(u32, current_ua, TORCH_CURRENT_MAX_UA * led->chan_count); led->max_torch_current_ma = current_ua / UA_PER_MA; + torch_clamp = (current_ua / led->chan_count) / TORCH_IRES_UA; + if (torch_clamp != 0) + torch_clamp--; + + flash_data->torch_clamp = max_t(u8, flash_data->torch_clamp, torch_clamp); + if (fwnode_property_present(node, "flash-max-microamp")) { flash->led_cdev.flags |= LED_DEV_CAP_FLASH; @@ -917,8 +928,7 @@ static int qcom_flash_led_probe(struct platform_device *pdev) flash_data->leds_count++; } - return 0; - + return regmap_field_write(flash_data->r_fields[REG_TORCH_CLAMP], flash_data->torch_clamp); release: while 
(flash_data->v4l2_flash[flash_data->leds_count] && flash_data->leds_count) v4l2_flash_release(flash_data->v4l2_flash[flash_data->leds_count--]); From 7d5c3cac1f395324673e9ec32b65e4f6ff23fcb9 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Tue, 29 Jul 2025 12:51:23 +0800 Subject: [PATCH 0206/3206] leds: flash: leds-qcom-flash: Add a separate register map for PMI8998 The 3-channel flash module in PMI8998 has several registers different than the others, such as: torch_clamp. Add different register fields for it. Signed-off-by: Fenglin Wu Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20250729-fix-torch-clamp-issue-v2-2-9b83816437a3@oss.qualcomm.com Signed-off-by: Lee Jones --- drivers/leds/flash/leds-qcom-flash.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/leds/flash/leds-qcom-flash.c b/drivers/leds/flash/leds-qcom-flash.c index db7c2c743adc75..b03a6833e3e3a0 100644 --- a/drivers/leds/flash/leds-qcom-flash.c +++ b/drivers/leds/flash/leds-qcom-flash.c @@ -118,6 +118,22 @@ enum { REG_MAX_COUNT, }; +static const struct reg_field mvflash_3ch_pmi8998_regs[REG_MAX_COUNT] = { + [REG_STATUS1] = REG_FIELD(0x08, 0, 5), + [REG_STATUS2] = REG_FIELD(0x09, 0, 7), + [REG_STATUS3] = REG_FIELD(0x0a, 0, 7), + [REG_CHAN_TIMER] = REG_FIELD_ID(0x40, 0, 7, 3, 1), + [REG_ITARGET] = REG_FIELD_ID(0x43, 0, 6, 3, 1), + [REG_MODULE_EN] = REG_FIELD(0x46, 7, 7), + [REG_IRESOLUTION] = REG_FIELD(0x47, 0, 5), + [REG_CHAN_STROBE] = REG_FIELD_ID(0x49, 0, 2, 3, 1), + [REG_CHAN_EN] = REG_FIELD(0x4c, 0, 2), + [REG_THERM_THRSH1] = REG_FIELD(0x56, 0, 2), + [REG_THERM_THRSH2] = REG_FIELD(0x57, 0, 2), + [REG_THERM_THRSH3] = REG_FIELD(0x58, 0, 2), + [REG_TORCH_CLAMP] = REG_FIELD(0xea, 0, 6), +}; + static const struct reg_field mvflash_3ch_regs[REG_MAX_COUNT] = { [REG_STATUS1] = REG_FIELD(0x08, 0, 7), [REG_STATUS2] = REG_FIELD(0x09, 0, 7), @@ -862,13 +878,20 @@ static int qcom_flash_led_probe(struct platform_device *pdev) return rc; } - if (val == FLASH_SUBTYPE_3CH_PM8150_VAL || val == FLASH_SUBTYPE_3CH_PMI8998_VAL) { + if (val == FLASH_SUBTYPE_3CH_PM8150_VAL) { flash_data->hw_type = QCOM_MVFLASH_3CH; flash_data->max_channels = 3; regs = devm_kmemdup(dev, mvflash_3ch_regs, sizeof(mvflash_3ch_regs), GFP_KERNEL); if (!regs) return -ENOMEM; + } else if (val == FLASH_SUBTYPE_3CH_PMI8998_VAL) { + flash_data->hw_type = QCOM_MVFLASH_3CH; + flash_data->max_channels = 3; + regs = devm_kmemdup(dev, mvflash_3ch_pmi8998_regs, + sizeof(mvflash_3ch_pmi8998_regs), GFP_KERNEL); + if (!regs) + return -ENOMEM; } else if (val == FLASH_SUBTYPE_4CH_VAL) { flash_data->hw_type = QCOM_MVFLASH_4CH; flash_data->max_channels = 4; From 5f755ba95ae10fd4fa28d64345056ffc18d12c5a Mon Sep 17 00:00:00 2001 From: Erick Shepherd Date: Thu, 24 Jul 2025 13:53:54 -0500 Subject: [PATCH 0207/3206] mmc: sdhci: Disable SD card clock before changing parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the SD Host Controller Simplified Specification v4.20 §3.2.3, change the SD card clock parameters only after first disabling the external card clock. Doing this fixes a spurious clock pulse on Baytrail and Apollo Lake SD controllers which otherwise breaks voltage switching with a specific Swissbit SD card. This change is limited to Intel host controllers to avoid an issue reported on ARM64 devices. 
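In outline, the sequence the specification calls for (a conceptual sketch; steps 2-4 happen inside sdhci_set_clock(), and the new wrapper in the diff below only adds step 1):

    u16 clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL);

    /* 1. Gate the card clock before changing any clock parameters */
    if (clk & SDHCI_CLOCK_CARD_EN)
            sdhci_writew(host, clk & ~SDHCI_CLOCK_CARD_EN, SDHCI_CLOCK_CONTROL);

    /* 2. Program the new divider, 3. wait for the internal clock to
     * report stable, 4. re-enable SDHCI_CLOCK_CARD_EN */
    sdhci_set_clock(host, clock);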
Signed-off-by: Kyle Roeschley Signed-off-by: Brad Mouring Signed-off-by: Erick Shepherd Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20250724185354.815888-1-erick.shepherd@ni.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-core.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index 826958992dfe26..47a0a738862b58 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -679,8 +679,19 @@ static int intel_start_signal_voltage_switch(struct mmc_host *mmc, return 0; } +static void sdhci_intel_set_clock(struct sdhci_host *host, unsigned int clock) +{ + u16 clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL); + + /* Stop card clock separately to avoid glitches on clock line */ + if (clk & SDHCI_CLOCK_CARD_EN) + sdhci_writew(host, clk & ~SDHCI_CLOCK_CARD_EN, SDHCI_CLOCK_CONTROL); + + sdhci_set_clock(host, clock); +} + static const struct sdhci_ops sdhci_intel_byt_ops = { - .set_clock = sdhci_set_clock, + .set_clock = sdhci_intel_set_clock, .set_power = sdhci_intel_set_power, .enable_dma = sdhci_pci_enable_dma, .set_bus_width = sdhci_set_bus_width, @@ -690,7 +701,7 @@ static const struct sdhci_ops sdhci_intel_byt_ops = { }; static const struct sdhci_ops sdhci_intel_glk_ops = { - .set_clock = sdhci_set_clock, + .set_clock = sdhci_intel_set_clock, .set_power = sdhci_intel_set_power, .enable_dma = sdhci_pci_enable_dma, .set_bus_width = sdhci_set_bus_width, From d76ae3c79ca9044191f2e2a8c85326869a72fa39 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 24 Jul 2025 12:28:17 +0100 Subject: [PATCH 0208/3206] mmc: Kconfig: Fix spelling mistake "referrered" -> "referred" There are two spelling mistakes in the config. Fix them. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20250724112817.142784-1-colin.i.king@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 7232de1c068873..4afa0130779d97 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -359,7 +359,7 @@ config MMC_SDHCI_S3C depends on PLAT_SAMSUNG || ARCH_S5PV210 || ARCH_EXYNOS || COMPILE_TEST help This selects the Secure Digital Host Controller Interface (SDHCI) - often referrered to as the HSMMC block in some of the Samsung + often referred to as the HSMMC block in some of the Samsung S3C6410, S5Pv210 and Exynos (Exynso4210, Exynos4412) SoCs. If you have a controller with this interface (thereforeyou build for @@ -401,7 +401,7 @@ config MMC_SDHCI_SPEAR depends on OF help This selects the Secure Digital Host Controller Interface (SDHCI) - often referrered to as the HSMMC block in some of the ST SPEAR range + often referred to as the HSMMC block in some of the ST SPEAR range of SoC If you have a controller with this interface, say Y or M here. From 4591511daba4e7e9a8452b116c3f7ea5718c60ce Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 31 Jul 2025 10:41:16 +0100 Subject: [PATCH 0209/3206] mmc: davinci: Remove space before newline There is an extraneous space before a newline in a dev_err message. Remove the space.
Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20250731094116.2163061-1-colin.i.king@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/davinci_mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c index c691f1b603953f..2a3c8058b0fbf3 100644 --- a/drivers/mmc/host/davinci_mmc.c +++ b/drivers/mmc/host/davinci_mmc.c @@ -588,7 +588,7 @@ static void mmc_davinci_request(struct mmc_host *mmc, struct mmc_request *req) cpu_relax(); } if (mmcst1 & MMCST1_BUSY) { - dev_err(mmc_dev(host->mmc), "still BUSY? bad ... \n"); + dev_err(mmc_dev(host->mmc), "still BUSY? bad ...\n"); req->cmd->error = -ETIMEDOUT; mmc_request_done(mmc, req); return; From 0caebd1658bbb3796bec4e3449876742d29701ab Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Fri, 25 Jul 2025 08:01:51 +0200 Subject: [PATCH 0210/3206] dt-bindings: mmc: fsl,esdhc: Add explicit reference to mmc-controller-common Even though it is referenced by mmc/mmc-controller.yaml it still raises the warning: esdhc@1560000 (fsl,ls1021a-esdhc): Unevaluated properties are not allowed ('bus-width' was unexpected) Adding an explicit reference fixes this. Signed-off-by: Alexander Stein Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250725060152.262094-1-alexander.stein@ew.tq-group.com Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/fsl,esdhc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/mmc/fsl,esdhc.yaml b/Documentation/devicetree/bindings/mmc/fsl,esdhc.yaml index 62087cf920df8f..f45e592901e24b 100644 --- a/Documentation/devicetree/bindings/mmc/fsl,esdhc.yaml +++ b/Documentation/devicetree/bindings/mmc/fsl,esdhc.yaml @@ -90,6 +90,7 @@ required: allOf: - $ref: sdhci-common.yaml# + - $ref: mmc-controller-common.yaml# unevaluatedProperties: false From c33aca6c44c081b6aaff8f203d408688859a81fb Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sun, 22 Jun 2025 18:19:43 +0200 Subject: [PATCH 0211/3206] MAINTAINERS: EDAC: Drop inactive reviewers I know y'all busy with other stuff and don't have time for EDAC review. Signed-off-by: Borislav Petkov (AMD) Acked-by: James Morse Link: https://lore.kernel.org/20250622161943.4700-1-bp@kernel.org --- MAINTAINERS | 3 --- 1 file changed, 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..dbbeabfc8a4717 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8726,9 +8726,6 @@ F: drivers/edac/thunderx_edac* EDAC-CORE M: Borislav Petkov M: Tony Luck -R: James Morse -R: Mauro Carvalho Chehab -R: Robert Richter L: linux-edac@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next From 550bc517e59347b3b1af7d290eac4fb1411a3d4e Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sun, 17 Aug 2025 22:23:27 +0800 Subject: [PATCH 0212/3206] regulator: bd718x7: Use kcalloc() instead of kzalloc() Replace calls of 'devm_kzalloc(dev, count * sizeof([type]), flags)' with 'devm_kcalloc(dev, count, sizeof([type]), flags)' in setup_feedback_loop() for safer memory allocation with built-in overflow protection. 
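The difference in one line (sketch; 'n' stands for desc->n_linear_ranges from the diff below):

    /* n * sizeof(...) is evaluated first and can silently wrap around */
    new = devm_kzalloc(dev, n * sizeof(struct linear_range), GFP_KERNEL);

    /* devm_kcalloc() checks the multiplication and returns NULL on overflow */
    new = devm_kcalloc(dev, n, sizeof(struct linear_range), GFP_KERNEL);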
Signed-off-by: Qianfeng Rong Reviewed-by: Matti Vaittinen Link: https://patch.msgid.link/20250817142327.174531-1-rongqianfeng@vivo.com Signed-off-by: Mark Brown --- drivers/regulator/bd718x7-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/bd718x7-regulator.c b/drivers/regulator/bd718x7-regulator.c index e803cc59d68a5c..022d98f3c32a2d 100644 --- a/drivers/regulator/bd718x7-regulator.c +++ b/drivers/regulator/bd718x7-regulator.c @@ -1598,7 +1598,7 @@ static int setup_feedback_loop(struct device *dev, struct device_node *np, if (desc->n_linear_ranges && desc->linear_ranges) { struct linear_range *new; - new = devm_kzalloc(dev, desc->n_linear_ranges * + new = devm_kcalloc(dev, desc->n_linear_ranges, sizeof(struct linear_range), GFP_KERNEL); if (!new) From 07826c02eda9da406524bd50cd3fd321be8c9447 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 15 Aug 2025 11:44:00 -0500 Subject: [PATCH 0213/3206] MAINTAINERS: merge TRIGGER SOURCE sections Merge all of Documentation/devicetree/bindings/trigger-source/* as a single entry. In the previous merge cycle, there was a merge conflict between the IIO and SPI trees and we ended up with the current state, but the intention was to have a single entry for all trigger-source bindings. Signed-off-by: David Lechner Link: https://patch.msgid.link/20250815-spi-offload-trigger-followup-v1-1-8ec5413a8383@baylibre.com Signed-off-by: Mark Brown --- MAINTAINERS | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index daf520a13bdf6a..eb3f98f512ade9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -25601,16 +25601,10 @@ W: https://github.com/srcres258/linux-doc T: git https://github.com/srcres258/linux-doc.git doc-zh-tw F: Documentation/translations/zh_TW/ -TRIGGER SOURCE - ADI UTIL SIGMA DELTA SPI -M: David Lechner -S: Maintained -F: Documentation/devicetree/bindings/trigger-source/adi,util-sigma-delta-spi.yaml - TRIGGER SOURCE M: David Lechner S: Maintained -F: Documentation/devicetree/bindings/trigger-source/gpio-trigger.yaml -F: Documentation/devicetree/bindings/trigger-source/pwm-trigger.yaml +F: Documentation/devicetree/bindings/trigger-source/* TRUSTED SECURITY MODULE (TSM) INFRASTRUCTURE M: Dan Williams From 0056b410355713556d8a10306f82e55b28d33ba8 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 15 Aug 2025 11:44:01 -0500 Subject: [PATCH 0214/3206] spi: offload trigger: adi-util-sigma-delta: clean up imports Clean up imports using include-what-you-use principles. 
Suggested-by: Andy Shevchenko Signed-off-by: David Lechner Link: https://patch.msgid.link/20250815-spi-offload-trigger-followup-v1-2-8ec5413a8383@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c b/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c index 035d088d4d33d6..8468c773713a3d 100644 --- a/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c +++ b/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c @@ -5,12 +5,15 @@ */ #include -#include +#include +#include #include #include #include #include #include +#include +#include static bool adi_util_sigma_delta_match(struct spi_offload_trigger *trigger, enum spi_offload_trigger_type type, From a750050349ea138e3e86c66a8a41de736619b9de Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Fri, 15 Aug 2025 16:21:15 +0800 Subject: [PATCH 0215/3206] spi: spi-fsl-lpspi: use min_t() to improve code Use min_t() to reduce the code in fsl_lpspi_setup_transfer() and improve its readability. Signed-off-by: Qianfeng Rong Link: https://patch.msgid.link/20250815082118.586422-2-rongqianfeng@vivo.com Reviewed-by: Frank Li Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-lpspi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c index 67d4000c3cef5b..f5595e2ba2d560 100644 --- a/drivers/spi/spi-fsl-lpspi.c +++ b/drivers/spi/spi-fsl-lpspi.c @@ -25,6 +25,7 @@ #include #include #include +#include #define DRIVER_NAME "fsl_lpspi" @@ -475,10 +476,9 @@ static int fsl_lpspi_setup_transfer(struct spi_controller *controller, fsl_lpspi->tx = fsl_lpspi_buf_tx_u32; } - if (t->len <= fsl_lpspi->txfifosize) - fsl_lpspi->watermark = t->len; - else - fsl_lpspi->watermark = fsl_lpspi->txfifosize; + fsl_lpspi->watermark = min_t(typeof(fsl_lpspi->watermark), + fsl_lpspi->txfifosize, + t->len); if (fsl_lpspi_can_dma(controller, spi, t)) fsl_lpspi->usedma = true; From 1bdc716023a78c2c41fdcb3fc37f09da1be4b7df Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Fri, 15 Aug 2025 16:21:16 +0800 Subject: [PATCH 0216/3206] spi: npcm-fiu: use min_t() to improve code Use min_t() to reduce the code in npcm_fiu_read() and improve its readability. Signed-off-by: Qianfeng Rong Link: https://patch.msgid.link/20250815082118.586422-3-rongqianfeng@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-npcm-fiu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-npcm-fiu.c b/drivers/spi/spi-npcm-fiu.c index 67cc1d86de425e..cccd17f247754d 100644 --- a/drivers/spi/spi-npcm-fiu.c +++ b/drivers/spi/spi-npcm-fiu.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -498,10 +499,7 @@ static int npcm_fiu_read(struct spi_mem *mem, const struct spi_mem_op *op) do { addr = ((u32)op->addr.val + i); - if (currlen < 16) - readlen = currlen; - else - readlen = 16; + readlen = min_t(int, currlen, 16); buf_ptr = data + i; ret = npcm_fiu_uma_read(mem, op, addr, true, buf_ptr, From 90179609efa421b1ccc7d8eafbc078bafb25777c Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Fri, 15 Aug 2025 16:21:17 +0800 Subject: [PATCH 0217/3206] spi: spl022: use min_t() to improve code Use min_t() to reduce the code in setup_dma_scatter() and improve its readability. 
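A note on why these conversions use min_t() rather than plain min() (illustrative sketch): the kernel's min() requires both arguments to have the same type and fails the build otherwise, while min_t() casts both operands to the named type before comparing:

    size_t bytesleft = 4096;
    int chunk = PAGE_SIZE;
    int x;

    x = min(bytesleft, chunk);         /* build error: comparison of distinct types */
    x = min_t(int, bytesleft, chunk);  /* both sides cast to int first */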
Signed-off-by: Qianfeng Rong Link: https://patch.msgid.link/20250815082118.586422-4-rongqianfeng@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-pl022.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index dd87cf4f70dd56..9e56e87746142f 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -33,6 +33,7 @@ #include #include #include +#include /* * This macro is used to define some register default values. @@ -760,10 +761,9 @@ static void setup_dma_scatter(struct pl022 *pl022, * we just feed in this, else we stuff in as much * as we can. */ - if (bytesleft < (PAGE_SIZE - offset_in_page(bufp))) - mapbytes = bytesleft; - else - mapbytes = PAGE_SIZE - offset_in_page(bufp); + mapbytes = min_t(int, bytesleft, + PAGE_SIZE - offset_in_page(bufp)); + sg_set_page(sg, virt_to_page(bufp), mapbytes, offset_in_page(bufp)); bufp += mapbytes; @@ -775,10 +775,7 @@ static void setup_dma_scatter(struct pl022 *pl022, } else { /* Map the dummy buffer on every page */ for_each_sg(sgtab->sgl, sg, sgtab->nents, i) { - if (bytesleft < PAGE_SIZE) - mapbytes = bytesleft; - else - mapbytes = PAGE_SIZE; + mapbytes = min_t(int, bytesleft, PAGE_SIZE); sg_set_page(sg, virt_to_page(pl022->dummypage), mapbytes, 0); bytesleft -= mapbytes; From eada40e057fc1842358d9daca3abe5cacb21e8a1 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 13 Aug 2025 14:06:28 +0200 Subject: [PATCH 0218/3206] s390/bpf: Do not write tail call counter into helper and kfunc frames Only BPF functions make use of the tail call counter; helpers and kfuncs ignore and most likely also clobber it. Writing it into these functions' frames is pointless and misleading, so do not do it. Fixes: dd691e847d28 ("s390/bpf: Implement bpf_jit_supports_subprog_tailcalls()") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250813121016.163375-2-iii@linux.ibm.com --- arch/s390/net/bpf_jit_comp.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bb17efe29d6570..bfac1ddf3447b9 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1790,6 +1790,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; + /* * Copy the tail call counter to where the callee expects it. * @@ -1800,10 +1801,17 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, * Note 2: We assume that the verifier does not let us call the * main program, which clears the tail call counter on entry. */ - /* mvc tail_call_cnt(4,%r15),frame_off+tail_call_cnt(%r15) */ - _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt), - 0xf000 | (jit->frame_off + - offsetof(struct prog_frame, tail_call_cnt))); + + if (insn->src_reg == BPF_PSEUDO_CALL) + /* + * mvc tail_call_cnt(4,%r15), + * frame_off+tail_call_cnt(%r15) + */ + _EMIT6(0xd203f000 | offsetof(struct prog_frame, + tail_call_cnt), + 0xf000 | (jit->frame_off + + offsetof(struct prog_frame, + tail_call_cnt))); /* Sign-extend the kfunc arguments. */ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { From c861a6b147137d10b5ff88a2c492ba376cd1b8b0 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 13 Aug 2025 14:06:29 +0200 Subject: [PATCH 0219/3206] s390/bpf: Write back tail call counter for BPF_PSEUDO_CALL The tailcall_bpf2bpf_hierarchy_1 test hangs on s390. 
Its call graph is as follows: entry() subprog_tail() bpf_tail_call_static(0) -> entry + tail_call_start subprog_tail() bpf_tail_call_static(0) -> entry + tail_call_start entry() copies its tail call counter to the subprog_tail()'s frame, which then increments it. However, the incremented result is discarded, leading to an astronomically large number of tail calls. Fix by writing the incremented counter back to the entry()'s frame. Fixes: dd691e847d28 ("s390/bpf: Implement bpf_jit_supports_subprog_tailcalls()") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250813121016.163375-3-iii@linux.ibm.com --- arch/s390/net/bpf_jit_comp.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bfac1ddf3447b9..ccb83ac3e6f321 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1793,13 +1793,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* * Copy the tail call counter to where the callee expects it. - * - * Note 1: The callee can increment the tail call counter, but - * we do not load it back, since the x86 JIT does not do this - * either. - * - * Note 2: We assume that the verifier does not let us call the - * main program, which clears the tail call counter on entry. */ if (insn->src_reg == BPF_PSEUDO_CALL) @@ -1833,6 +1826,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, call_r1(jit); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); + + /* + * Copy the potentially updated tail call counter back. + */ + + if (insn->src_reg == BPF_PSEUDO_CALL) + /* + * mvc frame_off+tail_call_cnt(%r15), + * tail_call_cnt(4,%r15) + */ + _EMIT6(0xd203f000 | (jit->frame_off + + offsetof(struct prog_frame, + tail_call_cnt)), + 0xf000 | offsetof(struct prog_frame, + tail_call_cnt)); + break; } case BPF_JMP | BPF_TAIL_CALL: { From bc3905a71f02511607d3ccf732360580209cac4c Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 13 Aug 2025 14:06:30 +0200 Subject: [PATCH 0220/3206] s390/bpf: Write back tail call counter for BPF_TRAMP_F_CALL_ORIG The tailcall_bpf2bpf_hierarchy_fentry test hangs on s390. Its call graph is as follows: entry() subprog_tail() trampoline() fentry() the rest of subprog_tail() # via BPF_TRAMP_F_CALL_ORIG return to entry() The problem is that the rest of subprog_tail() increments the tail call counter, but the trampoline discards the incremented value. This results in an astronomically large number of tail calls. Fix by making the trampoline write the incremented tail call counter back. 
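A userspace model of the bookkeeping bug (hypothetical and illustrative only; the real counter lives in the s390 stack frame and the limit is enforced by the JITed code):

    #include <stdio.h>

    #define MAX_TAIL_CALL_CNT 33

    static int tail_call(int *tcc)
    {
            if (++(*tcc) > MAX_TAIL_CALL_CNT)
                    return -1;      /* tail call refused */
            return 0;               /* tail call taken */
    }

    static void subprog(int *caller_tcc)
    {
            int tcc = *caller_tcc;  /* counter copied into the callee frame */

            tail_call(&tcc);
            *caller_tcc = tcc;      /* the fix: copy the increment back up */
    }

    int main(void)
    {
            int tcc = 0;

            subprog(&tcc);          /* without the write-back, tcc stays 0 */
            printf("tail call count: %d\n", tcc);
            return 0;
    }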
Fixes: 528eb2cb87bc ("s390/bpf: Implement arch_prepare_bpf_trampoline()") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250813121016.163375-4-iii@linux.ibm.com --- arch/s390/net/bpf_jit_comp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index ccb83ac3e6f321..b2b8eb62b82e02 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2839,6 +2839,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* stg %r2,retval_off(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, tjit->retval_off); + /* mvc tccnt_off(%r15),tail_call_cnt(4,%r15) */ + _EMIT6(0xd203f000 | tjit->tccnt_off, + 0xf000 | offsetof(struct prog_frame, tail_call_cnt)); im->ip_after_call = jit->prg_buf + jit->prg; From 12741630350c6ba250ffd480dbb0bbab7e8ac80a Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 13 Aug 2025 14:06:31 +0200 Subject: [PATCH 0221/3206] selftests/bpf: Clobber a lot of registers in tailcall_bpf2bpf_hierarchy tests Clobbering a lot of registers and stack slots helps exposing tail call counter overwrite bugs in JITs. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250813121016.163375-5-iii@linux.ibm.com --- .../selftests/bpf/progs/bpf_test_utils.h | 18 ++++++++++++++++++ .../bpf/progs/tailcall_bpf2bpf_hierarchy1.c | 3 +++ .../bpf/progs/tailcall_bpf2bpf_hierarchy2.c | 3 +++ .../bpf/progs/tailcall_bpf2bpf_hierarchy3.c | 3 +++ .../progs/tailcall_bpf2bpf_hierarchy_fentry.c | 3 +++ 5 files changed, 30 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_test_utils.h diff --git a/tools/testing/selftests/bpf/progs/bpf_test_utils.h b/tools/testing/selftests/bpf/progs/bpf_test_utils.h new file mode 100644 index 00000000000000..f4e67b492dd2b1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_test_utils.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BPF_TEST_UTILS_H__ +#define __BPF_TEST_UTILS_H__ + +#include +#include "bpf_misc.h" + +/* Clobber as many native registers and stack slots as possible. 
*/ +static __always_inline void clobber_regs_stack(void) +{ + char tmp_str[] = "123456789"; + unsigned long tmp; + + bpf_strtoul(tmp_str, sizeof(tmp_str), 0, &tmp); + __sink(tmp); +} + +#endif diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c index 327ca395e8601a..d556b19413d7b7 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c @@ -2,6 +2,7 @@ #include #include #include "bpf_legacy.h" +#include "bpf_test_utils.h" struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); @@ -24,6 +25,8 @@ int entry(struct __sk_buff *skb) { int ret = 1; + clobber_regs_stack(); + count++; subprog_tail(skb); subprog_tail(skb); diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c index 72fd0d577506a2..ae94c9c70ab7d5 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -2,6 +2,7 @@ #include #include #include "bpf_misc.h" +#include "bpf_test_utils.h" int classifier_0(struct __sk_buff *skb); int classifier_1(struct __sk_buff *skb); @@ -60,6 +61,8 @@ int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) { int ret = 0; + clobber_regs_stack(); + subprog_tail0(skb); subprog_tail1(skb); diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c index a7fb91cb05b736..56b6b009984072 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -2,6 +2,7 @@ #include #include #include "bpf_misc.h" +#include "bpf_test_utils.h" int classifier_0(struct __sk_buff *skb); @@ -53,6 +54,8 @@ int tailcall_bpf2bpf_hierarchy_3(struct __sk_buff *skb) { int ret = 0; + clobber_regs_stack(); + bpf_tail_call_static(skb, &jmp_table0, 0); __sink(ret); diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c index c87f9ca982d3ee..5261395713cd58 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c @@ -4,6 +4,7 @@ #include "vmlinux.h" #include #include +#include "bpf_test_utils.h" struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); @@ -24,6 +25,8 @@ int subprog_tail(void *ctx) SEC("fentry/dummy") int BPF_PROG(fentry, struct sk_buff *skb) { + clobber_regs_stack(); + count++; subprog_tail(ctx); subprog_tail(ctx); From d87fdb1f27d7b1f3309bba00955f0aa1cd19b33e Mon Sep 17 00:00:00 2001 From: Fushuai Wang Date: Mon, 18 Aug 2025 11:23:44 +0800 Subject: [PATCH 0222/3206] bpf: Replace get_next_cpu() with cpumask_next_wrap() The get_next_cpu() function was only used in one place to find the next possible CPU, which can be replaced by cpumask_next_wrap(). 
Signed-off-by: Fushuai Wang Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250818032344.23229-1-wangfushuai@baidu.com --- kernel/bpf/bpf_lru_list.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 2d6e1c98d8adc3..e7a2fc60523f6c 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -19,14 +19,6 @@ #define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) -static int get_next_cpu(int cpu) -{ - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_possible_mask); - return cpu; -} - /* Local list helpers */ static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) { @@ -482,7 +474,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); - steal = get_next_cpu(steal); + steal = cpumask_next_wrap(steal, cpu_possible_mask); } while (!node && steal != first_steal); loc_l->next_steal = steal; From 2be3fd903a6775ce21b2a138950aab09d0998427 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 18 Aug 2025 16:02:04 +0200 Subject: [PATCH 0223/3206] selftests/nolibc: be more specific about variables affecting nolibc-test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only one of these variables is used. $CC is preferred over $CROSS_COMPILE. Make this clear in the help message. Suggested-by: Willy Tarreau Link: https://lore.kernel.org/lkml/20250817093905.GA14213@1wt.eu/ Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile.nolibc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index 0fb759ba992ee6..b013a5d724c9af 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -267,7 +267,7 @@ help: @echo " all call the \"run\" target below" @echo " help this help" @echo " sysroot create the nolibc sysroot here (uses \$$ARCH)" - @echo " nolibc-test build the executable (uses \$$CC and \$$CROSS_COMPILE)" + @echo " nolibc-test build the executable (uses \$$CC or \$$CROSS_COMPILE)" @echo " libc-test build an executable using the compiler's default libc instead" @echo " run-user runs the executable under QEMU (uses \$$XARCH, \$$TEST)" @echo " initramfs.cpio prepare the initramfs archive with nolibc-test" From 32042f638cb839b4eb4a76b5dfc7c0a412327a6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 19 Jul 2025 17:38:27 +0200 Subject: [PATCH 0224/3206] selftests/nolibc: deduplicate invocations of toplevel Makefile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various targets of the testsuite call back into the toplevel kernel Makefile. These calls use various parameters and are quite long. Introduce a common variable to make future changes smaller and the lines shorter. 
Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250719-nolibc-llvm-system-v1-1-1730216ce171@weissschuh.net --- tools/testing/selftests/nolibc/Makefile.nolibc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index b013a5d724c9af..d8dbb63fe4f78a 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -262,6 +262,9 @@ REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++ if (f || !p || !done) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ printf("\nSee all results in %s\n", ARGV[1]); }' +# Execute the toplevel kernel Makefile +KBUILD_MAKE = $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) + help: @echo "Supported targets under selftests/nolibc:" @echo " all call the \"run\" target below" @@ -340,17 +343,17 @@ initramfs: nolibc-test $(Q)cp nolibc-test initramfs/init defconfig: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(DEFCONFIG) + $(Q)$(KBUILD_MAKE) $(DEFCONFIG) $(Q)if [ -n "$(EXTRACONFIG)" ]; then \ $(srctree)/scripts/config --file $(objtree)/.config $(EXTRACONFIG); \ - $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) olddefconfig < /dev/null; \ + $(KBUILD_MAKE) olddefconfig < /dev/null; \ fi kernel: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null + $(Q)$(KBUILD_MAKE) $(IMAGE_NAME) < /dev/null kernel-standalone: initramfs - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs < /dev/null + $(Q)$(KBUILD_MAKE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs < /dev/null # run the tests after building the kernel run: kernel initramfs.cpio From 1a5b40317dcbcd5e498cc8d2b3e5f48755bab322 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 19 Jul 2025 17:38:28 +0200 Subject: [PATCH 0225/3206] selftests/nolibc: don't pass CC to toplevel Makefile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The toplevel Makefile is capable of calculating CC from CROSS_COMPILE and/or ARCH. Stop passing the unnecessary variable. 
Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250719-nolibc-llvm-system-v1-2-1730216ce171@weissschuh.net --- tools/testing/selftests/nolibc/Makefile.nolibc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index d8dbb63fe4f78a..ebb097d228b9ea 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -263,7 +263,7 @@ REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++ printf("\nSee all results in %s\n", ARGV[1]); }' # Execute the toplevel kernel Makefile -KBUILD_MAKE = $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) +KBUILD_MAKE = $(MAKE) -C $(srctree) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) help: @echo "Supported targets under selftests/nolibc:" From 850047b19741490631855a475ccaa3ed29316039 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 19 Jul 2025 17:38:29 +0200 Subject: [PATCH 0226/3206] selftests/nolibc: always compile the kernel with GCC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLVM/clang can not build the kernel for all architectures supported by nolibc. The current setup uses the same compiler to build the kernel as is used for nolibc-test. This prevents using the full qemu-system tests for LLVM builds. Instead always build the kernel with GCC. For the nolibc testsuite the kernel does not need to be built with LLVM. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250719-nolibc-llvm-system-v1-3-1730216ce171@weissschuh.net --- tools/testing/selftests/nolibc/Makefile.nolibc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index ebb097d228b9ea..330e000baeb1db 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -263,7 +263,7 @@ REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++ printf("\nSee all results in %s\n", ARGV[1]); }' # Execute the toplevel kernel Makefile -KBUILD_MAKE = $(MAKE) -C $(srctree) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) +KBUILD_MAKE = $(MAKE) -C $(srctree) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) LLVM= help: @echo "Supported targets under selftests/nolibc:" @@ -276,8 +276,8 @@ help: @echo " initramfs.cpio prepare the initramfs archive with nolibc-test" @echo " initramfs prepare the initramfs tree with nolibc-test" @echo " defconfig create a fresh new default config (uses \$$XARCH)" - @echo " kernel (re)build the kernel (uses \$$XARCH)" - @echo " kernel-standalone (re)build the kernel with the initramfs (uses \$$XARCH)" + @echo " kernel (re)build the kernel (uses \$$XARCH, \$$CROSS_COMPILE)" + @echo " kernel-standalone (re)build the kernel with the initramfs (uses \$$XARCH, \$$CROSS_COMPILE)" @echo " run runs the kernel in QEMU after building it (uses \$$XARCH, \$$TEST)" @echo " rerun runs a previously prebuilt kernel in QEMU (uses \$$XARCH, \$$TEST)" @echo " clean clean the sysroot, initramfs, build and output files" From 26178b713f2b3f5bc411ed8316d1635615896111 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 30 Jul 2025 16:46:52 +0900 Subject: [PATCH 0227/3206] x86/insn: Add XOP prefix instructions decoder support Support decoding AMD's XOP 
prefix-encoded instructions. These instructions were introduced with the Bulldozer microarchitecture and are not supported on Intel processors. But when compiling the kernel with CONFIG_X86_NATIVE_CPU on some AMD processors (e.g. -march=bdver2), these instructions can be used. Closes: https://lore.kernel.org/all/871pq06728.fsf@wylie.me.uk/ Reported-by: Alan J. Wylie Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Borislav Petkov (AMD) Tested-by: Alan J. Wylie Link: https://lore.kernel.org/175386161199.564247.597496379413236944.stgit@devnote2 --- arch/x86/include/asm/inat.h | 15 +++ arch/x86/include/asm/insn.h | 51 +++++++- arch/x86/lib/inat.c | 13 ++ arch/x86/lib/insn.c | 35 ++++-- arch/x86/lib/x86-opcode-map.txt | 111 +++++++++++++++++- arch/x86/tools/gen-insn-attr-x86.awk | 44 +++++++ tools/arch/x86/include/asm/inat.h | 15 +++ tools/arch/x86/include/asm/insn.h | 51 +++++++- tools/arch/x86/lib/inat.c | 13 ++ tools/arch/x86/lib/insn.c | 35 ++++-- tools/arch/x86/lib/x86-opcode-map.txt | 111 +++++++++++++++++- tools/arch/x86/tools/gen-insn-attr-x86.awk | 44 +++++++ .../intel-pt-decoder/intel-pt-insn-decoder.c | 2 +- 13 files changed, 513 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 97f341777db54c..1b3060a3425cfe 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -37,6 +37,8 @@ #define INAT_PFX_EVEX 15 /* EVEX prefix */ /* x86-64 REX2 prefix */ #define INAT_PFX_REX2 16 /* 0xD5 */ +/* AMD XOP prefix */ +#define INAT_PFX_XOP 17 /* 0x8F */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -77,6 +79,7 @@ #define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_XOPOK INAT_VEXOK #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) @@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_pp); +extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, + insn_byte_t map_select); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) @@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; } +static inline int inat_is_xop_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_XOP; +} + static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; @@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr) return attr & INAT_VEXOK; } +static inline int inat_accept_xop(insn_attr_t attr) +{ + return attr & INAT_XOPOK; +} + static inline int inat_must_vex(insn_attr_t attr) { return attr & (INAT_VEXONLY | INAT_EVEXONLY); diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 7152ea809e6a5e..091f88c8254d3a 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -71,7 +71,10 @@ struct insn { * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ - struct insn_field vex_prefix; /* VEX prefix */ + union { + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field xop_prefix; /* XOP prefix */ + }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 @@ -135,6 +138,17 @@ struct insn { #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3
Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ +/* XOP bit fields */ +#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ +#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ +#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ +#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ +#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ +#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ +#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */ +#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ +#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ +#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); @@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) return X86_REX2_M(insn->rex_prefix.bytes[1]); } -static inline int insn_is_avx(struct insn *insn) +static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); @@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn) return (insn->vex_prefix.nbytes == 4); } +/* If we already know this is AVX/XOP encoded */ +static inline int avx_insn_is_xop(struct insn *insn) +{ + insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); + + return inat_is_xop_prefix(attr); +} + +static inline int insn_is_xop(struct insn *insn) +{ + if (!insn_is_avx_or_xop(insn)) + return 0; + + return avx_insn_is_xop(insn); +} + static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; @@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn) return X86_VEX_W(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_xop_map_bits(struct insn *insn) +{ + if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ + return 0; + return X86_XOP_M(insn->xop_prefix.bytes[1]); +} + +static inline insn_byte_t insn_xop_p_bits(struct insn *insn) +{ + return X86_XOP_P(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { - if (insn_is_avx(insn)) + if (insn_is_avx_or_xop(insn)) { + if (avx_insn_is_xop(insn)) + return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index b0f3b2a62ae27b..a5cafd402cfd3d 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, return table[opcode]; } +insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) +{ + const insn_attr_t *table; + + if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) + return 0; + map_select -= X86_XOP_M_MIN; + /* At first, this checks the master table */ + table = inat_xop_tables[map_select]; + if (!table) + return 0; + return table[opcode]; +} diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 149a57e334ab5c..225af1399c9d3f 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -200,12 +200,15 @@ int insn_get_prefixes(struct insn *insn) } insn->rex_prefix.got = 1; - /* Decode VEX prefix */ + /* Decode VEX/XOP prefix */ b = peek_next(insn_byte_t, insn); - attr = inat_get_opcode_attribute(b); - if (inat_is_vex_prefix(attr)) { + if 
(inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); - if (!insn->x86_64) { + + if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) { + /* Grp1A.0 is always POP Ev */ + goto vex_end; + } else if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is @@ -226,13 +229,13 @@ int insn_get_prefixes(struct insn *insn) if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; - } else if (inat_is_vex3_prefix(attr)) { + } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) - /* VEX.W overrides opnd_size */ + /* VEX.W/XOP.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* @@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn) insn_set_byte(opcode, 0, op); opcode->nbytes = 1; - /* Check if there is VEX prefix or not */ - if (insn_is_avx(insn)) { + /* Check if there is VEX/XOP prefix or not */ + if (insn_is_avx_or_xop(insn)) { insn_byte_t m, p; + + /* XOP prefix has different encoding */ + if (unlikely(avx_insn_is_xop(insn))) { + m = insn_xop_map_bits(insn); + insn->attr = inat_get_xop_attribute(op, m); + if (!inat_accept_xop(insn->attr)) { + insn->attr = 0; + return -EINVAL; + } + /* XOP has only 1 byte for opcode */ + goto end; + } + m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); @@ -383,7 +399,8 @@ int insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) && + !inat_accept_xop(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 262f7ca1fb9527..2a4e69ecc2de0f 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -27,6 +27,11 @@ # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. +# (xop): this opcode accepts XOP prefix. 
+# +# XOP Superscripts +# (W=0): this opcode requires XOP.W == 0 +# (W=1): this opcode requires XOP.W == 1 # # Last Prefix Superscripts # - (66): the last prefix is 0x66 @@ -194,7 +199,7 @@ AVXcode: 8c: MOV Ev,Sw 8d: LEA Gv,M 8e: MOV Sw,Ew -8f: Grp1A (1A) | POP Ev (d64) +8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix) # 0x90 - 0x9f 90: NOP | PAUSE (F3) | XCHG r8,rAX 91: XCHG rCX/r9,rAX @@ -1106,6 +1111,84 @@ AVXcode: 7 f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) EndTable +# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5 +Table: XOP map 8h +Referrer: +XOPcode: 0 +85: VPMACSSWW Vo,Ho,Wo,Lo +86: VPMACSSWD Vo,Ho,Wo,Lo +87: VPMACSSDQL Vo,Ho,Wo,Lo +8e: VPMACSSDD Vo,Ho,Wo,Lo +8f: VPMACSSDQH Vo,Ho,Wo,Lo +95: VPMACSWW Vo,Ho,Wo,Lo +96: VPMACSWD Vo,Ho,Wo,Lo +97: VPMACSDQL Vo,Ho,Wo,Lo +9e: VPMACSDD Vo,Ho,Wo,Lo +9f: VPMACSDQH Vo,Ho,Wo,Lo +a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1) +a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1) +a6: VPMADCSSWD Vo,Ho,Wo,Lo +b6: VPMADCSWD Vo,Ho,Wo,Lo +c0: VPROTB Vo,Wo,Ib +c1: VPROTW Vo,Wo,Ib +c2: VPROTD Vo,Wo,Ib +c3: VPROTQ Vo,Wo,Ib +cc: VPCOMccB Vo,Ho,Wo,Ib +cd: VPCOMccW Vo,Ho,Wo,Ib +ce: VPCOMccD Vo,Ho,Wo,Ib +cf: VPCOMccQ Vo,Ho,Wo,Ib +ec: VPCOMccUB Vo,Ho,Wo,Ib +ed: VPCOMccUW Vo,Ho,Wo,Ib +ee: VPCOMccUD Vo,Ho,Wo,Ib +ef: VPCOMccUQ Vo,Ho,Wo,Ib +EndTable + +Table: XOP map 9h +Referrer: +XOPcode: 1 +01: GrpXOP1 +02: GrpXOP2 +12: GrpXOP3 +80: VFRCZPS Vx,Wx +81: VFRCZPD Vx,Wx +82: VFRCZSS Vq,Wss +83: VFRCZSD Vq,Wsd +90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +91: VPROTW Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +92: VPROTD Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +93: VPROTQ Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1) +95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1) +96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1) +97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1) +98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1) +99: VPSHAW Vo,Wo,Ho (W=0) | VPSHAW Vo,Ho,Wo (W=1) +9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1) +9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1) +c1: VPHADDBW Vo,Wo +c2: VPHADDBD Vo,Wo +c3: VPHADDBQ Vo,Wo +c6: VPHADDWD Vo,Wo +c7: VPHADDWQ Vo,Wo +cb: VPHADDDQ Vo,Wo +d1: VPHADDUBWD Vo,Wo +d2: VPHADDUBD Vo,Wo +d3: VPHADDUBQ Vo,Wo +d6: VPHADDUWD Vo,Wo +d7: VPHADDUWQ Vo,Wo +db: VPHADDUDQ Vo,Wo +e1: VPHSUBBW Vo,Wo +e2: VPHSUBWD Vo,Wo +e3: VPHSUBDQ Vo,Wo +EndTable + +Table: XOP map Ah +Referrer: +XOPcode: 2 +10: BEXTR Gy,Ey,Id +12: GrpXOP4 +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1320,3 +1403,29 @@ GrpTable: GrpRNG 4: xcrypt-cfb 5: xcrypt-ofb EndTable + +# GrpXOP1-4 is shown in AMD APM Vol.3 Appendix A as XOP group #1-4 +GrpTable: GrpXOP1 +1: BLCFILL By,Ey (xop) +2: BLSFILL By,Ey (xop) +3: BLCS By,Ey (xop) +4: TZMSK By,Ey (xop) +5: BLCIC By,Ey (xop) +6: BLSIC By,Ey (xop) +7: T1MSKC By,Ey (xop) +EndTable + +GrpTable: GrpXOP2 +1: BLCMSK By,Ey (xop) +6: BLCI By,Ey (xop) +EndTable + +GrpTable: GrpXOP3 +0: LLWPCB Ry (xop) +1: SLWPCB Ry (xop) +EndTable + +GrpTable: GrpXOP4 +0: LWPINS By,Ed,Id (xop) +1: LWPVAL By,Ed,Id (xop) +EndTable diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 2c19d7fc8a8559..7ea1b75e59b742 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -21,6 +21,7 @@ function clear_vars() { eid = -1 # escape id gid = -1 # group id aid = -1 # AVX id + xopid = -1 # XOP id tname = "" } @@ -39,9 +40,11 @@ BEGIN { ggid = 1 geid = 1 gaid = 0 + gxopid = 0 delete etable 
delete gtable delete atable + delete xoptable opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" @@ -61,6 +64,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -87,6 +91,8 @@ BEGIN { evexonly_expr = "\\(ev\\)" # (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size evex_scalable_expr = "\\(es\\)" + # All opcodes in XOP table or with (xop) superscript accept XOP prefix + xopok_expr = "\\(xop\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -106,6 +112,7 @@ BEGIN { prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" prefix_num["REX2"] = "INAT_PFX_REX2" + prefix_num["XOP"] = "INAT_PFX_XOP" clear_vars() } @@ -147,6 +154,7 @@ function array_size(arr, i,c) { if (NF != 1) { # AVX/escape opcode table aid = $2 + xopid = -1 if (gaid <= aid) gaid = aid + 1 if (tname == "") # AVX only opcode table @@ -156,6 +164,20 @@ function array_size(arr, i,c) { tname = "inat_primary_table" } +/^XOPcode:/ { + if (NF != 1) { + # XOP opcode table + xopid = $2 + aid = -1 + if (gxopid <= xopid) + gxopid = xopid + 1 + if (tname == "") # XOP only opcode table + tname = sprintf("inat_xop_table_%d", $2) + } + if (xopid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + /^GrpTable:/ { print "/* " $0 " */" if (!($2 in group)) @@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n) etable[eid,0] = tname if (aid >= 0) atable[aid,0] = tname + else if (xopid >= 0) + xoptable[xopid] = tname } if (array_size(lptable1) != 0) { print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", @@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") + else if (match(ext, xopok_expr) || xopid >= 0) + flags = add_flags(flags, "INAT_XOPOK") # check prefixes if (match(ext, prefix_expr)) { @@ -413,6 +439,14 @@ END { print " ["i"]["j"] = "atable[i,j]"," print "};\n" + print "/* XOP opcode map array */" + print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \ + " = {" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print " ["i"] = "xoptable[i]"," + print "};" + print "#else /* !__BOOT_COMPRESSED */\n" print "/* Escape opcode map array */" @@ -430,6 +464,10 @@ END { "[INAT_LSTPFX_MAX + 1];" print "" + print "/* XOP opcode map array */" + print "static const insn_attr_t *inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];" + print "" + print "static void inat_init_tables(void)" print "{" @@ -455,6 +493,12 @@ END { if (atable[i,j]) print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + print "" + print "\t/* Print XOP opcode map array */" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print "\tinat_xop_tables["i"] = "xoptable[i]";" + print "}" print "#endif" } diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h index 183aa662b16523..099e926595bd98 100644 --- a/tools/arch/x86/include/asm/inat.h +++ b/tools/arch/x86/include/asm/inat.h @@ -37,6 +37,8 @@ #define INAT_PFX_EVEX 15 /* EVEX prefix */ /* x86-64 REX2 prefix */ #define INAT_PFX_REX2 16 /* 0xD5 */ +/* AMD XOP prefix */ +#define INAT_PFX_XOP 17 /* 0x8F */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -77,6 +79,7 @@ #define 
INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_XOPOK INAT_VEXOK #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) @@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_pp); +extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, + insn_byte_t map_select); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) @@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; } +static inline int inat_is_xop_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_XOP; +} + static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; @@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr) return attr & INAT_VEXOK; } +static inline int inat_accept_xop(insn_attr_t attr) +{ + return attr & INAT_XOPOK; +} + static inline int inat_must_vex(insn_attr_t attr) { return attr & (INAT_VEXONLY | INAT_EVEXONLY); diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 0e5abd896ad42d..c683d609934b79 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -71,7 +71,10 @@ struct insn { * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ - struct insn_field vex_prefix; /* VEX prefix */ + union { + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field xop_prefix; /* XOP prefix */ + }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 @@ -135,6 +138,17 @@ struct insn { #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ +/* XOP bit fields */ +#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ +#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ +#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ +#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ +#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ +#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ +#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */ +#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ +#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ +#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); @@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) return X86_REX2_M(insn->rex_prefix.bytes[1]); } -static inline int insn_is_avx(struct insn *insn) +static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); @@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn) return (insn->vex_prefix.nbytes == 4); } +/* If we already know this is AVX/XOP encoded */ +static inline int avx_insn_is_xop(struct insn *insn) +{ + insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); + + return inat_is_xop_prefix(attr); +} + +static inline int insn_is_xop(struct insn *insn) +{ + if (!insn_is_avx_or_xop(insn)) + return 0; + + return avx_insn_is_xop(insn); +} + static 
inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; @@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn) return X86_VEX_W(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_xop_map_bits(struct insn *insn) +{ + if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ + return 0; + return X86_XOP_M(insn->xop_prefix.bytes[1]); +} + +static inline insn_byte_t insn_xop_p_bits(struct insn *insn) +{ + return X86_XOP_P(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { - if (insn_is_avx(insn)) + if (insn_is_avx_or_xop(insn)) { + if (avx_insn_is_xop(insn)) + return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); diff --git a/tools/arch/x86/lib/inat.c b/tools/arch/x86/lib/inat.c index dfbcc64059412a..ffcb0e27453b8e 100644 --- a/tools/arch/x86/lib/inat.c +++ b/tools/arch/x86/lib/inat.c @@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, return table[opcode]; } +insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) +{ + const insn_attr_t *table; + + if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) + return 0; + map_select -= X86_XOP_M_MIN; + /* At first, this checks the master table */ + table = inat_xop_tables[map_select]; + if (!table) + return 0; + return table[opcode]; +} diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index bce69c6bfa6972..1d1c57c74d1fc2 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -200,12 +200,15 @@ int insn_get_prefixes(struct insn *insn) } insn->rex_prefix.got = 1; - /* Decode VEX prefix */ + /* Decode VEX/XOP prefix */ b = peek_next(insn_byte_t, insn); - attr = inat_get_opcode_attribute(b); - if (inat_is_vex_prefix(attr)) { + if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); - if (!insn->x86_64) { + + if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) { + /* Grp1A.0 is always POP Ev */ + goto vex_end; + } else if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is @@ -226,13 +229,13 @@ int insn_get_prefixes(struct insn *insn) if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; - } else if (inat_is_vex3_prefix(attr)) { + } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) - /* VEX.W overrides opnd_size */ + /* VEX.W/XOP.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* @@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn) insn_set_byte(opcode, 0, op); opcode->nbytes = 1; - /* Check if there is VEX prefix or not */ - if (insn_is_avx(insn)) { + /* Check if there is VEX/XOP prefix or not */ + if (insn_is_avx_or_xop(insn)) { insn_byte_t m, p; + + /* XOP prefix has different encoding */ + if (unlikely(avx_insn_is_xop(insn))) { + m = insn_xop_map_bits(insn); + insn->attr = inat_get_xop_attribute(op, m); + if (!inat_accept_xop(insn->attr)) { + insn->attr = 0; + return -EINVAL; + } + /* XOP has only 1 byte for opcode */ + goto end; + } + m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); 
insn->attr = inat_get_avx_attribute(op, m, p); @@ -383,7 +399,8 @@ int insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) && + !inat_accept_xop(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index 262f7ca1fb9527..2a4e69ecc2de0f 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -27,6 +27,11 @@ # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. +# (xop): this opcode accepts XOP prefix. +# +# XOP Superscripts +# (W=0): this opcode requires XOP.W == 0 +# (W=1): this opcode requires XOP.W == 1 # # Last Prefix Superscripts # - (66): the last prefix is 0x66 @@ -194,7 +199,7 @@ AVXcode: 8c: MOV Ev,Sw 8d: LEA Gv,M 8e: MOV Sw,Ew -8f: Grp1A (1A) | POP Ev (d64) +8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix) # 0x90 - 0x9f 90: NOP | PAUSE (F3) | XCHG r8,rAX 91: XCHG rCX/r9,rAX @@ -1106,6 +1111,84 @@ AVXcode: 7 f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) EndTable +# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5 +Table: XOP map 8h +Referrer: +XOPcode: 0 +85: VPMACSSWW Vo,Ho,Wo,Lo +86: VPMACSSWD Vo,Ho,Wo,Lo +87: VPMACSSDQL Vo,Ho,Wo,Lo +8e: VPMACSSDD Vo,Ho,Wo,Lo +8f: VPMACSSDQH Vo,Ho,Wo,Lo +95: VPMACSWW Vo,Ho,Wo,Lo +96: VPMACSWD Vo,Ho,Wo,Lo +97: VPMACSDQL Vo,Ho,Wo,Lo +9e: VPMACSDD Vo,Ho,Wo,Lo +9f: VPMACSDQH Vo,Ho,Wo,Lo +a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1) +a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1) +a6: VPMADCSSWD Vo,Ho,Wo,Lo +b6: VPMADCSWD Vo,Ho,Wo,Lo +c0: VPROTB Vo,Wo,Ib +c1: VPROTW Vo,Wo,Ib +c2: VPROTD Vo,Wo,Ib +c3: VPROTQ Vo,Wo,Ib +cc: VPCOMccB Vo,Ho,Wo,Ib +cd: VPCOMccW Vo,Ho,Wo,Ib +ce: VPCOMccD Vo,Ho,Wo,Ib +cf: VPCOMccQ Vo,Ho,Wo,Ib +ec: VPCOMccUB Vo,Ho,Wo,Ib +ed: VPCOMccUW Vo,Ho,Wo,Ib +ee: VPCOMccUD Vo,Ho,Wo,Ib +ef: VPCOMccUQ Vo,Ho,Wo,Ib +EndTable + +Table: XOP map 9h +Referrer: +XOPcode: 1 +01: GrpXOP1 +02: GrpXOP2 +12: GrpXOP3 +80: VFRCZPS Vx,Wx +81: VFRCZPD Vx,Wx +82: VFRCZSS Vq,Wss +83: VFRCZSD Vq,Wsd +90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +91: VPROTW Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +92: VPROTD Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +93: VPROTQ Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1) +95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1) +96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1) +97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1) +98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1) +99: VPSHAW Vo,Wo,Ho (W=0) | VPSHAW Vo,Ho,Wo (W=1) +9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1) +9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1) +c1: VPHADDBW Vo,Wo +c2: VPHADDBD Vo,Wo +c3: VPHADDBQ Vo,Wo +c6: VPHADDWD Vo,Wo +c7: VPHADDWQ Vo,Wo +cb: VPHADDDQ Vo,Wo +d1: VPHADDUBWD Vo,Wo +d2: VPHADDUBD Vo,Wo +d3: VPHADDUBQ Vo,Wo +d6: VPHADDUWD Vo,Wo +d7: VPHADDUWQ Vo,Wo +db: VPHADDUDQ Vo,Wo +e1: VPHSUBBW Vo,Wo +e2: VPHSUBWD Vo,Wo +e3: VPHSUBDQ Vo,Wo +EndTable + +Table: XOP map Ah +Referrer: +XOPcode: 2 +10: BEXTR Gy,Ey,Id +12: GrpXOP4 +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1320,3 +1403,29 @@ GrpTable: GrpRNG 4: xcrypt-cfb 5: xcrypt-ofb EndTable + +# GrpXOP1-4 is shown in AMD APM Vol.3 Appendix A as XOP group #1-4 +GrpTable: 
GrpXOP1 +1: BLCFILL By,Ey (xop) +2: BLSFILL By,Ey (xop) +3: BLCS By,Ey (xop) +4: TZMSK By,Ey (xop) +5: BLCIC By,Ey (xop) +6: BLSIC By,Ey (xop) +7: T1MSKC By,Ey (xop) +EndTable + +GrpTable: GrpXOP2 +1: BLCMSK By,Ey (xop) +6: BLCI By,Ey (xop) +EndTable + +GrpTable: GrpXOP3 +0: LLWPCB Ry (xop) +1: SLWPCB Ry (xop) +EndTable + +GrpTable: GrpXOP4 +0: LWPINS By,Ed,Id (xop) +1: LWPVAL By,Ed,Id (xop) +EndTable diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk index 2c19d7fc8a8559..7ea1b75e59b742 100644 --- a/tools/arch/x86/tools/gen-insn-attr-x86.awk +++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk @@ -21,6 +21,7 @@ function clear_vars() { eid = -1 # escape id gid = -1 # group id aid = -1 # AVX id + xopid = -1 # XOP id tname = "" } @@ -39,9 +40,11 @@ BEGIN { ggid = 1 geid = 1 gaid = 0 + gxopid = 0 delete etable delete gtable delete atable + delete xoptable opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" @@ -61,6 +64,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -87,6 +91,8 @@ BEGIN { evexonly_expr = "\\(ev\\)" # (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size evex_scalable_expr = "\\(es\\)" + # All opcodes in XOP table or with (xop) superscript accept XOP prefix + xopok_expr = "\\(xop\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -106,6 +112,7 @@ BEGIN { prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" prefix_num["REX2"] = "INAT_PFX_REX2" + prefix_num["XOP"] = "INAT_PFX_XOP" clear_vars() } @@ -147,6 +154,7 @@ function array_size(arr, i,c) { if (NF != 1) { # AVX/escape opcode table aid = $2 + xopid = -1 if (gaid <= aid) gaid = aid + 1 if (tname == "") # AVX only opcode table @@ -156,6 +164,20 @@ function array_size(arr, i,c) { tname = "inat_primary_table" } +/^XOPcode:/ { + if (NF != 1) { + # XOP opcode table + xopid = $2 + aid = -1 + if (gxopid <= xopid) + gxopid = xopid + 1 + if (tname == "") # XOP only opcode table + tname = sprintf("inat_xop_table_%d", $2) + } + if (xopid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + /^GrpTable:/ { print "/* " $0 " */" if (!($2 in group)) @@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n) etable[eid,0] = tname if (aid >= 0) atable[aid,0] = tname + else if (xopid >= 0) + xoptable[xopid] = tname } if (array_size(lptable1) != 0) { print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", @@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") + else if (match(ext, xopok_expr) || xopid >= 0) + flags = add_flags(flags, "INAT_XOPOK") # check prefixes if (match(ext, prefix_expr)) { @@ -413,6 +439,14 @@ END { print " ["i"]["j"] = "atable[i,j]"," print "};\n" + print "/* XOP opcode map array */" + print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \ + " = {" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print " ["i"] = "xoptable[i]"," + print "};" + print "#else /* !__BOOT_COMPRESSED */\n" print "/* Escape opcode map array */" @@ -430,6 +464,10 @@ END { "[INAT_LSTPFX_MAX + 1];" print "" + print "/* XOP opcode map array */" + print "static const insn_attr_t 
*inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];" + print "" + print "static void inat_init_tables(void)" print "{" @@ -455,6 +493,12 @@ END { if (atable[i,j]) print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + print "" + print "\t/* Print XOP opcode map array */" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print "\tinat_xop_tables["i"] = "xoptable[i]";" + print "}" print "#endif" } diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 8fabddc1c0dad2..72c7a4e15d617b 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -32,7 +32,7 @@ static void intel_pt_insn_decoder(struct insn *insn, intel_pt_insn->rel = 0; intel_pt_insn->emulated_ptwrite = false; - if (insn_is_avx(insn)) { + if (insn_is_avx_or_xop(insn)) { intel_pt_insn->op = INTEL_PT_OP_OTHER; intel_pt_insn->branch = INTEL_PT_BR_NO_BRANCH; intel_pt_insn->length = insn->length; From 65fe705367efc3e06ccfa2a3a107081d8e69f70e Mon Sep 17 00:00:00 2001 From: Julien Massot Date: Fri, 1 Aug 2025 13:18:09 +0200 Subject: [PATCH 0228/3206] dt-bindings: pinctrl: mediatek: mt8183: Allow gpio-line-names Add support for the 'gpio-line-names' property in the MT8183 pinctrl binding. This allows naming the GPIOs, which is already done in several device trees (e.g. mt8183-kukui-jacuzzi.dtsi, mt8183-kukui-kakadu.dtsi, mt8183-kukui-krane.dtsi), but was previously generating DT schema warnings. Signed-off-by: Julien Massot Reviewed-by: AngeloGioacchino Del Regno Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/20250801-mtk-dtb-warnings-v1-7-6ba4e432427b@collabora.com Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml index 464879274cae4c..3db2438fadc78b 100644 --- a/Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml @@ -48,6 +48,8 @@ properties: description: GPIO valid number range. + gpio-line-names: true + interrupt-controller: true interrupts: From 694a97ee25581630f70773d160fcc2364afe98c8 Mon Sep 17 00:00:00 2001 From: Hendrik Hamerlinck Date: Tue, 5 Aug 2025 17:07:01 +0200 Subject: [PATCH 0229/3206] pinctrl: spacemit: remove extra line in debug output The debug output for spacemit_pinconf_dbg_show() prints an extra newline at the end. This is redundant as pinconf_pins_show() in pinconf.c already adds a newline in its for loop. Remove the newline to avoid the extra line in the output. Example current output: $ cat /sys/kernel/debug/pinctrl/d401e000.pinctrl/pinconf-pins Pin config settings per pin Format: pin (name): configs pin 0 (GPIO_00): , bias pull disabled, io type (Fixed/1V8), drive strength (32 mA), register (0x1041) pin 1 (GPIO_01): slew rate (0x0), bias pull disabled, io type (Fixed/1V8), drive strength (32 mA), register (0x1041) pin 2 (GPIO_02): slew rate (0x0), bias pull disabled, io type (Fixed/1V8), drive strength (32 mA), register (0x1041) ... 
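For reference, a minimal sketch of the core loop this relies on (paraphrased from pinconf_pins_show() in drivers/pinctrl/pinconf.c, not verbatim):

  /* for each registered pin */
  seq_printf(s, "pin %d (%s): ", pin, desc->name);
  if (ops->pin_config_dbg_show)
          ops->pin_config_dbg_show(pctldev, s, pin);
  seq_putc(s, '\n');      /* the core already terminates the line */

so the driver callback must not emit a trailing newline of its own.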
Signed-off-by: Hendrik Hamerlinck Reviewed-by: Yixun Lan Link: https://lore.kernel.org/20250805150701.129113-1-hendrik.hamerlinck@hammernet.be Signed-off-by: Linus Walleij --- drivers/pinctrl/spacemit/pinctrl-k1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/spacemit/pinctrl-k1.c b/drivers/pinctrl/spacemit/pinctrl-k1.c index 9996b1c4a07e72..fb361f2acb54fb 100644 --- a/drivers/pinctrl/spacemit/pinctrl-k1.c +++ b/drivers/pinctrl/spacemit/pinctrl-k1.c @@ -707,7 +707,7 @@ static void spacemit_pinconf_dbg_show(struct pinctrl_dev *pctldev, spacemit_get_drive_strength_mA(IO_TYPE_1V8, tmp), spacemit_get_drive_strength_mA(IO_TYPE_3V3, tmp)); - seq_printf(seq, ", register (0x%04x)\n", value); + seq_printf(seq, ", register (0x%04x)", value); } static const struct pinconf_ops spacemit_pinconf_ops = { From 236152dd9b1675a35eee912e79e6c57ca6b6732f Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Thu, 7 Aug 2025 14:20:38 +0800 Subject: [PATCH 0230/3206] pinctrl: single: fix bias pull up/down handling in pin_config_set In the pin_config_set function, when handling PIN_CONFIG_BIAS_PULL_DOWN or PIN_CONFIG_BIAS_PULL_UP, the function calls pcs_pinconf_clear_bias() which writes the register. However, the subsequent operations continue using the stale 'data' value from before the register write, effectively causing the bias clear operation to be overwritten and not take effect. Fix this by reading the 'data' value from the register after calling pcs_pinconf_clear_bias(). This bug seems to have existed since this code was first merged in commit 9dddb4df90d1 ("pinctrl: single: support generic pinconf"). Signed-off-by: Chi Zhang Link: https://lore.kernel.org/20250807062038.13610-1-chizhang@asrmicro.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-single.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 5cda6201b60f53..8aedee2720bcb9 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -589,8 +589,10 @@ static int pcs_pinconf_set(struct pinctrl_dev *pctldev, /* 4 parameters */ case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_BIAS_PULL_UP: - if (arg) + if (arg) { pcs_pinconf_clear_bias(pctldev, pin); + data = pcs->read(pcs->base + offset); + } fallthrough; case PIN_CONFIG_INPUT_SCHMITT_ENABLE: data &= ~func->conf[i].mask; From dbe99ea541f023d73abf5730c1477ba96111ef83 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sat, 16 Aug 2025 15:15:54 +0000 Subject: [PATCH 0231/3206] bpf: Add a verbose message when the BTF limit is reached When a BPF program being loaded reaches the map limit (MAX_USED_MAPS) or the BTF limit (MAX_USED_BTFS), -E2BIG is returned. However, in the former case there is an accompanying verifier verbose message, and in the latter case there is not. Add a verbose message to make the behaviour symmetrical.
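As a sketch of the now-symmetrical behaviour (the map-limit message is the pre-existing one and its exact wording is assumed here; the numbers stand for MAX_USED_MAPS and MAX_USED_BTFS as configured):

  The total number of maps per program has reached the limit of 64
  The total number of btfs per program has reached the limit of 512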
Reported-by: Kevin Sheldrake Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250816151554.902995-1-a.s.protopopov@gmail.com --- kernel/bpf/verifier.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3a3982fe20d48a..07cc4a738c6787 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20193,8 +20193,11 @@ static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf) if (env->used_btfs[i].btf == btf) return i; - if (env->used_btf_cnt >= MAX_USED_BTFS) + if (env->used_btf_cnt >= MAX_USED_BTFS) { + verbose(env, "The total number of btfs per program has reached the limit of %u\n", + MAX_USED_BTFS); return -E2BIG; + } btf_get(btf); From dca2f73cf19fedd7bc38fa6a0eb50fea63cd0214 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:53 +0200 Subject: [PATCH 0232/3206] pinctrl: Add pin controller driver for AAEON UP boards This enables pin control support for the onboard FPGA on AAEON UP boards. This FPGA acts as a level shifter between the Intel SoC pins and the pin header, and also as a mux or switch.

          +---------+               +--------------+      +---+
          |         |               |              |      |   |
          | PWM0    |  \            |              |      | H |
          |---------|------ \ ------|--------------|------| E |
          | I2C0_SDA|               |              |      | A |
Intel SoC |---------|------\        |              |      | D |
          | GPIO0   |       \-------|--------------|------| E |
          |---------|------         |              |      | R |
          |         |               |     FPGA     |      |   |
          +---------+               +--------------+      +---+

For most of the pins, the FPGA opens/closes a switch to enable/disable access to the SoC pin from the pin header. Each switch has a direction flag that is set depending on the status of the SoC pin. For some other pins, the FPGA acts as a mux and routes one pin or the other to the header. The driver also provides a GPIO chip. It requests SoC pins in GPIO mode, and drives them in tandem with FPGA pins (switch/mux direction). This commit adds support only for the UP Squared board. Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/20250811-aaeon-up-board-pinctrl-support-v9-10-29f0cbbdfb30@bootlin.com Signed-off-by: Linus Walleij --- drivers/pinctrl/Kconfig | 19 + drivers/pinctrl/Makefile | 1 + drivers/pinctrl/pinctrl-upboard.c | 1070 +++++++++++++++++++++++++++++ 3 files changed, 1090 insertions(+) create mode 100644 drivers/pinctrl/pinctrl-upboard.c diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index ddd11668457ced..463cf252c275bb 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -600,6 +600,25 @@ config PINCTRL_TH1520 This driver is needed for RISC-V development boards like the BeagleV Ahead and the LicheePi 4A. +config PINCTRL_UPBOARD + tristate "AAeon UP board FPGA pin controller" + depends on MFD_UPBOARD_FPGA + select PINMUX + select GENERIC_PINCTRL_GROUPS + select GENERIC_PINMUX_FUNCTIONS + select GPIOLIB + select GPIO_AGGREGATOR + help + Pin controller for the FPGA GPIO lines on UP boards. Due to the + hardware layout, the driver controls the FPGA pins in tandem with + their corresponding Intel SoC GPIOs. + + Currently supported: + - UP Squared + + To compile this driver as a module, choose M here: the module + will be called pinctrl-upboard.
+ config PINCTRL_ZYNQ bool "Pinctrl driver for Xilinx Zynq" depends on ARCH_ZYNQ || COMPILE_TEST diff --git a/drivers/pinctrl/Makefile b/drivers/pinctrl/Makefile index 909ab89a56d293..949693f2f0ab1a 100644 --- a/drivers/pinctrl/Makefile +++ b/drivers/pinctrl/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_PINCTRL_SX150X) += pinctrl-sx150x.o obj-$(CONFIG_PINCTRL_TB10X) += pinctrl-tb10x.o obj-$(CONFIG_PINCTRL_TPS6594) += pinctrl-tps6594.o obj-$(CONFIG_PINCTRL_TH1520) += pinctrl-th1520.o +obj-$(CONFIG_PINCTRL_UPBOARD) += pinctrl-upboard.o obj-$(CONFIG_PINCTRL_ZYNQMP) += pinctrl-zynqmp.o obj-$(CONFIG_PINCTRL_ZYNQ) += pinctrl-zynq.o diff --git a/drivers/pinctrl/pinctrl-upboard.c b/drivers/pinctrl/pinctrl-upboard.c new file mode 100644 index 00000000000000..f8c8b9d8499003 --- /dev/null +++ b/drivers/pinctrl/pinctrl-upboard.c @@ -0,0 +1,1070 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * UP board pin control driver. + * + * Copyright (C) 2025 Bootlin + * + * Author: Thomas Richard + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "core.h" +#include "pinmux.h" + +enum upboard_pin_mode { + UPBOARD_PIN_MODE_FUNCTION, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_DISABLED, +}; + +struct upboard_pin { + struct regmap_field *funcbit; + struct regmap_field *enbit; + struct regmap_field *dirbit; +}; + +struct upboard_pingroup { + struct pingroup grp; + enum upboard_pin_mode mode; + const enum upboard_pin_mode *modes; +}; + +struct upboard_pinctrl_data { + const struct upboard_pingroup *groups; + size_t ngroups; + const struct pinfunction *funcs; + size_t nfuncs; + const unsigned int *pin_header; + size_t ngpio; +}; + +struct upboard_pinctrl { + struct device *dev; + struct pinctrl_dev *pctldev; + const struct upboard_pinctrl_data *pctrl_data; + struct gpio_pin_range pin_range; + struct upboard_pin *pins; +}; + +struct upboard_pinctrl_map { + const struct pinctrl_map *maps; + size_t nmaps; +}; + +enum upboard_func0_fpgabit { + UPBOARD_FUNC_I2C0_EN = 8, + UPBOARD_FUNC_I2C1_EN = 9, + UPBOARD_FUNC_CEC0_EN = 12, + UPBOARD_FUNC_ADC0_EN = 14, +}; + +static const struct reg_field upboard_i2c0_reg = + REG_FIELD(UPBOARD_REG_FUNC_EN0, UPBOARD_FUNC_I2C0_EN, UPBOARD_FUNC_I2C0_EN); + +static const struct reg_field upboard_i2c1_reg = + REG_FIELD(UPBOARD_REG_FUNC_EN0, UPBOARD_FUNC_I2C1_EN, UPBOARD_FUNC_I2C1_EN); + +static const struct reg_field upboard_adc0_reg = + REG_FIELD(UPBOARD_REG_FUNC_EN0, UPBOARD_FUNC_ADC0_EN, UPBOARD_FUNC_ADC0_EN); + +#define UPBOARD_UP_BIT_TO_PIN(bit) UPBOARD_UP_BIT_##bit + +#define UPBOARD_UP_PIN_NAME(id) \ + { \ + .number = UPBOARD_UP_BIT_##id, \ + .name = #id, \ + } + +#define UPBOARD_UP_PIN_MUX(bit, data) \ + { \ + .number = UPBOARD_UP_BIT_##bit, \ + .name = "PINMUX_"#bit, \ + .drv_data = (void *)(data), \ + } + +#define UPBOARD_UP_PIN_FUNC(id, data) \ + { \ + .number = UPBOARD_UP_BIT_##id, \ + .name = #id, \ + .drv_data = (void *)(data), \ + } + +enum upboard_up_fpgabit { + UPBOARD_UP_BIT_I2C1_SDA, + UPBOARD_UP_BIT_I2C1_SCL, + UPBOARD_UP_BIT_ADC0, + UPBOARD_UP_BIT_UART1_RTS, + UPBOARD_UP_BIT_GPIO27, + UPBOARD_UP_BIT_GPIO22, + UPBOARD_UP_BIT_SPI_MOSI, + UPBOARD_UP_BIT_SPI_MISO, + UPBOARD_UP_BIT_SPI_CLK, + UPBOARD_UP_BIT_I2C0_SDA, + UPBOARD_UP_BIT_GPIO5, + UPBOARD_UP_BIT_GPIO6, + UPBOARD_UP_BIT_PWM1, + UPBOARD_UP_BIT_I2S_FRM, + UPBOARD_UP_BIT_GPIO26, + UPBOARD_UP_BIT_UART1_TX, + 
UPBOARD_UP_BIT_UART1_RX, + UPBOARD_UP_BIT_I2S_CLK, + UPBOARD_UP_BIT_GPIO23, + UPBOARD_UP_BIT_GPIO24, + UPBOARD_UP_BIT_GPIO25, + UPBOARD_UP_BIT_SPI_CS0, + UPBOARD_UP_BIT_SPI_CS1, + UPBOARD_UP_BIT_I2C0_SCL, + UPBOARD_UP_BIT_PWM0, + UPBOARD_UP_BIT_UART1_CTS, + UPBOARD_UP_BIT_I2S_DIN, + UPBOARD_UP_BIT_I2S_DOUT, +}; + +static const struct pinctrl_pin_desc upboard_up_pins[] = { + UPBOARD_UP_PIN_FUNC(I2C1_SDA, &upboard_i2c1_reg), + UPBOARD_UP_PIN_FUNC(I2C1_SCL, &upboard_i2c1_reg), + UPBOARD_UP_PIN_FUNC(ADC0, &upboard_adc0_reg), + UPBOARD_UP_PIN_NAME(UART1_RTS), + UPBOARD_UP_PIN_NAME(GPIO27), + UPBOARD_UP_PIN_NAME(GPIO22), + UPBOARD_UP_PIN_NAME(SPI_MOSI), + UPBOARD_UP_PIN_NAME(SPI_MISO), + UPBOARD_UP_PIN_NAME(SPI_CLK), + UPBOARD_UP_PIN_FUNC(I2C0_SDA, &upboard_i2c0_reg), + UPBOARD_UP_PIN_NAME(GPIO5), + UPBOARD_UP_PIN_NAME(GPIO6), + UPBOARD_UP_PIN_NAME(PWM1), + UPBOARD_UP_PIN_NAME(I2S_FRM), + UPBOARD_UP_PIN_NAME(GPIO26), + UPBOARD_UP_PIN_NAME(UART1_TX), + UPBOARD_UP_PIN_NAME(UART1_RX), + UPBOARD_UP_PIN_NAME(I2S_CLK), + UPBOARD_UP_PIN_NAME(GPIO23), + UPBOARD_UP_PIN_NAME(GPIO24), + UPBOARD_UP_PIN_NAME(GPIO25), + UPBOARD_UP_PIN_NAME(SPI_CS0), + UPBOARD_UP_PIN_NAME(SPI_CS1), + UPBOARD_UP_PIN_FUNC(I2C0_SCL, &upboard_i2c0_reg), + UPBOARD_UP_PIN_NAME(PWM0), + UPBOARD_UP_PIN_NAME(UART1_CTS), + UPBOARD_UP_PIN_NAME(I2S_DIN), + UPBOARD_UP_PIN_NAME(I2S_DOUT), +}; + +static const unsigned int upboard_up_pin_header[] = { + UPBOARD_UP_BIT_TO_PIN(I2C0_SDA), + UPBOARD_UP_BIT_TO_PIN(I2C0_SCL), + UPBOARD_UP_BIT_TO_PIN(I2C1_SDA), + UPBOARD_UP_BIT_TO_PIN(I2C1_SCL), + UPBOARD_UP_BIT_TO_PIN(ADC0), + UPBOARD_UP_BIT_TO_PIN(GPIO5), + UPBOARD_UP_BIT_TO_PIN(GPIO6), + UPBOARD_UP_BIT_TO_PIN(SPI_CS1), + UPBOARD_UP_BIT_TO_PIN(SPI_CS0), + UPBOARD_UP_BIT_TO_PIN(SPI_MISO), + UPBOARD_UP_BIT_TO_PIN(SPI_MOSI), + UPBOARD_UP_BIT_TO_PIN(SPI_CLK), + UPBOARD_UP_BIT_TO_PIN(PWM0), + UPBOARD_UP_BIT_TO_PIN(PWM1), + UPBOARD_UP_BIT_TO_PIN(UART1_TX), + UPBOARD_UP_BIT_TO_PIN(UART1_RX), + UPBOARD_UP_BIT_TO_PIN(UART1_CTS), + UPBOARD_UP_BIT_TO_PIN(UART1_RTS), + UPBOARD_UP_BIT_TO_PIN(I2S_CLK), + UPBOARD_UP_BIT_TO_PIN(I2S_FRM), + UPBOARD_UP_BIT_TO_PIN(I2S_DIN), + UPBOARD_UP_BIT_TO_PIN(I2S_DOUT), + UPBOARD_UP_BIT_TO_PIN(GPIO22), + UPBOARD_UP_BIT_TO_PIN(GPIO23), + UPBOARD_UP_BIT_TO_PIN(GPIO24), + UPBOARD_UP_BIT_TO_PIN(GPIO25), + UPBOARD_UP_BIT_TO_PIN(GPIO26), + UPBOARD_UP_BIT_TO_PIN(GPIO27), +}; + +static const unsigned int upboard_up_uart1_pins[] = { + UPBOARD_UP_BIT_TO_PIN(UART1_TX), + UPBOARD_UP_BIT_TO_PIN(UART1_RX), + UPBOARD_UP_BIT_TO_PIN(UART1_RTS), + UPBOARD_UP_BIT_TO_PIN(UART1_CTS), +}; + +static const enum upboard_pin_mode upboard_up_uart1_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, +}; + +static_assert(ARRAY_SIZE(upboard_up_uart1_modes) == ARRAY_SIZE(upboard_up_uart1_pins)); + +static const unsigned int upboard_up_i2c0_pins[] = { + UPBOARD_UP_BIT_TO_PIN(I2C0_SCL), + UPBOARD_UP_BIT_TO_PIN(I2C0_SDA), +}; + +static const unsigned int upboard_up_i2c1_pins[] = { + UPBOARD_UP_BIT_TO_PIN(I2C1_SCL), + UPBOARD_UP_BIT_TO_PIN(I2C1_SDA), +}; + +static const unsigned int upboard_up_spi2_pins[] = { + UPBOARD_UP_BIT_TO_PIN(SPI_MOSI), + UPBOARD_UP_BIT_TO_PIN(SPI_MISO), + UPBOARD_UP_BIT_TO_PIN(SPI_CLK), + UPBOARD_UP_BIT_TO_PIN(SPI_CS0), + UPBOARD_UP_BIT_TO_PIN(SPI_CS1), +}; + +static const enum upboard_pin_mode upboard_up_spi2_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, 
+}; + +static_assert(ARRAY_SIZE(upboard_up_spi2_modes) == ARRAY_SIZE(upboard_up_spi2_pins)); + +static const unsigned int upboard_up_i2s0_pins[] = { + UPBOARD_UP_BIT_TO_PIN(I2S_FRM), + UPBOARD_UP_BIT_TO_PIN(I2S_CLK), + UPBOARD_UP_BIT_TO_PIN(I2S_DIN), + UPBOARD_UP_BIT_TO_PIN(I2S_DOUT), +}; + +static const enum upboard_pin_mode upboard_up_i2s0_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, +}; + +static_assert(ARRAY_SIZE(upboard_up_i2s0_pins) == ARRAY_SIZE(upboard_up_i2s0_modes)); + +static const unsigned int upboard_up_pwm0_pins[] = { + UPBOARD_UP_BIT_TO_PIN(PWM0), +}; + +static const unsigned int upboard_up_pwm1_pins[] = { + UPBOARD_UP_BIT_TO_PIN(PWM1), +}; + +static const unsigned int upboard_up_adc0_pins[] = { + UPBOARD_UP_BIT_TO_PIN(ADC0), +}; + +#define UPBOARD_PINGROUP(n, p, m) \ +{ \ + .grp = PINCTRL_PINGROUP(n, p, ARRAY_SIZE(p)), \ + .mode = __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(m), const enum upboard_pin_mode *), \ + 0, m), \ + .modes = __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(m), const enum upboard_pin_mode *), \ + m, NULL), \ +} + +static const struct upboard_pingroup upboard_up_pin_groups[] = { + UPBOARD_PINGROUP("uart1_grp", upboard_up_uart1_pins, &upboard_up_uart1_modes[0]), + UPBOARD_PINGROUP("i2c0_grp", upboard_up_i2c0_pins, UPBOARD_PIN_MODE_GPIO_OUT), + UPBOARD_PINGROUP("i2c1_grp", upboard_up_i2c1_pins, UPBOARD_PIN_MODE_GPIO_OUT), + UPBOARD_PINGROUP("spi2_grp", upboard_up_spi2_pins, &upboard_up_spi2_modes[0]), + UPBOARD_PINGROUP("i2s0_grp", upboard_up_i2s0_pins, &upboard_up_i2s0_modes[0]), + UPBOARD_PINGROUP("pwm0_grp", upboard_up_pwm0_pins, UPBOARD_PIN_MODE_GPIO_OUT), + UPBOARD_PINGROUP("pwm1_grp", upboard_up_pwm1_pins, UPBOARD_PIN_MODE_GPIO_OUT), + UPBOARD_PINGROUP("adc0_grp", upboard_up_adc0_pins, UPBOARD_PIN_MODE_GPIO_IN), +}; + +static const char * const upboard_up_uart1_groups[] = { "uart1_grp" }; +static const char * const upboard_up_i2c0_groups[] = { "i2c0_grp" }; +static const char * const upboard_up_i2c1_groups[] = { "i2c1_grp" }; +static const char * const upboard_up_spi2_groups[] = { "spi2_grp" }; +static const char * const upboard_up_i2s0_groups[] = { "i2s0_grp" }; +static const char * const upboard_up_pwm0_groups[] = { "pwm0_grp" }; +static const char * const upboard_up_pwm1_groups[] = { "pwm1_grp" }; +static const char * const upboard_up_adc0_groups[] = { "adc0_grp" }; + +#define UPBOARD_FUNCTION(func, groups) PINCTRL_PINFUNCTION(func, groups, ARRAY_SIZE(groups)) + +static const struct pinfunction upboard_up_pin_functions[] = { + UPBOARD_FUNCTION("uart1", upboard_up_uart1_groups), + UPBOARD_FUNCTION("i2c0", upboard_up_i2c0_groups), + UPBOARD_FUNCTION("i2c1", upboard_up_i2c1_groups), + UPBOARD_FUNCTION("spi2", upboard_up_spi2_groups), + UPBOARD_FUNCTION("i2s0", upboard_up_i2s0_groups), + UPBOARD_FUNCTION("pwm0", upboard_up_pwm0_groups), + UPBOARD_FUNCTION("pwm1", upboard_up_pwm1_groups), + UPBOARD_FUNCTION("adc0", upboard_up_adc0_groups), +}; + +static const struct upboard_pinctrl_data upboard_up_pinctrl_data = { + .groups = &upboard_up_pin_groups[0], + .ngroups = ARRAY_SIZE(upboard_up_pin_groups), + .funcs = &upboard_up_pin_functions[0], + .nfuncs = ARRAY_SIZE(upboard_up_pin_functions), + .pin_header = &upboard_up_pin_header[0], + .ngpio = ARRAY_SIZE(upboard_up_pin_header), +}; + +#define UPBOARD_UP2_BIT_TO_PIN(bit) UPBOARD_UP2_BIT_##bit + +#define UPBOARD_UP2_PIN_NAME(id) \ + { \ + .number = UPBOARD_UP2_BIT_##id, \ + .name = #id, \ + 
} + +#define UPBOARD_UP2_PIN_MUX(bit, data) \ + { \ + .number = UPBOARD_UP2_BIT_##bit, \ + .name = "PINMUX_"#bit, \ + .drv_data = (void *)(data), \ + } + +#define UPBOARD_UP2_PIN_FUNC(id, data) \ + { \ + .number = UPBOARD_UP2_BIT_##id, \ + .name = #id, \ + .drv_data = (void *)(data), \ + } + +enum upboard_up2_fpgabit { + UPBOARD_UP2_BIT_UART1_TXD, + UPBOARD_UP2_BIT_UART1_RXD, + UPBOARD_UP2_BIT_UART1_RTS, + UPBOARD_UP2_BIT_UART1_CTS, + UPBOARD_UP2_BIT_GPIO3_ADC0, + UPBOARD_UP2_BIT_GPIO5_ADC2, + UPBOARD_UP2_BIT_GPIO6_ADC3, + UPBOARD_UP2_BIT_GPIO11, + UPBOARD_UP2_BIT_EXHAT_LVDS1n, + UPBOARD_UP2_BIT_EXHAT_LVDS1p, + UPBOARD_UP2_BIT_SPI2_TXD, + UPBOARD_UP2_BIT_SPI2_RXD, + UPBOARD_UP2_BIT_SPI2_FS1, + UPBOARD_UP2_BIT_SPI2_FS0, + UPBOARD_UP2_BIT_SPI2_CLK, + UPBOARD_UP2_BIT_SPI1_TXD, + UPBOARD_UP2_BIT_SPI1_RXD, + UPBOARD_UP2_BIT_SPI1_FS1, + UPBOARD_UP2_BIT_SPI1_FS0, + UPBOARD_UP2_BIT_SPI1_CLK, + UPBOARD_UP2_BIT_I2C0_SCL, + UPBOARD_UP2_BIT_I2C0_SDA, + UPBOARD_UP2_BIT_I2C1_SCL, + UPBOARD_UP2_BIT_I2C1_SDA, + UPBOARD_UP2_BIT_PWM1, + UPBOARD_UP2_BIT_PWM0, + UPBOARD_UP2_BIT_EXHAT_LVDS0n, + UPBOARD_UP2_BIT_EXHAT_LVDS0p, + UPBOARD_UP2_BIT_GPIO24, + UPBOARD_UP2_BIT_GPIO10, + UPBOARD_UP2_BIT_GPIO2, + UPBOARD_UP2_BIT_GPIO1, + UPBOARD_UP2_BIT_EXHAT_LVDS3n, + UPBOARD_UP2_BIT_EXHAT_LVDS3p, + UPBOARD_UP2_BIT_EXHAT_LVDS4n, + UPBOARD_UP2_BIT_EXHAT_LVDS4p, + UPBOARD_UP2_BIT_EXHAT_LVDS5n, + UPBOARD_UP2_BIT_EXHAT_LVDS5p, + UPBOARD_UP2_BIT_I2S_SDO, + UPBOARD_UP2_BIT_I2S_SDI, + UPBOARD_UP2_BIT_I2S_WS_SYNC, + UPBOARD_UP2_BIT_I2S_BCLK, + UPBOARD_UP2_BIT_EXHAT_LVDS6n, + UPBOARD_UP2_BIT_EXHAT_LVDS6p, + UPBOARD_UP2_BIT_EXHAT_LVDS7n, + UPBOARD_UP2_BIT_EXHAT_LVDS7p, + UPBOARD_UP2_BIT_EXHAT_LVDS2n, + UPBOARD_UP2_BIT_EXHAT_LVDS2p, +}; + +static const struct pinctrl_pin_desc upboard_up2_pins[] = { + UPBOARD_UP2_PIN_NAME(UART1_TXD), + UPBOARD_UP2_PIN_NAME(UART1_RXD), + UPBOARD_UP2_PIN_NAME(UART1_RTS), + UPBOARD_UP2_PIN_NAME(UART1_CTS), + UPBOARD_UP2_PIN_NAME(GPIO3_ADC0), + UPBOARD_UP2_PIN_NAME(GPIO5_ADC2), + UPBOARD_UP2_PIN_NAME(GPIO6_ADC3), + UPBOARD_UP2_PIN_NAME(GPIO11), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS1n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS1p), + UPBOARD_UP2_PIN_NAME(SPI2_TXD), + UPBOARD_UP2_PIN_NAME(SPI2_RXD), + UPBOARD_UP2_PIN_NAME(SPI2_FS1), + UPBOARD_UP2_PIN_NAME(SPI2_FS0), + UPBOARD_UP2_PIN_NAME(SPI2_CLK), + UPBOARD_UP2_PIN_NAME(SPI1_TXD), + UPBOARD_UP2_PIN_NAME(SPI1_RXD), + UPBOARD_UP2_PIN_NAME(SPI1_FS1), + UPBOARD_UP2_PIN_NAME(SPI1_FS0), + UPBOARD_UP2_PIN_NAME(SPI1_CLK), + UPBOARD_UP2_PIN_MUX(I2C0_SCL, &upboard_i2c0_reg), + UPBOARD_UP2_PIN_MUX(I2C0_SDA, &upboard_i2c0_reg), + UPBOARD_UP2_PIN_MUX(I2C1_SCL, &upboard_i2c1_reg), + UPBOARD_UP2_PIN_MUX(I2C1_SDA, &upboard_i2c1_reg), + UPBOARD_UP2_PIN_NAME(PWM1), + UPBOARD_UP2_PIN_NAME(PWM0), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS0n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS0p), + UPBOARD_UP2_PIN_MUX(GPIO24, &upboard_i2c0_reg), + UPBOARD_UP2_PIN_MUX(GPIO10, &upboard_i2c0_reg), + UPBOARD_UP2_PIN_MUX(GPIO2, &upboard_i2c1_reg), + UPBOARD_UP2_PIN_MUX(GPIO1, &upboard_i2c1_reg), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS3n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS3p), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS4n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS4p), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS5n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS5p), + UPBOARD_UP2_PIN_NAME(I2S_SDO), + UPBOARD_UP2_PIN_NAME(I2S_SDI), + UPBOARD_UP2_PIN_NAME(I2S_WS_SYNC), + UPBOARD_UP2_PIN_NAME(I2S_BCLK), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS6n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS6p), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS7n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS7p), + 
UPBOARD_UP2_PIN_NAME(EXHAT_LVDS2n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS2p), +}; + +static const unsigned int upboard_up2_pin_header[] = { + UPBOARD_UP2_BIT_TO_PIN(GPIO10), + UPBOARD_UP2_BIT_TO_PIN(GPIO24), + UPBOARD_UP2_BIT_TO_PIN(GPIO1), + UPBOARD_UP2_BIT_TO_PIN(GPIO2), + UPBOARD_UP2_BIT_TO_PIN(GPIO3_ADC0), + UPBOARD_UP2_BIT_TO_PIN(GPIO11), + UPBOARD_UP2_BIT_TO_PIN(SPI2_CLK), + UPBOARD_UP2_BIT_TO_PIN(SPI1_FS1), + UPBOARD_UP2_BIT_TO_PIN(SPI1_FS0), + UPBOARD_UP2_BIT_TO_PIN(SPI1_RXD), + UPBOARD_UP2_BIT_TO_PIN(SPI1_TXD), + UPBOARD_UP2_BIT_TO_PIN(SPI1_CLK), + UPBOARD_UP2_BIT_TO_PIN(PWM0), + UPBOARD_UP2_BIT_TO_PIN(PWM1), + UPBOARD_UP2_BIT_TO_PIN(UART1_TXD), + UPBOARD_UP2_BIT_TO_PIN(UART1_RXD), + UPBOARD_UP2_BIT_TO_PIN(UART1_CTS), + UPBOARD_UP2_BIT_TO_PIN(UART1_RTS), + UPBOARD_UP2_BIT_TO_PIN(I2S_BCLK), + UPBOARD_UP2_BIT_TO_PIN(I2S_WS_SYNC), + UPBOARD_UP2_BIT_TO_PIN(I2S_SDI), + UPBOARD_UP2_BIT_TO_PIN(I2S_SDO), + UPBOARD_UP2_BIT_TO_PIN(GPIO6_ADC3), + UPBOARD_UP2_BIT_TO_PIN(SPI2_FS1), + UPBOARD_UP2_BIT_TO_PIN(SPI2_RXD), + UPBOARD_UP2_BIT_TO_PIN(SPI2_TXD), + UPBOARD_UP2_BIT_TO_PIN(SPI2_FS0), + UPBOARD_UP2_BIT_TO_PIN(GPIO5_ADC2), +}; + +static const unsigned int upboard_up2_uart1_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(UART1_TXD), + UPBOARD_UP2_BIT_TO_PIN(UART1_RXD), + UPBOARD_UP2_BIT_TO_PIN(UART1_RTS), + UPBOARD_UP2_BIT_TO_PIN(UART1_CTS), +}; + +static const enum upboard_pin_mode upboard_up2_uart1_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, +}; + +static_assert(ARRAY_SIZE(upboard_up2_uart1_modes) == ARRAY_SIZE(upboard_up2_uart1_pins)); + +static const unsigned int upboard_up2_i2c0_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(I2C0_SCL), + UPBOARD_UP2_BIT_TO_PIN(I2C0_SDA), + UPBOARD_UP2_BIT_TO_PIN(GPIO24), + UPBOARD_UP2_BIT_TO_PIN(GPIO10), +}; + +static const unsigned int upboard_up2_i2c1_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(I2C1_SCL), + UPBOARD_UP2_BIT_TO_PIN(I2C1_SDA), + UPBOARD_UP2_BIT_TO_PIN(GPIO2), + UPBOARD_UP2_BIT_TO_PIN(GPIO1), +}; + +static const unsigned int upboard_up2_spi1_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(SPI1_TXD), + UPBOARD_UP2_BIT_TO_PIN(SPI1_RXD), + UPBOARD_UP2_BIT_TO_PIN(SPI1_FS1), + UPBOARD_UP2_BIT_TO_PIN(SPI1_FS0), + UPBOARD_UP2_BIT_TO_PIN(SPI1_CLK), +}; + +static const unsigned int upboard_up2_spi2_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(SPI2_TXD), + UPBOARD_UP2_BIT_TO_PIN(SPI2_RXD), + UPBOARD_UP2_BIT_TO_PIN(SPI2_FS1), + UPBOARD_UP2_BIT_TO_PIN(SPI2_FS0), + UPBOARD_UP2_BIT_TO_PIN(SPI2_CLK), +}; + +static const enum upboard_pin_mode upboard_up2_spi_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, +}; + +static_assert(ARRAY_SIZE(upboard_up2_spi_modes) == ARRAY_SIZE(upboard_up2_spi1_pins)); + +static_assert(ARRAY_SIZE(upboard_up2_spi_modes) == ARRAY_SIZE(upboard_up2_spi2_pins)); + +static const unsigned int upboard_up2_i2s0_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(I2S_BCLK), + UPBOARD_UP2_BIT_TO_PIN(I2S_WS_SYNC), + UPBOARD_UP2_BIT_TO_PIN(I2S_SDI), + UPBOARD_UP2_BIT_TO_PIN(I2S_SDO), +}; + +static const enum upboard_pin_mode upboard_up2_i2s0_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, +}; + +static_assert(ARRAY_SIZE(upboard_up2_i2s0_modes) == ARRAY_SIZE(upboard_up2_i2s0_pins)); + +static const unsigned int upboard_up2_pwm0_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(PWM0), +}; + +static const unsigned int upboard_up2_pwm1_pins[] = { + 
UPBOARD_UP2_BIT_TO_PIN(PWM1), +}; + +static const unsigned int upboard_up2_adc0_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(GPIO3_ADC0), +}; + +static const unsigned int upboard_up2_adc2_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(GPIO5_ADC2), +}; + +static const unsigned int upboard_up2_adc3_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(GPIO6_ADC3), +}; + +static const struct upboard_pingroup upboard_up2_pin_groups[] = { + UPBOARD_PINGROUP("uart1_grp", upboard_up2_uart1_pins, &upboard_up2_uart1_modes[0]), + UPBOARD_PINGROUP("i2c0_grp", upboard_up2_i2c0_pins, UPBOARD_PIN_MODE_FUNCTION), + UPBOARD_PINGROUP("i2c1_grp", upboard_up2_i2c1_pins, UPBOARD_PIN_MODE_FUNCTION), + UPBOARD_PINGROUP("spi1_grp", upboard_up2_spi1_pins, &upboard_up2_spi_modes[0]), + UPBOARD_PINGROUP("spi2_grp", upboard_up2_spi2_pins, &upboard_up2_spi_modes[0]), + UPBOARD_PINGROUP("i2s0_grp", upboard_up2_i2s0_pins, &upboard_up2_i2s0_modes[0]), + UPBOARD_PINGROUP("pwm0_grp", upboard_up2_pwm0_pins, UPBOARD_PIN_MODE_GPIO_OUT), + UPBOARD_PINGROUP("pwm1_grp", upboard_up2_pwm1_pins, UPBOARD_PIN_MODE_GPIO_OUT), + UPBOARD_PINGROUP("adc0_grp", upboard_up2_adc0_pins, UPBOARD_PIN_MODE_GPIO_IN), + UPBOARD_PINGROUP("adc2_grp", upboard_up2_adc2_pins, UPBOARD_PIN_MODE_GPIO_IN), + UPBOARD_PINGROUP("adc3_grp", upboard_up2_adc3_pins, UPBOARD_PIN_MODE_GPIO_IN), +}; + +static const char * const upboard_up2_uart1_groups[] = { "uart1_grp" }; +static const char * const upboard_up2_i2c0_groups[] = { "i2c0_grp" }; +static const char * const upboard_up2_i2c1_groups[] = { "i2c1_grp" }; +static const char * const upboard_up2_spi1_groups[] = { "spi1_grp" }; +static const char * const upboard_up2_spi2_groups[] = { "spi2_grp" }; +static const char * const upboard_up2_i2s0_groups[] = { "i2s0_grp" }; +static const char * const upboard_up2_pwm0_groups[] = { "pwm0_grp" }; +static const char * const upboard_up2_pwm1_groups[] = { "pwm1_grp" }; +static const char * const upboard_up2_adc0_groups[] = { "adc0_grp" }; +static const char * const upboard_up2_adc2_groups[] = { "adc2_grp" }; +static const char * const upboard_up2_adc3_groups[] = { "adc3_grp" }; + +static const struct pinfunction upboard_up2_pin_functions[] = { + UPBOARD_FUNCTION("uart1", upboard_up2_uart1_groups), + UPBOARD_FUNCTION("i2c0", upboard_up2_i2c0_groups), + UPBOARD_FUNCTION("i2c1", upboard_up2_i2c1_groups), + UPBOARD_FUNCTION("spi1", upboard_up2_spi1_groups), + UPBOARD_FUNCTION("spi2", upboard_up2_spi2_groups), + UPBOARD_FUNCTION("i2s0", upboard_up2_i2s0_groups), + UPBOARD_FUNCTION("pwm0", upboard_up2_pwm0_groups), + UPBOARD_FUNCTION("pwm1", upboard_up2_pwm1_groups), + UPBOARD_FUNCTION("adc0", upboard_up2_adc0_groups), + UPBOARD_FUNCTION("adc2", upboard_up2_adc2_groups), + UPBOARD_FUNCTION("adc3", upboard_up2_adc3_groups), +}; + +static const struct upboard_pinctrl_data upboard_up2_pinctrl_data = { + .groups = &upboard_up2_pin_groups[0], + .ngroups = ARRAY_SIZE(upboard_up2_pin_groups), + .funcs = &upboard_up2_pin_functions[0], + .nfuncs = ARRAY_SIZE(upboard_up2_pin_functions), + .pin_header = &upboard_up2_pin_header[0], + .ngpio = ARRAY_SIZE(upboard_up2_pin_header), +}; + +static int upboard_pinctrl_set_function(struct pinctrl_dev *pctldev, unsigned int offset) +{ + struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct upboard_pin *p = &pctrl->pins[offset]; + int ret; + + if (!p->funcbit) + return -EPERM; + + ret = regmap_field_write(p->enbit, 0); + if (ret) + return ret; + + return regmap_field_write(p->funcbit, 1); +} + +static int upboard_pinctrl_gpio_commit_enable(struct pinctrl_dev *pctldev, 
unsigned int offset) +{ + struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct upboard_pin *p = &pctrl->pins[offset]; + int ret; + + if (p->funcbit) { + ret = regmap_field_write(p->funcbit, 0); + if (ret) + return ret; + } + + return regmap_field_write(p->enbit, 1); +} + +static int upboard_pinctrl_gpio_request_enable(struct pinctrl_dev *pctldev, + struct pinctrl_gpio_range *range, + unsigned int offset) +{ + return upboard_pinctrl_gpio_commit_enable(pctldev, offset); +} + +static void upboard_pinctrl_gpio_commit_disable(struct pinctrl_dev *pctldev, unsigned int offset) +{ + struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct upboard_pin *p = &pctrl->pins[offset]; + + regmap_field_write(p->enbit, 0); +} + +static void upboard_pinctrl_gpio_disable_free(struct pinctrl_dev *pctldev, + struct pinctrl_gpio_range *range, unsigned int offset) +{ + upboard_pinctrl_gpio_commit_disable(pctldev, offset); +} + +static int upboard_pinctrl_gpio_commit_direction(struct pinctrl_dev *pctldev, unsigned int offset, + bool input) +{ + struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct upboard_pin *p = &pctrl->pins[offset]; + + return regmap_field_write(p->dirbit, input); +} + +static int upboard_pinctrl_gpio_set_direction(struct pinctrl_dev *pctldev, + struct pinctrl_gpio_range *range, + unsigned int offset, bool input) +{ + return upboard_pinctrl_gpio_commit_direction(pctldev, offset, input); +} + +static int upboard_pinctrl_set_mux(struct pinctrl_dev *pctldev, unsigned int func_selector, + unsigned int group_selector) +{ + struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + const struct upboard_pinctrl_data *pctrl_data = pctrl->pctrl_data; + const struct upboard_pingroup *upgroups = pctrl_data->groups; + struct group_desc *grp; + unsigned int mode, i; + int ret; + + grp = pinctrl_generic_get_group(pctldev, group_selector); + if (!grp) + return -EINVAL; + + for (i = 0; i < grp->grp.npins; i++) { + mode = upgroups[group_selector].mode ?: upgroups[group_selector].modes[i]; + if (mode == UPBOARD_PIN_MODE_FUNCTION) { + ret = upboard_pinctrl_set_function(pctldev, grp->grp.pins[i]); + if (ret) + return ret; + + continue; + } + + ret = upboard_pinctrl_gpio_commit_enable(pctldev, grp->grp.pins[i]); + if (ret) + return ret; + + ret = upboard_pinctrl_gpio_commit_direction(pctldev, grp->grp.pins[i], + mode == UPBOARD_PIN_MODE_GPIO_IN); + if (ret) + return ret; + } + + return 0; +} + +static const struct pinmux_ops upboard_pinmux_ops = { + .get_functions_count = pinmux_generic_get_function_count, + .get_function_name = pinmux_generic_get_function_name, + .get_function_groups = pinmux_generic_get_function_groups, + .set_mux = upboard_pinctrl_set_mux, + .gpio_request_enable = upboard_pinctrl_gpio_request_enable, + .gpio_disable_free = upboard_pinctrl_gpio_disable_free, + .gpio_set_direction = upboard_pinctrl_gpio_set_direction, +}; + +static int upboard_pinctrl_pin_get_mode(struct pinctrl_dev *pctldev, unsigned int pin) +{ + struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct upboard_pin *p = &pctrl->pins[pin]; + unsigned int val; + int ret; + + if (p->funcbit) { + ret = regmap_field_read(p->funcbit, &val); + if (ret) + return ret; + if (val) + return UPBOARD_PIN_MODE_FUNCTION; + } + + ret = regmap_field_read(p->enbit, &val); + if (ret) + return ret; + if (!val) + return UPBOARD_PIN_MODE_DISABLED; + + ret = regmap_field_read(p->dirbit, &val); + if (ret) + return ret; + + return val ?
UPBOARD_PIN_MODE_GPIO_IN : UPBOARD_PIN_MODE_GPIO_OUT; +} + +static void upboard_pinctrl_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, + unsigned int offset) +{ + int ret; + + ret = upboard_pinctrl_pin_get_mode(pctldev, offset); + if (ret == UPBOARD_PIN_MODE_FUNCTION) + seq_puts(s, "mode function "); + else if (ret == UPBOARD_PIN_MODE_DISABLED) + seq_puts(s, "HIGH-Z "); + else if (ret < 0) + seq_puts(s, "N/A "); + else + seq_printf(s, "GPIO (%s) ", str_input_output(ret == UPBOARD_PIN_MODE_GPIO_IN)); +} + +static const struct pinctrl_ops upboard_pinctrl_ops = { + .get_groups_count = pinctrl_generic_get_group_count, + .get_group_name = pinctrl_generic_get_group_name, + .get_group_pins = pinctrl_generic_get_group_pins, + .pin_dbg_show = upboard_pinctrl_dbg_show, +}; + +static int upboard_gpio_request(struct gpio_chip *gc, unsigned int offset) +{ + struct gpiochip_fwd *fwd = gpiochip_get_data(gc); + struct upboard_pinctrl *pctrl = gpiochip_fwd_get_data(fwd); + unsigned int pin = pctrl->pctrl_data->pin_header[offset]; + struct gpio_desc *desc; + int ret; + + ret = pinctrl_gpio_request(gc, offset); + if (ret) + return ret; + + desc = gpiod_get_index(pctrl->dev, "external", pin, 0); + if (IS_ERR(desc)) { + pinctrl_gpio_free(gc, offset); + return PTR_ERR(desc); + } + + return gpiochip_fwd_desc_add(fwd, desc, offset); +} + +static void upboard_gpio_free(struct gpio_chip *gc, unsigned int offset) +{ + struct gpiochip_fwd *fwd = gpiochip_get_data(gc); + + gpiochip_fwd_desc_free(fwd, offset); + pinctrl_gpio_free(gc, offset); +} + +static int upboard_gpio_get_direction(struct gpio_chip *gc, unsigned int offset) +{ + struct gpiochip_fwd *fwd = gpiochip_get_data(gc); + struct upboard_pinctrl *pctrl = gpiochip_fwd_get_data(fwd); + unsigned int pin = pctrl->pctrl_data->pin_header[offset]; + int mode; + + /* If the pin is in function mode or high-z, input direction is returned */ + mode = upboard_pinctrl_pin_get_mode(pctrl->pctldev, pin); + if (mode < 0) + return mode; + + if (mode == UPBOARD_PIN_MODE_GPIO_OUT) + return GPIO_LINE_DIRECTION_OUT; + + return GPIO_LINE_DIRECTION_IN; +} + +static int upboard_gpio_direction_input(struct gpio_chip *gc, unsigned int offset) +{ + struct gpiochip_fwd *fwd = gpiochip_get_data(gc); + int ret; + + ret = pinctrl_gpio_direction_input(gc, offset); + if (ret) + return ret; + + return gpiochip_fwd_gpio_direction_input(fwd, offset); +} + +static int upboard_gpio_direction_output(struct gpio_chip *gc, unsigned int offset, int value) +{ + struct gpiochip_fwd *fwd = gpiochip_get_data(gc); + int ret; + + ret = pinctrl_gpio_direction_output(gc, offset); + if (ret) + return ret; + + return gpiochip_fwd_gpio_direction_output(fwd, offset, value); +} + +static int upboard_pinctrl_register_groups(struct upboard_pinctrl *pctrl) +{ + const struct upboard_pingroup *groups = pctrl->pctrl_data->groups; + size_t ngroups = pctrl->pctrl_data->ngroups; + unsigned int i; + int ret; + + for (i = 0; i < ngroups; i++) { + ret = pinctrl_generic_add_group(pctrl->pctldev, groups[i].grp.name, + groups[i].grp.pins, groups[i].grp.npins, pctrl); + if (ret < 0) + return ret; + } + + return 0; +} + +static int upboard_pinctrl_register_functions(struct upboard_pinctrl *pctrl) +{ + const struct pinfunction *funcs = pctrl->pctrl_data->funcs; + size_t nfuncs = pctrl->pctrl_data->nfuncs; + unsigned int i; + int ret; + + for (i = 0; i < nfuncs ; i++) { + ret = pinmux_generic_add_function(pctrl->pctldev, funcs[i].name, + funcs[i].groups, funcs[i].ngroups, NULL); + if (ret < 0) + return ret; + } + + 
return 0; +} + +static const struct pinctrl_map pinctrl_map_apl01[] = { + PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:00", "pwm0_grp", "pwm0"), + PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:00", "pwm1_grp", "pwm1"), + PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:00", "uart1_grp", "uart1"), + PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:02", "i2c0_grp", "i2c0"), + PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:02", "i2c1_grp", "i2c1"), + PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:01", "ssp0_grp", "ssp0"), +}; + +static const struct upboard_pinctrl_map upboard_pinctrl_map_apl01 = { + .maps = &pinctrl_map_apl01[0], + .nmaps = ARRAY_SIZE(pinctrl_map_apl01), +}; + +static const struct dmi_system_id dmi_platform_info[] = { + { + /* UP Squared */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "AAEON"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "UP-APL01"), + }, + .driver_data = (void *)&upboard_pinctrl_map_apl01, + }, + { } +}; + +static int upboard_pinctrl_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct upboard_fpga *fpga = dev_get_drvdata(dev->parent); + const struct upboard_pinctrl_map *board_map; + const struct dmi_system_id *dmi_id; + struct pinctrl_desc *pctldesc; + struct upboard_pinctrl *pctrl; + struct upboard_pin *pins; + struct gpiochip_fwd *fwd; + struct pinctrl *pinctrl; + struct gpio_chip *chip; + unsigned int i; + int ret; + + pctldesc = devm_kzalloc(dev, sizeof(*pctldesc), GFP_KERNEL); + if (!pctldesc) + return -ENOMEM; + + pctrl = devm_kzalloc(dev, sizeof(*pctrl), GFP_KERNEL); + if (!pctrl) + return -ENOMEM; + + switch (fpga->fpga_data->type) { + case UPBOARD_UP_FPGA: + pctldesc->pins = upboard_up_pins; + pctldesc->npins = ARRAY_SIZE(upboard_up_pins); + pctrl->pctrl_data = &upboard_up_pinctrl_data; + break; + case UPBOARD_UP2_FPGA: + pctldesc->pins = upboard_up2_pins; + pctldesc->npins = ARRAY_SIZE(upboard_up2_pins); + pctrl->pctrl_data = &upboard_up2_pinctrl_data; + break; + default: + return dev_err_probe(dev, -ENODEV, "Unsupported device type %d\n", + fpga->fpga_data->type); + } + + dmi_id = dmi_first_match(dmi_platform_info); + if (!dmi_id) + return dev_err_probe(dev, -ENODEV, "Unsupported board\n"); + + board_map = (const struct upboard_pinctrl_map *)dmi_id->driver_data; + + pctldesc->name = dev_name(dev); + pctldesc->owner = THIS_MODULE; + pctldesc->pctlops = &upboard_pinctrl_ops; + pctldesc->pmxops = &upboard_pinmux_ops; + + pctrl->dev = dev; + + pins = devm_kcalloc(dev, pctldesc->npins, sizeof(*pins), GFP_KERNEL); + if (!pins) + return -ENOMEM; + + /* Initialize pins */ + for (i = 0; i < pctldesc->npins; i++) { + const struct pinctrl_pin_desc *pin_desc = &pctldesc->pins[i]; + unsigned int regoff = pin_desc->number / UPBOARD_REGISTER_SIZE; + unsigned int lsb = pin_desc->number % UPBOARD_REGISTER_SIZE; + struct reg_field * const fld_func = pin_desc->drv_data; + struct upboard_pin *pin = &pins[i]; + struct reg_field fldconf = {}; + + if (fld_func) { + pin->funcbit = devm_regmap_field_alloc(dev, fpga->regmap, *fld_func); + if (IS_ERR(pin->funcbit)) + return PTR_ERR(pin->funcbit); + } + + fldconf.reg = UPBOARD_REG_GPIO_EN0 + regoff; + fldconf.lsb = lsb; + fldconf.msb = lsb; + pin->enbit = devm_regmap_field_alloc(dev, fpga->regmap, fldconf); + if (IS_ERR(pin->enbit)) + return PTR_ERR(pin->enbit); + + fldconf.reg = UPBOARD_REG_GPIO_DIR0 + regoff; + fldconf.lsb = lsb; + fldconf.msb = lsb; + pin->dirbit = devm_regmap_field_alloc(dev, fpga->regmap, fldconf); + if (IS_ERR(pin->dirbit)) + return 
PTR_ERR(pin->dirbit); + } + + pctrl->pins = pins; + + ret = devm_pinctrl_register_and_init(dev, pctldesc, pctrl, &pctrl->pctldev); + if (ret) + return dev_err_probe(dev, ret, "Failed to register pinctrl\n"); + + ret = upboard_pinctrl_register_groups(pctrl); + if (ret) + return dev_err_probe(dev, ret, "Failed to register groups\n"); + + ret = upboard_pinctrl_register_functions(pctrl); + if (ret) + return dev_err_probe(dev, ret, "Failed to register functions\n"); + + ret = devm_pinctrl_register_mappings(dev, board_map->maps, board_map->nmaps); + if (ret) + return ret; + + pinctrl = devm_pinctrl_get_select_default(dev); + if (IS_ERR(pinctrl)) + return dev_err_probe(dev, PTR_ERR(pinctrl), "Failed to select pinctrl\n"); + + ret = pinctrl_enable(pctrl->pctldev); + if (ret) + return ret; + + fwd = devm_gpiochip_fwd_alloc(dev, pctrl->pctrl_data->ngpio); + if (IS_ERR(fwd)) + return dev_err_probe(dev, PTR_ERR(fwd), "Failed to allocate the gpiochip forwarder\n"); + + chip = gpiochip_fwd_get_gpiochip(fwd); + chip->request = upboard_gpio_request; + chip->free = upboard_gpio_free; + chip->get_direction = upboard_gpio_get_direction; + chip->direction_output = upboard_gpio_direction_output; + chip->direction_input = upboard_gpio_direction_input; + + ret = gpiochip_fwd_register(fwd, pctrl); + if (ret) + return dev_err_probe(dev, ret, "Failed to register the gpiochip forwarder\n"); + + return gpiochip_add_sparse_pin_range(chip, dev_name(dev), 0, pctrl->pctrl_data->pin_header, + pctrl->pctrl_data->ngpio); +} + +static struct platform_driver upboard_pinctrl_driver = { + .driver = { + .name = "upboard-pinctrl", + }, + .probe = upboard_pinctrl_probe, +}; +module_platform_driver(upboard_pinctrl_driver); + +MODULE_AUTHOR("Thomas Richard Date: Thu, 14 Aug 2025 11:59:27 +0200 Subject: [PATCH 0233/3206] bpf: Add dynptr type for skb metadata Add a dynptr type, similar to skb dynptr, but for skb metadata access. The dynptr provides an alternative to __sk_buff->data_meta for accessing the custom metadata area allocated using the bpf_xdp_adjust_meta() helper. More importantly, it abstracts away where the storage for the custom metadata lives, which opens up the way to persist the metadata by relocating it as the skb travels through the network stack layers. Writes to skb metadata invalidate any existing skb payload and metadata slices. While this is more restrictive than needed at the moment, it leaves the door open to reallocating the metadata on writes, and should be only a minor inconvenience to the users. Only the program types which can access __sk_buff->data_meta today are allowed to create a dynptr for skb metadata at the moment. We need to modify the network stack to persist the metadata across layers before opening up access to other BPF hooks. Once more BPF hooks gain access to skb_meta dynptr, we will also need to add a read-only variant of the helper similar to bpf_dynptr_from_skb_rdonly. skb_meta dynptr ops are stubbed out and implemented by subsequent changes.
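For illustration only, a minimal sketch of the intended usage from a TC program (the program name is made up; the kfunc declaration mirrors the one this series adds to the selftests; with this patch alone the read still returns -EOPNOTSUPP, since the dynptr ops are stubbed until the follow-up change):

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include <bpf/bpf_helpers.h>

	extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
					    struct bpf_dynptr *ptr__uninit) __ksym;

	SEC("tc")
	int peek_meta(struct __sk_buff *ctx)
	{
		struct bpf_dynptr meta;
		__u8 byte;

		/* flags must be zero; on error the dynptr is set to null */
		if (bpf_dynptr_from_skb_meta(ctx, 0, &meta))
			return TC_ACT_SHOT;

		/* copies out of the metadata area once the ops land */
		if (bpf_dynptr_read(&byte, sizeof(byte), &meta, 0, 0))
			return TC_ACT_SHOT;

		return TC_ACT_OK;
	}

	char _license[] SEC("license") = "GPL";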
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-1-8a39e636e0fb@cloudflare.com --- include/linux/bpf.h | 7 ++++++- kernel/bpf/helpers.c | 7 +++++++ kernel/bpf/log.c | 2 ++ kernel/bpf/verifier.c | 15 +++++++++++++-- net/core/filter.c | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 69 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc700925b802fe..ec527b476dba71 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -767,12 +767,15 @@ enum bpf_type_flag { */ MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ + DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ - | DYNPTR_TYPE_XDP) + | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1358,6 +1361,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */ BPF_DYNPTR_TYPE_XDP, + /* Points to skb_metadata_end()-skb_metadata_len() */ + BPF_DYNPTR_TYPE_SKB_META, }; int bpf_dynptr_check_size(u32 size); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6b4877e85a68c9..9552b32208c541 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1780,6 +1780,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); + case BPF_DYNPTR_TYPE_SKB_META: + return -EOPNOTSUPP; /* not implemented */ default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1836,6 +1838,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); + case BPF_DYNPTR_TYPE_SKB_META: + return -EOPNOTSUPP; /* not implemented */ default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -1882,6 +1886,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: + case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: @@ -2710,6 +2715,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); return buffer__opt; } + case BPF_DYNPTR_TYPE_SKB_META: + return NULL; /* not implemented */ default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 38050f4ee40030..e4983c1303e760 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -498,6 +498,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type) return "skb"; case BPF_DYNPTR_TYPE_XDP: return "xdp"; + case BPF_DYNPTR_TYPE_SKB_META: + return "skb_meta"; case BPF_DYNPTR_TYPE_INVALID: return ""; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af690..5964bed40ffbf0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ 
-674,6 +674,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_SKB; case DYNPTR_TYPE_XDP: return BPF_DYNPTR_TYPE_XDP; + case DYNPTR_TYPE_SKB_META: + return BPF_DYNPTR_TYPE_SKB_META; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -690,6 +692,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_SKB; case BPF_DYNPTR_TYPE_XDP: return DYNPTR_TYPE_XDP; + case BPF_DYNPTR_TYPE_SKB_META: + return DYNPTR_TYPE_SKB_META; default: return 0; } @@ -2274,7 +2278,8 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg) { return base_type(reg->type) == PTR_TO_MEM && - (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP); + (reg->type & + (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META)); } /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ @@ -11641,7 +11646,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; - if (dynptr_type == BPF_DYNPTR_TYPE_SKB) + if (dynptr_type == BPF_DYNPTR_TYPE_SKB || + dynptr_type == BPF_DYNPTR_TYPE_SKB_META) /* this will trigger clear_all_pkt_pointers(), which will * invalidate all dynptr slices associated with the skb */ @@ -12228,6 +12234,7 @@ enum special_kfunc_type { KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, + KF_bpf_dynptr_from_skb_meta, KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, @@ -12277,9 +12284,11 @@ BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_from_skb_meta) #else BTF_ID_UNUSED BTF_ID_UNUSED +BTF_ID_UNUSED #endif BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) @@ -13253,6 +13262,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_SKB; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { + dynptr_arg_type |= DYNPTR_TYPE_SKB_META; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; diff --git a/net/core/filter.c b/net/core/filter.c index da391e2b0788d0..31b4b50dbadf3f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12007,6 +12007,36 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, return 0; } +/** + * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. + * @skb_: socket buffer carrying the metadata + * @flags: future use, must be zero + * @ptr__uninit: dynptr to initialize + * + * Set up a dynptr for access to the metadata area earlier allocated from the + * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to + * &__sk_buff->data_meta. 
+ * + * Return: + * * %0 - dynptr ready to use + * * %-EINVAL - invalid flags, dynptr set to null + */ +__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)skb_; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + + return 0; +} + __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { @@ -12181,6 +12211,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) +BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) + BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) @@ -12202,6 +12236,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .set = &bpf_kfunc_check_set_skb, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb_meta, +}; + static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, @@ -12237,6 +12276,8 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); From 6877cd392baecf816c2ba896a9d42874628004a5 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:28 +0200 Subject: [PATCH 0234/3206] bpf: Enable read/write access to skb metadata through a dynptr Now that we can create a dynptr to skb metadata, make reads from the metadata area possible with bpf_dynptr_read() or through a bpf_dynptr_slice(), and make writes to the metadata area possible with bpf_dynptr_write() or through a bpf_dynptr_slice_rdwr(). Note that for cloned skbs which share data with the original, we limit the skb metadata dynptr to be read-only since we don't unclone on a bpf_dynptr_write to metadata.
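For illustration, a sketch of the write path this enables (program name and tag value are made up; assumes the selftest-style kfunc declaration; not part of the patch itself):

	SEC("tc")
	int mark_meta(struct __sk_buff *ctx)
	{
		struct bpf_dynptr meta;
		__u8 tag = 0xaa;

		bpf_dynptr_from_skb_meta(ctx, 0, &meta);

		/* Fails with -EINVAL on a cloned skb (read-only dynptr) or
		 * nonzero flags, and with -E2BIG when the write would fall
		 * outside the metadata area. */
		if (bpf_dynptr_write(&meta, 0, &tag, sizeof(tag), 0))
			return TC_ACT_SHOT;

		return TC_ACT_OK;
	}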
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-2-8a39e636e0fb@cloudflare.com --- include/linux/filter.h | 6 ++++++ kernel/bpf/helpers.c | 10 +++++++--- net/core/filter.c | 16 ++++++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e7fd3ee759e07..9ed21b65e2e968 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1784,6 +1784,7 @@ int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1818,6 +1819,11 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi unsigned long len, bool flush) { } + +static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return NULL; +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9552b32208c541..cdffd74ddbe65d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1781,7 +1781,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_SKB_META: - return -EOPNOTSUPP; /* not implemented */ + memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1839,7 +1840,10 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: - return -EOPNOTSUPP; /* not implemented */ + if (flags) + return -EINVAL; + memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -2716,7 +2720,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, return buffer__opt; } case BPF_DYNPTR_TYPE_SKB_META: - return NULL; /* not implemented */ + return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; diff --git a/net/core/filter.c b/net/core/filter.c index 31b4b50dbadf3f..63f3baee2dafa7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11990,6 +11990,16 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return func; } +/** + * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. + * @skb: socket buffer carrying the metadata + * @offset: offset into the metadata area, must be <= skb_metadata_len() + */ +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) @@ -12017,6 +12027,9 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, * XDP context with bpf_xdp_adjust_meta(). 
Serves as an alternative to * &__sk_buff->data_meta. * + * If passed @skb_ is a clone which shares the data with the original, the + * dynptr will be read-only. This limitation may be lifted in the future. + * * Return: * * %0 - dynptr ready to use * * %-EINVAL - invalid flags, dynptr set to null @@ -12034,6 +12047,9 @@ __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + if (skb_cloned(skb)) + bpf_dynptr_set_rdonly(ptr); + return 0; } From 0e74eb4d57f00e6103ac23ce2312766c25ad88f6 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:29 +0200 Subject: [PATCH 0235/3206] selftests/bpf: Cover verifier checks for skb_meta dynptr type dynptr for skb metadata behaves the same way as the dynptr for skb data with one exception - writes to skb_meta dynptr don't invalidate existing skb and skb_meta slices. Duplicate those the skb dynptr tests which we can, since bpf_dynptr_from_skb_meta kfunc can be called only from TC BPF, to cover the skb_meta dynptr verifier checks. Also add a couple of new tests (skb_data_valid_*) to ensure we don't invalidate the slices in the mentioned case, which are specific to skb_meta dynptr. Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-3-8a39e636e0fb@cloudflare.com --- .../testing/selftests/bpf/prog_tests/dynptr.c | 2 + .../testing/selftests/bpf/progs/dynptr_fail.c | 258 ++++++++++++++++++ .../selftests/bpf/progs/dynptr_success.c | 55 ++++ 3 files changed, 315 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index 9b2d9ceda21023..b9f86cb91e81b5 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -32,6 +32,8 @@ static struct { {"test_ringbuf", SETUP_SYSCALL_SLEEP}, {"test_skb_readonly", SETUP_SKB_PROG}, {"test_dynptr_skb_data", SETUP_SKB_PROG}, + {"test_dynptr_skb_meta_data", SETUP_SKB_PROG}, + {"test_dynptr_skb_meta_flags", SETUP_SKB_PROG}, {"test_adjust", SETUP_SYSCALL_SLEEP}, {"test_adjust_err", SETUP_SYSCALL_SLEEP}, {"test_zero_size_dynptr", SETUP_SYSCALL_SLEEP}, diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index bd8f15229f5c73..dda6a8dada826c 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -269,6 +269,26 @@ int data_slice_out_of_bounds_skb(struct __sk_buff *skb) return SK_PASS; } +/* A metadata slice can't be accessed out of bounds */ +SEC("?tc") +__failure __msg("value is outside of the allowed memory range") +int data_slice_out_of_bounds_skb_meta(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + /* this should fail */ + *(md + 1) = 42; + + return SK_PASS; +} + SEC("?raw_tp") __failure __msg("value is outside of the allowed memory range") int data_slice_out_of_bounds_map_value(void *ctx) @@ -1089,6 +1109,26 @@ int skb_invalid_slice_write(struct __sk_buff *skb) return SK_PASS; } +/* bpf_dynptr_slice()s are read-only and cannot be written to */ +SEC("?tc") +__failure __msg("R{{[0-9]+}} cannot write into rdonly_mem") +int skb_meta_invalid_slice_write(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 
*md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + /* The read-only data slice is invalidated whenever a helper changes packet data */ SEC("?tc") __failure __msg("invalid mem access 'scalar'") @@ -1192,6 +1232,188 @@ int skb_invalid_data_slice4(struct __sk_buff *skb) return SK_PASS; } +/* Read-only skb data slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *d; + + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + d = bpf_dynptr_slice(&data, 0, NULL, sizeof(*d)); + if (!d) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + val = *d; + + return SK_PASS; +} + +/* Read-write skb data slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *d; + + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + d = bpf_dynptr_slice_rdwr(&data, 0, NULL, sizeof(*d)); + if (!d) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + *d = 42; + + return SK_PASS; +} + +/* Read-only skb metadata slice is invalidated on write to skb data */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_meta_slice_invalid_after_payload_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *md; + + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&data, 0, "x", 1, 0); + + /* this should fail */ + val = *md; + + return SK_PASS; +} + +/* Read-write skb metadata slice is invalidated on write to skb data slice */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_meta_slice_invalid_after_payload_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *md; + + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&data, 0, "x", 1, 0); + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + +/* Read-only skb metadata slice is invalidated whenever a helper changes packet data */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_meta_slice_invalid_after_payload_helper(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + if (bpf_skb_pull_data(skb, skb->len)) + return SK_DROP; + + /* this should fail */ + val = *md; + + return SK_PASS; +} + +/* Read-write skb metadata slice is invalidated whenever a helper changes packet data */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_meta_slice_invalid_after_payload_helper(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + if (bpf_skb_pull_data(skb, skb->len)) + return SK_DROP; + + /* this should fail 
*/ + *md = 42; + + return SK_PASS; +} + +/* Read-only skb metadata slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_meta_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + val = *md; + + return SK_PASS; +} + +/* Read-write skb metadata slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_meta_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + /* The read-only data slice is invalidated whenever a helper changes packet data */ SEC("?xdp") __failure __msg("invalid mem access 'scalar'") @@ -1255,6 +1477,19 @@ int skb_invalid_ctx(void *ctx) return 0; } +/* Only supported prog type can create skb_meta-type dynptrs */ +SEC("?raw_tp") +__failure __msg("calling kernel function bpf_dynptr_from_skb_meta is not allowed") +int skb_meta_invalid_ctx(void *ctx) +{ + struct bpf_dynptr meta; + + /* this should fail */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + + return 0; +} + SEC("fentry/skb_tx_error") __failure __msg("must be referenced or trusted") int BPF_PROG(skb_invalid_ctx_fentry, void *skb) @@ -1665,6 +1900,29 @@ int clone_skb_packet_data(struct __sk_buff *skb) return 0; } +/* A skb clone's metadata slice becomes invalid anytime packet data changes */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int clone_skb_packet_meta(struct __sk_buff *skb) +{ + struct bpf_dynptr clone, meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + bpf_dynptr_clone(&meta, &clone); + md = bpf_dynptr_slice_rdwr(&clone, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + if (bpf_skb_pull_data(skb, skb->len)) + return SK_DROP; + + /* this should fail */ + *md = 42; + + return 0; +} + /* A xdp clone's data slices should be invalid anytime packet data changes */ SEC("?xdp") __failure __msg("invalid mem access 'scalar'") diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index 8315273cb900c2..127dea342e5a67 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -211,6 +211,61 @@ int test_dynptr_skb_data(struct __sk_buff *skb) return 1; } +SEC("?tc") +int test_dynptr_skb_meta_data(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + int ret; + + err = 1; + ret = bpf_dynptr_from_skb_meta(skb, 0, &meta); + if (ret) + return 1; + + /* This should return NULL. Must use bpf_dynptr_slice API */ + err = 2; + md = bpf_dynptr_data(&meta, 0, sizeof(*md)); + if (md) + return 1; + + err = 0; + return 1; +} + +/* Check that skb metadata dynptr ops don't accept any flags. 
*/ +SEC("?tc") +int test_dynptr_skb_meta_flags(struct __sk_buff *skb) +{ + const __u64 INVALID_FLAGS = ~0ULL; + struct bpf_dynptr meta; + __u8 buf; + int ret; + + err = 1; + ret = bpf_dynptr_from_skb_meta(skb, INVALID_FLAGS, &meta); + if (ret != -EINVAL) + return 1; + + err = 2; + ret = bpf_dynptr_from_skb_meta(skb, 0, &meta); + if (ret) + return 1; + + err = 3; + ret = bpf_dynptr_read(&buf, 0, &meta, 0, INVALID_FLAGS); + if (ret != -EINVAL) + return 1; + + err = 4; + ret = bpf_dynptr_write(&meta, 0, &buf, 0, INVALID_FLAGS); + if (ret != -EINVAL) + return 1; + + err = 0; + return 1; +} + SEC("tp/syscalls/sys_enter_nanosleep") int test_adjust(void *ctx) { From 6dfd5e01e1a7728e162f721cd8adf5ecd24fbc80 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:30 +0200 Subject: [PATCH 0236/3206] selftests/bpf: Pass just bpf_map to xdp_context_test helper Prepare for parametrizing the xdp_context tests. The assert_test_result helper doesn't need the whole skeleton. Pass just what it needs. Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Reviewed-by: Jesse Brandeburg Acked-by: Eduard Zingerman Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-4-8a39e636e0fb@cloudflare.com --- .../selftests/bpf/prog_tests/xdp_context_test_run.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index b9d9f0a502cea7..0134651d94ab08 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -156,15 +156,14 @@ static int send_test_packet(int ifindex) return -1; } -static void assert_test_result(struct test_xdp_meta *skel) +static void assert_test_result(const struct bpf_map *result_map) { int err; __u32 map_key = 0; __u8 map_value[TEST_PAYLOAD_LEN]; - err = bpf_map__lookup_elem(skel->maps.test_result, &map_key, - sizeof(map_key), &map_value, - TEST_PAYLOAD_LEN, BPF_ANY); + err = bpf_map__lookup_elem(result_map, &map_key, sizeof(map_key), + &map_value, TEST_PAYLOAD_LEN, BPF_ANY); if (!ASSERT_OK(err, "lookup test_result")) return; @@ -248,7 +247,7 @@ void test_xdp_context_veth(void) if (!ASSERT_OK(ret, "send_test_packet")) goto close; - assert_test_result(skel); + assert_test_result(skel->maps.test_result); close: close_netns(nstoken); @@ -313,7 +312,7 @@ void test_xdp_context_tuntap(void) if (!ASSERT_EQ(ret, sizeof(packet), "write packet")) goto close; - assert_test_result(skel); + assert_test_result(skel->maps.test_result); close: if (tap_fd >= 0) From dd9f6cfb4ef4394c2afd1e2b564af25aff151bc8 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:31 +0200 Subject: [PATCH 0237/3206] selftests/bpf: Parametrize test_xdp_context_tuntap We want to add more test cases to cover different ways to access the metadata area. Prepare for it. Pull up the skeleton management. 
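After this change each subtest picks its own program combination and shares one helper, roughly like this (mirroring the diff below):

	if (test__start_subtest("data_meta"))
		test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls,
			    skel->maps.test_result);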
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Reviewed-by: Jesse Brandeburg Acked-by: Eduard Zingerman Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-5-8a39e636e0fb@cloudflare.com --- .../bpf/prog_tests/xdp_context_test_run.c | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 0134651d94ab08..6c66e27e5bc74a 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -256,12 +256,13 @@ void test_xdp_context_veth(void) netns_free(tx_ns); } -void test_xdp_context_tuntap(void) +static void test_tuntap(struct bpf_program *xdp_prog, + struct bpf_program *tc_prog, + struct bpf_map *result_map) { LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); struct netns_obj *ns = NULL; - struct test_xdp_meta *skel = NULL; __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; int tap_fd = -1; int tap_ifindex; @@ -277,10 +278,6 @@ void test_xdp_context_tuntap(void) SYS(close, "ip link set dev " TAP_NAME " up"); - skel = test_xdp_meta__open_and_load(); - if (!ASSERT_OK_PTR(skel, "open and load skeleton")) - goto close; - tap_ifindex = if_nametoindex(TAP_NAME); if (!ASSERT_GE(tap_ifindex, 0, "if_nametoindex")) goto close; @@ -290,12 +287,12 @@ void test_xdp_context_tuntap(void) if (!ASSERT_OK(ret, "bpf_tc_hook_create")) goto close; - tc_opts.prog_fd = bpf_program__fd(skel->progs.ing_cls); + tc_opts.prog_fd = bpf_program__fd(tc_prog); ret = bpf_tc_attach(&tc_hook, &tc_opts); if (!ASSERT_OK(ret, "bpf_tc_attach")) goto close; - ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(skel->progs.ing_xdp), + ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(xdp_prog), 0, NULL); if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) goto close; @@ -312,11 +309,25 @@ void test_xdp_context_tuntap(void) if (!ASSERT_EQ(ret, sizeof(packet), "write packet")) goto close; - assert_test_result(skel->maps.test_result); + assert_test_result(result_map); close: if (tap_fd >= 0) close(tap_fd); - test_xdp_meta__destroy(skel); netns_free(ns); } + +void test_xdp_context_tuntap(void) +{ + struct test_xdp_meta *skel = NULL; + + skel = test_xdp_meta__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + return; + + if (test__start_subtest("data_meta")) + test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls, + skel->maps.test_result); + + test_xdp_meta__destroy(skel); +} From 153f6bfd489076309227413e9221960712336369 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:32 +0200 Subject: [PATCH 0238/3206] selftests/bpf: Cover read access to skb metadata via dynptr Exercise reading from SKB metadata area in two new ways: 1. indirectly, with bpf_dynptr_read(), and 2. directly, with bpf_dynptr_slice(). 
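The two read paths, sketched side by side (a fragment for illustration; kfunc declarations as in the selftests' bpf_kfuncs.h, META_SIZE as defined in the test program):

	struct bpf_dynptr meta;
	__u8 buf[META_SIZE], *src;

	bpf_dynptr_from_skb_meta(ctx, 0, &meta);

	/* 1. indirect: copy the bytes out through the dynptr */
	bpf_dynptr_read(buf, sizeof(buf), &meta, 0, 0);

	/* 2. direct: borrow a read-only pointer straight into the
	 *    metadata area; NULL if the request is out of bounds */
	src = bpf_dynptr_slice(&meta, 0, NULL, sizeof(buf));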
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Reviewed-by: Jesse Brandeburg Acked-by: Eduard Zingerman Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-6-8a39e636e0fb@cloudflare.com --- tools/testing/selftests/bpf/bpf_kfuncs.h | 3 ++ .../bpf/prog_tests/xdp_context_test_run.c | 21 ++++++++++ .../selftests/bpf/progs/test_xdp_meta.c | 42 +++++++++++++++++++ 3 files changed, 66 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 9386dfe8b8849e..794d44d19c8865 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -19,6 +19,9 @@ extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags, extern int bpf_dynptr_from_xdp(struct xdp_md *xdp, __u64 flags, struct bpf_dynptr *ptr__uninit) __ksym __weak; +extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags, + struct bpf_dynptr *ptr__uninit) __ksym __weak; + /* Description * Obtain a read-only pointer to the dynptr's data * Returns diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 6c66e27e5bc74a..7e4526461a4cb6 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -171,6 +171,18 @@ static void assert_test_result(const struct bpf_map *result_map) "test_result map contains test payload"); } +static bool clear_test_result(struct bpf_map *result_map) +{ + const __u8 v[sizeof(test_payload)] = {}; + const __u32 k = 0; + int err; + + err = bpf_map__update_elem(result_map, &k, sizeof(k), v, sizeof(v), BPF_ANY); + ASSERT_OK(err, "update test_result"); + + return err == 0; +} + void test_xdp_context_veth(void) { LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); @@ -268,6 +280,9 @@ static void test_tuntap(struct bpf_program *xdp_prog, int tap_ifindex; int ret; + if (!clear_test_result(result_map)) + return; + ns = netns_new(TAP_NETNS, true); if (!ASSERT_OK_PTR(ns, "create and open ns")) return; @@ -328,6 +343,12 @@ void test_xdp_context_tuntap(void) if (test__start_subtest("data_meta")) test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls, skel->maps.test_result); + if (test__start_subtest("dynptr_read")) + test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls_dynptr_read, + skel->maps.test_result); + if (test__start_subtest("dynptr_slice")) + test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls_dynptr_slice, + skel->maps.test_result); test_xdp_meta__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index fcf6ca14f2ea28..0ba647fb1b1d7b 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -1,8 +1,10 @@ +#include #include #include #include #include +#include "bpf_kfuncs.h" #define META_SIZE 32 @@ -40,6 +42,46 @@ int ing_cls(struct __sk_buff *ctx) return TC_ACT_SHOT; } +/* Read from metadata using bpf_dynptr_read helper */ +SEC("tc") +int ing_cls_dynptr_read(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + const __u32 zero = 0; + __u8 *dst; + + dst = bpf_map_lookup_elem(&test_result, &zero); + if (!dst) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_read(dst, META_SIZE, &meta, 0, 0); + + return TC_ACT_SHOT; +} + +/* Read from metadata using read-only dynptr slice */ +SEC("tc") +int ing_cls_dynptr_slice(struct 
__sk_buff *ctx) +{ + struct bpf_dynptr meta; + const __u32 zero = 0; + __u8 *dst, *src; + + dst = bpf_map_lookup_elem(&test_result, &zero); + if (!dst) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + src = bpf_dynptr_slice(&meta, 0, NULL, META_SIZE); + if (!src) + return TC_ACT_SHOT; + + __builtin_memcpy(dst, src, META_SIZE); + + return TC_ACT_SHOT; +} + SEC("xdp") int ing_xdp(struct xdp_md *ctx) { From ed93360807801e7f69b74efec98a1bd674ba035e Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:33 +0200 Subject: [PATCH 0239/3206] selftests/bpf: Cover write access to skb metadata via dynptr Add tests that exercise writes to skb metadata in two ways: 1. indirectly, using bpf_dynptr_write helper, 2. directly, using a read-write dynptr slice. Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Reviewed-by: Jesse Brandeburg Acked-by: Eduard Zingerman Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-7-8a39e636e0fb@cloudflare.com --- .../bpf/prog_tests/xdp_context_test_run.c | 36 ++++++++-- .../selftests/bpf/progs/test_xdp_meta.c | 67 +++++++++++++++++++ 2 files changed, 98 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 7e4526461a4cb6..79c4c58276e672 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -269,7 +269,8 @@ void test_xdp_context_veth(void) } static void test_tuntap(struct bpf_program *xdp_prog, - struct bpf_program *tc_prog, + struct bpf_program *tc_prio_1_prog, + struct bpf_program *tc_prio_2_prog, struct bpf_map *result_map) { LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); @@ -302,11 +303,20 @@ static void test_tuntap(struct bpf_program *xdp_prog, if (!ASSERT_OK(ret, "bpf_tc_hook_create")) goto close; - tc_opts.prog_fd = bpf_program__fd(tc_prog); + tc_opts.prog_fd = bpf_program__fd(tc_prio_1_prog); ret = bpf_tc_attach(&tc_hook, &tc_opts); if (!ASSERT_OK(ret, "bpf_tc_attach")) goto close; + if (tc_prio_2_prog) { + LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 2, + .prog_fd = bpf_program__fd(tc_prio_2_prog)); + + ret = bpf_tc_attach(&tc_hook, &tc_opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + goto close; + } + ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(xdp_prog), 0, NULL); if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) @@ -341,13 +351,29 @@ void test_xdp_context_tuntap(void) return; if (test__start_subtest("data_meta")) - test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls, + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls, + NULL, /* tc prio 2 */ skel->maps.test_result); if (test__start_subtest("dynptr_read")) - test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls_dynptr_read, + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls_dynptr_read, + NULL, /* tc prio 2 */ skel->maps.test_result); if (test__start_subtest("dynptr_slice")) - test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls_dynptr_slice, + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls_dynptr_slice, + NULL, /* tc prio 2 */ + skel->maps.test_result); + if (test__start_subtest("dynptr_write")) + test_tuntap(skel->progs.ing_xdp_zalloc_meta, + skel->progs.ing_cls_dynptr_write, + skel->progs.ing_cls_dynptr_read, + skel->maps.test_result); + if (test__start_subtest("dynptr_slice_rdwr")) + test_tuntap(skel->progs.ing_xdp_zalloc_meta, + skel->progs.ing_cls_dynptr_slice_rdwr, +
skel->progs.ing_cls_dynptr_slice, skel->maps.test_result); test_xdp_meta__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index 0ba647fb1b1d7b..e7879860f4039a 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -60,6 +60,24 @@ int ing_cls_dynptr_read(struct __sk_buff *ctx) return TC_ACT_SHOT; } +/* Write to metadata using bpf_dynptr_write helper */ +SEC("tc") +int ing_cls_dynptr_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + __u8 *src; + + bpf_dynptr_from_skb(ctx, 0, &data); + src = bpf_dynptr_slice(&data, sizeof(struct ethhdr), NULL, META_SIZE); + if (!src) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_write(&meta, 0, src, META_SIZE, 0); + + return TC_ACT_UNSPEC; /* pass */ +} + /* Read from metadata using read-only dynptr slice */ SEC("tc") int ing_cls_dynptr_slice(struct __sk_buff *ctx) @@ -82,6 +100,55 @@ int ing_cls_dynptr_slice(struct __sk_buff *ctx) return TC_ACT_SHOT; } +/* Write to metadata using writeable dynptr slice */ +SEC("tc") +int ing_cls_dynptr_slice_rdwr(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + __u8 *src, *dst; + + bpf_dynptr_from_skb(ctx, 0, &data); + src = bpf_dynptr_slice(&data, sizeof(struct ethhdr), NULL, META_SIZE); + if (!src) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + dst = bpf_dynptr_slice_rdwr(&meta, 0, NULL, META_SIZE); + if (!dst) + return TC_ACT_SHOT; + + __builtin_memcpy(dst, src, META_SIZE); + + return TC_ACT_UNSPEC; /* pass */ +} + +/* Reserve and clear space for metadata but don't populate it */ +SEC("xdp") +int ing_xdp_zalloc_meta(struct xdp_md *ctx) +{ + struct ethhdr *eth = ctx_ptr(ctx, data); + __u8 *meta; + int ret; + + /* Drop any non-test packets */ + if (eth + 1 > ctx_ptr(ctx, data_end)) + return XDP_DROP; + if (eth->h_proto != 0) + return XDP_DROP; + + ret = bpf_xdp_adjust_meta(ctx, -META_SIZE); + if (ret < 0) + return XDP_DROP; + + meta = ctx_ptr(ctx, data_meta); + if (meta + META_SIZE > ctx_ptr(ctx, data)) + return XDP_DROP; + + __builtin_memset(meta, 0, META_SIZE); + + return XDP_PASS; +} + SEC("xdp") int ing_xdp(struct xdp_md *ctx) { From bd1b51b319788cbc5769a44f0081a1cb012f8ae4 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:34 +0200 Subject: [PATCH 0240/3206] selftests/bpf: Cover read/write to skb metadata at an offset Exercise r/w access to skb metadata through an offset-adjusted dynptr, read/write helper with an offset argument, and a slice starting at an offset. Also check for the expected errors when the offset is out of bounds. 
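A sketch of the offset variants being covered (illustrative fragment only, assuming the kfunc declarations from the selftests' bpf_kfuncs.h; the offset of 8 is arbitrary):

	struct bpf_dynptr meta;
	__u8 b;

	bpf_dynptr_from_skb_meta(ctx, 0, &meta);

	/* read at an offset: -E2BIG if offset + len runs past the area */
	bpf_dynptr_read(&b, sizeof(b), &meta, 8, 0);

	/* or narrow the dynptr first: -ERANGE on an out-of-range window */
	if (!bpf_dynptr_adjust(&meta, 8, bpf_dynptr_size(&meta)))
		bpf_dynptr_read(&b, sizeof(b), &meta, 0, 0);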
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Reviewed-by: Jesse Brandeburg Acked-by: Eduard Zingerman Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-8-8a39e636e0fb@cloudflare.com --- .../bpf/prog_tests/xdp_context_test_run.c | 10 ++ .../selftests/bpf/progs/test_xdp_meta.c | 119 ++++++++++++++++++ 2 files changed, 129 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 79c4c58276e672..24a7b4b7fdb663 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -375,6 +375,16 @@ void test_xdp_context_tuntap(void) skel->progs.ing_cls_dynptr_slice_rdwr, skel->progs.ing_cls_dynptr_slice, skel->maps.test_result); + if (test__start_subtest("dynptr_offset")) + test_tuntap(skel->progs.ing_xdp_zalloc_meta, + skel->progs.ing_cls_dynptr_offset_wr, + skel->progs.ing_cls_dynptr_offset_rd, + skel->maps.test_result); + if (test__start_subtest("dynptr_offset_oob")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls_dynptr_offset_oob, + skel->progs.ing_cls, + skel->maps.test_result); test_xdp_meta__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index e7879860f4039a..ee3d8adf5e9cc2 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -122,6 +123,124 @@ int ing_cls_dynptr_slice_rdwr(struct __sk_buff *ctx) return TC_ACT_UNSPEC; /* pass */ } +/* Read skb metadata in chunks from various offsets in different ways. */ +SEC("tc") +int ing_cls_dynptr_offset_rd(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + const __u32 chunk_len = META_SIZE / 4; + const __u32 zero = 0; + __u8 *dst, *src; + + dst = bpf_map_lookup_elem(&test_result, &zero); + if (!dst) + return TC_ACT_SHOT; + + /* 1. Regular read */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_read(dst, chunk_len, &meta, 0, 0); + dst += chunk_len; + + /* 2. Read from an offset-adjusted dynptr */ + bpf_dynptr_adjust(&meta, chunk_len, bpf_dynptr_size(&meta)); + bpf_dynptr_read(dst, chunk_len, &meta, 0, 0); + dst += chunk_len; + + /* 3. Read at an offset */ + bpf_dynptr_read(dst, chunk_len, &meta, chunk_len, 0); + dst += chunk_len; + + /* 4. Read from a slice starting at an offset */ + src = bpf_dynptr_slice(&meta, 2 * chunk_len, NULL, chunk_len); + if (!src) + return TC_ACT_SHOT; + __builtin_memcpy(dst, src, chunk_len); + + return TC_ACT_SHOT; +} + +/* Write skb metadata in chunks at various offsets in different ways. */ +SEC("tc") +int ing_cls_dynptr_offset_wr(struct __sk_buff *ctx) +{ + const __u32 chunk_len = META_SIZE / 4; + __u8 payload[META_SIZE]; + struct bpf_dynptr meta; + __u8 *dst, *src; + + bpf_skb_load_bytes(ctx, sizeof(struct ethhdr), payload, sizeof(payload)); + src = payload; + + /* 1. Regular write */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_write(&meta, 0, src, chunk_len, 0); + src += chunk_len; + + /* 2. Write to an offset-adjusted dynptr */ + bpf_dynptr_adjust(&meta, chunk_len, bpf_dynptr_size(&meta)); + bpf_dynptr_write(&meta, 0, src, chunk_len, 0); + src += chunk_len; + + /* 3. Write at an offset */ + bpf_dynptr_write(&meta, chunk_len, src, chunk_len, 0); + src += chunk_len; + + /* 4. 
Write to a slice starting at an offset */ + dst = bpf_dynptr_slice_rdwr(&meta, 2 * chunk_len, NULL, chunk_len); + if (!dst) + return TC_ACT_SHOT; + __builtin_memcpy(dst, src, chunk_len); + + return TC_ACT_UNSPEC; /* pass */ +} + +/* Pass an OOB offset to dynptr read, write, adjust, slice. */ +SEC("tc") +int ing_cls_dynptr_offset_oob(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + __u8 md, *p; + int err; + + err = bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (err) + goto fail; + + /* read offset OOB */ + err = bpf_dynptr_read(&md, sizeof(md), &meta, META_SIZE, 0); + if (err != -E2BIG) + goto fail; + + /* write offset OOB */ + err = bpf_dynptr_write(&meta, META_SIZE, &md, sizeof(md), 0); + if (err != -E2BIG) + goto fail; + + /* adjust end offset OOB */ + err = bpf_dynptr_adjust(&meta, 0, META_SIZE + 1); + if (err != -ERANGE) + goto fail; + + /* adjust start offset OOB */ + err = bpf_dynptr_adjust(&meta, META_SIZE + 1, META_SIZE + 1); + if (err != -ERANGE) + goto fail; + + /* slice offset OOB */ + p = bpf_dynptr_slice(&meta, META_SIZE, NULL, sizeof(*p)); + if (p) + goto fail; + + /* slice rdwr offset OOB */ + p = bpf_dynptr_slice_rdwr(&meta, META_SIZE, NULL, sizeof(*p)); + if (p) + goto fail; + + return TC_ACT_UNSPEC; +fail: + return TC_ACT_SHOT; +} + /* Reserve and clear space for metadata but don't populate it */ SEC("xdp") int ing_xdp_zalloc_meta(struct xdp_md *ctx) From 403fae59781fddc699af761f38ed024d3245096b Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:35 +0200 Subject: [PATCH 0241/3206] selftests/bpf: Cover metadata access from a modified skb clone Demonstrate that, when processing an skb clone, the metadata gets truncated if the program contains a direct write to either the payload or the metadata, due to an implicit unclone in the prologue. Otherwise, the dynptr to the metadata is limited to being read-only.
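The invariant under test can be summarized in a short fragment (illustrative, condensed from the clone_dynptr_rdonly_before_data_dynptr_write program added below): on a clone the metadata dynptr starts out read-only, and once a write forces the unclone, a re-created dynptr comes back empty:

	bpf_dynptr_from_skb(ctx, 0, &data);
	bpf_dynptr_from_skb_meta(ctx, 0, &meta);
	if (!bpf_dynptr_is_rdonly(&meta))	/* clone => read-only view */
		goto out;

	bpf_dynptr_write(&data, 0, "x", 1, 0);	/* payload write unclones */

	bpf_dynptr_from_skb_meta(ctx, 0, &meta);	/* re-create after unclone */
	if (bpf_dynptr_size(&meta) != 0)	/* metadata is now gone */
		goto out;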
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-9-8a39e636e0fb@cloudflare.com --- tools/testing/selftests/bpf/config | 1 + .../bpf/prog_tests/xdp_context_test_run.c | 123 ++++++++++- .../selftests/bpf/progs/test_xdp_meta.c | 191 ++++++++++++++++++ 3 files changed, 305 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 8916ab814a3ead..70b28c1e653ead 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -61,6 +61,7 @@ CONFIG_MPLS_IPTUNNEL=y CONFIG_MPLS_ROUTING=y CONFIG_MPTCP=y CONFIG_NET_ACT_GACT=y +CONFIG_NET_ACT_MIRRED=y CONFIG_NET_ACT_SKBMOD=y CONFIG_NET_CLS=y CONFIG_NET_CLS_ACT=y diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 24a7b4b7fdb663..46e0730174ed66 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -9,6 +9,7 @@ #define TX_NETNS "xdp_context_tx" #define RX_NETNS "xdp_context_rx" #define TAP_NAME "tap0" +#define DUMMY_NAME "dum0" #define TAP_NETNS "xdp_context_tuntap" #define TEST_PAYLOAD_LEN 32 @@ -156,6 +157,22 @@ static int send_test_packet(int ifindex) return -1; } +static int write_test_packet(int tap_fd) +{ + __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; + int n; + + /* The ethernet header doesn't need to be valid for this test */ + memset(packet, 0, sizeof(struct ethhdr)); + memcpy(packet + sizeof(struct ethhdr), test_payload, TEST_PAYLOAD_LEN); + + n = write(tap_fd, packet, sizeof(packet)); + if (!ASSERT_EQ(n, sizeof(packet), "write packet")) + return -1; + + return 0; +} + static void assert_test_result(const struct bpf_map *result_map) { int err; @@ -276,7 +293,6 @@ static void test_tuntap(struct bpf_program *xdp_prog, LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); struct netns_obj *ns = NULL; - __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; int tap_fd = -1; int tap_ifindex; int ret; @@ -322,19 +338,82 @@ static void test_tuntap(struct bpf_program *xdp_prog, if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) goto close; - /* The ethernet header is not relevant for this test and doesn't need to - * be meaningful. 
- */ - struct ethhdr eth = { 0 }; + ret = write_test_packet(tap_fd); + if (!ASSERT_OK(ret, "write_test_packet")) + goto close; - memcpy(packet, ð, sizeof(eth)); - memcpy(packet + sizeof(eth), test_payload, TEST_PAYLOAD_LEN); + assert_test_result(result_map); + +close: + if (tap_fd >= 0) + close(tap_fd); + netns_free(ns); +} + +/* Write a packet to a tap dev and copy it to ingress of a dummy dev */ +static void test_tuntap_mirred(struct bpf_program *xdp_prog, + struct bpf_program *tc_prog, + bool *test_pass) +{ + LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); + struct netns_obj *ns = NULL; + int dummy_ifindex; + int tap_fd = -1; + int tap_ifindex; + int ret; - ret = write(tap_fd, packet, sizeof(packet)); - if (!ASSERT_EQ(ret, sizeof(packet), "write packet")) + *test_pass = false; + + ns = netns_new(TAP_NETNS, true); + if (!ASSERT_OK_PTR(ns, "netns_new")) + return; + + /* Setup dummy interface */ + SYS(close, "ip link add name " DUMMY_NAME " type dummy"); + SYS(close, "ip link set dev " DUMMY_NAME " up"); + + dummy_ifindex = if_nametoindex(DUMMY_NAME); + if (!ASSERT_GE(dummy_ifindex, 0, "if_nametoindex")) goto close; - assert_test_result(result_map); + tc_hook.ifindex = dummy_ifindex; + ret = bpf_tc_hook_create(&tc_hook); + if (!ASSERT_OK(ret, "bpf_tc_hook_create")) + goto close; + + tc_opts.prog_fd = bpf_program__fd(tc_prog); + ret = bpf_tc_attach(&tc_hook, &tc_opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + goto close; + + /* Setup TAP interface */ + tap_fd = open_tuntap(TAP_NAME, true); + if (!ASSERT_GE(tap_fd, 0, "open_tuntap")) + goto close; + + SYS(close, "ip link set dev " TAP_NAME " up"); + + tap_ifindex = if_nametoindex(TAP_NAME); + if (!ASSERT_GE(tap_ifindex, 0, "if_nametoindex")) + goto close; + + ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(xdp_prog), 0, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) + goto close; + + /* Copy all packets received from TAP to dummy ingress */ + SYS(close, "tc qdisc add dev " TAP_NAME " clsact"); + SYS(close, "tc filter add dev " TAP_NAME " ingress " + "protocol all matchall " + "action mirred ingress mirror dev " DUMMY_NAME); + + /* Receive a packet on TAP */ + ret = write_test_packet(tap_fd); + if (!ASSERT_OK(ret, "write_test_packet")) + goto close; + + ASSERT_TRUE(*test_pass, "test_pass"); close: if (tap_fd >= 0) @@ -385,6 +464,30 @@ void test_xdp_context_tuntap(void) skel->progs.ing_cls_dynptr_offset_oob, skel->progs.ing_cls, skel->maps.test_result); + if (test__start_subtest("clone_data_meta_empty_on_data_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_data_meta_empty_on_data_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_data_meta_empty_on_meta_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_data_meta_empty_on_meta_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_empty_on_data_slice_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_empty_on_data_slice_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_empty_on_meta_slice_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_empty_on_meta_slice_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_rdonly_before_data_dynptr_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_rdonly_before_data_dynptr_write, + &skel->bss->test_pass); + if 
(test__start_subtest("clone_dynptr_rdonly_before_meta_dynptr_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_rdonly_before_meta_dynptr_write, + &skel->bss->test_pass); test_xdp_meta__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index ee3d8adf5e9cc2..d79cb74b571e72 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -26,6 +26,8 @@ struct { __uint(value_size, META_SIZE); } test_result SEC(".maps"); +bool test_pass; + SEC("tc") int ing_cls(struct __sk_buff *ctx) { @@ -301,4 +303,193 @@ int ing_xdp(struct xdp_md *ctx) return XDP_PASS; } +/* + * Check that skb->data_meta..skb->data is empty if prog writes to packet + * _payload_ using packet pointers. Applies only to cloned skbs. + */ +SEC("tc") +int clone_data_meta_empty_on_data_write(struct __sk_buff *ctx) +{ + struct ethhdr *eth = ctx_ptr(ctx, data); + + if (eth + 1 > ctx_ptr(ctx, data_end)) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect no metadata */ + if (ctx->data_meta != ctx->data) + goto out; + + /* Packet write to trigger unclone in prologue */ + eth->h_proto = 42; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb->data_meta..skb->data is empty if prog writes to packet + * _metadata_ using packet pointers. Applies only to cloned skbs. + */ +SEC("tc") +int clone_data_meta_empty_on_meta_write(struct __sk_buff *ctx) +{ + struct ethhdr *eth = ctx_ptr(ctx, data); + __u8 *md = ctx_ptr(ctx, data_meta); + + if (eth + 1 > ctx_ptr(ctx, data_end)) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + if (md + 1 > ctx_ptr(ctx, data)) { + /* Expect no metadata */ + test_pass = true; + } else { + /* Metadata write to trigger unclone in prologue */ + *md = 42; + } +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is writable but empty if prog writes to packet + * _payload_ using a dynptr slice. Applies only to cloned skbs. + */ +SEC("tc") +int clone_dynptr_empty_on_data_slice_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + struct ethhdr *eth; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice_rdwr(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect no metadata */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0) + goto out; + + /* Packet write to trigger unclone in prologue */ + eth->h_proto = 42; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is writable but empty if prog writes to packet + * _metadata_ using a dynptr slice. Applies only to cloned skbs. 
+ */ +SEC("tc") +int clone_dynptr_empty_on_meta_slice_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + const struct ethhdr *eth; + __u8 *md; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect no metadata */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0) + goto out; + + /* Metadata write to trigger unclone in prologue */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (md) + *md = 42; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is read-only before prog writes to packet payload + * using dynptr_write helper. Applies only to cloned skbs. + */ +SEC("tc") +int clone_dynptr_rdonly_before_data_dynptr_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + const struct ethhdr *eth; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect read-only metadata before unclone */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE) + goto out; + + /* Helper write to payload will unclone the packet */ + bpf_dynptr_write(&data, offsetof(struct ethhdr, h_proto), "x", 1, 0); + + /* Expect no metadata after unclone */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != 0) + goto out; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is read-only if prog writes to packet + * metadata using dynptr_write helper. Applies only to cloned skbs. + */ +SEC("tc") +int clone_dynptr_rdonly_before_meta_dynptr_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + const struct ethhdr *eth; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect read-only metadata */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE) + goto out; + + /* Metadata write. Expect failure. */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_write(&meta, 0, "x", 1, 0) != -EINVAL) + goto out; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + char _license[] SEC("license") = "GPL"; From e73f759d2e98c729ca6f98dad4ca6d7b9120e576 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Fri, 15 Aug 2025 11:56:03 +0800 Subject: [PATCH 0242/3206] security: use umax() to improve code Use umax() to reduce the code in update_mmap_min_addr() and improve its readability. 
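For reference, umax() picks the larger of two values using an unsigned comparison. A minimal userspace analog (simplified; the kernel macro in include/linux/minmax.h additionally promotes its arguments and sanity-checks their types at compile time):

	#include <stdio.h>

	/* simplified stand-in for the kernel's umax() */
	#define umax(a, b) ((a) > (b) ? (a) : (b))

	int main(void)
	{
		unsigned long dac = 4096, lsm = 65536;

		/* replaces the open-coded if/else in update_mmap_min_addr() */
		printf("%lu\n", umax(dac, lsm));	/* prints 65536 */
		return 0;
	}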
Signed-off-by: Qianfeng Rong [PM: subj line tweak] Signed-off-by: Paul Moore --- security/min_addr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/security/min_addr.c b/security/min_addr.c index df1bc643d886bd..c55bb84b863209 100644 --- a/security/min_addr.c +++ b/security/min_addr.c @@ -3,6 +3,7 @@ #include #include #include +#include /* amount of vm to protect from userspace access by both DAC and the LSM*/ unsigned long mmap_min_addr; @@ -16,10 +17,7 @@ unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; static void update_mmap_min_addr(void) { #ifdef CONFIG_LSM_MMAP_MIN_ADDR - if (dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) - mmap_min_addr = dac_mmap_min_addr; - else - mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR; + mmap_min_addr = umax(dac_mmap_min_addr, CONFIG_LSM_MMAP_MIN_ADDR); #else mmap_min_addr = dac_mmap_min_addr; #endif From bf7a6a67050f5d59a674ecc3ecd06d6b09cfec49 Mon Sep 17 00:00:00 2001 From: Vincent Li Date: Mon, 18 Aug 2025 09:51:13 -0700 Subject: [PATCH 0243/3206] bpftool: Add kernel.kptr_restrict hint for no instructions From bpftool's github repository issue [0]: When a Linux distribution has kernel.kptr_restrict set to 2, bpftool prog dump jited returns "no instructions returned". This message can be puzzling to bpftool users who are not familiar with kernel BPF internals, so add a small hint suggesting they check the kernel.kptr_restrict setting, similar to the DUMP_XLATED case. Besides kernel.kptr_restrict, no instructions may also be returned when the JIT is disabled. Signed-off-by: Vincent Li Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://github.com/libbpf/bpftool/issues/184 [0] Link: https://lore.kernel.org/bpf/20250818165113.15982-1-vincent.mc.li@gmail.com --- tools/bpf/bpftool/prog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9722d841abc05d..cf18c387968014 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -714,7 +714,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, if (mode == DUMP_JITED) { if (info->jited_prog_len == 0 || !info->jited_prog_insns) { - p_info("no instructions returned"); + p_err("error retrieving jit dump: no instructions returned or kernel.kptr_restrict set?"); return -1; } buf = u64_to_ptr(info->jited_prog_insns); From dcb8d01b65fb5a891ddbbedcbe6eff0b8ec37867 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 18 Jul 2025 19:13:39 +0300 Subject: [PATCH 0244/3206] dt-bindings: power: qcom-rpmpd: split RPMh domains definitions Historically, both RPM and RPMh domain definitions were part of the same header, qcom-rpmpd.h. Now that we have a separate header for RPMh definitions, qcom,rpmhpd.h, move all RPMh power domain definitions to that header.
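Both include paths keep working after the split. A sketch of the intended layering (assuming, as stated above, that qcom-rpmpd.h now includes qcom,rpmhpd.h; the include target itself is elided in the diff below):

	/*
	 * Legacy users still see RPMH_REGULATOR_LEVEL_* and the per-SoC
	 * defines, because qcom-rpmpd.h pulls in the RPMh header.
	 */
	#include <dt-bindings/power/qcom-rpmpd.h>

	/* New RPMh-only users can include the dedicated header directly. */
	#include <dt-bindings/power/qcom,rpmhpd.h>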
Signed-off-by: Dmitry Baryshkov Acked-by: Rob Herring (Arm) Reviewed-by: Konrad Dybcio Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20250718-rework-rpmhpd-rpmpd-v1-1-eedca108e540@oss.qualcomm.com Signed-off-by: Ulf Hansson --- include/dt-bindings/power/qcom,rpmhpd.h | 233 ++++++++++++++++++++++++ include/dt-bindings/power/qcom-rpmpd.h | 228 +---------------------- 2 files changed, 234 insertions(+), 227 deletions(-) diff --git a/include/dt-bindings/power/qcom,rpmhpd.h b/include/dt-bindings/power/qcom,rpmhpd.h index e54ffa3614515c..73cceb88953f70 100644 --- a/include/dt-bindings/power/qcom,rpmhpd.h +++ b/include/dt-bindings/power/qcom,rpmhpd.h @@ -29,4 +29,237 @@ #define RPMHPD_NSP2 19 #define RPMHPD_GMXC 20 +/* RPMh Power Domain performance levels */ +#define RPMH_REGULATOR_LEVEL_RETENTION 16 +#define RPMH_REGULATOR_LEVEL_MIN_SVS 48 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D3 50 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D2 52 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D1 56 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D0 60 +#define RPMH_REGULATOR_LEVEL_LOW_SVS 64 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_P1 72 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_L1 80 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_L2 96 +#define RPMH_REGULATOR_LEVEL_SVS 128 +#define RPMH_REGULATOR_LEVEL_SVS_L0 144 +#define RPMH_REGULATOR_LEVEL_SVS_L1 192 +#define RPMH_REGULATOR_LEVEL_SVS_L2 224 +#define RPMH_REGULATOR_LEVEL_NOM 256 +#define RPMH_REGULATOR_LEVEL_NOM_L0 288 +#define RPMH_REGULATOR_LEVEL_NOM_L1 320 +#define RPMH_REGULATOR_LEVEL_NOM_L2 336 +#define RPMH_REGULATOR_LEVEL_TURBO 384 +#define RPMH_REGULATOR_LEVEL_TURBO_L0 400 +#define RPMH_REGULATOR_LEVEL_TURBO_L1 416 +#define RPMH_REGULATOR_LEVEL_TURBO_L2 432 +#define RPMH_REGULATOR_LEVEL_TURBO_L3 448 +#define RPMH_REGULATOR_LEVEL_TURBO_L4 452 +#define RPMH_REGULATOR_LEVEL_TURBO_L5 456 +#define RPMH_REGULATOR_LEVEL_SUPER_TURBO 464 +#define RPMH_REGULATOR_LEVEL_SUPER_TURBO_NO_CPR 480 + +/* + * Platform-specific power domain bindings. Don't add new entries here, use + * RPMHPD_* above. 
+ */ + +/* SA8775P Power Domain Indexes */ +#define SA8775P_CX 0 +#define SA8775P_CX_AO 1 +#define SA8775P_DDR 2 +#define SA8775P_EBI 3 +#define SA8775P_GFX 4 +#define SA8775P_LCX 5 +#define SA8775P_LMX 6 +#define SA8775P_MMCX 7 +#define SA8775P_MMCX_AO 8 +#define SA8775P_MSS 9 +#define SA8775P_MX 10 +#define SA8775P_MX_AO 11 +#define SA8775P_MXC 12 +#define SA8775P_MXC_AO 13 +#define SA8775P_NSP0 14 +#define SA8775P_NSP1 15 +#define SA8775P_XO 16 + +/* SDM670 Power Domain Indexes */ +#define SDM670_MX 0 +#define SDM670_MX_AO 1 +#define SDM670_CX 2 +#define SDM670_CX_AO 3 +#define SDM670_LMX 4 +#define SDM670_LCX 5 +#define SDM670_GFX 6 +#define SDM670_MSS 7 + +/* SDM845 Power Domain Indexes */ +#define SDM845_EBI 0 +#define SDM845_MX 1 +#define SDM845_MX_AO 2 +#define SDM845_CX 3 +#define SDM845_CX_AO 4 +#define SDM845_LMX 5 +#define SDM845_LCX 6 +#define SDM845_GFX 7 +#define SDM845_MSS 8 + +/* SDX55 Power Domain Indexes */ +#define SDX55_MSS 0 +#define SDX55_MX 1 +#define SDX55_CX 2 + +/* SDX65 Power Domain Indexes */ +#define SDX65_MSS 0 +#define SDX65_MX 1 +#define SDX65_MX_AO 2 +#define SDX65_CX 3 +#define SDX65_CX_AO 4 +#define SDX65_MXC 5 + +/* SM6350 Power Domain Indexes */ +#define SM6350_CX 0 +#define SM6350_GFX 1 +#define SM6350_LCX 2 +#define SM6350_LMX 3 +#define SM6350_MSS 4 +#define SM6350_MX 5 + +/* SM8150 Power Domain Indexes */ +#define SM8150_MSS 0 +#define SM8150_EBI 1 +#define SM8150_LMX 2 +#define SM8150_LCX 3 +#define SM8150_GFX 4 +#define SM8150_MX 5 +#define SM8150_MX_AO 6 +#define SM8150_CX 7 +#define SM8150_CX_AO 8 +#define SM8150_MMCX 9 +#define SM8150_MMCX_AO 10 + +/* SA8155P is a special case, kept for backwards compatibility */ +#define SA8155P_CX SM8150_CX +#define SA8155P_CX_AO SM8150_CX_AO +#define SA8155P_EBI SM8150_EBI +#define SA8155P_GFX SM8150_GFX +#define SA8155P_MSS SM8150_MSS +#define SA8155P_MX SM8150_MX +#define SA8155P_MX_AO SM8150_MX_AO + +/* SM8250 Power Domain Indexes */ +#define SM8250_CX 0 +#define SM8250_CX_AO 1 +#define SM8250_EBI 2 +#define SM8250_GFX 3 +#define SM8250_LCX 4 +#define SM8250_LMX 5 +#define SM8250_MMCX 6 +#define SM8250_MMCX_AO 7 +#define SM8250_MX 8 +#define SM8250_MX_AO 9 + +/* SM8350 Power Domain Indexes */ +#define SM8350_CX 0 +#define SM8350_CX_AO 1 +#define SM8350_EBI 2 +#define SM8350_GFX 3 +#define SM8350_LCX 4 +#define SM8350_LMX 5 +#define SM8350_MMCX 6 +#define SM8350_MMCX_AO 7 +#define SM8350_MX 8 +#define SM8350_MX_AO 9 +#define SM8350_MXC 10 +#define SM8350_MXC_AO 11 +#define SM8350_MSS 12 + +/* SM8450 Power Domain Indexes */ +#define SM8450_CX 0 +#define SM8450_CX_AO 1 +#define SM8450_EBI 2 +#define SM8450_GFX 3 +#define SM8450_LCX 4 +#define SM8450_LMX 5 +#define SM8450_MMCX 6 +#define SM8450_MMCX_AO 7 +#define SM8450_MX 8 +#define SM8450_MX_AO 9 +#define SM8450_MXC 10 +#define SM8450_MXC_AO 11 +#define SM8450_MSS 12 + +/* SM8550 Power Domain Indexes */ +#define SM8550_CX 0 +#define SM8550_CX_AO 1 +#define SM8550_EBI 2 +#define SM8550_GFX 3 +#define SM8550_LCX 4 +#define SM8550_LMX 5 +#define SM8550_MMCX 6 +#define SM8550_MMCX_AO 7 +#define SM8550_MX 8 +#define SM8550_MX_AO 9 +#define SM8550_MXC 10 +#define SM8550_MXC_AO 11 +#define SM8550_MSS 12 +#define SM8550_NSP 13 + +/* QDU1000/QRU1000 Power Domain Indexes */ +#define QDU1000_EBI 0 +#define QDU1000_MSS 1 +#define QDU1000_CX 2 +#define QDU1000_MX 3 + +/* SC7180 Power Domain Indexes */ +#define SC7180_CX 0 +#define SC7180_CX_AO 1 +#define SC7180_GFX 2 +#define SC7180_MX 3 +#define SC7180_MX_AO 4 +#define SC7180_LMX 5 +#define SC7180_LCX 6 +#define 
SC7180_MSS 7 + +/* SC7280 Power Domain Indexes */ +#define SC7280_CX 0 +#define SC7280_CX_AO 1 +#define SC7280_EBI 2 +#define SC7280_GFX 3 +#define SC7280_MX 4 +#define SC7280_MX_AO 5 +#define SC7280_LMX 6 +#define SC7280_LCX 7 +#define SC7280_MSS 8 + +/* SC8180X Power Domain Indexes */ +#define SC8180X_CX 0 +#define SC8180X_CX_AO 1 +#define SC8180X_EBI 2 +#define SC8180X_GFX 3 +#define SC8180X_LCX 4 +#define SC8180X_LMX 5 +#define SC8180X_MMCX 6 +#define SC8180X_MMCX_AO 7 +#define SC8180X_MSS 8 +#define SC8180X_MX 9 +#define SC8180X_MX_AO 10 + +/* SC8280XP Power Domain Indexes */ +#define SC8280XP_CX 0 +#define SC8280XP_CX_AO 1 +#define SC8280XP_DDR 2 +#define SC8280XP_EBI 3 +#define SC8280XP_GFX 4 +#define SC8280XP_LCX 5 +#define SC8280XP_LMX 6 +#define SC8280XP_MMCX 7 +#define SC8280XP_MMCX_AO 8 +#define SC8280XP_MSS 9 +#define SC8280XP_MX 10 +#define SC8280XP_MXC 12 +#define SC8280XP_MX_AO 11 +#define SC8280XP_NSP 13 +#define SC8280XP_QPHY 14 +#define SC8280XP_XO 15 + #endif diff --git a/include/dt-bindings/power/qcom-rpmpd.h b/include/dt-bindings/power/qcom-rpmpd.h index f15bcee7c9283e..d303b3b37f18e0 100644 --- a/include/dt-bindings/power/qcom-rpmpd.h +++ b/include/dt-bindings/power/qcom-rpmpd.h @@ -4,66 +4,7 @@ #ifndef _DT_BINDINGS_POWER_QCOM_RPMPD_H #define _DT_BINDINGS_POWER_QCOM_RPMPD_H -/* SA8775P Power Domain Indexes */ -#define SA8775P_CX 0 -#define SA8775P_CX_AO 1 -#define SA8775P_DDR 2 -#define SA8775P_EBI 3 -#define SA8775P_GFX 4 -#define SA8775P_LCX 5 -#define SA8775P_LMX 6 -#define SA8775P_MMCX 7 -#define SA8775P_MMCX_AO 8 -#define SA8775P_MSS 9 -#define SA8775P_MX 10 -#define SA8775P_MX_AO 11 -#define SA8775P_MXC 12 -#define SA8775P_MXC_AO 13 -#define SA8775P_NSP0 14 -#define SA8775P_NSP1 15 -#define SA8775P_XO 16 - -/* SDM670 Power Domain Indexes */ -#define SDM670_MX 0 -#define SDM670_MX_AO 1 -#define SDM670_CX 2 -#define SDM670_CX_AO 3 -#define SDM670_LMX 4 -#define SDM670_LCX 5 -#define SDM670_GFX 6 -#define SDM670_MSS 7 - -/* SDM845 Power Domain Indexes */ -#define SDM845_EBI 0 -#define SDM845_MX 1 -#define SDM845_MX_AO 2 -#define SDM845_CX 3 -#define SDM845_CX_AO 4 -#define SDM845_LMX 5 -#define SDM845_LCX 6 -#define SDM845_GFX 7 -#define SDM845_MSS 8 - -/* SDX55 Power Domain Indexes */ -#define SDX55_MSS 0 -#define SDX55_MX 1 -#define SDX55_CX 2 - -/* SDX65 Power Domain Indexes */ -#define SDX65_MSS 0 -#define SDX65_MX 1 -#define SDX65_MX_AO 2 -#define SDX65_CX 3 -#define SDX65_CX_AO 4 -#define SDX65_MXC 5 - -/* SM6350 Power Domain Indexes */ -#define SM6350_CX 0 -#define SM6350_GFX 1 -#define SM6350_LCX 2 -#define SM6350_LMX 3 -#define SM6350_MSS 4 -#define SM6350_MX 5 +#include /* SM6375 Power Domain Indexes */ #define SM6375_VDDCX 0 @@ -77,173 +18,6 @@ #define SM6375_VDD_LPI_CX 8 #define SM6375_VDD_LPI_MX 9 -/* SM8150 Power Domain Indexes */ -#define SM8150_MSS 0 -#define SM8150_EBI 1 -#define SM8150_LMX 2 -#define SM8150_LCX 3 -#define SM8150_GFX 4 -#define SM8150_MX 5 -#define SM8150_MX_AO 6 -#define SM8150_CX 7 -#define SM8150_CX_AO 8 -#define SM8150_MMCX 9 -#define SM8150_MMCX_AO 10 - -/* SA8155P is a special case, kept for backwards compatibility */ -#define SA8155P_CX SM8150_CX -#define SA8155P_CX_AO SM8150_CX_AO -#define SA8155P_EBI SM8150_EBI -#define SA8155P_GFX SM8150_GFX -#define SA8155P_MSS SM8150_MSS -#define SA8155P_MX SM8150_MX -#define SA8155P_MX_AO SM8150_MX_AO - -/* SM8250 Power Domain Indexes */ -#define SM8250_CX 0 -#define SM8250_CX_AO 1 -#define SM8250_EBI 2 -#define SM8250_GFX 3 -#define SM8250_LCX 4 -#define SM8250_LMX 5 -#define 
SM8250_MMCX 6 -#define SM8250_MMCX_AO 7 -#define SM8250_MX 8 -#define SM8250_MX_AO 9 - -/* SM8350 Power Domain Indexes */ -#define SM8350_CX 0 -#define SM8350_CX_AO 1 -#define SM8350_EBI 2 -#define SM8350_GFX 3 -#define SM8350_LCX 4 -#define SM8350_LMX 5 -#define SM8350_MMCX 6 -#define SM8350_MMCX_AO 7 -#define SM8350_MX 8 -#define SM8350_MX_AO 9 -#define SM8350_MXC 10 -#define SM8350_MXC_AO 11 -#define SM8350_MSS 12 - -/* SM8450 Power Domain Indexes */ -#define SM8450_CX 0 -#define SM8450_CX_AO 1 -#define SM8450_EBI 2 -#define SM8450_GFX 3 -#define SM8450_LCX 4 -#define SM8450_LMX 5 -#define SM8450_MMCX 6 -#define SM8450_MMCX_AO 7 -#define SM8450_MX 8 -#define SM8450_MX_AO 9 -#define SM8450_MXC 10 -#define SM8450_MXC_AO 11 -#define SM8450_MSS 12 - -/* SM8550 Power Domain Indexes */ -#define SM8550_CX 0 -#define SM8550_CX_AO 1 -#define SM8550_EBI 2 -#define SM8550_GFX 3 -#define SM8550_LCX 4 -#define SM8550_LMX 5 -#define SM8550_MMCX 6 -#define SM8550_MMCX_AO 7 -#define SM8550_MX 8 -#define SM8550_MX_AO 9 -#define SM8550_MXC 10 -#define SM8550_MXC_AO 11 -#define SM8550_MSS 12 -#define SM8550_NSP 13 - -/* QDU1000/QRU1000 Power Domain Indexes */ -#define QDU1000_EBI 0 -#define QDU1000_MSS 1 -#define QDU1000_CX 2 -#define QDU1000_MX 3 - -/* SC7180 Power Domain Indexes */ -#define SC7180_CX 0 -#define SC7180_CX_AO 1 -#define SC7180_GFX 2 -#define SC7180_MX 3 -#define SC7180_MX_AO 4 -#define SC7180_LMX 5 -#define SC7180_LCX 6 -#define SC7180_MSS 7 - -/* SC7280 Power Domain Indexes */ -#define SC7280_CX 0 -#define SC7280_CX_AO 1 -#define SC7280_EBI 2 -#define SC7280_GFX 3 -#define SC7280_MX 4 -#define SC7280_MX_AO 5 -#define SC7280_LMX 6 -#define SC7280_LCX 7 -#define SC7280_MSS 8 - -/* SC8180X Power Domain Indexes */ -#define SC8180X_CX 0 -#define SC8180X_CX_AO 1 -#define SC8180X_EBI 2 -#define SC8180X_GFX 3 -#define SC8180X_LCX 4 -#define SC8180X_LMX 5 -#define SC8180X_MMCX 6 -#define SC8180X_MMCX_AO 7 -#define SC8180X_MSS 8 -#define SC8180X_MX 9 -#define SC8180X_MX_AO 10 - -/* SC8280XP Power Domain Indexes */ -#define SC8280XP_CX 0 -#define SC8280XP_CX_AO 1 -#define SC8280XP_DDR 2 -#define SC8280XP_EBI 3 -#define SC8280XP_GFX 4 -#define SC8280XP_LCX 5 -#define SC8280XP_LMX 6 -#define SC8280XP_MMCX 7 -#define SC8280XP_MMCX_AO 8 -#define SC8280XP_MSS 9 -#define SC8280XP_MX 10 -#define SC8280XP_MXC 12 -#define SC8280XP_MX_AO 11 -#define SC8280XP_NSP 13 -#define SC8280XP_QPHY 14 -#define SC8280XP_XO 15 - -/* SDM845 Power Domain performance levels */ -#define RPMH_REGULATOR_LEVEL_RETENTION 16 -#define RPMH_REGULATOR_LEVEL_MIN_SVS 48 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D3 50 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D2 52 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D1 56 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D0 60 -#define RPMH_REGULATOR_LEVEL_LOW_SVS 64 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_P1 72 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_L1 80 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_L2 96 -#define RPMH_REGULATOR_LEVEL_SVS 128 -#define RPMH_REGULATOR_LEVEL_SVS_L0 144 -#define RPMH_REGULATOR_LEVEL_SVS_L1 192 -#define RPMH_REGULATOR_LEVEL_SVS_L2 224 -#define RPMH_REGULATOR_LEVEL_NOM 256 -#define RPMH_REGULATOR_LEVEL_NOM_L0 288 -#define RPMH_REGULATOR_LEVEL_NOM_L1 320 -#define RPMH_REGULATOR_LEVEL_NOM_L2 336 -#define RPMH_REGULATOR_LEVEL_TURBO 384 -#define RPMH_REGULATOR_LEVEL_TURBO_L0 400 -#define RPMH_REGULATOR_LEVEL_TURBO_L1 416 -#define RPMH_REGULATOR_LEVEL_TURBO_L2 432 -#define RPMH_REGULATOR_LEVEL_TURBO_L3 448 -#define RPMH_REGULATOR_LEVEL_TURBO_L4 452 -#define RPMH_REGULATOR_LEVEL_TURBO_L5 456 -#define 
RPMH_REGULATOR_LEVEL_SUPER_TURBO 464 -#define RPMH_REGULATOR_LEVEL_SUPER_TURBO_NO_CPR 480 - /* MDM9607 Power Domains */ #define MDM9607_VDDCX 0 #define MDM9607_VDDCX_AO 1 From e6e1e3b6b8f9b9b78aa0dccdde431145cefb05f5 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 18 Jul 2025 19:13:40 +0300 Subject: [PATCH 0245/3206] dt-bindings: power: qcom-rpmpd: sort out entries After removing RPMh PD indices, it becomes obvious that several entries don't follow the alphabetic sorting order. Move them in order to keep the file sorted. Signed-off-by: Dmitry Baryshkov Acked-by: Rob Herring (Arm) Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20250718-rework-rpmhpd-rpmpd-v1-2-eedca108e540@oss.qualcomm.com Signed-off-by: Ulf Hansson --- include/dt-bindings/power/qcom-rpmpd.h | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/include/dt-bindings/power/qcom-rpmpd.h b/include/dt-bindings/power/qcom-rpmpd.h index d303b3b37f18e0..65f7d5ecc3521c 100644 --- a/include/dt-bindings/power/qcom-rpmpd.h +++ b/include/dt-bindings/power/qcom-rpmpd.h @@ -6,18 +6,6 @@ #include -/* SM6375 Power Domain Indexes */ -#define SM6375_VDDCX 0 -#define SM6375_VDDCX_AO 1 -#define SM6375_VDDCX_VFL 2 -#define SM6375_VDDMX 3 -#define SM6375_VDDMX_AO 4 -#define SM6375_VDDMX_VFL 5 -#define SM6375_VDDGX 6 -#define SM6375_VDDGX_AO 7 -#define SM6375_VDD_LPI_CX 8 -#define SM6375_VDD_LPI_MX 9 - /* MDM9607 Power Domains */ #define MDM9607_VDDCX 0 #define MDM9607_VDDCX_AO 1 @@ -130,6 +118,16 @@ #define MSM8998_SSCMX 8 #define MSM8998_SSCMX_VFL 9 +/* QCM2290 Power Domains */ +#define QCM2290_VDDCX 0 +#define QCM2290_VDDCX_AO 1 +#define QCM2290_VDDCX_VFL 2 +#define QCM2290_VDDMX 3 +#define QCM2290_VDDMX_AO 4 +#define QCM2290_VDDMX_VFL 5 +#define QCM2290_VDD_LPI_CX 6 +#define QCM2290_VDD_LPI_MX 7 + /* QCS404 Power Domains */ #define QCS404_VDDMX 0 #define QCS404_VDDMX_AO 1 @@ -169,15 +167,17 @@ #define SM6125_VDDMX_AO 4 #define SM6125_VDDMX_VFL 5 -/* QCM2290 Power Domains */ -#define QCM2290_VDDCX 0 -#define QCM2290_VDDCX_AO 1 -#define QCM2290_VDDCX_VFL 2 -#define QCM2290_VDDMX 3 -#define QCM2290_VDDMX_AO 4 -#define QCM2290_VDDMX_VFL 5 -#define QCM2290_VDD_LPI_CX 6 -#define QCM2290_VDD_LPI_MX 7 +/* SM6375 Power Domain Indexes */ +#define SM6375_VDDCX 0 +#define SM6375_VDDCX_AO 1 +#define SM6375_VDDCX_VFL 2 +#define SM6375_VDDMX 3 +#define SM6375_VDDMX_AO 4 +#define SM6375_VDDMX_VFL 5 +#define SM6375_VDDGX 6 +#define SM6375_VDDGX_AO 7 +#define SM6375_VDD_LPI_CX 8 +#define SM6375_VDD_LPI_MX 9 /* RPM SMD Power Domain performance levels */ #define RPM_SMD_LEVEL_RETENTION 16 From 94838f383a050e124c044e74a954777b8f2e6c17 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 18 Jul 2025 19:13:41 +0300 Subject: [PATCH 0246/3206] dt-bindings: power: qcom-rpmpd: add generic bindings for RPM power domains Some of the Qualcomm RPM PD controllers use a common set of indices for power domains. Add generic indices for Qualcomm RPM power domain controllers. 
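Because each platform define becomes an alias for the matching generic index, the numeric DT ABI is unchanged. A quick compile-time check (illustrative):

	#include <dt-bindings/power/qcom-rpmpd.h>

	/* After this patch the two names expand to the same index. */
	_Static_assert(MSM8998_VDDCX == RPMPD_VDDCX, "alias must be stable");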
Signed-off-by: Dmitry Baryshkov Acked-by: Rob Herring (Arm) Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20250718-rework-rpmhpd-rpmpd-v1-3-eedca108e540@oss.qualcomm.com Signed-off-by: Ulf Hansson --- include/dt-bindings/power/qcom-rpmpd.h | 121 ++++++++++++++----------- 1 file changed, 70 insertions(+), 51 deletions(-) diff --git a/include/dt-bindings/power/qcom-rpmpd.h b/include/dt-bindings/power/qcom-rpmpd.h index 65f7d5ecc3521c..4371ac941f29d9 100644 --- a/include/dt-bindings/power/qcom-rpmpd.h +++ b/include/dt-bindings/power/qcom-rpmpd.h @@ -6,18 +6,37 @@ #include +/* Generic RPM Power Domain Indexes */ +#define RPMPD_VDDCX 0 +#define RPMPD_VDDCX_AO 1 +/* VFC and VFL are mutually exclusive and can not be present on the same platform */ +#define RPMPD_VDDCX_VFC 2 +#define RPMPD_VDDCX_VFL 2 +#define RPMPD_VDDMX 3 +#define RPMPD_VDDMX_AO 4 +#define RPMPD_VDDMX_VFL 5 +#define RPMPD_SSCCX 6 +#define RPMPD_SSCCX_VFL 7 +#define RPMPD_SSCMX 8 +#define RPMPD_SSCMX_VFL 9 + +/* + * Platform-specific power domain bindings. Don't add new entries here, use + * RPMPD_* above. + */ + /* MDM9607 Power Domains */ -#define MDM9607_VDDCX 0 -#define MDM9607_VDDCX_AO 1 -#define MDM9607_VDDCX_VFL 2 -#define MDM9607_VDDMX 3 -#define MDM9607_VDDMX_AO 4 -#define MDM9607_VDDMX_VFL 5 +#define MDM9607_VDDCX RPMPD_VDDCX +#define MDM9607_VDDCX_AO RPMPD_VDDCX_AO +#define MDM9607_VDDCX_VFL RPMPD_VDDCX_VFL +#define MDM9607_VDDMX RPMPD_VDDMX +#define MDM9607_VDDMX_AO RPMPD_VDDMX_AO +#define MDM9607_VDDMX_VFL RPMPD_VDDMX_VFL /* MSM8226 Power Domain Indexes */ -#define MSM8226_VDDCX 0 -#define MSM8226_VDDCX_AO 1 -#define MSM8226_VDDCX_VFC 2 +#define MSM8226_VDDCX RPMPD_VDDCX +#define MSM8226_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8226_VDDCX_VFC RPMPD_VDDCX_VFC /* MSM8939 Power Domains */ #define MSM8939_VDDMDCX 0 @@ -30,11 +49,11 @@ #define MSM8939_VDDMX_AO 7 /* MSM8916 Power Domain Indexes */ -#define MSM8916_VDDCX 0 -#define MSM8916_VDDCX_AO 1 -#define MSM8916_VDDCX_VFC 2 -#define MSM8916_VDDMX 3 -#define MSM8916_VDDMX_AO 4 +#define MSM8916_VDDCX RPMPD_VDDCX +#define MSM8916_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8916_VDDCX_VFC RPMPD_VDDCX_VFC +#define MSM8916_VDDMX RPMPD_VDDMX +#define MSM8916_VDDMX_AO RPMPD_VDDMX_AO /* MSM8909 Power Domain Indexes */ #define MSM8909_VDDCX MSM8916_VDDCX @@ -44,11 +63,11 @@ #define MSM8909_VDDMX_AO MSM8916_VDDMX_AO /* MSM8917 Power Domain Indexes */ -#define MSM8917_VDDCX 0 -#define MSM8917_VDDCX_AO 1 -#define MSM8917_VDDCX_VFL 2 -#define MSM8917_VDDMX 3 -#define MSM8917_VDDMX_AO 4 +#define MSM8917_VDDCX RPMPD_VDDCX +#define MSM8917_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8917_VDDCX_VFL RPMPD_VDDCX_VFL +#define MSM8917_VDDMX RPMPD_VDDMX +#define MSM8917_VDDMX_AO RPMPD_VDDMX_AO /* MSM8937 Power Domain Indexes */ #define MSM8937_VDDCX MSM8917_VDDCX @@ -81,12 +100,12 @@ #define MSM8974_VDDGFX_VFC 4 /* MSM8976 Power Domain Indexes */ -#define MSM8976_VDDCX 0 -#define MSM8976_VDDCX_AO 1 -#define MSM8976_VDDCX_VFL 2 -#define MSM8976_VDDMX 3 -#define MSM8976_VDDMX_AO 4 -#define MSM8976_VDDMX_VFL 5 +#define MSM8976_VDDCX RPMPD_VDDCX +#define MSM8976_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8976_VDDCX_VFL RPMPD_VDDCX_VFL +#define MSM8976_VDDMX RPMPD_VDDMX +#define MSM8976_VDDMX_AO RPMPD_VDDMX_AO +#define MSM8976_VDDMX_VFL RPMPD_VDDMX_VFL /* MSM8994 Power Domain Indexes */ #define MSM8994_VDDCX 0 @@ -107,16 +126,16 @@ #define MSM8996_VDDSSCX_VFC 6 /* MSM8998 Power Domain Indexes */ -#define MSM8998_VDDCX 0 -#define MSM8998_VDDCX_AO 1 -#define MSM8998_VDDCX_VFL 2 -#define MSM8998_VDDMX 3 -#define 
MSM8998_VDDMX_AO 4 -#define MSM8998_VDDMX_VFL 5 -#define MSM8998_SSCCX 6 -#define MSM8998_SSCCX_VFL 7 -#define MSM8998_SSCMX 8 -#define MSM8998_SSCMX_VFL 9 +#define MSM8998_VDDCX RPMPD_VDDCX +#define MSM8998_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8998_VDDCX_VFL RPMPD_VDDCX_VFL +#define MSM8998_VDDMX RPMPD_VDDMX +#define MSM8998_VDDMX_AO RPMPD_VDDMX_AO +#define MSM8998_VDDMX_VFL RPMPD_VDDMX_VFL +#define MSM8998_SSCCX RPMPD_SSCCX +#define MSM8998_SSCCX_VFL RPMPD_SSCCX_VFL +#define MSM8998_SSCMX RPMPD_SSCMX +#define MSM8998_SSCMX_VFL RPMPD_SSCMX_VFL /* QCM2290 Power Domains */ #define QCM2290_VDDCX 0 @@ -138,16 +157,16 @@ #define QCS404_LPIMX_VFL 6 /* SDM660 Power Domains */ -#define SDM660_VDDCX 0 -#define SDM660_VDDCX_AO 1 -#define SDM660_VDDCX_VFL 2 -#define SDM660_VDDMX 3 -#define SDM660_VDDMX_AO 4 -#define SDM660_VDDMX_VFL 5 -#define SDM660_SSCCX 6 -#define SDM660_SSCCX_VFL 7 -#define SDM660_SSCMX 8 -#define SDM660_SSCMX_VFL 9 +#define SDM660_VDDCX RPMPD_VDDCX +#define SDM660_VDDCX_AO RPMPD_VDDCX_AO +#define SDM660_VDDCX_VFL RPMPD_VDDCX_VFL +#define SDM660_VDDMX RPMPD_VDDMX +#define SDM660_VDDMX_AO RPMPD_VDDMX_AO +#define SDM660_VDDMX_VFL RPMPD_VDDMX_VFL +#define SDM660_SSCCX RPMPD_SSCCX +#define SDM660_SSCCX_VFL RPMPD_SSCCX_VFL +#define SDM660_SSCMX RPMPD_SSCMX +#define SDM660_SSCMX_VFL RPMPD_SSCMX_VFL /* SM6115 Power Domains */ #define SM6115_VDDCX 0 @@ -160,12 +179,12 @@ #define SM6115_VDD_LPI_MX 7 /* SM6125 Power Domains */ -#define SM6125_VDDCX 0 -#define SM6125_VDDCX_AO 1 -#define SM6125_VDDCX_VFL 2 -#define SM6125_VDDMX 3 -#define SM6125_VDDMX_AO 4 -#define SM6125_VDDMX_VFL 5 +#define SM6125_VDDCX RPMPD_VDDCX +#define SM6125_VDDCX_AO RPMPD_VDDCX_AO +#define SM6125_VDDCX_VFL RPMPD_VDDCX_VFL +#define SM6125_VDDMX RPMPD_VDDMX +#define SM6125_VDDMX_AO RPMPD_VDDMX_AO +#define SM6125_VDDMX_VFL RPMPD_VDDMX_VFL /* SM6375 Power Domain Indexes */ #define SM6375_VDDCX 0 From 31f799ae091bb5254085520a41ecc060dc667e01 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 18 Jul 2025 19:13:42 +0300 Subject: [PATCH 0247/3206] pmdomain: qcom: rpmpd: switch to RPMPD_* indices Use generic RPMPD_* defines for power domains instead of platform-specific defines on the platforms that use shared indices.
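The driver change is mechanical: the per-SoC rpmpd arrays keep the same slots, only the designated-initializer names change. Condensed fragment (types and data taken from the hunks below):

	static struct rpmpd *msm8998_rpmpds[] = {
		[RPMPD_VDDCX] = &cx_rwcx0_lvl,		/* was [MSM8998_VDDCX] */
		[RPMPD_VDDCX_AO] = &cx_rwcx0_lvl_ao,	/* was [MSM8998_VDDCX_AO] */
		/* remaining entries converted the same way */
	};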
Signed-off-by: Dmitry Baryshkov Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20250718-rework-rpmhpd-rpmpd-v1-4-eedca108e540@oss.qualcomm.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/qcom/rpmpd.c | 112 +++++++++++++++++----------------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/drivers/pmdomain/qcom/rpmpd.c b/drivers/pmdomain/qcom/rpmpd.c index 833c46944600fa..f8580ec0f73785 100644 --- a/drivers/pmdomain/qcom/rpmpd.c +++ b/drivers/pmdomain/qcom/rpmpd.c @@ -631,12 +631,12 @@ static struct rpmpd ssc_mx_rwsm0_vfl = { }; static struct rpmpd *mdm9607_rpmpds[] = { - [MDM9607_VDDCX] = &cx_s3a_lvl, - [MDM9607_VDDCX_AO] = &cx_s3a_lvl_ao, - [MDM9607_VDDCX_VFL] = &cx_s3a_vfl, - [MDM9607_VDDMX] = &mx_l12a_lvl, - [MDM9607_VDDMX_AO] = &mx_l12a_lvl_ao, - [MDM9607_VDDMX_VFL] = &mx_l12a_vfl, + [RPMPD_VDDCX] = &cx_s3a_lvl, + [RPMPD_VDDCX_AO] = &cx_s3a_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_s3a_vfl, + [RPMPD_VDDMX] = &mx_l12a_lvl, + [RPMPD_VDDMX_AO] = &mx_l12a_lvl_ao, + [RPMPD_VDDMX_VFL] = &mx_l12a_vfl, }; static const struct rpmpd_desc mdm9607_desc = { @@ -646,9 +646,9 @@ static const struct rpmpd_desc mdm9607_desc = { }; static struct rpmpd *msm8226_rpmpds[] = { - [MSM8226_VDDCX] = &cx_s1a_corner, - [MSM8226_VDDCX_AO] = &cx_s1a_corner_ao, - [MSM8226_VDDCX_VFC] = &cx_s1a_vfc, + [RPMPD_VDDCX] = &cx_s1a_corner, + [RPMPD_VDDCX_AO] = &cx_s1a_corner_ao, + [RPMPD_VDDCX_VFC] = &cx_s1a_vfc, }; static const struct rpmpd_desc msm8226_desc = { @@ -675,11 +675,11 @@ static const struct rpmpd_desc msm8939_desc = { }; static struct rpmpd *msm8916_rpmpds[] = { - [MSM8916_VDDCX] = &cx_s1a_corner, - [MSM8916_VDDCX_AO] = &cx_s1a_corner_ao, - [MSM8916_VDDCX_VFC] = &cx_s1a_vfc, - [MSM8916_VDDMX] = &mx_l3a_corner, - [MSM8916_VDDMX_AO] = &mx_l3a_corner_ao, + [RPMPD_VDDCX] = &cx_s1a_corner, + [RPMPD_VDDCX_AO] = &cx_s1a_corner_ao, + [RPMPD_VDDCX_VFC] = &cx_s1a_vfc, + [RPMPD_VDDMX] = &mx_l3a_corner, + [RPMPD_VDDMX_AO] = &mx_l3a_corner_ao, }; static const struct rpmpd_desc msm8916_desc = { @@ -689,11 +689,11 @@ static const struct rpmpd_desc msm8916_desc = { }; static struct rpmpd *msm8917_rpmpds[] = { - [MSM8917_VDDCX] = &cx_s2a_lvl, - [MSM8917_VDDCX_AO] = &cx_s2a_lvl_ao, - [MSM8917_VDDCX_VFL] = &cx_s2a_vfl, - [MSM8917_VDDMX] = &mx_l3a_lvl, - [MSM8917_VDDMX_AO] = &mx_l3a_lvl_ao, + [RPMPD_VDDCX] = &cx_s2a_lvl, + [RPMPD_VDDCX_AO] = &cx_s2a_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_s2a_vfl, + [RPMPD_VDDMX] = &mx_l3a_lvl, + [RPMPD_VDDMX_AO] = &mx_l3a_lvl_ao, }; static const struct rpmpd_desc msm8917_desc = { @@ -747,12 +747,12 @@ static const struct rpmpd_desc msm8974pro_pma8084_desc = { }; static struct rpmpd *msm8976_rpmpds[] = { - [MSM8976_VDDCX] = &cx_s2a_lvl, - [MSM8976_VDDCX_AO] = &cx_s2a_lvl_ao, - [MSM8976_VDDCX_VFL] = &cx_rwsc2_vfl, - [MSM8976_VDDMX] = &mx_s6a_lvl, - [MSM8976_VDDMX_AO] = &mx_s6a_lvl_ao, - [MSM8976_VDDMX_VFL] = &mx_rwsm6_vfl, + [RPMPD_VDDCX] = &cx_s2a_lvl, + [RPMPD_VDDCX_AO] = &cx_s2a_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_rwsc2_vfl, + [RPMPD_VDDMX] = &mx_s6a_lvl, + [RPMPD_VDDMX_AO] = &mx_s6a_lvl_ao, + [RPMPD_VDDMX_VFL] = &mx_rwsm6_vfl, }; static const struct rpmpd_desc msm8976_desc = { @@ -796,16 +796,16 @@ static const struct rpmpd_desc msm8996_desc = { }; static struct rpmpd *msm8998_rpmpds[] = { - [MSM8998_VDDCX] = &cx_rwcx0_lvl, - [MSM8998_VDDCX_AO] = &cx_rwcx0_lvl_ao, - [MSM8998_VDDCX_VFL] = &cx_rwcx0_vfl, - [MSM8998_VDDMX] = &mx_rwmx0_lvl, - [MSM8998_VDDMX_AO] = &mx_rwmx0_lvl_ao, - [MSM8998_VDDMX_VFL] = &mx_rwmx0_vfl, - [MSM8998_SSCCX] = &ssc_cx_rwsc0_lvl, - [MSM8998_SSCCX_VFL] = 
&ssc_cx_rwsc0_vfl, - [MSM8998_SSCMX] = &ssc_mx_rwsm0_lvl, - [MSM8998_SSCMX_VFL] = &ssc_mx_rwsm0_vfl, + [RPMPD_VDDCX] = &cx_rwcx0_lvl, + [RPMPD_VDDCX_AO] = &cx_rwcx0_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_rwcx0_vfl, + [RPMPD_VDDMX] = &mx_rwmx0_lvl, + [RPMPD_VDDMX_AO] = &mx_rwmx0_lvl_ao, + [RPMPD_VDDMX_VFL] = &mx_rwmx0_vfl, + [RPMPD_SSCCX] = &ssc_cx_rwsc0_lvl, + [RPMPD_SSCCX_VFL] = &ssc_cx_rwsc0_vfl, + [RPMPD_SSCMX] = &ssc_mx_rwsm0_lvl, + [RPMPD_SSCMX_VFL] = &ssc_mx_rwsm0_vfl, }; static const struct rpmpd_desc msm8998_desc = { @@ -831,11 +831,11 @@ static const struct rpmpd_desc qcs404_desc = { }; static struct rpmpd *qm215_rpmpds[] = { - [QM215_VDDCX] = &cx_s1a_lvl, - [QM215_VDDCX_AO] = &cx_s1a_lvl_ao, - [QM215_VDDCX_VFL] = &cx_s1a_vfl, - [QM215_VDDMX] = &mx_l2a_lvl, - [QM215_VDDMX_AO] = &mx_l2a_lvl_ao, + [RPMPD_VDDCX] = &cx_s1a_lvl, + [RPMPD_VDDCX_AO] = &cx_s1a_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_s1a_vfl, + [RPMPD_VDDMX] = &mx_l2a_lvl, + [RPMPD_VDDMX_AO] = &mx_l2a_lvl_ao, }; static const struct rpmpd_desc qm215_desc = { @@ -845,16 +845,16 @@ static const struct rpmpd_desc qm215_desc = { }; static struct rpmpd *sdm660_rpmpds[] = { - [SDM660_VDDCX] = &cx_rwcx0_lvl, - [SDM660_VDDCX_AO] = &cx_rwcx0_lvl_ao, - [SDM660_VDDCX_VFL] = &cx_rwcx0_vfl, - [SDM660_VDDMX] = &mx_rwmx0_lvl, - [SDM660_VDDMX_AO] = &mx_rwmx0_lvl_ao, - [SDM660_VDDMX_VFL] = &mx_rwmx0_vfl, - [SDM660_SSCCX] = &ssc_cx_rwlc0_lvl, - [SDM660_SSCCX_VFL] = &ssc_cx_rwlc0_vfl, - [SDM660_SSCMX] = &ssc_mx_rwlm0_lvl, - [SDM660_SSCMX_VFL] = &ssc_mx_rwlm0_vfl, + [RPMPD_VDDCX] = &cx_rwcx0_lvl, + [RPMPD_VDDCX_AO] = &cx_rwcx0_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_rwcx0_vfl, + [RPMPD_VDDMX] = &mx_rwmx0_lvl, + [RPMPD_VDDMX_AO] = &mx_rwmx0_lvl_ao, + [RPMPD_VDDMX_VFL] = &mx_rwmx0_vfl, + [RPMPD_SSCCX] = &ssc_cx_rwlc0_lvl, + [RPMPD_SSCCX_VFL] = &ssc_cx_rwlc0_vfl, + [RPMPD_SSCMX] = &ssc_mx_rwlm0_lvl, + [RPMPD_SSCMX_VFL] = &ssc_mx_rwlm0_vfl, }; static const struct rpmpd_desc sdm660_desc = { @@ -881,12 +881,12 @@ static const struct rpmpd_desc sm6115_desc = { }; static struct rpmpd *sm6125_rpmpds[] = { - [SM6125_VDDCX] = &cx_rwcx0_lvl, - [SM6125_VDDCX_AO] = &cx_rwcx0_lvl_ao, - [SM6125_VDDCX_VFL] = &cx_rwcx0_vfl, - [SM6125_VDDMX] = &mx_rwmx0_lvl, - [SM6125_VDDMX_AO] = &mx_rwmx0_lvl_ao, - [SM6125_VDDMX_VFL] = &mx_rwmx0_vfl, + [RPMPD_VDDCX] = &cx_rwcx0_lvl, + [RPMPD_VDDCX_AO] = &cx_rwcx0_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_rwcx0_vfl, + [RPMPD_VDDMX] = &mx_rwmx0_lvl, + [RPMPD_VDDMX_AO] = &mx_rwmx0_lvl_ao, + [RPMPD_VDDMX_VFL] = &mx_rwmx0_vfl, }; static const struct rpmpd_desc sm6125_desc = { From 4bcff9c05b9db73aa2cfe911e948b24151858ce7 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 11 Aug 2025 17:02:00 +0200 Subject: [PATCH 0248/3206] pinctrl: stm32: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
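The conversion pattern, in condensed before/after form (fragment; priv, base, VAL, SET and CLR stand in for the driver's own state and register offsets):

	/* before */
	err = bgpio_init(&priv->gc, dev, 4,
			 base + VAL, base + SET, base + CLR,
			 NULL, NULL, BGPIOF_NO_INPUT);

	/* after */
	config = (struct gpio_generic_chip_config){
		.dev = dev,
		.sz = 4,
		.dat = base + VAL,
		.set = base + SET,
		.clr = base + CLR,
		.flags = BGPIOF_NO_INPUT,
	};
	err = gpio_generic_chip_init(&priv->chip, &config);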
Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/20250811-gpio-mmio-pinctrl-conv-v1-1-a84c5da2be20@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/stm32/pinctrl-stm32-hdp.c | 34 +++++++++++++---------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c index e91442eb566bb2..dea49b9aabf2ae 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -45,7 +46,7 @@ struct stm32_hdp { void __iomem *base; struct clk *clk; struct pinctrl_dev *pctl_dev; - struct gpio_chip gpio_chip; + struct gpio_generic_chip gpio_chip; u32 mux_conf; u32 gposet_conf; const char * const *func_name; @@ -603,6 +604,7 @@ MODULE_DEVICE_TABLE(of, stm32_hdp_of_match); static int stm32_hdp_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct stm32_hdp *hdp; u8 version; @@ -635,21 +637,25 @@ static int stm32_hdp_probe(struct platform_device *pdev) if (err) return dev_err_probe(dev, err, "Failed to enable pinctrl\n"); - hdp->gpio_chip.get_direction = stm32_hdp_gpio_get_direction; - hdp->gpio_chip.ngpio = ARRAY_SIZE(stm32_hdp_pins); - hdp->gpio_chip.can_sleep = true; - hdp->gpio_chip.names = stm32_hdp_pins_group; - - err = bgpio_init(&hdp->gpio_chip, dev, 4, - hdp->base + HDP_GPOVAL, - hdp->base + HDP_GPOSET, - hdp->base + HDP_GPOCLR, - NULL, NULL, BGPIOF_NO_INPUT); + hdp->gpio_chip.gc.get_direction = stm32_hdp_gpio_get_direction; + hdp->gpio_chip.gc.ngpio = ARRAY_SIZE(stm32_hdp_pins); + hdp->gpio_chip.gc.can_sleep = true; + hdp->gpio_chip.gc.names = stm32_hdp_pins_group; + + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = hdp->base + HDP_GPOVAL, + .set = hdp->base + HDP_GPOSET, + .clr = hdp->base + HDP_GPOCLR, + .flags = BGPIOF_NO_INPUT, + }; + + err = gpio_generic_chip_init(&hdp->gpio_chip, &config); if (err) - return dev_err_probe(dev, err, "Failed to init bgpio\n"); - + return dev_err_probe(dev, err, "Failed to init the generic GPIO chip\n"); - err = devm_gpiochip_add_data(dev, &hdp->gpio_chip, hdp); + err = devm_gpiochip_add_data(dev, &hdp->gpio_chip.gc, hdp); if (err) return dev_err_probe(dev, err, "Failed to add gpiochip\n"); From 658cd189d793d56c1f80841da6a8b5d2d7c72aa3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 11 Aug 2025 17:02:01 +0200 Subject: [PATCH 0249/3206] pinctrl: equilibrium: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
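Embedding the generic chip means every spot that used to hand gpiolib a struct gpio_chip now passes the embedded .gc member instead. Fragment (illustrative, mirroring the hunks below):

	struct eqbr_gpio_ctrl {
		struct gpio_generic_chip chip;	/* was: struct gpio_chip chip */
		/* other members unchanged */
	};

	ret = devm_gpiochip_add_data(dev, &gctrl->chip.gc, gctrl);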
Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/20250811-gpio-mmio-pinctrl-conv-v1-2-a84c5da2be20@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-equilibrium.c | 26 ++++++++++++++++---------- drivers/pinctrl/pinctrl-equilibrium.h | 2 +- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/pinctrl/pinctrl-equilibrium.c b/drivers/pinctrl/pinctrl-equilibrium.c index fce804d42e7d7f..21004418567938 100644 --- a/drivers/pinctrl/pinctrl-equilibrium.c +++ b/drivers/pinctrl/pinctrl-equilibrium.c @@ -2,6 +2,7 @@ /* Copyright (C) 2019 Intel Corporation */ #include +#include #include #include #include @@ -179,7 +180,7 @@ static int gpiochip_setup(struct device *dev, struct eqbr_gpio_ctrl *gctrl) struct gpio_irq_chip *girq; struct gpio_chip *gc; - gc = &gctrl->chip; + gc = &gctrl->chip.gc; gc->label = gctrl->name; gc->fwnode = gctrl->fwnode; gc->request = gpiochip_generic_request; @@ -191,7 +192,7 @@ static int gpiochip_setup(struct device *dev, struct eqbr_gpio_ctrl *gctrl) return 0; } - girq = &gctrl->chip.irq; + girq = &gctrl->chip.gc.irq; gpio_irq_chip_set_chip(girq, &eqbr_irq_chip); girq->parent_handler = eqbr_irq_handler; girq->num_parents = 1; @@ -208,6 +209,7 @@ static int gpiochip_setup(struct device *dev, struct eqbr_gpio_ctrl *gctrl) static int gpiolib_reg(struct eqbr_pinctrl_drv_data *drvdata) { + struct gpio_generic_chip_config config; struct device *dev = drvdata->dev; struct eqbr_gpio_ctrl *gctrl; struct device_node *np; @@ -239,12 +241,16 @@ static int gpiolib_reg(struct eqbr_pinctrl_drv_data *drvdata) } raw_spin_lock_init(&gctrl->lock); - ret = bgpio_init(&gctrl->chip, dev, gctrl->bank->nr_pins / 8, - gctrl->membase + GPIO_IN, - gctrl->membase + GPIO_OUTSET, - gctrl->membase + GPIO_OUTCLR, - gctrl->membase + GPIO_DIR, - NULL, 0); + config = (typeof(config)){ + .dev = dev, + .sz = gctrl->bank->nr_pins / 8, + .dat = gctrl->membase + GPIO_IN, + .set = gctrl->membase + GPIO_OUTSET, + .clr = gctrl->membase + GPIO_OUTCLR, + .dirout = gctrl->membase + GPIO_DIR, + }; + + ret = gpio_generic_chip_init(&gctrl->chip, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; @@ -254,7 +260,7 @@ static int gpiolib_reg(struct eqbr_pinctrl_drv_data *drvdata) if (ret) return ret; - ret = devm_gpiochip_add_data(dev, &gctrl->chip, gctrl); + ret = devm_gpiochip_add_data(dev, &gctrl->chip.gc, gctrl); if (ret) return ret; } @@ -499,7 +505,7 @@ static int eqbr_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, bank->pin_base, pin); return -ENODEV; } - gc = &gctrl->chip; + gc = &gctrl->chip.gc; gc->direction_output(gc, offset, 0); continue; default: diff --git a/drivers/pinctrl/pinctrl-equilibrium.h b/drivers/pinctrl/pinctrl-equilibrium.h index b4d149bde39d8d..b56124d7fe9132 100644 --- a/drivers/pinctrl/pinctrl-equilibrium.h +++ b/drivers/pinctrl/pinctrl-equilibrium.h @@ -96,7 +96,7 @@ struct fwnode_handle; * @lock: spin lock to protect gpio register write. */ struct eqbr_gpio_ctrl { - struct gpio_chip chip; + struct gpio_generic_chip chip; struct fwnode_handle *fwnode; struct eqbr_pin_bank *bank; void __iomem *membase; From d2e9afca3a40eb4fe39ef0654e89cc19b2e0a939 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 11 Aug 2025 17:02:02 +0200 Subject: [PATCH 0250/3206] pinctrl: npcm8xx: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
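Besides the structural rename, this conversion switches the register read-modify-write helpers to the scope-based lock guard, which drops the flags variable and the explicit unlock. Condensed before/after from the hunk below:

	/* before */
	raw_spin_lock_irqsave(&gc->bgpio_lock, flags);
	iowrite32(ioread32(reg) | pinmask, reg);
	raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags);

	/* after: the guard releases the lock when it goes out of scope */
	guard(gpio_generic_lock_irqsave)(chip);
	iowrite32(ioread32(reg) | pinmask, reg);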
Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/20250811-gpio-mmio-pinctrl-conv-v1-3-a84c5da2be20@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c | 154 +++++++++++----------- 1 file changed, 78 insertions(+), 76 deletions(-) diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c index 3c3b9d8d3681c6..0f155a685bbae7 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -90,7 +91,7 @@ struct debounce_time { }; struct npcm8xx_gpio { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *base; struct debounce_time debounce; int irqbase; @@ -115,24 +116,20 @@ struct npcm8xx_pinctrl { }; /* GPIO handling in the pinctrl driver */ -static void npcm_gpio_set(struct gpio_chip *gc, void __iomem *reg, +static void npcm_gpio_set(struct gpio_generic_chip *chip, void __iomem *reg, unsigned int pinmask) { - unsigned long flags; + guard(gpio_generic_lock_irqsave)(chip); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); iowrite32(ioread32(reg) | pinmask, reg); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } -static void npcm_gpio_clr(struct gpio_chip *gc, void __iomem *reg, +static void npcm_gpio_clr(struct gpio_generic_chip *chip, void __iomem *reg, unsigned int pinmask) { - unsigned long flags; + guard(gpio_generic_lock_irqsave)(chip); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); iowrite32(ioread32(reg) & ~pinmask, reg); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static void npcmgpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) @@ -233,32 +230,32 @@ static int npcmgpio_set_irq_type(struct irq_data *d, unsigned int type) switch (type) { case IRQ_TYPE_EDGE_RISING: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_EVBE, gpio); - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_POL, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_EVBE, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_POL, gpio); break; case IRQ_TYPE_EDGE_FALLING: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_EVBE, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_POL, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_EVBE, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_POL, gpio); break; case IRQ_TYPE_EDGE_BOTH: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_POL, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_EVBE, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_POL, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_EVBE, gpio); break; case IRQ_TYPE_LEVEL_LOW: - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_POL, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_POL, gpio); break; case IRQ_TYPE_LEVEL_HIGH: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_POL, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_POL, gpio); break; default: return -EINVAL; } if (type & IRQ_TYPE_LEVEL_MASK) { - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_EVTYP, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_EVTYP, gpio); irq_set_handler_locked(d, handle_level_irq); } else if (type & IRQ_TYPE_EDGE_BOTH) { - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_EVTYP, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_EVTYP, gpio); irq_set_handler_locked(d, handle_edge_irq); } @@ -1842,7 +1839,7 @@ static void 
npcm8xx_setfunc(struct regmap *gcr_regmap, const unsigned int *pin, static int npcm8xx_get_slew_rate(struct npcm8xx_gpio *bank, struct regmap *gcr_regmap, unsigned int pin) { - int gpio = pin % bank->gc.ngpio; + int gpio = pin % bank->chip.gc.ngpio; unsigned long pinmask = BIT(gpio); u32 val; @@ -1862,15 +1859,15 @@ static int npcm8xx_set_slew_rate(struct npcm8xx_gpio *bank, int arg) { void __iomem *OSRC_Offset = bank->base + NPCM8XX_GP_N_OSRC; - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); if (pincfg[pin].flag & SLEW) { switch (arg) { case 0: - npcm_gpio_clr(&bank->gc, OSRC_Offset, gpio); + npcm_gpio_clr(&bank->chip, OSRC_Offset, gpio); return 0; case 1: - npcm_gpio_set(&bank->gc, OSRC_Offset, gpio); + npcm_gpio_set(&bank->chip, OSRC_Offset, gpio); return 0; default: return -EINVAL; @@ -1902,7 +1899,7 @@ static int npcm8xx_get_drive_strength(struct pinctrl_dev *pctldev, struct npcm8xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm8xx_gpio *bank = &npcm->gpio_bank[pin / NPCM8XX_GPIO_PER_BANK]; - int gpio = pin % bank->gc.ngpio; + int gpio = pin % bank->chip.gc.ngpio; unsigned long pinmask = BIT(gpio); int flg, val; u32 ds = 0; @@ -1913,7 +1910,7 @@ static int npcm8xx_get_drive_strength(struct pinctrl_dev *pctldev, val = ioread32(bank->base + NPCM8XX_GP_N_ODSC) & pinmask; ds = val ? DSHI(flg) : DSLO(flg); - dev_dbg(bank->gc.parent, "pin %d strength %d = %d\n", pin, val, ds); + dev_dbg(bank->chip.gc.parent, "pin %d strength %d = %d\n", pin, val, ds); return ds; } @@ -1923,15 +1920,15 @@ static int npcm8xx_set_drive_strength(struct npcm8xx_pinctrl *npcm, { struct npcm8xx_gpio *bank = &npcm->gpio_bank[pin / NPCM8XX_GPIO_PER_BANK]; - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); int v; v = pincfg[pin].flag & DRIVE_STRENGTH_MASK; if (DSLO(v) == nval) - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_ODSC, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_ODSC, gpio); else if (DSHI(v) == nval) - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_ODSC, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_ODSC, gpio); else return -ENOTSUPP; @@ -2054,7 +2051,7 @@ static int npcm_gpio_set_direction(struct pinctrl_dev *pctldev, struct npcm8xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm8xx_gpio *bank = &npcm->gpio_bank[offset / NPCM8XX_GPIO_PER_BANK]; - int gpio = BIT(offset % bank->gc.ngpio); + int gpio = BIT(offset % bank->chip.gc.ngpio); if (input) iowrite32(gpio, bank->base + NPCM8XX_GP_N_OEC); @@ -2085,7 +2082,7 @@ static int debounce_timing_setting(struct npcm8xx_gpio *bank, u32 gpio, if (bank->debounce.set_val[i]) { if (bank->debounce.nanosec_val[i] == nanosecs) { debounce_select = i << gpio_debounce; - npcm_gpio_set(&bank->gc, DBNCS_offset, + npcm_gpio_set(&bank->chip, DBNCS_offset, debounce_select); break; } @@ -2093,7 +2090,7 @@ static int debounce_timing_setting(struct npcm8xx_gpio *bank, u32 gpio, bank->debounce.set_val[i] = true; bank->debounce.nanosec_val[i] = nanosecs; debounce_select = i << gpio_debounce; - npcm_gpio_set(&bank->gc, DBNCS_offset, debounce_select); + npcm_gpio_set(&bank->chip, DBNCS_offset, debounce_select); switch (nanosecs) { case 1 ... 
1040: iowrite32(0, bank->base + NPCM8XX_GP_N_DBNCP0 + (i * 4)); @@ -2145,21 +2142,21 @@ static int npcm_set_debounce(struct npcm8xx_pinctrl *npcm, unsigned int pin, { struct npcm8xx_gpio *bank = &npcm->gpio_bank[pin / NPCM8XX_GPIO_PER_BANK]; - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); int ret; if (nanosecs) { - ret = debounce_timing_setting(bank, pin % bank->gc.ngpio, + ret = debounce_timing_setting(bank, pin % bank->chip.gc.ngpio, nanosecs); if (ret) dev_err(npcm->dev, "Pin %d, All four debounce timing values are used, please use one of exist debounce values\n", pin); else - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_DBNC, + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_DBNC, gpio); return ret; } - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_DBNC, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_DBNC, gpio); return 0; } @@ -2172,7 +2169,7 @@ static int npcm8xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, struct npcm8xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm8xx_gpio *bank = &npcm->gpio_bank[pin / NPCM8XX_GPIO_PER_BANK]; - int gpio = pin % bank->gc.ngpio; + int gpio = pin % bank->chip.gc.ngpio; unsigned long pinmask = BIT(gpio); u32 ie, oe, pu, pd; int rc = 0; @@ -2235,34 +2232,34 @@ static int npcm8xx_config_set_one(struct npcm8xx_pinctrl *npcm, struct npcm8xx_gpio *bank = &npcm->gpio_bank[pin / NPCM8XX_GPIO_PER_BANK]; u32 arg = pinconf_to_config_argument(config); - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); switch (param) { case PIN_CONFIG_BIAS_DISABLE: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_PU, gpio); - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_PD, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_PU, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_PD, gpio); break; case PIN_CONFIG_BIAS_PULL_DOWN: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_PU, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_PD, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_PU, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_PD, gpio); break; case PIN_CONFIG_BIAS_PULL_UP: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_PD, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_PU, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_PD, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_PU, gpio); break; case PIN_CONFIG_INPUT_ENABLE: iowrite32(gpio, bank->base + NPCM8XX_GP_N_OEC); - bank->direction_input(&bank->gc, pin % bank->gc.ngpio); + bank->direction_input(&bank->chip.gc, pin % bank->chip.gc.ngpio); break; case PIN_CONFIG_OUTPUT: - bank->direction_output(&bank->gc, pin % bank->gc.ngpio, arg); + bank->direction_output(&bank->chip.gc, pin % bank->chip.gc.ngpio, arg); iowrite32(gpio, bank->base + NPCM8XX_GP_N_OES); break; case PIN_CONFIG_DRIVE_PUSH_PULL: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_OTYP, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_OTYP, gpio); break; case PIN_CONFIG_DRIVE_OPEN_DRAIN: - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_OTYP, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_OTYP, gpio); break; case PIN_CONFIG_INPUT_DEBOUNCE: return npcm_set_debounce(npcm, pin, arg * 1000); @@ -2313,13 +2310,14 @@ static int npcmgpio_add_pin_ranges(struct gpio_chip *chip) { struct npcm8xx_gpio *bank = gpiochip_get_data(chip); - return gpiochip_add_pin_range(&bank->gc, 
dev_name(chip->parent), - bank->pinctrl_id, bank->gc.base, - bank->gc.ngpio); + return gpiochip_add_pin_range(&bank->chip.gc, dev_name(chip->parent), + bank->pinctrl_id, bank->chip.gc.base, + bank->chip.gc.ngpio); } static int npcm8xx_gpio_fw(struct npcm8xx_pinctrl *pctrl) { + struct gpio_generic_chip_config config; struct fwnode_reference_args args; struct device *dev = pctrl->dev; struct fwnode_handle *child; @@ -2331,15 +2329,19 @@ static int npcm8xx_gpio_fw(struct npcm8xx_pinctrl *pctrl) if (!pctrl->gpio_bank[id].base) return dev_err_probe(dev, -ENXIO, "fwnode_iomap id %d failed\n", id); - ret = bgpio_init(&pctrl->gpio_bank[id].gc, dev, 4, - pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DIN, - pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DOUT, - NULL, - NULL, - pctrl->gpio_bank[id].base + NPCM8XX_GP_N_IEM, - BGPIOF_READ_OUTPUT_REG_SET); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DIN, + .set = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DOUT, + .dirin = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_IEM, + .flags = BGPIOF_READ_OUTPUT_REG_SET, + }; + + ret = gpio_generic_chip_init(&pctrl->gpio_bank[id].chip, &config); if (ret) - return dev_err_probe(dev, ret, "bgpio_init() failed\n"); + return dev_err_probe(dev, ret, + "failed to initialize the generic GPIO chip\n"); ret = fwnode_property_get_reference_args(child, "gpio-ranges", NULL, 3, 0, &args); if (ret < 0) @@ -2353,26 +2355,26 @@ static int npcm8xx_gpio_fw(struct npcm8xx_pinctrl *pctrl) pctrl->gpio_bank[id].irq_chip = npcmgpio_irqchip; pctrl->gpio_bank[id].irqbase = id * NPCM8XX_GPIO_PER_BANK; pctrl->gpio_bank[id].pinctrl_id = args.args[0]; - pctrl->gpio_bank[id].gc.base = -1; - pctrl->gpio_bank[id].gc.ngpio = args.args[2]; - pctrl->gpio_bank[id].gc.owner = THIS_MODULE; - pctrl->gpio_bank[id].gc.parent = dev; - pctrl->gpio_bank[id].gc.fwnode = child; - pctrl->gpio_bank[id].gc.label = devm_kasprintf(dev, GFP_KERNEL, "%pfw", child); - if (pctrl->gpio_bank[id].gc.label == NULL) + pctrl->gpio_bank[id].chip.gc.base = -1; + pctrl->gpio_bank[id].chip.gc.ngpio = args.args[2]; + pctrl->gpio_bank[id].chip.gc.owner = THIS_MODULE; + pctrl->gpio_bank[id].chip.gc.parent = dev; + pctrl->gpio_bank[id].chip.gc.fwnode = child; + pctrl->gpio_bank[id].chip.gc.label = devm_kasprintf(dev, GFP_KERNEL, "%pfw", child); + if (pctrl->gpio_bank[id].chip.gc.label == NULL) return -ENOMEM; - pctrl->gpio_bank[id].gc.dbg_show = npcmgpio_dbg_show; - pctrl->gpio_bank[id].direction_input = pctrl->gpio_bank[id].gc.direction_input; - pctrl->gpio_bank[id].gc.direction_input = npcmgpio_direction_input; - pctrl->gpio_bank[id].direction_output = pctrl->gpio_bank[id].gc.direction_output; - pctrl->gpio_bank[id].gc.direction_output = npcmgpio_direction_output; - pctrl->gpio_bank[id].request = pctrl->gpio_bank[id].gc.request; - pctrl->gpio_bank[id].gc.request = npcmgpio_gpio_request; - pctrl->gpio_bank[id].gc.free = pinctrl_gpio_free; + pctrl->gpio_bank[id].chip.gc.dbg_show = npcmgpio_dbg_show; + pctrl->gpio_bank[id].direction_input = pctrl->gpio_bank[id].chip.gc.direction_input; + pctrl->gpio_bank[id].chip.gc.direction_input = npcmgpio_direction_input; + pctrl->gpio_bank[id].direction_output = pctrl->gpio_bank[id].chip.gc.direction_output; + pctrl->gpio_bank[id].chip.gc.direction_output = npcmgpio_direction_output; + pctrl->gpio_bank[id].request = pctrl->gpio_bank[id].chip.gc.request; + pctrl->gpio_bank[id].chip.gc.request = npcmgpio_gpio_request; + pctrl->gpio_bank[id].chip.gc.free = pinctrl_gpio_free; for (i = 0 ; i < 
NPCM8XX_DEBOUNCE_MAX ; i++) pctrl->gpio_bank[id].debounce.set_val[i] = false; - pctrl->gpio_bank[id].gc.add_pin_ranges = npcmgpio_add_pin_ranges; + pctrl->gpio_bank[id].chip.gc.add_pin_ranges = npcmgpio_add_pin_ranges; id++; } @@ -2387,7 +2389,7 @@ static int npcm8xx_gpio_register(struct npcm8xx_pinctrl *pctrl) for (id = 0 ; id < pctrl->bank_num ; id++) { struct gpio_irq_chip *girq; - girq = &pctrl->gpio_bank[id].gc.irq; + girq = &pctrl->gpio_bank[id].chip.gc.irq; girq->chip = &pctrl->gpio_bank[id].irq_chip; girq->parent_handler = npcmgpio_irq_handler; girq->num_parents = 1; @@ -2401,7 +2403,7 @@ static int npcm8xx_gpio_register(struct npcm8xx_pinctrl *pctrl) girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_level_irq; ret = devm_gpiochip_add_data(pctrl->dev, - &pctrl->gpio_bank[id].gc, + &pctrl->gpio_bank[id].chip.gc, &pctrl->gpio_bank[id]); if (ret) return dev_err_probe(pctrl->dev, ret, "Failed to add GPIO chip %u\n", id); From 9e546aa9d5e032cd89529cd377631a60b0dfe35a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 11 Aug 2025 17:02:03 +0200 Subject: [PATCH 0251/3206] pinctrl: npcm7xx: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/20250811-gpio-mmio-pinctrl-conv-v1-4-a84c5da2be20@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c | 181 +++++++++++----------- 1 file changed, 90 insertions(+), 91 deletions(-) diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c index b8872d8f5930ad..c2ca71ebb9736d 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -77,7 +78,7 @@ /* Structure for register banks */ struct npcm7xx_gpio { void __iomem *base; - struct gpio_chip gc; + struct gpio_generic_chip chip; int irqbase; int irq; u32 pinctrl_id; @@ -99,32 +100,26 @@ struct npcm7xx_pinctrl { }; /* GPIO handling in the pinctrl driver */ -static void npcm_gpio_set(struct gpio_chip *gc, void __iomem *reg, +static void npcm_gpio_set(struct gpio_generic_chip *chip, void __iomem *reg, unsigned int pinmask) { - unsigned long flags; unsigned long val; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(chip); val = ioread32(reg) | pinmask; iowrite32(val, reg); - - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } -static void npcm_gpio_clr(struct gpio_chip *gc, void __iomem *reg, +static void npcm_gpio_clr(struct gpio_generic_chip *chip, void __iomem *reg, unsigned int pinmask) { - unsigned long flags; unsigned long val; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(chip); val = ioread32(reg) & ~pinmask; iowrite32(val, reg); - - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static void npcmgpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) @@ -132,9 +127,9 @@ static void npcmgpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) struct npcm7xx_gpio *bank = gpiochip_get_data(chip); seq_printf(s, "-- module %d [gpio%d - %d]\n", - bank->gc.base / bank->gc.ngpio, - bank->gc.base, - bank->gc.base + bank->gc.ngpio); + bank->chip.gc.base / bank->chip.gc.ngpio, + bank->chip.gc.base, + bank->chip.gc.base + bank->chip.gc.ngpio); seq_printf(s, "DIN :%.8x DOUT:%.8x IE :%.8x OE :%.8x\n", ioread32(bank->base + NPCM7XX_GP_N_DIN), ioread32(bank->base + NPCM7XX_GP_N_DOUT), @@ 
-220,7 +215,7 @@ static void npcmgpio_irq_handler(struct irq_desc *desc) chained_irq_enter(chip, desc); sts = ioread32(bank->base + NPCM7XX_GP_N_EVST); en = ioread32(bank->base + NPCM7XX_GP_N_EVEN); - dev_dbg(bank->gc.parent, "==> got irq sts %.8lx %.8lx\n", sts, + dev_dbg(bank->chip.gc.parent, "==> got irq sts %.8lx %.8lx\n", sts, en); sts &= en; @@ -235,42 +230,42 @@ static int npcmgpio_set_irq_type(struct irq_data *d, unsigned int type) struct npcm7xx_gpio *bank = gpiochip_get_data(gc); unsigned int gpio = BIT(irqd_to_hwirq(d)); - dev_dbg(bank->gc.parent, "setirqtype: %u.%u = %u\n", gpio, + dev_dbg(bank->chip.gc.parent, "setirqtype: %u.%u = %u\n", gpio, d->irq, type); switch (type) { case IRQ_TYPE_EDGE_RISING: - dev_dbg(bank->gc.parent, "edge.rising\n"); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_EVBE, gpio); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_POL, gpio); + dev_dbg(bank->chip.gc.parent, "edge.rising\n"); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_EVBE, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_POL, gpio); break; case IRQ_TYPE_EDGE_FALLING: - dev_dbg(bank->gc.parent, "edge.falling\n"); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_EVBE, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_POL, gpio); + dev_dbg(bank->chip.gc.parent, "edge.falling\n"); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_EVBE, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_POL, gpio); break; case IRQ_TYPE_EDGE_BOTH: - dev_dbg(bank->gc.parent, "edge.both\n"); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_EVBE, gpio); + dev_dbg(bank->chip.gc.parent, "edge.both\n"); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_EVBE, gpio); break; case IRQ_TYPE_LEVEL_LOW: - dev_dbg(bank->gc.parent, "level.low\n"); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_POL, gpio); + dev_dbg(bank->chip.gc.parent, "level.low\n"); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_POL, gpio); break; case IRQ_TYPE_LEVEL_HIGH: - dev_dbg(bank->gc.parent, "level.high\n"); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_POL, gpio); + dev_dbg(bank->chip.gc.parent, "level.high\n"); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_POL, gpio); break; default: - dev_dbg(bank->gc.parent, "invalid irq type\n"); + dev_dbg(bank->chip.gc.parent, "invalid irq type\n"); return -EINVAL; } if (type & (IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW)) { - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_EVTYP, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_EVTYP, gpio); irq_set_handler_locked(d, handle_level_irq); } else if (type & (IRQ_TYPE_EDGE_BOTH | IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING)) { - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_EVTYP, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_EVTYP, gpio); irq_set_handler_locked(d, handle_edge_irq); } @@ -283,7 +278,7 @@ static void npcmgpio_irq_ack(struct irq_data *d) struct npcm7xx_gpio *bank = gpiochip_get_data(gc); unsigned int gpio = irqd_to_hwirq(d); - dev_dbg(bank->gc.parent, "irq_ack: %u.%u\n", gpio, d->irq); + dev_dbg(bank->chip.gc.parent, "irq_ack: %u.%u\n", gpio, d->irq); iowrite32(BIT(gpio), bank->base + NPCM7XX_GP_N_EVST); } @@ -295,7 +290,7 @@ static void npcmgpio_irq_mask(struct irq_data *d) unsigned int gpio = irqd_to_hwirq(d); /* Clear events */ - dev_dbg(bank->gc.parent, "irq_mask: %u.%u\n", gpio, d->irq); + dev_dbg(bank->chip.gc.parent, "irq_mask: %u.%u\n", gpio, d->irq); iowrite32(BIT(gpio), bank->base + 
NPCM7XX_GP_N_EVENC); gpiochip_disable_irq(gc, gpio); } @@ -309,7 +304,7 @@ static void npcmgpio_irq_unmask(struct irq_data *d) /* Enable events */ gpiochip_enable_irq(gc, gpio); - dev_dbg(bank->gc.parent, "irq_unmask: %u.%u\n", gpio, d->irq); + dev_dbg(bank->chip.gc.parent, "irq_unmask: %u.%u\n", gpio, d->irq); iowrite32(BIT(gpio), bank->base + NPCM7XX_GP_N_EVENS); } @@ -1423,7 +1418,7 @@ static int npcm7xx_get_slew_rate(struct npcm7xx_gpio *bank, struct regmap *gcr_regmap, unsigned int pin) { u32 val; - int gpio = (pin % bank->gc.ngpio); + int gpio = (pin % bank->chip.gc.ngpio); unsigned long pinmask = BIT(gpio); if (pincfg[pin].flag & SLEW) @@ -1443,16 +1438,16 @@ static int npcm7xx_set_slew_rate(struct npcm7xx_gpio *bank, struct regmap *gcr_regmap, unsigned int pin, int arg) { - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); if (pincfg[pin].flag & SLEW) { switch (arg) { case 0: - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_OSRC, + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_OSRC, gpio); return 0; case 1: - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_OSRC, + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_OSRC, gpio); return 0; default: @@ -1485,7 +1480,7 @@ static int npcm7xx_get_drive_strength(struct pinctrl_dev *pctldev, struct npcm7xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm7xx_gpio *bank = &npcm->gpio_bank[pin / NPCM7XX_GPIO_PER_BANK]; - int gpio = (pin % bank->gc.ngpio); + int gpio = (pin % bank->chip.gc.ngpio); unsigned long pinmask = BIT(gpio); u32 ds = 0; int flg, val; @@ -1496,7 +1491,7 @@ static int npcm7xx_get_drive_strength(struct pinctrl_dev *pctldev, val = ioread32(bank->base + NPCM7XX_GP_N_ODSC) & pinmask; ds = val ? DSHI(flg) : DSLO(flg); - dev_dbg(bank->gc.parent, + dev_dbg(bank->chip.gc.parent, "pin %d strength %d = %d\n", pin, val, ds); return ds; } @@ -1511,20 +1506,20 @@ static int npcm7xx_set_drive_strength(struct npcm7xx_pinctrl *npcm, int v; struct npcm7xx_gpio *bank = &npcm->gpio_bank[pin / NPCM7XX_GPIO_PER_BANK]; - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); v = (pincfg[pin].flag & DRIVE_STRENGTH_MASK); if (!nval || !v) return -ENOTSUPP; if (DSLO(v) == nval) { - dev_dbg(bank->gc.parent, + dev_dbg(bank->chip.gc.parent, "setting pin %d to low strength [%d]\n", pin, nval); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_ODSC, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_ODSC, gpio); return 0; } else if (DSHI(v) == nval) { - dev_dbg(bank->gc.parent, + dev_dbg(bank->chip.gc.parent, "setting pin %d to high strength [%d]\n", pin, nval); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_ODSC, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_ODSC, gpio); return 0; } @@ -1657,9 +1652,9 @@ static int npcm_gpio_set_direction(struct pinctrl_dev *pctldev, struct npcm7xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm7xx_gpio *bank = &npcm->gpio_bank[offset / NPCM7XX_GPIO_PER_BANK]; - int gpio = BIT(offset % bank->gc.ngpio); + int gpio = BIT(offset % bank->chip.gc.ngpio); - dev_dbg(bank->gc.parent, "GPIO Set Direction: %d = %d\n", offset, + dev_dbg(bank->chip.gc.parent, "GPIO Set Direction: %d = %d\n", offset, input); if (input) iowrite32(gpio, bank->base + NPCM7XX_GP_N_OEC); @@ -1687,7 +1682,7 @@ static int npcm7xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, struct npcm7xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm7xx_gpio *bank = &npcm->gpio_bank[pin / 
NPCM7XX_GPIO_PER_BANK]; - int gpio = (pin % bank->gc.ngpio); + int gpio = (pin % bank->chip.gc.ngpio); unsigned long pinmask = BIT(gpio); u32 ie, oe, pu, pd; int rc = 0; @@ -1750,38 +1745,38 @@ static int npcm7xx_config_set_one(struct npcm7xx_pinctrl *npcm, u16 arg = pinconf_to_config_argument(config); struct npcm7xx_gpio *bank = &npcm->gpio_bank[pin / NPCM7XX_GPIO_PER_BANK]; - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); - dev_dbg(bank->gc.parent, "param=%d %d[GPIO]\n", param, pin); + dev_dbg(bank->chip.gc.parent, "param=%d %d[GPIO]\n", param, pin); switch (param) { case PIN_CONFIG_BIAS_DISABLE: - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_PU, gpio); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_PD, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_PU, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_PD, gpio); break; case PIN_CONFIG_BIAS_PULL_DOWN: - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_PU, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_PD, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_PU, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_PD, gpio); break; case PIN_CONFIG_BIAS_PULL_UP: - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_PD, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_PU, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_PD, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_PU, gpio); break; case PIN_CONFIG_INPUT_ENABLE: iowrite32(gpio, bank->base + NPCM7XX_GP_N_OEC); - bank->direction_input(&bank->gc, pin % bank->gc.ngpio); + bank->direction_input(&bank->chip.gc, pin % bank->chip.gc.ngpio); break; case PIN_CONFIG_OUTPUT: - bank->direction_output(&bank->gc, pin % bank->gc.ngpio, arg); + bank->direction_output(&bank->chip.gc, pin % bank->chip.gc.ngpio, arg); iowrite32(gpio, bank->base + NPCM7XX_GP_N_OES); break; case PIN_CONFIG_DRIVE_PUSH_PULL: - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_OTYP, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_OTYP, gpio); break; case PIN_CONFIG_DRIVE_OPEN_DRAIN: - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_OTYP, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_OTYP, gpio); break; case PIN_CONFIG_INPUT_DEBOUNCE: - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_DBNC, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_DBNC, gpio); break; case PIN_CONFIG_SLEW_RATE: return npcm7xx_set_slew_rate(bank, npcm->gcr_regmap, pin, arg); @@ -1829,6 +1824,7 @@ static const struct pinctrl_desc npcm7xx_pinctrl_desc = { static int npcm7xx_gpio_of(struct npcm7xx_pinctrl *pctrl) { + struct gpio_generic_chip_config config; int ret = -ENXIO; struct device *dev = pctrl->dev; struct fwnode_reference_args args; @@ -1840,15 +1836,18 @@ static int npcm7xx_gpio_of(struct npcm7xx_pinctrl *pctrl) if (!pctrl->gpio_bank[id].base) return -EINVAL; - ret = bgpio_init(&pctrl->gpio_bank[id].gc, dev, 4, - pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DIN, - pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DOUT, - NULL, - NULL, - pctrl->gpio_bank[id].base + NPCM7XX_GP_N_IEM, - BGPIOF_READ_OUTPUT_REG_SET); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DIN, + .set = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DOUT, + .dirin = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_IEM, + .flags = BGPIOF_READ_OUTPUT_REG_SET, + }; + + ret = gpio_generic_chip_init(&pctrl->gpio_bank[id].chip, &config); if (ret) { - dev_err(dev, 
"bgpio_init() failed\n"); + dev_err(dev, "failed to initialize the generic GPIO chip\n"); return ret; } @@ -1866,23 +1865,23 @@ static int npcm7xx_gpio_of(struct npcm7xx_pinctrl *pctrl) pctrl->gpio_bank[id].irq = ret; pctrl->gpio_bank[id].irqbase = id * NPCM7XX_GPIO_PER_BANK; pctrl->gpio_bank[id].pinctrl_id = args.args[0]; - pctrl->gpio_bank[id].gc.base = args.args[1]; - pctrl->gpio_bank[id].gc.ngpio = args.args[2]; - pctrl->gpio_bank[id].gc.owner = THIS_MODULE; - pctrl->gpio_bank[id].gc.parent = dev; - pctrl->gpio_bank[id].gc.fwnode = child; - pctrl->gpio_bank[id].gc.label = devm_kasprintf(dev, GFP_KERNEL, "%pfw", child); - if (pctrl->gpio_bank[id].gc.label == NULL) + pctrl->gpio_bank[id].chip.gc.base = args.args[1]; + pctrl->gpio_bank[id].chip.gc.ngpio = args.args[2]; + pctrl->gpio_bank[id].chip.gc.owner = THIS_MODULE; + pctrl->gpio_bank[id].chip.gc.parent = dev; + pctrl->gpio_bank[id].chip.gc.fwnode = child; + pctrl->gpio_bank[id].chip.gc.label = devm_kasprintf(dev, GFP_KERNEL, "%pfw", child); + if (pctrl->gpio_bank[id].chip.gc.label == NULL) return -ENOMEM; - pctrl->gpio_bank[id].gc.dbg_show = npcmgpio_dbg_show; - pctrl->gpio_bank[id].direction_input = pctrl->gpio_bank[id].gc.direction_input; - pctrl->gpio_bank[id].gc.direction_input = npcmgpio_direction_input; - pctrl->gpio_bank[id].direction_output = pctrl->gpio_bank[id].gc.direction_output; - pctrl->gpio_bank[id].gc.direction_output = npcmgpio_direction_output; - pctrl->gpio_bank[id].request = pctrl->gpio_bank[id].gc.request; - pctrl->gpio_bank[id].gc.request = npcmgpio_gpio_request; - pctrl->gpio_bank[id].gc.free = pinctrl_gpio_free; + pctrl->gpio_bank[id].chip.gc.dbg_show = npcmgpio_dbg_show; + pctrl->gpio_bank[id].direction_input = pctrl->gpio_bank[id].chip.gc.direction_input; + pctrl->gpio_bank[id].chip.gc.direction_input = npcmgpio_direction_input; + pctrl->gpio_bank[id].direction_output = pctrl->gpio_bank[id].chip.gc.direction_output; + pctrl->gpio_bank[id].chip.gc.direction_output = npcmgpio_direction_output; + pctrl->gpio_bank[id].request = pctrl->gpio_bank[id].chip.gc.request; + pctrl->gpio_bank[id].chip.gc.request = npcmgpio_gpio_request; + pctrl->gpio_bank[id].chip.gc.free = pinctrl_gpio_free; id++; } @@ -1897,7 +1896,7 @@ static int npcm7xx_gpio_register(struct npcm7xx_pinctrl *pctrl) for (id = 0 ; id < pctrl->bank_num ; id++) { struct gpio_irq_chip *girq; - girq = &pctrl->gpio_bank[id].gc.irq; + girq = &pctrl->gpio_bank[id].chip.gc.irq; gpio_irq_chip_set_chip(girq, &npcmgpio_irqchip); girq->parent_handler = npcmgpio_irq_handler; girq->num_parents = 1; @@ -1912,21 +1911,21 @@ static int npcm7xx_gpio_register(struct npcm7xx_pinctrl *pctrl) girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_level_irq; ret = devm_gpiochip_add_data(pctrl->dev, - &pctrl->gpio_bank[id].gc, + &pctrl->gpio_bank[id].chip.gc, &pctrl->gpio_bank[id]); if (ret) { dev_err(pctrl->dev, "Failed to add GPIO chip %u\n", id); goto err_register; } - ret = gpiochip_add_pin_range(&pctrl->gpio_bank[id].gc, + ret = gpiochip_add_pin_range(&pctrl->gpio_bank[id].chip.gc, dev_name(pctrl->dev), pctrl->gpio_bank[id].pinctrl_id, - pctrl->gpio_bank[id].gc.base, - pctrl->gpio_bank[id].gc.ngpio); + pctrl->gpio_bank[id].chip.gc.base, + pctrl->gpio_bank[id].chip.gc.ngpio); if (ret < 0) { dev_err(pctrl->dev, "Failed to add GPIO bank %u\n", id); - gpiochip_remove(&pctrl->gpio_bank[id].gc); + gpiochip_remove(&pctrl->gpio_bank[id].chip.gc); goto err_register; } } @@ -1935,7 +1934,7 @@ static int npcm7xx_gpio_register(struct npcm7xx_pinctrl *pctrl) err_register: for (; 
id > 0; id--) - gpiochip_remove(&pctrl->gpio_bank[id - 1].gc); + gpiochip_remove(&pctrl->gpio_bank[id - 1].chip.gc); return ret; } From 051f7141673aa112a9c4119e1b5e0d0f580951f3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 11 Aug 2025 17:02:04 +0200 Subject: [PATCH 0252/3206] pinctrl: wpcm450: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/20250811-gpio-mmio-pinctrl-conv-v1-5-a84c5da2be20@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/nuvoton/pinctrl-wpcm450.c | 44 ++++++++++++++--------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c index 8d8314ba0e4cb5..4dd8a3daa83e44 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c +++ b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c @@ -11,6 +11,7 @@ #include #include +#include <linux/gpio/generic.h> #include #include #include @@ -47,7 +48,7 @@ struct wpcm450_pinctrl; struct wpcm450_bank; struct wpcm450_gpio { - struct gpio_chip gc; + struct gpio_generic_chip chip; struct wpcm450_pinctrl *pctrl; const struct wpcm450_bank *bank; }; @@ -184,11 +185,12 @@ static void wpcm450_gpio_irq_unmask(struct irq_data *d) } /* - * This is an implementation of the gpio_chip->get() function, for use in - * wpcm450_gpio_fix_evpol. Unfortunately, we can't use the bgpio-provided - * implementation there, because it would require taking gpio_chip->bgpio_lock, - * which is a spin lock, but wpcm450_gpio_fix_evpol must work in contexts where - * a raw spin lock is held. + * FIXME: This is an implementation of the gpio_chip->get() function, for use + * in wpcm450_gpio_fix_evpol(). It was implemented back when gpio-mmio used a + * regular spinlock internally, while wpcm450_gpio_fix_evpol() needed to work + * in contexts with a raw spinlock held. Since then, the gpio generic chip has + * been switched to using a raw spinlock, so this should be converted to using + * the locking interfaces provided in linux/gpio/generic.h.
*/ static int wpcm450_gpio_get(struct wpcm450_gpio *gpio, int offset) { @@ -329,7 +331,7 @@ static void wpcm450_gpio_irqhandler(struct irq_desc *desc) for_each_set_bit(bit, &pending, 32) { int offset = wpcm450_irq_bitnum_to_gpio(gpio, bit); - generic_handle_domain_irq(gpio->gc.irq.domain, offset); + generic_handle_domain_irq(gpio->chip.gc.irq.domain, offset); } chained_irq_exit(chip, desc); } @@ -1012,7 +1014,7 @@ static int wpcm450_gpio_add_pin_ranges(struct gpio_chip *chip) { struct wpcm450_gpio *gpio = gpiochip_get_data(chip); const struct wpcm450_bank *bank = gpio->bank; - return gpiochip_add_pin_range(&gpio->gc, dev_name(gpio->pctrl->dev), + return gpiochip_add_pin_range(&gpio->chip.gc, dev_name(gpio->pctrl->dev), 0, bank->base, bank->length); } @@ -1029,6 +1031,7 @@ static int wpcm450_gpio_register(struct platform_device *pdev, "Resource fail for GPIO controller\n"); for_each_gpiochip_node(dev, child) { + struct gpio_generic_chip_config config; void __iomem *dat = NULL; void __iomem *set = NULL; void __iomem *dirout = NULL; @@ -1060,17 +1063,26 @@ static int wpcm450_gpio_register(struct platform_device *pdev, } else { flags = BGPIOF_NO_OUTPUT; } - ret = bgpio_init(&gpio->gc, dev, 4, - dat, set, NULL, dirout, NULL, flags); + + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = dat, + .set = set, + .dirout = dirout, + .flags = flags, + }; + + ret = gpio_generic_chip_init(&gpio->chip, &config); if (ret < 0) return dev_err_probe(dev, ret, "GPIO initialization failed\n"); - gpio->gc.ngpio = bank->length; - gpio->gc.set_config = wpcm450_gpio_set_config; - gpio->gc.fwnode = child; - gpio->gc.add_pin_ranges = wpcm450_gpio_add_pin_ranges; + gpio->chip.gc.ngpio = bank->length; + gpio->chip.gc.set_config = wpcm450_gpio_set_config; + gpio->chip.gc.fwnode = child; + gpio->chip.gc.add_pin_ranges = wpcm450_gpio_add_pin_ranges; - girq = &gpio->gc.irq; + girq = &gpio->chip.gc.irq; gpio_irq_chip_set_chip(girq, &wpcm450_gpio_irqchip); girq->parent_handler = wpcm450_gpio_irqhandler; girq->parents = devm_kcalloc(dev, WPCM450_NUM_GPIO_IRQS, @@ -1094,7 +1106,7 @@ static int wpcm450_gpio_register(struct platform_device *pdev, girq->num_parents++; } - ret = devm_gpiochip_add_data(dev, &gpio->gc, gpio); + ret = devm_gpiochip_add_data(dev, &gpio->chip.gc, gpio); if (ret) return dev_err_probe(dev, ret, "Failed to add GPIO chip\n"); } From 91d7789fd016e0616913313b3306b83f8a608489 Mon Sep 17 00:00:00 2001 From: Joy Zou Date: Wed, 6 Aug 2025 19:41:10 +0800 Subject: [PATCH 0253/3206] dt-bindings: soc: imx-blk-ctrl: add i.MX91 blk-ctrl compatible Add new compatible string "fsl,imx91-media-blk-ctrl" for i.MX91, which has different input clocks compared to i.MX93. Update the clock-names list and handle it in the if-else branch accordingly. Keep the same restriction for the existing compatible strings.
Reviewed-by: Rob Herring (Arm) Reviewed-by: Frank Li Signed-off-by: Joy Zou Link: https://lore.kernel.org/r/20250806114119.1948624-3-joy.zou@nxp.com Signed-off-by: Ulf Hansson --- .../soc/imx/fsl,imx93-media-blk-ctrl.yaml | 59 +++++++++++++++---- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/soc/imx/fsl,imx93-media-blk-ctrl.yaml b/Documentation/devicetree/bindings/soc/imx/fsl,imx93-media-blk-ctrl.yaml index b3554e7f9e76dd..34aea58094e553 100644 --- a/Documentation/devicetree/bindings/soc/imx/fsl,imx93-media-blk-ctrl.yaml +++ b/Documentation/devicetree/bindings/soc/imx/fsl,imx93-media-blk-ctrl.yaml @@ -18,7 +18,9 @@ description: properties: compatible: items: - - const: fsl,imx93-media-blk-ctrl + - enum: + - fsl,imx91-media-blk-ctrl + - fsl,imx93-media-blk-ctrl - const: syscon reg: @@ -31,21 +33,54 @@ properties: maxItems: 1 clocks: + minItems: 8 maxItems: 10 clock-names: - items: - - const: apb - - const: axi - - const: nic - - const: disp - - const: cam - - const: pxp - - const: lcdif - - const: isi - - const: csi - - const: dsi + minItems: 8 + maxItems: 10 +allOf: + - if: + properties: + compatible: + contains: + const: fsl,imx91-media-blk-ctrl + then: + properties: + clocks: + maxItems: 8 + clock-names: + items: + - const: apb + - const: axi + - const: nic + - const: disp + - const: cam + - const: lcdif + - const: isi + - const: csi + - if: + properties: + compatible: + contains: + const: fsl,imx93-media-blk-ctrl + then: + properties: + clocks: + minItems: 10 + clock-names: + items: + - const: apb + - const: axi + - const: nic + - const: disp + - const: cam + - const: pxp + - const: lcdif + - const: isi + - const: csi + - const: dsi required: - compatible - reg From 88f9d89871014559557ecc9b9ae9c9c0821a6e96 Mon Sep 17 00:00:00 2001 From: Joy Zou Date: Wed, 6 Aug 2025 19:41:17 +0800 Subject: [PATCH 0254/3206] pmdomain: imx93-blk-ctrl: use ARRAY_SIZE() instead of a hardcoded number Replace the hardcoded i.MX93 num_clks value with ARRAY_SIZE(). Reviewed-by: Frank Li Signed-off-by: Joy Zou Link: https://lore.kernel.org/r/20250806114119.1948624-10-joy.zou@nxp.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/imx93-blk-ctrl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/pmdomain/imx/imx93-blk-ctrl.c b/drivers/pmdomain/imx/imx93-blk-ctrl.c index 0e2ba8ec55d757..1dcb84593e0168 100644 --- a/drivers/pmdomain/imx/imx93-blk-ctrl.c +++ b/drivers/pmdomain/imx/imx93-blk-ctrl.c @@ -418,11 +418,15 @@ static const struct regmap_access_table imx93_media_blk_ctl_access_table = { .n_yes_ranges = ARRAY_SIZE(imx93_media_blk_ctl_yes_ranges), }; +static const char * const media_blk_clk_names[] = { + "axi", "apb", "nic" +}; + static const struct imx93_blk_ctrl_data imx93_media_blk_ctl_dev_data = { .domains = imx93_media_blk_ctl_domain_data, .num_domains = ARRAY_SIZE(imx93_media_blk_ctl_domain_data), - .clk_names = (const char *[]){ "axi", "apb", "nic", }, - .num_clks = 3, + .clk_names = media_blk_clk_names, + .num_clks = ARRAY_SIZE(media_blk_clk_names), .reg_access_table = &imx93_media_blk_ctl_access_table, }; From a83a7a7b205018a436cbdbd9676cfaaf01773f47 Mon Sep 17 00:00:00 2001 From: Joy Zou Date: Wed, 6 Aug 2025 19:41:18 +0800 Subject: [PATCH 0255/3206] pmdomain: imx93-blk-ctrl: mask DSI and PXP PD domain register on i.MX91 The i.MX91 is derived from the i.MX93, but there is no DSI or PXP on i.MX91. Add skip_mask to struct imx93_blk_ctrl_data, then skip DSI and PXP for the i.MX91 SoC.
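As a rough sketch of the idea (all names and domain indices below are invented for illustration, not taken from this patch): a per-SoC bitmask over a shared domain table lets the i.MX93 data be reused while domains absent on a given SoC are masked out:

#include <linux/bits.h>
#include <linux/types.h>

#define SKETCH_PD_MIPI_DSI	4	/* assumed domain indices */
#define SKETCH_PD_PXP		5
#define SKETCH_NUM_DOMAINS	8

struct sketch_blk_ctrl_data {
	u32 skip_mask;	/* BIT(n) set => domain n absent on this SoC */
};

static const struct sketch_blk_ctrl_data imx91_sketch = {
	.skip_mask = BIT(SKETCH_PD_MIPI_DSI) | BIT(SKETCH_PD_PXP),
};

static void sketch_register_domains(const struct sketch_blk_ctrl_data *data)
{
	unsigned int i;

	for (i = 0; i < SKETCH_NUM_DOMAINS; i++) {
		if (data->skip_mask & BIT(i))
			continue;	/* e.g. DSI and PXP on i.MX91 */
		/* register power domain i here */
	}
}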
Signed-off-by: Joy Zou Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250806114119.1948624-11-joy.zou@nxp.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/imx93-blk-ctrl.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/pmdomain/imx/imx93-blk-ctrl.c b/drivers/pmdomain/imx/imx93-blk-ctrl.c index 1dcb84593e0168..e094fe5a42bf64 100644 --- a/drivers/pmdomain/imx/imx93-blk-ctrl.c +++ b/drivers/pmdomain/imx/imx93-blk-ctrl.c @@ -86,6 +86,7 @@ struct imx93_blk_ctrl_domain { struct imx93_blk_ctrl_data { const struct imx93_blk_ctrl_domain_data *domains; + u32 skip_mask; int num_domains; const char * const *clk_names; int num_clks; @@ -250,6 +251,8 @@ static int imx93_blk_ctrl_probe(struct platform_device *pdev) int j; domain->data = data; + if (bc_data->skip_mask & BIT(i)) + continue; for (j = 0; j < data->num_clks; j++) domain->clks[j].id = data->clk_names[j]; @@ -422,6 +425,15 @@ static const char * const media_blk_clk_names[] = { "axi", "apb", "nic" }; +static const struct imx93_blk_ctrl_data imx91_media_blk_ctl_dev_data = { + .domains = imx93_media_blk_ctl_domain_data, + .skip_mask = BIT(IMX93_MEDIABLK_PD_MIPI_DSI) | BIT(IMX93_MEDIABLK_PD_PXP), + .num_domains = ARRAY_SIZE(imx93_media_blk_ctl_domain_data), + .clk_names = media_blk_clk_names, + .num_clks = ARRAY_SIZE(media_blk_clk_names), + .reg_access_table = &imx93_media_blk_ctl_access_table, +}; + static const struct imx93_blk_ctrl_data imx93_media_blk_ctl_dev_data = { .domains = imx93_media_blk_ctl_domain_data, .num_domains = ARRAY_SIZE(imx93_media_blk_ctl_domain_data), @@ -432,6 +444,9 @@ static const struct imx93_blk_ctrl_data imx93_media_blk_ctl_dev_data = { static const struct of_device_id imx93_blk_ctrl_of_match[] = { { + .compatible = "fsl,imx91-media-blk-ctrl", + .data = &imx91_media_blk_ctl_dev_data + }, { .compatible = "fsl,imx93-media-blk-ctrl", .data = &imx93_media_blk_ctl_dev_data }, { From 87660a84bb2897344a368aabe990789cdced5954 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Aug 2025 18:15:00 +0200 Subject: [PATCH 0256/3206] pmdomain: remove unneeded 'fast_io' parameter in regmap_config When using MMIO with regmap, fast_io is implied. No need to set it again. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250813161517.4746-15-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/gpc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pmdomain/imx/gpc.c b/drivers/pmdomain/imx/gpc.c index f18c7e6e75ddc5..33991f3c6b5564 100644 --- a/drivers/pmdomain/imx/gpc.c +++ b/drivers/pmdomain/imx/gpc.c @@ -343,7 +343,6 @@ static const struct regmap_config imx_gpc_regmap_config = { .rd_table = &access_table, .wr_table = &access_table, .max_register = 0x2ac, - .fast_io = true, }; static struct generic_pm_domain *imx_gpc_onecell_domains[] = { From d8f3ae7b38fea546e64a6cfcdc7d061c85f086e2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 13 Aug 2025 15:04:24 +0200 Subject: [PATCH 0257/3206] pmdomain: renesas: rcar-sysc: Make rcar_sysc_onecell_np __initdata rcar_sysc_onecell_np is only used by functions marked __init, so it can be freed when init memory is freed.
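The pattern, shown as a hedged sketch with made-up names: an object that is only ever referenced from __init code may itself be placed in init memory, so both the code and the data are discarded once boot finishes:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/of.h>

/* Only dereferenced from __init code, so the pointer may be __initdata. */
static struct device_node *sketch_np __initdata;

static int __init sketch_pd_init(void)
{
	sketch_np = of_find_node_by_path("/example");	/* hypothetical path */
	if (!sketch_np)
		return -ENODEV;
	/* ... use sketch_np, e.g. to register an OF provider ... */
	of_node_put(sketch_np);
	return 0;
}
postcore_initcall(sketch_pd_init);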
Fixes: c5ae5a0c6112 ("pmdomain: renesas: rcar-sysc: Add genpd OF provider at postcore_initcall") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/e20a848ff952924f8f58c335f9a0242cb2565921.1755090234.git.geert+renesas@glider.be Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/rcar-sysc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pmdomain/renesas/rcar-sysc.c b/drivers/pmdomain/renesas/rcar-sysc.c index 4b310c1d35fa6b..2d4161170c63d0 100644 --- a/drivers/pmdomain/renesas/rcar-sysc.c +++ b/drivers/pmdomain/renesas/rcar-sysc.c @@ -342,7 +342,7 @@ struct rcar_pm_domains { }; static struct genpd_onecell_data *rcar_sysc_onecell_data; -static struct device_node *rcar_sysc_onecell_np; +static struct device_node *rcar_sysc_onecell_np __initdata = NULL; static int __init rcar_sysc_pd_init(void) { From faba66cfcef6374020d2ac7bed091b2793eefeec Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 12 Aug 2025 16:12:43 +0800 Subject: [PATCH 0258/3206] pinctrl: bcm: use PTR_ERR_OR_ZERO() to simplify code Use the standard error pointer macro to shorten and simplify the code. Signed-off-by: Xichao Zhao Link: https://lore.kernel.org/20250812081243.22659-1-zhao.xichao@vivo.com Signed-off-by: Linus Walleij --- drivers/pinctrl/bcm/pinctrl-bcm6358.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pinctrl/bcm/pinctrl-bcm6358.c b/drivers/pinctrl/bcm/pinctrl-bcm6358.c index 891de49d76e744..4c8cd65fc31e8f 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm6358.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm6358.c @@ -343,10 +343,8 @@ static int bcm6358_pinctrl_probe(struct platform_device *pdev) pc = platform_get_drvdata(pdev); priv->overlays = devm_regmap_field_alloc(dev, pc->regs, overlays); - if (IS_ERR(priv->overlays)) - return PTR_ERR(priv->overlays); - return 0; + return PTR_ERR_OR_ZERO(priv->overlays); } static const struct of_device_id bcm6358_pinctrl_match[] = { From bba95412064207ea3a0ea1776daec6480e5f8b0f Mon Sep 17 00:00:00 2001 From: Shankari Anand Date: Sat, 16 Aug 2025 17:53:23 +0530 Subject: [PATCH 0259/3206] rust: pid_namespace: update AlwaysRefCounted imports from sync::aref Update call sites in `pid_namespace.rs` to import `AlwaysRefCounted` from `sync::aref` instead of `types`. This aligns with the ongoing effort to move `ARef` and `AlwaysRefCounted` to `sync`. Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/linux/issues/1173 Signed-off-by: Shankari Anand Link: https://lore.kernel.org/20250816122323.11657-1-shankari.ak0208@gmail.com Reviewed-by: Benno Lossin Signed-off-by: Christian Brauner --- rust/kernel/pid_namespace.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rust/kernel/pid_namespace.rs b/rust/kernel/pid_namespace.rs index 0e93808e4639b3..979a9718f153d8 100644 --- a/rust/kernel/pid_namespace.rs +++ b/rust/kernel/pid_namespace.rs @@ -7,10 +7,7 @@ //! C header: [`include/linux/pid_namespace.h`](srctree/include/linux/pid_namespace.h) and //! [`include/linux/pid.h`](srctree/include/linux/pid.h) -use crate::{ - bindings, - types::{AlwaysRefCounted, Opaque}, -}; +use crate::{bindings, sync::aref::AlwaysRefCounted, types::Opaque}; use core::ptr; /// Wraps the kernel's `struct pid_namespace`. Thread safe.
From 8e7e265d558e0257d6dacc78ec64aff4ba75f61e Mon Sep 17 00:00:00 2001 From: Charalampos Mitrodimas Date: Sat, 16 Aug 2025 14:14:37 +0000 Subject: [PATCH 0260/3206] debugfs: fix mount options not being applied Mount options (uid, gid, mode) are silently ignored when debugfs is mounted. This is a regression introduced during the conversion to the new mount API. When the mount API conversion was done, the parsed options were never applied to the superblock when it was reused. As a result, the mount options were ignored when debugfs was mounted. Fix this by following the same pattern as the tracefs fix in commit e4d32142d1de ("tracing: Fix tracefs mount options"). Call debugfs_reconfigure() in debugfs_get_tree() to apply the mount options to the superblock after it has been created or reused. As an example, with the bug the "mode" mount option is ignored: $ mount -o mode=0666 -t debugfs debugfs /tmp/debugfs_test $ mount | grep debugfs_test debugfs on /tmp/debugfs_test type debugfs (rw,relatime) $ ls -ld /tmp/debugfs_test drwx------ 25 root root 0 Aug 4 14:16 /tmp/debugfs_test With the fix applied, it works as expected: $ mount -o mode=0666 -t debugfs debugfs /tmp/debugfs_test $ mount | grep debugfs_test debugfs on /tmp/debugfs_test type debugfs (rw,relatime,mode=666) $ ls -ld /tmp/debugfs_test drw-rw-rw- 37 root root 0 Aug 2 17:28 /tmp/debugfs_test Fixes: a20971c18752 ("vfs: Convert debugfs to use the new mount API") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220406 Cc: stable@vger.kernel.org Reviewed-by: Eric Sandeen Signed-off-by: Charalampos Mitrodimas Link: https://lore.kernel.org/20250816-debugfs-mount-opts-v3-1-d271dad57b5b@posteo.net Signed-off-by: Christian Brauner --- fs/debugfs/inode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index a0357b0cf362d8..c12d649df6a543 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -183,6 +183,9 @@ static int debugfs_reconfigure(struct fs_context *fc) struct debugfs_fs_info *sb_opts = sb->s_fs_info; struct debugfs_fs_info *new_opts = fc->s_fs_info; + if (!new_opts) + return 0; + sync_filesystem(sb); /* structure copy of new mount options to sb */ @@ -282,10 +285,16 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc) static int debugfs_get_tree(struct fs_context *fc) { + int err; + if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return -EPERM; - return get_tree_single(fc, debugfs_fill_super); + err = get_tree_single(fc, debugfs_fill_super); + if (err) + return err; + + return debugfs_reconfigure(fc); } static void debugfs_free_fc(struct fs_context *fc) From 1d6249c1ce826fcf03c695973095eb4a50fb7fd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 11 Aug 2025 11:13:35 +0200 Subject: [PATCH 0261/3206] sysfs: remove bin_attribute::read_new/write_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These transitional fields are now unused and unnecessary. Remove them and their logic in the sysfs core. 
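For context, a hedged sketch of what a converted user looks like now that only the constified callbacks remain (the names and the backing store below are invented for illustration):

#include <linux/string.h>
#include <linux/sysfs.h>

static char sketch_blob[256];	/* hypothetical backing store */

/* Matches the one remaining, const-qualified read() prototype. */
static ssize_t sketch_read(struct file *file, struct kobject *kobj,
			   const struct bin_attribute *attr, char *buf,
			   loff_t off, size_t count)
{
	/* sysfs clamps off/count against a non-zero attr->size first */
	memcpy(buf, sketch_blob + off, count);
	return count;
}

static const struct bin_attribute sketch_attr = {
	.attr = { .name = "blob", .mode = 0444 },
	.size = sizeof(sketch_blob),
	.read = sketch_read,	/* no read_new() alias needed anymore */
};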
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250811-sysfs-const-bin_attr-final-v4-1-7b6053fd58bb@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 22 +++++----------------- include/linux/sysfs.h | 4 ---- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1ca143d2f22ad2..3825e780cc580d 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -97,12 +97,9 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, count = size - pos; } - if (!battr->read && !battr->read_new) + if (!battr->read) return -EIO; - if (battr->read_new) - return battr->read_new(of->file, kobj, battr, buf, pos, count); - return battr->read(of->file, kobj, battr, buf, pos, count); } @@ -161,12 +158,9 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, if (!count) return 0; - if (!battr->write && !battr->write_new) + if (!battr->write) return -EIO; - if (battr->write_new) - return battr->write_new(of->file, kobj, battr, buf, pos, count); - return battr->write(of->file, kobj, battr, buf, pos, count); } @@ -335,19 +329,13 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, const struct kernfs_ops *ops; struct kernfs_node *kn; - if (battr->read && battr->read_new) - return -EINVAL; - - if (battr->write && battr->write_new) - return -EINVAL; - if (battr->mmap) ops = &sysfs_bin_kfops_mmap; - else if ((battr->read || battr->read_new) && (battr->write || battr->write_new)) + else if (battr->read && battr->write) ops = &sysfs_bin_kfops_rw; - else if (battr->read || battr->read_new) + else if (battr->read) ops = &sysfs_bin_kfops_ro; - else if (battr->write || battr->write_new) + else if (battr->write) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index f418aae4f1134f..7544f6d81c0598 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -308,12 +308,8 @@ struct bin_attribute { struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); - ssize_t (*read_new)(struct file *, struct kobject *, const struct bin_attribute *, - char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); - ssize_t (*write_new)(struct file *, struct kobject *, - const struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, From 44d454fcffa8b08d6d66df132121c1d387fa85db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 11 Aug 2025 11:13:36 +0200 Subject: [PATCH 0262/3206] sysfs: remove attribute_group::bin_attrs_new MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This transitional field is now unused and unnecessary. Remove it. 
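Continuing the invented names from the sketch above, a hedged sketch of the end state for group declarations: the group points at .bin_attrs directly (or, equivalently, uses the BIN_ATTRIBUTE_GROUPS() helper touched in the diff below):

static const struct bin_attribute *const sketch_bin_attrs[] = {
	&sketch_attr,
	NULL,	/* sentinel expected by the sysfs core */
};

static const struct attribute_group sketch_group = {
	.bin_attrs = sketch_bin_attrs,	/* the .bin_attrs_new alias is gone */
};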
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250811-sysfs-const-bin_attr-final-v4-2-7b6053fd58bb@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 7544f6d81c0598..9a25a29116528f 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -106,10 +106,7 @@ struct attribute_group { const struct bin_attribute *, int); struct attribute **attrs; - union { - const struct bin_attribute *const *bin_attrs; - const struct bin_attribute *const *bin_attrs_new; - }; + const struct bin_attribute *const *bin_attrs; }; #define SYSFS_PREALLOC 010000 @@ -293,7 +290,7 @@ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ - .bin_attrs_new = _name##_attrs, \ + .bin_attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) From eef32050636a3390724dfe532ab9af9d58c854c3 Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Tue, 12 Aug 2025 11:26:18 +0200 Subject: [PATCH 0263/3206] pinctrl: rp1: Add regmap ranges to RP1 gpio controller The current gpio driver for RP1 shows only the very first register in debugfs, e.g.: $ cat /sys/kernel/debug/regmap/1f000d0000.gpio-rp1-pinctrl/registers 0: 0abe0000 Add the correct ranges to the regmap configuration. Signed-off-by: Andrea della Porta Link: https://lore.kernel.org/20250812092618.14270-1-andrea.porta@suse.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rp1.c | 95 +++++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/drivers/pinctrl/pinctrl-rp1.c b/drivers/pinctrl/pinctrl-rp1.c index dadafc935dbb28..b231efc62ff366 100644 --- a/drivers/pinctrl/pinctrl-rp1.c +++ b/drivers/pinctrl/pinctrl-rp1.c @@ -1623,12 +1623,97 @@ MODULE_DEVICE_TABLE(of, rp1_pinctrl_match); static struct rp1_pinctrl rp1_pinctrl_data = {}; -static const struct regmap_config rp1_pinctrl_regmap_cfg = { +static const struct regmap_range rp1_gpio_reg_ranges[] = { + /* BANK 0 */ + regmap_reg_range(0x2004, 0x20dc), + regmap_reg_range(0x3004, 0x30dc), + regmap_reg_range(0x0004, 0x00dc), + regmap_reg_range(0x0124, 0x0124), + regmap_reg_range(0x211c, 0x211c), + regmap_reg_range(0x311c, 0x311c), + /* BANK 1 */ + regmap_reg_range(0x6004, 0x602c), + regmap_reg_range(0x7004, 0x702c), + regmap_reg_range(0x4004, 0x402c), + regmap_reg_range(0x4124, 0x4124), + regmap_reg_range(0x611c, 0x611c), + regmap_reg_range(0x711c, 0x711c), + /* BANK 2 */ + regmap_reg_range(0xa004, 0xa09c), + regmap_reg_range(0xb004, 0xb09c), + regmap_reg_range(0x8004, 0x809c), + regmap_reg_range(0x8124, 0x8124), + regmap_reg_range(0xa11c, 0xa11c), + regmap_reg_range(0xb11c, 0xb11c), +}; + +static const struct regmap_range rp1_rio_reg_ranges[] = { + /* BANK 0 */ + regmap_reg_range(0x2000, 0x2004), + regmap_reg_range(0x3000, 0x3004), + regmap_reg_range(0x0004, 0x0008), + /* BANK 1 */ + regmap_reg_range(0x6000, 0x6004), + regmap_reg_range(0x7000, 0x7004), + regmap_reg_range(0x4004, 0x4008), + /* BANK 2 */ + regmap_reg_range(0xa000, 0xa004), + regmap_reg_range(0xb000, 0xb004), + regmap_reg_range(0x8004, 0x8008), +}; + +static const struct regmap_range rp1_pads_reg_ranges[] = { + /* BANK 0 */ + regmap_reg_range(0x0004, 0x0070), + /* BANK 1 */ + regmap_reg_range(0x4004, 0x4018), + /* BANK 2 */ + regmap_reg_range(0x8004, 0x8050), +}; + +static const struct regmap_access_table rp1_gpio_reg_table = { + .yes_ranges = rp1_gpio_reg_ranges, + .n_yes_ranges =
ARRAY_SIZE(rp1_gpio_reg_ranges), +}; + +static const struct regmap_access_table rp1_rio_reg_table = { + .yes_ranges = rp1_rio_reg_ranges, + .n_yes_ranges = ARRAY_SIZE(rp1_rio_reg_ranges), +}; + +static const struct regmap_access_table rp1_pads_reg_table = { + .yes_ranges = rp1_pads_reg_ranges, + .n_yes_ranges = ARRAY_SIZE(rp1_pads_reg_ranges), +}; + +static const struct regmap_config rp1_pinctrl_gpio_regmap_cfg = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .fast_io = true, + .rd_table = &rp1_gpio_reg_table, + .name = "rp1-gpio", + .max_register = 0xb11c, +}; + +static const struct regmap_config rp1_pinctrl_rio_regmap_cfg = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .fast_io = true, + .rd_table = &rp1_rio_reg_table, + .name = "rp1-rio", + .max_register = 0xb004, +}; + +static const struct regmap_config rp1_pinctrl_pads_regmap_cfg = { .reg_bits = 32, .val_bits = 32, .reg_stride = 4, .fast_io = true, - .name = "rp1-pinctrl", + .rd_table = &rp1_pads_reg_table, + .name = "rp1-pads", + .max_register = 0x8050, }; static int rp1_gen_regfield(struct device *dev, @@ -1685,17 +1770,17 @@ static int rp1_pinctrl_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(pc->pads_base), "could not get PADS IO memory\n"); gpio_regmap = devm_regmap_init_mmio(dev, pc->gpio_base, - &rp1_pinctrl_regmap_cfg); + &rp1_pinctrl_gpio_regmap_cfg); if (IS_ERR(gpio_regmap)) return dev_err_probe(dev, PTR_ERR(gpio_regmap), "could not init GPIO regmap\n"); rio_regmap = devm_regmap_init_mmio(dev, pc->rio_base, - &rp1_pinctrl_regmap_cfg); + &rp1_pinctrl_rio_regmap_cfg); if (IS_ERR(rio_regmap)) return dev_err_probe(dev, PTR_ERR(rio_regmap), "could not init RIO regmap\n"); pads_regmap = devm_regmap_init_mmio(dev, pc->pads_base, - &rp1_pinctrl_regmap_cfg); + &rp1_pinctrl_pads_regmap_cfg); if (IS_ERR(pads_regmap)) return dev_err_probe(dev, PTR_ERR(pads_regmap), "could not init PADS regmap\n"); From 15e3363af00a9e5b4f5c10b17940733a8613ef57 Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Tue, 12 Aug 2025 16:24:40 -0500 Subject: [PATCH 0264/3206] dt-bindings: pinctrl: Document Tegra186 pin controllers Tegra186 contains two pin controllers. Document their compatible strings and describe the list of pins and functions that they provide. 
Reviewed-by: Rob Herring (Arm) Signed-off-by: Aaron Kling Link: https://lore.kernel.org/20250812-tegra186-pinctrl-v3-1-115714eeecb1@gmail.com Signed-off-by: Linus Walleij --- .../pinctrl/nvidia,tegra186-pinmux.yaml | 285 ++++++++++++++++++ 1 file changed, 285 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/nvidia,tegra186-pinmux.yaml diff --git a/Documentation/devicetree/bindings/pinctrl/nvidia,tegra186-pinmux.yaml b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra186-pinmux.yaml new file mode 100644 index 00000000000000..ac764d0ac4b638 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra186-pinmux.yaml @@ -0,0 +1,285 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/nvidia,tegra186-pinmux.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NVIDIA Tegra186 Pinmux Controller + +maintainers: + - Thierry Reding + - Jon Hunter + +properties: + compatible: + enum: + - nvidia,tegra186-pinmux + - nvidia,tegra186-pinmux-aon + + reg: + items: + - description: pinmux registers + +patternProperties: + "^pinmux(-[a-z0-9-]+)?$": + type: object + + # pin groups + additionalProperties: + $ref: nvidia,tegra-pinmux-common.yaml + unevaluatedProperties: false + properties: + nvidia,function: + enum: [ aud, can0, can1, ccla, dca, dcb, dcc, directdc, directdc1, + displaya, displayb, dmic1, dmic2, dmic3, dmic4, dmic5, dp, + dspk0, dspk1, dtv, eqos, extperiph1, extperiph2, extperiph3, + extperiph4, gp, gpio, hdmi, i2c1, i2c2, i2c3, i2c5, i2c7, + i2c8, i2c9, i2s1, i2s2, i2s3, i2s4, i2s5, i2s6, iqc0, iqc1, + nv, pe, pe0, pe1, pe2, qspi, rsvd0, rsvd1, rsvd2, rsvd3, + sata, sce, sdmmc1, sdmmc2, sdmmc3, sdmmc4, soc, spdif, spi1, + spi2, spi3, spi4, touch, uarta, uartb, uartc, uartd, uarte, + uartf, uartg, ufs0, usb, vgp1, vgp2, vgp3, vgp4, vgp5, vgp6, + wdt ] + + nvidia,pull: true + nvidia,tristate: true + nvidia,schmitt: true + nvidia,enable-input: true + nvidia,open-drain: true + nvidia,lock: true + nvidia,drive-type: true + nvidia,io-hv: true + + required: + - nvidia,pins + +additionalProperties: false + +allOf: + - if: + properties: + compatible: + const: nvidia,tegra186-pinmux + then: + patternProperties: + "^pinmux(-[a-z0-9-]+)?$": + type: object + additionalProperties: + properties: + nvidia,pins: + description: An array of strings. Each string contains the name + of a pin or group. Valid values for these names are listed + below. 
+ items: + enum: [ pex_l0_rst_n_pa0, pex_l0_clkreq_n_pa1, + pex_wake_n_pa2, pex_l1_rst_n_pa3, + pex_l1_clkreq_n_pa4, pex_l2_rst_n_pa5, + pex_l2_clkreq_n_pa6, uart4_tx_pb0, uart4_rx_pb1, + uart4_rts_pb2, uart4_cts_pb3, gpio_wan1_pb4, + gpio_wan2_pb5, gpio_wan3_pb6, gpio_wan4_pc0, + dap2_sclk_pc1, dap2_dout_pc2, dap2_din_pc3, + dap2_fs_pc4, gen1_i2c_scl_pc5, gen1_i2c_sda_pc6, + sdmmc1_clk_pd0, sdmmc1_cmd_pd1, sdmmc1_dat0_pd2, + sdmmc1_dat1_pd3, sdmmc1_dat2_pd4, sdmmc1_dat3_pd5, + eqos_txc_pe0, eqos_td0_pe1, eqos_td1_pe2, + eqos_td2_pe3, eqos_td3_pe4, eqos_tx_ctl_pe5, + eqos_rd0_pe6, eqos_rd1_pe7, eqos_rd2_pf0, + eqos_rd3_pf1, eqos_rx_ctl_pf2, eqos_rxc_pf3, + eqos_mdio_pf4, eqos_mdc_pf5, sdmmc3_clk_pg0, + sdmmc3_cmd_pg1, sdmmc3_dat0_pg2, sdmmc3_dat1_pg3, + sdmmc3_dat2_pg4, sdmmc3_dat3_pg5, gpio_wan5_ph0, + gpio_wan6_ph1, gpio_wan7_ph2, gpio_wan8_ph3, + bcpu_pwr_req_ph4, mcpu_pwr_req_ph5, gpu_pwr_req_ph6, + gpio_pq0_pi0, gpio_pq1_pi1, gpio_pq2_pi2, + gpio_pq3_pi3, gpio_pq4_pi4, gpio_pq5_pi5, + gpio_pq6_pi6, gpio_pq7_pi7, dap1_sclk_pj0, + dap1_dout_pj1, dap1_din_pj2, dap1_fs_pj3, + aud_mclk_pj4, gpio_aud0_pj5, gpio_aud1_pj6, + gpio_aud2_pj7, gpio_aud3_pk0, gen7_i2c_scl_pl0, + gen7_i2c_sda_pl1, gen9_i2c_scl_pl2, gen9_i2c_sda_pl3, + usb_vbus_en0_pl4, usb_vbus_en1_pl5, gp_pwm6_pl6, + gp_pwm7_pl7, dmic1_dat_pm0, dmic1_clk_pm1, + dmic2_dat_pm2, dmic2_clk_pm3, dmic4_dat_pm4, + dmic4_clk_pm5, gpio_cam1_pn0, gpio_cam2_pn1, + gpio_cam3_pn2, gpio_cam4_pn3, gpio_cam6_pn5, + gpio_cam7_pn6, extperiph1_clk_po0, + extperiph2_clk_po1, cam_i2c_scl_po2, cam_i2c_sda_po3, + dp_aux_ch0_hpd_pp0, dp_aux_ch1_hpd_pp1, hdmi_cec_pp2, + gpio_edp0_pp3, gpio_edp1_pp4, gpio_edp2_pp5, + gpio_edp3_pp6, directdc1_clk_pq0, directdc1_in_pq1, + directdc1_out0_pq2, directdc1_out1_pq3, + directdc1_out2_pq4, directdc1_out3_pq5, + qspi_sck_pr0, qspi_io0_pr1, qspi_io1_pr2, + qspi_io2_pr3, qspi_io3_pr4, qspi_cs_n_pr5, + uart1_tx_pt0, uart1_rx_pt1, uart1_rts_pt2, + uart1_cts_pt3, uart2_tx_px0, uart2_rx_px1, + uart2_rts_px2, uart2_cts_px3, uart5_tx_px4, + uart5_rx_px5, uart5_rts_px6, uart5_cts_px7, + gpio_mdm1_py0, gpio_mdm2_py1, gpio_mdm3_py2, + gpio_mdm4_py3, gpio_mdm5_py4, gpio_mdm6_py5, + gpio_mdm7_py6, ufs0_ref_clk_pbb0, ufs0_rst_pbb1, + dap4_sclk_pcc0, dap4_dout_pcc1, dap4_din_pcc2, + dap4_fs_pcc3, directdc_comp, sdmmc1_comp, eqos_comp, + sdmmc3_comp, qspi_comp, + # drive groups + drive_gpio_aud3_pk0, drive_gpio_aud2_pj7, + drive_gpio_aud1_pj6, drive_gpio_aud0_pj5, + drive_aud_mclk_pj4, drive_dap1_fs_pj3, + drive_dap1_din_pj2, drive_dap1_dout_pj1, + drive_dap1_sclk_pj0, drive_dmic1_clk_pm1, + drive_dmic1_dat_pm0, drive_dmic2_dat_pm2, + drive_dmic2_clk_pm3, drive_dmic4_dat_pm4, + drive_dmic4_clk_pm5, drive_dap4_fs_pcc3, + drive_dap4_din_pcc2, drive_dap4_dout_pcc1, + drive_dap4_sclk_pcc0, drive_extperiph2_clk_po1, + drive_extperiph1_clk_po0, drive_cam_i2c_sda_po3, + drive_cam_i2c_scl_po2, drive_gpio_cam1_pn0, + drive_gpio_cam2_pn1, drive_gpio_cam3_pn2, + drive_gpio_cam4_pn3, drive_gpio_cam5_pn4, + drive_gpio_cam6_pn5, drive_gpio_cam7_pn6, + drive_dap2_din_pc3, drive_dap2_dout_pc2, + drive_dap2_fs_pc4, drive_dap2_sclk_pc1, + drive_uart4_cts_pb3, drive_uart4_rts_pb2, + drive_uart4_rx_pb1, drive_uart4_tx_pb0, + drive_gpio_wan4_pc0, drive_gpio_wan3_pb6, + drive_gpio_wan2_pb5, drive_gpio_wan1_pb4, + drive_gen1_i2c_scl_pc5, drive_gen1_i2c_sda_pc6, + drive_uart1_cts_pt3, drive_uart1_rts_pt2, + drive_uart1_rx_pt1, drive_uart1_tx_pt0, + drive_directdc1_out3_pq5, drive_directdc1_out2_pq4, + drive_directdc1_out1_pq3, drive_directdc1_out0_pq2, + 
drive_directdc1_in_pq1, drive_directdc1_clk_pq0, + drive_gpio_pq0_pi0, drive_gpio_pq1_pi1, + drive_gpio_pq2_pi2, drive_gpio_pq3_pi3, + drive_gpio_pq4_pi4, drive_gpio_pq5_pi5, + drive_gpio_pq6_pi6, drive_gpio_pq7_pi7, + drive_gpio_edp2_pp5, drive_gpio_edp3_pp6, + drive_gpio_edp0_pp3, drive_gpio_edp1_pp4, + drive_dp_aux_ch0_hpd_pp0, drive_dp_aux_ch1_hpd_pp1, + drive_hdmi_cec_pp2, drive_pex_l2_clkreq_n_pa6, + drive_pex_wake_n_pa2, drive_pex_l1_clkreq_n_pa4, + drive_pex_l1_rst_n_pa3, drive_pex_l0_clkreq_n_pa1, + drive_pex_l0_rst_n_pa0, drive_pex_l2_rst_n_pa5, + drive_sdmmc1_clk_pd0, drive_sdmmc1_cmd_pd1, + drive_sdmmc1_dat3_pd5, drive_sdmmc1_dat2_pd4, + drive_sdmmc1_dat1_pd3, drive_sdmmc1_dat0_pd2, + drive_eqos_td3_pe4, drive_eqos_td2_pe3, + drive_eqos_td1_pe2, drive_eqos_td0_pe1, + drive_eqos_rd3_pf1, drive_eqos_rd2_pf0, + drive_eqos_rd1_pe7, drive_eqos_mdio_pf4, + drive_eqos_rd0_pe6, drive_eqos_mdc_pf5, + drive_eqos_txc_pe0, drive_eqos_rxc_pf3, + drive_eqos_tx_ctl_pe5, drive_eqos_rx_ctl_pf2, + drive_sdmmc3_dat3_pg5, drive_sdmmc3_dat2_pg4, + drive_sdmmc3_dat1_pg3, drive_sdmmc3_dat0_pg2, + drive_sdmmc3_cmd_pg1, drive_sdmmc3_clk_pg0, + drive_qspi_io3_pr4, drive_qspi_io2_pr3, + drive_qspi_io1_pr2, drive_qspi_io0_pr1, + drive_qspi_sck_pr0, drive_qspi_cs_n_pr5, + drive_gpio_wan8_ph3, drive_gpio_wan7_ph2, + drive_gpio_wan6_ph1, drive_gpio_wan5_ph0, + drive_uart2_tx_px0, drive_uart2_rx_px1, + drive_uart2_rts_px2, drive_uart2_cts_px3, + drive_uart5_rx_px5, drive_uart5_tx_px4, + drive_uart5_rts_px6, drive_uart5_cts_px7, + drive_gpio_mdm1_py0, drive_gpio_mdm2_py1, + drive_gpio_mdm3_py2, drive_gpio_mdm4_py3, + drive_gpio_mdm5_py4, drive_gpio_mdm6_py5, + drive_gpio_mdm7_py6, drive_bcpu_pwr_req_ph4, + drive_mcpu_pwr_req_ph5, drive_gpu_pwr_req_ph6, + drive_gen7_i2c_scl_pl0, drive_gen7_i2c_sda_pl1, + drive_gen9_i2c_sda_pl3, drive_gen9_i2c_scl_pl2, + drive_usb_vbus_en0_pl4, drive_usb_vbus_en1_pl5, + drive_gp_pwm7_pl7, drive_gp_pwm6_pl6, + drive_ufs0_rst_pbb1, drive_ufs0_ref_clk_pbb0, + drive_directdc_comp, drive_sdmmc1_comp, + drive_eqos_comp, drive_sdmmc3_comp, drive_sdmmc4_clk, + drive_sdmmc4_cmd, drive_sdmmc4_dqs, + drive_sdmmc4_dat7, drive_sdmmc4_dat6, + drive_sdmmc4_dat5, drive_sdmmc4_dat4, + drive_sdmmc4_dat3, drive_sdmmc4_dat2, + drive_sdmmc4_dat1, drive_sdmmc4_dat0, + drive_qspi_comp ] + + - if: + properties: + compatible: + const: nvidia,tegra186-pinmux-aon + then: + patternProperties: + "^pinmux(-[a-z0-9-]+)?$": + type: object + additionalProperties: + properties: + nvidia,pins: + items: + enum: [ pwr_i2c_scl_ps0, pwr_i2c_sda_ps1, batt_oc_ps2, + safe_state_ps3, vcomp_alert_ps4, gpio_dis0_pu0, + gpio_dis1_pu1, gpio_dis2_pu2, gpio_dis3_pu3, + gpio_dis4_pu4, gpio_dis5_pu5, gpio_sen0_pv0, + gpio_sen1_pv1, gpio_sen2_pv2, gpio_sen3_pv3, + gpio_sen4_pv4, gpio_sen5_pv5, gpio_sen6_pv6, + gpio_sen7_pv7, gen8_i2c_scl_pw0, gen8_i2c_sda_pw1, + uart3_tx_pw2, uart3_rx_pw3, uart3_rts_pw4, + uart3_cts_pw5, uart7_tx_pw6, uart7_rx_pw7, + can1_dout_pz0, can1_din_pz1, can0_dout_pz2, + can0_din_pz3, can_gpio0_paa0, can_gpio1_paa1, + can_gpio2_paa2, can_gpio3_paa3, can_gpio4_paa4, + can_gpio5_paa5, can_gpio6_paa6, can_gpio7_paa7, + gpio_sen8_pee0, gpio_sen9_pee1, touch_clk_pee2, + power_on_pff0, gpio_sw1_pff1, gpio_sw2_pff2, + gpio_sw3_pff3, gpio_sw4_pff4, shutdown, pmu_int, + soc_pwr_req, clk_32k_in, + # drive groups + drive_touch_clk_pee2, drive_uart3_cts_pw5, + drive_uart3_rts_pw4, drive_uart3_rx_pw3, + drive_uart3_tx_pw2, drive_gen8_i2c_sda_pw1, + drive_gen8_i2c_scl_pw0, drive_uart7_rx_pw7, + drive_uart7_tx_pw6, 
drive_gpio_sen0_pv0, drive_gpio_sen1_pv1,
+                    drive_gpio_sen2_pv2, drive_gpio_sen3_pv3,
+                    drive_gpio_sen4_pv4, drive_gpio_sen5_pv5,
+                    drive_gpio_sen6_pv6, drive_gpio_sen7_pv7,
+                    drive_gpio_sen8_pee0, drive_gpio_sen9_pee1,
+                    drive_can_gpio7_paa7, drive_can1_dout_pz0,
+                    drive_can1_din_pz1, drive_can0_dout_pz2,
+                    drive_can0_din_pz3, drive_can_gpio0_paa0,
+                    drive_can_gpio1_paa1, drive_can_gpio2_paa2,
+                    drive_can_gpio3_paa3, drive_can_gpio4_paa4,
+                    drive_can_gpio5_paa5, drive_can_gpio6_paa6,
+                    drive_gpio_sw1_pff1, drive_gpio_sw2_pff2,
+                    drive_gpio_sw3_pff3, drive_gpio_sw4_pff4,
+                    drive_shutdown, drive_pmu_int,
+                    drive_safe_state_ps3, drive_vcomp_alert_ps4,
+                    drive_soc_pwr_req, drive_batt_oc_ps2,
+                    drive_clk_32k_in, drive_power_on_pff0,
+                    drive_pwr_i2c_scl_ps0, drive_pwr_i2c_sda_ps1,
+                    drive_gpio_dis0_pu0, drive_gpio_dis1_pu1,
+                    drive_gpio_dis2_pu2, drive_gpio_dis3_pu3,
+                    drive_gpio_dis4_pu4, drive_gpio_dis5_pu5 ]
+
+required:
+  - compatible
+  - reg
+
+examples:
+  - |
+    #include <dt-bindings/pinctrl/pinctrl-tegra.h>
+
+    pinmux@2430000 {
+        compatible = "nvidia,tegra186-pinmux";
+        reg = <0x2430000 0x15000>;
+
+        pinctrl-names = "jetson_io";
+        pinctrl-0 = <&jetson_io_pinmux>;
+
+        jetson_io_pinmux: pinmux {
+            hdr40-pin7 {
+                nvidia,pins = "aud_mclk_pj4";
+                nvidia,function = "aud";
+                nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+            };
+        };
+    };
+...

From eed8e4c07d85c8955a44502bc1f61e4155c94051 Mon Sep 17 00:00:00 2001
From: Shankari Anand
Date: Thu, 14 Aug 2025 15:31:01 +0530
Subject: [PATCH 0265/3206] rust: fs: update ARef and AlwaysRefCounted imports from sync::aref

Update call sites in the fs subsystem to import `ARef` and
`AlwaysRefCounted` from `sync::aref` instead of `types`.

This aligns with the ongoing effort to move `ARef` and
`AlwaysRefCounted` to sync.

Suggested-by: Benno Lossin
Link: https://github.com/Rust-for-Linux/linux/issues/1173
Acked-by: Alice Ryhl
Signed-off-by: Shankari Anand
Link: https://lore.kernel.org/20250814100101.304408-1-shankari.ak0208@gmail.com
Signed-off-by: Christian Brauner
---
 rust/kernel/fs/file.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rust/kernel/fs/file.rs b/rust/kernel/fs/file.rs
index 35fd5db35c4652..18cf579d3312fa 100644
--- a/rust/kernel/fs/file.rs
+++ b/rust/kernel/fs/file.rs
@@ -11,7 +11,8 @@ use crate::{
     bindings,
     cred::Credential,
     error::{code::*, Error, Result},
-    types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque},
+    sync::aref::{ARef, AlwaysRefCounted},
+    types::{NotThreadSafe, Opaque},
 };
 
 use core::ptr;

From 542baf77f092b435ddd2b340e82dbb7b14559d56 Mon Sep 17 00:00:00 2001
From: Aaron Kling
Date: Tue, 12 Aug 2025 16:24:41 -0500
Subject: [PATCH 0266/3206] pinctrl: tegra: Add Tegra186 pinmux driver

This is based on Nvidia's downstream 5.10 driver, rewritten to match
the mainline Tegra194 pinmux driver.
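As with the Tegra194 driver, the pin, function and pin-group tables are
handed off to the common Tegra pinctrl core rather than implementing the
pinctrl ops locally. For orientation, here is a minimal sketch of that
registration glue: only tegra186_pins, tegra186_functions and
tegra186_groups come from this patch, while the soc-data layout and
probe helper are assumptions based on pinctrl-tegra194.c; the AON
instance ("nvidia,tegra186-pinmux-aon") would get its own tables and
match entry in the same way.

	/* Sketch only: mirrors the pinctrl-tegra194.c registration tail. */
	static const struct tegra_pinctrl_soc_data tegra186_pinctrl = {
		.pins = tegra186_pins,
		.npins = ARRAY_SIZE(tegra186_pins),
		.functions = tegra186_functions,
		.nfunctions = ARRAY_SIZE(tegra186_functions),
		.groups = tegra186_groups,
		.ngroups = ARRAY_SIZE(tegra186_groups),
	};

	static int tegra186_pinctrl_probe(struct platform_device *pdev)
	{
		/* The common core registers the pin controller and ops. */
		return tegra_pinctrl_probe(pdev, &tegra186_pinctrl);
	}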
Signed-off-by: Aaron Kling
Link: https://lore.kernel.org/20250812-tegra186-pinctrl-v3-2-115714eeecb1@gmail.com
Signed-off-by: Linus Walleij
---
 drivers/pinctrl/tegra/Kconfig            |    4 +
 drivers/pinctrl/tegra/Makefile           |    1 +
 drivers/pinctrl/tegra/pinctrl-tegra186.c | 1979 ++++++++++++++++++++++
 drivers/soc/tegra/Kconfig                |    1 +
 4 files changed, 1985 insertions(+)
 create mode 100644 drivers/pinctrl/tegra/pinctrl-tegra186.c

diff --git a/drivers/pinctrl/tegra/Kconfig b/drivers/pinctrl/tegra/Kconfig
index 4e87d19323ba88..660d101ea3679a 100644
--- a/drivers/pinctrl/tegra/Kconfig
+++ b/drivers/pinctrl/tegra/Kconfig
@@ -24,6 +24,10 @@ config PINCTRL_TEGRA210
 	bool
 	select PINCTRL_TEGRA
 
+config PINCTRL_TEGRA186
+	bool
+	select PINCTRL_TEGRA
+
 config PINCTRL_TEGRA194
 	bool
 	select PINCTRL_TEGRA
diff --git a/drivers/pinctrl/tegra/Makefile b/drivers/pinctrl/tegra/Makefile
index a93973701d4cce..82176526549e7d 100644
--- a/drivers/pinctrl/tegra/Makefile
+++ b/drivers/pinctrl/tegra/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_PINCTRL_TEGRA30) += pinctrl-tegra30.o
 obj-$(CONFIG_PINCTRL_TEGRA114) += pinctrl-tegra114.o
 obj-$(CONFIG_PINCTRL_TEGRA124) += pinctrl-tegra124.o
 obj-$(CONFIG_PINCTRL_TEGRA210) += pinctrl-tegra210.o
+obj-$(CONFIG_PINCTRL_TEGRA186) += pinctrl-tegra186.o
 obj-$(CONFIG_PINCTRL_TEGRA194) += pinctrl-tegra194.o
 obj-$(CONFIG_PINCTRL_TEGRA234) += pinctrl-tegra234.o
 obj-$(CONFIG_PINCTRL_TEGRA_XUSB) += pinctrl-tegra-xusb.o
diff --git a/drivers/pinctrl/tegra/pinctrl-tegra186.c b/drivers/pinctrl/tegra/pinctrl-tegra186.c
new file mode 100644
index 00000000000000..4a1d6476af9bb4
--- /dev/null
+++ b/drivers/pinctrl/tegra/pinctrl-tegra186.c
@@ -0,0 +1,1979 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Pinctrl data for the NVIDIA Tegra186 pinmux
+ *
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/pinctrl/pinctrl.h>
+#include <linux/pinctrl/pinmux.h>
+
+#include "pinctrl-tegra.h"
+
+/* Define unique ID for each pin */
+enum {
+	TEGRA_PIN_PEX_L0_RST_N_PA0,
+	TEGRA_PIN_PEX_L0_CLKREQ_N_PA1,
+	TEGRA_PIN_PEX_WAKE_N_PA2,
+	TEGRA_PIN_PEX_L1_RST_N_PA3,
+	TEGRA_PIN_PEX_L1_CLKREQ_N_PA4,
+	TEGRA_PIN_PEX_L2_RST_N_PA5,
+	TEGRA_PIN_PEX_L2_CLKREQ_N_PA6,
+	TEGRA_PIN_UART4_TX_PB0,
+	TEGRA_PIN_UART4_RX_PB1,
+	TEGRA_PIN_UART4_RTS_PB2,
+	TEGRA_PIN_UART4_CTS_PB3,
+	TEGRA_PIN_GPIO_WAN1_PB4,
+	TEGRA_PIN_GPIO_WAN2_PB5,
+	TEGRA_PIN_GPIO_WAN3_PB6,
+	TEGRA_PIN_GPIO_WAN4_PC0,
+	TEGRA_PIN_DAP2_SCLK_PC1,
+	TEGRA_PIN_DAP2_DOUT_PC2,
+	TEGRA_PIN_DAP2_DIN_PC3,
+	TEGRA_PIN_DAP2_FS_PC4,
+	TEGRA_PIN_GEN1_I2C_SCL_PC5,
+	TEGRA_PIN_GEN1_I2C_SDA_PC6,
+	TEGRA_PIN_SDMMC1_CLK_PD0,
+	TEGRA_PIN_SDMMC1_CMD_PD1,
+	TEGRA_PIN_SDMMC1_DAT0_PD2,
+	TEGRA_PIN_SDMMC1_DAT1_PD3,
+	TEGRA_PIN_SDMMC1_DAT2_PD4,
+	TEGRA_PIN_SDMMC1_DAT3_PD5,
+	TEGRA_PIN_EQOS_TXC_PE0,
+	TEGRA_PIN_EQOS_TD0_PE1,
+	TEGRA_PIN_EQOS_TD1_PE2,
+	TEGRA_PIN_EQOS_TD2_PE3,
+	TEGRA_PIN_EQOS_TD3_PE4,
+	TEGRA_PIN_EQOS_TX_CTL_PE5,
+	TEGRA_PIN_EQOS_RD0_PE6,
+	TEGRA_PIN_EQOS_RD1_PE7,
+	TEGRA_PIN_EQOS_RD2_PF0,
+	TEGRA_PIN_EQOS_RD3_PF1,
+	TEGRA_PIN_EQOS_RX_CTL_PF2,
+	TEGRA_PIN_EQOS_RXC_PF3,
+	TEGRA_PIN_EQOS_MDIO_PF4,
+	TEGRA_PIN_EQOS_MDC_PF5,
+	TEGRA_PIN_SDMMC3_CLK_PG0,
+	TEGRA_PIN_SDMMC3_CMD_PG1,
+	TEGRA_PIN_SDMMC3_DAT0_PG2,
+	TEGRA_PIN_SDMMC3_DAT1_PG3,
+	TEGRA_PIN_SDMMC3_DAT2_PG4,
+	TEGRA_PIN_SDMMC3_DAT3_PG5,
+	TEGRA_PIN_GPIO_WAN5_PH0,
+	TEGRA_PIN_GPIO_WAN6_PH1,
+	TEGRA_PIN_GPIO_WAN7_PH2,
+	TEGRA_PIN_GPIO_WAN8_PH3,
+	TEGRA_PIN_BCPU_PWR_REQ_PH4,
+	TEGRA_PIN_MCPU_PWR_REQ_PH5,
+	TEGRA_PIN_GPU_PWR_REQ_PH6,
+	TEGRA_PIN_GPIO_PQ0_PI0,
+	TEGRA_PIN_GPIO_PQ1_PI1,
+	TEGRA_PIN_GPIO_PQ2_PI2,
+	TEGRA_PIN_GPIO_PQ3_PI3,
+	TEGRA_PIN_GPIO_PQ4_PI4,
+	TEGRA_PIN_GPIO_PQ5_PI5,
+	TEGRA_PIN_GPIO_PQ6_PI6,
+	TEGRA_PIN_GPIO_PQ7_PI7,
+	TEGRA_PIN_DAP1_SCLK_PJ0,
+	TEGRA_PIN_DAP1_DOUT_PJ1,
+	TEGRA_PIN_DAP1_DIN_PJ2,
+	TEGRA_PIN_DAP1_FS_PJ3,
+	TEGRA_PIN_AUD_MCLK_PJ4,
+	TEGRA_PIN_GPIO_AUD0_PJ5,
+	TEGRA_PIN_GPIO_AUD1_PJ6,
+	TEGRA_PIN_GPIO_AUD2_PJ7,
+	TEGRA_PIN_GPIO_AUD3_PK0,
+	TEGRA_PIN_GEN7_I2C_SCL_PL0,
+	TEGRA_PIN_GEN7_I2C_SDA_PL1,
+	TEGRA_PIN_GEN9_I2C_SCL_PL2,
+	TEGRA_PIN_GEN9_I2C_SDA_PL3,
+	TEGRA_PIN_USB_VBUS_EN0_PL4,
+	TEGRA_PIN_USB_VBUS_EN1_PL5,
+	TEGRA_PIN_GP_PWM6_PL6,
+	TEGRA_PIN_GP_PWM7_PL7,
+	TEGRA_PIN_DMIC1_DAT_PM0,
+	TEGRA_PIN_DMIC1_CLK_PM1,
+	TEGRA_PIN_DMIC2_DAT_PM2,
+	TEGRA_PIN_DMIC2_CLK_PM3,
+	TEGRA_PIN_DMIC4_DAT_PM4,
+	TEGRA_PIN_DMIC4_CLK_PM5,
+	TEGRA_PIN_GPIO_CAM1_PN0,
+	TEGRA_PIN_GPIO_CAM2_PN1,
+	TEGRA_PIN_GPIO_CAM3_PN2,
+	TEGRA_PIN_GPIO_CAM4_PN3,
+	TEGRA_PIN_GPIO_CAM5_PN4,
+	TEGRA_PIN_GPIO_CAM6_PN5,
+	TEGRA_PIN_GPIO_CAM7_PN6,
+	TEGRA_PIN_EXTPERIPH1_CLK_PO0,
+	TEGRA_PIN_EXTPERIPH2_CLK_PO1,
+	TEGRA_PIN_CAM_I2C_SCL_PO2,
+	TEGRA_PIN_CAM_I2C_SDA_PO3,
+	TEGRA_PIN_DP_AUX_CH0_HPD_PP0,
+	TEGRA_PIN_DP_AUX_CH1_HPD_PP1,
+	TEGRA_PIN_HDMI_CEC_PP2,
+	TEGRA_PIN_GPIO_EDP0_PP3,
+	TEGRA_PIN_GPIO_EDP1_PP4,
+	TEGRA_PIN_GPIO_EDP2_PP5,
+	TEGRA_PIN_GPIO_EDP3_PP6,
+	TEGRA_PIN_DIRECTDC1_CLK_PQ0,
+	TEGRA_PIN_DIRECTDC1_IN_PQ1,
+	TEGRA_PIN_DIRECTDC1_OUT0_PQ2,
+	TEGRA_PIN_DIRECTDC1_OUT1_PQ3,
+	TEGRA_PIN_DIRECTDC1_OUT2_PQ4,
+	TEGRA_PIN_DIRECTDC1_OUT3_PQ5,
+	TEGRA_PIN_QSPI_SCK_PR0,
+	TEGRA_PIN_QSPI_IO0_PR1,
+	TEGRA_PIN_QSPI_IO1_PR2,
+	TEGRA_PIN_QSPI_IO2_PR3,
+	TEGRA_PIN_QSPI_IO3_PR4,
+	TEGRA_PIN_QSPI_CS_N_PR5,
+	TEGRA_PIN_UART1_TX_PT0,
+	TEGRA_PIN_UART1_RX_PT1,
+	TEGRA_PIN_UART1_RTS_PT2,
+	TEGRA_PIN_UART1_CTS_PT3,
+	TEGRA_PIN_UART2_TX_PX0,
+	TEGRA_PIN_UART2_RX_PX1,
+	TEGRA_PIN_UART2_RTS_PX2,
+	TEGRA_PIN_UART2_CTS_PX3,
+
TEGRA_PIN_UART5_TX_PX4, + TEGRA_PIN_UART5_RX_PX5, + TEGRA_PIN_UART5_RTS_PX6, + TEGRA_PIN_UART5_CTS_PX7, + TEGRA_PIN_GPIO_MDM1_PY0, + TEGRA_PIN_GPIO_MDM2_PY1, + TEGRA_PIN_GPIO_MDM3_PY2, + TEGRA_PIN_GPIO_MDM4_PY3, + TEGRA_PIN_GPIO_MDM5_PY4, + TEGRA_PIN_GPIO_MDM6_PY5, + TEGRA_PIN_GPIO_MDM7_PY6, + TEGRA_PIN_UFS0_REF_CLK_PBB0, + TEGRA_PIN_UFS0_RST_PBB1, + TEGRA_PIN_DAP4_SCLK_PCC0, + TEGRA_PIN_DAP4_DOUT_PCC1, + TEGRA_PIN_DAP4_DIN_PCC2, + TEGRA_PIN_DAP4_FS_PCC3, + TEGRA_PIN_DIRECTDC_COMP, + TEGRA_PIN_SDMMC1_COMP, + TEGRA_PIN_EQOS_COMP, + TEGRA_PIN_SDMMC3_COMP, + TEGRA_PIN_QSPI_COMP, +}; + +enum { + TEGRA_PIN_PWR_I2C_SCL_PS0, + TEGRA_PIN_PWR_I2C_SDA_PS1, + TEGRA_PIN_BATT_OC_PS2, + TEGRA_PIN_SAFE_STATE_PS3, + TEGRA_PIN_VCOMP_ALERT_PS4, + TEGRA_PIN_GPIO_DIS0_PU0, + TEGRA_PIN_GPIO_DIS1_PU1, + TEGRA_PIN_GPIO_DIS2_PU2, + TEGRA_PIN_GPIO_DIS3_PU3, + TEGRA_PIN_GPIO_DIS4_PU4, + TEGRA_PIN_GPIO_DIS5_PU5, + TEGRA_PIN_GPIO_SEN0_PV0, + TEGRA_PIN_GPIO_SEN1_PV1, + TEGRA_PIN_GPIO_SEN2_PV2, + TEGRA_PIN_GPIO_SEN3_PV3, + TEGRA_PIN_GPIO_SEN4_PV4, + TEGRA_PIN_GPIO_SEN5_PV5, + TEGRA_PIN_GPIO_SEN6_PV6, + TEGRA_PIN_GPIO_SEN7_PV7, + TEGRA_PIN_GEN8_I2C_SCL_PW0, + TEGRA_PIN_GEN8_I2C_SDA_PW1, + TEGRA_PIN_UART3_TX_PW2, + TEGRA_PIN_UART3_RX_PW3, + TEGRA_PIN_UART3_RTS_PW4, + TEGRA_PIN_UART3_CTS_PW5, + TEGRA_PIN_UART7_TX_PW6, + TEGRA_PIN_UART7_RX_PW7, + TEGRA_PIN_CAN1_DOUT_PZ0, + TEGRA_PIN_CAN1_DIN_PZ1, + TEGRA_PIN_CAN0_DOUT_PZ2, + TEGRA_PIN_CAN0_DIN_PZ3, + TEGRA_PIN_CAN_GPIO0_PAA0, + TEGRA_PIN_CAN_GPIO1_PAA1, + TEGRA_PIN_CAN_GPIO2_PAA2, + TEGRA_PIN_CAN_GPIO3_PAA3, + TEGRA_PIN_CAN_GPIO4_PAA4, + TEGRA_PIN_CAN_GPIO5_PAA5, + TEGRA_PIN_CAN_GPIO6_PAA6, + TEGRA_PIN_CAN_GPIO7_PAA7, + TEGRA_PIN_GPIO_SEN8_PEE0, + TEGRA_PIN_GPIO_SEN9_PEE1, + TEGRA_PIN_TOUCH_CLK_PEE2, + TEGRA_PIN_POWER_ON_PFF0, + TEGRA_PIN_GPIO_SW1_PFF1, + TEGRA_PIN_GPIO_SW2_PFF2, + TEGRA_PIN_GPIO_SW3_PFF3, + TEGRA_PIN_GPIO_SW4_PFF4, + TEGRA_PIN_SHUTDOWN, + TEGRA_PIN_PMU_INT, + TEGRA_PIN_SOC_PWR_REQ, + TEGRA_PIN_CLK_32K_IN, +}; + +/* Table for pin descriptor */ +static const struct pinctrl_pin_desc tegra186_pins[] = { + PINCTRL_PIN(TEGRA_PIN_PEX_L0_RST_N_PA0, "PEX_L0_RST_N_PA0"), + PINCTRL_PIN(TEGRA_PIN_PEX_L0_CLKREQ_N_PA1, "PEX_L0_CLKREQ_N_PA1"), + PINCTRL_PIN(TEGRA_PIN_PEX_WAKE_N_PA2, "PEX_WAKE_N_PA2"), + PINCTRL_PIN(TEGRA_PIN_PEX_L1_RST_N_PA3, "PEX_L1_RST_N_PA3"), + PINCTRL_PIN(TEGRA_PIN_PEX_L1_CLKREQ_N_PA4, "PEX_L1_CLKREQ_N_PA4"), + PINCTRL_PIN(TEGRA_PIN_PEX_L2_RST_N_PA5, "PEX_L2_RST_N_PA5"), + PINCTRL_PIN(TEGRA_PIN_PEX_L2_CLKREQ_N_PA6, "PEX_L2_CLKREQ_N_PA6"), + PINCTRL_PIN(TEGRA_PIN_UART4_TX_PB0, "UART4_TX_PB0"), + PINCTRL_PIN(TEGRA_PIN_UART4_RX_PB1, "UART4_RX_PB1"), + PINCTRL_PIN(TEGRA_PIN_UART4_RTS_PB2, "UART4_RTS_PB2"), + PINCTRL_PIN(TEGRA_PIN_UART4_CTS_PB3, "UART4_CTS_PB3"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN1_PB4, "GPIO_WAN1_PB4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN2_PB5, "GPIO_WAN2_PB5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN3_PB6, "GPIO_WAN3_PB6"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN4_PC0, "GPIO_WAN4_PC0"), + PINCTRL_PIN(TEGRA_PIN_DAP2_SCLK_PC1, "DAP2_SCLK_PC1"), + PINCTRL_PIN(TEGRA_PIN_DAP2_DOUT_PC2, "DAP2_DOUT_PC2"), + PINCTRL_PIN(TEGRA_PIN_DAP2_DIN_PC3, "DAP2_DIN_PC3"), + PINCTRL_PIN(TEGRA_PIN_DAP2_FS_PC4, "DAP2_FS_PC4"), + PINCTRL_PIN(TEGRA_PIN_GEN1_I2C_SCL_PC5, "GEN1_I2C_SCL_PC5"), + PINCTRL_PIN(TEGRA_PIN_GEN1_I2C_SDA_PC6, "GEN1_I2C_SDA_PC6"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_CLK_PD0, "SDMMC1_CLK_PD0"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_CMD_PD1, "SDMMC1_CMD_PD1"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_DAT0_PD2, "SDMMC1_DAT0_PD2"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_DAT1_PD3, 
"SDMMC1_DAT1_PD3"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_DAT2_PD4, "SDMMC1_DAT2_PD4"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_DAT3_PD5, "SDMMC1_DAT3_PD5"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TXC_PE0, "EQOS_TXC_PE0"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TD0_PE1, "EQOS_TD0_PE1"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TD1_PE2, "EQOS_TD1_PE2"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TD2_PE3, "EQOS_TD2_PE3"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TD3_PE4, "EQOS_TD3_PE4"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TX_CTL_PE5, "EQOS_TX_CTL_PE5"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RD0_PE6, "EQOS_RD0_PE6"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RD1_PE7, "EQOS_RD1_PE7"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RD2_PF0, "EQOS_RD2_PF0"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RD3_PF1, "EQOS_RD3_PF1"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RX_CTL_PF2, "EQOS_RX_CTL_PF2"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RXC_PF3, "EQOS_RXC_PF3"), + PINCTRL_PIN(TEGRA_PIN_EQOS_MDIO_PF4, "EQOS_MDIO_PF4"), + PINCTRL_PIN(TEGRA_PIN_EQOS_MDC_PF5, "EQOS_MDC_PF5"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_CLK_PG0, "SDMMC3_CLK_PG0"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_CMD_PG1, "SDMMC3_CMD_PG1"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_DAT0_PG2, "SDMMC3_DAT0_PG2"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_DAT1_PG3, "SDMMC3_DAT1_PG3"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_DAT2_PG4, "SDMMC3_DAT2_PG4"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_DAT3_PG5, "SDMMC3_DAT3_PG5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN5_PH0, "GPIO_WAN5_PH0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN6_PH1, "GPIO_WAN6_PH1"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN7_PH2, "GPIO_WAN7_PH2"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN8_PH3, "GPIO_WAN8_PH3"), + PINCTRL_PIN(TEGRA_PIN_BCPU_PWR_REQ_PH4, "BCPU_PWR_REQ_PH4"), + PINCTRL_PIN(TEGRA_PIN_MCPU_PWR_REQ_PH5, "MCPU_PWR_REQ_PH5"), + PINCTRL_PIN(TEGRA_PIN_GPU_PWR_REQ_PH6, "GPU_PWR_REQ_PH6"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ0_PI0, "GPIO_PQ0_PI0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ1_PI1, "GPIO_PQ1_PI1"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ2_PI2, "GPIO_PQ2_PI2"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ3_PI3, "GPIO_PQ3_PI3"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ4_PI4, "GPIO_PQ4_PI4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ5_PI5, "GPIO_PQ5_PI5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ6_PI6, "GPIO_PQ6_PI6"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ7_PI7, "GPIO_PQ7_PI7"), + PINCTRL_PIN(TEGRA_PIN_DAP1_SCLK_PJ0, "DAP1_SCLK_PJ0"), + PINCTRL_PIN(TEGRA_PIN_DAP1_DOUT_PJ1, "DAP1_DOUT_PJ1"), + PINCTRL_PIN(TEGRA_PIN_DAP1_DIN_PJ2, "DAP1_DIN_PJ2"), + PINCTRL_PIN(TEGRA_PIN_DAP1_FS_PJ3, "DAP1_FS_PJ3"), + PINCTRL_PIN(TEGRA_PIN_AUD_MCLK_PJ4, "AUD_MCLK_PJ4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_AUD0_PJ5, "GPIO_AUD0_PJ5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_AUD1_PJ6, "GPIO_AUD1_PJ6"), + PINCTRL_PIN(TEGRA_PIN_GPIO_AUD2_PJ7, "GPIO_AUD2_PJ7"), + PINCTRL_PIN(TEGRA_PIN_GPIO_AUD3_PK0, "GPIO_AUD3_PK0"), + PINCTRL_PIN(TEGRA_PIN_GEN7_I2C_SCL_PL0, "GEN7_I2C_SCL_PL0"), + PINCTRL_PIN(TEGRA_PIN_GEN7_I2C_SDA_PL1, "GEN7_I2C_SDA_PL1"), + PINCTRL_PIN(TEGRA_PIN_GEN9_I2C_SCL_PL2, "GEN9_I2C_SCL_PL2"), + PINCTRL_PIN(TEGRA_PIN_GEN9_I2C_SDA_PL3, "GEN9_I2C_SDA_PL3"), + PINCTRL_PIN(TEGRA_PIN_USB_VBUS_EN0_PL4, "USB_VBUS_EN0_PL4"), + PINCTRL_PIN(TEGRA_PIN_USB_VBUS_EN1_PL5, "USB_VBUS_EN1_PL5"), + PINCTRL_PIN(TEGRA_PIN_GP_PWM6_PL6, "GP_PWM6_PL6"), + PINCTRL_PIN(TEGRA_PIN_GP_PWM7_PL7, "GP_PWM7_PL7"), + PINCTRL_PIN(TEGRA_PIN_DMIC1_DAT_PM0, "DMIC1_DAT_PM0"), + PINCTRL_PIN(TEGRA_PIN_DMIC1_CLK_PM1, "DMIC1_CLK_PM1"), + PINCTRL_PIN(TEGRA_PIN_DMIC2_DAT_PM2, "DMIC2_DAT_PM2"), + PINCTRL_PIN(TEGRA_PIN_DMIC2_CLK_PM3, "DMIC2_CLK_PM3"), + PINCTRL_PIN(TEGRA_PIN_DMIC4_DAT_PM4, "DMIC4_DAT_PM4"), + PINCTRL_PIN(TEGRA_PIN_DMIC4_CLK_PM5, "DMIC4_CLK_PM5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_CAM1_PN0, "GPIO_CAM1_PN0"), + 
PINCTRL_PIN(TEGRA_PIN_GPIO_CAM2_PN1, "GPIO_CAM2_PN1"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_CAM3_PN2, "GPIO_CAM3_PN2"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_CAM4_PN3, "GPIO_CAM4_PN3"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_CAM5_PN4, "GPIO_CAM5_PN4"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_CAM6_PN5, "GPIO_CAM6_PN5"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_CAM7_PN6, "GPIO_CAM7_PN6"),
+	PINCTRL_PIN(TEGRA_PIN_EXTPERIPH1_CLK_PO0, "EXTPERIPH1_CLK_PO0"),
+	PINCTRL_PIN(TEGRA_PIN_EXTPERIPH2_CLK_PO1, "EXTPERIPH2_CLK_PO1"),
+	PINCTRL_PIN(TEGRA_PIN_CAM_I2C_SCL_PO2, "CAM_I2C_SCL_PO2"),
+	PINCTRL_PIN(TEGRA_PIN_CAM_I2C_SDA_PO3, "CAM_I2C_SDA_PO3"),
+	PINCTRL_PIN(TEGRA_PIN_DP_AUX_CH0_HPD_PP0, "DP_AUX_CH0_HPD_PP0"),
+	PINCTRL_PIN(TEGRA_PIN_DP_AUX_CH1_HPD_PP1, "DP_AUX_CH1_HPD_PP1"),
+	PINCTRL_PIN(TEGRA_PIN_HDMI_CEC_PP2, "HDMI_CEC_PP2"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_EDP0_PP3, "GPIO_EDP0_PP3"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_EDP1_PP4, "GPIO_EDP1_PP4"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_EDP2_PP5, "GPIO_EDP2_PP5"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_EDP3_PP6, "GPIO_EDP3_PP6"),
+	PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_CLK_PQ0, "DIRECTDC1_CLK_PQ0"),
+	PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_IN_PQ1, "DIRECTDC1_IN_PQ1"),
+	PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_OUT0_PQ2, "DIRECTDC1_OUT0_PQ2"),
+	PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_OUT1_PQ3, "DIRECTDC1_OUT1_PQ3"),
+	PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_OUT2_PQ4, "DIRECTDC1_OUT2_PQ4"),
+	PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_OUT3_PQ5, "DIRECTDC1_OUT3_PQ5"),
+	PINCTRL_PIN(TEGRA_PIN_QSPI_SCK_PR0, "QSPI_SCK_PR0"),
+	PINCTRL_PIN(TEGRA_PIN_QSPI_IO0_PR1, "QSPI_IO0_PR1"),
+	PINCTRL_PIN(TEGRA_PIN_QSPI_IO1_PR2, "QSPI_IO1_PR2"),
+	PINCTRL_PIN(TEGRA_PIN_QSPI_IO2_PR3, "QSPI_IO2_PR3"),
+	PINCTRL_PIN(TEGRA_PIN_QSPI_IO3_PR4, "QSPI_IO3_PR4"),
+	PINCTRL_PIN(TEGRA_PIN_QSPI_CS_N_PR5, "QSPI_CS_N_PR5"),
+	PINCTRL_PIN(TEGRA_PIN_UART1_TX_PT0, "UART1_TX_PT0"),
+	PINCTRL_PIN(TEGRA_PIN_UART1_RX_PT1, "UART1_RX_PT1"),
+	PINCTRL_PIN(TEGRA_PIN_UART1_RTS_PT2, "UART1_RTS_PT2"),
+	PINCTRL_PIN(TEGRA_PIN_UART1_CTS_PT3, "UART1_CTS_PT3"),
+	PINCTRL_PIN(TEGRA_PIN_UART2_TX_PX0, "UART2_TX_PX0"),
+	PINCTRL_PIN(TEGRA_PIN_UART2_RX_PX1, "UART2_RX_PX1"),
+	PINCTRL_PIN(TEGRA_PIN_UART2_RTS_PX2, "UART2_RTS_PX2"),
+	PINCTRL_PIN(TEGRA_PIN_UART2_CTS_PX3, "UART2_CTS_PX3"),
+	PINCTRL_PIN(TEGRA_PIN_UART5_TX_PX4, "UART5_TX_PX4"),
+	PINCTRL_PIN(TEGRA_PIN_UART5_RX_PX5, "UART5_RX_PX5"),
+	PINCTRL_PIN(TEGRA_PIN_UART5_RTS_PX6, "UART5_RTS_PX6"),
+	PINCTRL_PIN(TEGRA_PIN_UART5_CTS_PX7, "UART5_CTS_PX7"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_MDM1_PY0, "GPIO_MDM1_PY0"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_MDM2_PY1, "GPIO_MDM2_PY1"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_MDM3_PY2, "GPIO_MDM3_PY2"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_MDM4_PY3, "GPIO_MDM4_PY3"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_MDM5_PY4, "GPIO_MDM5_PY4"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_MDM6_PY5, "GPIO_MDM6_PY5"),
+	PINCTRL_PIN(TEGRA_PIN_GPIO_MDM7_PY6, "GPIO_MDM7_PY6"),
+	PINCTRL_PIN(TEGRA_PIN_UFS0_REF_CLK_PBB0, "UFS0_REF_CLK_PBB0"),
+	PINCTRL_PIN(TEGRA_PIN_UFS0_RST_PBB1, "UFS0_RST_PBB1"),
+	PINCTRL_PIN(TEGRA_PIN_DAP4_SCLK_PCC0, "DAP4_SCLK_PCC0"),
+	PINCTRL_PIN(TEGRA_PIN_DAP4_DOUT_PCC1, "DAP4_DOUT_PCC1"),
+	PINCTRL_PIN(TEGRA_PIN_DAP4_DIN_PCC2, "DAP4_DIN_PCC2"),
+	PINCTRL_PIN(TEGRA_PIN_DAP4_FS_PCC3, "DAP4_FS_PCC3"),
+	PINCTRL_PIN(TEGRA_PIN_DIRECTDC_COMP, "DIRECTDC_COMP"),
+	PINCTRL_PIN(TEGRA_PIN_SDMMC1_COMP, "SDMMC1_COMP"),
+	PINCTRL_PIN(TEGRA_PIN_EQOS_COMP, "EQOS_COMP"),
+	PINCTRL_PIN(TEGRA_PIN_SDMMC3_COMP, "SDMMC3_COMP"),
+	PINCTRL_PIN(TEGRA_PIN_QSPI_COMP, "QSPI_COMP"),
+};
+
+static const unsigned int pex_l0_rst_n_pa0_pins[] = {
+	TEGRA_PIN_PEX_L0_RST_N_PA0,
+};
+
+static const unsigned int
pex_l0_clkreq_n_pa1_pins[] = { + TEGRA_PIN_PEX_L0_CLKREQ_N_PA1, +}; + +static const unsigned int pex_wake_n_pa2_pins[] = { + TEGRA_PIN_PEX_WAKE_N_PA2, +}; + +static const unsigned int pex_l1_rst_n_pa3_pins[] = { + TEGRA_PIN_PEX_L1_RST_N_PA3, +}; + +static const unsigned int pex_l1_clkreq_n_pa4_pins[] = { + TEGRA_PIN_PEX_L1_CLKREQ_N_PA4, +}; + +static const unsigned int pex_l2_rst_n_pa5_pins[] = { + TEGRA_PIN_PEX_L2_RST_N_PA5, +}; + +static const unsigned int pex_l2_clkreq_n_pa6_pins[] = { + TEGRA_PIN_PEX_L2_CLKREQ_N_PA6, +}; + +static const unsigned int uart4_tx_pb0_pins[] = { + TEGRA_PIN_UART4_TX_PB0, +}; + +static const unsigned int uart4_rx_pb1_pins[] = { + TEGRA_PIN_UART4_RX_PB1, +}; + +static const unsigned int uart4_rts_pb2_pins[] = { + TEGRA_PIN_UART4_RTS_PB2, +}; + +static const unsigned int uart4_cts_pb3_pins[] = { + TEGRA_PIN_UART4_CTS_PB3, +}; + +static const unsigned int gpio_wan1_pb4_pins[] = { + TEGRA_PIN_GPIO_WAN1_PB4, +}; + +static const unsigned int gpio_wan2_pb5_pins[] = { + TEGRA_PIN_GPIO_WAN2_PB5, +}; + +static const unsigned int gpio_wan3_pb6_pins[] = { + TEGRA_PIN_GPIO_WAN3_PB6, +}; + +static const unsigned int gpio_wan4_pc0_pins[] = { + TEGRA_PIN_GPIO_WAN4_PC0, +}; + +static const unsigned int dap2_sclk_pc1_pins[] = { + TEGRA_PIN_DAP2_SCLK_PC1, +}; + +static const unsigned int dap2_dout_pc2_pins[] = { + TEGRA_PIN_DAP2_DOUT_PC2, +}; + +static const unsigned int dap2_din_pc3_pins[] = { + TEGRA_PIN_DAP2_DIN_PC3, +}; + +static const unsigned int dap2_fs_pc4_pins[] = { + TEGRA_PIN_DAP2_FS_PC4, +}; + +static const unsigned int gen1_i2c_scl_pc5_pins[] = { + TEGRA_PIN_GEN1_I2C_SCL_PC5, +}; + +static const unsigned int gen1_i2c_sda_pc6_pins[] = { + TEGRA_PIN_GEN1_I2C_SDA_PC6, +}; + +static const unsigned int sdmmc1_clk_pd0_pins[] = { + TEGRA_PIN_SDMMC1_CLK_PD0, +}; + +static const unsigned int sdmmc1_cmd_pd1_pins[] = { + TEGRA_PIN_SDMMC1_CMD_PD1, +}; + +static const unsigned int sdmmc1_dat0_pd2_pins[] = { + TEGRA_PIN_SDMMC1_DAT0_PD2, +}; + +static const unsigned int sdmmc1_dat1_pd3_pins[] = { + TEGRA_PIN_SDMMC1_DAT1_PD3, +}; + +static const unsigned int sdmmc1_dat2_pd4_pins[] = { + TEGRA_PIN_SDMMC1_DAT2_PD4, +}; + +static const unsigned int sdmmc1_dat3_pd5_pins[] = { + TEGRA_PIN_SDMMC1_DAT3_PD5, +}; + +static const unsigned int eqos_txc_pe0_pins[] = { + TEGRA_PIN_EQOS_TXC_PE0, +}; + +static const unsigned int eqos_td0_pe1_pins[] = { + TEGRA_PIN_EQOS_TD0_PE1, +}; + +static const unsigned int eqos_td1_pe2_pins[] = { + TEGRA_PIN_EQOS_TD1_PE2, +}; + +static const unsigned int eqos_td2_pe3_pins[] = { + TEGRA_PIN_EQOS_TD2_PE3, +}; + +static const unsigned int eqos_td3_pe4_pins[] = { + TEGRA_PIN_EQOS_TD3_PE4, +}; + +static const unsigned int eqos_tx_ctl_pe5_pins[] = { + TEGRA_PIN_EQOS_TX_CTL_PE5, +}; + +static const unsigned int eqos_rd0_pe6_pins[] = { + TEGRA_PIN_EQOS_RD0_PE6, +}; + +static const unsigned int eqos_rd1_pe7_pins[] = { + TEGRA_PIN_EQOS_RD1_PE7, +}; + +static const unsigned int eqos_rd2_pf0_pins[] = { + TEGRA_PIN_EQOS_RD2_PF0, +}; + +static const unsigned int eqos_rd3_pf1_pins[] = { + TEGRA_PIN_EQOS_RD3_PF1, +}; + +static const unsigned int eqos_rx_ctl_pf2_pins[] = { + TEGRA_PIN_EQOS_RX_CTL_PF2, +}; + +static const unsigned int eqos_rxc_pf3_pins[] = { + TEGRA_PIN_EQOS_RXC_PF3, +}; + +static const unsigned int eqos_mdio_pf4_pins[] = { + TEGRA_PIN_EQOS_MDIO_PF4, +}; + +static const unsigned int eqos_mdc_pf5_pins[] = { + TEGRA_PIN_EQOS_MDC_PF5, +}; + +static const unsigned int sdmmc3_clk_pg0_pins[] = { + TEGRA_PIN_SDMMC3_CLK_PG0, +}; + +static const unsigned int 
sdmmc3_cmd_pg1_pins[] = { + TEGRA_PIN_SDMMC3_CMD_PG1, +}; + +static const unsigned int sdmmc3_dat0_pg2_pins[] = { + TEGRA_PIN_SDMMC3_DAT0_PG2, +}; + +static const unsigned int sdmmc3_dat1_pg3_pins[] = { + TEGRA_PIN_SDMMC3_DAT1_PG3, +}; + +static const unsigned int sdmmc3_dat2_pg4_pins[] = { + TEGRA_PIN_SDMMC3_DAT2_PG4, +}; + +static const unsigned int sdmmc3_dat3_pg5_pins[] = { + TEGRA_PIN_SDMMC3_DAT3_PG5, +}; + +static const unsigned int gpio_wan5_ph0_pins[] = { + TEGRA_PIN_GPIO_WAN5_PH0, +}; + +static const unsigned int gpio_wan6_ph1_pins[] = { + TEGRA_PIN_GPIO_WAN6_PH1, +}; + +static const unsigned int gpio_wan7_ph2_pins[] = { + TEGRA_PIN_GPIO_WAN7_PH2, +}; + +static const unsigned int gpio_wan8_ph3_pins[] = { + TEGRA_PIN_GPIO_WAN8_PH3, +}; + +static const unsigned int bcpu_pwr_req_ph4_pins[] = { + TEGRA_PIN_BCPU_PWR_REQ_PH4, +}; + +static const unsigned int mcpu_pwr_req_ph5_pins[] = { + TEGRA_PIN_MCPU_PWR_REQ_PH5, +}; + +static const unsigned int gpu_pwr_req_ph6_pins[] = { + TEGRA_PIN_GPU_PWR_REQ_PH6, +}; + +static const unsigned int gpio_pq0_pi0_pins[] = { + TEGRA_PIN_GPIO_PQ0_PI0, +}; + +static const unsigned int gpio_pq1_pi1_pins[] = { + TEGRA_PIN_GPIO_PQ1_PI1, +}; + +static const unsigned int gpio_pq2_pi2_pins[] = { + TEGRA_PIN_GPIO_PQ2_PI2, +}; + +static const unsigned int gpio_pq3_pi3_pins[] = { + TEGRA_PIN_GPIO_PQ3_PI3, +}; + +static const unsigned int gpio_pq4_pi4_pins[] = { + TEGRA_PIN_GPIO_PQ4_PI4, +}; + +static const unsigned int gpio_pq5_pi5_pins[] = { + TEGRA_PIN_GPIO_PQ5_PI5, +}; + +static const unsigned int gpio_pq6_pi6_pins[] = { + TEGRA_PIN_GPIO_PQ6_PI6, +}; + +static const unsigned int gpio_pq7_pi7_pins[] = { + TEGRA_PIN_GPIO_PQ7_PI7, +}; + +static const unsigned int dap1_sclk_pj0_pins[] = { + TEGRA_PIN_DAP1_SCLK_PJ0, +}; + +static const unsigned int dap1_dout_pj1_pins[] = { + TEGRA_PIN_DAP1_DOUT_PJ1, +}; + +static const unsigned int dap1_din_pj2_pins[] = { + TEGRA_PIN_DAP1_DIN_PJ2, +}; + +static const unsigned int dap1_fs_pj3_pins[] = { + TEGRA_PIN_DAP1_FS_PJ3, +}; + +static const unsigned int aud_mclk_pj4_pins[] = { + TEGRA_PIN_AUD_MCLK_PJ4, +}; + +static const unsigned int gpio_aud0_pj5_pins[] = { + TEGRA_PIN_GPIO_AUD0_PJ5, +}; + +static const unsigned int gpio_aud1_pj6_pins[] = { + TEGRA_PIN_GPIO_AUD1_PJ6, +}; + +static const unsigned int gpio_aud2_pj7_pins[] = { + TEGRA_PIN_GPIO_AUD2_PJ7, +}; + +static const unsigned int gpio_aud3_pk0_pins[] = { + TEGRA_PIN_GPIO_AUD3_PK0, +}; + +static const unsigned int gen7_i2c_scl_pl0_pins[] = { + TEGRA_PIN_GEN7_I2C_SCL_PL0, +}; + +static const unsigned int gen7_i2c_sda_pl1_pins[] = { + TEGRA_PIN_GEN7_I2C_SDA_PL1, +}; + +static const unsigned int gen9_i2c_scl_pl2_pins[] = { + TEGRA_PIN_GEN9_I2C_SCL_PL2, +}; + +static const unsigned int gen9_i2c_sda_pl3_pins[] = { + TEGRA_PIN_GEN9_I2C_SDA_PL3, +}; + +static const unsigned int usb_vbus_en0_pl4_pins[] = { + TEGRA_PIN_USB_VBUS_EN0_PL4, +}; + +static const unsigned int usb_vbus_en1_pl5_pins[] = { + TEGRA_PIN_USB_VBUS_EN1_PL5, +}; + +static const unsigned int gp_pwm6_pl6_pins[] = { + TEGRA_PIN_GP_PWM6_PL6, +}; + +static const unsigned int gp_pwm7_pl7_pins[] = { + TEGRA_PIN_GP_PWM7_PL7, +}; + +static const unsigned int dmic1_dat_pm0_pins[] = { + TEGRA_PIN_DMIC1_DAT_PM0, +}; + +static const unsigned int dmic1_clk_pm1_pins[] = { + TEGRA_PIN_DMIC1_CLK_PM1, +}; + +static const unsigned int dmic2_dat_pm2_pins[] = { + TEGRA_PIN_DMIC2_DAT_PM2, +}; + +static const unsigned int dmic2_clk_pm3_pins[] = { + TEGRA_PIN_DMIC2_CLK_PM3, +}; + +static const unsigned int dmic4_dat_pm4_pins[] = { + 
TEGRA_PIN_DMIC4_DAT_PM4, +}; + +static const unsigned int dmic4_clk_pm5_pins[] = { + TEGRA_PIN_DMIC4_CLK_PM5, +}; + +static const unsigned int gpio_cam1_pn0_pins[] = { + TEGRA_PIN_GPIO_CAM1_PN0, +}; + +static const unsigned int gpio_cam2_pn1_pins[] = { + TEGRA_PIN_GPIO_CAM2_PN1, +}; + +static const unsigned int gpio_cam3_pn2_pins[] = { + TEGRA_PIN_GPIO_CAM3_PN2, +}; + +static const unsigned int gpio_cam4_pn3_pins[] = { + TEGRA_PIN_GPIO_CAM4_PN3, +}; + +static const unsigned int gpio_cam5_pn4_pins[] = { + TEGRA_PIN_GPIO_CAM5_PN4, +}; + +static const unsigned int gpio_cam6_pn5_pins[] = { + TEGRA_PIN_GPIO_CAM6_PN5, +}; + +static const unsigned int gpio_cam7_pn6_pins[] = { + TEGRA_PIN_GPIO_CAM7_PN6, +}; + +static const unsigned int extperiph1_clk_po0_pins[] = { + TEGRA_PIN_EXTPERIPH1_CLK_PO0, +}; + +static const unsigned int extperiph2_clk_po1_pins[] = { + TEGRA_PIN_EXTPERIPH2_CLK_PO1, +}; + +static const unsigned int cam_i2c_scl_po2_pins[] = { + TEGRA_PIN_CAM_I2C_SCL_PO2, +}; + +static const unsigned int cam_i2c_sda_po3_pins[] = { + TEGRA_PIN_CAM_I2C_SDA_PO3, +}; + +static const unsigned int dp_aux_ch0_hpd_pp0_pins[] = { + TEGRA_PIN_DP_AUX_CH0_HPD_PP0, +}; + +static const unsigned int dp_aux_ch1_hpd_pp1_pins[] = { + TEGRA_PIN_DP_AUX_CH1_HPD_PP1, +}; + +static const unsigned int hdmi_cec_pp2_pins[] = { + TEGRA_PIN_HDMI_CEC_PP2, +}; + +static const unsigned int gpio_edp0_pp3_pins[] = { + TEGRA_PIN_GPIO_EDP0_PP3, +}; + +static const unsigned int gpio_edp1_pp4_pins[] = { + TEGRA_PIN_GPIO_EDP1_PP4, +}; + +static const unsigned int gpio_edp2_pp5_pins[] = { + TEGRA_PIN_GPIO_EDP2_PP5, +}; + +static const unsigned int gpio_edp3_pp6_pins[] = { + TEGRA_PIN_GPIO_EDP3_PP6, +}; + +static const unsigned int directdc1_clk_pq0_pins[] = { + TEGRA_PIN_DIRECTDC1_CLK_PQ0, +}; + +static const unsigned int directdc1_in_pq1_pins[] = { + TEGRA_PIN_DIRECTDC1_IN_PQ1, +}; + +static const unsigned int directdc1_out0_pq2_pins[] = { + TEGRA_PIN_DIRECTDC1_OUT0_PQ2, +}; + +static const unsigned int directdc1_out1_pq3_pins[] = { + TEGRA_PIN_DIRECTDC1_OUT1_PQ3, +}; + +static const unsigned int directdc1_out2_pq4_pins[] = { + TEGRA_PIN_DIRECTDC1_OUT2_PQ4, +}; + +static const unsigned int directdc1_out3_pq5_pins[] = { + TEGRA_PIN_DIRECTDC1_OUT3_PQ5, +}; + +static const unsigned int qspi_sck_pr0_pins[] = { + TEGRA_PIN_QSPI_SCK_PR0, +}; + +static const unsigned int qspi_io0_pr1_pins[] = { + TEGRA_PIN_QSPI_IO0_PR1, +}; + +static const unsigned int qspi_io1_pr2_pins[] = { + TEGRA_PIN_QSPI_IO1_PR2, +}; + +static const unsigned int qspi_io2_pr3_pins[] = { + TEGRA_PIN_QSPI_IO2_PR3, +}; + +static const unsigned int qspi_io3_pr4_pins[] = { + TEGRA_PIN_QSPI_IO3_PR4, +}; + +static const unsigned int qspi_cs_n_pr5_pins[] = { + TEGRA_PIN_QSPI_CS_N_PR5, +}; + +static const unsigned int pwr_i2c_scl_ps0_pins[] = { + TEGRA_PIN_PWR_I2C_SCL_PS0, +}; + +static const unsigned int pwr_i2c_sda_ps1_pins[] = { + TEGRA_PIN_PWR_I2C_SDA_PS1, +}; + +static const unsigned int batt_oc_ps2_pins[] = { + TEGRA_PIN_BATT_OC_PS2, +}; + +static const unsigned int safe_state_ps3_pins[] = { + TEGRA_PIN_SAFE_STATE_PS3, +}; + +static const unsigned int vcomp_alert_ps4_pins[] = { + TEGRA_PIN_VCOMP_ALERT_PS4, +}; + +static const unsigned int uart1_tx_pt0_pins[] = { + TEGRA_PIN_UART1_TX_PT0, +}; + +static const unsigned int uart1_rx_pt1_pins[] = { + TEGRA_PIN_UART1_RX_PT1, +}; + +static const unsigned int uart1_rts_pt2_pins[] = { + TEGRA_PIN_UART1_RTS_PT2, +}; + +static const unsigned int uart1_cts_pt3_pins[] = { + TEGRA_PIN_UART1_CTS_PT3, +}; + +static const unsigned int 
gpio_dis0_pu0_pins[] = { + TEGRA_PIN_GPIO_DIS0_PU0, +}; + +static const unsigned int gpio_dis1_pu1_pins[] = { + TEGRA_PIN_GPIO_DIS1_PU1, +}; + +static const unsigned int gpio_dis2_pu2_pins[] = { + TEGRA_PIN_GPIO_DIS2_PU2, +}; + +static const unsigned int gpio_dis3_pu3_pins[] = { + TEGRA_PIN_GPIO_DIS3_PU3, +}; + +static const unsigned int gpio_dis4_pu4_pins[] = { + TEGRA_PIN_GPIO_DIS4_PU4, +}; + +static const unsigned int gpio_dis5_pu5_pins[] = { + TEGRA_PIN_GPIO_DIS5_PU5, +}; + +static const unsigned int gpio_sen0_pv0_pins[] = { + TEGRA_PIN_GPIO_SEN0_PV0, +}; + +static const unsigned int gpio_sen1_pv1_pins[] = { + TEGRA_PIN_GPIO_SEN1_PV1, +}; + +static const unsigned int gpio_sen2_pv2_pins[] = { + TEGRA_PIN_GPIO_SEN2_PV2, +}; + +static const unsigned int gpio_sen3_pv3_pins[] = { + TEGRA_PIN_GPIO_SEN3_PV3, +}; + +static const unsigned int gpio_sen4_pv4_pins[] = { + TEGRA_PIN_GPIO_SEN4_PV4, +}; + +static const unsigned int gpio_sen5_pv5_pins[] = { + TEGRA_PIN_GPIO_SEN5_PV5, +}; + +static const unsigned int gpio_sen6_pv6_pins[] = { + TEGRA_PIN_GPIO_SEN6_PV6, +}; + +static const unsigned int gpio_sen7_pv7_pins[] = { + TEGRA_PIN_GPIO_SEN7_PV7, +}; + +static const unsigned int gen8_i2c_scl_pw0_pins[] = { + TEGRA_PIN_GEN8_I2C_SCL_PW0, +}; + +static const unsigned int gen8_i2c_sda_pw1_pins[] = { + TEGRA_PIN_GEN8_I2C_SDA_PW1, +}; + +static const unsigned int uart3_tx_pw2_pins[] = { + TEGRA_PIN_UART3_TX_PW2, +}; + +static const unsigned int uart3_rx_pw3_pins[] = { + TEGRA_PIN_UART3_RX_PW3, +}; + +static const unsigned int uart3_rts_pw4_pins[] = { + TEGRA_PIN_UART3_RTS_PW4, +}; + +static const unsigned int uart3_cts_pw5_pins[] = { + TEGRA_PIN_UART3_CTS_PW5, +}; + +static const unsigned int uart7_tx_pw6_pins[] = { + TEGRA_PIN_UART7_TX_PW6, +}; + +static const unsigned int uart7_rx_pw7_pins[] = { + TEGRA_PIN_UART7_RX_PW7, +}; + +static const unsigned int uart2_tx_px0_pins[] = { + TEGRA_PIN_UART2_TX_PX0, +}; + +static const unsigned int uart2_rx_px1_pins[] = { + TEGRA_PIN_UART2_RX_PX1, +}; + +static const unsigned int uart2_rts_px2_pins[] = { + TEGRA_PIN_UART2_RTS_PX2, +}; + +static const unsigned int uart2_cts_px3_pins[] = { + TEGRA_PIN_UART2_CTS_PX3, +}; + +static const unsigned int uart5_tx_px4_pins[] = { + TEGRA_PIN_UART5_TX_PX4, +}; + +static const unsigned int uart5_rx_px5_pins[] = { + TEGRA_PIN_UART5_RX_PX5, +}; + +static const unsigned int uart5_rts_px6_pins[] = { + TEGRA_PIN_UART5_RTS_PX6, +}; + +static const unsigned int uart5_cts_px7_pins[] = { + TEGRA_PIN_UART5_CTS_PX7, +}; + +static const unsigned int gpio_mdm1_py0_pins[] = { + TEGRA_PIN_GPIO_MDM1_PY0, +}; + +static const unsigned int gpio_mdm2_py1_pins[] = { + TEGRA_PIN_GPIO_MDM2_PY1, +}; + +static const unsigned int gpio_mdm3_py2_pins[] = { + TEGRA_PIN_GPIO_MDM3_PY2, +}; + +static const unsigned int gpio_mdm4_py3_pins[] = { + TEGRA_PIN_GPIO_MDM4_PY3, +}; + +static const unsigned int gpio_mdm5_py4_pins[] = { + TEGRA_PIN_GPIO_MDM5_PY4, +}; + +static const unsigned int gpio_mdm6_py5_pins[] = { + TEGRA_PIN_GPIO_MDM6_PY5, +}; + +static const unsigned int gpio_mdm7_py6_pins[] = { + TEGRA_PIN_GPIO_MDM7_PY6, +}; + +static const unsigned int can1_dout_pz0_pins[] = { + TEGRA_PIN_CAN1_DOUT_PZ0, +}; + +static const unsigned int can1_din_pz1_pins[] = { + TEGRA_PIN_CAN1_DIN_PZ1, +}; + +static const unsigned int can0_dout_pz2_pins[] = { + TEGRA_PIN_CAN0_DOUT_PZ2, +}; + +static const unsigned int can0_din_pz3_pins[] = { + TEGRA_PIN_CAN0_DIN_PZ3, +}; + +static const unsigned int can_gpio0_paa0_pins[] = { + TEGRA_PIN_CAN_GPIO0_PAA0, +}; + +static const 
unsigned int can_gpio1_paa1_pins[] = { + TEGRA_PIN_CAN_GPIO1_PAA1, +}; + +static const unsigned int can_gpio2_paa2_pins[] = { + TEGRA_PIN_CAN_GPIO2_PAA2, +}; + +static const unsigned int can_gpio3_paa3_pins[] = { + TEGRA_PIN_CAN_GPIO3_PAA3, +}; + +static const unsigned int can_gpio4_paa4_pins[] = { + TEGRA_PIN_CAN_GPIO4_PAA4, +}; + +static const unsigned int can_gpio5_paa5_pins[] = { + TEGRA_PIN_CAN_GPIO5_PAA5, +}; + +static const unsigned int can_gpio6_paa6_pins[] = { + TEGRA_PIN_CAN_GPIO6_PAA6, +}; + +static const unsigned int can_gpio7_paa7_pins[] = { + TEGRA_PIN_CAN_GPIO7_PAA7, +}; + +static const unsigned int ufs0_ref_clk_pbb0_pins[] = { + TEGRA_PIN_UFS0_REF_CLK_PBB0, +}; + +static const unsigned int ufs0_rst_pbb1_pins[] = { + TEGRA_PIN_UFS0_RST_PBB1, +}; + +static const unsigned int dap4_sclk_pcc0_pins[] = { + TEGRA_PIN_DAP4_SCLK_PCC0, +}; + +static const unsigned int dap4_dout_pcc1_pins[] = { + TEGRA_PIN_DAP4_DOUT_PCC1, +}; + +static const unsigned int dap4_din_pcc2_pins[] = { + TEGRA_PIN_DAP4_DIN_PCC2, +}; + +static const unsigned int dap4_fs_pcc3_pins[] = { + TEGRA_PIN_DAP4_FS_PCC3, +}; + +static const unsigned int gpio_sen8_pee0_pins[] = { + TEGRA_PIN_GPIO_SEN8_PEE0, +}; + +static const unsigned int gpio_sen9_pee1_pins[] = { + TEGRA_PIN_GPIO_SEN9_PEE1, +}; + +static const unsigned int touch_clk_pee2_pins[] = { + TEGRA_PIN_TOUCH_CLK_PEE2, +}; + +static const unsigned int power_on_pff0_pins[] = { + TEGRA_PIN_POWER_ON_PFF0, +}; + +static const unsigned int gpio_sw1_pff1_pins[] = { + TEGRA_PIN_GPIO_SW1_PFF1, +}; + +static const unsigned int gpio_sw2_pff2_pins[] = { + TEGRA_PIN_GPIO_SW2_PFF2, +}; + +static const unsigned int gpio_sw3_pff3_pins[] = { + TEGRA_PIN_GPIO_SW3_PFF3, +}; + +static const unsigned int gpio_sw4_pff4_pins[] = { + TEGRA_PIN_GPIO_SW4_PFF4, +}; + +static const unsigned int directdc_comp_pins[] = { + TEGRA_PIN_DIRECTDC_COMP, +}; + +static const unsigned int sdmmc1_comp_pins[] = { + TEGRA_PIN_SDMMC1_COMP, +}; + +static const unsigned int eqos_comp_pins[] = { + TEGRA_PIN_EQOS_COMP, +}; + +static const unsigned int sdmmc3_comp_pins[] = { + TEGRA_PIN_SDMMC3_COMP, +}; + +static const unsigned int qspi_comp_pins[] = { + TEGRA_PIN_QSPI_COMP, +}; + +static const unsigned int shutdown_pins[] = { + TEGRA_PIN_SHUTDOWN, +}; + +static const unsigned int pmu_int_pins[] = { + TEGRA_PIN_PMU_INT, +}; + +static const unsigned int soc_pwr_req_pins[] = { + TEGRA_PIN_SOC_PWR_REQ, +}; + +static const unsigned int clk_32k_in_pins[] = { + TEGRA_PIN_CLK_32K_IN, +}; + +static const unsigned int sdmmc4_clk_pins[] = {}; + +static const unsigned int sdmmc4_cmd_pins[] = {}; + +static const unsigned int sdmmc4_dqs_pins[] = {}; + +static const unsigned int sdmmc4_dat7_pins[] = {}; + +static const unsigned int sdmmc4_dat6_pins[] = {}; + +static const unsigned int sdmmc4_dat5_pins[] = {}; + +static const unsigned int sdmmc4_dat4_pins[] = {}; + +static const unsigned int sdmmc4_dat3_pins[] = {}; + +static const unsigned int sdmmc4_dat2_pins[] = {}; + +static const unsigned int sdmmc4_dat1_pins[] = {}; + +static const unsigned int sdmmc4_dat0_pins[] = {}; + +/* Define unique ID for each function */ +enum tegra_mux_dt { + TEGRA_MUX_RSVD0, + TEGRA_MUX_RSVD1, + TEGRA_MUX_RSVD2, + TEGRA_MUX_RSVD3, + TEGRA_MUX_TOUCH, + TEGRA_MUX_UARTC, + TEGRA_MUX_I2C8, + TEGRA_MUX_UARTG, + TEGRA_MUX_SPI2, + TEGRA_MUX_GP, + TEGRA_MUX_DCA, + TEGRA_MUX_WDT, + TEGRA_MUX_I2C2, + TEGRA_MUX_CAN1, + TEGRA_MUX_CAN0, + TEGRA_MUX_DMIC3, + TEGRA_MUX_DMIC5, + TEGRA_MUX_GPIO, + TEGRA_MUX_DSPK1, + TEGRA_MUX_DSPK0, + TEGRA_MUX_SPDIF, + 
TEGRA_MUX_AUD, + TEGRA_MUX_I2S1, + TEGRA_MUX_DMIC1, + TEGRA_MUX_DMIC2, + TEGRA_MUX_I2S3, + TEGRA_MUX_DMIC4, + TEGRA_MUX_I2S4, + TEGRA_MUX_EXTPERIPH2, + TEGRA_MUX_EXTPERIPH1, + TEGRA_MUX_I2C3, + TEGRA_MUX_VGP1, + TEGRA_MUX_VGP2, + TEGRA_MUX_VGP3, + TEGRA_MUX_VGP4, + TEGRA_MUX_VGP5, + TEGRA_MUX_VGP6, + TEGRA_MUX_EXTPERIPH3, + TEGRA_MUX_EXTPERIPH4, + TEGRA_MUX_SPI4, + TEGRA_MUX_I2S2, + TEGRA_MUX_UARTD, + TEGRA_MUX_I2C1, + TEGRA_MUX_UARTA, + TEGRA_MUX_DIRECTDC1, + TEGRA_MUX_DIRECTDC, + TEGRA_MUX_IQC0, + TEGRA_MUX_IQC1, + TEGRA_MUX_I2S6, + TEGRA_MUX_DTV, + TEGRA_MUX_UARTF, + TEGRA_MUX_SDMMC3, + TEGRA_MUX_SDMMC4, + TEGRA_MUX_SDMMC1, + TEGRA_MUX_DP, + TEGRA_MUX_HDMI, + TEGRA_MUX_PE2, + TEGRA_MUX_SATA, + TEGRA_MUX_PE, + TEGRA_MUX_PE1, + TEGRA_MUX_PE0, + TEGRA_MUX_SOC, + TEGRA_MUX_EQOS, + TEGRA_MUX_SDMMC2, + TEGRA_MUX_QSPI, + TEGRA_MUX_SCE, + TEGRA_MUX_I2C5, + TEGRA_MUX_DISPLAYA, + TEGRA_MUX_DISPLAYB, + TEGRA_MUX_DCC, + TEGRA_MUX_DCB, + TEGRA_MUX_SPI1, + TEGRA_MUX_UARTB, + TEGRA_MUX_UARTE, + TEGRA_MUX_SPI3, + TEGRA_MUX_NV, + TEGRA_MUX_CCLA, + TEGRA_MUX_I2C7, + TEGRA_MUX_I2C9, + TEGRA_MUX_I2S5, + TEGRA_MUX_USB, + TEGRA_MUX_UFS0, +}; + +/* Make list of each function name */ +#define TEGRA_PIN_FUNCTION(lid) #lid + +static const char * const tegra186_functions[] = { + TEGRA_PIN_FUNCTION(rsvd0), + TEGRA_PIN_FUNCTION(rsvd1), + TEGRA_PIN_FUNCTION(rsvd2), + TEGRA_PIN_FUNCTION(rsvd3), + TEGRA_PIN_FUNCTION(touch), + TEGRA_PIN_FUNCTION(uartc), + TEGRA_PIN_FUNCTION(i2c8), + TEGRA_PIN_FUNCTION(uartg), + TEGRA_PIN_FUNCTION(spi2), + TEGRA_PIN_FUNCTION(gp), + TEGRA_PIN_FUNCTION(dca), + TEGRA_PIN_FUNCTION(wdt), + TEGRA_PIN_FUNCTION(i2c2), + TEGRA_PIN_FUNCTION(can1), + TEGRA_PIN_FUNCTION(can0), + TEGRA_PIN_FUNCTION(dmic3), + TEGRA_PIN_FUNCTION(dmic5), + TEGRA_PIN_FUNCTION(gpio), + TEGRA_PIN_FUNCTION(dspk1), + TEGRA_PIN_FUNCTION(dspk0), + TEGRA_PIN_FUNCTION(spdif), + TEGRA_PIN_FUNCTION(aud), + TEGRA_PIN_FUNCTION(i2s1), + TEGRA_PIN_FUNCTION(dmic1), + TEGRA_PIN_FUNCTION(dmic2), + TEGRA_PIN_FUNCTION(i2s3), + TEGRA_PIN_FUNCTION(dmic4), + TEGRA_PIN_FUNCTION(i2s4), + TEGRA_PIN_FUNCTION(extperiph2), + TEGRA_PIN_FUNCTION(extperiph1), + TEGRA_PIN_FUNCTION(i2c3), + TEGRA_PIN_FUNCTION(vgp1), + TEGRA_PIN_FUNCTION(vgp2), + TEGRA_PIN_FUNCTION(vgp3), + TEGRA_PIN_FUNCTION(vgp4), + TEGRA_PIN_FUNCTION(vgp5), + TEGRA_PIN_FUNCTION(vgp6), + TEGRA_PIN_FUNCTION(extperiph3), + TEGRA_PIN_FUNCTION(extperiph4), + TEGRA_PIN_FUNCTION(spi4), + TEGRA_PIN_FUNCTION(i2s2), + TEGRA_PIN_FUNCTION(uartd), + TEGRA_PIN_FUNCTION(i2c1), + TEGRA_PIN_FUNCTION(uarta), + TEGRA_PIN_FUNCTION(directdc1), + TEGRA_PIN_FUNCTION(directdc), + TEGRA_PIN_FUNCTION(iqc0), + TEGRA_PIN_FUNCTION(iqc1), + TEGRA_PIN_FUNCTION(i2s6), + TEGRA_PIN_FUNCTION(dtv), + TEGRA_PIN_FUNCTION(uartf), + TEGRA_PIN_FUNCTION(sdmmc3), + TEGRA_PIN_FUNCTION(sdmmc4), + TEGRA_PIN_FUNCTION(sdmmc1), + TEGRA_PIN_FUNCTION(dp), + TEGRA_PIN_FUNCTION(hdmi), + TEGRA_PIN_FUNCTION(pe2), + TEGRA_PIN_FUNCTION(sata), + TEGRA_PIN_FUNCTION(pe), + TEGRA_PIN_FUNCTION(pe1), + TEGRA_PIN_FUNCTION(pe0), + TEGRA_PIN_FUNCTION(soc), + TEGRA_PIN_FUNCTION(eqos), + TEGRA_PIN_FUNCTION(sdmmc2), + TEGRA_PIN_FUNCTION(qspi), + TEGRA_PIN_FUNCTION(sce), + TEGRA_PIN_FUNCTION(i2c5), + TEGRA_PIN_FUNCTION(displaya), + TEGRA_PIN_FUNCTION(displayb), + TEGRA_PIN_FUNCTION(dcc), + TEGRA_PIN_FUNCTION(dcb), + TEGRA_PIN_FUNCTION(spi1), + TEGRA_PIN_FUNCTION(uartb), + TEGRA_PIN_FUNCTION(uarte), + TEGRA_PIN_FUNCTION(spi3), + TEGRA_PIN_FUNCTION(nv), + TEGRA_PIN_FUNCTION(ccla), + TEGRA_PIN_FUNCTION(i2c7), + TEGRA_PIN_FUNCTION(i2c9), + 
TEGRA_PIN_FUNCTION(i2s5), + TEGRA_PIN_FUNCTION(usb), + TEGRA_PIN_FUNCTION(ufs0), +}; + +#define PINGROUP_REG_Y(r) ((r)) +#define PINGROUP_REG_N(r) -1 + +#define DRV_PINGROUP_Y(r) ((r)) +#define DRV_PINGROUP_N(r) -1 + +#define DRV_PINGROUP_ENTRY_N(pg_name) \ + .drv_reg = -1, \ + .drv_bank = -1, \ + .drvdn_bit = -1, \ + .drvdn_width = -1, \ + .drvup_bit = -1, \ + .drvup_width = -1, \ + .slwr_bit = -1, \ + .slwr_width = -1, \ + .slwf_bit = -1, \ + .slwf_width = -1 + +#define DRV_PINGROUP_ENTRY_Y(r, drvdn_b, drvdn_w, drvup_b, \ + drvup_w, slwr_b, slwr_w, slwf_b, \ + slwf_w, bank) \ + .drv_reg = ((r)), \ + .drv_bank = bank, \ + .drvdn_bit = drvdn_b, \ + .drvdn_width = drvdn_w, \ + .drvup_bit = drvup_b, \ + .drvup_width = drvup_w, \ + .slwr_bit = slwr_b, \ + .slwr_width = slwr_w, \ + .slwf_bit = slwf_b, \ + .slwf_width = slwf_w + +#define PIN_PINGROUP_ENTRY_N(pg_name) \ + .mux_reg = -1, \ + .pupd_reg = -1, \ + .tri_reg = -1, \ + .einput_bit = -1, \ + .e_io_hv_bit = -1, \ + .odrain_bit = -1, \ + .lock_bit = -1, \ + .parked_bit = -1, \ + .lpmd_bit = -1, \ + .drvtype_bit = -1, \ + .lpdr_bit = -1, \ + .pbias_buf_bit = -1, \ + .preemp_bit = -1, \ + .rfu_in_bit = -1 + +#define PIN_PINGROUP_ENTRY_Y(r, bank, pupd, e_io_hv, e_lpbk, e_input, \ + e_lpdr, e_pbias_buf, gpio_sfio_sel, \ + e_od, schmitt_b, drvtype, epreemp, \ + io_reset, rfu_in) \ + .mux_reg = PINGROUP_REG_Y(r), \ + .lpmd_bit = -1, \ + .lock_bit = -1, \ + .hsm_bit = -1, \ + .mux_bank = bank, \ + .mux_bit = 0, \ + .pupd_reg = PINGROUP_REG_##pupd(r), \ + .pupd_bank = bank, \ + .pupd_bit = 2, \ + .tri_reg = PINGROUP_REG_Y(r), \ + .tri_bank = bank, \ + .tri_bit = 4, \ + .einput_bit = e_input, \ + .sfsel_bit = gpio_sfio_sel, \ + .odrain_bit = e_od, \ + .schmitt_bit = schmitt_b, \ + .drvtype_bit = 13, \ + .lpdr_bit = e_lpdr, \ + +/* main drive pin groups */ +#define drive_gpio_aud3_pk0 DRV_PINGROUP_ENTRY_Y(0x1004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_aud2_pj7 DRV_PINGROUP_ENTRY_Y(0x100c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_aud1_pj6 DRV_PINGROUP_ENTRY_Y(0x1014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_aud0_pj5 DRV_PINGROUP_ENTRY_Y(0x101c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_aud_mclk_pj4 DRV_PINGROUP_ENTRY_Y(0x1024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap1_fs_pj3 DRV_PINGROUP_ENTRY_Y(0x102c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap1_din_pj2 DRV_PINGROUP_ENTRY_Y(0x1034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap1_dout_pj1 DRV_PINGROUP_ENTRY_Y(0x103c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap1_sclk_pj0 DRV_PINGROUP_ENTRY_Y(0x1044, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dmic1_clk_pm1 DRV_PINGROUP_ENTRY_Y(0x2004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dmic1_dat_pm0 DRV_PINGROUP_ENTRY_Y(0x200c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dmic2_dat_pm2 DRV_PINGROUP_ENTRY_Y(0x2014, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dmic2_clk_pm3 DRV_PINGROUP_ENTRY_Y(0x201c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dmic4_dat_pm4 DRV_PINGROUP_ENTRY_Y(0x2024, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dmic4_clk_pm5 DRV_PINGROUP_ENTRY_Y(0x202c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dap4_fs_pcc3 DRV_PINGROUP_ENTRY_Y(0x2034, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dap4_din_pcc2 DRV_PINGROUP_ENTRY_Y(0x203c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dap4_dout_pcc1 DRV_PINGROUP_ENTRY_Y(0x2044, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dap4_sclk_pcc0 DRV_PINGROUP_ENTRY_Y(0x204c, -1, -1, -1, -1, 28, 2, 30, 2, 0) 
+#define drive_extperiph2_clk_po1 DRV_PINGROUP_ENTRY_Y(0x0004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_extperiph1_clk_po0 DRV_PINGROUP_ENTRY_Y(0x000c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_cam_i2c_sda_po3 DRV_PINGROUP_ENTRY_Y(0x0014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_cam_i2c_scl_po2 DRV_PINGROUP_ENTRY_Y(0x001c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam1_pn0 DRV_PINGROUP_ENTRY_Y(0x0024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam2_pn1 DRV_PINGROUP_ENTRY_Y(0x002c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam3_pn2 DRV_PINGROUP_ENTRY_Y(0x0034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam4_pn3 DRV_PINGROUP_ENTRY_Y(0x003c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam5_pn4 DRV_PINGROUP_ENTRY_Y(0x0044, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam6_pn5 DRV_PINGROUP_ENTRY_Y(0x004c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam7_pn6 DRV_PINGROUP_ENTRY_Y(0x0054, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap2_din_pc3 DRV_PINGROUP_ENTRY_Y(0x4004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap2_dout_pc2 DRV_PINGROUP_ENTRY_Y(0x400c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap2_fs_pc4 DRV_PINGROUP_ENTRY_Y(0x4014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap2_sclk_pc1 DRV_PINGROUP_ENTRY_Y(0x401c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart4_cts_pb3 DRV_PINGROUP_ENTRY_Y(0x4024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart4_rts_pb2 DRV_PINGROUP_ENTRY_Y(0x402c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart4_rx_pb1 DRV_PINGROUP_ENTRY_Y(0x4034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart4_tx_pb0 DRV_PINGROUP_ENTRY_Y(0x403c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan4_pc0 DRV_PINGROUP_ENTRY_Y(0x4044, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan3_pb6 DRV_PINGROUP_ENTRY_Y(0x404c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan2_pb5 DRV_PINGROUP_ENTRY_Y(0x4054, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan1_pb4 DRV_PINGROUP_ENTRY_Y(0x405c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen1_i2c_scl_pc5 DRV_PINGROUP_ENTRY_Y(0x4064, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen1_i2c_sda_pc6 DRV_PINGROUP_ENTRY_Y(0x406c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart1_cts_pt3 DRV_PINGROUP_ENTRY_Y(0x5004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart1_rts_pt2 DRV_PINGROUP_ENTRY_Y(0x500c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart1_rx_pt1 DRV_PINGROUP_ENTRY_Y(0x5014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart1_tx_pt0 DRV_PINGROUP_ENTRY_Y(0x501c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_directdc1_out3_pq5 DRV_PINGROUP_ENTRY_Y(0x502c, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_directdc1_out2_pq4 DRV_PINGROUP_ENTRY_Y(0x5034, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_directdc1_out1_pq3 DRV_PINGROUP_ENTRY_Y(0x503c, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_directdc1_out0_pq2 DRV_PINGROUP_ENTRY_Y(0x5044, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_directdc1_in_pq1 DRV_PINGROUP_ENTRY_Y(0x504c, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_directdc1_clk_pq0 DRV_PINGROUP_ENTRY_Y(0x5054, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_gpio_pq0_pi0 DRV_PINGROUP_ENTRY_Y(0x3004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq1_pi1 DRV_PINGROUP_ENTRY_Y(0x300c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq2_pi2 DRV_PINGROUP_ENTRY_Y(0x3014, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq3_pi3 DRV_PINGROUP_ENTRY_Y(0x301c, 
-1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq4_pi4 DRV_PINGROUP_ENTRY_Y(0x3024, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq5_pi5 DRV_PINGROUP_ENTRY_Y(0x302c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq6_pi6 DRV_PINGROUP_ENTRY_Y(0x3034, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq7_pi7 DRV_PINGROUP_ENTRY_Y(0x303c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_edp2_pp5 DRV_PINGROUP_ENTRY_Y(0x10004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_edp3_pp6 DRV_PINGROUP_ENTRY_Y(0x1000c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_edp0_pp3 DRV_PINGROUP_ENTRY_Y(0x10014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_edp1_pp4 DRV_PINGROUP_ENTRY_Y(0x1001c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dp_aux_ch0_hpd_pp0 DRV_PINGROUP_ENTRY_Y(0x10024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dp_aux_ch1_hpd_pp1 DRV_PINGROUP_ENTRY_Y(0x1002c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_hdmi_cec_pp2 DRV_PINGROUP_ENTRY_Y(0x10034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l2_clkreq_n_pa6 DRV_PINGROUP_ENTRY_Y(0x7004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_wake_n_pa2 DRV_PINGROUP_ENTRY_Y(0x700c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l1_clkreq_n_pa4 DRV_PINGROUP_ENTRY_Y(0x7014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l1_rst_n_pa3 DRV_PINGROUP_ENTRY_Y(0x701c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l0_clkreq_n_pa1 DRV_PINGROUP_ENTRY_Y(0x7024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l0_rst_n_pa0 DRV_PINGROUP_ENTRY_Y(0x702c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l2_rst_n_pa5 DRV_PINGROUP_ENTRY_Y(0x7034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_sdmmc1_clk_pd0 DRV_PINGROUP_ENTRY_Y(0x8004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc1_cmd_pd1 DRV_PINGROUP_ENTRY_Y(0x800c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc1_dat3_pd5 DRV_PINGROUP_ENTRY_Y(0x8018, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc1_dat2_pd4 DRV_PINGROUP_ENTRY_Y(0x8020, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc1_dat1_pd3 DRV_PINGROUP_ENTRY_Y(0x8028, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc1_dat0_pd2 DRV_PINGROUP_ENTRY_Y(0x8030, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_td3_pe4 DRV_PINGROUP_ENTRY_Y(0x9004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_td2_pe3 DRV_PINGROUP_ENTRY_Y(0x900c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_td1_pe2 DRV_PINGROUP_ENTRY_Y(0x9014, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_td0_pe1 DRV_PINGROUP_ENTRY_Y(0x901c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rd3_pf1 DRV_PINGROUP_ENTRY_Y(0x9024, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rd2_pf0 DRV_PINGROUP_ENTRY_Y(0x902c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rd1_pe7 DRV_PINGROUP_ENTRY_Y(0x9034, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_mdio_pf4 DRV_PINGROUP_ENTRY_Y(0x903c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rd0_pe6 DRV_PINGROUP_ENTRY_Y(0x9044, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_mdc_pf5 DRV_PINGROUP_ENTRY_Y(0x904c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_txc_pe0 DRV_PINGROUP_ENTRY_Y(0x9058, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rxc_pf3 DRV_PINGROUP_ENTRY_Y(0x9060, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_tx_ctl_pe5 DRV_PINGROUP_ENTRY_Y(0x9068, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rx_ctl_pf2 DRV_PINGROUP_ENTRY_Y(0x9070, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define 
drive_sdmmc3_dat3_pg5 DRV_PINGROUP_ENTRY_Y(0xa004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_dat2_pg4 DRV_PINGROUP_ENTRY_Y(0xa00c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_dat1_pg3 DRV_PINGROUP_ENTRY_Y(0xa014, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_dat0_pg2 DRV_PINGROUP_ENTRY_Y(0xa01c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_cmd_pg1 DRV_PINGROUP_ENTRY_Y(0xa028, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_clk_pg0 DRV_PINGROUP_ENTRY_Y(0xa030, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_io3_pr4 DRV_PINGROUP_ENTRY_Y(0xB004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_io2_pr3 DRV_PINGROUP_ENTRY_Y(0xB00C, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_io1_pr2 DRV_PINGROUP_ENTRY_Y(0xB014, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_io0_pr1 DRV_PINGROUP_ENTRY_Y(0xB01C, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_sck_pr0 DRV_PINGROUP_ENTRY_Y(0xB024, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_cs_n_pr5 DRV_PINGROUP_ENTRY_Y(0xB02C, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_wan8_ph3 DRV_PINGROUP_ENTRY_Y(0xd004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan7_ph2 DRV_PINGROUP_ENTRY_Y(0xd00c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan6_ph1 DRV_PINGROUP_ENTRY_Y(0xd014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan5_ph0 DRV_PINGROUP_ENTRY_Y(0xd01c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart2_tx_px0 DRV_PINGROUP_ENTRY_Y(0xd024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart2_rx_px1 DRV_PINGROUP_ENTRY_Y(0xd02c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart2_rts_px2 DRV_PINGROUP_ENTRY_Y(0xd034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart2_cts_px3 DRV_PINGROUP_ENTRY_Y(0xd03c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart5_rx_px5 DRV_PINGROUP_ENTRY_Y(0xd044, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart5_tx_px4 DRV_PINGROUP_ENTRY_Y(0xd04c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart5_rts_px6 DRV_PINGROUP_ENTRY_Y(0xd054, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart5_cts_px7 DRV_PINGROUP_ENTRY_Y(0xd05c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm1_py0 DRV_PINGROUP_ENTRY_Y(0xd064, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm2_py1 DRV_PINGROUP_ENTRY_Y(0xd06c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm3_py2 DRV_PINGROUP_ENTRY_Y(0xd074, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm4_py3 DRV_PINGROUP_ENTRY_Y(0xd07c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm5_py4 DRV_PINGROUP_ENTRY_Y(0xd084, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm6_py5 DRV_PINGROUP_ENTRY_Y(0xd08c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm7_py6 DRV_PINGROUP_ENTRY_Y(0xd094, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_bcpu_pwr_req_ph4 DRV_PINGROUP_ENTRY_Y(0xd09c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_mcpu_pwr_req_ph5 DRV_PINGROUP_ENTRY_Y(0xd0a4, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpu_pwr_req_ph6 DRV_PINGROUP_ENTRY_Y(0xd0ac, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen7_i2c_scl_pl0 DRV_PINGROUP_ENTRY_Y(0xd0b4, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen7_i2c_sda_pl1 DRV_PINGROUP_ENTRY_Y(0xd0bc, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen9_i2c_sda_pl3 DRV_PINGROUP_ENTRY_Y(0xd0c4, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen9_i2c_scl_pl2 DRV_PINGROUP_ENTRY_Y(0xd0cc, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_usb_vbus_en0_pl4 DRV_PINGROUP_ENTRY_Y(0xd0d4, 12, 5, 20, 5, -1, -1, 
-1, -1, 0) +#define drive_usb_vbus_en1_pl5 DRV_PINGROUP_ENTRY_Y(0xd0dc, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gp_pwm7_pl7 DRV_PINGROUP_ENTRY_Y(0xd0e4, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gp_pwm6_pl6 DRV_PINGROUP_ENTRY_Y(0xd0ec, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_ufs0_rst_pbb1 DRV_PINGROUP_ENTRY_Y(0x11004, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_ufs0_ref_clk_pbb0 DRV_PINGROUP_ENTRY_Y(0x1100c, 12, 9, 24, 8, -1, -1, -1, -1, 0) + +#define drive_directdc_comp DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc1_comp DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_eqos_comp DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc3_comp DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_clk DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_cmd DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dqs DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat7 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat6 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat5 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat4 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat3 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat2 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat1 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat0 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_qspi_comp DRV_PINGROUP_ENTRY_N(no_entry) + +/* AON drive pin groups */ +#define drive_touch_clk_pee2 DRV_PINGROUP_ENTRY_Y(0x2004, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart3_cts_pw5 DRV_PINGROUP_ENTRY_Y(0x200c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart3_rts_pw4 DRV_PINGROUP_ENTRY_Y(0x2014, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart3_rx_pw3 DRV_PINGROUP_ENTRY_Y(0x201c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart3_tx_pw2 DRV_PINGROUP_ENTRY_Y(0x2024, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gen8_i2c_sda_pw1 DRV_PINGROUP_ENTRY_Y(0x202c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gen8_i2c_scl_pw0 DRV_PINGROUP_ENTRY_Y(0x2034, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart7_rx_pw7 DRV_PINGROUP_ENTRY_Y(0x203c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart7_tx_pw6 DRV_PINGROUP_ENTRY_Y(0x2044, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen0_pv0 DRV_PINGROUP_ENTRY_Y(0x204c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen1_pv1 DRV_PINGROUP_ENTRY_Y(0x2054, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen2_pv2 DRV_PINGROUP_ENTRY_Y(0x205c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen3_pv3 DRV_PINGROUP_ENTRY_Y(0x2064, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen4_pv4 DRV_PINGROUP_ENTRY_Y(0x206c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen5_pv5 DRV_PINGROUP_ENTRY_Y(0x2074, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen6_pv6 DRV_PINGROUP_ENTRY_Y(0x207c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen7_pv7 DRV_PINGROUP_ENTRY_Y(0x2084, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen8_pee0 DRV_PINGROUP_ENTRY_Y(0x208c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen9_pee1 DRV_PINGROUP_ENTRY_Y(0x2094, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_can_gpio7_paa7 DRV_PINGROUP_ENTRY_Y(0x3004, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can1_dout_pz0 DRV_PINGROUP_ENTRY_Y(0x300C, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can1_din_pz1 DRV_PINGROUP_ENTRY_Y(0x3014, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can0_dout_pz2 DRV_PINGROUP_ENTRY_Y(0x301c, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can0_din_pz3 
DRV_PINGROUP_ENTRY_Y(0x3024, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio0_paa0 DRV_PINGROUP_ENTRY_Y(0x302c, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio1_paa1 DRV_PINGROUP_ENTRY_Y(0x3034, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio2_paa2 DRV_PINGROUP_ENTRY_Y(0x303c, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio3_paa3 DRV_PINGROUP_ENTRY_Y(0x3044, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio4_paa4 DRV_PINGROUP_ENTRY_Y(0x304c, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio5_paa5 DRV_PINGROUP_ENTRY_Y(0x3054, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio6_paa6 DRV_PINGROUP_ENTRY_Y(0x305c, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_gpio_sw1_pff1 DRV_PINGROUP_ENTRY_Y(0x1004, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sw2_pff2 DRV_PINGROUP_ENTRY_Y(0x100c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sw3_pff3 DRV_PINGROUP_ENTRY_Y(0x1014, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sw4_pff4 DRV_PINGROUP_ENTRY_Y(0x101c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_shutdown DRV_PINGROUP_ENTRY_Y(0x1024, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_pmu_int DRV_PINGROUP_ENTRY_Y(0x102C, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_safe_state_ps3 DRV_PINGROUP_ENTRY_Y(0x1034, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_vcomp_alert_ps4 DRV_PINGROUP_ENTRY_Y(0x103c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_soc_pwr_req DRV_PINGROUP_ENTRY_Y(0x1044, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_batt_oc_ps2 DRV_PINGROUP_ENTRY_Y(0x104c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_clk_32k_in DRV_PINGROUP_ENTRY_Y(0x1054, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_power_on_pff0 DRV_PINGROUP_ENTRY_Y(0x105c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_pwr_i2c_scl_ps0 DRV_PINGROUP_ENTRY_Y(0x1064, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_pwr_i2c_sda_ps1 DRV_PINGROUP_ENTRY_Y(0x106c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis0_pu0 DRV_PINGROUP_ENTRY_Y(0x1084, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis1_pu1 DRV_PINGROUP_ENTRY_Y(0x108c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis2_pu2 DRV_PINGROUP_ENTRY_Y(0x1094, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis3_pu3 DRV_PINGROUP_ENTRY_Y(0x109c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis4_pu4 DRV_PINGROUP_ENTRY_Y(0x10a4, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis5_pu5 DRV_PINGROUP_ENTRY_Y(0x10ac, 12, 5, 20, 5, -1, -1, -1, -1, 1) + +#define PINGROUP(pg_name, f0, f1, f2, f3, r, bank, pupd, e_io_hv, e_lpbk, e_input, e_lpdr, e_pbias_buf, \ + gpio_sfio_sel, e_od, schmitt_b, drvtype, epreemp, io_reset, rfu_in) \ + { \ + .name = #pg_name, \ + .pins = pg_name##_pins, \ + .npins = ARRAY_SIZE(pg_name##_pins), \ + .funcs = { \ + TEGRA_MUX_##f0, \ + TEGRA_MUX_##f1, \ + TEGRA_MUX_##f2, \ + TEGRA_MUX_##f3, \ + }, \ + PIN_PINGROUP_ENTRY_Y(r, bank, pupd, e_io_hv, e_lpbk, \ + e_input, e_lpdr, e_pbias_buf, \ + gpio_sfio_sel, e_od, \ + schmitt_b, drvtype, \ + epreemp, io_reset, \ + rfu_in) \ + drive_##pg_name, \ + } + +static const struct tegra_pingroup tegra186_groups[] = { + PINGROUP(gpio_aud3_pk0, RSVD0, DSPK1, SPDIF, RSVD3, 0x1000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_aud2_pj7, RSVD0, DSPK1, SPDIF, RSVD3, 0x1008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_aud1_pj6, RSVD0, RSVD1, RSVD2, RSVD3, 0x1010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_aud0_pj5, RSVD0, RSVD1, RSVD2, RSVD3, 
0x1018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(aud_mclk_pj4, AUD, RSVD1, RSVD2, RSVD3, 0x1020, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap1_fs_pj3, I2S1, RSVD1, RSVD2, RSVD3, 0x1028, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap1_din_pj2, I2S1, RSVD1, RSVD2, RSVD3, 0x1030, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap1_dout_pj1, I2S1, RSVD1, RSVD2, RSVD3, 0x1038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap1_sclk_pj0, I2S1, RSVD1, RSVD2, RSVD3, 0x1040, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dmic1_clk_pm1, DMIC1, I2S3, RSVD2, RSVD3, 0x2000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dmic1_dat_pm0, DMIC1, I2S3, RSVD2, RSVD3, 0x2008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dmic2_dat_pm2, DMIC2, I2S3, RSVD2, RSVD3, 0x2010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dmic2_clk_pm3, DMIC2, I2S3, RSVD2, RSVD3, 0x2018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dmic4_dat_pm4, DMIC4, DSPK0, RSVD2, RSVD3, 0x2020, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dmic4_clk_pm5, DMIC4, DSPK0, RSVD2, RSVD3, 0x2028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dap4_fs_pcc3, I2S4, RSVD1, RSVD2, RSVD3, 0x2030, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dap4_din_pcc2, I2S4, RSVD1, RSVD2, RSVD3, 0x2038, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dap4_dout_pcc1, I2S4, RSVD1, RSVD2, RSVD3, 0x2040, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dap4_sclk_pcc0, I2S4, RSVD1, RSVD2, RSVD3, 0x2048, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(extperiph2_clk_po1, EXTPERIPH2, RSVD1, RSVD2, RSVD3, 0x0000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(extperiph1_clk_po0, EXTPERIPH1, RSVD1, RSVD2, RSVD3, 0x0008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(cam_i2c_sda_po3, I2C3, RSVD1, RSVD2, RSVD3, 0x0010, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(cam_i2c_scl_po2, I2C3, RSVD1, RSVD2, RSVD3, 0x0018, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam1_pn0, VGP1, RSVD1, RSVD2, RSVD3, 0x0020, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam2_pn1, VGP2, EXTPERIPH3, RSVD2, RSVD3, 0x0028, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam3_pn2, VGP3, EXTPERIPH4, RSVD2, RSVD3, 0x0030, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam4_pn3, VGP4, SPI4, RSVD2, RSVD3, 0x0038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam5_pn4, VGP5, SPI4, RSVD2, RSVD3, 0x0040, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam6_pn5, VGP6, SPI4, RSVD2, RSVD3, 0x0048, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam7_pn6, RSVD0, SPI4, RSVD2, RSVD3, 0x0050, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap2_din_pc3, I2S2, RSVD1, RSVD2, RSVD3, 0x4000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap2_dout_pc2, I2S2, RSVD1, RSVD2, RSVD3, 0x4008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap2_fs_pc4, I2S2, RSVD1, RSVD2, RSVD3, 0x4010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap2_sclk_pc1, I2S2, RSVD1, RSVD2, RSVD3, 0x4018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart4_cts_pb3, UARTD, RSVD1, RSVD2, RSVD3, 0x4020, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, 
N), + PINGROUP(uart4_rts_pb2, UARTD, RSVD1, RSVD2, RSVD3, 0x4028, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart4_rx_pb1, UARTD, RSVD1, RSVD2, RSVD3, 0x4030, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart4_tx_pb0, UARTD, RSVD1, RSVD2, RSVD3, 0x4038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan4_pc0, RSVD0, RSVD1, RSVD2, RSVD3, 0x4040, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan3_pb6, RSVD0, RSVD1, RSVD2, RSVD3, 0x4048, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan2_pb5, RSVD0, RSVD1, RSVD2, RSVD3, 0x4050, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan1_pb4, RSVD0, RSVD1, RSVD2, RSVD3, 0x4058, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen1_i2c_scl_pc5, I2C1, RSVD1, RSVD2, RSVD3, 0x4060, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen1_i2c_sda_pc6, I2C1, RSVD1, RSVD2, RSVD3, 0x4068, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart1_cts_pt3, UARTA, RSVD1, RSVD2, RSVD3, 0x5000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart1_rts_pt2, UARTA, RSVD1, RSVD2, RSVD3, 0x5008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart1_rx_pt1, UARTA, RSVD1, RSVD2, RSVD3, 0x5010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart1_tx_pt0, UARTA, RSVD1, RSVD2, RSVD3, 0x5018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(directdc1_out3_pq5, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc1_out2_pq4, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5030, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc1_out1_pq3, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5038, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc1_out0_pq2, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5040, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc1_in_pq1, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5048, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc1_clk_pq0, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5050, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc_comp, DIRECTDC, RSVD1, RSVD2, RSVD3, 0x5058, 0, Y, -1, -1, -1, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq0_pi0, RSVD0, IQC0, I2S6, RSVD3, 0x3000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq1_pi1, RSVD0, IQC0, I2S6, RSVD3, 0x3008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq2_pi2, RSVD0, IQC0, I2S6, RSVD3, 0x3010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq3_pi3, RSVD0, IQC0, I2S6, RSVD3, 0x3018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq4_pi4, RSVD0, IQC1, DTV, RSVD3, 0x3020, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq5_pi5, RSVD0, IQC1, DTV, RSVD3, 0x3028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq6_pi6, RSVD0, IQC1, DTV, RSVD3, 0x3030, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq7_pi7, RSVD0, IQC1, DTV, RSVD3, 0x3038, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_edp2_pp5, RSVD0, UARTF, SDMMC3, RSVD3, 0x10000, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_edp3_pp6, RSVD0, UARTF, SDMMC1, RSVD3, 0x10008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_edp0_pp3, RSVD0, UARTF, SDMMC3, RSVD3, 0x10010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + 
PINGROUP(gpio_edp1_pp4, RSVD0, UARTF, SDMMC1, RSVD3, 0x10018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dp_aux_ch0_hpd_pp0, DP, RSVD1, RSVD2, RSVD3, 0x10020, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dp_aux_ch1_hpd_pp1, DP, RSVD1, RSVD2, RSVD3, 0x10028, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(hdmi_cec_pp2, HDMI, RSVD1, RSVD2, RSVD3, 0x10030, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l2_clkreq_n_pa6, PE2, GP, SATA, RSVD3, 0x7000, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_wake_n_pa2, PE, RSVD1, RSVD2, RSVD3, 0x7008, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l1_clkreq_n_pa4, PE1, RSVD1, RSVD2, RSVD3, 0x7010, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l1_rst_n_pa3, PE1, RSVD1, RSVD2, RSVD3, 0x7018, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l0_clkreq_n_pa1, PE0, RSVD1, RSVD2, RSVD3, 0x7020, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l0_rst_n_pa0, PE0, RSVD1, RSVD2, RSVD3, 0x7028, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l2_rst_n_pa5, PE2, SOC, SATA, RSVD3, 0x7030, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(sdmmc1_clk_pd0, SDMMC1, RSVD1, RSVD2, RSVD3, 0x8000, 0, Y, 5, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc1_cmd_pd1, SDMMC1, RSVD1, RSVD2, RSVD3, 0x8008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc1_comp, SDMMC1, RSVD1, RSVD2, RSVD3, 0x8010, 0, Y, -1, -1, 6, -1, -1, -1, -1, -1, N, -1, -1, N), + PINGROUP(sdmmc1_dat3_pd5, SDMMC1, RSVD1, RSVD2, RSVD3, 0x8014, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc1_dat2_pd4, SDMMC1, RSVD1, RSVD2, RSVD3, 0x801c, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc1_dat1_pd3, SDMMC1, RSVD1, RSVD2, RSVD3, 0x8024, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc1_dat0_pd2, SDMMC1, RSVD1, RSVD2, RSVD3, 0x802c, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_td3_pe4, EQOS, SDMMC2, RSVD2, RSVD3, 0x9000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_td2_pe3, EQOS, SDMMC2, RSVD2, RSVD3, 0x9008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_td1_pe2, EQOS, SDMMC2, RSVD2, RSVD3, 0x9010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_td0_pe1, EQOS, SDMMC2, RSVD2, RSVD3, 0x9018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rd3_pf1, EQOS, SDMMC2, RSVD2, RSVD3, 0x9020, 0, Y, -1, 5, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rd2_pf0, EQOS, SDMMC2, RSVD2, RSVD3, 0x9028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rd1_pe7, EQOS, SDMMC2, RSVD2, RSVD3, 0x9030, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_mdio_pf4, EQOS, SOC, RSVD2, RSVD3, 0x9038, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rd0_pe6, EQOS, SDMMC2, RSVD2, RSVD3, 0x9040, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_mdc_pf5, EQOS, RSVD1, RSVD2, RSVD3, 0x9048, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_comp, EQOS, SDMMC2, RSVD2, RSVD3, 0x9050, 0, Y, -1, -1, -1, -1, -1, -1, -1, -1, N, -1, -1, N), + PINGROUP(eqos_txc_pe0, EQOS, SDMMC2, RSVD2, RSVD3, 0x9054, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rxc_pf3, EQOS, SDMMC2, RSVD2, RSVD3, 0x905c, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_tx_ctl_pe5, EQOS, SDMMC2, RSVD2, 
RSVD3, 0x9064, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rx_ctl_pf2, EQOS, SDMMC2, RSVD2, RSVD3, 0x906c, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_dat3_pg5, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_dat2_pg4, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_dat1_pg3, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_dat0_pg2, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_comp, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa020, 0, Y, -1, -1, -1, -1, -1, -1, -1, -1, N, -1, -1, N), + PINGROUP(sdmmc3_cmd_pg1, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa024, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_clk_pg0, SDMMC3, RSVD1, RSVD1, RSVD3, 0xa02c, 0, Y, -1, 5, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_clk, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6004, 0, Y, -1, 5, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_cmd, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6008, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dqs, SDMMC4, RSVD1, RSVD2, RSVD3, 0x600c, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat7, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6010, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat6, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6014, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat5, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6018, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat4, SDMMC4, RSVD1, RSVD2, RSVD3, 0x601c, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat3, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6020, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat2, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6024, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat1, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6028, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat0, SDMMC4, RSVD1, RSVD2, RSVD3, 0x602c, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(qspi_io3_pr4, QSPI, RSVD1, RSVD2, RSVD3, 0xB000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_io2_pr3, QSPI, RSVD1, RSVD2, RSVD3, 0xB008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_io1_pr2, QSPI, RSVD1, RSVD2, RSVD3, 0xB010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_io0_pr1, QSPI, RSVD1, RSVD2, RSVD3, 0xB018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_sck_pr0, QSPI, RSVD1, RSVD2, RSVD3, 0xB020, 0, Y, -1, 5, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_cs_n_pr5, QSPI, RSVD1, RSVD2, RSVD3, 0xB028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_comp, QSPI, RSVD1, RSVD2, RSVD3, 0xB030, 0, Y, -1, -1, -1, -1, -1, -1, -1, -1, Y, -1, -1, Y), + PINGROUP(gpio_wan8_ph3, RSVD0, RSVD1, SPI1, RSVD3, 0xd000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan7_ph2, RSVD0, RSVD1, SPI1, RSVD3, 0xd008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan6_ph1, RSVD0, RSVD1, SPI1, RSVD3, 0xd010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan5_ph0, RSVD0, RSVD1, SPI1, RSVD3, 0xd018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart2_tx_px0, UARTB, RSVD1, RSVD2, RSVD3, 0xd020, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + 
PINGROUP(uart2_rx_px1, UARTB, RSVD1, RSVD2, RSVD3, 0xd028, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart2_rts_px2, UARTB, RSVD1, RSVD2, RSVD3, 0xd030, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart2_cts_px3, UARTB, RSVD1, RSVD2, RSVD3, 0xd038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart5_rx_px5, UARTE, SPI3, GP, RSVD3, 0xd040, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart5_tx_px4, UARTE, SPI3, NV, RSVD3, 0xd048, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart5_rts_px6, UARTE, SPI3, RSVD2, RSVD3, 0xd050, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart5_cts_px7, UARTE, SPI3, RSVD2, RSVD3, 0xd058, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm1_py0, RSVD0, RSVD1, RSVD2, RSVD3, 0xd060, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm2_py1, RSVD0, RSVD1, RSVD2, RSVD3, 0xd068, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm3_py2, RSVD0, RSVD1, RSVD2, RSVD3, 0xd070, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm4_py3, RSVD0, SPI1, CCLA, RSVD3, 0xd078, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm5_py4, RSVD0, SPI1, RSVD2, RSVD3, 0xd080, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm6_py5, SOC, RSVD1, RSVD2, RSVD3, 0xd088, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm7_py6, RSVD0, RSVD1, RSVD2, RSVD3, 0xd090, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(bcpu_pwr_req_ph4, RSVD0, RSVD1, RSVD2, RSVD3, 0xd098, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(mcpu_pwr_req_ph5, RSVD0, RSVD1, RSVD2, RSVD3, 0xd0a0, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpu_pwr_req_ph6, RSVD0, RSVD1, RSVD2, RSVD3, 0xd0a8, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen7_i2c_scl_pl0, I2C7, I2S5, RSVD2, RSVD3, 0xd0b0, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen7_i2c_sda_pl1, I2C7, I2S5, RSVD2, RSVD3, 0xd0b8, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen9_i2c_sda_pl3, I2C9, I2S5, RSVD2, RSVD3, 0xd0c0, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen9_i2c_scl_pl2, I2C9, I2S5, RSVD2, RSVD3, 0xd0c8, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(usb_vbus_en0_pl4, USB, RSVD1, RSVD2, RSVD3, 0xd0d0, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(usb_vbus_en1_pl5, USB, RSVD1, RSVD2, RSVD3, 0xd0d8, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gp_pwm7_pl7, GP, RSVD1, RSVD2, RSVD3, 0xd0e0, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gp_pwm6_pl6, GP, RSVD1, RSVD2, RSVD3, 0xd0e8, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(ufs0_rst_pbb1, UFS0, RSVD1, RSVD2, RSVD3, 0x11000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(ufs0_ref_clk_pbb0, UFS0, RSVD1, RSVD2, RSVD3, 0x11008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), +}; + +static const struct tegra_pinctrl_soc_data tegra186_pinctrl = { + .pins = tegra186_pins, + .npins = ARRAY_SIZE(tegra186_pins), + .functions = tegra186_functions, + .nfunctions = ARRAY_SIZE(tegra186_functions), + .groups = tegra186_groups, + .ngroups = ARRAY_SIZE(tegra186_groups), + .hsm_in_mux = false, + .schmitt_in_mux = true, + .drvtype_in_mux = true, + .sfsel_in_mux = true, +}; + +static const struct pinctrl_pin_desc tegra186_aon_pins[] = { + PINCTRL_PIN(TEGRA_PIN_PWR_I2C_SCL_PS0, "PWR_I2C_SCL_PS0"), + 
PINCTRL_PIN(TEGRA_PIN_PWR_I2C_SDA_PS1, "PWR_I2C_SDA_PS1"), + PINCTRL_PIN(TEGRA_PIN_BATT_OC_PS2, "BATT_OC_PS2"), + PINCTRL_PIN(TEGRA_PIN_SAFE_STATE_PS3, "SAFE_STATE_PS3"), + PINCTRL_PIN(TEGRA_PIN_VCOMP_ALERT_PS4, "VCOMP_ALERT_PS4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS0_PU0, "GPIO_DIS0_PU0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS1_PU1, "GPIO_DIS1_PU1"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS2_PU2, "GPIO_DIS2_PU2"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS3_PU3, "GPIO_DIS3_PU3"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS4_PU4, "GPIO_DIS4_PU4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS5_PU5, "GPIO_DIS5_PU5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN0_PV0, "GPIO_SEN0_PV0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN1_PV1, "GPIO_SEN1_PV1"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN2_PV2, "GPIO_SEN2_PV2"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN3_PV3, "GPIO_SEN3_PV3"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN4_PV4, "GPIO_SEN4_PV4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN5_PV5, "GPIO_SEN5_PV5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN6_PV6, "GPIO_SEN6_PV6"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN7_PV7, "GPIO_SEN7_PV7"), + PINCTRL_PIN(TEGRA_PIN_GEN8_I2C_SCL_PW0, "GEN8_I2C_SCL_PW0"), + PINCTRL_PIN(TEGRA_PIN_GEN8_I2C_SDA_PW1, "GEN8_I2C_SDA_PW1"), + PINCTRL_PIN(TEGRA_PIN_UART3_TX_PW2, "UART3_TX_PW2"), + PINCTRL_PIN(TEGRA_PIN_UART3_RX_PW3, "UART3_RX_PW3"), + PINCTRL_PIN(TEGRA_PIN_UART3_RTS_PW4, "UART3_RTS_PW4"), + PINCTRL_PIN(TEGRA_PIN_UART3_CTS_PW5, "UART3_CTS_PW5"), + PINCTRL_PIN(TEGRA_PIN_UART7_TX_PW6, "UART7_TX_PW6"), + PINCTRL_PIN(TEGRA_PIN_UART7_RX_PW7, "UART7_RX_PW7"), + PINCTRL_PIN(TEGRA_PIN_CAN1_DOUT_PZ0, "CAN1_DOUT_PZ0"), + PINCTRL_PIN(TEGRA_PIN_CAN1_DIN_PZ1, "CAN1_DIN_PZ1"), + PINCTRL_PIN(TEGRA_PIN_CAN0_DOUT_PZ2, "CAN0_DOUT_PZ2"), + PINCTRL_PIN(TEGRA_PIN_CAN0_DIN_PZ3, "CAN0_DIN_PZ3"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO0_PAA0, "CAN_GPIO0_PAA0"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO1_PAA1, "CAN_GPIO1_PAA1"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO2_PAA2, "CAN_GPIO2_PAA2"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO3_PAA3, "CAN_GPIO3_PAA3"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO4_PAA4, "CAN_GPIO4_PAA4"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO5_PAA5, "CAN_GPIO5_PAA5"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO6_PAA6, "CAN_GPIO6_PAA6"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO7_PAA7, "CAN_GPIO7_PAA7"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN8_PEE0, "GPIO_SEN8_PEE0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN9_PEE1, "GPIO_SEN9_PEE1"), + PINCTRL_PIN(TEGRA_PIN_TOUCH_CLK_PEE2, "TOUCH_CLK_PEE2"), + PINCTRL_PIN(TEGRA_PIN_POWER_ON_PFF0, "POWER_ON_PFF0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SW1_PFF1, "GPIO_SW1_PFF1"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SW2_PFF2, "GPIO_SW2_PFF2"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SW3_PFF3, "GPIO_SW3_PFF3"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SW4_PFF4, "GPIO_SW4_PFF4"), + PINCTRL_PIN(TEGRA_PIN_SHUTDOWN, "SHUTDOWN"), + PINCTRL_PIN(TEGRA_PIN_PMU_INT, "PMU_INT"), + PINCTRL_PIN(TEGRA_PIN_SOC_PWR_REQ, "SOC_PWR_REQ"), + PINCTRL_PIN(TEGRA_PIN_CLK_32K_IN, "CLK_32K_IN"), +}; + +static const struct tegra_pingroup tegra186_aon_groups[] = { + PINGROUP(touch_clk_pee2, TOUCH, RSVD1, RSVD2, RSVD3, 0x2000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart3_cts_pw5, UARTC, RSVD1, RSVD2, RSVD3, 0x2008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart3_rts_pw4, UARTC, RSVD1, RSVD2, RSVD3, 0x2010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart3_rx_pw3, UARTC, RSVD1, RSVD2, RSVD3, 0x2018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart3_tx_pw2, UARTC, RSVD1, RSVD2, RSVD3, 0x2020, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen8_i2c_sda_pw1, I2C8, RSVD1, RSVD2, RSVD3, 0x2028, 0, Y, 5, -1, 
6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen8_i2c_scl_pw0, I2C8, RSVD1, RSVD2, RSVD3, 0x2030, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart7_rx_pw7, UARTG, RSVD1, RSVD2, RSVD3, 0x2038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart7_tx_pw6, UARTG, RSVD1, RSVD2, RSVD3, 0x2040, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen0_pv0, RSVD0, RSVD1, RSVD2, RSVD3, 0x2048, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen1_pv1, SPI2, RSVD1, RSVD2, RSVD3, 0x2050, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen2_pv2, SPI2, RSVD1, RSVD2, RSVD3, 0x2058, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen3_pv3, SPI2, RSVD1, RSVD2, RSVD3, 0x2060, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen4_pv4, SPI2, RSVD1, RSVD2, RSVD3, 0x2068, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen5_pv5, RSVD0, RSVD1, RSVD2, RSVD3, 0x2070, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen6_pv6, RSVD0, GP, RSVD2, RSVD3, 0x2078, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen7_pv7, RSVD0, WDT, RSVD2, RSVD3, 0x2080, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen8_pee0, RSVD0, I2C2, RSVD2, RSVD3, 0x2088, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen9_pee1, RSVD0, I2C2, RSVD2, RSVD3, 0x2090, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(can_gpio7_paa7, RSVD0, WDT, RSVD2, RSVD3, 0x3000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can1_dout_pz0, CAN1, RSVD1, RSVD2, RSVD3, 0x3008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can1_din_pz1, CAN1, RSVD1, RSVD2, RSVD3, 0x3010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can0_dout_pz2, CAN0, RSVD1, RSVD2, RSVD3, 0x3018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can0_din_pz3, CAN0, RSVD1, RSVD2, RSVD3, 0x3020, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio0_paa0, RSVD0, DMIC3, DMIC5, RSVD3, 0x3028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio1_paa1, RSVD0, DMIC3, DMIC5, RSVD3, 0x3030, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio2_paa2, GPIO, RSVD1, RSVD2, RSVD3, 0x3038, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio3_paa3, RSVD0, RSVD1, RSVD2, RSVD3, 0x3040, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio4_paa4, RSVD0, RSVD1, RSVD2, RSVD3, 0x3048, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio5_paa5, RSVD0, RSVD1, RSVD2, RSVD3, 0x3050, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio6_paa6, RSVD0, RSVD1, RSVD2, RSVD3, 0x3058, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_sw1_pff1, RSVD0, RSVD1, RSVD2, RSVD3, 0x1000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sw2_pff2, RSVD0, RSVD1, RSVD2, RSVD3, 0x1008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sw3_pff3, RSVD0, RSVD1, RSVD2, RSVD3, 0x1010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sw4_pff4, RSVD0, RSVD1, RSVD2, RSVD3, 0x1018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(shutdown, RSVD0, RSVD1, RSVD2, RSVD3, 0x1020, 0, Y, -1, -1, 6, 8, -1, -1, -1, 12, N, -1, -1, N), + PINGROUP(pmu_int, RSVD0, RSVD1, RSVD2, RSVD3, 0x1028, 0, Y, -1, -1, 6, 8, -1, -1, -1, 12, N, -1, -1, N), + PINGROUP(safe_state_ps3, SCE, RSVD1, 
RSVD2, RSVD3, 0x1030, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(vcomp_alert_ps4, SOC, RSVD1, RSVD2, RSVD3, 0x1038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(soc_pwr_req, RSVD0, RSVD1, RSVD2, RSVD3, 0x1040, 0, Y, -1, -1, 6, 8, -1, -1, -1, 12, N, -1, -1, N), + PINGROUP(batt_oc_ps2, SOC, RSVD1, RSVD2, RSVD3, 0x1048, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(clk_32k_in, RSVD0, RSVD1, RSVD2, RSVD3, 0x1050, 0, Y, -1, -1, 6, 8, -1, -1, -1, -1, N, -1, -1, N), + PINGROUP(power_on_pff0, RSVD0, RSVD1, RSVD2, RSVD3, 0x1058, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pwr_i2c_scl_ps0, I2C5, RSVD1, RSVD2, RSVD3, 0x1060, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pwr_i2c_sda_ps1, I2C5, RSVD1, RSVD2, RSVD3, 0x1068, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis0_pu0, RSVD0, GP, DCB, DCC, 0x1080, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis1_pu1, RSVD0, RSVD1, DISPLAYA, RSVD3, 0x1088, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis2_pu2, RSVD0, GP, DCA, RSVD3, 0x1090, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis3_pu3, RSVD0, RSVD1, DISPLAYB, DCC, 0x1098, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis4_pu4, RSVD0, SOC, DCA, RSVD3, 0x10a0, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis5_pu5, RSVD0, GP, DCC, DCB, 0x10a8, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), +}; + +static const struct tegra_pinctrl_soc_data tegra186_pinctrl_aon = { + .pins = tegra186_aon_pins, + .npins = ARRAY_SIZE(tegra186_aon_pins), + .functions = tegra186_functions, + .nfunctions = ARRAY_SIZE(tegra186_functions), + .groups = tegra186_aon_groups, + .ngroups = ARRAY_SIZE(tegra186_aon_groups), + .hsm_in_mux = false, + .schmitt_in_mux = true, + .drvtype_in_mux = true, + .sfsel_in_mux = true, +}; + +static int tegra186_pinctrl_probe(struct platform_device *pdev) +{ + const struct tegra_pinctrl_soc_data *soc = of_device_get_match_data(&pdev->dev); + + return tegra_pinctrl_probe(pdev, soc); +} + +static const struct of_device_id tegra186_pinctrl_of_match[] = { + { .compatible = "nvidia,tegra186-pinmux", .data = &tegra186_pinctrl }, + { .compatible = "nvidia,tegra186-pinmux-aon", .data = &tegra186_pinctrl_aon }, + { }, +}; + +static struct platform_driver tegra186_pinctrl_driver = { + .driver = { + .name = "tegra186-pinctrl", + .of_match_table = tegra186_pinctrl_of_match, + }, + .probe = tegra186_pinctrl_probe, +}; + +static int __init tegra186_pinctrl_init(void) +{ + return platform_driver_register(&tegra186_pinctrl_driver); +} +arch_initcall(tegra186_pinctrl_init); diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig index 9392c2c43cc80e..c0fc54c3cd35e4 100644 --- a/drivers/soc/tegra/Kconfig +++ b/drivers/soc/tegra/Kconfig @@ -96,6 +96,7 @@ config ARCH_TEGRA_210_SOC config ARCH_TEGRA_186_SOC bool "NVIDIA Tegra186 SoC" depends on !CPU_BIG_ENDIAN + select PINCTRL_TEGRA186 select MAILBOX select SOC_TEGRA_PMC help From 76196742f49ed5e6b12804c8777c4c11a173289e Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Mon, 14 Jul 2025 12:46:37 +0000 Subject: [PATCH 0267/3206] pid: add Rust files to MAINTAINERS This file is maintained by Christian Brauner, thus add it to the relevant MAINTAINERS entry.
Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/20250714124637.1905722-2-aliceryhl@google.com Acked-by: Miguel Ojeda Signed-off-by: Christian Brauner --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..5f423e08071875 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19834,6 +19834,7 @@ M: Christian Brauner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git +F: rust/kernel/pid_namespace.rs F: samples/pidfd/ F: tools/testing/selftests/clone3/ F: tools/testing/selftests/pid_namespace/ From 73861970938ad1323eb02bbbc87f6fbd1e5bacca Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 13 Aug 2025 00:17:44 +0900 Subject: [PATCH 0268/3206] minixfs: Verify inode mode when loading from disk The inode mode loaded from corrupted disk can be invalid. Do like what commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") does. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d Signed-off-by: Tetsuo Handa Link: https://lore.kernel.org/ec982681-84b8-4624-94fa-8af15b77cbd2@I-love.SAKURA.ne.jp Signed-off-by: Christian Brauner --- fs/minix/inode.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index df9d11479caf1e..32db676127a9ed 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -492,8 +492,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) inode->i_op = &minix_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &minix_aops; - } else + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { init_special_inode(inode, inode->i_mode, rdev); + } else { + printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + make_bad_inode(inode); + } } /* From 96d6bc84742f9e08b70bd1193da32cf0f09a0058 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Tue, 12 Aug 2025 15:54:44 +0800 Subject: [PATCH 0269/3206] pinctrl: equilibrium: Remove redundant semicolons Remove unnecessary semicolons. Fixes: 1948d5c51dba4 ("pinctrl: Add pinmux & GPIO controller driver for a new SoC") Signed-off-by: Liao Yuanhong Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/20250812075444.8310-1-liaoyuanhong@vivo.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-equilibrium.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-equilibrium.c b/drivers/pinctrl/pinctrl-equilibrium.c index 21004418567938..01731d51a7c43e 100644 --- a/drivers/pinctrl/pinctrl-equilibrium.c +++ b/drivers/pinctrl/pinctrl-equilibrium.c @@ -445,7 +445,7 @@ static int eqbr_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, } raw_spin_unlock_irqrestore(&pctl->lock, flags); *config = pinconf_to_config_packed(param, val); -; + return 0; } From 006568ab4c5ca2309ceb36fa553e390b4aa9c0c7 Mon Sep 17 00:00:00 2001 From: gaoxiang17 Date: Sat, 2 Aug 2025 10:21:23 +0800 Subject: [PATCH 0270/3206] pid: Add a judgment for ns null in pid_nr_ns __task_pid_nr_ns ns = task_active_pid_ns(current); pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); if (pid && ns->level <= pid->level) { Sometimes NULL is returned by task_active_pid_ns(). Then it will trigger a kernel panic in pid_nr_ns().
For example: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 Mem abort info: ESR = 0x0000000096000007 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x07: level 3 translation fault Data abort info: ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 39-bit VAs, pgdp=00000002175aa000 [0000000000000058] pgd=08000002175ab003, p4d=08000002175ab003, pud=08000002175ab003, pmd=08000002175be003, pte=0000000000000000 pstate: 834000c5 (Nzcv daIF +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : __task_pid_nr_ns+0x74/0xd0 lr : __task_pid_nr_ns+0x24/0xd0 sp : ffffffc08001bd10 x29: ffffffc08001bd10 x28: ffffffd4422b2000 x27: 0000000000000001 x26: ffffffd442821168 x25: ffffffd442821000 x24: 00000f89492eab31 x23: 00000000000000c0 x22: ffffff806f5693c0 x21: ffffff806f5693c0 x20: 0000000000000001 x19: 0000000000000000 x18: 0000000000000000 x17: 00000000529c6ef0 x16: 00000000529c6ef0 x15: 00000000023a1adc x14: 0000000000000003 x13: 00000000007ef6d8 x12: 001167c391c78800 x11: 00ffffffffffffff x10: 0000000000000000 x9 : 0000000000000001 x8 : ffffff80816fa3c0 x7 : 0000000000000000 x6 : 49534d702d535449 x5 : ffffffc080c4c2c0 x4 : ffffffd43ee128c8 x3 : ffffffd43ee124dc x2 : 0000000000000000 x1 : 0000000000000001 x0 : ffffff806f5693c0 Call trace: __task_pid_nr_ns+0x74/0xd0 ... __handle_irq_event_percpu+0xd4/0x284 handle_irq_event+0x48/0xb0 handle_fasteoi_irq+0x160/0x2d8 generic_handle_domain_irq+0x44/0x60 gic_handle_irq+0x4c/0x114 call_on_irq_stack+0x3c/0x74 do_interrupt_handler+0x4c/0x84 el1_interrupt+0x34/0x58 el1h_64_irq_handler+0x18/0x24 el1h_64_irq+0x68/0x6c account_kernel_stack+0x60/0x144 exit_task_stack_account+0x1c/0x80 do_exit+0x7e4/0xaf8 ... get_signal+0x7bc/0x8d8 do_notify_resume+0x128/0x828 el0_svc+0x6c/0x70 el0t_64_sync_handler+0x68/0xbc el0t_64_sync+0x1a8/0x1ac Code: 35fffe54 911a02a8 f9400108 b4000128 (b9405a69) ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Oops: Fatal exception in interrupt Signed-off-by: gaoxiang17 Link: https://lore.kernel.org/20250802022123.3536934-1-gxxa03070307@gmail.com Reviewed-by: Baoquan He Signed-off-by: Christian Brauner --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/pid.c b/kernel/pid.c index c45a28c16cd256..14e908f2f0cbfb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -491,7 +491,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) struct upid *upid; pid_t nr = 0; - if (pid && ns->level <= pid->level) { + if (pid && ns && ns->level <= pid->level) { upid = &pid->numbers[ns->level]; if (upid->ns == ns) nr = upid->nr; From abdfd4948e45c51b19162cf8b3f5003f8f53c9b9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 10 Aug 2025 19:36:04 +0200 Subject: [PATCH 0271/3206] pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit task_pid_vnr(another_task) will crash if the caller was already reaped. The pid_alive(current) check can't really help, the parent/debugger can call release_task() right after this check. This also means that even task_ppid_nr_ns(current, NULL) is not safe, pid_alive() only ensures that it is safe to dereference ->real_parent. Change __task_pid_nr_ns() to ensure ns != NULL. 
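For illustration, a minimal sketch of the call pattern being hardened (hypothetical helper, not part of this patch):

	/* Hypothetical caller, for illustration only: */
	static pid_t sketch_show_pid(struct task_struct *task)
	{
		/*
		 * task_pid_vnr() expands to __task_pid_nr_ns(task, PIDTYPE_PID, NULL),
		 * so the namespace comes from task_active_pid_ns(current). If current
		 * was already reaped, that lookup yields NULL and pid_nr_ns() used to
		 * dereference ns->level; with this change a NULL ns simply makes
		 * __task_pid_nr_ns() return 0 instead.
		 */
		return task_pid_vnr(task);
	}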
Originally-by: 高翔 Link: https://lore.kernel.org/all/20250802022123.3536934-1-gxxa03070307@gmail.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/20250810173604.GA19991@redhat.com Signed-off-by: Christian Brauner --- kernel/pid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/pid.c b/kernel/pid.c index 14e908f2f0cbfb..f62a7df2f04cf7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -514,7 +514,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (ns) + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; From b1afcaddd6c8475ee346a60525f9504965673e0c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 10 Aug 2025 19:36:15 +0200 Subject: [PATCH 0272/3206] pid: change bacct_add_tsk() to use task_ppid_nr_ns() to simplify the code. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/20250810173615.GA20000@redhat.com Signed-off-by: Christian Brauner --- kernel/tsacct.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 16b283f9d83141..6ea2f6363b9089 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -57,12 +57,11 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); + stats->ac_ppid = task_ppid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); - stats->ac_ppid = pid_alive(tsk) ? - task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); task_cputime(tsk, &utime, &stime); From d00f5232851c4895db8f0228881c31608feaab30 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 10 Aug 2025 19:36:20 +0200 Subject: [PATCH 0273/3206] pid: change task_state() to use task_ppid_nr_ns() to simplify the code. Note that only tpid and max_fds really need rcu_read_lock(), we could move task_ppid_nr_ns/task_tgid_nr_ns/task_numa_group_id/get_task_cred outside of rcu read section. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/20250810173620.GA20007@redhat.com Signed-off-by: Christian Brauner --- fs/proc/array.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index d6a0369caa931e..69269745d73b8f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -157,13 +157,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, unsigned int max_fds = 0; rcu_read_lock(); - ppid = pid_alive(p) ? - task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; - tracer = ptrace_parent(p); if (tracer) tpid = task_pid_nr_ns(tracer, ns); + ppid = task_ppid_nr_ns(p, ns); tgid = task_tgid_nr_ns(p, ns); ngid = task_numa_group_id(p); cred = get_task_cred(p); From c1dd310f1d76b4b13f1854618087af2513140897 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 19 Aug 2025 12:02:38 +0800 Subject: [PATCH 0274/3206] spi: SPISG: Use devm_kcalloc() in aml_spisg_clk_init() Replace calls of devm_kzalloc() with devm_kcalloc() in aml_spisg_clk_init() for safer memory allocation with built-in overflow protection, and replace sizeof(struct clk_div_table) with sizeof(*tbl) to shorten the line. 
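As a sketch of why the array form is safer (illustrative only, restating the hunk below):

	/* devm_kcalloc() returns NULL if n * size would overflow size_t: */
	tbl = devm_kcalloc(dev, DIV_NUM + 1, sizeof(*tbl), GFP_KERNEL);
	/* ...whereas the open-coded multiplication could silently wrap: */
	tbl = devm_kzalloc(dev, sizeof(struct clk_div_table) * (DIV_NUM + 1), GFP_KERNEL);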
Signed-off-by: Qianfeng Rong Link: https://patch.msgid.link/20250819040239.434863-1-rongqianfeng@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-amlogic-spisg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-amlogic-spisg.c b/drivers/spi/spi-amlogic-spisg.c index 2ab8bdf2a6761e..64244a0bd40868 100644 --- a/drivers/spi/spi-amlogic-spisg.c +++ b/drivers/spi/spi-amlogic-spisg.c @@ -662,7 +662,7 @@ static int aml_spisg_clk_init(struct spisg_device *spisg, void __iomem *base) clk_disable_unprepare(spisg->pclk); - tbl = devm_kzalloc(dev, sizeof(struct clk_div_table) * (DIV_NUM + 1), GFP_KERNEL); + tbl = devm_kcalloc(dev, (DIV_NUM + 1), sizeof(*tbl), GFP_KERNEL); if (!tbl) return -ENOMEM; From 2a5d410916d3a8dcac06f72494f252314e933cfb Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 19 Aug 2025 17:20:38 +0800 Subject: [PATCH 0275/3206] spi: spi_amd: Remove the use of dev_err_probe() The dev_err_probe() doesn't do anything when error is '-ENOMEM'. Therefore, remove the useless call to dev_err_probe(), and just return the value instead. Signed-off-by: Xichao Zhao Link: https://patch.msgid.link/20250819092044.549464-2-zhao.xichao@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-amd-pci.c | 5 ++--- drivers/spi/spi-amd.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-amd-pci.c b/drivers/spi/spi-amd-pci.c index e5faab414c1726..d48c3a5da303d9 100644 --- a/drivers/spi/spi-amd-pci.c +++ b/drivers/spi/spi-amd-pci.c @@ -38,7 +38,7 @@ static int amd_spi_pci_probe(struct pci_dev *pdev, /* Allocate storage for host and driver private data */ host = devm_spi_alloc_host(dev, sizeof(struct amd_spi)); if (!host) - return dev_err_probe(dev, -ENOMEM, "Error allocating SPI host\n"); + return -ENOMEM; amd_spi = spi_controller_get_devdata(host); @@ -47,8 +47,7 @@ static int amd_spi_pci_probe(struct pci_dev *pdev, amd_spi->io_remap_addr = devm_ioremap(dev, io_base_addr, AMD_HID2_MEM_SIZE); if (!amd_spi->io_remap_addr) - return dev_err_probe(dev, -ENOMEM, - "ioremap of SPI registers failed\n"); + return -ENOMEM; dev_dbg(dev, "io_remap_address: %p\n", amd_spi->io_remap_addr); diff --git a/drivers/spi/spi-amd.c b/drivers/spi/spi-amd.c index 02e7fe095a0b55..4d1dce4f497406 100644 --- a/drivers/spi/spi-amd.c +++ b/drivers/spi/spi-amd.c @@ -857,7 +857,7 @@ static int amd_spi_probe(struct platform_device *pdev) /* Allocate storage for host and driver private data */ host = devm_spi_alloc_host(dev, sizeof(struct amd_spi)); if (!host) - return dev_err_probe(dev, -ENOMEM, "Error allocating SPI host\n"); + return -ENOMEM; amd_spi = spi_controller_get_devdata(host); amd_spi->io_remap_addr = devm_platform_ioremap_resource(pdev, 0); From 0d00ebc6b869f4df67c05522bc1e8d01d1c7daa7 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 19 Aug 2025 17:20:39 +0800 Subject: [PATCH 0276/3206] spi: SPISG: Remove the use of dev_err_probe() The dev_err_probe() doesn't do anything when error is '-ENOMEM'. Therefore, remove the useless call to dev_err_probe(), and just return the value instead. 
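As a sketch of the equivalence (hypothetical call site; "priv" is a placeholder struct): since dev_err_probe() is a no-op for -ENOMEM, as noted above, the plain return loses no diagnostics:

	host = devm_spi_alloc_host(dev, sizeof(*priv));
	if (!host)
		return -ENOMEM;	/* dev_err_probe(dev, -ENOMEM, "...") would print nothing extra */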
Signed-off-by: Xichao Zhao Link: https://patch.msgid.link/20250819092044.549464-3-zhao.xichao@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-amlogic-spisg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-amlogic-spisg.c b/drivers/spi/spi-amlogic-spisg.c index 2ab8bdf2a6761e..d1d5f3114fa8dd 100644 --- a/drivers/spi/spi-amlogic-spisg.c +++ b/drivers/spi/spi-amlogic-spisg.c @@ -733,7 +733,7 @@ static int aml_spisg_probe(struct platform_device *pdev) else ctlr = spi_alloc_host(dev, sizeof(*spisg)); if (!ctlr) - return dev_err_probe(dev, -ENOMEM, "controller allocation failed\n"); + return -ENOMEM; spisg = spi_controller_get_devdata(ctlr); spisg->controller = ctlr; From 2aade32d1ffc5f91cc447ed6387c0f619fb980c3 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 19 Aug 2025 17:20:40 +0800 Subject: [PATCH 0277/3206] spi: Remove the use of dev_err_probe() The dev_err_probe() doesn't do anything when error is '-ENOMEM'. Therefore, remove the useless call to dev_err_probe(), and just return the value instead. Signed-off-by: Xichao Zhao Link: https://patch.msgid.link/20250819092044.549464-4-zhao.xichao@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-microchip-core-qspi.c | 3 +-- drivers/spi/spi-microchip-core.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-microchip-core-qspi.c b/drivers/spi/spi-microchip-core-qspi.c index d13a9b755c7f8a..0a6f65c77eac74 100644 --- a/drivers/spi/spi-microchip-core-qspi.c +++ b/drivers/spi/spi-microchip-core-qspi.c @@ -701,8 +701,7 @@ static int mchp_coreqspi_probe(struct platform_device *pdev) ctlr = devm_spi_alloc_host(&pdev->dev, sizeof(*qspi)); if (!ctlr) - return dev_err_probe(&pdev->dev, -ENOMEM, - "unable to allocate host for QSPI controller\n"); + return -ENOMEM; qspi = spi_controller_get_devdata(ctlr); platform_set_drvdata(pdev, qspi); diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c index 62ba0bd9cbb7e7..9128b86c536603 100644 --- a/drivers/spi/spi-microchip-core.c +++ b/drivers/spi/spi-microchip-core.c @@ -534,8 +534,7 @@ static int mchp_corespi_probe(struct platform_device *pdev) host = devm_spi_alloc_host(&pdev->dev, sizeof(*spi)); if (!host) - return dev_err_probe(&pdev->dev, -ENOMEM, - "unable to allocate host for SPI controller\n"); + return -ENOMEM; platform_set_drvdata(pdev, host); From 2bee48c9d1cd1749922d0e2df54330c924e14a0e Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 19 Aug 2025 17:20:41 +0800 Subject: [PATCH 0278/3206] spi: mt65xx: Remove the use of dev_err_probe() The dev_err_probe() doesn't do anything when error is '-ENOMEM'. Therefore, remove the useless call to dev_err_probe(), and just return the value instead. 
Signed-off-by: Xichao Zhao Link: https://patch.msgid.link/20250819092044.549464-5-zhao.xichao@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-mt65xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c index a6032d44771bfd..8a3c00c3af42a7 100644 --- a/drivers/spi/spi-mt65xx.c +++ b/drivers/spi/spi-mt65xx.c @@ -1159,7 +1159,7 @@ static int mtk_spi_probe(struct platform_device *pdev) host = devm_spi_alloc_host(dev, sizeof(*mdata)); if (!host) - return dev_err_probe(dev, -ENOMEM, "failed to alloc spi host\n"); + return -ENOMEM; host->auto_runtime_pm = true; host->dev.of_node = dev->of_node; From 67259af78219bdbdea00491f32b1c0971a33401e Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 19 Aug 2025 17:20:42 +0800 Subject: [PATCH 0279/3206] spi: pxa2xx: Remove the use of dev_err_probe() The dev_err_probe() doesn't do anything when error is '-ENOMEM'. Therefore, remove the useless call to dev_err_probe(), and just return the value instead. Signed-off-by: Xichao Zhao Link: https://patch.msgid.link/20250819092044.549464-6-zhao.xichao@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 06711a62fa3dca..ec7117a94d5f17 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -1283,7 +1283,7 @@ int pxa2xx_spi_probe(struct device *dev, struct ssp_device *ssp, else controller = devm_spi_alloc_host(dev, sizeof(*drv_data)); if (!controller) - return dev_err_probe(dev, -ENOMEM, "cannot alloc spi_controller\n"); + return -ENOMEM; drv_data = spi_controller_get_devdata(controller); drv_data->controller = controller; From 27848c082ba0b22850fd9fb7b185c015423dcdc7 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 19 Aug 2025 17:20:43 +0800 Subject: [PATCH 0280/3206] spi: s3c64xx: Remove the use of dev_err_probe() The dev_err_probe() doesn't do anything when error is '-ENOMEM'. Therefore, remove the useless call to dev_err_probe(), and just return the value instead. Signed-off-by: Xichao Zhao Reviewed-by: Tudor Ambarus Link: https://patch.msgid.link/20250819092044.549464-7-zhao.xichao@vivo.com Signed-off-by: Mark Brown --- drivers/spi/spi-s3c64xx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c index b1567243ae196a..3a00f9e480c5c2 100644 --- a/drivers/spi/spi-s3c64xx.c +++ b/drivers/spi/spi-s3c64xx.c @@ -1268,8 +1268,7 @@ static int s3c64xx_spi_probe(struct platform_device *pdev) host = devm_spi_alloc_host(&pdev->dev, sizeof(*sdd)); if (!host) - return dev_err_probe(&pdev->dev, -ENOMEM, - "Unable to allocate SPI Host\n"); + return -ENOMEM; platform_set_drvdata(pdev, host); From 9a96082f99453a3c0f5ea7a70d19a66dbfbd55bc Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Aug 2025 18:14:59 +0200 Subject: [PATCH 0281/3206] pinctrl: remove unneeded 'fast_io' parameter in regmap_config When using MMIO with regmap, fast_io is implied. No need to set it again. 
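For reference, a minimal sketch (hypothetical driver, not taken from this patch), assuming the regmap is registered with devm_regmap_init_mmio(): the MMIO regmap bus declares fast_io itself, so the core selects spinlock-based locking even without the flag in the config:

	static const struct regmap_config cfg = {
		.reg_bits = 32,
		.val_bits = 32,
		.reg_stride = 4,
		/* no .fast_io: implied when registering via devm_regmap_init_mmio() */
	};

	map = devm_regmap_init_mmio(dev, base, &cfg);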
Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/20250813161517.4746-14-wsa+renesas@sang-engineering.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rp1.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/pinctrl/pinctrl-rp1.c b/drivers/pinctrl/pinctrl-rp1.c index b231efc62ff366..605105aa0cbc7f 100644 --- a/drivers/pinctrl/pinctrl-rp1.c +++ b/drivers/pinctrl/pinctrl-rp1.c @@ -1690,7 +1690,6 @@ static const struct regmap_config rp1_pinctrl_gpio_regmap_cfg = { .reg_bits = 32, .val_bits = 32, .reg_stride = 4, - .fast_io = true, .rd_table = &rp1_gpio_reg_table, .name = "rp1-gpio", .max_register = 0xb11c, @@ -1700,7 +1699,6 @@ static const struct regmap_config rp1_pinctrl_rio_regmap_cfg = { .reg_bits = 32, .val_bits = 32, .reg_stride = 4, - .fast_io = true, .rd_table = &rp1_rio_reg_table, .name = "rp1-rio", .max_register = 0xb004, @@ -1710,7 +1708,6 @@ static const struct regmap_config rp1_pinctrl_pads_regmap_cfg = { .reg_bits = 32, .val_bits = 32, .reg_stride = 4, - .fast_io = true, .rd_table = &rp1_pads_reg_table, .name = "rp1-pads", .max_register = 0x8050, From 29a799917947d24c55e8902ef29f5cb52195682c Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 5 Aug 2025 09:47:39 +0200 Subject: [PATCH 0282/3206] dt-bindings: power: mediatek: Document access-controllers property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow specifying access-controllers in the main power controller node and deprecate the old mediatek,infracfg, mediatek,infracfg-nao and mediatek,smi properties located in the children. This is done to simplify the power controller nodes and in preparation for adding support for new generation SoCs like MT8196/MT6991 and other variants, which will need to set protection on new busses. Reviewed-by: Nícolas F. R. A. Prado Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250805074746.29457-4-angelogioacchino.delregno@collabora.com Signed-off-by: Ulf Hansson --- .../power/mediatek,power-controller.yaml | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml index 9c7cc632abee25..500d98921581a3 100644 --- a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml +++ b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml @@ -44,6 +44,15 @@ properties: '#size-cells': const: 0 + access-controllers: + description: + A number of phandles to external blocks to set and clear the required + bits to enable or disable bus protection, necessary to avoid any bus + faults while enabling or disabling a power domain. + For example, this may hold phandles to INFRACFG and SMI. + minItems: 1 + maxItems: 3 + patternProperties: "^power-domain@[0-9a-f]+$": $ref: "#/$defs/power-domain-node" @@ -123,14 +132,17 @@ $defs: mediatek,infracfg: $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the INFRACFG register range. + deprecated: true mediatek,infracfg-nao: $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the INFRACFG-NAO register range. + deprecated: true mediatek,smi: $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the SMI register range.
+ deprecated: true required: - reg @@ -138,6 +150,31 @@ $defs: required: - compatible +allOf: + - if: + properties: + compatible: + contains: + enum: + - mediatek,mt8183-power-controller + then: + properties: + access-controllers: + minItems: 2 + maxItems: 2 + + - if: + properties: + compatible: + contains: + enum: + - mediatek,mt8365-power-controller + then: + properties: + access-controllers: + minItems: 3 + maxItems: 3 + additionalProperties: false examples: From c29345fa5f66bea0790cf2219f57b974d4fc177b Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 5 Aug 2025 09:47:40 +0200 Subject: [PATCH 0283/3206] pmdomain: mediatek: Refactor bus protection regmaps retrieval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to add support for new generation SoCs like MT8196, MT6991 and other variants, which require to set bus protection on different busses than the ones found on legacy chips, and to also simplify and reduce memory footprint of this driver, refactor the mechanism to retrieve and use the bus protection regmaps. This is done by removing the three pointers to struct regmap from struct scpsys_domain (allocated for each power domain) and moving them to the main struct scpsys (allocated per driver instance) as an array of pointers to regmap named **bus_prot. That deprecates the old devicetree properties to grab phandles to the three predefined busses (infracfg, infracfg-nao and smi) and replaces it with the base property "access-controllers" that is meant to be an array of phandles holding the same busses where required (for now - for legacy SoCs). The new bus protection phandles are indexed by the bus_prot_index member of struct scpsys, used to map "bus type" (ex.: infra, smi, etc) to the specific *bus_prot[x] element. While the old per-power-domain regmap pointers were removed, the support for old devicetree was retained by still checking if the new property (in DT) and new-style declaration (in SoC specific platform data) are both present at probe time. If those are not present, a lookup for the old properties will be done in all of the children of the power controller, and pointers to regmaps will be retrieved with the old properties, but then will be internally remapped to follow the new style regmap anyway as to let this driver benefit of the memory footprint reduction. Finally, it was necessary to change macros in mtk-pm-domains.h and in mt8365-pm-domains.h to make use of the new style bus protection declaration, as the actual HW block is now recognized not by flags but by its own scpsys_bus_prot_block enumeration. The BUS_PROT_(STA)_COMPONENT_{INFRA,INFRA_NAO,SMI} flags were also removed since they are now unused, and because that enumeration was initially meant to vary the logic of bus protection and not the bus where work is performed, anyway! Reviewed-by: Nícolas F. R. A. 
Prado Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250805074746.29457-5-angelogioacchino.delregno@collabora.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/mediatek/mt8365-pm-domains.h | 8 +- drivers/pmdomain/mediatek/mtk-pm-domains.c | 188 ++++++++++++++---- drivers/pmdomain/mediatek/mtk-pm-domains.h | 53 +++-- 3 files changed, 187 insertions(+), 62 deletions(-) diff --git a/drivers/pmdomain/mediatek/mt8365-pm-domains.h b/drivers/pmdomain/mediatek/mt8365-pm-domains.h index 3d83d49eaa7c8d..6fbd5ef8d67253 100644 --- a/drivers/pmdomain/mediatek/mt8365-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8365-pm-domains.h @@ -29,11 +29,9 @@ MT8365_SMI_COMMON_CLAMP_EN) #define MT8365_BUS_PROT_WAY_EN(_set_mask, _set, _sta_mask, _sta) \ - _BUS_PROT(_set_mask, _set, _set, _sta_mask, _sta, \ - BUS_PROT_COMPONENT_INFRA | \ - BUS_PROT_STA_COMPONENT_INFRA_NAO | \ - BUS_PROT_INVERTED | \ - BUS_PROT_REG_UPDATE) + _BUS_PROT_STA(INFRA, INFRA_NAO, _set_mask, _set, _set, \ + _sta_mask, _sta, \ + BUS_PROT_INVERTED | BUS_PROT_REG_UPDATE) static const struct scpsys_domain_data scpsys_domain_data_mt8365[] = { [MT8365_POWER_DOMAIN_MM] = { diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c index a58ed7e2d9a479..48dc5f18843820 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.c +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c @@ -47,9 +47,6 @@ struct scpsys_domain { struct clk_bulk_data *clks; int num_subsys_clks; struct clk_bulk_data *subsys_clks; - struct regmap *infracfg_nao; - struct regmap *infracfg; - struct regmap *smi; struct regulator *supply; }; @@ -57,6 +54,8 @@ struct scpsys { struct device *dev; struct regmap *base; const struct scpsys_soc_data *soc_data; + u8 bus_prot_index[BUS_PROT_BLOCK_COUNT]; + struct regmap **bus_prot; struct genpd_onecell_data pd_data; struct generic_pm_domain *domains[]; }; @@ -125,19 +124,19 @@ static int scpsys_sram_disable(struct scpsys_domain *pd) static struct regmap *scpsys_bus_protect_get_regmap(struct scpsys_domain *pd, const struct scpsys_bus_prot_data *bpd) { - if (bpd->flags & BUS_PROT_COMPONENT_SMI) - return pd->smi; - else - return pd->infracfg; + struct scpsys *scpsys = pd->scpsys; + unsigned short block_idx = scpsys->bus_prot_index[bpd->bus_prot_block]; + + return scpsys->bus_prot[block_idx]; } static struct regmap *scpsys_bus_protect_get_sta_regmap(struct scpsys_domain *pd, const struct scpsys_bus_prot_data *bpd) { - if (bpd->flags & BUS_PROT_STA_COMPONENT_INFRA_NAO) - return pd->infracfg_nao; - else - return scpsys_bus_protect_get_regmap(pd, bpd); + struct scpsys *scpsys = pd->scpsys; + int block_idx = scpsys->bus_prot_index[bpd->bus_prot_sta_block]; + + return scpsys->bus_prot[block_idx]; } static int scpsys_bus_protect_clear(struct scpsys_domain *pd, @@ -149,7 +148,7 @@ static int scpsys_bus_protect_clear(struct scpsys_domain *pd, u32 expected_ack; u32 val; - expected_ack = (bpd->flags & BUS_PROT_STA_COMPONENT_INFRA_NAO ? sta_mask : 0); + expected_ack = (bpd->bus_prot_sta_block == BUS_PROT_BLOCK_INFRA_NAO ? 
sta_mask : 0); if (bpd->flags & BUS_PROT_REG_UPDATE) regmap_clear_bits(regmap, bpd->bus_prot_clr, bpd->bus_prot_set_clr_mask); @@ -355,7 +354,6 @@ generic_pm_domain *scpsys_add_one_domain(struct scpsys *scpsys, struct device_no { const struct scpsys_domain_data *domain_data; struct scpsys_domain *pd; - struct device_node *smi_node; struct property *prop; const char *clk_name; int i, ret, num_clks; @@ -396,32 +394,6 @@ generic_pm_domain *scpsys_add_one_domain(struct scpsys *scpsys, struct device_no node); } - pd->infracfg = syscon_regmap_lookup_by_phandle_optional(node, "mediatek,infracfg"); - if (IS_ERR(pd->infracfg)) - return dev_err_cast_probe(scpsys->dev, pd->infracfg, - "%pOF: failed to get infracfg regmap\n", - node); - - smi_node = of_parse_phandle(node, "mediatek,smi", 0); - if (smi_node) { - pd->smi = device_node_to_regmap(smi_node); - of_node_put(smi_node); - if (IS_ERR(pd->smi)) - return dev_err_cast_probe(scpsys->dev, pd->smi, - "%pOF: failed to get SMI regmap\n", - node); - } - - if (MTK_SCPD_CAPS(pd, MTK_SCPD_HAS_INFRA_NAO)) { - pd->infracfg_nao = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg-nao"); - if (IS_ERR(pd->infracfg_nao)) - return dev_err_cast_probe(scpsys->dev, pd->infracfg_nao, - "%pOF: failed to get infracfg-nao regmap\n", - node); - } else { - pd->infracfg_nao = NULL; - } - num_clks = of_clk_get_parent_count(node); if (num_clks > 0) { /* Calculate number of subsys_clks */ @@ -615,6 +587,136 @@ static void scpsys_domain_cleanup(struct scpsys *scpsys) } } +static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *scpsys) +{ + const u8 bp_blocks[3] = { + BUS_PROT_BLOCK_INFRA, BUS_PROT_BLOCK_SMI, BUS_PROT_BLOCK_INFRA_NAO + }; + struct device_node *np = dev->of_node; + struct device_node *node, *smi_np; + int num_regmaps = 0, i, j; + struct regmap *regmap[3]; + + /* + * Legacy code retrieves a maximum of three bus protection handles: + * some may be optional, or may not be, so the array of bp blocks + * that is normally passed in as platform data must be dynamically + * built in this case. + * + * Here, try to retrieve all of the regmaps that the legacy code + * supported and then count the number of the ones that are present, + * this makes it then possible to allocate the array of bus_prot + * regmaps and convert all to the new style handling. 
+ */ + node = of_find_node_with_property(np, "mediatek,infracfg"); + if (node) { + regmap[0] = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg"); + of_node_put(node); + num_regmaps++; + if (IS_ERR(regmap[0])) + return dev_err_probe(dev, PTR_ERR(regmap[0]), + "%pOF: failed to get infracfg regmap\n", + node); + } else { + regmap[0] = NULL; + } + + node = of_find_node_with_property(np, "mediatek,smi"); + if (node) { + smi_np = of_parse_phandle(node, "mediatek,smi", 0); + of_node_put(node); + if (!smi_np) + return -ENODEV; + + regmap[1] = device_node_to_regmap(smi_np); + num_regmaps++; + of_node_put(smi_np); + if (IS_ERR(regmap[1])) + return dev_err_probe(dev, PTR_ERR(regmap[1]), + "%pOF: failed to get SMI regmap\n", + node); + } else { + regmap[1] = NULL; + } + + node = of_find_node_with_property(np, "mediatek,infracfg-nao"); + if (node) { + regmap[2] = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg-nao"); + num_regmaps++; + of_node_put(node); + if (IS_ERR(regmap[2])) + return dev_err_probe(dev, PTR_ERR(regmap[2]), + "%pOF: failed to get infracfg regmap\n", + node); + } else { + regmap[2] = NULL; + } + + scpsys->bus_prot = devm_kmalloc_array(dev, num_regmaps, + sizeof(*scpsys->bus_prot), GFP_KERNEL); + if (!scpsys->bus_prot) + return -ENOMEM; + + for (i = 0, j = 0; i < ARRAY_SIZE(bp_blocks); i++) { + enum scpsys_bus_prot_block bp_type; + + if (!regmap[i]) + continue; + + bp_type = bp_blocks[i]; + scpsys->bus_prot_index[bp_type] = j; + scpsys->bus_prot[j] = regmap[i]; + + j++; + } + + return 0; +} + +static int scpsys_get_bus_protection(struct device *dev, struct scpsys *scpsys) +{ + const struct scpsys_soc_data *soc = scpsys->soc_data; + struct device_node *np = dev->of_node; + int i, num_handles; + + num_handles = of_count_phandle_with_args(np, "access-controllers", NULL); + if (num_handles < 0 || num_handles != soc->num_bus_prot_blocks) + return dev_err_probe(dev, -EINVAL, + "Cannot get access controllers: expected %u, got %d\n", + soc->num_bus_prot_blocks, num_handles); + + scpsys->bus_prot = devm_kmalloc_array(dev, soc->num_bus_prot_blocks, + sizeof(*scpsys->bus_prot), GFP_KERNEL); + if (!scpsys->bus_prot) + return -ENOMEM; + + for (i = 0; i < soc->num_bus_prot_blocks; i++) { + enum scpsys_bus_prot_block bp_type; + struct device_node *node; + + node = of_parse_phandle(np, "access-controllers", i); + if (!node) + return -EINVAL; + + /* + * Index the bus protection regmaps so that we don't have to + * find the right one by type with a loop at every execution + * of power sequence(s). + */ + bp_type = soc->bus_prot_blocks[i]; + scpsys->bus_prot_index[bp_type] = i; + + scpsys->bus_prot[i] = device_node_to_regmap(node); + of_node_put(node); + if (IS_ERR_OR_NULL(scpsys->bus_prot[i])) + return dev_err_probe(dev, scpsys->bus_prot[i] ? 
+ PTR_ERR(scpsys->bus_prot[i]) : -ENXIO, + "Cannot get regmap for access controller %d\n", i); + } + + return 0; +} + static const struct of_device_id scpsys_of_match[] = { { .compatible = "mediatek,mt6735-power-controller", @@ -701,6 +803,14 @@ static int scpsys_probe(struct platform_device *pdev) return PTR_ERR(scpsys->base); } + if (of_find_property(np, "access-controllers", NULL)) + ret = scpsys_get_bus_protection(dev, scpsys); + else + ret = scpsys_get_bus_protection_legacy(dev, scpsys); + + if (ret) + return ret; + ret = -ENODEV; for_each_available_child_of_node(np, node) { struct generic_pm_domain *domain; diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.h b/drivers/pmdomain/mediatek/mtk-pm-domains.h index 7085fa2976e98b..4f2d331a866ab0 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.h +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.h @@ -50,30 +50,43 @@ enum scpsys_bus_prot_flags { BUS_PROT_REG_UPDATE = BIT(1), BUS_PROT_IGNORE_CLR_ACK = BIT(2), BUS_PROT_INVERTED = BIT(3), - BUS_PROT_COMPONENT_INFRA = BIT(4), - BUS_PROT_COMPONENT_SMI = BIT(5), - BUS_PROT_STA_COMPONENT_INFRA_NAO = BIT(6), }; -#define _BUS_PROT(_set_clr_mask, _set, _clr, _sta_mask, _sta, _flags) { \ - .bus_prot_set_clr_mask = (_set_clr_mask), \ - .bus_prot_set = _set, \ - .bus_prot_clr = _clr, \ - .bus_prot_sta_mask = (_sta_mask), \ - .bus_prot_sta = _sta, \ - .flags = _flags \ +enum scpsys_bus_prot_block { + BUS_PROT_BLOCK_INFRA, + BUS_PROT_BLOCK_INFRA_NAO, + BUS_PROT_BLOCK_SMI, + BUS_PROT_BLOCK_COUNT, +}; + +#define _BUS_PROT_STA(_hwip, _sta_hwip, _set_clr_mask, _set, _clr, \ + _sta_mask, _sta, _flags) \ + { \ + .bus_prot_block = BUS_PROT_BLOCK_##_hwip, \ + .bus_prot_sta_block = BUS_PROT_BLOCK_##_sta_hwip, \ + .bus_prot_set_clr_mask = (_set_clr_mask), \ + .bus_prot_set = _set, \ + .bus_prot_clr = _clr, \ + .bus_prot_sta_mask = (_sta_mask), \ + .bus_prot_sta = _sta, \ + .flags = _flags \ } -#define BUS_PROT_WR(_hwip, _mask, _set, _clr, _sta) \ - _BUS_PROT(_mask, _set, _clr, _mask, _sta, BUS_PROT_COMPONENT_##_hwip) +#define _BUS_PROT(_hwip, _set_clr_mask, _set, _clr, _sta_mask, \ + _sta, _flags) \ + _BUS_PROT_STA(_hwip, _hwip, _set_clr_mask, _set, _clr, \ + _sta_mask, _sta, _flags) + +#define BUS_PROT_WR(_hwip, _mask, _set, _clr, _sta) \ + _BUS_PROT(_hwip, _mask, _set, _clr, _mask, _sta, 0) -#define BUS_PROT_WR_IGN(_hwip, _mask, _set, _clr, _sta) \ - _BUS_PROT(_mask, _set, _clr, _mask, _sta, \ - BUS_PROT_COMPONENT_##_hwip | BUS_PROT_IGNORE_CLR_ACK) +#define BUS_PROT_WR_IGN(_hwip, _mask, _set, _clr, _sta) \ + _BUS_PROT(_hwip, _mask, _set, _clr, _mask, _sta, \ + BUS_PROT_IGNORE_CLR_ACK) -#define BUS_PROT_UPDATE(_hwip, _mask, _set, _clr, _sta) \ - _BUS_PROT(_mask, _set, _clr, _mask, _sta, \ - BUS_PROT_COMPONENT_##_hwip | BUS_PROT_REG_UPDATE) +#define BUS_PROT_UPDATE(_hwip, _mask, _set, _clr, _sta) \ + _BUS_PROT(_hwip, _mask, _set, _clr, _mask, _sta, \ + BUS_PROT_REG_UPDATE) #define BUS_PROT_INFRA_UPDATE_TOPAXI(_mask) \ BUS_PROT_UPDATE(INFRA, _mask, \ @@ -82,6 +95,8 @@ enum scpsys_bus_prot_flags { INFRA_TOPAXI_PROTECTSTA1) struct scpsys_bus_prot_data { + u8 bus_prot_block; + u8 bus_prot_sta_block; u32 bus_prot_set_clr_mask; u32 bus_prot_set; u32 bus_prot_clr; @@ -119,6 +134,8 @@ struct scpsys_domain_data { struct scpsys_soc_data { const struct scpsys_domain_data *domains_data; int num_domains; + enum scpsys_bus_prot_block *bus_prot_blocks; + int num_bus_prot_blocks; }; #endif /* __SOC_MEDIATEK_MTK_PM_DOMAINS_H */ From ad4bbdc59b75edf7f3bda836136a1f2bace427b8 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del 
Regno Date: Tue, 5 Aug 2025 09:47:41 +0200 Subject: [PATCH 0284/3206] pmdomain: mediatek: Handle SoCs with inverted SRAM power-down bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some SoCs, and even some subsystems in the same SoC, may have the logic for SRAM power-down inverted, as in, setting the bit means "power down" and unsetting means "power up": this is because some hardware subsystems use this as a power-lock indication and some use this as a power down one (for example, usually, the modem ss has it inverted!). In preparation for adding support for power domains with inverted SRAM_PDN bits, add a new MTK_SCPD_SRAM_PDN_INVERTED flag and check for it in scpsys_sram_enable() and scpsys_sram_disable(). Reviewed-by: Nícolas F. R. A. Prado Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250805074746.29457-6-angelogioacchino.delregno@collabora.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/mediatek/mtk-pm-domains.c | 27 ++++++++++++++++------ drivers/pmdomain/mediatek/mtk-pm-domains.h | 1 + 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c index 48dc5f18843820..6118a389244a88 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.c +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c @@ -79,16 +79,23 @@ static bool scpsys_domain_is_on(struct scpsys_domain *pd) static int scpsys_sram_enable(struct scpsys_domain *pd) { - u32 pdn_ack = pd->data->sram_pdn_ack_bits; + u32 expected_ack, pdn_ack = pd->data->sram_pdn_ack_bits; struct scpsys *scpsys = pd->scpsys; unsigned int tmp; int ret; - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + if (MTK_SCPD_CAPS(pd, MTK_SCPD_SRAM_PDN_INVERTED)) { + regmap_set_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + expected_ack = pdn_ack; + } else { + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + expected_ack = 0; + } /* Either wait until SRAM_PDN_ACK all 1 or 0 */ ret = regmap_read_poll_timeout(scpsys->base, pd->data->ctl_offs, tmp, - (tmp & pdn_ack) == 0, MTK_POLL_DELAY_US, MTK_POLL_TIMEOUT); + (tmp & pdn_ack) == expected_ack, + MTK_POLL_DELAY_US, MTK_POLL_TIMEOUT); if (ret < 0) return ret; @@ -103,7 +110,7 @@ static int scpsys_sram_enable(struct scpsys_domain *pd) static int scpsys_sram_disable(struct scpsys_domain *pd) { - u32 pdn_ack = pd->data->sram_pdn_ack_bits; + u32 expected_ack, pdn_ack = pd->data->sram_pdn_ack_bits; struct scpsys *scpsys = pd->scpsys; unsigned int tmp; @@ -113,12 +120,18 @@ static int scpsys_sram_disable(struct scpsys_domain *pd) regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_SRAM_ISOINT_B_BIT); } - regmap_set_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + if (MTK_SCPD_CAPS(pd, MTK_SCPD_SRAM_PDN_INVERTED)) { + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + expected_ack = 0; + } else { + regmap_set_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + expected_ack = pdn_ack; + } /* Either wait until SRAM_PDN_ACK all 1 or 0 */ return regmap_read_poll_timeout(scpsys->base, pd->data->ctl_offs, tmp, - (tmp & pdn_ack) == pdn_ack, MTK_POLL_DELAY_US, - MTK_POLL_TIMEOUT); + (tmp & pdn_ack) == expected_ack, + MTK_POLL_DELAY_US, MTK_POLL_TIMEOUT); } static struct regmap *scpsys_bus_protect_get_regmap(struct scpsys_domain *pd, diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.h 
b/drivers/pmdomain/mediatek/mtk-pm-domains.h index 4f2d331a866ab0..fbbfb23a873968 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.h +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.h @@ -13,6 +13,7 @@ #define MTK_SCPD_EXT_BUCK_ISO BIT(6) #define MTK_SCPD_HAS_INFRA_NAO BIT(7) #define MTK_SCPD_STRICT_BUS_PROTECTION BIT(8) +#define MTK_SCPD_SRAM_PDN_INVERTED BIT(9) #define MTK_SCPD_CAPS(_scpd, _x) ((_scpd)->data->caps & (_x)) #define SPM_VDE_PWR_CON 0x0210 From 0e8e6b5f6a31a3d001ab83f7af9394fccd4bb569 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 5 Aug 2025 09:47:42 +0200 Subject: [PATCH 0285/3206] pmdomain: mediatek: Move ctl sequences out of power_on/off functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for supporting the power domains of new SoCs and the modem power domains for both new and already supported chips, move the generic control power sequences out of scpsys_power_on() and scpsys_power_off() and put them in the new scpsys_ctl_pwrseq_on() and scpsys_ctl_pwrseq_off() functions. Reviewed-by: Nícolas F. R. A. Prado Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250805074746.29457-7-angelogioacchino.delregno@collabora.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/mediatek/mtk-pm-domains.c | 57 ++++++++++++++-------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c index 6118a389244a88..d84f0e7cde12f2 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.c +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c @@ -244,11 +244,45 @@ static int scpsys_regulator_disable(struct regulator *supply) return supply ? regulator_disable(supply) : 0; } +static int scpsys_ctl_pwrseq_on(struct scpsys_domain *pd) +{ + struct scpsys *scpsys = pd->scpsys; + bool tmp; + int ret; + + /* subsys power on */ + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_2ND_BIT); + + /* wait until PWR_ACK = 1 */ + ret = readx_poll_timeout(scpsys_domain_is_on, pd, tmp, tmp, MTK_POLL_DELAY_US, + MTK_POLL_TIMEOUT); + if (ret < 0) + return ret; + + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_CLK_DIS_BIT); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ISO_BIT); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); + + return 0; +} + +static void scpsys_ctl_pwrseq_off(struct scpsys_domain *pd) +{ + struct scpsys *scpsys = pd->scpsys; + + /* subsys power off */ + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ISO_BIT); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_CLK_DIS_BIT); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_2ND_BIT); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); +} + static int scpsys_power_on(struct generic_pm_domain *genpd) { struct scpsys_domain *pd = container_of(genpd, struct scpsys_domain, genpd); struct scpsys *scpsys = pd->scpsys; - bool tmp; int ret; ret = scpsys_regulator_enable(pd->supply); @@ -263,20 +297,10 @@ static int scpsys_power_on(struct generic_pm_domain *genpd) regmap_clear_bits(scpsys->base, pd->data->ext_buck_iso_offs, pd->data->ext_buck_iso_mask); - /* subsys power on */ - regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); - regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_2ND_BIT); - - /* wait until PWR_ACK = 1 */ - ret =
readx_poll_timeout(scpsys_domain_is_on, pd, tmp, tmp, MTK_POLL_DELAY_US, - MTK_POLL_TIMEOUT); - if (ret < 0) + ret = scpsys_ctl_pwrseq_on(pd); + if (ret) goto err_pwr_ack; - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_CLK_DIS_BIT); - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ISO_BIT); - regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); - /* * In few Mediatek platforms(e.g. MT6779), the bus protect policy is * stricter, which leads to bus protect release must be prior to bus @@ -342,12 +366,7 @@ static int scpsys_power_off(struct generic_pm_domain *genpd) clk_bulk_disable_unprepare(pd->num_subsys_clks, pd->subsys_clks); - /* subsys power off */ - regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ISO_BIT); - regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_CLK_DIS_BIT); - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_2ND_BIT); - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); + scpsys_ctl_pwrseq_off(pd); /* wait until PWR_ACK = 0 */ ret = readx_poll_timeout(scpsys_domain_is_on, pd, tmp, !tmp, MTK_POLL_DELAY_US, From 16d861d2bce8b1d28b6d94ffbfcdaa9cf833542b Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 5 Aug 2025 09:47:43 +0200 Subject: [PATCH 0286/3206] pmdomain: mediatek: Add support for modem power sequences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the modem power domains by adding its specific power sequence in functions scpsys_modem_pwrseq_{on,off}() and call them if the flag MTK_SCPD_MODEM_PWRSEQ is present. While at it, since some SoC models need to skip setting/clearing the PWR_RST_B_BIT, also add a MTK_SCPD_SKIP_RESET_B flag for that. Reviewed-by: Nícolas F. R. A. 
Prado Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250805074746.29457-8-angelogioacchino.delregno@collabora.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/mediatek/mtk-pm-domains.c | 41 ++++++++++++++++++++-- drivers/pmdomain/mediatek/mtk-pm-domains.h | 2 ++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c index d84f0e7cde12f2..cf749ba5c3c7aa 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.c +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c @@ -279,6 +279,36 @@ static void scpsys_ctl_pwrseq_off(struct scpsys_domain *pd) regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); } +static int scpsys_modem_pwrseq_on(struct scpsys_domain *pd) +{ + struct scpsys *scpsys = pd->scpsys; + bool tmp; + int ret; + + if (!MTK_SCPD_CAPS(pd, MTK_SCPD_SKIP_RESET_B)) + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); + + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); + + /* wait until PWR_ACK = 1 */ + ret = readx_poll_timeout(scpsys_domain_is_on, pd, tmp, tmp, MTK_POLL_DELAY_US, + MTK_POLL_TIMEOUT); + if (ret < 0) + return ret; + + return 0; +} + +static void scpsys_modem_pwrseq_off(struct scpsys_domain *pd) +{ + struct scpsys *scpsys = pd->scpsys; + + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); + + if (!MTK_SCPD_CAPS(pd, MTK_SCPD_SKIP_RESET_B)) + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); +} + static int scpsys_power_on(struct generic_pm_domain *genpd) { struct scpsys_domain *pd = container_of(genpd, struct scpsys_domain, genpd); @@ -297,7 +327,11 @@ static int scpsys_power_on(struct generic_pm_domain *genpd) regmap_clear_bits(scpsys->base, pd->data->ext_buck_iso_offs, pd->data->ext_buck_iso_mask); - ret = scpsys_ctl_pwrseq_on(pd); + if (MTK_SCPD_CAPS(pd, MTK_SCPD_MODEM_PWRSEQ)) + ret = scpsys_modem_pwrseq_on(pd); + else + ret = scpsys_ctl_pwrseq_on(pd); + if (ret) goto err_pwr_ack; @@ -366,7 +400,10 @@ static int scpsys_power_off(struct generic_pm_domain *genpd) clk_bulk_disable_unprepare(pd->num_subsys_clks, pd->subsys_clks); - scpsys_ctl_pwrseq_off(pd); + if (MTK_SCPD_CAPS(pd, MTK_SCPD_MODEM_PWRSEQ)) + scpsys_modem_pwrseq_off(pd); + else + scpsys_ctl_pwrseq_off(pd); /* wait until PWR_ACK = 0 */ ret = readx_poll_timeout(scpsys_domain_is_on, pd, tmp, !tmp, MTK_POLL_DELAY_US, diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.h b/drivers/pmdomain/mediatek/mtk-pm-domains.h index fbbfb23a873968..931a54f1c5caf1 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.h +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.h @@ -14,6 +14,8 @@ #define MTK_SCPD_HAS_INFRA_NAO BIT(7) #define MTK_SCPD_STRICT_BUS_PROTECTION BIT(8) #define MTK_SCPD_SRAM_PDN_INVERTED BIT(9) +#define MTK_SCPD_MODEM_PWRSEQ BIT(10) +#define MTK_SCPD_SKIP_RESET_B BIT(11) #define MTK_SCPD_CAPS(_scpd, _x) ((_scpd)->data->caps & (_x)) #define SPM_VDE_PWR_CON 0x0210 From 9d02c94342b35a94c4f1feecf94aa68e1565e6af Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 5 Aug 2025 09:47:44 +0200 Subject: [PATCH 0287/3206] pmdomain: mediatek: Add support for RTFF Hardware in MT8196/MT6991 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New generation SoCs use a new RTFF Hardware to save power during operation of various IPs, other than managing isolation of the internal buck converters during powerup/down of power domains. 
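As a rough sketch of the idea (bit names as introduced by the hunks below; ordering simplified, generic flavour only), power-off asks the RTFF cells to save their state and leaves a marker behind, while power-on replays the restore only when that marker is present:

	/* power off: park the domain state in the RTFF cells */
	regmap_set_bits(base, ctl_offs, PWR_RTFF_CLK_DIS);
	regmap_set_bits(base, ctl_offs, PWR_RTFF_SAVE);
	regmap_clear_bits(base, ctl_offs, PWR_RTFF_SAVE);
	regmap_clear_bits(base, ctl_offs, PWR_RTFF_CLK_DIS);
	regmap_set_bits(base, ctl_offs, PWR_RTFF_SAVE_FLAG);

	/* power on: a set PWR_RTFF_SAVE_FLAG means there is state to restore */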
Since some of the power domains need different RTFF handling, add a new scpsys_rtff_type enumeration and hold the value for each power domain in struct scpsys_domain_data. If RTFF HW is available, the additional RTFF power sequences are handled in scpsys_ctl_pwrseq_{on,off}(). Reviewed-by: Nícolas F. R. A. Prado Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250805074746.29457-9-angelogioacchino.delregno@collabora.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/mediatek/mtk-pm-domains.c | 94 +++++++++++++++++++++- drivers/pmdomain/mediatek/mtk-pm-domains.h | 18 +++++ 2 files changed, 111 insertions(+), 1 deletion(-) diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c index cf749ba5c3c7aa..0ebe7379b94e50 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.c +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c @@ -39,6 +39,12 @@ #define PWR_SRAM_CLKISO_BIT BIT(5) #define PWR_SRAM_ISOINT_B_BIT BIT(6) +#define PWR_RTFF_SAVE BIT(24) +#define PWR_RTFF_NRESTORE BIT(25) +#define PWR_RTFF_CLK_DIS BIT(26) +#define PWR_RTFF_SAVE_FLAG BIT(27) +#define PWR_RTFF_UFS_CLK_DIS BIT(28) + struct scpsys_domain { struct generic_pm_domain genpd; const struct scpsys_domain_data *data; @@ -247,7 +253,7 @@ static int scpsys_regulator_disable(struct regulator *supply) static int scpsys_ctl_pwrseq_on(struct scpsys_domain *pd) { struct scpsys *scpsys = pd->scpsys; - bool tmp; + bool do_rtff_nrestore, tmp; int ret; /* subsys power on */ @@ -260,10 +266,72 @@ static int scpsys_ctl_pwrseq_on(struct scpsys_domain *pd) if (ret < 0) return ret; + if (pd->data->rtff_type == SCPSYS_RTFF_TYPE_PCIE_PHY) + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_CLK_DIS_BIT); regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ISO_BIT); + + /* Wait for RTFF HW to sync buck isolation state if this is PCIe PHY RTFF */ + if (pd->data->rtff_type == SCPSYS_RTFF_TYPE_PCIE_PHY) + udelay(5); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); + /* + * RTFF HW state may be modified by secure world or remote processors. + * + * With the only exception of STOR_UFS, which always needs save/restore, + * check if this power domain's RTFF is already on before trying to do + * the NRESTORE procedure, otherwise the system will lock up.
+ */ + switch (pd->data->rtff_type) { + case SCPSYS_RTFF_TYPE_GENERIC: + case SCPSYS_RTFF_TYPE_PCIE_PHY: + { + u32 ctl_status; + + regmap_read(scpsys->base, pd->data->ctl_offs, &ctl_status); + do_rtff_nrestore = ctl_status & PWR_RTFF_SAVE_FLAG; + break; + } + case SCPSYS_RTFF_TYPE_STOR_UFS: + /* STOR_UFS always needs NRESTORE */ + do_rtff_nrestore = true; + break; + default: + do_rtff_nrestore = false; + break; + } + + /* Return early if RTFF NRESTORE shall not be done */ + if (!do_rtff_nrestore) + return 0; + + switch (pd->data->rtff_type) { + case SCPSYS_RTFF_TYPE_GENERIC: + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE_FLAG); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + break; + case SCPSYS_RTFF_TYPE_PCIE_PHY: + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE_FLAG); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + break; + case SCPSYS_RTFF_TYPE_STOR_UFS: + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_UFS_CLK_DIS); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_UFS_CLK_DIS); + break; + default: + break; + } + return 0; } @@ -271,8 +339,32 @@ static void scpsys_ctl_pwrseq_off(struct scpsys_domain *pd) { struct scpsys *scpsys = pd->scpsys; + switch (pd->data->rtff_type) { + case SCPSYS_RTFF_TYPE_GENERIC: + case SCPSYS_RTFF_TYPE_PCIE_PHY: + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE_FLAG); + break; + case SCPSYS_RTFF_TYPE_STOR_UFS: + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_UFS_CLK_DIS); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_UFS_CLK_DIS); + break; + default: + break; + } + /* subsys power off */ regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ISO_BIT); + + /* Wait for RTFF HW to sync buck isolation state if this is PCIe PHY RTFF */ + if (pd->data->rtff_type == SCPSYS_RTFF_TYPE_PCIE_PHY) + udelay(1); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_CLK_DIS_BIT); regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_2ND_BIT); diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.h b/drivers/pmdomain/mediatek/mtk-pm-domains.h index 931a54f1c5caf1..b2e3dee0383119 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.h +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.h @@ -108,6 +108,22 @@ struct scpsys_bus_prot_data { u8 flags; }; +/** + * enum scpsys_rtff_type - Type of RTFF Hardware for power domain + * @SCPSYS_RTFF_NONE: RTFF HW not present or domain not RTFF managed + * @SCPSYS_RTFF_TYPE_GENERIC: Non-CPU, 
peripheral-generic RTFF HW + * @SCPSYS_RTFF_TYPE_PCIE_PHY: PCI-Express PHY specific RTFF HW + * @SCPSYS_RTFF_TYPE_STOR_UFS: Storage (UFS) specific RTFF HW + * @SCPSYS_RTFF_TYPE_MAX: Number of supported RTFF HW Types + */ +enum scpsys_rtff_type { + SCPSYS_RTFF_NONE = 0, + SCPSYS_RTFF_TYPE_GENERIC, + SCPSYS_RTFF_TYPE_PCIE_PHY, + SCPSYS_RTFF_TYPE_STOR_UFS, + SCPSYS_RTFF_TYPE_MAX +}; + /** * struct scpsys_domain_data - scp domain data for power on/off flow * @name: The name of the power domain. @@ -118,6 +134,7 @@ struct scpsys_bus_prot_data { * @ext_buck_iso_offs: The offset for external buck isolation * @ext_buck_iso_mask: The mask for external buck isolation * @caps: The flag for active wake-up action. + * @rtff_type: The power domain RTFF HW type * @bp_cfg: bus protection configuration for any subsystem */ struct scpsys_domain_data { @@ -129,6 +146,7 @@ struct scpsys_domain_data { int ext_buck_iso_offs; u32 ext_buck_iso_mask; u16 caps; + enum scpsys_rtff_type rtff_type; const struct scpsys_bus_prot_data bp_cfg[SPM_MAX_BUS_PROT_DATA]; int pwr_sta_offs; int pwr_sta2nd_offs; From ffeebf7587f518a3717fad308cf735adbbcaba97 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 5 Aug 2025 09:47:45 +0200 Subject: [PATCH 0288/3206] pmdomain: mediatek: Convert all SoCs to new style regmap retrieval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the bus_prot_blocks array and declare num_bus_prot_blocks to allow all of the currently supported AArch64 MediaTek SoCs to use the new style regmap retrieval in the driver when a new style devicetree declaring the access-controllers phandle(s) in the main power controller node is found. Reviewed-by: Nícolas F. R. A. Prado Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250805074746.29457-10-angelogioacchino.delregno@collabora.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/mediatek/mt6795-pm-domains.h | 5 +++++ drivers/pmdomain/mediatek/mt8167-pm-domains.h | 5 +++++ drivers/pmdomain/mediatek/mt8173-pm-domains.h | 5 +++++ drivers/pmdomain/mediatek/mt8183-pm-domains.h | 5 +++++ drivers/pmdomain/mediatek/mt8186-pm-domains.h | 5 +++++ drivers/pmdomain/mediatek/mt8188-pm-domains.h | 6 ++++++ drivers/pmdomain/mediatek/mt8192-pm-domains.h | 5 +++++ drivers/pmdomain/mediatek/mt8195-pm-domains.h | 5 +++++ drivers/pmdomain/mediatek/mt8365-pm-domains.h | 6 ++++++ 9 files changed, 47 insertions(+) diff --git a/drivers/pmdomain/mediatek/mt6795-pm-domains.h b/drivers/pmdomain/mediatek/mt6795-pm-domains.h index a3f7785b04bd38..dc8e9f8877addd 100644 --- a/drivers/pmdomain/mediatek/mt6795-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt6795-pm-domains.h @@ -9,6 +9,9 @@ /* * MT6795 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt6795[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt6795[] = { [MT6795_POWER_DOMAIN_VDEC] = { @@ -107,6 +110,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt6795[] = { static const struct scpsys_soc_data mt6795_scpsys_data = { .domains_data = scpsys_domain_data_mt6795, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt6795), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt6795, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt6795), }; #endif /* __SOC_MEDIATEK_MT6795_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8167-pm-domains.h b/drivers/pmdomain/mediatek/mt8167-pm-domains.h index 8a0e898b79ab8b..f6ee48a711a16a 100644 ---
a/drivers/pmdomain/mediatek/mt8167-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8167-pm-domains.h @@ -12,6 +12,9 @@ /* * MT8167 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8167[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt8167[] = { [MT8167_POWER_DOMAIN_MM] = { @@ -99,6 +102,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8167[] = { static const struct scpsys_soc_data mt8167_scpsys_data = { .domains_data = scpsys_domain_data_mt8167, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8167), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8167, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8167), }; #endif /* __SOC_MEDIATEK_MT8167_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8173-pm-domains.h b/drivers/pmdomain/mediatek/mt8173-pm-domains.h index 7be0f47f521404..561a644b5d1cb2 100644 --- a/drivers/pmdomain/mediatek/mt8173-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8173-pm-domains.h @@ -9,6 +9,9 @@ /* * MT8173 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8173[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt8173[] = { [MT8173_POWER_DOMAIN_VDEC] = { @@ -118,6 +121,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8173[] = { static const struct scpsys_soc_data mt8173_scpsys_data = { .domains_data = scpsys_domain_data_mt8173, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8173), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8173, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8173), }; #endif /* __SOC_MEDIATEK_MT8173_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8183-pm-domains.h b/drivers/pmdomain/mediatek/mt8183-pm-domains.h index c4c1b63d85b194..3742782a2702e4 100644 --- a/drivers/pmdomain/mediatek/mt8183-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8183-pm-domains.h @@ -9,6 +9,9 @@ /* * MT8183 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8183[] = { + BUS_PROT_BLOCK_INFRA, BUS_PROT_BLOCK_SMI +}; static const struct scpsys_domain_data scpsys_domain_data_mt8183[] = { [MT8183_POWER_DOMAIN_AUDIO] = { @@ -290,6 +293,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8183[] = { static const struct scpsys_soc_data mt8183_scpsys_data = { .domains_data = scpsys_domain_data_mt8183, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8183), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8183, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8183), }; #endif /* __SOC_MEDIATEK_MT8183_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8186-pm-domains.h b/drivers/pmdomain/mediatek/mt8186-pm-domains.h index cbac715c38fac2..00b9861af7c9c0 100644 --- a/drivers/pmdomain/mediatek/mt8186-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8186-pm-domains.h @@ -13,6 +13,9 @@ /* * MT8186 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8186[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt8186[] = { [MT8186_POWER_DOMAIN_MFG0] = { @@ -361,6 +364,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8186[] = { static const struct scpsys_soc_data mt8186_scpsys_data = { .domains_data = scpsys_domain_data_mt8186, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8186), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8186, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8186), }; #endif /* 
__SOC_MEDIATEK_MT8186_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8188-pm-domains.h b/drivers/pmdomain/mediatek/mt8188-pm-domains.h index 007235be9efe59..3a989e83e9b791 100644 --- a/drivers/pmdomain/mediatek/mt8188-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8188-pm-domains.h @@ -14,6 +14,10 @@ * MT8188 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8188[] = { + BUS_PROT_BLOCK_INFRA +}; + static const struct scpsys_domain_data scpsys_domain_data_mt8188[] = { [MT8188_POWER_DOMAIN_MFG0] = { .name = "mfg0", @@ -685,6 +689,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8188[] = { static const struct scpsys_soc_data mt8188_scpsys_data = { .domains_data = scpsys_domain_data_mt8188, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8188), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8188, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8188), }; #endif /* __SOC_MEDIATEK_MT8188_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8192-pm-domains.h b/drivers/pmdomain/mediatek/mt8192-pm-domains.h index 6f139eed376937..5d62fac5f68231 100644 --- a/drivers/pmdomain/mediatek/mt8192-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8192-pm-domains.h @@ -9,6 +9,9 @@ /* * MT8192 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8192[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt8192[] = { [MT8192_POWER_DOMAIN_AUDIO] = { @@ -380,6 +383,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8192[] = { static const struct scpsys_soc_data mt8192_scpsys_data = { .domains_data = scpsys_domain_data_mt8192, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8192), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8192, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8192), }; #endif /* __SOC_MEDIATEK_MT8192_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8195-pm-domains.h b/drivers/pmdomain/mediatek/mt8195-pm-domains.h index 59aa031ae6323b..9405e8f62eaf07 100644 --- a/drivers/pmdomain/mediatek/mt8195-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8195-pm-domains.h @@ -13,6 +13,9 @@ /* * MT8195 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8195[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt8195[] = { [MT8195_POWER_DOMAIN_PCIE_MAC_P0] = { @@ -661,6 +664,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8195[] = { static const struct scpsys_soc_data mt8195_scpsys_data = { .domains_data = scpsys_domain_data_mt8195, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8195), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8195, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8195), }; #endif /* __SOC_MEDIATEK_MT8195_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8365-pm-domains.h b/drivers/pmdomain/mediatek/mt8365-pm-domains.h index 6fbd5ef8d67253..33265ab8ce76f2 100644 --- a/drivers/pmdomain/mediatek/mt8365-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8365-pm-domains.h @@ -33,6 +33,10 @@ _sta_mask, _sta, \ BUS_PROT_INVERTED | BUS_PROT_REG_UPDATE) +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8365[] = { + BUS_PROT_BLOCK_INFRA, BUS_PROT_BLOCK_INFRA_NAO, BUS_PROT_BLOCK_SMI +}; + static const struct scpsys_domain_data scpsys_domain_data_mt8365[] = { [MT8365_POWER_DOMAIN_MM] = { .name = "mm", @@ -190,6 +194,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8365[] = { static const struct 
scpsys_soc_data mt8365_scpsys_data = { .domains_data = scpsys_domain_data_mt8365, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8365), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8365, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8365), }; #endif /* __SOC_MEDIATEK_MT8365_PM_DOMAINS_H */ From ec205a929257e766117bc541b8b27b7797b2a19a Mon Sep 17 00:00:00 2001 From: Da Xue Date: Thu, 14 Aug 2025 14:12:36 -0400 Subject: [PATCH 0289/3206] pinctrl: meson-g12a: add GPIOC_7 pcie_clkreqn pinmux Amlogic G12 exposes the PCIe clock request signal on GPIOC_7 pinmux func 1. Add the relevant pinmux and pin groups. Signed-off-by: Da Xue Reviewed-by: Martin Blumenstingl Link: https://lore.kernel.org/20250814181236.1956731-1-da@libre.computer Signed-off-by: Linus Walleij --- drivers/pinctrl/meson/pinctrl-meson-g12a.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/pinctrl/meson/pinctrl-meson-g12a.c b/drivers/pinctrl/meson/pinctrl-meson-g12a.c index 8b9130c6e170b9..117e72b4ffcb7b 100644 --- a/drivers/pinctrl/meson/pinctrl-meson-g12a.c +++ b/drivers/pinctrl/meson/pinctrl-meson-g12a.c @@ -442,6 +442,8 @@ static const unsigned int tdm_c_dout1_z_pins[] = { GPIOZ_3 }; static const unsigned int tdm_c_dout2_z_pins[] = { GPIOZ_4 }; static const unsigned int tdm_c_dout3_z_pins[] = { GPIOZ_5 }; +static const unsigned int pcie_clkreqn_pins[] = { GPIOC_7 }; + static const struct meson_pmx_group meson_g12a_periphs_groups[] = { GPIO_GROUP(GPIOZ_0), GPIO_GROUP(GPIOZ_1), @@ -721,6 +723,7 @@ static const struct meson_pmx_group meson_g12a_periphs_groups[] = { GROUP(pdm_din2_c, 4), GROUP(pdm_din3_c, 4), GROUP(pdm_dclk_c, 4), + GROUP(pcie_clkreqn, 1), /* bank GPIOH */ GROUP(spi1_mosi, 3), @@ -1183,6 +1186,10 @@ static const char * const tdm_c_groups[] = { "tdm_c_dout2_z", "tdm_c_dout3_z", }; +static const char * const pcie_clkreqn_groups[] = { + "pcie_clkreqn" +}; + static const char * const gpio_aobus_groups[] = { "GPIOAO_0", "GPIOAO_1", "GPIOAO_2", "GPIOAO_3", "GPIOAO_4", "GPIOAO_5", "GPIOAO_6", "GPIOAO_7", "GPIOAO_8", "GPIOAO_9", @@ -1309,6 +1316,7 @@ static const struct meson_pmx_func meson_g12a_periphs_functions[] = { FUNCTION(tdm_a), FUNCTION(tdm_b), FUNCTION(tdm_c), + FUNCTION(pcie_clkreqn), }; static const struct meson_pmx_func meson_g12a_aobus_functions[] = { From fec40f44afdabcbc4a7748e4278f30737b54bb1a Mon Sep 17 00:00:00 2001 From: Rex Chen Date: Mon, 28 Jul 2025 17:22:29 +0900 Subject: [PATCH 0290/3206] mmc: core: SPI mode remove cmd7 SPI mode doesn't support cmd7, so remove it from mmc_sdio_alive() and instead confirm that the SDIO card is alive by checking whether the CCCR register value can be read.
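For reference, the direct-I/O call used for the check, with its arguments spelled out (signature from the existing SDIO core):

	/* read (write = 0) from function 0, register SDIO_CCCR_CCCR; discard the value */
	mmc_io_rw_direct(host->card, 0, 0, SDIO_CCCR_CCCR, 0, NULL);

A zero return means the card answered, which is all the alive-check needs.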
Signed-off-by: Rex Chen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250728082230.1037917-2-rex.chen_1@nxp.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/sdio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index 0f753367aec1c1..83085e76486aa8 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -945,7 +945,11 @@ static void mmc_sdio_remove(struct mmc_host *host) */ static int mmc_sdio_alive(struct mmc_host *host) { - return mmc_select_card(host->card); + if (!mmc_host_is_spi(host)) + return mmc_select_card(host->card); + else + return mmc_io_rw_direct(host->card, 0, 0, SDIO_CCCR_CCCR, 0, + NULL); } /* From fef12d9f5bcf7e2b19a7cf1295c6abd5642dd241 Mon Sep 17 00:00:00 2001 From: Rex Chen Date: Mon, 28 Jul 2025 17:22:30 +0900 Subject: [PATCH 0291/3206] mmc: mmc_spi: multiple block read remove read crc ack For a multiple block read, the current implementation's transfer packet consists of cmd53 + cmd53 response + block_nums * (1-byte token + block-length payload + 2-byte CRC + 1-byte transfer). The trailing 1-byte transfer of every block is not needed, so remove it. Why doesn't a multiple block read need a CRC ack? For a read operation, the host side gets the payload and the CRC value, then only checks the CRC value to confirm whether the data is correct; it does not send a CRC ack to the card. If the data is correct it is saved, otherwise it is discarded and retransmitted, so the trailing 1-byte transfer of every block makes no sense. What is the side effect of this 1-byte transfer? As SPI is full duplex, if this redundant 1-byte transfer is added, the SDIO card side takes it as the token of the next block, and the sequence of all following sub-blocks is distorted. Signed-off-by: Rex Chen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250728082230.1037917-3-rex.chen_1@nxp.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/mmc_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 35b0ad273b4ff6..95a32ff29ee166 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -563,7 +563,7 @@ mmc_spi_setup_data_message(struct mmc_spi_host *host, bool multiple, bool write) * the next token (next data block, or STOP_TRAN). We can try to * minimize I/O ops by using a single read to collect end-of-busy. */ - if (multiple || write) { + if (write) { t = &host->early_status; memset(t, 0, sizeof(*t)); t->len = write ? sizeof(scratch->status) : 1; From 9c174e4dacee9fb2014a4ffc953d79a5707b77e4 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Sun, 29 Jun 2025 21:38:56 +0100 Subject: [PATCH 0292/3206] mmc: host: renesas_sdhi: Fix the actual clock The wrong actual clock is reported if the SD clock division ratio is other than 1:1 (i.e. bits DIV[7:0] in SD_CLK_CTRL are set to a value other than 11111111).
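For example (arithmetic straight from the hunk below): with DIV = 0x01, ffs() returns 1, so the fix divides actual_clock by 1 << (1 + 1) = 4; with DIV = 0x04, ffs() returns 3, giving a divisor of 1 << (3 + 1) = 16; DIV = 0xff is the 1:1 case and is left untouched.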
In high speed mode, cat /sys/kernel/debug/mmc1/ios reports the following. Without the patch: clock: 50000000 Hz actual clock: 200000000 Hz After the fix: clock: 50000000 Hz actual clock: 50000000 Hz Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20250629203859.170850-1-biju.das.jz@bp.renesas.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index fb8ca03f661d7f..a41291a28e9bdc 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -222,7 +222,11 @@ static void renesas_sdhi_set_clock(struct tmio_mmc_host *host, clk &= ~0xff; } - sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, clk & CLK_CTL_DIV_MASK); + clock = clk & CLK_CTL_DIV_MASK; + if (clock != 0xff) + host->mmc->actual_clock /= (1 << (ffs(clock) + 1)); + + sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, clock); if (!(host->pdata->flags & TMIO_MMC_MIN_RCAR2)) usleep_range(10000, 11000); From 74f44ad07d1063933c237a7db16f6a4036643d60 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 30 Jul 2025 17:46:14 +0100 Subject: [PATCH 0293/3206] mmc: tmio: Add 64-bit read/write support for SD_BUF0 in polling mode As per the RZ/{G2L,G3E} HW manual, SD_BUF0 can be accessed by 16/32/64 bits. Most data transfers in SD/SDIO/eMMC mode are longer than 8 bytes. During testing it was found that, if the DMA buffer is not aligned to 128 bits, the driver falls back to PIO mode. In such cases, 64-bit access is much more efficient than the current 16-bit. Tested-by: Wolfram Sang Reviewed-by: Wolfram Sang Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20250730164618.233117-2-biju.das.jz@bp.renesas.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc.h | 15 ++++++++++++++ drivers/mmc/host/tmio_mmc_core.c | 33 ++++++++++++++++++++++++++++++ include/linux/platform_data/tmio.h | 3 +++ 3 files changed, 51 insertions(+) diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index d730b7633ae1aa..c8cdb1c0722e7b 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -242,6 +243,20 @@ static inline void sd_ctrl_read32_rep(struct tmio_mmc_host *host, int addr, ioread32_rep(host->ctl + (addr << host->bus_shift), buf, count); } +#ifdef CONFIG_64BIT +static inline void sd_ctrl_read64_rep(struct tmio_mmc_host *host, int addr, + u64 *buf, int count) +{ + readsq(host->ctl + (addr << host->bus_shift), buf, count); +} + +static inline void sd_ctrl_write64_rep(struct tmio_mmc_host *host, int addr, + const u64 *buf, int count) +{ + writesq(host->ctl + (addr << host->bus_shift), buf, count); +} +#endif + static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val) { diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 21c2f9095baca2..775e0d9353d571 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -349,6 +349,39 @@ static void tmio_mmc_transfer_data(struct tmio_mmc_host *host, /* * Transfer the data */ +#ifdef CONFIG_64BIT + if (host->pdata->flags & TMIO_MMC_64BIT_DATA_PORT) { + u64 *buf64 = (u64 *)buf; + u64 data = 0; + + if (count >= 8) { + if (is_read) + sd_ctrl_read64_rep(host, CTL_SD_DATA_PORT, + buf64, count >> 3); + else + sd_ctrl_write64_rep(host, CTL_SD_DATA_PORT, + buf64, count >> 3); + } + + /* if count was multiple of 8 */ + if (!(count & 0x7)) + return; + + buf64 += count >> 3; +
count %= 8; + + if (is_read) { + sd_ctrl_read64_rep(host, CTL_SD_DATA_PORT, &data, 1); + memcpy(buf64, &data, count); + } else { + memcpy(&data, buf64, count); + sd_ctrl_write64_rep(host, CTL_SD_DATA_PORT, &data, 1); + } + + return; + } +#endif + if (host->pdata->flags & TMIO_MMC_32BIT_DATA_PORT) { u32 data = 0; u32 *buf32 = (u32 *)buf; diff --git a/include/linux/platform_data/tmio.h b/include/linux/platform_data/tmio.h index b060124ba1aef8..426291713b83d5 100644 --- a/include/linux/platform_data/tmio.h +++ b/include/linux/platform_data/tmio.h @@ -47,6 +47,9 @@ /* Some controllers have a CBSY bit */ #define TMIO_MMC_HAVE_CBSY BIT(11) +/* Some controllers have a 64-bit wide data port register */ +#define TMIO_MMC_64BIT_DATA_PORT BIT(12) + struct tmio_mmc_data { void *chan_priv_tx; void *chan_priv_rx; From 709fe7aa5aaf8047b528561a0082db8a3a29010c Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 30 Jul 2025 17:46:15 +0100 Subject: [PATCH 0294/3206] mmc: renesas_sdhi: Enable 64-bit polling mode Enable 64-bit polling mode for R-Car gen3 and RZ/G2L SoCs. Reviewed-by: Wolfram Sang Tested-by: Wolfram Sang Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20250730164618.233117-3-biju.das.jz@bp.renesas.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_internal_dmac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/renesas_sdhi_internal_dmac.c b/drivers/mmc/host/renesas_sdhi_internal_dmac.c index 4b389e92399e82..9e3ed0bcddd6c4 100644 --- a/drivers/mmc/host/renesas_sdhi_internal_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_internal_dmac.c @@ -107,7 +107,8 @@ static const struct renesas_sdhi_of_data of_data_rza2 = { static const struct renesas_sdhi_of_data of_data_rcar_gen3 = { .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_CLK_ACTUAL | - TMIO_MMC_HAVE_CBSY | TMIO_MMC_MIN_RCAR2, + TMIO_MMC_HAVE_CBSY | TMIO_MMC_MIN_RCAR2 | + TMIO_MMC_64BIT_DATA_PORT, .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | MMC_CAP_CMD23 | MMC_CAP_WAIT_WHILE_BUSY, .capabilities2 = MMC_CAP2_NO_WRITE_PROTECT | MMC_CAP2_MERGE_CAPABLE, From b65e630a55a490a0269ab1e4a282af975848064c Mon Sep 17 00:00:00 2001 From: Jiayi Li Date: Mon, 4 Aug 2025 10:48:25 +0800 Subject: [PATCH 0295/3206] memstick: Add timeout to prevent indefinite waiting Add timeout handling to wait_for_completion calls in memstick_set_rw_addr() and memstick_alloc_card() to prevent indefinite blocking in case of hardware or communication failures. 
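The pattern, sketched (the 500 ms value is the one used in the hunks below): wait_for_completion_timeout() returns 0 if the timeout elapsed and the remaining jiffies otherwise, so the success path is unchanged and only silence is turned into an error.

	if (!wait_for_completion_timeout(&card->mrq_complete,
					 msecs_to_jiffies(500)))
		card->current_mrq.error = -ETIMEDOUT;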
Signed-off-by: Jiayi Li Link: https://lore.kernel.org/r/20250804024825.1565078-1-lijiayi@kylinos.cn Signed-off-by: Ulf Hansson --- drivers/memstick/core/memstick.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c index 7f3f47db4c98a5..ee7a5c7ba1c520 100644 --- a/drivers/memstick/core/memstick.c +++ b/drivers/memstick/core/memstick.c @@ -370,7 +370,9 @@ int memstick_set_rw_addr(struct memstick_dev *card) { card->next_request = h_memstick_set_rw_addr; memstick_new_req(card->host); - wait_for_completion(&card->mrq_complete); + if (!wait_for_completion_timeout(&card->mrq_complete, + msecs_to_jiffies(500))) + card->current_mrq.error = -ETIMEDOUT; return card->current_mrq.error; } @@ -404,7 +406,9 @@ static struct memstick_dev *memstick_alloc_card(struct memstick_host *host) card->next_request = h_memstick_read_dev_id; memstick_new_req(host); - wait_for_completion(&card->mrq_complete); + if (!wait_for_completion_timeout(&card->mrq_complete, + msecs_to_jiffies(500))) + card->current_mrq.error = -ETIMEDOUT; if (card->current_mrq.error) goto err_out; From d2e6fb2c31a07f34e5e7533df11431cb0d2ecf9f Mon Sep 17 00:00:00 2001 From: Ricky Wu Date: Tue, 12 Aug 2025 11:08:11 +0800 Subject: [PATCH 0296/3206] misc: rtsx: usb card reader: add OCP support This patch adds support for Over Current Protection (OCP) to the Realtek USB card reader driver. The OCP mechanism protects the hardware by detecting and handling current overload conditions. This implementation includes: - Register configurations to enable OCP monitoring. - Handling of OCP interrupt events and associated error reporting. - Card power management changes in response to OCP triggers. This enhancement improves the robustness of the driver when operating in environments where electrical anomalies may occur, particularly with SD and MS card interfaces. 
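For orientation, the status decode used on the SD side, sketched from the hunks below ('val' is the status byte read in sdmmc_get_cd(); MS_OCP_NOW and MS_OCP_EVER come from the new OCPSTAT definitions):

	host->ocp_stat = (val >> 4) & 0x03;	/* latch MS_OCP_NOW | MS_OCP_EVER */
	if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) {
		/* over-current seen: fail the request, then clear the latch */
		rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_CLEAR, MS_OCP_CLEAR);
		host->ocp_stat = 0;
	}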
Signed-off-by: Ricky Wu
Link: https://lore.kernel.org/r/20250812030811.2426112-1-ricky_wu@realtek.com
Signed-off-by: Ulf Hansson
---
 drivers/memstick/host/rtsx_usb_ms.c |  5 ++++-
 drivers/misc/cardreader/rtsx_usb.c  |  7 ++++++
 drivers/mmc/host/rtsx_usb_sdmmc.c   | 33 ++++++++++++++++++++++++++---
 include/linux/rtsx_usb.h            | 11 ++++++++++
 4 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/drivers/memstick/host/rtsx_usb_ms.c b/drivers/memstick/host/rtsx_usb_ms.c
index 3878136227e49c..9389e9643c2455 100644
--- a/drivers/memstick/host/rtsx_usb_ms.c
+++ b/drivers/memstick/host/rtsx_usb_ms.c
@@ -216,7 +216,10 @@ static int ms_power_off(struct rtsx_usb_ms *host)
 	rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_CLK_EN, MS_CLK_EN, 0);
 	rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_OE, MS_OUTPUT_EN, 0);
-
+	rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL,
+			POWER_MASK, POWER_OFF);
+	rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL,
+			POWER_MASK | LDO3318_PWR_MASK, POWER_OFF | LDO_SUSPEND);
 	err = rtsx_usb_send_cmd(ucr, MODE_C, 100);
 	if (err < 0)
 		return err;
diff --git a/drivers/misc/cardreader/rtsx_usb.c b/drivers/misc/cardreader/rtsx_usb.c
index d007a4455ce5ba..1830e9ed252165 100644
--- a/drivers/misc/cardreader/rtsx_usb.c
+++ b/drivers/misc/cardreader/rtsx_usb.c
@@ -552,6 +552,10 @@ static int rtsx_usb_reset_chip(struct rtsx_ucr *ucr)
 	ret = rtsx_usb_send_cmd(ucr, MODE_C, 100);
 	if (ret)
 		return ret;
+	/* config OCP */
+	rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_DETECT_EN, MS_OCP_DETECT_EN);
+	rtsx_usb_write_register(ucr, OCPPARA1, 0xF0, 0x50);
+	rtsx_usb_write_register(ucr, OCPPARA2, 0x7, 0x3);

 	/* config non-crystal mode */
 	rtsx_usb_read_register(ucr, CFG_MODE, &val);
@@ -722,6 +726,9 @@ static int rtsx_usb_suspend(struct usb_interface *intf, pm_message_t message)
 		if (val & (SD_CD | MS_CD)) {
 			device_for_each_child(&intf->dev, NULL, rtsx_usb_resume_child);
 			return -EAGAIN;
+		} else {
+			/* if the card does not exist, clear OCP status */
+			rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_CLEAR, MS_OCP_CLEAR);
 		}
 	} else {
 		/* There is an ongoing operation*/
diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c
index c5f6b9df066b58..e1ed39c657c37c 100644
--- a/drivers/mmc/host/rtsx_usb_sdmmc.c
+++ b/drivers/mmc/host/rtsx_usb_sdmmc.c
@@ -48,7 +48,7 @@ struct rtsx_usb_sdmmc {
 	bool			ddr_mode;
 	unsigned char		power_mode;
-
+	u16			ocp_stat;
 #ifdef RTSX_USB_USE_LEDS_CLASS
 	struct led_classdev	led;
 	char			led_name[32];
@@ -785,6 +785,9 @@ static int sdmmc_get_cd(struct mmc_host *mmc)

 	mutex_unlock(&ucr->dev_mutex);

+	/* get OCP status */
+	host->ocp_stat = (val >> 4) & 0x03;
+
 	/* Treat failed detection as non-exist */
 	if (err)
 		goto no_card;
@@ -795,6 +798,11 @@ static int sdmmc_get_cd(struct mmc_host *mmc)
 	}

 no_card:
+	/* clear OCP status */
+	if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) {
+		rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_CLEAR, MS_OCP_CLEAR);
+		host->ocp_stat = 0;
+	}
 	host->card_exist = false;
 	return 0;
 }
@@ -818,7 +826,11 @@ static void sdmmc_request(struct mmc_host *mmc, struct mmc_request *mrq)
 		cmd->error = -ENOMEDIUM;
 		goto finish_detect_card;
 	}
-
+	/* check OCP stat */
+	if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) {
+		cmd->error = -ENOMEDIUM;
+		goto finish_detect_card;
+	}
 	mutex_lock(&ucr->dev_mutex);
 	mutex_lock(&host->host_mutex);

@@ -952,6 +964,10 @@ static int sd_power_on(struct rtsx_usb_sdmmc *host)
 	struct rtsx_ucr *ucr = host->ucr;
 	int err;

+	if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) {
+		dev_dbg(sdmmc_dev(host), "over current\n");
+		return -EIO;
+	}
 	dev_dbg(sdmmc_dev(host), "%s\n", __func__);
 	rtsx_usb_init_cmd(ucr);
 	rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_SELECT, 0x07, SD_MOD_SEL);
@@ -977,9 +993,19 @@ static int sd_power_on(struct rtsx_usb_sdmmc *host)
 	usleep_range(800, 1000);

+	rtsx_usb_init_cmd(ucr);
+	/* WA OCP issue: after OCP, there were problems with reopening card power */
+	rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, POWER_MASK, POWER_ON);
+	rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, FPDCTL, SSC_POWER_MASK, SSC_POWER_DOWN);
+	err = rtsx_usb_send_cmd(ucr, MODE_C, 100);
+	if (err)
+		return err;
+	msleep(20);
+	rtsx_usb_write_register(ucr, FPDCTL, SSC_POWER_MASK, SSC_POWER_ON);
+	usleep_range(180, 200);
 	rtsx_usb_init_cmd(ucr);
 	rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL,
-			POWER_MASK|LDO3318_PWR_MASK, POWER_ON|LDO_ON);
+			LDO3318_PWR_MASK, LDO_ON);
 	rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_OE, SD_OUTPUT_EN,
 			SD_OUTPUT_EN);
@@ -1332,6 +1358,7 @@ static void rtsx_usb_init_host(struct rtsx_usb_sdmmc *host)
 	mmc->max_req_size = 524288;

 	host->power_mode = MMC_POWER_OFF;
+	host->ocp_stat = 0;
 }

 static int rtsx_usb_sdmmc_drv_probe(struct platform_device *pdev)
diff --git a/include/linux/rtsx_usb.h b/include/linux/rtsx_usb.h
index f267a06c6b1e7e..276b509c03e364 100644
--- a/include/linux/rtsx_usb.h
+++ b/include/linux/rtsx_usb.h
@@ -99,6 +99,17 @@ extern int rtsx_usb_card_exclusive_check(struct rtsx_ucr *ucr, int card);
 #define CD_MASK			(SD_CD | MS_CD | XD_CD)
 #define SD_WP			0x08

+/* OCPCTL */
+#define MS_OCP_DETECT_EN	0x08
+#define MS_OCP_INT_EN		0x04
+#define MS_OCP_INT_CLR		0x02
+#define MS_OCP_CLEAR		0x01
+
+/* OCPSTAT */
+#define MS_OCP_DETECT		0x80
+#define MS_OCP_NOW		0x02
+#define MS_OCP_EVER		0x01
+
 /* reader command field offset & parameters */
 #define READ_REG_CMD		0
 #define WRITE_REG_CMD		1

From 40c3c7dec7cca43af1dd5d0b72499dac87d1c07b Mon Sep 17 00:00:00 2001
From: Xichao Zhao
Date: Tue, 12 Aug 2025 17:29:08 +0800
Subject: [PATCH 0297/3206] mmc: meson-mx-sdhc: use PTR_ERR_OR_ZERO() to
 simplify code

Use the standard error pointer macro to shorten and simplify the code.

Signed-off-by: Xichao Zhao
Reviewed-by: Martin Blumenstingl
Link: https://lore.kernel.org/r/20250812092908.101867-1-zhao.xichao@vivo.com
Signed-off-by: Ulf Hansson
---
 drivers/mmc/host/meson-mx-sdhc-clkc.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/mmc/host/meson-mx-sdhc-clkc.c b/drivers/mmc/host/meson-mx-sdhc-clkc.c
index cbd17a596cd25c..6d619bd0a8dc89 100644
--- a/drivers/mmc/host/meson-mx-sdhc-clkc.c
+++ b/drivers/mmc/host/meson-mx-sdhc-clkc.c
@@ -84,10 +84,8 @@ static int meson_mx_sdhc_gate_clk_hw_register(struct device *dev,
 		return ret;

 	clk_bulk_data[bulk_index].clk = devm_clk_hw_get_clk(dev, hw, name_suffix);
-	if (IS_ERR(clk_bulk_data[bulk_index].clk))
-		return PTR_ERR(clk_bulk_data[bulk_index].clk);

-	return 0;
+	return PTR_ERR_OR_ZERO(clk_bulk_data[bulk_index].clk);
 }

 int meson_mx_sdhc_register_clkc(struct device *dev, void __iomem *base,

From ff1bd1190e746a0ba2c6db7b84c3f677fdf1dad1 Mon Sep 17 00:00:00 2001
From: Wolfram Sang
Date: Wed, 13 Aug 2025 18:14:56 +0200
Subject: [PATCH 0298/3206] mmc: remove unneeded 'fast_io' parameter in
 regmap_config

When using MMIO with regmap, fast_io is implied. No need to set it
again.
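For context, a minimal MMIO regmap_config therefore only needs the register geometry — a sketch with hypothetical names:

	static const struct regmap_config example_mmio_regmap_config = {
		.reg_bits	= 32,
		.val_bits	= 32,
		.reg_stride	= 4,
		/* no .fast_io: the MMIO regmap bus already selects spinlock locking */
	};

	/* map = devm_regmap_init_mmio(dev, base, &example_mmio_regmap_config); */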
Signed-off-by: Wolfram Sang
Link: https://lore.kernel.org/r/20250813161517.4746-11-wsa+renesas@sang-engineering.com
Signed-off-by: Ulf Hansson
---
 drivers/mmc/host/sdhci_am654.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c
index e4fc345be7e5d3..8a099508b939c8 100644
--- a/drivers/mmc/host/sdhci_am654.c
+++ b/drivers/mmc/host/sdhci_am654.c
@@ -95,7 +95,6 @@ static const struct regmap_config sdhci_am654_regmap_config = {
 	.reg_bits = 32,
 	.val_bits = 32,
 	.reg_stride = 4,
-	.fast_io = true,
 };

 struct timing_data {

From 18f7439fa171f798f2ff8aa69611db41f658242a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Monin?=
Date: Mon, 18 Aug 2025 16:02:46 +0200
Subject: [PATCH 0299/3206] mmc: core: add mmc_card_can_cmd23
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a dedicated helper to check for CMD23 support for an MMC card,
similar to mmc_host_can_cmd23 for the host, as it is easy to get the
check wrong.

Signed-off-by: Benoît Monin
Link: https://lore.kernel.org/r/20250818-mobileye-emmc-for-upstream-4-v4-1-34ecb3995e96@bootlin.com
Signed-off-by: Ulf Hansson
---
 drivers/mmc/core/core.c | 9 +++++++++
 drivers/mmc/core/core.h | 1 +
 2 files changed, 10 insertions(+)

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 874c6fe92855e3..88fd231fee1d15 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1875,6 +1875,15 @@ bool mmc_card_can_secure_erase_trim(struct mmc_card *card)
 }
 EXPORT_SYMBOL(mmc_card_can_secure_erase_trim);

+bool mmc_card_can_cmd23(struct mmc_card *card)
+{
+	return ((mmc_card_mmc(card) &&
+		 card->csd.mmca_vsn >= CSD_SPEC_VER_3) ||
+		(mmc_card_sd(card) && !mmc_card_ult_capacity(card) &&
+		 card->scr.cmds & SD_SCR_CMD23_SUPPORT));
+}
+EXPORT_SYMBOL(mmc_card_can_cmd23);
+
 int mmc_erase_group_aligned(struct mmc_card *card, sector_t from,
 			    unsigned int nr)
 {
diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h
index 622085cd766f91..73f5d3d8c77d5d 100644
--- a/drivers/mmc/core/core.h
+++ b/drivers/mmc/core/core.h
@@ -123,6 +123,7 @@ bool mmc_card_can_trim(struct mmc_card *card);
 bool mmc_card_can_discard(struct mmc_card *card);
 bool mmc_card_can_sanitize(struct mmc_card *card);
 bool mmc_card_can_secure_erase_trim(struct mmc_card *card);
+bool mmc_card_can_cmd23(struct mmc_card *card);
 int mmc_erase_group_aligned(struct mmc_card *card, sector_t from,
 			    unsigned int nr);
 unsigned int mmc_calc_max_discard(struct mmc_card *card);

From 05849fc5d842ece65a537c7492cf8dc038d15d57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Monin?=
Date: Mon, 18 Aug 2025 16:02:47 +0200
Subject: [PATCH 0300/3206] mmc: card: add mmc_card_blk_no_cmd23
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a helper to check for the missing CMD23 quirk, similar to other
quirk helpers.

Also reorder the helpers to match the order of the quirk bits defined
in include/linux/mmc/card.h.
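The accessor being added follows the usual quirk-helper shape, e.g. (this is the helper introduced in the hunk below):

	static inline int mmc_card_blk_no_cmd23(const struct mmc_card *c)
	{
		return c->quirks & MMC_QUIRK_BLK_NO_CMD23;
	}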
Signed-off-by: Benoît Monin Link: https://lore.kernel.org/r/20250818-mobileye-emmc-for-upstream-4-v4-2-34ecb3995e96@bootlin.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/card.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h index 9cbdd240c3a7d4..1200951bab08c2 100644 --- a/drivers/mmc/core/card.h +++ b/drivers/mmc/core/card.h @@ -245,14 +245,19 @@ static inline int mmc_blksz_for_byte_mode(const struct mmc_card *c) return c->quirks & MMC_QUIRK_BLKSZ_FOR_BYTE_MODE; } +static inline int mmc_card_nonstd_func_interface(const struct mmc_card *c) +{ + return c->quirks & MMC_QUIRK_NONSTD_FUNC_IF; +} + static inline int mmc_card_disable_cd(const struct mmc_card *c) { return c->quirks & MMC_QUIRK_DISABLE_CD; } -static inline int mmc_card_nonstd_func_interface(const struct mmc_card *c) +static inline int mmc_card_blk_no_cmd23(const struct mmc_card *c) { - return c->quirks & MMC_QUIRK_NONSTD_FUNC_IF; + return c->quirks & MMC_QUIRK_BLK_NO_CMD23; } static inline int mmc_card_broken_byte_mode_512(const struct mmc_card *c) From e5af5f5478d2e8ad6edc5bccd43b7d581177fe41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Monin?= Date: Mon, 18 Aug 2025 16:02:48 +0200 Subject: [PATCH 0301/3206] mmc: mmc_test: use mmc_card cmd23 helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use mmc_card_can_cmd23 instead of using a local and partial implementation, and check for the CMD23 quirk with mmc_card_blk_no_cmd23. Signed-off-by: Benoît Monin Link: https://lore.kernel.org/r/20250818-mobileye-emmc-for-upstream-4-v4-3-34ecb3995e96@bootlin.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc_test.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c index 80e5d87a5e50be..67d4a301895c90 100644 --- a/drivers/mmc/core/mmc_test.c +++ b/drivers/mmc/core/mmc_test.c @@ -180,20 +180,14 @@ static int mmc_test_set_blksize(struct mmc_test_card *test, unsigned size) return mmc_set_blocklen(test->card, size); } -static bool mmc_test_card_cmd23(struct mmc_card *card) -{ - return mmc_card_mmc(card) || - (mmc_card_sd(card) && card->scr.cmds & SD_SCR_CMD23_SUPPORT); -} - static void mmc_test_prepare_sbc(struct mmc_test_card *test, struct mmc_request *mrq, unsigned int blocks) { struct mmc_card *card = test->card; if (!mrq->sbc || !mmc_host_can_cmd23(card->host) || - !mmc_test_card_cmd23(card) || !mmc_op_multi(mrq->cmd->opcode) || - (card->quirks & MMC_QUIRK_BLK_NO_CMD23)) { + !mmc_card_can_cmd23(card) || !mmc_op_multi(mrq->cmd->opcode) || + mmc_card_blk_no_cmd23(card)) { mrq->sbc = NULL; return; } From f33bba9b6442fdc092ecc3b1555bc4cdbb9962bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Monin?= Date: Mon, 18 Aug 2025 16:02:49 +0200 Subject: [PATCH 0302/3206] mmc: block: use mmc_card cmd23 helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the dedicated helpers for CMD23 card support. 
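Taken together with mmc_host_can_cmd23(), the host-plus-card capability test collapses to a single readable check, as the block.c hunk in the next patch shows:

	if (mmc_host_can_cmd23(card->host) && mmc_card_can_cmd23(card))
		md->flags |= MMC_BLK_CMD23;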
Signed-off-by: Benoît Monin
Link: https://lore.kernel.org/r/20250818-mobileye-emmc-for-upstream-4-v4-4-34ecb3995e96@bootlin.com
Signed-off-by: Ulf Hansson
---
 drivers/mmc/core/block.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 9cc47bf94804b6..8fd9891462054d 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -1768,8 +1768,7 @@ static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq,
 	 * these, while retaining features like reliable writes.
 	 */
 	if ((md->flags & MMC_BLK_CMD23) && mmc_op_multi(brq->cmd.opcode) &&
-	    (do_rel_wr || !(card->quirks & MMC_QUIRK_BLK_NO_CMD23) ||
-	     do_data_tag)) {
+	    (do_rel_wr || !mmc_card_blk_no_cmd23(card) || do_data_tag)) {
 		brq->sbc.opcode = MMC_SET_BLOCK_COUNT;
 		brq->sbc.arg = brq->data.blocks |
 			(do_rel_wr ? (1 << 31) : 0) |
@@ -2618,13 +2617,8 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
 	 */
 	md->read_only = mmc_blk_readonly(card);

-	if (mmc_host_can_cmd23(card->host)) {
-		if ((mmc_card_mmc(card) &&
-		     card->csd.mmca_vsn >= CSD_SPEC_VER_3) ||
-		    (mmc_card_sd(card) && !mmc_card_ult_capacity(card) &&
-		     card->scr.cmds & SD_SCR_CMD23_SUPPORT))
-			md->flags |= MMC_BLK_CMD23;
-	}
+	if (mmc_host_can_cmd23(card->host) && mmc_card_can_cmd23(card))
+		md->flags |= MMC_BLK_CMD23;

 	if (md->flags & MMC_BLK_CMD23 &&
 	    ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) ||

From 99e6cc80d5ce5af5781f84d20e4f3478d66ee8ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Monin?=
Date: Mon, 18 Aug 2025 16:02:50 +0200
Subject: [PATCH 0303/3206] mmc: core: add mmc_read_tuning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Provide a function for MMC hosts to read some blocks of data as part of
their tuning. This function only returns the status of the read
operation, not the data read.

Signed-off-by: Benoît Monin
Link: https://lore.kernel.org/r/20250818-mobileye-emmc-for-upstream-4-v4-5-34ecb3995e96@bootlin.com
Signed-off-by: Ulf Hansson
---
 drivers/mmc/core/mmc_ops.c | 72 ++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/host.h   |  1 +
 2 files changed, 73 insertions(+)

diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c
index 66283825513cb4..a952cc8265af8f 100644
--- a/drivers/mmc/core/mmc_ops.c
+++ b/drivers/mmc/core/mmc_ops.c
@@ -1077,3 +1077,75 @@ int mmc_sanitize(struct mmc_card *card, unsigned int timeout_ms)
 	return err;
 }
 EXPORT_SYMBOL_GPL(mmc_sanitize);
+
+/**
+ * mmc_read_tuning() - read data blocks from the mmc
+ * @host: mmc host doing the read
+ * @blksz: data block size
+ * @blocks: number of blocks to read
+ *
+ * Read one or more blocks of data from the beginning of the mmc. This is a
+ * low-level helper for tuning operations. It is assumed that CMD23 can be
+ * used for multi-block read if the host supports it.
+ *
+ * Note: a temporary buffer is allocated and freed to store the data read.
+ * The data is not available outside of the function; only the status of
+ * the read operation is returned.
+ * + * Return: 0 in case of success, otherwise -EIO / -ENOMEM / -E2BIG + */ +int mmc_read_tuning(struct mmc_host *host, unsigned int blksz, unsigned int blocks) +{ + struct mmc_request mrq = {}; + struct mmc_command sbc = {}; + struct mmc_command cmd = {}; + struct mmc_command stop = {}; + struct mmc_data data = {}; + struct scatterlist sg; + void *buf; + unsigned int len; + + if (blocks > 1) { + if (mmc_host_can_cmd23(host)) { + mrq.sbc = &sbc; + sbc.opcode = MMC_SET_BLOCK_COUNT; + sbc.arg = blocks; + sbc.flags = MMC_RSP_R1 | MMC_CMD_AC; + } + cmd.opcode = MMC_READ_MULTIPLE_BLOCK; + mrq.stop = &stop; + stop.opcode = MMC_STOP_TRANSMISSION; + stop.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC; + } else { + cmd.opcode = MMC_READ_SINGLE_BLOCK; + } + + mrq.cmd = &cmd; + cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC; + + mrq.data = &data; + data.flags = MMC_DATA_READ; + data.blksz = blksz; + data.blocks = blocks; + data.blk_addr = 0; + data.sg = &sg; + data.sg_len = 1; + data.timeout_ns = 1000000000; + + if (check_mul_overflow(blksz, blocks, &len)) + return -E2BIG; + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + sg_init_one(&sg, buf, len); + + mmc_wait_for_req(host, &mrq); + kfree(buf); + + if (sbc.error || cmd.error || data.error) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_GPL(mmc_read_tuning); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 68f09a955a9020..5ed5d203de23c1 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -743,5 +743,6 @@ int mmc_send_status(struct mmc_card *card, u32 *status); int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error); int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode); int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd); +int mmc_read_tuning(struct mmc_host *host, unsigned int blksz, unsigned int blocks); #endif /* LINUX_MMC_HOST_H */ From 60613a8b9b8187d789750423132ad7664ca448c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Monin?= Date: Mon, 18 Aug 2025 16:02:51 +0200 Subject: [PATCH 0304/3206] mmc: sdhci-cadence: implement multi-block read gap tuning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The controller suspends the clock between blocks when reading from the MMC as part of its flow-control, called read block gap. At higher clock speed and with IO delay between the controller and the MMC, this clock pause can happen too late, during the read of the next block and trigger a read error. To prevent this, the delay can be programmed for each mode via the pair of registers HRS37/38. This delay is obtained during tuning, by trying a multi-block read and increasing the delay until the read succeeds. For now, the tuning is only done in HS200, as the read error has only been observed at that speed. 
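In outline, the tuning added below sweeps the gap coefficient until a multi-block read succeeds (a sketch; the HRS register names and block counts come from the hunks that follow):

	writel(SDHCI_CDNS_HRS37_MODE_MMC_HS200, hrs37_reg);

	for (gap = 0; gap <= SDHCI_CDNS_HRS38_BLKGAP_MAX; gap++) {
		writel(gap, hrs38_reg);
		ret = mmc_read_tuning(mmc, 512, 32);	/* 32 blocks of 512 bytes */
		if (!ret)
			break;				/* first working gap wins */
	}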
Acked-by: Adrian Hunter
Signed-off-by: Benoît Monin
Link: https://lore.kernel.org/r/20250818-mobileye-emmc-for-upstream-4-v4-6-34ecb3995e96@bootlin.com
Signed-off-by: Ulf Hansson
---
 drivers/mmc/host/sdhci-cadence.c | 63 +++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/drivers/mmc/host/sdhci-cadence.c b/drivers/mmc/host/sdhci-cadence.c
index 2d823e158c5984..a2a4a5b0ab9648 100644
--- a/drivers/mmc/host/sdhci-cadence.c
+++ b/drivers/mmc/host/sdhci-cadence.c
@@ -36,6 +36,24 @@
 #define SDHCI_CDNS_HRS06_MODE_MMC_HS400		0x5
 #define SDHCI_CDNS_HRS06_MODE_MMC_HS400ES	0x6

+/* Read block gap */
+#define SDHCI_CDNS_HRS37			0x94 /* interface mode select */
+#define SDHCI_CDNS_HRS37_MODE_DS		0x0
+#define SDHCI_CDNS_HRS37_MODE_HS		0x1
+#define SDHCI_CDNS_HRS37_MODE_UDS_SDR12		0x8
+#define SDHCI_CDNS_HRS37_MODE_UDS_SDR25		0x9
+#define SDHCI_CDNS_HRS37_MODE_UDS_SDR50		0xa
+#define SDHCI_CDNS_HRS37_MODE_UDS_SDR104	0xb
+#define SDHCI_CDNS_HRS37_MODE_UDS_DDR50		0xc
+#define SDHCI_CDNS_HRS37_MODE_MMC_LEGACY	0x20
+#define SDHCI_CDNS_HRS37_MODE_MMC_SDR		0x21
+#define SDHCI_CDNS_HRS37_MODE_MMC_DDR		0x22
+#define SDHCI_CDNS_HRS37_MODE_MMC_HS200		0x23
+#define SDHCI_CDNS_HRS37_MODE_MMC_HS400		0x24
+#define SDHCI_CDNS_HRS37_MODE_MMC_HS400ES	0x25
+#define SDHCI_CDNS_HRS38			0x98 /* Read block gap coefficient */
+#define SDHCI_CDNS_HRS38_BLKGAP_MAX		0xf
+
 /* SRS - Slot Register Set (SDHCI-compatible) */
 #define SDHCI_CDNS_SRS_BASE	0x200

@@ -251,6 +269,44 @@ static int sdhci_cdns_set_tune_val(struct sdhci_host *host, unsigned int val)
 	return 0;
 }

+/**
+ * sdhci_cdns_tune_blkgap() - tune multi-block read gap
+ * @mmc: MMC host
+ *
+ * Tune the delay used in multi-block reads. To do so,
+ * try sending a multi-block read command with an incremented gap until
+ * it succeeds.
+ *
+ * Return: error code
+ */
+static int sdhci_cdns_tune_blkgap(struct mmc_host *mmc)
+{
+	struct sdhci_host *host = mmc_priv(mmc);
+	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+	struct sdhci_cdns_priv *priv = sdhci_pltfm_priv(pltfm_host);
+	void __iomem *hrs37_reg = priv->hrs_addr + SDHCI_CDNS_HRS37;
+	void __iomem *hrs38_reg = priv->hrs_addr + SDHCI_CDNS_HRS38;
+	int ret;
+	u32 gap;
+	u32 hrs37_mode = SDHCI_CDNS_HRS37_MODE_MMC_HS200;
+
+	/* Currently only needed in HS200 mode */
+	if (host->timing != MMC_TIMING_MMC_HS200)
+		return 0;
+
+	writel(hrs37_mode, hrs37_reg);
+
+	for (gap = 0; gap <= SDHCI_CDNS_HRS38_BLKGAP_MAX; gap++) {
+		writel(gap, hrs38_reg);
+		ret = mmc_read_tuning(mmc, 512, 32);
+		if (!ret)
+			break;
+	}
+
+	dev_dbg(mmc_dev(mmc), "read block gap tune %s, gap %d\n", ret ? "failed" : "OK", gap);
+	return ret;
+}
+
 /*
  * In SD mode, software must not use the hardware tuning and instead perform
  * an almost identical procedure to eMMC.
@@ -261,6 +317,7 @@ static int sdhci_cdns_execute_tuning(struct sdhci_host *host, u32 opcode)
 	int max_streak = 0;
 	int end_of_streak = 0;
 	int i;
+	int ret;

 	/*
 	 * Do not execute tuning for UHS_SDR50 or UHS_DDR50.
@@ -288,7 +345,11 @@ static int sdhci_cdns_execute_tuning(struct sdhci_host *host, u32 opcode) return -EIO; } - return sdhci_cdns_set_tune_val(host, end_of_streak - max_streak / 2); + ret = sdhci_cdns_set_tune_val(host, end_of_streak - max_streak / 2); + if (ret) + return ret; + + return sdhci_cdns_tune_blkgap(host->mmc); } static void sdhci_cdns_set_uhs_signaling(struct sdhci_host *host, From fe2e8f17a56fe14915cd6b709e9f56a16029f104 Mon Sep 17 00:00:00 2001 From: Fange Zhang Date: Mon, 18 Aug 2025 12:41:23 +0800 Subject: [PATCH 0305/3206] pinctrl: sx150x: Make the driver tristate Set PINCTRL_SX150X config option as a tristate and add MODULE_DEVICE_TABLE()/MODULE_LICENSE() to export appropriate information. Signed-off-by: Fange Zhang Link: https://lore.kernel.org/20250818-modularize-sx150x-gpio-expander-v1-1-c2a027200fed@oss.qualcomm.com Signed-off-by: Linus Walleij --- drivers/pinctrl/Kconfig | 2 +- drivers/pinctrl/pinctrl-sx150x.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index 463cf252c275bb..6714aecce32587 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -550,7 +550,7 @@ config PINCTRL_STMFX interrupt-controller. config PINCTRL_SX150X - bool "Semtech SX150x I2C GPIO expander pinctrl driver" + tristate "Semtech SX150x I2C GPIO expander pinctrl driver" depends on I2C=y select PINMUX select PINCONF diff --git a/drivers/pinctrl/pinctrl-sx150x.c b/drivers/pinctrl/pinctrl-sx150x.c index 53cf8168b274c5..b613568b42b73d 100644 --- a/drivers/pinctrl/pinctrl-sx150x.c +++ b/drivers/pinctrl/pinctrl-sx150x.c @@ -863,6 +863,7 @@ static const struct of_device_id sx150x_of_match[] = { { .compatible = "semtech,sx1509q", .data = &sx1509q_device_data }, {}, }; +MODULE_DEVICE_TABLE(of, sx150x_of_match); static int sx150x_reset(struct sx150x_pinctrl *pctl) { @@ -1266,3 +1267,6 @@ static int __init sx150x_init(void) return i2c_add_driver(&sx150x_driver); } subsys_initcall(sx150x_init); + +MODULE_DESCRIPTION("Semtech SX150x I2C GPIO expander pinctrl driver"); +MODULE_LICENSE("GPL"); From f7f804633c91f0fbf03eefbae39eec2205191a82 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 18 Aug 2025 15:30:14 +0300 Subject: [PATCH 0306/3206] regulator: rt5133: Fix IS_ERR() vs NULL bug in rt5133_validate_vendor_info() The "priv->cdata" pointer isn't an error pointer; this should be a NULL check instead. Otherwise it leads to a NULL pointer dereference in the caller, rt5133_probe(). 
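The distinction, in sketch form: IS_ERR() only recognizes pointers in the encoded errno range, so it can never catch a plain NULL left behind by a failed lookup:

	priv->cdata = NULL;		/* lookup loop found no match */

	if (IS_ERR(priv->cdata))	/* always false for NULL */
		return PTR_ERR(priv->cdata);

	if (!priv->cdata)		/* the check the code actually needs */
		return -ENODEV;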
Fixes: 714165e1c4b0 ("regulator: rt5133: Add RT5133 PMIC regulator Support") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/aKMc1oK-7yY4cD3K@stanley.mountain Signed-off-by: Mark Brown --- drivers/regulator/rt5133-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/rt5133-regulator.c b/drivers/regulator/rt5133-regulator.c index 39532618e73d49..129b1f13c88028 100644 --- a/drivers/regulator/rt5133-regulator.c +++ b/drivers/regulator/rt5133-regulator.c @@ -510,7 +510,7 @@ static int rt5133_validate_vendor_info(struct rt5133_priv *priv) break; } } - if (IS_ERR(priv->cdata)) { + if (!priv->cdata) { dev_err(priv->dev, "Failed to find regulator match version\n"); return -ENODEV; } From ec0be3cdf40b5302248f3fb27a911cc630e8b855 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Tue, 19 Aug 2025 12:25:43 +0800 Subject: [PATCH 0307/3206] regulator: consumer.rst: document bulk operations The current consumer documentation does not include bulk operations, providing an example of how to acquire multiple regulators by calling regulator_get() multiple times. That solution is valid and slightly simpler for a small amount of regulators, but it does not scale well. Document the bulk operations to get, enable and disable regulators. Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20250819-reg_consumer_doc-v1-1-b631fc0d35a3@gmail.com Signed-off-by: Mark Brown --- Documentation/power/regulator/consumer.rst | 30 +++++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/Documentation/power/regulator/consumer.rst b/Documentation/power/regulator/consumer.rst index 9d2416f63f6e36..c01675b25a901e 100644 --- a/Documentation/power/regulator/consumer.rst +++ b/Documentation/power/regulator/consumer.rst @@ -23,10 +23,18 @@ To release the regulator the consumer driver should call :: regulator_put(regulator); Consumers can be supplied by more than one regulator e.g. codec consumer with -analog and digital supplies :: +analog and digital supplies by means of bulk operations :: + + struct regulator_bulk_data supplies[2]; + + supplies[0].supply = "Vcc"; /* digital core */ + supplies[1].supply = "Avdd"; /* analog */ + + ret = regulator_bulk_get(dev, ARRAY_SIZE(supplies), supplies); + + // convenience helper to call regulator_put() on multiple regulators + regulator_bulk_free(ARRAY_SIZE(supplies), supplies); - digital = regulator_get(dev, "Vcc"); /* digital core */ - analog = regulator_get(dev, "Avdd"); /* analog */ The regulator access functions regulator_get() and regulator_put() will usually be called in your device drivers probe() and remove() respectively. @@ -51,11 +59,21 @@ A consumer can determine if a regulator is enabled by calling:: This will return > zero when the regulator is enabled. +A set of regulators can be enabled with a single bulk operation :: + + int regulator_bulk_enable(int num_consumers, + struct regulator_bulk_data *consumers); + A consumer can disable its supply when no longer needed by calling:: int regulator_disable(regulator); +Or a number of them :: + + int regulator_bulk_disable(int num_consumers, + struct regulator_bulk_data *consumers); + NOTE: This may not disable the supply if it's shared with other consumers. The regulator will only be disabled when the enabled reference count is zero. 
@@ -64,11 +82,15 @@ Finally, a regulator can be forcefully disabled in the case of an emergency:: int regulator_force_disable(regulator); +This operation is also supported for multiple regulators :: + + int regulator_bulk_force_disable(int num_consumers, + struct regulator_bulk_data *consumers); + NOTE: this will immediately and forcefully shutdown the regulator output. All consumers will be powered off. - 3. Regulator Voltage Control & Status (dynamic drivers) ======================================================= From 8efe8816a7ddc98a733ca3326ae8794750bbbd34 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 18 Aug 2025 20:08:48 +0200 Subject: [PATCH 0308/3206] rust: alloc: add ARCH_KMALLOC_MINALIGN to bindgen blocklist For some architectures, such as X86_64, ARCH_KMALLOC_MINALIGN is not resolvable for bindgen. E.g. due to being defined as __alignof__(unsigned long long). Hence, we have to create a rust helper, i.e. let the C compiler evaluate the expression and store it in a const. However, if for other architectures, such as arm64, ARCH_KMALLOC_MINALIGN does evaluate to something that can be directly processed by bindgen, we end up with multiple definitions of ARCH_KMALLOC_MINALIGN in the generated bindings. error[E0428]: the name `ARCH_KMALLOC_MINALIGN` is defined multiple times --> /builddir/build/BUILD/kernel-6.17.0-build/kernel-next-20250818/linux-6.17.0-0.0.next.20250818.423.vanilla.fc44.aarch64/rust/bindings/bindings_generated.rs:134545:1 | 9622 | pub const ARCH_KMALLOC_MINALIGN: u32 = 8; | ----------------------------------------- previous definition of the value `ARCH_KMALLOC_MINALIGN` here ... 134545 | pub const ARCH_KMALLOC_MINALIGN: usize = 8; | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ `ARCH_KMALLOC_MINALIGN` redefined here | = note: `ARCH_KMALLOC_MINALIGN` must be defined only once in the value namespace of this module To fix this up, add ARCH_KMALLOC_MINALIGN to the blocklist of bindgen, such that we always only generate the symbol from the rust helper. Reported-by: Thorsten Leemhuis Closes: https://lore.kernel.org/all/8aa05f08-ef6e-4dfe-9453-beaab7b3cb98@leemhuis.info/ Fixes: 1b1a946dc2b5 ("rust: alloc: specify the minimum alignment of each allocator") Tested-by: Thorsten Leemhuis Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20250818180923.192042-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- rust/bindgen_parameters | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/bindgen_parameters b/rust/bindgen_parameters index 0f96af8b9a7fee..02b371b98b3912 100644 --- a/rust/bindgen_parameters +++ b/rust/bindgen_parameters @@ -34,3 +34,4 @@ # We use const helpers to aid bindgen, to avoid conflicts when constants are # recognized, block generation of the non-helper constants. --blocklist-item ARCH_SLAB_MINALIGN +--blocklist-item ARCH_KMALLOC_MINALIGN From 046c56178a73ad7883fc38e6caa0474025c0fe86 Mon Sep 17 00:00:00 2001 From: Shankari Anand Date: Sat, 16 Aug 2025 17:14:09 +0530 Subject: [PATCH 0309/3206] rust,cred: update AlwaysRefCounted import to sync::aref Update the import of `AlwaysRefCounted` in `cred.rs` to use `sync::aref` instead of `types`. This is part of the ongoing effort to move `ARef` and `AlwaysRefCounted` to the `sync` module for better modularity. 
Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/linux/issues/1173 Signed-off-by: Shankari Anand Acked-by: Serge Hallyn Reviewed-by: Benno Lossin Reviewed-by: Alice Ryhl [PM: subj tweak] Signed-off-by: Paul Moore --- rust/kernel/cred.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/rust/kernel/cred.rs b/rust/kernel/cred.rs index 2599f01e8b285f..4a2229542fb73d 100644 --- a/rust/kernel/cred.rs +++ b/rust/kernel/cred.rs @@ -8,11 +8,7 @@ //! //! Reference: -use crate::{ - bindings, - task::Kuid, - types::{AlwaysRefCounted, Opaque}, -}; +use crate::{bindings, sync::aref::AlwaysRefCounted, task::Kuid, types::Opaque}; /// Wraps the kernel's `struct cred`. /// From 71b69f817e91b588030d7d47ddbdc4857a92eb4e Mon Sep 17 00:00:00 2001 From: Kyle Manna Date: Tue, 19 Aug 2025 09:17:39 -0700 Subject: [PATCH 0310/3206] EDAC/ie31200: Add two more Intel Alder Lake-S SoCs for EDAC support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Host Device IDs (DID0) correspond to: * Intel Core i7-12700K * Intel Core i5-12600K See documentation: * 12th Generation Intel® Core™ Processors Datasheet * Volume 1 of 2, Doc. No.: 655258, Rev.: 011 * https://edc.intel.com/output/DownloadPdfDocument?id=8297 (PDF) Signed-off-by: Kyle Manna Signed-off-by: Tony Luck Reviewed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20250819161739.3241152-1-kyle@kylemanna.com --- drivers/edac/ie31200_edac.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c index 5c1fa1c0d12e3c..5a080ab65476da 100644 --- a/drivers/edac/ie31200_edac.c +++ b/drivers/edac/ie31200_edac.c @@ -99,6 +99,8 @@ /* Alder Lake-S */ #define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1 0x4660 +#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2 0x4668 /* 8P+4E, e.g. i7-12700K */ +#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3 0x4648 /* 6P+4E, e.g. i5-12600K */ /* Bartlett Lake-S */ #define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1 0x4639 @@ -761,6 +763,8 @@ static const struct pci_device_id ie31200_pci_tbl[] = { { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_6), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_HX_1), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_2), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_3), (kernel_ulong_t)&rpl_s_cfg}, From 2e6fe1bbefd9c059c3787d1c620fe67343a94dff Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Wed, 6 Aug 2025 14:57:07 +0800 Subject: [PATCH 0311/3206] EDAC/i10nm: Skip DIMM enumeration on a disabled memory controller When loading the i10nm_edac driver on some Intel Granite Rapids servers, a call trace may appear as follows: UBSAN: shift-out-of-bounds in drivers/edac/skx_common.c:453:16 shift exponent -66 is negative ... __ubsan_handle_shift_out_of_bounds+0x1e3/0x390 skx_get_dimm_info.cold+0x47/0xd40 [skx_edac_common] i10nm_get_dimm_config+0x23e/0x390 [i10nm_edac] skx_register_mci+0x159/0x220 [skx_edac_common] i10nm_init+0xcb0/0x1ff0 [i10nm_edac] ... 
This occurs because some BIOS may disable a memory controller if there aren't any memory DIMMs populated on this memory controller. The DIMMMTR register of this disabled memory controller contains the invalid value ~0, resulting in the call trace above. Fix this call trace by skipping DIMM enumeration on a disabled memory controller. Fixes: ba987eaaabf9 ("EDAC/i10nm: Add Intel Granite Rapids server support") Reported-by: Jose Jesus Ambriz Meza Reported-by: Chia-Lin Kao (AceLan) Closes: https://lore.kernel.org/all/20250730063155.2612379-1-acelan.kao@canonical.com/ Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: Chia-Lin Kao (AceLan) Link: https://lore.kernel.org/r/20250806065707.3533345-1-qiuxu.zhuo@intel.com --- drivers/edac/i10nm_base.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index bf4171ac191d3f..9d00f247f4e0ea 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -1057,6 +1057,15 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan) return !!GET_BITFIELD(mcmtr, 2, 2); } +static bool i10nm_channel_disabled(struct skx_imc *imc, int chan) +{ + u32 mcmtr = I10NM_GET_MCMTR(imc, chan); + + edac_dbg(1, "mc%d ch%d mcmtr reg %x\n", imc->mc, chan, mcmtr); + + return (mcmtr == ~0 || GET_BITFIELD(mcmtr, 18, 18)); +} + static int i10nm_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) { @@ -1070,6 +1079,11 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, if (!imc->mbase) continue; + if (i10nm_channel_disabled(imc, i)) { + edac_dbg(1, "mc%d ch%d is disabled.\n", imc->mc, i); + continue; + } + ndimms = 0; if (res_cfg->type != GNR) From 219af5dfce98fe254335ea79c2bf98428c38370c Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 31 Jul 2025 22:55:28 +0800 Subject: [PATCH 0312/3206] EDAC/{skx_common,skx}: Use configuration data, not global macros Use model-specific configuration data for the number of memory controllers per socket, channels per memory controller, and DIMMs per channel as intended, instead of relying on global macros for maximum values. No functional changes intended. 
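The shape of the conversion, sketched on one loop: bounds move from compile-time maxima to the per-model res_config fields (names as used in the hunks below):

	/* before: fixed upper bound shared by all models */
	for (i = 0; i < SKX_NUM_CHANNELS; i++)
		...
	/* after: bound taken from the model-specific configuration */
	for (i = 0; i < cfg->ddr_chan_num; i++)
		...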
Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20250731145534.2759334-2-qiuxu.zhuo@intel.com --- drivers/edac/skx_base.c | 33 ++++++++++++++++++++------------- drivers/edac/skx_common.c | 16 +++++++++------- drivers/edac/skx_common.h | 1 + 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 29897b21fb8e35..078ddf95cc6e60 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -33,6 +33,15 @@ static unsigned int nvdimm_count; #define MASK26 0x3FFFFFF /* Mask for 2^26 */ #define MASK29 0x1FFFFFFF /* Mask for 2^29 */ +static struct res_config skx_cfg = { + .type = SKX, + .decs_did = 0x2016, + .busno_cfg_offset = 0xcc, + .ddr_imc_num = 2, + .ddr_chan_num = 3, + .ddr_dimm_num = 2, +}; + static struct skx_dev *get_skx_dev(struct pci_bus *bus, u8 idx) { struct skx_dev *d; @@ -52,7 +61,7 @@ enum munittype { struct munit { u16 did; - u16 devfn[SKX_NUM_IMC]; + u16 devfn[2]; u8 busidx; u8 per_socket; enum munittype mtype; @@ -89,11 +98,11 @@ static int get_all_munits(const struct munit *m) if (!pdev) break; ndev++; - if (m->per_socket == SKX_NUM_IMC) { - for (i = 0; i < SKX_NUM_IMC; i++) + if (m->per_socket == skx_cfg.ddr_imc_num) { + for (i = 0; i < skx_cfg.ddr_imc_num; i++) if (m->devfn[i] == pdev->devfn) break; - if (i == SKX_NUM_IMC) + if (i == skx_cfg.ddr_imc_num) goto fail; } d = get_skx_dev(pdev->bus, m->busidx); @@ -157,12 +166,6 @@ static int get_all_munits(const struct munit *m) return -ENODEV; } -static struct res_config skx_cfg = { - .type = SKX, - .decs_did = 0x2016, - .busno_cfg_offset = 0xcc, -}; - static const struct x86_cpu_id skx_cpuids[] = { X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_cfg), { } @@ -186,11 +189,11 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) /* Only the mcmtr on the first channel is effective */ pci_read_config_dword(imc->chan[0].cdev, 0x87c, &mcmtr); - for (i = 0; i < SKX_NUM_CHANNELS; i++) { + for (i = 0; i < cfg->ddr_chan_num; i++) { ndimms = 0; pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap); pci_read_config_dword(imc->chan[i].cdev, 0x400, &mcddrtcfg); - for (j = 0; j < SKX_NUM_DIMMS; j++) { + for (j = 0; j < cfg->ddr_dimm_num; j++) { dimm = edac_get_dimm(mci, i, j, 0); pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); @@ -620,6 +623,7 @@ static int __init skx_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + skx_set_res_cfg(cfg); rc = skx_get_hi_lo(0x2034, off, &skx_tolm, &skx_tohm); if (rc) @@ -652,10 +656,13 @@ static int __init skx_init(void) goto fail; edac_dbg(2, "src_id = %d\n", src_id); - for (i = 0; i < SKX_NUM_IMC; i++) { + for (i = 0; i < cfg->ddr_imc_num; i++) { d->imc[i].mc = mc++; d->imc[i].lmc = i; d->imc[i].src_id = src_id; + d->imc[i].num_channels = cfg->ddr_chan_num; + d->imc[i].num_dimms = cfg->ddr_dimm_num; + rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, skx_get_dimm_config, cfg); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 39c733dbc5b9fb..cc7d36cf7f3bc2 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -320,10 +320,10 @@ static int get_width(u32 mtr) */ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) { + int ndev = 0, imc_num = cfg->ddr_imc_num + cfg->hbm_imc_num; struct pci_dev *pdev, *prev; struct skx_dev *d; u32 reg; - int ndev = 0; prev = NULL; for (;;) { @@ -354,8 +354,10 @@ int skx_get_all_bus_mappings(struct res_config 
*cfg, struct list_head **list) d->seg = GET_BITFIELD(reg, 16, 23); } - edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x\n", - d->bus[0], d->bus[1], d->bus[2], d->bus[3]); + d->num_imc = imc_num; + + edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x, imcs %d\n", + d->bus[0], d->bus[1], d->bus[2], d->bus[3], imc_num); list_add_tail(&d->list, &dev_edac_list); prev = pdev; @@ -541,10 +543,10 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, /* Allocate a new MC control structure */ layers[0].type = EDAC_MC_LAYER_CHANNEL; - layers[0].size = NUM_CHANNELS; + layers[0].size = imc->num_channels; layers[0].is_virt_csrow = false; layers[1].type = EDAC_MC_LAYER_SLOT; - layers[1].size = NUM_DIMMS; + layers[1].size = imc->num_dimms; layers[1].is_virt_csrow = true; mci = edac_mc_alloc(imc->mc, ARRAY_SIZE(layers), layers, sizeof(struct skx_pvt)); @@ -784,7 +786,7 @@ void skx_remove(void) list_for_each_entry_safe(d, tmp, &dev_edac_list, list) { list_del(&d->list); - for (i = 0; i < NUM_IMC; i++) { + for (i = 0; i < d->num_imc; i++) { if (d->imc[i].mci) skx_unregister_mci(&d->imc[i]); @@ -794,7 +796,7 @@ void skx_remove(void) if (d->imc[i].mbase) iounmap(d->imc[i].mbase); - for (j = 0; j < NUM_CHANNELS; j++) { + for (j = 0; j < d->imc[i].num_channels; j++) { if (d->imc[i].chan[j].cdev) pci_dev_put(d->imc[i].chan[j].cdev); } diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index ec4966f7ea40b3..3f6007a972679b 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -134,6 +134,7 @@ struct skx_dev { struct pci_dev *uracu; /* for i10nm CPU */ struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; + int num_imc; /* * Some server BIOS may hide certain memory controllers, and the * EDAC driver skips those hidden memory controllers. However, the From 59cfc06a874e01633865d0b05eba47b09b8b3a8f Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 31 Jul 2025 22:55:29 +0800 Subject: [PATCH 0313/3206] EDAC/skx_common: Move mc_mapping to be a field inside struct skx_imc The mc_mapping and imc fields of struct skx_dev have the same size, NUM_IMC. Move mc_mapping to be a field inside struct skx_imc to prepare for making the imc array of memory controller instances a flexible array. No functional changes intended. Suggested-by: Tony Luck Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20250731145534.2759334-3-qiuxu.zhuo@intel.com --- drivers/edac/skx_common.c | 8 ++++---- drivers/edac/skx_common.h | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index cc7d36cf7f3bc2..3f6a074d685c93 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -131,7 +131,7 @@ static void skx_init_mc_mapping(struct skx_dev *d) * EDAC driver. 
 	 */
 	for (int i = 0; i < NUM_IMC; i++)
-		d->mc_mapping[i] = i;
+		d->imc[i].mc_mapping = i;
 }

 void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc)
@@ -139,16 +139,16 @@
 	edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n",
 		 pmc, lmc);

-	d->mc_mapping[pmc] = lmc;
+	d->imc[pmc].mc_mapping = lmc;
 }
 EXPORT_SYMBOL_GPL(skx_set_mc_mapping);

 static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc)
 {
 	edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n",
-		 pmc, d->mc_mapping[pmc]);
+		 pmc, d->imc[pmc].mc_mapping);

-	return d->mc_mapping[pmc];
+	return d->imc[pmc].mc_mapping;
 }

 static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 3f6007a972679b..95d61d23f89ef4 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -135,16 +135,6 @@ struct skx_dev {
 	struct pci_dev *pcu_cr3; /* for HBM memory detection */
 	u32 mcroute;
 	int num_imc;
-	/*
-	 * Some server BIOS may hide certain memory controllers, and the
-	 * EDAC driver skips those hidden memory controllers. However, the
-	 * ADXL still decodes memory error address using physical memory
-	 * controller indices. The mapping table is used to convert the
-	 * physical indices (reported by ADXL) to the logical indices
-	 * (used the EDAC driver) of present memory controllers during the
-	 * error handling process.
-	 */
-	u8 mc_mapping[NUM_IMC];
 	struct skx_imc {
 		struct mem_ctl_info *mci;
 		struct pci_dev *mdev; /* for i10nm CPU */
@@ -156,6 +146,16 @@ struct skx_dev {
 		u8 mc;	/* system wide mc# */
 		u8 lmc;	/* socket relative mc# */
 		u8 src_id;
+		/*
+		 * Some server BIOS may hide certain memory controllers, and the
+		 * EDAC driver skips those hidden memory controllers. However, the
+		 * ADXL still decodes memory error address using physical memory
+		 * controller indices. The mapping table is used to convert the
+		 * physical indices (reported by ADXL) to the logical indices
+		 * (used by the EDAC driver) of present memory controllers during
+		 * the error handling process.
+		 */
+		u8 mc_mapping;
 		struct skx_channel {
 			struct pci_dev *cdev;
 			struct pci_dev *edev;

From 30b47b71fdc0091db5a4fa9503ad630aeb0ab3eb Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo
Date: Thu, 31 Jul 2025 22:55:30 +0800
Subject: [PATCH 0314/3206] EDAC/skx_common: Swap memory controller index
 mapping

The current mapping of memory controller indices is from physical
index [1] to logical index [2], as shown below:

  skx_dev->imc[pmc].mc_mapping = lmc

Since skx_dev->imc[] is an array of present memory controller instances,
mapping memory controller indices from logical index to physical index,
as shown below, is more reasonable. This is also a preparatory step for
making skx_dev->imc[] a flexible array.

  skx_dev->imc[lmc].mc_mapping = pmc

Both mappings are equivalent. No functional changes intended.

[1] Indices for memory controllers include both those present to the OS
    and those disabled by BIOS.
[2] Indices for memory controllers present to the OS.
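Sketched, the two directions: the old table answered "which logical index does physical index pmc map to?" by direct lookup; the new layout stores pmc per present instance, so the same question becomes a short scan (mirroring the hunks below):

	/* old: d->mc_mapping[pmc] == lmc */
	/* new: d->imc[lmc].mc_mapping == pmc, so the reverse lookup is: */
	for (lmc = 0; lmc < d->num_imc; lmc++)
		if (d->imc[lmc].mc_mapping == pmc)
			return lmc;
	return -1;	/* pmc hidden by BIOS or out of range */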
Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20250731145534.2759334-4-qiuxu.zhuo@intel.com --- drivers/edac/skx_common.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 3f6a074d685c93..e03268e9073f19 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -130,7 +130,7 @@ static void skx_init_mc_mapping(struct skx_dev *d) * the logical indices of the memory controllers enumerated by the * EDAC driver. */ - for (int i = 0; i < NUM_IMC; i++) + for (int i = 0; i < d->num_imc; i++) d->imc[i].mc_mapping = i; } @@ -139,22 +139,28 @@ void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc) edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n", pmc, lmc); - d->imc[pmc].mc_mapping = lmc; + d->imc[lmc].mc_mapping = pmc; } EXPORT_SYMBOL_GPL(skx_set_mc_mapping); -static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc) +static int skx_get_mc_mapping(struct skx_dev *d, u8 pmc) { - edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n", - pmc, d->imc[pmc].mc_mapping); + for (int lmc = 0; lmc < d->num_imc; lmc++) { + if (d->imc[lmc].mc_mapping == pmc) { + edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n", + pmc, lmc); - return d->imc[pmc].mc_mapping; + return lmc; + } + } + + return -1; } static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) { + int i, lmc, len = 0; struct skx_dev *d; - int i, len = 0; if (res->addr >= skx_tohm || (res->addr >= skx_tolm && res->addr < BIT_ULL(32))) { @@ -218,7 +224,13 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) return false; } - res->imc = skx_get_mc_mapping(d, res->imc); + lmc = skx_get_mc_mapping(d, res->imc); + if (lmc < 0) { + skx_printk(KERN_ERR, "No lmc for imc %d\n", res->imc); + return false; + } + + res->imc = lmc; for (i = 0; i < adxl_component_count; i++) { if (adxl_values[i] == ~0x0ull) From 43060ca5332462903f57c2ff9b49b69124618070 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 31 Jul 2025 22:55:31 +0800 Subject: [PATCH 0315/3206] EDAC/skx_common: Make skx_dev->imc[] a flexible array The current skx->imc[NUM_IMC] array of memory controller instances is sized using the macro NUM_IMC. Each time EDAC support is added for a new CPU, NUM_IMC needs to be updated to ensure it is greater than or equal to the number of memory controllers for the new CPU. This approach is inconvenient and results in memory waste for older CPUs with fewer memory controllers. To address this, make skx->imc[] a flexible array and determine its size from configuration data or at runtime. 
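The allocation pattern this enables, in sketch form — struct_size() (from <linux/overflow.h>) computes sizeof(*d) plus imc_num trailing array elements with overflow checking:

	struct skx_dev *d;

	d = kzalloc(struct_size(d, imc, imc_num), GFP_KERNEL);
	if (!d)
		return -ENOMEM;
	d->num_imc = imc_num;	/* remember the runtime length of d->imc[] */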
Suggested-by: Tony Luck Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20250731145534.2759334-5-qiuxu.zhuo@intel.com --- drivers/edac/skx_common.c | 3 ++- drivers/edac/skx_common.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index e03268e9073f19..09187043c0451d 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -343,7 +344,7 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) if (!pdev) break; ndev++; - d = kzalloc(sizeof(*d), GFP_KERNEL); + d = kzalloc(struct_size(d, imc, imc_num), GFP_KERNEL); if (!d) { pci_dev_put(pdev); return -ENOMEM; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 95d61d23f89ef4..e7038fd45d0631 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -172,7 +172,7 @@ struct skx_dev { u8 colbits; } dimms[NUM_DIMMS]; } chan[NUM_CHANNELS]; - } imc[NUM_IMC]; + } imc[]; }; struct skx_pvt { From 91ded20fa2fefe2620b0abb998838312ea8bbfd2 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 31 Jul 2025 22:55:32 +0800 Subject: [PATCH 0316/3206] EDAC/skx_common: Remove redundant upper bound check for res->imc The following upper bound check for the memory controller physical index decoded by ADXL is the only place where use the macro 'NUM_IMC' is used: res->imc > NUM_IMC - 1 Since this check is already covered by skx_get_mc_mapping(), meaning no memory controller logical index exists for an invalid memory controller physical index decoded by ADXL, remove the redundant upper bound check so that the definition for 'NUM_IMC' can be cleaned up (in another patch). Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20250731145534.2759334-6-qiuxu.zhuo@intel.com --- drivers/edac/skx_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 09187043c0451d..5899a111049551 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -207,7 +207,7 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) res->cs = (int)adxl_values[component_indices[INDEX_CS]]; } - if (res->imc > NUM_IMC - 1 || res->imc < 0) { + if (res->imc < 0) { skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); return false; } From f7a29a37373baa0bd0823a5d55e97dc7f75a4dab Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 31 Jul 2025 22:55:33 +0800 Subject: [PATCH 0317/3206] EDAC/i10nm: Reallocate skx_dev list if preconfigured cnt != runtime cnt Ideally, read the present DDR memory controller count first and then allocate the skx_dev list using this count. However, this approach requires adding a significant amount of code similar to skx_get_all_bus_mappings() to obtain the PCI bus mappings for the first socket and use these mappings along with the related PCI register offset to read the memory controller count. Given that the Granite Rapids CPU is the only one that can detect the count of memory controllers at runtime (other CPUs use the count in the configuration data), to reduce code complexity, reallocate the skx_dev list only if the preconfigured count of DDR memory controllers differs from the count read at runtime for Granite Rapids CPU. 
Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20250731145534.2759334-7-qiuxu.zhuo@intel.com --- drivers/edac/i10nm_base.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 9d00f247f4e0ea..2010a47149f448 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -468,17 +468,18 @@ static int i10nm_get_imc_num(struct res_config *cfg) return -ENODEV; } - if (imc_num > I10NM_NUM_DDR_IMC) { - i10nm_printk(KERN_ERR, "Need to make I10NM_NUM_DDR_IMC >= %d\n", imc_num); - return -EINVAL; - } - if (cfg->ddr_imc_num != imc_num) { /* - * Store the number of present DDR memory controllers. + * Update the configuration data to reflect the number of + * present DDR memory controllers. */ cfg->ddr_imc_num = imc_num; edac_dbg(2, "Set DDR MC number: %d", imc_num); + + /* Release and reallocate skx_dev list with the updated number. */ + skx_remove(); + if (skx_get_all_bus_mappings(cfg, &i10nm_edac_list) <= 0) + return -ENODEV; } return 0; From a95dcf3d67430493554a9fe3a3a8717c21554508 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 31 Jul 2025 22:55:34 +0800 Subject: [PATCH 0318/3206] EDAC/skx_common: Remove unused *NUM*_IMC macros There are no references to the *NUM*_IMC macros, so remove them. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20250731145534.2759334-8-qiuxu.zhuo@intel.com --- drivers/edac/skx_common.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index e7038fd45d0631..73ba89786cdfd5 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -29,23 +29,18 @@ #define GET_BITFIELD(v, lo, hi) \ (((v) & GENMASK_ULL((hi), (lo))) >> (lo)) -#define SKX_NUM_IMC 2 /* Memory controllers per socket */ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_DDR_IMC 12 #define I10NM_NUM_DDR_CHANNELS 2 #define I10NM_NUM_DDR_DIMMS 2 -#define I10NM_NUM_HBM_IMC 16 #define I10NM_NUM_HBM_CHANNELS 2 #define I10NM_NUM_HBM_DIMMS 1 -#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC) #define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) #define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) -#define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) #define NUM_CHANNELS MAX(SKX_NUM_CHANNELS, I10NM_NUM_CHANNELS) #define NUM_DIMMS MAX(SKX_NUM_DIMMS, I10NM_NUM_DIMMS) From 34d4d093077a5c60d452a2f42d0c1a08e55b8614 Mon Sep 17 00:00:00 2001 From: Thierry Bultel Date: Fri, 8 Aug 2025 14:30:16 +0100 Subject: [PATCH 0319/3206] pinctrl: renesas: Add support for RZ/T2H Add a pin control and GPIO driver for the Renesas RZ/T2H (R9A09G077) SoC. 
Signed-off-by: Thierry Bultel Co-developed-by: Lad Prabhakar Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250808133017.2053637-3-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/Kconfig | 12 + drivers/pinctrl/renesas/Makefile | 1 + drivers/pinctrl/renesas/pinctrl-rzt2h.c | 797 ++++++++++++++++++++++++ 3 files changed, 810 insertions(+) create mode 100644 drivers/pinctrl/renesas/pinctrl-rzt2h.c diff --git a/drivers/pinctrl/renesas/Kconfig b/drivers/pinctrl/renesas/Kconfig index 99ae34a56871c2..c8b84c158e86df 100644 --- a/drivers/pinctrl/renesas/Kconfig +++ b/drivers/pinctrl/renesas/Kconfig @@ -44,6 +44,7 @@ config PINCTRL_RENESAS select PINCTRL_RZG2L if ARCH_R9A09G047 select PINCTRL_RZG2L if ARCH_R9A09G056 select PINCTRL_RZG2L if ARCH_R9A09G057 + select PINCTRL_RZT2H if ARCH_R9A09G077 select PINCTRL_PFC_SH7203 if CPU_SUBTYPE_SH7203 select PINCTRL_PFC_SH7264 if CPU_SUBTYPE_SH7264 select PINCTRL_PFC_SH7269 if CPU_SUBTYPE_SH7269 @@ -302,6 +303,17 @@ config PINCTRL_RZN1 help This selects pinctrl driver for Renesas RZ/N1 devices. +config PINCTRL_RZT2H + bool "pin control support for RZ/N2H and RZ/T2H" if COMPILE_TEST + depends on 64BIT && OF + select GPIOLIB + select GENERIC_PINCTRL_GROUPS + select GENERIC_PINMUX_FUNCTIONS + select GENERIC_PINCONF + help + This selects GPIO and pinctrl driver for Renesas RZ/T2H + platforms. + config PINCTRL_RZV2M bool "pin control support for RZ/V2M" if COMPILE_TEST depends on OF diff --git a/drivers/pinctrl/renesas/Makefile b/drivers/pinctrl/renesas/Makefile index 2ba623e04bf8c7..1c5144a1c4b8f0 100644 --- a/drivers/pinctrl/renesas/Makefile +++ b/drivers/pinctrl/renesas/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_PINCTRL_RZA1) += pinctrl-rza1.o obj-$(CONFIG_PINCTRL_RZA2) += pinctrl-rza2.o obj-$(CONFIG_PINCTRL_RZG2L) += pinctrl-rzg2l.o obj-$(CONFIG_PINCTRL_RZN1) += pinctrl-rzn1.o +obj-$(CONFIG_PINCTRL_RZT2H) += pinctrl-rzt2h.o obj-$(CONFIG_PINCTRL_RZV2M) += pinctrl-rzv2m.o ifeq ($(CONFIG_COMPILE_TEST),y) diff --git a/drivers/pinctrl/renesas/pinctrl-rzt2h.c b/drivers/pinctrl/renesas/pinctrl-rzt2h.c new file mode 100644 index 00000000000000..28f5d48e33db87 --- /dev/null +++ b/drivers/pinctrl/renesas/pinctrl-rzt2h.c @@ -0,0 +1,797 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas RZ/T2H Pin Control and GPIO driver core + * + * Based on drivers/pinctrl/renesas/pinctrl-rzg2l.c + * + * Copyright (C) 2025 Renesas Electronics Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "../core.h" +#include "../pinconf.h" +#include "../pinmux.h" + +#define DRV_NAME "pinctrl-rzt2h" + +#define P(m) (0x001 * (m)) +#define PM(m) (0x200 + 2 * (m)) +#define PMC(m) (0x400 + (m)) +#define PFC(m) (0x600 + 8 * (m)) +#define PIN(m) (0x800 + (m)) +#define RSELP(m) (0xc00 + (m)) + +#define PM_MASK GENMASK(1, 0) +#define PM_PIN_MASK(pin) (PM_MASK << ((pin) * 2)) +#define PM_INPUT BIT(0) +#define PM_OUTPUT BIT(1) + +#define PFC_MASK GENMASK_ULL(5, 0) +#define PFC_PIN_MASK(pin) (PFC_MASK << ((pin) * 8)) + +/* + * Use 16 lower bits [15:0] for pin identifier + * Use 8 higher bits [23:16] for pin mux function + */ +#define MUX_PIN_ID_MASK GENMASK(15, 0) +#define MUX_FUNC_MASK GENMASK(23, 16) + +#define RZT2H_PIN_ID_TO_PORT(id) ((id) / RZT2H_PINS_PER_PORT) +#define RZT2H_PIN_ID_TO_PIN(id) ((id) % RZT2H_PINS_PER_PORT) + +#define RZT2H_MAX_SAFETY_PORTS 12 + +struct rzt2h_pinctrl_data { + unsigned int n_port_pins; + const u8 *port_pin_configs; + unsigned int n_ports; +}; + +struct rzt2h_pinctrl { + struct pinctrl_dev *pctl; + struct pinctrl_desc desc; + struct pinctrl_pin_desc *pins; + const struct rzt2h_pinctrl_data *data; + void __iomem *base0, *base1; + struct device *dev; + struct gpio_chip gpio_chip; + struct pinctrl_gpio_range gpio_range; + spinlock_t lock; /* lock read/write registers */ + struct mutex mutex; /* serialize adding groups and functions */ + bool safety_port_enabled; +}; + +#define RZT2H_GET_BASE(pctrl, port) \ + ((port) > RZT2H_MAX_SAFETY_PORTS ? (pctrl)->base0 : (pctrl)->base1) + +#define RZT2H_PINCTRL_REG_ACCESS(size, type) \ +static inline void rzt2h_pinctrl_write##size(struct rzt2h_pinctrl *pctrl, u8 port, \ + type val, unsigned int offset) \ +{ \ + write##size(val, RZT2H_GET_BASE(pctrl, port) + offset); \ +} \ +static inline type rzt2h_pinctrl_read##size(struct rzt2h_pinctrl *pctrl, u8 port, \ + unsigned int offset) \ +{ \ + return read##size(RZT2H_GET_BASE(pctrl, port) + offset); \ +} + +RZT2H_PINCTRL_REG_ACCESS(b, u8) +RZT2H_PINCTRL_REG_ACCESS(w, u16) +RZT2H_PINCTRL_REG_ACCESS(q, u64) + +static int rzt2h_validate_pin(struct rzt2h_pinctrl *pctrl, unsigned int offset) +{ + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 pin = RZT2H_PIN_ID_TO_PIN(offset); + u8 pincfg; + + if (offset >= pctrl->data->n_port_pins || port >= pctrl->data->n_ports) + return -EINVAL; + + if (!pctrl->safety_port_enabled && port <= RZT2H_MAX_SAFETY_PORTS) + return -EINVAL; + + pincfg = pctrl->data->port_pin_configs[port]; + return (pincfg & BIT(pin)) ? 
0 : -EINVAL; +} + +static void rzt2h_pinctrl_set_pfc_mode(struct rzt2h_pinctrl *pctrl, + u8 port, u8 pin, u8 func) +{ + u64 reg64; + u16 reg16; + + guard(spinlock_irqsave)(&pctrl->lock); + + /* Set pin to 'Non-use (Hi-Z input protection)' */ + reg16 = rzt2h_pinctrl_readw(pctrl, port, PM(port)); + reg16 &= ~PM_PIN_MASK(pin); + rzt2h_pinctrl_writew(pctrl, port, reg16, PM(port)); + + /* Temporarily switch to GPIO mode with PMC register */ + reg16 = rzt2h_pinctrl_readb(pctrl, port, PMC(port)); + rzt2h_pinctrl_writeb(pctrl, port, reg16 & ~BIT(pin), PMC(port)); + + /* Select Pin function mode with PFC register */ + reg64 = rzt2h_pinctrl_readq(pctrl, port, PFC(port)); + reg64 &= ~PFC_PIN_MASK(pin); + rzt2h_pinctrl_writeq(pctrl, port, reg64 | ((u64)func << (pin * 8)), PFC(port)); + + /* Switch to Peripheral pin function with PMC register */ + reg16 = rzt2h_pinctrl_readb(pctrl, port, PMC(port)); + rzt2h_pinctrl_writeb(pctrl, port, reg16 | BIT(pin), PMC(port)); +}; + +static int rzt2h_pinctrl_set_mux(struct pinctrl_dev *pctldev, + unsigned int func_selector, + unsigned int group_selector) +{ + struct rzt2h_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + const struct function_desc *func; + struct group_desc *group; + const unsigned int *pins; + unsigned int i; + u8 *psel_val; + int ret; + + func = pinmux_generic_get_function(pctldev, func_selector); + if (!func) + return -EINVAL; + + group = pinctrl_generic_get_group(pctldev, group_selector); + if (!group) + return -EINVAL; + + psel_val = func->data; + pins = group->grp.pins; + + for (i = 0; i < group->grp.npins; i++) { + dev_dbg(pctrl->dev, "port:%u pin:%u PSEL:%u\n", + RZT2H_PIN_ID_TO_PORT(pins[i]), RZT2H_PIN_ID_TO_PIN(pins[i]), + psel_val[i]); + ret = rzt2h_validate_pin(pctrl, pins[i]); + if (ret) + return ret; + + rzt2h_pinctrl_set_pfc_mode(pctrl, RZT2H_PIN_ID_TO_PORT(pins[i]), + RZT2H_PIN_ID_TO_PIN(pins[i]), psel_val[i]); + } + + return 0; +}; + +static int rzt2h_map_add_config(struct pinctrl_map *map, + const char *group_or_pin, + enum pinctrl_map_type type, + unsigned long *configs, + unsigned int num_configs) +{ + unsigned long *cfgs; + + cfgs = kmemdup_array(configs, num_configs, sizeof(*cfgs), GFP_KERNEL); + if (!cfgs) + return -ENOMEM; + + map->type = type; + map->data.configs.group_or_pin = group_or_pin; + map->data.configs.configs = cfgs; + map->data.configs.num_configs = num_configs; + + return 0; +} + +static int rzt2h_dt_subnode_to_map(struct pinctrl_dev *pctldev, + struct device_node *np, + struct device_node *parent, + struct pinctrl_map **map, + unsigned int *num_maps, + unsigned int *index) +{ + struct rzt2h_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct pinctrl_map *maps = *map; + unsigned int nmaps = *num_maps; + unsigned long *configs = NULL; + unsigned int num_pinmux = 0; + unsigned int idx = *index; + unsigned int num_pins, i; + unsigned int num_configs; + struct property *pinmux; + struct property *prop; + int ret, gsel, fsel; + const char **pin_fn; + unsigned int *pins; + const char *name; + const char *pin; + u8 *psel_val; + + pinmux = of_find_property(np, "pinmux", NULL); + if (pinmux) + num_pinmux = pinmux->length / sizeof(u32); + + ret = of_property_count_strings(np, "pins"); + if (ret == -EINVAL) { + num_pins = 0; + } else if (ret < 0) { + dev_err(pctrl->dev, "Invalid pins list in DT\n"); + return ret; + } else { + num_pins = ret; + } + + if (!num_pinmux && !num_pins) + return 0; + + if (num_pinmux && num_pins) { + dev_err(pctrl->dev, + "DT node must contain either a pinmux or pins and not both\n"); + 
return -EINVAL; + } + + ret = pinconf_generic_parse_dt_config(np, pctldev, &configs, &num_configs); + if (ret < 0) + return ret; + + if (num_pins && !num_configs) { + dev_err(pctrl->dev, "DT node must contain a config\n"); + ret = -ENODEV; + goto done; + } + + if (num_pinmux) { + nmaps += 1; + if (num_configs) + nmaps += 1; + } + + if (num_pins) + nmaps += num_pins; + + maps = krealloc_array(maps, nmaps, sizeof(*maps), GFP_KERNEL); + if (!maps) { + ret = -ENOMEM; + goto done; + } + + *map = maps; + *num_maps = nmaps; + if (num_pins) { + of_property_for_each_string(np, "pins", prop, pin) { + ret = rzt2h_map_add_config(&maps[idx], pin, + PIN_MAP_TYPE_CONFIGS_PIN, + configs, num_configs); + if (ret < 0) + goto done; + + idx++; + } + ret = 0; + goto done; + } + + pins = devm_kcalloc(pctrl->dev, num_pinmux, sizeof(*pins), GFP_KERNEL); + psel_val = devm_kcalloc(pctrl->dev, num_pinmux, sizeof(*psel_val), + GFP_KERNEL); + pin_fn = devm_kzalloc(pctrl->dev, sizeof(*pin_fn), GFP_KERNEL); + if (!pins || !psel_val || !pin_fn) { + ret = -ENOMEM; + goto done; + } + + /* Collect pin locations and mux settings from DT properties */ + for (i = 0; i < num_pinmux; ++i) { + u32 value; + + ret = of_property_read_u32_index(np, "pinmux", i, &value); + if (ret) + goto done; + pins[i] = FIELD_GET(MUX_PIN_ID_MASK, value); + psel_val[i] = FIELD_GET(MUX_FUNC_MASK, value); + } + + if (parent) { + name = devm_kasprintf(pctrl->dev, GFP_KERNEL, "%pOFn.%pOFn", + parent, np); + if (!name) { + ret = -ENOMEM; + goto done; + } + } else { + name = np->name; + } + + if (num_configs) { + ret = rzt2h_map_add_config(&maps[idx], name, + PIN_MAP_TYPE_CONFIGS_GROUP, + configs, num_configs); + if (ret < 0) + goto done; + + idx++; + } + + scoped_guard(mutex, &pctrl->mutex) { + /* Register a single pin group listing all the pins we read from DT */ + gsel = pinctrl_generic_add_group(pctldev, name, pins, num_pinmux, NULL); + if (gsel < 0) { + ret = gsel; + goto done; + } + + /* + * Register a single group function where the 'data' is an array PSEL + * register values read from DT. 
+ */ + pin_fn[0] = name; + fsel = pinmux_generic_add_function(pctldev, name, pin_fn, 1, psel_val); + if (fsel < 0) { + ret = fsel; + goto remove_group; + } + } + + maps[idx].type = PIN_MAP_TYPE_MUX_GROUP; + maps[idx].data.mux.group = name; + maps[idx].data.mux.function = name; + idx++; + + dev_dbg(pctrl->dev, "Parsed %pOF with %d pins\n", np, num_pinmux); + ret = 0; + goto done; + +remove_group: + pinctrl_generic_remove_group(pctldev, gsel); +done: + *index = idx; + kfree(configs); + return ret; +} + +static void rzt2h_dt_free_map(struct pinctrl_dev *pctldev, + struct pinctrl_map *map, + unsigned int num_maps) +{ + unsigned int i; + + if (!map) + return; + + for (i = 0; i < num_maps; ++i) { + if (map[i].type == PIN_MAP_TYPE_CONFIGS_GROUP || + map[i].type == PIN_MAP_TYPE_CONFIGS_PIN) + kfree(map[i].data.configs.configs); + } + kfree(map); +} + +static int rzt2h_dt_node_to_map(struct pinctrl_dev *pctldev, + struct device_node *np, + struct pinctrl_map **map, + unsigned int *num_maps) +{ + struct rzt2h_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + unsigned int index; + int ret; + + *map = NULL; + *num_maps = 0; + index = 0; + + for_each_child_of_node_scoped(np, child) { + ret = rzt2h_dt_subnode_to_map(pctldev, child, np, map, + num_maps, &index); + if (ret < 0) + goto done; + } + + if (*num_maps == 0) { + ret = rzt2h_dt_subnode_to_map(pctldev, np, NULL, map, + num_maps, &index); + if (ret < 0) + goto done; + } + + if (*num_maps) + return 0; + + dev_err(pctrl->dev, "no mapping found in node %pOF\n", np); + ret = -EINVAL; + +done: + rzt2h_dt_free_map(pctldev, *map, *num_maps); + return ret; +} + +static const struct pinctrl_ops rzt2h_pinctrl_pctlops = { + .get_groups_count = pinctrl_generic_get_group_count, + .get_group_name = pinctrl_generic_get_group_name, + .get_group_pins = pinctrl_generic_get_group_pins, + .dt_node_to_map = rzt2h_dt_node_to_map, + .dt_free_map = rzt2h_dt_free_map, +}; + +static const struct pinmux_ops rzt2h_pinctrl_pmxops = { + .get_functions_count = pinmux_generic_get_function_count, + .get_function_name = pinmux_generic_get_function_name, + .get_function_groups = pinmux_generic_get_function_groups, + .set_mux = rzt2h_pinctrl_set_mux, + .strict = true, +}; + +static int rzt2h_gpio_request(struct gpio_chip *chip, unsigned int offset) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + int ret; + u8 reg; + + ret = rzt2h_validate_pin(pctrl, offset); + if (ret) + return ret; + + ret = pinctrl_gpio_request(chip, offset); + if (ret) + return ret; + + guard(spinlock_irqsave)(&pctrl->lock); + + /* Select GPIO mode in PMC Register */ + reg = rzt2h_pinctrl_readb(pctrl, port, PMC(port)); + reg &= ~BIT(bit); + rzt2h_pinctrl_writeb(pctrl, port, reg, PMC(port)); + + return 0; +} + +static void rzt2h_gpio_set_direction(struct rzt2h_pinctrl *pctrl, u32 port, + u8 bit, bool output) +{ + u16 reg; + + guard(spinlock_irqsave)(&pctrl->lock); + + reg = rzt2h_pinctrl_readw(pctrl, port, PM(port)); + reg &= ~PM_PIN_MASK(bit); + + reg |= (output ? 
PM_OUTPUT : PM_INPUT) << (bit * 2); + rzt2h_pinctrl_writew(pctrl, port, reg, PM(port)); +} + +static int rzt2h_gpio_get_direction(struct gpio_chip *chip, unsigned int offset) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + u16 reg; + int ret; + + ret = rzt2h_validate_pin(pctrl, offset); + if (ret) + return ret; + + if (rzt2h_pinctrl_readb(pctrl, port, PMC(port)) & BIT(bit)) + return -EINVAL; + + reg = rzt2h_pinctrl_readw(pctrl, port, PM(port)); + reg = (reg >> (bit * 2)) & PM_MASK; + if (reg & PM_OUTPUT) + return GPIO_LINE_DIRECTION_OUT; + if (reg & PM_INPUT) + return GPIO_LINE_DIRECTION_IN; + + return -EINVAL; +} + +static int rzt2h_gpio_set(struct gpio_chip *chip, unsigned int offset, + int value) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + u8 reg; + + guard(spinlock_irqsave)(&pctrl->lock); + + reg = rzt2h_pinctrl_readb(pctrl, port, P(port)); + if (value) + rzt2h_pinctrl_writeb(pctrl, port, reg | BIT(bit), P(port)); + else + rzt2h_pinctrl_writeb(pctrl, port, reg & ~BIT(bit), P(port)); + + return 0; +} + +static int rzt2h_gpio_get(struct gpio_chip *chip, unsigned int offset) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + u16 reg; + + reg = rzt2h_pinctrl_readw(pctrl, port, PM(port)); + reg = (reg >> (bit * 2)) & PM_MASK; + if (reg & PM_INPUT) + return !!(rzt2h_pinctrl_readb(pctrl, port, PIN(port)) & BIT(bit)); + if (reg & PM_OUTPUT) + return !!(rzt2h_pinctrl_readb(pctrl, port, P(port)) & BIT(bit)); + + return -EINVAL; +} + +static int rzt2h_gpio_direction_input(struct gpio_chip *chip, + unsigned int offset) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + + rzt2h_gpio_set_direction(pctrl, port, bit, false); + + return 0; +} + +static int rzt2h_gpio_direction_output(struct gpio_chip *chip, + unsigned int offset, int value) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + + rzt2h_gpio_set(chip, offset, value); + rzt2h_gpio_set_direction(pctrl, port, bit, true); + + return 0; +} + +static void rzt2h_gpio_free(struct gpio_chip *chip, unsigned int offset) +{ + pinctrl_gpio_free(chip, offset); + + /* + * Set the GPIO as an input to ensure that the next GPIO request won't + * drive the GPIO pin as an output. 
+ */ + rzt2h_gpio_direction_input(chip, offset); +} + +static const char * const rzt2h_gpio_names[] = { + "P00_0", "P00_1", "P00_2", "P00_3", "P00_4", "P00_5", "P00_6", "P00_7", + "P01_0", "P01_1", "P01_2", "P01_3", "P01_4", "P01_5", "P01_6", "P01_7", + "P02_0", "P02_1", "P02_2", "P02_3", "P02_4", "P02_5", "P02_6", "P02_7", + "P03_0", "P03_1", "P03_2", "P03_3", "P03_4", "P03_5", "P03_6", "P03_7", + "P04_0", "P04_1", "P04_2", "P04_3", "P04_4", "P04_5", "P04_6", "P04_7", + "P05_0", "P05_1", "P05_2", "P05_3", "P05_4", "P05_5", "P05_6", "P05_7", + "P06_0", "P06_1", "P06_2", "P06_3", "P06_4", "P06_5", "P06_6", "P06_7", + "P07_0", "P07_1", "P07_2", "P07_3", "P07_4", "P07_5", "P07_6", "P07_7", + "P08_0", "P08_1", "P08_2", "P08_3", "P08_4", "P08_5", "P08_6", "P08_7", + "P09_0", "P09_1", "P09_2", "P09_3", "P09_4", "P09_5", "P09_6", "P09_7", + "P10_0", "P10_1", "P10_2", "P10_3", "P10_4", "P10_5", "P10_6", "P10_7", + "P11_0", "P11_1", "P11_2", "P11_3", "P11_4", "P11_5", "P11_6", "P11_7", + "P12_0", "P12_1", "P12_2", "P12_3", "P12_4", "P12_5", "P12_6", "P12_7", + "P13_0", "P13_1", "P13_2", "P13_3", "P13_4", "P13_5", "P13_6", "P13_7", + "P14_0", "P14_1", "P14_2", "P14_3", "P14_4", "P14_5", "P14_6", "P14_7", + "P15_0", "P15_1", "P15_2", "P15_3", "P15_4", "P15_5", "P15_6", "P15_7", + "P16_0", "P16_1", "P16_2", "P16_3", "P16_4", "P16_5", "P16_6", "P16_7", + "P17_0", "P17_1", "P17_2", "P17_3", "P17_4", "P17_5", "P17_6", "P17_7", + "P18_0", "P18_1", "P18_2", "P18_3", "P18_4", "P18_5", "P18_6", "P18_7", + "P19_0", "P19_1", "P19_2", "P19_3", "P19_4", "P19_5", "P19_6", "P19_7", + "P20_0", "P20_1", "P20_2", "P20_3", "P20_4", "P20_5", "P20_6", "P20_7", + "P21_0", "P21_1", "P21_2", "P21_3", "P21_4", "P21_5", "P21_6", "P21_7", + "P22_0", "P22_1", "P22_2", "P22_3", "P22_4", "P22_5", "P22_6", "P22_7", + "P23_0", "P23_1", "P23_2", "P23_3", "P23_4", "P23_5", "P23_6", "P23_7", + "P24_0", "P24_1", "P24_2", "P24_3", "P24_4", "P24_5", "P24_6", "P24_7", + "P25_0", "P25_1", "P25_2", "P25_3", "P25_4", "P25_5", "P25_6", "P25_7", + "P26_0", "P26_1", "P26_2", "P26_3", "P26_4", "P26_5", "P26_6", "P26_7", + "P27_0", "P27_1", "P27_2", "P27_3", "P27_4", "P27_5", "P27_6", "P27_7", + "P28_0", "P28_1", "P28_2", "P28_3", "P28_4", "P28_5", "P28_6", "P28_7", + "P29_0", "P29_1", "P29_2", "P29_3", "P29_4", "P29_5", "P29_6", "P29_7", + "P30_0", "P30_1", "P30_2", "P30_3", "P30_4", "P30_5", "P30_6", "P30_7", + "P31_0", "P31_1", "P31_2", "P31_3", "P31_4", "P31_5", "P31_6", "P31_7", + "P32_0", "P32_1", "P32_2", "P32_3", "P32_4", "P32_5", "P32_6", "P32_7", + "P33_0", "P33_1", "P33_2", "P33_3", "P33_4", "P33_5", "P33_6", "P33_7", + "P34_0", "P34_1", "P34_2", "P34_3", "P34_4", "P34_5", "P34_6", "P34_7", + "P35_0", "P35_1", "P35_2", "P35_3", "P35_4", "P35_5", "P35_6", "P35_7", +}; + +static int rzt2h_gpio_register(struct rzt2h_pinctrl *pctrl) +{ + struct pinctrl_gpio_range *range = &pctrl->gpio_range; + struct gpio_chip *chip = &pctrl->gpio_chip; + struct device *dev = pctrl->dev; + struct of_phandle_args of_args; + int ret; + + ret = of_parse_phandle_with_fixed_args(dev->of_node, "gpio-ranges", 3, 0, &of_args); + if (ret) + return dev_err_probe(dev, ret, "Unable to parse gpio-ranges\n"); + + if (of_args.args[0] != 0 || of_args.args[1] != 0 || + of_args.args[2] != pctrl->data->n_port_pins) + return dev_err_probe(dev, -EINVAL, + "gpio-ranges does not match selected SOC\n"); + + chip->base = -1; + chip->parent = dev; + chip->owner = THIS_MODULE; + chip->ngpio = of_args.args[2]; + chip->names = rzt2h_gpio_names; + chip->request = rzt2h_gpio_request; + 
chip->free = rzt2h_gpio_free; + chip->get_direction = rzt2h_gpio_get_direction; + chip->direction_input = rzt2h_gpio_direction_input; + chip->direction_output = rzt2h_gpio_direction_output; + chip->get = rzt2h_gpio_get; + chip->set = rzt2h_gpio_set; + chip->label = dev_name(dev); + + range->id = 0; + range->pin_base = 0; + range->base = 0; + range->npins = chip->ngpio; + range->name = chip->label; + range->gc = chip; + + ret = devm_gpiochip_add_data(dev, chip, pctrl); + if (ret) + return dev_err_probe(dev, ret, "gpiochip registration failed\n"); + + return ret; +} + +static int rzt2h_pinctrl_register(struct rzt2h_pinctrl *pctrl) +{ + struct pinctrl_desc *desc = &pctrl->desc; + struct device *dev = pctrl->dev; + struct pinctrl_pin_desc *pins; + unsigned int i, j; + int ret; + + desc->name = DRV_NAME; + desc->npins = pctrl->data->n_port_pins; + desc->pctlops = &rzt2h_pinctrl_pctlops; + desc->pmxops = &rzt2h_pinctrl_pmxops; + desc->owner = THIS_MODULE; + + pins = devm_kcalloc(dev, desc->npins, sizeof(*pins), GFP_KERNEL); + if (!pins) + return -ENOMEM; + + pctrl->pins = pins; + desc->pins = pins; + + for (i = 0, j = 0; i < pctrl->data->n_port_pins; i++) { + pins[i].number = i; + pins[i].name = rzt2h_gpio_names[i]; + if (i && !(i % RZT2H_PINS_PER_PORT)) + j++; + } + + ret = devm_pinctrl_register_and_init(dev, desc, pctrl, &pctrl->pctl); + if (ret) + return dev_err_probe(dev, ret, "pinctrl registration failed\n"); + + ret = pinctrl_enable(pctrl->pctl); + if (ret) + return dev_err_probe(dev, ret, "pinctrl enable failed\n"); + + return rzt2h_gpio_register(pctrl); +} + +static int rzt2h_pinctrl_cfg_regions(struct platform_device *pdev, + struct rzt2h_pinctrl *pctrl) +{ + struct resource *res; + + pctrl->base0 = devm_platform_ioremap_resource_byname(pdev, "nsr"); + if (IS_ERR(pctrl->base0)) + return PTR_ERR(pctrl->base0); + + /* + * Open-coded instead of using devm_platform_ioremap_resource_byname() + * because the "srs" region is optional. 
+ */ + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "srs"); + if (res) { + u8 port; + + pctrl->base1 = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pctrl->base1)) + return PTR_ERR(pctrl->base1); + + pctrl->safety_port_enabled = true; + + /* Configure to select safety region 0x812c0xxx */ + for (port = 0; port <= RZT2H_MAX_SAFETY_PORTS; port++) + writeb(0x0, pctrl->base1 + RSELP(port)); + } + + return 0; +} + +static int rzt2h_pinctrl_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rzt2h_pinctrl *pctrl; + int ret; + + pctrl = devm_kzalloc(dev, sizeof(*pctrl), GFP_KERNEL); + if (!pctrl) + return -ENOMEM; + + pctrl->dev = dev; + pctrl->data = of_device_get_match_data(dev); + + ret = rzt2h_pinctrl_cfg_regions(pdev, pctrl); + if (ret) + return ret; + + spin_lock_init(&pctrl->lock); + mutex_init(&pctrl->mutex); + platform_set_drvdata(pdev, pctrl); + + return rzt2h_pinctrl_register(pctrl); +} + +static const u8 r9a09g077_gpio_configs[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, +}; + +static struct rzt2h_pinctrl_data r9a09g077_data = { + .n_port_pins = ARRAY_SIZE(r9a09g077_gpio_configs) * RZT2H_PINS_PER_PORT, + .port_pin_configs = r9a09g077_gpio_configs, + .n_ports = ARRAY_SIZE(r9a09g077_gpio_configs), +}; + +static const struct of_device_id rzt2h_pinctrl_of_table[] = { + { + .compatible = "renesas,r9a09g077-pinctrl", + .data = &r9a09g077_data, + }, + { /* sentinel */ } +}; + +static struct platform_driver rzt2h_pinctrl_driver = { + .driver = { + .name = DRV_NAME, + .of_match_table = of_match_ptr(rzt2h_pinctrl_of_table), + .suppress_bind_attrs = true, + }, + .probe = rzt2h_pinctrl_probe, +}; + +static int __init rzt2h_pinctrl_init(void) +{ + return platform_driver_register(&rzt2h_pinctrl_driver); +} +core_initcall(rzt2h_pinctrl_init); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Thierry Bultel "); +MODULE_AUTHOR("Lad Prabhakar "); +MODULE_DESCRIPTION("Pin and gpio controller driver for the RZ/T2H family"); From d1d31e2739ff063da1e85cd9b44316ca5cccdba8 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Fri, 8 Aug 2025 14:30:17 +0100 Subject: [PATCH 0320/3206] pinctrl: renesas: rzt2h: Add support for RZ/N2H The Renesas RZ/N2H (R9A09G087) SoC shares a similar pin controller architecture with the RZ/T2H (R9A09G077) SoC, differing primarily in the number of supported pins: 576 on RZ/N2H versus 729 on RZ/T2H. Add the necessary pin configuration data and compatible string to enable support for the RZ/N2H SoC in the RZ/T2H pinctrl driver. 
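The per-SoC match data added below is simply an array of per-port
bitmasks, one bit per pin. A condensed sketch of the availability check
the driver performs (simplified from rzt2h_validate_pin() in
pinctrl-rzt2h.c; the helper name here is illustrative):

	/* A pin exists iff its bit is set in the port's mask. */
	static bool rzt2h_pin_available(const struct rzt2h_pinctrl_data *data,
					u8 port, u8 pin)
	{
		return data->port_pin_configs[port] & BIT(pin);
	}

A mask of 0xff thus exposes all eight pins of a port, while e.g. the
leading 0x1f in the RZ/N2H table exposes only pins 0-4.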
Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250808133017.2053637-4-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/Kconfig | 1 + drivers/pinctrl/renesas/pinctrl-rzt2h.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/drivers/pinctrl/renesas/Kconfig b/drivers/pinctrl/renesas/Kconfig index c8b84c158e86df..8cbd79a1341468 100644 --- a/drivers/pinctrl/renesas/Kconfig +++ b/drivers/pinctrl/renesas/Kconfig @@ -45,6 +45,7 @@ config PINCTRL_RENESAS select PINCTRL_RZG2L if ARCH_R9A09G056 select PINCTRL_RZG2L if ARCH_R9A09G057 select PINCTRL_RZT2H if ARCH_R9A09G077 + select PINCTRL_RZT2H if ARCH_R9A09G087 select PINCTRL_PFC_SH7203 if CPU_SUBTYPE_SH7203 select PINCTRL_PFC_SH7264 if CPU_SUBTYPE_SH7264 select PINCTRL_PFC_SH7269 if CPU_SUBTYPE_SH7269 diff --git a/drivers/pinctrl/renesas/pinctrl-rzt2h.c b/drivers/pinctrl/renesas/pinctrl-rzt2h.c index 28f5d48e33db87..3872638f5ebb39 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzt2h.c +++ b/drivers/pinctrl/renesas/pinctrl-rzt2h.c @@ -762,17 +762,33 @@ static const u8 r9a09g077_gpio_configs[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, }; +static const u8 r9a09g087_gpio_configs[] = { + 0x1f, 0xff, 0xff, 0x1f, 0x00, 0xfe, 0xff, 0x00, 0x7e, 0xf0, 0xff, 0x01, + 0xff, 0xff, 0xff, 0x00, 0xe0, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, + 0xe0, 0xff, 0xff, 0x7f, 0x00, 0xfe, 0xff, 0x7f, 0x00, 0xfc, 0x7f, +}; + static struct rzt2h_pinctrl_data r9a09g077_data = { .n_port_pins = ARRAY_SIZE(r9a09g077_gpio_configs) * RZT2H_PINS_PER_PORT, .port_pin_configs = r9a09g077_gpio_configs, .n_ports = ARRAY_SIZE(r9a09g077_gpio_configs), }; +static struct rzt2h_pinctrl_data r9a09g087_data = { + .n_port_pins = ARRAY_SIZE(r9a09g087_gpio_configs) * RZT2H_PINS_PER_PORT, + .port_pin_configs = r9a09g087_gpio_configs, + .n_ports = ARRAY_SIZE(r9a09g087_gpio_configs), +}; + static const struct of_device_id rzt2h_pinctrl_of_table[] = { { .compatible = "renesas,r9a09g077-pinctrl", .data = &r9a09g077_data, }, + { + .compatible = "renesas,r9a09g087-pinctrl", + .data = &r9a09g087_data, + }, { /* sentinel */ } }; From b8dc2302544b66367bedec19ee16477ff5a25a92 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 18 Aug 2025 16:24:03 +0200 Subject: [PATCH 0321/3206] pinctrl: stm32: Constify static 'pinctrl_desc' The local static 'struct pinctrl_desc' is not modified, so can be made const for code safety. 
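As a generic illustration (not taken from this patch), constifying a
static descriptor lets the compiler reject direct writes and place the
object in .rodata:

	static const struct pinctrl_desc pdesc = {
		.name = "example",
	};

	/* pdesc.npins = 0; -- now rejected at compile time */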
Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/20250818142402.132008-2-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/stm32/pinctrl-stm32-hdp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c index dea49b9aabf2ae..a8a4c2eee837ad 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c @@ -576,7 +576,7 @@ static const struct pinmux_ops stm32_hdp_pinmux_ops = { .gpio_set_direction = NULL, }; -static struct pinctrl_desc stm32_hdp_pdesc = { +static const struct pinctrl_desc stm32_hdp_pdesc = { .name = DRIVER_NAME, .pins = stm32_hdp_pins, .npins = ARRAY_SIZE(stm32_hdp_pins), From d072148a8631f102de60ed5a3a827e85d09d24f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Aug 2025 10:25:00 +0200 Subject: [PATCH 0322/3206] fs: add a FMODE_ flag to indicate IOCB_HAS_METADATA availability Currently the kernel will happily route io_uring requests with metadata to file operations that don't support it. Add a FMODE_ flag to guard that. Fixes: 4de2ce04c862 ("fs: introduce IOCB_HAS_METADATA for metadata") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250819082517.2038819-2-hch@lst.de Signed-off-by: Christian Brauner --- block/fops.c | 3 +++ include/linux/fs.h | 3 ++- io_uring/rw.c | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/block/fops.c b/block/fops.c index 82451ac8ff25dd..08e7c21bd9f103 100644 --- a/block/fops.c +++ b/block/fops.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -687,6 +688,8 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + if (blk_get_integrity(bdev->bd_disk)) + filp->f_mode |= FMODE_HAS_METADATA; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d7051f..601d036a6c78ef 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -149,7 +149,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) -/* FMODE_* bit 13 */ +/* Supports IOCB_HAS_METADATA */ +#define FMODE_HAS_METADATA ((__force fmode_t)(1 << 13)) /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) diff --git a/io_uring/rw.c b/io_uring/rw.c index 52a5b950b2e5e9..af5a54b5db1233 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -886,6 +886,9 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data; + if (!(file->f_mode & FMODE_HAS_METADATA)) + return -EINVAL; + /* * We have a union of meta fields with wpq used for buffered-io * in io_async_rw, so fail it here. From 2729a60bbfb9215997f25372ebe9b7964f038296 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Aug 2025 10:25:01 +0200 Subject: [PATCH 0323/3206] block: don't silently ignore metadata for sync read/write The block fops don't try to handle metadata for synchronous requests, probably because the completion handler looks at dio->iocb which is not valid for synchronous requests. But silently ignoring metadata (or warning in case of __blkdev_direct_IO_simple) is a really bad idea as that can cause silent data corruption if a user ever shows up. 
Instead simply handle metadata for synchronous requests as the completion handler can simply check for bio_integrity() as the block layer default integrity will already be freed at this point, and thus bio_integrity() will only return true for user mapped integrity. Fixes: 3d8b5a22d404 ("block: add support to pass user meta buffer") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250819082517.2038819-3-hch@lst.de Reviewed-by: Martin K. Petersen Signed-off-by: Christian Brauner --- block/fops.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/fops.c b/block/fops.c index 08e7c21bd9f103..ddbc69c0922baa 100644 --- a/block/fops.c +++ b/block/fops.c @@ -55,7 +55,6 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct bio bio; ssize_t ret; - WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -132,7 +131,7 @@ static void blkdev_bio_end_io(struct bio *bio) if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; - if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) + if (bio_integrity(bio)) bio_integrity_unmap_user(bio); if (atomic_dec_and_test(&dio->ref)) { @@ -234,7 +233,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } bio->bi_opf |= REQ_NOWAIT; } - if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { + if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); if (unlikely(ret)) goto fail; @@ -302,7 +301,7 @@ static void blkdev_bio_end_io_async(struct bio *bio) ret = blk_status_to_errno(bio->bi_status); } - if (iocb->ki_flags & IOCB_HAS_METADATA) + if (bio_integrity(bio)) bio_integrity_unmap_user(bio); iocb->ki_complete(iocb, ret); @@ -423,7 +422,8 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (likely(nr_pages <= BIO_MAX_VECS)) { + if (likely(nr_pages <= BIO_MAX_VECS && + !(iocb->ki_flags & IOCB_HAS_METADATA))) { if (is_sync_kiocb(iocb)) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); From b5bbbb70e5f5e1b14b2d5e3d88b8e18cd99f986d Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 19 Aug 2025 12:20:38 +0200 Subject: [PATCH 0324/3206] s390/bpf: Use direct calls and jumps where possible After the V!=R rework (commit c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces")), all kernel code and related data are allocated within a 4G region, making it possible to use relative addressing in BPF code more extensively. Convert as many indirect calls and jumps to direct calls as possible, namely: * BPF_CALL * __bpf_tramp_enter() * __bpf_tramp_exit() * __bpf_prog_enter() * __bpf_prog_exit() * fentry * fmod_ret * fexit * BPF_TRAMP_F_CALL_ORIG without BPF_TRAMP_F_ORIG_STACK * Trampoline returns without BPF_TRAMP_F_SKIP_FRAME and BPF_TRAMP_F_ORIG_STACK The following indirect calls and jumps remain: * Prog returns * Trampoline returns with BPF_TRAMP_F_SKIP_FRAME or BPF_TRAMP_F_ORIG_STACK * BPF_TAIL_CALL * BPF_TRAMP_F_CALL_ORIG with BPF_TRAMP_F_ORIG_STACK As a result, only one usage of call_r1() remains, so inline it. 
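A simplified sketch of the call-emission choice after this change
(condensed from the diff below; the target_known condition stands in
for the cases enumerated above):

	if (target_known) {
		/* brasl %r14,target: one 6-byte PC-relative direct call */
		EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, target);
	} else {
		/* load the 64-bit address, then call indirectly */
		load_imm64(jit, REG_1, (u64)target);
		EMIT2(0x0d00, REG_14, REG_1);	/* basr %r14,%r1 */
	}

The direct form is possible because, after the V!=R rework, kernel text
and data sit inside one 4 GiB region, well within brasl's signed 32-bit
halfword-relative reach.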
Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250819102116.252203-1-iii@linux.ibm.com --- arch/s390/net/bpf_jit_comp.c | 80 +++++++++++++++--------------------- 1 file changed, 32 insertions(+), 48 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index b2b8eb62b82e02..fd45f03a213cf1 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -674,20 +674,6 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp) _EMIT2(0x07f0 | reg); \ } while (0) -/* - * Call r1 either directly or via __s390_indirect_jump_r1 thunk - */ -static void call_r1(struct bpf_jit *jit) -{ - if (nospec_uses_trampoline()) - /* brasl %r14,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, - __s390_indirect_jump_r1); - else - /* basr %r14,%r1 */ - EMIT2(0x0d00, REG_14, REG_1); -} - /* * Function epilogue */ @@ -1820,10 +1806,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, } } - /* lgrl %w1,func */ - EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); - /* %r1() */ - call_r1(jit); + /* brasl %r14,func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, (void *)func); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); @@ -2534,14 +2518,12 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * goto skip; */ - /* %r1 = __bpf_prog_enter */ - load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p)); /* %r2 = p */ load_imm64(jit, REG_2, (u64)p); /* la %r3,run_ctx_off(%r15) */ EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_prog_enter */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, bpf_trampoline_enter(p)); /* ltgr %r7,%r2 */ EMIT4(0xb9020000, REG_7, REG_2); /* brcl 8,skip */ @@ -2552,15 +2534,13 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * retval = bpf_func(args, p->insnsi); */ - /* %r1 = p->bpf_func */ - load_imm64(jit, REG_1, (u64)p->bpf_func); /* la %r2,bpf_args_off(%r15) */ EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off); /* %r3 = p->insnsi */ if (!p->jited) load_imm64(jit, REG_3, (u64)p->insnsi); - /* %r1() */ - call_r1(jit); + /* brasl %r14,p->bpf_func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, p->bpf_func); /* stg %r2,retval_off(%r15) */ if (save_ret) { if (sign_extend(jit, REG_2, m->ret_size, m->ret_flags)) @@ -2577,16 +2557,14 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * __bpf_prog_exit(p, start, &run_ctx); */ - /* %r1 = __bpf_prog_exit */ - load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p)); /* %r2 = p */ load_imm64(jit, REG_2, (u64)p); /* lgr %r3,%r7 */ EMIT4(0xb9040000, REG_3, REG_7); /* la %r4,run_ctx_off(%r15) */ EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_prog_exit */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, bpf_trampoline_exit(p)); return 0; } @@ -2746,9 +2724,6 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* lgr %r8,%r0 */ EMIT4(0xb9040000, REG_8, REG_0); - } else { - /* %r8 = func_addr + S390X_PATCH_SIZE */ - load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE); } /* @@ -2774,12 +2749,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, * __bpf_tramp_enter(im); */ - /* %r1 = __bpf_tramp_enter */ - load_imm64(jit, REG_1, (u64)__bpf_tramp_enter); /* %r2 = im */ load_imm64(jit, REG_2, (u64)im); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_tramp_enter */ + 
EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, __bpf_tramp_enter); } for (i = 0; i < fentry->nr_links; i++) @@ -2832,10 +2805,19 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* mvc tail_call_cnt(4,%r15),tccnt_off(%r15) */ _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt), 0xf000 | tjit->tccnt_off); - /* lgr %r1,%r8 */ - EMIT4(0xb9040000, REG_1, REG_8); - /* %r1() */ - call_r1(jit); + if (flags & BPF_TRAMP_F_ORIG_STACK) { + if (nospec_uses_trampoline()) + /* brasl %r14,__s390_indirect_jump_r8 */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + __s390_indirect_jump_r8); + else + /* basr %r14,%r8 */ + EMIT2(0x0d00, REG_14, REG_8); + } else { + /* brasl %r14,func_addr+S390X_PATCH_SIZE */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + func_addr + S390X_PATCH_SIZE); + } /* stg %r2,retval_off(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, tjit->retval_off); @@ -2866,12 +2848,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, * __bpf_tramp_exit(im); */ - /* %r1 = __bpf_tramp_exit */ - load_imm64(jit, REG_1, (u64)__bpf_tramp_exit); /* %r2 = im */ load_imm64(jit, REG_2, (u64)im); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_tramp_exit */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, __bpf_tramp_exit); } /* lmg %r2,%rN,reg_args_off(%r15) */ @@ -2880,7 +2860,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, REG_2 + (nr_reg_args - 1), REG_15, tjit->reg_args_off); /* lgr %r1,%r8 */ - if (!(flags & BPF_TRAMP_F_SKIP_FRAME)) + if (!(flags & BPF_TRAMP_F_SKIP_FRAME) && + (flags & BPF_TRAMP_F_ORIG_STACK)) EMIT4(0xb9040000, REG_1, REG_8); /* lmg %r7,%r8,r7_r8_off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15, @@ -2899,9 +2880,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); if (flags & BPF_TRAMP_F_SKIP_FRAME) EMIT_JUMP_REG(14); - else + else if (flags & BPF_TRAMP_F_ORIG_STACK) EMIT_JUMP_REG(1); - + else + /* brcl 0xf,func_addr+S390X_PATCH_SIZE */ + EMIT6_PCREL_RILC_PTR(0xc0040000, 0xf, + func_addr + S390X_PATCH_SIZE); return 0; } From 2693227c1150d58bf82ef45a394a554373be5286 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 19 Aug 2025 22:51:19 +0100 Subject: [PATCH 0325/3206] libbpf: Export bpf_object__prepare symbol Add missing LIBBPF_API macro for bpf_object__prepare function to enable its export. libbpf.map had bpf_object__prepare already listed. Fixes: 1315c28ed809 ("libbpf: Split bpf object load into prepare/load") Signed-off-by: Mykyta Yatsenko Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250819215119.37795-1-mykyta.yatsenko5@gmail.com --- tools/lib/bpf/libbpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 455a957cb702ca..2b86e21190d37b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -252,7 +252,7 @@ bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, * @return 0, on success; negative error code, otherwise, error code is * stored in errno */ -int bpf_object__prepare(struct bpf_object *obj); +LIBBPF_API int bpf_object__prepare(struct bpf_object *obj); /** * @brief **bpf_object__load()** loads BPF object into kernel. 
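For context: LIBBPF_API is libbpf's export annotation. The library is
built with hidden symbol visibility by default, so only symbols carrying
the annotation are exported from libbpf.so (definition as in
tools/lib/bpf/libbpf_common.h, reproduced here from memory):

	#ifndef LIBBPF_API
	#define LIBBPF_API __attribute__((visibility("default")))
	#endif

	LIBBPF_API int bpf_object__prepare(struct bpf_object *obj);

Without the annotation, bpf_object__prepare() stayed hidden even though
libbpf.map already listed it.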
From efe89a30f70753d861340a20365812e93d34a0de Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Fri, 8 Aug 2025 14:46:43 +0200 Subject: [PATCH 0326/3206] s390/sclp: Refactor sclp_cmd.c To improve readability, refactor sclp_cmd.c: * Move defines and structures to the beginning. * Reverse x-mas tree usage. * Remove spaces after casting to eliminate checkpatch warnings. * Remove unnecessary comments. * Reframe certain comments. * Convert all unsigned long long to unsigned long since unsigned long long is a leftover from the 31/32-bit era. * Use correct format specifiers. * Add braces to for loops with bodies containing more than one line. * Sort header files and remove unnecessary ones. * Use __packed and __aligned instead of __attribute((packed, aligned(8))) in structures. Acked-by: Heiko Carstens Signed-off-by: Sumanth Korikkar Signed-off-by: Alexander Gordeev --- drivers/s390/char/sclp_cmd.c | 179 ++++++++++++++++------------------- 1 file changed, 83 insertions(+), 96 deletions(-) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 16469678548f2f..ee0884d99d8849 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -8,31 +8,56 @@ #define KMSG_COMPONENT "sclp_cmd" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#include #include -#include -#include +#include #include -#include -#include -#include -#include +#include +#include #include #include -#include -#include +#include +#include +#include +#include #include -#include -#include -#include -#include +#include #include +#include #include +#include #include "sclp.h" -#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 -#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 +#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 +#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 +/* CPU configuration related functions */ +#define SCLP_CMDW_CONFIGURE_CPU 0x00110001 +#define SCLP_CMDW_DECONFIGURE_CPU 0x00100001 +/* Channel path configuration related functions */ +#define SCLP_CMDW_CONFIGURE_CHPATH 0x000f0001 +#define SCLP_CMDW_DECONFIGURE_CHPATH 0x000e0001 +#define SCLP_CMDW_READ_CHPATH_INFORMATION 0x00030001 + +struct cpu_configure_sccb { + struct sccb_header header; +} __packed __aligned(8); + +struct chp_cfg_sccb { + struct sccb_header header; + u8 ccm; + u8 reserved[6]; + u8 cssid; +} __packed; + +struct chp_info_sccb { + struct sccb_header header; + u8 recognized[SCLP_CHP_INFO_MASK_SIZE]; + u8 standby[SCLP_CHP_INFO_MASK_SIZE]; + u8 configured[SCLP_CHP_INFO_MASK_SIZE]; + u8 ccm; + u8 reserved[6]; + u8 cssid; +} __packed; static void sclp_sync_callback(struct sclp_req *req, void *data) { @@ -64,13 +89,11 @@ int sclp_sync_request_timeout(sclp_cmdw_t cmd, void *sccb, int timeout) request->callback_data = &completion; init_completion(&completion); - /* Perform sclp request. */ rc = sclp_add_request(request); if (rc) goto out; wait_for_completion(&completion); - /* Check response. */ if (request->status != SCLP_REQ_DONE) { pr_warn("sync request failed (cmd=0x%08x, status=0x%02x)\n", cmd, request->status); @@ -81,22 +104,15 @@ int sclp_sync_request_timeout(sclp_cmdw_t cmd, void *sccb, int timeout) return rc; } -/* - * CPU configuration related functions. - */ - -#define SCLP_CMDW_CONFIGURE_CPU 0x00110001 -#define SCLP_CMDW_DECONFIGURE_CPU 0x00100001 - int _sclp_get_core_info(struct sclp_core_info *info) { - int rc; - int length = test_facility(140) ? EXT_SCCB_READ_CPU : PAGE_SIZE; struct read_cpu_info_sccb *sccb; + int rc, length; if (!SCLP_HAS_CPU_INFO) return -EOPNOTSUPP; + length = test_facility(140) ? 
EXT_SCCB_READ_CPU : PAGE_SIZE; sccb = (void *)__get_free_pages(GFP_KERNEL | GFP_DMA | __GFP_ZERO, get_order(length)); if (!sccb) return -ENOMEM; @@ -114,14 +130,10 @@ int _sclp_get_core_info(struct sclp_core_info *info) } sclp_fill_core_info(info, sccb); out: - free_pages((unsigned long) sccb, get_order(length)); + free_pages((unsigned long)sccb, get_order(length)); return rc; } -struct cpu_configure_sccb { - struct sccb_header header; -} __attribute__((packed, aligned(8))); - static int do_core_configure(sclp_cmdw_t cmd) { struct cpu_configure_sccb *sccb; @@ -130,8 +142,8 @@ static int do_core_configure(sclp_cmdw_t cmd) if (!SCLP_HAS_CPU_RECONFIG) return -EOPNOTSUPP; /* - * This is not going to cross a page boundary since we force - * kmalloc to have a minimum alignment of 8 bytes on s390. + * Use kmalloc to have a minimum alignment of 8 bytes and ensure sccb + * is not going to cross a page boundary. */ sccb = kzalloc(sizeof(*sccb), GFP_KERNEL | GFP_DMA); if (!sccb) @@ -183,6 +195,14 @@ struct assign_storage_sccb { u16 rn; } __packed; +struct attach_storage_sccb { + struct sccb_header header; + u16 :16; + u16 assigned; + u32 :32; + u32 entries[]; +} __packed; + int arch_get_memory_phys_device(unsigned long start_pfn) { if (!sclp.rzm) @@ -190,9 +210,9 @@ int arch_get_memory_phys_device(unsigned long start_pfn) return PFN_PHYS(start_pfn) >> ilog2(sclp.rzm); } -static unsigned long long rn2addr(u16 rn) +static unsigned long rn2addr(u16 rn) { - return (unsigned long long) (rn - 1) * sclp.rzm; + return (unsigned long)(rn - 1) * sclp.rzm; } static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) @@ -200,7 +220,7 @@ static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) struct assign_storage_sccb *sccb; int rc; - sccb = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; sccb->header.length = PAGE_SIZE; @@ -219,13 +239,13 @@ static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) break; } out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } static int sclp_assign_storage(u16 rn) { - unsigned long long start; + unsigned long start; int rc; rc = do_assign_storage(SCLP_CMDW_ASSIGN_STORAGE, rn); @@ -241,21 +261,12 @@ static int sclp_unassign_storage(u16 rn) return do_assign_storage(SCLP_CMDW_UNASSIGN_STORAGE, rn); } -struct attach_storage_sccb { - struct sccb_header header; - u16 :16; - u16 assigned; - u32 :32; - u32 entries[]; -} __packed; - static int sclp_attach_storage(u8 id) { struct attach_storage_sccb *sccb; - int rc; - int i; + int rc, i; - sccb = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; sccb->header.length = PAGE_SIZE; @@ -277,7 +288,7 @@ static int sclp_attach_storage(u8 id) break; } out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } @@ -285,7 +296,7 @@ static int sclp_mem_change_state(unsigned long start, unsigned long size, int online) { struct memory_increment *incr; - unsigned long long istart; + unsigned long istart; int rc = 0; list_for_each_entry(incr, &sclp_mem_list, list) { @@ -338,7 +349,7 @@ static int sclp_mem_notifier(struct notifier_block *nb, switch (action) { case MEM_GOING_OFFLINE: /* - * We do not allow to set memory blocks offline that contain + * Do not allow to set memory blocks offline that contain * standby memory. This is done to simplify the "memory online" * case. 
*/ @@ -390,16 +401,16 @@ static struct notifier_block sclp_mem_nb = { .notifier_call = sclp_mem_notifier, }; -static void __init align_to_block_size(unsigned long long *start, - unsigned long long *size, - unsigned long long alignment) +static void __init align_to_block_size(unsigned long *start, + unsigned long *size, + unsigned long alignment) { - unsigned long long start_align, size_align; + unsigned long start_align, size_align; start_align = roundup(*start, alignment); size_align = rounddown(*start + *size, alignment) - start_align; - pr_info("Standby memory at 0x%llx (%lluM of %lluM usable)\n", + pr_info("Standby memory at 0x%lx (%luM of %luM usable)\n", *start, size_align >> 20, *size >> 20); *start = start_align; *size = size_align; @@ -407,7 +418,7 @@ static void __init align_to_block_size(unsigned long long *start, static void __init add_memory_merged(u16 rn) { - unsigned long long start, size, addr, block_size; + unsigned long start, size, addr, block_size; static u16 first_rn, num; if (rn && first_rn && (first_rn + num == rn)) { @@ -417,7 +428,7 @@ static void __init add_memory_merged(u16 rn) if (!first_rn) goto skip_add; start = rn2addr(first_rn); - size = (unsigned long long) num * sclp.rzm; + size = (unsigned long)num * sclp.rzm; if (start >= ident_map_size) goto skip_add; if (start + size > ident_map_size) @@ -426,10 +437,11 @@ static void __init add_memory_merged(u16 rn) align_to_block_size(&start, &size, block_size); if (!size) goto skip_add; - for (addr = start; addr < start + size; addr += block_size) + for (addr = start; addr < start + size; addr += block_size) { add_memory(0, addr, block_size, cpu_has_edat1() ? MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); + } skip_add: first_rn = rn; num = 1; @@ -439,9 +451,10 @@ static void __init sclp_add_standby_memory(void) { struct memory_increment *incr; - list_for_each_entry(incr, &sclp_mem_list, list) + list_for_each_entry(incr, &sclp_mem_list, list) { if (incr->standby) add_memory_merged(incr->rn); + } add_memory_merged(0); } @@ -480,12 +493,13 @@ static int __init sclp_detect_standby_memory(void) struct read_storage_sccb *sccb; int i, id, assigned, rc; - if (oldmem_data.start) /* No standby memory in kdump mode */ + /* No standby memory in kdump mode */ + if (oldmem_data.start) return 0; - if ((sclp.facilities & 0xe00000000000ULL) != 0xe00000000000ULL) + if ((sclp.facilities & 0xe00000000000UL) != 0xe00000000000UL) return 0; rc = -ENOMEM; - sccb = (void *) __get_free_page(GFP_KERNEL | GFP_DMA); + sccb = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); if (!sccb) goto out; assigned = 0; @@ -531,28 +545,13 @@ static int __init sclp_detect_standby_memory(void) goto out; sclp_add_standby_memory(); out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } __initcall(sclp_detect_standby_memory); #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * Channel path configuration related functions. - */ - -#define SCLP_CMDW_CONFIGURE_CHPATH 0x000f0001 -#define SCLP_CMDW_DECONFIGURE_CHPATH 0x000e0001 -#define SCLP_CMDW_READ_CHPATH_INFORMATION 0x00030001 - -struct chp_cfg_sccb { - struct sccb_header header; - u8 ccm; - u8 reserved[6]; - u8 cssid; -} __attribute__((packed)); - static int do_chp_configure(sclp_cmdw_t cmd) { struct chp_cfg_sccb *sccb; @@ -560,8 +559,7 @@ static int do_chp_configure(sclp_cmdw_t cmd) if (!SCLP_HAS_CHP_RECONFIG) return -EOPNOTSUPP; - /* Prepare sccb. 
*/ - sccb = (struct chp_cfg_sccb *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + sccb = (struct chp_cfg_sccb *)get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; sccb->header.length = sizeof(*sccb); @@ -581,7 +579,7 @@ static int do_chp_configure(sclp_cmdw_t cmd) break; } out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } @@ -609,16 +607,6 @@ int sclp_chp_deconfigure(struct chp_id chpid) return do_chp_configure(SCLP_CMDW_DECONFIGURE_CHPATH | chpid.id << 8); } -struct chp_info_sccb { - struct sccb_header header; - u8 recognized[SCLP_CHP_INFO_MASK_SIZE]; - u8 standby[SCLP_CHP_INFO_MASK_SIZE]; - u8 configured[SCLP_CHP_INFO_MASK_SIZE]; - u8 ccm; - u8 reserved[6]; - u8 cssid; -} __attribute__((packed)); - /** * sclp_chp_read_info - perform read channel-path information sclp command * @info: resulting channel-path information data @@ -634,8 +622,7 @@ int sclp_chp_read_info(struct sclp_chp_info *info) if (!SCLP_HAS_CHP_INFO) return -EOPNOTSUPP; - /* Prepare sccb. */ - sccb = (struct chp_info_sccb *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + sccb = (struct chp_info_sccb *)get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; sccb->header.length = sizeof(*sccb); @@ -652,6 +639,6 @@ int sclp_chp_read_info(struct sclp_chp_info *info) memcpy(info->standby, sccb->standby, SCLP_CHP_INFO_MASK_SIZE); memcpy(info->configured, sccb->configured, SCLP_CHP_INFO_MASK_SIZE); out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } From f9de6cdf4cf8c932ee94f6e25cd7434a97c78bf3 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Fri, 8 Aug 2025 14:46:44 +0200 Subject: [PATCH 0327/3206] s390/sclp: Move memory hotplug code for better modularity To improve readability and prepare for future extensions, move the memory hotplug code to a separate file. 
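The move also shifts the CONFIG_MEMORY_HOTPLUG gating from an #ifdef
block inside sclp_cmd.c to the Makefile, via the usual kbuild idiom:

	obj-$(CONFIG_MEMORY_HOTPLUG) += sclp_mem.o

which expands to obj-y when the option is enabled and is ignored
otherwise, so the new file is only built on hotplug-capable
configurations.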
In addition, add the required headers to sclp_mem.c and remove
unnecessary headers from sclp_cmd.c.

Acked-by: Heiko Carstens
Signed-off-by: Sumanth Korikkar
Signed-off-by: Alexander Gordeev
---
 drivers/s390/char/Makefile   |   1 +
 drivers/s390/char/sclp_cmd.c | 385 ---------------------------------
 drivers/s390/char/sclp_mem.c | 399 +++++++++++++++++++++++++++++++++++
 3 files changed, 400 insertions(+), 385 deletions(-)
 create mode 100644 drivers/s390/char/sclp_mem.c

diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile
index 81d6744e1861fd..dcbd51152ee3f8 100644
--- a/drivers/s390/char/Makefile
+++ b/drivers/s390/char/Makefile
@@ -21,6 +21,7 @@ obj-y += ctrlchar.o keyboard.o defkeymap.o sclp.o sclp_rw.o sclp_quiesce.o \
	 sclp_cmd.o sclp_config.o sclp_cpi_sys.o sclp_ocf.o sclp_ctl.o \
	 sclp_early.o sclp_early_core.o sclp_sd.o
 
+obj-$(CONFIG_MEMORY_HOTPLUG) += sclp_mem.o
 obj-$(CONFIG_TN3270) += raw3270.o con3270.o
 obj-$(CONFIG_TN3270_FS) += fs3270.o
 
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index ee0884d99d8849..3480198eac0255 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -9,27 +9,17 @@
 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 
 #include
-#include
 #include
 #include
 #include
-#include
-#include
-#include
-#include
 #include
 #include
 #include
 #include
-#include
-#include
-#include
 #include
 
 #include "sclp.h"
 
-#define SCLP_CMDW_ASSIGN_STORAGE	0x000d0001
-#define SCLP_CMDW_UNASSIGN_STORAGE	0x000c0001
 /* CPU configuration related functions */
 #define SCLP_CMDW_CONFIGURE_CPU		0x00110001
 #define SCLP_CMDW_DECONFIGURE_CPU	0x00100001
@@ -177,381 +167,6 @@ int sclp_core_deconfigure(u8 core)
 	return do_core_configure(SCLP_CMDW_DECONFIGURE_CPU | core << 8);
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-
-static DEFINE_MUTEX(sclp_mem_mutex);
-static LIST_HEAD(sclp_mem_list);
-static u8 sclp_max_storage_id;
-static DECLARE_BITMAP(sclp_storage_ids, 256);
-
-struct memory_increment {
-	struct list_head list;
-	u16 rn;
-	int standby;
-};
-
-struct assign_storage_sccb {
-	struct sccb_header header;
-	u16 rn;
-} __packed;
-
-struct attach_storage_sccb {
-	struct sccb_header header;
-	u16 :16;
-	u16 assigned;
-	u32 :32;
-	u32 entries[];
-} __packed;
-
-int arch_get_memory_phys_device(unsigned long start_pfn)
-{
-	if (!sclp.rzm)
-		return 0;
-	return PFN_PHYS(start_pfn) >> ilog2(sclp.rzm);
-}
-
-static unsigned long rn2addr(u16 rn)
-{
-	return (unsigned long)(rn - 1) * sclp.rzm;
-}
-
-static int do_assign_storage(sclp_cmdw_t cmd, u16 rn)
-{
-	struct assign_storage_sccb *sccb;
-	int rc;
-
-	sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
-	if (!sccb)
-		return -ENOMEM;
-	sccb->header.length = PAGE_SIZE;
-	sccb->rn = rn;
-	rc = sclp_sync_request_timeout(cmd, sccb, SCLP_QUEUE_INTERVAL);
-	if (rc)
-		goto out;
-	switch (sccb->header.response_code) {
-	case 0x0020:
-	case 0x0120:
-		break;
-	default:
-		pr_warn("assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n",
-			cmd, sccb->header.response_code, rn);
-		rc = -EIO;
-		break;
-	}
-out:
-	free_page((unsigned long)sccb);
-	return rc;
-}
-
-static int sclp_assign_storage(u16 rn)
-{
-	unsigned long start;
-	int rc;
-
-	rc = do_assign_storage(SCLP_CMDW_ASSIGN_STORAGE, rn);
-	if (rc)
-		return rc;
-	start = rn2addr(rn);
-	storage_key_init_range(start, start + sclp.rzm);
-	return 0;
-}
-
-static int sclp_unassign_storage(u16 rn)
-{
-	return do_assign_storage(SCLP_CMDW_UNASSIGN_STORAGE, rn);
-}
-
-static int sclp_attach_storage(u8 id)
-{
-	struct attach_storage_sccb *sccb;
-	int rc, i;
-
-	sccb = (void 
*)get_zeroed_page(GFP_KERNEL | GFP_DMA); - if (!sccb) - return -ENOMEM; - sccb->header.length = PAGE_SIZE; - sccb->header.function_code = 0x40; - rc = sclp_sync_request_timeout(0x00080001 | id << 8, sccb, - SCLP_QUEUE_INTERVAL); - if (rc) - goto out; - switch (sccb->header.response_code) { - case 0x0020: - set_bit(id, sclp_storage_ids); - for (i = 0; i < sccb->assigned; i++) { - if (sccb->entries[i]) - sclp_unassign_storage(sccb->entries[i] >> 16); - } - break; - default: - rc = -EIO; - break; - } -out: - free_page((unsigned long)sccb); - return rc; -} - -static int sclp_mem_change_state(unsigned long start, unsigned long size, - int online) -{ - struct memory_increment *incr; - unsigned long istart; - int rc = 0; - - list_for_each_entry(incr, &sclp_mem_list, list) { - istart = rn2addr(incr->rn); - if (start + size - 1 < istart) - break; - if (start > istart + sclp.rzm - 1) - continue; - if (online) - rc |= sclp_assign_storage(incr->rn); - else - sclp_unassign_storage(incr->rn); - if (rc == 0) - incr->standby = online ? 0 : 1; - } - return rc ? -EIO : 0; -} - -static bool contains_standby_increment(unsigned long start, unsigned long end) -{ - struct memory_increment *incr; - unsigned long istart; - - list_for_each_entry(incr, &sclp_mem_list, list) { - istart = rn2addr(incr->rn); - if (end - 1 < istart) - continue; - if (start > istart + sclp.rzm - 1) - continue; - if (incr->standby) - return true; - } - return false; -} - -static int sclp_mem_notifier(struct notifier_block *nb, - unsigned long action, void *data) -{ - unsigned long start, size; - struct memory_notify *arg; - unsigned char id; - int rc = 0; - - arg = data; - start = arg->start_pfn << PAGE_SHIFT; - size = arg->nr_pages << PAGE_SHIFT; - mutex_lock(&sclp_mem_mutex); - for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1) - sclp_attach_storage(id); - switch (action) { - case MEM_GOING_OFFLINE: - /* - * Do not allow to set memory blocks offline that contain - * standby memory. This is done to simplify the "memory online" - * case. - */ - if (contains_standby_increment(start, start + size)) - rc = -EPERM; - break; - case MEM_PREPARE_ONLINE: - /* - * Access the altmap_start_pfn and altmap_nr_pages fields - * within the struct memory_notify specifically when dealing - * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. - * - * When altmap is in use, take the specified memory range - * online, which includes the altmap. - */ - if (arg->altmap_nr_pages) { - start = PFN_PHYS(arg->altmap_start_pfn); - size += PFN_PHYS(arg->altmap_nr_pages); - } - rc = sclp_mem_change_state(start, size, 1); - if (rc || !arg->altmap_nr_pages) - break; - /* - * Set CMMA state to nodat here, since the struct page memory - * at the beginning of the memory block will not go through the - * buddy allocator later. - */ - __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); - break; - case MEM_FINISH_OFFLINE: - /* - * When altmap is in use, take the specified memory range - * offline, which includes the altmap. - */ - if (arg->altmap_nr_pages) { - start = PFN_PHYS(arg->altmap_start_pfn); - size += PFN_PHYS(arg->altmap_nr_pages); - } - sclp_mem_change_state(start, size, 0); - break; - default: - break; - } - mutex_unlock(&sclp_mem_mutex); - return rc ? 
NOTIFY_BAD : NOTIFY_OK; -} - -static struct notifier_block sclp_mem_nb = { - .notifier_call = sclp_mem_notifier, -}; - -static void __init align_to_block_size(unsigned long *start, - unsigned long *size, - unsigned long alignment) -{ - unsigned long start_align, size_align; - - start_align = roundup(*start, alignment); - size_align = rounddown(*start + *size, alignment) - start_align; - - pr_info("Standby memory at 0x%lx (%luM of %luM usable)\n", - *start, size_align >> 20, *size >> 20); - *start = start_align; - *size = size_align; -} - -static void __init add_memory_merged(u16 rn) -{ - unsigned long start, size, addr, block_size; - static u16 first_rn, num; - - if (rn && first_rn && (first_rn + num == rn)) { - num++; - return; - } - if (!first_rn) - goto skip_add; - start = rn2addr(first_rn); - size = (unsigned long)num * sclp.rzm; - if (start >= ident_map_size) - goto skip_add; - if (start + size > ident_map_size) - size = ident_map_size - start; - block_size = memory_block_size_bytes(); - align_to_block_size(&start, &size, block_size); - if (!size) - goto skip_add; - for (addr = start; addr < start + size; addr += block_size) { - add_memory(0, addr, block_size, - cpu_has_edat1() ? - MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); - } -skip_add: - first_rn = rn; - num = 1; -} - -static void __init sclp_add_standby_memory(void) -{ - struct memory_increment *incr; - - list_for_each_entry(incr, &sclp_mem_list, list) { - if (incr->standby) - add_memory_merged(incr->rn); - } - add_memory_merged(0); -} - -static void __init insert_increment(u16 rn, int standby, int assigned) -{ - struct memory_increment *incr, *new_incr; - struct list_head *prev; - u16 last_rn; - - new_incr = kzalloc(sizeof(*new_incr), GFP_KERNEL); - if (!new_incr) - return; - new_incr->rn = rn; - new_incr->standby = standby; - last_rn = 0; - prev = &sclp_mem_list; - list_for_each_entry(incr, &sclp_mem_list, list) { - if (assigned && incr->rn > rn) - break; - if (!assigned && incr->rn - last_rn > 1) - break; - last_rn = incr->rn; - prev = &incr->list; - } - if (!assigned) - new_incr->rn = last_rn + 1; - if (new_incr->rn > sclp.rnmax) { - kfree(new_incr); - return; - } - list_add(&new_incr->list, prev); -} - -static int __init sclp_detect_standby_memory(void) -{ - struct read_storage_sccb *sccb; - int i, id, assigned, rc; - - /* No standby memory in kdump mode */ - if (oldmem_data.start) - return 0; - if ((sclp.facilities & 0xe00000000000UL) != 0xe00000000000UL) - return 0; - rc = -ENOMEM; - sccb = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); - if (!sccb) - goto out; - assigned = 0; - for (id = 0; id <= sclp_max_storage_id; id++) { - memset(sccb, 0, PAGE_SIZE); - sccb->header.length = PAGE_SIZE; - rc = sclp_sync_request(SCLP_CMDW_READ_STORAGE_INFO | id << 8, sccb); - if (rc) - goto out; - switch (sccb->header.response_code) { - case 0x0010: - set_bit(id, sclp_storage_ids); - for (i = 0; i < sccb->assigned; i++) { - if (!sccb->entries[i]) - continue; - assigned++; - insert_increment(sccb->entries[i] >> 16, 0, 1); - } - break; - case 0x0310: - break; - case 0x0410: - for (i = 0; i < sccb->assigned; i++) { - if (!sccb->entries[i]) - continue; - assigned++; - insert_increment(sccb->entries[i] >> 16, 1, 1); - } - break; - default: - rc = -EIO; - break; - } - if (!rc) - sclp_max_storage_id = sccb->max_id; - } - if (rc || list_empty(&sclp_mem_list)) - goto out; - for (i = 1; i <= sclp.rnmax - assigned; i++) - insert_increment(0, 1, 0); - rc = register_memory_notifier(&sclp_mem_nb); - if (rc) - goto out; - 
sclp_add_standby_memory(); -out: - free_page((unsigned long)sccb); - return rc; -} -__initcall(sclp_detect_standby_memory); - -#endif /* CONFIG_MEMORY_HOTPLUG */ - static int do_chp_configure(sclp_cmdw_t cmd) { struct chp_cfg_sccb *sccb; diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c new file mode 100644 index 00000000000000..27f49f5fd35849 --- /dev/null +++ b/drivers/s390/char/sclp_mem.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory hotplug support via sclp + * + * Copyright IBM Corp. 2025 + */ + +#define KMSG_COMPONENT "sclp_mem" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sclp.h" + +#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 +#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 + +static DEFINE_MUTEX(sclp_mem_mutex); +static LIST_HEAD(sclp_mem_list); +static u8 sclp_max_storage_id; +static DECLARE_BITMAP(sclp_storage_ids, 256); + +struct memory_increment { + struct list_head list; + u16 rn; + int standby; +}; + +struct assign_storage_sccb { + struct sccb_header header; + u16 rn; +} __packed; + +struct attach_storage_sccb { + struct sccb_header header; + u16 :16; + u16 assigned; + u32 :32; + u32 entries[]; +} __packed; + +int arch_get_memory_phys_device(unsigned long start_pfn) +{ + if (!sclp.rzm) + return 0; + return PFN_PHYS(start_pfn) >> ilog2(sclp.rzm); +} + +static unsigned long rn2addr(u16 rn) +{ + return (unsigned long)(rn - 1) * sclp.rzm; +} + +static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) +{ + struct assign_storage_sccb *sccb; + int rc; + + sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!sccb) + return -ENOMEM; + sccb->header.length = PAGE_SIZE; + sccb->rn = rn; + rc = sclp_sync_request_timeout(cmd, sccb, SCLP_QUEUE_INTERVAL); + if (rc) + goto out; + switch (sccb->header.response_code) { + case 0x0020: + case 0x0120: + break; + default: + pr_warn("assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n", + cmd, sccb->header.response_code, rn); + rc = -EIO; + break; + } +out: + free_page((unsigned long)sccb); + return rc; +} + +static int sclp_assign_storage(u16 rn) +{ + unsigned long start; + int rc; + + rc = do_assign_storage(SCLP_CMDW_ASSIGN_STORAGE, rn); + if (rc) + return rc; + start = rn2addr(rn); + storage_key_init_range(start, start + sclp.rzm); + return 0; +} + +static int sclp_unassign_storage(u16 rn) +{ + return do_assign_storage(SCLP_CMDW_UNASSIGN_STORAGE, rn); +} + +static int sclp_attach_storage(u8 id) +{ + struct attach_storage_sccb *sccb; + int rc, i; + + sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!sccb) + return -ENOMEM; + sccb->header.length = PAGE_SIZE; + sccb->header.function_code = 0x40; + rc = sclp_sync_request_timeout(0x00080001 | id << 8, sccb, + SCLP_QUEUE_INTERVAL); + if (rc) + goto out; + switch (sccb->header.response_code) { + case 0x0020: + set_bit(id, sclp_storage_ids); + for (i = 0; i < sccb->assigned; i++) { + if (sccb->entries[i]) + sclp_unassign_storage(sccb->entries[i] >> 16); + } + break; + default: + rc = -EIO; + break; + } +out: + free_page((unsigned long)sccb); + return rc; +} + +static int sclp_mem_change_state(unsigned long start, unsigned long size, + int online) +{ + struct memory_increment *incr; + unsigned long istart; + int rc = 0; + + list_for_each_entry(incr, &sclp_mem_list, list) { + istart = rn2addr(incr->rn); + if (start + size - 1 < istart) + break; + if (start > istart + sclp.rzm - 1) + 
continue; + if (online) + rc |= sclp_assign_storage(incr->rn); + else + sclp_unassign_storage(incr->rn); + if (rc == 0) + incr->standby = online ? 0 : 1; + } + return rc ? -EIO : 0; +} + +static bool contains_standby_increment(unsigned long start, unsigned long end) +{ + struct memory_increment *incr; + unsigned long istart; + + list_for_each_entry(incr, &sclp_mem_list, list) { + istart = rn2addr(incr->rn); + if (end - 1 < istart) + continue; + if (start > istart + sclp.rzm - 1) + continue; + if (incr->standby) + return true; + } + return false; +} + +static int sclp_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + unsigned long start, size; + struct memory_notify *arg; + unsigned char id; + int rc = 0; + + arg = data; + start = arg->start_pfn << PAGE_SHIFT; + size = arg->nr_pages << PAGE_SHIFT; + mutex_lock(&sclp_mem_mutex); + for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1) + sclp_attach_storage(id); + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Do not allow to set memory blocks offline that contain + * standby memory. This is done to simplify the "memory online" + * case. + */ + if (contains_standby_increment(start, start + size)) + rc = -EPERM; + break; + case MEM_PREPARE_ONLINE: + /* + * Access the altmap_start_pfn and altmap_nr_pages fields + * within the struct memory_notify specifically when dealing + * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + * + * When altmap is in use, take the specified memory range + * online, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } + rc = sclp_mem_change_state(start, size, 1); + if (rc || !arg->altmap_nr_pages) + break; + /* + * Set CMMA state to nodat here, since the struct page memory + * at the beginning of the memory block will not go through the + * buddy allocator later. + */ + __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); + break; + case MEM_FINISH_OFFLINE: + /* + * When altmap is in use, take the specified memory range + * offline, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } + sclp_mem_change_state(start, size, 0); + break; + default: + break; + } + mutex_unlock(&sclp_mem_mutex); + return rc ? 
NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block sclp_mem_nb = { + .notifier_call = sclp_mem_notifier, +}; + +static void __init align_to_block_size(unsigned long *start, + unsigned long *size, + unsigned long alignment) +{ + unsigned long start_align, size_align; + + start_align = roundup(*start, alignment); + size_align = rounddown(*start + *size, alignment) - start_align; + + pr_info("Standby memory at 0x%lx (%luM of %luM usable)\n", + *start, size_align >> 20, *size >> 20); + *start = start_align; + *size = size_align; +} + +static void __init add_memory_merged(u16 rn) +{ + unsigned long start, size, addr, block_size; + static u16 first_rn, num; + + if (rn && first_rn && (first_rn + num == rn)) { + num++; + return; + } + if (!first_rn) + goto skip_add; + start = rn2addr(first_rn); + size = (unsigned long)num * sclp.rzm; + if (start >= ident_map_size) + goto skip_add; + if (start + size > ident_map_size) + size = ident_map_size - start; + block_size = memory_block_size_bytes(); + align_to_block_size(&start, &size, block_size); + if (!size) + goto skip_add; + for (addr = start; addr < start + size; addr += block_size) { + add_memory(0, addr, block_size, + cpu_has_edat1() ? + MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); + } +skip_add: + first_rn = rn; + num = 1; +} + +static void __init sclp_add_standby_memory(void) +{ + struct memory_increment *incr; + + list_for_each_entry(incr, &sclp_mem_list, list) { + if (incr->standby) + add_memory_merged(incr->rn); + } + add_memory_merged(0); +} + +static void __init insert_increment(u16 rn, int standby, int assigned) +{ + struct memory_increment *incr, *new_incr; + struct list_head *prev; + u16 last_rn; + + new_incr = kzalloc(sizeof(*new_incr), GFP_KERNEL); + if (!new_incr) + return; + new_incr->rn = rn; + new_incr->standby = standby; + last_rn = 0; + prev = &sclp_mem_list; + list_for_each_entry(incr, &sclp_mem_list, list) { + if (assigned && incr->rn > rn) + break; + if (!assigned && incr->rn - last_rn > 1) + break; + last_rn = incr->rn; + prev = &incr->list; + } + if (!assigned) + new_incr->rn = last_rn + 1; + if (new_incr->rn > sclp.rnmax) { + kfree(new_incr); + return; + } + list_add(&new_incr->list, prev); +} + +static int __init sclp_detect_standby_memory(void) +{ + struct read_storage_sccb *sccb; + int i, id, assigned, rc; + + /* No standby memory in kdump mode */ + if (oldmem_data.start) + return 0; + if ((sclp.facilities & 0xe00000000000UL) != 0xe00000000000UL) + return 0; + rc = -ENOMEM; + sccb = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!sccb) + goto out; + assigned = 0; + for (id = 0; id <= sclp_max_storage_id; id++) { + memset(sccb, 0, PAGE_SIZE); + sccb->header.length = PAGE_SIZE; + rc = sclp_sync_request(SCLP_CMDW_READ_STORAGE_INFO | id << 8, sccb); + if (rc) + goto out; + switch (sccb->header.response_code) { + case 0x0010: + set_bit(id, sclp_storage_ids); + for (i = 0; i < sccb->assigned; i++) { + if (!sccb->entries[i]) + continue; + assigned++; + insert_increment(sccb->entries[i] >> 16, 0, 1); + } + break; + case 0x0310: + break; + case 0x0410: + for (i = 0; i < sccb->assigned; i++) { + if (!sccb->entries[i]) + continue; + assigned++; + insert_increment(sccb->entries[i] >> 16, 1, 1); + } + break; + default: + rc = -EIO; + break; + } + if (!rc) + sclp_max_storage_id = sccb->max_id; + } + if (rc || list_empty(&sclp_mem_list)) + goto out; + for (i = 1; i <= sclp.rnmax - assigned; i++) + insert_increment(0, 1, 0); + rc = register_memory_notifier(&sclp_mem_nb); + if (rc) + goto out; + 
sclp_add_standby_memory(); +out: + free_page((unsigned long)sccb); + return rc; +} +__initcall(sclp_detect_standby_memory); From de88e74889a30bd9ff4047726021cde857348b4b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 4 Aug 2025 17:08:54 +0200 Subject: [PATCH 0328/3206] s390/bitops: Slightly optimize ffs() and fls64() Use a simpler algorithm to calculate the result of ffs() and fls64(). This generates slightly better code and increases readability. Kernel image size is reduced by ~3kb (gcc 15.1.0 + defconfig). Suggested-by: Nina Schoetterl-Glausch Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index a5ca0a9476916c..10d7573d1582c4 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -193,10 +193,9 @@ static inline unsigned long __ffs(unsigned long word) */ static inline int ffs(int word) { - unsigned long mask = 2 * BITS_PER_LONG - 1; unsigned int val = (unsigned int)word; - return (1 + (__flogr(-val & val) ^ (BITS_PER_LONG - 1))) & mask; + return BITS_PER_LONG - __flogr(-val & val); } /** @@ -223,9 +222,7 @@ static inline unsigned long __fls(unsigned long word) */ static inline int fls64(unsigned long word) { - unsigned long mask = 2 * BITS_PER_LONG - 1; - - return (1 + (__flogr(word) ^ (BITS_PER_LONG - 1))) & mask; + return BITS_PER_LONG - __flogr(word); } /** From 669bc57e7016cf9d1a9eedb2a984c4fb4fd67f3d Mon Sep 17 00:00:00 2001 From: Juergen Christ Date: Mon, 11 Aug 2025 17:22:53 +0200 Subject: [PATCH 0329/3206] s390/bitops: Optimize inlining GCC inlining heuristics prevent code growth due to inlining into cold paths. This causes GCC to emit a partially specialized version of __flogr for non-constant input for all occurrences on cold paths. This happens since the overhead seen during inlining includes setting up a union register_pair, calling flogr, and extracting and casting the result. This overhead is not removed until the function is lowered into RTL. But this happens after inlining. For -ftrivial-var-auto-init=zero builds, an additional initialization of the union register_pair adds another statement to be inlined. This is unneeded since the even register is initialized anyway and the odd register is not an input register. It is only marked as such since the whole pair has to be marked as a read/write output register. Mark the union register_pair as uninitialized to get rid of this statement. This, however, does not change the code since the initialization happens when part of the register pair is written. Nevertheless, GCC function size approximation during inlining is reduced by one statement. Force inlining of flogr and also flatten some other functions that should be leaf functions but are called in cold context, e.g. __init functions. Acked-by: Heiko Carstens Signed-off-by: Juergen Christ Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 10d7573d1582c4..9dfb687ba62009 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -130,7 +130,7 @@ static inline bool test_bit_inv(unsigned long nr, * where the most significant bit has bit number 0. * If no bit is set this function returns 64. 
*/ -static inline unsigned char __flogr(unsigned long word) +static __always_inline unsigned char __flogr(unsigned long word) { if (__builtin_constant_p(word)) { unsigned long bit = 0; @@ -163,7 +163,7 @@ static inline unsigned char __flogr(unsigned long word) } return bit; } else { - union register_pair rp; + union register_pair rp __uninitialized; rp.even = word; asm volatile( @@ -179,7 +179,7 @@ static inline unsigned char __flogr(unsigned long word) * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline __flatten unsigned long __ffs(unsigned long word) { return __flogr(-word & word) ^ (BITS_PER_LONG - 1); } @@ -191,7 +191,7 @@ static inline unsigned long __ffs(unsigned long word) * This is defined the same way as the libc and * compiler builtin ffs routines (man ffs). */ -static inline int ffs(int word) +static __always_inline __flatten int ffs(int word) { unsigned int val = (unsigned int)word; @@ -204,7 +204,7 @@ static inline int ffs(int word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static inline unsigned long __fls(unsigned long word) +static __always_inline __flatten unsigned long __fls(unsigned long word) { return __flogr(word) ^ (BITS_PER_LONG - 1); } @@ -220,7 +220,7 @@ static inline unsigned long __fls(unsigned long word) * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ -static inline int fls64(unsigned long word) +static __always_inline __flatten int fls64(unsigned long word) { return BITS_PER_LONG - __flogr(word); } @@ -232,7 +232,7 @@ static inline int fls64(unsigned long word) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(unsigned int word) +static __always_inline __flatten int fls(unsigned int word) { return fls64(word); } From 776cc2ec155cfc066087436a6afc86fe52bd5365 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 23 Jul 2025 08:26:31 +0200 Subject: [PATCH 0330/3206] EDAC/altera: Use dev_fwnode() irq_domain_create_linear() takes fwnode as the first argument. It can be extracted from the struct device using the dev_fwnode() helper instead of using of_node with of_fwnode_handle(). So use the dev_fwnode() helper. 
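For illustration, a minimal sketch of the two lookups (not part of the patch; pdev stands for the platform device seen in probe):

	#include <linux/of.h>
	#include <linux/property.h>

	/* OF-only: wraps the device's of_node in a fwnode handle */
	struct fwnode_handle *fw_of = of_fwnode_handle(pdev->dev.of_node);

	/* Generic: resolves OF, ACPI and software nodes alike */
	struct fwnode_handle *fw = dev_fwnode(&pdev->dev);

For an OF-backed device both return the same handle; dev_fwnode() simply avoids reaching into of_node by hand.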
Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Acked-by: Dinh Nguyen Link: https://lore.kernel.org/20250723062631.1830757-1-jirislaby@kernel.org --- drivers/edac/altera_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index cae52c654a15c6..cfd17a8e5865d2 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -2131,8 +2131,8 @@ static int altr_edac_a10_probe(struct platform_device *pdev) edac->irq_chip.name = pdev->dev.of_node->name; edac->irq_chip.irq_mask = a10_eccmgr_irq_mask; edac->irq_chip.irq_unmask = a10_eccmgr_irq_unmask; - edac->domain = irq_domain_create_linear(of_fwnode_handle(pdev->dev.of_node), - 64, &a10_eccmgr_ic_ops, edac); + edac->domain = irq_domain_create_linear(dev_fwnode(&pdev->dev), 64, &a10_eccmgr_ic_ops, + edac); if (!edac->domain) { dev_err(&pdev->dev, "Error adding IRQ domain\n"); return -ENOMEM; From dd7ddd082206f675e967c205df2a35b9c4e64949 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 19 Aug 2025 10:28:49 -0700 Subject: [PATCH 0331/3206] mmc: sdhci-cadence: Fix -Wuninitialized in sdhci_cdns_tune_blkgap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang warns (or errors with CONFIG_WERROR=y): drivers/mmc/host/sdhci-cadence.c:297:9: error: variable 'hrs37_mode' is uninitialized when used here [-Werror,-Wuninitialized] 297 | writel(hrs37_mode, hrs37_reg); | ^~~~~~~~~~ drivers/mmc/host/sdhci-cadence.c:291:16: note: initialize the variable 'hrs37_mode' to silence this warning 291 | u32 hrs37_mode; | ^ | = 0 A previous revision assigned SDHCI_CDNS_HRS37_MODE_MMC_HS200 to hrs37_mode in a switch statement but the final revision moved to a simple if statement. Pass that as the value to writel() and remove hrs37_mode, clearing up the warning. Fixes: 60613a8b9b81 ("mmc: sdhci-cadence: implement multi-block read gap tuning") Signed-off-by: Nathan Chancellor Tested-by: Benoît Monin Link: https://lore.kernel.org/r/20250819-mmc-sdhci-cadence-fix-uninit-hrs37_mode-v1-1-94aa2d0c438a@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-cadence.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci-cadence.c b/drivers/mmc/host/sdhci-cadence.c index a2a4a5b0ab9648..eaa88897a256c0 100644 --- a/drivers/mmc/host/sdhci-cadence.c +++ b/drivers/mmc/host/sdhci-cadence.c @@ -288,13 +288,12 @@ static int sdhci_cdns_tune_blkgap(struct mmc_host *mmc) void __iomem *hrs38_reg = priv->hrs_addr + SDHCI_CDNS_HRS38; int ret; u32 gap; - u32 hrs37_mode; /* Currently only needed in HS200 mode */ if (host->timing != MMC_TIMING_MMC_HS200) return 0; - writel(hrs37_mode, hrs37_reg); + writel(SDHCI_CDNS_HRS37_MODE_MMC_HS200, hrs37_reg); for (gap = 0; gap <= SDHCI_CDNS_HRS38_BLKGAP_MAX; gap++) { writel(gap, hrs38_reg); From 5d0702dc9c2f6700140b7366dd9ec6c78cde3aa1 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 20 Aug 2025 11:48:01 +0100 Subject: [PATCH 0332/3206] mmc: renesas_sdhi: Replace magic number '0xff' in renesas_sdhi_set_clock() Replace the magic number '0xff' with CLK_CTL_DIV_MASK macro for finding actual clock in renesas_sdhi_set_clock(). 
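Reading the divider math in the hunk below (illustrative field values, not from the patch): a divider field that is not all-ones divides the clock by 1 << (ffs(field) + 1), while the all-ones value means the division is skipped:

	clock = clk & CLK_CTL_DIV_MASK;
	/* clock == 0x01: ffs() == 1 -> actual_clock /= (1 << 2), i.e. /4 */
	/* clock == 0x80: ffs() == 8 -> actual_clock /= (1 << 9), i.e. /512 */
	/* clock == CLK_CTL_DIV_MASK (0xff): no division is applied */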
Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250820104808.94562-1-biju.das.jz@bp.renesas.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index a41291a28e9bdc..f56fa2cd208dd9 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -223,7 +223,7 @@ static void renesas_sdhi_set_clock(struct tmio_mmc_host *host, } clock = clk & CLK_CTL_DIV_MASK; - if (clock != 0xff) + if (clock != CLK_CTL_DIV_MASK) host->mmc->actual_clock /= (1 << (ffs(clock) + 1)); sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, clock); From 06ea48beece831a8447758e5432ad1cd60765363 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 19 Aug 2025 15:17:53 +0200 Subject: [PATCH 0333/3206] ARM: dts: allwinner: Minor whitespace cleanup The DTS coding style expects exactly one space around '=' or '{' characters. Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250819131752.86948-2-krzysztof.kozlowski@linaro.org Signed-off-by: Chen-Yu Tsai --- arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts | 2 +- arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi | 2 +- arch/arm/boot/dts/allwinner/sun8i-r40.dtsi | 2 +- arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts b/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts index 83d283cf66334f..d425d9ee83db0e 100644 --- a/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts +++ b/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts @@ -218,7 +218,7 @@ &usbphy { usb0_id_det-gpios = <&pio 7 4 (GPIO_ACTIVE_HIGH | GPIO_PULL_UP)>; /* PH4 */ usb0_vbus_det-gpios = <&pio 7 5 (GPIO_ACTIVE_HIGH | GPIO_PULL_UP)>; /* PH5 */ - usb0_vbus-supply = <&reg_usb0_vbus>; + usb0_vbus-supply = <&reg_usb0_vbus>; usb1_vbus-supply = <&reg_usb1_vbus>; usb2_vbus-supply = <&reg_usb2_vbus>; status = "okay"; diff --git a/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi b/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi index 272584881bb214..a0f787581dd902 100644 --- a/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi +++ b/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi @@ -82,7 +82,7 @@ }; &ehci0 { - status = "okay"; + status = "okay"; }; &mmc1 { diff --git a/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi b/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi index fa162f7fa9f011..f0ed802a9d08e6 100644 --- a/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi +++ b/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi @@ -705,7 +705,7 @@ }; /omit-if-no-ref/ - uart2_rts_cts_pi_pins: uart2-rts-cts-pi-pins{ + uart2_rts_cts_pi_pins: uart2-rts-cts-pi-pins { pins = "PI16", "PI17"; function = "uart2"; }; diff --git a/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts b/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts index 5143cb4e7b787a..cb6292319f39d7 100644 --- a/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts +++ b/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts @@ -29,7 +29,7 @@ clk_can0: clock-can0 { compatible = "fixed-clock"; #clock-cells = <0>; - clock-frequency = <40000000>; + clock-frequency = <40000000>; }; gpio-keys { From f41c538881eec4dcf5961a242097d447f848cda6 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Tue, 29 Jul 2025 23:03:12 +0800 Subject: [PATCH 0334/3206] dmaengine: idxd: Remove improper idxd_free 
The call to idxd_free() introduces a duplicate put_device() leading to a reference count underflow: refcount_t: underflow; use-after-free. WARNING: CPU: 15 PID: 4428 at lib/refcount.c:28 refcount_warn_saturate+0xbe/0x110 ... Call Trace: idxd_remove+0xe4/0x120 [idxd] pci_device_remove+0x3f/0xb0 device_release_driver_internal+0x197/0x200 driver_detach+0x48/0x90 bus_remove_driver+0x74/0xf0 pci_unregister_driver+0x2e/0xb0 idxd_exit_module+0x34/0x7a0 [idxd] __do_sys_delete_module.constprop.0+0x183/0x280 do_syscall_64+0x54/0xd70 entry_SYSCALL_64_after_hwframe+0x76/0x7e idxd_unregister_devices(), which is invoked at the very beginning of idxd_remove(), already takes care of the necessary put_device() through the following call path: idxd_unregister_devices() -> device_unregister() -> put_device() In addition, when CONFIG_DEBUG_KOBJECT_RELEASE is enabled, put_device() may trigger asynchronous cleanup via schedule_delayed_work(). If idxd_free() is called immediately after, it can result in a use-after-free. Remove the improper idxd_free() to avoid both the refcount underflow and potential memory corruption during module unload. Fixes: d5449ff1b04d ("dmaengine: idxd: Add missing idxd cleanup to fix memory leak in remove call") Signed-off-by: Yi Sun Tested-by: Shuai Xue Reviewed-by: Dave Jiang Acked-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/20250729150313.1934101-2-yi.sun@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 35bdefd3728bb8..487268fc527b0c 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -1294,7 +1294,6 @@ static void idxd_remove(struct pci_dev *pdev) idxd_cleanup(idxd); pci_iounmap(pdev, idxd->reg_base); put_device(idxd_confdev(idxd)); - idxd_free(idxd); pci_disable_device(pdev); } From b7cb9a034305d52222433fad10c3de10204f29e7 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Tue, 29 Jul 2025 23:03:13 +0800 Subject: [PATCH 0335/3206] dmaengine: idxd: Fix refcount underflow on module unload A recent refactor introduced a misplaced put_device() call, resulting in a reference count underflow during module unload. There is no need to add additional put_device() calls for idxd groups, engines, or workqueues. Although the commit claims: "Note, this also fixes the missing put_device() for idxd groups, engines, and wqs." It appears no such omission actually existed. The required cleanup is already handled by the call chain: idxd_unregister_devices() -> device_unregister() -> put_device() Extend idxd_cleanup() to handle the remaining necessary cleanup and remove idxd_cleanup_internals(), which duplicates deallocation logic for idxd, engines, groups, and workqueues. Memory management is also properly handled through the Linux device model. 
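The ownership rule relied on here, as a generic sketch (conf_dev is a placeholder, not the idxd code):

	/* device_unregister() == device_del() + put_device() */
	device_unregister(conf_dev);	/* drops the reference taken at registration */

	put_device(conf_dev);		/* pairs only with a get you did yourself;
					 * an extra call underflows the refcount */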
Fixes: a409e919ca32 ("dmaengine: idxd: Refactor remove call with idxd_cleanup() helper") Signed-off-by: Yi Sun Tested-by: Shuai Xue Reviewed-by: Dave Jiang Acked-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/20250729150313.1934101-3-yi.sun@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 487268fc527b0c..efe614a528faf3 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -1291,7 +1291,10 @@ static void idxd_remove(struct pci_dev *pdev) device_unregister(idxd_confdev(idxd)); idxd_shutdown(pdev); idxd_device_remove_debugfs(idxd); - idxd_cleanup(idxd); + perfmon_pmu_remove(idxd); + idxd_cleanup_interrupts(idxd); + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); pci_iounmap(pdev, idxd->reg_base); put_device(idxd_confdev(idxd)); pci_disable_device(pdev); From 39aaa337449e71a41d4813be0226a722827ba606 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 11 Aug 2025 13:43:39 +0300 Subject: [PATCH 0336/3206] dmaengine: idxd: Fix double free in idxd_setup_wqs() The clean up in idxd_setup_wqs() has had a couple bugs because the error handling is a bit subtle. It's simpler to just re-write it in a cleaner way. The issues here are: 1) If "idxd->max_wqs" is <= 0 then we call put_device(conf_dev) when "conf_dev" hasn't been initialized. 2) If kzalloc_node() fails then again "conf_dev" is invalid. It's either uninitialized or it points to the "conf_dev" from the previous iteration so it leads to a double free. It's better to free partial loop iterations within the loop and then the unwinding at the end can handle whole loop iterations. I also renamed the labels to describe what the goto does and not where the goto was located. 
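The resulting shape of the error handling, as a generic sketch with hypothetical helpers (alloc_item(), setup_item() and free_item() are placeholders, not the idxd functions):

	static int setup_table(struct item **table, int n)
	{
		struct item *item;
		int i;

		for (i = 0; i < n; i++) {
			item = alloc_item();
			if (!item)
				goto err_unwind;	/* nothing partial to free */
			if (setup_item(item)) {
				free_item(item);	/* partial iteration freed here */
				goto err_unwind;
			}
			table[i] = item;
		}
		return 0;

	err_unwind:
		while (--i >= 0)
			free_item(table[i]);	/* whole iterations only */
		return -ENOMEM;
	}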
Fixes: 3fd2f4bc010c ("dmaengine: idxd: fix memory leak in error handling path of idxd_setup_wqs") Reported-by: Colin Ian King Closes: https://lore.kernel.org/all/20250811095836.1642093-1-colin.i.king@gmail.com/ Signed-off-by: Dan Carpenter Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/aJnJW3iYTDDCj9sk@stanley.mountain Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index efe614a528faf3..8c4725ad1f648d 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -189,27 +189,30 @@ static int idxd_setup_wqs(struct idxd_device *idxd) idxd->wq_enable_map = bitmap_zalloc_node(idxd->max_wqs, GFP_KERNEL, dev_to_node(dev)); if (!idxd->wq_enable_map) { rc = -ENOMEM; - goto err_bitmap; + goto err_free_wqs; } for (i = 0; i < idxd->max_wqs; i++) { wq = kzalloc_node(sizeof(*wq), GFP_KERNEL, dev_to_node(dev)); if (!wq) { rc = -ENOMEM; - goto err; + goto err_unwind; } idxd_dev_set_type(&wq->idxd_dev, IDXD_DEV_WQ); conf_dev = wq_confdev(wq); wq->id = i; wq->idxd = idxd; - device_initialize(wq_confdev(wq)); + device_initialize(conf_dev); conf_dev->parent = idxd_confdev(idxd); conf_dev->bus = &dsa_bus_type; conf_dev->type = &idxd_wq_device_type; rc = dev_set_name(conf_dev, "wq%d.%d", idxd->id, wq->id); - if (rc < 0) - goto err; + if (rc < 0) { + put_device(conf_dev); + kfree(wq); + goto err_unwind; + } mutex_init(&wq->wq_lock); init_waitqueue_head(&wq->err_queue); @@ -220,15 +223,20 @@ static int idxd_setup_wqs(struct idxd_device *idxd) wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); if (!wq->wqcfg) { + put_device(conf_dev); + kfree(wq); rc = -ENOMEM; - goto err; + goto err_unwind; } if (idxd->hw.wq_cap.op_config) { wq->opcap_bmap = bitmap_zalloc(IDXD_MAX_OPCAP_BITS, GFP_KERNEL); if (!wq->opcap_bmap) { + kfree(wq->wqcfg); + put_device(conf_dev); + kfree(wq); rc = -ENOMEM; - goto err_opcap_bmap; + goto err_unwind; } bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS); } @@ -239,13 +247,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) return 0; -err_opcap_bmap: - kfree(wq->wqcfg); - -err: - put_device(conf_dev); - kfree(wq); - +err_unwind: while (--i >= 0) { wq = idxd->wqs[i]; if (idxd->hw.wq_cap.op_config) @@ -254,11 +256,10 @@ static int idxd_setup_wqs(struct idxd_device *idxd) conf_dev = wq_confdev(wq); put_device(conf_dev); kfree(wq); - } bitmap_free(idxd->wq_enable_map); -err_bitmap: +err_free_wqs: kfree(idxd->wqs); return rc; From 78e097fbca719ce3dbca8f81a6c339a8e0e41d1c Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Wed, 20 Aug 2025 17:22:42 +0800 Subject: [PATCH 0337/3206] libbpf: Add documentation to version and error API functions Add documentation for the following API functions: - libbpf_major_version() - libbpf_minor_version() - libbpf_version_string() - libbpf_strerror() Signed-off-by: Cryolitia PukNgae Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250820-libbpf-doc-1-v1-1-13841f25a134@uniontech.com --- tools/lib/bpf/libbpf.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 2b86e21190d37b..2e91148d9b44d1 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -24,8 +24,25 @@ extern "C" { #endif +/** + * @brief **libbpf_major_version()** provides the major version of libbpf. 
+ * @return An integer, the major version number + */ LIBBPF_API __u32 libbpf_major_version(void); + +/** + * @brief **libbpf_minor_version()** provides the minor version of libbpf. + * @return An integer, the minor version number + */ LIBBPF_API __u32 libbpf_minor_version(void); + +/** + * @brief **libbpf_version_string()** provides the version of libbpf in a + * human-readable form, e.g., "v1.7". + * @return Pointer to a static string containing the version + * + * The format is *not* a part of a stable API and may change in the future. + */ LIBBPF_API const char *libbpf_version_string(void); enum libbpf_errno { @@ -49,6 +66,14 @@ enum libbpf_errno { __LIBBPF_ERRNO__END, }; +/** + * @brief **libbpf_strerror()** converts the provided error code into a + * human-readable string. + * @param err The error code to convert + * @param buf Pointer to a buffer where the error message will be stored + * @param size The number of bytes in the buffer + * @return 0, on success; negative error code, otherwise + */ LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); /** From 136d029662cdde77d3e4db5c07de655f35f0239f Mon Sep 17 00:00:00 2001 From: Rakuram Eswaran Date: Wed, 20 Aug 2025 21:56:13 +0530 Subject: [PATCH 0338/3206] Documentation/staging: Fix typo and incorrect citation in crc32.rst In Documentation/staging/crc32.rst, the errors below have been corrected: 1. Line 37: from "to being" to "to bring" 2. Line 119: Incorrect citation date: It must be August 1988 instead of August 1998 Signed-off-by: Rakuram Eswaran Link: https://lore.kernel.org/r/20250820162615.6942-1-rakuram.e96@gmail.com Signed-off-by: Eric Biggers --- Documentation/staging/crc32.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/staging/crc32.rst b/Documentation/staging/crc32.rst index 7542220967cb4c..64f3dd430a6ca7 100644 --- a/Documentation/staging/crc32.rst +++ b/Documentation/staging/crc32.rst @@ -34,7 +34,7 @@ do it in the right order, matching the endianness. Just like with ordinary division, you proceed one digit (bit) at a time. Each step of the division you take one more digit (bit) of the dividend and append it to the current remainder. Then you figure out the -appropriate multiple of the divisor to subtract to being the remainder +appropriate multiple of the divisor to subtract to bring the remainder back into range. In binary, this is easy - it has to be either 0 or 1, and to make the XOR cancel, it's just a copy of bit 32 of the remainder. @@ -116,7 +116,7 @@ for any fractional bytes at the end. To reduce the number of conditional branches, software commonly uses the byte-at-a-time table method, popularized by Dilip V. Sarwate, "Computation of Cyclic Redundancy Checks via Table Look-Up", Comm. ACM -v.31 no.8 (August 1998) p. 1008-1013. +v.31 no.8 (August 1988) p. 1008-1013. Here, rather than just shifting one bit of the remainder to decide in the correct multiple to subtract, we can shift a byte at a time. From 278033a225e13ec21900f0a92b8351658f5377f2 Mon Sep 17 00:00:00 2001 From: Lichen Liu Date: Fri, 15 Aug 2025 20:14:59 +0800 Subject: [PATCH 0339/3206] fs: Add 'initramfs_options' to set initramfs mount options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_TMPFS is enabled, the initial root filesystem is a tmpfs. By default, a tmpfs mount is limited to using 50% of the available RAM for its content. This can be problematic in memory-constrained environments, particularly during a kdump capture. 
In a kdump scenario, the capture kernel boots with a limited amount of memory specified by the 'crashkernel' parameter. If the initramfs is large, it may fail to unpack into the tmpfs rootfs due to insufficient space. This is because to get X MB of usable space in tmpfs, 2*X MB of memory must be available for the mount. This leads to an OOM failure during the early boot process, preventing a successful crash dump. This patch introduces a new kernel command-line parameter, initramfs_options, which allows passing specific mount options directly to the rootfs when it is first mounted. This gives users control over the rootfs behavior. For example, a user can now specify initramfs_options=size=75% to allow the tmpfs to use up to 75% of the available memory. This can significantly reduce the memory pressure for kdump. Consider a practical example: To unpack a 48MB initramfs, the tmpfs needs 48MB of usable space. With the default 50% limit, this requires a memory pool of 96MB to be available for the tmpfs mount. The total memory requirement is therefore approximately: 16MB (vmlinuz) + 48MB (loaded initramfs) + 48MB (unpacked kernel) + 96MB (for tmpfs) + 12MB (runtime overhead) ≈ 220MB. By using initramfs_options=size=75%, the memory pool required for the 48MB tmpfs is reduced to 48MB / 0.75 = 64MB. This reduces the total memory requirement by 32MB (96MB - 64MB), allowing the kdump to succeed with a smaller crashkernel size, such as 192MB. An alternative approach of reusing the existing rootflags parameter was considered. However, a new, dedicated initramfs_options parameter was chosen to avoid altering the current behavior of rootflags (which applies to the final root filesystem) and to prevent any potential regressions. Also add documentation for the new kernel parameter "initramfs_options". This approach is inspired by prior discussions and patches on the topic. Ref: https://www.lightofdawn.org/blog/?viewDetailed=00128 Ref: https://landley.net/notes-2015.html#01-01-2015 Ref: https://lkml.org/lkml/2021/6/29/783 Ref: https://www.kernel.org/doc/html/latest/filesystems/ramfs-rootfs-initramfs.html#what-is-rootfs Signed-off-by: Lichen Liu Link: https://lore.kernel.org/20250815121459.3391223-1-lichliu@redhat.com Tested-by: Rob Landley Signed-off-by: Christian Brauner --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ fs/namespace.c | 11 ++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 747a55abf4946b..10361d7b748949 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6428,6 +6428,9 @@ rootflags= [KNL] Set root filesystem mount option string + initramfs_options= [KNL] + Specify mount options for the initramfs mount. + rootfstype= [KNL] Set root filesystem type rootwait [KNL] Wait (indefinitely) for root device to show up. 
diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d33837..ba3e5215b6362d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -65,6 +65,15 @@ static int __init set_mphash_entries(char *str) } __setup("mphash_entries=", set_mphash_entries); +static char * __initdata initramfs_options; +static int __init initramfs_options_setup(char *str) +{ + initramfs_options = str; + return 1; +} + +__setup("initramfs_options=", initramfs_options_setup); + static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); @@ -6086,7 +6095,7 @@ static void __init init_mount_tree(void) struct mnt_namespace *ns; struct path root; - mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); From 41a86f62424ac436cb51e3de612ef1e1ddb0c873 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Wed, 20 Aug 2025 21:34:24 +0800 Subject: [PATCH 0340/3206] fs: fix indentation style Replace 8 leading spaces with a tab to follow kernel coding style. Signed-off-by: Guopeng Zhang Link: https://lore.kernel.org/20250820133424.1667467-1-zhangguopeng@kylinos.cn Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index ae6d1312b1849c..51f77c65c0c61e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2455,7 +2455,7 @@ struct vfsmount *clone_private_mount(const struct path *path) return ERR_PTR(-EINVAL); } - if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (__has_locked_children(old_mnt, path->dentry)) From 16c07342b5425b86547146a6e51d9e32cee8d300 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 20 Aug 2025 18:33:11 -0500 Subject: [PATCH 0341/3206] gpiolib: acpi: Program debounce when finding GPIO When soc-button-array looks up the GPIO to use, it calls acpi_find_gpio(), which will parse _CRS. acpi_find_gpio.cold (drivers/gpio/gpiolib-acpi-core.c:953) gpiod_find_and_request (drivers/gpio/gpiolib.c:4598 drivers/gpio/gpiolib.c:4625) gpiod_get_index (drivers/gpio/gpiolib.c:4877) The GPIO is basically set up, but the debounce information is discarded. The platform will assert what debounce should be in _CRS, so program it at the time it's available. 
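A quick unit check on the conversion in the hunk below: _CRS expresses the debounce timeout in hundredths of a millisecond (10 us units), while gpio_set_debounce_timeout() takes microseconds, hence the factor of ten. With an illustrative _CRS value of 50:

	/* 50 hundredths of a ms -> 50 * 10 = 500 us = 0.5 ms */
	ret = gpio_set_debounce_timeout(desc, info.debounce * 10);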
Reviewed-by: Mika Westerberg Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Andy Shevchenko --- drivers/gpio/gpiolib-acpi-core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpio/gpiolib-acpi-core.c b/drivers/gpio/gpiolib-acpi-core.c index 12b24a717e43f1..e81a29902bb274 100644 --- a/drivers/gpio/gpiolib-acpi-core.c +++ b/drivers/gpio/gpiolib-acpi-core.c @@ -944,6 +944,7 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, bool can_fallback = acpi_can_fallback_to_crs(adev, con_id); struct acpi_gpio_info info; struct gpio_desc *desc; + int ret; desc = __acpi_find_gpio(fwnode, con_id, idx, can_fallback, &info); if (IS_ERR(desc)) @@ -957,6 +958,12 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, acpi_gpio_update_gpiod_flags(dflags, &info); acpi_gpio_update_gpiod_lookup_flags(lookupflags, &info); + + /* ACPI uses hundredths of milliseconds units */ + ret = gpio_set_debounce_timeout(desc, info.debounce * 10); + if (ret) + return ERR_PTR(ret); + return desc; } From 2092b3b278be03f73a2438a9d14e2ac472eec033 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 19 Aug 2025 22:39:32 +0800 Subject: [PATCH 0342/3206] pinctrl: microchip-sgpio: use kcalloc() instead of kzalloc() Use devm_kcalloc() in microchip_sgpio_register_bank() to gain built-in overflow protection, making memory allocation safer when calculating allocation size compared to explicit multiplication. Signed-off-by: Qianfeng Rong Link: https://lore.kernel.org/20250819143935.372084-2-rongqianfeng@vivo.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-microchip-sgpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-microchip-sgpio.c b/drivers/pinctrl/pinctrl-microchip-sgpio.c index 6191e5c1381531..069cc665a4023a 100644 --- a/drivers/pinctrl/pinctrl-microchip-sgpio.c +++ b/drivers/pinctrl/pinctrl-microchip-sgpio.c @@ -824,7 +824,7 @@ static int microchip_sgpio_register_bank(struct device *dev, pctl_desc->confops = &sgpio_confops; pctl_desc->owner = THIS_MODULE; - pins = devm_kzalloc(dev, sizeof(*pins)*ngpios, GFP_KERNEL); + pins = devm_kcalloc(dev, ngpios, sizeof(*pins), GFP_KERNEL); if (!pins) return -ENOMEM; From ae666486ee3baf97571df837ed8acd9e05d015bf Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 19 Aug 2025 22:39:33 +0800 Subject: [PATCH 0343/3206] pinctrl: pinctrl-zynqmp: use kcalloc() instead of kzalloc() Use devm_kcalloc() in versal_pinctrl_prepare_pin_desc() to gain built-in overflow protection, making memory allocation safer when calculating allocation size compared to explicit multiplication. 
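The difference between the two allocators, sketched against the hunk below: the open-coded multiplication can wrap for a large element count before the allocator ever sees it, while the two-argument form checks the multiplication and fails cleanly:

	/* *npins * sizeof(*pins) may silently overflow */
	pins = devm_kzalloc(dev, sizeof(*pins) * *npins, GFP_KERNEL);

	/* devm_kcalloc() detects the overflow and returns NULL instead */
	pins = devm_kcalloc(dev, *npins, sizeof(*pins), GFP_KERNEL);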
Signed-off-by: Qianfeng Rong Acked-by: Michal Simek Link: https://lore.kernel.org/20250819143935.372084-3-rongqianfeng@vivo.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-zynqmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-zynqmp.c b/drivers/pinctrl/pinctrl-zynqmp.c index fddf0fef4b13b8..585fe1661e170b 100644 --- a/drivers/pinctrl/pinctrl-zynqmp.c +++ b/drivers/pinctrl/pinctrl-zynqmp.c @@ -919,7 +919,7 @@ static int versal_pinctrl_prepare_pin_desc(struct device *dev, if (ret) return ret; - pins = devm_kzalloc(dev, sizeof(*pins) * *npins, GFP_KERNEL); + pins = devm_kcalloc(dev, *npins, sizeof(*pins), GFP_KERNEL); if (!pins) return -ENOMEM; From 42311ea58302d340339369a39eb84c6bfb703750 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 19 Aug 2025 22:39:34 +0800 Subject: [PATCH 0344/3206] pinctrl: qcom: sc8180x: use kcalloc() instead of kzalloc() Use devm_kcalloc() in sc8180x_pinctrl_add_tile_resources() to gain built-in overflow protection, making memory allocation safer when calculating allocation size compared to explicit multiplication. Signed-off-by: Qianfeng Rong Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/20250819143935.372084-4-rongqianfeng@vivo.com Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-sc8180x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/qcom/pinctrl-sc8180x.c b/drivers/pinctrl/qcom/pinctrl-sc8180x.c index 26dd165d154348..ae5134fdebd995 100644 --- a/drivers/pinctrl/qcom/pinctrl-sc8180x.c +++ b/drivers/pinctrl/qcom/pinctrl-sc8180x.c @@ -1634,7 +1634,7 @@ static int sc8180x_pinctrl_add_tile_resources(struct platform_device *pdev) return 0; /* Allocate for new resources */ - nres = devm_kzalloc(&pdev->dev, sizeof(*nres) * nres_num, GFP_KERNEL); + nres = devm_kcalloc(&pdev->dev, nres_num, sizeof(*nres), GFP_KERNEL); if (!nres) return -ENOMEM; From a90d6f4aa0bb1dabae4be04955938c86ad7d3489 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 19 Aug 2025 22:39:35 +0800 Subject: [PATCH 0345/3206] pinctrl: sunxi: use kcalloc() instead of kzalloc() Use devm_kcalloc() in init_pins_table() and prepare_function_table() to gain built-in overflow protection, making memory allocation safer when calculating allocation size compared to explicit multiplication. Signed-off-by: Qianfeng Rong Acked-by: Chen-Yu Tsai Link: https://lore.kernel.org/20250819143935.372084-5-rongqianfeng@vivo.com Signed-off-by: Linus Walleij --- drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c b/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c index 4e34b0cd3b73aa..5f13315ebff340 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c +++ b/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c @@ -103,7 +103,7 @@ static struct sunxi_desc_pin *init_pins_table(struct device *dev, return ERR_PTR(-EINVAL); } - pins = devm_kzalloc(dev, desc->npins * sizeof(*pins), GFP_KERNEL); + pins = devm_kcalloc(dev, desc->npins, sizeof(*pins), GFP_KERNEL); if (!pins) return ERR_PTR(-ENOMEM); @@ -199,7 +199,7 @@ static int prepare_function_table(struct device *dev, struct device_node *pnode, * Allocate the memory needed for the functions in one table. * We later use pointers into this table to mark each pin. 
*/ - func = devm_kzalloc(dev, num_funcs * sizeof(*func), GFP_KERNEL); + func = devm_kcalloc(dev, num_funcs, sizeof(*func), GFP_KERNEL); if (!func) return -ENOMEM; From be1e0283021ec73c2eb92839db9a471a068709d9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 21 Aug 2025 13:50:47 +0200 Subject: [PATCH 0346/3206] coredump: don't pointlessly check and spew warnings When a write happens, it doesn't make sense to perform checks on the input. Skip them. Whether a fixes tag is licensed is a bit of a gray area here but I'll add one for the socket validation part I added recently. Link: https://lore.kernel.org/20250821-moosbedeckt-denunziant-7908663f3563@brauner Fixes: 16195d2c7dd2 ("coredump: validate socket name as it is written") Reported-by: Brad Spengler Signed-off-by: Christian Brauner --- fs/coredump.c | 4 ++++ fs/exec.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/coredump.c b/fs/coredump.c index 5dce257c67fc8b..60bc9685e14985 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1466,11 +1466,15 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, ssize_t retval; char old_core_pattern[CORENAME_MAX_SIZE]; + if (write) + return proc_dostring(table, write, buffer, lenp, ppos); + retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE); error = proc_dostring(table, write, buffer, lenp, ppos); if (error) return error; + if (!check_coredump_socket()) { strscpy(core_pattern, old_core_pattern, retval + 1); return -EINVAL; diff --git a/fs/exec.c b/fs/exec.c index 2a1e5e4042a149..e861a4b7ffda92 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) + if (!error && !write) validate_coredump_safety(); return error; } From ee97f1fe67c0dff62549c6d8e8a7784431e65d46 Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Tue, 12 Aug 2025 10:46:39 +0200 Subject: [PATCH 0347/3206] dt-bindings: pinctrl: rp1: Describe groups for RP1 pin controller The DT binding for the RP1 pin controller currently lacks the group definitions. Add groups enumeration to the schema. Signed-off-by: Andrea della Porta Reviewed-by: Rob Herring (Arm) Link: https://lore.kernel.org/20250812084639.13442-1-andrea.porta@suse.com Signed-off-by: Linus Walleij --- .../pinctrl/raspberrypi,rp1-gpio.yaml | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pinctrl/raspberrypi,rp1-gpio.yaml b/Documentation/devicetree/bindings/pinctrl/raspberrypi,rp1-gpio.yaml index eec9a9b58542f9..af6fbbd4feeaf6 100644 --- a/Documentation/devicetree/bindings/pinctrl/raspberrypi,rp1-gpio.yaml +++ b/Documentation/devicetree/bindings/pinctrl/raspberrypi,rp1-gpio.yaml @@ -72,10 +72,36 @@ $defs: pins: description: List of gpio pins affected by the properties specified in this - subnode. + subnode (either this or "groups" must be specified). items: pattern: '^gpio([0-9]|[1-4][0-9]|5[0-3])$' + groups: + description: + List of groups affected by the properties specified in this + subnode (either this or "pins" must be specified). 
+ items: + anyOf: + - pattern: '^gpio([0-9]|[1-4][0-9]|5[0-3])$' + - enum: [ uart0, uart0_ctrl, uart1, uart1_ctrl, uart2, uart2_ctrl, + uart3, uart3_ctrl, uart4, uart4_ctrl, uart5_0, + uart5_0_ctrl, uart5_1, uart5_1_ctrl, uart5_2, + uart5_2_ctrl, uart5_3, + sd0, sd1, + i2s0, i2s0_dual, i2s0_quad, i2s1, i2s1_dual, i2s1_quad, + i2s2_0, i2s2_0_dual, i2s2_1, i2s2_1_dual, + i2c4_0, i2c4_1, i2c4_2, i2c4_3, i2c6_0, i2c6_1, i2c5_0, + i2c5_1, i2c5_2, i2c5_3, i2c0_0, i2c0_1, i2c1_0, i2c1_1, + i2c2_0, i2c2_1, i2c3_0, i2c3_1, i2c3_2, + dpi_16bit, dpi_16bit_cpadhi, dpi_16bit_pad666, + dpi_18bit, dpi_18bit_cpadhi, dpi_24bit, + spi0, spi0_quad, spi1, spi2, spi3, spi4, spi5, spi6_0, + spi6_1, spi7_0, spi7_1, spi8_0, spi8_1, + aaud_0, aaud_1, aaud_2, aaud_3, aaud_4, + vbus0_0, vbus0_1, vbus1, vbus2, vbus3, + mic_0, mic_1, mic_2, mic_3, + ir ] + function: enum: [ alt0, alt1, alt2, alt3, alt4, gpio, alt6, alt7, alt8, none, aaud, dcd0, dpi, dsi0_te_ext, dsi1_te_ext, dsr0, dtr0, gpclk0, @@ -103,6 +129,13 @@ $defs: drive-strength: enum: [ 2, 4, 8, 12 ] + required: + - function + + oneOf: + - required: [ groups ] + - required: [ pins ] + additionalProperties: false allOf: From 6c9468aad215a198742c8375b0415e42521c905c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:54 -0700 Subject: [PATCH 0348/3206] fscrypt: replace raw loads of info pointer with helper function Add and use a helper function fscrypt_get_inode_info_raw(). It loads an inode's fscrypt info pointer using a raw dereference, which is appropriate when the caller knows the key setup already happened. This eliminates most occurrences of inode::i_crypt_info in the source, in preparation for replacing that with a filesystem-specific field. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-2-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/crypto/bio.c | 2 +- fs/crypto/crypto.c | 14 ++++++++------ fs/crypto/fname.c | 11 ++++++----- fs/crypto/hooks.c | 2 +- fs/crypto/inline_crypt.c | 12 +++++++----- fs/crypto/policy.c | 7 ++++--- include/linux/fscrypt.h | 16 ++++++++++++++++ 7 files changed, 43 insertions(+), 21 deletions(-) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 486fcb2ecf13eb..0d746de4cd1039 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -113,7 +113,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index b6ccab524fdef8..07f9cbfe3ea411 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -173,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; @@ -232,8 +232,9 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, { if 
(WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_ENCRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); @@ -255,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + @@ -305,8 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_DECRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f9f6713e144f7a..fb77ad1ca74a2c 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -94,7 +94,7 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -138,7 +138,7 @@ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -274,8 +274,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { - return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, - orig_len, max_len, + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); + + return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len, encrypted_len_ret); } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); @@ -543,7 +544,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_inode_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir); WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index e0b32ac841f765..7a5d4c168c49e6 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -199,7 +199,7 @@ int fscrypt_prepare_setflags(struct inode *inode, err = fscrypt_require_key(inode); if (err) return err; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; diff --git a/fs/crypto/inline_crypt.c 
b/fs/crypto/inline_crypt.c index caaff809765b29..5dee7c498bc8c5 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -263,7 +263,7 @@ int fscrypt_derive_sw_secret(struct super_block *sb, bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { - return inode->i_crypt_info->ci_inlinecrypt; + return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt; } EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto); @@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, if (!fscrypt_inode_uses_inline_crypto(inode)) return; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); fscrypt_generate_dun(ci, first_lblk, dun); bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); @@ -385,22 +385,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { const struct bio_crypt_ctx *bc = bio->bi_crypt_context; + const struct fscrypt_inode_info *ci; u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; if (!!bc != fscrypt_inode_uses_inline_crypto(inode)) return false; if (!bc) return true; + ci = fscrypt_get_inode_info_raw(inode); /* * Comparing the key pointers is good enough, as all I/O for each key * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. */ - if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) + if (bc->bc_key != ci->ci_enc_key.blk_key) return false; - fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); + fscrypt_generate_dun(ci, next_lblk, next_dun); return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); @@ -502,7 +504,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) if (nr_blocks <= 1) return nr_blocks; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (!(fscrypt_policy_flags(&ci->ci_policy) & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) return nr_blocks; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6ad30ae07c065c..9d51f3500de376 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -727,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) err = fscrypt_require_key(dir); if (err) return ERR_PTR(err); - return &dir->i_crypt_info->ci_policy; + return &fscrypt_get_inode_info_raw(dir)->ci_policy; } return fscrypt_get_dummy_policy(dir->i_sb); @@ -746,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) */ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); @@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); */ int fscrypt_set_context(struct inode *inode, void *fs_data) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci; union fscrypt_context ctx; int ctxsize; @@ -783,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) * This may be the first time the inode number is available, so do any * delayed key setup that requires the inode number. 
*/ + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) fscrypt_hash_inode_number(ci, ci->ci_master_key); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 10dd161690a28c..23c5198612d1a6 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -195,6 +195,22 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +/* + * Load the inode's fscrypt info pointer, using a raw dereference. Since this + * uses a raw dereference with no memory barrier, it is appropriate to use only + * when the caller knows the inode's key setup already happened, resulting in + * non-NULL fscrypt info. E.g., the file contents en/decryption functions use + * this, since fscrypt_file_open() set up the key. + */ +static inline struct fscrypt_inode_info * +fscrypt_get_inode_info_raw(const struct inode *inode) +{ + struct fscrypt_inode_info *ci = inode->i_crypt_info; + + VFS_WARN_ON_ONCE(ci == NULL); + return ci; +} + static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { From 93221de31a8df6710e02328f82dc68d7ab4ad9e6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:55 -0700 Subject: [PATCH 0349/3206] fscrypt: add support for info in fs-specific part of inode Add an inode_info_offs field to struct fscrypt_operations, and update fs/crypto/ to support it. When set to a nonzero value, it specifies the offset to the fscrypt_inode_info pointer within the filesystem-specific part of the inode structure, to be used instead of inode::i_crypt_info. Since this makes inode::i_crypt_info no longer necessarily used, update comments that mentioned it. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-3-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/crypto/fscrypt_private.h | 4 ++-- fs/crypto/keysetup.c | 43 ++++++++++++++++++++++--------------- include/linux/fscrypt.h | 22 +++++++++++++++---- 3 files changed, 46 insertions(+), 23 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d8b485b9881c50..245e6b84aa1741 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -249,8 +249,8 @@ struct fscrypt_prepared_key { * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is - * allocated and stored in ->i_crypt_info. Once created, it remains until the - * inode is evicted. + * allocated and a pointer to it is stored in the file's in-memory inode. Once + * created, it remains until the inode is evicted. */ struct fscrypt_inode_info { diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4f3b9ecbfe4e66..c1f85715c27607 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -642,15 +642,16 @@ fscrypt_setup_encryption_info(struct inode *inode, goto out; /* - * For existing inodes, multiple tasks may race to set ->i_crypt_info. - * So use cmpxchg_release(). This pairs with the smp_load_acquire() in - * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with - * a RELEASE barrier so that other tasks can ACQUIRE it. 
+ * For existing inodes, multiple tasks may race to set the inode's + * fscrypt info pointer. So use cmpxchg_release(). This pairs with the + * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the + * pointer with a RELEASE barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { + if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) == + NULL) { /* - * We won the race and set ->i_crypt_info to our crypt_info. - * Now link it into the master key's inode list. + * We won the race and set the inode's fscrypt info to our + * crypt_info. Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; @@ -681,13 +682,13 @@ fscrypt_setup_encryption_info(struct inode *inode, * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * - * Set up ->i_crypt_info, if it hasn't already been done. + * Set up the inode's encryption key, if it hasn't already been done. * - * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So + * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * - * Return: 0 if ->i_crypt_info was set or was already set, *or* if the - * encryption key is unavailable. (Use fscrypt_has_encryption_key() to + * Return: 0 if the key is now set up, *or* if it couldn't be set up because the + * needed master key is absent. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) @@ -741,9 +742,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * - * If the directory is encrypted, set up its ->i_crypt_info in preparation for + * If the directory is encrypted, set up its encryption key in preparation for * encrypting the name of the new file. Also, if the new inode will be - * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. + * encrypted, set up its encryption key too and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino @@ -752,8 +753,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * - * Return: 0 on success, -ENOKEY if the encryption key is missing, or another - * -errno code + * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode + * but the needed master key is absent, or another -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) @@ -800,8 +801,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); */ void fscrypt_put_encryption_info(struct inode *inode) { - put_crypt_info(inode->i_crypt_info); - inode->i_crypt_info = NULL; + /* + * Ideally we'd start with a lightweight IS_ENCRYPTED() check here + * before proceeding to retrieve and check the pointer. However, during + * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If + * an error occurs, it needs to be cleaned up regardless. 
+ */ + struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode); + + put_crypt_info(*ci_addr); + *ci_addr = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 23c5198612d1a6..d7ff53accbfef1 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -61,6 +61,12 @@ struct fscrypt_name { /* Crypto operations for filesystems */ struct fscrypt_operations { + /* + * The offset of the pointer to struct fscrypt_inode_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). + */ + ptrdiff_t inode_info_offs; /* * If set, then fs/crypto/ will allocate a global bounce page pool the @@ -195,6 +201,14 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +static inline struct fscrypt_inode_info ** +fscrypt_inode_info_addr(const struct inode *inode) +{ + if (inode->i_sb->s_cop->inode_info_offs == 0) + return (struct fscrypt_inode_info **)&inode->i_crypt_info; + return (void *)inode + inode->i_sb->s_cop->inode_info_offs; +} + /* * Load the inode's fscrypt info pointer, using a raw dereference. Since this * uses a raw dereference with no memory barrier, it is appropriate to use only @@ -205,7 +219,7 @@ int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, static inline struct fscrypt_inode_info * fscrypt_get_inode_info_raw(const struct inode *inode) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = *fscrypt_inode_info_addr(inode); VFS_WARN_ON_ONCE(ci == NULL); return ci; @@ -216,11 +230,11 @@ fscrypt_get_inode_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). - * I.e., another task may publish ->i_crypt_info concurrently, executing - * a RELEASE barrier. We need to use smp_load_acquire() here to safely + * I.e., another task may publish the fscrypt info concurrently, + * executing a RELEASE barrier. Use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ - return smp_load_acquire(&inode->i_crypt_info); + return smp_load_acquire(fscrypt_inode_info_addr(inode)); } /** From 80e07df424e583d4124be6059be54080e4c4cb64 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:56 -0700 Subject: [PATCH 0350/3206] ext4: move crypt info pointer to fs-specific part of inode Move the fscrypt_inode_info pointer into the filesystem-specific part of the inode by adding the field ext4_inode_info::i_crypt_info and configuring fscrypt_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. 
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-4-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Signed-off-by: Christian Brauner --- fs/ext4/crypto.c | 2 ++ fs/ext4/ext4.h | 4 ++++ fs/ext4/super.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 0a056d97e64024..cf0a0970c09562 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -227,6 +227,8 @@ static bool ext4_has_stable_inodes(struct super_block *sb) } const struct fscrypt_operations ext4_cryptops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 01a6e2de7fc3ef..c897109dadb157 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1182,6 +1182,10 @@ struct ext4_inode_info { __u32 i_csum_seed; kprojid_t i_projid; + +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif }; /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7d39da7e733b1..0c3059ecce37ce 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1470,6 +1470,9 @@ static void init_once(void *foo) init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + ei->i_crypt_info = NULL; +#endif } static int __init init_inodecache(void) From 7afb71ee92de72f54877b06a31d46614edcaad24 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:57 -0700 Subject: [PATCH 0351/3206] f2fs: move crypt info pointer to fs-specific part of inode Move the fscrypt_inode_info pointer into the filesystem-specific part of the inode by adding the field f2fs_inode_info::i_crypt_info and configuring fscrypt_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. 
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-5-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/super.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 46be7560548ce2..2f5c30c069c3ca 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -907,6 +907,9 @@ struct f2fs_inode_info { unsigned int atomic_write_cnt; loff_t original_i_size; /* original i_size before atomic write */ +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */ +#endif }; static inline void get_read_extent_info(struct extent_info *ext, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e16c4e2830c298..b42b55280d9e38 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -480,6 +480,9 @@ static void init_once(void *foo) struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; inode_init_once(&fi->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + fi->i_crypt_info = NULL; +#endif } #ifdef CONFIG_QUOTA @@ -3570,6 +3573,8 @@ static struct block_device **f2fs_get_devices(struct super_block *sb, } static const struct fscrypt_operations f2fs_cryptops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_crypt_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, @@ -3581,7 +3586,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .has_stable_inodes = f2fs_has_stable_inodes, .get_devices = f2fs_get_devices, }; -#endif +#endif /* CONFIG_FS_ENCRYPTION */ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) From e1add70aaa5ea469980a60a1747ab9863fec2124 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:58 -0700 Subject: [PATCH 0352/3206] ubifs: move crypt info pointer to fs-specific part of inode Move the fscrypt_inode_info pointer into the filesystem-specific part of the inode by adding the field ubifs_inode::i_crypt_info and configuring fscrypt_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. Note that the initialization of ubifs_inode::i_crypt_info to NULL on inode allocation is handled by the memset() in ubifs_alloc_inode(). 
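
The memset() shortcut works because the kernel only runs on ABIs where a null pointer is
represented by all-zero bits, so zero-filling the allocation doubles as pointer initialization.
A tiny hypothetical check of that assumption (stub types, not the real ubifs_inode):

/* Hypothetical check that zero-filling doubles as NULL initialization. */
#include <assert.h>
#include <string.h>

struct crypt_stub;                       /* opaque stand-in for fscrypt_inode_info */

struct ubifs_inode_stub {                /* stand-in, not the real layout */
	void *data;
	struct crypt_stub *i_crypt_info;
};

int main(void)
{
	struct ubifs_inode_stub ui;

	/* What ubifs_alloc_inode()-style zeroing amounts to. */
	memset(&ui, 0, sizeof(ui));
	/* Holds on every ABI the kernel supports (NULL is all-zero bits). */
	assert(ui.i_crypt_info == NULL);
	return 0;
}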
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-6-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/ubifs/crypto.c | 2 ++ fs/ubifs/ubifs.h | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index fb5ac358077b15..0b14d004a095ba 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -88,6 +88,8 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, } const struct fscrypt_operations ubifs_crypt_operations = { + .inode_info_offs = (int)offsetof(struct ubifs_inode, i_crypt_info) - + (int)offsetof(struct ubifs_inode, vfs_inode), .legacy_key_prefix = "ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 5db45c9e26ee0e..49e50431741cd2 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -365,6 +365,7 @@ struct ubifs_gced_idx_leb { * @read_in_a_row: number of consecutive pages read in a row (for bulk read) * @data_len: length of the data attached to the inode * @data: inode's data + * @i_crypt_info: inode's fscrypt information * * @ui_mutex exists for two main reasons. At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS @@ -416,6 +417,9 @@ struct ubifs_inode { pgoff_t read_in_a_row; int data_len; void *data; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif }; /** From bbe395ded3ef2a2aecfc90372bda2b3e3ed8f2ee Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:59 -0700 Subject: [PATCH 0353/3206] ceph: move crypt info pointer to fs-specific part of inode Move the fscrypt_inode_info pointer into the filesystem-specific part of the inode by adding the field ceph_inode_info::i_crypt_info and configuring fscrypt_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. 
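
One detail specific to the ceph version: the common 'struct inode' is reached through a nested
member (netfs.inode), and offsetof() accepts such a compound member designator. A hypothetical
stand-alone sketch (stub types, not ceph's real structures):

#include <assert.h>
#include <stddef.h>

struct inode_stub { int i_mode; };       /* stand-in for struct inode */

struct netfs_stub {                      /* stand-in for struct netfs_inode */
	struct inode_stub inode;
	int extra;
};

struct ceph_stub {                       /* stand-in for ceph_inode_info */
	void *i_crypt_info;
	struct netfs_stub netfs;
};

int main(void)
{
	/* offsetof() takes the compound designator used by the ceph patch. */
	size_t off = offsetof(struct ceph_stub, netfs.inode);

	assert(off == offsetof(struct ceph_stub, netfs) +
		      offsetof(struct netfs_stub, inode));
	return 0;
}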
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-7-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/ceph/crypto.c | 2 ++ fs/ceph/inode.c | 1 + fs/ceph/super.h | 1 + 3 files changed, 4 insertions(+) diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index cab7226192073f..7026e794813ca1 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -133,6 +133,8 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) } static struct fscrypt_operations ceph_fscrypt_ops = { + .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) - + (int)offsetof(struct ceph_inode_info, netfs.inode), .needs_bounce_pages = 1, .get_context = ceph_crypt_get_context, .set_context = ceph_crypt_set_context, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fc543075b827a9..480cb3a1d639ac 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -665,6 +665,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); #ifdef CONFIG_FS_ENCRYPTION + ci->i_crypt_info = NULL; ci->fscrypt_auth = NULL; ci->fscrypt_auth_len = 0; #endif diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cf176aab0f8239..25d8bacbcf4408 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -463,6 +463,7 @@ struct ceph_inode_info { unsigned long i_work_mask; #ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; u32 fscrypt_auth_len; u32 fscrypt_file_len; u8 *fscrypt_auth; From ab90c2d2476c4dd6deddd089c7e83b858d135783 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:00 -0700 Subject: [PATCH 0354/3206] fs: remove inode::i_crypt_info Now that all fscrypt-capable filesystems store the pointer to fscrypt_inode_info in the filesystem-specific part of the inode structure, inode::i_crypt_info is no longer needed. Update fscrypt_inode_info_addr() to no longer support the fallback to inode::i_crypt_info. Finally, remove inode::i_crypt_info itself along with the now-unnecessary forward declaration of fscrypt_inode_info. The end result of the migration to the filesystem-specific pointer is memory savings on CONFIG_FS_ENCRYPTION=y kernels for all filesystems that don't support fscrypt. Specifically, their in-memory inodes are now smaller by the size of a pointer: either 4 or 8 bytes. 
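
The fscrypt diffs above repeatedly reference the cmpxchg_release()/smp_load_acquire() pairing used
to publish the info pointer once key setup completes. A rough userspace analogue of that
publication pattern, sketched with C11 atomics in place of the kernel primitives (the names and
types here are made up; this is not the kernel's implementation):

#include <stdatomic.h>
#include <stdlib.h>

struct info { int ready; };

static _Atomic(struct info *) slot;      /* plays the role of the info pointer */

/* Setup side: fully initialize, then publish with RELEASE ordering. */
static void publish(struct info *candidate)
{
	struct info *expected = NULL;

	candidate->ready = 1;
	if (!atomic_compare_exchange_strong_explicit(&slot, &expected,
			candidate, memory_order_release, memory_order_relaxed))
		free(candidate);         /* lost the race; keep the winner's */
}

/* Reader side: ACQUIRE pairs with the RELEASE above, so 'ready' is visible. */
static struct info *get_info(void)
{
	return atomic_load_explicit(&slot, memory_order_acquire);
}

int main(void)
{
	struct info *ci = malloc(sizeof(*ci));

	if (!ci)
		return 1;
	publish(ci);
	return get_info()->ready ? 0 : 1;
}

The losing racer frees its candidate, much as fsverity_set_info() does when it loses the race.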
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-8-ebiggers@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 ----- include/linux/fscrypt.h | 8 ++++++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d7051f..1dafa18169be61 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -72,7 +72,6 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; -struct fscrypt_inode_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; @@ -780,10 +779,6 @@ struct inode { struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif -#ifdef CONFIG_FS_ENCRYPTION - struct fscrypt_inode_info *i_crypt_info; -#endif - #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; #endif diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index d7ff53accbfef1..516aba5b858b54 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -201,11 +201,15 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +/* + * Returns the address of the fscrypt info pointer within the + * filesystem-specific part of the inode. (To save memory on filesystems that + * don't support fscrypt, a field in 'struct inode' itself is no longer used.) + */ static inline struct fscrypt_inode_info ** fscrypt_inode_info_addr(const struct inode *inode) { - if (inode->i_sb->s_cop->inode_info_offs == 0) - return (struct fscrypt_inode_info **)&inode->i_crypt_info; + VFS_WARN_ON_ONCE(inode->i_sb->s_cop->inode_info_offs == 0); return (void *)inode + inode->i_sb->s_cop->inode_info_offs; } From 2a7349add18e5915cd87251af5f98db1772b6131 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:01 -0700 Subject: [PATCH 0355/3206] fsverity: add support for info in fs-specific part of inode Add an inode_info_offs field to struct fsverity_operations, and update fs/verity/ to support it. When set to a nonzero value, it specifies the offset to the fsverity_info pointer within the filesystem-specific part of the inode structure, to be used instead of inode::i_verity_info. Since this makes inode::i_verity_info no longer necessarily used, update comments that mentioned it. This is a prerequisite for a later commit that removes inode::i_verity_info, saving memory and improving cache efficiency on filesystems that don't support fsverity. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-9-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/verity/enable.c | 6 ++--- fs/verity/fsverity_private.h | 9 ++++---- fs/verity/open.c | 23 ++++++++++--------- fs/verity/verify.c | 2 +- include/linux/fsverity.h | 44 ++++++++++++++++++++++++++++-------- 5 files changed, 55 insertions(+), 29 deletions(-) diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 503268cf429627..89eccc4becf902 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -284,9 +284,9 @@ static int enable_verity(struct file *filp, /* Successfully enabled verity */ /* - * Readers can start using ->i_verity_info immediately, so it - * can't be rolled back once set. So don't set it until just - * after the filesystem has successfully enabled verity. + * Readers can start using the inode's verity info immediately, + * so it can't be rolled back once set. 
So don't set it until + * just after the filesystem has successfully enabled verity. */ fsverity_set_info(inode, vi); } diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 5fe854a5b9ad3d..bc1d887c532e74 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -63,10 +63,11 @@ struct merkle_tree_params { * fsverity_info - cached verity metadata for an inode * * When a verity file is first opened, an instance of this struct is allocated - * and stored in ->i_verity_info; it remains until the inode is evicted. It - * caches information about the Merkle tree that's needed to efficiently verify - * data read from the file. It also caches the file digest. The Merkle tree - * pages themselves are not cached here, but the filesystem may cache them. + * and a pointer to it is stored in the file's in-memory inode. It remains + * until the inode is evicted. It caches information about the Merkle tree + * that's needed to efficiently verify data read from the file. It also caches + * the file digest. The Merkle tree pages themselves are not cached here, but + * the filesystem may cache them. */ struct fsverity_info { struct merkle_tree_params tree_params; diff --git a/fs/verity/open.c b/fs/verity/open.c index c561e130cd0c61..77b1c977af0256 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -244,17 +244,17 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* - * Multiple tasks may race to set ->i_verity_info, so use - * cmpxchg_release(). This pairs with the smp_load_acquire() in - * fsverity_get_info(). I.e., here we publish ->i_verity_info with a - * RELEASE barrier so that other tasks can ACQUIRE it. + * Multiple tasks may race to set the inode's verity info pointer, so + * use cmpxchg_release(). This pairs with the smp_load_acquire() in + * fsverity_get_info(). I.e., publish the pointer with a RELEASE + * barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { - /* Lost the race, so free the fsverity_info we allocated. */ + if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) { + /* Lost the race, so free the verity info we allocated. */ fsverity_free_info(vi); /* - * Afterwards, the caller may access ->i_verity_info directly, - * so make sure to ACQUIRE the winning fsverity_info. + * Afterwards, the caller may access the inode's verity info + * directly, so make sure to ACQUIRE the winning verity info. 
*/ (void)fsverity_get_info(inode); } @@ -350,7 +350,6 @@ int fsverity_get_descriptor(struct inode *inode, return 0; } -/* Ensure the inode has an ->i_verity_info */ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); @@ -395,8 +394,10 @@ EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); void __fsverity_cleanup_inode(struct inode *inode) { - fsverity_free_info(inode->i_verity_info); - inode->i_verity_info = NULL; + struct fsverity_info **vi_addr = fsverity_info_addr(inode); + + fsverity_free_info(*vi_addr); + *vi_addr = NULL; } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); diff --git a/fs/verity/verify.c b/fs/verity/verify.c index a1f00c3fd3b276..affc307eb6a6b1 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -245,7 +245,7 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, unsigned long max_ra_pages) { struct inode *inode = data_folio->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; + struct fsverity_info *vi = *fsverity_info_addr(inode); const unsigned int block_size = vi->tree_params.block_size; u64 pos = (u64)data_folio->index << PAGE_SHIFT; diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 1eb7eae580be70..e0f132cb783932 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -28,6 +28,12 @@ /* Verity operations for filesystems */ struct fsverity_operations { + /** + * The offset of the pointer to struct fsverity_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). + */ + ptrdiff_t inode_info_offs; /** * Begin enabling verity on the given file. @@ -124,15 +130,33 @@ struct fsverity_operations { #ifdef CONFIG_FS_VERITY +static inline struct fsverity_info ** +fsverity_info_addr(const struct inode *inode) +{ + if (inode->i_sb->s_vop->inode_info_offs == 0) + return (struct fsverity_info **)&inode->i_verity_info; + return (void *)inode + inode->i_sb->s_vop->inode_info_offs; +} + static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { /* - * Pairs with the cmpxchg_release() in fsverity_set_info(). - * I.e., another task may publish ->i_verity_info concurrently, - * executing a RELEASE barrier. We need to use smp_load_acquire() here - * to safely ACQUIRE the memory the other task published. + * Since this function can be called on inodes belonging to filesystems + * that don't support fsverity at all, and fsverity_info_addr() doesn't + * work on such filesystems, we have to start with an IS_VERITY() check. + * Checking IS_VERITY() here is also useful to minimize the overhead of + * fsverity_active() on non-verity files. + */ + if (!IS_VERITY(inode)) + return NULL; + + /* + * Pairs with the cmpxchg_release() in fsverity_set_info(). I.e., + * another task may publish the inode's verity info concurrently, + * executing a RELEASE barrier. Use smp_load_acquire() here to safely + * ACQUIRE the memory the other task published. */ - return smp_load_acquire(&inode->i_verity_info); + return smp_load_acquire(fsverity_info_addr(inode)); } /* enable.c */ @@ -156,11 +180,11 @@ void __fsverity_cleanup_inode(struct inode *inode); * fsverity_cleanup_inode() - free the inode's verity info, if present * @inode: an inode being evicted * - * Filesystems must call this on inode eviction to free ->i_verity_info. + * Filesystems must call this on inode eviction to free the inode's verity info. 
*/ static inline void fsverity_cleanup_inode(struct inode *inode) { - if (inode->i_verity_info) + if (*fsverity_info_addr(inode)) __fsverity_cleanup_inode(inode); } @@ -267,12 +291,12 @@ static inline bool fsverity_verify_page(struct page *page) * fsverity_active() - do reads from the inode need to go through fs-verity? * @inode: inode to check * - * This checks whether ->i_verity_info has been set. + * This checks whether the inode's verity info has been set. * * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with - * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) + * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before the verity info.) * * Return: true if reads need to go through fs-verity, otherwise false */ @@ -287,7 +311,7 @@ static inline bool fsverity_active(const struct inode *inode) * @filp: the struct file being set up * * When opening a verity file, deny the open if it is for writing. Otherwise, - * set up the inode's ->i_verity_info if not already done. + * set up the inode's verity info if not already done. * * When combined with fscrypt, this must be called after fscrypt_file_open(). * Otherwise, we won't have the key set up to decrypt the verity metadata. From c9fff804b59c5495db944ddf84e1f963967cc361 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:02 -0700 Subject: [PATCH 0356/3206] ext4: move verity info pointer to fs-specific part of inode Move the fsverity_info pointer into the filesystem-specific part of the inode by adding the field ext4_inode_info::i_verity_info and configuring fsverity_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_verity_info, saving memory and improving cache efficiency on filesystems that don't support fsverity. 
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-10-ebiggers@kernel.org Acked-by: Theodore Ts'o Signed-off-by: Christian Brauner --- fs/ext4/ext4.h | 4 ++++ fs/ext4/super.c | 3 +++ fs/ext4/verity.c | 2 ++ 3 files changed, 9 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c897109dadb157..6cb784a56b3bab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1186,6 +1186,10 @@ struct ext4_inode_info { #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; #endif + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif }; /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c3059ecce37ce..46138a6cb32a3d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1473,6 +1473,9 @@ static void init_once(void *foo) #ifdef CONFIG_FS_ENCRYPTION ei->i_crypt_info = NULL; #endif +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } static int __init init_inodecache(void) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index d9203228ce9796..b0acb0c5031379 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -389,6 +389,8 @@ static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations ext4_verityops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), .begin_enable_verity = ext4_begin_enable_verity, .end_enable_verity = ext4_end_enable_verity, .get_verity_descriptor = ext4_get_verity_descriptor, From 1f66cef4a9a3033b76db08de25eb017ddc6967e6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:03 -0700 Subject: [PATCH 0357/3206] f2fs: move verity info pointer to fs-specific part of inode Move the fsverity_info pointer into the filesystem-specific part of the inode by adding the field f2fs_inode_info::i_verity_info and configuring fsverity_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_verity_info, saving memory and improving cache efficiency on filesystems that don't support fsverity. 
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-11-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/super.c | 3 +++ fs/f2fs/verity.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2f5c30c069c3ca..6e465bbc85ee5c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -910,6 +910,9 @@ struct f2fs_inode_info { #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */ #endif +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; /* filesystem verity info */ +#endif }; static inline void get_read_extent_info(struct extent_info *ext, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b42b55280d9e38..1db024b20e29b4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -483,6 +483,9 @@ static void init_once(void *foo) #ifdef CONFIG_FS_ENCRYPTION fi->i_crypt_info = NULL; #endif +#ifdef CONFIG_FS_VERITY + fi->i_verity_info = NULL; +#endif } #ifdef CONFIG_QUOTA diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 2287f238ae09eb..f0ab9a3c7a82b3 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -287,6 +287,8 @@ static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations f2fs_verityops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_verity_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), .begin_enable_verity = f2fs_begin_enable_verity, .end_enable_verity = f2fs_end_enable_verity, .get_verity_descriptor = f2fs_get_verity_descriptor, From fcafdd4210658986470208230253ba5cdc6107a0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:04 -0700 Subject: [PATCH 0358/3206] btrfs: move verity info pointer to fs-specific part of inode Move the fsverity_info pointer into the filesystem-specific part of the inode by adding the field btrfs_inode::i_verity_info and configuring fsverity_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_verity_info, saving memory and improving cache efficiency on filesystems that don't support fsverity. 
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-12-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/btrfs/btrfs_inode.h | 5 +++++ fs/btrfs/inode.c | 3 +++ fs/btrfs/verity.c | 2 ++ 3 files changed, 10 insertions(+) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b99fb027329290..2c9489497cbeac 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -338,6 +338,11 @@ struct btrfs_inode { struct list_head delayed_iput; struct rw_semaphore i_mmap_lock; + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif + struct inode vfs_inode; }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b77dd22b8cdbe3..de722b232ec145 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7961,6 +7961,9 @@ static void init_once(void *foo) struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } void __cold btrfs_destroy_cachep(void) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index b7a96a005487e1..4633cbcfcdb900 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -802,6 +802,8 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations btrfs_verityops = { + .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) - + (int)offsetof(struct btrfs_inode, vfs_inode), .begin_enable_verity = btrfs_begin_enable_verity, .end_enable_verity = btrfs_end_enable_verity, .get_verity_descriptor = btrfs_get_verity_descriptor, From 818c659ac164e4e4639ceaedaccbdfebb1ef63b5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:05 -0700 Subject: [PATCH 0359/3206] fs: remove inode::i_verity_info Now that all fsverity-capable filesystems store the pointer to fsverity_info in the filesystem-specific part of the inode structure, inode::i_verity_info is no longer needed. Update fsverity_info_addr() to no longer support the fallback to inode::i_verity_info. Finally, remove inode::i_verity_info itself, and move the forward declaration of struct fsverity_info from fs.h (which no longer needs it) to fsverity.h. The end result of the migration to the filesystem-specific pointer is memory savings on CONFIG_FS_VERITY=y kernels for all filesystems that don't support fsverity. Specifically, their in-memory inodes are now smaller by the size of a pointer: either 4 or 8 bytes. 
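
The size claim is easy to check in miniature; the toy structs below (not the real 'struct inode')
differ only in the conditional pointer field:

#include <stdio.h>

struct fsverity_info;                    /* opaque; a pointer is enough */

struct inode_before {                    /* toy model with the old field */
	void *i_private;
	struct fsverity_info *i_verity_info;
};

struct inode_after {                     /* toy model after the removal */
	void *i_private;
};

int main(void)
{
	/* Prints 8 on typical 64-bit builds, 4 on 32-bit ones. */
	printf("saved %zu bytes per inode\n",
	       sizeof(struct inode_before) - sizeof(struct inode_after));
	return 0;
}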
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-13-ebiggers@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 ----- include/linux/fsverity.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 1dafa18169be61..12ecc6b0e6f96b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -73,7 +73,6 @@ struct seq_file; struct workqueue_struct; struct iov_iter; struct fscrypt_operations; -struct fsverity_info; struct fsverity_operations; struct fsnotify_mark_connector; struct fsnotify_sb_info; @@ -779,10 +778,6 @@ struct inode { struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif -#ifdef CONFIG_FS_VERITY - struct fsverity_info *i_verity_info; -#endif - void *i_private; /* fs or device private pointer */ } __randomize_layout; diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index e0f132cb783932..844f7b8b56bbce 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -26,6 +26,8 @@ /* Arbitrary limit to bound the kmalloc() size. Can be changed. */ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 +struct fsverity_info; + /* Verity operations for filesystems */ struct fsverity_operations { /** @@ -130,11 +132,15 @@ struct fsverity_operations { #ifdef CONFIG_FS_VERITY +/* + * Returns the address of the verity info pointer within the filesystem-specific + * part of the inode. (To save memory on filesystems that don't support + * fsverity, a field in 'struct inode' itself is no longer used.) + */ static inline struct fsverity_info ** fsverity_info_addr(const struct inode *inode) { - if (inode->i_sb->s_vop->inode_info_offs == 0) - return (struct fsverity_info **)&inode->i_verity_info; + VFS_WARN_ON_ONCE(inode->i_sb->s_vop->inode_info_offs == 0); return (void *)inode + inode->i_sb->s_vop->inode_info_offs; } From 8a3d00dde63a339d31d1fdeead24ddfd4d459c70 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:06 -0700 Subject: [PATCH 0360/3206] fsverity: check IS_VERITY() in fsverity_cleanup_inode() Since getting the address of the fsverity_info has gotten a bit more expensive, make fsverity_cleanup_inode() check for IS_VERITY() instead. This avoids adding more overhead to non-verity files. This assumes that verity info is never set when !IS_VERITY(), which is currently true, but add a VFS_WARN_ON_ONCE() that asserts that. (This of course defeats the optimization, but only when CONFIG_VFS_DEBUG=y.) Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-14-ebiggers@kernel.org Signed-off-by: Christian Brauner --- include/linux/fsverity.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 844f7b8b56bbce..5bc7280425a719 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -190,8 +190,15 @@ void __fsverity_cleanup_inode(struct inode *inode); */ static inline void fsverity_cleanup_inode(struct inode *inode) { - if (*fsverity_info_addr(inode)) + /* + * Only IS_VERITY() inodes can have verity info, so start by checking + * for IS_VERITY() (which is faster than retrieving the pointer to the + * verity info). This minimizes overhead for non-verity inodes. 
+ */ + if (IS_VERITY(inode)) __fsverity_cleanup_inode(inode); + else + VFS_WARN_ON_ONCE(*fsverity_info_addr(inode) != NULL); } /* read_metadata.c */ From ac9eea3d08c25fb213deb113d246ff5dadb31fbc Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Mon, 11 Aug 2025 12:14:56 +0200 Subject: [PATCH 0361/3206] rust: alloc: implement Box::pin_slice() Add a new constructor to Box to facilitate Box creation from a pinned slice of elements. This allows memory to be allocated efficiently for e.g. slices of structures containing spinlocks or mutexes. Such slices may be used in kmemcache-like or zpool API implementations. Signed-off-by: Alice Ryhl Signed-off-by: Vitaly Wool Link: https://lore.kernel.org/r/20250811101456.2901694-1-vitaly.wool@konsulko.se [ Add empty lines after struct definitions in the example; end sentences with a period. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/alloc/kbox.rs | 77 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/rust/kernel/alloc/kbox.rs b/rust/kernel/alloc/kbox.rs index eedab0be1eff78..1aa83d751f10d3 100644 --- a/rust/kernel/alloc/kbox.rs +++ b/rust/kernel/alloc/kbox.rs @@ -290,6 +290,83 @@ where Ok(Self::new(x, flags)?.into()) } + /// Construct a pinned slice of elements `Pin>`. + /// + /// This is a convenient means for creation of e.g. slices of structures containing spinlocks + /// or mutexes. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::{new_spinlock, SpinLock}; + /// + /// struct Inner { + /// a: u32, + /// b: u32, + /// } + /// + /// #[pin_data] + /// struct Example { + /// c: u32, + /// #[pin] + /// d: SpinLock, + /// } + /// + /// impl Example { + /// fn new() -> impl PinInit { + /// try_pin_init!(Self { + /// c: 10, + /// d <- new_spinlock!(Inner { a: 20, b: 30 }), + /// }) + /// } + /// } + /// + /// // Allocate a boxed slice of 10 `Example`s. + /// let s = KBox::pin_slice( + /// | _i | Example::new(), + /// 10, + /// GFP_KERNEL + /// )?; + /// + /// assert_eq!(s[5].c, 10); + /// assert_eq!(s[3].d.lock().a, 20); + /// # Ok::<(), Error>(()) + /// ``` + pub fn pin_slice( + mut init: Func, + len: usize, + flags: Flags, + ) -> Result>, E> + where + Func: FnMut(usize) -> Item, + Item: PinInit, + E: From, + { + let mut buffer = super::Vec::::with_capacity(len, flags)?; + for i in 0..len { + let ptr = buffer.spare_capacity_mut().as_mut_ptr().cast(); + // SAFETY: + // - `ptr` is a valid pointer to uninitialized memory. + // - `ptr` is not used if an error is returned. + // - `ptr` won't be moved until it is dropped, i.e. it is pinned. + unsafe { init(i).__pinned_init(ptr)? }; + + // SAFETY: + // - `i + 1 <= len`, hence we don't exceed the capacity, due to the call to + // `with_capacity()` above. + // - The new value at index buffer.len() + 1 is the only element being added here, and + // it has been initialized above by `init(i).__pinned_init(ptr)`. + unsafe { buffer.inc_len(1) }; + } + + let (ptr, _, _) = buffer.into_raw_parts(); + let slice = core::ptr::slice_from_raw_parts_mut(ptr, len); + + // SAFETY: `slice` points to an allocation allocated with `A` (`buffer`) and holds a valid + // `[T]`. + Ok(Pin::from(unsafe { Box::from_raw(slice) })) + } + /// Convert a [`Box`] to a [`Pin>`]. If `T` does not implement /// [`Unpin`], then `x` will be pinned in memory and can't be moved. 
pub fn into_pin(this: Self) -> Pin { From 842aedc3907deb154eedb88c7e4c3278c9b33701 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 21 Aug 2025 09:20:54 +0900 Subject: [PATCH 0362/3206] rust: Add cpu_relax() helper Add cpu_relax() helper in preparation for supporting read_poll_timeout(). Reviewed-by: Alice Ryhl Reviewed-by: Andreas Hindborg Reviewed-by: Daniel Almeida Acked-by: Miguel Ojeda Signed-off-by: FUJITA Tomonori Link: https://lore.kernel.org/r/20250821002055.3654160-2-fujita.tomonori@gmail.com Signed-off-by: Danilo Krummrich --- rust/helpers/helpers.c | 1 + rust/helpers/processor.c | 8 ++++++++ rust/kernel/lib.rs | 1 + rust/kernel/processor.rs | 14 ++++++++++++++ 4 files changed, 24 insertions(+) create mode 100644 rust/helpers/processor.c create mode 100644 rust/kernel/processor.rs diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 44b2005d50140d..a90a87cff38c70 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -35,6 +35,7 @@ #include "pid_namespace.c" #include "platform.c" #include "poll.c" +#include "processor.c" #include "property.c" #include "rbtree.c" #include "rcu.c" diff --git a/rust/helpers/processor.c b/rust/helpers/processor.c new file mode 100644 index 00000000000000..d41355e14d6eb5 --- /dev/null +++ b/rust/helpers/processor.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +void rust_helper_cpu_relax(void) +{ + cpu_relax(); +} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index f8db761c5c95fc..953a6cba501cbe 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -111,6 +111,7 @@ pub mod pid_namespace; pub mod platform; pub mod prelude; pub mod print; +pub mod processor; pub mod rbtree; pub mod regulator; pub mod revocable; diff --git a/rust/kernel/processor.rs b/rust/kernel/processor.rs new file mode 100644 index 00000000000000..85b49b3614dd2d --- /dev/null +++ b/rust/kernel/processor.rs @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Processor related primitives. +//! +//! C header: [`include/linux/processor.h`](srctree/include/linux/processor.h) + +/// Lower CPU power consumption or yield to a hyperthreaded twin processor. +/// +/// It also happens to serve as a compiler barrier. +#[inline] +pub fn cpu_relax() { + // SAFETY: Always safe to call. + unsafe { bindings::cpu_relax() } +} From 7769cb177b23142b83f22abd06e492cc25157893 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:11 +0200 Subject: [PATCH 0363/3206] uprobes: Remove breakpoint in unapply_uprobe under mmap_write_lock Currently unapply_uprobe takes mmap_read_lock, but it might call remove_breakpoint, which eventually changes user pages. The current code writes either a breakpoint or the original instruction, so it could get away with the read lock, as explained in [1]. But with the upcoming change that writes multiple instructions at the probed address, we need to ensure that any update to the mm's pages is exclusive. 
[1] https://lore.kernel.org/all/20240710140045.GA1084@redhat.com/ Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-2-jolsa@kernel.org --- kernel/events/uprobes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd83..1cbfe3cfe57385 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -482,7 +482,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, * @opcode_vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @opcode_vaddr. * - * Called with mm->mmap_lock held for read or write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, @@ -1463,7 +1463,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - mmap_read_lock(mm); + mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1480,7 +1480,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, vma, vaddr); } - mmap_read_unlock(mm); + mmap_write_unlock(mm); return err; } From 0f07b7919d679050d354d3279faa74bdc7ce17a0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:12 +0200 Subject: [PATCH 0364/3206] uprobes: Rename arch_uretprobe_trampoline function We are about to add a uprobe trampoline, so clean up the namespace first. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-3-jolsa@kernel.org --- arch/x86/kernel/uprobes.c | 2 +- include/linux/uprobes.h | 2 +- kernel/events/uprobes.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6d383839e83936..77050e5a4680db 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -338,7 +338,7 @@ extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; -void *arch_uprobe_trampoline(unsigned long *psize) +void *arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 516217c390946c..01112f27cd2150 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -224,7 +224,7 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); extern void uprobe_handle_trampoline(struct pt_regs *regs); -extern void *arch_uprobe_trampoline(unsigned long *psize); +extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1cbfe3cfe57385..dd4dd156c9567e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1726,7 +1726,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -void * __weak arch_uprobe_trampoline(unsigned long 
*psize) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; @@ -1758,7 +1758,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - insns = arch_uprobe_trampoline(&insns_size); + insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) From 82afdd05a16a424409682e06a53d6afcda038d30 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:13 +0200 Subject: [PATCH 0365/3206] uprobes: Make copy_from_page global Making copy_from_page global and adding uprobe prefix. Adding the uprobe prefix to copy_to_page as well for symmetry. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-4-jolsa@kernel.org --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 01112f27cd2150..7447e15559b861 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -226,6 +226,7 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, extern void uprobe_handle_trampoline(struct pt_regs *regs); extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); +extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index dd4dd156c9567e..f993a3422083df 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); @@ -205,7 +205,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { @@ -1051,7 +1051,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); + uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; @@ -1397,7 +1397,7 @@ struct uprobe *uprobe_register(struct inode *inode, return ERR_PTR(-EINVAL); /* - * This ensures that copy_from_page(), copy_to_page() and + * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. 
*/ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) @@ -2393,7 +2393,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ From 33d7b2beaf34a3c0f6406bc76f6e1b1755150ad9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:14 +0200 Subject: [PATCH 0366/3206] uprobes: Add uprobe_write function Adding a uprobe_write function that does what uprobe_write_opcode did so far, but allows a verify callback function to be passed that checks the memory location before writing the opcode. It will be used in the following changes to implement specific checking logic for instruction updates. uprobe_write_opcode now calls uprobe_write with verify_opcode as the verify callback. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-5-jolsa@kernel.org --- include/linux/uprobes.h | 5 +++++ kernel/events/uprobes.c | 14 ++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 7447e15559b861..e13382054435d8 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -187,6 +187,9 @@ struct uprobes_state { struct xol_area *xol_area; }; +typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, + uprobe_opcode_t *opcode); + extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -195,6 +198,8 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, + uprobe_opcode_t opcode, uprobe_write_verify_t verify); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f993a3422083df..838ac40e91e616 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -399,7 +399,7 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, return identical; } -static int __uprobe_write_opcode(struct vm_area_struct *vma, +static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, unsigned long opcode_vaddr, uprobe_opcode_t opcode) { @@ -487,6 +487,12 @@ static int __uprobe_write(struct vm_area_struct *vma, */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, uprobe_opcode_t opcode) +{ + return uprobe_write(auprobe, vma, opcode_vaddr, opcode, verify_opcode); +} + +int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const 
unsigned long opcode_vaddr, uprobe_opcode_t opcode, uprobe_write_verify_t verify) { const unsigned long vaddr = opcode_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; @@ -509,7 +515,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really - * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() * cannot deal with PMDs yet. */ if (is_register) @@ -521,7 +527,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, goto out; folio = page_folio(page); - ret = verify_opcode(page, opcode_vaddr, &opcode); + ret = verify(page, opcode_vaddr, &opcode); if (ret <= 0) { folio_put(folio); goto out; @@ -560,7 +566,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + ret = __uprobe_write(vma, &fw, folio, opcode_vaddr, opcode); folio_walk_end(&fw, vma); } From f8b7c528b4fb7018d12b6bb63bb52576cfc73697 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:15 +0200 Subject: [PATCH 0367/3206] uprobes: Add nbytes argument to uprobe_write Adding nbytes argument to uprobe_write and related functions as preparation for writing whole instructions in following changes. Also renaming opcode arguments to insn, which seems to fit better. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-6-jolsa@kernel.org --- include/linux/uprobes.h | 4 ++-- kernel/events/uprobes.c | 26 ++++++++++++++------------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index e13382054435d8..147c4a0a1af9f3 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -188,7 +188,7 @@ struct uprobes_state { }; typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, - uprobe_opcode_t *opcode); + uprobe_opcode_t *insn, int nbytes); extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -199,7 +199,7 @@ extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t opcode, uprobe_write_verify_t verify); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 838ac40e91e616..c133fd4b492d83 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ 
-191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -208,7 +209,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); - if (is_swbp_insn(new_opcode)) { + if (is_swbp_insn(insn)) { if (is_swbp) /* register: already installed? */ return 0; } else { @@ -401,10 +402,10 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, - unsigned long opcode_vaddr, uprobe_opcode_t opcode) + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; - const bool is_register = !!is_swbp_insn(&opcode); + const unsigned long vaddr = insn_vaddr & PAGE_MASK; + const bool is_register = !!is_swbp_insn(insn); bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ @@ -429,7 +430,7 @@ static int __uprobe_write(struct vm_area_struct *vma, */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); - copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + copy_to_page(fw->page, insn_vaddr, insn, nbytes); /* * When unregistering, we may only zap a PTE if uffd is disabled and @@ -488,13 +489,14 @@ static int __uprobe_write(struct vm_area_struct *vma, int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, uprobe_opcode_t opcode) { - return uprobe_write(auprobe, vma, opcode_vaddr, opcode, verify_opcode); + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - const unsigned long opcode_vaddr, uprobe_opcode_t opcode, uprobe_write_verify_t verify) + const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + uprobe_write_verify_t verify) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; int ret, is_register, ref_ctr_updated = 0; @@ -504,7 +506,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, struct folio *folio; struct page *page; - is_register = is_swbp_insn(&opcode); + is_register = is_swbp_insn(insn); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) @@ -527,7 +529,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, goto out; folio = page_folio(page); - ret = verify(page, opcode_vaddr, &opcode); + ret = verify(page, insn_vaddr, insn, nbytes); if (ret <= 0) { folio_put(folio); goto out; @@ -566,7 +568,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, /* Walk the page tables again, to perform the actual update. 
*/ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write(vma, &fw, folio, opcode_vaddr, opcode); + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes); folio_walk_end(&fw, vma); } From ec46350fe1e2338f42ee84974c36b25afe8ba53a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:16 +0200 Subject: [PATCH 0368/3206] uprobes: Add is_register argument to uprobe_write and uprobe_write_opcode uprobe_write has a special path to restore the original page when we write the original instruction back. This happens when uprobe_write detects that we want to write anything other than the breakpoint instruction. Moving the detection out of uprobe_write and passing it in as an argument makes it possible to write different instructions (other than just the breakpoint and the restored original). Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-7-jolsa@kernel.org --- arch/arm/probes/uprobes/core.c | 2 +- include/linux/uprobes.h | 5 +++-- kernel/events/uprobes.c | 21 +++++++++++---------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index 885e0c5e8c20df..3d96fb41d6245d 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - __opcode_to_mem_arm(auprobe->bpinsn)); + __opcode_to_mem_arm(auprobe->bpinsn), true); } bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 147c4a0a1af9f3..518b2675646981 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -197,9 +197,10 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, + bool is_register); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c133fd4b492d83..955e5ed3e383c1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -402,10 +402,10 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, - unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes) + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + bool is_register) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; - const
bool is_register = !!is_swbp_insn(insn); bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ @@ -487,26 +487,27 @@ static int __uprobe_write(struct vm_area_struct *vma, * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - const unsigned long opcode_vaddr, uprobe_opcode_t opcode) + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, + bool is_register) { - return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode); + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, + verify_opcode, is_register); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, - uprobe_write_verify_t verify) + uprobe_write_verify_t verify, bool is_register) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - int ret, is_register, ref_ctr_updated = 0; + int ret, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; - is_register = is_swbp_insn(insn); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) @@ -568,7 +569,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes); + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); folio_walk_end(&fw, vma); } @@ -610,7 +611,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** @@ -626,7 +627,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - *(uprobe_opcode_t *)&auprobe->insn); + *(uprobe_opcode_t *)&auprobe->insn, false); } /* uprobe should have guaranteed positive refcount */ From 18a111256a0b4fedfe47101f084441a84d7e357a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:17 +0200 Subject: [PATCH 0369/3206] uprobes: Add do_ref_ctr argument to uprobe_write function Making the update_ref_ctr call in uprobe_write conditional on the do_ref_ctr argument. This way we can use uprobe_write for instruction updates without doing the ref_ctr_offset update.
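For illustration, a caller that wants to patch instruction bytes without touching the reference counter would invoke it roughly as follows (a hypothetical sketch, not code from this patch; my_verify stands in for a caller-supplied uprobe_write_verify_t callback, and the real user of this flag only arrives with the optimization patches later in the series):

	/* sketch: write nbytes of insn at vaddr, skipping the ref_ctr update */
	err = uprobe_write(auprobe, vma, vaddr, insn, nbytes, my_verify,
			   true  /* is_register */,
			   false /* do_update_ref_ctr */);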
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-8-jolsa@kernel.org --- include/linux/uprobes.h | 2 +- kernel/events/uprobes.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 518b2675646981..5080619560d424 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -200,7 +200,7 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, bool is_register); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 955e5ed3e383c1..da2b3d0deab627 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -491,12 +491,12 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, bool is_register) { return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, - verify_opcode, is_register); + verify_opcode, is_register, true /* do_update_ref_ctr */); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, - uprobe_write_verify_t verify, bool is_register) + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; @@ -537,7 +537,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, } /* We are going to replace instruction, update ref_ctr. */ - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); @@ -589,7 +589,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && ref_ctr_updated) + if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ From 91440ff4cafad4c86322a612e523f7f021a493e7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:18 +0200 Subject: [PATCH 0370/3206] uprobes/x86: Add mapping for optimized uprobe trampolines Adding support to add a special mapping for the user space trampoline with the following functions: uprobe_trampoline_get - find or add uprobe_trampoline uprobe_trampoline_put - remove or destroy uprobe_trampoline The user space trampoline is exported as an arch specific user space special mapping through tramp_mapping, which is initialized in the following changes with the new uprobe syscall.
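Since the trampoline is reached with a 5-byte call whose displacement is a signed 32-bit value, a trampoline page is only usable within roughly +/-2GB of the probed address. A minimal standalone sketch of that arithmetic (illustrative user space C mirroring the is_reachable_by_call() check added below; not the kernel code itself):

	#include <limits.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* The rel32 displacement is relative to the end of the call,
	 * i.e. the probed address + 5. */
	bool reachable_by_call(uint64_t tramp, uint64_t probe)
	{
		int64_t delta = (int64_t)(probe + 5 - tramp);

		return delta >= INT_MIN && delta <= INT_MAX;
	}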
The uprobe trampoline needs to be callable/reachable from the probed address, so while searching for available address we use is_reachable_by_call function to decide if the uprobe trampoline is callable from the probe address. All uprobe_trampoline objects are stored in uprobes_state object and are cleaned up when the process mm_struct goes down. Adding new arch hooks for that, because this change is x86_64 specific. Locking is provided by callers in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250720112133.244369-9-jolsa@kernel.org --- arch/x86/kernel/uprobes.c | 144 ++++++++++++++++++++++++++++++++++++++ include/linux/uprobes.h | 6 ++ kernel/events/uprobes.c | 10 +++ kernel/fork.c | 1 + 4 files changed, 161 insertions(+) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 77050e5a4680db..6c4dcbdd0c3c02 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -608,6 +608,150 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) *sr = utask->autask.saved_scratch_register; } } + +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static struct page *tramp_mapping_pages[2] __ro_after_init; + +static struct vm_special_mapping tramp_mapping = { + .name = "[uprobes-trampoline]", + .mremap = tramp_mremap, + .pages = tramp_mapping_pages, +}; + +struct uprobe_trampoline { + struct hlist_node node; + unsigned long vaddr; +}; + +static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr) +{ + long delta = (long)(vaddr + 5 - vtramp); + + return delta >= INT_MIN && delta <= INT_MAX; +} + +static unsigned long find_nearest_trampoline(unsigned long vaddr) +{ + struct vm_unmapped_area_info info = { + .length = PAGE_SIZE, + .align_mask = ~PAGE_MASK, + }; + unsigned long low_limit, high_limit; + unsigned long low_tramp, high_tramp; + unsigned long call_end = vaddr + 5; + + if (check_add_overflow(call_end, INT_MIN, &low_limit)) + low_limit = PAGE_SIZE; + + high_limit = call_end + INT_MAX; + + /* Search up from the caller address. */ + info.low_limit = call_end; + info.high_limit = min(high_limit, TASK_SIZE); + high_tramp = vm_unmapped_area(&info); + + /* Search down from the caller address. */ + info.low_limit = max(low_limit, PAGE_SIZE); + info.high_limit = call_end; + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + low_tramp = vm_unmapped_area(&info); + + if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp)) + return -ENOMEM; + if (IS_ERR_VALUE(high_tramp)) + return low_tramp; + if (IS_ERR_VALUE(low_tramp)) + return high_tramp; + + /* Return address that's closest to the caller address. 
*/ + if (call_end - low_tramp < high_tramp - call_end) + return low_tramp; + return high_tramp; +} + +static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) +{ + struct pt_regs *regs = task_pt_regs(current); + struct mm_struct *mm = current->mm; + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + + if (!user_64bit_mode(regs)) + return NULL; + + vaddr = find_nearest_trampoline(vaddr); + if (IS_ERR_VALUE(vaddr)) + return NULL; + + tramp = kzalloc(sizeof(*tramp), GFP_KERNEL); + if (unlikely(!tramp)) + return NULL; + + tramp->vaddr = vaddr; + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE, + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO, + &tramp_mapping); + if (IS_ERR(vma)) { + kfree(tramp); + return NULL; + } + return tramp; +} + +__maybe_unused +static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) +{ + struct uprobes_state *state = ¤t->mm->uprobes_state; + struct uprobe_trampoline *tramp = NULL; + + if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE) + return NULL; + + hlist_for_each_entry(tramp, &state->head_tramps, node) { + if (is_reachable_by_call(tramp->vaddr, vaddr)) { + *new = false; + return tramp; + } + } + + tramp = create_uprobe_trampoline(vaddr); + if (!tramp) + return NULL; + + *new = true; + hlist_add_head(&tramp->node, &state->head_tramps); + return tramp; +} + +static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp) +{ + /* + * We do not unmap and release uprobe trampoline page itself, + * because there's no easy way to make sure none of the threads + * is still inside the trampoline. + */ + hlist_del(&tramp->node); + kfree(tramp); +} + +void arch_uprobe_init_state(struct mm_struct *mm) +{ + INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps); +} + +void arch_uprobe_clear_state(struct mm_struct *mm) +{ + struct uprobes_state *state = &mm->uprobes_state; + struct uprobe_trampoline *tramp; + struct hlist_node *n; + + hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) + destroy_uprobe_trampoline(tramp); +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 5080619560d424..b40d33aae0168b 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -17,6 +17,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -185,6 +186,9 @@ struct xol_area; struct uprobes_state { struct xol_area *xol_area; +#ifdef CONFIG_X86_64 + struct hlist_head head_tramps; +#endif }; typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, @@ -233,6 +237,8 @@ extern void uprobe_handle_trampoline(struct pt_regs *regs); extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); +extern void arch_uprobe_clear_state(struct mm_struct *mm); +extern void arch_uprobe_init_state(struct mm_struct *mm); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index da2b3d0deab627..2cd7a4c6f303bd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1801,6 +1801,14 @@ static struct xol_area *get_xol_area(void) return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. 
*/ @@ -1812,6 +1820,8 @@ void uprobe_clear_state(struct mm_struct *mm) delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); + arch_uprobe_clear_state(mm); + if (!area) return; diff --git a/kernel/fork.c b/kernel/fork.c index af673856499dca..d827cc6c53623e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1015,6 +1015,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } From 56101b69c9190667f473b9f93f8b6d8209aaa816 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:19 +0200 Subject: [PATCH 0371/3206] uprobes/x86: Add uprobe syscall to speed up uprobe Adding a new uprobe syscall that calls uprobe handlers for a given 'breakpoint' address. The idea is that the 'breakpoint' address calls the user space trampoline which executes the uprobe syscall. The syscall handler reads the return address of the initial call to retrieve the original 'breakpoint' address. With this address we find the related uprobe object and call its consumers. Adding the arch_uprobe_trampoline_mapping function that provides the uprobe trampoline mapping. This mapping is backed with one global page initialized at __init time and shared by all the mapping instances. We do not allow the uprobe syscall to be executed if the caller is not from the uprobe trampoline mapping. The uprobe syscall ensures the consumer (bpf program) sees register values in the state before the trampoline was called. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250720112133.244369-10-jolsa@kernel.org --- arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/x86/kernel/uprobes.c | 139 +++++++++++++++++++++++++ include/linux/syscalls.h | 2 + include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 17 +++ kernel/sys_ni.c | 1 + 6 files changed, 161 insertions(+) diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 92cf0fe2291eb9..ced2a1deecd7ce 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq 335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c4dcbdd0c3c02..d18e1ae5990164 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -752,6 +752,145 @@ void arch_uprobe_clear_state(struct mm_struct *mm) hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) destroy_uprobe_trampoline(tramp); } + +static bool __in_uprobe_trampoline(unsigned long ip) +{ + struct vm_area_struct *vma = vma_lookup(current->mm, ip); + + return vma && vma_is_special_mapping(vma, &tramp_mapping); +} + +static bool in_uprobe_trampoline(unsigned long ip) +{ + struct mm_struct *mm = current->mm; + bool found, retry = true; + unsigned int seq; + + rcu_read_lock(); + if (mmap_lock_speculate_try_begin(mm, &seq)) { + found = __in_uprobe_trampoline(ip); + retry = mmap_lock_speculate_retry(mm, seq); + } + rcu_read_unlock(); + + if (retry) { + mmap_read_lock(mm); + found = __in_uprobe_trampoline(ip); + mmap_read_unlock(mm); + } + return found; +} + +/* + * See uprobe syscall trampoline; the call to the trampoline will
push + * the return address on the stack, the trampoline itself then pushes + * cx, r11 and ax. + */ +struct uprobe_syscall_args { + unsigned long ax; + unsigned long r11; + unsigned long cx; + unsigned long retaddr; +}; + +SYSCALL_DEFINE0(uprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + struct uprobe_syscall_args args; + unsigned long ip, sp; + int err; + + /* Allow execution only from uprobe trampolines. */ + if (!in_uprobe_trampoline(regs->ip)) + goto sigill; + + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); + if (err) + goto sigill; + + ip = regs->ip; + + /* + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: + * - adjust ip to the probe address, call saved next instruction address + * - adjust sp to the probe's stack frame (check trampoline code) + */ + regs->ax = args.ax; + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ip = args.retaddr - 5; + regs->sp += sizeof(args); + regs->orig_ax = -1; + + sp = regs->sp; + + handle_syscall_uprobe(regs, regs->ip); + + /* + * Some of the uprobe consumers has changed sp, we can do nothing, + * just return via iret. + */ + if (regs->sp != sp) { + /* skip the trampoline call */ + if (args.retaddr - 5 == regs->ip) + regs->ip += 5; + return regs->ax; + } + + regs->sp -= sizeof(args); + + /* for the case uprobe_consumer has changed ax/r11/cx */ + args.ax = regs->ax; + args.r11 = regs->r11; + args.cx = regs->cx; + + /* keep return address unless we are instructed otherwise */ + if (args.retaddr - 5 != regs->ip) + args.retaddr = regs->ip; + + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + return 0; + +sigill: + force_sig(SIGILL); + return -1; +} + +asm ( + ".pushsection .rodata\n" + ".balign " __stringify(PAGE_SIZE) "\n" + "uprobe_trampoline_entry:\n" + "push %rcx\n" + "push %r11\n" + "push %rax\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "pop %rax\n" + "pop %r11\n" + "pop %rcx\n" + "ret\n" + ".balign " __stringify(PAGE_SIZE) "\n" + ".popsection\n" +); + +extern u8 uprobe_trampoline_entry[]; + +static int __init arch_uprobes_init(void) +{ + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); + return 0; +} + +late_initcall(arch_uprobes_init); + #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 77f45e5d44139d..66c06fcdfe19e2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); asmlinkage long sys_uretprobe(void); +asmlinkage long sys_uprobe(void); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b40d33aae0168b..b6b077cc7d0f2a 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -239,6 +239,7 @@ extern unsigned long uprobe_get_trampoline_vaddr(void); extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); extern void arch_uprobe_clear_state(struct mm_struct *mm); extern void arch_uprobe_init_state(struct mm_struct *mm); +extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git 
a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2cd7a4c6f303bd..eb07e602b6c9ad 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2771,6 +2771,23 @@ static void handle_swbp(struct pt_regs *regs) rcu_read_unlock_trace(); } +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) +{ + struct uprobe *uprobe; + int is_swbp; + + guard(rcu_tasks_trace)(); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); + if (!uprobe) + return; + if (!get_utask()) + return; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + return; + handler_chain(uprobe, regs); +} + /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c6c..bf5d05c635ffd5 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16); COND_SYSCALL(rseq); COND_SYSCALL(uretprobe); +COND_SYSCALL(uprobe); From ba2bfc97b4629b10bd8d02b36e04f3932a04cac4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:20 +0200 Subject: [PATCH 0372/3206] uprobes/x86: Add support to optimize uprobes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Putting together all the previously added pieces to support optimized uprobes on top of a 5-byte nop instruction. The current uprobe execution goes through the following: - installs the breakpoint instruction over the original instruction - the exception handler is hit and calls the related uprobe consumers - and either simulates the original instruction or does out of line single step execution of it - returns to user space The optimized uprobe path does the following: - checks that the original instruction is a 5-byte nop (plus other checks) - adds (or uses an existing) user space trampoline with the uprobe syscall - overwrites the original instruction (5-byte nop) with a call to the user space trampoline - the user space trampoline executes the uprobe syscall that calls the related uprobe consumers - the trampoline returns back to the next instruction This approach won't speed up all uprobes as it's limited to using nop5 as the original instruction, but we plan to use nop5 as the USDT probe instruction (which currently uses a single byte nop) and speed up the USDT probes. The arch_uprobe_optimize triggers the uprobe optimization and is called after the first uprobe hit. I originally had it called on uprobe installation but then it clashed with the elf loader, because the user space trampoline was added in a place where the loader might need to put elf segments, so I decided to do it after the first uprobe hit when loading is done. The uprobe is un-optimized in the arch specific set_orig_insn call. The instruction overwrite is x86 arch specific and needs to go through 3 updates (on top of the nop5 instruction): - write int3 into the 1st byte - write the last 4 bytes of the call instruction - update the call instruction opcode And cleanup goes through similar reverse stages: - overwrite the call opcode with the breakpoint (int3) - write the last 4 bytes of the nop5 instruction - write the nop5 first instruction byte We do not unmap and release the uprobe trampoline when it's no longer needed, because there's no easy way to make sure none of the threads is still inside the trampoline. But we do not waste memory, because there's just a single page for all the uprobe trampoline mappings. We do waste a frame on the page mapping for every 4GB by keeping the uprobe trampoline page mapped, but that seems ok.
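Condensed, the optimize-side write sequence described above looks like this (a sketch of the int3_update() helper added below, with error handling omitted; write_insn() is an illustrative stand-in for the uprobe_write() calls made with the verify_insn callback):

	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;	/* 0xcc */
	u8 call[5];	/* filled by __text_gen_insn(..., CALL_INSN_OPCODE, ...) */

	write_insn(vaddr, &int3, 1);		/* 1) trap the first byte */
	smp_text_poke_sync_each_cpu();
	write_insn(vaddr + 1, call + 1, 4);	/* 2) last 4 bytes of the call */
	smp_text_poke_sync_each_cpu();
	write_insn(vaddr, call, 1);		/* 3) call opcode replaces int3 */
	smp_text_poke_sync_each_cpu();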
We take the benefit from the fact that set_swbp and set_orig_insn are called under mmap_write_lock(mm), so we can use the current instruction as the state the uprobe is in - nop5/breakpoint/call trampoline - and decide the needed action (optimize/un-optimize) based on that. Attaching the speed up from benchs/run_bench_uprobes.sh script: current: usermode-count : 152.604 ± 0.044M/s syscall-count : 13.359 ± 0.042M/s --> uprobe-nop : 3.229 ± 0.002M/s uprobe-push : 3.086 ± 0.004M/s uprobe-ret : 1.114 ± 0.004M/s uprobe-nop5 : 1.121 ± 0.005M/s uretprobe-nop : 2.145 ± 0.002M/s uretprobe-push : 2.070 ± 0.001M/s uretprobe-ret : 0.931 ± 0.001M/s uretprobe-nop5 : 0.957 ± 0.001M/s after the change: usermode-count : 152.448 ± 0.244M/s syscall-count : 14.321 ± 0.059M/s uprobe-nop : 3.148 ± 0.007M/s uprobe-push : 2.976 ± 0.004M/s uprobe-ret : 1.068 ± 0.003M/s --> uprobe-nop5 : 7.038 ± 0.007M/s uretprobe-nop : 2.109 ± 0.004M/s uretprobe-push : 2.035 ± 0.001M/s uretprobe-ret : 0.908 ± 0.001M/s uretprobe-nop5 : 3.377 ± 0.009M/s I see bit more speed up on Intel (above) compared to AMD. The big nop5 speed up is partly due to emulating nop5 and partly due to optimization. The key speed up we do this for is the USDT switch from nop to nop5: uprobe-nop : 3.148 ± 0.007M/s uprobe-nop5 : 7.038 ± 0.007M/s Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250720112133.244369-11-jolsa@kernel.org --- arch/x86/include/asm/uprobes.h | 7 + arch/x86/kernel/uprobes.c | 283 ++++++++++++++++++++++++++++++++- include/linux/uprobes.h | 6 +- kernel/events/uprobes.c | 16 +- 4 files changed, 305 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 678fb546f0a75c..1ee2e5115955cd 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +enum { + ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0, + ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1, +}; + struct uprobe_xol_ops; struct arch_uprobe { @@ -45,6 +50,8 @@ struct arch_uprobe { u8 ilen; } push; }; + + unsigned long flags; }; struct arch_uprobe_task { diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index d18e1ae5990164..209ce74ab93f4b 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Post-execution fixups. */ @@ -702,7 +703,6 @@ static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) return tramp; } -__maybe_unused static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) { struct uprobes_state *state = ¤t->mm->uprobes_state; @@ -891,6 +891,280 @@ static int __init arch_uprobes_init(void) late_initcall(arch_uprobes_init); +enum { + EXPECT_SWBP, + EXPECT_CALL, +}; + +struct write_opcode_ctx { + unsigned long base; + int expect; +}; + +static int is_call_insn(uprobe_opcode_t *insn) +{ + return *insn == CALL_INSN_OPCODE; +} + +/* + * Verification callback used by int3_update uprobe_write calls to make sure + * the underlying instruction is as expected - either int3 or call. 
+ */ +static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, + int nbytes, void *data) +{ + struct write_opcode_ctx *ctx = data; + uprobe_opcode_t old_opcode[5]; + + uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5); + + switch (ctx->expect) { + case EXPECT_SWBP: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + case EXPECT_CALL: + if (is_call_insn(&old_opcode[0])) + return 1; + break; + } + + return -1; +} + +/* + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. + * (borrowed comment from smp_text_poke_batch_finish) + * + * The way it is done: + * - Add an INT3 trap to the address that will be patched + * - SMP sync all CPUs + * - Update all but the first byte of the patched range + * - SMP sync all CPUs + * - Replace the first byte (INT3) by the first byte of the replacing opcode + * - SMP sync all CPUs + */ +static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, char *insn, bool optimize) +{ + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + int err; + + /* + * Write int3 trap. + * + * The swbp_optimize path comes with breakpoint already installed, + * so we can skip this step for optimize == true. + */ + if (!optimize) { + ctx.expect = EXPECT_CALL; + err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + } + + smp_text_poke_sync_each_cpu(); + + /* Write all but the first byte of the patched range. */ + ctx.expect = EXPECT_SWBP; + err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + + /* + * Write first byte. + * + * The swbp_unoptimize needs to finish uprobe removal together + * with ref_ctr update, using uprobe_write with proper flags. 
+ */ + err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn, + optimize /* is_register */, !optimize /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + return 0; +} + +static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, unsigned long tramp) +{ + u8 call[5]; + + __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr, + (const void *) tramp, CALL_INSN_SIZE); + return int3_update(auprobe, vma, vaddr, call, true /* optimize */); +} + +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */); +} + +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) +{ + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; + struct vm_area_struct *vma; + struct page *page; + + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(page)) + return PTR_ERR(page); + uprobe_copy_from_page(page, vaddr, dst, len); + put_page(page); + return 0; +} + +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *call = (struct __arch_relative_insn *) insn; + + if (!is_call_insn(insn)) + return false; + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); +} + +static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized) +{ + uprobe_opcode_t insn[5]; + int err; + + err = copy_from_vaddr(mm, vaddr, &insn, 5); + if (err) + return err; + *optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr); + return 0; +} + +static bool should_optimize(struct arch_uprobe *auprobe) +{ + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); +} + +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (should_optimize(auprobe)) { + bool optimized = false; + int err; + + /* + * We could race with another thread that already optimized the probe, + * so let's not overwrite it with int3 again in this case. 
+ */ + err = is_optimized(vma->vm_mm, vaddr, &optimized); + if (err) + return err; + if (optimized) + return 0; + } + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, + true /* is_register */); +} + +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { + struct mm_struct *mm = vma->vm_mm; + bool optimized = false; + int err; + + err = is_optimized(mm, vaddr, &optimized); + if (err) + return err; + if (optimized) { + err = swbp_unoptimize(auprobe, vma, vaddr); + WARN_ON_ONCE(err); + return err; + } + } + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, + false /* is_register */); +} + +static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr) +{ + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + bool new = false; + int err = 0; + + vma = find_vma(mm, vaddr); + if (!vma) + return -EINVAL; + tramp = get_uprobe_trampoline(vaddr, &new); + if (!tramp) + return -EINVAL; + err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr); + if (WARN_ON_ONCE(err) && new) + destroy_uprobe_trampoline(tramp); + return err; +} + +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + struct mm_struct *mm = current->mm; + uprobe_opcode_t insn[5]; + + /* + * Do not optimize if shadow stack is enabled, the return address hijack + * code in arch_uretprobe_hijack_return_addr updates wrong frame when + * the entry uprobe is optimized and the shadow stack crashes the app. + */ + if (shstk_is_enabled()) + return; + + if (!should_optimize(auprobe)) + return; + + mmap_write_lock(mm); + + /* + * Check if some other thread already optimized the uprobe for us, + * if it's the case just go away silently. + */ + if (copy_from_vaddr(mm, vaddr, &insn, 5)) + goto unlock; + if (!is_swbp_insn((uprobe_opcode_t*) &insn)) + goto unlock; + + /* + * If we fail to optimize the uprobe we set the fail bit so the + * above should_optimize will fail from now on. + */ + if (__arch_uprobe_optimize(auprobe, mm, vaddr)) + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); + +unlock: + mmap_write_unlock(mm); +} + +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + if (memcmp(&auprobe->insn, x86_nops[5], 5)) + return false; + /* We can't do cross page atomic writes yet. 
*/ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -904,6 +1178,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -1270,6 +1548,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret) return ret; + if (can_optimize(auprobe, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b6b077cc7d0f2a..08ef78439d0df2 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -192,7 +192,7 @@ struct uprobes_state { }; typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, - uprobe_opcode_t *insn, int nbytes); + uprobe_opcode_t *insn, int nbytes, void *data); extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -204,7 +204,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, bool is_register); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); @@ -240,6 +241,7 @@ extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void * extern void arch_uprobe_clear_state(struct mm_struct *mm); extern void arch_uprobe_init_state(struct mm_struct *mm); extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eb07e602b6c9ad..4a194d7c838b52 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, - int nbytes) + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -491,12 +491,13 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, bool is_register) { return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, - verify_opcode, is_register, true /* do_update_ref_ctr */); + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, - uprobe_write_verify_t verify, bool is_register, bool 
do_update_ref_ctr) + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; @@ -530,7 +531,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, goto out; folio = page_folio(page); - ret = verify(page, insn_vaddr, insn, nbytes); + ret = verify(page, insn_vaddr, insn, nbytes, data); if (ret <= 0) { folio_put(folio); goto out; @@ -2696,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2760,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; From 985e820b72e0a89746f51c072ed10caf78890244 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2025 12:28:36 +0200 Subject: [PATCH 0373/3206] uprobes/x86: Add struct uretprobe_syscall_args Like uprobe_syscall_args; keep things consistent. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123656.705837806@infradead.org --- arch/x86/kernel/uprobes.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 209ce74ab93f4b..580989dfddc75f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -311,6 +311,12 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool #ifdef CONFIG_X86_64 +struct uretprobe_syscall_args { + unsigned long r11; + unsigned long cx; + unsigned long ax; +}; + asm ( ".pushsection .rodata\n" ".global uretprobe_trampoline_entry\n" @@ -324,8 +330,8 @@ asm ( "uretprobe_syscall_check:\n" "popq %r11\n" "popq %rcx\n" - - /* The uretprobe syscall replaces stored %rax value with final + /* + * The uretprobe syscall replaces stored %rax value with final * return address, so we don't restore %rax in here and just * call ret. */ @@ -366,7 +372,8 @@ static unsigned long trampoline_check_ip(unsigned long tramp) SYSCALL_DEFINE0(uretprobe) { struct pt_regs *regs = task_pt_regs(current); - unsigned long err, ip, sp, r11_cx_ax[3], tramp; + struct uretprobe_syscall_args args; + unsigned long err, ip, sp, tramp; /* If there's no trampoline, we are called from wrong place. 
*/ tramp = uprobe_get_trampoline_vaddr(); @@ -377,15 +384,15 @@ SYSCALL_DEFINE0(uretprobe) if (unlikely(regs->ip != trampoline_check_ip(tramp))) goto sigill; - err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); if (err) goto sigill; /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ - regs->r11 = r11_cx_ax[0]; - regs->cx = r11_cx_ax[1]; - regs->ax = r11_cx_ax[2]; - regs->sp += sizeof(r11_cx_ax); + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ax = args.ax; + regs->sp += sizeof(args); regs->orig_ax = -1; ip = regs->ip; @@ -401,21 +408,21 @@ SYSCALL_DEFINE0(uretprobe) */ if (regs->sp != sp || shstk_is_enabled()) return regs->ax; - regs->sp -= sizeof(r11_cx_ax); + regs->sp -= sizeof(args); /* for the case uprobe_consumer has changed r11/cx */ - r11_cx_ax[0] = regs->r11; - r11_cx_ax[1] = regs->cx; + args.r11 = regs->r11; + args.cx = regs->cx; /* * ax register is passed through as return value, so we can use * its space on stack for ip value and jump to it through the * trampoline's ret instruction */ - r11_cx_ax[2] = regs->ip; + args.ax = regs->ip; regs->ip = ip; - err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); if (err) goto sigill; From fd54052b60cf6e73cf918fd8653cd7a5c84b0cc3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2025 12:37:22 +0200 Subject: [PATCH 0374/3206] uprobes/x86: Optimize is_optimize() Make is_optimized() return a tri-state and avoid return through argument. This simplifies things a little. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123656.823296198@infradead.org --- arch/x86/kernel/uprobes.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 580989dfddc75f..3b46a894c20499 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1047,7 +1047,7 @@ static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) return __in_uprobe_trampoline(vaddr + 5 + call->raddr); } -static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized) +static int is_optimized(struct mm_struct *mm, unsigned long vaddr) { uprobe_opcode_t insn[5]; int err; @@ -1055,8 +1055,7 @@ static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimiz err = copy_from_vaddr(mm, vaddr, &insn, 5); if (err) return err; - *optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr); - return 0; + return __is_optimized((uprobe_opcode_t *)&insn, vaddr); } static bool should_optimize(struct arch_uprobe *auprobe) @@ -1069,17 +1068,14 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { if (should_optimize(auprobe)) { - bool optimized = false; - int err; - /* * We could race with another thread that already optimized the probe, * so let's not overwrite it with int3 again in this case. 
*/ - err = is_optimized(vma->vm_mm, vaddr, &optimized); - if (err) - return err; - if (optimized) + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) return 0; } return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, @@ -1090,17 +1086,13 @@ int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { - struct mm_struct *mm = vma->vm_mm; - bool optimized = false; - int err; - - err = is_optimized(mm, vaddr, &optimized); - if (err) - return err; - if (optimized) { - err = swbp_unoptimize(auprobe, vma, vaddr); - WARN_ON_ONCE(err); - return err; + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) { + ret = swbp_unoptimize(auprobe, vma, vaddr); + WARN_ON_ONCE(ret); + return ret; } } return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, From 7c2bfc183b05103287cc32ad68184f7d4312c06d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2025 12:55:14 +0200 Subject: [PATCH 0375/3206] uprobes/x86: Accept more NOP forms Instead of only accepting the x86_64 nop5 chosen by the kernel, accept any x86_64 NOP or NOPL instruction that is 5 bytes. Notably, the x86_64 nop5 pattern is valid in 32bit apps and could get compiler-generated when built for i686 (which introduced NOPL). Since the trampoline is x86_64 only, make sure to limit to x86_64 code. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123656.935559566@infradead.org --- arch/x86/kernel/uprobes.c | 37 ++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 3b46a894c20499..d513c9761f3418 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1157,10 +1157,37 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) mmap_write_unlock(mm); } -static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +static bool insn_is_nop(struct insn *insn) { - if (memcmp(&auprobe->insn, x86_nops[5], 5)) + return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; +} + +static bool insn_is_nopl(struct insn *insn) +{ + if (insn->opcode.nbytes != 2) + return false; + + if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) + return false; + + if (!insn->modrm.nbytes) + return false; + + if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) + return false; + + /* 0f 1f /0 - NOPL */ + return true; +} + +static bool can_optimize(struct insn *insn, unsigned long vaddr) +{ + if (!insn->x86_64 || insn->length != 5) return false; + + if (!insn_is_nop(insn) && !insn_is_nopl(insn)) + return false; + + /* We can't do cross page atomic writes yet.
*/ return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; } @@ -1177,7 +1204,7 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } -static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +static bool can_optimize(struct insn *insn, unsigned long vaddr) { return false; } @@ -1539,15 +1566,15 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) */ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - struct insn insn; u8 fix_ip_or_call = UPROBE_FIX_IP; + struct insn insn; int ret; ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); if (ret) return ret; - if (can_optimize(auprobe, addr)) + if (can_optimize(&insn, addr)) set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); ret = branch_setup_xol_ops(auprobe, &insn); From f349ec80865dfeaf8a33feae6b4a500db039c69c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2025 14:28:34 +0200 Subject: [PATCH 0376/3206] uprobes/x86: Fix uprobe syscall vs shadow stack The uprobe syscall stores and strips the trampoline stack frame from the user context, to make it appear similar to an exception at the original instruction. It then restores the trampoline stack when it can exit using sysexit. Make sure to match the regular stack manipulation with shadow stack operations such that regular and shadow stack don't get out of sync and causes trouble. This enables using the optimization when shadow stack is in use. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123657.055790090@infradead.org --- arch/x86/include/asm/shstk.h | 4 ++++ arch/x86/kernel/shstk.c | 40 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/uprobes.c | 17 ++++++++------- 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index ba6f2fe438488d..6723dbf6ab8295 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksignal *ksig); int restore_signal_shadow_stack(void); int shstk_update_last_frame(unsigned long val); bool shstk_is_enabled(void); +int shstk_pop(u64 *val); +int shstk_push(u64 val); #else static inline long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { return -EINVAL; } @@ -35,6 +37,8 @@ static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; } static inline int restore_signal_shadow_stack(void) { return 0; } static inline int shstk_update_last_frame(unsigned long val) { return 0; } static inline bool shstk_is_enabled(void) { return false; } +static inline int shstk_pop(u64 *val) { return -ENOTSUPP; } +static inline int shstk_push(u64 val) { return -ENOTSUPP; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #endif /* __ASSEMBLER__ */ diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 2ddf23387c7ef7..2dd91a3053d0f9 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr(void) return ssp; } +int shstk_pop(u64 *val) +{ + int ret = 0; + u64 ssp; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + if (val && get_user(*val, (__user u64 *)ssp)) + ret = -EFAULT; + else + wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE); + fpregs_unlock(); + + return ret; +} + +int shstk_push(u64 val) +{ + 
u64 ssp; + int ret; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + ssp -= SS_FRAME_SIZE; + ret = write_user_shstk_64((__user void *)ssp, val); + if (!ret) + wrmsrq(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return ret; +} + #define SHSTK_DATA_BIT BIT(63) static int put_shstk_data(u64 __user *addr, u64 data) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index d513c9761f3418..ab6547bf2fa5d6 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -804,7 +804,7 @@ SYSCALL_DEFINE0(uprobe) { struct pt_regs *regs = task_pt_regs(current); struct uprobe_syscall_args args; - unsigned long ip, sp; + unsigned long ip, sp, sret; int err; /* Allow execution only from uprobe trampolines. */ @@ -831,6 +831,10 @@ SYSCALL_DEFINE0(uprobe) sp = regs->sp; + err = shstk_pop((u64 *)&sret); + if (err == -EFAULT || (!err && sret != args.retaddr)) + goto sigill; + handle_syscall_uprobe(regs, regs->ip); /* @@ -855,6 +859,9 @@ SYSCALL_DEFINE0(uprobe) if (args.retaddr - 5 != regs->ip) args.retaddr = regs->ip; + if (shstk_push(args.retaddr) == -EFAULT) + goto sigill; + regs->ip = ip; err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); @@ -1124,14 +1131,6 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) struct mm_struct *mm = current->mm; uprobe_opcode_t insn[5]; - /* - * Do not optimize if shadow stack is enabled, the return address hijack - * code in arch_uretprobe_hijack_return_addr updates wrong frame when - * the entry uprobe is optimized and the shadow stack crashes the app. - */ - if (shstk_is_enabled()) - return; - if (!should_optimize(auprobe)) return; From 60ed85b7e46983725954103beaca5ca41816cd58 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Aug 2025 13:48:48 +0200 Subject: [PATCH 0377/3206] uprobes/x86: Make asm style consistent The asm syntax in uretprobe_trampoline and uprobe_trampoline differs in the use of operand size suffixes. Make them consistent and remove all size suffixes. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123657.163417243@infradead.org --- arch/x86/kernel/uprobes.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index ab6547bf2fa5d6..643027e35c4614 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -321,21 +321,21 @@ asm ( ".pushsection .rodata\n" ".global uretprobe_trampoline_entry\n" "uretprobe_trampoline_entry:\n" - "pushq %rax\n" - "pushq %rcx\n" - "pushq %r11\n" - "movq $" __stringify(__NR_uretprobe) ", %rax\n" + "push %rax\n" + "push %rcx\n" + "push %r11\n" + "mov $" __stringify(__NR_uretprobe) ", %rax\n" "syscall\n" ".global uretprobe_syscall_check\n" "uretprobe_syscall_check:\n" - "popq %r11\n" - "popq %rcx\n" + "pop %r11\n" + "pop %rcx\n" /* * The uretprobe syscall replaces stored %rax value with final * return address, so we don't restore %rax in here and just * call ret. 
*/ - "retq\n" + "ret\n" ".global uretprobe_trampoline_end\n" "uretprobe_trampoline_end:\n" ".popsection\n" @@ -885,7 +885,7 @@ asm ( "push %rcx\n" "push %r11\n" "push %rax\n" - "movq $" __stringify(__NR_uprobe) ", %rax\n" + "mov $" __stringify(__NR_uprobe) ", %rax\n" "syscall\n" "pop %rax\n" "pop %r11\n" From 354492a0e1bc4a408e26ebe14166bd1064182439 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2025 19:49:56 +0200 Subject: [PATCH 0378/3206] uprobes/x86: Add SLS mitigation to the trampolines It is trivial; no reason not to. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123657.277506098@infradead.org --- arch/x86/kernel/uprobes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 643027e35c4614..0a8c0a4a54233a 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -336,6 +336,7 @@ asm ( * call ret. */ "ret\n" + "int3\n" ".global uretprobe_trampoline_end\n" "uretprobe_trampoline_end:\n" ".popsection\n" @@ -891,6 +892,7 @@ asm ( "pop %r11\n" "pop %rcx\n" "ret\n" + "int3\n" ".balign " __stringify(PAGE_SIZE) "\n" ".popsection\n" ); From 17c3b001576452fda8fd19d664a93a988b309780 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:21 +0200 Subject: [PATCH 0379/3206] selftests/bpf: Import usdt.h from libbpf/usdt project Importing usdt.h from libbpf/usdt project. Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-12-jolsa@kernel.org --- tools/testing/selftests/bpf/usdt.h | 545 +++++++++++++++++++++++++++++ 1 file changed, 545 insertions(+) create mode 100644 tools/testing/selftests/bpf/usdt.h diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h new file mode 100644 index 00000000000000..549d1f77481019 --- /dev/null +++ b/tools/testing/selftests/bpf/usdt.h @@ -0,0 +1,545 @@ +// SPDX-License-Identifier: BSD-2-Clause +/* + * This single-header library defines a collection of variadic macros for + * defining and triggering USDTs (User Statically-Defined Tracepoints): + * + * - For USDTs without associated semaphore: + * USDT(group, name, args...) + * + * - For USDTs with implicit (transparent to the user) semaphore: + * USDT_WITH_SEMA(group, name, args...) + * USDT_IS_ACTIVE(group, name) + * + * - For USDTs with explicit (user-defined and provided) semaphore: + * USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...) + * USDT_SEMA_IS_ACTIVE(sema) + * + * all of which emit a NOP instruction into the instruction stream, and so + * have *zero* overhead for the surrounding code. USDTs are identified by + * a combination of `group` and `name` identifiers, which is used by external + * tracing tooling (tracers) for identifying exact USDTs of interest. + * + * USDTs can have an associated (2-byte) activity counter (USDT semaphore), + * automatically maintained by Linux kernel whenever any correctly written + * BPF-based tracer is attached to the USDT. This USDT semaphore can be used + * to check whether there is a need to do any extra data collection and + * processing for a given USDT (if necessary), and otherwise avoid extra work + * for a common case of USDT not being traced ("active"). 
+ * + * See documentation for USDT_WITH_SEMA()/USDT_IS_ACTIVE() or + * USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on + * working with USDTs with implicitly or explicitly associated + * USDT semaphores, respectively. + * + * There is also some additional data recorded into an auxiliary note + * section. The data in the note section describes the operands, in terms of + * size and location, used by tracing tooling to know where to find USDT + * arguments. Each location is encoded as an assembler operand string. + * Tracing tools (bpftrace and BPF-based tracers, systemtap, etc) insert + * breakpoints on top of the nop, and decode the location operand-strings, + * like an assembler, to find the values being passed. + * + * The operand strings are selected by the compiler for each operand. + * They are constrained by inline-assembler codes. The default is: + * + * #define USDT_ARG_CONSTRAINT nor + * + * This is a good default if the operands tend to be integral and + * moderate in number (smaller than number of registers). In other + * cases, the compiler may report "'asm' requires impossible reload" or + * similar. In this case, consider simplifying the macro call (fewer + * and simpler operands), reducing optimization, or overriding the default + * constraints string via: + * + * #define USDT_ARG_CONSTRAINT g + * #include <usdt.h> + * + * For some historical description of USDT v3 format (the one used by this + * library and generally recognized and assumed by BPF-based tracing tools) + * see [0]. The more formal specification can be found at [1]. Additional + * argument constraints information can be found at [2]. + * + * Original SystemTap's sys/sdt.h implementation ([3]) was used as a base for + * this USDT library implementation. Current implementation differs *a lot* in + * terms of exposed user API and general usability, which was the main goal + * and focus of the reimplementation work. Nevertheless, underlying recorded + * USDT definitions are fully binary compatible and any USDT-based tooling + * should work equally well with USDTs defined by either SystemTap's or this + * library's USDT implementation. + * + * [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html + * [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + * [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html + * [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h + */ +#ifndef __USDT_H +#define __USDT_H + +/* + * Changelog: + * + * 0.1.0 + * ----- + * - Initial release + */ +#define USDT_MAJOR_VERSION 0 +#define USDT_MINOR_VERSION 1 +#define USDT_PATCH_VERSION 0 + +/* C++20 and C23 added __VA_OPT__ as a standard replacement for non-standard `##__VA_ARGS__` extension */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L) +#define __usdt_va_opt 1 +#define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__ +#else +#define __usdt_va_args(...) , ##__VA_ARGS__ +#endif + +/* + * Trigger USDT with `group`:`name` identifier and pass through `args` as its + * arguments. Zero arguments are acceptable as well. No USDT semaphore is + * associated with this USDT. + * + * Such "semaphoreless" USDTs are commonly used when there is no extra data + * collection or processing needed to collect and prepare USDT arguments and + * they are just available in the surrounding code. 
USDT() macro will just + * record their locations in CPU registers or in memory for tracing tooling to + * be able to access them, if necessary. + */ +#ifdef __usdt_va_opt +#define USDT(group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT(group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__) +#endif + +/* + * Trigger USDT with `group`:`name` identifier and pass through `args` as its + * arguments. Zero arguments are acceptable as well. The USDT also gets an + * implicitly-defined associated USDT semaphore, which will be "activated" by + * tracing tooling and can be used to check whether USDT is being actively + * observed. + * + * USDTs with semaphore are commonly used when there is a need to perform + * additional data collection and processing to prepare USDT arguments, which + * otherwise might not be necessary for the rest of application logic. In such + * a case, USDT semaphore can be used to avoid unnecessary extra work. If USDT + * is not traced (which is presumed to be a common situation), the associated + * USDT semaphore is "inactive", and so there is no need to waste resources to + * prepare USDT arguments. Use USDT_IS_ACTIVE(group, name) to check whether + * USDT is "active". + * + * N.B. There is an inherent (albeit short) gap between checking whether USDT + * is active and triggering corresponding USDT, in which external tracer can + * be attached to a USDT and activate USDT semaphore after the activity check. + * If such a race occurs, tracers might miss one USDT execution. Tracers are + * expected to accommodate such possibility and this is expected to not be + * a problem for applications and tracers. + * + * N.B. Implicit USDT semaphore defined by USDT_WITH_SEMA() is contained + * within a single executable or shared library and is not shared outside + * them. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name + * identifier across executable and shared library, it will work and won't + * conflict, per se, but will define independent USDT semaphores, one for each + * shared library/executable in which USDT_WITH_SEMA(group, name) is used. + * That is, if you attach to this USDT in one shared library (or executable), + * then only USDT semaphore within that shared library (or executable) will be + * updated by the kernel, while other libraries (or executable) will not see + * activated USDT semaphore. In short, it's best to use unique USDT group:name + * identifiers across different shared libraries (and, equivalently, between + * executable and shared library). This is advanced consideration and is + * rarely (if ever) seen in practice, but just to avoid surprises this is + * called out here. (Static libraries become a part of final executable, once + * linked by linker, so the above considerations don't apply to them.) + */ +#ifdef __usdt_va_opt +#define USDT_WITH_SEMA(group, name, ...) \ + __usdt_probe(group, name, \ + __usdt_sema_implicit, __usdt_sema_name(group, name) \ + __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT_WITH_SEMA(group, name, ...) \ + __usdt_probe(group, name, \ + __usdt_sema_implicit, __usdt_sema_name(group, name), \ + ##__VA_ARGS__) +#endif + +struct usdt_sema { volatile unsigned short active; }; + +/* + * Check if USDT with `group`:`name` identifier is "active" (i.e., whether it + * is attached to by external tracing tooling and is actively observed). 
+ * + * This macro can be used to decide whether any additional and potentially + * expensive data collection or processing should be done to pass extra + * information into the given USDT. It is assumed that USDT is triggered with + * USDT_WITH_SEMA() macro which will implicitly define associated USDT + * semaphore. (If one needs more control over USDT semaphore, see + * USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros below.) + * + * N.B. Such checks are necessarily racy and speculative. Between checking + * whether USDT is active and triggering the USDT itself, tracer can be + * detached with no notification. This race should be extremely rare and worst + * case should result in one-time wasted extra data collection and processing. + */ +#define USDT_IS_ACTIVE(group, name) ({ \ + extern struct usdt_sema __usdt_sema_name(group, name) \ + __usdt_asm_name(__usdt_sema_name(group, name)); \ + __usdt_sema_implicit(__usdt_sema_name(group, name)); \ + __usdt_sema_name(group, name).active > 0; \ +}) + +/* + * APIs for working with user-defined explicit USDT semaphores. + * + * This is a less commonly used advanced API for use cases in which user needs + * an explicit control over (potentially shared across multiple USDTs) USDT + * semaphore instance. This can be used when there is a group of logically + * related USDTs that all need extra data collection and processing whenever + * any of a family of related USDTs are "activated" (i.e., traced). In such + * a case, all such related USDTs will be associated with the same shared USDT + * semaphore defined with USDT_DEFINE_SEMA() and the USDTs themselves will be + * triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking an explicit extra + * USDT semaphore identifier as an extra parameter. + */ + +/** + * Underlying C global variable name for user-defined USDT semaphore with + * `sema` identifier. Could be useful for debugging, but normally shouldn't be + * used explicitly. + */ +#define USDT_SEMA(sema) __usdt_sema_##sema + +/* + * Define storage for user-defined USDT semaphore `sema`. + * + * Should be used only once in non-header source file to let compiler allocate + * space for the semaphore variable. Just like with any other global variable. + * + * This macro can be used anywhere where global variable declaration is + * allowed. Just like with global variable definitions, there should be only + * one definition of user-defined USDT semaphore with given `sema` identifier, + * otherwise compiler or linker will complain about duplicate variable + * definition. + * + * For C++, it is allowed to use USDT_DEFINE_SEMA() both in global namespace + * and inside namespaces (including nested namespaces). Just make sure that + * USDT_DECLARE_SEMA() is placed within the namespace where this semaphore is + * referenced, or any of its parent namespaces, so the C++ language-level + * identifier is visible to the code that needs to reference the semaphore. + * At the lowest layer, USDT semaphores have global naming and visibility + * (they have a corresponding `__usdt_sema_` symbol, which can be linked + * against from C or C++ code, if necessary). To keep it simple, putting + * USDT_DECLARE_SEMA() declarations into global namespaces is the simplest + * no-brainer solution. All these aspects are irrelevant for plain C, because + * C doesn't have namespaces and everything is always in the global namespace. + * + * N.B. 
Due to USDT metadata being recorded in non-allocatable ELF note + * section, it has limitations when it comes to relocations, which, in + * practice, means that it's not possible to correctly share USDT semaphores + * between main executable and shared libraries, or even between multiple + * shared libraries. USDT semaphore has to be contained to individual shared + * library or executable to avoid unpleasant surprises with half-working USDT + * semaphores. We enforce this by marking semaphore ELF symbols as having + * a hidden visibility. This is quite an advanced use case and consideration + * and for most users this should have no consequences whatsoever. + */ +#define USDT_DEFINE_SEMA(sema) \ + struct usdt_sema __usdt_sema_sec USDT_SEMA(sema) \ + __usdt_asm_name(USDT_SEMA(sema)) \ + __attribute__((visibility("hidden"))) = { 0 } + +/* + * Declare extern reference to user-defined USDT semaphore `sema`. + * + * Refers to a variable defined in another compilation unit by + * USDT_DEFINE_SEMA() and allows using the same USDT semaphore across + * multiple compilation units (i.e., .c and .cpp files). + * + * See USDT_DEFINE_SEMA() notes above for C++ language usage peculiarities. + */ +#define USDT_DECLARE_SEMA(sema) \ + extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema)) + +/* + * Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it + * is attached to by external tracing tooling and is actively observed). + * + * This macro can be used to decide whether any additional and potentially + * expensive data collection or processing should be done to pass extra + * information into USDT(s) associated with USDT semaphore `sema`. + * + * N.B. Such checks are necessarily racy. Between checking the state of USDT + * semaphore and triggering associated USDT(s), the active tracer might attach + * or detach. This race should be extremely rare and worst case should result + * in one-time missed USDT event or wasted extra data collection and + * processing. USDT-using tracers should be written with this in mind; it is + * not a concern of the application defining USDTs with associated semaphore. + */ +#define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0) + +/* + * Invoke USDT specified by `group` and `name` identifiers and associate + * explicitly user-defined semaphore `sema` with it. Pass through `args` as + * USDT arguments. `args` are optional and zero arguments are acceptable. + * + * Semaphore is defined with the help of USDT_DEFINE_SEMA() macro and can be + * checked whether active with USDT_SEMA_IS_ACTIVE(). + */ +#ifdef __usdt_va_opt +#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) 
\ + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__) +#endif + +/* + * Adjustable implementation aspects + */ +#ifndef USDT_ARG_CONSTRAINT +#if defined __powerpc__ +#define USDT_ARG_CONSTRAINT nZr +#elif defined __arm__ +#define USDT_ARG_CONSTRAINT g +#elif defined __loongarch__ +#define USDT_ARG_CONSTRAINT nmr +#else +#define USDT_ARG_CONSTRAINT nor +#endif +#endif /* USDT_ARG_CONSTRAINT */ + +#ifndef USDT_NOP +#if defined(__ia64__) || defined(__s390__) || defined(__s390x__) +#define USDT_NOP nop 0 +#else +#define USDT_NOP nop +#endif +#endif /* USDT_NOP */ + +/* + * Implementation details + */ +/* USDT name for implicitly-defined USDT semaphore, derived from group:name */ +#define __usdt_sema_name(group, name) __usdt_sema_##group##__##name +/* ELF section into which USDT semaphores are put */ +#define __usdt_sema_sec __attribute__((section(".probes"))) + +#define __usdt_concat(a, b) a ## b +#define __usdt_apply(fn, n) __usdt_concat(fn, n) + +#ifndef __usdt_nth +#define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N +#endif + +#ifndef __usdt_narg +#ifdef __usdt_va_opt +#define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#else +#define __usdt_narg(...) __usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif +#endif /* __usdt_narg */ + +#define __usdt_hash # +#define __usdt_str_(x) #x +#define __usdt_str(x) __usdt_str_(x) + +#ifndef __usdt_asm_name +#define __usdt_asm_name(name) __asm__(__usdt_str(name)) +#endif + +#define __usdt_asm0() "\n" +#define __usdt_asm1(x) __usdt_str(x) "\n" +#define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__) +#define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__) +#define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__) +#define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__) +#define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__) +#define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__) +#define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__) +#define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__) +#define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__) +#define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__) +#define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__) +#define __usdt_asm(...) 
__usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#ifdef __LP64__ +#define __usdt_asm_addr .8byte +#else +#define __usdt_asm_addr .4byte +#endif + +#define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x) +#define __usdt_asm_strz(x) __usdt_asm_strz_(x) +#define __usdt_asm_str_(x) __usdt_asm1(.ascii #x) +#define __usdt_asm_str(x) __usdt_asm_str_(x) + +/* "semaphoreless" USDT case */ +#ifndef __usdt_sema_none +#define __usdt_sema_none(sema) +#endif + +/* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */ +#ifndef __usdt_sema_implicit +#define __usdt_sema_implicit(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm1(.ifndef sema) \ + __usdt_asm3( .pushsection .probes, "aw", "progbits") \ + __usdt_asm1( .weak sema) \ + __usdt_asm1( .hidden sema) \ + __usdt_asm1( .align 2) \ + __usdt_asm1(sema:) \ + __usdt_asm1( .zero 2) \ + __usdt_asm2( .type sema, @object) \ + __usdt_asm2( .size sema, 2) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + ); +#endif + +/* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */ +#ifndef __usdt_sema_explicit +#define __usdt_sema_explicit(sema) \ + __asm__ __volatile__ ("" :: "m" (sema)); +#endif + +/* main USDT definition (nop and .note.stapsdt metadata) */ +#define __usdt_probe(group, name, sema_def, sema, ...) do { \ + sema_def(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm( 990: USDT_NOP) \ + __usdt_asm3( .pushsection .note.stapsdt, "", "note") \ + __usdt_asm1( .balign 4) \ + __usdt_asm3( .4byte 992f-991f,994f-993f,3) \ + __usdt_asm1(991: .asciz "stapsdt") \ + __usdt_asm1(992: .balign 4) \ + __usdt_asm1(993: __usdt_asm_addr 990b) \ + __usdt_asm1( __usdt_asm_addr _.stapsdt.base) \ + __usdt_asm1( __usdt_asm_addr sema) \ + __usdt_asm_strz(group) \ + __usdt_asm_strz(name) \ + __usdt_asm_args(__VA_ARGS__) \ + __usdt_asm1( .ascii "\0") \ + __usdt_asm1(994: .balign 4) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.ifndef _.stapsdt.base) \ + __usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat)\ + __usdt_asm1( .weak _.stapsdt.base) \ + __usdt_asm1( .hidden _.stapsdt.base) \ + __usdt_asm1(_.stapsdt.base:) \ + __usdt_asm1( .space 1) \ + __usdt_asm2( .size _.stapsdt.base, 1) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + :: __usdt_asm_ops(__VA_ARGS__) \ + ); \ +} while (0) + +/* + * NB: gdb PR24541 highlighted an unspecified corner of the sdt.h + * operand note format. + * + * The named register may be a longer or shorter (!) alias for the + * storage where the value in question is found. For example, on + * i386, 64-bit value may be put in register pairs, and a register + * name stored would identify just one of them. Previously, gcc was + * asked to emit the %w[id] (16-bit alias of some registers holding + * operands), even when a wider 32-bit value was used. + * + * Bottom line: the byte-width given before the @ sign governs. If + * there is a mismatch between that width and that of the named + * register, then a sys/sdt.h note consumer may need to employ + * architecture-specific heuristics to figure out where the compiler + * has actually put the complete value. 
+ */ +#if defined(__powerpc__) || defined(__powerpc64__) +#define __usdt_argref(id) %I[id]%[id] +#elif defined(__i386__) +#define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */ +#else +#define __usdt_argref(id) %[id] +#endif + +#define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \ + __usdt_asm1(.ascii "@") \ + __usdt_asm_str(__usdt_argref(__usdt_aval##n)) + +#define __usdt_asm_args0 /* no arguments */ +#define __usdt_asm_args1 __usdt_asm_arg(1) +#define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2) +#define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3) +#define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4) +#define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5) +#define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6) +#define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7) +#define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8) +#define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9) +#define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10) +#define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11) +#define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12) +#define __usdt_asm_args(...) __usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__)) + +#define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5) +#define __usdt_arg_size(x) (__usdt_is_arr(x) ? sizeof(void *) : sizeof(x)) + +/* + * We can't use __builtin_choose_expr() in C++, so fall back to table-based + * signedness determination for known types, utilizing templates magic. + */ +#ifdef __cplusplus + +#define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed) + +#include <cstddef> + +template<typename T> struct __usdt_t { static const bool is_signed = false; }; +template<typename A> struct __usdt_t<A[]> : public __usdt_t<A *> {}; +template<typename A, size_t N> struct __usdt_t<A[N]> : public __usdt_t<A *> {}; + +#define __usdt_def_signed(T) \ +template<> struct __usdt_t<T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<const T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<volatile T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<const volatile T> { static const bool is_signed = true; } +#define __usdt_maybe_signed(T) \ +template<> struct __usdt_t<T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<const T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<volatile T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<const volatile T> { static const bool is_signed = (T)-1 < (T)1; } + +__usdt_def_signed(signed char); +__usdt_def_signed(short); +__usdt_def_signed(int); +__usdt_def_signed(long); +__usdt_def_signed(long long); +__usdt_maybe_signed(char); +__usdt_maybe_signed(wchar_t); + +#else /* !__cplusplus */ + +#define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && __builtin_classify_type(x) <= 4) +#define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U)) +#define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1) + +#endif /* __cplusplus */ + +#define __usdt_asm_op(n, x) \ + [__usdt_asz##n] "n" ((__usdt_is_signed(x) ? 
(int)-1 : 1) * (int)__usdt_arg_size(x)), \ + [__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x) + +#define __usdt_asm_ops0() [__usdt_dummy] "g" (0) +#define __usdt_asm_ops1(x) __usdt_asm_op(1, x) +#define __usdt_asm_ops2(a,x) __usdt_asm_ops1(a), __usdt_asm_op(2, x) +#define __usdt_asm_ops3(a,b,x) __usdt_asm_ops2(a,b), __usdt_asm_op(3, x) +#define __usdt_asm_ops4(a,b,c,x) __usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x) +#define __usdt_asm_ops5(a,b,c,d,x) __usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x) +#define __usdt_asm_ops6(a,b,c,d,e,x) __usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x) +#define __usdt_asm_ops7(a,b,c,d,e,f,x) __usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x) +#define __usdt_asm_ops8(a,b,c,d,e,f,g,x) __usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x) +#define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x) __usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x) +#define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x) __usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x) +#define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x) +#define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x) +#define __usdt_asm_ops(...) __usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#endif /* __USDT_H */ From 4e7005223e6dab882646d96d0e2aa84a5dd07b56 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:22 +0200 Subject: [PATCH 0380/3206] selftests/bpf: Reorg the uprobe_syscall test function Adding __test_uprobe_syscall with non x86_64 stub to execute all the tests, so we don't need to keep adding non x86_64 stub functions for new tests. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-13-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 34 +++++++------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index b17dc39a23dbfa..a8f00aee779934 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -350,29 +350,8 @@ static void test_uretprobe_shadow_stack(void) ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } -#else -static void test_uretprobe_regs_equal(void) -{ - test__skip(); -} - -static void test_uretprobe_regs_change(void) -{ - test__skip(); -} - -static void test_uretprobe_syscall_call(void) -{ - test__skip(); -} -static void test_uretprobe_shadow_stack(void) -{ - test__skip(); -} -#endif - -void test_uprobe_syscall(void) +static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) test_uretprobe_regs_equal(); @@ -383,3 +362,14 @@ void test_uprobe_syscall(void) if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); } +#else +static void __test_uprobe_syscall(void) +{ + test__skip(); +} +#endif + +void test_uprobe_syscall(void) +{ + __test_uprobe_syscall(); +} From 7932c4cf577187dec42ddfba0aba26434cecab0c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:23 +0200 Subject: [PATCH 0381/3206] selftests/bpf: Rename uprobe_syscall_executed prog to test_uretprobe_multi Renaming uprobe_syscall_executed prog to test_uretprobe_multi to fit properly in the following changes that add more programs. Plus adding pid filter and increasing executed variable. 
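For readers new to the idiom used here: the pid filter relies on the standard bpf_get_current_pid_tgid() packing, where the upper 32 bits carry the tgid (what getpid() returns in userspace). A minimal sketch, not part of the patch itself:

  /* BPF side: drop events from everything but the test process */
  __u64 id   = bpf_get_current_pid_tgid();
  __u32 tgid = id >> 32;   /* userspace PID of the current task */

  if (tgid != pid)         /* 'pid' is written by the test via skel->bss */
          return 0;
  executed++;              /* a counter, so tests can assert exact hit counts */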
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-14-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 12 ++++++++---- .../selftests/bpf/progs/uprobe_syscall_executed.c | 8 ++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index a8f00aee779934..6d58a44da2b240 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -252,6 +252,7 @@ static void test_uretprobe_syscall_call(void) ); struct uprobe_syscall_executed *skel; int pid, status, err, go[2], c = 0; + struct bpf_link *link; if (!ASSERT_OK(pipe(go), "pipe")) return; @@ -277,11 +278,14 @@ static void test_uretprobe_syscall_call(void) _exit(0); } - skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid, - "/proc/self/exe", - "uretprobe_syscall_call", &opts); - if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi")) + skel->bss->pid = pid; + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + pid, "/proc/self/exe", + "uretprobe_syscall_call", &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) goto cleanup; + skel->links.test_uretprobe_multi = link; /* kick the child */ write(go[1], &c, 1); diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 0d7f1a7db2e2ec..8f48976a33aa59 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -8,10 +8,14 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; int executed = 0; +int pid; SEC("uretprobe.multi") -int test(struct pt_regs *regs) +int test_uretprobe_multi(struct pt_regs *ctx) { - executed = 1; + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; return 0; } From d5c86c3370100620fa9c2e8dc9350c354b30ddb4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:24 +0200 Subject: [PATCH 0382/3206] selftests/bpf: Add uprobe/usdt syscall tests Adding tests for optimized uprobe/usdt probes. Checking that we get expected trampoline and attached bpf programs get executed properly. 
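For orientation, a hedged sketch of the trampoline check the new tests perform at the probed address once the uprobe has been optimized (the helper name decode_tramp_target is illustrative; the real logic lives in check_attach() in the diff below):

  struct __arch_relative_insn {
          __u8  op;    /* 0xe8 encodes a call rel32 on x86_64 */
          __s32 raddr; /* displacement relative to the next instruction */
  } __attribute__((packed));

  /* After optimization the 5-byte nop at the probe site is rewritten
   * into a call into the per-process "[uprobes-trampoline]" mapping. */
  static void *decode_tramp_target(void *addr)
  {
          struct __arch_relative_insn *call = addr;

          if (call->op != 0xe8)
                  return NULL; /* site not (yet) optimized */
          return (void *)(call + 1) + call->raddr;
  }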
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-15-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 284 +++++++++++++++++- .../bpf/progs/uprobe_syscall_executed.c | 52 ++++ 2 files changed, 335 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index 6d58a44da2b240..b91135abcf8abb 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,11 @@ #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 +#include "usdt.h" + +#pragma GCC diagnostic ignored "-Wattributes" + __naked unsigned long uretprobe_regs_trigger(void) { asm volatile ( @@ -305,6 +311,265 @@ static void test_uretprobe_syscall_call(void) close(go[0]); } +#define TRAMP "[uprobes-trampoline]" + +__attribute__((aligned(16))) +__nocf_check __weak __naked void uprobe_test(void) +{ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} + +__attribute__((aligned(16))) +__nocf_check __weak void usdt_test(void) +{ + USDT(optimized_uprobe, usdt); +} + +static int find_uprobes_trampoline(void *tramp_addr) +{ + void *start, *end; + char line[128]; + int ret = -1; + FILE *maps; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + fprintf(stderr, "cannot open maps\n"); + return -1; + } + + while (fgets(line, sizeof(line), maps)) { + int m = -1; + + /* We care only about private r-x mappings. */ + if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2) + continue; + if (m < 0) + continue; + if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) { + ret = 0; + break; + } + } + + fclose(maps); + return ret; +} + +static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; + +static void *find_nop5(void *fn) +{ + int i; + + for (i = 0; i < 10; i++) { + if (!memcmp(nop5, fn + i, 5)) + return fn + i; + } + return NULL; +} + +typedef void (__attribute__((nocf_check)) *trigger_t)(void); + +static bool shstk_is_enabled; + +static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, + void *addr, int executed) +{ + struct __arch_relative_insn { + __u8 op; + __s32 raddr; + } __packed *call; + void *tramp = NULL; + __u8 *bp; + + /* Uprobe gets optimized after first trigger, so let's press twice. */ + trigger(); + trigger(); + + /* Make sure bpf program got executed.. */ + ASSERT_EQ(skel->bss->executed, executed, "executed"); + + if (shstk_is_enabled) { + /* .. and check optimization is disabled under shadow stack. */ + bp = (__u8 *) addr; + ASSERT_EQ(*bp, 0xcc, "int3"); + } else { + /* .. and check the trampoline is as expected. 
*/ + call = (struct __arch_relative_insn *) addr; + tramp = (void *) (call + 1) + call->raddr; + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + } + + return tramp; +} + +static void check_detach(void *addr, void *tramp) +{ + /* [uprobes_trampoline] stays after detach */ + ASSERT_OK(!shstk_is_enabled && find_uprobes_trampoline(tramp), "uprobes_trampoline"); + ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); +} + +static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link, + trigger_t trigger, void *addr, int executed) +{ + void *tramp; + + tramp = check_attach(skel, trigger, addr, executed); + bpf_link__destroy(link); + check_detach(addr, tramp); +} + +static void test_uprobe_legacy(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + /* uprobe */ + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe */ + skel->bss->executed = 0; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_multi(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + /* uprobe.multi */ + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe.multi */ + skel->bss->executed = 0; + opts.retprobe = true; + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_session(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .session = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session, + 0, 
"/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 4); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_usdt(void) +{ + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + void *addr; + + errno = 0; + addr = find_nop5(usdt_test); + if (!ASSERT_OK_PTR(addr, "find_nop5")) + return; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_usdt(skel->progs.test_usdt, + -1 /* all PIDs */, "/proc/self/exe", + "optimized_uprobe", "usdt", NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt")) + goto cleanup; + + check(skel, link, usdt_test, addr, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + /* * Borrowed from tools/testing/selftests/x86/test_shadow_stack.c. * @@ -347,11 +612,20 @@ static void test_uretprobe_shadow_stack(void) return; } - /* Run all of the uretprobe tests. */ + /* Run all the tests with shadow stack in place. */ + shstk_is_enabled = true; + test_uretprobe_regs_equal(); test_uretprobe_regs_change(); test_uretprobe_syscall_call(); + test_uprobe_legacy(); + test_uprobe_multi(); + test_uprobe_session(); + test_uprobe_usdt(); + + shstk_is_enabled = false; + ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } @@ -365,6 +639,14 @@ static void __test_uprobe_syscall(void) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); + if (test__start_subtest("uprobe_legacy")) + test_uprobe_legacy(); + if (test__start_subtest("uprobe_multi")) + test_uprobe_multi(); + if (test__start_subtest("uprobe_session")) + test_uprobe_session(); + if (test__start_subtest("uprobe_usdt")) + test_uprobe_usdt(); } #else static void __test_uprobe_syscall(void) diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 8f48976a33aa59..915d38591bf6a4 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" #include +#include +#include #include struct pt_regs regs; @@ -10,6 +12,36 @@ char _license[] SEC("license") = "GPL"; int executed = 0; int pid; +SEC("uprobe") +int BPF_UPROBE(test_uprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(test_uretprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.multi") +int test_uprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + SEC("uretprobe.multi") int test_uretprobe_multi(struct pt_regs *ctx) { @@ -19,3 +51,23 @@ int test_uretprobe_multi(struct pt_regs *ctx) executed++; return 0; } + +SEC("uprobe.session") +int test_uprobe_session(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("usdt") +int test_usdt(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} From c8be59667cf17f281adc9a9387d7a0de60268fcd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:25 +0200 Subject: [PATCH 0383/3206] selftests/bpf: Add 
hit/attach/detach race optimized uprobe test Adding test that makes sure parallel execution of the uprobe and attach/detach of optimized uprobe on it works properly. By default the test runs for 500ms, which is adjustable by using BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC env variable. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-16-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index b91135abcf8abb..3d27c8bc019e6a 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -15,6 +15,7 @@ #include #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#include "bpf/libbpf_internal.h" #define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 #include "usdt.h" @@ -629,6 +630,111 @@ static void test_uretprobe_shadow_stack(void) ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } +static volatile bool race_stop; + +static USDT_DEFINE_SEMA(race); + +static void *worker_trigger(void *arg) +{ + unsigned long rounds = 0; + + while (!race_stop) { + uprobe_test(); + rounds++; + } + + printf("tid %d trigger rounds: %lu\n", gettid(), rounds); + return NULL; +} + +static void *worker_attach(void *arg) +{ + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct uprobe_syscall_executed *skel; + unsigned long rounds = 0, offset; + const char *sema[2] = { + __stringify(USDT_SEMA(race)), + NULL, + }; + unsigned long *ref; + int err; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return NULL; + + err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) + return NULL; + + opts.ref_ctr_offset = *ref; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return NULL; + + skel->bss->pid = getpid(); + + while (!race_stop) { + skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts")) + break; + + bpf_link__destroy(skel->links.test_uprobe); + skel->links.test_uprobe = NULL; + rounds++; + } + + printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed); + uprobe_syscall_executed__destroy(skel); + free(ref); + return NULL; +} + +static useconds_t race_msec(void) +{ + char *env; + + env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC"); + if (env) + return atoi(env); + + /* default duration is 500ms */ + return 500; +} + +static void test_uprobe_race(void) +{ + int err, i, nr_threads; + pthread_t *threads; + + nr_threads = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus")) + return; + nr_threads = max(2, nr_threads); + + threads = alloca(sizeof(*threads) * nr_threads); + if (!ASSERT_OK_PTR(threads, "malloc")) + return; + + for (i = 0; i < nr_threads; i++) { + err = pthread_create(&threads[i], NULL, i % 2 ? 
worker_trigger : worker_attach, + NULL); + if (!ASSERT_OK(err, "pthread_create")) + goto cleanup; + } + + usleep(race_msec() * 1000); + +cleanup: + race_stop = true; + for (nr_threads = i, i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); + + ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore"); +} + static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) @@ -647,6 +753,8 @@ static void __test_uprobe_syscall(void) test_uprobe_session(); if (test__start_subtest("uprobe_usdt")) test_uprobe_usdt(); + if (test__start_subtest("uprobe_race")) + test_uprobe_race(); } #else static void __test_uprobe_syscall(void) From c11661bd9adf6831a75bb79299de793039dd8b9b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:26 +0200 Subject: [PATCH 0384/3206] selftests/bpf: Add uprobe syscall sigill signal test Make sure that calling uprobe syscall from outside uprobe trampoline results in sigill signal. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-17-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index 3d27c8bc019e6a..02e98cba5cc698 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -735,6 +735,40 @@ static void test_uprobe_race(void) ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore"); } +#ifndef __NR_uprobe +#define __NR_uprobe 336 +#endif + +static void test_uprobe_sigill(void) +{ + int status, err, pid; + + pid = fork(); + if (!ASSERT_GE(pid, 0, "fork")) + return; + /* child */ + if (pid == 0) { + asm volatile ( + "pushq %rax\n" + "pushq %rcx\n" + "pushq %r11\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "popq %r11\n" + "popq %rcx\n" + "retq\n" + ); + exit(0); + } + + err = waitpid(pid, &status, 0); + ASSERT_EQ(err, pid, "waitpid"); + + /* verify the child got killed with SIGILL */ + ASSERT_EQ(WIFSIGNALED(status), 1, "WIFSIGNALED"); + ASSERT_EQ(WTERMSIG(status), SIGILL, "WTERMSIG"); +} + static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) @@ -755,6 +789,8 @@ static void __test_uprobe_syscall(void) test_uprobe_usdt(); if (test__start_subtest("uprobe_race")) test_uprobe_race(); + if (test__start_subtest("uprobe_sigill")) + test_uprobe_sigill(); } #else static void __test_uprobe_syscall(void) From 875e1705ad9962f2642d098d6bfaabfa6f9c7ace Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:27 +0200 Subject: [PATCH 0385/3206] selftests/bpf: Add optimized usdt variant for basic usdt test Adding optimized usdt variant for basic usdt test to check that usdt arguments are properly passed in optimized code path. 
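For context, a sketch of why the optimized variant needs special setup: the USDT probe instruction must be the 5-byte nop the kernel's uprobe optimization recognizes, which the selftests arrange by overriding USDT_NOP before including usdt.h (illustrative only; the argument x is just an example):

  /* must come before the include so USDT() emits nop5 */
  #define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 /* nopl 0x0(%rax,%rax,1) */
  #include "usdt.h"

  static void usdt_site(int x)
  {
          USDT(optimized_uprobe, usdt, x); /* 5-byte nop probe site */
  }

The TRIGGER() wrapper below fires each probe twice because the uprobe is only rewritten to its optimized form after the first hit.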
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-18-jolsa@kernel.org --- tools/testing/selftests/bpf/prog_tests/usdt.c | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 9057e983cc5405..833eb87483a1cd 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -40,12 +40,19 @@ static void __always_inline trigger_func(int x) { } } -static void subtest_basic_usdt(void) +static void subtest_basic_usdt(bool optimized) { LIBBPF_OPTS(bpf_usdt_opts, opts); struct test_usdt *skel; struct test_usdt__bss *bss; - int err, i; + int err, i, called; + +#define TRIGGER(x) ({ \ + trigger_func(x); \ + if (optimized) \ + trigger_func(x); \ + optimized ? 2 : 1; \ + }) skel = test_usdt__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) @@ -66,11 +73,11 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) goto cleanup; - trigger_func(1); + called = TRIGGER(1); - ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); @@ -119,11 +126,11 @@ static void subtest_basic_usdt(void) * bpf_program__attach_usdt() handles this properly and attaches to * all possible places of USDT invocation. 
*/ - trigger_func(2); + called += TRIGGER(2); - ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); /* only check values that depend on trigger_func()'s input value */ ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1"); @@ -142,9 +149,9 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach")) goto cleanup; - trigger_func(3); + called += TRIGGER(3); - ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); /* this time usdt3 has custom cookie */ ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie"); ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt"); @@ -158,6 +165,7 @@ static void subtest_basic_usdt(void) cleanup: test_usdt__destroy(skel); +#undef TRIGGER } unsigned short test_usdt_100_semaphore SEC(".probes"); @@ -425,7 +433,11 @@ static void subtest_urandom_usdt(bool auto_attach) void test_usdt(void) { if (test__start_subtest("basic")) - subtest_basic_usdt(); + subtest_basic_usdt(false); +#ifdef __x86_64__ + if (test__start_subtest("basic_optimized")) + subtest_basic_usdt(true); +#endif if (test__start_subtest("multispec")) subtest_multispec_usdt(); if (test__start_subtest("urand_auto_attach")) From 275eae6789864904a7319fbb4e993734a0fb4310 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:28 +0200 Subject: [PATCH 0386/3206] selftests/bpf: Add uprobe_regs_equal test Changing uretprobe_regs_trigger to allow the test for both uprobe and uretprobe and renaming it to uprobe_regs_equal. We check that both uprobe and uretprobe probes (bpf programs) see expected registers with few exceptions. 
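A compressed sketch of the comparison rule the reworked test implements (names are illustrative; the real table-driven loop is in test_uprobe_regs_equal() below):

  /* rax is the single asymmetric register: the entry uprobe fires
   * before "movq $0xdeadbeef, %rax", the uretprobe after it. */
  static bool reg_matches(bool retprobe, unsigned int off,
                          unsigned long prog, unsigned long before,
                          unsigned long after)
  {
          if (!retprobe && off == offsetof(struct pt_regs, rax))
                  return prog == before; /* entry probe sees the pre-call rax */
          return prog == after;          /* all other registers, and uretprobe rax */
  }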
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-19-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 56 ++++++++++++++----- .../selftests/bpf/progs/uprobe_syscall.c | 4 +- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index 02e98cba5cc698..36ce9e261b5c2f 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -22,15 +22,17 @@ #pragma GCC diagnostic ignored "-Wattributes" -__naked unsigned long uretprobe_regs_trigger(void) +__attribute__((aligned(16))) +__nocf_check __weak __naked unsigned long uprobe_regs_trigger(void) { asm volatile ( + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */ "movq $0xdeadbeef, %rax\n" "ret\n" ); } -__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) +__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after) { asm volatile ( "movq %r15, 0(%rdi)\n" @@ -51,15 +53,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) "movq $0, 120(%rdi)\n" /* orig_rax */ "movq $0, 128(%rdi)\n" /* rip */ "movq $0, 136(%rdi)\n" /* cs */ + "pushq %rax\n" "pushf\n" "pop %rax\n" "movq %rax, 144(%rdi)\n" /* eflags */ + "pop %rax\n" "movq %rsp, 152(%rdi)\n" /* rsp */ "movq $0, 160(%rdi)\n" /* ss */ /* save 2nd argument */ "pushq %rsi\n" - "call uretprobe_regs_trigger\n" + "call uprobe_regs_trigger\n" /* save return value and load 2nd argument pointer to rax */ "pushq %rax\n" @@ -99,25 +103,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) ); } -static void test_uretprobe_regs_equal(void) +static void test_uprobe_regs_equal(bool retprobe) { + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = retprobe, + ); struct uprobe_syscall *skel = NULL; struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; unsigned long *pa = (unsigned long *) &after; unsigned long *pp; + unsigned long offset; unsigned int i, cnt; - int err; + + offset = get_uprobe_offset(&uprobe_regs_trigger); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return; skel = uprobe_syscall__open_and_load(); if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load")) goto cleanup; - err = uprobe_syscall__attach(skel); - if (!ASSERT_OK(err, "uprobe_syscall__attach")) + skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts")) goto cleanup; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + if (!retprobe) + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); pp = (unsigned long *) &skel->bss->regs; cnt = sizeof(before)/sizeof(*pb); @@ -126,7 +142,7 @@ static void test_uretprobe_regs_equal(void) unsigned int offset = i * sizeof(unsigned long); /* - * Check register before and after uretprobe_regs_trigger call + * Check register before and after uprobe_regs_trigger call * that triggers the uretprobe. */ switch (offset) { @@ -140,7 +156,7 @@ static void test_uretprobe_regs_equal(void) /* * Check register seen from bpf program and register after - * uretprobe_regs_trigger call + * uprobe_regs_trigger call (with rax exception, check below). 
*/ switch (offset) { /* @@ -153,6 +169,15 @@ static void test_uretprobe_regs_equal(void) case offsetof(struct pt_regs, rsp): case offsetof(struct pt_regs, ss): break; + /* + * uprobe does not see return value in rax, it needs to see the + * original (before) rax value + */ + case offsetof(struct pt_regs, rax): + if (!retprobe) { + ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check"); + break; + } default: if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check")) fprintf(stdout, "failed register offset %u\n", offset); @@ -190,13 +215,13 @@ static void test_uretprobe_regs_change(void) unsigned long cnt = sizeof(before)/sizeof(*pb); unsigned int i, err, offset; - offset = get_uprobe_offset(uretprobe_regs_trigger); + offset = get_uprobe_offset(uprobe_regs_trigger); err = write_bpf_testmod_uprobe(offset); if (!ASSERT_OK(err, "register_uprobe")) return; - uretprobe_regs(&before, &after); + uprobe_regs(&before, &after); err = write_bpf_testmod_uprobe(0); if (!ASSERT_OK(err, "unregister_uprobe")) @@ -616,7 +641,8 @@ static void test_uretprobe_shadow_stack(void) /* Run all the tests with shadow stack in place. */ shstk_is_enabled = true; - test_uretprobe_regs_equal(); + test_uprobe_regs_equal(false); + test_uprobe_regs_equal(true); test_uretprobe_regs_change(); test_uretprobe_syscall_call(); @@ -772,7 +798,7 @@ static void test_uprobe_sigill(void) static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) - test_uretprobe_regs_equal(); + test_uprobe_regs_equal(true); if (test__start_subtest("uretprobe_regs_change")) test_uretprobe_regs_change(); if (test__start_subtest("uretprobe_syscall_call")) @@ -791,6 +817,8 @@ static void __test_uprobe_syscall(void) test_uprobe_race(); if (test__start_subtest("uprobe_sigill")) test_uprobe_sigill(); + if (test__start_subtest("uprobe_regs_equal")) + test_uprobe_regs_equal(false); } #else static void __test_uprobe_syscall(void) diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall.c b/tools/testing/selftests/bpf/progs/uprobe_syscall.c index 8a4fa6c7ef5900..e08c31669e5a76 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall.c @@ -7,8 +7,8 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; -SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger") -int uretprobe(struct pt_regs *ctx) +SEC("uprobe") +int probe(struct pt_regs *ctx) { __builtin_memcpy(®s, ctx, sizeof(regs)); return 0; From 3abf4298c6139cf10a41472d87b2f608666302b0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:29 +0200 Subject: [PATCH 0387/3206] selftests/bpf: Change test_uretprobe_regs_change for uprobe and uretprobe Changing the test_uretprobe_regs_change test to test both uprobe and uretprobe by adding entry consumer handler to the testmod and making it to change one of the registers. Making sure that changed values both uprobe and uretprobe handlers propagate to the user space. 
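The user-space side of the propagation check then reduces to something
like this (a sketch using the values written by the testmod handlers
below; exact pt_regs field names may differ):

	/* Sketch: after uprobe_regs(&before, &after) returns, the "after"
	 * snapshot should carry the values the kernel handlers wrote. */
	ASSERT_EQ(after.cx, 0x87654321feebdaed, "entry handler changed rcx");
	ASSERT_EQ(after.ax, 0x12345678deadbeef, "return handler changed rax");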
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-20-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 12 ++++++++---- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 11 +++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index 36ce9e261b5c2f..c1f945cacebc27 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -207,7 +207,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset) return ret != n ? (int) ret : 0; } -static void test_uretprobe_regs_change(void) +static void test_regs_change(void) { struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; @@ -221,6 +221,9 @@ static void test_uretprobe_regs_change(void) if (!ASSERT_OK(err, "register_uprobe")) return; + /* make sure uprobe gets optimized */ + uprobe_regs_trigger(); + uprobe_regs(&before, &after); err = write_bpf_testmod_uprobe(0); @@ -643,7 +646,6 @@ static void test_uretprobe_shadow_stack(void) test_uprobe_regs_equal(false); test_uprobe_regs_equal(true); - test_uretprobe_regs_change(); test_uretprobe_syscall_call(); test_uprobe_legacy(); @@ -651,6 +653,8 @@ static void test_uretprobe_shadow_stack(void) test_uprobe_session(); test_uprobe_usdt(); + test_regs_change(); + shstk_is_enabled = false; ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); @@ -799,8 +803,6 @@ static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) test_uprobe_regs_equal(true); - if (test__start_subtest("uretprobe_regs_change")) - test_uretprobe_regs_change(); if (test__start_subtest("uretprobe_syscall_call")) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) @@ -819,6 +821,8 @@ static void __test_uprobe_syscall(void) test_uprobe_sigill(); if (test__start_subtest("uprobe_regs_equal")) test_uprobe_regs_equal(false); + if (test__start_subtest("regs_change")) + test_regs_change(); } #else static void __test_uprobe_syscall(void) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index e9e918cdf31ff2..511911053bdc54 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -500,15 +500,21 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { */ #ifdef __x86_64__ +static int +uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data) +{ + regs->cx = 0x87654321feebdaed; + return 0; +} + static int uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs, __u64 *data) { regs->ax = 0x12345678deadbeef; - regs->cx = 0x87654321feebdaed; regs->r11 = (u64) -1; - return true; + return 0; } struct testmod_uprobe { @@ -520,6 +526,7 @@ struct testmod_uprobe { static DEFINE_MUTEX(testmod_uprobe_mutex); static struct testmod_uprobe uprobe = { + .consumer.handler = uprobe_handler, .consumer.ret_handler = uprobe_ret_handler, }; From 52718438af2ac8323aeea41b6f59da0962cb73b6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 21 Aug 2025 16:15:57 +0200 Subject: [PATCH 0388/3206] selftests/bpf: Fix uprobe syscall shadow stack test Now that we have uprobe syscall working properly with shadow stack, we can remove testing limitations for 
shadow stack tests and make sure uprobe gets properly optimized. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821141557.13233-1-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index c1f945cacebc27..5da0b49eeacaed 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -403,8 +403,6 @@ static void *find_nop5(void *fn) typedef void (__attribute__((nocf_check)) *trigger_t)(void); -static bool shstk_is_enabled; - static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, void *addr, int executed) { @@ -413,7 +411,6 @@ static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigge __s32 raddr; } __packed *call; void *tramp = NULL; - __u8 *bp; /* Uprobe gets optimized after first trigger, so let's press twice. */ trigger(); @@ -422,17 +419,11 @@ static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigge /* Make sure bpf program got executed.. */ ASSERT_EQ(skel->bss->executed, executed, "executed"); - if (shstk_is_enabled) { - /* .. and check optimization is disabled under shadow stack. */ - bp = (__u8 *) addr; - ASSERT_EQ(*bp, 0xcc, "int3"); - } else { - /* .. and check the trampoline is as expected. */ - call = (struct __arch_relative_insn *) addr; - tramp = (void *) (call + 1) + call->raddr; - ASSERT_EQ(call->op, 0xe8, "call"); - ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); - } + /* .. and check the trampoline is as expected. */ + call = (struct __arch_relative_insn *) addr; + tramp = (void *) (call + 1) + call->raddr; + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); return tramp; } @@ -440,7 +431,7 @@ static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigge static void check_detach(void *addr, void *tramp) { /* [uprobes_trampoline] stays after detach */ - ASSERT_OK(!shstk_is_enabled && find_uprobes_trampoline(tramp), "uprobes_trampoline"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); } @@ -642,7 +633,6 @@ static void test_uretprobe_shadow_stack(void) } /* Run all the tests with shadow stack in place. */ - shstk_is_enabled = true; test_uprobe_regs_equal(false); test_uprobe_regs_equal(true); @@ -655,8 +645,6 @@ static void test_uretprobe_shadow_stack(void) test_regs_change(); - shstk_is_enabled = false; - ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } From 89d1d8434d246c96309a6068dfcf9e36dc61227b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:30 +0200 Subject: [PATCH 0389/3206] seccomp: passthrough uprobe systemcall without filtering Adding uprobe as another exception to the seccomp filter alongside with the uretprobe syscall. Same as the uretprobe the uprobe syscall is installed by kernel as replacement for the breakpoint exception and is limited to x86_64 arch and isn't expected to ever be supported in i386. 
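In practice this means, for example, that a strict-mode task with an
optimized uprobe keeps running (a minimal sketch; error handling
omitted):

	/* Sketch: strict mode normally allows only read/write/exit/
	 * sigreturn; with this change the uprobe syscall emitted by an
	 * optimized probe on probed() is whitelisted too, so the task
	 * is not killed when the probe fires. */
	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
	probed();	/* nop5 rewritten to "call" -> uprobe syscall */
	_exit(0);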
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250720112133.244369-21-jolsa@kernel.org --- kernel/seccomp.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 41aa761c7738ce..7daf2da09e8e16 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -741,6 +741,26 @@ seccomp_prepare_user_filter(const char __user *user_filter) } #ifdef SECCOMP_ARCH_NATIVE +static bool seccomp_uprobe_exception(struct seccomp_data *sd) +{ +#if defined __NR_uretprobe || defined __NR_uprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + { +#ifdef __NR_uretprobe + if (sd->nr == __NR_uretprobe) + return true; +#endif +#ifdef __NR_uprobe + if (sd->nr == __NR_uprobe) + return true; +#endif + } +#endif + return false; +} + /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF programs @@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, return false; /* Our single exception to filtering. */ -#ifdef __NR_uretprobe -#ifdef SECCOMP_ARCH_COMPAT - if (sd->arch == SECCOMP_ARCH_NATIVE) -#endif - if (sd->nr == __NR_uretprobe) - return true; -#endif + if (seccomp_uprobe_exception(sd)) + return true; for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; @@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, #ifdef __NR_uretprobe __NR_uretprobe, +#endif +#ifdef __NR_uprobe + __NR_uprobe, #endif -1, /* negative terminated */ }; From 9ffc7a635c35ad61349a36e9f52d46df9ba67dc3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:31 +0200 Subject: [PATCH 0390/3206] selftests/seccomp: validate uprobe syscall passes through seccomp Adding uprobe checks into the current uretprobe tests. All the related tests are now executed with attached uprobe or uretprobe or without any probe. Renaming the test fixture to uprobe, because it seems better. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250720112133.244369-22-jolsa@kernel.org --- tools/testing/selftests/seccomp/seccomp_bpf.c | 107 ++++++++++++++---- 1 file changed, 86 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 61acbd45ffaaf8..2cf6fc825d8650 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -73,6 +73,14 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + +#ifndef __naked +#define __naked __attribute__((__naked__)) +#endif + #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 @@ -4896,7 +4904,36 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } -noinline int probed(void) +#ifdef __x86_64__ + +/* + * We need naked probed_uprobe function. Using __nocf_check + * check to skip possible endbr64 instruction and ignoring + * -Wattributes, otherwise the compilation might fail. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" + +__naked __nocf_check noinline int probed_uprobe(void) +{ + /* + * Optimized uprobe is possible only on top of nop5 instruction. 
+ */ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} +#pragma GCC diagnostic pop + +#else +noinline int probed_uprobe(void) +{ + return 1; +} +#endif + +noinline int probed_uretprobe(void) { return 1; } @@ -4949,35 +4986,46 @@ static ssize_t get_uprobe_offset(const void *addr) return found ? (uintptr_t)addr - start + base : -1; } -FIXTURE(URETPROBE) { +FIXTURE(UPROBE) { int fd; }; -FIXTURE_VARIANT(URETPROBE) { +FIXTURE_VARIANT(UPROBE) { /* - * All of the URETPROBE behaviors can be tested with either - * uretprobe attached or not + * All of the U(RET)PROBE behaviors can be tested with either + * u(ret)probe attached or not */ bool attach; + /* + * Test both uprobe and uretprobe. + */ + bool uretprobe; }; -FIXTURE_VARIANT_ADD(URETPROBE, attached) { +FIXTURE_VARIANT_ADD(UPROBE, not_attached) { + .attach = false, + .uretprobe = false, +}; + +FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) { .attach = true, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { - .attach = false, +FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) { + .attach = true, + .uretprobe = true, }; -FIXTURE_SETUP(URETPROBE) +FIXTURE_SETUP(UPROBE) { const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; ssize_t offset; int type, bit; -#ifndef __NR_uretprobe - SKIP(return, "__NR_uretprobe syscall not defined"); +#if !defined(__NR_uprobe) || !defined(__NR_uretprobe) + SKIP(return, "__NR_uprobe ot __NR_uretprobe syscalls not defined"); #endif if (!variant->attach) @@ -4987,12 +5035,17 @@ FIXTURE_SETUP(URETPROBE) type = determine_uprobe_perf_type(); ASSERT_GE(type, 0); - bit = determine_uprobe_retprobe_bit(); - ASSERT_GE(bit, 0); - offset = get_uprobe_offset(probed); + + if (variant->uretprobe) { + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + } + + offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe); ASSERT_GE(offset, 0); - attr.config |= 1 << bit; + if (variant->uretprobe) + attr.config |= 1 << bit; attr.size = attr_sz; attr.type = type; attr.config1 = ptr_to_u64("/proc/self/exe"); @@ -5003,7 +5056,7 @@ FIXTURE_SETUP(URETPROBE) PERF_FLAG_FD_CLOEXEC); } -FIXTURE_TEARDOWN(URETPROBE) +FIXTURE_TEARDOWN(UPROBE) { /* we could call close(self->fd), but we'd need extra filter for * that and since we are calling _exit right away.. @@ -5017,11 +5070,17 @@ static int run_probed_with_filter(struct sock_fprog *prog) return -1; } - probed(); + /* + * Uprobe is optimized after first hit, so let's hit twice. 
+ */ + probed_uprobe(); + probed_uprobe(); + + probed_uretprobe(); return 0; } -TEST_F(URETPROBE, uretprobe_default_allow) +TEST_F(UPROBE, uprobe_default_allow) { struct sock_filter filter[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), @@ -5034,7 +5093,7 @@ TEST_F(URETPROBE, uretprobe_default_allow) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block) +TEST_F(UPROBE, uprobe_default_block) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, @@ -5051,11 +5110,14 @@ TEST_F(URETPROBE, uretprobe_default_block) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) +TEST_F(UPROBE, uprobe_block_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), #endif @@ -5070,11 +5132,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) +TEST_F(UPROBE, uprobe_default_block_with_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), #endif From e173287b5d2119971fc329473a020171836d14c9 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 5 Aug 2025 10:50:00 +0800 Subject: [PATCH 0391/3206] uprobes: Remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes. Signed-off-by: Qianfeng Rong Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250805025000.346647-1-rongqianfeng@vivo.com --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4a194d7c838b52..996a81080d5630 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1220,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + GFP_NOWAIT | __GFP_NOMEMALLOC); if (prev) prev->next = NULL; } From d9cf9c6884d21e01483c4e17479d27636ea4bb50 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:26 +0800 Subject: [PATCH 0392/3206] perf/x86/intel: Use early_initcall() to hook bts_init() After the commit 'd971342d38bf ("perf/x86/intel: Decouple BTS initialization from PEBS initialization")' is introduced, x86_pmu.bts would initialized in bts_init() which is hooked by arch_initcall(). Whereas init_hw_perf_events() is hooked by early_initcall(). Once the core PMU is initialized, nmi watchdog initialization is called immediately before bts_init() is called. It leads to the BTS buffer is not really initialized since bts_init() is not called and x86_pmu.bts is still false at that time. Worse, BTS buffer would never be initialized then unless all core PMU events are freed and reserve_ds_buffers() is called again. 
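For reference, the ordering involved (a sketch; early initcalls run
before all numbered initcall levels, arch_initcall being level 3):

	early_initcall(init_hw_perf_events);	/* core PMU comes up first */
	/* ... NMI watchdog is set up right after the core PMU ... */
	arch_initcall(bts_init);		/* level 3: x86_pmu.bts was
						 * still false when the watchdog
						 * ran reserve_ds_buffers() */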
Thus, aligning with init_hw_perf_events(), use early_initcall() to hook
bts_init() and ensure x86_pmu.bts is initialized before the NMI watchdog
initialization.

Fixes: d971342d38bf ("perf/x86/intel: Decouple BTS initialization from PEBS initialization")
Signed-off-by: Dapeng Mi
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Kan Liang
Link: https://lore.kernel.org/r/20250820023032.17128-2-dapeng1.mi@linux.intel.com
---
 arch/x86/events/intel/bts.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 61da6b8a3d519f..cbac54cb3a9ec5 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -643,4 +643,4 @@ static __init int bts_init(void)
 	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
 }

-arch_initcall(bts_init);
+early_initcall(bts_init);

From 43796f30507802d93ead2dc44fc9637f34671a89 Mon Sep 17 00:00:00 2001
From: Dapeng Mi
Date: Wed, 20 Aug 2025 10:30:27 +0800
Subject: [PATCH 0393/3206] perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error

When running perf_fuzzer on PTL, sometimes the below "unchecked MSR
access error" is seen when accessing IA32_PMC_x_CFG_B MSRs.

[ 55.611268] unchecked MSR access error: WRMSR to 0x1986 (tried to write 0x0000000200000001) at rIP: 0xffffffffac564b28 (native_write_msr+0x8/0x30)
[ 55.611280] Call Trace:
[ 55.611282]
[ 55.611284] ? intel_pmu_config_acr+0x87/0x160
[ 55.611289] intel_pmu_enable_acr+0x6d/0x80
[ 55.611291] intel_pmu_enable_event+0xce/0x460
[ 55.611293] x86_pmu_start+0x78/0xb0
[ 55.611297] x86_pmu_enable+0x218/0x3a0
[ 55.611300] ? x86_pmu_enable+0x121/0x3a0
[ 55.611302] perf_pmu_enable+0x40/0x50
[ 55.611307] ctx_resched+0x19d/0x220
[ 55.611309] __perf_install_in_context+0x284/0x2f0
[ 55.611311] ? __pfx_remote_function+0x10/0x10
[ 55.611314] remote_function+0x52/0x70
[ 55.611317] ? __pfx_remote_function+0x10/0x10
[ 55.611319] generic_exec_single+0x84/0x150
[ 55.611323] smp_call_function_single+0xc5/0x1a0
[ 55.611326] ? __pfx_remote_function+0x10/0x10
[ 55.611329] perf_install_in_context+0xd1/0x1e0
[ 55.611331] ? __pfx___perf_install_in_context+0x10/0x10
[ 55.611333] __do_sys_perf_event_open+0xa76/0x1040
[ 55.611336] __x64_sys_perf_event_open+0x26/0x30
[ 55.611337] x64_sys_call+0x1d8e/0x20c0
[ 55.611339] do_syscall_64+0x4f/0x120
[ 55.611343] entry_SYSCALL_64_after_hwframe+0x76/0x7e

On PTL, GP counters 0 and 1 don't support the auto counter reload
feature, so writing 1 to bit 0 of the CFG_B MSR (which would enable
auto counter reload on GP counter 0) triggers a #GP.

The root cause of this issue is that the check of the auto counter
reload (ACR) counter mask from user space is incorrect in the
intel_pmu_acr_late_setup() helper. It allows an invalid ACR counter
mask from user space to be set into hw.config1 and then written into
the CFG_B MSRs, triggering the MSR access warning.

e.g., a user may create a perf event with ACR counter mask
(config2=0xcb) while only 1 event is created, so "cpuc->n_events" is 1.

The correct check condition should be "i + idx >= cpuc->n_events"
instead of "i + idx > cpuc->n_events" (it looks like a typo).
Otherwise, the counter mask walk goes one entry past the scheduled
events and an invalid "cpuc->assign[1]" bit (bit 0) is set into
hw.config1, causing the MSR access error.

Besides, also check whether the events corresponding to the ACR counter
mask are ACR events; if not, filter out those counter mask bits. If an
event is not an ACR event, it could be scheduled onto an HW counter
which doesn't support ACR.
It's invalid to add such an event's counter index to the ACR counter
mask.

Furthermore, remove the WARN_ON_ONCE(), since it's easily triggered (a
user can set any invalid ACR counter mask) and the warning message
could mislead users.

Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload")
Signed-off-by: Dapeng Mi
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Kan Liang
Link: https://lore.kernel.org/r/20250820023032.17128-3-dapeng1.mi@linux.intel.com
---
 arch/x86/events/intel/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c2fb729c270ec4..15da60cf69f20c 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2997,7 +2997,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc)
 			if (event->group_leader != leader->group_leader)
 				break;
 			for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) {
-				if (WARN_ON_ONCE(i + idx > cpuc->n_events))
+				if (i + idx >= cpuc->n_events ||
+				    !is_acr_event_group(cpuc->event_list[i + idx]))
 					return;
 				__set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1);
 			}

From 0c5caea762de31a85cbcce65d978cec83449f699 Mon Sep 17 00:00:00 2001
From: Dapeng Mi
Date: Wed, 20 Aug 2025 10:30:29 +0800
Subject: [PATCH 0394/3206] perf/x86: Add PERF_CAP_PEBS_TIMING_INFO flag

IA32_PERF_CAPABILITIES.PEBS_TIMING_INFO[bit 17] is introduced to
indicate whether timed PEBS is supported. Timed PEBS adds a new
"retired latency" field in the basic info group to show the timing
info.

Please find detailed information about timed PEBS in section 8.4.1
"Timed Processor Event Based Sampling" of "Intel Architecture
Instruction Set Extensions and Future Features".

Add the PERF_CAP_PEBS_TIMING_INFO flag; the KVM module leverages this
flag to expose the timed PEBS feature to guests.

Moreover, opportunistically refine the indentation and make the macros
share consistent indents.
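For illustration, a consumer of the new flag might look like this (a
minimal sketch, not part of this patch):

	static bool pebs_timing_info_supported(void)
	{
		u64 caps;

		rdmsrq(MSR_IA32_PERF_CAPABILITIES, caps);
		return caps & PERF_CAP_PEBS_TIMING_INFO;	/* bit 17 */
	}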
Signed-off-by: Dapeng Mi
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Kan Liang
Tested-by: Yi Lai
Link: https://lore.kernel.org/r/20250820023032.17128-5-dapeng1.mi@linux.intel.com
---
 arch/x86/include/asm/msr-index.h       | 14 ++++++++------
 tools/arch/x86/include/asm/msr-index.h | 14 ++++++++------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b65c3ba5fa1410..f627196eb79662 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -315,12 +315,14 @@
 #define PERF_CAP_PT_IDX			16

 #define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
-#define PERF_CAP_PEBS_TRAP	BIT_ULL(6)
-#define PERF_CAP_ARCH_REG	BIT_ULL(7)
-#define PERF_CAP_PEBS_FORMAT	0xf00
-#define PERF_CAP_PEBS_BASELINE	BIT_ULL(14)
-#define PERF_CAP_PEBS_MASK	(PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
-				 PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
+#define PERF_CAP_PEBS_TRAP		BIT_ULL(6)
+#define PERF_CAP_ARCH_REG		BIT_ULL(7)
+#define PERF_CAP_PEBS_FORMAT		0xf00
+#define PERF_CAP_PEBS_BASELINE		BIT_ULL(14)
+#define PERF_CAP_PEBS_TIMING_INFO	BIT_ULL(17)
+#define PERF_CAP_PEBS_MASK	(PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
+				 PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
+				 PERF_CAP_PEBS_TIMING_INFO)

 #define MSR_IA32_RTIT_CTL		0x00000570
 #define RTIT_CTL_TRACEEN		BIT(0)
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 5cfb5d74dd5f58..daebfd926f08e7 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -315,12 +315,14 @@
 #define PERF_CAP_PT_IDX			16

 #define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
-#define PERF_CAP_PEBS_TRAP	BIT_ULL(6)
-#define PERF_CAP_ARCH_REG	BIT_ULL(7)
-#define PERF_CAP_PEBS_FORMAT	0xf00
-#define PERF_CAP_PEBS_BASELINE	BIT_ULL(14)
-#define PERF_CAP_PEBS_MASK	(PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
-				 PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
+#define PERF_CAP_PEBS_TRAP		BIT_ULL(6)
+#define PERF_CAP_ARCH_REG		BIT_ULL(7)
+#define PERF_CAP_PEBS_FORMAT		0xf00
+#define PERF_CAP_PEBS_BASELINE		BIT_ULL(14)
+#define PERF_CAP_PEBS_TIMING_INFO	BIT_ULL(17)
+#define PERF_CAP_PEBS_MASK	(PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
+				 PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
+				 PERF_CAP_PEBS_TIMING_INFO)

 #define MSR_IA32_RTIT_CTL		0x00000570
 #define RTIT_CTL_TRACEEN		BIT(0)

From 9b3e119784bc3671fde5043001a5c9a607c7d920 Mon Sep 17 00:00:00 2001
From: Dapeng Mi
Date: Wed, 20 Aug 2025 10:30:30 +0800
Subject: [PATCH 0395/3206] perf/x86/intel: Change macro GLOBAL_CTRL_EN_PERF_METRICS to BIT_ULL(48)

The macro GLOBAL_CTRL_EN_PERF_METRICS is defined as 48 instead of
BIT_ULL(48), which is inconsistent with other similar macros. As a
result the macro is easily misused, since users assume it is a bit
mask just like the others.

Thus, change GLOBAL_CTRL_EN_PERF_METRICS to BIT_ULL(48) and eliminate
this potential misuse.
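The failure mode the old definition invites, for illustration
(hypothetical caller):

	/* Old definition: GLOBAL_CTRL_EN_PERF_METRICS == 48, so the
	 * natural-looking test silently checks bits 4 and 5, not bit 48: */
	if (intel_ctrl & GLOBAL_CTRL_EN_PERF_METRICS)	/* wrong before */
		enable_perf_metrics();
	/* New definition: BIT_ULL(48), so the same expression is correct
	 * and the "1ULL << ..." shift at every call site can go away. */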
Signed-off-by: Dapeng Mi
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Kan Liang
Tested-by: Yi Lai
Link: https://lore.kernel.org/r/20250820023032.17128-6-dapeng1.mi@linux.intel.com
---
 arch/x86/events/intel/core.c      | 8 ++++----
 arch/x86/include/asm/perf_event.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 15da60cf69f20c..f88a99d8d125cf 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5319,9 +5319,9 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
 			     0, x86_pmu_num_counters(&pmu->pmu), 0, 0);

 	if (pmu->intel_cap.perf_metrics)
-		pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+		pmu->intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
 	else
-		pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
+		pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;

 	intel_pmu_check_event_constraints(pmu->event_constraints,
 					  pmu->cntr_mask64,
@@ -5456,7 +5456,7 @@ static void intel_pmu_cpu_starting(int cpu)
 		rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
 		if (!perf_cap.perf_metrics) {
 			x86_pmu.intel_cap.perf_metrics = 0;
-			x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
+			x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
 		}
 	}

@@ -7790,7 +7790,7 @@ __init int intel_pmu_init(void)
 	}

 	if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
-		x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+		x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;

 	if (x86_pmu.intel_cap.pebs_timing_info)
 		x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 70d1d94aca7e63..f8247ac276c411 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -430,7 +430,7 @@ static inline bool is_topdown_idx(int idx)
 #define GLOBAL_STATUS_TRACE_TOPAPMI	BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
 #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT	48

-#define GLOBAL_CTRL_EN_PERF_METRICS	48
+#define GLOBAL_CTRL_EN_PERF_METRICS	BIT_ULL(48)
 /*
  * We model guest LBR event tracing as another fixed-mode PMC like BTS.
  *

From 2676dbf9f4fb7f6739d1207c0f1deaf63124642a Mon Sep 17 00:00:00 2001
From: Dapeng Mi
Date: Wed, 20 Aug 2025 10:30:31 +0800
Subject: [PATCH 0396/3206] perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK

ICL_FIXED_0_ADAPTIVE was missed when INTEL_FIXED_BITS_MASK was
assembled; add it.

With the help of this new INTEL_FIXED_BITS_MASK,
intel_pmu_enable_fixed() can be simplified: the old fixed counter
control bits can be unconditionally cleared with INTEL_FIXED_BITS_MASK,
and the new control bits then set based on the new configuration.
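For reference, the per-counter field layout that makes this work (from
the definitions in this patch):

	/* Each fixed counter owns a 4-bit field in IA32_FIXED_CTR_CTRL:
	 *   bit 0: count kernel    bit 1: count user
	 *   bit 2: any-thread      bit 3: enable PMI
	 * The ICL adaptive-PEBS enable sits outside that nibble
	 * (bit 32 for counter 0), which is why the old 0xF mask had to
	 * be patched up separately before this change. */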
Signed-off-by: Dapeng Mi
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Kan Liang
Tested-by: Yi Lai
Link: https://lore.kernel.org/r/20250820023032.17128-7-dapeng1.mi@linux.intel.com
---
 arch/x86/events/intel/core.c      | 10 +++-------
 arch/x86/include/asm/perf_event.h |  6 +++++-
 arch/x86/kvm/pmu.h                |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f88a99d8d125cf..28f5468a6ea36b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	u64 mask, bits = 0;
 	int idx = hwc->idx;
+	u64 bits = 0;

 	if (is_topdown_idx(idx)) {
 		struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event)

 	idx -= INTEL_PMC_IDX_FIXED;
 	bits = intel_fixed_bits_by_idx(idx, bits);
-	mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
-
-	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
+	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip)
 		bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
-		mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
-	}

-	cpuc->fixed_ctrl_val &= ~mask;
+	cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
 	cpuc->fixed_ctrl_val |= bits;
 }

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f8247ac276c411..49a4d442f3fc21 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -35,7 +35,6 @@
 #define ARCH_PERFMON_EVENTSEL_EQ	(1ULL << 36)
 #define ARCH_PERFMON_EVENTSEL_UMASK2	(0xFFULL << 40)

-#define INTEL_FIXED_BITS_MASK		0xFULL
 #define INTEL_FIXED_BITS_STRIDE		4
 #define INTEL_FIXED_0_KERNEL		(1ULL << 0)
 #define INTEL_FIXED_0_USER		(1ULL << 1)
@@ -48,6 +47,11 @@
 #define ICL_EVENTSEL_ADAPTIVE		(1ULL << 34)
 #define ICL_FIXED_0_ADAPTIVE		(1ULL << 32)

+#define INTEL_FIXED_BITS_MASK					\
+	(INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER |		\
+	 INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI |	\
+	 ICL_FIXED_0_ADAPTIVE)
+
 #define intel_fixed_bits_by_idx(_idx, _bits)			\
 	((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))

diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index ad89d0bd600581..103604c4b33b58 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -13,7 +13,7 @@
 #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |	\
 					  MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)

-/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
+/* retrieve a fixed counter's bits out of IA32_FIXED_CTR_CTRL */
 #define fixed_ctrl_field(ctrl_reg, idx) \
 	(((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)

From f49e1be19542487921e82b29004908966cb99d7c Mon Sep 17 00:00:00 2001
From: Dapeng Mi
Date: Wed, 20 Aug 2025 10:30:32 +0800
Subject: [PATCH 0397/3206] perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap()

Along with the introduction of Perfmon v6, PMU counters can be
discontinuous. For example, for the fixed counters on CWF only fixed
counters 0-3 and 5-7 are supported; there is no fixed counter 4. To
accommodate this change, archPerfmonExt CPUID (0x23) leaves are
introduced to enumerate the true view of the counter bitmaps.
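Roughly how those leaves are consumed (a sketch; the exact sub-leaf
layout is an assumption based on the archPerfmonExt enumeration
described above):

	unsigned int eax, ebx, ecx, edx;

	/* CPUID leaf 0x23: counter *bitmaps* instead of plain counts. */
	cpuid_count(0x23, 1, &eax, &ebx, &ecx, &edx);
	cntr_mask64       = eax;	/* general-purpose counters */
	fixed_cntr_mask64 = ebx;	/* e.g. 0xef on CWF: counters 0-3
					 * and 5-7, bit 4 absent */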
The current perf code already supports the archPerfmonExt CPUID leaves
and uses the counter bitmaps to enumerate the counters the HW really
supports, but x86_pmu_show_pmu_cap() still dumps only the absolute
counter numbers instead of the true-view bitmaps; that is outdated and
may mislead readers.

So dump the counters' true-view bitmaps in x86_pmu_show_pmu_cap(), and
opportunistically change the dump sequence and wording.

Signed-off-by: Dapeng Mi
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Kan Liang
Link: https://lore.kernel.org/r/20250820023032.17128-8-dapeng1.mi@linux.intel.com
---
 arch/x86/events/core.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7610f26dfbd90c..745caa6c15a32d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2069,13 +2069,15 @@ static void _x86_pmu_read(struct perf_event *event)

 void x86_pmu_show_pmu_cap(struct pmu *pmu)
 {
-	pr_info("... version:             %d\n", x86_pmu.version);
-	pr_info("... bit width:           %d\n", x86_pmu.cntval_bits);
-	pr_info("... generic registers:   %d\n", x86_pmu_num_counters(pmu));
-	pr_info("... value mask:          %016Lx\n", x86_pmu.cntval_mask);
-	pr_info("... max period:          %016Lx\n", x86_pmu.max_period);
-	pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu));
-	pr_info("... event mask:          %016Lx\n", hybrid(pmu, intel_ctrl));
+	pr_info("... version:                 %d\n", x86_pmu.version);
+	pr_info("... bit width:               %d\n", x86_pmu.cntval_bits);
+	pr_info("... generic counters:        %d\n", x86_pmu_num_counters(pmu));
+	pr_info("... generic bitmap:          %016llx\n", hybrid(pmu, cntr_mask64));
+	pr_info("... fixed-purpose counters:  %d\n", x86_pmu_num_counters_fixed(pmu));
+	pr_info("... fixed-purpose bitmap:    %016llx\n", hybrid(pmu, fixed_cntr_mask64));
+	pr_info("... value mask:              %016llx\n", x86_pmu.cntval_mask);
+	pr_info("... max period:              %016llx\n", x86_pmu.max_period);
+	pr_info("... global_ctrl mask:        %016llx\n", hybrid(pmu, intel_ctrl));
 }

 static int __init init_hw_perf_events(void)

From 21aeabb68258ce17b91af113a768760b3a491d93 Mon Sep 17 00:00:00 2001
From: Hengqi Chen
Date: Thu, 21 Aug 2025 03:02:54 +0000
Subject: [PATCH 0398/3206] selftests/bpf: Use vmlinux.h for BPF programs

Some of the bpf test progs still use linux/libc headers. Let's use
vmlinux.h instead, like the rest of the test progs. This will also
ease cross-compiling.
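The resulting pattern, for a prog that previously pulled in linux/ and
libc headers (a representative sketch, not one of the converted files
verbatim):

	// SPDX-License-Identifier: GPL-2.0
	#include "vmlinux.h"			/* kernel types from BTF */
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_core_read.h>		/* BPF_CORE_READ() */

	SEC("kprobe/virtqueue_add_sgs")
	int probe(void *ctx)
	{
		return 0;
	}

	char _license[] SEC("license") = "GPL";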
Signed-off-by: Hengqi Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250821030254.398826-1-hengqi.chen@gmail.com --- tools/testing/selftests/bpf/progs/loop1.c | 7 +------ tools/testing/selftests/bpf/progs/loop2.c | 7 +------ tools/testing/selftests/bpf/progs/loop3.c | 7 +------ tools/testing/selftests/bpf/progs/loop6.c | 21 +++++++------------ .../selftests/bpf/progs/test_overhead.c | 5 +---- 5 files changed, 11 insertions(+), 36 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/loop1.c b/tools/testing/selftests/bpf/progs/loop1.c index 50e66772c0467c..b0fa26fb476080 100644 --- a/tools/testing/selftests/bpf/progs/loop1.c +++ b/tools/testing/selftests/bpf/progs/loop1.c @@ -1,11 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include -#include -#include -#include -#include -#include +#include "vmlinux.h" #include #include diff --git a/tools/testing/selftests/bpf/progs/loop2.c b/tools/testing/selftests/bpf/progs/loop2.c index 947bb7e988c21d..0227409d4b0e0c 100644 --- a/tools/testing/selftests/bpf/progs/loop2.c +++ b/tools/testing/selftests/bpf/progs/loop2.c @@ -1,11 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include -#include -#include -#include -#include -#include +#include "vmlinux.h" #include #include diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c index 717dab14322be5..5d1c9a775e6bbb 100644 --- a/tools/testing/selftests/bpf/progs/loop3.c +++ b/tools/testing/selftests/bpf/progs/loop3.c @@ -1,11 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include -#include -#include -#include -#include -#include +#include "vmlinux.h" #include #include diff --git a/tools/testing/selftests/bpf/progs/loop6.c b/tools/testing/selftests/bpf/progs/loop6.c index e4ff97fbcce184..dd36aff4fba360 100644 --- a/tools/testing/selftests/bpf/progs/loop6.c +++ b/tools/testing/selftests/bpf/progs/loop6.c @@ -1,8 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include +#include +#include #include #include #include "bpf_misc.h" @@ -26,12 +25,6 @@ char _license[] SEC("license") = "GPL"; #define SG_CHAIN 0x01UL #define SG_END 0x02UL -struct scatterlist { - unsigned long page_link; - unsigned int offset; - unsigned int length; -}; - #define sg_is_chain(sg) ((sg)->page_link & SG_CHAIN) #define sg_is_last(sg) ((sg)->page_link & SG_END) #define sg_chain_ptr(sg) \ @@ -62,7 +55,7 @@ static inline struct scatterlist *get_sgp(struct scatterlist **sgs, int i) return sgp; } -int config = 0; +int run_once = 0; int result = 0; SEC("kprobe/virtqueue_add_sgs") @@ -73,14 +66,14 @@ int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, __u64 length1 = 0, length2 = 0; unsigned int i, n, len; - if (config != 0) + if (run_once != 0) return 0; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < out_sgs); i++) { __sink(out_sgs); for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); sgp = __sg_next(sgp)) { - bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + len = BPF_CORE_READ(sgp, length); length1 += len; n++; } @@ -90,13 +83,13 @@ int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, __sink(in_sgs); for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); sgp = __sg_next(sgp)) { - bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + len = BPF_CORE_READ(sgp, length); length2 += len; n++; } } - config = 1; + run_once = 1; result = length2 - length1; return 0; } diff --git 
a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c index abb7344b531f45..5edf3cdc213d04 100644 --- a/tools/testing/selftests/bpf/progs/test_overhead.c +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ -#include -#include -#include -#include +#include "vmlinux.h" #include #include From 349a64256534aa2c73787b22f7bc0517a211cdab Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 21 Aug 2025 09:20:55 +0900 Subject: [PATCH 0399/3206] rust: Add read_poll_timeout function Add read_poll_timeout function which polls periodically until a condition is met, an error occurs, or the timeout is reached. The C's read_poll_timeout (include/linux/iopoll.h) is a complicated macro and a simple wrapper for Rust doesn't work. So this implements the same functionality in Rust. The C version uses usleep_range() while the Rust version uses fsleep(), which uses the best sleep method so it works with spans that usleep_range() doesn't work nicely with. The sleep_before_read argument isn't supported since there is no user for now. It's rarely used in the C version. Reviewed-by: Andreas Hindborg Reviewed-by: Fiona Behrens Reviewed-by: Alexandre Courbot Tested-by: Alexandre Courbot Reviewed-by: Daniel Almeida Tested-by: Daniel Almeida Reviewed-by: Alice Ryhl Signed-off-by: FUJITA Tomonori Link: https://lore.kernel.org/r/20250821002055.3654160-3-fujita.tomonori@gmail.com [ Fix a minor typo and add missing backticks. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/io.rs | 1 + rust/kernel/io/poll.rs | 104 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 rust/kernel/io/poll.rs diff --git a/rust/kernel/io.rs b/rust/kernel/io.rs index 03b467722b8651..ee182b0b5452df 100644 --- a/rust/kernel/io.rs +++ b/rust/kernel/io.rs @@ -8,6 +8,7 @@ use crate::error::{code::EINVAL, Result}; use crate::{bindings, build_assert, ffi::c_void}; pub mod mem; +pub mod poll; pub mod resource; pub use resource::Resource; diff --git a/rust/kernel/io/poll.rs b/rust/kernel/io/poll.rs new file mode 100644 index 00000000000000..613eb25047efc4 --- /dev/null +++ b/rust/kernel/io/poll.rs @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! IO polling. +//! +//! C header: [`include/linux/iopoll.h`](srctree/include/linux/iopoll.h). + +use crate::{ + error::{code::*, Result}, + processor::cpu_relax, + task::might_sleep, + time::{delay::fsleep, Delta, Instant, Monotonic}, +}; + +/// Polls periodically until a condition is met, an error occurs, +/// or the timeout is reached. +/// +/// The function repeatedly executes the given operation `op` closure and +/// checks its result using the condition closure `cond`. +/// +/// If `cond` returns `true`, the function returns successfully with +/// the result of `op`. Otherwise, it waits for a duration specified +/// by `sleep_delta` before executing `op` again. +/// +/// This process continues until either `op` returns an error, `cond` +/// returns `true`, or the timeout specified by `timeout_delta` is +/// reached. +/// +/// This function can only be used in a nonatomic context. +/// +/// # Errors +/// +/// If `op` returns an error, then that error is returned directly. +/// +/// If the timeout specified by `timeout_delta` is reached, then +/// `Err(ETIMEDOUT)` is returned. 
+/// +/// # Examples +/// +/// ```no_run +/// use kernel::io::{Io, poll::read_poll_timeout}; +/// use kernel::time::Delta; +/// +/// const HW_READY: u16 = 0x01; +/// +/// fn wait_for_hardware(io: &Io) -> Result<()> { +/// match read_poll_timeout( +/// // The `op` closure reads the value of a specific status register. +/// || io.try_read16(0x1000), +/// // The `cond` closure takes a reference to the value returned by `op` +/// // and checks whether the hardware is ready. +/// |val: &u16| *val == HW_READY, +/// Delta::from_millis(50), +/// Delta::from_secs(3), +/// ) { +/// Ok(_) => { +/// // The hardware is ready. The returned value of the `op` closure +/// // isn't used. +/// Ok(()) +/// } +/// Err(e) => Err(e), +/// } +/// } +/// ``` +#[track_caller] +pub fn read_poll_timeout( + mut op: Op, + mut cond: Cond, + sleep_delta: Delta, + timeout_delta: Delta, +) -> Result +where + Op: FnMut() -> Result, + Cond: FnMut(&T) -> bool, +{ + let start: Instant = Instant::now(); + + // Unlike the C version, we always call `might_sleep()` unconditionally, + // as conditional calls are error-prone. We clearly separate + // `read_poll_timeout()` and `read_poll_timeout_atomic()` to aid + // tools like klint. + might_sleep(); + + loop { + let val = op()?; + if cond(&val) { + // Unlike the C version, we immediately return. + // We know the condition is met so we don't need to check again. + return Ok(val); + } + + if start.elapsed() > timeout_delta { + // Unlike the C version, we immediately return. + // We have just called `op()` so we don't need to call it again. + return Err(ETIMEDOUT); + } + + if !sleep_delta.is_zero() { + fsleep(sleep_delta); + } + + // `fsleep()` could be a busy-wait loop so we always call `cpu_relax()`. + cpu_relax(); + } +} From 82b523f369c9f279c98d04475283283e01030405 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 21 Aug 2025 09:30:14 +0900 Subject: [PATCH 0400/3206] firewire: ohci: remove obsolete debug logging for IRQ events A commit 0d8914165dd1 ("firewire: ohci: add tracepoints event for hardIRQ event") added "firewire_ohci:irqs" event in v6.11, which can provide equivalent information to the existing debug logging. This commit removes the logging. Link: https://lore.kernel.org/r/20250821003017.186752-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/ohci.c | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 5d8301b0f3aa8c..d3ed43f4d0c30d 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -395,50 +395,18 @@ MODULE_PARM_DESC(quirks, "Chip quirks (default = 0" #define OHCI_PARAM_DEBUG_AT_AR 1 #define OHCI_PARAM_DEBUG_SELFIDS 2 -#define OHCI_PARAM_DEBUG_IRQS 4 static int param_debug; module_param_named(debug, param_debug, int, 0644); MODULE_PARM_DESC(debug, "Verbose logging, deprecated in v6.11 kernel or later. (default = 0" ", AT/AR events = " __stringify(OHCI_PARAM_DEBUG_AT_AR) ", self-IDs = " __stringify(OHCI_PARAM_DEBUG_SELFIDS) - ", IRQs = " __stringify(OHCI_PARAM_DEBUG_IRQS) ", or a combination, or all = -1)"); static bool param_remote_dma; module_param_named(remote_dma, param_remote_dma, bool, 0444); MODULE_PARM_DESC(remote_dma, "Enable unfiltered remote DMA (default = N)"); -static void log_irqs(struct fw_ohci *ohci, u32 evt) -{ - if (likely(!(param_debug & OHCI_PARAM_DEBUG_IRQS))) - return; - - ohci_notice(ohci, "IRQ %08x%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", evt, - evt & OHCI1394_selfIDComplete ? 
" selfID" : "", - evt & OHCI1394_RQPkt ? " AR_req" : "", - evt & OHCI1394_RSPkt ? " AR_resp" : "", - evt & OHCI1394_reqTxComplete ? " AT_req" : "", - evt & OHCI1394_respTxComplete ? " AT_resp" : "", - evt & OHCI1394_isochRx ? " IR" : "", - evt & OHCI1394_isochTx ? " IT" : "", - evt & OHCI1394_postedWriteErr ? " postedWriteErr" : "", - evt & OHCI1394_cycleTooLong ? " cycleTooLong" : "", - evt & OHCI1394_cycle64Seconds ? " cycle64Seconds" : "", - evt & OHCI1394_cycleInconsistent ? " cycleInconsistent" : "", - evt & OHCI1394_regAccessFail ? " regAccessFail" : "", - evt & OHCI1394_unrecoverableError ? " unrecoverableError" : "", - evt & OHCI1394_busReset ? " busReset" : "", - evt & ~(OHCI1394_selfIDComplete | OHCI1394_RQPkt | - OHCI1394_RSPkt | OHCI1394_reqTxComplete | - OHCI1394_respTxComplete | OHCI1394_isochRx | - OHCI1394_isochTx | OHCI1394_postedWriteErr | - OHCI1394_cycleTooLong | OHCI1394_cycle64Seconds | - OHCI1394_cycleInconsistent | - OHCI1394_regAccessFail | OHCI1394_busReset) - ? " ?" : ""); -} - static void log_selfids(struct fw_ohci *ohci, int generation, int self_id_count) { static const char *const speed[] = { @@ -2226,7 +2194,7 @@ static irqreturn_t irq_handler(int irq, void *data) reg_write(ohci, OHCI1394_IntEventClear, event & ~(OHCI1394_busReset | OHCI1394_postedWriteErr)); trace_irqs(ohci->card.index, event); - log_irqs(ohci, event); + // The flag is masked again at bus_reset_work() scheduled by selfID event. if (event & OHCI1394_busReset) reg_write(ohci, OHCI1394_IntMaskClear, OHCI1394_busReset); From c579f1fe08cc34c7f0af8df44d2badfee53eb7e7 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 21 Aug 2025 09:30:15 +0900 Subject: [PATCH 0401/3206] firewire: ohci: remove obsolete debug logging for selfID sequence A commit 677ceae19073 ("firewire: core: add tracepoints event for self_id_sequence") added the "firewire:self_id_sequence" event in v6.11. A commit 526e21a2aa6f ("firewire: ohci: add tracepoints event for data of Self-ID DMA") added the "firewire_ohci:self_id_complete" event in v6.12. These tracepoints replace the equivalent debug logging. This commit removes the logging. Link: https://lore.kernel.org/r/20250821003017.186752-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/ohci.c | 72 ----------------------------------------- 1 file changed, 72 deletions(-) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index d3ed43f4d0c30d..8cecdf4c657234 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -394,87 +394,17 @@ MODULE_PARM_DESC(quirks, "Chip quirks (default = 0" ")"); #define OHCI_PARAM_DEBUG_AT_AR 1 -#define OHCI_PARAM_DEBUG_SELFIDS 2 static int param_debug; module_param_named(debug, param_debug, int, 0644); MODULE_PARM_DESC(debug, "Verbose logging, deprecated in v6.11 kernel or later. 
(default = 0" ", AT/AR events = " __stringify(OHCI_PARAM_DEBUG_AT_AR) - ", self-IDs = " __stringify(OHCI_PARAM_DEBUG_SELFIDS) ", or a combination, or all = -1)"); static bool param_remote_dma; module_param_named(remote_dma, param_remote_dma, bool, 0444); MODULE_PARM_DESC(remote_dma, "Enable unfiltered remote DMA (default = N)"); -static void log_selfids(struct fw_ohci *ohci, int generation, int self_id_count) -{ - static const char *const speed[] = { - [0] = "S100", [1] = "S200", [2] = "S400", [3] = "beta", - }; - static const char *const power[] = { - [0] = "+0W", [1] = "+15W", [2] = "+30W", [3] = "+45W", - [4] = "-3W", [5] = " ?W", [6] = "-3..-6W", [7] = "-3..-10W", - }; - static const char port[] = { - [PHY_PACKET_SELF_ID_PORT_STATUS_NONE] = '.', - [PHY_PACKET_SELF_ID_PORT_STATUS_NCONN] = '-', - [PHY_PACKET_SELF_ID_PORT_STATUS_PARENT] = 'p', - [PHY_PACKET_SELF_ID_PORT_STATUS_CHILD] = 'c', - }; - struct self_id_sequence_enumerator enumerator = { - .cursor = ohci->self_id_buffer, - .quadlet_count = self_id_count, - }; - - if (likely(!(param_debug & OHCI_PARAM_DEBUG_SELFIDS))) - return; - - ohci_notice(ohci, "%d selfIDs, generation %d, local node ID %04x\n", - self_id_count, generation, ohci->node_id); - - while (enumerator.quadlet_count > 0) { - unsigned int quadlet_count; - unsigned int port_index; - const u32 *s; - int i; - - s = self_id_sequence_enumerator_next(&enumerator, &quadlet_count); - if (IS_ERR(s)) - break; - - ohci_notice(ohci, - "selfID 0: %08x, phy %d [%c%c%c] %s gc=%d %s %s%s%s\n", - *s, - phy_packet_self_id_get_phy_id(*s), - port[self_id_sequence_get_port_status(s, quadlet_count, 0)], - port[self_id_sequence_get_port_status(s, quadlet_count, 1)], - port[self_id_sequence_get_port_status(s, quadlet_count, 2)], - speed[*s >> 14 & 3], *s >> 16 & 63, - power[*s >> 8 & 7], *s >> 22 & 1 ? "L" : "", - *s >> 11 & 1 ? "c" : "", *s & 2 ? 
"i" : ""); - - port_index = 3; - for (i = 1; i < quadlet_count; ++i) { - ohci_notice(ohci, - "selfID n: %08x, phy %d [%c%c%c%c%c%c%c%c]\n", - s[i], - phy_packet_self_id_get_phy_id(s[i]), - port[self_id_sequence_get_port_status(s, quadlet_count, port_index)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 1)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 2)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 3)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 4)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 5)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 6)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 7)] - ); - - port_index += 8; - } - } -} - static const char *evts[] = { [0x00] = "evt_no_status", [0x01] = "-reserved-", [0x02] = "evt_long_packet", [0x03] = "evt_missing_ack", @@ -2163,8 +2093,6 @@ static void bus_reset_work(struct work_struct *work) if (free_rom) dmam_free_coherent(ohci->card.device, CONFIG_ROM_SIZE, free_rom, free_rom_bus); - log_selfids(ohci, generation, self_id_count); - fw_core_handle_bus_reset(&ohci->card, ohci->node_id, generation, self_id_count, ohci->self_id_buffer, ohci->csr_state_setclear_abdicate); From 6354cc95193696f1698e01ef6884f0fd9d613a6a Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 21 Aug 2025 09:30:16 +0900 Subject: [PATCH 0402/3206] firewire: ohci: remove obsolete debug logging for AT/AR results Between v6.11 and v6.12, a set of tracepoints was added to record asynchronous communication events: - firewire:async_phy_inbound - firewire:async_phy_outbound_initiate - firewire:async_phy_outbound_complete - firewire:async_response_inbound - firewire:async_response_outbound_initiate - firewire:async_response_outbound_complete - firewire:async_request_inbound - firewire:async_request_outbound_initiate - firewire:async_request_outbound_complete These tracepoints cover the functionality of the existing debug logging. This commit removes the logging. Link: https://lore.kernel.org/r/20250821003017.186752-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/ohci.c | 132 ++++++---------------------------------- 1 file changed, 19 insertions(+), 113 deletions(-) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 8cecdf4c657234..ae7f75fade8dd3 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -393,123 +393,15 @@ MODULE_PARM_DESC(quirks, "Chip quirks (default = 0" ", IR wake unreliable = " __stringify(QUIRK_IR_WAKE) ")"); -#define OHCI_PARAM_DEBUG_AT_AR 1 - static int param_debug; module_param_named(debug, param_debug, int, 0644); MODULE_PARM_DESC(debug, "Verbose logging, deprecated in v6.11 kernel or later. 
(default = 0" - ", AT/AR events = " __stringify(OHCI_PARAM_DEBUG_AT_AR) ", or a combination, or all = -1)"); static bool param_remote_dma; module_param_named(remote_dma, param_remote_dma, bool, 0444); MODULE_PARM_DESC(remote_dma, "Enable unfiltered remote DMA (default = N)"); -static const char *evts[] = { - [0x00] = "evt_no_status", [0x01] = "-reserved-", - [0x02] = "evt_long_packet", [0x03] = "evt_missing_ack", - [0x04] = "evt_underrun", [0x05] = "evt_overrun", - [0x06] = "evt_descriptor_read", [0x07] = "evt_data_read", - [0x08] = "evt_data_write", [0x09] = "evt_bus_reset", - [0x0a] = "evt_timeout", [0x0b] = "evt_tcode_err", - [0x0c] = "-reserved-", [0x0d] = "-reserved-", - [0x0e] = "evt_unknown", [0x0f] = "evt_flushed", - [0x10] = "-reserved-", [0x11] = "ack_complete", - [0x12] = "ack_pending ", [0x13] = "-reserved-", - [0x14] = "ack_busy_X", [0x15] = "ack_busy_A", - [0x16] = "ack_busy_B", [0x17] = "-reserved-", - [0x18] = "-reserved-", [0x19] = "-reserved-", - [0x1a] = "-reserved-", [0x1b] = "ack_tardy", - [0x1c] = "-reserved-", [0x1d] = "ack_data_error", - [0x1e] = "ack_type_error", [0x1f] = "-reserved-", - [0x20] = "pending/cancelled", -}; - -static void log_ar_at_event(struct fw_ohci *ohci, - char dir, int speed, u32 *header, int evt) -{ - static const char *const tcodes[] = { - [TCODE_WRITE_QUADLET_REQUEST] = "QW req", - [TCODE_WRITE_BLOCK_REQUEST] = "BW req", - [TCODE_WRITE_RESPONSE] = "W resp", - [0x3] = "-reserved-", - [TCODE_READ_QUADLET_REQUEST] = "QR req", - [TCODE_READ_BLOCK_REQUEST] = "BR req", - [TCODE_READ_QUADLET_RESPONSE] = "QR resp", - [TCODE_READ_BLOCK_RESPONSE] = "BR resp", - [TCODE_CYCLE_START] = "cycle start", - [TCODE_LOCK_REQUEST] = "Lk req", - [TCODE_STREAM_DATA] = "async stream packet", - [TCODE_LOCK_RESPONSE] = "Lk resp", - [0xc] = "-reserved-", - [0xd] = "-reserved-", - [TCODE_LINK_INTERNAL] = "link internal", - [0xf] = "-reserved-", - }; - int tcode = async_header_get_tcode(header); - char specific[12]; - - if (likely(!(param_debug & OHCI_PARAM_DEBUG_AT_AR))) - return; - - if (unlikely(evt >= ARRAY_SIZE(evts))) - evt = 0x1f; - - if (evt == OHCI1394_evt_bus_reset) { - ohci_notice(ohci, "A%c evt_bus_reset, generation %d\n", - dir, (header[2] >> 16) & 0xff); - return; - } - - switch (tcode) { - case TCODE_WRITE_QUADLET_REQUEST: - case TCODE_READ_QUADLET_RESPONSE: - case TCODE_CYCLE_START: - snprintf(specific, sizeof(specific), " = %08x", - be32_to_cpu((__force __be32)header[3])); - break; - case TCODE_WRITE_BLOCK_REQUEST: - case TCODE_READ_BLOCK_REQUEST: - case TCODE_READ_BLOCK_RESPONSE: - case TCODE_LOCK_REQUEST: - case TCODE_LOCK_RESPONSE: - snprintf(specific, sizeof(specific), " %x,%x", - async_header_get_data_length(header), - async_header_get_extended_tcode(header)); - break; - default: - specific[0] = '\0'; - } - - switch (tcode) { - case TCODE_STREAM_DATA: - ohci_notice(ohci, "A%c %s, %s\n", - dir, evts[evt], tcodes[tcode]); - break; - case TCODE_LINK_INTERNAL: - ohci_notice(ohci, "A%c %s, PHY %08x %08x\n", - dir, evts[evt], header[1], header[2]); - break; - case TCODE_WRITE_QUADLET_REQUEST: - case TCODE_WRITE_BLOCK_REQUEST: - case TCODE_READ_QUADLET_REQUEST: - case TCODE_READ_BLOCK_REQUEST: - case TCODE_LOCK_REQUEST: - ohci_notice(ohci, - "A%c spd %x tl %02x, %04x -> %04x, %s, %s, %012llx%s\n", - dir, speed, async_header_get_tlabel(header), - async_header_get_source(header), async_header_get_destination(header), - evts[evt], tcodes[tcode], async_header_get_offset(header), specific); - break; - default: - ohci_notice(ohci, - "A%c spd %x tl %02x, %04x -> 
%04x, %s, %s%s\n", - dir, speed, async_header_get_tlabel(header), - async_header_get_source(header), async_header_get_destination(header), - evts[evt], tcodes[tcode], specific); - } -} - static inline void reg_write(const struct fw_ohci *ohci, int offset, u32 data) { writel(data, ohci->registers + offset); @@ -855,8 +747,6 @@ static __le32 *handle_ar_packet(struct ar_context *ctx, __le32 *buffer) p.timestamp = status & 0xffff; p.generation = ohci->request_generation; - log_ar_at_event(ohci, 'R', p.speed, p.header, evt); - /* * Several controllers, notably from NEC and VIA, forget to * write ack_complete status at PHY packet reception. @@ -1464,8 +1354,6 @@ static int handle_at_packet(struct context *context, evt = le16_to_cpu(last->transfer_status) & 0x1f; packet->timestamp = le16_to_cpu(last->res_count); - log_ar_at_event(ohci, 'T', packet->speed, packet->header, evt); - switch (evt) { case OHCI1394_evt_timeout: /* Async response transmit timed out. */ @@ -1670,6 +1558,25 @@ static void at_context_transmit(struct at_context *ctx, struct fw_packet *packet static void detect_dead_context(struct fw_ohci *ohci, const char *name, unsigned int regs) { + static const char *const evts[] = { + [0x00] = "evt_no_status", [0x01] = "-reserved-", + [0x02] = "evt_long_packet", [0x03] = "evt_missing_ack", + [0x04] = "evt_underrun", [0x05] = "evt_overrun", + [0x06] = "evt_descriptor_read", [0x07] = "evt_data_read", + [0x08] = "evt_data_write", [0x09] = "evt_bus_reset", + [0x0a] = "evt_timeout", [0x0b] = "evt_tcode_err", + [0x0c] = "-reserved-", [0x0d] = "-reserved-", + [0x0e] = "evt_unknown", [0x0f] = "evt_flushed", + [0x10] = "-reserved-", [0x11] = "ack_complete", + [0x12] = "ack_pending ", [0x13] = "-reserved-", + [0x14] = "ack_busy_X", [0x15] = "ack_busy_A", + [0x16] = "ack_busy_B", [0x17] = "-reserved-", + [0x18] = "-reserved-", [0x19] = "-reserved-", + [0x1a] = "-reserved-", [0x1b] = "ack_tardy", + [0x1c] = "-reserved-", [0x1d] = "ack_data_error", + [0x1e] = "ack_type_error", [0x1f] = "-reserved-", + [0x20] = "pending/cancelled", + }; u32 ctl; ctl = reg_read(ohci, CONTROL_SET(regs)); @@ -2601,7 +2508,6 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet) dma_unmap_single(ohci->card.device, packet->payload_bus, packet->payload_length, DMA_TO_DEVICE); - log_ar_at_event(ohci, 'T', packet->speed, packet->header, 0x20); driver_data->packet = NULL; packet->ack = RCODE_CANCELLED; From 8748368c3d92f7bdef67c90d3f62ab92083b3677 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 21 Aug 2025 09:30:17 +0900 Subject: [PATCH 0403/3206] firewire: ohci: remove obsolete module-level debug parameter The module-level debug parameter was added in v2.6.26 by a commit ad3c0fe8b8d16 ("firewire: debug interrupt events"). Its functionality has long been superseded by tracepoints. This commit removes the module parameter, bye. Link: https://lore.kernel.org/r/20250821003017.186752-5-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/ohci.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index ae7f75fade8dd3..c8c5d598e3c8f4 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -393,11 +393,6 @@ MODULE_PARM_DESC(quirks, "Chip quirks (default = 0" ", IR wake unreliable = " __stringify(QUIRK_IR_WAKE) ")"); -static int param_debug; -module_param_named(debug, param_debug, int, 0644); -MODULE_PARM_DESC(debug, "Verbose logging, deprecated in v6.11 kernel or later. 
(default = 0"
-	", or a combination, or all = -1)");
-
 static bool param_remote_dma;
 module_param_named(remote_dma, param_remote_dma, bool, 0444);
 MODULE_PARM_DESC(remote_dma, "Enable unfiltered remote DMA (default = N)");
@@ -2017,11 +2012,6 @@ static irqreturn_t irq_handler(int irq, void *data)
 	if (!event || !~event)
 		return IRQ_NONE;
 
-	if (unlikely(param_debug > 0)) {
-		dev_notice_ratelimited(ohci->card.device,
-				       "The debug parameter is superseded by tracepoints events, and deprecated.");
-	}
-
 	/*
 	 * busReset and postedWriteErr events must not be cleared yet
 	 * (OHCI 1.1 clauses 7.2.3.2 and 13.2.8.1)

From 54b962fa14dc82a625436fcbcee7b3f2067a4f6d Mon Sep 17 00:00:00 2001
From: "Mario Limonciello (AMD)"
Date: Thu, 14 Aug 2025 13:34:30 -0500
Subject: [PATCH 0404/3206] pinctrl: amd: Add PM debugging message for turning on/off wakes

The GPIOs for devices not in _AEI/_EVT such as touchpad or touchscreen
won't have wakeup turned on until the suspend sequence starts. Because
code in amd_gpio_suspend_hibernate_common() masks the interrupt, it can
be difficult to follow what is going on.

Add an explicit debugging message to show when wake was turned on or off.

Signed-off-by: Mario Limonciello (AMD)
Link: https://lore.kernel.org/20250814183430.3887973-2-superm1@kernel.org
Signed-off-by: Linus Walleij
---
 drivers/pinctrl/pinctrl-amd.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index 09a5425d54ba38..127eeb0104d856 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -448,6 +448,9 @@ static int amd_gpio_irq_set_wake(struct irq_data *d, unsigned int on)
 	u32 wake_mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3);
 	int err;
 
+	pm_pr_dbg("Setting wake for GPIO %lu to %s\n",
+		  d->hwirq, str_enable_disable(on));
+
 	raw_spin_lock_irqsave(&gpio_dev->lock, flags);
 
 	pin_reg = readl(gpio_dev->base + (d->hwirq)*4);

From 7a399ce67e981f5f28afc1ff50772acd1f4e7bbf Mon Sep 17 00:00:00 2001
From: "Mario Limonciello (AMD)"
Date: Thu, 21 Aug 2025 09:49:11 -0500
Subject: [PATCH 0405/3206] pinctrl: amd: Don't access irq_data's hwirq member directly

There is an irqd_to_hwirq() helper intended for getting the hwirq
number. Switch all users to it.
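For reference, the resulting access pattern, condensed here from the
amd_gpio_irq_mask() hunk in the diff below (locking omitted):

	static void amd_gpio_irq_mask(struct irq_data *d)
	{
		struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
		struct amd_gpio *gpio_dev = gpiochip_get_data(gc);
		irq_hw_number_t hwirq = irqd_to_hwirq(d);	/* was: d->hwirq */
		u32 pin_reg;

		pin_reg = readl(gpio_dev->base + hwirq * 4);
		pin_reg &= ~BIT(INTERRUPT_MASK_OFF);
		writel(pin_reg, gpio_dev->base + hwirq * 4);
	}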
Suggested-by: Andy Shevchenko Signed-off-by: Mario Limonciello (AMD) Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/20250821144942.2463014-1-superm1@kernel.org Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 40 ++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 127eeb0104d856..2dac5c71eb008f 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -383,14 +383,15 @@ static void amd_gpio_irq_enable(struct irq_data *d) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); - gpiochip_enable_irq(gc, d->hwirq); + gpiochip_enable_irq(gc, hwirq); raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); pin_reg |= BIT(INTERRUPT_ENABLE_OFF); pin_reg |= BIT(INTERRUPT_MASK_OFF); - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); } @@ -400,15 +401,16 @@ static void amd_gpio_irq_disable(struct irq_data *d) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); pin_reg &= ~BIT(INTERRUPT_ENABLE_OFF); pin_reg &= ~BIT(INTERRUPT_MASK_OFF); - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); - gpiochip_disable_irq(gc, d->hwirq); + gpiochip_disable_irq(gc, hwirq); } static void amd_gpio_irq_mask(struct irq_data *d) @@ -417,11 +419,12 @@ static void amd_gpio_irq_mask(struct irq_data *d) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); pin_reg &= ~BIT(INTERRUPT_MASK_OFF); - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); } @@ -431,11 +434,12 @@ static void amd_gpio_irq_unmask(struct irq_data *d) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); pin_reg |= BIT(INTERRUPT_MASK_OFF); - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); } @@ -446,20 +450,21 @@ static int amd_gpio_irq_set_wake(struct irq_data *d, unsigned int on) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); u32 wake_mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3); + irq_hw_number_t hwirq = irqd_to_hwirq(d); int err; pm_pr_dbg("Setting wake for GPIO %lu to %s\n", - d->hwirq, str_enable_disable(on)); + hwirq, str_enable_disable(on)); 
raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); if (on) pin_reg |= wake_mask; else pin_reg &= ~wake_mask; - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); if (on) @@ -495,9 +500,10 @@ static int amd_gpio_irq_set_type(struct irq_data *d, unsigned int type) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); switch (type & IRQ_TYPE_SENSE_MASK) { case IRQ_TYPE_EDGE_RISING: @@ -563,10 +569,10 @@ static int amd_gpio_irq_set_type(struct irq_data *d, unsigned int type) pin_reg_irq_en = pin_reg; pin_reg_irq_en |= mask; pin_reg_irq_en &= ~BIT(INTERRUPT_MASK_OFF); - writel(pin_reg_irq_en, gpio_dev->base + (d->hwirq)*4); - while ((readl(gpio_dev->base + (d->hwirq)*4) & mask) != mask) + writel(pin_reg_irq_en, gpio_dev->base + hwirq * 4); + while ((readl(gpio_dev->base + hwirq * 4) & mask) != mask) continue; - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); return ret; From 5a22df198bd3f515dfd9ff064a81f41567427dfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Thu, 21 Aug 2025 13:20:35 +0200 Subject: [PATCH 0406/3206] dt-bindings: mmc: sdhci-pxa: add state_uhs pinctrl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the pxav3 controller, increasing the drive strength of the data pins might be required to maintain stability on fast bus clocks (above 100 MHz). Add a state_uhs pinctrl to allow this. Reviewed-by: Rob Herring (Arm) Signed-off-by: Duje Mihanović Link: https://lore.kernel.org/r/20250821-pxav3-uhs-v4-1-bb588314f3c3@dujemihanovic.xyz Signed-off-by: Ulf Hansson --- .../devicetree/bindings/mmc/sdhci-pxa.yaml | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml b/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml index e7c06032048a3a..fba1cc50fdf07c 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml +++ b/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml @@ -44,12 +44,27 @@ allOf: items: - const: default - const: state_cmd_gpio - pinctrl-0: - description: - Should contain default pinctrl. + pinctrl-1: description: Should switch CMD pin to GPIO mode as a high output. + - if: + properties: + compatible: + contains: + const: mrvl,pxav3-mmc + then: + properties: + pinctrl-names: + description: + Optional for increasing stability of the controller at fast bus clocks. + items: + - const: default + - const: state_uhs + + pinctrl-1: + description: + Should switch the drive strength of the data pins to high. properties: compatible: @@ -82,6 +97,14 @@ properties: - const: io - const: core + pinctrl-names: true + + pinctrl-0: + description: + Should contain default pinctrl. + + pinctrl-1: true + mrvl,clk-delay-cycles: description: Specify a number of cycles to delay for tuning. 
$ref: /schemas/types.yaml#/definitions/uint32 From 9674f9e325e4b13f54ab108affe9928a3088d9c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Thu, 21 Aug 2025 13:20:36 +0200 Subject: [PATCH 0407/3206] mmc: sdhci-pxav3: add state_uhs pinctrl setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Different bus clocks require different pinctrl states to remain stable. Add support for selecting between a default and UHS state according to the bus clock. Acked-by: Adrian Hunter Signed-off-by: Duje Mihanović Link: https://lore.kernel.org/r/20250821-pxav3-uhs-v4-2-bb588314f3c3@dujemihanovic.xyz Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pxav3.c | 41 +++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c index 1371960e34ebbb..238f508f2fb0bd 100644 --- a/drivers/mmc/host/sdhci-pxav3.c +++ b/drivers/mmc/host/sdhci-pxav3.c @@ -20,9 +20,11 @@ #include #include #include +#include #include #include #include +#include #include "sdhci.h" #include "sdhci-pltfm.h" @@ -51,6 +53,9 @@ struct sdhci_pxa { struct clk *clk_io; u8 power_mode; void __iomem *sdio3_conf_reg; + struct pinctrl *pinctrl; + struct pinctrl_state *pins_default; + struct pinctrl_state *pins_uhs; }; /* @@ -313,8 +318,20 @@ static void pxav3_set_power(struct sdhci_host *host, unsigned char mode, mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, vdd); } +static void pxav3_set_clock(struct sdhci_host *host, unsigned int clock) +{ + struct sdhci_pltfm_host *phost = sdhci_priv(host); + struct sdhci_pxa *pxa = sdhci_pltfm_priv(phost); + struct pinctrl_state *pins = clock < 100 * HZ_PER_MHZ ? pxa->pins_default : pxa->pins_uhs; + + if (pins) + pinctrl_select_state(pxa->pinctrl, pins); + + sdhci_set_clock(host, clock); +} + static const struct sdhci_ops pxav3_sdhci_ops = { - .set_clock = sdhci_set_clock, + .set_clock = pxav3_set_clock, .set_power = pxav3_set_power, .platform_send_init_74_clocks = pxav3_gen_init_74_clocks, .get_max_clock = sdhci_pltfm_clk_get_max_clock, @@ -366,6 +383,19 @@ static inline struct sdhci_pxa_platdata *pxav3_get_mmc_pdata(struct device *dev) } #endif +static struct pinctrl_state *pxav3_lookup_pinstate(struct device *dev, struct pinctrl *pinctrl, + const char *name) +{ + struct pinctrl_state *pins = pinctrl_lookup_state(pinctrl, name); + + if (IS_ERR(pins)) { + dev_dbg(dev, "could not get pinstate '%s': %ld\n", name, PTR_ERR(pins)); + return NULL; + } + + return pins; +} + static int sdhci_pxav3_probe(struct platform_device *pdev) { struct sdhci_pltfm_host *pltfm_host; @@ -440,6 +470,15 @@ static int sdhci_pxav3_probe(struct platform_device *pdev) host->mmc->pm_caps |= pdata->pm_caps; } + pxa->pinctrl = devm_pinctrl_get(dev); + if (!IS_ERR(pxa->pinctrl)) { + pxa->pins_default = pxav3_lookup_pinstate(dev, pxa->pinctrl, "default"); + if (pxa->pins_default) + pxa->pins_uhs = pxav3_lookup_pinstate(dev, pxa->pinctrl, "state_uhs"); + } else { + dev_dbg(dev, "could not get pinctrl handle: %ld\n", PTR_ERR(pxa->pinctrl)); + } + pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); pm_runtime_set_autosuspend_delay(&pdev->dev, PXAV3_RPM_DELAY_MS); From 1f69220b0998b609000c6ea1783772fdaecaec56 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 21 Aug 2025 15:32:25 +0300 Subject: [PATCH 0408/3206] mmc: mmc_spi: remove unnecessary check in mmc_spi_setup_data_message() An earlier commit changed the outer if statement from "if (multiple || write) 
{" to "if (write) {" so now we know that "write" is true and no longer need to check. Delete the unnecessary check. Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/aKcR2ea747xkw_it@stanley.mountain Signed-off-by: Ulf Hansson --- drivers/mmc/host/mmc_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 95a32ff29ee166..42936e248c5536 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -566,7 +566,7 @@ mmc_spi_setup_data_message(struct mmc_spi_host *host, bool multiple, bool write) if (write) { t = &host->early_status; memset(t, 0, sizeof(*t)); - t->len = write ? sizeof(scratch->status) : 1; + t->len = sizeof(scratch->status); t->tx_buf = host->ones; t->rx_buf = scratch->status; t->cs_change = 1; From 6fb942b85a1ab9728a4551d4161ec6fd6fab94f3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 21 Aug 2025 15:32:49 +0300 Subject: [PATCH 0409/3206] mmc: rtsx_usb_sdmmc: Fix uninitialized variable issue If rtsx_usb_get_card_status() fails then "val" isn't initialized. Move the use of "val" until after the error checking. Fixes: d2e6fb2c31a0 ("misc: rtsx: usb card reader: add OCP support") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/aKcR8QD81TjVqIhl@stanley.mountain Signed-off-by: Ulf Hansson --- drivers/mmc/host/rtsx_usb_sdmmc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c index e1ed39c657c37c..70bd21084b977c 100644 --- a/drivers/mmc/host/rtsx_usb_sdmmc.c +++ b/drivers/mmc/host/rtsx_usb_sdmmc.c @@ -785,13 +785,13 @@ static int sdmmc_get_cd(struct mmc_host *mmc) mutex_unlock(&ucr->dev_mutex); - /* get OCP status */ - host->ocp_stat = (val >> 4) & 0x03; - /* Treat failed detection as non-exist */ if (err) goto no_card; + /* get OCP status */ + host->ocp_stat = (val >> 4) & 0x03; + if (val & SD_CD) { host->card_exist = true; return 1; From 3202d6ed9368fc1e842fda73727553ae614633f8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 21 Aug 2025 15:07:50 +0200 Subject: [PATCH 0410/3206] mmc: core: Add infrastructure for undervoltage handling Implement the core infrastructure to allow MMC bus types to handle REGULATOR_EVENT_UNDER_VOLTAGE events from power regulators. This is primarily aimed at allowing devices like eMMC to perform an emergency shutdown to prevent data corruption when a power failure is imminent. This patch introduces: - A new 'handle_undervoltage' function pointer to 'struct mmc_bus_ops'. Bus drivers (e.g., for eMMC) can implement this to define their emergency procedures. - A workqueue ('uv_work') in 'struct mmc_supply' to handle the event asynchronously in a high-priority context. - A new function 'mmc_handle_undervoltage()' which is called from the workqueue. It stops the host queue to prevent races with card removal, checks for the bus op, and invokes the handler. - Functions to register and unregister the regulator notifier, intended to be called by bus drivers like 'mmc_attach_mmc' when a compatible card is detected. The notifier is only registered for the main vmmc supply, as undervoltage handling for vqmmc or vqmmc2 is not required at this time. 
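Condensed from the regulator.c hunk below, the notification path looks
like this (locking and error handling omitted):

	static int mmc_handle_regulator_event(struct notifier_block *nb,
					      unsigned long event, void *data)
	{
		struct mmc_supply *supply = container_of(nb, struct mmc_supply,
							 vmmc_nb);
		struct mmc_host *host = container_of(supply, struct mmc_host,
						     supply);

		if (event != REGULATOR_EVENT_UNDER_VOLTAGE)
			return NOTIFY_DONE;

		/*
		 * Latch the event once, then defer to process context; the
		 * worker ends up calling mmc_handle_undervoltage(host).
		 */
		if (!host->undervoltage) {
			host->undervoltage = true;
			queue_work(system_highpri_wq, &host->supply.uv_work);
		}

		return NOTIFY_OK;
	}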
Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20250821130751.2089587-2-o.rempel@pengutronix.de Signed-off-by: Ulf Hansson --- drivers/mmc/core/bus.c | 12 ++++++ drivers/mmc/core/core.c | 23 +++++++++++ drivers/mmc/core/core.h | 5 +++ drivers/mmc/core/host.c | 2 + drivers/mmc/core/regulator.c | 77 ++++++++++++++++++++++++++++++++++++ include/linux/mmc/host.h | 11 ++++++ 6 files changed, 130 insertions(+) diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index 1cf64e0952fbe2..ec4f3462bf8092 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -19,6 +19,7 @@ #include #include +#include #include "core.h" #include "card.h" @@ -383,6 +384,14 @@ int mmc_add_card(struct mmc_card *card) mmc_card_set_present(card); + /* + * Register for undervoltage notification if the card supports + * power-off notification, enabling emergency shutdowns. + */ + if (mmc_card_mmc(card) && + card->ext_csd.power_off_notification == EXT_CSD_POWER_ON) + mmc_regulator_register_undervoltage_notifier(card->host); + return 0; } @@ -394,6 +403,9 @@ void mmc_remove_card(struct mmc_card *card) { struct mmc_host *host = card->host; + if (mmc_card_present(card)) + mmc_regulator_unregister_undervoltage_notifier(host); + mmc_remove_card_debugfs(card); if (mmc_card_present(card)) { diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 88fd231fee1d15..860378bea557b3 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1398,6 +1398,29 @@ void mmc_power_cycle(struct mmc_host *host, u32 ocr) mmc_power_up(host, ocr); } +/** + * mmc_handle_undervoltage - Handle an undervoltage event on the MMC bus + * @host: The MMC host that detected the undervoltage condition + * + * This function is called when an undervoltage event is detected on one of + * the MMC regulators. + * + * Returns: 0 on success or a negative error code on failure. + */ +int mmc_handle_undervoltage(struct mmc_host *host) +{ + /* Stop the host to prevent races with card removal */ + __mmc_stop_host(host); + + if (!host->bus_ops || !host->bus_ops->handle_undervoltage) + return 0; + + dev_warn(mmc_dev(host), "%s: Undervoltage detected, initiating emergency stop\n", + mmc_hostname(host)); + + return host->bus_ops->handle_undervoltage(host); +} + /* * Assign a mmc bus handler to a host. Only one bus handler may control a * host at any given time. 
diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 73f5d3d8c77d5d..a028b48be16447 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -31,6 +31,7 @@ struct mmc_bus_ops { int (*sw_reset)(struct mmc_host *); bool (*cache_enabled)(struct mmc_host *); int (*flush_cache)(struct mmc_host *); + int (*handle_undervoltage)(struct mmc_host *host); }; void mmc_attach_bus(struct mmc_host *host, const struct mmc_bus_ops *ops); @@ -59,6 +60,10 @@ void mmc_power_off(struct mmc_host *host); void mmc_power_cycle(struct mmc_host *host, u32 ocr); void mmc_set_initial_state(struct mmc_host *host); u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max); +int mmc_handle_undervoltage(struct mmc_host *host); +void mmc_regulator_register_undervoltage_notifier(struct mmc_host *host); +void mmc_regulator_unregister_undervoltage_notifier(struct mmc_host *host); +void mmc_undervoltage_workfn(struct work_struct *work); static inline void mmc_delay(unsigned int ms) { diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index f14671ea571628..5f0ec23aeff553 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -564,6 +564,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) INIT_WORK(&host->sdio_irq_work, sdio_irq_work); timer_setup(&host->retune_timer, mmc_retune_timer, 0); + INIT_WORK(&host->supply.uv_work, mmc_undervoltage_workfn); + /* * By default, hosts do not support SGIO or large requests. * They have to set these according to their abilities. diff --git a/drivers/mmc/core/regulator.c b/drivers/mmc/core/regulator.c index 3dae2e9b797813..a85179f1a4de6d 100644 --- a/drivers/mmc/core/regulator.c +++ b/drivers/mmc/core/regulator.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -262,6 +263,82 @@ static inline int mmc_regulator_get_ocrmask(struct regulator *supply) #endif /* CONFIG_REGULATOR */ +/* To be called from a high-priority workqueue */ +void mmc_undervoltage_workfn(struct work_struct *work) +{ + struct mmc_supply *supply; + struct mmc_host *host; + + supply = container_of(work, struct mmc_supply, uv_work); + host = container_of(supply, struct mmc_host, supply); + + mmc_handle_undervoltage(host); +} + +static int mmc_handle_regulator_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct mmc_supply *supply = container_of(nb, struct mmc_supply, + vmmc_nb); + struct mmc_host *host = container_of(supply, struct mmc_host, supply); + unsigned long flags; + + switch (event) { + case REGULATOR_EVENT_UNDER_VOLTAGE: + spin_lock_irqsave(&host->lock, flags); + if (host->undervoltage) { + spin_unlock_irqrestore(&host->lock, flags); + return NOTIFY_OK; + } + + host->undervoltage = true; + spin_unlock_irqrestore(&host->lock, flags); + + queue_work(system_highpri_wq, &host->supply.uv_work); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +/** + * mmc_regulator_register_undervoltage_notifier - Register for undervoltage + * events + * @host: MMC host + * + * To be called by a bus driver when a card supporting graceful shutdown + * is attached. 
+ */ +void mmc_regulator_register_undervoltage_notifier(struct mmc_host *host) +{ + int ret; + + if (IS_ERR_OR_NULL(host->supply.vmmc)) + return; + + host->supply.vmmc_nb.notifier_call = mmc_handle_regulator_event; + ret = regulator_register_notifier(host->supply.vmmc, + &host->supply.vmmc_nb); + if (ret) + dev_warn(mmc_dev(host), "Failed to register vmmc notifier: %d\n", ret); +} + +/** + * mmc_regulator_unregister_undervoltage_notifier - Unregister undervoltage + * notifier + * @host: MMC host + */ +void mmc_regulator_unregister_undervoltage_notifier(struct mmc_host *host) +{ + if (IS_ERR_OR_NULL(host->supply.vmmc)) + return; + + regulator_unregister_notifier(host->supply.vmmc, &host->supply.vmmc_nb); + cancel_work_sync(&host->supply.uv_work); +} + /** * mmc_regulator_get_supply - try to get VMMC and VQMMC regulators for a host * @mmc: the host to regulate diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 5ed5d203de23c1..e0d935a4ac1d5a 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -337,11 +337,15 @@ struct mmc_slot { struct regulator; struct mmc_pwrseq; +struct notifier_block; struct mmc_supply { struct regulator *vmmc; /* Card power supply */ struct regulator *vqmmc; /* Optional Vccq supply */ struct regulator *vqmmc2; /* Optional supply for phy */ + + struct notifier_block vmmc_nb; /* Notifier for vmmc */ + struct work_struct uv_work; /* Undervoltage work */ }; struct mmc_ctx { @@ -494,6 +498,13 @@ struct mmc_host { unsigned int can_dma_map_merge:1; /* merging can be used */ unsigned int vqmmc_enabled:1; /* vqmmc regulator is enabled */ + /* + * Indicates if an undervoltage event has already been handled. + * This prevents repeated regulator notifiers from triggering + * multiple REGULATOR_EVENT_UNDER_VOLTAGE events. + */ + unsigned int undervoltage:1; /* Undervoltage state */ + int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ From 533c62f5c1d5a52807bdd434a44e3cbc6e77b416 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 21 Aug 2025 15:07:51 +0200 Subject: [PATCH 0411/3206] mmc: core: add undervoltage handler for MMC/eMMC devices Add infrastructure to handle regulator undervoltage events for MMC/eMMC cards. When an undervoltage is detected, the new handler performs a controlled emergency suspend using a short power-off notification, skipping the cache flush to maximize the chance of a safe shutdown. After the operation, the card is marked as removed to prevent further I/O and possible data corruption. This is implemented by introducing MMC_POWEROFF_UNDERVOLTAGE to the mmc_poweroff_type enum and refactoring the suspend logic into an internal __mmc_suspend() helper that allows the caller to skip the cache flush if required. The undervoltage handler is registered as a bus operation and invoked from the core undervoltage path. If power-off notification is not supported by the card, the handler falls back to sleep or deselecting the card. 
Additionally, update the shutdown path to avoid redundant shutdown steps if the card is already removed Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20250821130751.2089587-3-o.rempel@pengutronix.de Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 70 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 5be9b42d5057eb..3e7d9437477c76 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -36,6 +36,7 @@ enum mmc_poweroff_type { MMC_POWEROFF_SUSPEND, MMC_POWEROFF_SHUTDOWN, + MMC_POWEROFF_UNDERVOLTAGE, MMC_POWEROFF_UNBIND, }; @@ -2132,9 +2133,15 @@ static int _mmc_suspend(struct mmc_host *host, enum mmc_poweroff_type pm_type) if (mmc_card_suspended(host->card)) goto out; - err = _mmc_flush_cache(host); - if (err) - goto out; + /* + * For the undervoltage case, we care more about device integrity. + * Avoid cache flush and notify the device to power off quickly. + */ + if (pm_type != MMC_POWEROFF_UNDERVOLTAGE) { + err = _mmc_flush_cache(host); + if (err) + goto out; + } if (mmc_card_can_poweroff_notify(host->card) && mmc_host_can_poweroff_notify(host, pm_type)) @@ -2212,6 +2219,13 @@ static int mmc_shutdown(struct mmc_host *host) { int err = 0; + /* + * In case of undervoltage, the card will be powered off (removed) by + * _mmc_handle_undervoltage() + */ + if (mmc_card_removed(host->card)) + return 0; + /* * If the card remains suspended at this point and it was done by using * the sleep-cmd (CMD5), we may need to re-initialize it first, to allow @@ -2302,6 +2316,55 @@ static int _mmc_hw_reset(struct mmc_host *host) return mmc_init_card(host, card->ocr, card); } +/** + * _mmc_handle_undervoltage - Handle an undervoltage event for MMC/eMMC devices + * @host: MMC host structure + * + * This function is triggered when an undervoltage condition is detected. + * It attempts to transition the device into a low-power or safe state to + * prevent data corruption. + * + * Steps performed: + * - Perform an emergency suspend using EXT_CSD_POWER_OFF_SHORT if possible. + * - If power-off notify is not supported, fallback mechanisms like sleep or + * deselecting the card are attempted. + * - Cache flushing is skipped to reduce execution time. + * - Mark the card as removed to prevent further interactions after + * undervoltage. + * + * Note: This function does not handle host claiming or releasing. The caller + * must ensure that the host is properly claimed before calling this + * function and released afterward. + * + * Returns: 0 on success, or a negative error code if any step fails. + */ +static int _mmc_handle_undervoltage(struct mmc_host *host) +{ + struct mmc_card *card = host->card; + int err; + + /* + * Perform an emergency suspend to power off the eMMC quickly. + * This ensures the device enters a safe state before power is lost. + * We first attempt EXT_CSD_POWER_OFF_SHORT, but if power-off notify + * is not supported, we fall back to sleep mode or deselecting the card. + * Cache flushing is skipped to minimize delay. + */ + err = _mmc_suspend(host, MMC_POWEROFF_UNDERVOLTAGE); + if (err) + pr_err("%s: undervoltage suspend failed: %pe\n", + mmc_hostname(host), ERR_PTR(err)); + + /* + * Mark the card as removed to prevent further operations. + * This ensures the system does not attempt to access the device + * after an undervoltage event, avoiding potential corruption. 
+	 */
+	mmc_card_set_removed(card);
+
+	return err;
+}
+
 static const struct mmc_bus_ops mmc_ops = {
 	.remove = mmc_remove,
 	.detect = mmc_detect,
@@ -2314,6 +2377,7 @@ static const struct mmc_bus_ops mmc_ops = {
 	.hw_reset = _mmc_hw_reset,
 	.cache_enabled = _mmc_cache_enabled,
 	.flush_cache = _mmc_flush_cache,
+	.handle_undervoltage = _mmc_handle_undervoltage,
 };

From fc232394c09a0a50a1d33b231f4b579a58d4688f Mon Sep 17 00:00:00 2001
From: Jisheng Zhang
Date: Fri, 15 Aug 2025 09:33:36 +0800
Subject: [PATCH 0412/3206] mmc: sdhci: add some simple inline functions for !CONFIG_PM

In the next commits, we will switch to the modern PM macros.

Signed-off-by: Jisheng Zhang
Acked-by: Adrian Hunter
Link: https://lore.kernel.org/r/20250815013413.28641-2-jszhang@kernel.org
Signed-off-by: Ulf Hansson
---
 drivers/mmc/host/sdhci.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h
index 58fcbeaf281e02..b6a571d866fa53 100644
--- a/drivers/mmc/host/sdhci.h
+++ b/drivers/mmc/host/sdhci.h
@@ -880,6 +880,13 @@ int sdhci_suspend_host(struct sdhci_host *host);
 int sdhci_resume_host(struct sdhci_host *host);
 void sdhci_runtime_suspend_host(struct sdhci_host *host);
 void sdhci_runtime_resume_host(struct sdhci_host *host, int soft_reset);
+#else
+static inline bool sdhci_enable_irq_wakeups(struct sdhci_host *host) { return false; }
+static inline void sdhci_disable_irq_wakeups(struct sdhci_host *host) {}
+static inline int sdhci_suspend_host(struct sdhci_host *host) { return -EOPNOTSUPP; }
+static inline int sdhci_resume_host(struct sdhci_host *host) { return -EOPNOTSUPP; }
+static inline void sdhci_runtime_suspend_host(struct sdhci_host *host) {}
+static inline void sdhci_runtime_resume_host(struct sdhci_host *host, int soft_reset) {}
 #endif
 
 void sdhci_cqe_enable(struct mmc_host *mmc);

From fc39a30c6ddb122ae687e4360f3578eca6749276 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang
Date: Fri, 15 Aug 2025 09:33:37 +0800
Subject: [PATCH 0413/3206] mmc: sdhci-of-dwcmshc: use modern PM macros

Use the modern PM macros for the suspend and resume functions to be
automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP
are disabled, without having to use #ifdef guards. This has the
advantage of always compiling these functions in, independently of any
Kconfig option. Thanks to that, bugs and other regressions are
subsequently easier to catch.
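In condensed form, the pattern that this and the following conversions
apply (driver and function names here are placeholders, not actual code
from the series):

	static int foo_runtime_suspend(struct device *dev)
	{
		/* gate clocks, save context, ... */
		return 0;
	}

	static int foo_runtime_resume(struct device *dev)
	{
		/* ungate clocks, restore context, ... */
		return 0;
	}

	/* Always compiled; the linker discards them when unreferenced. */
	static const struct dev_pm_ops foo_pm_ops = {
		SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume)
		RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
	};

	static struct platform_driver foo_driver = {
		.driver = {
			.name	= "foo",
			.pm	= pm_ptr(&foo_pm_ops),	/* NULL when CONFIG_PM=n */
		},
	};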
Signed-off-by: Jisheng Zhang Tested-by: Drew Fustini # for TH1520 on LPi4a Link: https://lore.kernel.org/r/20250815013413.28641-3-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-dwcmshc.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/mmc/host/sdhci-of-dwcmshc.c b/drivers/mmc/host/sdhci-of-dwcmshc.c index ee6b1096f70921..eebd4538995663 100644 --- a/drivers/mmc/host/sdhci-of-dwcmshc.c +++ b/drivers/mmc/host/sdhci-of-dwcmshc.c @@ -1499,7 +1499,6 @@ static void dwcmshc_remove(struct platform_device *pdev) clk_bulk_disable_unprepare(priv->num_other_clks, priv->other_clks); } -#ifdef CONFIG_PM_SLEEP static int dwcmshc_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -1570,9 +1569,6 @@ static int dwcmshc_resume(struct device *dev) clk_disable_unprepare(pltfm_host->clk); return ret; } -#endif - -#ifdef CONFIG_PM static void dwcmshc_enable_card_clk(struct sdhci_host *host) { @@ -1603,12 +1599,9 @@ static int dwcmshc_runtime_resume(struct device *dev) return 0; } -#endif - static const struct dev_pm_ops dwcmshc_pmops = { - SET_SYSTEM_SLEEP_PM_OPS(dwcmshc_suspend, dwcmshc_resume) - SET_RUNTIME_PM_OPS(dwcmshc_runtime_suspend, - dwcmshc_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(dwcmshc_suspend, dwcmshc_resume) + RUNTIME_PM_OPS(dwcmshc_runtime_suspend, dwcmshc_runtime_resume, NULL) }; static struct platform_driver sdhci_dwcmshc_driver = { @@ -1617,7 +1610,7 @@ static struct platform_driver sdhci_dwcmshc_driver = { .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_dwcmshc_dt_ids, .acpi_match_table = ACPI_PTR(sdhci_dwcmshc_acpi_ids), - .pm = &dwcmshc_pmops, + .pm = pm_ptr(&dwcmshc_pmops), }, .probe = dwcmshc_probe, .remove = dwcmshc_remove, From a013431a806ed8d39b6f4b9ecbf3c4ce66581f5b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:38 +0800 Subject: [PATCH 0414/3206] mmc: sdhci-xenon: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-4-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-xenon.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/mmc/host/sdhci-xenon.c b/drivers/mmc/host/sdhci-xenon.c index b12bee8342bdd5..046e8100dd0899 100644 --- a/drivers/mmc/host/sdhci-xenon.c +++ b/drivers/mmc/host/sdhci-xenon.c @@ -622,7 +622,6 @@ static void xenon_remove(struct platform_device *pdev) clk_disable_unprepare(pltfm_host->clk); } -#ifdef CONFIG_PM_SLEEP static int xenon_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -635,9 +634,7 @@ static int xenon_suspend(struct device *dev) priv->restore_needed = true; return ret; } -#endif -#ifdef CONFIG_PM static int xenon_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -685,14 +682,10 @@ static int xenon_runtime_resume(struct device *dev) clk_disable_unprepare(pltfm_host->clk); return ret; } -#endif /* CONFIG_PM */ static const struct dev_pm_ops sdhci_xenon_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(xenon_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(xenon_runtime_suspend, - xenon_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(xenon_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(xenon_runtime_suspend, xenon_runtime_resume, NULL) }; static const struct of_device_id sdhci_xenon_dt_ids[] = { @@ -721,7 +714,7 @@ static struct platform_driver sdhci_xenon_driver = { .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_xenon_dt_ids, .acpi_match_table = ACPI_PTR(sdhci_xenon_acpi_ids), - .pm = &sdhci_xenon_dev_pm_ops, + .pm = pm_ptr(&sdhci_xenon_dev_pm_ops), }, .probe = xenon_probe, .remove = xenon_remove, From 4129dbbb53d7cec08a959aaaf3300394dfdc15cc Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:39 +0800 Subject: [PATCH 0415/3206] mmc: sdhci-pxav3: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-5-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pxav3.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c index 238f508f2fb0bd..d082c4e21aa9ed 100644 --- a/drivers/mmc/host/sdhci-pxav3.c +++ b/drivers/mmc/host/sdhci-pxav3.c @@ -523,7 +523,6 @@ static void sdhci_pxav3_remove(struct platform_device *pdev) clk_disable_unprepare(pxa->clk_core); } -#ifdef CONFIG_PM_SLEEP static int sdhci_pxav3_suspend(struct device *dev) { int ret; @@ -549,9 +548,7 @@ static int sdhci_pxav3_resume(struct device *dev) return ret; } -#endif -#ifdef CONFIG_PM static int sdhci_pxav3_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -583,12 +580,10 @@ static int sdhci_pxav3_runtime_resume(struct device *dev) sdhci_runtime_resume_host(host, 0); return 0; } -#endif static const struct dev_pm_ops sdhci_pxav3_pmops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_pxav3_suspend, sdhci_pxav3_resume) - SET_RUNTIME_PM_OPS(sdhci_pxav3_runtime_suspend, - sdhci_pxav3_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_pxav3_suspend, sdhci_pxav3_resume) + RUNTIME_PM_OPS(sdhci_pxav3_runtime_suspend, sdhci_pxav3_runtime_resume, NULL) }; static struct platform_driver sdhci_pxav3_driver = { @@ -596,7 +591,7 @@ static struct platform_driver sdhci_pxav3_driver = { .name = "sdhci-pxav3", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_pxav3_of_match), - .pm = &sdhci_pxav3_pmops, + .pm = pm_ptr(&sdhci_pxav3_pmops), }, .probe = sdhci_pxav3_probe, .remove = sdhci_pxav3_remove, From f98ad5c31c0fa02795ea005ddf2a4d4b38409127 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:40 +0800 Subject: [PATCH 0416/3206] mmc: sunxi: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-6-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sunxi-mmc.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index ee4a65b0a22dce..8dbcff53a63133 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -1495,7 +1495,6 @@ static void sunxi_mmc_remove(struct platform_device *pdev) dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma); } -#ifdef CONFIG_PM static int sunxi_mmc_runtime_resume(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); @@ -1530,14 +1529,10 @@ static int sunxi_mmc_runtime_suspend(struct device *dev) return 0; } -#endif static const struct dev_pm_ops sunxi_mmc_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(sunxi_mmc_runtime_suspend, - sunxi_mmc_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(sunxi_mmc_runtime_suspend, sunxi_mmc_runtime_resume, NULL) }; static struct platform_driver sunxi_mmc_driver = { @@ -1545,7 +1540,7 @@ static struct platform_driver sunxi_mmc_driver = { .name = "sunxi-mmc", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sunxi_mmc_of_match, - .pm = &sunxi_mmc_pm_ops, + .pm = pm_ptr(&sunxi_mmc_pm_ops), }, .probe = sunxi_mmc_probe, .remove = sunxi_mmc_remove, From 5d6f42f6c455aea0a32eb6444c21b05e4cd049a0 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:41 +0800 Subject: [PATCH 0417/3206] mmc: alcor: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
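For drivers that only implement system-sleep callbacks, the sleep-only
counterpart used here is (again with placeholder names; foo_suspend()
and foo_resume() are ordinary "int (struct device *dev)" callbacks as
in the sketch above):

	static DEFINE_SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);

	static struct platform_driver foo_driver = {
		.driver = {
			.name	= "foo",
			/* pm_sleep_ptr() is NULL when CONFIG_PM_SLEEP=n */
			.pm	= pm_sleep_ptr(&foo_pm_ops),
		},
	};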
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-7-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/alcor.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/alcor.c b/drivers/mmc/host/alcor.c index 288c3a91a0aff7..721db54739c165 100644 --- a/drivers/mmc/host/alcor.c +++ b/drivers/mmc/host/alcor.c @@ -1129,7 +1129,6 @@ static void alcor_pci_sdmmc_drv_remove(struct platform_device *pdev) mmc_remove_host(mmc); } -#ifdef CONFIG_PM_SLEEP static int alcor_pci_sdmmc_suspend(struct device *dev) { struct alcor_sdmmc_host *host = dev_get_drvdata(dev); @@ -1150,10 +1149,9 @@ static int alcor_pci_sdmmc_resume(struct device *dev) return 0; } -#endif /* CONFIG_PM_SLEEP */ -static SIMPLE_DEV_PM_OPS(alcor_mmc_pm_ops, alcor_pci_sdmmc_suspend, - alcor_pci_sdmmc_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(alcor_mmc_pm_ops, alcor_pci_sdmmc_suspend, + alcor_pci_sdmmc_resume); static const struct platform_device_id alcor_pci_sdmmc_ids[] = { { @@ -1171,7 +1169,7 @@ static struct platform_driver alcor_pci_sdmmc_driver = { .driver = { .name = DRV_NAME_ALCOR_PCI_SDMMC, .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &alcor_mmc_pm_ops + .pm = pm_sleep_ptr(&alcor_mmc_pm_ops), }, }; module_platform_driver(alcor_pci_sdmmc_driver); From ce4b13cb3001ccb2636ecee94febf81b031c32a2 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:42 +0800 Subject: [PATCH 0418/3206] mmc: atmel: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-8-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/atmel-mci.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 777342fb76576f..d1fbc6811563a3 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -2622,7 +2622,6 @@ static void atmci_remove(struct platform_device *pdev) pm_runtime_put_noidle(dev); } -#ifdef CONFIG_PM static int atmci_runtime_suspend(struct device *dev) { struct atmel_mci *host = dev_get_drvdata(dev); @@ -2642,12 +2641,10 @@ static int atmci_runtime_resume(struct device *dev) return clk_prepare_enable(host->mck); } -#endif static const struct dev_pm_ops atmci_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(atmci_runtime_suspend, atmci_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(atmci_runtime_suspend, atmci_runtime_resume, NULL) }; static struct platform_driver atmci_driver = { @@ -2657,7 +2654,7 @@ static struct platform_driver atmci_driver = { .name = "atmel_mci", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = atmci_dt_ids, - .pm = &atmci_dev_pm_ops, + .pm = pm_ptr(&atmci_dev_pm_ops), }, }; module_platform_driver(atmci_driver); From 3d3c95796a31808515c26279961ddc6e8a655254 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:43 +0800 Subject: [PATCH 0419/3206] mmc: au1xmmc: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. At the same time, replace the platform_driver's .suspend and .resume usage with modern device_driver's .pm usage. 
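A sketch of that second half of the conversion (placeholder names; the
struct is hypothetical). The legacy callbacks took a platform_device
and were set directly as .suspend/.resume on struct platform_driver;
the modern ones take a struct device, reach the same driver data via
dev_get_drvdata(), and are wired through dev_pm_ops:

	struct foo_host { void __iomem *base; };	/* hypothetical */

	static int foo_suspend(struct device *dev)
	{
		struct foo_host *host = dev_get_drvdata(dev);

		writel(0, host->base);	/* quiesce the controller (illustrative) */
		return 0;
	}

	static int foo_resume(struct device *dev)
	{
		struct foo_host *host = dev_get_drvdata(dev);

		writel(1, host->base);	/* re-initialize the controller (illustrative) */
		return 0;
	}

	static DEFINE_SIMPLE_DEV_PM_OPS(foo_pmops, foo_suspend, foo_resume);

	static struct platform_driver foo_driver = {
		.driver = {
			.name	= "foo",
			.pm	= pm_sleep_ptr(&foo_pmops),
		},
	};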
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-9-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/au1xmmc.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/mmc/host/au1xmmc.c b/drivers/mmc/host/au1xmmc.c index 85470773650d68..cc6e05f9b96faa 100644 --- a/drivers/mmc/host/au1xmmc.c +++ b/drivers/mmc/host/au1xmmc.c @@ -1150,10 +1150,9 @@ static void au1xmmc_remove(struct platform_device *pdev) } } -#ifdef CONFIG_PM -static int au1xmmc_suspend(struct platform_device *pdev, pm_message_t state) +static int au1xmmc_suspend(struct device *dev) { - struct au1xmmc_host *host = platform_get_drvdata(pdev); + struct au1xmmc_host *host = dev_get_drvdata(dev); __raw_writel(0, HOST_CONFIG2(host)); __raw_writel(0, HOST_CONFIG(host)); @@ -1164,27 +1163,24 @@ static int au1xmmc_suspend(struct platform_device *pdev, pm_message_t state) return 0; } -static int au1xmmc_resume(struct platform_device *pdev) +static int au1xmmc_resume(struct device *dev) { - struct au1xmmc_host *host = platform_get_drvdata(pdev); + struct au1xmmc_host *host = dev_get_drvdata(dev); au1xmmc_reset_controller(host); return 0; } -#else -#define au1xmmc_suspend NULL -#define au1xmmc_resume NULL -#endif + +static DEFINE_SIMPLE_DEV_PM_OPS(au1xmmc_pmops, au1xmmc_suspend, au1xmmc_resume); static struct platform_driver au1xmmc_driver = { .probe = au1xmmc_probe, .remove = au1xmmc_remove, - .suspend = au1xmmc_suspend, - .resume = au1xmmc_resume, .driver = { .name = DRIVER_NAME, .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .pm = pm_sleep_ptr(&au1xmmc_pmops), }, }; From 8f50276e733a2e4ca5c5adace8bfc80d9493af50 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:44 +0800 Subject: [PATCH 0420/3206] mmc: cb710-mmc: use modern PM macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. At the same time, replace the platform_driver's .suspend and .resume usage with modern device_driver's .pm usage. 
Signed-off-by: Jisheng Zhang Acked-by: Michał Mirosław Link: https://lore.kernel.org/r/20250815013413.28641-10-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/cb710-mmc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/mmc/host/cb710-mmc.c b/drivers/mmc/host/cb710-mmc.c index 448d2f9159eab4..31daec78749521 100644 --- a/drivers/mmc/host/cb710-mmc.c +++ b/drivers/mmc/host/cb710-mmc.c @@ -664,25 +664,25 @@ static const struct mmc_host_ops cb710_mmc_host = { .get_cd = cb710_mmc_get_cd, }; -#ifdef CONFIG_PM - -static int cb710_mmc_suspend(struct platform_device *pdev, pm_message_t state) +static int cb710_mmc_suspend(struct device *dev) { + struct platform_device *pdev = to_platform_device(dev); struct cb710_slot *slot = cb710_pdev_to_slot(pdev); cb710_mmc_enable_irq(slot, 0, ~0); return 0; } -static int cb710_mmc_resume(struct platform_device *pdev) +static int cb710_mmc_resume(struct device *dev) { + struct platform_device *pdev = to_platform_device(dev); struct cb710_slot *slot = cb710_pdev_to_slot(pdev); cb710_mmc_enable_irq(slot, 0, ~0); return 0; } -#endif /* CONFIG_PM */ +static DEFINE_SIMPLE_DEV_PM_OPS(cb710_mmc_pmops, cb710_mmc_suspend, cb710_mmc_resume); static int cb710_mmc_init(struct platform_device *pdev) { @@ -767,13 +767,12 @@ static void cb710_mmc_exit(struct platform_device *pdev) } static struct platform_driver cb710_mmc_driver = { - .driver.name = "cb710-mmc", + .driver = { + .name = "cb710-mmc", + .pm = pm_sleep_ptr(&cb710_mmc_pmops), + }, .probe = cb710_mmc_init, .remove = cb710_mmc_exit, -#ifdef CONFIG_PM - .suspend = cb710_mmc_suspend, - .resume = cb710_mmc_resume, -#endif }; module_platform_driver(cb710_mmc_driver); From aa18042bd1bd7a752ea9657074ee3c19d8666624 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:45 +0800 Subject: [PATCH 0421/3206] mmc: davinci_mmc: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-11-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/davinci_mmc.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c index 2a3c8058b0fbf3..2b7d6d9bcde514 100644 --- a/drivers/mmc/host/davinci_mmc.c +++ b/drivers/mmc/host/davinci_mmc.c @@ -1347,7 +1347,6 @@ static void davinci_mmcsd_remove(struct platform_device *pdev) clk_disable_unprepare(host->clk); } -#ifdef CONFIG_PM static int davinci_mmcsd_suspend(struct device *dev) { struct mmc_davinci_host *host = dev_get_drvdata(dev); @@ -1373,21 +1372,14 @@ static int davinci_mmcsd_resume(struct device *dev) return 0; } -static const struct dev_pm_ops davinci_mmcsd_pm = { - .suspend = davinci_mmcsd_suspend, - .resume = davinci_mmcsd_resume, -}; - -#define davinci_mmcsd_pm_ops (&davinci_mmcsd_pm) -#else -#define davinci_mmcsd_pm_ops NULL -#endif +static DEFINE_SIMPLE_DEV_PM_OPS(davinci_mmcsd_pm_ops, + davinci_mmcsd_suspend, davinci_mmcsd_resume); static struct platform_driver davinci_mmcsd_driver = { .driver = { .name = "davinci_mmc", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = davinci_mmcsd_pm_ops, + .pm = pm_sleep_ptr(&davinci_mmcsd_pm_ops), .of_match_table = davinci_mmc_dt_ids, }, .probe = davinci_mmcsd_probe, From 773a98f84dbb837b1f9a96be01826c769dc176b4 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:46 +0800 Subject: [PATCH 0422/3206] mmc: mmci: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-12-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/mmci.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 8367283647a9b2..e500051bd572f7 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -2516,7 +2516,6 @@ static void mmci_remove(struct amba_device *dev) } } -#ifdef CONFIG_PM static void mmci_save(struct mmci_host *host) { unsigned long flags; @@ -2581,12 +2580,10 @@ static int mmci_runtime_resume(struct device *dev) return 0; } -#endif static const struct dev_pm_ops mmci_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(mmci_runtime_suspend, mmci_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(mmci_runtime_suspend, mmci_runtime_resume, NULL) }; static const struct amba_id mmci_ids[] = { @@ -2675,7 +2672,7 @@ MODULE_DEVICE_TABLE(amba, mmci_ids); static struct amba_driver mmci_driver = { .drv = { .name = DRIVER_NAME, - .pm = &mmci_dev_pm_ops, + .pm = pm_ptr(&mmci_dev_pm_ops), .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, .probe = mmci_probe, From 79565d5b3019d007ef76ee3b94e9cdc7b49b38b2 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:47 +0800 Subject: [PATCH 0423/3206] mmc: mxs-mmc: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-13-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/mxs-mmc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c index a6e44e4061061e..7c7c52d9e8e72b 100644 --- a/drivers/mmc/host/mxs-mmc.c +++ b/drivers/mmc/host/mxs-mmc.c @@ -680,7 +680,6 @@ static void mxs_mmc_remove(struct platform_device *pdev) clk_disable_unprepare(ssp->clk); } -#ifdef CONFIG_PM_SLEEP static int mxs_mmc_suspend(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); @@ -699,9 +698,8 @@ static int mxs_mmc_resume(struct device *dev) return clk_prepare_enable(ssp->clk); } -#endif -static SIMPLE_DEV_PM_OPS(mxs_mmc_pm_ops, mxs_mmc_suspend, mxs_mmc_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(mxs_mmc_pm_ops, mxs_mmc_suspend, mxs_mmc_resume); static struct platform_driver mxs_mmc_driver = { .probe = mxs_mmc_probe, @@ -709,7 +707,7 @@ static struct platform_driver mxs_mmc_driver = { .driver = { .name = DRIVER_NAME, .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &mxs_mmc_pm_ops, + .pm = pm_sleep_ptr(&mxs_mmc_pm_ops), .of_match_table = mxs_mmc_dt_ids, }, }; From 8e8214d98b5ffdc0368112b3dd3a709cdbffcd19 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:48 +0800 Subject: [PATCH 0424/3206] mmc: omap_hsmmc: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. 
Thanks to that, bugs and other regressions are subsequently easier to catch. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-14-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/omap_hsmmc.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index adc0d0b6ae377e..09e4354d1f1db8 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -620,8 +620,6 @@ static void omap_hsmmc_set_bus_mode(struct omap_hsmmc_host *host) OMAP_HSMMC_WRITE(host->base, CON, con & ~OD); } -#ifdef CONFIG_PM - /* * Restore the MMC host context, if it was lost as result of a * power state change. @@ -689,6 +687,7 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host) return 0; } +#ifdef CONFIG_PM /* * Save the MMC host context (store the number of power state changes so far). */ @@ -1990,7 +1989,6 @@ static void omap_hsmmc_remove(struct platform_device *pdev) clk_disable_unprepare(host->dbclk); } -#ifdef CONFIG_PM_SLEEP static int omap_hsmmc_suspend(struct device *dev) { struct omap_hsmmc_host *host = dev_get_drvdata(dev); @@ -2032,9 +2030,7 @@ static int omap_hsmmc_resume(struct device *dev) pm_runtime_put_autosuspend(host->dev); return 0; } -#endif -#ifdef CONFIG_PM static int omap_hsmmc_runtime_suspend(struct device *dev) { struct omap_hsmmc_host *host; @@ -2102,11 +2098,10 @@ static int omap_hsmmc_runtime_resume(struct device *dev) spin_unlock_irqrestore(&host->irq_lock, flags); return 0; } -#endif static const struct dev_pm_ops omap_hsmmc_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(omap_hsmmc_suspend, omap_hsmmc_resume) - SET_RUNTIME_PM_OPS(omap_hsmmc_runtime_suspend, omap_hsmmc_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(omap_hsmmc_suspend, omap_hsmmc_resume) + RUNTIME_PM_OPS(omap_hsmmc_runtime_suspend, omap_hsmmc_runtime_resume, NULL) }; static struct platform_driver omap_hsmmc_driver = { @@ -2115,7 +2110,7 @@ static struct platform_driver omap_hsmmc_driver = { .driver = { .name = DRIVER_NAME, .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &omap_hsmmc_dev_pm_ops, + .pm = pm_ptr(&omap_hsmmc_dev_pm_ops), .of_match_table = of_match_ptr(omap_mmc_of_match), }, }; From 99a44c327f68db78f48343ba6ad48ef06d91fb6f Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:49 +0800 Subject: [PATCH 0425/3206] mmc: rtsx_usb_sdmmc: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-15-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/rtsx_usb_sdmmc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c index 70bd21084b977c..84674659a84d4a 100644 --- a/drivers/mmc/host/rtsx_usb_sdmmc.c +++ b/drivers/mmc/host/rtsx_usb_sdmmc.c @@ -1455,7 +1455,6 @@ static void rtsx_usb_sdmmc_drv_remove(struct platform_device *pdev) ": Realtek USB SD/MMC module has been removed\n"); } -#ifdef CONFIG_PM static int rtsx_usb_sdmmc_runtime_suspend(struct device *dev) { struct rtsx_usb_sdmmc *host = dev_get_drvdata(dev); @@ -1473,11 +1472,9 @@ static int rtsx_usb_sdmmc_runtime_resume(struct device *dev) mmc_detect_change(host->mmc, 0); return 0; } -#endif static const struct dev_pm_ops rtsx_usb_sdmmc_dev_pm_ops = { - SET_RUNTIME_PM_OPS(rtsx_usb_sdmmc_runtime_suspend, - rtsx_usb_sdmmc_runtime_resume, NULL) + RUNTIME_PM_OPS(rtsx_usb_sdmmc_runtime_suspend, rtsx_usb_sdmmc_runtime_resume, NULL) }; static const struct platform_device_id rtsx_usb_sdmmc_ids[] = { @@ -1496,7 +1493,7 @@ static struct platform_driver rtsx_usb_sdmmc_driver = { .driver = { .name = "rtsx_usb_sdmmc", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &rtsx_usb_sdmmc_dev_pm_ops, + .pm = pm_ptr(&rtsx_usb_sdmmc_dev_pm_ops), }, }; module_platform_driver(rtsx_usb_sdmmc_driver); From f94509c94254d932dd7b1e6730032f512476af0b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:50 +0800 Subject: [PATCH 0426/3206] mmc: sdhci-acpi: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
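The pm_ptr() and pm_sleep_ptr() helpers doing the work in these conversions are plain conditional pointers. Roughly, paraphrasing the kernel headers (see include/linux/pm.h for the authoritative definitions):

#define PTR_IF(cond, ptr)	((cond) ? (ptr) : NULL)
#define pm_ptr(_ptr)		PTR_IF(IS_ENABLED(CONFIG_PM), (_ptr))
#define pm_sleep_ptr(_ptr)	PTR_IF(IS_ENABLED(CONFIG_PM_SLEEP), (_ptr))

Because the disabled case still names the ops structure inside a constant-folded expression, the compiler considers the callbacks referenced and emits no "defined but not used" warnings, while dead-code elimination removes their bodies from the final image. pm_sleep_ptr() suits ops tables containing only system-sleep callbacks; pm_ptr() is the right guard when the table also carries runtime PM callbacks, as in the mmci and sdhci-acpi conversions above.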
Signed-off-by: Jisheng Zhang Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20250815013413.28641-16-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-acpi.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 2d46d4854fa1b6..84c7054607fcb9 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -973,8 +973,7 @@ static void sdhci_acpi_remove(struct platform_device *pdev) c->slot->free_slot(pdev); } -static void __maybe_unused sdhci_acpi_reset_signal_voltage_if_needed( - struct device *dev) +static void sdhci_acpi_reset_signal_voltage_if_needed(struct device *dev) { struct sdhci_acpi_host *c = dev_get_drvdata(dev); struct sdhci_host *host = c->host; @@ -989,8 +988,6 @@ static void __maybe_unused sdhci_acpi_reset_signal_voltage_if_needed( } } -#ifdef CONFIG_PM_SLEEP - static int sdhci_acpi_suspend(struct device *dev) { struct sdhci_acpi_host *c = dev_get_drvdata(dev); @@ -1017,10 +1014,6 @@ static int sdhci_acpi_resume(struct device *dev) return sdhci_resume_host(c->host); } -#endif - -#ifdef CONFIG_PM - static int sdhci_acpi_runtime_suspend(struct device *dev) { struct sdhci_acpi_host *c = dev_get_drvdata(dev); @@ -1045,12 +1038,9 @@ static int sdhci_acpi_runtime_resume(struct device *dev) return 0; } -#endif - static const struct dev_pm_ops sdhci_acpi_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_acpi_suspend, sdhci_acpi_resume) - SET_RUNTIME_PM_OPS(sdhci_acpi_runtime_suspend, - sdhci_acpi_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_acpi_suspend, sdhci_acpi_resume) + RUNTIME_PM_OPS(sdhci_acpi_runtime_suspend, sdhci_acpi_runtime_resume, NULL) }; static struct platform_driver sdhci_acpi_driver = { @@ -1058,7 +1048,7 @@ static struct platform_driver sdhci_acpi_driver = { .name = "sdhci-acpi", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .acpi_match_table = sdhci_acpi_ids, - .pm = &sdhci_acpi_pm_ops, + .pm = pm_ptr(&sdhci_acpi_pm_ops), }, .probe = sdhci_acpi_probe, .remove = sdhci_acpi_remove, From 5861ff20fda6aeddc837fd9e10d09e288e089565 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:51 +0800 Subject: [PATCH 0427/3206] mmc: sdhci_am654: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-17-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index ffc45930c240af..d235b0aecfdb12 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -1035,7 +1035,6 @@ static void sdhci_am654_remove(struct platform_device *pdev) pm_runtime_put_noidle(dev); } -#ifdef CONFIG_PM static int sdhci_am654_restore(struct sdhci_host *host) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1123,20 +1122,17 @@ static int sdhci_am654_runtime_resume(struct device *dev) return 0; } -#endif static const struct dev_pm_ops sdhci_am654_dev_pm_ops = { - SET_RUNTIME_PM_OPS(sdhci_am654_runtime_suspend, - sdhci_am654_runtime_resume, NULL) - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) + RUNTIME_PM_OPS(sdhci_am654_runtime_suspend, sdhci_am654_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) }; static struct platform_driver sdhci_am654_driver = { .driver = { .name = "sdhci-am654", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_am654_dev_pm_ops, + .pm = pm_ptr(&sdhci_am654_dev_pm_ops), .of_match_table = sdhci_am654_of_match, }, .probe = sdhci_am654_probe, From 1e4f1d8cc1211c87d5bc384c187174bed772d7d6 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:52 +0800 Subject: [PATCH 0428/3206] mmc: sdhci-brcmstb: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-18-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-brcmstb.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/sdhci-brcmstb.c b/drivers/mmc/host/sdhci-brcmstb.c index efc2f3bdc63158..15705e85417f40 100644 --- a/drivers/mmc/host/sdhci-brcmstb.c +++ b/drivers/mmc/host/sdhci-brcmstb.c @@ -496,7 +496,6 @@ static void sdhci_brcmstb_shutdown(struct platform_device *pdev) MODULE_DEVICE_TABLE(of, sdhci_brcm_of_match); -#ifdef CONFIG_PM_SLEEP static int sdhci_brcmstb_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -540,17 +539,14 @@ static int sdhci_brcmstb_resume(struct device *dev) return ret; } -#endif -static const struct dev_pm_ops sdhci_brcmstb_pmops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_brcmstb_suspend, sdhci_brcmstb_resume) -}; +static DEFINE_SIMPLE_DEV_PM_OPS(sdhci_brcmstb_pmops, sdhci_brcmstb_suspend, sdhci_brcmstb_resume); static struct platform_driver sdhci_brcmstb_driver = { .driver = { .name = "sdhci-brcmstb", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_brcmstb_pmops, + .pm = pm_sleep_ptr(&sdhci_brcmstb_pmops), .of_match_table = of_match_ptr(sdhci_brcm_of_match), }, .probe = sdhci_brcmstb_probe, From 1072eaf162d6247f10ac33d79c89cdb3ce775ccb Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:53 +0800 Subject: [PATCH 0429/3206] mmc: sdhci-esdhc-imx: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Reviewed-by: Haibo Chen Link: https://lore.kernel.org/r/20250815013413.28641-19-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-esdhc-imx.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index a040c0896a7b30..a7a5df673b0f6d 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -1650,7 +1650,6 @@ static void sdhci_esdhc_imx_hwinit(struct sdhci_host *host) } } -#ifdef CONFIG_PM_SLEEP static void sdhc_esdhc_tuning_save(struct sdhci_host *host) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1707,7 +1706,6 @@ static void sdhc_esdhc_tuning_restore(struct sdhci_host *host) host->ioaddr + ESDHC_TUNE_CTRL_STATUS); } } -#endif static void esdhc_cqe_enable(struct mmc_host *mmc) { @@ -2016,7 +2014,6 @@ static void sdhci_esdhc_imx_remove(struct platform_device *pdev) cpu_latency_qos_remove_request(&imx_data->pm_qos_req); } -#ifdef CONFIG_PM_SLEEP static int sdhci_esdhc_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -2112,9 +2109,7 @@ static int sdhci_esdhc_resume(struct device *dev) return ret; } -#endif -#ifdef CONFIG_PM static int sdhci_esdhc_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -2188,12 +2183,10 @@ static int sdhci_esdhc_runtime_resume(struct device *dev) cpu_latency_qos_remove_request(&imx_data->pm_qos_req); return err; } -#endif static const struct dev_pm_ops sdhci_esdhc_pmops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_esdhc_suspend, sdhci_esdhc_resume) - SET_RUNTIME_PM_OPS(sdhci_esdhc_runtime_suspend, - sdhci_esdhc_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_esdhc_suspend, sdhci_esdhc_resume) + RUNTIME_PM_OPS(sdhci_esdhc_runtime_suspend, sdhci_esdhc_runtime_resume, NULL) }; static struct platform_driver sdhci_esdhc_imx_driver = { @@ -2201,7 +2194,7 @@ static struct platform_driver sdhci_esdhc_imx_driver = { .name = "sdhci-esdhc-imx", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = imx_esdhc_dt_ids, - .pm = &sdhci_esdhc_pmops, + .pm = pm_ptr(&sdhci_esdhc_pmops), }, .probe = sdhci_esdhc_imx_probe, .remove = sdhci_esdhc_imx_remove, From bb7b1709873facacd9f1267c3e79a578d55c3699 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:54 +0800 Subject: [PATCH 0430/3206] mmc: sdhci-of-arasan: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-20-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-arasan.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c index 60dbc815e5019f..c6f09b53325d2d 100644 --- a/drivers/mmc/host/sdhci-of-arasan.c +++ b/drivers/mmc/host/sdhci-of-arasan.c @@ -605,7 +605,6 @@ static const struct sdhci_pltfm_data sdhci_arasan_cqe_pdata = { SDHCI_QUIRK2_CLOCK_DIV_ZERO_BROKEN, }; -#ifdef CONFIG_PM_SLEEP /** * sdhci_arasan_suspend - Suspend method for the driver * @dev: Address of the device structure @@ -699,10 +698,9 @@ static int sdhci_arasan_resume(struct device *dev) return 0; } -#endif /* ! CONFIG_PM_SLEEP */ -static SIMPLE_DEV_PM_OPS(sdhci_arasan_dev_pm_ops, sdhci_arasan_suspend, - sdhci_arasan_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(sdhci_arasan_dev_pm_ops, sdhci_arasan_suspend, + sdhci_arasan_resume); /** * sdhci_arasan_sdcardclk_recalc_rate - Return the card clock rate @@ -2080,7 +2078,7 @@ static struct platform_driver sdhci_arasan_driver = { .name = "sdhci-arasan", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_arasan_of_match, - .pm = &sdhci_arasan_dev_pm_ops, + .pm = pm_sleep_ptr(&sdhci_arasan_dev_pm_ops), }, .probe = sdhci_arasan_probe, .remove = sdhci_arasan_remove, From 6b66e69da2df7472d6fc6feca2b8c5e25d3857bb Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:55 +0800 Subject: [PATCH 0431/3206] mmc: sdhci-of-at91: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-21-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-at91.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c index 1ba2effaf376af..7c4ac65f247d39 100644 --- a/drivers/mmc/host/sdhci-of-at91.c +++ b/drivers/mmc/host/sdhci-of-at91.c @@ -229,7 +229,6 @@ static int sdhci_at91_set_clks_presets(struct device *dev) return 0; } -#ifdef CONFIG_PM_SLEEP static int sdhci_at91_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -243,9 +242,7 @@ static int sdhci_at91_suspend(struct device *dev) return ret; } -#endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_PM static int sdhci_at91_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -302,13 +299,10 @@ static int sdhci_at91_runtime_resume(struct device *dev) sdhci_runtime_resume_host(host, 0); return 0; } -#endif /* CONFIG_PM */ static const struct dev_pm_ops sdhci_at91_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_at91_suspend, pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(sdhci_at91_runtime_suspend, - sdhci_at91_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_at91_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(sdhci_at91_runtime_suspend, sdhci_at91_runtime_resume, NULL) }; static int sdhci_at91_probe(struct platform_device *pdev) @@ -460,7 +454,7 @@ static struct platform_driver sdhci_at91_driver = { .name = "sdhci-at91", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_at91_dt_match, - .pm = &sdhci_at91_dev_pm_ops, + .pm = pm_ptr(&sdhci_at91_dev_pm_ops), }, .probe = sdhci_at91_probe, .remove = sdhci_at91_remove, From 7340c260d0f4ffc2f4068f9c80d1d198f1df9cad Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:56 +0800 Subject: [PATCH 0432/3206] mmc: sdhci-of-esdhc: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-22-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-esdhc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index c6ee0099ead096..8345e2c5a03413 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -1234,7 +1234,6 @@ static u32 esdhc_irq(struct sdhci_host *host, u32 intmask) return intmask; } -#ifdef CONFIG_PM_SLEEP static u32 esdhc_proctl; static int esdhc_of_suspend(struct device *dev) { @@ -1260,11 +1259,8 @@ static int esdhc_of_resume(struct device *dev) } return ret; } -#endif -static SIMPLE_DEV_PM_OPS(esdhc_of_dev_pm_ops, - esdhc_of_suspend, - esdhc_of_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(esdhc_of_dev_pm_ops, esdhc_of_suspend, esdhc_of_resume); static const struct sdhci_ops sdhci_esdhc_be_ops = { .read_l = esdhc_be_readl, @@ -1511,7 +1507,7 @@ static struct platform_driver sdhci_esdhc_driver = { .name = "sdhci-esdhc", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_esdhc_of_match, - .pm = &esdhc_of_dev_pm_ops, + .pm = pm_sleep_ptr(&esdhc_of_dev_pm_ops), }, .probe = sdhci_esdhc_probe, .remove = sdhci_pltfm_remove, From d8d84cf91a1bbfd117649cb677cadcfcb5ed9e65 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:57 +0800 Subject: [PATCH 0433/3206] mmc: sdhci-omap: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-23-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-omap.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/mmc/host/sdhci-omap.c b/drivers/mmc/host/sdhci-omap.c index cdb09605e009aa..b5d7c1a80a92f2 100644 --- a/drivers/mmc/host/sdhci-omap.c +++ b/drivers/mmc/host/sdhci-omap.c @@ -1400,8 +1400,7 @@ static void sdhci_omap_remove(struct platform_device *pdev) pm_runtime_force_suspend(dev); } -#ifdef CONFIG_PM -static void __maybe_unused sdhci_omap_context_save(struct sdhci_omap_host *omap_host) +static void sdhci_omap_context_save(struct sdhci_omap_host *omap_host) { omap_host->con = sdhci_omap_readl(omap_host, SDHCI_OMAP_CON); omap_host->hctl = sdhci_omap_readl(omap_host, SDHCI_OMAP_HCTL); @@ -1412,7 +1411,7 @@ static void __maybe_unused sdhci_omap_context_save(struct sdhci_omap_host *omap_ } /* Order matters here, HCTL must be restored in two phases */ -static void __maybe_unused sdhci_omap_context_restore(struct sdhci_omap_host *omap_host) +static void sdhci_omap_context_restore(struct sdhci_omap_host *omap_host) { sdhci_omap_writel(omap_host, SDHCI_OMAP_HCTL, omap_host->hctl); sdhci_omap_writel(omap_host, SDHCI_OMAP_CAPA, omap_host->capa); @@ -1424,7 +1423,7 @@ static void __maybe_unused sdhci_omap_context_restore(struct sdhci_omap_host *om sdhci_omap_writel(omap_host, SDHCI_OMAP_ISE, omap_host->ise); } -static int __maybe_unused sdhci_omap_runtime_suspend(struct device *dev) +static int sdhci_omap_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1443,7 +1442,7 @@ static int __maybe_unused sdhci_omap_runtime_suspend(struct device *dev) return 0; } -static int __maybe_unused sdhci_omap_runtime_resume(struct device *dev) +static int sdhci_omap_runtime_resume(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1458,13 +1457,10 @@ static int __maybe_unused sdhci_omap_runtime_resume(struct device *dev) return 0; } -#endif static const struct dev_pm_ops sdhci_omap_dev_pm_ops = { - SET_RUNTIME_PM_OPS(sdhci_omap_runtime_suspend, - sdhci_omap_runtime_resume, NULL) - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) + RUNTIME_PM_OPS(sdhci_omap_runtime_suspend, sdhci_omap_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) }; static struct platform_driver sdhci_omap_driver = { @@ -1473,7 +1469,7 @@ static struct platform_driver sdhci_omap_driver = { .driver = { .name = "sdhci-omap", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_omap_dev_pm_ops, + .pm = pm_ptr(&sdhci_omap_dev_pm_ops), .of_match_table = omap_sdhci_match, }, }; From 1c6316a5ce4ab6775d24f4a0aa24c5cfe274e2b8 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:58 +0800 Subject: [PATCH 0434/3206] mmc: sdhci-cadence: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-24-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-cadence.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/sdhci-cadence.c b/drivers/mmc/host/sdhci-cadence.c index eaa88897a256c0..435603c8c00b2a 100644 --- a/drivers/mmc/host/sdhci-cadence.c +++ b/drivers/mmc/host/sdhci-cadence.c @@ -611,7 +611,6 @@ static int sdhci_cdns_probe(struct platform_device *pdev) return sdhci_add_host(host); } -#ifdef CONFIG_PM_SLEEP static int sdhci_cdns_resume(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -638,11 +637,8 @@ static int sdhci_cdns_resume(struct device *dev) return ret; } -#endif -static const struct dev_pm_ops sdhci_cdns_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_pltfm_suspend, sdhci_cdns_resume) -}; +static DEFINE_SIMPLE_DEV_PM_OPS(sdhci_cdns_pm_ops, sdhci_pltfm_suspend, sdhci_cdns_resume); static const struct of_device_id sdhci_cdns_match[] = { { @@ -666,7 +662,7 @@ static struct platform_driver sdhci_cdns_driver = { .driver = { .name = "sdhci-cdns", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_cdns_pm_ops, + .pm = pm_sleep_ptr(&sdhci_cdns_pm_ops), .of_match_table = sdhci_cdns_match, }, .probe = sdhci_cdns_probe, From 550c5a8d9a48ee99e0737e14418ceb7d0fa28d59 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:33:59 +0800 Subject: [PATCH 0435/3206] mmc: sdhci-s3c: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-25-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-s3c.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c index 40857fc2e21b4a..6bf66aaa86a6df 100644 --- a/drivers/mmc/host/sdhci-s3c.c +++ b/drivers/mmc/host/sdhci-s3c.c @@ -681,7 +681,6 @@ static void sdhci_s3c_remove(struct platform_device *pdev) clk_disable_unprepare(sc->clk_io); } -#ifdef CONFIG_PM_SLEEP static int sdhci_s3c_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -698,9 +697,7 @@ static int sdhci_s3c_resume(struct device *dev) return sdhci_resume_host(host); } -#endif -#ifdef CONFIG_PM static int sdhci_s3c_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -730,12 +727,10 @@ static int sdhci_s3c_runtime_resume(struct device *dev) sdhci_runtime_resume_host(host, 0); return 0; } -#endif static const struct dev_pm_ops sdhci_s3c_pmops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_s3c_suspend, sdhci_s3c_resume) - SET_RUNTIME_PM_OPS(sdhci_s3c_runtime_suspend, sdhci_s3c_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_s3c_suspend, sdhci_s3c_resume) + RUNTIME_PM_OPS(sdhci_s3c_runtime_suspend, sdhci_s3c_runtime_resume, NULL) }; static const struct platform_device_id sdhci_s3c_driver_ids[] = { @@ -770,7 +765,7 @@ static struct platform_driver sdhci_s3c_driver = { .name = "s3c-sdhci", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_s3c_dt_match), - .pm = &sdhci_s3c_pmops, + .pm = pm_ptr(&sdhci_s3c_pmops), }, }; From 02517359c9da7f07c2bafd1e152866142ac1d709 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:00 +0800 Subject: [PATCH 0436/3206] mmc: sdhci-spear: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20250815013413.28641-26-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-spear.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c index fa0f8aeb7ee0e8..72d21dc0cb698a 100644 --- a/drivers/mmc/host/sdhci-spear.c +++ b/drivers/mmc/host/sdhci-spear.c @@ -130,7 +130,6 @@ static void sdhci_remove(struct platform_device *pdev) clk_disable_unprepare(sdhci->clk); } -#ifdef CONFIG_PM_SLEEP static int sdhci_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -161,9 +160,8 @@ static int sdhci_resume(struct device *dev) return sdhci_resume_host(host); } -#endif -static SIMPLE_DEV_PM_OPS(sdhci_pm_ops, sdhci_suspend, sdhci_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(sdhci_pm_ops, sdhci_suspend, sdhci_resume); static const struct of_device_id sdhci_spear_id_table[] = { { .compatible = "st,spear300-sdhci" }, @@ -175,7 +173,7 @@ static struct platform_driver sdhci_driver = { .driver = { .name = "sdhci", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_pm_ops, + .pm = pm_sleep_ptr(&sdhci_pm_ops), .of_match_table = sdhci_spear_id_table, }, .probe = sdhci_probe, From 693223205b27e3876eba103fa13bf6ed35197067 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:01 +0800 Subject: [PATCH 0437/3206] mmc: sdhci-sprd: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-27-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-sprd.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/sdhci-sprd.c b/drivers/mmc/host/sdhci-sprd.c index fe2fe52b23b27a..3584a2b314a9f9 100644 --- a/drivers/mmc/host/sdhci-sprd.c +++ b/drivers/mmc/host/sdhci-sprd.c @@ -903,7 +903,6 @@ static const struct of_device_id sdhci_sprd_of_match[] = { }; MODULE_DEVICE_TABLE(of, sdhci_sprd_of_match); -#ifdef CONFIG_PM static int sdhci_sprd_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -950,13 +949,10 @@ static int sdhci_sprd_runtime_resume(struct device *dev) return ret; } -#endif static const struct dev_pm_ops sdhci_sprd_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(sdhci_sprd_runtime_suspend, - sdhci_sprd_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(sdhci_sprd_runtime_suspend, sdhci_sprd_runtime_resume, NULL) }; static struct platform_driver sdhci_sprd_driver = { @@ -966,7 +962,7 @@ static struct platform_driver sdhci_sprd_driver = { .name = "sdhci_sprd_r11", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_sprd_of_match, - .pm = &sdhci_sprd_pm_ops, + .pm = pm_ptr(&sdhci_sprd_pm_ops), }, }; module_platform_driver(sdhci_sprd_driver); From dc8b0e5766804c58d02c1caf6e11c7db718c18f1 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:02 +0800 Subject: [PATCH 0438/3206] mmc: sdhci-st: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Reviewed-by: Patrice Chotard Link: https://lore.kernel.org/r/20250815013413.28641-28-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-st.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/sdhci-st.c b/drivers/mmc/host/sdhci-st.c index 9157342ff7a4bb..bf668580513778 100644 --- a/drivers/mmc/host/sdhci-st.c +++ b/drivers/mmc/host/sdhci-st.c @@ -445,7 +445,6 @@ static void sdhci_st_remove(struct platform_device *pdev) reset_control_assert(rstc); } -#ifdef CONFIG_PM_SLEEP static int sdhci_st_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -492,9 +491,8 @@ static int sdhci_st_resume(struct device *dev) return sdhci_resume_host(host); } -#endif -static SIMPLE_DEV_PM_OPS(sdhci_st_pmops, sdhci_st_suspend, sdhci_st_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(sdhci_st_pmops, sdhci_st_suspend, sdhci_st_resume); static const struct of_device_id st_sdhci_match[] = { { .compatible = "st,sdhci" }, @@ -509,7 +507,7 @@ static struct platform_driver sdhci_st_driver = { .driver = { .name = "sdhci-st", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_st_pmops, + .pm = pm_sleep_ptr(&sdhci_st_pmops), .of_match_table = st_sdhci_match, }, }; From 7aa59bfe06875cc4c3cb5c04eed2cdf6f6288153 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:03 +0800 Subject: [PATCH 0439/3206] mmc: sdhci-tegra: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-29-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-tegra.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index c811297185d8f8..820ce4dae58bac 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -1831,7 +1831,7 @@ static void sdhci_tegra_remove(struct platform_device *pdev) clk_disable_unprepare(tegra_host->tmclk); } -static int __maybe_unused sdhci_tegra_runtime_suspend(struct device *dev) +static int sdhci_tegra_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1841,7 +1841,7 @@ static int __maybe_unused sdhci_tegra_runtime_resume(struct device *dev) +static int sdhci_tegra_runtime_resume(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1849,7 +1849,6 @@ static int __maybe_unused sdhci_tegra_runtime_resume(struct device *dev) return clk_prepare_enable(pltfm_host->clk); } -#ifdef CONFIG_PM_SLEEP static int sdhci_tegra_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -1910,12 +1909,10 @@ static int sdhci_tegra_resume(struct device *dev) pm_runtime_force_suspend(dev); return ret; } -#endif static const struct dev_pm_ops sdhci_tegra_dev_pm_ops = { - SET_RUNTIME_PM_OPS(sdhci_tegra_runtime_suspend, sdhci_tegra_runtime_resume, - NULL) - SET_SYSTEM_SLEEP_PM_OPS(sdhci_tegra_suspend, sdhci_tegra_resume) + RUNTIME_PM_OPS(sdhci_tegra_runtime_suspend, sdhci_tegra_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_tegra_suspend, sdhci_tegra_resume) }; static struct platform_driver sdhci_tegra_driver = { .driver = { .name = "sdhci-tegra", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_tegra_dt_match, - .pm = &sdhci_tegra_dev_pm_ops, + .pm = pm_ptr(&sdhci_tegra_dev_pm_ops), }, .probe = sdhci_tegra_probe, .remove = sdhci_tegra_remove, From b2af65aee264a496c766ea595ecdb3f2c610b8b5 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:04 +0800 Subject: [PATCH 0440/3206] mmc: sh_mmcif: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch.
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-30-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mmcif.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 19f84584ecfa0e..413c34585c90d5 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -1568,7 +1568,6 @@ static void sh_mmcif_remove(struct platform_device *pdev) pm_runtime_disable(&pdev->dev); } -#ifdef CONFIG_PM_SLEEP static int sh_mmcif_suspend(struct device *dev) { struct sh_mmcif_host *host = dev_get_drvdata(dev); @@ -1584,11 +1583,8 @@ static int sh_mmcif_resume(struct device *dev) { return 0; } -#endif -static const struct dev_pm_ops sh_mmcif_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(sh_mmcif_suspend, sh_mmcif_resume) -}; +static DEFINE_SIMPLE_DEV_PM_OPS(sh_mmcif_dev_pm_ops, sh_mmcif_suspend, sh_mmcif_resume); static struct platform_driver sh_mmcif_driver = { .probe = sh_mmcif_probe, @@ -1596,7 +1592,7 @@ static struct platform_driver sh_mmcif_driver = { .driver = { .name = DRIVER_NAME, .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sh_mmcif_dev_pm_ops, + .pm = pm_sleep_ptr(&sh_mmcif_dev_pm_ops), .of_match_table = sh_mmcif_of_match, }, }; From a120f2175d0f10e985fd7eb6007119f8ed53e083 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:05 +0800 Subject: [PATCH 0441/3206] mmc: toshsd: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-31-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/toshsd.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/toshsd.c b/drivers/mmc/host/toshsd.c index e5f7f8abafc055..aa5d2511a62b8f 100644 --- a/drivers/mmc/host/toshsd.c +++ b/drivers/mmc/host/toshsd.c @@ -567,7 +567,6 @@ static void toshsd_powerdown(struct toshsd_host *host) pci_write_config_byte(host->pdev, SD_PCICFG_CLKSTOP, 0); } -#ifdef CONFIG_PM_SLEEP static int toshsd_pm_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); @@ -599,7 +598,6 @@ static int toshsd_pm_resume(struct device *dev) return 0; } -#endif /* CONFIG_PM_SLEEP */ static int toshsd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -688,16 +686,14 @@ static void toshsd_remove(struct pci_dev *pdev) pci_disable_device(pdev); } -static const struct dev_pm_ops toshsd_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(toshsd_pm_suspend, toshsd_pm_resume) -}; +static DEFINE_SIMPLE_DEV_PM_OPS(toshsd_pm_ops, toshsd_pm_suspend, toshsd_pm_resume); static struct pci_driver toshsd_driver = { .name = DRIVER_NAME, .id_table = pci_ids, .probe = toshsd_probe, .remove = toshsd_remove, - .driver.pm = &toshsd_pm_ops, + .driver.pm = pm_sleep_ptr(&toshsd_pm_ops), }; module_pci_driver(toshsd_driver); From 8a40405a5f18adb6065762b651bac9a71aa1119c Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:06 +0800 Subject: [PATCH 0442/3206] mmc: wmt-sdmmc: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-32-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/wmt-sdmmc.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/mmc/host/wmt-sdmmc.c b/drivers/mmc/host/wmt-sdmmc.c index 0d2929cfe39756..1b1d691e19fcc7 100644 --- a/drivers/mmc/host/wmt-sdmmc.c +++ b/drivers/mmc/host/wmt-sdmmc.c @@ -911,7 +911,6 @@ static void wmt_mci_remove(struct platform_device *pdev) dev_info(&pdev->dev, "WMT MCI device removed\n"); } -#ifdef CONFIG_PM static int wmt_mci_suspend(struct device *dev) { u32 reg_tmp; @@ -963,18 +962,7 @@ static int wmt_mci_resume(struct device *dev) return 0; } -static const struct dev_pm_ops wmt_mci_pm = { - .suspend = wmt_mci_suspend, - .resume = wmt_mci_resume, -}; - -#define wmt_mci_pm_ops (&wmt_mci_pm) - -#else /* !CONFIG_PM */ - -#define wmt_mci_pm_ops NULL - -#endif +static DEFINE_SIMPLE_DEV_PM_OPS(wmt_mci_pm_ops, wmt_mci_suspend, wmt_mci_resume); static struct platform_driver wmt_mci_driver = { .probe = wmt_mci_probe, @@ -982,7 +970,7 @@ static struct platform_driver wmt_mci_driver = { .driver = { .name = DRIVER_NAME, .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = wmt_mci_pm_ops, + .pm = pm_sleep_ptr(&wmt_mci_pm_ops), .of_match_table = wmt_mci_dt_ids, }, }; From 2f7c7a18c352bf5fc9c44a034162be6dfd242f31 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:07 +0800 Subject: [PATCH 0443/3206] mmc: mtk-sd: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use __maybe_unused. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-33-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index d7020e06dd55aa..79074291e9d22e 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -3278,7 +3278,7 @@ static void msdc_restore_reg(struct msdc_host *host) __msdc_enable_sdio_irq(host, 1); } -static int __maybe_unused msdc_runtime_suspend(struct device *dev) +static int msdc_runtime_suspend(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct msdc_host *host = mmc_priv(mmc); @@ -3300,7 +3300,7 @@ static int __maybe_unused msdc_runtime_suspend(struct device *dev) return 0; } -static int __maybe_unused msdc_runtime_resume(struct device *dev) +static int msdc_runtime_resume(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct msdc_host *host = mmc_priv(mmc); @@ -3323,7 +3323,7 @@ static int __maybe_unused msdc_runtime_resume(struct device *dev) return 0; } -static int __maybe_unused msdc_suspend(struct device *dev) +static int msdc_suspend(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct msdc_host *host = mmc_priv(mmc); @@ -3348,7 +3348,7 @@ static int __maybe_unused msdc_suspend(struct device *dev) return pm_runtime_force_suspend(dev); } -static int __maybe_unused msdc_resume(struct device *dev) +static int msdc_resume(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct msdc_host *host = mmc_priv(mmc); @@ -3360,8 +3360,8 @@ static int __maybe_unused msdc_resume(struct device *dev) } static const struct dev_pm_ops msdc_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(msdc_suspend, msdc_resume) - 
SET_RUNTIME_PM_OPS(msdc_runtime_suspend, msdc_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(msdc_suspend, msdc_resume) + RUNTIME_PM_OPS(msdc_runtime_suspend, msdc_runtime_resume, NULL) }; static struct platform_driver mt_msdc_driver = { @@ -3371,7 +3371,7 @@ static struct platform_driver mt_msdc_driver = { .name = "mtk-msdc", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = msdc_of_ids, - .pm = &msdc_dev_pm_ops, + .pm = pm_ptr(&msdc_dev_pm_ops), }, }; From 1d955e4affecc6e14d04e77741c53c9bf7095b5e Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:08 +0800 Subject: [PATCH 0444/3206] mmc: sdhci-msm: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use __maybe_unused. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-34-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 9d8e20dc8ca11a..d2906bf6e598c3 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -1943,7 +1943,7 @@ static void sdhci_msm_ice_enable(struct sdhci_msm_host *msm_host) qcom_ice_enable(msm_host->ice); } -static __maybe_unused int sdhci_msm_ice_resume(struct sdhci_msm_host *msm_host) +static int sdhci_msm_ice_resume(struct sdhci_msm_host *msm_host) { if (msm_host->mmc->caps2 & MMC_CAP2_CRYPTO) return qcom_ice_resume(msm_host->ice); @@ -1951,7 +1951,7 @@ static __maybe_unused int sdhci_msm_ice_resume(struct sdhci_msm_host *msm_host) return 0; } -static __maybe_unused int sdhci_msm_ice_suspend(struct sdhci_msm_host *msm_host) +static int sdhci_msm_ice_suspend(struct sdhci_msm_host *msm_host) { if (msm_host->mmc->caps2 & MMC_CAP2_CRYPTO) return qcom_ice_suspend(msm_host->ice); @@ -2011,13 +2011,13 @@ static inline void sdhci_msm_ice_enable(struct sdhci_msm_host *msm_host) { } -static inline __maybe_unused int +static inline int sdhci_msm_ice_resume(struct sdhci_msm_host *msm_host) { return 0; } -static inline __maybe_unused int +static inline int sdhci_msm_ice_suspend(struct sdhci_msm_host *msm_host) { return 0; @@ -2801,7 +2801,7 @@ static void sdhci_msm_remove(struct platform_device *pdev) clk_disable_unprepare(msm_host->bus_clk); } -static __maybe_unused int sdhci_msm_runtime_suspend(struct device *dev) +static int sdhci_msm_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -2820,7 +2820,7 @@ static __maybe_unused int sdhci_msm_runtime_suspend(struct device *dev) return sdhci_msm_ice_suspend(msm_host); } -static __maybe_unused int sdhci_msm_runtime_resume(struct device *dev) +static int sdhci_msm_runtime_resume(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -2856,11 +2856,8 @@ static __maybe_unused int sdhci_msm_runtime_resume(struct device *dev) } static const struct dev_pm_ops sdhci_msm_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(sdhci_msm_runtime_suspend, - sdhci_msm_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(sdhci_msm_runtime_suspend, sdhci_msm_runtime_resume, NULL) }; static struct platform_driver 
sdhci_msm_driver = { @@ -2869,7 +2866,7 @@ static struct platform_driver sdhci_msm_driver = { .driver = { .name = "sdhci_msm", .of_match_table = sdhci_msm_dt_match, - .pm = &sdhci_msm_pm_ops, + .pm = pm_ptr(&sdhci_msm_pm_ops), .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; From 9ae88dc98d93fc8d283e85d7d6dd4091af7e5086 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:09 +0800 Subject: [PATCH 0445/3206] mmc: via-sdmmc: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use __maybe_unused. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-35-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/via-sdmmc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/via-sdmmc.c b/drivers/mmc/host/via-sdmmc.c index 3bd49f64899d62..c628b3bbfd7aad 100644 --- a/drivers/mmc/host/via-sdmmc.c +++ b/drivers/mmc/host/via-sdmmc.c @@ -1218,7 +1218,7 @@ static void via_sd_remove(struct pci_dev *pcidev) pci_name(pcidev), (int)pcidev->vendor, (int)pcidev->device); } -static void __maybe_unused via_init_sdc_pm(struct via_crdr_mmc_host *host) +static void via_init_sdc_pm(struct via_crdr_mmc_host *host) { struct sdhcreg *pm_sdhcreg; void __iomem *addrbase; @@ -1252,7 +1252,7 @@ static void __maybe_unused via_init_sdc_pm(struct via_crdr_mmc_host *host) via_print_sdchc(host); } -static int __maybe_unused via_sd_suspend(struct device *dev) +static int via_sd_suspend(struct device *dev) { struct via_crdr_mmc_host *host; unsigned long flags; @@ -1269,7 +1269,7 @@ static int __maybe_unused via_sd_suspend(struct device *dev) return 0; } -static int __maybe_unused via_sd_resume(struct device *dev) +static int via_sd_resume(struct device *dev) { struct via_crdr_mmc_host *sdhost; u8 gatt; @@ -1295,14 +1295,14 @@ static int __maybe_unused via_sd_resume(struct device *dev) return 0; } -static SIMPLE_DEV_PM_OPS(via_sd_pm_ops, via_sd_suspend, via_sd_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(via_sd_pm_ops, via_sd_suspend, via_sd_resume); static struct pci_driver via_sd_driver = { .name = DRV_NAME, .id_table = via_ids, .probe = via_sd_probe, .remove = via_sd_remove, - .driver.pm = &via_sd_pm_ops, + .driver.pm = pm_sleep_ptr(&via_sd_pm_ops), }; module_pci_driver(via_sd_driver); From a01c660c8f7da8d4c0c220aa7880471a9c1921ef Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:10 +0800 Subject: [PATCH 0446/3206] mmc: dw_mmc: exynos: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-36-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc-exynos.c | 13 +++---------- drivers/mmc/host/dw_mmc.h | 3 +++ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/mmc/host/dw_mmc-exynos.c b/drivers/mmc/host/dw_mmc-exynos.c index e3548408ca392c..384609671a9ab9 100644 --- a/drivers/mmc/host/dw_mmc-exynos.c +++ b/drivers/mmc/host/dw_mmc-exynos.c @@ -189,7 +189,6 @@ static void dw_mci_exynos_set_clksel_timing(struct dw_mci *host, u32 timing) set_bit(DW_MMC_CARD_NO_USE_HOLD, &host->slot->flags); } -#ifdef CONFIG_PM static int dw_mci_exynos_runtime_resume(struct device *dev) { struct dw_mci *host = dev_get_drvdata(dev); @@ -203,9 +202,7 @@ static int dw_mci_exynos_runtime_resume(struct device *dev) return ret; } -#endif /* CONFIG_PM */ -#ifdef CONFIG_PM_SLEEP /** * dw_mci_exynos_suspend_noirq - Exynos-specific suspend code * @dev: Device to suspend (this device) @@ -265,7 +262,6 @@ static int dw_mci_exynos_resume_noirq(struct device *dev) return 0; } -#endif /* CONFIG_PM_SLEEP */ static void dw_mci_exynos_config_hs400(struct dw_mci *host, u32 timing) { @@ -712,11 +708,8 @@ static void dw_mci_exynos_remove(struct platform_device *pdev) } static const struct dev_pm_ops dw_mci_exynos_pmops = { - SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(dw_mci_exynos_suspend_noirq, - dw_mci_exynos_resume_noirq) - SET_RUNTIME_PM_OPS(dw_mci_runtime_suspend, - dw_mci_exynos_runtime_resume, - NULL) + NOIRQ_SYSTEM_SLEEP_PM_OPS(dw_mci_exynos_suspend_noirq, dw_mci_exynos_resume_noirq) + RUNTIME_PM_OPS(dw_mci_runtime_suspend, dw_mci_exynos_runtime_resume, NULL) }; static struct platform_driver dw_mci_exynos_pltfm_driver = { @@ -726,7 +719,7 @@ static struct platform_driver dw_mci_exynos_pltfm_driver = { .name = "dwmmc_exynos", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_exynos_match, - .pm = &dw_mci_exynos_pmops, + .pm = pm_ptr(&dw_mci_exynos_pmops), }, }; diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h index 5463392dc81105..648b4a5641bf24 100644 --- a/drivers/mmc/host/dw_mmc.h +++ b/drivers/mmc/host/dw_mmc.h @@ -541,6 +541,9 @@ extern void dw_mci_remove(struct dw_mci *host); #ifdef CONFIG_PM extern int dw_mci_runtime_suspend(struct device *device); extern int dw_mci_runtime_resume(struct device *device); +#else +static inline int dw_mci_runtime_suspend(struct device *device) { return -EOPNOTSUPP; } +static inline int dw_mci_runtime_resume(struct device *device) { return -EOPNOTSUPP; } #endif /** From b6a668e0f1c453a32013d047c5cd9a46d3ef54a0 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:11 +0800 Subject: [PATCH 0447/3206] mmc: dw_mmc-k3: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-37-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc-k3.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/dw_mmc-k3.c b/drivers/mmc/host/dw_mmc-k3.c index 0311a37dd4abfa..ad6aa1aea54915 100644 --- a/drivers/mmc/host/dw_mmc-k3.c +++ b/drivers/mmc/host/dw_mmc-k3.c @@ -461,11 +461,8 @@ static int dw_mci_k3_probe(struct platform_device *pdev) } static const struct dev_pm_ops dw_mci_k3_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(dw_mci_runtime_suspend, - dw_mci_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(dw_mci_runtime_suspend, dw_mci_runtime_resume, NULL) }; static struct platform_driver dw_mci_k3_pltfm_driver = { @@ -475,7 +472,7 @@ static struct platform_driver dw_mci_k3_pltfm_driver = { .name = "dwmmc_k3", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_k3_match, - .pm = &dw_mci_k3_dev_pm_ops, + .pm = pm_ptr(&dw_mci_k3_dev_pm_ops), }, }; From eddc91781aadb4f7de2026dda83558a458fc7fd9 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:12 +0800 Subject: [PATCH 0448/3206] mmc: dw_mmc-pci: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-38-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc-pci.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/dw_mmc-pci.c b/drivers/mmc/host/dw_mmc-pci.c index e7ab699f488e67..092cc99175af0f 100644 --- a/drivers/mmc/host/dw_mmc-pci.c +++ b/drivers/mmc/host/dw_mmc-pci.c @@ -75,11 +75,8 @@ static void dw_mci_pci_remove(struct pci_dev *pdev) } static const struct dev_pm_ops dw_mci_pci_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(dw_mci_runtime_suspend, - dw_mci_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(dw_mci_runtime_suspend, dw_mci_runtime_resume, NULL) }; static const struct pci_device_id dw_mci_pci_id[] = { @@ -94,7 +91,7 @@ static struct pci_driver dw_mci_pci_driver = { .probe = dw_mci_pci_probe, .remove = dw_mci_pci_remove, .driver = { - .pm = &dw_mci_pci_dev_pm_ops, + .pm = pm_ptr(&dw_mci_pci_dev_pm_ops), }, }; From 4b43f2bcc84dd550c1a847318db02165d2829573 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 15 Aug 2025 09:34:13 +0800 Subject: [PATCH 0449/3206] mmc: dw_mmc-rockchip: use modern PM macros Use the modern PM macros for the suspend and resume functions to be automatically dropped by the compiler when CONFIG_PM or CONFIG_PM_SLEEP are disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. 
Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250815013413.28641-39-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc-rockchip.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/dw_mmc-rockchip.c b/drivers/mmc/host/dw_mmc-rockchip.c index baa23b51773127..d2aec6cf9773dc 100644 --- a/drivers/mmc/host/dw_mmc-rockchip.c +++ b/drivers/mmc/host/dw_mmc-rockchip.c @@ -568,11 +568,8 @@ static void dw_mci_rockchip_remove(struct platform_device *pdev) } static const struct dev_pm_ops dw_mci_rockchip_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(dw_mci_runtime_suspend, - dw_mci_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(dw_mci_runtime_suspend, dw_mci_runtime_resume, NULL) }; static struct platform_driver dw_mci_rockchip_pltfm_driver = { @@ -582,7 +579,7 @@ static struct platform_driver dw_mci_rockchip_pltfm_driver = { .name = "dwmmc_rockchip", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_rockchip_match, - .pm = &dw_mci_rockchip_dev_pm_ops, + .pm = pm_ptr(&dw_mci_rockchip_dev_pm_ops), }, }; From 85f5d8e369c24d678442f6bb4ec5bbdc431a09d9 Mon Sep 17 00:00:00 2001 From: Jihed Chaibi Date: Thu, 24 Jul 2025 00:45:04 +0200 Subject: [PATCH 0450/3206] ARM: dts: armada-370-db: Fix stereo audio input routing on Armada 370 The simple-audio-card configuration for the Armada 370 development board incorrectly routed the left channel signal ("AIN1L") to both sides of the stereo "In Jack". This commit corrects the typo for the right channel, changing the second "AIN1L" entry to "AIN1R" to enable proper stereo input recording. Signed-off-by: Jihed Chaibi Signed-off-by: Gregory CLEMENT --- arch/arm/boot/dts/marvell/armada-370-db.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/marvell/armada-370-db.dts b/arch/arm/boot/dts/marvell/armada-370-db.dts index a7dc4c04d10bdf..a9a05d826f2233 100644 --- a/arch/arm/boot/dts/marvell/armada-370-db.dts +++ b/arch/arm/boot/dts/marvell/armada-370-db.dts @@ -119,7 +119,7 @@ "Out Jack", "HPL", "Out Jack", "HPR", "AIN1L", "In Jack", - "AIN1L", "In Jack"; + "AIN1R", "In Jack"; status = "okay"; simple-audio-card,dai-link@0 { From 2aeadea47f5bdec94d04b890bc9a0d25a1340fa8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 21 Aug 2025 18:43:28 +0100 Subject: [PATCH 0451/3206] ARM64: dts: mcbin: fix SATA ports on Macchiatobin Booting 6.16 on the Macchiatobin, I discover that I can no longer access my disks, and thus the userspace boot fails. The cause appears to be that one of the SATA controllers doesn't have any ports: [ 1.190312] ahci f4540000.sata: supply ahci not found, using dummy regulator [ 1.196255] ahci f4540000.sata: supply phy not found, using dummy regulator [ 1.202026] ahci f4540000.sata: No port enabled This is a result of the blamed commit below, which added a default disabled status to the .dtsi but didn't properly update the mcbin dtsi file. Fix this regression.
Fixes: 30023876aef4 ("arm64: dts: marvell: only enable complete sata nodes") Signed-off-by: Russell King (Oracle) Signed-off-by: Gregory CLEMENT --- arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi b/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi index 0d4a5fd9503f29..f2d278d171eb19 100644 --- a/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi @@ -345,11 +345,13 @@ /* CPS Lane 1 - U32 */ sata-port@0 { phys = <&cp1_comphy1 0>; + status = "okay"; }; /* CPS Lane 3 - U31 */ sata-port@1 { phys = <&cp1_comphy3 1>; + status = "okay"; }; }; From f41345f47fb267a9c95ca710c33448f8d0d81d83 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 20 Aug 2025 15:18:06 +0200 Subject: [PATCH 0452/3206] bpf: Use tnums for JEQ/JNE is_branch_taken logic In the following toy program (reg states minimized for readability), R0 and R1 always have different values at instruction 6. This is obvious when reading the program but cannot be guessed from ranges alone as they overlap (R0 in [0; 0xc0000000], R1 in [1024; 0xc0000400]). 0: call bpf_get_prandom_u32#7 ; R0_w=scalar() 1: w0 = w0 ; R0_w=scalar(var_off=(0x0; 0xffffffff)) 2: r0 >>= 30 ; R0_w=scalar(var_off=(0x0; 0x3)) 3: r0 <<= 30 ; R0_w=scalar(var_off=(0x0; 0xc0000000)) 4: r1 = r0 ; R1_w=scalar(var_off=(0x0; 0xc0000000)) 5: r1 += 1024 ; R1_w=scalar(var_off=(0x400; 0xc0000000)) 6: if r1 != r0 goto pc+1 Looking at tnums however, we can deduce that R1 is always different from R0 because their tnums don't agree on known bits. This patch uses this logic to improve is_scalar_branch_taken in case of BPF_JEQ and BPF_JNE. This change has a tiny impact on complexity, which was measured with the Cilium complexity CI test. That test covers 72 programs with various build and load time configurations for a total of 970 test cases. For 80% of test cases, the patch has no impact. On the other test cases, the patch decreases complexity by only 0.08% on average. In the best case, the verifier needs to walk 3% less instructions and, in the worst case, 1.5% more. Overall, the patch has a small positive impact, especially for our largest programs. 
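To make the new check concrete, the tnum_overlap() idea is small enough to demonstrate standalone; a userspace sketch (not the kernel implementation) using the toy program's values:

  #include <stdint.h>
  #include <stdio.h>

  struct tnum { uint64_t value; uint64_t mask; }; /* mask bit set => bit unknown */

  static int tnum_overlap(struct tnum a, struct tnum b)
  {
          uint64_t mu = ~a.mask & ~b.mask;        /* bits known in both */

          return (a.value & mu) == (b.value & mu);
  }

  int main(void)
  {
          struct tnum r0 = { 0x0,   0xc0000000 }; /* after r0 <<= 30 */
          struct tnum r1 = { 0x400, 0xc0000000 }; /* after r1 += 1024 */

          /* Bit 10 is known 0 in r0 but known 1 in r1, so no value can
           * satisfy both tnums: "if r1 != r0" is always taken. */
          printf("overlap: %d\n", tnum_overlap(r0, r1)); /* prints 0 */
          return 0;
  }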
Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/be3ee70b6e489c49881cb1646114b1d861b5c334.1755694147.git.paul.chaignon@gmail.com --- include/linux/tnum.h | 3 +++ kernel/bpf/tnum.c | 8 ++++++++ kernel/bpf/verifier.c | 4 ++++ 3 files changed, 15 insertions(+) diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 57ed3035cc3093..0ffb77ffe0e873 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -51,6 +51,9 @@ struct tnum tnum_xor(struct tnum a, struct tnum b); /* Multiply two tnums, return @a * @b */ struct tnum tnum_mul(struct tnum a, struct tnum b); +/* Return true if the known bits of both tnums have the same value */ +bool tnum_overlap(struct tnum a, struct tnum b); + /* Return a tnum representing numbers satisfying both @a and @b */ struct tnum tnum_intersect(struct tnum a, struct tnum b); diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index fa353c5d550fc9..d9328bbb3680b5 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -143,6 +143,14 @@ struct tnum tnum_mul(struct tnum a, struct tnum b) return tnum_add(TNUM(acc_v, 0), acc_m); } +bool tnum_overlap(struct tnum a, struct tnum b) +{ + u64 mu; + + mu = ~a.mask & ~b.mask; + return (a.value & mu) == (b.value & mu); +} + /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has * a 'known 0' - this will return a 'known 1' for that bit. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4e47992361ea10..5c9dd16b2c56be 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15897,6 +15897,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value == t2.value; + if (!tnum_overlap(t1, t2)) + return 0; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 0; @@ -15921,6 +15923,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value != t2.value; + if (!tnum_overlap(t1, t2)) + return 1; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 1; From 0780f54ab129b28e2b29689c95ad579a9db04fab Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 20 Aug 2025 15:19:13 +0200 Subject: [PATCH 0453/3206] selftests/bpf: Tests for is_scalar_branch_taken tnum logic This patch adds tests for the new jeq and jne logic in is_scalar_branch_taken. The following shows the first test failing before the previous patch is applied. Once the previous patch is applied, the verifier can use the tnum values to deduce that instruction 7 is dead code. 
0: call bpf_get_prandom_u32#7 ; R0_w=scalar() 1: w0 = w0 ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 2: r0 >>= 30 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3)) 3: r0 <<= 30 ; R0_w=scalar(smin=0,smax=umax=umax32=0xc0000000,smax32=0x40000000,var_off=(0x0; 0xc0000000)) 4: r1 = r0 ; R0_w=scalar(id=1,smin=0,smax=umax=umax32=0xc0000000,smax32=0x40000000,var_off=(0x0; 0xc0000000)) R1_w=scalar(id=1,smin=0,smax=umax=umax32=0xc0000000,smax32=0x40000000,var_off=(0x0; 0xc0000000)) 5: r1 += 1024 ; R1_w=scalar(smin=umin=umin32=1024,smax=umax=umax32=0xc0000400,smin32=0x80000400,smax32=0x40000400,var_off=(0x400; 0xc0000000)) 6: if r1 != r0 goto pc+1 ; R0_w=scalar(id=1,smin=umin=umin32=1024,smax=umax=umax32=0xc0000000,smin32=0x80000400,smax32=0x40000000,var_off=(0x400; 0xc0000000)) R1_w=scalar(smin=umin=umin32=1024,smax=umax=umax32=0xc0000000,smin32=0x80000400,smax32=0x40000400,var_off=(0x400; 0xc0000000)) 7: r10 = 0 frame pointer is read only Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/550004f935e2553bdb2fb1f09cbde7d0452112d0.1755694148.git.paul.chaignon@gmail.com --- .../selftests/bpf/progs/verifier_bounds.c | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 87a2c60d86e6ee..fbccc20555f48d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1668,4 +1668,45 @@ l0_%=: r0 = 0; \ : __clobber_all); } +SEC("socket") +__description("dead jne branch due to disagreeing tnums") +__success __log_level(2) +__naked void jne_disagreeing_tnums(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + w0 = w0; \ + r0 >>= 30; \ + r0 <<= 30; \ + r1 = r0; \ + r1 += 1024; \ + if r1 != r0 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("dead jeq branch due to disagreeing tnums") +__success __log_level(2) +__naked void jeq_disagreeing_tnums(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + w0 = w0; \ + r0 >>= 30; \ + r0 <<= 30; \ + r1 = r0; \ + r1 += 1024; \ + if r1 == r0 goto +1; \ + exit; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; From 79f919a89c9d06816dbdbbd168fa41d27411a7f9 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 19 Aug 2025 01:07:24 +0000 Subject: [PATCH 0454/3206] cgroup: split cgroup_destroy_wq into 3 workqueues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A hung task can occur during [1] LTP cgroup testing when repeatedly mounting/unmounting perf_event and net_prio controllers with systemd.unified_cgroup_hierarchy=1. The hang manifests in cgroup_lock_and_drain_offline() during root destruction. 
Related case: cgroup_fj_function_perf_event cgroup_fj_function.sh perf_event cgroup_fj_function_net_prio cgroup_fj_function.sh net_prio Call Trace: cgroup_lock_and_drain_offline+0x14c/0x1e8 cgroup_destroy_root+0x3c/0x2c0 css_free_rwork_fn+0x248/0x338 process_one_work+0x16c/0x3b8 worker_thread+0x22c/0x3b0 kthread+0xec/0x100 ret_from_fork+0x10/0x20 Root Cause: CPU0 CPU1 mount perf_event umount net_prio cgroup1_get_tree cgroup_kill_sb rebind_subsystems // root destruction enqueues // cgroup_destroy_wq // kill all perf_event css // one perf_event css A is dying // css A offline enqueues cgroup_destroy_wq // root destruction will be executed first css_free_rwork_fn cgroup_destroy_root cgroup_lock_and_drain_offline // some perf descendants are dying // cgroup_destroy_wq max_active = 1 // waiting for css A to die Problem scenario: 1. CPU0 mounts perf_event (rebind_subsystems) 2. CPU1 unmounts net_prio (cgroup_kill_sb), queuing root destruction work 3. A dying perf_event CSS gets queued for offline after root destruction 4. Root destruction waits for offline completion, but offline work is blocked behind root destruction in cgroup_destroy_wq (max_active=1) Solution: Split cgroup_destroy_wq into three dedicated workqueues: cgroup_offline_wq – Handles CSS offline operations cgroup_release_wq – Manages resource release cgroup_free_wq – Performs final memory deallocation This separation eliminates blocking in the CSS free path while waiting for offline operations to complete. [1] https://github.com/linux-test-project/ltp/blob/master/runtest/controllers Fixes: 334c3679ec4b ("cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends") Reported-by: Gao Yingjie Signed-off-by: Chen Ridong Suggested-by: Tejun Heo Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 43 +++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb73..79b1d79f86a342 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -126,8 +126,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. + * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: use for css offline work + * cgroup_release_wq: use for css release work + * cgroup_free_wq: use for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. 
*/ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -5558,7 +5581,7 @@ static void css_release_work_fn(struct work_struct *work) cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5567,7 +5590,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5701,7 +5724,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, list_del_rcu(&css->sibling); err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5939,7 +5962,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6325,8 +6348,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); From 94a4acfec14615e971eb2c9e1fa6c992c85ff6c6 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 22 Aug 2025 07:07:15 +0000 Subject: [PATCH 0455/3206] cgroup/psi: Set of->priv to NULL upon file release Setting of->priv to NULL when the file is released enables earlier bug detection. This allows potential bugs to manifest as NULL pointer dereferences rather than use-after-free errors[1], which are generally more difficult to diagnose. 
[1] https://lore.kernel.org/cgroups/38ef3ff9-b380-44f0-9315-8b3714b0948d@huaweicloud.com/T/#m8a3b3f88f0ff3da5925d342e90043394f8b2091b Signed-off-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 79b1d79f86a342..77d02f87f3f121 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4182,6 +4182,7 @@ static void cgroup_file_release(struct kernfs_open_file *of) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); + of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
From afa3701c0e45ecb9e4d160048ca4e353c7489948 Mon Sep 17 00:00:00 2001 From: Tiffany Yang Date: Thu, 21 Aug 2025 18:37:52 -0700 Subject: [PATCH 0456/3206] cgroup: cgroup.stat.local time accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There isn't yet a clear way to identify a set of "lost" time that everyone (or at least a wider group of users) cares about. However, users can perform some delay accounting by iterating over components of interest. This patch allows cgroup v2 freezing time to be one of those components. Track the cumulative time that each v2 cgroup spends freezing and expose it to userland via a new local stat file in cgroupfs. Thank you to Michal, who provided the ASCII art in the updated documentation. To access this value: $ mkdir /sys/fs/cgroup/test $ cat /sys/fs/cgroup/test/cgroup.stat.local frozen_usec 0 Ensure consistent freeze time reads with freeze_seq, a per-cgroup sequence counter. Writes are serialized using the css_set_lock. Signed-off-by: Tiffany Yang Cc: Tejun Heo Cc: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 18 ++++++++++++++++ include/linux/cgroup-defs.h | 17 +++++++++++++++ kernel/cgroup/cgroup.c | 28 +++++++++++++++++++++++++ kernel/cgroup/freezer.c | 16 ++++++++++---- 4 files changed, 75 insertions(+), 4 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index d9d3cc7df3488e..9a3a909ee40b28 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1001,6 +1001,24 @@ All cgroup core files are prefixed with "cgroup." Total number of dying cgroup subsystems (e.g. memory cgroup) at and beneath the current cgroup. + cgroup.stat.local + A read-only flat-keyed file which exists in non-root cgroups. + The following entry is defined: + + frozen_usec + Cumulative time that this cgroup has spent between freezing and + thawing, regardless of whether by self or ancestor groups. + NB: (not) reaching "frozen" state is not accounted here. + + Using the following ASCII representation of a cgroup's freezer + state, :: + + 1 _____ + frozen 0 __/ \__ + ab cd + + the duration being measured is the span between a and c. + cgroup.freeze A read-write single value file which exists on non-root cgroups. Allowed values are "0" and "1". The default is "0".
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6b93a64115fe94..539c64eeef38f3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -433,6 +433,23 @@ struct cgroup_freezer_state { * frozen, SIGSTOPped, and PTRACEd. */ int nr_frozen_tasks; + + /* Freeze time data consistency protection */ + seqcount_t freeze_seq; + + /* + * Most recent time the cgroup was requested to freeze. + * Accesses guarded by freeze_seq counter. 
Writes serialized + * by css_set_lock. + */ + u64 freeze_start_nsec; + + /* + * Total duration the cgroup has spent freezing. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 frozen_nsec; }; struct cgroup { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb73..ab096b884bbc73 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3763,6 +3763,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + unsigned int sequence; + u64 freeze_time; + + do { + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); + freeze_time = cgrp->freezer.frozen_nsec; + /* Add in current freezer interval if the cgroup is freezing. */ + if (test_bit(CGRP_FREEZE, &cgrp->flags)) + freeze_time += (ktime_get_ns() - + cgrp->freezer.freeze_start_nsec); + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); + + seq_printf(seq, "frozen_usec %llu\n", + (unsigned long long) freeze_time / NSEC_PER_USEC); + + return 0; +} + #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem @@ -5354,6 +5375,11 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cgroup.stat.local", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_core_local_stat_show, + }, { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, @@ -5763,6 +5789,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; + seqcount_init(&cgrp->freezer.freeze_seq); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be @@ -5771,6 +5798,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index bf1690a167dda0..6c18854bff3485 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze) /* * Freeze or unfreeze all tasks in the given cgroup. 
*/ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) { struct css_task_iter it; struct task_struct *task; @@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - if (freeze) + write_seqcount_begin(&cgrp->freezer.freeze_seq); + if (freeze) { set_bit(CGRP_FREEZE, &cgrp->flags); - else + cgrp->freezer.freeze_start_nsec = ts_nsec; + } else { clear_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.frozen_nsec += (ts_nsec - + cgrp->freezer.freeze_start_nsec); + } + write_seqcount_end(&cgrp->freezer.freeze_seq); spin_unlock_irq(&css_set_lock); if (freeze) @@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) struct cgroup *parent; struct cgroup *dsct; bool applied = false; + u64 ts_nsec; bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) return; cgrp->freezer.freeze = freeze; + ts_nsec = ktime_get_ns(); /* * Propagate changes downwards the cgroup tree. @@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) /* * Do change actual state: freeze or unfreeze. */ - cgroup_do_freeze(dsct, freeze); + cgroup_do_freeze(dsct, freeze, ts_nsec); applied = true; } From 7b281a4582c4408add22cc99f221886b50dd0548 Mon Sep 17 00:00:00 2001 From: Tiffany Yang Date: Thu, 21 Aug 2025 18:37:53 -0700 Subject: [PATCH 0457/3206] cgroup: selftests: Add tests for freezer time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test cgroup v2 freezer time stat. Freezer time accounting should be independent of other cgroups in the hierarchy and should increase iff a cgroup is CGRP_FREEZE (regardless of whether it reaches CGRP_FROZEN). Skip these tests on systems without freeze time accounting. Signed-off-by: Tiffany Yang Cc: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_freezer.c | 663 ++++++++++++++++++ 1 file changed, 663 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 8730645d363a73..dfb763819581fa 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -804,6 +804,662 @@ static int test_cgfreezer_vfork(const char *root) return ret; } +/* + * Get the current frozen_usec for the cgroup. + */ +static long cg_check_freezetime(const char *cgroup) +{ + return cg_read_key_long(cgroup, "cgroup.stat.local", + "frozen_usec "); +} + +/* + * Test that the freeze time will behave as expected for an empty cgroup. + */ +static int test_cgfreezer_time_empty(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_empty"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create an empty cgroup and check that its freeze time + * is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 2) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. 
+ */ + usleep(1000); + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 3) Unfreeze the cgroup. Check that the freeze time is + * larger than at 2). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Check the freeze time again to ensure that it has not + * changed. + */ + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * A simple test for cgroup freezer time accounting. This test follows + * the same flow as test_cgfreezer_time_empty, but with a single process + * in the cgroup. + */ +static int test_cgfreezer_time_simple(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_simple"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create a cgroup and check that its freeze time is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Populate the cgroup with one child and check that the + * freeze time is still 0. + */ + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr > prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 3) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 4) Unfreeze the cgroup. Check that the freeze time is + * larger than at 3). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Sleep for 1000 us. Check that the freeze time is the + * same as at 4). + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that freezer time accounting works as expected, even while we're + * populating a cgroup with processes. + */ +static int test_cgfreezer_time_populate(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + int i; + + cgroup = cg_name(root, "cg_time_test_populate"); + if (!cgroup) + goto cleanup; + + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 1) Populate the cgroup with 100 processes. Check that + * the freeze time is 0. 
+ */ + for (i = 0; i < 100; i++) + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Wait for the group to become fully populated. Check + * that the freeze time is 0. + */ + if (cg_wait_for_proc_count(cgroup, 100)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 3) Freeze the cgroup and then populate it with 100 more + * processes. Check that the freeze time continues to grow. + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + for (i = 0; i < 100; i++) + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Wait for the group to become fully populated. Check + * that the freeze time is larger than at 3). + */ + if (cg_wait_for_proc_count(cgroup, 200)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Unfreeze the cgroup. Check that the freeze time is + * larger than at 4). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 6) Kill the processes. Check that the freeze time is the + * same as it was at 5). + */ + if (cg_killall(cgroup)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 7) Freeze and unfreeze the cgroup. Check that the freeze + * time is larger than it was at 6). + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that frozen time for a cgroup continues to work as expected, + * even as processes are migrated. Frozen cgroup A's freeze time should + * continue to increase and running cgroup B's should stay 0. 
+ */ +static int test_cgfreezer_time_migrate(const char *root) +{ + long prev_A, curr_A, curr_B; + char *cgroup[2] = {0}; + int ret = KSFT_FAIL; + int pid; + + cgroup[0] = cg_name(root, "cg_time_test_migrate_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(root, "cg_time_test_migrate_B"); + if (!cgroup[1]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + pid = cg_run_nowait(cgroup[0], child_fn, NULL); + if (pid < 0) + goto cleanup; + + if (cg_wait_for_proc_count(cgroup[0], 1)) + goto cleanup; + + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A) { + debug("Expect time (%ld) to be 0\n", curr_A); + goto cleanup; + } + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + /* + * Freeze cgroup A. + */ + if (cg_freeze_wait(cgroup[0], true)) + goto cleanup; + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be > 0\n", curr_A); + goto cleanup; + } + + /* + * Migrate from A (frozen) to B (running). + */ + if (cg_enter(cgroup[1], pid)) + goto cleanup; + + usleep(1000); + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr_A, prev_A); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup[0]) + cg_destroy(cgroup[0]); + free(cgroup[0]); + if (cgroup[1]) + cg_destroy(cgroup[1]); + free(cgroup[1]); + return ret; +} + +/* + * The test creates a cgroup and freezes it. Then it creates a child cgroup. + * After that it checks that the child cgroup has a non-zero freeze time + * that is less than the parent's. Next, it freezes the child, unfreezes + * the parent, and sleeps. Finally, it checks that the child's freeze + * time has grown larger than the parent's. + */ +static int test_cgfreezer_time_parent(const char *root) +{ + char *parent, *child = NULL; + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_parent_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_parent_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_freeze_wait(parent, true)) + goto cleanup; + + usleep(1000); + if (cg_create(child)) + goto cleanup; + + if (cg_check_frozen(child, true)) + goto cleanup; + + /* + * Since the parent was frozen the entire time the child cgroup + * was being created, we expect the parent's freeze time to be + * larger than the child's. + * + * Ideally, we would be able to check both times simultaneously, + * but here we get the child's after we get the parent's. 
+ */ + ptime = cg_check_freezetime(parent); + ctime = cg_check_freezetime(child); + if (ptime <= ctime) { + debug("Expect ptime (%ld) > ctime (%ld)\n", ptime, ctime); + goto cleanup; + } + + if (cg_freeze_nowait(child, true)) + goto cleanup; + + if (cg_freeze_wait(parent, false)) + goto cleanup; + + if (cg_check_frozen(child, true)) + goto cleanup; + + usleep(100000); + + ctime = cg_check_freezetime(child); + ptime = cg_check_freezetime(parent); + + if (ctime <= ptime) { + debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + free(child); + if (parent) + cg_destroy(parent); + free(parent); + return ret; +} + +/* + * The test creates a parent cgroup and a child cgroup. Then, it freezes + * the child and checks that the child's freeze time is greater than the + * parent's, which should be zero. + */ +static int test_cgfreezer_time_child(const char *root) +{ + char *parent, *child = NULL; + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_child_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_child_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(child)) + goto cleanup; + + if (cg_freeze_wait(child, true)) + goto cleanup; + + ctime = cg_check_freezetime(child); + ptime = cg_check_freezetime(parent); + if (ptime != 0) { + debug("Expect ptime (%ld) to be 0\n", ptime); + goto cleanup; + } + + if (ctime <= ptime) { + debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + free(child); + if (parent) + cg_destroy(parent); + free(parent); + return ret; +} + +/* + * The test creates the following hierarchy: + * A + * | + * B + * | + * C + * + * Then it freezes the cgroups in the order C, B, A. + * Then it unfreezes the cgroups in the order A, B, C. + * Then it checks that C's freeze time is larger than B's and + * that B's is larger than A's. 
+ */ +static int test_cgfreezer_time_nested(const char *root) +{ + char *cgroup[3] = {0}; + int ret = KSFT_FAIL; + long time[3] = {0}; + int i; + + cgroup[0] = cg_name(root, "cg_test_time_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(cgroup[0], "B"); + if (!cgroup[1]) + goto cleanup; + + cgroup[2] = cg_name(cgroup[1], "C"); + if (!cgroup[2]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + if (cg_create(cgroup[2])) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[0], true)) + goto cleanup; + + usleep(1000); + + if (cg_freeze_nowait(cgroup[0], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], false)) + goto cleanup; + + time[2] = cg_check_freezetime(cgroup[2]); + time[1] = cg_check_freezetime(cgroup[1]); + time[0] = cg_check_freezetime(cgroup[0]); + + if (time[2] <= time[1]) { + debug("Expect C's time (%ld) > B's time (%ld)", time[2], time[1]); + goto cleanup; + } + + if (time[1] <= time[0]) { + debug("Expect B's time (%ld) > A's time (%ld)", time[1], time[0]); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + for (i = 2; i >= 0 && cgroup[i]; i--) { + cg_destroy(cgroup[i]); + free(cgroup[i]); + } + + return ret; +} + #define T(x) { x, #x } struct cgfreezer_test { int (*fn)(const char *root); @@ -819,6 +1475,13 @@ struct cgfreezer_test { T(test_cgfreezer_stopped), T(test_cgfreezer_ptraced), T(test_cgfreezer_vfork), + T(test_cgfreezer_time_empty), + T(test_cgfreezer_time_simple), + T(test_cgfreezer_time_populate), + T(test_cgfreezer_time_migrate), + T(test_cgfreezer_time_parent), + T(test_cgfreezer_time_child), + T(test_cgfreezer_time_nested), }; #undef T From d47cc4dea17391c99b943fa8d70a279e906b2843 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 Aug 2025 13:16:15 -0700 Subject: [PATCH 0458/3206] bpf: Use sha1() instead of sha1_transform() in bpf_prog_calc_tag() Now that there's a proper SHA-1 library API, just use that instead of the low-level SHA-1 compression function. This eliminates the need for bpf_prog_calc_tag() to implement the SHA-1 padding itself. No functional change; the computed tags remain the same. 
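For reference, the new tag computation reduces to the one-shot helper declared in include/crypto/sha1.h; a condensed sketch using the names from the diff below:

  u8 digest[SHA1_DIGEST_SIZE];

  /* dst holds the instructions with map fds zeroed out; sha1() buffers,
   * pads (0x80 terminator plus the 64-bit bit length) and finalizes
   * internally, which is what the old code open-coded around
   * sha1_transform(). */
  sha1((const u8 *)dst, size, digest);
  memcpy(fp->tag, digest, sizeof(fp->tag)); /* tag is the first 8 bytes */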
Signed-off-by: Eric Biggers Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20250811201615.564461-1-ebiggers@kernel.org --- include/linux/filter.h | 6 ----- kernel/bpf/core.c | 50 ++++++++---------------------------------- 2 files changed, 9 insertions(+), 47 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h index c0a74fb9fcb16d..9092d8ea95c831 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -997,12 +997,6 @@ static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) return prog->len * sizeof(struct bpf_insn); } -static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) -{ - return round_up(bpf_prog_insn_size(prog) + - sizeof(__be64) + 1, SHA1_BLOCK_SIZE); -} - static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog),
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5d1650af899d04..ef01cc644a9659 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -293,28 +294,19 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); - u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA1_DIGEST_WORDS]; - u32 ws[SHA1_WORKSPACE_WORDS]; - u32 i, bsize, psize, blocks; + size_t size = bpf_prog_insn_size(fp); + u8 digest[SHA1_DIGEST_SIZE]; struct bpf_insn *dst; bool was_ld_map; - u8 *raw, *todo; - __be32 *result; - __be64 *bits; + u32 i; - raw = vmalloc(raw_size); - if (!raw) + dst = vmalloc(size); + if (!dst) return -ENOMEM; - sha1_init_raw(digest); - memset(ws, 0, sizeof(ws)); - /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ - dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && @@ -334,33 +326,9 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) was_ld_map = false; } } - - psize = bpf_prog_insn_size(fp); - memset(&raw[psize], 0, raw_size - psize); - raw[psize++] = 0x80; - - bsize = round_up(psize, SHA1_BLOCK_SIZE); - blocks = bsize / SHA1_BLOCK_SIZE; - todo = raw; - if (bsize - psize >= sizeof(__be64)) { - bits = (__be64 *)(todo + bsize - sizeof(__be64)); - } else { - bits = (__be64 *)(todo + bsize + bits_offset); - blocks++; - } - *bits = cpu_to_be64((psize - 1) << 3); - - while (blocks--) { - sha1_transform(digest, todo, ws); - todo += SHA1_BLOCK_SIZE; - } - - result = (__force __be32 *)digest; - for (i = 0; i < SHA1_DIGEST_WORDS; i++) - result[i] = cpu_to_be32(digest[i]); - memcpy(fp->tag, result, sizeof(fp->tag)); - - vfree(raw); + sha1((const u8 *)dst, size, digest); + memcpy(fp->tag, digest, sizeof(fp->tag)); + vfree(dst); return 0; }
From 4223bf833c8495e40ae2886acbc0ecbe88fa6306 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Tue, 19 Aug 2025 20:56:38 +0800 Subject: [PATCH 0459/3206] bpf: Remove preempt_disable in bpf_try_get_buffers BPF programs now run with migration disabled, so it is safe to access this_cpu_inc_return(bpf_bprintf_nest_level). 
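As a short annotated sketch of the scheme the helper implements (buffer slots are per CPU, indexed by nesting depth; per the reasoning above, correctness relies on the caller running with migration disabled, so the inc, the buffer use and the paired dec all happen on the same CPU):

  nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
  if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
          this_cpu_dec(bpf_bprintf_nest_level);
          return -EBUSY;  /* every buffer slot on this CPU is in use */
  }
  /* a nested user (e.g. a tracing program interrupting another bprintf
   * user on this CPU) gets the next slot, so no slot is ever shared */
  *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);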
Fixes: d9c9e4db186a ("bpf: Factorize bpf_trace_printk and bpf_seq_printf") Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250819125638.2544715-1-chen.dylane@linux.dev --- kernel/bpf/helpers.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cdffd74ddbe65d..401b4932cc49f6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -774,11 +774,9 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; - preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); - preempt_enable(); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); @@ -791,7 +789,6 @@ void bpf_put_buffers(void) if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); - preempt_enable(); } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) From 9c7419568b28855274d6d1dac1684f6627da9547 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 14 Aug 2025 17:56:07 -0400 Subject: [PATCH 0460/3206] MAINTAINERS: add the associated Rust helper to the LSM section Suggested-by: Benno Lossin Signed-off-by: Paul Moore --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..d61f7246e5bf5a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22790,6 +22790,7 @@ F: include/linux/security.h F: include/uapi/linux/lsm.h F: security/ F: tools/testing/selftests/lsm/ +F: rust/kernel/security.rs X: security/selinux/ K: \bsecurity_[a-z_0-9]\+\b From 67fe7be7dffd0e27bcba472777d34b59b90f7330 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 19 Aug 2025 15:52:47 -0400 Subject: [PATCH 0461/3206] MAINTAINERS: add the associated Rust helper to the CREDENTIALS section Acked-by: Miguel Ojeda Acked-by: Alice Ryhl Signed-off-by: Paul Moore --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index d61f7246e5bf5a..0ee0098f2df890 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6484,6 +6484,7 @@ S: Supported T: git https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git F: include/linux/cred.h F: kernel/cred.c +F: rust/kernel/cred.rs F: Documentation/security/credentials.rst INTEL CRPS COMMON REDUNDANT PSU DRIVER From 696968262aeee51e1c0529c3c060ddd180702e02 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sat, 23 Aug 2025 12:09:52 +0900 Subject: [PATCH 0462/3206] firewire: ohci: move self_id_complete tracepoint after validating register The value of OHCI1394_SelfIDCount register includes an error-indicating bit. It is safer to place the tracepoint probe after validating the register value. Link: https://lore.kernel.org/r/20250823030954.268412-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/ohci.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index c8c5d598e3c8f4..b3a187e4cba7a4 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -1863,6 +1863,9 @@ static void bus_reset_work(struct work_struct *work) ohci_notice(ohci, "self ID receive error\n"); return; } + + trace_self_id_complete(ohci->card.index, reg, ohci->self_id, has_be_header_quirk(ohci)); + /* * The count in the SelfIDCount register is the number of * bytes in the self ID receive buffer. 
Since we also receive @@ -2024,15 +2027,8 @@ static irqreturn_t irq_handler(int irq, void *data) if (event & OHCI1394_busReset) reg_write(ohci, OHCI1394_IntMaskClear, OHCI1394_busReset); - if (event & OHCI1394_selfIDComplete) { - if (trace_self_id_complete_enabled()) { - u32 reg = reg_read(ohci, OHCI1394_SelfIDCount); - - trace_self_id_complete(ohci->card.index, reg, ohci->self_id, - has_be_header_quirk(ohci)); - } + if (event & OHCI1394_selfIDComplete) queue_work(selfid_workqueue, &ohci->bus_reset_work); - } if (event & OHCI1394_RQPkt) queue_work(ohci->card.async_wq, &ohci->ar_request_ctx.work);
From 61efd0e1afb24f8a71cff80f7600ff7556378c53 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sat, 23 Aug 2025 12:09:53 +0900 Subject: [PATCH 0463/3206] firewire: ohci: use threaded IRQ handler to handle SelfIDComplete event The first step in maintaining the bus topology is to handle the SelfIDComplete event. This event occurs after initiating a bus reset when the 1394 OHCI link layer is enabled, or when the bus topology changes (e.g. when a device is added). Because enumeration of the selfID sequence can take some time, it should be processed in a bottom half. Currently, this is done in a module-local workqueue with the WQ_MEM_RECLAIM flag, to allow invocation during memory reclaim paths. A threaded IRQ handler is a preferable alternative, as it eliminates the need to manage workqueue attributes manually. Although SelfIDComplete events are infrequent in normal usage, handling them correctly is critical for proper bus topology management. This commit switches SelfIDComplete handling to a threaded IRQ handler. Link: https://lore.kernel.org/r/20250823030954.268412-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/ohci.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-)
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index b3a187e4cba7a4..5b16286280e00e 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -760,7 +760,7 @@ static __le32 *handle_ar_packet(struct ar_context *ctx, __le32 *buffer) * * Alas some chips sometimes emit bus reset packets with a * wrong generation. We set the correct generation for these - * at a slightly incorrect time (in bus_reset_work). + * at a slightly incorrect time (in handle_selfid_complete_event). 
*/ if (evt == OHCI1394_evt_bus_reset) { if (!(ohci->quirks & QUIRK_RESET_PACKET)) @@ -1830,9 +1830,9 @@ static int find_and_insert_self_id(struct fw_ohci *ohci, int self_id_count) return self_id_count; } -static void bus_reset_work(struct work_struct *work) +static irqreturn_t handle_selfid_complete_event(int irq, void *data) { - struct fw_ohci *ohci = from_work(ohci, work, bus_reset_work); + struct fw_ohci *ohci = data; int self_id_count, generation, new_generation, i, j; u32 reg, quadlet; void *free_rom = NULL; @@ -1843,11 +1843,11 @@ static void bus_reset_work(struct work_struct *work) if (!(reg & OHCI1394_NodeID_idValid)) { ohci_notice(ohci, "node ID not valid, new bus reset in progress\n"); - return; + goto end; } if ((reg & OHCI1394_NodeID_nodeNumber) == 63) { ohci_notice(ohci, "malconfigured bus\n"); - return; + goto end; } ohci->node_id = reg & (OHCI1394_NodeID_busNumber | OHCI1394_NodeID_nodeNumber); @@ -1861,7 +1861,7 @@ static void bus_reset_work(struct work_struct *work) reg = reg_read(ohci, OHCI1394_SelfIDCount); if (ohci1394_self_id_count_is_error(reg)) { ohci_notice(ohci, "self ID receive error\n"); - return; + goto end; } trace_self_id_complete(ohci->card.index, reg, ohci->self_id, has_be_header_quirk(ohci)); @@ -1876,7 +1876,7 @@ static void bus_reset_work(struct work_struct *work) if (self_id_count > 252) { ohci_notice(ohci, "bad selfIDSize (%08x)\n", reg); - return; + goto end; } quadlet = cond_le32_to_cpu(ohci->self_id[0], has_be_header_quirk(ohci)); @@ -1903,7 +1903,7 @@ static void bus_reset_work(struct work_struct *work) ohci_notice(ohci, "bad self ID %d/%d (%08x != ~%08x)\n", j, self_id_count, id, id2); - return; + goto end; } ohci->self_id_buffer[j] = id; } @@ -1913,13 +1913,13 @@ static void bus_reset_work(struct work_struct *work) if (self_id_count < 0) { ohci_notice(ohci, "could not construct local self ID\n"); - return; + goto end; } } if (self_id_count == 0) { ohci_notice(ohci, "no self IDs\n"); - return; + goto end; } rmb(); @@ -1941,7 +1941,7 @@ static void bus_reset_work(struct work_struct *work) new_generation = ohci1394_self_id_count_get_generation(reg); if (new_generation != generation) { ohci_notice(ohci, "new bus reset, discarding self ids\n"); - return; + goto end; } // FIXME: Document how the locking works. @@ -2002,6 +2002,8 @@ static void bus_reset_work(struct work_struct *work) self_id_count, ohci->self_id_buffer, ohci->csr_state_setclear_abdicate); ohci->csr_state_setclear_abdicate = false; +end: + return IRQ_HANDLED; } static irqreturn_t irq_handler(int irq, void *data) @@ -2023,13 +2025,10 @@ static irqreturn_t irq_handler(int irq, void *data) event & ~(OHCI1394_busReset | OHCI1394_postedWriteErr)); trace_irqs(ohci->card.index, event); - // The flag is masked again at bus_reset_work() scheduled by selfID event. + // The flag is masked again at handle_selfid_complete_event() scheduled by selfID event. 
if (event & OHCI1394_busReset) reg_write(ohci, OHCI1394_IntMaskClear, OHCI1394_busReset); - if (event & OHCI1394_selfIDComplete) - queue_work(selfid_workqueue, &ohci->bus_reset_work); - if (event & OHCI1394_RQPkt) queue_work(ohci->card.async_wq, &ohci->ar_request_ctx.work); @@ -2100,7 +2099,10 @@ static irqreturn_t irq_handler(int irq, void *data) } else flush_writes(ohci); - return IRQ_HANDLED; + if (event & OHCI1394_selfIDComplete) + return IRQ_WAKE_THREAD; + else + return IRQ_HANDLED; } static int software_reset(struct fw_ohci *ohci) @@ -2413,7 +2415,7 @@ static int ohci_set_config_rom(struct fw_card *card, * then set up the real values for the two registers. * * We use ohci->lock to avoid racing with the code that sets - * ohci->next_config_rom to NULL (see bus_reset_work). + * ohci->next_config_rom to NULL (see handle_selfid_complete_event). */ next_config_rom = dmam_alloc_coherent(ohci->card.device, CONFIG_ROM_SIZE, @@ -3620,7 +3622,9 @@ static int pci_probe(struct pci_dev *dev, goto fail_msi; } - err = request_threaded_irq(irq, irq_handler, NULL, + // IRQF_ONESHOT is not applied so that other events are still handled in the hardIRQ + // handler while the threaded IRQ handler runs for the SelfIDComplete event. + err = request_threaded_irq(irq, irq_handler, handle_selfid_complete_event, pci_dev_msi_enabled(dev) ? 0 : IRQF_SHARED, ohci_driver_name, ohci); if (err < 0) {
From a901f493d06631091bf1f644fdbb5cb4f566645d Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sat, 23 Aug 2025 12:09:54 +0900 Subject: [PATCH 0464/3206] firewire: ohci: remove module-local workqueue Now that the module-local workqueue has been replaced by a threaded IRQ handler, this commit removes the workqueue and the associated work item accordingly. Link: https://lore.kernel.org/r/20250823030954.268412-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/ohci.c | 11 ----------- 1 file changed, 11 deletions(-)
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 5b16286280e00e..40851b12061513 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -228,13 +228,10 @@ struct fw_ohci { __le32 *self_id; dma_addr_t self_id_bus; - struct work_struct bus_reset_work; u32 self_id_buffer[512]; }; -static struct workqueue_struct *selfid_workqueue; - static inline struct fw_ohci *fw_ohci(struct fw_card *card) { return container_of(card, struct fw_ohci, card); @@ -3512,8 +3509,6 @@ static int pci_probe(struct pci_dev *dev, spin_lock_init(&ohci->lock); mutex_init(&ohci->phy_reg_mutex); - INIT_WORK(&ohci->bus_reset_work, bus_reset_work); - if (!(pci_resource_flags(dev, 0) & IORESOURCE_MEM) || pci_resource_len(dev, 0) < OHCI1394_REGISTER_SIZE) { ohci_err(ohci, "invalid MMIO resource\n"); @@ -3668,7 +3663,6 @@ static void pci_remove(struct pci_dev *dev) reg_write(ohci, OHCI1394_IntMaskClear, ~0); flush_writes(ohci); } - cancel_work_sync(&ohci->bus_reset_work); fw_core_remove_card(&ohci->card); /* @@ -3741,17 +3735,12 @@ static struct pci_driver fw_ohci_pci_driver = { static int __init fw_ohci_init(void) { - selfid_workqueue = alloc_workqueue(KBUILD_MODNAME, WQ_MEM_RECLAIM, 0); - if (!selfid_workqueue) - return -ENOMEM; - return pci_register_driver(&fw_ohci_pci_driver); } static void __exit fw_ohci_cleanup(void) { pci_unregister_driver(&fw_ohci_pci_driver); - destroy_workqueue(selfid_workqueue); } module_init(fw_ohci_init);
From 3c716487936aa54083c130d46ad5747769695e09 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 14 Aug 2025 18:59:49 +0200 Subject: [PATCH 0465/3206] 
genirq: Remove GENERIC_IRQ_LEGACY IA64 is gone and with it the last GENERIC_IRQ_LEGACY user. Remove GENERIC_IRQ_LEGACY. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250814165949.hvtP03r4@linutronix.de --- include/linux/irq.h | 4 ---- kernel/irq/Kconfig | 4 ---- kernel/irq/irqdesc.c | 7 ------- 3 files changed, 15 deletions(-)
diff --git a/include/linux/irq.h b/include/linux/irq.h index 1d6b606a81efe5..c9bcdbf6bc6358 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -976,10 +976,6 @@ static inline void irq_free_desc(unsigned int irq) irq_free_descs(irq, 1); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq); -#endif - /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1da5e9d9da7193..36673640c4fc25 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -6,10 +6,6 @@ menu "IRQ subsystem" config MAY_HAVE_SPARSE_IRQ bool -# Legacy support, required for itanic -config GENERIC_IRQ_LEGACY - bool - # Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index b64c57b44c2037..db714d3014b5f7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -653,13 +653,6 @@ void irq_mark_irq(unsigned int irq) irq_insert_desc(irq, irq_desc + irq); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq) -{ - free_desc(irq); -} -#endif - #endif /* !CONFIG_SPARSE_IRQ */ int handle_irq_desc(struct irq_desc *desc)
From 7a721a2fee2bce01af26699a87739db8ca8ea3c8 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 14 Aug 2025 07:28:31 +0800 Subject: [PATCH 0466/3206] genirq: Add irq_chip_(startup/shutdown)_parent() As the MSI controller on SG2044 uses the PLIC as the underlying interrupt controller, it needs to call irq_enable() and irq_disable() to start up and shut down interrupts. Otherwise, the MSI interrupt cannot be started up correctly and will not respond to any incoming interrupts. Introduce irq_chip_startup_parent() and irq_chip_shutdown_parent() to allow the interrupt controller to call the irq_startup()/irq_shutdown() callbacks of the parent interrupt chip. In case the irq_startup()/irq_shutdown() callbacks are not implemented for the parent interrupt chip, this falls back to irq_chip_enable_parent() or irq_chip_disable_parent(). 
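A sketch of how a hierarchical irqchip might use the new helpers; the my_msi_* names are hypothetical, mirroring what the PCI/MSI patch below does:

  static unsigned int my_msi_irq_startup(struct irq_data *data)
  {
          /* Runs the parent's irq_startup() (e.g. the PLIC's, which also
           * applies the affinity setting), or falls back to enabling it. */
          unsigned int ret = irq_chip_startup_parent(data);

          my_msi_unmask(data);            /* then unmask at this level */
          return ret;
  }

  static void my_msi_irq_shutdown(struct irq_data *data)
  {
          my_msi_mask(data);              /* mask at this level first */
          irq_chip_shutdown_parent(data); /* then shut down the parent */
  }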
Suggested-by: Thomas Gleixner Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Tested-by: Chen Wang # Pioneerbox Reviewed-by: Chen Wang Link: https://lore.kernel.org/all/20250813232835.43458-2-inochiama@gmail.com Link: https://lore.kernel.org/lkml/20250722224513.22125-1-inochiama@gmail.com/ --- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+)
diff --git a/include/linux/irq.h b/include/linux/irq.h index c9bcdbf6bc6358..c67e76fbcc0775 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -669,6 +669,8 @@ extern int irq_chip_set_parent_state(struct irq_data *data, extern int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); +extern void irq_chip_shutdown_parent(struct irq_data *data); +extern unsigned int irq_chip_startup_parent(struct irq_data *data); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0d0276378c707c..3ffa0d80ddd19c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1259,6 +1259,43 @@ int irq_chip_get_parent_state(struct irq_data *data, } EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); +/** + * irq_chip_shutdown_parent - Shutdown the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_shutdown() callback of the parent if available or falls + * back to irq_chip_disable_parent(). + */ +void irq_chip_shutdown_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_shutdown) + parent->chip->irq_shutdown(parent); + else + irq_chip_disable_parent(data); +} +EXPORT_SYMBOL_GPL(irq_chip_shutdown_parent); + +/** + * irq_chip_startup_parent - Startup the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_startup() callback of the parent if available or falls + * back to irq_chip_enable_parent(). + */ +unsigned int irq_chip_startup_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_startup) + return parent->chip->irq_startup(parent); + + irq_chip_enable_parent(data); + return 0; +} +EXPORT_SYMBOL_GPL(irq_chip_startup_parent); + /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL)
From 54f45a30c0d0153d2be091ba2d683ab6db6d1d5b Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 14 Aug 2025 07:28:32 +0800 Subject: [PATCH 0467/3206] PCI/MSI: Add startup/shutdown for per device domains As the RISC-V PLIC cannot apply affinity settings without invoking irq_enable(), it will make the interrupt unavailable when used as an underlying interrupt chip for the MSI controller. Implement the irq_startup() and irq_shutdown() callbacks for the PCI MSI and MSI-X templates. For chips that specify MSI_FLAG_PCI_MSI_STARTUP_PARENT, the parent startup and shutdown functions are invoked. That allows the interrupt on the parent chip to be enabled if the interrupt has not been enabled during allocation. This is necessary for MSI controllers which use the PLIC as the underlying parent interrupt chip. 
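The resulting MSI-X startup path, when the parent domain opts in with MSI_FLAG_PCI_MSI_STARTUP_PARENT (a simplified trace of the code added below):

  /* pci_irq_startup_msix(data)
   *     cond_startup_parent(data)            -- no-op unless the flag is set
   *         irq_chip_startup_parent(data)
   *             parent->chip->irq_startup()  -- e.g. PLIC: enable + affinity
   *     pci_msix_unmask(desc)                -- unmask at the PCI level last
   */

Shutdown runs the mirror image: mask at the PCI level first, then irq_chip_shutdown_parent().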
Suggested-by: Thomas Gleixner Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Tested-by: Chen Wang # Pioneerbox Reviewed-by: Chen Wang Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/all/20250813232835.43458-3-inochiama@gmail.com --- drivers/pci/msi/irqdomain.c | 52 +++++++++++++++++++++++++++++++++++++ include/linux/msi.h | 2 ++ 2 files changed, 54 insertions(+) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 0938ef7ebabf2d..e0a800f918e812 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -148,6 +148,23 @@ static void pci_device_domain_set_desc(msi_alloc_info_t *arg, struct msi_desc *d arg->hwirq = desc->msi_index; } +static void cond_shutdown_parent(struct irq_data *data) +{ + struct msi_domain_info *info = data->domain->host_data; + + if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) + irq_chip_shutdown_parent(data); +} + +static unsigned int cond_startup_parent(struct irq_data *data) +{ + struct msi_domain_info *info = data->domain->host_data; + + if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) + return irq_chip_startup_parent(data); + return 0; +} + static __always_inline void cond_mask_parent(struct irq_data *data) { struct msi_domain_info *info = data->domain->host_data; @@ -164,6 +181,23 @@ static __always_inline void cond_unmask_parent(struct irq_data *data) irq_chip_unmask_parent(data); } +static void pci_irq_shutdown_msi(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + + pci_msi_mask(desc, BIT(data->irq - desc->irq)); + cond_shutdown_parent(data); +} + +static unsigned int pci_irq_startup_msi(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + unsigned int ret = cond_startup_parent(data); + + pci_msi_unmask(desc, BIT(data->irq - desc->irq)); + return ret; +} + static void pci_irq_mask_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); @@ -194,6 +228,8 @@ static void pci_irq_unmask_msi(struct irq_data *data) static const struct msi_domain_template pci_msi_template = { .chip = { .name = "PCI-MSI", + .irq_startup = pci_irq_startup_msi, + .irq_shutdown = pci_irq_shutdown_msi, .irq_mask = pci_irq_mask_msi, .irq_unmask = pci_irq_unmask_msi, .irq_write_msi_msg = pci_msi_domain_write_msg, @@ -210,6 +246,20 @@ static const struct msi_domain_template pci_msi_template = { }, }; +static void pci_irq_shutdown_msix(struct irq_data *data) +{ + pci_msix_mask(irq_data_get_msi_desc(data)); + cond_shutdown_parent(data); +} + +static unsigned int pci_irq_startup_msix(struct irq_data *data) +{ + unsigned int ret = cond_startup_parent(data); + + pci_msix_unmask(irq_data_get_msi_desc(data)); + return ret; +} + static void pci_irq_mask_msix(struct irq_data *data) { pci_msix_mask(irq_data_get_msi_desc(data)); @@ -234,6 +284,8 @@ EXPORT_SYMBOL_GPL(pci_msix_prepare_desc); static const struct msi_domain_template pci_msix_template = { .chip = { .name = "PCI-MSIX", + .irq_startup = pci_irq_startup_msix, + .irq_shutdown = pci_irq_shutdown_msix, .irq_mask = pci_irq_mask_msix, .irq_unmask = pci_irq_unmask_msix, .irq_write_msi_msg = pci_msi_domain_write_msg, diff --git a/include/linux/msi.h b/include/linux/msi.h index e5e86a8529fb6f..3111ba95fbde49 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -568,6 +568,8 @@ enum { MSI_FLAG_PARENT_PM_DEV = (1 << 8), /* Support for parent mask/unmask */ MSI_FLAG_PCI_MSI_MASK_PARENT = (1 << 9), + /* Support for parent startup/shutdown */ + MSI_FLAG_PCI_MSI_STARTUP_PARENT 
= (1 << 10), /* Mask for the generic functionality */ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), From 9d8c41816bac518b4824f83b346ae30a1be83f68 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 14 Aug 2025 07:28:33 +0800 Subject: [PATCH 0468/3206] irqchip/sg2042-msi: Fix broken affinity setting When using NVME on SG2044, the NVME driver always complains about "I/O tag XXX (XXX) QID XX timeout, completion polled", which is caused by the broken affinity setting mechanism of the sg2042-msi driver. The PLIC driver can only set the affinity when enabled, but the sg2042-msi driver invokes the affinity setter in disabled state, which causes the change to be lost. Cure this by implementing the irq_startup()/shutdown() callbacks, which allow the underlying PLIC to be started up (enabled) first. Fixes: e96b93a97c90 ("irqchip/sg2042-msi: Add the Sophgo SG2044 MSI interrupt controller") Reported-by: Han Gao Suggested-by: Thomas Gleixner Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Tested-by: Chen Wang # Pioneerbox Reviewed-by: Chen Wang Link: https://lore.kernel.org/all/20250813232835.43458-4-inochiama@gmail.com --- drivers/irqchip/irq-sg2042-msi.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-sg2042-msi.c b/drivers/irqchip/irq-sg2042-msi.c index bcfddc51bc6a18..2fd4d94f9bd76d 100644 --- a/drivers/irqchip/irq-sg2042-msi.c +++ b/drivers/irqchip/irq-sg2042-msi.c @@ -85,6 +85,8 @@ static void sg2042_msi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *m static const struct irq_chip sg2042_msi_middle_irq_chip = { .name = "SG2042 MSI", + .irq_startup = irq_chip_startup_parent, + .irq_shutdown = irq_chip_shutdown_parent, .irq_ack = sg2042_msi_irq_ack, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, @@ -114,6 +116,8 @@ static void sg2044_msi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *m static struct irq_chip sg2044_msi_middle_irq_chip = { .name = "SG2044 MSI", + .irq_startup = irq_chip_startup_parent, + .irq_shutdown = irq_chip_shutdown_parent, .irq_ack = sg2044_msi_irq_ack, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, @@ -185,8 +189,10 @@ static const struct irq_domain_ops sg204x_msi_middle_domain_ops = { .select = msi_lib_irq_domain_select, }; -#define SG2042_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ - MSI_FLAG_USE_DEF_CHIP_OPS) +#define SG2042_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT | \ + MSI_FLAG_PCI_MSI_STARTUP_PARENT) #define SG2042_MSI_FLAGS_SUPPORTED MSI_GENERIC_FLAGS_MASK @@ -200,10 +206,12 @@ static const struct msi_parent_ops sg2042_msi_parent_ops = { .init_dev_msi_info = msi_lib_init_dev_msi_info, }; -#define SG2044_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ - MSI_FLAG_USE_DEF_CHIP_OPS) +#define SG2044_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT | \ + MSI_FLAG_PCI_MSI_STARTUP_PARENT) -#define SG2044_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ +#define SG2044_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ MSI_FLAG_PCI_MSIX) static const struct msi_parent_ops sg2044_msi_parent_ops = { From 7ee4a5a2ec3748facfb4ca96e4cce6cabbdecab2 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 14 Aug 2025 07:28:34 +0800 Subject: [PATCH 0469/3206] irqchip/sg2042-msi: Set MSI_FLAG_MULTI_PCI_MSI flags for SG2044 The MSI controller on SG2044 has the ability to allocate multiple PCI MSI interrupts.
So the PCIe controller driver can use this feature if the hardware supports multiple PCI MSI interrupts. Add the MSI_FLAG_MULTI_PCI_MSI flag to the supported_flags of SG2044 msi_parent_ops to enable this functionality. Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Tested-by: Chen Wang # Pioneerbox Reviewed-by: Chen Wang Link: https://lore.kernel.org/all/20250813232835.43458-5-inochiama@gmail.com --- drivers/irqchip/irq-sg2042-msi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/irq-sg2042-msi.c b/drivers/irqchip/irq-sg2042-msi.c index 2fd4d94f9bd76d..3b13dbbfdb51c0 100644 --- a/drivers/irqchip/irq-sg2042-msi.c +++ b/drivers/irqchip/irq-sg2042-msi.c @@ -212,6 +212,7 @@ static const struct msi_parent_ops sg2042_msi_parent_ops = { MSI_FLAG_PCI_MSI_STARTUP_PARENT) #define SG2044_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ + MSI_FLAG_MULTI_PCI_MSI | \ MSI_FLAG_PCI_MSIX) static const struct msi_parent_ops sg2044_msi_parent_ops = { From b92ff23b12046f70f7f41f1e57e77c498dec35d7 Mon Sep 17 00:00:00 2001 From: Fushuai Wang Date: Mon, 11 Aug 2025 14:47:01 +0800 Subject: [PATCH 0470/3206] irqchip/sifive-plic: Use for_each_present_cpu() instead of for_each_cpu() Replace the open coded for_each_cpu(cpu, cpu_present_mask) loop with the more readable and equivalent for_each_present_cpu(cpu) macro. Signed-off-by: Fushuai Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250811064701.2906-1-wangfushuai@baidu.com --- drivers/irqchip/irq-sifive-plic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index bf69a4802b71e7..3de54606b8f8a6 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -257,7 +257,7 @@ static int plic_irq_suspend(void) readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)); } - for_each_cpu(cpu, cpu_present_mask) { + for_each_present_cpu(cpu) { struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu); if (!handler->present) @@ -289,7 +289,7 @@ static void plic_irq_resume(void) priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID); } - for_each_cpu(cpu, cpu_present_mask) { + for_each_present_cpu(cpu) { struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu); if (!handler->present) From adecf78df945f4c7a1d29111b0002827f487df51 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Mon, 11 Aug 2025 08:26:32 +0800 Subject: [PATCH 0471/3206] irqchip/sifive-plic: Respect mask state when setting affinity plic_set_affinity() always calls plic_irq_enable(), which clears the priority setting even when the interrupt is only masked. This unmasks the interrupt unexpectedly. Replace the plic_irq_enable/disable() calls with plic_irq_toggle() to avoid changing the priority setting.
Suggested-by: Thomas Gleixner Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Tested-by: Nam Cao # VisionFive 2 Tested-by: Chen Wang # Pioneerbox Reviewed-by: Nam Cao Reviewed-by: Chen Wang Link: https://lore.kernel.org/all/20250811002633.55275-1-inochiama@gmail.com Link: https://lore.kernel.org/lkml/20250722224513.22125-1-inochiama@gmail.com/ --- drivers/irqchip/irq-sifive-plic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index 3de54606b8f8a6..559fda8fb3a86a 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -179,12 +179,14 @@ static int plic_set_affinity(struct irq_data *d, if (cpu >= nr_cpu_ids) return -EINVAL; - plic_irq_disable(d); + /* Invalidate the original routing entry */ + plic_irq_toggle(irq_data_get_effective_affinity_mask(d), d, 0); irq_data_update_effective_affinity(d, cpumask_of(cpu)); + /* Setting the new routing entry if irq is enabled */ if (!irqd_irq_disabled(d)) - plic_irq_enable(d); + plic_irq_toggle(irq_data_get_effective_affinity_mask(d), d, 1); return IRQ_SET_MASK_OK_DONE; } From 7fb83eb664e9b3a0438dd28859e9f0fd49d4c165 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 4 Aug 2025 16:19:45 +0800 Subject: [PATCH 0472/3206] irqchip/loongson-eiointc: Route interrupt parsed from bios table Interrupt controller eiointc routes interrupts to CPU interface IP0 - IP7. It is currently hard-coded that eiointc routes interrupts to the CPU starting from IP1, but it should base that decision on the parent interrupt, which is provided by ACPI or DTS. Retrieve the parent's hardware interrupt number and store it in the descriptor of the eiointc instance, so that the routing function can utilize it for the correct route settings.
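In other words, the route register value is now derived from the parent hwirq instead of being hard-coded (a simplified sketch of the computation added below; priv is the eiointc instance descriptor and i the IPMAP register index from the surrounding loop):

	/* INT_HWI0 is the hwirq of CPU pin IP0, so the difference
	 * selects the pin the parent interrupt actually uses. */
	u32 bit  = BIT(priv->parent_hwirq - INT_HWI0);
	u32 data = bit | (bit << 8) | (bit << 16) | (bit << 24);

	iocsr_write32(data, EIOINTC_REG_IPMAP + i * 4);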
[ tglx: Massaged change log ] Signed-off-by: Bibo Mao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250804081946.1456573-2-maobibo@loongson.cn --- drivers/irqchip/irq-loongson-eiointc.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c index b2860eb2d32c55..baa406904de55a 100644 --- a/drivers/irqchip/irq-loongson-eiointc.c +++ b/drivers/irqchip/irq-loongson-eiointc.c @@ -68,6 +68,7 @@ struct eiointc_priv { struct fwnode_handle *domain_handle; struct irq_domain *eiointc_domain; int flags; + irq_hw_number_t parent_hwirq; }; static struct eiointc_priv *eiointc_priv[MAX_IO_PICS]; @@ -211,7 +212,12 @@ static int eiointc_router_init(unsigned int cpu) } for (i = 0; i < eiointc_priv[0]->vec_count / 32 / 4; i++) { - bit = BIT(1 + index); /* Route to IP[1 + index] */ + /* + * Route to interrupt pin, relative offset used here + * Offset 0 means routing to IP0 and so on + * Every 32 vector routing to one interrupt pin + */ + bit = BIT(eiointc_priv[index]->parent_hwirq - INT_HWI0); data = bit | (bit << 8) | (bit << 16) | (bit << 24); iocsr_write32(data, EIOINTC_REG_IPMAP + i * 4); } @@ -495,7 +501,7 @@ int __init eiointc_acpi_init(struct irq_domain *parent, priv->vec_count = VEC_COUNT; priv->node = acpi_eiointc->node; - + priv->parent_hwirq = acpi_eiointc->cascade; parent_irq = irq_create_mapping(parent, acpi_eiointc->cascade); ret = eiointc_init(priv, parent_irq, acpi_eiointc->node_map); @@ -527,8 +533,9 @@ int __init eiointc_acpi_init(struct irq_domain *parent, static int __init eiointc_of_init(struct device_node *of_node, struct device_node *parent) { - int parent_irq, ret; struct eiointc_priv *priv; + struct irq_data *irq_data; + int parent_irq, ret; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) @@ -544,6 +551,12 @@ static int __init eiointc_of_init(struct device_node *of_node, if (ret < 0) goto out_free_priv; + irq_data = irq_get_irq_data(parent_irq); + if (!irq_data) { + ret = -ENODEV; + goto out_free_priv; + } + /* * In particular, the number of devices supported by the LS2K0500 * extended I/O interrupt vector is 128. @@ -552,7 +565,7 @@ static int __init eiointc_of_init(struct device_node *of_node, priv->vec_count = 128; else priv->vec_count = VEC_COUNT; - + priv->parent_hwirq = irqd_to_hwirq(irq_data); priv->node = 0; priv->domain_handle = of_fwnode_handle(of_node); From 8ff1c16c753e293c3ba20583cb64f81ea7b9a451 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 4 Aug 2025 16:19:46 +0800 Subject: [PATCH 0473/3206] irqchip/loongson-eiointc: Add multiple interrupt pin routing support The eiointc interrupt controller supports 256 interrupt vectors at most, and the interrupt handler gets the interrupt status from the base register group EIOINTC_REG_ISR at the interrupt specific offset. It needs to read the register group EIOINTC_REG_ISR four times to get the status of all 256 interrupt vectors. Eiointc registers including EIOINTC_REG_ISR are software-emulated for VMs, so there will be VM-exits when accessing eiointc registers. Introduce a method to make the eiointc interrupt controller route to different CPU interrupt pins for every 64 interrupt vectors. The interrupt handler can then reduce the read to one specific EIOINTC_REG_ISR register instead of all four, which reduces VM exits.
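Conceptually, each per-pin handler then only scans its own slice of the status registers (a hedged sketch; the route_info bookkeeping that provides start/end is introduced in the diff below):

	/* With multi-IP routing, start..end covers exactly one 64-bit
	 * ISR register, so one iocsr_read64() replaces the former four. */
	for (i = info->start; i < info->end; i++) {
		u64 pending = iocsr_read64(EIOINTC_REG_ISR + (i << 3));

		/* demultiplex the pending vectors of this slice only */
	}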
[ tglx: Massage change log ] Signed-off-by: Bibo Mao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250804081946.1456573-3-maobibo@loongson.cn --- drivers/irqchip/irq-loongson-eiointc.c | 88 +++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 10 deletions(-) diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c index baa406904de55a..39e5a72ccd3c83 100644 --- a/drivers/irqchip/irq-loongson-eiointc.c +++ b/drivers/irqchip/irq-loongson-eiointc.c @@ -46,6 +46,7 @@ #define EIOINTC_ALL_ENABLE_VEC_MASK(vector) (EIOINTC_ALL_ENABLE & ~BIT(vector & 0x1f)) #define EIOINTC_REG_ENABLE_VEC(vector) (EIOINTC_REG_ENABLE + ((vector >> 5) << 2)) #define EIOINTC_USE_CPU_ENCODE BIT(0) +#define EIOINTC_ROUTE_MULT_IP BIT(1) #define MAX_EIO_NODES (NR_CPUS / CORES_PER_EIO_NODE) @@ -59,6 +60,14 @@ #define EIOINTC_REG_ROUTE_VEC_MASK(vector) (0xff << EIOINTC_REG_ROUTE_VEC_SHIFT(vector)) static int nr_pics; +struct eiointc_priv; + +struct eiointc_ip_route { + struct eiointc_priv *priv; + /* Offset Routed destination IP */ + int start; + int end; +}; struct eiointc_priv { u32 node; @@ -69,6 +78,7 @@ struct eiointc_priv { struct irq_domain *eiointc_domain; int flags; irq_hw_number_t parent_hwirq; + struct eiointc_ip_route route_info[VEC_REG_COUNT]; }; static struct eiointc_priv *eiointc_priv[MAX_IO_PICS]; @@ -189,6 +199,7 @@ static int eiointc_router_init(unsigned int cpu) { int i, bit, cores, index, node; unsigned int data; + int hwirq, mask; node = cpu_to_eio_node(cpu); index = eiointc_index(node); @@ -198,6 +209,13 @@ static int eiointc_router_init(unsigned int cpu) return -EINVAL; } + /* Enable cpu interrupt pin from eiointc */ + hwirq = eiointc_priv[index]->parent_hwirq; + mask = BIT(hwirq); + if (eiointc_priv[index]->flags & EIOINTC_ROUTE_MULT_IP) + mask |= BIT(hwirq + 1) | BIT(hwirq + 2) | BIT(hwirq + 3); + set_csr_ecfg(mask); + if (!(eiointc_priv[index]->flags & EIOINTC_USE_CPU_ENCODE)) cores = CORES_PER_EIO_NODE; else @@ -215,10 +233,28 @@ static int eiointc_router_init(unsigned int cpu) /* * Route to interrupt pin, relative offset used here * Offset 0 means routing to IP0 and so on - * Every 32 vector routing to one interrupt pin + * + * If flags is set with EIOINTC_ROUTE_MULT_IP, + * every 64 vector routes to different consecutive + * IPs, otherwise all vector routes to the same IP */ - bit = BIT(eiointc_priv[index]->parent_hwirq - INT_HWI0); - data = bit | (bit << 8) | (bit << 16) | (bit << 24); + if (eiointc_priv[index]->flags & EIOINTC_ROUTE_MULT_IP) { + /* The first 64 vectors route to hwirq */ + bit = BIT(hwirq++ - INT_HWI0); + data = bit | (bit << 8); + + /* The second 64 vectors route to hwirq + 1 */ + bit = BIT(hwirq++ - INT_HWI0); + data |= (bit << 16) | (bit << 24); + + /* + * Route to hwirq + 2/hwirq + 3 separately + * in next loop + */ + } else { + bit = BIT(hwirq - INT_HWI0); + data = bit | (bit << 8) | (bit << 16) | (bit << 24); + } iocsr_write32(data, EIOINTC_REG_IPMAP + i * 4); } @@ -247,15 +283,22 @@ static int eiointc_router_init(unsigned int cpu) static void eiointc_irq_dispatch(struct irq_desc *desc) { - int i; - u64 pending; - bool handled = false; + struct eiointc_ip_route *info = irq_desc_get_handler_data(desc); struct irq_chip *chip = irq_desc_get_chip(desc); - struct eiointc_priv *priv = irq_desc_get_handler_data(desc); + bool handled = false; + u64 pending; + int i; chained_irq_enter(chip, desc); - for (i = 0; i < eiointc_priv[0]->vec_count / VEC_COUNT_PER_REG; i++) { + /* + * If EIOINTC_ROUTE_MULT_IP is set, every 
64 interrupt vectors in + * eiointc interrupt controller routes to different cpu interrupt pins + * + * Every cpu interrupt pin has its own irq handler, it is ok to + * read ISR for these 64 interrupt vectors rather than all vectors + */ + for (i = info->start; i < info->end; i++) { pending = iocsr_read64(EIOINTC_REG_ISR + (i << 3)); /* Skip handling if pending bitmap is zero */ @@ -268,7 +311,7 @@ static void eiointc_irq_dispatch(struct irq_desc *desc) int bit = __ffs(pending); int irq = bit + VEC_COUNT_PER_REG * i; - generic_handle_domain_irq(priv->eiointc_domain, irq); + generic_handle_domain_irq(info->priv->eiointc_domain, irq); pending &= ~BIT(bit); handled = true; } @@ -468,8 +511,33 @@ static int __init eiointc_init(struct eiointc_priv *priv, int parent_irq, } eiointc_priv[nr_pics++] = priv; + /* + * Only the first eiointc device on VM supports routing to + * different CPU interrupt pins. The later eiointc devices use + * generic method if there are multiple eiointc devices in future + */ + if (cpu_has_hypervisor && (nr_pics == 1)) { + priv->flags |= EIOINTC_ROUTE_MULT_IP; + priv->parent_hwirq = INT_HWI0; + } + + if (priv->flags & EIOINTC_ROUTE_MULT_IP) { + for (i = 0; i < priv->vec_count / VEC_COUNT_PER_REG; i++) { + priv->route_info[i].start = priv->parent_hwirq - INT_HWI0 + i; + priv->route_info[i].end = priv->route_info[i].start + 1; + priv->route_info[i].priv = priv; + parent_irq = get_percpu_irq(priv->parent_hwirq + i); + irq_set_chained_handler_and_data(parent_irq, eiointc_irq_dispatch, + &priv->route_info[i]); + } + } else { + priv->route_info[0].start = 0; + priv->route_info[0].end = priv->vec_count / VEC_COUNT_PER_REG; + priv->route_info[0].priv = priv; + irq_set_chained_handler_and_data(parent_irq, eiointc_irq_dispatch, + &priv->route_info[0]); + } eiointc_router_init(0); - irq_set_chained_handler_and_data(parent_irq, eiointc_irq_dispatch, priv); if (nr_pics == 1) { register_syscore_ops(&eiointc_syscore_ops); From 55b48e23f5c4b6f5ca9b7ab09599b17dcf501c10 Mon Sep 17 00:00:00 2001 From: Pan Chuang Date: Tue, 5 Aug 2025 17:29:22 +0800 Subject: [PATCH 0474/3206] genirq/devres: Add error handling in devm_request_*_irq() devm_request_threaded_irq() and devm_request_any_context_irq() currently don't print any error message when interrupt registration fails. This forces each driver to implement redundant error logging - over 2,000 lines of error messages exist across drivers. Additionally, when upper-layer functions propagate these errors without logging, critical debugging information is lost. Add a devm_request_result() helper to unify error reporting via dev_err_probe(), and use it in devm_request_threaded_irq() and devm_request_any_context_irq() to print the device name, IRQ number, handler functions, and error code automatically on failure.
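For callers this means the usual failure print can simply be dropped. A minimal usage sketch (my_probe() and my_handler() are made up for illustration):

	static int my_probe(struct platform_device *pdev)
	{
		int irq, ret;

		irq = platform_get_irq(pdev, 0);
		if (irq < 0)
			return irq;

		/* On failure this logs device, IRQ number, handlers and
		 * error code itself - no extra dev_err() at the call site. */
		ret = devm_request_threaded_irq(&pdev->dev, irq, NULL,
						my_handler, IRQF_ONESHOT,
						NULL, pdev);
		return ret;
	}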
Co-developed-by: Yangtao Li Signed-off-by: Yangtao Li Signed-off-by: Pan Chuang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250805092922.135500-2-panchuang@vivo.com --- kernel/irq/devres.c | 127 ++++++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 40 deletions(-) diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index eb16a58e0322e2..b411886986222d 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -30,29 +30,22 @@ static int devm_irq_match(struct device *dev, void *res, void *data) return this->irq == match->irq && this->dev_id == match->dev_id; } -/** - * devm_request_threaded_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @thread_fn: function to be called in a threaded interrupt context. NULL - * for devices which handle everything in @handler - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device, dev_name(dev) if NULL - * @dev_id: A cookie passed back to the handler function - * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_threaded_irq(). IRQs requested with this function will be - * automatically freed on driver detach. - * - * If an IRQ allocated with this function needs to be freed - * separately, devm_free_irq() must be used. - */ -int devm_request_threaded_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, irq_handler_t thread_fn, - unsigned long irqflags, const char *devname, - void *dev_id) +static int devm_request_result(struct device *dev, int rc, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + const char *devname) +{ + if (rc >= 0) + return rc; + + return dev_err_probe(dev, rc, "request_irq(%u) %ps %ps %s\n", + irq, handler, thread_fn, devname ? : ""); +} + +static int __devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, + irq_handler_t thread_fn, + unsigned long irqflags, + const char *devname, void *dev_id) { struct irq_devres *dr; int rc; @@ -78,28 +71,48 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, return 0; } -EXPORT_SYMBOL(devm_request_threaded_irq); /** - * devm_request_any_context_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device, dev_name(dev) if NULL - * @dev_id: A cookie passed back to the handler function + * devm_request_threaded_irq - allocate an interrupt line for a managed device with error logging + * @dev: Device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the interrupt occurs + * @thread_fn: Function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL + * @dev_id: A cookie passed back to the handler function * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_any_context_irq(). IRQs requested with this function will be - * automatically freed on driver detach. 
+ * Except for the extra @dev argument, this function takes the same + * arguments and performs the same function as request_threaded_irq(). + * Interrupts requested with this function will be automatically freed on + * driver detach. + * + * If an interrupt allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + * + * When the request fails, an error message is printed with contextual + * information (device name, interrupt number, handler functions and + * error code). Don't add extra error messages at the call sites. * - * If an IRQ allocated with this function needs to be freed - * separately, devm_free_irq() must be used. + * Return: 0 on success or a negative error number. */ -int devm_request_any_context_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) +int devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id) +{ + int rc = __devm_request_threaded_irq(dev, irq, handler, thread_fn, + irqflags, devname, dev_id); + + return devm_request_result(dev, rc, irq, handler, thread_fn, devname); +} +EXPORT_SYMBOL(devm_request_threaded_irq); + +static int __devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) { struct irq_devres *dr; int rc; @@ -124,6 +137,40 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq, return rc; } + +/** + * devm_request_any_context_irq - allocate an interrupt line for a managed device with error logging + * @dev: Device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the interrupt occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the same + * arguments and performs the same function as request_any_context_irq(). + * Interrupts requested with this function will be automatically freed on + * driver detach. + * + * If an interrupt allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + * + * When the request fails, an error message is printed with contextual + * information (device name, interrupt number, handler functions and + * error code). Don't add extra error messages at the call sites. + * + * Return: IRQC_IS_HARDIRQ or IRQC_IS_NESTED on success, or a negative error + * number. + */ +int devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + int rc = __devm_request_any_context_irq(dev, irq, handler, irqflags, + devname, dev_id); + + return devm_request_result(dev, rc, irq, handler, NULL, devname); +} EXPORT_SYMBOL(devm_request_any_context_irq); /** From 17d5efcbfe6f3da23afb79d84c27cefb2b3f331a Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 26 Jul 2025 20:07:50 +0200 Subject: [PATCH 0475/3206] rust: kernel: remove support for unused host `#[test]`s Since commit 028df914e546 ("rust: str: convert `rusttest` tests into KUnit"), we no longer have host `#[test]`s that run on the host.
Moreover, we do not plan to add any new ones -- tests should generally run within KUnit, since there they are built the same way the kernel does. While we may want to have some way to define tests that can also be run outside the kernel, we still want to test within the kernel too [1], and thus would likely use a custom syntax anyway to define them. Thus simplify the `rusttest` target by removing support for host `#[test]`s for the `kernel` crate. This still maintains the support for the `macros` crate, even though we do not have any such tests there. Link: https://lore.kernel.org/rust-for-linux/CABVgOS=AKHSfifp0S68K3jgNZAkALBr=7iFb=niryG5WDxjSrg@mail.gmail.com/ [1] Signed-off-by: Miguel Ojeda Reviewed-by: Danilo Krummrich Reviewed-by: David Gow Link: https://lore.kernel.org/r/20250726180750.2735836-1-ojeda@kernel.org Signed-off-by: Danilo Krummrich --- rust/Makefile | 9 +-------- rust/kernel/alloc.rs | 6 +++--- rust/kernel/error.rs | 4 ++-- rust/kernel/lib.rs | 2 +- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/rust/Makefile b/rust/Makefile index 4263462b84709f..a934946bbf2137 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -238,7 +238,7 @@ quiet_cmd_rustc_test = $(RUSTC_OR_CLIPPY_QUIET) T $< $(objtree)/$(obj)/test/$(subst rusttest-,,$@) $(rust_test_quiet) \ $(rustc_test_run_flags) -rusttest: rusttest-macros rusttest-kernel +rusttest: rusttest-macros rusttest-macros: private rustc_target_flags = --extern proc_macro \ --extern macros --extern kernel --extern pin_init @@ -248,13 +248,6 @@ rusttest-macros: $(src)/macros/lib.rs \ +$(call if_changed,rustc_test) +$(call if_changed,rustdoc_test) -rusttest-kernel: private rustc_target_flags = --extern ffi --extern pin_init \ - --extern build_error --extern macros --extern bindings --extern uapi -rusttest-kernel: $(src)/kernel/lib.rs rusttestlib-ffi rusttestlib-kernel \ - rusttestlib-build_error rusttestlib-macros rusttestlib-bindings \ - rusttestlib-uapi rusttestlib-pin_init FORCE - +$(call if_changed,rustc_test) - ifdef CONFIG_CC_IS_CLANG bindgen_c_flags = $(c_flags) else diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index 907301334d8c11..25a2df50f59a89 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -2,16 +2,16 @@ //! Implementation of the kernel's memory allocation infrastructure. -#[cfg(not(any(test, testlib)))] +#[cfg(not(testlib))] pub mod allocator; pub mod kbox; pub mod kvec; pub mod layout; -#[cfg(any(test, testlib))] +#[cfg(testlib)] pub mod allocator_test; -#[cfg(any(test, testlib))] +#[cfg(testlib)] pub use self::allocator_test as allocator; pub use self::kbox::Box; diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs index a41de293dcd11b..67da2d118e656f 100644 --- a/rust/kernel/error.rs +++ b/rust/kernel/error.rs @@ -158,7 +158,7 @@ impl Error { } /// Returns a string representing the error, if one exists. - #[cfg(not(any(test, testlib)))] + #[cfg(not(testlib))] pub fn name(&self) -> Option<&'static CStr> { // SAFETY: Just an FFI call, there are no extra safety requirements. let ptr = unsafe { bindings::errname(-self.0.get()) }; @@ -175,7 +175,7 @@ impl Error { /// When `testlib` is configured, this always returns `None` to avoid the dependency on a /// kernel function so that tests that use this (e.g., by calling [`Result::unwrap`]) can still /// run in userspace. 
- #[cfg(any(test, testlib))] + #[cfg(testlib)] pub fn name(&self) -> Option<&'static CStr> { None } diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index ed53169e795c0b..821e74eee1bb06 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -206,7 +206,7 @@ impl ThisModule { } } -#[cfg(not(any(testlib, test)))] +#[cfg(not(testlib))] #[panic_handler] fn panic(info: &core::panic::PanicInfo<'_>) -> ! { pr_emerg!("{}\n", info); From fe927defbb4f31c15a52f0372d7f5d608f161086 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 16 Aug 2025 23:19:00 +0200 Subject: [PATCH 0476/3206] rust: alloc: remove `allocator_test` Given we do not have tests that rely on it anymore, remove `allocator_test`, which reduces the complexity of the build. In particular, it avoids potential issues with `rusttest`, such as the one fixed at [1], where a public function was added to `Kmalloc` and used elsewhere, but it was not added to `Cmalloc`; or trivial issues like a missing import [2] due to not many people testing that target. The only downside is that we cannot use it in the `macros`' crate examples anymore, but we did not feel a need for that so far, and anyway we could support that by running those within the kernel too, which we may do regardless. Link: https://lore.kernel.org/rust-for-linux/20250816204215.2719559-1-ojeda@kernel.org/ [1] Link: https://lore.kernel.org/rust-for-linux/20250816210214.2729269-1-ojeda@kernel.org/ [2] Signed-off-by: Miguel Ojeda Link: https://lore.kernel.org/r/20250816211900.2731720-1-ojeda@kernel.org Signed-off-by: Danilo Krummrich --- rust/kernel/alloc.rs | 7 -- rust/kernel/alloc/allocator_test.rs | 113 ---------------------------- 2 files changed, 120 deletions(-) delete mode 100644 rust/kernel/alloc/allocator_test.rs diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index 25a2df50f59a89..9c154209423cb2 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -2,18 +2,11 @@ //! Implementation of the kernel's memory allocation infrastructure. -#[cfg(not(testlib))] pub mod allocator; pub mod kbox; pub mod kvec; pub mod layout; -#[cfg(testlib)] -pub mod allocator_test; - -#[cfg(testlib)] -pub use self::allocator_test as allocator; - pub use self::kbox::Box; pub use self::kbox::KBox; pub use self::kbox::KVBox; diff --git a/rust/kernel/alloc/allocator_test.rs b/rust/kernel/alloc/allocator_test.rs deleted file mode 100644 index a3074480bd8d74..00000000000000 --- a/rust/kernel/alloc/allocator_test.rs +++ /dev/null @@ -1,113 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -//! So far the kernel's `Box` and `Vec` types can't be used by userspace test cases, since all users -//! of those types (e.g. `CString`) use kernel allocators for instantiation. -//! -//! In order to allow userspace test cases to make use of such types as well, implement the -//! `Cmalloc` allocator within the `allocator_test` module and type alias all kernel allocators to -//! `Cmalloc`. The `Cmalloc` allocator uses libc's `realloc()` function as allocator backend. - -#![allow(missing_docs)] - -use super::{flags::*, AllocError, Allocator, Flags}; -use core::alloc::Layout; -use core::cmp; -use core::ptr; -use core::ptr::NonNull; - -/// The userspace allocator based on libc.
-pub struct Cmalloc; - -pub type Kmalloc = Cmalloc; -pub type Vmalloc = Kmalloc; -pub type KVmalloc = Kmalloc; - -extern "C" { - #[link_name = "aligned_alloc"] - fn libc_aligned_alloc(align: usize, size: usize) -> *mut crate::ffi::c_void; - - #[link_name = "free"] - fn libc_free(ptr: *mut crate::ffi::c_void); -} - -// SAFETY: -// - memory remains valid until it is explicitly freed, -// - passing a pointer to a valid memory allocation created by this `Allocator` is always OK, -// - `realloc` provides the guarantees as provided in the `# Guarantees` section. -unsafe impl Allocator for Cmalloc { - unsafe fn realloc( - ptr: Option<NonNull<[u8]>>, - layout: Layout, - old_layout: Layout, - flags: Flags, - ) -> Result<NonNull<[u8]>, AllocError> { - let src = match ptr { - Some(src) => { - if old_layout.size() == 0 { - ptr::null_mut() - } else { - src.as_ptr() - } - } - None => ptr::null_mut(), - }; - - if layout.size() == 0 { - // SAFETY: `src` is either NULL or was previously allocated with this `Allocator` - unsafe { libc_free(src.cast()) }; - - return Ok(NonNull::slice_from_raw_parts( - crate::alloc::dangling_from_layout(layout), - 0, - )); - } - - // ISO C (ISO/IEC 9899:2011) defines `aligned_alloc`: - // - // > The value of alignment shall be a valid alignment supported by the implementation - // [...]. - // - // As an example of the "supported by the implementation" requirement, POSIX.1-2001 (IEEE - // 1003.1-2001) defines `posix_memalign`: - // - // > The value of alignment shall be a power of two multiple of sizeof (void *). - // - // and POSIX-based implementations of `aligned_alloc` inherit this requirement. At the time - // of writing, this is known to be the case on macOS (but not in glibc). - // - // Satisfy the stricter requirement to avoid spurious test failures on some platforms. - let min_align = core::mem::size_of::<*const crate::ffi::c_void>(); - let layout = layout.align_to(min_align).map_err(|_| AllocError)?; - let layout = layout.pad_to_align(); - - // SAFETY: Returns either NULL or a pointer to a memory allocation that satisfies or - // exceeds the given size and alignment requirements. - let dst = unsafe { libc_aligned_alloc(layout.align(), layout.size()) }.cast::<u8>(); - let dst = NonNull::new(dst).ok_or(AllocError)?; - - if flags.contains(__GFP_ZERO) { - // SAFETY: The preceding calls to `libc_aligned_alloc` and `NonNull::new` - // guarantee that `dst` points to memory of at least `layout.size()` bytes. - unsafe { dst.as_ptr().write_bytes(0, layout.size()) }; - } - - if !src.is_null() { - // SAFETY: - // - `src` has previously been allocated with this `Allocator`; `dst` has just been - // newly allocated, hence the memory regions do not overlap. - // - both `src` and `dst` are properly aligned and valid for reads and writes - unsafe { - ptr::copy_nonoverlapping( - src, - dst.as_ptr(), - cmp::min(layout.size(), old_layout.size()), - ) - }; - } - - // SAFETY: `src` is either NULL or was previously allocated with this `Allocator` - unsafe { libc_free(src.cast()) }; - - Ok(NonNull::slice_from_raw_parts(dst, layout.size())) - } -} From 1b558e14f3c17dc29ce2e8cd0b8bd385e108734b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Jul 2025 16:12:19 +0200 Subject: [PATCH 0477/3206] x86/apic: Make the ISR clearing sane apic_pending_intr_clear() is fundamentally voodoo programming. Its primary purpose is to clear stale ISR bits in the local APIC, which would otherwise lock the corresponding interrupt priority level.
The comments and the implementation claim falsely that after clearing the stale ISR bits, eventually stale IRR bits would be turned into ISR bits and can be cleared as well. That's just wishful thinking because: 1) If interrupts are disabled, the APIC does not propagate an IRR bit to the ISR. 2) If interrupts are enabled, then the APIC propagates the IRR bit to the ISR and raises the interrupt in the CPU, which means that code _cannot_ observe the ISR bit for any of those IRR bits. Rename the function to reflect the purpose and make exactly _one_ attempt to EOI the pending ISR bits and add comments why traversing the pending bit map in low to high priority order is correct. Instead of trying to "clear" IRR bits, simply print a warning message when the IRR is not empty. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/871ppwih4s.ffs@tglx --- arch/x86/kernel/apic/apic.c | 77 ++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d73ba5a7b623d4..ff4029b57e9567 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1428,63 +1428,61 @@ union apic_ir { u32 regs[APIC_IR_REGS]; }; -static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) +static bool apic_check_and_eoi_isr(union apic_ir *isr) { int i, bit; - /* Read the IRRs */ - for (i = 0; i < APIC_IR_REGS; i++) - irr->regs[i] = apic_read(APIC_IRR + i * 0x10); - /* Read the ISRs */ for (i = 0; i < APIC_IR_REGS; i++) isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + /* If the ISR map empty, nothing to do here. */ + if (bitmap_empty(isr->map, APIC_IR_BITS)) + return true; + /* - * If the ISR map is not empty. ACK the APIC and run another round - * to verify whether a pending IRR has been unblocked and turned - * into a ISR. + * There can be multiple ISR bits set when a high priority + * interrupt preempted a lower priority one. Issue an EOI for each + * set bit. The priority traversal order does not matter as there + * can't be new ISR bits raised at this point. What matters is that + * an EOI is issued for each ISR bit. */ - if (!bitmap_empty(isr->map, APIC_IR_BITS)) { - /* - * There can be multiple ISR bits set when a high priority - * interrupt preempted a lower priority one. Issue an ACK - * per set bit. - */ - for_each_set_bit(bit, isr->map, APIC_IR_BITS) - apic_eoi(); - return true; - } + for_each_set_bit(bit, isr->map, APIC_IR_BITS) + apic_eoi(); - return !bitmap_empty(irr->map, APIC_IR_BITS); + /* Reread the ISRs, they should be empty now */ + for (i = 0; i < APIC_IR_REGS; i++) + isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + + return bitmap_empty(isr->map, APIC_IR_BITS); } /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. + * If a CPU services an interrupt and crashes before issuing EOI to the + * local APIC, the corresponding ISR bit is still set when the crashing CPU + * jumps into a crash kernel. Read the ISR and issue an EOI for each set + * bit to acknowledge it as otherwise these slots would be locked forever + * waiting for an EOI. * - * Most probably by now the CPU has serviced that pending interrupt and it - * might not have done the apic_eoi() because it thought, interrupt - * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serviced the interrupt. 
Hence - * a vector might get locked. It was noticed for timer irq (vector - * 0x31). Issue an extra EOI to clear ISR. + * If there are pending bits in the IRR, then they won't be converted into + * ISR bits as the CPU has interrupts disabled. They will be delivered once + * the CPU enables interrupts and there is nothing which can prevent that. * - * If there are pending IRR bits they turn into ISR bits after a higher - * priority ISR bit has been acked. + * In the worst case this results in spurious interrupt warnings. */ -static void apic_pending_intr_clear(void) +static void apic_clear_isr(void) { - union apic_ir irr, isr; + union apic_ir ir; unsigned int i; - /* 512 loops are way oversized and give the APIC a chance to obey. */ - for (i = 0; i < 512; i++) { - if (!apic_check_and_ack(&irr, &isr)) - return; - } - /* Dump the IRR/ISR content if that failed */ - pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map); + if (!apic_check_and_eoi_isr(&ir)) + pr_warn("APIC: Stale ISR: %256pb\n", ir.map); + + for (i = 0; i < APIC_IR_REGS; i++) + ir.regs[i] = apic_read(APIC_IRR + i * 0x10); + + if (!bitmap_empty(ir.map, APIC_IR_BITS)) + pr_warn("APIC: Stale IRR: %256pb\n", ir.map); } /** @@ -1541,8 +1539,7 @@ static void setup_local_APIC(void) value |= 0x10; apic_write(APIC_TASKPRI, value); - /* Clear eventually stale ISR/IRR bits */ - apic_pending_intr_clear(); + apic_clear_isr(); /* * Now that we are all set up, enable the APIC From 894751730a0d1674e43167b187e6605371667c8b Mon Sep 17 00:00:00 2001 From: Mohammad Rafi Shaik Date: Thu, 21 Aug 2025 10:19:07 +0530 Subject: [PATCH 0478/3206] dt-bindings: pinctrl: qcom,sc7280-lpass-lpi-pinctrl: Document the clock property Document the clock property in sc7280 LPASS LPI pinctrl node. Clock settings required for Audioreach solution. The existing SC7280 platform only supports non-ADSP audio solutions. To enable audio functionality on ADSP with the AudioReach solution. additional core and audio hardware clocks must be configured. Without these clocks, the ADSP will crash. 
Signed-off-by: Mohammad Rafi Shaik Co-developed-by: Prasad Kumpatla Signed-off-by: Prasad Kumpatla Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/20250821044914.710044-3-quic_pkumpatl@quicinc.com Signed-off-by: Linus Walleij --- .../pinctrl/qcom,sc7280-lpass-lpi-pinctrl.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-lpass-lpi-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-lpass-lpi-pinctrl.yaml index 08801cc4e476ff..bc7b8dda883765 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-lpass-lpi-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-lpass-lpi-pinctrl.yaml @@ -20,6 +20,16 @@ properties: reg: maxItems: 2 + clocks: + items: + - description: LPASS Core voting clock + - description: LPASS Audio voting clock + + clock-names: + items: + - const: core + - const: audio + patternProperties: "-state$": oneOf: @@ -70,10 +80,16 @@ unevaluatedProperties: false examples: - | + #include lpass_tlmm: pinctrl@33c0000 { compatible = "qcom,sc7280-lpass-lpi-pinctrl"; reg = <0x33c0000 0x20000>, <0x3550000 0x10000>; + + clocks = <&q6prmcc LPASS_HW_MACRO_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>, + <&q6prmcc LPASS_HW_DCODEC_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>; + clock-names = "core", "audio"; + gpio-controller; #gpio-cells = <2>; gpio-ranges = <&lpass_tlmm 0 0 15>; From 6e376f245f19feeadddafb2c3fa5fbd6469ecdfe Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:42 +0200 Subject: [PATCH 0479/3206] gpio: generic: provide to_gpio_generic_chip() Provide a helper that converts a struct gpio_chip address to the struct gpio_generic_chip that wraps it. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-1-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/generic.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index f3a8db4598bb59..5a85ecbef8d234 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -55,6 +55,12 @@ struct gpio_generic_chip { struct gpio_chip gc; }; +static inline struct gpio_generic_chip * +to_gpio_generic_chip(struct gpio_chip *gc) +{ + return container_of(gc, struct gpio_generic_chip, gc); +} + /** * gpio_generic_chip_init() - Initialize a generic GPIO chip. * @chip: Generic GPIO chip to set up. From 16397871b6e35fa46a2bec27b3558f93b050c6fc Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:43 +0200 Subject: [PATCH 0480/3206] gpio: generic: provide helpers for reading and writing registers Provide helpers wrapping the read_reg() and write_reg() callbacks of the generic GPIO API that are called directly by many users. This is done to hide their implementation ahead of moving them into the separate generic GPIO struct.
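A typical use of the new helper is recovering driver data from a callback that only receives the embedded struct gpio_chip (a minimal sketch; struct my_gpio is hypothetical, modelled on the driver conversions later in this series):

	struct my_gpio {
		struct gpio_generic_chip chip;
		void __iomem *regs;
	};

	static struct my_gpio *to_my_gpio(struct gpio_chip *gc)
	{
		/* The helper undoes the inner embedding; container_of()
		 * then recovers the driver structure around it. */
		return container_of(to_gpio_generic_chip(gc),
				    struct my_gpio, chip);
	}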
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-2-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/generic.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index 5a85ecbef8d234..4c0626b53ec903 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -100,6 +100,37 @@ gpio_generic_chip_set(struct gpio_generic_chip *chip, unsigned int offset, return chip->gc.set(&chip->gc, offset, value); } +/** + * gpio_generic_read_reg() - Read a register using the underlying callback. + * @chip: Generic GPIO chip to use. + * @reg: Register to read. + * + * Returns: value read from register. + */ +static inline unsigned long +gpio_generic_read_reg(struct gpio_generic_chip *chip, void __iomem *reg) +{ + if (WARN_ON(!chip->gc.read_reg)) + return 0; + + return chip->gc.read_reg(reg); +} + +/** + * gpio_generic_write_reg() - Write a register using the underlying callback. + * @chip: Generic GPIO chip to use. + * @reg: Register to write to. + * @val: New value to write. + */ +static inline void gpio_generic_write_reg(struct gpio_generic_chip *chip, + void __iomem *reg, unsigned long val) +{ + if (WARN_ON(!chip->gc.write_reg)) + return; + + chip->gc.write_reg(reg, val); +} + #define gpio_generic_chip_lock(gen_gc) \ raw_spin_lock(&(gen_gc)->gc.bgpio_lock) From 13ba232ed8455a5decb187d509a0d326fd326b19 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:44 +0200 Subject: [PATCH 0481/3206] gpio: hisi: use the BGPIOF_UNREADABLE_REG_DIR flag There's no reason for this driver to touch the gpio-mmio internals; we have a dedicated flag passed to bgpio_init() indicating to the module that the DIR register is unreadable. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-3-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-hisi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-hisi.c b/drivers/gpio/gpio-hisi.c index ef5cc654a24e23..6016e6f0ed0fb8 100644 --- a/drivers/gpio/gpio-hisi.c +++ b/drivers/gpio/gpio-hisi.c @@ -295,7 +295,7 @@ static int hisi_gpio_probe(struct platform_device *pdev) hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_CLR_WX, hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_SET_WX, hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_CLR_WX, - BGPIOF_NO_SET_ON_INPUT); + BGPIOF_NO_SET_ON_INPUT | BGPIOF_UNREADABLE_REG_DIR); if (ret) { dev_err(dev, "failed to init, ret = %d\n", ret); return ret; @@ -303,7 +303,6 @@ static int hisi_gpio_probe(struct platform_device *pdev) hisi_gpio->chip.set_config = hisi_gpio_set_config; hisi_gpio->chip.ngpio = hisi_gpio->line_num; - hisi_gpio->chip.bgpio_dir_unreadable = 1; hisi_gpio->chip.base = -1; if (hisi_gpio->irq > 0) From d6307707d50b468d614a80daf60ead8b7036f156 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:45 +0200 Subject: [PATCH 0482/3206] gpio: ts4800: remove the unnecessary call to platform_set_drvdata() There's no corresponding call to platform_get_drvdata() or dev_get_drvdata(). Remove the call to platform_set_drvdata() from .probe().
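Callers then perform register accesses through the wrappers instead of dereferencing the callbacks, typically under the chip's own lock (a hedged sketch modelled on the dwapb conversion later in this series; my_set_bit() is made up):

	static void my_set_bit(struct gpio_generic_chip *chip,
			       void __iomem *reg, unsigned int bit)
	{
		unsigned long val;

		/* Serialize the read-modify-write against other accesses. */
		guard(gpio_generic_lock_irqsave)(chip);

		val = gpio_generic_read_reg(chip, reg);
		gpio_generic_write_reg(chip, reg, val | BIT(bit));
	}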
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-4-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ts4800.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpio/gpio-ts4800.c b/drivers/gpio/gpio-ts4800.c index 4748e3d47106cd..86f7947ca9b2d2 100644 --- a/drivers/gpio/gpio-ts4800.c +++ b/drivers/gpio/gpio-ts4800.c @@ -51,8 +51,6 @@ static int ts4800_gpio_probe(struct platform_device *pdev) chip->ngpio = ngpios; - platform_set_drvdata(pdev, chip); - return devm_gpiochip_add_data(&pdev->dev, chip, NULL); } From 8a8e9a1a9272f262699960ca2782de87ea69dd32 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:46 +0200 Subject: [PATCH 0483/3206] gpio: ts4800: use generic device properties Avoid pulling in linux/of.h by using the generic device properties. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-5-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ts4800.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/gpio/gpio-ts4800.c b/drivers/gpio/gpio-ts4800.c index 86f7947ca9b2d2..f4ae87325393c9 100644 --- a/drivers/gpio/gpio-ts4800.c +++ b/drivers/gpio/gpio-ts4800.c @@ -7,8 +7,8 @@ #include #include -#include #include +#include #define DEFAULT_PIN_NUMBER 16 #define INPUT_REG_OFFSET 0x00 @@ -17,7 +17,7 @@ static int ts4800_gpio_probe(struct platform_device *pdev) { - struct device_node *node; + struct device *dev = &pdev->dev; struct gpio_chip *chip; void __iomem *base_addr; int retval; @@ -31,11 +31,7 @@ static int ts4800_gpio_probe(struct platform_device *pdev) if (IS_ERR(base_addr)) return PTR_ERR(base_addr); - node = pdev->dev.of_node; - if (!node) - return -EINVAL; - - retval = of_property_read_u32(node, "ngpios", &ngpios); + retval = device_property_read_u32(dev, "ngpios", &ngpios); if (retval == -EINVAL) ngpios = DEFAULT_PIN_NUMBER; else if (retval) From ac1eca3ab9fc4d17b59da12597c671df7739f934 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:47 +0200 Subject: [PATCH 0484/3206] gpio: ts4800: use dev_err_probe() Use dev_err_probe() where applicable. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-6-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ts4800.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-ts4800.c b/drivers/gpio/gpio-ts4800.c index f4ae87325393c9..cb3eeeb1e9df9a 100644 --- a/drivers/gpio/gpio-ts4800.c +++ b/drivers/gpio/gpio-ts4800.c @@ -40,10 +40,8 @@ static int ts4800_gpio_probe(struct platform_device *pdev) retval = bgpio_init(chip, &pdev->dev, 2, base_addr + INPUT_REG_OFFSET, base_addr + OUTPUT_REG_OFFSET, NULL, base_addr + DIRECTION_REG_OFFSET, NULL, 0); - if (retval) { - dev_err(&pdev->dev, "bgpio_init failed\n"); - return retval; - } + if (retval) + return dev_err_probe(dev, retval, "bgpio_init failed\n"); chip->ngpio = ngpios; From 9215a4fb59425588233d3362e886dc156c1739af Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:48 +0200 Subject: [PATCH 0485/3206] gpio: ts4800: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-7-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ts4800.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/gpio/gpio-ts4800.c b/drivers/gpio/gpio-ts4800.c index cb3eeeb1e9df9a..844347945e8e71 100644 --- a/drivers/gpio/gpio-ts4800.c +++ b/drivers/gpio/gpio-ts4800.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -17,13 +18,14 @@ static int ts4800_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; - struct gpio_chip *chip; + struct gpio_generic_chip *chip; void __iomem *base_addr; int retval; u32 ngpios; - chip = devm_kzalloc(&pdev->dev, sizeof(struct gpio_chip), GFP_KERNEL); + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; @@ -37,15 +39,22 @@ static int ts4800_gpio_probe(struct platform_device *pdev) else if (retval) return retval; - retval = bgpio_init(chip, &pdev->dev, 2, base_addr + INPUT_REG_OFFSET, - base_addr + OUTPUT_REG_OFFSET, NULL, - base_addr + DIRECTION_REG_OFFSET, NULL, 0); + config = (typeof(config)){ + .dev = dev, + .sz = 2, + .dat = base_addr + INPUT_REG_OFFSET, + .set = base_addr + OUTPUT_REG_OFFSET, + .dirout = base_addr + DIRECTION_REG_OFFSET, + }; + + retval = gpio_generic_chip_init(chip, &config); if (retval) - return dev_err_probe(dev, retval, "bgpio_init failed\n"); + return dev_err_probe(dev, retval, + "failed to initialize the generic GPIO chip\n"); - chip->ngpio = ngpios; + chip->gc.ngpio = ngpios; - return devm_gpiochip_add_data(&pdev->dev, chip, NULL); + return devm_gpiochip_add_data(dev, &chip->gc, NULL); } static const struct of_device_id ts4800_gpio_of_match[] = { From 4ba2193ce0b94c28ec2095a1bb79353c82214d35 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:49 +0200 Subject: [PATCH 0486/3206] gpio: loongson-64bit: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-8-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-loongson-64bit.c | 42 +++++++++++++++++------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/drivers/gpio/gpio-loongson-64bit.c b/drivers/gpio/gpio-loongson-64bit.c index 818c606fbc5149..482e64ba9b4209 100644 --- a/drivers/gpio/gpio-loongson-64bit.c +++ b/drivers/gpio/gpio-loongson-64bit.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -30,7 +31,7 @@ struct loongson_gpio_chip_data { }; struct loongson_gpio_chip { - struct gpio_chip chip; + struct gpio_generic_chip chip; spinlock_t lock; void __iomem *reg_base; const struct loongson_gpio_chip_data *chip_data; @@ -38,7 +39,8 @@ struct loongson_gpio_chip { static inline struct loongson_gpio_chip *to_loongson_gpio_chip(struct gpio_chip *chip) { - return container_of(chip, struct loongson_gpio_chip, chip); + return container_of(to_gpio_generic_chip(chip), + struct loongson_gpio_chip, chip); } static inline void loongson_commit_direction(struct loongson_gpio_chip *lgpio, unsigned int pin, @@ -138,36 +140,40 @@ static int loongson_gpio_to_irq(struct gpio_chip *chip, unsigned int offset) static int loongson_gpio_init(struct device *dev, struct loongson_gpio_chip *lgpio, void __iomem *reg_base) { + struct gpio_generic_chip_config config; int ret; lgpio->reg_base = reg_base; if (lgpio->chip_data->mode == BIT_CTRL_MODE) { - ret = bgpio_init(&lgpio->chip, dev, 8, - lgpio->reg_base + lgpio->chip_data->in_offset, - lgpio->reg_base + lgpio->chip_data->out_offset, - NULL, NULL, - lgpio->reg_base + lgpio->chip_data->conf_offset, - 0); + config = (typeof(config)){ + .dev = dev, + .sz = 8, + .dat = lgpio->reg_base + lgpio->chip_data->in_offset, + .set = lgpio->reg_base + lgpio->chip_data->out_offset, + .dirin = lgpio->reg_base + lgpio->chip_data->conf_offset, + }; + + ret = gpio_generic_chip_init(&lgpio->chip, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; } } else { - lgpio->chip.direction_input = loongson_gpio_direction_input; - lgpio->chip.get = loongson_gpio_get; - lgpio->chip.get_direction = loongson_gpio_get_direction; - lgpio->chip.direction_output = loongson_gpio_direction_output; - lgpio->chip.set = loongson_gpio_set; - lgpio->chip.parent = dev; + lgpio->chip.gc.direction_input = loongson_gpio_direction_input; + lgpio->chip.gc.get = loongson_gpio_get; + lgpio->chip.gc.get_direction = loongson_gpio_get_direction; + lgpio->chip.gc.direction_output = loongson_gpio_direction_output; + lgpio->chip.gc.set = loongson_gpio_set; + lgpio->chip.gc.parent = dev; spin_lock_init(&lgpio->lock); } - lgpio->chip.label = lgpio->chip_data->label; - lgpio->chip.can_sleep = false; + lgpio->chip.gc.label = lgpio->chip_data->label; + lgpio->chip.gc.can_sleep = false; if (lgpio->chip_data->inten_offset) - lgpio->chip.to_irq = loongson_gpio_to_irq; + lgpio->chip.gc.to_irq = loongson_gpio_to_irq; - return devm_gpiochip_add_data(dev, &lgpio->chip, lgpio); + return devm_gpiochip_add_data(dev, &lgpio->chip.gc, lgpio); } static int loongson_gpio_probe(struct platform_device *pdev) From 84bebb7e7ed000a2c4786094698536a3a3f67083 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:50 +0200 Subject: [PATCH 0487/3206] gpio: dwapb: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-9-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-dwapb.c | 162 ++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 75 deletions(-) diff --git a/drivers/gpio/gpio-dwapb.c b/drivers/gpio/gpio-dwapb.c index 43b667b41f5dce..0fb781ca9da295 100644 --- a/drivers/gpio/gpio-dwapb.c +++ b/drivers/gpio/gpio-dwapb.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -99,7 +100,7 @@ struct dwapb_gpio_port_irqchip { }; struct dwapb_gpio_port { - struct gpio_chip gc; + struct gpio_generic_chip chip; struct dwapb_gpio_port_irqchip *pirq; struct dwapb_gpio *gpio; #ifdef CONFIG_PM_SLEEP @@ -107,8 +108,12 @@ struct dwapb_gpio_port { #endif unsigned int idx; }; -#define to_dwapb_gpio(_gc) \ - (container_of(_gc, struct dwapb_gpio_port, gc)->gpio) + +static inline struct dwapb_gpio *to_dwapb_gpio(struct gpio_chip *gc) +{ + return container_of(to_gpio_generic_chip(gc), + struct dwapb_gpio_port, chip)->gpio; +} struct dwapb_gpio { struct device *dev; @@ -148,19 +153,19 @@ static inline u32 gpio_reg_convert(struct dwapb_gpio *gpio, unsigned int offset) static inline u32 dwapb_read(struct dwapb_gpio *gpio, unsigned int offset) { - struct gpio_chip *gc = &gpio->ports[0].gc; - void __iomem *reg_base = gpio->regs; + struct gpio_generic_chip *chip = &gpio->ports[0].chip; + void __iomem *reg_base = gpio->regs; - return gc->read_reg(reg_base + gpio_reg_convert(gpio, offset)); + return gpio_generic_read_reg(chip, reg_base + gpio_reg_convert(gpio, offset)); } static inline void dwapb_write(struct dwapb_gpio *gpio, unsigned int offset, u32 val) { - struct gpio_chip *gc = &gpio->ports[0].gc; - void __iomem *reg_base = gpio->regs; + struct gpio_generic_chip *chip = &gpio->ports[0].chip; + void __iomem *reg_base = gpio->regs; - gc->write_reg(reg_base + gpio_reg_convert(gpio, offset), val); + gpio_generic_write_reg(chip, reg_base + gpio_reg_convert(gpio, offset), val); } static struct dwapb_gpio_port *dwapb_offs_to_port(struct dwapb_gpio *gpio, unsigned int offs) @@ -186,7 +191,7 @@ static void dwapb_toggle_trigger(struct dwapb_gpio *gpio, unsigned int offs) if (!port) return; - gc = &port->gc; + gc = &port->chip.gc; pol = dwapb_read(gpio, GPIO_INT_POLARITY); /* Just read the current value right out of the data register */ @@ -201,13 +206,13 @@ static void dwapb_toggle_trigger(struct dwapb_gpio *gpio, unsigned int offs) static u32 dwapb_do_irq(struct dwapb_gpio *gpio) { - struct gpio_chip *gc = &gpio->ports[0].gc; + struct gpio_generic_chip *gen_gc = &gpio->ports[0].chip; unsigned long irq_status; irq_hw_number_t hwirq; irq_status = dwapb_read(gpio, GPIO_INTSTATUS); for_each_set_bit(hwirq, &irq_status, DWAPB_MAX_GPIOS) { - int gpio_irq = irq_find_mapping(gc->irq.domain, hwirq); + int gpio_irq = irq_find_mapping(gen_gc->gc.irq.domain, hwirq); u32 irq_type = irq_get_trigger_type(gpio_irq); generic_handle_irq(gpio_irq); @@ -237,27 +242,27 @@ static irqreturn_t dwapb_irq_handler_mfd(int irq, void *dev_id) static void dwapb_irq_ack(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); u32 val = BIT(irqd_to_hwirq(d)); - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); + dwapb_write(gpio, GPIO_PORTA_EOI, val); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } 
static void dwapb_irq_mask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); irq_hw_number_t hwirq = irqd_to_hwirq(d); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - val = dwapb_read(gpio, GPIO_INTMASK) | BIT(hwirq); - dwapb_write(gpio, GPIO_INTMASK, val); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + scoped_guard(gpio_generic_lock_irqsave, gen_gc) { + val = dwapb_read(gpio, GPIO_INTMASK) | BIT(hwirq); + dwapb_write(gpio, GPIO_INTMASK, val); + } gpiochip_disable_irq(gc, hwirq); } @@ -265,59 +270,61 @@ static void dwapb_irq_mask(struct irq_data *d) static void dwapb_irq_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); irq_hw_number_t hwirq = irqd_to_hwirq(d); - unsigned long flags; u32 val; gpiochip_enable_irq(gc, hwirq); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); + val = dwapb_read(gpio, GPIO_INTMASK) & ~BIT(hwirq); dwapb_write(gpio, GPIO_INTMASK, val); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static void dwapb_irq_enable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); irq_hw_number_t hwirq = irqd_to_hwirq(d); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); + val = dwapb_read(gpio, GPIO_INTEN) | BIT(hwirq); dwapb_write(gpio, GPIO_INTEN, val); val = dwapb_read(gpio, GPIO_INTMASK) & ~BIT(hwirq); dwapb_write(gpio, GPIO_INTMASK, val); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static void dwapb_irq_disable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); irq_hw_number_t hwirq = irqd_to_hwirq(d); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); + val = dwapb_read(gpio, GPIO_INTMASK) | BIT(hwirq); dwapb_write(gpio, GPIO_INTMASK, val); val = dwapb_read(gpio, GPIO_INTEN) & ~BIT(hwirq); dwapb_write(gpio, GPIO_INTEN, val); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static int dwapb_irq_set_type(struct irq_data *d, u32 type) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); irq_hw_number_t bit = irqd_to_hwirq(d); - unsigned long level, polarity, flags; + unsigned long level, polarity; + + guard(gpio_generic_lock_irqsave)(gen_gc); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); level = dwapb_read(gpio, GPIO_INTTYPE_LEVEL); polarity = dwapb_read(gpio, GPIO_INT_POLARITY); @@ -352,7 +359,6 @@ static int dwapb_irq_set_type(struct irq_data *d, u32 type) dwapb_write(gpio, GPIO_INTTYPE_LEVEL, level); if (type != IRQ_TYPE_EDGE_BOTH) dwapb_write(gpio, GPIO_INT_POLARITY, polarity); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); return 0; } @@ -393,11 +399,12 @@ static int dwapb_gpio_set_debounce(struct gpio_chip *gc, unsigned offset, unsigned debounce) { struct dwapb_gpio_port *port = gpiochip_get_data(gc); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); 
struct dwapb_gpio *gpio = port->gpio; - unsigned long flags, val_deb; + unsigned long val_deb; unsigned long mask = BIT(offset); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); val_deb = dwapb_read(gpio, GPIO_PORTA_DEBOUNCE); if (debounce) @@ -406,8 +413,6 @@ static int dwapb_gpio_set_debounce(struct gpio_chip *gc, val_deb &= ~mask; dwapb_write(gpio, GPIO_PORTA_DEBOUNCE, val_deb); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); - return 0; } @@ -445,7 +450,7 @@ static void dwapb_configure_irqs(struct dwapb_gpio *gpio, struct dwapb_port_property *pp) { struct dwapb_gpio_port_irqchip *pirq; - struct gpio_chip *gc = &port->gc; + struct gpio_chip *gc = &port->chip.gc; struct gpio_irq_chip *girq; int err; @@ -501,6 +506,7 @@ static int dwapb_gpio_add_port(struct dwapb_gpio *gpio, struct dwapb_port_property *pp, unsigned int offs) { + struct gpio_generic_chip_config config; struct dwapb_gpio_port *port; void __iomem *dat, *set, *dirout; int err; @@ -519,32 +525,39 @@ static int dwapb_gpio_add_port(struct dwapb_gpio *gpio, set = gpio->regs + GPIO_SWPORTA_DR + pp->idx * GPIO_SWPORT_DR_STRIDE; dirout = gpio->regs + GPIO_SWPORTA_DDR + pp->idx * GPIO_SWPORT_DDR_STRIDE; + config = (typeof(config)){ + .dev = gpio->dev, + .sz = 4, + .dat = dat, + .set = set, + .dirout = dirout, + }; + /* This registers 32 GPIO lines per port */ - err = bgpio_init(&port->gc, gpio->dev, 4, dat, set, NULL, dirout, - NULL, 0); + err = gpio_generic_chip_init(&port->chip, &config); if (err) { dev_err(gpio->dev, "failed to init gpio chip for port%d\n", port->idx); return err; } - port->gc.fwnode = pp->fwnode; - port->gc.ngpio = pp->ngpio; - port->gc.base = pp->gpio_base; - port->gc.request = gpiochip_generic_request; - port->gc.free = gpiochip_generic_free; + port->chip.gc.fwnode = pp->fwnode; + port->chip.gc.ngpio = pp->ngpio; + port->chip.gc.base = pp->gpio_base; + port->chip.gc.request = gpiochip_generic_request; + port->chip.gc.free = gpiochip_generic_free; /* Only port A support debounce */ if (pp->idx == 0) - port->gc.set_config = dwapb_gpio_set_config; + port->chip.gc.set_config = dwapb_gpio_set_config; else - port->gc.set_config = gpiochip_generic_config; + port->chip.gc.set_config = gpiochip_generic_config; /* Only port A can provide interrupts in all configurations of the IP */ if (pp->idx == 0) dwapb_configure_irqs(gpio, port, pp); - err = devm_gpiochip_add_data(gpio->dev, &port->gc, port); + err = devm_gpiochip_add_data(gpio->dev, &port->chip.gc, port); if (err) { dev_err(gpio->dev, "failed to register gpiochip for port%d\n", port->idx); @@ -750,38 +763,37 @@ static int dwapb_gpio_probe(struct platform_device *pdev) static int dwapb_gpio_suspend(struct device *dev) { struct dwapb_gpio *gpio = dev_get_drvdata(dev); - struct gpio_chip *gc = &gpio->ports[0].gc; - unsigned long flags; + struct gpio_generic_chip *gen_gc = &gpio->ports[0].chip; int i; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - for (i = 0; i < gpio->nr_ports; i++) { - unsigned int offset; - unsigned int idx = gpio->ports[i].idx; - struct dwapb_context *ctx = gpio->ports[i].ctx; + scoped_guard(gpio_generic_lock_irqsave, gen_gc) { + for (i = 0; i < gpio->nr_ports; i++) { + unsigned int offset; + unsigned int idx = gpio->ports[i].idx; + struct dwapb_context *ctx = gpio->ports[i].ctx; - offset = GPIO_SWPORTA_DDR + idx * GPIO_SWPORT_DDR_STRIDE; - ctx->dir = dwapb_read(gpio, offset); + offset = GPIO_SWPORTA_DDR + idx * GPIO_SWPORT_DDR_STRIDE; + ctx->dir = dwapb_read(gpio, offset); - offset = 
GPIO_SWPORTA_DR + idx * GPIO_SWPORT_DR_STRIDE; - ctx->data = dwapb_read(gpio, offset); + offset = GPIO_SWPORTA_DR + idx * GPIO_SWPORT_DR_STRIDE; + ctx->data = dwapb_read(gpio, offset); - offset = GPIO_EXT_PORTA + idx * GPIO_EXT_PORT_STRIDE; - ctx->ext = dwapb_read(gpio, offset); + offset = GPIO_EXT_PORTA + idx * GPIO_EXT_PORT_STRIDE; + ctx->ext = dwapb_read(gpio, offset); - /* Only port A can provide interrupts */ - if (idx == 0) { - ctx->int_mask = dwapb_read(gpio, GPIO_INTMASK); - ctx->int_en = dwapb_read(gpio, GPIO_INTEN); - ctx->int_pol = dwapb_read(gpio, GPIO_INT_POLARITY); - ctx->int_type = dwapb_read(gpio, GPIO_INTTYPE_LEVEL); - ctx->int_deb = dwapb_read(gpio, GPIO_PORTA_DEBOUNCE); - - /* Mask out interrupts */ - dwapb_write(gpio, GPIO_INTMASK, ~ctx->wake_en); + /* Only port A can provide interrupts */ + if (idx == 0) { + ctx->int_mask = dwapb_read(gpio, GPIO_INTMASK); + ctx->int_en = dwapb_read(gpio, GPIO_INTEN); + ctx->int_pol = dwapb_read(gpio, GPIO_INT_POLARITY); + ctx->int_type = dwapb_read(gpio, GPIO_INTTYPE_LEVEL); + ctx->int_deb = dwapb_read(gpio, GPIO_PORTA_DEBOUNCE); + + /* Mask out interrupts */ + dwapb_write(gpio, GPIO_INTMASK, ~ctx->wake_en); + } } } - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); clk_bulk_disable_unprepare(DWAPB_NR_CLOCKS, gpio->clks); @@ -791,8 +803,8 @@ static int dwapb_gpio_suspend(struct device *dev) static int dwapb_gpio_resume(struct device *dev) { struct dwapb_gpio *gpio = dev_get_drvdata(dev); - struct gpio_chip *gc = &gpio->ports[0].gc; - unsigned long flags; + struct gpio_chip *gc = &gpio->ports[0].chip.gc; + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); int i, err; err = clk_bulk_prepare_enable(DWAPB_NR_CLOCKS, gpio->clks); @@ -801,7 +813,8 @@ static int dwapb_gpio_resume(struct device *dev) return err; } - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); + for (i = 0; i < gpio->nr_ports; i++) { unsigned int offset; unsigned int idx = gpio->ports[i].idx; @@ -828,7 +841,6 @@ static int dwapb_gpio_resume(struct device *dev) dwapb_write(gpio, GPIO_PORTA_EOI, 0xffffffff); } } - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); return 0; } From 728e0ca4e196d65e00775ea3f7a49ce008fbd3a7 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:51 +0200 Subject: [PATCH 0488/3206] gpio: amdpt: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
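The conversions above recover the driver structure from a struct gpio_chip pointer by going through the embedded generic chip. A minimal sketch of that container pattern with a hypothetical foo driver (to_gpio_generic_chip() and the embedding follow the loongson and dwapb diffs):

struct foo_gpio {
	struct gpio_generic_chip chip;	/* embeds struct gpio_chip as chip.gc */
	void __iomem *regs;
};

static inline struct foo_gpio *to_foo_gpio(struct gpio_chip *gc)
{
	return container_of(to_gpio_generic_chip(gc), struct foo_gpio, chip);
}

Drivers that registered with devm_gpiochip_add_data() can keep using gpiochip_get_data() instead, as the amdpt patch below does.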
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-10-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-amdpt.c | 44 ++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/gpio/gpio-amdpt.c b/drivers/gpio/gpio-amdpt.c index b70036587d9c3f..0a9b870705b90b 100644 --- a/drivers/gpio/gpio-amdpt.c +++ b/drivers/gpio/gpio-amdpt.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -24,54 +25,50 @@ #define PT_SYNC_REG 0x28 struct pt_gpio_chip { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *reg_base; }; static int pt_gpio_request(struct gpio_chip *gc, unsigned offset) { + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct pt_gpio_chip *pt_gpio = gpiochip_get_data(gc); - unsigned long flags; u32 using_pins; dev_dbg(gc->parent, "pt_gpio_request offset=%x\n", offset); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); using_pins = readl(pt_gpio->reg_base + PT_SYNC_REG); if (using_pins & BIT(offset)) { dev_warn(gc->parent, "PT GPIO pin %x reconfigured\n", offset); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); return -EINVAL; } writel(using_pins | BIT(offset), pt_gpio->reg_base + PT_SYNC_REG); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); - return 0; } static void pt_gpio_free(struct gpio_chip *gc, unsigned offset) { + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct pt_gpio_chip *pt_gpio = gpiochip_get_data(gc); - unsigned long flags; u32 using_pins; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); using_pins = readl(pt_gpio->reg_base + PT_SYNC_REG); using_pins &= ~BIT(offset); writel(using_pins, pt_gpio->reg_base + PT_SYNC_REG); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); - dev_dbg(gc->parent, "pt_gpio_free offset=%x\n", offset); } static int pt_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct pt_gpio_chip *pt_gpio; int ret = 0; @@ -91,22 +88,27 @@ static int pt_gpio_probe(struct platform_device *pdev) return PTR_ERR(pt_gpio->reg_base); } - ret = bgpio_init(&pt_gpio->gc, dev, 4, - pt_gpio->reg_base + PT_INPUTDATA_REG, - pt_gpio->reg_base + PT_OUTPUTDATA_REG, NULL, - pt_gpio->reg_base + PT_DIRECTION_REG, NULL, - BGPIOF_READ_OUTPUT_REG_SET); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = pt_gpio->reg_base + PT_INPUTDATA_REG, + .set = pt_gpio->reg_base + PT_OUTPUTDATA_REG, + .dirout = pt_gpio->reg_base + PT_DIRECTION_REG, + .flags = BGPIOF_READ_OUTPUT_REG_SET, + }; + + ret = gpio_generic_chip_init(&pt_gpio->chip, &config); if (ret) { - dev_err(dev, "bgpio_init failed\n"); + dev_err(dev, "failed to initialize the generic GPIO chip\n"); return ret; } - pt_gpio->gc.owner = THIS_MODULE; - pt_gpio->gc.request = pt_gpio_request; - pt_gpio->gc.free = pt_gpio_free; - pt_gpio->gc.ngpio = (uintptr_t)device_get_match_data(dev); + pt_gpio->chip.gc.owner = THIS_MODULE; + pt_gpio->chip.gc.request = pt_gpio_request; + pt_gpio->chip.gc.free = pt_gpio_free; + pt_gpio->chip.gc.ngpio = (uintptr_t)device_get_match_data(dev); - ret = devm_gpiochip_add_data(dev, &pt_gpio->gc, pt_gpio); + ret = devm_gpiochip_add_data(dev, &pt_gpio->chip.gc, pt_gpio); if (ret) { dev_err(dev, "Failed to register GPIO lib\n"); return ret; From ebd63ab0f20f4e1960191da6989797ac78fedc4c Mon Sep 17 00:00:00 2001 From: Bartosz 
Golaszewski Date: Mon, 25 Aug 2025 11:48:52 +0200 Subject: [PATCH 0489/3206] gpio: rda: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-11-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-rda.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/gpio/gpio-rda.c b/drivers/gpio/gpio-rda.c index cb2f63eee2aa10..bcd85a2237a532 100644 --- a/drivers/gpio/gpio-rda.c +++ b/drivers/gpio/gpio-rda.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -35,7 +36,7 @@ #define RDA_GPIO_BANK_NR 32 struct rda_gpio { - struct gpio_chip chip; + struct gpio_generic_chip chip; void __iomem *base; spinlock_t lock; int irq; @@ -208,6 +209,7 @@ static const struct irq_chip rda_gpio_irq_chip = { static int rda_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct gpio_irq_chip *girq; struct rda_gpio *rda_gpio; @@ -235,24 +237,29 @@ static int rda_gpio_probe(struct platform_device *pdev) spin_lock_init(&rda_gpio->lock); - ret = bgpio_init(&rda_gpio->chip, dev, 4, - rda_gpio->base + RDA_GPIO_VAL, - rda_gpio->base + RDA_GPIO_SET, - rda_gpio->base + RDA_GPIO_CLR, - rda_gpio->base + RDA_GPIO_OEN_SET_OUT, - rda_gpio->base + RDA_GPIO_OEN_SET_IN, - BGPIOF_READ_OUTPUT_REG_SET); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = rda_gpio->base + RDA_GPIO_VAL, + .set = rda_gpio->base + RDA_GPIO_SET, + .clr = rda_gpio->base + RDA_GPIO_CLR, + .dirout = rda_gpio->base + RDA_GPIO_OEN_SET_OUT, + .dirin = rda_gpio->base + RDA_GPIO_OEN_SET_IN, + .flags = BGPIOF_READ_OUTPUT_REG_SET, + }; + + ret = gpio_generic_chip_init(&rda_gpio->chip, &config); if (ret) { - dev_err(dev, "bgpio_init failed\n"); + dev_err(dev, "failed to initialize the generic GPIO chip\n"); return ret; } - rda_gpio->chip.label = dev_name(dev); - rda_gpio->chip.ngpio = ngpios; - rda_gpio->chip.base = -1; + rda_gpio->chip.gc.label = dev_name(dev); + rda_gpio->chip.gc.ngpio = ngpios; + rda_gpio->chip.gc.base = -1; if (rda_gpio->irq >= 0) { - girq = &rda_gpio->chip.irq; + girq = &rda_gpio->chip.gc.irq; gpio_irq_chip_set_chip(girq, &rda_gpio_irq_chip); girq->handler = handle_bad_irq; girq->default_type = IRQ_TYPE_NONE; @@ -269,7 +276,7 @@ static int rda_gpio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rda_gpio); - return devm_gpiochip_add_data(dev, &rda_gpio->chip, rda_gpio); + return devm_gpiochip_add_data(dev, &rda_gpio->chip.gc, rda_gpio); } static const struct of_device_id rda_gpio_of_match[] = { From 67e4be48f409ba70738eef3c195a81455f526f83 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:53 +0200 Subject: [PATCH 0490/3206] gpio: grgpio: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
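The grgpio diff below also shows the accessor side of the new API: code that used to call the read_reg()/write_reg() hooks stashed in struct gpio_chip now goes through helpers that take the generic chip. A fragmentary sketch, where reg and chip stand in for a real register address and a struct gpio_generic_chip pointer:

/* before: method pointers on struct gpio_chip */
u32 val = gc->read_reg(reg);
gc->write_reg(reg, val | BIT(0));

/* after: helpers operating on struct gpio_generic_chip */
u32 val = gpio_generic_read_reg(chip, reg);
gpio_generic_write_reg(chip, reg, val | BIT(0));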
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-12-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-grgpio.c | 87 +++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/drivers/gpio/gpio-grgpio.c b/drivers/gpio/gpio-grgpio.c index f3f8bab62f94ce..3b77fad00749cd 100644 --- a/drivers/gpio/gpio-grgpio.c +++ b/drivers/gpio/gpio-grgpio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -59,7 +60,7 @@ struct grgpio_lirq { }; struct grgpio_priv { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *regs; struct device *dev; @@ -91,13 +92,12 @@ struct grgpio_priv { static void grgpio_set_imask(struct grgpio_priv *priv, unsigned int offset, int val) { - struct gpio_chip *gc = &priv->gc; - if (val) priv->imask |= BIT(offset); else priv->imask &= ~BIT(offset); - gc->write_reg(priv->regs + GRGPIO_IMASK, priv->imask); + + gpio_generic_write_reg(&priv->chip, priv->regs + GRGPIO_IMASK, priv->imask); } static int grgpio_to_irq(struct gpio_chip *gc, unsigned offset) @@ -118,7 +118,6 @@ static int grgpio_to_irq(struct gpio_chip *gc, unsigned offset) static int grgpio_irq_set_type(struct irq_data *d, unsigned int type) { struct grgpio_priv *priv = irq_data_get_irq_chip_data(d); - unsigned long flags; u32 mask = BIT(d->hwirq); u32 ipol; u32 iedge; @@ -146,15 +145,13 @@ static int grgpio_irq_set_type(struct irq_data *d, unsigned int type) return -EINVAL; } - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&priv->chip); - ipol = priv->gc.read_reg(priv->regs + GRGPIO_IPOL) & ~mask; - iedge = priv->gc.read_reg(priv->regs + GRGPIO_IEDGE) & ~mask; + ipol = gpio_generic_read_reg(&priv->chip, priv->regs + GRGPIO_IPOL) & ~mask; + iedge = gpio_generic_read_reg(&priv->chip, priv->regs + GRGPIO_IEDGE) & ~mask; - priv->gc.write_reg(priv->regs + GRGPIO_IPOL, ipol | pol); - priv->gc.write_reg(priv->regs + GRGPIO_IEDGE, iedge | edge); - - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + gpio_generic_write_reg(&priv->chip, priv->regs + GRGPIO_IPOL, ipol | pol); + gpio_generic_write_reg(&priv->chip, priv->regs + GRGPIO_IEDGE, iedge | edge); return 0; } @@ -163,29 +160,23 @@ static void grgpio_irq_mask(struct irq_data *d) { struct grgpio_priv *priv = irq_data_get_irq_chip_data(d); int offset = d->hwirq; - unsigned long flags; - - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); - grgpio_set_imask(priv, offset, 0); + scoped_guard(gpio_generic_lock_irqsave, &priv->chip) + grgpio_set_imask(priv, offset, 0); - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); - - gpiochip_disable_irq(&priv->gc, d->hwirq); + gpiochip_disable_irq(&priv->chip.gc, d->hwirq); } static void grgpio_irq_unmask(struct irq_data *d) { struct grgpio_priv *priv = irq_data_get_irq_chip_data(d); int offset = d->hwirq; - unsigned long flags; - gpiochip_enable_irq(&priv->gc, d->hwirq); - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + gpiochip_enable_irq(&priv->chip.gc, d->hwirq); - grgpio_set_imask(priv, offset, 1); + guard(gpio_generic_lock_irqsave)(&priv->chip); - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + grgpio_set_imask(priv, offset, 1); } static const struct irq_chip grgpio_irq_chip = { @@ -200,12 +191,11 @@ static const struct irq_chip grgpio_irq_chip = { static irqreturn_t grgpio_irq_handler(int irq, void *dev) { struct grgpio_priv *priv = dev; - int ngpio = priv->gc.ngpio; - unsigned long flags; + int 
ngpio = priv->chip.gc.ngpio; int i; int match = 0; - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&priv->chip); /* * For each gpio line, call its interrupt handler if it its underlying @@ -221,8 +211,6 @@ static irqreturn_t grgpio_irq_handler(int irq, void *dev) } } - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); - if (!match) dev_warn(priv->dev, "No gpio line matched irq %d\n", irq); @@ -253,13 +241,18 @@ static int grgpio_irq_map(struct irq_domain *d, unsigned int irq, dev_dbg(priv->dev, "Mapping irq %d for gpio line %d\n", irq, offset); - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_lock_irqsave(&priv->chip, flags); /* Request underlying irq if not already requested */ lirq->irq = irq; uirq = &priv->uirqs[lirq->index]; if (uirq->refcnt == 0) { - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + /* + * FIXME: This is not how locking works at all, you can't just + * release the lock for a moment to do something that can't + * sleep... + */ + gpio_generic_chip_unlock_irqrestore(&priv->chip, flags); ret = request_irq(uirq->uirq, grgpio_irq_handler, 0, dev_name(priv->dev), priv); if (ret) { @@ -268,11 +261,11 @@ static int grgpio_irq_map(struct irq_domain *d, unsigned int irq, uirq->uirq); return ret; } - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_lock_irqsave(&priv->chip, flags); } uirq->refcnt++; - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_unlock_irqrestore(&priv->chip, flags); /* Setup irq */ irq_set_chip_data(irq, priv); @@ -290,13 +283,13 @@ static void grgpio_irq_unmap(struct irq_domain *d, unsigned int irq) struct grgpio_lirq *lirq; struct grgpio_uirq *uirq; unsigned long flags; - int ngpio = priv->gc.ngpio; + int ngpio = priv->chip.gc.ngpio; int i; irq_set_chip_and_handler(irq, NULL, NULL); irq_set_chip_data(irq, NULL); - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_lock_irqsave(&priv->chip, flags); /* Free underlying irq if last user unmapped */ index = -1; @@ -315,13 +308,13 @@ static void grgpio_irq_unmap(struct irq_domain *d, unsigned int irq) uirq = &priv->uirqs[lirq->index]; uirq->refcnt--; if (uirq->refcnt == 0) { - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_unlock_irqrestore(&priv->chip, flags); free_irq(uirq->uirq, priv); return; } } - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_unlock_irqrestore(&priv->chip, flags); } static void grgpio_irq_domain_remove(void *data) @@ -341,6 +334,7 @@ static const struct irq_domain_ops grgpio_irq_domain_ops = { static int grgpio_probe(struct platform_device *ofdev) { struct device_node *np = ofdev->dev.of_node; + struct gpio_generic_chip_config config; struct device *dev = &ofdev->dev; void __iomem *regs; struct gpio_chip *gc; @@ -359,17 +353,24 @@ static int grgpio_probe(struct platform_device *ofdev) if (IS_ERR(regs)) return PTR_ERR(regs); - gc = &priv->gc; - err = bgpio_init(gc, dev, 4, regs + GRGPIO_DATA, - regs + GRGPIO_OUTPUT, NULL, regs + GRGPIO_DIR, NULL, - BGPIOF_BIG_ENDIAN_BYTE_ORDER); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = regs + GRGPIO_DATA, + .set = regs + GRGPIO_OUTPUT, + .dirout = regs + GRGPIO_DIR, + .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + }; + + gc = &priv->chip.gc; + err = gpio_generic_chip_init(&priv->chip, &config); if (err) { - dev_err(dev, "bgpio_init() failed\n"); + dev_err(dev, "failed to initialize the generic GPIO chip\n"); return err; } 
priv->regs = regs; - priv->imask = gc->read_reg(regs + GRGPIO_IMASK); + priv->imask = gpio_generic_read_reg(&priv->chip, regs + GRGPIO_IMASK); priv->dev = dev; gc->owner = THIS_MODULE; From 90ab7050358ffd3311c23b1697df2ba3c8f840c6 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:54 +0200 Subject: [PATCH 0491/3206] gpio: mpc8xxx: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-13-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mpc8xxx.c | 102 ++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 40 deletions(-) diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index 121efdd71e451d..38643fb813c562 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,7 @@ #define GPIO_IBE 0x18 struct mpc8xxx_gpio_chip { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *regs; raw_spinlock_t lock; @@ -66,8 +67,10 @@ static int mpc8572_gpio_get(struct gpio_chip *gc, unsigned int gpio) struct mpc8xxx_gpio_chip *mpc8xxx_gc = gpiochip_get_data(gc); u32 out_mask, out_shadow; - out_mask = gc->read_reg(mpc8xxx_gc->regs + GPIO_DIR); - val = gc->read_reg(mpc8xxx_gc->regs + GPIO_DAT) & ~out_mask; + out_mask = gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_DIR); + val = gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_DAT) & ~out_mask; out_shadow = gc->bgpio_data & out_mask; return !!((val | out_shadow) & mpc_pin2mask(gpio)); @@ -108,12 +111,13 @@ static int mpc8xxx_gpio_to_irq(struct gpio_chip *gc, unsigned offset) static irqreturn_t mpc8xxx_gpio_irq_cascade(int irq, void *data) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = data; - struct gpio_chip *gc = &mpc8xxx_gc->gc; unsigned long mask; int i; - mask = gc->read_reg(mpc8xxx_gc->regs + GPIO_IER) - & gc->read_reg(mpc8xxx_gc->regs + GPIO_IMR); + mask = gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IER) & + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IMR); for_each_set_bit(i, &mask, 32) generic_handle_domain_irq(mpc8xxx_gc->irq, 31 - i); @@ -124,15 +128,17 @@ static void mpc8xxx_irq_unmask(struct irq_data *d) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = irq_data_get_irq_chip_data(d); irq_hw_number_t hwirq = irqd_to_hwirq(d); - struct gpio_chip *gc = &mpc8xxx_gc->gc; + struct gpio_chip *gc = &mpc8xxx_gc->chip.gc; unsigned long flags; gpiochip_enable_irq(gc, hwirq); raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(mpc8xxx_gc->regs + GPIO_IMR, - gc->read_reg(mpc8xxx_gc->regs + GPIO_IMR) + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IMR, + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IMR) | mpc_pin2mask(irqd_to_hwirq(d))); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); @@ -142,13 +148,14 @@ static void mpc8xxx_irq_mask(struct irq_data *d) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = irq_data_get_irq_chip_data(d); irq_hw_number_t hwirq = irqd_to_hwirq(d); - struct gpio_chip *gc = &mpc8xxx_gc->gc; + struct gpio_chip *gc = &mpc8xxx_gc->chip.gc; unsigned long flags; raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(mpc8xxx_gc->regs + GPIO_IMR, - gc->read_reg(mpc8xxx_gc->regs + GPIO_IMR) + gpio_generic_write_reg(&mpc8xxx_gc->chip, 
mpc8xxx_gc->regs + GPIO_IMR, + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IMR) & ~mpc_pin2mask(irqd_to_hwirq(d))); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); @@ -159,32 +166,34 @@ static void mpc8xxx_irq_mask(struct irq_data *d) static void mpc8xxx_irq_ack(struct irq_data *d) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = irq_data_get_irq_chip_data(d); - struct gpio_chip *gc = &mpc8xxx_gc->gc; - gc->write_reg(mpc8xxx_gc->regs + GPIO_IER, + gpio_generic_write_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_IER, mpc_pin2mask(irqd_to_hwirq(d))); } static int mpc8xxx_irq_set_type(struct irq_data *d, unsigned int flow_type) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = irq_data_get_irq_chip_data(d); - struct gpio_chip *gc = &mpc8xxx_gc->gc; unsigned long flags; switch (flow_type) { case IRQ_TYPE_EDGE_FALLING: case IRQ_TYPE_LEVEL_LOW: raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(mpc8xxx_gc->regs + GPIO_ICR, - gc->read_reg(mpc8xxx_gc->regs + GPIO_ICR) + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_ICR, + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_ICR) | mpc_pin2mask(irqd_to_hwirq(d))); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); break; case IRQ_TYPE_EDGE_BOTH: raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(mpc8xxx_gc->regs + GPIO_ICR, - gc->read_reg(mpc8xxx_gc->regs + GPIO_ICR) + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_ICR, + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_ICR) & ~mpc_pin2mask(irqd_to_hwirq(d))); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); break; @@ -199,7 +208,6 @@ static int mpc8xxx_irq_set_type(struct irq_data *d, unsigned int flow_type) static int mpc512x_irq_set_type(struct irq_data *d, unsigned int flow_type) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = irq_data_get_irq_chip_data(d); - struct gpio_chip *gc = &mpc8xxx_gc->gc; unsigned long gpio = irqd_to_hwirq(d); void __iomem *reg; unsigned int shift; @@ -217,7 +225,9 @@ static int mpc512x_irq_set_type(struct irq_data *d, unsigned int flow_type) case IRQ_TYPE_EDGE_FALLING: case IRQ_TYPE_LEVEL_LOW: raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(reg, (gc->read_reg(reg) & ~(3 << shift)) + gpio_generic_write_reg(&mpc8xxx_gc->chip, reg, + (gpio_generic_read_reg(&mpc8xxx_gc->chip, + reg) & ~(3 << shift)) | (2 << shift)); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); break; @@ -225,14 +235,18 @@ static int mpc512x_irq_set_type(struct irq_data *d, unsigned int flow_type) case IRQ_TYPE_EDGE_RISING: case IRQ_TYPE_LEVEL_HIGH: raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(reg, (gc->read_reg(reg) & ~(3 << shift)) + gpio_generic_write_reg(&mpc8xxx_gc->chip, reg, + (gpio_generic_read_reg(&mpc8xxx_gc->chip, + reg) & ~(3 << shift)) | (1 << shift)); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); break; case IRQ_TYPE_EDGE_BOTH: raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(reg, (gc->read_reg(reg) & ~(3 << shift))); + gpio_generic_write_reg(&mpc8xxx_gc->chip, reg, + (gpio_generic_read_reg(&mpc8xxx_gc->chip, + reg) & ~(3 << shift))); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); break; @@ -309,6 +323,7 @@ static const struct of_device_id mpc8xxx_gpio_ids[] = { static int mpc8xxx_probe(struct platform_device *pdev) { const struct mpc8xxx_gpio_devtype *devtype = NULL; + struct gpio_generic_chip_config config; struct mpc8xxx_gpio_chip *mpc8xxx_gc; struct device *dev = &pdev->dev; struct fwnode_handle 
*fwnode; @@ -327,26 +342,28 @@ static int mpc8xxx_probe(struct platform_device *pdev) if (IS_ERR(mpc8xxx_gc->regs)) return PTR_ERR(mpc8xxx_gc->regs); - gc = &mpc8xxx_gc->gc; + gc = &mpc8xxx_gc->chip.gc; gc->parent = dev; + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = mpc8xxx_gc->regs + GPIO_DAT, + .dirout = mpc8xxx_gc->regs + GPIO_DIR, + .flags = BGPIOF_BIG_ENDIAN + }; + if (device_property_read_bool(dev, "little-endian")) { - ret = bgpio_init(gc, dev, 4, mpc8xxx_gc->regs + GPIO_DAT, - NULL, NULL, mpc8xxx_gc->regs + GPIO_DIR, - NULL, BGPIOF_BIG_ENDIAN); - if (ret) - return ret; dev_dbg(dev, "GPIO registers are LITTLE endian\n"); } else { - ret = bgpio_init(gc, dev, 4, mpc8xxx_gc->regs + GPIO_DAT, - NULL, NULL, mpc8xxx_gc->regs + GPIO_DIR, - NULL, BGPIOF_BIG_ENDIAN - | BGPIOF_BIG_ENDIAN_BYTE_ORDER); - if (ret) - return ret; + config.flags |= BGPIOF_BIG_ENDIAN_BYTE_ORDER; dev_dbg(dev, "GPIO registers are BIG endian\n"); } + ret = gpio_generic_chip_init(&mpc8xxx_gc->chip, &config); + if (ret) + return ret; + mpc8xxx_gc->direction_output = gc->direction_output; devtype = device_get_match_data(dev); @@ -379,10 +396,13 @@ static int mpc8xxx_probe(struct platform_device *pdev) device_is_compatible(dev, "fsl,ls1028a-gpio") || device_is_compatible(dev, "fsl,ls1088a-gpio") || is_acpi_node(fwnode)) { - gc->write_reg(mpc8xxx_gc->regs + GPIO_IBE, 0xffffffff); + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IBE, 0xffffffff); /* Also, latch state of GPIOs configured as output by bootloader. */ - gc->bgpio_data = gc->read_reg(mpc8xxx_gc->regs + GPIO_DAT) & - gc->read_reg(mpc8xxx_gc->regs + GPIO_DIR); + gc->bgpio_data = gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_DAT) & + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_DIR); } ret = devm_gpiochip_add_data(dev, gc, mpc8xxx_gc); @@ -405,8 +425,10 @@ static int mpc8xxx_probe(struct platform_device *pdev) return 0; /* ack and mask all irqs */ - gc->write_reg(mpc8xxx_gc->regs + GPIO_IER, 0xffffffff); - gc->write_reg(mpc8xxx_gc->regs + GPIO_IMR, 0); + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IER, 0xffffffff); + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IMR, 0); ret = devm_request_irq(dev, mpc8xxx_gc->irqn, mpc8xxx_gpio_irq_cascade, From 56f548840ed90c30269f29c3bdf6037a8a9414a6 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:55 +0200 Subject: [PATCH 0492/3206] gpio: ge: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-14-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ge.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/gpio/gpio-ge.c b/drivers/gpio/gpio-ge.c index 5dc49648d8e378..a02dd322e0d4ce 100644 --- a/drivers/gpio/gpio-ge.c +++ b/drivers/gpio/gpio-ge.c @@ -16,6 +16,7 @@ */ #include +#include #include #include #include @@ -51,24 +52,36 @@ MODULE_DEVICE_TABLE(of, gef_gpio_ids); static int __init gef_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; + struct gpio_generic_chip *chip; struct gpio_chip *gc; void __iomem *regs; int ret; - gc = devm_kzalloc(dev, sizeof(*gc), GFP_KERNEL); - if (!gc) + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); + if (!chip) return -ENOMEM; regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(regs)) return PTR_ERR(regs); - ret = bgpio_init(gc, dev, 4, regs + GEF_GPIO_IN, regs + GEF_GPIO_OUT, - NULL, NULL, regs + GEF_GPIO_DIRECT, - BGPIOF_BIG_ENDIAN_BYTE_ORDER); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = regs + GEF_GPIO_IN, + .set = regs + GEF_GPIO_OUT, + .dirin = regs + GEF_GPIO_DIRECT, + .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + }; + + ret = gpio_generic_chip_init(chip, &config); if (ret) - return dev_err_probe(dev, ret, "bgpio_init failed\n"); + return dev_err_probe(dev, ret, + "failed to initialize the generic GPIO chip\n"); + + gc = &chip->gc; /* Setup pointers to chip functions */ gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pfw", dev_fwnode(dev)); From 331b05b12d25cd2d2f2d7c4114e9ee15146dafd0 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 14 Mar 2025 08:09:51 +0100 Subject: [PATCH 0493/3206] nios2: Replace __ASSEMBLY__ with __ASSEMBLER__ in uapi headers __ASSEMBLY__ is only defined by the Makefile of the kernel, so this is not really useful for uapi headers (unless the userspace Makefile defines it, too). Let's switch to __ASSEMBLER__ which gets set automatically by the compiler when compiling assembly code. This is a completely mechanical patch (done with a simple "sed -i" statement). Cc: Dinh Nguyen Signed-off-by: Thomas Huth Signed-off-by: Dinh Nguyen --- arch/nios2/include/uapi/asm/ptrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/nios2/include/uapi/asm/ptrace.h b/arch/nios2/include/uapi/asm/ptrace.h index 2b91dbe5bcfee5..1298db9f0fc98e 100644 --- a/arch/nios2/include/uapi/asm/ptrace.h +++ b/arch/nios2/include/uapi/asm/ptrace.h @@ -13,7 +13,7 @@ #ifndef _UAPI_ASM_NIOS2_PTRACE_H #define _UAPI_ASM_NIOS2_PTRACE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -80,5 +80,5 @@ struct user_pt_regs { __u32 regs[49]; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_NIOS2_PTRACE_H */ From e6d8afd2ca731105bf7eba69a61c668fead9cf48 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 14 Mar 2025 08:09:52 +0100 Subject: [PATCH 0494/3206] nios2: Replace __ASSEMBLY__ with __ASSEMBLER__ in non-uapi headers While the GCC and Clang compilers already define __ASSEMBLER__ automatically when compiling assembly code, __ASSEMBLY__ is a macro that only gets defined by the Makefiles in the kernel. This can be very confusing when switching between userspace and kernelspace coding, or when dealing with uapi headers that rather should use __ASSEMBLER__ instead. 
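The guard pattern the series converts looks like this in a minimal, hypothetical header; only the macro name changes, and __ASSEMBLER__ is defined automatically by the compiler when it preprocesses assembly sources:

#ifndef _ASM_EXAMPLE_H
#define _ASM_EXAMPLE_H

#define EXAMPLE_FLAG	0x1		/* visible to both C and assembly */

#ifndef __ASSEMBLER__
struct example_regs {			/* C-only: a struct would not assemble */
	unsigned long r1;
};
#endif /* __ASSEMBLER__ */

#endif /* _ASM_EXAMPLE_H */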
So let's standardize on the __ASSEMBLER__ macro that is provided by the compilers now. This is a completely mechanical patch (done with a simple "sed -i" statement). Cc: Dinh Nguyen Signed-off-by: Thomas Huth Signed-off-by: Dinh Nguyen --- arch/nios2/include/asm/entry.h | 4 ++-- arch/nios2/include/asm/page.h | 4 ++-- arch/nios2/include/asm/processor.h | 4 ++-- arch/nios2/include/asm/ptrace.h | 4 ++-- arch/nios2/include/asm/registers.h | 4 ++-- arch/nios2/include/asm/setup.h | 4 ++-- arch/nios2/include/asm/thread_info.h | 4 ++-- arch/nios2/include/asm/traps.h | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/nios2/include/asm/entry.h b/arch/nios2/include/asm/entry.h index bafb7b2ca59fcb..cb25ed56450ab9 100644 --- a/arch/nios2/include/asm/entry.h +++ b/arch/nios2/include/asm/entry.h @@ -10,7 +10,7 @@ #ifndef _ASM_NIOS2_ENTRY_H #define _ASM_NIOS2_ENTRY_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include #include @@ -117,5 +117,5 @@ addi sp, sp, SWITCH_STACK_SIZE .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_NIOS2_ENTRY_H */ diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 2897ec1b74f618..00a51623d38a54 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -26,7 +26,7 @@ #define PAGE_OFFSET \ (CONFIG_NIOS2_MEM_BASE + CONFIG_NIOS2_KERNEL_REGION_BASE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * This gives the physical RAM offset. @@ -90,6 +90,6 @@ extern struct page *mem_map; #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_NIOS2_PAGE_H */ diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index eb44130364a9a1..d9521c3c2df98e 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -36,7 +36,7 @@ /* Kuser helpers is mapped to this user space address */ #define KUSER_BASE 0x1000 #define KUSER_SIZE (PAGE_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ # define TASK_SIZE 0x7FFF0000UL # define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) @@ -72,6 +72,6 @@ extern unsigned long __get_wchan(struct task_struct *p); #define cpu_relax() barrier() -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_NIOS2_PROCESSOR_H */ diff --git a/arch/nios2/include/asm/ptrace.h b/arch/nios2/include/asm/ptrace.h index 9da34c3022a272..96cbcd40c7ce56 100644 --- a/arch/nios2/include/asm/ptrace.h +++ b/arch/nios2/include/asm/ptrace.h @@ -18,7 +18,7 @@ /* This struct defines the way the registers are stored on the stack during a system call. 
*/ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct pt_regs { unsigned long r8; /* r8-r15 Caller-saved GP registers */ unsigned long r9; @@ -78,5 +78,5 @@ extern void show_regs(struct pt_regs *); int do_syscall_trace_enter(void); void do_syscall_trace_exit(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_NIOS2_PTRACE_H */ diff --git a/arch/nios2/include/asm/registers.h b/arch/nios2/include/asm/registers.h index 95b67dd16f8188..165dab26221f23 100644 --- a/arch/nios2/include/asm/registers.h +++ b/arch/nios2/include/asm/registers.h @@ -6,7 +6,7 @@ #ifndef _ASM_NIOS2_REGISTERS_H #define _ASM_NIOS2_REGISTERS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #endif @@ -44,7 +44,7 @@ /* tlbmisc register bits */ #define TLBMISC_PID_SHIFT 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define TLBMISC_PID_MASK ((1UL << cpuinfo.tlb_pid_num_bits) - 1) #endif #define TLBMISC_WAY_MASK 0xf diff --git a/arch/nios2/include/asm/setup.h b/arch/nios2/include/asm/setup.h index 908a1526d1bd78..6d3f26a71cb513 100644 --- a/arch/nios2/include/asm/setup.h +++ b/arch/nios2/include/asm/setup.h @@ -8,7 +8,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef __KERNEL__ extern char exception_handler_hook[]; @@ -18,6 +18,6 @@ extern char fast_handler_end[]; extern void pagetable_init(void); #endif/* __KERNEL__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_NIOS2_SETUP_H */ diff --git a/arch/nios2/include/asm/thread_info.h b/arch/nios2/include/asm/thread_info.h index 5abac9893b32b5..83df79286d62ee 100644 --- a/arch/nios2/include/asm/thread_info.h +++ b/arch/nios2/include/asm/thread_info.h @@ -24,7 +24,7 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE 8192 /* 2 * PAGE_SIZE */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * low level task data that entry.S needs immediate access to @@ -61,7 +61,7 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)(sp & ~(THREAD_SIZE - 1)); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * thread information flags diff --git a/arch/nios2/include/asm/traps.h b/arch/nios2/include/asm/traps.h index afd77bef01c65d..133a3dedbc3e8d 100644 --- a/arch/nios2/include/asm/traps.h +++ b/arch/nios2/include/asm/traps.h @@ -12,7 +12,7 @@ #define TRAP_ID_SYSCALL 0 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr); void do_page_fault(struct pt_regs *regs, unsigned long cause, unsigned long address); From a20b83cf45be2057f3d073506779e52c7fa17f94 Mon Sep 17 00:00:00 2001 From: Simon Schuster Date: Thu, 21 Aug 2025 12:37:07 +0200 Subject: [PATCH 0495/3206] nios2: ensure that memblock.current_limit is set when setting pfn limits On nios2, with CONFIG_FLATMEM set, the kernel relies on memblock_get_current_limit() to determine the limits of mem_map, in particular for max_low_pfn. Unfortunately, memblock.current_limit is only default initialized to MEMBLOCK_ALLOC_ANYWHERE at this point of the bootup, potentially leading to situations where max_low_pfn can erroneously exceed the value of max_pfn and, thus, the valid range of available DRAM. 
This can in turn cause kernel-level paging failures, e.g.: [ 76.900000] Unable to handle kernel paging request at virtual address 20303000 [ 76.900000] ea = c0080890, ra = c000462c, cause = 14 [ 76.900000] Kernel panic - not syncing: Oops [ 76.900000] ---[ end Kernel panic - not syncing: Oops ]--- This patch fixes this by pre-calculating memblock.current_limit based on the upper limits of the available memory ranges via adjust_lowmem_bounds, a simplified version of the equivalent implementation within the arm architecture. Signed-off-by: Simon Schuster Signed-off-by: Andreas Oetken Signed-off-by: Dinh Nguyen --- arch/nios2/kernel/setup.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index 2a40150142c36f..f43f01c4ab934b 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -142,6 +142,20 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low, *max_high = PFN_DOWN(memblock_end_of_DRAM()); } +static void __init adjust_lowmem_bounds(void) +{ + phys_addr_t block_start, block_end; + u64 i; + phys_addr_t memblock_limit = 0; + + for_each_mem_range(i, &block_start, &block_end) { + if (block_end > memblock_limit) + memblock_limit = block_end; + } + + memblock_set_current_limit(memblock_limit); +} + void __init setup_arch(char **cmdline_p) { console_verbose(); @@ -157,6 +171,7 @@ void __init setup_arch(char **cmdline_p) /* Keep a copy of command line */ *cmdline_p = boot_command_line; + adjust_lowmem_bounds(); find_limits(&min_low_pfn, &max_low_pfn, &max_pfn); memblock_reserve(__pa_symbol(_stext), _end - _stext); From 14498ca7e0f18cd73ab0010cbfbf9413a9f0a96f Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 11 Aug 2025 14:52:38 +0200 Subject: [PATCH 0496/3206] fs: Use try_cmpxchg() in start_dir_add() Use try_cmpxchg() instead of cmpxchg(*ptr, old, new) == old. The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after CMPXCHG (and related move instruction in front of CMPXCHG). Note that the value from *ptr should be read using READ_ONCE() to prevent the compiler from merging, refetching or reordering the read. No functional change intended. Signed-off-by: Uros Bizjak Link: https://lore.kernel.org/20250811125308.616717-1-ubizjak@gmail.com Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Christian Brauner --- fs/dcache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 60046ae23d5148..336bdb4c4b1f4e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2509,8 +2509,8 @@ static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { - unsigned n = dir->i_dir_seq; - if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) + unsigned n = READ_ONCE(dir->i_dir_seq); + if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1)) return n; cpu_relax(); } From ec6f613ef376410753173f8236bad5f07a86503a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 11 Aug 2025 15:23:03 +0200 Subject: [PATCH 0497/3206] fs: Use try_cmpxchg() in sb_init_done_wq() Use !try_cmpxchg() instead of cmpxchg(*ptr, old, new) != old. The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after CMPXCHG. No functional change intended. 
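Both conversions rely on the same property of try_cmpxchg(): it returns a boolean success flag (mapped to the ZF flag on x86) and, on failure, writes the value it actually found back through the &old pointer. A sketch of the idiom change, modeled on the start_dir_add() hunk above:

unsigned n = READ_ONCE(dir->i_dir_seq);

/* old idiom: CMPXCHG followed by an explicit compare against n */
if (cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
	return n;

/*
 * new idiom: success is returned directly; on failure, n is
 * refreshed with the current value, ready for the next retry.
 */
if (try_cmpxchg(&dir->i_dir_seq, &n, n + 1))
	return n;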
Signed-off-by: Uros Bizjak Link: https://lore.kernel.org/20250811132326.620521-1-ubizjak@gmail.com Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/super.c b/fs/super.c index 7f876f32343ad4..e9171801770182 100644 --- a/fs/super.c +++ b/fs/super.c @@ -2318,13 +2318,15 @@ int sb_init_dio_done_wq(struct super_block *sb) sb->s_id); if (!wq) return -ENOMEM; + + old = NULL; /* * This has to be atomic as more DIOs can race to create the workqueue */ - old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); - /* Someone created workqueue before us? Free ours... */ - if (old) + if (!try_cmpxchg(&sb->s_dio_done_wq, &old, wq)) { + /* Someone created workqueue before us? Free ours... */ destroy_workqueue(wq); + } return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq); From 5e088248375d171b80c643051e77ade6b97bc386 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Mon, 25 Aug 2025 15:36:09 +0800 Subject: [PATCH 0498/3206] exec: Fix incorrect type for ret In the setup_arg_pages(), ret is declared as an unsigned long. The ret might take a negative value. Therefore, its type should be changed to int. Signed-off-by: Xichao Zhao Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20250825073609.219855-1-zhao.xichao@vivo.com Signed-off-by: Kees Cook --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 2a1e5e4042a149..5d236bb87df5f6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -599,7 +599,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { - unsigned long ret; + int ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; From 85fe9f565d2d5af95ac2bbaa5082b8ce62b039f5 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Wed, 13 Aug 2025 15:43:20 +0300 Subject: [PATCH 0499/3206] IB/mlx5: Fix obj_type mismatch for SRQ event subscriptions Fix a bug where the driver's event subscription logic for SRQ-related events incorrectly sets obj_type for RMP objects. When subscribing to SRQ events, get_legacy_obj_type() did not handle the MLX5_CMD_OP_CREATE_RMP case, which caused obj_type to be 0 (default). This led to a mismatch between the obj_type used during subscription (0) and the value used during notification (1, taken from the event's type field). As a result, event mapping for SRQ objects could fail and event notification would not be delivered correctly. This fix adds handling for MLX5_CMD_OP_CREATE_RMP in get_legacy_obj_type, returning MLX5_EVENT_QUEUE_TYPE_RQ so obj_type is consistent between subscription and notification. 
Fixes: 759738537142 ("IB/mlx5: Enable subscription for device events over DEVX") Link: https://patch.msgid.link/r/8f1048e3fdd1fde6b90607ce0ed251afaf8a148c.1755088962.git.leon@kernel.org Signed-off-by: Or Har-Toov Reviewed-by: Edward Srouji Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 028d9f031ddee3..8b506417ad2fbe 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -233,6 +233,7 @@ static u16 get_legacy_obj_type(u16 opcode) { switch (opcode) { case MLX5_CMD_OP_CREATE_RQ: + case MLX5_CMD_OP_CREATE_RMP: return MLX5_EVENT_QUEUE_TYPE_RQ; case MLX5_CMD_OP_CREATE_QP: return MLX5_EVENT_QUEUE_TYPE_QP; From 8d2a75589599fd8cd5c542c8052642445652ca5e Mon Sep 17 00:00:00 2001 From: Tiffany Yang Date: Fri, 22 Aug 2025 19:21:28 -0700 Subject: [PATCH 0500/3206] cgroup: Fix 64-bit division in cgroup.stat.local Fix the following build error for 32-bit systems: arm-linux-gnueabi-ld: kernel/cgroup/cgroup.o: in function `cgroup_core_local_stat_show': >> kernel/cgroup/cgroup.c:3781:(.text+0x28f4): undefined reference to `__aeabi_uldivmod' arm-linux-gnueabi-ld: (__aeabi_uldivmod): Unknown destination type (ARM/Thumb) in kernel/cgroup/cgroup.o >> kernel/cgroup/cgroup.c:3781:(.text+0x28f4): dangerous relocation: unsupported relocation Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508230604.KyvqOy81-lkp@intel.com/ Signed-off-by: Tiffany Yang Cc: Tejun Heo Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ab096b884bbc73..b38d7a847ed4a5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3778,8 +3778,8 @@ static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) cgrp->freezer.freeze_start_nsec); } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); - seq_printf(seq, "frozen_usec %llu\n", - (unsigned long long) freeze_time / NSEC_PER_USEC); + do_div(freeze_time, NSEC_PER_USEC); + seq_printf(seq, "frozen_usec %llu\n", freeze_time); return 0; } From 5806b3d05165568eee665399d3c04349c151a0b9 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Mon, 25 Aug 2025 03:23:50 +0000 Subject: [PATCH 0501/3206] cpuset: decouple tmpmasks and cpumasks freeing in cgroup Currently, free_cpumasks() can free both tmpmasks and cpumasks of a cpuset (cs). However, these two operations are not logically coupled. To improve code clarity: 1. Move cpumask freeing to free_cpuset() 2. Rename free_cpumasks() to free_tmpmasks() This change enforces the single responsibility principle. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 3466ebbf10164a..aebda14cc67ff3 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -459,23 +459,14 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) } /** - * free_cpumasks - free cpumasks in a tmpmasks structure - * @cs: the cpuset that have cpumasks to be free. 
+ * free_tmpmasks - free cpumasks in a tmpmasks structure * @tmp: the tmpmasks structure pointer */ -static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline void free_tmpmasks(struct tmpmasks *tmp) { - if (cs) { - free_cpumask_var(cs->cpus_allowed); - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->effective_xcpus); - free_cpumask_var(cs->exclusive_cpus); - } - if (tmp) { - free_cpumask_var(tmp->new_cpus); - free_cpumask_var(tmp->addmask); - free_cpumask_var(tmp->delmask); - } + free_cpumask_var(tmp->new_cpus); + free_cpumask_var(tmp->addmask); + free_cpumask_var(tmp->delmask); } /** @@ -508,7 +499,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static inline void free_cpuset(struct cpuset *cs) { - free_cpumasks(cs, NULL); + free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->effective_cpus); + free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); kfree(cs); } @@ -2427,7 +2421,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); out_free: - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return retval; } @@ -2530,7 +2524,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return 0; } @@ -2983,7 +2977,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) notify_partition_change(cs, old_prs); if (force_sd_rebuild) rebuild_sched_domains_locked(); - free_cpumasks(NULL, &tmpmask); + free_tmpmasks(&tmpmask); return 0; } @@ -4006,7 +4000,7 @@ static void cpuset_handle_hotplug(void) if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); - free_cpumasks(NULL, ptmp); + free_tmpmasks(ptmp); } void cpuset_update_active_cpus(void) From ada00d51622822b151e1b8cc2bc85a20d2191349 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Mon, 25 Aug 2025 03:23:51 +0000 Subject: [PATCH 0502/3206] cpuset: separate tmpmasks and cpuset allocation logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original alloc_cpumasks() served dual purposes: allocating cpumasks for both temporary masks (tmpmasks) and cpuset structures. This patch: 1. Decouples these allocation paths for better code clarity 2. Introduces dedicated alloc_tmpmasks() and dup_or_alloc_cpuset() functions 3. Maintains symmetric pairing: - alloc_tmpmasks() ↔ free_tmpmasks() - dup_or_alloc_cpuset() ↔ free_cpuset() Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 127 ++++++++++++++++++++++------------------- 1 file changed, 69 insertions(+), 58 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index aebda14cc67ff3..7b0b81c835bf2f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -411,51 +411,47 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) } /** - * alloc_cpumasks - allocate three cpumasks for cpuset - * @cs: the cpuset that have cpumasks to be allocated. - * @tmp: the tmpmasks structure pointer + * alloc_cpumasks - Allocate an array of cpumask variables + * @pmasks: Pointer to array of cpumask_var_t pointers + * @size: Number of cpumasks to allocate * Return: 0 if successful, -ENOMEM otherwise. * - * Only one of the two input arguments should be non-NULL. + * Allocates @size cpumasks and initializes them to empty. 
Returns 0 on + * success, -ENOMEM on allocation failure. On failure, any previously + * allocated cpumasks are freed. */ -static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) { - cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; + int i; - if (cs) { - pmask1 = &cs->cpus_allowed; - pmask2 = &cs->effective_cpus; - pmask3 = &cs->effective_xcpus; - pmask4 = &cs->exclusive_cpus; - } else { - pmask1 = &tmp->new_cpus; - pmask2 = &tmp->addmask; - pmask3 = &tmp->delmask; - pmask4 = NULL; + for (i = 0; i < size; i++) { + if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { + while (--i >= 0) + free_cpumask_var(*pmasks[i]); + return -ENOMEM; + } } - - if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) - goto free_one; - - if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) - goto free_two; - - if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) - goto free_three; - - return 0; +} -free_three: - free_cpumask_var(*pmask3); -free_two: - free_cpumask_var(*pmask2); -free_one: - free_cpumask_var(*pmask1); - return -ENOMEM; +/** + * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. + * @tmp: Pointer to tmpmasks structure to populate + * Return: 0 on success, -ENOMEM on allocation failure + */ +static inline int alloc_tmpmasks(struct tmpmasks *tmp) +{ + /* + * Array of pointers to the three cpumask_var_t fields in tmpmasks. + * Note: Array size must match actual number of masks (3) + */ + cpumask_var_t *pmask[3] = { + &tmp->new_cpus, + &tmp->addmask, + &tmp->delmask + }; + + return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); } /** @@ -470,26 +466,46 @@ static inline void free_tmpmasks(struct tmpmasks *tmp) } /** - * alloc_trial_cpuset - allocate a trial cpuset - * @cs: the cpuset that the trial cpuset duplicates + * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset + * @cs: Source cpuset to duplicate (NULL for a fresh allocation) + * + * Creates a new cpuset by either: + * 1. Duplicating an existing cpuset (if @cs is non-NULL), or + * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) + * + * Return: Pointer to newly allocated cpuset on success, NULL on failure */ -static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) +static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) { struct cpuset *trial; - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + /* Allocate base structure */ + trial = cs ? 
kmemdup(cs, sizeof(*cs), GFP_KERNEL) : + kzalloc(sizeof(*cs), GFP_KERNEL); if (!trial) return NULL; - if (alloc_cpumasks(trial, NULL)) { + /* Setup cpumask pointer array */ + cpumask_var_t *pmask[4] = { + &trial->cpus_allowed, + &trial->effective_cpus, + &trial->effective_xcpus, + &trial->exclusive_cpus + }; + + if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { kfree(trial); return NULL; } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); - cpumask_copy(trial->effective_cpus, cs->effective_cpus); - cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); - cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + /* Copy masks if duplicating */ + if (cs) { + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + } + return trial; } @@ -2332,7 +2348,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_tmpmasks(&tmp)) return -ENOMEM; if (old_prs) { @@ -2476,7 +2492,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval) return retval; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_tmpmasks(&tmp)) return -ENOMEM; if (old_prs) { @@ -2820,7 +2836,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int spread_flag_changed; int err; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) return -ENOMEM; @@ -2881,7 +2897,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (new_prs && is_prs_invalid(old_prs)) old_prs = PRS_MEMBER; - if (alloc_cpumasks(NULL, &tmpmask)) + if (alloc_tmpmasks(&tmpmask)) return -ENOMEM; err = update_partition_exclusive_flag(cs, new_prs); @@ -3223,7 +3239,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, if (!is_cpuset_online(cs)) goto out_unlock; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; @@ -3456,15 +3472,10 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) return &top_cpuset.css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); + cs = dup_or_alloc_cpuset(NULL); if (!cs) return ERR_PTR(-ENOMEM); - if (alloc_cpumasks(cs, NULL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } - __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; @@ -3920,7 +3931,7 @@ static void cpuset_handle_hotplug(void) bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; - if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + if (on_dfl && !alloc_tmpmasks(&tmp)) ptmp = &tmp; lockdep_assert_cpus_held(); From 2c98144fc832b35c4e9293a3bfc518608d6f5145 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Mon, 25 Aug 2025 03:23:52 +0000 Subject: [PATCH 0503/3206] cpuset: add helpers for cpus read and cpuset_mutex locks Replace repetitive locking patterns with new helpers: - cpuset_full_lock() - cpuset_full_unlock() This makes the code cleaner and ensures consistent lock ordering.
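As a rough illustration, a minimal sketch of the conversion this enables (the handler name is hypothetical and error paths are elided; the real call sites are in the diff below):

	/* Before: every write-side path open-codes the ordered lock pair. */
	static int example_cpuset_write(struct cpuset *cs)
	{
		cpus_read_lock();
		mutex_lock(&cpuset_mutex);
		/* ... modify cpuset state ... */
		mutex_unlock(&cpuset_mutex);
		cpus_read_unlock();
		return 0;
	}

	/* After: the helper pair fixes the cpus_read_lock() -> cpuset_mutex order. */
	static int example_cpuset_write(struct cpuset *cs)
	{
		cpuset_full_lock();
		/* ... modify cpuset state ... */
		cpuset_full_unlock();
		return 0;
	}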
Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 2 ++ kernel/cgroup/cpuset-v1.c | 12 +++---- kernel/cgroup/cpuset.c | 60 +++++++++++++++++++-------------- 3 files changed, 40 insertions(+), 34 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 75b3aef39231dd..337608f408ce0a 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -276,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cpuset_common_seq_show(struct seq_file *sf, void *v); +void cpuset_full_lock(void); +void cpuset_full_unlock(void); /* * cpuset-v1.c diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index b69a7db67090d8..12e76774c75b0e 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } @@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 7b0b81c835bf2f..a78ccd11ce9b43 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -250,6 +250,12 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); +/** + * cpuset_lock - Acquire the global cpuset mutex + * + * This locks the global cpuset mutex to prevent modifications to cpuset + * hierarchy and configurations. Holding this mutex alone is not enough to + * make modifications. + */ void cpuset_lock(void) { mutex_lock(&cpuset_mutex); @@ -260,6 +266,24 @@ void cpuset_unlock(void) mutex_unlock(&cpuset_mutex); } +/** + * cpuset_full_lock - Acquire full protection for cpuset modification + * + * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex + * to safely modify cpuset data.
+ */ +void cpuset_full_lock(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +void cpuset_full_unlock(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + static DEFINE_SPINLOCK(callback_lock); void cpuset_callback_lock_irq(void) @@ -3234,8 +3258,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, int retval = -ENODEV; buf = strstrip(buf); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -3264,8 +3287,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, if (force_sd_rebuild) rebuild_sched_domains_locked(); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -3368,12 +3390,10 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, else return -EINVAL; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return retval ?: nbytes; } @@ -3498,9 +3518,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -3552,8 +3570,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return 0; } @@ -3568,16 +3585,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); } /* @@ -3589,16 +3602,11 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - + cpuset_full_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) From 545908a9fb9c01e1bd7afe040f5623f561d50ea0 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 22 Aug 2025 17:46:26 +0200 Subject: [PATCH 0504/3206] dt-bindings: gpio-mmio: Support hogs We use hogs on some MMIO GPIO controllers so make sure the bindings support this using a pattern property. Reviewed-by: Rob Herring (Arm) Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20250822-ixp4xx-eb-mmio-gpio-v2-1-bd2edd4a9c74@linaro.org Signed-off-by: Bartosz Golaszewski --- Documentation/devicetree/bindings/gpio/gpio-mmio.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml b/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml index 87e986386f32a4..ca32317dff85bc 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml +++ b/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml @@ -89,6 +89,12 @@ properties: description: If this property is present, the controller cannot drive the GPIO lines. 
+patternProperties: + "^.+-hog(-[0-9]+)?$": + type: object + required: + - gpio-hog + required: - compatible - reg From 1f3c076063f03999c351c0725adf744ef5660733 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 22 Aug 2025 17:46:27 +0200 Subject: [PATCH 0505/3206] dt-bindings: gpio-mmio: Add MMIO for IXP4xx expansion bus This expansion is a simple MMIO-mapped GPIO device but the bus has a number of additional properties that we need to bring in using a reference to the bus child node schema. Reviewed-by: Rob Herring (Arm) Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20250822-ixp4xx-eb-mmio-gpio-v2-2-bd2edd4a9c74@linaro.org Signed-off-by: Bartosz Golaszewski --- .../devicetree/bindings/gpio/gpio-mmio.yaml | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml b/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml index ca32317dff85bc..b4d55bf6a28548 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml +++ b/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml @@ -22,6 +22,7 @@ properties: - brcm,bcm6345-gpio - ni,169445-nand-gpio - wd,mbl-gpio # Western Digital MyBook Live memory-mapped GPIO controller + - intel,ixp4xx-expansion-bus-mmio-gpio big-endian: true @@ -89,6 +90,14 @@ properties: description: If this property is present, the controller cannot drive the GPIO lines. +if: + properties: + compatible: + contains: + const: intel,ixp4xx-expansion-bus-mmio-gpio +then: + $ref: /schemas/memory-controllers/intel,ixp4xx-expansion-peripheral-props.yaml# + patternProperties: "^.+-hog(-[0-9]+)?$": type: object @@ -102,7 +111,7 @@ required: - '#gpio-cells' - gpio-controller -additionalProperties: false +unevaluatedProperties: false examples: - | @@ -132,3 +141,22 @@ examples: gpio-controller; #gpio-cells = <2>; }; + + bus@c4000000 { + compatible = "intel,ixp42x-expansion-bus-controller", "syscon"; + reg = <0xc4000000 0x30>; + native-endian; + #address-cells = <2>; + #size-cells = <1>; + ranges = <0 0x0 0x50000000 0x01000000>; + dma-ranges = <0 0x0 0x50000000 0x01000000>; + gpio@1,0 { + compatible = "intel,ixp4xx-expansion-bus-mmio-gpio"; + gpio-controller; + #gpio-cells = <2>; + big-endian; + reg = <1 0x00000000 0x2>; + reg-names = "dat"; + intel,ixp4xx-eb-write-enable = <1>; + }; + }; From 38623d532c99ebd926f4eebb7c7de19cb7e5aef4 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 22 Aug 2025 17:46:28 +0200 Subject: [PATCH 0506/3206] gpio: mmio: Add compatible for the ixp4xx eb MMIO The IXP4xx expansion bus can have simple memory-mapped GPIO on it. Using the proper device tree bindings, support probing this directly from the device tree. 
Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20250822-ixp4xx-eb-mmio-gpio-v2-3-bd2edd4a9c74@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mmio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index 021ad62778c2f4..79e1be149c9484 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -730,6 +730,7 @@ static const struct of_device_id bgpio_of_match[] = { { .compatible = "brcm,bcm6345-gpio" }, { .compatible = "wd,mbl-gpio" }, { .compatible = "ni,169445-nand-gpio" }, + { .compatible = "intel,ixp4xx-expansion-bus-mmio-gpio" }, { } }; MODULE_DEVICE_TABLE(of, bgpio_of_match); From a16a3cb07140a094ffd918290cef76135876b532 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 15 Aug 2025 13:17:33 +0200 Subject: [PATCH 0507/3206] gpio: sim: don't use GPIO base in debugfs output We're in the process of removing unneeded references to the global GPIO base number treewide. Use the HW offset instead of the base number. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250815111733.79283-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sim.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index 050092583f799b..a83f5238427cdb 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -262,8 +262,7 @@ static void gpio_sim_dbg_show(struct seq_file *seq, struct gpio_chip *gc) guard(mutex)(&chip->lock); for_each_hwgpio(gc, i, label) - seq_printf(seq, " gpio-%-3d (%s) %s,%s\n", - gc->base + i, + seq_printf(seq, " gpio-%-3d (%s) %s,%s\n", i, label ?: "", test_bit(i, chip->direction_map) ? "input" : test_bit(i, chip->value_map) ? "output-high" : From a728ce8ffbd27954fdb2826dcc15a6576e574b83 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Wed, 13 Aug 2025 19:50:58 +0800 Subject: [PATCH 0508/3206] binfmt_elf: Replace offsetof() with struct_size() in fill_note_info() When dealing with structures containing flexible arrays, struct_size() provides additional compile-time checks compared to offsetof(). This enhances code robustness and reduces the risk of potential errors. Signed-off-by: Xichao Zhao Link: https://lore.kernel.org/r/20250813115058.635742-1-zhao.xichao@vivo.com Signed-off-by: Kees Cook --- fs/binfmt_elf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 264fba0d44bdf4..4aacf9c9cc2dfe 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1845,16 +1845,14 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* * Allocate a structure for each thread. */ - info->thread = kzalloc(offsetof(struct elf_thread_core_info, - notes[info->thread_notes]), - GFP_KERNEL); + info->thread = kzalloc(struct_size(info->thread, notes, info->thread_notes), + GFP_KERNEL); if (unlikely(!info->thread)) return 0; info->thread->task = dump_task; for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) { - t = kzalloc(offsetof(struct elf_thread_core_info, - notes[info->thread_notes]), + t = kzalloc(struct_size(t, notes, info->thread_notes), GFP_KERNEL); if (unlikely(!t)) return 0; From 1b93c03fb319d72a1f5f4723abd5df15ce40f4e2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 21 Aug 2025 17:06:03 +0800 Subject: [PATCH 0509/3206] rcu: add rcu_read_lock_dont_migrate() migrate_disable() is called to disable migration in the kernel, and it is often used together with rcu_read_lock(). 
However, with PREEMPT_RCU disabled, it's unnecessary, as rcu_read_lock() will always disable preemption, which will also disable migration. Introduce rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate(), which disable and re-enable migration only when PREEMPT_RCU is enabled. Signed-off-by: Menglong Dong Reviewed-by: Paul E. McKenney Link: https://lore.kernel.org/r/20250821090609.42508-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/rcupdate.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb1d..9691ca380a4fe1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -962,6 +962,20 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) preempt_enable_notrace(); } +static __always_inline void rcu_read_lock_dont_migrate(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + migrate_disable(); + rcu_read_lock(); +} + +static inline void rcu_read_unlock_migrate(void) +{ + rcu_read_unlock(); + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + migrate_enable(); +} + /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. From 8c0afc7c9c112eb6884bc7e443247f242b6d8a3e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 21 Aug 2025 17:06:04 +0800 Subject: [PATCH 0510/3206] bpf: use rcu_read_lock_dont_migrate() for bpf_cgrp_storage_free() Use rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate() in bpf_cgrp_storage_free to obtain better performance when PREEMPT_RCU is not enabled. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20250821090609.42508-3-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_cgrp_storage.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 148da8f7ff3685..0687a760974a42 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -45,8 +45,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) goto out; @@ -55,8 +54,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) bpf_local_storage_destroy(local_storage); bpf_cgrp_storage_unlock(); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static struct bpf_local_storage_data * From f2fa9b906911407ca99d14aa9f4353fda7566bbc Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 21 Aug 2025 17:06:05 +0800 Subject: [PATCH 0511/3206] bpf: use rcu_read_lock_dont_migrate() for bpf_inode_storage_free() Use rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate() in bpf_inode_storage_free to obtain better performance when PREEMPT_RCU is not enabled.
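For reference, a sketch of what the helper pair effectively reduces to under each configuration (illustrative only; the authoritative definitions are the IS_ENABLED()-based ones added to include/linux/rcupdate.h above):

	/* CONFIG_PREEMPT_RCU=y: readers are preemptible, so migration
	 * must be pinned explicitly:
	 *   rcu_read_lock_dont_migrate() == migrate_disable(); rcu_read_lock();
	 *   rcu_read_unlock_migrate()    == rcu_read_unlock(); migrate_enable();
	 *
	 * CONFIG_PREEMPT_RCU=n: rcu_read_lock() disables preemption, which
	 * already prevents migration, so the migrate calls compile away:
	 *   rcu_read_lock_dont_migrate() == rcu_read_lock();
	 *   rcu_read_unlock_migrate()    == rcu_read_unlock();
	 */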
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20250821090609.42508-4-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_inode_storage.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 15a3eb9b02d94c..e54cce2b91754c 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -62,8 +62,7 @@ void bpf_inode_storage_free(struct inode *inode) if (!bsb) return; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); local_storage = rcu_dereference(bsb->storage); if (!local_storage) @@ -71,8 +70,7 @@ void bpf_inode_storage_free(struct inode *inode) bpf_local_storage_destroy(local_storage); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) From 68748f0397a356122bf9cf33ef77cabbff9c3e51 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 21 Aug 2025 17:06:06 +0800 Subject: [PATCH 0512/3206] bpf: use rcu_read_lock_dont_migrate() for bpf_iter_run_prog() Use rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate() in bpf_iter_run_prog to obtain better performance when PREEMPT_RCU is not enabled. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20250821090609.42508-5-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_iter.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0cbcae7270790a..6ac35430c57344 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -705,13 +705,11 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) migrate_enable(); rcu_read_unlock_trace(); } else { - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); bpf_reset_run_ctx(old_run_ctx); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } /* bpf program can only return 0 or 1: From cf4303b70dfa0163753e9b03ca78e5122727d4b8 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 21 Aug 2025 17:06:07 +0800 Subject: [PATCH 0513/3206] bpf: use rcu_read_lock_dont_migrate() for bpf_task_storage_free() Use rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate() in bpf_task_storage_free to obtain better performance when PREEMPT_RCU is not enabled. 
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20250821090609.42508-6-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_task_storage.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 1109475953c01f..a1dc1bf0848a52 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -70,8 +70,7 @@ void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) @@ -81,8 +80,7 @@ void bpf_task_storage_free(struct task_struct *task) bpf_local_storage_destroy(local_storage); bpf_task_storage_unlock(); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) From 427a36bb5504e0fb33398a4ccd523fce95514d83 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 21 Aug 2025 17:06:08 +0800 Subject: [PATCH 0514/3206] bpf: use rcu_read_lock_dont_migrate() for bpf_prog_run_array_cg() Use rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate() in bpf_prog_run_array_cg to obtain better performance when PREEMPT_RCU is not enabled. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20250821090609.42508-7-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 180b630279b9cb..9912c7b9a26604 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -71,8 +71,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, u32 func_ret; run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); array = rcu_dereference(cgrp->effective[atype]); item = &array->items[0]; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); @@ -88,8 +87,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, item++; } bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); return run_ctx.retval; } From 8e4f0b1ebcf2180ab594f204f01279a666dadf3b Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 21 Aug 2025 17:06:09 +0800 Subject: [PATCH 0515/3206] bpf: use rcu_read_lock_dont_migrate() for trampoline.c Use rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate() in trampoline.c to obtain better performance when PREEMPT_RCU is not enabled. 
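For context, a pseudocode sketch of how these enter/exit helpers pair up around one program invocation in a generated trampoline (hedged: the real sequence is emitted by the arch-specific JIT and elides details such as recursion checks):

	struct bpf_tramp_run_ctx run_ctx = {};
	u64 start;

	start = __bpf_prog_enter(prog, &run_ctx);	/* rcu_read_lock_dont_migrate() */
	prog->bpf_func(args, prog->insnsi);		/* run the BPF program */
	__bpf_prog_exit(prog, start, &run_ctx);		/* stats + rcu_read_unlock_migrate() */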
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20250821090609.42508-8-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 0e364614c3a291..5949095e51c3d0 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -899,8 +899,7 @@ static __always_inline u64 notrace bpf_prog_start_time(void) static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); @@ -949,8 +948,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, update_prog_stats(prog, start); this_cpu_dec(*(prog->active)); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, @@ -960,8 +958,7 @@ static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, /* Runtime stats are exported via actual BPF_LSM_CGROUP * programs, not the shims. */ - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); @@ -974,8 +971,7 @@ static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, { bpf_reset_run_ctx(run_ctx->saved_run_ctx); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, @@ -1033,8 +1029,7 @@ static u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); @@ -1048,8 +1043,7 @@ static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) From e649bcda25b5ae1a30a182cc450f928a0b282c93 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Aug 2025 14:03:39 -0400 Subject: [PATCH 0516/3206] perf: Remove get_perf_callchain() init_nr argument The 'init_nr' argument has double duty: it's used to initialize both the number of contexts and the number of stack entries. That's confusing and the callers always pass zero anyway. Hard code the zero. 
Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250820180428.259565081@kernel.org --- include/linux/perf_event.h | 2 +- kernel/bpf/stackmap.c | 4 ++-- kernel/events/callchain.c | 12 ++++++------ kernel/events/core.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bfbf9ea53f259f..fd1d91017b99b3 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1719,7 +1719,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa98..ec3a57a5fba1f8 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, false, false); if (unlikely(!trace)) @@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, crosstask, false); if (unlikely(!trace) || trace->nr < skip) { diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6c83ad674d0104..b0f5bd228cd8c3 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,7 +217,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr } struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; @@ -228,11 +228,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (!entry) return NULL; - ctx.entry = entry; - ctx.max_stack = max_stack; - ctx.nr = entry->nr = init_nr; - ctx.contexts = 0; - ctx.contexts_maxed = false; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = 0; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) diff --git a/kernel/events/core.c b/kernel/events/core.c index ea357044d78062..bade8e0fced79c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8210,7 +8210,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, 0, kernel, user, + callchain = get_perf_callchain(regs, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } From 153f9e74dec230f2e070e16fa061bc7adfd2c450 Mon 
Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Aug 2025 14:03:40 -0400 Subject: [PATCH 0517/3206] perf: Have get_perf_callchain() return NULL if crosstask and user are set get_perf_callchain() doesn't support cross-task unwinding for user space stacks, so have it return NULL if both the crosstask and user arguments are set. Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250820180428.426423415@kernel.org --- kernel/events/callchain.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index b0f5bd228cd8c3..cd0e3fc7ed05af 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -224,6 +224,10 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, struct perf_callchain_entry_ctx ctx; int rctx, start_entry_idx; + /* crosstask is not supported for user stacks */ + if (crosstask && user && !kernel) + return NULL; + entry = get_callchain_entry(&rctx); if (!entry) return NULL; @@ -240,7 +244,7 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, perf_callchain_kernel(&ctx, regs); } - if (user) { + if (user && !crosstask) { if (!user_mode(regs)) { if (current->mm) regs = task_pt_regs(current); @@ -249,9 +253,6 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, } if (regs) { - if (crosstask) - goto exit_put; - if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); @@ -261,7 +262,6 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, } } -exit_put: put_callchain_entry(rctx); return entry; From 90942f9fac05702065ff82ed0bade0d08168d4ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Aug 2025 14:03:41 -0400 Subject: [PATCH 0518/3206] perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of current->mm == NULL To determine if a task is a kernel thread or not, it is more reliable to use (current->flags & (PF_KTHREAD|PF_USER_WORKER)) than to rely on current->mm being NULL. That is because some kernel tasks (io_uring helpers) may have a mm field. Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250820180428.592367294@kernel.org --- kernel/events/callchain.c | 6 +++--- kernel/events/core.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index cd0e3fc7ed05af..5982d18f169bd2 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -246,10 +246,10 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, if (user && !crosstask) { if (!user_mode(regs)) { - if (current->mm) - regs = task_pt_regs(current); - else + if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) regs = NULL; + else + regs = task_pt_regs(current); } if (regs) { diff --git a/kernel/events/core.c b/kernel/events/core.c index bade8e0fced79c..f880cec0c98095 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7446,7 +7446,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (!(current->flags & PF_KTHREAD)) { + } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; @@ -8086,7 +8086,7 @@ static u64 perf_virt_to_phys(u64 virt)
* If failed, leave phys_addr as 0. */ - if (current->mm != NULL) { + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { struct page *p; pagefault_disable(); From d77e3319e31098a6cb97b7ce4e71ba676e327fd7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Aug 2025 14:03:42 -0400 Subject: [PATCH 0519/3206] perf: Simplify get_perf_callchain() user logic Simplify the get_perf_callchain() user logic a bit. task_pt_regs() should never be NULL. Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20250820180428.760066227@kernel.org --- kernel/events/callchain.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 5982d18f169bd2..808c0d7a31faf0 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -247,21 +247,19 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, if (user && !crosstask) { if (!user_mode(regs)) { if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) - regs = NULL; - else - regs = task_pt_regs(current); + goto exit_put; + regs = task_pt_regs(current); } - if (regs) { - if (add_mark) - perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - start_entry_idx = entry->nr; - perf_callchain_user(&ctx, regs); - fixup_uretprobe_trampoline_entries(entry, start_entry_idx); - } + start_entry_idx = entry->nr; + perf_callchain_user(&ctx, regs); + fixup_uretprobe_trampoline_entries(entry, start_entry_idx); } +exit_put: put_callchain_entry(rctx); return entry; From 16ed389227651330879e17bd83d43bd234006722 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Aug 2025 14:03:43 -0400 Subject: [PATCH 0520/3206] perf: Skip user unwind if the task is a kernel thread If the task is not a user thread, there's no user stack to unwind. Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250820180428.930791978@kernel.org --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f880cec0c98095..28de3baff79222 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8198,7 +8198,8 @@ struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; - bool user = !event->attr.exclude_callchain_user; + bool user = !event->attr.exclude_callchain_user && + !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. 
*/ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack; From cba70aff623b104085ab5613fedd21f6ea19095a Mon Sep 17 00:00:00 2001 From: Fabio Porcedda Date: Wed, 6 Aug 2025 14:09:26 +0200 Subject: [PATCH 0521/3206] USB: serial: option: add Telit Cinterion FN990A w/audio compositions Add the following Telit Cinterion FN990A w/audio compositions: 0x1077: tty (diag) + adb + rmnet + audio + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=09 Cnt=01 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1077 Rev=05.04 S: Manufacturer=Telit Wireless Solutions S: Product=FN990 S: SerialNumber=67e04c35 C: #Ifs=10 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 0 Cls=01(audio) Sub=01 Prot=20 Driver=snd-usb-audio I: If#= 4 Alt= 1 #EPs= 1 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio E: Ad=03(O) Atr=0d(Isoc) MxPS= 68 Ivl=1ms I: If#= 5 Alt= 1 #EPs= 1 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio E: Ad=84(I) Atr=0d(Isoc) MxPS= 68 Ivl=1ms I: If#= 6 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 7 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 8 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 9 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8c(I) Atr=03(Int.) MxPS= 10 Ivl=32ms 0x1078: tty (diag) + adb + MBIM + audio + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=09 Cnt=01 Dev#= 21 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1078 Rev=05.04 S: Manufacturer=Telit Wireless Solutions S: Product=FN990 S: SerialNumber=67e04c35 C: #Ifs=11 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#=10 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8c(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=83(I) Atr=03(Int.) 
MxPS= 64 Ivl=32ms I: If#= 3 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 0 Cls=01(audio) Sub=01 Prot=20 Driver=snd-usb-audio I: If#= 5 Alt= 0 #EPs= 0 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio I: If#= 6 Alt= 1 #EPs= 1 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio E: Ad=84(I) Atr=0d(Isoc) MxPS= 68 Ivl=1ms I: If#= 7 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 8 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 9 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms 0x1079: RNDIS + tty (diag) + adb + audio + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=09 Cnt=01 Dev#= 23 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1079 Rev=05.04 S: Manufacturer=Telit Wireless Solutions S: Product=FN990 S: SerialNumber=67e04c35 C: #Ifs=11 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=ef(misc ) Sub=04 Prot=01 Driver=rndis_host E: Ad=81(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#=10 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8c(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 0 Cls=01(audio) Sub=01 Prot=20 Driver=snd-usb-audio I: If#= 5 Alt= 0 #EPs= 0 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio I: If#= 6 Alt= 1 #EPs= 1 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio E: Ad=84(I) Atr=0d(Isoc) MxPS= 68 Ivl=1ms I: If#= 7 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 8 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 9 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) 
MxPS= 10 Ivl=32ms Cc: stable@vger.kernel.org Signed-off-by: Fabio Porcedda Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index e5cd3309342364..1f4da81201112d 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1369,6 +1369,12 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1075, 0xff), /* Telit FN990A (PCIe) */ .driver_info = RSVD(0) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1077, 0xff), /* Telit FN990A (rmnet + audio) */ + .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1078, 0xff), /* Telit FN990A (MBIM + audio) */ + .driver_info = NCTRL(0) | RSVD(1) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1079, 0xff), /* Telit FN990A (RNDIS + audio) */ + .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1080, 0xff), /* Telit FE990A (rmnet) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1081, 0xff), /* Telit FE990A (MBIM) */ From 57834ce5a6a47df282c8419019ba5495eac58fb9 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 21 Aug 2025 19:00:03 +0200 Subject: [PATCH 0522/3206] s390/mm: Prevent possible preempt_count overflow The s390 implementation of ptep_modify_prot_start() currently does preempt_disable(), and the preempt_enable() is done later in ptep_modify_prot_commit(). This logic is not really required, because the PTE lock must be held over the complete prot_start/commit transaction, as described in the comment of the generic implementation of ptep_modify_prot_start(). That comment also mentions that this interface should be batchable, and modify_prot_start_ptes() might start a transaction over a batch of PTEs, implemented as a simple loop over ptep_modify_prot_start(). In this case, the preempt_disable() in ptep_modify_prot_start() would be called multiple times, before the corresponding preempt_enable() calls happen, and this can lead to a preempt_count overflow. To fix this, simply remove the preempt_disable/enable() calls in ptep_modify_prot_start/commit(), and rely on the PTE lock being held. Commit cac1db8c3aad ("mm: optimize mprotect() by PTE batching") made use of this PTE batching for the first time, and triggers warnings like this: DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10) BUG: sleeping function called from invalid context at mm/mprotect.c:576 Hence, add a Fixes tag on that commit. Not because it is broken, but to make sure that it won't get backported w/o also this fix for s390. 
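To make the overflow scenario concrete, a sketch of the batched caller described above (simplified pseudologic, not the actual mm code):

	/* modify_prot_start_ptes() starts the transaction for a whole batch: */
	for (i = 0; i < nr; i++)
		old_pte[i] = ptep_modify_prot_start(vma, addr + i * PAGE_SIZE,
						    ptep + i);
	/* old s390 version: one unmatched preempt_disable() per iteration */

	/* ... and only afterwards is each PTE committed: */
	for (i = 0; i < nr; i++)
		ptep_modify_prot_commit(vma, addr + i * PAGE_SIZE, ptep + i,
					old_pte[i], new_pte[i]);
	/* old s390 version: one preempt_enable() per iteration */

With a large enough batch, preempt_count accumulates one level per PTE before any matching preempt_enable() runs, which is exactly what the DEBUG_LOCKS_WARN_ON() above reports.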
Fixes: cac1db8c3aad ("mm: optimize mprotect() by PTE batching") Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Alexander Gordeev --- arch/s390/mm/pgtable.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 60688be4e87666..50eb57c976bc30 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -335,7 +335,6 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, int nodat; struct mm_struct *mm = vma->vm_mm; - preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); old = ptep_flush_lazy(mm, addr, ptep, nodat); @@ -360,7 +359,6 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, } else { set_pte(ptep, pte); } - preempt_enable(); } static inline void pmdp_idte_local(struct mm_struct *mm, From 66edbb1e32eede16b261a90014451d67119fc875 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 20 Aug 2025 12:49:45 -0400 Subject: [PATCH 0523/3206] dt-bindings: gpio: Move fsl,mxs-pinctrl.txt into gpio-mxs.yaml Move the mxs-pinctrl part into gpio-mxs.yaml and add pinctrl examples to fix the following CHECK_DTB warning: arch/arm/boot/dts/nxp/mxs/imx28-xea.dtb: pinctrl@80018000 (fsl,imx28-pinctrl): 'auart0-2pins@0', 'auart0@0', ... 'usb1@1' do not match any of the regexes: 'gpio@[0-9]+$', 'pinctrl-[0-9]+' Signed-off-by: Frank Li Reviewed-by: Linus Walleij Reviewed-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250820164946.3782702-1-Frank.Li@nxp.com Signed-off-by: Bartosz Golaszewski --- .../devicetree/bindings/gpio/gpio-mxs.yaml | 80 ++++++++++- .../bindings/pinctrl/fsl,mxs-pinctrl.txt | 127 ------------------ 2 files changed, 75 insertions(+), 132 deletions(-) delete mode 100644 Documentation/devicetree/bindings/pinctrl/fsl,mxs-pinctrl.txt diff --git a/Documentation/devicetree/bindings/gpio/gpio-mxs.yaml b/Documentation/devicetree/bindings/gpio/gpio-mxs.yaml index b58e08c8ecd8a1..aaf97124803f42 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-mxs.yaml +++ b/Documentation/devicetree/bindings/gpio/gpio-mxs.yaml @@ -18,9 +18,13 @@ description: | properties: compatible: - enum: - - fsl,imx23-pinctrl - - fsl,imx28-pinctrl + items: + - enum: + - fsl,imx23-pinctrl + - fsl,imx28-pinctrl + # Over 10 years old devices; the driver uses simple-bus to probe child + # gpio devices. Keep this as-is for compatibility with existing dts files. + - const: simple-bus '#address-cells': const: 1 @@ -31,7 +35,65 @@ properties: maxItems: 1 patternProperties: - "gpio@[0-9]+$": + "^(?!gpio@)[^@]+@[0-9]+$": + type: object + properties: + fsl,pinmux-ids: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + An integer array. Each integer in the array specify a pin + with given mux function, with bank, pin and mux packed as below. + + [15..12] : bank number + [11..4] : pin number + [3..0] : mux selection + + This integer with mux selection packed is used as an entity by both group + and config nodes to identify a pin. The mux selection in the integer takes + effects only on group node, and will get ignored by driver with config node, + since config node is only meant to set up pin configurations. + + Valid values for these integers are listed below. + + reg: + items: + - description: | + pin group index. NOTE: strictly speaking, using the reg property + here is wrong, but these are over 10 years old devices, so keep + it as-is.
+ + fsl,drive-strength: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + description: | + 0: MXS_DRIVE_4mA + 1: MXS_DRIVE_8mA + 2: MXS_DRIVE_12mA + 3: MXS_DRIVE_16mA + + fsl,voltage: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1] + description: | + 0: MXS_VOLTAGE_LOW - 1.8 V + 1: MXS_VOLTAGE_HIGH - 3.3 V + + fsl,pull-up: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1] + description: | + 0: MXS_PULL_DISABLE - Disable the internal pull-up + 1: MXS_PULL_ENABLE - Enable the internal pull-up + + Note that when enabling the pull-up, the internal pad keeper gets disabled. + Also, some pins doesn't have a pull up, in that case, setting the fsl,pull-up + will only disable the internal pad keeper. + + required: + - fsl,pinmux-ids + + additionalProperties: false + + "^gpio@[0-9]+$": type: object properties: compatible: @@ -80,7 +142,7 @@ examples: pinctrl@80018000 { #address-cells = <1>; #size-cells = <0>; - compatible = "fsl,imx28-pinctrl"; + compatible = "fsl,imx28-pinctrl", "simple-bus"; reg = <0x80018000 0x2000>; gpio@0 { @@ -132,4 +194,12 @@ examples: interrupt-controller; #interrupt-cells = <2>; }; + + lcdif-apx4@5 { + reg = <5>; + fsl,pinmux-ids = <0x1181 0x1191>; + fsl,drive-strength = <0>; + fsl,voltage = <0>; + fsl,pull-up = <0>; + }; }; diff --git a/Documentation/devicetree/bindings/pinctrl/fsl,mxs-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/fsl,mxs-pinctrl.txt deleted file mode 100644 index 1e70a8aff2600e..00000000000000 --- a/Documentation/devicetree/bindings/pinctrl/fsl,mxs-pinctrl.txt +++ /dev/null @@ -1,127 +0,0 @@ -* Freescale MXS Pin Controller - -The pins controlled by mxs pin controller are organized in banks, each bank -has 32 pins. Each pin has 4 multiplexing functions, and generally, the 4th -function is GPIO. The configuration on the pins includes drive strength, -voltage and pull-up. - -Required properties: -- compatible: "fsl,imx23-pinctrl" or "fsl,imx28-pinctrl" -- reg: Should contain the register physical address and length for the - pin controller. - -Please refer to pinctrl-bindings.txt in this directory for details of the -common pinctrl bindings used by client devices. - -The node of mxs pin controller acts as a container for an arbitrary number of -subnodes. Each of these subnodes represents some desired configuration for -a group of pins, and only affects those parameters that are explicitly listed. -In other words, a subnode that describes a drive strength parameter implies no -information about pull-up. For this reason, even seemingly boolean values are -actually tristates in this binding: unspecified, off, or on. Unspecified is -represented as an absent property, and off/on are represented as integer -values 0 and 1. - -Those subnodes under mxs pin controller node will fall into two categories. -One is to set up a group of pins for a function, both mux selection and pin -configurations, and it's called group node in the binding document. The other -one is to adjust the pin configuration for some particular pins that need a -different configuration than what is defined in group node. The binding -document calls this type of node config node. - -On mxs, there is no hardware pin group. The pin group in this binding only -means a group of pins put together for particular peripheral to work in -particular function, like SSP0 functioning as mmc0-8bit. That said, the -group node should include all the pins needed for one function rather than -having these pins defined in several group nodes. 
It also means each of -"pinctrl-*" phandle in client device node should only have one group node -pointed in there, while the phandle can have multiple config node referenced -there to adjust configurations for some pins in the group. - -Required subnode-properties: -- fsl,pinmux-ids: An integer array. Each integer in the array specify a pin - with given mux function, with bank, pin and mux packed as below. - - [15..12] : bank number - [11..4] : pin number - [3..0] : mux selection - - This integer with mux selection packed is used as an entity by both group - and config nodes to identify a pin. The mux selection in the integer takes - effects only on group node, and will get ignored by driver with config node, - since config node is only meant to set up pin configurations. - - Valid values for these integers are listed below. - -- reg: Should be the index of the group nodes for same function. This property - is required only for group nodes, and should not be present in any config - nodes. - -Optional subnode-properties: -- fsl,drive-strength: Integer. - 0: MXS_DRIVE_4mA - 1: MXS_DRIVE_8mA - 2: MXS_DRIVE_12mA - 3: MXS_DRIVE_16mA -- fsl,voltage: Integer. - 0: MXS_VOLTAGE_LOW - 1.8 V - 1: MXS_VOLTAGE_HIGH - 3.3 V -- fsl,pull-up: Integer. - 0: MXS_PULL_DISABLE - Disable the internal pull-up - 1: MXS_PULL_ENABLE - Enable the internal pull-up - -Note that when enabling the pull-up, the internal pad keeper gets disabled. -Also, some pins doesn't have a pull up, in that case, setting the fsl,pull-up -will only disable the internal pad keeper. - -Examples: - -pinctrl@80018000 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,imx28-pinctrl"; - reg = <0x80018000 2000>; - - mmc0_8bit_pins_a: mmc0-8bit@0 { - reg = <0>; - fsl,pinmux-ids = < - MX28_PAD_SSP0_DATA0__SSP0_D0 - MX28_PAD_SSP0_DATA1__SSP0_D1 - MX28_PAD_SSP0_DATA2__SSP0_D2 - MX28_PAD_SSP0_DATA3__SSP0_D3 - MX28_PAD_SSP0_DATA4__SSP0_D4 - MX28_PAD_SSP0_DATA5__SSP0_D5 - MX28_PAD_SSP0_DATA6__SSP0_D6 - MX28_PAD_SSP0_DATA7__SSP0_D7 - MX28_PAD_SSP0_CMD__SSP0_CMD - MX28_PAD_SSP0_DETECT__SSP0_CARD_DETECT - MX28_PAD_SSP0_SCK__SSP0_SCK - >; - fsl,drive-strength = ; - fsl,voltage = ; - fsl,pull-up = ; - }; - - mmc_cd_cfg: mmc-cd-cfg { - fsl,pinmux-ids = ; - fsl,pull-up = ; - }; - - mmc_sck_cfg: mmc-sck-cfg { - fsl,pinmux-ids = ; - fsl,drive-strength = ; - fsl,pull-up = ; - }; -}; - -In this example, group node mmc0-8bit defines a group of pins for mxs SSP0 -to function as a 8-bit mmc device, with 8mA, 3.3V and pull-up configurations -applied on all these pins. And config nodes mmc-cd-cfg and mmc-sck-cfg are -adjusting the configuration for pins card-detection and clock from what group -node mmc0-8bit defines. Only the configuration properties to be adjusted need -to be listed in the config nodes. - -Valid values for i.MX28/i.MX23 pinmux-id are defined in -arch/arm/boot/dts/imx28-pinfunc.h and arch/arm/boot/dts/imx23-pinfunc.h. -The definitions for the padconfig properties can be found in -arch/arm/boot/dts/mxs-pinfunc.h. 
From a5a261bea9bf8444300d1067b4a73bedee5b5227 Mon Sep 17 00:00:00 2001 From: Fabio Porcedda Date: Fri, 22 Aug 2025 11:08:39 +0200 Subject: [PATCH 0524/3206] USB: serial: option: add Telit Cinterion LE910C4-WWX new compositions Add the following Telit Cinterion LE910C4-WWX new compositions: 0x1034: tty (AT) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1034 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x1036: tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 10 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1036 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x1037: tty (diag) + tty (Telit custom) + tty (AT) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 15 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1037 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x1038: tty (Telit custom) + tty (AT) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 9 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1038 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x103b: tty (diag) + tty (Telit custom) + tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 10 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=103b Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x103c: tty (Telit custom) + tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 11 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=103c Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) 
MxPS= 64 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Cc: stable@vger.kernel.org Signed-off-by: Fabio Porcedda Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 1f4da81201112d..fc869b7f803f04 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1322,7 +1322,18 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1033, 0xff), /* Telit LE910C1-EUX (ECM) */ .driver_info = NCTRL(0) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1034, 0xff), /* Telit LE910C4-WWX (rmnet) */ + .driver_info = RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1035, 0xff) }, /* Telit LE910C4-WWX (ECM) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1036, 0xff) }, /* Telit LE910C4-WWX */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1037, 0xff), /* Telit LE910C4-WWX (rmnet) */ + .driver_info = NCTRL(0) | NCTRL(1) | RSVD(4) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1038, 0xff), /* Telit LE910C4-WWX (rmnet) */ + .driver_info = NCTRL(0) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x103b, 0xff), /* Telit LE910C4-WWX */ + .driver_info = NCTRL(0) | NCTRL(1) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x103c, 0xff), /* Telit LE910C4-WWX */ + .driver_info = NCTRL(0) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG0), .driver_info = RSVD(0) | RSVD(1) | NCTRL(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG1), From e9c8da670e749f7dedc53e3af54a87b041918092 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 10 Jul 2025 12:08:30 +0200 Subject: [PATCH 0525/3206] fuse: do not allow mapping a non-regular backing file We do not support passthrough operations other than read/write on regular file, so allowing non-regular backing files makes no sense. Fixes: efad7153bf93 ("fuse: allow O_PATH fd for FUSE_DEV_IOC_BACKING_OPEN") Cc: stable@vger.kernel.org Signed-off-by: Amir Goldstein Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/passthrough.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 607ef735ad4ab3..eb97ac009e75d9 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -237,6 +237,11 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) if (!file) goto out; + /* read/write/splice/mmap passthrough only relevant for regular files */ + res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL; + if (!d_is_reg(file->f_path.dentry)) + goto out_fput; + backing_sb = file_inode(file)->i_sb; res = -ELOOP; if (backing_sb->s_stack_depth >= fc->max_stack_depth) From e5203209b3935041dac541bc5b37efb44220cc0b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 12 Aug 2025 14:07:54 +0200 Subject: [PATCH 0526/3206] fuse: check if copy_file_range() returns larger than requested size Just like write(), copy_file_range() should check if the return value is less or equal to the requested number of bytes. 
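For illustration, the invariant in isolation (the variable names mirror
__fuse_copy_file_range() in the hunk below; the concrete numbers are
hypothetical):

	/*
	 * If userspace asked for len = 4096 bytes and the server replies
	 * with outarg.size = 8192, trusting the reply would advance the
	 * copy further than requested, so reject it as an I/O error.
	 */
	if (!err && outarg.size > len)
		err = -EIO;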
Reported-by: Chunsheng Luo Closes: https://lore.kernel.org/all/20250807062425.694-1-luochunsheng@ustc.edu/ Fixes: 88bc7d5097a1 ("fuse: add support for copy_file_range()") Cc: # v4.20 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5525a4520b0f89..45207a6bb85fb6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3026,6 +3026,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, fc->no_copy_file_range = 1; err = -EOPNOTSUPP; } + if (!err && outarg.size > len) + err = -EIO; + if (err) goto out; From 1e08938c3694f707bb165535df352ac97a8c75c9 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 12 Aug 2025 14:46:34 +0200 Subject: [PATCH 0527/3206] fuse: prevent overflow in copy_file_range return value The FUSE protocol uses struct fuse_write_out to convey the return value of copy_file_range, which is restricted to uint32_t. But the COPY_FILE_RANGE interface supports a 64-bit size copies. Currently the number of bytes copied is silently truncated to 32-bit, which may result in poor performance or even failure to copy in case of truncation to zero. Reported-by: Florian Weimer Closes: https://lore.kernel.org/all/lhuh5ynl8z5.fsf@oldenburg.str.redhat.com/ Fixes: 88bc7d5097a1 ("fuse: add support for copy_file_range()") Cc: # v4.20 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 45207a6bb85fb6..4adcf09d4b01a6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2960,7 +2960,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = len, + .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), .flags = flags }; struct fuse_write_out outarg; From 79569946502258ef53984f3000bffa77e469d8dc Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 15 Aug 2025 11:25:38 -0700 Subject: [PATCH 0528/3206] fuse: reflect cached blocksize if blocksize was changed As pointed out by Miklos[1], in the fuse_update_get_attr() path, the attributes returned to stat may be cached values instead of fresh ones fetched from the server. In the case where the server returned a modified blocksize value, we need to cache it and reflect it back to stat if values are not re-fetched since we now no longer directly change inode->i_blkbits. Link: https://lore.kernel.org/linux-fsdevel/CAJfpeguCOxeVX88_zPd1hqziB_C+tmfuDhZP5qO2nKmnb-dTUA@mail.gmail.com/ [1] Fixes: 542ede096e48 ("fuse: keep inode->i_blkbits constant") Signed-off-by: Joanne Koong Reviewed-by: Darrick J. 
Wong Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 1 + fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/inode.c | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2d817d7cab2649..ebee7e0b1cd37a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1377,6 +1377,7 @@ static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode, generic_fillattr(idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; + stat->blksize = 1 << fi->cached_i_blkbits; if (test_bit(FUSE_I_BTIME, &fi->state)) { stat->btime = fi->i_btime; stat->result_mask |= STATX_BTIME; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ec248d13c8bfd9..1647eb7ca6fa6b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -210,6 +210,12 @@ struct fuse_inode { /** Reference to backing file in passthrough mode */ struct fuse_backing *fb; #endif + + /* + * The underlying inode->i_blkbits value will not be modified, + * so preserve the blocksize specified by the server. + */ + u8 cached_i_blkbits; }; /** FUSE inode state bits */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 67c2318bfc4294..3bfd83469d9f9d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -289,6 +289,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, } } + if (attr->blksize) + fi->cached_i_blkbits = ilog2(attr->blksize); + else + fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits; + /* * Don't set the sticky bit in i_mode, unless we want the VFS * to check permissions. This prevents failures due to the From bd24d2108e9c8459d2c9f3d6d910b0053887df57 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 15 Aug 2025 11:25:39 -0700 Subject: [PATCH 0529/3206] fuse: fix fuseblk i_blkbits for iomap partial writes On regular fuse filesystems, i_blkbits is set to PAGE_SHIFT which means any iomap partial writes will mark the entire folio as uptodate. However fuseblk filesystems work differently and allow the blocksize to be less than the page size. As such, this may lead to data corruption if fuseblk sets its blocksize to less than the page size, uses the writeback cache, and does a partial write, then a read and the read happens before the write has undergone writeback, since the folio will not be marked uptodate from the partial write so the read will read in the entire folio from disk, which will overwrite the partial write. The long-term solution for this, which will also be needed for fuse to enable large folios with the writeback cache on, is to have fuse also use iomap for folio reads, but until that is done, the cleanest workaround is to use the page size for fuseblk's internal kernel inode blksize/blkbits values while maintaining current behavior for stat(). This was verified using ntfs-3g: $ sudo mkfs.ntfs -f -c 512 /dev/vdd1 $ sudo ntfs-3g /dev/vdd1 ~/fuseblk $ stat ~/fuseblk/hi.txt IO Block: 512 Fixes: a4c9ab1d4975 ("fuse: use iomap for buffered writes") Signed-off-by: Joanne Koong Reviewed-by: Darrick J. 
Wong
Signed-off-by: Miklos Szeredi
---
 fs/fuse/dir.c    |  2 +-
 fs/fuse/fuse_i.h |  8 ++++++++
 fs/fuse/inode.c  | 13 ++++++++++++-
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index ebee7e0b1cd37a..5c569c3cb53f3d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1199,7 +1199,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
 	if (attr->blksize != 0)
 		blkbits = ilog2(attr->blksize);
 	else
-		blkbits = inode->i_sb->s_blocksize_bits;
+		blkbits = fc->blkbits;
 	stat->blksize = 1 << blkbits;
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1647eb7ca6fa6b..cc428d04be3e14 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -975,6 +975,14 @@ struct fuse_conn {
 		/* Request timeout (in jiffies). 0 = no timeout */
 		unsigned int req_timeout;
 	} timeout;
+
+	/*
+	 * This is a workaround until fuse uses iomap for reads.
+	 * For fuseblk servers, this represents the blocksize passed in at
+	 * mount time and for regular fuse servers, this is equivalent to
+	 * inode->i_blkbits.
+	 */
+	u8 blkbits;
 };

 /*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3bfd83469d9f9d..7ddfd2b3cc9c4f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -292,7 +292,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	if (attr->blksize)
 		fi->cached_i_blkbits = ilog2(attr->blksize);
 	else
-		fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits;
+		fi->cached_i_blkbits = fc->blkbits;

 	/*
 	 * Don't set the sticky bit in i_mode, unless we want the VFS
@@ -1810,10 +1810,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 		err = -EINVAL;
 		if (!sb_set_blocksize(sb, ctx->blksize))
 			goto err;
+		/*
+		 * This is a workaround until fuse hooks into iomap for reads.
+		 * Use PAGE_SIZE for the blocksize else if the writeback cache
+		 * is enabled, buffered writes go through iomap and a read may
+		 * overwrite partially written data if blocksize < PAGE_SIZE
+		 */
+		fc->blkbits = sb->s_blocksize_bits;
+		if (ctx->blksize != PAGE_SIZE &&
+		    !sb_set_blocksize(sb, PAGE_SIZE))
+			goto err;
 #endif
 	} else {
 		sb->s_blocksize = PAGE_SIZE;
 		sb->s_blocksize_bits = PAGE_SHIFT;
+		fc->blkbits = sb->s_blocksize_bits;
 	}

 	sb->s_subtype = ctx->subtype;

From d0f27ff27c048a3bdc49877255a7e7f8b49f5603 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang
Date: Tue, 26 Aug 2025 14:50:57 +0800
Subject: [PATCH 0530/3206] selftests/bpf: Remove entries from config.{arch} already present in config

`config.{arch}` had entries already present in `config`. When generating the
config used by vmtest, the `config` file is concatenated with the
`config.{arch}` one, duplicating those entries, so remove the duplicates.

Use the following commands to get the differences:

  $ comm -1 -2 <(sort tools/testing/selftests/bpf/config.x86_64) <(sort tools/testing/selftests/bpf/config)
  $ comm -1 -2 <(sort tools/testing/selftests/bpf/config.aarch64) <(sort tools/testing/selftests/bpf/config)
  $ comm -1 -2 <(sort tools/testing/selftests/bpf/config.riscv64) <(sort tools/testing/selftests/bpf/config)
  $ comm -1 -2 <(sort tools/testing/selftests/bpf/config.ppc64el) <(sort tools/testing/selftests/bpf/config)
  $ comm -1 -2 <(sort tools/testing/selftests/bpf/config.s390x) <(sort tools/testing/selftests/bpf/config)

This is similar to commit 7a42af4b94f1 ("selftests/bpf: Remove entries from config.s390x already present in config").
Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250826065057.11415-1-yangtiezhu@loongson.cn --- tools/testing/selftests/bpf/config.aarch64 | 12 ------------ tools/testing/selftests/bpf/config.ppc64el | 1 - tools/testing/selftests/bpf/config.riscv64 | 1 - tools/testing/selftests/bpf/config.s390x | 11 ----------- tools/testing/selftests/bpf/config.x86_64 | 5 ----- 5 files changed, 30 deletions(-) diff --git a/tools/testing/selftests/bpf/config.aarch64 b/tools/testing/selftests/bpf/config.aarch64 index e1495a4bbc99ca..7efad36ceb26f2 100644 --- a/tools/testing/selftests/bpf/config.aarch64 +++ b/tools/testing/selftests/bpf/config.aarch64 @@ -31,10 +31,7 @@ CONFIG_COMPAT=y CONFIG_CPUSETS=y CONFIG_CRASH_DUMP=y CONFIG_CRYPTO_USER_API_RNG=y -CONFIG_CRYPTO_USER_API_SKCIPHER=y CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_INFO_BTF=y -CONFIG_DEBUG_INFO_DWARF4=y CONFIG_DEBUG_INFO_REDUCED=n CONFIG_DEBUG_LIST=y CONFIG_DEBUG_LOCKDEP=y @@ -46,7 +43,6 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEVTMPFS=y CONFIG_DRM=y -CONFIG_DUMMY=y CONFIG_EXPERT=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -70,13 +66,11 @@ CONFIG_HZ_100=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_IKHEADERS=y CONFIG_INET6_ESP=y -CONFIG_INET_ESP=y CONFIG_INET=y CONFIG_INPUT_EVDEV=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTICAST=y CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_IPTABLES=y CONFIG_IPV6_SEG6_LWTUNNEL=y CONFIG_IPVLAN=y CONFIG_JUMP_LABEL=y @@ -97,22 +91,18 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_NAMESPACES=y CONFIG_NET_ACT_BPF=y -CONFIG_NET_ACT_GACT=y CONFIG_NETDEVICES=y CONFIG_NETFILTER_XT_MATCH_BPF=y CONFIG_NETFILTER_XT_TARGET_MARK=y CONFIG_NET_KEY=y -CONFIG_NET_SCH_FQ=y CONFIG_NET_VRF=y CONFIG_NET=y -CONFIG_NF_TABLES=y CONFIG_NLMON=y CONFIG_NO_HZ_IDLE=y CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_OVERLAY_FS=y CONFIG_PACKET_DIAG=y -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI_HOST_GENERIC=y @@ -149,7 +139,6 @@ CONFIG_TASK_XACCT=y CONFIG_TCG_TIS=y CONFIG_TCG_TPM=y CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_DCTCP=y CONFIG_TLS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_TMPFS=y @@ -161,6 +150,5 @@ CONFIG_UPROBES=y CONFIG_USER_NS=y CONFIG_VETH=y CONFIG_VLAN_8021Q=y -CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y CONFIG_XFRM_USER=y diff --git a/tools/testing/selftests/bpf/config.ppc64el b/tools/testing/selftests/bpf/config.ppc64el index 9acf389dc4ce67..b53afb5e0b71e2 100644 --- a/tools/testing/selftests/bpf/config.ppc64el +++ b/tools/testing/selftests/bpf/config.ppc64el @@ -54,7 +54,6 @@ CONFIG_NET=y CONFIG_NO_HZ_IDLE=y CONFIG_NONPORTABLE=y CONFIG_NR_CPUS=256 -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI_HOST_GENERIC=y diff --git a/tools/testing/selftests/bpf/config.riscv64 b/tools/testing/selftests/bpf/config.riscv64 index bb7043a80e1ac0..7bee24a79a7103 100644 --- a/tools/testing/selftests/bpf/config.riscv64 +++ b/tools/testing/selftests/bpf/config.riscv64 @@ -48,7 +48,6 @@ CONFIG_NET_VRF=y CONFIG_NONPORTABLE=y CONFIG_NO_HZ_IDLE=y CONFIG_NR_CPUS=256 -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI=y diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x index 26c3bc2ce11d5e..db61878148e425 100644 --- a/tools/testing/selftests/bpf/config.s390x +++ b/tools/testing/selftests/bpf/config.s390x @@ -22,10 +22,7 @@ CONFIG_CHECKPOINT_RESTORE=y CONFIG_CPUSETS=y CONFIG_CRASH_DUMP=y CONFIG_CRYPTO_USER_API_RNG=y -CONFIG_CRYPTO_USER_API_SKCIPHER=y 
CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_INFO_BTF=y -CONFIG_DEBUG_INFO_DWARF4=y CONFIG_DEBUG_LIST=y CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_NOTIFIERS=y @@ -56,11 +53,9 @@ CONFIG_IDLE_PAGE_TRACKING=y CONFIG_IKHEADERS=y CONFIG_INET6_ESP=y CONFIG_INET=y -CONFIG_INET_ESP=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTICAST=y CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_IPTABLES=y CONFIG_IPV6_SEG6_LWTUNNEL=y CONFIG_IPVLAN=y CONFIG_JUMP_LABEL=y @@ -83,18 +78,14 @@ CONFIG_MEMORY_HOTREMOVE=y CONFIG_NAMESPACES=y CONFIG_NET=y CONFIG_NET_ACT_BPF=y -CONFIG_NET_ACT_GACT=y CONFIG_NET_KEY=y -CONFIG_NET_SCH_FQ=y CONFIG_NET_VRF=y CONFIG_NETDEVICES=y CONFIG_NETFILTER_XT_MATCH_BPF=y CONFIG_NETFILTER_XT_TARGET_MARK=y -CONFIG_NF_TABLES=y CONFIG_NO_HZ_IDLE=y CONFIG_NR_CPUS=256 CONFIG_NUMA=y -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI=y @@ -119,7 +110,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_TASK_XACCT=y CONFIG_TASKSTATS=y CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_DCTCP=y CONFIG_TLS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y @@ -131,6 +121,5 @@ CONFIG_UPROBES=y CONFIG_USER_NS=y CONFIG_VETH=y CONFIG_VLAN_8021Q=y -CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y CONFIG_XFRM_USER=y diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64 index 5e713ef7caa307..42ad817b00aea5 100644 --- a/tools/testing/selftests/bpf/config.x86_64 +++ b/tools/testing/selftests/bpf/config.x86_64 @@ -44,7 +44,6 @@ CONFIG_CRYPTO_SEQIV=y CONFIG_CRYPTO_XXHASH=y CONFIG_DCB=y CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_DEFAULT_FQ_CODEL=y @@ -104,12 +103,10 @@ CONFIG_HZ_1000=y CONFIG_INET=y CONFIG_INPUT_EVDEV=y CONFIG_INTEL_POWERCLAMP=y -CONFIG_IP6_NF_IPTABLES=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MROUTE=y CONFIG_IP_MULTICAST=y CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_IPTABLES=y CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y CONFIG_IP_ROUTE_MULTIPATH=y @@ -162,7 +159,6 @@ CONFIG_NUMA=y CONFIG_NUMA_BALANCING=y CONFIG_NVMEM=y CONFIG_OSF_PARTITION=y -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI=y @@ -220,7 +216,6 @@ CONFIG_VALIDATE_FS_PARSER=y CONFIG_VETH=y CONFIG_VIRT_DRIVERS=y CONFIG_VLAN_8021Q=y -CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y CONFIG_X86_ACPI_CPUFREQ=y CONFIG_X86_CPUID=y From 8003235b10e54c1be57374c6224751b39750f16c Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 17 Aug 2025 14:30:20 -0700 Subject: [PATCH 0531/3206] Documentation: gpio: add documentation about using software nodes Introduce documentation regarding use of software nodes to describe GPIOs on legacy boards that have not been converted to device tree. 
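To complement the producer-side examples added below, a minimal
consumer-side sketch (the foo_probe() driver here is hypothetical): once a
platform device is associated with a software node carrying a
PROPERTY_ENTRY_GPIO("gpios", ...) entry, the regular gpiolib consumer API
resolves it just as it would a device tree "gpios" property:

	#include <linux/err.h>
	#include <linux/gpio/consumer.h>
	#include <linux/platform_device.h>

	static int foo_probe(struct platform_device *pdev)
	{
		struct gpio_desc *gpiod;

		/* A NULL con_id makes gpiolib look up the bare "gpios" property. */
		gpiod = devm_gpiod_get(&pdev->dev, NULL, GPIOD_OUT_LOW);
		if (IS_ERR(gpiod))
			return PTR_ERR(gpiod);

		gpiod_set_value(gpiod, 1);	/* drive the line active */
		return 0;
	}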
Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Dmitry Torokhov Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/tnaaz2qlk5jpbonfle7uy7pb54qx6ixwuczfbkwtxxwpj7hwas@y7a2rwko3k6c Signed-off-by: Bartosz Golaszewski --- Documentation/driver-api/gpio/board.rst | 65 ++++ Documentation/driver-api/gpio/index.rst | 1 + .../driver-api/gpio/legacy-boards.rst | 298 ++++++++++++++++++ 3 files changed, 364 insertions(+) create mode 100644 Documentation/driver-api/gpio/legacy-boards.rst diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst index 4fd1cbd8296e19..069b54d8591bde 100644 --- a/Documentation/driver-api/gpio/board.rst +++ b/Documentation/driver-api/gpio/board.rst @@ -94,6 +94,71 @@ with the help of _DSD (Device Specific Data), introduced in ACPI 5.1:: For more information about the ACPI GPIO bindings see Documentation/firmware-guide/acpi/gpio-properties.rst. +Software Nodes +-------------- + +Software nodes allow board-specific code to construct an in-memory, +device-tree-like structure using struct software_node and struct +property_entry. This structure can then be associated with a platform device, +allowing drivers to use the standard device properties API to query +configuration, just as they would on an ACPI or device tree system. + +Software-node-backed GPIOs are described using the ``PROPERTY_ENTRY_GPIO()`` +macro, which ties a software node representing the GPIO controller with +consumer device. It allows consumers to use regular gpiolib APIs, such as +gpiod_get(), gpiod_get_optional(). + +The software node representing a GPIO controller need not be attached to the +GPIO controller device. The only requirement is that the node must be +registered and its name must match the GPIO controller's label. + +For example, here is how to describe a single GPIO-connected LED. This is an +alternative to using platform_data on legacy systems. + +.. code-block:: c + + #include + #include + #include + + /* + * 1. Define a node for the GPIO controller. Its .name must match the + * controller's label. + */ + static const struct software_node gpio_controller_node = { + .name = "gpio-foo", + }; + + /* 2. Define the properties for the LED device. */ + static const struct property_entry led_device_props[] = { + PROPERTY_ENTRY_STRING("label", "myboard:green:status"), + PROPERTY_ENTRY_STRING("linux,default-trigger", "heartbeat"), + PROPERTY_ENTRY_GPIO("gpios", &gpio_controller_node, 42, GPIO_ACTIVE_HIGH), + { } + }; + + /* 3. Define the software node for the LED device. */ + static const struct software_node led_device_swnode = { + .name = "status-led", + .properties = led_device_props, + }; + + /* + * 4. Register the software nodes and the platform device. + */ + const struct software_node *swnodes[] = { + &gpio_controller_node, + &led_device_swnode, + NULL + }; + software_node_register_node_group(swnodes); + + // Then register a platform_device for "leds-gpio" and associate + // it with &led_device_swnode via .fwnode. + +For a complete guide on converting board files to use software nodes, see +Documentation/driver-api/gpio/legacy-boards.rst. + Platform Data ------------- Finally, GPIOs can be bound to devices and functions using platform data. 
Board diff --git a/Documentation/driver-api/gpio/index.rst b/Documentation/driver-api/gpio/index.rst index 43f6a3afe10b55..87929840e85a29 100644 --- a/Documentation/driver-api/gpio/index.rst +++ b/Documentation/driver-api/gpio/index.rst @@ -12,6 +12,7 @@ Contents: driver consumer board + legacy-boards drivers-on-gpio bt8xxgpio diff --git a/Documentation/driver-api/gpio/legacy-boards.rst b/Documentation/driver-api/gpio/legacy-boards.rst new file mode 100644 index 00000000000000..46e3a26dba772e --- /dev/null +++ b/Documentation/driver-api/gpio/legacy-boards.rst @@ -0,0 +1,298 @@ +Supporting Legacy Boards +======================== + +Many drivers in the kernel, such as ``leds-gpio`` and ``gpio-keys``, are +migrating away from using board-specific ``platform_data`` to a unified device +properties interface. This interface allows drivers to be simpler and more +generic, as they can query properties in a standardized way. + +On modern systems, these properties are provided via device tree. However, some +older platforms have not been converted to device tree and instead rely on +board files to describe their hardware configuration. To bridge this gap and +allow these legacy boards to work with modern, generic drivers, the kernel +provides a mechanism called **software nodes**. + +This document provides a guide on how to convert a legacy board file from using +``platform_data`` and ``gpiod_lookup_table`` to the modern software node +approach for describing GPIO-connected devices. + +The Core Idea: Software Nodes +----------------------------- + +Software nodes allow board-specific code to construct an in-memory, +device-tree-like structure using struct software_node and struct +property_entry. This structure can then be associated with a platform device, +allowing drivers to use the standard device properties API (e.g., +device_property_read_u32(), device_property_read_string()) to query +configuration, just as they would on an ACPI or device tree system. + +The gpiolib code has support for handling software nodes, so that if GPIO is +described properly, as detailed in the section below, then regular gpiolib APIs, +such as gpiod_get(), gpiod_get_optional(), and others will work. + +Requirements for GPIO Properties +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using software nodes to describe GPIO connections, the following +requirements must be met for the GPIO core to correctly resolve the reference: + +1. **The GPIO controller's software node "name" must match the controller's + "label".** The gpiolib core uses this name to find the corresponding + struct gpio_chip at runtime. + This software node has to be registered, but need not be attached to the + device representing the GPIO controller that is providing the GPIO in + question. It may be left as a "free floating" node. + +2. **The GPIO property must be a reference.** The ``PROPERTY_ENTRY_GPIO()`` + macro handles this as it is an alias for ``PROPERTY_ENTRY_REF()``. + +3. **The reference must have exactly two arguments:** + + - The first argument is the GPIO offset within the controller. + - The second argument is the flags for the GPIO line (e.g., + GPIO_ACTIVE_HIGH, GPIO_ACTIVE_LOW). + +The ``PROPERTY_ENTRY_GPIO()`` macro is the preferred way of defining GPIO +properties in software nodes. + +Conversion Example +------------------ + +Let's walk through an example of converting a board file that defines a GPIO- +connected LED and a button. 
+ +Before: Using Platform Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A typical legacy board file might look like this: + +.. code-block:: c + + #include + #include + #include + #include + + #define MYBOARD_GPIO_CONTROLLER "gpio-foo" + + /* LED setup */ + static const struct gpio_led myboard_leds[] = { + { + .name = "myboard:green:status", + .default_trigger = "heartbeat", + }, + }; + + static const struct gpio_led_platform_data myboard_leds_pdata = { + .num_leds = ARRAY_SIZE(myboard_leds), + .leds = myboard_leds, + }; + + static struct gpiod_lookup_table myboard_leds_gpios = { + .dev_id = "leds-gpio", + .table = { + GPIO_LOOKUP_IDX(MYBOARD_GPIO_CONTROLLER, 42, NULL, 0, GPIO_ACTIVE_HIGH), + { }, + }, + }; + + /* Button setup */ + static struct gpio_keys_button myboard_buttons[] = { + { + .code = KEY_WPS_BUTTON, + .desc = "WPS Button", + .active_low = 1, + }, + }; + + static const struct gpio_keys_platform_data myboard_buttons_pdata = { + .buttons = myboard_buttons, + .nbuttons = ARRAY_SIZE(myboard_buttons), + }; + + static struct gpiod_lookup_table myboard_buttons_gpios = { + .dev_id = "gpio-keys", + .table = { + GPIO_LOOKUP_IDX(MYBOARD_GPIO_CONTROLLER, 15, NULL, 0, GPIO_ACTIVE_LOW), + { }, + }, + }; + + /* Device registration */ + static int __init myboard_init(void) + { + gpiod_add_lookup_table(&myboard_leds_gpios); + gpiod_add_lookup_table(&myboard_buttons_gpios); + + platform_device_register_data(NULL, "leds-gpio", -1, + &myboard_leds_pdata, sizeof(myboard_leds_pdata)); + platform_device_register_data(NULL, "gpio-keys", -1, + &myboard_buttons_pdata, sizeof(myboard_buttons_pdata)); + + return 0; + } + +After: Using Software Nodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Here is how the same configuration can be expressed using software nodes. + +Step 1: Define the GPIO Controller Node +*************************************** + +First, define a software node that represents the GPIO controller that the +LEDs and buttons are connected to. The ``name`` of this node must match the +name of the driver for the GPIO controller (e.g., "gpio-foo"). + +.. code-block:: c + + #include + #include + + #define MYBOARD_GPIO_CONTROLLER "gpio-foo" + + static const struct software_node myboard_gpio_controller_node = { + .name = MYBOARD_GPIO_CONTROLLER, + }; + +Step 2: Define Consumer Device Nodes and Properties +*************************************************** + +Next, define the software nodes for the consumer devices (the LEDs and buttons). +This involves creating a parent node for each device type and child nodes for +each individual LED or button. + +.. 
code-block:: c + + /* LED setup */ + static const struct software_node myboard_leds_node = { + .name = "myboard-leds", + }; + + static const struct property_entry myboard_status_led_props[] = { + PROPERTY_ENTRY_STRING("label", "myboard:green:status"), + PROPERTY_ENTRY_STRING("linux,default-trigger", "heartbeat"), + PROPERTY_ENTRY_GPIO("gpios", &myboard_gpio_controller_node, 42, GPIO_ACTIVE_HIGH), + { } + }; + + static const struct software_node myboard_status_led_swnode = { + .name = "status-led", + .parent = &myboard_leds_node, + .properties = myboard_status_led_props, + }; + + /* Button setup */ + static const struct software_node myboard_keys_node = { + .name = "myboard-keys", + }; + + static const struct property_entry myboard_wps_button_props[] = { + PROPERTY_ENTRY_STRING("label", "WPS Button"), + PROPERTY_ENTRY_U32("linux,code", KEY_WPS_BUTTON), + PROPERTY_ENTRY_GPIO("gpios", &myboard_gpio_controller_node, 15, GPIO_ACTIVE_LOW), + { } + }; + + static const struct software_node myboard_wps_button_swnode = { + .name = "wps-button", + .parent = &myboard_keys_node, + .properties = myboard_wps_button_props, + }; + + + +Step 3: Group and Register the Nodes +************************************ + +For maintainability, it is often beneficial to group all software nodes into a +single array and register them with one call. + +.. code-block:: c + + static const struct software_node * const myboard_swnodes[] = { + &myboard_gpio_controller_node, + &myboard_leds_node, + &myboard_status_led_swnode, + &myboard_keys_node, + &myboard_wps_button_swnode, + NULL + }; + + static int __init myboard_init(void) + { + int error; + + error = software_node_register_node_group(myboard_swnodes); + if (error) { + pr_err("Failed to register software nodes: %d\n", error); + return error; + } + + // ... platform device registration follows + } + +.. note:: + When splitting registration of nodes by devices that they represent, it is + essential that the software node representing the GPIO controller itself + is registered first, before any of the nodes that reference it. + +Step 4: Register Platform Devices with Software Nodes +***************************************************** + +Finally, register the platform devices and associate them with their respective +software nodes using the ``fwnode`` field in struct platform_device_info. + +.. 
code-block:: c + + static struct platform_device *leds_pdev; + static struct platform_device *keys_pdev; + + static int __init myboard_init(void) + { + struct platform_device_info pdev_info; + int error; + + error = software_node_register_node_group(myboard_swnodes); + if (error) + return error; + + memset(&pdev_info, 0, sizeof(pdev_info)); + pdev_info.name = "leds-gpio"; + pdev_info.id = PLATFORM_DEVID_NONE; + pdev_info.fwnode = software_node_fwnode(&myboard_leds_node); + leds_pdev = platform_device_register_full(&pdev_info); + if (IS_ERR(leds_pdev)) { + error = PTR_ERR(leds_pdev); + goto err_unregister_nodes; + } + + memset(&pdev_info, 0, sizeof(pdev_info)); + pdev_info.name = "gpio-keys"; + pdev_info.id = PLATFORM_DEVID_NONE; + pdev_info.fwnode = software_node_fwnode(&myboard_keys_node); + keys_pdev = platform_device_register_full(&pdev_info); + if (IS_ERR(keys_pdev)) { + error = PTR_ERR(keys_pdev); + platform_device_unregister(leds_pdev); + goto err_unregister_nodes; + } + + return 0; + + err_unregister_nodes: + software_node_unregister_node_group(myboard_swnodes); + return error; + } + + static void __exit myboard_exit(void) + { + platform_device_unregister(keys_pdev); + platform_device_unregister(leds_pdev); + software_node_unregister_node_group(myboard_swnodes); + } + +With these changes, the generic ``leds-gpio`` and ``gpio-keys`` drivers will +be able to probe successfully and get their configuration from the properties +defined in the software nodes, removing the need for board-specific platform +data. From 604642fc148b5d98fbe5f55e4c2688f9ee0b5868 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 21 Aug 2025 10:32:14 +0200 Subject: [PATCH 0532/3206] dt-bindings: gpio: Minor whitespace cleanup in example The DTS code coding style expects exactly one space around '=' character. 
Signed-off-by: Krzysztof Kozlowski Acked-by: Rob Herring (Arm) Acked-by: Yixun Lan Link: https://lore.kernel.org/r/20250821083213.46642-2-krzysztof.kozlowski@linaro.org Signed-off-by: Bartosz Golaszewski --- Documentation/devicetree/bindings/gpio/maxim,max31910.yaml | 6 +++--- .../devicetree/bindings/gpio/spacemit,k1-gpio.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/gpio/maxim,max31910.yaml b/Documentation/devicetree/bindings/gpio/maxim,max31910.yaml index 82a190a715f940..4d200f9dffd5fe 100644 --- a/Documentation/devicetree/bindings/gpio/maxim,max31910.yaml +++ b/Documentation/devicetree/bindings/gpio/maxim,max31910.yaml @@ -95,9 +95,9 @@ examples: #gpio-cells = <2>; maxim,modesel-gpios = <&gpio2 23>; - maxim,fault-gpios = <&gpio2 24 GPIO_ACTIVE_LOW>; - maxim,db0-gpios = <&gpio2 25>; - maxim,db1-gpios = <&gpio2 26>; + maxim,fault-gpios = <&gpio2 24 GPIO_ACTIVE_LOW>; + maxim,db0-gpios = <&gpio2 25>; + maxim,db1-gpios = <&gpio2 26>; spi-max-frequency = <25000000>; }; diff --git a/Documentation/devicetree/bindings/gpio/spacemit,k1-gpio.yaml b/Documentation/devicetree/bindings/gpio/spacemit,k1-gpio.yaml index ec0232e72c7122..83e0b2d14c9f8c 100644 --- a/Documentation/devicetree/bindings/gpio/spacemit,k1-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/spacemit,k1-gpio.yaml @@ -80,7 +80,7 @@ examples: gpio@d4019000 { compatible = "spacemit,k1-gpio"; reg = <0xd4019000 0x800>; - clocks =<&ccu 9>, <&ccu 61>; + clocks = <&ccu 9>, <&ccu 61>; clock-names = "core", "bus"; gpio-controller; #gpio-cells = <3>; From 640d31ea83c6f67133d47df9a0973f3281c91cf4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 31 Jul 2025 15:35:10 -0700 Subject: [PATCH 0533/3206] lib/crypto: sha256: Use underlying functions instead of crypto_simd_usable() Since sha256_kunit tests the fallback code paths without using crypto_simd_disabled_for_test, make the SHA-256 code just use the underlying may_use_simd() and irq_fpu_usable() functions directly instead of crypto_simd_usable(). This eliminates an unnecessary layer. While doing this, also add likely() annotations, and fix a minor inconsistency where the static keys in the sha256.h files were in a different place than in the corresponding sha1.h and sha512.h files. 
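The resulting dispatch shape, consolidated here for reference (a simplified
sketch of the arm variant from the diff below, with the SHA-256 CE branch
omitted):

	static void sha256_blocks(struct sha256_block_state *state,
				  const u8 *data, size_t nblocks)
	{
		/* Take the SIMD path only if NEON exists and is usable here. */
		if (static_branch_likely(&have_neon) && likely(may_use_simd())) {
			kernel_neon_begin();
			sha256_block_data_order_neon(state, data, nblocks);
			kernel_neon_end();
		} else {
			/* Scalar fallback, e.g. when called from hard IRQ context. */
			sha256_block_data_order(state, data, nblocks);
		}
	}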
Link: https://lore.kernel.org/r/20250731223510.136650-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm/sha256.h | 10 +++++----- lib/crypto/arm64/sha256.h | 10 +++++----- lib/crypto/riscv/sha256.h | 8 ++++---- lib/crypto/x86/sha256.h | 3 +-- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h index da75cbdc51d413..eab713e650f33e 100644 --- a/lib/crypto/arm/sha256.h +++ b/lib/crypto/arm/sha256.h @@ -5,7 +5,10 @@ * Copyright 2025 Google LLC */ #include -#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); asmlinkage void sha256_block_data_order(struct sha256_block_state *state, const u8 *data, size_t nblocks); @@ -14,14 +17,11 @@ asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state, asmlinkage void sha256_ce_transform(struct sha256_block_state *state, const u8 *data, size_t nblocks); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); - static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && crypto_simd_usable()) { + static_branch_likely(&have_neon) && likely(may_use_simd())) { kernel_neon_begin(); if (static_branch_likely(&have_ce)) sha256_ce_transform(state, data, nblocks); diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h index a211966c124a96..d95f1077c32bdf 100644 --- a/lib/crypto/arm64/sha256.h +++ b/lib/crypto/arm64/sha256.h @@ -5,9 +5,12 @@ * Copyright 2025 Google LLC */ #include -#include +#include #include +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + asmlinkage void sha256_block_data_order(struct sha256_block_state *state, const u8 *data, size_t nblocks); asmlinkage void sha256_block_neon(struct sha256_block_state *state, @@ -15,14 +18,11 @@ asmlinkage void sha256_block_neon(struct sha256_block_state *state, asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state, const u8 *data, size_t nblocks); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); - static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && crypto_simd_usable()) { + static_branch_likely(&have_neon) && likely(may_use_simd())) { if (static_branch_likely(&have_ce)) { do { size_t rem; diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h index c0f79c18f11998..f36f68d2e88cc7 100644 --- a/lib/crypto/riscv/sha256.h +++ b/lib/crypto/riscv/sha256.h @@ -9,19 +9,19 @@ * Author: Jerry Shih */ +#include #include -#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state, const u8 *data, size_t nblocks); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); - static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { - if (static_branch_likely(&have_extensions) && crypto_simd_usable()) { + if (static_branch_likely(&have_extensions) && likely(may_use_simd())) { kernel_vector_begin(); sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks); kernel_vector_end(); diff --git a/lib/crypto/x86/sha256.h 
b/lib/crypto/x86/sha256.h index 669bc06538b67e..c852396ef31908 100644 --- a/lib/crypto/x86/sha256.h +++ b/lib/crypto/x86/sha256.h @@ -5,7 +5,6 @@ * Copyright 2025 Google LLC */ #include -#include #include DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); @@ -16,7 +15,7 @@ DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); static void c_fn(struct sha256_block_state *state, const u8 *data, \ size_t nblocks) \ { \ - if (likely(crypto_simd_usable())) { \ + if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ From bce5816672ec27085489f096ec27739a4a233b7b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 31 Jul 2025 15:36:51 -0700 Subject: [PATCH 0534/3206] lib/crypto: sha512: Use underlying functions instead of crypto_simd_usable() Since sha512_kunit tests the fallback code paths without using crypto_simd_disabled_for_test, make the SHA-512 code just use the underlying may_use_simd() and irq_fpu_usable() functions directly instead of crypto_simd_usable(). This eliminates an unnecessary layer. Link: https://lore.kernel.org/r/20250731223651.136939-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm/sha512.h | 5 ++--- lib/crypto/arm64/sha512.h | 5 ++--- lib/crypto/riscv/sha512.h | 4 +--- lib/crypto/x86/sha512.h | 4 +--- 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h index f147b6490d6cd4..cc2447acd56213 100644 --- a/lib/crypto/arm/sha512.h +++ b/lib/crypto/arm/sha512.h @@ -4,9 +4,8 @@ * * Copyright 2025 Google LLC */ - #include -#include +#include static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); @@ -19,7 +18,7 @@ static void sha512_blocks(struct sha512_block_state *state, const u8 *data, size_t nblocks) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && likely(crypto_simd_usable())) { + static_branch_likely(&have_neon) && likely(may_use_simd())) { kernel_neon_begin(); sha512_block_data_order_neon(state, data, nblocks); kernel_neon_end(); diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h index 6abb40b467f2ec..7539ea3fef10d0 100644 --- a/lib/crypto/arm64/sha512.h +++ b/lib/crypto/arm64/sha512.h @@ -4,9 +4,8 @@ * * Copyright 2025 Google LLC */ - #include -#include +#include #include static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns); @@ -21,7 +20,7 @@ static void sha512_blocks(struct sha512_block_state *state, { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && static_branch_likely(&have_sha512_insns) && - likely(crypto_simd_usable())) { + likely(may_use_simd())) { do { size_t rem; diff --git a/lib/crypto/riscv/sha512.h b/lib/crypto/riscv/sha512.h index 9d0abede322f75..59dc0294a9a7e9 100644 --- a/lib/crypto/riscv/sha512.h +++ b/lib/crypto/riscv/sha512.h @@ -11,7 +11,6 @@ #include #include -#include static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); @@ -21,8 +20,7 @@ asmlinkage void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state, static void sha512_blocks(struct sha512_block_state *state, const u8 *data, size_t nblocks) { - if (static_branch_likely(&have_extensions) && - likely(crypto_simd_usable())) { + if (static_branch_likely(&have_extensions) && likely(may_use_simd())) { kernel_vector_begin(); sha512_transform_zvknhb_zvkb(state, data, nblocks); kernel_vector_end(); diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h index c13503d9d57d92..be2c8fc1224699 100644 --- a/lib/crypto/x86/sha512.h +++ b/lib/crypto/x86/sha512.h @@ -4,9 +4,7 
@@ * * Copyright 2025 Google LLC */ - #include -#include #include DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); @@ -17,7 +15,7 @@ DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); static void c_fn(struct sha512_block_state *state, const u8 *data, \ size_t nblocks) \ { \ - if (likely(crypto_simd_usable())) { \ + if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ From e164461349444ad27873e4ab2f492eb4465dbbb0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 Aug 2025 15:28:49 -0700 Subject: [PATCH 0535/3206] lib/crypto: md5: Add MD5 and HMAC-MD5 library functions Add library functions for MD5, including HMAC support. The MD5 implementation is derived from crypto/md5.c. This closely mirrors the corresponding SHA-1 and SHA-2 changes. Like SHA-1 and SHA-2, support for architecture-optimized MD5 implementations is included. I originally proposed dropping those, but unfortunately there is an AF_ALG user of the PowerPC MD5 code (https://lore.kernel.org/r/c4191597-341d-4fd7-bc3d-13daf7666c41@csgroup.eu/), and dropping that code would be viewed as a performance regression. We don't add new software algorithm implementations purely for AF_ALG, as escalating to kernel mode merely to do calculations that could be done in userspace is inefficient and is completely the wrong design. But since this one already existed, it gets grandfathered in for now. An objection was also raised to dropping the SPARC64 MD5 code because it utilizes the CPU's direct support for MD5, although it remains unclear that anyone is using that. Regardless, we'll keep these around for now. Note that while MD5 is a legacy algorithm that is vulnerable to practical collision attacks, it still has various in-kernel users that implement legacy protocols. Switching to a simple library API, which is the way the code should have been organized originally, will greatly simplify their code. For example: MD5: drivers/md/dm-crypt.c (for lmk IV generation) fs/nfsd/nfs4recover.c fs/ecryptfs/ fs/smb/client/ net/{ipv4,ipv6}/ (for TCP-MD5 signatures) HMAC-MD5: fs/smb/client/ fs/smb/server/ (Also net/sctp/ if it continues using HMAC-MD5 for cookie generation. However, that use case has the flexibility to upgrade to a more modern algorithm, which I'll be proposing instead.) As usual, the "md5" and "hmac(md5)" crypto_shash algorithms will also be reimplemented on top of these library functions. For "hmac(md5)" this will provide a faster, more streamlined implementation. 
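For reference, a minimal consumer sketch of the new API (the md5_example()
wrapper and its buffers are placeholders; the signatures are the ones
declared in <crypto/md5.h> below):

	static void md5_example(const u8 *data, size_t len,
				const u8 *raw_key, size_t key_len)
	{
		struct md5_ctx ctx;
		u8 digest[MD5_DIGEST_SIZE];
		u8 mac[MD5_DIGEST_SIZE];

		md5(data, len, digest);			/* one-shot hash */

		md5_init(&ctx);				/* incremental hash */
		md5_update(&ctx, data, len);
		md5_final(&ctx, digest);		/* also zeroizes ctx */

		/* HMAC-MD5 with a raw key that is used only once */
		hmac_md5_usingrawkey(raw_key, key_len, data, len, mac);
	}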
Link: https://lore.kernel.org/r/20250805222855.10362-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/md5.h | 181 +++++++++++++++++++++++- lib/crypto/Kconfig | 10 ++ lib/crypto/Makefile | 10 ++ lib/crypto/md5.c | 322 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 522 insertions(+), 1 deletion(-) create mode 100644 lib/crypto/md5.c diff --git a/include/crypto/md5.h b/include/crypto/md5.h index 28ee533a0507a7..c9aa5c3abc5324 100644 --- a/include/crypto/md5.h +++ b/include/crypto/md5.h @@ -7,6 +7,7 @@ #define MD5_DIGEST_SIZE 16 #define MD5_HMAC_BLOCK_SIZE 64 +#define MD5_BLOCK_SIZE 64 #define MD5_BLOCK_WORDS 16 #define MD5_HASH_WORDS 4 #define MD5_STATE_SIZE 24 @@ -27,4 +28,182 @@ struct md5_state { u32 block[MD5_BLOCK_WORDS]; }; -#endif +/* State for the MD5 compression function */ +struct md5_block_state { + u32 h[MD5_HASH_WORDS]; +}; + +/** + * struct md5_ctx - Context for hashing a message with MD5 + * @state: the compression function state + * @bytecount: number of bytes processed so far + * @buf: partial block buffer; bytecount % MD5_BLOCK_SIZE bytes are valid + */ +struct md5_ctx { + struct md5_block_state state; + u64 bytecount; + u8 buf[MD5_BLOCK_SIZE] __aligned(__alignof__(__le64)); +}; + +/** + * md5_init() - Initialize an MD5 context for a new message + * @ctx: the context to initialize + * + * If you don't need incremental computation, consider md5() instead. + * + * Context: Any context. + */ +void md5_init(struct md5_ctx *ctx); + +/** + * md5_update() - Update an MD5 context with message data + * @ctx: the context to update; must have been initialized + * @data: the message data + * @len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len); + +/** + * md5_final() - Finish computing an MD5 message digest + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting MD5 message digest + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); + +/** + * md5() - Compute MD5 message digest in one shot + * @data: the message data + * @len: the data length in bytes + * @out: (output) the resulting MD5 message digest + * + * Context: Any context. + */ +void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]); + +/** + * struct hmac_md5_key - Prepared key for HMAC-MD5 + * @istate: private + * @ostate: private + */ +struct hmac_md5_key { + struct md5_block_state istate; + struct md5_block_state ostate; +}; + +/** + * struct hmac_md5_ctx - Context for computing HMAC-MD5 of a message + * @hash_ctx: private + * @ostate: private + */ +struct hmac_md5_ctx { + struct md5_ctx hash_ctx; + struct md5_block_state ostate; +}; + +/** + * hmac_md5_preparekey() - Prepare a key for HMAC-MD5 + * @key: (output) the key structure to initialize + * @raw_key: the raw HMAC-MD5 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * Note: the caller is responsible for zeroizing both the struct hmac_md5_key + * and the raw key once they are no longer needed. + * + * Context: Any context. 
+ */
+void hmac_md5_preparekey(struct hmac_md5_key *key,
+			 const u8 *raw_key, size_t raw_key_len);
+
+/**
+ * hmac_md5_init() - Initialize an HMAC-MD5 context for a new message
+ * @ctx: (output) the HMAC context to initialize
+ * @key: the prepared HMAC key
+ *
+ * If you don't need incremental computation, consider hmac_md5() instead.
+ *
+ * Context: Any context.
+ */
+void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key);
+
+/**
+ * hmac_md5_init_usingrawkey() - Initialize an HMAC-MD5 context for a new
+ *				 message, using a raw key
+ * @ctx: (output) the HMAC context to initialize
+ * @raw_key: the raw HMAC-MD5 key
+ * @raw_key_len: the key length in bytes. All key lengths are supported.
+ *
+ * If you don't need incremental computation, consider hmac_md5_usingrawkey()
+ * instead.
+ *
+ * Context: Any context.
+ */
+void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx,
+			       const u8 *raw_key, size_t raw_key_len);
+
+/**
+ * hmac_md5_update() - Update an HMAC-MD5 context with message data
+ * @ctx: the HMAC context to update; must have been initialized
+ * @data: the message data
+ * @data_len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+static inline void hmac_md5_update(struct hmac_md5_ctx *ctx,
+				   const u8 *data, size_t data_len)
+{
+	md5_update(&ctx->hash_ctx, data, data_len);
+}
+
+/**
+ * hmac_md5_final() - Finish computing an HMAC-MD5 value
+ * @ctx: the HMAC context to finalize; must have been initialized
+ * @out: (output) the resulting HMAC-MD5 value
+ *
+ * After finishing, this zeroizes @ctx. So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
+
+/**
+ * hmac_md5() - Compute HMAC-MD5 in one shot, using a prepared key
+ * @key: the prepared HMAC key
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-MD5 value
+ *
+ * If you're using the key only once, consider using hmac_md5_usingrawkey().
+ *
+ * Context: Any context.
+ */
+void hmac_md5(const struct hmac_md5_key *key,
+	      const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]);
+
+/**
+ * hmac_md5_usingrawkey() - Compute HMAC-MD5 in one shot, using a raw key
+ * @raw_key: the raw HMAC-MD5 key
+ * @raw_key_len: the key length in bytes. All key lengths are supported.
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-MD5 value
+ *
+ * If you're using the key multiple times, prefer to use hmac_md5_preparekey()
+ * followed by multiple calls to hmac_md5() instead.
+ *
+ * Context: Any context.
+ */
+void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			  const u8 *data, size_t data_len,
+			  u8 out[MD5_DIGEST_SIZE]);
+
+#endif /* _CRYPTO_MD5_H */
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 1e6b008f8fca45..e38e8ed779d843 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -101,6 +101,16 @@ config CRYPTO_LIB_CURVE25519
 config CRYPTO_LIB_DES
 	tristate

+config CRYPTO_LIB_MD5
+	tristate
+	help
+	  The MD5 and HMAC-MD5 library functions. Select this if your module
+	  uses any of the functions from <crypto/md5.h>.
+ +config CRYPTO_LIB_MD5_ARCH + bool + depends on CRYPTO_LIB_MD5 && !UML + config CRYPTO_LIB_POLY1305_RSIZE int default 2 if MIPS diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 539d5d59a50e48..429573e8f8b36b 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -59,6 +59,16 @@ libcurve25519-$(CONFIG_CRYPTO_SELFTESTS) += curve25519-selftest.o obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o libdes-y := des.o +################################################################################ + +obj-$(CONFIG_CRYPTO_LIB_MD5) += libmd5.o +libmd5-y := md5.o +ifeq ($(CONFIG_CRYPTO_LIB_MD5_ARCH),y) +CFLAGS_md5.o += -I$(src)/$(SRCARCH) +endif # CONFIG_CRYPTO_LIB_MD5_ARCH + +################################################################################ + obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o libpoly1305-y += poly1305.o diff --git a/lib/crypto/md5.c b/lib/crypto/md5.c new file mode 100644 index 00000000000000..c0610ea1370e62 --- /dev/null +++ b/lib/crypto/md5.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * MD5 and HMAC-MD5 library functions + * + * md5_block_generic() is derived from cryptoapi implementation, originally + * based on the public domain implementation written by Colin Plumb in 1993. + * + * Copyright (c) Cryptoapi developers. + * Copyright (c) 2002 James Morris + * Copyright 2025 Google LLC + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct md5_block_state md5_iv = { + .h = { MD5_H0, MD5_H1, MD5_H2, MD5_H3 }, +}; + +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) + +static void md5_block_generic(struct md5_block_state *state, + const u8 data[MD5_BLOCK_SIZE]) +{ + u32 in[MD5_BLOCK_WORDS]; + u32 a, b, c, d; + + memcpy(in, data, MD5_BLOCK_SIZE); + le32_to_cpu_array(in, ARRAY_SIZE(in)); + + a = state->h[0]; + b = state->h[1]; + c = state->h[2]; + d = state->h[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, 
c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + state->h[0] += a; + state->h[1] += b; + state->h[2] += c; + state->h[3] += d; +} + +static void __maybe_unused md5_blocks_generic(struct md5_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + md5_block_generic(state, data); + data += MD5_BLOCK_SIZE; + } while (--nblocks); +} + +#ifdef CONFIG_CRYPTO_LIB_MD5_ARCH +#include "md5.h" /* $(SRCARCH)/md5.h */ +#else +#define md5_blocks md5_blocks_generic +#endif + +void md5_init(struct md5_ctx *ctx) +{ + ctx->state = md5_iv; + ctx->bytecount = 0; +} +EXPORT_SYMBOL_GPL(md5_init); + +void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len) +{ + size_t partial = ctx->bytecount % MD5_BLOCK_SIZE; + + ctx->bytecount += len; + + if (partial + len >= MD5_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = MD5_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + md5_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / MD5_BLOCK_SIZE; + len %= MD5_BLOCK_SIZE; + + if (nblocks) { + md5_blocks(&ctx->state, data, nblocks); + data += nblocks * MD5_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); +} +EXPORT_SYMBOL_GPL(md5_update); + +static void __md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]) +{ + u64 bitcount = ctx->bytecount << 3; + size_t partial = ctx->bytecount % MD5_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > MD5_BLOCK_SIZE - 8) { + memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - partial); + md5_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - 8 - partial); + *(__le64 *)&ctx->buf[MD5_BLOCK_SIZE - 8] = cpu_to_le64(bitcount); + md5_blocks(&ctx->state, ctx->buf, 1); + + 
cpu_to_le32_array(ctx->state.h, ARRAY_SIZE(ctx->state.h)); + memcpy(out, ctx->state.h, MD5_DIGEST_SIZE); +} + +void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]) +{ + __md5_final(ctx, out); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(md5_final); + +void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]) +{ + struct md5_ctx ctx; + + md5_init(&ctx); + md5_update(&ctx, data, len); + md5_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(md5); + +static void __hmac_md5_preparekey(struct md5_block_state *istate, + struct md5_block_state *ostate, + const u8 *raw_key, size_t raw_key_len) +{ + union { + u8 b[MD5_BLOCK_SIZE]; + unsigned long w[MD5_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > MD5_BLOCK_SIZE)) + md5(raw_key, raw_key_len, derived_key.b); + else + memcpy(derived_key.b, raw_key, raw_key_len); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + *istate = md5_iv; + md5_blocks(istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + *ostate = md5_iv; + md5_blocks(ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_md5_preparekey(struct hmac_md5_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_md5_preparekey(&key->istate, &key->ostate, raw_key, raw_key_len); +} +EXPORT_SYMBOL_GPL(hmac_md5_preparekey); + +void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key) +{ + ctx->hash_ctx.state = key->istate; + ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE; + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(hmac_md5_init); + +void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_md5_preparekey(&ctx->hash_ctx.state, &ctx->ostate, + raw_key, raw_key_len); + ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_md5_init_usingrawkey); + +void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]) +{ + /* Generate the padded input for the outer hash in ctx->hash_ctx.buf. */ + __md5_final(&ctx->hash_ctx, ctx->hash_ctx.buf); + memset(&ctx->hash_ctx.buf[MD5_DIGEST_SIZE], 0, + MD5_BLOCK_SIZE - MD5_DIGEST_SIZE); + ctx->hash_ctx.buf[MD5_DIGEST_SIZE] = 0x80; + *(__le64 *)&ctx->hash_ctx.buf[MD5_BLOCK_SIZE - 8] = + cpu_to_le64(8 * (MD5_BLOCK_SIZE + MD5_DIGEST_SIZE)); + + /* Compute the outer hash, which gives the HMAC value. 
*/ + md5_blocks(&ctx->ostate, ctx->hash_ctx.buf, 1); + cpu_to_le32_array(ctx->ostate.h, ARRAY_SIZE(ctx->ostate.h)); + memcpy(out, ctx->ostate.h, MD5_DIGEST_SIZE); + + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(hmac_md5_final); + +void hmac_md5(const struct hmac_md5_key *key, + const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]) +{ + struct hmac_md5_ctx ctx; + + hmac_md5_init(&ctx, key); + hmac_md5_update(&ctx, data, data_len); + hmac_md5_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_md5); + +void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[MD5_DIGEST_SIZE]) +{ + struct hmac_md5_ctx ctx; + + hmac_md5_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_md5_update(&ctx, data, data_len); + hmac_md5_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_md5_usingrawkey); + +#ifdef md5_mod_init_arch +static int __init md5_mod_init(void) +{ + md5_mod_init_arch(); + return 0; +} +subsys_initcall(md5_mod_init); + +static void __exit md5_mod_exit(void) +{ +} +module_exit(md5_mod_exit); +#endif + +MODULE_DESCRIPTION("MD5 and HMAC-MD5 library functions"); +MODULE_LICENSE("GPL"); From c9e5ac0ab9d1235f1a91852ec3d3c5c5f3e8ba0e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 Aug 2025 15:28:50 -0700 Subject: [PATCH 0536/3206] lib/crypto: mips/md5: Migrate optimized code into library Instead of exposing the mips-optimized MD5 code via mips-specific crypto_shash algorithms, instead just implement the md5_blocks() library function. This is much simpler, it makes the MD5 library functions be mips-optimized, and it fixes the longstanding issue where the mips-optimized MD5 code was disabled by default. MD5 still remains available through crypto_shash, but individual architectures no longer need to handle it. Note: to see the diff from arch/mips/cavium-octeon/crypto/octeon-md5.c to lib/crypto/mips/md5.h, view this commit with 'git show -M10'. Link: https://lore.kernel.org/r/20250805222855.10362-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/mips/cavium-octeon/crypto/Makefile | 2 - arch/mips/cavium-octeon/crypto/octeon-md5.c | 214 -------------------- arch/mips/configs/cavium_octeon_defconfig | 1 - arch/mips/crypto/Kconfig | 10 - lib/crypto/Kconfig | 1 + lib/crypto/mips/md5.h | 65 ++++++ 6 files changed, 66 insertions(+), 227 deletions(-) delete mode 100644 arch/mips/cavium-octeon/crypto/octeon-md5.c create mode 100644 lib/crypto/mips/md5.h diff --git a/arch/mips/cavium-octeon/crypto/Makefile b/arch/mips/cavium-octeon/crypto/Makefile index 83f2f5dd93cccd..b7d03e8a03187e 100644 --- a/arch/mips/cavium-octeon/crypto/Makefile +++ b/arch/mips/cavium-octeon/crypto/Makefile @@ -4,5 +4,3 @@ # obj-y += octeon-crypto.o - -obj-$(CONFIG_CRYPTO_MD5_OCTEON) += octeon-md5.o diff --git a/arch/mips/cavium-octeon/crypto/octeon-md5.c b/arch/mips/cavium-octeon/crypto/octeon-md5.c deleted file mode 100644 index a8ce831e2cebd9..00000000000000 --- a/arch/mips/cavium-octeon/crypto/octeon-md5.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Cryptographic API. - * - * MD5 Message Digest Algorithm (RFC1321). - * - * Adapted for OCTEON by Aaro Koskinen . - * - * Based on crypto/md5.c, which is: - * - * Derived from cryptoapi implementation, originally based on the - * public domain implementation written by Colin Plumb in 1993. - * - * Copyright (c) Cryptoapi developers. 
- * Copyright (c) 2002 James Morris - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct octeon_md5_state { - __le32 hash[MD5_HASH_WORDS]; - u64 byte_count; -}; - -/* - * We pass everything as 64-bit. OCTEON can handle misaligned data. - */ - -static void octeon_md5_store_hash(struct octeon_md5_state *ctx) -{ - u64 *hash = (u64 *)ctx->hash; - - write_octeon_64bit_hash_dword(hash[0], 0); - write_octeon_64bit_hash_dword(hash[1], 1); -} - -static void octeon_md5_read_hash(struct octeon_md5_state *ctx) -{ - u64 *hash = (u64 *)ctx->hash; - - hash[0] = read_octeon_64bit_hash_dword(0); - hash[1] = read_octeon_64bit_hash_dword(1); -} - -static void octeon_md5_transform(const void *_block) -{ - const u64 *block = _block; - - write_octeon_64bit_block_dword(block[0], 0); - write_octeon_64bit_block_dword(block[1], 1); - write_octeon_64bit_block_dword(block[2], 2); - write_octeon_64bit_block_dword(block[3], 3); - write_octeon_64bit_block_dword(block[4], 4); - write_octeon_64bit_block_dword(block[5], 5); - write_octeon_64bit_block_dword(block[6], 6); - octeon_md5_start(block[7]); -} - -static int octeon_md5_init(struct shash_desc *desc) -{ - struct octeon_md5_state *mctx = shash_desc_ctx(desc); - - mctx->hash[0] = cpu_to_le32(MD5_H0); - mctx->hash[1] = cpu_to_le32(MD5_H1); - mctx->hash[2] = cpu_to_le32(MD5_H2); - mctx->hash[3] = cpu_to_le32(MD5_H3); - mctx->byte_count = 0; - - return 0; -} - -static int octeon_md5_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct octeon_md5_state *mctx = shash_desc_ctx(desc); - struct octeon_cop2_state state; - unsigned long flags; - - mctx->byte_count += len; - flags = octeon_crypto_enable(&state); - octeon_md5_store_hash(mctx); - - do { - octeon_md5_transform(data); - data += MD5_HMAC_BLOCK_SIZE; - len -= MD5_HMAC_BLOCK_SIZE; - } while (len >= MD5_HMAC_BLOCK_SIZE); - - octeon_md5_read_hash(mctx); - octeon_crypto_disable(&state, flags); - mctx->byte_count -= len; - return len; -} - -static int octeon_md5_finup(struct shash_desc *desc, const u8 *src, - unsigned int offset, u8 *out) -{ - struct octeon_md5_state *mctx = shash_desc_ctx(desc); - int padding = 56 - (offset + 1); - struct octeon_cop2_state state; - u32 block[MD5_BLOCK_WORDS]; - unsigned long flags; - char *p; - - p = memcpy(block, src, offset); - p += offset; - *p++ = 0x80; - - flags = octeon_crypto_enable(&state); - octeon_md5_store_hash(mctx); - - if (padding < 0) { - memset(p, 0x00, padding + sizeof(u64)); - octeon_md5_transform(block); - p = (char *)block; - padding = 56; - } - - memset(p, 0, padding); - mctx->byte_count += offset; - block[14] = mctx->byte_count << 3; - block[15] = mctx->byte_count >> 29; - cpu_to_le32_array(block + 14, 2); - octeon_md5_transform(block); - - octeon_md5_read_hash(mctx); - octeon_crypto_disable(&state, flags); - - memzero_explicit(block, sizeof(block)); - memcpy(out, mctx->hash, sizeof(mctx->hash)); - - return 0; -} - -static int octeon_md5_export(struct shash_desc *desc, void *out) -{ - struct octeon_md5_state *ctx = shash_desc_ctx(desc); - union { - u8 *u8; - u32 *u32; - u64 *u64; - } p = { .u8 = out }; - int i; - - for (i = 0; i < MD5_HASH_WORDS; i++) - put_unaligned(le32_to_cpu(ctx->hash[i]), p.u32++); - put_unaligned(ctx->byte_count, p.u64); 
- return 0; -} - -static int octeon_md5_import(struct shash_desc *desc, const void *in) -{ - struct octeon_md5_state *ctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u32 *u32; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < MD5_HASH_WORDS; i++) - ctx->hash[i] = cpu_to_le32(get_unaligned(p.u32++)); - ctx->byte_count = get_unaligned(p.u64); - return 0; -} - -static struct shash_alg alg = { - .digestsize = MD5_DIGEST_SIZE, - .init = octeon_md5_init, - .update = octeon_md5_update, - .finup = octeon_md5_finup, - .export = octeon_md5_export, - .import = octeon_md5_import, - .statesize = MD5_STATE_SIZE, - .descsize = sizeof(struct octeon_md5_state), - .base = { - .cra_name = "md5", - .cra_driver_name= "octeon-md5", - .cra_priority = OCTEON_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = MD5_HMAC_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init md5_mod_init(void) -{ - if (!octeon_has_crypto()) - return -ENOTSUPP; - return crypto_register_shash(&alg); -} - -static void __exit md5_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(md5_mod_init); -module_exit(md5_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MD5 Message Digest Algorithm (OCTEON)"); -MODULE_AUTHOR("Aaro Koskinen "); diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig index 3f50e1d78894a1..68c363366bceb8 100644 --- a/arch/mips/configs/cavium_octeon_defconfig +++ b/arch/mips/configs/cavium_octeon_defconfig @@ -155,7 +155,6 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_MD5_OCTEON=y CONFIG_CRYPTO_DES=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/crypto/Kconfig b/arch/mips/crypto/Kconfig index 7b91f4ec65bffb..6a5bd5074867e0 100644 --- a/arch/mips/crypto/Kconfig +++ b/arch/mips/crypto/Kconfig @@ -2,14 +2,4 @@ menu "Accelerated Cryptographic Algorithms for CPU (mips)" -config CRYPTO_MD5_OCTEON - tristate "Digests: MD5 (OCTEON)" - depends on CPU_CAVIUM_OCTEON - select CRYPTO_MD5 - select CRYPTO_HASH - help - MD5 message digest algorithm (RFC1321) - - Architecture: mips OCTEON using crypto instructions, when available - endmenu diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index e38e8ed779d843..7b4e47ce37bbe4 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -110,6 +110,7 @@ config CRYPTO_LIB_MD5 config CRYPTO_LIB_MD5_ARCH bool depends on CRYPTO_LIB_MD5 && !UML + default y if MIPS && CPU_CAVIUM_OCTEON config CRYPTO_LIB_POLY1305_RSIZE int diff --git a/lib/crypto/mips/md5.h b/lib/crypto/mips/md5.h new file mode 100644 index 00000000000000..e08e28aeffa469 --- /dev/null +++ b/lib/crypto/mips/md5.h @@ -0,0 +1,65 @@ +/* + * Cryptographic API. + * + * MD5 Message Digest Algorithm (RFC1321). + * + * Adapted for OCTEON by Aaro Koskinen . + * + * Based on crypto/md5.c, which is: + * + * Derived from cryptoapi implementation, originally based on the + * public domain implementation written by Colin Plumb in 1993. + * + * Copyright (c) Cryptoapi developers. + * Copyright (c) 2002 James Morris + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. 
+ */ + +static void md5_blocks(struct md5_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + u64 *state64 = (u64 *)state; + unsigned long flags; + + if (!octeon_has_crypto()) + return md5_blocks_generic(state, data, nblocks); + + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); + + flags = octeon_crypto_enable(&cop2_state); + write_octeon_64bit_hash_dword(state64[0], 0); + write_octeon_64bit_hash_dword(state64[1], 1); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_dword(block[0], 0); + write_octeon_64bit_block_dword(block[1], 1); + write_octeon_64bit_block_dword(block[2], 2); + write_octeon_64bit_block_dword(block[3], 3); + write_octeon_64bit_block_dword(block[4], 4); + write_octeon_64bit_block_dword(block[5], 5); + write_octeon_64bit_block_dword(block[6], 6); + octeon_md5_start(block[7]); + + data += MD5_BLOCK_SIZE; + } while (--nblocks); + + state64[0] = read_octeon_64bit_hash_dword(0); + state64[1] = read_octeon_64bit_hash_dword(1); + octeon_crypto_disable(&cop2_state, flags); + + le32_to_cpu_array(state->h, ARRAY_SIZE(state->h)); +} From cddd17868a8069f583a4fd6a2cc7b2a9ea15c26c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 Aug 2025 15:28:51 -0700 Subject: [PATCH 0537/3206] mips: cavium-octeon: Move octeon-crypto.c into parent dir Since octeon-crypto.c is the only remaining source file in arch/mips/cavium-octeon/crypto/, move it into its parent directory arch/mips/cavium-octeon/. Then remove the directory arch/mips/cavium-octeon/crypto/, including its Makefile. Link: https://lore.kernel.org/r/20250805222855.10362-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/mips/cavium-octeon/Makefile | 2 +- arch/mips/cavium-octeon/crypto/Makefile | 6 ------ arch/mips/cavium-octeon/{crypto => }/octeon-crypto.c | 0 3 files changed, 1 insertion(+), 7 deletions(-) delete mode 100644 arch/mips/cavium-octeon/crypto/Makefile rename arch/mips/cavium-octeon/{crypto => }/octeon-crypto.c (100%) diff --git a/arch/mips/cavium-octeon/Makefile b/arch/mips/cavium-octeon/Makefile index 2a59265788413e..ab84ede0cbe0e8 100644 --- a/arch/mips/cavium-octeon/Makefile +++ b/arch/mips/cavium-octeon/Makefile @@ -11,9 +11,9 @@ obj-y := cpu.o setup.o octeon-platform.o octeon-irq.o csrc-octeon.o obj-y += dma-octeon.o +obj-y += octeon-crypto.o obj-y += octeon-memcpy.o obj-y += executive/ -obj-y += crypto/ obj-$(CONFIG_MTD) += flash_setup.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/mips/cavium-octeon/crypto/Makefile b/arch/mips/cavium-octeon/crypto/Makefile deleted file mode 100644 index b7d03e8a03187e..00000000000000 --- a/arch/mips/cavium-octeon/crypto/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# OCTEON-specific crypto modules. -# - -obj-y += octeon-crypto.o diff --git a/arch/mips/cavium-octeon/crypto/octeon-crypto.c b/arch/mips/cavium-octeon/octeon-crypto.c similarity index 100% rename from arch/mips/cavium-octeon/crypto/octeon-crypto.c rename to arch/mips/cavium-octeon/octeon-crypto.c From 09371e1349c9bb34ac030973c7867016a8a8914d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 Aug 2025 15:28:52 -0700 Subject: [PATCH 0538/3206] lib/crypto: powerpc/md5: Migrate optimized code into library Instead of exposing the powerpc-optimized MD5 code via powerpc-specific crypto_shash algorithms, instead just implement the md5_blocks() library function. 
This is much simpler, it makes the MD5 library functions be powerpc-optimized, and it fixes the longstanding issue where the powerpc-optimized MD5 code was disabled by default. MD5 still remains available through crypto_shash, but individual architectures no longer need to handle it. Link: https://lore.kernel.org/r/20250805222855.10362-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/powerpc/configs/powernv_defconfig | 1 - arch/powerpc/configs/ppc64_defconfig | 1 - arch/powerpc/crypto/Kconfig | 8 -- arch/powerpc/crypto/Makefile | 2 - arch/powerpc/crypto/md5-glue.c | 99 ------------------- lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 1 + .../crypto => lib/crypto/powerpc}/md5-asm.S | 0 lib/crypto/powerpc/md5.h | 12 +++ 9 files changed, 14 insertions(+), 111 deletions(-) delete mode 100644 arch/powerpc/crypto/md5-glue.c rename {arch/powerpc/crypto => lib/crypto/powerpc}/md5-asm.S (100%) create mode 100644 lib/crypto/powerpc/md5.h diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index d06388b0f66e31..bd4685612de6dd 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -320,7 +320,6 @@ CONFIG_XMON=y CONFIG_CRYPTO_BENCHMARK=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_WP512=m diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index ce34597e9f3e14..2d92c11eea7e47 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -387,7 +387,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_LZO=m -CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_AES_GCM_P10=m CONFIG_CRYPTO_DEV_NX=y CONFIG_CRYPTO_DEV_NX_ENCRYPT=m diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index cfe39fc221cf81..f4b779c7352de9 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -15,14 +15,6 @@ config CRYPTO_CURVE25519_PPC64 Architecture: PowerPC64 - Little-endian -config CRYPTO_MD5_PPC - tristate "Digests: MD5" - select CRYPTO_HASH - help - MD5 message digest algorithm (RFC1321) - - Architecture: powerpc - config CRYPTO_AES_PPC_SPE tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)" depends on SPE diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index bc8fd27344b8bb..9eb59dce67f367 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -6,13 +6,11 @@ # obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o -obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o -md5-ppc-y := md5-asm.o md5-glue.o aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c deleted file mode 100644 index 204440a90cd84c..00000000000000 --- a/arch/powerpc/crypto/md5-glue.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Glue code for MD5 implementation for PPC assembler - * - * Based on generic implementation. 
- * - * Copyright (c) 2015 Markus Stockhausen - */ - -#include -#include -#include -#include -#include - -extern void ppc_md5_transform(u32 *state, const u8 *src, u32 blocks); - -static int ppc_md5_init(struct shash_desc *desc) -{ - struct md5_state *sctx = shash_desc_ctx(desc); - - sctx->hash[0] = MD5_H0; - sctx->hash[1] = MD5_H1; - sctx->hash[2] = MD5_H2; - sctx->hash[3] = MD5_H3; - sctx->byte_count = 0; - - return 0; -} - -static int ppc_md5_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct md5_state *sctx = shash_desc_ctx(desc); - - sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE); - ppc_md5_transform(sctx->hash, data, len >> 6); - return len - round_down(len, MD5_HMAC_BLOCK_SIZE); -} - -static int ppc_md5_finup(struct shash_desc *desc, const u8 *src, - unsigned int offset, u8 *out) -{ - struct md5_state *sctx = shash_desc_ctx(desc); - __le64 block[MD5_BLOCK_WORDS] = {}; - u8 *p = memcpy(block, src, offset); - __le32 *dst = (__le32 *)out; - __le64 *pbits; - - src = p; - p += offset; - *p++ = 0x80; - sctx->byte_count += offset; - pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 1 : 2)) - 1]; - *pbits = cpu_to_le64(sctx->byte_count << 3); - ppc_md5_transform(sctx->hash, src, (pbits - block + 1) / 8); - memzero_explicit(block, sizeof(block)); - - dst[0] = cpu_to_le32(sctx->hash[0]); - dst[1] = cpu_to_le32(sctx->hash[1]); - dst[2] = cpu_to_le32(sctx->hash[2]); - dst[3] = cpu_to_le32(sctx->hash[3]); - return 0; -} - -static struct shash_alg alg = { - .digestsize = MD5_DIGEST_SIZE, - .init = ppc_md5_init, - .update = ppc_md5_update, - .finup = ppc_md5_finup, - .descsize = MD5_STATE_SIZE, - .base = { - .cra_name = "md5", - .cra_driver_name= "md5-ppc", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = MD5_HMAC_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init ppc_md5_mod_init(void) -{ - return crypto_register_shash(&alg); -} - -static void __exit ppc_md5_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(ppc_md5_mod_init); -module_exit(ppc_md5_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MD5 Secure Hash Algorithm, PPC assembler"); - -MODULE_ALIAS_CRYPTO("md5"); -MODULE_ALIAS_CRYPTO("md5-ppc"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 7b4e47ce37bbe4..7334ddc70e599b 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -111,6 +111,7 @@ config CRYPTO_LIB_MD5_ARCH bool depends on CRYPTO_LIB_MD5 && !UML default y if MIPS && CPU_CAVIUM_OCTEON + default y if PPC config CRYPTO_LIB_POLY1305_RSIZE int diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 429573e8f8b36b..b3986dde676b3d 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_CRYPTO_LIB_MD5) += libmd5.o libmd5-y := md5.o ifeq ($(CONFIG_CRYPTO_LIB_MD5_ARCH),y) CFLAGS_md5.o += -I$(src)/$(SRCARCH) +libmd5-$(CONFIG_PPC) += powerpc/md5-asm.o endif # CONFIG_CRYPTO_LIB_MD5_ARCH ################################################################################ diff --git a/arch/powerpc/crypto/md5-asm.S b/lib/crypto/powerpc/md5-asm.S similarity index 100% rename from arch/powerpc/crypto/md5-asm.S rename to lib/crypto/powerpc/md5-asm.S diff --git a/lib/crypto/powerpc/md5.h b/lib/crypto/powerpc/md5.h new file mode 100644 index 00000000000000..540b08e34d1d58 --- /dev/null +++ b/lib/crypto/powerpc/md5.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * MD5 optimized for PowerPC + */ + +void ppc_md5_transform(u32 *state, const u8 *data, size_t 
nblocks); + +static void md5_blocks(struct md5_block_state *state, + const u8 *data, size_t nblocks) +{ + ppc_md5_transform(state->h, data, nblocks); +} From a1848f6e382145e0200843549f24f5af4b5c8136 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 Aug 2025 15:28:53 -0700 Subject: [PATCH 0539/3206] lib/crypto: sparc/md5: Migrate optimized code into library Instead of exposing the sparc-optimized MD5 code via sparc-specific crypto_shash algorithms, instead just implement the md5_blocks() library function. This is much simpler, it makes the MD5 library functions be sparc-optimized, and it fixes the longstanding issue where the sparc-optimized MD5 code was disabled by default. MD5 still remains available through crypto_shash, but individual architectures no longer need to handle it. Note: to see the diff from arch/sparc/crypto/md5_glue.c to lib/crypto/sparc/md5.h, view this commit with 'git show -M10'. Link: https://lore.kernel.org/r/20250805222855.10362-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/sparc/crypto/Kconfig | 10 - arch/sparc/crypto/Makefile | 4 - arch/sparc/crypto/md5_glue.c | 174 ------------------ lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 1 + lib/crypto/sparc/md5.h | 48 +++++ .../crypto => lib/crypto/sparc}/md5_asm.S | 0 7 files changed, 50 insertions(+), 188 deletions(-) delete mode 100644 arch/sparc/crypto/md5_glue.c create mode 100644 lib/crypto/sparc/md5.h rename {arch/sparc/crypto => lib/crypto/sparc}/md5_asm.S (100%) diff --git a/arch/sparc/crypto/Kconfig b/arch/sparc/crypto/Kconfig index f5b2e720fec3c1..f755da97953462 100644 --- a/arch/sparc/crypto/Kconfig +++ b/arch/sparc/crypto/Kconfig @@ -16,16 +16,6 @@ config CRYPTO_DES_SPARC64 Architecture: sparc64 -config CRYPTO_MD5_SPARC64 - tristate "Digests: MD5" - depends on SPARC64 - select CRYPTO_MD5 - select CRYPTO_HASH - help - MD5 message digest algorithm (RFC1321) - - Architecture: sparc64 using crypto instructions, when available - config CRYPTO_AES_SPARC64 tristate "Ciphers: AES, modes: ECB, CBC, CTR" depends on SPARC64 diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile index 0d05a17988c4cd..7b4796842ddd7c 100644 --- a/arch/sparc/crypto/Makefile +++ b/arch/sparc/crypto/Makefile @@ -3,14 +3,10 @@ # Arch-specific CryptoAPI modules. # -obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o - obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o obj-$(CONFIG_CRYPTO_CAMELLIA_SPARC64) += camellia-sparc64.o -md5-sparc64-y := md5_asm.o md5_glue.o - aes-sparc64-y := aes_asm.o aes_glue.o des-sparc64-y := des_asm.o des_glue.o camellia-sparc64-y := camellia_asm.o camellia_glue.o diff --git a/arch/sparc/crypto/md5_glue.c b/arch/sparc/crypto/md5_glue.c deleted file mode 100644 index b3615f0cdf6262..00000000000000 --- a/arch/sparc/crypto/md5_glue.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Glue code for MD5 hashing optimized for sparc64 crypto opcodes. - * - * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c - * and crypto/md5.c which are: - * - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - * Copyright (c) Mathias Krause - * Copyright (c) Cryptoapi developers. 
- * Copyright (c) 2002 James Morris - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct sparc_md5_state { - __le32 hash[MD5_HASH_WORDS]; - u64 byte_count; -}; - -asmlinkage void md5_sparc64_transform(__le32 *digest, const char *data, - unsigned int rounds); - -static int md5_sparc64_init(struct shash_desc *desc) -{ - struct sparc_md5_state *mctx = shash_desc_ctx(desc); - - mctx->hash[0] = cpu_to_le32(MD5_H0); - mctx->hash[1] = cpu_to_le32(MD5_H1); - mctx->hash[2] = cpu_to_le32(MD5_H2); - mctx->hash[3] = cpu_to_le32(MD5_H3); - mctx->byte_count = 0; - - return 0; -} - -static int md5_sparc64_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sparc_md5_state *sctx = shash_desc_ctx(desc); - - sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE); - md5_sparc64_transform(sctx->hash, data, len / MD5_HMAC_BLOCK_SIZE); - return len - round_down(len, MD5_HMAC_BLOCK_SIZE); -} - -/* Add padding and return the message digest. */ -static int md5_sparc64_finup(struct shash_desc *desc, const u8 *src, - unsigned int offset, u8 *out) -{ - struct sparc_md5_state *sctx = shash_desc_ctx(desc); - __le64 block[MD5_BLOCK_WORDS] = {}; - u8 *p = memcpy(block, src, offset); - __le32 *dst = (__le32 *)out; - __le64 *pbits; - int i; - - src = p; - p += offset; - *p++ = 0x80; - sctx->byte_count += offset; - pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 1 : 2)) - 1]; - *pbits = cpu_to_le64(sctx->byte_count << 3); - md5_sparc64_transform(sctx->hash, src, (pbits - block + 1) / 8); - memzero_explicit(block, sizeof(block)); - - /* Store state in digest */ - for (i = 0; i < MD5_HASH_WORDS; i++) - dst[i] = sctx->hash[i]; - - return 0; -} - -static int md5_sparc64_export(struct shash_desc *desc, void *out) -{ - struct sparc_md5_state *sctx = shash_desc_ctx(desc); - union { - u8 *u8; - u32 *u32; - u64 *u64; - } p = { .u8 = out }; - int i; - - for (i = 0; i < MD5_HASH_WORDS; i++) - put_unaligned(le32_to_cpu(sctx->hash[i]), p.u32++); - put_unaligned(sctx->byte_count, p.u64); - return 0; -} - -static int md5_sparc64_import(struct shash_desc *desc, const void *in) -{ - struct sparc_md5_state *sctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u32 *u32; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < MD5_HASH_WORDS; i++) - sctx->hash[i] = cpu_to_le32(get_unaligned(p.u32++)); - sctx->byte_count = get_unaligned(p.u64); - return 0; -} - -static struct shash_alg alg = { - .digestsize = MD5_DIGEST_SIZE, - .init = md5_sparc64_init, - .update = md5_sparc64_update, - .finup = md5_sparc64_finup, - .export = md5_sparc64_export, - .import = md5_sparc64_import, - .descsize = sizeof(struct sparc_md5_state), - .statesize = sizeof(struct sparc_md5_state), - .base = { - .cra_name = "md5", - .cra_driver_name= "md5-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = MD5_HMAC_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static bool __init sparc64_has_md5_opcode(void) -{ - unsigned long cfr; - - if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) - return false; - - __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); - if (!(cfr & CFR_MD5)) - return false; - - return true; -} - -static int __init md5_sparc64_mod_init(void) -{ - if (sparc64_has_md5_opcode()) { - pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n"); - return crypto_register_shash(&alg); - } - pr_info("sparc64 md5 opcode not 
available.\n"); - return -ENODEV; -} - -static void __exit md5_sparc64_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(md5_sparc64_mod_init); -module_exit(md5_sparc64_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MD5 Message Digest Algorithm, sparc64 md5 opcode accelerated"); - -MODULE_ALIAS_CRYPTO("md5"); - -#include "crop_devid.c" diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 7334ddc70e599b..79b848448e07f1 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -112,6 +112,7 @@ config CRYPTO_LIB_MD5_ARCH depends on CRYPTO_LIB_MD5 && !UML default y if MIPS && CPU_CAVIUM_OCTEON default y if PPC + default y if SPARC64 config CRYPTO_LIB_POLY1305_RSIZE int diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index b3986dde676b3d..d362636a22d38e 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -66,6 +66,7 @@ libmd5-y := md5.o ifeq ($(CONFIG_CRYPTO_LIB_MD5_ARCH),y) CFLAGS_md5.o += -I$(src)/$(SRCARCH) libmd5-$(CONFIG_PPC) += powerpc/md5-asm.o +libmd5-$(CONFIG_SPARC) += sparc/md5_asm.o endif # CONFIG_CRYPTO_LIB_MD5_ARCH ################################################################################ diff --git a/lib/crypto/sparc/md5.h b/lib/crypto/sparc/md5.h new file mode 100644 index 00000000000000..3f1b0ed8c0b3f3 --- /dev/null +++ b/lib/crypto/sparc/md5.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * MD5 accelerated using the sparc64 crypto opcodes + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald + * Copyright (c) Jean-Francois Dive + * Copyright (c) Mathias Krause + * Copyright (c) Cryptoapi developers. + * Copyright (c) 2002 James Morris + */ + +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_md5_opcodes); + +asmlinkage void md5_sparc64_transform(struct md5_block_state *state, + const u8 *data, size_t nblocks); + +static void md5_blocks(struct md5_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_md5_opcodes)) { + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); + md5_sparc64_transform(state, data, nblocks); + le32_to_cpu_array(state->h, ARRAY_SIZE(state->h)); + } else { + md5_blocks_generic(state, data, nblocks); + } +} + +#define md5_mod_init_arch md5_mod_init_arch +static inline void md5_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_MD5)) + return; + + static_branch_enable(&have_md5_opcodes); + pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n"); +} diff --git a/arch/sparc/crypto/md5_asm.S b/lib/crypto/sparc/md5_asm.S similarity index 100% rename from arch/sparc/crypto/md5_asm.S rename to lib/crypto/sparc/md5_asm.S From ba8ee22a7f924ba10571b31f8a2e5aa1cf479308 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 Aug 2025 15:28:54 -0700 Subject: [PATCH 0540/3206] crypto: md5 - Wrap library and add HMAC support Reimplement crypto/md5.c on top of the new MD5 library functions. Also add support for HMAC-MD5, again just wrapping the library functions. This closely mirrors crypto/sha1.c. 
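As an illustration (not part of this patch), a minimal consumer of the
library interface being wrapped here could look as follows. The function
names, types, and constants are the ones declared in the new MD5 library
header; the example function itself and its buffers are made up:

  #include <crypto/md5.h>

  static void md5_lib_example(const u8 *key, size_t key_len,
                              const u8 *msg, size_t msg_len)
  {
          struct md5_ctx ctx;
          u8 digest[MD5_DIGEST_SIZE];
          u8 mac[MD5_DIGEST_SIZE];

          /* One-shot hash, as used by the new digest callback. */
          md5(msg, msg_len, digest);

          /* Incremental hashing, as used by init/update/final. */
          md5_init(&ctx);
          md5_update(&ctx, msg, msg_len);
          md5_final(&ctx, digest);    /* also zeroizes ctx */

          /* One-shot HMAC-MD5; all raw key lengths are supported. */
          hmac_md5_usingrawkey(key, key_len, msg, msg_len, mac);
  }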
Link: https://lore.kernel.org/r/20250805222855.10362-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 3 +- crypto/md5.c | 359 ++++++++++++++++++-------------------- crypto/testmgr.c | 3 + drivers/crypto/img-hash.c | 2 +- 4 files changed, 172 insertions(+), 195 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 23bd98981ae8e9..1575dbec084d63 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -938,8 +938,9 @@ config CRYPTO_MD4 config CRYPTO_MD5 tristate "MD5" select CRYPTO_HASH + select CRYPTO_LIB_MD5 help - MD5 message digest algorithm (RFC1321) + MD5 message digest algorithm (RFC1321), including HMAC support. config CRYPTO_MICHAEL_MIC tristate "Michael MIC" diff --git a/crypto/md5.c b/crypto/md5.c index 32c0819f511858..d05c53e6f3c2c5 100644 --- a/crypto/md5.c +++ b/crypto/md5.c @@ -1,25 +1,50 @@ -/* - * Cryptographic API. - * - * MD5 Message Digest Algorithm (RFC1321). - * - * Derived from cryptoapi implementation, originally based on the - * public domain implementation written by Colin Plumb in 1993. - * - * Copyright (c) Cryptoapi developers. - * Copyright (c) 2002 James Morris - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Crypto API support for MD5 and HMAC-MD5 * + * Copyright 2025 Google LLC */ #include #include #include #include -#include + +/* + * Export and import functions. crypto_shash wants a particular format that + * matches that used by some legacy drivers. It currently is the same as the + * library MD5 context, except the value in bytecount must be block-aligned and + * the remainder must be stored in an extra u8 appended to the struct. 
+ */ + +#define MD5_SHASH_STATE_SIZE (sizeof(struct md5_ctx) + 1) +static_assert(sizeof(struct md5_ctx) == sizeof(struct md5_state)); +static_assert(offsetof(struct md5_ctx, state) == offsetof(struct md5_state, hash)); +static_assert(offsetof(struct md5_ctx, bytecount) == offsetof(struct md5_state, byte_count)); +static_assert(offsetof(struct md5_ctx, buf) == offsetof(struct md5_state, block)); + +static int __crypto_md5_export(const struct md5_ctx *ctx0, void *out) +{ + struct md5_ctx ctx = *ctx0; + unsigned int partial; + u8 *p = out; + + partial = ctx.bytecount % MD5_BLOCK_SIZE; + ctx.bytecount -= partial; + memcpy(p, &ctx, sizeof(ctx)); + p += sizeof(ctx); + *p = partial; + return 0; +} + +static int __crypto_md5_import(struct md5_ctx *ctx, const void *in) +{ + const u8 *p = in; + + memcpy(ctx, p, sizeof(*ctx)); + p += sizeof(*ctx); + ctx->bytecount += *p; + return 0; +} const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = { 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04, @@ -27,198 +52,146 @@ const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = { }; EXPORT_SYMBOL_GPL(md5_zero_message_hash); -#define F1(x, y, z) (z ^ (x & (y ^ z))) -#define F2(x, y, z) F1(z, x, y) -#define F3(x, y, z) (x ^ y ^ z) -#define F4(x, y, z) (y ^ (x | ~z)) - -#define MD5STEP(f, w, x, y, z, in, s) \ - (w += f(x, y, z) + in, w = (w<>(32-s)) + x) - -static void md5_transform(__u32 *hash, __u32 const *in) -{ - u32 a, b, c, d; - - a = hash[0]; - b = hash[1]; - c = hash[2]; - d = hash[3]; - - MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); - MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); - MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); - MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); - MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); - MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); - MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); - MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); - MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); - MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); - MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); - MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); - MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); - MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); - MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); - MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); - - MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); - MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); - MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); - MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); - MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); - MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); - MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); - MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); - MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); - MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); - MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); - MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); - MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); - MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); - MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); - MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); - - MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); - MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); - MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); - MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); - MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); - MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); - MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); - MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); - MD5STEP(F3, a, b, c, d, 
in[13] + 0x289b7ec6, 4); - MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); - MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); - MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); - MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); - MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); - MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); - MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); - - MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); - MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); - MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); - MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); - MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); - MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); - MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); - MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); - MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); - MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); - MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); - MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); - MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); - MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); - MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); - MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); - - hash[0] += a; - hash[1] += b; - hash[2] += c; - hash[3] += d; -} - -static inline void md5_transform_helper(struct md5_state *ctx, - u32 block[MD5_BLOCK_WORDS]) -{ - le32_to_cpu_array(block, MD5_BLOCK_WORDS); - md5_transform(ctx->hash, block); -} - -static int md5_init(struct shash_desc *desc) -{ - struct md5_state *mctx = shash_desc_ctx(desc); - - mctx->hash[0] = MD5_H0; - mctx->hash[1] = MD5_H1; - mctx->hash[2] = MD5_H2; - mctx->hash[3] = MD5_H3; - mctx->byte_count = 0; +#define MD5_CTX(desc) ((struct md5_ctx *)shash_desc_ctx(desc)) +static int crypto_md5_init(struct shash_desc *desc) +{ + md5_init(MD5_CTX(desc)); return 0; } -static int md5_update(struct shash_desc *desc, const u8 *data, unsigned int len) -{ - struct md5_state *mctx = shash_desc_ctx(desc); - u32 block[MD5_BLOCK_WORDS]; - - mctx->byte_count += len; - do { - memcpy(block, data, sizeof(block)); - md5_transform_helper(mctx, block); - data += sizeof(block); - len -= sizeof(block); - } while (len >= sizeof(block)); - memzero_explicit(block, sizeof(block)); - mctx->byte_count -= len; - return len; -} - -static int md5_finup(struct shash_desc *desc, const u8 *data, unsigned int len, - u8 *out) -{ - struct md5_state *mctx = shash_desc_ctx(desc); - u32 block[MD5_BLOCK_WORDS]; - unsigned int offset; - int padding; - char *p; - - memcpy(block, data, len); - - offset = len; - p = (char *)block + offset; - padding = 56 - (offset + 1); - - *p++ = 0x80; - if (padding < 0) { - memset(p, 0x00, padding + sizeof (u64)); - md5_transform_helper(mctx, block); - p = (char *)block; - padding = 56; - } - - memset(p, 0, padding); - mctx->byte_count += len; - block[14] = mctx->byte_count << 3; - block[15] = mctx->byte_count >> 29; - le32_to_cpu_array(block, (sizeof(block) - sizeof(u64)) / sizeof(u32)); - md5_transform(mctx->hash, block); - memzero_explicit(block, sizeof(block)); - cpu_to_le32_array(mctx->hash, sizeof(mctx->hash) / sizeof(u32)); - memcpy(out, mctx->hash, sizeof(mctx->hash)); +static int crypto_md5_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + md5_update(MD5_CTX(desc), data, len); + return 0; +} +static int crypto_md5_final(struct shash_desc *desc, u8 *out) +{ + md5_final(MD5_CTX(desc), out); return 0; } -static struct shash_alg alg = { - .digestsize = MD5_DIGEST_SIZE, - .init = md5_init, - .update = md5_update, - .finup = 
md5_finup, - .descsize = MD5_STATE_SIZE, - .base = { - .cra_name = "md5", - .cra_driver_name = "md5-generic", - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = MD5_HMAC_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; +static int crypto_md5_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + md5(data, len, out); + return 0; +} + +static int crypto_md5_export(struct shash_desc *desc, void *out) +{ + return __crypto_md5_export(MD5_CTX(desc), out); +} + +static int crypto_md5_import(struct shash_desc *desc, const void *in) +{ + return __crypto_md5_import(MD5_CTX(desc), in); +} -static int __init md5_mod_init(void) +#define HMAC_MD5_KEY(tfm) ((struct hmac_md5_key *)crypto_shash_ctx(tfm)) +#define HMAC_MD5_CTX(desc) ((struct hmac_md5_ctx *)shash_desc_ctx(desc)) + +static int crypto_hmac_md5_setkey(struct crypto_shash *tfm, + const u8 *raw_key, unsigned int keylen) +{ + hmac_md5_preparekey(HMAC_MD5_KEY(tfm), raw_key, keylen); + return 0; +} + +static int crypto_hmac_md5_init(struct shash_desc *desc) +{ + hmac_md5_init(HMAC_MD5_CTX(desc), HMAC_MD5_KEY(desc->tfm)); + return 0; +} + +static int crypto_hmac_md5_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + hmac_md5_update(HMAC_MD5_CTX(desc), data, len); + return 0; +} + +static int crypto_hmac_md5_final(struct shash_desc *desc, u8 *out) +{ + hmac_md5_final(HMAC_MD5_CTX(desc), out); + return 0; +} + +static int crypto_hmac_md5_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + hmac_md5(HMAC_MD5_KEY(desc->tfm), data, len, out); + return 0; +} + +static int crypto_hmac_md5_export(struct shash_desc *desc, void *out) { - return crypto_register_shash(&alg); + return __crypto_md5_export(&HMAC_MD5_CTX(desc)->hash_ctx, out); } -static void __exit md5_mod_fini(void) +static int crypto_hmac_md5_import(struct shash_desc *desc, const void *in) { - crypto_unregister_shash(&alg); + struct hmac_md5_ctx *ctx = HMAC_MD5_CTX(desc); + + ctx->ostate = HMAC_MD5_KEY(desc->tfm)->ostate; + return __crypto_md5_import(&ctx->hash_ctx, in); } -module_init(md5_mod_init); -module_exit(md5_mod_fini); +static struct shash_alg algs[] = { + { + .base.cra_name = "md5", + .base.cra_driver_name = "md5-lib", + .base.cra_priority = 300, + .base.cra_blocksize = MD5_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .digestsize = MD5_DIGEST_SIZE, + .init = crypto_md5_init, + .update = crypto_md5_update, + .final = crypto_md5_final, + .digest = crypto_md5_digest, + .export = crypto_md5_export, + .import = crypto_md5_import, + .descsize = sizeof(struct md5_ctx), + .statesize = MD5_SHASH_STATE_SIZE, + }, + { + .base.cra_name = "hmac(md5)", + .base.cra_driver_name = "hmac-md5-lib", + .base.cra_priority = 300, + .base.cra_blocksize = MD5_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct hmac_md5_key), + .base.cra_module = THIS_MODULE, + .digestsize = MD5_DIGEST_SIZE, + .setkey = crypto_hmac_md5_setkey, + .init = crypto_hmac_md5_init, + .update = crypto_hmac_md5_update, + .final = crypto_hmac_md5_final, + .digest = crypto_hmac_md5_digest, + .export = crypto_hmac_md5_export, + .import = crypto_hmac_md5_import, + .descsize = sizeof(struct hmac_md5_ctx), + .statesize = MD5_SHASH_STATE_SIZE, + }, +}; + +static int __init crypto_md5_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} +module_init(crypto_md5_mod_init); + +static void __exit crypto_md5_mod_exit(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} +module_exit(crypto_md5_mod_exit); MODULE_LICENSE("GPL"); 
-MODULE_DESCRIPTION("MD5 Message Digest Algorithm"); +MODULE_DESCRIPTION("Crypto API support for MD5 and HMAC-MD5"); + MODULE_ALIAS_CRYPTO("md5"); +MODULE_ALIAS_CRYPTO("md5-lib"); +MODULE_ALIAS_CRYPTO("hmac(md5)"); +MODULE_ALIAS_CRYPTO("hmac-md5-lib"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index ee33ba21ae2bc0..beab926ba102e5 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4178,6 +4178,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "authenc(hmac(md5),ecb(cipher_null))", + .generic_driver = "authenc(hmac-md5-lib,ecb-cipher_null)", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_md5_ecb_cipher_null_tv_template) @@ -5064,6 +5065,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(md5)", + .generic_driver = "hmac-md5-lib", .test = alg_test_hash, .suite = { .hash = __VECS(hmac_md5_tv_template) @@ -5250,6 +5252,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "md5", + .generic_driver = "md5-lib", .test = alg_test_hash, .suite = { .hash = __VECS(md5_tv_template) diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c index 76b7ecb5624b16..f22c12e36b56cc 100644 --- a/drivers/crypto/img-hash.c +++ b/drivers/crypto/img-hash.c @@ -700,7 +700,7 @@ static int img_hash_cra_init(struct crypto_tfm *tfm, const char *alg_name) static int img_hash_cra_md5_init(struct crypto_tfm *tfm) { - return img_hash_cra_init(tfm, "md5-generic"); + return img_hash_cra_init(tfm, "md5-lib"); } static int img_hash_cra_sha1_init(struct crypto_tfm *tfm) From ac9c408ed19d535289ca59200dd6a44a6a2d6036 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 16 Jun 2025 11:52:57 +0200 Subject: [PATCH 0541/3206] x86/vdso: Fix output operand size of RDPID RDPID instruction outputs to a word-sized register (64-bit on x86_64 and 32-bit on x86_32). Use an unsigned long variable to store the correct size. LSL outputs to 32-bit register, use %k operand prefix to always print the 32-bit name of the register. Use RDPID insn mnemonic while at it as the minimum binutils version of 2.30 supports it. [ bp: Merge two patches touching the same function into a single one. ] Fixes: ffebbaedc861 ("x86/vdso: Introduce helper functions for CPU and node number") Signed-off-by: Uros Bizjak Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250616095315.230620-1-ubizjak@gmail.com --- arch/x86/include/asm/segment.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 77d8f49b92bdd0..f59ae7186940a9 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -244,7 +244,7 @@ static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) { - unsigned int p; + unsigned long p; /* * Load CPU and node number from the GDT. LSL is faster than RDTSCP @@ -254,10 +254,10 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) * * If RDPID is available, use it. 
*/ - alternative_io ("lsl %[seg],%[p]", - ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + alternative_io ("lsl %[seg],%k[p]", + "rdpid %[p]", X86_FEATURE_RDPID, - [p] "=a" (p), [seg] "r" (__CPUNODE_SEG)); + [p] "=r" (p), [seg] "r" (__CPUNODE_SEG)); if (cpu) *cpu = (p & VDSO_CPUNODE_MASK); From b8efa810c1db201dece99e4113229e5ecc00db5c Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 21 Aug 2025 13:25:55 +0200 Subject: [PATCH 0542/3206] s390/bpf: Add s390 JIT support for timed may_goto The verifier provides an architecture-independent implementation of the may_goto instruction, which is currently used on s390x, but it has a downside: there is no way to prevent progs using it from running for a very long time. The solution to this problem is an alternative timed implementation, which requires architecture-specific bits. Its availability is signaled to the verifier by bpf_jit_supports_timed_may_goto() returning true. The verifier then emits a call to arch_bpf_timed_may_goto() using a non-standard calling convention. This function must act as a trampoline for bpf_check_timed_may_goto(). Implement bpf_jit_supports_timed_may_goto(), account for the special calling convention in the BPF_CALL implementation, and implement arch_bpf_timed_may_goto(). Signed-off-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20250821113339.292434-2-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- arch/s390/net/Makefile | 2 +- arch/s390/net/bpf_jit_comp.c | 25 ++++++++++++++--- arch/s390/net/bpf_timed_may_goto.S | 45 ++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 arch/s390/net/bpf_timed_may_goto.S diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile index 8cab6deb0403df..9275cf63192aa7 100644 --- a/arch/s390/net/Makefile +++ b/arch/s390/net/Makefile @@ -2,5 +2,5 @@ # # Arch-specific network modules # -obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o obj-$(CONFIG_HAVE_PNETID) += pnet.o diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index fd45f03a213cf1..8b57d8532f362e 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1806,10 +1806,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, } } - /* brasl %r14,func */ - EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, (void *)func); - /* lgr %b0,%r2: load return value into %b0 */ - EMIT4(0xb9040000, BPF_REG_0, REG_2); + if ((void *)func == arch_bpf_timed_may_goto) { + /* + * arch_bpf_timed_may_goto() has a special ABI: the + * parameters are in BPF_REG_AX and BPF_REG_10; the + * return value is in BPF_REG_AX; and all GPRs except + * REG_W0, REG_W1, and BPF_REG_AX are callee-saved. + */ + + /* brasl %r0,func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_0, (void *)func); + } else { + /* brasl %r14,func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, (void *)func); + /* lgr %b0,%r2: load return value into %b0 */ + EMIT4(0xb9040000, BPF_REG_0, REG_2); + } /* * Copy the potentially updated tail call counter back. 
@@ -2993,3 +3005,8 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *, u64, u64, u64), prev_addr = addr; } } + +bool bpf_jit_supports_timed_may_goto(void) +{ + return true; +} diff --git a/arch/s390/net/bpf_timed_may_goto.S b/arch/s390/net/bpf_timed_may_goto.S new file mode 100644 index 00000000000000..06f567a460d7bd --- /dev/null +++ b/arch/s390/net/bpf_timed_may_goto.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include + +#define R2_OFF 0 +#define R5_OFF (R2_OFF + (5 - 2 + 1) * 8) +#define R14_OFF (R5_OFF + 8) +#define RETADDR_OFF (R14_OFF + 8) +#define R15_OFF (RETADDR_OFF + 8) +#define BACKCHAIN_OFF (R15_OFF + 8) +#define FRAME_SIZE (BACKCHAIN_OFF + 8) +#define FRAME_OFF (STACK_FRAME_OVERHEAD - FRAME_SIZE) +#if (FRAME_OFF + BACKCHAIN_OFF) != __SF_BACKCHAIN +#error Stack frame layout calculation is broken +#endif + + GEN_BR_THUNK %r1 + +SYM_FUNC_START(arch_bpf_timed_may_goto) + /* + * This function has a special ABI: the parameters are in %r12 and + * %r13; the return value is in %r12; all GPRs except %r0, %r1, and + * %r12 are callee-saved; and the return address is in %r0. + */ + stmg %r2,%r5,FRAME_OFF+R2_OFF(%r15) + stg %r14,FRAME_OFF+R14_OFF(%r15) + stg %r0,FRAME_OFF+RETADDR_OFF(%r15) + stg %r15,FRAME_OFF+R15_OFF(%r15) + lgr %r1,%r15 + lay %r15,-FRAME_SIZE(%r15) + stg %r1,__SF_BACKCHAIN(%r15) + + lay %r2,0(%r12,%r13) + brasl %r14,bpf_check_timed_may_goto + lgr %r12,%r2 + + lg %r15,FRAME_SIZE+FRAME_OFF+R15_OFF(%r15) + lmg %r2,%r5,FRAME_OFF+R2_OFF(%r15) + lg %r14,FRAME_OFF+R14_OFF(%r15) + lg %r1,FRAME_OFF+RETADDR_OFF(%r15) + BR_EX %r1 +SYM_FUNC_END(arch_bpf_timed_may_goto) From b68dfcc12a320b85cd33d0d673ea620da08d0aec Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 21 Aug 2025 13:25:56 +0200 Subject: [PATCH 0543/3206] selftests/bpf: Add a missing newline to the "bad arch spec" message Fix error messages like this one: parse_test_spec:FAIL:569 bad arch spec: 's390x'process_subtest:FAIL:1153 Can't parse test spec for program 'may_goto_simple' Signed-off-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20250821113339.292434-3-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 78423cf89e01bb..e1987d1959fd89 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -566,7 +566,7 @@ static int parse_test_spec(struct test_loader *tester, } else if (strcmp(val, "RISCV64") == 0) { arch = ARCH_RISCV64; } else { - PRINT_FAIL("bad arch spec: '%s'", val); + PRINT_FAIL("bad arch spec: '%s'\n", val); err = -EINVAL; goto cleanup; } From 1e4e6b9e260dffc82fa49c2caa4828540082cb58 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 21 Aug 2025 13:25:57 +0200 Subject: [PATCH 0544/3206] selftests/bpf: Add __arch_s390x macro Make it possible to limit certain tests to s390x, just like it's already done for x86_64, arm64, and riscv64. 
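For illustration, a hypothetical s390x-only test using the new macro
could be written as follows (the test name and body here are made up;
the annotation style mirrors the existing __arch_x86_64 users):

  SEC("raw_tp")
  __description("s390x-only example")
  __arch_s390x
  __success
  __naked void s390x_only_example(void)
  {
          asm volatile (
          "       r0 = 0;"
          "       exit;"
          ::: __clobber_all);
  }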
Signed-off-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20250821113339.292434-4-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 1 + tools/testing/selftests/bpf/test_loader.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index c1cfd297aabf11..72c2d72a245e5b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -136,6 +136,7 @@ #define __arch_x86_64 __arch("X86_64") #define __arch_arm64 __arch("ARM64") #define __arch_riscv64 __arch("RISCV64") +#define __arch_s390x __arch("s390x") #define __caps_unpriv(caps) __attribute__((btf_decl_tag("comment:test_caps_unpriv=" EXPAND_QUOTE(caps)))) #define __load_if_JITed() __attribute__((btf_decl_tag("comment:load_mode=jited"))) #define __load_if_no_JITed() __attribute__((btf_decl_tag("comment:load_mode=no_jited"))) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index e1987d1959fd89..a9388ac8835871 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -374,6 +374,7 @@ enum arch { ARCH_X86_64 = 0x2, ARCH_ARM64 = 0x4, ARCH_RISCV64 = 0x8, + ARCH_S390X = 0x10, }; static int get_current_arch(void) @@ -384,6 +385,8 @@ static int get_current_arch(void) return ARCH_ARM64; #elif defined(__riscv) && __riscv_xlen == 64 return ARCH_RISCV64; +#elif defined(__s390x__) + return ARCH_S390X; #endif return ARCH_UNKNOWN; } @@ -565,6 +568,8 @@ static int parse_test_spec(struct test_loader *tester, arch = ARCH_ARM64; } else if (strcmp(val, "RISCV64") == 0) { arch = ARCH_RISCV64; + } else if (strcmp(val, "s390x") == 0) { + arch = ARCH_S390X; } else { PRINT_FAIL("bad arch spec: '%s'\n", val); err = -EINVAL; From 7197dbcba230c8a869e7f5bd250b8b7c9b3c0fa8 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 21 Aug 2025 13:25:58 +0200 Subject: [PATCH 0545/3206] selftests/bpf: Enable timed may_goto verifier tests on s390x Now that the timed may_goto implementation is available on s390x, enable the respective verifier tests. 
Signed-off-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20250821113339.292434-5-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/stream.c | 2 +- tools/testing/selftests/bpf/progs/verifier_may_goto_1.c | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/stream.c b/tools/testing/selftests/bpf/prog_tests/stream.c index d9f0185dca61b8..36a1a1ebde6929 100644 --- a/tools/testing/selftests/bpf/prog_tests/stream.c +++ b/tools/testing/selftests/bpf/prog_tests/stream.c @@ -77,7 +77,7 @@ void test_stream_errors(void) ASSERT_OK(ret, "ret"); ASSERT_OK(opts.retval, "retval"); -#if !defined(__x86_64__) +#if !defined(__x86_64__) && !defined(__s390x__) ASSERT_TRUE(1, "Timed may_goto unsupported, skip."); if (i == 0) { ret = bpf_prog_stream_read(prog_fd, 2, buf, sizeof(buf), &ropts); diff --git a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c index 3966d827f28892..cc1063863569d8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c +++ b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c @@ -9,6 +9,7 @@ SEC("raw_tp") __description("may_goto 0") __arch_x86_64 +__arch_s390x __xlated("0: r0 = 1") __xlated("1: exit") __success @@ -27,6 +28,7 @@ __naked void may_goto_simple(void) SEC("raw_tp") __description("batch 2 of may_goto 0") __arch_x86_64 +__arch_s390x __xlated("0: r0 = 1") __xlated("1: exit") __success @@ -47,6 +49,7 @@ __naked void may_goto_batch_0(void) SEC("raw_tp") __description("may_goto batch with offsets 2/1/0") __arch_x86_64 +__arch_s390x __xlated("0: r0 = 1") __xlated("1: exit") __success @@ -69,8 +72,9 @@ __naked void may_goto_batch_1(void) } SEC("raw_tp") -__description("may_goto batch with offsets 2/0 - x86_64") +__description("may_goto batch with offsets 2/0 - x86_64 and s390x") __arch_x86_64 +__arch_s390x __xlated("0: *(u64 *)(r10 -16) = 65535") __xlated("1: *(u64 *)(r10 -8) = 0") __xlated("2: r11 = *(u64 *)(r10 -16)") @@ -84,7 +88,7 @@ __xlated("9: r0 = 1") __xlated("10: r0 = 2") __xlated("11: exit") __success -__naked void may_goto_batch_2_x86_64(void) +__naked void may_goto_batch_2_x86_64_s390x(void) { asm volatile ( ".8byte %[may_goto1];" From 21bce56940549a2473134b44cd1d8e21da268e1e Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 21 Aug 2025 13:25:59 +0200 Subject: [PATCH 0546/3206] selftests/bpf: Remove may_goto tests from DENYLIST.s390x The may_goto instruction is now fully supported on s390x, including the timed implementation, so remove the respective test from the denylist. Signed-off-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20250821113339.292434-6-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 3ebd77206f98f9..a17baf8c6fd75a 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -2,4 +2,3 @@ # Alphabetical order get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?) 
-verifier_iterating_callbacks From bc986b1d756482a5ec2d7d9625229d9b9df95ae1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 21 Aug 2025 16:18:21 -0400 Subject: [PATCH 0547/3206] fs: stop accessing ->i_count directly in f2fs and gfs2 Instead of accessing ->i_count directly in these file systems, use the appropriate __iget and iput helpers. Signed-off-by: Josef Bacik Link: https://lore.kernel.org/b8e6eb8a3e690ce082828d3580415bf70dfa93aa.1755806649.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/f2fs/super.c | 4 ++-- fs/gfs2/ops_fstype.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1db024b20e29b4..2045642cfe3b44 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1750,7 +1750,7 @@ static int f2fs_drop_inode(struct inode *inode) if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ - atomic_inc(&inode->i_count); + __iget(inode); spin_unlock(&inode->i_lock); /* should remain fi->extent_tree for writepage */ @@ -1769,7 +1769,7 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); spin_lock(&inode->i_lock); - atomic_dec(&inode->i_count); + iput(inode); } trace_f2fs_drop_inode(inode, 0); return 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index efe99b73255137..c770006f888935 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1754,7 +1754,7 @@ static void gfs2_evict_inodes(struct super_block *sb) spin_unlock(&inode->i_lock); continue; } - atomic_inc(&inode->i_count); + __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); From c2ef7a03f5c8f0a612a8beff666483b725adf8fc Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 27 Aug 2025 13:12:36 +0300 Subject: [PATCH 0548/3206] mmc: sdio: Drop dev_pm_domain_detach() call Starting with commit f99508074e78 ("PM: domains: Detach on device_unbind_cleanup()"), there is no longer a need to call dev_pm_domain_detach() in the bus remove function. The device_unbind_cleanup() function now handles this to avoid invoking devres cleanup handlers while the PM domain is powered off, which could otherwise lead to failures as described in the above-mentioned commit. Drop the explicit dev_pm_domain_detach() call and rely instead on the flags passed to dev_pm_domain_attach() to power off the domain. 
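As a rough sketch of the probe-side counterpart this change relies on (the
PD_FLAG_* names are assumed from include/linux/pm_domain.h, not spelled out
by this patch), the power-off-on-detach behavior is requested once, at
attach time:

  static int example_bus_probe(struct device *dev)
  {
  	int ret;

  	/* Power the PM domain on for probe and let the PM core power it
  	 * off again when the device is unbound, so no explicit
  	 * dev_pm_domain_detach() call is needed on any later path.
  	 */
  	ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON |
  					PD_FLAG_DETACH_POWER_OFF);
  	if (ret)
  		return ret;

  	/* ...bus-specific probing continues here... */
  	return 0;
  }

With this arrangement, device_unbind_cleanup() performs the detach after the
devres cleanup handlers have run, which is exactly the ordering the
referenced commit establishes.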
Signed-off-by: Claudiu Beznea
Link: https://lore.kernel.org/r/20250827101236.927313-1-claudiu.beznea.uj@bp.renesas.com
Signed-off-by: Ulf Hansson
---
 drivers/mmc/core/sdio_bus.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index 656601754966b2..10799772494a2c 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -200,7 +200,6 @@ static int sdio_bus_probe(struct device *dev)
 	atomic_dec(&func->card->sdio_funcs_probed);
 	if (func->card->host->caps & MMC_CAP_POWER_OFF_CARD)
 		pm_runtime_put_noidle(dev);
-	dev_pm_domain_detach(dev, false);
 	return ret;
 }

@@ -231,8 +230,6 @@ static void sdio_bus_remove(struct device *dev)
 	/* Then undo the runtime PM settings in sdio_bus_probe() */
 	if (func->card->host->caps & MMC_CAP_POWER_OFF_CARD)
 		pm_runtime_put_sync(dev);
-
-	dev_pm_domain_detach(dev, false);
 }

 static const struct dev_pm_ops sdio_bus_pm_ops = {

From 9d81ba6d49a7457784f0b6a71046818b86ec7e44 Mon Sep 17 00:00:00 2001
From: Edward Adam Davis
Date: Wed, 27 Aug 2025 09:45:55 +0800
Subject: [PATCH 0549/3206] fuse: Block access to folio overlimit

syzbot reported a slab-out-of-bounds write in fuse_dev_do_write().

When the number of bytes to be retrieved is truncated to the fc->max_pages
upper limit and there is an offset, the out-of-bounds write is triggered.

Add a loop termination condition to prevent overruns.

Fixes: 3568a9569326 ("fuse: support large folios for retrieves")
Reported-by: syzbot+2d215d165f9354b9c4ea@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=2d215d165f9354b9c4ea
Tested-by: syzbot+2d215d165f9354b9c4ea@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis
Reviewed-by: Joanne Koong
Signed-off-by: Miklos Szeredi
---
 fs/fuse/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e80cd8f2c049f9..5150aa25e64be9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1893,7 +1893,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,

 	index = outarg->offset >> PAGE_SHIFT;

-	while (num) {
+	while (num && ap->num_folios < num_pages) {
 		struct folio *folio;
 		unsigned int folio_offset;
 		unsigned int nr_bytes;

From 9a52827a9bbbabb461e87bb41174a96a82c0e8ae Mon Sep 17 00:00:00 2001
From: Johan Hovold
Date: Tue, 8 Jul 2025 10:56:13 +0200
Subject: [PATCH 0550/3206] reset: eyeq: fix OF node leak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make sure to drop the OF node reference, taken when probing the auxiliary
device, when the device is later unbound.
Fixes: 487b1b32e317 ("reset: eyeq: add platform driver") Cc: Théo Lebrun Signed-off-by: Johan Hovold Reviewed-by: Philipp Zabel Link: https://lore.kernel.org/r/20250708085613.15823-1-johan@kernel.org Signed-off-by: Philipp Zabel --- drivers/reset/reset-eyeq.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/reset/reset-eyeq.c b/drivers/reset/reset-eyeq.c index 02d50041048b42..2d3998368a1c5e 100644 --- a/drivers/reset/reset-eyeq.c +++ b/drivers/reset/reset-eyeq.c @@ -410,6 +410,13 @@ static int eqr_of_xlate_twocells(struct reset_controller_dev *rcdev, return eqr_of_xlate_internal(rcdev, reset_spec->args[0], reset_spec->args[1]); } +static void eqr_of_node_put(void *_dev) +{ + struct device *dev = _dev; + + of_node_put(dev->of_node); +} + static int eqr_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { @@ -428,6 +435,10 @@ static int eqr_probe(struct auxiliary_device *adev, if (!dev->of_node) return -ENODEV; + ret = devm_add_action_or_reset(dev, eqr_of_node_put, dev); + if (ret) + return ret; + /* * Using our newfound OF node, we can get match data. We cannot use * device_get_match_data() because it does not match reused OF nodes. From d6b6aac0cdb4b4f81cccc531ed76211d56c17444 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 Aug 2025 15:28:55 -0700 Subject: [PATCH 0551/3206] lib/crypto: tests: Add KUnit tests for MD5 and HMAC-MD5 Add a KUnit test suite for the MD5 library functions, including the corresponding HMAC support. The core test logic is in the previously-added hash-test-template.h. This commit just adds the actual KUnit suite, and it adds the generated test vectors to the tree so that gen-hash-testvecs.py won't have to be run at build time. Link: https://lore.kernel.org/r/20250805222855.10362-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/tests/Kconfig | 10 ++ lib/crypto/tests/Makefile | 1 + lib/crypto/tests/md5-testvecs.h | 186 ++++++++++++++++++++++++++++++++ lib/crypto/tests/md5_kunit.c | 39 +++++++ 4 files changed, 236 insertions(+) create mode 100644 lib/crypto/tests/md5-testvecs.h create mode 100644 lib/crypto/tests/md5_kunit.c diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig index de7e8babb6afc5..c21d53fd4b0ce2 100644 --- a/lib/crypto/tests/Kconfig +++ b/lib/crypto/tests/Kconfig @@ -1,5 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-or-later +config CRYPTO_LIB_MD5_KUNIT_TEST + tristate "KUnit tests for MD5" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_MD5 + help + KUnit tests for the MD5 cryptographic hash function and its + corresponding HMAC. 
+ config CRYPTO_LIB_POLY1305_KUNIT_TEST tristate "KUnit tests for Poly1305" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile index 8601dccd6fddab..f6f82c6f9cb5da 100644 --- a/lib/crypto/tests/Makefile +++ b/lib/crypto/tests/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later +obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o obj-$(CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST) += sha224_kunit.o sha256_kunit.o diff --git a/lib/crypto/tests/md5-testvecs.h b/lib/crypto/tests/md5-testvecs.h new file mode 100644 index 00000000000000..be6727feb29660 --- /dev/null +++ b/lib/crypto/tests/md5-testvecs.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py md5 */ + +static const struct { + size_t data_len; + u8 digest[MD5_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04, + 0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e, + }, + }, + { + .data_len = 1, + .digest = { + 0x16, 0x7b, 0x86, 0xf2, 0x1d, 0xf3, 0x76, 0xc9, + 0x6f, 0x10, 0xa0, 0x61, 0x5b, 0x14, 0x20, 0x0b, + }, + }, + { + .data_len = 2, + .digest = { + 0x2d, 0x30, 0x96, 0xc7, 0x43, 0x40, 0xed, 0xb2, + 0xfb, 0x84, 0x63, 0x9a, 0xec, 0xc7, 0x3c, 0x3c, + }, + }, + { + .data_len = 3, + .digest = { + 0xe5, 0x0f, 0xce, 0xe0, 0xc8, 0xff, 0x4e, 0x08, + 0x5e, 0x19, 0xe5, 0xf2, 0x08, 0x11, 0x19, 0x16, + }, + }, + { + .data_len = 16, + .digest = { + 0xe8, 0xca, 0x29, 0x05, 0x2f, 0xd1, 0xf3, 0x99, + 0x40, 0x71, 0xf5, 0xc2, 0xf7, 0xf8, 0x17, 0x3e, + }, + }, + { + .data_len = 32, + .digest = { + 0xe3, 0x20, 0xc1, 0xd8, 0x21, 0x14, 0x44, 0x59, + 0x1a, 0xf5, 0x91, 0xaf, 0x69, 0xbe, 0x93, 0x9d, + }, + }, + { + .data_len = 48, + .digest = { + 0xfb, 0x06, 0xb0, 0xf0, 0x00, 0x10, 0x4b, 0x68, + 0x3d, 0x75, 0xf9, 0x70, 0xde, 0xbb, 0x32, 0x16, + }, + }, + { + .data_len = 49, + .digest = { + 0x52, 0x86, 0x48, 0x8b, 0xae, 0x91, 0x7c, 0x4e, + 0xc2, 0x2a, 0x69, 0x07, 0x35, 0xcc, 0xb2, 0x88, + }, + }, + { + .data_len = 63, + .digest = { + 0xfa, 0xd3, 0xf6, 0xe6, 0x7b, 0x1a, 0xc6, 0x05, + 0x73, 0x35, 0x02, 0xab, 0xc7, 0xb3, 0x47, 0xcb, + }, + }, + { + .data_len = 64, + .digest = { + 0xc5, 0x59, 0x29, 0xe9, 0x0a, 0x4a, 0x86, 0x43, + 0x7c, 0xaf, 0xdf, 0x83, 0xd3, 0xb8, 0x33, 0x5f, + }, + }, + { + .data_len = 65, + .digest = { + 0x80, 0x05, 0x75, 0x39, 0xec, 0x44, 0x8a, 0x81, + 0xe7, 0x6e, 0x8d, 0xd1, 0xc6, 0xeb, 0xc2, 0xf0, + }, + }, + { + .data_len = 127, + .digest = { + 0x3f, 0x02, 0xe8, 0xc6, 0xb8, 0x6a, 0x39, 0xc3, + 0xa4, 0x1c, 0xd9, 0x8f, 0x4a, 0x71, 0x40, 0x30, + }, + }, + { + .data_len = 128, + .digest = { + 0x89, 0x4f, 0x79, 0x3e, 0xff, 0x0c, 0x22, 0x60, + 0xa2, 0xdc, 0x10, 0x5f, 0x23, 0x0a, 0xe7, 0xc6, + }, + }, + { + .data_len = 129, + .digest = { + 0x06, 0x56, 0x61, 0xb8, 0x8a, 0x82, 0x77, 0x1b, + 0x2c, 0x35, 0xb8, 0x9f, 0xd6, 0xf7, 0xbd, 0x5a, + }, + }, + { + .data_len = 256, + .digest = { + 0x5d, 0xdf, 0x7d, 0xc8, 0x43, 0x96, 0x3b, 0xdb, + 0xc7, 0x0e, 0x44, 0x42, 0x23, 0xf7, 0xed, 0xdf, + }, + }, + { + .data_len = 511, + .digest = { + 0xf6, 0x5f, 0x26, 0x51, 0x8a, 0x5a, 0x46, 0x8f, + 0x48, 0x72, 0x90, 0x74, 0x9d, 0x87, 0xbd, 0xdf, + }, + }, + { + .data_len = 513, + .digest = { + 0xd8, 0x2c, 0xc9, 0x76, 0xfa, 0x67, 0x2e, 0xa6, + 0xc8, 0x12, 0x4a, 0x64, 0xaa, 0x0b, 0x3d, 0xbd, + }, + }, + { + .data_len = 1000, + .digest = { 
+ 0xe2, 0x7e, 0xb4, 0x5f, 0xe1, 0x74, 0x51, 0xfc, + 0xe0, 0xc8, 0xd5, 0xe6, 0x8b, 0x40, 0xd2, 0x0e, + }, + }, + { + .data_len = 3333, + .digest = { + 0xcd, 0x7d, 0x56, 0xa9, 0x4c, 0x47, 0xea, 0xc2, + 0x34, 0x0b, 0x84, 0x05, 0xf9, 0xad, 0xbb, 0x46, + }, + }, + { + .data_len = 4096, + .digest = { + 0x63, 0x6e, 0x58, 0xb3, 0x94, 0x6b, 0x83, 0x5f, + 0x1f, 0x0e, 0xd3, 0x66, 0x78, 0x71, 0x98, 0x42, + }, + }, + { + .data_len = 4128, + .digest = { + 0x9d, 0x68, 0xfc, 0x26, 0x8b, 0x4c, 0xa8, 0xe7, + 0x30, 0x0b, 0x19, 0x52, 0x6e, 0xa5, 0x65, 0x1c, + }, + }, + { + .data_len = 4160, + .digest = { + 0x1c, 0xaa, 0x7d, 0xee, 0x91, 0x01, 0xe2, 0x5a, + 0xec, 0xe9, 0xde, 0x57, 0x0a, 0xb6, 0x4c, 0x2f, + }, + }, + { + .data_len = 4224, + .digest = { + 0x1b, 0x31, 0xe3, 0x14, 0x07, 0x16, 0x17, 0xc6, + 0x98, 0x79, 0x88, 0x23, 0xb6, 0x3b, 0x25, 0xc4, + }, + }, + { + .data_len = 16384, + .digest = { + 0xc6, 0x3d, 0x56, 0x90, 0xf0, 0xf6, 0xe6, 0x50, + 0xf4, 0x76, 0x78, 0x67, 0xa3, 0xdd, 0x62, 0x7b, + }, + }, +}; + +static const u8 hash_testvec_consolidated[MD5_DIGEST_SIZE] = { + 0x70, 0x86, 0x9e, 0x6c, 0xa4, 0xc6, 0x71, 0x43, + 0x26, 0x02, 0x1b, 0x3f, 0xfd, 0x56, 0x9f, 0xa6, +}; + +static const u8 hmac_testvec_consolidated[MD5_DIGEST_SIZE] = { + 0x10, 0x02, 0x74, 0xf6, 0x4d, 0xb3, 0x3c, 0xc7, + 0xa1, 0xf7, 0xe6, 0xd4, 0x32, 0x64, 0xfa, 0x6d, +}; diff --git a/lib/crypto/tests/md5_kunit.c b/lib/crypto/tests/md5_kunit.c new file mode 100644 index 00000000000000..38bd52c25ae3e5 --- /dev/null +++ b/lib/crypto/tests/md5_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include +#include "md5-testvecs.h" + +#define HASH md5 +#define HASH_CTX md5_ctx +#define HASH_SIZE MD5_DIGEST_SIZE +#define HASH_INIT md5_init +#define HASH_UPDATE md5_update +#define HASH_FINAL md5_final +#define HMAC_KEY hmac_md5_key +#define HMAC_CTX hmac_md5_ctx +#define HMAC_PREPAREKEY hmac_md5_preparekey +#define HMAC_INIT hmac_md5_init +#define HMAC_UPDATE hmac_md5_update +#define HMAC_FINAL hmac_md5_final +#define HMAC hmac_md5 +#define HMAC_USINGRAWKEY hmac_md5_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite hash_test_suite = { + .name = "md5", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for MD5 and HMAC-MD5"); +MODULE_LICENSE("GPL"); From 5012bd2dc6ab0c4499923b3b6c6113def9b0c88b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Aug 2025 19:04:57 -0700 Subject: [PATCH 0552/3206] lib/crypto: Drop inline from all *_mod_init_arch() functions Drop 'inline' from all the *_mod_init_arch() functions so that the compiler will warn about any bugs where they are unused due to not being wired up properly. (There are no such bugs currently, so this just establishes a more robust convention for the future. Of course, these functions also tend to get inlined anyway, regardless of the keyword.) 
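The effect is easy to demonstrate in isolation (a standalone sketch, not
from the kernel tree):

  /* With -Wunused-function (part of -Wall), both GCC and Clang warn if a
   * plain static function is never called or referenced:
   */
  static void forgotten_init(void)	/* warning: defined but not used */
  {
  }

  /* The same function marked inline is silently discarded instead, which
   * would hide a *_mod_init_arch() that was never wired up:
   */
  static inline void forgotten_init_inline(void)
  {
  }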
Link: https://lore.kernel.org/r/20250816020457.432040-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm/sha1.h | 2 +- lib/crypto/arm/sha256.h | 2 +- lib/crypto/arm/sha512.h | 2 +- lib/crypto/arm64/sha1.h | 2 +- lib/crypto/arm64/sha256.h | 2 +- lib/crypto/arm64/sha512.h | 2 +- lib/crypto/riscv/sha256.h | 2 +- lib/crypto/riscv/sha512.h | 2 +- lib/crypto/s390/sha1.h | 2 +- lib/crypto/s390/sha256.h | 2 +- lib/crypto/s390/sha512.h | 2 +- lib/crypto/sparc/md5.h | 2 +- lib/crypto/sparc/sha1.h | 2 +- lib/crypto/sparc/sha256.h | 2 +- lib/crypto/sparc/sha512.h | 2 +- lib/crypto/x86/sha1.h | 2 +- lib/crypto/x86/sha256.h | 2 +- lib/crypto/x86/sha512.h | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h index fa1e9241900062..29f8bcad0447c3 100644 --- a/lib/crypto/arm/sha1.h +++ b/lib/crypto/arm/sha1.h @@ -35,7 +35,7 @@ static void sha1_blocks(struct sha1_block_state *state, #ifdef CONFIG_KERNEL_MODE_NEON #define sha1_mod_init_arch sha1_mod_init_arch -static inline void sha1_mod_init_arch(void) +static void sha1_mod_init_arch(void) { if (elf_hwcap & HWCAP_NEON) { static_branch_enable(&have_neon); diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h index eab713e650f33e..7556457b3094b4 100644 --- a/lib/crypto/arm/sha256.h +++ b/lib/crypto/arm/sha256.h @@ -35,7 +35,7 @@ static void sha256_blocks(struct sha256_block_state *state, #ifdef CONFIG_KERNEL_MODE_NEON #define sha256_mod_init_arch sha256_mod_init_arch -static inline void sha256_mod_init_arch(void) +static void sha256_mod_init_arch(void) { if (elf_hwcap & HWCAP_NEON) { static_branch_enable(&have_neon); diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h index cc2447acd56213..d1b485dd275db8 100644 --- a/lib/crypto/arm/sha512.h +++ b/lib/crypto/arm/sha512.h @@ -29,7 +29,7 @@ static void sha512_blocks(struct sha512_block_state *state, #ifdef CONFIG_KERNEL_MODE_NEON #define sha512_mod_init_arch sha512_mod_init_arch -static inline void sha512_mod_init_arch(void) +static void sha512_mod_init_arch(void) { if (cpu_has_neon()) static_branch_enable(&have_neon); diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h index f822563538cc87..aaef4ebfc5e34a 100644 --- a/lib/crypto/arm64/sha1.h +++ b/lib/crypto/arm64/sha1.h @@ -32,7 +32,7 @@ static void sha1_blocks(struct sha1_block_state *state, } #define sha1_mod_init_arch sha1_mod_init_arch -static inline void sha1_mod_init_arch(void) +static void sha1_mod_init_arch(void) { if (cpu_have_named_feature(SHA1)) static_branch_enable(&have_ce); diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h index d95f1077c32bdf..be4aeda9d0e6ee 100644 --- a/lib/crypto/arm64/sha256.h +++ b/lib/crypto/arm64/sha256.h @@ -46,7 +46,7 @@ static void sha256_blocks(struct sha256_block_state *state, #ifdef CONFIG_KERNEL_MODE_NEON #define sha256_mod_init_arch sha256_mod_init_arch -static inline void sha256_mod_init_arch(void) +static void sha256_mod_init_arch(void) { if (cpu_have_named_feature(ASIMD)) { static_branch_enable(&have_neon); diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h index 7539ea3fef10d0..ddb0d256f73aa3 100644 --- a/lib/crypto/arm64/sha512.h +++ b/lib/crypto/arm64/sha512.h @@ -37,7 +37,7 @@ static void sha512_blocks(struct sha512_block_state *state, #ifdef CONFIG_KERNEL_MODE_NEON #define sha512_mod_init_arch sha512_mod_init_arch -static inline void sha512_mod_init_arch(void) +static void sha512_mod_init_arch(void) { if (cpu_have_named_feature(SHA512)) 
static_branch_enable(&have_sha512_insns); diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h index f36f68d2e88cc7..1def18b0a4fb55 100644 --- a/lib/crypto/riscv/sha256.h +++ b/lib/crypto/riscv/sha256.h @@ -31,7 +31,7 @@ static void sha256_blocks(struct sha256_block_state *state, } #define sha256_mod_init_arch sha256_mod_init_arch -static inline void sha256_mod_init_arch(void) +static void sha256_mod_init_arch(void) { /* Both zvknha and zvknhb provide the SHA-256 instructions. */ if ((riscv_isa_extension_available(NULL, ZVKNHA) || diff --git a/lib/crypto/riscv/sha512.h b/lib/crypto/riscv/sha512.h index 59dc0294a9a7e9..145bdab1214e37 100644 --- a/lib/crypto/riscv/sha512.h +++ b/lib/crypto/riscv/sha512.h @@ -30,7 +30,7 @@ static void sha512_blocks(struct sha512_block_state *state, } #define sha512_mod_init_arch sha512_mod_init_arch -static inline void sha512_mod_init_arch(void) +static void sha512_mod_init_arch(void) { if (riscv_isa_extension_available(NULL, ZVKNHB) && riscv_isa_extension_available(NULL, ZVKB) && diff --git a/lib/crypto/s390/sha1.h b/lib/crypto/s390/sha1.h index 08bd138e881cce..73d94476a157a8 100644 --- a/lib/crypto/s390/sha1.h +++ b/lib/crypto/s390/sha1.h @@ -20,7 +20,7 @@ static void sha1_blocks(struct sha1_block_state *state, } #define sha1_mod_init_arch sha1_mod_init_arch -static inline void sha1_mod_init_arch(void) +static void sha1_mod_init_arch(void) { if (cpu_have_feature(S390_CPU_FEATURE_MSA) && cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) diff --git a/lib/crypto/s390/sha256.h b/lib/crypto/s390/sha256.h index 70a81cbc06b2c2..acd48350878975 100644 --- a/lib/crypto/s390/sha256.h +++ b/lib/crypto/s390/sha256.h @@ -20,7 +20,7 @@ static void sha256_blocks(struct sha256_block_state *state, } #define sha256_mod_init_arch sha256_mod_init_arch -static inline void sha256_mod_init_arch(void) +static void sha256_mod_init_arch(void) { if (cpu_have_feature(S390_CPU_FEATURE_MSA) && cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) diff --git a/lib/crypto/s390/sha512.h b/lib/crypto/s390/sha512.h index 24744651550cbd..46699d43df7eb6 100644 --- a/lib/crypto/s390/sha512.h +++ b/lib/crypto/s390/sha512.h @@ -20,7 +20,7 @@ static void sha512_blocks(struct sha512_block_state *state, } #define sha512_mod_init_arch sha512_mod_init_arch -static inline void sha512_mod_init_arch(void) +static void sha512_mod_init_arch(void) { if (cpu_have_feature(S390_CPU_FEATURE_MSA) && cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) diff --git a/lib/crypto/sparc/md5.h b/lib/crypto/sparc/md5.h index 3f1b0ed8c0b3f3..3995f3e075eb6b 100644 --- a/lib/crypto/sparc/md5.h +++ b/lib/crypto/sparc/md5.h @@ -32,7 +32,7 @@ static void md5_blocks(struct md5_block_state *state, } #define md5_mod_init_arch md5_mod_init_arch -static inline void md5_mod_init_arch(void) +static void md5_mod_init_arch(void) { unsigned long cfr; diff --git a/lib/crypto/sparc/sha1.h b/lib/crypto/sparc/sha1.h index 5015f93584b7e3..bdf771fcc1f73a 100644 --- a/lib/crypto/sparc/sha1.h +++ b/lib/crypto/sparc/sha1.h @@ -27,7 +27,7 @@ static void sha1_blocks(struct sha1_block_state *state, } #define sha1_mod_init_arch sha1_mod_init_arch -static inline void sha1_mod_init_arch(void) +static void sha1_mod_init_arch(void) { unsigned long cfr; diff --git a/lib/crypto/sparc/sha256.h b/lib/crypto/sparc/sha256.h index 1d10108eb19543..b2f4419ec77810 100644 --- a/lib/crypto/sparc/sha256.h +++ b/lib/crypto/sparc/sha256.h @@ -27,7 +27,7 @@ static void sha256_blocks(struct sha256_block_state *state, } #define sha256_mod_init_arch 
sha256_mod_init_arch
-static inline void sha256_mod_init_arch(void)
+static void sha256_mod_init_arch(void)
 {
 	unsigned long cfr;

diff --git a/lib/crypto/sparc/sha512.h b/lib/crypto/sparc/sha512.h
index 55303ab6b15f7d..a8c37a7d4c3937 100644
--- a/lib/crypto/sparc/sha512.h
+++ b/lib/crypto/sparc/sha512.h
@@ -26,7 +26,7 @@ static void sha512_blocks(struct sha512_block_state *state,
 }

 #define sha512_mod_init_arch sha512_mod_init_arch
-static inline void sha512_mod_init_arch(void)
+static void sha512_mod_init_arch(void)
 {
 	unsigned long cfr;

diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h
index e308379d89bcf5..c48a0131fd12c9 100644
--- a/lib/crypto/x86/sha1.h
+++ b/lib/crypto/x86/sha1.h
@@ -55,7 +55,7 @@ static void sha1_blocks(struct sha1_block_state *state,
 }

 #define sha1_mod_init_arch sha1_mod_init_arch
-static inline void sha1_mod_init_arch(void)
+static void sha1_mod_init_arch(void)
 {
 	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
 		static_call_update(sha1_blocks_x86, sha1_blocks_ni);
diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h
index c852396ef31908..41fa95fbc3bf85 100644
--- a/lib/crypto/x86/sha256.h
+++ b/lib/crypto/x86/sha256.h
@@ -36,7 +36,7 @@ static void sha256_blocks(struct sha256_block_state *state,
 }

 #define sha256_mod_init_arch sha256_mod_init_arch
-static inline void sha256_mod_init_arch(void)
+static void sha256_mod_init_arch(void)
 {
 	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
 		static_call_update(sha256_blocks_x86, sha256_blocks_ni);
diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h
index be2c8fc1224699..0213c70cedd01c 100644
--- a/lib/crypto/x86/sha512.h
+++ b/lib/crypto/x86/sha512.h
@@ -35,7 +35,7 @@ static void sha512_blocks(struct sha512_block_state *state,
 }

 #define sha512_mod_init_arch sha512_mod_init_arch
-static inline void sha512_mod_init_arch(void)
+static void sha512_mod_init_arch(void)
 {
 	if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
 	    boot_cpu_has(X86_FEATURE_AVX)) {

From 3b0dec689a6301845761681b852f9538cb75a1d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Koutn=C3=BD?=
Date: Wed, 27 Aug 2025 17:53:00 +0200
Subject: [PATCH 0553/3206] selftests: cgroup: Make test_pids backwards compatible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The predicates in the test expect event counting from 73e75e6fc352b
("cgroup/pids: Separate semantics of pids.events related to pids.max") and
the test would fail on older kernels. We want to have one version of the
tests for all kernels, so detect the feature and skip the test on old
kernels.

(The test could even switch to check v1 semantics based on the flag but
keep it simple for now.)
Fixes: 9f34c566027b6 ("selftests: cgroup: Add basic tests for pids controller")
Signed-off-by: Michal Koutný
Tested-by: Sebastian Chlad
Signed-off-by: Tejun Heo
---
 tools/testing/selftests/cgroup/lib/cgroup_util.c | 12 ++++++++++++
 .../selftests/cgroup/lib/include/cgroup_util.h   |  1 +
 tools/testing/selftests/cgroup/test_pids.c       |  3 +++
 3 files changed, 16 insertions(+)

diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c
index 0e89fcff4d05d3..44c52f620fda17 100644
--- a/tools/testing/selftests/cgroup/lib/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c
@@ -522,6 +522,18 @@ int proc_mount_contains(const char *option)
 	return strstr(buf, option) != NULL;
 }

+int cgroup_feature(const char *feature)
+{
+	char buf[PAGE_SIZE];
+	ssize_t read;
+
+	read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf));
+	if (read < 0)
+		return read;
+
+	return strstr(buf, feature) != NULL;
+}
+
 ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
 {
 	char path[PATH_MAX];
diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
index c69cab66254b41..9dc90a1b386d77 100644
--- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
@@ -60,6 +60,7 @@ extern int cg_run_nowait(const char *cgroup,
 extern int cg_wait_for_proc_count(const char *cgroup, int count);
 extern int cg_killall(const char *cgroup);
 int proc_mount_contains(const char *option);
+int cgroup_feature(const char *feature);
 extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
 extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
 extern pid_t clone_into_cgroup(int cgroup_fd);
diff --git a/tools/testing/selftests/cgroup/test_pids.c b/tools/testing/selftests/cgroup/test_pids.c
index 9ecb83c6cc5cbf..d8a1d1cd500727 100644
--- a/tools/testing/selftests/cgroup/test_pids.c
+++ b/tools/testing/selftests/cgroup/test_pids.c
@@ -77,6 +77,9 @@ static int test_pids_events(const char *root)
 	char *cg_parent = NULL, *cg_child = NULL;
 	int pid;

+	if (cgroup_feature("pids_localevents") <= 0)
+		return KSFT_SKIP;
+
 	cg_parent = cg_name(root, "pids_parent");
 	cg_child = cg_name(cg_parent, "pids_child");
 	if (!cg_parent || !cg_child)

From 1df7dad4d5c49335b72e26d833def960b2de76e3 Mon Sep 17 00:00:00 2001
From: Nandakumar Edamana
Date: Tue, 26 Aug 2025 09:15:23 +0530
Subject: [PATCH 0554/3206] bpf: Improve the general precision of tnum_mul

Drop the value-mask decomposition technique and adopt straightforward
long multiplication with a twist: when LSB(a) is uncertain, find the two
partial products (for LSB(a) = known 0 and LSB(a) = known 1) and take a
union.

Experiments show that applying this technique to long multiplication
improves precision in a significant number of cases (at the cost of
losing precision in a relatively small number of cases).
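The effect is easy to reproduce in a standalone userspace sketch (the
helpers below mirror kernel/bpf/tnum.c; the harness itself is illustrative,
not part of this patch) on the "3 * x1" example from the new code comment:

  #include <stdio.h>
  #include <stdint.h>

  struct tnum { uint64_t value; uint64_t mask; };	/* mask bits = unknown bits */
  #define TNUM(v, m) ((struct tnum){ .value = (v), .mask = (m) })

  /* Mirrors the kernel's tnum_add(). */
  static struct tnum tnum_add(struct tnum a, struct tnum b)
  {
  	uint64_t sm = a.mask + b.mask;
  	uint64_t sv = a.value + b.value;
  	uint64_t sigma = sm + sv;
  	uint64_t chi = sigma ^ sv;
  	uint64_t mu = chi | a.mask | b.mask;

  	return TNUM(sv & ~mu, mu);
  }

  /* Mirrors the tnum_union() added by this patch. */
  static struct tnum tnum_union(struct tnum a, struct tnum b)
  {
  	uint64_t v = a.value & b.value;
  	uint64_t mu = (a.value ^ b.value) | a.mask | b.mask;

  	return TNUM(v & ~mu, mu);
  }

  /* The new long-multiplication loop. */
  static struct tnum tnum_mul(struct tnum a, struct tnum b)
  {
  	struct tnum acc = TNUM(0, 0);

  	while (a.value || a.mask) {
  		if (a.value & 1)
  			acc = tnum_add(acc, b);		/* LSB(a) certainly 1 */
  		else if (a.mask & 1)			/* LSB(a) unknown */
  			acc = tnum_union(acc, tnum_add(acc, b));
  		a = TNUM(a.value >> 1, a.mask >> 1);
  		b = TNUM(b.value << 1, b.mask << 1);
  	}
  	return acc;
  }

  int main(void)
  {
  	/* x1 (value 0x1, mask 0x2) times 3: prints value=0x1 mask=0xa,
  	 * i.e. the tnum x0x1 from the example in the code comment.
  	 */
  	struct tnum r = tnum_mul(TNUM(0x1, 0x2), TNUM(0x3, 0x0));

  	printf("value=0x%llx mask=0x%llx\n",
  	       (unsigned long long)r.value, (unsigned long long)r.mask);
  	return 0;
  }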
Signed-off-by: Nandakumar Edamana Signed-off-by: Andrii Nakryiko Tested-by: Harishankar Vishwanathan Reviewed-by: Harishankar Vishwanathan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20250826034524.2159515-1-nandakumar@nandakumar.co.in --- include/linux/tnum.h | 3 +++ kernel/bpf/tnum.c | 55 +++++++++++++++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 0ffb77ffe0e873..c52b862dad45be 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -57,6 +57,9 @@ bool tnum_overlap(struct tnum a, struct tnum b); /* Return a tnum representing numbers satisfying both @a and @b */ struct tnum tnum_intersect(struct tnum a, struct tnum b); +/* Returns a tnum representing numbers satisfying either @a or @b */ +struct tnum tnum_union(struct tnum t1, struct tnum t2); + /* Return @a with all but the lowest @size bytes cleared */ struct tnum tnum_cast(struct tnum a, u8 size); diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index d9328bbb3680b5..f8e70e9c3998d4 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -116,31 +116,47 @@ struct tnum tnum_xor(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } -/* Generate partial products by multiplying each bit in the multiplier (tnum a) - * with the multiplicand (tnum b), and add the partial products after - * appropriately bit-shifting them. Instead of directly performing tnum addition - * on the generated partial products, equivalenty, decompose each partial - * product into two tnums, consisting of the value-sum (acc_v) and the - * mask-sum (acc_m) and then perform tnum addition on them. The following paper - * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398. +/* Perform long multiplication, iterating through the bits in a using rshift: + * - if LSB(a) is a known 0, keep current accumulator + * - if LSB(a) is a known 1, add b to current accumulator + * - if LSB(a) is unknown, take a union of the above cases. + * + * For example: + * + * acc_0: acc_1: + * + * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1 + * x1 01 11 + * ------ ------ ------ + * 11 11 11 + * xx 00 11 + * ------ ------ ------ + * ???? 0011 1001 */ struct tnum tnum_mul(struct tnum a, struct tnum b) { - u64 acc_v = a.value * b.value; - struct tnum acc_m = TNUM(0, 0); + struct tnum acc = TNUM(0, 0); while (a.value || a.mask) { /* LSB of tnum a is a certain 1 */ if (a.value & 1) - acc_m = tnum_add(acc_m, TNUM(0, b.mask)); + acc = tnum_add(acc, b); /* LSB of tnum a is uncertain */ - else if (a.mask & 1) - acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask)); + else if (a.mask & 1) { + /* acc = tnum_union(acc_0, acc_1), where acc_0 and + * acc_1 are partial accumulators for cases + * LSB(a) = certain 0 and LSB(a) = certain 1. + * acc_0 = acc + 0 * b = acc. + * acc_1 = acc + 1 * b = tnum_add(acc, b). + */ + + acc = tnum_union(acc, tnum_add(acc, b)); + } /* Note: no case for LSB is certain 0 */ a = tnum_rshift(a, 1); b = tnum_lshift(b, 1); } - return tnum_add(TNUM(acc_v, 0), acc_m); + return acc; } bool tnum_overlap(struct tnum a, struct tnum b) @@ -163,6 +179,19 @@ struct tnum tnum_intersect(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } +/* Returns a tnum with the uncertainty from both a and b, and in addition, new + * uncertainty at any position that a and b disagree. This represents a + * superset of the union of the concrete sets of both a and b. Despite the + * overapproximation, it is optimal. 
+ */ +struct tnum tnum_union(struct tnum a, struct tnum b) +{ + u64 v = a.value & b.value; + u64 mu = (a.value ^ b.value) | a.mask | b.mask; + + return TNUM(v & ~mu, mu); +} + struct tnum tnum_cast(struct tnum a, u8 size) { a.value &= (1ULL << (size * 8)) - 1; From 2660b9d477501c3dbb09705640b70d0a80072f0c Mon Sep 17 00:00:00 2001 From: Nandakumar Edamana Date: Tue, 26 Aug 2025 09:15:24 +0530 Subject: [PATCH 0555/3206] bpf: Add selftest to check the verifier's abstract multiplication Add new selftest to test the abstract multiplication technique(s) used by the verifier, following the recent improvement in tnum multiplication (tnum_mul). One of the newly added programs, verifier_mul/mul_precise, results in a false positive with the old tnum_mul, while the program passes with the latest one. Signed-off-by: Nandakumar Edamana Signed-off-by: Andrii Nakryiko Reviewed-by: Harishankar Vishwanathan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20250826034524.2159515-2-nandakumar@nandakumar.co.in --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_mul.c | 38 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_mul.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 77ec95d4ffaae9..e35c216dbaf21c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -59,6 +59,7 @@ #include "verifier_meta_access.skel.h" #include "verifier_movsx.skel.h" #include "verifier_mtu.skel.h" +#include "verifier_mul.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" #include "verifier_bpf_fastcall.skel.h" @@ -194,6 +195,7 @@ void test_verifier_may_goto_1(void) { RUN(verifier_may_goto_1); } void test_verifier_may_goto_2(void) { RUN(verifier_may_goto_2); } void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } +void test_verifier_mul(void) { RUN(verifier_mul); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } void test_verifier_bpf_fastcall(void) { RUN(verifier_bpf_fastcall); } diff --git a/tools/testing/selftests/bpf/progs/verifier_mul.c b/tools/testing/selftests/bpf/progs/verifier_mul.c new file mode 100644 index 00000000000000..7145fe3351d5c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_mul.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Nandakumar Edamana */ +#include +#include +#include +#include "bpf_misc.h" + +/* Intended to test the abstract multiplication technique(s) used by + * the verifier. Using assembly to avoid compiler optimizations. + */ +SEC("fentry/bpf_fentry_test1") +void BPF_PROG(mul_precise, int x) +{ + /* First, force the verifier to be uncertain about the value: + * unsigned int a = (bpf_get_prandom_u32() & 0x2) | 0x1; + * + * Assuming the verifier is using tnum, a must be tnum{.v=0x1, .m=0x2}. + * Then a * 0x3 would be m0m1 (m for uncertain). 
Added imprecision
+	 * would cause the following to fail, because the required return value
+	 * is 0:
+	 *     return ((a * 0x3) & 0x4);
+	 */
+	asm volatile ("\
+	call %[bpf_get_prandom_u32];\
+	r0 &= 0x2;\
+	r0 |= 0x1;\
+	r0 *= 0x3;\
+	r0 &= 0x4;\
+	if r0 != 0 goto l0_%=;\
+	r0 = 0;\
+	goto l1_%=;\
+l0_%=:\
+	r0 = 1;\
+l1_%=:\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}

From d3abefe897408718799ae3bd06295b89b870a38e Mon Sep 17 00:00:00 2001
From: Shubham Sharma
Date: Tue, 26 Aug 2025 18:27:46 +0530
Subject: [PATCH 0556/3206] selftests/bpf: Fix typos and grammar in test sources

Fix spelling typos and grammar errors in BPF selftests source code.

Signed-off-by: Shubham Sharma
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20250826125746.17983-1-slopixelz@gmail.com
---
 tools/testing/selftests/bpf/Makefile                       | 2 +-
 tools/testing/selftests/bpf/bench.c                        | 2 +-
 tools/testing/selftests/bpf/prog_tests/btf_dump.c          | 2 +-
 tools/testing/selftests/bpf/prog_tests/fd_array.c          | 2 +-
 .../testing/selftests/bpf/prog_tests/kprobe_multi_test.c   | 2 +-
 tools/testing/selftests/bpf/prog_tests/module_attach.c     | 2 +-
 tools/testing/selftests/bpf/prog_tests/reg_bounds.c        | 4 ++--
 .../selftests/bpf/prog_tests/stacktrace_build_id.c         | 2 +-
 .../selftests/bpf/prog_tests/stacktrace_build_id_nmi.c     | 2 +-
 tools/testing/selftests/bpf/prog_tests/stacktrace_map.c    | 2 +-
 .../selftests/bpf/prog_tests/stacktrace_map_raw_tp.c       | 2 +-
 .../selftests/bpf/prog_tests/stacktrace_map_skip.c         | 2 +-
 tools/testing/selftests/bpf/progs/bpf_cc_cubic.c           | 2 +-
 tools/testing/selftests/bpf/progs/bpf_dctcp.c              | 2 +-
 .../selftests/bpf/progs/freplace_connect_v4_prog.c         | 2 +-
 tools/testing/selftests/bpf/progs/iters_state_safety.c     | 2 +-
 tools/testing/selftests/bpf/progs/rbtree_search.c          | 2 +-
 .../testing/selftests/bpf/progs/struct_ops_kptr_return.c   | 2 +-
 tools/testing/selftests/bpf/progs/struct_ops_refcounted.c  | 2 +-
 tools/testing/selftests/bpf/progs/test_cls_redirect.c      | 2 +-
 .../selftests/bpf/progs/test_cls_redirect_dynptr.c         | 2 +-
 tools/testing/selftests/bpf/progs/uretprobe_stack.c        | 4 ++--
 tools/testing/selftests/bpf/progs/verifier_scalar_ids.c    | 2 +-
 tools/testing/selftests/bpf/progs/verifier_var_off.c       | 6 +++---
 tools/testing/selftests/bpf/test_sockmap.c                 | 2 +-
 tools/testing/selftests/bpf/verifier/calls.c               | 8 ++++----
 tools/testing/selftests/bpf/xdping.c                       | 2 +-
 tools/testing/selftests/bpf/xsk.h                          | 4 ++--
 28 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 77794efc020ea6..fd6b370c816984 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -398,7 +398,7 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
 		    DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers
 endif

-# vmlinux.h is first dumped to a temprorary file and then compared to
+# vmlinux.h is first dumped to a temporary file and then compared to
This helps to avoid unnecessary re-builds of # $(TRUNNER_BPF_OBJS) $(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index ddd73d06a1eb27..3ecc226ea7b25d 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -499,7 +499,7 @@ extern const struct bench bench_rename_rawtp; extern const struct bench bench_rename_fentry; extern const struct bench bench_rename_fexit; -/* pure counting benchmarks to establish theoretical lmits */ +/* pure counting benchmarks to establish theoretical limits */ extern const struct bench bench_trig_usermode_count; extern const struct bench bench_trig_syscall_count; extern const struct bench bench_trig_kernel_count; diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 82903585c8700c..10cba526d3e631 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -63,7 +63,7 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t) /* tests with t->known_ptr_sz have no "long" or "unsigned long" type, * so it's impossible to determine correct pointer size; but if they - * do, it should be 8 regardless of host architecture, becaues BPF + * do, it should be 8 regardless of host architecture, because BPF * target is always 64-bit */ if (!t->known_ptr_sz) { diff --git a/tools/testing/selftests/bpf/prog_tests/fd_array.c b/tools/testing/selftests/bpf/prog_tests/fd_array.c index 241b2c8c6e0f15..c534b4d5f9da80 100644 --- a/tools/testing/selftests/bpf/prog_tests/fd_array.c +++ b/tools/testing/selftests/bpf/prog_tests/fd_array.c @@ -293,7 +293,7 @@ static int get_btf_id_by_fd(int btf_fd, __u32 *id) * 1) Create a new btf, it's referenced only by a file descriptor, so refcnt=1 * 2) Load a BPF prog with fd_array[0] = btf_fd; now btf's refcnt=2 * 3) Close the btf_fd, now refcnt=1 - * Wait and check that BTF stil exists. + * Wait and check that BTF still exists. */ static void check_fd_array_cnt__referenced_btfs(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index e19ef509ebf85e..f377bea0b82d4f 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -463,7 +463,7 @@ static bool skip_entry(char *name) return false; } -/* Do comparision by ignoring '.llvm.' suffixes. */ +/* Do comparison by ignoring '.llvm.' suffixes. 
*/ static int compare_name(const char *name1, const char *name2) { const char *res1, *res2; diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index 6d391d95f96e00..70fa7ae93173b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -90,7 +90,7 @@ void test_module_attach(void) test_module_attach__detach(skel); - /* attach fentry/fexit and make sure it get's module reference */ + /* attach fentry/fexit and make sure it gets module reference */ link = bpf_program__attach(skel->progs.handle_fentry); if (!ASSERT_OK_PTR(link, "attach_fentry")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index e261b0e872dbba..d93a0c7b1786f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -623,7 +623,7 @@ static void range_cond(enum num_t t, struct range x, struct range y, *newx = range(t, x.a, x.b); *newy = range(t, y.a + 1, y.b); } else if (x.a == x.b && x.b == y.b) { - /* X is a constant matching rigth side of Y */ + /* X is a constant matching right side of Y */ *newx = range(t, x.a, x.b); *newy = range(t, y.a, y.b - 1); } else if (y.a == y.b && x.a == y.a) { @@ -631,7 +631,7 @@ static void range_cond(enum num_t t, struct range x, struct range y, *newx = range(t, x.a + 1, x.b); *newy = range(t, y.a, y.b); } else if (y.a == y.b && x.b == y.b) { - /* Y is a constant matching rigth side of X */ + /* Y is a constant matching right side of X */ *newx = range(t, x.a, x.b - 1); *newy = range(t, y.a, y.b); } else { diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c index b7ba5cd47d96fa..271b5cc9fc0153 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c @@ -39,7 +39,7 @@ void test_stacktrace_build_id(void) bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 0832fd7874575c..b277dddd5af7ff 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -66,7 +66,7 @@ void test_stacktrace_build_id_nmi(void) bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. 
stackmap", diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c index df59e4ae295100..84a7e405e9129d 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c @@ -50,7 +50,7 @@ void test_stacktrace_map(void) bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c index c6ef06f55cdb46..e0cb4697b4b3cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c @@ -46,7 +46,7 @@ void test_stacktrace_map_raw_tp(void) bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c index 1932b1e0685cfd..dc2ccf6a14d133 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c @@ -40,7 +40,7 @@ void test_stacktrace_map_skip(void) skel->bss->control = 1; /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (!ASSERT_OK(err, "compare_map_keys stackid_hmap vs. stackmap")) diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c index 1654a530aa3dc6..4e51785e7606e7 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -101,7 +101,7 @@ static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, tp->snd_cwnd = pkts_in_flight + sndcnt; } -/* Decide wheather to run the increase function of congestion control. */ +/* Decide whether to run the increase function of congestion control. */ static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { if (tcp_sk(sk)->reordering > TCP_REORDERING) diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 7cd73e75f52a2b..32c511bcd60b3a 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ -/* WARNING: This implemenation is not necessarily the same +/* WARNING: This implementation is not necessarily the same * as the tcp_dctcp.c. The purpose is mainly for testing * the kernel BPF logic. 
*/ diff --git a/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c b/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c index 544e5ac9046106..d09bbd8ae8a85b 100644 --- a/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c +++ b/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c @@ -12,7 +12,7 @@ SEC("freplace/connect_v4_prog") int new_connect_v4_prog(struct bpf_sock_addr *ctx) { - // return value thats in invalid range + // return value that's in invalid range return 255; } diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index f41257eadbb258..b381ac0c736cf0 100644 --- a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -345,7 +345,7 @@ int __naked read_from_iter_slot_fail(void) "r3 = 1000;" "call %[bpf_iter_num_new];" - /* attemp to leak bpf_iter_num state */ + /* attempt to leak bpf_iter_num state */ "r7 = *(u64 *)(r6 + 0);" "r8 = *(u64 *)(r6 + 8);" diff --git a/tools/testing/selftests/bpf/progs/rbtree_search.c b/tools/testing/selftests/bpf/progs/rbtree_search.c index 098ef970fac160..b05565d1db0d47 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_search.c +++ b/tools/testing/selftests/bpf/progs/rbtree_search.c @@ -183,7 +183,7 @@ long test_##op##_spinlock_##dolock(void *ctx) \ } /* - * Use a spearate MSG macro instead of passing to TEST_XXX(..., MSG) + * Use a separate MSG macro instead of passing to TEST_XXX(..., MSG) * to ensure the message itself is not in the bpf prog lineinfo * which the verifier includes in its log. * Otherwise, the test_loader will incorrectly match the prog lineinfo diff --git a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c index 36386b3c23a1f6..2b98b7710816dc 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c @@ -9,7 +9,7 @@ void bpf_task_release(struct task_struct *p) __ksym; /* This test struct_ops BPF programs returning referenced kptr. The verifier should * allow a referenced kptr or a NULL pointer to be returned. A referenced kptr to task - * here is acquried automatically as the task argument is tagged with "__ref". + * here is acquired automatically as the task argument is tagged with "__ref". */ SEC("struct_ops/test_return_ref_kptr") struct task_struct *BPF_PROG(kptr_return, int dummy, diff --git a/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c b/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c index 76dcb6089d7f8e..9c0a65466356c9 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c @@ -9,7 +9,7 @@ __attribute__((nomerge)) extern void bpf_task_release(struct task_struct *p) __k /* This is a test BPF program that uses struct_ops to access a referenced * kptr argument. This is a test for the verifier to ensure that it - * 1) recongnizes the task as a referenced object (i.e., ref_obj_id > 0), and + * 1) recognizes the task as a referenced object (i.e., ref_obj_id > 0), and * 2) the same reference can be acquired from multiple paths as long as it * has not been released. 
*/
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
index f344c6835e84e7..823169fb6e4c7f 100644
--- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
@@ -129,7 +129,7 @@ typedef uint8_t *net_ptr __attribute__((align_value(8)));
 typedef struct buf {
 	struct __sk_buff *skb;
 	net_ptr head;
-	/* NB: tail musn't have alignment other than 1, otherwise
+	/* NB: tail mustn't have alignment other than 1, otherwise
 	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
 	 */
 	uint8_t *const tail;
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c
index d0f7670351e587..dfd4a2710391d9 100644
--- a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c
@@ -494,7 +494,7 @@ static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_header
 	*offset += sizeof(*next_hop);

-	/* Skip the remainig next hops (may be zero). */
+	/* Skip the remaining next hops (may be zero). */
 	return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1);
 }

diff --git a/tools/testing/selftests/bpf/progs/uretprobe_stack.c b/tools/testing/selftests/bpf/progs/uretprobe_stack.c
index 9fdcf396b8f467..a2951e2f1711b8 100644
--- a/tools/testing/selftests/bpf/progs/uretprobe_stack.c
+++ b/tools/testing/selftests/bpf/progs/uretprobe_stack.c
@@ -26,8 +26,8 @@ int usdt_len;
 SEC("uprobe//proc/self/exe:target_1")
 int BPF_UPROBE(uprobe_1)
 {
-	/* target_1 is recursive wit depth of 2, so we capture two separate
-	 * stack traces, depending on which occurence it is
+	/* target_1 is recursive with depth of 2, so we capture two separate
+	 * stack traces, depending on which occurrence it is
 	 */
 	static bool recur = false;

diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
index 7c5e5e6d10ebc2..dba3ca728f6e6f 100644
--- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
+++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
@@ -349,7 +349,7 @@ __naked void precision_two_ids(void)
 SEC("socket")
 __success __log_level(2)
 __flag(BPF_F_TEST_STATE_FREQ)
-/* check thar r0 and r6 have different IDs after 'if',
+/* check that r0 and r6 have different IDs after 'if',
 * collect_linked_regs() can't tie more than 6 registers for a single insn.
 */
__msg("8: (25) if r0 > 0x7 goto pc+0         ; R0=scalar(id=1")
diff --git a/tools/testing/selftests/bpf/progs/verifier_var_off.c b/tools/testing/selftests/bpf/progs/verifier_var_off.c
index 1d36d01b746e78..f345466bca6868 100644
--- a/tools/testing/selftests/bpf/progs/verifier_var_off.c
+++ b/tools/testing/selftests/bpf/progs/verifier_var_off.c
@@ -114,8 +114,8 @@ __naked void stack_write_priv_vs_unpriv(void)
 }

 /* Similar to the previous test, but this time also perform a read from the
- * address written to with a variable offset. The read is allowed, showing that,
- * after a variable-offset write, a priviledged program can read the slots that
+ * address written to with a variable offset. The read is allowed, showing that,
+ * after a variable-offset write, a privileged program can read the slots that
 * were in the range of that write (even if the verifier doesn't actually know if
 * the slot being read was really written to or not.
* @@ -157,7 +157,7 @@ __naked void stack_write_followed_by_read(void) SEC("socket") __description("variable-offset stack write clobbers spilled regs") __failure -/* In the priviledged case, dereferencing a spilled-and-then-filled +/* In the privileged case, dereferencing a spilled-and-then-filled * register is rejected because the previous variable offset stack * write might have overwritten the spilled pointer (i.e. we lose track * of the spilled register when we analyze the write). diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index fd2da2234cc9b4..76568db7a66422 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1372,7 +1372,7 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) } else fprintf(stderr, "unknown test\n"); out: - /* Detatch and zero all the maps */ + /* Detach and zero all the maps */ bpf_prog_detach2(bpf_program__fd(progs[3]), cg_fd, BPF_CGROUP_SOCK_OPS); for (i = 0; i < ARRAY_SIZE(links); i++) { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index f3492efc88346e..c8d640802cce41 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -1375,7 +1375,7 @@ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* write into map value */ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), - /* fetch secound map_value_ptr from the stack */ + /* fetch second map_value_ptr from the stack */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* write into map value */ @@ -1439,7 +1439,7 @@ /* second time with fp-16 */ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), - /* fetch secound map_value_ptr from the stack */ + /* fetch second map_value_ptr from the stack */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* write into map value */ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), @@ -1493,7 +1493,7 @@ /* second time with fp-16 */ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), - /* fetch secound map_value_ptr from the stack */ + /* fetch second map_value_ptr from the stack */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* write into map value */ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), @@ -2380,7 +2380,7 @@ */ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1), BPF_MOV64_REG(BPF_REG_9, BPF_REG_8), - /* r9 = *r9 ; verifier get's to this point via two paths: + /* r9 = *r9 ; verifier gets to this point via two paths: * ; (I) one including r9 = r8, verified first; * ; (II) one excluding r9 = r8, verified next. * ; After load of *r9 to r9 the frame[0].fp[-24].id == r9.id. diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c index 1503a1d2faa090..9ed8c796645d00 100644 --- a/tools/testing/selftests/bpf/xdping.c +++ b/tools/testing/selftests/bpf/xdping.c @@ -155,7 +155,7 @@ int main(int argc, char **argv) } if (!server) { - /* Only supports IPv4; see hints initiailization above. */ + /* Only supports IPv4; see hints initialization above. 
*/ if (getaddrinfo(argv[optind], NULL, &hints, &a) || !a) { fprintf(stderr, "Could not resolve %s\n", argv[optind]); return 1; diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h index 93c2cc413cfcd0..48729da142c249 100644 --- a/tools/testing/selftests/bpf/xsk.h +++ b/tools/testing/selftests/bpf/xsk.h @@ -93,8 +93,8 @@ static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) /* Refresh the local tail pointer. * cached_cons is r->size bigger than the real consumer pointer so * that this addition can be avoided in the more frequently - * executed code that computs free_entries in the beginning of - * this function. Without this optimization it whould have been + * executed code that computes free_entries in the beginning of + * this function. Without this optimization it would have been * free_entries = r->cached_prod - r->cached_cons + r->size. */ r->cached_cons = __atomic_load_n(r->consumer, __ATOMIC_ACQUIRE); From 758acb9ccfdbf854b55abaceaf1f3f229cde3d19 Mon Sep 17 00:00:00 2001 From: Jiawei Zhao Date: Wed, 27 Aug 2025 05:31:27 +0000 Subject: [PATCH 0557/3206] libbpf: Fix USDT SIB argument handling causing unrecognized register error On x86-64, USDT arguments can be specified using Scale-Index-Base (SIB) addressing, e.g. "1@-96(%rbp,%rax,8)". The current USDT implementation in libbpf cannot parse this format, causing `bpf_program__attach_usdt()` to fail with -ENOENT (unrecognized register). This patch fixes this by implementing the necessary changes: - add correct handling for SIB-addressed arguments in `bpf_usdt_arg`. - add adaptive support to `__bpf_usdt_arg_type` and `__bpf_usdt_arg_spec` to represent SIB addressing parameters. Signed-off-by: Jiawei Zhao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250827053128.1301287-2-phoenix500526@163.com --- tools/lib/bpf/usdt.bpf.h | 44 ++++++++++++++++++++++++++-- tools/lib/bpf/usdt.c | 62 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 99 insertions(+), 7 deletions(-) diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index 2a7865c8e3fe3c..43deb05a51970d 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -34,13 +34,32 @@ enum __bpf_usdt_arg_type { BPF_USDT_ARG_CONST, BPF_USDT_ARG_REG, BPF_USDT_ARG_REG_DEREF, + BPF_USDT_ARG_SIB, }; +/* + * This struct layout is designed specifically to be backwards/forward + * compatible between libbpf versions for ARG_CONST, ARG_REG, and + * ARG_REG_DEREF modes. ARG_SIB requires libbpf v1.7+. 
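+ *
+ * Concretely, the 8-bit arg_type bitfield occupies the same byte where the
+ * low byte of the old full-width 32-bit enum used to live, and the
+ * big-endian variant below mirrors the field order to keep the byte
+ * positions identical, so BPF objects built against older headers still
+ * decode the three pre-existing modes correctly.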
+ */ struct __bpf_usdt_arg_spec { /* u64 scalar interpreted depending on arg_type, see below */ __u64 val_off; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* arg location case, see bpf_usdt_arg() for details */ - enum __bpf_usdt_arg_type arg_type; + enum __bpf_usdt_arg_type arg_type: 8; + /* index register offset within struct pt_regs */ + __u16 idx_reg_off: 12; + /* scale factor for index register (1, 2, 4, or 8) */ + __u16 scale_bitshift: 4; + /* reserved for future use, keeps reg_off offset stable */ + __u8 __reserved: 8; +#else + __u8 __reserved: 8; + __u16 idx_reg_off: 12; + __u16 scale_bitshift: 4; + enum __bpf_usdt_arg_type arg_type: 8; +#endif /* offset of referenced register within struct pt_regs */ short reg_off; /* whether arg should be interpreted as signed value */ @@ -149,7 +168,7 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) { struct __bpf_usdt_spec *spec; struct __bpf_usdt_arg_spec *arg_spec; - unsigned long val; + unsigned long val, idx; int err, spec_id; *res = 0; @@ -202,6 +221,27 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) return err; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ val >>= arg_spec->arg_bitshift; +#endif + break; + case BPF_USDT_ARG_SIB: + /* Arg is in memory addressed by SIB (Scale-Index-Base) mode + * (e.g., "-1@-96(%rbp,%rax,8)" in USDT arg spec). We first + * fetch the base register contents and the index register + * contents from pt_regs. Then we calculate the final address + * as base + (index * scale) + offset, and do a user-space + * probe read to fetch the argument value. + */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + err = bpf_probe_read_kernel(&idx, sizeof(idx), (void *)ctx + arg_spec->idx_reg_off); + if (err) + return err; + err = bpf_probe_read_user(&val, sizeof(val), (void *)(val + (idx << arg_spec->scale_bitshift) + arg_spec->val_off)); + if (err) + return err; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + val >>= arg_spec->arg_bitshift; #endif break; default: diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 3373b9d45ac448..867bff6b06990e 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -200,12 +200,23 @@ enum usdt_arg_type { USDT_ARG_CONST, USDT_ARG_REG, USDT_ARG_REG_DEREF, + USDT_ARG_SIB, }; /* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */ struct usdt_arg_spec { __u64 val_off; - enum usdt_arg_type arg_type; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + enum usdt_arg_type arg_type: 8; + __u16 idx_reg_off: 12; + __u16 scale_bitshift: 4; + __u8 __reserved: 8; /* keep reg_off offset stable */ +#else + __u8 __reserved: 8; /* keep reg_off offset stable */ + __u16 idx_reg_off: 12; + __u16 scale_bitshift: 4; + enum usdt_arg_type arg_type: 8; +#endif short reg_off; bool arg_signed; char arg_bitshift; @@ -1283,11 +1294,51 @@ static int calc_pt_regs_off(const char *reg_name) static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { - char reg_name[16]; - int len, reg_off; - long off; + char reg_name[16] = {0}, idx_reg_name[16] = {0}; + int len, reg_off, idx_reg_off, scale = 1; + long off = 0; + + if (sscanf(arg_str, " %d @ %ld ( %%%15[^,] , %%%15[^,] , %d ) %n", + arg_sz, &off, reg_name, idx_reg_name, &scale, &len) == 5 || + sscanf(arg_str, " %d @ ( %%%15[^,] , %%%15[^,] , %d ) %n", + arg_sz, reg_name, idx_reg_name, &scale, &len) == 4 || + sscanf(arg_str, " %d @ %ld ( %%%15[^,] , %%%15[^)] ) %n", + arg_sz, &off, reg_name, idx_reg_name, &len) 
== 4 || + sscanf(arg_str, " %d @ ( %%%15[^,] , %%%15[^)] ) %n", + arg_sz, reg_name, idx_reg_name, &len) == 3 + ) { + /* + * Scale Index Base case: + * 1@-96(%rbp,%rax,8) + * 1@(%rbp,%rax,8) + * 1@-96(%rbp,%rax) + * 1@(%rbp,%rax) + */ + arg->arg_type = USDT_ARG_SIB; + arg->val_off = off; - if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", arg_sz, &off, reg_name, &len) == 3) { + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + + idx_reg_off = calc_pt_regs_off(idx_reg_name); + if (idx_reg_off < 0) + return idx_reg_off; + arg->idx_reg_off = idx_reg_off; + + /* validate scale factor and set fields directly */ + switch (scale) { + case 1: arg->scale_bitshift = 0; break; + case 2: arg->scale_bitshift = 1; break; + case 4: arg->scale_bitshift = 2; break; + case 8: arg->scale_bitshift = 3; break; + default: + pr_warn("usdt: invalid SIB scale %d, expected 1, 2, 4, 8\n", scale); + return -EINVAL; + } + } else if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", + arg_sz, &off, reg_name, &len) == 3) { /* Memory dereference case, e.g., -4@-20(%rbp) */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; @@ -1306,6 +1357,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec } else if (sscanf(arg_str, " %d @ %%%15s %n", arg_sz, reg_name, &len) == 2) { /* Register read case, e.g., -4@%eax */ arg->arg_type = USDT_ARG_REG; + /* register read has no memory offset */ arg->val_off = 0; reg_off = calc_pt_regs_off(reg_name); From 69424097ee1061280dce111ca43b7bc17868413c Mon Sep 17 00:00:00 2001 From: Jiawei Zhao Date: Wed, 27 Aug 2025 05:31:28 +0000 Subject: [PATCH 0558/3206] selftests/bpf: Enrich subtest_basic_usdt case in selftests to cover SIB handling logic When using GCC on x86-64 to compile a USDT prog with -O1 or higher optimization, the compiler will generate SIB addressing mode for global arrays, e.g. "1@-96(%rbp,%rax,8)". In this patch: - enrich the subtest_basic_usdt test case to cover the SIB-addressed USDT argument spec handling logic Signed-off-by: Jiawei Zhao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250827053128.1301287-3-phoenix500526@163.com --- tools/testing/selftests/bpf/prog_tests/usdt.c | 83 ++++++++++++++++++- tools/testing/selftests/bpf/progs/test_usdt.c | 31 +++++++ 2 files changed, 112 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 9057e983cc5405..615e9c3e93bf0a 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -40,12 +40,72 @@ static void __always_inline trigger_func(int x) { } } +#if defined(__x86_64__) || defined(__i386__) +/* + * SIB (Scale-Index-Base) addressing format: "size@(base_reg, index_reg, scale)" + * - 'size' is the size in bytes of the array element, and its sign indicates + * whether the type is signed (negative) or unsigned (positive). + * - 'base_reg' is the register holding the base address, normally rdx or edx + * - 'index_reg' is the register holding the index, normally rax or eax + * - 'scale' is the scaling factor (typically 1, 2, 4, or 8), which matches the + * size of the element type.
+ * + * For example, for an array of 'short' (signed 2-byte elements), the SIB spec would be: + * - size: -2 (negative because 'short' is signed) + * - scale: 2 (since sizeof(short) == 2) + * + * The resulting SIB format: "-2@(%%rdx,%%rax,2)" for x86_64, "-2@(%%edx,%%eax,2)" for i386 + */ +static volatile short array[] = {-1, -2, -3, -4}; + +#if defined(__x86_64__) +#define USDT_SIB_ARG_SPEC -2@(%%rdx,%%rax,2) +#else +#define USDT_SIB_ARG_SPEC -2@(%%edx,%%eax,2) +#endif + +unsigned short test_usdt_sib_semaphore SEC(".probes"); + +static void trigger_sib_spec(void) +{ + /* + * Force SIB addressing with inline assembly. + * + * You must compile with -std=gnu99 or -std=c99 to use the + * STAP_PROBE_ASM macro. + * + * The STAP_PROBE_ASM macro generates a quoted string that gets + * inserted between the surrounding assembly instructions. In this + * case, USDT_SIB_ARG_SPEC is embedded directly into the instruction + * stream, creating a probe point between the asm statement boundaries. + * It works fine with gcc/clang. + * + * Register constraints: + * - "d"(array): Binds the 'array' variable to %rdx or %edx register + * - "a"(0): Binds the constant 0 to %rax or %eax register + * These ensure that when USDT_SIB_ARG_SPEC references %%rdx(%edx) and + * %%rax(%eax), they contain the expected values for SIB addressing. + * + * The "memory" clobber prevents the compiler from reordering memory + * accesses around the probe point, ensuring that the probe behavior + * is predictable and consistent. + */ + asm volatile( + STAP_PROBE_ASM(test, usdt_sib, USDT_SIB_ARG_SPEC) + : + : "d"(array), "a"(0) + : "memory" + ); +} +#endif + static void subtest_basic_usdt(void) { LIBBPF_OPTS(bpf_usdt_opts, opts); struct test_usdt *skel; struct test_usdt__bss *bss; int err, i; + const __u64 expected_cookie = 0xcafedeadbeeffeed; skel = test_usdt__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) @@ -59,20 +119,29 @@ static void subtest_basic_usdt(void) goto cleanup; /* usdt0 won't be auto-attached */ - opts.usdt_cookie = 0xcafedeadbeeffeed; + opts.usdt_cookie = expected_cookie; skel->links.usdt0 = bpf_program__attach_usdt(skel->progs.usdt0, 0 /*self*/, "/proc/self/exe", "test", "usdt0", &opts); if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) goto cleanup; +#if defined(__x86_64__) || defined(__i386__) + opts.usdt_cookie = expected_cookie; + skel->links.usdt_sib = bpf_program__attach_usdt(skel->progs.usdt_sib, + 0 /*self*/, "/proc/self/exe", + "test", "usdt_sib", &opts); + if (!ASSERT_OK_PTR(skel->links.usdt_sib, "usdt_sib_link")) + goto cleanup; +#endif + trigger_func(1); ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); - ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); + ASSERT_EQ(bss->usdt0_cookie, expected_cookie, "usdt0_cookie"); ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); ASSERT_EQ(bss->usdt0_arg_ret, -ENOENT, "usdt0_arg_ret"); ASSERT_EQ(bss->usdt0_arg_size, -ENOENT, "usdt0_arg_size"); @@ -156,6 +225,16 @@ static void subtest_basic_usdt(void) ASSERT_EQ(bss->usdt3_args[1], 42, "usdt3_arg2"); ASSERT_EQ(bss->usdt3_args[2], (uintptr_t)&bla, "usdt3_arg3"); +#if defined(__x86_64__) || defined(__i386__) + trigger_sib_spec(); + ASSERT_EQ(bss->usdt_sib_called, 1, "usdt_sib_called"); + ASSERT_EQ(bss->usdt_sib_cookie, expected_cookie, "usdt_sib_cookie"); + ASSERT_EQ(bss->usdt_sib_arg_cnt, 1, "usdt_sib_arg_cnt"); + ASSERT_EQ(bss->usdt_sib_arg, nums[0], "usdt_sib_arg"); + 
ASSERT_EQ(bss->usdt_sib_arg_ret, 0, "usdt_sib_arg_ret"); + ASSERT_EQ(bss->usdt_sib_arg_size, sizeof(nums[0]), "usdt_sib_arg_size"); +#endif + cleanup: test_usdt__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_usdt.c b/tools/testing/selftests/bpf/progs/test_usdt.c index 096488f47fbc82..a78c87537b0784 100644 --- a/tools/testing/selftests/bpf/progs/test_usdt.c +++ b/tools/testing/selftests/bpf/progs/test_usdt.c @@ -107,4 +107,35 @@ int BPF_USDT(usdt12, int a1, int a2, long a3, long a4, unsigned a5, return 0; } +int usdt_sib_called; +u64 usdt_sib_cookie; +int usdt_sib_arg_cnt; +int usdt_sib_arg_ret; +short usdt_sib_arg; +int usdt_sib_arg_size; + +/* + * usdt_sib is only tested on x86-related architectures, so it requires + * manual attach since auto-attach would make the tests panic on other architectures + */ +SEC("usdt") +int usdt_sib(struct pt_regs *ctx) +{ + long tmp; + + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt_sib_called, 1); + + usdt_sib_cookie = bpf_usdt_cookie(ctx); + usdt_sib_arg_cnt = bpf_usdt_arg_cnt(ctx); + + usdt_sib_arg_ret = bpf_usdt_arg(ctx, 0, &tmp); + usdt_sib_arg = (short)tmp; + usdt_sib_arg_size = bpf_usdt_arg_size(ctx, 0); + + return 0; +} + char _license[] SEC("license") = "GPL"; From 16175375da369fcfdcc0127c9ca39b767ae4f885 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 27 Aug 2025 11:32:43 +0000 Subject: [PATCH 0559/3206] bpf, arm64: Add JIT support for timed may_goto When the verifier sees a timed may_goto instruction, it emits a call to arch_bpf_timed_may_goto() with a stack offset in BPF_REG_AX (arm64 r9) and expects a count value to be returned in the same register. The verifier doesn't save or restore any registers before emitting this call. arch_bpf_timed_may_goto() should act as a trampoline that calls bpf_check_timed_may_goto() with the AAPCS64 calling convention. To support this custom calling convention, implement arch_bpf_timed_may_goto() in assembly: make sure the BPF caller-saved registers are saved and restored, call bpf_check_timed_may_goto() with the arm64 calling convention, where both the first argument and the return value are in x0, and then put the result back into BPF_REG_AX before returning. Signed-off-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Acked-by: Xu Kuohai Link: https://lore.kernel.org/r/20250827113245.52629-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/Makefile | 2 +- arch/arm64/net/bpf_jit_comp.c | 13 +++++++++- arch/arm64/net/bpf_timed_may_goto.S | 40 +++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/net/bpf_timed_may_goto.S diff --git a/arch/arm64/net/Makefile b/arch/arm64/net/Makefile index 5c540efb7d9b9a..3ae382bfca8797 100644 --- a/arch/arm64/net/Makefile +++ b/arch/arm64/net/Makefile @@ -2,4 +2,4 @@ # # ARM64 networking code # -obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 52ffe115a8c47c..a98b8132479a75 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1558,7 +1558,13 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (ret < 0) return ret; emit_call(func_addr, ctx); - emit(A64_MOV(1, r0, A64_R(0)), ctx); + /* + * Call to arch_bpf_timed_may_goto() is emitted by the + * verifier and called with custom calling convention with + * first argument and return value in BPF_REG_AX (x9).
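+ * It returns the updated count in BPF_REG_AX (x9) directly, so the
+ * generic copy of the AAPCS64 return register (x0) into BPF r0 below
+ * must be skipped for this call.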
+ */ + if (func_addr != (u64)arch_bpf_timed_may_goto) + emit(A64_MOV(1, r0, A64_R(0)), ctx); break; } /* tail call */ @@ -3038,6 +3044,11 @@ bool bpf_jit_bypass_spec_v4(void) return true; } +bool bpf_jit_supports_timed_may_goto(void) +{ + return true; +} + bool bpf_jit_inlines_helper_call(s32 imm) { switch (imm) { diff --git a/arch/arm64/net/bpf_timed_may_goto.S b/arch/arm64/net/bpf_timed_may_goto.S new file mode 100644 index 00000000000000..894cfcd7b2416d --- /dev/null +++ b/arch/arm64/net/bpf_timed_may_goto.S @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Puranjay Mohan */ + +#include <linux/linkage.h> + +SYM_FUNC_START(arch_bpf_timed_may_goto) + /* Allocate stack space and emit frame record */ + stp x29, x30, [sp, #-64]! + mov x29, sp + + /* Save BPF registers R0 - R5 (x7, x0-x4) */ + stp x7, x0, [sp, #16] + stp x1, x2, [sp, #32] + stp x3, x4, [sp, #48] + + /* + * Stack depth was passed in BPF_REG_AX (x9), add it to the BPF_FP + * (x25) to get the pointer to count and timestamp and pass it as the + * first argument in x0. + * + * Before generating the call to arch_bpf_timed_may_goto, the verifier + * generates a load instruction using FP, i.e. REG_AX = *(u64 *)(FP - + * stack_off_cnt), so BPF_REG_FP (x25) is always set up by the arm64 + * jit in this case. + */ + add x0, x9, x25 + bl bpf_check_timed_may_goto + /* BPF_REG_AX(x9) will be stored into count, so move return value to it. */ + mov x9, x0 + + /* Restore BPF registers R0 - R5 (x7, x0-x4) */ + ldp x7, x0, [sp, #16] + ldp x1, x2, [sp, #32] + ldp x3, x4, [sp, #48] + + /* Restore FP and LR */ + ldp x29, x30, [sp], #64 + + ret +SYM_FUNC_END(arch_bpf_timed_may_goto) From 22b22bf9ee48580f18bc75bd7a53d010e5b8f598 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 27 Aug 2025 11:32:44 +0000 Subject: [PATCH 0560/3206] selftests/bpf: Enable timed may_goto tests for arm64 As the arm64 JIT now supports the timed may_goto instruction, make sure all relevant tests run on this architecture. Some tests were enabled and others required modifications to work properly on arm64. $ ./test_progs -a "stream*","*may_goto*",verifier_bpf_fastcall #404 stream_errors:OK [...] #406/2 stream_success/stream_cond_break:OK [...] #494/23 verifier_bpf_fastcall/may_goto_interaction_x86_64:SKIP #494/24 verifier_bpf_fastcall/may_goto_interaction_arm64:OK [...]
#539/1 verifier_may_goto_1/may_goto 0:OK #539/2 verifier_may_goto_1/batch 2 of may_goto 0:OK #539/3 verifier_may_goto_1/may_goto batch with offsets 2/1/0:OK #539/4 verifier_may_goto_1/may_goto batch with offsets 2/0:OK #539 verifier_may_goto_1:OK #540/1 verifier_may_goto_2/C code with may_goto 0:OK #540 verifier_may_goto_2:OK Summary: 7/16 PASSED, 25 SKIPPED, 0 FAILED Signed-off-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Acked-by: Xu Kuohai Link: https://lore.kernel.org/r/20250827113245.52629-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/stream.c | 2 +- .../bpf/progs/verifier_bpf_fastcall.c | 27 +++++++++------ .../selftests/bpf/progs/verifier_may_goto_1.c | 34 ++++--------------- 3 files changed, 23 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/stream.c b/tools/testing/selftests/bpf/prog_tests/stream.c index 36a1a1ebde6929..9d0e5d93edee73 100644 --- a/tools/testing/selftests/bpf/prog_tests/stream.c +++ b/tools/testing/selftests/bpf/prog_tests/stream.c @@ -77,7 +77,7 @@ void test_stream_errors(void) ASSERT_OK(ret, "ret"); ASSERT_OK(opts.retval, "retval"); -#if !defined(__x86_64__) && !defined(__s390x__) +#if !defined(__x86_64__) && !defined(__s390x__) && !defined(__aarch64__) ASSERT_TRUE(1, "Timed may_goto unsupported, skip."); if (i == 0) { ret = bpf_prog_stream_read(prog_fd, 2, buf, sizeof(buf), &ropts); diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index c258b0722e045e..fb4fa465d67c62 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -660,19 +660,24 @@ __naked void may_goto_interaction_x86_64(void) SEC("raw_tp") __arch_arm64 -__log_level(4) __msg("stack depth 16") -/* may_goto counter at -16 */ -__xlated("0: *(u64 *)(r10 -16) =") -__xlated("1: r1 = 1") -__xlated("2: call bpf_get_smp_processor_id") +__log_level(4) __msg("stack depth 24") +/* may_goto counter at -24 */ +__xlated("0: *(u64 *)(r10 -24) =") +/* may_goto timestamp at -16 */ +__xlated("1: *(u64 *)(r10 -16) =") +__xlated("2: r1 = 1") +__xlated("3: call bpf_get_smp_processor_id") /* may_goto expansion starts */ -__xlated("3: r11 = *(u64 *)(r10 -16)") -__xlated("4: if r11 == 0x0 goto pc+3") -__xlated("5: r11 -= 1") -__xlated("6: *(u64 *)(r10 -16) = r11") +__xlated("4: r11 = *(u64 *)(r10 -24)") +__xlated("5: if r11 == 0x0 goto pc+6") +__xlated("6: r11 -= 1") +__xlated("7: if r11 != 0x0 goto pc+2") +__xlated("8: r11 = -24") +__xlated("9: call unknown") +__xlated("10: *(u64 *)(r10 -24) = r11") /* may_goto expansion ends */ -__xlated("7: *(u64 *)(r10 -8) = r1") -__xlated("8: exit") +__xlated("11: *(u64 *)(r10 -8) = r1") +__xlated("12: exit") __success __naked void may_goto_interaction_arm64(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c index cc1063863569d8..6d1edaef921383 100644 --- a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c +++ b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c @@ -10,6 +10,7 @@ SEC("raw_tp") __description("may_goto 0") __arch_x86_64 __arch_s390x +__arch_arm64 __xlated("0: r0 = 1") __xlated("1: exit") __success @@ -29,6 +30,7 @@ SEC("raw_tp") __description("batch 2 of may_goto 0") __arch_x86_64 __arch_s390x +__arch_arm64 __xlated("0: r0 = 1") __xlated("1: exit") __success @@ -50,6 +52,7 @@ SEC("raw_tp") __description("may_goto batch with 
offsets 2/1/0") __arch_x86_64 __arch_s390x +__arch_arm64 __xlated("0: r0 = 1") __xlated("1: exit") __success @@ -72,9 +75,10 @@ __naked void may_goto_batch_1(void) } SEC("raw_tp") -__description("may_goto batch with offsets 2/0 - x86_64 and s390x") +__description("may_goto batch with offsets 2/0") __arch_x86_64 __arch_s390x +__arch_arm64 __xlated("0: *(u64 *)(r10 -16) = 65535") __xlated("1: *(u64 *)(r10 -8) = 0") __xlated("2: r11 = *(u64 *)(r10 -16)") @@ -88,33 +92,7 @@ __xlated("9: r0 = 1") __xlated("10: r0 = 2") __xlated("11: exit") __success -__naked void may_goto_batch_2_x86_64_s390x(void) -{ - asm volatile ( - ".8byte %[may_goto1];" - ".8byte %[may_goto3];" - "r0 = 1;" - "r0 = 2;" - "exit;" - : - : __imm_insn(may_goto1, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 2 /* offset */, 0)), - __imm_insn(may_goto3, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0 /* offset */, 0)) - : __clobber_all); -} - -SEC("raw_tp") -__description("may_goto batch with offsets 2/0 - arm64") -__arch_arm64 -__xlated("0: *(u64 *)(r10 -8) = 8388608") -__xlated("1: r11 = *(u64 *)(r10 -8)") -__xlated("2: if r11 == 0x0 goto pc+3") -__xlated("3: r11 -= 1") -__xlated("4: *(u64 *)(r10 -8) = r11") -__xlated("5: r0 = 1") -__xlated("6: r0 = 2") -__xlated("7: exit") -__success -__naked void may_goto_batch_2_arm64(void) +__naked void may_goto_batch_2(void) { asm volatile ( ".8byte %[may_goto1];" From 737433c6a559c4e8acb065cfe9b6e2ff45ad655c Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 27 Aug 2025 15:01:49 +0100 Subject: [PATCH 0561/3206] selftests/bpf: Add LPM trie microbenchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add benchmarks for the standard set of operations: LOOKUP, INSERT, UPDATE, DELETE. Also include benchmarks to measure the overhead of the bench framework itself (NOOP) as well as the overhead of generating keys (BASELINE). Lastly, this includes a benchmark for FREE (trie_free()) which is known to have terrible performance for maps with many entries. Benchmarks operate on tries without gaps in the key range, i.e. each test begins or ends with a trie with valid keys in the range [0, nr_entries). This is intended to cause maximum branching when traversing the trie. LOOKUP, UPDATE, DELETE, and FREE fill a BPF LPM trie from userspace using bpf_map_update_batch() and run the corresponding benchmark operation via bpf_loop(). INSERT starts with an empty map and fills it kernel-side from bpf_loop(). FREE records the time to free a filled LPM trie by attaching and destroying a BPF prog. NOOP measures the overhead of the test harness by running an empty function with bpf_loop(). BASELINE is similar to NOOP except that the function generates a key. Each operation runs 10,000 times using bpf_loop(). Note that this value is intentionally independent of the number of entries in the LPM trie so that the stability of the results isn't affected by the number of entries. For those benchmarks that need to reset the LPM trie once it's full (INSERT) or empty (DELETE), throughput and latency results are scaled by the fraction of a second the operation actually ran to ignore any time spent reinitialising the trie. By default, benchmarks run using sequential keys in the range [0, nr_entries). BASELINE, LOOKUP, and UPDATE can use random keys via the --random parameter but beware there is a runtime cost involved in generating random keys. Other benchmarks are prohibited from using random keys because it can skew the results, e.g. 
when inserting an existing key or deleting a missing one. All measurements are recorded from within the kernel to eliminate syscall overhead. Most benchmarks run an XDP program to generate stats but FREE needs to collect latencies using fentry/fexit on map_free_deferred() because it's not possible to use fentry directly on lpm_trie.c since commit c83508da5620 ("bpf: Avoid deadlock caused by nested kprobe and fentry bpf programs") and there's no way to create/destroy a map from within an XDP program. Here is example output from an AMD EPYC 9684X 96-Core machine for each of the benchmarks using a trie with 10K entries and a 32-bit prefix length, e.g. $ ./bench lpm-trie-$op \ --prefix_len=32 \ --producers=1 \ --nr_entries=10000 noop: throughput 74.417 ± 0.032 M ops/s ( 74.417M ops/prod), latency 13.438 ns/op baseline: throughput 70.107 ± 0.171 M ops/s ( 70.107M ops/prod), latency 14.264 ns/op lookup: throughput 8.467 ± 0.047 M ops/s ( 8.467M ops/prod), latency 118.109 ns/op insert: throughput 2.440 ± 0.015 M ops/s ( 2.440M ops/prod), latency 409.290 ns/op update: throughput 2.806 ± 0.042 M ops/s ( 2.806M ops/prod), latency 356.322 ns/op delete: throughput 4.625 ± 0.011 M ops/s ( 4.625M ops/prod), latency 215.613 ns/op free: throughput 0.578 ± 0.006 K ops/s ( 0.578K ops/prod), latency 1.730 ms/op And the same benchmarks using random keys: $ ./bench lpm-trie-$op \ --prefix_len=32 \ --producers=1 \ --nr_entries=10000 \ --random noop: throughput 74.259 ± 0.335 M ops/s ( 74.259M ops/prod), latency 13.466 ns/op baseline: throughput 35.150 ± 0.144 M ops/s ( 35.150M ops/prod), latency 28.450 ns/op lookup: throughput 7.119 ± 0.048 M ops/s ( 7.119M ops/prod), latency 140.469 ns/op insert: N/A update: throughput 2.736 ± 0.012 M ops/s ( 2.736M ops/prod), latency 365.523 ns/op delete: N/A free: N/A Signed-off-by: Matt Fleming Signed-off-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20250827140149.1001557-1-matt@readmodwrite.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 + tools/testing/selftests/bpf/bench.c | 16 + tools/testing/selftests/bpf/bench.h | 1 + .../selftests/bpf/benchs/bench_lpm_trie_map.c | 555 ++++++++++++++++++ tools/testing/selftests/bpf/progs/lpm_trie.h | 30 + .../selftests/bpf/progs/lpm_trie_bench.c | 230 ++++++++ .../selftests/bpf/progs/lpm_trie_map.c | 19 + 7 files changed, 853 insertions(+) create mode 100644 tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c create mode 100644 tools/testing/selftests/bpf/progs/lpm_trie.h create mode 100644 tools/testing/selftests/bpf/progs/lpm_trie_bench.c create mode 100644 tools/testing/selftests/bpf/progs/lpm_trie_map.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index fd6b370c816984..11d2a368db3eaa 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -816,6 +816,7 @@ $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench_sockmap.o: $(OUTPUT)/bench_sockmap_prog.skel.h +$(OUTPUT)/bench_lpm_trie_map.o: $(OUTPUT)/lpm_trie_bench.skel.h $(OUTPUT)/lpm_trie_map.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -837,6 +838,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_htab_mem.o \ $(OUTPUT)/bench_bpf_crypto.o \ $(OUTPUT)/bench_sockmap.o \ + $(OUTPUT)/bench_lpm_trie_map.o 
\ # $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 3ecc226ea7b25d..df4489ac14a0c9 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -284,6 +284,7 @@ extern struct argp bench_htab_mem_argp; extern struct argp bench_trigger_batch_argp; extern struct argp bench_crypto_argp; extern struct argp bench_sockmap_argp; +extern struct argp bench_lpm_trie_map_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -299,6 +300,7 @@ static const struct argp_child bench_parsers[] = { { &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 }, { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, { &bench_sockmap_argp, 0, "bpf sockmap benchmark", 0 }, + { &bench_lpm_trie_map_argp, 0, "LPM trie map benchmark", 0 }, {}, }; @@ -558,6 +560,13 @@ extern const struct bench bench_htab_mem; extern const struct bench bench_crypto_encrypt; extern const struct bench bench_crypto_decrypt; extern const struct bench bench_sockmap; +extern const struct bench bench_lpm_trie_noop; +extern const struct bench bench_lpm_trie_baseline; +extern const struct bench bench_lpm_trie_lookup; +extern const struct bench bench_lpm_trie_insert; +extern const struct bench bench_lpm_trie_update; +extern const struct bench bench_lpm_trie_delete; +extern const struct bench bench_lpm_trie_free; static const struct bench *benchs[] = { &bench_count_global, @@ -625,6 +634,13 @@ static const struct bench *benchs[] = { &bench_crypto_encrypt, &bench_crypto_decrypt, &bench_sockmap, + &bench_lpm_trie_noop, + &bench_lpm_trie_baseline, + &bench_lpm_trie_lookup, + &bench_lpm_trie_insert, + &bench_lpm_trie_update, + &bench_lpm_trie_delete, + &bench_lpm_trie_free, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 005c401b3e2275..bea323820ffb88 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -46,6 +46,7 @@ struct bench_res { unsigned long gp_ns; unsigned long gp_ct; unsigned int stime; + unsigned long duration_ns; }; struct bench { diff --git a/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c b/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c new file mode 100644 index 00000000000000..246f6cb3387dda --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Cloudflare */ + +/* + * All of these benchmarks operate on tries with keys in the range + * [0, args.nr_entries), i.e. there are no gaps or partially filled + * branches of the trie for any key < args.nr_entries. + * + * This gives an idea of worst-case behaviour. 
+ */ + +#include +#include +#include +#include "lpm_trie_bench.skel.h" +#include "lpm_trie_map.skel.h" +#include "bench.h" +#include "testing_helpers.h" +#include "progs/lpm_trie.h" + +static struct ctx { + struct lpm_trie_bench *bench; +} ctx; + +static struct { + __u32 nr_entries; + __u32 prefixlen; + bool random; +} args = { + .nr_entries = 0, + .prefixlen = 32, + .random = false, +}; + +enum { + ARG_NR_ENTRIES = 9000, + ARG_PREFIX_LEN, + ARG_RANDOM, +}; + +static const struct argp_option opts[] = { + { "nr_entries", ARG_NR_ENTRIES, "NR_ENTRIES", 0, + "Number of unique entries in the LPM trie" }, + { "prefix_len", ARG_PREFIX_LEN, "PREFIX_LEN", 0, + "Number of prefix bits to use in the LPM trie" }, + { "random", ARG_RANDOM, NULL, 0, "Access random keys during op" }, + {}, +}; + +static error_t lpm_parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_NR_ENTRIES: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "Invalid nr_entries count."); + argp_usage(state); + } + args.nr_entries = ret; + break; + case ARG_PREFIX_LEN: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "Invalid prefix_len value."); + argp_usage(state); + } + args.prefixlen = ret; + break; + case ARG_RANDOM: + args.random = true; + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +const struct argp bench_lpm_trie_map_argp = { + .options = opts, + .parser = lpm_parse_arg, +}; + +static void validate_common(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "benchmark doesn't support consumer\n"); + exit(1); + } + + if (args.nr_entries == 0) { + fprintf(stderr, "Missing --nr_entries parameter\n"); + exit(1); + } + + if ((1UL << args.prefixlen) < args.nr_entries) { + fprintf(stderr, "prefix_len value too small for nr_entries\n"); + exit(1); + } +} + +static void lpm_insert_validate(void) +{ + validate_common(); + + if (env.producer_cnt != 1) { + fprintf(stderr, "lpm-trie-insert requires a single producer\n"); + exit(1); + } + + if (args.random) { + fprintf(stderr, "lpm-trie-insert does not support --random\n"); + exit(1); + } +} + +static void lpm_delete_validate(void) +{ + validate_common(); + + if (env.producer_cnt != 1) { + fprintf(stderr, "lpm-trie-delete requires a single producer\n"); + exit(1); + } + + if (args.random) { + fprintf(stderr, "lpm-trie-delete does not support --random\n"); + exit(1); + } +} + +static void lpm_free_validate(void) +{ + validate_common(); + + if (env.producer_cnt != 1) { + fprintf(stderr, "lpm-trie-free requires a single producer\n"); + exit(1); + } + + if (args.random) { + fprintf(stderr, "lpm-trie-free does not support --random\n"); + exit(1); + } +} + +static struct trie_key *keys; +static __u32 *vals; + +static void fill_map(int map_fd) +{ + int err; + + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + err = bpf_map_update_batch(map_fd, keys, vals, &args.nr_entries, &opts); + if (err) { + fprintf(stderr, "failed to batch update keys to map: %d\n", + -err); + exit(1); + } +} + +static void empty_map(int map_fd) +{ + int err; + + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + err = bpf_map_delete_batch(map_fd, keys, &args.nr_entries, &opts); + if (err) { + fprintf(stderr, "failed to batch delete keys for map: %d\n", + -err); + exit(1); + } +} + +static void attach_prog(void) +{ + int i; + + ctx.bench = lpm_trie_bench__open_and_load(); + if (!ctx.bench) { + fprintf(stderr, 
"failed to open skeleton\n"); + exit(1); + } + + ctx.bench->bss->nr_entries = args.nr_entries; + ctx.bench->bss->prefixlen = args.prefixlen; + ctx.bench->bss->random = args.random; + + if (lpm_trie_bench__attach(ctx.bench)) { + fprintf(stderr, "failed to attach skeleton\n"); + exit(1); + } + + keys = calloc(args.nr_entries, sizeof(*keys)); + vals = calloc(args.nr_entries, sizeof(*vals)); + + for (i = 0; i < args.nr_entries; i++) { + struct trie_key *k = &keys[i]; + __u32 *v = &vals[i]; + + k->prefixlen = args.prefixlen; + k->data = i; + *v = 1; + } +} + +static void attach_prog_and_fill_map(void) +{ + int fd; + + attach_prog(); + + fd = bpf_map__fd(ctx.bench->maps.trie_map); + fill_map(fd); +} + +static void lpm_noop_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_NOOP; +} + +static void lpm_baseline_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_BASELINE; +} + +static void lpm_lookup_setup(void) +{ + attach_prog_and_fill_map(); + ctx.bench->bss->op = LPM_OP_LOOKUP; +} + +static void lpm_insert_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_INSERT; +} + +static void lpm_update_setup(void) +{ + attach_prog_and_fill_map(); + ctx.bench->bss->op = LPM_OP_UPDATE; +} + +static void lpm_delete_setup(void) +{ + attach_prog_and_fill_map(); + ctx.bench->bss->op = LPM_OP_DELETE; +} + +static void lpm_free_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_FREE; +} + +static void lpm_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.bench->bss->hits, 0); + res->duration_ns = atomic_swap(&ctx.bench->bss->duration_ns, 0); +} + +static void bench_reinit_map(void) +{ + int fd = bpf_map__fd(ctx.bench->maps.trie_map); + + switch (ctx.bench->bss->op) { + case LPM_OP_INSERT: + /* trie_map needs to be emptied */ + empty_map(fd); + break; + case LPM_OP_DELETE: + /* trie_map needs to be refilled */ + fill_map(fd); + break; + default: + fprintf(stderr, "Unexpected REINIT return code for op %d\n", + ctx.bench->bss->op); + exit(1); + } +} + +/* For NOOP, BASELINE, LOOKUP, INSERT, UPDATE, and DELETE */ +static void *lpm_producer(void *unused __always_unused) +{ + int err; + char in[ETH_HLEN]; /* unused */ + + LIBBPF_OPTS(bpf_test_run_opts, opts, .data_in = in, + .data_size_in = sizeof(in), .repeat = 1, ); + + while (true) { + int fd = bpf_program__fd(ctx.bench->progs.run_bench); + err = bpf_prog_test_run_opts(fd, &opts); + if (err) { + fprintf(stderr, "failed to run BPF prog: %d\n", err); + exit(1); + } + + /* Check for kernel error code */ + if ((int)opts.retval < 0) { + fprintf(stderr, "BPF prog returned error: %d\n", + opts.retval); + exit(1); + } + + switch (opts.retval) { + case LPM_BENCH_SUCCESS: + break; + case LPM_BENCH_REINIT_MAP: + bench_reinit_map(); + break; + default: + fprintf(stderr, "Unexpected BPF prog return code %d for op %d\n", + opts.retval, ctx.bench->bss->op); + exit(1); + } + } + + return NULL; +} + +static void *lpm_free_producer(void *unused __always_unused) +{ + while (true) { + struct lpm_trie_map *skel; + + skel = lpm_trie_map__open_and_load(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + fill_map(bpf_map__fd(skel->maps.trie_free_map)); + lpm_trie_map__destroy(skel); + } + + return NULL; +} + +/* + * The standard bench op_report_*() functions assume measurements are + * taken over a 1-second interval but operations that modify the map + * (INSERT, DELETE, and FREE) cannot run indefinitely without + * "resetting" the map to the initial state. 
Depending on the size of + * the map, this likely needs to happen before the 1-second timer fires. + * + * Calculate the fraction of a second over which the op measurement was + * taken (to ignore any time spent doing the reset) and report the + * throughput results per second. + */ +static void frac_second_report_progress(int iter, struct bench_res *res, + long delta_ns, double rate_divisor, + char rate) +{ + double hits_per_sec, hits_per_prod; + + hits_per_sec = res->hits / rate_divisor / + (res->duration_ns / (double)NSEC_PER_SEC); + hits_per_prod = hits_per_sec / env.producer_cnt; + + printf("Iter %3d (%7.3lfus): ", iter, + (delta_ns - NSEC_PER_SEC) / 1000.0); + printf("hits %8.3lf%c/s (%7.3lf%c/prod)\n", hits_per_sec, rate, + hits_per_prod, rate); +} + +static void frac_second_report_final(struct bench_res res[], int res_cnt, + double lat_divisor, double rate_divisor, + char rate, const char *unit) +{ + double hits_mean = 0.0, hits_stddev = 0.0; + double latency = 0.0; + int i; + + for (i = 0; i < res_cnt; i++) { + double val = res[i].hits / rate_divisor / + (res[i].duration_ns / (double)NSEC_PER_SEC); + hits_mean += val / (0.0 + res_cnt); + latency += res[i].duration_ns / res[i].hits / (0.0 + res_cnt); + } + + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) { + double val = + res[i].hits / rate_divisor / + (res[i].duration_ns / (double)NSEC_PER_SEC); + hits_stddev += (hits_mean - val) * (hits_mean - val) / + (res_cnt - 1.0); + } + + hits_stddev = sqrt(hits_stddev); + } + printf("Summary: throughput %8.3lf \u00B1 %5.3lf %c ops/s (%7.3lf%c ops/prod), ", + hits_mean, hits_stddev, rate, hits_mean / env.producer_cnt, + rate); + printf("latency %8.3lf %s/op\n", + latency / lat_divisor / env.producer_cnt, unit); +} + +static void insert_ops_report_progress(int iter, struct bench_res *res, + long delta_ns) +{ + double rate_divisor = 1000000.0; + char rate = 'M'; + + frac_second_report_progress(iter, res, delta_ns, rate_divisor, rate); +} + +static void delete_ops_report_progress(int iter, struct bench_res *res, + long delta_ns) +{ + double rate_divisor = 1000000.0; + char rate = 'M'; + + frac_second_report_progress(iter, res, delta_ns, rate_divisor, rate); +} + +static void free_ops_report_progress(int iter, struct bench_res *res, + long delta_ns) +{ + double rate_divisor = 1000.0; + char rate = 'K'; + + frac_second_report_progress(iter, res, delta_ns, rate_divisor, rate); +} + +static void insert_ops_report_final(struct bench_res res[], int res_cnt) +{ + double lat_divisor = 1.0; + double rate_divisor = 1000000.0; + const char *unit = "ns"; + char rate = 'M'; + + frac_second_report_final(res, res_cnt, lat_divisor, rate_divisor, rate, + unit); +} + +static void delete_ops_report_final(struct bench_res res[], int res_cnt) +{ + double lat_divisor = 1.0; + double rate_divisor = 1000000.0; + const char *unit = "ns"; + char rate = 'M'; + + frac_second_report_final(res, res_cnt, lat_divisor, rate_divisor, rate, + unit); +} + +static void free_ops_report_final(struct bench_res res[], int res_cnt) +{ + double lat_divisor = 1000000.0; + double rate_divisor = 1000.0; + const char *unit = "ms"; + char rate = 'K'; + + frac_second_report_final(res, res_cnt, lat_divisor, rate_divisor, rate, + unit); +} + +/* noop bench measures harness-overhead */ +const struct bench bench_lpm_trie_noop = { + .name = "lpm-trie-noop", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_noop_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = 
ops_report_progress, + .report_final = ops_report_final, +}; + +/* baseline overhead for lookup and update */ +const struct bench bench_lpm_trie_baseline = { + .name = "lpm-trie-baseline", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_baseline_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* measure cost of doing a lookup on existing entries in a full trie */ +const struct bench bench_lpm_trie_lookup = { + .name = "lpm-trie-lookup", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_lookup_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* measure cost of inserting new entries into an empty trie */ +const struct bench bench_lpm_trie_insert = { + .name = "lpm-trie-insert", + .argp = &bench_lpm_trie_map_argp, + .validate = lpm_insert_validate, + .setup = lpm_insert_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = insert_ops_report_progress, + .report_final = insert_ops_report_final, +}; + +/* measure cost of updating existing entries in a full trie */ +const struct bench bench_lpm_trie_update = { + .name = "lpm-trie-update", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_update_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* measure cost of deleting existing entries from a full trie */ +const struct bench bench_lpm_trie_delete = { + .name = "lpm-trie-delete", + .argp = &bench_lpm_trie_map_argp, + .validate = lpm_delete_validate, + .setup = lpm_delete_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = delete_ops_report_progress, + .report_final = delete_ops_report_final, +}; + +/* measure cost of freeing a full trie */ +const struct bench bench_lpm_trie_free = { + .name = "lpm-trie-free", + .argp = &bench_lpm_trie_map_argp, + .validate = lpm_free_validate, + .setup = lpm_free_setup, + .producer_thread = lpm_free_producer, + .measure = lpm_measure, + .report_progress = free_ops_report_progress, + .report_final = free_ops_report_final, +}; diff --git a/tools/testing/selftests/bpf/progs/lpm_trie.h b/tools/testing/selftests/bpf/progs/lpm_trie.h new file mode 100644 index 00000000000000..76aa5821807f25 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lpm_trie.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PROGS_LPM_TRIE_H +#define __PROGS_LPM_TRIE_H + +struct trie_key { + __u32 prefixlen; + __u32 data; +}; + +/* Benchmark operations */ +enum { + LPM_OP_NOOP = 0, + LPM_OP_BASELINE, + LPM_OP_LOOKUP, + LPM_OP_INSERT, + LPM_OP_UPDATE, + LPM_OP_DELETE, + LPM_OP_FREE +}; + +/* + * Return values from run_bench. + * + * Negative values are also allowed and represent kernel error codes. 
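+ *
+ * Sketch of how the userspace producer consumes these codes (mirrors
+ * lpm_producer() in benchs/bench_lpm_trie_map.c):
+ *
+ *	err = bpf_prog_test_run_opts(prog_fd, &opts);
+ *	if (!err && opts.retval == LPM_BENCH_REINIT_MAP)
+ *		bench_reinit_map();	/* refill or empty the trie */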
+ */ +#define LPM_BENCH_SUCCESS 0 +#define LPM_BENCH_REINIT_MAP 1 /* Reset trie to initial state for current op */ + +#endif diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c new file mode 100644 index 00000000000000..a0e6ebd5507a92 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Cloudflare */ + +#include +#include +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_atomic.h" +#include "progs/lpm_trie.h" + +#define BPF_OBJ_NAME_LEN 16U +#define MAX_ENTRIES 100000000 +#define NR_LOOPS 10000 + +char _license[] SEC("license") = "GPL"; + +/* Filled by userspace. See fill_map() in bench_lpm_trie_map.c */ +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __type(key, struct trie_key); + __type(value, __u32); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, MAX_ENTRIES); +} trie_map SEC(".maps"); + +long hits; +long duration_ns; + +/* Configured from userspace */ +__u32 nr_entries; +__u32 prefixlen; +bool random; +__u8 op; + +static __u64 latency_free_start; + +SEC("fentry/bpf_map_free_deferred") +int BPF_PROG(trie_free_entry, struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + char name[BPF_OBJ_NAME_LEN]; + u32 map_type; + + map_type = BPF_CORE_READ(map, map_type); + if (map_type != BPF_MAP_TYPE_LPM_TRIE) + return 0; + + /* + * Ideally we'd have access to the map ID but that's already + * freed before we enter trie_free(). + */ + BPF_CORE_READ_STR_INTO(&name, map, name); + if (bpf_strncmp(name, BPF_OBJ_NAME_LEN, "trie_free_map")) + return 0; + + latency_free_start = bpf_ktime_get_ns(); + + return 0; +} + +SEC("fexit/bpf_map_free_deferred") +int BPF_PROG(trie_free_exit, struct work_struct *work) +{ + __u64 val; + + if (!latency_free_start) + return 0; + + val = bpf_ktime_get_ns() - latency_free_start; + latency_free_start = 0; + + __sync_add_and_fetch(&duration_ns, val); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +static __u32 cur_key; + +static __always_inline void generate_key(struct trie_key *key) +{ + key->prefixlen = prefixlen; + + if (random) + key->data = bpf_get_prandom_u32() % nr_entries; + else + key->data = cur_key++ % nr_entries; +} + +static int noop(__u32 index, __u32 *unused) +{ + return 0; +} + +static int baseline(__u32 index, __u32 *unused) +{ + struct trie_key key; + __u32 blackbox = 0; + + generate_key(&key); + /* Avoid compiler optimizing out the modulo */ + barrier_var(blackbox); + blackbox = READ_ONCE(key.data); + + return 0; +} + +static int lookup(__u32 index, int *retval) +{ + struct trie_key key; + + generate_key(&key); + if (!bpf_map_lookup_elem(&trie_map, &key)) { + *retval = -ENOENT; + return 1; + } + + return 0; +} + +static int insert(__u32 index, int *retval) +{ + struct trie_key key; + u32 val = 1; + int err; + + generate_key(&key); + err = bpf_map_update_elem(&trie_map, &key, &val, BPF_NOEXIST); + if (err) { + *retval = err; + return 1; + } + + /* Is this the last entry? 
*/ + if (key.data == nr_entries - 1) { + /* For atomicity concerns, see the comment in delete() */ + *retval = LPM_BENCH_REINIT_MAP; + return 1; + } + + return 0; +} + +static int update(__u32 index, int *retval) +{ + struct trie_key key; + u32 val = 1; + int err; + + generate_key(&key); + err = bpf_map_update_elem(&trie_map, &key, &val, BPF_EXIST); + if (err) { + *retval = err; + return 1; + } + + return 0; +} + +static int delete(__u32 index, int *retval) +{ + struct trie_key key; + int err; + + generate_key(&key); + err = bpf_map_delete_elem(&trie_map, &key); + if (err) { + *retval = err; + return 1; + } + + /* Do we need to refill the map? */ + if (key.data == nr_entries - 1) { + /* + * Atomicity isn't required because DELETE only supports + * one producer running concurrently. What we need is a + * way to track how many entries have been deleted from + * the trie between consecutive invocations of the BPF + * prog because a single bpf_loop() call might not + * delete all entries, e.g. when NR_LOOPS < nr_entries. + */ + *retval = LPM_BENCH_REINIT_MAP; + return 1; + } + + return 0; +} + +SEC("xdp") +int BPF_PROG(run_bench) +{ + int err = LPM_BENCH_SUCCESS; + u64 start, delta; + int loops; + + start = bpf_ktime_get_ns(); + + switch (op) { + case LPM_OP_NOOP: + loops = bpf_loop(NR_LOOPS, noop, NULL, 0); + break; + case LPM_OP_BASELINE: + loops = bpf_loop(NR_LOOPS, baseline, NULL, 0); + break; + case LPM_OP_LOOKUP: + loops = bpf_loop(NR_LOOPS, lookup, &err, 0); + break; + case LPM_OP_INSERT: + loops = bpf_loop(NR_LOOPS, insert, &err, 0); + break; + case LPM_OP_UPDATE: + loops = bpf_loop(NR_LOOPS, update, &err, 0); + break; + case LPM_OP_DELETE: + loops = bpf_loop(NR_LOOPS, delete, &err, 0); + break; + default: + bpf_printk("invalid benchmark operation\n"); + return -1; + } + + delta = bpf_ktime_get_ns() - start; + + __sync_add_and_fetch(&duration_ns, delta); + __sync_add_and_fetch(&hits, loops); + + return err; +} diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_map.c b/tools/testing/selftests/bpf/progs/lpm_trie_map.c new file mode 100644 index 00000000000000..6e60d686b664fc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lpm_trie_map.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#define MAX_ENTRIES 100000000 + +struct trie_key { + __u32 prefixlen; + __u32 data; +}; + +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __type(key, struct trie_key); + __type(value, __u32); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, MAX_ENTRIES); +} trie_free_map SEC(".maps"); From 131897c65e2b86cf14bec7379f44aa8fbb407526 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sun, 24 Aug 2025 23:11:57 +0800 Subject: [PATCH 0562/3206] erofs: fix invalid algorithm for encoded extents The current algorithm sanity checks do not properly apply to new encoded extents. Unify the algorithm check with Z_EROFS_COMPRESSION(_RUNTIME)_MAX and ensure consistency with sbi->available_compr_algs. 
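Condensed from the new z_erofs_map_sanity_check() below, the unified algorithm check for encoded extents is roughly:

	if (map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)
		return -EOPNOTSUPP;	/* unknown to this kernel */
	if (map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX &&
	    !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))
		return -EFSCORRUPTED;	/* not advertised by the superblock */

so algorithms this kernel doesn't know about fail with -EOPNOTSUPP, while known-but-unadvertised ones are treated as on-disk corruption.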
Reported-and-tested-by: syzbot+5a398eb460ddaa6f242f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/68a8bd20.050a0220.37038e.005a.GAE@google.com Fixes: 1d191b4ca51d ("erofs: implement encoded extent metadata") Thanks-to: Edward Adam Davis Signed-off-by: Gao Xiang --- fs/erofs/zmap.c | 67 +++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index a93efd95c5556d..798223e6da9ce1 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -394,10 +394,10 @@ static int z_erofs_map_blocks_fo(struct inode *inode, .map = map, .in_mbox = erofs_inode_in_metabox(inode), }; - int err = 0; - unsigned int endoff, afmt; + unsigned int endoff; unsigned long initial_lcn; unsigned long long ofs, end; + int err; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la; if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) && @@ -482,20 +482,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode, err = -EFSCORRUPTED; goto unmap_out; } - afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? - Z_EROFS_COMPRESSION_INTERLACED : - Z_EROFS_COMPRESSION_SHIFTED; + if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) + map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { + map->m_algorithmformat = vi->z_algorithmtype[1]; } else { - afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? - vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; - if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { - erofs_err(sb, "inconsistent algorithmtype %u for nid %llu", - afmt, vi->nid); - err = -EFSCORRUPTED; - goto unmap_out; - } + map->m_algorithmformat = vi->z_algorithmtype[0]; } - map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && @@ -626,9 +621,9 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; - int err, headnr; - erofs_off_t pos; struct z_erofs_map_header *h; + erofs_off_t pos; + int err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { /* @@ -642,7 +637,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) return -ERESTARTSYS; - err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; @@ -679,15 +673,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) vi->z_idata_size = le16_to_cpu(h->h_idata_size); - headnr = 0; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || - vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", - headnr + 1, vi->z_algorithmtype[headnr], vi->nid); - err = -EOPNOTSUPP; - goto out_unlock; - } - if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { @@ -726,6 +711,30 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) return err; } +static int z_erofs_map_sanity_check(struct inode *inode, + struct erofs_map_blocks *map) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + + if (!(map->m_flags & EROFS_MAP_ENCODED)) + return 0; + if (unlikely(map->m_algorithmformat >= 
Z_EROFS_COMPRESSION_RUNTIME_MAX)) { + erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel", + map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid); + return -EOPNOTSUPP; + } + if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX && + !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) { + erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + map->m_algorithmformat, EROFS_I(inode)->nid); + return -EFSCORRUPTED; + } + if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || + map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) + return -EOPNOTSUPP; + return 0; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { @@ -746,10 +755,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, else err = z_erofs_map_blocks_fo(inode, map, flags); } - if (!err && (map->m_flags & EROFS_MAP_ENCODED) && - unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || - map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) - err = -EOPNOTSUPP; + if (!err) + err = z_erofs_map_sanity_check(inode, map); if (err) map->m_llen = 0; } From e2ab5f600bb01d3625d667d97b3eb7538e388336 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Onur=20=C3=96zkan?= Date: Thu, 21 Aug 2025 12:07:20 +0300 Subject: [PATCH 0563/3206] rust: regulator: use `to_result` for error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplifies error handling by replacing the manual check of the return value with the `to_result` helper. Signed-off-by: Onur Özkan Reviewed-by: Daniel Almeida Message-ID: <20250821090720.23939-1-work@onurozkan.dev> Signed-off-by: Mark Brown --- rust/kernel/regulator.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/rust/kernel/regulator.rs b/rust/kernel/regulator.rs index 704147e18bfc9c..34bb24ec8d4d43 100644 --- a/rust/kernel/regulator.rs +++ b/rust/kernel/regulator.rs @@ -267,11 +267,8 @@ impl Regulator { pub fn get_voltage(&self) -> Result { // SAFETY: Safe as per the type invariants of `Regulator`. let voltage = unsafe { bindings::regulator_get_voltage(self.inner.as_ptr()) }; - if voltage < 0 { - Err(kernel::error::Error::from_errno(voltage)) - } else { - Ok(Voltage::from_microvolts(voltage)) - } + + to_result(voltage).map(|()| Voltage::from_microvolts(voltage)) } fn get_internal(dev: &Device, name: &CStr) -> Result> { From c73c378dc05ba8060558b9b50e37f3afa4763ea1 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Tue, 26 Aug 2025 14:24:11 -0700 Subject: [PATCH 0564/3206] spi: rb4xx: depend on OF There's no support for non OF platforms. Better to depend on OF explicitly. Also fixes a warning/error about the dt table being unused because of of_match_ptr on non OF platforms. Signed-off-by: Rosen Penev Message-ID: <20250826212413.15065-2-rosenp@gmail.com> Signed-off-by: Mark Brown --- drivers/spi/Kconfig | 1 + drivers/spi/spi-rb4xx.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 891729c9c5642a..cdeaa8e711fd7c 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -917,6 +917,7 @@ config SPI_ROCKCHIP_SFC config SPI_RB4XX tristate "Mikrotik RB4XX SPI master" depends on SPI_MASTER && ATH79 + depends on OF help SPI controller driver for the Mikrotik RB4xx series boards. 
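A note on the warning mentioned in the changelog, before the hunk that removes the wrapper: of_match_ptr() is defined in include/linux/of.h roughly as follows (a paraphrased sketch, not a verbatim copy), so on CONFIG_OF=n builds the match table is compiled in but referenced nowhere, which is what triggers the "defined but not used" warning:

#ifdef CONFIG_OF
#define of_match_ptr(_ptr)	(_ptr)
#else
#define of_match_ptr(_ptr)	NULL	/* table ends up unreferenced */
#endif

With the driver now depending on OF, the wrapper serves no purpose and the table can be referenced directly, as the following hunk does.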
diff --git a/drivers/spi/spi-rb4xx.c b/drivers/spi/spi-rb4xx.c index e71d3805b150de..a795e263299e69 100644 --- a/drivers/spi/spi-rb4xx.c +++ b/drivers/spi/spi-rb4xx.c @@ -199,7 +199,7 @@ static struct platform_driver rb4xx_spi_drv = { .remove = rb4xx_spi_remove, .driver = { .name = "rb4xx-spi", - .of_match_table = of_match_ptr(rb4xx_spi_dt_match), + .of_match_table = rb4xx_spi_dt_match, }, }; From f18f0ac5331f8f13737e87cc1416837fb5b27b0a Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Tue, 26 Aug 2025 14:24:12 -0700 Subject: [PATCH 0565/3206] spi: rb4xx: add COMPILE_TEST support Copy macros from ath79 SPI driver to allow compilation on all platforms and remove ath79 specific header. Signed-off-by: Rosen Penev Message-ID: <20250826212413.15065-3-rosenp@gmail.com> Signed-off-by: Mark Brown --- drivers/spi/Kconfig | 2 +- drivers/spi/spi-rb4xx.c | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index cdeaa8e711fd7c..f7020d35b3a52c 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -916,7 +916,7 @@ config SPI_ROCKCHIP_SFC config SPI_RB4XX tristate "Mikrotik RB4XX SPI master" - depends on SPI_MASTER && ATH79 + depends on SPI_MASTER && (ATH79 || COMPILE_TEST) depends on OF help SPI controller driver for the Mikrotik RB4xx series boards. diff --git a/drivers/spi/spi-rb4xx.c b/drivers/spi/spi-rb4xx.c index a795e263299e69..bae802e9422697 100644 --- a/drivers/spi/spi-rb4xx.c +++ b/drivers/spi/spi-rb4xx.c @@ -16,7 +16,16 @@ #include #include -#include +#define AR71XX_SPI_REG_FS 0x00 /* Function Select */ +#define AR71XX_SPI_REG_CTRL 0x04 /* SPI Control */ +#define AR71XX_SPI_REG_IOC 0x08 /* SPI I/O Control */ +#define AR71XX_SPI_REG_RDS 0x0c /* Read Data Shift */ + +#define AR71XX_SPI_FS_GPIO BIT(0) /* Enable GPIO mode */ + +#define AR71XX_SPI_IOC_DO BIT(0) /* Data Out pin */ +#define AR71XX_SPI_IOC_CLK BIT(8) /* CLK pin */ +#define AR71XX_SPI_IOC_CS(n) BIT(16 + (n)) struct rb4xx_spi { void __iomem *base; @@ -63,7 +72,7 @@ static inline void do_spi_clk_two(struct rb4xx_spi *rbspi, u32 spi_ioc, if (value & BIT(1)) regval |= AR71XX_SPI_IOC_DO; if (value & BIT(0)) - regval |= AR71XX_SPI_IOC_CS2; + regval |= AR71XX_SPI_IOC_CS(2); rb4xx_write(rbspi, AR71XX_SPI_REG_IOC, regval); rb4xx_write(rbspi, AR71XX_SPI_REG_IOC, regval | AR71XX_SPI_IOC_CLK); @@ -89,7 +98,7 @@ static void rb4xx_set_cs(struct spi_device *spi, bool enable) */ if (enable) rb4xx_write(rbspi, AR71XX_SPI_REG_IOC, - AR71XX_SPI_IOC_CS0 | AR71XX_SPI_IOC_CS1); + AR71XX_SPI_IOC_CS(0) | AR71XX_SPI_IOC_CS(1)); } static int rb4xx_transfer_one(struct spi_controller *host, @@ -109,10 +118,10 @@ static int rb4xx_transfer_one(struct spi_controller *host, */ if (spi_get_chipselect(spi, 0) == 2) /* MMC */ - spi_ioc = AR71XX_SPI_IOC_CS0; + spi_ioc = AR71XX_SPI_IOC_CS(0); else /* Boot flash and CPLD */ - spi_ioc = AR71XX_SPI_IOC_CS1; + spi_ioc = AR71XX_SPI_IOC_CS(1); tx_buf = t->tx_buf; rx_buf = t->rx_buf; From ff9a7857b7848227788f113d6dc6a72e989084e0 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Tue, 26 Aug 2025 14:24:13 -0700 Subject: [PATCH 0566/3206] spi: rb4xx: use devm for clk_prepare_enable Remove the remove function as it no longer does anything useful. Also remove platform_set_drvdata as get becomes completely unused. 
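The conversion leans on the managed clock helper from linux/clk.h; a minimal sketch of the resulting probe shape (simplified and hypothetical apart from the "ahb" consumer id, which matches the diff below):

static int example_probe(struct platform_device *pdev)
{
	struct clk *ahb_clk;

	/* clk_get + clk_prepare_enable in one call; devres disables,
	 * unprepares and puts the clock when the device unbinds, so
	 * neither a .remove callback nor error-path unwinding is needed */
	ahb_clk = devm_clk_get_enabled(&pdev->dev, "ahb");
	if (IS_ERR(ahb_clk))
		return PTR_ERR(ahb_clk);

	return 0;
}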
Signed-off-by: Rosen Penev Message-ID: <20250826212413.15065-4-rosenp@gmail.com> Signed-off-by: Mark Brown --- drivers/spi/spi-rb4xx.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/spi/spi-rb4xx.c b/drivers/spi/spi-rb4xx.c index bae802e9422697..22b86fc8913262 100644 --- a/drivers/spi/spi-rb4xx.c +++ b/drivers/spi/spi-rb4xx.c @@ -156,7 +156,7 @@ static int rb4xx_spi_probe(struct platform_device *pdev) if (!host) return -ENOMEM; - ahb_clk = devm_clk_get(&pdev->dev, "ahb"); + ahb_clk = devm_clk_get_enabled(&pdev->dev, "ahb"); if (IS_ERR(ahb_clk)) return PTR_ERR(ahb_clk); @@ -172,7 +172,6 @@ static int rb4xx_spi_probe(struct platform_device *pdev) rbspi = spi_controller_get_devdata(host); rbspi->base = spi_base; rbspi->clk = ahb_clk; - platform_set_drvdata(pdev, rbspi); err = devm_spi_register_controller(&pdev->dev, host); if (err) { @@ -180,23 +179,12 @@ static int rb4xx_spi_probe(struct platform_device *pdev) return err; } - err = clk_prepare_enable(ahb_clk); - if (err) - return err; - /* Enable SPI */ rb4xx_write(rbspi, AR71XX_SPI_REG_FS, AR71XX_SPI_FS_GPIO); return 0; } -static void rb4xx_spi_remove(struct platform_device *pdev) -{ - struct rb4xx_spi *rbspi = platform_get_drvdata(pdev); - - clk_disable_unprepare(rbspi->clk); -} - static const struct of_device_id rb4xx_spi_dt_match[] = { { .compatible = "mikrotik,rb4xx-spi" }, { }, @@ -205,7 +193,6 @@ MODULE_DEVICE_TABLE(of, rb4xx_spi_dt_match); static struct platform_driver rb4xx_spi_drv = { .probe = rb4xx_spi_probe, - .remove = rb4xx_spi_remove, .driver = { .name = "rb4xx-spi", .of_match_table = rb4xx_spi_dt_match, From c42e36a488c7e01f833fc9f4814f735b66b2d494 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 27 Aug 2025 13:16:12 +0300 Subject: [PATCH 0567/3206] spi: Drop dev_pm_domain_detach() call Starting with commit f99508074e78 ("PM: domains: Detach on device_unbind_cleanup()"), there is no longer a need to call dev_pm_domain_detach() in the bus remove function. The device_unbind_cleanup() function now handles this to avoid invoking devres cleanup handlers while the PM domain is powered off, which could otherwise lead to failures as described in the above-mentioned commit. Drop the explicit dev_pm_domain_detach() call and rely instead on the flags passed to dev_pm_domain_attach() to power off the domain. 
Signed-off-by: Claudiu Beznea Reviewed-by: Ulf Hansson Message-ID: <20250827101612.928008-1-claudiu.beznea.uj@bp.renesas.com> Signed-off-by: Mark Brown --- drivers/spi/spi.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index a388f372b27a7f..f95c4304df8e91 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -427,15 +427,13 @@ static int spi_probe(struct device *dev) if (spi->irq < 0) spi->irq = 0; - ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); + ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON | + PD_FLAG_DETACH_POWER_OFF); if (ret) return ret; - if (sdrv->probe) { + if (sdrv->probe) ret = sdrv->probe(spi); - if (ret) - dev_pm_domain_detach(dev, true); - } return ret; } @@ -446,8 +444,6 @@ static void spi_remove(struct device *dev) if (sdrv->remove) sdrv->remove(to_spi_device(dev)); - - dev_pm_domain_detach(dev, true); } static void spi_shutdown(struct device *dev) From 8d2f9f5c64f16e0717854fb66d795ebe8c30103b Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Mon, 4 Aug 2025 14:08:14 +0200 Subject: [PATCH 0568/3206] xfs: allow renames of project-less inodes Special file inodes cannot have project ID set from userspace and are skipped during initial project setup. Those inodes are left project-less in the project directory. New inodes created after project initialization do have an ID. Creating hard links or renaming those project-less inodes then fails on different ID check. In commit e23d7e82b707 ("xfs: allow cross-linking special files without project quota"), we relaxed the project id checks to allow hardlinking special files with differing project ids since the projid cannot be changed. Apply the same workaround for renaming operations. Signed-off-by: Andrey Albershteyn Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_inode.c | 64 ++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a32a..0ddb9ce0f5e36f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -877,6 +877,35 @@ xfs_create_tmpfile( return error; } +static inline int +xfs_projid_differ( + struct xfs_inode *tdp, + struct xfs_inode *sip) +{ + /* + * If we are using project inheritance, we only allow hard link/renames + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && + tdp->i_projid != sip->i_projid)) { + /* + * Project quota setup skips special files which can + * leave inodes in a PROJINHERIT directory without a + * project ID set. We need to allow links to be made + * to these "project-less" inodes because userspace + * expects them to succeed after project ID setup, + * but everything else should be rejected. + */ + if (!special_file(VFS_I(sip)->i_mode) || + sip->i_projid != 0) { + return -EXDEV; + } + } + + return 0; +} + int xfs_link( struct xfs_inode *tdp, @@ -930,27 +959,9 @@ xfs_link( goto error_return; } - /* - * If we are using project inheritance, we only allow hard link - * creation in our tree when the project IDs are the same; else - * the tree quota mechanism could be circumvented. - */ - if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - tdp->i_projid != sip->i_projid)) { - /* - * Project quota setup skips special files which can - * leave inodes in a PROJINHERIT directory without a - * project ID set. 
We need to allow links to be made - * to these "project-less" inodes because userspace - * expects them to succeed after project ID setup, - * but everything else should be rejected. - */ - if (!special_file(VFS_I(sip)->i_mode) || - sip->i_projid != 0) { - error = -EXDEV; - goto error_return; - } - } + error = xfs_projid_differ(tdp, sip); + if (error) + goto error_return; error = xfs_dir_add_child(tp, resblks, &du); if (error) @@ -2227,16 +2238,9 @@ xfs_rename( if (du_wip.ip) xfs_trans_ijoin(tp, du_wip.ip, 0); - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - target_dp->i_projid != src_ip->i_projid)) { - error = -EXDEV; + error = xfs_projid_differ(target_dp, src_ip); + if (error) goto out_trans_cancel; - } /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) { From 8a221004fe5288b66503699a329a6b623be13f91 Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Mon, 4 Aug 2025 14:08:15 +0200 Subject: [PATCH 0569/3206] xfs: add .fileattr_set and fileattr_get callbacks for symlinks As there are now file_getattr() and file_setattr(), xfs_quota will call them on special files. These new syscalls call ->fileattr_get/set. Symlink inodes don't have callbacks to set file attributes. This patch adds them. The attribute value combinations are checked in fileattr_set_prepare(). Signed-off-by: Andrey Albershteyn Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_iops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 603effabe1ee12..c0f49bfbed6824 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1335,6 +1335,8 @@ static const struct inode_operations xfs_symlink_inode_operations = { .setattr = xfs_vn_setattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; /* Figure out if this file actually supports DAX. */ From 0239bd9fa445a21def88f7e76fe6e0414b2a4da0 Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Mon, 4 Aug 2025 14:08:16 +0200 Subject: [PATCH 0570/3206] xfs: allow setting file attributes on special files XFS doesn't have file attribute manipulation ioctls for special files. Changing or reading file attributes is rejected for them in xfs_fileattr_*et(). In XFS, this needs to work for project quota directories. When a project is set up, xfs_quota opens and calls FS_IOC_SETFSXATTR on every inode in the directory. However, special files are skipped due to open() returning a special inode for them. So, they don't even get to this check. The recently added file_getattr/file_setattr will call xfs_fileattr_*et() on special files. This patch allows reading/changing file attributes on special files. Signed-off-by: Andrey Albershteyn Reviewed-by: Darrick J.
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_ioctl.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index e1051a530a50fe..10a3a5160f4eb0 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -512,9 +512,6 @@ xfs_fileattr_get( { struct xfs_inode *ip = XFS_I(d_inode(dentry)); - if (d_is_special(dentry)) - return -ENOTTY; - xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -736,9 +733,6 @@ xfs_fileattr_set( trace_xfs_ioctl_setattr(ip); - if (d_is_special(dentry)) - return -ENOTTY; - if (!fa->fsx_valid) { if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | FS_NODUMP_FL | From 851c4c96db001f51bdad1432aa54549c7fe2c63e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Aug 2025 13:15:00 +0200 Subject: [PATCH 0571/3206] xfs: implement XFS_IOC_DIOINFO in terms of vfs_getattr Use the direct I/O alignment reporting from ->getattr instead of reimplementing it. This exposes the relaxation of the memory alignment in the XFS_IOC_DIOINFO info and ensure the information will stay in sync. Note that randholes.c in xfstests has a bug where it incorrectly fails when the required memory alignment is smaller than the pointer size. Round up the reported value as there is a fair chance that this code got copied into various applications. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_ioctl.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 10a3a5160f4eb0..a6bb7ee7a27ad5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1203,21 +1203,21 @@ xfs_file_ioctl( current->comm); return -ENOTTY; case XFS_IOC_DIOINFO: { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct kstat st; struct dioattr da; - da.d_mem = target->bt_logical_sectorsize; + error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0); + if (error) + return error; /* - * See xfs_report_dioalign() for an explanation about why this - * reports a value larger than the sector size for COW inodes. + * Some userspace directly feeds the return value to + * posix_memalign, which fails for values that are smaller than + * the pointer size. Round up the value to not break userspace. */ - if (xfs_is_cow_inode(ip)) - da.d_miniosz = xfs_inode_alloc_unitsize(ip); - else - da.d_miniosz = target->bt_logical_sectorsize; + da.d_mem = roundup(st.dio_mem_align, sizeof(void *)); + da.d_miniosz = st.dio_offset_align; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); - if (copy_to_user(arg, &da, sizeof(da))) return -EFAULT; return 0; From f544bf03a771ee746b908e9a08ecb97c65a35055 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 31 Jul 2025 10:35:14 +0200 Subject: [PATCH 0572/3206] mtd: MTD_INTEL_DG should depend on DRM_I915 or DRM_XE Intel Discrete Graphics non-volatile memory is only present on Intel discrete graphics devices, and its auxiliary device is instantiated by the Intel i915 and Xe2 DRM drivers. Hence add dependencies on DRM_I915 and DRM_XE, to prevent asking the user about this driver when configuring a kernel without Intel graphics support. 
Fixes: ceb5ab3cb6463795 ("mtd: add driver for intel graphics non-volatile memory device") Signed-off-by: Geert Uytterhoeven Signed-off-by: Miquel Raynal --- drivers/mtd/devices/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig index 46cebde79f34b0..e518dfeee65426 100644 --- a/drivers/mtd/devices/Kconfig +++ b/drivers/mtd/devices/Kconfig @@ -185,8 +185,8 @@ config MTD_POWERNV_FLASH config MTD_INTEL_DG tristate "Intel Discrete Graphics non-volatile memory driver" - depends on AUXILIARY_BUS - depends on MTD + depends on AUXILIARY_BUS && MTD + depends on DRM_I915!=n || DRM_XE!=n || COMPILE_TEST help This provides an MTD device to access Intel Discrete Graphics non-volatile memory. From 1eae113dd5ff5192cfd3e11b6ab7b96193b42c01 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 30 Jul 2025 21:47:46 +0200 Subject: [PATCH 0573/3206] mtd: rawnand: nuvoton: Fix an error handling path in ma35_nand_chips_init() If a ma35_nand_chip_init() call fails, then a reference to 'nand_np' still needs to be released. Use for_each_child_of_node_scoped() to fix the issue. Fixes: 5abb5d414d55 ("mtd: rawnand: nuvoton: add new driver for the Nuvoton MA35 SoC") Signed-off-by: Christophe JAILLET Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c b/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c index c23b537948d5e6..1a285cd8fad62a 100644 --- a/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c +++ b/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c @@ -935,10 +935,10 @@ static void ma35_chips_cleanup(struct ma35_nand_info *nand) static int ma35_nand_chips_init(struct device *dev, struct ma35_nand_info *nand) { - struct device_node *np = dev->of_node, *nand_np; + struct device_node *np = dev->of_node; int ret; - for_each_child_of_node(np, nand_np) { + for_each_child_of_node_scoped(np, nand_np) { ret = ma35_nand_chip_init(dev, nand, nand_np); if (ret) { ma35_chips_cleanup(nand); From 513c40e59d5a414ab763a9c84797534b5e8c208d Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Tue, 12 Aug 2025 09:26:58 +0200 Subject: [PATCH 0574/3206] mtd: rawnand: stm32_fmc2: avoid overlapping mappings on ECC buffer Avoid below overlapping mappings by using a contiguous non-cacheable buffer. 
[ 4.077708] DMA-API: stm32_fmc2_nfc 48810000.nand-controller: cacheline tracking EEXIST, overlapping mappings aren't supported [ 4.089103] WARNING: CPU: 1 PID: 44 at kernel/dma/debug.c:568 add_dma_entry+0x23c/0x300 [ 4.097071] Modules linked in: [ 4.100101] CPU: 1 PID: 44 Comm: kworker/u4:2 Not tainted 6.1.82 #1 [ 4.106346] Hardware name: STMicroelectronics STM32MP257F VALID1 SNOR / MB1704 (LPDDR4 Power discrete) + MB1703 + MB1708 (SNOR MB1730) (DT) [ 4.118824] Workqueue: events_unbound deferred_probe_work_func [ 4.124674] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 4.131624] pc : add_dma_entry+0x23c/0x300 [ 4.135658] lr : add_dma_entry+0x23c/0x300 [ 4.139792] sp : ffff800009dbb490 [ 4.143016] x29: ffff800009dbb4a0 x28: 0000000004008022 x27: ffff8000098a6000 [ 4.150174] x26: 0000000000000000 x25: ffff8000099e7000 x24: ffff8000099e7de8 [ 4.157231] x23: 00000000ffffffff x22: 0000000000000000 x21: ffff8000098a6a20 [ 4.164388] x20: ffff000080964180 x19: ffff800009819ba0 x18: 0000000000000006 [ 4.171545] x17: 6361727420656e69 x16: 6c6568636163203a x15: 72656c6c6f72746e [ 4.178602] x14: 6f632d646e616e2e x13: ffff800009832f58 x12: 00000000000004ec [ 4.185759] x11: 00000000000001a4 x10: ffff80000988af58 x9 : ffff800009832f58 [ 4.192916] x8 : 00000000ffffefff x7 : ffff80000988af58 x6 : 80000000fffff000 [ 4.199972] x5 : 000000000000bff4 x4 : 0000000000000000 x3 : 0000000000000000 [ 4.207128] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0000812d2c40 [ 4.214185] Call trace: [ 4.216605] add_dma_entry+0x23c/0x300 [ 4.220338] debug_dma_map_sg+0x198/0x350 [ 4.224373] __dma_map_sg_attrs+0xa0/0x110 [ 4.228411] dma_map_sg_attrs+0x10/0x2c [ 4.232247] stm32_fmc2_nfc_xfer.isra.0+0x1c8/0x3fc [ 4.237088] stm32_fmc2_nfc_seq_read_page+0xc8/0x174 [ 4.242127] nand_read_oob+0x1d4/0x8e0 [ 4.245861] mtd_read_oob_std+0x58/0x84 [ 4.249596] mtd_read_oob+0x90/0x150 [ 4.253231] mtd_read+0x68/0xac Signed-off-by: Christophe Kerello Cc: stable@vger.kernel.org Fixes: 2cd457f328c1 ("mtd: rawnand: stm32_fmc2: add STM32 FMC2 NAND flash controller driver") Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/stm32_fmc2_nand.c | 28 +++++++++----------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/drivers/mtd/nand/raw/stm32_fmc2_nand.c b/drivers/mtd/nand/raw/stm32_fmc2_nand.c index a960403081f110..222c3c3b684841 100644 --- a/drivers/mtd/nand/raw/stm32_fmc2_nand.c +++ b/drivers/mtd/nand/raw/stm32_fmc2_nand.c @@ -272,6 +272,7 @@ struct stm32_fmc2_nfc { struct sg_table dma_data_sg; struct sg_table dma_ecc_sg; u8 *ecc_buf; + dma_addr_t dma_ecc_addr; int dma_ecc_len; u32 tx_dma_max_burst; u32 rx_dma_max_burst; @@ -902,17 +903,10 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, if (!write_data && !raw) { /* Configure DMA ECC status */ - p = nfc->ecc_buf; for_each_sg(nfc->dma_ecc_sg.sgl, sg, eccsteps, s) { - sg_set_buf(sg, p, nfc->dma_ecc_len); - p += nfc->dma_ecc_len; - } - - ret = dma_map_sg(nfc->dev, nfc->dma_ecc_sg.sgl, - eccsteps, dma_data_dir); - if (!ret) { - ret = -EIO; - goto err_unmap_data; + sg_dma_address(sg) = nfc->dma_ecc_addr + + s * nfc->dma_ecc_len; + sg_dma_len(sg) = nfc->dma_ecc_len; } desc_ecc = dmaengine_prep_slave_sg(nfc->dma_ecc_ch, @@ -921,7 +915,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, DMA_PREP_INTERRUPT); if (!desc_ecc) { ret = -ENOMEM; - goto err_unmap_ecc; + goto err_unmap_data; } reinit_completion(&nfc->dma_ecc_complete); @@ -929,7 +923,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const 
u8 *buf, desc_ecc->callback_param = &nfc->dma_ecc_complete; ret = dma_submit_error(dmaengine_submit(desc_ecc)); if (ret) - goto err_unmap_ecc; + goto err_unmap_data; dma_async_issue_pending(nfc->dma_ecc_ch); } @@ -949,7 +943,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, if (!write_data && !raw) dmaengine_terminate_all(nfc->dma_ecc_ch); ret = -ETIMEDOUT; - goto err_unmap_ecc; + goto err_unmap_data; } /* Wait DMA data transfer completion */ @@ -969,11 +963,6 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, } } -err_unmap_ecc: - if (!write_data && !raw) - dma_unmap_sg(nfc->dev, nfc->dma_ecc_sg.sgl, - eccsteps, dma_data_dir); - err_unmap_data: dma_unmap_sg(nfc->dev, nfc->dma_data_sg.sgl, eccsteps, dma_data_dir); @@ -1610,7 +1599,8 @@ static int stm32_fmc2_nfc_dma_setup(struct stm32_fmc2_nfc *nfc) return ret; /* Allocate a buffer to store ECC status registers */ - nfc->ecc_buf = devm_kzalloc(nfc->dev, FMC2_MAX_ECC_BUF_LEN, GFP_KERNEL); + nfc->ecc_buf = dmam_alloc_coherent(nfc->dev, FMC2_MAX_ECC_BUF_LEN, + &nfc->dma_ecc_addr, GFP_KERNEL); if (!nfc->ecc_buf) return -ENOMEM; From 811c0da4542df3c065f6cb843ced68780e27bb44 Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Tue, 12 Aug 2025 09:30:08 +0200 Subject: [PATCH 0575/3206] mtd: rawnand: stm32_fmc2: fix ECC overwrite In case OOB write is requested during a data write, ECC is currently lost. Avoid this issue by only writing in the free spare area. This issue has been seen with a YAFFS2 file system. Signed-off-by: Christophe Kerello Cc: stable@vger.kernel.org Fixes: 2cd457f328c1 ("mtd: rawnand: stm32_fmc2: add STM32 FMC2 NAND flash controller driver") Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/stm32_fmc2_nand.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/nand/raw/stm32_fmc2_nand.c b/drivers/mtd/nand/raw/stm32_fmc2_nand.c index 222c3c3b684841..d957327fb4fa04 100644 --- a/drivers/mtd/nand/raw/stm32_fmc2_nand.c +++ b/drivers/mtd/nand/raw/stm32_fmc2_nand.c @@ -985,9 +985,21 @@ static int stm32_fmc2_nfc_seq_write(struct nand_chip *chip, const u8 *buf, /* Write oob */ if (oob_required) { - ret = nand_change_write_column_op(chip, mtd->writesize, - chip->oob_poi, mtd->oobsize, - false); + unsigned int offset_in_page = mtd->writesize; + const void *buf = chip->oob_poi; + unsigned int len = mtd->oobsize; + + if (!raw) { + struct mtd_oob_region oob_free; + + mtd_ooblayout_free(mtd, 0, &oob_free); + offset_in_page += oob_free.offset; + buf += oob_free.offset; + len = oob_free.length; + } + + ret = nand_change_write_column_op(chip, offset_in_page, + buf, len, false); if (ret) return ret; } From fd779eac2d659668be4d3dbdac0710afd5d6db12 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Thu, 21 Aug 2025 14:00:57 +0200 Subject: [PATCH 0576/3206] mtd: nand: raw: atmel: Respect tAR, tCLR in read setup timing Having setup time 0 violates tAR, tCLR of some chips, for instance TOSHIBA TC58NVG2S3ETAI0 cannot be detected successfully (first ID byte being read duplicated, i.e. 98 98 dc 90 15 76 14 03 instead of 98 dc 90 15 76 ...). Atmel Application Notes postulated 1 cycle NRD_SETUP without explanation [1], but it looks more appropriate to just calculate setup time properly. 
[1] Link: https://ww1.microchip.com/downloads/aemDocuments/documents/MPU32/ApplicationNotes/ApplicationNotes/doc6255.pdf Cc: stable@vger.kernel.org Fixes: f9ce2eddf176 ("mtd: nand: atmel: Add ->setup_data_interface() hooks") Signed-off-by: Alexander Sverdlin Tested-by: Alexander Dahl Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/atmel/nand-controller.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/nand/raw/atmel/nand-controller.c b/drivers/mtd/nand/raw/atmel/nand-controller.c index 84ab4a83cbd686..db94d14a3807f5 100644 --- a/drivers/mtd/nand/raw/atmel/nand-controller.c +++ b/drivers/mtd/nand/raw/atmel/nand-controller.c @@ -1377,14 +1377,24 @@ static int atmel_smc_nand_prepare_smcconf(struct atmel_nand *nand, if (ret) return ret; + /* + * Read setup timing depends on the operation done on the NAND: + * + * NRD_SETUP = max(tAR, tCLR) + */ + timeps = max(conf->timings.sdr.tAR_min, conf->timings.sdr.tCLR_min); + ncycles = DIV_ROUND_UP(timeps, mckperiodps); + totalcycles += ncycles; + ret = atmel_smc_cs_conf_set_setup(smcconf, ATMEL_SMC_NRD_SHIFT, ncycles); + if (ret) + return ret; + /* * The read cycle timing is directly matching tRC, but is also * dependent on the setup and hold timings we calculated earlier, * which gives: * - * NRD_CYCLE = max(tRC, NRD_PULSE + NRD_HOLD) - * - * NRD_SETUP is always 0. + * NRD_CYCLE = max(tRC, NRD_SETUP + NRD_PULSE + NRD_HOLD) */ ncycles = DIV_ROUND_UP(conf->timings.sdr.tRC_min, mckperiodps); ncycles = max(totalcycles, ncycles); From 899fb38dd76dd3ede425bbaf8a96d390180a5d1c Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Thu, 28 Aug 2025 20:27:37 +0800 Subject: [PATCH 0577/3206] regulator: core: Remove redundant ternary operators For ternary operators in the form of "a ? true : false", if 'a' itself returns a boolean result, the ternary operator can be omitted. Remove redundant ternary operators to clean up the code. Signed-off-by: Liao Yuanhong Message-ID: <20250828122737.43488-1-liaoyuanhong@vivo.com> Signed-off-by: Mark Brown --- drivers/regulator/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 554d83c4af0c1c..dd7b10e768c06c 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1586,8 +1586,8 @@ static int set_machine_constraints(struct regulator_dev *rdev) } if (rdev->constraints->active_discharge && ops->set_active_discharge) { - bool ad_state = (rdev->constraints->active_discharge == - REGULATOR_ACTIVE_DISCHARGE_ENABLE) ? true : false; + bool ad_state = rdev->constraints->active_discharge == + REGULATOR_ACTIVE_DISCHARGE_ENABLE; ret = ops->set_active_discharge(rdev, ad_state); if (ret < 0) { From 85941afd2c404247e583c827fae0a45da1c1d92c Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 25 Aug 2025 09:53:27 +0200 Subject: [PATCH 0578/3206] s390/pai: Deny all events not handled by this PMU Each PAI PMU device driver returns -EINVAL when an event is out of its accepted range. This return value aborts the search for an alternative PMU device driver to handle this event. Change the return value to -ENOENT. This return value is used to try other PMUs instead. This makes the PMUs more robust when the sequence of PMU device driver initialization changes (at boot time) or by using modules. 
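For context, the convention this fix relies on, as a generic sketch (the FOO_*/foo_* names are hypothetical, not from the s390 code): perf core walks candidate PMUs in perf_init_event(), and only -ENOENT from ->event_init lets the search continue to the next PMU, while any other error fails event creation outright:

static int foo_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config;

	/* not an event this PMU implements: -ENOENT makes perf core
	 * try the remaining PMUs */
	if (cfg < FOO_BASE || cfg > FOO_BASE + FOO_NUM_COUNTERS)
		return -ENOENT;

	/* recognized, but the request itself is invalid: a hard error
	 * such as -EINVAL aborts the search and fails the syscall */
	if (event->attr.sample_period && cfg != FOO_BASE)
		return -EINVAL;

	return 0;
}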
Fixes: 39d62336f5c12 ("s390/pai: add support for cryptography counters") Acked-by: Sumanth Korikkar Signed-off-by: Thomas Richter Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_pai_crypto.c | 4 ++-- arch/s390/kernel/perf_pai_ext.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index f373a1009c456e..9455f213dc2021 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -285,10 +285,10 @@ static int paicrypt_event_init(struct perf_event *event) /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) return -ENOENT; - /* PAI crypto event must be in valid range */ + /* PAI crypto event must be in valid range, try others if not */ if (a->config < PAI_CRYPTO_BASE || a->config > PAI_CRYPTO_BASE + paicrypt_cnt) - return -EINVAL; + return -ENOENT; /* Allow only CRYPTO_ALL for sampling */ if (a->sample_period && a->config != PAI_CRYPTO_BASE) return -EINVAL; diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index d827473e7f87f8..7b32935273ced1 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -265,7 +265,7 @@ static int paiext_event_valid(struct perf_event *event) event->hw.config_base = offsetof(struct paiext_cb, acc); return 0; } - return -EINVAL; + return -ENOENT; } /* Might be called on different CPU than the one the event is intended for. */ From ce971233242b5391d99442271f3ca096fb49818d Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 22 Aug 2025 14:05:57 +0200 Subject: [PATCH 0579/3206] s390/cpum_cf: Deny all sampling events by counter PMU Deny all sampling event by the CPUMF counter facility device driver and return -ENOENT. This return value is used to try other PMUs. Up to now events for type PERF_TYPE_HARDWARE were not tested for sampling and returned later on -EOPNOTSUPP. This ends the search for alternative PMUs. Change that behavior and try other PMUs instead. Fixes: 613a41b0d16e ("s390/cpum_cf: Reject request for sampling in event initialization") Acked-by: Sumanth Korikkar Signed-off-by: Thomas Richter Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_cpum_cf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 4d09954ebf49e8..04457d88e5892c 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -760,8 +760,6 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) break; case PERF_TYPE_HARDWARE: - if (is_sampling_event(event)) /* No sampling support */ - return -ENOENT; ev = attr->config; if (!attr->exclude_user && attr->exclude_kernel) { /* @@ -859,6 +857,8 @@ static int cpumf_pmu_event_init(struct perf_event *event) unsigned int type = event->attr.type; int err = -ENOENT; + if (is_sampling_event(event)) /* No sampling support */ + return err; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW) err = __hw_perf_event_init(event, type); else if (event->pmu->type == type) From e2e29752357f32feb69a68e9e6e7361405b3f289 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:04 +0200 Subject: [PATCH 0580/3206] x86/sev: Separate MSR and GHCB based snp_cpuid() via a callback There are two distinct callers of snp_cpuid(): the MSR protocol and the GHCB page based interface. 
The snp_cpuid() logic does not care about the distinction, which only matters at a lower level. But the fact that it supports both interfaces means that the GHCB page based logic is pulled into the early startup code where PA to VA conversions are problematic, given that it runs from the 1:1 mapping of memory. So keep snp_cpuid() itself in the startup code, but factor out the hypervisor calls via a callback, so that the GHCB page handling can be moved out. Code refactoring only - no functional change intended. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250828102202.1849035-25-ardb+git@google.com --- arch/x86/boot/startup/sev-shared.c | 59 ++++++------------------------ arch/x86/coco/sev/vc-shared.c | 49 ++++++++++++++++++++++++- arch/x86/include/asm/sev.h | 3 +- 3 files changed, 61 insertions(+), 50 deletions(-) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index a34cd19796f9ac..ed88dfe7605e53 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -342,44 +342,7 @@ static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) return ret; } -static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - u32 cr4 = native_read_cr4(); - int ret; - - ghcb_set_rax(ghcb, leaf->fn); - ghcb_set_rcx(ghcb, leaf->subfn); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #UD - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - leaf->eax = ghcb->save.rax; - leaf->ebx = ghcb->save.rbx; - leaf->ecx = ghcb->save.rcx; - leaf->edx = ghcb->save.rdx; - - return ES_OK; -} - -static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - return ghcb ? 
__sev_cpuid_hv_ghcb(ghcb, ctxt, leaf) - : __sev_cpuid_hv_msr(leaf); -} /* * This may be called early while still running on the initial identity @@ -484,21 +447,21 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) return false; } -static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +static void snp_cpuid_hv_msr(void *ctx, struct cpuid_leaf *leaf) { - if (sev_cpuid_hv(ghcb, ctxt, leaf)) + if (__sev_cpuid_hv_msr(leaf)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } static int __head -snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - struct cpuid_leaf *leaf) +snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { struct cpuid_leaf leaf_hv = *leaf; switch (leaf->fn) { case 0x1: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* initial APIC ID */ leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); @@ -517,7 +480,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, break; case 0xB: leaf_hv.subfn = 0; - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* extended APIC ID */ leaf->edx = leaf_hv.edx; @@ -565,7 +528,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, } break; case 0x8000001E: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* extended APIC ID */ leaf->eax = leaf_hv.eax; @@ -586,8 +549,8 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value * should be treated as fatal by caller. */ -int __head -snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +int __head snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -621,7 +584,7 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) return 0; } - return snp_cpuid_postprocess(ghcb, ctxt, leaf); + return snp_cpuid_postprocess(cpuid_fn, ctx, leaf); } /* @@ -648,7 +611,7 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) leaf.fn = fn; leaf.subfn = subfn; - ret = snp_cpuid(NULL, NULL, &leaf); + ret = snp_cpuid(snp_cpuid_hv_msr, NULL, &leaf); if (!ret) goto cpuid_done; diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c index 2c0ab0fdc0603e..b4688f69102e26 100644 --- a/arch/x86/coco/sev/vc-shared.c +++ b/arch/x86/coco/sev/vc-shared.c @@ -409,15 +409,62 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + u32 cr4 = native_read_cr4(); + int ret; + + ghcb_set_rax(ghcb, leaf->fn); + ghcb_set_rcx(ghcb, leaf->subfn); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #UD - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + leaf->eax = ghcb->save.rax; + leaf->ebx = ghcb->save.rbx; + leaf->ecx = ghcb->save.rcx; + leaf->edx = ghcb->save.rdx; + + return ES_OK; +} + +struct cpuid_ctx { + struct ghcb *ghcb; + 
struct es_em_ctxt *ctxt; +}; + +static void snp_cpuid_hv_ghcb(void *p, struct cpuid_leaf *leaf) +{ + struct cpuid_ctx *ctx = p; + + if (__sev_cpuid_hv_ghcb(ctx->ghcb, ctx->ctxt, leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { + struct cpuid_ctx ctx = { ghcb, ctxt }; struct pt_regs *regs = ctxt->regs; struct cpuid_leaf leaf; int ret; leaf.fn = regs->ax; leaf.subfn = regs->cx; - ret = snp_cpuid(ghcb, ctxt, &leaf); + ret = snp_cpuid(snp_cpuid_hv_ghcb, &ctx, &leaf); if (!ret) { regs->ax = leaf.eax; regs->bx = leaf.ebx; diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 02236962fdb108..e4622e470ceb91 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -552,7 +552,8 @@ struct cpuid_leaf { u32 edx; }; -int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf); +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf); void __noreturn sev_es_terminate(unsigned int set, unsigned int reason); enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, From b3597eb51aad4a6e985c701c129bd7fc2cf0d682 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Aug 2025 16:58:43 +0200 Subject: [PATCH 0581/3206] s390/boot: Add common boot_panic() code Introduce a common boot_panic() helper macro, and use it to get rid of three more or less identical implementations. Signed-off-by: Heiko Carstens Reviewed-by: Alexander Gordeev Signed-off-by: Alexander Gordeev --- arch/s390/boot/boot.h | 8 ++++++++ arch/s390/boot/decompressor.c | 4 +--- arch/s390/boot/physmem_info.c | 4 +--- arch/s390/boot/startup.c | 13 +++---------- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index c0152db285f0b9..37d5b097ede5f5 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -10,6 +10,7 @@ #include #include +#include struct vmlinux_info { unsigned long entry; @@ -89,6 +90,13 @@ void __noreturn jump_to_kernel(psw_t *psw); #define boot_info(fmt, ...) boot_printk(KERN_INFO boot_fmt(fmt), ##__VA_ARGS__) #define boot_debug(fmt, ...) boot_printk(KERN_DEBUG boot_fmt(fmt), ##__VA_ARGS__) +#define boot_panic(...) do { \ + boot_emerg(__VA_ARGS__); \ + print_stacktrace(current_frame_address()); \ + boot_emerg(" -- System halted\n"); \ + disabled_wait(); \ +} while (0) + extern struct machine_info machine; extern int boot_console_loglevel; extern bool boot_ignore_loglevel; diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c index 03500b9d9fb9ad..8d1bc25a6bf4ed 100644 --- a/arch/s390/boot/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -68,9 +68,7 @@ static void decompress_error(char *m) { if (bootdebug) boot_rb_dump(); - boot_emerg("Decompression error: %s\n", m); - boot_emerg(" -- System halted\n"); - disabled_wait(); + boot_panic("Decompression error: %s\n", m); } unsigned long mem_safe_offset(void) diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c index 45e3d057cfaa31..1f2ca5435838e8 100644 --- a/arch/s390/boot/physmem_info.c +++ b/arch/s390/boot/physmem_info.c @@ -228,9 +228,7 @@ static void die_oom(unsigned long size, unsigned long align, unsigned long min, boot_emerg("Usable online memory total: %lu Reserved: %lu Free: %lu\n", total_mem, total_reserved_mem, total_mem > total_reserved_mem ? 
total_mem - total_reserved_mem : 0); - print_stacktrace(current_frame_address()); - boot_emerg(" -- System halted\n"); - disabled_wait(); + boot_panic("Oom\n"); } static void _physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 93684a7757161c..3fbd25b9498f35 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -44,13 +44,6 @@ u64 __bootdata_preserved(clock_comparator_max) = -1UL; u64 __bootdata_preserved(stfle_fac_list[16]); struct oldmem_data __bootdata_preserved(oldmem_data); -void error(char *x) -{ - boot_emerg("%s\n", x); - boot_emerg(" -- System halted\n"); - disabled_wait(); -} - static char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE); static void detect_machine_type(void) @@ -220,10 +213,10 @@ static void rescue_initrd(unsigned long min, unsigned long max) static void copy_bootdata(void) { if (__boot_data_end - __boot_data_start != vmlinux.bootdata_size) - error(".boot.data section size mismatch"); + boot_panic(".boot.data section size mismatch\n"); memcpy((void *)vmlinux.bootdata_off, __boot_data_start, vmlinux.bootdata_size); if (__boot_data_preserved_end - __boot_data_preserved_start != vmlinux.bootdata_preserved_size) - error(".boot.preserved.data section size mismatch"); + boot_panic(".boot.preserved.data section size mismatch\n"); memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); } @@ -237,7 +230,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, for (reloc = (int *)__vmlinux_relocs_64_start; reloc < (int *)__vmlinux_relocs_64_end; reloc++) { loc = (long)*reloc + phys_offset; if (loc < min_addr || loc > max_addr) - error("64-bit relocation outside of kernel!\n"); + boot_panic("64-bit relocation outside of kernel!\n"); *(u64 *)loc += offset; } } From 11aa54ba4cfa5390ea47c9a1fc62502abce1f6b9 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Wed, 13 Aug 2025 11:43:50 +0200 Subject: [PATCH 0582/3206] s390/pkey: Forward keygenflags to ep11_unwrapkey The pkey ioctl PKEY_CLR2SECK2 describes in the pkey.h header file the parameter 'keygenflags' which is forwarded to the handler functions which actually deal with the clear key to secure key operation. The ep11 handler module function ep11_clr2keyblob() function receives this parameter but does not forward it to the underlying function ep11_unwrapkey() on invocation. So in the end the user of this ioctl could not forward additional key generation flags to the ep11 implementation and thus was unable to modify the key generation process in any way. So now call ep11_unwrapkey() with the real keygenflags instead of 0 and thus the user of this ioctl can for example via keygenflags provide valid combinations of XCP_BLOB_* flags. 
Suggested-by: Ingo Franzki Signed-off-by: Harald Freudenberger Reviewed-by: Ingo Franzki Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/zcrypt_ep11misc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c index 3bf09a89a08940..e92e2fd8ce5da0 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.c +++ b/drivers/s390/crypto/zcrypt_ep11misc.c @@ -1405,7 +1405,9 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, /* Step 3: import the encrypted key value as a new key */ rc = ep11_unwrapkey(card, domain, kek, keklen, encbuf, encbuflen, 0, def_iv, - keybitsize, 0, keybuf, keybufsize, keytype, xflags); + keybitsize, keygenflags, + keybuf, keybufsize, + keytype, xflags); if (rc) { ZCRYPT_DBF_ERR("%s importing key value as new key failed, rc=%d\n", __func__, rc); From 30c2b98aa84c76f2ae60e66dd4ec2d9497713359 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 12:33:17 +0530 Subject: [PATCH 0583/3206] x86/apic: Add new driver for Secure AVIC The Secure AVIC feature provides SEV-SNP guests hardware acceleration for performance sensitive APIC accesses while securely managing the guest-owned APIC state through the use of a private APIC backing page. This helps prevent the hypervisor from generating unexpected interrupts for a vCPU or otherwise violate architectural assumptions around the APIC behavior. Add a new x2APIC driver that will serve as the base of the Secure AVIC support. It is initially the same as the x2APIC physical driver (without IPI callbacks), but will be modified as features are implemented. As the new driver does not implement Secure AVIC features yet, if the hypervisor sets the Secure AVIC bit in SEV_STATUS, maintain the existing behavior to enforce the guest termination. [ bp: Massage commit message. ] Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828070334.208401-2-Neeraj.Upadhyay@amd.com --- arch/x86/Kconfig | 13 ++++++ arch/x86/boot/compressed/sev.c | 1 + arch/x86/coco/core.c | 3 ++ arch/x86/coco/sev/core.c | 1 + arch/x86/include/asm/msr-index.h | 4 +- arch/x86/kernel/apic/Makefile | 1 + arch/x86/kernel/apic/x2apic_savic.c | 63 +++++++++++++++++++++++++++++ include/linux/cc_platform.h | 8 ++++ 8 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/apic/x2apic_savic.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100eb..e32952728d9aa2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -483,6 +483,19 @@ config X86_X2APIC If in doubt, say Y. +config AMD_SECURE_AVIC + bool "AMD Secure AVIC" + depends on AMD_MEM_ENCRYPT && X86_X2APIC + help + Enable this to get AMD Secure AVIC support on guests that have this feature. + + AMD Secure AVIC provides hardware acceleration for performance sensitive + APIC accesses and support for managing guest owned APIC state for SEV-SNP + guests. Secure AVIC does not support xAPIC mode. It has functional + dependency on x2apic being enabled in the guest. + + If you don't know what to do here, say N. 
+ config X86_POSTED_MSI bool "Enable MSI and MSI-x delivery by posted interrupts" depends on X86_64 && IRQ_REMAP diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index fd1b67dfea228f..74e083feb2d952 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -235,6 +235,7 @@ bool sev_es_check_ghcb_fault(unsigned long address) MSR_AMD64_SNP_VMSA_REG_PROT | \ MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ + MSR_AMD64_SNP_SECURE_AVIC | \ MSR_AMD64_SNP_RESERVED_MASK) /* diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index d4610af6811434..989ca9f72ba30b 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -104,6 +104,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_HOST_SEV_SNP: return cc_flags.host_sev_snp; + case CC_ATTR_SNP_SECURE_AVIC: + return sev_status & MSR_AMD64_SNP_SECURE_AVIC; + default: return false; } diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 14ef5908fb2735..f7a549f650e9c4 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -79,6 +79,7 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", + [MSR_AMD64_SNP_SECURE_AVIC_BIT] = "SecureAVIC", }; /* diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa1410..2a6d4fd8659a22 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -699,7 +699,9 @@ #define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) #define MSR_AMD64_SNP_SMT_PROT_BIT 17 #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) -#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) +#define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52d1808ee360b0..581db89477f923 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,6 +18,7 @@ ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c new file mode 100644 index 00000000000000..bea844f28192c1 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure AVIC Support (SEV-SNP Guests) + * + * Copyright (C) 2024 Advanced Micro Devices, Inc. 
+ * + * Author: Neeraj Upadhyay + */ + +#include + +#include +#include + +#include "local.h" + +static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); +} + +static int savic_probe(void) +{ + if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return 0; + + if (!x2apic_mode) { + pr_err("Secure AVIC enabled in non x2APIC mode\n"); + snp_abort(); + /* unreachable */ + } + + return 1; +} + +static struct apic apic_x2apic_savic __ro_after_init = { + + .name = "secure avic x2apic", + .probe = savic_probe, + .acpi_madt_oem_check = savic_acpi_madt_oem_check, + + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, + .get_apic_id = x2apic_get_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .nmi_to_offline_cpu = true, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi = native_apic_msr_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, +}; + +apic_driver(apic_x2apic_savic); diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 0bf7d33a1048cf..7fcec025c5e01d 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -96,6 +96,14 @@ enum cc_attr { * enabled to run SEV-SNP guests. */ CC_ATTR_HOST_SEV_SNP, + + /** + * @CC_ATTR_SNP_SECURE_AVIC: Secure AVIC mode is active. + * + * The host kernel is running with the necessary features enabled + * to run SEV-SNP guests with full Secure AVIC capabilities. + */ + CC_ATTR_SNP_SECURE_AVIC, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM From 98857d111c53954aa038fcbc4cf48873e4240f7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20B=2E=20Marli=C3=A8re?= Date: Thu, 28 Aug 2025 10:12:33 -0300 Subject: [PATCH 0584/3206] selftests/bpf: Fix bpf_prog_detach2 usage in test_lirc_mode2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e9fc3ce99b34 ("libbpf: Streamline error reporting for high-level APIs") redefined the way that bpf_prog_detach2() returns. Therefore, adapt the usage in test_lirc_mode2_user.c. Signed-off-by: Ricardo B. Marlière Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250828-selftests-bpf-v1-1-c7811cd8b98c@suse.com --- tools/testing/selftests/bpf/test_lirc_mode2_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c index 4694422aa76c36..88e4aeab21b7bc 100644 --- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c +++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c @@ -74,7 +74,7 @@ int main(int argc, char **argv) /* Let's try detach it before it was ever attached */ ret = bpf_prog_detach2(progfd, lircfd, BPF_LIRC_MODE2); - if (ret != -1 || errno != ENOENT) { + if (ret != -ENOENT) { printf("bpf_prog_detach2 not attached should fail: %m\n"); return 1; } From d8c2a9edd181f0cc4a66eec954b3d8f6a1d954a7 Mon Sep 17 00:00:00 2001 From: Da Xue Date: Thu, 21 Aug 2025 19:33:34 -0400 Subject: [PATCH 0585/3206] pinctrl: meson-gxl: add missing i2c_d pinmux Amlogic GXL has 4 I2C attached to gpio-periphs. I2C_D is on GPIOX_10/11. Add the relevant func 3 pinmux per the datasheet for S805X/S905X/S905D. 
Fixes: 0f15f500ff2c ("pinctrl: meson: Add GXL pinctrl definitions") Signed-off-by: Da Xue Link: https://lore.kernel.org/20250821233335.1707559-1-da@libre.computer Signed-off-by: Linus Walleij --- drivers/pinctrl/meson/pinctrl-meson-gxl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/pinctrl/meson/pinctrl-meson-gxl.c b/drivers/pinctrl/meson/pinctrl-meson-gxl.c index 9171de657f9780..a75762e4d26418 100644 --- a/drivers/pinctrl/meson/pinctrl-meson-gxl.c +++ b/drivers/pinctrl/meson/pinctrl-meson-gxl.c @@ -187,6 +187,9 @@ static const unsigned int i2c_sda_c_pins[] = { GPIODV_28 }; static const unsigned int i2c_sck_c_dv19_pins[] = { GPIODV_19 }; static const unsigned int i2c_sda_c_dv18_pins[] = { GPIODV_18 }; +static const unsigned int i2c_sck_d_pins[] = { GPIOX_11 }; +static const unsigned int i2c_sda_d_pins[] = { GPIOX_10 }; + static const unsigned int eth_mdio_pins[] = { GPIOZ_0 }; static const unsigned int eth_mdc_pins[] = { GPIOZ_1 }; static const unsigned int eth_clk_rx_clk_pins[] = { GPIOZ_2 }; @@ -411,6 +414,8 @@ static const struct meson_pmx_group meson_gxl_periphs_groups[] = { GPIO_GROUP(GPIO_TEST_N), /* Bank X */ + GROUP(i2c_sda_d, 5, 5), + GROUP(i2c_sck_d, 5, 4), GROUP(sdio_d0, 5, 31), GROUP(sdio_d1, 5, 30), GROUP(sdio_d2, 5, 29), @@ -651,6 +656,10 @@ static const char * const i2c_c_groups[] = { "i2c_sck_c", "i2c_sda_c", "i2c_sda_c_dv18", "i2c_sck_c_dv19", }; +static const char * const i2c_d_groups[] = { + "i2c_sck_d", "i2c_sda_d", +}; + static const char * const eth_groups[] = { "eth_mdio", "eth_mdc", "eth_clk_rx_clk", "eth_rx_dv", "eth_rxd0", "eth_rxd1", "eth_rxd2", "eth_rxd3", @@ -777,6 +786,7 @@ static const struct meson_pmx_func meson_gxl_periphs_functions[] = { FUNCTION(i2c_a), FUNCTION(i2c_b), FUNCTION(i2c_c), + FUNCTION(i2c_d), FUNCTION(eth), FUNCTION(pwm_a), FUNCTION(pwm_b), From bb585591ebf00fb1f6a1fdd1ea96b5848bd9112d Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 27 Aug 2025 21:43:09 +0200 Subject: [PATCH 0586/3206] fhandle: use more consistent rules for decoding file handle from userns Commit 620c266f39493 ("fhandle: relax open_by_handle_at() permission checks") relaxed the conditions for decoding a file handle from a non-init userns. The conditions are that the decoded dentry is accessible from the user-provided mountfd (or from the fs root) and that all the ancestors along the path have a valid id mapping in the userns. These conditions are intentionally stricter than the condition that the decoded dentry should be "lookable" by path from the mountfd. For example, the path /home/amir/dir/subdir is lookable by path from the unprivileged userns of user amir, because /home has 755 permissions, but the owner of /home does not have a valid id mapping in the unprivileged userns of user amir. The current code did not check that the decoded dentry itself has a valid id mapping in the userns. There is no security risk in that, because the final open still performs the needed permission checks, but it is inconsistent with the checks performed on the ancestors, so the behavior can be a bit confusing. Add the check for the decoded dentry itself, so that the entire path, including the last component, has a valid id mapping in the userns.
Fixes: 620c266f39493 ("fhandle: relax open_by_handle_at() permission checks")
Signed-off-by: Amir Goldstein
Link: https://lore.kernel.org/20250827194309.1259650-1-amir73il@gmail.com
Signed-off-by: Christian Brauner
---
 fs/fhandle.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/fhandle.c b/fs/fhandle.c
index 68a7d2861c58fe..a907ddfac4d51c 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -207,6 +207,14 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
 	if (!ctx->flags)
 		return 1;
 
+	/*
+	 * Verify that the decoded dentry itself has a valid id mapping.
+	 * In case the decoded dentry is the mountfd root itself, this
+	 * verifies that the mountfd inode itself has a valid id mapping.
+	 */
+	if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry)))
+		return 0;
+
 	/*
 	 * It's racy as we're not taking rename_lock but we're able to ignore
 	 * permissions and we just need an approximation whether we were able

From 38d1227fa71d96b470172df50e241775a802a8e7 Mon Sep 17 00:00:00 2001
From: Xichao Zhao
Date: Fri, 29 Aug 2025 17:15:10 +0800
Subject: [PATCH 0587/3206] fs: Replace offsetof() with struct_size() in ioctl_file_dedupe_range()

When dealing with structures containing flexible arrays, struct_size()
provides additional compile-time checks compared to offsetof(). This
enhances code robustness and reduces the risk of potential errors.

Signed-off-by: Xichao Zhao
Link: https://lore.kernel.org/20250829091510.597858-1-zhao.xichao@vivo.com
Reviewed-by: Jan Kara
Signed-off-by: Christian Brauner
---
 fs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0248cb8db2d363..83d07218b6cd08 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -426,7 +426,7 @@ static int ioctl_file_dedupe_range(struct file *file,
 		goto out;
 	}
 
-	size = offsetof(struct file_dedupe_range, info[count]);
+	size = struct_size(same, info, count);
 	if (size > PAGE_SIZE) {
 		ret = -ENOMEM;
 		goto out;

From 2e62688d583809e832433f461194334408b10817 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 27 Aug 2025 15:00:07 +0200
Subject: [PATCH 0588/3206] selftests/futex: Remove the -g parameter from futex_priv_hash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The -g parameter was meant to test the immutable global hash instead
of the private hash, which has been made immutable. The global hash is
tested as part of the regular test, at the end. The immutable private
hash has been removed.

Remove the last traces of the immutable private hash.
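For context, a minimal sketch of the prctl() interface this test
exercises; the fallback constant values below are assumptions for
illustration only (a real test should rely on the uapi headers):

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_FUTEX_HASH
	#define PR_FUTEX_HASH           78
	#define PR_FUTEX_HASH_SET_SLOTS 1
	#define PR_FUTEX_HASH_GET_SLOTS 2
	#endif

	int main(void)
	{
		/*
		 * Request a private futex hash with 4 slots; with the
		 * IMMUTABLE variant gone, the flags argument must be 0.
		 */
		if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 4, 0))
			perror("PR_FUTEX_HASH_SET_SLOTS");

		printf("slots: %ld\n",
		       (long)prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0));
		return 0;
	}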
Fixes: 16adc7f136dc1 ("selftests/futex: Remove support for IMMUTABLE") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov (AMD) Reviewed-by: André Almeida Link: https://lore.kernel.org/20250827130011.677600-2-bigeasy@linutronix.de --- tools/testing/selftests/futex/functional/futex_priv_hash.c | 1 - tools/testing/selftests/futex/functional/run.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c index aea001ac494604..ec032faca6a91f 100644 --- a/tools/testing/selftests/futex/functional/futex_priv_hash.c +++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c @@ -132,7 +132,6 @@ static void usage(char *prog) { printf("Usage: %s\n", prog); printf(" -c Use color\n"); - printf(" -g Test global hash instead intead local immutable \n"); printf(" -h Display this help message\n"); printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", VQUIET, VCRITICAL, VINFO); diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 81739849f2994d..5470088dc4dfb6 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -85,7 +85,6 @@ echo echo ./futex_priv_hash $COLOR -./futex_priv_hash -g $COLOR echo ./futex_numa_mpol $COLOR From 9a98f9e84cfbeaa51af42ba2b8bbbde046c709a7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Aug 2025 11:39:01 -0400 Subject: [PATCH 0589/3206] fs: make the i_state flags an enum Adjusting i_state flags always means updating the values manually. Bring these forward into the 2020's and make a nice clean macro for defining the i_state values as an enum, providing __ variants for the cases where we need the bit position instead of the actual value, and leaving the actual NAME as the 1U << bit value. Reviewed-by: Christian Brauner Signed-off-by: Josef Bacik Link: https://lore.kernel.org/0da9348da6ece0dce12fccec07b1dd2b8e4cfdab.1756222464.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 231 +++++++++++++++++++++++---------------------- 1 file changed, 119 insertions(+), 112 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 12ecc6b0e6f96b..c34554d8c4feb9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -664,6 +664,124 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 +/* + * Inode state bits. Protected by inode->i_lock + * + * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, + * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. + * + * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, + * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at + * various stages of removing an inode. + * + * Two bits are used for locking and completion notification, I_NEW and I_SYNC. + * + * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on + * fdatasync() (unless I_DIRTY_DATASYNC is also set). + * Timestamp updates are the usual cause. + * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of + * these changes separately from I_DIRTY_SYNC so that we + * don't have to write inode on fdatasync() when only + * e.g. the timestamps have changed. + * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. + * I_DIRTY_TIME The inode itself has dirty timestamps, and the + * lazytime mount option is enabled. 
We keep track of this + * separately from I_DIRTY_SYNC in order to implement + * lazytime. This gets cleared if I_DIRTY_INODE + * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But + * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already + * in place because writeback might already be in progress + * and we don't want to lose the time update + * I_NEW Serves as both a mutex and completion notification. + * New inodes set I_NEW. If two processes both create + * the same inode, one of them will release its inode and + * wait for I_NEW to be released before returning. + * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can + * also cause waiting on I_NEW, without I_NEW actually + * being set. find_inode() uses this to prevent returning + * nearly-dead inodes. + * I_WILL_FREE Must be set when calling write_inode_now() if i_count + * is zero. I_FREEING must be set when I_WILL_FREE is + * cleared. + * I_FREEING Set when inode is about to be freed but still has dirty + * pages or buffers attached or the inode itself is still + * dirty. + * I_CLEAR Added by clear_inode(). In this state the inode is + * clean and can be destroyed. Inode keeps I_FREEING. + * + * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are + * prohibited for many purposes. iget() must wait for + * the inode to be completely released, then create it + * anew. Other functions will just ignore such inodes, + * if appropriate. I_NEW is used for waiting. + * + * I_SYNC Writeback of inode is running. The bit is set during + * data writeback, and cleared with a wakeup on the bit + * address once it is done. The bit is also used to pin + * the inode in memory for flusher thread. + * + * I_REFERENCED Marks the inode as recently references on the LRU list. + * + * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to + * synchronize competing switching instances and to tell + * wb stat updates to grab the i_pages lock. See + * inode_switch_wbs_work_fn() for details. + * + * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper + * and work dirs among overlayfs mounts. + * + * I_CREATING New object's inode in the middle of setting up. + * + * I_DONTCACHE Evict inode as soon as it is not used anymore. + * + * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. + * Used to detect that mark_inode_dirty() should not move + * inode between dirty lists. + * + * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. + * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * + * Q: What is the difference between I_WILL_FREE and I_FREEING? + * + * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait + * upon. There's one free address left. 
+ */ + +enum inode_state_bits { + __I_NEW = 0U, + __I_SYNC = 1U, + __I_LRU_ISOLATING = 2U + /* reserved wait address bit 3 */ +}; + +enum inode_state_flags_t { + I_NEW = (1U << __I_NEW), + I_SYNC = (1U << __I_SYNC), + I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), + /* reserved flag bit 3 */ + I_DIRTY_SYNC = (1U << 4), + I_DIRTY_DATASYNC = (1U << 5), + I_DIRTY_PAGES = (1U << 6), + I_WILL_FREE = (1U << 7), + I_FREEING = (1U << 8), + I_CLEAR = (1U << 9), + I_REFERENCED = (1U << 10), + I_LINKABLE = (1U << 11), + I_DIRTY_TIME = (1U << 12), + I_WB_SWITCH = (1U << 13), + I_OVL_INUSE = (1U << 14), + I_CREATING = (1U << 15), + I_DONTCACHE = (1U << 16), + I_SYNC_QUEUED = (1U << 17), + I_PINNING_NETFS_WB = (1U << 18) +}; + +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) +#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) + /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -722,7 +840,7 @@ struct inode { #endif /* Misc */ - u32 i_state; + enum inode_state_flags_t i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -2482,117 +2600,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, }; } -/* - * Inode state bits. Protected by inode->i_lock - * - * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, - * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. - * - * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, - * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at - * various stages of removing an inode. - * - * Two bits are used for locking and completion notification, I_NEW and I_SYNC. - * - * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on - * fdatasync() (unless I_DIRTY_DATASYNC is also set). - * Timestamp updates are the usual cause. - * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of - * these changes separately from I_DIRTY_SYNC so that we - * don't have to write inode on fdatasync() when only - * e.g. the timestamps have changed. - * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. - * I_DIRTY_TIME The inode itself has dirty timestamps, and the - * lazytime mount option is enabled. We keep track of this - * separately from I_DIRTY_SYNC in order to implement - * lazytime. This gets cleared if I_DIRTY_INODE - * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But - * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already - * in place because writeback might already be in progress - * and we don't want to lose the time update - * I_NEW Serves as both a mutex and completion notification. - * New inodes set I_NEW. If two processes both create - * the same inode, one of them will release its inode and - * wait for I_NEW to be released before returning. - * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can - * also cause waiting on I_NEW, without I_NEW actually - * being set. find_inode() uses this to prevent returning - * nearly-dead inodes. - * I_WILL_FREE Must be set when calling write_inode_now() if i_count - * is zero. I_FREEING must be set when I_WILL_FREE is - * cleared. - * I_FREEING Set when inode is about to be freed but still has dirty - * pages or buffers attached or the inode itself is still - * dirty. - * I_CLEAR Added by clear_inode(). In this state the inode is - * clean and can be destroyed. Inode keeps I_FREEING. - * - * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are - * prohibited for many purposes. 
iget() must wait for - * the inode to be completely released, then create it - * anew. Other functions will just ignore such inodes, - * if appropriate. I_NEW is used for waiting. - * - * I_SYNC Writeback of inode is running. The bit is set during - * data writeback, and cleared with a wakeup on the bit - * address once it is done. The bit is also used to pin - * the inode in memory for flusher thread. - * - * I_REFERENCED Marks the inode as recently references on the LRU list. - * - * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to - * synchronize competing switching instances and to tell - * wb stat updates to grab the i_pages lock. See - * inode_switch_wbs_work_fn() for details. - * - * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper - * and work dirs among overlayfs mounts. - * - * I_CREATING New object's inode in the middle of setting up. - * - * I_DONTCACHE Evict inode as soon as it is not used anymore. - * - * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. - * Used to detect that mark_inode_dirty() should not move - * inode between dirty lists. - * - * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. - * - * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding - * i_count. - * - * Q: What is the difference between I_WILL_FREE and I_FREEING? - * - * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait - * upon. There's one free address left. - */ -#define __I_NEW 0 -#define I_NEW (1 << __I_NEW) -#define __I_SYNC 1 -#define I_SYNC (1 << __I_SYNC) -#define __I_LRU_ISOLATING 2 -#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) - -#define I_DIRTY_SYNC (1 << 3) -#define I_DIRTY_DATASYNC (1 << 4) -#define I_DIRTY_PAGES (1 << 5) -#define I_WILL_FREE (1 << 6) -#define I_FREEING (1 << 7) -#define I_CLEAR (1 << 8) -#define I_REFERENCED (1 << 9) -#define I_LINKABLE (1 << 10) -#define I_DIRTY_TIME (1 << 11) -#define I_WB_SWITCH (1 << 12) -#define I_OVL_INUSE (1 << 13) -#define I_CREATING (1 << 14) -#define I_DONTCACHE (1 << 15) -#define I_SYNC_QUEUED (1 << 16) -#define I_PINNING_NETFS_WB (1 << 17) - -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) -#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) - extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { From db2ab24a341ce89351a1bede37a96a3e3ce1726a Mon Sep 17 00:00:00 2001 From: Lauri Vasama Date: Wed, 27 Aug 2025 16:39:00 +0300 Subject: [PATCH 0590/3206] Add RWF_NOSIGNAL flag for pwritev2 For a user mode library to avoid generating SIGPIPE signals (e.g. because this behaviour is not portable across operating systems) is cumbersome. It is generally bad form to change the process-wide signal mask in a library, so a local solution is needed instead. For I/O performed directly using system calls (synchronous or readiness based asynchronous) this currently involves applying a thread-specific signal mask before the operation and reverting it afterwards. This can be avoided when it is known that the file descriptor refers to neither a pipe nor a socket, but a conservative implementation must always apply the mask. This incurs the cost of two additional system calls. In the case of sockets, the existing MSG_NOSIGNAL flag can be used with send. For asynchronous I/O performed using io_uring, currently the only option (apart from MSG_NOSIGNAL for sockets), is to mask SIGPIPE entirely in the call to io_uring_enter. 
Thankfully io_uring_enter takes a signal mask, so only a single syscall is needed. However, copying the signal mask on every call incurs a non-zero performance penalty. Furthermore, this mask applies to all completions, meaning that if the non-signaling behaviour is desired only for some subset of operations, the desired signals must be raised manually from user-mode depending on the completed operation. Add RWF_NOSIGNAL flag for pwritev2. This flag prevents the SIGPIPE signal from being raised when writing on disconnected pipes or sockets. The flag is handled directly by the pipe filesystem and converted to the existing MSG_NOSIGNAL flag for sockets. Signed-off-by: Lauri Vasama Link: https://lore.kernel.org/20250827133901.1820771-1-git@vasama.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/pipe.c | 6 ++++-- include/linux/fs.h | 1 + include/uapi/linux/fs.h | 5 ++++- net/socket.c | 3 +++ 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 731622d0738d41..42fead1efe5204 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&pipe->mutex); if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } @@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; diff --git a/include/linux/fs.h b/include/linux/fs.h index 780e9c774c5496..34693cae15a298 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,6 +356,7 @@ struct readahead_control; #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE +#define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0bd678a4a10ef8..beb4c2d1e41cb1 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t; /* buffered IO that drops the cache after reading or writing data */ #define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) +/* prevent pipe and socket writes from raising SIGPIPE */ +#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ - RWF_DONTCACHE) + RWF_DONTCACHE | RWF_NOSIGNAL) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/net/socket.c b/net/socket.c index 682969deaed35d..bac335ecee4c54 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1176,6 +1176,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; + if (iocb->ki_flags & IOCB_NOSIGNAL) + msg.msg_flags |= MSG_NOSIGNAL; + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; From df220cc5e689213c34a0eec7ef26d25f503c77ae Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Aug 2025 08:25:11 -0700 Subject: [PATCH 0591/3206] lib/crypto: poly1305: Remove unused function poly1305_is_arch_optimized() poly1305_is_arch_optimized() is unused, so remove it. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250829152513.92459-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/poly1305.h | 9 --------- lib/crypto/arm/poly1305-glue.c | 7 ------- lib/crypto/arm64/poly1305-glue.c | 7 ------- lib/crypto/mips/poly1305-glue.c | 6 ------ lib/crypto/powerpc/poly1305-p10-glue.c | 6 ------ lib/crypto/x86/poly1305_glue.c | 6 ------ 6 files changed, 41 deletions(-) diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index e54abda8cfe95e..d4daeec8da19d8 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -64,13 +64,4 @@ void poly1305_update(struct poly1305_desc_ctx *desc, const u8 *src, unsigned int nbytes); void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest); -#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305) -bool poly1305_is_arch_optimized(void); -#else -static inline bool poly1305_is_arch_optimized(void) -{ - return false; -} -#endif - #endif diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c index 2d86c78af8837b..9e513e319e37c9 100644 --- a/lib/crypto/arm/poly1305-glue.c +++ b/lib/crypto/arm/poly1305-glue.c @@ -51,13 +51,6 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, } EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -bool poly1305_is_arch_optimized(void) -{ - /* We always can use at least the ARM scalar implementation. */ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - static int __init arm_poly1305_mod_init(void) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c index 31aea21ce42f79..d4a522e7d25a96 100644 --- a/lib/crypto/arm64/poly1305-glue.c +++ b/lib/crypto/arm64/poly1305-glue.c @@ -50,13 +50,6 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, } EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -bool poly1305_is_arch_optimized(void) -{ - /* We always can use at least the ARM64 scalar implementation. 
*/
-	return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
 static int __init neon_poly1305_mod_init(void)
 {
 	if (cpu_have_named_feature(ASIMD))
diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c
index 764a38a652002a..002f50f710aba9 100644
--- a/lib/crypto/mips/poly1305-glue.c
+++ b/lib/crypto/mips/poly1305-glue.c
@@ -23,11 +23,5 @@ asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
 				   const u32 nonce[4]);
 EXPORT_SYMBOL_GPL(poly1305_emit_arch);
 
-bool poly1305_is_arch_optimized(void)
-{
-	return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
 MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated");
 MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/powerpc/poly1305-p10-glue.c b/lib/crypto/powerpc/poly1305-p10-glue.c
index 3f1664a724b655..184a71f9c1dee0 100644
--- a/lib/crypto/powerpc/poly1305-p10-glue.c
+++ b/lib/crypto/powerpc/poly1305-p10-glue.c
@@ -72,12 +72,6 @@ void poly1305_emit_arch(const struct poly1305_state *state,
 }
 EXPORT_SYMBOL_GPL(poly1305_emit_arch);
 
-bool poly1305_is_arch_optimized(void)
-{
-	return static_key_enabled(&have_p10);
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
 static int __init poly1305_p10_init(void)
 {
 	if (cpu_has_feature(CPU_FTR_ARCH_31))
diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305_glue.c
index 856d48fd422b02..deb5841cb0ada6 100644
--- a/lib/crypto/x86/poly1305_glue.c
+++ b/lib/crypto/x86/poly1305_glue.c
@@ -141,12 +141,6 @@ void poly1305_emit_arch(const struct poly1305_state *ctx,
 }
 EXPORT_SYMBOL_GPL(poly1305_emit_arch);
 
-bool poly1305_is_arch_optimized(void)
-{
-	return static_key_enabled(&poly1305_use_avx);
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
 static int __init poly1305_simd_mod_init(void)
 {
 	if (boot_cpu_has(X86_FEATURE_AVX) &&

From b646b782e522da3509e61f971e5502fccb3a3723 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Fri, 29 Aug 2025 08:25:12 -0700
Subject: [PATCH 0592/3206] lib/crypto: poly1305: Consolidate into single module

Consolidate the Poly1305 code into a single module, similar to various
other algorithms (SHA-1, SHA-256, SHA-512, etc.):

- Each arch now provides a header file lib/crypto/$(SRCARCH)/poly1305.h,
  replacing lib/crypto/$(SRCARCH)/poly1305*.c. The header defines
  poly1305_block_init(), poly1305_blocks(), poly1305_emit(), and
  optionally poly1305_mod_init_arch(). It is included by
  lib/crypto/poly1305.c, and thus the code gets built into the single
  libpoly1305 module, with improved inlining in some cases.

- Whether arch-optimized Poly1305 is buildable is now controlled
  centrally by lib/crypto/Kconfig instead of by
  lib/crypto/$(SRCARCH)/Kconfig. The conditions for enabling it remain
  the same as before, and it remains enabled by default. (The PPC64 one
  remains unconditionally disabled due to 'depends on BROKEN'.)

- Any additional arch-specific translation units for the optimized
  Poly1305 code, such as assembly files, are now compiled by
  lib/crypto/Makefile instead of lib/crypto/$(SRCARCH)/Makefile.

A special consideration is needed because the Adiantum code uses the
poly1305_core_*() functions directly. For now, just carry forward that
approach. This means retaining the CRYPTO_LIB_POLY1305_GENERIC kconfig
symbol, and keeping the poly1305_core_*() functions in separate
translation units. So it's not quite as streamlined as what I've done
with the other hash functions, but we still get a single libpoly1305
module.

Note: to see the diff from the arm, arm64, and x86 .c files to the new
.h files, view this commit with 'git show -M10'.
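The resulting compile-time dispatch is worth seeing in skeleton form;
this is a condensed sketch of what the diff below does in
lib/crypto/poly1305.c, not additional code:

	/*
	 * Each lib/crypto/<arch>/poly1305.h supplies static
	 * poly1305_block_init()/poly1305_blocks()/poly1305_emit()
	 * implementations that the core file can inline:
	 */
	#ifdef CONFIG_CRYPTO_LIB_POLY1305_ARCH
	#include "poly1305.h"	/* resolves to $(SRCARCH)/poly1305.h */
	#else
	#define poly1305_block_init	poly1305_block_init_generic
	#define poly1305_blocks		poly1305_blocks_generic
	#define poly1305_emit		poly1305_emit_generic
	#endif

Either way the rest of poly1305.c calls plain poly1305_blocks() and
friends, and the arch/generic choice is made entirely at build time.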
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250829152513.92459-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 2 + include/crypto/internal/poly1305.h | 16 ++-- lib/crypto/Kconfig | 50 ++++++------ lib/crypto/Makefile | 59 ++++++++++++-- lib/crypto/arm/Kconfig | 5 -- lib/crypto/arm/Makefile | 18 ----- lib/crypto/arm/poly1305-armv4.pl | 3 +- lib/crypto/arm/poly1305-glue.c | 69 ---------------- lib/crypto/arm/poly1305.h | 53 ++++++++++++ lib/crypto/arm64/Kconfig | 6 -- lib/crypto/arm64/Makefile | 13 --- lib/crypto/arm64/poly1305-armv8.pl | 3 + lib/crypto/arm64/poly1305-glue.c | 67 --------------- lib/crypto/arm64/poly1305.h | 50 ++++++++++++ lib/crypto/mips/Kconfig | 5 -- lib/crypto/mips/Makefile | 14 ---- lib/crypto/mips/poly1305-glue.c | 27 ------- lib/crypto/mips/poly1305-mips.pl | 8 +- lib/crypto/mips/poly1305.h | 14 ++++ lib/crypto/poly1305-generic.c | 25 ------ lib/crypto/poly1305.c | 81 ++++++++++++------- lib/crypto/powerpc/Kconfig | 8 -- lib/crypto/powerpc/Makefile | 3 - .../{poly1305-p10-glue.c => poly1305.h} | 34 +++----- lib/crypto/x86/Kconfig | 6 -- lib/crypto/x86/Makefile | 10 --- lib/crypto/x86/poly1305-x86_64-cryptogams.pl | 33 +++----- .../x86/{poly1305_glue.c => poly1305.h} | 41 ++++------ 28 files changed, 299 insertions(+), 424 deletions(-) delete mode 100644 lib/crypto/arm/poly1305-glue.c create mode 100644 lib/crypto/arm/poly1305.h delete mode 100644 lib/crypto/arm64/poly1305-glue.c create mode 100644 lib/crypto/arm64/poly1305.h delete mode 100644 lib/crypto/mips/poly1305-glue.c create mode 100644 lib/crypto/mips/poly1305.h delete mode 100644 lib/crypto/poly1305-generic.c rename lib/crypto/powerpc/{poly1305-p10-glue.c => poly1305.h} (66%) rename lib/crypto/x86/{poly1305_glue.c => poly1305.h} (85%) diff --git a/crypto/Kconfig b/crypto/Kconfig index 1575dbec084d63..e8ccf5f51b8553 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -609,6 +609,7 @@ menu "Length-preserving ciphers and modes" config CRYPTO_ADIANTUM tristate "Adiantum" select CRYPTO_CHACHA20 + select CRYPTO_LIB_POLY1305 select CRYPTO_LIB_POLY1305_GENERIC select CRYPTO_NHPOLY1305 select CRYPTO_MANAGER @@ -770,6 +771,7 @@ config CRYPTO_XTS config CRYPTO_NHPOLY1305 tristate select CRYPTO_HASH + select CRYPTO_LIB_POLY1305 select CRYPTO_LIB_POLY1305_GENERIC endmenu diff --git a/include/crypto/internal/poly1305.h b/include/crypto/internal/poly1305.h index c60315f475623f..a72fff409ab852 100644 --- a/include/crypto/internal/poly1305.h +++ b/include/crypto/internal/poly1305.h @@ -30,12 +30,13 @@ void poly1305_core_blocks(struct poly1305_state *state, void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4], void *dst); -void poly1305_block_init_arch(struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -void poly1305_block_init_generic(struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit); +static inline void +poly1305_block_init_generic(struct poly1305_block_state *desc, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + poly1305_core_init(&desc->h); + poly1305_core_setkey(&desc->core_r, raw_key); +} static inline void poly1305_blocks_generic(struct poly1305_block_state *state, const u8 *src, unsigned int len, @@ -45,9 +46,6 @@ static inline void poly1305_blocks_generic(struct poly1305_block_state *state, len / POLY1305_BLOCK_SIZE, padbit); } -void poly1305_emit_arch(const struct poly1305_state *state, - 
u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4]); - static inline void poly1305_emit_generic(const struct poly1305_state *state, u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4]) diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 79b848448e07f1..9991118c41a9d7 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -114,6 +114,33 @@ config CRYPTO_LIB_MD5_ARCH default y if PPC default y if SPARC64 +config CRYPTO_LIB_POLY1305 + tristate + help + The Poly1305 library functions. Select this if your module uses any + of the functions from . + +config CRYPTO_LIB_POLY1305_ARCH + bool + depends on CRYPTO_LIB_POLY1305 && !UML + default y if ARM + default y if ARM64 && KERNEL_MODE_NEON + default y if MIPS + # The PPC64 code needs to be fixed to work in softirq context. + default y if PPC64 && CPU_LITTLE_ENDIAN && VSX && BROKEN + default y if X86_64 + +# This symbol controls the inclusion of the Poly1305 generic code. This differs +# from most of the other algorithms, which handle the generic code +# "automatically" via __maybe_unused. This is needed so that the Adiantum code, +# which calls the poly1305_core_*() functions directly, can enable them. +config CRYPTO_LIB_POLY1305_GENERIC + bool + depends on CRYPTO_LIB_POLY1305 + # Enable if there's no arch impl or the arch impl requires the generic + # impl as a fallback. (Or if selected explicitly.) + default y if !CRYPTO_LIB_POLY1305_ARCH || PPC64 + config CRYPTO_LIB_POLY1305_RSIZE int default 2 if MIPS @@ -121,29 +148,6 @@ config CRYPTO_LIB_POLY1305_RSIZE default 9 if ARM || ARM64 default 1 -config CRYPTO_ARCH_HAVE_LIB_POLY1305 - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the Poly1305 library interface, - either builtin or as a module. - -config CRYPTO_LIB_POLY1305_GENERIC - tristate - default CRYPTO_LIB_POLY1305 if !CRYPTO_ARCH_HAVE_LIB_POLY1305 - help - This symbol can be selected by arch implementations of the Poly1305 - library interface that require the generic code as a fallback, e.g., - for SIMD implementations. If no arch specific implementation is - enabled, this implementation serves the users of CRYPTO_LIB_POLY1305. - -config CRYPTO_LIB_POLY1305 - tristate - help - Enable the Poly1305 library interface. This interface may be fulfilled - by either the generic implementation or an arch-specific one, if one - is available and enabled. 
- config CRYPTO_LIB_CHACHA20POLY1305 tristate select CRYPTO_LIB_CHACHA diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index d362636a22d38e..e0536e3b3a04c7 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -71,13 +71,60 @@ endif # CONFIG_CRYPTO_LIB_MD5_ARCH ################################################################################ -obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o -libpoly1305-y += poly1305.o +obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o +libpoly1305-y := poly1305.o +ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y) +libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna64.o +else +libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna32.o +endif + +ifeq ($(CONFIG_CRYPTO_LIB_POLY1305_ARCH),y) +CFLAGS_poly1305.o += -I$(src)/$(SRCARCH) + +ifeq ($(CONFIG_ARM),y) +libpoly1305-y += arm/poly1305-core.o +$(obj)/arm/poly1305-core.S: $(src)/arm/poly1305-armv4.pl + $(call cmd,perlasm) +# massage the perlasm code a bit so we only get the NEON routine if we need it +poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 +poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 +AFLAGS_arm/poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) +endif + +ifeq ($(CONFIG_ARM64),y) +libpoly1305-y += arm64/poly1305-core.o +$(obj)/arm64/poly1305-core.S: $(src)/arm64/poly1305-armv8.pl + $(call cmd,perlasm_with_args) +endif + +ifeq ($(CONFIG_MIPS),y) +libpoly1305-y += mips/poly1305-core.o +poly1305-perlasm-flavour-$(CONFIG_32BIT) := o32 +poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64 +quiet_cmd_perlasm_poly1305 = PERLASM $@ + cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@ +# Use if_changed instead of cmd, in case the flavour changed. 
+$(obj)/mips/poly1305-core.S: $(src)/mips/poly1305-mips.pl FORCE + $(call if_changed,perlasm_poly1305) +targets += mips/poly1305-core.S +endif -obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305-generic.o -libpoly1305-generic-y := poly1305-donna32.o -libpoly1305-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o -libpoly1305-generic-y += poly1305-generic.o +libpoly1305-$(CONFIG_PPC) += powerpc/poly1305-p10le_64.o + +ifeq ($(CONFIG_X86),y) +libpoly1305-y += x86/poly1305-x86_64-cryptogams.o +$(obj)/x86/poly1305-x86_64-cryptogams.S: $(src)/x86/poly1305-x86_64-cryptogams.pl + $(call cmd,perlasm) +endif + +endif # CONFIG_CRYPTO_LIB_POLY1305_ARCH + +# clean-files must be defined unconditionally +clean-files += arm/poly1305-core.S \ + arm64/poly1305-core.S \ + mips/poly1305-core.S \ + x86/poly1305-x86_64-cryptogams.S ################################################################################ diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig index e8444fd0aae303..0d821e282c645e 100644 --- a/lib/crypto/arm/Kconfig +++ b/lib/crypto/arm/Kconfig @@ -17,8 +17,3 @@ config CRYPTO_CHACHA20_NEON tristate default CRYPTO_LIB_CHACHA select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_ARM - tristate - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile index 4c042a4c77ed6e..9f70e61d419e21 100644 --- a/lib/crypto/arm/Makefile +++ b/lib/crypto/arm/Makefile @@ -6,21 +6,3 @@ libblake2s-arm-y := blake2s-core.o blake2s-glue.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o chacha-neon-y := chacha-scalar-core.o chacha-glue.o chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o - -obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o -poly1305-arm-y := poly1305-core.o poly1305-glue.o - -quiet_cmd_perl = PERL $@ - cmd_perl = $(PERL) $(<) > $(@) - -$(obj)/%-core.S: $(src)/%-armv4.pl - $(call cmd,perl) - -clean-files += poly1305-core.S - -aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 - -# massage the perlasm code a bit so we only get the NEON routine if we need it -poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 -poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 -AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl index dd7a996361a719..34c11b7b44bd76 100644 --- a/lib/crypto/arm/poly1305-armv4.pl +++ b/lib/crypto/arm/poly1305-armv4.pl @@ -43,9 +43,8 @@ #else # define __ARM_ARCH__ __LINUX_ARM_ARCH__ # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ -# define poly1305_init poly1305_block_init_arch +# define poly1305_init poly1305_block_init # define poly1305_blocks poly1305_blocks_arm -# define poly1305_emit poly1305_emit_arch #endif #if defined(__thumb2__) diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c deleted file mode 100644 index 9e513e319e37c9..00000000000000 --- a/lib/crypto/arm/poly1305-glue.c +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM - * - * Copyright (C) 2019 Linaro Ltd. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - len = round_down(len, POLY1305_BLOCK_SIZE); - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && likely(may_use_simd())) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else - poly1305_blocks_arm(state, src, len, padbit); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -static int __init arm_poly1305_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - (elf_hwcap & HWCAP_NEON)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(arm_poly1305_mod_init); - -static void __exit arm_poly1305_mod_exit(void) -{ -} -module_exit(arm_poly1305_mod_exit); - -MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/poly1305.h b/lib/crypto/arm/poly1305.h new file mode 100644 index 00000000000000..0021cf368307c7 --- /dev/null +++ b/lib/crypto/arm/poly1305.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM + * + * Copyright (C) 2019 Linaro Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks_arm(state, src, len, padbit); +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) + static_branch_enable(&have_neon); +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig index 0b903ef524d857..07c8a4f0ab03a4 100644 --- a/lib/crypto/arm64/Kconfig +++ b/lib/crypto/arm64/Kconfig @@ -6,9 +6,3 @@ config CRYPTO_CHACHA20_NEON default CRYPTO_LIB_CHACHA select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_NEON - tristate - depends on KERNEL_MODE_NEON - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile index 6207088397a73e..d49cceca3d1ca7 100644 --- a/lib/crypto/arm64/Makefile +++ b/lib/crypto/arm64/Makefile @@ -2,16 +2,3 @@ obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o -poly1305-neon-y := poly1305-core.o poly1305-glue.o -AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch -AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) void $(@) - -$(obj)/%-core.S: $(src)/%-armv8.pl - $(call cmd,perlasm) - -clean-files += poly1305-core.S diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl index 22c9069c065054..f1930c6b55cee5 100644 --- a/lib/crypto/arm64/poly1305-armv8.pl +++ b/lib/crypto/arm64/poly1305-armv8.pl @@ -50,6 +50,9 @@ #ifndef __KERNEL__ # include "arm_arch.h" .extern OPENSSL_armcap_P +#else +# define poly1305_init poly1305_block_init +# define poly1305_blocks poly1305_blocks_arm64 #endif .text diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c deleted file mode 100644 index d4a522e7d25a96..00000000000000 --- a/lib/crypto/arm64/poly1305-glue.c +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 - * - * Copyright (C) 2019 Linaro Ltd. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - len = round_down(len, POLY1305_BLOCK_SIZE); - if (static_branch_likely(&have_neon) && likely(may_use_simd())) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else - poly1305_blocks(state, src, len, padbit); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -static int __init neon_poly1305_mod_init(void) -{ - if (cpu_have_named_feature(ASIMD)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(neon_poly1305_mod_init); - -static void __exit neon_poly1305_mod_exit(void) -{ -} -module_exit(neon_poly1305_mod_exit); - -MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/poly1305.h b/lib/crypto/arm64/poly1305.h new file mode 100644 index 00000000000000..aed5921ccd9a12 --- /dev/null +++ b/lib/crypto/arm64/poly1305.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 + * + * Copyright (C) 2019 Linaro Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks_arm64(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks_arm64(state, src, len, padbit); +} + +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); +} diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig index 0670a170c1be04..94c1a0892c203b 100644 --- a/lib/crypto/mips/Kconfig +++ b/lib/crypto/mips/Kconfig @@ -5,8 +5,3 @@ config CRYPTO_CHACHA_MIPS depends on CPU_MIPS32_R2 default CRYPTO_LIB_CHACHA select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_MIPS - tristate - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile index 804488c7adedcc..b5ea0e25c21ef5 100644 --- a/lib/crypto/mips/Makefile +++ b/lib/crypto/mips/Makefile @@ -3,17 +3,3 @@ obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o chacha-mips-y := chacha-core.o chacha-glue.o AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots - -obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o -poly1305-mips-y := poly1305-core.o poly1305-glue.o - -perlasm-flavour-$(CONFIG_32BIT) := o32 -perlasm-flavour-$(CONFIG_64BIT) := 64 - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) - -$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE - $(call if_changed,perlasm) - -targets += poly1305-core.S diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c deleted file mode 100644 index 002f50f710aba9..00000000000000 --- a/lib/crypto/mips/poly1305-glue.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS - * - * Copyright (C) 2019 Linaro Ltd. 
- */ - -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl index 399f10c3e3850a..71347f34f4f9fd 100644 --- a/lib/crypto/mips/poly1305-mips.pl +++ b/lib/crypto/mips/poly1305-mips.pl @@ -93,9 +93,7 @@ #endif #ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch +# define poly1305_init poly1305_block_init #endif #if defined(__MIPSEB__) && !defined(MIPSEB) @@ -565,9 +563,7 @@ #endif #ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch +# define poly1305_init poly1305_block_init #endif #if defined(__MIPSEB__) && !defined(MIPSEB) diff --git a/lib/crypto/mips/poly1305.h b/lib/crypto/mips/poly1305.h new file mode 100644 index 00000000000000..85de450f1a93d5 --- /dev/null +++ b/lib/crypto/mips/poly1305.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS + * + * Copyright (C) 2019 Linaro Ltd. + */ + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); diff --git a/lib/crypto/poly1305-generic.c b/lib/crypto/poly1305-generic.c deleted file mode 100644 index 71a16c5c538b40..00000000000000 --- a/lib/crypto/poly1305-generic.c +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Poly1305 authenticator algorithm, RFC7539 - * - * Copyright (C) 2015 Martin Willi - * - * Based on public domain code by Andrew Moon and Daniel J. Bernstein. - */ - -#include -#include -#include -#include - -void poly1305_block_init_generic(struct poly1305_block_state *desc, - const u8 raw_key[POLY1305_BLOCK_SIZE]) -{ - poly1305_core_init(&desc->h); - poly1305_core_setkey(&desc->core_r, raw_key); -} -EXPORT_SYMBOL_GPL(poly1305_block_init_generic); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("Poly1305 algorithm (generic implementation)"); diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c index a6dc182b6c22d7..f313ccc4b4dd22 100644 --- a/lib/crypto/poly1305.c +++ b/lib/crypto/poly1305.c @@ -7,7 +7,6 @@ * Based on public domain code by Andrew Moon and Daniel J. Bernstein. 
*/ -#include #include #include #include @@ -15,6 +14,14 @@ #include #include +#ifdef CONFIG_CRYPTO_LIB_POLY1305_ARCH +#include "poly1305.h" /* $(SRCARCH)/poly1305.h */ +#else +#define poly1305_block_init poly1305_block_init_generic +#define poly1305_blocks poly1305_blocks_generic +#define poly1305_emit poly1305_emit_generic +#endif + void poly1305_init(struct poly1305_desc_ctx *desc, const u8 key[POLY1305_KEY_SIZE]) { @@ -23,28 +30,40 @@ void poly1305_init(struct poly1305_desc_ctx *desc, desc->s[2] = get_unaligned_le32(key + 24); desc->s[3] = get_unaligned_le32(key + 28); desc->buflen = 0; - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_block_init_arch(&desc->state, key); - else - poly1305_block_init_generic(&desc->state, key); + poly1305_block_init(&desc->state, key); } EXPORT_SYMBOL(poly1305_init); -static inline void poly1305_blocks(struct poly1305_block_state *state, - const u8 *src, unsigned int len) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_blocks_arch(state, src, len, 1); - else - poly1305_blocks_generic(state, src, len, 1); -} - void poly1305_update(struct poly1305_desc_ctx *desc, const u8 *src, unsigned int nbytes) { - desc->buflen = BLOCK_HASH_UPDATE(poly1305_blocks, &desc->state, - src, nbytes, POLY1305_BLOCK_SIZE, - desc->buf, desc->buflen); + if (desc->buflen + nbytes >= POLY1305_BLOCK_SIZE) { + unsigned int bulk_len; + + if (desc->buflen) { + unsigned int l = POLY1305_BLOCK_SIZE - desc->buflen; + + memcpy(&desc->buf[desc->buflen], src, l); + src += l; + nbytes -= l; + + poly1305_blocks(&desc->state, desc->buf, + POLY1305_BLOCK_SIZE, 1); + desc->buflen = 0; + } + + bulk_len = round_down(nbytes, POLY1305_BLOCK_SIZE); + nbytes %= POLY1305_BLOCK_SIZE; + + if (bulk_len) { + poly1305_blocks(&desc->state, src, bulk_len, 1); + src += bulk_len; + } + } + if (nbytes) { + memcpy(&desc->buf[desc->buflen], src, nbytes); + desc->buflen += nbytes; + } } EXPORT_SYMBOL(poly1305_update); @@ -54,22 +73,28 @@ void poly1305_final(struct poly1305_desc_ctx *desc, u8 *dst) desc->buf[desc->buflen++] = 1; memset(desc->buf + desc->buflen, 0, POLY1305_BLOCK_SIZE - desc->buflen); - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_blocks_arch(&desc->state, desc->buf, - POLY1305_BLOCK_SIZE, 0); - else - poly1305_blocks_generic(&desc->state, desc->buf, - POLY1305_BLOCK_SIZE, 0); + poly1305_blocks(&desc->state, desc->buf, POLY1305_BLOCK_SIZE, + 0); } - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_emit_arch(&desc->state.h, dst, desc->s); - else - poly1305_emit_generic(&desc->state.h, dst, desc->s); + poly1305_emit(&desc->state.h, dst, desc->s); *desc = (struct poly1305_desc_ctx){}; } EXPORT_SYMBOL(poly1305_final); +#ifdef poly1305_mod_init_arch +static int __init poly1305_mod_init(void) +{ + poly1305_mod_init_arch(); + return 0; +} +subsys_initcall(poly1305_mod_init); + +static void __exit poly1305_mod_exit(void) +{ +} +module_exit(poly1305_mod_exit); +#endif + MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539"); diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig index 2eaeb7665a6a0e..e41012a61876ee 100644 --- a/lib/crypto/powerpc/Kconfig +++ b/lib/crypto/powerpc/Kconfig @@ -6,11 +6,3 @@ config CRYPTO_CHACHA20_P10 default CRYPTO_LIB_CHACHA select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_P10 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN && VSX - depends on BROKEN # Needs to be fixed to work in 
softirq context - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - select CRYPTO_LIB_POLY1305_GENERIC diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile index 5709ae14258a06..778a04edd226cf 100644 --- a/lib/crypto/powerpc/Makefile +++ b/lib/crypto/powerpc/Makefile @@ -2,6 +2,3 @@ obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o - -obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o -poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o diff --git a/lib/crypto/powerpc/poly1305-p10-glue.c b/lib/crypto/powerpc/poly1305.h similarity index 66% rename from lib/crypto/powerpc/poly1305-p10-glue.c rename to lib/crypto/powerpc/poly1305.h index 184a71f9c1dee0..b8ed098a0e95fc 100644 --- a/lib/crypto/powerpc/poly1305-p10-glue.c +++ b/lib/crypto/powerpc/poly1305.h @@ -1,15 +1,13 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Poly1305 authenticator algorithm, RFC7539. * * Copyright 2023- IBM Corp. All rights reserved. */ #include -#include #include #include #include -#include #include asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen); @@ -30,8 +28,8 @@ static void vsx_end(void) preempt_enable(); } -void poly1305_block_init_arch(struct poly1305_block_state *dctx, - const u8 raw_key[POLY1305_BLOCK_SIZE]) +static void poly1305_block_init(struct poly1305_block_state *dctx, + const u8 raw_key[POLY1305_BLOCK_SIZE]) { if (!static_key_enabled(&have_p10)) return poly1305_block_init_generic(dctx, raw_key); @@ -40,10 +38,9 @@ void poly1305_block_init_arch(struct poly1305_block_state *dctx, dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0); dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8); } -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) { if (!static_key_enabled(&have_p10)) return poly1305_blocks_generic(state, src, len, padbit); @@ -60,31 +57,18 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, } vsx_end(); } -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]) +static void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4]) { if (!static_key_enabled(&have_p10)) return poly1305_emit_generic(state, digest, nonce); poly1305_emit_64(state, nonce, digest); } -EXPORT_SYMBOL_GPL(poly1305_emit_arch); -static int __init poly1305_p10_init(void) +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) { if (cpu_has_feature(CPU_FTR_ARCH_31)) static_branch_enable(&have_p10); - return 0; } -subsys_initcall(poly1305_p10_init); - -static void __exit poly1305_p10_exit(void) -{ -} -module_exit(poly1305_p10_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Danny Tsen "); -MODULE_DESCRIPTION("Optimized Poly1305 for P10"); diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig index 546fe2afe0b51e..24dc9a59b27289 100644 --- a/lib/crypto/x86/Kconfig +++ b/lib/crypto/x86/Kconfig @@ -18,9 +18,3 @@ config CRYPTO_CHACHA20_X86_64 default CRYPTO_LIB_CHACHA select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_X86_64 - tristate - 
depends on 64BIT - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile index c2ff8c5f1046e2..16c9d76f994729 100644 --- a/lib/crypto/x86/Makefile +++ b/lib/crypto/x86/Makefile @@ -5,13 +5,3 @@ libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o -poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o -targets += poly1305-x86_64-cryptogams.S - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $< > $@ - -$(obj)/%.S: $(src)/%.pl FORCE - $(call if_changed,perlasm) diff --git a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl index 501827254fed75..409ec6955733ad 100644 --- a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl +++ b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl @@ -118,19 +118,6 @@ () } } -sub declare_typed_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_TYPED_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - sub end_function() { my ($name) = @_; if($kernel) { @@ -141,7 +128,7 @@ () } $code.=<<___ if $kernel; -#include +#include ___ if ($avx) { @@ -249,14 +236,14 @@ sub poly1305_iteration { $code.=<<___ if (!$kernel); .extern OPENSSL_ia32cap_P -.globl poly1305_block_init_arch -.hidden poly1305_block_init_arch +.globl poly1305_init_x86_64 +.hidden poly1305_init_x86_64 .globl poly1305_blocks_x86_64 .hidden poly1305_blocks_x86_64 .globl poly1305_emit_x86_64 .hidden poly1305_emit_x86_64 ___ -&declare_typed_function("poly1305_block_init_arch", 32, 3); +&declare_function("poly1305_init_x86_64", 32, 3); $code.=<<___; xor %eax,%eax mov %rax,0($ctx) # initialize hash value @@ -311,7 +298,7 @@ sub poly1305_iteration { .Lno_key: RET ___ -&end_function("poly1305_block_init_arch"); +&end_function("poly1305_init_x86_64"); &declare_function("poly1305_blocks_x86_64", 32, 4); $code.=<<___; @@ -4118,9 +4105,9 @@ sub poly1305_blocks_avxN { .section .pdata .align 4 - .rva .LSEH_begin_poly1305_block_init_arch - .rva .LSEH_end_poly1305_block_init_arch - .rva .LSEH_info_poly1305_block_init_arch + .rva .LSEH_begin_poly1305_init_x86_64 + .rva .LSEH_end_poly1305_init_x86_64 + .rva .LSEH_info_poly1305_init_x86_64 .rva .LSEH_begin_poly1305_blocks_x86_64 .rva .LSEH_end_poly1305_blocks_x86_64 @@ -4168,10 +4155,10 @@ sub poly1305_blocks_avxN { $code.=<<___; .section .xdata .align 8 -.LSEH_info_poly1305_block_init_arch: +.LSEH_info_poly1305_init_x86_64: .byte 9,0,0,0 .rva se_handler - .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 .LSEH_info_poly1305_blocks_x86_64: .byte 9,0,0,0 diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305.h similarity index 85% rename from lib/crypto/x86/poly1305_glue.c rename to lib/crypto/x86/poly1305.h index deb5841cb0ada6..ee92e3740a7876 100644 --- a/lib/crypto/x86/poly1305_glue.c +++ b/lib/crypto/x86/poly1305.h @@ -1,16 +1,13 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ /* * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. 
*/ #include #include -#include #include #include -#include #include -#include struct poly1305_arch_internal { union { @@ -61,10 +58,8 @@ static void convert_to_base2_64(void *ctx) state->is_base2_26 = 0; } -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_init_x86_64(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, const u8 *inp, const size_t len, const u32 padbit); @@ -88,8 +83,14 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, - unsigned int len, u32 padbit) +static void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + poly1305_init_x86_64(state, raw_key); +} + +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *inp, + unsigned int len, u32 padbit) { struct poly1305_arch_internal *ctx = container_of(&state->h.h, struct poly1305_arch_internal, h); @@ -129,19 +130,18 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, inp += bytes; } while (len); } -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -void poly1305_emit_arch(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) +static void poly1305_emit(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) { if (!static_branch_likely(&poly1305_use_avx)) poly1305_emit_x86_64(ctx, mac, nonce); else poly1305_emit_avx(ctx, mac, nonce); } -EXPORT_SYMBOL_GPL(poly1305_emit_arch); -static int __init poly1305_simd_mod_init(void) +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) @@ -155,15 +155,4 @@ static int __init poly1305_simd_mod_init(void) /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) static_branch_enable(&poly1305_use_avx512); - return 0; } -subsys_initcall(poly1305_simd_mod_init); - -static void __exit poly1305_simd_mod_exit(void) -{ -} -module_exit(poly1305_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jason A. Donenfeld "); -MODULE_DESCRIPTION("Poly1305 authenticator"); From bef9c755986980eecc18e9cecd847bc3c037aebb Mon Sep 17 00:00:00 2001 From: Zhihang Shao Date: Fri, 29 Aug 2025 08:25:13 -0700 Subject: [PATCH 0593/3206] lib/crypto: riscv/poly1305: Import OpenSSL/CRYPTOGAMS implementation This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation for riscv authored by Andy Polyakov. The file 'poly1305-riscv.pl' is taken straight from https://github.com/dot-asm/cryptogams commit 5e3fba73576244708a752fa61a8e93e587f271bb. This patch was tested on SpacemiT X60, with 2~2.5x improvement over generic implementation. 
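As a quick sanity check of the key clamping performed by the imported
poly1305_init (a worked recomputation; each intermediate value matches the
inline comments in the assembly below):

	1 << 32              = 0x0000000100000000
	- 63                 = 0x00000000ffffffc1
	<< 28                = 0x0ffffffc10000000
	- 1                  = 0x0ffffffc0fffffff  (mask ANDed into r[0])
	- 3                  = 0x0ffffffc0ffffffc  (mask ANDed into r[1])

This is the standard RFC 7539 clamp on r, built up with li/slli/addi rather
than loading a 64-bit immediate.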
Signed-off-by: Chunyan Zhang Signed-off-by: Zhihang Shao [EB: ported to lib/crypto/riscv/] Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250829152513.92459-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/Kconfig | 3 +- lib/crypto/Makefile | 14 + lib/crypto/riscv/poly1305-riscv.pl | 847 +++++++++++++++++++++++++++++ lib/crypto/riscv/poly1305.h | 14 + 4 files changed, 877 insertions(+), 1 deletion(-) create mode 100644 lib/crypto/riscv/poly1305-riscv.pl create mode 100644 lib/crypto/riscv/poly1305.h diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 9991118c41a9d7..cb4e056a98faf4 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -128,6 +128,7 @@ config CRYPTO_LIB_POLY1305_ARCH default y if MIPS # The PPC64 code needs to be fixed to work in softirq context. default y if PPC64 && CPU_LITTLE_ENDIAN && VSX && BROKEN + default y if RISCV default y if X86_64 # This symbol controls the inclusion of the Poly1305 generic code. This differs @@ -143,7 +144,7 @@ config CRYPTO_LIB_POLY1305_GENERIC config CRYPTO_LIB_POLY1305_RSIZE int - default 2 if MIPS + default 2 if MIPS || RISCV default 11 if X86_64 default 9 if ARM || ARM64 default 1 diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index e0536e3b3a04c7..cd460e5e3dd240 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -112,6 +112,19 @@ endif libpoly1305-$(CONFIG_PPC) += powerpc/poly1305-p10le_64.o +ifeq ($(CONFIG_RISCV),y) +libpoly1305-y += riscv/poly1305-core.o +poly1305-perlasm-flavour-$(CONFIG_32BIT) := 32 +poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64 +quiet_cmd_perlasm_poly1305 = PERLASM $@ + cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@ +# Use if_changed instead of cmd, in case the flavour changed. +$(obj)/riscv/poly1305-core.S: $(src)/riscv/poly1305-riscv.pl FORCE + $(call if_changed,perlasm_poly1305) +targets += riscv/poly1305-core.S +AFLAGS_riscv/poly1305-core.o += -Dpoly1305_init=poly1305_block_init +endif + ifeq ($(CONFIG_X86),y) libpoly1305-y += x86/poly1305-x86_64-cryptogams.o $(obj)/x86/poly1305-x86_64-cryptogams.S: $(src)/x86/poly1305-x86_64-cryptogams.pl @@ -124,6 +137,7 @@ endif # CONFIG_CRYPTO_LIB_POLY1305_ARCH clean-files += arm/poly1305-core.S \ arm64/poly1305-core.S \ mips/poly1305-core.S \ + riscv/poly1305-core.S \ x86/poly1305-x86_64-cryptogams.S ################################################################################ diff --git a/lib/crypto/riscv/poly1305-riscv.pl b/lib/crypto/riscv/poly1305-riscv.pl new file mode 100644 index 00000000000000..e25e6338a9ac10 --- /dev/null +++ b/lib/crypto/riscv/poly1305-riscv.pl @@ -0,0 +1,847 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL. +# ==================================================================== +# +# Poly1305 hash for RISC-V. +# +# February 2019 +# +# In the essence it's pretty straightforward transliteration of MIPS +# module [without big-endian option]. +# +# 1.8 cycles per byte on U74, >100% faster than compiler-generated +# code. 1.9 cpb on C910, ~75% improvement. 3.3 on Spacemit X60, ~69% +# improvement. +# +# June 2024. +# +# Add CHERI support. 
+# +###################################################################### +# +($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4)); +($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27)); +# +###################################################################### + +$flavour = shift || "64"; + +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +$code.=<<___; +#ifdef __KERNEL__ +# ifdef __riscv_zicfilp +# undef __riscv_zicfilp // calls are expected to be direct +# endif +#endif + +#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast) +# define __riscv_misaligned_fast 1 +#endif +___ + +if ($flavour =~ /64/) {{{ +###################################################################### +# 64-bit code path... +# +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2); + +$code.=<<___; +#if __riscv_xlen == 64 +# if __SIZEOF_POINTER__ == 16 +# define PUSH csc +# define POP clc +# else +# define PUSH sd +# define POP ld +# endif +#else +# error "unsupported __riscv_xlen" +#endif + +.option pic +.text + +.globl poly1305_init +.type poly1305_init,\@function +poly1305_init: +#ifdef __riscv_zicfilp + lpad 0 +#endif + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + +#ifndef __riscv_misaligned_fast + andi $tmp0,$inp,7 # $inp % 8 + andi $inp,$inp,-8 # align $inp + slli $tmp0,$tmp0,3 # byte to bit offset +#endif + ld $in0,0($inp) + ld $in1,8($inp) +#ifndef __riscv_misaligned_fast + beqz $tmp0,.Laligned_key + + ld $tmp2,16($inp) + neg $tmp1,$tmp0 # implicit &63 in sll + srl $in0,$in0,$tmp0 + sll $tmp3,$in1,$tmp1 + srl $in1,$in1,$tmp0 + sll $tmp2,$tmp2,$tmp1 + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 + +.Laligned_key: +#endif + li $tmp0,1 + slli $tmp0,$tmp0,32 # 0x0000000100000000 + addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1 + slli $tmp0,$tmp0,28 # 0x0ffffffc10000000 + addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff + + and $in0,$in0,$tmp0 + addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc + and $in1,$in1,$tmp0 + + sd $in0,24($ctx) + srli $tmp0,$in1,2 + sd $in1,32($ctx) + add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $a0,0 # return 0 + ret +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2); +my ($shr,$shl) = ($t5,$t6); # used on R6 + +$code.=<<___; +.globl poly1305_blocks +.type poly1305_blocks,\@function +poly1305_blocks: +#ifdef __riscv_zicfilp + lpad 0 +#endif + andi $len,$len,-16 # complete blocks only + beqz $len,.Lno_data + + caddi $sp,$sp,-4*__SIZEOF_POINTER__ + PUSH $s0,3*__SIZEOF_POINTER__($sp) + PUSH $s1,2*__SIZEOF_POINTER__($sp) + PUSH $s2,1*__SIZEOF_POINTER__($sp) + PUSH $s3,0*__SIZEOF_POINTER__($sp) + +#ifndef __riscv_misaligned_fast + andi $shr,$inp,7 + andi $inp,$inp,-8 # align $inp + slli $shr,$shr,3 # byte to bit offset + neg $shl,$shr # implicit &63 in sll +#endif + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $rs1,40($ctx) + + add $len,$len,$inp # end of buffer + +.Loop: + ld $in0,0($inp) # load input + ld $in1,8($inp) +#ifndef __riscv_misaligned_fast + beqz $shr,.Laligned_inp + + ld $tmp2,16($inp) + srl $in0,$in0,$shr + sll $tmp3,$in1,$shl + srl $in1,$in1,$shr + sll $tmp2,$tmp2,$shl + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 + +.Laligned_inp: 
+#endif + caddi $inp,$inp,16 + + andi $tmp0,$h2,-4 # modulo-scheduled reduction + srli $tmp1,$h2,2 + andi $h2,$h2,3 + + add $d0,$h0,$in0 # accumulate input + add $tmp1,$tmp1,$tmp0 + sltu $tmp0,$d0,$h0 + add $d0,$d0,$tmp1 # ... and residue + sltu $tmp1,$d0,$tmp1 + add $d1,$h1,$in1 + add $tmp0,$tmp0,$tmp1 + sltu $tmp1,$d1,$h1 + add $d1,$d1,$tmp0 + + add $d2,$h2,$padbit + sltu $tmp0,$d1,$tmp0 + mulhu $h1,$r0,$d0 # h0*r0 + mul $h0,$r0,$d0 + + add $d2,$d2,$tmp1 + add $d2,$d2,$tmp0 + mulhu $tmp1,$rs1,$d1 # h1*5*r1 + mul $tmp0,$rs1,$d1 + + mulhu $h2,$r1,$d0 # h0*r1 + mul $tmp2,$r1,$d0 + add $h0,$h0,$tmp0 + add $h1,$h1,$tmp1 + sltu $tmp0,$h0,$tmp0 + + add $h1,$h1,$tmp0 + add $h1,$h1,$tmp2 + mulhu $tmp1,$r0,$d1 # h1*r0 + mul $tmp0,$r0,$d1 + + sltu $tmp2,$h1,$tmp2 + add $h2,$h2,$tmp2 + mul $tmp2,$rs1,$d2 # h2*5*r1 + + add $h1,$h1,$tmp0 + add $h2,$h2,$tmp1 + mul $tmp3,$r0,$d2 # h2*r0 + sltu $tmp0,$h1,$tmp0 + add $h2,$h2,$tmp0 + + add $h1,$h1,$tmp2 + sltu $tmp2,$h1,$tmp2 + add $h2,$h2,$tmp2 + add $h2,$h2,$tmp3 + + bne $inp,$len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue + POP $s1,2*__SIZEOF_POINTER__($sp) + POP $s2,1*__SIZEOF_POINTER__($sp) + POP $s3,0*__SIZEOF_POINTER__($sp) + caddi $sp,$sp,4*__SIZEOF_POINTER__ + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.globl poly1305_emit +.type poly1305_emit,\@function +poly1305_emit: +#ifdef __riscv_zicfilp + lpad 0 +#endif + ld $tmp2,16($ctx) + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + + andi $in0,$tmp2,-4 # final reduction + srl $in1,$tmp2,2 + andi $tmp2,$tmp2,3 + add $in0,$in0,$in1 + + add $tmp0,$tmp0,$in0 + sltu $in1,$tmp0,$in0 + addi $in0,$tmp0,5 # compare to modulus + add $tmp1,$tmp1,$in1 + sltiu $tmp3,$in0,5 + sltu $tmp4,$tmp1,$in1 + add $in1,$tmp1,$tmp3 + add $tmp2,$tmp2,$tmp4 + sltu $tmp3,$in1,$tmp3 + add $tmp2,$tmp2,$tmp3 + + srli $tmp2,$tmp2,2 # see if it carried/borrowed + neg $tmp2,$tmp2 + + xor $in0,$in0,$tmp0 + xor $in1,$in1,$tmp1 + and $in0,$in0,$tmp2 + and $in1,$in1,$tmp2 + xor $in0,$in0,$tmp0 + xor $in1,$in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + slli $tmp1,$tmp1,32 + slli $tmp3,$tmp3,32 + or $tmp0,$tmp0,$tmp1 + or $tmp2,$tmp2,$tmp3 + + add $in0,$in0,$tmp0 # accumulate nonce + add $in1,$in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + add $in1,$in1,$tmp0 + +#ifdef __riscv_misaligned_fast + sd $in0,0($mac) # write mac value + sd $in1,8($mac) +#else + srli $tmp0,$in0,8 # write mac value + srli $tmp1,$in0,16 + srli $tmp2,$in0,24 + sb $in0,0($mac) + srli $tmp3,$in0,32 + sb $tmp0,1($mac) + srli $tmp0,$in0,40 + sb $tmp1,2($mac) + srli $tmp1,$in0,48 + sb $tmp2,3($mac) + srli $tmp2,$in0,56 + sb $tmp3,4($mac) + srli $tmp3,$in1,8 + sb $tmp0,5($mac) + srli $tmp0,$in1,16 + sb $tmp1,6($mac) + srli $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + srli $tmp2,$in1,32 + sb $tmp3,9($mac) + srli $tmp3,$in1,40 + sb $tmp0,10($mac) + srli $tmp0,$in1,48 + sb $tmp1,11($mac) + srli $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) +#endif + + ret +.size poly1305_emit,.-poly1305_emit +.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm" +___ +} +}}} else {{{ +###################################################################### +# 32-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = + ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3); + 
+$code.=<<___; +#if __riscv_xlen == 32 +# if __SIZEOF_POINTER__ == 8 +# define PUSH csc +# define POP clc +# else +# define PUSH sw +# define POP lw +# endif +# define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b +# define srliw srli +# define srlw srl +# define sllw sll +# define addw add +# define addiw addi +# define mulw mul +#elif __riscv_xlen == 64 +# if __SIZEOF_POINTER__ == 16 +# define PUSH csc +# define POP clc +# else +# define PUSH sd +# define POP ld +# endif +# define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32 +#else +# error "unsupported __riscv_xlen" +#endif + +.option pic +.text + +.globl poly1305_init +.type poly1305_init,\@function +poly1305_init: +#ifdef __riscv_zicfilp + lpad 0 +#endif + sw $zero,0($ctx) + sw $zero,4($ctx) + sw $zero,8($ctx) + sw $zero,12($ctx) + sw $zero,16($ctx) + + beqz $inp,.Lno_key + +#ifndef __riscv_misaligned_fast + andi $tmp0,$inp,3 # $inp % 4 + sub $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset +#endif + lw $in0,0($inp) + lw $in1,4($inp) + lw $in2,8($inp) + lw $in3,12($inp) +#ifndef __riscv_misaligned_fast + beqz $tmp0,.Laligned_key + + lw $tmp2,16($inp) + sub $tmp1,$zero,$tmp0 + srlw $in0,$in0,$tmp0 + sllw $tmp3,$in1,$tmp1 + srlw $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + sllw $tmp3,$in2,$tmp1 + srlw $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + sllw $tmp3,$in3,$tmp1 + srlw $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + sllw $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +.Laligned_key: +#endif + + lui $tmp0,0x10000 + addi $tmp0,$tmp0,-1 # 0x0fffffff + and $in0,$in0,$tmp0 + addi $tmp0,$tmp0,-3 # 0x0ffffffc + and $in1,$in1,$tmp0 + and $in2,$in2,$tmp0 + and $in3,$in3,$tmp0 + + sw $in0,20($ctx) + sw $in1,24($ctx) + sw $in2,28($ctx) + sw $in3,32($ctx) + + srlw $tmp1,$in1,2 + srlw $tmp2,$in2,2 + srlw $tmp3,$in3,2 + addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) + addw $in2,$in2,$tmp2 + addw $in3,$in3,$tmp3 + sw $in1,36($ctx) + sw $in2,40($ctx) + sw $in3,44($ctx) +.Lno_key: + li $a0,0 + ret +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2); +my ($d0,$d1,$d2,$d3) = + ($a4,$a5,$a6,$a7); +my $shr = $ra; # used on R6 + +$code.=<<___; +.globl poly1305_blocks +.type poly1305_blocks,\@function +poly1305_blocks: +#ifdef __riscv_zicfilp + lpad 0 +#endif + andi $len,$len,-16 # complete blocks only + beqz $len,.Labort + +#ifdef __riscv_zcmp + cm.push {ra,s0-s8}, -48 +#else + caddi $sp,$sp,-__SIZEOF_POINTER__*12 + PUSH $ra, __SIZEOF_POINTER__*11($sp) + PUSH $s0, __SIZEOF_POINTER__*10($sp) + PUSH $s1, __SIZEOF_POINTER__*9($sp) + PUSH $s2, __SIZEOF_POINTER__*8($sp) + PUSH $s3, __SIZEOF_POINTER__*7($sp) + PUSH $s4, __SIZEOF_POINTER__*6($sp) + PUSH $s5, __SIZEOF_POINTER__*5($sp) + PUSH $s6, __SIZEOF_POINTER__*4($sp) + PUSH $s7, __SIZEOF_POINTER__*3($sp) + PUSH $s8, __SIZEOF_POINTER__*2($sp) +#endif + +#ifndef __riscv_misaligned_fast + andi $shr,$inp,3 + andi $inp,$inp,-4 # align $inp + slli $shr,$shr,3 # byte to bit offset +#endif + + lw $h0,0($ctx) # load hash value + lw $h1,4($ctx) + lw $h2,8($ctx) + lw $h3,12($ctx) + lw $h4,16($ctx) + + lw $r0,20($ctx) # load key + lw $r1,24($ctx) + lw $r2,28($ctx) + lw $r3,32($ctx) + lw $rs1,36($ctx) + lw $rs2,40($ctx) + lw $rs3,44($ctx) + + add $len,$len,$inp # end of buffer + +.Loop: + lw $d0,0($inp) # load input + lw $d1,4($inp) + lw $d2,8($inp) + lw $d3,12($inp) +#ifndef __riscv_misaligned_fast + beqz $shr,.Laligned_inp + + lw $t4,16($inp) + sub $t5,$zero,$shr + srlw $d0,$d0,$shr + sllw 
$t3,$d1,$t5 + srlw $d1,$d1,$shr + or $d0,$d0,$t3 + sllw $t3,$d2,$t5 + srlw $d2,$d2,$shr + or $d1,$d1,$t3 + sllw $t3,$d3,$t5 + srlw $d3,$d3,$shr + or $d2,$d2,$t3 + sllw $t4,$t4,$t5 + or $d3,$d3,$t4 + +.Laligned_inp: +#endif + srliw $t3,$h4,2 # modulo-scheduled reduction + andi $t4,$h4,-4 + andi $h4,$h4,3 + + addw $d0,$d0,$h0 # accumulate input + addw $t4,$t4,$t3 + sltu $h0,$d0,$h0 + addw $d0,$d0,$t4 # ... and residue + sltu $t4,$d0,$t4 + + addw $d1,$d1,$h1 + addw $h0,$h0,$t4 # carry + sltu $h1,$d1,$h1 + addw $d1,$d1,$h0 + sltu $h0,$d1,$h0 + + addw $d2,$d2,$h2 + addw $h1,$h1,$h0 # carry + sltu $h2,$d2,$h2 + addw $d2,$d2,$h1 + sltu $h1,$d2,$h1 + + addw $d3,$d3,$h3 + addw $h2,$h2,$h1 # carry + sltu $h3,$d3,$h3 + addw $d3,$d3,$h2 + + MULX ($h1,$h0,$r0,$d0) # d0*r0 + + sltu $h2,$d3,$h2 + addw $h3,$h3,$h2 # carry + + MULX ($t4,$t3,$rs3,$d1) # d1*s3 + + addw $h4,$h4,$padbit + caddi $inp,$inp,16 + addw $h4,$h4,$h3 + + MULX ($t6,$a3,$rs2,$d2) # d2*s2 + addw $h0,$h0,$t3 + addw $h1,$h1,$t4 + sltu $t3,$h0,$t3 + addw $h1,$h1,$t3 + + MULX ($t4,$t3,$rs1,$d3) # d3*s1 + addw $h0,$h0,$a3 + addw $h1,$h1,$t6 + sltu $a3,$h0,$a3 + addw $h1,$h1,$a3 + + + MULX ($h2,$a3,$r1,$d0) # d0*r1 + addw $h0,$h0,$t3 + addw $h1,$h1,$t4 + sltu $t3,$h0,$t3 + addw $h1,$h1,$t3 + + MULX ($t4,$t3,$r0,$d1) # d1*r0 + addw $h1,$h1,$a3 + sltu $a3,$h1,$a3 + addw $h2,$h2,$a3 + + MULX ($t6,$a3,$rs3,$d2) # d2*s3 + addw $h1,$h1,$t3 + addw $h2,$h2,$t4 + sltu $t3,$h1,$t3 + addw $h2,$h2,$t3 + + MULX ($t4,$t3,$rs2,$d3) # d3*s2 + addw $h1,$h1,$a3 + addw $h2,$h2,$t6 + sltu $a3,$h1,$a3 + addw $h2,$h2,$a3 + + mulw $a3,$rs1,$h4 # h4*s1 + addw $h1,$h1,$t3 + addw $h2,$h2,$t4 + sltu $t3,$h1,$t3 + addw $h2,$h2,$t3 + + + MULX ($h3,$t3,$r2,$d0) # d0*r2 + addw $h1,$h1,$a3 + sltu $a3,$h1,$a3 + addw $h2,$h2,$a3 + + MULX ($t6,$a3,$r1,$d1) # d1*r1 + addw $h2,$h2,$t3 + sltu $t3,$h2,$t3 + addw $h3,$h3,$t3 + + MULX ($t4,$t3,$r0,$d2) # d2*r0 + addw $h2,$h2,$a3 + addw $h3,$h3,$t6 + sltu $a3,$h2,$a3 + addw $h3,$h3,$a3 + + MULX ($t6,$a3,$rs3,$d3) # d3*s3 + addw $h2,$h2,$t3 + addw $h3,$h3,$t4 + sltu $t3,$h2,$t3 + addw $h3,$h3,$t3 + + mulw $t3,$rs2,$h4 # h4*s2 + addw $h2,$h2,$a3 + addw $h3,$h3,$t6 + sltu $a3,$h2,$a3 + addw $h3,$h3,$a3 + + + MULX ($t6,$a3,$r3,$d0) # d0*r3 + addw $h2,$h2,$t3 + sltu $t3,$h2,$t3 + addw $h3,$h3,$t3 + + MULX ($t4,$t3,$r2,$d1) # d1*r2 + addw $h3,$h3,$a3 + sltu $a3,$h3,$a3 + addw $t6,$t6,$a3 + + MULX ($a3,$d3,$r0,$d3) # d3*r0 + addw $h3,$h3,$t3 + addw $t6,$t6,$t4 + sltu $t3,$h3,$t3 + addw $t6,$t6,$t3 + + MULX ($t4,$t3,$r1,$d2) # d2*r1 + addw $h3,$h3,$d3 + addw $t6,$t6,$a3 + sltu $d3,$h3,$d3 + addw $t6,$t6,$d3 + + mulw $a3,$rs3,$h4 # h4*s3 + addw $h3,$h3,$t3 + addw $t6,$t6,$t4 + sltu $t3,$h3,$t3 + addw $t6,$t6,$t3 + + + mulw $h4,$r0,$h4 # h4*r0 + addw $h3,$h3,$a3 + sltu $a3,$h3,$a3 + addw $t6,$t6,$a3 + addw $h4,$t6,$h4 + + li $padbit,1 # if we loop, padbit is 1 + + bne $inp,$len,.Loop + + sw $h0,0($ctx) # store hash value + sw $h1,4($ctx) + sw $h2,8($ctx) + sw $h3,12($ctx) + sw $h4,16($ctx) + +#ifdef __riscv_zcmp + cm.popret {ra,s0-s8}, 48 +#else + POP $ra, __SIZEOF_POINTER__*11($sp) + POP $s0, __SIZEOF_POINTER__*10($sp) + POP $s1, __SIZEOF_POINTER__*9($sp) + POP $s2, __SIZEOF_POINTER__*8($sp) + POP $s3, __SIZEOF_POINTER__*7($sp) + POP $s4, __SIZEOF_POINTER__*6($sp) + POP $s5, __SIZEOF_POINTER__*5($sp) + POP $s6, __SIZEOF_POINTER__*4($sp) + POP $s7, __SIZEOF_POINTER__*3($sp) + POP $s8, __SIZEOF_POINTER__*2($sp) + caddi $sp,$sp,__SIZEOF_POINTER__*12 +#endif +.Labort: + ret +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my 
($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); + +$code.=<<___; +.globl poly1305_emit +.type poly1305_emit,\@function +poly1305_emit: +#ifdef __riscv_zicfilp + lpad 0 +#endif + lw $tmp4,16($ctx) + lw $tmp0,0($ctx) + lw $tmp1,4($ctx) + lw $tmp2,8($ctx) + lw $tmp3,12($ctx) + + srliw $ctx,$tmp4,2 # final reduction + andi $in0,$tmp4,-4 + andi $tmp4,$tmp4,3 + addw $ctx,$ctx,$in0 + + addw $tmp0,$tmp0,$ctx + sltu $ctx,$tmp0,$ctx + addiw $in0,$tmp0,5 # compare to modulus + addw $tmp1,$tmp1,$ctx + sltiu $in1,$in0,5 + sltu $ctx,$tmp1,$ctx + addw $in1,$in1,$tmp1 + addw $tmp2,$tmp2,$ctx + sltu $in2,$in1,$tmp1 + sltu $ctx,$tmp2,$ctx + addw $in2,$in2,$tmp2 + addw $tmp3,$tmp3,$ctx + sltu $in3,$in2,$tmp2 + sltu $ctx,$tmp3,$ctx + addw $in3,$in3,$tmp3 + addw $tmp4,$tmp4,$ctx + sltu $ctx,$in3,$tmp3 + addw $ctx,$ctx,$tmp4 + + srl $ctx,$ctx,2 # see if it carried/borrowed + sub $ctx,$zero,$ctx + + xor $in0,$in0,$tmp0 + xor $in1,$in1,$tmp1 + xor $in2,$in2,$tmp2 + xor $in3,$in3,$tmp3 + and $in0,$in0,$ctx + and $in1,$in1,$ctx + and $in2,$in2,$ctx + and $in3,$in3,$ctx + xor $in0,$in0,$tmp0 + xor $in1,$in1,$tmp1 + xor $in2,$in2,$tmp2 + xor $in3,$in3,$tmp3 + + lw $tmp0,0($nonce) # load nonce + lw $tmp1,4($nonce) + lw $tmp2,8($nonce) + lw $tmp3,12($nonce) + + addw $in0,$in0,$tmp0 # accumulate nonce + sltu $ctx,$in0,$tmp0 + + addw $in1,$in1,$tmp1 + sltu $tmp1,$in1,$tmp1 + addw $in1,$in1,$ctx + sltu $ctx,$in1,$ctx + addw $ctx,$ctx,$tmp1 + + addw $in2,$in2,$tmp2 + sltu $tmp2,$in2,$tmp2 + addw $in2,$in2,$ctx + sltu $ctx,$in2,$ctx + addw $ctx,$ctx,$tmp2 + + addw $in3,$in3,$tmp3 + addw $in3,$in3,$ctx + +#ifdef __riscv_misaligned_fast + sw $in0,0($mac) # write mac value + sw $in1,4($mac) + sw $in2,8($mac) + sw $in3,12($mac) +#else + srl $tmp0,$in0,8 # write mac value + srl $tmp1,$in0,16 + srl $tmp2,$in0,24 + sb $in0, 0($mac) + sb $tmp0,1($mac) + srl $tmp0,$in1,8 + sb $tmp1,2($mac) + srl $tmp1,$in1,16 + sb $tmp2,3($mac) + srl $tmp2,$in1,24 + sb $in1, 4($mac) + sb $tmp0,5($mac) + srl $tmp0,$in2,8 + sb $tmp1,6($mac) + srl $tmp1,$in2,16 + sb $tmp2,7($mac) + srl $tmp2,$in2,24 + sb $in2, 8($mac) + sb $tmp0,9($mac) + srl $tmp0,$in3,8 + sb $tmp1,10($mac) + srl $tmp1,$in3,16 + sb $tmp2,11($mac) + srl $tmp2,$in3,24 + sb $in3, 12($mac) + sb $tmp0,13($mac) + sb $tmp1,14($mac) + sb $tmp2,15($mac) +#endif + + ret +.size poly1305_emit,.-poly1305_emit +.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm" +___ +} +}}} + +foreach (split("\n", $code)) { + if ($flavour =~ /^cheri/) { + s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/; + s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or + s/\b(ret|jal)\b/c$1/; + s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or + m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g; + } else { + s/\bcaddi?\b/add/ or + s/\bcmove\b/mv/; + } + print $_, "\n"; +} + +close STDOUT; diff --git a/lib/crypto/riscv/poly1305.h b/lib/crypto/riscv/poly1305.h new file mode 100644 index 00000000000000..88f3df44e355ea --- /dev/null +++ b/lib/crypto/riscv/poly1305.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for riscv + * + * Copyright (C) 2025 Institute of Software, CAS. 
+ */ + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); From 8bc48d603c95917f48145f3a5a0fa25172fdba2c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:20 -0700 Subject: [PATCH 0594/3206] arm: configs: Remove obsolete assignments to CRYPTO_CHACHA20_NEON Since v6.15, CRYPTO_CHACHA20_NEON is a hidden option and is selected automatically. Therefore, assigning a value to it in a defconfig no longer has any effect. Remove it from all files that did this. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/configs/exynos_defconfig | 1 - arch/arm/configs/milbeaut_m10v_defconfig | 1 - arch/arm/configs/multi_v7_defconfig | 1 - arch/arm/configs/omap2plus_defconfig | 1 - 4 files changed, 4 deletions(-) diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig index 6915c766923a2f..84070e9698e8cc 100644 --- a/arch/arm/configs/exynos_defconfig +++ b/arch/arm/configs/exynos_defconfig @@ -364,7 +364,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_AES_ARM_BS=m -CONFIG_CRYPTO_CHACHA20_NEON=m CONFIG_CRYPTO_DEV_EXYNOS_RNG=y CONFIG_CRYPTO_DEV_S5P=y CONFIG_DMA_CMA=y diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig index a3be0b2ede09c7..a2995eb390c603 100644 --- a/arch/arm/configs/milbeaut_m10v_defconfig +++ b/arch/arm/configs/milbeaut_m10v_defconfig @@ -101,7 +101,6 @@ CONFIG_CRYPTO_GHASH_ARM_CE=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_AES_ARM_CE=m -CONFIG_CRYPTO_CHACHA20_NEON=m # CONFIG_CRYPTO_HW is not set CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=64 diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index f2822eeefb9577..cc0e0e4a879cb1 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -1291,7 +1291,6 @@ CONFIG_CRYPTO_GHASH_ARM_CE=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_AES_ARM_CE=m -CONFIG_CRYPTO_CHACHA20_NEON=m CONFIG_CRYPTO_DEV_SUN4I_SS=m CONFIG_CRYPTO_DEV_FSL_CAAM=m CONFIG_CRYPTO_DEV_EXYNOS_RNG=m diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 939913ed9a73bd..1d5f752417398c 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -708,7 +708,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=y CONFIG_CRYPTO_GHASH_ARM_CE=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m -CONFIG_CRYPTO_CHACHA20_NEON=m CONFIG_CRYPTO_DEV_OMAP=m CONFIG_CRYPTO_DEV_OMAP_SHAM=m CONFIG_CRYPTO_DEV_OMAP_AES=m From 9dd6bb667ecccba0e895c920473c14f374d925c5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:21 -0700 Subject: [PATCH 0595/3206] crypto: chacha - register only "-lib" drivers For the "chacha20", "xchacha20", and "xchacha12" skcipher algorithms, instead of registering "*-generic" drivers as well as conditionally registering "*-$(ARCH)" drivers, instead just register "*-lib" drivers. These just use the regular library functions, so they just do the right thing and are fully accelerated when supported by the CPU. 
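As a condensed sketch of the resulting registration (abbreviated from the
diff below; key/IV sizes, chunk sizes, and priorities are unchanged):

	/*
	 * One skcipher entry per algorithm.  chacha_crypt() picks the best
	 * implementation internally, so no -generic/-$(ARCH) split is needed;
	 * xchacha20-lib and xchacha12-lib follow the same pattern.
	 */
	static struct skcipher_alg algs[] = { {
		.base.cra_name		= "chacha20",
		.base.cra_driver_name	= "chacha20-lib",
		.setkey			= chacha20_setkey,
		.encrypt		= crypto_chacha_crypt,
		.decrypt		= crypto_chacha_crypt,
	}, /* ... */ };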
This eliminates the need for the ChaCha library to support chacha_crypt_generic() and hchacha_block_generic() as part of its external interface. A later commit will make chacha_crypt_generic() a static function. Since this commit removes several "*-generic" driver names which crypto/testmgr.c expects to exist, update testmgr.c accordingly. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 1 - crypto/chacha.c | 129 ++++++++--------------------------------------- crypto/testmgr.c | 9 +++- 3 files changed, 29 insertions(+), 110 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index e8ccf5f51b8553..09e8fb6ee0813f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -648,7 +648,6 @@ config CRYPTO_ARC4 config CRYPTO_CHACHA20 tristate "ChaCha" select CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_SKCIPHER help The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms diff --git a/crypto/chacha.c b/crypto/chacha.c index c3a11f4e2d13dc..ec16d5a33f3cd6 100644 --- a/crypto/chacha.c +++ b/crypto/chacha.c @@ -47,7 +47,7 @@ static int chacha12_setkey(struct crypto_skcipher *tfm, static int chacha_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, - const u8 iv[CHACHA_IV_SIZE], bool arch) + const u8 iv[CHACHA_IV_SIZE]) { struct skcipher_walk walk; struct chacha_state state; @@ -63,36 +63,23 @@ static int chacha_stream_xor(struct skcipher_request *req, if (nbytes < walk.total) nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE); - if (arch) - chacha_crypt(&state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, ctx->nrounds); - else - chacha_crypt_generic(&state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); + chacha_crypt(&state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } -static int crypto_chacha_crypt_generic(struct skcipher_request *req) +static int crypto_chacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - return chacha_stream_xor(req, ctx, req->iv, false); + return chacha_stream_xor(req, ctx, req->iv); } -static int crypto_chacha_crypt_arch(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - return chacha_stream_xor(req, ctx, req->iv, true); -} - -static int crypto_xchacha_crypt(struct skcipher_request *req, bool arch) +static int crypto_xchacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -102,10 +89,7 @@ static int crypto_xchacha_crypt(struct skcipher_request *req, bool arch) /* Compute the subkey given the original key and first 128 nonce bits */ chacha_init(&state, ctx->key, req->iv); - if (arch) - hchacha_block(&state, subctx.key, ctx->nrounds); - else - hchacha_block_generic(&state, subctx.key, ctx->nrounds); + hchacha_block(&state, subctx.key, ctx->nrounds); subctx.nrounds = ctx->nrounds; /* Build the real IV */ @@ -113,71 +97,13 @@ static int crypto_xchacha_crypt(struct skcipher_request *req, bool arch) memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */ /* Generate the stream and XOR it with the data */ - return chacha_stream_xor(req, &subctx, real_iv, arch); -} - -static int 
crypto_xchacha_crypt_generic(struct skcipher_request *req) -{ - return crypto_xchacha_crypt(req, false); -} - -static int crypto_xchacha_crypt_arch(struct skcipher_request *req) -{ - return crypto_xchacha_crypt(req, true); + return chacha_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { { .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-generic", - .base.cra_priority = 100, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = crypto_chacha_crypt_generic, - .decrypt = crypto_chacha_crypt_generic, - }, - { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-generic", - .base.cra_priority = 100, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = crypto_xchacha_crypt_generic, - .decrypt = crypto_xchacha_crypt_generic, - }, - { - .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-generic", - .base.cra_priority = 100, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha12_setkey, - .encrypt = crypto_xchacha_crypt_generic, - .decrypt = crypto_xchacha_crypt_generic, - }, - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-" __stringify(ARCH), + .base.cra_driver_name = "chacha20-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), @@ -188,12 +114,12 @@ static struct skcipher_alg algs[] = { .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, - .encrypt = crypto_chacha_crypt_arch, - .decrypt = crypto_chacha_crypt_arch, + .encrypt = crypto_chacha_crypt, + .decrypt = crypto_chacha_crypt, }, { .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-" __stringify(ARCH), + .base.cra_driver_name = "xchacha20-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), @@ -204,12 +130,12 @@ static struct skcipher_alg algs[] = { .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, - .encrypt = crypto_xchacha_crypt_arch, - .decrypt = crypto_xchacha_crypt_arch, + .encrypt = crypto_xchacha_crypt, + .decrypt = crypto_xchacha_crypt, }, { .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-" __stringify(ARCH), + .base.cra_driver_name = "xchacha12-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), @@ -220,27 +146,19 @@ static struct skcipher_alg algs[] = { .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha12_setkey, - .encrypt = crypto_xchacha_crypt_arch, - .decrypt = crypto_xchacha_crypt_arch, + .encrypt = crypto_xchacha_crypt, + .decrypt = crypto_xchacha_crypt, } }; -static unsigned int num_algs; - static int __init crypto_chacha_mod_init(void) { - /* register the arch flavours only if they differ from generic */ - num_algs = ARRAY_SIZE(algs); - BUILD_BUG_ON(ARRAY_SIZE(algs) % 2 != 0); - if 
(!chacha_is_arch_optimized()) - num_algs /= 2; - - return crypto_register_skciphers(algs, num_algs); + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit crypto_chacha_mod_fini(void) { - crypto_unregister_skciphers(algs, num_algs); + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(crypto_chacha_mod_init); @@ -250,11 +168,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi "); MODULE_DESCRIPTION("Crypto API wrappers for the ChaCha20, XChaCha20, and XChaCha12 stream ciphers"); MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-generic"); -MODULE_ALIAS_CRYPTO("chacha20-" __stringify(ARCH)); +MODULE_ALIAS_CRYPTO("chacha20-lib"); MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-generic"); -MODULE_ALIAS_CRYPTO("xchacha20-" __stringify(ARCH)); +MODULE_ALIAS_CRYPTO("xchacha20-lib"); MODULE_ALIAS_CRYPTO("xchacha12"); -MODULE_ALIAS_CRYPTO("xchacha12-generic"); -MODULE_ALIAS_CRYPTO("xchacha12-" __stringify(ARCH)); +MODULE_ALIAS_CRYPTO("xchacha12-lib"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index beab926ba102e5..781445f5f56a61 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4152,14 +4152,14 @@ static int alg_test_null(const struct alg_test_desc *desc, static const struct alg_test_desc alg_test_descs[] = { { .alg = "adiantum(xchacha12,aes)", - .generic_driver = "adiantum(xchacha12-generic,aes-generic,nhpoly1305-generic)", + .generic_driver = "adiantum(xchacha12-lib,aes-generic,nhpoly1305-generic)", .test = alg_test_skcipher, .suite = { .cipher = __VECS(adiantum_xchacha12_aes_tv_template) }, }, { .alg = "adiantum(xchacha20,aes)", - .generic_driver = "adiantum(xchacha20-generic,aes-generic,nhpoly1305-generic)", + .generic_driver = "adiantum(xchacha20-lib,aes-generic,nhpoly1305-generic)", .test = alg_test_skcipher, .suite = { .cipher = __VECS(adiantum_xchacha20_aes_tv_template) @@ -4485,6 +4485,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "chacha20", + .generic_driver = "chacha20-lib", .test = alg_test_skcipher, .suite = { .cipher = __VECS(chacha20_tv_template) @@ -5420,12 +5421,14 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "rfc7539(chacha20,poly1305)", + .generic_driver = "rfc7539(chacha20-lib,poly1305-generic)", .test = alg_test_aead, .suite = { .aead = __VECS(rfc7539_tv_template) } }, { .alg = "rfc7539esp(chacha20,poly1305)", + .generic_driver = "rfc7539esp(chacha20-lib,poly1305-generic)", .test = alg_test_aead, .suite = { .aead = { @@ -5591,12 +5594,14 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "xchacha12", + .generic_driver = "xchacha12-lib", .test = alg_test_skcipher, .suite = { .cipher = __VECS(xchacha12_tv_template) }, }, { .alg = "xchacha20", + .generic_driver = "xchacha20-lib", .test = alg_test_skcipher, .suite = { .cipher = __VECS(xchacha20_tv_template) From c4b846ff6ecab0427cc7dcccbe0af60b244a6d56 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:22 -0700 Subject: [PATCH 0596/3206] lib/crypto: chacha: Remove unused function chacha_is_arch_optimized() chacha_is_arch_optimized() is no longer used, so remove it. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/chacha.h | 9 --------- lib/crypto/arm/chacha-glue.c | 7 ------- lib/crypto/arm64/chacha-neon-glue.c | 6 ------ lib/crypto/mips/chacha-glue.c | 6 ------ lib/crypto/powerpc/chacha-p10-glue.c | 6 ------ lib/crypto/riscv/chacha-riscv64-glue.c | 6 ------ lib/crypto/s390/chacha-glue.c | 6 ------ lib/crypto/x86/chacha_glue.c | 6 ------ 8 files changed, 52 deletions(-) diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index 91f6b4cf561c76..be25a0b65a05ff 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -119,13 +119,4 @@ static inline void chacha_zeroize_state(struct chacha_state *state) memzero_explicit(state, sizeof(*state)); } -#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA) -bool chacha_is_arch_optimized(void); -#else -static inline bool chacha_is_arch_optimized(void) -{ - return false; -} -#endif - #endif /* _CRYPTO_CHACHA_H */ diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha-glue.c index 88ec9641528319..67ba045cae3597 100644 --- a/lib/crypto/arm/chacha-glue.c +++ b/lib/crypto/arm/chacha-glue.c @@ -101,13 +101,6 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - /* We always can use at least the ARM scalar implementation. */ - return true; -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - static int __init chacha_arm_mod_init(void) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha-neon-glue.c index d0188f974ca5c4..48097aa34af7c0 100644 --- a/lib/crypto/arm64/chacha-neon-glue.c +++ b/lib/crypto/arm64/chacha-neon-glue.c @@ -95,12 +95,6 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&have_neon); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - static int __init chacha_simd_mod_init(void) { if (cpu_have_named_feature(ASIMD)) diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha-glue.c index 88c097594eb0f0..f8390af21dc925 100644 --- a/lib/crypto/mips/chacha-glue.c +++ b/lib/crypto/mips/chacha-glue.c @@ -18,12 +18,6 @@ asmlinkage void hchacha_block_arch(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); EXPORT_SYMBOL(hchacha_block_arch); -bool chacha_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/powerpc/chacha-p10-glue.c b/lib/crypto/powerpc/chacha-p10-glue.c index fcd23c6f1590bb..5d3d5506d7f948 100644 --- a/lib/crypto/powerpc/chacha-p10-glue.c +++ b/lib/crypto/powerpc/chacha-p10-glue.c @@ -76,12 +76,6 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&have_p10); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - static int __init chacha_p10_init(void) { if (cpu_has_feature(CPU_FTR_ARCH_31)) diff --git a/lib/crypto/riscv/chacha-riscv64-glue.c b/lib/crypto/riscv/chacha-riscv64-glue.c index 8c3f11d79be31c..a15f0aca3fc43e 100644 --- a/lib/crypto/riscv/chacha-riscv64-glue.c +++ b/lib/crypto/riscv/chacha-riscv64-glue.c @@ 
-50,12 +50,6 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&use_zvkb); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - static int __init riscv64_chacha_mod_init(void) { if (riscv_isa_extension_available(NULL, ZVKB) && diff --git a/lib/crypto/s390/chacha-glue.c b/lib/crypto/s390/chacha-glue.c index c57dc851214fa3..d8137387fe28f1 100644 --- a/lib/crypto/s390/chacha-glue.c +++ b/lib/crypto/s390/chacha-glue.c @@ -47,11 +47,5 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return cpu_has_vx(); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)"); MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/x86/chacha_glue.c b/lib/crypto/x86/chacha_glue.c index 10b2c945f54127..de7da9d512af26 100644 --- a/lib/crypto/x86/chacha_glue.c +++ b/lib/crypto/x86/chacha_glue.c @@ -160,12 +160,6 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&chacha_use_simd); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - static int __init chacha_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_SSSE3)) From 20a1acb68d7a16481b70a693d49c2a42882f57a9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:23 -0700 Subject: [PATCH 0597/3206] lib/crypto: chacha: Rename chacha.c to chacha-block-generic.c Rename chacha.c to chacha-block-generic.c to free up the name chacha.c for the high-level API entry points (chacha_crypt() and hchacha_block()), similar to the other algorithms. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/Makefile | 4 ++-- lib/crypto/{chacha.c => chacha-block-generic.c} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename lib/crypto/{chacha.c => chacha-block-generic.c} (100%) diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index cd460e5e3dd240..e71c4bee831062 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -15,8 +15,8 @@ obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o libcryptoutils-y := memneq.o utils.o -# chacha is used by the /dev/random driver which is always builtin -obj-y += chacha.o +# chacha20_block() is used by the /dev/random driver which is always builtin +obj-y += chacha-block-generic.o obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha-block-generic.c similarity index 100% rename from lib/crypto/chacha.c rename to lib/crypto/chacha-block-generic.c From 1ae46b6eb5b9a97978fe12a71f5de53ab977297f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:24 -0700 Subject: [PATCH 0598/3206] lib/crypto: chacha: Rename libchacha.c to chacha.c Rename libchacha.c to chacha.c to make the naming consistent with other algorithms and allow additional source files to be added to the libchacha module. This file currently contains chacha_crypt_generic(), but it will soon be updated to contain chacha_crypt(). 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/Makefile | 1 + lib/crypto/{libchacha.c => chacha.c} | 0 2 files changed, 1 insertion(+) rename lib/crypto/{libchacha.c => chacha.c} (100%) diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index e71c4bee831062..a006048ba2bd72 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -18,6 +18,7 @@ libcryptoutils-y := memneq.o utils.o # chacha20_block() is used by the /dev/random driver which is always builtin obj-y += chacha-block-generic.o obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o +libchacha-y := chacha.o obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o libaes-y := aes.o diff --git a/lib/crypto/libchacha.c b/lib/crypto/chacha.c similarity index 100% rename from lib/crypto/libchacha.c rename to lib/crypto/chacha.c From 13cecc526d8fe7eeb9b136159738688a1a10cd82 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:25 -0700 Subject: [PATCH 0599/3206] lib/crypto: chacha: Consolidate into single module Consolidate the ChaCha code into a single module (excluding chacha-block-generic.c which remains always built-in for random.c), similar to various other algorithms: - Each arch now provides a header file lib/crypto/$(SRCARCH)/chacha.h, replacing lib/crypto/$(SRCARCH)/chacha*.c. The header defines chacha_crypt_arch() and hchacha_block_arch(). It is included by lib/crypto/chacha.c, and thus the code gets built into the single libchacha module, with improved inlining in some cases. - Whether arch-optimized ChaCha is buildable is now controlled centrally by lib/crypto/Kconfig instead of by lib/crypto/$(SRCARCH)/Kconfig. The conditions for enabling it remain the same as before, and it remains enabled by default. - Any additional arch-specific translation units for the optimized ChaCha code, such as assembly files, are now compiled by lib/crypto/Makefile instead of lib/crypto/$(SRCARCH)/Makefile. This removes the last use for the Makefile and Kconfig files in the arm64, mips, powerpc, riscv, and s390 subdirectories of lib/crypto/. So also remove those files and the references to them. 
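The dispatch that results from this consolidation is compact; condensed from
the lib/crypto/chacha.c hunk below (hchacha_block() and the optional
chacha_mod_init_arch() hook follow the same pattern):

	#ifdef CONFIG_CRYPTO_LIB_CHACHA_ARCH
	#include "chacha.h"	/* $(SRCARCH)/chacha.h, found via -I$(src)/$(SRCARCH) */
	#else
	#define chacha_crypt_arch chacha_crypt_generic
	#endif

	void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
	{
		chacha_crypt_arch(state, dst, src, bytes, nrounds);
	}
	EXPORT_SYMBOL_GPL(chacha_crypt);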
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/chacha.h | 28 ++--------- lib/crypto/Kconfig | 47 +++++-------------- lib/crypto/Makefile | 43 +++++++++++++---- lib/crypto/arm/Kconfig | 5 -- lib/crypto/arm/Makefile | 4 -- lib/crypto/arm/{chacha-glue.c => chacha.h} | 28 +++-------- lib/crypto/arm64/Kconfig | 8 ---- lib/crypto/arm64/Makefile | 4 -- .../arm64/{chacha-neon-glue.c => chacha.h} | 26 +++------- lib/crypto/chacha.c | 41 ++++++++++++++-- lib/crypto/mips/Kconfig | 7 --- lib/crypto/mips/Makefile | 5 -- lib/crypto/mips/{chacha-glue.c => chacha.h} | 11 +---- lib/crypto/powerpc/Kconfig | 8 ---- lib/crypto/powerpc/Makefile | 4 -- .../powerpc/{chacha-p10-glue.c => chacha.h} | 30 +++--------- lib/crypto/riscv/Kconfig | 8 ---- lib/crypto/riscv/Makefile | 4 -- .../riscv/{chacha-riscv64-glue.c => chacha.h} | 30 +++--------- lib/crypto/s390/Kconfig | 7 --- lib/crypto/s390/Makefile | 4 -- lib/crypto/s390/{chacha-glue.c => chacha.h} | 23 ++------- lib/crypto/x86/Kconfig | 7 --- lib/crypto/x86/Makefile | 3 -- lib/crypto/x86/{chacha_glue.c => chacha.h} | 30 ++++-------- 25 files changed, 126 insertions(+), 289 deletions(-) rename lib/crypto/arm/{chacha-glue.c => chacha.h} (79%) delete mode 100644 lib/crypto/arm64/Kconfig delete mode 100644 lib/crypto/arm64/Makefile rename lib/crypto/arm64/{chacha-neon-glue.c => chacha.h} (78%) delete mode 100644 lib/crypto/mips/Kconfig delete mode 100644 lib/crypto/mips/Makefile rename lib/crypto/mips/{chacha-glue.c => chacha.h} (55%) delete mode 100644 lib/crypto/powerpc/Kconfig delete mode 100644 lib/crypto/powerpc/Makefile rename lib/crypto/powerpc/{chacha-p10-glue.c => chacha.h} (66%) delete mode 100644 lib/crypto/riscv/Kconfig delete mode 100644 lib/crypto/riscv/Makefile rename lib/crypto/riscv/{chacha-riscv64-glue.c => chacha.h} (61%) delete mode 100644 lib/crypto/s390/Kconfig delete mode 100644 lib/crypto/s390/Makefile rename lib/crypto/s390/{chacha-glue.c => chacha.h} (55%) rename lib/crypto/x86/{chacha_glue.c => chacha.h} (87%) diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index be25a0b65a05ff..38e26dff27b00a 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -45,19 +45,11 @@ static inline void chacha20_block(struct chacha_state *state, chacha_block_generic(state, out, 20); } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); void hchacha_block_generic(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); -static inline void hchacha_block(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) - hchacha_block_arch(state, out, nrounds); - else - hchacha_block_generic(state, out, nrounds); -} +void hchacha_block(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); enum chacha_constants { /* expand 32-byte k */ CHACHA_CONSTANT_EXPA = 0x61707865U, @@ -93,20 +85,8 @@ static inline void chacha_init(struct chacha_state *state, state->x[15] = get_unaligned_le32(iv + 12); } -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); -void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); - -static inline void chacha_crypt(struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if 
(IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) - chacha_crypt_arch(state, dst, src, bytes, nrounds); - else - chacha_crypt_generic(state, dst, src, bytes, nrounds); -} +void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); static inline void chacha20_crypt(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes) diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index cb4e056a98faf4..c1db483bc23067 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -44,29 +44,23 @@ config CRYPTO_LIB_BLAKE2S_GENERIC implementation is enabled, this implementation serves the users of CRYPTO_LIB_BLAKE2S. -config CRYPTO_ARCH_HAVE_LIB_CHACHA - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the ChaCha library interface, - either builtin or as a module. - -config CRYPTO_LIB_CHACHA_GENERIC +config CRYPTO_LIB_CHACHA tristate - default CRYPTO_LIB_CHACHA if !CRYPTO_ARCH_HAVE_LIB_CHACHA select CRYPTO_LIB_UTILS help - This symbol can be selected by arch implementations of the ChaCha - library interface that require the generic code as a fallback, e.g., - for SIMD implementations. If no arch specific implementation is - enabled, this implementation serves the users of CRYPTO_LIB_CHACHA. + Enable the ChaCha library interface. Select this if your module uses + chacha_crypt() or hchacha_block(). -config CRYPTO_LIB_CHACHA - tristate - help - Enable the ChaCha library interface. This interface may be fulfilled - by either the generic implementation or an arch-specific one, if one - is available and enabled. +config CRYPTO_LIB_CHACHA_ARCH + bool + depends on CRYPTO_LIB_CHACHA && !UML && !KMSAN + default y if ARM + default y if ARM64 && KERNEL_MODE_NEON + default y if MIPS && CPU_MIPS32_R2 + default y if PPC64 && CPU_LITTLE_ENDIAN && VSX + default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if S390 + default y if X86_64 config CRYPTO_ARCH_HAVE_LIB_CURVE25519 bool @@ -218,21 +212,6 @@ if !KMSAN # avoid false positives from assembly if ARM source "lib/crypto/arm/Kconfig" endif -if ARM64 -source "lib/crypto/arm64/Kconfig" -endif -if MIPS -source "lib/crypto/mips/Kconfig" -endif -if PPC -source "lib/crypto/powerpc/Kconfig" -endif -if RISCV -source "lib/crypto/riscv/Kconfig" -endif -if S390 -source "lib/crypto/s390/Kconfig" -endif if X86 source "lib/crypto/x86/Kconfig" endif diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index a006048ba2bd72..baafd58b5dfd5a 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -15,11 +15,6 @@ obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o libcryptoutils-y := memneq.o utils.o -# chacha20_block() is used by the /dev/random driver which is always builtin -obj-y += chacha-block-generic.o -obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o -libchacha-y := chacha.o - obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o libaes-y := aes.o @@ -40,6 +35,39 @@ libblake2s-y := blake2s.o libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += blake2s-generic.o libblake2s-$(CONFIG_CRYPTO_SELFTESTS) += blake2s-selftest.o +################################################################################ + +# chacha20_block() is used by the /dev/random driver which is always builtin +obj-y += chacha-block-generic.o + +obj-$(CONFIG_CRYPTO_LIB_CHACHA) += libchacha.o +libchacha-y := chacha.o + +ifeq ($(CONFIG_CRYPTO_LIB_CHACHA_ARCH),y) +CFLAGS_chacha.o += -I$(src)/$(SRCARCH) + +ifeq 
($(CONFIG_ARM),y) +libchacha-y += arm/chacha-scalar-core.o +libchacha-$(CONFIG_KERNEL_MODE_NEON) += arm/chacha-neon-core.o +endif + +libchacha-$(CONFIG_ARM64) += arm64/chacha-neon-core.o + +ifeq ($(CONFIG_MIPS),y) +libchacha-y += mips/chacha-core.o +AFLAGS_mips/chacha-core.o += -O2 # needed to fill branch delay slots +endif + +libchacha-$(CONFIG_PPC) += powerpc/chacha-p10le-8x.o +libchacha-$(CONFIG_RISCV) += riscv/chacha-riscv64-zvkb.o +libchacha-$(CONFIG_S390) += s390/chacha-s390.o +libchacha-$(CONFIG_X86) += x86/chacha-ssse3-x86_64.o \ + x86/chacha-avx2-x86_64.o \ + x86/chacha-avx512vl-x86_64.o +endif # CONFIG_CRYPTO_LIB_CHACHA_ARCH + +################################################################################ + obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o libchacha20poly1305-y += chacha20poly1305.o libchacha20poly1305-$(CONFIG_CRYPTO_SELFTESTS) += chacha20poly1305-selftest.o @@ -231,11 +259,6 @@ obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o libsm3-y := sm3.o obj-$(CONFIG_ARM) += arm/ -obj-$(CONFIG_ARM64) += arm64/ -obj-$(CONFIG_MIPS) += mips/ -obj-$(CONFIG_PPC) += powerpc/ -obj-$(CONFIG_RISCV) += riscv/ -obj-$(CONFIG_S390) += s390/ obj-$(CONFIG_X86) += x86/ # clean-files must be defined unconditionally diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig index 0d821e282c645e..740341aa35d213 100644 --- a/lib/crypto/arm/Kconfig +++ b/lib/crypto/arm/Kconfig @@ -12,8 +12,3 @@ config CRYPTO_BLAKE2S_ARM BLAKE2b, but slower than the NEON implementation of BLAKE2b. There is no NEON implementation of BLAKE2s, since NEON doesn't really help with it. - -config CRYPTO_CHACHA20_NEON - tristate - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile index 9f70e61d419e21..0574b0e9739e31 100644 --- a/lib/crypto/arm/Makefile +++ b/lib/crypto/arm/Makefile @@ -2,7 +2,3 @@ obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o libblake2s-arm-y := blake2s-core.o blake2s-glue.o - -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-scalar-core.o chacha-glue.o -chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha.h similarity index 79% rename from lib/crypto/arm/chacha-glue.c rename to lib/crypto/arm/chacha.h index 67ba045cae3597..0cae30f8ee5d15 100644 --- a/lib/crypto/arm/chacha-glue.c +++ b/lib/crypto/arm/chacha.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * ChaCha and HChaCha functions (ARM optimized) * @@ -6,11 +6,9 @@ * Copyright (C) 2015 Martin Willi */ -#include #include #include #include -#include #include #include @@ -64,8 +62,8 @@ static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, } } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) +static void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) { if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { hchacha_block_arm(state, out, nrounds); @@ -75,10 +73,9 @@ void hchacha_block_arch(const struct chacha_state *state, kernel_neon_end(); } } -EXPORT_SYMBOL(hchacha_block_arch); -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || bytes <= 
CHACHA_BLOCK_SIZE) { @@ -99,9 +96,9 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, dst += todo; } while (bytes); } -EXPORT_SYMBOL(chacha_crypt_arch); -static int __init chacha_arm_mod_init(void) +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { switch (read_cpuid_part()) { @@ -117,15 +114,4 @@ static int __init chacha_arm_mod_init(void) static_branch_enable(&use_neon); } } - return 0; } -subsys_initcall(chacha_arm_mod_init); - -static void __exit chacha_arm_mod_exit(void) -{ -} -module_exit(chacha_arm_mod_exit); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig deleted file mode 100644 index 07c8a4f0ab03a4..00000000000000 --- a/lib/crypto/arm64/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA20_NEON - tristate - depends on KERNEL_MODE_NEON - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile deleted file mode 100644 index d49cceca3d1ca7..00000000000000 --- a/lib/crypto/arm64/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha.h similarity index 78% rename from lib/crypto/arm64/chacha-neon-glue.c rename to lib/crypto/arm64/chacha.h index 48097aa34af7c0..ba6c22d4608633 100644 --- a/lib/crypto/arm64/chacha-neon-glue.c +++ b/lib/crypto/arm64/chacha.h @@ -18,11 +18,9 @@ * (at your option) any later version. 
*/ -#include #include #include #include -#include #include #include @@ -61,8 +59,8 @@ static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, } } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) +static void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) { if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { hchacha_block_generic(state, out, nrounds); @@ -72,10 +70,9 @@ void hchacha_block_arch(const struct chacha_state *state, kernel_neon_end(); } } -EXPORT_SYMBOL(hchacha_block_arch); -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) @@ -93,21 +90,10 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, dst += todo; } while (bytes); } -EXPORT_SYMBOL(chacha_crypt_arch); -static int __init chacha_simd_mod_init(void) +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) { if (cpu_have_named_feature(ASIMD)) static_branch_enable(&have_neon); - return 0; } -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} -module_exit(chacha_simd_mod_exit); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c index 26862ad90a9640..e0c7cb4af31800 100644 --- a/lib/crypto/chacha.c +++ b/lib/crypto/chacha.c @@ -11,8 +11,9 @@ #include #include -void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void __maybe_unused +chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) { /* aligned to potentially speed up crypto_xor() */ u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long)); @@ -29,7 +30,41 @@ void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, crypto_xor_cpy(dst, src, stream, bytes); } } -EXPORT_SYMBOL(chacha_crypt_generic); + +#ifdef CONFIG_CRYPTO_LIB_CHACHA_ARCH +#include "chacha.h" /* $(SRCARCH)/chacha.h */ +#else +#define chacha_crypt_arch chacha_crypt_generic +#define hchacha_block_arch hchacha_block_generic +#endif + +void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + chacha_crypt_arch(state, dst, src, bytes, nrounds); +} +EXPORT_SYMBOL_GPL(chacha_crypt); + +void hchacha_block(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + hchacha_block_arch(state, out, nrounds); +} +EXPORT_SYMBOL_GPL(hchacha_block); + +#ifdef chacha_mod_init_arch +static int __init chacha_mod_init(void) +{ + chacha_mod_init_arch(); + return 0; +} +subsys_initcall(chacha_mod_init); + +static void __exit chacha_mod_exit(void) +{ +} +module_exit(chacha_mod_exit); +#endif MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig deleted file mode 100644 index 94c1a0892c203b..00000000000000 --- a/lib/crypto/mips/Kconfig +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_MIPS - tristate - depends on CPU_MIPS32_R2 - default CRYPTO_LIB_CHACHA - 
select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile deleted file mode 100644 index b5ea0e25c21ef5..00000000000000 --- a/lib/crypto/mips/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o -chacha-mips-y := chacha-core.o chacha-glue.o -AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha.h similarity index 55% rename from lib/crypto/mips/chacha-glue.c rename to lib/crypto/mips/chacha.h index f8390af21dc925..0c18c0dc2a4066 100644 --- a/lib/crypto/mips/chacha-glue.c +++ b/lib/crypto/mips/chacha.h @@ -1,23 +1,14 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * ChaCha and HChaCha functions (MIPS optimized) * * Copyright (C) 2019 Linaro, Ltd. */ -#include #include -#include asmlinkage void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds); -EXPORT_SYMBOL(chacha_crypt_arch); - asmlinkage void hchacha_block_arch(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); -EXPORT_SYMBOL(hchacha_block_arch); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig deleted file mode 100644 index e41012a61876ee..00000000000000 --- a/lib/crypto/powerpc/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA20_P10 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN && VSX - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile deleted file mode 100644 index 778a04edd226cf..00000000000000 --- a/lib/crypto/powerpc/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o -chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o diff --git a/lib/crypto/powerpc/chacha-p10-glue.c b/lib/crypto/powerpc/chacha.h similarity index 66% rename from lib/crypto/powerpc/chacha-p10-glue.c rename to lib/crypto/powerpc/chacha.h index 5d3d5506d7f948..1df6e1ce31c460 100644 --- a/lib/crypto/powerpc/chacha-p10-glue.c +++ b/lib/crypto/powerpc/chacha.h @@ -1,14 +1,12 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ChaCha stream cipher (P10 accelerated) * * Copyright 2023- IBM Corp. All rights reserved. 
*/ -#include #include #include -#include #include #include #include @@ -48,15 +46,10 @@ static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src, chacha_crypt_generic(state, dst, src, bytes, nrounds); } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); +#define hchacha_block_arch hchacha_block_generic /* not implemented yet */ -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) @@ -74,21 +67,10 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, dst += todo; } while (bytes); } -EXPORT_SYMBOL(chacha_crypt_arch); -static int __init chacha_p10_init(void) +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) { if (cpu_has_feature(CPU_FTR_ARCH_31)) static_branch_enable(&have_p10); - return 0; -} -subsys_initcall(chacha_p10_init); - -static void __exit chacha_p10_exit(void) -{ } -module_exit(chacha_p10_exit); - -MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)"); -MODULE_AUTHOR("Danny Tsen "); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig deleted file mode 100644 index bc7a43f33eb3a3..00000000000000 --- a/lib/crypto/riscv/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_RISCV64 - tristate - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC diff --git a/lib/crypto/riscv/Makefile b/lib/crypto/riscv/Makefile deleted file mode 100644 index e27b78f317fc8e..00000000000000 --- a/lib/crypto/riscv/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o -chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o diff --git a/lib/crypto/riscv/chacha-riscv64-glue.c b/lib/crypto/riscv/chacha.h similarity index 61% rename from lib/crypto/riscv/chacha-riscv64-glue.c rename to lib/crypto/riscv/chacha.h index a15f0aca3fc43e..5c000c6aef4be1 100644 --- a/lib/crypto/riscv/chacha-riscv64-glue.c +++ b/lib/crypto/riscv/chacha.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +/* SPDX-License-Identifier: GPL-2.0-only */ /* * ChaCha stream cipher (RISC-V optimized) * @@ -8,25 +8,18 @@ #include #include -#include #include #include -#include static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb); asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, size_t nblocks, int nrounds); -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); +#define hchacha_block_arch hchacha_block_generic /* not implemented yet */ -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { u8 block_buffer[CHACHA_BLOCK_SIZE]; unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE; @@ -48,22 +41,11 @@ void 
chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } kernel_vector_end(); } -EXPORT_SYMBOL(chacha_crypt_arch); -static int __init riscv64_chacha_mod_init(void) +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) { if (riscv_isa_extension_available(NULL, ZVKB) && riscv_vector_vlen() >= 128) static_branch_enable(&use_zvkb); - return 0; -} -subsys_initcall(riscv64_chacha_mod_init); - -static void __exit riscv64_chacha_mod_exit(void) -{ } -module_exit(riscv64_chacha_mod_exit); - -MODULE_DESCRIPTION("ChaCha stream cipher (RISC-V optimized)"); -MODULE_AUTHOR("Jerry Shih "); -MODULE_LICENSE("GPL"); diff --git a/lib/crypto/s390/Kconfig b/lib/crypto/s390/Kconfig deleted file mode 100644 index 069b355fe51aa9..00000000000000 --- a/lib/crypto/s390/Kconfig +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_S390 - tristate - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/s390/Makefile b/lib/crypto/s390/Makefile deleted file mode 100644 index 06c2cf77178ef2..00000000000000 --- a/lib/crypto/s390/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o -chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/lib/crypto/s390/chacha-glue.c b/lib/crypto/s390/chacha.h similarity index 55% rename from lib/crypto/s390/chacha-glue.c rename to lib/crypto/s390/chacha.h index d8137387fe28f1..fd9c4a42236568 100644 --- a/lib/crypto/s390/chacha-glue.c +++ b/lib/crypto/s390/chacha.h @@ -1,32 +1,21 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * ChaCha stream cipher (s390 optimized) * * Copyright IBM Corp. 
2021 */ -#define KMSG_COMPONENT "chacha_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include #include #include #include -#include #include #include #include "chacha-s390.h" -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - /* TODO: implement hchacha_block_arch() in assembly */ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); +#define hchacha_block_arch hchacha_block_generic /* not implemented yet */ -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { /* s390 chacha20 implementation has 20 rounds hard-coded, * it cannot handle a block of data or less, but otherwise @@ -45,7 +34,3 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, CHACHA_BLOCK_SIZE; } } -EXPORT_SYMBOL(chacha_crypt_arch); - -MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig index 24dc9a59b27289..eb47da71aa6b6d 100644 --- a/lib/crypto/x86/Kconfig +++ b/lib/crypto/x86/Kconfig @@ -11,10 +11,3 @@ config CRYPTO_BLAKE2S_X86 Architecture: x86_64 using: - SSSE3 (Supplemental SSE3) - AVX-512 (Advanced Vector Extensions-512) - -config CRYPTO_CHACHA20_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile index 16c9d76f994729..4454556b243e71 100644 --- a/lib/crypto/x86/Makefile +++ b/lib/crypto/x86/Makefile @@ -2,6 +2,3 @@ obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o - -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o -chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o diff --git a/lib/crypto/x86/chacha_glue.c b/lib/crypto/x86/chacha.h similarity index 87% rename from lib/crypto/x86/chacha_glue.c rename to lib/crypto/x86/chacha.h index de7da9d512af26..10cf8f1c569dca 100644 --- a/lib/crypto/x86/chacha_glue.c +++ b/lib/crypto/x86/chacha.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ChaCha and HChaCha functions (x86_64 optimized) * @@ -6,10 +6,8 @@ */ #include -#include #include #include -#include #include asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, @@ -126,8 +124,8 @@ static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, } } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) +static void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) { if (!static_branch_likely(&chacha_use_simd)) { hchacha_block_generic(state, out, nrounds); @@ -137,10 +135,9 @@ void hchacha_block_arch(const struct chacha_state *state, kernel_fpu_end(); } } -EXPORT_SYMBOL(hchacha_block_arch); -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { if (!static_branch_likely(&chacha_use_simd) || bytes <= CHACHA_BLOCK_SIZE) @@ -158,12 +155,12 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, 
dst += todo; } while (bytes); } -EXPORT_SYMBOL(chacha_crypt_arch); -static int __init chacha_simd_mod_init(void) +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) { if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; + return; static_branch_enable(&chacha_use_simd); @@ -176,15 +173,4 @@ static int __init chacha_simd_mod_init(void) boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ static_branch_enable(&chacha_use_avx512vl); } - return 0; } -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} -module_exit(chacha_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); From 453eda46b7f807f6fc4283f9639085697100ec08 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:26 -0700 Subject: [PATCH 0600/3206] lib/crypto: x86/blake2s: Reduce size of BLAKE2S_SIGMA2 Save 480 bytes of .rodata by replacing the .long constants with .bytes, and using the vpmovzxbd instruction to expand them. Also update the code to do the loads before incrementing %rax rather than after. This avoids the need for the first load to use an offset. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/blake2s-core.S | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S index ac1c845445a4d5..ef8e9f427aab34 100644 --- a/lib/crypto/x86/blake2s-core.S +++ b/lib/crypto/x86/blake2s-core.S @@ -29,19 +29,19 @@ SIGMA: .byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 .byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 .byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 -.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 160 .align 64 SIGMA2: -.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 -.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 -.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 -.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 -.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 -.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 -.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 -.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 -.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 +.byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 +.byte 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 +.byte 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 +.byte 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 +.byte 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 +.byte 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 +.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 +.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 .text SYM_FUNC_START(blake2s_compress_ssse3) @@ -193,9 +193,9 @@ SYM_FUNC_START(blake2s_compress_avx512) leaq SIGMA2(%rip),%rax movb $0xa,%cl .Lblake2s_compress_avx512_roundloop: - addq $0x40,%rax - vmovdqa -0x40(%rax),%ymm8 - vmovdqa -0x20(%rax),%ymm9 + vpmovzxbd (%rax),%ymm8 + vpmovzxbd 0x8(%rax),%ymm9 + addq $0x10,%rax vpermi2d %ymm7,%ymm6,%ymm8 
vpermi2d %ymm7,%ymm6,%ymm9 vmovdqa %ymm8,%ymm6 From 126f5d90f6c855b39eebec17f93c2f9d2ce01ebb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:27 -0700 Subject: [PATCH 0601/3206] lib/crypto: blake2s: Remove obsolete self-test Remove the original BLAKE2s self-test, since it will be superseded by blake2s_kunit. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-9-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/internal/blake2s.h | 2 - lib/crypto/Makefile | 1 - lib/crypto/blake2s-selftest.c | 651 ------------------------------ lib/crypto/blake2s.c | 10 - 4 files changed, 664 deletions(-) delete mode 100644 lib/crypto/blake2s-selftest.c diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h index 506d56530ca93f..3b82572c943375 100644 --- a/include/crypto/internal/blake2s.h +++ b/include/crypto/internal/blake2s.h @@ -16,6 +16,4 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, void blake2s_compress(struct blake2s_state *state, const u8 *block, size_t nblocks, const u32 inc); -bool blake2s_selftest(void); - #endif /* _CRYPTO_INTERNAL_BLAKE2S_H */ diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index baafd58b5dfd5a..b2d2745879d1dc 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o obj-y += libblake2s.o libblake2s-y := blake2s.o libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += blake2s-generic.o -libblake2s-$(CONFIG_CRYPTO_SELFTESTS) += blake2s-selftest.o ################################################################################ diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c deleted file mode 100644 index d0634ed6a937fb..00000000000000 --- a/lib/crypto/blake2s-selftest.c +++ /dev/null @@ -1,651 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. 
- */ - -#include -#include -#include -#include - -/* - * blake2s_testvecs[] generated with the program below (using libb2-dev and - * libssl-dev [OpenSSL]) - * - * #include - * #include - * #include - * - * #include - * - * #define BLAKE2S_TESTVEC_COUNT 256 - * - * static void print_vec(const uint8_t vec[], int len) - * { - * int i; - * - * printf(" { "); - * for (i = 0; i < len; i++) { - * if (i && (i % 12) == 0) - * printf("\n "); - * printf("0x%02x, ", vec[i]); - * } - * printf("},\n"); - * } - * - * int main(void) - * { - * uint8_t key[BLAKE2S_KEYBYTES]; - * uint8_t buf[BLAKE2S_TESTVEC_COUNT]; - * uint8_t hash[BLAKE2S_OUTBYTES]; - * int i, j; - * - * key[0] = key[1] = 1; - * for (i = 2; i < BLAKE2S_KEYBYTES; ++i) - * key[i] = key[i - 2] + key[i - 1]; - * - * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) - * buf[i] = (uint8_t)i; - * - * printf("static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n"); - * - * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) { - * int outlen = 1 + i % BLAKE2S_OUTBYTES; - * int keylen = (13 * i) % (BLAKE2S_KEYBYTES + 1); - * - * blake2s(hash, buf, key + BLAKE2S_KEYBYTES - keylen, outlen, i, - * keylen); - * print_vec(hash, outlen); - * } - * printf("};\n\n"); - * - * return 0; - *} - */ -static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = { - { 0xa1, }, - { 0x7c, 0x89, }, - { 0x74, 0x0e, 0xd4, }, - { 0x47, 0x0c, 0x21, 0x15, }, - { 0x18, 0xd6, 0x9c, 0xa6, 0xc4, }, - { 0x13, 0x5d, 0x16, 0x63, 0x2e, 0xf9, }, - { 0x2c, 0xb5, 0x04, 0xb7, 0x99, 0xe2, 0x73, }, - { 0x9a, 0x0f, 0xd2, 0x39, 0xd6, 0x68, 0x1b, 0x92, }, - { 0xc8, 0xde, 0x7a, 0xea, 0x2f, 0xf4, 0xd2, 0xe3, 0x2b, }, - { 0x5b, 0xf9, 0x43, 0x52, 0x0c, 0x12, 0xba, 0xb5, 0x93, 0x9f, }, - { 0xc6, 0x2c, 0x4e, 0x80, 0xfc, 0x32, 0x5b, 0x33, 0xb8, 0xb8, 0x0a, }, - { 0xa7, 0x5c, 0xfd, 0x3a, 0xcc, 0xbf, 0x90, 0xca, 0xb7, 0x97, 0xde, 0xd8, }, - { 0x66, 0xca, 0x3c, 0xc4, 0x19, 0xef, 0x92, 0x66, 0x3f, 0x21, 0x8f, 0xda, - 0xb7, }, - { 0xba, 0xe5, 0xbb, 0x30, 0x25, 0x94, 0x6d, 0xc3, 0x89, 0x09, 0xc4, 0x25, - 0x52, 0x3e, }, - { 0xa2, 0xef, 0x0e, 0x52, 0x0b, 0x5f, 0xa2, 0x01, 0x6d, 0x0a, 0x25, 0xbc, - 0x57, 0xe2, 0x27, }, - { 0x4f, 0xe0, 0xf9, 0x52, 0x12, 0xda, 0x84, 0xb7, 0xab, 0xae, 0xb0, 0xa6, - 0x47, 0x2a, 0xc7, 0xf5, }, - { 0x56, 0xe7, 0xa8, 0x1c, 0x4c, 0xca, 0xed, 0x90, 0x31, 0xec, 0x87, 0x43, - 0xe7, 0x72, 0x08, 0xec, 0xbe, }, - { 0x7e, 0xdf, 0x80, 0x1c, 0x93, 0x33, 0xfd, 0x53, 0x44, 0xba, 0xfd, 0x96, - 0xe1, 0xbb, 0xb5, 0x65, 0xa5, 0x00, }, - { 0xec, 0x6b, 0xed, 0xf7, 0x7b, 0x62, 0x1d, 0x7d, 0xf4, 0x82, 0xf3, 0x1e, - 0x18, 0xff, 0x2b, 0xc4, 0x06, 0x20, 0x2a, }, - { 0x74, 0x98, 0xd7, 0x68, 0x63, 0xed, 0x87, 0xe4, 0x5d, 0x8d, 0x9e, 0x1d, - 0xfd, 0x2a, 0xbb, 0x86, 0xac, 0xe9, 0x2a, 0x89, }, - { 0x89, 0xc3, 0x88, 0xce, 0x2b, 0x33, 0x1e, 0x10, 0xd1, 0x37, 0x20, 0x86, - 0x28, 0x43, 0x70, 0xd9, 0xfb, 0x96, 0xd9, 0xb5, 0xd3, }, - { 0xcb, 0x56, 0x74, 0x41, 0x8d, 0x80, 0x01, 0x9a, 0x6b, 0x38, 0xe1, 0x41, - 0xad, 0x9c, 0x62, 0x74, 0xce, 0x35, 0xd5, 0x6c, 0x89, 0x6e, }, - { 0x79, 0xaf, 0x94, 0x59, 0x99, 0x26, 0xe1, 0xc9, 0x34, 0xfe, 0x7c, 0x22, - 0xf7, 0x43, 0xd7, 0x65, 0xd4, 0x48, 0x18, 0xac, 0x3d, 0xfd, 0x93, }, - { 0x85, 0x0d, 0xff, 0xb8, 0x3e, 0x87, 0x41, 0xb0, 0x95, 0xd3, 0x3d, 0x00, - 0x47, 0x55, 0x9e, 0xd2, 0x69, 0xea, 0xbf, 0xe9, 0x7a, 0x2d, 0x61, 0x45, }, - { 0x03, 0xe0, 0x85, 0xec, 0x54, 0xb5, 0x16, 0x53, 0xa8, 0xc4, 0x71, 0xe9, - 0x6a, 0xe7, 0xcb, 0xc4, 0x15, 0x02, 0xfc, 0x34, 0xa4, 0xa4, 0x28, 0x13, - 0xd1, }, - { 0xe3, 0x34, 0x4b, 0xe1, 0xd0, 0x4b, 0x55, 0x61, 0x8f, 0xc0, 0x24, 0x05, - 
0xe6, 0xe0, 0x3d, 0x70, 0x24, 0x4d, 0xda, 0xb8, 0x91, 0x05, 0x29, 0x07, - 0x01, 0x3e, }, - { 0x61, 0xff, 0x01, 0x72, 0xb1, 0x4d, 0xf6, 0xfe, 0xd1, 0xd1, 0x08, 0x74, - 0xe6, 0x91, 0x44, 0xeb, 0x61, 0xda, 0x40, 0xaf, 0xfc, 0x8c, 0x91, 0x6b, - 0xec, 0x13, 0xed, }, - { 0xd4, 0x40, 0xd2, 0xa0, 0x7f, 0xc1, 0x58, 0x0c, 0x85, 0xa0, 0x86, 0xc7, - 0x86, 0xb9, 0x61, 0xc9, 0xea, 0x19, 0x86, 0x1f, 0xab, 0x07, 0xce, 0x37, - 0x72, 0x67, 0x09, 0xfc, }, - { 0x9e, 0xf8, 0x18, 0x67, 0x93, 0x10, 0x9b, 0x39, 0x75, 0xe8, 0x8b, 0x38, - 0x82, 0x7d, 0xb8, 0xb7, 0xa5, 0xaf, 0xe6, 0x6a, 0x22, 0x5e, 0x1f, 0x9c, - 0x95, 0x29, 0x19, 0xf2, 0x4b, }, - { 0xc8, 0x62, 0x25, 0xf5, 0x98, 0xc9, 0xea, 0xe5, 0x29, 0x3a, 0xd3, 0x22, - 0xeb, 0xeb, 0x07, 0x7c, 0x15, 0x07, 0xee, 0x15, 0x61, 0xbb, 0x05, 0x30, - 0x99, 0x7f, 0x11, 0xf6, 0x0a, 0x1d, }, - { 0x68, 0x70, 0xf7, 0x90, 0xa1, 0x8b, 0x1f, 0x0f, 0xbb, 0xce, 0xd2, 0x0e, - 0x33, 0x1f, 0x7f, 0xa9, 0x78, 0xa8, 0xa6, 0x81, 0x66, 0xab, 0x8d, 0xcd, - 0x58, 0x55, 0x3a, 0x0b, 0x7a, 0xdb, 0xb5, }, - { 0xdd, 0x35, 0xd2, 0xb4, 0xf6, 0xc7, 0xea, 0xab, 0x64, 0x24, 0x4e, 0xfe, - 0xe5, 0x3d, 0x4e, 0x95, 0x8b, 0x6d, 0x6c, 0xbc, 0xb0, 0xf8, 0x88, 0x61, - 0x09, 0xb7, 0x78, 0xa3, 0x31, 0xfe, 0xd9, 0x2f, }, - { 0x0a, }, - { 0x6e, 0xd4, }, - { 0x64, 0xe9, 0xd1, }, - { 0x30, 0xdd, 0x71, 0xef, }, - { 0x11, 0xb5, 0x0c, 0x87, 0xc9, }, - { 0x06, 0x1c, 0x6d, 0x04, 0x82, 0xd0, }, - { 0x5c, 0x42, 0x0b, 0xee, 0xc5, 0x9c, 0xb2, }, - { 0xe8, 0x29, 0xd6, 0xb4, 0x5d, 0xf7, 0x2b, 0x93, }, - { 0x18, 0xca, 0x27, 0x72, 0x43, 0x39, 0x16, 0xbc, 0x6a, }, - { 0x39, 0x8f, 0xfd, 0x64, 0xf5, 0x57, 0x23, 0xb0, 0x45, 0xf8, }, - { 0xbb, 0x3a, 0x78, 0x6b, 0x02, 0x1d, 0x0b, 0x16, 0xe3, 0xb2, 0x9a, }, - { 0xb8, 0xb4, 0x0b, 0xe5, 0xd4, 0x1d, 0x0d, 0x85, 0x49, 0x91, 0x35, 0xfa, }, - { 0x6d, 0x48, 0x2a, 0x0c, 0x42, 0x08, 0xbd, 0xa9, 0x78, 0x6f, 0x18, 0xaf, - 0xe2, }, - { 0x10, 0x45, 0xd4, 0x58, 0x88, 0xec, 0x4e, 0x1e, 0xf6, 0x14, 0x92, 0x64, - 0x7e, 0xb0, }, - { 0x8b, 0x0b, 0x95, 0xee, 0x92, 0xc6, 0x3b, 0x91, 0xf1, 0x1e, 0xeb, 0x51, - 0x98, 0x0a, 0x8d, }, - { 0xa3, 0x50, 0x4d, 0xa5, 0x1d, 0x03, 0x68, 0xe9, 0x57, 0x78, 0xd6, 0x04, - 0xf1, 0xc3, 0x94, 0xd8, }, - { 0xb8, 0x66, 0x6e, 0xdd, 0x46, 0x15, 0xae, 0x3d, 0x83, 0x7e, 0xcf, 0xe7, - 0x2c, 0xe8, 0x8f, 0xc7, 0x34, }, - { 0x2e, 0xc0, 0x1f, 0x29, 0xea, 0xf6, 0xb9, 0xe2, 0xc2, 0x93, 0xeb, 0x41, - 0x0d, 0xf0, 0x0a, 0x13, 0x0e, 0xa2, }, - { 0x71, 0xb8, 0x33, 0xa9, 0x1b, 0xac, 0xf1, 0xb5, 0x42, 0x8f, 0x5e, 0x81, - 0x34, 0x43, 0xb7, 0xa4, 0x18, 0x5c, 0x47, }, - { 0xda, 0x45, 0xb8, 0x2e, 0x82, 0x1e, 0xc0, 0x59, 0x77, 0x9d, 0xfa, 0xb4, - 0x1c, 0x5e, 0xa0, 0x2b, 0x33, 0x96, 0x5a, 0x58, }, - { 0xe3, 0x09, 0x05, 0xa9, 0xeb, 0x48, 0x13, 0xad, 0x71, 0x88, 0x81, 0x9a, - 0x3e, 0x2c, 0xe1, 0x23, 0x99, 0x13, 0x35, 0x9f, 0xb5, }, - { 0xb7, 0x86, 0x2d, 0x16, 0xe1, 0x04, 0x00, 0x47, 0x47, 0x61, 0x31, 0xfb, - 0x14, 0xac, 0xd8, 0xe9, 0xe3, 0x49, 0xbd, 0xf7, 0x9c, 0x3f, }, - { 0x7f, 0xd9, 0x95, 0xa8, 0xa7, 0xa0, 0xcc, 0xba, 0xef, 0xb1, 0x0a, 0xa9, - 0x21, 0x62, 0x08, 0x0f, 0x1b, 0xff, 0x7b, 0x9d, 0xae, 0xb2, 0x95, }, - { 0x85, 0x99, 0xea, 0x33, 0xe0, 0x56, 0xff, 0x13, 0xc6, 0x61, 0x8c, 0xf9, - 0x57, 0x05, 0x03, 0x11, 0xf9, 0xfb, 0x3a, 0xf7, 0xce, 0xbb, 0x52, 0x30, }, - { 0xb2, 0x72, 0x9c, 0xf8, 0x77, 0x4e, 0x8f, 0x6b, 0x01, 0x6c, 0xff, 0x4e, - 0x4f, 0x02, 0xd2, 0xbc, 0xeb, 0x51, 0x28, 0x99, 0x50, 0xab, 0xc4, 0x42, - 0xe3, }, - { 0x8b, 0x0a, 0xb5, 0x90, 0x8f, 0xf5, 0x7b, 0xdd, 0xba, 0x47, 0x37, 0xc9, - 0x2a, 0xd5, 0x4b, 0x25, 0x08, 0x8b, 0x02, 0x17, 0xa7, 0x9e, 0x6b, 0x6e, - 0xe3, 0x90, }, - { 0x90, 0xdd, 
0xf7, 0x75, 0xa7, 0xa3, 0x99, 0x5e, 0x5b, 0x7d, 0x75, 0xc3, - 0x39, 0x6b, 0xa0, 0xe2, 0x44, 0x53, 0xb1, 0x9e, 0xc8, 0xf1, 0x77, 0x10, - 0x58, 0x06, 0x9a, }, - { 0x99, 0x52, 0xf0, 0x49, 0xa8, 0x8c, 0xec, 0xa6, 0x97, 0x32, 0x13, 0xb5, - 0xf7, 0xa3, 0x8e, 0xfb, 0x4b, 0x59, 0x31, 0x3d, 0x01, 0x59, 0x98, 0x5d, - 0x53, 0x03, 0x1a, 0x39, }, - { 0x9f, 0xe0, 0xc2, 0xe5, 0x5d, 0x93, 0xd6, 0x9b, 0x47, 0x8f, 0x9b, 0xe0, - 0x26, 0x35, 0x84, 0x20, 0x1d, 0xc5, 0x53, 0x10, 0x0f, 0x22, 0xb9, 0xb5, - 0xd4, 0x36, 0xb1, 0xac, 0x73, }, - { 0x30, 0x32, 0x20, 0x3b, 0x10, 0x28, 0xec, 0x1f, 0x4f, 0x9b, 0x47, 0x59, - 0xeb, 0x7b, 0xee, 0x45, 0xfb, 0x0c, 0x49, 0xd8, 0x3d, 0x69, 0xbd, 0x90, - 0x2c, 0xf0, 0x9e, 0x8d, 0xbf, 0xd5, }, - { 0x2a, 0x37, 0x73, 0x7f, 0xf9, 0x96, 0x19, 0xaa, 0x25, 0xd8, 0x13, 0x28, - 0x01, 0x29, 0x89, 0xdf, 0x6e, 0x0c, 0x9b, 0x43, 0x44, 0x51, 0xe9, 0x75, - 0x26, 0x0c, 0xb7, 0x87, 0x66, 0x0b, 0x5f, }, - { 0x23, 0xdf, 0x96, 0x68, 0x91, 0x86, 0xd0, 0x93, 0x55, 0x33, 0x24, 0xf6, - 0xba, 0x08, 0x75, 0x5b, 0x59, 0x11, 0x69, 0xb8, 0xb9, 0xe5, 0x2c, 0x77, - 0x02, 0xf6, 0x47, 0xee, 0x81, 0xdd, 0xb9, 0x06, }, - { 0x9d, }, - { 0x9d, 0x7d, }, - { 0xfd, 0xc3, 0xda, }, - { 0xe8, 0x82, 0xcd, 0x21, }, - { 0xc3, 0x1d, 0x42, 0x4c, 0x74, }, - { 0xe9, 0xda, 0xf1, 0xa2, 0xe5, 0x7c, }, - { 0x52, 0xb8, 0x6f, 0x81, 0x5c, 0x3a, 0x4c, }, - { 0x5b, 0x39, 0x26, 0xfc, 0x92, 0x5e, 0xe0, 0x49, }, - { 0x59, 0xe4, 0x7c, 0x93, 0x1c, 0xf9, 0x28, 0x93, 0xde, }, - { 0xde, 0xdf, 0xb2, 0x43, 0x61, 0x0b, 0x86, 0x16, 0x4c, 0x2e, }, - { 0x14, 0x8f, 0x75, 0x51, 0xaf, 0xb9, 0xee, 0x51, 0x5a, 0xae, 0x23, }, - { 0x43, 0x5f, 0x50, 0xd5, 0x70, 0xb0, 0x5b, 0x87, 0xf5, 0xd9, 0xb3, 0x6d, }, - { 0x66, 0x0a, 0x64, 0x93, 0x79, 0x71, 0x94, 0x40, 0xb7, 0x68, 0x2d, 0xd3, - 0x63, }, - { 0x15, 0x00, 0xc4, 0x0c, 0x7d, 0x1b, 0x10, 0xa9, 0x73, 0x1b, 0x90, 0x6f, - 0xe6, 0xa9, }, - { 0x34, 0x75, 0xf3, 0x86, 0x8f, 0x56, 0xcf, 0x2a, 0x0a, 0xf2, 0x62, 0x0a, - 0xf6, 0x0e, 0x20, }, - { 0xb1, 0xde, 0xc9, 0xf5, 0xdb, 0xf3, 0x2f, 0x4c, 0xd6, 0x41, 0x7d, 0x39, - 0x18, 0x3e, 0xc7, 0xc3, }, - { 0xc5, 0x89, 0xb2, 0xf8, 0xb8, 0xc0, 0xa3, 0xb9, 0x3b, 0x10, 0x6d, 0x7c, - 0x92, 0xfc, 0x7f, 0x34, 0x41, }, - { 0xc4, 0xd8, 0xef, 0xba, 0xef, 0xd2, 0xaa, 0xc5, 0x6c, 0x8e, 0x3e, 0xbb, - 0x12, 0xfc, 0x0f, 0x72, 0xbf, 0x0f, }, - { 0xdd, 0x91, 0xd1, 0x15, 0x9e, 0x7d, 0xf8, 0xc1, 0xb9, 0x14, 0x63, 0x96, - 0xb5, 0xcb, 0x83, 0x1d, 0x35, 0x1c, 0xec, }, - { 0xa9, 0xf8, 0x52, 0xc9, 0x67, 0x76, 0x2b, 0xad, 0xfb, 0xd8, 0x3a, 0xa6, - 0x74, 0x02, 0xae, 0xb8, 0x25, 0x2c, 0x63, 0x49, }, - { 0x77, 0x1f, 0x66, 0x70, 0xfd, 0x50, 0x29, 0xaa, 0xeb, 0xdc, 0xee, 0xba, - 0x75, 0x98, 0xdc, 0x93, 0x12, 0x3f, 0xdc, 0x7c, 0x38, }, - { 0xe2, 0xe1, 0x89, 0x5c, 0x37, 0x38, 0x6a, 0xa3, 0x40, 0xac, 0x3f, 0xb0, - 0xca, 0xfc, 0xa7, 0xf3, 0xea, 0xf9, 0x0f, 0x5d, 0x8e, 0x39, }, - { 0x0f, 0x67, 0xc8, 0x38, 0x01, 0xb1, 0xb7, 0xb8, 0xa2, 0xe7, 0x0a, 0x6d, - 0xd2, 0x63, 0x69, 0x9e, 0xcc, 0xf0, 0xf2, 0xbe, 0x9b, 0x98, 0xdd, }, - { 0x13, 0xe1, 0x36, 0x30, 0xfe, 0xc6, 0x01, 0x8a, 0xa1, 0x63, 0x96, 0x59, - 0xc2, 0xa9, 0x68, 0x3f, 0x58, 0xd4, 0x19, 0x0c, 0x40, 0xf3, 0xde, 0x02, }, - { 0xa3, 0x9e, 0xce, 0xda, 0x42, 0xee, 0x8c, 0x6c, 0x5a, 0x7d, 0xdc, 0x89, - 0x02, 0x77, 0xdd, 0xe7, 0x95, 0xbb, 0xff, 0x0d, 0xa4, 0xb5, 0x38, 0x1e, - 0xaf, }, - { 0x9a, 0xf6, 0xb5, 0x9a, 0x4f, 0xa9, 0x4f, 0x2c, 0x35, 0x3c, 0x24, 0xdc, - 0x97, 0x6f, 0xd9, 0xa1, 0x7d, 0x1a, 0x85, 0x0b, 0xf5, 0xda, 0x2e, 0xe7, - 0xb1, 0x1d, }, - { 0x84, 0x1e, 0x8e, 0x3d, 0x45, 0xa5, 0xf2, 0x27, 0xf3, 0x31, 0xfe, 0xb9, - 0xfb, 0xc5, 0x45, 0x99, 0x99, 0xdd, 0x93, 
0x43, 0x02, 0xee, 0x58, 0xaf, - 0xee, 0x6a, 0xbe, }, - { 0x07, 0x2f, 0xc0, 0xa2, 0x04, 0xc4, 0xab, 0x7c, 0x26, 0xbb, 0xa8, 0xd8, - 0xe3, 0x1c, 0x75, 0x15, 0x64, 0x5d, 0x02, 0x6a, 0xf0, 0x86, 0xe9, 0xcd, - 0x5c, 0xef, 0xa3, 0x25, }, - { 0x2f, 0x3b, 0x1f, 0xb5, 0x91, 0x8f, 0x86, 0xe0, 0xdc, 0x31, 0x48, 0xb6, - 0xa1, 0x8c, 0xfd, 0x75, 0xbb, 0x7d, 0x3d, 0xc1, 0xf0, 0x10, 0x9a, 0xd8, - 0x4b, 0x0e, 0xe3, 0x94, 0x9f, }, - { 0x29, 0xbb, 0x8f, 0x6c, 0xd1, 0xf2, 0xb6, 0xaf, 0xe5, 0xe3, 0x2d, 0xdc, - 0x6f, 0xa4, 0x53, 0x88, 0xd8, 0xcf, 0x4d, 0x45, 0x42, 0x62, 0xdb, 0xdf, - 0xf8, 0x45, 0xc2, 0x13, 0xec, 0x35, }, - { 0x06, 0x3c, 0xe3, 0x2c, 0x15, 0xc6, 0x43, 0x03, 0x81, 0xfb, 0x08, 0x76, - 0x33, 0xcb, 0x02, 0xc1, 0xba, 0x33, 0xe5, 0xe0, 0xd1, 0x92, 0xa8, 0x46, - 0x28, 0x3f, 0x3e, 0x9d, 0x2c, 0x44, 0x54, }, - { 0xea, 0xbb, 0x96, 0xf8, 0xd1, 0x8b, 0x04, 0x11, 0x40, 0x78, 0x42, 0x02, - 0x19, 0xd1, 0xbc, 0x65, 0x92, 0xd3, 0xc3, 0xd6, 0xd9, 0x19, 0xe7, 0xc3, - 0x40, 0x97, 0xbd, 0xd4, 0xed, 0xfa, 0x5e, 0x28, }, - { 0x02, }, - { 0x52, 0xa8, }, - { 0x38, 0x25, 0x0d, }, - { 0xe3, 0x04, 0xd4, 0x92, }, - { 0x97, 0xdb, 0xf7, 0x81, 0xca, }, - { 0x8a, 0x56, 0x9d, 0x62, 0x56, 0xcc, }, - { 0xa1, 0x8e, 0x3c, 0x72, 0x8f, 0x63, 0x03, }, - { 0xf7, 0xf3, 0x39, 0x09, 0x0a, 0xa1, 0xbb, 0x23, }, - { 0x6b, 0x03, 0xc0, 0xe9, 0xd9, 0x83, 0x05, 0x22, 0x01, }, - { 0x1b, 0x4b, 0xf5, 0xd6, 0x4f, 0x05, 0x75, 0x91, 0x4c, 0x7f, }, - { 0x4c, 0x8c, 0x25, 0x20, 0x21, 0xcb, 0xc2, 0x4b, 0x3a, 0x5b, 0x8d, }, - { 0x56, 0xe2, 0x77, 0xa0, 0xb6, 0x9f, 0x81, 0xec, 0x83, 0x75, 0xc4, 0xf9, }, - { 0x71, 0x70, 0x0f, 0xad, 0x4d, 0x35, 0x81, 0x9d, 0x88, 0x69, 0xf9, 0xaa, - 0xd3, }, - { 0x50, 0x6e, 0x86, 0x6e, 0x43, 0xc0, 0xc2, 0x44, 0xc2, 0xe2, 0xa0, 0x1c, - 0xb7, 0x9a, }, - { 0xe4, 0x7e, 0x72, 0xc6, 0x12, 0x8e, 0x7c, 0xfc, 0xbd, 0xe2, 0x08, 0x31, - 0x3d, 0x47, 0x3d, }, - { 0x08, 0x97, 0x5b, 0x80, 0xae, 0xc4, 0x1d, 0x50, 0x77, 0xdf, 0x1f, 0xd0, - 0x24, 0xf0, 0x17, 0xc0, }, - { 0x01, 0xb6, 0x29, 0xf4, 0xaf, 0x78, 0x5f, 0xb6, 0x91, 0xdd, 0x76, 0x76, - 0xd2, 0xfd, 0x0c, 0x47, 0x40, }, - { 0xa1, 0xd8, 0x09, 0x97, 0x7a, 0xa6, 0xc8, 0x94, 0xf6, 0x91, 0x7b, 0xae, - 0x2b, 0x9f, 0x0d, 0x83, 0x48, 0xf7, }, - { 0x12, 0xd5, 0x53, 0x7d, 0x9a, 0xb0, 0xbe, 0xd9, 0xed, 0xe9, 0x9e, 0xee, - 0x61, 0x5b, 0x42, 0xf2, 0xc0, 0x73, 0xc0, }, - { 0xd5, 0x77, 0xd6, 0x5c, 0x6e, 0xa5, 0x69, 0x2b, 0x3b, 0x8c, 0xd6, 0x7d, - 0x1d, 0xbe, 0x2c, 0xa1, 0x02, 0x21, 0xcd, 0x29, }, - { 0xa4, 0x98, 0x80, 0xca, 0x22, 0xcf, 0x6a, 0xab, 0x5e, 0x40, 0x0d, 0x61, - 0x08, 0x21, 0xef, 0xc0, 0x6c, 0x52, 0xb4, 0xb0, 0x53, }, - { 0xbf, 0xaf, 0x8f, 0x3b, 0x7a, 0x97, 0x33, 0xe5, 0xca, 0x07, 0x37, 0xfd, - 0x15, 0xdf, 0xce, 0x26, 0x2a, 0xb1, 0xa7, 0x0b, 0xb3, 0xac, }, - { 0x16, 0x22, 0xe1, 0xbc, 0x99, 0x4e, 0x01, 0xf0, 0xfa, 0xff, 0x8f, 0xa5, - 0x0c, 0x61, 0xb0, 0xad, 0xcc, 0xb1, 0xe1, 0x21, 0x46, 0xfa, 0x2e, }, - { 0x11, 0x5b, 0x0b, 0x2b, 0xe6, 0x14, 0xc1, 0xd5, 0x4d, 0x71, 0x5e, 0x17, - 0xea, 0x23, 0xdd, 0x6c, 0xbd, 0x1d, 0xbe, 0x12, 0x1b, 0xee, 0x4c, 0x1a, }, - { 0x40, 0x88, 0x22, 0xf3, 0x20, 0x6c, 0xed, 0xe1, 0x36, 0x34, 0x62, 0x2c, - 0x98, 0x83, 0x52, 0xe2, 0x25, 0xee, 0xe9, 0xf5, 0xe1, 0x17, 0xf0, 0x5c, - 0xae, }, - { 0xc3, 0x76, 0x37, 0xde, 0x95, 0x8c, 0xca, 0x2b, 0x0c, 0x23, 0xe7, 0xb5, - 0x38, 0x70, 0x61, 0xcc, 0xff, 0xd3, 0x95, 0x7b, 0xf3, 0xff, 0x1f, 0x9d, - 0x59, 0x00, }, - { 0x0c, 0x19, 0x52, 0x05, 0x22, 0x53, 0xcb, 0x48, 0xd7, 0x10, 0x0e, 0x7e, - 0x14, 0x69, 0xb5, 0xa2, 0x92, 0x43, 0xa3, 0x9e, 0x4b, 0x8f, 0x51, 0x2c, - 0x5a, 0x2c, 0x3b, }, - { 0xe1, 0x9d, 0x70, 0x70, 0x28, 0xec, 0x86, 0x40, 
0x55, 0x33, 0x56, 0xda, - 0x88, 0xca, 0xee, 0xc8, 0x6a, 0x20, 0xb1, 0xe5, 0x3d, 0x57, 0xf8, 0x3c, - 0x10, 0x07, 0x2a, 0xc4, }, - { 0x0b, 0xae, 0xf1, 0xc4, 0x79, 0xee, 0x1b, 0x3d, 0x27, 0x35, 0x8d, 0x14, - 0xd6, 0xae, 0x4e, 0x3c, 0xe9, 0x53, 0x50, 0xb5, 0xcc, 0x0c, 0xf7, 0xdf, - 0xee, 0xa1, 0x74, 0xd6, 0x71, }, - { 0xe6, 0xa4, 0xf4, 0x99, 0x98, 0xb9, 0x80, 0xea, 0x96, 0x7f, 0x4f, 0x33, - 0xcf, 0x74, 0x25, 0x6f, 0x17, 0x6c, 0xbf, 0xf5, 0x5c, 0x38, 0xd0, 0xff, - 0x96, 0xcb, 0x13, 0xf9, 0xdf, 0xfd, }, - { 0xbe, 0x92, 0xeb, 0xba, 0x44, 0x2c, 0x24, 0x74, 0xd4, 0x03, 0x27, 0x3c, - 0x5d, 0x5b, 0x03, 0x30, 0x87, 0x63, 0x69, 0xe0, 0xb8, 0x94, 0xf4, 0x44, - 0x7e, 0xad, 0xcd, 0x20, 0x12, 0x16, 0x79, }, - { 0x30, 0xf1, 0xc4, 0x8e, 0x05, 0x90, 0x2a, 0x97, 0x63, 0x94, 0x46, 0xff, - 0xce, 0xd8, 0x67, 0xa7, 0xac, 0x33, 0x8c, 0x95, 0xb7, 0xcd, 0xa3, 0x23, - 0x98, 0x9d, 0x76, 0x6c, 0x9d, 0xa8, 0xd6, 0x8a, }, - { 0xbe, }, - { 0x17, 0x6c, }, - { 0x1a, 0x42, 0x4f, }, - { 0xba, 0xaf, 0xb7, 0x65, }, - { 0xc2, 0x63, 0x43, 0x6a, 0xea, }, - { 0xe4, 0x4d, 0xad, 0xf2, 0x0b, 0x02, }, - { 0x04, 0xc7, 0xc4, 0x7f, 0xa9, 0x2b, 0xce, }, - { 0x66, 0xf6, 0x67, 0xcb, 0x03, 0x53, 0xc8, 0xf1, }, - { 0x56, 0xa3, 0x60, 0x78, 0xc9, 0x5f, 0x70, 0x1b, 0x5e, }, - { 0x99, 0xff, 0x81, 0x7c, 0x13, 0x3c, 0x29, 0x79, 0x4b, 0x65, }, - { 0x51, 0x10, 0x50, 0x93, 0x01, 0x93, 0xb7, 0x01, 0xc9, 0x18, 0xb7, }, - { 0x8e, 0x3c, 0x42, 0x1e, 0x5e, 0x7d, 0xc1, 0x50, 0x70, 0x1f, 0x00, 0x98, }, - { 0x5f, 0xd9, 0x9b, 0xc8, 0xd7, 0xb2, 0x72, 0x62, 0x1a, 0x1e, 0xba, 0x92, - 0xe9, }, - { 0x70, 0x2b, 0xba, 0xfe, 0xad, 0x5d, 0x96, 0x3f, 0x27, 0xc2, 0x41, 0x6d, - 0xc4, 0xb3, }, - { 0xae, 0xe0, 0xd5, 0xd4, 0xc7, 0xae, 0x15, 0x5e, 0xdc, 0xdd, 0x33, 0x60, - 0xd7, 0xd3, 0x5e, }, - { 0x79, 0x8e, 0xbc, 0x9e, 0x20, 0xb9, 0x19, 0x4b, 0x63, 0x80, 0xf3, 0x16, - 0xaf, 0x39, 0xbd, 0x92, }, - { 0xc2, 0x0e, 0x85, 0xa0, 0x0b, 0x9a, 0xb0, 0xec, 0xde, 0x38, 0xd3, 0x10, - 0xd9, 0xa7, 0x66, 0x27, 0xcf, }, - { 0x0e, 0x3b, 0x75, 0x80, 0x67, 0x14, 0x0c, 0x02, 0x90, 0xd6, 0xb3, 0x02, - 0x81, 0xf6, 0xa6, 0x87, 0xce, 0x58, }, - { 0x79, 0xb5, 0xe9, 0x5d, 0x52, 0x4d, 0xf7, 0x59, 0xf4, 0x2e, 0x27, 0xdd, - 0xb3, 0xed, 0x57, 0x5b, 0x82, 0xea, 0x6f, }, - { 0xa2, 0x97, 0xf5, 0x80, 0x02, 0x3d, 0xde, 0xa3, 0xf9, 0xf6, 0xab, 0xe3, - 0x57, 0x63, 0x7b, 0x9b, 0x10, 0x42, 0x6f, 0xf2, }, - { 0x12, 0x7a, 0xfc, 0xb7, 0x67, 0x06, 0x0c, 0x78, 0x1a, 0xfe, 0x88, 0x4f, - 0xc6, 0xac, 0x52, 0x96, 0x64, 0x28, 0x97, 0x84, 0x06, }, - { 0xc5, 0x04, 0x44, 0x6b, 0xb2, 0xa5, 0xa4, 0x66, 0xe1, 0x76, 0xa2, 0x51, - 0xf9, 0x59, 0x69, 0x97, 0x56, 0x0b, 0xbf, 0x50, 0xb3, 0x34, }, - { 0x21, 0x32, 0x6b, 0x42, 0xb5, 0xed, 0x71, 0x8d, 0xf7, 0x5a, 0x35, 0xe3, - 0x90, 0xe2, 0xee, 0xaa, 0x89, 0xf6, 0xc9, 0x9c, 0x4d, 0x73, 0xf4, }, - { 0x4c, 0xa6, 0x09, 0xf4, 0x48, 0xe7, 0x46, 0xbc, 0x49, 0xfc, 0xe5, 0xda, - 0xd1, 0x87, 0x13, 0x17, 0x4c, 0x59, 0x71, 0x26, 0x5b, 0x2c, 0x42, 0xb7, }, - { 0x13, 0x63, 0xf3, 0x40, 0x02, 0xe5, 0xa3, 0x3a, 0x5e, 0x8e, 0xf8, 0xb6, - 0x8a, 0x49, 0x60, 0x76, 0x34, 0x72, 0x94, 0x73, 0xf6, 0xd9, 0x21, 0x6a, - 0x26, }, - { 0xdf, 0x75, 0x16, 0x10, 0x1b, 0x5e, 0x81, 0xc3, 0xc8, 0xde, 0x34, 0x24, - 0xb0, 0x98, 0xeb, 0x1b, 0x8f, 0xa1, 0x9b, 0x05, 0xee, 0xa5, 0xe9, 0x35, - 0xf4, 0x1d, }, - { 0xcd, 0x21, 0x93, 0x6e, 0x5b, 0xa0, 0x26, 0x2b, 0x21, 0x0e, 0xa0, 0xb9, - 0x1c, 0xb5, 0xbb, 0xb8, 0xf8, 0x1e, 0xff, 0x5c, 0xa8, 0xf9, 0x39, 0x46, - 0x4e, 0x29, 0x26, }, - { 0x73, 0x7f, 0x0e, 0x3b, 0x0b, 0x5c, 0xf9, 0x60, 0xaa, 0x88, 0xa1, 0x09, - 0xb1, 0x5d, 0x38, 0x7b, 0x86, 0x8f, 0x13, 0x7a, 0x8d, 0x72, 0x7a, 0x98, - 0x1a, 
0x5b, 0xff, 0xc9, }, - { 0xd3, 0x3c, 0x61, 0x71, 0x44, 0x7e, 0x31, 0x74, 0x98, 0x9d, 0x9a, 0xd2, - 0x27, 0xf3, 0x46, 0x43, 0x42, 0x51, 0xd0, 0x5f, 0xe9, 0x1c, 0x5c, 0x69, - 0xbf, 0xf6, 0xbe, 0x3c, 0x40, }, - { 0x31, 0x99, 0x31, 0x9f, 0xaa, 0x43, 0x2e, 0x77, 0x3e, 0x74, 0x26, 0x31, - 0x5e, 0x61, 0xf1, 0x87, 0xe2, 0xeb, 0x9b, 0xcd, 0xd0, 0x3a, 0xee, 0x20, - 0x7e, 0x10, 0x0a, 0x0b, 0x7e, 0xfa, }, - { 0xa4, 0x27, 0x80, 0x67, 0x81, 0x2a, 0xa7, 0x62, 0xf7, 0x6e, 0xda, 0xd4, - 0x5c, 0x39, 0x74, 0xad, 0x7e, 0xbe, 0xad, 0xa5, 0x84, 0x7f, 0xa9, 0x30, - 0x5d, 0xdb, 0xe2, 0x05, 0x43, 0xf7, 0x1b, }, - { 0x0b, 0x37, 0xd8, 0x02, 0xe1, 0x83, 0xd6, 0x80, 0xf2, 0x35, 0xc2, 0xb0, - 0x37, 0xef, 0xef, 0x5e, 0x43, 0x93, 0xf0, 0x49, 0x45, 0x0a, 0xef, 0xb5, - 0x76, 0x70, 0x12, 0x44, 0xc4, 0xdb, 0xf5, 0x7a, }, - { 0x1f, }, - { 0x82, 0x60, }, - { 0xcc, 0xe3, 0x08, }, - { 0x56, 0x17, 0xe4, 0x59, }, - { 0xe2, 0xd7, 0x9e, 0xc4, 0x4c, }, - { 0xb2, 0xad, 0xd3, 0x78, 0x58, 0x5a, }, - { 0xce, 0x43, 0xb4, 0x02, 0x96, 0xab, 0x3c, }, - { 0xe6, 0x05, 0x1a, 0x73, 0x22, 0x32, 0xbb, 0x77, }, - { 0x23, 0xe7, 0xda, 0xfe, 0x2c, 0xef, 0x8c, 0x22, 0xec, }, - { 0xe9, 0x8e, 0x55, 0x38, 0xd1, 0xd7, 0x35, 0x23, 0x98, 0xc7, }, - { 0xb5, 0x81, 0x1a, 0xe5, 0xb5, 0xa5, 0xd9, 0x4d, 0xca, 0x41, 0xe7, }, - { 0x41, 0x16, 0x16, 0x95, 0x8d, 0x9e, 0x0c, 0xea, 0x8c, 0x71, 0x9a, 0xc1, }, - { 0x7c, 0x33, 0xc0, 0xa4, 0x00, 0x62, 0xea, 0x60, 0x67, 0xe4, 0x20, 0xbc, - 0x5b, }, - { 0xdb, 0xb1, 0xdc, 0xfd, 0x08, 0xc0, 0xde, 0x82, 0xd1, 0xde, 0x38, 0xc0, - 0x90, 0x48, }, - { 0x37, 0x18, 0x2e, 0x0d, 0x61, 0xaa, 0x61, 0xd7, 0x86, 0x20, 0x16, 0x60, - 0x04, 0xd9, 0xd5, }, - { 0xb0, 0xcf, 0x2c, 0x4c, 0x5e, 0x5b, 0x4f, 0x2a, 0x23, 0x25, 0x58, 0x47, - 0xe5, 0x31, 0x06, 0x70, }, - { 0x91, 0xa0, 0xa3, 0x86, 0x4e, 0xe0, 0x72, 0x38, 0x06, 0x67, 0x59, 0x5c, - 0x70, 0x25, 0xdb, 0x33, 0x27, }, - { 0x44, 0x58, 0x66, 0xb8, 0x58, 0xc7, 0x13, 0xed, 0x4c, 0xc0, 0xf4, 0x9a, - 0x1e, 0x67, 0x75, 0x33, 0xb6, 0xb8, }, - { 0x7f, 0x98, 0x4a, 0x8e, 0x50, 0xa2, 0x5c, 0xcd, 0x59, 0xde, 0x72, 0xb3, - 0x9d, 0xc3, 0x09, 0x8a, 0xab, 0x56, 0xf1, }, - { 0x80, 0x96, 0x49, 0x1a, 0x59, 0xa2, 0xc5, 0xd5, 0xa7, 0x20, 0x8a, 0xb7, - 0x27, 0x62, 0x84, 0x43, 0xc6, 0xe1, 0x1b, 0x5d, }, - { 0x6b, 0xb7, 0x2b, 0x26, 0x62, 0x14, 0x70, 0x19, 0x3d, 0x4d, 0xac, 0xac, - 0x63, 0x58, 0x5e, 0x94, 0xb5, 0xb7, 0xe8, 0xe8, 0xa2, }, - { 0x20, 0xa8, 0xc0, 0xfd, 0x63, 0x3d, 0x6e, 0x98, 0xcf, 0x0c, 0x49, 0x98, - 0xe4, 0x5a, 0xfe, 0x8c, 0xaa, 0x70, 0x82, 0x1c, 0x7b, 0x74, }, - { 0xc8, 0xe8, 0xdd, 0xdf, 0x69, 0x30, 0x01, 0xc2, 0x0f, 0x7e, 0x2f, 0x11, - 0xcc, 0x3e, 0x17, 0xa5, 0x69, 0x40, 0x3f, 0x0e, 0x79, 0x7f, 0xcf, }, - { 0xdb, 0x61, 0xc0, 0xe2, 0x2e, 0x49, 0x07, 0x31, 0x1d, 0x91, 0x42, 0x8a, - 0xfc, 0x5e, 0xd3, 0xf8, 0x56, 0x1f, 0x2b, 0x73, 0xfd, 0x9f, 0xb2, 0x8e, }, - { 0x0c, 0x89, 0x55, 0x0c, 0x1f, 0x59, 0x2c, 0x9d, 0x1b, 0x29, 0x1d, 0x41, - 0x1d, 0xe6, 0x47, 0x8f, 0x8c, 0x2b, 0xea, 0x8f, 0xf0, 0xff, 0x21, 0x70, - 0x88, }, - { 0x12, 0x18, 0x95, 0xa6, 0x59, 0xb1, 0x31, 0x24, 0x45, 0x67, 0x55, 0xa4, - 0x1a, 0x2d, 0x48, 0x67, 0x1b, 0x43, 0x88, 0x2d, 0x8e, 0xa0, 0x70, 0xb3, - 0xc6, 0xbb, }, - { 0xe7, 0xb1, 0x1d, 0xb2, 0x76, 0x4d, 0x68, 0x68, 0x68, 0x23, 0x02, 0x55, - 0x3a, 0xe2, 0xe5, 0xd5, 0x4b, 0x43, 0xf9, 0x34, 0x77, 0x5c, 0xa1, 0xf5, - 0x55, 0xfd, 0x4f, }, - { 0x8c, 0x87, 0x5a, 0x08, 0x3a, 0x73, 0xad, 0x61, 0xe1, 0xe7, 0x99, 0x7e, - 0xf0, 0x5d, 0xe9, 0x5d, 0x16, 0x43, 0x80, 0x2f, 0xd0, 0x66, 0x34, 0xe2, - 0x42, 0x64, 0x3b, 0x1a, }, - { 0x39, 0xc1, 0x99, 0xcf, 0x22, 0xbf, 0x16, 0x8f, 0x9f, 0x80, 0x7f, 0x95, - 0x0a, 
0x05, 0x67, 0x27, 0xe7, 0x15, 0xdf, 0x9d, 0xb2, 0xfe, 0x1c, 0xb5, - 0x1d, 0x60, 0x8f, 0x8a, 0x1d, }, - { 0x9b, 0x6e, 0x08, 0x09, 0x06, 0x73, 0xab, 0x68, 0x02, 0x62, 0x1a, 0xe4, - 0xd4, 0xdf, 0xc7, 0x02, 0x4c, 0x6a, 0x5f, 0xfd, 0x23, 0xac, 0xae, 0x6d, - 0x43, 0xa4, 0x7a, 0x50, 0x60, 0x3c, }, - { 0x1d, 0xb4, 0xc6, 0xe1, 0xb1, 0x4b, 0xe3, 0xf2, 0xe2, 0x1a, 0x73, 0x1b, - 0xa0, 0x92, 0xa7, 0xf5, 0xff, 0x8f, 0x8b, 0x5d, 0xdf, 0xa8, 0x04, 0xb3, - 0xb0, 0xf7, 0xcc, 0x12, 0xfa, 0x35, 0x46, }, - { 0x49, 0x45, 0x97, 0x11, 0x0f, 0x1c, 0x60, 0x8e, 0xe8, 0x47, 0x30, 0xcf, - 0x60, 0xa8, 0x71, 0xc5, 0x1b, 0xe9, 0x39, 0x4d, 0x49, 0xb6, 0x12, 0x1f, - 0x24, 0xab, 0x37, 0xff, 0x83, 0xc2, 0xe1, 0x3a, }, - { 0x60, }, - { 0x24, 0x26, }, - { 0x47, 0xeb, 0xc9, }, - { 0x4a, 0xd0, 0xbc, 0xf0, }, - { 0x8e, 0x2b, 0xc9, 0x85, 0x3c, }, - { 0xa2, 0x07, 0x15, 0xb8, 0x12, 0x74, }, - { 0x0f, 0xdb, 0x5b, 0x33, 0x69, 0xfe, 0x4b, }, - { 0xa2, 0x86, 0x54, 0xf4, 0xfd, 0xb2, 0xd4, 0xe6, }, - { 0xbb, 0x84, 0x78, 0x49, 0x27, 0x8e, 0x61, 0xda, 0x60, }, - { 0x04, 0xc3, 0xcd, 0xaa, 0x8f, 0xa7, 0x03, 0xc9, 0xf9, 0xb6, }, - { 0xf8, 0x27, 0x1d, 0x61, 0xdc, 0x21, 0x42, 0xdd, 0xad, 0x92, 0x40, }, - { 0x12, 0x87, 0xdf, 0xc2, 0x41, 0x45, 0x5a, 0x36, 0x48, 0x5b, 0x51, 0x2b, }, - { 0xbb, 0x37, 0x5d, 0x1f, 0xf1, 0x68, 0x7a, 0xc4, 0xa5, 0xd2, 0xa4, 0x91, - 0x8d, }, - { 0x5b, 0x27, 0xd1, 0x04, 0x54, 0x52, 0x9f, 0xa3, 0x47, 0x86, 0x33, 0x33, - 0xbf, 0xa0, }, - { 0xcf, 0x04, 0xea, 0xf8, 0x03, 0x2a, 0x43, 0xff, 0xa6, 0x68, 0x21, 0x4c, - 0xd5, 0x4b, 0xed, }, - { 0xaf, 0xb8, 0xbc, 0x63, 0x0f, 0x18, 0x4d, 0xe2, 0x7a, 0xdd, 0x46, 0x44, - 0xc8, 0x24, 0x0a, 0xb7, }, - { 0x3e, 0xdc, 0x36, 0xe4, 0x89, 0xb1, 0xfa, 0xc6, 0x40, 0x93, 0x2e, 0x75, - 0xb2, 0x15, 0xd1, 0xb1, 0x10, }, - { 0x6c, 0xd8, 0x20, 0x3b, 0x82, 0x79, 0xf9, 0xc8, 0xbc, 0x9d, 0xe0, 0x35, - 0xbe, 0x1b, 0x49, 0x1a, 0xbc, 0x3a, }, - { 0x78, 0x65, 0x2c, 0xbe, 0x35, 0x67, 0xdc, 0x78, 0xd4, 0x41, 0xf6, 0xc9, - 0xde, 0xde, 0x1f, 0x18, 0x13, 0x31, 0x11, }, - { 0x8a, 0x7f, 0xb1, 0x33, 0x8f, 0x0c, 0x3c, 0x0a, 0x06, 0x61, 0xf0, 0x47, - 0x29, 0x1b, 0x29, 0xbc, 0x1c, 0x47, 0xef, 0x7a, }, - { 0x65, 0x91, 0xf1, 0xe6, 0xb3, 0x96, 0xd3, 0x8c, 0xc2, 0x4a, 0x59, 0x35, - 0x72, 0x8e, 0x0b, 0x9a, 0x87, 0xca, 0x34, 0x7b, 0x63, }, - { 0x5f, 0x08, 0x87, 0x80, 0x56, 0x25, 0x89, 0x77, 0x61, 0x8c, 0x64, 0xa1, - 0x59, 0x6d, 0x59, 0x62, 0xe8, 0x4a, 0xc8, 0x58, 0x99, 0xd1, }, - { 0x23, 0x87, 0x1d, 0xed, 0x6f, 0xf2, 0x91, 0x90, 0xe2, 0xfe, 0x43, 0x21, - 0xaf, 0x97, 0xc6, 0xbc, 0xd7, 0x15, 0xc7, 0x2d, 0x08, 0x77, 0x91, }, - { 0x90, 0x47, 0x9a, 0x9e, 0x3a, 0xdf, 0xf3, 0xc9, 0x4c, 0x1e, 0xa7, 0xd4, - 0x6a, 0x32, 0x90, 0xfe, 0xb7, 0xb6, 0x7b, 0xfa, 0x96, 0x61, 0xfb, 0xa4, }, - { 0xb1, 0x67, 0x60, 0x45, 0xb0, 0x96, 0xc5, 0x15, 0x9f, 0x4d, 0x26, 0xd7, - 0x9d, 0xf1, 0xf5, 0x6d, 0x21, 0x00, 0x94, 0x31, 0x64, 0x94, 0xd3, 0xa7, - 0xd3, }, - { 0x02, 0x3e, 0xaf, 0xf3, 0x79, 0x73, 0xa5, 0xf5, 0xcc, 0x7a, 0x7f, 0xfb, - 0x79, 0x2b, 0x85, 0x8c, 0x88, 0x72, 0x06, 0xbe, 0xfe, 0xaf, 0xc1, 0x16, - 0xa6, 0xd6, }, - { 0x2a, 0xb0, 0x1a, 0xe5, 0xaa, 0x6e, 0xb3, 0xae, 0x53, 0x85, 0x33, 0x80, - 0x75, 0xae, 0x30, 0xe6, 0xb8, 0x72, 0x42, 0xf6, 0x25, 0x4f, 0x38, 0x88, - 0x55, 0xd1, 0xa9, }, - { 0x90, 0xd8, 0x0c, 0xc0, 0x93, 0x4b, 0x4f, 0x9e, 0x65, 0x6c, 0xa1, 0x54, - 0xa6, 0xf6, 0x6e, 0xca, 0xd2, 0xbb, 0x7e, 0x6a, 0x1c, 0xd3, 0xce, 0x46, - 0xef, 0xb0, 0x00, 0x8d, }, - { 0xed, 0x9c, 0x49, 0xcd, 0xc2, 0xde, 0x38, 0x0e, 0xe9, 0x98, 0x6c, 0xc8, - 0x90, 0x9e, 0x3c, 0xd4, 0xd3, 0xeb, 0x88, 0x32, 0xc7, 0x28, 0xe3, 0x94, - 0x1c, 0x9f, 0x8b, 0xf3, 0xcb, }, - { 
0xac, 0xe7, 0x92, 0x16, 0xb4, 0x14, 0xa0, 0xe4, 0x04, 0x79, 0xa2, 0xf4, - 0x31, 0xe6, 0x0c, 0x26, 0xdc, 0xbf, 0x2f, 0x69, 0x1b, 0x55, 0x94, 0x67, - 0xda, 0x0c, 0xd7, 0x32, 0x1f, 0xef, }, - { 0x68, 0x63, 0x85, 0x57, 0x95, 0x9e, 0x42, 0x27, 0x41, 0x43, 0x42, 0x02, - 0xa5, 0x78, 0xa7, 0xc6, 0x43, 0xc1, 0x6a, 0xba, 0x70, 0x80, 0xcd, 0x04, - 0xb6, 0x78, 0x76, 0x29, 0xf3, 0xe8, 0xa0, }, - { 0xe6, 0xac, 0x8d, 0x9d, 0xf0, 0xc0, 0xf7, 0xf7, 0xe3, 0x3e, 0x4e, 0x28, - 0x0f, 0x59, 0xb2, 0x67, 0x9e, 0x84, 0x34, 0x42, 0x96, 0x30, 0x2b, 0xca, - 0x49, 0xb6, 0xc5, 0x9a, 0x84, 0x59, 0xa7, 0x81, }, - { 0x7e, }, - { 0x1e, 0x21, }, - { 0x26, 0xd3, 0xdd, }, - { 0x2c, 0xd4, 0xb3, 0x3d, }, - { 0x86, 0x7b, 0x76, 0x3c, 0xf0, }, - { 0x12, 0xc3, 0x70, 0x1d, 0x55, 0x18, }, - { 0x96, 0xc2, 0xbd, 0x61, 0x55, 0xf4, 0x24, }, - { 0x20, 0x51, 0xf7, 0x86, 0x58, 0x8f, 0x07, 0x2a, }, - { 0x93, 0x15, 0xa8, 0x1d, 0xda, 0x97, 0xee, 0x0e, 0x6c, }, - { 0x39, 0x93, 0xdf, 0xd5, 0x0e, 0xca, 0xdc, 0x7a, 0x92, 0xce, }, - { 0x60, 0xd5, 0xfd, 0xf5, 0x1b, 0x26, 0x82, 0x26, 0x73, 0x02, 0xbc, }, - { 0x98, 0xf2, 0x34, 0xe1, 0xf5, 0xfb, 0x00, 0xac, 0x10, 0x4a, 0x38, 0x9f, }, - { 0xda, 0x3a, 0x92, 0x8a, 0xd0, 0xcd, 0x12, 0xcd, 0x15, 0xbb, 0xab, 0x77, - 0x66, }, - { 0xa2, 0x92, 0x1a, 0xe5, 0xca, 0x0c, 0x30, 0x75, 0xeb, 0xaf, 0x00, 0x31, - 0x55, 0x66, }, - { 0x06, 0xea, 0xfd, 0x3e, 0x86, 0x38, 0x62, 0x4e, 0xa9, 0x12, 0xa4, 0x12, - 0x43, 0xbf, 0xa1, }, - { 0xe4, 0x71, 0x7b, 0x94, 0xdb, 0xa0, 0xd2, 0xff, 0x9b, 0xeb, 0xad, 0x8e, - 0x95, 0x8a, 0xc5, 0xed, }, - { 0x25, 0x5a, 0x77, 0x71, 0x41, 0x0e, 0x7a, 0xe9, 0xed, 0x0c, 0x10, 0xef, - 0xf6, 0x2b, 0x3a, 0xba, 0x60, }, - { 0xee, 0xe2, 0xa3, 0x67, 0x64, 0x1d, 0xc6, 0x04, 0xc4, 0xe1, 0x68, 0xd2, - 0x6e, 0xd2, 0x91, 0x75, 0x53, 0x07, }, - { 0xe0, 0xf6, 0x4d, 0x8f, 0x68, 0xfc, 0x06, 0x7e, 0x18, 0x79, 0x7f, 0x2b, - 0x6d, 0xef, 0x46, 0x7f, 0xab, 0xb2, 0xad, }, - { 0x3d, 0x35, 0x88, 0x9f, 0x2e, 0xcf, 0x96, 0x45, 0x07, 0x60, 0x71, 0x94, - 0x00, 0x8d, 0xbf, 0xf4, 0xef, 0x46, 0x2e, 0x3c, }, - { 0x43, 0xcf, 0x98, 0xf7, 0x2d, 0xf4, 0x17, 0xe7, 0x8c, 0x05, 0x2d, 0x9b, - 0x24, 0xfb, 0x4d, 0xea, 0x4a, 0xec, 0x01, 0x25, 0x29, }, - { 0x8e, 0x73, 0x9a, 0x78, 0x11, 0xfe, 0x48, 0xa0, 0x3b, 0x1a, 0x26, 0xdf, - 0x25, 0xe9, 0x59, 0x1c, 0x70, 0x07, 0x9f, 0xdc, 0xa0, 0xa6, }, - { 0xe8, 0x47, 0x71, 0xc7, 0x3e, 0xdf, 0xb5, 0x13, 0xb9, 0x85, 0x13, 0xa8, - 0x54, 0x47, 0x6e, 0x59, 0x96, 0x09, 0x13, 0x5f, 0x82, 0x16, 0x0b, }, - { 0xfb, 0xc0, 0x8c, 0x03, 0x21, 0xb3, 0xc4, 0xb5, 0x43, 0x32, 0x6c, 0xea, - 0x7f, 0xa8, 0x43, 0x91, 0xe8, 0x4e, 0x3f, 0xbf, 0x45, 0x58, 0x6a, 0xa3, }, - { 0x55, 0xf8, 0xf3, 0x00, 0x76, 0x09, 0xef, 0x69, 0x5d, 0xd2, 0x8a, 0xf2, - 0x65, 0xc3, 0xcb, 0x9b, 0x43, 0xfd, 0xb1, 0x7e, 0x7f, 0xa1, 0x94, 0xb0, - 0xd7, }, - { 0xaa, 0x13, 0xc1, 0x51, 0x40, 0x6d, 0x8d, 0x4c, 0x0a, 0x95, 0x64, 0x7b, - 0xd1, 0x96, 0xb6, 0x56, 0xb4, 0x5b, 0xcf, 0xd6, 0xd9, 0x15, 0x97, 0xdd, - 0xb6, 0xef, }, - { 0xaf, 0xb7, 0x36, 0xb0, 0x04, 0xdb, 0xd7, 0x9c, 0x9a, 0x44, 0xc4, 0xf6, - 0x1f, 0x12, 0x21, 0x2d, 0x59, 0x30, 0x54, 0xab, 0x27, 0x61, 0xa3, 0x57, - 0xef, 0xf8, 0x53, }, - { 0x97, 0x34, 0x45, 0x3e, 0xce, 0x7c, 0x35, 0xa2, 0xda, 0x9f, 0x4b, 0x46, - 0x6c, 0x11, 0x67, 0xff, 0x2f, 0x76, 0x58, 0x15, 0x71, 0xfa, 0x44, 0x89, - 0x89, 0xfd, 0xf7, 0x99, }, - { 0x1f, 0xb1, 0x62, 0xeb, 0x83, 0xc5, 0x9c, 0x89, 0xf9, 0x2c, 0xd2, 0x03, - 0x61, 0xbc, 0xbb, 0xa5, 0x74, 0x0e, 0x9b, 0x7e, 0x82, 0x3e, 0x70, 0x0a, - 0xa9, 0x8f, 0x2b, 0x59, 0xfb, }, - { 0xf8, 0xca, 0x5e, 0x3a, 0x4f, 0x9e, 0x10, 0x69, 0x10, 0xd5, 0x4c, 0xeb, - 0x1a, 0x0f, 0x3c, 0x6a, 0x98, 
0xf5, 0xb0, 0x97, 0x5b, 0x37, 0x2f, 0x0d, - 0xbd, 0x42, 0x4b, 0x69, 0xa1, 0x82, }, - { 0x12, 0x8c, 0x6d, 0x52, 0x08, 0xef, 0x74, 0xb2, 0xe6, 0xaa, 0xd3, 0xb0, - 0x26, 0xb0, 0xd9, 0x94, 0xb6, 0x11, 0x45, 0x0e, 0x36, 0x71, 0x14, 0x2d, - 0x41, 0x8c, 0x21, 0x53, 0x31, 0xe9, 0x68, }, - { 0xee, 0xea, 0x0d, 0x89, 0x47, 0x7e, 0x72, 0xd1, 0xd8, 0xce, 0x58, 0x4c, - 0x94, 0x1f, 0x0d, 0x51, 0x08, 0xa3, 0xb6, 0x3d, 0xe7, 0x82, 0x46, 0x92, - 0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, }, -}; - -static bool __init noinline_for_stack blake2s_digest_test(void) -{ - u8 key[BLAKE2S_KEY_SIZE]; - u8 buf[ARRAY_SIZE(blake2s_testvecs)]; - u8 hash[BLAKE2S_HASH_SIZE]; - struct blake2s_state state; - bool success = true; - int i, l; - - key[0] = key[1] = 1; - for (i = 2; i < sizeof(key); ++i) - key[i] = key[i - 2] + key[i - 1]; - - for (i = 0; i < sizeof(buf); ++i) - buf[i] = (u8)i; - - for (i = l = 0; i < ARRAY_SIZE(blake2s_testvecs); l = (l + 37) % ++i) { - int outlen = 1 + i % BLAKE2S_HASH_SIZE; - int keylen = (13 * i) % (BLAKE2S_KEY_SIZE + 1); - - blake2s(hash, buf, key + BLAKE2S_KEY_SIZE - keylen, outlen, i, - keylen); - if (memcmp(hash, blake2s_testvecs[i], outlen)) { - pr_err("blake2s self-test %d: FAIL\n", i + 1); - success = false; - } - - if (!keylen) - blake2s_init(&state, outlen); - else - blake2s_init_key(&state, outlen, - key + BLAKE2S_KEY_SIZE - keylen, - keylen); - - blake2s_update(&state, buf, l); - blake2s_update(&state, buf + l, i - l); - blake2s_final(&state, hash); - if (memcmp(hash, blake2s_testvecs[i], outlen)) { - pr_err("blake2s init/update/final self-test %d: FAIL\n", - i + 1); - success = false; - } - } - - return success; -} - -static bool __init noinline_for_stack blake2s_random_test(void) -{ - struct blake2s_state state; - bool success = true; - int i, l; - - for (i = 0; i < 32; ++i) { - enum { TEST_ALIGNMENT = 16 }; - u8 blocks[BLAKE2S_BLOCK_SIZE * 2 + TEST_ALIGNMENT - 1] - __aligned(TEST_ALIGNMENT); - u8 *unaligned_block = blocks + BLAKE2S_BLOCK_SIZE; - struct blake2s_state state1, state2; - - get_random_bytes(blocks, sizeof(blocks)); - get_random_bytes(&state, sizeof(state)); - -#if defined(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) && \ - defined(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S) - memcpy(&state1, &state, sizeof(state1)); - memcpy(&state2, &state, sizeof(state2)); - blake2s_compress(&state1, blocks, 2, BLAKE2S_BLOCK_SIZE); - blake2s_compress_generic(&state2, blocks, 2, BLAKE2S_BLOCK_SIZE); - if (memcmp(&state1, &state2, sizeof(state1))) { - pr_err("blake2s random compress self-test %d: FAIL\n", - i + 1); - success = false; - } -#endif - - memcpy(&state1, &state, sizeof(state1)); - blake2s_compress(&state1, blocks, 1, BLAKE2S_BLOCK_SIZE); - for (l = 1; l < TEST_ALIGNMENT; ++l) { - memcpy(unaligned_block + l, blocks, - BLAKE2S_BLOCK_SIZE); - memcpy(&state2, &state, sizeof(state2)); - blake2s_compress(&state2, unaligned_block + l, 1, - BLAKE2S_BLOCK_SIZE); - if (memcmp(&state1, &state2, sizeof(state1))) { - pr_err("blake2s random compress align %d self-test %d: FAIL\n", - l, i + 1); - success = false; - } - } - } - - return success; -} - -bool __init blake2s_selftest(void) -{ - bool success; - - success = blake2s_digest_test(); - success &= blake2s_random_test(); - - return success; -} diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index f6ec68c3dcdae6..51f2dd7a38a4e1 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -59,14 +58,5 @@ void blake2s_final(struct blake2s_state *state, u8 
*out) } EXPORT_SYMBOL(blake2s_final); -static int __init blake2s_mod_init(void) -{ - if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) && - WARN_ON(!blake2s_selftest())) - return -ENODEV; - return 0; -} - -module_init(blake2s_mod_init); MODULE_DESCRIPTION("BLAKE2s hash function"); MODULE_AUTHOR("Jason A. Donenfeld "); From 56e48d4e138cb105a17e0f8c257f3bc41b1bd69d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:28 -0700 Subject: [PATCH 0602/3206] lib/crypto: blake2s: Always enable arch-optimized BLAKE2s code When support for a crypto algorithm is enabled, the arch-optimized implementation of that algorithm should be enabled too. We've learned this the hard way many times over the years: people regularly forget to enable the arch-optimized implementations of the crypto algorithms, resulting in significant performance being left on the table. Currently, BLAKE2s support is always enabled ('obj-y'), since random.c uses it. Therefore, the arch-optimized BLAKE2s code, which exists for ARM and x86_64, should be always enabled too. Let's do that. Note that the effect on kernel image size is very small and should not be a concern. On ARM, enabling CRYPTO_BLAKE2S_ARM actually *shrinks* the kernel size by about 1200 bytes, since the ARM-optimized blake2s_compress() completely replaces the generic blake2s_compress(). On x86_64, enabling CRYPTO_BLAKE2S_X86 increases the kernel size by about 1400 bytes, as the generic blake2s_compress() is still included as a fallback; however, for context, that is only about a quarter the size of the generic blake2s_compress(). The x86_64 optimized BLAKE2s code uses much less icache at runtime than the generic code. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm/Kconfig | 2 +- lib/crypto/x86/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig index 740341aa35d213..a5607ad079c4ff 100644 --- a/lib/crypto/arm/Kconfig +++ b/lib/crypto/arm/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config CRYPTO_BLAKE2S_ARM - bool "Hash functions: BLAKE2s" + def_bool y select CRYPTO_ARCH_HAVE_LIB_BLAKE2S help BLAKE2s cryptographic hash function (RFC 7693) diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig index eb47da71aa6b6d..ffa718321369f7 100644 --- a/lib/crypto/x86/Kconfig +++ b/lib/crypto/x86/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config CRYPTO_BLAKE2S_X86 - bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" + def_bool y depends on 64BIT select CRYPTO_LIB_BLAKE2S_GENERIC select CRYPTO_ARCH_HAVE_LIB_BLAKE2S From 5d313a7625fabe9b92eaca7c5ce0e6d1019d279a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:29 -0700 Subject: [PATCH 0603/3206] lib/crypto: blake2s: Move generic code into blake2s.c Move blake2s_compress_generic() from blake2s-generic.c to blake2s.c. For now it's still guarded by CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC, but this prepares for changing it to a 'static __maybe_unused' function and just using the compiler to automatically decide its inclusion. 
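As an illustration of where this is headed (an assumption based on the pattern chacha.c adopts earlier in this series, not something this patch does yet): the generic function becomes 'static __maybe_unused', and an arch header, included only when the arch option is set, either overrides the arch entry point or aliases it back to the generic code. A minimal sketch, using the hypothetical names CONFIG_CRYPTO_LIB_BLAKE2S_ARCH and blake2s_compress_arch by analogy with the ChaCha symbols:

	#include <crypto/blake2s.h>	/* struct blake2s_state */
	#include <linux/compiler.h>	/* __maybe_unused */
	#include <linux/types.h>

	/*
	 * Generic compression function.  If the arch header below defines
	 * blake2s_compress_arch itself and never calls the generic version,
	 * the compiler simply discards this definition; no Kconfig guard or
	 * separate object file is needed.
	 */
	static void __maybe_unused
	blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
				 size_t nblocks, const u32 inc)
	{
		/* ... generic BLAKE2s compression rounds ... */
	}

	#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_ARCH	/* hypothetical, by analogy */
	#include "blake2s.h"	/* $(SRCARCH)/blake2s.h */
	#else
	#define blake2s_compress_arch blake2s_compress_generic
	#endif

In that end state the 'static __maybe_unused' annotation does the work the CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC guard does today: whichever implementation is actually referenced is kept, and dead code is dropped automatically.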
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-11-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/Makefile | 1 - lib/crypto/blake2s-generic.c | 111 ----------------------------------- lib/crypto/blake2s.c | 94 +++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 112 deletions(-) delete mode 100644 lib/crypto/blake2s-generic.c diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index b2d2745879d1dc..c9193f0604d91d 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o # blake2s is used by the /dev/random driver which is always builtin obj-y += libblake2s.o libblake2s-y := blake2s.o -libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += blake2s-generic.o ################################################################################ diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c deleted file mode 100644 index 9828176a2efec7..00000000000000 --- a/lib/crypto/blake2s-generic.c +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - * - * This is an implementation of the BLAKE2s hash and PRF functions. - * - * Information: https://blake2.net/ - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -static const u8 blake2s_sigma[10][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, -}; - -static inline void blake2s_increment_counter(struct blake2s_state *state, - const u32 inc) -{ - state->t[0] += inc; - state->t[1] += (state->t[0] < inc); -} - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) - __weak __alias(blake2s_compress_generic); - -void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) -{ - u32 m[16]; - u32 v[16]; - int i; - - WARN_ON(IS_ENABLED(DEBUG) && - (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE)); - - while (nblocks > 0) { - blake2s_increment_counter(state, inc); - memcpy(m, block, BLAKE2S_BLOCK_SIZE); - le32_to_cpu_array(m, ARRAY_SIZE(m)); - memcpy(v, state->h, 32); - v[ 8] = BLAKE2S_IV0; - v[ 9] = BLAKE2S_IV1; - v[10] = BLAKE2S_IV2; - v[11] = BLAKE2S_IV3; - v[12] = BLAKE2S_IV4 ^ state->t[0]; - v[13] = BLAKE2S_IV5 ^ state->t[1]; - v[14] = BLAKE2S_IV6 ^ state->f[0]; - v[15] = BLAKE2S_IV7 ^ state->f[1]; - -#define G(r, i, a, b, c, d) do { \ - a += b + m[blake2s_sigma[r][2 * i + 0]]; \ - d = ror32(d ^ a, 16); \ - c += d; \ - b = ror32(b ^ c, 12); \ - a += b + m[blake2s_sigma[r][2 * i + 1]]; \ - d = ror32(d ^ a, 8); \ - c += d; \ - b = ror32(b ^ c, 7); \ -} while (0) - -#define ROUND(r) do { \ - G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ - G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ - G(r, 2, v[2], v[ 6], v[10], v[14]); \ - G(r, 3, v[3], v[ 7], v[11], v[15]); \ - G(r, 4, v[0], v[ 5], v[10], v[15]); \ - G(r, 5, v[1], v[ 6], v[11], v[12]); \ - G(r, 6, v[2], v[ 7], 
v[ 8], v[13]); \ - G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ -} while (0) - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - -#undef G -#undef ROUND - - for (i = 0; i < 8; ++i) - state->h[i] ^= v[i] ^ v[i + 8]; - - block += BLAKE2S_BLOCK_SIZE; - --nblocks; - } -} - -EXPORT_SYMBOL(blake2s_compress_generic); diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 51f2dd7a38a4e1..b5b75ade465887 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -16,6 +16,100 @@ #include #include +#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC +static const u8 blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; + +static inline void blake2s_increment_counter(struct blake2s_state *state, + const u32 inc) +{ + state->t[0] += inc; + state->t[1] += (state->t[0] < inc); +} + +void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) + __weak __alias(blake2s_compress_generic); + +void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) +{ + u32 m[16]; + u32 v[16]; + int i; + + WARN_ON(IS_ENABLED(DEBUG) && + (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE)); + + while (nblocks > 0) { + blake2s_increment_counter(state, inc); + memcpy(m, block, BLAKE2S_BLOCK_SIZE); + le32_to_cpu_array(m, ARRAY_SIZE(m)); + memcpy(v, state->h, 32); + v[ 8] = BLAKE2S_IV0; + v[ 9] = BLAKE2S_IV1; + v[10] = BLAKE2S_IV2; + v[11] = BLAKE2S_IV3; + v[12] = BLAKE2S_IV4 ^ state->t[0]; + v[13] = BLAKE2S_IV5 ^ state->t[1]; + v[14] = BLAKE2S_IV6 ^ state->f[0]; + v[15] = BLAKE2S_IV7 ^ state->f[1]; + +#define G(r, i, a, b, c, d) do { \ + a += b + m[blake2s_sigma[r][2 * i + 0]]; \ + d = ror32(d ^ a, 16); \ + c += d; \ + b = ror32(b ^ c, 12); \ + a += b + m[blake2s_sigma[r][2 * i + 1]]; \ + d = ror32(d ^ a, 8); \ + c += d; \ + b = ror32(b ^ c, 7); \ +} while (0) + +#define ROUND(r) do { \ + G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[2], v[ 6], v[10], v[14]); \ + G(r, 3, v[3], v[ 7], v[11], v[15]); \ + G(r, 4, v[0], v[ 5], v[10], v[15]); \ + G(r, 5, v[1], v[ 6], v[11], v[12]); \ + G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ +} while (0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + +#undef G +#undef ROUND + + for (i = 0; i < 8; ++i) + state->h[i] ^= v[i] ^ v[i + 8]; + + block += BLAKE2S_BLOCK_SIZE; + --nblocks; + } +} +EXPORT_SYMBOL(blake2s_compress_generic); +#endif /* CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC */ + static inline void blake2s_set_lastblock(struct blake2s_state *state) { state->f[0] = -1; From 39ee3970f26d55b57343da392d45117d7f893205 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:30 -0700 Subject: [PATCH 0604/3206] lib/crypto: blake2s: Consolidate into single C translation unit As was done with the other algorithms, reorganize the 
BLAKE2s code so that the generic implementation and the arch-specific "glue" code are consolidated into a single translation unit, letting the compiler inline the functions and automatically decide whether to include the generic code in the resulting binary. Similarly, also consolidate the build rules into lib/crypto/{Makefile,Kconfig}. This removes the last uses of lib/crypto/{arm,x86}/{Makefile,Kconfig}, so remove those too. Don't keep the !KMSAN dependency. It was needed only for other algorithms such as ChaCha that initialize memory from assembly code. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-12-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/internal/blake2s.h | 19 ------------- lib/crypto/Kconfig | 29 ++++---------------- lib/crypto/Makefile | 13 +++++---- lib/crypto/arm/Kconfig | 14 ---------- lib/crypto/arm/Makefile | 4 --- lib/crypto/arm/blake2s-core.S | 5 +++- lib/crypto/arm/blake2s-glue.c | 7 ----- lib/crypto/arm/blake2s.h | 5 ++++ lib/crypto/blake2s.c | 29 +++++++++++++------- lib/crypto/x86/Kconfig | 13 --------- lib/crypto/x86/Makefile | 4 --- lib/crypto/x86/{blake2s-glue.c => blake2s.h} | 16 ++++------- 12 files changed, 47 insertions(+), 111 deletions(-) delete mode 100644 include/crypto/internal/blake2s.h delete mode 100644 lib/crypto/arm/Kconfig delete mode 100644 lib/crypto/arm/Makefile delete mode 100644 lib/crypto/arm/blake2s-glue.c create mode 100644 lib/crypto/arm/blake2s.h delete mode 100644 lib/crypto/x86/Kconfig delete mode 100644 lib/crypto/x86/Makefile rename lib/crypto/x86/{blake2s-glue.c => blake2s.h} (83%) diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h deleted file mode 100644 index 3b82572c943375..00000000000000 --- a/include/crypto/internal/blake2s.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Helper functions for BLAKE2s implementations. - * Keep this in sync with the corresponding BLAKE2b header. - */ - -#ifndef _CRYPTO_INTERNAL_BLAKE2S_H -#define _CRYPTO_INTERNAL_BLAKE2S_H - -#include -#include - -void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc); - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc); - -#endif /* _CRYPTO_INTERNAL_BLAKE2S_H */ diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index c1db483bc23067..37d85e0c9b979b 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -28,21 +28,13 @@ config CRYPTO_LIB_ARC4 config CRYPTO_LIB_GF128MUL tristate -config CRYPTO_ARCH_HAVE_LIB_BLAKE2S - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the Blake2s library interface, - either builtin or as a module. +# BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option. -config CRYPTO_LIB_BLAKE2S_GENERIC - def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - This symbol can be depended upon by arch implementations of the - Blake2s library interface that require the generic code as a - fallback, e.g., for SIMD implementations. If no arch specific - implementation is enabled, this implementation serves the users - of CRYPTO_LIB_BLAKE2S.
+config CRYPTO_LIB_BLAKE2S_ARCH + bool + depends on !UML + default y if ARM + default y if X86_64 config CRYPTO_LIB_CHACHA tristate @@ -208,13 +200,4 @@ config CRYPTO_LIB_SM3 source "lib/crypto/tests/Kconfig" -if !KMSAN # avoid false positives from assembly -if ARM -source "lib/crypto/arm/Kconfig" -endif -if X86 -source "lib/crypto/x86/Kconfig" -endif -endif - endmenu diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index c9193f0604d91d..ad27c5bf99e11c 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -29,9 +29,15 @@ libarc4-y := arc4.o obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o +################################################################################ + # blake2s is used by the /dev/random driver which is always builtin -obj-y += libblake2s.o -libblake2s-y := blake2s.o +obj-y += blake2s.o +ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y) +CFLAGS_blake2s.o += -I$(src)/$(SRCARCH) +obj-$(CONFIG_ARM) += arm/blake2s-core.o +obj-$(CONFIG_X86) += x86/blake2s-core.o +endif ################################################################################ @@ -256,9 +262,6 @@ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o libsm3-y := sm3.o -obj-$(CONFIG_ARM) += arm/ -obj-$(CONFIG_X86) += x86/ - # clean-files must be defined unconditionally clean-files += arm/sha256-core.S arm/sha512-core.S clean-files += arm64/sha256-core.S arm64/sha512-core.S diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig deleted file mode 100644 index a5607ad079c4ff..00000000000000 --- a/lib/crypto/arm/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_BLAKE2S_ARM - def_bool y - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: arm - - This is faster than the generic implementations of BLAKE2s and - BLAKE2b, but slower than the NEON implementation of BLAKE2b. - There is no NEON implementation of BLAKE2s, since NEON doesn't - really help with it. diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile deleted file mode 100644 index 0574b0e9739e31..00000000000000 --- a/lib/crypto/arm/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o -libblake2s-arm-y := blake2s-core.o blake2s-glue.o diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S index df40e46601f100..293f44fa8f316f 100644 --- a/lib/crypto/arm/blake2s-core.S +++ b/lib/crypto/arm/blake2s-core.S @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * BLAKE2s digest algorithm, ARM scalar implementation + * BLAKE2s digest algorithm, ARM scalar implementation. This is faster + * than the generic implementations of BLAKE2s and BLAKE2b, but slower + * than the NEON implementation of BLAKE2b. There is no NEON + * implementation of BLAKE2s, since NEON doesn't really help with it. 
* * Copyright 2020 Google LLC * diff --git a/lib/crypto/arm/blake2s-glue.c b/lib/crypto/arm/blake2s-glue.c deleted file mode 100644 index 0238a70d9581e9..00000000000000 --- a/lib/crypto/arm/blake2s-glue.c +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include - -/* defined in blake2s-core.S */ -EXPORT_SYMBOL(blake2s_compress); diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h new file mode 100644 index 00000000000000..aa7a97139ea74a --- /dev/null +++ b/lib/crypto/arm/blake2s.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +/* defined in blake2s-core.S */ +void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, u32 inc); diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index b5b75ade465887..5638ed9d882d8b 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -8,7 +8,7 @@ * */ -#include +#include #include #include #include @@ -16,7 +16,6 @@ #include #include -#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC static const u8 blake2s_sigma[10][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, @@ -37,12 +36,9 @@ static inline void blake2s_increment_counter(struct blake2s_state *state, state->t[1] += (state->t[0] < inc); } -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) - __weak __alias(blake2s_compress_generic); - -void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) +static void __maybe_unused +blake2s_compress_generic(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) { u32 m[16]; u32 v[16]; @@ -107,8 +103,12 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, --nblocks; } } -EXPORT_SYMBOL(blake2s_compress_generic); -#endif /* CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC */ + +#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_ARCH +#include "blake2s.h" /* $(SRCARCH)/blake2s.h */ +#else +#define blake2s_compress blake2s_compress_generic +#endif static inline void blake2s_set_lastblock(struct blake2s_state *state) { @@ -152,5 +152,14 @@ void blake2s_final(struct blake2s_state *state, u8 *out) } EXPORT_SYMBOL(blake2s_final); +#ifdef blake2s_mod_init_arch +static int __init blake2s_mod_init(void) +{ + blake2s_mod_init_arch(); + return 0; +} +subsys_initcall(blake2s_mod_init); +#endif + MODULE_DESCRIPTION("BLAKE2s hash function"); MODULE_AUTHOR("Jason A. 
Donenfeld "); diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig deleted file mode 100644 index ffa718321369f7..00000000000000 --- a/lib/crypto/x86/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_BLAKE2S_X86 - def_bool y - depends on 64BIT - select CRYPTO_LIB_BLAKE2S_GENERIC - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX-512 (Advanced Vector Extensions-512) diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile deleted file mode 100644 index 4454556b243e71..00000000000000 --- a/lib/crypto/x86/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o -libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o diff --git a/lib/crypto/x86/blake2s-glue.c b/lib/crypto/x86/blake2s.h similarity index 83% rename from lib/crypto/x86/blake2s-glue.c rename to lib/crypto/x86/blake2s.h index adc296cd17c933..b6d30d2fa045e3 100644 --- a/lib/crypto/x86/blake2s-glue.c +++ b/lib/crypto/x86/blake2s.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ /* * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. */ @@ -7,8 +7,6 @@ #include #include #include -#include -#include #include #include #include @@ -23,8 +21,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) +static void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) { /* SIMD disables preemption, so relax after processing each page. */ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); @@ -49,9 +47,9 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block, block += blocks * BLAKE2S_BLOCK_SIZE; } while (nblocks); } -EXPORT_SYMBOL(blake2s_compress); -static int __init blake2s_mod_init(void) +#define blake2s_mod_init_arch blake2s_mod_init_arch +static void blake2s_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) static_branch_enable(&blake2s_use_ssse3); @@ -63,8 +61,4 @@ static int __init blake2s_mod_init(void) cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL)) static_branch_enable(&blake2s_use_avx512); - - return 0; } - -subsys_initcall(blake2s_mod_init); From 362f92286065d9f8282da5def89e173a12191568 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:31 -0700 Subject: [PATCH 0605/3206] lib/crypto: tests: Add KUnit tests for BLAKE2s Add a KUnit test suite for BLAKE2s. Most of the core test logic is in the previously-added hash-test-template.h. This commit just adds the actual KUnit suite, commits the generated test vectors to the tree so that gen-hash-testvecs.py won't have to be run at build time, and adds a few BLAKE2s-specific test cases. This is the replacement for blake2s-selftest, which an earlier commit removed. Improvements over blake2s-selftest include integration with KUnit, more comprehensive test cases, and support for benchmarking. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-13-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/tests/Kconfig | 10 ++ lib/crypto/tests/Makefile | 1 + lib/crypto/tests/blake2s-testvecs.h | 238 ++++++++++++++++++++++++++++ lib/crypto/tests/blake2s_kunit.c | 134 ++++++++++++++++ scripts/crypto/gen-hash-testvecs.py | 27 +++- 5 files changed, 407 insertions(+), 3 deletions(-) create mode 100644 lib/crypto/tests/blake2s-testvecs.h create mode 100644 lib/crypto/tests/blake2s_kunit.c diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig index c21d53fd4b0ce2..fd341aa12f1579 100644 --- a/lib/crypto/tests/Kconfig +++ b/lib/crypto/tests/Kconfig @@ -1,5 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-or-later +config CRYPTO_LIB_BLAKE2S_KUNIT_TEST + tristate "KUnit tests for BLAKE2s" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + # No need to select CRYPTO_LIB_BLAKE2S here, as that option doesn't + # exist; the BLAKE2s code is always built-in for the /dev/random driver. + help + KUnit tests for the BLAKE2s cryptographic hash function. + config CRYPTO_LIB_MD5_KUNIT_TEST tristate "KUnit tests for MD5" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile index f6f82c6f9cb5da..be7de929af2cc7 100644 --- a/lib/crypto/tests/Makefile +++ b/lib/crypto/tests/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later +obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o diff --git a/lib/crypto/tests/blake2s-testvecs.h b/lib/crypto/tests/blake2s-testvecs.h new file mode 100644 index 00000000000000..6f978b79a59b07 --- /dev/null +++ b/lib/crypto/tests/blake2s-testvecs.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2s */ + +static const struct { + size_t data_len; + u8 digest[BLAKE2S_HASH_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0x69, 0x21, 0x7a, 0x30, 0x79, 0x90, 0x80, 0x94, + 0xe1, 0x11, 0x21, 0xd0, 0x42, 0x35, 0x4a, 0x7c, + 0x1f, 0x55, 0xb6, 0x48, 0x2c, 0xa1, 0xa5, 0x1e, + 0x1b, 0x25, 0x0d, 0xfd, 0x1e, 0xd0, 0xee, 0xf9, + }, + }, + { + .data_len = 1, + .digest = { + 0x7c, 0xab, 0x53, 0xe2, 0x48, 0x87, 0xdf, 0x64, + 0x98, 0x6a, 0xc1, 0x7e, 0xf0, 0x01, 0x4d, 0xc9, + 0x07, 0x4f, 0xb8, 0x2f, 0x46, 0xd7, 0xee, 0xa9, + 0xad, 0xe5, 0xf8, 0x21, 0xac, 0xfe, 0x17, 0x58, + }, + }, + { + .data_len = 2, + .digest = { + 0x5e, 0x63, 0x2c, 0xd0, 0xf8, 0x7b, 0xf5, 0xae, + 0x61, 0x97, 0x94, 0x57, 0xc8, 0x76, 0x22, 0xd9, + 0x8b, 0x04, 0x5e, 0xf1, 0x5d, 0xd0, 0xfc, 0xd9, + 0x0c, 0x19, 0x2e, 0xe2, 0xc5, 0xd9, 0x73, 0x51, + }, + }, + { + .data_len = 3, + .digest = { + 0x33, 0x65, 0xa6, 0x37, 0xbf, 0xf8, 0x4f, 0x15, + 0x4c, 0xac, 0x9e, 0xa4, 0x3b, 0x02, 0x07, 0x0c, + 0x80, 0x86, 0x0d, 0x6c, 0xe4, 0xaf, 0x1c, 0xbc, + 0x0b, 0x9c, 0x0a, 0x98, 0xc2, 0x99, 0x71, 0xcd, + }, + }, + { + .data_len = 16, + .digest = { + 0x59, 0xd2, 0x10, 0xd3, 0x75, 0xac, 0x48, 0x32, + 0xb1, 0xea, 0xee, 0xcf, 0x0a, 0xd2, 0x8b, 0x15, + 0x5d, 0x72, 0x71, 0x4c, 0xa7, 0x29, 0xb0, 0x7a, + 0x44, 0x48, 0x8a, 0x54, 0x54, 0x54, 0x41, 0xf5, + }, + }, + { + .data_len = 32, + .digest = { + 0xdc, 0xfc, 0x46, 0x81, 0xc6, 0x1b, 0x2b, 0x47, + 0x8b, 0xed, 0xe0, 0x73, 0x34, 0x38, 
0x53, 0x92, + 0x97, 0x2f, 0xfb, 0x51, 0xab, 0x4f, 0x2d, 0x9d, + 0x69, 0x04, 0xa9, 0x5d, 0x33, 0xef, 0xcb, 0x1c, + }, + }, + { + .data_len = 48, + .digest = { + 0xd6, 0x2a, 0x7f, 0x96, 0x04, 0x4d, 0x16, 0xc8, + 0x49, 0xe0, 0x37, 0x33, 0xe3, 0x7b, 0x34, 0x56, + 0x99, 0xc5, 0x78, 0x57, 0x06, 0x02, 0xb4, 0xea, + 0x80, 0xc4, 0xf8, 0x8f, 0x8d, 0x2b, 0xe4, 0x05, + }, + }, + { + .data_len = 49, + .digest = { + 0x8b, 0x58, 0x62, 0xb5, 0x85, 0xf6, 0x83, 0x36, + 0xf5, 0x34, 0xb8, 0xd4, 0xbc, 0x5c, 0x8b, 0x38, + 0xfd, 0x15, 0xcd, 0x44, 0x83, 0x25, 0x71, 0xe1, + 0xd5, 0xe8, 0xa1, 0xa4, 0x36, 0x98, 0x7e, 0x68, + }, + }, + { + .data_len = 63, + .digest = { + 0x7e, 0xeb, 0x06, 0x87, 0xdf, 0x1a, 0xdc, 0xe5, + 0xfb, 0x64, 0xd4, 0xd1, 0x5d, 0x9e, 0x75, 0xc0, + 0xb9, 0xad, 0x55, 0x6c, 0xe6, 0xba, 0x4d, 0x98, + 0x2f, 0xbf, 0x72, 0xad, 0x61, 0x37, 0xf6, 0x11, + }, + }, + { + .data_len = 64, + .digest = { + 0x72, 0xdb, 0x43, 0x16, 0x57, 0x8e, 0x3a, 0x96, + 0xf3, 0x98, 0x19, 0x24, 0x17, 0x3b, 0xe8, 0xad, + 0xa1, 0x9b, 0xa4, 0x1b, 0x74, 0x85, 0x2e, 0x24, + 0x70, 0xea, 0x31, 0x5a, 0x1c, 0xbe, 0x43, 0xb5, + }, + }, + { + .data_len = 65, + .digest = { + 0x32, 0x48, 0xb0, 0xf0, 0x3f, 0xbb, 0xd2, 0xa3, + 0xfd, 0xf6, 0x28, 0x4a, 0x2a, 0xc5, 0xbe, 0x4b, + 0x73, 0x50, 0x63, 0xd6, 0x16, 0x00, 0xef, 0xed, + 0xfe, 0x97, 0x41, 0x29, 0xb2, 0x84, 0xc4, 0xa3, + }, + }, + { + .data_len = 127, + .digest = { + 0x17, 0xda, 0x6b, 0x96, 0x6a, 0xa6, 0xa4, 0xa6, + 0xa6, 0xf3, 0x9d, 0x18, 0x19, 0x8d, 0x98, 0x7c, + 0x66, 0x38, 0xe8, 0x99, 0xe7, 0x0a, 0x50, 0x92, + 0xaf, 0x11, 0x80, 0x05, 0x66, 0xed, 0xab, 0x74, + }, + }, + { + .data_len = 128, + .digest = { + 0x13, 0xd5, 0x8b, 0x22, 0xae, 0x90, 0x7b, 0x67, + 0x87, 0x4e, 0x3c, 0x35, 0x4e, 0x01, 0xf0, 0xb1, + 0xd3, 0xd1, 0x67, 0xbb, 0x43, 0xdb, 0x7c, 0x75, + 0xa4, 0xc7, 0x64, 0x83, 0x1e, 0x9b, 0x98, 0xad, + }, + }, + { + .data_len = 129, + .digest = { + 0x6f, 0xe0, 0x5d, 0x9d, 0xd5, 0x78, 0x29, 0xfb, + 0xd0, 0x77, 0xd1, 0x8a, 0xf0, 0x80, 0xcb, 0x81, + 0x71, 0x9e, 0x4d, 0x49, 0xde, 0x74, 0x2a, 0x37, + 0xc0, 0xd5, 0xf0, 0xfa, 0x50, 0xe6, 0x23, 0xfe, + }, + }, + { + .data_len = 256, + .digest = { + 0x89, 0xac, 0xf6, 0xe7, 0x5e, 0xba, 0x53, 0xf4, + 0x92, 0x32, 0xd5, 0x64, 0xfb, 0xc4, 0x08, 0xac, + 0x2c, 0x19, 0x6e, 0x63, 0x13, 0x75, 0xd0, 0x60, + 0x54, 0x35, 0x82, 0xc4, 0x6d, 0x03, 0x1a, 0x05, + }, + }, + { + .data_len = 511, + .digest = { + 0x1c, 0xaf, 0x94, 0x7d, 0x9c, 0xce, 0x57, 0x64, + 0xf8, 0xa8, 0x25, 0x45, 0x32, 0x86, 0x2b, 0x04, + 0xb3, 0x2e, 0x67, 0xca, 0x73, 0x04, 0x2f, 0xab, + 0xcc, 0xda, 0x9e, 0x42, 0xa1, 0xaf, 0x83, 0x5a, + }, + }, + { + .data_len = 513, + .digest = { + 0x21, 0xdf, 0xdc, 0x29, 0xd9, 0xfc, 0x7b, 0xe7, + 0x3a, 0xc4, 0xe1, 0x61, 0xc5, 0xb5, 0xe1, 0xee, + 0x7a, 0x9d, 0x0c, 0x66, 0x36, 0x63, 0xe4, 0x12, + 0x62, 0xe2, 0xf5, 0x68, 0x72, 0xfc, 0x1e, 0x18, + }, + }, + { + .data_len = 1000, + .digest = { + 0x6e, 0xc7, 0x2e, 0xac, 0xd0, 0xbb, 0x22, 0xe0, + 0xc2, 0x40, 0xb2, 0xfe, 0x8c, 0xaf, 0x9e, 0xcf, + 0x32, 0x06, 0xc6, 0x45, 0x29, 0xbd, 0xe0, 0x7f, + 0x53, 0x32, 0xc3, 0x2b, 0x2f, 0x68, 0x12, 0xcd, + }, + }, + { + .data_len = 3333, + .digest = { + 0x76, 0xba, 0x52, 0xb5, 0x09, 0xf5, 0x19, 0x09, + 0x70, 0x1c, 0x09, 0x28, 0xb4, 0xaa, 0x98, 0x6a, + 0x79, 0xe7, 0x5e, 0xcd, 0xe8, 0xa4, 0x73, 0x69, + 0x1f, 0xf8, 0x05, 0x0a, 0xb4, 0xfe, 0xf9, 0x63, + }, + }, + { + .data_len = 4096, + .digest = { + 0xf7, 0xad, 0xf9, 0xc8, 0x0e, 0x04, 0x2f, 0xdf, + 0xbe, 0x39, 0x79, 0x07, 0x0d, 0xd8, 0x1b, 0x06, + 0x42, 0x3a, 0x43, 0x93, 0xf6, 0x7c, 0xc4, 0xe5, + 0xc2, 0xd5, 0xd0, 0xa6, 0x35, 
0x6c, 0xbd, 0x17, + }, + }, + { + .data_len = 4128, + .digest = { + 0x38, 0xd7, 0xab, 0x7e, 0x08, 0xdc, 0x1e, 0xab, + 0x55, 0xbb, 0x3b, 0x7b, 0x6a, 0x17, 0xcc, 0x79, + 0xa7, 0x02, 0x62, 0x66, 0x9b, 0xca, 0xee, 0xc0, + 0x3d, 0x75, 0x34, 0x2e, 0x55, 0x82, 0x26, 0x3c, + }, + }, + { + .data_len = 4160, + .digest = { + 0xf7, 0xeb, 0x2f, 0x24, 0x98, 0x54, 0x04, 0x5a, + 0x19, 0xe4, 0x12, 0x9d, 0x97, 0xbc, 0x87, 0xa5, + 0x0b, 0x85, 0x29, 0xa1, 0x36, 0x89, 0xc9, 0xba, + 0xa0, 0xe0, 0xac, 0x99, 0x7d, 0xa4, 0x51, 0x9f, + }, + }, + { + .data_len = 4224, + .digest = { + 0x8f, 0xe8, 0xa7, 0x79, 0x02, 0xbb, 0x4a, 0x56, + 0x66, 0x91, 0xef, 0x22, 0xd1, 0x09, 0x26, 0x6c, + 0xa9, 0x13, 0xd7, 0x44, 0xc7, 0x19, 0x9c, 0x0b, + 0xfb, 0x4f, 0xca, 0x72, 0x8f, 0x34, 0xf7, 0x82, + }, + }, + { + .data_len = 16384, + .digest = { + 0xaa, 0x21, 0xbb, 0x25, 0x4b, 0x66, 0x6e, 0x29, + 0x71, 0xc1, 0x44, 0x67, 0x19, 0xed, 0xe6, 0xe6, + 0x61, 0x13, 0xf4, 0xb7, 0x02, 0x94, 0x81, 0x0f, + 0xa7, 0x4d, 0xbb, 0x2c, 0xb8, 0xeb, 0x41, 0x0e, + }, + }, +}; + +static const u8 hash_testvec_consolidated[BLAKE2S_HASH_SIZE] = { + 0x84, 0x21, 0xbb, 0x73, 0x64, 0x47, 0x45, 0xe0, + 0xc1, 0x83, 0x78, 0xf1, 0xea, 0xe5, 0xfd, 0xdb, + 0x01, 0xda, 0xb7, 0x86, 0x70, 0x3b, 0x83, 0xb3, + 0xbc, 0xd9, 0xfd, 0x96, 0xbd, 0x50, 0x06, 0x67, +}; + +static const u8 blake2s_keyed_testvec_consolidated[BLAKE2S_HASH_SIZE] = { + 0xa6, 0xad, 0xcd, 0xb8, 0xd9, 0xdd, 0xc7, 0x70, + 0x07, 0x09, 0x7f, 0x9f, 0x41, 0xa9, 0x70, 0xa4, + 0x1c, 0xca, 0x61, 0xbb, 0x58, 0xb5, 0xb2, 0x1d, + 0xd1, 0x71, 0x16, 0xb0, 0x49, 0x4f, 0x9e, 0x1b, +}; diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c new file mode 100644 index 00000000000000..057c40132246f1 --- /dev/null +++ b/lib/crypto/tests/blake2s_kunit.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include +#include "blake2s-testvecs.h" + +/* + * The following are compatibility functions that present BLAKE2s as an unkeyed + * hash function that produces hashes of fixed length BLAKE2S_HASH_SIZE, so that + * hash-test-template.h can be reused to test it. + */ + +static void blake2s_default(const u8 *data, size_t len, + u8 out[BLAKE2S_HASH_SIZE]) +{ + blake2s(out, data, NULL, BLAKE2S_HASH_SIZE, len, 0); +} + +static void blake2s_init_default(struct blake2s_state *state) +{ + blake2s_init(state, BLAKE2S_HASH_SIZE); +} + +/* + * Generate the HASH_KUNIT_CASES using hash-test-template.h. These test BLAKE2s + * with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE. + */ +#define HASH blake2s_default +#define HASH_CTX blake2s_state +#define HASH_SIZE BLAKE2S_HASH_SIZE +#define HASH_INIT blake2s_init_default +#define HASH_UPDATE blake2s_update +#define HASH_FINAL blake2s_final +#include "hash-test-template.h" + +/* + * BLAKE2s specific test case which tests all possible combinations of key + * length and hash length. 
+ */ +static void test_blake2s_all_key_and_hash_lens(struct kunit *test) +{ + const size_t data_len = 100; + u8 *data = &test_buf[0]; + u8 *key = data + data_len; + u8 *hash = key + BLAKE2S_KEY_SIZE; + struct blake2s_state main_state; + u8 main_hash[BLAKE2S_HASH_SIZE]; + + rand_bytes_seeded_from_len(data, data_len); + blake2s_init(&main_state, BLAKE2S_HASH_SIZE); + for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) { + rand_bytes_seeded_from_len(key, key_len); + for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) { + blake2s(hash, data, key, out_len, data_len, key_len); + blake2s_update(&main_state, hash, out_len); + } + } + blake2s_final(&main_state, main_hash); + KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated, + BLAKE2S_HASH_SIZE); +} + +/* + * BLAKE2s specific test case which tests using a guarded buffer for all allowed + * key lengths. Also tests both blake2s() and blake2s_init_key(). + */ +static void test_blake2s_with_guarded_key_buf(struct kunit *test) +{ + const size_t data_len = 100; + + rand_bytes(test_buf, data_len); + for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) { + u8 key[BLAKE2S_KEY_SIZE]; + u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len]; + u8 hash1[BLAKE2S_HASH_SIZE]; + u8 hash2[BLAKE2S_HASH_SIZE]; + struct blake2s_state state; + + rand_bytes(key, key_len); + memcpy(guarded_key, key, key_len); + + blake2s(hash1, test_buf, key, + BLAKE2S_HASH_SIZE, data_len, key_len); + blake2s(hash2, test_buf, guarded_key, + BLAKE2S_HASH_SIZE, data_len, key_len); + KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE); + + blake2s_init_key(&state, BLAKE2S_HASH_SIZE, + guarded_key, key_len); + blake2s_update(&state, test_buf, data_len); + blake2s_final(&state, hash2); + KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE); + } +} + +/* + * BLAKE2s specific test case which tests using a guarded output buffer for all + * allowed output lengths. 
+ */ +static void test_blake2s_with_guarded_out_buf(struct kunit *test) +{ + const size_t data_len = 100; + + rand_bytes(test_buf, data_len); + for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) { + u8 hash[BLAKE2S_HASH_SIZE]; + u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len]; + + blake2s(hash, test_buf, NULL, out_len, data_len, 0); + blake2s(guarded_hash, test_buf, NULL, out_len, data_len, 0); + KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len); + } +} + +static struct kunit_case blake2s_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(test_blake2s_all_key_and_hash_lens), + KUNIT_CASE(test_blake2s_with_guarded_key_buf), + KUNIT_CASE(test_blake2s_with_guarded_out_buf), + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite blake2s_test_suite = { + .name = "blake2s", + .test_cases = blake2s_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(blake2s_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2s"); +MODULE_LICENSE("GPL"); diff --git a/scripts/crypto/gen-hash-testvecs.py b/scripts/crypto/gen-hash-testvecs.py index 4ac927d40cf5c3..fc063f2ee95f1e 100755 --- a/scripts/crypto/gen-hash-testvecs.py +++ b/scripts/crypto/gen-hash-testvecs.py @@ -84,11 +84,16 @@ def print_c_struct_u8_array_field(name, value): print_bytes('\t\t\t', value, 8) print('\t\t},') +def alg_digest_size_const(alg): + if alg == 'blake2s': + return 'BLAKE2S_HASH_SIZE' + return f'{alg.upper()}_DIGEST_SIZE' + def gen_unkeyed_testvecs(alg): print('') print('static const struct {') print('\tsize_t data_len;') - print(f'\tu8 digest[{alg.upper()}_DIGEST_SIZE];') + print(f'\tu8 digest[{alg_digest_size_const(alg)}];') print('} hash_testvecs[] = {') for data_len in DATA_LENS: data = rand_bytes(data_len) @@ -103,7 +108,7 @@ def gen_unkeyed_testvecs(alg): for data_len in range(len(data) + 1): hash_update(ctx, compute_hash(alg, data[:data_len])) print_static_u8_array_definition( - f'hash_testvec_consolidated[{alg.upper()}_DIGEST_SIZE]', + f'hash_testvec_consolidated[{alg_digest_size_const(alg)}]', hash_final(ctx)) def gen_hmac_testvecs(alg): @@ -119,6 +124,20 @@ def gen_hmac_testvecs(alg): f'hmac_testvec_consolidated[{alg.upper()}_DIGEST_SIZE]', ctx.digest()) +BLAKE2S_KEY_SIZE = 32 +BLAKE2S_HASH_SIZE = 32 + +def gen_additional_blake2s_testvecs(): + hashes = b'' + for key_len in range(BLAKE2S_KEY_SIZE + 1): + for out_len in range(1, BLAKE2S_HASH_SIZE + 1): + h = hashlib.blake2s(digest_size=out_len, key=rand_bytes(key_len)) + h.update(rand_bytes(100)) + hashes += h.digest() + print_static_u8_array_definition( + 'blake2s_keyed_testvec_consolidated[BLAKE2S_HASH_SIZE]', + compute_hash('blake2s', hashes)) + def gen_additional_poly1305_testvecs(): key = b'\xff' * POLY1305_KEY_SIZE data = b'' @@ -141,7 +160,9 @@ def gen_additional_poly1305_testvecs(): print('/* SPDX-License-Identifier: GPL-2.0-or-later */') print(f'/* This file was generated by: {sys.argv[0]} {" ".join(sys.argv[1:])} */') gen_unkeyed_testvecs(alg) -if alg == 'poly1305': +if alg == 'blake2s': + gen_additional_blake2s_testvecs() +elif alg == 'poly1305': gen_additional_poly1305_testvecs() else: gen_hmac_testvecs(alg) From 31f1a960ad1a14def94fa0b8c25d62b4c032813f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Aug 2025 09:02:16 -0700 Subject: [PATCH 0606/3206] NFSv4: Don't clear capabilities that won't be reset Don't clear the capabilities that are not going to get reset by the call to _nfs4_server_capabilities(). 
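The invariant this patch and the three that follow converge on can be summarized in a condensed sketch; the helper names below are hypothetical and the logic is deliberately simplified, not the actual kernel code:

#include <linux/nfs_fs_sb.h>	/* struct nfs_server, NFS_CAP_* */

static bool server_advertises_acls(struct nfs_server *server);		/* hypothetical */
static bool server_advertises_fs_locations(struct nfs_server *server);	/* hypothetical */

/* Simplified sketch: a probe may pre-clear only what it re-derives. */
static void probe_caps_sketch(struct nfs_server *server)
{
	server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_FS_LOCATIONS);

	if (server_advertises_acls(server))
		server->caps |= NFS_CAP_ACLS;
	if (server_advertises_fs_locations(server))
		server->caps |= NFS_CAP_FS_LOCATIONS;

	/*
	 * Capabilities derived elsewhere, e.g. NFS_CAP_XATTR which
	 * nfs_server_set_fsinfo() sets or clears from fsinfo, must not
	 * be touched here.
	 */
}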
Reported-by: Scott Haiden Fixes: b01f21cacde9 ("NFS: Fix the setting of capabilities when automounting a new filesystem") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7d2b67e06cc37f..5b92fcf45dd7d9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4092,7 +4092,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) }; int err; - nfs_server_set_init_caps(server); do { err = nfs4_handle_exception(server, _nfs4_server_capabilities(server, fhandle), From dd5a8621b886b02f8341c5d4ea68eb2c552ebd3e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Aug 2025 09:07:22 -0700 Subject: [PATCH 0607/3206] NFSv4: Clear the NFS_CAP_FS_LOCATIONS flag if it is not set _nfs4_server_capabilities() is expected to clear any flags that are not supported by the server. Fixes: 8a59bb93b7e3 ("NFSv4 store server support for fs_location attribute") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5b92fcf45dd7d9..d0f91d9430f6d0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4013,8 +4013,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f res.attr_bitmask[2]; } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); - server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | - NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL); + server->caps &= + ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS | + NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS); server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) From b3ac33436030bce37ecb3dcae581ecfaad28078c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Aug 2025 09:12:30 -0700 Subject: [PATCH 0608/3206] NFSv4: Clear NFS_CAP_OPEN_XOR and NFS_CAP_DELEGTIME if not supported _nfs4_server_capabilities() should clear capabilities that are not supported by the server. Fixes: d2a00cceb93a ("NFSv4: Detect support for OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d0f91d9430f6d0..ce61253efd45b4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4015,7 +4015,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS | - NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS); + NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS | + NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME); server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) From 4fb2b677fc1f70ee642c0beecc3cabf226ef5707 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Aug 2025 09:15:12 -0700 Subject: [PATCH 0609/3206] NFSv4: Clear the NFS_CAP_XATTR flag if not supported by the server nfs_server_set_fsinfo() shouldn't assume that NFS_CAP_XATTR is unset on entry to the function. 
Fixes: b78ef845c35d ("NFSv4.2: query the server for extended attribute support") Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8fb4a950dd5581..4e3dcc157a83c8 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -888,6 +888,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; + else + server->caps &= ~NFS_CAP_XATTR; #endif } From 8a68d64bb10334426834e8c273319601878e961e Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 29 Aug 2025 15:28:52 -0700 Subject: [PATCH 0610/3206] x86/vmscape: Add old Intel CPUs to affected list These old CPUs are not tested against VMSCAPE, but are likely vulnerable. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen --- arch/x86/kernel/cpu/common.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2b87c93e660963..f98ec9c7fc07fe 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1240,15 +1240,18 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define VMSCAPE BIT(11) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { - VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), From 2b31c1c7134a316a0c97da5d7bdbe15552101db3 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 12 Aug 2025 15:33:34 -0500 Subject: [PATCH 0611/3206] dt-bindings: pinctrl: Convert brcm,bcm2835-gpio to DT schema Convert the Broadcom BCM2835 GPIO (and pinmux) controller binding to DT schema format. The structure of the child nodes wasn't well defined. The schema is based on the .dts users. The legacy binding is a single level of child nodes while the standard binding is 2 levels of child nodes. The "all banks" interrupt is treated as optional following actual users. 
Signed-off-by: Rob Herring (Arm) Reviewed-by: Linus Walleij Link: https://lore.kernel.org/20250812203337.731648-1-robh@kernel.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/brcm,bcm2835-gpio.txt | 99 --------------- .../bindings/pinctrl/brcm,bcm2835-gpio.yaml | 120 ++++++++++++++++++ 2 files changed, 120 insertions(+), 99 deletions(-) delete mode 100644 Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt create mode 100644 Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.yaml diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt deleted file mode 100644 index 5682b2010e5009..00000000000000 --- a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt +++ /dev/null @@ -1,99 +0,0 @@ -Broadcom BCM2835 GPIO (and pinmux) controller - -The BCM2835 GPIO module is a combined GPIO controller, (GPIO) interrupt -controller, and pinmux/control device. - -Required properties: -- compatible: "brcm,bcm2835-gpio" -- compatible: should be one of: - "brcm,bcm2835-gpio" - BCM2835 compatible pinctrl - "brcm,bcm7211-gpio" - BCM7211 compatible pinctrl - "brcm,bcm2711-gpio" - BCM2711 compatible pinctrl - "brcm,bcm7211-gpio" - BCM7211 compatible pinctrl -- reg: Should contain the physical address of the GPIO module's registers. -- gpio-controller: Marks the device node as a GPIO controller. -- #gpio-cells : Should be two. The first cell is the pin number and the - second cell is used to specify optional parameters: - - bit 0 specifies polarity (0 for normal, 1 for inverted) -- interrupts : The interrupt outputs from the controller. One interrupt per - individual bank followed by the "all banks" interrupt. For BCM7211, an - additional set of per-bank interrupt line and an "all banks" wake-up - interrupt may be specified. -- interrupt-controller: Marks the device node as an interrupt controller. -- #interrupt-cells : Should be 2. - The first cell is the GPIO number. - The second cell is used to specify flags: - bits[3:0] trigger type and level flags: - 1 = low-to-high edge triggered. - 2 = high-to-low edge triggered. - 4 = active high level-sensitive. - 8 = active low level-sensitive. - Valid combinations are 1, 2, 3, 4, 8. - -Please refer to ../gpio/gpio.txt for a general description of GPIO bindings. - -Please refer to pinctrl-bindings.txt in this directory for details of the -common pinctrl bindings used by client devices, including the meaning of the -phrase "pin configuration node". - -Each pin configuration node lists the pin(s) to which it applies, and one or -more of the mux function to select on those pin(s), and pull-up/down -configuration. Each subnode only affects those parameters that are explicitly -listed. In other words, a subnode that lists only a mux function implies no -information about any pull configuration. Similarly, a subnode that lists only -a pul parameter implies no information about the mux function. - -The BCM2835 pin configuration and multiplexing supports the generic bindings. -For details on each properties, you can refer to ./pinctrl-bindings.txt. - -Required sub-node properties: - - pins - - function - -Optional sub-node properties: - - bias-disable - - bias-pull-up - - bias-pull-down - - output-high - - output-low - -Legacy pin configuration and multiplexing binding: -*** (Its use is deprecated, use generic multiplexing and configuration -bindings instead) - -Required subnode-properties: -- brcm,pins: An array of cells. 
Each cell contains the ID of a pin. Valid IDs - are the integer GPIO IDs; 0==GPIO0, 1==GPIO1, ... 53==GPIO53. - -Optional subnode-properties: -- brcm,function: Integer, containing the function to mux to the pin(s): - 0: GPIO in - 1: GPIO out - 2: alt5 - 3: alt4 - 4: alt0 - 5: alt1 - 6: alt2 - 7: alt3 -- brcm,pull: Integer, representing the pull-down/up to apply to the pin(s): - 0: none - 1: down - 2: up - -Each of brcm,function and brcm,pull may contain either a single value which -will be applied to all pins in brcm,pins, or 1 value for each entry in -brcm,pins. - -Example: - - gpio: gpio { - compatible = "brcm,bcm2835-gpio"; - reg = <0x2200000 0xb4>; - interrupts = <2 17>, <2 19>, <2 18>, <2 20>; - - gpio-controller; - #gpio-cells = <2>; - - interrupt-controller; - #interrupt-cells = <2>; - }; diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.yaml b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.yaml new file mode 100644 index 00000000000000..6514f347f6bc6a --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.yaml @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/brcm,bcm2835-gpio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Broadcom BCM2835 GPIO (and pinmux) controller + +maintainers: + - Florian Fainelli + +description: > + The BCM2835 GPIO module is a combined GPIO controller, (GPIO) interrupt + controller, and pinmux/control device. + +properties: + compatible: + enum: + - brcm,bcm2835-gpio + - brcm,bcm2711-gpio + - brcm,bcm7211-gpio + + reg: + maxItems: 1 + + '#gpio-cells': + const: 2 + + gpio-controller: true + gpio-ranges: true + gpio-line-names: true + + interrupts: + description: > + Interrupt outputs: one per bank, then the combined “all banks” line. + BCM7211 may specify up to four per-bank wake-up lines and one combined + wake-up interrupt. + minItems: 4 + maxItems: 10 + + '#interrupt-cells': + const: 2 + + interrupt-controller: true + +additionalProperties: + oneOf: + - type: object + additionalProperties: false + + patternProperties: + '^pins?-': + type: object + allOf: + - $ref: /schemas/pinctrl/pincfg-node.yaml# + - $ref: /schemas/pinctrl/pinmux-node.yaml# + additionalProperties: false + + properties: + pins: true + function: true + bias-disable: true + bias-pull-up: true + bias-pull-down: true + output-high: true + output-low: true + + required: + - pins + - function + + - type: object + additionalProperties: false + deprecated: true + + properties: + brcm,pins: + description: + GPIO pin numbers for legacy configuration. + $ref: /schemas/types.yaml#/definitions/uint32-array + + brcm,function: + description: + Legacy mux function for the pins (0=input, 1=output, 2–7=alt functions). + $ref: /schemas/types.yaml#/definitions/uint32-array + maximum: 7 + + brcm,pull: + description: > + Legacy pull setting for the pins (0=none, 1=pull-down, 2=pull-up). 
+ $ref: /schemas/types.yaml#/definitions/uint32-array + maximum: 2 + + required: + - brcm,pins + +allOf: + - if: + properties: + compatible: + contains: + enum: + - brcm,bcm2835-gpio + - brcm,bcm2711-gpio + then: + properties: + interrupts: + maxItems: 5 + +examples: + - | + gpio@2200000 { + compatible = "brcm,bcm2835-gpio"; + reg = <0x2200000 0xb4>; + interrupts = <2 17>, <2 19>, <2 18>, <2 20>, <2 21>; + #gpio-cells = <2>; + gpio-controller; + #interrupt-cells = <2>; + interrupt-controller; + }; From 8898cf86f043468d285a474d1614e5cf1a170b16 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 12 Aug 2025 15:33:47 -0500 Subject: [PATCH 0612/3206] dt-bindings: pinctrl: Convert brcm,iproc-gpio to DT schema Convert the Broadcom iProc/Cygnus GPIO/Pinconf binding to DT schema format. The child node structure is based on the example, as there aren't any actual .dts files with child nodes. The binding wasn't clear that "reg" can be 1 or 2 entries. The number of "reg" entries doesn't appear to be based on compatible, so there are no per-compatible constraints for it. The "brcm,iproc-stingray-gpio" could possibly be dropped. There are no .dts files using it, but the driver uses it. Signed-off-by: Rob Herring (Arm) Reviewed-by: Linus Walleij Link: https://lore.kernel.org/20250812203348.733749-1-robh@kernel.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/brcm,iproc-gpio.txt | 123 ------------------ .../bindings/pinctrl/brcm,iproc-gpio.yaml | 111 ++++++++++++++++ 2 files changed, 111 insertions(+), 123 deletions(-) delete mode 100644 Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt create mode 100644 Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.yaml diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt b/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt deleted file mode 100644 index a73cbeb0f309de..00000000000000 --- a/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt +++ /dev/null @@ -1,123 +0,0 @@ -Broadcom iProc GPIO/PINCONF Controller - -Required properties: - -- compatible: - "brcm,iproc-gpio" for the generic iProc based GPIO controller IP that - supports full-featured pinctrl and GPIO functions used in various iProc - based SoCs - - May contain an SoC-specific compatibility string to accommodate any - SoC-specific features - - "brcm,cygnus-ccm-gpio", "brcm,cygnus-asiu-gpio", or - "brcm,cygnus-crmu-gpio" for Cygnus SoCs - - "brcm,iproc-nsp-gpio" for the iProc NSP SoC that has drive strength support - disabled - - "brcm,iproc-stingray-gpio" for the iProc Stingray SoC that has the general - pinctrl support completely disabled in this IP block. In Stingray, a - different IP block is used to handle pinctrl related functions - -- reg: - Define the base and range of the I/O address space that contains SoC -GPIO/PINCONF controller registers - -- ngpios: - Total number of in-use slots in GPIO controller - -- #gpio-cells: - Must be two. The first cell is the GPIO pin number (within the -controller's pin space) and the second cell is used for the following: - bit[0]: polarity (0 for active high and 1 for active low) - -- gpio-controller: - Specifies that the node is a GPIO controller - -Optional properties: - -- interrupts: - Interrupt ID - -- interrupt-controller: - Specifies that the node is an interrupt controller - -- gpio-ranges: - Specifies the mapping between gpio controller and pin-controllers pins. - This requires 4 fields in cells defined as - - 1. Phandle of pin-controller. - 2. GPIO base pin offset.
- 3 Pin-control base pin offset. - 4. number of gpio pins which are linearly mapped from pin base. - -Supported generic PINCONF properties in child nodes: - -- pins: - The list of pins (within the controller's own pin space) that properties -in the node apply to. Pin names are "gpio-" - -- bias-disable: - Disable pin bias - -- bias-pull-up: - Enable internal pull up resistor - -- bias-pull-down: - Enable internal pull down resistor - -- drive-strength: - Valid drive strength values include 2, 4, 6, 8, 10, 12, 14, 16 (mA) - -Example: - gpio_ccm: gpio@1800a000 { - compatible = "brcm,cygnus-ccm-gpio"; - reg = <0x1800a000 0x50>, - <0x0301d164 0x20>; - ngpios = <24>; - #gpio-cells = <2>; - gpio-controller; - interrupts = ; - interrupt-controller; - - touch_pins: touch_pins { - pwr: pwr { - pins = "gpio-0"; - drive-strength = <16>; - }; - - event: event { - pins = "gpio-1"; - bias-pull-up; - }; - }; - }; - - gpio_asiu: gpio@180a5000 { - compatible = "brcm,cygnus-asiu-gpio"; - reg = <0x180a5000 0x668>; - ngpios = <146>; - #gpio-cells = <2>; - gpio-controller; - interrupts = ; - interrupt-controller; - gpio-ranges = <&pinctrl 0 42 1>, - <&pinctrl 1 44 3>; - }; - - /* - * Touchscreen that uses the CCM GPIO 0 and 1 - */ - tsc { - ... - ... - gpio-pwr = <&gpio_ccm 0 0>; - gpio-event = <&gpio_ccm 1 0>; - }; - - /* Bluetooth that uses the ASIU GPIO 5, with polarity inverted */ - bluetooth { - ... - ... - bcm,rfkill-bank-sel = <&gpio_asiu 5 1> - } diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.yaml b/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.yaml new file mode 100644 index 00000000000000..a0ed308b7fc848 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.yaml @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/brcm,iproc-gpio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Broadcom iProc GPIO/PINCONF Controller + +maintainers: + - Ray Jui + - Scott Branden + +properties: + compatible: + oneOf: + - enum: + - brcm,cygnus-asiu-gpio + - brcm,cygnus-ccm-gpio + - brcm,cygnus-crmu-gpio + - brcm,iproc-gpio + - brcm,iproc-stingray-gpio + - items: + - enum: + - brcm,iproc-hr2-gpio + - brcm,iproc-nsp-gpio + - const: brcm,iproc-gpio + + reg: + minItems: 1 + items: + - description: GPIO Bank registers + - description: IO Ctrl registers + + "#gpio-cells": + const: 2 + + gpio-controller: true + + gpio-ranges: true + + ngpios: true + + "#interrupt-cells": + const: 2 + + interrupts: + maxItems: 1 + + interrupt-controller: true + +required: + - compatible + - reg + - "#gpio-cells" + - gpio-controller + - ngpios + +patternProperties: + '-pins$': + type: object + additionalProperties: + description: Pin configuration child nodes. 
+ allOf: + - $ref: pincfg-node.yaml# + - $ref: pinmux-node.yaml# + additionalProperties: false + + properties: + pins: + items: + pattern: '^gpio-' + + bias-disable: true + bias-pull-up: true + bias-pull-down: true + + drive-strength: + enum: [ 2, 4, 6, 8, 10, 12, 14, 16 ] + + required: + - pins + +additionalProperties: false + +examples: + - | + #include + + gpio@1800a000 { + compatible = "brcm,cygnus-ccm-gpio"; + reg = <0x1800a000 0x50>, + <0x0301d164 0x20>; + ngpios = <24>; + #gpio-cells = <2>; + gpio-controller; + #interrupt-cells = <2>; + interrupts = ; + interrupt-controller; + + touch-pins { + pwr { + pins = "gpio-0"; + drive-strength = <16>; + }; + + event { + pins = "gpio-1"; + bias-pull-up; + }; + }; + }; From 0a561e3904a92492fee8e02a9f69276e939fd990 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:56 -0700 Subject: [PATCH 0613/3206] audit: create audit_stamp structure Replace the timestamp and serial number pair used in audit records with a structure containing the two elements. Signed-off-by: Casey Schaufler [PM: subj tweak] Signed-off-by: Paul Moore --- kernel/audit.c | 17 +++++++++-------- kernel/audit.h | 13 +++++++++---- kernel/auditsc.c | 22 +++++++++------------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index 61b5744d0bb684..547967cb4266b9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1833,11 +1833,11 @@ unsigned int audit_serial(void) } static inline void audit_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial) + struct audit_stamp *stamp) { - if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - ktime_get_coarse_real_ts64(t); - *serial = audit_serial(); + if (!ctx || !auditsc_get_stamp(ctx, stamp)) { + ktime_get_coarse_real_ts64(&stamp->ctime); + stamp->serial = audit_serial(); } } @@ -1860,8 +1860,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct timespec64 t; - unsigned int serial; + struct audit_stamp stamp; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1916,12 +1915,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_get_stamp(ab->ctx, &t, &serial); + audit_get_stamp(ab->ctx, &stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", - (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); + (unsigned long long)stamp.ctime.tv_sec, + stamp.ctime.tv_nsec/1000000, + stamp.serial); return ab; } diff --git a/kernel/audit.h b/kernel/audit.h index 2a24d01c5fb0e2..0f05933a173be0 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -99,6 +99,12 @@ struct audit_proctitle { char *value; /* the cmdline field */ }; +/* A timestamp/serial pair to identify an event */ +struct audit_stamp { + struct timespec64 ctime; /* time of syscall entry */ + unsigned int serial; /* serial number for record */ +}; + /* The per-task audit context. 
*/ struct audit_context { int dummy; /* must be the first element */ @@ -108,10 +114,9 @@ struct audit_context { AUDIT_CTX_URING, /* in use by io_uring */ } context; enum audit_state state, current_state; - unsigned int serial; /* serial number for record */ + struct audit_stamp stamp; /* event identifier */ int major; /* syscall number */ int uring_op; /* uring operation */ - struct timespec64 ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ long return_code;/* syscall return code */ u64 prio; @@ -263,7 +268,7 @@ extern void audit_put_tty(struct tty_struct *tty); extern unsigned int audit_serial(void); #ifdef CONFIG_AUDITSYSCALL extern int auditsc_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial); + struct audit_stamp *stamp); extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); @@ -304,7 +309,7 @@ extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); #else /* CONFIG_AUDITSYSCALL */ -#define auditsc_get_stamp(c, t, s) 0 +#define auditsc_get_stamp(c, s) 0 #define audit_put_watch(w) do { } while (0) #define audit_get_watch(w) do { } while (0) #define audit_to_watch(k, p, l, o) (-EINVAL) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 497bda0043fb0b..8ec768e2c1e5eb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -994,10 +994,10 @@ static void audit_reset_context(struct audit_context *ctx) */ ctx->current_state = ctx->state; - ctx->serial = 0; + ctx->stamp.serial = 0; + ctx->stamp.ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; ctx->major = 0; ctx->uring_op = 0; - ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; memset(ctx->argv, 0, sizeof(ctx->argv)); ctx->return_code = 0; ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0); @@ -1918,7 +1918,7 @@ void __audit_uring_entry(u8 op) ctx->context = AUDIT_CTX_URING; ctx->current_state = ctx->state; - ktime_get_coarse_real_ts64(&ctx->ctime); + ktime_get_coarse_real_ts64(&ctx->stamp.ctime); } /** @@ -2040,7 +2040,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->argv[3] = a4; context->context = AUDIT_CTX_SYSCALL; context->current_state = state; - ktime_get_coarse_real_ts64(&context->ctime); + ktime_get_coarse_real_ts64(&context->stamp.ctime); } /** @@ -2509,21 +2509,17 @@ EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task - * @t: timespec64 to store time recorded in the audit_context - * @serial: serial value that is recorded in the audit_context + * @stamp: timestamp to record * * Also sets the context as auditable. 
*/ -int auditsc_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial) +int auditsc_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp) { if (ctx->context == AUDIT_CTX_UNUSED) return 0; - if (!ctx->serial) - ctx->serial = audit_serial(); - t->tv_sec = ctx->ctime.tv_sec; - t->tv_nsec = ctx->ctime.tv_nsec; - *serial = ctx->serial; + if (!ctx->stamp.serial) + ctx->stamp.serial = audit_serial(); + *stamp = ctx->stamp; if (!ctx->prio) { ctx->prio = 1; ctx->current_state = AUDIT_STATE_RECORD; From a59076f2669ec23a122549e1f4114e8d4255b632 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:57 -0700 Subject: [PATCH 0614/3206] lsm: security_lsmblob_to_secctx module selection Add a parameter lsmid to security_lsmblob_to_secctx() to identify which of the security modules that may be active should provide the security context. If the value of lsmid is LSM_ID_UNDEF the first LSM providing a hook is used. security_secid_to_secctx() is unchanged, and will always report the first LSM providing a hook. Signed-off-by: Casey Schaufler [PM: subj tweak] Signed-off-by: Paul Moore --- include/linux/security.h | 6 ++++-- kernel/audit.c | 4 ++-- kernel/auditsc.c | 8 +++++--- net/netlabel/netlabel_user.c | 3 ++- security/security.c | 18 ++++++++++++++++-- 5 files changed, 29 insertions(+), 10 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b97170c..6d1ed6e7387bc5 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -567,7 +567,8 @@ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, struct lsm_context *cp); -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp); +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); @@ -1551,7 +1552,8 @@ static inline int security_secid_to_secctx(u32 secid, struct lsm_context *cp) } static inline int security_lsmprop_to_secctx(struct lsm_prop *prop, - struct lsm_context *cp) + struct lsm_context *cp, + int lsmid) { return -EOPNOTSUPP; } diff --git a/kernel/audit.c b/kernel/audit.c index 547967cb4266b9..226c8ae00d04ea 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1473,7 +1473,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, case AUDIT_SIGNAL_INFO: if (lsmprop_is_set(&audit_sig_lsm)) { err = security_lsmprop_to_secctx(&audit_sig_lsm, - &lsmctx); + &lsmctx, LSM_ID_UNDEF); if (err < 0) return err; } @@ -2188,7 +2188,7 @@ int audit_log_task_context(struct audit_buffer *ab) if (!lsmprop_is_set(&prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx); + error = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF); if (error < 0) { if (error != -EINVAL) goto error_path; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8ec768e2c1e5eb..3b606fd4ae8e87 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1109,7 +1109,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx) < 0) { + if 
(security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF) < 0) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { @@ -1395,7 +1395,8 @@ static void show_special(struct audit_context *context, int *call_panic) struct lsm_context lsmctx; if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx) < 0) { + &lsmctx, + LSM_ID_UNDEF) < 0) { *call_panic = 1; } else { audit_log_format(ab, " obj=%s", lsmctx.context); @@ -1560,7 +1561,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, if (lsmprop_is_set(&n->oprop)) { struct lsm_context ctx; - if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) { + if (security_lsmprop_to_secctx(&n->oprop, &ctx, + LSM_ID_UNDEF) < 0) { if (call_panic) *call_panic = 2; } else { diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 0d04d23aafe7fc..6d6545297ee30b 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -98,7 +98,8 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_info->sessionid); if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &ctx) > 0) { + security_lsmprop_to_secctx(&audit_info->prop, &ctx, + LSM_ID_UNDEF) > 0) { audit_log_format(audit_buf, " subj=%s", ctx.context); security_release_secctx(&ctx); } diff --git a/security/security.c b/security/security.c index ad163f06bf7abc..dd588f548a2b0c 100644 --- a/security/security.c +++ b/security/security.c @@ -4342,17 +4342,31 @@ EXPORT_SYMBOL(security_secid_to_secctx); * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx * @prop: lsm specific information * @cp: the LSM context + * @lsmid: which security module to report * * Convert a @prop entry to security context. If @cp is NULL the * length of the result will be returned. This does mean that the * length could change between calls to check the length and the * next call which actually allocates and returns the @cp. * + * @lsmid identifies which LSM should supply the context. + * A value of LSM_ID_UNDEF indicates that the first LSM supplying + * the hook should be used. This is used in cases where the + * ID of the supplying LSM is unambiguous. + * * Return: Return length of data on success, error on failure. */ -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp) +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid) { - return call_int_hook(lsmprop_to_secctx, prop, cp); + struct lsm_static_call *scall; + + lsm_for_each_hook(scall, lsmprop_to_secctx) { + if (lsmid != LSM_ID_UNDEF && lsmid != scall->hl->lsmid->id) + continue; + return scall->hl->hook.lsmprop_to_secctx(prop, cp); + } + return LSM_RET_DEFAULT(lsmprop_to_secctx); } EXPORT_SYMBOL(security_lsmprop_to_secctx); From eb59d494eebd4c5414728a35cdea6a0ba78ff26e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:58 -0700 Subject: [PATCH 0615/3206] audit: add record for multiple task security contexts Replace the single skb pointer in an audit_buffer with a list of skb pointers. Add the audit_stamp information to the audit_buffer as there's no guarantee that there will be an audit_context containing the stamp associated with the event. At audit_log_end() time, create the auxiliary records that have been added to the list. Functions are created to manage the skb list in the audit_buffer. Create a new audit record AUDIT_MAC_TASK_CONTEXTS.
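For illustration, a record producer drives these helpers roughly as follows (a minimal sketch: the record type and the pid field are placeholders, and error handling is trimmed):

```c
struct audit_buffer *ab;

ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_USER);
if (ab) {
	audit_log_format(ab, "pid=%d", task_pid_nr(current));
	/*
	 * Emits "subj=<ctx>" directly when at most one LSM supplies a
	 * subject context, or "subj=?" plus a MAC_TASK_CONTEXTS aux
	 * record when several do.
	 */
	audit_log_task_context(ab);
	audit_log_end(ab);	/* queues the main record and any aux records */
}
```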
An example of the MAC_TASK_CONTEXTS record is: type=MAC_TASK_CONTEXTS msg=audit(1600880931.832:113) subj_apparmor=unconfined subj_smack=_ When an audit event includes an AUDIT_MAC_TASK_CONTEXTS record, the "subj=" field in other records in the event will be "subj=?". An AUDIT_MAC_TASK_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on a subject security context. Refactor audit_log_task_context(), creating a new audit_log_subj_ctx(). This is used in netlabel auditing to provide multiple subject security contexts as necessary. Suggested-by: Paul Moore Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 16 +++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 208 +++++++++++++++++++++++++++++------ net/netlabel/netlabel_user.c | 9 +- security/apparmor/lsm.c | 3 + security/selinux/hooks.c | 3 + security/smack/smack_lsm.c | 3 + 7 files changed, 202 insertions(+), 41 deletions(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index e3f06eba9c6e6e..a1f068bcb3a0cf 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -37,6 +37,8 @@ struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; +struct lsm_id; +struct lsm_prop; struct audit_krule { u32 pflags; @@ -147,6 +149,9 @@ extern unsigned compat_signal_class[]; #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) +/* bit values for audit_cfg_lsm */ +#define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) + struct filename; #define AUDIT_OFF 0 @@ -185,6 +190,7 @@ extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); +extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -210,6 +216,8 @@ extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); + #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, @@ -245,6 +253,11 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } +static inline int audit_log_subj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; @@ -269,6 +282,9 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } +static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ } + #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9a4ecc9f6dc5b1..8cad2f3077197c 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -148,6 +148,7 @@ #define AUDIT_IPE_POLICY_LOAD 1422 /* IPE policy load */ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ +#define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index 226c8ae00d04ea..c924b30f25243e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,7 @@ #include #include #include +#include #include
#include #include @@ -81,6 +82,11 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; +/* Number of modules that provide a security context. + List of lsms that provide a security context */ +static u32 audit_subj_secctx_cnt; +static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; + /** * struct audit_net - audit private network namespace data * @sk: communication socket @@ -195,8 +201,10 @@ static struct audit_ctl_mutex { * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { - struct sk_buff *skb; /* formatted skb ready to send */ + struct sk_buff *skb; /* the skb for audit_log functions */ + struct sk_buff_head skb_list; /* formatted skbs, ready to send */ struct audit_context *ctx; /* NULL or associated context */ + struct audit_stamp stamp; /* audit stamp for these records */ gfp_t gfp_mask; }; @@ -278,6 +286,27 @@ static pid_t auditd_pid_vnr(void) return pid; } +/** + * audit_cfg_lsm - Identify a security module as providing a secctx. + * @lsmid: LSM identity + * @flags: which contexts are provided + * + * Description: + * Increments the count of the security modules providing a secctx. + * If the LSM id is already in the list leave it alone. + */ +void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ + int i; + + if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) { + for (i = 0 ; i < audit_subj_secctx_cnt; i++) + if (audit_subj_lsms[i] == lsmid) + return; + audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; + } +} + /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace @@ -1776,10 +1805,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { + struct sk_buff *skb; + if (!ab) return; - kfree_skb(ab->skb); + while ((skb = skb_dequeue(&ab->skb_list))) + kfree_skb(skb); kmem_cache_free(audit_buffer_cache, ab); } @@ -1795,6 +1827,10 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + + skb_queue_head_init(&ab->skb_list); + skb_queue_tail(&ab->skb_list, ab->skb); + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; @@ -1860,7 +1896,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct audit_stamp stamp; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1915,14 +1950,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_get_stamp(ab->ctx, &stamp); + audit_get_stamp(ab->ctx, &ab->stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", - (unsigned long long)stamp.ctime.tv_sec, - stamp.ctime.tv_nsec/1000000, - stamp.serial); + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); return ab; } @@ -2178,31 +2213,128 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -int audit_log_task_context(struct audit_buffer *ab) +/** + * audit_buffer_aux_new - Add an aux record buffer to the skb list + * @ab: audit_buffer + * @type: message type + * + * Aux records are allocated and added to the skb list of + * the "main" record. The ab->skb is reset to point to the + * aux record on its creation. 
When the aux record is complete + ab->skb has to be reset to point to the "main" record. + This allows the audit_log_ functions to be ignorant of + which kind of record it is logging to. It also avoids adding + special data for aux records. + * + * On success ab->skb will point to the new aux record. + * Returns 0 on success, -ENOMEM should allocation fail. + */ +static int audit_buffer_aux_new(struct audit_buffer *ab, int type) +{ + WARN_ON(ab->skb != skb_peek(&ab->skb_list)); + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask); + if (!ab->skb) + goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; + skb_queue_tail(&ab->skb_list, ab->skb); + + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); + + return 0; + +err: + kfree_skb(ab->skb); + ab->skb = skb_peek(&ab->skb_list); + return -ENOMEM; +} + +/** + * audit_buffer_aux_end - Switch back to the "main" record from an aux record + * @ab: audit_buffer + * + * Restores the "main" audit record to ab->skb. + */ +static void audit_buffer_aux_end(struct audit_buffer *ab) +{ + ab->skb = skb_peek(&ab->skb_list); +} + +/** + * audit_log_subj_ctx - Add LSM subject information + * @ab: audit_buffer + * @prop: LSM subject properties. + * + * Add a subj= field and, if necessary, an AUDIT_MAC_TASK_CONTEXTS record. + */ +int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { - struct lsm_prop prop; struct lsm_context ctx; + char *space = ""; int error; + int i; - security_current_getlsmprop_subj(&prop); - if (!lsmprop_is_set(&prop)) + security_current_getlsmprop_subj(prop); + if (!lsmprop_is_set(prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF); - if (error < 0) { - if (error != -EINVAL) - goto error_path; + if (audit_subj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return 0; + } + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; } - - audit_log_format(ab, " subj=%s", ctx.context); - security_release_secctx(&ctx); + /* Multiple LSMs provide contexts. Include an aux record. */ + audit_log_format(ab, " subj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_subj_secctx_cnt; i++) { + error = security_lsmprop_to_secctx(prop, &ctx, + audit_subj_lsms[i]->id); + if (error < 0) { + /* + * Don't print anything. An LSM like BPF could + * claim to support contexts, but only do so under + * certain conditions.
+ */ + if (error == -EOPNOTSUPP) + continue; + if (error != -EINVAL) + audit_panic("error in audit_log_subj_ctx"); + } else { + audit_log_format(ab, "%ssubj_%s=%s", space, + audit_subj_lsms[i]->name, ctx.context); + space = " "; + security_release_secctx(&ctx); + } + } + audit_buffer_aux_end(ab); return 0; error_path: - audit_panic("error in audit_log_task_context"); + audit_panic("error in audit_log_subj_ctx"); return error; } +EXPORT_SYMBOL(audit_log_subj_ctx); + +int audit_log_task_context(struct audit_buffer *ab) +{ + struct lsm_prop prop; + + security_current_getlsmprop_subj(&prop); + return audit_log_subj_ctx(ab, &prop); +} EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, @@ -2411,6 +2543,26 @@ int audit_signal_info(int sig, struct task_struct *t) return audit_signal_info_syscall(t); } +/** + * __audit_log_end - enqueue one audit record + * @skb: the buffer to send + */ +static void __audit_log_end(struct sk_buff *skb) +{ + struct nlmsghdr *nlh; + + if (audit_rate_check()) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet */ + skb_queue_tail(&audit_queue, skb); + } else + audit_log_lost("rate limit exceeded"); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer @@ -2423,25 +2575,15 @@ int audit_signal_info(int sig, struct task_struct *t) void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; - struct nlmsghdr *nlh; if (!ab) return; - if (audit_rate_check()) { - skb = ab->skb; - ab->skb = NULL; + while ((skb = skb_dequeue(&ab->skb_list))) + __audit_log_end(skb); - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; - - /* queue the netlink packet and poke the kauditd thread */ - skb_queue_tail(&audit_queue, skb); - wake_up_interruptible(&kauditd_wait); - } else - audit_log_lost("rate limit exceeded"); + /* poke the kauditd thread */ + wake_up_interruptible(&kauditd_wait); audit_buffer_free(ab); } diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 6d6545297ee30b..0da652844dd66a 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,7 +84,6 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -96,13 +95,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_log_format(audit_buf, "netlabel: auid=%u ses=%u", from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); - - if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &ctx, - LSM_ID_UNDEF) > 0) { - audit_log_format(audit_buf, " subj=%s", ctx.context); - security_release_secctx(&ctx); - } + audit_log_subj_ctx(audit_buf, &audit_info->prop); return audit_buf; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b39..220d1684b8d4cf 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -2530,6 +2530,9 @@ static int __init apparmor_init(void) security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&apparmor_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + /* Report that AppArmor successfully initialized */ 
apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d40..975b84b466b4f8 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7618,6 +7618,9 @@ static __init int selinux_init(void) /* Set the security state for the initial task. */ cred_init_security(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fc340a6f0ddea8..eaff9b8901a797 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5267,6 +5267,9 @@ static __init int smack_init(void) /* initialize the smack_known_list */ init_smack_known_list(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + return 0; } From 0ffbc876d03c80b83d70aeefac7bbb94a9f4e135 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:59 -0700 Subject: [PATCH 0616/3206] audit: add record for multiple object contexts Create a new audit record AUDIT_MAC_OBJ_CONTEXTS. An example of the MAC_OBJ_CONTEXTS record is: type=MAC_OBJ_CONTEXTS msg=audit(1601152467.009:1050): obj_selinux=unconfined_u:object_r:user_home_t:s0 When an audit event includes an AUDIT_MAC_OBJ_CONTEXTS record, the "obj=" field in other records in the event will be "obj=?". An AUDIT_MAC_OBJ_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on an object security context. Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 7 +++++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 58 +++++++++++++++++++++++++++++++++++++- kernel/auditsc.c | 38 +++++-------------------- security/selinux/hooks.c | 4 ++- security/smack/smack_lsm.c | 4 ++- 6 files changed, 78 insertions(+), 34 deletions(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index a1f068bcb3a0cf..536f8ee8da818c 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -151,6 +151,7 @@ extern unsigned compat_signal_class[]; /* bit values for audit_cfg_lsm */ #define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) +#define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1) struct filename; @@ -191,6 +192,7 @@ extern void audit_log_path_denied(int type, extern void audit_log_lost(const char *message); extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); +extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -258,6 +260,11 @@ static inline int audit_log_subj_ctx(struct audit_buffer *ab, { return 0; } +static inline int audit_log_obj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 8cad2f3077197c..14a1c1fe013ace 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -149,6 +149,7 @@ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ #define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task
contexts */ +#define AUDIT_MAC_OBJ_CONTEXTS 1426 /* Multiple LSM object contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index 226c8ae00d04ea..bd7474fd8d2c23 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -85,7 +85,9 @@ static unsigned int audit_net_id; /* Number of modules that provide a security context. List of lsms that provide a security context */ static u32 audit_subj_secctx_cnt; +static u32 audit_obj_secctx_cnt; static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; +static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT]; /** * struct audit_net - audit private network namespace data @@ -305,6 +307,12 @@ void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) return; audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; } + if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) { + for (i = 0 ; i < audit_obj_secctx_cnt; i++) + if (audit_obj_lsms[i] == lsmid) + return; + audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid; + } } /** @@ -1142,7 +1150,6 @@ static int is_audit_feature_set(int i) return af.features & AUDIT_FEATURE_TO_MASK(i); } - static int audit_get_feature(struct sk_buff *skb) { u32 seq; @@ -2337,6 +2344,55 @@ int audit_log_task_context(struct audit_buffer *ab) } EXPORT_SYMBOL(audit_log_task_context); +int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) +{ + int i; + int rc; + int error = 0; + char *space = ""; + struct lsm_context ctx; + + if (audit_obj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return error; + } + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); + return 0; + } + audit_log_format(ab, " obj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_obj_secctx_cnt; i++) { + rc = security_lsmprop_to_secctx(prop, &ctx, + audit_obj_lsms[i]->id); + if (rc < 0) { + audit_log_format(ab, "%sobj_%s=?", space, + audit_obj_lsms[i]->name); + if (rc != -EINVAL) + audit_panic("error in audit_log_obj_ctx"); + error = rc; + } else { + audit_log_format(ab, "%sobj_%s=%s", space, + audit_obj_lsms[i]->name, ctx.context); + security_release_secctx(&ctx); + } + space = " "; + } + + audit_buffer_aux_end(ab); + return error; + +error_path: + audit_panic("error in audit_log_obj_ctx"); + return error; +} + void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3b606fd4ae8e87..d1966144bdfe70 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF) < 0) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop)) + rc = 1; + audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); @@ -1392,16 +1385,8
@@ static void show_special(struct audit_context *context, int *call_panic) from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { - struct lsm_context lsmctx; - - if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx, - LSM_ID_UNDEF) < 0) { + if (audit_log_obj_ctx(ab, &context->ipc.oprop)) *call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", lsmctx.context); - security_release_secctx(&lsmctx); - } } if (context->ipc.has_perm) { audit_log_end(ab); @@ -1558,18 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - if (lsmprop_is_set(&n->oprop)) { - struct lsm_context ctx; - - if (security_lsmprop_to_secctx(&n->oprop, &ctx, - LSM_ID_UNDEF) < 0) { - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(&n->oprop) && + audit_log_obj_ctx(ab, &n->oprop)) + *call_panic = 2; /* log the audit_names record type */ switch (n->type) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 975b84b466b4f8..3999f58a184299 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7619,7 +7619,9 @@ static __init int selinux_init(void) cred_init_security(); /* Inform the audit system that secctx is used */ - audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + audit_cfg_lsm(&selinux_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index eaff9b8901a797..fdf2f193a2910e 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5268,7 +5268,9 @@ static __init int smack_init(void) init_smack_known_list(); /* Inform the audit system that secctx is used */ - audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + audit_cfg_lsm(&smack_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); return 0; } From 71965cae7db394ff5ba3b2d2befe4e136ceec268 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 29 Aug 2025 03:19:54 +0800 Subject: [PATCH 0617/3206] EDAC: Fix wrong executable file modes for C source files Three EDAC source files were mistakenly marked as executable when adding the EDAC scrub controls. These are plain C source files and should not carry the executable bit. Correcting their modes follows the principle of least privilege and avoids unnecessary execute permissions in the repository. [ bp: Massage commit message. 
] Signed-off-by: Kuan-Wei Chiu Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828191954.903125-1-visitorckw@gmail.com --- drivers/edac/ecs.c | 0 drivers/edac/mem_repair.c | 0 drivers/edac/scrub.c | 0 3 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 drivers/edac/ecs.c mode change 100755 => 100644 drivers/edac/mem_repair.c mode change 100755 => 100644 drivers/edac/scrub.c diff --git a/drivers/edac/ecs.c b/drivers/edac/ecs.c old mode 100755 new mode 100644 diff --git a/drivers/edac/mem_repair.c b/drivers/edac/mem_repair.c old mode 100755 new mode 100644 diff --git a/drivers/edac/scrub.c b/drivers/edac/scrub.c old mode 100755 new mode 100644 From c15a4705d59caeb44f4c373cf04e89041309e568 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:05 +0200 Subject: [PATCH 0618/3206] x86/sev: Use MSR protocol for remapping SVSM calling area As the preceding code comment already indicates, remapping the SVSM calling area occurs long before the GHCB page is configured, and so calling svsm_perform_call_protocol() is guaranteed to result in a call to svsm_perform_msr_protocol(). So just call the latter directly. This allows most of the GHCB based API infrastructure to be moved out of the startup code in a subsequent patch. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250828102202.1849035-26-ardb+git@google.com --- arch/x86/boot/startup/sev-shared.c | 11 +++++++++++ arch/x86/boot/startup/sev-startup.c | 5 ++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index ed88dfe7605e53..975d2b02926a9e 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -724,6 +724,17 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) } } +static int __head svsm_call_msr_protocol(struct svsm_call *call) +{ + int ret; + + do { + ret = svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + return ret; +} + static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) { struct svsm_pvalidate_call *pc; diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 0b7e3b95018389..8412807a865c87 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -295,7 +295,6 @@ static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) static __head void svsm_setup(struct cc_blob_sev_info *cc_info) { struct svsm_call call = {}; - int ret; u64 pa; /* @@ -325,8 +324,8 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) call.caa = svsm_get_caa(); call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); call.rcx = pa; - ret = svsm_perform_call_protocol(&call); - if (ret) + + if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); boot_svsm_caa = (struct svsm_ca *)pa; From 7cb7b6de9cb90311a917d65c0228b6aa223dc456 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:06 +0200 Subject: [PATCH 0619/3206] x86/sev: Use MSR protocol only for early SVSM PVALIDATE call The early page state change API performs an SVSM call to PVALIDATE each page when running under a SVSM, and this involves either a GHCB page based call or a call based on the MSR protocol. 
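For reference, the MSR-protocol path reduces to the retry loop added by the previous patch; a minimal caller sketch (assuming the svsm_call_msr_protocol() helper introduced there, with pc_pa standing for the physical address of an already-prepared PVALIDATE call buffer):

```c
struct svsm_call call = {};

call.caa = svsm_get_caa();
call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE);
call.rcx = pc_pa;	/* PA of the prepared pvalidate call buffer */

/* svsm_call_msr_protocol() retries internally while the SVSM reports -EAGAIN. */
if (svsm_call_msr_protocol(&call))
	sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
```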
The GHCB page based variant involves VA to PA translation of the GHCB address, and this is best avoided in the startup code, where virtual addresses are ambiguous (1:1 or kernel virtual). As this is the last remaining occurrence of svsm_perform_call_protocol() in the startup code, switch to the MSR protocol exclusively in this particular case, so that the GHCB based plumbing can be moved out of the startup code entirely in a subsequent patch. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250828102202.1849035-27-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 20 -------------------- arch/x86/boot/startup/sev-shared.c | 9 ++++++--- 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index fd1b67dfea228f..b71c1ab6a2826e 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -50,31 +50,11 @@ u64 svsm_get_caa_pa(void) return boot_svsm_caa_pa; } -int svsm_perform_call_protocol(struct svsm_call *call); - u8 snp_vmpl; /* Include code for early handlers */ #include "../../boot/startup/sev-shared.c" -int svsm_perform_call_protocol(struct svsm_call *call) -{ - struct ghcb *ghcb; - int ret; - - if (boot_ghcb) - ghcb = boot_ghcb; - else - ghcb = NULL; - - do { - ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); - - return ret; -} - static bool sev_snp_enabled(void) { return sev_status & MSR_AMD64_SEV_SNP_ENABLED; diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 975d2b02926a9e..7bd73462c11eff 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -741,7 +741,6 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) struct svsm_call call = {}; unsigned long flags; u64 pc_pa; - int ret; /* * This can be called very early in the boot, use native functions in @@ -766,8 +765,12 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE); call.rcx = pc_pa; - ret = svsm_perform_call_protocol(&call); - if (ret) + /* + * Use the MSR protocol exclusively, so that this code is usable in + * startup code where VA/PA translations of the GHCB page's address may + * be problematic. + */ + if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); native_local_irq_restore(flags); From e349241b97a8b1169a4e90375159df4d22061f9a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:07 +0200 Subject: [PATCH 0620/3206] x86/sev: Run RMPADJUST on SVSM calling area page to test VMPL Determining the VMPL at which the kernel runs involves performing a RMPADJUST operation on an arbitrary page of memory, and observing whether it succeeds. The use of boot_ghcb_page in the core kernel in this case is completely arbitrary, but results in the need to provide a PIC alias for it. So use boot_svsm_ca_page instead, which already needs this alias for other reasons. 
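Concretely, the probe described above amounts to the following minimal sketch (the helper name is illustrative; rmpadjust() returns 0 when the permission update succeeds):

```c
/*
 * RMPADJUST can only adjust RMP permissions for a less privileged
 * (numerically higher) VMPL, so a successful call on any convenient
 * guest page implies the kernel is running at VMPL0.
 */
static bool running_at_vmpl0(void *page)
{
	return rmpadjust((unsigned long)page, RMP_PG_SIZE_4K, 1) == 0;
}
```

Passing boot_svsm_ca_page as that page keeps the set of PIC-aliased symbols needed by the startup code to a minimum.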
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250828102202.1849035-28-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 2 +- arch/x86/boot/startup/sev-shared.c | 5 +++-- arch/x86/boot/startup/sev-startup.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index b71c1ab6a2826e..3628e9bddc6a13 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -327,7 +327,7 @@ static bool early_snp_init(struct boot_params *bp) * running at VMPL0. The CA will be used to communicate with the * SVSM and request its services. */ - svsm_setup_ca(cc_info); + svsm_setup_ca(cc_info, rip_rel_ptr(&boot_ghcb_page)); /* * Pass run-time kernel a pointer to CC info via boot_params so EFI diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 7bd73462c11eff..83c222a4f1fa9e 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -801,7 +801,8 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. */ -static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) +static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info, + void *page) { struct snp_secrets_page *secrets_page; struct snp_cpuid_table *cpuid_table; @@ -824,7 +825,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) * routine is running identity mapped when called, both by the decompressor * code and the early kernel code. */ - if (!rmpadjust((unsigned long)rip_rel_ptr(&boot_ghcb_page), RMP_PG_SIZE_4K, 1)) + if (!rmpadjust((unsigned long)page, RMP_PG_SIZE_4K, 1)) return false; /* diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 8412807a865c87..3da04a715831b4 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -302,7 +302,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) * running at VMPL0. The CA will be used to communicate with the * SVSM to perform the SVSM services. */ - if (!svsm_setup_ca(cc_info)) + if (!svsm_setup_ca(cc_info, rip_rel_ptr(&boot_svsm_ca_page))) return; /* From b8c3c9f5d0505905e21c03731d1665c67053b47e Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 12:33:18 +0530 Subject: [PATCH 0621/3206] x86/apic: Initialize Secure AVIC APIC backing page With Secure AVIC, the APIC backing page is owned and managed by the guest. Allocate and initialize APIC backing page for all guest CPUs. The NPT entry for a vCPU's APIC backing page must always be present when the vCPU is running in order for Secure AVIC to function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot be resumed otherwise. To handle this, notify GPA of the vCPU's APIC backing page to the hypervisor by using the SVM_VMGEXIT_SECURE_AVIC GHCB protocol event. Before executing VMRUN, the hypervisor makes use of this information to make sure the APIC backing page is mapped in the NPT. [ bp: Massage commit message. 
] Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828070334.208401-3-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 22 ++++++++++++++++++ arch/x86/include/asm/apic.h | 1 + arch/x86/include/asm/sev.h | 2 ++ arch/x86/include/uapi/asm/svm.h | 4 ++++ arch/x86/kernel/apic/apic.c | 3 +++ arch/x86/kernel/apic/x2apic_savic.c | 35 +++++++++++++++++++++++++++++ 6 files changed, 67 insertions(+) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index f7a549f650e9c4..7669aafcad95be 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1108,6 +1108,28 @@ int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd) return 0; } +enum es_result savic_register_gpa(u64 gpa) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA); + ghcb_set_rbx(ghcb, gpa); + res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC, + SVM_VMGEXIT_SAVIC_REGISTER_GPA, 0); + + __sev_put_ghcb(&state); + + return res; +} + static void snp_register_per_cpu_ghcb(void) { struct sev_es_runtime_data *data; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 07ba4935e87320..44b4080721a625 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -305,6 +305,7 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); + void (*setup)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); void (*init_apic_ldr)(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 02236962fdb108..9036122a6d455d 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -533,6 +533,7 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); +enum es_result savic_register_gpa(u64 gpa); static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { @@ -605,6 +606,7 @@ static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } +static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 9c640a521a6700..650e3256ea7d73 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -118,6 +118,10 @@ #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 #define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 +#define SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 +#define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 +#define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ff4029b57e9567..7874284c1ca7fd 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1501,6 +1501,9 @@ static void setup_local_APIC(void) return; } + if (apic->setup) + apic->setup(); + /* * If this comes from kexec/kcrash the APIC might 
be enabled in * SPIV. Soft disable it before doing further initialization. diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index bea844f28192c1..948d89497baafd 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -8,17 +8,47 @@ */ #include +#include #include #include #include "local.h" +struct secure_avic_page { + u8 regs[PAGE_SIZE]; +} __aligned(PAGE_SIZE); + +static struct secure_avic_page __percpu *savic_page __ro_after_init; + static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); } +static void savic_setup(void) +{ + void *ap = this_cpu_ptr(savic_page); + enum es_result res; + unsigned long gpa; + + gpa = __pa(ap); + + /* + * The NPT entry for a vCPU's APIC backing page must always be + * present when the vCPU is running in order for Secure AVIC to + * function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot + * be resumed if the NPT entry for the APIC backing page is not + * present. Notify GPA of the vCPU's APIC backing page to the + * hypervisor by calling savic_register_gpa(). Before executing + * VMRUN, the hypervisor makes use of this information to make sure + * the APIC backing page is mapped in NPT. + */ + res = savic_register_gpa(gpa); + if (res != ES_OK) + snp_abort(); +} + static int savic_probe(void) { if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) @@ -30,6 +60,10 @@ static int savic_probe(void) /* unreachable */ } + savic_page = alloc_percpu(struct secure_avic_page); + if (!savic_page) + snp_abort(); + return 1; } @@ -38,6 +72,7 @@ static struct apic apic_x2apic_savic __ro_after_init = { .name = "secure avic x2apic", .probe = savic_probe, .acpi_madt_oem_check = savic_acpi_madt_oem_check, + .setup = savic_setup, .dest_mode_logical = false, From c822f58a4fab25944ba66768c1d6c563aa6ac077 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:32:40 +0530 Subject: [PATCH 0622/3206] x86/apic: Populate .read()/.write() callbacks of Secure AVIC driver Add read() and write() APIC callback functions to read and write the x2APIC registers directly from the guest APIC backing page of a vCPU. The x2APIC registers are mapped at an offset within the guest APIC backing page which is the same as their x2APIC MMIO offset. Secure AVIC adds new registers such as ALLOWED_IRRs (which are at 4-byte offset within the IRR register offset range) and NMI_REQ to the APIC register space. When Secure AVIC is enabled, accessing the guest's APIC registers through RD/WRMSR results in a #VC exception (for non-accelerated register accesses) with error code VMEXIT_AVIC_NOACCEL. The #VC exception handler can read/write the x2APIC register in the guest APIC backing page to complete the RDMSR/WRMSR. Since doing this would increase the latency of accessing the x2APIC registers, instead of doing RDMSR/WRMSR based register accesses and handling reads/writes in the #VC exception, directly read/write the APIC registers from/to the guest APIC backing page of the vCPU in read() and write() callbacks of the Secure AVIC APIC driver. [ bp: Massage commit message. 
] Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828110255.208779-1-Neeraj.Upadhyay@amd.com --- arch/x86/include/asm/apicdef.h | 2 + arch/x86/kernel/apic/x2apic_savic.c | 122 +++++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 094106b6a5384f..be39a543fbe5d0 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -135,6 +135,8 @@ #define APIC_TDR_DIV_128 0xA #define APIC_EFEAT 0x400 #define APIC_ECTRL 0x410 +#define APIC_SEOI 0x420 +#define APIC_IER 0x480 #define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 948d89497baafd..5479605429c1a8 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -26,6 +27,123 @@ static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); } +#define SAVIC_ALLOWED_IRR 0x204 + +/* + * When Secure AVIC is enabled, RDMSR/WRMSR of the APIC registers + * result in #VC exception (for non-accelerated register accesses) + * with VMEXIT_AVIC_NOACCEL error code. The #VC exception handler + * can read/write the x2APIC register in the guest APIC backing page. + * + * Since doing this would increase the latency of accessing x2APIC + * registers, instead of doing RDMSR/WRMSR based accesses and + * handling the APIC register reads/writes in the #VC exception handler, + * the read() and write() callbacks directly read/write the APIC register + * from/to the vCPU's APIC backing page. + */ +static u32 savic_read(u32 reg) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_TMICT: + case APIC_TMCCT: + case APIC_TDCR: + case APIC_ID: + case APIC_LVR: + case APIC_TASKPRI: + case APIC_ARBPRI: + case APIC_PROCPRI: + case APIC_LDR: + case APIC_SPIV: + case APIC_ESR: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + case APIC_EFEAT: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + return apic_get_reg(ap, reg); + case APIC_ICR: + return (u32)apic_get_reg64(ap, reg); + case APIC_ISR ... APIC_ISR + 0x70: + case APIC_TMR ... APIC_TMR + 0x70: + if (WARN_ONCE(!IS_ALIGNED(reg, 16), + "APIC register read offset 0x%x not aligned at 16 bytes", reg)) + return 0; + return apic_get_reg(ap, reg); + /* IRR and ALLOWED_IRR offset range */ + case APIC_IRR ... APIC_IRR + 0x74: + /* + * Valid APIC_IRR/SAVIC_ALLOWED_IRR registers are at 16 bytes strides from + * their respective base offset. APIC_IRRs are in the range + * + * (0x200, 0x210, ..., 0x270) + * + * while the SAVIC_ALLOWED_IRR range starts 4 bytes later, in the range + * + * (0x204, 0x214, ..., 0x274). + * + * Filter out everything else. 
+ */ + if (WARN_ONCE(!(IS_ALIGNED(reg, 16) || + IS_ALIGNED(reg - 4, 16)), + "Misaligned APIC_IRR/ALLOWED_IRR APIC register read offset 0x%x", reg)) + return 0; + return apic_get_reg(ap, reg); + default: + pr_err("Error reading unknown Secure AVIC reg offset 0x%x\n", reg); + return 0; + } +} + +#define SAVIC_NMI_REQ 0x278 + +static void savic_write(u32 reg, u32 data) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_LVT0: + case APIC_LVT1: + case APIC_TMICT: + case APIC_TDCR: + case APIC_SELF_IPI: + case APIC_TASKPRI: + case APIC_EOI: + case APIC_SPIV: + case SAVIC_NMI_REQ: + case APIC_ESR: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVTERR: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + apic_set_reg(ap, reg, data); + break; + case APIC_ICR: + apic_set_reg64(ap, reg, (u64)data); + break; + /* ALLOWED_IRR offsets are writable */ + case SAVIC_ALLOWED_IRR ... SAVIC_ALLOWED_IRR + 0x70: + if (IS_ALIGNED(reg - 4, 16)) { + apic_set_reg(ap, reg, data); + break; + } + fallthrough; + default: + pr_err("Error writing unknown Secure AVIC reg offset 0x%x\n", reg); + } +} + static void savic_setup(void) { void *ap = this_cpu_ptr(savic_page); @@ -88,8 +206,8 @@ static struct apic apic_x2apic_savic __ro_after_init = { .nmi_to_offline_cpu = true, - .read = native_apic_msr_read, - .write = native_apic_msr_write, + .read = savic_read, + .write = savic_write, .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, From 44fd3e4fc9cdd49c0db170459c3eed9977aae9e7 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 29 Jul 2025 10:01:57 -0600 Subject: [PATCH 0623/3206] MAINTAINERS: update location of hfs&hfsplus trees Update their location in the MAINTAINERS file. Signed-off-by: Yangtao Li Signed-off-by: Viacheslav Dubeyko Link: https://lore.kernel.org/r/20250729160158.2157843-1-frank.li@vivo.com Signed-off-by: Viacheslav Dubeyko --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..f5e1540cb70244 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10791,6 +10791,7 @@ M: John Paul Adrian Glaubitz M: Yangtao Li L: linux-fsdevel@vger.kernel.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git F: Documentation/filesystems/hfs.rst F: fs/hfs/ @@ -10800,6 +10801,7 @@ M: John Paul Adrian Glaubitz M: Yangtao Li L: linux-fsdevel@vger.kernel.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git F: Documentation/filesystems/hfsplus.rst F: fs/hfsplus/ From 9282bc905f0949fab8cf86c0f620ca988761254c Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 5 Aug 2025 10:58:59 -0600 Subject: [PATCH 0624/3206] hfsplus: return EIO when type of hidden directory mismatch in hfsplus_fill_super() If the Catalog File contains a corrupted record for the hidden directory's type, treat it as an I/O error instead of an invalid argument.
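The distinction being drawn, sketched for illustration (not a hunk from this patch): bad input from a caller stays -EINVAL, while on-disk metadata that fails validation is an I/O-level failure:

```c
/* A nonsensical request from the caller is the caller's fault. */
if (len < 0)
	return -EINVAL;

/* A catalog record read from disk that fails validation is an I/O error. */
if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
	return -EIO;
```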
Signed-off-by: Yangtao Li Reviewed-by: Viacheslav Dubeyko Link: https://lore.kernel.org/r/20250805165905.3390154-1-frank.li@vivo.com Signed-off-by: Viacheslav Dubeyko --- fs/hfsplus/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 86351bdc898591..55f42b349a5ee8 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -524,7 +524,7 @@ static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) { - err = -EINVAL; + err = -EIO; goto out_put_root; } inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
From 738d5a51864ed8d7a68600b8c0c63fe6fe5c4f20 Mon Sep 17 00:00:00 2001 From: Yang Chenzhi Date: Mon, 18 Aug 2025 22:17:34 +0800 Subject: [PATCH 0625/3206] hfs: validate record offset in hfsplus_bmap_alloc
hfsplus_bmap_alloc can trigger a crash if a record offset or length is larger than node_size:
[ 15.264282] BUG: KASAN: slab-out-of-bounds in hfsplus_bmap_alloc+0x887/0x8b0 [ 15.265192] Read of size 8 at addr ffff8881085ca188 by task test/183 [ 15.265949] [ 15.266163] CPU: 0 UID: 0 PID: 183 Comm: test Not tainted 6.17.0-rc2-gc17b750b3ad9 #14 PREEMPT(voluntary) [ 15.266165] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 15.266167] Call Trace: [ 15.266168] [ 15.266169] dump_stack_lvl+0x53/0x70 [ 15.266173] print_report+0xd0/0x660 [ 15.266181] kasan_report+0xce/0x100 [ 15.266185] hfsplus_bmap_alloc+0x887/0x8b0 [ 15.266208] hfs_btree_inc_height.isra.0+0xd5/0x7c0 [ 15.266217] hfsplus_brec_insert+0x870/0xb00 [ 15.266222] __hfsplus_ext_write_extent+0x428/0x570 [ 15.266225] __hfsplus_ext_cache_extent+0x5e/0x910 [ 15.266227] hfsplus_ext_read_extent+0x1b2/0x200 [ 15.266233] hfsplus_file_extend+0x5a7/0x1000 [ 15.266237] hfsplus_get_block+0x12b/0x8c0 [ 15.266238] __block_write_begin_int+0x36b/0x12c0 [ 15.266251] block_write_begin+0x77/0x110 [ 15.266252] cont_write_begin+0x428/0x720 [ 15.266259] hfsplus_write_begin+0x51/0x100 [ 15.266262] cont_write_begin+0x272/0x720 [ 15.266270] hfsplus_write_begin+0x51/0x100 [ 15.266274] generic_perform_write+0x321/0x750 [ 15.266285] generic_file_write_iter+0xc3/0x310 [ 15.266289] __kernel_write_iter+0x2fd/0x800 [ 15.266296] dump_user_range+0x2ea/0x910 [ 15.266301] elf_core_dump+0x2a94/0x2ed0 [ 15.266320] vfs_coredump+0x1d85/0x45e0 [ 15.266349] get_signal+0x12e3/0x1990 [ 15.266357] arch_do_signal_or_restart+0x89/0x580 [ 15.266362] irqentry_exit_to_user_mode+0xab/0x110 [ 15.266364] asm_exc_page_fault+0x26/0x30 [ 15.266366] RIP: 0033:0x41bd35 [ 15.266367] Code: bc d1 f3 0f 7f 27 f3 0f 7f 6f 10 f3 0f 7f 77 20 f3 0f 7f 7f 30 49 83 c0 0f 49 29 d0 48 8d 7c 17 31 e9 9f 0b 00 00 66 0f ef c0 0f 6f 0e f3 0f 6f 56 10 66 0f 74 c1 66 0f d7 d0 49 83 f8f [ 15.266369] RSP: 002b:00007ffc9e62d078 EFLAGS: 00010283 [ 15.266371] RAX: 00007ffc9e62d100 RBX: 0000000000000000 RCX: 0000000000000000 [ 15.266372] RDX: 00000000000000e0 RSI: 0000000000000000 RDI: 00007ffc9e62d100 [ 15.266373] RBP: 0000400000000040 R08: 00000000000000e0 R09: 0000000000000000 [ 15.266374] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 15.266375] R13: 0000000000000000 R14: 0000000000000000 R15: 0000400000000000 [ 15.266376]
When calling hfsplus_bmap_alloc to allocate a free node, the function first retrieves the bitmap from the header node and map node using node->page, together with the offset and length from hfs_brec_lenoff:
```
len = hfs_brec_lenoff(node, 2, &off16);
off = off16;
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
data = kmap_local_page(*pagep);
```
However, if the retrieved offset or length is invalid (i.e. exceeds node_size), the code may end up accessing pages outside the allocated range for this node. This patch adds proper validation of both offset and length before use, preventing out-of-bounds page access. Move is_bnode_offset_valid and check_and_correct_requested_length to hfsplus_fs.h, as they may be required by other functions.
Reported-by: syzbot+356aed408415a56543cd@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67bcb4a6.050a0220.bbfd1.008f.GAE@google.com/ Signed-off-by: Yang Chenzhi Reviewed-by: Viacheslav Dubeyko Signed-off-by: Viacheslav Dubeyko Link: https://lore.kernel.org/r/20250818141734.8559-2-yang.chenzhi@vivo.com Signed-off-by: Viacheslav Dubeyko --- fs/hfsplus/bnode.c | 41 ---------------------------------------- fs/hfsplus/btree.c | 6 ++++++ fs/hfsplus/hfsplus_fs.h | 42 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 14f4995588ff03..407d5152eb411e 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -18,47 +18,6 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" -static inline -bool is_bnode_offset_valid(struct hfs_bnode *node, int off) -{ - bool is_valid = off < node->tree->node_size; - - if (!is_valid) { - pr_err("requested invalid offset: " - "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d\n", - node->this, node->type, node->height, - node->tree->node_size, off); - } - - return is_valid; -} - -static inline -int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len) -{ - unsigned int node_size; - - if (!is_bnode_offset_valid(node, off)) - return 0; - - node_size = node->tree->node_size; - - if ((off + len) > node_size) { - int new_len = (int)node_size - off; - - pr_err("requested length has been corrected: " - "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d, " - "requested_len %d, corrected_len %d\n", - node->this, node->type, node->height, - node->tree->node_size, off, len, new_len); - - return new_len; - } - - return len; -} /* Copy a specified range of bytes from the raw data of a node */ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 9e1732a2b92a8c..fe6a54c4083c34 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -393,6 +393,12 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) len = hfs_brec_lenoff(node, 2, &off16); off = off16; + if (!is_bnode_offset_valid(node, off)) { + hfs_bnode_put(node); + return ERR_PTR(-EIO); + } + len = check_and_correct_requested_length(node, off, len); + off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); data = kmap_local_page(*pagep); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 96a5c24813dd6d..49965cd4526124 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -577,6 +577,48 @@ hfsplus_btree_lock_class(struct hfs_btree *tree) return class; } +static inline +bool is_bnode_offset_valid(struct hfs_bnode *node, int off) +{ + bool is_valid = off < node->tree->node_size; + + if (!is_valid) { + pr_err("requested invalid offset: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d\n", + node->this, node->type, node->height, + node->tree->node_size, off); + } + + return is_valid; +} + 
+static inline +int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len) +{ + unsigned int node_size; + + if (!is_bnode_offset_valid(node, off)) + return 0; + + node_size = node->tree->node_size; + + if ((off + len) > node_size) { + int new_len = (int)node_size - off; + + pr_err("requested length has been corrected: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, " + "requested_len %d, corrected_len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len, new_len); + + return new_len; + } + + return len; +} + /* compatibility */ #define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) } #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) From 4840ceadef4290c56cc422f0fc697655f3cbf070 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Mon, 18 Aug 2025 15:52:32 -0700 Subject: [PATCH 0626/3206] hfsplus: fix KMSAN uninit-value issue in __hfsplus_ext_cache_extent() The syzbot reported issue in __hfsplus_ext_cache_extent(): [ 70.194323][ T9350] BUG: KMSAN: uninit-value in __hfsplus_ext_cache_extent+0x7d0/0x990 [ 70.195022][ T9350] __hfsplus_ext_cache_extent+0x7d0/0x990 [ 70.195530][ T9350] hfsplus_file_extend+0x74f/0x1cf0 [ 70.195998][ T9350] hfsplus_get_block+0xe16/0x17b0 [ 70.196458][ T9350] __block_write_begin_int+0x962/0x2ce0 [ 70.196959][ T9350] cont_write_begin+0x1000/0x1950 [ 70.197416][ T9350] hfsplus_write_begin+0x85/0x130 [ 70.197873][ T9350] generic_perform_write+0x3e8/0x1060 [ 70.198374][ T9350] __generic_file_write_iter+0x215/0x460 [ 70.198892][ T9350] generic_file_write_iter+0x109/0x5e0 [ 70.199393][ T9350] vfs_write+0xb0f/0x14e0 [ 70.199771][ T9350] ksys_write+0x23e/0x490 [ 70.200149][ T9350] __x64_sys_write+0x97/0xf0 [ 70.200570][ T9350] x64_sys_call+0x3015/0x3cf0 [ 70.201065][ T9350] do_syscall_64+0xd9/0x1d0 [ 70.201506][ T9350] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 70.202054][ T9350] [ 70.202279][ T9350] Uninit was created at: [ 70.202693][ T9350] __kmalloc_noprof+0x621/0xf80 [ 70.203149][ T9350] hfsplus_find_init+0x8d/0x1d0 [ 70.203602][ T9350] hfsplus_file_extend+0x6ca/0x1cf0 [ 70.204087][ T9350] hfsplus_get_block+0xe16/0x17b0 [ 70.204561][ T9350] __block_write_begin_int+0x962/0x2ce0 [ 70.205074][ T9350] cont_write_begin+0x1000/0x1950 [ 70.205547][ T9350] hfsplus_write_begin+0x85/0x130 [ 70.206017][ T9350] generic_perform_write+0x3e8/0x1060 [ 70.206519][ T9350] __generic_file_write_iter+0x215/0x460 [ 70.207042][ T9350] generic_file_write_iter+0x109/0x5e0 [ 70.207552][ T9350] vfs_write+0xb0f/0x14e0 [ 70.207961][ T9350] ksys_write+0x23e/0x490 [ 70.208375][ T9350] __x64_sys_write+0x97/0xf0 [ 70.208810][ T9350] x64_sys_call+0x3015/0x3cf0 [ 70.209255][ T9350] do_syscall_64+0xd9/0x1d0 [ 70.209680][ T9350] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 70.210230][ T9350] [ 70.210454][ T9350] CPU: 2 UID: 0 PID: 9350 Comm: repro Not tainted 6.12.0-rc5 #5 [ 70.211174][ T9350] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 70.212115][ T9350] ===================================================== [ 70.212734][ T9350] Disabling lock debugging due to kernel taint [ 70.213284][ T9350] Kernel panic - not syncing: kmsan.panic set ... 
[ 70.213858][ T9350] CPU: 2 UID: 0 PID: 9350 Comm: repro Tainted: G B 6.12.0-rc5 #5 [ 70.214679][ T9350] Tainted: [B]=BAD_PAGE [ 70.215057][ T9350] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 70.215999][ T9350] Call Trace: [ 70.216309][ T9350] [ 70.216585][ T9350] dump_stack_lvl+0x1fd/0x2b0 [ 70.217025][ T9350] dump_stack+0x1e/0x30 [ 70.217421][ T9350] panic+0x502/0xca0 [ 70.217803][ T9350] ? kmsan_get_metadata+0x13e/0x1c0 [ 70.218294][ Message fromT sy9350] kmsan_report+0x296/slogd@syzkaller 0x2aat Aug 18 22:11:058 ... kernel :[ 70.213284][ T9350] Kernel panic - not syncing: kmsan.panic [ 70.220179][ T9350] ? kmsan_get_metadata+0x13e/0x1c0 set ... [ 70.221254][ T9350] ? __msan_warning+0x96/0x120 [ 70.222066][ T9350] ? __hfsplus_ext_cache_extent+0x7d0/0x990 [ 70.223023][ T9350] ? hfsplus_file_extend+0x74f/0x1cf0 [ 70.224120][ T9350] ? hfsplus_get_block+0xe16/0x17b0 [ 70.224946][ T9350] ? __block_write_begin_int+0x962/0x2ce0 [ 70.225756][ T9350] ? cont_write_begin+0x1000/0x1950 [ 70.226337][ T9350] ? hfsplus_write_begin+0x85/0x130 [ 70.226852][ T9350] ? generic_perform_write+0x3e8/0x1060 [ 70.227405][ T9350] ? __generic_file_write_iter+0x215/0x460 [ 70.227979][ T9350] ? generic_file_write_iter+0x109/0x5e0 [ 70.228540][ T9350] ? vfs_write+0xb0f/0x14e0 [ 70.228997][ T9350] ? ksys_write+0x23e/0x490 [ 70.229458][ T9350] ? __x64_sys_write+0x97/0xf0 [ 70.229939][ T9350] ? x64_sys_call+0x3015/0x3cf0 [ 70.230432][ T9350] ? do_syscall_64+0xd9/0x1d0 [ 70.230941][ T9350] ? entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 70.231926][ T9350] ? kmsan_get_metadata+0x13e/0x1c0 [ 70.232738][ T9350] ? kmsan_internal_set_shadow_origin+0x77/0x110 [ 70.233711][ T9350] ? kmsan_get_metadata+0x13e/0x1c0 [ 70.234516][ T9350] ? kmsan_get_shadow_origin_ptr+0x4a/0xb0 [ 70.235398][ T9350] ? __msan_metadata_ptr_for_load_4+0x24/0x40 [ 70.236323][ T9350] ? hfsplus_brec_find+0x218/0x9f0 [ 70.237090][ T9350] ? __pfx_hfs_find_rec_by_key+0x10/0x10 [ 70.237938][ T9350] ? __msan_instrument_asm_store+0xbf/0xf0 [ 70.238827][ T9350] ? __msan_metadata_ptr_for_store_4+0x27/0x40 [ 70.239772][ T9350] ? __hfsplus_ext_write_extent+0x536/0x620 [ 70.240666][ T9350] ? kmsan_get_metadata+0x13e/0x1c0 [ 70.241175][ T9350] __msan_warning+0x96/0x120 [ 70.241645][ T9350] __hfsplus_ext_cache_extent+0x7d0/0x990 [ 70.242223][ T9350] hfsplus_file_extend+0x74f/0x1cf0 [ 70.242748][ T9350] hfsplus_get_block+0xe16/0x17b0 [ 70.243255][ T9350] ? kmsan_internal_set_shadow_origin+0x77/0x110 [ 70.243878][ T9350] ? kmsan_get_metadata+0x13e/0x1c0 [ 70.244400][ T9350] ? kmsan_get_shadow_origin_ptr+0x4a/0xb0 [ 70.244967][ T9350] __block_write_begin_int+0x962/0x2ce0 [ 70.245531][ T9350] ? __pfx_hfsplus_get_block+0x10/0x10 [ 70.246079][ T9350] cont_write_begin+0x1000/0x1950 [ 70.246598][ T9350] hfsplus_write_begin+0x85/0x130 [ 70.247105][ T9350] ? __pfx_hfsplus_get_block+0x10/0x10 [ 70.247650][ T9350] ? __pfx_hfsplus_write_begin+0x10/0x10 [ 70.248211][ T9350] generic_perform_write+0x3e8/0x1060 [ 70.248752][ T9350] __generic_file_write_iter+0x215/0x460 [ 70.249314][ T9350] generic_file_write_iter+0x109/0x5e0 [ 70.249856][ T9350] ? kmsan_internal_set_shadow_origin+0x77/0x110 [ 70.250487][ T9350] vfs_write+0xb0f/0x14e0 [ 70.250930][ T9350] ? __pfx_generic_file_write_iter+0x10/0x10 [ 70.251530][ T9350] ksys_write+0x23e/0x490 [ 70.251974][ T9350] __x64_sys_write+0x97/0xf0 [ 70.252450][ T9350] x64_sys_call+0x3015/0x3cf0 [ 70.252924][ T9350] do_syscall_64+0xd9/0x1d0 [ 70.253384][ T9350] ? 
irqentry_exit+0x16/0x60 [ 70.253844][ T9350] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 70.254430][ T9350] RIP: 0033:0x7f7a92adffc9 [ 70.254873][ T9350] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 48 [ 70.256674][ T9350] RSP: 002b:00007fff0bca3188 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [ 70.257485][ T9350] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f7a92adffc9 [ 70.258246][ T9350] RDX: 000000000208e24b RSI: 0000000020000100 RDI: 0000000000000004 [ 70.258998][ T9350] RBP: 00007fff0bca31a0 R08: 00007fff0bca31a0 R09: 00007fff0bca31a0 [ 70.259769][ T9350] R10: 0000000000000000 R11: 0000000000000202 R12: 000055e0d75f8250 [ 70.260520][ T9350] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 70.261286][ T9350] [ 70.262026][ T9350] Kernel Offset: disabled
(gdb) l *__hfsplus_ext_cache_extent+0x7d0
0xffffffff8318aef0 is in __hfsplus_ext_cache_extent (fs/hfsplus/extents.c:168).
163		fd->key->ext.cnid = 0;
164		res = hfs_brec_find(fd, hfs_find_rec_by_key);
165		if (res && res != -ENOENT)
166			return res;
167		if (fd->key->ext.cnid != fd->search_key->ext.cnid ||
168		    fd->key->ext.fork_type != fd->search_key->ext.fork_type)
169			return -ENOENT;
170		if (fd->entrylength != sizeof(hfsplus_extent_rec))
171			return -EIO;
172		hfs_bnode_read(fd->bnode, extent, fd->entryoffset,
__hfsplus_ext_cache_extent() calls __hfsplus_ext_read_extent():
res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, block,
				HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
If inode->i_ino is equal to zero or any unavailable CNID, then hfs_brec_find() cannot find the record in the tree. As a result, fd->key is compared with fd->search_key while still uninitialized. But hfs_find_init() uses kmalloc() for the fd->key and fd->search_key allocation:
int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
{
	ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
	if (!ptr)
		return -ENOMEM;
	fd->search_key = ptr;
	fd->key = ptr + tree->max_key_len + 2;
}
Finally, fd->key is still not initialized if hfs_brec_find() has found nothing. This patch replaces kmalloc() with kzalloc() in hfs_find_init() and initializes fd->record, fd->keyoffset, fd->keylength, fd->entryoffset, and fd->entrylength for the case when hfs_brec_find() finds nothing in the b-tree node.
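As a minimal userspace sketch of why the zeroed allocation matters (calloc() standing in for kzalloc(); all hyp_* names are hypothetical, not the kernel API), the comparison after a failed lookup then reads well-defined zeros instead of heap garbage:

```c
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct hyp_key { unsigned int cnid; unsigned char fork_type; };

struct hyp_find_data {
	struct hyp_key *search_key;
	struct hyp_key *key;	/* written only when a record is found */
	int entrylength;
};

/* Models hfs_brec_find() failing: on -ENOENT, fd->key is never written. */
static int hyp_brec_find(struct hyp_find_data *fd)
{
	fd->entrylength = -1;	/* the fix: reset output fields up front */
	return -ENOENT;		/* pretend nothing matched */
}

int main(void)
{
	struct hyp_find_data fd;

	/* calloc (like kzalloc) guarantees the later comparison reads
	 * zeros rather than leftover heap contents. */
	fd.search_key = calloc(2, sizeof(struct hyp_key));
	if (!fd.search_key)
		return 1;
	fd.key = fd.search_key + 1;

	if (hyp_brec_find(&fd) == -ENOENT &&
	    fd.key->cnid != fd.search_key->cnid)
		puts("mismatch detected deterministically");
	else
		puts("keys equal (both zeroed), caller bails out safely");

	free(fd.search_key);
	return 0;
}
```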
Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=55ad87f38795d6787521 Signed-off-by: Viacheslav Dubeyko cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250818225232.126402-1-slava@dubeyko.com Signed-off-by: Viacheslav Dubeyko --- fs/hfsplus/bfind.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 901e83d65d2021..26ebac4c604242 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -18,7 +18,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->tree = tree; fd->bnode = NULL; - ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; @@ -158,6 +158,12 @@ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) __be32 data; int height, res; + fd->record = -1; + fd->keyoffset = -1; + fd->keylength = -1; + fd->entryoffset = -1; + fd->entrylength = -1; + tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode);
From c62663a986acee7c4485c1fa9de5fc40194b6290 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Mon, 18 Aug 2025 15:52:52 -0700 Subject: [PATCH 0627/3206] hfs: make proper initialization of struct hfs_find_data
Potentially, __hfs_ext_read_extent() could operate on uninitialized values of fd->key after the hfs_brec_find() call:
static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent,
					u32 cnid, u32 block, u8 type)
{
	int res;

	hfs_ext_build_key(fd->search_key, cnid, block, type);
	fd->key->ext.FNum = 0;
	res = hfs_brec_find(fd);
	if (res && res != -ENOENT)
		return res;
	if (fd->key->ext.FNum != fd->search_key->ext.FNum ||
	    fd->key->ext.FkType != fd->search_key->ext.FkType)
		return -ENOENT;
	if (fd->entrylength != sizeof(hfs_extent_rec))
		return -EIO;
	hfs_bnode_read(fd->bnode, extent, fd->entryoffset,
		       sizeof(hfs_extent_rec));
	return 0;
}
This patch replaces kmalloc() with kzalloc() in hfs_find_init() and initializes fd->record, fd->keyoffset, fd->keylength, fd->entryoffset, and fd->entrylength for the case when hfs_brec_find() finds nothing in the b-tree node.
Signed-off-by: Viacheslav Dubeyko cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250818225252.126427-1-slava@dubeyko.com Signed-off-by: Viacheslav Dubeyko --- fs/hfs/bfind.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 34e9804e0f3601..e46f650b5e9c26 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -21,7 +21,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->tree = tree; fd->bnode = NULL; - ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; @@ -115,6 +115,12 @@ int hfs_brec_find(struct hfs_find_data *fd) __be32 data; int height, res; + fd->record = -1; + fd->keyoffset = -1; + fd->keylength = -1; + fd->entryoffset = -1; + fd->entrylength = -1; + tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); From 2048ec5b98dbdfe0b929d2e42dc7a54c389c53dd Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Wed, 20 Aug 2025 16:06:38 -0700 Subject: [PATCH 0628/3206] hfs: fix KMSAN uninit-value issue in hfs_find_set_zero_bits() The syzbot reported issue in hfs_find_set_zero_bits(): ===================================================== BUG: KMSAN: uninit-value in hfs_find_set_zero_bits+0x74d/0xb60 fs/hfs/bitmap.c:45 hfs_find_set_zero_bits+0x74d/0xb60 fs/hfs/bitmap.c:45 hfs_vbm_search_free+0x13c/0x5b0 fs/hfs/bitmap.c:151 hfs_extend_file+0x6a5/0x1b00 fs/hfs/extent.c:408 hfs_get_block+0x435/0x1150 fs/hfs/extent.c:353 __block_write_begin_int+0xa76/0x3030 fs/buffer.c:2151 block_write_begin fs/buffer.c:2262 [inline] cont_write_begin+0x10e1/0x1bc0 fs/buffer.c:2601 hfs_write_begin+0x85/0x130 fs/hfs/inode.c:52 cont_expand_zero fs/buffer.c:2528 [inline] cont_write_begin+0x35a/0x1bc0 fs/buffer.c:2591 hfs_write_begin+0x85/0x130 fs/hfs/inode.c:52 hfs_file_truncate+0x1d6/0xe60 fs/hfs/extent.c:494 hfs_inode_setattr+0x964/0xaa0 fs/hfs/inode.c:654 notify_change+0x1993/0x1aa0 fs/attr.c:552 do_truncate+0x28f/0x310 fs/open.c:68 do_ftruncate+0x698/0x730 fs/open.c:195 do_sys_ftruncate fs/open.c:210 [inline] __do_sys_ftruncate fs/open.c:215 [inline] __se_sys_ftruncate fs/open.c:213 [inline] __x64_sys_ftruncate+0x11b/0x250 fs/open.c:213 x64_sys_call+0xfe3/0x3db0 arch/x86/include/generated/asm/syscalls_64.h:78 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xd9/0x210 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was created at: slab_post_alloc_hook mm/slub.c:4154 [inline] slab_alloc_node mm/slub.c:4197 [inline] __kmalloc_cache_noprof+0x7f7/0xed0 mm/slub.c:4354 kmalloc_noprof include/linux/slab.h:905 [inline] hfs_mdb_get+0x1cc8/0x2a90 fs/hfs/mdb.c:175 hfs_fill_super+0x3d0/0xb80 fs/hfs/super.c:337 get_tree_bdev_flags+0x6e3/0x920 fs/super.c:1681 get_tree_bdev+0x38/0x50 fs/super.c:1704 hfs_get_tree+0x35/0x40 fs/hfs/super.c:388 vfs_get_tree+0xb0/0x5c0 fs/super.c:1804 do_new_mount+0x738/0x1610 fs/namespace.c:3902 path_mount+0x6db/0x1e90 fs/namespace.c:4226 do_mount fs/namespace.c:4239 [inline] __do_sys_mount fs/namespace.c:4450 [inline] __se_sys_mount+0x6eb/0x7d0 fs/namespace.c:4427 __x64_sys_mount+0xe4/0x150 fs/namespace.c:4427 x64_sys_call+0xfa7/0x3db0 arch/x86/include/generated/asm/syscalls_64.h:166 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xd9/0x210 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f CPU: 1 UID: 0 PID: 12609 Comm: syz.1.2692 Not tainted 
6.16.0-syzkaller #0 PREEMPT(none) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 =====================================================
The HFS_SB(sb)->bitmap buffer is allocated in hfs_mdb_get():
HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL);
This can trigger the reported issue because kmalloc() doesn't clear the allocated memory. If the allocated memory contains only zeros, then everything works fine. But if the allocated memory contains "garbage", then it can affect the bitmap operations and trigger the reported issue. This patch simply replaces kmalloc() with kzalloc() to guarantee the correctness of bitmap operations, because a newly created allocation bitmap should have all available blocks free. Potentially, the bitmap initialization read could fail to fill the whole allocated memory, and "garbage" in the uninitialized memory would then be a source of volume corruption and file system driver bugs.
Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=773fa9d79b29bd8b6831 Signed-off-by: Viacheslav Dubeyko cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250820230636.179085-1-slava@dubeyko.com Signed-off-by: Viacheslav Dubeyko --- fs/hfs/mdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 8082eb01127cdf..bf811347bb07d3 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -172,7 +172,7 @@ int hfs_mdb_get(struct super_block *sb) pr_warn("continuing without an alternate MDB\n"); } - HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL); + HFS_SB(sb)->bitmap = kzalloc(8192, GFP_KERNEL); if (!HFS_SB(sb)->bitmap) goto out;
From 9b3d15a758910bb98ba8feb4109d99cc67450ee4 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Mon, 25 Aug 2025 15:51:04 -0700 Subject: [PATCH 0629/3206] hfsplus: fix KMSAN uninit-value issue in hfsplus_delete_cat()
Syzbot reported an issue in hfsplus_delete_cat(): [ 70.682285][ T9333] ===================================================== [ 70.682943][ T9333] BUG: KMSAN: uninit-value in hfsplus_subfolders_dec+0x1d7/0x220 [ 70.683640][ T9333] hfsplus_subfolders_dec+0x1d7/0x220 [ 70.684141][ T9333] hfsplus_delete_cat+0x105d/0x12b0 [ 70.684621][ T9333] hfsplus_rmdir+0x13d/0x310 [ 70.685048][ T9333] vfs_rmdir+0x5ba/0x810 [ 70.685447][ T9333] do_rmdir+0x964/0xea0 [ 70.685833][ T9333] __x64_sys_rmdir+0x71/0xb0 [ 70.686260][ T9333] x64_sys_call+0xcd8/0x3cf0 [ 70.686695][ T9333] do_syscall_64+0xd9/0x1d0 [ 70.687119][ T9333] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 70.687646][ T9333] [ 70.687856][ T9333] Uninit was stored to memory at: [ 70.688311][ T9333] hfsplus_subfolders_inc+0x1c2/0x1d0 [ 70.688779][ T9333] hfsplus_create_cat+0x148e/0x1800 [ 70.689231][ T9333] hfsplus_mknod+0x27f/0x600 [ 70.689730][ T9333] hfsplus_mkdir+0x5a/0x70 [ 70.690146][ T9333] vfs_mkdir+0x483/0x7a0 [ 70.690545][ T9333] do_mkdirat+0x3f2/0xd30 [ 70.690944][ T9333] __x64_sys_mkdir+0x9a/0xf0 [ 70.691380][ T9333] x64_sys_call+0x2f89/0x3cf0 [ 70.691816][ T9333] do_syscall_64+0xd9/0x1d0 [ 70.692229][ T9333] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 70.692773][ T9333] [ 70.692990][ T9333] Uninit was stored to memory at: [ 70.693469][ T9333] hfsplus_subfolders_inc+0x1c2/0x1d0 [ 70.693960][ T9333] hfsplus_create_cat+0x148e/0x1800 [ 70.694438][ T9333] hfsplus_fill_super+0x21c1/0x2700 [ 70.694911][ T9333] mount_bdev+0x37b/0x530 [ 70.695320][ T9333] hfsplus_mount+0x4d/0x60 [ 70.695729][ T9333] 
legacy_get_tree+0x113/0x2c0 [ 70.696167][ T9333] vfs_get_tree+0xb3/0x5c0 [ 70.696588][ T9333] do_new_mount+0x73e/0x1630 [ 70.697013][ T9333] path_mount+0x6e3/0x1eb0 [ 70.697425][ T9333] __se_sys_mount+0x733/0x830 [ 70.697857][ T9333] __x64_sys_mount+0xe4/0x150 [ 70.698269][ T9333] x64_sys_call+0x2691/0x3cf0 [ 70.698704][ T9333] do_syscall_64+0xd9/0x1d0 [ 70.699117][ T9333] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 70.699730][ T9333] [ 70.699946][ T9333] Uninit was created at: [ 70.700378][ T9333] __alloc_pages_noprof+0x714/0xe60 [ 70.700843][ T9333] alloc_pages_mpol_noprof+0x2a2/0x9b0 [ 70.701331][ T9333] alloc_pages_noprof+0xf8/0x1f0 [ 70.701774][ T9333] allocate_slab+0x30e/0x1390 [ 70.702194][ T9333] ___slab_alloc+0x1049/0x33a0 [ 70.702635][ T9333] kmem_cache_alloc_lru_noprof+0x5ce/0xb20 [ 70.703153][ T9333] hfsplus_alloc_inode+0x5a/0xd0 [ 70.703598][ T9333] alloc_inode+0x82/0x490 [ 70.703984][ T9333] iget_locked+0x22e/0x1320 [ 70.704428][ T9333] hfsplus_iget+0x5c/0xba0 [ 70.704827][ T9333] hfsplus_btree_open+0x135/0x1dd0 [ 70.705291][ T9333] hfsplus_fill_super+0x1132/0x2700 [ 70.705776][ T9333] mount_bdev+0x37b/0x530 [ 70.706171][ T9333] hfsplus_mount+0x4d/0x60 [ 70.706579][ T9333] legacy_get_tree+0x113/0x2c0 [ 70.707019][ T9333] vfs_get_tree+0xb3/0x5c0 [ 70.707444][ T9333] do_new_mount+0x73e/0x1630 [ 70.707865][ T9333] path_mount+0x6e3/0x1eb0 [ 70.708270][ T9333] __se_sys_mount+0x733/0x830 [ 70.708711][ T9333] __x64_sys_mount+0xe4/0x150 [ 70.709158][ T9333] x64_sys_call+0x2691/0x3cf0 [ 70.709630][ T9333] do_syscall_64+0xd9/0x1d0 [ 70.710053][ T9333] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 70.710611][ T9333] [ 70.710842][ T9333] CPU: 3 UID: 0 PID: 9333 Comm: repro Not tainted 6.12.0-rc6-dirty #17 [ 70.711568][ T9333] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 70.712490][ T9333] ===================================================== [ 70.713085][ T9333] Disabling lock debugging due to kernel taint [ 70.713618][ T9333] Kernel panic - not syncing: kmsan.panic set ... [ 70.714159][ T9333] CPU: 3 UID: 0 PID: 9333 Comm: repro Tainted: G B 6.12.0-rc6-dirty #17 [ 70.715007][ T9333] Tainted: [B]=BAD_PAGE [ 70.715365][ T9333] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 70.716311][ T9333] Call Trace: [ 70.716621][ T9333] [ 70.716899][ T9333] dump_stack_lvl+0x1fd/0x2b0 [ 70.717350][ T9333] dump_stack+0x1e/0x30 [ 70.717743][ T9333] panic+0x502/0xca0 [ 70.718116][ T9333] ? kmsan_get_metadata+0x13e/0x1c0 [ 70.718611][ T9333] kmsan_report+0x296/0x2a0 [ 70.719038][ T9333] ? __msan_metadata_ptr_for_load_4+0x24/0x40 [ 70.719859][ T9333] ? __msan_warning+0x96/0x120 [ 70.720345][ T9333] ? hfsplus_subfolders_dec+0x1d7/0x220 [ 70.720881][ T9333] ? hfsplus_delete_cat+0x105d/0x12b0 [ 70.721412][ T9333] ? hfsplus_rmdir+0x13d/0x310 [ 70.721880][ T9333] ? vfs_rmdir+0x5ba/0x810 [ 70.722458][ T9333] ? do_rmdir+0x964/0xea0 [ 70.722883][ T9333] ? __x64_sys_rmdir+0x71/0xb0 [ 70.723397][ T9333] ? x64_sys_call+0xcd8/0x3cf0 [ 70.723915][ T9333] ? do_syscall_64+0xd9/0x1d0 [ 70.724454][ T9333] ? entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 70.725110][ T9333] ? vprintk_emit+0xd1f/0xe60 [ 70.725616][ T9333] ? vprintk_default+0x3f/0x50 [ 70.726175][ T9333] ? vprintk+0xce/0xd0 [ 70.726628][ T9333] ? _printk+0x17e/0x1b0 [ 70.727129][ T9333] ? __msan_metadata_ptr_for_load_4+0x24/0x40 [ 70.727739][ T9333] ? 
kmsan_get_metadata+0x13e/0x1c0 [ 70.728324][ T9333] __msan_warning+0x96/0x120 [ 70.728854][ T9333] hfsplus_subfolders_dec+0x1d7/0x220 [ 70.729479][ T9333] hfsplus_delete_cat+0x105d/0x12b0 [ 70.729984][ T9333] ? kmsan_get_shadow_origin_ptr+0x4a/0xb0 [ 70.730646][ T9333] ? __msan_metadata_ptr_for_load_4+0x24/0x40 [ 70.731296][ T9333] ? kmsan_get_metadata+0x13e/0x1c0 [ 70.731863][ T9333] hfsplus_rmdir+0x13d/0x310 [ 70.732390][ T9333] ? __pfx_hfsplus_rmdir+0x10/0x10 [ 70.732919][ T9333] vfs_rmdir+0x5ba/0x810 [ 70.733416][ T9333] ? kmsan_get_shadow_origin_ptr+0x4a/0xb0 [ 70.734044][ T9333] do_rmdir+0x964/0xea0 [ 70.734537][ T9333] __x64_sys_rmdir+0x71/0xb0 [ 70.735032][ T9333] x64_sys_call+0xcd8/0x3cf0 [ 70.735579][ T9333] do_syscall_64+0xd9/0x1d0 [ 70.736092][ T9333] ? irqentry_exit+0x16/0x60 [ 70.736637][ T9333] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 70.737269][ T9333] RIP: 0033:0x7fa9424eafc9 [ 70.737775][ T9333] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 48 [ 70.739844][ T9333] RSP: 002b:00007fff099cd8d8 EFLAGS: 00000202 ORIG_RAX: 0000000000000054 [ 70.740760][ T9333] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fa9424eafc9 [ 70.741642][ T9333] RDX: 006c6f72746e6f63 RSI: 000000000000000a RDI: 0000000020000100 [ 70.742543][ T9333] RBP: 00007fff099cd8e0 R08: 00007fff099cd910 R09: 00007fff099cd910 [ 70.743376][ T9333] R10: 0000000000000000 R11: 0000000000000202 R12: 0000565430642260 [ 70.744247][ T9333] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 70.745082][ T9333]
The main reason for the issue is that struct hfsplus_inode_info has not been properly initialized for the case of the root folder. In the case of the root folder, hfsplus_fill_super() calls hfsplus_iget(), which implements only partial initialization of struct hfsplus_inode_info, and the subfolders field is not initialized by the hfsplus_iget() logic. This patch implements complete initialization of struct hfsplus_inode_info in the hfsplus_iget() logic in order to prevent similar issues for the case of the root folder.
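A minimal userspace sketch of the initialization pattern the patch adopts; the hyp_* struct is a trimmed, hypothetical stand-in for hfsplus_inode_info, kept just to show that every field, including the counter, is set before the object is used:

```c
#include <stdio.h>
#include <stdlib.h>

/* Cut-down stand-in for hfsplus_inode_info; only the shape matters. */
struct hyp_inode_info {
	unsigned int flags;
	unsigned int extent_state;
	long subfolders;	/* the field the original bug left behind */
};

/* Full, field-by-field initialization in the iget-style helper, so the
 * object is well-defined before anyone reads it. */
static void hyp_iget_init(struct hyp_inode_info *hip)
{
	hip->flags = 0;
	hip->extent_state = 0;
	hip->subfolders = 0;	/* previously skipped -> uninit-value */
}

int main(void)
{
	struct hyp_inode_info *hip = malloc(sizeof(*hip));

	if (!hip)
		return 1;
	hyp_iget_init(hip);
	/* a later "rmdir" path can now safely decrement the counter */
	hip->subfolders--;
	printf("subfolders = %ld\n", hip->subfolders);	/* -1, not garbage */
	free(hip);
	return 0;
}
```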
Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=fdedff847a0e5e84c39f Signed-off-by: Viacheslav Dubeyko cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250825225103.326401-1-slava@dubeyko.com Signed-off-by: Viacheslav Dubeyko --- fs/hfsplus/super.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 55f42b349a5ee8..77ec048021a01c 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -68,13 +68,26 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) if (!(inode->i_state & I_NEW)) return inode; - INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); - spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock); - mutex_init(&HFSPLUS_I(inode)->extents_lock); - HFSPLUS_I(inode)->flags = 0; + atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + HFSPLUS_I(inode)->first_blocks = 0; + HFSPLUS_I(inode)->clump_blocks = 0; + HFSPLUS_I(inode)->alloc_blocks = 0; + HFSPLUS_I(inode)->cached_start = U32_MAX; + HFSPLUS_I(inode)->cached_blocks = 0; + memset(HFSPLUS_I(inode)->first_extents, 0, sizeof(hfsplus_extent_rec)); + memset(HFSPLUS_I(inode)->cached_extents, 0, sizeof(hfsplus_extent_rec)); HFSPLUS_I(inode)->extent_state = 0; + mutex_init(&HFSPLUS_I(inode)->extents_lock); HFSPLUS_I(inode)->rsrc_inode = NULL; - atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + HFSPLUS_I(inode)->create_date = 0; + HFSPLUS_I(inode)->linkid = 0; + HFSPLUS_I(inode)->flags = 0; + HFSPLUS_I(inode)->fs_blocks = 0; + HFSPLUS_I(inode)->userflags = 0; + HFSPLUS_I(inode)->subfolders = 0; + INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); + spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock); + HFSPLUS_I(inode)->phys_size = 0; if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) { From a06ec283e125e334155fe13005c76c9f484ce759 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Tue, 10 Jun 2025 16:16:09 -0700 Subject: [PATCH 0630/3206] hfs: add logic of correcting a next unused CNID The generic/736 xfstest fails for HFS case: BEGIN TEST default (1 test): hfs Mon May 5 03:18:32 UTC 2025 DEVICE: /dev/vdb HFS_MKFS_OPTIONS: MOUNT_OPTIONS: MOUNT_OPTIONS FSTYP -- hfs PLATFORM -- Linux/x86_64 kvm-xfstests 6.15.0-rc4-xfstests-g00b827f0cffa #1 SMP PREEMPT_DYNAMIC Fri May 25 MKFS_OPTIONS -- /dev/vdc MOUNT_OPTIONS -- /dev/vdc /vdc generic/736 [03:18:33][ 3.510255] run fstests generic/736 at 2025-05-05 03:18:33 _check_generic_filesystem: filesystem on /dev/vdb is inconsistent (see /results/hfs/results-default/generic/736.full for details) Ran: generic/736 Failures: generic/736 Failed 1 of 1 tests The HFS volume becomes corrupted after the test run: sudo fsck.hfs -d /dev/loop50 ** /dev/loop50 Using cacheBlockSize=32K cacheTotalBlock=1024 cacheSize=32768K. Executing fsck_hfs (version 540.1-Linux). ** Checking HFS volume. The volume name is untitled ** Checking extents overflow file. ** Checking catalog file. ** Checking catalog hierarchy. ** Checking volume bitmap. ** Checking volume information. invalid MDB drNxtCNID Master Directory Block needs minor repair (1, 0) Verify Status: VIStat = 0x8000, ABTStat = 0x0000 EBTStat = 0x0000 CBTStat = 0x0000 CatStat = 0x00000000 ** Repairing volume. ** Rechecking volume. ** Checking HFS volume. The volume name is untitled ** Checking extents overflow file. ** Checking catalog file. ** Checking catalog hierarchy. ** Checking volume bitmap. ** Checking volume information. 
** The volume untitled was repaired successfully. The main reason of the issue is the absence of logic that corrects mdb->drNxtCNID/HFS_SB(sb)->next_id (next unused CNID) after deleting a record in Catalog File. This patch introduces a hfs_correct_next_unused_CNID() method that implements the necessary logic. In the case of Catalog File's record delete operation, the function logic checks that (deleted_CNID + 1) == next_unused_CNID and it finds/sets the new value of next_unused_CNID. sudo ./check generic/736 FSTYP -- hfs PLATFORM -- Linux/x86_64 hfsplus-testing-0001 6.15.0+ #6 SMP PREEMPT_DYNAMIC Tue Jun 10 15:02:48 PDT 2025 MKFS_OPTIONS -- /dev/loop51 MOUNT_OPTIONS -- /dev/loop51 /mnt/scratch generic/736 33s Ran: generic/736 Passed all 1 tests sudo fsck.hfs -d /dev/loop50 ** /dev/loop50 Using cacheBlockSize=32K cacheTotalBlock=1024 cacheSize=32768K. Executing fsck_hfs (version 540.1-Linux). ** Checking HFS volume. The volume name is untitled ** Checking extents overflow file. ** Checking catalog file. ** Checking catalog hierarchy. ** Checking volume bitmap. ** Checking volume information. ** The volume untitled appears to be OK Signed-off-by: Viacheslav Dubeyko Link: https://lore.kernel.org/r/20250610231609.551930-1-slava@dubeyko.com Signed-off-by: Viacheslav Dubeyko --- fs/hfs/catalog.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++ fs/hfs/hfs_fs.h | 6 +-- fs/hfs/inode.c | 21 ++++++-- fs/hfs/mdb.c | 18 ++++--- fs/hfs/super.c | 4 ++ 5 files changed, 158 insertions(+), 14 deletions(-) diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index d63880e7d9d672..b6b18ee6813510 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -211,6 +211,124 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid, return hfs_brec_find(fd); } +static inline +void hfs_set_next_unused_CNID(struct super_block *sb, + u32 deleted_cnid, u32 found_cnid) +{ + if (found_cnid < HFS_FIRSTUSER_CNID) { + atomic64_cmpxchg(&HFS_SB(sb)->next_id, + deleted_cnid + 1, HFS_FIRSTUSER_CNID); + } else { + atomic64_cmpxchg(&HFS_SB(sb)->next_id, + deleted_cnid + 1, found_cnid + 1); + } +} + +/* + * hfs_correct_next_unused_CNID() + * + * Correct the next unused CNID of Catalog Tree. 
+ */ +static +int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) +{ + struct hfs_btree *cat_tree; + struct hfs_bnode *node; + s64 leaf_head; + s64 leaf_tail; + s64 node_id; + + hfs_dbg(CAT_MOD, "correct next unused CNID: cnid %u, next_id %lld\n", + cnid, atomic64_read(&HFS_SB(sb)->next_id)); + + if ((cnid + 1) < atomic64_read(&HFS_SB(sb)->next_id)) { + /* next ID should be unchanged */ + return 0; + } + + cat_tree = HFS_SB(sb)->cat_tree; + leaf_head = cat_tree->leaf_head; + leaf_tail = cat_tree->leaf_tail; + + if (leaf_head > leaf_tail) { + pr_err("node is corrupted: leaf_head %lld, leaf_tail %lld\n", + leaf_head, leaf_tail); + return -ERANGE; + } + + node = hfs_bnode_find(cat_tree, leaf_tail); + if (IS_ERR(node)) { + pr_err("fail to find leaf node: node ID %lld\n", + leaf_tail); + return -ENOENT; + } + + node_id = leaf_tail; + + do { + int i; + + if (node_id != leaf_tail) { + node = hfs_bnode_find(cat_tree, node_id); + if (IS_ERR(node)) + return -ENOENT; + } + + hfs_dbg(CAT_MOD, "node_id %lld, leaf_tail %lld, leaf_head %lld\n", + node_id, leaf_tail, leaf_head); + + hfs_bnode_dump(node); + + for (i = node->num_recs - 1; i >= 0; i--) { + hfs_cat_rec rec; + u16 off, len, keylen; + int entryoffset; + int entrylength; + u32 found_cnid; + + len = hfs_brec_lenoff(node, i, &off); + keylen = hfs_brec_keylen(node, i); + if (keylen == 0) { + pr_err("fail to get the keylen: " + "node_id %lld, record index %d\n", + node_id, i); + return -EINVAL; + } + + entryoffset = off + keylen; + entrylength = len - keylen; + + if (entrylength > sizeof(rec)) { + pr_err("unexpected record length: " + "entrylength %d\n", + entrylength); + return -EINVAL; + } + + hfs_bnode_read(node, &rec, entryoffset, entrylength); + + if (rec.type == HFS_CDR_DIR) { + found_cnid = be32_to_cpu(rec.dir.DirID); + hfs_dbg(CAT_MOD, "found_cnid %u\n", found_cnid); + hfs_set_next_unused_CNID(sb, cnid, found_cnid); + hfs_bnode_put(node); + return 0; + } else if (rec.type == HFS_CDR_FIL) { + found_cnid = be32_to_cpu(rec.file.FlNum); + hfs_dbg(CAT_MOD, "found_cnid %u\n", found_cnid); + hfs_set_next_unused_CNID(sb, cnid, found_cnid); + hfs_bnode_put(node); + return 0; + } + } + + hfs_bnode_put(node); + + node_id = node->prev; + } while (node_id >= leaf_head); + + return -ENOENT; +} /* * hfs_cat_delete() @@ -271,6 +389,11 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) dir->i_size--; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); + + res = hfs_correct_next_unused_CNID(sb, cnid); + if (res) + goto out; + res = 0; out: hfs_find_exit(&fd); diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 7c5a7ecfa2465a..c732a066de787b 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -112,13 +112,13 @@ struct hfs_sb_info { the extents b-tree */ struct hfs_btree *cat_tree; /* Information about the catalog b-tree */ - u32 file_count; /* The number of + atomic64_t file_count; /* The number of regular files in the filesystem */ - u32 folder_count; /* The number of + atomic64_t folder_count; /* The number of directories in the filesystem */ - u32 next_id; /* The next available + atomic64_t next_id; /* The next available file id number */ u32 clumpablks; /* The number of allocation blocks to try to add when diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index bf4cb7e78396bd..490b6ad5d9817a 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -183,6 +183,10 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t { struct super_block *sb = dir->i_sb; struct inode 
*inode = new_inode(sb); + s64 next_id; + s64 file_count; + s64 folder_count; + if (!inode) return NULL; @@ -190,7 +194,9 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); spin_lock_init(&HFS_I(inode)->open_dir_lock); hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); - inode->i_ino = HFS_SB(sb)->next_id++; + next_id = atomic64_inc_return(&HFS_SB(sb)->next_id); + BUG_ON(next_id > U32_MAX); + inode->i_ino = (u32)next_id; inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); @@ -202,7 +208,8 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; if (S_ISDIR(mode)) { inode->i_size = 2; - HFS_SB(sb)->folder_count++; + folder_count = atomic64_inc_return(&HFS_SB(sb)->folder_count); + BUG_ON(folder_count > U32_MAX); if (dir->i_ino == HFS_ROOT_CNID) HFS_SB(sb)->root_dirs++; inode->i_op = &hfs_dir_inode_operations; @@ -211,7 +218,8 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask; } else if (S_ISREG(mode)) { HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; - HFS_SB(sb)->file_count++; + file_count = atomic64_inc_return(&HFS_SB(sb)->file_count); + BUG_ON(file_count > U32_MAX); if (dir->i_ino == HFS_ROOT_CNID) HFS_SB(sb)->root_files++; inode->i_op = &hfs_file_inode_operations; @@ -243,14 +251,17 @@ void hfs_delete_inode(struct inode *inode) hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { - HFS_SB(sb)->folder_count--; + BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX); + atomic64_dec(&HFS_SB(sb)->folder_count); if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_dirs--; set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); return; } - HFS_SB(sb)->file_count--; + + BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX); + atomic64_dec(&HFS_SB(sb)->file_count); if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_files--; if (S_ISREG(inode->i_mode)) { diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index bf811347bb07d3..53f3fae6021797 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -150,11 +150,11 @@ int hfs_mdb_get(struct super_block *sb) /* These parameters are read from and written to the MDB */ HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks); - HFS_SB(sb)->next_id = be32_to_cpu(mdb->drNxtCNID); + atomic64_set(&HFS_SB(sb)->next_id, be32_to_cpu(mdb->drNxtCNID)); HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls); HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs); - HFS_SB(sb)->file_count = be32_to_cpu(mdb->drFilCnt); - HFS_SB(sb)->folder_count = be32_to_cpu(mdb->drDirCnt); + atomic64_set(&HFS_SB(sb)->file_count, be32_to_cpu(mdb->drFilCnt)); + atomic64_set(&HFS_SB(sb)->folder_count, be32_to_cpu(mdb->drDirCnt)); /* TRY to get the alternate (backup) MDB. 
*/ sect = part_start + part_size - 2; @@ -273,11 +273,17 @@ void hfs_mdb_commit(struct super_block *sb) /* These parameters may have been modified, so write them back */ mdb->drLsMod = hfs_mtime(); mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks); - mdb->drNxtCNID = cpu_to_be32(HFS_SB(sb)->next_id); + BUG_ON(atomic64_read(&HFS_SB(sb)->next_id) > U32_MAX); + mdb->drNxtCNID = + cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->next_id)); mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files); mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs); - mdb->drFilCnt = cpu_to_be32(HFS_SB(sb)->file_count); - mdb->drDirCnt = cpu_to_be32(HFS_SB(sb)->folder_count); + BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX); + mdb->drFilCnt = + cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->file_count)); + BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX); + mdb->drDirCnt = + cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->folder_count)); /* write MDB to disk */ mark_buffer_dirty(HFS_SB(sb)->mdb_bh); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 388a318297ece2..47f50fa555a457 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -319,6 +319,10 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) int silent = fc->sb_flags & SB_SILENT; int res; + atomic64_set(&sbi->file_count, 0); + atomic64_set(&sbi->folder_count, 0); + atomic64_set(&sbi->next_id, 0); + /* load_nls_default does not fail */ if (sbi->nls_disk && !sbi->nls_io) sbi->nls_io = load_nls_default();
From 18b07c44f245beb03588b00b212b38fce9af7cc9 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Fri, 15 Aug 2025 12:49:19 -0700 Subject: [PATCH 0631/3206] hfs: clear offset and space out of valid records in b-tree node
Currently, hfs_brec_remove() moves records towards the location of the deleted record and updates the offsets of the moved records. However, the hfs_brec_remove() logic ignores the "mess" left in the b-tree node's free space and doesn't touch the offsets beyond the record count. Potentially, this could confuse fsck or the driver logic, or become a source of corruption. This patch reworks the logic of hfs_brec_remove() by clearing the freed space of the b-tree node after the records are moved. It also clears the last offset, which keeps the old location of the free space, because the offset before it now keeps the actual offset to the free space after the record deletion.
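A minimal userspace model of the move-then-clear step described above (the toy buffer and hyp_* names are assumptions for illustration, not the kernel code): the records after the hole are moved down, and the freed tail is wiped so no stale bytes remain:

```c
#include <stdio.h>
#include <string.h>

#define NODE_SIZE 64

/* Toy node: record data grows from the front; this models only the
 * "fill hole, then wipe the freed tail" part of hfs_brec_remove(). */
static unsigned char node[NODE_SIZE];

static void hyp_remove_record(int rec_off, int rec_len, int data_end)
{
	int src = rec_off + rec_len;
	int len = data_end - src;

	/* move the following records down over the deleted one */
	memmove(node + rec_off, node + src, len);
	/* clear the now-unused space so no stale bytes linger */
	memset(node + rec_off + len, 0, data_end - (rec_off + len));
}

int main(void)
{
	memcpy(node, "AAAABBBBCCCC", 12);	/* three 4-byte records */
	hyp_remove_record(4, 4, 12);		/* delete "BBBB" */
	printf("%.12s|\n", node);		/* "AAAACCCC" plus zeroed tail */
	return 0;
}
```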
Signed-off-by: Viacheslav Dubeyko cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250815194918.38165-1-slava@dubeyko.com Signed-off-by: Viacheslav Dubeyko --- fs/hfs/brec.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 896396554bcc17..b01db1fae147cd 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -179,6 +179,7 @@ int hfs_brec_remove(struct hfs_find_data *fd) struct hfs_btree *tree; struct hfs_bnode *node, *parent; int end_off, rec_off, data_off, size; + int src, dst, len; tree = fd->tree; node = fd->bnode; @@ -208,10 +209,14 @@ int hfs_brec_remove(struct hfs_find_data *fd) } hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); - if (rec_off == end_off) - goto skip; size = fd->keylength + fd->entrylength; + if (rec_off == end_off) { + src = fd->keyoffset; + hfs_bnode_clear(node, src, size); + goto skip; + } + do { data_off = hfs_bnode_read_u16(node, rec_off); hfs_bnode_write_u16(node, rec_off + 2, data_off - size); @@ -219,9 +224,23 @@ int hfs_brec_remove(struct hfs_find_data *fd) } while (rec_off >= end_off); /* fill hole */ - hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, - data_off - fd->keyoffset - size); + dst = fd->keyoffset; + src = fd->keyoffset + size; + len = data_off - src; + + hfs_bnode_move(node, dst, src, len); + + src = dst + len; + len = data_off - src; + + hfs_bnode_clear(node, src, len); + skip: + /* + * Remove the obsolete offset to free space. + */ + hfs_bnode_write_u16(node, end_off, 0); + hfs_bnode_dump(node); if (!fd->record) hfs_brec_update_parent(fd); From 03724b3496cb0272a5050a989ebf2494b1ff5a55 Mon Sep 17 00:00:00 2001 From: SeonGu Kang Date: Mon, 1 Sep 2025 10:49:21 +0530 Subject: [PATCH 0632/3206] dt-bindings: pinctrl: samsung: Add compatible for ARTPEC-8 SoC Document the compatible string for ARTPEC-8 SoC pinctrl block, which is similar to other Samsung SoC pinctrl blocks. Signed-off-by: SeonGu Kang Signed-off-by: Ravi Patel Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250901051926.59970-2-ravi.patel@samsung.com Signed-off-by: Krzysztof Kozlowski --- Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml index de846085614166..9386dcd418c269 100644 --- a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml @@ -35,6 +35,7 @@ properties: compatible: enum: + - axis,artpec8-pinctrl - google,gs101-pinctrl - samsung,s3c2412-pinctrl - samsung,s3c2416-pinctrl From d4ac729964d8967261fe15fdc8f249729f923120 Mon Sep 17 00:00:00 2001 From: SeonGu Kang Date: Mon, 1 Sep 2025 10:49:22 +0530 Subject: [PATCH 0633/3206] pinctrl: samsung: Add ARTPEC-8 SoC specific configuration Add Axis ARTPEC-8 SoC specific configuration data to enable pinctrl. 
Signed-off-by: SeonGu Kang Signed-off-by: Priyadarsini G Signed-off-by: Ravi Patel Link: https://lore.kernel.org/r/20250901051926.59970-3-ravi.patel@samsung.com Signed-off-by: Krzysztof Kozlowski --- .../pinctrl/samsung/pinctrl-exynos-arm64.c | 50 +++++++++++++++++++ drivers/pinctrl/samsung/pinctrl-exynos.h | 10 ++++ drivers/pinctrl/samsung/pinctrl-samsung.c | 2 + drivers/pinctrl/samsung/pinctrl-samsung.h | 1 + 4 files changed, 63 insertions(+) diff --git a/drivers/pinctrl/samsung/pinctrl-exynos-arm64.c b/drivers/pinctrl/samsung/pinctrl-exynos-arm64.c index 5fe7c4b9f7bd42..323487dfa8c2cb 100644 --- a/drivers/pinctrl/samsung/pinctrl-exynos-arm64.c +++ b/drivers/pinctrl/samsung/pinctrl-exynos-arm64.c @@ -76,6 +76,15 @@ static const struct samsung_pin_bank_type exynos8895_bank_type_off = { .reg_offset = { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, }, }; +/* + * Bank type for non-alive type. Bit fields: + * CON: 4, DAT: 1, PUD: 4, DRV: 4 + */ +static const struct samsung_pin_bank_type artpec_bank_type_off = { + .fld_width = { 4, 1, 4, 4, }, + .reg_offset = { 0x00, 0x04, 0x08, 0x0c, }, +}; + /* Pad retention control code for accessing PMU regmap */ static atomic_t exynos_shared_retention_refcnt; @@ -1816,3 +1825,44 @@ const struct samsung_pinctrl_of_match_data gs101_of_data __initconst = { .ctrl = gs101_pin_ctrl, .num_ctrl = ARRAY_SIZE(gs101_pin_ctrl), }; + +/* pin banks of artpec8 pin-controller (FSYS0) */ +static const struct samsung_pin_bank_data artpec8_pin_banks0[] __initconst = { + ARTPEC_PIN_BANK_EINTG(5, 0x000, "gpf0", 0x00), + ARTPEC_PIN_BANK_EINTG(4, 0x020, "gpf1", 0x04), + ARTPEC_PIN_BANK_EINTG(8, 0x040, "gpf2", 0x08), + ARTPEC_PIN_BANK_EINTG(4, 0x060, "gpf3", 0x0c), + ARTPEC_PIN_BANK_EINTG(7, 0x080, "gpf4", 0x10), + ARTPEC_PIN_BANK_EINTG(8, 0x0a0, "gpe0", 0x14), + ARTPEC_PIN_BANK_EINTG(8, 0x0c0, "gpe1", 0x18), + ARTPEC_PIN_BANK_EINTG(6, 0x0e0, "gpe2", 0x1c), + ARTPEC_PIN_BANK_EINTG(8, 0x100, "gps0", 0x20), + ARTPEC_PIN_BANK_EINTG(8, 0x120, "gps1", 0x24), +}; + +/* pin banks of artpec8 pin-controller (PERIC) */ +static const struct samsung_pin_bank_data artpec8_pin_banks1[] __initconst = { + ARTPEC_PIN_BANK_EINTG(8, 0x000, "gpa0", 0x00), + ARTPEC_PIN_BANK_EINTG(8, 0x020, "gpa1", 0x04), + ARTPEC_PIN_BANK_EINTG(8, 0x040, "gpa2", 0x08), + ARTPEC_PIN_BANK_EINTG(2, 0x060, "gpk0", 0x0c), +}; + +static const struct samsung_pin_ctrl artpec8_pin_ctrl[] __initconst = { + { + /* pin-controller instance 0 FSYS data */ + .pin_banks = artpec8_pin_banks0, + .nr_banks = ARRAY_SIZE(artpec8_pin_banks0), + .eint_gpio_init = exynos_eint_gpio_init, + }, { + /* pin-controller instance 1 PERIC data */ + .pin_banks = artpec8_pin_banks1, + .nr_banks = ARRAY_SIZE(artpec8_pin_banks1), + .eint_gpio_init = exynos_eint_gpio_init, + }, +}; + +const struct samsung_pinctrl_of_match_data artpec8_of_data __initconst = { + .ctrl = artpec8_pin_ctrl, + .num_ctrl = ARRAY_SIZE(artpec8_pin_ctrl), +}; diff --git a/drivers/pinctrl/samsung/pinctrl-exynos.h b/drivers/pinctrl/samsung/pinctrl-exynos.h index 362dc533186fb4..c9c38f8988dd2c 100644 --- a/drivers/pinctrl/samsung/pinctrl-exynos.h +++ b/drivers/pinctrl/samsung/pinctrl-exynos.h @@ -236,6 +236,16 @@ .name = id \ } +#define ARTPEC_PIN_BANK_EINTG(pins, reg, id, offs) \ + { \ + .type = &artpec_bank_type_off, \ + .pctl_offset = reg, \ + .nr_pins = pins, \ + .eint_type = EINT_TYPE_GPIO, \ + .eint_offset = offs, \ + .name = id \ + } + /** * struct exynos_weint_data: irq specific data for all the wakeup interrupts * generated by the external wakeup interrupt controller. 
diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.c b/drivers/pinctrl/samsung/pinctrl-samsung.c index 24745e1d78cec5..c099195fc464e3 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.c +++ b/drivers/pinctrl/samsung/pinctrl-samsung.c @@ -1482,6 +1482,8 @@ static const struct of_device_id samsung_pinctrl_dt_match[] = { .data = &s5pv210_of_data }, #endif #ifdef CONFIG_PINCTRL_EXYNOS_ARM64 + { .compatible = "axis,artpec8-pinctrl", + .data = &artpec8_of_data }, { .compatible = "google,gs101-pinctrl", .data = &gs101_of_data }, { .compatible = "samsung,exynos2200-pinctrl", diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.h b/drivers/pinctrl/samsung/pinctrl-samsung.h index 1cabcbe1401a61..be2dee886d81a9 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.h +++ b/drivers/pinctrl/samsung/pinctrl-samsung.h @@ -381,6 +381,7 @@ struct samsung_pmx_func { }; /* list of all exported SoC specific data */ +extern const struct samsung_pinctrl_of_match_data artpec8_of_data; extern const struct samsung_pinctrl_of_match_data exynos2200_of_data; extern const struct samsung_pinctrl_of_match_data exynos3250_of_data; extern const struct samsung_pinctrl_of_match_data exynos4210_of_data; From cd8ae32e4e4652db55bce6b9c79267d8946765a9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 29 Aug 2025 10:54:15 +0200 Subject: [PATCH 0634/3206] xfrm: xfrm_alloc_spi shouldn't use 0 as SPI x->id.spi == 0 means "no SPI assigned", but since commit 94f39804d891 ("xfrm: Duplicate SPI Handling"), we now create states and add them to the byspi list with this value. __xfrm_state_delete doesn't remove those states from the byspi list, since they shouldn't be there, and this shows up as a UAF the next time we go through the byspi list. Reported-by: syzbot+a25ee9d20d31e483ba7b@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a25ee9d20d31e483ba7b Fixes: 94f39804d891 ("xfrm: Duplicate SPI Handling") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 78fcbb89cf3256..d213ca3653a8f2 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2583,6 +2583,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, for (h = 0; h < range; h++) { u32 spi = (low == high) ? low : get_random_u32_inclusive(low, high); + if (spi == 0) + goto next; newspi = htonl(spi); spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -2598,6 +2600,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, xfrm_state_put(x0); spin_unlock_bh(&net->xfrm.xfrm_state_lock); +next: if (signal_pending(current)) { err = -ERESTARTSYS; goto unlock; From 8e0665eb85394ea9823fb965a73ae9a6265af4b0 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 1 Aug 2025 10:20:06 +0200 Subject: [PATCH 0635/3206] powerpc: Replace __ASSEMBLY__ with __ASSEMBLER__ in uapi headers __ASSEMBLY__ is only defined by the Makefile of the kernel, so this is not really useful for uapi headers (unless the userspace Makefile defines it, too). Let's switch to __ASSEMBLER__ which gets set automatically by the compiler when compiling assembler code. This is a completely mechanical patch (done with a simple "sed -i" statement). 
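A minimal sketch of the guard pattern this series converts to (the header and names here are hypothetical): because GCC and Clang predefine __ASSEMBLER__ whenever they compile assembler input, the C-only parts are hidden from assembler translation units without any Makefile involvement:

```c
/* hypothetical_header.h - a minimal uapi-style header guarded the way
 * the patch describes. */
#ifndef HYPOTHETICAL_HEADER_H
#define HYPOTHETICAL_HEADER_H

#define HYP_FLAG_ENABLE 0x1	/* visible to both C and assembler */

#ifndef __ASSEMBLER__
/* C-only declarations: assembler input cannot parse a struct */
struct hyp_regs {
	unsigned long gpr[32];
};
#endif /* __ASSEMBLER__ */

#endif /* HYPOTHETICAL_HEADER_H */
```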
Acked-by: Segher Boessenkool Signed-off-by: Thomas Huth Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250801082007.32904-2-thuth@redhat.com --- arch/powerpc/include/uapi/asm/opal-prd.h | 4 ++-- arch/powerpc/include/uapi/asm/ptrace.h | 12 ++++++------ arch/powerpc/include/uapi/asm/types.h | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/uapi/asm/opal-prd.h b/arch/powerpc/include/uapi/asm/opal-prd.h index 1869cf83a870ed..11abcf0192ca1a 100644 --- a/arch/powerpc/include/uapi/asm/opal-prd.h +++ b/arch/powerpc/include/uapi/asm/opal-prd.h @@ -40,7 +40,7 @@ #define OPAL_PRD_SCOM_READ _IOR('o', 0x02, struct opal_prd_scom) #define OPAL_PRD_SCOM_WRITE _IOW('o', 0x03, struct opal_prd_scom) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct opal_prd_info { __u64 version; @@ -54,6 +54,6 @@ struct opal_prd_scom { __s64 rc; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_POWERPC_OPAL_PRD_H */ diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h index 7004cfea3f5ff8..01e630149d48e1 100644 --- a/arch/powerpc/include/uapi/asm/ptrace.h +++ b/arch/powerpc/include/uapi/asm/ptrace.h @@ -27,7 +27,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef __KERNEL__ struct user_pt_regs @@ -57,7 +57,7 @@ struct pt_regs unsigned long result; /* Result of a system call */ }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* @@ -200,7 +200,7 @@ struct pt_regs #define PPC_PTRACE_SETHWDEBUG 0x88 #define PPC_PTRACE_DELHWDEBUG 0x87 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct ppc_debug_info { __u32 version; /* Only version 1 exists to date */ @@ -212,7 +212,7 @@ struct ppc_debug_info { __u64 features; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * features will have bits indication whether there is support for: @@ -224,7 +224,7 @@ struct ppc_debug_info { #define PPC_DEBUG_FEATURE_DATA_BP_DAWR 0x0000000000000010 #define PPC_DEBUG_FEATURE_DATA_BP_ARCH_31 0x0000000000000020 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct ppc_hw_breakpoint { __u32 version; /* currently, version must be 1 */ @@ -236,7 +236,7 @@ struct ppc_hw_breakpoint { __u64 condition_value; /* contents of the DVC register */ }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Trigger Type diff --git a/arch/powerpc/include/uapi/asm/types.h b/arch/powerpc/include/uapi/asm/types.h index 327616fb70e449..9dbf55e38ea58b 100644 --- a/arch/powerpc/include/uapi/asm/types.h +++ b/arch/powerpc/include/uapi/asm/types.h @@ -28,14 +28,14 @@ # include #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef struct { __u32 u[4]; } __attribute__((aligned(16))) __vector128; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_POWERPC_TYPES_H */ From 74db6cc331b0da5c48c62b7af68d747ec9af1984 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 1 Aug 2025 10:20:07 +0200 Subject: [PATCH 0636/3206] powerpc: Replace __ASSEMBLY__ with __ASSEMBLER__ in non-uapi headers While the GCC and Clang compilers already define __ASSEMBLER__ automatically when compiling assembler code, __ASSEMBLY__ is a macro that only gets defined by the Makefiles in the kernel. This is bad since macros starting with two underscores are names that are reserved by the C language. It can also be very confusing for the developers when switching between userspace and kernelspace coding, or when dealing with uapi headers that rather should use __ASSEMBLER__ instead. 
So let's standardize now on the __ASSEMBLER__ macro that is provided
by the compilers. This is almost a completely mechanical patch (done
with a simple "sed -i" statement), apart from tweaking two comments
manually in arch/powerpc/include/asm/bug.h and
arch/powerpc/include/asm/kasan.h (which did not have proper
underscores at the end) and fixing a checkpatch error about spaces in
arch/powerpc/include/asm/spu_csa.h.

Signed-off-by: Thomas Huth
Signed-off-by: Madhavan Srinivasan
Link: https://patch.msgid.link/20250801082007.32904-3-thuth@redhat.com
---
 arch/powerpc/boot/page.h | 2 +-
 arch/powerpc/include/asm/asm-const.h | 2 +-
 arch/powerpc/include/asm/barrier.h | 2 +-
 arch/powerpc/include/asm/book3s/32/kup.h | 4 ++--
 arch/powerpc/include/asm/book3s/32/mmu-hash.h | 8 ++++----
 arch/powerpc/include/asm/book3s/32/pgtable.h | 12 ++++++------
 arch/powerpc/include/asm/book3s/64/hash-4k.h | 4 ++--
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 4 ++--
 arch/powerpc/include/asm/book3s/64/hash.h | 4 ++--
 arch/powerpc/include/asm/book3s/64/kup.h | 6 +++---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 12 ++++++------
 arch/powerpc/include/asm/book3s/64/mmu.h | 8 ++++----
 arch/powerpc/include/asm/book3s/64/pgtable-64k.h | 4 ++--
 arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +++++-----
 arch/powerpc/include/asm/book3s/64/radix.h | 8 ++++----
 arch/powerpc/include/asm/book3s/64/slice.h | 4 ++--
 arch/powerpc/include/asm/bug.h | 14 +++++++-------
 arch/powerpc/include/asm/cache.h | 4 ++--
 arch/powerpc/include/asm/cpu_has_feature.h | 4 ++--
 arch/powerpc/include/asm/cpuidle.h | 2 +-
 arch/powerpc/include/asm/cputable.h | 8 ++++----
 arch/powerpc/include/asm/cputhreads.h | 4 ++--
 arch/powerpc/include/asm/dcr-native.h | 4 ++--
 arch/powerpc/include/asm/dcr.h | 4 ++--
 arch/powerpc/include/asm/epapr_hcalls.h | 4 ++--
 arch/powerpc/include/asm/exception-64e.h | 2 +-
 arch/powerpc/include/asm/exception-64s.h | 6 +++---
 arch/powerpc/include/asm/extable.h | 2 +-
 arch/powerpc/include/asm/feature-fixups.h | 6 +++---
 arch/powerpc/include/asm/firmware.h | 4 ++--
 arch/powerpc/include/asm/fixmap.h | 4 ++--
 arch/powerpc/include/asm/ftrace.h | 8 ++++----
 arch/powerpc/include/asm/head-64.h | 4 ++--
 arch/powerpc/include/asm/hvcall.h | 4 ++--
 arch/powerpc/include/asm/hw_irq.h | 4 ++--
 arch/powerpc/include/asm/interrupt.h | 4 ++--
 arch/powerpc/include/asm/irqflags.h | 2 +-
 arch/powerpc/include/asm/jump_label.h | 2 +-
 arch/powerpc/include/asm/kasan.h | 4 ++--
 arch/powerpc/include/asm/kdump.h | 4 ++--
 arch/powerpc/include/asm/kexec.h | 4 ++--
 arch/powerpc/include/asm/kgdb.h | 4 ++--
 arch/powerpc/include/asm/kup.h | 8 ++++----
 arch/powerpc/include/asm/kvm_asm.h | 2 +-
 arch/powerpc/include/asm/kvm_book3s_asm.h | 6 +++---
 arch/powerpc/include/asm/kvm_booke_hv_asm.h | 4 ++--
 arch/powerpc/include/asm/lv1call.h | 4 ++--
 arch/powerpc/include/asm/mmu.h | 8 ++++----
 arch/powerpc/include/asm/mpc52xx.h | 12 ++++++------
 arch/powerpc/include/asm/nohash/32/kup-8xx.h | 4 ++--
 arch/powerpc/include/asm/nohash/32/mmu-44x.h | 4 ++--
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 4 ++--
 arch/powerpc/include/asm/nohash/32/pgtable.h | 12 ++++++------
 arch/powerpc/include/asm/nohash/32/pte-8xx.h | 2 +-
 arch/powerpc/include/asm/nohash/64/pgtable-4k.h | 8 ++++----
 arch/powerpc/include/asm/nohash/64/pgtable.h | 4 ++--
 arch/powerpc/include/asm/nohash/kup-booke.h | 4 ++--
 arch/powerpc/include/asm/nohash/mmu-e500.h | 4 ++--
 arch/powerpc/include/asm/nohash/pgtable.h | 6 +++---
 arch/powerpc/include/asm/nohash/pte-e500.h | 4 ++--
 arch/powerpc/include/asm/opal-api.h | 4 ++--
 arch/powerpc/include/asm/opal.h | 4 ++--
 arch/powerpc/include/asm/page.h | 14 +++++++-------
 arch/powerpc/include/asm/page_32.h | 4 ++--
 arch/powerpc/include/asm/page_64.h | 4 ++--
 arch/powerpc/include/asm/pgtable.h | 8 ++++----
 arch/powerpc/include/asm/ppc_asm.h | 4 ++--
 arch/powerpc/include/asm/processor.h | 8 ++++----
 arch/powerpc/include/asm/ptrace.h | 6 +++---
 arch/powerpc/include/asm/reg.h | 6 +++---
 arch/powerpc/include/asm/reg_booke.h | 4 ++--
 arch/powerpc/include/asm/reg_fsl_emb.h | 4 ++--
 arch/powerpc/include/asm/setup.h | 4 ++--
 arch/powerpc/include/asm/smp.h | 4 ++--
 arch/powerpc/include/asm/spu_csa.h | 4 ++--
 arch/powerpc/include/asm/synch.h | 4 ++--
 arch/powerpc/include/asm/thread_info.h | 8 ++++----
 arch/powerpc/include/asm/tm.h | 4 ++--
 arch/powerpc/include/asm/types.h | 4 ++--
 arch/powerpc/include/asm/unistd.h | 4 ++--
 arch/powerpc/include/asm/vdso.h | 6 +++---
 arch/powerpc/include/asm/vdso/getrandom.h | 4 ++--
 arch/powerpc/include/asm/vdso/gettimeofday.h | 4 ++--
 arch/powerpc/include/asm/vdso/processor.h | 4 ++--
 arch/powerpc/include/asm/vdso/vsyscall.h | 4 ++--
 arch/powerpc/include/asm/vdso_datapage.h | 6 +++---
 arch/powerpc/kernel/head_booke.h | 4 ++--
 arch/powerpc/net/bpf_jit.h | 2 +-
 arch/powerpc/platforms/powernv/subcore.h | 4 ++--
 arch/powerpc/xmon/xmon_bpts.h | 4 ++--
 .../selftests/powerpc/include/instructions.h | 2 +-
 91 files changed, 230 insertions(+), 230 deletions(-)

diff --git a/arch/powerpc/boot/page.h b/arch/powerpc/boot/page.h
index c3d55fc8f34c4e..e44a3119720db5 100644
--- a/arch/powerpc/boot/page.h
+++ b/arch/powerpc/boot/page.h
@@ -5,7 +5,7 @@
  * Copyright (C) 2001 PPC64 Team, IBM Corp
  */
 
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
 #define ASM_CONST(x) x
 #else
 #define __ASM_CONST(x) x##UL
diff --git a/arch/powerpc/include/asm/asm-const.h b/arch/powerpc/include/asm/asm-const.h
index bfb3c3534877a0..392bdb1f104f4e 100644
--- a/arch/powerpc/include/asm/asm-const.h
+++ b/arch/powerpc/include/asm/asm-const.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_POWERPC_ASM_CONST_H
 #define _ASM_POWERPC_ASM_CONST_H
 
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
 # define stringify_in_c(...)
__VA_ARGS__ # define ASM_CONST(x) x #else diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index b95b666f03744e..9e9833faa4af87 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -7,7 +7,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #endif diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index 4e14a5427a6323..873c5146e32610 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -7,7 +7,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC_KUAP @@ -170,6 +170,6 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) #endif /* CONFIG_PPC_KUAP */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_32_KUP_H */ diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 78c6a5fde1d615..8435bf3cdabfaa 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -29,7 +29,7 @@ #define BPP_RX 0x01 /* Read only */ #define BPP_RW 0x02 /* Read/write */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Contort a phys_addr_t into the right format/bits for a BAT */ #ifdef CONFIG_PHYS_64BIT #define BAT_PHYS_ADDR(x) ((u32)((x & 0x00000000fffe0000ULL) | \ @@ -47,7 +47,7 @@ struct ppc_bat { u32 batu; u32 batl; }; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * Hash table @@ -64,7 +64,7 @@ struct ppc_bat { #define SR_KP 0x20000000 /* User key */ #define SR_KS 0x40000000 /* Supervisor key */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include @@ -225,7 +225,7 @@ static __always_inline void update_user_segments(u32 val) int __init find_free_bat(void); unsigned int bat_block_size(unsigned long base, unsigned long top); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* We happily ignore the smaller BATs on 601, we don't actually use * those definitions on hash32 at the moment anyway diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 92d21c6faf1e57..87dcca962be786 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -102,7 +102,7 @@ #define PMD_CACHE_INDEX PMD_INDEX_SIZE #define PUD_CACHE_INDEX PUD_INDEX_SIZE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) #define PMD_TABLE_SIZE 0 #define PUD_TABLE_SIZE 0 @@ -110,7 +110,7 @@ /* Bits to mask out from a PMD to get to the PTE page */ #define PMD_MASKED_BITS (PTE_TABLE_SIZE - 1) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) @@ -132,12 +132,12 @@ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); void unmap_kernel_page(unsigned long va); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary @@ -199,7 +199,7 @@ void unmap_kernel_page(unsigned long va); #define MODULES_SIZE (CONFIG_MODULES_SIZE * SZ_1M) #define MODULES_VADDR (MODULES_END - MODULES_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -602,6 +602,6 @@ static inline pgprot_t pgprot_writecombine(pgprot_t prot) return 
pgprot_noncached_wc(prot); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_32_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 7132392fa7cdfe..8e5bd9902bed4e 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -32,7 +32,7 @@ */ #define H_KERN_VIRT_START ASM_CONST(0xc0003d0000000000) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) #define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE) #define H_PUD_TABLE_SIZE (sizeof(pud_t) << H_PUD_INDEX_SIZE) @@ -168,6 +168,6 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, extern int hash__has_transparent_hugepage(void); #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_HASH_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 0fb5b7da94783e..7deb3a66890bcd 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -79,7 +79,7 @@ #endif #define H_PMD_FRAG_NR (PAGE_SIZE >> H_PMD_FRAG_SIZE_SHIFT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /* @@ -281,6 +281,6 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, extern int hash__has_transparent_hugepage(void); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 0755f2567021dc..5a8cbd496731ef 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -112,7 +112,7 @@ #define H_PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline int get_region_id(unsigned long ea) { int region_id; @@ -295,6 +295,6 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid, pgprot_t prot); int hash__remove_section_mapping(unsigned long start, unsigned long end); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */ diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index 497a7bd31ecc01..03aec3c6c851c1 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -10,7 +10,7 @@ #define AMR_KUEP_BLOCKED UL(0x5455555555555555) #define AMR_KUAP_BLOCKED (AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE) -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro kuap_user_restore gpr1, gpr2 #if defined(CONFIG_PPC_PKEY) @@ -191,7 +191,7 @@ #endif .endm -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #include #include @@ -413,6 +413,6 @@ static __always_inline void restore_user_access(unsigned long flags) if (static_branch_unlikely(&uaccess_flush_key) && flags == AMR_KUAP_BLOCKED) do_uaccess_flush(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_KUP_H */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 1c4eebbc69c94b..3463514232071e 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -130,7 +130,7 @@ #define POWER9_TLB_SETS_HASH 256 
/* # sets in POWER9 TLB Hash mode */ #define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct mmu_hash_ops { void (*hpte_invalidate)(unsigned long slot, @@ -220,7 +220,7 @@ static inline unsigned long get_sllp_encoding(int psize) return sllp; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Segment sizes. @@ -248,7 +248,7 @@ static inline unsigned long get_sllp_encoding(int psize) #define LP_BITS 8 #define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline int slb_vsid_shift(int ssize) { @@ -532,7 +532,7 @@ void slb_set_size(u16 size); static inline void slb_set_size(u16 size) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * VSID allocation (256MB segment) @@ -668,7 +668,7 @@ static inline void slb_set_size(u16 size) { } #define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) #define LOW_SLICE_ARRAY_SZ (BITS_PER_LONG / BITS_PER_BYTE) #define TASK_SLICE_ARRAY_SZ(x) ((x)->hash_context->slb_addr_limit >> 41) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC_SUBPAGE_PROT /* @@ -881,5 +881,5 @@ static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index fedbc5d381917c..48631365b48cfc 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -4,7 +4,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Page size definition * @@ -26,12 +26,12 @@ struct mmu_psize_def { }; }; extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* 64-bit classic hash table MMU */ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * ISA 3.0 partition and process table entry format */ @@ -288,5 +288,5 @@ static inline unsigned long get_user_vsid(mm_context_t *ctx, } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index 4d8d7b4ea16ba5..004a03e97e58ee 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H #define _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_HUGETLB_PAGE #endif /* CONFIG_HUGETLB_PAGE */ @@ -14,5 +14,5 @@ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, BUG(); return hash__remap_4k_pfn(vma, addr, pfn, prot); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c1980036531599..aac8ce30cd3b39 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -4,7 +4,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -143,7 +143,7 @@ #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * page table defines */ @@ -291,7 +291,7 @@ 
static inline unsigned long pud_leaf_size(pud_t pud) else return PUD_SIZE; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #include #include @@ -327,7 +327,7 @@ static inline unsigned long pud_leaf_size(pud_t pud) #define FIXADDR_SIZE SZ_32M #define FIXADDR_TOP (IOREMAP_END + FIXADDR_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long clr, @@ -1381,5 +1381,5 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va return false; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index df23a8267e4d09..da954e7797441f 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -4,7 +4,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #endif @@ -14,7 +14,7 @@ #include #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #endif @@ -132,7 +132,7 @@ #define RADIX_VMEMMAP_SIZE RADIX_KERN_MAP_SIZE #define RADIX_VMEMMAP_END (RADIX_VMEMMAP_START + RADIX_VMEMMAP_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) #define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE) #define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE) @@ -362,5 +362,5 @@ int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, unsigned long start, unsigned long end, int node, struct dev_pagemap *pgmap); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h index 5fbe18544cbd1b..6e2f7a74cd7591 100644 --- a/arch/powerpc/include/asm/book3s/64/slice.h +++ b/arch/powerpc/include/asm/book3s/64/slice.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_BOOK3S_64_SLICE_H #define _ASM_POWERPC_BOOK3S_64_SLICE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC_64S_HASH_MMU #ifdef CONFIG_HUGETLB_PAGE @@ -37,6 +37,6 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, void slice_init_new_context_exec(struct mm_struct *mm); void slice_setup_new_exec(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_SLICE_H */ diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 1db485aacbd9b7..bbaa7e81f82134 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -7,7 +7,7 @@ #ifdef CONFIG_BUG -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include #ifdef CONFIG_DEBUG_BUGVERBOSE .macro EMIT_BUG_ENTRY addr,file,line,flags @@ -31,7 +31,7 @@ .endm #endif /* verbose */ -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ /* _EMIT_BUG_ENTRY expects args %0,%1,%2,%3 to be FILE, LINE, flags and sizeof(struct bug_entry), respectively */ #ifdef CONFIG_DEBUG_BUGVERBOSE @@ -101,12 +101,12 @@ #define HAVE_ARCH_WARN_ON #endif -#endif /* __ASSEMBLY __ */ +#endif /* __ASSEMBLER__ */ #else -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro EMIT_BUG_ENTRY addr,file,line,flags .endm -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #define _EMIT_BUG_ENTRY #endif #endif /* CONFIG_BUG */ @@ -115,7 +115,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct pt_regs; void hash__do_page_fault(struct pt_regs *); @@ -128,7 +128,7 @@ void die_mce(const char *str, 
struct pt_regs *regs, long err); extern bool die_will_crash(void); extern void panic_flush_kmsg_start(void); extern void panic_flush_kmsg_end(void); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BUG_H */ diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h index 69232231d27080..6796babc4d310c 100644 --- a/arch/powerpc/include/asm/cache.h +++ b/arch/powerpc/include/asm/cache.h @@ -37,7 +37,7 @@ #define ARCH_DMA_MINALIGN L1_CACHE_BYTES #endif -#if !defined(__ASSEMBLY__) +#if !defined(__ASSEMBLER__) #ifdef CONFIG_PPC64 struct ppc_cache_info { @@ -145,6 +145,6 @@ static inline void iccci(void *addr) asm volatile ("iccci 0, %0" : : "r"(addr) : "memory"); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_CACHE_H */ diff --git a/arch/powerpc/include/asm/cpu_has_feature.h b/arch/powerpc/include/asm/cpu_has_feature.h index bf8a228229fa92..604fa3b6c33d47 100644 --- a/arch/powerpc/include/asm/cpu_has_feature.h +++ b/arch/powerpc/include/asm/cpu_has_feature.h @@ -2,7 +2,7 @@ #ifndef __ASM_POWERPC_CPU_HAS_FEATURE_H #define __ASM_POWERPC_CPU_HAS_FEATURE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -51,5 +51,5 @@ static __always_inline bool cpu_has_feature(unsigned long feature) } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_POWERPC_CPU_HAS_FEATURE_H */ diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 0cce5dc7fb1c2d..054cd2fcfd551d 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -68,7 +68,7 @@ #define ERR_EC_ESL_MISMATCH -1 #define ERR_DEEP_STATE_ESL_MISMATCH -2 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define PNV_IDLE_NAME_LEN 16 struct pnv_idle_states_t { diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 29a529d2ab8b44..ec16c12296da80 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -7,7 +7,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* This structure can grow, it's real size is used by head.S code * via the mkdefs mechanism. 
@@ -103,7 +103,7 @@ extern void cpu_feature_keys_init(void); static inline void cpu_feature_keys_init(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* CPU kernel features */ @@ -195,7 +195,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_DEXCR_NPHIE LONG_ASM_CONST(0x0010000000000000) #define CPU_FTR_P11_PVR LONG_ASM_CONST(0x0020000000000000) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define CPU_FTR_PPCAS_ARCH_V2 (CPU_FTR_NOEXECUTE) @@ -602,6 +602,6 @@ enum { */ #define HBP_NUM_MAX 2 -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_POWERPC_CPUTABLE_H */ diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h index f26c430f398264..d06f2b20b81057 100644 --- a/arch/powerpc/include/asm/cputhreads.h +++ b/arch/powerpc/include/asm/cputhreads.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_CPUTHREADS_H #define _ASM_POWERPC_CPUTHREADS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -107,7 +107,7 @@ static inline u32 get_tensr(void) void book3e_start_thread(int thread, unsigned long addr); void book3e_stop_thread(int thread); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define INVALID_THREAD_HWID 0x0fff diff --git a/arch/powerpc/include/asm/dcr-native.h b/arch/powerpc/include/asm/dcr-native.h index a92059964579b8..65b3fc2dc4043d 100644 --- a/arch/powerpc/include/asm/dcr-native.h +++ b/arch/powerpc/include/asm/dcr-native.h @@ -7,7 +7,7 @@ #ifndef _ASM_POWERPC_DCR_NATIVE_H #define _ASM_POWERPC_DCR_NATIVE_H #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -139,6 +139,6 @@ static inline void __dcri_clrset(int base_addr, int base_data, int reg, DCRN_ ## base ## _CONFIG_DATA, \ reg, clr, set) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_DCR_NATIVE_H */ diff --git a/arch/powerpc/include/asm/dcr.h b/arch/powerpc/include/asm/dcr.h index 180021cd0b30f7..3c0fac2cc2b257 100644 --- a/arch/powerpc/include/asm/dcr.h +++ b/arch/powerpc/include/asm/dcr.h @@ -7,7 +7,7 @@ #ifndef _ASM_POWERPC_DCR_H #define _ASM_POWERPC_DCR_H #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC_DCR #include @@ -28,6 +28,6 @@ extern unsigned int dcr_resource_start(const struct device_node *np, extern unsigned int dcr_resource_len(const struct device_node *np, unsigned int index); #endif /* CONFIG_PPC_DCR */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_DCR_H */ diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index cdf3c6df5123a7..8fc5aaa4bbbad8 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -52,7 +52,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -571,5 +571,5 @@ static inline long epapr_hypercall4(unsigned int nr, unsigned long p1, in[3] = p4; return epapr_hypercall(in, out, nr); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _EPAPR_HCALLS_H */ diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index b1ef1e92c34a1b..1a83b1ff3578a7 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -149,7 +149,7 @@ exc_##label##_book3e: addi r11,r13,PACA_EXTLB; \ TLB_MISS_RESTORE(r11) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern unsigned int interrupt_base_book3e; #endif diff --git 
a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index bb6f78fcf981cc..a9437e89f69f72 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -53,7 +53,7 @@ */ #define MAX_MCE_DEPTH 4 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define STF_ENTRY_BARRIER_SLOT \ STF_ENTRY_BARRIER_FIXUP_SECTION; \ @@ -170,9 +170,9 @@ RFSCV; \ b rfscv_flush_fallback -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ /* Prototype for function defined in exceptions-64s.S */ void do_uaccess_flush(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_EXCEPTION_H */ diff --git a/arch/powerpc/include/asm/extable.h b/arch/powerpc/include/asm/extable.h index 26ce2e5c0fa8e9..d483a9c24ba96f 100644 --- a/arch/powerpc/include/asm/extable.h +++ b/arch/powerpc/include/asm/extable.h @@ -17,7 +17,7 @@ #define ARCH_HAS_RELATIVE_EXTABLE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct exception_table_entry { int insn; diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 17d168dd8b4912..756a6c694018ce 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -168,7 +168,7 @@ label##5: \ #define ALT_FW_FTR_SECTION_END_IFCLR(msk) \ ALT_FW_FTR_SECTION_END_NESTED_IFCLR(msk, 97) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define ASM_FTR_IF(section_if, section_else, msk, val) \ stringify_in_c(BEGIN_FTR_SECTION) \ @@ -196,7 +196,7 @@ label##5: \ #define ASM_MMU_FTR_IFCLR(section_if, section_else, msk) \ ASM_MMU_FTR_IF(section_if, section_else, (msk), 0) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* LWSYNC feature sections */ #define START_LWSYNC_SECTION(label) label##1: @@ -276,7 +276,7 @@ label##3: \ FTR_ENTRY_OFFSET 956b-957b; \ .popsection; -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include extern long stf_barrier_fallback; diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 69ae9cf57d50b6..abd7c56f4d55ca 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -58,7 +58,7 @@ #define FW_FEATURE_WATCHDOG ASM_CONST(0x0000080000000000) #define FW_FEATURE_PLPKS ASM_CONST(0x0000100000000000) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ enum { #ifdef CONFIG_PPC64 @@ -146,6 +146,6 @@ void pseries_probe_fw_features(void); static inline void pseries_probe_fw_features(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_FIRMWARE_H */ diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index f9068dd8dfce7a..bc5109eab5b744 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -14,7 +14,7 @@ #ifndef _ASM_FIXMAP_H #define _ASM_FIXMAP_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -111,5 +111,5 @@ static inline void __set_fixmap(enum fixed_addresses idx, #define VIRT_IMMR_BASE (__fix_to_virt(FIX_IMMR_BASE)) #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 82da7c7a1d1256..bd61a230b19d08 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -15,7 +15,7 @@ #define FTRACE_MCOUNT_MAX_OFFSET 8 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void _mcount(void); unsigned long prepare_ftrace_return(unsigned long parent, unsigned 
long ip, @@ -69,14 +69,14 @@ struct ftrace_ops; void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif #endif /* CONFIG_FUNCTION_TRACER */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_FTRACE_SYSCALLS /* * Some syscall entry functions on powerpc start with "ppc_" (fork and clone, @@ -160,6 +160,6 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsi static inline void ftrace_free_init_tramp(void) { } static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; } #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_FTRACE */ diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index d73153b0275d65..3966bd5810cb65 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -4,7 +4,7 @@ #include -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* * We can't do CPP stringification and concatination directly into the section * name for some reason, so these macros can do it for us. @@ -167,6 +167,6 @@ end_##sname: // find label from _within_ sname #define ABS_ADDR(label, sname) (label - start_ ## sname + sname ## _start) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_HEAD_64_H */ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index ea6c8dc400d294..9aef16149d9274 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -534,7 +534,7 @@ #define H_HTM_TARGET_NODAL_CHIP_INDEX(x) ((unsigned long)(x)<<(63-31)) #define H_HTM_TARGET_CORE_INDEX_ON_CHIP(x) ((unsigned long)(x)<<(63-47)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /** @@ -735,6 +735,6 @@ struct hv_gpci_request_buffer { uint8_t bytes[HGPCI_MAX_DATA_BYTES]; } __packed; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 569ac1165b0693..1078ba88efaf46 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -59,7 +59,7 @@ #define IRQS_PMI_DISABLED 2 #define IRQS_ALL_DISABLED (IRQS_DISABLED | IRQS_PMI_DISABLED) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline void __hard_irq_enable(void) { @@ -516,6 +516,6 @@ static inline unsigned long mtmsr_isync_irqsafe(unsigned long msr) #define ARCH_IRQ_INIT_FLAGS IRQ_NOREQUEST -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HW_IRQ_H */ diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 23638d4e73ac03..eb0e4a20b81883 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -64,7 +64,7 @@ #define INTERRUPT_DATA_LOAD_TLB_MISS_603 0x1100 #define INTERRUPT_DATA_STORE_TLB_MISS_603 0x1200 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -675,6 +675,6 @@ unsigned long interrupt_exit_user_restart(struct pt_regs *regs); unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs); #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_INTERRUPT_H */ diff --git a/arch/powerpc/include/asm/irqflags.h b/arch/powerpc/include/asm/irqflags.h index 47d46712928ac6..1351fb40fe7491 
100644 --- a/arch/powerpc/include/asm/irqflags.h +++ b/arch/powerpc/include/asm/irqflags.h @@ -5,7 +5,7 @@ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Get definitions for arch_local_save_flags(x), etc. */ diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 2f2a86ed2280aa..d4eaba459a0ed5 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -6,7 +6,7 @@ * Copyright 2010 Michael Ellerman, IBM Corp. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index b5bbb94c51f6db..db121494462248 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -12,7 +12,7 @@ #define EXPORT_SYMBOL_KASAN(fn) #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -80,5 +80,5 @@ void kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end); int kasan_init_region(void *start, size_t size); -#endif /* __ASSEMBLY */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/powerpc/include/asm/kdump.h b/arch/powerpc/include/asm/kdump.h index fd128d1e52b3bb..802644178f4329 100644 --- a/arch/powerpc/include/asm/kdump.h +++ b/arch/powerpc/include/asm/kdump.h @@ -31,7 +31,7 @@ #endif /* CONFIG_CRASH_DUMP */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #if defined(CONFIG_CRASH_DUMP) && !defined(CONFIG_NONSTATIC_KERNEL) extern void reserve_kdump_trampoline(void); @@ -42,6 +42,6 @@ static inline void reserve_kdump_trampoline(void) { ; } static inline void setup_kdump_trampoline(void) { ; } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __PPC64_KDUMP_H */ diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 70f2f0517509e0..4bbf9f699aaaf4 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -49,7 +49,7 @@ #define KEXEC_STATE_IRQS_OFF 1 #define KEXEC_STATE_REAL_MODE 2 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include typedef void (*crash_shutdown_t)(void); @@ -210,6 +210,6 @@ static inline void reset_sprs(void) } #endif -#endif /* ! __ASSEMBLY__ */ +#endif /* ! __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_KEXEC_H */ diff --git a/arch/powerpc/include/asm/kgdb.h b/arch/powerpc/include/asm/kgdb.h index 715c18b7533463..f39531903325a6 100644 --- a/arch/powerpc/include/asm/kgdb.h +++ b/arch/powerpc/include/asm/kgdb.h @@ -21,7 +21,7 @@ #ifndef __POWERPC_KGDB_H__ #define __POWERPC_KGDB_H__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define BREAK_INSTR_SIZE 4 #define BUFMAX ((NUMREGBYTES * 2) + 512) @@ -62,6 +62,6 @@ static inline void arch_kgdb_breakpoint(void) /* CR/LR, R1, R2, R13-R31 inclusive. 
*/ #define NUMCRITREGBYTES (23 * sizeof(int)) #endif /* 32/64 */ -#endif /* !(__ASSEMBLY__) */ +#endif /* !(__ASSEMBLER__) */ #endif /* !__POWERPC_KGDB_H__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 2bb03d941e3e8a..dab63b82a8d4f3 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -6,7 +6,7 @@ #define KUAP_WRITE 2 #define KUAP_READ_WRITE (KUAP_READ | KUAP_WRITE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include static __always_inline bool kuap_is_disabled(void); @@ -28,14 +28,14 @@ static __always_inline bool kuap_is_disabled(void); #include #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifndef CONFIG_PPC_KUAP .macro kuap_check_amr gpr1, gpr2 .endm #endif -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ extern bool disable_kuep; extern bool disable_kuap; @@ -181,6 +181,6 @@ static __always_inline void prevent_current_write_to_user(void) prevent_user_access(KUAP_WRITE); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_KUAP_H_ */ diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index d68d71987d5cf3..f9af8df0907757 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -9,7 +9,7 @@ #ifndef __POWERPC_KVM_ASM_H__ #define __POWERPC_KVM_ASM_H__ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef CONFIG_64BIT #define PPC_STD(sreg, offset, areg) std sreg, (offset)(areg) #define PPC_LD(treg, offset, areg) ld treg, (offset)(areg) diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index a36797938620f0..3435fe144908f4 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -20,7 +20,7 @@ /* Maximum number of subcores per physical core */ #define MAX_SUBCORES 4 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef CONFIG_KVM_BOOK3S_HANDLER @@ -58,7 +58,7 @@ kvmppc_resume_\intno: #endif /* CONFIG_KVM_BOOK3S_HANDLER */ -#else /*__ASSEMBLY__ */ +#else /*__ASSEMBLER__ */ struct kvmppc_vcore; @@ -150,7 +150,7 @@ struct kvmppc_book3s_shadow_vcpu { #endif }; -#endif /*__ASSEMBLY__ */ +#endif /*__ASSEMBLER__ */ /* Values for kvm_state */ #define KVM_HWTHREAD_IN_KERNEL 0 diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h index 7487ef5821210f..3acf2995d364ce 100644 --- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h +++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h @@ -8,7 +8,7 @@ #include -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* * All exceptions from guest state must go through KVM @@ -64,5 +64,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) #endif .endm -#endif /*__ASSEMBLY__ */ +#endif /*__ASSEMBLER__ */ #endif /* ASM_KVM_BOOKE_HV_ASM_H */ diff --git a/arch/powerpc/include/asm/lv1call.h b/arch/powerpc/include/asm/lv1call.h index b11501b30193ba..ae70120953a85a 100644 --- a/arch/powerpc/include/asm/lv1call.h +++ b/arch/powerpc/include/asm/lv1call.h @@ -10,7 +10,7 @@ #if !defined(_ASM_POWERPC_LV1CALL_H) #define _ASM_POWERPC_LV1CALL_H -#if !defined(__ASSEMBLY__) +#if !defined(__ASSEMBLER__) #include #include @@ -211,7 +211,7 @@ {return _lv1_##name(LV1_##in##_IN_##out##_OUT_ARGS);} #endif -#endif /* !defined(__ASSEMBLY__) */ +#endif /* !defined(__ASSEMBLER__) */ /* lv1 call table */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 4182d68d9cd179..5f9c5d436e1713 100644 --- a/arch/powerpc/include/asm/mmu.h +++ 
b/arch/powerpc/include/asm/mmu.h @@ -137,7 +137,7 @@ MMU_FTR_CI_LARGE_PAGE #define MMU_FTRS_PA6T MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ MMU_FTR_CI_LARGE_PAGE | MMU_FTR_NO_SLBIE_B -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -332,7 +332,7 @@ static inline bool strict_module_rwx_enabled(void) { return IS_ENABLED(CONFIG_STRICT_MODULE_RWX) && strict_kernel_rwx_enabled(); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* The kernel use the constants below to index in the page sizes array. * The use of fixed constants for this purpose is better for performances @@ -377,7 +377,7 @@ static inline bool strict_module_rwx_enabled(void) #include #else /* CONFIG_PPC_BOOK3S_64 */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* MMU initialization */ extern void early_init_mmu(void); extern void early_init_mmu_secondary(void); @@ -388,7 +388,7 @@ static inline void mmu_early_init_devtree(void) { } static inline void pkey_early_init_devtree(void) {} extern void *abatron_pteptrs[2]; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif #if defined(CONFIG_PPC_BOOK3S_32) diff --git a/arch/powerpc/include/asm/mpc52xx.h b/arch/powerpc/include/asm/mpc52xx.h index 01ae6c351e5028..d7ffbd06797d2d 100644 --- a/arch/powerpc/include/asm/mpc52xx.h +++ b/arch/powerpc/include/asm/mpc52xx.h @@ -13,10 +13,10 @@ #ifndef __ASM_POWERPC_MPC52xx_H__ #define __ASM_POWERPC_MPC52xx_H__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #include @@ -30,7 +30,7 @@ /* Structures mapping of some unit register set */ /* ======================================================================== */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Memory Mapping Control */ struct mpc52xx_mmap_ctl { @@ -258,14 +258,14 @@ struct mpc52xx_intr { u32 per_error; /* INTR + 0x38 */ }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* ========================================================================= */ /* Prototypes for MPC52xx sysdev */ /* ========================================================================= */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct device_node; @@ -297,7 +297,7 @@ extern void __init mpc52xx_setup_pci(void); static inline void mpc52xx_setup_pci(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifdef CONFIG_PM struct mpc52xx_suspend { diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index 46bc5925e5fdc1..08486b15b20751 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -7,7 +7,7 @@ #ifdef CONFIG_PPC_KUAP -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -82,7 +82,7 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) return !((regs->kuap ^ MD_APG_KUAP) & 0xff000000); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* CONFIG_PPC_KUAP */ diff --git a/arch/powerpc/include/asm/nohash/32/mmu-44x.h b/arch/powerpc/include/asm/nohash/32/mmu-44x.h index 2d92a39d8f2e80..c3d19219432440 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-44x.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-44x.h @@ -100,7 +100,7 @@ #define PPC47x_TLB2_S_RW (PPC47x_TLB2_SW | PPC47x_TLB2_SR) #define PPC47x_TLB2_IMG (PPC47x_TLB2_I | PPC47x_TLB2_M | PPC47x_TLB2_G) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern unsigned int tlb_44x_hwater; extern unsigned int tlb_44x_index; @@ -114,7 +114,7 @@ typedef struct { /* patch sites */ 
extern s32 patch__tlb_44x_hwater_D, patch__tlb_44x_hwater_I; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifndef CONFIG_PPC_EARLY_DEBUG_44x #define PPC44x_EARLY_TLBS 1 diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 2986f9ba40b88b..f19115db8072fd 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -174,7 +174,7 @@ #define MODULES_SIZE (CONFIG_MODULES_SIZE * SZ_1M) #define MODULES_VADDR (MODULES_END - MODULES_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -265,6 +265,6 @@ static inline int arch_vmap_pte_supported_shift(unsigned long size) extern s32 patch__itlbmiss_exit_1, patch__dtlbmiss_exit_1; extern s32 patch__itlbmiss_perf, patch__dtlbmiss_perf; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_MMU_8XX_H_ */ diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index b481738c4bb520..2d71e4b7cd09c1 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -4,12 +4,12 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include /* For sub-arch specific PPC_PIN_SIZE */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define PTE_INDEX_SIZE PTE_SHIFT #define PMD_INDEX_SIZE 0 @@ -19,14 +19,14 @@ #define PMD_CACHE_INDEX PMD_INDEX_SIZE #define PUD_CACHE_INDEX PUD_INDEX_SIZE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) #define PMD_TABLE_SIZE 0 #define PUD_TABLE_SIZE 0 #define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) #define PMD_MASKED_BITS (PTE_TABLE_SIZE - 1) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) @@ -149,7 +149,7 @@ #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD) @@ -199,6 +199,6 @@ static inline void pmd_clear(pmd_t *pmdp) /* We borrow LSB 2 to store the exclusive marker in swap PTEs. 
*/ #define _PAGE_SWP_EXCLUSIVE 0x000004 -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_POWERPC_NOHASH_32_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 54ebb91dbdcf37..e2ea8ba9f8caeb 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -83,7 +83,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline pte_t pte_wrprotect(pte_t pte) { return __pte(pte_val(pte) | _PAGE_RO); diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h index 10f5cf444d72a8..fb6fa1d4e0749a 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h @@ -14,12 +14,12 @@ #define PUD_INDEX_SIZE 9 #define PGD_INDEX_SIZE 9 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) #define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) #define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) #define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) @@ -57,7 +57,7 @@ #define p4d_bad(p4d) (p4d_val(p4d) == 0) #define p4d_present(p4d) (p4d_val(p4d) != 0) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline pud_t *p4d_pgtable(p4d_t p4d) { @@ -80,7 +80,7 @@ static inline p4d_t pte_p4d(pte_t pte) } extern struct page *p4d_page(p4d_t p4d); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #define pud_ERROR(e) \ pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e)) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 2202c78730e8eb..2deb955b7bc89e 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -77,7 +77,7 @@ #define H_PAGE_4K_PFN 0 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* pte_clear moved to later in this file */ #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) @@ -209,6 +209,6 @@ void __patch_exception(int exc, unsigned long addr); __patch_exception((exc), (unsigned long)&name); \ } while (0) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/nohash/kup-booke.h b/arch/powerpc/include/asm/nohash/kup-booke.h index 0c7c3258134c56..d6bbb6d78bbe43 100644 --- a/arch/powerpc/include/asm/nohash/kup-booke.h +++ b/arch/powerpc/include/asm/nohash/kup-booke.h @@ -7,7 +7,7 @@ #ifdef CONFIG_PPC_KUAP -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro kuap_check_amr gpr1, gpr2 .endm @@ -105,7 +105,7 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) return !regs->kuap; } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* CONFIG_PPC_KUAP */ diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h b/arch/powerpc/include/asm/nohash/mmu-e500.h index b281d9eeaf1e65..2fad5ff426a0a4 100644 --- a/arch/powerpc/include/asm/nohash/mmu-e500.h +++ b/arch/powerpc/include/asm/nohash/mmu-e500.h @@ -230,7 +230,7 @@ #define MAS2_M_IF_NEEDED 0 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include extern unsigned int tlbcam_index; @@ -318,6 +318,6 @@ extern int book3e_htw_mode; #include DECLARE_PER_CPU(int, next_tlbcam_idx); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_MMU_BOOK3E_H_ */ diff 
--git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 7d6b9e5b286ef9..5af168b7f29241 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_NOHASH_PGTABLE_H #define _ASM_POWERPC_NOHASH_PGTABLE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, unsigned long clr, unsigned long set, int huge); #endif @@ -27,7 +27,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern int icache_44x_need_flush; @@ -373,5 +373,5 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); void unmap_kernel_page(unsigned long va); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index cb78392494da0c..b61efc3ee9040c 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -86,7 +86,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline pte_t pte_mkexec(pte_t pte) { return __pte((pte_val(pte) & ~_PAGE_BAP_SX) | _PAGE_BAP_UX); @@ -134,7 +134,7 @@ static inline unsigned long pud_leaf_size(pud_t pud) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_PTE_E500_H */ diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 8c9d4b26bf579e..d3eaa342579708 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -246,7 +246,7 @@ #define OPAL_CONFIG_IDLE_UNDO 0 #define OPAL_CONFIG_IDLE_APPLY 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Other enums */ enum OpalFreezeState { @@ -1183,6 +1183,6 @@ struct opal_mpipl_fadump { struct opal_mpipl_region region[]; } __packed; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __OPAL_API_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index af304e6cb486c5..0a398265ba04e5 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -10,7 +10,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -390,6 +390,6 @@ void opal_powercap_init(void); void opal_psr_init(void); void opal_sensor_groups_init(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_OPAL_H */ diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index af9a2628d1df05..b28fbb1d57eb90 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -6,7 +6,7 @@ * Copyright (C) 2001,2005 IBM Corporation. 
*/ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -23,7 +23,7 @@ */ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifndef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT PAGE_SHIFT #elif defined(CONFIG_PPC_BOOK3S_64) @@ -75,7 +75,7 @@ extern unsigned int hpage_shift; #define LOAD_OFFSET ASM_CONST((CONFIG_KERNEL_START-CONFIG_PHYSICAL_START)) #if defined(CONFIG_NONSTATIC_KERNEL) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern phys_addr_t memstart_addr; extern phys_addr_t kernstart_addr; @@ -84,7 +84,7 @@ extern phys_addr_t kernstart_addr; extern long long virt_phys_offset; #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define PHYSICAL_START kernstart_addr #else /* !CONFIG_NONSTATIC_KERNEL */ @@ -216,7 +216,7 @@ extern long long virt_phys_offset; #endif #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline unsigned long virt_to_pfn(const void *kaddr) { return __pa(kaddr) >> PAGE_SHIFT; @@ -261,7 +261,7 @@ static inline const void *pfn_to_kaddr(unsigned long pfn) #define is_kernel_addr(x) ((x) >= TASK_SIZE) #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC_BOOK3S_64 #include @@ -290,6 +290,6 @@ static inline unsigned long kaslr_offset(void) } #include -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_PAGE_H */ diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h index b9ac9e3a771cbb..25482405a8111d 100644 --- a/arch/powerpc/include/asm/page_32.h +++ b/arch/powerpc/include/asm/page_32.h @@ -19,7 +19,7 @@ #define PTE_SHIFT (PAGE_SHIFT - PTE_T_LOG2) /* full page */ #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * The basic type of a PTE - 64 bits for those CPUs with > 32 bit * physical addressing. @@ -53,6 +53,6 @@ extern void copy_page(void *to, void *from); #define PGD_T_LOG2 (__builtin_ffs(sizeof(pgd_t)) - 1) #define PTE_T_LOG2 (__builtin_ffs(sizeof(pte_t)) - 1) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_PAGE_32_H */ diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 79a9b7c6a132ca..0f564a06bf6843 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -35,7 +35,7 @@ #define ESID_MASK_1T 0xffffff0000000000UL #define GET_ESID_1T(x) (((x) >> SID_SHIFT_1T) & SID_MASK_1T) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include typedef unsigned long pte_basic_t; @@ -82,7 +82,7 @@ extern void copy_page(void *to, void *from); /* Log 2 of page table size */ extern u64 ppc64_pft_size; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define VM_DATA_DEFAULT_FLAGS \ (is_32bit_task() ? 
\ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 93d77ad5a92fa4..fd29fb289d1eb2 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_PGTABLE_H #define _ASM_POWERPC_PGTABLE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include /* For TASK_SIZE */ @@ -12,7 +12,7 @@ struct mm_struct; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifdef CONFIG_PPC_BOOK3S #include @@ -39,7 +39,7 @@ struct mm_struct; #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define PFN_PTE_SHIFT PTE_RPN_SHIFT @@ -214,6 +214,6 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) #endif /* CONFIG_PPC64 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index b891910fce8a69..46947c82a7127d 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -12,7 +12,7 @@ #include #include -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define SZL (BITS_PER_LONG/8) @@ -868,7 +868,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #endif /* !CONFIG_PPC_BOOK3E_64 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define SOFT_MASK_TABLE(_start, _end) \ stringify_in_c(.section __soft_mask_table,"a";)\ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 6b94de17201c76..f156bdb43e2be1 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -29,14 +29,14 @@ #ifdef CONFIG_PPC64 /* Default SMT priority is set to 3. Use 11- 13bits to save priority. 
*/ #define PPR_PRIORITY 3 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define DEFAULT_PPR (PPR_PRIORITY << 50) #else #define DEFAULT_PPR ((u64)PPR_PRIORITY << 50) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_PPC64 */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -460,5 +460,5 @@ int enter_vmx_ops(void); void *exit_vmx_ops(void *dest); #endif /* __KERNEL__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_PROCESSOR_H */ diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 7b9350756875a7..94aa1de2b06e19 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -24,7 +24,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct pt_regs { union { @@ -165,7 +165,7 @@ struct pt_regs #define STACK_INT_FRAME_SIZE (KERNEL_REDZONE_SIZE + STACK_USER_INT_FRAME_SIZE) #define STACK_INT_FRAME_MARKER_LONGS (STACK_INT_FRAME_MARKER/sizeof(long)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #ifdef CONFIG_SMP @@ -414,7 +414,7 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, unsig return 0; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifndef __powerpc64__ /* We need PT_SOFTE defined at all time to avoid #ifdefs */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 0228c90bbcc7bf..3fe1866354323b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -60,7 +60,7 @@ #define MSR_RI_LG 1 /* Recoverable Exception */ #define MSR_LE_LG 0 /* Little Endian */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __MASK(X) (1<<(X)) #else #define __MASK(X) (1UL<<(X)) @@ -1358,7 +1358,7 @@ #define PVR_ARCH_31_P11 0x0f000007 /* Macros for setting and retrieving special purpose registers */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #if defined(CONFIG_PPC64) || defined(__CHECKER__) typedef struct { @@ -1450,6 +1450,6 @@ extern void scom970_write(unsigned int address, unsigned long value); struct pt_regs; extern void ppc_save_regs(struct pt_regs *regs); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_REG_H */ diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 656bfaf91526e0..56f9d3b1de859f 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -576,7 +576,7 @@ #define TEN_THREAD(x) (1 << (x)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define mftmr(rn) ({unsigned long rval; \ asm volatile(MFTMR(rn, %0) : "=r" (rval)); rval;}) #define mttmr(rn, v) asm volatile(MTTMR(rn, %0) : \ @@ -585,7 +585,7 @@ extern unsigned long global_dbcr0[]; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_POWERPC_REG_BOOKE_H__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/reg_fsl_emb.h b/arch/powerpc/include/asm/reg_fsl_emb.h index 9893d2001b6801..ec459c3d9498a1 100644 --- a/arch/powerpc/include/asm/reg_fsl_emb.h +++ b/arch/powerpc/include/asm/reg_fsl_emb.h @@ -9,7 +9,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Performance Monitor Registers */ static __always_inline unsigned int mfpmr(unsigned int rn) { @@ -32,7 +32,7 @@ static __always_inline void mtpmr(unsigned int rn, unsigned int val) ".machine pop;" : [val] "=r" (val) : [rn] "i" (rn)); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* Freescale Book E Performance Monitor APU Registers */ #define PMRN_PMC0 0x010 
/* Performance Monitor Counter 0 */ diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index eed74c1fb832fc..50a92b24628daf 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -4,7 +4,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void ppc_printk_progress(char *s, unsigned short hex); extern unsigned long long memory_limit; @@ -89,7 +89,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, extern struct seq_buf ppc_hw_desc; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_SETUP_H */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index b77927ccb0ab00..e41b9ea42122be 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -18,7 +18,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC64 #include @@ -266,7 +266,7 @@ extern char __secondary_hold; extern unsigned int booting_thread_hwid; extern void __early_start(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_SMP_H) */ diff --git a/arch/powerpc/include/asm/spu_csa.h b/arch/powerpc/include/asm/spu_csa.h index c33df961c04549..1b3271a033928a 100644 --- a/arch/powerpc/include/asm/spu_csa.h +++ b/arch/powerpc/include/asm/spu_csa.h @@ -43,7 +43,7 @@ #define SPU_DECR_STATUS_RUNNING 0x1 #define SPU_DECR_STATUS_WRAPPED 0x2 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /** * spu_reg128 - generic 128-bit register definition. */ @@ -243,5 +243,5 @@ struct spu_state { #endif /* !__SPU__ */ #endif /* __KERNEL__ */ -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _SPU_CSA_H_ */ diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h index b0b4c64870d77c..0d3ccb34adfb2e 100644 --- a/arch/powerpc/include/asm/synch.h +++ b/arch/powerpc/include/asm/synch.h @@ -7,7 +7,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup; extern void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end); @@ -40,7 +40,7 @@ static inline void ppc_after_tlbiel_barrier(void) */ asm volatile(ASM_FTR_IFSET(PPC_CP_ABORT, "", %0) : : "i" (CPU_FTR_ARCH_31) : "memory"); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #if defined(__powerpc64__) # define LWSYNC lwsync diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 2785c7462ebf7b..b0f200aba2b3df 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -41,7 +41,7 @@ #define THREAD_ALIGN (1 << THREAD_ALIGN_SHIFT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -89,7 +89,7 @@ extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * thread information flag bit numbers @@ -162,7 +162,7 @@ void arch_setup_new_exec(void); #define _TLF_LAZY_MMU (1 << TLF_LAZY_MMU) #define _TLF_RUNLATCH (1 << TLF_RUNLATCH) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline void clear_thread_local_flags(unsigned int flags) { @@ -233,7 +233,7 @@ static inline int arch_within_stack_frames(const void * const stack, extern void *emergency_ctx[]; #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ diff --git 
a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h index e94f6db5e367b3..d700affba4480a 100644 --- a/arch/powerpc/include/asm/tm.h +++ b/arch/powerpc/include/asm/tm.h @@ -8,7 +8,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void tm_reclaim(struct thread_struct *thread, uint8_t cause); @@ -19,4 +19,4 @@ extern void tm_restore_sprs(struct thread_struct *thread); extern bool tm_suspend_disabled; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h index 93157a661dcc75..55d7ba6d910bdb 100644 --- a/arch/powerpc/include/asm/types.h +++ b/arch/powerpc/include/asm/types.h @@ -11,10 +11,10 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef __vector128 vector128; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_TYPES_H */ diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 027ef94a12fbdd..b873fbb6d712f7 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -9,7 +9,7 @@ #define NR_syscalls __NR_syscalls -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -52,5 +52,5 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_UNISTD_H_ */ diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h index 1ca23fbfe087ae..07af3257607254 100644 --- a/arch/powerpc/include/asm/vdso.h +++ b/arch/powerpc/include/asm/vdso.h @@ -5,7 +5,7 @@ #define VDSO_VERSION_STRING LINUX_2.6.15 #define __VDSO_PAGES 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC64 #include @@ -21,7 +21,7 @@ int vdso_getcpu_init(void); -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #ifdef __VDSO64__ #define V_FUNCTION_BEGIN(name) \ @@ -49,6 +49,6 @@ int vdso_getcpu_init(void); #endif /* __VDSO32__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_VDSO_H */ diff --git a/arch/powerpc/include/asm/vdso/getrandom.h b/arch/powerpc/include/asm/vdso/getrandom.h index 067a5396aac6e9..4c24976061f495 100644 --- a/arch/powerpc/include/asm/vdso/getrandom.h +++ b/arch/powerpc/include/asm/vdso/getrandom.h @@ -5,7 +5,7 @@ #ifndef _ASM_POWERPC_VDSO_GETRANDOM_H #define _ASM_POWERPC_VDSO_GETRANDOM_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -62,6 +62,6 @@ static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(vo ssize_t __c_kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_VDSO_GETRANDOM_H */ diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index 99c9d6f43fde2e..ab3df12c8d947e 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_VDSO_GETTIMEOFDAY_H #define _ASM_POWERPC_VDSO_GETTIMEOFDAY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -141,6 +141,6 @@ int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz __kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_time_data *vd); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/powerpc/include/asm/vdso/processor.h b/arch/powerpc/include/asm/vdso/processor.h index 
80d13207c5688d..c1f3d7aaf3ee97 100644 --- a/arch/powerpc/include/asm/vdso/processor.h +++ b/arch/powerpc/include/asm/vdso/processor.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_VDSO_PROCESSOR_H #define _ASM_POWERPC_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Macros for adjusting thread priority (hardware multi-threading) */ #ifdef CONFIG_PPC64 @@ -33,6 +33,6 @@ #define cpu_relax() barrier() #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_VDSO_PROCESSOR_H */ diff --git a/arch/powerpc/include/asm/vdso/vsyscall.h b/arch/powerpc/include/asm/vdso/vsyscall.h index c2c9ae1b22e71a..bee18e8660a027 100644 --- a/arch/powerpc/include/asm/vdso/vsyscall.h +++ b/arch/powerpc/include/asm/vdso/vsyscall.h @@ -2,13 +2,13 @@ #ifndef _ASM_POWERPC_VDSO_VSYSCALL_H #define _ASM_POWERPC_VDSO_VSYSCALL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /* The asm-generic header needs to be included after the definitions above */ #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_VDSO_VSYSCALL_H */ diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h index 95d45a50355d26..441264af0e3669 100644 --- a/arch/powerpc/include/asm/vdso_datapage.h +++ b/arch/powerpc/include/asm/vdso_datapage.h @@ -9,11 +9,11 @@ * IBM Corp. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ .macro get_datapage ptr symbol bcl 20, 31, .+4 @@ -23,7 +23,7 @@ addi \ptr, \ptr, (\symbol - 999b)@l .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _SYSTEMCFG_H */ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 0b5c1993809eb0..75471fb6fb1014 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -7,7 +7,7 @@ #include #include /* for THREAD_SHIFT */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* * Macros used for common Book-e exception handling @@ -522,5 +522,5 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) bl kernel_fp_unavailable_exception; \ b interrupt_return -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __HEAD_BOOKE_H__ */ diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 4c26912c2e3c36..64c7e9a576fb06 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -8,7 +8,7 @@ #ifndef _BPF_JIT_H #define _BPF_JIT_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h index 77feee8436d48c..413fd85d9bc285 100644 --- a/arch/powerpc/platforms/powernv/subcore.h +++ b/arch/powerpc/platforms/powernv/subcore.h @@ -9,7 +9,7 @@ #define SYNC_STEP_REAL_MODE 2 /* Set by secondary when in real mode */ #define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_SMP void split_core_secondary_loop(u8 *state); @@ -18,4 +18,4 @@ extern void update_subcore_sibling_mask(void); static inline void update_subcore_sibling_mask(void) { } #endif /* CONFIG_SMP */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ diff --git a/arch/powerpc/xmon/xmon_bpts.h b/arch/powerpc/xmon/xmon_bpts.h index 377068f52edb9a..e14e4fb862e0c0 100644 --- a/arch/powerpc/xmon/xmon_bpts.h +++ b/arch/powerpc/xmon/xmon_bpts.h @@ -3,12 +3,12 @@ #define XMON_BPTS_H #define NBPTS 256 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #define BPT_SIZE 
(sizeof(ppc_inst_t) * 2) #define BPT_WORDS (BPT_SIZE / sizeof(ppc_inst_t)) extern unsigned int bpt_table[NBPTS * BPT_WORDS]; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* XMON_BPTS_H */ diff --git a/tools/testing/selftests/powerpc/include/instructions.h b/tools/testing/selftests/powerpc/include/instructions.h index 4efa6314bd9630..864f0c9f1afcb0 100644 --- a/tools/testing/selftests/powerpc/include/instructions.h +++ b/tools/testing/selftests/powerpc/include/instructions.h @@ -67,7 +67,7 @@ static inline int paste_last(void *i) #define PPC_INST_PASTE_LAST __PASTE(0, 0, 1, 1) /* This defines the prefixed load/store instructions */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define stringify_in_c(...) __VA_ARGS__ #else # define __stringify_in_c(...) #__VA_ARGS__ From 4f61d54d2245c15b23ad78a89f854fb2496b6216 Mon Sep 17 00:00:00 2001 From: "Nysal Jan K.A." Date: Thu, 31 Jul 2025 11:48:53 +0530 Subject: [PATCH 0637/3206] powerpc/qspinlock: Add spinlock contention tracepoint Add a lock contention tracepoint in the queued spinlock slowpath. Also add the __lockfunc annotation so that in_lock_functions() works as expected. Signed-off-by: Nysal Jan K.A. Reviewed-by: Shrikanth Hegde Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250731061856.1858898-1-nysal@linux.ibm.com --- arch/powerpc/lib/qspinlock.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index bcc7e4dff8c305..95ab4cdf582ebd 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -9,6 +9,7 @@ #include #include #include +#include #define MAX_NODES 4 @@ -708,26 +709,26 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b qnodesp->count--; } -void queued_spin_lock_slowpath(struct qspinlock *lock) +void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock) { + trace_contention_begin(lock, LCB_F_SPIN); /* * This looks funny, but it induces the compiler to inline both * sides of the branch rather than share code as when the condition * is passed as the paravirt argument to the functions. */ if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) { - if (try_to_steal_lock(lock, true)) { + if (try_to_steal_lock(lock, true)) spec_barrier(); - return; - } - queued_spin_lock_mcs_queue(lock, true); + else + queued_spin_lock_mcs_queue(lock, true); } else { - if (try_to_steal_lock(lock, false)) { + if (try_to_steal_lock(lock, false)) spec_barrier(); - return; - } - queued_spin_lock_mcs_queue(lock, false); + else + queued_spin_lock_mcs_queue(lock, false); } + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_spin_lock_slowpath); From 3443ff3be6e59b80d74036bb39f5b6409eb23cc9 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 4 Aug 2025 12:07:27 +0200 Subject: [PATCH 0638/3206] powerpc/pseries/msi: Fix potential underflow and leak issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pseries_irq_domain_alloc() allocates interrupts at the parent's interrupt domain. If it fails partway through, all interrupts allocated so far are freed. The number of interrupts successfully allocated so far is stored in "i". However, only "i - 1" interrupts are freed. This is broken: - One interrupt is not freed - If "i" is zero, "i - 1" wraps around Correct the number of freed interrupts to 'i'.
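The fix follows the standard partial-unwind idiom: an allocation loop that completed "i" iterations before failing must release exactly "i" entries. A minimal stand-alone sketch of the idiom (alloc_one() and free_range() are hypothetical stand-ins for the parent-domain calls, not kernel APIs):

#include <stdio.h>

#define NR_IRQS 8

/* hypothetical allocator that fails at index 5 */
static int alloc_one(int idx) { return idx == 5 ? -1 : 0; }

/* hypothetical bulk release of 'count' entries starting at 'base' */
static void free_range(int base, int count)
{
	printf("freeing %d entries from %d\n", count, base);
}

int main(void)
{
	int i, ret = 0;

	for (i = 0; i < NR_IRQS; i++) {
		ret = alloc_one(i);
		if (ret)
			goto out;
	}
	return 0;
out:
	/* Indices 0 .. i - 1 succeeded, i.e. 'i' entries: free exactly 'i'. */
	/* Passing 'i - 1' leaks one entry and wraps around when i == 0. */
	free_range(0, i);
	return ret;
}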
Fixes: a5f3d2c17b07 ("powerpc/pseries/pci: Add MSI domains") Signed-off-by: Nam Cao Cc: stable@vger.kernel.org Reviewed-by: Cédric Le Goater Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/a980067f2b256bf716b4cd713bc1095966eed8cd.1754300646.git.namcao@linutronix.de --- arch/powerpc/platforms/pseries/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index ee1c8c6898a3c7..9dc294de631f1d 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -593,7 +593,7 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq out: /* TODO: handle RTAS cleanup in ->msi_finish() ? */ - irq_domain_free_irqs_parent(domain, virq, i - 1); + irq_domain_free_irqs_parent(domain, virq, i); return ret; } From a39087905af9ffecaa237a918a2c03a04e479934 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 4 Aug 2025 12:07:28 +0200 Subject: [PATCH 0639/3206] powerpc/powernv/pci: Fix underflow and leak issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pnv_irq_domain_alloc() allocates interrupts at the parent's interrupt domain. If it fails partway through, all interrupts allocated so far are freed. The number of interrupts successfully allocated so far is stored in "i". However, only "i - 1" interrupts are freed. This is broken: - One interrupt is not freed - If "i" is zero, "i - 1" wraps around Correct the number of freed interrupts to "i". Fixes: 0fcfe2247e75 ("powerpc/powernv/pci: Add MSI domains") Signed-off-by: Nam Cao Cc: stable@vger.kernel.org Reviewed-by: Cédric Le Goater Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/70f8debe8688e0b467367db769b71c20146a836d.1754300646.git.namcao@linutronix.de --- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d8ccf2c9b98ad0..0166bf39ce1e52 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1854,7 +1854,7 @@ static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, return 0; out: - irq_domain_free_irqs_parent(domain, virq, i - 1); + irq_domain_free_irqs_parent(domain, virq, i); msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs); return ret; } From 3d9c9e1da9886d91c24d97ca20c854c29212b736 Mon Sep 17 00:00:00 2001 From: Ruben Wauters Date: Sat, 19 Jul 2025 23:43:10 +0100 Subject: [PATCH 0640/3206] powerpc/xmon: replace sizeof calculations with ARRAY_SIZE macro The calculations for the operand/opcode/macro counts are done in a manner identical to the existing ARRAY_SIZE macro in linux/array_size.h. This patch replaces the sizeof calculations with the macro, making the code cleaner and its intent immediately obvious.
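For reference, the open-coded division and the macro compute the same value; a minimal stand-alone comparison (note that the in-kernel ARRAY_SIZE additionally rejects non-array arguments via __must_be_array(), which the plain division below does not):

#include <stdio.h>

/* simplified form of the macro from linux/array_size.h */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

static const unsigned int opcodes[] = { 0x10, 0x20, 0x30 };

int main(void)
{
	/* same result, but the macro states the intent directly */
	printf("%zu\n", sizeof(opcodes) / sizeof(opcodes[0]));	/* 3 */
	printf("%zu\n", ARRAY_SIZE(opcodes));			/* 3 */
	return 0;
}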
Signed-off-by: Ruben Wauters Reviewed-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250719225225.2132-2-rubenru09@aol.com --- arch/powerpc/xmon/ppc-opc.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c index 0774d711453efa..de9b4236728c4e 100644 --- a/arch/powerpc/xmon/ppc-opc.c +++ b/arch/powerpc/xmon/ppc-opc.c @@ -954,8 +954,7 @@ const struct powerpc_operand powerpc_operands[] = { 0xff, 11, NULL, NULL, PPC_OPERAND_SIGNOPT }, }; -const unsigned int num_powerpc_operands = (sizeof (powerpc_operands) - / sizeof (powerpc_operands[0])); +const unsigned int num_powerpc_operands = ARRAY_SIZE(powerpc_operands); /* The functions used to insert and extract complicated operands. */ @@ -6968,9 +6967,8 @@ const struct powerpc_opcode powerpc_opcodes[] = { {"fcfidu.", XRC(63,974,1), XRA_MASK, POWER7|PPCA2, PPCVLE, {FRT, FRB}}, }; -const int powerpc_num_opcodes = - sizeof (powerpc_opcodes) / sizeof (powerpc_opcodes[0]); - +const int powerpc_num_opcodes = ARRAY_SIZE(powerpc_opcodes); + /* The VLE opcode table. The format of this opcode table is the same as the main opcode table. */ @@ -7207,9 +7205,8 @@ const struct powerpc_opcode vle_opcodes[] = { {"se_bl", BD8(58,0,1), BD8_MASK, PPCVLE, 0, {B8}}, }; -const int vle_num_opcodes = - sizeof (vle_opcodes) / sizeof (vle_opcodes[0]); - +const int vle_num_opcodes = ARRAY_SIZE(vle_opcodes); + /* The macro table. This is only used by the assembler. */ /* The expressions of the form (-x ! 31) & (x | 31) have the value 0 @@ -7276,5 +7273,4 @@ const struct powerpc_macro powerpc_macros[] = { {"e_clrlslwi",4, PPCVLE, "e_rlwinm %0,%1,%3,(%2)-(%3),31-(%3)"}, }; -const int powerpc_num_macros = - sizeof (powerpc_macros) / sizeof (powerpc_macros[0]); +const int powerpc_num_macros = ARRAY_SIZE(powerpc_macros); From 46104a7d3ccd2acfe508e661393add0615c27a22 Mon Sep 17 00:00:00 2001 From: Kienan Stewart Date: Tue, 18 Feb 2025 15:26:39 -0500 Subject: [PATCH 0641/3206] kbuild: Add missing $(objtree) prefix to powerpc crtsavres.o artifact In the upstream commit 214c0eea43b2ea66bcd6467ea57e47ce8874191b ("kbuild: add $(objtree)/ prefix to some in-kernel build artifacts") artifacts required for building out-of-tree kernel modules had $(objtree) prepended to them to prepare for building in other directories. When building external modules for powerpc, arch/powerpc/lib/crtsavres.o is required for certain configurations. This artifact is missing the prepended $(objtree). 
Fixes: 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") Acked-by: Masahiro Yamada Reviewed-by: Nicolas Schier Tested-by: Nicolas Schier Signed-off-by: Kienan Stewart Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250218-buildfix-extmod-powerpc-v2-1-1e78fcf12b56@efficios.com --- arch/powerpc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 9753fb87217c35..a58b1029592ce2 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -58,7 +58,7 @@ ifeq ($(CONFIG_PPC64)$(CONFIG_LD_IS_BFD),yy) # There is a corresponding test in arch/powerpc/lib/Makefile KBUILD_LDFLAGS_MODULE += --save-restore-funcs else -KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o +KBUILD_LDFLAGS_MODULE += $(objtree)/arch/powerpc/lib/crtsavres.o endif ifdef CONFIG_CPU_LITTLE_ENDIAN From eef6dcbc52fa83c392a2f4a52845f347b233a584 Mon Sep 17 00:00:00 2001 From: Prathamesh Shete Date: Sat, 23 Aug 2025 11:24:19 +0530 Subject: [PATCH 0642/3206] dt-bindings: gpio: Add Tegra256 support Extend the existing Tegra186 GPIO controller device tree bindings with support for the GPIO controller found on Tegra256. The number of pins is slightly different, but the programming model remains the same. Add a new header, include/dt-bindings/gpio/tegra256-gpio.h, that defines port IDs as well as the TEGRA256_MAIN_GPIO() helper, both of which are used in conjunction to create a unique specifier for each pin. The OS can reconstruct the port ID and pin from these values to determine the register region for the corresponding GPIO. However, the OS does not use the macro definitions in this file. The symbolic names help associate these GPIO specifiers with the names used in the technical documentation available for the chip. Signed-off-by: Prathamesh Shete Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250823055420.24664-1-pshete@nvidia.com Signed-off-by: Bartosz Golaszewski --- .../bindings/gpio/nvidia,tegra186-gpio.yaml | 2 ++ include/dt-bindings/gpio/tegra256-gpio.h | 28 +++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 include/dt-bindings/gpio/tegra256-gpio.h diff --git a/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml b/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml index 065f5761a93f61..2bd620a1099b9a 100644 --- a/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml @@ -85,6 +85,7 @@ properties: - nvidia,tegra194-gpio-aon - nvidia,tegra234-gpio - nvidia,tegra234-gpio-aon + - nvidia,tegra256-gpio reg-names: items: @@ -155,6 +156,7 @@ allOf: - nvidia,tegra186-gpio - nvidia,tegra194-gpio - nvidia,tegra234-gpio + - nvidia,tegra256-gpio then: properties: interrupts: diff --git a/include/dt-bindings/gpio/tegra256-gpio.h b/include/dt-bindings/gpio/tegra256-gpio.h new file mode 100644 index 00000000000000..a0353a302aeb41 --- /dev/null +++ b/include/dt-bindings/gpio/tegra256-gpio.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. */ + +/* + * This header provides constants for the nvidia,tegra256-gpio DT binding. + * + * The first cell in Tegra's GPIO specifier is the GPIO ID. + * The macros below provide names for this. + * + * The second cell contains standard flag values specified in gpio.h.
+ */ + +#ifndef _DT_BINDINGS_GPIO_TEGRA256_GPIO_H +#define _DT_BINDINGS_GPIO_TEGRA256_GPIO_H + +#include + +/* GPIOs implemented by main GPIO controller */ +#define TEGRA256_MAIN_GPIO_PORT_A 0 +#define TEGRA256_MAIN_GPIO_PORT_B 1 +#define TEGRA256_MAIN_GPIO_PORT_C 2 +#define TEGRA256_MAIN_GPIO_PORT_D 3 + +#define TEGRA256_MAIN_GPIO(port, offset) \ + ((TEGRA256_MAIN_GPIO_PORT_##port * 8) + (offset)) + +#endif + From db12ee08726e55c8a1a70c2308f98d121d96edc6 Mon Sep 17 00:00:00 2001 From: Prathamesh Shete Date: Sat, 23 Aug 2025 11:24:20 +0530 Subject: [PATCH 0643/3206] gpio: tegra186: Add support for Tegra256 Extend the existing Tegra186 GPIO controller driver with support for the GPIO controller found on Tegra256. While the programming model remains the same, the number of pins has slightly changed. Signed-off-by: Prathamesh Shete Link: https://lore.kernel.org/r/20250823055420.24664-2-pshete@nvidia.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-tegra186.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/gpio/gpio-tegra186.c b/drivers/gpio/gpio-tegra186.c index 5fd3ec3e2c53d2..4d3db6e06eeb27 100644 --- a/drivers/gpio/gpio-tegra186.c +++ b/drivers/gpio/gpio-tegra186.c @@ -20,6 +20,7 @@ #include #include #include +#include /* security registers */ #define TEGRA186_GPIO_CTL_SCR 0x0c @@ -1279,6 +1280,30 @@ static const struct tegra_gpio_soc tegra241_aon_soc = { .has_vm_support = false, }; +#define TEGRA256_MAIN_GPIO_PORT(_name, _bank, _port, _pins) \ + [TEGRA256_MAIN_GPIO_PORT_##_name] = { \ + .name = #_name, \ + .bank = _bank, \ + .port = _port, \ + .pins = _pins, \ + } + +static const struct tegra_gpio_port tegra256_main_ports[] = { + TEGRA256_MAIN_GPIO_PORT(A, 0, 0, 8), + TEGRA256_MAIN_GPIO_PORT(B, 0, 1, 8), + TEGRA256_MAIN_GPIO_PORT(C, 0, 2, 8), + TEGRA256_MAIN_GPIO_PORT(D, 0, 3, 8), +}; + +static const struct tegra_gpio_soc tegra256_main_soc = { + .num_ports = ARRAY_SIZE(tegra256_main_ports), + .ports = tegra256_main_ports, + .name = "tegra256-gpio-main", + .instance = 1, + .num_irqs_per_bank = 8, + .has_vm_support = true, +}; + static const struct of_device_id tegra186_gpio_of_match[] = { { .compatible = "nvidia,tegra186-gpio", @@ -1298,6 +1323,9 @@ static const struct of_device_id tegra186_gpio_of_match[] = { }, { .compatible = "nvidia,tegra234-gpio-aon", .data = &tegra234_aon_soc + }, { + .compatible = "nvidia,tegra256-gpio", + .data = &tegra256_main_soc }, { /* sentinel */ } From 3712ce9fa501617cdc4466d30ae3894d50887743 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Wed, 27 Aug 2025 19:58:42 +0200 Subject: [PATCH 0644/3206] gpiolib: acpi: Ignore touchpad wakeup on GPD G1619-05 Same issue as G1619-04 in commit 805c74eac8cb ("gpiolib: acpi: Ignore touchpad wakeup on GPD G1619-04"), Strix Point lineup uses 05. Signed-off-by: Antheas Kapenekakis Reviewed-by: Mika Westerberg Reviewed-by: Mario Limonciello Signed-off-by: Andy Shevchenko --- drivers/gpio/gpiolib-acpi-quirks.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpio/gpiolib-acpi-quirks.c b/drivers/gpio/gpiolib-acpi-quirks.c index c13545dce3492d..bb63138c4b5b7d 100644 --- a/drivers/gpio/gpiolib-acpi-quirks.c +++ b/drivers/gpio/gpiolib-acpi-quirks.c @@ -317,6 +317,18 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = { .ignore_wake = "PNP0C50:00@8", }, }, + { + /* + * Same as G1619-04. New model. 
+ */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1619-05"), + }, + .driver_data = &(struct acpi_gpiolib_dmi_quirk) { + .ignore_wake = "PNP0C50:00@8", + }, + }, { /* * Spurious wakeups from GPIO 11 From 5c5a41a75452e9eb5810a7d0d9916b61fe9fd1c5 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Fri, 29 Aug 2025 00:39:40 +0200 Subject: [PATCH 0645/3206] gpu: nova-core: depend on CONFIG_64BIT If built on architectures with CONFIG_ARCH_DMA_ADDR_T_64BIT=n nova-core produces the following build failures: error[E0308]: mismatched types --> drivers/gpu/nova-core/fb.rs:49:59 | 49 | hal::fb_hal(chipset).write_sysmem_flush_page(bar, page.dma_handle())?; | ----------------------- ^^^^^^^^^^^^^^^^^ expected `u64`, found `u32` | | | arguments to this method are incorrect | note: method defined here --> drivers/gpu/nova-core/fb/hal.rs:19:8 | 19 | fn write_sysmem_flush_page(&self, bar: &Bar0, addr: u64) -> Result; | ^^^^^^^^^^^^^^^^^^^^^^^ help: you can convert a `u32` to a `u64` | 49 | hal::fb_hal(chipset).write_sysmem_flush_page(bar, page.dma_handle().into())?; | +++++++ error[E0308]: mismatched types --> drivers/gpu/nova-core/fb.rs:65:47 | 65 | if hal.read_sysmem_flush_page(bar) == self.page.dma_handle() { | ------------------------------- ^^^^^^^^^^^^^^^^^^^^^^ expected `u64`, found `u32` | | | expected because this is `u64` | help: you can convert a `u32` to a `u64` | 65 | if hal.read_sysmem_flush_page(bar) == self.page.dma_handle().into() { | +++++++ error: this arithmetic operation will overflow --> drivers/gpu/nova-core/falcon.rs:469:23 | 469 | .set_base((dma_start >> 40) as u16) | ^^^^^^^^^^^^^^^^^ attempt to shift right by `40_i32`, which would overflow | = note: `#[deny(arithmetic_overflow)]` on by default This is because the code assumes that dma_addr_t is 64 bits wide. While this could technically be handled, it is rather painful to deal with, as the following example illustrates: pub(super) fn read_sysmem_flush_page_ga100(bar: &Bar0) -> DmaAddress { let addr = u64::from(regs::NV_PFB_NISO_FLUSH_SYSMEM_ADDR::read(bar).adr_39_08()) << FLUSH_SYSMEM_ADDR_SHIFT | u64::from(regs::NV_PFB_NISO_FLUSH_SYSMEM_ADDR_HI::read(bar).adr_63_40()) << FLUSH_SYSMEM_ADDR_SHIFT_HI; addr.try_into().unwrap_or_else(|_| { kernel::warn_on!(true); 0 }) } At the same time, there is not much value in nova-core supporting 32-bit, given that the supported GPU architectures are Turing and later. Hence, depend on CONFIG_64BIT.
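The broken width assumption can also be illustrated in plain C; a stand-alone sketch (not nova-core code, values illustrative) of why extracting bits above bit 40 only makes sense when the DMA address type is 64 bits wide:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* with CONFIG_ARCH_DMA_ADDR_T_64BIT=y, dma_addr_t is 64 bits wide */
	uint64_t dma_start = 0x123456789abcULL;

	/* extracting bits 40+ as the base works for a 64-bit type ... */
	uint16_t base = (uint16_t)(dma_start >> 40);

	/* ... but on a 32-bit dma_addr_t the same '>> 40' shifts past the
	 * width of the type, which is the overflow rustc reports above */
	printf("base = 0x%x\n", base);
	return 0;
}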
Cc: John Hubbard Reported-by: Miguel Ojeda Closes: https://lore.kernel.org/lkml/20250828160247.37492-1-ojeda@kernel.org/ Fixes: 6554ad65b589 ("gpu: nova-core: register sysmem flush page") Fixes: 69f5cd67ce41 ("gpu: nova-core: add falcon register definitions and base code") Reviewed-by: Alexandre Courbot Reviewed-by: John Hubbard Link: https://lore.kernel.org/r/20250828223954.351348-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- drivers/gpu/nova-core/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/nova-core/Kconfig b/drivers/gpu/nova-core/Kconfig index 8726d80d6ba401..20d3e6d0d796ef 100644 --- a/drivers/gpu/nova-core/Kconfig +++ b/drivers/gpu/nova-core/Kconfig @@ -1,5 +1,6 @@ config NOVA_CORE tristate "Nova Core GPU driver" + depends on 64BIT depends on PCI depends on RUST depends on RUST_FW_LOADER_ABSTRACTIONS From 45e2cef568cdf87cb06c9783b45c8f08d1ab1cec Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:32:41 +0530 Subject: [PATCH 0646/3206] x86/apic: Initialize APIC ID for Secure AVIC Initialize the APIC ID in the Secure AVIC APIC backing page with the APIC_ID MSR value read from the hypervisor. CPU topology evaluation later during boot will catch and report any duplicate APIC ID for two CPUs. Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828110255.208779-2-Neeraj.Upadhyay@amd.com --- arch/x86/kernel/apic/x2apic_savic.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 5479605429c1a8..56c51ea4e5ab7b 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -150,6 +150,12 @@ static void savic_setup(void) enum es_result res; unsigned long gpa; + /* + * Before Secure AVIC is enabled, APIC MSR reads are intercepted. + * APIC_ID MSR read returns the value from the hypervisor. + */ + apic_set_reg(ap, APIC_ID, native_apic_msr_read(APIC_ID)); + gpa = __pa(ap); /* From 60791ef3751cb0ceccd6f5ac98276153745c7980 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:32:42 +0530 Subject: [PATCH 0647/3206] x86/apic: Add update_vector() callback for APIC drivers Add an update_vector() callback to allow APIC drivers to perform driver-specific operations on external vector allocation/teardown on a CPU. This callback will be used by the Secure AVIC APIC driver to configure the vectors which a guest vCPU allows the hypervisor to send to it. As system vectors have fixed vector assignments and are not dynamically allocated, add an apic_update_vector() public API to facilitate update_vector() callback invocation for them. This will be used for Secure AVIC-enabled guests to allow the hypervisor to inject system vectors which are emulated by the hypervisor such as APIC timer vector and HYPERVISOR_CALLBACK_VECTOR. While at it, clean up a line break in apic_update_irq_cfg().
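The callback is optional, so drivers that do not provide it are unaffected. A reduced stand-alone model of the pattern (driver name and vector number below are illustrative, not from the patch):

#include <stdbool.h>
#include <stdio.h>

/* reduced model: the APIC driver structure carries an optional hook */
struct apic {
	void (*update_vector)(unsigned int cpu, unsigned int vector, bool set);
};

static void mydrv_update_vector(unsigned int cpu, unsigned int vector, bool set)
{
	printf("cpu %u: vector %u %s\n", cpu, vector, set ? "allowed" : "blocked");
}

static struct apic mydrv_apic = { .update_vector = mydrv_update_vector };
static struct apic *apic = &mydrv_apic;

/* mirrors apic_update_vector(): a driver without the hook makes this a no-op */
static void apic_update_vector(unsigned int cpu, unsigned int vector, bool set)
{
	if (apic->update_vector)
		apic->update_vector(cpu, vector, set);
}

int main(void)
{
	apic_update_vector(0, 0xec, true);	/* e.g. allocate the timer vector */
	apic_update_vector(0, 0xec, false);	/* ... and tear it down again */
	return 0;
}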
Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828110255.208779-3-Neeraj.Upadhyay@amd.com --- arch/x86/include/asm/apic.h | 9 +++++++++ arch/x86/kernel/apic/vector.c | 28 +++++++++++++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 44b4080721a625..0683318470beb7 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -318,6 +318,8 @@ struct apic { /* wakeup secondary CPU using 64-bit wakeup point */ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); + void (*update_vector)(unsigned int cpu, unsigned int vector, bool set); + char *name; }; @@ -471,6 +473,12 @@ static __always_inline bool apic_id_valid(u32 apic_id) return apic_id <= apic->max_apic_id; } +static __always_inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + if (apic->update_vector) + apic->update_vector(cpu, vector, set); +} + #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } @@ -482,6 +490,7 @@ static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } static inline void apic_setup_apic_calls(void) { } +static inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) { } #define apic_update_callback(_callback, _fn) do { } while (0) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a947b46a8b642b..bddc544653999a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -134,13 +134,20 @@ static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, apicd->hw_irq_cfg.vector = vector; apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + + apic_update_vector(cpu, vector, true); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); - trace_vector_config(irqd->irq, vector, cpu, - apicd->hw_irq_cfg.dest_apicid); + trace_vector_config(irqd->irq, vector, cpu, apicd->hw_irq_cfg.dest_apicid); } -static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, - unsigned int newcpu) +static void apic_free_vector(unsigned int cpu, unsigned int vector, bool managed) +{ + apic_update_vector(cpu, vector, false); + irq_matrix_free(vector_matrix, cpu, vector, managed); +} + +static void chip_data_update(struct irq_data *irqd, unsigned int newvec, unsigned int newcpu) { struct apic_chip_data *apicd = apic_chip_data(irqd); struct irq_desc *desc = irq_data_to_desc(irqd); @@ -174,8 +181,7 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, apicd->prev_cpu = apicd->cpu; WARN_ON_ONCE(apicd->cpu == newcpu); } else { - irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, - managed); + apic_free_vector(apicd->cpu, apicd->vector, managed); } setnew: @@ -261,7 +267,7 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc(irqd->irq, vector, resvd, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -337,7 +343,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc_managed(irqd->irq, vector, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + 
chip_data_update(irqd, vector, cpu); return 0; } @@ -357,7 +363,7 @@ static void clear_irq_vector(struct irq_data *irqd) apicd->prev_cpu); per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); + apic_free_vector(apicd->cpu, vector, managed); apicd->vector = 0; /* Clean up move in progress */ @@ -366,7 +372,7 @@ return; per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); + apic_free_vector(apicd->prev_cpu, vector, managed); apicd->prev_vector = 0; apicd->move_in_progress = 0; hlist_del_init(&apicd->clist); @@ -905,7 +911,7 @@ static void free_moved_vector(struct apic_chip_data *apicd) * affinity mask comes online. */ trace_vector_free_moved(apicd->irq, cpu, vector, managed); - irq_matrix_free(vector_matrix, cpu, vector, managed); + apic_free_vector(cpu, vector, managed); per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; hlist_del_init(&apicd->clist); apicd->prev_vector = 0; From 9e70e985bdc2c6fe7a160e4d59ddd7c0a39bc077 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Aug 2025 11:39:03 -0400 Subject: [PATCH 0648/3206] fs: rework iput logic Currently, if we are the last iput, and we have the I_DIRTY_TIME bit set, we will grab a reference on the inode again, mark it dirty, and then redo the put. This is to make sure we delay the time update for as long as possible. We can rework this logic to simply decrement i_count if it is not 1, and if it is, do the time update while still holding the i_count reference. Then we can replace the atomic_dec_and_lock with locking the ->i_lock and doing atomic_dec_and_test, since we did the atomic_add_unless above. Co-developed-by: Mateusz Guzik Signed-off-by: Mateusz Guzik Signed-off-by: Josef Bacik Link: https://lore.kernel.org/be208b89bdb650202e712ce2bcfc407ac7044c7a.1756222464.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/inode.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 01ebdc40021e2d..01a554e1127940 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1908,20 +1908,44 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { - if (!inode) + if (unlikely(!inode)) return; - BUG_ON(inode->i_state & I_CLEAR); + retry: - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { - if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { - atomic_inc(&inode->i_count); - spin_unlock(&inode->i_lock); - trace_writeback_lazytime_iput(inode); - mark_inode_dirty_sync(inode); - goto retry; - } - iput_final(inode); + lockdep_assert_not_held(&inode->i_lock); + VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode); + /* + * Note this assert is technically racy as if the count is bogusly + * equal to one, then two CPUs racing to further drop it can both + * conclude it's fine.
+ */ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode); + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) { + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; } + + spin_lock(&inode->i_lock); + if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) { + spin_unlock(&inode->i_lock); + goto retry; + } + + if (!atomic_dec_and_test(&inode->i_count)) { + spin_unlock(&inode->i_lock); + return; + } + + /* + * iput_final() drops ->i_lock, we can't assert on it as the inode may + * be deallocated by the time the call returns. + */ + iput_final(inode); } EXPORT_SYMBOL(iput); From 37b27bd5d6217b75d315f28b4399aad0a336f299 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Aug 2025 11:39:02 -0400 Subject: [PATCH 0649/3206] fs: add an icount_read helper Instead of doing direct access to ->i_count, add a helper to handle this. This will make it easier to convert i_count to a refcount later. Signed-off-by: Josef Bacik Link: https://lore.kernel.org/9bc62a84c6b9d6337781203f60837bd98fbc4a96.1756222464.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- arch/powerpc/platforms/cell/spufs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/mds_client.c | 2 +- fs/ext4/ialloc.c | 4 ++-- fs/fs-writeback.c | 2 +- fs/hpfs/inode.c | 2 +- fs/inode.c | 8 ++++---- fs/nfs/inode.c | 4 ++-- fs/notify/fsnotify.c | 2 +- fs/smb/client/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_trace.h | 2 +- include/linux/fs.h | 5 +++++ include/trace/events/filelock.h | 2 +- security/landlock/fs.c | 2 +- 16 files changed, 25 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index d5a2c77bc90871..ce839783c0df6a 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1430,7 +1430,7 @@ static int spufs_mfc_open(struct inode *inode, struct file *file) if (ctx->owner != current->mm) return -EINVAL; - if (atomic_read(&inode->i_count) != 1) + if (icount_read(inode) != 1) return -EBUSY; mutex_lock(&ctx->mapping_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index de722b232ec145..5bcd8e25fa78f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4538,7 +4538,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) inode = btrfs_find_first_inode(root, min_ino); while (inode) { - if (atomic_read(&inode->vfs_inode.i_count) > 1) + if (icount_read(&inode->vfs_inode) > 1) d_prune_aliases(&inode->vfs_inode); min_ino = btrfs_ino(inode) + 1; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0f497c39ff8246..62dba710504d52 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2221,7 +2221,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) int count; dput(dentry); d_prune_aliases(inode); - count = atomic_read(&inode->i_count); + count = icount_read(inode); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df4051613b290a..ba4fd9aba1c14d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) "nonexistent device\n", __func__, __LINE__); return; } - if (atomic_read(&inode->i_count) > 1) { + if (icount_read(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, - atomic_read(&inode->i_count)); + 
icount_read(inode)); return; } if (inode->i_nlink) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641d7..6088a67b2aaebb 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1767,7 +1767,7 @@ static int writeback_single_inode(struct inode *inode, int ret = 0; spin_lock(&inode->i_lock); - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index a59e8fa630db67..34008442ee265f 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; - if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (hpfs_inode->i_rddir_off && !icount_read(i)) { if (*hpfs_inode->i_rddir_off) pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); diff --git a/fs/inode.c b/fs/inode.c index 01a554e1127940..fe4868e2a9545b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -534,7 +534,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate) { if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; @@ -871,11 +871,11 @@ void evict_inodes(struct super_block *sb) again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) continue; spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_count)) { + if (icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } @@ -937,7 +937,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ - if (atomic_read(&inode->i_count) || + if (icount_read(inode) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae42308..b52805951856b5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - atomic_read(&inode->i_count)); + icount_read(inode)); out: return inode; @@ -2229,7 +2229,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - atomic_read(&inode->i_count), fattr->valid); + icount_read(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 079b868552c21d..46bfc543f9467c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -66,7 +66,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * removed all zero refcount inodes, in any case. Test to * be sure. 
*/ - if (!atomic_read(&inode->i_count)) { + if (!icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 75be4b46bc6f18..211d5b8b42f4ad 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2749,7 +2749,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) } cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", - full_path, inode, inode->i_count.counter, + full_path, inode, icount_read(inode), dentry, cifs_get_time(dentry), jiffies); again: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b20686085e..a0269ba96e3de2 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode) goto out; dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); - ubifs_assert(c, !atomic_read(&inode->i_count)); + ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a32a..df8eab11dc48d0 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1035,7 +1035,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (atomic_read(&VFS_I(ip)->i_count)) + if (icount_read(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e1794e3e3156b1..34001503fc8b9a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1151,7 +1151,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->count = icount_read(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; diff --git a/include/linux/fs.h b/include/linux/fs.h index c34554d8c4feb9..c4fd010cf5bff1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2611,6 +2611,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +static inline int icount_read(const struct inode *inode) +{ + return atomic_read(&inode->i_count); +} + /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index b8d1e00a7982c9..fdd36b1daa2578 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -189,7 +189,7 @@ TRACE_EVENT(generic_add_lease, __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); - __entry->icount = atomic_read(&inode->i_count); + __entry->icount = icount_read(inode); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; diff --git a/security/landlock/fs.c b/security/landlock/fs.c index c04f8879ad03ce..0bade2c5aa1d00 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1281,7 +1281,7 @@ static void hook_sb_delete(struct super_block *const sb) struct landlock_object *object; /* Only handles referenced inodes. 
*/ - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) continue; /* From 90ccf10de527c0c9b117beddd09ee7ac38efaa5b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 1 Sep 2025 12:40:38 +0200 Subject: [PATCH 0650/3206] inode: fix whitespace issues Fix two minor whitespace issues. Signed-off-by: Christian Brauner --- fs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index fe4868e2a9545b..833de5457a065b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -550,11 +550,11 @@ static void __inode_add_lru(struct inode *inode, bool rotate) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { - void *bit_address; + void *bit_address; - bit_address = inode_state_wait_address(inode, bit); - init_wait_var_entry(wqe, bit_address, 0); - return __var_waitqueue(bit_address); + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); } EXPORT_SYMBOL(inode_bit_waitqueue); @@ -2938,7 +2938,7 @@ EXPORT_SYMBOL(mode_strip_sgid); */ void dump_inode(struct inode *inode, const char *reason) { - pr_warn("%s encountered for inode %px", reason, inode); + pr_warn("%s encountered for inode %px", reason, inode); } EXPORT_SYMBOL(dump_inode); From 8c79a68de1d2d63537f2a318e5a3b27744c835ad Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:32:43 +0530 Subject: [PATCH 0651/3206] x86/apic: Add an update_vector() callback for Secure AVIC Add an update_vector() callback to set/clear the ALLOWED_IRR field in a vCPU's APIC backing page for vectors which are emulated by the hypervisor. The ALLOWED_IRR field indicates the interrupt vectors which the guest allows the hypervisor to inject (typically for emulated devices). Interrupt vectors used exclusively by the guest itself and the vectors which are not emulated by the hypervisor, such as IPI vectors, should not be set by the guest in the ALLOWED_IRR fields. As clearing/setting state of a vector will also be used in subsequent commits for other APIC registers (such as APIC_IRR update for sending IPI), add a common update_vector() in the Secure AVIC driver. [ bp: Massage commit message. 
] Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828110255.208779-4-Neeraj.Upadhyay@amd.com --- arch/x86/kernel/apic/x2apic_savic.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 56c51ea4e5ab7b..942d3aa25082b6 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -27,6 +27,22 @@ static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); } +static inline void *get_reg_bitmap(unsigned int cpu, unsigned int offset) +{ + return &per_cpu_ptr(savic_page, cpu)->regs[offset]; +} + +static inline void update_vector(unsigned int cpu, unsigned int offset, + unsigned int vector, bool set) +{ + void *bitmap = get_reg_bitmap(cpu, offset); + + if (set) + apic_set_vector(vector, bitmap); + else + apic_clear_vector(vector, bitmap); +} + #define SAVIC_ALLOWED_IRR 0x204 /* @@ -144,6 +160,11 @@ static void savic_write(u32 reg, u32 data) } } +static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set); +} + static void savic_setup(void) { void *ap = this_cpu_ptr(savic_page); @@ -217,6 +238,8 @@ static struct apic apic_x2apic_savic __ro_after_init = { .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, + + .update_vector = savic_update_vector, }; apic_driver(apic_x2apic_savic); From 2c6978ea1a85603fe7d401f7bb3a1fbcab21fde2 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:38:24 +0530 Subject: [PATCH 0652/3206] x86/apic: Add support to send IPI for Secure AVIC Secure AVIC hardware accelerates only Self-IPI, i.e. on WRMSR to APIC_SELF_IPI and APIC_ICR (with destination shorthand equal to Self) registers, hardware takes care of updating the APIC_IRR in the APIC backing page of the vCPU. For other IPI types (cross-vCPU, broadcast IPIs), software needs to take care of updating the APIC_IRR state of the target vCPUs and to ensure that the target vCPUs notice the new pending interrupt. Add new callbacks in the Secure AVIC driver for sending IPI requests. These callbacks update the IRR in the target guest vCPU's APIC backing page. To ensure that the remote vCPU notices the new pending interrupt, reuse the GHCB MSR handling code in vc_handle_msr() to issue APIC_ICR MSR-write GHCB protocol event to the hypervisor. For Secure AVIC guests, on APIC_ICR write MSR exits, the hypervisor notifies the target vCPU by either sending an AVIC doorbell (if target vCPU is running) or by waking up the non-running target vCPU. 
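All routing information for an IPI is encoded in the ICR value; a stand-alone sketch of the shorthand/vector decoding described above (mask values as in the x86 APIC register definitions):

#include <stdint.h>
#include <stdio.h>

#define APIC_VECTOR_MASK	0x000FFu
#define APIC_DEST_SELF		0x40000u
#define APIC_DEST_ALLINC	0x80000u
#define APIC_DEST_ALLBUT	0xC0000u

int main(void)
{
	uint32_t icr_low = APIC_DEST_ALLBUT | 0xfd;	/* broadcast, vector 0xfd */
	uint32_t dsh = icr_low & APIC_DEST_ALLBUT;	/* shorthand bits 18-19 */
	uint32_t vector = icr_low & APIC_VECTOR_MASK;

	/* Self-IPIs remain hardware-accelerated; all other shorthands update
	 * the target backing page(s) and then exit to the HV via the GHCB. */
	printf("shorthand 0x%x vector 0x%x: %s\n", dsh, vector,
	       dsh == APIC_DEST_SELF ? "hw-accelerated" : "IRR update + GHCB ICR write");
	return 0;
}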
Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828110824.208851-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 28 ++++++ arch/x86/coco/sev/vc-handle.c | 11 ++- arch/x86/include/asm/sev-internal.h | 2 + arch/x86/include/asm/sev.h | 2 + arch/x86/kernel/apic/x2apic_savic.c | 138 +++++++++++++++++++++++++++- 5 files changed, 173 insertions(+), 8 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 7669aafcad95be..bb33fc2265dba8 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1108,6 +1108,34 @@ int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd) return 0; } +void savic_ghcb_msr_write(u32 reg, u64 value) +{ + u64 msr = APIC_BASE_MSR + (reg >> 4); + struct pt_regs regs = { + .cx = msr, + .ax = lower_32_bits(value), + .dx = upper_32_bits(value) + }; + struct es_em_ctxt ctxt = { .regs = ®s }; + struct ghcb_state state; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + res = sev_es_ghcb_handle_msr(ghcb, &ctxt, true); + if (res != ES_OK) { + pr_err("Secure AVIC MSR (0x%llx) write returned error (%d)\n", msr, res); + /* MSR writes should never fail. Any failure is fatal error for SNP guest */ + snp_abort(); + } + + __sev_put_ghcb(&state); +} + enum es_result savic_register_gpa(u64 gpa) { struct ghcb_state state; diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index c3b4acbde0d8c6..c1aa10ce9d5443 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -402,14 +402,10 @@ static enum es_result __vc_handle_secure_tsc_msrs(struct es_em_ctxt *ctxt, bool return ES_OK; } -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write) { struct pt_regs *regs = ctxt->regs; enum es_result ret; - bool write; - - /* Is it a WRMSR? 
*/ - write = ctxt->insn.opcode.bytes[1] == 0x30; switch (regs->cx) { case MSR_SVSM_CAA: @@ -439,6 +435,11 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_handle_msr(ghcb, ctxt, ctxt->insn.opcode.bytes[1] == 0x30); +} + static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) { int trapnr = ctxt->fi.vector; diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 3dfd306d1c9e88..6876655183a6bb 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -97,6 +97,8 @@ static __always_inline void sev_es_wr_ghcb_msr(u64 val) native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); } +enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write); + void snp_register_ghcb_early(unsigned long paddr); bool sev_es_negotiate_protocol(void); bool sev_es_check_cpu_features(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 9036122a6d455d..fa2864eb3e20fe 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -534,6 +534,7 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); enum es_result savic_register_gpa(u64 gpa); +void savic_ghcb_msr_write(u32 reg, u64 value); static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { @@ -607,6 +608,7 @@ static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } +static inline void savic_ghcb_msr_write(u32 reg, u64 value) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 942d3aa25082b6..47dfbf0c5ec502 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -8,6 +8,7 @@ */ #include +#include #include #include @@ -120,6 +121,73 @@ static u32 savic_read(u32 reg) #define SAVIC_NMI_REQ 0x278 +/* + * On WRMSR to APIC_SELF_IPI register by the guest, Secure AVIC hardware + * updates the APIC_IRR in the APIC backing page of the vCPU. In addition, + * hardware evaluates the new APIC_IRR update for interrupt injection to + * the vCPU. So, self IPIs are hardware-accelerated. 
+ */ +static inline void self_ipi_reg_write(unsigned int vector) +{ + native_apic_msr_write(APIC_SELF_IPI, vector); +} + +static void send_ipi_dest(unsigned int cpu, unsigned int vector) +{ + update_vector(cpu, APIC_IRR, vector, true); +} + +static void send_ipi_allbut(unsigned int vector) +{ + unsigned int cpu, src_cpu; + + guard(irqsave)(); + + src_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, cpu_online_mask) { + if (cpu == src_cpu) + continue; + send_ipi_dest(cpu, vector); + } +} + +static inline void self_ipi(unsigned int vector) +{ + u32 icr_low = APIC_SELF_IPI | vector; + + native_x2apic_icr_write(icr_low, 0); +} + +static void savic_icr_write(u32 icr_low, u32 icr_high) +{ + unsigned int dsh, vector; + u64 icr_data; + + dsh = icr_low & APIC_DEST_ALLBUT; + vector = icr_low & APIC_VECTOR_MASK; + + switch (dsh) { + case APIC_DEST_SELF: + self_ipi(vector); + break; + case APIC_DEST_ALLINC: + self_ipi(vector); + fallthrough; + case APIC_DEST_ALLBUT: + send_ipi_allbut(vector); + break; + default: + send_ipi_dest(icr_high, vector); + break; + } + + icr_data = ((u64)icr_high) << 32 | icr_low; + if (dsh != APIC_DEST_SELF) + savic_ghcb_msr_write(APIC_ICR, icr_data); + apic_set_reg64(this_cpu_ptr(savic_page), APIC_ICR, icr_data); +} + static void savic_write(u32 reg, u32 data) { void *ap = this_cpu_ptr(savic_page); @@ -130,7 +198,6 @@ static void savic_write(u32 reg, u32 data) case APIC_LVT1: case APIC_TMICT: case APIC_TDCR: - case APIC_SELF_IPI: case APIC_TASKPRI: case APIC_EOI: case APIC_SPIV: @@ -146,7 +213,10 @@ static void savic_write(u32 reg, u32 data) apic_set_reg(ap, reg, data); break; case APIC_ICR: - apic_set_reg64(ap, reg, (u64)data); + savic_icr_write(data, 0); + break; + case APIC_SELF_IPI: + self_ipi_reg_write(data); break; /* ALLOWED_IRR offsets are writable */ case SAVIC_ALLOWED_IRR ... 
SAVIC_ALLOWED_IRR + 0x70: @@ -160,6 +230,61 @@ static void savic_write(u32 reg, u32 data) } } +static void send_ipi(u32 dest, unsigned int vector, unsigned int dsh) +{ + unsigned int icr_low; + + icr_low = __prepare_ICR(dsh, vector, APIC_DEST_PHYSICAL); + savic_icr_write(icr_low, dest); +} + +static void savic_send_ipi(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + send_ipi(dest, vector, 0); +} + +static void send_ipi_mask(const struct cpumask *mask, unsigned int vector, bool excl_self) +{ + unsigned int cpu, this_cpu; + + guard(irqsave)(); + + this_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, mask) { + if (excl_self && cpu == this_cpu) + continue; + send_ipi(per_cpu(x86_cpu_to_apicid, cpu), vector, 0); + } +} + +static void savic_send_ipi_mask(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, false); +} + +static void savic_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, true); +} + +static void savic_send_ipi_allbutself(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLBUT); +} + +static void savic_send_ipi_all(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLINC); +} + +static void savic_send_ipi_self(int vector) +{ + self_ipi_reg_write(vector); +} + static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set) { update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set); @@ -231,13 +356,20 @@ static struct apic apic_x2apic_savic __ro_after_init = { .calc_dest_apicid = apic_default_calc_apicid, + .send_IPI = savic_send_ipi, + .send_IPI_mask = savic_send_ipi_mask, + .send_IPI_mask_allbutself = savic_send_ipi_mask_allbutself, + .send_IPI_allbutself = savic_send_ipi_allbutself, + .send_IPI_all = savic_send_ipi_all, + .send_IPI_self = savic_send_ipi_self, + .nmi_to_offline_cpu = true, .read = savic_read, .write = savic_write, .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, - .icr_write = native_x2apic_icr_write, + .icr_write = savic_icr_write, .update_vector = savic_update_vector, }; From ea7d792e11e10f502933c39f3836cb73d35dac36 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:39:26 +0530 Subject: [PATCH 0653/3206] x86/apic: Support LAPIC timer for Secure AVIC Secure AVIC requires the LAPIC timer to be emulated by the hypervisor. KVM already supports emulating the LAPIC timer using hrtimers. In order to emulate it, APIC_LVTT, APIC_TMICT and APIC_TDCR register values need to be propagated to the hypervisor for arming the timer. APIC_TMCCT register value has to be read from the hypervisor, which is required for calibrating the APIC timer. So, read/write all APIC timer registers from/to the hypervisor. 
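TMCCT is the current count of a count-down timer, so calibration has to read it from wherever the timer actually runs; a stand-alone sketch of the calibration arithmetic that depends on it (values illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* two illustrative TMCCT reads taken 10 ms apart */
	uint32_t tmcct_before = 1000000;
	uint32_t tmcct_after  =  900000;

	/* the count-down timer decrements, so elapsed ticks = before - after */
	uint32_t ticks = tmcct_before - tmcct_after;

	/* 100000 ticks per 10 ms gives a 10 MHz effective timer clock */
	printf("timer clock: %u Hz\n", ticks * 100);
	return 0;
}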
Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828110926.208866-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 26 ++++++++++++++++++++++++++ arch/x86/include/asm/sev.h | 2 ++ arch/x86/kernel/apic/apic.c | 2 ++ arch/x86/kernel/apic/x2apic_savic.c | 7 +++++-- 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index bb33fc2265dba8..da9fa9d7254b61 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1108,6 +1108,32 @@ int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd) return 0; } +u64 savic_ghcb_msr_read(u32 reg) +{ + u64 msr = APIC_BASE_MSR + (reg >> 4); + struct pt_regs regs = { .cx = msr }; + struct es_em_ctxt ctxt = { .regs = ®s }; + struct ghcb_state state; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + res = sev_es_ghcb_handle_msr(ghcb, &ctxt, false); + if (res != ES_OK) { + pr_err("Secure AVIC MSR (0x%llx) read returned error (%d)\n", msr, res); + /* MSR read failures are treated as fatal errors */ + snp_abort(); + } + + __sev_put_ghcb(&state); + + return regs.ax | regs.dx << 32; +} + void savic_ghcb_msr_write(u32 reg, u64 value) { u64 msr = APIC_BASE_MSR + (reg >> 4); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index fa2864eb3e20fe..875c7669ba95f7 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -534,6 +534,7 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); enum es_result savic_register_gpa(u64 gpa); +u64 savic_ghcb_msr_read(u32 reg); void savic_ghcb_msr_write(u32 reg, u64 value); static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) @@ -609,6 +610,7 @@ static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } static inline void savic_ghcb_msr_write(u32 reg, u64 value) { } +static inline u64 savic_ghcb_msr_read(u32 reg) { return 0; } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7874284c1ca7fd..db18810576bc05 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -592,6 +592,8 @@ static void setup_APIC_timer(void) 0xF, ~0UL); } else clockevents_register_device(levt); + + apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true); } /* diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 47dfbf0c5ec502..bdefe4cd4e2925 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -67,6 +67,7 @@ static u32 savic_read(u32 reg) case APIC_TMICT: case APIC_TMCCT: case APIC_TDCR: + return savic_ghcb_msr_read(reg); case APIC_ID: case APIC_LVR: case APIC_TASKPRI: @@ -194,10 +195,12 @@ static void savic_write(u32 reg, u32 data) switch (reg) { case APIC_LVTT: - case APIC_LVT0: - case APIC_LVT1: case APIC_TMICT: case APIC_TDCR: + savic_ghcb_msr_write(reg, data); + break; + case APIC_LVT0: + case APIC_LVT1: case APIC_TASKPRI: case APIC_EOI: case APIC_SPIV: From c77683eccf53428a6934df76702e33c0faf46fe5 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 28 Aug 2025 16:41:41 +0530 Subject: [PATCH 0654/3206] x86/sev: 
Initialize VGIF for secondary vCPUs for Secure AVIC Virtual GIF (VGIF) provides the masking capability that controls when virtual interrupts (virtual maskable interrupts, virtual NMIs) can be taken by the guest vCPU. The Secure AVIC hardware reads the VGIF state from the vCPU's VMSA. So, set VGIF for secondary CPUs (the configuration for the boot CPU is done by the hypervisor) to unmask delivery of virtual interrupts to the vCPU. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111141.208920-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index da9fa9d7254b61..37b1d41e68d0d7 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -974,6 +974,9 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + vmsa->vintr_ctrl |= V_GIF_MASK; + /* SVME must be set. */ vmsa->efer = EFER_SVME; From 9de196f519a505cf104216d6f1d8688570dacca4 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:42:13 +0530 Subject: [PATCH 0655/3206] x86/apic: Add support to send NMI IPI for Secure AVIC Secure AVIC introduces a new field, "NmiReq", in the APIC backing page that has to be set by the guest to request an NMI IPI through an APIC_ICR write. Add support to set NmiReq appropriately to send NMI IPIs. Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111213.208933-1-Neeraj.Upadhyay@amd.com --- arch/x86/kernel/apic/x2apic_savic.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index bdefe4cd4e2925..8ed56e87c32f6f 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -133,12 +133,15 @@ static inline void self_ipi_reg_write(unsigned int vector) native_apic_msr_write(APIC_SELF_IPI, vector); } -static void send_ipi_dest(unsigned int cpu, unsigned int vector) +static void send_ipi_dest(unsigned int cpu, unsigned int vector, bool nmi) { - update_vector(cpu, APIC_IRR, vector, true); + if (nmi) + apic_set_reg(per_cpu_ptr(savic_page, cpu), SAVIC_NMI_REQ, 1); + else + update_vector(cpu, APIC_IRR, vector, true); } -static void send_ipi_allbut(unsigned int vector) +static void send_ipi_allbut(unsigned int vector, bool nmi) { unsigned int cpu, src_cpu; @@ -149,14 +152,17 @@ static void send_ipi_allbut(unsigned int vector) for_each_cpu(cpu, cpu_online_mask) { if (cpu == src_cpu) continue; - send_ipi_dest(cpu, vector); + send_ipi_dest(cpu, vector, nmi); } } -static inline void self_ipi(unsigned int vector) +static inline void self_ipi(unsigned int vector, bool nmi) { u32 icr_low = APIC_SELF_IPI | vector; + if (nmi) + icr_low |= APIC_DM_NMI; + native_x2apic_icr_write(icr_low, 0); } @@ -164,22 +170,24 @@ static void savic_icr_write(u32 icr_low, u32 icr_high) { unsigned int dsh, vector; u64 icr_data; + bool nmi; dsh = icr_low & APIC_DEST_ALLBUT; vector = icr_low & APIC_VECTOR_MASK; + nmi = ((icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI); switch (dsh) { case APIC_DEST_SELF: - self_ipi(vector); + self_ipi(vector, nmi); break; case
APIC_DEST_ALLINC: - self_ipi(vector); + self_ipi(vector, nmi); fallthrough; case APIC_DEST_ALLBUT: - send_ipi_allbut(vector); + send_ipi_allbut(vector, nmi); break; default: - send_ipi_dest(icr_high, vector); + send_ipi_dest(icr_high, vector, nmi); break; } From 869e36b9660dd72ab960b74c55d7a200c22588d0 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:42:43 +0530 Subject: [PATCH 0656/3206] x86/apic: Allow NMI to be injected from hypervisor for Secure AVIC Secure AVIC requires the "AllowedNmi" bit in the Secure AVIC Control MSR to be set for an NMI to be injected from the hypervisor. So set it. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111243.208946-1-Neeraj.Upadhyay@amd.com --- arch/x86/include/asm/msr-index.h | 3 +++ arch/x86/kernel/apic/x2apic_savic.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 2a6d4fd8659a22..1291e053e40c24 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -703,6 +703,9 @@ #define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) #define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) +#define MSR_AMD64_SAVIC_CONTROL 0xc0010138 +#define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1 +#define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 #define MSR_AMD64_RMP_CFG 0xc0010136 diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 8ed56e87c32f6f..bbaedb48a7fbbb 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -328,6 +328,8 @@ static void savic_setup(void) res = savic_register_gpa(gpa); if (res != ES_OK) snp_abort(); + + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, gpa | MSR_AMD64_SAVIC_ALLOWEDNMI); } static int savic_probe(void) From 28bbfad229e4addf9990279c73c07b762b4a04e4 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 28 Aug 2025 16:43:15 +0530 Subject: [PATCH 0657/3206] x86/sev: Enable NMI support for Secure AVIC Now that support to send NMI IPIs and support to inject NMIs from the hypervisor have been added, set V_NMI_ENABLE in the VINTR_CTRL field of the VMSA to enable NMIs for Secure AVIC guests. [ bp: Zap useless brackets. ] Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111315.208959-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 37b1d41e68d0d7..e4740611228dc7 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -975,7 +975,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) - vmsa->vintr_ctrl |= V_GIF_MASK; + vmsa->vintr_ctrl |= V_GIF_MASK | V_NMI_ENABLE_MASK; /* SVME must be set.
*/ vmsa->efer = EFER_SVME; From 8e3714305ad29866d27aa354f09fd03036f44375 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:43:56 +0530 Subject: [PATCH 0658/3206] x86/apic: Read and write LVT* APIC registers from HV for SAVIC guests The hypervisor needs information about the current state of the LVT registers for device emulation and NMIs. So, forward reads and writes of these registers to the hypervisor for Secure AVIC-enabled guests. Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111356.208972-1-Neeraj.Upadhyay@amd.com --- arch/x86/kernel/apic/x2apic_savic.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index bbaedb48a7fbbb..b6d6e7a69c89de 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -67,6 +67,11 @@ static u32 savic_read(u32 reg) case APIC_TMICT: case APIC_TMCCT: case APIC_TDCR: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: return savic_ghcb_msr_read(reg); case APIC_ID: case APIC_LVR: @@ -76,11 +81,6 @@ static u32 savic_read(u32 reg) case APIC_LDR: case APIC_SPIV: case APIC_ESR: - case APIC_LVTTHMR: - case APIC_LVTPC: - case APIC_LVT0: - case APIC_LVT1: - case APIC_LVTERR: case APIC_EFEAT: case APIC_ECTRL: case APIC_SEOI: @@ -205,18 +205,18 @@ static void savic_write(u32 reg, u32 data) case APIC_LVTT: case APIC_TMICT: case APIC_TDCR: - savic_ghcb_msr_write(reg, data); - break; case APIC_LVT0: case APIC_LVT1: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVTERR: + savic_ghcb_msr_write(reg, data); + break; case APIC_TASKPRI: case APIC_EOI: case APIC_SPIV: case SAVIC_NMI_REQ: case APIC_ESR: - case APIC_LVTTHMR: - case APIC_LVTPC: - case APIC_LVTERR: case APIC_ECTRL: case APIC_SEOI: case APIC_IER: From c37adf34a5dc511e017b5a3bab15fe3171269e46 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:46:54 +0530 Subject: [PATCH 0659/3206] x86/apic: Handle EOI writes for Secure AVIC guests Secure AVIC accelerates the guest's EOI MSR writes for edge-triggered interrupts. For level-triggered interrupts, EOI MSR writes trigger a #VC exception with an SVM_EXIT_AVIC_UNACCELERATED_ACCESS error code. To complete EOI handling, the #VC exception handler would need to trigger a GHCB protocol MSR write event to notify the hypervisor about completion of the level-triggered interrupt. Hypervisor notification is required for cases like an emulated IO-APIC, to complete and clear the interrupt in the IO-APIC's interrupt state. However, #VC exception handling adds extra performance overhead for APIC register writes. In addition, for Secure AVIC, some unaccelerated APIC register MSR writes are trapped, whereas others are faulted. This results in additional complexity in #VC exception handling for unaccelerated APIC MSR accesses. So, directly do a GHCB protocol-based APIC EOI MSR write from the apic->eoi() callback for level-triggered interrupts. Use WRMSR for edge-triggered interrupts, so that the hardware re-evaluates any pending interrupt which can be delivered to the guest vCPU. For level-triggered interrupts, re-evaluation happens on return from the VMGEXIT corresponding to the GHCB event for the APIC EOI MSR write.
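In outline, the EOI dispatch described above looks as follows. This is a sketch with hypothetical helper names; the actual handler in the diff below additionally clears the APIC_ISR bit before notifying the hypervisor in the level-triggered case:

    #include <stdbool.h>
    #include <stdint.h>

    #define APIC_EOI 0xb0

    int  isr_highest_vector(void);  /* hypothetical: highest bit set in APIC_ISR, -1 if none */
    bool tmr_test(int vec);         /* hypothetical: vector marked level-triggered in APIC_TMR */
    void ghcb_msr_write(uint32_t reg, uint64_t val); /* hypothetical: GHCB MSR-write event */
    void native_eoi_msr_write(void);                 /* hypothetical: plain x2APIC EOI WRMSR */

    static void savic_eoi_sketch(void)
    {
            int vec = isr_highest_vector();

            if (vec < 0)
                    return;                      /* no interrupt in service */

            if (tmr_test(vec))
                    ghcb_msr_write(APIC_EOI, 0); /* level: notify the hypervisor */
            else
                    native_eoi_msr_write();      /* edge: hardware re-evaluates pending interrupts */
    }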
Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111654.208987-1-Neeraj.Upadhyay@amd.com --- arch/x86/kernel/apic/x2apic_savic.c | 31 ++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index b6d6e7a69c89de..d76faeaced83ee 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -301,6 +301,35 @@ static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set) update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set); } +static void savic_eoi(void) +{ + unsigned int cpu; + int vec; + + cpu = raw_smp_processor_id(); + vec = apic_find_highest_vector(get_reg_bitmap(cpu, APIC_ISR)); + if (WARN_ONCE(vec == -1, "EOI write while no active interrupt in APIC_ISR")) + return; + + /* Is level-triggered interrupt? */ + if (apic_test_vector(vec, get_reg_bitmap(cpu, APIC_TMR))) { + update_vector(cpu, APIC_ISR, vec, false); + /* + * Propagate the EOI write to the hypervisor for level-triggered + * interrupts. Return to the guest from GHCB protocol event takes + * care of re-evaluating interrupt state. + */ + savic_ghcb_msr_write(APIC_EOI, 0); + } else { + /* + * Hardware clears APIC_ISR and re-evaluates the interrupt state + * to determine if there is any pending interrupt which can be + * delivered to CPU. + */ + native_apic_msr_eoi(); + } +} + static void savic_setup(void) { void *ap = this_cpu_ptr(savic_page); @@ -380,7 +409,7 @@ static struct apic apic_x2apic_savic __ro_after_init = { .read = savic_read, .write = savic_write, - .eoi = native_apic_msr_eoi, + .eoi = savic_eoi, .icr_read = native_x2apic_icr_read, .icr_write = savic_icr_write, From c8018325dd3e7c75c19b1e9263c358c4c96214f9 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:50:08 +0530 Subject: [PATCH 0660/3206] x86/apic: Add kexec support for Secure AVIC Add an apic->teardown() callback to disable Secure AVIC before rebooting into the new kernel. This ensures that the new kernel does not access the old APIC backing page which was allocated by the previous kernel. Such accesses can happen if there are any APIC accesses done during the guest boot before the Secure AVIC driver probe is done by the new kernel (as Secure AVIC would have remained enabled in the Secure AVIC control MSR).
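In outline, the teardown this adds must run on the rebooting CPU before jumping into the new kernel. A sketch under assumed helper names (the real callback is in the diff below):

    #include <stdint.h>

    void wrmsr_savic_control(uint64_t val); /* hypothetical: WRMSR to the Secure AVIC control MSR */
    void unregister_backing_page_gpa(void); /* hypothetical: GHCB unregister-GPA event */

    /* Leave the next kernel a clean, hypervisor-emulated APIC state so its
     * early APIC accesses cannot touch the old kernel's stale backing page. */
    static void savic_teardown_sketch(void)
    {
            wrmsr_savic_control(0);         /* clear the enable (and AllowedNmi) bits */
            unregister_backing_page_gpa();  /* drop the registered GPA */
    }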
Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828112008.209013-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 23 +++++++++++++++++++++++ arch/x86/include/asm/apic.h | 1 + arch/x86/include/asm/sev.h | 2 ++ arch/x86/kernel/apic/apic.c | 3 +++ arch/x86/kernel/apic/x2apic_savic.c | 8 ++++++++ 5 files changed, 37 insertions(+) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index e4740611228dc7..b64f43010a1289 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1187,6 +1187,29 @@ enum es_result savic_register_gpa(u64 gpa) return res; } +enum es_result savic_unregister_gpa(u64 *gpa) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA); + res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC, + SVM_VMGEXIT_SAVIC_UNREGISTER_GPA, 0); + if (gpa && res == ES_OK) + *gpa = ghcb->save.rbx; + + __sev_put_ghcb(&state); + + return res; +} + static void snp_register_per_cpu_ghcb(void) { struct sev_es_runtime_data *data; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 0683318470beb7..a26e66d66444a8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,6 +306,7 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); void (*setup)(void); + void (*teardown)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); void (*init_apic_ldr)(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 875c7669ba95f7..46915dd163edca 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -534,6 +534,7 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); enum es_result savic_register_gpa(u64 gpa); +enum es_result savic_unregister_gpa(u64 *gpa); u64 savic_ghcb_msr_read(u32 reg); void savic_ghcb_msr_write(u32 reg, u64 value); @@ -609,6 +610,7 @@ static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } +static inline enum es_result savic_unregister_gpa(u64 *gpa) { return ES_UNSUPPORTED; } static inline void savic_ghcb_msr_write(u32 reg, u64 value) { } static inline u64 savic_ghcb_msr_read(u32 reg) { return 0; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index db18810576bc05..680d305589a3ac 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1170,6 +1170,9 @@ void disable_local_APIC(void) if (!apic_accessible()) return; + if (apic->teardown) + apic->teardown(); + apic_soft_disable(); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index d76faeaced83ee..36e6d0dbcc9ce6 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -330,6 +330,13 @@ static void savic_eoi(void) } } +static void savic_teardown(void) +{ + /* Disable Secure AVIC */ + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, 0); + savic_unregister_gpa(NULL); +} + static void savic_setup(void) { void *ap = this_cpu_ptr(savic_page); @@ -385,6 +392,7 @@ static struct apic apic_x2apic_savic __ro_after_init = { .probe = 
savic_probe, .acpi_madt_oem_check = savic_acpi_madt_oem_check, .setup = savic_setup, + .teardown = savic_teardown, .dest_mode_logical = false, From e5bca063c150877c45b88ff134b6ef7a5eae8e7a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 30 Aug 2025 12:55:39 +0200 Subject: [PATCH 0661/3206] fs: remove vfs_ioctl export vfs_ioctl() is no longer called by anything outside of fs/ioctl.c, so remove the global symbol and its export, as they are not needed. Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/2025083038-carving-amuck-a4ae@gregkh Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/ioctl.c | 3 +-- include/linux/fs.h | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ioctl.c b/fs/ioctl.c index 83d07218b6cd08..1c152c2b1b67de 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. */ -int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) out: return error; } -EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 34693cae15a298..4daf9b30a64117 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2053,8 +2053,6 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); - #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); From 7f9d34b0a7cb93d678ee7207f0634dbf79e47fe5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 30 Aug 2025 19:01:01 +0900 Subject: [PATCH 0662/3206] cramfs: Verify inode mode when loading from disk The inode mode loaded from a corrupted disk can be invalid. Do what commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") does. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d Signed-off-by: Tetsuo Handa Link: https://lore.kernel.org/429b3ef1-13de-4310-9a8e-c2dc9a36234a@I-love.SAKURA.ne.jp Acked-by: Nicolas Pitre Signed-off-by: Christian Brauner --- fs/cramfs/inode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index b002e9b734f99c..12daa85ed941b7 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -116,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; - default: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: init_special_inode(inode, cramfs_inode->mode, old_decode_dev(cramfs_inode->size)); + break; + default: + printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + iget_failed(inode); + return ERR_PTR(-EIO); } inode->i_mode = cramfs_inode->mode; From c4074ab87f3483deb15f277f302f199cdb997738 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:51:26 +0530 Subject: [PATCH 0663/3206] x86/apic: Enable Secure AVIC in the control MSR With all the pieces in place now, enable Secure AVIC in the Secure AVIC Control MSR.
Any access to x2APIC MSRs is emulated by the hypervisor before Secure AVIC is enabled in the control MSR. Post Secure AVIC enablement, all x2APIC MSR accesses (whether accelerated by AVIC hardware or trapped as a #VC exception) operate on the vCPU's APIC backing page. Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828112126.209028-1-Neeraj.Upadhyay@amd.com --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/apic/x2apic_savic.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1291e053e40c24..5951344009f1f2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -704,6 +704,8 @@ #define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_SAVIC_CONTROL 0xc0010138 +#define MSR_AMD64_SAVIC_EN_BIT 0 +#define MSR_AMD64_SAVIC_EN BIT_ULL(MSR_AMD64_SAVIC_EN_BIT) #define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1 #define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 36e6d0dbcc9ce6..b846de0fbcfadf 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -365,7 +365,8 @@ static void savic_setup(void) if (res != ES_OK) snp_abort(); - native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, gpa | MSR_AMD64_SAVIC_ALLOWEDNMI); + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, + gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI); } static int savic_probe(void) From 952aefeebb3339d8129f7ca7fdb8f4344b6543a7 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 17:01:19 +0530 Subject: [PATCH 0664/3206] x86/sev: Prevent SECURE_AVIC_CONTROL MSR interception for Secure AVIC guests The SECURE_AVIC_CONTROL MSR holds the GPA of the guest APIC backing page and bitfields to control enablement of Secure AVIC and whether the guest allows NMIs to be injected by the hypervisor. This MSR is populated by the guest and can be read by the guest to get the GPA of the APIC backing page. The MSR can only be accessed in Secure AVIC mode. Any attempt to access it when not in Secure AVIC mode results in a #GP. So, the hypervisor should not intercept it. A #VC exception will be generated otherwise. If this occurs and Secure AVIC is enabled, terminate the guest execution. Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828113119.209135-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/vc-handle.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index c1aa10ce9d5443..0fd94b7ce19120 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -415,6 +415,15 @@ enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt if (sev_status & MSR_AMD64_SNP_SECURE_TSC) return __vc_handle_secure_tsc_msrs(ctxt, write); break; + case MSR_AMD64_SAVIC_CONTROL: + /* + * AMD64_SAVIC_CONTROL should not be intercepted when + * Secure AVIC is enabled. Terminate the Secure AVIC guest + * if the interception is enabled.
+ */ + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return ES_VMM_ERROR; + break; default: break; } From 27a17e02418e978198513edfb389b65237f4eaf5 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 17:02:24 +0530 Subject: [PATCH 0665/3206] x86/sev: Indicate the SEV-SNP guest supports Secure AVIC Now that Secure AVIC support is complete, make it part of the SNP present features. Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828113225.209174-1-Neeraj.Upadhyay@amd.com --- arch/x86/boot/compressed/sev.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 74e083feb2d952..048d3e8839c3b9 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -238,13 +238,20 @@ bool sev_es_check_ghcb_fault(unsigned long address) MSR_AMD64_SNP_SECURE_AVIC | \ MSR_AMD64_SNP_RESERVED_MASK) +#ifdef CONFIG_AMD_SECURE_AVIC +#define SNP_FEATURE_SECURE_AVIC MSR_AMD64_SNP_SECURE_AVIC +#else +#define SNP_FEATURE_SECURE_AVIC 0 +#endif + /* * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented * by the guest kernel. As and when a new feature is implemented in the * guest kernel, a corresponding bit should be added to the mask. */ #define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \ - MSR_AMD64_SNP_SECURE_TSC) + MSR_AMD64_SNP_SECURE_TSC | \ + SNP_FEATURE_SECURE_AVIC) u64 snp_get_unsupported_features(u64 status) { From c37adf34a5dc511e017b5a3bab15fe3171269e46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Onur=20=C3=96zkan?= Date: Thu, 21 Aug 2025 12:10:01 +0300 Subject: [PATCH 0666/3206] rust: file: use to_result for error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify error handling by replacing the manual check of the return value with the `to_result` helper. Signed-off-by: Onur Özkan Link: https://lore.kernel.org/20250821091001.28563-1-work@onurozkan.dev Signed-off-by: Christian Brauner --- rust/kernel/fs/file.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rust/kernel/fs/file.rs b/rust/kernel/fs/file.rs index 18cf579d3312fa..f1a3fa6987451d 100644 --- a/rust/kernel/fs/file.rs +++ b/rust/kernel/fs/file.rs @@ -10,7 +10,7 @@ use crate::{ bindings, cred::Credential, - error::{code::*, Error, Result}, + error::{code::*, to_result, Error, Result}, sync::aref::{ARef, AlwaysRefCounted}, types::{NotThreadSafe, Opaque}, }; @@ -399,9 +399,8 @@ impl FileDescriptorReservation { pub fn get_unused_fd_flags(flags: u32) -> Result { // SAFETY: FFI call, there are no safety requirements on `flags`. let fd: i32 = unsafe { bindings::get_unused_fd_flags(flags) }; - if fd < 0 { - return Err(Error::from_errno(fd)); - } + to_result(fd)?; + Ok(Self { fd: fd as u32, _not_send: NotThreadSafe, From 9d35d068fb138160709e04e3ee97fe29a6f8615b Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Fri, 29 Aug 2025 18:14:11 +0800 Subject: [PATCH 0667/3206] regulator: scmi: Use int type to store negative error codes Change the 'ret' variable from u32 to int to store negative error codes or zero returned by of_property_read_u32(). Storing the negative error codes in an unsigned type doesn't cause an issue at runtime, but it's ugly as pants.
Additionally, assigning negative error codes to an unsigned type may trigger a GCC warning when the -Wsign-conversion flag is enabled. This has no effect at runtime. Signed-off-by: Qianfeng Rong Reviewed-by: Sudeep Holla Fixes: 0fbeae70ee7c ("regulator: add SCMI driver") Link: https://patch.msgid.link/20250829101411.625214-1-rongqianfeng@vivo.com Signed-off-by: Mark Brown --- drivers/regulator/scmi-regulator.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/regulator/scmi-regulator.c b/drivers/regulator/scmi-regulator.c index 9df726f10ad121..6d609c42e4793b 100644 --- a/drivers/regulator/scmi-regulator.c +++ b/drivers/regulator/scmi-regulator.c @@ -257,7 +257,8 @@ static int process_scmi_regulator_of_node(struct scmi_device *sdev, struct device_node *np, struct scmi_regulator_info *rinfo) { - u32 dom, ret; + u32 dom; + int ret; ret = of_property_read_u32(np, "reg", &dom); if (ret) From 04ff48239f46e8b493571e260bd0e6c3a6400371 Mon Sep 17 00:00:00 2001 From: Simon Schuster Date: Mon, 1 Sep 2025 15:09:50 +0200 Subject: [PATCH 0668/3206] copy_sighand: Handle architectures where sizeof(unsigned long) < sizeof(u64) With the introduction of clone3 in commit 7f192e3cd316 ("fork: add clone3") the effective bit width of clone_flags on all architectures was increased from 32-bit to 64-bit. However, the signature of the copy_* helper functions (e.g., copy_sighand) used by copy_process was not adapted. As such, they truncate the flags on any 32-bit architectures that support clone3 (arc, arm, csky, m68k, microblaze, mips32, openrisc, parisc32, powerpc32, riscv32, x86-32 and xtensa). For copy_sighand, with CLONE_CLEAR_SIGHAND being an actual u64 constant, this triggers an observable bug in the kernel selftest clone3_clear_sighand: if (clone_flags & CLONE_CLEAR_SIGHAND) in function copy_sighand within fork.c will always fail given: unsigned long /* == uint32_t */ clone_flags #define CLONE_CLEAR_SIGHAND 0x100000000ULL This commit fixes the bug by always passing clone_flags to copy_sighand via its declared u64 type, regardless of architecture-dependent integer sizes. Fixes: b612e5df4587 ("clone3: add CLONE_CLEAR_SIGHAND") Cc: stable@vger.kernel.org # linux-5.5+ Signed-off-by: Simon Schuster Link: https://lore.kernel.org/20250901-nios2-implement-clone3-v2-1-53fcf5577d57@siemens-energy.com Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Signed-off-by: Christian Brauner --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index af673856499dca..4e2c5a3e898911 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1596,7 +1596,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, return 0; } -static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) +static int copy_sighand(u64 clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; From edd3cb05c00a040dc72bed20b14b5ba865188bce Mon Sep 17 00:00:00 2001 From: Simon Schuster Date: Mon, 1 Sep 2025 15:09:51 +0200 Subject: [PATCH 0669/3206] copy_process: pass clone_flags as u64 across calltree With the introduction of clone3 in commit 7f192e3cd316 ("fork: add clone3") the effective bit width of clone_flags on all architectures was increased from 32-bit to 64-bit, with a new type of u64 for the flags. However, for most consumers of clone_flags the interface was not changed from the previous type of unsigned long.
While this works fine as long as none of the new 64-bit flag bits (CLONE_CLEAR_SIGHAND and CLONE_INTO_CGROUP) are evaluated, this is still undesirable in terms of the principle of least surprise. Thus, this commit fixes all relevant interfaces of callees to sys_clone3/copy_process (excluding the architecture-specific copy_thread) to consistently pass clone_flags as u64, so that no truncation to 32-bit integers occurs on 32-bit architectures. Signed-off-by: Simon Schuster Link: https://lore.kernel.org/20250901-nios2-implement-clone3-v2-2-53fcf5577d57@siemens-energy.com Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Signed-off-by: Christian Brauner --- block/blk-ioc.c | 2 +- fs/namespace.c | 2 +- include/linux/cgroup.h | 4 ++-- include/linux/cred.h | 2 +- include/linux/iocontext.h | 6 +++--- include/linux/ipc_namespace.h | 4 ++-- include/linux/lsm_hook_defs.h | 2 +- include/linux/mnt_namespace.h | 2 +- include/linux/nsproxy.h | 2 +- include/linux/pid_namespace.h | 4 ++-- include/linux/rseq.h | 4 ++-- include/linux/sched/task.h | 2 +- include/linux/security.h | 4 ++-- include/linux/sem.h | 4 ++-- include/linux/time_namespace.h | 4 ++-- include/linux/uprobes.h | 4 ++-- include/linux/user_events.h | 4 ++-- include/linux/utsname.h | 4 ++-- include/net/net_namespace.h | 4 ++-- include/trace/events/task.h | 6 +++--- ipc/namespace.c | 2 +- ipc/sem.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/cred.c | 2 +- kernel/events/uprobes.c | 2 +- kernel/fork.c | 8 ++++---- kernel/nsproxy.c | 4 ++-- kernel/pid_namespace.c | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 ++-- kernel/time/namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- security/apparmor/lsm.c | 2 +- security/security.c | 2 +- security/selinux/hooks.c | 2 +- security/tomoyo/tomoyo.c | 2 +- 38 files changed, 59 insertions(+), 59 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9fda3906e5f5d6..d15918d7fabb33 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -286,7 +286,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) } EXPORT_SYMBOL_GPL(set_task_ioprio); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d33837..d9c190ffa7df8f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4200,7 +4200,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } __latent_entropy -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, +struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e23..56d9556a181a82 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -796,7 +796,7 @@ extern struct cgroup_namespace init_cgroup_ns; void free_cgroup_ns(struct cgroup_namespace *ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); @@ -818,7 +818,7 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns) static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace 
*user_ns, +copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; diff --git a/include/linux/cred.h b/include/linux/cred.h index a102a10f833fb8..89ae50ad2acea9 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -148,7 +148,7 @@ struct cred { extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); -extern int copy_creds(struct task_struct *, unsigned long); +extern int copy_creds(struct task_struct *, u64); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 14f7eaf1b4437c..079d8773790c07 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -118,8 +118,8 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk); -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk); +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { if (!current->io_context) return 0; @@ -129,7 +129,7 @@ static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611ad6..4b399893e2b3c6 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,7 +129,7 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) -extern struct ipc_namespace *copy_ipcs(unsigned long flags, +extern struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) @@ -151,7 +151,7 @@ static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns extern void put_ipc_ns(struct ipc_namespace *ns); #else -static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +static inline struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index fd11fffdd3c380..adbe234a6f6c61 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -211,7 +211,7 @@ LSM_HOOK(int, 0, file_open, struct file *file) LSM_HOOK(int, 0, file_post_open, struct file *file, int mask) LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 70b366b6481605..ff290c87b2e7d9 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,7 +11,7 @@ struct fs_struct; struct user_namespace; struct 
ns_common; -extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, +extern struct mnt_namespace *copy_mnt_ns(u64, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T)) diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a2265..82533e899ff4fa 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -103,7 +103,7 @@ static inline struct cred *nsset_cred(struct nsset *set) * */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk); +int copy_namespaces(u64 flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a58111998d..0620a3e08e8311 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -78,7 +78,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) } #endif -extern struct pid_namespace *copy_pid_ns(unsigned long flags, +extern struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); @@ -97,7 +97,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) return 0; } -static inline struct pid_namespace *copy_pid_ns(unsigned long flags, +static inline struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bc8af3eb559876..a96fd345aa3810 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -65,7 +65,7 @@ static inline void rseq_migrate(struct task_struct *t) * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. 
*/ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; @@ -107,7 +107,7 @@ static inline void rseq_preempt(struct task_struct *t) static inline void rseq_migrate(struct task_struct *t) { } -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ea41795a352bca..34d6a0e108c3e6 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,7 @@ extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); -extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_fork(u64 clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b97170c..9a1d4a6c867380 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -489,7 +489,7 @@ int security_file_receive(struct file *file); int security_file_open(struct file *file); int security_file_post_open(struct file *file, int mask); int security_file_truncate(struct file *file); -int security_task_alloc(struct task_struct *task, unsigned long clone_flags); +int security_task_alloc(struct task_struct *task, u64 clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); @@ -1215,7 +1215,7 @@ static inline int security_file_truncate(struct file *file) } static inline int security_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { return 0; } diff --git a/include/linux/sem.h b/include/linux/sem.h index c4deefe42aeb3c..275269ce2ec892 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -9,12 +9,12 @@ struct task_struct; #ifdef CONFIG_SYSVIPC -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern int copy_semundo(u64 clone_flags, struct task_struct *tsk); extern void exit_sem(struct task_struct *tsk); #else -static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_semundo(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc9497..b6e36525e0be4d 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -43,7 +43,7 @@ static inline struct time_namespace *get_time_ns(struct time_namespace *ns) return ns; } -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); @@ -129,7 +129,7 @@ static inline void put_time_ns(struct time_namespace *ns) } static inline -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { diff --git a/include/linux/uprobes.h 
b/include/linux/uprobes.h index 516217c390946c..915303a82d846f 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -205,7 +205,7 @@ extern void uprobe_start_dup_mmap(void); extern void uprobe_end_dup_mmap(void); extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); -extern void uprobe_copy_process(struct task_struct *t, unsigned long flags); +extern void uprobe_copy_process(struct task_struct *t, u64 flags); extern int uprobe_post_sstep_notifier(struct pt_regs *regs); extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); @@ -281,7 +281,7 @@ static inline bool uprobe_deny_signal(void) static inline void uprobe_free_utask(struct task_struct *t) { } -static inline void uprobe_copy_process(struct task_struct *t, unsigned long flags) +static inline void uprobe_copy_process(struct task_struct *t, u64 flags) { } static inline void uprobe_clear_state(struct mm_struct *mm) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 8afa8c3a097308..57d1ff0060901a 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -33,7 +33,7 @@ extern void user_event_mm_dup(struct task_struct *t, extern void user_event_mm_remove(struct task_struct *t); static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { struct user_event_mm *old_mm; @@ -68,7 +68,7 @@ static inline void user_events_exit(struct task_struct *t) } #else static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { } diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412bfe..ba34ec0e2f95d7 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -35,7 +35,7 @@ static inline void get_uts_ns(struct uts_namespace *ns) refcount_inc(&ns->ns.count); } -extern struct uts_namespace *copy_utsname(unsigned long flags, +extern struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns); extern void free_uts_ns(struct uts_namespace *ns); @@ -55,7 +55,7 @@ static inline void put_uts_ns(struct uts_namespace *ns) { } -static inline struct uts_namespace *copy_utsname(unsigned long flags, +static inline struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { if (flags & CLONE_NEWUTS) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275f3..0e008cfe159d6e 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -204,7 +204,7 @@ struct net { extern struct net init_net; #ifdef CONFIG_NET_NS -struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); @@ -218,7 +218,7 @@ extern struct task_struct *cleanup_net_task; #else /* CONFIG_NET_NS */ #include #include -static inline struct net *copy_net_ns(unsigned long flags, +static inline struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) diff --git a/include/trace/events/task.h b/include/trace/events/task.h index af535b05303304..4f0759634306c7 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -8,14 +8,14 @@ TRACE_EVENT(task_newtask, - TP_PROTO(struct task_struct 
*task, unsigned long clone_flags), + TP_PROTO(struct task_struct *task, u64 clone_flags), TP_ARGS(task, clone_flags), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN) - __field( unsigned long, clone_flags) + __field( u64, clone_flags) __field( short, oom_score_adj) ), @@ -26,7 +26,7 @@ TRACE_EVENT(task_newtask, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd", + TP_printk("pid=%d comm=%s clone_flags=%llx oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->clone_flags, __entry->oom_score_adj) ); diff --git a/ipc/namespace.c b/ipc/namespace.c index 4df91ceeeafe9f..a712ec27209cc9 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -106,7 +106,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, return ERR_PTR(err); } -struct ipc_namespace *copy_ipcs(unsigned long flags, +struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) diff --git a/ipc/sem.c b/ipc/sem.c index a39cdc7bf88fa2..0f06e4bd4673f5 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2303,7 +2303,7 @@ SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, * parent and child tasks. */ -int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +int copy_semundo(u64 clone_flags, struct task_struct *tsk) { struct sem_undo_list *undo_list; int error; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c664..dedadb52588078 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -47,7 +47,7 @@ void free_cgroup_ns(struct cgroup_namespace *ns) } EXPORT_SYMBOL(free_cgroup_ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { diff --git a/kernel/cred.c b/kernel/cred.c index 9676965c0981a0..dbf6b687dc5c5a 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -287,7 +287,7 @@ struct cred *prepare_exec_creds(void) * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ -int copy_creds(struct task_struct *p, unsigned long clone_flags) +int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd83..b2753014c6dd0b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2160,7 +2160,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. 
*/ -void uprobe_copy_process(struct task_struct *t, unsigned long flags) +void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; diff --git a/kernel/fork.c b/kernel/fork.c index 4e2c5a3e898911..d6e1fb11eff9e6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1507,7 +1507,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) +static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; @@ -1545,7 +1545,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { @@ -1566,7 +1566,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk, +static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; @@ -1645,7 +1645,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) posix_cputimers_group_init(pct, cpu_limit); } -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) +static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38f5..8af3b9ec3aa81a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -64,7 +64,7 @@ static inline struct nsproxy *create_nsproxy(void) * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(unsigned long flags, +static struct nsproxy *create_new_namespaces(u64 flags, struct task_struct *tsk, struct user_namespace *user_ns, struct fs_struct *new_fs) { @@ -144,7 +144,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, * called from clone. This now handles copy for nsproxy and all * namespaces therein. */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) +int copy_namespaces(u64 flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717d3..06bc7c7f78e095 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -171,7 +171,7 @@ static void destroy_pid_namespace_work(struct work_struct *work) } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); } -struct pid_namespace *copy_pid_ns(unsigned long flags, +struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4cc..6fa85d30d96599 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4472,7 +4472,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * __sched_fork() is basic setup which is also used by sched_init() to * initialize the boot CPU's idle task. 
*/ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +static void __sched_fork(u64 clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -4707,7 +4707,7 @@ late_initcall(sched_core_sysctl_init); /* * fork()/clone()-time setup: */ -int sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(u64 clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c2e..af0866ce2dfc8f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3542,7 +3542,7 @@ static void task_numa_work(struct callback_head *work) } } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +void init_numa_balancing(u64 clone_flags, struct task_struct *p) { int mm_users = 0; struct mm_struct *mm = p->mm; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f751..f9adfc912ddc5f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1935,12 +1935,12 @@ extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); -extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +extern void init_numa_balancing(u64 clone_flags, struct task_struct *p); #else /* !CONFIG_NUMA_BALANCING: */ static inline void -init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +init_numa_balancing(u64 clone_flags, struct task_struct *p) { } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 667452768ed3b5..888872bcc5bbfa 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -130,7 +130,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f24e..00d8d7922f860a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -86,7 +86,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, * utsname of this process won't be seen by parent, and vice * versa. 
*/ -struct uts_namespace *copy_utsname(unsigned long flags, +struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1b6f3826dd0e1f..8ec9d83475bf86 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -539,7 +539,7 @@ void net_drop_ns(void *p) net_passive_dec(net); } -struct net *copy_net_ns(unsigned long flags, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b39..ba39cfe0cd08e5 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -112,7 +112,7 @@ static void apparmor_task_free(struct task_struct *task) } static int apparmor_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct aa_task_ctx *new = task_ctx(task); diff --git a/security/security.c b/security/security.c index ad163f06bf7abc..a769140553bc3c 100644 --- a/security/security.c +++ b/security/security.c @@ -3185,7 +3185,7 @@ int security_file_truncate(struct file *file) * * Return: Returns a zero on success, negative values on failure. */ -int security_task_alloc(struct task_struct *task, unsigned long clone_flags) +int security_task_alloc(struct task_struct *task, u64 clone_flags) { int rc = lsm_task_alloc(task); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d40..bb016dd511c14d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4144,7 +4144,7 @@ static int selinux_file_open(struct file *file) /* task security operations */ static int selinux_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { u32 sid = current_sid(); diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index d6ebcd9db80a37..48fc59d38ab217 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -514,7 +514,7 @@ struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { * Returns 0. */ static int tomoyo_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct tomoyo_task *old = tomoyo_task(current); struct tomoyo_task *new = tomoyo_task(task); From bbc46b23af5bb934cd1cf066ef4342cee457a24e Mon Sep 17 00:00:00 2001 From: Simon Schuster Date: Mon, 1 Sep 2025 15:09:52 +0200 Subject: [PATCH 0670/3206] arch: copy_thread: pass clone_flags as u64 With the introduction of clone3 in commit 7f192e3cd316 ("fork: add clone3"), the effective bit width of clone_flags on all architectures was increased from 32-bit to 64-bit, with a new type of u64 for the flags. However, for most consumers of clone_flags, the interface was not changed from the previous type of unsigned long. While this works fine as long as none of the new 64-bit flag bits (CLONE_CLEAR_SIGHAND and CLONE_INTO_CGROUP) are evaluated, it is still undesirable in terms of the principle of least surprise. Thus, this commit fixes all relevant interfaces of the copy_thread function, which is called from copy_process, to consistently pass clone_flags as u64, so that no truncation to 32-bit integers occurs on 32-bit architectures.
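To illustrate the truncation this avoids, here is a minimal userspace sketch (illustrative only, not part of the patch; the flag value matches include/uapi/linux/sched.h):

#include <stdint.h>
#include <stdio.h>

#define CLONE_INTO_CGROUP 0x200000000ULL /* bit 33, beyond a 32-bit long */

int main(void)
{
	uint64_t flags = CLONE_INTO_CGROUP;
	uint32_t narrowed = (uint32_t)flags; /* models unsigned long on a 32-bit arch */

	/* prints "flags=0x200000000 narrowed=0": the flag is silently lost */
	printf("flags=%#llx narrowed=%#x\n", (unsigned long long)flags, narrowed);
	return 0;
}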
Signed-off-by: Simon Schuster Link: https://lore.kernel.org/20250901-nios2-implement-clone3-v2-3-53fcf5577d57@siemens-energy.com Fixes: c5febea0956fd387 ("fork: Pass struct kernel_clone_args into copy_thread") Acked-by: Guo Ren (Alibaba Damo Academy) Acked-by: Andreas Larsson # sparc Acked-by: David Hildenbrand Acked-by: Geert Uytterhoeven # m68k Reviewed-by: Arnd Bergmann Signed-off-by: Christian Brauner --- arch/alpha/kernel/process.c | 2 +- arch/arc/kernel/process.c | 2 +- arch/arm/kernel/process.c | 2 +- arch/arm64/kernel/process.c | 2 +- arch/csky/kernel/process.c | 2 +- arch/hexagon/kernel/process.c | 2 +- arch/loongarch/kernel/process.c | 2 +- arch/m68k/kernel/process.c | 2 +- arch/microblaze/kernel/process.c | 2 +- arch/mips/kernel/process.c | 2 +- arch/nios2/kernel/process.c | 2 +- arch/openrisc/kernel/process.c | 2 +- arch/parisc/kernel/process.c | 2 +- arch/powerpc/kernel/process.c | 2 +- arch/riscv/kernel/process.c | 2 +- arch/s390/kernel/process.c | 2 +- arch/sh/kernel/process_32.c | 2 +- arch/sparc/kernel/process_32.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- arch/um/kernel/process.c | 2 +- arch/x86/include/asm/fpu/sched.h | 2 +- arch/x86/include/asm/shstk.h | 4 ++-- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/shstk.c | 2 +- arch/xtensa/kernel/process.c | 2 +- 26 files changed, 27 insertions(+), 27 deletions(-) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 582d96548385dd..06522451f018f3 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -231,7 +231,7 @@ flush_thread(void) */ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; extern void ret_from_fork(void); diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 186ceab661eb02..8166d090871304 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -166,7 +166,7 @@ asmlinkage void ret_from_fork(void); */ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *c_regs; /* child's pt_regs */ diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index e16ed102960cb0..d7aa95225c70bd 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -234,7 +234,7 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct thread_info *thread = task_thread_info(p); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 96482a1412c6aa..fba7ca102a8c42 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -409,7 +409,7 @@ asmlinkage void ret_from_fork(void) asm("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index 0c6e4b17fe00fd..a7a90340042a5f 100644 --- 
a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -32,7 +32,7 @@ void flush_thread(void){} int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct switch_stack *childstack; diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 2a77bfd7569450..15b4992bfa298a 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -52,7 +52,7 @@ void arch_cpu_idle(void) */ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c index 3582f591bab286..efd9edf65603cc 100644 --- a/arch/loongarch/kernel/process.c +++ b/arch/loongarch/kernel/process.c @@ -167,7 +167,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long childksp; unsigned long tls = args->tls; unsigned long usp = args->stack; - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; struct pt_regs *childregs, *regs = current_pt_regs(); childksp = (unsigned long)task_stack_page(p) + THREAD_SIZE; diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index fda7eac23f872d..f5a07a70e9385a 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -141,7 +141,7 @@ asmlinkage int m68k_clone3(struct pt_regs *regs) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct fork_frame { diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 56342e11442d2a..6cbf642d7b801d 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -54,7 +54,7 @@ void flush_thread(void) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 02aa6a04a21da4..29191fa1801e2a 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -107,7 +107,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) */ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index f84021303f6a82..151404139085cf 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -101,7 +101,7 @@ void flush_thread(void) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 
eef99fee2110cb..73ffb9fa3118bb 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -165,7 +165,7 @@ extern asmlinkage void ret_from_fork(void); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *userregs; diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index ed93bd8c154533..e64ab5d2a40d61 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -201,7 +201,7 @@ arch_initcall(parisc_idle_init); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *cregs = &(p->thread.regs); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 855e0988650326..eb23966ac0a9f0 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1805,7 +1805,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) f = ret_from_kernel_user_thread; } else { struct pt_regs *regs = current_pt_regs(); - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; /* Copy registers */ diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index a0a40889d79a53..31a392993cb452 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -223,7 +223,7 @@ asmlinkage void ret_from_fork_user(struct pt_regs *regs) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index f55f09cda6f889..b107dbca4ed7df 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -106,7 +106,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long new_stackp = args->stack; unsigned long tls = args->tls; struct fake_frame diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 92b6649d492952..62f753a85b89c7 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -89,7 +89,7 @@ asmlinkage void ret_from_kernel_thread(void); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 9c7c662cb5659e..5a28c0e91bf15f 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -260,7 +260,7 @@ extern void ret_from_kernel_thread(void); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); diff --git a/arch/sparc/kernel/process_64.c 
b/arch/sparc/kernel/process_64.c index 529adfecd58ca1..25781923788a03 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -567,7 +567,7 @@ void fault_in_user_windows(struct pt_regs *regs) */ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; struct thread_info *t = task_thread_info(p); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 1be644de9e41ec..9c9c66dc45f054 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -143,7 +143,7 @@ static void fork_handler(void) int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; void (*handler)(void); diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index c060549c6c9407..89004f4ca208da 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -11,7 +11,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); extern void fpu__drop(struct task_struct *tsk); -extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, +extern int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal, unsigned long shstk_addr); extern void fpu_flush_thread(void); diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index ba6f2fe438488d..0f50e01259430c 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -16,7 +16,7 @@ struct thread_shstk { long shstk_prctl(struct task_struct *task, int option, unsigned long arg2); void reset_thread_features(void); -unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags, +unsigned long shstk_alloc_thread_stack(struct task_struct *p, u64 clone_flags, unsigned long stack_size); void shstk_free(struct task_struct *p); int setup_signal_shadow_stack(struct ksignal *ksig); @@ -28,7 +28,7 @@ static inline long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { return -EINVAL; } static inline void reset_thread_features(void) {} static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p, - unsigned long clone_flags, + u64 clone_flags, unsigned long stack_size) { return 0; } static inline void shstk_free(struct task_struct *p) {} static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index aefd412a23dc24..1f71cc135e9ade 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -631,7 +631,7 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) } /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, +int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal, unsigned long ssp) { /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1b7960cf6eb0c1..e3a3987b0c4fb6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -159,7 +159,7 @@ __visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long sp = args->stack; 
unsigned long tls = args->tls; struct inactive_task_frame *frame; diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 2ddf23387c7ef7..5eba6c5a67757d 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -191,7 +191,7 @@ void reset_thread_features(void) current->thread.features_locked = 0; } -unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags, unsigned long stack_size) { struct thread_shstk *shstk = &tsk->thread.shstk; diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 7bd66677f7b6de..94d43f44be1315 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -267,7 +267,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp_thread_fn = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); From c6ac444ff20ce301020a41e35f1942c6524e0864 Mon Sep 17 00:00:00 2001 From: Simon Schuster Date: Mon, 1 Sep 2025 15:09:53 +0200 Subject: [PATCH 0671/3206] nios2: implement architecture-specific portion of sys_clone3 This commit adds the sys_clone3 entry point for nios2. An architecture-specific wrapper (__sys_clone3) is required to save and restore additional registers to the kernel stack via SAVE_SWITCH_STACK and RESTORE_SWITCH_STACK. Signed-off-by: Simon Schuster Link: https://lore.kernel.org/20250901-nios2-implement-clone3-v2-4-53fcf5577d57@siemens-energy.com Reviewed-by: Arnd Bergmann Signed-off-by: Christian Brauner --- arch/nios2/include/asm/syscalls.h | 1 + arch/nios2/include/asm/unistd.h | 2 -- arch/nios2/kernel/entry.S | 6 ++++++ arch/nios2/kernel/syscall_table.c | 1 + 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/nios2/include/asm/syscalls.h b/arch/nios2/include/asm/syscalls.h index b4d4ed3bf9c86c..0e214b0a0ac89f 100644 --- a/arch/nios2/include/asm/syscalls.h +++ b/arch/nios2/include/asm/syscalls.h @@ -7,6 +7,7 @@ int sys_cacheflush(unsigned long addr, unsigned long len, unsigned int op); +asmlinkage long __sys_clone3(struct clone_args __user *uargs, size_t size); #include <asm-generic/syscalls.h> diff --git a/arch/nios2/include/asm/unistd.h b/arch/nios2/include/asm/unistd.h index 1146e56473c512..213f6de3cf7b10 100644 --- a/arch/nios2/include/asm/unistd.h +++ b/arch/nios2/include/asm/unistd.h @@ -7,6 +7,4 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SET_GET_RLIMIT -#define __ARCH_BROKEN_SYS_CLONE3 - #endif diff --git a/arch/nios2/kernel/entry.S b/arch/nios2/kernel/entry.S index 99f0a65e62347e..dd40dfd908e59c 100644 --- a/arch/nios2/kernel/entry.S +++ b/arch/nios2/kernel/entry.S @@ -403,6 +403,12 @@ ENTRY(sys_clone) addi sp, sp, 4 RESTORE_SWITCH_STACK ret +/* long syscall(SYS_clone3, struct clone_args *cl_args, size_t size); */ +ENTRY(__sys_clone3) + SAVE_SWITCH_STACK + call sys_clone3 + RESTORE_SWITCH_STACK + ret ENTRY(sys_rt_sigreturn) SAVE_SWITCH_STACK diff --git a/arch/nios2/kernel/syscall_table.c b/arch/nios2/kernel/syscall_table.c index 434694067d8f55..c99818aac9e1b8 100644 --- a/arch/nios2/kernel/syscall_table.c +++ b/arch/nios2/kernel/syscall_table.c @@ -13,6 +13,7 @@ #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) #define sys_mmap2 sys_mmap_pgoff +#define sys_clone3 __sys_clone3 void *sys_call_table[__NR_syscalls] = { [0 ...
__NR_syscalls-1] = sys_ni_syscall, From d8e2f919997b14665e4509ef9a5278f291598d6e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 27 Aug 2025 15:00:08 +0200 Subject: [PATCH 0672/3206] selftests/futex: Fix some futex_numa_mpol subtests The "Memory out of range" subtest of futex_numa_mpol assumes that memory access outside of the mmap'ed area is invalid. That may not be the case depending on the actual memory layout of the test application. When that subtest was run on an x86-64 system with the latest upstream kernel, the test passed, as an error was returned from futex_wake(). On a PowerPC system, however, the same subtest failed because futex_wake() returned 0: Bail out! futex2_wake(64, 0x86) should fail, but didn't Looking further into the passed subtest on x86-64, it was found that -EINVAL was returned instead of -EFAULT. The -EINVAL error was returned because the node value test with FLAGS_NUMA set failed with a node value of 0x7f7f. IOW, the futex memory was accessible and futex_wake() failed because the supposed node number wasn't valid. If that memory location happens to hold a very small value (e.g. 0), the test will pass and no error will be returned. Since this subtest is non-deterministic, drop it unless a guard page beyond the mmap region is explicitly set. The other problematic test is the "Memory too small" test. The futex_wake() function returns the -EINVAL error code because the given futex address isn't 8-byte aligned, not because only 4 of the 8 bytes are valid and the other 4 bytes are not. So change the name of this subtest to "Mis-aligned futex" to reflect the reality. [ bp: Massage commit message. ] Fixes: 3163369407ba ("selftests/futex: Add futex_numa_mpol") Signed-off-by: Waiman Long Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/20250827130011.677600-3-bigeasy@linutronix.de --- tools/testing/selftests/futex/functional/futex_numa_mpol.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index a9ecfb2d3932ad..802c15c8219063 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -182,12 +182,10 @@ int main(int argc, char *argv[]) if (futex_numa->numa == FUTEX_NO_NODE) ksft_exit_fail_msg("NUMA node is left uninitialized\n"); - ksft_print_msg("Memory too small\n"); + /* FUTEX2_NUMA futex must be 8-byte aligned */ + ksft_print_msg("Mis-aligned futex\n"); test_futex(futex_ptr + mem_size - 4, 1); - ksft_print_msg("Memory out of range\n"); - test_futex(futex_ptr + mem_size, 1); - futex_numa->numa = FUTEX_NO_NODE; mprotect(futex_ptr, mem_size, PROT_READ); ksft_print_msg("Memory, RO\n"); From f8ef9c24029c85cd0328a9c668283017d8c292ad Mon Sep 17 00:00:00 2001 From: Nai-Chen Cheng Date: Wed, 27 Aug 2025 15:00:09 +0200 Subject: [PATCH 0673/3206] selftests/futex: Fix format-security warnings in futex_priv_hash Fix format-security warnings by using proper format strings when passing message variables to the ksft_exit_fail_msg(), ksft_test_result_pass(), and ksft_test_result_skip() functions. This prevents potential security issues and eliminates compiler warnings when building with -Wformat-security.
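For context, a minimal sketch of the class of bug that -Wformat-security flags (illustrative only, not taken from the selftest):

#include <stdio.h>

int main(void)
{
	const char *msg = "auto-resize reached 100% of buckets";

	/* printf(msg); -- unsafe: the '%' in msg is parsed as a conversion
	 * specifier and reads a nonexistent argument */
	printf("%s\n", msg); /* safe: msg is treated as plain data */
	return 0;
}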
Signed-off-by: Nai-Chen Cheng Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250827130011.677600-4-bigeasy@linutronix.de --- .../selftests/futex/functional/futex_priv_hash.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c index ec032faca6a91f..ffd60d03a992b7 100644 --- a/tools/testing/selftests/futex/functional/futex_priv_hash.c +++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c @@ -192,10 +192,10 @@ int main(int argc, char *argv[]) futex_slots1 = futex_hash_slots_get(); if (futex_slots1 <= 0) { ksft_print_msg("Current hash buckets: %d\n", futex_slots1); - ksft_exit_fail_msg(test_msg_auto_create); + ksft_exit_fail_msg("%s", test_msg_auto_create); } - ksft_test_result_pass(test_msg_auto_create); + ksft_test_result_pass("%s", test_msg_auto_create); online_cpus = sysconf(_SC_NPROCESSORS_ONLN); ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS + 1); @@ -236,11 +236,11 @@ int main(int argc, char *argv[]) } ksft_print_msg("Expected increase of hash buckets but got: %d -> %d\n", futex_slots1, futex_slotsn); - ksft_exit_fail_msg(test_msg_auto_inc); + ksft_exit_fail_msg("%s", test_msg_auto_inc); } - ksft_test_result_pass(test_msg_auto_inc); + ksft_test_result_pass("%s", test_msg_auto_inc); } else { - ksft_test_result_skip(test_msg_auto_inc); + ksft_test_result_skip("%s", test_msg_auto_inc); } ret = pthread_mutex_unlock(&global_lock); From 5fdb877b4916a1eb2b2c85018c2311855893405b Mon Sep 17 00:00:00 2001 From: Gopi Krishna Menon Date: Wed, 27 Aug 2025 15:00:10 +0200 Subject: [PATCH 0674/3206] selftests/futex: Fix typos and grammar in futex_priv_hash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix multiple typos and small grammar issues in help text, comments and test messages in the futex_priv_hash test. 
Signed-off-by: Gopi Krishna Menon Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov (AMD) Reviewed-by: André Almeida Link: https://lore.kernel.org/20250827130011.677600-5-bigeasy@linutronix.de --- .../testing/selftests/futex/functional/futex_priv_hash.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c index ffd60d03a992b7..95f01603a6813c 100644 --- a/tools/testing/selftests/futex/functional/futex_priv_hash.c +++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c @@ -188,7 +188,7 @@ int main(int argc, char *argv[]) if (ret != 0) ksft_exit_fail_msg("pthread_join() failed: %d, %m\n", ret); - /* First thread, has to initialiaze private hash */ + /* First thread, has to initialize private hash */ futex_slots1 = futex_hash_slots_get(); if (futex_slots1 <= 0) { ksft_print_msg("Current hash buckets: %d\n", futex_slots1); @@ -256,17 +256,17 @@ int main(int argc, char *argv[]) futex_hash_slots_set_verify(2); join_max_threads(); - ksft_test_result(counter == MAX_THREADS, "Created of waited for %d of %d threads\n", + ksft_test_result(counter == MAX_THREADS, "Created and waited for %d of %d threads\n", counter, MAX_THREADS); counter = 0; - /* Once the user set something, auto reisze must be disabled */ + /* Once the user set something, auto resize must be disabled */ ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS); create_max_threads(thread_lock_fn); join_max_threads(); ret = futex_hash_slots_get(); - ksft_test_result(ret == 2, "No more auto-resize after manaul setting, got %d\n", + ksft_test_result(ret == 2, "No more auto-resize after manual setting, got %d\n", ret); futex_hash_slots_set_must_fail(1 << 29); From 237bfb76c90b184f57bb18fe35ff366c19393dc8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 27 Aug 2025 15:00:11 +0200 Subject: [PATCH 0675/3206] selftests/futex: Fix futex_wait() for 32bit ARM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 32bit ARM systems gcc-12 will use 32bit timestamps while gcc-13 and later will use 64bit timestamps. The problem is that SYS_futex will continue pointing at the 32bit system call. This makes the futex_wait test fail like this: waiter failed errno 110 not ok 1 futex_wake private returned: 0 Success waiter failed errno 110 not ok 2 futex_wake shared (page anon) returned: 0 Success waiter failed errno 110 not ok 3 futex_wake shared (file backed) returned: 0 Success Instead of compiling differently depending on the gcc version, use the -D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64 options to ensure that 64bit timestamps are used. Then use ifdefs to make SYS_futex point to the 64bit system call. 
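As a quick illustration of the ABI switch (a sketch, not part of the patch; it merely mirrors the futextest.h logic below):

#include <stdio.h>
#include <time.h>

int main(void)
{
	/* With -D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64 this prints 8 even on
	 * 32bit glibc targets, so a struct timespec no longer matches the
	 * layout expected by the 32bit SYS_futex system call. */
	printf("sizeof(time_t) = %zu\n", sizeof(time_t));
#if defined(_TIME_BITS) && _TIME_BITS == 64
	printf("64bit timestamps in effect: route futex calls to SYS_futex_time64\n");
#endif
	return 0;
}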
Signed-off-by: Dan Carpenter Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov (AMD) Reviewed-by: André Almeida Tested-by: Anders Roxell Link: https://lore.kernel.org/20250827130011.677600-6-bigeasy@linutronix.de --- tools/testing/selftests/futex/functional/Makefile | 2 +- tools/testing/selftests/futex/include/futextest.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 8cfb87f7f7c505..ddfa61d857b9bf 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 INCLUDES := -I../include -I../../ $(KHDR_INCLUDES) -CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread $(INCLUDES) $(KHDR_INCLUDES) +CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread -D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64 $(INCLUDES) $(KHDR_INCLUDES) LDLIBS := -lpthread -lrt -lnuma LOCAL_HDRS := \ diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h index 7a5fd1d5355e7e..3d48e9789d9fe6 100644 --- a/tools/testing/selftests/futex/include/futextest.h +++ b/tools/testing/selftests/futex/include/futextest.h @@ -58,6 +58,17 @@ typedef volatile u_int32_t futex_t; #define SYS_futex SYS_futex_time64 #endif +/* + * On 32bit systems if we use "-D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64" or if + * we are using a newer compiler then the size of the timestamps will be 64bit, + * however, the SYS_futex will still point to the 32bit futex system call. + */ +#if __SIZEOF_POINTER__ == 4 && defined(SYS_futex_time64) && \ + defined(_TIME_BITS) && _TIME_BITS == 64 +# undef SYS_futex +# define SYS_futex SYS_futex_time64 +#endif + /** * futex() - SYS_futex syscall wrapper * @uaddr: address of first futex From a22d3b0d49d411e64ed07e30c2095035ecb30ed2 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 19 Aug 2025 08:56:22 +0200 Subject: [PATCH 0676/3206] phy: ti: gmii-sel: Always write the RGMII ID setting Some SoCs are only validated with the TX delay enabled. With commit ca13b249f291 ("net: ethernet: ti: am65-cpsw: fixup PHY mode for fixed RGMII TX delay"), the network driver will patch the delay setting on the fly, assuming that the TX delay setting is fixed. In reality, the TX delay is configurable; it is merely omitted from the documentation. There are bootloaders that disable the TX delay, which leads to a transmit path that doesn't add any delay at all. Fix that by always writing the RGMII_ID setting and reporting an error for unsupported RGMII delay modes. This is safe to do and shouldn't break any boards in mainline, because the fixed delay is only introduced for gmii-sel compatibles that are used together with the am65-cpsw-nuss driver, which also contains the commit above.
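A hedged sketch of the resulting policy (simplified; the function name and the rgmii_id convention are inferred from the driver change below, not authoritative):

#include <errno.h>
#include <stdbool.h>

/* rgmii_id is nonzero for RGMII submodes that ask the PHY wrapper to add
 * its own delay on top of the SoC's fixed TX delay. */
static int gmii_sel_check_rgmii(int rgmii_id, bool fixed_tx_delay)
{
	/* With a fixed TX delay, such modes cannot be honored and must be
	 * rejected instead of being silently mis-programmed. */
	if (fixed_tx_delay && rgmii_id != 0)
		return -EINVAL;
	return 0; /* the RGMII_ID field itself is now always written */
}

int main(void)
{
	/* RGMII with an extra internal delay on a fixed-delay SoC: rejected */
	return gmii_sel_check_rgmii(1, true) == -EINVAL ? 0 : 1;
}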
Fixes: ca13b249f291 ("net: ethernet: ti: am65-cpsw: fixup PHY mode for fixed RGMII TX delay") Signed-off-by: Michael Walle Reviewed-by: Maxime Chevallier Link: https://lore.kernel.org/r/20250819065622.1019537-1-mwalle@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/ti/phy-gmii-sel.c | 47 +++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/drivers/phy/ti/phy-gmii-sel.c b/drivers/phy/ti/phy-gmii-sel.c index ff5d5e29629fab..50adabb867cb12 100644 --- a/drivers/phy/ti/phy-gmii-sel.c +++ b/drivers/phy/ti/phy-gmii-sel.c @@ -34,6 +34,7 @@ enum { PHY_GMII_SEL_PORT_MODE = 0, PHY_GMII_SEL_RGMII_ID_MODE, PHY_GMII_SEL_RMII_IO_CLK_EN, + PHY_GMII_SEL_FIXED_TX_DELAY, PHY_GMII_SEL_LAST, }; @@ -127,6 +128,11 @@ static int phy_gmii_sel_mode(struct phy *phy, enum phy_mode mode, int submode) goto unsupported; } + /* With a fixed delay, some modes are not supported at all. */ + if (soc_data->features & BIT(PHY_GMII_SEL_FIXED_TX_DELAY) && + rgmii_id != 0) + return -EINVAL; + if_phy->phy_if_mode = submode; dev_dbg(dev, "%s id:%u mode:%u rgmii_id:%d rmii_clk_ext:%d\n", @@ -210,25 +216,46 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_soc_dm814 = { static const struct reg_field phy_gmii_sel_fields_am654[][PHY_GMII_SEL_LAST] = { - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x0, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x4, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x8, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0xC, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x10, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x14, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x18, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x1C, 0, 2), }, + { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x0, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x0, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x4, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x4, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x8, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x8, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0xC, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0xC, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x10, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x10, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x14, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x14, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x18, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x18, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x1C, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x1C, 4, 4), + }, }; static const struct phy_gmii_sel_soc_data phy_gmii_sel_soc_am654 = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, }; static const struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw5g_soc_j7200 = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII) | BIT(PHY_INTERFACE_MODE_USXGMII), @@ -239,6 +266,8 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw5g_soc_j7200 = { static const struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j721e = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | 
BIT(PHY_INTERFACE_MODE_SGMII), .num_ports = 8, @@ -248,6 +277,8 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j721e = { static const struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j784s4 = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII) | BIT(PHY_INTERFACE_MODE_USXGMII), From 6cb8c1f957f674ca20b7d7c96b1f1bb11b83b679 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Thu, 21 Aug 2025 10:01:47 +0200 Subject: [PATCH 0677/3206] phy: qcom: qmp-pcie: Fix PHY initialization when powered down by firmware Commit 0cc22f5a861c ("phy: qcom: qmp-pcie: Add PHY register retention support") added support for using the "no_csr" reset to skip configuration of the PHY if the init sequence was already applied by the boot firmware. The expectation is that the PHY is only turned on/off by using the "no_csr" reset, instead of powering it down and re-programming it after a full reset. The boot firmware on X1E does not fully conform to this expectation: If the PCIe3 link fails to come up (e.g. because no PCIe card is inserted), the firmware powers down the PHY using the QPHY_PCS_POWER_DOWN_CONTROL register. The QPHY_START_CTRL register is kept as-is, so the driver assumes the PHY is already initialized and skips the configuration/power up sequence. The PHY won't come up again without clearing the QPHY_PCS_POWER_DOWN_CONTROL, so eventually initialization fails: qcom-qmp-pcie-phy 1be0000.phy: phy initialization timed-out phy phy-1be0000.phy.0: phy poweron failed --> -110 qcom-pcie 1bd0000.pcie: cannot initialize host qcom-pcie 1bd0000.pcie: probe with driver qcom-pcie failed with error -110 This can be reliably reproduced on the X1E CRD, QCP and Devkit when no card is inserted for PCIe3. Fix this by checking the QPHY_PCS_POWER_DOWN_CONTROL register in addition to QPHY_START_CTRL. If the PHY is powered down with the register, it doesn't conform to the expectations for using the "no_csr" reset, so we fully re-initialize with the normal reset sequence. Also check the register more carefully to ensure all of the bits we expect are actually set. A simple !!(readl()) is not enough, because the PHY might be only partially set up with some of the expected bits set. 
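To see why the exact-mask test matters, compare it with the old truthiness test on a partially programmed register (bit assignments are illustrative):

#include <stdio.h>

#define SERDES_START 0x1 /* illustrative bit values */
#define PCS_START    0x2

static int checkbits(unsigned int reg, unsigned int mask)
{
	return (reg & mask) == mask;
}

int main(void)
{
	unsigned int reg = SERDES_START; /* PHY only partially started */
	unsigned int want = SERDES_START | PCS_START;

	/* prints "naive=1 exact=0": !!reg wrongly reports the PHY as
	 * initialized, the exact-mask check does not */
	printf("naive=%d exact=%d\n", !!reg, checkbits(reg, want));
	return 0;
}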
Cc: stable@vger.kernel.org Fixes: 0cc22f5a861c ("phy: qcom: qmp-pcie: Add PHY register retention support") Signed-off-by: Stephan Gerhold Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250821-phy-qcom-qmp-pcie-nocsr-fix-v3-1-4898db0cc07c@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-pcie.c | 25 ++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c index 95830dcfdec9b1..0fa63b734b67b8 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c @@ -3067,6 +3067,14 @@ struct qmp_pcie { struct clk_fixed_rate aux_clk_fixed; }; +static bool qphy_checkbits(const void __iomem *base, u32 offset, u32 val) +{ + u32 reg; + + reg = readl(base + offset); + return (reg & val) == val; +} + static inline void qphy_setbits(void __iomem *base, u32 offset, u32 val) { u32 reg; @@ -4339,16 +4347,21 @@ static int qmp_pcie_init(struct phy *phy) struct qmp_pcie *qmp = phy_get_drvdata(phy); const struct qmp_phy_cfg *cfg = qmp->cfg; void __iomem *pcs = qmp->pcs; - bool phy_initialized = !!(readl(pcs + cfg->regs[QPHY_START_CTRL])); int ret; - qmp->skip_init = qmp->nocsr_reset && phy_initialized; /* - * We need to check the existence of init sequences in two cases: - * 1. The PHY doesn't support no_csr reset. - * 2. The PHY supports no_csr reset but isn't initialized by bootloader. - * As we can't skip init in these two cases. + * We can skip PHY initialization if all of the following conditions + * are met: + * 1. The PHY supports the nocsr_reset that preserves the PHY config. + * 2. The PHY was started (and not powered down again) by the + * bootloader, with all of the expected bits set correctly. + * In this case, we can continue without having the init sequence + * defined in the driver. */ + qmp->skip_init = qmp->nocsr_reset && + qphy_checkbits(pcs, cfg->regs[QPHY_START_CTRL], SERDES_START | PCS_START) && + qphy_checkbits(pcs, cfg->regs[QPHY_PCS_POWER_DOWN_CONTROL], cfg->pwrdn_ctrl); + if (!qmp->skip_init && !cfg->tbls.serdes_num) { dev_err(qmp->dev, "Init sequence not available\n"); return -ENODATA; From ed78a01887e2257cff0412b640db68b70a2654dc Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 29 Aug 2025 15:36:27 -0700 Subject: [PATCH 0678/3206] rust: pci: provide access to PCI Class and Class-related items Allow callers to write Class::STORAGE_SCSI instead of bindings::PCI_CLASS_STORAGE_SCSI, for example. New APIs: Class::STORAGE_SCSI, Class::NETWORK_ETHERNET, etc. Class::from_raw() -- Only callable from pci module. Class::as_raw() ClassMask: Full, ClassSubclass Device::pci_class() Cc: Danilo Krummrich Cc: Elle Rhumsaa Reviewed-by: Alexandre Courbot Signed-off-by: John Hubbard Link: https://lore.kernel.org/r/20250829223632.144030-2-jhubbard@nvidia.com [ Minor doc-comment improvements, align Debug and Display. 
- Danilo ] Signed-off-by: Danilo Krummrich --- MAINTAINERS | 1 + rust/kernel/pci.rs | 11 ++ rust/kernel/pci/id.rs | 236 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 248 insertions(+) create mode 100644 rust/kernel/pci/id.rs diff --git a/MAINTAINERS b/MAINTAINERS index 494aa08165330f..b4528c1ce4453b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19575,6 +19575,7 @@ C: irc://irc.oftc.net/linux-pci T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git F: rust/helpers/pci.c F: rust/kernel/pci.rs +F: rust/kernel/pci/ F: samples/rust/rust_driver_pci.rs PCIE BANDWIDTH CONTROLLER diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index cae4e274f7766f..72d38f23f91486 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -24,6 +24,10 @@ use core::{ }; use kernel::prelude::*; +mod id; + +pub use self::id::{Class, ClassMask}; + /// An adapter for the registration of PCI drivers. pub struct Adapter<T: Driver>(T); @@ -459,6 +463,13 @@ impl Device { // - by its type invariant `self.as_raw` is always a valid pointer to a `struct pci_dev`. Ok(unsafe { bindings::pci_resource_len(self.as_raw(), bar.try_into()?) }) } + + /// Returns the PCI class as a `Class` struct. + #[inline] + pub fn pci_class(&self) -> Class { + // SAFETY: `self.as_raw` is a valid pointer to a `struct pci_dev`. + Class::from_raw(unsafe { (*self.as_raw()).class }) + } } impl Device { diff --git a/rust/kernel/pci/id.rs b/rust/kernel/pci/id.rs new file mode 100644 index 00000000000000..f534133aed3def --- /dev/null +++ b/rust/kernel/pci/id.rs @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! PCI device identifiers and related types. +//! +//! This module contains PCI class codes and supporting types. + +use crate::{bindings, error::code::EINVAL, error::Error, prelude::*}; +use core::fmt; + +/// PCI device class codes. +/// +/// Each entry contains the full 24-bit PCI class code (base class in bits +/// 23-16, subclass in bits 15-8, programming interface in bits 7-0). +/// +/// # Examples +/// +/// ``` +/// # use kernel::{device::Core, pci::{self, Class}, prelude::*}; +/// fn probe_device(pdev: &pci::Device<Core>) -> Result { +/// let pci_class = pdev.pci_class(); +/// dev_info!( +/// pdev.as_ref(), +/// "Detected PCI class: {}\n", +/// pci_class +/// ); +/// Ok(()) +/// } +/// ``` +#[derive(Clone, Copy, PartialEq, Eq)] +#[repr(transparent)] +pub struct Class(u32); + +/// PCI class mask constants for matching [`Class`] codes. +#[repr(u32)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ClassMask { + /// Match the full 24-bit class code. + Full = 0xffffff, + /// Match the upper 16 bits of the class code (base class and subclass only) + ClassSubclass = 0xffff00, +} + +macro_rules! define_all_pci_classes { + ( + $($variant:ident = $binding:expr,)+ + ) => { + impl Class { + $( + #[allow(missing_docs)] + pub const $variant: Self = Self(Self::to_24bit_class($binding)); + )+ + } + }; +} + +/// Once constructed, a [`Class`] contains a valid PCI class code. +impl Class { + /// Create a [`Class`] from a raw 24-bit class code. + #[inline] + pub(super) fn from_raw(class_code: u32) -> Self { + Self(class_code) + } + + /// Get the raw 24-bit class code value. + #[inline] + pub const fn as_raw(self) -> u32 { + self.0 + } + + // Converts a PCI class constant to 24-bit format. + // + // Many device drivers use only the upper 16 bits (base class and subclass), + // but some use the full 24 bits. In order to support both cases, store the + // class code as a 24-bit value, where 16-bit values are shifted up 8 bits.
+ const fn to_24bit_class(val: u32) -> u32 { + if val > 0xFFFF { + val + } else { + val << 8 + } + } +} + +impl fmt::Debug for Class { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "0x{:06x}", self.0) + } +} + +impl fmt::Display for Class { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + <Self as fmt::Debug>::fmt(self, f) + } +} + +impl ClassMask { + /// Get the raw mask value. + #[inline] + pub const fn as_raw(self) -> u32 { + self as u32 + } +} + +impl TryFrom<u32> for ClassMask { + type Error = Error; + + fn try_from(value: u32) -> Result<Self> { + match value { + 0xffffff => Ok(ClassMask::Full), + 0xffff00 => Ok(ClassMask::ClassSubclass), + _ => Err(EINVAL), + } + } +} + +define_all_pci_classes! { + NOT_DEFINED = bindings::PCI_CLASS_NOT_DEFINED, // 0x000000 + NOT_DEFINED_VGA = bindings::PCI_CLASS_NOT_DEFINED_VGA, // 0x000100 + + STORAGE_SCSI = bindings::PCI_CLASS_STORAGE_SCSI, // 0x010000 + STORAGE_IDE = bindings::PCI_CLASS_STORAGE_IDE, // 0x010100 + STORAGE_FLOPPY = bindings::PCI_CLASS_STORAGE_FLOPPY, // 0x010200 + STORAGE_IPI = bindings::PCI_CLASS_STORAGE_IPI, // 0x010300 + STORAGE_RAID = bindings::PCI_CLASS_STORAGE_RAID, // 0x010400 + STORAGE_SATA = bindings::PCI_CLASS_STORAGE_SATA, // 0x010600 + STORAGE_SATA_AHCI = bindings::PCI_CLASS_STORAGE_SATA_AHCI, // 0x010601 + STORAGE_SAS = bindings::PCI_CLASS_STORAGE_SAS, // 0x010700 + STORAGE_EXPRESS = bindings::PCI_CLASS_STORAGE_EXPRESS, // 0x010802 + STORAGE_OTHER = bindings::PCI_CLASS_STORAGE_OTHER, // 0x018000 + + NETWORK_ETHERNET = bindings::PCI_CLASS_NETWORK_ETHERNET, // 0x020000 + NETWORK_TOKEN_RING = bindings::PCI_CLASS_NETWORK_TOKEN_RING, // 0x020100 + NETWORK_FDDI = bindings::PCI_CLASS_NETWORK_FDDI, // 0x020200 + NETWORK_ATM = bindings::PCI_CLASS_NETWORK_ATM, // 0x020300 + NETWORK_OTHER = bindings::PCI_CLASS_NETWORK_OTHER, // 0x028000 + + DISPLAY_VGA = bindings::PCI_CLASS_DISPLAY_VGA, // 0x030000 + DISPLAY_XGA = bindings::PCI_CLASS_DISPLAY_XGA, // 0x030100 + DISPLAY_3D = bindings::PCI_CLASS_DISPLAY_3D, // 0x030200 + DISPLAY_OTHER = bindings::PCI_CLASS_DISPLAY_OTHER, // 0x038000 + + MULTIMEDIA_VIDEO = bindings::PCI_CLASS_MULTIMEDIA_VIDEO, // 0x040000 + MULTIMEDIA_AUDIO = bindings::PCI_CLASS_MULTIMEDIA_AUDIO, // 0x040100 + MULTIMEDIA_PHONE = bindings::PCI_CLASS_MULTIMEDIA_PHONE, // 0x040200 + MULTIMEDIA_HD_AUDIO = bindings::PCI_CLASS_MULTIMEDIA_HD_AUDIO, // 0x040300 + MULTIMEDIA_OTHER = bindings::PCI_CLASS_MULTIMEDIA_OTHER, // 0x048000 + + MEMORY_RAM = bindings::PCI_CLASS_MEMORY_RAM, // 0x050000 + MEMORY_FLASH = bindings::PCI_CLASS_MEMORY_FLASH, // 0x050100 + MEMORY_CXL = bindings::PCI_CLASS_MEMORY_CXL, // 0x050200 + MEMORY_OTHER = bindings::PCI_CLASS_MEMORY_OTHER, // 0x058000 + + BRIDGE_HOST = bindings::PCI_CLASS_BRIDGE_HOST, // 0x060000 + BRIDGE_ISA = bindings::PCI_CLASS_BRIDGE_ISA, // 0x060100 + BRIDGE_EISA = bindings::PCI_CLASS_BRIDGE_EISA, // 0x060200 + BRIDGE_MC = bindings::PCI_CLASS_BRIDGE_MC, // 0x060300 + BRIDGE_PCI_NORMAL = bindings::PCI_CLASS_BRIDGE_PCI_NORMAL, // 0x060400 + BRIDGE_PCI_SUBTRACTIVE = bindings::PCI_CLASS_BRIDGE_PCI_SUBTRACTIVE, // 0x060401 + BRIDGE_PCMCIA = bindings::PCI_CLASS_BRIDGE_PCMCIA, // 0x060500 + BRIDGE_NUBUS = bindings::PCI_CLASS_BRIDGE_NUBUS, // 0x060600 + BRIDGE_CARDBUS = bindings::PCI_CLASS_BRIDGE_CARDBUS, // 0x060700 + BRIDGE_RACEWAY = bindings::PCI_CLASS_BRIDGE_RACEWAY, // 0x060800 + BRIDGE_OTHER = bindings::PCI_CLASS_BRIDGE_OTHER, // 0x068000 + + COMMUNICATION_SERIAL = bindings::PCI_CLASS_COMMUNICATION_SERIAL, // 0x070000 + COMMUNICATION_PARALLEL =
bindings::PCI_CLASS_COMMUNICATION_PARALLEL, // 0x070100 + COMMUNICATION_MULTISERIAL = bindings::PCI_CLASS_COMMUNICATION_MULTISERIAL, // 0x070200 + COMMUNICATION_MODEM = bindings::PCI_CLASS_COMMUNICATION_MODEM, // 0x070300 + COMMUNICATION_OTHER = bindings::PCI_CLASS_COMMUNICATION_OTHER, // 0x078000 + + SYSTEM_PIC = bindings::PCI_CLASS_SYSTEM_PIC, // 0x080000 + SYSTEM_PIC_IOAPIC = bindings::PCI_CLASS_SYSTEM_PIC_IOAPIC, // 0x080010 + SYSTEM_PIC_IOXAPIC = bindings::PCI_CLASS_SYSTEM_PIC_IOXAPIC, // 0x080020 + SYSTEM_DMA = bindings::PCI_CLASS_SYSTEM_DMA, // 0x080100 + SYSTEM_TIMER = bindings::PCI_CLASS_SYSTEM_TIMER, // 0x080200 + SYSTEM_RTC = bindings::PCI_CLASS_SYSTEM_RTC, // 0x080300 + SYSTEM_PCI_HOTPLUG = bindings::PCI_CLASS_SYSTEM_PCI_HOTPLUG, // 0x080400 + SYSTEM_SDHCI = bindings::PCI_CLASS_SYSTEM_SDHCI, // 0x080500 + SYSTEM_RCEC = bindings::PCI_CLASS_SYSTEM_RCEC, // 0x080700 + SYSTEM_OTHER = bindings::PCI_CLASS_SYSTEM_OTHER, // 0x088000 + + INPUT_KEYBOARD = bindings::PCI_CLASS_INPUT_KEYBOARD, // 0x090000 + INPUT_PEN = bindings::PCI_CLASS_INPUT_PEN, // 0x090100 + INPUT_MOUSE = bindings::PCI_CLASS_INPUT_MOUSE, // 0x090200 + INPUT_SCANNER = bindings::PCI_CLASS_INPUT_SCANNER, // 0x090300 + INPUT_GAMEPORT = bindings::PCI_CLASS_INPUT_GAMEPORT, // 0x090400 + INPUT_OTHER = bindings::PCI_CLASS_INPUT_OTHER, // 0x098000 + + DOCKING_GENERIC = bindings::PCI_CLASS_DOCKING_GENERIC, // 0x0a0000 + DOCKING_OTHER = bindings::PCI_CLASS_DOCKING_OTHER, // 0x0a8000 + + PROCESSOR_386 = bindings::PCI_CLASS_PROCESSOR_386, // 0x0b0000 + PROCESSOR_486 = bindings::PCI_CLASS_PROCESSOR_486, // 0x0b0100 + PROCESSOR_PENTIUM = bindings::PCI_CLASS_PROCESSOR_PENTIUM, // 0x0b0200 + PROCESSOR_ALPHA = bindings::PCI_CLASS_PROCESSOR_ALPHA, // 0x0b1000 + PROCESSOR_POWERPC = bindings::PCI_CLASS_PROCESSOR_POWERPC, // 0x0b2000 + PROCESSOR_MIPS = bindings::PCI_CLASS_PROCESSOR_MIPS, // 0x0b3000 + PROCESSOR_CO = bindings::PCI_CLASS_PROCESSOR_CO, // 0x0b4000 + + SERIAL_FIREWIRE = bindings::PCI_CLASS_SERIAL_FIREWIRE, // 0x0c0000 + SERIAL_FIREWIRE_OHCI = bindings::PCI_CLASS_SERIAL_FIREWIRE_OHCI, // 0x0c0010 + SERIAL_ACCESS = bindings::PCI_CLASS_SERIAL_ACCESS, // 0x0c0100 + SERIAL_SSA = bindings::PCI_CLASS_SERIAL_SSA, // 0x0c0200 + SERIAL_USB_UHCI = bindings::PCI_CLASS_SERIAL_USB_UHCI, // 0x0c0300 + SERIAL_USB_OHCI = bindings::PCI_CLASS_SERIAL_USB_OHCI, // 0x0c0310 + SERIAL_USB_EHCI = bindings::PCI_CLASS_SERIAL_USB_EHCI, // 0x0c0320 + SERIAL_USB_XHCI = bindings::PCI_CLASS_SERIAL_USB_XHCI, // 0x0c0330 + SERIAL_USB_CDNS = bindings::PCI_CLASS_SERIAL_USB_CDNS, // 0x0c0380 + SERIAL_USB_DEVICE = bindings::PCI_CLASS_SERIAL_USB_DEVICE, // 0x0c03fe + SERIAL_FIBER = bindings::PCI_CLASS_SERIAL_FIBER, // 0x0c0400 + SERIAL_SMBUS = bindings::PCI_CLASS_SERIAL_SMBUS, // 0x0c0500 + SERIAL_IPMI_SMIC = bindings::PCI_CLASS_SERIAL_IPMI_SMIC, // 0x0c0700 + SERIAL_IPMI_KCS = bindings::PCI_CLASS_SERIAL_IPMI_KCS, // 0x0c0701 + SERIAL_IPMI_BT = bindings::PCI_CLASS_SERIAL_IPMI_BT, // 0x0c0702 + + WIRELESS_RF_CONTROLLER = bindings::PCI_CLASS_WIRELESS_RF_CONTROLLER, // 0x0d1000 + WIRELESS_WHCI = bindings::PCI_CLASS_WIRELESS_WHCI, // 0x0d1010 + + INTELLIGENT_I2O = bindings::PCI_CLASS_INTELLIGENT_I2O, // 0x0e0000 + + SATELLITE_TV = bindings::PCI_CLASS_SATELLITE_TV, // 0x0f0000 + SATELLITE_AUDIO = bindings::PCI_CLASS_SATELLITE_AUDIO, // 0x0f0100 + SATELLITE_VOICE = bindings::PCI_CLASS_SATELLITE_VOICE, // 0x0f0300 + SATELLITE_DATA = bindings::PCI_CLASS_SATELLITE_DATA, // 0x0f0400 + + CRYPT_NETWORK = bindings::PCI_CLASS_CRYPT_NETWORK, // 0x100000 + CRYPT_ENTERTAINMENT = 
bindings::PCI_CLASS_CRYPT_ENTERTAINMENT, // 0x100100 + CRYPT_OTHER = bindings::PCI_CLASS_CRYPT_OTHER, // 0x108000 + + SP_DPIO = bindings::PCI_CLASS_SP_DPIO, // 0x110000 + SP_OTHER = bindings::PCI_CLASS_SP_OTHER, // 0x118000 + + ACCELERATOR_PROCESSING = bindings::PCI_CLASS_ACCELERATOR_PROCESSING, // 0x120000 + + OTHERS = bindings::PCI_CLASS_OTHERS, // 0xff0000 +} From 5e20962a9fc8a0b5b91f0989d3baf03f02bc99cb Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 29 Aug 2025 15:36:28 -0700 Subject: [PATCH 0679/3206] rust: pci: provide access to PCI Vendor values This allows callers to write Vendor::SOME_COMPANY instead of bindings::PCI_VENDOR_ID_SOME_COMPANY. New APIs: Vendor::SOME_COMPANY Vendor::from_raw() -- Only accessible from the pci (parent) module. Vendor::as_raw() Vendor: fmt::Display for Vendor Cc: Danilo Krummrich Cc: Elle Rhumsaa Reviewed-by: Alexandre Courbot Signed-off-by: John Hubbard Link: https://lore.kernel.org/r/20250829223632.144030-3-jhubbard@nvidia.com [ Minor doc-comment improvements, align Debug and Display. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/pci.rs | 2 +- rust/kernel/pci/id.rs | 336 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 336 insertions(+), 2 deletions(-) diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 72d38f23f91486..4f7ef82492ae58 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -26,7 +26,7 @@ use kernel::prelude::*; mod id; -pub use self::id::{Class, ClassMask}; +pub use self::id::{Class, ClassMask, Vendor}; /// An adapter for the registration of PCI drivers. pub struct Adapter<T: Driver>(T); diff --git a/rust/kernel/pci/id.rs b/rust/kernel/pci/id.rs index f534133aed3def..f56a024f80aa87 100644 --- a/rust/kernel/pci/id.rs +++ b/rust/kernel/pci/id.rs @@ -2,7 +2,7 @@ //! PCI device identifiers and related types. //! -//! This module contains PCI class codes and supporting types. +//! This module contains PCI class codes, Vendor IDs, and supporting types. use crate::{bindings, error::code::EINVAL, error::Error, prelude::*}; use core::fmt; @@ -113,6 +113,56 @@ impl TryFrom<u32> for ClassMask { } } +/// PCI vendor IDs. +/// +/// Each entry contains the 16-bit PCI vendor ID as assigned by the PCI SIG. +#[derive(Clone, Copy, PartialEq, Eq)] +#[repr(transparent)] +pub struct Vendor(u16); + +macro_rules! define_all_pci_vendors { + ( + $($variant:ident = $binding:expr,)+ + ) => { + impl Vendor { + $( + #[allow(missing_docs)] + pub const $variant: Self = Self($binding as u16); + )+ + } + }; +} + +/// Once constructed, a `Vendor` contains a valid PCI Vendor ID. +impl Vendor { + /// Create a Vendor from a raw 16-bit vendor ID. + #[expect(dead_code)] + #[inline] + pub(super) fn from_raw(vendor_id: u16) -> Self { + Self(vendor_id) + } + + /// Get the raw 16-bit vendor ID value. + #[inline] + pub const fn as_raw(self) -> u16 { + self.0 + } +} + +impl fmt::Debug for Vendor { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "0x{:04x}", self.0) + } +} + +impl fmt::Display for Vendor { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + <Self as fmt::Debug>::fmt(self, f) + } +} + define_all_pci_classes! { NOT_DEFINED = bindings::PCI_CLASS_NOT_DEFINED, // 0x000000 NOT_DEFINED_VGA = bindings::PCI_CLASS_NOT_DEFINED_VGA, // 0x000100 @@ -234,3 +284,287 @@ define_all_pci_classes! { OTHERS = bindings::PCI_CLASS_OTHERS, // 0xff0000 } + +define_all_pci_vendors!
{ + PCI_SIG = bindings::PCI_VENDOR_ID_PCI_SIG, // 0x0001 + LOONGSON = bindings::PCI_VENDOR_ID_LOONGSON, // 0x0014 + SOLIDIGM = bindings::PCI_VENDOR_ID_SOLIDIGM, // 0x025e + TTTECH = bindings::PCI_VENDOR_ID_TTTECH, // 0x0357 + DYNALINK = bindings::PCI_VENDOR_ID_DYNALINK, // 0x0675 + UBIQUITI = bindings::PCI_VENDOR_ID_UBIQUITI, // 0x0777 + BERKOM = bindings::PCI_VENDOR_ID_BERKOM, // 0x0871 + ITTIM = bindings::PCI_VENDOR_ID_ITTIM, // 0x0b48 + COMPAQ = bindings::PCI_VENDOR_ID_COMPAQ, // 0x0e11 + LSI_LOGIC = bindings::PCI_VENDOR_ID_LSI_LOGIC, // 0x1000 + ATI = bindings::PCI_VENDOR_ID_ATI, // 0x1002 + VLSI = bindings::PCI_VENDOR_ID_VLSI, // 0x1004 + ADL = bindings::PCI_VENDOR_ID_ADL, // 0x1005 + NS = bindings::PCI_VENDOR_ID_NS, // 0x100b + TSENG = bindings::PCI_VENDOR_ID_TSENG, // 0x100c + WEITEK = bindings::PCI_VENDOR_ID_WEITEK, // 0x100e + DEC = bindings::PCI_VENDOR_ID_DEC, // 0x1011 + CIRRUS = bindings::PCI_VENDOR_ID_CIRRUS, // 0x1013 + IBM = bindings::PCI_VENDOR_ID_IBM, // 0x1014 + UNISYS = bindings::PCI_VENDOR_ID_UNISYS, // 0x1018 + COMPEX2 = bindings::PCI_VENDOR_ID_COMPEX2, // 0x101a + WD = bindings::PCI_VENDOR_ID_WD, // 0x101c + AMI = bindings::PCI_VENDOR_ID_AMI, // 0x101e + AMD = bindings::PCI_VENDOR_ID_AMD, // 0x1022 + TRIDENT = bindings::PCI_VENDOR_ID_TRIDENT, // 0x1023 + AI = bindings::PCI_VENDOR_ID_AI, // 0x1025 + DELL = bindings::PCI_VENDOR_ID_DELL, // 0x1028 + MATROX = bindings::PCI_VENDOR_ID_MATROX, // 0x102B + MOBILITY_ELECTRONICS = bindings::PCI_VENDOR_ID_MOBILITY_ELECTRONICS, // 0x14f2 + CT = bindings::PCI_VENDOR_ID_CT, // 0x102c + MIRO = bindings::PCI_VENDOR_ID_MIRO, // 0x1031 + NEC = bindings::PCI_VENDOR_ID_NEC, // 0x1033 + FD = bindings::PCI_VENDOR_ID_FD, // 0x1036 + SI = bindings::PCI_VENDOR_ID_SI, // 0x1039 + HP = bindings::PCI_VENDOR_ID_HP, // 0x103c + HP_3PAR = bindings::PCI_VENDOR_ID_HP_3PAR, // 0x1590 + PCTECH = bindings::PCI_VENDOR_ID_PCTECH, // 0x1042 + ASUSTEK = bindings::PCI_VENDOR_ID_ASUSTEK, // 0x1043 + DPT = bindings::PCI_VENDOR_ID_DPT, // 0x1044 + OPTI = bindings::PCI_VENDOR_ID_OPTI, // 0x1045 + ELSA = bindings::PCI_VENDOR_ID_ELSA, // 0x1048 + STMICRO = bindings::PCI_VENDOR_ID_STMICRO, // 0x104A + BUSLOGIC = bindings::PCI_VENDOR_ID_BUSLOGIC, // 0x104B + TI = bindings::PCI_VENDOR_ID_TI, // 0x104c + SONY = bindings::PCI_VENDOR_ID_SONY, // 0x104d + WINBOND2 = bindings::PCI_VENDOR_ID_WINBOND2, // 0x1050 + ANIGMA = bindings::PCI_VENDOR_ID_ANIGMA, // 0x1051 + EFAR = bindings::PCI_VENDOR_ID_EFAR, // 0x1055 + MOTOROLA = bindings::PCI_VENDOR_ID_MOTOROLA, // 0x1057 + PROMISE = bindings::PCI_VENDOR_ID_PROMISE, // 0x105a + FOXCONN = bindings::PCI_VENDOR_ID_FOXCONN, // 0x105b + UMC = bindings::PCI_VENDOR_ID_UMC, // 0x1060 + PICOPOWER = bindings::PCI_VENDOR_ID_PICOPOWER, // 0x1066 + MYLEX = bindings::PCI_VENDOR_ID_MYLEX, // 0x1069 + APPLE = bindings::PCI_VENDOR_ID_APPLE, // 0x106b + YAMAHA = bindings::PCI_VENDOR_ID_YAMAHA, // 0x1073 + QLOGIC = bindings::PCI_VENDOR_ID_QLOGIC, // 0x1077 + CYRIX = bindings::PCI_VENDOR_ID_CYRIX, // 0x1078 + CONTAQ = bindings::PCI_VENDOR_ID_CONTAQ, // 0x1080 + OLICOM = bindings::PCI_VENDOR_ID_OLICOM, // 0x108d + SUN = bindings::PCI_VENDOR_ID_SUN, // 0x108e + NI = bindings::PCI_VENDOR_ID_NI, // 0x1093 + CMD = bindings::PCI_VENDOR_ID_CMD, // 0x1095 + BROOKTREE = bindings::PCI_VENDOR_ID_BROOKTREE, // 0x109e + SGI = bindings::PCI_VENDOR_ID_SGI, // 0x10a9 + WINBOND = bindings::PCI_VENDOR_ID_WINBOND, // 0x10ad + PLX = bindings::PCI_VENDOR_ID_PLX, // 0x10b5 + MADGE = bindings::PCI_VENDOR_ID_MADGE, // 0x10b6 + THREECOM = bindings::PCI_VENDOR_ID_3COM, // 
0x10b7 + AL = bindings::PCI_VENDOR_ID_AL, // 0x10b9 + NEOMAGIC = bindings::PCI_VENDOR_ID_NEOMAGIC, // 0x10c8 + TCONRAD = bindings::PCI_VENDOR_ID_TCONRAD, // 0x10da + ROHM = bindings::PCI_VENDOR_ID_ROHM, // 0x10db + NVIDIA = bindings::PCI_VENDOR_ID_NVIDIA, // 0x10de + IMS = bindings::PCI_VENDOR_ID_IMS, // 0x10e0 + AMCC = bindings::PCI_VENDOR_ID_AMCC, // 0x10e8 + AMPERE = bindings::PCI_VENDOR_ID_AMPERE, // 0x1def + INTERG = bindings::PCI_VENDOR_ID_INTERG, // 0x10ea + REALTEK = bindings::PCI_VENDOR_ID_REALTEK, // 0x10ec + XILINX = bindings::PCI_VENDOR_ID_XILINX, // 0x10ee + INIT = bindings::PCI_VENDOR_ID_INIT, // 0x1101 + CREATIVE = bindings::PCI_VENDOR_ID_CREATIVE, // 0x1102 + TTI = bindings::PCI_VENDOR_ID_TTI, // 0x1103 + SIGMA = bindings::PCI_VENDOR_ID_SIGMA, // 0x1105 + VIA = bindings::PCI_VENDOR_ID_VIA, // 0x1106 + SIEMENS = bindings::PCI_VENDOR_ID_SIEMENS, // 0x110A + VORTEX = bindings::PCI_VENDOR_ID_VORTEX, // 0x1119 + EF = bindings::PCI_VENDOR_ID_EF, // 0x111a + IDT = bindings::PCI_VENDOR_ID_IDT, // 0x111d + FORE = bindings::PCI_VENDOR_ID_FORE, // 0x1127 + PHILIPS = bindings::PCI_VENDOR_ID_PHILIPS, // 0x1131 + EICON = bindings::PCI_VENDOR_ID_EICON, // 0x1133 + CISCO = bindings::PCI_VENDOR_ID_CISCO, // 0x1137 + ZIATECH = bindings::PCI_VENDOR_ID_ZIATECH, // 0x1138 + SYSKONNECT = bindings::PCI_VENDOR_ID_SYSKONNECT, // 0x1148 + DIGI = bindings::PCI_VENDOR_ID_DIGI, // 0x114f + XIRCOM = bindings::PCI_VENDOR_ID_XIRCOM, // 0x115d + SERVERWORKS = bindings::PCI_VENDOR_ID_SERVERWORKS, // 0x1166 + ALTERA = bindings::PCI_VENDOR_ID_ALTERA, // 0x1172 + SBE = bindings::PCI_VENDOR_ID_SBE, // 0x1176 + TOSHIBA = bindings::PCI_VENDOR_ID_TOSHIBA, // 0x1179 + TOSHIBA_2 = bindings::PCI_VENDOR_ID_TOSHIBA_2, // 0x102f + ATTO = bindings::PCI_VENDOR_ID_ATTO, // 0x117c + RICOH = bindings::PCI_VENDOR_ID_RICOH, // 0x1180 + DLINK = bindings::PCI_VENDOR_ID_DLINK, // 0x1186 + ARTOP = bindings::PCI_VENDOR_ID_ARTOP, // 0x1191 + ZEITNET = bindings::PCI_VENDOR_ID_ZEITNET, // 0x1193 + FUJITSU_ME = bindings::PCI_VENDOR_ID_FUJITSU_ME, // 0x119e + MARVELL = bindings::PCI_VENDOR_ID_MARVELL, // 0x11ab + MARVELL_EXT = bindings::PCI_VENDOR_ID_MARVELL_EXT, // 0x1b4b + V3 = bindings::PCI_VENDOR_ID_V3, // 0x11b0 + ATT = bindings::PCI_VENDOR_ID_ATT, // 0x11c1 + SPECIALIX = bindings::PCI_VENDOR_ID_SPECIALIX, // 0x11cb + ANALOG_DEVICES = bindings::PCI_VENDOR_ID_ANALOG_DEVICES, // 0x11d4 + ZORAN = bindings::PCI_VENDOR_ID_ZORAN, // 0x11de + COMPEX = bindings::PCI_VENDOR_ID_COMPEX, // 0x11f6 + MICROSEMI = bindings::PCI_VENDOR_ID_MICROSEMI, // 0x11f8 + RP = bindings::PCI_VENDOR_ID_RP, // 0x11fe + CYCLADES = bindings::PCI_VENDOR_ID_CYCLADES, // 0x120e + ESSENTIAL = bindings::PCI_VENDOR_ID_ESSENTIAL, // 0x120f + O2 = bindings::PCI_VENDOR_ID_O2, // 0x1217 + THREEDX = bindings::PCI_VENDOR_ID_3DFX, // 0x121a + AVM = bindings::PCI_VENDOR_ID_AVM, // 0x1244 + STALLION = bindings::PCI_VENDOR_ID_STALLION, // 0x124d + AT = bindings::PCI_VENDOR_ID_AT, // 0x1259 + ASIX = bindings::PCI_VENDOR_ID_ASIX, // 0x125b + ESS = bindings::PCI_VENDOR_ID_ESS, // 0x125d + SATSAGEM = bindings::PCI_VENDOR_ID_SATSAGEM, // 0x1267 + ENSONIQ = bindings::PCI_VENDOR_ID_ENSONIQ, // 0x1274 + TRANSMETA = bindings::PCI_VENDOR_ID_TRANSMETA, // 0x1279 + ROCKWELL = bindings::PCI_VENDOR_ID_ROCKWELL, // 0x127A + ITE = bindings::PCI_VENDOR_ID_ITE, // 0x1283 + ALTEON = bindings::PCI_VENDOR_ID_ALTEON, // 0x12ae + NVIDIA_SGS = bindings::PCI_VENDOR_ID_NVIDIA_SGS, // 0x12d2 + PERICOM = bindings::PCI_VENDOR_ID_PERICOM, // 0x12D8 + AUREAL = bindings::PCI_VENDOR_ID_AUREAL, // 0x12eb + 
ELECTRONICDESIGNGMBH = bindings::PCI_VENDOR_ID_ELECTRONICDESIGNGMBH, // 0x12f8 + ESDGMBH = bindings::PCI_VENDOR_ID_ESDGMBH, // 0x12fe + CB = bindings::PCI_VENDOR_ID_CB, // 0x1307 + SIIG = bindings::PCI_VENDOR_ID_SIIG, // 0x131f + RADISYS = bindings::PCI_VENDOR_ID_RADISYS, // 0x1331 + MICRO_MEMORY = bindings::PCI_VENDOR_ID_MICRO_MEMORY, // 0x1332 + DOMEX = bindings::PCI_VENDOR_ID_DOMEX, // 0x134a + INTASHIELD = bindings::PCI_VENDOR_ID_INTASHIELD, // 0x135a + QUATECH = bindings::PCI_VENDOR_ID_QUATECH, // 0x135C + SEALEVEL = bindings::PCI_VENDOR_ID_SEALEVEL, // 0x135e + HYPERCOPE = bindings::PCI_VENDOR_ID_HYPERCOPE, // 0x1365 + DIGIGRAM = bindings::PCI_VENDOR_ID_DIGIGRAM, // 0x1369 + KAWASAKI = bindings::PCI_VENDOR_ID_KAWASAKI, // 0x136b + CNET = bindings::PCI_VENDOR_ID_CNET, // 0x1371 + LMC = bindings::PCI_VENDOR_ID_LMC, // 0x1376 + NETGEAR = bindings::PCI_VENDOR_ID_NETGEAR, // 0x1385 + APPLICOM = bindings::PCI_VENDOR_ID_APPLICOM, // 0x1389 + MOXA = bindings::PCI_VENDOR_ID_MOXA, // 0x1393 + CCD = bindings::PCI_VENDOR_ID_CCD, // 0x1397 + EXAR = bindings::PCI_VENDOR_ID_EXAR, // 0x13a8 + MICROGATE = bindings::PCI_VENDOR_ID_MICROGATE, // 0x13c0 + THREEWARE = bindings::PCI_VENDOR_ID_3WARE, // 0x13C1 + IOMEGA = bindings::PCI_VENDOR_ID_IOMEGA, // 0x13ca + ABOCOM = bindings::PCI_VENDOR_ID_ABOCOM, // 0x13D1 + SUNDANCE = bindings::PCI_VENDOR_ID_SUNDANCE, // 0x13f0 + CMEDIA = bindings::PCI_VENDOR_ID_CMEDIA, // 0x13f6 + ADVANTECH = bindings::PCI_VENDOR_ID_ADVANTECH, // 0x13fe + MEILHAUS = bindings::PCI_VENDOR_ID_MEILHAUS, // 0x1402 + LAVA = bindings::PCI_VENDOR_ID_LAVA, // 0x1407 + TIMEDIA = bindings::PCI_VENDOR_ID_TIMEDIA, // 0x1409 + ICE = bindings::PCI_VENDOR_ID_ICE, // 0x1412 + MICROSOFT = bindings::PCI_VENDOR_ID_MICROSOFT, // 0x1414 + OXSEMI = bindings::PCI_VENDOR_ID_OXSEMI, // 0x1415 + CHELSIO = bindings::PCI_VENDOR_ID_CHELSIO, // 0x1425 + EDIMAX = bindings::PCI_VENDOR_ID_EDIMAX, // 0x1432 + ADLINK = bindings::PCI_VENDOR_ID_ADLINK, // 0x144a + SAMSUNG = bindings::PCI_VENDOR_ID_SAMSUNG, // 0x144d + GIGABYTE = bindings::PCI_VENDOR_ID_GIGABYTE, // 0x1458 + AMBIT = bindings::PCI_VENDOR_ID_AMBIT, // 0x1468 + MYRICOM = bindings::PCI_VENDOR_ID_MYRICOM, // 0x14c1 + MEDIATEK = bindings::PCI_VENDOR_ID_MEDIATEK, // 0x14c3 + TITAN = bindings::PCI_VENDOR_ID_TITAN, // 0x14D2 + PANACOM = bindings::PCI_VENDOR_ID_PANACOM, // 0x14d4 + SIPACKETS = bindings::PCI_VENDOR_ID_SIPACKETS, // 0x14d9 + AFAVLAB = bindings::PCI_VENDOR_ID_AFAVLAB, // 0x14db + AMPLICON = bindings::PCI_VENDOR_ID_AMPLICON, // 0x14dc + BCM_GVC = bindings::PCI_VENDOR_ID_BCM_GVC, // 0x14a4 + BROADCOM = bindings::PCI_VENDOR_ID_BROADCOM, // 0x14e4 + TOPIC = bindings::PCI_VENDOR_ID_TOPIC, // 0x151f + MAINPINE = bindings::PCI_VENDOR_ID_MAINPINE, // 0x1522 + ENE = bindings::PCI_VENDOR_ID_ENE, // 0x1524 + SYBA = bindings::PCI_VENDOR_ID_SYBA, // 0x1592 + MORETON = bindings::PCI_VENDOR_ID_MORETON, // 0x15aa + VMWARE = bindings::PCI_VENDOR_ID_VMWARE, // 0x15ad + ZOLTRIX = bindings::PCI_VENDOR_ID_ZOLTRIX, // 0x15b0 + MELLANOX = bindings::PCI_VENDOR_ID_MELLANOX, // 0x15b3 + DFI = bindings::PCI_VENDOR_ID_DFI, // 0x15bd + QUICKNET = bindings::PCI_VENDOR_ID_QUICKNET, // 0x15e2 + ADDIDATA = bindings::PCI_VENDOR_ID_ADDIDATA, // 0x15B8 + PDC = bindings::PCI_VENDOR_ID_PDC, // 0x15e9 + FARSITE = bindings::PCI_VENDOR_ID_FARSITE, // 0x1619 + ARIMA = bindings::PCI_VENDOR_ID_ARIMA, // 0x161f + BROCADE = bindings::PCI_VENDOR_ID_BROCADE, // 0x1657 + SIBYTE = bindings::PCI_VENDOR_ID_SIBYTE, // 0x166d + ATHEROS = bindings::PCI_VENDOR_ID_ATHEROS, // 0x168c + NETCELL = 
bindings::PCI_VENDOR_ID_NETCELL, // 0x169c + CENATEK = bindings::PCI_VENDOR_ID_CENATEK, // 0x16CA + SYNOPSYS = bindings::PCI_VENDOR_ID_SYNOPSYS, // 0x16c3 + USR = bindings::PCI_VENDOR_ID_USR, // 0x16ec + VITESSE = bindings::PCI_VENDOR_ID_VITESSE, // 0x1725 + LINKSYS = bindings::PCI_VENDOR_ID_LINKSYS, // 0x1737 + ALTIMA = bindings::PCI_VENDOR_ID_ALTIMA, // 0x173b + CAVIUM = bindings::PCI_VENDOR_ID_CAVIUM, // 0x177d + TECHWELL = bindings::PCI_VENDOR_ID_TECHWELL, // 0x1797 + BELKIN = bindings::PCI_VENDOR_ID_BELKIN, // 0x1799 + RDC = bindings::PCI_VENDOR_ID_RDC, // 0x17f3 + GLI = bindings::PCI_VENDOR_ID_GLI, // 0x17a0 + LENOVO = bindings::PCI_VENDOR_ID_LENOVO, // 0x17aa + QCOM = bindings::PCI_VENDOR_ID_QCOM, // 0x17cb + CDNS = bindings::PCI_VENDOR_ID_CDNS, // 0x17cd + ARECA = bindings::PCI_VENDOR_ID_ARECA, // 0x17d3 + S2IO = bindings::PCI_VENDOR_ID_S2IO, // 0x17d5 + SITECOM = bindings::PCI_VENDOR_ID_SITECOM, // 0x182d + TOPSPIN = bindings::PCI_VENDOR_ID_TOPSPIN, // 0x1867 + COMMTECH = bindings::PCI_VENDOR_ID_COMMTECH, // 0x18f7 + SILAN = bindings::PCI_VENDOR_ID_SILAN, // 0x1904 + RENESAS = bindings::PCI_VENDOR_ID_RENESAS, // 0x1912 + SOLARFLARE = bindings::PCI_VENDOR_ID_SOLARFLARE, // 0x1924 + TDI = bindings::PCI_VENDOR_ID_TDI, // 0x192E + NXP = bindings::PCI_VENDOR_ID_NXP, // 0x1957 + PASEMI = bindings::PCI_VENDOR_ID_PASEMI, // 0x1959 + ATTANSIC = bindings::PCI_VENDOR_ID_ATTANSIC, // 0x1969 + JMICRON = bindings::PCI_VENDOR_ID_JMICRON, // 0x197B + KORENIX = bindings::PCI_VENDOR_ID_KORENIX, // 0x1982 + HUAWEI = bindings::PCI_VENDOR_ID_HUAWEI, // 0x19e5 + NETRONOME = bindings::PCI_VENDOR_ID_NETRONOME, // 0x19ee + QMI = bindings::PCI_VENDOR_ID_QMI, // 0x1a32 + AZWAVE = bindings::PCI_VENDOR_ID_AZWAVE, // 0x1a3b + REDHAT_QUMRANET = bindings::PCI_VENDOR_ID_REDHAT_QUMRANET, // 0x1af4 + ASMEDIA = bindings::PCI_VENDOR_ID_ASMEDIA, // 0x1b21 + REDHAT = bindings::PCI_VENDOR_ID_REDHAT, // 0x1b36 + WCHIC = bindings::PCI_VENDOR_ID_WCHIC, // 0x1c00 + SILICOM_DENMARK = bindings::PCI_VENDOR_ID_SILICOM_DENMARK, // 0x1c2c + AMAZON_ANNAPURNA_LABS = bindings::PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, // 0x1c36 + CIRCUITCO = bindings::PCI_VENDOR_ID_CIRCUITCO, // 0x1cc8 + AMAZON = bindings::PCI_VENDOR_ID_AMAZON, // 0x1d0f + ZHAOXIN = bindings::PCI_VENDOR_ID_ZHAOXIN, // 0x1d17 + ROCKCHIP = bindings::PCI_VENDOR_ID_ROCKCHIP, // 0x1d87 + HYGON = bindings::PCI_VENDOR_ID_HYGON, // 0x1d94 + META = bindings::PCI_VENDOR_ID_META, // 0x1d9b + FUNGIBLE = bindings::PCI_VENDOR_ID_FUNGIBLE, // 0x1dad + HXT = bindings::PCI_VENDOR_ID_HXT, // 0x1dbf + TEKRAM = bindings::PCI_VENDOR_ID_TEKRAM, // 0x1de1 + RPI = bindings::PCI_VENDOR_ID_RPI, // 0x1de4 + ALIBABA = bindings::PCI_VENDOR_ID_ALIBABA, // 0x1ded + CXL = bindings::PCI_VENDOR_ID_CXL, // 0x1e98 + TEHUTI = bindings::PCI_VENDOR_ID_TEHUTI, // 0x1fc9 + SUNIX = bindings::PCI_VENDOR_ID_SUNIX, // 0x1fd4 + HINT = bindings::PCI_VENDOR_ID_HINT, // 0x3388 + THREEDLABS = bindings::PCI_VENDOR_ID_3DLABS, // 0x3d3d + NETXEN = bindings::PCI_VENDOR_ID_NETXEN, // 0x4040 + AKS = bindings::PCI_VENDOR_ID_AKS, // 0x416c + WCHCN = bindings::PCI_VENDOR_ID_WCHCN, // 0x4348 + ACCESSIO = bindings::PCI_VENDOR_ID_ACCESSIO, // 0x494f + S3 = bindings::PCI_VENDOR_ID_S3, // 0x5333 + DUNORD = bindings::PCI_VENDOR_ID_DUNORD, // 0x5544 + DCI = bindings::PCI_VENDOR_ID_DCI, // 0x6666 + GLENFLY = bindings::PCI_VENDOR_ID_GLENFLY, // 0x6766 + INTEL = bindings::PCI_VENDOR_ID_INTEL, // 0x8086 + WANGXUN = bindings::PCI_VENDOR_ID_WANGXUN, // 0x8088 + SCALEMP = bindings::PCI_VENDOR_ID_SCALEMP, // 0x8686 + COMPUTONE = 
bindings::PCI_VENDOR_ID_COMPUTONE, // 0x8e0e + KTI = bindings::PCI_VENDOR_ID_KTI, // 0x8e2e + ADAPTEC = bindings::PCI_VENDOR_ID_ADAPTEC, // 0x9004 + ADAPTEC2 = bindings::PCI_VENDOR_ID_ADAPTEC2, // 0x9005 + HOLTEK = bindings::PCI_VENDOR_ID_HOLTEK, // 0x9412 + NETMOS = bindings::PCI_VENDOR_ID_NETMOS, // 0x9710 + THREECOM_2 = bindings::PCI_VENDOR_ID_3COM_2, // 0xa727 + SOLIDRUN = bindings::PCI_VENDOR_ID_SOLIDRUN, // 0xd063 + DIGIUM = bindings::PCI_VENDOR_ID_DIGIUM, // 0xd161 + TIGERJET = bindings::PCI_VENDOR_ID_TIGERJET, // 0xe159 + XILINX_RME = bindings::PCI_VENDOR_ID_XILINX_RME, // 0xea60 + XEN = bindings::PCI_VENDOR_ID_XEN, // 0x5853 + OCZ = bindings::PCI_VENDOR_ID_OCZ, // 0x1b85 + NCUBE = bindings::PCI_VENDOR_ID_NCUBE, // 0x10ff +} From dd3933e9b572aed775fd632ed6124aa67457de72 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 29 Aug 2025 15:36:29 -0700 Subject: [PATCH 0680/3206] rust: pci: add DeviceId::from_class_and_vendor() method Add a new method to create PCI DeviceIds that match both a specific vendor and PCI class. This is more targeted than the existing from_class() method as it filters on both vendor and class criteria. Cc: Danilo Krummrich Cc: Elle Rhumsaa Reviewed-by: Alexandre Courbot Signed-off-by: John Hubbard Link: https://lore.kernel.org/r/20250829223632.144030-4-jhubbard@nvidia.com [ Minor doc-comment improvements. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/pci.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 4f7ef82492ae58..5d95081346e61d 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -162,6 +162,29 @@ impl DeviceId { override_only: 0, }) } + + /// Create a new [`DeviceId`] from a class number, mask, and specific vendor. + /// + /// This is more targeted than [`DeviceId::from_class`]: in addition to matching by [`Vendor`], + /// it also matches the PCI [`Class`] (up to the entire 24 bits, depending on the + /// [`ClassMask`]). + #[inline] + pub const fn from_class_and_vendor( + class: Class, + class_mask: ClassMask, + vendor: Vendor, + ) -> Self { + Self(bindings::pci_device_id { + vendor: vendor.as_raw() as u32, + device: DeviceId::PCI_ANY_ID, + subvendor: DeviceId::PCI_ANY_ID, + subdevice: DeviceId::PCI_ANY_ID, + class: class.as_raw(), + class_mask: class_mask.as_raw(), + driver_data: 0, + override_only: 0, + }) + } } // SAFETY: `DeviceId` is a `#[repr(transparent)]` wrapper of `pci_device_id` and does not add From 6783d3b08595e932938a244e97d92cda0c0833a1 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 29 Aug 2025 15:36:30 -0700 Subject: [PATCH 0681/3206] gpu: nova-core: avoid probing non-display/compute PCI functions NovaCore has so far been too imprecise about figuring out if .probe() has found a supported PCI PF (Physical Function). By that I mean: .probe() sets up BAR0 (which involves a lot of very careful devres and Device details behind the scenes). And then if it is dealing with a non-supported device such as the .1 audio PF on many GPUs, it fails out due to an unexpected BAR0 size. We have been fortunate that the BAR0 sizes are different. Really, we should be filtering on PCI class ID instead. These days I think we can confidently pick out Nova's supported PF's via PCI class ID. And if not, then we'll revisit. The approach here is to filter on "Display VGA" or "Display 3D", which is how PCI class IDs express "this is a modern GPU's PF". 
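As a rough standalone illustration (not part of this patch; the constants below follow the PCI spec's class-code layout rather than the kernel's Rust bindings): the 24-bit class word is base class / subclass / prog-if, so masking to class+subclass accepts both display PF variants while rejecting e.g. an HD-audio PF.

/*
 * Illustrative sketch only: how a class + class_mask pair selects
 * PCI functions. Bits 23-16 base class, 15-8 subclass, 7-0 prog-if.
 */
#include <stdint.h>
#include <stdio.h>

#define DISPLAY_VGA         0x030000u /* base 0x03, subclass 0x00 */
#define DISPLAY_3D          0x030200u /* base 0x03, subclass 0x02 */
#define MASK_CLASS_SUBCLASS 0xffff00u /* ignore the prog-if byte */

static int class_matches(uint32_t dev, uint32_t wanted, uint32_t mask)
{
	return (dev & mask) == (wanted & mask);
}

int main(void)
{
	/* A GPU's display PF matches, its audio PF (base 0x04) does not. */
	printf("vga:   %d\n", class_matches(0x030000, DISPLAY_VGA, MASK_CLASS_SUBCLASS));
	printf("3d:    %d\n", class_matches(0x030200, DISPLAY_3D, MASK_CLASS_SUBCLASS));
	printf("audio: %d\n", class_matches(0x040300, DISPLAY_VGA, MASK_CLASS_SUBCLASS));
	return 0;
}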
Cc: Danilo Krummrich Cc: Elle Rhumsaa Reviewed-by: Alexandre Courbot Signed-off-by: John Hubbard Link: https://lore.kernel.org/r/20250829223632.144030-5-jhubbard@nvidia.com Signed-off-by: Danilo Krummrich --- drivers/gpu/nova-core/driver.rs | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/nova-core/driver.rs b/drivers/gpu/nova-core/driver.rs index 274989ea1fb4a5..5d23a91f51ddbd 100644 --- a/drivers/gpu/nova-core/driver.rs +++ b/drivers/gpu/nova-core/driver.rs @@ -1,6 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 -use kernel::{auxiliary, bindings, c_str, device::Core, pci, prelude::*, sizes::SZ_16M, sync::Arc}; +use kernel::{ + auxiliary, c_str, + device::Core, + pci, + pci::{Class, ClassMask, Vendor}, + prelude::*, + sizes::SZ_16M, + sync::Arc, +}; use crate::gpu::Gpu; @@ -18,10 +26,25 @@ kernel::pci_device_table!( PCI_TABLE, MODULE_PCI_TABLE, ::IdInfo, - [( - pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_NVIDIA, bindings::PCI_ANY_ID as u32), - () - )] + [ + // Modern NVIDIA GPUs will show up as either VGA or 3D controllers. + ( + pci::DeviceId::from_class_and_vendor( + Class::DISPLAY_VGA, + ClassMask::ClassSubclass, + Vendor::NVIDIA + ), + () + ), + ( + pci::DeviceId::from_class_and_vendor( + Class::DISPLAY_3D, + ClassMask::ClassSubclass, + Vendor::NVIDIA + ), + () + ), + ] ); impl pci::Driver for NovaCore { From 1b8ac37677d307cd0fc10f6bf9bceae2c282bdb4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 29 Aug 2025 15:36:31 -0700 Subject: [PATCH 0682/3206] rust: pci: use pci::Vendor instead of bindings::PCI_VENDOR_ID_* Change Device::vendor_id() to return a Vendor type, and change DeviceId::from_id() to accept a Vendor type. Use the new pci::Vendor in the various Rust for Linux callers who were previously using bindings::PCI_VENDOR_ID_*. Doing so also allows removing "use kernel::bindings" entirely from most of the affected files here. Also, mark vendor_id() as inline. Cc: Danilo Krummrich Cc: Elle Rhumsaa Reviewed-by: Alexandre Courbot Signed-off-by: John Hubbard Link: https://lore.kernel.org/r/20250829223632.144030-6-jhubbard@nvidia.com [ Replace "as a validated vendor" with "as [`Vendor`]". - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/pci.rs | 35 ++++++++++++++++++++------- rust/kernel/pci/id.rs | 1 - samples/rust/rust_dma.rs | 6 +---- samples/rust/rust_driver_auxiliary.rs | 12 ++++----- samples/rust/rust_driver_pci.rs | 9 ++++--- 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 5d95081346e61d..391baf95929a8b 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -133,10 +133,10 @@ impl DeviceId { /// Equivalent to C's `PCI_DEVICE` macro. /// - /// Create a new `pci::DeviceId` from a vendor and device ID number. - pub const fn from_id(vendor: u32, device: u32) -> Self { + /// Create a new `pci::DeviceId` from a vendor and device ID. + pub const fn from_id(vendor: Vendor, device: u32) -> Self { Self(bindings::pci_device_id { - vendor, + vendor: vendor.as_raw() as u32, device, subvendor: DeviceId::PCI_ANY_ID, subdevice: DeviceId::PCI_ANY_ID, @@ -234,7 +234,7 @@ macro_rules! pci_device_table { /// ::IdInfo, /// [ /// ( -/// pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_REDHAT, bindings::PCI_ANY_ID as u32), +/// pci::DeviceId::from_id(pci::Vendor::REDHAT, bindings::PCI_ANY_ID as u32), /// (), /// ) /// ] @@ -415,12 +415,29 @@ impl Device { } impl Device { - /// Returns the PCI vendor ID. + /// Returns the PCI vendor ID as [`Vendor`]. 
+ /// + /// # Examples + /// + /// ``` + /// # use kernel::{device::Core, pci::{self, Vendor}, prelude::*}; + /// fn log_device_info(pdev: &pci::Device) -> Result { + /// // Get an instance of `Vendor`. + /// let vendor = pdev.vendor_id(); + /// dev_info!( + /// pdev.as_ref(), + /// "Device: Vendor={}, Device=0x{:x}\n", + /// vendor, + /// pdev.device_id() + /// ); + /// Ok(()) + /// } + /// ``` #[inline] - pub fn vendor_id(&self) -> u16 { - // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a - // `struct pci_dev`. - unsafe { (*self.as_raw()).vendor } + pub fn vendor_id(&self) -> Vendor { + // SAFETY: `self.as_raw` is a valid pointer to a `struct pci_dev`. + let vendor_id = unsafe { (*self.as_raw()).vendor }; + Vendor::from_raw(vendor_id) } /// Returns the PCI device ID. diff --git a/rust/kernel/pci/id.rs b/rust/kernel/pci/id.rs index f56a024f80aa87..8ee1dc5c3057f5 100644 --- a/rust/kernel/pci/id.rs +++ b/rust/kernel/pci/id.rs @@ -136,7 +136,6 @@ macro_rules! define_all_pci_vendors { /// Once constructed, a `Vendor` contains a valid PCI Vendor ID. impl Vendor { /// Create a Vendor from a raw 16-bit vendor ID. - #[expect(dead_code)] #[inline] pub(super) fn from_raw(vendor_id: u16) -> Self { Self(vendor_id) diff --git a/samples/rust/rust_dma.rs b/samples/rust/rust_dma.rs index c5e7cce6865402..f3385c4a7e5b51 100644 --- a/samples/rust/rust_dma.rs +++ b/samples/rust/rust_dma.rs @@ -5,7 +5,6 @@ //! To make this driver probe, QEMU must be run with `-device pci-testdev`. use kernel::{ - bindings, device::Core, dma::{CoherentAllocation, Device, DmaMask}, pci, @@ -45,10 +44,7 @@ kernel::pci_device_table!( PCI_TABLE, MODULE_PCI_TABLE, ::IdInfo, - [( - pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_REDHAT, 0x5), - () - )] + [(pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5), ())] ); impl pci::Driver for DmaSampleDriver { diff --git a/samples/rust/rust_driver_auxiliary.rs b/samples/rust/rust_driver_auxiliary.rs index f2a820683fc39e..55ece336ee45ae 100644 --- a/samples/rust/rust_driver_auxiliary.rs +++ b/samples/rust/rust_driver_auxiliary.rs @@ -5,7 +5,7 @@ //! To make this driver probe, QEMU must be run with `-device pci-testdev`. use kernel::{ - auxiliary, bindings, c_str, device::Core, driver, error::Error, pci, prelude::*, InPlaceModule, + auxiliary, c_str, device::Core, driver, error::Error, pci, prelude::*, InPlaceModule, }; use pin_init::PinInit; @@ -50,10 +50,7 @@ kernel::pci_device_table!( PCI_TABLE, MODULE_PCI_TABLE, ::IdInfo, - [( - pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_REDHAT, 0x5), - () - )] + [(pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5), ())] ); impl pci::Driver for ParentDriver { @@ -81,11 +78,12 @@ impl ParentDriver { let parent = adev.parent().ok_or(EINVAL)?; let pdev: &pci::Device = parent.try_into()?; + let vendor = pdev.vendor_id(); dev_info!( adev.as_ref(), - "Connect auxiliary {} with parent: VendorID={:#x}, DeviceID={:#x}\n", + "Connect auxiliary {} with parent: VendorID={}, DeviceID={:#x}\n", adev.id(), - pdev.vendor_id(), + vendor, pdev.device_id() ); diff --git a/samples/rust/rust_driver_pci.rs b/samples/rust/rust_driver_pci.rs index 0798019014cd95..97baec8df9bcdb 100644 --- a/samples/rust/rust_driver_pci.rs +++ b/samples/rust/rust_driver_pci.rs @@ -4,7 +4,7 @@ //! //! To make this driver probe, QEMU must be run with `-device pci-testdev`. 
-use kernel::{bindings, c_str, device::Core, devres::Devres, pci, prelude::*, sync::aref::ARef}; +use kernel::{c_str, device::Core, devres::Devres, pci, prelude::*, sync::aref::ARef}; struct Regs; @@ -38,7 +38,7 @@ kernel::pci_device_table!( MODULE_PCI_TABLE, ::IdInfo, [( - pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_REDHAT, 0x5), + pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5), TestIndex::NO_EVENTFD )] ); @@ -66,10 +66,11 @@ impl pci::Driver for SampleDriver { const ID_TABLE: pci::IdTable = &PCI_TABLE; fn probe(pdev: &pci::Device, info: &Self::IdInfo) -> Result>> { + let vendor = pdev.vendor_id(); dev_dbg!( pdev.as_ref(), - "Probe Rust PCI driver sample (PCI ID: 0x{:x}, 0x{:x}).\n", - pdev.vendor_id(), + "Probe Rust PCI driver sample (PCI ID: {}, 0x{:x}).\n", + vendor, pdev.device_id() ); From 7bb02685fb5aa8f97568a2beb568a15c5c544454 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 29 Aug 2025 15:36:32 -0700 Subject: [PATCH 0683/3206] rust: pci: inline several tiny functions Several previous commits added Vendor and Class functionality. As part of that, the new functions were inlined where appropriate. But that left this file with inconsistent use of inlining. Fix that by inlining the remaining items that should be. Cc: Danilo Krummrich Cc: Elle Rhumsaa Reviewed-by: Alexandre Courbot Signed-off-by: John Hubbard Link: https://lore.kernel.org/r/20250829223632.144030-7-jhubbard@nvidia.com Signed-off-by: Danilo Krummrich --- rust/kernel/pci.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 391baf95929a8b..78271bf88ceafc 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -134,6 +134,7 @@ impl DeviceId { /// Equivalent to C's `PCI_DEVICE` macro. /// /// Create a new `pci::DeviceId` from a vendor and device ID. + #[inline] pub const fn from_id(vendor: Vendor, device: u32) -> Self { Self(bindings::pci_device_id { vendor: vendor.as_raw() as u32, @@ -150,6 +151,7 @@ impl DeviceId { /// Equivalent to C's `PCI_DEVICE_CLASS` macro. /// /// Create a new `pci::DeviceId` from a class number and mask. + #[inline] pub const fn from_class(class: u32, class_mask: u32) -> Self { Self(bindings::pci_device_id { vendor: DeviceId::PCI_ANY_ID, @@ -387,6 +389,7 @@ impl Bar { } impl Bar { + #[inline] fn index_is_valid(index: u32) -> bool { // A `struct pci_dev` owns an array of resources with at most `PCI_NUM_RESOURCES` entries. index < bindings::PCI_NUM_RESOURCES @@ -409,6 +412,7 @@ impl Deref for Bar { } impl Device { + #[inline] fn as_raw(&self) -> *mut bindings::pci_dev { self.0.get() } @@ -582,6 +586,7 @@ impl Device { } /// Enable bus-mastering for this device. + #[inline] pub fn set_master(&self) { // SAFETY: `self.as_raw` is guaranteed to be a pointer to a valid `struct pci_dev`. unsafe { bindings::pci_set_master(self.as_raw()) }; From d1ff0e2d13d6ac3a15be7870e15216726b0a809a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 20 Aug 2025 10:29:27 +0200 Subject: [PATCH 0684/3206] tools/nolibc: avoid error in dup2() if old fd equals new fd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dup2() allows both 'old' and 'new' to have the same value, which dup3() does not. If libc dup2() is implemented through the dup3() system call, then it would incorrectly fail in this case. Avoid the error by handling old == new explicitly. 
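A minimal userspace demonstration of the semantic difference being preserved (assumes glibc, which provides a dup3() wrapper under _GNU_SOURCE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	int fd = open("/dev/null", O_RDONLY);

	/* dup2() with equal fds is a no-op that still validates 'fd'. */
	printf("dup2 -> %d\n", dup2(fd, fd));                /* returns fd */

	/* dup3() rejects equal fds by specification. */
	errno = 0;
	printf("dup3 -> %d (errno=%d)\n", dup3(fd, fd, 0), errno); /* -1, EINVAL */
	return 0;
}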
Fixes: 30ca20517ac1 ("tools headers: Move the nolibc header from rcutorture to tools/include/nolibc/") Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250820-nolibc-dup2-einval-v2-1-807185a45c56@linutronix.de Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/sys.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 295e71d34abadb..90aadad31f6cb5 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -238,6 +238,19 @@ static __attribute__((unused)) int sys_dup2(int old, int new) { #if defined(__NR_dup3) + int ret, nr_fcntl; + +#ifdef __NR_fcntl64 + nr_fcntl = __NR_fcntl64; +#else + nr_fcntl = __NR_fcntl; +#endif + + if (old == new) { + ret = my_syscall2(nr_fcntl, old, F_GETFD); + return ret < 0 ? ret : old; + } + return my_syscall3(__NR_dup3, old, new, 0); #elif defined(__NR_dup2) return my_syscall2(__NR_dup2, old, new); From b22d81ed319cbe2d5b45f605cab18aaf82a66a50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 24 Aug 2025 08:58:46 +0200 Subject: [PATCH 0685/3206] tools/nolibc: use tabs instead of spaces for indentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some lines are using spaces for indentation instead of the standard tabs. Fix them. Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/sys/random.h | 4 ++-- tools/include/nolibc/unistd.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/include/nolibc/sys/random.h b/tools/include/nolibc/sys/random.h index 8d9749f1c84572..cd5d25c571a8b7 100644 --- a/tools/include/nolibc/sys/random.h +++ b/tools/include/nolibc/sys/random.h @@ -22,13 +22,13 @@ static __attribute__((unused)) ssize_t sys_getrandom(void *buf, size_t buflen, unsigned int flags) { - return my_syscall3(__NR_getrandom, buf, buflen, flags); + return my_syscall3(__NR_getrandom, buf, buflen, flags); } static __attribute__((unused)) ssize_t getrandom(void *buf, size_t buflen, unsigned int flags) { - return __sysret(sys_getrandom(buf, buflen, flags)); + return __sysret(sys_getrandom(buf, buflen, flags)); } #endif /* _NOLIBC_SYS_RANDOM_H */ diff --git a/tools/include/nolibc/unistd.h b/tools/include/nolibc/unistd.h index 25bfc7732ec7e7..7405fa2b89baa8 100644 --- a/tools/include/nolibc/unistd.h +++ b/tools/include/nolibc/unistd.h @@ -33,7 +33,7 @@ static __attribute__((unused)) int sys_faccessat(int fd, const char *path, int amode, int flag) { - return my_syscall4(__NR_faccessat, fd, path, amode, flag); + return my_syscall4(__NR_faccessat, fd, path, amode, flag); } static __attribute__((unused)) From e6366101ce1fab9e42ae66ff0fed5360fce65bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:40:32 +0200 Subject: [PATCH 0686/3206] tools/nolibc: remove __nolibc_enosys() fallback from time64-related functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These fallbacks were added when no explicit time64 fallbacks were implemented. Now that those fallbacks are in place, the additional fallback to __nolibc_enosys() is superfluous.
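The resulting two-way dispatch pattern, shown here as a standalone sketch with stubbed syscall macros (the real code uses nolibc's my_syscallN() and builds a struct __kernel_timespec for the time64 path):

#include <stdio.h>

#define __NR_ppoll_time64 414 /* assume we build for a time64-only arch */
#define my_syscall3(nr, a, b, c) ((void)(a), (void)(b), (void)(c), (long)(nr))

static long sys_poll_sketch(void *fds, int nfds, int timeout)
{
#if defined(__NR_poll)
	return my_syscall3(__NR_poll, fds, nfds, timeout);
#else
	/* Any arch lacking __NR_poll has the time64 variant, so the
	 * former third branch returning -ENOSYS was unreachable. */
	return my_syscall3(__NR_ppoll_time64, fds, nfds, timeout);
#endif
}

int main(void)
{
	printf("would invoke syscall nr %ld\n", sys_poll_sketch(0, 0, -1));
	return 0;
}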
Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250821-nolibc-enosys-v1-1-4b63f2caaa89@weissschuh.net --- tools/include/nolibc/poll.h | 4 +--- tools/include/nolibc/sys.h | 4 +--- tools/include/nolibc/sys/timerfd.h | 8 ++------ tools/include/nolibc/time.h | 8 ++------ 4 files changed, 6 insertions(+), 18 deletions(-) diff --git a/tools/include/nolibc/poll.h b/tools/include/nolibc/poll.h index 1765acb17ea01f..0d053f93ea99b0 100644 --- a/tools/include/nolibc/poll.h +++ b/tools/include/nolibc/poll.h @@ -39,10 +39,8 @@ int sys_poll(struct pollfd *fds, int nfds, int timeout) t.tv_nsec = (timeout % 1000) * 1000000; } return my_syscall5(__NR_ppoll_time64, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); -#elif defined(__NR_poll) - return my_syscall3(__NR_poll, fds, nfds, timeout); #else - return __nolibc_enosys(__func__, fds, nfds, timeout); + return my_syscall3(__NR_poll, fds, nfds, timeout); #endif } diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 90aadad31f6cb5..80a5cef3bdf424 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -814,7 +814,7 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva t.tv_nsec = timeout->tv_usec * 1000; } return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); -#elif defined(__NR_pselect6_time64) +#else struct __kernel_timespec t; if (timeout) { @@ -822,8 +822,6 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva t.tv_nsec = timeout->tv_usec * 1000; } return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); -#else - return __nolibc_enosys(__func__, nfds, rfds, wfds, efds, timeout); #endif } diff --git a/tools/include/nolibc/sys/timerfd.h b/tools/include/nolibc/sys/timerfd.h index 4375d546ba58f8..5dd61030c9914d 100644 --- a/tools/include/nolibc/sys/timerfd.h +++ b/tools/include/nolibc/sys/timerfd.h @@ -34,7 +34,7 @@ int sys_timerfd_gettime(int fd, struct itimerspec *curr_value) { #if defined(__NR_timerfd_gettime) return my_syscall2(__NR_timerfd_gettime, fd, curr_value); -#elif defined(__NR_timerfd_gettime64) +#else struct __kernel_itimerspec kcurr_value; int ret; @@ -42,8 +42,6 @@ int sys_timerfd_gettime(int fd, struct itimerspec *curr_value) __nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval); __nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value); return ret; -#else - return __nolibc_enosys(__func__, fd, curr_value); #endif } @@ -60,7 +58,7 @@ int sys_timerfd_settime(int fd, int flags, { #if defined(__NR_timerfd_settime) return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value); -#elif defined(__NR_timerfd_settime64) +#else struct __kernel_itimerspec knew_value, kold_value; int ret; @@ -72,8 +70,6 @@ int sys_timerfd_settime(int fd, int flags, __nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value); } return ret; -#else - return __nolibc_enosys(__func__, fd, flags, new_value, old_value); #endif } diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h index e9c1b976791a65..6c276b8d646a4e 100644 --- a/tools/include/nolibc/time.h +++ b/tools/include/nolibc/time.h @@ -45,7 +45,7 @@ int sys_clock_getres(clockid_t clockid, struct timespec *res) { #if defined(__NR_clock_getres) return my_syscall2(__NR_clock_getres, clockid, res); -#elif defined(__NR_clock_getres_time64) +#else struct __kernel_timespec kres; int ret; @@ -53,8 +53,6 @@ int 
sys_clock_getres(clockid_t clockid, struct timespec *res) if (res) __nolibc_timespec_kernel_to_user(&kres, res); return ret; -#else - return __nolibc_enosys(__func__, clockid, res); #endif } @@ -69,7 +67,7 @@ int sys_clock_gettime(clockid_t clockid, struct timespec *tp) { #if defined(__NR_clock_gettime) return my_syscall2(__NR_clock_gettime, clockid, tp); -#elif defined(__NR_clock_gettime64) +#else struct __kernel_timespec ktp; int ret; @@ -77,8 +75,6 @@ int sys_clock_gettime(clockid_t clockid, struct timespec *tp) if (tp) __nolibc_timespec_kernel_to_user(&ktp, tp); return ret; -#else - return __nolibc_enosys(__func__, clockid, tp); #endif } From 4b6ffb2d87a76b02f63d029533cb3e8c1482f358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:40:33 +0200 Subject: [PATCH 0687/3206] tools/nolibc: remove __nolibc_enosys() fallback from *at() functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All architectures have had one of the real functions available since Linux 2.6.12. The additional fallback to __nolibc_enosys() is superfluous. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250821-nolibc-enosys-v1-2-4b63f2caaa89@weissschuh.net --- tools/include/nolibc/sys.h | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 80a5cef3bdf424..12030c80317375 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -142,10 +142,8 @@ int sys_chmod(const char *path, mode_t mode) { #if defined(__NR_fchmodat) return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); -#elif defined(__NR_chmod) - return my_syscall2(__NR_chmod, path, mode); #else - return __nolibc_enosys(__func__, path, mode); + return my_syscall2(__NR_chmod, path, mode); #endif } @@ -165,10 +163,8 @@ int sys_chown(const char *path, uid_t owner, gid_t group) { #if defined(__NR_fchownat) return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); -#elif defined(__NR_chown) - return my_syscall3(__NR_chown, path, owner, group); #else - return __nolibc_enosys(__func__, path, owner, group); + return my_syscall3(__NR_chown, path, owner, group); #endif } @@ -582,10 +578,8 @@ int sys_link(const char *old, const char *new) { #if defined(__NR_linkat) return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); -#elif defined(__NR_link) - return my_syscall2(__NR_link, old, new); #else - return __nolibc_enosys(__func__, old, new); + return my_syscall2(__NR_link, old, new); #endif } @@ -653,10 +647,8 @@ int sys_mkdir(const char *path, mode_t mode) { #if defined(__NR_mkdirat) return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); -#elif defined(__NR_mkdir) - return my_syscall2(__NR_mkdir, path, mode); #else - return __nolibc_enosys(__func__, path, mode); + return my_syscall2(__NR_mkdir, path, mode); #endif } @@ -675,10 +667,8 @@ int sys_rmdir(const char *path) { #if defined(__NR_rmdir) return my_syscall1(__NR_rmdir, path); -#elif defined(__NR_unlinkat) - return my_syscall3(__NR_unlinkat, AT_FDCWD, path, AT_REMOVEDIR); #else - return __nolibc_enosys(__func__, path); + return my_syscall3(__NR_unlinkat, AT_FDCWD, path, AT_REMOVEDIR); #endif } @@ -698,10 +688,8 @@ long sys_mknod(const char *path, mode_t mode, dev_t dev) { #if defined(__NR_mknodat) return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); -#elif defined(__NR_mknod) - return my_syscall3(__NR_mknod, path, mode, dev); #else - return 
__nolibc_enosys(__func__, path, mode, dev); + return my_syscall3(__NR_mknod, path, mode, dev); #endif } @@ -885,10 +873,8 @@ int sys_symlink(const char *old, const char *new) { #if defined(__NR_symlinkat) return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); -#elif defined(__NR_symlink) - return my_syscall2(__NR_symlink, old, new); #else - return __nolibc_enosys(__func__, old, new); + return my_syscall2(__NR_symlink, old, new); #endif } @@ -942,10 +928,8 @@ int sys_unlink(const char *path) { #if defined(__NR_unlinkat) return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); -#elif defined(__NR_unlink) - return my_syscall1(__NR_unlink, path); #else - return __nolibc_enosys(__func__, path); + return my_syscall1(__NR_unlink, path); #endif } From 09adec1f4b446e8788c70022fae41356ee916260 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:40:34 +0200 Subject: [PATCH 0688/3206] tools/nolibc: remove __nolibc_enosys() fallback from dup2() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All architectures have one of the real functions available. The additional fallback to __nolibc_enosys() is superfluous. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250821-nolibc-enosys-v1-3-4b63f2caaa89@weissschuh.net --- tools/include/nolibc/sys.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 12030c80317375..32cc741f2f7d70 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -248,10 +248,8 @@ int sys_dup2(int old, int new) } return my_syscall3(__NR_dup3, old, new, 0); -#elif defined(__NR_dup2) - return my_syscall2(__NR_dup2, old, new); #else - return __nolibc_enosys(__func__, old, new); + return my_syscall2(__NR_dup2, old, new); #endif } From fbd47de75502c63933016912325582309d72cd03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:40:35 +0200 Subject: [PATCH 0689/3206] tools/nolibc: remove __nolibc_enosys() fallback from fork functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All architectures have one of the real functions available. The additional fallback to __nolibc_enosys() is superfluous. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250821-nolibc-enosys-v1-4-4b63f2caaa89@weissschuh.net --- tools/include/nolibc/sys.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 32cc741f2f7d70..34f20182792a05 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -334,10 +334,8 @@ pid_t sys_fork(void) * will not use the rest with no other flag. */ return my_syscall5(__NR_clone, SIGCHLD, 0, 0, 0, 0); -#elif defined(__NR_fork) - return my_syscall0(__NR_fork); #else - return __nolibc_enosys(__func__); + return my_syscall0(__NR_fork); #endif } #endif @@ -354,7 +352,7 @@ pid_t sys_vfork(void) { #if defined(__NR_vfork) return my_syscall0(__NR_vfork); -#elif defined(__NR_clone3) +#else /* * clone() could be used but has different argument orders per * architecture. 
@@ -365,8 +363,6 @@ pid_t sys_vfork(void) }; return my_syscall2(__NR_clone3, &args, sizeof(args)); -#else - return __nolibc_enosys(__func__); #endif } #endif From f11e156e0f1a582744fc04b614599fc8bcfc1da6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:40:36 +0200 Subject: [PATCH 0690/3206] tools/nolibc: fold llseek fallback into lseek() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Align the implementation of the fallback handling inside sys_lseek() with the rest of nolibc. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250821-nolibc-enosys-v1-5-4b63f2caaa89@weissschuh.net --- tools/include/nolibc/sys.h | 42 +++++++++++++------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 34f20182792a05..c5564f57deec88 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -594,41 +594,27 @@ off_t sys_lseek(int fd, off_t offset, int whence) #if defined(__NR_lseek) return my_syscall3(__NR_lseek, fd, offset, whence); #else - return __nolibc_enosys(__func__, fd, offset, whence); -#endif -} + __kernel_loff_t loff = 0; + off_t result; + int ret; -static __attribute__((unused)) -int sys_llseek(int fd, unsigned long offset_high, unsigned long offset_low, - __kernel_loff_t *result, int whence) -{ -#if defined(__NR_llseek) - return my_syscall5(__NR_llseek, fd, offset_high, offset_low, result, whence); -#else - return __nolibc_enosys(__func__, fd, offset_high, offset_low, result, whence); + /* Only exists on 32bit where nolibc off_t is also 32bit */ + ret = my_syscall5(__NR_llseek, fd, 0, offset, &loff, whence); + if (ret < 0) + result = ret; + else if (loff != (off_t)loff) + result = -EOVERFLOW; + else + result = loff; + + return result; #endif } static __attribute__((unused)) off_t lseek(int fd, off_t offset, int whence) { - __kernel_loff_t loff = 0; - off_t result; - int ret; - - result = sys_lseek(fd, offset, whence); - if (result == -ENOSYS) { - /* Only exists on 32bit where nolibc off_t is also 32bit */ - ret = sys_llseek(fd, 0, offset, &loff, whence); - if (ret < 0) - result = ret; - else if (loff != (off_t)loff) - result = -EOVERFLOW; - else - result = loff; - } - - return __sysret(result); + return __sysret(sys_lseek(fd, offset, whence)); } From 61a3cf7934b6da3c926cd9961860dd94eb7192ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:40:37 +0200 Subject: [PATCH 0691/3206] kselftest/arm64: tpidr2: Switch to waitpid() over wait4() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit wait4() is deprecated, non-standard and about to be removed from nolibc. Switch to the equivalent waitpid() call. 
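For reference, waitpid(pid, status, options) is equivalent to wait4(pid, status, options, NULL), only dropping the rusage out-parameter; a standalone sketch:

#include <sys/wait.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	pid_t pid = fork();
	int status;

	if (pid == 0)
		_exit(42);

	/* Equivalent to: wait4(pid, &status, 0, NULL); */
	if (waitpid(pid, &status, 0) == pid && WIFEXITED(status))
		printf("child exited with %d\n", WEXITSTATUS(status));
	return 0;
}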
Signed-off-by: Thomas Weißschuh Reviewed-by: Mark Brown Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20250821-nolibc-enosys-v1-6-4b63f2caaa89@weissschuh.net --- tools/testing/selftests/arm64/abi/tpidr2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index f58a9f89b952c4..3b520b7efa49e0 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -182,16 +182,16 @@ static int write_clone_read(void) } for (;;) { - waiting = wait4(ret, &status, __WCLONE, NULL); + waiting = waitpid(ret, &status, __WCLONE); if (waiting < 0) { if (errno == EINTR) continue; - ksft_print_msg("wait4() failed: %d\n", errno); + ksft_print_msg("waitpid() failed: %d\n", errno); return 0; } if (waiting != ret) { - ksft_print_msg("wait4() returned wrong PID %d\n", + ksft_print_msg("waitpid() returned wrong PID %d\n", waiting); return 0; } From 4c2ef951cfe68bcd823d59a9e7ef9e3c16aa3974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:40:38 +0200 Subject: [PATCH 0692/3206] tools/nolibc: drop wait4() support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all architectures implement the wait4() syscall. It can be implemented in terms of the waitid() syscall, but that would require some rework of the other wait-related functions in wait.h. As wait4() is non-standard and deprecated, remove it. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250821-nolibc-enosys-v1-7-4b63f2caaa89@weissschuh.net --- tools/include/nolibc/sys/wait.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tools/include/nolibc/sys/wait.h b/tools/include/nolibc/sys/wait.h index 56ddb806da7f24..4e66e1f7a03e45 100644 --- a/tools/include/nolibc/sys/wait.h +++ b/tools/include/nolibc/sys/wait.h @@ -16,27 +16,10 @@ /* * pid_t wait(int *status); - * pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage); * pid_t waitpid(pid_t pid, int *status, int options); * int waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options); */ -static __attribute__((unused)) -pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ -#ifdef __NR_wait4 - return my_syscall4(__NR_wait4, pid, status, options, rusage); -#else - return __nolibc_enosys(__func__, pid, status, options, rusage); -#endif -} - -static __attribute__((unused)) -pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ - return __sysret(sys_wait4(pid, status, options, rusage)); -} - static __attribute__((unused)) int sys_waitid(int which, pid_t pid, siginfo_t *infop, int options, struct rusage *rusage) { From 6d33ce3634f99e0c6c9ce9fc111261f2c411cb48 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 31 Jul 2025 22:12:22 +0200 Subject: [PATCH 0693/3206] selftests/nolibc: fix EXPECT_NZ macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The expect non-zero macro was incorrect and never used. Fix its definition. 
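A compilable sketch of the macro shape being fixed; the do { } while (0) wrapper keeps the expansion a single statement under if/else (helper names here are illustrative, not the test suite's real ones):

#include <stdio.h>

/* Mirrors the fixed shape: a condition gate, then the actual check. */
#define EXPECT_NZ_SKETCH(cond, expr) \
	do { if (!(cond)) puts("SKIPPED"); else printf("%s -> %s\n", #expr, (expr) ? "OK" : "FAIL"); } while (0)

int main(void)
{
	if (1)
		EXPECT_NZ_SKETCH(1, 2 + 2); /* expands safely as one statement */
	else
		EXPECT_NZ_SKETCH(1, 0);
	return 0;
}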
Fixes: 362aecb2d8cfa ("selftests/nolibc: add basic infrastructure to ease creation of nolibc tests") Signed-off-by: Benjamin Berg Link: https://lore.kernel.org/r/20250731201225.323254-2-benjamin@sipsolutions.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index cc4d730ac4656f..d074878eb23412 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -196,8 +196,8 @@ int expect_zr(int expr, int llen) } -#define EXPECT_NZ(cond, expr, val) \ - do { if (!(cond)) result(llen, SKIPPED); else ret += expect_nz(expr, llen; } while (0) +#define EXPECT_NZ(cond, expr) \ + do { if (!(cond)) result(llen, SKIPPED); else ret += expect_nz(expr, llen); } while (0) static __attribute__((unused)) int expect_nz(int expr, int llen) From ca38943e83c7274a7b03b36c02807ea6d6adaac2 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 31 Jul 2025 22:12:23 +0200 Subject: [PATCH 0694/3206] selftests/nolibc: remove outdated comment about construct order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The constructor order is not (and should not) be tested. Remove the comment. Fixes: a782d45c867c ("selftests/nolibc: stop testing constructor order") Signed-off-by: Benjamin Berg Link: https://lore.kernel.org/r/20250731201225.323254-3-benjamin@sipsolutions.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index d074878eb23412..29de21595fc953 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -686,7 +686,6 @@ int expect_strtox(int llen, void *func, const char *input, int base, intmax_t ex #define CASE_TEST(name) \ case __LINE__: llen += printf("%d %s", test, #name); -/* constructors validate that they are executed in definition order */ __attribute__((constructor)) static void constructor1(void) { From e63419dbf2ceb083c1651852209c7f048089ac0f Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Sat, 30 Aug 2025 11:49:53 +0200 Subject: [PATCH 0695/3206] dmaengine: ti: edma: Fix memory allocation size for queue_priority_map Fix a critical memory allocation bug in edma_setup_from_hw() where queue_priority_map was allocated with insufficient memory. The code declared queue_priority_map as s8 (*)[2] (pointer to array of 2 s8), but allocated memory using sizeof(s8) instead of the correct size. This caused out-of-bounds memory writes when accessing: queue_priority_map[i][0] = i; queue_priority_map[i][1] = i; The bug manifested as kernel crashes with "Oops - undefined instruction" on ARM platforms (BeagleBoard-X15) during EDMA driver probe, as the memory corruption triggered kernel hardening features on Clang. Change the allocation to use sizeof(*queue_priority_map) which automatically gets the correct size for the 2D array structure. 
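The size mismatch is easy to demonstrate in isolation: for a pointer to an array of 2 s8, the pointee size is 2, so the old n * sizeof(s8) allocation reserved only half the required memory.

#include <stdio.h>

typedef signed char s8;

int main(void)
{
	s8 (*queue_priority_map)[2]; /* as declared in the driver */

	printf("sizeof(s8)                  = %zu\n", sizeof(s8));  /* 1: too small  */
	printf("sizeof(*queue_priority_map) = %zu\n",
	       sizeof(*queue_priority_map));                        /* 2: per element */
	return 0;
}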
Fixes: 2b6b3b742019 ("ARM/dmaengine: edma: Merge the two drivers under drivers/dma/") Signed-off-by: Anders Roxell Link: https://lore.kernel.org/r/20250830094953.3038012-1-anders.roxell@linaro.org Signed-off-by: Vinod Koul --- drivers/dma/ti/edma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/ti/edma.c b/drivers/dma/ti/edma.c index 3ed406f08c442e..552be71db6c47b 100644 --- a/drivers/dma/ti/edma.c +++ b/drivers/dma/ti/edma.c @@ -2064,8 +2064,8 @@ static int edma_setup_from_hw(struct device *dev, struct edma_soc_info *pdata, * priority. So Q0 is the highest priority queue and the last queue has * the lowest priority. */ - queue_priority_map = devm_kcalloc(dev, ecc->num_tc + 1, sizeof(s8), - GFP_KERNEL); + queue_priority_map = devm_kcalloc(dev, ecc->num_tc + 1, + sizeof(*queue_priority_map), GFP_KERNEL); if (!queue_priority_map) return -ENOMEM; From fe4ffdbab4bb3cda1d72b93f7d2c3f7ea3df415d Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 4 Aug 2025 13:49:48 +0200 Subject: [PATCH 0696/3206] leds: qnap-mcu: Fix state numbering for USB LED The "@Cx" commands span a number of different functions, from the status and USB LEDs to the buzzer and power button. So change the USB LED enum to start at 0 and adapt the offset accordingly, so as not to suggest that "@CD" relates to the USB LED, when in fact "@CD" is a state of the status LED. Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250804114949.3127417-2-heiko@sntech.de Signed-off-by: Lee Jones --- drivers/leds/leds-qnap-mcu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/leds/leds-qnap-mcu.c b/drivers/leds/leds-qnap-mcu.c index 4e470945626170..fe055c5511b8f2 100644 --- a/drivers/leds/leds-qnap-mcu.c +++ b/drivers/leds/leds-qnap-mcu.c @@ -104,9 +104,9 @@ static int qnap_mcu_register_err_led(struct device *dev, struct qnap_mcu *mcu, i } enum qnap_mcu_usb_led_mode { - QNAP_MCU_USB_LED_ON = 1, - QNAP_MCU_USB_LED_OFF = 3, - QNAP_MCU_USB_LED_BLINK = 2, + QNAP_MCU_USB_LED_ON = 0, + QNAP_MCU_USB_LED_OFF = 2, + QNAP_MCU_USB_LED_BLINK = 1, }; struct qnap_mcu_usb_led { @@ -137,7 +137,7 @@ static int qnap_mcu_usb_led_set(struct led_classdev *led_cdev, * Byte 3 is shared between the usb led target on/off/blink * and also the buzzer control (in the input driver) */ - cmd[2] = 'D' + usb_led->mode; + cmd[2] = 'E' + usb_led->mode; return qnap_mcu_exec_with_ack(usb_led->mcu, cmd, sizeof(cmd)); } @@ -161,7 +161,7 @@ static int qnap_mcu_usb_led_blink_set(struct led_classdev *led_cdev, * Byte 3 is shared between the USB LED target on/off/blink * and also the buzzer control (in the input driver) */ - cmd[2] = 'D' + usb_led->mode; + cmd[2] = 'E' + usb_led->mode; return qnap_mcu_exec_with_ack(usb_led->mcu, cmd, sizeof(cmd)); } From c2d5d8f247049e22ef0afe8188cf8a6430144a17 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 4 Aug 2025 13:49:49 +0200 Subject: [PATCH 0697/3206] leds: qnap-mcu: Add support for the red and green status LEDs There is one more set of two LEDs on the QNAP devices to indicate status. One LED is green, the other red; while they occupy the same space on the front panel, they cannot be enabled at the same time. They can, however, interact via blink functions: the MCU can flash them alternately, going red -> green -> red -> ... in either 500ms or 1s intervals. They can of course also blink individually. Add specific LED functions for them and register them on probe.
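A standalone sketch of the shared state byte, using a subset of the encoding table implemented below in qnap_mcu_status_led_encode():

#include <stdio.h>

enum mode { OFF, ON, BLINK_FAST, BLINK_SLOW };

/* Both LEDs fold into one '@Cx' command byte; subset of the full table. */
static char status_encode(enum mode green, enum mode red)
{
	if (green == OFF && red == OFF)
		return '9';
	if (green == ON && red == OFF)
		return '6';
	if (green == OFF && red == ON)
		return '7';
	if (green == ON && red == ON)
		return 'D';
	return '8'; /* mixed blink states default to fast alternation */
}

int main(void)
{
	printf("green: @C%c, red: @C%c, both: @C%c\n",
	       status_encode(ON, OFF), status_encode(OFF, ON),
	       status_encode(ON, ON));
	return 0;
}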
Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250804114949.3127417-3-heiko@sntech.de Signed-off-by: Lee Jones --- drivers/leds/leds-qnap-mcu.c | 165 +++++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) diff --git a/drivers/leds/leds-qnap-mcu.c b/drivers/leds/leds-qnap-mcu.c index fe055c5511b8f2..6df110e33ac9c4 100644 --- a/drivers/leds/leds-qnap-mcu.c +++ b/drivers/leds/leds-qnap-mcu.c @@ -190,6 +190,166 @@ static int qnap_mcu_register_usb_led(struct device *dev, struct qnap_mcu *mcu) return qnap_mcu_usb_led_set(&usb_led->cdev, 0); } +enum qnap_mcu_status_led_mode { + QNAP_MCU_STATUS_LED_OFF = 0, + QNAP_MCU_STATUS_LED_ON = 1, + QNAP_MCU_STATUS_LED_BLINK_FAST = 2, /* 500ms / 500ms */ + QNAP_MCU_STATUS_LED_BLINK_SLOW = 3, /* 1s / 1s */ +}; + +struct qnap_mcu_status_led { + struct led_classdev cdev; + struct qnap_mcu_status_led *red; + u8 mode; +}; + +struct qnap_mcu_status { + struct qnap_mcu *mcu; + struct qnap_mcu_status_led red; + struct qnap_mcu_status_led green; +}; + +static inline struct qnap_mcu_status_led *cdev_to_qnap_mcu_status_led(struct led_classdev *led_cdev) +{ + return container_of(led_cdev, struct qnap_mcu_status_led, cdev); +} + +static inline struct qnap_mcu_status *statusled_to_qnap_mcu_status(struct qnap_mcu_status_led *led) +{ + return container_of(led->red, struct qnap_mcu_status, red); +} + +static u8 qnap_mcu_status_led_encode(struct qnap_mcu_status *status) +{ + if (status->red.mode == QNAP_MCU_STATUS_LED_OFF) { + switch (status->green.mode) { + case QNAP_MCU_STATUS_LED_OFF: + return '9'; + case QNAP_MCU_STATUS_LED_ON: + return '6'; + case QNAP_MCU_STATUS_LED_BLINK_FAST: + return '5'; + case QNAP_MCU_STATUS_LED_BLINK_SLOW: + return 'A'; + } + } else if (status->green.mode == QNAP_MCU_STATUS_LED_OFF) { + switch (status->red.mode) { + case QNAP_MCU_STATUS_LED_OFF: + return '9'; + case QNAP_MCU_STATUS_LED_ON: + return '7'; + case QNAP_MCU_STATUS_LED_BLINK_FAST: + return '4'; + case QNAP_MCU_STATUS_LED_BLINK_SLOW: + return 'B'; + } + } else if (status->green.mode == QNAP_MCU_STATUS_LED_ON && + status->red.mode == QNAP_MCU_STATUS_LED_ON) { + return 'D'; + } else if (status->green.mode == QNAP_MCU_STATUS_LED_BLINK_SLOW && + status->red.mode == QNAP_MCU_STATUS_LED_BLINK_SLOW) { + return 'C'; + } + + /* + * Here both LEDs are on in some fashion, either both blinking fast, + * or in different speeds, so default to fast blinking for both. + */ + return '8'; +} + +static int qnap_mcu_status_led_update(struct qnap_mcu *mcu, + struct qnap_mcu_status *status) +{ + u8 cmd[] = { '@', 'C', 0 }; + + cmd[2] = qnap_mcu_status_led_encode(status); + + return qnap_mcu_exec_with_ack(mcu, cmd, sizeof(cmd)); +} + +static int qnap_mcu_status_led_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct qnap_mcu_status_led *status_led = cdev_to_qnap_mcu_status_led(led_cdev); + struct qnap_mcu_status *base = statusled_to_qnap_mcu_status(status_led); + + /* Don't disturb a possible set blink-mode if LED stays on */ + if (brightness != 0 && status_led->mode >= QNAP_MCU_STATUS_LED_BLINK_FAST) + return 0; + + status_led->mode = brightness ? 
QNAP_MCU_STATUS_LED_ON : + QNAP_MCU_STATUS_LED_OFF; + + return qnap_mcu_status_led_update(base->mcu, base); +} + +static int qnap_mcu_status_led_blink_set(struct led_classdev *led_cdev, + unsigned long *delay_on, + unsigned long *delay_off) +{ + struct qnap_mcu_status_led *status_led = cdev_to_qnap_mcu_status_led(led_cdev); + struct qnap_mcu_status *base = statusled_to_qnap_mcu_status(status_led); + + if (status_led->mode == QNAP_MCU_STATUS_LED_OFF) + return 0; + + if (*delay_on <= 500) { + *delay_on = 500; + *delay_off = 500; + status_led->mode = QNAP_MCU_STATUS_LED_BLINK_FAST; + } else { + *delay_on = 1000; + *delay_off = 1000; + status_led->mode = QNAP_MCU_STATUS_LED_BLINK_SLOW; + } + + return qnap_mcu_status_led_update(base->mcu, base); +} + +static int qnap_mcu_register_status_leds(struct device *dev, struct qnap_mcu *mcu) +{ + struct qnap_mcu_status *status; + int ret; + + status = devm_kzalloc(dev, sizeof(*status), GFP_KERNEL); + if (!status) + return -ENOMEM; + + status->mcu = mcu; + + /* + * point to the red led, so that statusled_to_qnap_mcu_status + * can resolve the main status struct containing both leds + */ + status->red.red = &status->red; + status->green.red = &status->red; + + status->red.mode = QNAP_MCU_STATUS_LED_OFF; + status->red.cdev.name = "red:status"; + status->red.cdev.brightness_set_blocking = qnap_mcu_status_led_set; + status->red.cdev.blink_set = qnap_mcu_status_led_blink_set; + status->red.cdev.brightness = 0; + status->red.cdev.max_brightness = 1; + + status->green.mode = QNAP_MCU_STATUS_LED_OFF; + status->green.cdev.name = "green:status"; + status->green.cdev.brightness_set_blocking = qnap_mcu_status_led_set; + status->green.cdev.blink_set = qnap_mcu_status_led_blink_set; + status->green.cdev.brightness = 0; + status->green.cdev.max_brightness = 1; + + ret = devm_led_classdev_register(dev, &status->red.cdev); + if (ret) + return ret; + + ret = devm_led_classdev_register(dev, &status->green.cdev); + if (ret) + return ret; + + return qnap_mcu_status_led_update(status->mcu, status); +} + static int qnap_mcu_leds_probe(struct platform_device *pdev) { struct qnap_mcu *mcu = dev_get_drvdata(pdev->dev.parent); @@ -210,6 +370,11 @@ static int qnap_mcu_leds_probe(struct platform_device *pdev) "failed to register USB LED\n"); } + ret = qnap_mcu_register_status_leds(&pdev->dev, mcu); + if (ret) + return dev_err_probe(&pdev->dev, ret, + "failed to register status LEDs\n"); + return 0; } From 7e2368a21741e2db542330b32aa6fdd8908e7cff Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 28 Aug 2025 16:17:33 +0800 Subject: [PATCH 0698/3206] dma-debug: don't enforce dma mapping check on noncoherent allocations As discussed in [1], there is no need to enforce dma mapping check on noncoherent allocations, a simple test on the returned CPU address is good enough. Add a new pair of debug helpers and use them for noncoherent alloc/free to fix this issue. 
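From a driver's point of view, the allocations now tracked as "noncoherent" come from the dma_alloc_pages() API; a sketch of a typical caller (kernel context, not standalone; error paths trimmed):

#include <linux/dma-mapping.h>
#include <linux/gfp.h>

/* The returned page is CPU-addressable directly; the handle is used
 * for device access, with explicit dma_sync_* calls where needed. */
static int demo_noncoherent_alloc(struct device *dev)
{
	dma_addr_t handle;
	struct page *page;

	page = dma_alloc_pages(dev, PAGE_SIZE, &handle,
			       DMA_TO_DEVICE, GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* ... fill via page_address(page), sync, start device I/O ... */

	dma_free_pages(dev, PAGE_SIZE, page, handle, DMA_TO_DEVICE);
	return 0;
}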
Fixes: efa70f2fdc84 ("dma-mapping: add a new dma_alloc_pages API") Link: https://lore.kernel.org/all/ff6c1fe6-820f-4e58-8395-df06aa91706c@oss.qualcomm.com # 1 Signed-off-by: Baochen Qiang Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250828-dma-debug-fix-noncoherent-dma-check-v1-1-76e9be0dd7fc@oss.qualcomm.com --- kernel/dma/debug.c | 48 +++++++++++++++++++++++++++++++++++++++++++- kernel/dma/debug.h | 20 ++++++++++++++++++ kernel/dma/mapping.c | 4 ++-- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index e43c6de2bce4e7..b82399437db031 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -39,6 +39,7 @@ enum { dma_debug_sg, dma_debug_coherent, dma_debug_resource, + dma_debug_noncoherent, }; enum map_err_types { @@ -141,6 +142,7 @@ static const char *type2name[] = { [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", [dma_debug_resource] = "resource", + [dma_debug_noncoherent] = "noncoherent", }; static const char *dir2name[] = { @@ -993,7 +995,8 @@ static void check_unmap(struct dma_debug_entry *ref) "[mapped as %s] [unmapped as %s]\n", ref->dev_addr, ref->size, type2name[entry->type], type2name[ref->type]); - } else if (entry->type == dma_debug_coherent && + } else if ((entry->type == dma_debug_coherent || + entry->type == dma_debug_noncoherent) && ref->paddr != entry->paddr) { err_printk(ref->dev, entry, "device driver frees " "DMA memory with different CPU address " @@ -1581,6 +1584,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } } +void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_noncoherent; + entry->dev = dev; + entry->paddr = page_to_phys(page); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + + add_dma_entry(entry, attrs); +} + +void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_noncoherent, + .dev = dev, + .paddr = page_to_phys(page), + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} + static int __init dma_debug_driver_setup(char *str) { int i; diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index f525197d3cae60..48757ca13f3140 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -54,6 +54,13 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev, extern void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction); +extern void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs); +extern void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, @@ -126,5 +133,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, int nelems, int direction) { } + +static inline void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + 
unsigned long attrs) +{ +} + +static inline void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} #endif /* CONFIG_DMA_API_DEBUG */ #endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 107e4a4d251df6..56de28a3b1799f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -712,7 +712,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, if (page) { trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, size, dir, gfp, 0); - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); + debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0); } else { trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } @@ -738,7 +738,7 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); - debug_dma_unmap_page(dev, dma_handle, size, dir); + debug_dma_free_pages(dev, page, size, dir, dma_handle); __dma_free_pages(dev, size, page, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_pages); From e51bd0e595476c1527bb0b4def095a6fd16b2563 Mon Sep 17 00:00:00 2001 From: Xing Guo Date: Wed, 13 Aug 2025 11:16:47 +0800 Subject: [PATCH 0699/3206] selftests/fs/mount-notify: Fix compilation failure. Commit c6d9775c2066 ("selftests/fs/mount-notify: build with tools include dir") introduced the struct __kernel_fsid_t to decouple the dependency on headers_install. The commit forgot to define a macro for __kernel_fsid_t, which causes a type re-definition issue. Signed-off-by: Xing Guo Link: https://lore.kernel.org/20250813031647.96411-1-higuoxing@gmail.com Acked-by: Amir Goldstein Closes: https://lore.kernel.org/oe-lkp/202508110628.65069d92-lkp@intel.com Signed-off-by: Christian Brauner --- .../mount-notify/mount-notify_test.c | 17 ++++++++--------- .../mount-notify/mount-notify_test_ns.c | 18 ++++++++---------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c index 63ce708d93ed06..e4b7c2b457ee7a 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c @@ -2,6 +2,13 @@ // Copyright (c) 2025 Miklos Szeredi #define _GNU_SOURCE + +// Needed for linux/fanotify.h +typedef struct { + int val[2]; +} __kernel_fsid_t; +#define __kernel_fsid_t __kernel_fsid_t + #include #include #include @@ -10,20 +17,12 @@ #include #include #include +#include #include "../../kselftest_harness.h" #include "../statmount/statmount.h" #include "../utils.h" -// Needed for linux/fanotify.h -#ifndef __kernel_fsid_t -typedef struct { - int val[2]; -} __kernel_fsid_t; -#endif - -#include - static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; static const int mark_cmds[] = { diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c index 090a5ca65004a0..9f57ca46e3afa0 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c @@ -2,6 +2,13 @@ // Copyright (c) 2025 Miklos Szeredi #define _GNU_SOURCE + +// Needed for linux/fanotify.h +typedef struct { + int val[2]; +} __kernel_fsid_t; +#define
__kernel_fsid_t __kernel_fsid_t + #include #include #include @@ -10,21 +17,12 @@ #include #include #include +#include #include "../../kselftest_harness.h" -#include "../../pidfd/pidfd.h" #include "../statmount/statmount.h" #include "../utils.h" -// Needed for linux/fanotify.h -#ifndef __kernel_fsid_t -typedef struct { - int val[2]; -} __kernel_fsid_t; -#endif - -#include - static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; static const int mark_types[] = { From 11c2b7ec2e1865bbc6a65e7d7312a5043e3cc1aa Mon Sep 17 00:00:00 2001 From: Askar Safin Date: Mon, 25 Aug 2025 18:12:30 +0000 Subject: [PATCH 0700/3206] namei: move cross-device check to traverse_mounts This is preparation for the RESOLVE_NO_XDEV fix in the following commits. No functional change intended. Signed-off-by: Askar Safin Link: https://lore.kernel.org/20250825181233.2464822-2-safinaskar@zohomail.com Signed-off-by: Christian Brauner --- fs/namei.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 138a693c234614..f81fdc7bbfedac 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1518,6 +1518,7 @@ static inline int traverse_mounts(struct path *path, bool *jumped, int *count, unsigned lookup_flags) { unsigned flags = smp_load_acquire(&path->dentry->d_flags); + int ret; /* fastpath */ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { @@ -1526,7 +1527,11 @@ static inline int traverse_mounts(struct path *path, bool *jumped, return -ENOENT; return 0; } - return __traverse_mounts(path, flags, jumped, count, lookup_flags); + + ret = __traverse_mounts(path, flags, jumped, count, lookup_flags); + if (*jumped && unlikely(lookup_flags & LOOKUP_NO_XDEV)) + return -EXDEV; + return ret; } int follow_down_one(struct path *path) @@ -1631,9 +1636,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - ret = -EXDEV; - else + if (!unlikely(nd->flags & LOOKUP_NO_XDEV)) nd->state |= ND_JUMPED; } if (unlikely(ret)) { From 8b966d00b3ece6b1ffa4b6d73d484cf0ecf967e6 Mon Sep 17 00:00:00 2001 From: Askar Safin Date: Mon, 25 Aug 2025 18:12:31 +0000 Subject: [PATCH 0701/3206] namei: remove LOOKUP_NO_XDEV check from handle_mounts This is preparation for the RESOLVE_NO_XDEV fix in the following commits. No functional change intended.
The only place that ever looks at ND_JUMPED in nd->state is complete_walk(), and we are not going to reach it if handle_mounts() returns an error. Signed-off-by: Askar Safin Link: https://lore.kernel.org/20250825181233.2464822-3-safinaskar@zohomail.com Signed-off-by: Christian Brauner --- fs/namei.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index f81fdc7bbfedac..6e34c331742198 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1635,10 +1635,8 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); - if (jumped) { - if (!unlikely(nd->flags & LOOKUP_NO_XDEV)) - nd->state |= ND_JUMPED; - } + if (jumped) + nd->state |= ND_JUMPED; if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) From 8ded1fde0827e52f3962d7931193f5a16d87a52c Mon Sep 17 00:00:00 2001 From: Askar Safin Date: Mon, 25 Aug 2025 18:12:32 +0000 Subject: [PATCH 0702/3206] namei: move cross-device check to __traverse_mounts This is preparation for the RESOLVE_NO_XDEV fix in the following commits. Also, this commit makes the LOOKUP_NO_XDEV logic clearer: we now fail immediately with EXDEV on the first mount crossing instead of waiting until the very end. No functional change intended. Signed-off-by: Askar Safin Link: https://lore.kernel.org/20250825181233.2464822-4-safinaskar@zohomail.com Signed-off-by: Christian Brauner --- fs/namei.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 6e34c331742198..f0ca6f8d0a5f55 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1489,6 +1489,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; + if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) { + ret = -EXDEV; + break; + } continue; } } @@ -1518,7 +1522,6 @@ static inline int traverse_mounts(struct path *path, bool *jumped, int *count, unsigned lookup_flags) { unsigned flags = smp_load_acquire(&path->dentry->d_flags); - int ret; /* fastpath */ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { @@ -1527,11 +1530,7 @@ static inline int traverse_mounts(struct path *path, bool *jumped, return -ENOENT; return 0; } - - ret = __traverse_mounts(path, flags, jumped, count, lookup_flags); - if (*jumped && unlikely(lookup_flags & LOOKUP_NO_XDEV)) - return -EXDEV; - return ret; + return __traverse_mounts(path, flags, jumped, count, lookup_flags); } int follow_down_one(struct path *path) From 042a60680de43175eb4df0977ff04a4eba9da082 Mon Sep 17 00:00:00 2001 From: Askar Safin Date: Mon, 25 Aug 2025 18:12:33 +0000 Subject: [PATCH 0703/3206] openat2: don't trigger automounts with RESOLVE_NO_XDEV openat2 had a bug: if we pass RESOLVE_NO_XDEV, then openat2 doesn't traverse through automounts, but may still trigger them. (See the link for the full bug report with a reproducer.) This commit fixes this bug.
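For illustration, a minimal userspace sketch of the kind of call this fix affects (not part of the patch; the path is a hypothetical placeholder):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/openat2.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int open_no_xdev(const char *path)
    {
            struct open_how how;

            memset(&how, 0, sizeof(how));
            how.flags = O_RDONLY;
            how.resolve = RESOLVE_NO_XDEV;  /* refuse any mount crossing */

            /* glibc has no wrapper for openat2(2), call it directly */
            return syscall(SYS_openat2, AT_FDCWD, path, &how, sizeof(how));
    }

With the fix, if "path" sits below an automount point, the call fails with EXDEV before the automount is ever triggered.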
Link: https://lore.kernel.org/linux-fsdevel/20250817075252.4137628-1-safinaskar@zohomail.com/ Fixes: fddb5d430ad9fa91b49b1 ("open: introduce openat2(2) syscall") Reviewed-by: Aleksa Sarai Cc: stable@vger.kernel.org Signed-off-by: Askar Safin Link: https://lore.kernel.org/20250825181233.2464822-5-safinaskar@zohomail.com Signed-off-by: Christian Brauner --- fs/namei.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index f0ca6f8d0a5f55..44856b70ea3b26 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1449,6 +1449,10 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags dentry->d_inode) return -EISDIR; + /* No need to trigger automounts if mountpoint crossing is disabled. */ + if (lookup_flags & LOOKUP_NO_XDEV) + return -EXDEV; + if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; @@ -1472,6 +1476,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { + if (lookup_flags & LOOKUP_NO_XDEV) { + ret = -EXDEV; + break; + } ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) From f1b55db08d527240fbc3b8c84229134a98d8d080 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Fri, 29 Aug 2025 21:57:42 +0200 Subject: [PATCH 0704/3206] rust: device: fix unresolved link to drm::Device drm::Device is only available when CONFIG_DRM=y, which we have to consider for intra-doc links, otherwise the rustdoc make target produces the following warning. >> warning: unresolved link to `kernel::drm::Device` --> rust/kernel/device.rs:154:22 | 154 | /// [`drm::Device`]: kernel::drm::Device | ^^^^^^^^^^^^^^^^^^^ no item named `drm` in module `kernel` | = note: `#[warn(rustdoc::broken_intra_doc_links)]` on by default Fix this by making the intra-doc link conditional on CONFIG_DRM being enabled. Fixes: d6e26c1ae4a6 ("device: rust: expand documentation for Device") Suggested-by: Alice Ryhl Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508261644.9LclwUgt-lkp@intel.com/ Reviewed-by: Alice Ryhl Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250829195745.31174-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- rust/kernel/device.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rust/kernel/device.rs b/rust/kernel/device.rs index 5902b3714a1662..a1db49eb159a37 100644 --- a/rust/kernel/device.rs +++ b/rust/kernel/device.rs @@ -138,7 +138,9 @@ pub mod property; /// } /// ``` /// -/// An example for a class device implementation is [`drm::Device`]. +/// An example for a class device implementation is +#[cfg_attr(CONFIG_DRM = "y", doc = "[`drm::Device`](kernel::drm::Device).")] +#[cfg_attr(not(CONFIG_DRM = "y"), doc = "`drm::Device`.")] /// /// # Invariants /// @@ -151,7 +153,6 @@ pub mod property; /// dropped from any thread. 
/// /// [`AlwaysRefCounted`]: kernel::types::AlwaysRefCounted -/// [`drm::Device`]: kernel::drm::Device /// [`impl_device_context_deref`]: kernel::impl_device_context_deref /// [`pci::Device`]: kernel::pci::Device /// [`platform::Device`]: kernel::platform::Device From aa2e1e4563d3ab689ffa86ca1412ecbf9fd3b308 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 2 Sep 2025 17:03:58 +0800 Subject: [PATCH 0705/3206] dmaengine: dw: dmamux: Fix device reference leak in rzn1_dmamux_route_allocate The reference taken by of_find_device_by_node() must be released when not needed anymore. Add missing put_device() call to fix device reference leaks. Fixes: 134d9c52fca2 ("dmaengine: dw: dmamux: Introduce RZN1 DMA router support") Cc: stable@vger.kernel.org Signed-off-by: Miaoqian Lin Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20250902090358.2423285-1-linmq006@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/dw/rzn1-dmamux.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/dma/dw/rzn1-dmamux.c b/drivers/dma/dw/rzn1-dmamux.c index 4fb8508419dbd8..deadf135681b67 100644 --- a/drivers/dma/dw/rzn1-dmamux.c +++ b/drivers/dma/dw/rzn1-dmamux.c @@ -48,12 +48,16 @@ static void *rzn1_dmamux_route_allocate(struct of_phandle_args *dma_spec, u32 mask; int ret; - if (dma_spec->args_count != RNZ1_DMAMUX_NCELLS) - return ERR_PTR(-EINVAL); + if (dma_spec->args_count != RNZ1_DMAMUX_NCELLS) { + ret = -EINVAL; + goto put_device; + } map = kzalloc(sizeof(*map), GFP_KERNEL); - if (!map) - return ERR_PTR(-ENOMEM); + if (!map) { + ret = -ENOMEM; + goto put_device; + } chan = dma_spec->args[0]; map->req_idx = dma_spec->args[4]; @@ -94,12 +98,15 @@ static void *rzn1_dmamux_route_allocate(struct of_phandle_args *dma_spec, if (ret) goto clear_bitmap; + put_device(&pdev->dev); return map; clear_bitmap: clear_bit(map->req_idx, dmamux->used_chans); free_map: kfree(map); +put_device: + put_device(&pdev->dev); return ERR_PTR(ret); } From 7df87820122acd3204565109f636a1367912655a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 5 Aug 2025 15:45:08 +1000 Subject: [PATCH 0706/3206] pidns: move is-ancestor logic to helper This check will be needed in later patches, and there's no point open-coding it each time. 
Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250805-procfs-pidns-api-v4-1-705f984940e7@cyphar.com Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 9 +++++++++ kernel/pid_namespace.c | 22 ++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a58111998d..17fdc059f8da83 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -84,6 +84,9 @@ extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); +extern bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor); + #else /* !CONFIG_PID_NS */ #include @@ -118,6 +121,12 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } + +static inline bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + return false; +} #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717d3..b7b45c2597ec2f 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -390,11 +390,23 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } +bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + struct pid_namespace *ns; + + if (child->level < ancestor->level) + return false; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return ns == ancestor; +} + static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = to_pid_ns(ns); + struct pid_namespace *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) @@ -408,13 +420,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns) * this maintains the property that processes and their * children can not escape their current pid namespace. */ - if (new->level < active->level) - return -EINVAL; - - ancestor = new; - while (ancestor->level > active->level) - ancestor = ancestor->parent; - if (ancestor != active) + if (!pidns_is_ancestor(new, active)) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); From fe49652e36bb4337330d24dce7d571a49f824605 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 5 Aug 2025 15:45:09 +1000 Subject: [PATCH 0707/3206] procfs: add "pidns" mount option Since the introduction of pid namespaces, their interaction with procfs has been entirely implicit in ways that require a lot of dancing around by programs that need to construct sandboxes with different PID namespaces. Being able to explicitly specify the pid namespace to use when constructing a procfs super block will allow programs to no longer need to fork off a process which then does unshare(2) / setns(2) and forks again in order to construct a procfs in a pidns. So, provide a "pidns" mount option which allows such users to just explicitly state which pid namespace they want that procfs instance to use.
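For contrast, a rough sketch of the old dance this option replaces (illustrative only; error handling is omitted, and "pidns_fd" and "/tmp/proc" are placeholders):

    setns(pidns_fd, CLONE_NEWPID);  /* only affects future children */
    pid_t pid = fork();
    if (pid == 0) {
            /* the child's active pidns is the target one,
             * so its procfs mount picks it up */
            mount("proc", "/tmp/proc", "proc", 0, NULL);
            _exit(0);
    }
    waitpid(pid, NULL, 0);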
This interface can be used with fsconfig(2) either with a file descriptor or a path: fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd); fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0); or with classic mount(2) / mount(8): // mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid"); As this new API is effectively shorthand for setns(2) followed by mount(2), the permission model for this mirrors pidns_install() to avoid opening up new attack surfaces by loosening the existing permission model. In order to avoid having to RCU-protect all users of proc_pid_ns() (to avoid UAFs), attempting to reconfigure an existing procfs instance's pid namespace will error out with -EBUSY. Creating new procfs instances is quite cheap, so this should not be an impediment to most users, and lets us avoid a lot of churn in fs/proc/* for a feature that it seems unlikely userspace would use. Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250805-procfs-pidns-api-v4-2-705f984940e7@cyphar.com Signed-off-by: Christian Brauner --- Documentation/filesystems/proc.rst | 8 +++ fs/proc/root.c | 98 ++++++++++++++++++++++++++++-- 2 files changed, 100 insertions(+), 6 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 2971551b723534..b7e3147ba3d457 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -2362,6 +2362,7 @@ The following mount options are supported: hidepid= Set /proc// access mode. gid= Set the group authorized to learn processes information. subset= Show only the specified subset of procfs. + pidns= Specify the pid namespace used by this procfs. ========= ======================================================== hidepid=off or hidepid=0 means classic mode - everybody may access all @@ -2394,6 +2395,13 @@ information about processes information, just add identd to this group. subset=pid hides all top level files and directories in the procfs that are not related to tasks. +pidns= specifies a pid namespace (either as a string path to something like +`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that +will be used by the procfs instance when translating pids. By default, procfs +will use the calling process's active pid namespace. Note that the pid +namespace of an existing procfs instance cannot be modified (attempting to do +so will give an `-EBUSY` error).
+ Chapter 5: Filesystem behavior ============================== diff --git a/fs/proc/root.c b/fs/proc/root.c index ed86ac7103843b..fd1f1c8a939ad4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,12 +38,14 @@ enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, + Opt_pidns, }; static const struct fs_parameter_spec proc_fs_parameters[] = { - fsparam_u32("gid", Opt_gid), + fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), + fsparam_file_or_string("pidns", Opt_pidns), {} }; @@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value) return 0; } +#ifdef CONFIG_PID_NS +static int proc_parse_pidns_param(struct fs_context *fc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct pid_namespace *target, *active = task_active_pid_ns(current); + struct ns_common *ns; + struct file *ns_filp __free(fput) = NULL; + + switch (param->type) { + case fs_value_is_file: + /* came through fsconfig, steal the file reference */ + ns_filp = no_free_ptr(param->file); + break; + case fs_value_is_string: + ns_filp = filp_open(param->string, O_RDONLY, 0); + break; + default: + WARN_ON_ONCE(true); + break; + } + if (!ns_filp) + ns_filp = ERR_PTR(-EBADF); + if (IS_ERR(ns_filp)) { + errorfc(fc, "could not get file from pidns argument"); + return PTR_ERR(ns_filp); + } + + if (!proc_ns_file(ns_filp)) + return invalfc(fc, "pidns argument is not an nsfs file"); + ns = get_proc_ns(file_inode(ns_filp)); + if (ns->ops->type != CLONE_NEWPID) + return invalfc(fc, "pidns argument is not a pidns file"); + target = container_of(ns, struct pid_namespace, ns); + + /* + * pidns= is shorthand for joining the pidns to get a fsopen fd, so the + * permission model should be the same as pidns_install(). + */ + if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { + errorfc(fc, "insufficient permissions to set pidns"); + return -EPERM; + } + if (!pidns_is_ancestor(target, active)) + return invalfc(fc, "cannot set pidns to non-descendant pidns"); + + put_pid_ns(ctx->pid_ns); + ctx->pid_ns = get_pid_ns(target); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + return 0; +} +#endif /* CONFIG_PID_NS */ + static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + int opt, err; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) @@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_hidepid: - if (proc_parse_hidepid_param(fc, param)) - return -EINVAL; + err = proc_parse_hidepid_param(fc, param); + if (err) + return err; break; case Opt_subset: - if (proc_parse_subset_param(fc, param->string) < 0) - return -EINVAL; + err = proc_parse_subset_param(fc, param->string); + if (err) + return err; + break; + + case Opt_pidns: +#ifdef CONFIG_PID_NS + /* + * We would have to RCU-protect every proc_pid_ns() or + * proc_sb_info() access if we allowed this to be reconfigured + * for an existing procfs instance. Luckily, procfs instances + * are cheap to create, and mount-beneath would let you + * atomically replace an instance even with overmounts. 
+ */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + errorfc(fc, "cannot reconfigure pidns for existing procfs"); + return -EBUSY; + } + err = proc_parse_pidns_param(fc, param, &result); + if (err) + return err; break; +#else + errorfc(fc, "pidns mount flag not supported on this system"); + return -EOPNOTSUPP; +#endif default: return -EINVAL; @@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info, fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; + if (ctx->mask & (1 << Opt_pidns) && + !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { + put_pid_ns(fs_info->pid_ns); + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + } } static int proc_fill_super(struct super_block *s, struct fs_context *fc) From 5554d820f71c72fbe64e12c3d171908c5ef7257d Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 5 Aug 2025 15:45:11 +1000 Subject: [PATCH 0708/3206] selftests/proc: add tests for new pidns APIs Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250805-procfs-pidns-api-v4-4-705f984940e7@cyphar.com Signed-off-by: Christian Brauner --- tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/proc-pidns.c | 211 ++++++++++++++++++++++ 3 files changed, 213 insertions(+) create mode 100644 tools/testing/selftests/proc/proc-pidns.c diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 19bb333e2485f5..6b78a8382d40e4 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -18,6 +18,7 @@ /proc-tid0 /proc-uptime-001 /proc-uptime-002 +/proc-pidns /read /self /setns-dcache diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 50aba102201a9d..be3013515aae57 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -28,5 +28,6 @@ TEST_GEN_PROGS += setns-sysvipc TEST_GEN_PROGS += thread-self TEST_GEN_PROGS += proc-multiple-procfs TEST_GEN_PROGS += proc-fsconfig-hidepid +TEST_GEN_PROGS += proc-pidns include ../lib.mk diff --git a/tools/testing/selftests/proc/proc-pidns.c b/tools/testing/selftests/proc/proc-pidns.c new file mode 100644 index 00000000000000..52500597f95148 --- /dev/null +++ b/tools/testing/selftests/proc/proc-pidns.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai + * Copyright (C) 2025 SUSE LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#define ASSERT_ERRNO(expected, _t, seen) \ + __EXPECT(expected, #expected, \ + ({__typeof__(seen) _tmp_seen = (seen); \ + _tmp_seen >= 0 ? _tmp_seen : -errno; }), #seen, _t, 1) + +#define ASSERT_ERRNO_EQ(expected, seen) \ + ASSERT_ERRNO(expected, ==, seen) + +#define ASSERT_SUCCESS(seen) \ + ASSERT_ERRNO(0, <=, seen) + +static int touch(char *path) +{ + int fd = open(path, O_WRONLY|O_CREAT|O_CLOEXEC, 0644); + if (fd < 0) + return -1; + return close(fd); +} + +FIXTURE(ns) +{ + int host_mntns, host_pidns; + int dummy_pidns; +}; + +FIXTURE_SETUP(ns) +{ + /* Stash the old mntns. */ + self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_mntns); + + /* Create a new mount namespace and make it private. 
*/ + ASSERT_SUCCESS(unshare(CLONE_NEWNS)); + ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL)); + + /* + * Create a proper tmpfs that we can use and will disappear once we + * leave this mntns. + */ + ASSERT_SUCCESS(mount("tmpfs", "/tmp", "tmpfs", 0, NULL)); + + /* + * Create a pidns we can use for later tests. We need to fork off a + * child so that we get a usable nsfd that we can bind-mount and open. + */ + ASSERT_SUCCESS(mkdir("/tmp/dummy", 0755)); + ASSERT_SUCCESS(touch("/tmp/dummy/pidns")); + ASSERT_SUCCESS(mkdir("/tmp/dummy/proc", 0755)); + + self->host_pidns = open("/proc/self/ns/pid", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_pidns); + ASSERT_SUCCESS(unshare(CLONE_NEWPID)); + + pid_t pid = fork(); + ASSERT_SUCCESS(pid); + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGKILL); + ASSERT_SUCCESS(mount("/proc/self/ns/pid", "/tmp/dummy/pidns", NULL, MS_BIND, NULL)); + ASSERT_SUCCESS(mount("proc", "/tmp/dummy/proc", "proc", 0, NULL)); + exit(0); + } + + int wstatus; + ASSERT_EQ(waitpid(pid, &wstatus, 0), pid); + ASSERT_TRUE(WIFEXITED(wstatus)); + ASSERT_EQ(WEXITSTATUS(wstatus), 0); + + ASSERT_SUCCESS(setns(self->host_pidns, CLONE_NEWPID)); + + self->dummy_pidns = open("/tmp/dummy/pidns", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->dummy_pidns); +} + +FIXTURE_TEARDOWN(ns) +{ + ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS)); + ASSERT_SUCCESS(close(self->host_mntns)); + + ASSERT_SUCCESS(close(self->host_pidns)); + ASSERT_SUCCESS(close(self->dummy_pidns)); +} + +TEST_F(ns, pidns_mount_string_path) +{ + ASSERT_SUCCESS(mkdir("/tmp/proc-host", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc-host", "proc", 0, "pidns=/proc/self/ns/pid")); + ASSERT_SUCCESS(access("/tmp/proc-host/self/", X_OK)); + + ASSERT_SUCCESS(mkdir("/tmp/proc-dummy", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc-dummy", "proc", 0, "pidns=/tmp/dummy/pidns")); + ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/1/", X_OK)); + ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/self/", X_OK)); +} + +TEST_F(ns, pidns_fsconfig_string_path) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_fsconfig_fd) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_reconfigure_remount) +{ + ASSERT_SUCCESS(mkdir("/tmp/proc", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc", "proc", 0, "")); + + ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK)); + ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK)); + + ASSERT_ERRNO_EQ(-EBUSY, mount(NULL, "/tmp/proc", NULL, MS_REMOUNT, "pidns=/tmp/dummy/pidns")); + + ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK)); + ASSERT_SUCCESS(access("/tmp/proc/self/", 
X_OK)); +} + +TEST_F(ns, pidns_reconfigure_fsconfig_string_path) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */ + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_reconfigure_fsconfig_fd) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */ + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_HARNESS_MAIN From 19c5010e8ae23faf7b98fd738ff9970bb9066b78 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 11 Aug 2025 15:54:03 +0200 Subject: [PATCH 0709/3206] dt-bindings: leds: issi,is31fl319x: Drop 'db' suffix duplicating dtschema A common property unit suffix '-db' was added to dtschema, thus in-kernel bindings should not reference the type. Signed-off-by: Krzysztof Kozlowski Acked-by: "Rob Herring (Arm)" Link: https://lore.kernel.org/r/20250811-dt-bindings-db-v1-1-457301523bb5@linaro.org Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/leds/issi,is31fl319x.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/leds/issi,is31fl319x.yaml b/Documentation/devicetree/bindings/leds/issi,is31fl319x.yaml index 3c0431c51159e5..906735acfbaf94 100644 --- a/Documentation/devicetree/bindings/leds/issi,is31fl319x.yaml +++ b/Documentation/devicetree/bindings/leds/issi,is31fl319x.yaml @@ -42,7 +42,6 @@ properties: description: GPIO attached to the SDB pin. audio-gain-db: - $ref: /schemas/types.yaml#/definitions/uint32 default: 0 description: Audio gain selection for external analog modulation input. enum: [0, 3, 6, 9, 12, 15, 18, 21] From 358253fa8179ab4217ac283b56adde0174186f87 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Aug 2025 13:16:58 +0200 Subject: [PATCH 0710/3206] pinctrl: samsung: Drop unused S3C24xx driver data Drop unused declarations after S3C24xx SoC family removal in the commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support"). 
Fixes: 1ea35b355722 ("ARM: s3c: remove s3c24xx specific hacks") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250830111657.126190-3-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski --- drivers/pinctrl/samsung/pinctrl-samsung.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.h b/drivers/pinctrl/samsung/pinctrl-samsung.h index be2dee886d81a9..3e8ef91d94a36e 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.h +++ b/drivers/pinctrl/samsung/pinctrl-samsung.h @@ -403,10 +403,6 @@ extern const struct samsung_pinctrl_of_match_data exynosautov920_of_data; extern const struct samsung_pinctrl_of_match_data fsd_of_data; extern const struct samsung_pinctrl_of_match_data gs101_of_data; extern const struct samsung_pinctrl_of_match_data s3c64xx_of_data; -extern const struct samsung_pinctrl_of_match_data s3c2412_of_data; -extern const struct samsung_pinctrl_of_match_data s3c2416_of_data; -extern const struct samsung_pinctrl_of_match_data s3c2440_of_data; -extern const struct samsung_pinctrl_of_match_data s3c2450_of_data; extern const struct samsung_pinctrl_of_match_data s5pv210_of_data; #endif /* __PINCTRL_SAMSUNG_H */ From d37db94b078197ec4be1cadebbfd7bf144e3c5e4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Aug 2025 13:16:59 +0200 Subject: [PATCH 0711/3206] dt-bindings: pinctrl: samsung: Drop S3C2410 The Samsung S3C24xx family of SoCs was removed from the Linux kernel in the commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support"), in January 2023. There are no in-kernel users of the remaining S3C24xx compatibles. Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250830111657.126190-4-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski --- .../samsung,pinctrl-wakeup-interrupt.yaml | 19 +------------------ .../bindings/pinctrl/samsung,pinctrl.yaml | 4 ---- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-wakeup-interrupt.yaml b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-wakeup-interrupt.yaml index 0da6d69f599171..dd11c73a55da3f 100644 --- a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-wakeup-interrupt.yaml +++ b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-wakeup-interrupt.yaml @@ -30,8 +30,6 @@ properties: compatible: oneOf: - enum: - - samsung,s3c2410-wakeup-eint - - samsung,s3c2412-wakeup-eint - samsung,s3c64xx-wakeup-eint - samsung,s5pv210-wakeup-eint - samsung,exynos4210-wakeup-eint @@ -59,27 +57,12 @@ properties: description: Interrupt used by multiplexed external wake-up interrupts.
minItems: 1 - maxItems: 6 + maxItems: 4 required: - compatible allOf: - - if: - properties: - compatible: - contains: - enum: - - samsung,s3c2410-wakeup-eint - - samsung,s3c2412-wakeup-eint - then: - properties: - interrupts: - minItems: 6 - maxItems: 6 - required: - - interrupts - - if: properties: compatible: diff --git a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml index 9386dcd418c269..f1094d65e84603 100644 --- a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml @@ -37,10 +37,6 @@ properties: enum: - axis,artpec8-pinctrl - google,gs101-pinctrl - - samsung,s3c2412-pinctrl - - samsung,s3c2416-pinctrl - - samsung,s3c2440-pinctrl - - samsung,s3c2450-pinctrl - samsung,s3c64xx-pinctrl - samsung,s5pv210-pinctrl - samsung,exynos2200-pinctrl From 2720c87b76215595a94fb70515438c5b64c360eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 1 Jul 2025 11:22:36 +0200 Subject: [PATCH 0712/3206] backlight: mp3309c: Drop pwm_apply_args() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pwm_apply_args()'s sole purpose is to initialize all parameters specified in the device tree for consumers that rely on pwm_config() and pwm_enable(). The mp3309c backlight driver uses pwm_apply_might_sleep(), which gets passed the full configuration and so doesn't rely on the default being explicitly applied. Signed-off-by: Uwe Kleine-König Tested-by: Flavio Suligoi Reviewed-by: Daniel Thompson (RISCstar) Link: https://lore.kernel.org/r/2d1075f5dd45c7c135e4326e279468de699f9d17.1751361465.git.u.kleine-koenig@baylibre.com Signed-off-by: Lee Jones --- drivers/video/backlight/mp3309c.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/backlight/mp3309c.c b/drivers/video/backlight/mp3309c.c index 372058e2612962..bb4e85531ceab3 100644 --- a/drivers/video/backlight/mp3309c.c +++ b/drivers/video/backlight/mp3309c.c @@ -222,7 +222,6 @@ static int mp3309c_parse_fwnode(struct mp3309c_chip *chip, if (IS_ERR(chip->pwmd)) return dev_err_probe(dev, PTR_ERR(chip->pwmd), "error getting pwm data\n"); pdata->dimming_mode = DIMMING_PWM; - pwm_apply_args(chip->pwmd); } /* From fe85a39d8402e64a60ad3b23e190140a71d98dd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 1 Jul 2025 11:22:37 +0200 Subject: [PATCH 0713/3206] backlight: mp3309c: Initialize backlight properties without memset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Assigning values to a struct using a compound literal (since C99) also guarantees that all unspecified struct members are zero-initialized, so it properly replaces the memset to zero. The code looks a bit nicer and more idiomatic (though that might be subjective?). The resulting binary is a bit smaller. On ARCH=arm with an allnoconfig + minimal changes to enable the mp3309c driver the difference is 12 bytes.
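As a generic illustration of the C99 guarantee being relied on here (not taken from the driver):

    struct point { int x, y, z; };
    struct point p;

    /* members not named in the compound literal (y and z here) are
     * guaranteed to be zero-initialized, so no prior memset() of p
     * is needed */
    p = (struct point){ .x = 1 };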
Signed-off-by: Uwe Kleine-König Tested-by: Flavio Suligoi Reviewed-by: Daniel Thompson (RISCstar) Link: https://lore.kernel.org/r/14514a1b0d3df6438aa10bb74f1c4fc2367d9987.1751361465.git.u.kleine-koenig@baylibre.com Signed-off-by: Lee Jones --- drivers/video/backlight/mp3309c.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/video/backlight/mp3309c.c b/drivers/video/backlight/mp3309c.c index bb4e85531ceab3..9337110ce6e593 100644 --- a/drivers/video/backlight/mp3309c.c +++ b/drivers/video/backlight/mp3309c.c @@ -352,12 +352,13 @@ static int mp3309c_probe(struct i2c_client *client) chip->pdata = pdata; /* Backlight properties */ - memset(&props, 0, sizeof(struct backlight_properties)); - props.brightness = pdata->default_brightness; - props.max_brightness = pdata->max_brightness; - props.scale = BACKLIGHT_SCALE_LINEAR; - props.type = BACKLIGHT_RAW; - props.power = BACKLIGHT_POWER_ON; + props = (typeof(props)){ + .brightness = pdata->default_brightness, + .max_brightness = pdata->max_brightness, + .scale = BACKLIGHT_SCALE_LINEAR, + .type = BACKLIGHT_RAW, + .power = BACKLIGHT_POWER_ON, + }; chip->bl = devm_backlight_device_register(dev, "mp3309c", dev, chip, &mp3309c_bl_ops, &props); if (IS_ERR(chip->bl)) From e551fa3159e3050c26ff010c3b595b45d7eb071a Mon Sep 17 00:00:00 2001 From: Qunqin Zhao Date: Sat, 5 Jul 2025 15:20:42 +0800 Subject: [PATCH 0714/3206] mfd: Add support for Loongson Security Engine chip controller The Loongson Security Engine chip supports RNG, SM2, SM3 and SM4 accelerator engines. This is the base driver for other specific engine drivers. Co-developed-by: Yinggang Gu Signed-off-by: Yinggang Gu Signed-off-by: Qunqin Zhao Reviewed-by: Huacai Chen Link: https://lore.kernel.org/r/20250705072045.1067-2-zhaoqunqin@loongson.cn Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 11 ++ drivers/mfd/Makefile | 2 + drivers/mfd/loongson-se.c | 253 ++++++++++++++++++++++++++++++++ include/linux/mfd/loongson-se.h | 53 +++++++ 4 files changed, 319 insertions(+) create mode 100644 drivers/mfd/loongson-se.c create mode 100644 include/linux/mfd/loongson-se.h diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1e7..d30806ae4298fd 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -2428,6 +2428,17 @@ config MFD_INTEL_M10_BMC_PMCI additional drivers must be enabled in order to use the functionality of the device. +config MFD_LOONGSON_SE + tristate "Loongson Security Engine chip controller driver" + depends on LOONGARCH && ACPI + select MFD_CORE + help + The Loongson Security Engine chip supports RNG, SM2, SM3 and + SM4 accelerator engines. Each engine has its own DMA buffer + provided by the controller. The kernel cannot directly send + commands to the engine and must first send them to the controller, + which will forward them to the corresponding engine.
+ config MFD_QNAP_MCU tristate "QNAP microcontroller unit core driver" depends on SERIAL_DEV_BUS diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d16..73df7fc8b5ffbb 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -295,3 +295,5 @@ obj-$(CONFIG_MFD_RSMU_I2C) += rsmu_i2c.o rsmu_core.o obj-$(CONFIG_MFD_RSMU_SPI) += rsmu_spi.o rsmu_core.o obj-$(CONFIG_MFD_UPBOARD_FPGA) += upboard-fpga.o + +obj-$(CONFIG_MFD_LOONGSON_SE) += loongson-se.o diff --git a/drivers/mfd/loongson-se.c b/drivers/mfd/loongson-se.c new file mode 100644 index 00000000000000..3902ba377d6908 --- /dev/null +++ b/drivers/mfd/loongson-se.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2025 Loongson Technology Corporation Limited + * + * Author: Yinggang Gu + * Author: Qunqin Zhao + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct loongson_se { + void __iomem *base; + spinlock_t dev_lock; + struct completion cmd_completion; + + void *dmam_base; + int dmam_size; + + struct mutex engine_init_lock; + struct loongson_se_engine engines[SE_ENGINE_MAX]; +}; + +struct loongson_se_controller_cmd { + u32 command_id; + u32 info[7]; +}; + +static int loongson_se_poll(struct loongson_se *se, u32 int_bit) +{ + u32 status; + int err; + + spin_lock_irq(&se->dev_lock); + + /* Notify the controller that the engine needs to be started */ + writel(int_bit, se->base + SE_L2SINT_SET); + + /* Polling until the controller has forwarded the engine command */ + err = readl_relaxed_poll_timeout_atomic(se->base + SE_L2SINT_STAT, status, + !(status & int_bit), + 1, LOONGSON_ENGINE_CMD_TIMEOUT_US); + + spin_unlock_irq(&se->dev_lock); + + return err; +} + +static int loongson_se_send_controller_cmd(struct loongson_se *se, + struct loongson_se_controller_cmd *cmd) +{ + u32 *send_cmd = (u32 *)cmd; + int err, i; + + for (i = 0; i < SE_SEND_CMD_REG_LEN; i++) + writel(send_cmd[i], se->base + SE_SEND_CMD_REG + i * 4); + + err = loongson_se_poll(se, SE_INT_CONTROLLER); + if (err) + return err; + + return wait_for_completion_interruptible(&se->cmd_completion); +} + +int loongson_se_send_engine_cmd(struct loongson_se_engine *engine) +{ + /* + * After engine initialization, the controller already knows + * where to obtain engine commands from. Now all we need to + * do is notify the controller that the engine needs to be started. + */ + int err = loongson_se_poll(engine->se, BIT(engine->id)); + + if (err) + return err; + + return wait_for_completion_interruptible(&engine->completion); +} +EXPORT_SYMBOL_GPL(loongson_se_send_engine_cmd); + +struct loongson_se_engine *loongson_se_init_engine(struct device *dev, int id) +{ + struct loongson_se *se = dev_get_drvdata(dev); + struct loongson_se_engine *engine = &se->engines[id]; + struct loongson_se_controller_cmd cmd; + + engine->se = se; + engine->id = id; + init_completion(&engine->completion); + + /* Divide DMA memory equally among all engines */ + engine->buffer_size = se->dmam_size / SE_ENGINE_MAX; + engine->buffer_off = (se->dmam_size / SE_ENGINE_MAX) * id; + engine->data_buffer = se->dmam_base + engine->buffer_off; + + /* + * There has no engine0, use its data buffer as command buffer for other + * engines. The DMA memory size is obtained from the ACPI table, which + * ensures that the data buffer size of engine0 is larger than the + * command buffer size of all engines. 
+ */ + engine->command = se->dmam_base + id * (2 * SE_ENGINE_CMD_SIZE); + engine->command_ret = engine->command + SE_ENGINE_CMD_SIZE; + + mutex_lock(&se->engine_init_lock); + + /* Tell the controller where to find engine command */ + cmd.command_id = SE_CMD_SET_ENGINE_CMDBUF; + cmd.info[0] = id; + cmd.info[1] = engine->command - se->dmam_base; + cmd.info[2] = 2 * SE_ENGINE_CMD_SIZE; + + if (loongson_se_send_controller_cmd(se, &cmd)) + engine = NULL; + + mutex_unlock(&se->engine_init_lock); + + return engine; +} +EXPORT_SYMBOL_GPL(loongson_se_init_engine); + +static irqreturn_t se_irq_handler(int irq, void *dev_id) +{ + struct loongson_se *se = dev_id; + u32 int_status; + int id; + + spin_lock(&se->dev_lock); + + int_status = readl(se->base + SE_S2LINT_STAT); + + /* For controller */ + if (int_status & SE_INT_CONTROLLER) { + complete(&se->cmd_completion); + int_status &= ~SE_INT_CONTROLLER; + writel(SE_INT_CONTROLLER, se->base + SE_S2LINT_CL); + } + + /* For engines */ + while (int_status) { + id = __ffs(int_status); + complete(&se->engines[id].completion); + int_status &= ~BIT(id); + writel(BIT(id), se->base + SE_S2LINT_CL); + } + + spin_unlock(&se->dev_lock); + + return IRQ_HANDLED; +} + +static int loongson_se_init(struct loongson_se *se, dma_addr_t addr, int size) +{ + struct loongson_se_controller_cmd cmd; + int err; + + cmd.command_id = SE_CMD_START; + err = loongson_se_send_controller_cmd(se, &cmd); + if (err) + return err; + + cmd.command_id = SE_CMD_SET_DMA; + cmd.info[0] = lower_32_bits(addr); + cmd.info[1] = upper_32_bits(addr); + cmd.info[2] = size; + + return loongson_se_send_controller_cmd(se, &cmd); +} + +static const struct mfd_cell engines[] = { + { .name = "loongson-rng" }, + { .name = "tpm_loongson" }, +}; + +static int loongson_se_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct loongson_se *se; + int nr_irq, irq, err, i; + dma_addr_t paddr; + + se = devm_kmalloc(dev, sizeof(*se), GFP_KERNEL); + if (!se) + return -ENOMEM; + + dev_set_drvdata(dev, se); + init_completion(&se->cmd_completion); + spin_lock_init(&se->dev_lock); + mutex_init(&se->engine_init_lock); + + dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); + if (device_property_read_u32(dev, "dmam_size", &se->dmam_size)) + return -ENODEV; + + se->dmam_base = dmam_alloc_coherent(dev, se->dmam_size, &paddr, GFP_KERNEL); + if (!se->dmam_base) + return -ENOMEM; + + se->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(se->base)) + return PTR_ERR(se->base); + + writel(SE_INT_ALL, se->base + SE_S2LINT_EN); + + nr_irq = platform_irq_count(pdev); + if (nr_irq <= 0) + return -ENODEV; + + for (i = 0; i < nr_irq; i++) { + irq = platform_get_irq(pdev, i); + err = devm_request_irq(dev, irq, se_irq_handler, 0, "loongson-se", se); + if (err) + dev_err(dev, "failed to request IRQ: %d\n", irq); + } + + err = loongson_se_init(se, paddr, se->dmam_size); + if (err) + return err; + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, engines, + ARRAY_SIZE(engines), NULL, 0, NULL); +} + +static const struct acpi_device_id loongson_se_acpi_match[] = { + { "LOON0011", 0 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, loongson_se_acpi_match); + +static struct platform_driver loongson_se_driver = { + .probe = loongson_se_probe, + .driver = { + .name = "loongson-se", + .acpi_match_table = loongson_se_acpi_match, + }, +}; +module_platform_driver(loongson_se_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yinggang Gu "); +MODULE_AUTHOR("Qunqin Zhao "); +MODULE_DESCRIPTION("Loongson Security Engine 
chip controller driver"); diff --git a/include/linux/mfd/loongson-se.h b/include/linux/mfd/loongson-se.h new file mode 100644 index 00000000000000..07afa0c2524d5e --- /dev/null +++ b/include/linux/mfd/loongson-se.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (C) 2025 Loongson Technology Corporation Limited */ + +#ifndef __MFD_LOONGSON_SE_H__ +#define __MFD_LOONGSON_SE_H__ + +#define LOONGSON_ENGINE_CMD_TIMEOUT_US 10000 +#define SE_SEND_CMD_REG 0x0 +#define SE_SEND_CMD_REG_LEN 0x8 +/* Controller command ID */ +#define SE_CMD_START 0x0 +#define SE_CMD_SET_DMA 0x3 +#define SE_CMD_SET_ENGINE_CMDBUF 0x4 + +#define SE_S2LINT_STAT 0x88 +#define SE_S2LINT_EN 0x8c +#define SE_S2LINT_CL 0x94 +#define SE_L2SINT_STAT 0x98 +#define SE_L2SINT_SET 0xa0 + +#define SE_INT_ALL 0xffffffff +#define SE_INT_CONTROLLER BIT(0) + +#define SE_ENGINE_MAX 16 +#define SE_ENGINE_RNG 1 +#define SE_CMD_RNG 0x100 + +#define SE_ENGINE_TPM 5 +#define SE_CMD_TPM 0x500 + +#define SE_ENGINE_CMD_SIZE 32 + +struct loongson_se_engine { + struct loongson_se *se; + int id; + + /* Command buffer */ + void *command; + void *command_ret; + + void *data_buffer; + uint buffer_size; + /* Data buffer offset to DMA base */ + uint buffer_off; + + struct completion completion; + +}; + +struct loongson_se_engine *loongson_se_init_engine(struct device *dev, int id); +int loongson_se_send_engine_cmd(struct loongson_se_engine *engine); + +#endif From 766b2d724c8df071031412eea902b566a0049c31 Mon Sep 17 00:00:00 2001 From: Qunqin Zhao Date: Sat, 5 Jul 2025 15:20:43 +0800 Subject: [PATCH 0715/3206] crypto: loongson - add Loongson RNG driver support Loongson's Random Number Generator is found inside Loongson Security Engine chip. Co-developed-by: Yinggang Gu Signed-off-by: Yinggang Gu Signed-off-by: Qunqin Zhao Reviewed-by: Huacai Chen Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20250705072045.1067-3-zhaoqunqin@loongson.cn Signed-off-by: Lee Jones --- drivers/crypto/Kconfig | 1 + drivers/crypto/Makefile | 1 + drivers/crypto/loongson/Kconfig | 5 + drivers/crypto/loongson/Makefile | 1 + drivers/crypto/loongson/loongson-rng.c | 209 +++++++++++++++++++++++++ 5 files changed, 217 insertions(+) create mode 100644 drivers/crypto/loongson/Kconfig create mode 100644 drivers/crypto/loongson/Makefile create mode 100644 drivers/crypto/loongson/loongson-rng.c diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 04b4c43b6bae77..c7a1060ba57a6b 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -840,6 +840,7 @@ config CRYPTO_DEV_CCREE If unsure say Y. source "drivers/crypto/hisilicon/Kconfig" +source "drivers/crypto/loongson/Kconfig" source "drivers/crypto/amlogic/Kconfig" diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile index 22eadcc8f4a25a..125b99b24af141 100644 --- a/drivers/crypto/Makefile +++ b/drivers/crypto/Makefile @@ -44,6 +44,7 @@ obj-y += inside-secure/ obj-$(CONFIG_CRYPTO_DEV_ARTPEC6) += axis/ obj-y += xilinx/ obj-y += hisilicon/ +obj-y += loongson/ obj-$(CONFIG_CRYPTO_DEV_AMLOGIC_GXL) += amlogic/ obj-y += intel/ obj-y += starfive/ diff --git a/drivers/crypto/loongson/Kconfig b/drivers/crypto/loongson/Kconfig new file mode 100644 index 00000000000000..15475da8fc11df --- /dev/null +++ b/drivers/crypto/loongson/Kconfig @@ -0,0 +1,5 @@ +config CRYPTO_DEV_LOONGSON_RNG + tristate "Support for Loongson RNG Driver" + depends on MFD_LOONGSON_SE + help + Support for Loongson RNG Driver. 
diff --git a/drivers/crypto/loongson/Makefile b/drivers/crypto/loongson/Makefile new file mode 100644 index 00000000000000..1ce5ec32b5534c --- /dev/null +++ b/drivers/crypto/loongson/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_CRYPTO_DEV_LOONGSON_RNG) += loongson-rng.o diff --git a/drivers/crypto/loongson/loongson-rng.c b/drivers/crypto/loongson/loongson-rng.c new file mode 100644 index 00000000000000..3a4940260f9e59 --- /dev/null +++ b/drivers/crypto/loongson/loongson-rng.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 HiSilicon Limited. */ +/* Copyright (c) 2025 Loongson Technology Corporation Limited. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SE_SEED_SIZE 32 + +struct loongson_rng_list { + struct mutex lock; + struct list_head list; + int registered; +}; + +struct loongson_rng { + u32 used; + struct loongson_se_engine *engine; + struct list_head list; + struct mutex lock; +}; + +struct loongson_rng_ctx { + struct loongson_rng *rng; +}; + +struct loongson_rng_cmd { + u32 cmd_id; + union { + u32 len; + u32 ret; + } u; + u32 seed_off; + u32 out_off; + u32 pad[4]; +}; + +static struct loongson_rng_list rng_devices = { + .lock = __MUTEX_INITIALIZER(rng_devices.lock), + .list = LIST_HEAD_INIT(rng_devices.list), +}; + +static int loongson_rng_generate(struct crypto_rng *tfm, const u8 *src, + unsigned int slen, u8 *dstn, unsigned int dlen) +{ + struct loongson_rng_ctx *ctx = crypto_rng_ctx(tfm); + struct loongson_rng *rng = ctx->rng; + struct loongson_rng_cmd *cmd = rng->engine->command; + int err, len; + + mutex_lock(&rng->lock); + cmd->seed_off = 0; + do { + len = min(dlen, rng->engine->buffer_size); + cmd = rng->engine->command; + cmd->u.len = len; + err = loongson_se_send_engine_cmd(rng->engine); + if (err) + break; + + cmd = rng->engine->command_ret; + if (cmd->u.ret) { + err = -EIO; + break; + } + + memcpy(dstn, rng->engine->data_buffer, len); + dlen -= len; + dstn += len; + } while (dlen > 0); + mutex_unlock(&rng->lock); + + return err; +} + +static int loongson_rng_init(struct crypto_tfm *tfm) +{ + struct loongson_rng_ctx *ctx = crypto_tfm_ctx(tfm); + struct loongson_rng *rng; + u32 min_used = U32_MAX; + + mutex_lock(&rng_devices.lock); + list_for_each_entry(rng, &rng_devices.list, list) { + if (rng->used < min_used) { + ctx->rng = rng; + min_used = rng->used; + } + } + ctx->rng->used++; + mutex_unlock(&rng_devices.lock); + + return 0; +} + +static void loongson_rng_exit(struct crypto_tfm *tfm) +{ + struct loongson_rng_ctx *ctx = crypto_tfm_ctx(tfm); + + mutex_lock(&rng_devices.lock); + ctx->rng->used--; + mutex_unlock(&rng_devices.lock); +} + +static int loongson_rng_seed(struct crypto_rng *tfm, const u8 *seed, + unsigned int slen) +{ + struct loongson_rng_ctx *ctx = crypto_rng_ctx(tfm); + struct loongson_rng *rng = ctx->rng; + struct loongson_rng_cmd *cmd; + int err; + + if (slen < SE_SEED_SIZE) + return -EINVAL; + + slen = min(slen, rng->engine->buffer_size); + + mutex_lock(&rng->lock); + cmd = rng->engine->command; + cmd->u.len = slen; + cmd->seed_off = rng->engine->buffer_off; + memcpy(rng->engine->data_buffer, seed, slen); + err = loongson_se_send_engine_cmd(rng->engine); + if (err) + goto out; + + cmd = rng->engine->command_ret; + if (cmd->u.ret) + err = -EIO; +out: + mutex_unlock(&rng->lock); + + return err; +} + +static struct rng_alg loongson_rng_alg = { + .generate = loongson_rng_generate, + .seed = loongson_rng_seed, + .seedsize = SE_SEED_SIZE, + 
.base = { + .cra_name = "stdrng", + .cra_driver_name = "loongson_stdrng", + .cra_priority = 300, + .cra_ctxsize = sizeof(struct loongson_rng_ctx), + .cra_module = THIS_MODULE, + .cra_init = loongson_rng_init, + .cra_exit = loongson_rng_exit, + }, +}; + +static int loongson_rng_probe(struct platform_device *pdev) +{ + struct loongson_rng_cmd *cmd; + struct loongson_rng *rng; + int ret = 0; + + rng = devm_kzalloc(&pdev->dev, sizeof(*rng), GFP_KERNEL); + if (!rng) + return -ENOMEM; + + rng->engine = loongson_se_init_engine(pdev->dev.parent, SE_ENGINE_RNG); + if (!rng->engine) + return -ENODEV; + cmd = rng->engine->command; + cmd->cmd_id = SE_CMD_RNG; + cmd->out_off = rng->engine->buffer_off; + mutex_init(&rng->lock); + + mutex_lock(&rng_devices.lock); + + if (!rng_devices.registered) { + ret = crypto_register_rng(&loongson_rng_alg); + if (ret) { + dev_err(&pdev->dev, "failed to register crypto(%d)\n", ret); + goto out; + } + rng_devices.registered = 1; + } + + list_add_tail(&rng->list, &rng_devices.list); +out: + mutex_unlock(&rng_devices.lock); + + return ret; +} + +static struct platform_driver loongson_rng_driver = { + .probe = loongson_rng_probe, + .driver = { + .name = "loongson-rng", + }, +}; +module_platform_driver(loongson_rng_driver); + +MODULE_ALIAS("platform:loongson-rng"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yinggang Gu "); +MODULE_AUTHOR("Qunqin Zhao "); +MODULE_DESCRIPTION("Loongson Random Number Generator driver"); From 5c83b07df9c5a6e063328c5b885de79f8e8f0263 Mon Sep 17 00:00:00 2001 From: Qunqin Zhao Date: Sat, 5 Jul 2025 15:20:44 +0800 Subject: [PATCH 0716/3206] tpm: Add a driver for Loongson TPM device Loongson Security Engine supports random number generation, hash, symmetric encryption and asymmetric encryption. Based on these encryption functions, TPM2 have been implemented in the Loongson Security Engine firmware. This driver is responsible for copying data into the memory visible to the firmware and receiving data from the firmware. Co-developed-by: Yinggang Gu Signed-off-by: Yinggang Gu Signed-off-by: Qunqin Zhao Reviewed-by: Huacai Chen Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20250705072045.1067-4-zhaoqunqin@loongson.cn Signed-off-by: Lee Jones --- drivers/char/tpm/Kconfig | 9 ++++ drivers/char/tpm/Makefile | 1 + drivers/char/tpm/tpm_loongson.c | 84 +++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 drivers/char/tpm/tpm_loongson.c diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index dddd702b2454a6..ba3924eb13ba89 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -189,6 +189,15 @@ config TCG_IBMVTPM will be accessible from within Linux. To compile this driver as a module, choose M here; the module will be called tpm_ibmvtpm. +config TCG_LOONGSON + tristate "Loongson TPM Interface" + depends on MFD_LOONGSON_SE + help + If you want to make Loongson TPM support available, say Yes and + it will be accessible from within Linux. To compile this + driver as a module, choose M here; the module will be called + tpm_loongson. 
+ config TCG_XEN tristate "XEN TPM Interface" depends on TCG_TPM && XEN diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile index 9de1b3ea34a9f2..5b5cdc0d32e4e5 100644 --- a/drivers/char/tpm/Makefile +++ b/drivers/char/tpm/Makefile @@ -46,3 +46,4 @@ obj-$(CONFIG_TCG_ARM_CRB_FFA) += tpm_crb_ffa.o obj-$(CONFIG_TCG_VTPM_PROXY) += tpm_vtpm_proxy.o obj-$(CONFIG_TCG_FTPM_TEE) += tpm_ftpm_tee.o obj-$(CONFIG_TCG_SVSM) += tpm_svsm.o +obj-$(CONFIG_TCG_LOONGSON) += tpm_loongson.o diff --git a/drivers/char/tpm/tpm_loongson.c b/drivers/char/tpm/tpm_loongson.c new file mode 100644 index 00000000000000..a4ec23639911ce --- /dev/null +++ b/drivers/char/tpm/tpm_loongson.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Loongson Technology Corporation Limited. */ + +#include +#include +#include +#include + +#include "tpm.h" + +struct tpm_loongson_cmd { + u32 cmd_id; + u32 data_off; + u32 data_len; + u32 pad[5]; +}; + +static int tpm_loongson_recv(struct tpm_chip *chip, u8 *buf, size_t count) +{ + struct loongson_se_engine *tpm_engine = dev_get_drvdata(&chip->dev); + struct tpm_loongson_cmd *cmd_ret = tpm_engine->command_ret; + + if (cmd_ret->data_len > count) + return -EIO; + + memcpy(buf, tpm_engine->data_buffer, cmd_ret->data_len); + + return cmd_ret->data_len; +} + +static int tpm_loongson_send(struct tpm_chip *chip, u8 *buf, size_t count) +{ + struct loongson_se_engine *tpm_engine = dev_get_drvdata(&chip->dev); + struct tpm_loongson_cmd *cmd = tpm_engine->command; + + if (count > tpm_engine->buffer_size) + return -E2BIG; + + cmd->data_len = count; + memcpy(tpm_engine->data_buffer, buf, count); + + return loongson_se_send_engine_cmd(tpm_engine); +} + +static const struct tpm_class_ops tpm_loongson_ops = { + .flags = TPM_OPS_AUTO_STARTUP, + .recv = tpm_loongson_recv, + .send = tpm_loongson_send, +}; + +static int tpm_loongson_probe(struct platform_device *pdev) +{ + struct loongson_se_engine *tpm_engine; + struct device *dev = &pdev->dev; + struct tpm_loongson_cmd *cmd; + struct tpm_chip *chip; + + tpm_engine = loongson_se_init_engine(dev->parent, SE_ENGINE_TPM); + if (!tpm_engine) + return -ENODEV; + cmd = tpm_engine->command; + cmd->cmd_id = SE_CMD_TPM; + cmd->data_off = tpm_engine->buffer_off; + + chip = tpmm_chip_alloc(dev, &tpm_loongson_ops); + if (IS_ERR(chip)) + return PTR_ERR(chip); + chip->flags = TPM_CHIP_FLAG_TPM2 | TPM_CHIP_FLAG_IRQ; + dev_set_drvdata(&chip->dev, tpm_engine); + + return tpm_chip_register(chip); +} + +static struct platform_driver tpm_loongson = { + .probe = tpm_loongson_probe, + .driver = { + .name = "tpm_loongson", + }, +}; +module_platform_driver(tpm_loongson); + +MODULE_ALIAS("platform:tpm_loongson"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Loongson TPM driver"); From 74fddd5fbab879a7d039d9fb49af923927a64811 Mon Sep 17 00:00:00 2001 From: Qunqin Zhao Date: Sat, 5 Jul 2025 15:20:45 +0800 Subject: [PATCH 0717/3206] MAINTAINERS: Add entry for Loongson Security Engine drivers This patch adds an entry for Loongson Security Engine drivers in the list of maintainers. 
Signed-off-by: Qunqin Zhao Reviewed-by: Huacai Chen Link: https://lore.kernel.org/r/20250705072045.1067-5-zhaoqunqin@loongson.cn Signed-off-by: Lee Jones --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..ba3b03fd79d571 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14353,6 +14353,15 @@ S: Maintained F: Documentation/devicetree/bindings/pwm/loongson,ls7a-pwm.yaml F: drivers/pwm/pwm-loongson.c +LOONGSON SECURITY ENGINE DRIVERS +M: Qunqin Zhao +L: linux-crypto@vger.kernel.org +S: Maintained +F: drivers/char/tpm_loongson.c +F: drivers/crypto/loongson/ +F: drivers/mfd/loongson-se.c +F: include/linux/mfd/loongson-se.h + LOONGSON-2 SOC SERIES CLOCK DRIVER M: Yinbo Zhu L: linux-clk@vger.kernel.org From d6058316d16ee0d1861c0550051b2492efb54b79 Mon Sep 17 00:00:00 2001 From: Andrei Lalaev Date: Wed, 20 Aug 2025 10:47:12 +0200 Subject: [PATCH 0718/3206] leds: leds-lp55xx: Use correct address for memory programming Memory programming doesn't work for devices without page support. For example, LP5562 has 3 engines but doesn't support pages, the start address is changed depending on engine number. According to datasheet [1], the PROG MEM register addresses for each engine are as follows: Engine 1: 0x10 Engine 2: 0x30 Engine 3: 0x50 However, the current implementation incorrectly calculates the address of PROG MEM register using the engine index starting from 1: prog_mem_base = 0x10 LP55xx_BYTES_PER_PAGE = 0x20 Engine 1: 0x10 + 0x20 * 1 = 0x30 Engine 2: 0x10 + 0x20 * 2 = 0x50 Engine 3: 0x10 + 0x20 * 3 = 0x70 This results in writing to the wrong engine memory, causing pattern programming to fail. To correct it, the engine index should be decreased: Engine 1: 0x10 + 0x20 * 0 = 0x10 Engine 2: 0x10 + 0x20 * 1 = 0x30 Engine 3: 0x10 + 0x20 * 2 = 0x50 1 - https://www.ti.com/lit/ds/symlink/lp5562.pdf Fixes: 31379a57cf2f ("leds: leds-lp55xx: Generalize update_program_memory function") Signed-off-by: Andrei Lalaev Link: https://lore.kernel.org/r/20250820-lp5562-prog-mem-address-v1-1-8569647fa71d@anton-paar.com Signed-off-by: Lee Jones --- drivers/leds/leds-lp55xx-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/leds-lp55xx-common.c b/drivers/leds/leds-lp55xx-common.c index e71456a56ab8da..fd447eb7eb15e2 100644 --- a/drivers/leds/leds-lp55xx-common.c +++ b/drivers/leds/leds-lp55xx-common.c @@ -212,7 +212,7 @@ int lp55xx_update_program_memory(struct lp55xx_chip *chip, * For LED chip that support page, PAGE is already set in load_engine. */ if (!cfg->pages_per_engine) - start_addr += LP55xx_BYTES_PER_PAGE * idx; + start_addr += LP55xx_BYTES_PER_PAGE * (idx - 1); for (page = 0; page < program_length / LP55xx_BYTES_PER_PAGE; page++) { /* Write to the next page each 32 bytes (if supported) */ From 727e914bbfbbda9e6efa5cb1abe4e96a949d576f Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 28 Aug 2025 07:09:42 +0800 Subject: [PATCH 0719/3206] PCI/MSI: Check MSI_FLAG_PCI_MSI_MASK_PARENT in cond_[startup|shutdown]_parent() For MSI controllers which only support MSI_FLAG_PCI_MSI_MASK_PARENT, the newly added callback irq_startup() and irq_shutdown() for pci_msi[x]_template will not unmask or mask the interrupt when startup() resp. shutdown() is invoked. This prevents the interrupt from being enabled resp. disabled. Invoke irq_[un]mask_parent() in cond_[startup|shutdown]_parent(), when the interrupt has the MSI_FLAG_PCI_MSI_MASK_PARENT flag set. 
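In sketch form, the shutdown side ends up as follows (the msi_domain_info lookup is assumed to go through the domain's host_data as elsewhere in this file; flag and helper names are the ones used in the diff below):

    static void cond_shutdown_parent(struct irq_data *data)
    {
            struct msi_domain_info *info = data->domain->host_data; /* assumed lookup */

            if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT))
                    irq_chip_shutdown_parent(data);
            else if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT))
                    /* new fallback: mask the parent so shutdown really disables */
                    irq_chip_mask_parent(data);
    }

The startup side mirrors this, falling back to irq_chip_unmask_parent() when only MSI_FLAG_PCI_MSI_MASK_PARENT is set.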
Fixes: 54f45a30c0d0 ("PCI/MSI: Add startup/shutdown for per device domains") Reported-by: Linux Kernel Functional Testing Reported-by: Nathan Chancellor Reported-by: Wei Fang Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Tested-by: Nathan Chancellor Tested-by: Linux Kernel Functional Testing Tested-by: Jon Hunter Tested-by: Wei Fang Tested-by: Chen Wang # Pioneerbox/SG2042 Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/all/20250827230943.17829-1-inochiama@gmail.com Closes: https://lore.kernel.org/regressions/aK4O7Hl8NCVEMznB@monster/ Closes: https://lore.kernel.org/regressions/20250826220959.GA4119563@ax162/ Closes: https://lore.kernel.org/all/20250827093911.1218640-1-wei.fang@nxp.com/ --- drivers/pci/msi/irqdomain.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index e0a800f918e812..b11b7f63f0d6fc 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -154,6 +154,8 @@ static void cond_shutdown_parent(struct irq_data *data) if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) irq_chip_shutdown_parent(data); + else if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) + irq_chip_mask_parent(data); } static unsigned int cond_startup_parent(struct irq_data *data) @@ -162,6 +164,9 @@ static unsigned int cond_startup_parent(struct irq_data *data) if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) return irq_chip_startup_parent(data); + else if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) + irq_chip_unmask_parent(data); + return 0; } From 70f32a10ad423fd19e22e71d05d0968e61316278 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Fri, 29 Aug 2025 07:11:06 +0100 Subject: [PATCH 0720/3206] bpftool: Refactor kernel config reading into common helper Extract the kernel configuration file parsing logic from feature.c into a new read_kernel_config() function in common.c. This includes: 1. Moving the config file handling and option parsing code 2. Adding required headers and struct definition 3. Keeping all existing functionality The refactoring enables sharing this logic with other components while maintaining current behavior. This will be used by subsequent patches that need to check kernel config options. Signed-off-by: Yuan Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Acked-by: Yonghong Song Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250829061107.23905-2-chenyuan_fl@163.com --- tools/bpf/bpftool/common.c | 93 +++++++++++++++++++++++++++++++++++++ tools/bpf/bpftool/feature.c | 86 ++-------------------------------- tools/bpf/bpftool/main.h | 9 ++++ 3 files changed, 106 insertions(+), 82 deletions(-) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index b07317d2842fe8..e8daf963ecef4d 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,7 @@ #include #include /* libbpf_num_possible_cpus */ #include +#include #include "main.h" @@ -1208,3 +1210,94 @@ int pathname_concat(char *buf, int buf_sz, const char *path, return 0; } + +static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n, + char **value) +{ + char *sep; + + while (gzgets(file, buf, n)) { + if (strncmp(buf, "CONFIG_", 7)) + continue; + + sep = strchr(buf, '='); + if (!sep) + continue; + + /* Trim ending '\n' */ + buf[strlen(buf) - 1] = '\0'; + + /* Split on '=' and ensure that a value is present. 
*/ + *sep = '\0'; + if (!sep[1]) + continue; + + *value = sep + 1; + return true; + } + + return false; +} + +int read_kernel_config(const struct kernel_config_option *requested_options, + size_t num_options, char **out_values, + const char *define_prefix) +{ + struct utsname utsn; + char path[PATH_MAX]; + gzFile file = NULL; + char buf[4096]; + char *value; + size_t i; + int ret = 0; + + if (!requested_options || !out_values || num_options == 0) + return -1; + + if (!uname(&utsn)) { + snprintf(path, sizeof(path), "/boot/config-%s", utsn.release); + + /* gzopen also accepts uncompressed files. */ + file = gzopen(path, "r"); + } + + if (!file) { + /* Some distributions build with CONFIG_IKCONFIG=y and put the + * config file at /proc/config.gz. + */ + file = gzopen("/proc/config.gz", "r"); + } + + if (!file) { + p_info("skipping kernel config, can't open file: %s", + strerror(errno)); + return -1; + } + + if (!gzgets(file, buf, sizeof(buf)) || !gzgets(file, buf, sizeof(buf))) { + p_info("skipping kernel config, can't read from file: %s", + strerror(errno)); + ret = -1; + goto end_parse; + } + + if (strcmp(buf, "# Automatically generated file; DO NOT EDIT.\n")) { + p_info("skipping kernel config, can't find correct file"); + ret = -1; + goto end_parse; + } + + while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) { + for (i = 0; i < num_options; i++) { + if ((define_prefix && !requested_options[i].macro_dump) || + out_values[i] || strcmp(buf, requested_options[i].name)) + continue; + + out_values[i] = strdup(value); + } + } + +end_parse: + gzclose(file); + return ret; +} diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 24fecdf8e43078..0f6070a0c8e717 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -10,7 +10,6 @@ #ifdef USE_LIBCAP #include #endif -#include #include #include @@ -18,7 +17,6 @@ #include #include -#include #include "main.h" @@ -327,40 +325,9 @@ static void probe_jit_limit(void) } } -static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n, - char **value) -{ - char *sep; - - while (gzgets(file, buf, n)) { - if (strncmp(buf, "CONFIG_", 7)) - continue; - - sep = strchr(buf, '='); - if (!sep) - continue; - - /* Trim ending '\n' */ - buf[strlen(buf) - 1] = '\0'; - - /* Split on '=' and ensure that a value is present. */ - *sep = '\0'; - if (!sep[1]) - continue; - - *value = sep + 1; - return true; - } - - return false; -} - static void probe_kernel_image_config(const char *define_prefix) { - static const struct { - const char * const name; - bool macro_dump; - } options[] = { + struct kernel_config_option options[] = { /* Enable BPF */ { "CONFIG_BPF", }, /* Enable bpf() syscall */ @@ -435,52 +402,11 @@ static void probe_kernel_image_config(const char *define_prefix) { "CONFIG_HZ", true, } }; char *values[ARRAY_SIZE(options)] = { }; - struct utsname utsn; - char path[PATH_MAX]; - gzFile file = NULL; - char buf[4096]; - char *value; size_t i; - if (!uname(&utsn)) { - snprintf(path, sizeof(path), "/boot/config-%s", utsn.release); - - /* gzopen also accepts uncompressed files. */ - file = gzopen(path, "r"); - } - - if (!file) { - /* Some distributions build with CONFIG_IKCONFIG=y and put the - * config file at /proc/config.gz. 
- */ - file = gzopen("/proc/config.gz", "r"); - } - if (!file) { - p_info("skipping kernel config, can't open file: %s", - strerror(errno)); - goto end_parse; - } - /* Sanity checks */ - if (!gzgets(file, buf, sizeof(buf)) || - !gzgets(file, buf, sizeof(buf))) { - p_info("skipping kernel config, can't read from file: %s", - strerror(errno)); - goto end_parse; - } - if (strcmp(buf, "# Automatically generated file; DO NOT EDIT.\n")) { - p_info("skipping kernel config, can't find correct file"); - goto end_parse; - } - - while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) { - for (i = 0; i < ARRAY_SIZE(options); i++) { - if ((define_prefix && !options[i].macro_dump) || - values[i] || strcmp(buf, options[i].name)) - continue; - - values[i] = strdup(value); - } - } + if (read_kernel_config(options, ARRAY_SIZE(options), values, + define_prefix)) + return; for (i = 0; i < ARRAY_SIZE(options); i++) { if (define_prefix && !options[i].macro_dump) @@ -488,10 +414,6 @@ static void probe_kernel_image_config(const char *define_prefix) print_kernel_option(options[i].name, values[i], define_prefix); free(values[i]); } - -end_parse: - if (file) - gzclose(file); } static bool probe_bpf_syscall(const char *define_prefix) diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index a2bb0714b3d6cf..374cac2a8c66ca 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -275,4 +275,13 @@ int pathname_concat(char *buf, int buf_sz, const char *path, /* print netfilter bpf_link info */ void netfilter_dump_plain(const struct bpf_link_info *info); void netfilter_dump_json(const struct bpf_link_info *info, json_writer_t *wtr); + +struct kernel_config_option { + const char *name; + bool macro_dump; +}; + +int read_kernel_config(const struct kernel_config_option *requested_options, + size_t num_options, char **out_values, + const char *define_prefix); #endif From 6417ca85305ecaffef13cf9063ac35da8fba8500 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Fri, 29 Aug 2025 07:11:07 +0100 Subject: [PATCH 0721/3206] bpftool: Add CET-aware symbol matching for x86_64 architectures Adjust symbol matching logic to account for Control-flow Enforcement Technology (CET) on x86_64 systems. CET prefixes functions with a 4-byte 'endbr' instruction, shifting the actual hook entry point to symbol + 4. 
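On a concrete (made-up) address the adjustment looks like this:

    /* foo() lives at 0xffffffffc0001000 and starts with a 4-byte endbr64,
     * so the kprobe actually attaches at 0xffffffffc0001004.
     *
     *   sym_addr == target_addr        0x...1000 == 0x...1004  -> false
     *   sym_addr == target_addr - 4    0x...1000 == 0x...1000  -> true
     *
     * The second comparison is only attempted when CONFIG_X86_KERNEL_IBT
     * is set in the running kernel's configuration. */

so the IBT-aware check accepts the symbol that the plain comparison used to skip.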
Signed-off-by: Yuan Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Acked-by: Yonghong Song Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250829061107.23905-3-chenyuan_fl@163.com --- tools/bpf/bpftool/link.c | 54 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index a773e05d5ade44..bdcd717b0348f8 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -282,11 +282,52 @@ get_addr_cookie_array(__u64 *addrs, __u64 *cookies, __u32 count) return data; } +static bool is_x86_ibt_enabled(void) +{ +#if defined(__x86_64__) + struct kernel_config_option options[] = { + { "CONFIG_X86_KERNEL_IBT", }, + }; + char *values[ARRAY_SIZE(options)] = { }; + bool ret; + + if (read_kernel_config(options, ARRAY_SIZE(options), values, NULL)) + return false; + + ret = !!values[0]; + free(values[0]); + return ret; +#else + return false; +#endif +} + +static bool +symbol_matches_target(__u64 sym_addr, __u64 target_addr, bool is_ibt_enabled) +{ + if (sym_addr == target_addr) + return true; + + /* + * On x86_64 architectures with CET (Control-flow Enforcement Technology), + * function entry points have a 4-byte 'endbr' instruction prefix. + * This causes kprobe hooks to target the address *after* 'endbr' + * (symbol address + 4), preserving the CET instruction. + * Here we check if the symbol address matches the hook target address + * minus 4, indicating a CET-enabled function entry point. + */ + if (is_ibt_enabled && sym_addr == target_addr - 4) + return true; + + return false; +} + static void show_kprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr) { struct addr_cookie *data; __u32 i, j = 0; + bool is_ibt_enabled; jsonw_bool_field(json_wtr, "retprobe", info->kprobe_multi.flags & BPF_F_KPROBE_MULTI_RETURN); @@ -306,11 +347,13 @@ show_kprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr) if (!dd.sym_count) goto error; + is_ibt_enabled = is_x86_ibt_enabled(); for (i = 0; i < dd.sym_count; i++) { - if (dd.sym_mapping[i].address != data[j].addr) + if (!symbol_matches_target(dd.sym_mapping[i].address, + data[j].addr, is_ibt_enabled)) continue; jsonw_start_object(json_wtr); - jsonw_uint_field(json_wtr, "addr", dd.sym_mapping[i].address); + jsonw_uint_field(json_wtr, "addr", (unsigned long)data[j].addr); jsonw_string_field(json_wtr, "func", dd.sym_mapping[i].name); /* Print null if it is vmlinux */ if (dd.sym_mapping[i].module[0] == '\0') { @@ -719,6 +762,7 @@ static void show_kprobe_multi_plain(struct bpf_link_info *info) { struct addr_cookie *data; __u32 i, j = 0; + bool is_ibt_enabled; if (!info->kprobe_multi.count) return; @@ -742,12 +786,14 @@ static void show_kprobe_multi_plain(struct bpf_link_info *info) if (!dd.sym_count) goto error; + is_ibt_enabled = is_x86_ibt_enabled(); printf("\n\t%-16s %-16s %s", "addr", "cookie", "func [module]"); for (i = 0; i < dd.sym_count; i++) { - if (dd.sym_mapping[i].address != data[j].addr) + if (!symbol_matches_target(dd.sym_mapping[i].address, + data[j].addr, is_ibt_enabled)) continue; printf("\n\t%016lx %-16llx %s", - dd.sym_mapping[i].address, data[j].cookie, dd.sym_mapping[i].name); + (unsigned long)data[j].addr, data[j].cookie, dd.sym_mapping[i].name); if (dd.sym_mapping[i].module[0] != '\0') printf(" [%s] ", dd.sym_mapping[i].module); else From e4980fa6463624cb608c0f49882e11e8d6ccd590 Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Wed, 27 Aug 2025 11:28:12 +0800 Subject: [PATCH 0722/3206] bpf: Replace kvfree 
with kfree for kzalloc memory These pointers are allocated by kzalloc. Therefore, replace kvfree() with kfree() to avoid unnecessary is_vmalloc_addr() check in kvfree(). This is the remaining unmodified part from [1]. Signed-off-by: Feng Yang Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250811123949.552885-1-rongqianfeng@vivo.com [1] Link: https://lore.kernel.org/bpf/20250827032812.498216-1-yangfeng59949@163.com --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c9dd16b2c56be..b9394f8fac0ed5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2021,7 +2021,7 @@ static void free_backedges(struct bpf_scc_visit *visit) for (backedge = visit->backedges; backedge; backedge = next) { free_verifier_state(&backedge->state, false); next = backedge->next; - kvfree(backedge); + kfree(backedge); } visit->backedges = NULL; } @@ -19651,7 +19651,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) err = maybe_enter_scc(env, new); if (err) { free_verifier_state(new, false); - kvfree(new_sl); + kfree(new_sl); return err; } From 81ac63321eb936b1a1f7045b37674661f8ffb4a5 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 5 Aug 2025 10:36:29 +0800 Subject: [PATCH 0723/3206] trace: Remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250805023630.335719-1-rongqianfeng@vivo.com Signed-off-by: Qianfeng Rong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index af42aaa3d172cc..2ab283fd3032e3 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm, { struct user_event_enabler_fault *fault; - fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN); + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT); if (!fault) return false; From 3d62ab32df065e4a7797204a918f6489ddb8a237 Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Tue, 19 Aug 2025 10:51:52 +0000 Subject: [PATCH 0724/3206] tracing: Fix tracing_marker may trigger page fault during preempt_disable Both tracing_mark_write and tracing_mark_raw_write call __copy_from_user_inatomic during preempt_disable. But in some case, __copy_from_user_inatomic may trigger page fault, and will call schedule() subtly. And if a task is migrated to other cpu, the following warning will be trigger: if (RB_WARN_ON(cpu_buffer, !local_read(&cpu_buffer->committing))) An example can illustrate this issue: process flow CPU --------------------------------------------------------------------- tracing_mark_raw_write(): cpu:0 ... ring_buffer_lock_reserve(): cpu:0 ... cpu = raw_smp_processor_id() cpu:0 cpu_buffer = buffer->buffers[cpu] cpu:0 ... ... __copy_from_user_inatomic(): cpu:0 ... # page fault do_mem_abort(): cpu:0 ... # Call schedule schedule() cpu:0 ... # the task schedule to cpu1 __buffer_unlock_commit(): cpu:1 ... 
ring_buffer_unlock_commit(): cpu:1 ... cpu = raw_smp_processor_id() cpu:1 cpu_buffer = buffer->buffers[cpu] cpu:1 As shown above, the process will acquire cpuid twice and the return values are not the same. To fix this problem using copy_from_user_nofault instead of __copy_from_user_inatomic, as the former performs 'access_ok' before copying. Link: https://lore.kernel.org/20250819105152.2766363-1-luogengkun@huaweicloud.com Fixes: 656c7f0d2d2b ("tracing: Replace kmap with copy_from_user() in trace_marker writing") Signed-off-by: Luo Gengkun Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b7db732c0b1eb..2f1ae6c0ee81e7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7209,7 +7209,7 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user entry = ring_buffer_event_data(event); entry->ip = ip; - len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + len = copy_from_user_nofault(&entry->buf, ubuf, cnt); if (len) { memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; @@ -7306,7 +7306,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, entry = ring_buffer_event_data(event); - len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + len = copy_from_user_nofault(&entry->id, ubuf, cnt); if (len) { entry->id = -1; memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); From de134cb54c3a67644ff95b1c9bffe545e752c912 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 20 Aug 2025 14:52:05 -0700 Subject: [PATCH 0725/3206] btrfs: fix squota compressed stats leak The following workload on a squota enabled fs: btrfs subvol create mnt/subvol # ensure subvol extents get accounted sync btrfs qgroup create 1/1 mnt btrfs qgroup assign mnt/subvol 1/1 mnt btrfs qgroup delete mnt/subvol # make the cleaner thread run btrfs filesystem sync mnt sleep 1 btrfs filesystem sync mnt btrfs qgroup destroy 1/1 mnt will fail with EBUSY. The reason is that 1/1 does the quick accounting when we assign subvol to it, gaining its exclusive usage as excl and excl_cmpr. But then when we delete subvol, the decrement happens via record_squota_delta() which does not update excl_cmpr, as squotas does not make any distinction between compressed and normal extents. Thus, we increment excl_cmpr but never decrement it, and are unable to delete 1/1. The two possible fixes are to make squota always mirror excl and excl_cmpr or to make the fast accounting separately track the plain and cmpr numbers. The latter felt cleaner to me so that is what I opted for. 
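A compressed-counter balance sheet for the workload above makes the leak visible (illustrative numbers; 1M stands for the subvolume's exclusive usage at assign time):

    old fast accounting (both counters fed from src->excl):
        assign  :  1/1.excl += 1M     1/1.excl_cmpr += 1M
        removal :  1/1.excl -= 1M     1/1.excl_cmpr -= 0    (squota deltas
                                                             carry no cmpr part)
        result  :  excl == 0, excl_cmpr == 1M  ->  destroy fails with EBUSY

    new fast accounting:
        each counter is fed from its own source (src->excl vs src->excl_cmpr),
        so whatever the assign added, the removal subtracts, and both reach 0.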
Fixes: 1e0e9d5771c3 ("btrfs: add helper for recording simple quota deltas") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ccaa9a3cf1ce37..da102da169fde2 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1455,6 +1455,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; + u64 num_bytes_cmpr = src->excl_cmpr; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -1466,11 +1467,12 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup_list *glist; qgroup->rfer += sign * num_bytes; - qgroup->rfer_cmpr += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes_cmpr; WARN_ON(sign < 0 && qgroup->excl < num_bytes); + WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr); qgroup->excl += sign * num_bytes; - qgroup->excl_cmpr += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes_cmpr; if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); From 6db1df415d73fcad12134a54f97dc6c8a64ab181 Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Mon, 25 Aug 2025 18:32:04 +0930 Subject: [PATCH 0726/3206] btrfs: accept and ignore compression level for lzo The compression level is meaningless for lzo, but before commit 3f093ccb95f30 ("btrfs: harden parsing of compression mount options"), it was silently ignored if passed. After that commit, passing a level with lzo fails to mount: BTRFS error: unrecognized compression value lzo:1 It seems reasonable for users to expect that lzo would permit a numeric level option, as all the other algos do, even though the kernel's implementation of LZO currently only supports a single level. Because it has always worked to pass a level, it seems likely to me that users in the real world are relying on doing so. This patch restores the old behavior, giving "lzo:N" the same semantics as all of the other compression algos. To be clear, silly variants like "lzo:one", "lzo:the_first_option", or "lzo:armageddon" also used to work. This isn't meant to suggest that any possible mis-interpretation of mount options that once worked must continue to work forever. This is an exceptional case where it makes sense to preserve compatibility, both because the mis-interpretation is reasonable, and because nothing tangible is sacrificed. Finally update btrfs_show_options() to ignore the level of LZO, as it is only the default level without any extra meaning. 
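For reference, both spellings mount again after this change (device path is a placeholder; the level is parsed and then dropped, since the kernel's LZO has only one level):

    mount -o compress=lzo    /dev/sdX /mnt
    mount -o compress=lzo:3  /dev/sdX /mnt   # accepted; kernel warns
                                             # "Compression level ignored for LZO"

and in both cases /proc/mounts shows plain "compress=lzo", matching the btrfs_show_options() change.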
Fixes: 3f093ccb95f30 ("btrfs: harden parsing of compression mount options") Reviewed-by: Daniel Vacek Reviewed-by: Qu Wenruo Signed-off-by: Calvin Owens Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7f31f8bd63ba4d..e708faf1892f47 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -299,9 +299,12 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (btrfs_match_compress_type(string, "lzo", false)) { + } else if (btrfs_match_compress_type(string, "lzo", true)) { ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = 0; + ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, + string + 3); + if (string[3] == ':' && string[4]) + btrfs_warn(NULL, "Compression level ignored for LZO"); btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); @@ -1079,7 +1082,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); - if (info->compress_level) + if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO) seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) From 9786531399a679fc2f4630d2c0a186205282ab2f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 22 Aug 2025 16:06:13 +0930 Subject: [PATCH 0727/3206] btrfs: fix corruption reading compressed range when block size is smaller than page size [BUG] With 64K page size (aarch64 with 64K page size config) and 4K btrfs block size, the following workload can easily lead to a corrupted read: mkfs.btrfs -f -s 4k $dev > /dev/null mount -o compress $dev $mnt xfs_io -f -c "pwrite -S 0xff 0 64k" $mnt/base > /dev/null echo "correct result:" od -Ad -t x1 $mnt/base xfs_io -f -c "reflink $mnt/base 32k 0 32k" \ -c "reflink $mnt/base 0 32k 32k" \ -c "pwrite -S 0xff 60k 4k" $mnt/new > /dev/null echo "incorrect result:" od -Ad -t x1 $mnt/new umount $mnt This shows the following result: correct result: 0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0065536 incorrect result: 0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0032768 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0061440 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0065536 Notice the zero in the range [32K, 60K), which is incorrect. [CAUSE] With extra trace printk, it shows the following events during od: (some unrelated info removed like CPU and context) od-3457 btrfs_do_readpage: enter r/i=5/258 folio=0(65536) prev_em_start=0000000000000000 The "r/i" is indicating the root and inode number. In our case the file "new" is using ino 258 from fs tree (root 5). Here notice the @prev_em_start pointer is NULL. This means the btrfs_do_readpage() is called from btrfs_read_folio(), not from btrfs_readahead(). 
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=0 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=4096 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=8192 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=12288 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=16384 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=20480 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=24576 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=28672 got em start=0 len=32768 These above 32K blocks will be read from the first half of the compressed data extent. od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=32768 got em start=32768 len=32768 Note here there is no btrfs_submit_compressed_read() call. Which is incorrect now. Although both extent maps at 0 and 32K are pointing to the same compressed data, their offsets are different thus can not be merged into the same read. So this means the compressed data read merge check is doing something wrong. od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=36864 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=40960 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=45056 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=49152 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=53248 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=57344 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=61440 skip uptodate od-3457 btrfs_submit_compressed_read: cb orig_bio: file off=0 len=61440 The function btrfs_submit_compressed_read() is only called at the end of folio read. The compressed bio will only have an extent map of range [0, 32K), but the original bio passed in is for the whole 64K folio. This will cause the decompression part to only fill the first 32K, leaving the rest untouched (aka, filled with zero). This incorrect compressed read merge leads to the above data corruption. There were similar problems that happened in the past, commit 808f80b46790 ("Btrfs: update fix for read corruption of compressed and shared extents") is doing pretty much the same fix for readahead. But that's back to 2015, where btrfs still only supports bs (block size) == ps (page size) cases. This means btrfs_do_readpage() only needs to handle a folio which contains exactly one block. Only btrfs_readahead() can lead to a read covering multiple blocks. Thus only btrfs_readahead() passes a non-NULL @prev_em_start pointer. With v5.15 kernel btrfs introduced bs < ps support. This breaks the above assumption that a folio can only contain one block. Now btrfs_read_folio() can also read multiple blocks in one go. But btrfs_read_folio() doesn't pass a @prev_em_start pointer, thus the existing bio force submission check will never be triggered. In theory, this can also happen for btrfs with large folios, but since large folio is still experimental, we don't need to bother it, thus only bs < ps support is affected for now. [FIX] Instead of passing @prev_em_start to do the proper compressed extent check, introduce one new member, btrfs_bio_ctrl::last_em_start, so that the existing bio force submission logic will always be triggered. 
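Spelled out with the reproducer's extent layout (a sketch; the check itself is in the diff below):

    /* file "new": two extent maps share one compressed extent on disk
     *   em A: file [0, 32K),   same disk_bytenr, data offset 32K
     *   em B: file [32K, 64K), same disk_bytenr, data offset 0
     * A disk_bytenr-based contig check would merge A and B into one bio,
     * so the read state tracks the last extent_map start instead: */
    if (compress_type != BTRFS_COMPRESS_NONE &&
        bio_ctrl->last_em_start != U64_MAX &&
        bio_ctrl->last_em_start != em->start)
            force_bio_submit = true;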
CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c953297aa89a01..b21cb72835ccf4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -111,6 +111,24 @@ struct btrfs_bio_ctrl { */ unsigned long submit_bitmap; struct readahead_control *ractl; + + /* + * The start offset of the last used extent map by a read operation. + * + * This is for proper compressed read merge. + * U64_MAX means we are starting the read and have made no progress yet. + * + * The current btrfs_bio_is_contig() only uses disk_bytenr as + * the condition to check if the read can be merged with previous + * bio, which is not correct. E.g. two file extents pointing to the + * same extent but with different offset. + * + * So here we need to do extra checks to only merge reads that are + * covered by the same extent map. + * Just extent_map::start will be enough, as they are unique + * inside the same inode. + */ + u64 last_em_start; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -909,7 +927,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, * return 0 on success, otherwise return error */ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) + struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -1019,12 +1037,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, * non-optimal behavior (submitting 2 bios for the same extent). 
*/ if (compress_type != BTRFS_COMPRESS_NONE && - prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->start) + bio_ctrl->last_em_start != U64_MAX && + bio_ctrl->last_em_start != em->start) force_bio_submit = true; - if (prev_em_start) - *prev_em_start = em->start; + bio_ctrl->last_em_start = em->start; btrfs_free_extent_map(em); em = NULL; @@ -1238,12 +1255,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio) const u64 start = folio_pos(folio); const u64 end = start + folio_size(folio) - 1; struct extent_state *cached_state = NULL; - struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ, + .last_em_start = U64_MAX, + }; struct extent_map *em_cached = NULL; int ret; lock_extents_for_read(inode, start, end, &cached_state); - ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); btrfs_free_extent_map(em_cached); @@ -2583,7 +2603,8 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD, - .ractl = rac + .ractl = rac, + .last_em_start = U64_MAX, }; struct folio *folio; struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); @@ -2591,12 +2612,11 @@ void btrfs_readahead(struct readahead_control *rac) const u64 end = start + readahead_length(rac) - 1; struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; - u64 prev_em_start = (u64)-1; lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) - btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); From f6a6c280059c4ddc23e12e3de1b01098e240036f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 26 Aug 2025 11:24:38 -0700 Subject: [PATCH 0728/3206] btrfs: fix subvolume deletion lockup caused by inodes xarray race There is a race condition between inode eviction and inode caching that can cause a live struct btrfs_inode to be missing from the root->inodes xarray. Specifically, there is a window during evict() between the inode being unhashed and deleted from the xarray. If btrfs_iget() is called for the same inode in that window, it will be recreated and inserted into the xarray, but then eviction will delete the new entry, leaving nothing in the xarray: Thread 1 Thread 2 --------------------------------------------------------------- evict() remove_inode_hash() btrfs_iget_path() btrfs_iget_locked() btrfs_read_locked_inode() btrfs_add_inode_to_root() destroy_inode() btrfs_destroy_inode() btrfs_del_inode_from_root() __xa_erase In turn, this can cause issues for subvolume deletion. Specifically, if an inode is in this lost state, and all other inodes are evicted, then btrfs_del_inode_from_root() will call btrfs_add_dead_root() prematurely. If the lost inode has a delayed_node attached to it, then when btrfs_clean_one_deleted_snapshot() calls btrfs_kill_all_delayed_nodes(), it will loop forever because the delayed_nodes xarray will never become empty (unless memory pressure forces the inode out). We saw this manifest as soft lockups in production. Fix it by only deleting the xarray entry if it matches the given inode (using __xa_cmpxchg()). 
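The fix leans on a standard XArray idiom, compare-and-delete, shown here in isolation (a generic sketch, not the btrfs code):

    void *old;

    /* Erase the slot only if it still holds our entry; if another thread
     * re-populated it in the meantime, leave the new entry alone. */
    old = xa_cmpxchg(&xa, index, my_entry, NULL, GFP_ATOMIC);
    if (old == my_entry) {
            /* removed our own, now-stale entry */
    } else {
            /* raced with re-insertion: the slot belongs to the new owner */
    }

Under an already-held xa_lock the __xa_cmpxchg() variant is used, as in the diff below.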
Fixes: 310b2f5d5a94 ("btrfs: use an xarray to track open inodes in a root") Cc: stable@vger.kernel.org # 6.11+ Reviewed-by: Josef Bacik Reviewed-by: Filipe Manana Co-authored-by: Leo Martins Signed-off-by: Leo Martins Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd82dcc7b2b7b2..e7218e78bff456 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5696,7 +5696,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode) bool empty = false; xa_lock(&root->inodes); - entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + /* + * This btrfs_inode is being freed and has already been unhashed at this + * point. It's possible that another btrfs_inode has already been + * allocated for the same inode and inserted itself into the root, so + * don't delete it in that case. + * + * Note that this shouldn't need to allocate memory, so the gfp flags + * don't really matter. + */ + entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL, + GFP_ATOMIC); if (entry == inode) empty = xa_empty(&root->inodes); xa_unlock(&root->inodes); From 38d98a822c143a4a7337d08f50968cbd7b701ca2 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:02 +0200 Subject: [PATCH 0729/3206] gpio: xgene-sb: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-1-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-xgene-sb.c | 53 ++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c index b51b1fa726bb5a..c559a89aadf7a7 100644 --- a/drivers/gpio/gpio-xgene-sb.c +++ b/drivers/gpio/gpio-xgene-sb.c @@ -21,6 +21,7 @@ #include #include +#include #include "gpiolib-acpi.h" @@ -40,7 +41,7 @@ /** * struct xgene_gpio_sb - GPIO-Standby private data structure. - * @gc: memory-mapped GPIO controllers. 
+ * @chip: Generic GPIO chip data * @regs: GPIO register base offset * @irq_domain: GPIO interrupt domain * @irq_start: GPIO pin that start support interrupt @@ -48,7 +49,7 @@ * @parent_irq_base: Start parent HWIRQ */ struct xgene_gpio_sb { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *regs; struct irq_domain *irq_domain; u16 irq_start; @@ -91,9 +92,9 @@ static int xgene_gpio_sb_irq_set_type(struct irq_data *d, unsigned int type) break; } - xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_SEL_LO, + xgene_gpio_set_bit(&priv->chip.gc, priv->regs + MPA_GPIO_SEL_LO, gpio * 2, 1); - xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_INT_LVL, + xgene_gpio_set_bit(&priv->chip.gc, priv->regs + MPA_GPIO_INT_LVL, d->hwirq, lvl_type); /* Propagate IRQ type setting to parent */ @@ -109,14 +110,14 @@ static void xgene_gpio_sb_irq_mask(struct irq_data *d) irq_chip_mask_parent(d); - gpiochip_disable_irq(&priv->gc, d->hwirq); + gpiochip_disable_irq(&priv->chip.gc, d->hwirq); } static void xgene_gpio_sb_irq_unmask(struct irq_data *d) { struct xgene_gpio_sb *priv = irq_data_get_irq_chip_data(d); - gpiochip_enable_irq(&priv->gc, d->hwirq); + gpiochip_enable_irq(&priv->chip.gc, d->hwirq); irq_chip_unmask_parent(d); } @@ -155,15 +156,15 @@ static int xgene_gpio_sb_domain_activate(struct irq_domain *d, u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq); int ret; - ret = gpiochip_lock_as_irq(&priv->gc, gpio); + ret = gpiochip_lock_as_irq(&priv->chip.gc, gpio); if (ret) { - dev_err(priv->gc.parent, + dev_err(priv->chip.gc.parent, "Unable to configure XGene GPIO standby pin %d as IRQ\n", gpio); return ret; } - xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_SEL_LO, + xgene_gpio_set_bit(&priv->chip.gc, priv->regs + MPA_GPIO_SEL_LO, gpio * 2, 1); return 0; } @@ -174,8 +175,8 @@ static void xgene_gpio_sb_domain_deactivate(struct irq_domain *d, struct xgene_gpio_sb *priv = d->host_data; u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq); - gpiochip_unlock_as_irq(&priv->gc, gpio); - xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_SEL_LO, + gpiochip_unlock_as_irq(&priv->chip.gc, gpio); + xgene_gpio_set_bit(&priv->chip.gc, priv->regs + MPA_GPIO_SEL_LO, gpio * 2, 0); } @@ -237,6 +238,7 @@ static const struct irq_domain_ops xgene_gpio_sb_domain_ops = { static int xgene_gpio_sb_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct xgene_gpio_sb *priv; int ret; void __iomem *regs; @@ -263,14 +265,19 @@ static int xgene_gpio_sb_probe(struct platform_device *pdev) return -ENODEV; } - ret = bgpio_init(&priv->gc, &pdev->dev, 4, - regs + MPA_GPIO_IN_ADDR, - regs + MPA_GPIO_OUT_ADDR, NULL, - regs + MPA_GPIO_OE_ADDR, NULL, 0); + config = (typeof(config)){ + .dev = &pdev->dev, + .sz = 4, + .dat = regs + MPA_GPIO_IN_ADDR, + .set = regs + MPA_GPIO_OUT_ADDR, + .dirout = regs + MPA_GPIO_OE_ADDR, + }; + + ret = gpio_generic_chip_init(&priv->chip, &config); if (ret) return ret; - priv->gc.to_irq = xgene_gpio_sb_to_irq; + priv->chip.gc.to_irq = xgene_gpio_sb_to_irq; /* Retrieve start irq pin, use default if property not found */ priv->irq_start = XGENE_DFLT_IRQ_START_PIN; @@ -283,12 +290,12 @@ static int xgene_gpio_sb_probe(struct platform_device *pdev) priv->nirq = val32; /* Retrieve number gpio, use default if property not found */ - priv->gc.ngpio = XGENE_DFLT_MAX_NGPIO; + priv->chip.gc.ngpio = XGENE_DFLT_MAX_NGPIO; if (!device_property_read_u32(&pdev->dev, "apm,nr-gpios", &val32)) - priv->gc.ngpio = val32; + priv->chip.gc.ngpio = val32; dev_info(&pdev->dev, "Support %d 
gpios, %d irqs start from pin %d\n", - priv->gc.ngpio, priv->nirq, priv->irq_start); + priv->chip.gc.ngpio, priv->nirq, priv->irq_start); platform_set_drvdata(pdev, priv); @@ -298,9 +305,9 @@ static int xgene_gpio_sb_probe(struct platform_device *pdev) if (!priv->irq_domain) return -ENODEV; - priv->gc.irq.domain = priv->irq_domain; + priv->chip.gc.irq.domain = priv->irq_domain; - ret = devm_gpiochip_add_data(&pdev->dev, &priv->gc, priv); + ret = devm_gpiochip_add_data(&pdev->dev, &priv->chip.gc, priv); if (ret) { dev_err(&pdev->dev, "failed to register X-Gene GPIO Standby driver\n"); @@ -311,7 +318,7 @@ static int xgene_gpio_sb_probe(struct platform_device *pdev) dev_info(&pdev->dev, "X-Gene GPIO Standby driver registered\n"); /* Register interrupt handlers for GPIO signaled ACPI Events */ - acpi_gpiochip_request_interrupts(&priv->gc); + acpi_gpiochip_request_interrupts(&priv->chip.gc); return ret; } @@ -320,7 +327,7 @@ static void xgene_gpio_sb_remove(struct platform_device *pdev) { struct xgene_gpio_sb *priv = platform_get_drvdata(pdev); - acpi_gpiochip_free_interrupts(&priv->gc); + acpi_gpiochip_free_interrupts(&priv->chip.gc); irq_domain_remove(priv->irq_domain); } From d3332dd1f6e2cf82744dbab37d05857e2d028fa0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:03 +0200 Subject: [PATCH 0730/3206] gpio: mxs: order includes alphabetically For easier maintenance: put includes in alphabetical order. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-2-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mxs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-mxs.c b/drivers/gpio/gpio-mxs.c index 0ea46f3d04e128..bf0c97f589c96a 100644 --- a/drivers/gpio/gpio-mxs.c +++ b/drivers/gpio/gpio-mxs.c @@ -7,17 +7,17 @@ // Copyright (C) 2004-2010 Freescale Semiconductor, Inc. All Rights Reserved. #include +#include #include #include #include #include #include +#include #include #include #include #include -#include -#include #define MXS_SET 0x4 #define MXS_CLR 0x8 From c7357c8b6703d4bc0db6198782fcbf0cf3033844 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:04 +0200 Subject: [PATCH 0731/3206] gpio: mxs: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-3-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mxs.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/gpio/gpio-mxs.c b/drivers/gpio/gpio-mxs.c index bf0c97f589c96a..af45d1b1af6e04 100644 --- a/drivers/gpio/gpio-mxs.c +++ b/drivers/gpio/gpio-mxs.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -48,7 +49,7 @@ struct mxs_gpio_port { int id; int irq; struct irq_domain *domain; - struct gpio_chip gc; + struct gpio_generic_chip chip; struct device *dev; enum mxs_gpio_id devid; u32 both_edges; @@ -258,6 +259,7 @@ MODULE_DEVICE_TABLE(of, mxs_gpio_dt_ids); static int mxs_gpio_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; + struct gpio_generic_chip_config config; struct device_node *parent; static void __iomem *base; struct mxs_gpio_port *port; @@ -319,19 +321,24 @@ static int mxs_gpio_probe(struct platform_device *pdev) irq_set_chained_handler_and_data(port->irq, mxs_gpio_irq_handler, port); - err = bgpio_init(&port->gc, &pdev->dev, 4, - port->base + PINCTRL_DIN(port), - port->base + PINCTRL_DOUT(port) + MXS_SET, - port->base + PINCTRL_DOUT(port) + MXS_CLR, - port->base + PINCTRL_DOE(port), NULL, 0); + config = (typeof(config)){ + .dev = &pdev->dev, + .sz = 4, + .dat = port->base + PINCTRL_DIN(port), + .set = port->base + PINCTRL_DOUT(port) + MXS_SET, + .clr = port->base + PINCTRL_DOUT(port) + MXS_CLR, + .dirout = port->base + PINCTRL_DOE(port), + }; + + err = gpio_generic_chip_init(&port->chip, &config); if (err) goto out_irqdomain_remove; - port->gc.to_irq = mxs_gpio_to_irq; - port->gc.get_direction = mxs_gpio_get_direction; - port->gc.base = port->id * 32; + port->chip.gc.to_irq = mxs_gpio_to_irq; + port->chip.gc.get_direction = mxs_gpio_get_direction; + port->chip.gc.base = port->id * 32; - err = gpiochip_add_data(&port->gc, port); + err = gpiochip_add_data(&port->chip.gc, port); if (err) goto out_irqdomain_remove; From 7cb9086790a0de526ee40508a4deaebfd82a5bca Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:05 +0200 Subject: [PATCH 0732/3206] gpio: mlxbf2: use dev_err_probe() where applicable Simplify error handling and shrink the code by using dev_err_probe() consistently across the driver. 
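The transformation has one shape throughout, sketched here; besides saving lines, dev_err_probe() also demotes -EPROBE_DEFER to a debug message and records the deferral reason:

    /* before */
    if (ret) {
            dev_err(dev, "Failed to get yu_arm_gpio_lock resource\n");
            return ret;
    }

    /* after */
    if (ret)
            return dev_err_probe(dev, ret,
                                 "Failed to get yu_arm_gpio_lock resource\n");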
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-4-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mlxbf2.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c index 390f2e74a9d819..bc4bba8b567c26 100644 --- a/drivers/gpio/gpio-mlxbf2.c +++ b/drivers/gpio/gpio-mlxbf2.c @@ -369,10 +369,8 @@ mlxbf2_gpio_probe(struct platform_device *pdev) return PTR_ERR(gs->gpio_io); ret = mlxbf2_gpio_get_lock_res(pdev); - if (ret) { - dev_err(dev, "Failed to get yu_arm_gpio_lock resource\n"); - return ret; - } + if (ret) + return dev_err_probe(dev, ret, "Failed to get yu_arm_gpio_lock resource\n"); if (device_property_read_u32(dev, "npins", &npins)) npins = MLXBF2_GPIO_MAX_PINS_PER_BLOCK; @@ -387,10 +385,8 @@ mlxbf2_gpio_probe(struct platform_device *pdev) NULL, 0); - if (ret) { - dev_err(dev, "bgpio_init failed\n"); - return ret; - } + if (ret) + return dev_err_probe(dev, ret, "bgpio_init failed\n"); gc->direction_input = mlxbf2_gpio_direction_input; gc->direction_output = mlxbf2_gpio_direction_output; @@ -414,19 +410,15 @@ mlxbf2_gpio_probe(struct platform_device *pdev) */ ret = devm_request_irq(dev, irq, mlxbf2_gpio_irq_handler, IRQF_SHARED, name, gs); - if (ret) { - dev_err(dev, "failed to request IRQ"); - return ret; - } + if (ret) + return dev_err_probe(dev, ret, "failed to request IRQ"); } platform_set_drvdata(pdev, gs); ret = devm_gpiochip_add_data(dev, &gs->gc, gs); - if (ret) { - dev_err(dev, "Failed adding memory mapped gpiochip\n"); - return ret; - } + if (ret) + return dev_err_probe(dev, ret, "Failed adding memory mapped gpiochip\n"); return 0; } From 72fdbf35ec7273bb1c885696680e611c47b261b4 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:06 +0200 Subject: [PATCH 0733/3206] gpio: mlxbf2: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-5-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mlxbf2.c | 59 +++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c index bc4bba8b567c26..f99f66cd189ca7 100644 --- a/drivers/gpio/gpio-mlxbf2.c +++ b/drivers/gpio/gpio-mlxbf2.c @@ -6,8 +6,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -65,7 +67,7 @@ struct mlxbf2_gpio_context_save_regs { /* BlueField-2 gpio block context structure. 
*/ struct mlxbf2_gpio_context { - struct gpio_chip gc; + struct gpio_generic_chip chip; /* YU GPIO blocks address */ void __iomem *gpio_io; @@ -132,7 +134,7 @@ static int mlxbf2_gpio_lock_acquire(struct mlxbf2_gpio_context *gs) u32 arm_gpio_lock_val; mutex_lock(yu_arm_gpio_lock_param.lock); - raw_spin_lock(&gs->gc.bgpio_lock); + gpio_generic_chip_lock(&gs->chip); arm_gpio_lock_val = readl(yu_arm_gpio_lock_param.io); @@ -140,7 +142,7 @@ static int mlxbf2_gpio_lock_acquire(struct mlxbf2_gpio_context *gs) * When lock active bit[31] is set, ModeX is write enabled */ if (YU_LOCK_ACTIVE_BIT(arm_gpio_lock_val)) { - raw_spin_unlock(&gs->gc.bgpio_lock); + gpio_generic_chip_unlock(&gs->chip); mutex_unlock(yu_arm_gpio_lock_param.lock); return -EINVAL; } @@ -154,11 +156,11 @@ static int mlxbf2_gpio_lock_acquire(struct mlxbf2_gpio_context *gs) * Release the YU arm_gpio_lock after changing the direction mode. */ static void mlxbf2_gpio_lock_release(struct mlxbf2_gpio_context *gs) - __releases(&gs->gc.bgpio_lock) + __releases(&gs->chip.gc.bgpio_lock) __releases(yu_arm_gpio_lock_param.lock) { writel(YU_ARM_GPIO_LOCK_RELEASE, yu_arm_gpio_lock_param.io); - raw_spin_unlock(&gs->gc.bgpio_lock); + gpio_generic_chip_unlock(&gs->chip); mutex_unlock(yu_arm_gpio_lock_param.lock); } @@ -235,11 +237,10 @@ static void mlxbf2_gpio_irq_enable(struct irq_data *irqd) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf2_gpio_context *gs = gpiochip_get_data(gc); int offset = irqd_to_hwirq(irqd); - unsigned long flags; u32 val; gpiochip_enable_irq(gc, irqd_to_hwirq(irqd)); - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&gs->chip); val = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_CLRCAUSE); val |= BIT(offset); writel(val, gs->gpio_io + YU_GPIO_CAUSE_OR_CLRCAUSE); @@ -247,7 +248,6 @@ static void mlxbf2_gpio_irq_enable(struct irq_data *irqd) val = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); val |= BIT(offset); writel(val, gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); } static void mlxbf2_gpio_irq_disable(struct irq_data *irqd) @@ -255,21 +255,21 @@ static void mlxbf2_gpio_irq_disable(struct irq_data *irqd) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf2_gpio_context *gs = gpiochip_get_data(gc); int offset = irqd_to_hwirq(irqd); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); - val = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); - val &= ~BIT(offset); - writel(val, gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &gs->chip) { + val = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); + val &= ~BIT(offset); + writel(val, gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); + } + gpiochip_disable_irq(gc, irqd_to_hwirq(irqd)); } static irqreturn_t mlxbf2_gpio_irq_handler(int irq, void *ptr) { struct mlxbf2_gpio_context *gs = ptr; - struct gpio_chip *gc = &gs->gc; + struct gpio_chip *gc = &gs->chip.gc; unsigned long pending; u32 level; @@ -288,7 +288,6 @@ mlxbf2_gpio_irq_set_type(struct irq_data *irqd, unsigned int type) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf2_gpio_context *gs = gpiochip_get_data(gc); int offset = irqd_to_hwirq(irqd); - unsigned long flags; bool fall = false; bool rise = false; u32 val; @@ -308,7 +307,8 @@ mlxbf2_gpio_irq_set_type(struct irq_data *irqd, unsigned int type) return -EINVAL; } - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); + 
guard(gpio_generic_lock_irqsave)(&gs->chip); + if (fall) { val = readl(gs->gpio_io + YU_GPIO_CAUSE_FALL_EN); val |= BIT(offset); @@ -320,7 +320,6 @@ mlxbf2_gpio_irq_set_type(struct irq_data *irqd, unsigned int type) val |= BIT(offset); writel(val, gs->gpio_io + YU_GPIO_CAUSE_RISE_EN); } - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); return 0; } @@ -347,6 +346,7 @@ static const struct irq_chip mlxbf2_gpio_irq_chip = { static int mlxbf2_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct mlxbf2_gpio_context *gs; struct device *dev = &pdev->dev; struct gpio_irq_chip *girq; @@ -375,18 +375,19 @@ mlxbf2_gpio_probe(struct platform_device *pdev) if (device_property_read_u32(dev, "npins", &npins)) npins = MLXBF2_GPIO_MAX_PINS_PER_BLOCK; - gc = &gs->gc; + gc = &gs->chip.gc; - ret = bgpio_init(gc, dev, 4, - gs->gpio_io + YU_GPIO_DATAIN, - gs->gpio_io + YU_GPIO_DATASET, - gs->gpio_io + YU_GPIO_DATACLEAR, - NULL, - NULL, - 0); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = gs->gpio_io + YU_GPIO_DATAIN, + .set = gs->gpio_io + YU_GPIO_DATASET, + .clr = gs->gpio_io + YU_GPIO_DATACLEAR, + }; + ret = gpio_generic_chip_init(&gs->chip, &config); if (ret) - return dev_err_probe(dev, ret, "bgpio_init failed\n"); + return dev_err_probe(dev, ret, "failed to initialize the generic GPIO chip\n"); gc->direction_input = mlxbf2_gpio_direction_input; gc->direction_output = mlxbf2_gpio_direction_output; @@ -395,7 +396,7 @@ mlxbf2_gpio_probe(struct platform_device *pdev) irq = platform_get_irq_optional(pdev, 0); if (irq >= 0) { - girq = &gs->gc.irq; + girq = &gs->chip.gc.irq; gpio_irq_chip_set_chip(girq, &mlxbf2_gpio_irq_chip); girq->handler = handle_simple_irq; girq->default_type = IRQ_TYPE_NONE; @@ -416,7 +417,7 @@ mlxbf2_gpio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, gs); - ret = devm_gpiochip_add_data(dev, &gs->gc, gs); + ret = devm_gpiochip_add_data(dev, &gs->chip.gc, gs); if (ret) return dev_err_probe(dev, ret, "Failed adding memory mapped gpiochip\n"); From 6821e5d5877ca80b6989dfba2648a7ecbe3d9a64 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:07 +0200 Subject: [PATCH 0734/3206] gpio: xgs-iproc: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
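The conversion keeps the driver's private accessor working by chasing two container layers, a sketch of the pattern (names as in the diff below):

    static inline struct iproc_gpio_chip *to_iproc_gpio(struct gpio_chip *gc)
    {
            /* gpio_chip is embedded in gpio_generic_chip, which in turn
             * is embedded in the driver's state structure */
            return container_of(to_gpio_generic_chip(gc),
                                struct iproc_gpio_chip, gen_gc);
    }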
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-6-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-xgs-iproc.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/gpio/gpio-xgs-iproc.c b/drivers/gpio/gpio-xgs-iproc.c index 93544e98ccbd3f..9cffdedd31b1c3 100644 --- a/drivers/gpio/gpio-xgs-iproc.c +++ b/drivers/gpio/gpio-xgs-iproc.c @@ -3,11 +3,12 @@ * Copyright (C) 2017 Broadcom */ -#include #include #include #include #include +#include +#include #include #include #include @@ -28,7 +29,7 @@ #define IPROC_GPIO_CCA_INT_EDGE 0x24 struct iproc_gpio_chip { - struct gpio_chip gc; + struct gpio_generic_chip gen_gc; spinlock_t lock; struct device *dev; void __iomem *base; @@ -38,7 +39,7 @@ struct iproc_gpio_chip { static inline struct iproc_gpio_chip * to_iproc_gpio(struct gpio_chip *gc) { - return container_of(gc, struct iproc_gpio_chip, gc); + return container_of(to_gpio_generic_chip(gc), struct iproc_gpio_chip, gen_gc); } static void iproc_gpio_irq_ack(struct irq_data *d) @@ -213,6 +214,7 @@ static const struct irq_chip iproc_gpio_irq_chip = { static int iproc_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct device_node *dn = pdev->dev.of_node; struct iproc_gpio_chip *chip; @@ -231,21 +233,23 @@ static int iproc_gpio_probe(struct platform_device *pdev) if (IS_ERR(chip->base)) return PTR_ERR(chip->base); - ret = bgpio_init(&chip->gc, dev, 4, - chip->base + IPROC_GPIO_CCA_DIN, - chip->base + IPROC_GPIO_CCA_DOUT, - NULL, - chip->base + IPROC_GPIO_CCA_OUT_EN, - NULL, - 0); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = chip->base + IPROC_GPIO_CCA_DIN, + .set = chip->base + IPROC_GPIO_CCA_DOUT, + .dirout = chip->base + IPROC_GPIO_CCA_OUT_EN, + }; + + ret = gpio_generic_chip_init(&chip->gen_gc, &config); if (ret) { dev_err(dev, "unable to init GPIO chip\n"); return ret; } - chip->gc.label = dev_name(dev); + chip->gen_gc.gc.label = dev_name(dev); if (!of_property_read_u32(dn, "ngpios", &num_gpios)) - chip->gc.ngpio = num_gpios; + chip->gen_gc.gc.ngpio = num_gpios; irq = platform_get_irq(pdev, 0); if (irq > 0) { @@ -266,13 +270,13 @@ static int iproc_gpio_probe(struct platform_device *pdev) * a flow-handler because the irq is shared. */ ret = devm_request_irq(dev, irq, iproc_gpio_irq_handler, - IRQF_SHARED, chip->gc.label, &chip->gc); + IRQF_SHARED, chip->gen_gc.gc.label, &chip->gen_gc.gc); if (ret) { dev_err(dev, "Fail to request IRQ%d: %d\n", irq, ret); return ret; } - girq = &chip->gc.irq; + girq = &chip->gen_gc.gc.irq; gpio_irq_chip_set_chip(girq, &iproc_gpio_irq_chip); /* This will let us handle the parent IRQ in the driver */ girq->parent_handler = NULL; @@ -282,7 +286,7 @@ static int iproc_gpio_probe(struct platform_device *pdev) girq->handler = handle_simple_irq; } - ret = devm_gpiochip_add_data(dev, &chip->gc, chip); + ret = devm_gpiochip_add_data(dev, &chip->gen_gc.gc, chip); if (ret) { dev_err(dev, "unable to add GPIO chip\n"); return ret; From cf0257d3ce05259a74265fe0a0bd7de063cc6793 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:08 +0200 Subject: [PATCH 0735/3206] gpio: ftgpio010: order includes alphabetically For easier maintenance: put includes in alphabetical order. 
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-7-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ftgpio010.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-ftgpio010.c b/drivers/gpio/gpio-ftgpio010.c index c35eaa2851d853..56666ca8889bc9 100644 --- a/drivers/gpio/gpio-ftgpio010.c +++ b/drivers/gpio/gpio-ftgpio010.c @@ -10,12 +10,13 @@ * MXC GPIO support. (c) 2008 Daniel Mack * Copyright 2008 Juergen Beisert, kernel@pengutronix.de */ + +#include +#include #include -#include #include +#include #include -#include -#include /* GPIO registers definition */ #define GPIO_DATA_OUT 0x00 From 3ff7ab070b4804aad5b1d3e3d82a793710ef1f27 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:09 +0200 Subject: [PATCH 0736/3206] gpio: ftgpio010: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-8-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ftgpio010.c | 39 ++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/gpio/gpio-ftgpio010.c b/drivers/gpio/gpio-ftgpio010.c index 56666ca8889bc9..dfa2c9444960a3 100644 --- a/drivers/gpio/gpio-ftgpio010.c +++ b/drivers/gpio/gpio-ftgpio010.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -41,13 +42,13 @@ /** * struct ftgpio_gpio - Gemini GPIO state container * @dev: containing device for this instance - * @gc: gpiochip for this instance + * @chip: generic GPIO chip for this instance * @base: remapped I/O-memory base * @clk: silicon clock */ struct ftgpio_gpio { struct device *dev; - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *base; struct clk *clk; }; @@ -234,6 +235,7 @@ static const struct irq_chip ftgpio_irq_chip = { static int ftgpio_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct ftgpio_gpio *g; struct gpio_irq_chip *girq; @@ -262,27 +264,30 @@ static int ftgpio_gpio_probe(struct platform_device *pdev) */ return PTR_ERR(g->clk); - ret = bgpio_init(&g->gc, dev, 4, - g->base + GPIO_DATA_IN, - g->base + GPIO_DATA_SET, - g->base + GPIO_DATA_CLR, - g->base + GPIO_DIR, - NULL, - 0); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = g->base + GPIO_DATA_IN, + .set = g->base + GPIO_DATA_SET, + .clr = g->base + GPIO_DATA_CLR, + .dirout = g->base + GPIO_DIR, + }; + + ret = gpio_generic_chip_init(&g->chip, &config); if (ret) return dev_err_probe(dev, ret, "unable to init generic GPIO\n"); - g->gc.label = dev_name(dev); - g->gc.base = -1; - g->gc.parent = dev; - g->gc.owner = THIS_MODULE; - /* ngpio is set by bgpio_init() */ + g->chip.gc.label = dev_name(dev); + g->chip.gc.base = -1; + g->chip.gc.parent = dev; + g->chip.gc.owner = THIS_MODULE; + /* ngpio is set by gpio_generic_chip_init() */ /* We need a silicon clock to do debounce */ if (!IS_ERR(g->clk)) - g->gc.set_config = ftgpio_gpio_set_config; + g->chip.gc.set_config = ftgpio_gpio_set_config; - girq = &g->gc.irq; + girq = &g->chip.gc.irq; gpio_irq_chip_set_chip(girq, &ftgpio_irq_chip); girq->parent_handler = ftgpio_gpio_irq_handler; girq->num_parents = 1; @@ -303,7 +308,7 @@ static int ftgpio_gpio_probe(struct platform_device *pdev) /* Clear any use of debounce */ 
writel(0x0, g->base + GPIO_DEBOUNCE_EN); - return devm_gpiochip_add_data(dev, &g->gc, g); + return devm_gpiochip_add_data(dev, &g->chip.gc, g); } static const struct of_device_id ftgpio_gpio_of_match[] = { From b9dac8251e7e6aa433f54a7da45cb05c66627695 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:10 +0200 Subject: [PATCH 0737/3206] gpio: realtek-otto: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-9-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-realtek-otto.c | 41 +++++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/drivers/gpio/gpio-realtek-otto.c b/drivers/gpio/gpio-realtek-otto.c index d6418f89d3f63d..ab711422254e9e 100644 --- a/drivers/gpio/gpio-realtek-otto.c +++ b/drivers/gpio/gpio-realtek-otto.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only -#include #include +#include +#include #include #include #include @@ -41,7 +42,7 @@ /** * realtek_gpio_ctrl - Realtek Otto GPIO driver data * - * @gc: Associated gpio_chip instance + * @chip: Associated gpio_generic_chip instance * @base: Base address of the register block for a GPIO bank * @lock: Lock for accessing the IRQ registers and values * @intr_mask: Mask for interrupts lines @@ -64,7 +65,7 @@ * IMR on changes. */ struct realtek_gpio_ctrl { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *base; void __iomem *cpumask_base; struct cpumask cpu_irq_maskable; @@ -101,7 +102,7 @@ static struct realtek_gpio_ctrl *irq_data_to_ctrl(struct irq_data *data) { struct gpio_chip *gc = irq_data_get_irq_chip_data(data); - return container_of(gc, struct realtek_gpio_ctrl, gc); + return container_of(to_gpio_generic_chip(gc), struct realtek_gpio_ctrl, chip); } /* @@ -194,7 +195,7 @@ static void realtek_gpio_irq_unmask(struct irq_data *data) unsigned int line = irqd_to_hwirq(data); unsigned long flags; - gpiochip_enable_irq(&ctrl->gc, line); + gpiochip_enable_irq(&ctrl->chip.gc, line); raw_spin_lock_irqsave(&ctrl->lock, flags); ctrl->intr_mask[line] = REALTEK_GPIO_IMR_LINE_MASK; @@ -213,7 +214,7 @@ static void realtek_gpio_irq_mask(struct irq_data *data) realtek_gpio_update_line_imr(ctrl, line); raw_spin_unlock_irqrestore(&ctrl->lock, flags); - gpiochip_disable_irq(&ctrl->gc, line); + gpiochip_disable_irq(&ctrl->chip.gc, line); } static int realtek_gpio_irq_set_type(struct irq_data *data, unsigned int flow_type) @@ -356,8 +357,9 @@ MODULE_DEVICE_TABLE(of, realtek_gpio_of_match); static int realtek_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; - unsigned long bgpio_flags; + unsigned long gen_gc_flags; unsigned int dev_flags; struct gpio_irq_chip *girq; struct realtek_gpio_ctrl *ctrl; @@ -388,32 +390,37 @@ static int realtek_gpio_probe(struct platform_device *pdev) raw_spin_lock_init(&ctrl->lock); if (dev_flags & GPIO_PORTS_REVERSED) { - bgpio_flags = 0; + gen_gc_flags = 0; ctrl->bank_read = realtek_gpio_bank_read; ctrl->bank_write = realtek_gpio_bank_write; ctrl->line_imr_pos = realtek_gpio_line_imr_pos; } else { - bgpio_flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + gen_gc_flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; ctrl->bank_read = realtek_gpio_bank_read_swapped; ctrl->bank_write = realtek_gpio_bank_write_swapped; ctrl->line_imr_pos = realtek_gpio_line_imr_pos_swapped; } - err = 
bgpio_init(&ctrl->gc, dev, 4, - ctrl->base + REALTEK_GPIO_REG_DATA, NULL, NULL, - ctrl->base + REALTEK_GPIO_REG_DIR, NULL, - bgpio_flags); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = ctrl->base + REALTEK_GPIO_REG_DATA, + .dirout = ctrl->base + REALTEK_GPIO_REG_DIR, + .flags = gen_gc_flags, + }; + + err = gpio_generic_chip_init(&ctrl->chip, &config); if (err) { dev_err(dev, "unable to init generic GPIO"); return err; } - ctrl->gc.ngpio = ngpios; - ctrl->gc.owner = THIS_MODULE; + ctrl->chip.gc.ngpio = ngpios; + ctrl->chip.gc.owner = THIS_MODULE; irq = platform_get_irq_optional(pdev, 0); if (!(dev_flags & GPIO_INTERRUPTS_DISABLED) && irq > 0) { - girq = &ctrl->gc.irq; + girq = &ctrl->chip.gc.irq; gpio_irq_chip_set_chip(girq, &realtek_gpio_irq_chip); girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_bad_irq; @@ -442,7 +449,7 @@ static int realtek_gpio_probe(struct platform_device *pdev) cpumask_set_cpu(cpu, &ctrl->cpu_irq_maskable); } - return devm_gpiochip_add_data(dev, &ctrl->gc, ctrl); + return devm_gpiochip_add_data(dev, &ctrl->chip.gc, ctrl); } static struct platform_driver realtek_gpio_driver = { From c0378e59a6af2efa470a384b69fd24d3f3f3dd97 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:11 +0200 Subject: [PATCH 0738/3206] gpio: hisi: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-10-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-hisi.c | 46 +++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/drivers/gpio/gpio-hisi.c b/drivers/gpio/gpio-hisi.c index 6016e6f0ed0fb8..01a99ac613d94e 100644 --- a/drivers/gpio/gpio-hisi.c +++ b/drivers/gpio/gpio-hisi.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2020 HiSilicon Limited. 
*/ + #include +#include #include #include #include @@ -33,7 +35,7 @@ #define HISI_GPIO_DRIVER_NAME "gpio-hisi" struct hisi_gpio { - struct gpio_chip chip; + struct gpio_generic_chip chip; struct device *dev; void __iomem *reg_base; unsigned int line_num; @@ -43,8 +45,8 @@ struct hisi_gpio { static inline u32 hisi_gpio_read_reg(struct gpio_chip *chip, unsigned int off) { - struct hisi_gpio *hisi_gpio = - container_of(chip, struct hisi_gpio, chip); + struct hisi_gpio *hisi_gpio = container_of(to_gpio_generic_chip(chip), + struct hisi_gpio, chip); void __iomem *reg = hisi_gpio->reg_base + off; return readl(reg); @@ -53,8 +55,8 @@ static inline u32 hisi_gpio_read_reg(struct gpio_chip *chip, static inline void hisi_gpio_write_reg(struct gpio_chip *chip, unsigned int off, u32 val) { - struct hisi_gpio *hisi_gpio = - container_of(chip, struct hisi_gpio, chip); + struct hisi_gpio *hisi_gpio = container_of(to_gpio_generic_chip(chip), + struct hisi_gpio, chip); void __iomem *reg = hisi_gpio->reg_base + off; writel(val, reg); @@ -180,14 +182,14 @@ static void hisi_gpio_irq_disable(struct irq_data *d) static void hisi_gpio_irq_handler(struct irq_desc *desc) { struct hisi_gpio *hisi_gpio = irq_desc_get_handler_data(desc); - unsigned long irq_msk = hisi_gpio_read_reg(&hisi_gpio->chip, + unsigned long irq_msk = hisi_gpio_read_reg(&hisi_gpio->chip.gc, HISI_GPIO_INTSTATUS_WX); struct irq_chip *irq_c = irq_desc_get_chip(desc); int hwirq; chained_irq_enter(irq_c, desc); for_each_set_bit(hwirq, &irq_msk, HISI_GPIO_LINE_NUM_MAX) - generic_handle_domain_irq(hisi_gpio->chip.irq.domain, + generic_handle_domain_irq(hisi_gpio->chip.gc.irq.domain, hwirq); chained_irq_exit(irq_c, desc); } @@ -206,7 +208,7 @@ static const struct irq_chip hisi_gpio_irq_chip = { static void hisi_gpio_init_irq(struct hisi_gpio *hisi_gpio) { - struct gpio_chip *chip = &hisi_gpio->chip; + struct gpio_chip *chip = &hisi_gpio->chip.gc; struct gpio_irq_chip *girq_chip = &chip->irq; gpio_irq_chip_set_chip(girq_chip, &hisi_gpio_irq_chip); @@ -264,6 +266,7 @@ static void hisi_gpio_get_pdata(struct device *dev, static int hisi_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct hisi_gpio *hisi_gpio; int port_num; @@ -289,26 +292,31 @@ static int hisi_gpio_probe(struct platform_device *pdev) hisi_gpio->dev = dev; - ret = bgpio_init(&hisi_gpio->chip, hisi_gpio->dev, 0x4, - hisi_gpio->reg_base + HISI_GPIO_EXT_PORT_WX, - hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_SET_WX, - hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_CLR_WX, - hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_SET_WX, - hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_CLR_WX, - BGPIOF_NO_SET_ON_INPUT | BGPIOF_UNREADABLE_REG_DIR); + config = (typeof(config)){ + .dev = hisi_gpio->dev, + .sz = 4, + .dat = hisi_gpio->reg_base + HISI_GPIO_EXT_PORT_WX, + .set = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_SET_WX, + .clr = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_CLR_WX, + .dirout = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_SET_WX, + .dirin = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_CLR_WX, + .flags = BGPIOF_NO_SET_ON_INPUT | BGPIOF_UNREADABLE_REG_DIR, + }; + + ret = gpio_generic_chip_init(&hisi_gpio->chip, &config); if (ret) { dev_err(dev, "failed to init, ret = %d\n", ret); return ret; } - hisi_gpio->chip.set_config = hisi_gpio_set_config; - hisi_gpio->chip.ngpio = hisi_gpio->line_num; - hisi_gpio->chip.base = -1; + hisi_gpio->chip.gc.set_config = hisi_gpio_set_config; + hisi_gpio->chip.gc.ngpio = hisi_gpio->line_num; + 
hisi_gpio->chip.gc.base = -1; if (hisi_gpio->irq > 0) hisi_gpio_init_irq(hisi_gpio); - ret = devm_gpiochip_add_data(dev, &hisi_gpio->chip, hisi_gpio); + ret = devm_gpiochip_add_data(dev, &hisi_gpio->chip.gc, hisi_gpio); if (ret) { dev_err(dev, "failed to register gpiochip, ret = %d\n", ret); return ret; From 656dc0c6f725a29c9e48657ae3db78f9016f518c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:12 +0200 Subject: [PATCH 0739/3206] gpio: vf610: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-11-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-vf610.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index 7de0d5b53d5604..fa7e322a834cc2 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,7 @@ struct fsl_gpio_soc_data { }; struct vf610_gpio_port { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *base; void __iomem *gpio_base; const struct fsl_gpio_soc_data *sdata; @@ -108,7 +109,7 @@ static void vf610_gpio_irq_handler(struct irq_desc *desc) for_each_set_bit(pin, &irq_isfr, VF610_GPIO_PER_PORT) { vf610_gpio_writel(BIT(pin), port->base + PORT_ISFR); - generic_handle_domain_irq(port->gc.irq.domain, pin); + generic_handle_domain_irq(port->chip.gc.irq.domain, pin); } chained_irq_exit(chip, desc); @@ -214,6 +215,7 @@ static void vf610_gpio_disable_clk(void *data) static int vf610_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct vf610_gpio_port *port; struct gpio_chip *gc; @@ -293,7 +295,7 @@ static int vf610_gpio_probe(struct platform_device *pdev) return ret; } - gc = &port->gc; + gc = &port->chip.gc; flags = BGPIOF_PINCTRL_BACKEND; /* * We only read the output register for current value on output @@ -302,13 +304,18 @@ static int vf610_gpio_probe(struct platform_device *pdev) */ if (port->sdata->have_paddr) flags |= BGPIOF_READ_OUTPUT_REG_SET; - ret = bgpio_init(gc, dev, 4, - port->gpio_base + GPIO_PDIR, - port->gpio_base + GPIO_PDOR, - NULL, - port->sdata->have_paddr ? port->gpio_base + GPIO_PDDR : NULL, - NULL, - flags); + + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = port->gpio_base + GPIO_PDIR, + .set = port->gpio_base + GPIO_PDOR, + .dirout = port->sdata->have_paddr ? + port->gpio_base + GPIO_PDDR : NULL, + .flags = flags, + }; + + ret = gpio_generic_chip_init(&port->chip, &config); if (ret) return dev_err_probe(dev, ret, "unable to init generic GPIO\n"); gc->label = dev_name(dev); From a6f03347debb7c2c6d04cd4be67ed766e19633ba Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:35:13 +0200 Subject: [PATCH 0740/3206] gpio: visconti: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
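The config struct also makes optional registers and the old bgpio_init() flags explicit, which reads better than a positional list of NULLs. A sketch of the conditional variant seen in the vf610 hunk above (base, have_paddr and the offsets are stand-ins for that driver's fields):

	config = (typeof(config)){
		.dev	= dev,
		.sz	= 4,
		.dat	= base + 0x00,	/* hypothetical data-in */
		.set	= base + 0x04,	/* hypothetical data-out */
		/* Only instances with a direction register populate .dirout: */
		.dirout	= have_paddr ? base + 0x08 : NULL,
		.flags	= have_paddr ? BGPIOF_READ_OUTPUT_REG_SET : 0,
	};

	ret = gpio_generic_chip_init(&port->chip, &config);
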
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-mmio-gpio-conv-part2-v1-12-f67603e4b27e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-visconti.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/gpio/gpio-visconti.c b/drivers/gpio/gpio-visconti.c index 5bd965c18a465f..cde1581a91033e 100644 --- a/drivers/gpio/gpio-visconti.c +++ b/drivers/gpio/gpio-visconti.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -32,7 +33,7 @@ struct visconti_gpio { void __iomem *base; spinlock_t lock; /* protect gpio register */ - struct gpio_chip gpio_chip; + struct gpio_generic_chip chip; struct device *dev; }; @@ -158,6 +159,7 @@ static const struct irq_chip visconti_gpio_irq_chip = { static int visconti_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct visconti_gpio *priv; struct gpio_irq_chip *girq; @@ -189,19 +191,22 @@ static int visconti_gpio_probe(struct platform_device *pdev) return -ENODEV; } - ret = bgpio_init(&priv->gpio_chip, dev, 4, - priv->base + GPIO_IDATA, - priv->base + GPIO_OSET, - priv->base + GPIO_OCLR, - priv->base + GPIO_DIR, - NULL, - 0); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = priv->base + GPIO_IDATA, + .set = priv->base + GPIO_OSET, + .clr = priv->base + GPIO_OCLR, + .dirout = priv->base + GPIO_DIR, + }; + + ret = gpio_generic_chip_init(&priv->chip, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; } - girq = &priv->gpio_chip.irq; + girq = &priv->chip.gc.irq; gpio_irq_chip_set_chip(girq, &visconti_gpio_irq_chip); girq->fwnode = dev_fwnode(dev); girq->parent_domain = parent; @@ -210,7 +215,7 @@ static int visconti_gpio_probe(struct platform_device *pdev) girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_level_irq; - return devm_gpiochip_add_data(dev, &priv->gpio_chip, priv); + return devm_gpiochip_add_data(dev, &priv->chip.gc, priv); } static const struct of_device_id visconti_gpio_of_match[] = { From 246b889c704e3209050eb0aa5a3733564aee1b38 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:54:35 +0200 Subject: [PATCH 0741/3206] gpio: stmpe: don't print out global GPIO numbers in debugfs callbacks In order to further limit the number of references to the GPIO base number stored in struct gpio_chip, replace the global GPIO numbers in the output of debugfs callbacks by hardware offsets. 
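The same mechanical change repeats in the nomadik, wm831x, wm8994, mvebu and xra1403 patches that follow: drop the gc->base arithmetic and print the hardware offset directly. In its simplest form (as in the xra1403 hunk further below; foo_dbg_show is a made-up name):

	static void foo_dbg_show(struct seq_file *s, struct gpio_chip *gc)
	{
		unsigned int offset;
		const char *label;

		/* 'offset' is the hardware offset within this chip. */
		for_each_requested_gpio(gc, offset, label) {
			/* No 'gc->base + offset': global numbers are gone. */
			seq_printf(s, " gpio-%-3u (%-20.20s)\n", offset, label);
		}
	}
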
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-dbg-show-base-v1-1-7f27cd7f2256@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-stmpe.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c index 7bf270af07fe49..6faf30347a3639 100644 --- a/drivers/gpio/gpio-stmpe.c +++ b/drivers/gpio/gpio-stmpe.c @@ -262,9 +262,8 @@ static void stmpe_gpio_irq_unmask(struct irq_data *d) stmpe_gpio->regs[REG_IE][regoffset] |= mask; } -static void stmpe_dbg_show_one(struct seq_file *s, - struct gpio_chip *gc, - unsigned offset, unsigned gpio) +static void stmpe_dbg_show_one(struct seq_file *s, struct gpio_chip *gc, + unsigned int offset) { struct stmpe_gpio *stmpe_gpio = gpiochip_get_data(gc); struct stmpe *stmpe = stmpe_gpio->stmpe; @@ -286,7 +285,7 @@ static void stmpe_dbg_show_one(struct seq_file *s, if (dir) { seq_printf(s, " gpio-%-3d (%-20.20s) out %s", - gpio, label ?: "(none)", str_hi_lo(val)); + offset, label ?: "(none)", str_hi_lo(val)); } else { u8 edge_det_reg; u8 rise_reg; @@ -354,7 +353,7 @@ static void stmpe_dbg_show_one(struct seq_file *s, irqen = !!(ret & mask); seq_printf(s, " gpio-%-3d (%-20.20s) in %s %13s %13s %25s %25s", - gpio, label ?: "(none)", + offset, label ?: "(none)", str_hi_lo(val), edge_det_values[edge_det], irqen ? "IRQ-enabled" : "IRQ-disabled", @@ -366,10 +365,9 @@ static void stmpe_dbg_show_one(struct seq_file *s, static void stmpe_dbg_show(struct seq_file *s, struct gpio_chip *gc) { unsigned i; - unsigned gpio = gc->base; - for (i = 0; i < gc->ngpio; i++, gpio++) { - stmpe_dbg_show_one(s, gc, i, gpio); + for (i = 0; i < gc->ngpio; i++) { + stmpe_dbg_show_one(s, gc, i); seq_putc(s, '\n'); } } From ddeb66d2cb10f03a43d97a0ff2c3869d1951c87d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:54:36 +0200 Subject: [PATCH 0742/3206] gpio: nomadik: don't print out global GPIO numbers in debugfs callbacks In order to further limit the number of references to the GPIO base number stored in struct gpio_chip, replace the global GPIO numbers in the output of debugfs callbacks by hardware offsets. 
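Nomadik is the one driver in this batch that still needs a global number, for the nmk_prcm_gpiocr_get_mode() lookup; instead of threading it through the call chain, the patch derives it on demand from the descriptor. The same pattern, pulled out into a helper for illustration (the helper name is made up):

	static int foo_global_gpio(struct gpio_chip *chip, unsigned int offset)
	{
		/*
		 * Map the hardware offset back to the global GPIO number
		 * only at the single spot where a legacy API requires it.
		 */
		struct gpio_desc *desc = gpio_device_get_desc(chip->gpiodev, offset);

		return desc_to_gpio(desc);
	}
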
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-dbg-show-base-v1-2-7f27cd7f2256@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-nomadik.c | 25 +++++++++++------------ drivers/pinctrl/nomadik/pinctrl-nomadik.c | 2 +- include/linux/gpio/gpio-nomadik.h | 3 +-- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/gpio/gpio-nomadik.c b/drivers/gpio/gpio-nomadik.c index bcf4b07dd4584f..fde4b416faa818 100644 --- a/drivers/gpio/gpio-nomadik.c +++ b/drivers/gpio/gpio-nomadik.c @@ -20,6 +20,7 @@ */ #include #include +#include #include #include #include @@ -396,10 +397,10 @@ static int nmk_gpio_get_mode(struct nmk_gpio_chip *nmk_chip, int offset) } void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, - struct gpio_chip *chip, unsigned int offset, - unsigned int gpio) + struct gpio_chip *chip, unsigned int offset) { struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(chip); + struct gpio_desc *desc; int mode; bool is_out; bool data_out; @@ -425,15 +426,15 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, data_out = !!(readl(nmk_chip->addr + NMK_GPIO_DAT) & BIT(offset)); mode = nmk_gpio_get_mode(nmk_chip, offset); #ifdef CONFIG_PINCTRL_NOMADIK - if (mode == NMK_GPIO_ALT_C && pctldev) - mode = nmk_prcm_gpiocr_get_mode(pctldev, gpio); + if (mode == NMK_GPIO_ALT_C && pctldev) { + desc = gpio_device_get_desc(chip->gpiodev, offset); + mode = nmk_prcm_gpiocr_get_mode(pctldev, desc_to_gpio(desc)); + } #endif if (is_out) { seq_printf(s, " gpio-%-3d (%-20.20s) out %s %s", - gpio, - label ?: "(none)", - str_hi_lo(data_out), + offset, label ?: "(none)", str_hi_lo(data_out), (mode < 0) ? "unknown" : modes[mode]); } else { int irq = chip->to_irq(chip, offset); @@ -445,9 +446,7 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, }; seq_printf(s, " gpio-%-3d (%-20.20s) in %s %s", - gpio, - label ?: "(none)", - pulls[pullidx], + offset, label ?: "(none)", pulls[pullidx], (mode < 0) ? "unknown" : modes[mode]); val = nmk_gpio_get_input(chip, offset); @@ -479,10 +478,10 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, static void nmk_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) { - unsigned int i, gpio = chip->base; + unsigned int i; - for (i = 0; i < chip->ngpio; i++, gpio++) { - nmk_gpio_dbg_show_one(s, NULL, chip, i, gpio); + for (i = 0; i < chip->ngpio; i++) { + nmk_gpio_dbg_show_one(s, NULL, chip, i); seq_puts(s, "\n"); } } diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.c b/drivers/pinctrl/nomadik/pinctrl-nomadik.c index 8940e04fcf4cc4..db0311b1413227 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -584,7 +584,7 @@ static void nmk_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, seq_printf(s, "invalid pin offset"); return; } - nmk_gpio_dbg_show_one(s, pctldev, chip, offset - chip->base, offset); + nmk_gpio_dbg_show_one(s, pctldev, chip, offset - chip->base); } static int nmk_dt_add_map_mux(struct pinctrl_map **map, unsigned int *reserved_maps, diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index b5a84864650d0c..7ba53b499e164e 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -261,8 +261,7 @@ struct platform_device; * true. 
*/ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, - struct gpio_chip *chip, unsigned int offset, - unsigned int gpio); + struct gpio_chip *chip, unsigned int offset); #else From 3767426b234f0d7b82ccfa05e53c47c83e0a12c2 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:54:37 +0200 Subject: [PATCH 0743/3206] gpio: wm831x: don't print out global GPIO numbers in debugfs callbacks In order to further limit the number of references to the GPIO base number stored in struct gpio_chip, replace the global GPIO numbers in the output of debugfs callbacks by hardware offsets. Reviewed-by: Charles Keepax Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-dbg-show-base-v1-3-7f27cd7f2256@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-wm831x.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-wm831x.c b/drivers/gpio/gpio-wm831x.c index f03c0e808fab27..489479d6f32b3a 100644 --- a/drivers/gpio/gpio-wm831x.c +++ b/drivers/gpio/gpio-wm831x.c @@ -159,7 +159,6 @@ static void wm831x_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) int i, tristated; for (i = 0; i < chip->ngpio; i++) { - int gpio = i + chip->base; int reg; const char *pull, *powerdomain; @@ -175,13 +174,13 @@ static void wm831x_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) } seq_printf(s, " gpio-%-3d (%-20.20s) ", - gpio, label ?: "Unrequested"); + i, label ?: "Unrequested"); reg = wm831x_reg_read(wm831x, WM831X_GPIO1_CONTROL + i); if (reg < 0) { dev_err(wm831x->dev, "GPIO control %d read failed: %d\n", - gpio, reg); + i, reg); seq_putc(s, '\n'); continue; } From aaa1279b8b5b46cc42b6175c2bcd83d8ac5fd2b3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:54:38 +0200 Subject: [PATCH 0744/3206] gpio: wm8994: don't print out global GPIO numbers in debugfs callbacks In order to further limit the number of references to the GPIO base number stored in struct gpio_chip, replace the global GPIO numbers in the output of debugfs callbacks by hardware offsets. 
Reviewed-by: Charles Keepax Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-dbg-show-base-v1-4-7f27cd7f2256@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-wm8994.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-wm8994.c b/drivers/gpio/gpio-wm8994.c index df47a27f508d94..a0665cf3ff2f45 100644 --- a/drivers/gpio/gpio-wm8994.c +++ b/drivers/gpio/gpio-wm8994.c @@ -194,7 +194,6 @@ static void wm8994_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) int i; for (i = 0; i < chip->ngpio; i++) { - int gpio = i + chip->base; int reg; /* We report the GPIO even if it's not requested since @@ -208,14 +207,13 @@ static void wm8994_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) continue; } - seq_printf(s, " gpio-%-3d (%-20.20s) ", gpio, + seq_printf(s, " gpio-%-3d (%-20.20s) ", i, label ?: "Unrequested"); reg = wm8994_reg_read(wm8994, WM8994_GPIO_1 + i); if (reg < 0) { dev_err(wm8994->dev, - "GPIO control %d read failed: %d\n", - gpio, reg); + "GPIO control %d read failed: %d\n", i, reg); seq_printf(s, "\n"); continue; } From 2d71156cfea8391625ea146eff32b3d2ef059345 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:54:39 +0200 Subject: [PATCH 0745/3206] gpio: mvebu: don't print out global GPIO numbers in debugfs callbacks In order to further limit the number of references to the GPIO base number stored in struct gpio_chip, replace the global GPIO numbers in the output of debugfs callbacks by hardware offsets. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-dbg-show-base-v1-5-7f27cd7f2256@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mvebu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c index 261ffd0c614b71..ac799fced950e3 100644 --- a/drivers/gpio/gpio-mvebu.c +++ b/drivers/gpio/gpio-mvebu.c @@ -898,7 +898,7 @@ static void mvebu_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) msk = BIT(i); is_out = !(io_conf & msk); - seq_printf(s, " gpio-%-3d (%-20.20s)", chip->base + i, label); + seq_printf(s, " gpio-%-3d (%-20.20s)", i, label); if (is_out) { seq_printf(s, " out %s %s\n", From 3be2d43534aab7291b59c4e66526f911854aa3a7 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:54:40 +0200 Subject: [PATCH 0746/3206] gpio: xra1403: don't print out global GPIO numbers in debugfs callbacks In order to further limit the number of references to the GPIO base number stored in struct gpio_chip, replace the global GPIO numbers in the output of debugfs callbacks by hardware offsets. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-dbg-show-base-v1-6-7f27cd7f2256@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-xra1403.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-xra1403.c b/drivers/gpio/gpio-xra1403.c index faadcb4b0b2df0..7f3c98f9f90201 100644 --- a/drivers/gpio/gpio-xra1403.c +++ b/drivers/gpio/gpio-xra1403.c @@ -135,8 +135,7 @@ static void xra1403_dbg_show(struct seq_file *s, struct gpio_chip *chip) gcr = value[XRA_GCR + 1] << 8 | value[XRA_GCR]; gsr = value[XRA_GSR + 1] << 8 | value[XRA_GSR]; for_each_requested_gpio(chip, i, label) { - seq_printf(s, " gpio-%-3d (%-12s) %s %s\n", - chip->base + i, label, + seq_printf(s, " gpio-%-3d (%-12s) %s %s\n", i, label, (gcr & BIT(i)) ? 
"in" : "out", str_hi_lo(gsr & BIT(i))); } From 8fd5485fb4f3d9da3977fd783fcb8e5452463420 Mon Sep 17 00:00:00 2001 From: Harshit Agarwal Date: Tue, 8 Apr 2025 04:50:21 +0000 Subject: [PATCH 0747/3206] sched/deadline: Fix race in push_dl_task() When a CPU chooses to call push_dl_task and picks a task to push to another CPU's runqueue then it will call find_lock_later_rq method which would take a double lock on both CPUs' runqueues. If one of the locks aren't readily available, it may lead to dropping the current runqueue lock and reacquiring both the locks at once. During this window it is possible that the task is already migrated and is running on some other CPU. These cases are already handled. However, if the task is migrated and has already been executed and another CPU is now trying to wake it up (ttwu) such that it is queued again on the runqeue (on_rq is 1) and also if the task was run by the same CPU, then the current checks will pass even though the task was migrated out and is no longer in the pushable tasks list. Please go through the original rt change for more details on the issue. To fix this, after the lock is obtained inside the find_lock_later_rq, it ensures that the task is still at the head of pushable tasks list. Also removed some checks that are no longer needed with the addition of this new check. However, the new check of pushable tasks list only applies when find_lock_later_rq is called by push_dl_task. For the other caller i.e. dl_task_offline_migration, existing checks are used. Signed-off-by: Harshit Agarwal Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250408045021.3283624-1-harshit@nutanix.com --- kernel/sched/deadline.c | 73 +++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f25301267e4714..5b64bc621993d1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2580,6 +2580,25 @@ static int find_later_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); + + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); + + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); + + return p; +} + /* Locks the rq it finds */ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) { @@ -2607,12 +2626,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { - if (unlikely(task_rq(task) != rq || + /* + * double_lock_balance had to release rq->lock, in the + * meantime, task may no longer be fit to be migrated. + * Check the following to ensure that the task is + * still suitable for migration: + * 1. It is possible the task was scheduled, + * migrate_disabled was set and then got preempted, + * so we must check the task migration disable + * flag. + * 2. The CPU picked is in the task's affinity. + * 3. For throttled task (dl_task_offline_migration), + * check the following: + * - the task is not on the rq anymore (it was + * migrated) + * - the task is not on CPU anymore + * - the task is still a dl task + * - the task is not queued on the rq anymore + * 4. 
For the non-throttled task (push_dl_task), the + * check to ensure that this task is still at the + * head of the pushable tasks list is enough. + */ + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !dl_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + (task->dl.dl_throttled && + (task_rq(task) != rq || + task_on_cpu(rq, task) || + !dl_task(task) || + !task_on_rq_queued(task))) || + (!task->dl.dl_throttled && + task != pick_next_pushable_dl_task(rq)))) { + double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -2635,25 +2679,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) return later_rq; } -static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_dl_tasks(rq)) - return NULL; - - p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - - WARN_ON_ONCE(rq->cpu != task_cpu(p)); - WARN_ON_ONCE(task_current(rq, p)); - WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - - WARN_ON_ONCE(!task_on_rq_queued(p)); - WARN_ON_ONCE(!dl_task(p)); - - return p; -} - /* * See if the non running -deadline tasks on this rq * can be sent to some other CPU where they can preempt From 661f951e371cc134ea31c84238dbdc9a898b8403 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Aug 2025 12:02:44 +0000 Subject: [PATCH 0748/3206] sched/fair: Get rid of sched_domains_curr_level hack for tl->cpumask() Leon [1] and Vinicius [2] noted a topology_span_sane() warning during their testing starting from v6.16-rc1. The debugging that followed pointed to the tl->mask() for the NODE domain being incorrectly resolved to that of the highest NUMA domain. tl->mask() for NODE is set to sd_numa_mask(), which depends on the global "sched_domains_curr_level" hack. "sched_domains_curr_level" is set to "tl->numa_level" during the tl traversal in build_sched_domains() calling sd_init(), but was not reset before topology_span_sane(). Since "tl->numa_level" still reflected the old value from build_sched_domains(), topology_span_sane() for the NODE domain trips when the span of the last NUMA domain overlaps. Instead of replicating the "sched_domains_curr_level" hack, get rid of it entirely and pass the entire "sched_domain_topology_level" object to the tl->cpumask() function to prevent such a mishap in the future. sd_numa_mask() now directly references "tl->numa_level" instead of relying on the global "sched_domains_curr_level" hack to index into sched_domains_numa_masks[].
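The interface change at the core of the fix, as it appears in the hunks below: every tl->mask() callback now receives its topology level explicitly, so the NUMA variant can index the masks array without any shared state:

	typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl,
							     int cpu);

	static const struct cpumask *
	sd_numa_mask(struct sched_domain_topology_level *tl, int cpu)
	{
		/* Index by the level carried in 'tl', not by a global. */
		return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)];
	}
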
The original warning was reproducible on the following NUMA topology reported by Leon:

	$ sudo numactl -H
	available: 5 nodes (0-4)
	node 0 cpus: 0 1
	node 0 size: 2927 MB
	node 0 free: 1603 MB
	node 1 cpus: 2 3
	node 1 size: 3023 MB
	node 1 free: 3008 MB
	node 2 cpus: 4 5
	node 2 size: 3023 MB
	node 2 free: 3007 MB
	node 3 cpus: 6 7
	node 3 size: 3023 MB
	node 3 free: 3002 MB
	node 4 cpus: 8 9
	node 4 size: 3022 MB
	node 4 free: 2718 MB
	node distances:
	node   0   1   2   3   4
	  0:  10  39  38  37  36
	  1:  39  10  38  37  36
	  2:  38  38  10  37  36
	  3:  37  37  37  10  36
	  4:  36  36  36  36  10

The above topology can be mimicked using the following QEMU command that was used to reproduce the warning and test the fix:

	sudo qemu-system-x86_64 -enable-kvm -cpu host \
	-m 20G -smp cpus=10,sockets=10 -machine q35 \
	-object memory-backend-ram,size=4G,id=m0 \
	-object memory-backend-ram,size=4G,id=m1 \
	-object memory-backend-ram,size=4G,id=m2 \
	-object memory-backend-ram,size=4G,id=m3 \
	-object memory-backend-ram,size=4G,id=m4 \
	-numa node,cpus=0-1,memdev=m0,nodeid=0 \
	-numa node,cpus=2-3,memdev=m1,nodeid=1 \
	-numa node,cpus=4-5,memdev=m2,nodeid=2 \
	-numa node,cpus=6-7,memdev=m3,nodeid=3 \
	-numa node,cpus=8-9,memdev=m4,nodeid=4 \
	-numa dist,src=0,dst=1,val=39 \
	-numa dist,src=0,dst=2,val=38 \
	-numa dist,src=0,dst=3,val=37 \
	-numa dist,src=0,dst=4,val=36 \
	-numa dist,src=1,dst=0,val=39 \
	-numa dist,src=1,dst=2,val=38 \
	-numa dist,src=1,dst=3,val=37 \
	-numa dist,src=1,dst=4,val=36 \
	-numa dist,src=2,dst=0,val=38 \
	-numa dist,src=2,dst=1,val=38 \
	-numa dist,src=2,dst=3,val=37 \
	-numa dist,src=2,dst=4,val=36 \
	-numa dist,src=3,dst=0,val=37 \
	-numa dist,src=3,dst=1,val=37 \
	-numa dist,src=3,dst=2,val=37 \
	-numa dist,src=3,dst=4,val=36 \
	-numa dist,src=4,dst=0,val=36 \
	-numa dist,src=4,dst=1,val=36 \
	-numa dist,src=4,dst=2,val=36 \
	-numa dist,src=4,dst=3,val=36 \
	...

[ prateek: Moved common functions to include/linux/sched/topology.h, reuse the common bits for s390 and ppc, commit message ] Closes: https://lore.kernel.org/lkml/20250610110701.GA256154@unreal/ [1] Fixes: ccf74128d66c ("sched/topology: Assert non-NUMA topology masks don't (partially) overlap") # ce29a7da84cd, f55dac1dafb3 Signed-off-by: Peter Zijlstra (Intel) Reported-by: Leon Romanovsky Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Shrikanth Hegde Tested-by: Valentin Schneider # x86 Tested-by: Shrikanth Hegde # powerpc Link: https://lore.kernel.org/lkml/a3de98387abad28592e6ab591f3ff6107fe01dc1.1755893468.git.tim.c.chen@linux.intel.com/ [2] --- arch/powerpc/Kconfig | 4 ++++ arch/powerpc/include/asm/topology.h | 2 ++ arch/powerpc/kernel/smp.c | 27 +++++++++++---------------- arch/s390/kernel/topology.c | 20 +++++++------------- arch/x86/kernel/smpboot.c | 8 ++++---- include/linux/sched/topology.h | 28 +++++++++++++++++++++++++++- include/linux/topology.h | 2 +- kernel/sched/topology.c | 28 ++++++++++------------------ 8 files changed, 66 insertions(+), 53 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 93402a1d9c9fc6..e51a595a06228a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -971,6 +971,10 @@ config SCHED_SMT when dealing with POWER5 cpus at a cost of slightly increased overhead in some places. If unsure say N here.
+config SCHED_MC + def_bool y + depends on SMP + config PPC_DENORMALISATION bool "PowerPC denormalisation exception handling" depends on PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index da15b5efe8071a..f19ca44512d1e8 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -131,6 +131,8 @@ static inline int cpu_to_coregroup_id(int cpu) #ifdef CONFIG_SMP #include +struct cpumask *cpu_coregroup_mask(int cpu); + #ifdef CONFIG_PPC64 #include diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f59e4b9cc20743..68edb66c2964ba 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1028,19 +1028,19 @@ static int powerpc_shared_proc_flags(void) * We can't just pass cpu_l2_cache_mask() directly because * returns a non-const pointer and the compiler barfs on that. */ -static const struct cpumask *shared_cache_mask(int cpu) +static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu) { return per_cpu(cpu_l2_cache_map, cpu); } #ifdef CONFIG_SCHED_SMT -static const struct cpumask *smallcore_smt_mask(int cpu) +static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu) { return cpu_smallcore_mask(cpu); } #endif -static struct cpumask *cpu_coregroup_mask(int cpu) +struct cpumask *cpu_coregroup_mask(int cpu) { return per_cpu(cpu_coregroup_map, cpu); } @@ -1054,11 +1054,6 @@ static bool has_coregroup_support(void) return coregroup_enabled; } -static const struct cpumask *cpu_mc_mask(int cpu) -{ - return cpu_coregroup_mask(cpu); -} - static int __init init_big_cores(void) { int cpu; @@ -1448,7 +1443,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask) return false; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update l2-cache mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask); @@ -1538,7 +1533,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask) return; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update coregroup mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask); @@ -1601,7 +1596,7 @@ static void add_cpu_to_masks(int cpu) /* If chip_id is -1; limit the cpu_core_mask to within PKG */ if (chip_id == -1) - cpumask_and(mask, mask, cpu_cpu_mask(cpu)); + cpumask_and(mask, mask, cpu_node_mask(cpu)); for_each_cpu(i, mask) { if (chip_id == cpu_to_chip_id(i)) { @@ -1701,22 +1696,22 @@ static void __init build_sched_topology(void) if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); powerpc_topology[i++] = - SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); + SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT); } else { - powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); + powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT); } #endif if (shared_caches) { powerpc_topology[i++] = - SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); + SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE); } if (has_coregroup_support()) { powerpc_topology[i++] = - SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); + SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC); } - powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, 
powerpc_shared_proc_flags, PKG); + powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG); /* There must be one trailing NULL entry left. */ BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 46569b8e47dde3..1594c80e9bc4db 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -509,33 +509,27 @@ int topology_cpu_init(struct cpu *cpu) return rc; } -static const struct cpumask *cpu_thread_mask(int cpu) -{ - return &cpu_topology[cpu].thread_mask; -} - - const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_mask; } -static const struct cpumask *cpu_book_mask(int cpu) +static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].book_mask; } -static const struct cpumask *cpu_drawer_mask(int cpu) +static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].drawer_mask; } static struct sched_domain_topology_level s390_topology[] = { - SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), - SDTL_INIT(cpu_book_mask, NULL, BOOK), - SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), + SDTL_INIT(tl_book_mask, NULL, BOOK), + SDTL_INIT(tl_drawer_mask, NULL, DRAWER), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 33e166f6ab1224..eb289abece2370 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -479,14 +479,14 @@ static int x86_cluster_flags(void) static bool x86_has_numa_in_package; static struct sched_domain_topology_level x86_topology[] = { - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), + SDTL_INIT(tl_mc_mask, x86_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), + SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG), { NULL }, }; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 5263746b63e8c3..a3a24e115d4468 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -30,11 +30,19 @@ struct sd_flag_debug { }; extern const struct sd_flag_debug sd_flag_debug[]; +struct sched_domain_topology_level; + #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; } + +static inline const +struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_smt_mask(cpu); +} #endif #ifdef CONFIG_SCHED_CLUSTER @@ -42,6 +50,12 @@ static inline int cpu_cluster_flags(void) { return SD_CLUSTER | SD_SHARE_LLC; } + +static inline const +struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_clustergroup_mask(cpu); +} #endif #ifdef CONFIG_SCHED_MC @@ -49,8 +63,20 @@ static inline int cpu_core_flags(void) { return SD_SHARE_LLC; } + +static inline const +struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_coregroup_mask(cpu); +} #endif +static inline const +struct cpumask *tl_pkg_mask(struct 
sched_domain_topology_level *tl, int cpu) +{ + return cpu_node_mask(cpu); +} + #ifdef CONFIG_NUMA static inline int cpu_numa_flags(void) { @@ -172,7 +198,7 @@ bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); bool cpus_share_resources(int this_cpu, int that_cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu); typedef int (*sched_domain_flags_f)(void); struct sd_data { diff --git a/include/linux/topology.h b/include/linux/topology.h index 33b7fda97d3902..6575af39fd10f7 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -260,7 +260,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu) #endif -static inline const struct cpumask *cpu_cpu_mask(int cpu) +static inline const struct cpumask *cpu_node_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 977e133bb8a443..18889bd97e22ba 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1591,7 +1591,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd) enum numa_topology_type sched_numa_topology_type; static int sched_domains_numa_levels; -static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; @@ -1632,14 +1631,7 @@ sd_init(struct sched_domain_topology_level *tl, int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... - */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); + sd_weight = cpumask_weight(tl->mask(tl, cpu)); if (tl->sd_flags) sd_flags = (*tl->sd_flags)(); @@ -1677,7 +1669,7 @@ sd_init(struct sched_domain_topology_level *tl, }; sd_span = sched_domain_span(sd); - cpumask_and(sd_span, cpu_map, tl->mask(cpu)); + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); sd_id = cpumask_first(sd_span); sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); @@ -1737,17 +1729,17 @@ sd_init(struct sched_domain_topology_level *tl, */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; @@ -1769,9 +1761,9 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl) #ifdef CONFIG_NUMA -static const struct cpumask *sd_numa_mask(int cpu) +static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) { - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; + return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)]; } static void sched_numa_warn(const char *str) @@ -2411,7 +2403,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) * breaks the linking done for an earlier span. 
*/ for_each_cpu(cpu, cpu_map) { - const struct cpumask *tl_cpu_mask = tl->mask(cpu); + const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu); int id; /* lowest bit set in this mask is used as a unique id */ @@ -2419,7 +2411,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) if (cpumask_test_cpu(id, id_seen)) { /* First CPU has already been seen, ensure identical spans */ - if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) return false; } else { /* First CPU hasn't been seen before, ensure it's a completely new span */ From 91c614f09abf1d45aac6b475d82a36c704b527ee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Aug 2025 10:55:55 +0200 Subject: [PATCH 0749/3206] sched: Move STDL_INIT() functions out-of-line Since all these functions are address-taken in SDTL_INIT() and called indirectly, it doesn't really make sense for them to be inline. Suggested-by: Christophe Leroy Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched/topology.h | 49 +++++----------------------------- kernel/sched/topology.c | 45 +++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 42 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a3a24e115d4468..bbcfdf12aa6e57 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -33,56 +33,21 @@ extern const struct sd_flag_debug sd_flag_debug[]; struct sched_domain_topology_level; #ifdef CONFIG_SCHED_SMT -static inline int cpu_smt_flags(void) -{ - return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; -} - -static inline const -struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_smt_mask(cpu); -} +extern int cpu_smt_flags(void); +extern const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_CLUSTER -static inline int cpu_cluster_flags(void) -{ - return SD_CLUSTER | SD_SHARE_LLC; -} - -static inline const -struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_clustergroup_mask(cpu); -} +extern int cpu_cluster_flags(void); +extern const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_MC -static inline int cpu_core_flags(void) -{ - return SD_SHARE_LLC; -} - -static inline const -struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_coregroup_mask(cpu); -} +extern int cpu_core_flags(void); +extern const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu); #endif -static inline const -struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_node_mask(cpu); -} - -#ifdef CONFIG_NUMA -static inline int cpu_numa_flags(void) -{ - return SD_NUMA; -} -#endif +extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu); extern int arch_asym_cpu_priority(int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 18889bd97e22ba..1104d931c0153f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1724,6 +1724,47 @@ sd_init(struct sched_domain_topology_level *tl, return sd; } +#ifdef CONFIG_SCHED_SMT +int cpu_smt_flags(void) +{ + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; +} + +const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_smt_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_CLUSTER +int cpu_cluster_flags(void) +{ + return SD_CLUSTER | 
SD_SHARE_LLC; +} + +const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_clustergroup_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_MC +int cpu_core_flags(void) +{ + return SD_SHARE_LLC; +} + +const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_coregroup_mask(cpu); +} +#endif + +const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_node_mask(cpu); +} + /* * Topology list, bottom-up. */ @@ -1760,6 +1801,10 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl) } #ifdef CONFIG_NUMA +static int cpu_numa_flags(void) +{ + return SD_NUMA; +} static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) { From 7bd291abe2da09f59dca81f35a4ec220e5e138a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Aug 2025 11:08:57 +0200 Subject: [PATCH 0750/3206] sched: Unify the SCHED_{SMT,CLUSTER,MC} Kconfig Like many Kconfig symbols, SCHED_{SMT,CLUSTER,MC} are duplicated across arch/*/Kconfig. Try and clean up a little. Suggested-by: Christophe Leroy Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Shrikanth Hegde Tested-by: Shrikanth Hegde # powerpc Link: https://lkml.kernel.org/r/20250826094358.GG3245006@noisy.programming.kicks-ass.net --- arch/Kconfig | 38 ++++++++++++++++++++++++++++++++++++++ arch/arm/Kconfig | 18 ++---------------- arch/arm64/Kconfig | 26 +++----------------------- arch/loongarch/Kconfig | 19 ++----------------- arch/mips/Kconfig | 16 ++-------------- arch/parisc/Kconfig | 9 +-------- arch/powerpc/Kconfig | 15 +++------------ arch/riscv/Kconfig | 9 +-------- arch/s390/Kconfig | 8 ++------ arch/sparc/Kconfig | 20 ++------------------ arch/x86/Kconfig | 27 ++++----------------------- 11 files changed, 60 insertions(+), 145 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index d1b4ffd6e08564..4466af478950f1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -41,6 +41,44 @@ config HOTPLUG_SMT config SMT_NUM_THREADS_DYNAMIC bool +config ARCH_SUPPORTS_SCHED_SMT + bool + +config ARCH_SUPPORTS_SCHED_CLUSTER + bool + +config ARCH_SUPPORTS_SCHED_MC + bool + +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on ARCH_SUPPORTS_SCHED_SMT + default y + help + Improves the CPU scheduler's decision making when dealing with + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. + +config SCHED_CLUSTER + bool "Cluster scheduler support" + depends on ARCH_SUPPORTS_SCHED_CLUSTER + default y + help + Cluster scheduler support improves the CPU scheduler's decision + making when dealing with machines that have clusters of CPUs. + Cluster usually means a couple of CPUs which are placed closely + by sharing mid-level caches, last-level cache tags or internal + busses. + +config SCHED_MC + bool "Multi-Core Cache (MC) scheduler support" + depends on ARCH_SUPPORTS_SCHED_MC + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. 
+ # Selected by HOTPLUG_CORE_SYNC_DEAD or HOTPLUG_CORE_SYNC_FULL config HOTPLUG_CORE_SYNC bool diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b1f3df39ed4068..d13422f97bc304 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -941,28 +941,14 @@ config IRQSTACKS config ARM_CPU_TOPOLOGY bool "Support cpu topology definition" depends on SMP && CPU_V7 + select ARCH_SUPPORTS_SCHED_MC + select ARCH_SUPPORTS_SCHED_SMT default y help Support ARM cpu topology definition. The MPIDR register defines affinity between processors which is then used to describe the cpu topology of an ARM System. -config SCHED_MC - bool "Multi-core scheduler support" - depends on ARM_CPU_TOPOLOGY - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - -config SCHED_SMT - bool "SMT scheduler support" - depends on ARM_CPU_TOPOLOGY - help - Improves the CPU scheduler's decision making when dealing with - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. - config HAVE_ARM_SCU bool help diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64d..30733e0b8ac5fa 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -108,6 +108,9 @@ config ARM64 select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_RT + select ARCH_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_CLUSTER + select ARCH_SUPPORTS_SCHED_MC select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT @@ -1505,29 +1508,6 @@ config CPU_LITTLE_ENDIAN endchoice -config SCHED_MC - bool "Multi-core scheduler support" - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - -config SCHED_CLUSTER - bool "Cluster scheduler support" - help - Cluster scheduler support improves the CPU scheduler's decision - making when dealing with machines that have clusters of CPUs. - Cluster usually means a couple of CPUs which are placed closely - by sharing mid-level caches, last-level cache tags or internal - busses. - -config SCHED_SMT - bool "SMT scheduler support" - help - Improves the CPU scheduler's decision making when dealing with - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. - config NR_CPUS int "Maximum number of CPUs (2-4096)" range 2 4096 diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f0abc38c40ac9e..d1c8cb3358ef01 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -70,6 +70,8 @@ config LOONGARCH select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_RT + select ARCH_SUPPORTS_SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @@ -448,23 +450,6 @@ config EFI_STUB This kernel feature allows the kernel to be loaded directly by EFI firmware without the use of a bootloader. -config SCHED_SMT - bool "SMT scheduler support" - depends on SMP - default y - help - Improves scheduler's performance when there are multiple - threads in one physical core. 
- -config SCHED_MC - bool "Multi-core scheduler support" - depends on SMP - default y - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. - config SMP bool "Multi-Processing support" help diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index caf508f6e9ec8e..447b2fcaa316ae 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2223,7 +2223,7 @@ config MIPS_MT_SMP select SMP select SMP_UP select SYS_SUPPORTS_SMP - select SYS_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_SMT select MIPS_PERF_SHARED_TC_COUNTERS help This is a kernel model which is known as SMVP. This is supported @@ -2235,18 +2235,6 @@ config MIPS_MT_SMP config MIPS_MT bool -config SCHED_SMT - bool "SMT (multithreading) scheduler support" - depends on SYS_SUPPORTS_SCHED_SMT - default n - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with MIPS MT enabled cores at a cost of slightly - increased overhead in some places. If unsure say N here. - -config SYS_SUPPORTS_SCHED_SMT - bool - config SYS_SUPPORTS_MULTITHREADING bool @@ -2318,7 +2306,7 @@ config MIPS_CPS select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU select SYNC_R4K if (CEVT_R4K || CSRC_R4K) select SYS_SUPPORTS_HOTPLUG_CPU - select SYS_SUPPORTS_SCHED_SMT if CPU_MIPSR6 + select ARCH_SUPPORTS_SCHED_SMT if CPU_MIPSR6 select SYS_SUPPORTS_SMP select WEAK_ORDERING select GENERIC_IRQ_MIGRATION if HOTPLUG_CPU diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 2efa4b08b7b841..0940c162f1f7b4 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -44,6 +44,7 @@ config PARISC select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD select GENERIC_ARCH_TOPOLOGY if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP && PA8X00 select GENERIC_CPU_DEVICES if !SMP select GENERIC_LIB_DEVMEM_IS_ALLOWED select SYSCTL_ARCH_UNALIGN_ALLOW @@ -319,14 +320,6 @@ config SMP If you don't know what to do here, say N. -config SCHED_MC - bool "Multi-core scheduler support" - depends on GENERIC_ARCH_TOPOLOGY && PA8X00 - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - config IRQSTACKS bool "Use separate kernel stacks when processing interrupts" default y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e51a595a06228a..f87d169f67475a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -170,6 +170,9 @@ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx + select ARCH_SUPPORTS_SCHED_MC if SMP + select ARCH_SUPPORTS_SCHED_SMT if PPC64 && SMP + select SCHED_MC if ARCH_SUPPORTS_SCHED_MC select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST @@ -963,18 +966,6 @@ config PPC_PROT_SAO_LPAR config PPC_COPRO_BASE bool -config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on PPC64 && SMP - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with POWER5 cpus at a cost of slightly increased - overhead in some places. If unsure say N here. 
- -config SCHED_MC - def_bool y - depends on SMP - config PPC_DENORMALISATION bool "PowerPC denormalisation exception handling" depends on PPC_BOOK3S_64 diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659ed8..2a08e06207a93c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -72,6 +72,7 @@ config RISCV select ARCH_SUPPORTS_PER_VMA_LOCK if MMU select ARCH_SUPPORTS_RT select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK + select ARCH_SUPPORTS_SCHED_MC if SMP select ARCH_USE_CMPXCHG_LOCKREF if 64BIT select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS @@ -453,14 +454,6 @@ config SMP If you don't know what to do here, say N. -config SCHED_MC - bool "Multi-core scheduler support" - depends on SMP - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - config NR_CPUS int "Maximum number of CPUs (2-512)" depends on SMP diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf680c26a33cf7..87ddd4968c4b8e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -547,15 +547,11 @@ config NODES_SHIFT depends on NUMA default "1" -config SCHED_SMT - def_bool n - -config SCHED_MC - def_bool n - config SCHED_TOPOLOGY def_bool y prompt "Topology scheduler support" + select ARCH_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_MC select SCHED_SMT select SCHED_MC help diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 7b595092cbfb65..a630d373e6453c 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -110,6 +110,8 @@ config SPARC64 select HAVE_SETUP_PER_CPU_AREA select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK + select ARCH_SUPPORTS_SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP config ARCH_PROC_KCORE_TEXT def_bool y @@ -288,24 +290,6 @@ if SPARC64 || COMPILE_TEST source "kernel/power/Kconfig" endif -config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on SPARC64 && SMP - default y - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with SPARC cpus at a cost of slightly increased overhead - in some places. If unsure say N here. - -config SCHED_MC - bool "Multi-core scheduler support" - depends on SPARC64 && SMP - default y - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - config CMDLINE_BOOL bool "Default bootloader kernel arguments" depends on SPARC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100eb..c8df03237bdf60 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -330,6 +330,10 @@ config X86 imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE select ARCH_SUPPORTS_PT_RECLAIM if X86_64 + select ARCH_SUPPORTS_SCHED_SMT if SMP + select SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_CLUSTER if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP config INSTRUCTION_DECODER def_bool y @@ -1031,29 +1035,6 @@ config NR_CPUS This is purely to save memory: each supported CPU adds about 8KB to the kernel image. -config SCHED_CLUSTER - bool "Cluster scheduler support" - depends on SMP - default y - help - Cluster scheduler support improves the CPU scheduler's decision - making when dealing with machines that have clusters of CPUs. 
- Cluster usually means a couple of CPUs which are placed closely - by sharing mid-level caches, last-level cache tags or internal - busses. - -config SCHED_SMT - def_bool y if SMP - -config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" - depends on SMP - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - config SCHED_MC_PRIO bool "CPU core priorities scheduler support" depends on SCHED_MC From 2215a87b02ad8d353cd3edebd1bed01db2458986 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 29 Aug 2025 16:11:16 +0800 Subject: [PATCH 0751/3206] sched/fair: Add related data structure for task based throttle Add related data structures for this new throttle functionality. Tested-by: K Prateek Nayak Signed-off-by: Valentin Schneider Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Valentin Schneider Tested-by: Matteo Martelli Link: https://lore.kernel.org/r/20250829081120.806-2-ziqianlu@bytedance.com --- include/linux/sched.h | 5 +++++ kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 13 +++++++++++++ kernel/sched/sched.h | 3 +++ 4 files changed, 24 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index f8188b8333503c..644a01bdae700d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -883,6 +883,11 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; +#endif #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4cc..feb750aae71b64 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4490,6 +4490,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; +#ifdef CONFIG_CFS_BANDWIDTH + init_cfs_throttle_work(p); +#endif #endif #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c2e..8fff40fcbc425b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5748,6 +5748,18 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } +static void throttle_cfs_rq_work(struct callback_head *work) +{ +} + +void init_cfs_throttle_work(struct task_struct *p) +{ + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ + p->sched_throttle_work.next = &p->sched_throttle_work; + INIT_LIST_HEAD(&p->throttle_node); +} + static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; @@ -6472,6 +6484,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f751..a6493d255397f8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -739,6 +739,7 @@ struct cfs_rq { int throttle_count; struct list_head throttled_list; struct list_head throttled_csd_list; + struct list_head throttled_limbo_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -2658,6 +2659,8 @@ 
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_entity(struct sched_dl_entity *dl_se); +extern void init_cfs_throttle_work(struct task_struct *p); + #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 From 7fc2d14392475e368a2a7be458aba4eecdf2439b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 29 Aug 2025 16:11:17 +0800 Subject: [PATCH 0752/3206] sched/fair: Implement throttle task work and related helpers Implement throttle_cfs_rq_work() task work which gets executed on task's ret2user path where the task is dequeued and marked as throttled. Signed-off-by: Valentin Schneider Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Valentin Schneider Tested-by: Matteo Martelli Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250829081120.806-3-ziqianlu@bytedance.com --- kernel/sched/fair.c | 65 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8fff40fcbc425b..dab4ed86d0c826 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5748,8 +5748,51 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } +static inline bool task_is_throttled(struct task_struct *p) +{ + return cfs_bandwidth_used() && p->throttled; +} + +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags); static void throttle_cfs_rq_work(struct callback_head *work) { + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work); + struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; + + WARN_ON_ONCE(p != current); + p->sched_throttle_work.next = &p->sched_throttle_work; + + /* + * If task is exiting, then there won't be a return to userspace, so we + * don't have to bother with any of this. + */ + if ((p->flags & PF_EXITING)) + return; + + scoped_guard(task_rq_lock, p) { + se = &p->se; + cfs_rq = cfs_rq_of(se); + + /* Raced, forget */ + if (p->sched_class != &fair_sched_class) + return; + + /* + * If not in limbo, then either replenish has happened or this + * task got migrated out of the throttled cfs_rq, move along. 
+ */ + if (!cfs_rq->throttle_count) + return; + rq = scope.rq; + update_rq_clock(rq); + WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL); + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + p->throttled = true; + resched_curr(rq); + } } void init_cfs_throttle_work(struct task_struct *p) @@ -5789,6 +5832,26 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) return 0; } +static inline bool task_has_throttle_work(struct task_struct *p) +{ + return p->sched_throttle_work.next != &p->sched_throttle_work; +} + +static inline void task_throttle_setup_work(struct task_struct *p) +{ + if (task_has_throttle_work(p)) + return; + + /* + * Kthreads and exiting tasks don't return to userspace, so adding the + * work is pointless + */ + if ((p->flags & (PF_EXITING | PF_KTHREAD))) + return; + + task_work_add(p, &p->sched_throttle_work, TWA_RESUME); +} + static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; @@ -6652,6 +6715,8 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void task_throttle_setup_work(struct task_struct *p) {} +static bool task_is_throttled(struct task_struct *p) { return false; } static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { From e1fad12dcb66b7f35573c52b665830a1538f9886 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 29 Aug 2025 16:11:18 +0800 Subject: [PATCH 0753/3206] sched/fair: Switch to task based throttle model In the current throttle model, when a cfs_rq is throttled, its entity will be dequeued from the cpu's rq, making tasks attached to it unable to run, thus achieving the throttle target. This has a drawback though: assume a task is a reader of percpu_rwsem and is waiting. When it gets woken, it cannot run until its task group's next period comes, which can be a relatively long time. The waiting writer will have to wait longer due to this, and it also makes further readers build up and eventually trigger hung task warnings. To improve this situation, change the throttle model to task based, i.e. when a cfs_rq is throttled, record its throttled status but do not remove it from the cpu's rq. Instead, for tasks that belong to this cfs_rq, when they get picked, add a task work to them so that when they return to user, they can be dequeued there. In this way, throttled tasks will not hold any kernel resources. And on unthrottle, enqueue back those tasks so they can continue to run. A throttled cfs_rq's PELT clock is handled differently now: previously the cfs_rq's PELT clock was stopped once it entered the throttled state, but since tasks (in kernel mode) can now continue to run, change the behaviour to stop the PELT clock only when the throttled cfs_rq has no tasks left.
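For orientation, a minimal userspace sketch of the container_of/task_work pattern the deferred dequeue relies on follows. This is an analogy, not the patch code; every name here is an illustrative stand-in for the kernel symbols. The deferred callback only receives a pointer to the callback_head embedded in the task, and recovers the owning task from it:

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct callback_head {
		struct callback_head *next;
		void (*func)(struct callback_head *);
	};

	struct task {					/* stand-in for struct task_struct */
		int pid;
		struct callback_head throttle_work;	/* like sched_throttle_work */
	};

	static void throttle_work_fn(struct callback_head *work)
	{
		/* recover the task that embeds this callback_head */
		struct task *t = container_of(work, struct task, throttle_work);

		printf("dequeueing pid %d on its return to userspace\n", t->pid);
	}

	int main(void)
	{
		struct task t = { .pid = 42, .throttle_work = { .func = throttle_work_fn } };

		/* what task_work would do at the ret2user boundary */
		t.throttle_work.func(&t.throttle_work);
		return 0;
	}

The real callback in the patch receives the callback_head embedded in struct task_struct and recovers the task the same way.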
Suggested-by: Chengming Zhou # tag on pick Signed-off-by: Valentin Schneider Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Tested-by: Chen Yu Tested-by: Matteo Martelli Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250829081120.806-4-ziqianlu@bytedance.com --- kernel/sched/fair.c | 341 ++++++++++++++++++++++--------------------- kernel/sched/pelt.h | 4 +- kernel/sched/sched.h | 3 +- 3 files changed, 181 insertions(+), 167 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dab4ed86d0c826..25b1014d4ef866 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5291,18 +5291,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); - if (!throttled_hierarchy(cfs_rq)) { - list_add_leaf_cfs_rq(cfs_rq); - } else { + list_add_leaf_cfs_rq(cfs_rq); #ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { struct rq *rq = rq_of(cfs_rq); if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) cfs_rq->throttled_clock = rq_clock(rq); if (!cfs_rq->throttled_clock_self) cfs_rq->throttled_clock_self = rq_clock(rq); -#endif + + if (cfs_rq->pelt_clock_throttled) { + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; + } } +#endif } } @@ -5341,8 +5346,6 @@ static void set_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable--; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5363,8 +5366,6 @@ static void clear_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable++; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5450,8 +5451,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_queued == 0) + if (cfs_rq->nr_queued == 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); +#ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { + struct rq *rq = rq_of(cfs_rq); + + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; + } +#endif + } return true; } @@ -5790,6 +5801,10 @@ static void throttle_cfs_rq_work(struct callback_head *work) WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL); list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + /* + * Must not set throttled before dequeue or dequeue will + * mistakenly regard this task as an already throttled one. + */ p->throttled = true; resched_curr(rq); } @@ -5803,32 +5818,124 @@ void init_cfs_throttle_work(struct task_struct *p) INIT_LIST_HEAD(&p->throttle_node); } +/* + * Task is throttled and someone wants to dequeue it again: + * it could be sched/core when core needs to do things like + * task affinity change, task group change, task sched class + * change etc. and in these cases, DEQUEUE_SLEEP is not set; + * or the task is blocked after throttled due to freezer etc. + * and in these cases, DEQUEUE_SLEEP is set. 
+ */ +static void detach_task_cfs_rq(struct task_struct *p); +static void dequeue_throttled_task(struct task_struct *p, int flags) +{ + WARN_ON_ONCE(p->se.on_rq); + list_del_init(&p->throttle_node); + + /* task blocked after throttled */ + if (flags & DEQUEUE_SLEEP) { + p->throttled = false; + return; + } + + /* + * task is migrating off its old cfs_rq, detach + * the task's load from its old cfs_rq. + */ + if (task_on_rq_migrating(p)) + detach_task_cfs_rq(p); +} + +static bool enqueue_throttled_task(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + + /* @p should have gone through dequeue_throttled_task() first */ + WARN_ON_ONCE(!list_empty(&p->throttle_node)); + + /* + * If the throttled task @p is enqueued to a throttled cfs_rq, + * take the fast path by directly putting the task on the + * target cfs_rq's limbo list. + * + * Do not do that when @p is current because the following race can + * cause @p's group_node to be incorrectly re-inserted in its rq's + * cfs_tasks list, despite being throttled: + * + * cpuX cpuY + * p ret2user + * throttle_cfs_rq_work() sched_move_task(p) + * LOCK task_rq_lock + * dequeue_task_fair(p) + * UNLOCK task_rq_lock + * LOCK task_rq_lock + * task_current_donor(p) == true + * task_on_rq_queued(p) == true + * dequeue_task(p) + * put_prev_task(p) + * sched_change_group() + * enqueue_task(p) -> p's new cfs_rq + * is throttled, go + * fast path and skip + * actual enqueue + * set_next_task(p) + * list_move(&se->group_node, &rq->cfs_tasks); // bug + * schedule() + * + * In the above race case, @p's current cfs_rq is in the same rq as + * its previous cfs_rq because sched_move_task() only moves a task + * to a different group from the same rq, so we can use its current + * cfs_rq to derive rq and test if the task is current. + */ + if (throttled_hierarchy(cfs_rq) && + !task_current_donor(rq_of(cfs_rq), p)) { + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + return true; + } + + /* we can't take the fast path, do an actual enqueue */ + p->throttled = false; + return false; +} + +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct task_struct *p, *tmp; + + if (--cfs_rq->throttle_count) + return 0; - cfs_rq->throttle_count--; - if (!cfs_rq->throttle_count) { + if (cfs_rq->pelt_clock_throttled) { cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; + } - /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; - if (cfs_rq->throttled_clock_self) { - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + cfs_rq->throttled_clock_self = 0; - cfs_rq->throttled_clock_self = 0; + if (WARN_ON_ONCE((s64)delta < 0)) + delta = 0; - if (WARN_ON_ONCE((s64)delta < 0)) - delta = 0; + cfs_rq->throttled_clock_self_time += delta; + } - cfs_rq->throttled_clock_self_time += delta; - } + /* Re-enqueue the tasks that have been throttled at this level. 
*/ + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { + list_del_init(&p->throttle_node); + p->throttled = false; + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); } + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + return 0; } @@ -5857,17 +5964,25 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (cfs_rq->throttle_count++) + return 0; + + /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + WARN_ON_ONCE(cfs_rq->throttled_clock_self); + if (cfs_rq->nr_queued) + cfs_rq->throttled_clock_self = rq_clock(rq); + else { + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ list_del_leaf_cfs_rq(cfs_rq); - - WARN_ON_ONCE(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock_self = rq_clock(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; } - cfs_rq->throttle_count++; + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } @@ -5875,8 +5990,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta, dequeue = 1; + int dequeue = 1; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5899,68 +6013,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!dequeue) return false; /* Throttle no longer required. */ - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - int flags; - - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - /* - * Abuse SPECIAL to avoid delayed dequeue in this instance. - * This avoids teaching dequeue_entities() about throttled - * entities and keeps things relatively simple. - */ - flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; - if (se->sched_delayed) - flags |= DEQUEUE_DELAYED; - dequeue_entity(qcfs_rq, se, flags); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; - - if (qcfs_rq->load.weight) { - /* Avoid re-evaluating load for this entity: */ - se = parent_entity(se); - break; - } - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; - } - - /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, queued_delta); -done: /* * Note: distribution will already see us throttled via the * throttled-list. 
rq->lock protects completion. @@ -5976,9 +6033,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta; - long rq_h_nr_queued = rq->cfs.h_nr_queued; + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + + /* + * It's possible we are called with !runtime_remaining due to things + * like user changed quota setting(see tg_set_cfs_bandwidth()) or async + * unthrottled us with a positive runtime_remaining but other still + * running entities consumed those runtime before we reached here. + * + * Anyway, we can't unthrottle this cfs_rq without any runtime remaining + * because any enqueue in tg_unthrottle_up() will immediately trigger a + * throttle, which is not supposed to happen on unthrottle path. + */ + if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) + return; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6008,62 +6076,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (list_add_leaf_cfs_rq(cfs_rq_of(se))) break; } - goto unthrottle_throttle; - } - - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - /* Handle any unfinished DELAY_DEQUEUE business first. */ - if (se->sched_delayed) { - int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; - - dequeue_entity(qcfs_rq, se, flags); - } else if (se->on_rq) - break; - enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; } - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - update_load_avg(qcfs_rq, se, UPDATE_TG); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; - } - - /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) - dl_server_start(&rq->fair_server); - - /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, queued_delta); - -unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ @@ -6717,6 +6731,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void task_throttle_setup_work(struct task_struct *p) {} static bool task_is_throttled(struct task_struct *p) { return false; } +static void dequeue_throttled_task(struct task_struct *p, int flags) {} +static bool enqueue_throttled_task(struct task_struct *p) { return false; } static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { @@ -6909,6 +6925,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; + if (task_is_throttled(p) && enqueue_throttled_task(p)) + return; + /* * The code below (indirectly) updates schedutil which looks at * the cfs_rq 
utilization to select a frequency. @@ -6961,10 +6980,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; - flags = ENQUEUE_WAKEUP; } @@ -6986,10 +7001,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; } if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { @@ -7019,7 +7030,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) check_update_overutilized_status(rq); -enqueue_throttle: assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -7074,10 +7084,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; - /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { slice = cfs_rq_min_slice(cfs_rq); @@ -7114,10 +7120,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; } sub_nr_running(rq, h_nr_queued); @@ -7151,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) */ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { + if (task_is_throttled(p)) { + dequeue_throttled_task(p, flags); + return true; + } + if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); @@ -8819,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq) { struct sched_entity *se; struct cfs_rq *cfs_rq; + struct task_struct *p; + bool throttled; again: cfs_rq = &rq->cfs; if (!cfs_rq->nr_queued) return NULL; + throttled = false; + do { /* Might not have done put_prev_entity() */ if (cfs_rq->curr && cfs_rq->curr->on_rq) update_curr(cfs_rq); - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto again; + throttled |= check_cfs_rq_runtime(cfs_rq); se = pick_next_entity(rq, cfs_rq); if (!se) @@ -8839,7 +8849,10 @@ static struct task_struct *pick_task_fair(struct rq *rq) cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + if (unlikely(throttled)) + task_throttle_setup_work(p); + return p; } static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 62c3fa543c0f20..f921302dc40fb4 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { u64 throttled; - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) throttled = U64_MAX; else throttled = cfs_rq->throttled_clock_pelt_time; @@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a6493d255397f8..6e1b37bb8c9b1c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -735,7 +735,8 @@ struct cfs_rq { u64 throttled_clock_pelt_time; u64 throttled_clock_self; u64 throttled_clock_self_time; - int throttled; + bool throttled:1; + bool pelt_clock_throttled:1; int throttle_count; struct list_head throttled_list; struct list_head throttled_csd_list; From eb962f251fbba251a0d34897d6170f7616d70c52 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 29 Aug 2025 16:11:19 +0800 Subject: [PATCH 0754/3206] sched/fair: Task based throttle time accounting With task based throttle model, the previous way to check cfs_rq's nr_queued to decide if throttled time should be accounted doesn't work as expected, e.g. when a cfs_rq which has a single task is throttled, that task could later block in kernel mode instead of being dequeued on limbo list and accounting this as throttled time is not accurate. Rework throttle time accounting for a cfs_rq as follows: - start accounting when the first task gets throttled in its hierarchy; - stop accounting on unthrottle. Note that there will be a time gap between when a cfs_rq is throttled and when a task in its hierarchy is actually throttled. This accounting mechanism only starts accounting in the latter case. Suggested-by: Chengming Zhou # accounting mechanism Co-developed-by: K Prateek Nayak # simplify implementation Signed-off-by: K Prateek Nayak Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Tested-by: Matteo Martelli Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250829081120.806-5-ziqianlu@bytedance.com --- kernel/sched/fair.c | 56 ++++++++++++++++++++++++-------------------- kernel/sched/sched.h | 1 + 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25b1014d4ef866..bdc9bfa0b9efa2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5293,19 +5293,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_enqueue_throttle(cfs_rq); list_add_leaf_cfs_rq(cfs_rq); #ifdef CONFIG_CFS_BANDWIDTH - if (throttled_hierarchy(cfs_rq)) { + if (cfs_rq->pelt_clock_throttled) { struct rq *rq = rq_of(cfs_rq); - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) - cfs_rq->throttled_clock = rq_clock(rq); - if (!cfs_rq->throttled_clock_self) - cfs_rq->throttled_clock_self = rq_clock(rq); - - if (cfs_rq->pelt_clock_throttled) { - cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - - cfs_rq->throttled_clock_pelt; - cfs_rq->pelt_clock_throttled = 0; - } + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; } #endif } @@ -5393,7 +5386,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * DELAY_DEQUEUE relies on spurious wakeups, special task * states must not suffer spurious wakeups, excempt them. 
*/ - if (flags & DEQUEUE_SPECIAL) + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) delay = false; WARN_ON_ONCE(delay && se->sched_delayed); @@ -5799,7 +5792,7 @@ static void throttle_cfs_rq_work(struct callback_head *work) rq = scope.rq; update_rq_clock(rq); WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); - dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); /* * Must not set throttled before dequeue or dequeue will @@ -5959,6 +5952,17 @@ static inline void task_throttle_setup_work(struct task_struct *p) task_work_add(p, &p->sched_throttle_work, TWA_RESUME); } +static void record_throttle_clock(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +} + static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; @@ -5967,21 +5971,17 @@ static int tg_throttle_down(struct task_group *tg, void *data) if (cfs_rq->throttle_count++) return 0; - - /* group is entering throttled state, stop time */ - WARN_ON_ONCE(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock_self = rq_clock(rq); - else { - /* - * For cfs_rqs that still have entities enqueued, PELT clock - * stop happens at dequeue time when all entities are dequeued. - */ + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ + if (!cfs_rq->nr_queued) { list_del_leaf_cfs_rq(cfs_rq); cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); cfs_rq->pelt_clock_throttled = 1; } + WARN_ON_ONCE(cfs_rq->throttled_clock_self); WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } @@ -6024,8 +6024,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) */ cfs_rq->throttled = 1; WARN_ON_ONCE(cfs_rq->throttled_clock); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -6733,6 +6731,7 @@ static void task_throttle_setup_work(struct task_struct *p) {} static bool task_is_throttled(struct task_struct *p) { return false; } static void dequeue_throttled_task(struct task_struct *p, int flags) {} static bool enqueue_throttled_task(struct task_struct *p) { return false; } +static void record_throttle_clock(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { @@ -7051,6 +7050,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) bool was_sched_idle = sched_idle_rq(rq); bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; + bool task_throttled = flags & DEQUEUE_THROTTLE; struct task_struct *p = NULL; int h_nr_idle = 0; int h_nr_queued = 0; @@ -7084,6 +7084,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { slice = cfs_rq_min_slice(cfs_rq); @@ -7120,6 +7123,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; + + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); } sub_nr_running(rq, 
h_nr_queued); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6e1b37bb8c9b1c..b5367c514c143f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2344,6 +2344,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SPECIAL 0x10 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ #define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ +#define DEQUEUE_THROTTLE 0x800 #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 From 5b726e9bf9544a349090879a513a5e00da486c14 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 29 Aug 2025 16:11:20 +0800 Subject: [PATCH 0755/3206] sched/fair: Get rid of throttled_lb_pair() Now that throttled tasks are dequeued and can not stay on rq's cfs_tasks list, there is no need to take special care of these throttled tasks anymore in load balance. Suggested-by: K Prateek Nayak Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Tested-by: Matteo Martelli Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250829081120.806-6-ziqianlu@bytedance.com --- kernel/sched/fair.c | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bdc9bfa0b9efa2..df8dc389af8e14 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5735,23 +5735,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttle_count; } -/* - * Ensure that neither of the group entities corresponding to src_cpu or - * dest_cpu are members of a throttled hierarchy when performing group - * load-balance operations. - */ -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) -{ - struct cfs_rq *src_cfs_rq, *dest_cfs_rq; - - src_cfs_rq = tg->cfs_rq[src_cpu]; - dest_cfs_rq = tg->cfs_rq[dest_cpu]; - - return throttled_hierarchy(src_cfs_rq) || - throttled_hierarchy(dest_cfs_rq); -} - static inline bool task_is_throttled(struct task_struct *p) { return cfs_bandwidth_used() && p->throttled; @@ -6743,12 +6726,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return 0; } -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) -{ - return 0; -} - #ifdef CONFIG_FAIR_GROUP_SCHED void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -9385,18 +9362,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) delayed dequeued unless we migrate load, or - * 2) throttled_lb_pair, or - * 3) cannot be migrated to this CPU due to cpus_ptr, or - * 4) running (obviously), or - * 5) are cache-hot on their current CPU, or - * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled) + * 2) cannot be migrated to this CPU due to cpus_ptr, or + * 3) running (obviously), or + * 4) are cache-hot on their current CPU, or + * 5) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled) */ if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) - return 0; - /* * We want to prioritize the migration of eligible tasks. 
* For ineligible tasks we soft-limit them and only allow From 0a26e5eb78fb1627beb8e3eb172737f8492d2799 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 25 Aug 2025 15:34:23 -0500 Subject: [PATCH 0756/3206] jiffies: Remove obsolete SHIFTED_HZ comment b3c869d35b9b ("jiffies: Remove compile time assumptions about CLOCK_TICK_RATE") removed the last definition of SHIFTED_HZ but left behind comments about it. Remove the comments as well. Signed-off-by: Bjorn Helgaas Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250825203425.796034-1-helgaas@kernel.org --- include/linux/jiffies.h | 2 +- include/vdso/jiffies.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 91b20788273dfa..0d1927da805513 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -61,7 +61,7 @@ extern void register_refined_jiffies(long clock_tick_rate); -/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ +/* TICK_USEC is the time between ticks in usec */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ diff --git a/include/vdso/jiffies.h b/include/vdso/jiffies.h index 2f9d596c8b2977..8ca04a141412ec 100644 --- a/include/vdso/jiffies.h +++ b/include/vdso/jiffies.h @@ -5,7 +5,7 @@ #include /* for HZ */ #include -/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ +/* TICK_NSEC is the time between ticks in nsec */ #define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ) #endif /* __VDSO_JIFFIES_H */ From 170031ff27dd7a07fdedee7f3710a19dcdf889bd Mon Sep 17 00:00:00 2001 From: Job Sava Date: Tue, 26 Aug 2025 15:46:29 +0200 Subject: [PATCH 0757/3206] input: tps6594-pwrbutton: Add power button functionality TPS6594 defines two interrupts for the power button: one for push and one for release. This driver is very simple in that it maps the push interrupt to a key press and the release interrupt to a key release. Signed-off-by: Job Sava Signed-off-by: Michael Walle Acked-by: Dmitry Torokhov Link: https://lore.kernel.org/r/20250826134631.1499936-2-mwalle@kernel.org Signed-off-by: Lee Jones --- drivers/input/misc/Kconfig | 10 ++ drivers/input/misc/Makefile | 1 + drivers/input/misc/tps6594-pwrbutton.c | 126 +++++++++++++++++++++++++ 3 files changed, 137 insertions(+) create mode 100644 drivers/input/misc/tps6594-pwrbutton.c diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 0fb21c99a5e3d1..cce51358242f51 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -506,6 +506,16 @@ config INPUT_TPS65219_PWRBUTTON To compile this driver as a module, choose M here. The module will be called tps65219-pwrbutton. +config INPUT_TPS6594_PWRBUTTON + tristate "TPS6594 Power button driver" + depends on MFD_TPS6594 + help + Say Y here if you want to enable power button reporting for + TPS6594 Power Management IC devices. + + To compile this driver as a module, choose M here. The module will + be called tps6594-pwrbutton. 
+ config INPUT_AXP20X_PEK tristate "X-Powers AXP20X power button driver" depends on MFD_AXP20X diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index d468c8140b93da..bb12f883851cea 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_INPUT_SPARCSPKR) += sparcspkr.o obj-$(CONFIG_INPUT_STPMIC1_ONKEY) += stpmic1_onkey.o obj-$(CONFIG_INPUT_TPS65218_PWRBUTTON) += tps65218-pwrbutton.o obj-$(CONFIG_INPUT_TPS65219_PWRBUTTON) += tps65219-pwrbutton.o +obj-$(CONFIG_INPUT_TPS6594_PWRBUTTON) += tps6594-pwrbutton.o obj-$(CONFIG_INPUT_TWL4030_PWRBUTTON) += twl4030-pwrbutton.o obj-$(CONFIG_INPUT_TWL4030_VIBRA) += twl4030-vibra.o obj-$(CONFIG_INPUT_TWL6040_VIBRA) += twl6040-vibra.o diff --git a/drivers/input/misc/tps6594-pwrbutton.c b/drivers/input/misc/tps6594-pwrbutton.c new file mode 100644 index 00000000000000..cd039b3866dc21 --- /dev/null +++ b/drivers/input/misc/tps6594-pwrbutton.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * power button driver for TI TPS6594 PMICs + * + * Copyright (C) 2025 Critical Link LLC - https://www.criticallink.com/ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct tps6594_pwrbutton { + struct device *dev; + struct input_dev *idev; + char phys[32]; +}; + +static irqreturn_t tps6594_pb_push_irq(int irq, void *_pwr) +{ + struct tps6594_pwrbutton *pwr = _pwr; + + input_report_key(pwr->idev, KEY_POWER, 1); + pm_wakeup_event(pwr->dev, 0); + input_sync(pwr->idev); + + return IRQ_HANDLED; +} + +static irqreturn_t tps6594_pb_release_irq(int irq, void *_pwr) +{ + struct tps6594_pwrbutton *pwr = _pwr; + + input_report_key(pwr->idev, KEY_POWER, 0); + input_sync(pwr->idev); + + return IRQ_HANDLED; +} + +static int tps6594_pb_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct tps6594_pwrbutton *pwr; + struct input_dev *idev; + int error; + int push_irq; + int release_irq; + + pwr = devm_kzalloc(dev, sizeof(*pwr), GFP_KERNEL); + if (!pwr) + return -ENOMEM; + + idev = devm_input_allocate_device(dev); + if (!idev) + return -ENOMEM; + + idev->name = pdev->name; + snprintf(pwr->phys, sizeof(pwr->phys), "%s/input0", + pdev->name); + idev->phys = pwr->phys; + idev->id.bustype = BUS_I2C; + + input_set_capability(idev, EV_KEY, KEY_POWER); + + pwr->dev = dev; + pwr->idev = idev; + device_init_wakeup(dev, true); + + push_irq = platform_get_irq(pdev, 0); + if (push_irq < 0) + return -EINVAL; + + release_irq = platform_get_irq(pdev, 1); + if (release_irq < 0) + return -EINVAL; + + error = devm_request_threaded_irq(dev, push_irq, NULL, + tps6594_pb_push_irq, + IRQF_ONESHOT, + pdev->resource[0].name, pwr); + if (error) { + dev_err(dev, "failed to request push IRQ #%d: %d\n", push_irq, + error); + return error; + } + + error = devm_request_threaded_irq(dev, release_irq, NULL, + tps6594_pb_release_irq, + IRQF_ONESHOT, + pdev->resource[1].name, pwr); + if (error) { + dev_err(dev, "failed to request release IRQ #%d: %d\n", + release_irq, error); + return error; + } + + error = input_register_device(idev); + if (error) { + dev_err(dev, "Can't register power button: %d\n", error); + return error; + } + + return 0; +} + +static const struct platform_device_id tps6594_pwrbtn_id_table[] = { + { "tps6594-pwrbutton", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(platform, tps6594_pwrbtn_id_table); + +static struct platform_driver tps6594_pb_driver = { + .probe = tps6594_pb_probe, + .driver = { + .name = "tps6594_pwrbutton", + }, + 
.id_table = tps6594_pwrbtn_id_table, +}; +module_platform_driver(tps6594_pb_driver); + +MODULE_DESCRIPTION("TPS6594 Power Button"); +MODULE_LICENSE("GPL"); From d766ca01c208bdf0f36098607efe1e250ccf41c5 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 26 Aug 2025 15:46:30 +0200 Subject: [PATCH 0758/3206] mfd: tps6594: Add power button functionality The PMIC has a multi-function pin PB/EN/VSENSE. If it is configured as push-button (PB), add the corresponding device for it. Co-developed-by: Job Sava Signed-off-by: Job Sava Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20250826134631.1499936-3-mwalle@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/tps6594-core.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/tps6594-core.c b/drivers/mfd/tps6594-core.c index c16c37e3661787..9195c90594898f 100644 --- a/drivers/mfd/tps6594-core.c +++ b/drivers/mfd/tps6594-core.c @@ -20,6 +20,8 @@ #include #define TPS6594_CRC_SYNC_TIMEOUT_MS 150 +#define TPS65224_EN_SEL_PB 1 +#define TPS65224_GPIO3_SEL_PB 3 /* Completion to synchronize CRC feature enabling on all PMICs */ static DECLARE_COMPLETION(tps6594_crc_comp); @@ -128,6 +130,12 @@ static const struct resource tps6594_rtc_resources[] = { DEFINE_RES_IRQ_NAMED(TPS6594_IRQ_POWER_UP, TPS6594_IRQ_NAME_POWERUP), }; +static const struct resource tps6594_pwrbutton_resources[] = { + DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_FALL, TPS65224_IRQ_NAME_PB_FALL), + DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_RISE, TPS65224_IRQ_NAME_PB_RISE), + DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_SHORT, TPS65224_IRQ_NAME_PB_SHORT), +}; + static const struct mfd_cell tps6594_common_cells[] = { MFD_CELL_RES("tps6594-regulator", tps6594_regulator_resources), MFD_CELL_RES("tps6594-pinctrl", tps6594_pinctrl_resources), @@ -318,8 +326,6 @@ static const struct resource tps65224_pfsm_resources[] = { DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_REG_UNLOCK, TPS65224_IRQ_NAME_REG_UNLOCK), DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_TWARN, TPS65224_IRQ_NAME_TWARN), DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_LONG, TPS65224_IRQ_NAME_PB_LONG), - DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_FALL, TPS65224_IRQ_NAME_PB_FALL), - DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_RISE, TPS65224_IRQ_NAME_PB_RISE), DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_TSD_ORD, TPS65224_IRQ_NAME_TSD_ORD), DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_BIST_FAIL, TPS65224_IRQ_NAME_BIST_FAIL), DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_REG_CRC_ERR, TPS65224_IRQ_NAME_REG_CRC_ERR), @@ -347,6 +353,12 @@ static const struct mfd_cell tps65224_common_cells[] = { MFD_CELL_RES("tps6594-regulator", tps65224_regulator_resources), }; +static const struct mfd_cell tps6594_pwrbutton_cell = { + .name = "tps6594-pwrbutton", + .resources = tps6594_pwrbutton_resources, + .num_resources = ARRAY_SIZE(tps6594_pwrbutton_resources), +}; + static const struct regmap_irq tps65224_irqs[] = { /* INT_BUCK register */ REGMAP_IRQ_REG(TPS65224_IRQ_BUCK1_UVOV, 0, TPS65224_BIT_BUCK1_UVOV_INT), @@ -681,6 +693,7 @@ int tps6594_device_init(struct tps6594 *tps, bool enable_crc) struct device *dev = tps->dev; int ret; struct regmap_irq_chip *irq_chip; + unsigned int pwr_on, gpio3_cfg; const struct mfd_cell *cells; int n_cells; @@ -727,6 +740,27 @@ int tps6594_device_init(struct tps6594 *tps, bool enable_crc) if (ret) return dev_err_probe(dev, ret, "Failed to add common child devices\n"); + /* If either the PB/EN/VSENSE or GPIO3 is configured as PB, register a driver for it */ + if (tps->chip_id == TPS65224 || tps->chip_id == TPS652G1) { + ret = 
regmap_read(tps->regmap, TPS6594_REG_NPWRON_CONF, &pwr_on); + if (ret) + return dev_err_probe(dev, ret, "Failed to read PB/EN/VSENSE config\n"); + + ret = regmap_read(tps->regmap, TPS6594_REG_GPIOX_CONF(2), &gpio3_cfg); + if (ret) + return dev_err_probe(dev, ret, "Failed to read GPIO3 config\n"); + + if (FIELD_GET(TPS65224_MASK_EN_PB_VSENSE_CONFIG, pwr_on) == TPS65224_EN_SEL_PB || + FIELD_GET(TPS65224_MASK_GPIO_SEL, gpio3_cfg) == TPS65224_GPIO3_SEL_PB) { + ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, + &tps6594_pwrbutton_cell, 1, NULL, 0, + regmap_irq_get_domain(tps->irq_data)); + if (ret) + return dev_err_probe(dev, ret, + "Failed to add power button device.\n"); + } + } + /* No RTC for LP8764, TPS65224 and TPS652G1 */ if (tps->chip_id != LP8764 && tps->chip_id != TPS65224 && tps->chip_id != TPS652G1) { ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, tps6594_rtc_cells, From 2215a87b02ad8d353cd3edebd1bed01db2458986 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 26 Aug 2025 15:46:31 +0200 Subject: [PATCH 0759/3206] mfd: tps6594: Add board power-off support Add a system level power-off handler if the "system-power-controller" flag is set for this device in the device tree. A power-off request is triggered by writing the TRIGGER_I2C_0 bit (which is actually just a convention and really depends on the freely programmable FSM). Co-developed-by: Job Sava Signed-off-by: Job Sava Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20250826134631.1499936-4-mwalle@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/tps6594-core.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/mfd/tps6594-core.c b/drivers/mfd/tps6594-core.c index 9195c90594898f..7127af7142f54b 100644 --- a/drivers/mfd/tps6594-core.c +++ b/drivers/mfd/tps6594-core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -688,6 +689,19 @@ static int tps6594_enable_crc(struct tps6594 *tps) return ret; } +static int tps6594_power_off_handler(struct sys_off_data *data) +{ + struct tps6594 *tps = data->cb_data; + int ret; + + ret = regmap_update_bits(tps->regmap, TPS6594_REG_FSM_I2C_TRIGGERS, + TPS6594_BIT_TRIGGER_I2C(0), TPS6594_BIT_TRIGGER_I2C(0)); + if (ret) + return notifier_from_errno(ret); + + return NOTIFY_DONE; +} + int tps6594_device_init(struct tps6594 *tps, bool enable_crc) { struct device *dev = tps->dev; @@ -770,6 +784,12 @@ int tps6594_device_init(struct tps6594 *tps, bool enable_crc) return dev_err_probe(dev, ret, "Failed to add RTC child device\n"); } + if (of_device_is_system_power_controller(dev->of_node)) { + ret = devm_register_power_off_handler(tps->dev, tps6594_power_off_handler, tps); + if (ret) + return dev_err_probe(dev, ret, "Failed to register power-off handler\n"); + } + return 0; } EXPORT_SYMBOL_GPL(tps6594_device_init); From ad64c073c9a031850de1542e6e976b0249e7e650 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Wed, 3 Sep 2025 12:08:41 +0200 Subject: [PATCH 0760/3206] ALSA: docs: Remove 3rd person singular s in *to indicate* Fixes: 78811dd56def ("ALSA: docs: Add documents for recently changes in snd-usb-audio") Signed-off-by: Paul Menzel Link: https://patch.msgid.link/20250903100842.267194-1-pmenzel@molgen.mpg.de Signed-off-by: Takashi Iwai --- Documentation/sound/alsa-configuration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/sound/alsa-configuration.rst b/Documentation/sound/alsa-configuration.rst index 062b86522e4d9c..accaebbdd64283 100644 --- 
a/Documentation/sound/alsa-configuration.rst +++ b/Documentation/sound/alsa-configuration.rst @@ -2293,7 +2293,7 @@ delayed_register notice the need. skip_validation Skip unit descriptor validation (default: no). - The option is used to ignores the validation errors with the hexdump + The option is used to ignore the validation errors with the hexdump of the unit descriptor instead of a driver probe error, so that we can check its details. quirk_flags From 1efbee6852f1ff698a9981bd731308dd027189fb Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 11 Aug 2025 15:36:16 +0200 Subject: [PATCH 0761/3206] mfd: vexpress-sysreg: Check the return value of devm_gpiochip_add_data() Commit 974cc7b93441 ("mfd: vexpress: Define the device as MFD cells") removed the return value check from the call to gpiochip_add_data() (or rather gpiochip_add() back then and later converted to devres) with no explanation. This function however can still fail, so check the return value and bail-out if it does. Cc: stable@vger.kernel.org Fixes: 974cc7b93441 ("mfd: vexpress: Define the device as MFD cells") Signed-off-by: Bartosz Golaszewski Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250811-gpio-mmio-mfd-conv-v1-1-68c5c958cf80@linaro.org Signed-off-by: Lee Jones --- drivers/mfd/vexpress-sysreg.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/vexpress-sysreg.c b/drivers/mfd/vexpress-sysreg.c index fc2daffc4352cc..77245c1e5d7df4 100644 --- a/drivers/mfd/vexpress-sysreg.c +++ b/drivers/mfd/vexpress-sysreg.c @@ -99,6 +99,7 @@ static int vexpress_sysreg_probe(struct platform_device *pdev) struct resource *mem; void __iomem *base; struct gpio_chip *mmc_gpio_chip; + int ret; mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!mem) @@ -119,7 +120,10 @@ static int vexpress_sysreg_probe(struct platform_device *pdev) bgpio_init(mmc_gpio_chip, &pdev->dev, 0x4, base + SYS_MCI, NULL, NULL, NULL, NULL, 0); mmc_gpio_chip->ngpio = 2; - devm_gpiochip_add_data(&pdev->dev, mmc_gpio_chip, NULL); + + ret = devm_gpiochip_add_data(&pdev->dev, mmc_gpio_chip, NULL); + if (ret) + return ret; return devm_mfd_add_devices(&pdev->dev, PLATFORM_DEVID_AUTO, vexpress_sysreg_cells, From 9b33bbc084accb4ebde3c6888758b31e3bdf1c57 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 11 Aug 2025 15:36:17 +0200 Subject: [PATCH 0762/3206] mfd: vexpress-sysreg: Use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
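As a rough sketch of the conversion pattern this driver adopts (mirroring the diff below; in the actual patch the compound literal (typeof(config)){ ... } zero-fills every field that is not set, so the unused callbacks stay NULL):

	struct gpio_generic_chip_config config = {
		.dev = &pdev->dev,
		.sz  = 4,			/* register width in bytes */
		.dat = base + SYS_MCI,		/* data register */
	};

	ret = gpio_generic_chip_init(mmc_gpio_chip, &config);
	if (ret)
		return ret;

	/* the classic struct gpio_chip is now embedded as ->gc */
	mmc_gpio_chip->gc.ngpio = 2;
	ret = devm_gpiochip_add_data(&pdev->dev, &mmc_gpio_chip->gc, NULL);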
Signed-off-by: Bartosz Golaszewski
Reviewed-by: Linus Walleij
Link: https://lore.kernel.org/r/20250811-gpio-mmio-mfd-conv-v1-2-68c5c958cf80@linaro.org
Signed-off-by: Lee Jones
---
 drivers/mfd/vexpress-sysreg.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/mfd/vexpress-sysreg.c b/drivers/mfd/vexpress-sysreg.c
index 77245c1e5d7df4..9399eb850ca29b 100644
--- a/drivers/mfd/vexpress-sysreg.c
+++ b/drivers/mfd/vexpress-sysreg.c
@@ -5,6 +5,7 @@ */
 #include
+#include
 #include
 #include
 #include
@@ -96,9 +97,10 @@ static struct mfd_cell vexpress_sysreg_cells[] = {
 static int vexpress_sysreg_probe(struct platform_device *pdev)
 {
+	struct gpio_generic_chip *mmc_gpio_chip;
+	struct gpio_generic_chip_config config;
 	struct resource *mem;
 	void __iomem *base;
-	struct gpio_chip *mmc_gpio_chip;
 	int ret;

 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
@@ -117,11 +119,20 @@ static int vexpress_sysreg_probe(struct platform_device *pdev)
 					 GFP_KERNEL);
 	if (!mmc_gpio_chip)
 		return -ENOMEM;
-	bgpio_init(mmc_gpio_chip, &pdev->dev, 0x4, base + SYS_MCI,
-		   NULL, NULL, NULL, NULL, 0);
-	mmc_gpio_chip->ngpio = 2;
-	ret = devm_gpiochip_add_data(&pdev->dev, mmc_gpio_chip, NULL);
+
+	config = (typeof(config)){
+		.dev = &pdev->dev,
+		.sz = 4,
+		.dat = base + SYS_MCI,
+	};
+
+	ret = gpio_generic_chip_init(mmc_gpio_chip, &config);
+	if (ret)
+		return ret;
+
+	mmc_gpio_chip->gc.ngpio = 2;
+
+	ret = devm_gpiochip_add_data(&pdev->dev, &mmc_gpio_chip->gc, NULL);
 	if (ret)
 		return ret;

From 54a1726d2e4c0c7b33f4e5ef35fcc118a4d74ea3 Mon Sep 17 00:00:00 2001
From: Zenghui Yu
Date: Wed, 3 Sep 2025 10:33:19 +0800
Subject: [PATCH 0763/3206] irqchip/gic-v5: Remove the redundant ITS cache invalidation

An ITS cache invalidation has been performed immediately after programming the L2 DTE in gicv5_its_device_register(). No need to perform it again right after a successful gicv5_its_device_register(). Remove it.

Signed-off-by: Zenghui Yu
Signed-off-by: Thomas Gleixner
Reviewed-by: Marc Zyngier
Link: https://lore.kernel.org/all/20250903023319.1820-1-yuzenghui@huawei.com
---
 drivers/irqchip/irq-gic-v5-its.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c
index 9290ac741949ca..81d813cd8fbbe9 100644
--- a/drivers/irqchip/irq-gic-v5-its.c
+++ b/drivers/irqchip/irq-gic-v5-its.c
@@ -768,8 +768,6 @@ static struct gicv5_its_dev *gicv5_its_alloc_device(struct gicv5_its_chip_data *
 		goto out_dev_free;
 	}

-	gicv5_its_device_cache_inv(its, its_dev);
-
 	its_dev->its_node = its;
 	its_dev->event_map = (unsigned long *)bitmap_zalloc(its_dev->num_events, GFP_KERNEL);

From 40c26230a1bfdf46f93e4136dbb96d093744c80d Mon Sep 17 00:00:00 2001
From: Qianfeng Rong
Date: Fri, 29 Aug 2025 21:20:19 +0800
Subject: [PATCH 0764/3206] irqchip: Use int type to store negative error codes

Change the 'ret' variable from unsigned int to int to store negative error codes or zero returned by other functions. Storing negative error codes in an unsigned type doesn't cause an issue at runtime, but assigning negative error codes to an unsigned type may trigger a compiler warning when the -Wsign-conversion flag is enabled.
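A minimal illustration of the warning class being addressed (hypothetical snippet, not taken from the patch):

	unsigned long val;
	unsigned int ret;	/* wrong type for an error code */

	ret = kstrtoul(arg, 0, &val);	/* may return -EINVAL; assigning it to an
					 * unsigned int warns under -Wsign-conversion,
					 * although the nonzero check still works */
	if (ret)
		return ret;		/* converting back to int warns again */

Declaring ret as plain int, as the patch does, keeps both assignments value-preserving.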
Signed-off-by: Qianfeng Rong Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Link: https://lore.kernel.org/all/20250829132020.82077-1-rongqianfeng@vivo.com --- drivers/irqchip/irq-gic-v3.c | 3 ++- drivers/irqchip/irq-nvic.c | 3 ++- drivers/irqchip/irq-renesas-rza1.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index dbeb85677b08cb..3de351e66ee844 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1766,8 +1766,9 @@ static int gic_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { - unsigned int type, ret, ppi_idx; + unsigned int type, ppi_idx; irq_hw_number_t hwirq; + int ret; /* Not for us */ if (fwspec->fwnode != d->fwnode) diff --git a/drivers/irqchip/irq-nvic.c b/drivers/irqchip/irq-nvic.c index 76e11cac96318c..2191a2b795785b 100644 --- a/drivers/irqchip/irq-nvic.c +++ b/drivers/irqchip/irq-nvic.c @@ -73,8 +73,9 @@ static int __init nvic_of_init(struct device_node *node, struct device_node *parent) { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; - unsigned int irqs, i, ret, numbanks; + unsigned int irqs, i, numbanks; void __iomem *nvic_base; + int ret; numbanks = (readl_relaxed(V7M_SCS_ICTR) & V7M_SCS_ICTR_INTLINESNUM_MASK) + 1; diff --git a/drivers/irqchip/irq-renesas-rza1.c b/drivers/irqchip/irq-renesas-rza1.c index a697eb55ac90e4..6047a524ac77e2 100644 --- a/drivers/irqchip/irq-renesas-rza1.c +++ b/drivers/irqchip/irq-renesas-rza1.c @@ -142,11 +142,12 @@ static const struct irq_domain_ops rza1_irqc_domain_ops = { static int rza1_irqc_parse_map(struct rza1_irqc_priv *priv, struct device_node *gic_node) { - unsigned int imaplen, i, j, ret; struct device *dev = priv->dev; + unsigned int imaplen, i, j; struct device_node *ipar; const __be32 *imap; u32 intsize; + int ret; imap = of_get_property(dev->of_node, "interrupt-map", &imaplen); if (!imap) From d36bf356068cdb5499b9bc458db9149c0fd938a2 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Thu, 21 Aug 2025 17:38:45 +0800 Subject: [PATCH 0765/3206] irqchip/renesas-rzg2l: Remove dev_err_probe() if error is -ENOMEM The dev_err_probe() doesn't do anything when error is '-ENOMEM'. Therefore, remove the useless call to dev_err_probe(), and just return the value instead. Signed-off-by: Xichao Zhao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250821093845.564496-1-zhao.xichao@vivo.com --- drivers/irqchip/irq-renesas-rzg2l.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 360d88687e4f58..2a54adeb4cc714 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -578,7 +578,7 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * &rzg2l_irqc_domain_ops, rzg2l_irqc_data); if (!irq_domain) { pm_runtime_put(dev); - return dev_err_probe(dev, -ENOMEM, "failed to add irq domain\n"); + return -ENOMEM; } register_syscore_ops(&rzg2l_irqc_syscore_ops); From 94b39cb3ad6db935b585988b36378884199cd5fc Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Wed, 3 Sep 2025 10:03:47 +0800 Subject: [PATCH 0766/3206] spi: mxs: fix "transfered"->"transferred" Trivial fix to spelling mistake in comment text. 
Signed-off-by: Xichao Zhao Message-ID: <20250903020347.563003-1-zhao.xichao@vivo.com> Signed-off-by: Mark Brown --- drivers/spi/spi-mxs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-mxs.c b/drivers/spi/spi-mxs.c index 43455305fdf40e..0ebcbdb1b1f713 100644 --- a/drivers/spi/spi-mxs.c +++ b/drivers/spi/spi-mxs.c @@ -388,7 +388,7 @@ static int mxs_spi_transfer_one(struct spi_controller *host, TXRX_DEASSERT_CS : 0; /* - * Small blocks can be transfered via PIO. + * Small blocks can be transferred via PIO. * Measured by empiric means: * * dd if=/dev/mtdblock0 of=/dev/null bs=1024k count=1 From bfddd34d67a0402c0476b36fc1b1f373cd5b0054 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 2 Sep 2025 13:44:53 +0100 Subject: [PATCH 0767/3206] spi: fsl-dspi: Avoid using -EINPROGRESS error code Refactor dspi_rxtx() and dspi_poll() to not return -EINPROGRESS because this isn't actually a status that is ever returned to the core layer but some internal state. Use true/false return value on dspi_rxtx() for this instead. This will help separate internal vs external status for the later change to store the external status directly in cur_msg->status. No functional changes intended. Co-developed-by: Vladimir Oltean Signed-off-by: Vladimir Oltean Signed-off-by: James Clark Reviewed-by: Vladimir Oltean Message-ID: <20250902-james-nxp-spi-dma-v6-1-f7aa2c5e56e2@linaro.org> Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-dspi.c | 41 ++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 4bd4377551b5d2..654905a358e85d 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -986,36 +986,45 @@ static void dspi_fifo_write(struct fsl_dspi *dspi) dspi->progress, !dspi->irq); } -static int dspi_rxtx(struct fsl_dspi *dspi) +/* + * Read the previous transfer from the FIFO and transmit the next one. + * + * Returns false if the buffer to be transmitted is empty, and true if there is + * still data to transmit. + */ +static bool dspi_rxtx(struct fsl_dspi *dspi) { dspi_fifo_read(dspi); if (!dspi->len) /* Success! 
*/ - return 0; + return false; dspi_fifo_write(dspi); - return -EINPROGRESS; + return true; } static int dspi_poll(struct fsl_dspi *dspi) { - int tries = 1000; + int tries; + int err = 0; u32 spi_sr; do { - regmap_read(dspi->regmap, SPI_SR, &spi_sr); - regmap_write(dspi->regmap, SPI_SR, spi_sr); - - if (spi_sr & SPI_SR_CMDTCF) + for (tries = 1000; tries > 0; --tries) { + regmap_read(dspi->regmap, SPI_SR, &spi_sr); + regmap_write(dspi->regmap, SPI_SR, spi_sr); + if (spi_sr & SPI_SR_CMDTCF) + break; + } + if (!tries) { + err = -ETIMEDOUT; break; - } while (--tries); - - if (!tries) - return -ETIMEDOUT; + } + } while (dspi_rxtx(dspi)); - return dspi_rxtx(dspi); + return err; } static irqreturn_t dspi_interrupt(int irq, void *dev_id) @@ -1029,7 +1038,7 @@ static irqreturn_t dspi_interrupt(int irq, void *dev_id) if (!(spi_sr & SPI_SR_CMDTCF)) return IRQ_NONE; - if (dspi_rxtx(dspi) == 0) + if (dspi_rxtx(dspi) == false) complete(&dspi->xfer_done); return IRQ_HANDLED; @@ -1137,9 +1146,7 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, if (dspi->irq) { wait_for_completion(&dspi->xfer_done); } else { - do { - status = dspi_poll(dspi); - } while (status == -EINPROGRESS); + status = dspi_poll(dspi); } } if (status) From 5484440aa0a98b76e7829cd06dda33cf62830d73 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 2 Sep 2025 13:44:54 +0100 Subject: [PATCH 0768/3206] spi: fsl-dspi: Store status directly in cur_msg->status This will allow us to return a status from the interrupt handler in a later commit and avoids copying it at the end of dspi_transfer_one_message(). For consistency make polling and DMA modes use the same mechanism. No functional changes intended. Signed-off-by: James Clark Signed-off-by: Vladimir Oltean Reviewed-by: Vladimir Oltean Message-ID: <20250902-james-nxp-spi-dma-v6-2-f7aa2c5e56e2@linaro.org> Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-dspi.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 654905a358e85d..48054932d51713 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -591,11 +591,10 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) static void dspi_setup_accel(struct fsl_dspi *dspi); -static int dspi_dma_xfer(struct fsl_dspi *dspi) +static void dspi_dma_xfer(struct fsl_dspi *dspi) { struct spi_message *message = dspi->cur_msg; struct device *dev = &dspi->pdev->dev; - int ret = 0; /* * dspi->len gets decremented by dspi_pop_tx_pushr in @@ -612,14 +611,12 @@ static int dspi_dma_xfer(struct fsl_dspi *dspi) message->actual_length += dspi->words_in_flight * dspi->oper_word_size; - ret = dspi_next_xfer_dma_submit(dspi); - if (ret) { + message->status = dspi_next_xfer_dma_submit(dspi); + if (message->status) { dev_err(dev, "DMA transfer failed\n"); break; } } - - return ret; } static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) @@ -1005,7 +1002,7 @@ static bool dspi_rxtx(struct fsl_dspi *dspi) return true; } -static int dspi_poll(struct fsl_dspi *dspi) +static void dspi_poll(struct fsl_dspi *dspi) { int tries; int err = 0; @@ -1024,7 +1021,7 @@ static int dspi_poll(struct fsl_dspi *dspi) } } while (dspi_rxtx(dspi)); - return err; + dspi->cur_msg->status = err; } static irqreturn_t dspi_interrupt(int irq, void *dev_id) @@ -1038,8 +1035,11 @@ static irqreturn_t dspi_interrupt(int irq, void *dev_id) if (!(spi_sr & SPI_SR_CMDTCF)) return IRQ_NONE; - if (dspi_rxtx(dspi) == false) + if 
(dspi_rxtx(dspi) == false) { + if (dspi->cur_msg) + WRITE_ONCE(dspi->cur_msg->status, 0); complete(&dspi->xfer_done); + } return IRQ_HANDLED; } @@ -1069,7 +1069,6 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, struct spi_device *spi = message->spi; struct spi_transfer *transfer; bool cs = false; - int status = 0; u32 val = 0; bool cs_change = false; @@ -1129,7 +1128,7 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, dspi->progress, !dspi->irq); if (dspi->devtype_data->trans_mode == DSPI_DMA_MODE) { - status = dspi_dma_xfer(dspi); + dspi_dma_xfer(dspi); } else { /* * Reinitialize the completion before transferring data @@ -1143,13 +1142,12 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, dspi_fifo_write(dspi); - if (dspi->irq) { + if (dspi->irq) wait_for_completion(&dspi->xfer_done); - } else { - status = dspi_poll(dspi); - } + else + dspi_poll(dspi); } - if (status) + if (READ_ONCE(message->status)) break; spi_transfer_delay_exec(transfer); @@ -1158,7 +1156,8 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, dspi_deassert_cs(spi, &cs); } - if (status || !cs_change) { + dspi->cur_msg = NULL; + if (message->status || !cs_change) { /* Put DSPI in stop mode */ regmap_update_bits(dspi->regmap, SPI_MCR, SPI_MCR_HALT, SPI_MCR_HALT); @@ -1167,10 +1166,9 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, ; } - message->status = status; spi_finalize_current_message(ctlr); - return status; + return message->status; } static int dspi_set_mtf(struct fsl_dspi *dspi) From 4850f158c06eeaf4997fc65c47544f2c82ad5a45 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 2 Sep 2025 13:44:55 +0100 Subject: [PATCH 0769/3206] spi: spi-fsl-dspi: Stub out DMA functions In a later commit we'll use dma_alloc_noncoherent() which isn't stubbed out for builds without CONFIG_DMA_ENGINE and results in the following build error: spi-fsl-dspi.c:(.text+0x644): undefined reference to `dma_free_pages' m68k-linux-ld: spi-fsl-dspi.c:(.text+0x67a): undefined reference to `dma_free_pages' To continue to support devices that only need XSPI mode and so that randconfig builds work, stub out DMA functionality in the DSPI driver. Although older parts of the DMA API have their own stubs, it's intentional that newer parts don't follow the same pattern. Therefore individual drivers should not compile in calls unless CONFIG_DMA_ENGINE is set. 
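The shape of the fix, condensed from the diff below: the real implementations stay under the guard, and the fallbacks fail gracefully at probe time instead of breaking the link:

#if IS_ENABLED(CONFIG_DMA_ENGINE)
/* full implementations built on dmaengine and the DMA mapping API */
#else
static void dspi_dma_xfer(struct fsl_dspi *dspi)
{
	dspi->cur_msg->status = -EINVAL;
}
static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr)
{
	dev_err(&dspi->pdev->dev, "DMA support not enabled in kernel\n");
	return -EINVAL;
}
static void dspi_release_dma(struct fsl_dspi *dspi) {}
#endif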
Link: https://lore.kernel.org/oe-kbuild-all/202506160036.t9VDxF6p-lkp@intel.com/ Signed-off-by: James Clark Reported-by: kernel test robot Reviewed-by: Vladimir Oltean Message-ID: <20250902-james-nxp-spi-dma-v6-3-f7aa2c5e56e2@linaro.org> Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-dspi.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 48054932d51713..0baf7e0608f2f4 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -373,6 +373,8 @@ struct fsl_dspi { void (*dev_to_host)(struct fsl_dspi *dspi, u32 rxdata); }; +static void dspi_setup_accel(struct fsl_dspi *dspi); + static bool is_s32g_dspi(struct fsl_dspi *data) { return data->devtype_data == &devtype_data[S32G] || @@ -468,6 +470,16 @@ static u32 dspi_pop_tx(struct fsl_dspi *dspi) return txdata; } +/* Push one word to the RX buffer from the POPR register (RX FIFO) */ +static void dspi_push_rx(struct fsl_dspi *dspi, u32 rxdata) +{ + if (!dspi->rx) + return; + dspi->dev_to_host(dspi, rxdata); +} + +#if IS_ENABLED(CONFIG_DMA_ENGINE) + /* Prepare one TX FIFO entry (txdata plus cmd) */ static u32 dspi_pop_tx_pushr(struct fsl_dspi *dspi) { @@ -481,14 +493,6 @@ static u32 dspi_pop_tx_pushr(struct fsl_dspi *dspi) return cmd << 16 | data; } -/* Push one word to the RX buffer from the POPR register (RX FIFO) */ -static void dspi_push_rx(struct fsl_dspi *dspi, u32 rxdata) -{ - if (!dspi->rx) - return; - dspi->dev_to_host(dspi, rxdata); -} - static void dspi_tx_dma_callback(void *arg) { struct fsl_dspi *dspi = arg; @@ -589,8 +593,6 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) return 0; } -static void dspi_setup_accel(struct fsl_dspi *dspi); - static void dspi_dma_xfer(struct fsl_dspi *dspi) { struct spi_message *message = dspi->cur_msg; @@ -722,6 +724,18 @@ static void dspi_release_dma(struct fsl_dspi *dspi) dma_release_channel(dma->chan_rx); } } +#else +static void dspi_dma_xfer(struct fsl_dspi *dspi) +{ + dspi->cur_msg->status = -EINVAL; +} +static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) +{ + dev_err(&dspi->pdev->dev, "DMA support not enabled in kernel\n"); + return -EINVAL; +} +static void dspi_release_dma(struct fsl_dspi *dspi) {} +#endif static void hz_to_spi_baud(char *pbr, char *br, int speed_hz, unsigned long clkrate, bool mtf_enabled) From 36db0b03d3745700aca4fced26f6eb624f6ea4bc Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 2 Sep 2025 13:44:56 +0100 Subject: [PATCH 0770/3206] spi: spi-fsl-dspi: Use non-coherent memory for DMA Using coherent memory here isn't functionally necessary, we're only either sending data to the device or reading from it. This means explicit synchronizations are only required around those points and the change is fairly trivial. This gives us around a 10% increase in throughput for large DMA transfers and no loss for small transfers. 
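A minimal sketch of the non-coherent pattern in isolation (generic names; fill_buffer() stands in for whatever produces the TX data, and the driver's real version follows in the diff):

	void *buf;
	dma_addr_t phys;

	buf = dma_alloc_noncoherent(dev, size, &phys, DMA_TO_DEVICE, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	fill_buffer(buf, len);					/* CPU writes */
	dma_sync_single_for_device(dev, phys, len, DMA_TO_DEVICE);
	/* ... submit the DMA descriptor and wait for completion ... */
	dma_sync_single_for_cpu(dev, phys, len, DMA_TO_DEVICE);

	dma_free_noncoherent(dev, size, buf, phys, DMA_TO_DEVICE);

The explicit sync calls are the price for dropping the coherent mapping; they are cheap here because they bracket exactly the points where buffer ownership changes hands.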
Suggested-by: Arnd Bergmann Reviewed-by: Frank Li Acked-by: Arnd Bergmann Signed-off-by: James Clark Reviewed-by: Vladimir Oltean Message-ID: <20250902-james-nxp-spi-dma-v6-4-f7aa2c5e56e2@linaro.org> Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-dspi.c | 65 ++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 0baf7e0608f2f4..81d1e4470f948e 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -493,11 +493,19 @@ static u32 dspi_pop_tx_pushr(struct fsl_dspi *dspi) return cmd << 16 | data; } +static size_t dspi_dma_transfer_size(struct fsl_dspi *dspi) +{ + return dspi->words_in_flight * DMA_SLAVE_BUSWIDTH_4_BYTES; +} + static void dspi_tx_dma_callback(void *arg) { struct fsl_dspi *dspi = arg; struct fsl_dspi_dma *dma = dspi->dma; + struct device *dev = &dspi->pdev->dev; + dma_sync_single_for_cpu(dev, dma->tx_dma_phys, + dspi_dma_transfer_size(dspi), DMA_TO_DEVICE); complete(&dma->cmd_tx_complete); } @@ -505,9 +513,13 @@ static void dspi_rx_dma_callback(void *arg) { struct fsl_dspi *dspi = arg; struct fsl_dspi_dma *dma = dspi->dma; + struct device *dev = &dspi->pdev->dev; int i; if (dspi->rx) { + dma_sync_single_for_cpu(dev, dma->rx_dma_phys, + dspi_dma_transfer_size(dspi), + DMA_FROM_DEVICE); for (i = 0; i < dspi->words_in_flight; i++) dspi_push_rx(dspi, dspi->dma->rx_dma_buf[i]); } @@ -517,6 +529,7 @@ static void dspi_rx_dma_callback(void *arg) static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) { + size_t size = dspi_dma_transfer_size(dspi); struct device *dev = &dspi->pdev->dev; struct fsl_dspi_dma *dma = dspi->dma; int time_left; @@ -525,12 +538,12 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) for (i = 0; i < dspi->words_in_flight; i++) dspi->dma->tx_dma_buf[i] = dspi_pop_tx_pushr(dspi); + dma_sync_single_for_device(dev, dma->tx_dma_phys, size, DMA_TO_DEVICE); dma->tx_desc = dmaengine_prep_slave_single(dma->chan_tx, - dma->tx_dma_phys, - dspi->words_in_flight * - DMA_SLAVE_BUSWIDTH_4_BYTES, - DMA_MEM_TO_DEV, - DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + dma->tx_dma_phys, size, + DMA_MEM_TO_DEV, + DMA_PREP_INTERRUPT | + DMA_CTRL_ACK); if (!dma->tx_desc) { dev_err(dev, "Not able to get desc for DMA xfer\n"); return -EIO; @@ -543,12 +556,13 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) return -EINVAL; } + dma_sync_single_for_device(dev, dma->rx_dma_phys, size, + DMA_FROM_DEVICE); dma->rx_desc = dmaengine_prep_slave_single(dma->chan_rx, - dma->rx_dma_phys, - dspi->words_in_flight * - DMA_SLAVE_BUSWIDTH_4_BYTES, - DMA_DEV_TO_MEM, - DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + dma->rx_dma_phys, size, + DMA_DEV_TO_MEM, + DMA_PREP_INTERRUPT | + DMA_CTRL_ACK); if (!dma->rx_desc) { dev_err(dev, "Not able to get desc for DMA xfer\n"); return -EIO; @@ -643,17 +657,17 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) goto err_tx_channel; } - dma->tx_dma_buf = dma_alloc_coherent(dma->chan_tx->device->dev, - dma_bufsize, &dma->tx_dma_phys, - GFP_KERNEL); + dma->tx_dma_buf = dma_alloc_noncoherent(dma->chan_tx->device->dev, + dma_bufsize, &dma->tx_dma_phys, + DMA_TO_DEVICE, GFP_KERNEL); if (!dma->tx_dma_buf) { ret = -ENOMEM; goto err_tx_dma_buf; } - dma->rx_dma_buf = dma_alloc_coherent(dma->chan_rx->device->dev, - dma_bufsize, &dma->rx_dma_phys, - GFP_KERNEL); + dma->rx_dma_buf = dma_alloc_noncoherent(dma->chan_rx->device->dev, + dma_bufsize, &dma->rx_dma_phys, + DMA_FROM_DEVICE, GFP_KERNEL); if (!dma->rx_dma_buf) { 
ret = -ENOMEM; goto err_rx_dma_buf; @@ -688,11 +702,12 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) return 0; err_slave_config: - dma_free_coherent(dma->chan_rx->device->dev, - dma_bufsize, dma->rx_dma_buf, dma->rx_dma_phys); + dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize, + dma->rx_dma_buf, dma->rx_dma_phys, + DMA_FROM_DEVICE); err_rx_dma_buf: - dma_free_coherent(dma->chan_tx->device->dev, - dma_bufsize, dma->tx_dma_buf, dma->tx_dma_phys); + dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize, + dma->tx_dma_buf, dma->tx_dma_phys, DMA_TO_DEVICE); err_tx_dma_buf: dma_release_channel(dma->chan_tx); err_tx_channel: @@ -713,14 +728,16 @@ static void dspi_release_dma(struct fsl_dspi *dspi) return; if (dma->chan_tx) { - dma_free_coherent(dma->chan_tx->device->dev, dma_bufsize, - dma->tx_dma_buf, dma->tx_dma_phys); + dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize, + dma->tx_dma_buf, dma->tx_dma_phys, + DMA_TO_DEVICE); dma_release_channel(dma->chan_tx); } if (dma->chan_rx) { - dma_free_coherent(dma->chan_rx->device->dev, dma_bufsize, - dma->rx_dma_buf, dma->rx_dma_phys); + dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize, + dma->rx_dma_buf, dma->rx_dma_phys, + DMA_FROM_DEVICE); dma_release_channel(dma->chan_rx); } } From fbb618e11fa7976c5295facb28afbf1a08393f51 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 2 Sep 2025 13:44:57 +0100 Subject: [PATCH 0771/3206] spi: spi-fsl-dspi: Use whole page for DMA buffers dma_alloc_noncoherent() allocations are backed by a full page anyway, so use it all. VF610 devices used to use the full page before commit a957499bd437 ("spi: spi-fsl-dspi: Fix bits-per-word acceleration in DMA mode"), but others still used the FIFO size. After that commit, all devices used the FIFO size. Now all devices use the full page. 
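Illustrative arithmetic, assuming 4 KiB pages: each FIFO entry occupies one 32-bit DMA word, so the capacity per DMA round becomes

	max_words = PAGE_SIZE / DMA_SLAVE_BUSWIDTH_4_BYTES;	/* 4096 / 4 = 1024 */

versus fifo_size * 2 bytes previously, which on small-FIFO parts amounted to only a handful of words per round.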
Signed-off-by: James Clark Reviewed-by: Vladimir Oltean Message-ID: <20250902-james-nxp-spi-dma-v6-5-f7aa2c5e56e2@linaro.org> Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-dspi.c | 41 ++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 81d1e4470f948e..6bf87ef01c13d1 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -331,6 +331,8 @@ struct fsl_dspi_dma { dma_addr_t rx_dma_phys; struct completion cmd_rx_complete; struct dma_async_tx_descriptor *rx_desc; + + size_t bufsize; }; struct fsl_dspi { @@ -493,6 +495,24 @@ static u32 dspi_pop_tx_pushr(struct fsl_dspi *dspi) return cmd << 16 | data; } +static size_t dspi_dma_max_datawords(struct fsl_dspi *dspi) +{ + /* + * Transfers look like one of these, so we always use a full DMA word + * regardless of SPI word size: + * + * 31 16 15 0 + * ----------------------------------------- + * | CONTROL WORD | 16-bit DATA | + * ----------------------------------------- + * or + * ----------------------------------------- + * | CONTROL WORD | UNUSED | 8-bit DATA | + * ----------------------------------------- + */ + return dspi->dma->bufsize / DMA_SLAVE_BUSWIDTH_4_BYTES; +} + static size_t dspi_dma_transfer_size(struct fsl_dspi *dspi) { return dspi->words_in_flight * DMA_SLAVE_BUSWIDTH_4_BYTES; @@ -620,9 +640,8 @@ static void dspi_dma_xfer(struct fsl_dspi *dspi) /* Figure out operational bits-per-word for this chunk */ dspi_setup_accel(dspi); - dspi->words_in_flight = dspi->len / dspi->oper_word_size; - if (dspi->words_in_flight > dspi->devtype_data->fifo_size) - dspi->words_in_flight = dspi->devtype_data->fifo_size; + dspi->words_in_flight = min(dspi->len / dspi->oper_word_size, + dspi_dma_max_datawords(dspi)); message->actual_length += dspi->words_in_flight * dspi->oper_word_size; @@ -637,7 +656,6 @@ static void dspi_dma_xfer(struct fsl_dspi *dspi) static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) { - int dma_bufsize = dspi->devtype_data->fifo_size * 2; struct device *dev = &dspi->pdev->dev; struct dma_slave_config cfg; struct fsl_dspi_dma *dma; @@ -657,8 +675,10 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) goto err_tx_channel; } + dma->bufsize = PAGE_SIZE; + dma->tx_dma_buf = dma_alloc_noncoherent(dma->chan_tx->device->dev, - dma_bufsize, &dma->tx_dma_phys, + dma->bufsize, &dma->tx_dma_phys, DMA_TO_DEVICE, GFP_KERNEL); if (!dma->tx_dma_buf) { ret = -ENOMEM; @@ -666,7 +686,7 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) } dma->rx_dma_buf = dma_alloc_noncoherent(dma->chan_rx->device->dev, - dma_bufsize, &dma->rx_dma_phys, + dma->bufsize, &dma->rx_dma_phys, DMA_FROM_DEVICE, GFP_KERNEL); if (!dma->rx_dma_buf) { ret = -ENOMEM; @@ -702,11 +722,11 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) return 0; err_slave_config: - dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize, + dma_free_noncoherent(dma->chan_rx->device->dev, dma->bufsize, dma->rx_dma_buf, dma->rx_dma_phys, DMA_FROM_DEVICE); err_rx_dma_buf: - dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize, + dma_free_noncoherent(dma->chan_tx->device->dev, dma->bufsize, dma->tx_dma_buf, dma->tx_dma_phys, DMA_TO_DEVICE); err_tx_dma_buf: dma_release_channel(dma->chan_tx); @@ -721,21 +741,20 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) static void dspi_release_dma(struct fsl_dspi *dspi) { - int dma_bufsize = 
dspi->devtype_data->fifo_size * 2;
 	struct fsl_dspi_dma *dma = dspi->dma;

 	if (!dma)
 		return;

 	if (dma->chan_tx) {
-		dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize,
+		dma_free_noncoherent(dma->chan_tx->device->dev, dma->bufsize,
 				     dma->tx_dma_buf, dma->tx_dma_phys,
 				     DMA_TO_DEVICE);
 		dma_release_channel(dma->chan_tx);
 	}

 	if (dma->chan_rx) {
-		dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize,
+		dma_free_noncoherent(dma->chan_rx->device->dev, dma->bufsize,
 				     dma->rx_dma_buf, dma->rx_dma_phys,
 				     DMA_FROM_DEVICE);
 		dma_release_channel(dma->chan_rx);

From 7d9baf1e530930e28b45805e3855a4a465a9e36e Mon Sep 17 00:00:00 2001
From: James Clark
Date: Tue, 2 Sep 2025 13:44:58 +0100
Subject: [PATCH 0772/3206] spi: spi-fsl-dspi: Increase target mode DMA buffer size

When the device is configured as a target, the host won't stop sending data while we're draining the buffer, which leads to FIFO underflows and corruption. Increase the DMA buffer size to the maximum number of words that eDMA can transfer at once to reduce the chance of this happening.

In host mode, the driver is able to split up a transfer into smaller chunks so we don't need to increase the size. While in target mode, the length of the transfer is determined by the remote host and can be larger than whatever default buffer size we pick. Keeping the buffer small in host mode avoids wasting memory, but allocating the largest possible in target mode gives the lowest possible chance of dropping any data from the host.

While we could allocate per-transfer using the exact size of the transfer, 128K is quite a large allocation and there is a chance it could fail due to memory fragmentation unless it's allocated once at init time.

Signed-off-by: Larisa Grigore
Signed-off-by: James Clark
Reviewed-by: Vladimir Oltean
Message-ID: <20250902-james-nxp-spi-dma-v6-6-f7aa2c5e56e2@linaro.org>
Signed-off-by: Mark Brown
---
 drivers/spi/spi-fsl-dspi.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c
index 6bf87ef01c13d1..3d29285c772c44 100644
--- a/drivers/spi/spi-fsl-dspi.c
+++ b/drivers/spi/spi-fsl-dspi.c
@@ -675,7 +675,18 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr)
 		goto err_tx_channel;
 	}

-	dma->bufsize = PAGE_SIZE;
+	if (spi_controller_is_target(dspi->ctlr)) {
+		/*
+		 * In target mode we have to be ready to receive the maximum
+		 * that can possibly be transferred at once by EDMA without any
+		 * FIFO underflows.
+		 */
+		dma->bufsize = min(dma_get_max_seg_size(dma->chan_rx->device->dev),
+				   dma_get_max_seg_size(dma->chan_tx->device->dev)) *
+				   DMA_SLAVE_BUSWIDTH_4_BYTES;
+	} else {
+		dma->bufsize = PAGE_SIZE;
+	}

 	dma->tx_dma_buf = dma_alloc_noncoherent(dma->chan_tx->device->dev,
 						dma->bufsize, &dma->tx_dma_phys,

From 5cc49b5a36b32a2dba41441ea13b93fb5ea21cfd Mon Sep 17 00:00:00 2001
From: James Clark
Date: Tue, 2 Sep 2025 13:44:59 +0100
Subject: [PATCH 0773/3206] spi: spi-fsl-dspi: Report FIFO overflows as errors

In target mode, the host sending more data than can be consumed would be a common problem for any message exceeding the FIFO or DMA buffer size. Cancel the whole message as soon as this condition is hit as the message will be corrupted.

Only do this for target mode in a DMA transfer; it's not likely these flags will be set in host mode so it's not worth adding extra checks. In IRQ and polling modes we use the same transfer functions for hosts and targets so the error flags always get checked.
This is slightly inconsistent but it's not worth doing the check conditionally because it may catch some host programming errors in the future. Signed-off-by: James Clark Reviewed-by: Vladimir Oltean Message-ID: <20250902-james-nxp-spi-dma-v6-7-f7aa2c5e56e2@linaro.org> Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-dspi.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 3d29285c772c44..83ea296597e946 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -480,6 +480,17 @@ static void dspi_push_rx(struct fsl_dspi *dspi, u32 rxdata) dspi->dev_to_host(dspi, rxdata); } +static int dspi_fifo_error(struct fsl_dspi *dspi, u32 spi_sr) +{ + if (spi_sr & (SPI_SR_TFUF | SPI_SR_RFOF)) { + dev_err_ratelimited(&dspi->pdev->dev, "FIFO errors:%s%s\n", + spi_sr & SPI_SR_TFUF ? " TX underflow," : "", + spi_sr & SPI_SR_RFOF ? " RX overflow," : ""); + return -EIO; + } + return 0; +} + #if IS_ENABLED(CONFIG_DMA_ENGINE) /* Prepare one TX FIFO entry (txdata plus cmd) */ @@ -553,6 +564,7 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) struct device *dev = &dspi->pdev->dev; struct fsl_dspi_dma *dma = dspi->dma; int time_left; + u32 spi_sr; int i; for (i = 0; i < dspi->words_in_flight; i++) @@ -603,7 +615,8 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) if (spi_controller_is_target(dspi->ctlr)) { wait_for_completion_interruptible(&dspi->dma->cmd_rx_complete); - return 0; + regmap_read(dspi->regmap, SPI_SR, &spi_sr); + return dspi_fifo_error(dspi, spi_sr); } time_left = wait_for_completion_timeout(&dspi->dma->cmd_tx_complete, @@ -1073,6 +1086,10 @@ static void dspi_poll(struct fsl_dspi *dspi) for (tries = 1000; tries > 0; --tries) { regmap_read(dspi->regmap, SPI_SR, &spi_sr); regmap_write(dspi->regmap, SPI_SR, spi_sr); + + dspi->cur_msg->status = dspi_fifo_error(dspi, spi_sr); + if (dspi->cur_msg->status) + return; if (spi_sr & SPI_SR_CMDTCF) break; } @@ -1088,6 +1105,7 @@ static void dspi_poll(struct fsl_dspi *dspi) static irqreturn_t dspi_interrupt(int irq, void *dev_id) { struct fsl_dspi *dspi = (struct fsl_dspi *)dev_id; + int status; u32 spi_sr; regmap_read(dspi->regmap, SPI_SR, &spi_sr); @@ -1096,6 +1114,14 @@ static irqreturn_t dspi_interrupt(int irq, void *dev_id) if (!(spi_sr & SPI_SR_CMDTCF)) return IRQ_NONE; + status = dspi_fifo_error(dspi, spi_sr); + if (status) { + if (dspi->cur_msg) + WRITE_ONCE(dspi->cur_msg->status, status); + complete(&dspi->xfer_done); + return IRQ_HANDLED; + } + if (dspi_rxtx(dspi) == false) { if (dspi->cur_msg) WRITE_ONCE(dspi->cur_msg->status, 0); From 673f1244b3d47c9b41cda3473c062bec586387be Mon Sep 17 00:00:00 2001 From: Wladislav Wiebe Date: Mon, 4 Aug 2025 11:35:25 +0200 Subject: [PATCH 0774/3206] genirq: Add support for warning on long-running interrupt handlers Introduce a mechanism to detect and warn about prolonged interrupt handlers. With a new command-line parameter (irqhandler.duration_warn_us=), users can configure the duration threshold in microseconds when a warning in such format should be emitted: "[CPU14] long duration of IRQ[159:bad_irq_handler [long_irq]], took: 1330 us" The implementation uses local_clock() to measure the execution duration of the generic IRQ per-CPU event handler. 
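Example usage on the kernel command line (the threshold value here is illustrative):

	irqhandler.duration_warn_us=500

The value is converted once at boot to nanoseconds (500 us -> 500000 ns) and compared against the local_clock() delta taken around each handler invocation; when the parameter is absent, the only cost on the fast path is a disabled static branch.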
Signed-off-by: Wladislav Wiebe Signed-off-by: Thomas Gleixner Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/all/20250804093525.851-1-wladislav.wiebe@nokia.com --- .../admin-guide/kernel-parameters.txt | 5 ++ kernel/irq/handle.c | 49 ++++++++++++++++++- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 747a55abf4946b..bdbc44f52e4a43 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2606,6 +2606,11 @@ for it. Intended to get systems with badly broken firmware running. + irqhandler.duration_warn_us= [KNL] + Warn if an IRQ handler exceeds the specified duration + threshold in microseconds. Useful for identifying + long-running IRQs in the system. + irqpoll [HW] When an interrupt is not handled search all handlers for it. Also check all handlers each timer diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9489f93b3db344..e103451243a0b6 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,6 +136,44 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } +static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled); +static u64 irqhandler_duration_threshold_ns __ro_after_init; + +static int __init irqhandler_duration_check_setup(char *arg) +{ + unsigned long val; + int ret; + + ret = kstrtoul(arg, 0, &val); + if (ret) { + pr_err("Unable to parse irqhandler.duration_warn_us setting: ret=%d\n", ret); + return 0; + } + + if (!val) { + pr_err("Invalid irqhandler.duration_warn_us setting, must be > 0\n"); + return 0; + } + + irqhandler_duration_threshold_ns = val * 1000; + static_branch_enable(&irqhandler_duration_check_enabled); + + return 1; +} +__setup("irqhandler.duration_warn_us=", irqhandler_duration_check_setup); + +static inline void irqhandler_duration_check(u64 ts_start, unsigned int irq, + const struct irqaction *action) +{ + u64 delta_ns = local_clock() - ts_start; + + if (unlikely(delta_ns > irqhandler_duration_threshold_ns)) { + pr_warn_ratelimited("[CPU%u] long duration of IRQ[%u:%ps], took: %llu us\n", + smp_processor_id(), irq, action->handler, + div_u64(delta_ns, NSEC_PER_USEC)); + } +} + irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; @@ -155,7 +193,16 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) lockdep_hardirq_threaded(); trace_irq_handler_entry(irq, action); - res = action->handler(irq, action->dev_id); + + if (static_branch_unlikely(&irqhandler_duration_check_enabled)) { + u64 ts_start = local_clock(); + + res = action->handler(irq, action->dev_id); + irqhandler_duration_check(ts_start, irq, action); + } else { + res = action->handler(irq, action->dev_id); + } + trace_irq_handler_exit(irq, action, res); if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", From 2292c8061c783e6900fcedcee388f69deab765ef Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Wed, 3 Sep 2025 11:06:48 +0800 Subject: [PATCH 0775/3206] EDAC/skx_common: Use topology_physical_package_id() instead of open coding Use topology_physical_package_id() to get the CPU package ID instead of open coding. 
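The substance of the change, condensed (both forms yield the same package ID; the helper just hides the cpuinfo layout behind the generic topology API):

	struct cpuinfo_x86 *c = &cpu_data(cpu);

	*id = c->topo.pkg_id;				/* before: open coded */
	*id = topology_physical_package_id(cpu);	/* after: topology helper */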
Suggested-by: Zhang Rui Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20250903030648.3285935-1-qiuxu.zhuo@intel.com --- drivers/edac/skx_common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 5899a111049551..724842f512acac 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -14,6 +14,7 @@ * Copyright (c) 2018, Intel Corporation. */ +#include #include #include #include @@ -278,7 +279,7 @@ static int skx_get_pkg_id(struct skx_dev *d, u8 *id) struct cpuinfo_x86 *c = &cpu_data(cpu); if (c->initialized && cpu_to_node(cpu) == node) { - *id = c->topo.pkg_id; + *id = topology_physical_package_id(cpu); return 0; } } From c9163915a93d40e32c4e4aeb942c0adcb190d72e Mon Sep 17 00:00:00 2001 From: David Gow Date: Sat, 16 Aug 2025 17:45:28 +0800 Subject: [PATCH 0776/3206] genirq/test: Fix depth tests on architectures with NOREQUEST by default. The new irq KUnit tests fail on some architectures (notably PowerPC and 32-bit ARM), as the request_irq() call fails due to the ARCH_IRQ_INIT_FLAGS containing IRQ_NOREQUEST, yielding the following errors: [10:17:45] # irq_free_disabled_test: EXPECTATION FAILED at kernel/irq/irq_test.c:88 [10:17:45] Expected ret == 0, but [10:17:45] ret == -22 (0xffffffffffffffea) [10:17:45] # irq_free_disabled_test: EXPECTATION FAILED at kernel/irq/irq_test.c:90 [10:17:45] Expected desc->depth == 0, but [10:17:45] desc->depth == 1 (0x1) [10:17:45] # irq_free_disabled_test: EXPECTATION FAILED at kernel/irq/irq_test.c:93 [10:17:45] Expected desc->depth == 1, but [10:17:45] desc->depth == 2 (0x2) By clearing IRQ_NOREQUEST from the interrupt descriptor, these tests now pass on ARM and PowerPC. Fixes: 66067c3c8a1e ("genirq: Add kunit tests for depth counts") Signed-off-by: David Gow Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Tested-by: Brian Norris Reviewed-by: Brian Norris Link: https://lore.kernel.org/all/20250816094528.3560222-2-davidgow@google.com --- kernel/irq/irq_test.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c index a75abebed7f248..e220e7b2fc1877 100644 --- a/kernel/irq/irq_test.c +++ b/kernel/irq/irq_test.c @@ -54,6 +54,9 @@ static void irq_disable_depth_test(struct kunit *test) desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); + /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */ + irq_settings_clr_norequest(desc); + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); KUNIT_EXPECT_EQ(test, ret, 0); @@ -81,6 +84,9 @@ static void irq_free_disabled_test(struct kunit *test) desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); + /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */ + irq_settings_clr_norequest(desc); + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); KUNIT_EXPECT_EQ(test, ret, 0); @@ -120,6 +126,9 @@ static void irq_shutdown_depth_test(struct kunit *test) desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); + /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */ + irq_settings_clr_norequest(desc); + data = irq_desc_get_irq_data(desc); KUNIT_ASSERT_PTR_NE(test, data, NULL); @@ -180,6 +189,9 @@ static void irq_cpuhotplug_test(struct kunit *test) desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); + /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. 
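For context, the failing path: with IRQ_NOREQUEST set, request_irq() rejects the descriptor up front with -EINVAL (-22, matching the log above), before any handler is installed. The fix clears the flag during test setup:

	desc = irq_to_desc(virq);
	/* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */
	irq_settings_clr_norequest(desc);

	ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
	KUNIT_EXPECT_EQ(test, ret, 0);		/* no longer -EINVAL */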
*/ + irq_settings_clr_norequest(desc); + data = irq_desc_get_irq_data(desc); KUNIT_ASSERT_PTR_NE(test, data, NULL); From f8a44f9babd054ff19e20a30cab661d716ad5459 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 22 Aug 2025 11:59:02 -0700 Subject: [PATCH 0777/3206] genirq/test: Select IRQ_DOMAIN These tests use irq_domain_alloc_descs() and so require CONFIG_IRQ_DOMAIN. Fixes: 66067c3c8a1e ("genirq: Add kunit tests for depth counts") Reported-by: Guenter Roeck Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Reviewed-by: David Gow Link: https://lore.kernel.org/all/20250822190140.2154646-2-briannorris@chromium.org Closes: https://lore.kernel.org/lkml/ded44edf-eeb7-420c-b8a8-d6543b955e6e@roeck-us.net/ --- kernel/irq/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 36673640c4fc25..8bc4de38d7afd5 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -144,6 +144,7 @@ config IRQ_KUNIT_TEST bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS depends on KUNIT=y default KUNIT_ALL_TESTS + select IRQ_DOMAIN imply SMP help This option enables KUnit tests for the IRQ subsystem API. These are From 59405c248acea65d534497bbe29f34858b0fdd3c Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 22 Aug 2025 11:59:03 -0700 Subject: [PATCH 0778/3206] genirq/test: Factor out fake-virq setup A few things need to be repeated in tests. Factor out the creation of fake interrupts. Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Reviewed-by: David Gow Link: https://lore.kernel.org/all/20250822190140.2154646-3-briannorris@chromium.org --- kernel/irq/irq_test.c | 45 +++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c index e220e7b2fc1877..f8f4532c28055e 100644 --- a/kernel/irq/irq_test.c +++ b/kernel/irq/irq_test.c @@ -41,15 +41,15 @@ static struct irq_chip fake_irq_chip = { .flags = IRQCHIP_SKIP_SET_WAKE, }; -static void irq_disable_depth_test(struct kunit *test) +static int irq_test_setup_fake_irq(struct kunit *test, struct irq_affinity_desc *affd) { struct irq_desc *desc; - int virq, ret; + int virq; - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, affd); KUNIT_ASSERT_GE(test, virq, 0); - irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); @@ -57,6 +57,19 @@ static void irq_disable_depth_test(struct kunit *test) /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. 
*/ irq_settings_clr_norequest(desc); + return virq; +} + +static void irq_disable_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_test_setup_fake_irq(test, NULL); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); KUNIT_EXPECT_EQ(test, ret, 0); @@ -76,17 +89,11 @@ static void irq_free_disabled_test(struct kunit *test) struct irq_desc *desc; int virq, ret; - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); - KUNIT_ASSERT_GE(test, virq, 0); - - irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + virq = irq_test_setup_fake_irq(test, NULL); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); - /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */ - irq_settings_clr_norequest(desc); - ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); KUNIT_EXPECT_EQ(test, ret, 0); @@ -118,17 +125,11 @@ static void irq_shutdown_depth_test(struct kunit *test) if (!IS_ENABLED(CONFIG_SMP)) kunit_skip(test, "requires CONFIG_SMP for managed shutdown"); - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); - KUNIT_ASSERT_GE(test, virq, 0); - - irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + virq = irq_test_setup_fake_irq(test, &affinity); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); - /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */ - irq_settings_clr_norequest(desc); - data = irq_desc_get_irq_data(desc); KUNIT_ASSERT_PTR_NE(test, data, NULL); @@ -181,17 +182,11 @@ static void irq_cpuhotplug_test(struct kunit *test) cpumask_copy(&affinity.mask, cpumask_of(1)); - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); - KUNIT_ASSERT_GE(test, virq, 0); - - irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq); + virq = irq_test_setup_fake_irq(test, &affinity); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); - /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */ - irq_settings_clr_norequest(desc); - data = irq_desc_get_irq_data(desc); KUNIT_ASSERT_PTR_NE(test, data, NULL); From 988f45467f13c038f73a91f5154b66f278f495d4 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 22 Aug 2025 11:59:04 -0700 Subject: [PATCH 0779/3206] genirq/test: Fail early if interrupt request fails Requesting an interrupt is part of the basic test setup. If it fails, most of the subsequent tests are likely to fail, and the output gets noisy. Use "assert" to fail early. 
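The difference in one line: KUNIT_EXPECT_* records a failure and keeps executing the test case, while KUNIT_ASSERT_* records it and aborts the case immediately, so a failed setup step no longer produces a cascade of follow-on failures:

	ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
	KUNIT_ASSERT_EQ(test, ret, 0);		/* bail out here on failure... */

	KUNIT_EXPECT_EQ(test, desc->depth, 0);	/* ...instead of also failing here */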
Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Reviewed-by: David Gow Link: https://lore.kernel.org/all/20250822190140.2154646-4-briannorris@chromium.org --- kernel/irq/irq_test.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c index f8f4532c28055e..56baeb5041d6b0 100644 --- a/kernel/irq/irq_test.c +++ b/kernel/irq/irq_test.c @@ -71,7 +71,7 @@ static void irq_disable_depth_test(struct kunit *test) KUNIT_ASSERT_PTR_NE(test, desc, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_EQ(test, desc->depth, 0); @@ -95,7 +95,7 @@ static void irq_free_disabled_test(struct kunit *test) KUNIT_ASSERT_PTR_NE(test, desc, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_EQ(test, desc->depth, 0); @@ -106,7 +106,7 @@ static void irq_free_disabled_test(struct kunit *test) KUNIT_EXPECT_GE(test, desc->depth, 1); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_EQ(test, desc->depth, 0); free_irq(virq, NULL); @@ -134,7 +134,7 @@ static void irq_shutdown_depth_test(struct kunit *test) KUNIT_ASSERT_PTR_NE(test, data, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); @@ -191,7 +191,7 @@ static void irq_cpuhotplug_test(struct kunit *test) KUNIT_ASSERT_PTR_NE(test, data, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); From 0c888bc86d672e551ce5c58b891c8b44f8967643 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 22 Aug 2025 11:59:05 -0700 Subject: [PATCH 0780/3206] genirq/test: Depend on SPARSE_IRQ Some architectures have a static interrupt layout, with a limited number of interrupts. Without SPARSE_IRQ, the test may not be able to allocate any fake interrupts, and the test will fail. (This occurs on ARCH=m68k, for example.) Additionally, managed-affinity is only supported with CONFIG_SPARSE_IRQ=y, so irq_shutdown_depth_test() and irq_cpuhotplug_test() would fail without it. Add a 'SPARSE_IRQ' dependency to avoid these problems. Many architectures 'select SPARSE_IRQ', so this is easy to miss. Notably, this also excludes ARCH=um from running any of these tests, even though some of them might work. 
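The resulting entry, with the design choice visible: SPARSE_IRQ is an architecture-level decision, so the test depends on it rather than select-ing it (a select could silently reconfigure a platform's IRQ layout), while IRQ_DOMAIN is a library-like facility that is safe to select:

config IRQ_KUNIT_TEST
	bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS
	depends on KUNIT=y
	depends on SPARSE_IRQ
	default KUNIT_ALL_TESTS
	select IRQ_DOMAIN
	imply SMP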
Fixes: 66067c3c8a1e ("genirq: Add kunit tests for depth counts")
Reported-by: Guenter Roeck
Signed-off-by: Brian Norris
Signed-off-by: Thomas Gleixner
Tested-by: Guenter Roeck
Reviewed-by: David Gow
Link: https://lore.kernel.org/all/20250822190140.2154646-5-briannorris@chromium.org
---
 kernel/irq/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8bc4de38d7afd5..1b4254d19a73ec 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -143,6 +143,7 @@ config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
 config IRQ_KUNIT_TEST
 	bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS
 	depends on KUNIT=y
+	depends on SPARSE_IRQ
 	default KUNIT_ALL_TESTS
 	select IRQ_DOMAIN
 	imply SMP

From add03fdb9d52411cabb3872fb7692df6f4c67586 Mon Sep 17 00:00:00 2001
From: Brian Norris
Date: Fri, 22 Aug 2025 11:59:06 -0700
Subject: [PATCH 0781/3206] genirq/test: Drop CONFIG_GENERIC_IRQ_MIGRATION assumptions

Not all platforms use the generic IRQ migration code, even if they select GENERIC_IRQ_MIGRATION. (See, for example, powerpc / pseries_cpu_disable().) If such platforms don't perform managed shutdown the same way, the interrupt may not actually shut down, and these tests fail:

[ 4.357022][ T101] # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:211
[ 4.357022][ T101] Expected irqd_is_activated(data) to be false, but is true
[ 4.358128][ T101] # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:212
[ 4.358128][ T101] Expected irqd_is_started(data) to be false, but is true
[ 4.375558][ T101] # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:216
[ 4.375558][ T101] Expected irqd_is_activated(data) to be false, but is true
[ 4.376088][ T101] # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:217
[ 4.376088][ T101] Expected irqd_is_started(data) to be false, but is true
[ 4.377851][ T1] # irq_cpuhotplug_test: pass:0 fail:1 skip:0 total:1
[ 4.377901][ T1] not ok 4 irq_cpuhotplug_test
[ 4.378073][ T1] # irq_test_cases: pass:3 fail:1 skip:0 total:4

Rather than test that PowerPC performs migration the same way as the interrupt core, just drop the state checks. The point of the test was to ensure that the code kept |depth| balanced, which can still be tested for.
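What the retained checks still guarantee, in condensed form (the leading disable_irq() reflects the test's earlier setup; see the diff below):

	disable_irq(virq);
	KUNIT_EXPECT_EQ(test, desc->depth, 1);	/* one explicit disable */
	KUNIT_EXPECT_EQ(test, remove_cpu(1), 0);
	KUNIT_EXPECT_GE(test, desc->depth, 1);	/* hotplug may stack its own */
	KUNIT_EXPECT_EQ(test, add_cpu(1), 0);
	KUNIT_EXPECT_EQ(test, desc->depth, 1);	/* balanced again */
	enable_irq(virq);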
Fixes: 66067c3c8a1e ("genirq: Add kunit tests for depth counts") Reported-by: Guenter Roeck Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Reviewed-by: David Gow Link: https://lore.kernel.org/all/20250822190140.2154646-6-briannorris@chromium.org --- kernel/irq/irq_test.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c index 56baeb5041d6b0..bbb89a3e11538d 100644 --- a/kernel/irq/irq_test.c +++ b/kernel/irq/irq_test.c @@ -203,13 +203,9 @@ static void irq_cpuhotplug_test(struct kunit *test) KUNIT_EXPECT_EQ(test, desc->depth, 1); KUNIT_EXPECT_EQ(test, remove_cpu(1), 0); - KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); - KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); KUNIT_EXPECT_GE(test, desc->depth, 1); KUNIT_EXPECT_EQ(test, add_cpu(1), 0); - KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); - KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); KUNIT_EXPECT_EQ(test, desc->depth, 1); enable_irq(virq); From 8ad25ebfa70e86860559b306bbc923c7db4fcac6 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 22 Aug 2025 11:59:07 -0700 Subject: [PATCH 0782/3206] genirq/test: Ensure CPU 1 is online for hotplug test It's possible to run these tests on platforms that think they have a hotpluggable CPU1, but for whatever reason, CPU1 is not online and can't be brought online: # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:210 Expected remove_cpu(1) == 0, but remove_cpu(1) == 1 (0x1) CPU1: failed to boot: -38 # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:214 Expected add_cpu(1) == 0, but add_cpu(1) == -38 (0xffffffffffffffda) Check that CPU1 is actually online before trying to run the test. Fixes: 66067c3c8a1e ("genirq: Add kunit tests for depth counts") Reported-by: Guenter Roeck Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Reviewed-by: David Gow Link: https://lore.kernel.org/all/20250822190140.2154646-7-briannorris@chromium.org --- kernel/irq/irq_test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c index bbb89a3e11538d..e2d31914b3c443 100644 --- a/kernel/irq/irq_test.c +++ b/kernel/irq/irq_test.c @@ -179,6 +179,8 @@ static void irq_cpuhotplug_test(struct kunit *test) kunit_skip(test, "requires more than 1 CPU for CPU hotplug"); if (!cpu_is_hotpluggable(1)) kunit_skip(test, "CPU 1 must be hotpluggable"); + if (!cpu_online(1)) + kunit_skip(test, "CPU 1 must be online"); cpumask_copy(&affinity.mask, cpumask_of(1)); From 37dbd78f98a80e89b5413f4649d0fbd023d99b2f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:08 +0200 Subject: [PATCH 0783/3206] x86/sev: Move GHCB page based HV communication out of startup code Both the decompressor and the core kernel implement an early #VC handler, which only deals with CPUID instructions, and full featured one, which can handle any #VC exception. The former communicates with the hypervisor using the MSR based protocol, whereas the latter uses a shared GHCB page, which is configured a bit later during the boot, when the kernel runs from its ordinary virtual mapping, rather than the 1:1 mapping that the startup code uses. Accessing this shared GHCB page from the core kernel's startup code is problematic, because it involves converting the GHCB address provided by the caller to a physical address. 
In the startup code, virtual to physical address translations are problematic, given that the virtual address might be a 1:1 mapped address, and such translations should therefore be avoided. This means that exposing startup code dealing with the GHCB to callers that execute from the ordinary kernel virtual mapping should be avoided too. So move all GHCB page based communication out of the startup code, now that all communication occurring before the kernel virtual mapping is up relies on the MSR protocol only. As an exception, add a flag representing the need to apply the coherency fix in order to avoid exporting CPUID* helpers because of the code running too early for the *cpu_has* infrastructure. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-29-ardb+git@google.com --- arch/x86/boot/compressed/sev-handle-vc.c | 3 + arch/x86/boot/compressed/sev.c | 2 + arch/x86/boot/cpuflags.c | 13 -- arch/x86/boot/startup/sev-shared.c | 145 +---------------------- arch/x86/boot/startup/sev-startup.c | 42 ------- arch/x86/boot/startup/sme.c | 1 + arch/x86/coco/sev/core.c | 76 ++++++++++++ arch/x86/coco/sev/vc-handle.c | 2 + arch/x86/coco/sev/vc-shared.c | 94 +++++++++++++++ arch/x86/include/asm/sev-internal.h | 7 +- arch/x86/include/asm/sev.h | 12 +- 11 files changed, 196 insertions(+), 201 deletions(-) diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c index 89dd02de2a0f05..7530ad8b768b1d 100644 --- a/arch/x86/boot/compressed/sev-handle-vc.c +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include "error.h" #include "sev.h" #include @@ -14,6 +15,8 @@ #include #define __BOOT_COMPRESSED +#undef __init +#define __init /* Basic instruction decoding support needed */ #include "../../lib/inat.c" diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 3628e9bddc6a13..f197173d60e671 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -371,6 +371,8 @@ static int sev_check_cpu_support(void) if (!(eax & BIT(1))) return -ENODEV; + sev_snp_needs_sfw = !(ebx & BIT(31)); + return ebx & 0x3f; } diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index 63e037e94e4c03..916bac09b464da 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -106,18 +106,5 @@ void get_cpuflags(void) cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6], &cpu.flags[1]); } - - if (max_amd_level >= 0x8000001f) { - u32 ebx; - - /* - * The X86_FEATURE_COHERENCY_SFW_NO feature bit is in - * the virtualization flags entry (word 8) and set by - * scattered.c, so the bit needs to be explicitly set. - */ - cpuid(0x8000001f, &ignored, &ebx, &ignored, &ignored); - if (ebx & BIT(31)) - set_bit(X86_FEATURE_COHERENCY_SFW_NO, cpu.flags); - } } } diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 83c222a4f1fa9e..348811aa784713 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -13,12 +13,9 @@ #ifndef __BOOT_COMPRESSED #define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) #else #undef WARN #define WARN(condition, format...) (!!(condition)) -#undef vc_forward_exception -#define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n") #endif /* @@ -39,7 +36,7 @@ u64 boot_svsm_caa_pa __ro_after_init; * * GHCB protocol version negotiated with the hypervisor. 
*/ -static u16 ghcb_version __ro_after_init; +u16 ghcb_version __ro_after_init; /* Copy of the SNP firmware's CPUID page. */ static struct snp_cpuid_table cpuid_table_copy __ro_after_init; @@ -54,15 +51,7 @@ static u32 cpuid_std_range_max __ro_after_init; static u32 cpuid_hyp_range_max __ro_after_init; static u32 cpuid_ext_range_max __ro_after_init; -bool __init sev_es_check_cpu_features(void) -{ - if (!has_cpuflag(X86_FEATURE_RDRAND)) { - error("RDRAND instruction not supported - no trusted source of randomness available\n"); - return false; - } - - return true; -} +bool sev_snp_needs_sfw; void __head __noreturn sev_es_terminate(unsigned int set, unsigned int reason) @@ -100,72 +89,7 @@ u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } -void snp_register_ghcb_early(unsigned long paddr) -{ - unsigned long pfn = paddr >> PAGE_SHIFT; - u64 val; - - sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - /* If the response GPA is not ours then abort the guest */ - if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || - (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); -} - -bool sev_es_negotiate_protocol(void) -{ - u64 val; - - /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - - if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) - return false; - - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) - return false; - - ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); - - return true; -} - -static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - u32 ret; - - ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); - if (!ret) - return ES_OK; - - if (ret == 1) { - u64 info = ghcb->save.sw_exit_info_2; - unsigned long v = info & SVM_EVTINJ_VEC_MASK; - - /* Check if exception information from hypervisor is sane. */ - if ((info & SVM_EVTINJ_VALID) && - ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && - ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { - ctxt->fi.vector = v; - - if (info & SVM_EVTINJ_VALID_ERR) - ctxt->fi.error_code = info >> 32; - - return ES_EXCEPTION; - } - } - - return ES_VMM_ERROR; -} - -static inline int svsm_process_result_codes(struct svsm_call *call) +int svsm_process_result_codes(struct svsm_call *call) { switch (call->rax_out) { case SVSM_SUCCESS: @@ -193,7 +117,7 @@ static inline int svsm_process_result_codes(struct svsm_call *call) * - RAX specifies the SVSM protocol/callid as input and the return code * as output. */ -static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending) +void svsm_issue_call(struct svsm_call *call, u8 *pending) { register unsigned long rax asm("rax") = call->rax; register unsigned long rcx asm("rcx") = call->rcx; @@ -216,7 +140,7 @@ static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending) call->r9_out = r9; } -static int svsm_perform_msr_protocol(struct svsm_call *call) +int svsm_perform_msr_protocol(struct svsm_call *call) { u8 pending = 0; u64 val, resp; @@ -247,63 +171,6 @@ static int svsm_perform_msr_protocol(struct svsm_call *call) return svsm_process_result_codes(call); } -static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) -{ - struct es_em_ctxt ctxt; - u8 pending = 0; - - vc_ghcb_invalidate(ghcb); - - /* - * Fill in protocol and format specifiers. 
This can be called very early - * in the boot, so use rip-relative references as needed. - */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - - svsm_issue_call(call, &pending); - - if (pending) - return -EINVAL; - - switch (verify_exception_info(ghcb, &ctxt)) { - case ES_OK: - break; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - fallthrough; - default: - return -EINVAL; - } - - return svsm_process_result_codes(call); -} - -enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) -{ - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - return verify_exception_info(ghcb, ctxt); -} - static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) { u64 val; @@ -793,7 +660,7 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, * If validating memory (making it private) and affected by the * cache-coherency vulnerability, perform the cache eviction mitigation. */ - if (validate && !has_cpuflag(X86_FEATURE_COHERENCY_SFW_NO)) + if (validate && sev_snp_needs_sfw) sev_evict_cache((void *)vaddr, 1); } diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 3da04a715831b4..fd18a00f000e87 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -41,15 +41,6 @@ #include #include -/* For early boot hypervisor communication in SEV-ES enabled guests */ -struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -struct ghcb *boot_ghcb __section(".data"); - /* Bitmap of SEV features supported by the hypervisor */ u64 sev_hv_features __ro_after_init; @@ -139,39 +130,6 @@ noinstr void __sev_put_ghcb(struct ghcb_state *state) } } -int svsm_perform_call_protocol(struct svsm_call *call) -{ - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - int ret; - - /* - * This can be called very early in the boot, use native functions in - * order to avoid paravirt issues. - */ - flags = native_local_irq_save(); - - if (sev_cfg.ghcbs_initialized) - ghcb = __sev_get_ghcb(&state); - else if (boot_ghcb) - ghcb = boot_ghcb; - else - ghcb = NULL; - - do { - ret = ghcb ? 
svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); - - if (sev_cfg.ghcbs_initialized) - __sev_put_ghcb(&state); - - native_local_irq_restore(flags); - - return ret; -} - void __head early_set_pages_state(unsigned long vaddr, unsigned long paddr, unsigned long npages, enum psc_op op) diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index 70ea1748c0a786..bf9153b9a3d9f2 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -521,6 +521,7 @@ void __head sme_enable(struct boot_params *bp) return; me_mask = 1UL << (ebx & 0x3f); + sev_snp_needs_sfw = !(ebx & BIT(31)); /* Check the SEV MSR whether SEV or SME is enabled */ sev_status = msr = native_rdmsrq(MSR_AMD64_SEV); diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 14ef5908fb2735..2a28d14425d497 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -101,6 +101,15 @@ DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); u8 snp_vmpl __ro_after_init; EXPORT_SYMBOL_GPL(snp_vmpl); +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +struct ghcb *boot_ghcb __section(".data"); + static u64 __init get_snp_jump_table_addr(void) { struct snp_secrets_page *secrets; @@ -154,6 +163,73 @@ static u64 __init get_jump_table_addr(void) return ret; } +static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) +{ + struct es_em_ctxt ctxt; + u8 pending = 0; + + vc_ghcb_invalidate(ghcb); + + /* + * Fill in protocol and format specifiers. This can be called very early + * in the boot, so use rip-relative references as needed. + */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + + svsm_issue_call(call, &pending); + + if (pending) + return -EINVAL; + + switch (verify_exception_info(ghcb, &ctxt)) { + case ES_OK: + break; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + fallthrough; + default: + return -EINVAL; + } + + return svsm_process_result_codes(call); +} + +static int svsm_perform_call_protocol(struct svsm_call *call) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + flags = native_local_irq_save(); + + if (sev_cfg.ghcbs_initialized) + ghcb = __sev_get_ghcb(&state); + else if (boot_ghcb) + ghcb = boot_ghcb; + else + ghcb = NULL; + + do { + ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) + : svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + if (sev_cfg.ghcbs_initialized) + __sev_put_ghcb(&state); + + native_local_irq_restore(flags); + + return ret; +} + static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size, int ret, u64 svsm_ret) { diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index c3b4acbde0d8c6..357389456296eb 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -351,6 +351,8 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, } #define sev_printk(fmt, ...) 
printk(fmt, ##__VA_ARGS__) +#define error(v) +#define has_cpuflag(f) boot_cpu_has(f) #include "vc-shared.c" diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c index b4688f69102e26..9b01c9ad81be62 100644 --- a/arch/x86/coco/sev/vc-shared.c +++ b/arch/x86/coco/sev/vc-shared.c @@ -409,6 +409,53 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + u32 ret; + + ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); + if (!ret) + return ES_OK; + + if (ret == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. */ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + + return ES_EXCEPTION; + } + } + + return ES_VMM_ERROR; +} + +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + return verify_exception_info(ghcb, ctxt); +} + static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) { u32 cr4 = native_read_cr4(); @@ -549,3 +596,50 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, return ES_OK; } + +void snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + +bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) + return false; + + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) + return false; + + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + + return true; +} diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 3dfd306d1c9e88..6199b35a82e4e4 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -2,7 +2,6 @@ #define DR7_RESET_VALUE 0x400 -extern struct ghcb boot_ghcb_page; extern u64 sev_hv_features; extern u64 sev_secrets_pa; @@ -80,7 +79,8 @@ static __always_inline u64 svsm_get_caa_pa(void) return boot_svsm_caa_pa; } -int svsm_perform_call_protocol(struct svsm_call *call); +enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt); +void 
vc_forward_exception(struct es_em_ctxt *ctxt); static inline u64 sev_es_rd_ghcb_msr(void) { @@ -97,9 +97,6 @@ static __always_inline void sev_es_wr_ghcb_msr(u64 val) native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); } -void snp_register_ghcb_early(unsigned long paddr); -bool sev_es_negotiate_protocol(void); -bool sev_es_check_cpu_features(void); u64 get_hv_features(void); const struct snp_cpuid_table *snp_cpuid_get_table(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index e4622e470ceb91..be9d7cb87ad0e7 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -503,6 +503,7 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) } void setup_ghcb(void); +void snp_register_ghcb_early(unsigned long paddr); void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages); void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, @@ -540,8 +541,6 @@ static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -void vc_forward_exception(struct es_em_ctxt *ctxt); - /* I/O parameters for CPUID-related helpers */ struct cpuid_leaf { u32 fn; @@ -552,16 +551,25 @@ struct cpuid_leaf { u32 edx; }; +int svsm_perform_msr_protocol(struct svsm_call *call); int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), void *ctx, struct cpuid_leaf *leaf); +void svsm_issue_call(struct svsm_call *call, u8 *pending); +int svsm_process_result_codes(struct svsm_call *call); + void __noreturn sev_es_terminate(unsigned int set, unsigned int reason); enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, struct es_em_ctxt *ctxt, u64 exit_code, u64 exit_info_1, u64 exit_info_2); +bool sev_es_negotiate_protocol(void); +bool sev_es_check_cpu_features(void); + +extern u16 ghcb_version; extern struct ghcb *boot_ghcb; +extern bool sev_snp_needs_sfw; #else /* !CONFIG_AMD_MEM_ENCRYPT */ From a5f03880f06a6da6ea5f1d966fffffcb3fc65462 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:09 +0200 Subject: [PATCH 0784/3206] x86/sev: Avoid global variable to store virtual address of SVSM area The boottime SVSM calling area is used both by the startup code running from a 1:1 mapping, and potentially later on running from the ordinary kernel mapping. This SVSM calling area is statically allocated, and so its physical address doesn't change. However, its virtual address depends on the calling context (1:1 mapping or kernel virtual mapping), and even though the variable that holds the virtual address of this calling area gets updated from 1:1 address to kernel address during the boot, it is hard to reason about why this is guaranteed to be safe. So instead, take the RIP-relative address of the boottime SVSM calling area whenever its virtual address is required, and only use a global variable for the physical address. 
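For illustration, a hedged sketch of the resulting access pattern; svsm_boot_caa() is a made-up name, while rip_rel_ptr(), boot_svsm_ca_page and boot_svsm_caa_pa are the objects this patch touches. It assumes the boot SVSM calling area has already been switched to boot_svsm_ca_page, as svsm_setup() does:

extern struct svsm_ca boot_svsm_ca_page;	/* statically allocated CA */
extern u64 boot_svsm_caa_pa;			/* fixed PA, kept in a global */

/*
 * Derive the VA from the calling context on each use: rip_rel_ptr()
 * resolves relative to whichever mapping the code is running from, so
 * the result is correct from both the 1:1 and the kernel mapping.
 */
static struct svsm_ca *svsm_boot_caa(void)
{
	return rip_rel_ptr(&boot_svsm_ca_page);
}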
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250828102202.1849035-30-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 5 ++--- arch/x86/boot/startup/sev-shared.c | 7 +------ arch/x86/boot/startup/sev-startup.c | 9 +++++---- arch/x86/coco/sev/core.c | 9 --------- arch/x86/include/asm/sev-internal.h | 3 +-- arch/x86/include/asm/sev.h | 2 -- arch/x86/mm/mem_encrypt_amd.c | 6 ------ 7 files changed, 9 insertions(+), 32 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index f197173d60e671..f2b8dfbd453ccc 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -37,12 +37,12 @@ struct ghcb *boot_ghcb; #define __BOOT_COMPRESSED -extern struct svsm_ca *boot_svsm_caa; extern u64 boot_svsm_caa_pa; struct svsm_ca *svsm_get_caa(void) { - return boot_svsm_caa; + /* The decompressor is mapped 1:1 so VA == PA */ + return (struct svsm_ca *)boot_svsm_caa_pa; } u64 svsm_get_caa_pa(void) @@ -532,7 +532,6 @@ bool early_is_sevsnp_guest(void) /* Obtain the address of the calling area to use */ boot_rdmsr(MSR_SVSM_CAA, &m); - boot_svsm_caa = (void *)m.q; boot_svsm_caa_pa = m.q; /* diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 348811aa784713..80d4fdada33a82 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -13,6 +13,7 @@ #ifndef __BOOT_COMPRESSED #define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) #else #undef WARN #define WARN(condition, format...) (!!(condition)) @@ -26,7 +27,6 @@ * early boot, both with identity mapped virtual addresses and proper kernel * virtual addresses. */ -struct svsm_ca *boot_svsm_caa __ro_after_init; u64 boot_svsm_caa_pa __ro_after_init; /* @@ -720,11 +720,6 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info, if (caa & (PAGE_SIZE - 1)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CAA); - /* - * The CA is identity mapped when this routine is called, both by the - * decompressor code and the early kernel code. - */ - boot_svsm_caa = (struct svsm_ca *)caa; boot_svsm_caa_pa = caa; /* Advertise the SVSM presence via CPUID. */ diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index fd18a00f000e87..8a06f602610132 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -252,6 +252,7 @@ static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) static __head void svsm_setup(struct cc_blob_sev_info *cc_info) { + struct snp_secrets_page *secrets = (void *)cc_info->secrets_phys; struct svsm_call call = {}; u64 pa; @@ -272,21 +273,21 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) pa = (u64)rip_rel_ptr(&boot_svsm_ca_page); /* - * Switch over to the boot SVSM CA while the current CA is still - * addressable. There is no GHCB at this point so use the MSR protocol. + * Switch over to the boot SVSM CA while the current CA is still 1:1 + * mapped and thus addressable with VA == PA. There is no GHCB at this + * point so use the MSR protocol. 
* * SVSM_CORE_REMAP_CA call: * RAX = 0 (Protocol=0, CallID=0) * RCX = New CA GPA */ - call.caa = svsm_get_caa(); + call.caa = (struct svsm_ca *)secrets->svsm_caa; call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); call.rcx = pa; if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); - boot_svsm_caa = (struct svsm_ca *)pa; boot_svsm_caa_pa = pa; } diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 2a28d14425d497..ff1e2be8b5a844 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1666,15 +1666,6 @@ void sev_show_status(void) pr_cont("\n"); } -void __init snp_update_svsm_ca(void) -{ - if (!snp_vmpl) - return; - - /* Update the CAA to a proper kernel address */ - boot_svsm_caa = &boot_svsm_ca_page; -} - #ifdef CONFIG_SYSFS static ssize_t vmpl_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 6199b35a82e4e4..ffe4755962fe8e 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -60,7 +60,6 @@ void early_set_pages_state(unsigned long vaddr, unsigned long paddr, DECLARE_PER_CPU(struct svsm_ca *, svsm_caa); DECLARE_PER_CPU(u64, svsm_caa_pa); -extern struct svsm_ca *boot_svsm_caa; extern u64 boot_svsm_caa_pa; static __always_inline struct svsm_ca *svsm_get_caa(void) @@ -68,7 +67,7 @@ static __always_inline struct svsm_ca *svsm_get_caa(void) if (sev_cfg.use_cas) return this_cpu_read(svsm_caa); else - return boot_svsm_caa; + return rip_rel_ptr(&boot_svsm_ca_page); } static __always_inline u64 svsm_get_caa_pa(void) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index be9d7cb87ad0e7..92b1269c877e41 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -519,7 +519,6 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); void sev_show_status(void); -void snp_update_svsm_ca(void); int prepare_pte_enc(struct pte_enc_desc *d); void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot); void snp_kexec_finish(void); @@ -601,7 +600,6 @@ static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } static inline void sev_show_status(void) { } -static inline void snp_update_svsm_ca(void) { } static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; } static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { } static inline void snp_kexec_finish(void) { } diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index faf3a13fb6ba0b..2f8c32173972d6 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -536,12 +536,6 @@ void __init sme_early_init(void) x86_init.resources.dmi_setup = snp_dmi_setup; } - /* - * Switch the SVSM CA mapping (if active) from identity mapped to - * kernel mapped. 
- */ - snp_update_svsm_ca(); - if (sev_status & MSR_AMD64_SNP_SECURE_TSC) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } From d5949ea50c5642ab7e3c4dd6020e23725c079b25 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:10 +0200 Subject: [PATCH 0785/3206] x86/sev: Share implementation of MSR-based page state change Both the decompressor and the SEV startup code implement the exact same sequence for invoking the MSR based communication protocol to effectuate a page state change. Before tweaking the internal APIs used in both versions, merge them and share them so those tweaks are only needed in a single place. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-31-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 40 +++-------------------------- arch/x86/boot/startup/sev-shared.c | 35 +++++++++++++++++++++++++ arch/x86/boot/startup/sev-startup.c | 29 +-------------------- 3 files changed, 39 insertions(+), 65 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index f2b8dfbd453ccc..6b62c75ea9117e 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -60,46 +60,12 @@ static bool sev_snp_enabled(void) return sev_status & MSR_AMD64_SEV_SNP_ENABLED; } -static void __page_state_change(unsigned long paddr, enum psc_op op) -{ - u64 val, msr; - - /* - * If private -> shared then invalidate the page before requesting the - * state change in the RMP table. - */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(paddr, paddr, false); - - /* Save the current GHCB MSR value */ - msr = sev_es_rd_ghcb_msr(); - - /* Issue VMGEXIT to change the page state in RMP table. */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - /* Read the response of the VMGEXIT. */ - val = sev_es_rd_ghcb_msr(); - if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); - - /* Restore the GHCB MSR value */ - sev_es_wr_ghcb_msr(msr); - - /* - * Now that page state is changed in the RMP table, validate it so that it is - * consistent with the RMP entry. 
- */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(paddr, paddr, true); -} - void snp_set_page_private(unsigned long paddr) { if (!sev_snp_enabled()) return; - __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE); + __page_state_change(paddr, paddr, SNP_PAGE_STATE_PRIVATE); } void snp_set_page_shared(unsigned long paddr) @@ -107,7 +73,7 @@ void snp_set_page_shared(unsigned long paddr) if (!sev_snp_enabled()) return; - __page_state_change(paddr, SNP_PAGE_STATE_SHARED); + __page_state_change(paddr, paddr, SNP_PAGE_STATE_SHARED); } bool early_setup_ghcb(void) @@ -133,7 +99,7 @@ bool early_setup_ghcb(void) void snp_accept_memory(phys_addr_t start, phys_addr_t end) { for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE) - __page_state_change(pa, SNP_PAGE_STATE_PRIVATE); + __page_state_change(pa, pa, SNP_PAGE_STATE_PRIVATE); } void sev_es_shutdown_ghcb(void) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 80d4fdada33a82..00220d7b981b0b 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -664,6 +664,41 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, sev_evict_cache((void *)vaddr, 1); } +static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, + enum psc_op op) +{ + u64 val, msr; + + /* + * If private -> shared then invalidate the page before requesting the + * state change in the RMP table. + */ + if (op == SNP_PAGE_STATE_SHARED) + pvalidate_4k_page(vaddr, paddr, false); + + /* Save the current GHCB MSR value */ + msr = sev_es_rd_ghcb_msr(); + + /* Issue VMGEXIT to change the page state in RMP table. */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + /* Read the response of the VMGEXIT. */ + val = sev_es_rd_ghcb_msr(); + if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + /* Restore the GHCB MSR value */ + sev_es_wr_ghcb_msr(msr); + + /* + * Now that page state is changed in the RMP table, validate it so that it is + * consistent with the RMP entry. + */ + if (op == SNP_PAGE_STATE_PRIVATE) + pvalidate_4k_page(vaddr, paddr, true); +} + /* * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 8a06f602610132..5eb7d939ebd345 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -135,7 +135,6 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, unsigned long npages, enum psc_op op) { unsigned long paddr_end; - u64 val; vaddr = vaddr & PAGE_MASK; @@ -143,37 +142,11 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, paddr_end = paddr + (npages << PAGE_SHIFT); while (paddr < paddr_end) { - /* Page validation must be rescinded before changing to shared */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(vaddr, paddr, false); - - /* - * Use the MSR protocol because this function can be called before - * the GHCB is established. 
- */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) - goto e_term; - - if (GHCB_MSR_PSC_RESP_VAL(val)) - goto e_term; - - /* Page validation must be performed after changing to private */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(vaddr, paddr, true); + __page_state_change(vaddr, paddr, op); vaddr += PAGE_SIZE; paddr += PAGE_SIZE; } - - return; - -e_term: - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); } void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, From 00d25566761746ba53934ad3a89ea79923a38d01 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:11 +0200 Subject: [PATCH 0786/3206] x86/sev: Pass SVSM calling area down to early page state change API The early page state change API is mostly only used very early, when only the boot time SVSM calling area is in use. However, this API is also called by the kexec finishing code, which runs very late, and potentially from a different CPU (which uses a different calling area). To avoid pulling the per-CPU SVSM calling area pointers and related SEV state into the startup code, refactor the page state change API so the SVSM calling area virtual and physical addresses can be provided by the caller. No functional change intended. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-32-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 24 +++++++++++++++++++++--- arch/x86/boot/startup/sev-shared.c | 23 ++++++++++++----------- arch/x86/boot/startup/sev-startup.c | 16 ++++++++++++---- arch/x86/coco/sev/core.c | 7 +++++-- arch/x86/include/asm/sev-internal.h | 2 +- arch/x86/include/asm/sev.h | 6 ++++++ 6 files changed, 57 insertions(+), 21 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 6b62c75ea9117e..0e567410d24d7f 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -62,18 +62,30 @@ static bool sev_snp_enabled(void) void snp_set_page_private(unsigned long paddr) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + if (!sev_snp_enabled()) return; - __page_state_change(paddr, paddr, SNP_PAGE_STATE_PRIVATE); + __page_state_change(paddr, paddr, &d); } void snp_set_page_shared(unsigned long paddr) { + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + if (!sev_snp_enabled()) return; - __page_state_change(paddr, paddr, SNP_PAGE_STATE_SHARED); + __page_state_change(paddr, paddr, &d); } bool early_setup_ghcb(void) @@ -98,8 +110,14 @@ bool early_setup_ghcb(void) void snp_accept_memory(phys_addr_t start, phys_addr_t end) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE) - __page_state_change(pa, pa, SNP_PAGE_STATE_PRIVATE); + __page_state_change(pa, pa, &d); } void sev_es_shutdown_ghcb(void) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 00220d7b981b0b..fb86f03da66317 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -602,7 +602,8 @@ static int __head svsm_call_msr_protocol(struct svsm_call *call) return ret; } -static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) +static void 
__head svsm_pval_4k_page(unsigned long paddr, bool validate, + struct svsm_ca *caa, u64 caa_pa) { struct svsm_pvalidate_call *pc; struct svsm_call call = {}; @@ -615,10 +616,10 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) */ flags = native_local_irq_save(); - call.caa = svsm_get_caa(); + call.caa = caa; pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer; - pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer); + pc_pa = caa_pa + offsetof(struct svsm_ca, svsm_buffer); pc->num_entries = 1; pc->cur_index = 0; @@ -644,12 +645,12 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) } static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, - bool validate) + bool validate, struct svsm_ca *caa, u64 caa_pa) { int ret; if (snp_vmpl) { - svsm_pval_4k_page(paddr, validate); + svsm_pval_4k_page(paddr, validate, caa, caa_pa); } else { ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); if (ret) @@ -665,7 +666,7 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, } static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, - enum psc_op op) + const struct psc_desc *desc) { u64 val, msr; @@ -673,14 +674,14 @@ static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, * If private -> shared then invalidate the page before requesting the * state change in the RMP table. */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(vaddr, paddr, false); + if (desc->op == SNP_PAGE_STATE_SHARED) + pvalidate_4k_page(vaddr, paddr, false, desc->ca, desc->caa_pa); /* Save the current GHCB MSR value */ msr = sev_es_rd_ghcb_msr(); /* Issue VMGEXIT to change the page state in RMP table. */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, desc->op)); VMGEXIT(); /* Read the response of the VMGEXIT. */ @@ -695,8 +696,8 @@ static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, * Now that page state is changed in the RMP table, validate it so that it is * consistent with the RMP entry. */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(vaddr, paddr, true); + if (desc->op == SNP_PAGE_STATE_PRIVATE) + pvalidate_4k_page(vaddr, paddr, true, desc->ca, desc->caa_pa); } /* diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 5eb7d939ebd345..8009a37d53c1fb 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -132,7 +132,7 @@ noinstr void __sev_put_ghcb(struct ghcb_state *state) void __head early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op) + unsigned long npages, const struct psc_desc *desc) { unsigned long paddr_end; @@ -142,7 +142,7 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, paddr_end = paddr + (npages << PAGE_SHIFT); while (paddr < paddr_end) { - __page_state_change(vaddr, paddr, op); + __page_state_change(vaddr, paddr, desc); vaddr += PAGE_SIZE; paddr += PAGE_SIZE; @@ -152,6 +152,10 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, svsm_get_caa(), svsm_get_caa_pa() + }; + /* * This can be invoked in early boot while running identity mapped, so * use an open coded check for SNP instead of using cc_platform_has(). 
@@ -165,12 +169,16 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd * Ask the hypervisor to mark the memory pages as private in the RMP * table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); + early_set_pages_state(vaddr, paddr, npages, &d); } void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, svsm_get_caa(), svsm_get_caa_pa() + }; + /* * This can be invoked in early boot while running identity mapped, so * use an open coded check for SNP instead of using cc_platform_has(). @@ -181,7 +189,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr return; /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); + early_set_pages_state(vaddr, paddr, npages, &d); } /* diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index ff1e2be8b5a844..a833b2b31d3db3 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -607,8 +607,11 @@ static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) unsigned long vaddr_end; /* Use the MSR protocol when a GHCB is not available. */ - if (!boot_ghcb) - return early_set_pages_state(vaddr, __pa(vaddr), npages, op); + if (!boot_ghcb) { + struct psc_desc d = { op, svsm_get_caa(), svsm_get_caa_pa() }; + + return early_set_pages_state(vaddr, __pa(vaddr), npages, &d); + } vaddr = vaddr & PAGE_MASK; vaddr_end = vaddr + (npages << PAGE_SHIFT); diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index ffe4755962fe8e..9ff824540b48cf 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -55,7 +55,7 @@ DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa); void early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op); + unsigned long npages, const struct psc_desc *desc); DECLARE_PER_CPU(struct svsm_ca *, svsm_caa); DECLARE_PER_CPU(u64, svsm_caa_pa); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 92b1269c877e41..0030c7125b291c 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -570,6 +570,12 @@ extern u16 ghcb_version; extern struct ghcb *boot_ghcb; extern bool sev_snp_needs_sfw; +struct psc_desc { + enum psc_op op; + struct svsm_ca *ca; + u64 caa_pa; +}; + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define snp_vmpl 0 From c54604fb7f2522fec5b97e86103ec49e539e80fe Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:12 +0200 Subject: [PATCH 0787/3206] x86/sev: Use boot SVSM CA for all startup and init code To avoid having to reason about whether or not to use the per-CPU SVSM calling area when running startup and init code on the boot CPU, reuse the boot SVSM calling area as the per-CPU area for the BSP. Thus, remove the need to make the per-CPU variables and associated state in sev_cfg accessible to the startup code once confined. [ bp: Massage commit message. 
] Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-33-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 13 -------- arch/x86/boot/startup/sev-startup.c | 11 ++++--- arch/x86/coco/sev/core.c | 47 ++++++++++++++--------------- arch/x86/include/asm/sev-internal.h | 16 ---------- 4 files changed, 28 insertions(+), 59 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 0e567410d24d7f..4873469b2a39ae 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -37,19 +37,6 @@ struct ghcb *boot_ghcb; #define __BOOT_COMPRESSED -extern u64 boot_svsm_caa_pa; - -struct svsm_ca *svsm_get_caa(void) -{ - /* The decompressor is mapped 1:1 so VA == PA */ - return (struct svsm_ca *)boot_svsm_caa_pa; -} - -u64 svsm_get_caa_pa(void) -{ - return boot_svsm_caa_pa; -} - u8 snp_vmpl; /* Include code for early handlers */ diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 8009a37d53c1fb..b0fc63f8dee171 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -50,9 +50,6 @@ u64 sev_secrets_pa __ro_after_init; /* For early boot SVSM communication */ struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); -DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); -DEFINE_PER_CPU(u64, svsm_caa_pa); - /* * Nothing shall interrupt this code path while holding the per-CPU * GHCB. The backup GHCB is only for NMIs interrupting this path. @@ -153,7 +150,9 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd unsigned long npages) { struct psc_desc d = { - SNP_PAGE_STATE_PRIVATE, svsm_get_caa(), svsm_get_caa_pa() + SNP_PAGE_STATE_PRIVATE, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa }; /* @@ -176,7 +175,9 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr unsigned long npages) { struct psc_desc d = { - SNP_PAGE_STATE_SHARED, svsm_get_caa(), svsm_get_caa_pa() + SNP_PAGE_STATE_SHARED, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa }; /* diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index a833b2b31d3db3..9782ebe306750f 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -46,6 +46,25 @@ #include #include +DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); +DEFINE_PER_CPU(u64, svsm_caa_pa); + +static inline struct svsm_ca *svsm_get_caa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa); + else + return rip_rel_ptr(&boot_svsm_ca_page); +} + +static inline u64 svsm_get_caa_pa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa_pa); + else + return boot_svsm_caa_pa; +} + /* AP INIT values as documented in the APM2 section "Processor Initialization State" */ #define AP_INIT_CS_LIMIT 0xffff #define AP_INIT_DS_LIMIT 0xffff @@ -1312,7 +1331,8 @@ static void __init alloc_runtime_data(int cpu) struct svsm_ca *caa; /* Allocate the SVSM CA page if an SVSM is present */ - caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE); + caa = cpu ? 
memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE) + : &boot_svsm_ca_page; per_cpu(svsm_caa, cpu) = caa; per_cpu(svsm_caa_pa, cpu) = __pa(caa); @@ -1366,32 +1386,9 @@ void __init sev_es_init_vc_handling(void) init_ghcb(cpu); } - /* If running under an SVSM, switch to the per-cpu CA */ - if (snp_vmpl) { - struct svsm_call call = {}; - unsigned long flags; - int ret; - - local_irq_save(flags); - - /* - * SVSM_CORE_REMAP_CA call: - * RAX = 0 (Protocol=0, CallID=0) - * RCX = New CA GPA - */ - call.caa = svsm_get_caa(); - call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); - call.rcx = this_cpu_read(svsm_caa_pa); - ret = svsm_perform_call_protocol(&call); - if (ret) - panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", - ret, call.rax_out); - + if (snp_vmpl) sev_cfg.use_cas = true; - local_irq_restore(flags); - } - sev_es_setup_play_dead(); /* Secondary CPUs use the runtime #VC handler */ diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 9ff824540b48cf..f98f080410ad8f 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -62,22 +62,6 @@ DECLARE_PER_CPU(u64, svsm_caa_pa); extern u64 boot_svsm_caa_pa; -static __always_inline struct svsm_ca *svsm_get_caa(void) -{ - if (sev_cfg.use_cas) - return this_cpu_read(svsm_caa); - else - return rip_rel_ptr(&boot_svsm_ca_page); -} - -static __always_inline u64 svsm_get_caa_pa(void) -{ - if (sev_cfg.use_cas) - return this_cpu_read(svsm_caa_pa); - else - return boot_svsm_caa_pa; -} - enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt); void vc_forward_exception(struct es_em_ctxt *ctxt); From 68a501d7fd82454525797971c6a0005ceeb93153 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:13 +0200 Subject: [PATCH 0788/3206] x86/boot: Drop redundant RMPADJUST in SEV SVSM presence check snp_vmpl will be assigned a non-zero value when executing at a VMPL other than 0, and this is inferred from a call to RMPADJUST, which only works when running at VMPL0. This means that testing snp_vmpl is sufficient, and there is no need to perform the same check again. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-34-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 4873469b2a39ae..26aa389f802d77 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -406,30 +406,16 @@ void sev_enable(struct boot_params *bp) */ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { u64 hv_features; - int ret; hv_features = get_hv_features(); if (!(hv_features & GHCB_HV_FT_SNP)) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); /* - * Enforce running at VMPL0 or with an SVSM. - * - * Use RMPADJUST (see the rmpadjust() function for a description of - * what the instruction does) to update the VMPL1 permissions of a - * page. If the guest is running at VMPL0, this will succeed. If the - * guest is running at any other VMPL, this will fail. Linux SNP guests - * only ever run at a single VMPL level so permission mask changes of a - * lesser-privileged VMPL are a don't-care. + * Running at VMPL0 is required unless an SVSM is present and + * the hypervisor supports the required SVSM GHCB events. 
*/ - ret = rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1); - - /* - * Running at VMPL0 is not required if an SVSM is present and the hypervisor - * supports the required SVSM GHCB events. - */ - if (ret && - !(snp_vmpl && (hv_features & GHCB_HV_FT_SNP_MULTI_VMPL))) + if (snp_vmpl && !(hv_features & GHCB_HV_FT_SNP_MULTI_VMPL)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); } From f27906b287403af53be26341cf86d73798f15fe8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:14 +0200 Subject: [PATCH 0789/3206] x86/boot: Provide PIC aliases for 5-level paging related constants Provide PIC aliases for the global variables related to 5-level paging, so that the startup code can access them in order to populate the kernel page tables. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-35-ardb+git@google.com --- arch/x86/kernel/head64.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 533fcf5636fc92..1bc40d0785ee3e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -52,10 +52,13 @@ SYM_PIC_ALIAS(next_early_pgt); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); unsigned int __pgtable_l5_enabled __ro_after_init; +SYM_PIC_ALIAS(__pgtable_l5_enabled); unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); +SYM_PIC_ALIAS(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; EXPORT_SYMBOL(ptrs_per_p4d); +SYM_PIC_ALIAS(ptrs_per_p4d); unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); From 9723dd0c705eb626bac2cd06b83a2c8514ed697a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:15 +0200 Subject: [PATCH 0790/3206] x86/sev: Provide PIC aliases for SEV related data objects Provide PIC aliases for data objects that are shared between the SEV startup code and the SEV code that executes later. This is needed so that the confined startup code is permitted to access them. This requires some of these variables to be moved into a source file that is not part of the startup code, as the PIC alias is already implied, and exporting variables in the opposite direction is not supported. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-36-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 3 +++ arch/x86/boot/startup/sev-shared.c | 19 ---------------- arch/x86/boot/startup/sev-startup.c | 9 -------- arch/x86/coco/sev/core.c | 34 +++++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 26aa389f802d77..a5e002ff6bff9b 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -38,6 +38,9 @@ struct ghcb *boot_ghcb; #define __BOOT_COMPRESSED u8 snp_vmpl; +u16 ghcb_version; + +u64 boot_svsm_caa_pa; /* Include code for early handlers */ #include "../../boot/startup/sev-shared.c" diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index fb86f03da66317..2a28463edd9960 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -19,25 +19,6 @@ #define WARN(condition, format...) 
(!!(condition)) #endif -/* - * SVSM related information: - * During boot, the page tables are set up as identity mapped and later - * changed to use kernel virtual addresses. Maintain separate virtual and - * physical addresses for the CAA to allow SVSM functions to be used during - * early boot, both with identity mapped virtual addresses and proper kernel - * virtual addresses. - */ -u64 boot_svsm_caa_pa __ro_after_init; - -/* - * Since feature negotiation related variables are set early in the boot - * process they must reside in the .data section so as not to be zeroed - * out when the .bss section is later cleared. - * - * GHCB protocol version negotiated with the hypervisor. - */ -u16 ghcb_version __ro_after_init; - /* Copy of the SNP firmware's CPUID page. */ static struct snp_cpuid_table cpuid_table_copy __ro_after_init; diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index b0fc63f8dee171..138b26f14ff126 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -41,15 +41,6 @@ #include #include -/* Bitmap of SEV features supported by the hypervisor */ -u64 sev_hv_features __ro_after_init; - -/* Secrets page physical address from the CC blob */ -u64 sev_secrets_pa __ro_after_init; - -/* For early boot SVSM communication */ -struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); - /* * Nothing shall interrupt this code path while holding the per-CPU * GHCB. The backup GHCB is only for NMIs interrupting this path. diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 9782ebe306750f..b9133c825f90c9 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -46,6 +46,29 @@ #include #include +/* Bitmap of SEV features supported by the hypervisor */ +u64 sev_hv_features __ro_after_init; +SYM_PIC_ALIAS(sev_hv_features); + +/* Secrets page physical address from the CC blob */ +u64 sev_secrets_pa __ro_after_init; +SYM_PIC_ALIAS(sev_secrets_pa); + +/* For early boot SVSM communication */ +struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); +SYM_PIC_ALIAS(boot_svsm_ca_page); + +/* + * SVSM related information: + * During boot, the page tables are set up as identity mapped and later + * changed to use kernel virtual addresses. Maintain separate virtual and + * physical addresses for the CAA to allow SVSM functions to be used during + * early boot, both with identity mapped virtual addresses and proper kernel + * virtual addresses. + */ +u64 boot_svsm_caa_pa __ro_after_init; +SYM_PIC_ALIAS(boot_svsm_caa_pa); + DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); DEFINE_PER_CPU(u64, svsm_caa_pa); @@ -119,6 +142,17 @@ DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); */ u8 snp_vmpl __ro_after_init; EXPORT_SYMBOL_GPL(snp_vmpl); +SYM_PIC_ALIAS(snp_vmpl); + +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. 
+ */ +u16 ghcb_version __ro_after_init; +SYM_PIC_ALIAS(ghcb_version); /* For early boot hypervisor communication in SEV-ES enabled guests */ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); From d4077e6ad35121b97f3233da5d60763de3d23df9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:16 +0200 Subject: [PATCH 0791/3206] x86/sev: Move __sev_[get|put]_ghcb() into separate noinstr object Rename sev-nmi.c to noinstr.c, and move the get/put GHCB routines into it too, which are also annotated as 'noinstr' and suffer from the same problem as the NMI code, i.e., that GCC may ignore the __no_sanitize_address__ function attribute implied by 'noinstr' and insert KASAN instrumentation anyway. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-37-ardb+git@google.com --- arch/x86/boot/startup/sev-startup.c | 74 ---------------------- arch/x86/coco/sev/Makefile | 8 +-- arch/x86/coco/sev/{sev-nmi.c => noinstr.c} | 74 ++++++++++++++++++++++ 3 files changed, 78 insertions(+), 78 deletions(-) rename arch/x86/coco/sev/{sev-nmi.c => noinstr.c} (61%) diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 138b26f14ff126..9f4b4ca7deaac9 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -41,83 +41,9 @@ #include #include -/* - * Nothing shall interrupt this code path while holding the per-CPU - * GHCB. The backup GHCB is only for NMIs interrupting this path. - * - * Callers must disable local interrupts around it. - */ -noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) { - /* - * Backup-GHCB is also already in use. There is no way - * to continue here so just kill the machine. To make - * panic() work, mark GHCBs inactive so that messages - * can be printed out. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - instrumentation_begin(); - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - instrumentation_end(); - } - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; - - state->ghcb = &data->backup_ghcb; - - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } - - return ghcb; -} - /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -noinstr void __sev_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - /* - * Invalidate the GHCB so a VMGEXIT instruction issued - * from userspace won't appear to be valid. 
- */ - vc_ghcb_invalidate(ghcb); - data->ghcb_active = false; - } -} - void __head early_set_pages_state(unsigned long vaddr, unsigned long paddr, unsigned long npages, const struct psc_desc *desc) diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index 342d79f0ab6a8a..3b8ae214a6a64d 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += core.o sev-nmi.o vc-handle.o +obj-y += core.o noinstr.o vc-handle.o # Clang 14 and older may fail to respect __no_sanitize_undefined when inlining -UBSAN_SANITIZE_sev-nmi.o := n +UBSAN_SANITIZE_noinstr.o := n # GCC may fail to respect __no_sanitize_address or __no_kcsan when inlining -KASAN_SANITIZE_sev-nmi.o := n -KCSAN_SANITIZE_sev-nmi.o := n +KASAN_SANITIZE_noinstr.o := n +KCSAN_SANITIZE_noinstr.o := n diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/noinstr.c similarity index 61% rename from arch/x86/coco/sev/sev-nmi.c rename to arch/x86/coco/sev/noinstr.c index d8dfaddfb3671e..b527eafb631235 100644 --- a/arch/x86/coco/sev/sev-nmi.c +++ b/arch/x86/coco/sev/noinstr.c @@ -106,3 +106,77 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } + +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. + */ +noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + instrumentation_begin(); + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); + } + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +noinstr void __sev_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} From 05ce314ba5155d57c86f8f276cb17f78ac5fb4f0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:17 +0200 Subject: [PATCH 0792/3206] x86/sev: Export startup routines for later use Create aliases that expose routines that are part of the startup code to other code in the core kernel, so that they can be called later as well. 
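As an illustration of the mechanism (a sketch, not part of the change itself): objects built into the startup code carry a __pi_ symbol prefix, and each PROVIDE() statement re-exposes one routine under its ordinary name, so late SEV code can call it without knowing about the prefix:

/*
 * exports.h (included from vmlinux.lds.S) contains, e.g.:
 *	PROVIDE(sev_es_terminate = __pi_sev_es_terminate);
 *
 * so a runtime caller simply uses the plain name. die_with_reason() is
 * a hypothetical caller added here for illustration only.
 */
static void __noreturn die_with_reason(unsigned int reason)
{
	sev_es_terminate(SEV_TERM_SET_LINUX, reason);
}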
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-38-ardb+git@google.com --- arch/x86/boot/startup/exports.h | 14 ++++++++++++++ arch/x86/kernel/vmlinux.lds.S | 2 ++ 2 files changed, 16 insertions(+) create mode 100644 arch/x86/boot/startup/exports.h diff --git a/arch/x86/boot/startup/exports.h b/arch/x86/boot/startup/exports.h new file mode 100644 index 00000000000000..01d2363dc445f0 --- /dev/null +++ b/arch/x86/boot/startup/exports.h @@ -0,0 +1,14 @@ + +/* + * The symbols below are functions that are implemented by the startup code, + * but called at runtime by the SEV code residing in the core kernel. + */ +PROVIDE(early_set_pages_state = __pi_early_set_pages_state); +PROVIDE(early_snp_set_memory_private = __pi_early_snp_set_memory_private); +PROVIDE(early_snp_set_memory_shared = __pi_early_snp_set_memory_shared); +PROVIDE(get_hv_features = __pi_get_hv_features); +PROVIDE(sev_es_terminate = __pi_sev_es_terminate); +PROVIDE(snp_cpuid = __pi_snp_cpuid); +PROVIDE(snp_cpuid_get_table = __pi_snp_cpuid_get_table); +PROVIDE(svsm_issue_call = __pi_svsm_issue_call); +PROVIDE(svsm_process_result_codes = __pi_svsm_process_result_codes); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4fa0be732af10f..5d5e3a95e1f9bf 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -535,3 +535,5 @@ xen_elfnote_entry_value = xen_elfnote_phys32_entry_value = ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); #endif + +#include "../boot/startup/exports.h" From 0d6e4563fc03d83f948e6a6f7963cc31a4c81914 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:18 +0200 Subject: [PATCH 0793/3206] objtool: Add action to check for absence of absolute relocations The x86 startup code must not use absolute references to code or data, as it executes before the kernel virtual mapping is up. Add an action to objtool to check all allocatable sections (with the exception of __patchable_function_entries, which uses absolute references for nebulous reasons) and raise an error if any absolute references are found. Note that debug sections typically contain lots of absolute references too, but those are not allocatable so they will be ignored. 
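For illustration, a small C sketch (hypothetical symbols, not from the patch) of the kind of reference the new action rejects versus the kind that stays position independent under the -fPIC flags the startup code is built with:

/* sketch only, compiled for x86-64 */
static int marker;

int *marker_ptr = &marker;	/* the pointer in .data carries an R_X86_64_64
				   absolute relocation -- flagged by --noabs */

int *get_marker(void)
{
	return &marker;		/* with -fPIC this becomes a RIP-relative lea,
				   which needs no absolute fixup */
}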
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/20250828102202.1849035-39-ardb+git@google.com --- tools/objtool/arch/x86/decode.c | 12 +++++++ tools/objtool/builtin-check.c | 2 ++ tools/objtool/check.c | 44 +++++++++++++++++++++++++ tools/objtool/include/objtool/arch.h | 1 + tools/objtool/include/objtool/builtin.h | 1 + 5 files changed, 60 insertions(+) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 98c4713c1b091b..0ad5cc70ecbe74 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -880,3 +880,15 @@ unsigned int arch_reloc_size(struct reloc *reloc) return 8; } } + +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_X86_64_32: + case R_X86_64_32S: + case R_X86_64_64: + return true; + default: + return false; + } +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 80239843e9f02c..0f6b197cfcb032 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -87,6 +87,7 @@ static const struct option check_options[] = { OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), + OPT_BOOLEAN(0 , "noabs", &opts.noabs, "reject absolute references in allocatable sections"), OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), @@ -162,6 +163,7 @@ static bool opts_valid(void) opts.hack_noinstr || opts.ibt || opts.mcount || + opts.noabs || opts.noinstr || opts.orc || opts.retpoline || diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d14f20ef1db13f..fb47327075fbaf 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4644,6 +4644,47 @@ static void disas_warned_funcs(struct objtool_file *file) disas_funcs(funcs); } +__weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + unsigned int type = reloc_type(reloc); + size_t sz = elf_addr_size(elf); + + return (sz == 8) ? (type == R_ABS64) : (type == R_ABS32); +} + +static int check_abs_references(struct objtool_file *file) +{ + struct section *sec; + struct reloc *reloc; + int ret = 0; + + for_each_sec(file, sec) { + /* absolute references in non-loadable sections are fine */ + if (!(sec->sh.sh_flags & SHF_ALLOC)) + continue; + + /* section must have an associated .rela section */ + if (!sec->rsec) + continue; + + /* + * Special case for compiler generated metadata that is not + * consumed until after boot. 
+ */ + if (!strcmp(sec->name, "__patchable_function_entries")) + continue; + + for_each_reloc(sec->rsec, reloc) { + if (arch_absolute_reloc(file->elf, reloc)) { + WARN("section %s has absolute relocation at offset 0x%lx", + sec->name, reloc_offset(reloc)); + ret++; + } + } + } + return ret; +} + struct insn_chunk { void *addr; struct insn_chunk *next; @@ -4777,6 +4818,9 @@ int check(struct objtool_file *file) goto out; } + if (opts.noabs) + warnings += check_abs_references(file); + if (opts.orc && nr_insns) { ret = orc_create(file); if (ret) diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 01ef6f415adf64..be33c7b43180aa 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -97,6 +97,7 @@ bool arch_is_embedded_insn(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); bool arch_pc_relative_reloc(struct reloc *reloc); +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc); unsigned int arch_reloc_size(struct reloc *reloc); unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 6b08666fa69d64..ab22673862e1b1 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -26,6 +26,7 @@ struct opts { bool uaccess; int prefix; bool cfi; + bool noabs; /* options: */ bool backtrace; From 296650c8ac4f18e886dd2a606152c00adf527219 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:19 +0200 Subject: [PATCH 0794/3206] x86/boot: Check startup code for absence of absolute relocations Invoke objtool on each startup code object individually to check for the absence of absolute relocations. This is needed because this code will be invoked from the 1:1 mapping of memory before those absolute virtual addresses (which are derived from the kernel virtual base address provided to the linker and possibly shifted at boot) are mapped. Only objects built under arch/x86/boot/startup/ have this restriction, and once they have been incorporated into vmlinux.o, this distinction is difficult to make. So force the invocation of objtool for each object file individually, even if objtool is deferred to vmlinux.o for the rest of the build. In the latter case, only pass --noabs and nothing else; otherwise, append it to the existing objtool command line. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-40-ardb+git@google.com --- arch/x86/boot/startup/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile index b514f7e81332aa..32737f4ab5a807 100644 --- a/arch/x86/boot/startup/Makefile +++ b/arch/x86/boot/startup/Makefile @@ -19,6 +19,7 @@ KCOV_INSTRUMENT := n obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o +pi-objs := $(patsubst %.o,$(obj)/%.o,$(obj-y)) lib-$(CONFIG_X86_64) += la57toggle.o lib-$(CONFIG_EFI_MIXED) += efi-mixed.o @@ -28,3 +29,10 @@ lib-$(CONFIG_EFI_MIXED) += efi-mixed.o # to be linked into the decompressor or the EFI stub but not vmlinux # $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y + +# +# Invoke objtool for each object individually to check for absolute +# relocations, even if other objtool actions are being deferred. 
+# +$(pi-objs): objtool-enabled = 1 +$(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs From 2578560d2259735d7d51364e7991ea92d85fd56c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:20 +0200 Subject: [PATCH 0795/3206] x86/boot: Revert "Reject absolute references in .head.text" This reverts commit faf0ed487415 ("x86/boot: Reject absolute references in .head.text") The startup code is checked directly for the absence of absolute symbol references, so checking the .head.text section in the relocs tool is no longer needed. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-41-ardb+git@google.com --- arch/x86/tools/relocs.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 5778bc4984153f..e5a2b9a912d198 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -740,10 +740,10 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, const char *symname) { - int headtext = !strcmp(sec_name(sec->shdr.sh_info), ".head.text"); unsigned r_type = ELF64_R_TYPE(rel->r_info); ElfW(Addr) offset = rel->r_offset; int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname); + if (sym->st_shndx == SHN_UNDEF) return 0; @@ -783,12 +783,6 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, break; } - if (headtext) { - die("Absolute reference to symbol '%s' not permitted in .head.text\n", - symname); - break; - } - /* * Relocation offsets for 64 bit kernels are output * as 32 bits and sign extended back to 64 bits when From 749627c3980e4421b709857e979e8aa16a4c7147 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:21 +0200 Subject: [PATCH 0796/3206] x86/kbuild: Incorporate boot/startup/ via Kbuild makefile Using core-y is not the correct way to get kbuild to descend into arch/x86/boot/startup. For instance, building an individual object does not work as expected when the pattern rule is local to the Makefile: $ make arch/x86/boot/startup/map_kernel.pi.o GEN Makefile CALL /home/ardb/linux/scripts/checksyscalls.sh DESCEND objtool INSTALL libsubcmd_headers make[3]: *** No rule to make target 'arch/x86/boot/startup/map_kernel.pi.o'. Stop. make[2]: *** [/home/ardb/linux/scripts/Makefile.build:461: arch/x86] Error 2 make[1]: *** [/home/ardb/linux/Makefile:2011: .] Error 2 make: *** [/home/ardb/linux/Makefile:248: __sub-make] Error 2 So use obj-y from arch/x86/Kbuild instead, which makes things work as expected. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-42-ardb+git@google.com --- arch/x86/Kbuild | 2 ++ arch/x86/Makefile | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index f7fb3d88c57bd8..36b985d0e7bf8a 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -3,6 +3,8 @@ # Branch profiling isn't noinstr-safe.
Disable it for arch/x86/* subdir-ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING +obj-y += boot/startup/ + obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += coco/ obj-y += entry/ diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1913d342969ba2..9b76e77ff7f788 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -286,7 +286,6 @@ archprepare: $(cpufeaturemasks.hdr) ### # Kernel objects -core-y += arch/x86/boot/startup/ libs-y += arch/x86/lib/ # drivers-y are linked after core-y From 7b38dec3c5af54665a4b29483aa02bd1c1e71cf1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:22 +0200 Subject: [PATCH 0797/3206] x86/boot: Create a confined code area for startup code In order to be able to have tight control over which code may execute from the early 1:1 mapping of memory, but still link vmlinux as a single executable, prefix all symbol references in startup code with __pi_, and invoke it from outside using the __pi_ prefix. Use objtool to check that no absolute symbol references are present in the startup code, as these cannot be used from code running from the 1:1 mapping. Note that this also requires disabling the latent-entropy GCC plugin, as the global symbol references that it injects would require explicit exports, and given that the startup code rarely executes more than once, it is not a useful source of entropy anyway. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-43-ardb+git@google.com --- arch/x86/boot/startup/Makefile | 14 ++++++++++++++ arch/x86/boot/startup/sev-shared.c | 1 - arch/x86/boot/startup/sme.c | 1 - arch/x86/coco/sev/core.c | 2 +- arch/x86/include/asm/setup.h | 1 + arch/x86/include/asm/sev.h | 1 + arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/head_64.S | 8 ++++---- arch/x86/mm/mem_encrypt_boot.S | 6 +++--- tools/objtool/check.c | 3 ++- 10 files changed, 27 insertions(+), 12 deletions(-) diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile index 32737f4ab5a807..e8fdf020b4223e 100644 --- a/arch/x86/boot/startup/Makefile +++ b/arch/x86/boot/startup/Makefile @@ -4,6 +4,7 @@ KBUILD_AFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \ -Os -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) \ + $(DISABLE_LATENT_ENTROPY_PLUGIN) \ -fno-stack-protector -D__NO_FORTIFY \ -fno-jump-tables \ -include $(srctree)/include/linux/hidden.h @@ -36,3 +37,16 @@ $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y # $(pi-objs): objtool-enabled = 1 $(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs + +# +# Confine the startup code by prefixing all symbols with __pi_ (for position +# independent). This ensures that startup code can only call other startup +# code, or code that has explicitly been made accessible to it via a symbol +# alias. 
+# +$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ +$(obj)/%.pi.o: $(obj)/%.o FORCE + $(call if_changed,objcopy) + +targets += $(obj-y) +obj-y := $(patsubst %.o,%.pi.o,$(obj-y)) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 2a28463edd9960..e09c66845e43ea 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -12,7 +12,6 @@ #include #ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) #define has_cpuflag(f) boot_cpu_has(f) #else #undef WARN diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index bf9153b9a3d9f2..52b98e7624fe81 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -568,7 +568,6 @@ void __head sme_enable(struct boot_params *bp) #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* Local version for startup code, which never operates on user page tables */ -__weak pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index b9133c825f90c9..cf9a511b47e094 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -272,7 +272,7 @@ static int svsm_perform_call_protocol(struct svsm_call *call) do { ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); + : __pi_svsm_perform_msr_protocol(call); } while (ret == -EAGAIN); if (sev_cfg.ghcbs_initialized) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 692af46603a175..914eb32581c73d 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -53,6 +53,7 @@ extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp); extern void startup_64_setup_gdt_idt(void); extern void startup_64_load_idt(void *vc_handler); +extern void __pi_startup_64_load_idt(void *vc_handler); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 0030c7125b291c..f222bef9dca88c 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -551,6 +551,7 @@ struct cpuid_leaf { }; int svsm_perform_msr_protocol(struct svsm_call *call); +int __pi_svsm_perform_msr_protocol(struct svsm_call *call); int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), void *ctx, struct cpuid_leaf *leaf); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1bc40d0785ee3e..fd28b53dbac51f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -319,5 +319,5 @@ void early_setup_idt(void) handler = vc_boot_ghcb; } - startup_64_load_idt(handler); + __pi_startup_64_load_idt(handler); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3e9b3a3bd03961..d219963ecb605c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -71,7 +71,7 @@ SYM_CODE_START_NOALIGN(startup_64) xorl %edx, %edx wrmsr - call startup_64_setup_gdt_idt + call __pi_startup_64_setup_gdt_idt /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(startup_64) * subsequent code. Pass the boot_params pointer as the first argument. */ movq %r15, %rdi - call sme_enable + call __pi_sme_enable #endif /* Sanitize CPU configuration */ @@ -111,7 +111,7 @@ SYM_CODE_START_NOALIGN(startup_64) * programmed into CR3. 
*/ movq %r15, %rsi - call __startup_64 + call __pi___startup_64 /* Form the CR3 value being sure to include the CR3 modifier */ leaq early_top_pgt(%rip), %rcx @@ -562,7 +562,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Call C handler */ movq %rsp, %rdi movq ORIG_RAX(%rsp), %rsi - call do_vc_no_ghcb + call __pi_do_vc_no_ghcb /* Unwind pt_regs */ POP_REGS diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index f8a33b25ae869e..edbf9c99884846 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -16,7 +16,7 @@ .text .code64 -SYM_FUNC_START(sme_encrypt_execute) +SYM_FUNC_START(__pi_sme_encrypt_execute) /* * Entry parameters: @@ -69,9 +69,9 @@ SYM_FUNC_START(sme_encrypt_execute) ANNOTATE_UNRET_SAFE ret int3 -SYM_FUNC_END(sme_encrypt_execute) +SYM_FUNC_END(__pi_sme_encrypt_execute) -SYM_FUNC_START(__enc_copy) +SYM_FUNC_START_LOCAL(__enc_copy) ANNOTATE_NOENDBR /* * Routine used to encrypt memory in place. diff --git a/tools/objtool/check.c b/tools/objtool/check.c index fb47327075fbaf..d0d20666e87275 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3564,7 +3564,8 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6) || - !strncmp(func->name, "__pfx_", 6)) + !strncmp(func->name, "__pfx_", 6) || + !strncmp(func->name, "__pi___pfx_", 11)) return 0; if (file->ignore_unreachables) From e7b88bc0051c5062bdd73b58837cf277d0057358 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:23 +0200 Subject: [PATCH 0798/3206] efistub/x86: Remap inittext read-execute when needed Recent EFI x86 systems are more strict when it comes to mapping boot images, and require that mappings are either read-write or read-execute. Now that the boot code is being cleaned up and refactored, most of it is being moved into .init.text [where it arguably belongs] but that implies that when booting on such strict EFI firmware, we need to take care to map .init.text (and the .altinstr_aux section that follows it) read-execute as well. 
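A simplified sketch of the resulting stub logic (the change below chains the two calls with the GNU ?: extension; this is the equivalent expansion):

/* sketch: both ranges must end up read-execute, first failure wins */
status = efi_adjust_memory_range_protection(addr, kernel_text_size);
if (status == EFI_SUCCESS)
	status = efi_adjust_memory_range_protection(addr + kernel_inittext_offset,
						    kernel_inittext_size);
return status;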
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-44-ardb+git@google.com --- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/misc.c | 2 ++ arch/x86/include/asm/boot.h | 2 ++ arch/x86/kernel/vmlinux.lds.S | 2 ++ drivers/firmware/efi/libstub/x86-stub.c | 4 +++- 5 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a38fdcdb9bd39..74657589264dfa 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -73,7 +73,7 @@ LDFLAGS_vmlinux += -T hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|_sinittext\|__inittext_end\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 94b5991da001a7..0f41ca0e52c0fb 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -332,6 +332,8 @@ static size_t parse_elf(void *output) } const unsigned long kernel_text_size = VO___start_rodata - VO__text; +const unsigned long kernel_inittext_offset = VO__sinittext - VO__text; +const unsigned long kernel_inittext_size = VO___inittext_end - VO__sinittext; const unsigned long kernel_total_size = VO__end - VO__text; static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4); diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 02b23aa78955fb..f7b67cb7391562 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -82,6 +82,8 @@ #ifndef __ASSEMBLER__ extern unsigned int output_len; extern const unsigned long kernel_text_size; +extern const unsigned long kernel_inittext_offset; +extern const unsigned long kernel_inittext_size; extern const unsigned long kernel_total_size; unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 5d5e3a95e1f9bf..4277efb263588a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -227,6 +227,8 @@ SECTIONS */ .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { *(.altinstr_aux) + . = ALIGN(PAGE_SIZE); + __inittext_end = .; } INIT_DATA_SECTION(16) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index cafc90d4caafa6..0d05eac7c72b24 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -788,7 +788,9 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry, *kernel_entry = addr + entry; - return efi_adjust_memory_range_protection(addr, kernel_text_size); + return efi_adjust_memory_range_protection(addr, kernel_text_size) ?: + efi_adjust_memory_range_protection(addr + kernel_inittext_offset, + kernel_inittext_size); } static void __noreturn enter_kernel(unsigned long kernel_addr, From c5c30a37369313d1f8b84e96e6a4397b4e2b4eb8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:24 +0200 Subject: [PATCH 0799/3206] x86/boot: Move startup code out of __head section Move startup code out of the __head section, now that this no longer has a special significance. 
Move everything into .text or .init.text as appropriate, so that startup code is not kept around unnecessarily. [ bp: Fold in hunk to fix 32-bit CPU hotplug: Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202509022207.56fd97f4-lkp@intel.com ] Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-45-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 3 --- arch/x86/boot/startup/gdt_idt.c | 4 ++-- arch/x86/boot/startup/map_kernel.c | 4 ++-- arch/x86/boot/startup/sev-shared.c | 36 ++++++++++++++--------------- arch/x86/boot/startup/sev-startup.c | 14 +++++------ arch/x86/boot/startup/sme.c | 26 ++++++++++----------- arch/x86/include/asm/init.h | 6 ----- arch/x86/kernel/head_32.S | 5 +++- arch/x86/kernel/head_64.S | 2 +- arch/x86/platform/pvh/head.S | 2 +- 10 files changed, 48 insertions(+), 54 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index a5e002ff6bff9b..57670c172b2510 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -32,9 +32,6 @@ struct ghcb *boot_ghcb; #undef __init #define __init -#undef __head -#define __head - #define __BOOT_COMPRESSED u8 snp_vmpl; diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c index a3112a69b06a77..d16102abdaec34 100644 --- a/arch/x86/boot/startup/gdt_idt.c +++ b/arch/x86/boot/startup/gdt_idt.c @@ -24,7 +24,7 @@ static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; /* This may run while still in the direct mapping */ -void __head startup_64_load_idt(void *vc_handler) +void startup_64_load_idt(void *vc_handler) { struct desc_ptr desc = { .address = (unsigned long)rip_rel_ptr(bringup_idt_table), @@ -46,7 +46,7 @@ void __head startup_64_load_idt(void *vc_handler) /* * Setup boot CPU state needed before kernel switches to virtual addresses. */ -void __head startup_64_setup_gdt_idt(void) +void __init startup_64_setup_gdt_idt(void) { struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page); void *handler = NULL; diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c index 332dbe6688c4d9..83ba98d6157272 100644 --- a/arch/x86/boot/startup/map_kernel.c +++ b/arch/x86/boot/startup/map_kernel.c @@ -30,7 +30,7 @@ static inline bool check_la57_support(void) return true; } -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, +static unsigned long __init sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd, unsigned long p2v_offset) { @@ -84,7 +84,7 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, * the 1:1 mapping of memory. Kernel virtual addresses can be determined by * subtracting p2v_offset from the RIP-relative address. 
*/ -unsigned long __head __startup_64(unsigned long p2v_offset, +unsigned long __init __startup_64(unsigned long p2v_offset, struct boot_params *bp) { pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts); diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index e09c66845e43ea..08cc1568d8af27 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -33,7 +33,7 @@ static u32 cpuid_ext_range_max __ro_after_init; bool sev_snp_needs_sfw; -void __head __noreturn +void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; @@ -52,7 +52,7 @@ sev_es_terminate(unsigned int set, unsigned int reason) /* * The hypervisor features are available from GHCB version 2 onward. */ -u64 get_hv_features(void) +u64 __init get_hv_features(void) { u64 val; @@ -222,7 +222,7 @@ const struct snp_cpuid_table *snp_cpuid_get_table(void) * * Return: XSAVE area size on success, 0 otherwise. */ -static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); u64 xfeatures_found = 0; @@ -258,7 +258,7 @@ static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) return xsave_size; } -static bool __head +static bool snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -300,7 +300,7 @@ static void snp_cpuid_hv_msr(void *ctx, struct cpuid_leaf *leaf) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } -static int __head +static int snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), void *ctx, struct cpuid_leaf *leaf) { @@ -396,8 +396,8 @@ snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value * should be treated as fatal by caller. */ -int __head snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), - void *ctx, struct cpuid_leaf *leaf) +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -439,7 +439,7 @@ int __head snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), * page yet, so it only supports the MSR based communication with the * hypervisor and only the CPUID exit-code. */ -void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); @@ -515,7 +515,7 @@ struct cc_setup_data { * Search for a Confidential Computing blob passed in as a setup_data entry * via the Linux Boot Protocol. */ -static __head +static __init struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) { struct cc_setup_data *sd = NULL; @@ -543,7 +543,7 @@ struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) * mapping needs to be updated in sync with all the changes to virtual memory * layout and related mapping facilities throughout the boot process. 
*/ -static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) { const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; int i; @@ -571,7 +571,7 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) } } -static int __head svsm_call_msr_protocol(struct svsm_call *call) +static int svsm_call_msr_protocol(struct svsm_call *call) { int ret; @@ -582,8 +582,8 @@ static int __head svsm_call_msr_protocol(struct svsm_call *call) return ret; } -static void __head svsm_pval_4k_page(unsigned long paddr, bool validate, - struct svsm_ca *caa, u64 caa_pa) +static void svsm_pval_4k_page(unsigned long paddr, bool validate, + struct svsm_ca *caa, u64 caa_pa) { struct svsm_pvalidate_call *pc; struct svsm_call call = {}; @@ -624,8 +624,8 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate, native_local_irq_restore(flags); } -static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, - bool validate, struct svsm_ca *caa, u64 caa_pa) +static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, + bool validate, struct svsm_ca *caa, u64 caa_pa) { int ret; @@ -645,8 +645,8 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, sev_evict_cache((void *)vaddr, 1); } -static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, - const struct psc_desc *desc) +static void __page_state_change(unsigned long vaddr, unsigned long paddr, + const struct psc_desc *desc) { u64 val, msr; @@ -684,7 +684,7 @@ static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. */ -static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info, +static bool __init svsm_setup_ca(const struct cc_blob_sev_info *cc_info, void *page) { struct snp_secrets_page *secrets_page; diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 9f4b4ca7deaac9..39465a0ff4e5f3 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -44,7 +44,7 @@ /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -void __head +void __init early_set_pages_state(unsigned long vaddr, unsigned long paddr, unsigned long npages, const struct psc_desc *desc) { @@ -63,7 +63,7 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, } } -void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { struct psc_desc d = { @@ -88,7 +88,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd early_set_pages_state(vaddr, paddr, npages, &d); } -void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { struct psc_desc d = { @@ -123,7 +123,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr * * Scan for the blob in that order. 
*/ -static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +static struct cc_blob_sev_info *__init find_cc_blob(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -149,7 +149,7 @@ static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) return cc_info; } -static __head void svsm_setup(struct cc_blob_sev_info *cc_info) +static void __init svsm_setup(struct cc_blob_sev_info *cc_info) { struct snp_secrets_page *secrets = (void *)cc_info->secrets_phys; struct svsm_call call = {}; @@ -190,7 +190,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) boot_svsm_caa_pa = pa; } -bool __head snp_init(struct boot_params *bp) +bool __init snp_init(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -219,7 +219,7 @@ bool __head snp_init(struct boot_params *bp) return true; } -void __head __noreturn snp_abort(void) +void __init __noreturn snp_abort(void) { sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); } diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index 52b98e7624fe81..2ddde901c8c5b7 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -91,7 +91,7 @@ struct sme_populate_pgd_data { */ static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); -static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; pgd_t *pgd_p; @@ -106,7 +106,7 @@ static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) memset(pgd_p, 0, pgd_size); } -static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) { pgd_t *pgd; p4d_t *p4d; @@ -143,7 +143,7 @@ static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) return pud; } -static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -159,7 +159,7 @@ static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); } -static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -185,7 +185,7 @@ static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); } -static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd_large(ppd); @@ -195,7 +195,7 @@ static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) } } -static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd(ppd); @@ -205,7 +205,7 @@ static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) } } -static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, pmdval_t pmd_flags, pteval_t pte_flags) { unsigned long vaddr_end; @@ -229,22 +229,22 @@ static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, __sme_map_range_pte(ppd); } -static void __head sme_map_range_encrypted(struct 
sme_populate_pgd_data *ppd) +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); } -static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); } -static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); } -static unsigned long __head sme_pgtable_calc(unsigned long len) +static unsigned long __init sme_pgtable_calc(unsigned long len) { unsigned long entries = 0, tables = 0; @@ -281,7 +281,7 @@ static unsigned long __head sme_pgtable_calc(unsigned long len) return entries + tables; } -void __head sme_encrypt_kernel(struct boot_params *bp) +void __init sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; @@ -485,7 +485,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp) native_write_cr3(__native_read_cr3()); } -void __head sme_enable(struct boot_params *bp) +void __init sme_enable(struct boot_params *bp) { unsigned int eax, ebx, ecx, edx; unsigned long feature_mask; diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 5a68e9db651893..01ccdd168df080 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,12 +2,6 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H -#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 -#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector -#else -#define __head __section(".head.text") __no_sanitize_undefined __no_kstack_erase -#endif - struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void (*free_pgt_page)(void *, void *); /* free buf for page table */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 76743dfad6ab9d..80ef5d386b03dc 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -61,7 +61,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. */ -__HEAD + __INIT SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx @@ -136,6 +136,9 @@ SYM_CODE_END(startup_32) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ +#ifdef CONFIG_HOTPLUG_CPU + .text +#endif SYM_FUNC_START(startup_32_smp) cld movl $(__BOOT_DS),%eax diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d219963ecb605c..21816b48537c3a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -33,7 +33,7 @@ * because we need identity-mapped pages. */ - __HEAD + __INIT .code64 SYM_CODE_START_NOALIGN(startup_64) UNWIND_HINT_END_OF_STACK diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 1d78e5631bb81c..344030c1a81d46 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -24,7 +24,7 @@ #include #include - __HEAD + __INIT /* * Entry point for PVH guests. 
From ce39a6aa8802e718f9b68bf6892612e4fd7f9d2d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:25 +0200 Subject: [PATCH 0800/3206] x86/boot: Get rid of the .head.text section The .head.text section is now empty, so it can be dropped from the linker script. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-46-ardb+git@google.com --- arch/x86/kernel/vmlinux.lds.S | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4277efb263588a..d7af4a64c211b7 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -160,11 +160,6 @@ SECTIONS } :text = 0xcccccccc - /* bootstrapping code */ - .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { - HEAD_TEXT - } :text = 0xcccccccc - /* End of text section, which should occupy whole number of pages */ _etext = .; . = ALIGN(PAGE_SIZE); From e117ff1129daa7d63536833f39285e50ad52379d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 2 Sep 2025 14:15:37 -0400 Subject: [PATCH 0801/3206] cgroup/cpuset: Prevent NULL pointer access in free_tmpmasks() Commit 5806b3d05165 ("cpuset: decouple tmpmasks and cpumasks freeing in cgroup") separates out the freeing of tmpmasks into a new free_tmpmask() helper but removes the NULL pointer check in the process. Unfortunately a NULL pointer can be passed to free_tmpmasks() in cpuset_handle_hotplug() if cpuset v1 is active. This can cause segmentation fault and crash the kernel. Fix that by adding the NULL pointer check to free_tmpmasks(). Fixes: 5806b3d05165 ("cpuset: decouple tmpmasks and cpumasks freeing in cgroup") Reported-by: Ashay Jaiswal Closes: https://lore.kernel.org/lkml/20250902-cpuset-free-on-condition-v1-1-f46ffab53eac@quicinc.com/ Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a78ccd11ce9b43..c0c281a8860d35 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -484,6 +484,9 @@ static inline int alloc_tmpmasks(struct tmpmasks *tmp) */ static inline void free_tmpmasks(struct tmpmasks *tmp) { + if (!tmp) + return; + free_cpumask_var(tmp->new_cpus); free_cpumask_var(tmp->addmask); free_cpumask_var(tmp->delmask); From d5067034725b1a0b2c785cea9cfce68776a94042 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Mon, 1 Sep 2025 10:31:08 +0200 Subject: [PATCH 0802/3206] Revert "drm/nouveau: Remove waitque for sched teardown" This reverts: commit bead88002227 ("drm/nouveau: Remove waitque for sched teardown") commit 5f46f5c7af8c ("drm/nouveau: Add new callback for scheduler teardown") from the drm/sched teardown leak fix series: https://lore.kernel.org/dri-devel/20250710125412.128476-2-phasta@kernel.org/ The aforementioned series removed a blocking waitqueue from nouveau_sched_fini(). It was mistakenly assumed that this waitqueue only prevents jobs from leaking, which the series fixed. The waitqueue, however, also guarantees that all VM_BIND related jobs are finished in order, cleaning up mappings in the GPU's MMU. These jobs must be executed sequentially. Without the waitqueue, this is no longer guaranteed, because entity and scheduler teardown can race with each other. Revert all patches related to the waitqueue removal. 
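Concretely, the revert restores the blocking drain in nouveau_sched_fini() (shown in full in the diff below), so teardown cannot proceed until every queued job, including the ordered VM_BIND jobs, has left the job list:

/* restored by this revert -- see nouveau_sched_fini() in the diff */
rmb(); /* for list_empty to work without lock */
wait_event(sched->job.wq, list_empty(&sched->job.list.head));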
Fixes: bead88002227 ("drm/nouveau: Remove waitque for sched teardown") Suggested-by: Danilo Krummrich Signed-off-by: Philipp Stanner Link: https://lore.kernel.org/r/20250901083107.10206-2-phasta@kernel.org Signed-off-by: Danilo Krummrich --- drivers/gpu/drm/nouveau/nouveau_fence.c | 15 ----------- drivers/gpu/drm/nouveau/nouveau_fence.h | 1 - drivers/gpu/drm/nouveau/nouveau_sched.c | 35 ++++++++++--------------- drivers/gpu/drm/nouveau/nouveau_sched.h | 9 ++++--- drivers/gpu/drm/nouveau/nouveau_uvmm.c | 8 +++--- 5 files changed, 24 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c index 9f345a0087175c..869d4335c0f45c 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.c +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c @@ -240,21 +240,6 @@ nouveau_fence_emit(struct nouveau_fence *fence) return ret; } -void -nouveau_fence_cancel(struct nouveau_fence *fence) -{ - struct nouveau_fence_chan *fctx = nouveau_fctx(fence); - unsigned long flags; - - spin_lock_irqsave(&fctx->lock, flags); - if (!dma_fence_is_signaled_locked(&fence->base)) { - dma_fence_set_error(&fence->base, -ECANCELED); - if (nouveau_fence_signal(fence)) - nvif_event_block(&fctx->event); - } - spin_unlock_irqrestore(&fctx->lock, flags); -} - bool nouveau_fence_done(struct nouveau_fence *fence) { diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h index 9957a919bd38e7..183dd43ecfff4a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.h +++ b/drivers/gpu/drm/nouveau/nouveau_fence.h @@ -29,7 +29,6 @@ void nouveau_fence_unref(struct nouveau_fence **); int nouveau_fence_emit(struct nouveau_fence *); bool nouveau_fence_done(struct nouveau_fence *); -void nouveau_fence_cancel(struct nouveau_fence *fence); int nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr); int nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive, bool intr); diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c index 0cc0bc9f9952b1..e60f7892f5ce9a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sched.c +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c @@ -11,7 +11,6 @@ #include "nouveau_exec.h" #include "nouveau_abi16.h" #include "nouveau_sched.h" -#include "nouveau_chan.h" #define NOUVEAU_SCHED_JOB_TIMEOUT_MS 10000 @@ -122,9 +121,11 @@ nouveau_job_done(struct nouveau_job *job) { struct nouveau_sched *sched = job->sched; - spin_lock(&sched->job_list.lock); + spin_lock(&sched->job.list.lock); list_del(&job->entry); - spin_unlock(&sched->job_list.lock); + spin_unlock(&sched->job.list.lock); + + wake_up(&sched->job.wq); } void @@ -305,9 +306,9 @@ nouveau_job_submit(struct nouveau_job *job) } /* Submit was successful; add the job to the schedulers job list. 
*/ - spin_lock(&sched->job_list.lock); - list_add(&job->entry, &sched->job_list.head); - spin_unlock(&sched->job_list.lock); + spin_lock(&sched->job.list.lock); + list_add(&job->entry, &sched->job.list.head); + spin_unlock(&sched->job.list.lock); drm_sched_job_arm(&job->base); job->done_fence = dma_fence_get(&job->base.s_fence->finished); @@ -392,23 +393,10 @@ nouveau_sched_free_job(struct drm_sched_job *sched_job) nouveau_job_fini(job); } -static void -nouveau_sched_cancel_job(struct drm_sched_job *sched_job) -{ - struct nouveau_fence *fence; - struct nouveau_job *job; - - job = to_nouveau_job(sched_job); - fence = to_nouveau_fence(job->done_fence); - - nouveau_fence_cancel(fence); -} - static const struct drm_sched_backend_ops nouveau_sched_ops = { .run_job = nouveau_sched_run_job, .timedout_job = nouveau_sched_timedout_job, .free_job = nouveau_sched_free_job, - .cancel_job = nouveau_sched_cancel_job, }; static int @@ -458,8 +446,9 @@ nouveau_sched_init(struct nouveau_sched *sched, struct nouveau_drm *drm, goto fail_sched; mutex_init(&sched->mutex); - spin_lock_init(&sched->job_list.lock); - INIT_LIST_HEAD(&sched->job_list.head); + spin_lock_init(&sched->job.list.lock); + INIT_LIST_HEAD(&sched->job.list.head); + init_waitqueue_head(&sched->job.wq); return 0; @@ -493,12 +482,16 @@ nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm, return 0; } + static void nouveau_sched_fini(struct nouveau_sched *sched) { struct drm_gpu_scheduler *drm_sched = &sched->base; struct drm_sched_entity *entity = &sched->entity; + rmb(); /* for list_empty to work without lock */ + wait_event(sched->job.wq, list_empty(&sched->job.list.head)); + drm_sched_entity_fini(entity); drm_sched_fini(drm_sched); diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h b/drivers/gpu/drm/nouveau/nouveau_sched.h index b98c3f0bef3029..20cd1da8db73c3 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sched.h +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h @@ -103,9 +103,12 @@ struct nouveau_sched { struct mutex mutex; struct { - struct list_head head; - spinlock_t lock; - } job_list; + struct { + struct list_head head; + spinlock_t lock; + } list; + struct wait_queue_head wq; + } job; }; int nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm, diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c index ddfc46bc1b3e26..48f105239f42d8 100644 --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c @@ -1019,8 +1019,8 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range) u64 end = addr + range; again: - spin_lock(&sched->job_list.lock); - list_for_each_entry(__job, &sched->job_list.head, entry) { + spin_lock(&sched->job.list.lock); + list_for_each_entry(__job, &sched->job.list.head, entry) { struct nouveau_uvmm_bind_job *bind_job = to_uvmm_bind_job(__job); list_for_each_op(op, &bind_job->ops) { @@ -1030,7 +1030,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range) if (!(end <= op_addr || addr >= op_end)) { nouveau_uvmm_bind_job_get(bind_job); - spin_unlock(&sched->job_list.lock); + spin_unlock(&sched->job.list.lock); wait_for_completion(&bind_job->complete); nouveau_uvmm_bind_job_put(bind_job); goto again; @@ -1038,7 +1038,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range) } } } - spin_unlock(&sched->job_list.lock); + spin_unlock(&sched->job.list.lock); } static int From b7975c48695cdc2cc308df342f85ccaf9dac0888 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: 
Wed, 3 Sep 2025 11:33:28 -1000 Subject: [PATCH 0803/3206] sched_ext: Make explicit scx_task_iter_relock() calls unnecessary During tasks iteration, the locks can be dropped using scx_task_iter_unlock() to perform e.g. sleepable allocations. Afterwards, scx_task_iter_relock() has to be called prior to other iteration operations, which is error-prone. This can be easily automated by tracking whether scx_tasks_lock is held in scx_task_iter and re-acquiring when necessary. It already tracks whether the task's rq is locked after all. - Add scx_task_iter->list_locked which remembers whether scx_tasks_lock is held. - Rename scx_task_iter->locked to scx_task_iter->locked_task to better distinguish it from ->list_locked. - Replace scx_task_iter_relock() with __scx_task_iter_maybe_relock() which is automatically called by scx_task_iter_next() and scx_task_iter_stop(). - Drop explicit scx_task_iter_relock() calls. The resulting behavior should be equivalent. Signed-off-by: Tejun Heo Acked-by: Andrea Righi --- kernel/sched/ext.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7dedc9a16281b3..7f799345c89944 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1488,10 +1488,11 @@ struct bpf_iter_scx_dsq { */ struct scx_task_iter { struct sched_ext_entity cursor; - struct task_struct *locked; + struct task_struct *locked_task; struct rq *rq; struct rq_flags rf; u32 cnt; + bool list_locked; }; /** @@ -1519,15 +1520,16 @@ static void scx_task_iter_start(struct scx_task_iter *iter) iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); - iter->locked = NULL; + iter->locked_task = NULL; iter->cnt = 0; + iter->list_locked = true; } static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; + if (iter->locked_task) { + task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); + iter->locked_task = NULL; } } @@ -1537,24 +1539,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) * * If @iter is in the middle of a locked iteration, it may be locking the rq of * the task currently being visited in addition to scx_tasks_lock. Unlock both. - * This function can be safely called anytime during an iteration. + * This function can be safely called anytime during an iteration. The next + * iterator operation will automatically restore the necessary locking. */ static void scx_task_iter_unlock(struct scx_task_iter *iter) { __scx_task_iter_rq_unlock(iter); - spin_unlock_irq(&scx_tasks_lock); + if (iter->list_locked) { + iter->list_locked = false; + spin_unlock_irq(&scx_tasks_lock); + } } -/** - * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() - * @iter: iterator to re-lock - * - * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it - * doesn't re-lock the rq lock. Must be called before other iterator operations. 
- */ -static void scx_task_iter_relock(struct scx_task_iter *iter) +static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { - spin_lock_irq(&scx_tasks_lock); + if (!iter->list_locked) { + spin_lock_irq(&scx_tasks_lock); + iter->list_locked = true; + } } /** @@ -1567,6 +1569,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { + __scx_task_iter_maybe_relock(iter); list_del_init(&iter->cursor.tasks_node); scx_task_iter_unlock(iter); } @@ -1584,10 +1587,12 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; + __scx_task_iter_maybe_relock(iter); + if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); - scx_task_iter_relock(iter); + __scx_task_iter_maybe_relock(iter); } list_for_each_entry(pos, cursor, tasks_node) { @@ -1650,7 +1655,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return NULL; iter->rq = task_rq_lock(p, &iter->rf); - iter->locked = p; + iter->locked_task = p; return p; } @@ -5713,7 +5718,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) ret = scx_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - scx_task_iter_relock(&sti); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); @@ -5723,7 +5727,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); - scx_task_iter_relock(&sti); } scx_task_iter_stop(&sti); scx_cgroup_unlock(); From 4a1d9d73aabc8f97f48c4f84f936de3b265ffd6f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2025 11:33:28 -1000 Subject: [PATCH 0804/3206] sched_ext: Keep bypass on between enable failure and scx_disable_workfn() scx_enable() turns on the bypass mode while enable is in progress. If enabling fails, it turns off the bypass mode and then triggers scx_error(). scx_error() will trigger scx_disable_workfn() which will turn on the bypass mode again and unload the failed scheduler. This moves the system out of bypass mode between the enable error path and the disable path, which is unnecessary and can be brittle - e.g. the thread running scx_enable() may already be on the failed scheduler and can be switched out before it triggers scx_error() leading to a stall. The watchdog would eventually kick in, so the situation isn't critical but is still suboptimal. There is nothing to be gained by turning off the bypass mode between scx_enable() failure and scx_disable_workfn(). Keep bypass on. 
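Schematically, the enable path now looks like this (a simplified sketch using names from the patch; the failure check and return code are hypothetical placeholders):

/* sketch only, not verbatim kernel code */
scx_bypass(true);			/* enable runs in bypass mode */

if (enable_failed) {			/* hypothetical placeholder */
	scx_error(sch, "enable failed");	/* queues scx_disable_workfn() */
	goto err_disable;		/* bypass deliberately stays on */
}

scx_bypass(false);			/* only the success path exits bypass */

scx_disable_workfn() then runs with bypass still enabled and unloads the failed scheduler without ever letting it make scheduling decisions in the interim.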
Signed-off-by: Tejun Heo Acked-by: Andrea Righi --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7f799345c89944..fda2b4e85ee377 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5794,7 +5794,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); - scx_bypass(false); + /* we'll soon enter disable path, keep bypass on */ err_disable: mutex_unlock(&scx_enable_mutex); /* From 0c2b8356e430229efef42b03bd765a2a7ecf73fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2025 11:33:28 -1000 Subject: [PATCH 0805/3206] sched_ext: Move internal type and accessor definitions to ext_internal.h There currently isn't a place to place SCX-internal types and accessors to be shared between ext.c and ext_idle.c. Create kernel/sched/ext_internal.h and move internal type and accessor definitions there. This trims ext.c a bit and makes future additions easier. Pure code reorganization. No functional changes. Signed-off-by: Tejun Heo Acked-by: Andrea Righi --- kernel/sched/build_policy.c | 1 + kernel/sched/ext.c | 1034 ---------------------------------- kernel/sched/ext.h | 23 - kernel/sched/ext_internal.h | 1061 +++++++++++++++++++++++++++++++++++ 4 files changed, 1062 insertions(+), 1057 deletions(-) create mode 100644 kernel/sched/ext_internal.h diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index c4a488e67aa7d8..755883faf75186 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -58,6 +58,7 @@ #include "deadline.c" #ifdef CONFIG_SCHED_CLASS_EXT +# include "ext_internal.h" # include "ext.c" # include "ext_idle.c" #endif diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fda2b4e85ee377..7e15e852370cd2 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,1040 +9,6 @@ #include #include "ext_idle.h" -#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) - -enum scx_consts { - SCX_DSP_DFL_MAX_BATCH = 32, - SCX_DSP_MAX_LOOPS = 32, - SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, - - SCX_EXIT_BT_LEN = 64, - SCX_EXIT_MSG_LEN = 1024, - SCX_EXIT_DUMP_DFL_LEN = 32768, - - SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, - - /* - * Iterating all tasks may take a while. Periodically drop - * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. - */ - SCX_TASK_ITER_BATCH = 32, -}; - -enum scx_exit_kind { - SCX_EXIT_NONE, - SCX_EXIT_DONE, - - SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ - SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ - SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ - SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ - - SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ - SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ - SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -}; - -/* - * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), - * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes - * are 64bit of the format: - * - * Bits: [63 .. 48 47 .. 32 31 .. 0] - * [ SYS ACT ] [ SYS RSN ] [ USR ] - * - * SYS ACT: System-defined exit actions - * SYS RSN: System-defined exit reasons - * USR : User-defined exit codes and reasons - * - * Using the above, users may communicate intention and context by ORing system - * actions and/or system reasons with a user-defined exit code. 
- */ -enum scx_exit_code { - /* Reasons */ - SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, - - /* Actions */ - SCX_ECODE_ACT_RESTART = 1LLU << 48, -}; - -/* - * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is - * being disabled. - */ -struct scx_exit_info { - /* %SCX_EXIT_* - broad category of the exit reason */ - enum scx_exit_kind kind; - - /* exit code if gracefully exiting */ - s64 exit_code; - - /* textual representation of the above */ - const char *reason; - - /* backtrace if exiting due to an error */ - unsigned long *bt; - u32 bt_len; - - /* informational message */ - char *msg; - - /* debug dump */ - char *dump; -}; - -/* sched_ext_ops.flags */ -enum scx_ops_flags { - /* - * Keep built-in idle tracking even if ops.update_idle() is implemented. - */ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, - - /* - * By default, if there are no other task to run on the CPU, ext core - * keeps running the current task even after its slice expires. If this - * flag is specified, such tasks are passed to ops.enqueue() with - * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. - */ - SCX_OPS_ENQ_LAST = 1LLU << 1, - - /* - * An exiting task may schedule after PF_EXITING is set. In such cases, - * bpf_task_from_pid() may not be able to find the task and if the BPF - * scheduler depends on pid lookup for dispatching, the task will be - * lost leading to various issues including RCU grace period stalls. - * - * To mask this problem, by default, unhashed tasks are automatically - * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't - * depend on pid lookups and wants to handle these tasks directly, the - * following flag can be used. - */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, - - /* - * If set, only tasks with policy set to SCHED_EXT are attached to - * sched_ext. If clear, SCHED_NORMAL tasks are also included. - */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, - - /* - * A migration disabled task can only execute on its current CPU. By - * default, such tasks are automatically put on the CPU's local DSQ with - * the default slice on enqueue. If this ops flag is set, they also go - * through ops.enqueue(). - * - * A migration disabled task never invokes ops.select_cpu() as it can - * only select the current CPU. Also, p->cpus_ptr will only contain its - * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr - * and thus may disagree with cpumask_weight(p->cpus_ptr). - */ - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, - - /* - * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes - * ops.enqueue() on the ops.select_cpu() selected or the wakee's - * previous CPU via IPI (inter-processor interrupt) to reduce cacheline - * transfers. When this optimization is enabled, ops.select_cpu() is - * skipped in some cases (when racing against the wakee switching out). - * As the BPF scheduler may depend on ops.select_cpu() being invoked - * during wakeups, queued wakeup is disabled by default. - * - * If this ops flag is set, queued wakeup optimization is enabled and - * the BPF scheduler must be able to handle ops.enqueue() invoked on the - * wakee's CPU without preceding ops.select_cpu() even for tasks which - * may be executed on multiple CPUs. - */ - SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, - - /* - * If set, enable per-node idle cpumasks. If clear, use a single global - * flat idle cpumask. 
- */ - SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, - - /* - * CPU cgroup support flags - */ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ - - SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | - SCX_OPS_ENQ_LAST | - SCX_OPS_ENQ_EXITING | - SCX_OPS_ENQ_MIGRATION_DISABLED | - SCX_OPS_ALLOW_QUEUED_WAKEUP | - SCX_OPS_SWITCH_PARTIAL | - SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_HAS_CGROUP_WEIGHT, - - /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ - __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, - - SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, -}; - -/* argument container for ops.init_task() */ -struct scx_init_task_args { - /* - * Set if ops.init_task() is being invoked on the fork path, as opposed - * to the scheduler transition path. - */ - bool fork; -#ifdef CONFIG_EXT_GROUP_SCHED - /* the cgroup the task is joining */ - struct cgroup *cgroup; -#endif -}; - -/* argument container for ops.exit_task() */ -struct scx_exit_task_args { - /* Whether the task exited before running on sched_ext. */ - bool cancelled; -}; - -/* argument container for ops->cgroup_init() */ -struct scx_cgroup_init_args { - /* the weight of the cgroup [1..10000] */ - u32 weight; - - /* bandwidth control parameters from cpu.max and cpu.max.burst */ - u64 bw_period_us; - u64 bw_quota_us; - u64 bw_burst_us; -}; - -enum scx_cpu_preempt_reason { - /* next task is being scheduled by &sched_class_rt */ - SCX_CPU_PREEMPT_RT, - /* next task is being scheduled by &sched_class_dl */ - SCX_CPU_PREEMPT_DL, - /* next task is being scheduled by &sched_class_stop */ - SCX_CPU_PREEMPT_STOP, - /* unknown reason for SCX being preempted */ - SCX_CPU_PREEMPT_UNKNOWN, -}; - -/* - * Argument container for ops->cpu_acquire(). Currently empty, but may be - * expanded in the future. - */ -struct scx_cpu_acquire_args {}; - -/* argument container for ops->cpu_release() */ -struct scx_cpu_release_args { - /* the reason the CPU was preempted */ - enum scx_cpu_preempt_reason reason; - - /* the task that's going to be scheduled on the CPU */ - struct task_struct *task; -}; - -/* - * Informational context provided to dump operations. - */ -struct scx_dump_ctx { - enum scx_exit_kind kind; - s64 exit_code; - const char *reason; - u64 at_ns; - u64 at_jiffies; -}; - -/** - * struct sched_ext_ops - Operation table for BPF scheduler implementation - * - * A BPF scheduler can implement an arbitrary scheduling policy by - * implementing and loading operations in this table. Note that a userland - * scheduling policy can also be implemented using the BPF scheduler - * as a shim layer. - */ -struct sched_ext_ops { - /** - * @select_cpu: Pick the target CPU for a task which is being woken up - * @p: task being woken up - * @prev_cpu: the cpu @p was on before sleeping - * @wake_flags: SCX_WAKE_* - * - * Decision made here isn't final. @p may be moved to any CPU while it - * is getting dispatched for execution later. However, as @p is not on - * the rq at this point, getting the eventual execution CPU right here - * saves a small bit of overhead down the line. - * - * If an idle CPU is returned, the CPU is kicked and will try to - * dispatch. While an explicit custom mechanism can be added, - * select_cpu() serves as the default way to wake up idle CPUs. - * - * @p may be inserted into a DSQ directly by calling - * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. - * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ - * of the CPU returned by this operation. 
- * - * Note that select_cpu() is never called for tasks that can only run - * on a single CPU or tasks with migration disabled, as they don't have - * the option to select a different CPU. See select_task_rq() for - * details. - */ - s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); - - /** - * @enqueue: Enqueue a task on the BPF scheduler - * @p: task being enqueued - * @enq_flags: %SCX_ENQ_* - * - * @p is ready to run. Insert directly into a DSQ by calling - * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly - * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, - * the task will stall. - * - * If @p was inserted into a DSQ from ops.select_cpu(), this callback is - * skipped. - */ - void (*enqueue)(struct task_struct *p, u64 enq_flags); - - /** - * @dequeue: Remove a task from the BPF scheduler - * @p: task being dequeued - * @deq_flags: %SCX_DEQ_* - * - * Remove @p from the BPF scheduler. This is usually called to isolate - * the task while updating its scheduling properties (e.g. priority). - * - * The ext core keeps track of whether the BPF side owns a given task or - * not and can gracefully ignore spurious dispatches from BPF side, - * which makes it safe to not implement this method. However, depending - * on the scheduling logic, this can lead to confusing behaviors - e.g. - * scheduling position not being updated across a priority change. - */ - void (*dequeue)(struct task_struct *p, u64 deq_flags); - - /** - * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs - * @cpu: CPU to dispatch tasks for - * @prev: previous task being switched out - * - * Called when a CPU's local dsq is empty. The operation should dispatch - * one or more tasks from the BPF scheduler into the DSQs using - * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ - * using scx_bpf_dsq_move_to_local(). - * - * The maximum number of times scx_bpf_dsq_insert() can be called - * without an intervening scx_bpf_dsq_move_to_local() is specified by - * ops.dispatch_max_batch. See the comments on top of the two functions - * for more details. - * - * When not %NULL, @prev is an SCX task with its slice depleted. If - * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in - * @prev->scx.flags, it is not enqueued yet and will be enqueued after - * ops.dispatch() returns. To keep executing @prev, return without - * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. - */ - void (*dispatch)(s32 cpu, struct task_struct *prev); - - /** - * @tick: Periodic tick - * @p: task running currently - * - * This operation is called every 1/HZ seconds on CPUs which are - * executing an SCX task. Setting @p->scx.slice to 0 will trigger an - * immediate dispatch cycle on the CPU. - */ - void (*tick)(struct task_struct *p); - - /** - * @runnable: A task is becoming runnable on its associated CPU - * @p: task becoming runnable - * @enq_flags: %SCX_ENQ_* - * - * This and the following three functions can be used to track a task's - * execution state transitions. A task becomes ->runnable() on a CPU, - * and then goes through one or more ->running() and ->stopping() pairs - * as it runs on the CPU, and eventually becomes ->quiescent() when it's - * done running on the CPU. - * - * @p is becoming runnable on the CPU because it's - * - * - waking up (%SCX_ENQ_WAKEUP) - * - being moved from another CPU - * - being restored after temporarily taken off the queue for an - * attribute change. 
- * - * This and ->enqueue() are related but not coupled. This operation - * notifies @p's state transition and may not be followed by ->enqueue() - * e.g. when @p is being dispatched to a remote CPU, or when @p is - * being enqueued on a CPU experiencing a hotplug event. Likewise, a - * task may be ->enqueue()'d without being preceded by this operation - * e.g. after exhausting its slice. - */ - void (*runnable)(struct task_struct *p, u64 enq_flags); - - /** - * @running: A task is starting to run on its associated CPU - * @p: task starting to run - * - * Note that this callback may be called from a CPU other than the - * one the task is going to run on. This can happen when a task - * property is changed (i.e., affinity), since scx_next_task_scx(), - * which triggers this callback, may run on a CPU different from - * the task's assigned CPU. - * - * Therefore, always use scx_bpf_task_cpu(@p) to determine the - * target CPU the task is going to use. - * - * See ->runnable() for explanation on the task state notifiers. - */ - void (*running)(struct task_struct *p); - - /** - * @stopping: A task is stopping execution - * @p: task stopping to run - * @runnable: is task @p still runnable? - * - * Note that this callback may be called from a CPU other than the - * one the task was running on. This can happen when a task - * property is changed (i.e., affinity), since dequeue_task_scx(), - * which triggers this callback, may run on a CPU different from - * the task's assigned CPU. - * - * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU - * the task was running on. - * - * See ->runnable() for explanation on the task state notifiers. If - * !@runnable, ->quiescent() will be invoked after this operation - * returns. - */ - void (*stopping)(struct task_struct *p, bool runnable); - - /** - * @quiescent: A task is becoming not runnable on its associated CPU - * @p: task becoming not runnable - * @deq_flags: %SCX_DEQ_* - * - * See ->runnable() for explanation on the task state notifiers. - * - * @p is becoming quiescent on the CPU because it's - * - * - sleeping (%SCX_DEQ_SLEEP) - * - being moved to another CPU - * - being temporarily taken off the queue for an attribute change - * (%SCX_DEQ_SAVE) - * - * This and ->dequeue() are related but not coupled. This operation - * notifies @p's state transition and may not be preceded by ->dequeue() - * e.g. when @p is being dispatched to a remote CPU. - */ - void (*quiescent)(struct task_struct *p, u64 deq_flags); - - /** - * @yield: Yield CPU - * @from: yielding task - * @to: optional yield target task - * - * If @to is NULL, @from is yielding the CPU to other runnable tasks. - * The BPF scheduler should ensure that other available tasks are - * dispatched before the yielding task. Return value is ignored in this - * case. - * - * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf - * scheduler can implement the request, return %true; otherwise, %false. - */ - bool (*yield)(struct task_struct *from, struct task_struct *to); - - /** - * @core_sched_before: Task ordering for core-sched - * @a: task A - * @b: task B - * - * Used by core-sched to determine the ordering between two tasks. See - * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on - * core-sched. - * - * Both @a and @b are runnable and may or may not currently be queued on - * the BPF scheduler. Should return %true if @a should run before @b. - * %false if there's no required ordering or @b should run before @a. 
- * - * If not specified, the default is ordering them according to when they - * became runnable. - */ - bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); - - /** - * @set_weight: Set task weight - * @p: task to set weight for - * @weight: new weight [1..10000] - * - * Update @p's weight to @weight. - */ - void (*set_weight)(struct task_struct *p, u32 weight); - - /** - * @set_cpumask: Set CPU affinity - * @p: task to set CPU affinity for - * @cpumask: cpumask of cpus that @p can run on - * - * Update @p's CPU affinity to @cpumask. - */ - void (*set_cpumask)(struct task_struct *p, - const struct cpumask *cpumask); - - /** - * @update_idle: Update the idle state of a CPU - * @cpu: CPU to update the idle state for - * @idle: whether entering or exiting the idle state - * - * This operation is called when @rq's CPU goes or leaves the idle - * state. By default, implementing this operation disables the built-in - * idle CPU tracking and the following helpers become unavailable: - * - * - scx_bpf_select_cpu_dfl() - * - scx_bpf_select_cpu_and() - * - scx_bpf_test_and_clear_cpu_idle() - * - scx_bpf_pick_idle_cpu() - * - * The user also must implement ops.select_cpu() as the default - * implementation relies on scx_bpf_select_cpu_dfl(). - * - * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle - * tracking. - */ - void (*update_idle)(s32 cpu, bool idle); - - /** - * @cpu_acquire: A CPU is becoming available to the BPF scheduler - * @cpu: The CPU being acquired by the BPF scheduler. - * @args: Acquire arguments, see the struct definition. - * - * A CPU that was previously released from the BPF scheduler is now once - * again under its control. - */ - void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); - - /** - * @cpu_release: A CPU is taken away from the BPF scheduler - * @cpu: The CPU being released by the BPF scheduler. - * @args: Release arguments, see the struct definition. - * - * The specified CPU is no longer under the control of the BPF - * scheduler. This could be because it was preempted by a higher - * priority sched_class, though there may be other reasons as well. The - * caller should consult @args->reason to determine the cause. - */ - void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); - - /** - * @init_task: Initialize a task to run in a BPF scheduler - * @p: task to initialize for BPF scheduling - * @args: init arguments, see the struct definition - * - * Either we're loading a BPF scheduler or a new task is being forked. - * Initialize @p for BPF scheduling. This operation may block and can - * be used for allocations, and is called exactly once for a task. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During a fork, it - * will abort that specific fork. - */ - s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); - - /** - * @exit_task: Exit a previously-running task from the system - * @p: task to exit - * @args: exit arguments, see the struct definition - * - * @p is exiting or the BPF scheduler is being unloaded. Perform any - * necessary cleanup for @p. - */ - void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); - - /** - * @enable: Enable BPF scheduling for a task - * @p: task to enable BPF scheduling for - * - * Enable @p for BPF scheduling. enable() is called on @p any time it - * enters SCX, and is always paired with a matching disable(). 
- */ - void (*enable)(struct task_struct *p); - - /** - * @disable: Disable BPF scheduling for a task - * @p: task to disable BPF scheduling for - * - * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. - * Disable BPF scheduling for @p. A disable() call is always matched - * with a prior enable() call. - */ - void (*disable)(struct task_struct *p); - - /** - * @dump: Dump BPF scheduler state on error - * @ctx: debug dump context - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. - */ - void (*dump)(struct scx_dump_ctx *ctx); - - /** - * @dump_cpu: Dump BPF scheduler state for a CPU on error - * @ctx: debug dump context - * @cpu: CPU to generate debug dump for - * @idle: @cpu is currently idle without any runnable tasks - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @cpu. If @idle is %true and this operation doesn't produce any - * output, @cpu is skipped for dump. - */ - void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); - - /** - * @dump_task: Dump BPF scheduler state for a runnable task on error - * @ctx: debug dump context - * @p: runnable task to generate debug dump for - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @p. - */ - void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); - -#ifdef CONFIG_EXT_GROUP_SCHED - /** - * @cgroup_init: Initialize a cgroup - * @cgrp: cgroup being initialized - * @args: init arguments, see the struct definition - * - * Either the BPF scheduler is being loaded or @cgrp created, initialize - * @cgrp for sched_ext. This operation may block. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During cgroup - * creation, it will abort the specific cgroup creation. - */ - s32 (*cgroup_init)(struct cgroup *cgrp, - struct scx_cgroup_init_args *args); - - /** - * @cgroup_exit: Exit a cgroup - * @cgrp: cgroup being exited - * - * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit - * @cgrp for sched_ext. This operation my block. - */ - void (*cgroup_exit)(struct cgroup *cgrp); - - /** - * @cgroup_prep_move: Prepare a task to be moved to a different cgroup - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Prepare @p for move from cgroup @from to @to. This operation may - * block and can be used for allocations. - * - * Return 0 for success, -errno for failure. An error return aborts the - * migration. - */ - s32 (*cgroup_prep_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_move: Commit cgroup move - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Commit the move. @p is dequeued during this operation. - */ - void (*cgroup_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_cancel_move: Cancel cgroup move - * @p: task whose cgroup move is being canceled - * @from: cgroup @p was being moved from - * @to: cgroup @p was being moved to - * - * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). - * Undo the preparation. - */ - void (*cgroup_cancel_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_set_weight: A cgroup's weight is being changed - * @cgrp: cgroup whose weight is being updated - * @weight: new weight [1..10000] - * - * Update @cgrp's weight to @weight. 
- */ - void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); - - /** - * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed - * @cgrp: cgroup whose bandwidth is being updated - * @period_us: bandwidth control period - * @quota_us: bandwidth control quota - * @burst_us: bandwidth control burst - * - * Update @cgrp's bandwidth control parameters. This is from the cpu.max - * cgroup interface. - * - * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled - * to. For example, if @period_us is 1_000_000 and @quota_us is - * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be - * interpreted in the same fashion and specifies how much @cgrp can - * burst temporarily. The specific control mechanism and thus the - * interpretation of @period_us and burstiness is upto to the BPF - * scheduler. - */ - void (*cgroup_set_bandwidth)(struct cgroup *cgrp, - u64 period_us, u64 quota_us, u64 burst_us); - -#endif /* CONFIG_EXT_GROUP_SCHED */ - - /* - * All online ops must come before ops.cpu_online(). - */ - - /** - * @cpu_online: A CPU became online - * @cpu: CPU which just came up - * - * @cpu just came online. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs beforehand. - */ - void (*cpu_online)(s32 cpu); - - /** - * @cpu_offline: A CPU is going offline - * @cpu: CPU which is going offline - * - * @cpu is going offline. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs afterwards. - */ - void (*cpu_offline)(s32 cpu); - - /* - * All CPU hotplug ops must come before ops.init(). - */ - - /** - * @init: Initialize the BPF scheduler - */ - s32 (*init)(void); - - /** - * @exit: Clean up after the BPF scheduler - * @info: Exit info - * - * ops.exit() is also called on ops.init() failure, which is a bit - * unusual. This is to allow rich reporting through @info on how - * ops.init() failed. - */ - void (*exit)(struct scx_exit_info *info); - - /** - * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch - */ - u32 dispatch_max_batch; - - /** - * @flags: %SCX_OPS_* flags - */ - u64 flags; - - /** - * @timeout_ms: The maximum amount of time, in milliseconds, that a - * runnable task should be able to wait before being scheduled. The - * maximum timeout may not exceed the default timeout of 30 seconds. - * - * Defaults to the maximum allowed timeout value of 30 seconds. - */ - u32 timeout_ms; - - /** - * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default - * value of 32768 is used. - */ - u32 exit_dump_len; - - /** - * @hotplug_seq: A sequence number that may be set by the scheduler to - * detect when a hotplug event has occurred during the loading process. - * If 0, no detection occurs. Otherwise, the scheduler will fail to - * load if the sequence number does not match @scx_hotplug_seq on the - * enable path. - */ - u64 hotplug_seq; - - /** - * @name: BPF scheduler's name - * - * Must be a non-zero valid BPF object name including only isalnum(), - * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the - * BPF scheduler is enabled. - */ - char name[SCX_OPS_NAME_LEN]; - - /* internal use only, must be NULL */ - void *priv; -}; - -enum scx_opi { - SCX_OPI_BEGIN = 0, - SCX_OPI_NORMAL_BEGIN = 0, - SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), - SCX_OPI_END = SCX_OP_IDX(init), -}; - -/* - * Collection of event counters. 
Event types are placed in descending order. - */ -struct scx_event_stats { - /* - * If ops.select_cpu() returns a CPU which can't be used by the task, - * the core scheduler code silently picks a fallback CPU. - */ - s64 SCX_EV_SELECT_CPU_FALLBACK; - - /* - * When dispatching to a local DSQ, the CPU may have gone offline in - * the meantime. In this case, the task is bounced to the global DSQ. - */ - s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; - - /* - * If SCX_OPS_ENQ_LAST is not set, the number of times that a task - * continued to run because there were no other tasks on the CPU. - */ - s64 SCX_EV_DISPATCH_KEEP_LAST; - - /* - * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task - * is dispatched to a local DSQ when exiting. - */ - s64 SCX_EV_ENQ_SKIP_EXITING; - - /* - * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a - * migration disabled task skips ops.enqueue() and is dispatched to its - * local DSQ. - */ - s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; - - /* - * Total number of times a task's time slice was refilled with the - * default value (SCX_SLICE_DFL). - */ - s64 SCX_EV_REFILL_SLICE_DFL; - - /* - * The total duration of bypass modes in nanoseconds. - */ - s64 SCX_EV_BYPASS_DURATION; - - /* - * The number of tasks dispatched in the bypassing mode. - */ - s64 SCX_EV_BYPASS_DISPATCH; - - /* - * The number of times the bypassing mode has been activated. - */ - s64 SCX_EV_BYPASS_ACTIVATE; -}; - -struct scx_sched { - struct sched_ext_ops ops; - DECLARE_BITMAP(has_op, SCX_OPI_END); - - /* - * Dispatch queues. - * - * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. - * This is to avoid live-locking in bypass mode where all tasks are - * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If - * per-node split isn't sufficient, it can be further split. - */ - struct rhashtable dsq_hash; - struct scx_dispatch_q **global_dsqs; - - /* - * The event counters are in a per-CPU variable to minimize the - * accounting overhead. A system-wide view on the event counter is - * constructed when requested by scx_bpf_events(). - */ - struct scx_event_stats __percpu *event_stats_cpu; - - bool warned_zero_slice; - - atomic_t exit_kind; - struct scx_exit_info *exit_info; - - struct kobject kobj; - - struct kthread_worker *helper; - struct irq_work error_irq_work; - struct kthread_work disable_work; - struct rcu_work rcu_work; -}; - -enum scx_wake_flags { - /* expose select WF_* flags as enums */ - SCX_WAKE_FORK = WF_FORK, - SCX_WAKE_TTWU = WF_TTWU, - SCX_WAKE_SYNC = WF_SYNC, -}; - -enum scx_enq_flags { - /* expose select ENQUEUE_* flags as enums */ - SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, - SCX_ENQ_HEAD = ENQUEUE_HEAD, - SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, - - /* high 32bits are SCX specific */ - - /* - * Set the following to trigger preemption when calling - * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the - * current task is cleared to zero and the CPU is kicked into the - * scheduling path. Implies %SCX_ENQ_HEAD. - */ - SCX_ENQ_PREEMPT = 1LLU << 32, - - /* - * The task being enqueued was previously enqueued on the current CPU's - * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was - * invoked in a ->cpu_release() callback, and the task is again - * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the - * task will not be scheduled on the CPU until at least the next invocation - * of the ->cpu_acquire() callback. 
- */ - SCX_ENQ_REENQ = 1LLU << 40, - - /* - * The task being enqueued is the only task available for the cpu. By - * default, ext core keeps executing such tasks but when - * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the - * %SCX_ENQ_LAST flag set. - * - * The BPF scheduler is responsible for triggering a follow-up - * scheduling event. Otherwise, Execution may stall. - */ - SCX_ENQ_LAST = 1LLU << 41, - - /* high 8 bits are internal */ - __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, - - SCX_ENQ_CLEAR_OPSS = 1LLU << 56, - SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -}; - -enum scx_deq_flags { - /* expose select DEQUEUE_* flags as enums */ - SCX_DEQ_SLEEP = DEQUEUE_SLEEP, - - /* high 32bits are SCX specific */ - - /* - * The generic core-sched layer decided to execute the task even though - * it hasn't been dispatched yet. Dequeue from the BPF side. - */ - SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -}; - -enum scx_pick_idle_cpu_flags { - SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ - SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ -}; - -enum scx_kick_flags { - /* - * Kick the target CPU if idle. Guarantees that the target CPU goes - * through at least one full scheduling cycle before going idle. If the - * target CPU can be determined to be currently not idle and going to go - * through a scheduling cycle before going idle, noop. - */ - SCX_KICK_IDLE = 1LLU << 0, - - /* - * Preempt the current task and execute the dispatch path. If the - * current task of the target CPU is an SCX task, its ->scx.slice is - * cleared to zero before the scheduling path is invoked so that the - * task expires and the dispatch path is invoked. - */ - SCX_KICK_PREEMPT = 1LLU << 1, - - /* - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will - * return after the target CPU finishes picking the next task. - */ - SCX_KICK_WAIT = 1LLU << 2, -}; - -enum scx_tg_flags { - SCX_TG_ONLINE = 1U << 0, - SCX_TG_INITED = 1U << 1, -}; - -enum scx_enable_state { - SCX_ENABLING, - SCX_ENABLED, - SCX_DISABLING, - SCX_DISABLED, -}; - -static const char *scx_enable_state_str[] = { - [SCX_ENABLING] = "enabling", - [SCX_ENABLED] = "enabled", - [SCX_DISABLING] = "disabling", - [SCX_DISABLED] = "disabled", -}; - -/* - * sched_ext_entity->ops_state - * - * Used to track the task ownership between the SCX core and the BPF scheduler. - * State transitions look as follows: - * - * NONE -> QUEUEING -> QUEUED -> DISPATCHING - * ^ | | - * | v v - * \-------------------------------/ - * - * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call - * sites for explanations on the conditions being waited upon and why they are - * safe. Transitions out of them into NONE or QUEUED must store_release and the - * waiters should load_acquire. - * - * Tracking scx_ops_state enables sched_ext core to reliably determine whether - * any given task can be dispatched by the BPF scheduler at all times and thus - * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler - * to try to dispatch any task anytime regardless of its state as the SCX core - * can safely reject invalid dispatches. 
- */ -enum scx_ops_state { - SCX_OPSS_NONE, /* owned by the SCX core */ - SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ - SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ - SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ - - /* - * QSEQ brands each QUEUED instance so that, when dispatch races - * dequeue/requeue, the dispatcher can tell whether it still has a claim - * on the task being dispatched. - * - * As some 32bit archs can't do 64bit store_release/load_acquire, - * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on - * 32bit machines. The dispatch race window QSEQ protects is very narrow - * and runs with IRQ disabled. 30 bits should be sufficient. - */ - SCX_OPSS_QSEQ_SHIFT = 2, -}; - -/* Use macros to ensure that the type is unsigned long for the masks */ -#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) - /* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 292bb41a242ec1..33858607bc97f5 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,29 +8,6 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT -static inline bool scx_kf_allowed_if_unlocked(void) -{ - return !current->scx.kf_mask; -} - -static inline bool scx_rq_bypassing(struct rq *rq) -{ - return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); -} - -DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); - -DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); - -/* - * Return the rq currently locked from an scx callback, or NULL if no rq is - * locked. - */ -static inline struct rq *scx_locked_rq(void) -{ - return __this_cpu_read(scx_locked_rq_state); -} - void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h new file mode 100644 index 00000000000000..76690ede8700ff --- /dev/null +++ b/kernel/sched/ext_internal.h @@ -0,0 +1,1061 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Tejun Heo + */ +#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) + +enum scx_consts { + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_TASK_ITER_BATCH = 32, +}; + +enum scx_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ + SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ + SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + +/* + * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), + * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. 
The codes + * are 64bit of the format: + * + * Bits: [63 .. 48 47 .. 32 31 .. 0] + * [ SYS ACT ] [ SYS RSN ] [ USR ] + * + * SYS ACT: System-defined exit actions + * SYS RSN: System-defined exit reasons + * USR : User-defined exit codes and reasons + * + * Using the above, users may communicate intention and context by ORing system + * actions and/or system reasons with a user-defined exit code. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +/* + * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is + * being disabled. + */ +struct scx_exit_info { + /* %SCX_EXIT_* - broad category of the exit reason */ + enum scx_exit_kind kind; + + /* exit code if gracefully exiting */ + s64 exit_code; + + /* textual representation of the above */ + const char *reason; + + /* backtrace if exiting due to an error */ + unsigned long *bt; + u32 bt_len; + + /* informational message */ + char *msg; + + /* debug dump */ + char *dump; +}; + +/* sched_ext_ops.flags */ +enum scx_ops_flags { + /* + * Keep built-in idle tracking even if ops.update_idle() is implemented. + */ + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + + /* + * By default, if there are no other task to run on the CPU, ext core + * keeps running the current task even after its slice expires. If this + * flag is specified, such tasks are passed to ops.enqueue() with + * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. + */ + SCX_OPS_ENQ_LAST = 1LLU << 1, + + /* + * An exiting task may schedule after PF_EXITING is set. In such cases, + * bpf_task_from_pid() may not be able to find the task and if the BPF + * scheduler depends on pid lookup for dispatching, the task will be + * lost leading to various issues including RCU grace period stalls. + * + * To mask this problem, by default, unhashed tasks are automatically + * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't + * depend on pid lookups and wants to handle these tasks directly, the + * following flag can be used. + */ + SCX_OPS_ENQ_EXITING = 1LLU << 2, + + /* + * If set, only tasks with policy set to SCHED_EXT are attached to + * sched_ext. If clear, SCHED_NORMAL tasks are also included. + */ + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). + */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the ops.select_cpu() selected or the wakee's + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline + * transfers. When this optimization is enabled, ops.select_cpu() is + * skipped in some cases (when racing against the wakee switching out). + * As the BPF scheduler may depend on ops.select_cpu() being invoked + * during wakeups, queued wakeup is disabled by default. 
+ * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. + */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, + + /* + * CPU cgroup support flags + */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ + + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | + SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | + SCX_OPS_HAS_CGROUP_WEIGHT, + + /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ + __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, + + SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, +}; + +/* argument container for ops.init_task() */ +struct scx_init_task_args { + /* + * Set if ops.init_task() is being invoked on the fork path, as opposed + * to the scheduler transition path. + */ + bool fork; +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif +}; + +/* argument container for ops.exit_task() */ +struct scx_exit_task_args { + /* Whether the task exited before running on sched_ext. */ + bool cancelled; +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; + + /* bandwidth control parameters from cpu.max and cpu.max.burst */ + u64 bw_period_us; + u64 bw_quota_us; + u64 bw_burst_us; +}; + +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + +/* + * Informational context provided to dump operations. + */ +struct scx_dump_ctx { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + u64 at_ns; + u64 at_jiffies; +}; + +/** + * struct sched_ext_ops - Operation table for BPF scheduler implementation + * + * A BPF scheduler can implement an arbitrary scheduling policy by + * implementing and loading operations in this table. Note that a userland + * scheduling policy can also be implemented using the BPF scheduler + * as a shim layer. + */ +struct sched_ext_ops { + /** + * @select_cpu: Pick the target CPU for a task which is being woken up + * @p: task being woken up + * @prev_cpu: the cpu @p was on before sleeping + * @wake_flags: SCX_WAKE_* + * + * Decision made here isn't final. @p may be moved to any CPU while it + * is getting dispatched for execution later. However, as @p is not on + * the rq at this point, getting the eventual execution CPU right here + * saves a small bit of overhead down the line. 
+ * + * If an idle CPU is returned, the CPU is kicked and will try to + * dispatch. While an explicit custom mechanism can be added, + * select_cpu() serves as the default way to wake up idle CPUs. + * + * @p may be inserted into a DSQ directly by calling + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ + * of the CPU returned by this operation. + * + * Note that select_cpu() is never called for tasks that can only run + * on a single CPU or tasks with migration disabled, as they don't have + * the option to select a different CPU. See select_task_rq() for + * details. + */ + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); + + /** + * @enqueue: Enqueue a task on the BPF scheduler + * @p: task being enqueued + * @enq_flags: %SCX_ENQ_* + * + * @p is ready to run. Insert directly into a DSQ by calling + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, + * the task will stall. + * + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is + * skipped. + */ + void (*enqueue)(struct task_struct *p, u64 enq_flags); + + /** + * @dequeue: Remove a task from the BPF scheduler + * @p: task being dequeued + * @deq_flags: %SCX_DEQ_* + * + * Remove @p from the BPF scheduler. This is usually called to isolate + * the task while updating its scheduling properties (e.g. priority). + * + * The ext core keeps track of whether the BPF side owns a given task or + * not and can gracefully ignore spurious dispatches from BPF side, + * which makes it safe to not implement this method. However, depending + * on the scheduling logic, this can lead to confusing behaviors - e.g. + * scheduling position not being updated across a priority change. + */ + void (*dequeue)(struct task_struct *p, u64 deq_flags); + + /** + * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs + * @cpu: CPU to dispatch tasks for + * @prev: previous task being switched out + * + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ + * using scx_bpf_dsq_move_to_local(). + * + * The maximum number of times scx_bpf_dsq_insert() can be called + * without an intervening scx_bpf_dsq_move_to_local() is specified by + * ops.dispatch_max_batch. See the comments on top of the two functions + * for more details. + * + * When not %NULL, @prev is an SCX task with its slice depleted. If + * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in + * @prev->scx.flags, it is not enqueued yet and will be enqueued after + * ops.dispatch() returns. To keep executing @prev, return without + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. + */ + void (*dispatch)(s32 cpu, struct task_struct *prev); + + /** + * @tick: Periodic tick + * @p: task running currently + * + * This operation is called every 1/HZ seconds on CPUs which are + * executing an SCX task. Setting @p->scx.slice to 0 will trigger an + * immediate dispatch cycle on the CPU. + */ + void (*tick)(struct task_struct *p); + + /** + * @runnable: A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. 
A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. + * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU, or when @p is + * being enqueued on a CPU experiencing a hotplug event. Likewise, a + * task may be ->enqueue()'d without being preceded by this operation + * e.g. after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * @running: A task is starting to run on its associated CPU + * @p: task starting to run + * + * Note that this callback may be called from a CPU other than the + * one the task is going to run on. This can happen when a task + * property is changed (i.e., affinity), since scx_next_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to determine the + * target CPU the task is going to use. + * + * See ->runnable() for explanation on the task state notifiers. + */ + void (*running)(struct task_struct *p); + + /** + * @stopping: A task is stopping execution + * @p: task stopping to run + * @runnable: is task @p still runnable? + * + * Note that this callback may be called from a CPU other than the + * one the task was running on. This can happen when a task + * property is changed (i.e., affinity), since dequeue_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU + * the task was running on. + * + * See ->runnable() for explanation on the task state notifiers. If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * @quiescent: A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers. + * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. + */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + + /** + * @yield: Yield CPU + * @from: yielding task + * @to: optional yield target task + * + * If @to is NULL, @from is yielding the CPU to other runnable tasks. + * The BPF scheduler should ensure that other available tasks are + * dispatched before the yielding task. Return value is ignored in this + * case. + * + * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf + * scheduler can implement the request, return %true; otherwise, %false. 
+ */ + bool (*yield)(struct task_struct *from, struct task_struct *to); + + /** + * @core_sched_before: Task ordering for core-sched + * @a: task A + * @b: task B + * + * Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); + + /** + * @set_weight: Set task weight + * @p: task to set weight for + * @weight: new weight [1..10000] + * + * Update @p's weight to @weight. + */ + void (*set_weight)(struct task_struct *p, u32 weight); + + /** + * @set_cpumask: Set CPU affinity + * @p: task to set CPU affinity for + * @cpumask: cpumask of cpus that @p can run on + * + * Update @p's CPU affinity to @cpumask. + */ + void (*set_cpumask)(struct task_struct *p, + const struct cpumask *cpumask); + + /** + * @update_idle: Update the idle state of a CPU + * @cpu: CPU to update the idle state for + * @idle: whether entering or exiting the idle state + * + * This operation is called when @rq's CPU goes or leaves the idle + * state. By default, implementing this operation disables the built-in + * idle CPU tracking and the following helpers become unavailable: + * + * - scx_bpf_select_cpu_dfl() + * - scx_bpf_select_cpu_and() + * - scx_bpf_test_and_clear_cpu_idle() + * - scx_bpf_pick_idle_cpu() + * + * The user also must implement ops.select_cpu() as the default + * implementation relies on scx_bpf_select_cpu_dfl(). + * + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle + * tracking. + */ + void (*update_idle)(s32 cpu, bool idle); + + /** + * @cpu_acquire: A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * @cpu_release: A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + + /** + * @init_task: Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling + * @args: init arguments, see the struct definition + * + * Either we're loading a BPF scheduler or a new task is being forked. + * Initialize @p for BPF scheduling. This operation may block and can + * be used for allocations, and is called exactly once for a task. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During a fork, it + * will abort that specific fork. 
+ */ + s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); + + /** + * @exit_task: Exit a previously-running task from the system + * @p: task to exit + * @args: exit arguments, see the struct definition + * + * @p is exiting or the BPF scheduler is being unloaded. Perform any + * necessary cleanup for @p. + */ + void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); + + /** + * @enable: Enable BPF scheduling for a task + * @p: task to enable BPF scheduling for + * + * Enable @p for BPF scheduling. enable() is called on @p any time it + * enters SCX, and is always paired with a matching disable(). + */ + void (*enable)(struct task_struct *p); + + /** + * @disable: Disable BPF scheduling for a task + * @p: task to disable BPF scheduling for + * + * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. + * Disable BPF scheduling for @p. A disable() call is always matched + * with a prior enable() call. + */ + void (*disable)(struct task_struct *p); + + /** + * @dump: Dump BPF scheduler state on error + * @ctx: debug dump context + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. + */ + void (*dump)(struct scx_dump_ctx *ctx); + + /** + * @dump_cpu: Dump BPF scheduler state for a CPU on error + * @ctx: debug dump context + * @cpu: CPU to generate debug dump for + * @idle: @cpu is currently idle without any runnable tasks + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @cpu. If @idle is %true and this operation doesn't produce any + * output, @cpu is skipped for dump. + */ + void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); + + /** + * @dump_task: Dump BPF scheduler state for a runnable task on error + * @ctx: debug dump context + * @p: runnable task to generate debug dump for + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @p. + */ + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); + +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * @cgroup_init: Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * @cgroup_exit: Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation my block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * @cgroup_prep_move: Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_move: Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. 
+ */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_cancel_move: Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation. + */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_set_weight: A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @cgrp's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); + + /** + * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed + * @cgrp: cgroup whose bandwidth is being updated + * @period_us: bandwidth control period + * @quota_us: bandwidth control quota + * @burst_us: bandwidth control burst + * + * Update @cgrp's bandwidth control parameters. This is from the cpu.max + * cgroup interface. + * + * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled + * to. For example, if @period_us is 1_000_000 and @quota_us is + * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be + * interpreted in the same fashion and specifies how much @cgrp can + * burst temporarily. The specific control mechanism and thus the + * interpretation of @period_us and burstiness is upto to the BPF + * scheduler. + */ + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us); + +#endif /* CONFIG_EXT_GROUP_SCHED */ + + /* + * All online ops must come before ops.cpu_online(). + */ + + /** + * @cpu_online: A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * @cpu_offline: A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). + */ + + /** + * @init: Initialize the BPF scheduler + */ + s32 (*init)(void); + + /** + * @exit: Clean up after the BPF scheduler + * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed. + */ + void (*exit)(struct scx_exit_info *info); + + /** + * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch + */ + u32 dispatch_max_batch; + + /** + * @flags: %SCX_OPS_* flags + */ + u64 flags; + + /** + * @timeout_ms: The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + + /** + * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default + * value of 32768 is used. + */ + u32 exit_dump_len; + + /** + * @hotplug_seq: A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. 
Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + + /** + * @name: BPF scheduler's name + * + * Must be a non-zero valid BPF object name including only isalnum(), + * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the + * BPF scheduler is enabled. + */ + char name[SCX_OPS_NAME_LEN]; + + /* internal use only, must be NULL */ + void *priv; +}; + +enum scx_opi { + SCX_OPI_BEGIN = 0, + SCX_OPI_NORMAL_BEGIN = 0, + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), + SCX_OPI_END = SCX_OP_IDX(init), +}; + +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + + /* + * Total number of times a task's time slice was refilled with the + * default value (SCX_SLICE_DFL). + */ + s64 SCX_EV_REFILL_SLICE_DFL; + + /* + * The total duration of bypass modes in nanoseconds. + */ + s64 SCX_EV_BYPASS_DURATION; + + /* + * The number of tasks dispatched in the bypassing mode. + */ + s64 SCX_EV_BYPASS_DISPATCH; + + /* + * The number of times the bypassing mode has been activated. + */ + s64 SCX_EV_BYPASS_ACTIVATE; +}; + +struct scx_sched { + struct sched_ext_ops ops; + DECLARE_BITMAP(has_op, SCX_OPI_END); + + /* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. + * This is to avoid live-locking in bypass mode where all tasks are + * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If + * per-node split isn't sufficient, it can be further split. + */ + struct rhashtable dsq_hash; + struct scx_dispatch_q **global_dsqs; + + /* + * The event counters are in a per-CPU variable to minimize the + * accounting overhead. A system-wide view on the event counter is + * constructed when requested by scx_bpf_events(). 
+ */ + struct scx_event_stats __percpu *event_stats_cpu; + + bool warned_zero_slice; + + atomic_t exit_kind; + struct scx_exit_info *exit_info; + + struct kobject kobj; + + struct kthread_worker *helper; + struct irq_work error_irq_work; + struct kthread_work disable_work; + struct rcu_work rcu_work; +}; + +enum scx_wake_flags { + /* expose select WF_* flags as enums */ + SCX_WAKE_FORK = WF_FORK, + SCX_WAKE_TTWU = WF_TTWU, + SCX_WAKE_SYNC = WF_SYNC, +}; + +enum scx_enq_flags { + /* expose select ENQUEUE_* flags as enums */ + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, + SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, + + /* high 32bits are SCX specific */ + + /* + * Set the following to trigger preemption when calling + * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + + /* + * The task being enqueued is the only task available for the CPU. By + * default, ext core keeps executing such tasks but when + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the + * %SCX_ENQ_LAST flag set. + * + * The BPF scheduler is responsible for triggering a follow-up + * scheduling event. Otherwise, execution may stall. + */ + SCX_ENQ_LAST = 1LLU << 41, + + /* high 8 bits are internal */ + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, + + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, +}; + +enum scx_deq_flags { + /* expose select DEQUEUE_* flags as enums */ + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +}; + +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ +}; + +enum scx_kick_flags { + /* + * Kick the target CPU if idle. Guarantees that the target CPU goes + * through at least one full scheduling cycle before going idle. If the + * target CPU can be determined to be currently not idle and going to go + * through a scheduling cycle before going idle, noop. + */ + SCX_KICK_IDLE = 1LLU << 0, + + /* + * Preempt the current task and execute the dispatch path. If the + * current task of the target CPU is an SCX task, its ->scx.slice is + * cleared to zero before the scheduling path is invoked so that the + * task expires and the dispatch path is invoked. + */ + SCX_KICK_PREEMPT = 1LLU << 1, + + /* + * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will + * return after the target CPU finishes picking the next task.
+ */ + SCX_KICK_WAIT = 1LLU << 2, +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + +enum scx_enable_state { + SCX_ENABLING, + SCX_ENABLED, + SCX_DISABLING, + SCX_DISABLED, +}; + +static const char *scx_enable_state_str[] = { + [SCX_ENABLING] = "enabling", + [SCX_ENABLED] = "enabled", + [SCX_DISABLING] = "disabling", + [SCX_DISABLED] = "disabled", +}; + +/* + * sched_ext_entity->ops_state + * + * Used to track the task ownership between the SCX core and the BPF scheduler. + * State transitions look as follows: + * + * NONE -> QUEUEING -> QUEUED -> DISPATCHING + * ^ | | + * | v v + * \-------------------------------/ + * + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call + * sites for explanations on the conditions being waited upon and why they are + * safe. Transitions out of them into NONE or QUEUED must store_release and the + * waiters should load_acquire. + * + * Tracking scx_ops_state enables sched_ext core to reliably determine whether + * any given task can be dispatched by the BPF scheduler at all times and thus + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler + * to try to dispatch any task anytime regardless of its state as the SCX core + * can safely reject invalid dispatches. + */ +enum scx_ops_state { + SCX_OPSS_NONE, /* owned by the SCX core */ + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ + + /* + * QSEQ brands each QUEUED instance so that, when dispatch races + * dequeue/requeue, the dispatcher can tell whether it still has a claim + * on the task being dispatched. + * + * As some 32bit archs can't do 64bit store_release/load_acquire, + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on + * 32bit machines. The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. + */ + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + +DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. + */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(scx_locked_rq_state); +} + +static inline bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + +static inline bool scx_rq_bypassing(struct rq *rq) +{ + return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} From bcb7c2305682c77a8bfdbfe37106b314ac10110f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2025 11:33:28 -1000 Subject: [PATCH 0806/3206] sched_ext: Put event_stats_cpu in struct scx_sched_pcpu scx_sched.event_stats_cpu holds the per-CPU counters that are used to track stats. Introduce struct scx_sched_pcpu and move the counters inside. This will ease adding more per-cpu fields. No functional changes.
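The shape of the change, condensed from the hunks below (a summary, not the full conversion):

    /* before: the event counters were the percpu allocation itself */
    struct scx_event_stats __percpu *event_stats_cpu;

    /* after: a container struct that future per-CPU fields can join,
     * sharing the single alloc_percpu()/free_percpu() pair
     */
    struct scx_sched_pcpu {
            struct scx_event_stats event_stats;
    };
    struct scx_sched_pcpu __percpu *pcpu;

    /* accessors gain one level of indirection */
    this_cpu_add(sch->pcpu->event_stats.name, cnt);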
Signed-off-by: Tejun Heo Acked-by: Andrea Righi --- kernel/sched/ext.c | 18 +++++++++--------- kernel/sched/ext_internal.h | 17 ++++++++++------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7e15e852370cd2..701ca239ad003a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -635,7 +635,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * This can be used when preemption is not disabled. */ #define scx_add_event(sch, name, cnt) do { \ - this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, (cnt)); \ } while(0) @@ -648,7 +648,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * This should be used only when preemption is disabled. */ #define __scx_add_event(sch, name, cnt) do { \ - __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, cnt); \ } while(0) @@ -3543,7 +3543,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work) int node; kthread_stop(sch->helper->task); - free_percpu(sch->event_stats_cpu); + free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -4444,13 +4444,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sch->global_dsqs[node] = dsq; } - sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); - if (!sch->event_stats_cpu) + sch->pcpu = alloc_percpu(struct scx_sched_pcpu); + if (!sch->pcpu) goto err_free_gdsqs; sch->helper = kthread_run_worker(0, "sched_ext_helper"); if (!sch->helper) - goto err_free_event_stats; + goto err_free_pcpu; sched_set_fifo(sch->helper->task); atomic_set(&sch->exit_kind, SCX_EXIT_NONE); @@ -4468,8 +4468,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) err_stop_helper: kthread_stop(sch->helper->task); -err_free_event_stats: - free_percpu(sch->event_stats_cpu); +err_free_pcpu: + free_percpu(sch->pcpu); err_free_gdsqs: for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -6493,7 +6493,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event /* Aggregate per-CPU event counters into @events. */ memset(events, 0, sizeof(*events)); for_each_possible_cpu(cpu) { - e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); + e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 76690ede8700ff..af4c054fb6f852 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -846,6 +846,15 @@ struct scx_event_stats { s64 SCX_EV_BYPASS_ACTIVATE; }; +struct scx_sched_pcpu { + /* + * The event counters are in a per-CPU variable to minimize the + * accounting overhead. A system-wide view on the event counter is + * constructed when requested by scx_bpf_events(). + */ + struct scx_event_stats event_stats; +}; + struct scx_sched { struct sched_ext_ops ops; DECLARE_BITMAP(has_op, SCX_OPI_END); @@ -860,13 +869,7 @@ struct scx_sched { */ struct rhashtable dsq_hash; struct scx_dispatch_q **global_dsqs; - - /* - * The event counters are in a per-CPU variable to minimize the - * accounting overhead. 
A system-wide view on the event counter is - * constructed when requested by scx_bpf_events(). - */ - struct scx_event_stats __percpu *event_stats_cpu; + struct scx_sched_pcpu __percpu *pcpu; bool warned_zero_slice; From 59ffc9beeb8b332940d36f4b9803352b7f893f5a Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Tue, 2 Sep 2025 09:11:08 -0400 Subject: [PATCH 0807/3206] selinux: fix sel_read_bool() allocation and error handling Switch sel_read_bool() from using get_zeroed_page() and free_page() to a stack-allocated buffer. This also fixes a memory leak in the error path when security_get_bool_value() returns an error. Reported-by: Matthew Wilcox Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/selinuxfs.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 9aa1d03ab6120a..232e087bce3eea 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1203,7 +1203,7 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { struct selinux_fs_info *fsi = file_inode(filep)->i_sb->s_fs_info; - char *page = NULL; + char buffer[4]; ssize_t length; ssize_t ret; int cur_enforcing; @@ -1217,27 +1217,19 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, fsi->bool_pending_names[index])) goto out_unlock; - ret = -ENOMEM; - page = (char *)get_zeroed_page(GFP_KERNEL); - if (!page) - goto out_unlock; - cur_enforcing = security_get_bool_value(index); if (cur_enforcing < 0) { ret = cur_enforcing; goto out_unlock; } - length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, - fsi->bool_pending_values[index]); + length = scnprintf(buffer, sizeof(buffer), "%d %d", !!cur_enforcing, + !!fsi->bool_pending_values[index]); mutex_unlock(&selinux_state.policy_mutex); - ret = simple_read_from_buffer(buf, count, ppos, page, length); -out_free: - free_page((unsigned long)page); - return ret; + return simple_read_from_buffer(buf, count, ppos, buffer, length); out_unlock: mutex_unlock(&selinux_state.policy_mutex); - goto out_free; + return ret; } static ssize_t sel_write_bool(struct file *filep, const char __user *buf, From a5bd6ba30b3364354269b81ac55c2edca9a96d6d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2025 11:36:07 -1000 Subject: [PATCH 0808/3206] sched_ext: Use cgroup_lock/unlock() to synchronize against cgroup operations SCX hooks into CPU cgroup controller operations and read-locks scx_cgroup_rwsem to exclude them while enabling and disabling schedulers. While this works, it's unnecessarily complicated given that cgroup_[un]lock() are available and thus the cgroup operations can be locked out that way. Drop scx_cgroup_rwsem locking from the tg on/offline and cgroup [can_]attach operations. Instead, grab cgroup_lock() from scx_cgroup_lock(). Drop scx_cgroup_finish_attach() which is no longer necessary. Drop the now unnecessary rcu locking and css ref bumping in scx_cgroup_init() and scx_cgroup_exit(). As scx_cgroup_set_weight/bandwidth() paths aren't protected by cgroup_lock(), rename scx_cgroup_rwsem to scx_cgroup_ops_rwsem and retain the locking there. This is overall simpler and will also allow enable/disable paths to synchronize against cgroup changes independent of the CPU controller.
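The resulting lock pairing, condensed from the hunks below:

    static void scx_cgroup_lock(void)
    {
            percpu_down_write(&scx_cgroup_ops_rwsem);
            cgroup_lock();
    }

    static void scx_cgroup_unlock(void)
    {
            cgroup_unlock();
            percpu_up_write(&scx_cgroup_ops_rwsem);
    }

With the write side also holding cgroup_lock(), the tg on/offline and [can_]attach paths are excluded without taking the rwsem themselves; only the set_weight/bandwidth paths, which run outside cgroup_lock(), keep read-locking the renamed scx_cgroup_ops_rwsem.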
Signed-off-by: Tejun Heo Cc: Peter Zijlstra Acked-by: Andrea Righi --- kernel/sched/core.c | 2 -- kernel/sched/ext.c | 66 ++++++++++----------------------------------- kernel/sched/ext.h | 2 -- 3 files changed, 14 insertions(+), 56 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4cc..27dda808ed8391 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9362,8 +9362,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task, false); - - scx_cgroup_finish_attach(); } static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 701ca239ad003a..520f20ffb7bfda 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3055,7 +3055,7 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED -DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); static bool scx_cgroup_enabled; void scx_tg_init(struct task_group *tg) @@ -3072,8 +3072,6 @@ int scx_tg_online(struct task_group *tg) WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); - percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled) { if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = @@ -3093,7 +3091,6 @@ int scx_tg_online(struct task_group *tg) tg->scx.flags |= SCX_TG_ONLINE; } - percpu_up_read(&scx_cgroup_rwsem); return ret; } @@ -3103,15 +3100,11 @@ void scx_tg_offline(struct task_group *tg) WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); - percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && (tg->scx.flags & SCX_TG_INITED)) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); - - percpu_up_read(&scx_cgroup_rwsem); } int scx_cgroup_can_attach(struct cgroup_taskset *tset) @@ -3121,9 +3114,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) struct task_struct *p; int ret; - /* released in scx_finish/cancel_attach() */ - percpu_down_read(&scx_cgroup_rwsem); - if (!scx_cgroup_enabled) return 0; @@ -3163,7 +3153,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) p->scx.cgrp_moving_from = NULL; } - percpu_up_read(&scx_cgroup_rwsem); return ops_sanitize_err(sch, "cgroup_prep_move", ret); } @@ -3186,11 +3175,6 @@ void scx_cgroup_move_task(struct task_struct *p) p->scx.cgrp_moving_from = NULL; } -void scx_cgroup_finish_attach(void) -{ - percpu_up_read(&scx_cgroup_rwsem); -} - void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) { struct scx_sched *sch = scx_root; @@ -3198,7 +3182,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) struct task_struct *p; if (!scx_cgroup_enabled) - goto out_unlock; + return; cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(sch, cgroup_cancel_move) && @@ -3207,15 +3191,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } -out_unlock: - percpu_up_read(&scx_cgroup_rwsem); } void scx_group_set_weight(struct task_group *tg, unsigned long weight) { struct scx_sched *sch = scx_root; - percpu_down_read(&scx_cgroup_rwsem); + percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) @@ -3224,7 +3206,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) tg->scx.weight = weight; - percpu_up_read(&scx_cgroup_rwsem); + 
percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_idle(struct task_group *tg, bool idle) @@ -3237,7 +3219,7 @@ void scx_group_set_bandwidth(struct task_group *tg, { struct scx_sched *sch = scx_root; - percpu_down_read(&scx_cgroup_rwsem); + percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && (tg->scx.bw_period_us != period_us || @@ -3250,23 +3232,25 @@ void scx_group_set_bandwidth(struct task_group *tg, tg->scx.bw_quota_us = quota_us; tg->scx.bw_burst_us = burst_us; - percpu_up_read(&scx_cgroup_rwsem); + percpu_up_read(&scx_cgroup_ops_rwsem); } static void scx_cgroup_lock(void) { - percpu_down_write(&scx_cgroup_rwsem); + percpu_down_write(&scx_cgroup_ops_rwsem); + cgroup_lock(); } static void scx_cgroup_unlock(void) { - percpu_up_write(&scx_cgroup_rwsem); + cgroup_unlock(); + percpu_up_write(&scx_cgroup_ops_rwsem); } #else /* CONFIG_EXT_GROUP_SCHED */ -static inline void scx_cgroup_lock(void) {} -static inline void scx_cgroup_unlock(void) {} +static void scx_cgroup_lock(void) {} +static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED */ @@ -3382,15 +3366,12 @@ static void scx_cgroup_exit(struct scx_sched *sch) { struct cgroup_subsys_state *css; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - scx_cgroup_enabled = false; /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and exit all the inited ones, all online cgroups are exited. */ - rcu_read_lock(); css_for_each_descendant_post(css, &root_task_group.css) { struct task_group *tg = css_tg(css); @@ -3401,17 +3382,9 @@ static void scx_cgroup_exit(struct scx_sched *sch) if (!sch->ops.cgroup_exit) continue; - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup); - - rcu_read_lock(); - css_put(css); } - rcu_read_unlock(); } static int scx_cgroup_init(struct scx_sched *sch) @@ -3419,13 +3392,10 @@ static int scx_cgroup_init(struct scx_sched *sch) struct cgroup_subsys_state *css; int ret; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and init, all online cgroups are initialized. 
*/ - rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); struct scx_cgroup_init_args args = { @@ -3444,10 +3414,6 @@ static int scx_cgroup_init(struct scx_sched *sch) continue; } - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { @@ -3456,11 +3422,7 @@ static int scx_cgroup_init(struct scx_sched *sch) return ret; } tg->scx.flags |= SCX_TG_INITED; - - rcu_read_lock(); - css_put(css); } - rcu_read_unlock(); WARN_ON_ONCE(scx_cgroup_enabled); scx_cgroup_enabled = true; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 33858607bc97f5..43429b33e52c11 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -77,7 +77,6 @@ int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); void scx_cgroup_move_task(struct task_struct *p); -void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); @@ -88,7 +87,6 @@ static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } static inline void scx_cgroup_move_task(struct task_struct *p) {} -static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} From e0ca169638be12a0a861e3439e6117c58972cd08 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Wed, 3 Sep 2025 22:23:09 +0100 Subject: [PATCH 0809/3206] sched_ext: Introduce scx_bpf_locked_rq() Most fields of the rq returned by scx_bpf_cpu_rq() assume that its rq_lock is held, and they become meaningless without the rq lock, too. Make a safer version of scx_bpf_cpu_rq() that only returns an rq if we hold that rq's lock. Also mark the new scx_bpf_locked_rq() as possibly returning NULL, as scx_bpf_cpu_rq() should have been. Signed-off-by: Christian Loehle Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 23 +++++++++++++++++++++ tools/sched_ext/include/scx/common.bpf.h | 1 + 2 files changed, 24 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 520f20ffb7bfda..a319ea5bb25a3d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6357,6 +6357,28 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) return cpu_rq(cpu); } +/** + * scx_bpf_locked_rq - Return the rq currently locked by SCX + * + * Returns the rq if a rq lock is currently held by SCX. + * Otherwise emits an error and returns NULL.
+ */ +__bpf_kfunc struct rq *scx_bpf_locked_rq(void) +{ + struct rq *rq; + + preempt_disable(); + rq = scx_locked_rq(); + if (!rq) { + preempt_enable(); + scx_kf_error("accessing rq without holding rq lock"); + return NULL; + } + preempt_enable(); + + return rq; +} + /** * scx_bpf_task_cgroup - Return the sched cgroup of a task * @p: task of interest @@ -6521,6 +6543,7 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) +BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) #ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 86abdb3c3142ac..e2a57b7458625b 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -103,6 +103,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; +struct rq *scx_bpf_locked_rq(void) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; From 20b158094a1adc9bbfdcc41780059b5cd8866ad8 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Wed, 3 Sep 2025 22:23:10 +0100 Subject: [PATCH 0810/3206] sched_ext: Introduce scx_bpf_cpu_curr() Provide scx_bpf_cpu_curr() as a way for scx schedulers to check the curr task of a remote rq without assuming its lock is held. Many scx schedulers make use of scx_bpf_cpu_rq() to check a remote curr (e.g. to see if it should be preempted). This is problematic because scx_bpf_cpu_rq() provides access to all fields of struct rq, most of which aren't safe to use without holding the associated rq lock. Signed-off-by: Christian Loehle Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 14 ++++++++++++++ tools/sched_ext/include/scx/common.bpf.h | 1 + 2 files changed, 15 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a319ea5bb25a3d..4bd9b491b376ac 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6379,6 +6379,19 @@ __bpf_kfunc struct rq *scx_bpf_locked_rq(void) return rq; } +/** + * scx_bpf_cpu_curr - Return remote CPU's curr task + * @cpu: CPU of interest + * + * Callers must hold RCU read lock (KF_RCU). 
+ */ +__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) +{ + if (!kf_cpu_valid(cpu, NULL)) + return NULL; + return rcu_dereference(cpu_rq(cpu)->curr); +} + /** * scx_bpf_task_cgroup - Return the sched cgroup of a task * @p: task of interest @@ -6544,6 +6557,7 @@ BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU) #ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index e2a57b7458625b..342c7c48df5ab6 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -104,6 +104,7 @@ bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct rq *scx_bpf_locked_rq(void) __ksym; +struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; From 5c48d88fe004988ec508923dbdd00549e65f4055 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Wed, 3 Sep 2025 22:23:11 +0100 Subject: [PATCH 0811/3206] sched_ext: deprecation warn for scx_bpf_cpu_rq() scx_bpf_cpu_rq() works on an unlocked rq which generally isn't safe. For the common use-cases scx_bpf_locked_rq() and scx_bpf_cpu_curr() work, so add a deprecation warning to scx_bpf_cpu_rq() so it can eventually be removed. Signed-off-by: Christian Loehle Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 9 +++++++++ kernel/sched/ext_internal.h | 1 + 2 files changed, 10 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4bd9b491b376ac..4160a4a7af67e6 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6351,9 +6351,18 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) */ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) { + struct scx_sched *sch = scx_root; + if (!kf_cpu_valid(cpu, NULL)) return NULL; + if (!sch->warned_deprecated_rq) { + printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " + "use scx_bpf_locked_rq() when holding rq lock " + "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); + sch->warned_deprecated_rq = true; + } + return cpu_rq(cpu); } diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index af4c054fb6f852..2e289931e567fe 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -872,6 +872,7 @@ struct scx_sched { struct scx_sched_pcpu __percpu *pcpu; bool warned_zero_slice; + bool warned_deprecated_rq; atomic_t exit_kind; struct scx_exit_info *exit_info; From 2a912258c90e895363c0ffc0be8a47f112ab67b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20B=2E=20Marli=C3=A8re?= Date: Thu, 28 Aug 2025 15:48:30 -0300 Subject: [PATCH 0812/3206] selftests/bpf: Upon failures, exit with code 1 in test_xsk.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, even if some subtests fail, the end result will still yield "ok 1 selftests: bpf: test_xsk.sh". Fix it by exiting with 1 if there are any failures. Signed-off-by: Ricardo B.
Marlière Signed-off-by: Andrii Nakryiko Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20250828-selftests-bpf-test_xsk_ret-v1-1-e6656c01f397@suse.com --- tools/testing/selftests/bpf/test_xsk.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 65aafe0003db05..62db060298a4a3 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -241,4 +241,6 @@ done if [ $failures -eq 0 ]; then echo "All tests successful!" +else + exit 1 fi From c9110e6f7237f4a314e2b87b75a8a158b9877a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20B=2E=20Marli=C3=A8re?= Date: Fri, 29 Aug 2025 16:33:49 -0300 Subject: [PATCH 0813/3206] selftests/bpf: Fix count write in testapp_xdp_metadata_copy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4b302092553c ("selftests/xsk: Add tail adjustment tests and support check") added a new global to xsk_xdp_progs.c, but left out the access in the testapp_xdp_metadata_copy() function. Since bpf_map_update_elem() will write to the whole bss section, it gets truncated. Fix by writing to skel_rx->bss->count directly. Fixes: 4b302092553c ("selftests/xsk: Add tail adjustment tests and support check") Signed-off-by: Ricardo B. Marlière Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250829-selftests-bpf-xsk_regression_fix-v1-1-5f5acdb9fe6b@suse.com --- tools/testing/selftests/bpf/xskxceiver.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index a29de0713f19f0..352adc8df2d1cd 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -2276,25 +2276,13 @@ static int testapp_xdp_metadata_copy(struct test_spec *test) { struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; - struct bpf_map *data_map; - int count = 0; - int key = 0; test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_populate_metadata, skel_tx->progs.xsk_xdp_populate_metadata, skel_rx->maps.xsk, skel_tx->maps.xsk); test->ifobj_rx->use_metadata = true; - data_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss"); - if (!data_map || !bpf_map__is_internal(data_map)) { - ksft_print_msg("Error: could not find bss section of XDP program\n"); - return TEST_FAILURE; - } - - if (bpf_map_update_elem(bpf_map__fd(data_map), &key, &count, BPF_ANY)) { - ksft_print_msg("Error: could not update count element\n"); - return TEST_FAILURE; - } + skel_rx->bss->count = 0; return testapp_validate_traffic(test); } From 394bfac1c7f7b701c2c93834c5761b9c9ceeebcf Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 22 Aug 2025 06:33:18 +0000 Subject: [PATCH 0814/3206] mm/khugepaged: fix the address passed to notifier on testing young Commit 8ee53820edfd ("thp: mmu_notifier_test_young") introduced mmu_notifier_test_young(), but we are passing the wrong address. In xxx_scan_pmd(), the actual iteration address is "_address" not "address". We seem to have misused the variable from the very beginning. Change it to the right one.
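The relevant loop shape, paraphrased from hpage_collapse_scan_pmd(): "address" stays fixed at the start of the PMD range while "_address" tracks the PTE currently being examined, so the notifier must be given the latter:

    for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
         _pte++, _address += PAGE_SIZE) {
            ...
            /* test the current PTE's address, not the range start */
            mmu_notifier_test_young(vma->vm_mm, _address);
    }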
[akpm@linux-foundation.org fix whitespace, per everyone] Link: https://lkml.kernel.org/r/20250822063318.11644-1-richard.weiyang@gmail.com Fixes: 8ee53820edfd ("thp: mmu_notifier_test_young") Signed-off-by: Wei Yang Reviewed-by: Dev Jain Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Liam R. Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Barry Song Cc: Signed-off-by: Andrew Morton --- mm/khugepaged.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b40bdfd224c3c..b486c1d19b2dd2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1417,8 +1417,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || - folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, - address))) + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, _address))) referenced++; } if (!writable) { From 397f6d14f9c370e4910e6885294c340f39dedbf5 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Fri, 27 Jun 2025 20:57:47 +0800 Subject: [PATCH 0815/3206] mm/memory_hotplug: fix hwpoisoned large folio handling in do_migrate_range() In do_migrate_range(), the hwpoisoned folio may be a large folio, which can't be handled by unmap_poisoned_folio(). I can reproduce this issue in qemu after adding a delay in memory_failure(): BUG: kernel NULL pointer dereference, address: 0000000000000000 Workqueue: kacpi_hotplug acpi_hotplug_work_fn RIP: 0010:try_to_unmap_one+0x16a/0xfc0 rmap_walk_anon+0xda/0x1f0 try_to_unmap+0x78/0x80 ? __pfx_try_to_unmap_one+0x10/0x10 ? __pfx_folio_not_mapped+0x10/0x10 ? __pfx_folio_lock_anon_vma_read+0x10/0x10 unmap_poisoned_folio+0x60/0x140 do_migrate_range+0x4d1/0x600 ? slab_memory_callback+0x6a/0x190 ? notifier_call_chain+0x56/0xb0 offline_pages+0x3e6/0x460 memory_subsys_offline+0x130/0x1f0 device_offline+0xba/0x110 acpi_bus_offline+0xb7/0x130 acpi_scan_hot_remove+0x77/0x290 acpi_device_hotplug+0x1e0/0x240 acpi_hotplug_work_fn+0x1a/0x30 process_one_work+0x186/0x340 Besides, do_migrate_range() may be called between memory_failure() setting the hwpoison flag and isolating the folio from the LRU, so remove the WARN_ON(). In other places, unmap_poisoned_folio() is called when the folio is isolated; obey that in do_migrate_range() too. [david@redhat.com: don't abort offlining, fixed typo, add comment] Link: https://lkml.kernel.org/r/3c214dff-9649-4015-840f-10de0e03ebe4@redhat.com Fixes: b15c87263a69 ("hwpoison, memory_hotplug: allow hwpoisoned pages to be offlined") Signed-off-by: Jinjiang Tu Signed-off-by: David Hildenbrand Acked-by: Zi Yan Reviewed-by: Miaohe Lin Cc: Kefeng Wang Cc: Luis Chamberalin Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f15af712bc346..74318c78771567 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1815,8 +1815,14 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; if (folio_contain_hwpoisoned_page(folio)) { - if (WARN_ON(folio_test_lru(folio))) - folio_isolate_lru(folio); + /* + * unmap_poisoned_folio() cannot handle large folios + * in all cases yet.
+ */ + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + goto put_folio; + if (folio_test_lru(folio) && !folio_isolate_lru(folio)) + goto put_folio; if (folio_mapped(folio)) { folio_lock(folio); unmap_poisoned_folio(folio, pfn, false); From 669602b5b7386e4fa00fc67b045ca3fd816e685d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 24 Aug 2025 16:07:59 +0300 Subject: [PATCH 0816/3206] init/main.c: fix boot time tracing crash Steven Rostedt reported a crash with "ftrace=function" kernel command line: [ 0.159269] BUG: kernel NULL pointer dereference, address: 000000000000001c [ 0.160254] #PF: supervisor read access in kernel mode [ 0.160975] #PF: error_code(0x0000) - not-present page [ 0.161697] PGD 0 P4D 0 [ 0.162055] Oops: Oops: 0000 [#1] SMP PTI [ 0.162619] CPU: 0 UID: 0 PID: 0 Comm: swapper Not tainted 6.17.0-rc2-test-00006-g48d06e78b7cb-dirty #9 PREEMPT(undef) [ 0.164141] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 0.165439] RIP: 0010:kmem_cache_alloc_noprof (mm/slub.c:4237) [ 0.166186] Code: 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 83 e4 f0 48 83 ec 20 8b 05 c9 b6 7e 01 <44> 8b 77 1c 65 4c 8b 2d b5 ea 20 02 4c 89 6c 24 18 41 89 f5 21 f0 [ 0.168811] RSP: 0000:ffffffffb2e03b30 EFLAGS: 00010086 [ 0.169545] RAX: 0000000001fff33f RBX: 0000000000000000 RCX: 0000000000000000 [ 0.170544] RDX: 0000000000002800 RSI: 0000000000002800 RDI: 0000000000000000 [ 0.171554] RBP: ffffffffb2e03b80 R08: 0000000000000004 R09: ffffffffb2e03c90 [ 0.172549] R10: ffffffffb2e03c90 R11: 0000000000000000 R12: 0000000000000000 [ 0.173544] R13: ffffffffb2e03c90 R14: ffffffffb2e03c90 R15: 0000000000000001 [ 0.174542] FS: 0000000000000000(0000) GS:ffff9d2808114000(0000) knlGS:0000000000000000 [ 0.175684] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.176486] CR2: 000000000000001c CR3: 000000007264c001 CR4: 00000000000200b0 [ 0.177483] Call Trace: [ 0.177828] [ 0.178123] mas_alloc_nodes (lib/maple_tree.c:176 (discriminator 2) lib/maple_tree.c:1255 (discriminator 2)) [ 0.178692] mas_store_gfp (lib/maple_tree.c:5468) [ 0.179223] execmem_cache_add_locked (mm/execmem.c:207) [ 0.179870] execmem_alloc (mm/execmem.c:213 mm/execmem.c:313 mm/execmem.c:335 mm/execmem.c:475) [ 0.180397] ? ftrace_caller (arch/x86/kernel/ftrace_64.S:169) [ 0.180922] ? __pfx_ftrace_caller (arch/x86/kernel/ftrace_64.S:158) [ 0.181517] execmem_alloc_rw (mm/execmem.c:487) [ 0.182052] arch_ftrace_update_trampoline (arch/x86/kernel/ftrace.c:266 arch/x86/kernel/ftrace.c:344 arch/x86/kernel/ftrace.c:474) [ 0.182778] ? ftrace_caller_op_ptr (arch/x86/kernel/ftrace_64.S:182) [ 0.183388] ftrace_update_trampoline (kernel/trace/ftrace.c:7947) [ 0.184024] __register_ftrace_function (kernel/trace/ftrace.c:368) [ 0.184682] ftrace_startup (kernel/trace/ftrace.c:3048) [ 0.185205] ? __pfx_function_trace_call (kernel/trace/trace_functions.c:210) [ 0.185877] register_ftrace_function_nolock (kernel/trace/ftrace.c:8717) [ 0.186595] register_ftrace_function (kernel/trace/ftrace.c:8745) [ 0.187254] ? 
__pfx_function_trace_call (kernel/trace/trace_functions.c:210) [ 0.187924] function_trace_init (kernel/trace/trace_functions.c:170) [ 0.188499] tracing_set_tracer (kernel/trace/trace.c:5916 kernel/trace/trace.c:6349) [ 0.189088] register_tracer (kernel/trace/trace.c:2391) [ 0.189642] early_trace_init (kernel/trace/trace.c:11075 kernel/trace/trace.c:11149) [ 0.190204] start_kernel (init/main.c:970) [ 0.190732] x86_64_start_reservations (arch/x86/kernel/head64.c:307) [ 0.191381] x86_64_start_kernel (??:?) [ 0.191955] common_startup_64 (arch/x86/kernel/head_64.S:419) [ 0.192534] [ 0.192839] Modules linked in: [ 0.193267] CR2: 000000000000001c [ 0.193730] ---[ end trace 0000000000000000 ]--- The crash happens because on x86 ftrace allocations from execmem require maple tree to be initialized. Move the maple tree initialization, which depends only on slab availability, earlier in boot so that it will happen right after mm_core_init(). Link: https://lkml.kernel.org/r/20250824130759.1732736-1-rppt@kernel.org Fixes: 5d79c2be5081 ("x86/ftrace: enable EXECMEM_ROX_CACHE for ftrace allocations") Signed-off-by: Mike Rapoport (Microsoft) Reported-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Closes: https://lore.kernel.org/all/20250820184743.0302a8b5@gandalf.local.home/ Reviewed-by: Masami Hiramatsu (Google) Reviewed-by: Liam R. Howlett Cc: Borislav Betkov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index 0ee0ee7b7c2c0a..5753e9539ae6fb 100644 --- a/init/main.c +++ b/init/main.c @@ -956,6 +956,7 @@ void start_kernel(void) sort_main_extable(); trap_init(); mm_core_init(); + maple_tree_init(); poking_init(); ftrace_init(); @@ -973,7 +974,6 @@ void start_kernel(void) "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); radix_tree_init(); - maple_tree_init(); /* * Set up housekeeping before setting up workqueues to allow the unbound From 21cc2b5c5062a256ae9064442d37ebbc23f5aef7 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Sun, 24 Aug 2025 03:21:15 +0900 Subject: [PATCH 0817/3206] mm/hugetlb: add missing hugetlb_lock in __unmap_hugepage_range() When restoring a reservation for an anonymous page, we need to check whether we are freeing a surplus. However, __unmap_hugepage_range() causes a data race because it reads h->surplus_huge_pages without the protection of hugetlb_lock. And adjust_reservation is a boolean variable that indicates whether reservations for anonymous pages in each folio should be restored. Therefore, it should be initialized to false for each round of the loop. However, this variable is only initialized to false once, at its definition. This means that if adjust_reservation is set to true even once within the loop, reservations for anonymous pages will be restored unconditionally in all subsequent rounds, regardless of the folio's state. To fix this, we need to add the missing hugetlb_lock, unlock the page_table_lock earlier so that we don't take the hugetlb_lock inside the page_table_lock, and initialize adjust_reservation to false on each round within the loop.
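The resulting per-iteration ordering, condensed from the hunks below:

    spin_unlock(ptl);                       /* drop page table lock first */

    adjust_reservation = false;             /* re-initialized every round */

    spin_lock_irq(&hugetlb_lock);           /* protects h->surplus_huge_pages */
    if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
        folio_test_anon(folio)) {
            folio_set_hugetlb_restore_reserve(folio);
            adjust_reservation = true;
    }
    spin_unlock_irq(&hugetlb_lock);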
Link: https://lkml.kernel.org/r/20250823182115.1193563-1-aha310510@gmail.com Fixes: df7a6d1f6405 ("mm/hugetlb: restore the reservation if needed") Signed-off-by: Jeongjun Park Reported-by: syzbot+417aeb05fd190f3a6da9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=417aeb05fd190f3a6da9 Reviewed-by: Sidhartha Kumar Cc: Breno Leitao Cc: David Hildenbrand Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 753f99b4c71866..eed59cfb5d218a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5851,7 +5851,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, spinlock_t *ptl; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - bool adjust_reservation = false; + bool adjust_reservation; unsigned long last_addr_mask; bool force_flush = false; @@ -5944,6 +5944,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(folio); + spin_unlock(ptl); /* * Restore the reservation for anonymous page, otherwise the * If there we are freeing a surplus, do not set the restore * reservation bit. */ + adjust_reservation = false; + + spin_lock_irq(&hugetlb_lock); if (!h->surplus_huge_pages && __vma_private_lock(vma) && folio_test_anon(folio)) { folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } - - spin_unlock(ptl); + spin_unlock_irq(&hugetlb_lock); /* * Adjust the reservation for the region that will have the From ce652aac9c90a96c6536681d17518efb1f660fb8 Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Fri, 22 Aug 2025 11:50:57 +0900 Subject: [PATCH 0818/3206] mm/damon/core: set quota->charged_from to jiffies at first charge window Kernel initializes the "jiffies" timer as 5 minutes below zero, as shown in include/linux/jiffies.h /* * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) And the jiffies comparison helper functions cast the unsigned values to signed to cover wraparound #define time_after_eq(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((a) - (b)) >= 0)) When quota->charged_from is initialized to 0, time_after_eq() can incorrectly return FALSE even after reset_interval has elapsed. This occurs when (jiffies - reset_interval) produces a value with MSB=1, which is interpreted as negative in signed arithmetic. This issue primarily affects 32-bit systems because: On 64-bit systems: MSB=1 values occur after ~292 million years from boot (assuming HZ=1000), almost impossible. On 32-bit systems: MSB=1 values occur during the first 5 minutes after boot, and the second half of every jiffies wraparound cycle, starting from day 25 (assuming HZ=1000) When the above unexpected FALSE return from time_after_eq() occurs, the charging window will not reset. The user impact depends on the esz value at that time. If esz is 0, the scheme ignores the configured quotas and runs without any limits. If esz is not 0, the scheme stops working once the quota is exhausted, and remains stopped until the charging window finally resets.
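A minimal userspace reproduction of the arithmetic (the kernel macros are simplified here, with typecheck() dropped, assuming HZ=1000; build with a 32-bit unsigned long, e.g. gcc -m32, to see the failure):

    #include <stdio.h>

    #define HZ 1000
    #define INITIAL_JIFFIES ((unsigned long)(unsigned int)(-300 * HZ))
    #define time_after_eq(a, b) ((long)((a) - (b)) >= 0)

    int main(void)
    {
            unsigned long jiffies = INITIAL_JIFFIES; /* just after boot */
            unsigned long charged_from = 0;          /* the buggy init value */
            unsigned long reset_interval = HZ;       /* 1s charge window */

            /* prints 0 on 32-bit: the window never resets */
            printf("%d\n", time_after_eq(jiffies,
                                         charged_from + reset_interval));
            return 0;
    }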
So, change quota->charged_from to jiffies at damos_adjust_quota() when it is considered the first charge window. By this change, we can avoid the unexpected FALSE return from time_after_eq(). Link: https://lkml.kernel.org/r/20250822025057.1740854-1-ekffu200098@gmail.com Fixes: 2b8a248d5873 ("mm/damon/schemes: implement size quota for schemes application speed control") # 5.16 Signed-off-by: Sang-Heon Jeon Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 106ee8b0f2d5f1..c2e0b469fd4327 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2111,6 +2111,10 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) if (!quota->ms && !quota->sz && list_empty(&quota->goals)) return; + /* First charge window */ + if (!quota->total_charged_sz && !quota->charged_from) + quota->charged_from = jiffies; + /* New charge window starts */ if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { From 711f19dfd783ffb37ca4324388b9c4cb87e71363 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Wed, 27 Aug 2025 19:58:57 +0800 Subject: [PATCH 0819/3206] mm/damon/lru_sort: avoid divide-by-zero in damon_lru_sort_apply_parameters() Patch series "mm/damon: avoid divide-by-zero in DAMON module's parameters application". DAMON's RECLAIM and LRU_SORT modules perform no validation on user-configured parameters during application, which may lead to division-by-zero errors. Avoid the divide-by-zero by adding validation checks when DAMON modules attempt to apply the parameters. This patch (of 2): During the calculation of 'hot_thres' and 'cold_thres', either 'sample_interval' or 'aggr_interval' is used as the divisor, which may lead to division-by-zero errors. Fix it by directly returning -EINVAL when such a case occurs. Additionally, since 'aggr_interval' is already required to be set no smaller than 'sample_interval' in damon_set_attrs(), only the case where 'sample_interval' is zero needs to be checked. Link: https://lkml.kernel.org/r/20250827115858.1186261-2-yanquanmin1@huawei.com Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Cc: [6.0+] Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 151a9de5ad8b81..b5a5ed16a7a5db 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -198,6 +198,11 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; + if (!damon_lru_sort_mon_attrs.sample_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); if (err) goto out; From e6b543ca9806d7bced863f43020e016ee996c057 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Wed, 27 Aug 2025 19:58:58 +0800 Subject: [PATCH 0820/3206] mm/damon/reclaim: avoid divide-by-zero in damon_reclaim_apply_parameters() When creating a new scheme of DAMON_RECLAIM, the calculation of 'min_age_region' uses 'aggr_interval' as the divisor, which may lead to division-by-zero errors. Fix it by directly returning -EINVAL when such a case occurs.
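Roughly, the scheme's access pattern is built as (paraphrased; the exact expression lives in damon_reclaim_new_scheme()):

    .min_age_region = damon_reclaim_min_age /
                      damon_reclaim_mon_attrs.aggr_interval,

so a user-supplied aggr_interval of zero divides by zero before any guard runs; with this change the parameters are rejected with -EINVAL first.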
Link: https://lkml.kernel.org/r/20250827115858.1186261-3-yanquanmin1@huawei.com Fixes: f5a79d7c0c87 ("mm/damon: introduce struct damos_access_pattern") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Cc: [6.1+] Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3c71b459667675..fb7c982a0018c2 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -194,6 +194,11 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; + if (!damon_reclaim_mon_attrs.aggr_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); if (err) goto out; From 04d3cd43700a2d0fe4bfb1012a8ec7f2e34a3507 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 27 Aug 2025 03:42:21 -0700 Subject: [PATCH 0821/3206] arm64: kexec: initialize kexec_buf struct in load_other_segments() Patch series "kexec: Fix invalid field access". The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. An initial fix was already landed for arm64[0], and this patchset fixes the problem on the remaining arm64 code and on riscv, as raised by Mark. Discussions about this problem could be found at[1][2]. This patch (of 3): The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. 
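A minimal userspace illustration of why the empty initializer matters; the struct is a hypothetical stand-in for kexec_buf, not its real layout:

    #include <stdio.h>
    #include <stdbool.h>

    struct buf {
            void *image;
            bool random;    /* field added later; older callers never set it */
    };

    int main(void)
    {
            struct buf a;           /* a.random is indeterminate garbage */
            struct buf b = {};      /* every member is zero-initialized */

            a.image = NULL;         /* setting one field fixes nothing else */
            printf("%d\n", b.random);       /* always prints 0 */
            return 0;
    }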
Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-0-1df9882bb01a@debian.org Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-1-1df9882bb01a@debian.org Link: https://lore.kernel.org/all/20250826180742.f2471131255ec1c43683ea07@linux-foundation.org/ [0] Link: https://lore.kernel.org/all/oninomspajhxp4omtdapxnckxydbk2nzmrix7rggmpukpnzadw@c67o7njgdgm3/ [1] Link: https://lore.kernel.org/all/20250826-akpm-v1-1-3c831f0e3799@debian.org/ [2] Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly") Signed-off-by: Breno Leitao Acked-by: Baoquan He Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Coiby Xu Cc: Heiko Carstens Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/arm64/kernel/machine_kexec_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index af1ca875c52ce2..410060ebd86dfd 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -94,7 +94,7 @@ int load_other_segments(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline) { - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; void *dtb = NULL; unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; From 8afbd0045922b8146acf1a78ae818693e0468dbd Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 27 Aug 2025 03:42:22 -0700 Subject: [PATCH 0822/3206] riscv: kexec: initialize kexec_buf struct The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. 
Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-2-1df9882bb01a@debian.org Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly") Signed-off-by: Breno Leitao Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Baoquan He Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Coiby Xu Cc: Heiko Carstens Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/riscv/kernel/kexec_elf.c | 4 ++-- arch/riscv/kernel/kexec_image.c | 2 +- arch/riscv/kernel/machine_kexec_file.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c index 56444c7bd34eb1..531d348db84d73 100644 --- a/arch/riscv/kernel/kexec_elf.c +++ b/arch/riscv/kernel/kexec_elf.c @@ -28,7 +28,7 @@ static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, int i; int ret = 0; size_t size; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; const struct elf_phdr *phdr; kbuf.image = image; @@ -66,7 +66,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, { int i; int ret; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; const struct elf_phdr *phdr; unsigned long lowest_paddr = ULONG_MAX; unsigned long lowest_vaddr = ULONG_MAX; diff --git a/arch/riscv/kernel/kexec_image.c b/arch/riscv/kernel/kexec_image.c index 26a81774a78a36..8f2eb900910b14 100644 --- a/arch/riscv/kernel/kexec_image.c +++ b/arch/riscv/kernel/kexec_image.c @@ -41,7 +41,7 @@ static void *image_load(struct kimage *image, struct riscv_image_header *h; u64 flags; bool be_image, be_kernel; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; int ret; /* Check Image header */ diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c index e36104af2e247f..b9eb41b0a97519 100644 --- a/arch/riscv/kernel/machine_kexec_file.c +++ b/arch/riscv/kernel/machine_kexec_file.c @@ -261,7 +261,7 @@ int load_extra_segments(struct kimage *image, unsigned long kernel_start, int ret; void *fdt; unsigned long initrd_pbase = 0UL; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; char *modified_cmdline = NULL; kbuf.image = image; From e67f0bd05519012eaabaae68618ffc4ed30ab680 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 27 Aug 2025 03:42:23 -0700 Subject: [PATCH 0823/3206] s390: kexec: initialize kexec_buf struct The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. 
Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-3-1df9882bb01a@debian.org Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly") Signed-off-by: Breno Leitao Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Baoquan He Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Coiby Xu Cc: Heiko Carstens Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/s390/kernel/kexec_elf.c | 2 +- arch/s390/kernel/kexec_image.c | 2 +- arch/s390/kernel/machine_kexec_file.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 4d364de4379921..143e34a4eca57c 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_elf(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; Elf_Addr entry; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index a32ce8bea745cf..9a439175723cad 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_image(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; buf.image = image; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index c2bac14dd668ae..a36d7311c6683b 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -129,7 +129,7 @@ static int kexec_file_update_purgatory(struct kimage *image, static int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -152,7 +152,7 @@ static int kexec_file_add_purgatory(struct kimage *image, static int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -184,7 +184,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, { __u32 *lc_ipl_parmblock_ptr; unsigned int len, ncerts; - struct kexec_buf buf; + struct kexec_buf buf = {}; unsigned long addr; void *ptr, *end; int ret; From 3be306cccdccede13e3cefd0c14e430cc2b7c9c7 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Thu, 28 Aug 2025 13:38:20 -0500 Subject: [PATCH 0824/3206] mm/memory-failure: fix redundant updates for already poisoned pages Duplicate memory errors can be reported by multiple sources. Passing an already poisoned page to action_result() causes issues: * The amount of hardware corrupted memory is incorrectly updated. * Per NUMA node MF stats are incorrectly updated. * Redundant "already poisoned" messages are printed. Avoid those issues by: * Skipping hardware corrupted memory updates for already poisoned pages. * Skipping per NUMA node MF stats updates for already poisoned pages. * Dropping redundant "already poisoned" messages. Make MF_MSG_ALREADY_POISONED consistent with other action_page_types and make calls to action_result() consistent for already poisoned normal pages and huge pages. 
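Editor's note: a standalone sketch of the double-counting this patch stops; the function and counter names are illustrative, not the kernel's.

    #include <stdbool.h>
    #include <stdio.h>

    static unsigned long hw_corrupted;    /* like the poisoned-pages count */

    static void report_failure(unsigned long pfn, bool already_poisoned)
    {
        /* the fix: a duplicate report of an already poisoned page must not
         * bump the corruption counters or the per-node MF stats again */
        if (!already_poisoned)
            hw_corrupted++;
    }

    int main(void)
    {
        report_failure(0x1234, false);   /* first report: page went bad   */
        report_failure(0x1234, true);    /* duplicate from another source */
        printf("corrupted pages: %lu\n", hw_corrupted);   /* 1, not 2 */
        return 0;
    }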
Link: https://lkml.kernel.org/r/aLCiHMy12Ck3ouwC@hpe.com Fixes: b8b9488d50b7 ("mm/memory-failure: improve memory failure action_result messages") Signed-off-by: Kyle Meyer Reviewed-by: Jiaqi Yan Acked-by: David Hildenbrand Reviewed-by: Jane Chu Acked-by: Miaohe Lin Cc: Borislav Petkov Cc: Kyle Meyer Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Luck, Tony" Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Russ Anderson Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fc30ca4804bf47..10b3c281c2aee0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -956,7 +956,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", - [MF_MSG_ALREADY_POISONED] = "already poisoned", + [MF_MSG_ALREADY_POISONED] = "already poisoned page", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1349,9 +1349,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - num_poisoned_pages_inc(pfn); - - update_per_node_mf_stats(pfn, result); + if (type != MF_MSG_ALREADY_POISONED) { + num_poisoned_pages_inc(pfn); + update_per_node_mf_stats(pfn, result); + } pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -2094,12 +2095,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb *hugetlb = 0; return 0; } else if (res == -EHWPOISON) { - pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { folio = page_folio(p); res = kill_accessing_process(current, folio_pfn(folio), flags); - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); } + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); return res; } else if (res == -EBUSY) { if (!(flags & MF_NO_RETRY)) { @@ -2285,7 +2285,6 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; if (TestSetPageHWPoison(p)) { - pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) res = kill_accessing_process(current, pfn, flags); From 8c94db0ae97c72c253a615f990bd466b456e94f6 Mon Sep 17 00:00:00 2001 From: Svetlana Parfenova Date: Mon, 1 Sep 2025 20:53:50 +0700 Subject: [PATCH 0825/3206] binfmt_elf: preserve original ELF e_flags for core dumps Some architectures, such as RISC-V, use the ELF e_flags field to encode ABI-specific information (e.g., ISA extensions, fpu support). Debuggers like GDB rely on these flags in core dumps to correctly interpret optional register sets. If the flags are missing or incorrect, GDB may warn and ignore valid data, for example: warning: Unexpected size of section '.reg2/213' in core file. This can prevent access to fpu or other architecture-specific registers even when they were dumped. Save the e_flags field during ELF binary loading (in load_elf_binary()) into the mm_struct, and later retrieve it during core dump generation (in fill_note_info()). Kconfig option CONFIG_ARCH_HAS_ELF_CORE_EFLAGS is introduced for architectures that require this behaviour.
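Editor's note: one way to observe the effect is to read e_flags straight out of a core file; a minimal sketch, assuming a 64-bit little-endian core and omitting the ELF magic check for brevity.

    #include <elf.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        Elf64_Ehdr eh;
        FILE *f;

        if (argc < 2 || !(f = fopen(argv[1], "rb")))
            return 1;
        if (fread(&eh, sizeof(eh), 1, f) != 1)
            return 1;
        /* without this patch a core dump can report 0 here instead of the
         * ABI flags of the dumped binary (e.g. the RISC-V float ABI bits) */
        printf("e_flags: 0x%x\n", eh.e_flags);
        fclose(f);
        return 0;
    }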
Signed-off-by: Svetlana Parfenova Link: https://lore.kernel.org/r/20250901135350.619485-1-svetlana.parfenova@syntacore.com Signed-off-by: Kees Cook --- arch/riscv/Kconfig | 1 + fs/Kconfig.binfmt | 9 +++++++++ fs/binfmt_elf.c | 40 ++++++++++++++++++++++++++++++++++------ include/linux/mm_types.h | 5 +++++ 4 files changed, 49 insertions(+), 6 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659ed8..b06a758abb63b0 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -28,6 +28,7 @@ config RISCV select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX + select ARCH_HAS_ELF_CORE_EFLAGS select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index bd2f530e574086..1949e25c7741b1 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -184,4 +184,13 @@ config EXEC_KUNIT_TEST This builds the exec KUnit tests, which tests boundary conditions of various aspects of the exec internals. +config ARCH_HAS_ELF_CORE_EFLAGS + bool + depends on BINFMT_ELF && ELF_CORE + default n + help + Select this option if the architecture makes use of the e_flags + field in the ELF header to store ABI or other architecture-specific + information that should be preserved in core dumps. + endmenu diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4aacf9c9cc2dfe..e4653bb99946b1 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -103,6 +103,21 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) +static inline void elf_coredump_set_mm_eflags(struct mm_struct *mm, u32 flags) +{ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + mm->saved_e_flags = flags; +#endif +} + +static inline u32 elf_coredump_get_mm_eflags(struct mm_struct *mm, u32 flags) +{ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + flags = mm->saved_e_flags; +#endif + return flags; +} + /* * We need to explicitly zero any trailing portion of the page that follows * p_filesz when it ends before the page ends (e.g. bss), otherwise this @@ -1290,6 +1305,8 @@ static int load_elf_binary(struct linux_binprm *bprm) mm->end_data = end_data; mm->start_stack = bprm->p; + elf_coredump_set_mm_eflags(mm, elf_ex->e_flags); + /** * DOC: "brk" handling * @@ -1804,6 +1821,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; + u16 machine; + u32 flags; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) @@ -1831,17 +1850,26 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; } - /* - * Initialize the ELF file header. - */ - fill_elf_header(elf, phdrs, - view->e_machine, view->e_flags); + machine = view->e_machine; + flags = view->e_flags; #else view = NULL; info->thread_notes = 2; - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); + machine = ELF_ARCH; + flags = ELF_CORE_EFLAGS; #endif + /* + * Override ELF e_flags with value taken from process, + * if arch needs that. + */ + flags = elf_coredump_get_mm_eflags(dump_task->mm, flags); + + /* + * Initialize the ELF file header. + */ + fill_elf_header(elf, phdrs, machine, flags); + /* * Allocate a structure for each thread. 
*/ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db9348..04a2857f12f260 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1102,6 +1102,11 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + /* the ABI-related flags from the ELF header. Used for core dump */ + unsigned long saved_e_flags; +#endif + struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; From a08b4dcad9fae495bcd88b91fb3410abf77d268e Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 4 Sep 2025 07:31:00 +0200 Subject: [PATCH 0826/3206] tools/sched_ext: Add compat helper for scx_bpf_cpu_curr() Introduce a compatibility helper that allows BPF schedulers to use scx_bpf_cpu_curr() on older kernels. Fixes: 20b158094a1ad ("sched_ext: Introduce scx_bpf_cpu_curr()") Cc: Christian Loehle Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 2 +- tools/sched_ext/include/scx/compat.bpf.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 342c7c48df5ab6..06e2551033cb19 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -104,7 +104,7 @@ bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct rq *scx_bpf_locked_rq(void) __ksym; -struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym; +struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 36e0cd2fd4edaf..dd9144624dc99e 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -230,6 +230,23 @@ static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags) scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) : \ scx_bpf_pick_any_cpu(cpus_allowed, flags)) +/* + * v6.18: Add a helper to retrieve the current task running on a CPU. + * + * Keep this helper available until v6.20 for compatibility. + */ +static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu) +{ + struct rq *rq; + + if (bpf_ksym_exists(scx_bpf_cpu_curr)) + return scx_bpf_cpu_curr(cpu); + + rq = scx_bpf_cpu_rq(cpu); + + return rq ? rq->curr : NULL; +} + /* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). From 47ddf62b43eced739b56422cd55e86b9b4bb7dac Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Wed, 3 Sep 2025 18:51:14 +0200 Subject: [PATCH 0827/3206] Input: xpad - add support for Flydigi Apex 5 Add Flydigi Apex 5 to the list of recognized controllers. 
Reported-by: Brandon Lin Reported-by: Sergey Belozyorcev Signed-off-by: Antheas Kapenekakis Link: https://lore.kernel.org/r/20250903165114.2987905-1-lkml@antheas.dev Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 4c94297e17e66c..d72e89c25e5031 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -422,6 +422,7 @@ static const struct xpad_device { { 0x3537, 0x1010, "GameSir G7 SE", 0, XTYPE_XBOXONE }, { 0x366c, 0x0005, "ByoWave Proteus Controller", MAP_SHARE_BUTTON, XTYPE_XBOXONE, FLAG_DELAY_INIT }, { 0x3767, 0x0101, "Fanatec Speedster 3 Forceshock Wheel", 0, XTYPE_XBOX }, + { 0x37d7, 0x2501, "Flydigi Apex 5", 0, XTYPE_XBOX360 }, { 0x413d, 0x2104, "Black Shark Green Ghost Gamepad", 0, XTYPE_XBOX360 }, { 0xffff, 0xffff, "Chinese-made Xbox Controller", 0, XTYPE_XBOX }, { 0x0000, 0x0000, "Generic X-Box pad", 0, XTYPE_UNKNOWN } @@ -578,6 +579,7 @@ static const struct usb_device_id xpad_table[] = { XPAD_XBOX360_VENDOR(0x3537), /* GameSir Controllers */ XPAD_XBOXONE_VENDOR(0x3537), /* GameSir Controllers */ XPAD_XBOXONE_VENDOR(0x366c), /* ByoWave controllers */ + XPAD_XBOX360_VENDOR(0x37d7), /* Flydigi Controllers */ XPAD_XBOX360_VENDOR(0x413d), /* Black Shark Green Ghost Controller */ { } }; From 7c0c01a216e6d9e1d169c0f6f3b5522e6da708ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 26 Aug 2025 08:17:04 +0200 Subject: [PATCH 0828/3206] vdso/datastore: Gate time data behind CONFIG_GENERIC_GETTIMEOFDAY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the generic vDSO does not provide time functions, as for example on riscv32, then the time data store is not necessary. Avoid allocating these time data pages when not used. Fixes: df7fcbefa710 ("vdso: Add generic time data storage") Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-1-d9b65750e49f@linutronix.de --- lib/vdso/datastore.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c index 3693c6caf2c4d4..a565c30c71a04f 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -11,14 +11,14 @@ /* * The vDSO data page. */ -#ifdef CONFIG_HAVE_GENERIC_VDSO +#ifdef CONFIG_GENERIC_GETTIMEOFDAY static union { struct vdso_time_data data; u8 page[PAGE_SIZE]; } vdso_time_data_store __page_aligned_data; struct vdso_time_data *vdso_k_time_data = &vdso_time_data_store.data; static_assert(sizeof(vdso_time_data_store) == PAGE_SIZE); -#endif /* CONFIG_HAVE_GENERIC_VDSO */ +#endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #ifdef CONFIG_VDSO_GETRANDOM static union { @@ -46,7 +46,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, switch (vmf->pgoff) { case VDSO_TIME_PAGE_OFFSET: - if (!IS_ENABLED(CONFIG_HAVE_GENERIC_VDSO)) + if (!IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)) return VM_FAULT_SIGBUS; pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); if (timens_page) { From 39f1ee1299c9bab9c87dc3087b9f82f346b8190b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 26 Aug 2025 08:17:05 +0200 Subject: [PATCH 0829/3206] ARM: VDSO: Remove cntvct_ok global variable The cntvct_ok variable has not had any external user since commit c7a18100bdff ("lib/vdso: Avoid highres update if clocksource is not VDSO capable"). 
It also only has one user in vdso.c, once during init, so rather than having the caller of patch_vdso() initialize cntvct_ok, just call cntvct_functional() directly and avoid the global variable entirely. Signed-off-by: Rasmus Villemoes Signed-off-by: Thomas Gleixner Reviewed-by: Vincenzo Frascino Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-2-d9b65750e49f@linutronix.de --- arch/arm/include/asm/vdso/vsyscall.h | 2 -- arch/arm/kernel/vdso.c | 10 +++------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/arm/include/asm/vdso/vsyscall.h b/arch/arm/include/asm/vdso/vsyscall.h index 4e7226ad02ec4d..ff1c729af05f03 100644 --- a/arch/arm/include/asm/vdso/vsyscall.h +++ b/arch/arm/include/asm/vdso/vsyscall.h @@ -7,8 +7,6 @@ #include #include -extern bool cntvct_ok; - static __always_inline void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) { diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 325448ffbba0c2..e38a30477f3d70 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -54,11 +54,9 @@ struct elfinfo { char *dynstr; /* ptr to .dynstr section */ }; -/* Cached result of boot-time check for whether the arch timer exists, * and if so, whether the virtual counter is useable. +/* Boot-time check for whether the arch timer exists, and if so, * whether the virtual counter is usable. */ -bool cntvct_ok __ro_after_init; - static bool __init cntvct_functional(void) { struct device_node *np; @@ -159,7 +157,7 @@ static void __init patch_vdso(void *ehdr) * want programs to incur the slight additional overhead of * dispatching through the VDSO only to fall back to syscalls. */ - if (!cntvct_ok) { + if (!cntvct_functional()) { vdso_nullpatch_one(&einfo, "__vdso_gettimeofday"); vdso_nullpatch_one(&einfo, "__vdso_clock_gettime"); vdso_nullpatch_one(&einfo, "__vdso_clock_gettime64"); @@ -197,8 +195,6 @@ static int __init vdso_init(void) vdso_total_pages = VDSO_NR_PAGES; /* for the data/vvar pages */ vdso_total_pages += text_pages; - cntvct_ok = cntvct_functional(); - patch_vdso(vdso_start); return 0; From 7d298d25ce81251068bb4ea1d92813ec764a9fec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 26 Aug 2025 08:17:06 +0200 Subject: [PATCH 0830/3206] vdso: Move ENABLE_COMPAT_VDSO from core to arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ENABLE_COMPAT_VDSO symbol is only used by arm64 and only for the time-related functionality. There should be no new users, so it doesn't need to be in the generic vDSO code. Move the logic into arm64 architecture-specific code and replace the explicit define with the standard '#ifdef __aarch64__'.
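Editor's note: a runnable sketch of the dispatch this patch switches to. __aarch64__ is predefined by the compiler when targeting arm64, so the 32-bit compat vDSO build picks the compat path without any -D flag from the Makefile.

    #include <stdio.h>

    int main(void)
    {
    #ifdef __aarch64__
        puts("native arm64 vDSO definitions");     /* 64-bit target */
    #else
        puts("compat (32-bit) vDSO definitions");  /* compat_gettimeofday.h */
    #endif
        return 0;
    }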
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Catalin Marinas Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-3-d9b65750e49f@linutronix.de --- arch/arm64/include/asm/vdso/compat_barrier.h | 7 +++---- arch/arm64/include/asm/vdso/compat_gettimeofday.h | 6 +++--- arch/arm64/include/asm/vdso/gettimeofday.h | 8 ++++++++ arch/arm64/kernel/vdso32/Makefile | 1 - include/vdso/datapage.h | 4 ---- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/vdso/compat_barrier.h b/arch/arm64/include/asm/vdso/compat_barrier.h index 3ac35f4a667cfc..6d75e03d38274a 100644 --- a/arch/arm64/include/asm/vdso/compat_barrier.h +++ b/arch/arm64/include/asm/vdso/compat_barrier.h @@ -7,11 +7,10 @@ #ifndef __ASSEMBLY__ /* - * Warning: This code is meant to be used with - * ENABLE_COMPAT_VDSO only. + * Warning: This code is meant to be used from the compat vDSO only. */ -#ifndef ENABLE_COMPAT_VDSO -#error This header is meant to be used with ENABLE_COMPAT_VDSO only +#ifdef __aarch64__ +#error This header is meant to be used from the compat vDSO only #endif #ifdef dmb diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index d60ea7a72a9cb3..7d1a116549b1b9 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -2,8 +2,8 @@ /* * Copyright (C) 2018 ARM Limited */ -#ifndef __ASM_VDSO_GETTIMEOFDAY_H -#define __ASM_VDSO_GETTIMEOFDAY_H +#ifndef __ASM_VDSO_COMPAT_GETTIMEOFDAY_H +#define __ASM_VDSO_COMPAT_GETTIMEOFDAY_H #ifndef __ASSEMBLY__ @@ -163,4 +163,4 @@ static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) #endif /* !__ASSEMBLY__ */ -#endif /* __ASM_VDSO_GETTIMEOFDAY_H */ +#endif /* __ASM_VDSO_COMPAT_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index da1ab87595925f..c59e84105b43cd 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -5,6 +5,8 @@ #ifndef __ASM_VDSO_GETTIMEOFDAY_H #define __ASM_VDSO_GETTIMEOFDAY_H +#ifdef __aarch64__ + #ifndef __ASSEMBLY__ #include @@ -96,4 +98,10 @@ static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data( #endif /* !__ASSEMBLY__ */ +#else /* !__aarch64__ */ + +#include "compat_gettimeofday.h" + +#endif /* __aarch64__ */ + #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index f2dfdc7dc8185b..230fdc26796aa5 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -59,7 +59,6 @@ VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING VDSO_CAFLAGS += -march=armv8-a VDSO_CFLAGS := $(VDSO_CAFLAGS) -VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 # KBUILD_CFLAGS from top-level Makefile VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 02533038640e53..0b1982f15de427 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -196,11 +196,7 @@ enum vdso_pages { * - clock_gettime_fallback(): fallback for clock_gettime. * - clock_getres_fallback(): fallback for clock_getres.
*/ -#ifdef ENABLE_COMPAT_VDSO -#include -#else #include -#endif /* ENABLE_COMPAT_VDSO */ #else /* !__ASSEMBLY__ */ From f145d6bf8d590954672a4d0581c92e1799e9c8da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 26 Aug 2025 08:17:07 +0200 Subject: [PATCH 0831/3206] vdso/gettimeofday: Remove !CONFIG_TIME_NS stubs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All calls of these functions are already gated behind CONFIG_TIME_NS. The compiler will already optimize them away if time namespaces are disabled. Drop the unnecessary stubs. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-4-d9b65750e49f@linutronix.de --- lib/vdso/gettimeofday.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 02ea19f671647e..1e2a40b8d2c6c2 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -108,8 +108,6 @@ bool vdso_get_timestamp(const struct vdso_time_data *vd, const struct vdso_clock return true; } -#ifdef CONFIG_TIME_NS - #ifdef CONFIG_GENERIC_VDSO_DATA_STORE static __always_inline const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) @@ -149,20 +147,6 @@ bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock * return true; } -#else -static __always_inline -const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) -{ - return NULL; -} - -static __always_inline -bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) -{ - return false; -} -#endif static __always_inline bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, @@ -204,7 +188,6 @@ bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, return true; } -#ifdef CONFIG_TIME_NS static __always_inline bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, clockid_t clk, struct __kernel_timespec *ts) @@ -233,14 +216,6 @@ bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock return true; } -#else -static __always_inline -bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) -{ - return false; -} -#endif static __always_inline bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, From ea1a1fa919a5b4f39fa46073e7b3a19b12521f05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 26 Aug 2025 08:17:08 +0200 Subject: [PATCH 0832/3206] time: Build generic update_vsyscall() only with generic time vDSO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The generic vDSO can be used without the time-related functionality. In that case the generic update_vsyscall() from kernel/time/vsyscall.c should not be built. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-5-d9b65750e49f@linutronix.de --- kernel/time/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e6e9b85d4db5f8..f7d52d9543cc7a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -26,7 +26,7 @@ obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o ifeq ($(CONFIG_SMP),y) obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o endif -obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o +obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o From eb3b66aab72c10632865afaf8e46f4667c21ef7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 26 Aug 2025 08:17:09 +0200 Subject: [PATCH 0833/3206] riscv: vdso: Untangle Kconfig logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On riscv32 the generic vDSO infrastructure is used but without its time-related functionality. The Kconfig logic to implement this treats HAVE_GENERIC_VDSO as a synonym for GENERIC_GETTIMEOFDAY. This works today due to some underlying issues in how the generic vDSO library works. Some future cleanups will break this logic. Restructure the Kconfig logic, so HAVE_GENERIC_VDSO refers to the generic library in general and GENERIC_GETTIMEOFDAY refers to its time-related functionality. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-6-d9b65750e49f@linutronix.de --- arch/riscv/Kconfig | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659ed8..e4ac0e833ecfdb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -52,7 +52,7 @@ config RISCV select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN - select ARCH_HAS_VDSO_ARCH_DATA if GENERIC_VDSO_DATA_STORE + select ARCH_HAS_VDSO_ARCH_DATA if HAVE_GENERIC_VDSO select ARCH_KEEP_MEMBLOCK if ACPI select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX @@ -107,7 +107,7 @@ config RISCV select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_ENTRY - select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO + select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO && 64BIT select GENERIC_IDLE_POLL_SETUP select GENERIC_IOREMAP if MMU select GENERIC_IRQ_IPI if SMP @@ -120,9 +120,9 @@ config RISCV select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD - select GENERIC_TIME_VSYSCALL if MMU && 64BIT - select GENERIC_VDSO_DATA_STORE if MMU - select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO + select GENERIC_TIME_VSYSCALL if GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_DATA_STORE if HAVE_GENERIC_VDSO + select GENERIC_VDSO_TIME_NS if GENERIC_GETTIMEOFDAY select HARDIRQS_SW_RESEND select HAS_IOPORT if MMU select HAVE_ALIGNED_STRUCT_PAGE @@ -165,7 +165,7 @@ config RISCV select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_GCC_PLUGINS - select HAVE_GENERIC_VDSO if MMU && 64BIT + select HAVE_GENERIC_VDSO if MMU select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_BZIP2 if !XIP_KERNEL && !EFI_ZBOOT select HAVE_KERNEL_GZIP if !XIP_KERNEL && !EFI_ZBOOT @@ -221,7 +221,7 @@ config RISCV select 
THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU - select VDSO_GETRANDOM if HAVE_GENERIC_VDSO + select VDSO_GETRANDOM if HAVE_GENERIC_VDSO && 64BIT select USER_STACKTRACE_SUPPORT select ZONE_DMA32 if 64BIT From 278f1c933c3fab6f249b995a1a13608246b76181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 26 Aug 2025 08:17:10 +0200 Subject: [PATCH 0834/3206] vdso: Drop kconfig GENERIC_VDSO_32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This configuration is never used. Remove it. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-7-d9b65750e49f@linutronix.de --- arch/arm/mm/Kconfig | 1 - arch/x86/Kconfig | 1 - lib/vdso/Kconfig | 7 ------- 3 files changed, 9 deletions(-) diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 5c1023a6d78c1b..2347988cf6417b 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -926,7 +926,6 @@ config VDSO default y if ARM_ARCH_TIMER select HAVE_GENERIC_VDSO select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_32 select GENERIC_GETTIMEOFDAY select GENERIC_VDSO_DATA_STORE help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100eb..4f120070a51bd4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,7 +14,6 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS - select GENERIC_VDSO_32 select HAVE_DEBUG_STACKOVERFLOW select KMAP_LOCAL select MODULES_USE_ELF_REL diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 45df764b49ad62..76157c26931d28 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -12,13 +12,6 @@ config GENERIC_GETTIMEOFDAY Each architecture that enables this feature has to provide the fallback implementation. -config GENERIC_VDSO_32 - bool - depends on GENERIC_GETTIMEOFDAY && !64BIT - help - This config option helps to avoid possible performance issues - in 32 bit only architectures. - config GENERIC_COMPAT_VDSO bool help From bb5bc7bfab06b9a6a2ccecba6dc40783fe9b4231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 26 Aug 2025 08:17:11 +0200 Subject: [PATCH 0835/3206] vdso: Drop kconfig GENERIC_COMPAT_VDSO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This configuration is never used. Remove it. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Catalin Marinas Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-8-d9b65750e49f@linutronix.de --- arch/arm64/Kconfig | 1 - lib/vdso/Kconfig | 5 ----- 2 files changed, 6 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64d..5c61b19ea9c805 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1782,7 +1782,6 @@ config COMPAT_VDSO bool "Enable vDSO for 32-bit applications" depends on !CPU_BIG_ENDIAN depends on (CC_IS_CLANG && LD_IS_LLD) || "$(CROSS_COMPILE_COMPAT)" != "" - select GENERIC_COMPAT_VDSO default y help Place in the process address space of 32-bit applications an diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 76157c26931d28..2594dd7185be76 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -12,11 +12,6 @@ config GENERIC_GETTIMEOFDAY Each architecture that enables this feature has to provide the fallback implementation. -config GENERIC_COMPAT_VDSO - bool - help - This config option enables the compat VDSO layer. 
- config GENERIC_VDSO_TIME_NS bool help From 7b338f6d4e3d6baa057e3505592a86f6410d68ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 26 Aug 2025 08:17:12 +0200 Subject: [PATCH 0836/3206] vdso: Drop Kconfig GENERIC_VDSO_DATA_STORE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All users of the generic vDSO library also use the generic vDSO datastore. Remove the now unnecessary Kconfig symbol. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Catalin Marinas Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-9-d9b65750e49f@linutronix.de --- arch/Kconfig | 2 +- arch/arm/mm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/loongarch/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/x86/Kconfig | 1 - include/asm-generic/vdso/vsyscall.h | 4 ---- include/vdso/datapage.h | 5 +---- lib/vdso/Kconfig | 5 ----- lib/vdso/Makefile | 2 +- lib/vdso/gettimeofday.c | 2 -- 14 files changed, 3 insertions(+), 25 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index d1b4ffd6e08564..f6ca7f3031726e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1609,7 +1609,7 @@ config HAVE_SPARSE_SYSCALL_NR related optimizations for a given architecture. config ARCH_HAS_VDSO_ARCH_DATA - depends on GENERIC_VDSO_DATA_STORE + depends on HAVE_GENERIC_VDSO bool config ARCH_HAS_VDSO_TIME_DATA diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 2347988cf6417b..7b27ee9482b3eb 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -927,7 +927,6 @@ config VDSO select HAVE_GENERIC_VDSO select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE help Place in the process address space an ELF shared object providing fast implementations of gettimeofday and diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5c61b19ea9c805..b0f007b396c819 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -162,7 +162,6 @@ config ARM64 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select HARDIRQS_SW_RESEND select HAS_IOPORT diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f0abc38c40ac9e..d15b201d55f951 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -108,7 +108,6 @@ config LOONGARCH select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GPIOLIB select HAS_IOPORT diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index caf508f6e9ec8e..f7e6bbd755e0ed 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -51,7 +51,6 @@ config MIPS select GENERIC_SMP_IDLE_THREAD select GENERIC_IDLE_POLL_SETUP select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GUP_GET_PXX_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT select HAS_IOPORT if !NO_IOPORT_MAP || ISA select HAVE_ARCH_COMPILER_H diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 93402a1d9c9fc6..78c82af955616c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -207,7 +207,6 @@ config PPC select GENERIC_PCI_IOMAP if PCI select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select HAS_IOPORT if PCI select HAVE_ARCH_AUDITSYSCALL diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e4ac0e833ecfdb..f6cf9180ccf2c9 100644 --- 
a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -121,7 +121,6 @@ config RISCV select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL if GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE if HAVE_GENERIC_VDSO select GENERIC_VDSO_TIME_NS if GENERIC_GETTIMEOFDAY select HARDIRQS_SW_RESEND select HAS_IOPORT if MMU diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf680c26a33cf7..696d2243d64b04 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -167,7 +167,6 @@ config S390 select GENERIC_GETTIMEOFDAY select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4f120070a51bd4..1e74b2a356e42b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -181,7 +181,6 @@ config X86 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GENERIC_VDSO_OVERFLOW_PROTECT select GUP_GET_PXX_LOW_HIGH if X86_PAE diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h index 7fc0b560007dd8..5c6d9799f4e746 100644 --- a/include/asm-generic/vdso/vsyscall.h +++ b/include/asm-generic/vdso/vsyscall.h @@ -4,8 +4,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE - #ifndef __arch_get_vdso_u_time_data static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { @@ -20,8 +18,6 @@ static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(vo } #endif -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ - #ifndef __arch_update_vdso_clock static __always_inline void __arch_update_vdso_clock(struct vdso_clock *vc) { diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 0b1982f15de427..23c39b96190fdf 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -31,7 +31,7 @@ struct arch_vdso_time_data {}; #if defined(CONFIG_ARCH_HAS_VDSO_ARCH_DATA) #include -#elif defined(CONFIG_GENERIC_VDSO_DATA_STORE) +#else struct vdso_arch_data { /* Needed for the generic code, never actually used at runtime */ char __unused; @@ -164,7 +164,6 @@ struct vdso_rng_data { * With the hidden visibility, the compiler simply generates a PC-relative * relocation, and this is what we need. */ -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE extern struct vdso_time_data vdso_u_time_data __attribute__((visibility("hidden"))); extern struct vdso_rng_data vdso_u_rng_data __attribute__((visibility("hidden"))); extern struct vdso_arch_data vdso_u_arch_data __attribute__((visibility("hidden"))); @@ -185,8 +184,6 @@ enum vdso_pages { VDSO_NR_PAGES }; -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ - /* * The generic vDSO implementation requires that gettimeofday.h * provides: diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 2594dd7185be76..48ffb0f6fa413e 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -31,8 +31,3 @@ config VDSO_GETRANDOM bool help Selected by architectures that support vDSO getrandom(). - -config GENERIC_VDSO_DATA_STORE - bool - help - Selected by architectures that use the generic vDSO data store. 
diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile index aedd40aaa950c8..405f743253d72b 100644 --- a/lib/vdso/Makefile +++ b/lib/vdso/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_GENERIC_VDSO_DATA_STORE) += datastore.o +obj-$(CONFIG_HAVE_GENERIC_VDSO) += datastore.o diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 1e2a40b8d2c6c2..95df0153f05ab4 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -108,13 +108,11 @@ bool vdso_get_timestamp(const struct vdso_time_data *vd, const struct vdso_clock return true; } -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE static __always_inline const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) { return (void *)vd + PAGE_SIZE; } -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ static __always_inline bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, From bad53ae2dc4296acb8cbcee385e0238cea484100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 26 Aug 2025 08:17:13 +0200 Subject: [PATCH 0837/3206] vdso: Drop Kconfig GENERIC_VDSO_TIME_NS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All architectures implementing time-related functionality in the vDSO are using the generic vDSO library which handles time namespaces properly. Remove the now unnecessary Kconfig symbol. Enables the use of time namespaces on architectures, which use the generic vDSO but did not enable GENERIC_VDSO_TIME_NS, namely MIPS and arm. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Catalin Marinas Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-10-d9b65750e49f@linutronix.de --- arch/arm64/Kconfig | 1 - arch/loongarch/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/x86/Kconfig | 1 - init/Kconfig | 2 +- lib/vdso/Kconfig | 6 ------ tools/testing/selftests/pidfd/config | 1 - 9 files changed, 1 insertion(+), 14 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b0f007b396c819..e19b006842e223 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -162,7 +162,6 @@ config ARM64 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_TIME_NS select HARDIRQS_SW_RESEND select HAS_IOPORT select HAVE_MOVE_PMD diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index d15b201d55f951..754626b58b929e 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -108,7 +108,6 @@ config LOONGARCH select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_TIME_NS select GPIOLIB select HAS_IOPORT select HAVE_ARCH_AUDITSYSCALL diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 78c82af955616c..d715e3d65b5c29 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -207,7 +207,6 @@ config PPC select GENERIC_PCI_IOMAP if PCI select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_TIME_NS select HAS_IOPORT if PCI select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index f6cf9180ccf2c9..6e5efbeb83d7f0 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -121,7 +121,6 @@ config RISCV select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL if GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_TIME_NS if GENERIC_GETTIMEOFDAY select 
HARDIRQS_SW_RESEND select HAS_IOPORT if MMU select HAVE_ALIGNED_STRUCT_PAGE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 696d2243d64b04..e06ebbd860da93 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -167,7 +167,6 @@ config S390 select GENERIC_GETTIMEOFDAY select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1e74b2a356e42b..d1961811e291e8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -181,7 +181,6 @@ config X86 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_TIME_NS select GENERIC_VDSO_OVERFLOW_PROTECT select GUP_GET_PXX_LOW_HIGH if X86_PAE select HARDIRQS_SW_RESEND diff --git a/init/Kconfig b/init/Kconfig index d811cad02a7509..497bd326d39a6b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1347,7 +1347,7 @@ config UTS_NS config TIME_NS bool "TIME namespace" - depends on GENERIC_VDSO_TIME_NS + depends on GENERIC_GETTIMEOFDAY default y help In this namespace boottime and monotonic clocks can be set. diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 48ffb0f6fa413e..3d2c2b90d193f9 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -12,12 +12,6 @@ config GENERIC_GETTIMEOFDAY Each architecture that enables this feature has to provide the fallback implementation. -config GENERIC_VDSO_TIME_NS - bool - help - Selected by architectures which support time namespaces in the - VDSO - config GENERIC_VDSO_OVERFLOW_PROTECT bool help diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config index 6133524710f790..cf7cc0ce02484e 100644 --- a/tools/testing/selftests/pidfd/config +++ b/tools/testing/selftests/pidfd/config @@ -4,6 +4,5 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_TIME_NS=y -CONFIG_GENERIC_VDSO_TIME_NS=y CONFIG_CGROUPS=y CONFIG_CHECKPOINT_RESTORE=y From 258b37c6e626625fe441e16ab90e6a542a66eaee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 26 Aug 2025 08:17:14 +0200 Subject: [PATCH 0838/3206] vdso: Gate VDSO_GETRANDOM behind HAVE_GENERIC_VDSO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All architectures which want to implement getrandom() in the vDSO need to use the generic vDSO library. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-11-d9b65750e49f@linutronix.de --- lib/vdso/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 3d2c2b90d193f9..db87ba34ef1928 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -19,9 +19,9 @@ config GENERIC_VDSO_OVERFLOW_PROTECT time getter functions for the price of an extra conditional in the hotpath. -endif - config VDSO_GETRANDOM bool help Selected by architectures that support vDSO getrandom(). + +endif From 9428fff44f0c5823fde733b64a344d7a6eda3873 Mon Sep 17 00:00:00 2001 From: "hongyu.chen1" Date: Fri, 22 Aug 2025 13:39:55 +0800 Subject: [PATCH 0839/3206] dt-bindings: power: add Amlogic S6 S7 S7D power domains Add devicetree binding document and related header file for Amlogic S6 S7 S7D secure power domains. 
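Editor's note: a hypothetical consumer snippet (label and node names invented for illustration) showing how a board dts would reference one of the new domains through the header's IDs.

    #include <dt-bindings/power/amlogic,s7-pwrc.h>

    /* hypothetical device node on an S7 board; &pwrc is assumed to label
     * the amlogic,s7-pwrc controller node */
    &ethmac {
            power-domains = <&pwrc PWRC_S7_ETH_ID>;
    };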
Signed-off-by: hongyu.chen1 Signed-off-by: Xianwei Zhao Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250822-pm-s6-s7-s7d-v1-1-82e3f3aff327@amlogic.com Signed-off-by: Ulf Hansson --- .../power/amlogic,meson-sec-pwrc.yaml | 3 ++ include/dt-bindings/power/amlogic,s6-pwrc.h | 29 +++++++++++++++++++ include/dt-bindings/power/amlogic,s7-pwrc.h | 20 +++++++++++++ include/dt-bindings/power/amlogic,s7d-pwrc.h | 27 +++++++++++++++++ 4 files changed, 79 insertions(+) create mode 100644 include/dt-bindings/power/amlogic,s6-pwrc.h create mode 100644 include/dt-bindings/power/amlogic,s7-pwrc.h create mode 100644 include/dt-bindings/power/amlogic,s7d-pwrc.h diff --git a/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml b/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml index 15d74138baa343..12b71688dd3407 100644 --- a/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml +++ b/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml @@ -24,6 +24,9 @@ properties: - amlogic,a5-pwrc - amlogic,c3-pwrc - amlogic,t7-pwrc + - amlogic,s6-pwrc + - amlogic,s7-pwrc + - amlogic,s7d-pwrc "#power-domain-cells": const: 1 diff --git a/include/dt-bindings/power/amlogic,s6-pwrc.h b/include/dt-bindings/power/amlogic,s6-pwrc.h new file mode 100644 index 00000000000000..2c005864ae73fa --- /dev/null +++ b/include/dt-bindings/power/amlogic,s6-pwrc.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Amlogic, Inc. All rights reserved + */ +#ifndef _DT_BINDINGS_AMLOGIC_S6_POWER_H +#define _DT_BINDINGS_AMLOGIC_S6_POWER_H + +#define PWRC_S6_DSPA_ID 0 +#define PWRC_S6_DOS_HEVC_ID 1 +#define PWRC_S6_DOS_VDEC_ID 2 +#define PWRC_S6_VPU_HDMI_ID 3 +#define PWRC_S6_U2DRD_ID 4 +#define PWRC_S6_U3DRD_ID 5 +#define PWRC_S6_SD_EMMC_C_ID 6 +#define PWRC_S6_GE2D_ID 7 +#define PWRC_S6_AMFC_ID 8 +#define PWRC_S6_VC9000E_ID 9 +#define PWRC_S6_DEWARP_ID 10 +#define PWRC_S6_VICP_ID 11 +#define PWRC_S6_SD_EMMC_A_ID 12 +#define PWRC_S6_SD_EMMC_B_ID 13 +#define PWRC_S6_ETH_ID 14 +#define PWRC_S6_PCIE_ID 15 +#define PWRC_S6_NNA_4T_ID 16 +#define PWRC_S6_AUDIO_ID 17 +#define PWRC_S6_AUCPU_ID 18 +#define PWRC_S6_ADAPT_ID 19 + +#endif diff --git a/include/dt-bindings/power/amlogic,s7-pwrc.h b/include/dt-bindings/power/amlogic,s7-pwrc.h new file mode 100644 index 00000000000000..3f21d095f784ef --- /dev/null +++ b/include/dt-bindings/power/amlogic,s7-pwrc.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Amlogic, Inc. All rights reserved + */ +#ifndef _DT_BINDINGS_AMLOGIC_S7_POWER_H +#define _DT_BINDINGS_AMLOGIC_S7_POWER_H + +#define PWRC_S7_DOS_HEVC_ID 0 +#define PWRC_S7_DOS_VDEC_ID 1 +#define PWRC_S7_VPU_HDMI_ID 2 +#define PWRC_S7_USB_COMB_ID 3 +#define PWRC_S7_SD_EMMC_C_ID 4 +#define PWRC_S7_GE2D_ID 5 +#define PWRC_S7_SD_EMMC_A_ID 6 +#define PWRC_S7_SD_EMMC_B_ID 7 +#define PWRC_S7_ETH_ID 8 +#define PWRC_S7_AUCPU_ID 9 +#define PWRC_S7_AUDIO_ID 10 + +#endif diff --git a/include/dt-bindings/power/amlogic,s7d-pwrc.h b/include/dt-bindings/power/amlogic,s7d-pwrc.h new file mode 100644 index 00000000000000..c6998553670aae --- /dev/null +++ b/include/dt-bindings/power/amlogic,s7d-pwrc.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Amlogic, Inc. 
All rights reserved + */ +#ifndef _DT_BINDINGS_AMLOGIC_S7D_POWER_H +#define _DT_BINDINGS_AMLOGIC_S7D_POWER_H + +#define PWRC_S7D_DOS_HCODEC_ID 0 +#define PWRC_S7D_DOS_HEVC_ID 1 +#define PWRC_S7D_DOS_VDEC_ID 2 +#define PWRC_S7D_VPU_HDMI_ID 3 +#define PWRC_S7D_USB_U2DRD_ID 4 +#define PWRC_S7D_USB_U2H_ID 5 +#define PWRC_S7D_SSD_EMMC_C_ID 6 +#define PWRC_S7D_GE2D_ID 7 +#define PWRC_S7D_AMFC_ID 8 +#define PWRC_S7D_EMMC_A_ID 9 +#define PWRC_S7D_EMMC_B_ID 10 +#define PWRC_S7D_ETH_ID 11 +#define PWRC_S7D_AUCPU_ID 12 +#define PWRC_S7D_AUDIO_ID 13 +#define PWRC_S7D_SRAMA_ID 14 +#define PWRC_S7D_DMC0_ID 15 +#define PWRC_S7D_DMC1_ID 16 +#define PWRC_S7D_DDR_ID 17 + +#endif From b283b70967c0553b66e924f40e1c35b68fa8a1f7 Mon Sep 17 00:00:00 2001 From: "hongyu.chen1" Date: Fri, 22 Aug 2025 13:39:56 +0800 Subject: [PATCH 0840/3206] pmdomain: amlogic: Add support for S6 S7 S7D power domains controller Add support for the S6 S7 S7D power controller, whose registers are in the secure domain and should be accessed via SMC. Signed-off-by: hongyu.chen1 Signed-off-by: Xianwei Zhao Link: https://lore.kernel.org/r/20250822-pm-s6-s7-s7d-v1-2-82e3f3aff327@amlogic.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/amlogic/meson-secure-pwrc.c | 95 ++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/drivers/pmdomain/amlogic/meson-secure-pwrc.c b/drivers/pmdomain/amlogic/meson-secure-pwrc.c index e8bda60078c455..1d2f371d2d7f04 100644 --- a/drivers/pmdomain/amlogic/meson-secure-pwrc.c +++ b/drivers/pmdomain/amlogic/meson-secure-pwrc.c @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -201,6 +204,71 @@ static const struct meson_secure_pwrc_domain_desc s4_pwrc_domains[] = { SEC_PD(S4_AUDIO, 0), }; +static const struct meson_secure_pwrc_domain_desc s6_pwrc_domains[] = { + SEC_PD(S6_DSPA, 0), + SEC_PD(S6_DOS_HEVC, 0), + SEC_PD(S6_DOS_VDEC, 0), + SEC_PD(S6_VPU_HDMI, 0), + SEC_PD(S6_U2DRD, 0), + SEC_PD(S6_U3DRD, 0), + SEC_PD(S6_SD_EMMC_C, 0), + SEC_PD(S6_GE2D, 0), + SEC_PD(S6_AMFC, 0), + SEC_PD(S6_VC9000E, 0), + SEC_PD(S6_DEWARP, 0), + SEC_PD(S6_VICP, 0), + SEC_PD(S6_SD_EMMC_A, 0), + SEC_PD(S6_SD_EMMC_B, 0), + /* ETH is for ethernet online wakeup, and should be always on */ + SEC_PD(S6_ETH, GENPD_FLAG_ALWAYS_ON), + SEC_PD(S6_PCIE, 0), + SEC_PD(S6_NNA_4T, 0), + SEC_PD(S6_AUDIO, 0), + SEC_PD(S6_AUCPU, 0), + SEC_PD(S6_ADAPT, 0), +}; + +static const struct meson_secure_pwrc_domain_desc s7_pwrc_domains[] = { + SEC_PD(S7_DOS_HEVC, 0), + SEC_PD(S7_DOS_VDEC, 0), + SEC_PD(S7_VPU_HDMI, 0), + SEC_PD(S7_USB_COMB, 0), + SEC_PD(S7_SD_EMMC_C, 0), + SEC_PD(S7_GE2D, 0), + SEC_PD(S7_SD_EMMC_A, 0), + SEC_PD(S7_SD_EMMC_B, 0), + /* ETH is for ethernet online wakeup, and should be always on */ + SEC_PD(S7_ETH, GENPD_FLAG_ALWAYS_ON), + SEC_PD(S7_AUCPU, 0), + SEC_PD(S7_AUDIO, 0), +}; + +static const struct meson_secure_pwrc_domain_desc s7d_pwrc_domains[] = { + SEC_PD(S7D_DOS_HCODEC, 0), + SEC_PD(S7D_DOS_HEVC, 0), + SEC_PD(S7D_DOS_VDEC, 0), + SEC_PD(S7D_VPU_HDMI, 0), + SEC_PD(S7D_USB_U2DRD, 0), + SEC_PD(S7D_USB_U2H, 0), + SEC_PD(S7D_SSD_EMMC_C, 0), + SEC_PD(S7D_GE2D, 0), + SEC_PD(S7D_AMFC, 0), + SEC_PD(S7D_EMMC_A, 0), + SEC_PD(S7D_EMMC_B, 0), + /* ETH is for ethernet online wakeup, and should be always on */ + SEC_PD(S7D_ETH, GENPD_FLAG_ALWAYS_ON), + SEC_PD(S7D_AUCPU, 0), + SEC_PD(S7D_AUDIO, 0), + /* SRAMA is used as ATF runtime memory, and should be always on */ + SEC_PD(S7D_SRAMA, GENPD_FLAG_ALWAYS_ON), + /* DMC0 is for DDR PHY ana/dig and DMC, and should be always on */ + 
SEC_PD(S7D_DMC0, GENPD_FLAG_ALWAYS_ON), + /* DMC1 is for DDR PHY ana/dig and DMC, and should be always on */ + SEC_PD(S7D_DMC1, GENPD_FLAG_ALWAYS_ON), + /* DDR should be always on */ + SEC_PD(S7D_DDR, GENPD_FLAG_ALWAYS_ON), +}; + static const struct meson_secure_pwrc_domain_desc t7_pwrc_domains[] = { SEC_PD(T7_DSPA, 0), SEC_PD(T7_DSPB, 0), @@ -367,6 +435,21 @@ static const struct meson_secure_pwrc_domain_data meson_secure_s4_pwrc_data = { .count = ARRAY_SIZE(s4_pwrc_domains), }; +static const struct meson_secure_pwrc_domain_data amlogic_secure_s6_pwrc_data = { + .domains = s6_pwrc_domains, + .count = ARRAY_SIZE(s6_pwrc_domains), +}; + +static const struct meson_secure_pwrc_domain_data amlogic_secure_s7_pwrc_data = { + .domains = s7_pwrc_domains, + .count = ARRAY_SIZE(s7_pwrc_domains), +}; + +static const struct meson_secure_pwrc_domain_data amlogic_secure_s7d_pwrc_data = { + .domains = s7d_pwrc_domains, + .count = ARRAY_SIZE(s7d_pwrc_domains), +}; + static const struct meson_secure_pwrc_domain_data amlogic_secure_t7_pwrc_data = { .domains = t7_pwrc_domains, .count = ARRAY_SIZE(t7_pwrc_domains), @@ -393,6 +476,18 @@ static const struct of_device_id meson_secure_pwrc_match_table[] = { .compatible = "amlogic,meson-s4-pwrc", .data = &meson_secure_s4_pwrc_data, }, + { + .compatible = "amlogic,s6-pwrc", + .data = &amlogic_secure_s6_pwrc_data, + }, + { + .compatible = "amlogic,s7-pwrc", + .data = &amlogic_secure_s7_pwrc_data, + }, + { + .compatible = "amlogic,s7d-pwrc", + .data = &amlogic_secure_s7d_pwrc_data, + }, { .compatible = "amlogic,t7-pwrc", .data = &amlogic_secure_t7_pwrc_data, From d4bc3b11c12b41fdb5650f5ad797de97f8dce869 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 3 Sep 2025 17:42:05 +0200 Subject: [PATCH 0841/3206] x86/apic/savic: Do not use snp_abort() This function is going away so replace the callsites with the equivalent functionality. Add a new SAVIC-specific termination reason. If more granularity is needed there, it will be revisited in the future. Signed-off-by: Borislav Petkov (AMD) --- arch/x86/coco/sev/core.c | 4 ++-- arch/x86/include/asm/sev-common.h | 1 + arch/x86/kernel/apic/x2apic_savic.c | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index b64f43010a1289..e858e2979db044 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1129,7 +1129,7 @@ u64 savic_ghcb_msr_read(u32 reg) if (res != ES_OK) { pr_err("Secure AVIC MSR (0x%llx) read returned error (%d)\n", msr, res); /* MSR read failures are treated as fatal errors */ - snp_abort(); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); } __sev_put_ghcb(&state); @@ -1159,7 +1159,7 @@ void savic_ghcb_msr_write(u32 reg, u64 value) if (res != ES_OK) { pr_err("Secure AVIC MSR (0x%llx) write returned error (%d)\n", msr, res); /* MSR writes should never fail. 
Any failure is fatal error for SNP guest */ - snp_abort(); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); } __sev_put_ghcb(&state); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 0020d77a080001..01a6e4dbe42357 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -208,6 +208,7 @@ struct snp_psc_desc { #define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */ #define GHCB_TERM_SECURE_TSC 10 /* Secure TSC initialization failed */ #define GHCB_TERM_SVSM_CA_REMAP_FAIL 11 /* SVSM is present but CA could not be remapped */ +#define GHCB_TERM_SAVIC_FAIL 12 /* Secure AVIC-specific failure */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index b846de0fbcfadf..dbc5678bc3b689 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -363,7 +363,7 @@ static void savic_setup(void) */ res = savic_register_gpa(gpa); if (res != ES_OK) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI); @@ -376,13 +376,13 @@ static int savic_probe(void) if (!x2apic_mode) { pr_err("Secure AVIC enabled in non x2APIC mode\n"); - snp_abort(); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); /* unreachable */ } savic_page = alloc_percpu(struct secure_avic_page); if (!savic_page) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); return 1; } From 9f8d92a1fbb5a08e17f9d405a1ab27be64096d8c Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 3 Sep 2025 18:14:54 +0200 Subject: [PATCH 0842/3206] x86/sev: Zap snp_abort() It is a silly oneliner anyway. Replace it with its equivalent. No functional changes. Signed-off-by: Borislav Petkov (AMD) --- arch/x86/boot/startup/sev-startup.c | 7 +------ arch/x86/boot/startup/sme.c | 2 +- arch/x86/include/asm/sev.h | 2 -- tools/objtool/noreturns.h | 1 - 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 39465a0ff4e5f3..a9b0a9c32d8ff8 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -144,7 +144,7 @@ static struct cc_blob_sev_info *__init find_cc_blob(struct boot_params *bp) found_cc_info: if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); return cc_info; } @@ -218,8 +218,3 @@ bool __init snp_init(struct boot_params *bp) return true; } - -void __init __noreturn snp_abort(void) -{ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); -} diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index 2ddde901c8c5b7..e7ea65f3f1d6a5 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -532,7 +532,7 @@ void __init sme_enable(struct boot_params *bp) * enablement abort the guest. 
*/ if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED)) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index f222bef9dca88c..32c7dd916e4b93 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -512,7 +512,6 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); void snp_set_memory_private(unsigned long vaddr, unsigned long npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); -void __noreturn snp_abort(void); void snp_dmi_setup(void); int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input); void snp_accept_memory(phys_addr_t start, phys_addr_t end); @@ -597,7 +596,6 @@ static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npag static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } -static inline void snp_abort(void) { } static inline void snp_dmi_setup(void) { } static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input) { diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 6a922d046b8e2c..802895fae3cac0 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -45,7 +45,6 @@ NORETURN(rewind_stack_and_make_dead) NORETURN(rust_begin_unwind) NORETURN(rust_helper_BUG) NORETURN(sev_es_terminate) -NORETURN(snp_abort) NORETURN(start_kernel) NORETURN(stop_this_cpu) NORETURN(usercopy_abort) From c9ddc41cdd522f2db5d492eda3df8994d928be34 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Sun, 17 Aug 2025 19:20:22 -0500 Subject: [PATCH 0843/3206] Input: iqs7222 - avoid enabling unused interrupts If a proximity event node is defined so as to specify the wake-up properties of the touch surface, the proximity event interrupt is enabled unconditionally. This may result in unwanted interrupts. Solve this problem by enabling the interrupt only if the event is mapped to a key or switch code. Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/aKJxxgEWpNaNcUaW@nixie71 Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/misc/iqs7222.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/misc/iqs7222.c b/drivers/input/misc/iqs7222.c index 6fac31c0d99f2b..ff23219a582ab8 100644 --- a/drivers/input/misc/iqs7222.c +++ b/drivers/input/misc/iqs7222.c @@ -2427,6 +2427,9 @@ static int iqs7222_parse_chan(struct iqs7222_private *iqs7222, if (error) return error; + if (!iqs7222->kp_type[chan_index][i]) + continue; + if (!dev_desc->event_offset) continue; From 2ef3886ce626dcdab0cbc452dbbebc19f57133d8 Mon Sep 17 00:00:00 2001 From: Christian Bruel Date: Tue, 2 Sep 2025 11:10:45 +0200 Subject: [PATCH 0844/3206] irqchip/gic-v2m: Handle Multiple MSI base IRQ Alignment The PCI Local Bus Specification 3.0 (section 6.8.1.6) allows modifying the low-order bits of the MSI Message DATA register to encode nr_irqs interrupt numbers in the log2(nr_irqs) bits for the domain. The problem arises if the base vector (GICV2m base spi) is not aligned with nr_irqs; in this case, the low-order log2(nr_irqs) bits from the base vector conflict with the nr_irqs masking, causing the wrong MSI interrupt to be identified. 
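A made-up numeric illustration (values invented for this write-up, not
taken from any particular board): suppose the first free hwirq in a v2m
frame is 66 and a device requests nr_irqs = 4. Per the spec behaviour
above, the endpoint substitutes the vector index into the low
log2(4) = 2 bits of the Message Data value, so it effectively sends

	data = (66 & ~0x3) | vector;	/* yields 64, 65, 66, 67 */

while the allocator believes it handed out hwirqs 66..69 — half of the
vectors target SPIs that were never allocated to this device.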
To fix this, use bitmap_find_next_zero_area_off() instead of bitmap_find_free_region() to align the initial base vector with nr_irqs. Signed-off-by: Christian Bruel Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/all/20250902091045.220847-1-christian.bruel@foss.st.com --- drivers/irqchip/irq-gic-v2m.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index 24ef5af569fe4e..8a3410c2b7b575 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -153,14 +153,19 @@ static int gicv2m_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, { msi_alloc_info_t *info = args; struct v2m_data *v2m = NULL, *tmp; - int hwirq, offset, i, err = 0; + int hwirq, i, err = 0; + unsigned long offset; + unsigned long align_mask = nr_irqs - 1; spin_lock(&v2m_lock); list_for_each_entry(tmp, &v2m_nodes, entry) { - offset = bitmap_find_free_region(tmp->bm, tmp->nr_spis, - get_count_order(nr_irqs)); - if (offset >= 0) { + unsigned long align_off = tmp->spi_start - (tmp->spi_start & ~align_mask); + + offset = bitmap_find_next_zero_area_off(tmp->bm, tmp->nr_spis, 0, + nr_irqs, align_mask, align_off); + if (offset < tmp->nr_spis) { v2m = tmp; + bitmap_set(v2m->bm, offset, nr_irqs); break; } } From a4bd4c330d5deaaa54db3a2ca4d2dd402d3a7248 Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Thu, 4 Sep 2025 11:00:37 +0800 Subject: [PATCH 0845/3206] riscv: sophgo: dts: sg2042: Change msi irq type to IRQ_TYPE_EDGE_RISING Fix msi irq type to be the correct type, although this field is not used yet. Signed-off-by: Chen Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/831c1b650c575380d56ef3e2faed9bee278c9006.1756953919.git.unicorn_wang@outlook.com --- arch/riscv/boot/dts/sophgo/sg2042.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/sophgo/sg2042.dtsi b/arch/riscv/boot/dts/sophgo/sg2042.dtsi index b3e4d3c18fdcf9..6430c6e25c0017 100644 --- a/arch/riscv/boot/dts/sophgo/sg2042.dtsi +++ b/arch/riscv/boot/dts/sophgo/sg2042.dtsi @@ -190,7 +190,7 @@ reg-names = "clr", "doorbell"; msi-controller; #msi-cells = <0>; - msi-ranges = <&intc 64 IRQ_TYPE_LEVEL_HIGH 32>; + msi-ranges = <&intc 64 IRQ_TYPE_EDGE_RISING 32>; }; rpgate: clock-controller@7030010368 { From 8aefd2724451dedea1368d3915ab2dd5ecebc3cb Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Thu, 4 Sep 2025 11:00:59 +0800 Subject: [PATCH 0846/3206] riscv: sophgo: dts: sg2044: Change msi irq type to IRQ_TYPE_EDGE_RISING Fix msi irq type to be the correct type, although this field is not used yet. 
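As a reading aid (cell meanings per the generic msi-ranges binding, not
something introduced by this patch), the updated property reads as:

	/*            parent  hwirq  trigger               span */
	msi-ranges = <&intc   64     IRQ_TYPE_EDGE_RISING  32>;

i.e. 32 MSI vectors mapped onto the 32 parent interrupts starting at 64,
now declared as edge-rising.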
Signed-off-by: Chen Wang
Signed-off-by: Thomas Gleixner
Tested-by: Inochi Amaoto # Sophgo SRD3-10
Link: https://lore.kernel.org/all/c38b9b1682af978473705b7e70b6faaa36fe5024.1756953919.git.unicorn_wang@outlook.com
---
 arch/riscv/boot/dts/sophgo/sg2044.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/riscv/boot/dts/sophgo/sg2044.dtsi b/arch/riscv/boot/dts/sophgo/sg2044.dtsi
index 6ec955744b0cbf..320c4d1d08e69c 100644
--- a/arch/riscv/boot/dts/sophgo/sg2044.dtsi
+++ b/arch/riscv/boot/dts/sophgo/sg2044.dtsi
@@ -214,7 +214,7 @@
 			reg-names = "clr", "doorbell";
 			#msi-cells = <0>;
 			msi-controller;
-			msi-ranges = <&intc 352 IRQ_TYPE_LEVEL_HIGH 512>;
+			msi-ranges = <&intc 352 IRQ_TYPE_EDGE_RISING 512>;
 			status = "disabled";
 		};

From c2616c5696e85efb2679499d7260f7766b93cff6 Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Thu, 4 Sep 2025 11:01:19 +0800
Subject: [PATCH 0847/3206] irqchip/sg2042-msi: Set irq type according to DT
 configuration

Read the device tree configuration and use it to set the interrupt type.

Signed-off-by: Chen Wang
Signed-off-by: Thomas Gleixner
Tested-by: Inochi Amaoto # Sophgo SRD3-10
Link: https://lore.kernel.org/all/b22d2b0a00a96161253435d17b3c66538f3ba1c2.1756953919.git.unicorn_wang@outlook.com
---
 drivers/irqchip/irq-sg2042-msi.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/irqchip/irq-sg2042-msi.c b/drivers/irqchip/irq-sg2042-msi.c
index 3b13dbbfdb51c0..f7cf0dc72eabbf 100644
--- a/drivers/irqchip/irq-sg2042-msi.c
+++ b/drivers/irqchip/irq-sg2042-msi.c
@@ -30,6 +30,7 @@ struct sg204x_msi_chip_info {
  * @doorbell_addr:	see TRM, 10.1.32, GP_INTR0_SET
  * @irq_first:	First vectors number that MSIs starts
  * @num_irqs:	Number of vectors for MSIs
+ * @irq_type:	IRQ type for MSIs
  * @msi_map:	mapping for allocated MSI vectors.
  * @msi_map_lock:	Lock for msi_map
  * @chip_info:	chip specific infomations
@@ -41,6 +42,7 @@ struct sg204x_msi_chipdata {

 	u32	irq_first;
 	u32	num_irqs;
+	unsigned int	irq_type;

 	unsigned long	*msi_map;
 	struct mutex	msi_map_lock;
@@ -137,14 +139,14 @@ static int sg204x_msi_parent_domain_alloc(struct irq_domain *domain, unsigned in
 	fwspec.fwnode = domain->parent->fwnode;
 	fwspec.param_count = 2;
 	fwspec.param[0] = data->irq_first + hwirq;
-	fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
+	fwspec.param[1] = data->irq_type;

 	ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
 	if (ret)
 		return ret;

 	d = irq_domain_get_irq_data(domain->parent, virq);
-	return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
+	return d->chip->irq_set_type(d, data->irq_type);
 }

 static int sg204x_msi_middle_domain_alloc(struct irq_domain *domain, unsigned int virq,
@@ -298,6 +300,7 @@ static int sg2042_msi_probe(struct platform_device *pdev)
 	}

 	data->irq_first = (u32)args.args[0];
+	data->irq_type = (unsigned int)args.args[1];
 	data->num_irqs = (u32)args.args[args.nargs - 1];

 	mutex_init(&data->msi_map_lock);

From df6a44003953fb23ad67f82d299e439e7ff7150a Mon Sep 17 00:00:00 2001
From: Alexander Stein
Date: Tue, 19 Aug 2025 09:04:57 +0200
Subject: [PATCH 0848/3206] mfd: stmpe: Allow building as module

Export the core probe and remove function to be used by i2c and spi
drivers. Also add necessary module information so the drivers can be
built as modules. This reduces the footprint if the driver is enabled
but unused.
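A sketch of what the export enables (illustrative only, not the in-tree
stmpe-i2c code; get_client_info() is a hypothetical helper and STMPE811
is just one example part number):

	/* in a separately built interface module, e.g. stmpe-i2c.ko */
	static int my_stmpe_i2c_probe(struct i2c_client *client)
	{
		struct stmpe_client_info *ci = get_client_info(client);

		/* resolved at module load time via EXPORT_SYMBOL_GPL() */
		return stmpe_probe(ci, STMPE811);
	}
	MODULE_LICENSE("GPL");	/* required to bind to the *_GPL exports */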
Signed-off-by: Alexander Stein Link: https://lore.kernel.org/r/20250819070458.1027883-1-alexander.stein@ew.tq-group.com Signed-off-by: Bartosz Golaszewski --- drivers/mfd/Kconfig | 10 +++++----- drivers/mfd/stmpe.c | 6 ++++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1e7..bfaffb74c99113 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1539,8 +1539,8 @@ config MFD_DB8500_PRCMU through a register map. config MFD_STMPE - bool "STMicroelectronics STMPE" - depends on I2C=y || SPI_MASTER=y + tristate "STMicroelectronics STMPE" + depends on I2C || SPI_MASTER depends on OF select MFD_CORE help @@ -1568,14 +1568,14 @@ menu "STMicroelectronics STMPE Interface Drivers" depends on MFD_STMPE config STMPE_I2C - bool "STMicroelectronics STMPE I2C Interface" - depends on I2C=y + tristate "STMicroelectronics STMPE I2C Interface" + depends on I2C default y help This is used to enable I2C interface of STMPE config STMPE_SPI - bool "STMicroelectronics STMPE SPI Interface" + tristate "STMicroelectronics STMPE SPI Interface" depends on SPI_MASTER help This is used to enable SPI interface of STMPE diff --git a/drivers/mfd/stmpe.c b/drivers/mfd/stmpe.c index 819d19dc9b4a91..0cb7af11ea6071 100644 --- a/drivers/mfd/stmpe.c +++ b/drivers/mfd/stmpe.c @@ -1482,6 +1482,7 @@ int stmpe_probe(struct stmpe_client_info *ci, enum stmpe_partnum partnum) return ret; } +EXPORT_SYMBOL_GPL(stmpe_probe); void stmpe_remove(struct stmpe *stmpe) { @@ -1494,6 +1495,7 @@ void stmpe_remove(struct stmpe *stmpe) mfd_remove_devices(stmpe->dev); } +EXPORT_SYMBOL_GPL(stmpe_remove); static int stmpe_suspend(struct device *dev) { @@ -1517,3 +1519,7 @@ static int stmpe_resume(struct device *dev) EXPORT_GPL_SIMPLE_DEV_PM_OPS(stmpe_dev_pm_ops, stmpe_suspend, stmpe_resume); + +MODULE_DESCRIPTION("STMPE Core driver"); +MODULE_AUTHOR("Rabin Vincent "); +MODULE_LICENSE("GPL"); From 79d15f23f232f90bfd8823239fd57b964d053548 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 15:19:03 +0200 Subject: [PATCH 0849/3206] gpio: nomadik: wrap a local variable in a necessary ifdef The 'desc' local variable in nmk_gpio_dbg_show_one() is now only used with CONFIG_PINCTRL_NOMADIK enabled so wrap its declaration with an appropriate ifdef. Fixes: ddeb66d2cb10 ("gpio: nomadik: don't print out global GPIO numbers in debugfs callbacks") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509032125.nXBcPuaf-lkp@intel.com/ Link: https://lore.kernel.org/r/20250903131903.95100-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-nomadik.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpio/gpio-nomadik.c b/drivers/gpio/gpio-nomadik.c index fde4b416faa818..97c5cd33279d56 100644 --- a/drivers/gpio/gpio-nomadik.c +++ b/drivers/gpio/gpio-nomadik.c @@ -400,7 +400,9 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, struct gpio_chip *chip, unsigned int offset) { struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(chip); +#ifdef CONFIG_PINCTRL_NOMADIK struct gpio_desc *desc; +#endif int mode; bool is_out; bool data_out; From 855042367e3f1ddc1e896fcd00af9c397d1174cb Mon Sep 17 00:00:00 2001 From: Sohil Mehta Date: Mon, 18 Aug 2025 12:01:36 -0700 Subject: [PATCH 0850/3206] x86/microcode/intel: Refresh the revisions that determine old_microcode Update the minimum expected revisions of Intel microcode based on the microcode-20250512 (May 2025) release. 
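For readers scanning the table below: each entry matches a CPU by
family/model plus a stepping bitmask, and driver_data carries the
minimum expected microcode revision for that stepping. Annotated shape
of one entry from this update (annotations added here as a reading
aid):

	{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID,
	  .vendor = X86_VENDOR_INTEL,
	  .family = 0x6, .model = 0x55,
	  .steppings = 0x0080,		/* bitmask: bit 7 = stepping 7 */
	  .driver_data = 0x5003901 },	/* minimum expected ucode revision */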
Signed-off-by: Sohil Mehta Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/all/20250818190137.3525414-2-sohil.mehta%40intel.com --- .../kernel/cpu/microcode/intel-ucode-defs.h | 86 +++++++++++-------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h index cb6e601701ab6e..2d48e6593540f2 100644 --- a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h +++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h @@ -67,9 +67,8 @@ { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002904 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003901 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002b01 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a }, @@ -81,51 +80,62 @@ { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd000404 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002d0 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 }, -{ .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xca }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x3c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x56 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000639 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, 
.model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c0003f7 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x437 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x437 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xfa }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = 
X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x104 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0x102 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x64 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x24 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xad, .steppings = 0x0002, .driver_data = 0xa0000d1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaf, .steppings = 0x0008, .driver_data = 0x3000341 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb5, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0010, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbd, .steppings = 0x0002, 
.driver_data = 0x11f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0040, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0080, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc5, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0010, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xca, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x210002a9 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x210002a9 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e }, From 632ff61706473127cdc3b779bf24d368e3856ab3 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 20 Aug 2025 15:50:42 +0200 Subject: [PATCH 0851/3206] x86/microcode: Add microcode= cmdline parsing Add a "microcode=" command line argument after which all options can be passed in a comma-separated list. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sohil Mehta Reviewed-by: Chang S. Bae Link: https://lore.kernel.org/20250820135043.19048-2-bp@kernel.org --- .../admin-guide/kernel-parameters.txt | 8 ++++-- arch/x86/Kconfig | 4 +-- arch/x86/kernel/cpu/microcode/core.c | 26 ++++++++++++++++--- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 747a55abf4946b..9e3bbce6583f20 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3767,8 +3767,12 @@ mga= [HW,DRM] - microcode.force_minrev= [X86] - Format: + microcode= [X86] Control the behavior of the microcode loader. + Available options, comma separated: + + dis_ucode_ldr: disable the microcode loader + + force_minrev: Enable or disable the microcode minimal revision enforcement for the runtime microcode loader. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100eb..aa250d90f927a2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1340,7 +1340,7 @@ config MICROCODE_LATE_LOADING use this at your own risk. 
Late loading taints the kernel unless the microcode header indicates
	  that it is safe for late loading via the minimal revision check.
	  This minimal revision check can be enforced on
-	  the kernel command line with "microcode.minrev=Y".
+	  the kernel command line with "microcode=force_minrev".

 config MICROCODE_LATE_FORCE_MINREV
 	bool "Enforce late microcode loading minimal revision check"
@@ -1356,7 +1356,7 @@ config MICROCODE_LATE_FORCE_MINREV
 	  revision check fails.

 	  This minimal revision check can also be controlled via the
-	  "microcode.minrev" parameter on the kernel command line.
+	  "microcode=force_minrev" parameter on the kernel command line.

 	  If unsure say Y.

diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b92e09a87c6997..7d590630673b42 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -43,10 +43,9 @@
 #include "internal.h"

 static struct microcode_ops	*microcode_ops;
-static bool dis_ucode_ldr = false;
+static bool dis_ucode_ldr;

 bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
-module_param(force_minrev, bool, S_IRUSR | S_IWUSR);

 /*
  * Synchronization.
@@ -126,13 +125,32 @@ bool __init microcode_loader_disabled(void)
 	return dis_ucode_ldr;
 }

+static void early_parse_cmdline(void)
+{
+	char cmd_buf[64] = {};
+	char *s, *p = cmd_buf;
+
+	if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) {
+		while ((s = strsep(&p, ","))) {
+			if (!strcmp("force_minrev", s))
+				force_minrev = true;
+
+			if (!strcmp(s, "dis_ucode_ldr"))
+				dis_ucode_ldr = true;
+		}
+	}
+
+	/* old, compat option */
+	if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+		dis_ucode_ldr = true;
+}
+
 void __init load_ucode_bsp(void)
 {
 	unsigned int cpuid_1_eax;
 	bool intel = true;

-	if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
-		dis_ucode_ldr = true;
+	early_parse_cmdline();

 	if (microcode_loader_disabled())
 		return;

From 43181a47263dd9f2bee0afd688a841b09f9b7d12 Mon Sep 17 00:00:00 2001
From: "Borislav Petkov (AMD)"
Date: Wed, 20 Aug 2025 15:50:43 +0200
Subject: [PATCH 0852/3206] x86/microcode: Add microcode loader debugging
 functionality

Instead of adding ad-hoc debugging glue to the microcode loader each
time I need it, add debugging functionality which is not built by
default. Simulate all patch handling the loader does except the actual
loading of the microcode patch into the hardware.

Signed-off-by: Borislav Petkov (AMD)
Link: https://lore.kernel.org/20250820135043.19048-3-bp@kernel.org
---
 .../admin-guide/kernel-parameters.txt    |  4 +
 arch/x86/Kconfig                         | 12 +++
 arch/x86/kernel/cpu/microcode/amd.c      | 73 +++++++++++++------
 arch/x86/kernel/cpu/microcode/core.c     | 21 +++++-
 arch/x86/kernel/cpu/microcode/internal.h |  9 +++
 5 files changed, 96 insertions(+), 23 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9e3bbce6583f20..2c142e5f9f0617 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3770,6 +3770,10 @@
 	microcode=	[X86] Control the behavior of the microcode loader.
 			Available options, comma separated:

+			base_rev=X - with <X> with format: <u32>
+				Set the base microcode revision of each thread
+				when in debug mode.
+ dis_ucode_ldr: disable the microcode loader force_minrev: diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index aa250d90f927a2..77f72f075d896f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1360,6 +1360,18 @@ config MICROCODE_LATE_FORCE_MINREV If unsure say Y. +config MICROCODE_DBG + bool "Enable microcode loader debugging" + default n + depends on MICROCODE + help + Enable code which allows for debugging the microcode loader in + a guest. Meaning the patch loading is simulated but everything else + related to patch parsing and handling is done as on baremetal with + the purpose of debugging solely the software side of things. + + You almost certainly want to say n here. + config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" help diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 514f63340880fd..cdce885e2fd50a 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -269,15 +269,6 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi return true; } -static u32 get_patch_level(void) -{ - u32 rev, dummy __always_unused; - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - return rev; -} - static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) { union zen_patch_rev p; @@ -295,6 +286,30 @@ static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) return c; } +static u32 get_patch_level(void) +{ + u32 rev, dummy __always_unused; + + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + int cpu = smp_processor_id(); + + if (!microcode_rev[cpu]) { + if (!base_rev) + base_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); + + microcode_rev[cpu] = base_rev; + + ucode_dbg("CPU%d, base_rev: 0x%x\n", cpu, base_rev); + } + + return microcode_rev[cpu]; + } + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + return rev; +} + static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) { unsigned int i; @@ -324,13 +339,13 @@ static bool verify_container(const u8 *buf, size_t buf_size) u32 cont_magic; if (buf_size <= CONTAINER_HDR_SZ) { - pr_debug("Truncated microcode container header.\n"); + ucode_dbg("Truncated microcode container header.\n"); return false; } cont_magic = *(const u32 *)buf; if (cont_magic != UCODE_MAGIC) { - pr_debug("Invalid magic value (0x%08x).\n", cont_magic); + ucode_dbg("Invalid magic value (0x%08x).\n", cont_magic); return false; } @@ -355,8 +370,8 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { - pr_debug("Wrong microcode container equivalence table type: %u.\n", - cont_type); + ucode_dbg("Wrong microcode container equivalence table type: %u.\n", + cont_type); return false; } @@ -365,7 +380,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) equiv_tbl_len = hdr[2]; if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) || buf_size < equiv_tbl_len) { - pr_debug("Truncated equivalence table.\n"); + ucode_dbg("Truncated equivalence table.\n"); return false; } @@ -385,7 +400,7 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize const u32 *hdr; if (buf_size < SECTION_HDR_SIZE) { - pr_debug("Truncated patch section.\n"); + ucode_dbg("Truncated patch section.\n"); return false; } @@ -394,13 +409,13 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize p_size = hdr[1]; if (p_type != UCODE_UCODE_TYPE) { - pr_debug("Invalid type field (0x%x) in container file section header.\n", 
- p_type); + ucode_dbg("Invalid type field (0x%x) in container file section header.\n", + p_type); return false; } if (p_size < sizeof(struct microcode_header_amd)) { - pr_debug("Patch of size %u too short.\n", p_size); + ucode_dbg("Patch of size %u too short.\n", p_size); return false; } @@ -477,12 +492,12 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) * size sh_psize, as the section claims. */ if (buf_size < sh_psize) { - pr_debug("Patch of size %u truncated.\n", sh_psize); + ucode_dbg("Patch of size %u truncated.\n", sh_psize); return -1; } if (!__verify_patch_size(sh_psize, buf_size)) { - pr_debug("Per-family patch size mismatch.\n"); + ucode_dbg("Per-family patch size mismatch.\n"); return -1; } @@ -496,6 +511,9 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) proc_id = mc_hdr->processor_rev_id; patch_fam = 0xf + (proc_id >> 12); + + ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); + if (patch_fam != family) return 1; @@ -566,9 +584,14 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) } mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); + + ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id); + if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; + + ucode_dbg(" match: size: %d\n", patch_size); } skip: @@ -639,8 +662,14 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, invlpg(p_addr_end); } + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) + microcode_rev[smp_processor_id()] = mc->hdr.patch_id; + /* verify patch application was successful */ *cur_rev = get_patch_level(); + + ucode_dbg("updated rev: 0x%x\n", *cur_rev); + if (*cur_rev != mc->hdr.patch_id) return false; @@ -1026,7 +1055,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; - pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", + ucode_dbg("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", __func__, patch->patch_id, proc_id); /* ... and add to cache. */ @@ -1169,7 +1198,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); if (request_firmware_direct(&fw, (const char *)fw_name, device)) { - pr_debug("failed to load file %s\n", fw_name); + ucode_dbg("failed to load file %s\n", fw_name); goto out; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 7d590630673b42..f75c140906d002 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -47,6 +47,16 @@ static bool dis_ucode_ldr; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); +/* + * Those below should be behind CONFIG_MICROCODE_DBG ifdeffery but in + * order to not uglify the code with ifdeffery and use IS_ENABLED() + * instead, leave them in. When microcode debugging is not enabled, + * those are meaningless anyway. + */ +/* base microcode revision for debugging */ +u32 base_rev; +u32 microcode_rev[NR_CPUS] = {}; + /* * Synchronization. * @@ -118,7 +128,8 @@ bool __init microcode_loader_disabled(void) * overwritten. 
*/ if (!cpuid_feature() || - native_cpuid_ecx(1) & BIT(31) || + ((native_cpuid_ecx(1) & BIT(31)) && + !IS_ENABLED(CONFIG_MICROCODE_DBG)) || amd_check_current_patch_level()) dis_ucode_ldr = true; @@ -132,6 +143,14 @@ static void early_parse_cmdline(void) if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) { while ((s = strsep(&p, ","))) { + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + if (strstr(s, "base_rev=")) { + /* advance to the option arg */ + strsep(&s, "="); + if (kstrtouint(s, 16, &base_rev)) { ; } + } + } + if (!strcmp("force_minrev", s)) force_minrev = true; diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 50a9702ae4e2b5..ae8dbc2b908d72 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -44,6 +44,9 @@ struct early_load_data { extern struct early_load_data early_data; extern struct ucode_cpu_info ucode_cpu_info[]; +extern u32 microcode_rev[NR_CPUS]; +extern u32 base_rev; + struct cpio_data find_microcode_in_initrd(const char *path); #define MAX_UCODE_COUNT 128 @@ -122,4 +125,10 @@ static inline void reload_ucode_intel(void) { } static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } #endif /* !CONFIG_CPU_SUP_INTEL */ +#define ucode_dbg(fmt, ...) \ +({ \ + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) \ + pr_info(fmt, ##__VA_ARGS__); \ +}) + #endif /* _X86_MICROCODE_INTERNAL_H */ From 1939a9fcb80353dd8b111aa1e79c691afbde08b4 Mon Sep 17 00:00:00 2001 From: Christoffer Sandberg Date: Tue, 26 Aug 2025 16:26:06 +0200 Subject: [PATCH 0853/3206] Input: i8042 - add TUXEDO InfinityBook Pro Gen10 AMD to i8042 quirk table Occasionally wakes up from suspend with missing input on the internal keyboard. Setting the quirks appears to fix the issue for this device as well. Signed-off-by: Christoffer Sandberg Signed-off-by: Werner Sembach Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250826142646.13516-1-wse@tuxedocomputers.com Signed-off-by: Dmitry Torokhov --- drivers/input/serio/i8042-acpipnpio.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index 6ed9fc34948cbe..1caa6c4ca435c7 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -1155,6 +1155,20 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "XxHP4NAx"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | + SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "XxKK4NAx_XxSP4NAx"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | + SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) + }, /* * A lot of modern Clevo barebones have touchpad and/or keyboard issues * after suspend fixable with the forcenorestore quirk. From a984da24e7ac411c95bab7b6c127bae4927f9d03 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 21 Aug 2025 15:32:41 -0400 Subject: [PATCH 0854/3206] rust: hrtimer: Document the return value for HrTimerHandle::cancel() Just a drive-by fix I noticed: we don't actually document what the return value from cancel() does, so do that. 
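A minimal usage sketch of the now-documented contract (`handle` stands
for any `HrTimerHandle` implementor; illustrative only):

	if handle.cancel() {
	    // The timer was running and has been stopped before firing.
	} else {
	    // The timer had already fired, or was never started.
	}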
Signed-off-by: Lyude Paul
Reviewed-by: Andreas Hindborg
Reviewed-by: Daniel Almeida
Link: https://lore.kernel.org/r/20250821193259.964504-2-lyude@redhat.com
Signed-off-by: Andreas Hindborg
---
 rust/kernel/time/hrtimer.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rust/kernel/time/hrtimer.rs b/rust/kernel/time/hrtimer.rs
index 144e3b57cc7800..6bfc0223f4f579 100644
--- a/rust/kernel/time/hrtimer.rs
+++ b/rust/kernel/time/hrtimer.rs
@@ -324,6 +324,8 @@ pub unsafe trait HrTimerHandle {
     /// Note that the timer might be started by a concurrent start operation. If
     /// so, the timer might not be in the **stopped** state when this function
     /// returns.
+    ///
+    /// Returns `true` if the timer was running.
     fn cancel(&mut self) -> bool;
 }

From 0e2aab67f2d5716e8db73b6d0719208b44086ed5 Mon Sep 17 00:00:00 2001
From: Lyude Paul
Date: Thu, 21 Aug 2025 15:32:42 -0400
Subject: [PATCH 0855/3206] rust: hrtimer: Add HrTimerInstant

Since we want to add HrTimer methods that can accept Instants, we will
want to make sure that for each method we are using the correct
Clocksource for the given HrTimer. This would get a bit overly-verbose,
so add a simple HrTimerInstant type-alias to handle this for us.

Signed-off-by: Lyude Paul
Reviewed-by: Andreas Hindborg
Reviewed-by: Daniel Almeida
Link: https://lore.kernel.org/r/20250821193259.964504-3-lyude@redhat.com
Signed-off-by: Andreas Hindborg
---
 rust/kernel/time/hrtimer.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/rust/kernel/time/hrtimer.rs b/rust/kernel/time/hrtimer.rs
index 6bfc0223f4f579..be1bad4aacaad8 100644
--- a/rust/kernel/time/hrtimer.rs
+++ b/rust/kernel/time/hrtimer.rs
@@ -72,6 +72,11 @@ use crate::{prelude::*, types::Opaque};
 use core::marker::PhantomData;
 use pin_init::PinInit;

+/// A type-alias to refer to the [`Instant`] for a given `T` from [`HrTimer`].
+///
+/// Where `C` is the [`ClockSource`] of the [`HrTimer`].
+pub type HrTimerInstant<T> = Instant<<<T as HasHrTimer<T>>::TimerMode as HrTimerMode>::Clock>;
+
 /// A timer backed by a C `struct hrtimer`.
 ///
 /// # Invariants

From 3efb9ce91c5279d7ea73563d1fb136077f52dd2e Mon Sep 17 00:00:00 2001
From: Lyude Paul
Date: Thu, 21 Aug 2025 15:32:43 -0400
Subject: [PATCH 0856/3206] rust: hrtimer: Add HrTimer::raw_forward() and
 forward()

Within the hrtimer API there are quite a number of functions that can
only be safely called from one of two contexts:

* When we have exclusive access to the hrtimer and the timer is not
  active.
* When we're within the hrtimer's callback context as it is being
  executed.

This commit adds bindings for hrtimer_forward() for the first such
context, along with HrTimer::raw_forward() for later use in
implementing the hrtimer_forward() in the latter context.

Signed-off-by: Lyude Paul
Reviewed-by: Daniel Almeida
Reviewed-by: Andreas Hindborg
Link: https://lore.kernel.org/r/20250821193259.964504-4-lyude@redhat.com
Signed-off-by: Andreas Hindborg
---
 rust/kernel/time/hrtimer.rs | 40 +++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/rust/kernel/time/hrtimer.rs b/rust/kernel/time/hrtimer.rs
index be1bad4aacaad8..79fed14b2d98e7 100644
--- a/rust/kernel/time/hrtimer.rs
+++ b/rust/kernel/time/hrtimer.rs
@@ -168,6 +168,46 @@ impl<T> HrTimer<T> {
         // handled on the C side.
         unsafe { bindings::hrtimer_cancel(c_timer_ptr) != 0 }
     }
+
+    /// Forward the timer expiry for a given timer pointer.
+    ///
+    /// # Safety
+    ///
+    /// - `self_ptr` must point to a valid `Self`.
+    /// - The caller must either have exclusive access to the data pointed at by `self_ptr`, or be
+    ///   within the context of the timer callback.
+    #[inline]
+    unsafe fn raw_forward(self_ptr: *mut Self, now: HrTimerInstant<T>, interval: Delta) -> u64
+    where
+        T: HasHrTimer<T>,
+    {
+        // SAFETY:
+        // * The C API requirements for this function are fulfilled by our safety contract.
+        // * `self_ptr` is guaranteed to point to a valid `Self` via our safety contract
+        unsafe {
+            bindings::hrtimer_forward(Self::raw_get(self_ptr), now.as_nanos(), interval.as_nanos())
+        }
+    }
+
+    /// Conditionally forward the timer.
+    ///
+    /// If the timer expires after `now`, this function does nothing and returns 0. If the timer
+    /// expired at or before `now`, this function forwards the timer by `interval` until the timer
+    /// expires after `now` and then returns the number of times the timer was forwarded by
+    /// `interval`.
+    ///
+    /// Returns the number of overruns that occurred as a result of the timer expiry change.
+    pub fn forward(self: Pin<&mut Self>, now: HrTimerInstant<T>, interval: Delta) -> u64
+    where
+        T: HasHrTimer<T>,
+    {
+        // SAFETY: `raw_forward` does not move `Self`
+        let this = unsafe { self.get_unchecked_mut() };
+
+        // SAFETY: By existence of `Pin<&mut Self>`, the pointer passed to `raw_forward` points to a
+        // valid `Self` that we have exclusive access to.
+        unsafe { Self::raw_forward(this, now, interval) }
+    }
 }

 /// Implemented by pointer types that point to structs that contain a [`HrTimer`].

From 3f2a5ba784b808109cac0aac921213e43143a216 Mon Sep 17 00:00:00 2001
From: Lyude Paul
Date: Thu, 21 Aug 2025 15:32:44 -0400
Subject: [PATCH 0857/3206] rust: hrtimer: Add HrTimerCallbackContext and
 ::forward()

With Linux's hrtimer API, there's a number of methods that can only be
called in two situations:

* When we have exclusive access to the hrtimer and it is not currently
  active
* When we're within the context of an hrtimer callback context

This commit handles the second situation and implements
hrtimer_forward() support in the context of a timer callback. We do
this by introducing a HrTimerCallbackContext type which is provided to
users during the RawHrTimerCallback::run() callback, and then add a
forward() function to the type.

Signed-off-by: Lyude Paul
Reviewed-by: Daniel Almeida
Reviewed-by: Andreas Hindborg
Link: https://lore.kernel.org/r/20250821193259.964504-5-lyude@redhat.com
Signed-off-by: Andreas Hindborg
---
 rust/kernel/time/hrtimer.rs         | 63 +++++++++++++++++++++++++++--
 rust/kernel/time/hrtimer/arc.rs     |  9 ++++-
 rust/kernel/time/hrtimer/pin.rs     |  9 ++++-
 rust/kernel/time/hrtimer/pin_mut.rs | 12 ++++--
 rust/kernel/time/hrtimer/tbox.rs    |  9 ++++-
 5 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/rust/kernel/time/hrtimer.rs b/rust/kernel/time/hrtimer.rs
index 79fed14b2d98e7..1e8839d2772928 100644
--- a/rust/kernel/time/hrtimer.rs
+++ b/rust/kernel/time/hrtimer.rs
@@ -69,7 +69,7 @@
 use super::{ClockSource, Delta, Instant};
 use crate::{prelude::*, types::Opaque};
-use core::marker::PhantomData;
+use core::{marker::PhantomData, ptr::NonNull};
 use pin_init::PinInit;

 /// A type-alias to refer to the [`Instant`] for a given `T` from [`HrTimer`].
@@ -196,6 +196,10 @@ impl<T> HrTimer<T> {
     /// expires after `now` and then returns the number of times the timer was forwarded by
     /// `interval`.
     ///
+    /// This function is mainly useful for timer types which can provide exclusive access to the
+    /// timer when the timer is not running. For forwarding the timer from within the timer callback
+    /// context, see [`HrTimerCallbackContext::forward()`].
+    ///
     /// Returns the number of overruns that occurred as a result of the timer expiry change.
     pub fn forward(self: Pin<&mut Self>, now: HrTimerInstant<T>, interval: Delta) -> u64
     where
@@ -345,9 +349,13 @@ pub trait HrTimerCallback {
     type Pointer<'a>: RawHrTimerCallback;

     /// Called by the timer logic when the timer fires.
-    fn run(this: <Self::Pointer<'_> as RawHrTimerCallback>::CallbackTarget<'_>) -> HrTimerRestart
+    fn run(
+        this: <Self::Pointer<'_> as RawHrTimerCallback>::CallbackTarget<'_>,
+        ctx: HrTimerCallbackContext<'_, Self>,
+    ) -> HrTimerRestart
     where
-        Self: Sized;
+        Self: Sized,
+        Self: HasHrTimer<Self>;
 }

 /// A handle representing a potentially running timer.
@@ -632,6 +640,55 @@ impl<C: ClockSource> HrTimerMode for RelativePinnedHardMode<C> {
     type Expires = Delta;
 }

+/// Privileged smart-pointer for a [`HrTimer`] callback context.
+///
+/// Many [`HrTimer`] methods can only be called in two situations:
+///
+/// * When the caller has exclusive access to the `HrTimer` and the `HrTimer` is guaranteed not to
+///   be running.
+/// * From within the context of an `HrTimer`'s callback method.
+///
+/// This type provides access to said methods from within a timer callback context.
+///
+/// # Invariants
+///
+/// * The existence of this type means the caller is currently within the callback for an
+///   [`HrTimer`].
+/// * `self.0` always points to a live instance of [`HrTimer`].
+pub struct HrTimerCallbackContext<'a, T: HasHrTimer<T>>(NonNull<HrTimer<T>>, PhantomData<&'a ()>);
+
+impl<'a, T: HasHrTimer<T>> HrTimerCallbackContext<'a, T> {
+    /// Create a new [`HrTimerCallbackContext`].
+    ///
+    /// # Safety
+    ///
+    /// This function relies on the caller being within the context of a timer callback, so it must
+    /// not be used anywhere except for within implementations of [`RawHrTimerCallback::run`]. The
+    /// caller promises that `timer` points to a valid initialized instance of
+    /// [`bindings::hrtimer`].
+    ///
+    /// The returned `Self` must not outlive the function context of [`RawHrTimerCallback::run`]
+    /// where this function is called.
+    pub(crate) unsafe fn from_raw(timer: *mut HrTimer<T>) -> Self {
+        // SAFETY: The caller guarantees `timer` is a valid pointer to an initialized
+        // `bindings::hrtimer`
+        // INVARIANT: Our safety contract ensures that we're within the context of a timer callback
+        // and that `timer` points to a live instance of `HrTimer`.
+        Self(unsafe { NonNull::new_unchecked(timer) }, PhantomData)
+    }
+
+    /// Conditionally forward the timer.
+    ///
+    /// This function is identical to [`HrTimer::forward()`] except that it may only be used from
+    /// within the context of a [`HrTimer`] callback.
+    pub fn forward(&mut self, now: HrTimerInstant<T>, interval: Delta) -> u64 {
+        // SAFETY:
+        // - We are guaranteed to be within the context of a timer callback by our type invariants
+        // - By our type invariants, `self.0` always points to a valid `HrTimer`
+        unsafe { HrTimer::<T>::raw_forward(self.0.as_ptr(), now, interval) }
+    }
+}
+
 /// Use to implement the [`HasHrTimer`] trait.
 ///
 /// See [`module`] documentation for an example.
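To make the new callback shape concrete, a sketch of an implementor
under the amended trait (all type names here are invented; the
`HasHrTimer<MyTimer>` plumbing normally supplied via the field/macro
setup is elided, and `forward_now()` only arrives later in this series,
hence the explicit `now` argument):

	impl HrTimerCallback for MyTimer {
	    type Pointer<'a> = Arc<MyTimer>;

	    fn run(
	        _this: ArcBorrow<'_, MyTimer>,
	        mut ctx: HrTimerCallbackContext<'_, MyTimer>,
	    ) -> HrTimerRestart {
	        // Push the expiry forward in 100 ms steps until it is in
	        // the future; the return value counts the missed periods.
	        let _overruns =
	            ctx.forward(HrTimerInstant::<MyTimer>::now(), Delta::from_millis(100));
	        HrTimerRestart::Restart
	    }
	}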
diff --git a/rust/kernel/time/hrtimer/arc.rs b/rust/kernel/time/hrtimer/arc.rs index ed490a7a895038..7be82bcb352ac4 100644 --- a/rust/kernel/time/hrtimer/arc.rs +++ b/rust/kernel/time/hrtimer/arc.rs @@ -3,6 +3,7 @@ use super::HasHrTimer; use super::HrTimer; use super::HrTimerCallback; +use super::HrTimerCallbackContext; use super::HrTimerHandle; use super::HrTimerMode; use super::HrTimerPointer; @@ -99,6 +100,12 @@ where // allocation from other `Arc` clones. let receiver = unsafe { ArcBorrow::from_raw(data_ptr) }; - T::run(receiver).into_c() + // SAFETY: + // - By C API contract `timer_ptr` is the pointer that we passed when queuing the timer, so + // it is a valid pointer to a `HrTimer` embedded in a `T`. + // - We are within `RawHrTimerCallback::run` + let context = unsafe { HrTimerCallbackContext::from_raw(timer_ptr) }; + + T::run(receiver, context).into_c() } } diff --git a/rust/kernel/time/hrtimer/pin.rs b/rust/kernel/time/hrtimer/pin.rs index aef16d9ee2f0c5..4d39ef7816971b 100644 --- a/rust/kernel/time/hrtimer/pin.rs +++ b/rust/kernel/time/hrtimer/pin.rs @@ -3,6 +3,7 @@ use super::HasHrTimer; use super::HrTimer; use super::HrTimerCallback; +use super::HrTimerCallbackContext; use super::HrTimerHandle; use super::HrTimerMode; use super::RawHrTimerCallback; @@ -103,6 +104,12 @@ where // here. let receiver_pin = unsafe { Pin::new_unchecked(receiver_ref) }; - T::run(receiver_pin).into_c() + // SAFETY: + // - By C API contract `timer_ptr` is the pointer that we passed when queuing the timer, so + // it is a valid pointer to a `HrTimer` embedded in a `T`. + // - We are within `RawHrTimerCallback::run` + let context = unsafe { HrTimerCallbackContext::from_raw(timer_ptr) }; + + T::run(receiver_pin, context).into_c() } } diff --git a/rust/kernel/time/hrtimer/pin_mut.rs b/rust/kernel/time/hrtimer/pin_mut.rs index 767d0a4e8a2c11..9d9447d4d57e83 100644 --- a/rust/kernel/time/hrtimer/pin_mut.rs +++ b/rust/kernel/time/hrtimer/pin_mut.rs @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 use super::{ - HasHrTimer, HrTimer, HrTimerCallback, HrTimerHandle, HrTimerMode, RawHrTimerCallback, - UnsafeHrTimerPointer, + HasHrTimer, HrTimer, HrTimerCallback, HrTimerCallbackContext, HrTimerHandle, HrTimerMode, + RawHrTimerCallback, UnsafeHrTimerPointer, }; use core::{marker::PhantomData, pin::Pin, ptr::NonNull}; @@ -107,6 +107,12 @@ where // here. let receiver_pin = unsafe { Pin::new_unchecked(receiver_ref) }; - T::run(receiver_pin).into_c() + // SAFETY: + // - By C API contract `timer_ptr` is the pointer that we passed when queuing the timer, so + // it is a valid pointer to a `HrTimer` embedded in a `T`. + // - We are within `RawHrTimerCallback::run` + let context = unsafe { HrTimerCallbackContext::from_raw(timer_ptr) }; + + T::run(receiver_pin, context).into_c() } } diff --git a/rust/kernel/time/hrtimer/tbox.rs b/rust/kernel/time/hrtimer/tbox.rs index ec08303315f280..aa1ee31a71953c 100644 --- a/rust/kernel/time/hrtimer/tbox.rs +++ b/rust/kernel/time/hrtimer/tbox.rs @@ -3,6 +3,7 @@ use super::HasHrTimer; use super::HrTimer; use super::HrTimerCallback; +use super::HrTimerCallbackContext; use super::HrTimerHandle; use super::HrTimerMode; use super::HrTimerPointer; @@ -119,6 +120,12 @@ where // `data_ptr` exist. let data_mut_ref = unsafe { Pin::new_unchecked(&mut *data_ptr) }; - T::run(data_mut_ref).into_c() + // SAFETY: + // - By C API contract `timer_ptr` is the pointer that we passed when queuing the timer, so + // it is a valid pointer to a `HrTimer` embedded in a `T`. 
+ // - We are within `RawHrTimerCallback::run` + let context = unsafe { HrTimerCallbackContext::from_raw(timer_ptr) }; + + T::run(data_mut_ref, context).into_c() } } From ac0a7bd27f6593dfe9ea6b65538bebbbd8322241 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 21 Aug 2025 15:32:45 -0400 Subject: [PATCH 0858/3206] rust: hrtimer: Add forward_now() to HrTimer and HrTimerCallbackContext Signed-off-by: Lyude Paul Reviewed-by: Daniel Almeida Reviewed-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250821193259.964504-6-lyude@redhat.com Signed-off-by: Andreas Hindborg --- rust/kernel/time/hrtimer.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/rust/kernel/time/hrtimer.rs b/rust/kernel/time/hrtimer.rs index 1e8839d2772928..e0d78a88599030 100644 --- a/rust/kernel/time/hrtimer.rs +++ b/rust/kernel/time/hrtimer.rs @@ -212,6 +212,17 @@ impl HrTimer { // valid `Self` that we have exclusive access to. unsafe { Self::raw_forward(this, now, interval) } } + + /// Conditionally forward the timer. + /// + /// This is a variant of [`forward()`](Self::forward) that uses an interval after the current + /// time of the base clock for the [`HrTimer`]. + pub fn forward_now(self: Pin<&mut Self>, interval: Delta) -> u64 + where + T: HasHrTimer, + { + self.forward(HrTimerInstant::::now(), interval) + } } /// Implemented by pointer types that point to structs that contain a [`HrTimer`]. @@ -687,6 +698,14 @@ impl<'a, T: HasHrTimer> HrTimerCallbackContext<'a, T> { // - By our type invariants, `self.0` always points to a valid `HrTimer` unsafe { HrTimer::::raw_forward(self.0.as_ptr(), now, interval) } } + + /// Conditionally forward the timer. + /// + /// This is a variant of [`HrTimerCallbackContext::forward()`] that uses an interval after the + /// current time of the base clock for the [`HrTimer`]. + pub fn forward_now(&mut self, duration: Delta) -> u64 { + self.forward(HrTimerInstant::::now(), duration) + } } /// Use to implement the [`HasHrTimer`] trait. From 583802cc99bdac95d173aaf1fc58e99993aa1d1d Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 21 Aug 2025 15:32:46 -0400 Subject: [PATCH 0859/3206] rust: time: Add Instant::from_ktime() For implementing Rust bindings which can return a point in time. Signed-off-by: Lyude Paul Reviewed-by: Andreas Hindborg Reviewed-by: Alice Ryhl Reviewed-by: FUJITA Tomonori Link: https://lore.kernel.org/r/20250821193259.964504-7-lyude@redhat.com Signed-off-by: Andreas Hindborg --- rust/kernel/time.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/rust/kernel/time.rs b/rust/kernel/time.rs index 64c8dcf548d630..874a1023dcdf92 100644 --- a/rust/kernel/time.rs +++ b/rust/kernel/time.rs @@ -200,6 +200,29 @@ impl Instant { pub(crate) fn as_nanos(&self) -> i64 { self.inner } + + /// Create an [`Instant`] from a `ktime_t` without checking if it is non-negative. + /// + /// # Panics + /// + /// On debug builds, this function will panic if `ktime` is not in the range from 0 to + /// `KTIME_MAX`. + /// + /// # Safety + /// + /// The caller promises that `ktime` is in the range from 0 to `KTIME_MAX`. + #[expect(unused)] + #[inline] + pub(crate) unsafe fn from_ktime(ktime: bindings::ktime_t) -> Self { + debug_assert!(ktime >= 0); + + // INVARIANT: Our safety contract ensures that `ktime` is in the range from 0 to + // `KTIME_MAX`. 
+ Self { + inner: ktime, + _c: PhantomData, + } + } } impl core::ops::Sub for Instant { From 4b0147494275fdbea98305f4ddad7e2def7b556f Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 21 Aug 2025 15:32:47 -0400 Subject: [PATCH 0860/3206] rust: hrtimer: Add HrTimer::expires() Add a simple callback for retrieving the current expiry time for an HrTimer. In rvkms, we use the HrTimer expiry value in order to calculate the approximate vblank timestamp during each emulated vblank interrupt. Signed-off-by: Lyude Paul Reviewed-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250821193259.964504-8-lyude@redhat.com Signed-off-by: Andreas Hindborg --- rust/kernel/time.rs | 1 - rust/kernel/time/hrtimer.rs | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/rust/kernel/time.rs b/rust/kernel/time.rs index 874a1023dcdf92..7320d8715bcc21 100644 --- a/rust/kernel/time.rs +++ b/rust/kernel/time.rs @@ -211,7 +211,6 @@ impl Instant { /// # Safety /// /// The caller promises that `ktime` is in the range from 0 to `KTIME_MAX`. - #[expect(unused)] #[inline] pub(crate) unsafe fn from_ktime(ktime: bindings::ktime_t) -> Self { debug_assert!(ktime >= 0); diff --git a/rust/kernel/time/hrtimer.rs b/rust/kernel/time/hrtimer.rs index e0d78a88599030..856d2d929a0089 100644 --- a/rust/kernel/time/hrtimer.rs +++ b/rust/kernel/time/hrtimer.rs @@ -223,6 +223,29 @@ impl HrTimer { { self.forward(HrTimerInstant::::now(), interval) } + + /// Return the time expiry for this [`HrTimer`]. + /// + /// This value should only be used as a snapshot, as the actual expiry time could change after + /// this function is called. + pub fn expires(&self) -> HrTimerInstant + where + T: HasHrTimer, + { + // SAFETY: `self` is an immutable reference and thus always points to a valid `HrTimer`. + let c_timer_ptr = unsafe { HrTimer::raw_get(self) }; + + // SAFETY: + // - Timers cannot have negative ktime_t values as their expiration time. + // - There's no actual locking here, a racy read is fine and expected + unsafe { + Instant::from_ktime( + // This `read_volatile` is intended to correspond to a READ_ONCE call. + // FIXME(read_once): Replace with `read_once` when available on the Rust side. + core::ptr::read_volatile(&raw const ((*c_timer_ptr).node.expires)), + ) + } + } } /// Implemented by pointer types that point to structs that contain a [`HrTimer`]. From 22b65a40574e597e1919ca22cc3535c2b541f764 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Wed, 20 Aug 2025 16:26:43 -0400 Subject: [PATCH 0861/3206] rust: time: Implement Add/Sub for Instant In order to copy the behavior rust currently follows for basic arithmetic operations and panic if the result of an addition or subtraction results in a value that would violate the invariants of Instant, but only if the kernel has overflow checking for rust enabled. Signed-off-by: Lyude Paul Reviewed-by: Alice Ryhl Reviewed-by: FUJITA Tomonori Reviewed-by: Andreas Hindborg Reviewed-by: Daniel Almeida Link: https://lore.kernel.org/r/20250820203704.731588-2-lyude@redhat.com Signed-off-by: Andreas Hindborg --- rust/kernel/time.rs | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/rust/kernel/time.rs b/rust/kernel/time.rs index 7320d8715bcc21..642c42a520a804 100644 --- a/rust/kernel/time.rs +++ b/rust/kernel/time.rs @@ -25,6 +25,7 @@ //! C header: [`include/linux/ktime.h`](srctree/include/linux/ktime.h). 
use core::marker::PhantomData; +use core::ops; pub mod delay; pub mod hrtimer; @@ -224,7 +225,7 @@ impl Instant { } } -impl core::ops::Sub for Instant { +impl ops::Sub for Instant { type Output = Delta; // By the type invariant, it never overflows. @@ -236,6 +237,46 @@ impl core::ops::Sub for Instant { } } +impl ops::Add for Instant { + type Output = Self; + + #[inline] + fn add(self, rhs: Delta) -> Self::Output { + // INVARIANT: With arithmetic over/underflow checks enabled, this will panic if we overflow + // (e.g. go above `KTIME_MAX`) + let res = self.inner + rhs.nanos; + + // INVARIANT: With overflow checks enabled, we verify here that the value is >= 0 + #[cfg(CONFIG_RUST_OVERFLOW_CHECKS)] + assert!(res >= 0); + + Self { + inner: res, + _c: PhantomData, + } + } +} + +impl ops::Sub for Instant { + type Output = Self; + + #[inline] + fn sub(self, rhs: Delta) -> Self::Output { + // INVARIANT: With arithmetic over/underflow checks enabled, this will panic if we overflow + // (e.g. go above `KTIME_MAX`) + let res = self.inner - rhs.nanos; + + // INVARIANT: With overflow checks enabled, we verify here that the value is >= 0 + #[cfg(CONFIG_RUST_OVERFLOW_CHECKS)] + assert!(res >= 0); + + Self { + inner: res, + _c: PhantomData, + } + } +} + /// A span of time. /// /// This struct represents a span of time, with its value stored as nanoseconds. From 4521438fb076f8a6a52f45b0e508f6ef10ac0c49 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Wed, 20 Aug 2025 16:26:44 -0400 Subject: [PATCH 0862/3206] rust: time: Implement basic arithmetic operations for Delta While rvkms is only going to be using a few of these, since Deltas are basically the same as i64 it's easy enough to just implement all of the basic arithmetic operations for Delta types. Keep in mind there's one quirk here - the kernel has no support for i64 % i64 on 32 bit platforms, the closest we have is i64 % i32 through div_s64_rem(). So, instead of implementing ops::Rem or ops::RemAssign we simply provide Delta::rem_nanos(). 
Signed-off-by: Lyude Paul Reviewed-by: Daniel Almeida Reviewed-by: FUJITA Tomonori Reviewed-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250820203704.731588-3-lyude@redhat.com Signed-off-by: Andreas Hindborg --- rust/kernel/time.rs | 98 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/rust/kernel/time.rs b/rust/kernel/time.rs index 642c42a520a804..6ea98dfcd02787 100644 --- a/rust/kernel/time.rs +++ b/rust/kernel/time.rs @@ -287,6 +287,78 @@ pub struct Delta { nanos: i64, } +impl ops::Add for Delta { + type Output = Self; + + #[inline] + fn add(self, rhs: Self) -> Self { + Self { + nanos: self.nanos + rhs.nanos, + } + } +} + +impl ops::AddAssign for Delta { + #[inline] + fn add_assign(&mut self, rhs: Self) { + self.nanos += rhs.nanos; + } +} + +impl ops::Sub for Delta { + type Output = Self; + + #[inline] + fn sub(self, rhs: Self) -> Self::Output { + Self { + nanos: self.nanos - rhs.nanos, + } + } +} + +impl ops::SubAssign for Delta { + #[inline] + fn sub_assign(&mut self, rhs: Self) { + self.nanos -= rhs.nanos; + } +} + +impl ops::Mul for Delta { + type Output = Self; + + #[inline] + fn mul(self, rhs: i64) -> Self::Output { + Self { + nanos: self.nanos * rhs, + } + } +} + +impl ops::MulAssign for Delta { + #[inline] + fn mul_assign(&mut self, rhs: i64) { + self.nanos *= rhs; + } +} + +impl ops::Div for Delta { + type Output = i64; + + #[inline] + fn div(self, rhs: Self) -> Self::Output { + #[cfg(CONFIG_64BIT)] + { + self.nanos / rhs.nanos + } + + #[cfg(not(CONFIG_64BIT))] + { + // SAFETY: This function is always safe to call regardless of the input values + unsafe { bindings::div64_s64(self.nanos, rhs.nanos) } + } + } +} + impl Delta { /// A span of time equal to zero. pub const ZERO: Self = Self { nanos: 0 }; @@ -375,4 +447,30 @@ impl Delta { bindings::ktime_to_ms(self.as_nanos()) } } + + /// Return `self % dividend` where `dividend` is in nanoseconds. + /// + /// The kernel doesn't have any emulation for `s64 % s64` on 32 bit platforms, so this is + /// limited to 32 bit dividends. + #[inline] + pub fn rem_nanos(self, dividend: i32) -> Self { + #[cfg(CONFIG_64BIT)] + { + Self { + nanos: self.as_nanos() % i64::from(dividend), + } + } + + #[cfg(not(CONFIG_64BIT))] + { + let mut rem = 0; + + // SAFETY: `rem` is in the stack, so we can always provide a valid pointer to it. + unsafe { bindings::div_s64_rem(self.as_nanos(), dividend, &mut rem) }; + + Self { + nanos: i64::from(rem), + } + } + } } From a00f2015acdbd8a4b3d2382eaeebe11db1925fad Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Wed, 3 Sep 2025 12:21:33 -0700 Subject: [PATCH 0863/3206] drm/panthor: validate group queue count A panthor group can have at most MAX_CS_PER_CSG panthor queues. 
Fixes: 4bdca11507928 ("drm/panthor: Add the driver frontend block")
Signed-off-by: Chia-I Wu
Reviewed-by: Boris Brezillon # v1
Reviewed-by: Steven Price
Signed-off-by: Steven Price
Link: https://lore.kernel.org/r/20250903192133.288477-1-olvaffe@gmail.com
---
 drivers/gpu/drm/panthor/panthor_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index 1116f2d2826eeb..4d8e9b34702a76 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -1094,7 +1094,7 @@ static int panthor_ioctl_group_create(struct drm_device *ddev, void *data,
 	struct drm_panthor_queue_create *queue_args;
 	int ret;
 
-	if (!args->queues.count)
+	if (!args->queues.count || args->queues.count > MAX_CS_PER_CSG)
 		return -EINVAL;
 
 	ret = PANTHOR_UOBJ_GET_ARRAY(queue_args, &args->queues);

From 2aef21a6a60305244ab3508c5d15b24a8ac8e66f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 4 Sep 2025 07:25:37 +0000
Subject: [PATCH 0864/3206] audit: init ab->skb_list earlier in audit_buffer_alloc()

syzbot found a bug in audit_buffer_alloc() when nlmsg_new() returns NULL.

We need to initialize ab->skb_list before calling audit_buffer_free(),
which uses both the skb_list spinlock and the list pointers.

Fixes: eb59d494eebd ("audit: add record for multiple task security contexts")
Reported-by: syzbot+bb185b018a51f8d91fd2@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/lkml/68b93e3c.a00a0220.eb3d.0000.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet
Cc: Casey Schaufler
Cc: Paul Moore
Cc: Eric Paris
Cc: audit@vger.kernel.org
Signed-off-by: Paul Moore
---
 kernel/audit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/audit.c b/kernel/audit.c
index bd7474fd8d2c23..7074838796485a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1831,11 +1831,12 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
 	if (!ab)
 		return NULL;
 
+	skb_queue_head_init(&ab->skb_list);
+
 	ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
 	if (!ab->skb)
 		goto err;
 
-	skb_queue_head_init(&ab->skb_list);
 	skb_queue_tail(&ab->skb_list, ab->skb);
 
 	if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))

From 8bad31edf5490a38dc26163502cd7005a033ee05 Mon Sep 17 00:00:00 2001
From: Menglong Dong
Date: Thu, 4 Sep 2025 10:10:09 +0800
Subject: [PATCH 0865/3206] selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c

We sometimes need to get all the kernel functions that can be traced,
so move get_syms() and get_addrs() from kprobe_multi_test.c to
trace_helpers.c and rename them to bpf_get_ksyms() and bpf_get_addrs().
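For illustration, here is a minimal sketch (not part of the patch) of how a
caller can use the relocated helpers to attach a kprobe.multi program to every
traceable kernel symbol; the function name attach_to_all_ksyms() is made up
and error handling is trimmed:

#include <stdlib.h>
#include <bpf/libbpf.h>
#include "trace_helpers.h"

/* Attach @prog to every traceable core-kernel symbol; returns NULL on error. */
static struct bpf_link *attach_to_all_ksyms(struct bpf_program *prog)
{
	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
	struct bpf_link *link;
	char **syms = NULL;
	size_t cnt = 0;

	/* kernel=true keeps only core-kernel entries; module symbols
	 * (lines containing '[') are filtered out by the helper.
	 */
	if (bpf_get_ksyms(&syms, &cnt, true))
		return NULL;

	opts.syms = (const char **)syms;
	opts.cnt = cnt;
	link = bpf_program__attach_kprobe_multi_opts(prog, NULL, &opts);
	free(syms); /* the strings themselves stay owned by the kallsyms cache */
	return link;
}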
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20250904021011.14069-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/kprobe_multi_test.c | 220 +----------------- tools/testing/selftests/bpf/trace_helpers.c | 214 +++++++++++++++++ tools/testing/selftests/bpf/trace_helpers.h | 3 + 3 files changed, 220 insertions(+), 217 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index f377bea0b82d4f..171706e78da88c 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -422,220 +422,6 @@ static void test_unique_match(void) kprobe_multi__destroy(skel); } -static size_t symbol_hash(long key, void *ctx __maybe_unused) -{ - return str_hash((const char *) key); -} - -static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) -{ - return strcmp((const char *) key1, (const char *) key2) == 0; -} - -static bool is_invalid_entry(char *buf, bool kernel) -{ - if (kernel && strchr(buf, '[')) - return true; - if (!kernel && !strchr(buf, '[')) - return true; - return false; -} - -static bool skip_entry(char *name) -{ - /* - * We attach to almost all kernel functions and some of them - * will cause 'suspicious RCU usage' when fprobe is attached - * to them. Filter out the current culprits - arch_cpu_idle - * default_idle and rcu_* functions. - */ - if (!strcmp(name, "arch_cpu_idle")) - return true; - if (!strcmp(name, "default_idle")) - return true; - if (!strncmp(name, "rcu_", 4)) - return true; - if (!strcmp(name, "bpf_dispatcher_xdp_func")) - return true; - if (!strncmp(name, "__ftrace_invalid_address__", - sizeof("__ftrace_invalid_address__") - 1)) - return true; - return false; -} - -/* Do comparison by ignoring '.llvm.' suffixes. */ -static int compare_name(const char *name1, const char *name2) -{ - const char *res1, *res2; - int len1, len2; - - res1 = strstr(name1, ".llvm."); - res2 = strstr(name2, ".llvm."); - len1 = res1 ? res1 - name1 : strlen(name1); - len2 = res2 ? res2 - name2 : strlen(name2); - - if (len1 == len2) - return strncmp(name1, name2, len1); - if (len1 < len2) - return strncmp(name1, name2, len1) <= 0 ? -1 : 1; - return strncmp(name1, name2, len2) >= 0 ? 1 : -1; -} - -static int load_kallsyms_compare(const void *p1, const void *p2) -{ - return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); -} - -static int search_kallsyms_compare(const void *p1, const struct ksym *p2) -{ - return compare_name(p1, p2->name); -} - -static int get_syms(char ***symsp, size_t *cntp, bool kernel) -{ - size_t cap = 0, cnt = 0; - char *name = NULL, *ksym_name, **syms = NULL; - struct hashmap *map; - struct ksyms *ksyms; - struct ksym *ks; - char buf[256]; - FILE *f; - int err = 0; - - ksyms = load_kallsyms_custom_local(load_kallsyms_compare); - if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local")) - return -EINVAL; - - /* - * The available_filter_functions contains many duplicates, - * but other than that all symbols are usable in kprobe multi - * interface. - * Filtering out duplicates by using hashmap__add, which won't - * add existing entry. 
- */ - - if (access("/sys/kernel/tracing/trace", F_OK) == 0) - f = fopen("/sys/kernel/tracing/available_filter_functions", "r"); - else - f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r"); - - if (!f) - return -EINVAL; - - map = hashmap__new(symbol_hash, symbol_equal, NULL); - if (IS_ERR(map)) { - err = libbpf_get_error(map); - goto error; - } - - while (fgets(buf, sizeof(buf), f)) { - if (is_invalid_entry(buf, kernel)) - continue; - - free(name); - if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) - continue; - if (skip_entry(name)) - continue; - - ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); - if (!ks) { - err = -EINVAL; - goto error; - } - - ksym_name = ks->name; - err = hashmap__add(map, ksym_name, 0); - if (err == -EEXIST) { - err = 0; - continue; - } - if (err) - goto error; - - err = libbpf_ensure_mem((void **) &syms, &cap, - sizeof(*syms), cnt + 1); - if (err) - goto error; - - syms[cnt++] = ksym_name; - } - - *symsp = syms; - *cntp = cnt; - -error: - free(name); - fclose(f); - hashmap__free(map); - if (err) - free(syms); - return err; -} - -static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) -{ - unsigned long *addr, *addrs, *tmp_addrs; - int err = 0, max_cnt, inc_cnt; - char *name = NULL; - size_t cnt = 0; - char buf[256]; - FILE *f; - - if (access("/sys/kernel/tracing/trace", F_OK) == 0) - f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); - else - f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); - - if (!f) - return -ENOENT; - - /* In my local setup, the number of entries is 50k+ so Let us initially - * allocate space to hold 64k entries. If 64k is not enough, incrementally - * increase 1k each time. - */ - max_cnt = 65536; - inc_cnt = 1024; - addrs = malloc(max_cnt * sizeof(long)); - if (addrs == NULL) { - err = -ENOMEM; - goto error; - } - - while (fgets(buf, sizeof(buf), f)) { - if (is_invalid_entry(buf, kernel)) - continue; - - free(name); - if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) - continue; - if (skip_entry(name)) - continue; - - if (cnt == max_cnt) { - max_cnt += inc_cnt; - tmp_addrs = realloc(addrs, max_cnt); - if (!tmp_addrs) { - err = -ENOMEM; - goto error; - } - addrs = tmp_addrs; - } - - addrs[cnt++] = (unsigned long)addr; - } - - *addrsp = addrs; - *cntp = cnt; - -error: - free(name); - fclose(f); - if (err) - free(addrs); - return err; -} - static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts) { long attach_start_ns, attach_end_ns; @@ -670,7 +456,7 @@ static void test_kprobe_multi_bench_attach(bool kernel) char **syms = NULL; size_t cnt = 0; - if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) + if (!ASSERT_OK(bpf_get_ksyms(&syms, &cnt, kernel), "bpf_get_ksyms")) return; skel = kprobe_multi_empty__open_and_load(); @@ -696,13 +482,13 @@ static void test_kprobe_multi_bench_attach_addr(bool kernel) size_t cnt = 0; int err; - err = get_addrs(&addrs, &cnt, kernel); + err = bpf_get_addrs(&addrs, &cnt, kernel); if (err == -ENOENT) { test__skip(); return; } - if (!ASSERT_OK(err, "get_addrs")) + if (!ASSERT_OK(err, "bpf_get_addrs")) return; skel = kprobe_multi_empty__open_and_load(); diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 81943c6254e6bc..d24baf244d1f34 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -17,6 +17,7 @@ #include #include #include +#include "bpf/hashmap.h" 
#include "bpf/libbpf_internal.h" #define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" @@ -519,3 +520,216 @@ void read_trace_pipe(void) { read_trace_pipe_iter(trace_pipe_cb, NULL, 0); } + +static size_t symbol_hash(long key, void *ctx __maybe_unused) +{ + return str_hash((const char *) key); +} + +static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) +{ + return strcmp((const char *) key1, (const char *) key2) == 0; +} + +static bool is_invalid_entry(char *buf, bool kernel) +{ + if (kernel && strchr(buf, '[')) + return true; + if (!kernel && !strchr(buf, '[')) + return true; + return false; +} + +static bool skip_entry(char *name) +{ + /* + * We attach to almost all kernel functions and some of them + * will cause 'suspicious RCU usage' when fprobe is attached + * to them. Filter out the current culprits - arch_cpu_idle + * default_idle and rcu_* functions. + */ + if (!strcmp(name, "arch_cpu_idle")) + return true; + if (!strcmp(name, "default_idle")) + return true; + if (!strncmp(name, "rcu_", 4)) + return true; + if (!strcmp(name, "bpf_dispatcher_xdp_func")) + return true; + if (!strncmp(name, "__ftrace_invalid_address__", + sizeof("__ftrace_invalid_address__") - 1)) + return true; + return false; +} + +/* Do comparison by ignoring '.llvm.' suffixes. */ +static int compare_name(const char *name1, const char *name2) +{ + const char *res1, *res2; + int len1, len2; + + res1 = strstr(name1, ".llvm."); + res2 = strstr(name2, ".llvm."); + len1 = res1 ? res1 - name1 : strlen(name1); + len2 = res2 ? res2 - name2 : strlen(name2); + + if (len1 == len2) + return strncmp(name1, name2, len1); + if (len1 < len2) + return strncmp(name1, name2, len1) <= 0 ? -1 : 1; + return strncmp(name1, name2, len2) >= 0 ? 1 : -1; +} + +static int load_kallsyms_compare(const void *p1, const void *p2) +{ + return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); +} + +static int search_kallsyms_compare(const void *p1, const struct ksym *p2) +{ + return compare_name(p1, p2->name); +} + +int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel) +{ + size_t cap = 0, cnt = 0; + char *name = NULL, *ksym_name, **syms = NULL; + struct hashmap *map; + struct ksyms *ksyms; + struct ksym *ks; + char buf[256]; + FILE *f; + int err = 0; + + ksyms = load_kallsyms_custom_local(load_kallsyms_compare); + if (!ksyms) + return -EINVAL; + + /* + * The available_filter_functions contains many duplicates, + * but other than that all symbols are usable to trace. + * Filtering out duplicates by using hashmap__add, which won't + * add existing entry. 
+ */ + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r"); + + if (!f) + return -EINVAL; + + map = hashmap__new(symbol_hash, symbol_equal, NULL); + if (IS_ERR(map)) { + err = libbpf_get_error(map); + goto error; + } + + while (fgets(buf, sizeof(buf), f)) { + if (is_invalid_entry(buf, kernel)) + continue; + + free(name); + if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) + continue; + if (skip_entry(name)) + continue; + + ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); + if (!ks) { + err = -EINVAL; + goto error; + } + + ksym_name = ks->name; + err = hashmap__add(map, ksym_name, 0); + if (err == -EEXIST) { + err = 0; + continue; + } + if (err) + goto error; + + err = libbpf_ensure_mem((void **) &syms, &cap, + sizeof(*syms), cnt + 1); + if (err) + goto error; + + syms[cnt++] = ksym_name; + } + + *symsp = syms; + *cntp = cnt; + +error: + free(name); + fclose(f); + hashmap__free(map); + if (err) + free(syms); + return err; +} + +int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) +{ + unsigned long *addr, *addrs, *tmp_addrs; + int err = 0, max_cnt, inc_cnt; + char *name = NULL; + size_t cnt = 0; + char buf[256]; + FILE *f; + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); + + if (!f) + return -ENOENT; + + /* In my local setup, the number of entries is 50k+ so Let us initially + * allocate space to hold 64k entries. If 64k is not enough, incrementally + * increase 1k each time. + */ + max_cnt = 65536; + inc_cnt = 1024; + addrs = malloc(max_cnt * sizeof(long)); + if (addrs == NULL) { + err = -ENOMEM; + goto error; + } + + while (fgets(buf, sizeof(buf), f)) { + if (is_invalid_entry(buf, kernel)) + continue; + + free(name); + if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) + continue; + if (skip_entry(name)) + continue; + + if (cnt == max_cnt) { + max_cnt += inc_cnt; + tmp_addrs = realloc(addrs, max_cnt); + if (!tmp_addrs) { + err = -ENOMEM; + goto error; + } + addrs = tmp_addrs; + } + + addrs[cnt++] = (unsigned long)addr; + } + + *addrsp = addrs; + *cntp = cnt; + +error: + free(name); + fclose(f); + if (err) + free(addrs); + return err; +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 2ce873c9f9aad6..9437bdd4afa505 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -41,4 +41,7 @@ ssize_t get_rel_offset(uintptr_t addr); int read_build_id(const char *path, char *build_id, size_t size); +int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel); +int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel); + #endif From adf6b57ce46ce685a65d54397c4389df754b6a99 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 4 Sep 2025 10:10:10 +0800 Subject: [PATCH 0866/3206] selftests/bpf: skip recursive functions for kprobe_multi Some functions is recursive for the kprobe_multi and impact the benchmark results. So just skip them. 
Signed-off-by: Menglong Dong
Link: https://lore.kernel.org/r/20250904021011.14069-3-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov
---
 tools/testing/selftests/bpf/trace_helpers.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index d24baf244d1f34..9577979bd84da4 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -19,6 +19,7 @@
 #include
 #include "bpf/hashmap.h"
 #include "bpf/libbpf_internal.h"
+#include "bpf_util.h"
 
 #define TRACEFS_PIPE	"/sys/kernel/tracing/trace_pipe"
 #define DEBUGFS_PIPE	"/sys/kernel/debug/tracing/trace_pipe"
@@ -540,8 +541,20 @@ static bool is_invalid_entry(char *buf, bool kernel)
 	return false;
 }
 
+static const char * const trace_blacklist[] = {
+	"migrate_disable",
+	"migrate_enable",
+	"rcu_read_unlock_strict",
+	"preempt_count_add",
+	"preempt_count_sub",
+	"__rcu_read_lock",
+	"__rcu_read_unlock",
+};
+
 static bool skip_entry(char *name)
 {
+	int i;
+
 	/*
 	 * We attach to almost all kernel functions and some of them
 	 * will cause 'suspicious RCU usage' when fprobe is attached
@@ -559,6 +572,12 @@ static bool skip_entry(char *name)
 	if (!strncmp(name, "__ftrace_invalid_address__",
 		     sizeof("__ftrace_invalid_address__") - 1))
 		return true;
+
+	for (i = 0; i < ARRAY_SIZE(trace_blacklist); i++) {
+		if (!strcmp(name, trace_blacklist[i]))
+			return true;
+	}
+
 	return false;
 }

From a85d888768ea0e024dcc9d5fb172e7be8fd7d631 Mon Sep 17 00:00:00 2001
From: Menglong Dong
Date: Thu, 4 Sep 2025 10:10:11 +0800
Subject: [PATCH 0867/3206] selftests/bpf: add benchmark testing for kprobe-multi-all

For now, the kprobe-multi benchmark hooks only a single function during
testing. Add a "kprobe-multi-all" benchmark, which hooks all the kernel
functions during the benchmark, and add "kretprobe-multi-all" as well.
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20250904021011.14069-4-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 4 ++ .../selftests/bpf/benchs/bench_trigger.c | 61 +++++++++++++++++++ .../selftests/bpf/benchs/run_bench_trigger.sh | 4 +- .../selftests/bpf/progs/trigger_bench.c | 12 ++++ tools/testing/selftests/bpf/trace_helpers.c | 1 + 5 files changed, 80 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index df4489ac14a0c9..bd29bb2e6cb529 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -512,6 +512,8 @@ extern const struct bench bench_trig_kretprobe; extern const struct bench bench_trig_kprobe_multi; extern const struct bench bench_trig_kretprobe_multi; extern const struct bench bench_trig_fentry; +extern const struct bench bench_trig_kprobe_multi_all; +extern const struct bench bench_trig_kretprobe_multi_all; extern const struct bench bench_trig_fexit; extern const struct bench bench_trig_fmodret; extern const struct bench bench_trig_tp; @@ -587,6 +589,8 @@ static const struct bench *benchs[] = { &bench_trig_kprobe_multi, &bench_trig_kretprobe_multi, &bench_trig_fentry, + &bench_trig_kprobe_multi_all, + &bench_trig_kretprobe_multi_all, &bench_trig_fexit, &bench_trig_fmodret, &bench_trig_tp, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 82327657846e5f..1e2aff007c2a40 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -226,6 +226,65 @@ static void trigger_fentry_setup(void) attach_bpf(ctx.skel->progs.bench_trigger_fentry); } +static void attach_ksyms_all(struct bpf_program *empty, bool kretprobe) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + char **syms = NULL; + size_t cnt = 0; + + /* Some recursive functions will be skipped in + * bpf_get_ksyms -> skip_entry, as they can introduce sufficient + * overhead. However, it's difficut to skip all the recursive + * functions for a debug kernel. + * + * So, don't run the kprobe-multi-all and kretprobe-multi-all on + * a debug kernel. + */ + if (bpf_get_ksyms(&syms, &cnt, true)) { + fprintf(stderr, "failed to get ksyms\n"); + exit(1); + } + + opts.syms = (const char **) syms; + opts.cnt = cnt; + opts.retprobe = kretprobe; + /* attach empty to all the kernel functions except bpf_get_numa_node_id. 
 */
+	if (!bpf_program__attach_kprobe_multi_opts(empty, NULL, &opts)) {
+		fprintf(stderr, "failed to attach bpf_program__attach_kprobe_multi_opts to all\n");
+		exit(1);
+	}
+}
+
+static void trigger_kprobe_multi_all_setup(void)
+{
+	struct bpf_program *prog, *empty;
+
+	setup_ctx();
+	empty = ctx.skel->progs.bench_kprobe_multi_empty;
+	prog = ctx.skel->progs.bench_trigger_kprobe_multi;
+	bpf_program__set_autoload(empty, true);
+	bpf_program__set_autoload(prog, true);
+	load_ctx();
+
+	attach_ksyms_all(empty, false);
+	attach_bpf(prog);
+}
+
+static void trigger_kretprobe_multi_all_setup(void)
+{
+	struct bpf_program *prog, *empty;
+
+	setup_ctx();
+	empty = ctx.skel->progs.bench_kretprobe_multi_empty;
+	prog = ctx.skel->progs.bench_trigger_kretprobe_multi;
+	bpf_program__set_autoload(empty, true);
+	bpf_program__set_autoload(prog, true);
+	load_ctx();
+
+	attach_ksyms_all(empty, true);
+	attach_bpf(prog);
+}
+
 static void trigger_fexit_setup(void)
 {
 	setup_ctx();
@@ -512,6 +571,8 @@ BENCH_TRIG_KERNEL(kretprobe, "kretprobe");
 BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi");
 BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi");
 BENCH_TRIG_KERNEL(fentry, "fentry");
+BENCH_TRIG_KERNEL(kprobe_multi_all, "kprobe-multi-all");
+BENCH_TRIG_KERNEL(kretprobe_multi_all, "kretprobe-multi-all");
 BENCH_TRIG_KERNEL(fexit, "fexit");
 BENCH_TRIG_KERNEL(fmodret, "fmodret");
 BENCH_TRIG_KERNEL(tp, "tp");
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
index a690f5a68b6b02..f7573708a0c33f 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
@@ -6,8 +6,8 @@ def_tests=( \
 	usermode-count kernel-count syscall-count \
 	fentry fexit fmodret \
 	rawtp tp \
-	kprobe kprobe-multi \
-	kretprobe kretprobe-multi \
+	kprobe kprobe-multi kprobe-multi-all \
+	kretprobe kretprobe-multi kretprobe-multi-all \
 )
 
 tests=("$@")
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 044a6d78923edf..3d5f30c29ae339 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -97,6 +97,12 @@ int bench_trigger_kprobe_multi(void *ctx)
 	return 0;
 }
 
+SEC("?kprobe.multi/bpf_get_numa_node_id")
+int bench_kprobe_multi_empty(void *ctx)
+{
+	return 0;
+}
+
 SEC("?kretprobe.multi/bpf_get_numa_node_id")
 int bench_trigger_kretprobe_multi(void *ctx)
 {
@@ -104,6 +110,12 @@ int bench_trigger_kretprobe_multi(void *ctx)
 	return 0;
 }
 
+SEC("?kretprobe.multi/bpf_get_numa_node_id")
+int bench_kretprobe_multi_empty(void *ctx)
+{
+	return 0;
+}
+
 SEC("?fentry/bpf_get_numa_node_id")
 int bench_trigger_fentry(void *ctx)
 {
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 9577979bd84da4..171987627f3a7b 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -549,6 +549,7 @@ static const char * const trace_blacklist[] = {
 	"preempt_count_sub",
 	"__rcu_read_lock",
 	"__rcu_read_unlock",
+	"bpf_get_numa_node_id",
 };
 
 static bool skip_entry(char *name)

From 19559e8441843bb36ecc78bb8d5cf82d500352fd Mon Sep 17 00:00:00 2001
From: Rong Tao
Date: Wed, 3 Sep 2025 07:47:10 +0800
Subject: [PATCH 0868/3206] bpf: add bpf_strcasecmp kfunc

The bpf_strcasecmp() kfunc behaves the same as bpf_strcmp(), except
that it ignores the case of the characters.
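For illustration, a minimal sketch (not part of the patch) of calling the new
kfunc from a BPF program, following the extern-kfunc convention the selftests
use; the section and program names here are made up:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* kfunc added by this patch; declared extern with __ksym */
extern int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) __weak __ksym;

char str[] = "Hello World";

SEC("syscall")
int check_strcasecmp(void *ctx)
{
	/* returns 0: the comparison ignores case */
	return bpf_strcasecmp(str, "hello world");
}

char _license[] SEC("license") = "GPL";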
Signed-off-by: Rong Tao Acked-by: Yonghong Song Acked-by: Viktor Malik Link: https://lore.kernel.org/r/tencent_292BD3682A628581AA904996D8E59F4ACD06@qq.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 68 +++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 401b4932cc49f6..588bc7e3643679 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3349,45 +3349,72 @@ __bpf_kfunc void __bpf_trap(void) * __get_kernel_nofault instead of plain dereference to make them safe. */ -/** - * bpf_strcmp - Compare two strings - * @s1__ign: One string - * @s2__ign: Another string - * - * Return: - * * %0 - Strings are equal - * * %-1 - @s1__ign is smaller - * * %1 - @s2__ign is smaller - * * %-EFAULT - Cannot read one of the strings - * * %-E2BIG - One of strings is too large - * * %-ERANGE - One of strings is outside of kernel address space - */ -__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) +static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) { char c1, c2; int i; - if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || - !copy_from_kernel_nofault_allowed(s2__ign, 1)) { + if (!copy_from_kernel_nofault_allowed(s1, 1) || + !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { - __get_kernel_nofault(&c1, s1__ign, char, err_out); - __get_kernel_nofault(&c2, s2__ign, char, err_out); + __get_kernel_nofault(&c1, s1, char, err_out); + __get_kernel_nofault(&c2, s2, char, err_out); + if (ignore_case) { + c1 = tolower(c1); + c2 = tolower(c2); + } if (c1 != c2) return c1 < c2 ? -1 : 1; if (c1 == '\0') return 0; - s1__ign++; - s2__ign++; + s1++; + s2++; } return -E2BIG; err_out: return -EFAULT; } +/** + * bpf_strcmp - Compare two strings + * @s1__ign: One string + * @s2__ign: Another string + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strcasecmp(s1__ign, s2__ign, false); +} + +/** + * bpf_strcasecmp - Compare two strings, ignoring the case of the characters + * @s1__ign: One string + * @s2__ign: Another string + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strcasecmp(s1__ign, s2__ign, true); +} + /** * bpf_strnchr - Find a character in a length limited string * @s__ign: The string to be searched @@ -3832,6 +3859,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) BTF_ID_FLAGS(func, bpf_strcmp); +BTF_ID_FLAGS(func, bpf_strcasecmp); BTF_ID_FLAGS(func, bpf_strchr); BTF_ID_FLAGS(func, bpf_strchrnul); BTF_ID_FLAGS(func, bpf_strnchr); From abc8a952d4aa1cdb72e90df280d1fe6fc50ed631 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Wed, 3 Sep 2025 07:48:36 +0800 Subject: [PATCH 0869/3206] selftests/bpf: Test kfunc bpf_strcasecmp Add testsuites for kfunc bpf_strcasecmp. 
Signed-off-by: Rong Tao Link: https://lore.kernel.org/r/tencent_81A1A0ACC04B68158C57C4D151C46A832B07@qq.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/string_kfuncs.c | 1 + tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c | 6 ++++++ tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c | 1 + tools/testing/selftests/bpf/progs/string_kfuncs_success.c | 5 +++++ 4 files changed, 13 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c index 35af8044d0590a..4d66fad3c8bdb9 100644 --- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c @@ -8,6 +8,7 @@ static const char * const test_cases[] = { "strcmp", + "strcasecmp", "strchr", "strchrnul", "strnchr", diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c index 53af438bd998e8..99d72c68f76af8 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c @@ -31,6 +31,8 @@ char *invalid_kern_ptr = (char *)-1; /* Passing NULL to string kfuncs (treated as a userspace ptr) */ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_null1(void *ctx) { return bpf_strcmp(NULL, "hello"); } SEC("syscall") __retval(USER_PTR_ERR)int test_strcmp_null2(void *ctx) { return bpf_strcmp("hello", NULL); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_null1(void *ctx) { return bpf_strcasecmp(NULL, "HELLO"); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strcasecmp_null2(void *ctx) { return bpf_strcasecmp("HELLO", NULL); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchr_null(void *ctx) { return bpf_strchr(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchrnul_null(void *ctx) { return bpf_strchrnul(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnchr_null(void *ctx) { return bpf_strnchr(NULL, 1, 'a'); } @@ -49,6 +51,8 @@ SEC("syscall") __retval(USER_PTR_ERR)int test_strnstr_null2(void *ctx) { return /* Passing userspace ptr to string kfuncs */ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { return bpf_strcmp(user_ptr, "hello"); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr2(void *ctx) { return bpf_strcmp("hello", user_ptr); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr1(void *ctx) { return bpf_strcasecmp(user_ptr, "HELLO"); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr2(void *ctx) { return bpf_strcasecmp("HELLO", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchr_user_ptr(void *ctx) { return bpf_strchr(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchrnul_user_ptr(void *ctx) { return bpf_strchrnul(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnchr_user_ptr(void *ctx) { return bpf_strnchr(user_ptr, 1, 'a'); } @@ -69,6 +73,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strnstr_user_ptr2(void *ctx) { re /* Passing invalid kernel ptr to string kfuncs should always return -EFAULT */ SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault1(void *ctx) { return bpf_strcmp(invalid_kern_ptr, "hello"); } SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault2(void *ctx) { return bpf_strcmp("hello", invalid_kern_ptr); } +SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault1(void *ctx) { return 
bpf_strcasecmp(invalid_kern_ptr, "HELLO"); } +SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault2(void *ctx) { return bpf_strcasecmp("HELLO", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strchr_pagefault(void *ctx) { return bpf_strchr(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strchrnul_pagefault(void *ctx) { return bpf_strchrnul(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strnchr_pagefault(void *ctx) { return bpf_strnchr(invalid_kern_ptr, 1, 'a'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c index 89fb4669b0e943..e41cc560199430 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c @@ -7,6 +7,7 @@ char long_str[XATTR_SIZE_MAX + 1]; SEC("syscall") int test_strcmp_too_long(void *ctx) { return bpf_strcmp(long_str, long_str); } +SEC("syscall") int test_strcasecmp_too_long(void *ctx) { return bpf_strcasecmp(long_str, long_str); } SEC("syscall") int test_strchr_too_long(void *ctx) { return bpf_strchr(long_str, 'b'); } SEC("syscall") int test_strchrnul_too_long(void *ctx) { return bpf_strchrnul(long_str, 'b'); } SEC("syscall") int test_strnchr_too_long(void *ctx) { return bpf_strnchr(long_str, sizeof(long_str), 'b'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index 46697f3818789a..67830456637b00 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -12,6 +12,11 @@ char str[] = "hello world"; /* Functional tests */ __test(0) int test_strcmp_eq(void *ctx) { return bpf_strcmp(str, "hello world"); } __test(1) int test_strcmp_neq(void *ctx) { return bpf_strcmp(str, "hello"); } +__test(0) int test_strcasecmp_eq1(void *ctx) { return bpf_strcasecmp(str, "hello world"); } +__test(0) int test_strcasecmp_eq2(void *ctx) { return bpf_strcasecmp(str, "HELLO WORLD"); } +__test(0) int test_strcasecmp_eq3(void *ctx) { return bpf_strcasecmp(str, "HELLO world"); } +__test(1) int test_strcasecmp_neq1(void *ctx) { return bpf_strcasecmp(str, "hello"); } +__test(1) int test_strcasecmp_neq2(void *ctx) { return bpf_strcasecmp(str, "HELLO"); } __test(1) int test_strchr_found(void *ctx) { return bpf_strchr(str, 'e'); } __test(11) int test_strchr_null(void *ctx) { return bpf_strchr(str, '\0'); } __test(-ENOENT) int test_strchr_notfound(void *ctx) { return bpf_strchr(str, 'x'); } From 929adf8838f7765f0bd57235f0e6f365ff628a0d Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Thu, 4 Sep 2025 07:57:03 +0000 Subject: [PATCH 0870/3206] bpf, arm64: Remove duplicated bpf_flush_icache() The bpf_flush_icache() is done by bpf_arch_text_copy() already. Remove the duplicated one in arch_prepare_bpf_trampoline(). 
Signed-off-by: Hengqi Chen
Acked-by: Puranjay Mohan
Link: https://lore.kernel.org/r/20250904075703.49404-1-hengqi.chen@gmail.com
Signed-off-by: Alexei Starovoitov
---
 arch/arm64/net/bpf_jit_comp.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index a98b8132479a75..f0b1cb2c3bc485 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -2773,7 +2773,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 		goto out;
 	}
 
-	bpf_flush_icache(ro_image, ro_image + size);
 out:
 	kvfree(image);
 	return ret;

From 4b69e31329b6ef22dc80b45d4a865f57c2346d44 Mon Sep 17 00:00:00 2001
From: Leon Hwang
Date: Wed, 3 Sep 2025 22:04:37 +0800
Subject: [PATCH 0871/3206] selftests/bpf: Introduce experimental bpf_in_interrupt()

Filtering on pid_tgid is meaningless when the current task is preempted
by an interrupt. To address this, introduce the 'bpf_in_interrupt()'
helper function, which allows BPF programs to determine whether they are
executing in interrupt context.

'get_preempt_count()':

* On x86, '*(int *) bpf_this_cpu_ptr(&__preempt_count)'.
* On arm64, 'bpf_get_current_task_btf()->thread_info.preempt.count'.

Then 'bpf_in_interrupt()' will be:

* If !PREEMPT_RT, 'get_preempt_count() & (NMI_MASK | HARDIRQ_MASK |
  SOFTIRQ_MASK)'.
* If PREEMPT_RT, '(get_preempt_count() & (NMI_MASK | HARDIRQ_MASK)) |
  (bpf_get_current_task_btf()->softirq_disable_cnt & SOFTIRQ_MASK)'.

Support for other archs can be added by updating 'get_preempt_count()'.

Suggested-by: Alexei Starovoitov
Signed-off-by: Leon Hwang
Link: https://lore.kernel.org/r/20250903140438.59517-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov
---
 .../testing/selftests/bpf/bpf_experimental.h  | 54 +++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index da7e230f2781e1..d89eda3fd8a357 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -599,4 +599,58 @@ extern void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it) __weak __ksym;
 extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str,
 				 struct bpf_dynptr *value_p) __weak __ksym;
 
+#define PREEMPT_BITS	8
+#define SOFTIRQ_BITS	8
+#define HARDIRQ_BITS	4
+#define NMI_BITS	4
+
+#define PREEMPT_SHIFT	0
+#define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
+#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)
+
+#define __IRQ_MASK(x)	((1UL << (x))-1)
+
+#define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
+#define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
+#define NMI_MASK	(__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
+
+extern bool CONFIG_PREEMPT_RT __kconfig __weak;
+#ifdef bpf_target_x86
+extern const int __preempt_count __ksym;
+#endif
+
+struct task_struct___preempt_rt {
+	int softirq_disable_cnt;
+} __attribute__((preserve_access_index));
+
+static inline int get_preempt_count(void)
+{
+#if defined(bpf_target_x86)
+	return *(int *) bpf_this_cpu_ptr(&__preempt_count);
+#elif defined(bpf_target_arm64)
+	return bpf_get_current_task_btf()->thread_info.preempt.count;
+#endif
+	return 0;
+}
+
+/* Description
+ * Report whether it is in interrupt context.
Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_interrupt(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return pcnt & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK); + + tsk = (void *) bpf_get_current_task_btf(); + return (pcnt & (NMI_MASK | HARDIRQ_MASK)) | + (tsk->softirq_disable_cnt & SOFTIRQ_MASK); +} + #endif From 88a3bde432b8263660793bce84910dd44f46ef25 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 3 Sep 2025 22:04:38 +0800 Subject: [PATCH 0872/3206] selftests/bpf: Add case to test bpf_in_interrupt() Add a timer test case to test 'bpf_in_interrupt()'. cd tools/testing/selftests/bpf ./test_progs -t timer_interrupt 462 timer_interrupt:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20250903140438.59517-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/timer.c | 30 ++++++++++++ .../selftests/bpf/progs/timer_interrupt.c | 48 +++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/timer_interrupt.c diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index d66687f1ee6a8d..049efb5e782392 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -3,6 +3,7 @@ #include #include "timer.skel.h" #include "timer_failure.skel.h" +#include "timer_interrupt.skel.h" #define NUM_THR 8 @@ -95,3 +96,32 @@ void serial_test_timer(void) RUN_TESTS(timer_failure); } + +void test_timer_interrupt(void) +{ + struct timer_interrupt *skel = NULL; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, opts); + + skel = timer_interrupt__open_and_load(); + if (!ASSERT_OK_PTR(skel, "timer_interrupt__open_and_load")) + return; + + err = timer_interrupt__attach(skel); + if (!ASSERT_OK(err, "timer_interrupt__attach")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.test_timer_interrupt); + err = bpf_prog_test_run_opts(prog_fd, &opts); + if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto out; + + usleep(50); + + ASSERT_EQ(skel->bss->in_interrupt, 0, "in_interrupt"); + if (skel->bss->preempt_count) + ASSERT_NEQ(skel->bss->in_interrupt_cb, 0, "in_interrupt_cb"); + +out: + timer_interrupt__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/timer_interrupt.c b/tools/testing/selftests/bpf/progs/timer_interrupt.c new file mode 100644 index 00000000000000..19180a455f4095 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_interrupt.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "bpf_experimental.h" + +char _license[] SEC("license") = "GPL"; + +#define CLOCK_MONOTONIC 1 + +int preempt_count; +int in_interrupt; +int in_interrupt_cb; + +struct elem { + struct bpf_timer t; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +static int timer_in_interrupt(void *map, int *key, struct bpf_timer *timer) +{ + preempt_count = get_preempt_count(); + in_interrupt_cb = bpf_in_interrupt(); + return 0; +} + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test_timer_interrupt) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&array, &key); + if (!timer) + return 0; + + in_interrupt = bpf_in_interrupt(); + bpf_timer_init(timer, &array, CLOCK_MONOTONIC); + 
bpf_timer_set_callback(timer, timer_in_interrupt);
+	bpf_timer_start(timer, 0, 0);
+	return 0;
+}

From fd5081f4ef3325b49d26e41b5976d1f34032ca9b Mon Sep 17 00:00:00 2001
From: Zqiang
Date: Thu, 4 Sep 2025 19:31:32 +0800
Subject: [PATCH 0873/3206] workqueue: Remove redundant rcu_read_lock/unlock() in workqueue_congested()

The preempt_disable/enable() pair already forms an RCU read-side
critical section; this commit therefore removes rcu_read_lock/unlock()
in workqueue_congested().

Signed-off-by: Zqiang
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c6b79b3675c314..831754e900714d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -6046,7 +6046,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
 	struct pool_workqueue *pwq;
 	bool ret;
 
-	rcu_read_lock();
 	preempt_disable();
 
 	if (cpu == WORK_CPU_UNBOUND)
@@ -6056,7 +6055,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
 	ret = !list_empty(&pwq->inactive_works);
 
 	preempt_enable();
-	rcu_read_unlock();
 
 	return ret;
 }

From cda2b2d647f7e467e53655b56ff430732fb1fa17 Mon Sep 17 00:00:00 2001
From: Zqiang
Date: Thu, 4 Sep 2025 19:31:33 +0800
Subject: [PATCH 0874/3206] workqueue: Remove rcu_read_lock/unlock() in wq_watchdog_timer_fn()

The wq_watchdog_timer_fn() is executed in softirq context, which is
already an RCU read-side critical section; this commit therefore
removes rcu_read_lock/unlock() in wq_watchdog_timer_fn().

Signed-off-by: Zqiang
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 831754e900714d..63b2685c2cb4f9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -7544,8 +7544,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 	if (!thresh)
 		return;
 
-	rcu_read_lock();
-
 	for_each_pool(pool, pi) {
 		unsigned long pool_ts, touched, ts;
 
@@ -7587,8 +7585,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 
 	}
 
-	rcu_read_unlock();
-
 	if (lockup_detected)
 		show_all_workqueues();

From c0fb16ef887d364766d03574ec824509939cf9cc Mon Sep 17 00:00:00 2001
From: Chuyi Zhou
Date: Thu, 4 Sep 2025 15:45:03 +0800
Subject: [PATCH 0875/3206] cpuset: Don't always flush cpuset_migrate_mm_wq in
 cpuset_write_resmask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is unnecessary to always wait for the flush of cpuset_migrate_mm_wq
to complete in cpuset_write_resmask, as modifying cpuset.cpus or
cpuset.exclusive does not trigger mm migrations. The flush_workqueue()
call therefore needs to happen only when cpuset.mems is modified.
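For context, a hedged sketch of the dispatch this check keys off:
cpuset_write_resmask() multiplexes a few control files through the cftype's
private field, and only the cpuset.mems branch can queue work on
cpuset_migrate_mm_wq (enum and function names follow kernel/cgroup/cpuset.c;
surrounding locking elided):

	switch (of_cft(of)->private) {
	case FILE_CPULIST:			/* cpuset.cpus */
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_EXCLUSIVE_CPULIST:		/* cpuset.cpus.exclusive */
		retval = update_exclusive_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:			/* cpuset.mems: may queue mm migration */
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}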
Signed-off-by: Chuyi Zhou Reviewed-by: Michal Koutný Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c0c281a8860d35..9fc20ef97d7e5b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3291,7 +3291,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, rebuild_sched_domains_locked(); out_unlock: cpuset_full_unlock(); - flush_workqueue(cpuset_migrate_mm_wq); + if (of_cft(of)->private == FILE_MEMLIST) + flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } From 3514309e03222c0ad06cd3fda0f0d2c98e786bf8 Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Thu, 4 Sep 2025 15:45:04 +0800 Subject: [PATCH 0876/3206] cpuset: Defer flushing of the cpuset_migrate_mm_wq to task_work Now in cpuset_attach(), we need to synchronously wait for flush_workqueue to complete. The execution time of flushing cpuset_migrate_mm_wq depends on the amount of mm migration initiated by cpusets at that time. When the cpuset.mems of a cgroup occupying a large amount of memory is modified, it may trigger extensive mm migration, causing cpuset_attach() to block on flush_workqueue for an extended period. This could be dangerous because cpuset_attach() is within the critical section of cgroup_mutex, which may ultimately cause all cgroup-related operations in the system to be blocked. This patch attempts to defer the flush_workqueue() operation until returning to userspace using the task_work which is originally proposed by tejun[1], so that flush happens after cgroup_mutex is dropped. That way we maintain the operation synchronicity while avoiding bothering anyone else. [1]: https://lore.kernel.org/cgroups/ZgMFPMjZRZCsq9Q-@slm.duckdns.org/T/#m117f606fa24f66f0823a60f211b36f24bd9e1883 Originally-by: Tejun Heo Signed-off-by: Chuyi Zhou Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 9fc20ef97d7e5b..0d41b4993f8cf4 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -40,6 +40,7 @@ #include #include #include +#include DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -2619,9 +2620,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -static void cpuset_post_attach(void) +static void flush_migrate_mm_task_workfn(struct callback_head *head) { flush_workqueue(cpuset_migrate_mm_wq); + kfree(head); +} + +static void schedule_flush_migrate_mm(void) +{ + struct callback_head *flush_cb; + + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); + if (!flush_cb) + return; + + init_task_work(flush_cb, flush_migrate_mm_task_workfn); + + if (task_work_add(current, flush_cb, TWA_RESUME)) + kfree(flush_cb); } /* @@ -3178,6 +3194,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; bool cpus_updated, mems_updated; + bool queue_task_work = false; cgroup_taskset_first(tset, &css); cs = css_cs(css); @@ -3228,15 +3245,18 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. 
*/ - if (is_memory_migrate(cs)) + if (is_memory_migrate(cs)) { cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - else + queue_task_work = true; + } else mmput(mm); } } out: + if (queue_task_work) + schedule_flush_migrate_mm(); cs->old_mems_allowed = cpuset_attach_nodemask_to; if (cs->nr_migrate_dl_tasks) { @@ -3292,7 +3312,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, out_unlock: cpuset_full_unlock(); if (of_cft(of)->private == FILE_MEMLIST) - flush_workqueue(cpuset_migrate_mm_wq); + schedule_flush_migrate_mm(); return retval ?: nbytes; } @@ -3739,7 +3759,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .post_attach = cpuset_post_attach, .bind = cpuset_bind, .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, From d8b269e009bbc471cb2735b5f737839495efce3b Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Thu, 4 Sep 2025 15:45:05 +0800 Subject: [PATCH 0877/3206] cgroup: Remove unused cgroup_subsys::post_attach The cgroup_subsys::post_attach callback was introduced in commit 5cf1cacb49ae ("cgroup, cpuset: replace cpuset_post_attach_flush() with cgroup_subsys->post_attach callback") and only cpuset would use this callback to wait for the mm migration to complete at the end of __cgroup_procs_write(). Since the previous patch defers the flush operation until returning to userspace, nothing uses this callback now. Remove this callback from cgroup_subsys. Signed-off-by: Chuyi Zhou Acked-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 - kernel/cgroup/cgroup.c | 4 ---- 2 files changed, 5 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 539c64eeef38f3..92ed6d18266d13 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -763,7 +763,6 @@ struct cgroup_subsys { int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); - void (*post_attach)(void); int (*can_fork)(struct task_struct *task, struct css_set *cset); void (*cancel_fork)(struct task_struct *task, struct css_set *cset); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b38d7a847ed4a5..0dc6ad71f175b8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3033,10 +3033,6 @@ void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked put_task_struct(task); cgroup_attach_unlock(threadgroup_locked); - - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) From ad7c7f4b9c6c2950778e5bd305392a333de73912 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Sep 2025 16:25:23 +0200 Subject: [PATCH 0878/3206] workqueue: Provide a handshake for canceling BH workers While a BH work item is being canceled, the core code spins until it determines that the item completed. On PREEMPT_RT the spinning relies on a lock in local_bh_disable() to avoid a live lock if the canceling thread has higher priority than the BH-worker and preempts it. This lock ensures that the BH-worker makes progress by PI-boosting it. This lock in local_bh_disable() is a central per-CPU BKL and is about to be removed. To provide the required synchronisation, add a per-pool lock. The lock is acquired by the bh_worker at the beginning and held while the individual callbacks are invoked.
To enforce progress in case of interruption, __flush_work() needs to acquire the lock. This will flush all BH-work items assigned to that pool. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 50 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 63b2685c2cb4f9..59faf857ee4f5a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -222,7 +222,9 @@ struct worker_pool { struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ - +#ifdef CONFIG_PREEMPT_RT + spinlock_t cb_lock; /* BH worker cancel lock */ +#endif /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). @@ -3078,6 +3080,31 @@ __acquires(&pool->lock) goto restart; } +#ifdef CONFIG_PREEMPT_RT +static void worker_lock_callback(struct worker_pool *pool) +{ + spin_lock(&pool->cb_lock); +} + +static void worker_unlock_callback(struct worker_pool *pool) +{ + spin_unlock(&pool->cb_lock); +} + +static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) +{ + spin_lock(&pool->cb_lock); + spin_unlock(&pool->cb_lock); +} + +#else + +static void worker_lock_callback(struct worker_pool *pool) { } +static void worker_unlock_callback(struct worker_pool *pool) { } +static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { } + +#endif + /** * manage_workers - manage worker pool * @worker: self @@ -3557,6 +3584,7 @@ static void bh_worker(struct worker *worker) int nr_restarts = BH_WORKER_RESTARTS; unsigned long end = jiffies + BH_WORKER_JIFFIES; + worker_lock_callback(pool); raw_spin_lock_irq(&pool->lock); worker_leave_idle(worker); @@ -3585,6 +3613,7 @@ static void bh_worker(struct worker *worker) worker_enter_idle(worker); kick_pool(pool); raw_spin_unlock_irq(&pool->lock); + worker_unlock_callback(pool); } /* @@ -4222,17 +4251,17 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) (data & WORK_OFFQ_BH)) { /* * On RT, prevent a live lock when %current preempted - * soft interrupt processing or prevents ksoftirqd from - * running by keeping flipping BH. If the BH work item - * runs on a different CPU then this has no effect other - * than doing the BH disable/enable dance for nothing. - * This is copied from - * kernel/softirq.c::tasklet_unlock_spin_wait(). + * soft interrupt processing by blocking on lock which + * is owned by the thread invoking the callback. */ while (!try_wait_for_completion(&barr.done)) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - local_bh_disable(); - local_bh_enable(); + struct worker_pool *pool; + + guard(rcu)(); + pool = get_work_pool(work); + if (pool) + workqueue_callback_cancel_wait_running(pool); } else { cpu_relax(); } @@ -4782,6 +4811,9 @@ static int init_worker_pool(struct worker_pool *pool) ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; +#ifdef CONFIG_PREEMPT_RT + spin_lock_init(&pool->cb_lock); +#endif /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(); From b338cf849ec82a3063119c9902c5ae21bc9dccb1 Mon Sep 17 00:00:00 2001 From: Jiawei Zhao Date: Thu, 4 Sep 2025 03:05:23 +0000 Subject: [PATCH 0879/3206] libbpf: Remove unused args in parse_usdt_note Remove unused 'elf' and 'path' parameters from parse_usdt_note function signature. 
These parameters are not referenced within the function body and only add unnecessary complexity. The function only requires the note header, data buffer, offsets, and output structure to perform USDT note parsing. Update function declaration, definition, and the single call site in collect_usdt_targets() to match the simplified signature. This is a safe internal cleanup as parse_usdt_note is a static function. Signed-off-by: Jiawei Zhao Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20250904030525.1932293-1-phoenix500526@163.com --- tools/lib/bpf/usdt.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 867bff6b06990e..fc2785eecc1776 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -581,9 +581,8 @@ static struct elf_seg *find_vma_seg(struct elf_seg *segs, size_t seg_cnt, long o return NULL; } -static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, - const char *data, size_t name_off, size_t desc_off, - struct usdt_note *usdt_note); +static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off, + size_t desc_off, struct usdt_note *usdt_note); static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie); @@ -637,7 +636,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * struct elf_seg *seg = NULL; void *tmp; - err = parse_usdt_note(elf, path, &nhdr, data->d_buf, name_off, desc_off, ¬e); + err = parse_usdt_note(&nhdr, data->d_buf, name_off, desc_off, ¬e); if (err) goto err_out; @@ -1143,8 +1142,7 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct /* Parse out USDT ELF note from '.note.stapsdt' section. * Logic inspired by perf's code. */ -static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, - const char *data, size_t name_off, size_t desc_off, +static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, struct usdt_note *note) { const char *provider, *name, *args; From 47d9f8212826753c482df8189d18ca212eb5ae73 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 4 Sep 2025 20:23:48 +0200 Subject: [PATCH 0880/3206] sched_ext: Fix NULL dereference in scx_bpf_cpu_rq() warning When printing the deprecation warning for scx_bpf_cpu_rq(), we may hit a NULL pointer dereference if the kfunc is called before a BPF scheduler is fully attached, for example, when invoked from a BPF timer or during ops.init(): [ 50.752775] BUG: kernel NULL pointer dereference, address: 0000000000000331 ... [ 50.764205] RIP: 0010:scx_bpf_cpu_rq+0x30/0xa0 ... [ 50.787661] Call Trace: [ 50.788398] [ 50.789061] bpf_prog_08f7fd2dcb187aaf_wakeup_timerfn+0x75/0x1a8 [ 50.792477] bpf_timer_cb+0x7e/0x140 [ 50.796003] hrtimer_run_softirq+0x91/0xe0 [ 50.796952] handle_softirqs+0xce/0x3c0 [ 50.799087] run_ksoftirqd+0x3e/0x70 [ 50.800197] smpboot_thread_fn+0x133/0x290 [ 50.802320] kthread+0x115/0x220 [ 50.804984] ret_from_fork+0x17a/0x1d0 [ 50.806920] ret_from_fork_asm+0x1a/0x30 [ 50.807799] Fix this by only printing the warning once the scheduler is fully registered. 
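The fix below is an instance of the standard pattern for reading an RCU-protected pointer that may legitimately be NULL; in generic form (a sketch, not the exact patch hunk):

	struct scx_sched *sch;

	rcu_read_lock();
	sch = rcu_dereference(scx_root);
	if (likely(sch)) {
		/* sch is safe to dereference until rcu_read_unlock() */
	}
	rcu_read_unlock();

Reading scx_root directly, as the old code did, skips the NULL check and is unsafe while no scheduler is attached.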
Fixes: 5c48d88fe0049 ("sched_ext: deprecation warn for scx_bpf_cpu_rq()") Cc: Christian Loehle Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4160a4a7af67e6..477eccf0233888 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6351,17 +6351,20 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) */ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; if (!kf_cpu_valid(cpu, NULL)) return NULL; - if (!sch->warned_deprecated_rq) { + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (likely(sch) && !sch->warned_deprecated_rq) { printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " "use scx_bpf_locked_rq() when holding rq lock " "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); sch->warned_deprecated_rq = true; } + rcu_read_unlock(); return cpu_rq(cpu); } From 157cf360c4a8751f7f511a71cc3a283b5d27f889 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Thu, 4 Sep 2025 10:43:22 +0800 Subject: [PATCH 0881/3206] net: libwx: fix to enable RSS Currently, when SRIOV is enabled, a PF with multiple queues receives all packets on queue 0 only. This is caused by an incorrect flag check, which prevents RSS from being enabled. In fact, RSS is supported for the functions when SRIOV is enabled. Remove the flag check to fix it. Fixes: c52d4b898901 ("net: libwx: Redesign flow when sriov is enabled") Cc: stable@vger.kernel.org Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Link: https://patch.msgid.link/A3B7449A08A044D0+20250904024322.87145-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index bcd07a71575241..5cb353a97d6d80 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -2078,10 +2078,6 @@ static void wx_setup_mrqc(struct wx *wx) { u32 rss_field = 0; - /* VT, and RSS do not coexist at the same time */ - if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) - return; - /* Disable indicating checksum in descriptor, enables RSS hash */ wr32m(wx, WX_PSR_CTL, WX_PSR_CTL_PCSD, WX_PSR_CTL_PCSD); From 54728bd535fb3899ad51489dc1e05eb5bb53cb95 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 1 Sep 2025 15:27:43 +0200 Subject: [PATCH 0882/3206] bpf: Return an error pointer for skb metadata when CONFIG_NET=n Kernel Test Robot reported a compiler warning - a null pointer may be passed to memmove in __bpf_dynptr_{read,write} when building without networking support. The warning is correct from a static analysis standpoint, but not actually reachable. Without CONFIG_NET, creating dynptrs to skb metadata is impossible since the constructor kfunc is missing. Silence the false-positive diagnostic message by returning an error pointer from the bpf_skb_meta_pointer() stub when CONFIG_NET=n.
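An error pointer satisfies the analyzer because callers are expected to test the result before dereferencing it; a hypothetical caller (illustrative only, not the actual __bpf_dynptr_read() code) shows the shape:

	void *meta = bpf_skb_meta_pointer(skb, offset);

	if (IS_ERR(meta))
		return PTR_ERR(meta);	/* -EOPNOTSUPP when CONFIG_NET=n */

	memmove(dst, meta, len);	/* meta is provably non-NULL on this path */

With a bare NULL return, the same caller would instead have to carry an explicit NULL check to keep static analysis quiet.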
Fixes: 6877cd392bae ("bpf: Enable read/write access to skb metadata through a dynptr") Closes: https://lore.kernel.org/oe-kbuild-all/202508212031.ir9b3B6Q-lkp@intel.com/ Reported-by: kernel test robot Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250901-dynptr-skb-meta-no-net-v2-1-ce607fcb6091@cloudflare.com --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 9ed21b65e2e968..af6d9354662c78 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1822,7 +1822,7 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NET */ From 222f83d5ab86344010f9e121799202b9ab25375b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 4 Sep 2025 11:23:43 -1000 Subject: [PATCH 0883/3206] cgroup: Remove unused local variables from cgroup_procs_write_finish() d8b269e009bb ("cgroup: Remove unused cgroup_subsys::post_attach") made $ss and $ssid unused but didn't drop them leading to compilation warnings. Drop them. Signed-off-by: Tejun Heo Cc: Chuyi Zhou --- kernel/cgroup/cgroup.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0dc6ad71f175b8..e7acfaa495179a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3026,9 +3026,6 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { - struct cgroup_subsys *ss; - int ssid; - /* release reference from cgroup_procs_write_start() */ put_task_struct(task); From 349510052f765b6eb9c2a21d0ffe08ba61fa683c Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 1 Sep 2025 22:26:39 +0200 Subject: [PATCH 0884/3206] MAINTAINERS: Add drm-rust tree for Rust DRM drivers and infrastructure Multiple DRM Rust drivers (e.g. nova-core, nova-drm, Tyr, rvkms) are in development, with at least Nova and (soon) Tyr already upstream. Having a shared tree will ease and accelerate development, since all drivers can consume new infrastructure in the same release cycle. This includes infrastructure shared with other subsystem trees (e.g. Rust or driver-core). By consolidating in drm-rust, we avoid adding extra burden to drm-misc maintainers, e.g. dealing with cross-tree topic branches. The drm-misc tree is not a good fit for this stage of development, since its documented scope is small drivers with occasional large series. Rust drivers in development upstream, however, regularly involve large patch series, new infrastructure, and shared topic branches, which may not align well with drm-misc at this stage. The drm-rust tree may not be a permanent solution. Once the core Rust, DRM, and KMS infrastructure have stabilized, drivers and infrastructure changes are expected to transition into drm-misc or standalone driver trees respectively. Until then, drm-rust provides a dedicated place to coordinate development without disrupting existing workflows too much. 
Cc: David Airlie Cc: Simona Vetter Cc: Maarten Lankhorst Cc: Thomas Zimmermann Signed-off-by: Danilo Krummrich Acked-by: Janne Grunau Acked-by: Maxime Ripard Acked-by: Alexandre Courbot Acked-by: Jani Nikula Acked-by: Daniel Almeida Acked-by: Alice Ryhl Link: https://lore.kernel.org/r/20250901202850.208116-1-dakr@kernel.org Signed-off-by: Alice Ryhl --- MAINTAINERS | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6dcfbd11efef87..2b155a12f776b0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8079,7 +8079,6 @@ F: Documentation/devicetree/bindings/gpu/ F: Documentation/gpu/ F: drivers/gpu/drm/ F: drivers/gpu/vga/ -F: rust/kernel/drm/ F: include/drm/drm F: include/linux/vga* F: include/uapi/drm/ @@ -8096,6 +8095,16 @@ X: drivers/gpu/drm/radeon/ X: drivers/gpu/drm/tegra/ X: drivers/gpu/drm/xe/ +DRM DRIVERS AND COMMON INFRASTRUCTURE [RUST] +M: Danilo Krummrich +M: Alice Ryhl +S: Supported +W: https://drm.pages.freedesktop.org/maintainer-tools/drm-rust.html +T: git https://gitlab.freedesktop.org/drm/rust/kernel.git +F: drivers/gpu/drm/nova/ +F: drivers/gpu/nova-core/ +F: rust/kernel/drm/ + DRM DRIVERS FOR ALLWINNER A10 M: Maxime Ripard M: Chen-Yu Tsai From 34b8f4adedd54c19b0008914d2bb6311e1fb0d3b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 5 Sep 2025 08:28:59 +0100 Subject: [PATCH 0885/3206] KVM: arm64: Mark freed S2 MMUs as invalid When freeing an S2 MMU, we free the associated pgd, but fail to mark the structure as invalid. Subsequently, a call to kvm_nested_s2_unmap() would pick up these invalid S2 MMUs and pass them down the teardown path. This ends up with a nasty warning as we try to unmap an unallocated set of page tables. Fix this by making the S2 MMU invalid on freeing the pgd by calling kvm_init_nested_s2_mmu(). Fixes: 4f128f8e1aaa ("KVM: arm64: nv: Support multiple nested Stage-2 mmu structures") Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250905072859.211369-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 86f3d80daf37af..0f4271458a079e 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1106,6 +1106,10 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) mmu->pgt = NULL; free_percpu(mmu->last_vcpu_ran); } + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + write_unlock(&kvm->mmu_lock); if (pgt) { From 33ddc796ecbd50cd6211aa9e9eddbf4567038b49 Mon Sep 17 00:00:00 2001 From: Marcelo Moreira Date: Wed, 27 Aug 2025 19:17:07 -0300 Subject: [PATCH 0886/3206] xfs: Replace strncpy with memcpy The change modernizes the code by aligning it with current kernel best practices. It improves code clarity and consistency, as strncpy is deprecated as explained in Documentation/process/deprecated.rst. This change does not alter the functionality or introduce any behavioral changes. Suggested-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Marcelo Moreira Reviewed-by: Darrick J.
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/symlink_repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 953ce7be78dc2f..5902398185a898 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -185,7 +185,7 @@ xrep_symlink_salvage_inline( return 0; nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); - strncpy(target_buf, ifp->if_data, nr); + memcpy(target_buf, ifp->if_data, nr); return nr; } From 9af8b441cf6953f683b825fbf241a979ea7521e8 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 24 Jun 2025 14:16:00 +0000 Subject: [PATCH 0887/3206] x86/mce/amd: Rename threshold restart function It operates per block rather than per bank. So rename it for clarity. No functional changes. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250624-wip-mca-updates-v4-5-236dd74f645f@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 5c4eb28c3ac930..9b980aecb6b3de 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -419,8 +419,8 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return true; }; -/* Reprogram MCx_MISC MSR behind this threshold bank. */ -static void threshold_restart_bank(void *_tr) +/* Reprogram MCx_MISC MSR behind this threshold block. */ +static void threshold_restart_block(void *_tr) { struct thresh_restart *tr = _tr; u32 hi, lo; @@ -478,7 +478,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) }; b->threshold_limit = THRESHOLD_MAX; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); }; static int setup_APIC_mce_threshold(int reserved, int new) @@ -921,7 +921,7 @@ static void log_and_reset_block(struct threshold_block *block) /* Reset threshold block after logging error. */ memset(&tr, 0, sizeof(tr)); tr.b = block; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); } /* @@ -995,7 +995,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1020,7 +1020,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; From 4d2161b9e8ba64076f520ec2f00eefb00722c15e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 24 Jun 2025 14:16:01 +0000 Subject: [PATCH 0888/3206] x86/mce/amd: Remove return value for mce_threshold_{create,remove}_device() The return values are not checked, so set return type to 'void'. Also, move function declarations to internal.h, since these functions are only used within the MCE subsystem. 
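For context, both functions are invoked from the CPU hotplug path, where their result was already being ignored; a hypothetical call site (the name and placement are illustrative, not taken from the patch):

	static int mce_cpu_online_example(unsigned int cpu)
	{
		/* thresholding sysfs setup is best-effort; failure is not fatal */
		mce_threshold_create_device(cpu);
		return 0;
	}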
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/20250624-wip-mca-updates-v4-6-236dd74f645f@amd.com --- arch/x86/include/asm/mce.h | 6 ------ arch/x86/kernel/cpu/mce/amd.c | 22 ++++++++++------------ arch/x86/kernel/cpu/mce/internal.h | 4 ++++ 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6c77c03139f7fa..752802bf966b9c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -371,15 +371,9 @@ enum smca_bank_types { extern bool amd_mce_is_memory_error(struct mce *m); -extern int mce_threshold_create_device(unsigned int cpu); -extern int mce_threshold_remove_device(unsigned int cpu); - void mce_amd_feature_init(struct cpuinfo_x86 *c); enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank); #else - -static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; -static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 9b980aecb6b3de..f429451cafc8f1 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1296,12 +1296,12 @@ static void __threshold_remove_device(struct threshold_bank **bp) kfree(bp); } -int mce_threshold_remove_device(unsigned int cpu) +void mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); if (!bp) - return 0; + return; /* * Clear the pointer before cleaning up, so that the interrupt won't @@ -1310,7 +1310,7 @@ int mce_threshold_remove_device(unsigned int cpu) this_cpu_write(threshold_banks, NULL); __threshold_remove_device(bp); - return 0; + return; } /** @@ -1324,36 +1324,34 @@ int mce_threshold_remove_device(unsigned int cpu) * thread running on @cpu. The callback is invoked on all CPUs which are * online when the callback is installed or during a real hotplug event. 
*/ -int mce_threshold_create_device(unsigned int cpu) +void mce_threshold_create_device(unsigned int cpu) { unsigned int numbanks, bank; struct threshold_bank **bp; - int err; if (!mce_flags.amd_threshold) - return 0; + return; bp = this_cpu_read(threshold_banks); if (bp) - return 0; + return; numbanks = this_cpu_read(mce_num_banks); bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) - return -ENOMEM; + return; for (bank = 0; bank < numbanks; ++bank) { if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) continue; - err = threshold_create_bank(bp, cpu, bank); - if (err) { + if (threshold_create_bank(bp, cpu, bank)) { __threshold_remove_device(bp); - return err; + return; } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; + return; } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b5ba598e54cb48..64ac25b953603b 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -265,6 +265,8 @@ void mce_prep_record_common(struct mce *m); void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD +void mce_threshold_create_device(unsigned int cpu); +void mce_threshold_remove_device(unsigned int cpu); extern bool amd_filter_mce(struct mce *m); bool amd_mce_usable_address(struct mce *m); @@ -293,6 +295,8 @@ static __always_inline void smca_extract_err_addr(struct mce *m) } #else +static inline void mce_threshold_create_device(unsigned int cpu) { } +static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline bool amd_mce_usable_address(struct mce *m) { return false; } static inline void smca_extract_err_addr(struct mce *m) { } From b249288abde5190bb113ea5acef8af4ceac4957c Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 25 Aug 2025 17:33:00 +0000 Subject: [PATCH 0889/3206] x86/mce/amd: Remove smca_banks_map The MCx_MISC0[BlkPtr] field was used on legacy systems to hold a register offset for the next MCx_MISC* register. In this way, an implementation-specific number of registers can be discovered at runtime. The MCAX/SMCA register space simplifies this by always including the MCx_MISC[1-4] registers. The MCx_MISC0[BlkPtr] field is used to indicate (true/false) whether any MCx_MISC[1-4] registers are present. Currently, MCx_MISC0[BlkPtr] is checked early and cached to be used during sysfs init later. This is unnecessary as the MCx_MISC0 register is read again later anyway. Remove the smca_banks_map variable as it is effectively redundant, and use a direct register/bit check instead. [ bp: Zap smca_get_block_address() too. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Tested-by: Tony Luck Link: https://lore.kernel.org/20250825-wip-mca-updates-v5-3-865768a2eef8@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 50 +++++++---------------------------- 1 file changed, 9 insertions(+), 41 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index f429451cafc8f1..7e36bc0d0e6c0f 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -252,9 +252,6 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); */ static DEFINE_PER_CPU(u64, bank_map); -/* Map of banks that have more than MCA_MISC0 available. 
*/ -static DEFINE_PER_CPU(u64, smca_misc_banks_map); - static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -264,28 +261,6 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) -{ - u32 low, high; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). - */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return; - - if (!(low & MCI_CONFIG_MCAX)) - return; - - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) - return; - - if (low & MASK_BLKPTR_LO) - per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); - -} - static void smca_configure(unsigned int bank, unsigned int cpu) { u8 *bank_counts = this_cpu_ptr(smca_bank_counts); @@ -326,8 +301,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - smca_set_misc_banks_map(bank, cpu); - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -525,18 +498,6 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int bank, unsigned int block, - unsigned int cpu) -{ - if (!block) - return MSR_AMD64_SMCA_MCx_MISC(bank); - - if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) - return 0; - - return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); -} - static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block, unsigned int cpu) @@ -546,8 +507,15 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return addr; - if (mce_flags.smca) - return smca_get_block_address(bank, block, cpu); + if (mce_flags.smca) { + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + if (!(low & MASK_BLKPTR_LO)) + return 0; + + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } /* Fall back to method we used for older processors: */ switch (block) { From c4bac5c640e3782bf30c07c4d82042d0202fe224 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 24 Jun 2025 14:16:03 +0000 Subject: [PATCH 0890/3206] x86/mce/amd: Put list_head in threshold_bank The threshold_bank structure is a container for one or more threshold_block structures. Currently, the container has a single pointer to the 'first' threshold_block structure which then has a linked list of the remaining threshold_block structures. This results in an extra level of indirection where the 'first' block is checked before iterating over the remaining blocks. Remove the indirection by including the head of the block list in the threshold_bank structure which already acts as a container for all the bank's thresholding blocks. 
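The result is the idiomatic "container embeds a list_head" shape; a before/after sketch of the interrupt handler's traversal (condensed from the diff below):

	/* before: the container points at the first block, which heads the list */
	log_and_reset_block(bp[bank]->blocks);
	list_for_each_entry_safe(block, tmp, &bp[bank]->blocks->miscj, miscj)
		log_and_reset_block(block);

	/* after: the container owns the list head; one loop covers every block */
	list_for_each_entry_safe(block, tmp, &bp[bank]->miscj, miscj)
		log_and_reset_block(block);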
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Tested-by: Tony Luck Link: https://lore.kernel.org/20250624-wip-mca-updates-v4-8-236dd74f645f@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 43 ++++++++++------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 7e36bc0d0e6c0f..e9b9be2215edfb 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -241,7 +241,8 @@ struct threshold_block { struct threshold_bank { struct kobject *kobj; - struct threshold_block *blocks; + /* List of threshold blocks within this MCA bank. */ + struct list_head miscj; }; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); @@ -898,9 +899,9 @@ static void log_and_reset_block(struct threshold_block *block) */ static void amd_threshold_interrupt(void) { - struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; - struct threshold_bank **bp = this_cpu_read(threshold_banks); + struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank; unsigned int bank, cpu = smp_processor_id(); + struct threshold_block *block, *tmp; /* * Validate that the threshold bank has been initialized already. The @@ -914,16 +915,11 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) continue; - first_block = bp[bank]->blocks; - if (!first_block) + thr_bank = bp[bank]; + if (!thr_bank) continue; - /* - * The first block is also the head of the list. Check it first - * before iterating over the rest. - */ - log_and_reset_block(first_block); - list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj) log_and_reset_block(block); } } @@ -1149,13 +1145,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb default_attrs[2] = NULL; } - INIT_LIST_HEAD(&b->miscj); - - /* This is safe as @tb is not visible yet */ - if (tb->blocks) - list_add(&b->miscj, &tb->blocks->miscj); - else - tb->blocks = b; + list_add(&b->miscj, &tb->miscj); err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) @@ -1206,6 +1196,8 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } + INIT_LIST_HEAD(&b->miscj); + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; @@ -1226,26 +1218,15 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_blocks(struct threshold_bank *bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct threshold_block *pos, *tmp; - list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&bank->blocks->kobj); -} - -static void threshold_remove_bank(struct threshold_bank *bank) -{ - if (!bank->blocks) - goto out_free; - - deallocate_threshold_blocks(bank); - -out_free: kobject_put(bank->kobj); kfree(bank); } From 0f134c53246366c00664b640f9edc9be5db255b3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 Aug 2025 17:33:02 +0000 Subject: [PATCH 0891/3206] x86/mce: Cleanup bank processing on init Unify the bank preparation into __mcheck_cpu_init_clear_banks(), rename that function to what it does now - prepares banks. 
Do this so that generic and vendor banks init goes first so that settings done during that init can take effect before the first bank polling takes place. Move __mcheck_cpu_check_banks() into __mcheck_cpu_init_prepare_banks() as it already loops over the banks. The MCP_DONTLOG flag is no longer needed, since the MCA polling function is now called only if boot-time logging should be done. Signed-off-by: Borislav Petkov Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Tested-by: Tony Luck Link: https://lore.kernel.org/20250825-wip-mca-updates-v5-5-865768a2eef8@amd.com --- arch/x86/include/asm/mce.h | 3 +- arch/x86/kernel/cpu/mce/core.c | 63 ++++++++++------------------------ 2 files changed, 19 insertions(+), 47 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 752802bf966b9c..3224f3862dc86f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -290,8 +290,7 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); enum mcp_flags { MCP_TIMESTAMP = BIT(0), /* log time stamp */ MCP_UC = BIT(1), /* log uncorrected errors */ - MCP_DONTLOG = BIT(2), /* only clear, don't log */ - MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */ + MCP_QUEUE_LOG = BIT(2), /* only queue to genpool */ }; void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4da4eab56c81de..311876e3f3f475 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -807,9 +807,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; log_it: - if (flags & MCP_DONTLOG) - goto clear_it; - mce_read_aux(&err, i); m->severity = mce_severity(m, NULL, NULL, false); /* @@ -1812,7 +1809,7 @@ static void __mcheck_cpu_mce_banks_init(void) /* * Init them all, __mcheck_cpu_apply_quirks() is going to apply * the required vendor quirks before - * __mcheck_cpu_init_clear_banks() does the final bank setup. + * __mcheck_cpu_init_prepare_banks() does the final bank setup. */ b->ctl = -1ULL; b->init = true; @@ -1851,21 +1848,8 @@ static void __mcheck_cpu_cap_init(void) static void __mcheck_cpu_init_generic(void) { - enum mcp_flags m_fl = 0; - mce_banks_t all_banks; u64 cap; - if (!mca_cfg.bootlog) - m_fl = MCP_DONTLOG; - - /* - * Log the machine checks left over from the previous reset. Log them - * only, do not start processing them. That will happen in mcheck_late_init() - * when all consumers have been registered on the notifier chain. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); - cr4_set_bits(X86_CR4_MCE); rdmsrq(MSR_IA32_MCG_CAP, cap); @@ -1873,36 +1857,23 @@ static void __mcheck_cpu_init_generic(void) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } -static void __mcheck_cpu_init_clear_banks(void) +static void __mcheck_cpu_init_prepare_banks(void) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; int i; - for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - struct mce_bank *b = &mce_banks[i]; + /* + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. 
+ */ + if (mca_cfg.bootlog) { + mce_banks_t all_banks; - if (!b->init) - continue; - wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks); } -} - -/* - * Do a final check to see if there are any unused/RAZ banks. - * - * This must be done after the banks have been initialized and any quirks have - * been applied. - * - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. - * Otherwise, a user who disables a bank will not be able to re-enable it - * without a system reboot. - */ -static void __mcheck_cpu_check_banks(void) -{ - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - u64 msrval; - int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; @@ -1910,6 +1881,9 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } @@ -2314,8 +2288,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_check_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_setup_timer(); } @@ -2483,7 +2456,7 @@ static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); } static struct syscore_ops mce_syscore_ops = { @@ -2501,7 +2474,7 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_init_timer(); } From 9f34032ec0deef58bd0eb7475f1981adfa998648 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 25 Aug 2025 17:33:03 +0000 Subject: [PATCH 0892/3206] x86/mce: Remove __mcheck_cpu_init_early() The __mcheck_cpu_init_early() function was introduced so that some vendor-specific features are detected before the first MCA polling event done in __mcheck_cpu_init_generic(). Currently, __mcheck_cpu_init_early() is only used on AMD-based systems and additional code will be needed to support various system configurations. However, the current and future vendor-specific code should be done during vendor init. This keeps all the vendor code in a common location and simplifies the generic init flow. Move all the __mcheck_cpu_init_early() code into mce_amd_feature_init(). 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Tested-by: Tony Luck Link: https://lore.kernel.org/20250825-wip-mca-updates-v5-6-865768a2eef8@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 4 ++++ arch/x86/kernel/cpu/mce/core.c | 14 -------------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index e9b9be2215edfb..aa13c9304ad892 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -653,6 +653,10 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (mce_flags.smca) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 311876e3f3f475..0326fbb83adc5d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2034,19 +2034,6 @@ static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return false; } -/* - * Init basic CPU features needed for early decoding of MCEs. - */ -static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) -{ - if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { - mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); - mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.amd_threshold = 1; - } -} - static void mce_centaur_feature_init(struct cpuinfo_x86 *c) { struct mca_config *cfg = &mca_cfg; @@ -2285,7 +2272,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) mca_cfg.initialized = 1; - __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_prepare_banks(); From 923b70581cb6acede90f8aaf4afe5d1c58c67b71 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Fri, 22 Aug 2025 10:49:15 +0800 Subject: [PATCH 0893/3206] iommu/amd: Fix ivrs_base memleak in early_amd_iommu_init() Fix a permanent ACPI table memory leak in early_amd_iommu_init() when CMPXCHG16B feature is not supported Fixes: 82582f85ed22 ("iommu/amd: Disable AMD IOMMU if CMPXCHG16B feature is not supported") Cc: stable@vger.kernel.org Signed-off-by: Zhen Ni Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20250822024915.673427-1-zhen.ni@easystack.cn Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 8de689b2c5ed57..6795326a6524b8 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3067,7 +3067,8 @@ static int __init early_amd_iommu_init(void) if (!boot_cpu_has(X86_FEATURE_CX16)) { pr_err("Failed to initialize. The CMPXCHG16B feature is required.\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } /* From b3506e9bcc777ed6af2ab631c86a9990ed97b474 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Wed, 27 Aug 2025 17:08:27 -0400 Subject: [PATCH 0894/3206] iommu/s390: Fix memory corruption when using identity domain zpci_get_iommu_ctrs() returns counter information to be reported as part of device statistics; these counters are stored as part of the s390_domain. 
The problem, however, is that the identity domain is not backed by an s390_domain and so the conversion via to_s390_domain() yields a bad address that is zero'd initially and read on-demand later via a sysfs read. These counters aren't necessary for the identity domain; just return NULL in this case. This issue was discovered via KASAN with reports that look like: BUG: KASAN: global-out-of-bounds in zpci_fmb_enable_device when using the identity domain for a device on s390. Cc: stable@vger.kernel.org Fixes: 64af12c6ec3a ("iommu/s390: implement iommu passthrough via identity domain") Reported-by: Cam Miller Signed-off-by: Matthew Rosato Tested-by: Cam Miller Reviewed-by: Farhan Ali Reviewed-by: Niklas Schnelle Link: https://lore.kernel.org/r/20250827210828.274527-1-mjrosato@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 9c80d61deb2c0b..d7370347c91021 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -1032,7 +1032,8 @@ struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev) lockdep_assert_held(&zdev->dom_lock); - if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED) + if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED || + zdev->s390_domain->type == IOMMU_DOMAIN_IDENTITY) return NULL; s390_domain = to_s390_domain(zdev->s390_domain); From dce043c07ca1ac19cfbe2844a6dc71e35c322353 Mon Sep 17 00:00:00 2001 From: Eugene Koira Date: Wed, 3 Sep 2025 13:53:29 +0800 Subject: [PATCH 0895/3206] iommu/vt-d: Fix __domain_mapping()'s usage of switch_to_super_page() switch_to_super_page() assumes the memory range it's working on is aligned to the target large page level. Unfortunately, __domain_mapping() doesn't take this into account when using it, and will pass unaligned ranges ultimately freeing a PTE range larger than expected. Take for example a mapping with the following iov_pfn range [0x3fe400, 0x4c0600), which should be backed by the following mappings: iov_pfn [0x3fe400, 0x3fffff] covered by 2MiB pages iov_pfn [0x400000, 0x4bffff] covered by 1GiB pages iov_pfn [0x4c0000, 0x4c05ff] covered by 2MiB pages Under this circumstance, __domain_mapping() will pass [0x400000, 0x4c05ff] to switch_to_super_page() at a 1 GiB granularity, which will in turn free PTEs all the way to iov_pfn 0x4fffff. Mitigate this by rounding down the iov_pfn range passed to switch_to_super_page() in __domain_mapping() to the target large page level. Additionally add range alignment checks to switch_to_super_page. 
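Plugging the commit's example numbers into the fix (round_down() as in include/linux/math.h; a worked sketch, not the patch code, and ignoring the nr_pte_to_next_page() clamp):

	/* 1 GiB level: lvl_pages = 1 GiB / 4 KiB = 0x40000 pages */
	unsigned long iov_pfn   = 0x400000;		/* already 1 GiB aligned */
	unsigned long nr_pages  = 0x4c0600 - 0x400000;	/* 0xc0600 */
	unsigned long lvl_pages = 0x40000;

	/* round_down(0xc0600, 0x40000) == 0xc0000 */
	unsigned long pages_to_remove = round_down(nr_pages, lvl_pages);
	unsigned long end_pfn = iov_pfn + pages_to_remove - 1;	/* 0x4bffff */

so switch_to_super_page() now sees exactly the aligned range [0x400000, 0x4bffff] and no longer frees PTEs past it.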
Fixes: 9906b9352a35 ("iommu/vt-d: Avoid duplicate removing in __domain_mapping()") Signed-off-by: Eugene Koira Cc: stable@vger.kernel.org Reviewed-by: Nicolas Saenz Julienne Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9c3ab9d9f69a3e..dff2d895b8abd7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1575,6 +1575,10 @@ static void switch_to_super_page(struct dmar_domain *domain, unsigned long lvl_pages = lvl_to_nr_pages(level); struct dma_pte *pte = NULL; + if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) || + !IS_ALIGNED(end_pfn + 1, lvl_pages))) + return; + while (start_pfn <= end_pfn) { if (!pte) pte = pfn_to_dma_pte(domain, start_pfn, &level, @@ -1650,7 +1654,8 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long pages_to_remove; pteval |= DMA_PTE_LARGE_PAGE; - pages_to_remove = min_t(unsigned long, nr_pages, + pages_to_remove = min_t(unsigned long, + round_down(nr_pages, lvl_pages), nr_pte_to_next_page(pte) * lvl_pages); end_pfn = iov_pfn + pages_to_remove - 1; switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); From 9ffaf5229055fcfbb3b3d6f1c7e58d63715c3f73 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 4 Sep 2025 10:59:49 +0200 Subject: [PATCH 0896/3206] iommu/s390: Make attach succeed when the device was surprise removed When a PCI device is removed with surprise hotplug, there may still be attempts to attach the device to the default domain as part of tear down via (__iommu_release_dma_ownership()), or because the removal happens during probe (__iommu_probe_device()). In both cases zpci_register_ioat() fails with a cc value indicating that the device handle is invalid. This is because the device is no longer part of the instance as far as the hypervisor is concerned. Currently this leads to an error return and s390_iommu_attach_device() fails. This triggers the WARN_ON() in __iommu_group_set_domain_nofail() because attaching to the default domain must never fail. With the device fenced by the hypervisor no DMAs to or from memory are possible and the IOMMU translations have no effect. Proceed as if the registration was successful and let the hotplug event handling clean up the device. This is similar to how devices in the error state are handled since commit 59bbf596791b ("iommu/s390: Make attach succeed even if the device is in error state") except that for removal the domain will not be registered later. This approach was also previously discussed at the link. Handle both cases, error state and removal, in a helper which checks if the error needs to be propagated or ignored. Avoid magic number condition codes by using the pre-existing, but never used, defines for PCI load/store condition codes and rename them to reflect that they apply to all PCI instructions. 
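Summarising the policy the new helper encodes (a comment-style sketch of the cases described above):

	/*
	 * cc == ZPCI_CC_OK                               -> success, proceed
	 * cc == ZPCI_CC_ERR && status == FUNC_NOT_AVAIL  -> device in error state;
	 *	treat as success, the reset routine re-registers the IOAT
	 * cc == ZPCI_CC_INVAL_HANDLE                     -> device was removed;
	 *	treat as success, hotplug event handling tears it down
	 * any other cc                                   -> real failure, return -EIO
	 */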
Cc: stable@vger.kernel.org # v6.2 Link: https://lore.kernel.org/linux-iommu/20240808194155.GD1985367@ziepe.ca/ Suggested-by: Jason Gunthorpe Signed-off-by: Niklas Schnelle Reviewed-by: Matthew Rosato Reviewed-by: Benjamin Block Link: https://lore.kernel.org/r/20250904-iommu_succeed_attach_removed-v1-1-e7f333d2f80f@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/include/asm/pci_insn.h | 10 +++++----- drivers/iommu/s390-iommu.c | 26 +++++++++++++++++++------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index e5f57cfe1d4582..025c6dcbf89331 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -16,11 +16,11 @@ #define ZPCI_PCI_ST_FUNC_NOT_AVAIL 40 #define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE 44 -/* Load/Store return codes */ -#define ZPCI_PCI_LS_OK 0 -#define ZPCI_PCI_LS_ERR 1 -#define ZPCI_PCI_LS_BUSY 2 -#define ZPCI_PCI_LS_INVAL_HANDLE 3 +/* PCI instruction condition codes */ +#define ZPCI_CC_OK 0 +#define ZPCI_CC_ERR 1 +#define ZPCI_CC_BUSY 2 +#define ZPCI_CC_INVAL_HANDLE 3 /* Load/Store address space identifiers */ #define ZPCI_PCIAS_MEMIO_0 0 diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index d7370347c91021..aa576736d60baa 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -612,6 +612,23 @@ static u64 get_iota_region_flag(struct s390_domain *domain) } } +static bool reg_ioat_propagate_error(int cc, u8 status) +{ + /* + * If the device is in the error state the reset routine + * will register the IOAT of the newly set domain on re-enable + */ + if (cc == ZPCI_CC_ERR && status == ZPCI_PCI_ST_FUNC_NOT_AVAIL) + return false; + /* + * If the device was removed treat registration as success + * and let the subsequent error event trigger tear down. + */ + if (cc == ZPCI_CC_INVAL_HANDLE) + return false; + return cc != ZPCI_CC_OK; +} + static int s390_iommu_domain_reg_ioat(struct zpci_dev *zdev, struct iommu_domain *domain, u8 *status) { @@ -696,7 +713,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); - if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) + if (reg_ioat_propagate_error(cc, status)) return -EIO; zdev->dma_table = s390_domain->dma_table; zdev_s390_domain_update(zdev, domain); @@ -1124,12 +1141,7 @@ static int s390_attach_dev_identity(struct iommu_domain *domain, /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); - - /* - * If the device is undergoing error recovery the reset code - * will re-establish the new domain. 
- */ - if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) + if (reg_ioat_propagate_error(cc, status)) return -EIO; zdev_s390_domain_update(zdev, domain); From f5d15ff664a32593e0152d21ea747fb77e32ce7e Mon Sep 17 00:00:00 2001 From: Woodrow Douglass Date: Fri, 5 Sep 2025 08:39:42 -0400 Subject: [PATCH 0897/3206] regulator: dt-bindings: nxp,pf530x: Add NXP PF5300/PF5301/PF5302 PMICs Bindings for the pf530x series of voltage regulators Signed-off-by: Woodrow Douglass Reviewed-by: Krzysztof Kozlowski Message-ID: <20250902-pf530x-v7-1-10eb2542f944@carnegierobotics.com> Signed-off-by: Mark Brown --- .../bindings/regulator/nxp,pf5300.yaml | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 Documentation/devicetree/bindings/regulator/nxp,pf5300.yaml diff --git a/Documentation/devicetree/bindings/regulator/nxp,pf5300.yaml b/Documentation/devicetree/bindings/regulator/nxp,pf5300.yaml new file mode 100644 index 00000000000000..5b9d5d4e48d09a --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/nxp,pf5300.yaml @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/nxp,pf5300.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NXP PF5300/PF5301/PF5302 PMIC regulators + +maintainers: + - Woodrow Douglass + +description: | + The PF5300, PF5301, and PF5302 integrate high-performance buck converters, + 12 A, 8 A, and 15 A, respectively, to power high-end automotive and industrial + processors. With adaptive voltage positioning and a high-bandwidth loop, they + offer transient regulation to minimize capacitor requirements. + +allOf: + - $ref: regulator.yaml# + +properties: + compatible: + oneOf: + - const: nxp,pf5300 + - items: + - enum: + - nxp,pf5301 + - nxp,pf5302 + - const: nxp,pf5300 + reg: + maxItems: 1 + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + regulator@28 { + compatible = "nxp,pf5302", "nxp,pf5300"; + reg = <0x28>; + + regulator-always-on; + regulator-boot-on; + regulator-max-microvolt = <1200000>; + regulator-min-microvolt = <500000>; + }; + }; From b497e1a1a2b10c4ddb28064fba229365ae03311a Mon Sep 17 00:00:00 2001 From: Woodrow Douglass Date: Fri, 5 Sep 2025 08:39:43 -0400 Subject: [PATCH 0898/3206] regulator: pf530x: Add a driver for the NXP PF5300 Regulator This driver allows reading some regulator settings and adjusting output voltage. 
It is based on information from the datasheet at https://www.nxp.com/docs/en/data-sheet/PF5300.pdf Signed-off-by: Woodrow Douglass Message-ID: <20250902-pf530x-v7-2-10eb2542f944@carnegierobotics.com> Signed-off-by: Mark Brown --- MAINTAINERS | 6 + drivers/regulator/Kconfig | 12 + drivers/regulator/Makefile | 1 + drivers/regulator/pf530x-regulator.c | 375 +++++++++++++++++++++++++++ 4 files changed, 394 insertions(+) create mode 100644 drivers/regulator/pf530x-regulator.c diff --git a/MAINTAINERS b/MAINTAINERS index fed6cd812d796a..9abcd2d4c10f9b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18292,6 +18292,12 @@ F: Documentation/devicetree/bindings/clock/*imx* F: drivers/clk/imx/ F: include/dt-bindings/clock/*imx* +NXP PF5300/PF5301/PF5302 PMIC REGULATOR DEVICE DRIVER +M: Woodrow Douglass +S: Maintained +F: Documentation/devicetree/bindings/regulator/nxp,pf5300.yaml +F: drivers/regulator/pf530x-regulator.c + NXP PF8100/PF8121A/PF8200 PMIC REGULATOR DEVICE DRIVER M: Jagan Teki S: Maintained diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index eaa6df1c9f8066..611356bea09d2a 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1006,6 +1006,18 @@ config REGULATOR_PCAP This driver provides support for the voltage regulators of the PCAP2 PMIC. +config REGULATOR_PF530X + tristate "NXP PF5300/PF5301/PF5302 regulator driver" + depends on I2C && OF + select REGMAP_I2C + help + Say y here to support the regulators found on the NXP + PF5300/PF5301/PF5302 PMIC. + + Say M here if you want to support for the regulators found + on the NXP PF5300/PF5301/PF5302 PMIC. The module will be named + "pf530x-regulator". + config REGULATOR_PF8X00 tristate "NXP PF8100/PF8121A/PF8200 regulator driver" depends on I2C && OF diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index be98b29d6675d8..60ca55d04aef5c 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -125,6 +125,7 @@ obj-$(CONFIG_REGULATOR_QCOM_USB_VBUS) += qcom_usb_vbus-regulator.o obj-$(CONFIG_REGULATOR_PALMAS) += palmas-regulator.o obj-$(CONFIG_REGULATOR_PCA9450) += pca9450-regulator.o obj-$(CONFIG_REGULATOR_PF9453) += pf9453-regulator.o +obj-$(CONFIG_REGULATOR_PF530X) += pf530x-regulator.o obj-$(CONFIG_REGULATOR_PF8X00) += pf8x00-regulator.o obj-$(CONFIG_REGULATOR_PFUZE100) += pfuze100-regulator.o obj-$(CONFIG_REGULATOR_PV88060) += pv88060-regulator.o diff --git a/drivers/regulator/pf530x-regulator.c b/drivers/regulator/pf530x-regulator.c new file mode 100644 index 00000000000000..f789c4b6a499e1 --- /dev/null +++ b/drivers/regulator/pf530x-regulator.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0+ + +// documentation of this device is available at +// https://www.nxp.com/docs/en/data-sheet/PF5300.pdf + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* registers */ +#define PF530X_DEVICEID 0x00 +#define PF530X_REV 0x01 +#define PF530X_EMREV 0x02 +#define PF530X_PROGID 0x03 +#define PF530X_CONFIG1 0x04 +#define PF530X_INT_STATUS1 0x05 +#define PF530X_INT_SENSE1 0x06 +#define PF530X_INT_STATUS2 0x07 +#define PF530X_INT_SENSE2 0x08 +#define PF530X_BIST_STAT1 0x09 +#define PF530X_BIST_CTRL 0x0a +#define PF530X_STATE 0x0b +#define PF530X_STATE_CTRL 0x0c +#define PF530X_SW1_VOLT 0x0d +#define PF530X_SW1_STBY_VOLT 0x0e +#define PF530X_SW1_CTRL1 0x0f +#define PF530X_SW1_CTRL2 0x10 +#define PF530X_CLK_CTRL 0x11 +#define PF530X_SEQ_CTRL1 0x12 +#define PF530X_SEQ_CTRL2 0x13 +#define PF530X_RANDOM_CHK 0x14 +#define 
PF530X_RANDOM_GEN 0x15 +#define PF530X_WD_CTRL1 0x16 +#define PF530X_WD_SEED 0x17 +#define PF530X_WD_ANSWER 0x18 +#define PF530X_FLT_CNT1 0x19 +#define PF530X_FLT_CNT2 0x1a +#define PF530X_OTP_MODE 0x2f + +enum pf530x_states { + PF530X_STATE_POF, + PF530X_STATE_FUSE_LOAD, + PF530X_STATE_LP_OFF, + PF530X_STATE_SELF_TEST, + PF530X_STATE_POWER_UP, + PF530X_STATE_INIT, + PF530X_STATE_IO_RELEASE, + PF530X_STATE_RUN, + PF530X_STATE_STANDBY, + PF530X_STATE_FAULT, + PF530X_STATE_FAILSAFE, + PF530X_STATE_POWER_DOWN, + PF530X_STATE_2MS_SELFTEST_RETRY, + PF530X_STATE_OFF_DLY, +}; + +#define PF530_FAM 0x50 +enum pf530x_devid { + PF5300 = 0x3, + PF5301 = 0x4, + PF5302 = 0x5, +}; + +#define PF530x_FAM 0x50 +#define PF530x_DEVICE_FAM_MASK GENMASK(7, 4) +#define PF530x_DEVICE_ID_MASK GENMASK(3, 0) + +#define PF530x_STATE_MASK GENMASK(3, 0) +#define PF530x_STATE_RUN 0x07 +#define PF530x_STATE_STANDBY 0x08 +#define PF530x_STATE_LP_OFF 0x02 + +#define PF530X_OTP_STBY_MODE GENMASK(3, 2) +#define PF530X_OTP_RUN_MODE GENMASK(1, 0) + +#define PF530X_INT_STATUS_OV BIT(1) +#define PF530X_INT_STATUS_UV BIT(2) +#define PF530X_INT_STATUS_ILIM BIT(3) + +#define SW1_ILIM_S BIT(0) +#define VMON_UV_S BIT(1) +#define VMON_OV_S BIT(2) +#define VIN_OVLO_S BIT(3) +#define BG_ERR_S BIT(6) + +#define THERM_155_S BIT(3) +#define THERM_140_S BIT(2) +#define THERM_125_S BIT(1) +#define THERM_110_S BIT(0) + +struct pf530x_chip { + struct regmap *regmap; + struct device *dev; +}; + +static const struct regmap_config pf530x_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = PF530X_OTP_MODE, + .cache_type = REGCACHE_MAPLE, +}; + +static int pf530x_get_status(struct regulator_dev *rdev) +{ + unsigned int state; + int ret; + + ret = regmap_read(rdev->regmap, PF530X_INT_SENSE1, &state); + if (ret != 0) + return ret; + + if ((state & (BG_ERR_S | SW1_ILIM_S | VMON_UV_S | VMON_OV_S | VIN_OVLO_S)) + != 0) + return REGULATOR_STATUS_ERROR; + + // no errors, check what non-error state we're in + ret = regmap_read(rdev->regmap, PF530X_STATE, &state); + if (ret != 0) + return ret; + + state &= PF530x_STATE_MASK; + + switch (state) { + case PF530x_STATE_RUN: + ret = REGULATOR_STATUS_NORMAL; + break; + case PF530x_STATE_STANDBY: + ret = REGULATOR_STATUS_STANDBY; + break; + case PF530x_STATE_LP_OFF: + ret = REGULATOR_STATUS_OFF; + break; + default: + ret = REGULATOR_STATUS_ERROR; + break; + } + return ret; +} + +static int pf530x_get_error_flags(struct regulator_dev *rdev, unsigned int *flags) +{ + unsigned int status; + int ret; + + ret = regmap_read(rdev->regmap, PF530X_INT_STATUS1, &status); + + if (ret != 0) + return ret; + + *flags = 0; + + if (status & PF530X_INT_STATUS_OV) + *flags |= REGULATOR_ERROR_OVER_VOLTAGE_WARN; + + if (status & PF530X_INT_STATUS_UV) + *flags |= REGULATOR_ERROR_UNDER_VOLTAGE; + + if (status & PF530X_INT_STATUS_ILIM) + *flags |= REGULATOR_ERROR_OVER_CURRENT; + + ret = regmap_read(rdev->regmap, PF530X_INT_SENSE2, &status); + + if (ret != 0) + return ret; + + if ((status & (THERM_155_S | + THERM_140_S | + THERM_125_S | + THERM_110_S)) != 0) + *flags |= REGULATOR_ERROR_OVER_TEMP_WARN; + + return 0; +} + +static const struct regulator_ops pf530x_regulator_ops = { + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .map_voltage = regulator_map_voltage_linear_range, + .list_voltage = regulator_list_voltage_linear_range, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + 
.get_status = pf530x_get_status, + .get_error_flags = pf530x_get_error_flags, + .set_bypass = regulator_set_bypass_regmap, + .get_bypass = regulator_get_bypass_regmap, +}; + +static const struct linear_range vrange = REGULATOR_LINEAR_RANGE(500000, 0, 140, 5000); + +static const struct regulator_desc pf530x_reg_desc = { + .name = "SW1", + .ops = &pf530x_regulator_ops, + .linear_ranges = &vrange, + .n_linear_ranges = 1, + .type = REGULATOR_VOLTAGE, + .id = 0, + .owner = THIS_MODULE, + .vsel_reg = PF530X_SW1_VOLT, + .vsel_mask = 0xFF, + .bypass_reg = PF530X_SW1_CTRL2, + .bypass_mask = 0x07, + .bypass_val_on = 0x07, + .bypass_val_off = 0x00, + .enable_reg = PF530X_SW1_CTRL1, + .enable_mask = GENMASK(5, 2), + .enable_val = GENMASK(5, 2), + .disable_val = 0, +}; + +static int pf530x_identify(struct pf530x_chip *chip) +{ + unsigned int value; + u8 dev_fam, dev_id, full_layer_rev, metal_layer_rev, prog_idh, prog_idl, emrev; + const char *name = NULL; + int ret; + + ret = regmap_read(chip->regmap, PF530X_DEVICEID, &value); + if (ret) { + dev_err(chip->dev, "failed to read chip family\n"); + return ret; + } + + dev_fam = value & PF530x_DEVICE_FAM_MASK; + switch (dev_fam) { + case PF530x_FAM: + break; + default: + dev_err(chip->dev, + "Chip 0x%x is not from PF530X family\n", dev_fam); + return -ENODEV; + } + + dev_id = value & PF530x_DEVICE_ID_MASK; + switch (dev_id) { + case PF5300: + name = "PF5300"; + break; + case PF5301: + name = "PF5301"; + break; + case PF5302: + name = "PF5302"; + break; + default: + dev_err(chip->dev, "Unknown pf530x device id 0x%x\n", dev_id); + return -ENODEV; + } + + ret = regmap_read(chip->regmap, PF530X_REV, &value); + if (ret) { + dev_err(chip->dev, "failed to read chip rev\n"); + return ret; + } + + full_layer_rev = ((value & 0xF0) == 0) ? 
'0' : ((((value & 0xF0) >> 4) - 1) + 'A'); + metal_layer_rev = value & 0xF; + + ret = regmap_read(chip->regmap, PF530X_EMREV, &value); + if (ret) { + dev_err(chip->dev, "failed to read chip emrev register\n"); + return ret; + } + + prog_idh = (value >> 4) + 'A'; + // prog_idh skips 'O', per page 96 of the datasheet + if (prog_idh >= 'O') + prog_idh += 1; + + emrev = value & 0x7; + + ret = regmap_read(chip->regmap, PF530X_PROGID, &value); + if (ret) { + dev_err(chip->dev, "failed to read chip progid register\n"); + return ret; + } + + if (value >= 0x22) { + dev_err(chip->dev, "invalid value for progid register\n"); + return -ENODEV; + } else if (value < 10) { + prog_idl = value + '0'; + } else { + prog_idl = (value - 10) + 'A'; + // prog_idl skips 'O', per page 97 of the datasheet + if (prog_idl >= 'O') + prog_idl += 1; + } + + dev_info(chip->dev, "%s Regulator found (Rev %c%d ProgID %c%c EMREV %x).\n", + name, full_layer_rev, metal_layer_rev, prog_idh, prog_idl, emrev); + + return 0; +} + +static int pf530x_i2c_probe(struct i2c_client *client) +{ + struct regulator_config config = { NULL, }; + struct pf530x_chip *chip; + int ret; + struct regulator_dev *rdev; + struct regulator_init_data *init_data; + + chip = devm_kzalloc(&client->dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + i2c_set_clientdata(client, chip); + chip->dev = &client->dev; + + chip->regmap = devm_regmap_init_i2c(client, &pf530x_regmap_config); + if (IS_ERR(chip->regmap)) { + ret = PTR_ERR(chip->regmap); + dev_err(&client->dev, + "regmap allocation failed with err %d\n", ret); + return ret; + } + + ret = pf530x_identify(chip); + if (ret) + return ret; + + init_data = of_get_regulator_init_data(chip->dev, chip->dev->of_node, &pf530x_reg_desc); + if (!init_data) + return -ENODATA; + + config.dev = chip->dev; + config.of_node = chip->dev->of_node; + config.regmap = chip->regmap; + config.init_data = init_data; + + // the config parameter gets copied, it's ok to pass a pointer on the stack here + rdev = devm_regulator_register(&client->dev, &pf530x_reg_desc, &config); + if (IS_ERR(rdev)) { + dev_err(&client->dev, "failed to register %s regulator\n", pf530x_reg_desc.name); + return PTR_ERR(rdev); + } + + return 0; +} + +static const struct of_device_id pf530x_dt_ids[] = { + { .compatible = "nxp,pf5300",}, + { } +}; +MODULE_DEVICE_TABLE(of, pf530x_dt_ids); + +static const struct i2c_device_id pf530x_i2c_id[] = { + { "pf5300", 0 }, + { "pf5301", 0 }, + { "pf5302", 0 }, + {}, +}; +MODULE_DEVICE_TABLE(i2c, pf530x_i2c_id); + +static struct i2c_driver pf530x_regulator_driver = { + .id_table = pf530x_i2c_id, + .driver = { + .name = "pf530x", + .of_match_table = pf530x_dt_ids, + }, + .probe = pf530x_i2c_probe, +}; +module_i2c_driver(pf530x_regulator_driver); + +MODULE_AUTHOR("Woodrow Douglass "); +MODULE_DESCRIPTION("Regulator Driver for NXP's PF5300/PF5301/PF5302 PMIC"); +MODULE_LICENSE("GPL"); From 0c43094f8cc9d3d99d835c0ac9c4fe1ccc62babd Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Tue, 15 Jul 2025 14:46:34 +0200 Subject: [PATCH 0899/3206] eventpoll: Replace rwlock with spinlock The ready event list of an epoll object is protected by a read-write lock:
- The consumer (waiter) acquires the write lock and takes items.
- The producer (waker) takes the read lock and adds items.
The point of this design is enabling epoll to scale well with a large number of producers, as multiple producers can hold the read lock at the same time. Unfortunately, this implementation may cause a scheduling priority inversion problem. 
Suppose the consumer has higher scheduling priority than the producer. The consumer needs to acquire the write lock, but may be blocked by the producer holding the read lock. Since the read-write lock does not support priority boosting for the readers (even with CONFIG_PREEMPT_RT=y), we have a case of priority inversion: a higher-priority consumer is blocked by a lower-priority producer. This problem was reported in [1]. Furthermore, this could also cause a stall problem, as described in [2]. Fix this problem by replacing the rwlock with a spinlock. This reduces the event bandwidth, as the producers now have to contend with each other for the spinlock. According to the benchmark from https://github.com/rouming/test-tools/blob/master/stress-epoll.c:

On 12 x86 CPUs:
            Before     After      Diff
  threads   events/ms  events/ms
        8        7162       4956   -31%
       16        8733       5383   -38%
       32        7968       5572   -30%
       64       10652       5739   -46%
      128       11236       5931   -47%

On 4 riscv CPUs:
            Before     After      Diff
  threads   events/ms  events/ms
        8        2958       2833    -4%
       16        3323       3097    -7%
       32        3451       3240    -6%
       64        3554       3178   -11%
      128        3601       3235   -10%

Although the numbers look bad, it should be noted that this benchmark creates multiple threads which do nothing except constantly generating new epoll events, thus contention on the spinlock is high. For real workloads, the event rate is likely much lower, and the performance drop is not as bad. Using another benchmark (perf bench epoll wait) where spinlock contention is lower, an improvement is even observed on x86:

On 12 x86 CPUs:
  Before: Averaged 110279 operations/sec (+- 1.09%), total secs = 8
  After:  Averaged 114577 operations/sec (+- 2.25%), total secs = 8

On 4 riscv CPUs:
  Before: Averaged 175767 operations/sec (+- 0.62%), total secs = 8
  After:  Averaged 167396 operations/sec (+- 0.23%), total secs = 8

In conclusion, no one is likely to be upset over this change. After all, a spinlock was used originally for years, and the commit which converted to the rwlock didn't mention a real workload, just that the benchmark numbers are nice. This patch is not exactly a revert of commit a218cc491420 ("epoll: use rwlock in order to reduce ep_poll_callback() contention"), because a plain git revert conflicts in several places where the resolution is not obvious. This patch is intended to be backported, therefore go with the obvious approach:
- Replace rwlock_t with spinlock_t one to one.
- Delete list_add_tail_lockless() and chain_epi_lockless(). These were introduced to allow producers to concurrently add items to the list. But now that the spinlock no longer allows producers to touch the event list concurrently, these two functions are not necessary anymore. 
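To make the resulting discipline concrete, here is a minimal userspace sketch of the post-patch locking scheme (illustrative only -- the names and the pthread spinlock merely stand in for ep->lock and ep->rdllist): producers and the consumer serialize on a single lock, so lock ownership is always well defined and a PREEMPT_RT kernel can boost whichever task currently holds it.

#include <pthread.h>
#include <stdio.h>

struct node {
	struct node *next;
	int event;
};

static pthread_spinlock_t lock;
static struct node *ready;	/* stands in for ep->rdllist */

/* producer side, analogous to ep_poll_callback() */
static void produce(struct node *n)
{
	pthread_spin_lock(&lock);
	n->next = ready;
	ready = n;
	pthread_spin_unlock(&lock);
}

/* consumer side, analogous to ep_start_scan(): splice everything out */
static struct node *consume_all(void)
{
	struct node *list;

	pthread_spin_lock(&lock);
	list = ready;
	ready = NULL;
	pthread_spin_unlock(&lock);
	return list;
}

int main(void)
{
	struct node a = { .event = 1 }, b = { .event = 2 };

	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	produce(&a);
	produce(&b);
	for (struct node *n = consume_all(); n; n = n->next)
		printf("event %d\n", n->event);
	pthread_spin_destroy(&lock);
	return 0;
}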
Fixes: a218cc491420 ("epoll: use rwlock in order to reduce ep_poll_callback() contention") Signed-off-by: Nam Cao Link: https://lore.kernel.org/ec92458ea357ec503c737ead0f10b2c6e4c37d47.1752581388.git.namcao@linutronix.de Tested-by: K Prateek Nayak Cc: stable@vger.kernel.org Reported-by: Frederic Weisbecker Closes: https://lore.kernel.org/linux-rt-users/20210825132754.GA895675@lothringen/ [1] Reported-by: Valentin Schneider Closes: https://lore.kernel.org/linux-rt-users/xhsmhttqvnall.mognet@vschneid.remote.csb/ [2] Signed-off-by: Christian Brauner --- fs/eventpoll.c | 139 +++++++++---------------------------------------- 1 file changed, 26 insertions(+), 113 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b22d6f819f782d..ee7c4b683ec3d2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -46,10 +46,10 @@ * * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) - * 3) ep->lock (rwlock) + * 3) ep->lock (spinlock) * * The acquire order is the one listed above, from 1 to 3. - * We need a rwlock (ep->lock) because we manipulate objects + * We need a spinlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need @@ -195,7 +195,7 @@ struct eventpoll { struct list_head rdllist; /* Lock which protects rdllist and ovflist */ - rwlock_t lock; + spinlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; @@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) * in a lockless way. */ lockdep_assert_irqs_enabled(); - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, @@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep, { struct epitem *epi, *nepi; - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. @@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep, wake_up(&ep->wq); } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } static void ep_get(struct eventpoll *ep) @@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) rb_erase_cached(&epi->rbn, &ep->rbr); - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep) return -ENOMEM; mutex_init(&ep->mtx); - rwlock_init(&ep->lock); + spin_lock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); @@ -1239,100 +1239,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, } #endif /* CONFIG_KCMP */ -/* - * Adds a new entry to the tail of the list in a lockless way, i.e. - * multiple CPUs are allowed to call this function concurrently. 
- * - * Beware: it is necessary to prevent any other modifications of the - * existing list until all changes are completed, in other words - * concurrent list_add_tail_lockless() calls should be protected - * with a read lock, where write lock acts as a barrier which - * makes sure all list_add_tail_lockless() calls are fully - * completed. - * - * Also an element can be locklessly added to the list only in one - * direction i.e. either to the tail or to the head, otherwise - * concurrent access will corrupt the list. - * - * Return: %false if element has been already added to the list, %true - * otherwise. - */ -static inline bool list_add_tail_lockless(struct list_head *new, - struct list_head *head) -{ - struct list_head *prev; - - /* - * This is simple 'new->next = head' operation, but cmpxchg() - * is used in order to detect that same element has been just - * added to the list from another CPU: the winner observes - * new->next == new. - */ - if (!try_cmpxchg(&new->next, &new, head)) - return false; - - /* - * Initially ->next of a new element must be updated with the head - * (we are inserting to the tail) and only then pointers are atomically - * exchanged. XCHG guarantees memory ordering, thus ->next should be - * updated before pointers are actually swapped and pointers are - * swapped before prev->next is updated. - */ - - prev = xchg(&head->prev, new); - - /* - * It is safe to modify prev->next and new->prev, because a new element - * is added only to the tail and new->next is updated before XCHG. - */ - - prev->next = new; - new->prev = prev; - - return true; -} - -/* - * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, - * i.e. multiple CPUs are allowed to call this function concurrently. - * - * Return: %false if epi element has been already chained, %true otherwise. - */ -static inline bool chain_epi_lockless(struct epitem *epi) -{ - struct eventpoll *ep = epi->ep; - - /* Fast preliminary check */ - if (epi->next != EP_UNACTIVE_PTR) - return false; - - /* Check that the same epi has not been just chained from another CPU */ - if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) - return false; - - /* Atomically exchange tail */ - epi->next = xchg(&ep->ovflist, epi); - - return true; -} - /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. - * - * This callback takes a read lock in order not to contend with concurrent - * events from another file descriptor, thus all modifications to ->rdllist - * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_start/done_scan(), which stops all list modifications and guarantees - * that lists state is seen correctly. - * - * Another thing worth to mention is that ep_poll_callback() can be called - * concurrently for the same @epi from different CPUs if poll table was inited - * with several wait queues entries. Plural wakeup from different CPUs of a - * single wait queue is serialized by wq.lock, but the case when multiple wait - * queues are used should be detected accordingly. This is detected using - * cmpxchg() operation. 
*/ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { @@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v unsigned long flags; int ewake = 0; - read_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (chain_epi_lockless(epi)) + if (epi->next == EP_UNACTIVE_PTR) { + epi->next = READ_ONCE(ep->ovflist); + WRITE_ONCE(ep->ovflist, epi); ep_pm_stay_awake_rcu(epi); + } } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ - if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) - ep_pm_stay_awake_rcu(epi); + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake_rcu(epi); } /* @@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v pwake++; out_unlock: - read_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, } /* We have to drop the new item inside our item list to keep track of it */ - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); @@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, pwake++; } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) @@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * list, push it inside. 
*/ if (ep_item_poll(epi, &pt, 1)) { - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); @@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, if (waitqueue_active(&ep->poll_wait)) pwake++; } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ @@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, init_wait(&wait); wait.func = ep_autoremove_wake_function; - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it @@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); if (!eavail) timed_out = !ep_schedule_timeout(to) || @@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, eavail = 1; if (!list_empty_careful(&wait.entry)) { - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its @@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } } } From c593b9d6c446510684da400833f9d632651942f0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 3 Sep 2025 11:23:33 -0400 Subject: [PATCH 0900/3206] filelock: add FL_RECLAIM to show_fl_flags() macro Show the FL_RECLAIM flag symbolically in tracepoints. Fixes: bb0a55bb7148 ("nfs: don't allow reexport reclaims") Signed-off-by: Jeff Layton Link: https://lore.kernel.org/20250903-filelock-v1-1-f2926902962d@kernel.org Signed-off-by: Christian Brauner --- include/trace/events/filelock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index b8d1e00a7982c9..2dfeb158e848a5 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -27,7 +27,8 @@ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ - { FL_OFDLCK, "FL_OFDLCK" }) + { FL_OFDLCK, "FL_OFDLCK" }, \ + { FL_RECLAIM, "FL_RECLAIM"}) #define show_fl_type(val) \ __print_symbolic(val, \ From e1bf212d0604d2cbb5514e47ccec252b656071fb Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Thu, 4 Sep 2025 20:01:19 +0800 Subject: [PATCH 0901/3206] fuse: virtio_fs: fix page fault for DAX page address The commit ced17ee32a99 ("Revert "virtio: reject shm region if length is zero"") exposes the following DAX page fault bug (this fixes the failure where getting the shm region always returned false because of zero length): The commit 21aa65bf82a7 ("mm: remove callers of pfn_t functionality") handles the DAX physical page address incorrectly: the removed macro 'phys_to_pfn_t()' should have been replaced with 'PHYS_PFN()'. 
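For reference, PHYS_PFN() converts a physical address into a page frame number by shifting out the page offset bits. Below is a hedged userspace sketch of the unit mismatch, assuming a typical 4 KiB page (PAGE_SHIFT == 12); the 0xa000000000 value is taken from the RAX register in the oops that follows:

#include <stdio.h>

/* Illustrative values; the kernel gets these from asm/page.h and
 * include/linux/pfn.h. */
#define PAGE_SHIFT	12
#define PHYS_PFN(x)	((unsigned long)((x) >> PAGE_SHIFT))

int main(void)
{
	unsigned long phys = 0xa000000000UL;	/* DAX window physical base */

	/* correct: a PFN counts whole pages, not bytes */
	printf("PHYS_PFN(phys) = 0x%lx\n", PHYS_PFN(phys));

	/* buggy: returning the raw address yields a "PFN" that is
	 * 1 << PAGE_SHIFT times too large, so dax_to_folio() later
	 * indexes far outside the memmap and faults. */
	printf("raw phys       = 0x%lx\n", phys);
	return 0;
}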
[ 1.390321] BUG: unable to handle page fault for address: ffffd3fb40000008 [ 1.390875] #PF: supervisor read access in kernel mode [ 1.391257] #PF: error_code(0x0000) - not-present page [ 1.391509] PGD 0 P4D 0 [ 1.391626] Oops: Oops: 0000 [#1] SMP NOPTI [ 1.391806] CPU: 6 UID: 1000 PID: 162 Comm: weston Not tainted 6.17.0-rc3-WSL2-STABLE #2 PREEMPT(none) [ 1.392361] RIP: 0010:dax_to_folio+0x14/0x60 [ 1.392653] Code: 52 c9 c3 00 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 c1 ef 05 48 c1 e7 06 48 03 3d 34 b5 31 01 <48> 8b 57 08 48 89 f8 f6 c2 01 75 2b 66 90 c3 cc cc cc cc f7 c7 ff [ 1.393727] RSP: 0000:ffffaf7d04407aa8 EFLAGS: 00010086 [ 1.394003] RAX: 000000a000000000 RBX: ffffaf7d04407bb0 RCX: 0000000000000000 [ 1.394524] RDX: ffffd17b40000008 RSI: 0000000000000083 RDI: ffffd3fb40000000 [ 1.394967] RBP: 0000000000000011 R08: 000000a000000000 R09: 0000000000000000 [ 1.395400] R10: 0000000000001000 R11: ffffaf7d04407c10 R12: 0000000000000000 [ 1.395806] R13: ffffa020557be9c0 R14: 0000014000000001 R15: 0000725970e94000 [ 1.396268] FS: 000072596d6d2ec0(0000) GS:ffffa0222dc59000(0000) knlGS:0000000000000000 [ 1.396715] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.397100] CR2: ffffd3fb40000008 CR3: 000000011579c005 CR4: 0000000000372ef0 [ 1.397518] Call Trace: [ 1.397663] [ 1.397900] dax_insert_entry+0x13b/0x390 [ 1.398179] dax_fault_iter+0x2a5/0x6c0 [ 1.398443] dax_iomap_pte_fault+0x193/0x3c0 [ 1.398750] __fuse_dax_fault+0x8b/0x270 [ 1.398997] ? vm_mmap_pgoff+0x161/0x210 [ 1.399175] __do_fault+0x30/0x180 [ 1.399360] do_fault+0xc4/0x550 [ 1.399547] __handle_mm_fault+0x8e3/0xf50 [ 1.399731] ? do_syscall_64+0x72/0x1e0 [ 1.399958] handle_mm_fault+0x192/0x2f0 [ 1.400204] do_user_addr_fault+0x20e/0x700 [ 1.400418] exc_page_fault+0x66/0x150 [ 1.400602] asm_exc_page_fault+0x26/0x30 [ 1.400831] RIP: 0033:0x72596d1bf703 [ 1.401076] Code: 31 f6 45 31 e4 48 8d 15 b3 73 00 00 e8 06 03 00 00 8b 83 68 01 00 00 e9 8e fa ff ff 0f 1f 00 48 8b 44 24 08 4c 89 ee 48 89 df 00 21 43 34 12 e8 72 09 00 00 e9 6a fa ff ff 0f 1f 44 00 00 e8 [ 1.402172] RSP: 002b:00007ffc350f6dc0 EFLAGS: 00010202 [ 1.402488] RAX: 0000725970e94000 RBX: 00005b7c642c2560 RCX: 0000725970d359a7 [ 1.402898] RDX: 0000000000000003 RSI: 00007ffc350f6dc0 RDI: 00005b7c642c2560 [ 1.403284] RBP: 00007ffc350f6e90 R08: 000000000000000d R09: 0000000000000000 [ 1.403634] R10: 00007ffc350f6dd8 R11: 0000000000000246 R12: 0000000000000001 [ 1.404078] R13: 00007ffc350f6dc0 R14: 0000725970e29ce0 R15: 0000000000000003 [ 1.404450] [ 1.404570] Modules linked in: [ 1.404821] CR2: ffffd3fb40000008 [ 1.405029] ---[ end trace 0000000000000000 ]--- [ 1.405323] RIP: 0010:dax_to_folio+0x14/0x60 [ 1.405556] Code: 52 c9 c3 00 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 c1 ef 05 48 c1 e7 06 48 03 3d 34 b5 31 01 <48> 8b 57 08 48 89 f8 f6 c2 01 75 2b 66 90 c3 cc cc cc cc f7 c7 ff [ 1.406639] RSP: 0000:ffffaf7d04407aa8 EFLAGS: 00010086 [ 1.406910] RAX: 000000a000000000 RBX: ffffaf7d04407bb0 RCX: 0000000000000000 [ 1.407379] RDX: ffffd17b40000008 RSI: 0000000000000083 RDI: ffffd3fb40000000 [ 1.407800] RBP: 0000000000000011 R08: 000000a000000000 R09: 0000000000000000 [ 1.408246] R10: 0000000000001000 R11: ffffaf7d04407c10 R12: 0000000000000000 [ 1.408666] R13: ffffa020557be9c0 R14: 0000014000000001 R15: 0000725970e94000 [ 1.409170] FS: 000072596d6d2ec0(0000) GS:ffffa0222dc59000(0000) knlGS:0000000000000000 [ 1.409608] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.409977] CR2: ffffd3fb40000008 CR3: 
000000011579c005 CR4: 0000000000372ef0 [ 1.410437] Kernel panic - not syncing: Fatal exception [ 1.410857] Kernel Offset: 0xc000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) Fixes: 21aa65bf82a7 ("mm: remove callers of pfn_t functionality") Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/20250904120339.972-1-haiyuewa@163.com Acked-by: David Hildenbrand Reviewed-by: Miklos Szeredi Signed-off-by: Christian Brauner --- fs/fuse/virtio_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index c826e7ca49f580..76c8fd0bfc75d5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1016,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) - *pfn = fs->window_phys_addr + offset; + *pfn = PHYS_PFN(fs->window_phys_addr + offset); return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; } From 4550d33e18112a11a740424c4eec063cd58e918c Mon Sep 17 00:00:00 2001 From: Santhosh Kumar K Date: Thu, 4 Sep 2025 18:47:41 +0530 Subject: [PATCH 0902/3206] mtd: spinand: winbond: Fix oob_layout for W25N01JW Fix the W25N01JW's oob_layout according to the datasheet [1] [1] https://www.winbond.com/hq/product/code-storage-flash-memory/qspinand-flash/?__locale=en&partNo=W25N01JW Fixes: 6a804fb72de5 ("mtd: spinand: winbond: add support for serial NAND flash") Cc: Sridharan S N Cc: stable@vger.kernel.org Signed-off-by: Santhosh Kumar K Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/winbond.c | 37 +++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c index 87053389a1fc7a..4870b2d5edb2a1 100644 --- a/drivers/mtd/nand/spi/winbond.c +++ b/drivers/mtd/nand/spi/winbond.c @@ -176,6 +176,36 @@ static const struct mtd_ooblayout_ops w25n02kv_ooblayout = { .free = w25n02kv_ooblayout_free, }; +static int w25n01jw_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *region) +{ + if (section > 3) + return -ERANGE; + + region->offset = (16 * section) + 12; + region->length = 4; + + return 0; +} + +static int w25n01jw_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *region) +{ + if (section > 3) + return -ERANGE; + + region->offset = (16 * section); + region->length = 12; + + /* Extract BBM */ + if (!section) { + region->offset += 2; + region->length -= 2; + } + + return 0; +} + static int w35n01jw_ooblayout_ecc(struct mtd_info *mtd, int section, struct mtd_oob_region *region) { @@ -206,6 +236,11 @@ static int w35n01jw_ooblayout_free(struct mtd_info *mtd, int section, return 0; } +static const struct mtd_ooblayout_ops w25n01jw_ooblayout = { + .ecc = w25n01jw_ooblayout_ecc, + .free = w25n01jw_ooblayout_free, +}; + static const struct mtd_ooblayout_ops w35n01jw_ooblayout = { .ecc = w35n01jw_ooblayout_ecc, .free = w35n01jw_ooblayout_free, @@ -394,7 +429,7 @@ static const struct spinand_info winbond_spinand_table[] = { &write_cache_variants, &update_cache_variants), 0, - SPINAND_ECCINFO(&w25m02gv_ooblayout, NULL), + SPINAND_ECCINFO(&w25n01jw_ooblayout, NULL), SPINAND_CONFIGURE_CHIP(w25n0xjw_hs_cfg)), SPINAND_INFO("W25N01KV", /* 3.3V */ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xae, 0x21), From cd32a0c0dcdf634f2e0e71f41c272e19dece6264 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 8 Apr 2025 16:14:32 -0700 Subject: [PATCH 0903/3206] xfs: use deferred intent items for reaping crosslinked blocks When we're removing rmap records for crosslinked blocks, use deferred intent items so that we can try to free/unmap as many of the old data structure's blocks as we can in the same transaction as the commit. Cc: # v6.6 Fixes: 1c7ce115e52106 ("xfs: reap large AG metadata extents when possible") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 8703897c0a9ccb..86d3d104b8d950 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -416,8 +416,6 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, *aglenp); - rs->force_roll = true; - if (rs->oinfo == &XFS_RMAP_OINFO_COW) { /* * If we're unmapping CoW staging extents, remove the @@ -426,11 +424,14 @@ xreap_agextent_iter( */ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); + rs->force_roll = true; return 0; } - return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, - *aglenp, rs->oinfo); + xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp, + rs->oinfo->oi_owner); + rs->deferred++; + return 0; } trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); From 82e374405e85af2ce3149c74766168df14864697 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:30 -0700 Subject: [PATCH 0904/3206] xfs: prepare reaping code for dynamic limits The online repair block reaping code employs static limits to decide if it's time to roll the transaction or finish the deferred item chains to avoid overflowing the scrub transaction's reservation. However, the use of static limits aren't great -- btree blocks are assumed to be scattered around the AG and the buffers need to be invalidated, whereas COW staging extents are usually contiguous and do not have buffers. We would like to configure the limits dynamically. To get ready for this, reorganize struct xreap_state to store dynamic limits, and add helpers to hide some of the details of how the limits are enforced. Also rename the "xreap roll" functions to include the word "binval" because they only exist to decide when we should roll the transaction to deal with buffer invalidations. No functional changes intended here. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 149 +++++++++++++++++++++++--------------------- 1 file changed, 79 insertions(+), 70 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 86d3d104b8d950..16bd298330a4cc 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -95,17 +95,17 @@ struct xreap_state { const struct xfs_owner_info *oinfo; enum xfs_ag_resv_type resv; - /* If true, roll the transaction before reaping the next extent. */ - bool force_roll; + /* Number of invalidated buffers logged to the current transaction. */ + unsigned int nr_binval; - /* Number of deferred reaps attached to the current transaction. */ - unsigned int deferred; + /* Maximum number of buffers we can invalidate in a single tx. */ + unsigned int max_binval; - /* Number of invalidated buffers logged to the current transaction. */ - unsigned int invalidated; + /* Number of deferred reaps attached to the current transaction. */ + unsigned int nr_deferred; - /* Number of deferred reaps queued during the whole reap sequence. 
*/ - unsigned long long total_deferred; + /* Maximum number of intents we can reap in a single transaction. */ + unsigned int max_deferred; }; /* Put a block back on the AGFL. */ @@ -148,44 +148,36 @@ xreap_put_freelist( } /* Are there any uncommitted reap operations? */ -static inline bool xreap_dirty(const struct xreap_state *rs) +static inline bool xreap_is_dirty(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->deferred) - return true; - if (rs->invalidated) - return true; - if (rs->total_deferred) - return true; - return false; + return rs->nr_binval > 0 || rs->nr_deferred > 0; } #define XREAP_MAX_BINVAL (2048) /* - * Decide if we want to roll the transaction after reaping an extent. We don't - * want to overrun the transaction reservation, so we prohibit more than - * 128 EFIs per transaction. For the same reason, we limit the number - * of buffer invalidations to 2048. + * Decide if we need to roll the transaction to clear out the log + * reservation that we allocated to buffer invalidations. */ -static inline bool xreap_want_roll(const struct xreap_state *rs) +static inline bool xreap_want_binval_roll(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS) - return true; - if (rs->invalidated > XREAP_MAX_BINVAL) - return true; - return false; + return rs->nr_binval >= rs->max_binval; } -static inline void xreap_reset(struct xreap_state *rs) +/* Reset the buffer invalidation count after rolling. */ +static inline void xreap_binval_reset(struct xreap_state *rs) { - rs->total_deferred += rs->deferred; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_binval = 0; +} + +/* + * Bump the number of invalidated buffers, and return true if we can continue, + * or false if we need to roll the transaction. + */ +static inline bool xreap_inc_binval(struct xreap_state *rs) +{ + rs->nr_binval++; + return rs->nr_binval < rs->max_binval; } #define XREAP_MAX_DEFER_CHAIN (2048) @@ -194,25 +186,36 @@ static inline void xreap_reset(struct xreap_state *rs) * Decide if we want to finish the deferred ops that are attached to the scrub * transaction. We don't want to queue huge chains of deferred ops because * that can consume a lot of log space and kernel memory. Hence we trigger a - * xfs_defer_finish if there are more than 2048 deferred reap operations or the - * caller did some real work. + * xfs_defer_finish if there are too many deferred reap operations or we've run + * out of space for invalidations. */ -static inline bool -xreap_want_defer_finish(const struct xreap_state *rs) +static inline bool xreap_want_defer_finish(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN) - return true; - return false; + return rs->nr_deferred >= rs->max_deferred; } +/* + * Reset the defer chain length and buffer invalidation count after finishing + * items. + */ static inline void xreap_defer_finish_reset(struct xreap_state *rs) { - rs->total_deferred = 0; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_deferred = 0; + rs->nr_binval = 0; +} + +/* + * Bump the number of deferred extent reaps. + */ +static inline void xreap_inc_defer(struct xreap_state *rs) +{ + rs->nr_deferred++; +} + +/* Force the caller to finish a deferred item chain. 
*/ +static inline void xreap_force_defer_finish(struct xreap_state *rs) +{ + rs->nr_deferred = rs->max_deferred; } /* @@ -297,14 +300,13 @@ xreap_agextent_binval( while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { xfs_trans_bjoin(sc->tp, bp); xfs_trans_binval(sc->tp, bp); - rs->invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however * far we've gotten. */ - if (rs->invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { *aglenp -= agbno_next - bno; goto out; } @@ -424,13 +426,13 @@ xreap_agextent_iter( */ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); - rs->force_roll = true; + xreap_force_defer_finish(rs); return 0; } xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp, rs->oinfo->oi_owner); - rs->deferred++; + xreap_inc_defer(rs); return 0; } @@ -444,7 +446,7 @@ xreap_agextent_iter( */ xreap_agextent_binval(rs, agbno, aglenp); if (*aglenp == 0) { - ASSERT(xreap_want_roll(rs)); + ASSERT(xreap_want_binval_roll(rs)); return 0; } @@ -464,7 +466,7 @@ xreap_agextent_iter( if (error) return error; - rs->force_roll = true; + xreap_force_defer_finish(rs); return 0; } @@ -475,7 +477,7 @@ xreap_agextent_iter( if (error) return error; - rs->force_roll = true; + xreap_force_defer_finish(rs); return 0; } @@ -490,8 +492,8 @@ xreap_agextent_iter( if (error) return error; - rs->deferred++; - if (rs->deferred % 2 == 0) + xreap_inc_defer(rs); + if (rs->nr_deferred % 2 == 0) xfs_defer_add_barrier(sc->tp); return 0; } @@ -532,11 +534,11 @@ xreap_agmeta_extent( if (error) return error; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xrep_roll_ag_trans(sc); if (error) return error; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -557,6 +559,8 @@ xrep_reap_agblocks( .sc = sc, .oinfo = oinfo, .resv = type, + .max_binval = XREAP_MAX_BINVAL, + .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; @@ -567,7 +571,7 @@ xrep_reap_agblocks( if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -629,7 +633,7 @@ xreap_fsmeta_extent( if (error) goto out_agf; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { /* * Hold the AGF buffer across the transaction roll so * that we don't have to reattach it to the scrub @@ -640,7 +644,7 @@ xreap_fsmeta_extent( xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); if (error) goto out_agf; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -669,6 +673,8 @@ xrep_reap_fsblocks( .sc = sc, .oinfo = oinfo, .resv = XFS_AG_RESV_NONE, + .max_binval = XREAP_MAX_BINVAL, + .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; @@ -679,7 +685,7 @@ xrep_reap_fsblocks( if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -779,7 +785,7 @@ xreap_rgextent_iter( *rglenp); xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); - rs->deferred++; + xreap_inc_defer(rs); return 0; } @@ -800,7 +806,7 @@ xreap_rgextent_iter( if (error) return error; - rs->deferred++; + xreap_inc_defer(rs); return 0; } @@ -856,11 +862,11 @@ xreap_rtmeta_extent( if (error) goto out_unlock; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xfs_trans_roll_inode(&sc->tp, sc->ip); if (error) goto out_unlock; - xreap_reset(rs); + xreap_binval_reset(rs); } rgbno += rglen; @@ -887,6 +893,8 @@ 
xrep_reap_rtblocks( .sc = sc, .oinfo = oinfo, .resv = XFS_AG_RESV_NONE, + .max_binval = XREAP_MAX_BINVAL, + .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; @@ -897,7 +905,7 @@ xrep_reap_rtblocks( if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -923,6 +931,8 @@ xrep_reap_metadir_fsblocks( .sc = sc, .oinfo = &oinfo, .resv = XFS_AG_RESV_NONE, + .max_binval = XREAP_MAX_BINVAL, + .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; @@ -931,12 +941,11 @@ xrep_reap_metadir_fsblocks( ASSERT(xfs_is_metadir_inode(sc->ip)); xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); - error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) { + if (xreap_is_dirty(&rs)) { error = xrep_defer_finish(sc); if (error) return error; From ef930cc371f033d9ce21292ca577741dfeb8c7f3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:31 -0700 Subject: [PATCH 0905/3206] xfs: convert the ifork reap code to use xreap_state Convert the file fork reaping code to use struct xreap_state so that we can reuse the dynamic state tracking code. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 78 ++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 16bd298330a4cc..33272729249f64 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -91,9 +91,21 @@ struct xreap_state { struct xfs_scrub *sc; - /* Reverse mapping owner and metadata reservation type. */ - const struct xfs_owner_info *oinfo; - enum xfs_ag_resv_type resv; + union { + struct { + /* + * For AG blocks, this is reverse mapping owner and + * metadata reservation type. + */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; + }; + struct { + /* For file blocks, this is the inode and fork. */ + struct xfs_inode *ip; + int whichfork; + }; + }; /* Number of invalidated buffers logged to the current transaction. 
*/ unsigned int nr_binval; @@ -965,13 +977,12 @@ xrep_reap_metadir_fsblocks( */ STATIC int xreap_bmapi_select( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool *crosslinked) { struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rs->sc; struct xfs_btree_cur *cur; xfs_filblks_t len = 1; xfs_agblock_t bno; @@ -985,7 +996,8 @@ xreap_bmapi_select( cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, sc->sa.pag); - xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff); + xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork, + imap->br_startoff); error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked); if (error) goto out_cur; @@ -1048,21 +1060,19 @@ xreap_buf_loggable( */ STATIC int xreap_bmapi_binval( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; struct xfs_mount *mp = sc->mp; struct xfs_perag *pag = sc->sa.pag; - int bmap_flags = xfs_bmapi_aflag(whichfork); + int bmap_flags = xfs_bmapi_aflag(rs->whichfork); xfs_fileoff_t off; xfs_fileoff_t max_off; xfs_extlen_t scan_blocks; xfs_agblock_t bno; xfs_agblock_t agbno; xfs_agblock_t agbno_next; - unsigned int invalidated = 0; int error; /* @@ -1089,7 +1099,7 @@ xreap_bmapi_binval( struct xfs_bmbt_irec hmap; int nhmaps = 1; - error = xfs_bmapi_read(ip, off, max_off - off, &hmap, + error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap, &nhmaps, bmap_flags); if (error) return error; @@ -1130,14 +1140,13 @@ xreap_bmapi_binval( xfs_buf_stale(bp); xfs_buf_relse(bp); } - invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however - * much of the mapping we've seen so far. + * far we've gotten. */ - if (invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { imap->br_blockcount = agbno_next - bno; goto out; } @@ -1159,12 +1168,11 @@ xreap_bmapi_binval( */ STATIC int xrep_reap_bmapi_iter( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool crosslinked) { + struct xfs_scrub *sc = rs->sc; int error; if (crosslinked) { @@ -1185,10 +1193,10 @@ xrep_reap_bmapi_iter( * deferred log intents in this function to control the exact * sequence of metadata updates. */ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); - xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); return 0; } @@ -1209,7 +1217,7 @@ xrep_reap_bmapi_iter( * transaction is full of logged buffer invalidations, so we need to * return early so that we can roll and retry. */ - error = xreap_bmapi_binval(sc, ip, whichfork, imap); + error = xreap_bmapi_binval(rs, imap); if (error || imap->br_blockcount == 0) return error; @@ -1218,8 +1226,8 @@ xrep_reap_bmapi_iter( * intents in this function to control the exact sequence of metadata * updates. 
*/ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); return xfs_free_extent_later(sc->tp, imap->br_startblock, imap->br_blockcount, NULL, XFS_AG_RESV_NONE, @@ -1232,18 +1240,17 @@ xrep_reap_bmapi_iter( */ STATIC int xreap_ifork_extent( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; xfs_agnumber_t agno; bool crosslinked; int error; ASSERT(sc->sa.pag == NULL); - trace_xreap_ifork_extent(sc, ip, whichfork, imap); + trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap); agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock); sc->sa.pag = xfs_perag_get(sc->mp, agno); @@ -1258,11 +1265,11 @@ xreap_ifork_extent( * Decide the fate of the blocks at the beginning of the mapping, then * update the mapping to use it with the unmap calls. */ - error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked); + error = xreap_bmapi_select(rs, imap, &crosslinked); if (error) goto out_agf; - error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked); + error = xrep_reap_bmapi_iter(rs, imap, crosslinked); if (error) goto out_agf; @@ -1286,6 +1293,12 @@ xrep_reap_ifork( struct xfs_inode *ip, int whichfork) { + struct xreap_state rs = { + .sc = sc, + .ip = ip, + .whichfork = whichfork, + .max_binval = XREAP_MAX_BINVAL, + }; xfs_fileoff_t off = 0; int bmap_flags = xfs_bmapi_aflag(whichfork); int error; @@ -1313,13 +1326,14 @@ xrep_reap_ifork( * can in a single transaction. */ if (xfs_bmap_is_real_extent(&imap)) { - error = xreap_ifork_extent(sc, ip, whichfork, &imap); + error = xreap_ifork_extent(&rs, &imap); if (error) return error; error = xfs_defer_finish(&sc->tp); if (error) return error; + xreap_defer_finish_reset(&rs); } off = imap.br_startoff + imap.br_blockcount; From b2311ec6778fcde9d40cd9e22899f8bd594d8465 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:33 -0700 Subject: [PATCH 0906/3206] xfs: compute per-AG extent reap limits dynamically Calculate the maximum number of extents that can be reaped in a single transaction chain, and the number of buffers that can be invalidated in a single transaction. The rough calculation here is: nr_extents = (logres - reservation used by any one step) / (space used by intents per extent + space used per binval) Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 140 +++++++++++++++++++++++++++++++++++++++---- fs/xfs/scrub/trace.c | 1 + fs/xfs/scrub/trace.h | 42 +++++++++++++ 3 files changed, 171 insertions(+), 12 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 33272729249f64..929ea3c453d313 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -36,6 +36,12 @@ #include "xfs_metafile.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_item.h" +#include "xfs_bmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -230,6 +236,15 @@ static inline void xreap_force_defer_finish(struct xreap_state *rs) rs->nr_deferred = rs->max_deferred; } +/* Maximum number of fsblocks that we might find in a buffer to invalidate. 
*/ +static inline unsigned int +xrep_binval_max_fsblocks( + struct xfs_mount *mp) +{ + /* Remote xattr values are the largest buffers that we support. */ + return xfs_attr3_max_rmt_blocks(mp); +} + /* * Compute the maximum length of a buffer cache scan (in units of sectors), * given a quantity of fs blocks. @@ -239,12 +254,8 @@ xrep_bufscan_max_sectors( struct xfs_mount *mp, xfs_extlen_t fsblocks) { - int max_fsbs; - - /* Remote xattr values are the largest buffers that we support. */ - max_fsbs = xfs_attr3_max_rmt_blocks(mp); - - return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs)); + return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, + xrep_binval_max_fsblocks(mp))); } /* @@ -442,6 +453,7 @@ xreap_agextent_iter( return 0; } + /* t1: unmap crosslinked metadata blocks */ xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp, rs->oinfo->oi_owner); xreap_inc_defer(rs); @@ -482,7 +494,7 @@ xreap_agextent_iter( return 0; } - /* Put blocks back on the AGFL one at a time. */ + /* t3: Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); error = xreap_put_freelist(sc, agbno); @@ -494,7 +506,7 @@ xreap_agextent_iter( } /* - * Use deferred frees to get rid of the old btree blocks to try to + * t4: Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. * Add a defer ops barrier every other extent to avoid stressing the * system with large EFIs. @@ -510,6 +522,110 @@ xreap_agextent_iter( return 0; } +/* Configure the deferral and invalidation limits */ +static inline void +xreap_configure_limits( + struct xreap_state *rs, + unsigned int fixed_overhead, + unsigned int variable_overhead, + unsigned int per_intent, + unsigned int per_binval) +{ + struct xfs_scrub *sc = rs->sc; + unsigned int res = sc->tp->t_log_res - fixed_overhead; + + /* Don't underflow the reservation */ + if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) { + ASSERT(sc->tp->t_log_res >= + (fixed_overhead + variable_overhead)); + xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE); + return; + } + + rs->max_deferred = res / variable_overhead; + res -= rs->max_deferred * per_intent; + rs->max_binval = per_binval ? res / per_binval : 0; +} + +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single per-AG space extent. This is not for freeing CoW + * staging extents. + */ +STATIC void +xreap_configure_agextent_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap + * record. + * + * t3: Freeing to AGFL: roll and finish deferred items for every block. + * Limits here do not matter. + * + * t4: Freeing metadata blocks: deferred freeing of the space, which + * also removes the rmap record. 
+ * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t1 = rui; + const unsigned int t4 = rui + efi; + const unsigned int per_intent = max(t1, t4); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of EFI or + * RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int step_size = max(f1, f2); + + /* Largest buffer size (in fsblocks) that can be invalidated. */ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Maximum overhead of invalidating one buffer. */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that btree + * blocks aren't usually contiguous; and that scrub likely pulled all + * the buffers into memory. From these assumptions, set the maximum + * number of deferrals we can queue before flushing the defer chain, + * and the number of invalidations we can queue before rolling to a + * clean transaction (and possibly relogging some of the deferrals) to + * the same quantity. + */ + const unsigned int variable_overhead = per_intent + per_binval; + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval, + step_size, per_intent, rs->max_deferred); +} + /* * Break an AG metadata extent into sub-extents by fate (crosslinked, not * crosslinked), and dispose of each sub-extent separately. 
@@ -571,14 +687,13 @@ xrep_reap_agblocks( .sc = sc, .oinfo = oinfo, .resv = type, - .max_binval = XREAP_MAX_BINVAL, - .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip == NULL); + xreap_configure_agextent_limits(&rs); error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs); if (error) return error; @@ -693,6 +808,8 @@ xrep_reap_fsblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + if (oinfo != &XFS_RMAP_OINFO_COW) + xreap_configure_agextent_limits(&rs); error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; @@ -943,8 +1060,6 @@ xrep_reap_metadir_fsblocks( .sc = sc, .oinfo = &oinfo, .resv = XFS_AG_RESV_NONE, - .max_binval = XREAP_MAX_BINVAL, - .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; @@ -952,6 +1067,7 @@ xrep_reap_metadir_fsblocks( ASSERT(sc->ip != NULL); ASSERT(xfs_is_metadir_inode(sc->ip)); + xreap_configure_agextent_limits(&rs); xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 2450e214103fed..987313a52e6401 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -22,6 +22,7 @@ #include "xfs_parent.h" #include "xfs_metafile.h" #include "xfs_rtgroup.h" +#include "xfs_trans.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index a8187281eb96b9..d39da0e67024fb 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2000,6 +2000,48 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); +DECLARE_EVENT_CLASS(xrep_reap_limits_class, + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, + unsigned int max_binval, unsigned int step_size, + unsigned int per_intent, + unsigned int max_deferred), + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, log_res) + __field(unsigned int, per_binval) + __field(unsigned int, max_binval) + __field(unsigned int, step_size) + __field(unsigned int, per_intent) + __field(unsigned int, max_deferred) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->log_res = tp->t_log_res; + __entry->per_binval = per_binval; + __entry->max_binval = max_binval; + __entry->step_size = step_size; + __entry->per_intent = per_intent; + __entry->max_deferred = max_deferred; + ), + TP_printk("dev %d:%d logres %u per_binval %u max_binval %u step_size %u per_intent %u max_deferred %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->log_res, + __entry->per_binval, + __entry->max_binval, + __entry->step_size, + __entry->per_intent, + __entry->max_deferred) +); +#define DEFINE_REPAIR_REAP_LIMITS_EVENT(name) \ +DEFINE_EVENT(xrep_reap_limits_class, name, \ + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, \ + unsigned int max_binval, unsigned int step_size, \ + unsigned int per_intent, \ + unsigned int max_deferred), \ + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred)) +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); + DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len, bool crosslinked), From 442bc127d460a807858ef9258e77518b9597eed1 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 8 Apr 2025 16:14:33 -0700 Subject: [PATCH 0907/3206] xfs: compute data device CoW staging extent reap limits dynamically Calculate the maximum number of CoW staging extents that can be reaped in a single transaction chain. The rough calculation here is: nr_extents = (logres - reservation used by any one step) / (space used by intents per extent + space used for a few buffer invalidations) Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 88 +++++++++++++++++++++++++++++++++++++++++--- fs/xfs/scrub/trace.h | 1 + 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 929ea3c453d313..aaef7e6771a045 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -443,7 +443,7 @@ xreap_agextent_iter( if (rs->oinfo == &XFS_RMAP_OINFO_COW) { /* - * If we're unmapping CoW staging extents, remove the + * t0: Unmapping CoW staging extents, remove the * records from the refcountbt, which will remove the * rmap record as well. */ @@ -475,7 +475,7 @@ xreap_agextent_iter( } /* - * If we're getting rid of CoW staging extents, use deferred work items + * t2: To get rid of CoW staging extents, use deferred work items * to remove the refcountbt records (which removes the rmap records) * and free the extent. We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -626,6 +626,84 @@ xreap_configure_agextent_limits( step_size, per_intent, rs->max_deferred); } +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single CoW staging extent. This is not for freeing + * metadata blocks. + */ +STATIC void +xreap_configure_agcow_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1) + + xfs_cud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount + * record, which defers removal of rmap record + * + * t2: Freeing CoW blocks: deferred removal of refcount record, which + * defers removal of rmap record; and deferred removal of the space + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t0 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t0, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. 
+ */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* Largest buffer size (in fsblocks) that can be invalidated. */ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Overhead of invalidating one buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that CoW + * staging extents are usually more than 1 fsblock, and that there + * shouldn't be any buffers for those blocks. From the assumptions, + * set the number of deferrals to use as much of the reservation as + * it can, but leave space to invalidate 1/8th that number of buffers. + */ + const unsigned int variable_overhead = per_intent + + (per_binval / 8); + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size, + per_intent, rs->max_deferred); +} + /* * Break an AG metadata extent into sub-extents by fate (crosslinked, not * crosslinked), and dispose of each sub-extent separately. @@ -800,15 +878,15 @@ xrep_reap_fsblocks( .sc = sc, .oinfo = oinfo, .resv = XFS_AG_RESV_NONE, - .max_binval = XREAP_MAX_BINVAL, - .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); - if (oinfo != &XFS_RMAP_OINFO_COW) + if (oinfo == &XFS_RMAP_OINFO_COW) + xreap_configure_agcow_limits(&rs); + else xreap_configure_agextent_limits(&rs); error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index d39da0e67024fb..a9da22f50534cb 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2041,6 +2041,7 @@ DEFINE_EVENT(xrep_reap_limits_class, name, \ unsigned int max_deferred), \ TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred)) DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits); DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, From 74fc66ee17fcd994a49ed80ddba90bc3b7046684 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:34 -0700 Subject: [PATCH 0908/3206] xfs: compute realtime device CoW staging extent reap limits dynamically Calculate the maximum number of CoW staging extents that can be reaped in a single transaction chain. The rough calculation here is: nr_extents = (logres - reservation used by any one step) / (space used by intents per extent) Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 71 +++++++++++++++++++++++++++++++++++++++++--- fs/xfs/scrub/trace.h | 1 + 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index aaef7e6771a045..b2f089e2c49daa 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -984,7 +984,7 @@ xreap_rgextent_iter( rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); /* - * If there are other rmappings, this block is cross linked and must + * t1: There are other rmappings; this block is cross linked and must * not be freed. Remove the forward and reverse mapping and move on. 
*/ if (crosslinked) { @@ -999,7 +999,7 @@ xreap_rgextent_iter( trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp); /* - * The CoW staging extent is not crosslinked. Use deferred work items + * t2: The CoW staging extent is not crosslinked. Use deferred work * to remove the refcountbt records (which removes the rmap records) * and free the extent. We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -1017,6 +1017,69 @@ xreap_rgextent_iter( return 0; } +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single CoW staging extent. This is not for freeing + * metadata blocks. + */ +STATIC void +xreap_configure_rgcow_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1) + + xfs_cud_log_space(); + + /* + * Various things can happen when reaping CoW staging extents: + * + * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount + * record, which defers removal of rmap record + * + * t2: Freeing CoW blocks: deferred removal of refcount record, which + * defers removal of rmap record; and deferred removal of the space + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t1 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t1, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_rt_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rt_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_rt_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* + * The only buffer for the rt device is the rtgroup super, so we don't + * need to save space for buffer invalidations. 
+ */ + xreap_configure_limits(rs, step_size, per_intent, per_intent, 0); + + trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent, + rs->max_deferred); +} + #define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ XFS_RTGLOCK_RMAP | \ XFS_RTGLOCK_REFCOUNT) @@ -1100,14 +1163,14 @@ xrep_reap_rtblocks( .sc = sc, .oinfo = oinfo, .resv = XFS_AG_RESV_NONE, - .max_binval = XREAP_MAX_BINVAL, - .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + ASSERT(oinfo == &XFS_RMAP_OINFO_COW); + xreap_configure_rgcow_limits(&rs); error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); if (error) return error; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index a9da22f50534cb..1a994d339c42cf 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2042,6 +2042,7 @@ DEFINE_EVENT(xrep_reap_limits_class, name, \ TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred)) DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits); DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, From e4c7eece7676bac436a8f98b91d646a8b55a67af Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:35 -0700 Subject: [PATCH 0909/3206] xfs: compute file mapping reap limits dynamically Reaping file fork mappings is a little different -- log recovery can free the blocks for us, so we only try to process a single mapping at a time. Therefore, we only need to figure out the maximum number of blocks that we can invalidate in a single transaction. The rough calculation here is: nr_extents = (logres - reservation used by any one step) / (space used per binval) Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 109 +++++++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/trace.h | 1 + 2 files changed, 105 insertions(+), 5 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index b2f089e2c49daa..d58fd57aaebb43 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -542,7 +542,7 @@ xreap_configure_limits( return; } - rs->max_deferred = res / variable_overhead; + rs->max_deferred = per_intent ? res / variable_overhead : 0; res -= rs->max_deferred * per_intent; rs->max_binval = per_binval ? res / per_binval : 0; } @@ -1446,7 +1446,7 @@ xrep_reap_bmapi_iter( imap->br_blockcount); /* - * Schedule removal of the mapping from the fork. We use + * t0: Schedule removal of the mapping from the fork. We use * deferred log intents in this function to control the exact * sequence of metadata updates. */ @@ -1479,8 +1479,8 @@ xrep_reap_bmapi_iter( return error; /* - * Schedule removal of the mapping from the fork. We use deferred log - * intents in this function to control the exact sequence of metadata + * t1: Schedule removal of the mapping from the fork. We use deferred + * work in this function to control the exact sequence of metadata * updates. */ xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); @@ -1491,6 +1491,105 @@ xrep_reap_bmapi_iter( XFS_FREE_EXTENT_SKIP_DISCARD); } +/* Compute the maximum mapcount of a file buffer. 
*/ +static unsigned int +xreap_bmapi_binval_mapcount( + struct xfs_scrub *sc) +{ + /* directory blocks can span multiple fsblocks and be discontiguous */ + if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR) + return sc->mp->m_dir_geo->fsbcount; + + /* all other file xattr/symlink blocks must be contiguous */ + return 1; +} + +/* Compute the maximum block size of a file buffer. */ +static unsigned int +xreap_bmapi_binval_blocksize( + struct xfs_scrub *sc) +{ + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_DIR: + return sc->mp->m_dir_geo->blksize; + case XFS_SCRUB_TYPE_XATTR: + case XFS_SCRUB_TYPE_PARENT: + /* + * The xattr structure itself consists of single fsblocks, but + * there could be remote xattr blocks to invalidate. + */ + return XFS_XATTR_SIZE_MAX; + } + + /* everything else is a single block */ + return sc->mp->m_sb.sb_blocksize; +} + +/* + * Compute the maximum number of buffer invalidations that we can do while + * reaping a single extent from a file fork. + */ +STATIC void +xreap_configure_bmapi_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* overhead of invalidating a buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc), + xreap_bmapi_binval_blocksize(sc)); + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int bui = xfs_bui_log_space(1) + + xfs_bud_log_space(); + + /* + * t1: Unmapping crosslinked file data blocks: one bmap deletion, + * possibly an EFI for underfilled bmbt blocks, and an rmap deletion. + * + * t2: Freeing file data blocks: one bmap deletion, possibly an + * EFI for underfilled bmbt blocks, and another EFI for the space + * itself. + */ + const unsigned int t1 = (bui + efi) + rui; + const unsigned int t2 = (bui + efi) + efi; + const unsigned int per_intent = max(t1, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of BUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_bui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* + * Each call to xreap_ifork_extent starts with a clean transaction and + * operates on a single mapping by creating a chain of log intent items + * for that mapping. We need to leave enough reservation in the + * transaction to log btree buffer and inode updates for each step in + * the chain, and to relog the log intents. + */ + const unsigned int per_extent_res = per_intent + step_size; + + xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval); + + trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval, + step_size, per_intent, 1); +} + /* * Dispose of as much of this file extent as we can. Upon successful return, * the imap will reflect the mapping that was removed from the fork. 
@@ -1554,7 +1653,6 @@ xrep_reap_ifork( .sc = sc, .ip = ip, .whichfork = whichfork, - .max_binval = XREAP_MAX_BINVAL, }; xfs_fileoff_t off = 0; int bmap_flags = xfs_bmapi_aflag(whichfork); @@ -1564,6 +1662,7 @@ xrep_reap_ifork( ASSERT(ip == sc->ip || ip == sc->tempip); ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip)); + xreap_configure_bmapi_limits(&rs); while (off < XFS_MAX_FILEOFF) { struct xfs_bmbt_irec imap; int nimaps = 1; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 1a994d339c42cf..39ea651cbb7510 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2043,6 +2043,7 @@ DEFINE_EVENT(xrep_reap_limits_class, name, \ DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits); DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_bmapi_limits); DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, From f69260511c69888bdec048153d0d475d4c8b0b3e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 27 Aug 2025 09:30:58 -0700 Subject: [PATCH 0910/3206] xfs: disable deprecated features by default in Kconfig We promised to turn off these old features by default in September 2025. Do so now. Signed-off-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino --- Documentation/admin-guide/xfs.rst | 5 ++--- fs/xfs/Kconfig | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index a18328a5fb93be..693b09ca62922f 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -253,9 +253,8 @@ latest version and try again. The deprecation will take place in two parts. Support for mounting V4 filesystems can now be disabled at kernel build time via Kconfig option. -The option will default to yes until September 2025, at which time it -will be changed to default to no. In September 2030, support will be -removed from the codebase entirely. +These options were changed to default to no in September 2025. In +September 2030, support will be removed from the codebase entirely. Note: Distributors may choose to withdraw V4 format support earlier than the dates listed above. diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 065953475cf5eb..ecebd3ebab1342 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -25,7 +25,7 @@ config XFS_FS config XFS_SUPPORT_V4 bool "Support deprecated V4 (crc=0) format" depends on XFS_FS - default y + default n help The V4 filesystem format lacks certain features that are supported by the V5 format, such as metadata checksumming, strengthened @@ -40,7 +40,7 @@ config XFS_SUPPORT_V4 filesystem is a V4 filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the V4 format will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. @@ -50,7 +50,7 @@ config XFS_SUPPORT_V4 config XFS_SUPPORT_ASCII_CI bool "Support deprecated case-insensitive ascii (ascii-ci=1) format" depends on XFS_FS - default y + default n help The ASCII case insensitivity filesystem feature only works correctly on systems that have been coerced into using ISO 8859-1, and it does @@ -67,7 +67,7 @@ config XFS_SUPPORT_ASCII_CI filesystem is a case-insensitive filesystem. 
If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the feature will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. From b9a176e54162f890aaf50ac8a467d725ed2f00df Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Sep 2025 14:33:53 -0700 Subject: [PATCH 0911/3206] xfs: remove deprecated mount options These four mount options were scheduled for removal in September 2025, so remove them now. Cc: preichl@redhat.com Signed-off-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino --- Documentation/admin-guide/xfs.rst | 26 ++------------ fs/xfs/libxfs/xfs_attr_leaf.c | 23 +++--------- fs/xfs/libxfs/xfs_bmap.c | 14 ++------ fs/xfs/libxfs/xfs_ialloc.c | 4 +-- fs/xfs/libxfs/xfs_sb.c | 9 ++--- fs/xfs/xfs_icache.c | 6 ++-- fs/xfs/xfs_mount.c | 13 ------- fs/xfs/xfs_mount.h | 12 +++---- fs/xfs/xfs_super.c | 60 ++----------------------------- 9 files changed, 25 insertions(+), 142 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 693b09ca62922f..7ad746a3e66c25 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -34,22 +34,6 @@ When mounting an XFS filesystem, the following options are accepted. to the file. Specifying a fixed ``allocsize`` value turns off the dynamic behaviour. - attr2 or noattr2 - The options enable/disable an "opportunistic" improvement to - be made in the way inline extended attributes are stored - on-disk. When the new form is used for the first time when - ``attr2`` is selected (either when setting or removing extended - attributes) the on-disk superblock feature bit field will be - updated to reflect this format being in use. - - The default behaviour is determined by the on-disk feature - bit indicating that ``attr2`` behaviour is active. If either - mount option is set, then that becomes the new default used - by the filesystem. - - CRC enabled filesystems always use the ``attr2`` format, and so - will reject the ``noattr2`` mount option if it is set. - discard or nodiscard (default) Enable/disable the issuing of commands to let the block device reclaim space freed by the filesystem. This is @@ -75,12 +59,6 @@ When mounting an XFS filesystem, the following options are accepted. across the entire filesystem rather than just on directories configured to use it. - ikeep or noikeep (default) - When ``ikeep`` is specified, XFS does not delete empty inode - clusters and keeps them around on disk. When ``noikeep`` is - specified, empty inode clusters are returned to the free - space pool. 
- inode32 or inode64 (default) When ``inode32`` is specified, it indicates that XFS limits inode creation to locations which will not result in inode @@ -267,8 +245,6 @@ Deprecated Mount Options ============================ ================ Mounting with V4 filesystem September 2030 Mounting ascii-ci filesystem September 2030 -ikeep/noikeep September 2025 -attr2/noattr2 September 2025 ============================ ================ @@ -284,6 +260,8 @@ Removed Mount Options osyncisdsync/osyncisosync v4.0 barrier v4.19 nobarrier v4.19 + ikeep/noikeep v6.18 + attr2/noattr2 v6.18 =========================== ======= sysctls diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index fddb55605e0cc6..47213e6023dd4b 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -667,12 +667,8 @@ xfs_attr_shortform_bytesfit( /* * For attr2 we can try to move the forkoff if there is space in the - * literal area, but for the old format we are done if there is no - * space in the fixed attribute fork. + * literal area */ - if (!xfs_has_attr2(mp)) - return 0; - dsize = dp->i_df.if_bytes; switch (dp->i_df.if_format) { @@ -723,22 +719,16 @@ xfs_attr_shortform_bytesfit( } /* - * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless: - * - noattr2 mount option is set, - * - on-disk version bit says it is already set, or - * - the attr2 mount option is not set to enable automatic upgrade from attr1. + * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless + * on-disk version bit says it is already set */ STATIC void xfs_sbversion_add_attr2( struct xfs_mount *mp, struct xfs_trans *tp) { - if (xfs_has_noattr2(mp)) - return; if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT) return; - if (!xfs_has_attr2(mp)) - return; spin_lock(&mp->m_sb_lock); xfs_add_attr2(mp); @@ -889,7 +879,7 @@ xfs_attr_sf_removename( /* * Fix up the start offset of the attribute fork */ - if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) && !xfs_has_parent(mp)) { @@ -900,7 +890,6 @@ xfs_attr_sf_removename( ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || - !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE || xfs_has_parent(mp)); xfs_trans_log_inode(args->trans, dp, @@ -1040,8 +1029,7 @@ xfs_attr_shortform_allfit( bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } - if (xfs_has_attr2(dp->i_mount) && - (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && + if ((dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); @@ -1161,7 +1149,6 @@ xfs_attr3_leaf_to_shortform( * this case. 
*/ if (!(args->op_flags & XFS_DA_OP_REPLACE)) { - ASSERT(xfs_has_attr2(dp->i_mount)); ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d954f9b8071f4b..80bdb537fcf783 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -997,8 +997,7 @@ xfs_bmap_add_attrfork_local( static int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, - int size, - int *version) + int size) { int default_size = xfs_default_attroffset(ip) >> 3; @@ -1012,8 +1011,6 @@ xfs_bmap_set_attrforkoff( ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_forkoff) ip->i_forkoff = default_size; - else if (xfs_has_attr2(ip->i_mount) && version) - *version = 2; break; default: ASSERT(0); @@ -1035,7 +1032,6 @@ xfs_bmap_add_attrfork( int rsvd) /* xact may use reserved blks */ { struct xfs_mount *mp = tp->t_mountp; - int version = 1; /* superblock attr version */ int logflags; /* logging flags */ int error; /* error return value */ @@ -1045,7 +1041,7 @@ xfs_bmap_add_attrfork( ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_bmap_set_attrforkoff(ip, size, &version); + error = xfs_bmap_set_attrforkoff(ip, size); if (error) return error; @@ -1069,16 +1065,12 @@ xfs_bmap_add_attrfork( xfs_trans_log_inode(tp, ip, logflags); if (error) return error; - if (!xfs_has_attr(mp) || - (!xfs_has_attr2(mp) && version == 2)) { + if (!xfs_has_attr(mp)) { bool log_sb = false; spin_lock(&mp->m_sb_lock); if (!xfs_has_attr(mp)) { xfs_add_attr(mp); - log_sb = true; - } - if (!xfs_has_attr2(mp) && version == 2) { xfs_add_attr2(mp); log_sb = true; } diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 750111634d9f7b..5fefdd4fe75dbd 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2140,7 +2140,7 @@ xfs_difree_inobt( * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); @@ -2286,7 +2286,7 @@ xfs_difree_finobt( * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. 
*/ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 711e180f9ebb83..cdd16dd805d77c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -142,8 +142,6 @@ xfs_sb_version_to_features( if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) features |= XFS_FEAT_LAZYSBCOUNT; - if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) - features |= XFS_FEAT_ATTR2; if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) features |= XFS_FEAT_PROJID32; if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) @@ -155,7 +153,7 @@ xfs_sb_version_to_features( /* Always on V5 features */ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | - XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 | + XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 | XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; /* Optional V5 features */ @@ -1524,7 +1522,8 @@ xfs_fs_geometry( geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | - XFS_FSOP_GEOM_FLAGS_EXTFLG; + XFS_FSOP_GEOM_FLAGS_EXTFLG | + XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_has_quota(mp)) @@ -1537,8 +1536,6 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; - if (xfs_has_attr2(mp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; if (xfs_has_crc(mp)) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4cf7abe5014371..e44040206851fc 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -646,8 +646,7 @@ xfs_iget_cache_miss( goto out_destroy; /* - * For version 5 superblocks, if we are initialising a new inode and we - * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can + * For version 5 superblocks, if we are initialising a new inode, we * simply build the new inode core with a random generation number. * * For version 4 (and older) superblocks, log recovery is dependent on @@ -655,8 +654,7 @@ xfs_iget_cache_miss( * value and hence we must also read the inode off disk even when * initializing new inodes. */ - if (xfs_has_v3inodes(mp) && - (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { + if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) { VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index dc32c5e34d8176..0953f6ae94abc8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1057,19 +1057,6 @@ xfs_mountfs( xfs_inodegc_start(mp); xfs_blockgc_start(mp); - /* - * Now that we've recovered any pending superblock feature bit - * additions, we can finish setting up the attr2 behaviour for the - * mount. The noattr2 option overrides the superblock flag, so only - * check the superblock feature flag if the mount option is not set. 
- */ - if (xfs_has_noattr2(mp)) { - mp->m_features &= ~XFS_FEAT_ATTR2; - } else if (!xfs_has_attr2(mp) && - (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) { - mp->m_features |= XFS_FEAT_ATTR2; - } - if (xfs_has_metadir(mp)) { error = xfs_mount_setup_metadir(mp); if (error) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 97de44c32272f2..f046d1215b043c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -363,7 +363,6 @@ typedef struct xfs_mount { #define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */ #define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */ #define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */ -#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */ #define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */ #define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */ #define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */ @@ -386,7 +385,6 @@ typedef struct xfs_mount { /* Mount features */ #define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ -#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ #define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred @@ -396,7 +394,6 @@ typedef struct xfs_mount { #define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */ #define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */ #define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */ -#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/ #define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */ #define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */ #define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */ @@ -504,12 +501,17 @@ __XFS_HAS_V4_FEAT(align, ALIGN) __XFS_HAS_V4_FEAT(logv2, LOGV2) __XFS_HAS_V4_FEAT(extflg, EXTFLG) __XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT) -__XFS_ADD_V4_FEAT(attr2, ATTR2) __XFS_ADD_V4_FEAT(projid32, PROJID32) __XFS_HAS_V4_FEAT(v3inodes, V3INODES) __XFS_HAS_V4_FEAT(crc, CRC) __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) +static inline void xfs_add_attr2(struct xfs_mount *mp) +{ + if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) + xfs_sb_version_addattr2(&mp->m_sb); +} + /* * Mount features * @@ -517,7 +519,6 @@ __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) * bit inodes and read-only state, are kept as operational state rather than * features. 
*/ -__XFS_HAS_FEAT(noattr2, NOATTR2) __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) __XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE) @@ -526,7 +527,6 @@ __XFS_HAS_FEAT(dirsync, DIRSYNC) __XFS_HAS_FEAT(discard, DISCARD) __XFS_HAS_FEAT(grpid, GRPID) __XFS_HAS_FEAT(small_inums, SMALL_INUMS) -__XFS_HAS_FEAT(ikeep, IKEEP) __XFS_HAS_FEAT(swalloc, SWALLOC) __XFS_HAS_FEAT(filestreams, FILESTREAMS) __XFS_HAS_FEAT(dax_always, DAX_ALWAYS) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a770d..77acb3e5a4eca1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -105,8 +105,8 @@ enum { Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, - Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep, - Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, + Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, + Opt_largeio, Opt_nolargeio, Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, @@ -133,12 +133,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("norecovery", Opt_norecovery), fsparam_flag("inode64", Opt_inode64), fsparam_flag("inode32", Opt_inode32), - fsparam_flag("ikeep", Opt_ikeep), - fsparam_flag("noikeep", Opt_noikeep), fsparam_flag("largeio", Opt_largeio), fsparam_flag("nolargeio", Opt_nolargeio), - fsparam_flag("attr2", Opt_attr2), - fsparam_flag("noattr2", Opt_noattr2), fsparam_flag("filestreams", Opt_filestreams), fsparam_flag("quota", Opt_quota), fsparam_flag("noquota", Opt_noquota), @@ -175,13 +171,11 @@ xfs_fs_show_options( { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ - { XFS_FEAT_IKEEP, ",ikeep" }, { XFS_FEAT_WSYNC, ",wsync" }, { XFS_FEAT_NOALIGN, ",noalign" }, { XFS_FEAT_SWALLOC, ",swalloc" }, { XFS_FEAT_NOUUID, ",nouuid" }, { XFS_FEAT_NORECOVERY, ",norecovery" }, - { XFS_FEAT_ATTR2, ",attr2" }, { XFS_FEAT_FILESTREAMS, ",filestreams" }, { XFS_FEAT_GRPID, ",grpid" }, { XFS_FEAT_DISCARD, ",discard" }, @@ -1087,15 +1081,6 @@ xfs_finish_flags( } } - /* - * V5 filesystems always use attr2 format for attributes. - */ - if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) { - xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. " - "attr2 is always enabled for V5 filesystems."); - return -EINVAL; - } - /* * prohibit r/w mounts of read-only filesystems */ @@ -1542,22 +1527,6 @@ xfs_fs_parse_param( return 0; #endif /* Following mount options will be removed in September 2025 */ - case Opt_ikeep: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true); - parsing_mp->m_features |= XFS_FEAT_IKEEP; - return 0; - case Opt_noikeep: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false); - parsing_mp->m_features &= ~XFS_FEAT_IKEEP; - return 0; - case Opt_attr2: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true); - parsing_mp->m_features |= XFS_FEAT_ATTR2; - return 0; - case Opt_noattr2: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); - parsing_mp->m_features |= XFS_FEAT_NOATTR2; - return 0; case Opt_max_open_zones: parsing_mp->m_max_open_zones = result.uint_32; return 0; @@ -1593,16 +1562,6 @@ xfs_fs_validate_params( return -EINVAL; } - /* - * We have not read the superblock at this point, so only the attr2 - * mount option can set the attr2 feature by this stage. 
- */ - if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) { - xfs_warn(mp, "attr2 and noattr2 cannot both be specified."); - return -EINVAL; - } - - if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) { xfs_warn(mp, "sunit and swidth options incompatible with the noalign option"); @@ -2177,21 +2136,6 @@ xfs_fs_reconfigure( if (error) return error; - /* attr2 -> noattr2 */ - if (xfs_has_noattr2(new_mp)) { - if (xfs_has_crc(mp)) { - xfs_warn(mp, - "attr2 is always enabled for a V5 filesystem - can't be changed."); - return -EINVAL; - } - mp->m_features &= ~XFS_FEAT_ATTR2; - mp->m_features |= XFS_FEAT_NOATTR2; - } else if (xfs_has_attr2(new_mp)) { - /* noattr2 -> attr2 */ - mp->m_features &= ~XFS_FEAT_NOATTR2; - mp->m_features |= XFS_FEAT_ATTR2; - } - /* Validate new max_atomic_write option before making other changes */ if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) { error = xfs_set_max_atomic_write_opt(mp, From d5b157e088c9fb72e2f245fd5cd43a373c4de677 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:36 -0700 Subject: [PATCH 0912/3206] xfs: remove static reap limits from repair.h Delete XREAP_MAX_BINVAL and XREAP_MAX_DEFER_CHAIN because the reap code now calculates those limits dynamically, so they're no longer needed. Move the third limit (XREP_MAX_ITRUNCATE_EFIS) to the one file that uses it. Note that the btree rebuilding code should reserve exactly the number of blocks needed to rebuild a btree, so it is rare that the newbt code will need to add any EFIs to the commit transaction. That's why that static limit remains. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/newbt.c | 9 +++++++++ fs/xfs/scrub/reap.c | 4 ---- fs/xfs/scrub/repair.h | 8 -------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 1588ce971cb8e1..951ae8b71566c2 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -27,6 +27,15 @@ #include "scrub/repair.h" #include "scrub/newbt.h" +/* + * This is the maximum number of deferred extent freeing item extents (EFIs) + * that we'll attach to a transaction without rolling the transaction to avoid + * overrunning a tr_itruncate reservation. The newbt code should reserve + * exactly the correct number of blocks to rebuild the btree, so there should + * not be any excess blocks to free when committing a new btree. + */ +#define XREP_MAX_ITRUNCATE_EFIS (128) + /* * Estimate proper slack values for a btree that's being reloaded. * diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index d58fd57aaebb43..82910188111dd7 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -171,8 +171,6 @@ static inline bool xreap_is_dirty(const struct xreap_state *rs) return rs->nr_binval > 0 || rs->nr_deferred > 0; } -#define XREAP_MAX_BINVAL (2048) - /* * Decide if we need to roll the transaction to clear out the log * reservation that we allocated to buffer invalidations. @@ -198,8 +196,6 @@ static inline bool xreap_inc_binval(struct xreap_state *rs) return rs->nr_binval < rs->max_binval; } -#define XREAP_MAX_DEFER_CHAIN (2048) - /* * Decide if we want to finish the deferred ops that are attached to the scrub * transaction. 
We don't want to queue huge chains of deferred ops because diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 9c04295742c85f..2bb125c4f9bf2b 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -18,14 +18,6 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) #ifdef CONFIG_XFS_ONLINE_REPAIR -/* - * This is the maximum number of deferred extent freeing item extents (EFIs) - * that we'll attach to a transaction without rolling the transaction to avoid - * overrunning a tr_itruncate reservation. - */ -#define XREP_MAX_ITRUNCATE_EFIS (128) - - /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); From 21d59d00221e4ecbcb597eec0021c667477d3335 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Sep 2025 14:38:27 -0700 Subject: [PATCH 0913/3206] xfs: remove deprecated sysctl knobs These sysctl knobs were scheduled for removal in September 2025. That time has come, so remove them. Signed-off-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino --- Documentation/admin-guide/xfs.rst | 26 ++++---------------------- fs/xfs/libxfs/xfs_inode_util.c | 11 ----------- fs/xfs/xfs_globals.c | 2 -- fs/xfs/xfs_iops.c | 12 +++++------- fs/xfs/xfs_linux.h | 2 -- fs/xfs/xfs_sysctl.c | 29 +---------------------------- fs/xfs/xfs_sysctl.h | 3 --- 7 files changed, 10 insertions(+), 75 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 7ad746a3e66c25..d6f531f2c0e694 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -289,9 +289,6 @@ The following sysctls are available for the XFS filesystem: removes unused preallocation from clean inodes and releases the unused space back to the free pool. - fs.xfs.speculative_cow_prealloc_lifetime - This is an alias for speculative_prealloc_lifetime. - fs.xfs.error_level (Min: 0 Default: 3 Max: 11) A volume knob for error reporting when internal errors occur. This will generate detailed messages & backtraces for filesystem @@ -318,17 +315,6 @@ The following sysctls are available for the XFS filesystem: This option is intended for debugging only. - fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1) - Controls whether symlinks are created with mode 0777 (default) - or whether their mode is affected by the umask (irix mode). - - fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1) - Controls files created in SGID directories. - If the group ID of the new file does not match the effective group - ID or one of the supplementary group IDs of the parent dir, the - ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl - is set. - fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1) Setting this to "1" will cause the "sync" flag set by the **xfs_io(8)** chattr command on a directory to be @@ -364,14 +350,7 @@ The following sysctls are available for the XFS filesystem: Deprecated Sysctls ================== -=========================================== ================ - Name Removal Schedule -=========================================== ================ -fs.xfs.irix_sgid_inherit September 2025 -fs.xfs.irix_symlink_mode September 2025 -fs.xfs.speculative_cow_prealloc_lifetime September 2025 -=========================================== ================ - +None currently. 
Removed Sysctls =============== @@ -381,6 +360,9 @@ Removed Sysctls ============================= ======= fs.xfs.xfsbufd_centisec v4.0 fs.xfs.age_buffer_centisecs v4.0 + fs.xfs.irix_symlink_mode v6.18 + fs.xfs.irix_sgid_inherit v6.18 + fs.xfs.speculative_cow_prealloc_lifetime v6.18 ============================= ======= Error handling diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 48fe49a5f050f3..309ce6dd555383 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -299,17 +299,6 @@ xfs_inode_init( } else { inode_init_owner(args->idmap, inode, dir, args->mode); } - - /* - * If the group ID of the new file does not match the effective - * group ID or one of the supplementary group IDs, the S_ISGID - * bit is cleared (and only if the irix_sgid_inherit - * compatibility variable is set). - */ - if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && - !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) - inode->i_mode &= ~S_ISGID; - ip->i_projid = xfs_get_initial_prid(pip); } diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f6f628c01febaf..566fd663c95bba 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -14,8 +14,6 @@ */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ - .sgid_inherit = { 0, 0, 1 }, - .symlink_mode = { 0, 0, 1 }, .panic_mask = { 0, 0, XFS_PTAG_MASK}, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 603effabe1ee12..afd041e28bb26a 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -431,14 +431,12 @@ xfs_vn_symlink( struct dentry *dentry, const char *symname) { - struct inode *inode; - struct xfs_inode *cip = NULL; - struct xfs_name name; - int error; - umode_t mode; + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + umode_t mode = S_IFLNK | S_IRWXUGO; - mode = S_IFLNK | - (irix_symlink_mode ? 
0777 & ~current_umask() : S_IRWXUGO); error = xfs_dentry_mode_to_name(&name, dentry, mode); if (unlikely(error)) goto out; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9a2221b4aa21ed..4dd747bdbccab2 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -89,8 +89,6 @@ typedef __u32 xfs_nlink_t; #undef XFS_NATIVE_HOST #endif -#define irix_sgid_inherit xfs_params.sgid_inherit.val -#define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val #define xfs_error_level xfs_params.error_level.val #define xfs_syncd_centisecs xfs_params.syncd_timer.val diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 751dc74a30671a..9918f14b4874fd 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -50,7 +50,7 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ -STATIC int +static inline int xfs_deprecated_dointvec_minmax( const struct ctl_table *ctl, int write, @@ -67,24 +67,6 @@ xfs_deprecated_dointvec_minmax( } static const struct ctl_table xfs_table[] = { - { - .procname = "irix_sgid_inherit", - .data = &xfs_params.sgid_inherit.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.sgid_inherit.min, - .extra2 = &xfs_params.sgid_inherit.max - }, - { - .procname = "irix_symlink_mode", - .data = &xfs_params.symlink_mode.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.symlink_mode.min, - .extra2 = &xfs_params.symlink_mode.max - }, { .procname = "panic_mask", .data = &xfs_params.panic_mask.val, @@ -185,15 +167,6 @@ static const struct ctl_table xfs_table[] = { .extra1 = &xfs_params.blockgc_timer.min, .extra2 = &xfs_params.blockgc_timer.max, }, - { - .procname = "speculative_cow_prealloc_lifetime", - .data = &xfs_params.blockgc_timer.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.blockgc_timer.min, - .extra2 = &xfs_params.blockgc_timer.max, - }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 51646f066c4f7d..ed9d896079c1a8 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -19,9 +19,6 @@ typedef struct xfs_sysctl_val { } xfs_sysctl_val_t; typedef struct xfs_param { - xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is - * not a member of parent dir GID. */ - xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ From 07c34f8cef69cb8eeef69c18d6cf0c04fbee3cb3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 18:04:57 -0700 Subject: [PATCH 0914/3206] xfs: use deferred reaping for data device cow extents Don't roll the whole transaction after every extent, that's rather inefficient. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 82910188111dd7..07f5bb8a642124 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -445,7 +445,7 @@ xreap_agextent_iter( */ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); - xreap_force_defer_finish(rs); + xreap_inc_defer(rs); return 0; } @@ -486,7 +486,7 @@ xreap_agextent_iter( if (error) return error; - xreap_force_defer_finish(rs); + xreap_inc_defer(rs); return 0; } From 0ff51a1fd786f47ba435ede6209046959bad54a8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 27 Aug 2025 09:31:59 -0700 Subject: [PATCH 0915/3206] xfs: enable online fsck by default in Kconfig Online fsck has been a part of upstream for over a year now without any serious problems. Turn it on by default in time for the 2025 LTS kernel, and get rid of the "say N if unsure" messages for the default Y options. Signed-off-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino --- fs/xfs/Kconfig | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index ecebd3ebab1342..8930d5254e1da6 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -137,7 +137,7 @@ config XFS_BTREE_IN_MEM config XFS_ONLINE_SCRUB bool "XFS online metadata check support" - default n + default y depends on XFS_FS depends on TMPFS && SHMEM select XFS_LIVE_HOOKS @@ -150,12 +150,8 @@ config XFS_ONLINE_SCRUB advantage here is to look for problems proactively so that they can be dealt with in a controlled manner. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y @@ -171,11 +167,9 @@ config XFS_ONLINE_SCRUB_STATS Usage data are collected in /sys/kernel/debug/xfs/scrub. - If unsure, say N. - config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" - default n + default y depends on XFS_FS && XFS_ONLINE_SCRUB select XFS_BTREE_IN_MEM help @@ -186,12 +180,8 @@ config XFS_ONLINE_REPAIR formatted with secondary metadata, such as reverse mappings and inode parent pointers. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG From d6256771d106172cf7b6e6bfc49f5a1f2ff8c76c Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 10:54:35 +0200 Subject: [PATCH 0916/3206] cgroup: replace use of system_wq with system_percpu_wq Currently if a user enqueue a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistentcy cannot be addressed without refactoring the API. system_wq is a per-CPU worqueue, yet nothing in its name tells about that CPU affinity constraint, which is very often not required by users. Make it clear by adding a system_percpu_wq. queue_work() / queue_delayed_work() mod_delayed_work() will now use the new per-cpu wq: whether the user still stick on the old name a warn will be printed along a wq redirect to the new one. 
This patch adds the new system_percpu_wq except for the mm, fs and net subsystems, which are handled in separate patches. The old wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e7acfaa495179a..06a9b4a17d4172 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -124,7 +124,7 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup - * destruction work items don't end up filling up max_active of system_wq + * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock. */ static struct workqueue_struct *cgroup_destroy_wq; From 7fa33aa3b001758352c8d0abb9f212a5bb9ed46a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 10:54:36 +0200 Subject: [PATCH 0917/3206] cgroup: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This patch adds a new WQ_PERCPU flag to explicitly request the use of the per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 +- kernel/cgroup/cgroup.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 763343fbd5a174..0a23b65de013fc 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1326,7 +1326,7 @@ static int __init cgroup1_wq_init(void) * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); + WQ_PERCPU, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 06a9b4a17d4172..99d3b6c0f328c4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6346,7 +6346,7 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. 
*/ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", WQ_PERCPU, 1); BUG_ON(!cgroup_destroy_wq); return 0; } From f6cfa602d2ba7e5ca9dc65ec4141521aca80bda2 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:13:23 +0200 Subject: [PATCH 0918/3206] workqueue: replace use of system_unbound_wq with system_dfl_wq Currently, if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_unbound_wq should be the default workqueue so as not to enforce locality constraints for random work whenever it's not required. Add system_dfl_wq to encourage its use when unbound work should be used. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new unbound wq: if a user still uses the old wq, a warning will be printed along with a redirect to the new one. The old system_unbound_wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 ++-- kernel/workqueue.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 45d5dd470ff609..af860e8f848145 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -783,8 +783,8 @@ extern void __warn_flushing_systemwide_wq(void) _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ - (__builtin_constant_p(_wq == system_unbound_wq) && \ - _wq == system_unbound_wq) || \ + (__builtin_constant_p(_wq == system_dfl_wq) && \ + _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ _wq == system_freezable_wq) || \ (__builtin_constant_p(_wq == system_power_efficient_wq) && \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 59faf857ee4f5a..2888f4399acdbd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2932,7 +2932,7 @@ static void idle_worker_timeout(struct timer_list *t) raw_spin_unlock_irq(&pool->lock); if (do_cull) - queue_work(system_unbound_wq, &pool->idle_cull_work); + queue_work(system_dfl_wq, &pool->idle_cull_work); } /** From a2be943b46b4a7478ea8ddf9bb8e5251c59fceb7 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:13:24 +0200 Subject: [PATCH 0919/3206] workqueue: replace use of system_wq with system_percpu_wq Currently, if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_wq is a per-CPU workqueue, yet nothing in its name conveys that CPU affinity constraint, which is very often not required by users. Make it clear by adding a system_percpu_wq. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new per-cpu wq: if a user still sticks to the old name, a warning will be printed along with a redirect to the new workqueue. 
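Together with system_dfl_wq from the previous patch, the two new names make the placement decision explicit at each call site. A small illustrative sketch (hypothetical work items, not code from this series):

#include <linux/workqueue.h>

static void stats_fn(struct work_struct *work) { /* ... */ }
static void cleanup_fn(struct work_struct *work) { /* ... */ }

static DECLARE_WORK(stats_work, stats_fn);
static DECLARE_WORK(cleanup_work, cleanup_fn);

static void kick_work(void)
{
	/* Must run on the submitting CPU: say so explicitly. */
	queue_work(system_percpu_wq, &stats_work);

	/* No locality requirement: let the scheduler place the worker. */
	queue_work(system_dfl_wq, &cleanup_work);
}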
This patch adds the new system_percpu_wq, except for the mm, fs and net subsystems, which are handled in separate patches. The old wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 22 +++++++++++----------- kernel/workqueue.c | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index af860e8f848145..b6834b7aee4bce 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -434,10 +434,10 @@ enum wq_consts { * short queue flush time. Don't queue works which can run for too * long. * - * system_highpri_wq is similar to system_wq but for work items which + * system_highpri_wq is similar to system_percpu_wq but for work items which * require WQ_HIGHPRI. * - * system_long_wq is similar to system_wq but may host long running + * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * * system_dfl_wq is unbound workqueue. Workers are not bound to @@ -445,13 +445,13 @@ enum wq_consts { * executed immediately as long as max_active limit is not reached and * resources are available. * - * system_freezable_wq is equivalent to system_wq except that it's + * system_freezable_wq is equivalent to system_percpu_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. - * system_power_efficient_wq is identical to system_wq if + * system_power_efficient_wq is identical to system_percpu_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. * * system_bh[_highpri]_wq are convenience interface to softirq.
BH work items @@ -708,7 +708,7 @@ static inline bool mod_delayed_work(struct workqueue_struct *wq, */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, system_wq, work); + return queue_work_on(cpu, system_percpu_wq, work); } /** @@ -727,7 +727,7 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work) */ static inline bool schedule_work(struct work_struct *work) { - return queue_work(system_wq, work); + return queue_work(system_percpu_wq, work); } /** @@ -770,15 +770,15 @@ extern void __warn_flushing_systemwide_wq(void) #define flush_scheduled_work() \ ({ \ __warn_flushing_systemwide_wq(); \ - __flush_workqueue(system_wq); \ + __flush_workqueue(system_percpu_wq); \ }) #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ \ - if ((__builtin_constant_p(_wq == system_wq) && \ - _wq == system_wq) || \ + if ((__builtin_constant_p(_wq == system_percpu_wq) && \ + _wq == system_percpu_wq) || \ (__builtin_constant_p(_wq == system_highpri_wq) && \ _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ @@ -807,7 +807,7 @@ extern void __warn_flushing_systemwide_wq(void) static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, system_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_percpu_wq, dwork, delay); } /** @@ -821,7 +821,7 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(system_wq, dwork, delay); + return queue_delayed_work(system_percpu_wq, dwork, delay); } #ifndef CONFIG_SMP diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2888f4399acdbd..90db8cf015c2fb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7668,7 +7668,7 @@ static int wq_watchdog_param_set_thresh(const char *val, if (ret) return ret; - if (system_wq) + if (system_percpu_wq) wq_watchdog_set_thresh(thresh); else wq_watchdog_thresh = thresh; From 5a91f52c8650334aaf8c4c7c90f40c6906994225 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 12 Aug 2025 10:29:23 -0400 Subject: [PATCH 0920/3206] MAINTAINERS: update btrfs entry This is an update to reflect reality, not a signal of any seismic change. Dave Sterba has been the acting maintainer for almost a decade, I've simply been here as a backstop in case he gets hit by a bus. The fact is we have a strong and thriving community with any number of more active developers that can take on that role if it's necessary. I'm exceedingly happy and proud of the work that Dave has done in keeping us all in line, and know that if further changes need to be made it'll be with the development community we've built throughout the lifetime of btrfs. 
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 10850512c1186d..bc4f57d179d7cf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5157,7 +5157,6 @@ F: drivers/gpio/gpio-bt8xx.c BTRFS FILE SYSTEM M: Chris Mason -M: Josef Bacik M: David Sterba L: linux-btrfs@vger.kernel.org S: Maintained From 3d1267475b94b3df7a61e4ea6788c7c5d9e473c4 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 2 Sep 2025 11:34:10 +0100 Subject: [PATCH 0921/3206] btrfs: don't allow adding block device of less than 1 MB Commit 15ae0410c37a79 ("btrfs-progs: add error handling for device_get_partition_size_fd_stat()") in btrfs-progs inadvertently changed it so that if the BLKGETSIZE64 ioctl on a block device returned a size of 0, this was no longer seen as an error condition. Unfortunately this is how disconnected NBD devices behave, meaning that with btrfs-progs 6.16 it's now possible to add a device you can't remove: # btrfs device add /dev/nbd0 /root/temp # btrfs device remove /dev/nbd0 /root/temp ERROR: error removing device '/dev/nbd0': Invalid argument This check should always have been done kernel-side anyway, so add a check in btrfs_init_new_device() that the new device doesn't have a size less than BTRFS_DEVICE_RANGE_RESERVED (i.e. 1 MB). Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fa7a929a046190..c6e3efd6f60218 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error; } + if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) { + ret = -EINVAL; + goto error; + } + if (fs_devices->seeding) { seeding_dev = true; down_write(&sb->s_umount); From e12873ee856ffa6f104869b8ea10c0f741606f13 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Sep 2025 15:08:35 +0100 Subject: [PATCH 0922/3206] selftests/bpf: Add BPF program dump in veristat Add the ability to dump BPF program instructions directly from veristat. Previously, inspecting a program required separate bpftool invocations: one to load and another to dump it, which meant running multiple commands. During active development, it's common for developers to use veristat for testing verification. Integrating instruction dumping into veristat reduces the need to switch tools and simplifies the workflow. By making this information more readily accessible, this change aims to streamline the BPF development cycle and improve usability for developers. This implementation leverages bpftool, by running it directly via popen to avoid any code duplication and keep veristat simple. 
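Reduced to its essence, the approach looks like the following sketch (simplified, not the exact veristat code; the real version in the diff below also checks that bpftool is installed and handles the jited mode):

#include <stdio.h>

/* Shell out to bpftool and stream its output to stdout. */
static void dump_xlated(unsigned int prog_id)
{
	char cmd[64], buf[4096];
	FILE *fp;

	snprintf(cmd, sizeof(cmd), "bpftool prog dump xlated id %u", prog_id);
	fp = popen(cmd, "r");
	if (!fp)
		return;
	while (fgets(buf, sizeof(buf), fp))
		fputs(buf, stdout);
	pclose(fp);
}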
Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250905140835.1416179-1-mykyta.yatsenko5@gmail.com --- tools/testing/selftests/bpf/veristat.c | 56 +++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index d532dd82a3a83e..e962f133250c7d 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -181,6 +181,12 @@ struct var_preset { bool applied; }; +enum dump_mode { + DUMP_NONE = 0, + DUMP_XLATED = 1, + DUMP_JITED = 2, +}; + static struct env { char **filenames; int filename_cnt; @@ -227,6 +233,7 @@ static struct env { char orig_cgroup[PATH_MAX]; char stat_cgroup[PATH_MAX]; int memory_peak_fd; + __u32 dump_mode; } env; static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) @@ -271,6 +278,7 @@ const char argp_program_doc[] = enum { OPT_LOG_FIXED = 1000, OPT_LOG_SIZE = 1001, + OPT_DUMP = 1002, }; static const struct argp_option opts[] = { @@ -295,6 +303,7 @@ static const struct argp_option opts[] = { "Force BPF verifier failure on register invariant violation (BPF_F_TEST_REG_INVARIANTS program flag)" }, { "top-src-lines", 'S', "N", 0, "Emit N most frequent source code lines" }, { "set-global-vars", 'G', "GLOBAL", 0, "Set global variables provided in the expression, for example \"var1 = 1\"" }, + { "dump", OPT_DUMP, "DUMP_MODE", OPTION_ARG_OPTIONAL, "Print BPF program dump (xlated, jited)" }, {}, }; @@ -427,6 +436,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return err; } break; + case OPT_DUMP: + if (!arg || strcasecmp(arg, "xlated") == 0) { + env.dump_mode |= DUMP_XLATED; + } else if (strcasecmp(arg, "jited") == 0) { + env.dump_mode |= DUMP_JITED; + } else { + fprintf(stderr, "Unrecognized dump mode '%s'\n", arg); + return -EINVAL; + } + break; default: return ARGP_ERR_UNKNOWN; } @@ -1554,6 +1573,36 @@ static int parse_rvalue(const char *val, struct rvalue *rvalue) return 0; } +static void dump(__u32 prog_id, enum dump_mode mode, const char *file_name, const char *prog_name) +{ + char command[64], buf[4096]; + FILE *fp; + int status; + + status = system("command -v bpftool > /dev/null 2>&1"); + if (status != 0) { + fprintf(stderr, "bpftool is not available, can't print program dump\n"); + return; + } + snprintf(command, sizeof(command), "bpftool prog dump %s id %u", + mode == DUMP_JITED ? "jited" : "xlated", prog_id); + fp = popen(command, "r"); + if (!fp) { + fprintf(stderr, "bpftool failed with error: %d\n", errno); + return; + } + + printf("DUMP (%s) %s/%s:\n", mode == DUMP_JITED ? 
"JITED" : "XLATED", file_name, prog_name); + while (fgets(buf, sizeof(buf), fp)) + fputs(buf, stdout); + fprintf(stdout, "\n"); + + if (ferror(fp)) + fprintf(stderr, "Failed to dump BPF prog with error: %d\n", errno); + + pclose(fp); +} + static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog) { const char *base_filename = basename(strdupa(filename)); @@ -1630,8 +1679,13 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf memset(&info, 0, info_len); fd = bpf_program__fd(prog); - if (fd > 0 && bpf_prog_get_info_by_fd(fd, &info, &info_len) == 0) + if (fd > 0 && bpf_prog_get_info_by_fd(fd, &info, &info_len) == 0) { stats->stats[JITED_SIZE] = info.jited_prog_len; + if (env.dump_mode & DUMP_JITED) + dump(info.id, DUMP_JITED, base_filename, prog_name); + if (env.dump_mode & DUMP_XLATED) + dump(info.id, DUMP_XLATED, base_filename, prog_name); + } parse_verif_log(buf, buf_sz, stats); From 992203a1fba51b025c60ec0c8b0d9223343dea95 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Thu, 7 Aug 2025 12:49:38 -0400 Subject: [PATCH 0923/3206] nfs/localio: restore creds before releasing pageio data Otherwise if the nfsd filecache code releases the nfsd_file immediately, it can trigger the BUG_ON(cred == current->cred) in __put_cred() when it puts the nfsd_file->nf_file->f-cred. Fixes: b9f5dd57f4a5 ("nfs/localio: use dedicated workqueues for filesystem read and write") Signed-off-by: Scott Mayhew Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20250807164938.2395136-1-smayhew@redhat.com Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index bd5fca28589989..bdb82a19136aaa 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -453,12 +453,13 @@ static void nfs_local_call_read(struct work_struct *work) nfs_local_iter_init(&iter, iocb, READ); status = filp->f_op->read_iter(&iocb->kiocb, &iter); + + revert_creds(save_cred); + if (status != -EIOCBQUEUED) { nfs_local_read_done(iocb, status); nfs_local_pgio_release(iocb); } - - revert_creds(save_cred); } static int @@ -648,14 +649,15 @@ static void nfs_local_call_write(struct work_struct *work) file_start_write(filp); status = filp->f_op->write_iter(&iocb->kiocb, &iter); file_end_write(filp); + + revert_creds(save_cred); + current->flags = old_flags; + if (status != -EIOCBQUEUED) { nfs_local_write_done(iocb, status); nfs_local_vfs_getattr(iocb); nfs_local_pgio_release(iocb); } - - revert_creds(save_cred); - current->flags = old_flags; } static int From ca50b295fd473ef797b69b8538036cca716f3d55 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 4 Jul 2025 10:54:13 +0300 Subject: [PATCH 0924/3206] HSI: omap_ssi_port: Remove redundant pm_runtime_mark_last_busy() calls pm_runtime_put_autosuspend(), pm_runtime_put_sync_autosuspend(), pm_runtime_autosuspend() and pm_request_autosuspend() now include a call to pm_runtime_mark_last_busy(). Remove the now-reduntant explicit call to pm_runtime_mark_last_busy(). 
Signed-off-by: Sakari Ailus Link: https://lore.kernel.org/r/20250704075413.3218357-1-sakari.ailus@linux.intel.com Signed-off-by: Sebastian Reichel --- drivers/hsi/controllers/omap_ssi_port.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/hsi/controllers/omap_ssi_port.c b/drivers/hsi/controllers/omap_ssi_port.c index aeb92b803a177a..50dde968febe87 100644 --- a/drivers/hsi/controllers/omap_ssi_port.c +++ b/drivers/hsi/controllers/omap_ssi_port.c @@ -362,7 +362,6 @@ static int ssi_async_break(struct hsi_msg *msg) spin_unlock_bh(&omap_port->lock); } out: - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); return err; @@ -401,7 +400,6 @@ static int ssi_async(struct hsi_msg *msg) msg->status = HSI_STATUS_ERROR; } spin_unlock_bh(&omap_port->lock); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); dev_dbg(&port->device, "msg status %d ttype %d ch %d\n", msg->status, msg->ttype, msg->channel); @@ -504,7 +502,6 @@ static int ssi_setup(struct hsi_client *cl) omap_port->ssr.mode = cl->rx_cfg.mode; out: spin_unlock_bh(&omap_port->lock); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); return err; @@ -570,7 +567,6 @@ static int ssi_flush(struct hsi_client *cl) pinctrl_pm_select_default_state(omap_port->pdev); spin_unlock_bh(&omap_port->lock); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); return 0; @@ -625,7 +621,6 @@ static int ssi_stop_tx(struct hsi_client *cl) writel(SSI_WAKE(0), omap_ssi->sys + SSI_CLEAR_WAKE_REG(port->num)); spin_unlock_bh(&omap_port->wk_lock); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); /* Release clocks */ @@ -653,7 +648,6 @@ static void ssi_transfer(struct omap_ssi_port *omap_port, } } spin_unlock_bh(&omap_port->lock); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); } @@ -683,7 +677,6 @@ static void ssi_cleanup_queues(struct hsi_client *cl) txbufstate |= (1 << i); status |= SSI_DATAACCEPT(i); /* Release the clocks writes, also GDD ones */ - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); } ssi_flush_queue(&omap_port->txqueue[i], cl); @@ -739,7 +732,6 @@ static void ssi_cleanup_gdd(struct hsi_controller *ssi, struct hsi_client *cl) * ssi_cleanup_queues */ if (msg->ttype == HSI_MSG_READ) { - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); } omap_ssi->gdd_trn[i].msg = NULL; @@ -936,7 +928,6 @@ static void ssi_pio_complete(struct hsi_port *port, struct list_head *queue) reg = readl(omap_ssi->sys + SSI_MPU_ENABLE_REG(port->num, 0)); if (msg->ttype == HSI_MSG_WRITE) { /* Release clocks for write transfer */ - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); } reg &= ~val; @@ -981,7 +972,6 @@ static irqreturn_t ssi_pio_thread(int irq, void *ssi_port) /* TODO: sleep if we retry? 
*/ } while (status_reg); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); return IRQ_HANDLED; @@ -1018,7 +1008,6 @@ static irqreturn_t ssi_wake_thread(int irq __maybe_unused, void *ssi_port) } hsi_event(port, HSI_EVENT_STOP_RX); if (test_and_clear_bit(SSI_WAKE_EN, &omap_port->flags)) { - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); } } From 437054b1bbe11be87ab0a522b8ccbae3f785c642 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 15 Aug 2025 12:41:10 +0200 Subject: [PATCH 0925/3206] vdso: Add struct __kernel_old_timeval forward declaration to gettime.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prototype of __vdso_gettimeofday() uses this struct. However, gettime.h's own includes do not provide a definition for it. Add a forward declaration, similar to other used structs. Fixes: 42874e4eb35b ("arch: vdso: consolidate gettime prototypes") Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250815-vdso-sparc64-generic-2-v2-1-b5ff80672347@linutronix.de --- include/vdso/gettime.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/vdso/gettime.h b/include/vdso/gettime.h index c50d152e7b3e06..9ac161866653a0 100644 --- a/include/vdso/gettime.h +++ b/include/vdso/gettime.h @@ -5,6 +5,7 @@ #include struct __kernel_timespec; +struct __kernel_old_timeval; struct timezone; #if !defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) From 80d03a40837a9b26750a25122b906c052cc846c9 Mon Sep 17 00:00:00 2001 From: Vladimir Riabchun Date: Tue, 26 Aug 2025 18:16:46 +0200 Subject: [PATCH 0926/3206] ftrace/samples: Fix function size computation In the my_tramp1 function, the .size directive was placed above the ASM_RET instruction, leading to a wrong function size. Link: https://lore.kernel.org/aK3d7vxNcO52kEmg@vova-pc Fixes: 9d907f1ae80b ("samples/ftrace: Fix asm function ELF annotations") Signed-off-by: Vladimir Riabchun Signed-off-by: Steven Rostedt (Google) --- samples/ftrace/ftrace-direct-modify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index cfea7a38befb05..da3a9f2091f55b 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -75,8 +75,8 @@ asm ( CALL_DEPTH_ACCOUNT " call my_direct_func1\n" " leave\n" -" .size my_tramp1, .-my_tramp1\n" ASM_RET +" .size my_tramp1, .-my_tramp1\n" " .type my_tramp2, @function\n" " .globl my_tramp2\n" From 6a93f54333979c2948e9c1e71ea0b377b486a3f5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 31 Aug 2025 14:29:37 +0200 Subject: [PATCH 0927/3206] power: supply: Add adc-battery-helper The TI PMIC used on some Intel Bay/Cherry Trail systems has some builtin fuel-gauge functionality which, just like the UG3105 fuel-gauge, is not a full-featured autonomous fuel-gauge. These fuel-gauges offer accurate current and voltage measurements but their coulomb-counters are intended to work together with an always-on micro-controller monitoring the fuel-gauge. Add an adc-battery-helper offering open-circuit-voltage (ocv) estimation and, through that, capacity estimation for devices where such limited-functionality fuel-gauges are exposed directly to Linux. This is a copy of the existing UG3105 estimating code, generalized so that it can be re-used in other drivers.
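The core of the estimation is the usual OCV relation: subtract the voltage drop across the battery's internal resistance from the measured terminal voltage, then look the result up in the battery's ocv-capacity table. A worked example with illustrative numbers (not taken from any specific battery):

	/*
	 * volt_uv = 3900000 (3.9 V), curr_ua = -500000 (0.5 A discharge),
	 * intern_res_mohm = 150:
	 * ocv_uv = 3900000 - (-500000 * 150 / 1000) = 3975000 uV
	 */
	int ocv_uv = volt_uv - curr_ua * intern_res_mohm / 1000;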
The next commit will replace the UG3105 driver's version of this code with the adc-battery-helper. The API has been designed for easy integration into existing power-supply drivers. For example, this functionality might also be a useful addition to the generic-adc-battery driver. The requirement that the adc_battery_helper struct be the first member of a battery driver's data struct is not ideal. This is a compromise which is necessary to allow directly using the helper's get_property(), external_power_changed() and suspend()/resume() functions as power-supply / suspend-resume callbacks. Reviewed-by: Linus Walleij Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20250831122942.47875-2-hansg@kernel.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/Kconfig | 3 + drivers/power/supply/Makefile | 1 + drivers/power/supply/adc-battery-helper.c | 315 ++++++++++++++++++++++ drivers/power/supply/adc-battery-helper.h | 59 ++++ 4 files changed, 378 insertions(+) create mode 100644 drivers/power/supply/adc-battery-helper.c create mode 100644 drivers/power/supply/adc-battery-helper.h diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig index 79ddb006e2dad6..8cfaded2cc5c7e 100644 --- a/drivers/power/supply/Kconfig +++ b/drivers/power/supply/Kconfig @@ -35,6 +35,9 @@ config APM_POWER Say Y here to enable support APM status emulation using battery class devices. +config ADC_BATTERY_HELPER + tristate + config GENERIC_ADC_BATTERY tristate "Generic battery support using IIO" depends on IIO diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile index f943c9150b326d..c85c5c441e0fe3 100644 --- a/drivers/power/supply/Makefile +++ b/drivers/power/supply/Makefile @@ -7,6 +7,7 @@ power_supply-$(CONFIG_LEDS_TRIGGERS) += power_supply_leds.o obj-$(CONFIG_POWER_SUPPLY) += power_supply.o obj-$(CONFIG_POWER_SUPPLY_HWMON) += power_supply_hwmon.o +obj-$(CONFIG_ADC_BATTERY_HELPER) += adc-battery-helper.o obj-$(CONFIG_GENERIC_ADC_BATTERY) += generic-adc-battery.o obj-$(CONFIG_APM_POWER) += apm_power.o diff --git a/drivers/power/supply/adc-battery-helper.c b/drivers/power/supply/adc-battery-helper.c new file mode 100644 index 00000000000000..f8f6d2f8ec89a5 --- /dev/null +++ b/drivers/power/supply/adc-battery-helper.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Helper for batteries with accurate current and voltage measurement, but + * without temperature measurement or without a "resistance-temp-table". + * + * Some fuel-gauges are not full-featured autonomous fuel-gauges. + * These fuel-gauges offer accurate current and voltage measurements but + * their coulomb-counters are intended to work together with an always-on + * micro-controller monitoring the fuel-gauge. + * + * This adc-battery-helper code offers open-circuit-voltage (ocv) estimation + * and, through that, capacity estimation for devices where such + * limited-functionality fuel-gauges are exposed directly to Linux.
+ * + * This helper requires the hw to provide accurate battery current_now and + * voltage_now measurements, and this helper then provides the following + * properties on top of those readings: + * + * POWER_SUPPLY_PROP_STATUS + * POWER_SUPPLY_PROP_VOLTAGE_OCV + * POWER_SUPPLY_PROP_VOLTAGE_NOW + * POWER_SUPPLY_PROP_CURRENT_NOW + * POWER_SUPPLY_PROP_CAPACITY + * + * As well as, optionally, the following properties, assuming an always-present + * system-scope battery, allowing direct use of adc_battery_helper_get_prop() + * in this common case: + * POWER_SUPPLY_PROP_PRESENT + * POWER_SUPPLY_PROP_SCOPE + * + * Using this helper is as simple as: + * + * 1. Embed a struct adc_battery_helper; this MUST be the first member of + * the battery driver's data struct. + * 2. Use adc_battery_helper_props[] or add the above properties to + * the list of properties in power_supply_desc + * 3. Call adc_battery_helper_init() after registering the power_supply and + * before returning from the probe() function + * 4. Use adc_battery_helper_get_prop() as the power-supply's get_property() + * method, or call it for the above properties. + * 5. Use adc_battery_helper_external_power_changed() as the power-supply's + * external_power_changed() method or call it from that method. + * 6. Use adc_battery_helper_[suspend|resume]() as suspend-resume methods or + * call them from the driver's suspend-resume methods. + * + * The provided get_voltage_and_current_now() method will be called by this + * helper at adc_battery_helper_init() time and later. + * + * Copyright (c) 2021-2025 Hans de Goede + */ + +#include +#include +#include +#include +#include + +#include "adc-battery-helper.h" + +#define MOV_AVG_WINDOW_SIZE ADC_BAT_HELPER_MOV_AVG_WINDOW_SIZE +#define INIT_POLL_TIME (5 * HZ) +#define POLL_TIME (30 * HZ) +#define SETTLE_TIME (1 * HZ) + +#define INIT_POLL_COUNT 30 + +#define CURR_HYST_UA 65000 + +#define LOW_BAT_UV 3700000 +#define FULL_BAT_HYST_UV 38000 + +#define AMBIENT_TEMP_CELSIUS 25 + +static int adc_battery_helper_get_status(struct adc_battery_helper *help) +{ + int full_uv = + help->psy->battery_info->constant_charge_voltage_max_uv - FULL_BAT_HYST_UV; + + if (help->curr_ua > CURR_HYST_UA) + return POWER_SUPPLY_STATUS_CHARGING; + + if (help->curr_ua < -CURR_HYST_UA) + return POWER_SUPPLY_STATUS_DISCHARGING; + + if (help->supplied && help->ocv_avg_uv > full_uv) + return POWER_SUPPLY_STATUS_FULL; + + return POWER_SUPPLY_STATUS_NOT_CHARGING; +} + +static void adc_battery_helper_work(struct work_struct *work) +{ + struct adc_battery_helper *help = container_of(work, struct adc_battery_helper, + work.work); + int i, curr_diff_ua, volt_diff_uv, res_mohm, ret, win_size; + struct device *dev = help->psy->dev.parent; + int volt_uv, prev_volt_uv = help->volt_uv; + int curr_ua, prev_curr_ua = help->curr_ua; + bool prev_supplied = help->supplied; + int prev_status = help->status; + + guard(mutex)(&help->lock); + + ret = help->get_voltage_and_current_now(help->psy, &volt_uv, &curr_ua); + if (ret) + goto out; + + help->volt_uv = volt_uv; + help->curr_ua = curr_ua; + + help->ocv_uv[help->ocv_avg_index] = + help->volt_uv - help->curr_ua * help->intern_res_avg_mohm / 1000; + dev_dbg(dev, "volt-now: %d, curr-now: %d, volt-ocv: %d\n", + help->volt_uv, help->curr_ua, help->ocv_uv[help->ocv_avg_index]); + help->ocv_avg_index = (help->ocv_avg_index + 1) % MOV_AVG_WINDOW_SIZE; + help->poll_count++; + + help->ocv_avg_uv = 0; + win_size = min(help->poll_count, MOV_AVG_WINDOW_SIZE); + for (i = 0; i < win_size; i++) +
help->ocv_avg_uv += help->ocv_uv[i]; + help->ocv_avg_uv /= win_size; + + help->supplied = power_supply_am_i_supplied(help->psy); + help->status = adc_battery_helper_get_status(help); + if (help->status == POWER_SUPPLY_STATUS_FULL) + help->capacity = 100; + else + help->capacity = power_supply_batinfo_ocv2cap(help->psy->battery_info, + help->ocv_avg_uv, + AMBIENT_TEMP_CELSIUS); + + /* + * Skip internal resistance calc on charger [un]plug and + * when the battery is almost empty (voltage low). + */ + if (help->supplied != prev_supplied || + help->volt_uv < LOW_BAT_UV || + help->poll_count < 2) + goto out; + + /* + * Assuming that the OCV voltage does not change significantly + * between 2 polls, then we can calculate the internal resistance + * on a significant current change by attributing all voltage + * change between the 2 readings to the internal resistance. + */ + curr_diff_ua = abs(help->curr_ua - prev_curr_ua); + if (curr_diff_ua < CURR_HYST_UA) + goto out; + + volt_diff_uv = abs(help->volt_uv - prev_volt_uv); + res_mohm = volt_diff_uv * 1000 / curr_diff_ua; + + if ((res_mohm < (help->intern_res_avg_mohm * 2 / 3)) || + (res_mohm > (help->intern_res_avg_mohm * 4 / 3))) { + dev_dbg(dev, "Ignoring outlier internal resistance %d mOhm\n", res_mohm); + goto out; + } + + dev_dbg(dev, "Internal resistance %d mOhm\n", res_mohm); + + help->intern_res_mohm[help->intern_res_avg_index] = res_mohm; + help->intern_res_avg_index = (help->intern_res_avg_index + 1) % MOV_AVG_WINDOW_SIZE; + help->intern_res_poll_count++; + + help->intern_res_avg_mohm = 0; + win_size = min(help->intern_res_poll_count, MOV_AVG_WINDOW_SIZE); + for (i = 0; i < win_size; i++) + help->intern_res_avg_mohm += help->intern_res_mohm[i]; + help->intern_res_avg_mohm /= win_size; + +out: + queue_delayed_work(system_wq, &help->work, + (help->poll_count <= INIT_POLL_COUNT) ? + INIT_POLL_TIME : POLL_TIME); + + if (help->status != prev_status) + power_supply_changed(help->psy); +} + +const enum power_supply_property adc_battery_helper_properties[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_VOLTAGE_OCV, + POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_CAPACITY, + POWER_SUPPLY_PROP_PRESENT, + POWER_SUPPLY_PROP_SCOPE, +}; +EXPORT_SYMBOL_GPL(adc_battery_helper_properties); + +static_assert(ARRAY_SIZE(adc_battery_helper_properties) == + ADC_HELPER_NUM_PROPERTIES); + +int adc_battery_helper_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct adc_battery_helper *help = power_supply_get_drvdata(psy); + int dummy, ret = 0; + + /* + * Avoid racing with adc_battery_helper_work() while it is updating + * variables and avoid calling get_voltage_and_current_now() reentrantly. 
+ */ + guard(mutex)(&help->lock); + + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + val->intval = help->status; + break; + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + ret = help->get_voltage_and_current_now(psy, &val->intval, &dummy); + break; + case POWER_SUPPLY_PROP_VOLTAGE_OCV: + val->intval = help->ocv_avg_uv; + break; + case POWER_SUPPLY_PROP_CURRENT_NOW: + ret = help->get_voltage_and_current_now(psy, &dummy, &val->intval); + break; + case POWER_SUPPLY_PROP_CAPACITY: + val->intval = help->capacity; + break; + case POWER_SUPPLY_PROP_PRESENT: + val->intval = 1; + break; + case POWER_SUPPLY_PROP_SCOPE: + val->intval = POWER_SUPPLY_SCOPE_SYSTEM; + break; + default: + return -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(adc_battery_helper_get_property); + +void adc_battery_helper_external_power_changed(struct power_supply *psy) +{ + struct adc_battery_helper *help = power_supply_get_drvdata(psy); + + dev_dbg(help->psy->dev.parent, "external power changed\n"); + mod_delayed_work(system_wq, &help->work, SETTLE_TIME); +} +EXPORT_SYMBOL_GPL(adc_battery_helper_external_power_changed); + +static void adc_battery_helper_start_work(struct adc_battery_helper *help) +{ + help->poll_count = 0; + help->ocv_avg_index = 0; + + queue_delayed_work(system_wq, &help->work, 0); + flush_delayed_work(&help->work); +} + +int adc_battery_helper_init(struct adc_battery_helper *help, struct power_supply *psy, + adc_battery_helper_get_func get_voltage_and_current_now) +{ + struct device *dev = psy->dev.parent; + int ret; + + help->psy = psy; + help->get_voltage_and_current_now = get_voltage_and_current_now; + + ret = devm_mutex_init(dev, &help->lock); + if (ret) + return ret; + + ret = devm_delayed_work_autocancel(dev, &help->work, adc_battery_helper_work); + if (ret) + return ret; + + if (!help->psy->battery_info || + help->psy->battery_info->factory_internal_resistance_uohm == -EINVAL || + help->psy->battery_info->constant_charge_voltage_max_uv == -EINVAL || + !psy->battery_info->ocv_table[0]) { + dev_err(dev, "error required properties are missing\n"); + return -ENODEV; + } + + /* Use provided internal resistance as start point (in milli-ohm) */ + help->intern_res_avg_mohm = + help->psy->battery_info->factory_internal_resistance_uohm / 1000; + /* Also add it to the internal resistance moving average window */ + help->intern_res_mohm[0] = help->intern_res_avg_mohm; + help->intern_res_avg_index = 1; + help->intern_res_poll_count = 1; + + adc_battery_helper_start_work(help); + return 0; +} +EXPORT_SYMBOL_GPL(adc_battery_helper_init); + +int adc_battery_helper_suspend(struct device *dev) +{ + struct adc_battery_helper *help = dev_get_drvdata(dev); + + cancel_delayed_work_sync(&help->work); + return 0; +} +EXPORT_SYMBOL_GPL(adc_battery_helper_suspend); + +int adc_battery_helper_resume(struct device *dev) +{ + struct adc_battery_helper *help = dev_get_drvdata(dev); + + adc_battery_helper_start_work(help); + return 0; +} +EXPORT_SYMBOL_GPL(adc_battery_helper_resume); + +MODULE_AUTHOR("Hans de Goede "); +MODULE_DESCRIPTION("ADC battery capacity estimation helper"); +MODULE_LICENSE("GPL"); diff --git a/drivers/power/supply/adc-battery-helper.h b/drivers/power/supply/adc-battery-helper.h new file mode 100644 index 00000000000000..90c7edcb9ab1d1 --- /dev/null +++ b/drivers/power/supply/adc-battery-helper.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Helper for batteries with accurate current and voltage measurement, but + * without temperature measurement or without a "resistance-temp-table". 
+ * Copyright (c) 2021-2025 Hans de Goede + */ + +#include +#include + +#define ADC_BAT_HELPER_MOV_AVG_WINDOW_SIZE 8 + +struct power_supply; + +/* + * The adc battery helper code needs voltage- and current-now to be sampled as + * close to each other (in sample-time) as possible. A single getter function is + * used to allow the battery driver to handle this in the best way possible. + */ +typedef int (*adc_battery_helper_get_func)(struct power_supply *psy, int *volt, int *curr); + +struct adc_battery_helper { + struct power_supply *psy; + struct delayed_work work; + struct mutex lock; + adc_battery_helper_get_func get_voltage_and_current_now; + int ocv_uv[ADC_BAT_HELPER_MOV_AVG_WINDOW_SIZE]; /* micro-volt */ + int intern_res_mohm[ADC_BAT_HELPER_MOV_AVG_WINDOW_SIZE]; /* milli-ohm */ + int poll_count; + int ocv_avg_index; + int ocv_avg_uv; /* micro-volt */ + int intern_res_poll_count; + int intern_res_avg_index; + int intern_res_avg_mohm; /* milli-ohm */ + int volt_uv; /* micro-volt */ + int curr_ua; /* micro-ampere */ + int capacity; /* percent */ + int status; + bool supplied; +}; + +extern const enum power_supply_property adc_battery_helper_properties[]; +/* Must be a constant, cannot be an extern; asserted in adc-battery-helper.c */ +#define ADC_HELPER_NUM_PROPERTIES 7 + +int adc_battery_helper_init(struct adc_battery_helper *help, struct power_supply *psy, + adc_battery_helper_get_func get_voltage_and_current_now); +/* + * The below functions can be directly used as power-supply / suspend-resume + * callbacks. They cast the power_supply_get_drvdata() / dev_get_drvdata() data + * directly to struct adc_battery_helper. Therefore struct adc_battery_helper + * MUST be the first member of the battery driver's data struct. + */ +int adc_battery_helper_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val); +void adc_battery_helper_external_power_changed(struct power_supply *psy); +int adc_battery_helper_suspend(struct device *dev); +int adc_battery_helper_resume(struct device *dev); From dcf1e7b73797399f8706226c7ff6ad8f18634db4 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 31 Aug 2025 14:29:38 +0200 Subject: [PATCH 0928/3206] power: supply: ug3105_battery: Switch to adc-battery-helper Switch ug3105_battery to using the new adc-battery-helper. Since the helper's algorithms are a copy of the replaced ug3105_battery code, this should not cause any functional differences. Reviewed-by: Linus Walleij Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20250831122942.47875-3-hansg@kernel.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/Kconfig | 1 + drivers/power/supply/ug3105_battery.c | 344 +++++--------------------- 2 files changed, 65 insertions(+), 280 deletions(-) diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig index 8cfaded2cc5c7e..e33a5360d33a49 100644 --- a/drivers/power/supply/Kconfig +++ b/drivers/power/supply/Kconfig @@ -1046,6 +1046,7 @@ config CHARGER_SURFACE config BATTERY_UG3105 tristate "uPI uG3105 battery monitor driver" depends on I2C + select ADC_BATTERY_HELPER help Battery monitor driver for the uPI uG3105 battery monitor. diff --git a/drivers/power/supply/ug3105_battery.c b/drivers/power/supply/ug3105_battery.c index e8a1de7cade0ce..70dd58e121e361 100644 --- a/drivers/power/supply/ug3105_battery.c +++ b/drivers/power/supply/ug3105_battery.c @@ -10,7 +10,22 @@ * is off or suspended, the coulomb counter is not used atm. * * Possible improvements: - * 1.
Activate commented out total_coulomb_count code + * 1. Add coulumb counter reading, e.g. something like this: + * Read + reset coulomb counter every 10 polls (every 300 seconds) + * + * if ((chip->poll_count % 10) == 0) { + * val = ug3105_read_word(chip->client, UG3105_REG_COULOMB_CNT); + * if (val < 0) + * goto out; + * + * i2c_smbus_write_byte_data(chip->client, UG3105_REG_CTRL1, + * UG3105_CTRL1_RESET_COULOMB_CNT); + * + * chip->total_coulomb_count += (s16)val; + * dev_dbg(&chip->client->dev, "coulomb count %d total %d\n", + * (s16)val, chip->total_coulomb_count); + * } + * * 2. Reset total_coulomb_count val to 0 when the battery is as good as empty * and remember that we did this (and clear the flag for this on susp/resume) * 3. When the battery is full check if the flag that we set total_coulomb_count @@ -31,24 +46,16 @@ * has shown that an estimated 7404mWh increase of the battery's energy results * in a total_coulomb_count increase of 3277 units with a 5 milli-ohm sense R. * - * Copyright (C) 2021 Hans de Goede + * Copyright (C) 2021 - 2025 Hans de Goede */ -#include #include -#include #include #include #include #include -#include - -#define UG3105_MOV_AVG_WINDOW 8 -#define UG3105_INIT_POLL_TIME (5 * HZ) -#define UG3105_POLL_TIME (30 * HZ) -#define UG3105_SETTLE_TIME (1 * HZ) -#define UG3105_INIT_POLL_COUNT 30 +#include "adc-battery-helper.h" #define UG3105_REG_MODE 0x00 #define UG3105_REG_CTRL1 0x01 @@ -61,34 +68,13 @@ #define UG3105_CTRL1_RESET_COULOMB_CNT 0x03 -#define UG3105_CURR_HYST_UA 65000 - -#define UG3105_LOW_BAT_UV 3700000 -#define UG3105_FULL_BAT_HYST_UV 38000 - -#define AMBIENT_TEMP_CELCIUS 25 - struct ug3105_chip { + /* Must be the first member see adc-battery-helper documentation */ + struct adc_battery_helper helper; struct i2c_client *client; struct power_supply *psy; - struct delayed_work work; - struct mutex lock; - int ocv[UG3105_MOV_AVG_WINDOW]; /* micro-volt */ - int intern_res[UG3105_MOV_AVG_WINDOW]; /* milli-ohm */ - int poll_count; - int ocv_avg_index; - int ocv_avg; /* micro-volt */ - int intern_res_poll_count; - int intern_res_avg_index; - int intern_res_avg; /* milli-ohm */ - int volt; /* micro-volt */ - int curr; /* micro-ampere */ - int total_coulomb_count; int uv_per_unit; int ua_per_unit; - int status; - int capacity; - bool supplied; }; static int ug3105_read_word(struct i2c_client *client, u8 reg) @@ -102,230 +88,43 @@ static int ug3105_read_word(struct i2c_client *client, u8 reg) return val; } -static int ug3105_get_status(struct ug3105_chip *chip) -{ - int full = chip->psy->battery_info->constant_charge_voltage_max_uv - - UG3105_FULL_BAT_HYST_UV; - - if (chip->curr > UG3105_CURR_HYST_UA) - return POWER_SUPPLY_STATUS_CHARGING; - - if (chip->curr < -UG3105_CURR_HYST_UA) - return POWER_SUPPLY_STATUS_DISCHARGING; - - if (chip->supplied && chip->ocv_avg > full) - return POWER_SUPPLY_STATUS_FULL; - - return POWER_SUPPLY_STATUS_NOT_CHARGING; -} - -static void ug3105_work(struct work_struct *work) -{ - struct ug3105_chip *chip = container_of(work, struct ug3105_chip, - work.work); - int i, val, curr_diff, volt_diff, res, win_size; - bool prev_supplied = chip->supplied; - int prev_status = chip->status; - int prev_volt = chip->volt; - int prev_curr = chip->curr; - struct power_supply *psy; - - mutex_lock(&chip->lock); - - psy = chip->psy; - if (!psy) - goto out; - - val = ug3105_read_word(chip->client, UG3105_REG_BAT_VOLT); - if (val < 0) - goto out; - chip->volt = val * chip->uv_per_unit; - - val = ug3105_read_word(chip->client, UG3105_REG_BAT_CURR); - if 
(val < 0) - goto out; - chip->curr = (s16)val * chip->ua_per_unit; - - chip->ocv[chip->ocv_avg_index] = - chip->volt - chip->curr * chip->intern_res_avg / 1000; - chip->ocv_avg_index = (chip->ocv_avg_index + 1) % UG3105_MOV_AVG_WINDOW; - chip->poll_count++; - - /* - * See possible improvements comment above. - * - * Read + reset coulomb counter every 10 polls (every 300 seconds) - * if ((chip->poll_count % 10) == 0) { - * val = ug3105_read_word(chip->client, UG3105_REG_COULOMB_CNT); - * if (val < 0) - * goto out; - * - * i2c_smbus_write_byte_data(chip->client, UG3105_REG_CTRL1, - * UG3105_CTRL1_RESET_COULOMB_CNT); - * - * chip->total_coulomb_count += (s16)val; - * dev_dbg(&chip->client->dev, "coulomb count %d total %d\n", - * (s16)val, chip->total_coulomb_count); - * } - */ - - chip->ocv_avg = 0; - win_size = min(chip->poll_count, UG3105_MOV_AVG_WINDOW); - for (i = 0; i < win_size; i++) - chip->ocv_avg += chip->ocv[i]; - chip->ocv_avg /= win_size; - - chip->supplied = power_supply_am_i_supplied(psy); - chip->status = ug3105_get_status(chip); - if (chip->status == POWER_SUPPLY_STATUS_FULL) - chip->capacity = 100; - else - chip->capacity = power_supply_batinfo_ocv2cap(chip->psy->battery_info, - chip->ocv_avg, - AMBIENT_TEMP_CELCIUS); - - /* - * Skip internal resistance calc on charger [un]plug and - * when the battery is almost empty (voltage low). - */ - if (chip->supplied != prev_supplied || - chip->volt < UG3105_LOW_BAT_UV || - chip->poll_count < 2) - goto out; - - /* - * Assuming that the OCV voltage does not change significantly - * between 2 polls, then we can calculate the internal resistance - * on a significant current change by attributing all voltage - * change between the 2 readings to the internal resistance. - */ - curr_diff = abs(chip->curr - prev_curr); - if (curr_diff < UG3105_CURR_HYST_UA) - goto out; - - volt_diff = abs(chip->volt - prev_volt); - res = volt_diff * 1000 / curr_diff; - - if ((res < (chip->intern_res_avg * 2 / 3)) || - (res > (chip->intern_res_avg * 4 / 3))) { - dev_dbg(&chip->client->dev, "Ignoring outlier internal resistance %d mOhm\n", res); - goto out; - } - - dev_dbg(&chip->client->dev, "Internal resistance %d mOhm\n", res); - - chip->intern_res[chip->intern_res_avg_index] = res; - chip->intern_res_avg_index = (chip->intern_res_avg_index + 1) % UG3105_MOV_AVG_WINDOW; - chip->intern_res_poll_count++; - - chip->intern_res_avg = 0; - win_size = min(chip->intern_res_poll_count, UG3105_MOV_AVG_WINDOW); - for (i = 0; i < win_size; i++) - chip->intern_res_avg += chip->intern_res[i]; - chip->intern_res_avg /= win_size; - -out: - mutex_unlock(&chip->lock); - - queue_delayed_work(system_wq, &chip->work, - (chip->poll_count <= UG3105_INIT_POLL_COUNT) ? 
- UG3105_INIT_POLL_TIME : UG3105_POLL_TIME); - - if (chip->status != prev_status && psy) - power_supply_changed(psy); -} - -static enum power_supply_property ug3105_battery_props[] = { - POWER_SUPPLY_PROP_STATUS, - POWER_SUPPLY_PROP_PRESENT, - POWER_SUPPLY_PROP_SCOPE, - POWER_SUPPLY_PROP_VOLTAGE_NOW, - POWER_SUPPLY_PROP_VOLTAGE_OCV, - POWER_SUPPLY_PROP_CURRENT_NOW, - POWER_SUPPLY_PROP_CAPACITY, -}; - -static int ug3105_get_property(struct power_supply *psy, - enum power_supply_property psp, - union power_supply_propval *val) +static int ug3105_get_voltage_and_current_now(struct power_supply *psy, int *volt, int *curr) { struct ug3105_chip *chip = power_supply_get_drvdata(psy); - int ret = 0; - - mutex_lock(&chip->lock); + int ret; - if (!chip->psy) { - ret = -EAGAIN; - goto out; - } + ret = ug3105_read_word(chip->client, UG3105_REG_BAT_VOLT); + if (ret < 0) + return ret; - switch (psp) { - case POWER_SUPPLY_PROP_STATUS: - val->intval = chip->status; - break; - case POWER_SUPPLY_PROP_PRESENT: - val->intval = 1; - break; - case POWER_SUPPLY_PROP_SCOPE: - val->intval = POWER_SUPPLY_SCOPE_SYSTEM; - break; - case POWER_SUPPLY_PROP_VOLTAGE_NOW: - ret = ug3105_read_word(chip->client, UG3105_REG_BAT_VOLT); - if (ret < 0) - break; - val->intval = ret * chip->uv_per_unit; - ret = 0; - break; - case POWER_SUPPLY_PROP_VOLTAGE_OCV: - val->intval = chip->ocv_avg; - break; - case POWER_SUPPLY_PROP_CURRENT_NOW: - ret = ug3105_read_word(chip->client, UG3105_REG_BAT_CURR); - if (ret < 0) - break; - val->intval = (s16)ret * chip->ua_per_unit; - ret = 0; - break; - case POWER_SUPPLY_PROP_CAPACITY: - val->intval = chip->capacity; - break; - default: - ret = -EINVAL; - } + *volt = ret * chip->uv_per_unit; -out: - mutex_unlock(&chip->lock); - return ret; -} - -static void ug3105_external_power_changed(struct power_supply *psy) -{ - struct ug3105_chip *chip = power_supply_get_drvdata(psy); + ret = ug3105_read_word(chip->client, UG3105_REG_BAT_CURR); + if (ret < 0) + return ret; - dev_dbg(&chip->client->dev, "external power changed\n"); - mod_delayed_work(system_wq, &chip->work, UG3105_SETTLE_TIME); + *curr = (s16)ret * chip->ua_per_unit; + return 0; } static const struct power_supply_desc ug3105_psy_desc = { .name = "ug3105_battery", .type = POWER_SUPPLY_TYPE_BATTERY, - .get_property = ug3105_get_property, - .external_power_changed = ug3105_external_power_changed, - .properties = ug3105_battery_props, - .num_properties = ARRAY_SIZE(ug3105_battery_props), + .get_property = adc_battery_helper_get_property, + .external_power_changed = adc_battery_helper_external_power_changed, + .properties = adc_battery_helper_properties, + .num_properties = ADC_HELPER_NUM_PROPERTIES, }; -static void ug3105_init(struct ug3105_chip *chip) +static void ug3105_start(struct i2c_client *client) +{ + i2c_smbus_write_byte_data(client, UG3105_REG_MODE, UG3105_MODE_RUN); + i2c_smbus_write_byte_data(client, UG3105_REG_CTRL1, UG3105_CTRL1_RESET_COULOMB_CNT); +} + +static void ug3105_stop(struct i2c_client *client) { - chip->poll_count = 0; - chip->ocv_avg_index = 0; - chip->total_coulomb_count = 0; - i2c_smbus_write_byte_data(chip->client, UG3105_REG_MODE, - UG3105_MODE_RUN); - i2c_smbus_write_byte_data(chip->client, UG3105_REG_CTRL1, - UG3105_CTRL1_RESET_COULOMB_CNT); - queue_delayed_work(system_wq, &chip->work, 0); - flush_delayed_work(&chip->work); + i2c_smbus_write_byte_data(client, UG3105_REG_MODE, UG3105_MODE_STANDBY); } static int ug3105_probe(struct i2c_client *client) @@ -333,7 +132,6 @@ static int ug3105_probe(struct i2c_client 
*client) struct power_supply_config psy_cfg = {}; struct device *dev = &client->dev; u32 curr_sense_res_uohm = 10000; - struct power_supply *psy; struct ug3105_chip *chip; int ret; @@ -342,23 +140,8 @@ static int ug3105_probe(struct i2c_client *client) return -ENOMEM; chip->client = client; - mutex_init(&chip->lock); - ret = devm_delayed_work_autocancel(dev, &chip->work, ug3105_work); - if (ret) - return ret; - psy_cfg.drv_data = chip; - psy = devm_power_supply_register(dev, &ug3105_psy_desc, &psy_cfg); - if (IS_ERR(psy)) - return PTR_ERR(psy); - - if (!psy->battery_info || - psy->battery_info->factory_internal_resistance_uohm == -EINVAL || - psy->battery_info->constant_charge_voltage_max_uv == -EINVAL || - !psy->battery_info->ocv_table[0]) { - dev_err(dev, "error required properties are missing\n"); - return -ENODEV; - } + ug3105_start(client); device_property_read_u32(dev, "upisemi,rsns-microohm", &curr_sense_res_uohm); @@ -366,35 +149,36 @@ static int ug3105_probe(struct i2c_client *client) * DAC maximum is 4.5V divided by 65536 steps + an unknown factor of 10 * coming from somewhere for some reason (verified with a volt-meter). */ - chip->uv_per_unit = 45000000/65536; + chip->uv_per_unit = 45000000 / 65536; /* Datasheet says 8.1 uV per unit for the current ADC */ chip->ua_per_unit = 8100000 / curr_sense_res_uohm; - /* Use provided internal resistance as start point (in milli-ohm) */ - chip->intern_res_avg = psy->battery_info->factory_internal_resistance_uohm / 1000; - /* Also add it to the internal resistance moving average window */ - chip->intern_res[0] = chip->intern_res_avg; - chip->intern_res_avg_index = 1; - chip->intern_res_poll_count = 1; - - mutex_lock(&chip->lock); - chip->psy = psy; - mutex_unlock(&chip->lock); + psy_cfg.drv_data = chip; + chip->psy = devm_power_supply_register(dev, &ug3105_psy_desc, &psy_cfg); + if (IS_ERR(chip->psy)) { + ret = PTR_ERR(chip->psy); + goto stop; + } - ug3105_init(chip); + ret = adc_battery_helper_init(&chip->helper, chip->psy, + ug3105_get_voltage_and_current_now); + if (ret) + goto stop; i2c_set_clientdata(client, chip); return 0; + +stop: + ug3105_stop(client); + return ret; } static int __maybe_unused ug3105_suspend(struct device *dev) { struct ug3105_chip *chip = dev_get_drvdata(dev); - cancel_delayed_work_sync(&chip->work); - i2c_smbus_write_byte_data(chip->client, UG3105_REG_MODE, - UG3105_MODE_STANDBY); - + adc_battery_helper_suspend(dev); + ug3105_stop(chip->client); return 0; } @@ -402,8 +186,8 @@ static int __maybe_unused ug3105_resume(struct device *dev) { struct ug3105_chip *chip = dev_get_drvdata(dev); - ug3105_init(chip); - + ug3105_start(chip->client); + adc_battery_helper_resume(dev); return 0; } @@ -426,6 +210,6 @@ static struct i2c_driver ug3105_i2c_driver = { }; module_i2c_driver(ug3105_i2c_driver); -MODULE_AUTHOR("Hans de Goede Date: Sun, 31 Aug 2025 14:29:39 +0200 Subject: [PATCH 0929/3206] power: supply: ug3105_battery: Put FG in standby on remove and shutdown Put the fuel-gauge in standby mode when the driver is unbound and on system shutdown. This avoids unnecessary battery drain when the system is off. 
Reviewed-by: Linus Walleij Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20250831122942.47875-4-hansg@kernel.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/ug3105_battery.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/power/supply/ug3105_battery.c b/drivers/power/supply/ug3105_battery.c index 70dd58e121e361..c4d4ac859fa41f 100644 --- a/drivers/power/supply/ug3105_battery.c +++ b/drivers/power/supply/ug3105_battery.c @@ -206,6 +206,8 @@ static struct i2c_driver ug3105_i2c_driver = { .pm = &ug3105_pm_ops, }, .probe = ug3105_probe, + .remove = ug3105_stop, + .shutdown = ug3105_stop, .id_table = ug3105_id, }; module_i2c_driver(ug3105_i2c_driver); From 926b144366c589a0c0c471d02a71590ed24284e0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 31 Aug 2025 14:29:40 +0200 Subject: [PATCH 0930/3206] power: supply: adc-battery-helper: Add support for optional charge_finished GPIO Charger ICs often have a status pin which indicates when the charger has finished charging the battery. Sometimes the status of this pin can be read over a GPIO. Add support for optionally reading a charge-finished GPIO and when available use this to determine when to return POWER_SUPPLY_STATUS_FULL. Reviewed-by: Linus Walleij Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20250831122942.47875-5-hansg@kernel.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/adc-battery-helper.c | 18 +++++++++++++++--- drivers/power/supply/adc-battery-helper.h | 5 ++++- drivers/power/supply/ug3105_battery.c | 2 +- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/power/supply/adc-battery-helper.c b/drivers/power/supply/adc-battery-helper.c index f8f6d2f8ec89a5..229b49fef8ad1c 100644 --- a/drivers/power/supply/adc-battery-helper.c +++ b/drivers/power/supply/adc-battery-helper.c @@ -51,6 +51,7 @@ #include #include +#include #include #include #include @@ -82,8 +83,17 @@ static int adc_battery_helper_get_status(struct adc_battery_helper *help) if (help->curr_ua < -CURR_HYST_UA) return POWER_SUPPLY_STATUS_DISCHARGING; - if (help->supplied && help->ocv_avg_uv > full_uv) - return POWER_SUPPLY_STATUS_FULL; + if (help->supplied) { + bool full; + + if (help->charge_finished) + full = gpiod_get_value_cansleep(help->charge_finished); + else + full = help->ocv_avg_uv > full_uv; + + if (full) + return POWER_SUPPLY_STATUS_FULL; + } return POWER_SUPPLY_STATUS_NOT_CHARGING; } @@ -255,13 +265,15 @@ static void adc_battery_helper_start_work(struct adc_battery_helper *help) } int adc_battery_helper_init(struct adc_battery_helper *help, struct power_supply *psy, - adc_battery_helper_get_func get_voltage_and_current_now) + adc_battery_helper_get_func get_voltage_and_current_now, + struct gpio_desc *charge_finished_gpio) { struct device *dev = psy->dev.parent; int ret; help->psy = psy; help->get_voltage_and_current_now = get_voltage_and_current_now; + help->charge_finished = charge_finished_gpio; ret = devm_mutex_init(dev, &help->lock); if (ret) diff --git a/drivers/power/supply/adc-battery-helper.h b/drivers/power/supply/adc-battery-helper.h index 90c7edcb9ab1d1..4e42181c898375 100644 --- a/drivers/power/supply/adc-battery-helper.h +++ b/drivers/power/supply/adc-battery-helper.h @@ -11,6 +11,7 @@ #define ADC_BAT_HELPER_MOV_AVG_WINDOW_SIZE 8 struct power_supply; +struct gpio_desc; /* * The adc battery helper code needs voltage- and current-now to be sampled as @@ -21,6 +22,7 @@ typedef int (*adc_battery_helper_get_func)(struct power_supply *psy, int *volt, struct 
adc_battery_helper { struct power_supply *psy; + struct gpio_desc *charge_finished; struct delayed_work work; struct mutex lock; adc_battery_helper_get_func get_voltage_and_current_now; @@ -44,7 +46,8 @@ extern const enum power_supply_property adc_battery_helper_properties[]; #define ADC_HELPER_NUM_PROPERTIES 7 int adc_battery_helper_init(struct adc_battery_helper *help, struct power_supply *psy, - adc_battery_helper_get_func get_voltage_and_current_now); + adc_battery_helper_get_func get_voltage_and_current_now, + struct gpio_desc *charge_finished_gpio); /* * The below functions can be directly used as power-supply / suspend-resume * callbacks. They cast the power_supply_get_drvdata() / dev_get_drvdata() data diff --git a/drivers/power/supply/ug3105_battery.c b/drivers/power/supply/ug3105_battery.c index c4d4ac859fa41f..210e0f9aa5e094 100644 --- a/drivers/power/supply/ug3105_battery.c +++ b/drivers/power/supply/ug3105_battery.c @@ -161,7 +161,7 @@ static int ug3105_probe(struct i2c_client *client) } ret = adc_battery_helper_init(&chip->helper, chip->psy, - ug3105_get_voltage_and_current_now); + ug3105_get_voltage_and_current_now, NULL); if (ret) goto stop; From 8c5795fe55278fa6a656bf4a0398fb22d5079298 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 31 Aug 2025 14:29:41 +0200 Subject: [PATCH 0931/3206] power: supply: Add new Intel Dollar Cove TI battery driver Intel has 2 completely different "Dollar Cove" PMICs for its Bay Trail / Cherry Trail SoCs. One is made by X-Powers and is called the AXP288. The AXP288's builtin charger and fuel-gauge functions are already supported by the axp288_charger / axp288_fuel_gauge drivers. The other "Dollar Cove" PMIC is made by TI and does not have any clear TI denomination; its MFD driver calls it the "Intel Dollar Cove TI PMIC". The Intel Dollar Cove TI PMIC comes with a coulomb-counter with limited functionality, which is intended to work together with an always-on micro-controller monitoring it for fuel-gauge functionality. Most devices with the Dollar Cove TI PMIC have full-featured fuel-gauge functionality exposed through ACPI with the information coming from either the embedded-controller or a separate full-featured fuel-gauge IC. But some designs lack this, so add a battery-monitoring driver for these designs, using the PMIC's coulomb-counter combined with the adc-battery-helper for capacity estimation. Register definitions were taken from kernel/drivers/platform/x86/dc_ti_cc.c from the Acer A1-840 Android kernel source-code archive named: "App. Guide_Acer_20151221_A_A.zip" which is distributed by Acer from the Acer A1-840 support page: https://www.acer.com/us-en/support/product-support/A1-840/downloads Reviewed-by: Linus Walleij Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20250831122942.47875-6-hansg@kernel.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/Kconfig | 12 + drivers/power/supply/Makefile | 1 + drivers/power/supply/intel_dc_ti_battery.c | 394 +++++++++++++++++++++ 3 files changed, 407 insertions(+) create mode 100644 drivers/power/supply/intel_dc_ti_battery.c diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig index e33a5360d33a49..72f3b2b4d346df 100644 --- a/drivers/power/supply/Kconfig +++ b/drivers/power/supply/Kconfig @@ -247,6 +247,18 @@ config BATTERY_INGENIC This driver can also be built as a module. If so, the module will be called ingenic-battery.
+config BATTERY_INTEL_DC_TI + tristate "Intel Bay / Cherry Trail Dollar Cove TI battery driver" + depends on INTEL_SOC_PMIC_CHTDC_TI && INTEL_DC_TI_ADC && IIO && ACPI + select ADC_BATTERY_HELPER + help + Choose this option if you want to monitor battery status on Intel + Bay Trail / Cherry Trail tablets using the Dollar Cove TI PMIC's + coulomb-counter as fuel-gauge. + + To compile this driver as a module, choose M here: the module will be + called intel_dc_ti_battery. + config BATTERY_IPAQ_MICRO tristate "iPAQ Atmel Micro ASIC battery driver" depends on MFD_IPAQ_MICRO diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile index c85c5c441e0fe3..51e37e8bdeb35a 100644 --- a/drivers/power/supply/Makefile +++ b/drivers/power/supply/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_BATTERY_OLPC) += olpc_battery.o obj-$(CONFIG_BATTERY_SAMSUNG_SDI) += samsung-sdi-battery.o obj-$(CONFIG_BATTERY_COLLIE) += collie_battery.o obj-$(CONFIG_BATTERY_INGENIC) += ingenic-battery.o +obj-$(CONFIG_BATTERY_INTEL_DC_TI) += intel_dc_ti_battery.o obj-$(CONFIG_BATTERY_IPAQ_MICRO) += ipaq_micro_battery.o obj-$(CONFIG_BATTERY_WM97XX) += wm97xx_battery.o obj-$(CONFIG_BATTERY_SBS) += sbs-battery.o diff --git a/drivers/power/supply/intel_dc_ti_battery.c b/drivers/power/supply/intel_dc_ti_battery.c new file mode 100644 index 00000000000000..457d23b689e988 --- /dev/null +++ b/drivers/power/supply/intel_dc_ti_battery.c @@ -0,0 +1,394 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Battery driver for the coulomb-counter of the Intel Dollar Cove TI PMIC + * + * Note the Intel Dollar Cove TI PMIC coulomb-counter is not a full-featured + * autonomous fuel-gauge. It is intended to work together with an always on + * micro-controller monitoring it. + * + * Since Linux does not monitor coulomb-counter changes while the device + * is off or suspended, voltage based capacity estimation from + * the adc-battery-helper code is used. + * + * Copyright (C) 2024 Hans de Goede + * + * Register definitions and calibration code was taken from + * kernel/drivers/platform/x86/dc_ti_cc.c from the Acer A1-840 Android kernel + * which has the following copyright header: + * + * Copyright (C) 2014 Intel Corporation + * Author: Ramakrishna Pallala + * + * dc_ti_cc.c is part of the Acer A1-840 Android kernel source-code archive + * named: "App. 
Guide_Acer_20151221_A_A.zip" + * which is distributed by Acer from the Acer A1-840 support page: + * https://www.acer.com/us-en/support/product-support/A1-840/downloads + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "adc-battery-helper.h" + +#define DC_TI_PMIC_VERSION_REG 0x00 +#define PMIC_VERSION_A0 0xC0 +#define PMIC_VERSION_A1 0xC1 + +#define DC_TI_CC_CNTL_REG 0x60 +#define CC_CNTL_CC_CTR_EN BIT(0) +#define CC_CNTL_CC_CLR_EN BIT(1) +#define CC_CNTL_CC_CAL_EN BIT(2) +#define CC_CNTL_CC_OFFSET_EN BIT(3) +#define CC_CNTL_SMPL_INTVL GENMASK(5, 4) +#define CC_CNTL_SMPL_INTVL_15MS FIELD_PREP(CC_CNTL_SMPL_INTVL, 0) +#define CC_CNTL_SMPL_INTVL_62MS FIELD_PREP(CC_CNTL_SMPL_INTVL, 1) +#define CC_CNTL_SMPL_INTVL_125MS FIELD_PREP(CC_CNTL_SMPL_INTVL, 2) +#define CC_CNTL_SMPL_INTVL_250MS FIELD_PREP(CC_CNTL_SMPL_INTVL, 3) + +#define DC_TI_SMPL_CTR0_REG 0x69 +#define DC_TI_SMPL_CTR1_REG 0x68 +#define DC_TI_SMPL_CTR2_REG 0x67 + +#define DC_TI_CC_OFFSET_HI_REG 0x61 +#define CC_OFFSET_HI_MASK 0x3F +#define DC_TI_CC_OFFSET_LO_REG 0x62 + +#define DC_TI_SW_OFFSET_REG 0x6C + +#define DC_TI_CC_ACC3_REG 0x63 +#define DC_TI_CC_ACC2_REG 0x64 +#define DC_TI_CC_ACC1_REG 0x65 +#define DC_TI_CC_ACC0_REG 0x66 + +#define DC_TI_CC_INTG1_REG 0x6A +#define DC_TI_CC_INTG1_MASK 0x3F +#define DC_TI_CC_INTG0_REG 0x6B + +#define DC_TI_EEPROM_ACCESS_CONTROL 0x88 +#define EEPROM_UNLOCK 0xDA +#define EEPROM_LOCK 0x00 + +#define DC_TI_EEPROM_CC_GAIN_REG 0xF4 +#define CC_TRIM_REVISION GENMASK(3, 0) +#define CC_GAIN_CORRECTION GENMASK(7, 4) + +#define PMIC_VERSION_A0_TRIM_REV 3 +#define PMIC_VERSION_A1_MIN_TRIM_REV 1 + +#define DC_TI_EEPROM_CC_OFFSET_REG 0xFD + +#define DC_TI_EEPROM_CTRL 0xFE +#define EEPROM_BANK0_SEL 0x01 +#define EEPROM_BANK1_SEL 0x02 + +#define SMPL_INTVL_US 15000 +#define SMPL_INTVL_MS (SMPL_INTVL_US / USEC_PER_MSEC) +#define CALIBRATION_TIME_US (10 * SMPL_INTVL_US) +#define SLEEP_SLACK_US 2500 + +/* CC gain correction is in 0.0025 increments */ +#define CC_GAIN_STEP 25 +#define CC_GAIN_DIV 10000 + +/* CC offset is in 0.5 units per 250ms (default sample interval) */ +#define CC_OFFSET_DIV 2 +#define CC_OFFSET_SMPL_INTVL_MS 250 + +/* CC accumulator scale is 366.2 ųCoulumb / unit */ +#define CC_ACC_TO_UA(acc, smpl_ctr) \ + ((acc) * (3662 * MSEC_PER_SEC / 10) / ((smpl_ctr) * SMPL_INTVL_MS)) + +#define DEV_NAME "chtdc_ti_battery" + +struct dc_ti_battery_chip { + /* Must be the first member see adc-battery-helper documentation */ + struct adc_battery_helper helper; + struct device *dev; + struct regmap *regmap; + struct iio_channel *vbat_channel; + struct power_supply *psy; + int cc_gain; + int cc_offset; +}; + +static int dc_ti_battery_get_voltage_and_current_now(struct power_supply *psy, int *volt, int *curr) +{ + struct dc_ti_battery_chip *chip = power_supply_get_drvdata(psy); + s64 cnt_start_usec, now_usec, sleep_usec; + unsigned int reg_val; + s32 acc, smpl_ctr; + int ret; + + /* + * Enable coulomb-counter before reading Vbat from ADC, so that the CC + * samples are from the same time period as the Vbat reading. 
+	 */
+	ret = regmap_write(chip->regmap, DC_TI_CC_CNTL_REG,
+			   CC_CNTL_SMPL_INTVL_15MS | CC_CNTL_CC_OFFSET_EN | CC_CNTL_CC_CTR_EN);
+	if (ret)
+		goto out_err;
+
+	cnt_start_usec = ktime_get_ns() / NSEC_PER_USEC;
+
+	/* Read Vbat, convert IIO mV to power-supply µV */
+	ret = iio_read_channel_processed_scale(chip->vbat_channel, volt, 1000);
+	if (ret < 0)
+		goto out_err;
+
+	/* Sleep at least 3 sample-times + slack to get 3+ CC samples */
+	now_usec = ktime_get_ns() / NSEC_PER_USEC;
+	sleep_usec = 3 * SMPL_INTVL_US + SLEEP_SLACK_US - (now_usec - cnt_start_usec);
+	if (sleep_usec > 0 && sleep_usec < 1000000)
+		usleep_range(sleep_usec, sleep_usec + SLEEP_SLACK_US);
+
+	/*
+	 * The PMIC latches the coulomb- and sample-counters upon reading the
+	 * CC_ACC0 register. Reading multiple registers at once is not supported.
+	 *
+	 * Step 1: Read CC_ACC0 - CC_ACC3
+	 */
+	ret = regmap_read(chip->regmap, DC_TI_CC_ACC0_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	acc = reg_val;
+
+	ret = regmap_read(chip->regmap, DC_TI_CC_ACC1_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	acc |= reg_val << 8;
+
+	ret = regmap_read(chip->regmap, DC_TI_CC_ACC2_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	acc |= reg_val << 16;
+
+	ret = regmap_read(chip->regmap, DC_TI_CC_ACC3_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	acc |= reg_val << 24;
+
+	/* Step 2: Read SMPL_CTR0 - SMPL_CTR2 */
+	ret = regmap_read(chip->regmap, DC_TI_SMPL_CTR0_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	smpl_ctr = reg_val;
+
+	ret = regmap_read(chip->regmap, DC_TI_SMPL_CTR1_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	smpl_ctr |= reg_val << 8;
+
+	ret = regmap_read(chip->regmap, DC_TI_SMPL_CTR2_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	smpl_ctr |= reg_val << 16;
+
+	/* Disable the coulomb-counter again */
+	ret = regmap_write(chip->regmap, DC_TI_CC_CNTL_REG,
+			   CC_CNTL_SMPL_INTVL_15MS | CC_CNTL_CC_OFFSET_EN);
+	if (ret)
+		goto out_err;
+
+	/* Apply calibration */
+	acc -= chip->cc_offset * smpl_ctr * SMPL_INTVL_MS /
+	       (CC_OFFSET_SMPL_INTVL_MS * CC_OFFSET_DIV);
+	acc = acc * (CC_GAIN_DIV - chip->cc_gain * CC_GAIN_STEP) / CC_GAIN_DIV;
+	*curr = CC_ACC_TO_UA(acc, smpl_ctr);
+
+	return 0;
+
+out_err:
+	dev_err(chip->dev, "IO-error %d communicating with PMIC\n", ret);
+	return ret;
+}
+
+static const struct power_supply_desc dc_ti_battery_psy_desc = {
+	.name = "intel_dc_ti_battery",
+	.type = POWER_SUPPLY_TYPE_BATTERY,
+	.get_property = adc_battery_helper_get_property,
+	.external_power_changed = adc_battery_helper_external_power_changed,
+	.properties = adc_battery_helper_properties,
+	.num_properties = ADC_HELPER_NUM_PROPERTIES,
+};
+
+static int dc_ti_battery_hw_init(struct dc_ti_battery_chip *chip)
+{
+	u8 pmic_version, cc_trim_rev;
+	unsigned int reg_val;
+	int ret;
+
+	/* Set sample rate to 15 ms and calibrate the coulomb-counter */
+	ret = regmap_write(chip->regmap, DC_TI_CC_CNTL_REG,
+			   CC_CNTL_SMPL_INTVL_15MS | CC_CNTL_CC_OFFSET_EN |
+			   CC_CNTL_CC_CAL_EN | CC_CNTL_CC_CTR_EN);
+	if (ret)
+		goto out;
+
+	fsleep(CALIBRATION_TIME_US);
+
+	/* Disable the coulomb-counter, it is only used while getting the current */
+	ret = regmap_write(chip->regmap, DC_TI_CC_CNTL_REG,
+			   CC_CNTL_SMPL_INTVL_15MS | CC_CNTL_CC_OFFSET_EN);
+	if (ret)
+		goto out;
+
+	ret = regmap_read(chip->regmap, DC_TI_PMIC_VERSION_REG, &reg_val);
+	if (ret)
+		goto out;
+
+	pmic_version = reg_val;
+
+	/*
+	 * As per the PMIC vendor (TI), the calibration offset and gain err
+	 * values are stored in EEPROM Bank 0 and Bank 1 of the PMIC.
+	 * We need to read the stored offset and gain margins and need
+	 * to apply the corrections to the raw coulomb counter value.
+	 */
+
+	/* Unlock the EEPROM Access */
+	ret = regmap_write(chip->regmap, DC_TI_EEPROM_ACCESS_CONTROL, EEPROM_UNLOCK);
+	if (ret)
+		goto out;
+
+	/* Select Bank 1 to read CC GAIN Err correction */
+	ret = regmap_write(chip->regmap, DC_TI_EEPROM_CTRL, EEPROM_BANK1_SEL);
+	if (ret)
+		goto out;
+
+	ret = regmap_read(chip->regmap, DC_TI_EEPROM_CC_GAIN_REG, &reg_val);
+	if (ret)
+		goto out;
+
+	cc_trim_rev = FIELD_GET(CC_TRIM_REVISION, reg_val);
+
+	dev_dbg(chip->dev, "pmic-ver 0x%02x trim-rev %d\n", pmic_version, cc_trim_rev);
+
+	if (!(pmic_version == PMIC_VERSION_A0 && cc_trim_rev == PMIC_VERSION_A0_TRIM_REV) &&
+	    !(pmic_version == PMIC_VERSION_A1 && cc_trim_rev >= PMIC_VERSION_A1_MIN_TRIM_REV)) {
+		dev_dbg(chip->dev, "unsupported trim-revision, using uncalibrated CC values\n");
+		goto out_relock;
+	}
+
+	chip->cc_gain = 1 - (int)FIELD_GET(CC_GAIN_CORRECTION, reg_val);
+
+	/* Select Bank 0 to read CC OFFSET Correction */
+	ret = regmap_write(chip->regmap, DC_TI_EEPROM_CTRL, EEPROM_BANK0_SEL);
+	if (ret)
+		goto out_relock;
+
+	ret = regmap_read(chip->regmap, DC_TI_EEPROM_CC_OFFSET_REG, &reg_val);
+	if (ret)
+		goto out_relock;
+
+	chip->cc_offset = (s8)reg_val;
+
+	dev_dbg(chip->dev, "cc-offset %d cc-gain %d\n", chip->cc_offset, chip->cc_gain);
+
+out_relock:
+	/* Re-lock the EEPROM Access */
+	regmap_write(chip->regmap, DC_TI_EEPROM_ACCESS_CONTROL, EEPROM_LOCK);
+out:
+	if (ret)
+		dev_err(chip->dev, "IO-error %d initializing PMIC\n", ret);
+
+	return ret;
+}
+
+static int dc_ti_battery_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct intel_soc_pmic *pmic = dev_get_drvdata(dev->parent);
+	struct power_supply_config psy_cfg = {};
+	struct fwnode_reference_args args;
+	struct gpio_desc *charge_finished;
+	struct dc_ti_battery_chip *chip;
+	int ret;
+
+	/* On most devices with a Dollar Cove TI the battery is handled by ACPI */
+	if (!acpi_quirk_skip_acpi_ac_and_battery())
+		return -ENODEV;
+
+	/* ACPI glue code adds a "monitored-battery" fwnode, wait for this */
+	ret = fwnode_property_get_reference_args(dev_fwnode(dev), "monitored-battery",
+						 NULL, 0, 0, &args);
+	if (ret) {
+		dev_dbg(dev, "fwnode_property_get_ref() ret %d\n", ret);
+		return dev_err_probe(dev, -EPROBE_DEFER, "Waiting for monitored-battery fwnode\n");
+	}
+
+	fwnode_handle_put(args.fwnode);
+
+	chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL);
+	if (!chip)
+		return -ENOMEM;
+
+	chip->dev = dev;
+	chip->regmap = pmic->regmap;
+
+	/*
+	 * Note cannot use devm_iio_channel_get() because ACPI systems lack
+	 * the device<->channel maps which iio_channel_get() uses when passed
+	 * a non-NULL device pointer.
+	 */
+	chip->vbat_channel = devm_iio_channel_get(dev, "VBAT");
+	if (IS_ERR(chip->vbat_channel)) {
+		dev_dbg(dev, "devm_iio_channel_get() ret %ld\n", PTR_ERR(chip->vbat_channel));
+		return dev_err_probe(dev, -EPROBE_DEFER, "Waiting for VBAT IIO channel\n");
+	}
+
+	charge_finished = devm_gpiod_get_optional(dev, "charged", GPIOD_IN);
+	if (IS_ERR(charge_finished))
+		return dev_err_probe(dev, PTR_ERR(charge_finished), "Getting charged GPIO\n");
+
+	ret = dc_ti_battery_hw_init(chip);
+	if (ret)
+		return ret;
+
+	platform_set_drvdata(pdev, chip);
+
+	psy_cfg.drv_data = chip;
+	chip->psy = devm_power_supply_register(dev, &dc_ti_battery_psy_desc, &psy_cfg);
+	if (IS_ERR(chip->psy))
+		return PTR_ERR(chip->psy);
+
+	return adc_battery_helper_init(&chip->helper, chip->psy,
+				       dc_ti_battery_get_voltage_and_current_now,
+				       charge_finished);
+}
+
+static DEFINE_RUNTIME_DEV_PM_OPS(dc_ti_battery_pm_ops, adc_battery_helper_suspend,
+				 adc_battery_helper_resume, NULL);
+
+static struct platform_driver dc_ti_battery_driver = {
+	.driver = {
+		.name = DEV_NAME,
+		.pm = pm_sleep_ptr(&dc_ti_battery_pm_ops),
+	},
+	.probe = dc_ti_battery_probe,
+};
+module_platform_driver(dc_ti_battery_driver);
+
+MODULE_ALIAS("platform:" DEV_NAME);
+MODULE_AUTHOR("Hans de Goede");
+MODULE_DESCRIPTION("Intel Dollar Cove (TI) battery driver");
+MODULE_LICENSE("GPL");

From 2c334d038466ac509468fbe06905a32d202117db Mon Sep 17 00:00:00 2001
From: "H. Nikolaus Schaller"
Date: Sat, 23 Aug 2025 12:34:56 +0200
Subject: [PATCH 0932/3206] power: supply: bq27xxx: fix error return in case
 of no bq27000 hdq battery

Since commit f16d9fb6cf03 ("power: supply: bq27xxx: Retrieve again when
busy") the console log of some devices with hdq enabled but no bq27000
battery (like e.g. the Pandaboard) is flooded with messages like:

[   34.247833] power_supply bq27000-battery: driver failed to report 'status' property: -1

as soon as user-space finds a /sys entry and tries to read the
"status" property.

It turns out that the offending commit changes the logic to now return
the value of cache.flags if it is <0. This is likely under the assumption
that it is an error number. For normal errors from bq27xxx_read() this is
indeed the case.

But there is special code to detect that no bq27000 is installed or
accessible through hdq/1wire, and to report this. In that case,
cache.flags is historically set by commit 3dd843e1c26a ("bq27000: report
missing device better.") to the constant -1, which made reading properties
return -ENODEV. So everything appeared to be fine before the return value
was passed upwards.

Now the -1 is returned as -EPERM instead of -ENODEV, triggering the error
condition in power_supply_format_property() which then floods the console
log.

So we change the detection of a missing bq27000 battery to simply set
cache.flags = -ENODEV instead of -1.

Fixes: f16d9fb6cf03 ("power: supply: bq27xxx: Retrieve again when busy")
Cc: Jerry Lv
Cc: stable@vger.kernel.org
Signed-off-by: H. Nikolaus Schaller
Link: https://lore.kernel.org/r/692f79eb6fd541adb397038ea6e750d4de2deddf.1755945297.git.hns@goldelico.com
Signed-off-by: Sebastian Reichel
---
 drivers/power/supply/bq27xxx_battery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 93dcebbe114175..efe02ad695a620 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1920,7 +1920,7 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
 
 	cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag);
 	if ((cache.flags & 0xff) == 0xff)
-		cache.flags = -1; /* read error */
+		cache.flags = -ENODEV; /* read error */
 
 	if (cache.flags >= 0) {
 		cache.capacity = bq27xxx_battery_read_soc(di);

From 1e451977e1703b6db072719b37cd1b8e250b9cc9 Mon Sep 17 00:00:00 2001
From: "H. Nikolaus Schaller"
Date: Sat, 23 Aug 2025 12:34:57 +0200
Subject: [PATCH 0933/3206] power: supply: bq27xxx: restrict no-battery
 detection to bq27000

There are fuel gauges in the bq27xxx series (e.g. bq27z561) which may in
some cases report 0xff as the value of BQ27XXX_REG_FLAGS; this should not
be interpreted as "no battery" the way it is for a disconnected battery
with some built-in bq27000 chip.

So restrict the no-battery detection originally introduced by commit
3dd843e1c26a ("bq27000: report missing device better.") to the bq27000.

There is no need to backport further because this was hidden before
commit f16d9fb6cf03 ("power: supply: bq27xxx: Retrieve again when busy")

Fixes: f16d9fb6cf03 ("power: supply: bq27xxx: Retrieve again when busy")
Suggested-by: Jerry Lv
Cc: stable@vger.kernel.org
Signed-off-by: H. Nikolaus Schaller
Link: https://lore.kernel.org/r/dd979fa6855fd051ee5117016c58daaa05966e24.1755945297.git.hns@goldelico.com
Signed-off-by: Sebastian Reichel
---
 drivers/power/supply/bq27xxx_battery.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index efe02ad695a620..ad2d9ecf32a5ae 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1919,8 +1919,8 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
 	bool has_singe_flag = di->opts & BQ27XXX_O_ZERO;
 
 	cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag);
-	if ((cache.flags & 0xff) == 0xff)
-		cache.flags = -ENODEV; /* read error */
+	if (di->chip == BQ27000 && (cache.flags & 0xff) == 0xff)
+		cache.flags = -ENODEV; /* bq27000 hdq read error */
 
 	if (cache.flags >= 0) {
 		cache.capacity = bq27xxx_battery_read_soc(di);

From 03e79de4608bdd48ad6eec272e196124cefaf798 Mon Sep 17 00:00:00 2001
From: Stefan Wahren
Date: Thu, 4 Sep 2025 11:13:34 +0200
Subject: [PATCH 0934/3206] net: fec: Fix possible NPD in
 fec_enet_phy_reset_after_clk_enable()

The function of_phy_find_device() may return NULL, so we need to take
care before dereferencing phy_dev.
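As an illustration, the general shape of the problem and of the fix is the
following (a rough sketch only; the exact change is in the diff below):

	struct phy_device *phy_dev;

	/* of_phy_find_device() returns NULL if no PHY device is bound */
	phy_dev = of_phy_find_device(fep->phy_node);
	/* phy_reset_after_clk_enable() itself tolerates a NULL phydev */
	phy_reset_after_clk_enable(phy_dev);
	/* but the reference may only be dropped if the lookup took one */
	if (phy_dev)
		put_device(&phy_dev->mdio.dev);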
Fixes: 64a632da538a ("net: fec: Fix phy_device lookup for phy_reset_after_clk_enable()")
Signed-off-by: Stefan Wahren
Cc: Christoph Niedermaier
Cc: Richard Leitner
Reviewed-by: Simon Horman
Reviewed-by: Wei Fang
Link: https://patch.msgid.link/20250904091334.53965-1-wahrenst@gmx.net
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/freescale/fec_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 1383918f8a3fc6..adf1f2bbcbb16f 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2363,7 +2363,8 @@ static void fec_enet_phy_reset_after_clk_enable(struct net_device *ndev)
 		 */
 		phy_dev = of_phy_find_device(fep->phy_node);
 		phy_reset_after_clk_enable(phy_dev);
-		put_device(&phy_dev->mdio.dev);
+		if (phy_dev)
+			put_device(&phy_dev->mdio.dev);
 	}
 }
 
From 0ba5b2f2c381dbec9ed9e4ab3ae5d3e667de0dc3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean
Date: Thu, 4 Sep 2025 15:52:37 +0300
Subject: [PATCH 0935/3206] net: phylink: add lock for serializing concurrent
 pl->phydev writes with resolver

Currently phylink_resolve() protects itself against concurrent
phylink_bringup_phy() or phylink_disconnect_phy() calls, which modify
pl->phydev, by relying on pl->state_mutex.

The problem is that in phylink_resolve(), pl->state_mutex is in a lock
inversion state with pl->phydev->lock. So pl->phydev->lock needs to be
acquired prior to pl->state_mutex. But that requires dereferencing
pl->phydev in the first place, and without pl->state_mutex, that is racy.

Hence the reason for the extra lock. Currently it is redundant, but it
will serve a functional purpose once mutex_lock(&phy->lock) is moved
outside of the mutex_lock(&pl->state_mutex) section.

Another alternative considered would have been to let phylink_resolve()
acquire the rtnl_mutex, which is also held when phylink_bringup_phy() and
phylink_disconnect_phy() are called. But since phylink_disconnect_phy()
runs under rtnl_lock(), it would deadlock with phylink_resolve() when
calling flush_work(&pl->resolve). Additionally, it would have been
undesirable because it would have unnecessarily blocked many other call
paths throughout the entire kernel, so the smaller-scoped lock was
preferred.
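The resulting lock nesting in phylink_resolve() then looks roughly like
this (a sketch; the exact code is in the diff below):

	mutex_lock(&pl->phydev_mutex);	/* stabilizes pl->phydev */
	phy = pl->phydev;		/* now safe to dereference */
	mutex_lock(&pl->state_mutex);	/* inner lock, as at the other sites */
	...
	mutex_unlock(&pl->state_mutex);
	mutex_unlock(&pl->phydev_mutex);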
Link: https://lore.kernel.org/netdev/aLb6puGVzR29GpPx@shell.armlinux.org.uk/ Signed-off-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250904125238.193990-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index c7cb95aa80074a..aa17ad2622fc7c 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -67,6 +67,8 @@ struct phylink { struct timer_list link_poll; struct mutex state_mutex; + /* Serialize updates to pl->phydev with phylink_resolve() */ + struct mutex phydev_mutex; struct phylink_link_state phy_state; unsigned int phy_ib_mode; struct work_struct resolve; @@ -1591,8 +1593,11 @@ static void phylink_resolve(struct work_struct *w) struct phylink_link_state link_state; bool mac_config = false; bool retrigger = false; + struct phy_device *phy; bool cur_link_state; + mutex_lock(&pl->phydev_mutex); + phy = pl->phydev; mutex_lock(&pl->state_mutex); cur_link_state = phylink_link_is_up(pl); @@ -1626,11 +1631,11 @@ static void phylink_resolve(struct work_struct *w) /* If we have a phy, the "up" state is the union of both the * PHY and the MAC */ - if (pl->phydev) + if (phy) link_state.link &= pl->phy_state.link; /* Only update if the PHY link is up */ - if (pl->phydev && pl->phy_state.link) { + if (phy && pl->phy_state.link) { /* If the interface has changed, force a link down * event if the link isn't already down, and re-resolve. */ @@ -1694,6 +1699,7 @@ static void phylink_resolve(struct work_struct *w) queue_work(system_power_efficient_wq, &pl->resolve); } mutex_unlock(&pl->state_mutex); + mutex_unlock(&pl->phydev_mutex); } static void phylink_run_resolve(struct phylink *pl) @@ -1829,6 +1835,7 @@ struct phylink *phylink_create(struct phylink_config *config, if (!pl) return ERR_PTR(-ENOMEM); + mutex_init(&pl->phydev_mutex); mutex_init(&pl->state_mutex); INIT_WORK(&pl->resolve, phylink_resolve); @@ -2089,6 +2096,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, dev_name(&phy->mdio.dev), phy->drv->name, irq_str); kfree(irq_str); + mutex_lock(&pl->phydev_mutex); mutex_lock(&phy->lock); mutex_lock(&pl->state_mutex); pl->phydev = phy; @@ -2134,6 +2142,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, mutex_unlock(&pl->state_mutex); mutex_unlock(&phy->lock); + mutex_unlock(&pl->phydev_mutex); phylink_dbg(pl, "phy: %s setting supported %*pb advertising %*pb\n", @@ -2312,6 +2321,7 @@ void phylink_disconnect_phy(struct phylink *pl) ASSERT_RTNL(); + mutex_lock(&pl->phydev_mutex); phy = pl->phydev; if (phy) { mutex_lock(&phy->lock); @@ -2321,8 +2331,11 @@ void phylink_disconnect_phy(struct phylink *pl) pl->mac_tx_clk_stop = false; mutex_unlock(&pl->state_mutex); mutex_unlock(&phy->lock); - flush_work(&pl->resolve); + } + mutex_unlock(&pl->phydev_mutex); + if (phy) { + flush_work(&pl->resolve); phy_disconnect(phy); } } From e2a10daba84968f6b5777d150985fd7d6abc9c84 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 4 Sep 2025 15:52:38 +0300 Subject: [PATCH 0936/3206] net: phy: transfer phy_config_inband() locking responsibility to phylink Problem description =================== Lockdep reports a possible circular locking dependency (AB/BA) between &pl->state_mutex and &phy->lock, as follows. 
phylink_resolve()		// acquires &pl->state_mutex
-> phylink_major_config()
   -> phy_config_inband()	// acquires &pl->phydev->lock

whereas all the other call sites that take both &pl->state_mutex and
&pl->phydev->lock have the locking scheme reversed: everywhere else,
&pl->phydev->lock is acquired at the top level, and &pl->state_mutex at
the lower level. A clear example is phylink_bringup_phy().

The outlier is the newly introduced phy_config_inband(), and the existing
lock order is the correct one.

To understand why it cannot be the other way around, it is sufficient to
consider phylink_phy_change(), phylink's callback from the PHY device's
phy->phy_link_change() virtual method, invoked by the PHY state machine.

phy_link_up() and phy_link_down(), the (indirect) callers of
phylink_phy_change(), are called with &phydev->lock acquired. Then
phylink_phy_change() acquires its own &pl->state_mutex, to serialize
changes made to its pl->phy_state and pl->link_config. So all other
instances of &pl->state_mutex and &phydev->lock must be consistent with
this order.

Problem impact
==============

I think the kernel runs a serious risk of deadlock if an existing
phylink_resolve() thread, which results in a phy_config_inband() call, is
concurrent with a phy_link_up() or phy_link_down() call, which will
deadlock on &pl->state_mutex in phylink_phy_change().

Practically speaking, the impact may be limited by the slow speed of the
medium auto-negotiation protocol, which makes it unlikely for the current
state to still be unresolved when a new one is detected, but I think the
problem is there. Nonetheless, the problem was discovered using lockdep.

Proposed solution
=================

Practically speaking, the phy_config_inband() requirement of having
phydev->lock acquired must transfer to the caller (phylink is the only
caller). There, it must bubble up until immediately before
&pl->state_mutex is acquired, for the cases where that takes place.

Solution details, considerations, notes
=======================================

This is the phy_config_inband() call graph, with each function's callers
listed beneath it:

phy_config_inband()
  <- phylink_major_config()
       <- phylink_mac_initial_config()
            <- phylink_sfp_set_config()
                 <- phylink_sfp_config_phy()
                      <- phylink_sfp_connect_phy()
                           <- sfp_upstream_ops :: connect_phy()
                 <- phylink_sfp_config_optical()
                      <- phylink_sfp_module_insert()
                           <- sfp_upstream_ops :: module_insert()
                      <- phylink_sfp_module_start()
                           <- sfp_upstream_ops :: module_start()
            <- phylink_start()
            <- phylink_resume()
       <- phylink_resolve()
       <- phylink_ethtool_ksettings_set()

phylink_major_config() caller #1, phylink_mac_initial_config(), does not
acquire &pl->state_mutex, nor do its callers. It must acquire
&pl->phydev->lock prior to calling phylink_major_config().

phylink_major_config() caller #2, phylink_resolve(), acquires
&pl->state_mutex, thus also needs to acquire &pl->phydev->lock.

phylink_major_config() caller #3, phylink_ethtool_ksettings_set(), is
completely uninteresting, because it only calls phylink_major_config() if
pl->phydev is NULL (otherwise it calls phy_ethtool_ksettings_set()). We
need to change nothing there.

Other solutions
===============

The lock inversion between &pl->state_mutex and &pl->phydev->lock has
occurred at least once before, as seen in commit c718af2d00a3 ("net:
phylink: fix ethtool -A with attached PHYs"). The solution there was to
simply not call phy_set_asym_pause() under the &pl->state_mutex.
That cannot be extended to our case though, where the phy_config_inband() call is much deeper inside the &pl->state_mutex section. Fixes: 5fd0f1a02e75 ("net: phylink: add negotiation of in-band capabilities") Signed-off-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250904125238.193990-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 12 ++++-------- drivers/net/phy/phylink.c | 9 +++++++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 13df28445f0201..c02da57a4da5e3 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1065,23 +1065,19 @@ EXPORT_SYMBOL_GPL(phy_inband_caps); */ int phy_config_inband(struct phy_device *phydev, unsigned int modes) { - int err; + lockdep_assert_held(&phydev->lock); if (!!(modes & LINK_INBAND_DISABLE) + !!(modes & LINK_INBAND_ENABLE) + !!(modes & LINK_INBAND_BYPASS) != 1) return -EINVAL; - mutex_lock(&phydev->lock); if (!phydev->drv) - err = -EIO; + return -EIO; else if (!phydev->drv->config_inband) - err = -EOPNOTSUPP; - else - err = phydev->drv->config_inband(phydev, modes); - mutex_unlock(&phydev->lock); + return -EOPNOTSUPP; - return err; + return phydev->drv->config_inband(phydev, modes); } EXPORT_SYMBOL(phy_config_inband); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index aa17ad2622fc7c..1988b7d2089a6c 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1434,6 +1434,7 @@ static void phylink_get_fixed_state(struct phylink *pl, static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) { struct phylink_link_state link_state; + struct phy_device *phy = pl->phydev; switch (pl->req_link_an_mode) { case MLO_AN_PHY: @@ -1457,7 +1458,11 @@ static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) link_state.link = false; phylink_apply_manual_flow(pl, &link_state); + if (phy) + mutex_lock(&phy->lock); phylink_major_config(pl, force_restart, &link_state); + if (phy) + mutex_unlock(&phy->lock); } static const char *phylink_pause_to_str(int pause) @@ -1598,6 +1603,8 @@ static void phylink_resolve(struct work_struct *w) mutex_lock(&pl->phydev_mutex); phy = pl->phydev; + if (phy) + mutex_lock(&phy->lock); mutex_lock(&pl->state_mutex); cur_link_state = phylink_link_is_up(pl); @@ -1699,6 +1706,8 @@ static void phylink_resolve(struct work_struct *w) queue_work(system_power_efficient_wq, &pl->resolve); } mutex_unlock(&pl->state_mutex); + if (phy) + mutex_unlock(&phy->lock); mutex_unlock(&pl->phydev_mutex); } From 19591f7e781fd1e68228f5b3bee60be6425af886 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Sep 2025 20:59:13 -0700 Subject: [PATCH 0937/3206] fscrypt: use HMAC-SHA512 library for HKDF For the HKDF-SHA512 key derivation needed by fscrypt, just use the HMAC-SHA512 library functions directly. These functions were introduced in v6.17, and they provide simple and efficient direct support for HMAC-SHA512. This ends up being quite a bit simpler and more efficient than using crypto/hkdf.c, as it avoids the generic crypto layer: - The HMAC library can't fail, so callers don't need to handle errors - No inefficient indirect calls - No inefficient and error-prone dynamic allocations - No inefficient and error-prone loading of algorithm by name - Less stack usage Benchmarks on x86_64 show that deriving a per-file key gets about 30% faster, and FS_IOC_ADD_ENCRYPTION_KEY gets nearly twice as fast. 
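For reference, the library interface this change builds on boils down to
the following (a sketch based only on the calls visible in the diff
below):

	struct hmac_sha512_key key;
	struct hmac_sha512_ctx ctx;
	u8 mac[SHA512_DIGEST_SIZE];

	/* one-shot HMAC-SHA512 with a raw key; cannot fail, no allocations */
	hmac_sha512_usingrawkey(raw_key, raw_key_len, data, data_len, mac);

	/* or prepare the key once, then compute incrementally */
	hmac_sha512_preparekey(&key, raw_key, raw_key_len);
	hmac_sha512_init(&ctx, &key);
	hmac_sha512_update(&ctx, data, data_len);
	hmac_sha512_final(&ctx, mac);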
The only small downside is the HKDF-Expand logic gets duplicated again. Then again, even considering that, the new fscrypt_hkdf_expand() is only 7 lines longer than the version that called hkdf_expand(). Later we could add HKDF support to lib/crypto/, but for now let's just do this. Link: https://lore.kernel.org/r/20250906035913.1141532-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/Kconfig | 5 +- fs/crypto/fname.c | 1 - fs/crypto/fscrypt_private.h | 26 ++++----- fs/crypto/hkdf.c | 109 +++++++++++++----------------------- fs/crypto/hooks.c | 2 +- fs/crypto/keyring.c | 30 +++------- fs/crypto/keysetup.c | 65 +++++++-------------- fs/crypto/policy.c | 4 +- 8 files changed, 82 insertions(+), 160 deletions(-) diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index b5dfb0aa405ab0..464b54610fd346 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -2,10 +2,9 @@ config FS_ENCRYPTION bool "FS Encryption (Per-file encryption)" select CRYPTO - select CRYPTO_HASH - select CRYPTO_HKDF select CRYPTO_SKCIPHER select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_SHA512 select KEYS help Enable encryption of files and directories. This @@ -32,8 +31,6 @@ config FS_ENCRYPTION_ALGS select CRYPTO_CBC select CRYPTO_CTS select CRYPTO_ECB - select CRYPTO_HMAC - select CRYPTO_SHA512 select CRYPTO_XTS config FS_ENCRYPTION_INLINE_CRYPT diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f9f6713e144f7a..5fa1eb58bb1d54 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -11,7 +11,6 @@ * This has not yet undergone a rigorous security audit. */ -#include #include #include #include diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d8b485b9881c50..3a09f45887a47a 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -11,10 +11,10 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H +#include #include #include #include -#include #include #define CONST_STRLEN(str) (sizeof(str) - 1) @@ -381,12 +381,8 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 *encrypted_len_ret); /* hkdf.c */ -struct fscrypt_hkdf { - struct crypto_shash *hmac_tfm; -}; - -int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, - unsigned int master_key_size); +void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key, + unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as @@ -405,11 +401,9 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY \ 8 /* info= */ -int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen); - -void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); +void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT @@ -517,7 +511,7 @@ struct fscrypt_master_key_secret { * ->is_hw_wrapped=false, or by the "software secret" that hardware * derived from this master key if ->is_hw_wrapped=true. 
*/ - struct fscrypt_hkdf hkdf; + struct hmac_sha512_key hkdf; /* * True if this key is a hardware-wrapped key; false if this key is a @@ -696,7 +690,7 @@ struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -int fscrypt_get_test_dummy_key_identifier( +void fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_add_test_dummy_key(struct super_block *sb, @@ -732,8 +726,8 @@ void fscrypt_destroy_prepared_key(struct super_block *sb, int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key); -int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, - const struct fscrypt_master_key *mk); +void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, + const struct fscrypt_master_key *mk); void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index b1ef506cd341de..706f56d0076eed 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation + * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): + * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". + * * This is used to derive keys from the fscrypt master keys (or from the * "software secrets" which hardware derives from the fscrypt master keys, in * the case that the fscrypt master keys are hardware-wrapped keys). @@ -7,10 +11,6 @@ * Copyright 2019 Google LLC */ -#include -#include -#include - #include "fscrypt_private.h" /* @@ -24,7 +24,6 @@ * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two. */ -#define HKDF_HMAC_ALG "hmac(sha512)" #define HKDF_HASHLEN SHA512_DIGEST_SIZE /* @@ -44,54 +43,24 @@ */ /* - * Compute HKDF-Extract using the given master key as the input keying material, - * and prepare an HMAC transform object keyed by the resulting pseudorandom key. - * - * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many - * times without having to recompute HKDF-Extract each time. + * Compute HKDF-Extract using 'master_key' as the input keying material, and + * prepare the resulting HMAC key in 'hkdf'. Afterwards, 'hkdf' can be used for + * HKDF-Expand many times without having to recompute HKDF-Extract each time. 
*/ -int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, - unsigned int master_key_size) +void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key, + unsigned int master_key_size) { - struct crypto_shash *hmac_tfm; static const u8 default_salt[HKDF_HASHLEN]; u8 prk[HKDF_HASHLEN]; - int err; - - hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, FSCRYPT_CRYPTOAPI_MASK); - if (IS_ERR(hmac_tfm)) { - fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", - PTR_ERR(hmac_tfm)); - return PTR_ERR(hmac_tfm); - } - - if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { - err = -EINVAL; - goto err_free_tfm; - } - - err = hkdf_extract(hmac_tfm, master_key, master_key_size, - default_salt, HKDF_HASHLEN, prk); - if (err) - goto err_free_tfm; - - err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); - if (err) - goto err_free_tfm; - hkdf->hmac_tfm = hmac_tfm; - goto out; - -err_free_tfm: - crypto_free_shash(hmac_tfm); -out: + hmac_sha512_usingrawkey(default_salt, sizeof(default_salt), + master_key, master_key_size, prk); + hmac_sha512_preparekey(hkdf, prk, sizeof(prk)); memzero_explicit(prk, sizeof(prk)); - return err; } /* - * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which - * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen' + * HKDF-Expand (RFC 5869 section 2.3). Expand the HMAC key 'hkdf' into 'okmlen' * bytes of output keying material parameterized by the application-specific * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context' * byte. This is thread-safe and may be called by multiple threads in parallel. @@ -100,30 +69,32 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, * adds to its application-specific info strings to guarantee that it doesn't * accidentally repeat an info string when using HKDF for different purposes.) 
*/ -int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen) -{ - SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); - u8 *full_info; - int err; - - full_info = kzalloc(infolen + 9, GFP_KERNEL); - if (!full_info) - return -ENOMEM; - desc->tfm = hkdf->hmac_tfm; - - memcpy(full_info, "fscrypt\0", 8); - full_info[8] = context; - memcpy(full_info + 9, info, infolen); - - err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9, - okm, okmlen); - kfree_sensitive(full_info); - return err; -} - -void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf) +void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen) { - crypto_free_shash(hkdf->hmac_tfm); + struct hmac_sha512_ctx ctx; + u8 counter = 1; + u8 tmp[HKDF_HASHLEN]; + + WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN); + + for (unsigned int i = 0; i < okmlen; i += HKDF_HASHLEN) { + hmac_sha512_init(&ctx, hkdf); + if (i != 0) + hmac_sha512_update(&ctx, &okm[i - HKDF_HASHLEN], + HKDF_HASHLEN); + hmac_sha512_update(&ctx, "fscrypt\0", 8); + hmac_sha512_update(&ctx, &context, 1); + hmac_sha512_update(&ctx, info, infolen); + hmac_sha512_update(&ctx, &counter, 1); + if (okmlen - i < HKDF_HASHLEN) { + hmac_sha512_final(&ctx, tmp); + memcpy(&okm[i], tmp, okmlen - i); + memzero_explicit(tmp, sizeof(tmp)); + } else { + hmac_sha512_final(&ctx, &okm[i]); + } + counter++; + } } diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index e0b32ac841f765..0fb3496551b941 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -205,7 +205,7 @@ int fscrypt_prepare_setflags(struct inode *inode, mk = ci->ci_master_key; down_read(&mk->mk_sem); if (mk->mk_present) - err = fscrypt_derive_dirhash_key(ci, mk); + fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; up_read(&mk->mk_sem); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 7557f6a88b8f32..3adbd7167055a9 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -42,7 +42,6 @@ struct fscrypt_keyring { static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) { - fscrypt_destroy_hkdf(&secret->hkdf); memzero_explicit(secret, sizeof(*secret)); } @@ -587,21 +586,17 @@ static int add_master_key(struct super_block *sb, keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY; } - err = fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); + fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); /* * Now that the KDF context is initialized, the raw KDF key is * no longer needed. 
*/ memzero_explicit(kdf_key, kdf_key_size); - if (err) - return err; /* Calculate the key identifier */ - err = fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, - key_spec->u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - if (err) - return err; + fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, + key_spec->u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); } return do_add_master_key(sb, secret, key_spec); } @@ -835,24 +830,17 @@ fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) memcpy(secret->bytes, test_key, sizeof(test_key)); } -int fscrypt_get_test_dummy_key_identifier( +void fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_master_key_secret secret; - int err; fscrypt_get_test_dummy_secret(&secret); - - err = fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size); - if (err) - goto out; - err = fscrypt_hkdf_expand(&secret.hkdf, - HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, - NULL, 0, key_identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); -out: + fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size); + fscrypt_hkdf_expand(&secret.hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, NULL, 0, + key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); wipe_master_key_secret(&secret); - return err; } /** diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4f3b9ecbfe4e66..a3ed96eaebc857 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -253,11 +253,8 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, sizeof(sb->s_uuid)); hkdf_infolen += sizeof(sb->s_uuid); } - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - hkdf_context, hkdf_info, hkdf_infolen, - mode_key, mode->keysize); - if (err) - goto out_unlock; + fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info, + hkdf_infolen, mode_key, mode->keysize); err = fscrypt_prepare_key(prep_key, mode_key, ci); memzero_explicit(mode_key, mode->keysize); if (err) @@ -278,36 +275,25 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an * endianness swap in order to get the same results as on little endian CPUs. 
*/ -static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, - u8 context, const u8 *info, - unsigned int infolen, siphash_key_t *key) +static void fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, + u8 context, const u8 *info, + unsigned int infolen, siphash_key_t *key) { - int err; - - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, - (u8 *)key, sizeof(*key)); - if (err) - return err; - + fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, + (u8 *)key, sizeof(*key)); BUILD_BUG_ON(sizeof(*key) != 16); BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); le64_to_cpus(&key->key[0]); le64_to_cpus(&key->key[1]); - return 0; } -int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, - const struct fscrypt_master_key *mk) +void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, + const struct fscrypt_master_key *mk) { - int err; - - err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, - ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, - &ci->ci_dirhash_key); - if (err) - return err; + fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, + &ci->ci_dirhash_key); ci->ci_dirhash_key_initialized = true; - return 0; } void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, @@ -338,17 +324,12 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci, if (mk->mk_ino_hash_key_initialized) goto unlock; - err = fscrypt_derive_siphash_key(mk, - HKDF_CONTEXT_INODE_HASH_KEY, - NULL, 0, &mk->mk_ino_hash_key); - if (err) - goto unlock; + fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY, + NULL, 0, &mk->mk_ino_hash_key); /* pairs with smp_load_acquire() above */ smp_store_release(&mk->mk_ino_hash_key_initialized, true); unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); - if (err) - return err; } /* @@ -402,13 +383,10 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, } else { u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE]; - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - HKDF_CONTEXT_PER_FILE_ENC_KEY, - ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, - derived_key, ci->ci_mode->keysize); - if (err) - return err; - + fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_FILE_ENC_KEY, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, + derived_key, ci->ci_mode->keysize); err = fscrypt_set_per_file_enc_key(ci, derived_key); memzero_explicit(derived_key, ci->ci_mode->keysize); } @@ -416,11 +394,8 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, return err; /* Derive a secret dirhash key for directories that need it. 
 */
-	if (need_dirhash_key) {
-		err = fscrypt_derive_dirhash_key(ci, mk);
-		if (err)
-			return err;
-	}
+	if (need_dirhash_key)
+		fscrypt_derive_dirhash_key(ci, mk);
 
 	return 0;
 }
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 6ad30ae07c065c..cf38b36d7bdfda 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -826,10 +826,8 @@ int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
 		policy->version = FSCRYPT_POLICY_V2;
 		policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
 		policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
-		err = fscrypt_get_test_dummy_key_identifier(
+		fscrypt_get_test_dummy_key_identifier(
 				policy->v2.master_key_identifier);
-		if (err)
-			goto out;
 	} else {
 		err = -EINVAL;
 		goto out;

From 47c7f3b7038787ee42cb9ed69825e6790dba9410 Mon Sep 17 00:00:00 2001
From: Saket Kumar Bhaskar
Date: Thu, 4 Sep 2025 15:38:32 +0530
Subject: [PATCH 0938/3206] powerpc64/bpf: Implement PROBE_MEM32 pseudo
 instructions

Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW]
instructions. They are similar to PROBE_MEM instructions with the
following differences:
- PROBE_MEM32 supports store.
- PROBE_MEM32 relies on the verifier to clear the upper 32 bits of the
  src/dst register.
- PROBE_MEM32 adds the 64-bit kern_vm_start address (which is stored in
  _R26 in the prologue). Due to the bpf_arena construction, such
  _R26 + reg + off16 accesses are guaranteed to be within the arena
  virtual range, so no address check is needed at run-time.
- PROBE_MEM32 allows STX and ST. If they fault, the store is a nop. When
  LDX faults, the destination register is zeroed.

To support these on powerpc, we do tmp1 = _R26 + src/dst reg and then use
tmp1 as the new src/dst register. This allows us to reuse most of the
code for normal [LDX | STX | ST].

Additionally, bpf_jit_emit_probe_mem_store() is introduced to emit
instructions for storing memory values depending on the size (byte,
halfword, word, doubleword).

Stack layout is adjusted to introduce a new NVR (_R26) and to make
BPF_PPC_STACKFRAME quadword aligned (local_tmp_var is increased by 8
bytes).
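For a BPF_W PROBE_MEM32 store, the emitted sequence is roughly the
following (illustrative pseudo-assembly; rD/rS stand for the mapped BPF
dst/src registers, r9 is TMP_REG_1):

	add	r9, rD, r26	/* tmp1 = dst + arena kern_vm_start (_R26) */
	stw	rS, off(r9)	/* the store; if it faults, the extable
				   fixup turns it into a nop */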
Reviewed-by: Hari Bathini Tested-by: Venkat Rao Bagalkote Signed-off-by: Saket Kumar Bhaskar Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250904100835.1100423-2-skb99@linux.ibm.com --- arch/powerpc/net/bpf_jit.h | 5 +- arch/powerpc/net/bpf_jit_comp.c | 10 +- arch/powerpc/net/bpf_jit_comp32.c | 2 +- arch/powerpc/net/bpf_jit_comp64.c | 162 ++++++++++++++++++++++++++---- 4 files changed, 155 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 64c7e9a576fb06..b4a5f58b7e3d48 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -161,9 +161,10 @@ struct codegen_context { unsigned int seen; unsigned int idx; unsigned int stack_size; - int b2p[MAX_BPF_JIT_REG + 2]; + int b2p[MAX_BPF_JIT_REG + 3]; unsigned int exentry_idx; unsigned int alt_exit_addr; + u64 arena_vm_start; }; #define bpf_to_ppc(r) (ctx->b2p[r]) @@ -201,7 +202,7 @@ int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass, struct codegen_context *ctx, int insn_idx, - int jmp_off, int dst_reg); + int jmp_off, int dst_reg, u32 code); #endif diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index c0684733e9d6ac..7d070232159f2b 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -204,6 +204,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* Make sure that the stack is quadword aligned. */ cgctx.stack_size = round_up(fp->aux->stack_depth, 16); + cgctx.arena_vm_start = bpf_arena_get_kern_vm_start(fp->aux->arena); /* Scouting faux-generate pass 0 */ if (bpf_jit_build_body(fp, NULL, NULL, &cgctx, addrs, 0, false)) { @@ -326,7 +327,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) */ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass, struct codegen_context *ctx, int insn_idx, int jmp_off, - int dst_reg) + int dst_reg, u32 code) { off_t offset; unsigned long pc; @@ -355,6 +356,9 @@ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass (ctx->exentry_idx * BPF_FIXUP_LEN * 4); fixup[0] = PPC_RAW_LI(dst_reg, 0); + if (BPF_CLASS(code) == BPF_ST || BPF_CLASS(code) == BPF_STX) + fixup[0] = PPC_RAW_NOP(); + if (IS_ENABLED(CONFIG_PPC32)) fixup[1] = PPC_RAW_LI(dst_reg - 1, 0); /* clear higher 32-bit register too */ @@ -579,7 +583,7 @@ static void bpf_trampoline_setup_tail_call_cnt(u32 *image, struct codegen_contex { if (IS_ENABLED(CONFIG_PPC64)) { /* See bpf_jit_stack_tailcallcnt() */ - int tailcallcnt_offset = 6 * 8; + int tailcallcnt_offset = 7 * 8; EMIT(PPC_RAW_LL(_R3, _R1, func_frame_offset - tailcallcnt_offset)); EMIT(PPC_RAW_STL(_R3, _R1, -tailcallcnt_offset)); @@ -594,7 +598,7 @@ static void bpf_trampoline_restore_tail_call_cnt(u32 *image, struct codegen_cont { if (IS_ENABLED(CONFIG_PPC64)) { /* See bpf_jit_stack_tailcallcnt() */ - int tailcallcnt_offset = 6 * 8; + int tailcallcnt_offset = 7 * 8; EMIT(PPC_RAW_LL(_R3, _R1, -tailcallcnt_offset)); EMIT(PPC_RAW_STL(_R3, _R1, func_frame_offset - tailcallcnt_offset)); diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 0aace304dfe191..3087e744fb2504 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -1087,7 +1087,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code } ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, insn_idx, - jmp_off, 
dst_reg); + jmp_off, dst_reg, code); if (ret) return ret; } diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 025524378443e6..569619f1b31c34 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -25,18 +25,18 @@ * with our redzone usage. * * [ prev sp ] <------------- - * [ nv gpr save area ] 5*8 | + * [ nv gpr save area ] 6*8 | * [ tail_call_cnt ] 8 | - * [ local_tmp_var ] 16 | + * [ local_tmp_var ] 24 | * fp (r31) --> [ ebpf stack space ] upto 512 | * [ frame header ] 32/112 | * sp (r1) ---> [ stack pointer ] -------------- */ /* for gpr non volatile registers BPG_REG_6 to 10 */ -#define BPF_PPC_STACK_SAVE (5*8) +#define BPF_PPC_STACK_SAVE (6*8) /* for bpf JIT code internal usage */ -#define BPF_PPC_STACK_LOCALS 24 +#define BPF_PPC_STACK_LOCALS 32 /* stack frame excluding BPF stack, ensure this is quadword aligned */ #define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + \ BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE) @@ -44,6 +44,7 @@ /* BPF register usage */ #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) +#define ARENA_VM_START (MAX_BPF_JIT_REG + 2) /* BPF to ppc register mappings */ void bpf_jit_init_reg_mapping(struct codegen_context *ctx) @@ -67,10 +68,12 @@ void bpf_jit_init_reg_mapping(struct codegen_context *ctx) ctx->b2p[BPF_REG_AX] = _R12; ctx->b2p[TMP_REG_1] = _R9; ctx->b2p[TMP_REG_2] = _R10; + /* non volatile register for kern_vm_start address */ + ctx->b2p[ARENA_VM_START] = _R26; } -/* PPC NVR range -- update this if we ever use NVRs below r27 */ -#define BPF_PPC_NVR_MIN _R27 +/* PPC NVR range -- update this if we ever use NVRs below r26 */ +#define BPF_PPC_NVR_MIN _R26 static inline bool bpf_has_stack_frame(struct codegen_context *ctx) { @@ -89,9 +92,9 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) * [ prev sp ] <------------- * [ ... 
] | * sp (r1) ---> [ stack pointer ] -------------- - * [ nv gpr save area ] 5*8 + * [ nv gpr save area ] 6*8 * [ tail_call_cnt ] 8 - * [ local_tmp_var ] 16 + * [ local_tmp_var ] 24 * [ unused red zone ] 224 */ static int bpf_jit_stack_local(struct codegen_context *ctx) @@ -99,12 +102,12 @@ static int bpf_jit_stack_local(struct codegen_context *ctx) if (bpf_has_stack_frame(ctx)) return STACK_FRAME_MIN_SIZE + ctx->stack_size; else - return -(BPF_PPC_STACK_SAVE + 24); + return -(BPF_PPC_STACK_SAVE + 32); } static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx) { - return bpf_jit_stack_local(ctx) + 16; + return bpf_jit_stack_local(ctx) + 24; } static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) @@ -170,10 +173,17 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) if (bpf_is_seen_register(ctx, bpf_to_ppc(i))) EMIT(PPC_RAW_STD(bpf_to_ppc(i), _R1, bpf_jit_stack_offsetof(ctx, bpf_to_ppc(i)))); + if (ctx->arena_vm_start) + EMIT(PPC_RAW_STD(bpf_to_ppc(ARENA_VM_START), _R1, + bpf_jit_stack_offsetof(ctx, bpf_to_ppc(ARENA_VM_START)))); + /* Setup frame pointer to point to the bpf stack area */ if (bpf_is_seen_register(ctx, bpf_to_ppc(BPF_REG_FP))) EMIT(PPC_RAW_ADDI(bpf_to_ppc(BPF_REG_FP), _R1, STACK_FRAME_MIN_SIZE + ctx->stack_size)); + + if (ctx->arena_vm_start) + PPC_LI64(bpf_to_ppc(ARENA_VM_START), ctx->arena_vm_start); } static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx) @@ -185,6 +195,10 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx if (bpf_is_seen_register(ctx, bpf_to_ppc(i))) EMIT(PPC_RAW_LD(bpf_to_ppc(i), _R1, bpf_jit_stack_offsetof(ctx, bpf_to_ppc(i)))); + if (ctx->arena_vm_start) + EMIT(PPC_RAW_LD(bpf_to_ppc(ARENA_VM_START), _R1, + bpf_jit_stack_offsetof(ctx, bpf_to_ppc(ARENA_VM_START)))); + /* Tear down our stack frame */ if (bpf_has_stack_frame(ctx)) { EMIT(PPC_RAW_ADDI(_R1, _R1, BPF_PPC_STACKFRAME + ctx->stack_size)); @@ -396,11 +410,11 @@ void bpf_stf_barrier(void); asm ( " .global bpf_stf_barrier ;" " bpf_stf_barrier: ;" -" std 21,-64(1) ;" -" std 22,-56(1) ;" +" std 21,-80(1) ;" +" std 22,-72(1) ;" " sync ;" -" ld 21,-64(1) ;" -" ld 22,-56(1) ;" +" ld 21,-80(1) ;" +" ld 22,-72(1) ;" " ori 31,31,0 ;" " .rept 14 ;" " b 1f ;" @@ -409,6 +423,36 @@ asm ( " blr ;" ); +static int bpf_jit_emit_probe_mem_store(struct codegen_context *ctx, u32 src_reg, s16 off, + u32 code, u32 *image) +{ + u32 tmp1_reg = bpf_to_ppc(TMP_REG_1); + u32 tmp2_reg = bpf_to_ppc(TMP_REG_2); + + switch (BPF_SIZE(code)) { + case BPF_B: + EMIT(PPC_RAW_STB(src_reg, tmp1_reg, off)); + break; + case BPF_H: + EMIT(PPC_RAW_STH(src_reg, tmp1_reg, off)); + break; + case BPF_W: + EMIT(PPC_RAW_STW(src_reg, tmp1_reg, off)); + break; + case BPF_DW: + if (off % 4) { + EMIT(PPC_RAW_LI(tmp2_reg, off)); + EMIT(PPC_RAW_STDX(src_reg, tmp1_reg, tmp2_reg)); + } else { + EMIT(PPC_RAW_STD(src_reg, tmp1_reg, off)); + } + break; + default: + return -EINVAL; + } + return 0; +} + static int emit_atomic_ld_st(const struct bpf_insn insn, struct codegen_context *ctx, u32 *image) { u32 code = insn.code; @@ -960,6 +1004,50 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code } break; + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + + EMIT(PPC_RAW_ADD(tmp1_reg, dst_reg, bpf_to_ppc(ARENA_VM_START))); + + ret = bpf_jit_emit_probe_mem_store(ctx, src_reg, off, code, image); + if (ret) + 
return ret;
+
+			ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx,
+						    ctx->idx - 1, 4, -1, code);
+			if (ret)
+				return ret;
+
+			break;
+
+		case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
+		case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
+		case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
+		case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
+
+			EMIT(PPC_RAW_ADD(tmp1_reg, dst_reg, bpf_to_ppc(ARENA_VM_START)));
+
+			if (BPF_SIZE(code) == BPF_W || BPF_SIZE(code) == BPF_DW) {
+				PPC_LI32(tmp2_reg, imm);
+				src_reg = tmp2_reg;
+			} else {
+				EMIT(PPC_RAW_LI(tmp2_reg, imm));
+				src_reg = tmp2_reg;
+			}
+
+			ret = bpf_jit_emit_probe_mem_store(ctx, src_reg, off, code, image);
+			if (ret)
+				return ret;
+
+			ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx,
+						    ctx->idx - 1, 4, -1, code);
+			if (ret)
+				return ret;
+
+			break;
+
 		/*
 		 * BPF_STX ATOMIC (atomic ops)
 		 */
@@ -1112,9 +1200,10 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
 			 * Check if 'off' is word aligned for BPF_DW, because
 			 * we might generate two instructions.
 			 */
-			if ((BPF_SIZE(code) == BPF_DW ||
-			    (BPF_SIZE(code) == BPF_B && BPF_MODE(code) == BPF_PROBE_MEMSX)) &&
-					(off & 3))
+			if ((BPF_SIZE(code) == BPF_DW && (off & 3)) ||
+			    (BPF_SIZE(code) == BPF_B &&
+			     BPF_MODE(code) == BPF_PROBE_MEMSX) ||
+			    (BPF_SIZE(code) == BPF_B && BPF_MODE(code) == BPF_MEMSX))
 				PPC_JMP((ctx->idx + 3) * 4);
 			else
 				PPC_JMP((ctx->idx + 2) * 4);
@@ -1160,12 +1249,49 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
 
 			if (BPF_MODE(code) == BPF_PROBE_MEM) {
 				ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx,
-							    ctx->idx - 1, 4, dst_reg);
+							    ctx->idx - 1, 4, dst_reg, code);
 				if (ret)
 					return ret;
 			}
 			break;
 
+		/* dst = *(u64 *)(ul) (src + ARENA_VM_START + off) */
+		case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
+		case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
+		case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
+		case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
+
+			EMIT(PPC_RAW_ADD(tmp1_reg, src_reg, bpf_to_ppc(ARENA_VM_START)));
+
+			switch (size) {
+			case BPF_B:
+				EMIT(PPC_RAW_LBZ(dst_reg, tmp1_reg, off));
+				break;
+			case BPF_H:
+				EMIT(PPC_RAW_LHZ(dst_reg, tmp1_reg, off));
+				break;
+			case BPF_W:
+				EMIT(PPC_RAW_LWZ(dst_reg, tmp1_reg, off));
+				break;
+			case BPF_DW:
+				if (off % 4) {
+					EMIT(PPC_RAW_LI(tmp2_reg, off));
+					EMIT(PPC_RAW_LDX(dst_reg, tmp1_reg, tmp2_reg));
+				} else {
+					EMIT(PPC_RAW_LD(dst_reg, tmp1_reg, off));
+				}
+				break;
+			}
+
+			if (size != BPF_DW && insn_is_zext(&insn[i + 1]))
+				addrs[++i] = ctx->idx * 4;
+
+			ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx,
+						    ctx->idx - 1, 4, dst_reg, code);
+			if (ret)
+				return ret;
+			break;
+
 		/*
 		 * Doubleword load
 		 * 16 byte instruction that uses two 'struct bpf_insn'

From a2485d06cad3741303b9414e44f9b6d76d3713f2 Mon Sep 17 00:00:00 2001
From: Saket Kumar Bhaskar
Date: Thu, 4 Sep 2025 15:38:33 +0530
Subject: [PATCH 0939/3206] powerpc64/bpf: Implement bpf_addr_space_cast
 instruction

LLVM generates the bpf_addr_space_cast instruction while translating
pointers between the native (zero) address space and
__attribute__((address_space(N))). The addr_space=1 is reserved as the
bpf_arena address space.

rY = addr_space_cast(rX, 0, 1) is processed by the verifier and
converted to a normal 32-bit move: wX = wY.

rY = addr_space_cast(rX, 1, 0): used to convert a bpf arena pointer to
a pointer in the userspace vma. This has to be converted by the JIT.

PPC_RAW_RLDICL_DOT, a variant of PPC_RAW_RLDICL, is introduced to also
set the condition register.
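The emitted sequence is roughly the following (illustrative
pseudo-assembly; PPC_LI64 really expands to several instructions):

	rldicl.	r9, rS, 0, 32	/* r9 = low 32 bits of src, sets CR0 */
	li64	rD, user_vm_start & 0xffffffff00000000
	beq	1f		/* low 32 bits were zero: dst becomes 0 */
	or	r9, rD, r9	/* merge upper base bits with arena offset */
1:	mr	rD, r9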
Reviewed-by: Hari Bathini Tested-by: Venkat Rao Bagalkote Signed-off-by: Saket Kumar Bhaskar Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250904100835.1100423-3-skb99@linux.ibm.com --- arch/powerpc/include/asm/ppc-opcode.h | 1 + arch/powerpc/net/bpf_jit.h | 1 + arch/powerpc/net/bpf_jit_comp.c | 6 ++++++ arch/powerpc/net/bpf_jit_comp64.c | 10 ++++++++++ 4 files changed, 18 insertions(+) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 8053b24afc3956..55ca49d183196f 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -571,6 +571,7 @@ (0x54000001 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH(i) | __PPC_MB(mb) | __PPC_ME(me)) #define PPC_RAW_RLWIMI(d, a, i, mb, me) (0x50000000 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH(i) | __PPC_MB(mb) | __PPC_ME(me)) #define PPC_RAW_RLDICL(d, a, i, mb) (0x78000000 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH64(i) | __PPC_MB64(mb)) +#define PPC_RAW_RLDICL_DOT(d, a, i, mb) (0x78000000 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH64(i) | __PPC_MB64(mb) | 0x1) #define PPC_RAW_RLDICR(d, a, i, me) (0x78000004 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH64(i) | __PPC_ME64(me)) /* slwi = rlwinm Rx, Ry, n, 0, 31-n */ diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index b4a5f58b7e3d48..8334cd667bba1e 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -165,6 +165,7 @@ struct codegen_context { unsigned int exentry_idx; unsigned int alt_exit_addr; u64 arena_vm_start; + u64 user_vm_start; }; #define bpf_to_ppc(r) (ctx->b2p[r]) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 7d070232159f2b..cfa84cab0a1814 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -205,6 +205,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* Make sure that the stack is quadword aligned. 
*/ cgctx.stack_size = round_up(fp->aux->stack_depth, 16); cgctx.arena_vm_start = bpf_arena_get_kern_vm_start(fp->aux->arena); + cgctx.user_vm_start = bpf_arena_get_user_vm_start(fp->aux->arena); /* Scouting faux-generate pass 0 */ if (bpf_jit_build_body(fp, NULL, NULL, &cgctx, addrs, 0, false)) { @@ -439,6 +440,11 @@ bool bpf_jit_supports_kfunc_call(void) return true; } +bool bpf_jit_supports_arena(void) +{ + return IS_ENABLED(CONFIG_PPC64); +} + bool bpf_jit_supports_far_kfunc_call(void) { return IS_ENABLED(CONFIG_PPC64); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 569619f1b31c34..76efb47f02a675 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -812,6 +812,16 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code */ case BPF_ALU | BPF_MOV | BPF_X: /* (u32) dst = src */ case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ + + if (insn_is_cast_user(&insn[i])) { + EMIT(PPC_RAW_RLDICL_DOT(tmp1_reg, src_reg, 0, 32)); + PPC_LI64(dst_reg, (ctx->user_vm_start & 0xffffffff00000000UL)); + PPC_BCC_SHORT(COND_EQ, (ctx->idx + 2) * 4); + EMIT(PPC_RAW_OR(tmp1_reg, dst_reg, tmp1_reg)); + EMIT(PPC_RAW_MR(dst_reg, tmp1_reg)); + break; + } + if (imm == 1) { /* special mov32 for zext */ EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 0, 31)); From 45ed2e8b0591eb6211d79f436f76c3af31e626af Mon Sep 17 00:00:00 2001 From: Saket Kumar Bhaskar Date: Thu, 4 Sep 2025 15:38:34 +0530 Subject: [PATCH 0940/3206] powerpc64/bpf: Introduce bpf_jit_emit_atomic_ops() to emit atomic instructions The existing code for emitting bpf atomic instruction sequences for atomic operations such as XCHG, CMPXCHG, ADD, AND, OR, and XOR has been refactored into a reusable function, bpf_jit_emit_ppc_atomic_op(). It also computes the jump offset and tracks the instruction index for jited LDARX/LWARX to be used in case it causes a fault. Reviewed-by: Hari Bathini Tested-by: Venkat Rao Bagalkote Signed-off-by: Saket Kumar Bhaskar Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250904100835.1100423-4-skb99@linux.ibm.com --- arch/powerpc/net/bpf_jit_comp64.c | 203 +++++++++++++++++------------- 1 file changed, 115 insertions(+), 88 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 76efb47f02a675..cb4d1e95496132 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -423,6 +423,111 @@ asm ( " blr ;" ); +static int bpf_jit_emit_atomic_ops(u32 *image, struct codegen_context *ctx, + const struct bpf_insn *insn, u32 *jmp_off, + u32 *tmp_idx, u32 *addrp) +{ + u32 tmp1_reg = bpf_to_ppc(TMP_REG_1); + u32 tmp2_reg = bpf_to_ppc(TMP_REG_2); + u32 size = BPF_SIZE(insn->code); + u32 src_reg = bpf_to_ppc(insn->src_reg); + u32 dst_reg = bpf_to_ppc(insn->dst_reg); + s32 imm = insn->imm; + + u32 save_reg = tmp2_reg; + u32 ret_reg = src_reg; + u32 fixup_idx; + + /* Get offset into TMP_REG_1 */ + EMIT(PPC_RAW_LI(tmp1_reg, insn->off)); + /* + * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' + * before and after the operation. + * + * This is a requirement in the Linux Kernel Memory Model. + * See __cmpxchg_u64() in asm/cmpxchg.h as an example. 
+ */ + if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); + + *tmp_idx = ctx->idx; + + /* load value from memory into TMP_REG_2 */ + if (size == BPF_DW) + EMIT(PPC_RAW_LDARX(tmp2_reg, tmp1_reg, dst_reg, 0)); + else + EMIT(PPC_RAW_LWARX(tmp2_reg, tmp1_reg, dst_reg, 0)); + /* Save old value in _R0 */ + if (imm & BPF_FETCH) + EMIT(PPC_RAW_MR(_R0, tmp2_reg)); + + switch (imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + EMIT(PPC_RAW_ADD(tmp2_reg, tmp2_reg, src_reg)); + break; + case BPF_AND: + case BPF_AND | BPF_FETCH: + EMIT(PPC_RAW_AND(tmp2_reg, tmp2_reg, src_reg)); + break; + case BPF_OR: + case BPF_OR | BPF_FETCH: + EMIT(PPC_RAW_OR(tmp2_reg, tmp2_reg, src_reg)); + break; + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + EMIT(PPC_RAW_XOR(tmp2_reg, tmp2_reg, src_reg)); + break; + case BPF_CMPXCHG: + /* + * Return old value in BPF_REG_0 for BPF_CMPXCHG & + * in src_reg for other cases. + */ + ret_reg = bpf_to_ppc(BPF_REG_0); + + /* Compare with old value in BPF_R0 */ + if (size == BPF_DW) + EMIT(PPC_RAW_CMPD(bpf_to_ppc(BPF_REG_0), tmp2_reg)); + else + EMIT(PPC_RAW_CMPW(bpf_to_ppc(BPF_REG_0), tmp2_reg)); + /* Don't set if different from old value */ + PPC_BCC_SHORT(COND_NE, (ctx->idx + 3) * 4); + fallthrough; + case BPF_XCHG: + save_reg = src_reg; + break; + default: + return -EOPNOTSUPP; + } + + /* store new value */ + if (size == BPF_DW) + EMIT(PPC_RAW_STDCX(save_reg, tmp1_reg, dst_reg)); + else + EMIT(PPC_RAW_STWCX(save_reg, tmp1_reg, dst_reg)); + /* we're done if this succeeded */ + PPC_BCC_SHORT(COND_NE, *tmp_idx * 4); + fixup_idx = ctx->idx; + + if (imm & BPF_FETCH) { + /* Emit 'sync' to enforce full ordering */ + if (IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); + EMIT(PPC_RAW_MR(ret_reg, _R0)); + /* + * Skip unnecessary zero-extension for 32-bit cmpxchg. + * For context, see commit 39491867ace5. + */ + if (size != BPF_DW && imm == BPF_CMPXCHG && + insn_is_zext(insn + 1)) + *addrp = ctx->idx * 4; + } + + *jmp_off = (fixup_idx - *tmp_idx) * 4; + + return 0; +} + static int bpf_jit_emit_probe_mem_store(struct codegen_context *ctx, u32 src_reg, s16 off, u32 code, u32 *image) { @@ -538,7 +643,6 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code u32 size = BPF_SIZE(code); u32 tmp1_reg = bpf_to_ppc(TMP_REG_1); u32 tmp2_reg = bpf_to_ppc(TMP_REG_2); - u32 save_reg, ret_reg; s16 off = insn[i].off; s32 imm = insn[i].imm; bool func_addr_fixed; @@ -546,6 +650,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code u64 imm64; u32 true_cond; u32 tmp_idx; + u32 jmp_off; /* * addrs[] maps a BPF bytecode address into a real offset from @@ -1080,93 +1185,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code return -EOPNOTSUPP; } - save_reg = tmp2_reg; - ret_reg = src_reg; - - /* Get offset into TMP_REG_1 */ - EMIT(PPC_RAW_LI(tmp1_reg, off)); - /* - * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' - * before and after the operation. - * - * This is a requirement in the Linux Kernel Memory Model. - * See __cmpxchg_u64() in asm/cmpxchg.h as an example. 
- */ - if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) - EMIT(PPC_RAW_SYNC()); - tmp_idx = ctx->idx * 4; - /* load value from memory into TMP_REG_2 */ - if (size == BPF_DW) - EMIT(PPC_RAW_LDARX(tmp2_reg, tmp1_reg, dst_reg, 0)); - else - EMIT(PPC_RAW_LWARX(tmp2_reg, tmp1_reg, dst_reg, 0)); - - /* Save old value in _R0 */ - if (imm & BPF_FETCH) - EMIT(PPC_RAW_MR(_R0, tmp2_reg)); - - switch (imm) { - case BPF_ADD: - case BPF_ADD | BPF_FETCH: - EMIT(PPC_RAW_ADD(tmp2_reg, tmp2_reg, src_reg)); - break; - case BPF_AND: - case BPF_AND | BPF_FETCH: - EMIT(PPC_RAW_AND(tmp2_reg, tmp2_reg, src_reg)); - break; - case BPF_OR: - case BPF_OR | BPF_FETCH: - EMIT(PPC_RAW_OR(tmp2_reg, tmp2_reg, src_reg)); - break; - case BPF_XOR: - case BPF_XOR | BPF_FETCH: - EMIT(PPC_RAW_XOR(tmp2_reg, tmp2_reg, src_reg)); - break; - case BPF_CMPXCHG: - /* - * Return old value in BPF_REG_0 for BPF_CMPXCHG & - * in src_reg for other cases. - */ - ret_reg = bpf_to_ppc(BPF_REG_0); - - /* Compare with old value in BPF_R0 */ - if (size == BPF_DW) - EMIT(PPC_RAW_CMPD(bpf_to_ppc(BPF_REG_0), tmp2_reg)); - else - EMIT(PPC_RAW_CMPW(bpf_to_ppc(BPF_REG_0), tmp2_reg)); - /* Don't set if different from old value */ - PPC_BCC_SHORT(COND_NE, (ctx->idx + 3) * 4); - fallthrough; - case BPF_XCHG: - save_reg = src_reg; - break; - default: - pr_err_ratelimited( - "eBPF filter atomic op code %02x (@%d) unsupported\n", - code, i); - return -EOPNOTSUPP; - } - - /* store new value */ - if (size == BPF_DW) - EMIT(PPC_RAW_STDCX(save_reg, tmp1_reg, dst_reg)); - else - EMIT(PPC_RAW_STWCX(save_reg, tmp1_reg, dst_reg)); - /* we're done if this succeeded */ - PPC_BCC_SHORT(COND_NE, tmp_idx); - - if (imm & BPF_FETCH) { - /* Emit 'sync' to enforce full ordering */ - if (IS_ENABLED(CONFIG_SMP)) - EMIT(PPC_RAW_SYNC()); - EMIT(PPC_RAW_MR(ret_reg, _R0)); - /* - * Skip unnecessary zero-extension for 32-bit cmpxchg. - * For context, see commit 39491867ace5. - */ - if (size != BPF_DW && imm == BPF_CMPXCHG && - insn_is_zext(&insn[i + 1])) - addrs[++i] = ctx->idx * 4; + ret = bpf_jit_emit_atomic_ops(image, ctx, &insn[i], + &jmp_off, &tmp_idx, &addrs[i + 1]); + if (ret) { + if (ret == -EOPNOTSUPP) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + } + return ret; } break; From 0c1da35b0188dd565cec907a16cb5d1bd425e0e4 Mon Sep 17 00:00:00 2001 From: Saket Kumar Bhaskar Date: Thu, 4 Sep 2025 15:38:35 +0530 Subject: [PATCH 0941/3206] powerpc64/bpf: Implement PROBE_ATOMIC instructions powerpc supports BPF atomic operations using a loop around Load-And-Reserve(LDARX/LWARX) and Store-Conditional(STDCX/STWCX) instructions gated by sync instructions to enforce full ordering. To implement arena_atomics, arena vm start address is added to the dst_reg to be used for both the LDARX/LWARX and STDCX/STWCX instructions. Further, an exception table entry is added for LDARX/LWARX instruction to land after the loop on fault. At the end of sequence, dst_reg is restored by subtracting arena vm start address. bpf_jit_supports_insn() is introduced to selectively enable instruction support as in other architectures like x86 and arm64. 
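The emitted sequence is roughly the following (illustrative sketch; register
names follow the JIT's tmp/dst/src convention, and the sync instructions are
only emitted for BPF_FETCH variants on SMP):

	add	dst, dst, arena_vm_start	/* arena offset -> kernel VA */
	li	tmp1, off
	sync					/* BPF_FETCH only */
retry:	ldarx	tmp2, tmp1, dst			/* extable entry points here */
	<op>	tmp2, tmp2, src			/* add/and/or/xor/... */
	stdcx.	tmp2, tmp1, dst
	bne-	retry				/* reservation lost, retry */
	sync					/* BPF_FETCH only */
	sub	dst, dst, arena_vm_start	/* restore dst for later insns */

On a fault in the ldarx/lwarx, the exception table entry transfers control
to just past the loop, so dst is still restored before the next instruction.
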
Reviewed-by: Hari Bathini Tested-by: Venkat Rao Bagalkote Signed-off-by: Saket Kumar Bhaskar Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250904100835.1100423-5-skb99@linux.ibm.com --- arch/powerpc/net/bpf_jit_comp.c | 16 ++++++++++++++++ arch/powerpc/net/bpf_jit_comp64.c | 26 ++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index cfa84cab0a1814..88ad5ba7b87fd0 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -450,6 +450,22 @@ bool bpf_jit_supports_far_kfunc_call(void) return IS_ENABLED(CONFIG_PPC64); } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_H: + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (bpf_atomic_is_load_store(insn)) + return false; + return IS_ENABLED(CONFIG_PPC64); + } + return true; +} + void *arch_alloc_bpf_trampoline(unsigned int size) { return bpf_prog_pack_alloc(size, bpf_jit_fill_ill_insns); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index cb4d1e95496132..1fe37128c87640 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -1163,6 +1163,32 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code break; + /* + * BPF_STX PROBE_ATOMIC (arena atomic ops) + */ + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: + EMIT(PPC_RAW_ADD(dst_reg, dst_reg, bpf_to_ppc(ARENA_VM_START))); + ret = bpf_jit_emit_atomic_ops(image, ctx, &insn[i], + &jmp_off, &tmp_idx, &addrs[i + 1]); + if (ret) { + if (ret == -EOPNOTSUPP) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + } + return ret; + } + /* LDARX/LWARX should land here on exception. */ + ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, + tmp_idx, jmp_off, dst_reg, code); + if (ret) + return ret; + + /* Retrieve the dst_reg */ + EMIT(PPC_RAW_SUB(dst_reg, dst_reg, bpf_to_ppc(ARENA_VM_START))); + break; + /* * BPF_STX ATOMIC (atomic ops) */ From a41de5ccff85a0fe45ea1b4078230bf53faa7dc4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 1 Jul 2025 12:50:07 +0200 Subject: [PATCH 0942/3206] arch/powerpc: Remove support for older GCC and binutils Commit 118c40b7b503 ("kbuild: require gcc-8 and binutils-2.30") raised minimum GCC_VERSION and LD_VERSION. Simplify powerpc build accordingly. 
Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/b6b94cba7492c8581e8d5d25b15962e5ad7a37c2.1751366979.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 2 +- arch/powerpc/boot/wrapper | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 93402a1d9c9fc6..ba9750c8e0cd8b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -248,7 +248,7 @@ config PPC select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER if !COMPILE_TEST && (PPC64 || (PPC32 && CC_IS_GCC)) - select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC + select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO select HAVE_HARDLOCKUP_DETECTOR_ARCH if PPC_BOOK3S_64 && SMP select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 3d8dc822282ac8..a75baefd1cffba 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -226,11 +226,7 @@ ld_is_lld() # Do not include PT_INTERP segment when linking pie. Non-pie linking # just ignores this option. -LD_VERSION=$(${CROSS}ld --version | ld_version) -LD_NO_DL_MIN_VERSION=$(echo 2.26 | ld_version) -if [ "$LD_VERSION" -ge "$LD_NO_DL_MIN_VERSION" ] ; then - nodl="--no-dynamic-linker" -fi +nodl="--no-dynamic-linker" # suppress some warnings in recent ld versions nowarn="-z noexecstack" From 96c79fef228157baddae6574555353535772f109 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 18 Aug 2025 11:24:22 +0200 Subject: [PATCH 0943/3206] powerpc/8xx: Remove offset in SPRN_M_TWB SPRN_M_TWB contains the address of task PGD minus an offset which compensates the offset required when accessing the kernel PGDIR. However, since commit ac9f97ff8b32 ("powerpc/8xx: Inconditionally use task PGDIR in DTLB misses") and commit 33c527522f39 ("powerpc/8xx: Inconditionally use task PGDIR in ITLB misses") kernel PGDIR is not used anymore in hot paths. 
Remove this offset which was added by commit fde5a9057fcf ("powerpc/8xx: Optimise access to swapper_pg_dir") Signed-off-by: Christophe Leroy [maddy: Fixed checkpatch.pl warning for "pathes"] Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/9710d960b512996e64beebfd368cfeaadb28b3ba.1755509047.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_8xx.S | 16 +++++++++------- arch/powerpc/mm/nohash/mmu_context.c | 10 +--------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 56c5ebe21b99a4..78942fd6b4b97c 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -190,7 +190,7 @@ instruction_counter: INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11) mtspr SPRN_MD_EPN, r10 mfspr r10, SPRN_M_TWB /* Get level 1 table */ - lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */ + lwz r11, 0(r10) /* Get level 1 entry */ mtspr SPRN_MD_TWC, r11 mfspr r10, SPRN_MD_TWC lwz r10, 0(r10) /* Get the pte */ @@ -233,7 +233,7 @@ instruction_counter: */ mfspr r10, SPRN_MD_EPN mfspr r10, SPRN_M_TWB /* Get level 1 table */ - lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */ + lwz r11, 0(r10) /* Get level 1 entry */ mtspr SPRN_MD_TWC, r11 mfspr r10, SPRN_MD_TWC @@ -375,7 +375,7 @@ FixupPGD: mfspr r10, SPRN_DAR mtspr SPRN_MD_EPN, r10 mfspr r11, SPRN_M_TWB /* Get level 1 table */ - lwz r10, (swapper_pg_dir - PAGE_OFFSET)@l(r11) /* Get the level 1 entry */ + lwz r10, 0(r11) /* Get the level 1 entry */ cmpwi cr1, r10, 0 bne cr1, 1f @@ -384,7 +384,7 @@ FixupPGD: lwz r10, (swapper_pg_dir - PAGE_OFFSET)@l(r10) /* Get the level 1 entry */ cmpwi cr1, r10, 0 beq cr1, 1f - stw r10, (swapper_pg_dir - PAGE_OFFSET)@l(r11) /* Set the level 1 entry */ + stw r10, 0(r11) /* Set the level 1 entry */ mfspr r10, SPRN_M_TW mtcr r10 mfspr r10, SPRN_SPRG_SCRATCH0 @@ -412,9 +412,10 @@ FixupDAR:/* Entry point for dcbx workaround. */ tophys(r11, r10) mfspr r11, SPRN_M_TWB /* Get level 1 table */ rlwinm r11, r11, 0, 20, 31 - oris r11, r11, (swapper_pg_dir - PAGE_OFFSET)@ha + oris r11, r11, (swapper_pg_dir - PAGE_OFFSET)@h + ori r11, r11, (swapper_pg_dir - PAGE_OFFSET)@l 3: - lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11) /* Get the level 1 entry */ + lwz r11, 0(r11) /* Get the level 1 entry */ rlwinm r11, r11, 0, ~_PMD_PAGE_8M mtspr SPRN_MD_TWC, r11 mfspr r11, SPRN_MD_TWC @@ -535,7 +536,8 @@ start_here: li r0,0 stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) - lis r6, swapper_pg_dir@ha + lis r6, swapper_pg_dir@h + ori r6, r6, swapper_pg_dir@l tophys(r6,r6) mtspr SPRN_M_TWB, r6 diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c index a1a4e697251aa6..28a96a10c90778 100644 --- a/arch/powerpc/mm/nohash/mmu_context.c +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -203,15 +203,7 @@ static unsigned int steal_context_up(unsigned int id) static void set_context(unsigned long id, pgd_t *pgd) { if (IS_ENABLED(CONFIG_PPC_8xx)) { - s16 offset = (s16)(__pa(swapper_pg_dir)); - - /* - * Register M_TWB will contain base address of level 1 table minus the - * lower part of the kernel PGDIR base address, so that all accesses to - * level 1 table are done relative to lower part of kernel PGDIR base - * address. 
- */ - mtspr(SPRN_M_TWB, __pa(pgd) - offset); + mtspr(SPRN_M_TWB, __pa(pgd)); /* Update context */ mtspr(SPRN_M_CASID, id - 1); From 58f5382a482ba7313085de57767bb784f211b475 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 18 Aug 2025 11:26:44 +0200 Subject: [PATCH 0944/3206] powerpc/vdso: Include asm/syscalls.h for sys_ni_syscall() Include asm/syscalls.h to get the correct prototype for sys_ni_syscall() Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/e2215a515ae0e21393c50e2f38791a6567cf1dec.1755509195.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/vdso.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 219d67bcf747e7..ab7c4cc80943ce 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -40,8 +41,6 @@ static_assert(__VDSO_PAGES == VDSO_NR_PAGES); extern char vdso32_start, vdso32_end; extern char vdso64_start, vdso64_end; -long sys_ni_syscall(void); - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma, unsigned long text_size) { From 7f9bcf13069731fac48d8b44086fab179fbc04c9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 18 Aug 2025 14:14:36 +0200 Subject: [PATCH 0945/3206] powerpc/cpm2: Drop legacy-of-mm-gpiochip.h header Remove legacy-of-mm-gpiochip.h header file. The above mentioned file provides an OF API that's deprecated. There is no agnostic alternatives to it and we have to open code the logic which was hidden behind of_mm_gpiochip_add_data(). Note, most of the GPIO drivers are using their own labeling schemas and resource retrieval that only a few may gain of the code deduplication, so whenever alternative is appear we can move drivers again to use that one. As a side effect this change fixes a potential memory leak on an error path, if of_mm_gpiochip_add_data() fails. [text copied from commit 34064c8267a6 ("powerpc/8xx: Drop legacy-of-mm-gpiochip.h header")] Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/2662f24c539db393f11b27f0feae2dc14bb2f08f.1755518891.git.christophe.leroy@csgroup.eu --- arch/powerpc/platforms/8xx/Kconfig | 1 - arch/powerpc/platforms/Kconfig | 1 - arch/powerpc/sysdev/cpm_common.c | 56 +++++++++++++++--------------- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/platforms/8xx/Kconfig b/arch/powerpc/platforms/8xx/Kconfig index 8623aebfac482f..abb2b45b2789eb 100644 --- a/arch/powerpc/platforms/8xx/Kconfig +++ b/arch/powerpc/platforms/8xx/Kconfig @@ -101,7 +101,6 @@ comment "Generic MPC8xx Options" config 8xx_GPIO bool "GPIO API Support" select GPIOLIB - select OF_GPIO_MM_GPIOCHIP help Saying Y here will cause the ports on an MPC8xx processor to be used with the GPIO API. If you say N here, the kernel needs less memory. diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index fea3766eac0f31..364eef32ddcc0f 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -243,7 +243,6 @@ config CPM2 select CPM select HAVE_PCI select GPIOLIB - select OF_GPIO_MM_GPIOCHIP help The CPM2 (Communications Processor Module) is a coprocessor on embedded CPUs made by Freescale. 
Selecting this option means that diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c index f469f6a9f6e0ff..07ea605ab0e628 100644 --- a/arch/powerpc/sysdev/cpm_common.c +++ b/arch/powerpc/sysdev/cpm_common.c @@ -28,10 +28,6 @@ #include -#if defined(CONFIG_CPM2) || defined(CONFIG_8xx_GPIO) -#include -#endif - static int __init cpm_init(void) { struct device_node *np; @@ -91,32 +87,33 @@ void __init udbg_init_cpm(void) #if defined(CONFIG_CPM2) || defined(CONFIG_8xx_GPIO) +#include + struct cpm2_ioports { u32 dir, par, sor, odr, dat; u32 res[3]; }; struct cpm2_gpio32_chip { - struct of_mm_gpio_chip mm_gc; + struct gpio_chip gc; + void __iomem *regs; spinlock_t lock; /* shadowed data register to clear/set bits safely */ u32 cpdata; }; -static void cpm2_gpio32_save_regs(struct of_mm_gpio_chip *mm_gc) +static void cpm2_gpio32_save_regs(struct cpm2_gpio32_chip *cpm2_gc) { - struct cpm2_gpio32_chip *cpm2_gc = - container_of(mm_gc, struct cpm2_gpio32_chip, mm_gc); - struct cpm2_ioports __iomem *iop = mm_gc->regs; + struct cpm2_ioports __iomem *iop = cpm2_gc->regs; cpm2_gc->cpdata = in_be32(&iop->dat); } static int cpm2_gpio32_get(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm2_ioports __iomem *iop = mm_gc->regs; + struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); + struct cpm2_ioports __iomem *iop = cpm2_gc->regs; u32 pin_mask; pin_mask = 1 << (31 - gpio); @@ -124,11 +121,9 @@ static int cpm2_gpio32_get(struct gpio_chip *gc, unsigned int gpio) return !!(in_be32(&iop->dat) & pin_mask); } -static void __cpm2_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask, - int value) +static void __cpm2_gpio32_set(struct cpm2_gpio32_chip *cpm2_gc, u32 pin_mask, int value) { - struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(&mm_gc->gc); - struct cpm2_ioports __iomem *iop = mm_gc->regs; + struct cpm2_ioports __iomem *iop = cpm2_gc->regs; if (value) cpm2_gc->cpdata |= pin_mask; @@ -140,14 +135,13 @@ static void __cpm2_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask, static int cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); unsigned long flags; u32 pin_mask = 1 << (31 - gpio); spin_lock_irqsave(&cpm2_gc->lock, flags); - __cpm2_gpio32_set(mm_gc, pin_mask, value); + __cpm2_gpio32_set(cpm2_gc, pin_mask, value); spin_unlock_irqrestore(&cpm2_gc->lock, flags); @@ -156,16 +150,15 @@ static int cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value) static int cpm2_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); - struct cpm2_ioports __iomem *iop = mm_gc->regs; + struct cpm2_ioports __iomem *iop = cpm2_gc->regs; unsigned long flags; u32 pin_mask = 1 << (31 - gpio); spin_lock_irqsave(&cpm2_gc->lock, flags); setbits32(&iop->dir, pin_mask); - __cpm2_gpio32_set(mm_gc, pin_mask, val); + __cpm2_gpio32_set(cpm2_gc, pin_mask, val); spin_unlock_irqrestore(&cpm2_gc->lock, flags); @@ -174,9 +167,8 @@ static int cpm2_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static int cpm2_gpio32_dir_in(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); - struct cpm2_ioports __iomem *iop = mm_gc->regs; + struct cpm2_ioports 
__iomem *iop = cpm2_gc->regs; unsigned long flags; u32 pin_mask = 1 << (31 - gpio); @@ -193,19 +185,17 @@ int cpm2_gpiochip_add32(struct device *dev) { struct device_node *np = dev->of_node; struct cpm2_gpio32_chip *cpm2_gc; - struct of_mm_gpio_chip *mm_gc; struct gpio_chip *gc; - cpm2_gc = kzalloc(sizeof(*cpm2_gc), GFP_KERNEL); + cpm2_gc = devm_kzalloc(dev, sizeof(*cpm2_gc), GFP_KERNEL); if (!cpm2_gc) return -ENOMEM; spin_lock_init(&cpm2_gc->lock); - mm_gc = &cpm2_gc->mm_gc; - gc = &mm_gc->gc; + gc = &cpm2_gc->gc; - mm_gc->save_regs = cpm2_gpio32_save_regs; + gc->base = -1; gc->ngpio = 32; gc->direction_input = cpm2_gpio32_dir_in; gc->direction_output = cpm2_gpio32_dir_out; @@ -214,6 +204,16 @@ int cpm2_gpiochip_add32(struct device *dev) gc->parent = dev; gc->owner = THIS_MODULE; - return of_mm_gpiochip_add_data(np, mm_gc, cpm2_gc); + gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np); + if (!gc->label) + return -ENOMEM; + + cpm2_gc->regs = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(cpm2_gc->regs)) + return PTR_ERR(cpm2_gc->regs); + + cpm2_gpio32_save_regs(cpm2_gc); + + return devm_gpiochip_add_data(dev, gc, cpm2_gc); } #endif /* CONFIG_CPM2 || CONFIG_8xx_GPIO */ From 1044dbaf2a77c9e1143753a620d830943da6a193 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 18 Aug 2025 14:17:34 +0200 Subject: [PATCH 0946/3206] powerpc/44x: Change GPIO driver to a proper platform driver In order to drop legacy-of-mm-gpiochip dependency, first change the 44x GPIO driver to a proper platform driver. Signed-off-by: Christophe Leroy Reviewed-by: Linus Walleij Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/29d89aa43536714b193d9710301341f838fcb5b7.1755519343.git.christophe.leroy@csgroup.eu --- arch/powerpc/platforms/44x/gpio.c | 83 +++++++++++++++++-------------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/platforms/44x/gpio.c b/arch/powerpc/platforms/44x/gpio.c index 08ab7658256872..a025b3248342ac 100644 --- a/arch/powerpc/platforms/44x/gpio.c +++ b/arch/powerpc/platforms/44x/gpio.c @@ -18,6 +18,7 @@ #include #include #include +#include #define GPIO_MASK(gpio) (0x80000000 >> (gpio)) #define GPIO_MASK2(gpio) (0xc0000000 >> ((gpio) * 2)) @@ -155,42 +156,50 @@ ppc4xx_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) return 0; } -static int __init ppc4xx_add_gpiochips(void) +static int ppc4xx_gpio_probe(struct platform_device *ofdev) { - struct device_node *np; - - for_each_compatible_node(np, NULL, "ibm,ppc4xx-gpio") { - int ret; - struct ppc4xx_gpio_chip *ppc4xx_gc; - struct of_mm_gpio_chip *mm_gc; - struct gpio_chip *gc; - - ppc4xx_gc = kzalloc(sizeof(*ppc4xx_gc), GFP_KERNEL); - if (!ppc4xx_gc) { - ret = -ENOMEM; - goto err; - } - - spin_lock_init(&ppc4xx_gc->lock); - - mm_gc = &ppc4xx_gc->mm_gc; - gc = &mm_gc->gc; - - gc->ngpio = 32; - gc->direction_input = ppc4xx_gpio_dir_in; - gc->direction_output = ppc4xx_gpio_dir_out; - gc->get = ppc4xx_gpio_get; - gc->set = ppc4xx_gpio_set; - - ret = of_mm_gpiochip_add_data(np, mm_gc, ppc4xx_gc); - if (ret) - goto err; - continue; -err: - pr_err("%pOF: registration failed with status %d\n", np, ret); - kfree(ppc4xx_gc); - /* try others anyway */ - } - return 0; + struct device *dev = &ofdev->dev; + struct device_node *np = dev->of_node; + struct ppc4xx_gpio_chip *chip; + struct of_mm_gpio_chip *mm_gc; + struct gpio_chip *gc; + + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + spin_lock_init(&chip->lock); + + mm_gc = &chip->mm_gc; + gc = &mm_gc->gc; + + 
gc->ngpio = 32; + gc->direction_input = ppc4xx_gpio_dir_in; + gc->direction_output = ppc4xx_gpio_dir_out; + gc->get = ppc4xx_gpio_get; + gc->set = ppc4xx_gpio_set; + + return of_mm_gpiochip_add_data(np, mm_gc, chip); +} + +static const struct of_device_id ppc4xx_gpio_match[] = { + { + .compatible = "ibm,ppc4xx-gpio", + }, + {}, +}; +MODULE_DEVICE_TABLE(of, ppc4xx_gpio_match); + +static struct platform_driver ppc4xx_gpio_driver = { + .probe = ppc4xx_gpio_probe, + .driver = { + .name = "ppc4xx-gpio", + .of_match_table = ppc4xx_gpio_match, + }, +}; + +static int __init ppc4xx_gpio_init(void) +{ + return platform_driver_register(&ppc4xx_gpio_driver); } -arch_initcall(ppc4xx_add_gpiochips); +arch_initcall(ppc4xx_gpio_init); From d2ad26e7b67e5a3d73221daf8207e848dd819bd4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 18 Aug 2025 14:17:35 +0200 Subject: [PATCH 0947/3206] powerpc/44x: Drop legacy-of-mm-gpiochip.h header Remove legacy-of-mm-gpiochip.h header file. The above mentioned file provides an OF API that's deprecated. There is no agnostic alternatives to it and we have to open code the logic which was hidden behind of_mm_gpiochip_add_data(). Note, most of the GPIO drivers are using their own labeling schemas and resource retrieval that only a few may gain of the code deduplication, so whenever alternative is appear we can move drivers again to use that one. As a side effect this change fixes a potential memory leak on an error path, if of_mm_gpiochip_add_data() fails. [Text copied from commit 34064c8267a6 ("powerpc/8xx: Drop legacy-of-mm-gpiochip.h header")] Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/2f8f16eac72b9ec202b6e593939b44308891a661.1755519343.git.christophe.leroy@csgroup.eu --- arch/powerpc/platforms/44x/Kconfig | 1 - arch/powerpc/platforms/44x/gpio.c | 33 +++++++++++++++++------------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index 35a1f4b9f8272b..fc79f846693352 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -231,7 +231,6 @@ config PPC4xx_GPIO bool "PPC4xx GPIO support" depends on 44x select GPIOLIB - select OF_GPIO_MM_GPIOCHIP help Enable gpiolib support for ppc440 based boards diff --git a/arch/powerpc/platforms/44x/gpio.c b/arch/powerpc/platforms/44x/gpio.c index a025b3248342ac..aea0d913b59d0b 100644 --- a/arch/powerpc/platforms/44x/gpio.c +++ b/arch/powerpc/platforms/44x/gpio.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -46,7 +45,8 @@ struct ppc4xx_gpio { }; struct ppc4xx_gpio_chip { - struct of_mm_gpio_chip mm_gc; + struct gpio_chip gc; + void __iomem *regs; spinlock_t lock; }; @@ -58,8 +58,8 @@ struct ppc4xx_gpio_chip { static int ppc4xx_gpio_get(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct ppc4xx_gpio __iomem *regs = mm_gc->regs; + struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); + struct ppc4xx_gpio __iomem *regs = chip->regs; return !!(in_be32(®s->ir) & GPIO_MASK(gpio)); } @@ -67,8 +67,8 @@ static int ppc4xx_gpio_get(struct gpio_chip *gc, unsigned int gpio) static inline void __ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct ppc4xx_gpio __iomem *regs = mm_gc->regs; + struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); + struct ppc4xx_gpio __iomem *regs = chip->regs; if 
(val) setbits32(®s->or, GPIO_MASK(gpio)); @@ -94,9 +94,8 @@ static int ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int ppc4xx_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); - struct ppc4xx_gpio __iomem *regs = mm_gc->regs; + struct ppc4xx_gpio __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&chip->lock, flags); @@ -124,9 +123,8 @@ static int ppc4xx_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) static int ppc4xx_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); - struct ppc4xx_gpio __iomem *regs = mm_gc->regs; + struct ppc4xx_gpio __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&chip->lock, flags); @@ -161,7 +159,6 @@ static int ppc4xx_gpio_probe(struct platform_device *ofdev) struct device *dev = &ofdev->dev; struct device_node *np = dev->of_node; struct ppc4xx_gpio_chip *chip; - struct of_mm_gpio_chip *mm_gc; struct gpio_chip *gc; chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); @@ -170,16 +167,24 @@ static int ppc4xx_gpio_probe(struct platform_device *ofdev) spin_lock_init(&chip->lock); - mm_gc = &chip->mm_gc; - gc = &mm_gc->gc; + gc = &chip->gc; + gc->base = -1; gc->ngpio = 32; gc->direction_input = ppc4xx_gpio_dir_in; gc->direction_output = ppc4xx_gpio_dir_out; gc->get = ppc4xx_gpio_get; gc->set = ppc4xx_gpio_set; - return of_mm_gpiochip_add_data(np, mm_gc, chip); + gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np); + if (!gc->label) + return -ENOMEM; + + chip->regs = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(chip->regs)) + return PTR_ERR(chip->regs); + + return devm_gpiochip_add_data(dev, gc, chip); } static const struct of_device_id ppc4xx_gpio_match[] = { From e7a6475cc0c32213e87ec190d90b8f1440ff05ae Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 18 Aug 2025 14:42:24 +0200 Subject: [PATCH 0948/3206] gpio: mpc5200: Drop legacy-of-mm-gpiochip.h header Remove legacy-of-mm-gpiochip.h header file. The above mentioned file provides an OF API that's deprecated. There is no agnostic alternatives to it and we have to open code the logic which was hidden behind of_mm_gpiochip_add_data(). Note, most of the GPIO drivers are using their own labeling schemas and resource retrieval that only a few may gain of the code deduplication, so whenever alternative is appear we can move drivers again to use that one. 
[text copied from commit 34064c8267a6 ("powerpc/8xx: Drop legacy-of-mm-gpiochip.h header")] Signed-off-by: Christophe Leroy Reviewed-by: Linus Walleij Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/9652736ef05b94d9113ea5ce7899734ef82343d1.1755520794.git.christophe.leroy@csgroup.eu --- drivers/gpio/Kconfig | 1 - drivers/gpio/gpio-mpc5200.c | 78 ++++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index e43abb322fa6e1..07be52f6db7ad6 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -485,7 +485,6 @@ config GPIO_MM_LANTIQ config GPIO_MPC5200 def_bool y depends on PPC_MPC52xx - select OF_GPIO_MM_GPIOCHIP config GPIO_MPC8XXX bool "MPC512x/MPC8xxx/QorIQ GPIO support" diff --git a/drivers/gpio/gpio-mpc5200.c b/drivers/gpio/gpio-mpc5200.c index dad0eca1ca2ead..00f209157fd0f0 100644 --- a/drivers/gpio/gpio-mpc5200.c +++ b/drivers/gpio/gpio-mpc5200.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -19,7 +19,8 @@ static DEFINE_SPINLOCK(gpio_lock); struct mpc52xx_gpiochip { - struct of_mm_gpio_chip mmchip; + struct gpio_chip gc; + void __iomem *regs; unsigned int shadow_dvo; unsigned int shadow_gpioe; unsigned int shadow_ddr; @@ -43,8 +44,8 @@ struct mpc52xx_gpiochip { */ static int mpc52xx_wkup_gpio_get(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs; + struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); + struct mpc52xx_gpio_wkup __iomem *regs = chip->regs; unsigned int ret; ret = (in_8(®s->wkup_ival) >> (7 - gpio)) & 1; @@ -57,9 +58,8 @@ static int mpc52xx_wkup_gpio_get(struct gpio_chip *gc, unsigned int gpio) static inline void __mpc52xx_wkup_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); - struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs; + struct mpc52xx_gpio_wkup __iomem *regs = chip->regs; if (val) chip->shadow_dvo |= 1 << (7 - gpio); @@ -87,9 +87,8 @@ mpc52xx_wkup_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int mpc52xx_wkup_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); - struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs; + struct mpc52xx_gpio_wkup __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&gpio_lock, flags); @@ -110,9 +109,8 @@ static int mpc52xx_wkup_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) static int mpc52xx_wkup_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs; struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); + struct mpc52xx_gpio_wkup __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&gpio_lock, flags); @@ -136,30 +134,41 @@ mpc52xx_wkup_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static int mpc52xx_wkup_gpiochip_probe(struct platform_device *ofdev) { + struct device *dev = &ofdev->dev; + struct device_node *np = dev->of_node; struct mpc52xx_gpiochip *chip; struct mpc52xx_gpio_wkup __iomem *regs; struct gpio_chip *gc; int ret; - chip = devm_kzalloc(&ofdev->dev, sizeof(*chip), GFP_KERNEL); + chip = devm_kzalloc(dev, sizeof(*chip), 
GFP_KERNEL); if (!chip) return -ENOMEM; platform_set_drvdata(ofdev, chip); - gc = &chip->mmchip.gc; + gc = &chip->gc; + gc->base = -1; gc->ngpio = 8; gc->direction_input = mpc52xx_wkup_gpio_dir_in; gc->direction_output = mpc52xx_wkup_gpio_dir_out; gc->get = mpc52xx_wkup_gpio_get; gc->set = mpc52xx_wkup_gpio_set; - ret = of_mm_gpiochip_add_data(ofdev->dev.of_node, &chip->mmchip, chip); + gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np); + if (!gc->label) + return -ENOMEM; + + chip->regs = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(chip->regs)) + return PTR_ERR(chip->regs); + + ret = devm_gpiochip_add_data(dev, gc, chip); if (ret) return ret; - regs = chip->mmchip.regs; + regs = chip->regs; chip->shadow_gpioe = in_8(®s->wkup_gpioe); chip->shadow_ddr = in_8(®s->wkup_ddr); chip->shadow_dvo = in_8(®s->wkup_dvo); @@ -167,13 +176,6 @@ static int mpc52xx_wkup_gpiochip_probe(struct platform_device *ofdev) return 0; } -static void mpc52xx_gpiochip_remove(struct platform_device *ofdev) -{ - struct mpc52xx_gpiochip *chip = platform_get_drvdata(ofdev); - - of_mm_gpiochip_remove(&chip->mmchip); -} - static const struct of_device_id mpc52xx_wkup_gpiochip_match[] = { { .compatible = "fsl,mpc5200-gpio-wkup", }, {} @@ -185,7 +187,6 @@ static struct platform_driver mpc52xx_wkup_gpiochip_driver = { .of_match_table = mpc52xx_wkup_gpiochip_match, }, .probe = mpc52xx_wkup_gpiochip_probe, - .remove = mpc52xx_gpiochip_remove, }; /* @@ -207,8 +208,8 @@ static struct platform_driver mpc52xx_wkup_gpiochip_driver = { */ static int mpc52xx_simple_gpio_get(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct mpc52xx_gpio __iomem *regs = mm_gc->regs; + struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); + struct mpc52xx_gpio __iomem *regs = chip->regs; unsigned int ret; ret = (in_be32(®s->simple_ival) >> (31 - gpio)) & 1; @@ -219,9 +220,8 @@ static int mpc52xx_simple_gpio_get(struct gpio_chip *gc, unsigned int gpio) static inline void __mpc52xx_simple_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); - struct mpc52xx_gpio __iomem *regs = mm_gc->regs; + struct mpc52xx_gpio __iomem *regs = chip->regs; if (val) chip->shadow_dvo |= 1 << (31 - gpio); @@ -248,9 +248,8 @@ mpc52xx_simple_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int mpc52xx_simple_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); - struct mpc52xx_gpio __iomem *regs = mm_gc->regs; + struct mpc52xx_gpio __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&gpio_lock, flags); @@ -271,9 +270,8 @@ static int mpc52xx_simple_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) static int mpc52xx_simple_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); - struct mpc52xx_gpio __iomem *regs = mm_gc->regs; + struct mpc52xx_gpio __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&gpio_lock, flags); @@ -298,30 +296,41 @@ mpc52xx_simple_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static int mpc52xx_simple_gpiochip_probe(struct platform_device *ofdev) { + struct device *dev = &ofdev->dev; + struct device_node *np = dev->of_node; struct mpc52xx_gpiochip *chip; struct 
gpio_chip *gc; struct mpc52xx_gpio __iomem *regs; int ret; - chip = devm_kzalloc(&ofdev->dev, sizeof(*chip), GFP_KERNEL); + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; platform_set_drvdata(ofdev, chip); - gc = &chip->mmchip.gc; + gc = &chip->gc; + gc->base = -1; gc->ngpio = 32; gc->direction_input = mpc52xx_simple_gpio_dir_in; gc->direction_output = mpc52xx_simple_gpio_dir_out; gc->get = mpc52xx_simple_gpio_get; gc->set = mpc52xx_simple_gpio_set; - ret = of_mm_gpiochip_add_data(ofdev->dev.of_node, &chip->mmchip, chip); + gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np); + if (!gc->label) + return -ENOMEM; + + chip->regs = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(chip->regs)) + return PTR_ERR(chip->regs); + + ret = devm_gpiochip_add_data(dev, gc, chip); if (ret) return ret; - regs = chip->mmchip.regs; + regs = chip->regs; chip->shadow_gpioe = in_be32(®s->simple_gpioe); chip->shadow_ddr = in_be32(®s->simple_ddr); chip->shadow_dvo = in_be32(®s->simple_dvo); @@ -340,7 +349,6 @@ static struct platform_driver mpc52xx_simple_gpiochip_driver = { .of_match_table = mpc52xx_simple_gpiochip_match, }, .probe = mpc52xx_simple_gpiochip_probe, - .remove = mpc52xx_gpiochip_remove, }; static struct platform_driver * const drivers[] = { From 6dc5d0770dc99d6bf9b786fa3c8fc04be2091648 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 19 Aug 2025 18:58:29 +0200 Subject: [PATCH 0949/3206] powerpc/ps3: Use str_write_read() in ps3stor_read_write_sectors() Remove hard-coded strings by using the str_write_read() helper function and silence the following Coccinelle/coccicheck warning reported by string_choices.cocci: opportunity for str_write_read(write) Signed-off-by: Thorsten Blum Reviewed-by: Geert Uytterhoeven Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250819165829.714344-2-thorsten.blum@linux.dev --- drivers/ps3/ps3stor_lib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ps3/ps3stor_lib.c b/drivers/ps3/ps3stor_lib.c index a12a1ad9b5fe35..3d4d343ee0c8fb 100644 --- a/drivers/ps3/ps3stor_lib.c +++ b/drivers/ps3/ps3stor_lib.c @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -265,7 +266,7 @@ u64 ps3stor_read_write_sectors(struct ps3_storage_device *dev, u64 lpar, u64 start_sector, u64 sectors, int write) { unsigned int region_id = dev->regions[dev->region_idx].id; - const char *op = write ? "write" : "read"; + const char *op = str_write_read(write); int res; dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n", From b034baff11d050dd314d5f2645eb52988a93e4a1 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Wed, 13 Aug 2025 17:53:18 +0530 Subject: [PATCH 0950/3206] powerpc: Remove duplicate definition for ppc_msgsnd_sync() Remove duplicate definition of ppc_msgsnd_sync() introduced in commit b87ac0218355 ("powerpc: Introduce msgsnd/doorbell barrier primitives"). No functional change intended. 
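For context, the surviving definition pairs with ppc_msgsnd() when raising a
doorbell, along the lines of doorbell_global_ipi() in
arch/powerpc/kernel/dbell.c (the wrapper name below is made up for
illustration; only the call sequence matters):

	static inline void send_doorbell_ipi(u32 tag)
	{
		/* order prior stores before the msgsnd */
		ppc_msgsnd_sync();
		ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
	}
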
Signed-off-by: Gautam Menghani Reviewed-by: Ritesh Harjani (IBM) [maddy: Updated commit message to fixed checkpatch.pl warning] Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250813122319.62278-1-gautam@linux.ibm.com --- arch/powerpc/include/asm/dbell.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h index 3e9da22a277962..0b9ef726f92c94 100644 --- a/arch/powerpc/include/asm/dbell.h +++ b/arch/powerpc/include/asm/dbell.h @@ -40,12 +40,6 @@ static inline void _ppc_msgsnd(u32 msg) : : "i" (CPU_FTR_HVMODE), "r" (msg)); } -/* sync before sending message */ -static inline void ppc_msgsnd_sync(void) -{ - __asm__ __volatile__ ("sync" : : : "memory"); -} - /* sync after taking message interrupt */ static inline void ppc_msgsync(void) { @@ -76,12 +70,6 @@ static inline void _ppc_msgsnd(u32 msg) __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); } -/* sync before sending message */ -static inline void ppc_msgsnd_sync(void) -{ - __asm__ __volatile__ ("sync" : : : "memory"); -} - /* sync after taking message interrupt */ static inline void ppc_msgsync(void) { @@ -91,6 +79,12 @@ static inline void ppc_msgsync(void) extern void doorbell_exception(struct pt_regs *regs); +/* sync before sending message */ +static inline void ppc_msgsnd_sync(void) +{ + __asm__ __volatile__ ("sync" : : : "memory"); +} + static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag) { u32 msg = PPC_DBELL_TYPE(type) | (flags & PPC_DBELL_MSG_BRDCAST) | From 1ad926459970444af1140f9b393f416536e1a828 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Aug 2025 12:56:41 +0200 Subject: [PATCH 0951/3206] driver core: faux: Set power.no_pm for faux devices Since faux devices are not supposed to be involved in any kind of power management, set the no_pm flag for all of them. Signed-off-by: Rafael J. Wysocki Cc: stable Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/6206518.lOV4Wx5bFT@rafael.j.wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/base/faux.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/base/faux.c b/drivers/base/faux.c index f5fbda0a9a44bd..21dd02124231a9 100644 --- a/drivers/base/faux.c +++ b/drivers/base/faux.c @@ -155,6 +155,7 @@ struct faux_device *faux_device_create_with_groups(const char *name, dev->parent = &faux_bus_root; dev->bus = &faux_bus_type; dev_set_name(dev, "%s", name); + device_set_pm_not_required(dev); ret = device_add(dev); if (ret) { From c2ce2453413d429e302659abc5ace634e873f6f5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Aug 2025 12:59:24 +0200 Subject: [PATCH 0952/3206] driver core/PM: Set power.no_callbacks along with power.no_pm Devices with power.no_pm set are not expected to need any power management at all, so modify device_set_pm_not_required() to set power.no_callbacks for them too in case runtime PM will be enabled for any of them (which in principle may be done for convenience if such a device participates in a dependency chain). Since device_set_pm_not_required() must be called before device_add() or it would not have any effect, it can update power.no_callbacks without locking, unlike pm_runtime_no_callbacks() that can be called after registering the target device. Signed-off-by: Rafael J. 
Wysocki Cc: stable Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/1950054.tdWV9SEqCh@rafael.j.wysocki Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/device.h b/include/linux/device.h index 0470d19da7f2ca..b031ff71a5bdfe 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -851,6 +851,9 @@ static inline bool device_pm_not_required(struct device *dev) static inline void device_set_pm_not_required(struct device *dev) { dev->power.no_pm = true; +#ifdef CONFIG_PM + dev->power.no_callbacks = true; +#endif } static inline void dev_pm_syscore_device(struct device *dev, bool val) From cc0cc23babc979e399f34f53e4bccf702a389558 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 11 Aug 2025 11:28:54 +0200 Subject: [PATCH 0953/3206] powerpc/xive: Untangle xive from child interrupt controller drivers xive-specific data is stored in handler_data. This creates a mess, as xive has to rely on child interrupt controller drivers to clean up this data, as was done by 9a014f45688 ("powerpc/pseries/pci: Add a msi_free() handler to clear XIVE data"). Instead, store xive-specific data in chip_data and untangle the child drivers. Signed-off-by: Nam Cao Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/83968073022a4cc211dcbd0faccd20ec05e58c3e.1754903590.git.namcao@linutronix.de --- arch/powerpc/include/asm/xive.h | 1 - arch/powerpc/platforms/powernv/pci-ioda.c | 21 +------- arch/powerpc/platforms/pseries/msi.c | 18 +------ arch/powerpc/sysdev/xive/common.c | 63 +++++++++++------------ 4 files changed, 33 insertions(+), 70 deletions(-) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 92930b0b5d0e17..efb0f5effcc694 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -111,7 +111,6 @@ void xive_native_free_vp_block(u32 vp_base); int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data); void xive_cleanup_irq_data(struct xive_irq_data *xd); -void xive_irq_free_data(unsigned int virq); void xive_native_free_irq(u32 irq); int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 0166bf39ce1e52..1c730e6b7283eb 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -37,7 +37,6 @@ #include #include #include -#include #include "powernv.h" #include "pci.h" @@ -1707,23 +1706,6 @@ static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, return 0; } -/* - * The msi_free() op is called before irq_domain_free_irqs_top() when - * the handler data is still available. Use that to clear the XIVE - * controller. 
- */ -static void pnv_msi_ops_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, - unsigned int irq) -{ - if (xive_enabled()) - xive_irq_free_data(irq); -} - -static struct msi_domain_ops pnv_pci_msi_domain_ops = { - .msi_free = pnv_msi_ops_msi_free, -}; - static void pnv_msi_shutdown(struct irq_data *d) { d = d->parent_data; @@ -1754,7 +1736,6 @@ static struct irq_chip pnv_pci_msi_irq_chip = { static struct msi_domain_info pnv_msi_domain_info = { .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), - .ops = &pnv_pci_msi_domain_ops, .chip = &pnv_pci_msi_irq_chip, }; @@ -1870,7 +1851,7 @@ static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq, virq, d->hwirq, nr_irqs); msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs); - /* XIVE domain is cleared through ->msi_free() */ + irq_domain_free_irqs_parent(domain, virq, nr_irqs); } static const struct irq_domain_ops pnv_irq_domain_ops = { diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 9dc294de631f1d..e5cf061c519ee5 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "pseries.h" @@ -436,19 +435,6 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev return rtas_prepare_msi_irqs(pdev, nvec, type, arg); } -/* - * ->msi_free() is called before irq_domain_free_irqs_top() when the - * handler data is still available. Use that to clear the XIVE - * controller data. - */ -static void pseries_msi_ops_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, - unsigned int irq) -{ - if (xive_enabled()) - xive_irq_free_data(irq); -} - /* * RTAS can not disable one MSI at a time. It's all or nothing. Do it * at the end after all IRQs have been freed. 
@@ -463,7 +449,6 @@ static void pseries_msi_post_free(struct irq_domain *domain, struct device *dev) static struct msi_domain_ops pseries_pci_msi_domain_ops = { .msi_prepare = pseries_msi_ops_prepare, - .msi_free = pseries_msi_ops_msi_free, .msi_post_free = pseries_msi_post_free, }; @@ -604,8 +589,7 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq struct pci_controller *phb = irq_data_get_irq_chip_data(d); pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs); - - /* XIVE domain data is cleared through ->msi_free() */ + irq_domain_free_irqs_parent(domain, virq, nr_irqs); } static const struct irq_domain_ops pseries_irq_domain_ops = { diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index f1059240502479..625361a15424e5 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -317,7 +317,7 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d) if (d) { char buffer[128]; - xive_irq_data_dump(irq_data_get_irq_handler_data(d), + xive_irq_data_dump(irq_data_get_irq_chip_data(d), buffer, sizeof(buffer)); xmon_printf("%s", buffer); } @@ -437,7 +437,7 @@ static void xive_do_source_eoi(struct xive_irq_data *xd) /* irq_chip eoi callback, called with irq descriptor lock held */ static void xive_irq_eoi(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); struct xive_cpu *xc = __this_cpu_read(xive_cpu); DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", @@ -595,7 +595,7 @@ static int xive_pick_irq_target(struct irq_data *d, const struct cpumask *affinity) { static unsigned int fuzz; - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); cpumask_var_t mask; int cpu = -1; @@ -628,7 +628,7 @@ static int xive_pick_irq_target(struct irq_data *d, static unsigned int xive_irq_startup(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); int target, rc; @@ -673,7 +673,7 @@ static unsigned int xive_irq_startup(struct irq_data *d) /* called with irq descriptor lock held */ static void xive_irq_shutdown(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); pr_debug("%s: irq %d [0x%x] data @%p\n", __func__, d->irq, hw_irq, d); @@ -698,7 +698,7 @@ static void xive_irq_shutdown(struct irq_data *d) static void xive_irq_unmask(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd); @@ -707,7 +707,7 @@ static void xive_irq_unmask(struct irq_data *d) static void xive_irq_mask(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd); @@ -718,7 +718,7 @@ static int xive_irq_set_affinity(struct irq_data *d, const struct cpumask *cpumask, bool force) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); u32 target, old_target; int rc = 0; @@ -776,7 
+776,7 @@ static int xive_irq_set_affinity(struct irq_data *d, static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); /* * We only support these. This has really no effect other than setting @@ -815,7 +815,7 @@ static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type) static int xive_irq_retrigger(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); /* This should be only for MSIs */ if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) @@ -837,7 +837,7 @@ static int xive_irq_retrigger(struct irq_data *d) */ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); int rc; u8 pq; @@ -951,7 +951,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) static int xive_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(data); u8 pq; switch (which) { @@ -1011,21 +1011,20 @@ void xive_cleanup_irq_data(struct xive_irq_data *xd) } EXPORT_SYMBOL_GPL(xive_cleanup_irq_data); -static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) +static struct xive_irq_data *xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) { struct xive_irq_data *xd; int rc; xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL); if (!xd) - return -ENOMEM; + return ERR_PTR(-ENOMEM); rc = xive_ops->populate_irq_data(hw, xd); if (rc) { kfree(xd); - return rc; + return ERR_PTR(rc); } xd->target = XIVE_INVALID_TARGET; - irq_set_handler_data(virq, xd); /* * Turn OFF by default the interrupt being mapped. 
A side @@ -1036,20 +1035,19 @@ static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) */ xive_esb_read(xd, XIVE_ESB_SET_PQ_01); - return 0; + return xd; } -void xive_irq_free_data(unsigned int virq) +static void xive_irq_free_data(unsigned int virq) { - struct xive_irq_data *xd = irq_get_handler_data(virq); + struct xive_irq_data *xd = irq_get_chip_data(virq); if (!xd) return; - irq_set_handler_data(virq, NULL); + irq_set_chip_data(virq, NULL); xive_cleanup_irq_data(xd); kfree(xd); } -EXPORT_SYMBOL_GPL(xive_irq_free_data); #ifdef CONFIG_SMP @@ -1286,7 +1284,7 @@ void __init xive_smp_probe(void) static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw) { - int rc; + struct xive_irq_data *xd; /* * Mark interrupts as edge sensitive by default so that resend @@ -1294,11 +1292,12 @@ static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, */ irq_clear_status_flags(virq, IRQ_LEVEL); - rc = xive_irq_alloc_data(virq, hw); - if (rc) - return rc; + xd = xive_irq_alloc_data(virq, hw); + if (IS_ERR(xd)) + return PTR_ERR(xd); irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq); + irq_set_chip_data(virq, xd); return 0; } @@ -1366,7 +1365,7 @@ static void xive_irq_domain_debug_show(struct seq_file *m, struct irq_domain *d, seq_printf(m, "%*sXIVE:\n", ind, ""); ind++; - xd = irq_data_get_irq_handler_data(irqd); + xd = irq_data_get_irq_chip_data(irqd); if (!xd) { seq_printf(m, "%*snot assigned\n", ind, ""); return; @@ -1403,6 +1402,7 @@ static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { struct irq_fwspec *fwspec = arg; + struct xive_irq_data *xd; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int i, rc; @@ -1423,12 +1423,11 @@ static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, irq_clear_status_flags(virq, IRQ_LEVEL); /* allocates and sets handler data */ - rc = xive_irq_alloc_data(virq + i, hwirq + i); - if (rc) - return rc; + xd = xive_irq_alloc_data(virq + i, hwirq + i); + if (IS_ERR(xd)) + return PTR_ERR(xd); - irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, - &xive_irq_chip, domain->host_data); + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, &xive_irq_chip, xd); irq_set_handler(virq + i, handle_fasteoi_irq); } @@ -1764,7 +1763,7 @@ static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d) seq_printf(m, "IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ", hw_irq, target, prio, lirq); - xive_irq_data_dump(irq_data_get_irq_handler_data(d), buffer, sizeof(buffer)); + xive_irq_data_dump(irq_data_get_irq_chip_data(d), buffer, sizeof(buffer)); seq_puts(m, buffer); seq_puts(m, "\n"); } From f0ac60e6e311062f1a452d93376055787db4b070 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 11 Aug 2025 11:28:55 +0200 Subject: [PATCH 0954/3206] powerpc/powernv/pci: Switch to use msi_create_parent_irq_domain() Move away from the legacy MSI domain setup, switch to use msi_create_parent_irq_domain(). 
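The general shape of such a conversion, as a minimal sketch: the old hierarchy domain plus separate PCI/MSI domain pair collapses into a single parent domain. The my_* names below are hypothetical placeholders, not the actual driver code (that follows in the diff).

	static int my_create_msi_domain(struct pci_controller *hose, unsigned int count)
	{
		struct irq_domain_info info = {
			.fwnode    = of_fwnode_handle(hose->dn),
			.ops       = &my_irq_domain_ops,
			.host_data = hose,
			.size      = count,
			.parent    = irq_get_default_domain(),
		};

		/*
		 * One call replaces the irq_domain_create_hierarchy() +
		 * pci_msi_create_irq_domain() pair; per-device MSI domains
		 * are then instantiated on demand by the MSI core.
		 */
		hose->dev_domain = msi_create_parent_irq_domain(&info, &my_msi_parent_ops);
		return hose->dev_domain ? 0 : -ENOMEM;
	}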
Signed-off-by: Nam Cao Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/a4c2c363ac7b94fffc79d5b92086135be4c57e06.1754903590.git.namcao@linutronix.de --- arch/powerpc/platforms/powernv/Kconfig | 1 + arch/powerpc/platforms/powernv/pci-ioda.c | 75 ++++++++++------------- 2 files changed, 35 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 95d7ba73d43d0d..b5ad7c173ef0c1 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -9,6 +9,7 @@ config PPC_POWERNV select PPC_P7_NAP select FORCE_PCI select PCI_MSI + select IRQ_MSI_LIB select EPAPR_BOOT select PPC_INDIRECT_PIO select PPC_UDBG_16550 diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 1c730e6b7283eb..b0c1d9d16fb52c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1713,30 +1714,33 @@ static void pnv_msi_shutdown(struct irq_data *d) d->chip->irq_shutdown(d); } -static void pnv_msi_mask(struct irq_data *d) +static bool pnv_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) { - pci_msi_mask_irq(d); - irq_chip_mask_parent(d); -} + struct irq_chip *chip = info->chip; -static void pnv_msi_unmask(struct irq_data *d) -{ - pci_msi_unmask_irq(d); - irq_chip_unmask_parent(d); -} + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; -static struct irq_chip pnv_pci_msi_irq_chip = { - .name = "PNV-PCI-MSI", - .irq_shutdown = pnv_msi_shutdown, - .irq_mask = pnv_msi_mask, - .irq_unmask = pnv_msi_unmask, - .irq_eoi = irq_chip_eoi_parent, -}; + chip->irq_shutdown = pnv_msi_shutdown; + return true; +} -static struct msi_domain_info pnv_msi_domain_info = { - .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), - .chip = &pnv_pci_msi_irq_chip, +#define PNV_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT) +#define PNV_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ + MSI_FLAG_PCI_MSIX | \ + MSI_FLAG_MULTI_PCI_MSI) + +static const struct msi_parent_ops pnv_msi_parent_ops = { + .required_flags = PNV_PCI_MSI_FLAGS_REQUIRED, + .supported_flags = PNV_PCI_MSI_FLAGS_SUPPORTED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, + .bus_select_token = DOMAIN_BUS_NEXUS, + .bus_select_mask = MATCH_PCI_MSI, + .prefix = "PNV-", + .init_dev_msi_info = pnv_init_dev_msi_info, }; static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) @@ -1855,37 +1859,26 @@ static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq, } static const struct irq_domain_ops pnv_irq_domain_ops = { + .select = msi_lib_irq_domain_select, .alloc = pnv_irq_domain_alloc, .free = pnv_irq_domain_free, }; static int __init pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count) { - struct pnv_phb *phb = hose->private_data; struct irq_domain *parent = irq_get_default_domain(); - - hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id); - if (!hose->fwnode) - return -ENOMEM; - - hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count, - hose->fwnode, - &pnv_irq_domain_ops, hose); + struct irq_domain_info info = { + .fwnode = of_fwnode_handle(hose->dn), + .ops = &pnv_irq_domain_ops, + 
.host_data = hose, + .size = count, + .parent = parent, + }; + + hose->dev_domain = msi_create_parent_irq_domain(&info, &pnv_msi_parent_ops); if (!hose->dev_domain) { - pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n", - hose->dn, hose->global_number); - irq_domain_free_fwnode(hose->fwnode); - return -ENOMEM; - } - - hose->msi_domain = pci_msi_create_irq_domain(of_fwnode_handle(hose->dn), - &pnv_msi_domain_info, - hose->dev_domain); - if (!hose->msi_domain) { pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", hose->dn, hose->global_number); - irq_domain_free_fwnode(hose->fwnode); - irq_domain_remove(hose->dev_domain); return -ENOMEM; } From daaa574aba6f9c683408b58a7ab2dc775ece2f98 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 11 Aug 2025 11:28:56 +0200 Subject: [PATCH 0955/3206] powerpc/pseries/msi: Switch to msi_create_parent_irq_domain() Move away from the legacy MSI domain setup, switch to use msi_create_parent_irq_domain(). Signed-off-by: Nam Cao Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/c7a6d8f27fd217021dea4daad777e81a525ae460.1754903590.git.namcao@linutronix.de --- arch/powerpc/include/asm/pci-bridge.h | 2 - arch/powerpc/platforms/pseries/Kconfig | 1 + arch/powerpc/platforms/pseries/msi.c | 114 +++++++++++-------------- 3 files changed, 49 insertions(+), 68 deletions(-) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 2aa3a091ef20ea..1dae53130782a0 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -133,8 +133,6 @@ struct pci_controller { /* IRQ domain hierarchy */ struct irq_domain *dev_domain; - struct irq_domain *msi_domain; - struct fwnode_handle *fwnode; /* iommu_ops support */ struct iommu_device iommu; diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index fa3c2fff082a87..3e042218d6cd8c 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -7,6 +7,7 @@ config PPC_PSERIES select OF_DYNAMIC select FORCE_PCI select PCI_MSI + select IRQ_MSI_LIB select GENERIC_ALLOCATOR select PPC_XICS select PPC_XIVE_SPAPR diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index e5cf061c519ee5..825f9432e03d7d 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -429,8 +430,9 @@ static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int nvec_in, int type, static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { + struct msi_domain_info *info = domain->host_data; struct pci_dev *pdev = to_pci_dev(dev); - int type = pdev->msix_enabled ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; + int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; return rtas_prepare_msi_irqs(pdev, nvec, type, arg); } @@ -439,19 +441,14 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev * RTAS can not disable one MSI at a time. It's all or nothing. Do it * at the end after all IRQs have been freed. 
*/ -static void pseries_msi_post_free(struct irq_domain *domain, struct device *dev) +static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg) { - if (WARN_ON_ONCE(!dev_is_pci(dev))) - return; + struct msi_desc *desc = arg->desc; + struct pci_dev *pdev = msi_desc_to_pci_dev(desc); - rtas_disable_msi(to_pci_dev(dev)); + rtas_disable_msi(pdev); } -static struct msi_domain_ops pseries_pci_msi_domain_ops = { - .msi_prepare = pseries_msi_ops_prepare, - .msi_post_free = pseries_msi_post_free, -}; - static void pseries_msi_shutdown(struct irq_data *d) { d = d->parent_data; @@ -459,18 +456,6 @@ static void pseries_msi_shutdown(struct irq_data *d) d->chip->irq_shutdown(d); } -static void pseries_msi_mask(struct irq_data *d) -{ - pci_msi_mask_irq(d); - irq_chip_mask_parent(d); -} - -static void pseries_msi_unmask(struct irq_data *d) -{ - pci_msi_unmask_irq(d); - irq_chip_unmask_parent(d); -} - static void pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { struct msi_desc *entry = irq_data_get_msi_desc(data); @@ -485,27 +470,39 @@ static void pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg) entry->msg = *msg; } -static struct irq_chip pseries_pci_msi_irq_chip = { - .name = "pSeries-PCI-MSI", - .irq_shutdown = pseries_msi_shutdown, - .irq_mask = pseries_msi_mask, - .irq_unmask = pseries_msi_unmask, - .irq_eoi = irq_chip_eoi_parent, - .irq_write_msi_msg = pseries_msi_write_msg, -}; +static bool pseries_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) +{ + struct irq_chip *chip = info->chip; + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; -/* - * Set MSI_FLAG_MSIX_CONTIGUOUS as there is no way to express to - * firmware to request a discontiguous or non-zero based range of - * MSI-X entries. Core code will reject such setup attempts. 
- */ -static struct msi_domain_info pseries_msi_domain_info = { - .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX | - MSI_FLAG_MSIX_CONTIGUOUS), - .ops = &pseries_pci_msi_domain_ops, - .chip = &pseries_pci_msi_irq_chip, + chip->irq_shutdown = pseries_msi_shutdown; + chip->irq_write_msi_msg = pseries_msi_write_msg; + + info->ops->msi_prepare = pseries_msi_ops_prepare; + info->ops->msi_teardown = pseries_msi_ops_teardown; + + return true; +} + +#define PSERIES_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT) +#define PSERIES_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ + MSI_FLAG_PCI_MSIX | \ + MSI_FLAG_MSIX_CONTIGUOUS | \ + MSI_FLAG_MULTI_PCI_MSI) + +static const struct msi_parent_ops pseries_msi_parent_ops = { + .required_flags = PSERIES_PCI_MSI_FLAGS_REQUIRED, + .supported_flags = PSERIES_PCI_MSI_FLAGS_SUPPORTED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, + .bus_select_token = DOMAIN_BUS_NEXUS, + .bus_select_mask = MATCH_PCI_MSI, + .prefix = "pSeries-", + .init_dev_msi_info = pseries_init_dev_msi_info, }; static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) @@ -593,6 +590,7 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq } static const struct irq_domain_ops pseries_irq_domain_ops = { + .select = msi_lib_irq_domain_select, .alloc = pseries_irq_domain_alloc, .free = pseries_irq_domain_free, }; @@ -601,30 +599,18 @@ static int __pseries_msi_allocate_domains(struct pci_controller *phb, unsigned int count) { struct irq_domain *parent = irq_get_default_domain(); - - phb->fwnode = irq_domain_alloc_named_id_fwnode("pSeries-MSI", - phb->global_number); - if (!phb->fwnode) - return -ENOMEM; - - phb->dev_domain = irq_domain_create_hierarchy(parent, 0, count, - phb->fwnode, - &pseries_irq_domain_ops, phb); + struct irq_domain_info info = { + .fwnode = of_fwnode_handle(phb->dn), + .ops = &pseries_irq_domain_ops, + .host_data = phb, + .size = count, + .parent = parent, + }; + + phb->dev_domain = msi_create_parent_irq_domain(&info, &pseries_msi_parent_ops); if (!phb->dev_domain) { - pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n", - phb->dn, phb->global_number); - irq_domain_free_fwnode(phb->fwnode); - return -ENOMEM; - } - - phb->msi_domain = pci_msi_create_irq_domain(of_fwnode_handle(phb->dn), - &pseries_msi_domain_info, - phb->dev_domain); - if (!phb->msi_domain) { pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", phb->dn, phb->global_number); - irq_domain_free_fwnode(phb->fwnode); - irq_domain_remove(phb->dev_domain); return -ENOMEM; } @@ -646,12 +632,8 @@ int pseries_msi_allocate_domains(struct pci_controller *phb) void pseries_msi_free_domains(struct pci_controller *phb) { - if (phb->msi_domain) - irq_domain_remove(phb->msi_domain); if (phb->dev_domain) irq_domain_remove(phb->dev_domain); - if (phb->fwnode) - irq_domain_free_fwnode(phb->fwnode); } static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) From 220a0ffde02f962c13bc752b01aa570b8c65a37b Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 2 Sep 2025 13:53:04 +0300 Subject: [PATCH 0956/3206] xhci: dbc: decouple endpoint allocation from initialization Decouple allocation of the endpoint ring buffer from initialization of the buffer, and initialization of the endpoint context parts from the rest of the contexts.
It allows the driver to clean up and reinitialize endpoint rings after disconnect without reallocating everything. This is a prerequisite for the next patch that prevents the transfer ring from filling up with cancelled (no-op) TRBs if a debug cable is reconnected several times without transferring anything. Cc: stable@vger.kernel.org Fixes: dfba2174dc42 ("usb: xhci: Add DbC support in xHCI driver") Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250902105306.877476-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-dbgcap.c | 71 ++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/drivers/usb/host/xhci-dbgcap.c b/drivers/usb/host/xhci-dbgcap.c index 06a2edb9e86ef7..d0faff233e3e33 100644 --- a/drivers/usb/host/xhci-dbgcap.c +++ b/drivers/usb/host/xhci-dbgcap.c @@ -101,13 +101,34 @@ static u32 xhci_dbc_populate_strings(struct dbc_str_descs *strings) return string_length; } +static void xhci_dbc_init_ep_contexts(struct xhci_dbc *dbc) +{ + struct xhci_ep_ctx *ep_ctx; + unsigned int max_burst; + dma_addr_t deq; + + max_burst = DBC_CTRL_MAXBURST(readl(&dbc->regs->control)); + + /* Populate bulk out endpoint context: */ + ep_ctx = dbc_bulkout_ctx(dbc); + deq = dbc_bulkout_enq(dbc); + ep_ctx->ep_info = 0; + ep_ctx->ep_info2 = dbc_epctx_info2(BULK_OUT_EP, 1024, max_burst); + ep_ctx->deq = cpu_to_le64(deq | dbc->ring_out->cycle_state); + + /* Populate bulk in endpoint context: */ + ep_ctx = dbc_bulkin_ctx(dbc); + deq = dbc_bulkin_enq(dbc); + ep_ctx->ep_info = 0; + ep_ctx->ep_info2 = dbc_epctx_info2(BULK_IN_EP, 1024, max_burst); + ep_ctx->deq = cpu_to_le64(deq | dbc->ring_in->cycle_state); +} + static void xhci_dbc_init_contexts(struct xhci_dbc *dbc, u32 string_length) { struct dbc_info_context *info; - struct xhci_ep_ctx *ep_ctx; u32 dev_info; - dma_addr_t deq, dma; - unsigned int max_burst; + dma_addr_t dma; if (!dbc) return; @@ -121,20 +142,8 @@ static void xhci_dbc_init_contexts(struct xhci_dbc *dbc, u32 string_length) info->serial = cpu_to_le64(dma + DBC_MAX_STRING_LENGTH * 3); info->length = cpu_to_le32(string_length); - /* Populate bulk out endpoint context: */ - ep_ctx = dbc_bulkout_ctx(dbc); - max_burst = DBC_CTRL_MAXBURST(readl(&dbc->regs->control)); - deq = dbc_bulkout_enq(dbc); - ep_ctx->ep_info = 0; - ep_ctx->ep_info2 = dbc_epctx_info2(BULK_OUT_EP, 1024, max_burst); - ep_ctx->deq = cpu_to_le64(deq | dbc->ring_out->cycle_state); - - /* Populate bulk in endpoint context: */ - ep_ctx = dbc_bulkin_ctx(dbc); - deq = dbc_bulkin_enq(dbc); - ep_ctx->ep_info = 0; - ep_ctx->ep_info2 = dbc_epctx_info2(BULK_IN_EP, 1024, max_burst); - ep_ctx->deq = cpu_to_le64(deq | dbc->ring_in->cycle_state); + /* Populate bulk in and out endpoint contexts: */ + xhci_dbc_init_ep_contexts(dbc); /* Set DbC context and info registers: */ lo_hi_writeq(dbc->ctx->dma, &dbc->regs->dccp); @@ -436,6 +445,23 @@ dbc_alloc_ctx(struct device *dev, gfp_t flags) return ctx; } +static void xhci_dbc_ring_init(struct xhci_ring *ring) +{ + struct xhci_segment *seg = ring->first_seg; + + /* clear all trbs on ring in case of old ring */ + memset(seg->trbs, 0, TRB_SEGMENT_SIZE); + + /* Only event ring does not use link TRB */ + if (ring->type != TYPE_EVENT) { + union xhci_trb *trb = &seg->trbs[TRBS_PER_SEGMENT - 1]; + + trb->link.segment_ptr = cpu_to_le64(ring->first_seg->dma); + trb->link.control = cpu_to_le32(LINK_TOGGLE | TRB_TYPE(TRB_LINK)); + } + xhci_initialize_ring_info(ring); +} + static struct xhci_ring * xhci_dbc_ring_alloc(struct
device *dev, enum xhci_ring_type type, gfp_t flags) { @@ -464,15 +490,10 @@ xhci_dbc_ring_alloc(struct device *dev, enum xhci_ring_type type, gfp_t flags) seg->dma = dma; - /* Only event ring does not use link TRB */ - if (type != TYPE_EVENT) { - union xhci_trb *trb = &seg->trbs[TRBS_PER_SEGMENT - 1]; - - trb->link.segment_ptr = cpu_to_le64(dma); - trb->link.control = cpu_to_le32(LINK_TOGGLE | TRB_TYPE(TRB_LINK)); - } INIT_LIST_HEAD(&ring->td_list); - xhci_initialize_ring_info(ring); + + xhci_dbc_ring_init(ring); + return ring; dma_fail: kfree(seg); From a5c98e8b1398534ae1feb6e95e2d3ee5215538ed Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 2 Sep 2025 13:53:05 +0300 Subject: [PATCH 0957/3206] xhci: dbc: Fix full DbC transfer ring after several reconnects Pending requests will be flushed on disconnect, and the corresponding TRBs will be turned into No-op TRBs, which are ignored by the xHC controller once it starts processing the ring. If the USB debug cable repeatedly disconnects before the ring is started, then the ring will eventually be filled with No-op TRBs. No new transfers can be queued when the ring is full, and the driver will print the following error message: "xhci_hcd 0000:00:14.0: failed to queue trbs" This is a normal case for 'in' transfers where TRBs are always enqueued in advance, ready to take on incoming data. If no data arrives and the device is disconnected, then the ring dequeue will remain at the beginning of the ring while enqueue points to the first free TRB after the last cancelled No-op TRB. Solve this by reinitializing the rings when the debug cable disconnects and DbC is leaving the configured state. Clear the whole ring buffer, set enqueue and dequeue to the beginning of the ring, and set the cycle bit to its initial state. Cc: stable@vger.kernel.org Fixes: dfba2174dc42 ("usb: xhci: Add DbC support in xHCI driver") Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250902105306.877476-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-dbgcap.c | 23 +++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/xhci-dbgcap.c b/drivers/usb/host/xhci-dbgcap.c index d0faff233e3e33..63edf2d8f24501 100644 --- a/drivers/usb/host/xhci-dbgcap.c +++ b/drivers/usb/host/xhci-dbgcap.c @@ -462,6 +462,25 @@ static void xhci_dbc_ring_init(struct xhci_ring *ring) xhci_initialize_ring_info(ring); } +static int xhci_dbc_reinit_ep_rings(struct xhci_dbc *dbc) +{ + struct xhci_ring *in_ring = dbc->eps[BULK_IN].ring; + struct xhci_ring *out_ring = dbc->eps[BULK_OUT].ring; + + if (!in_ring || !out_ring || !dbc->ctx) { + dev_warn(dbc->dev, "Can't re-init unallocated endpoints\n"); + return -ENODEV; + } + + xhci_dbc_ring_init(in_ring); + xhci_dbc_ring_init(out_ring); + + /* set ep context enqueue, dequeue, and cycle to initial values */ + xhci_dbc_init_ep_contexts(dbc); + + return 0; +} + static struct xhci_ring * xhci_dbc_ring_alloc(struct device *dev, enum xhci_ring_type type, gfp_t flags) { @@ -885,7 +904,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc) dev_info(dbc->dev, "DbC cable unplugged\n"); dbc->state = DS_ENABLED; xhci_dbc_flush_requests(dbc); - + xhci_dbc_reinit_ep_rings(dbc); return EVT_DISC; } @@ -895,7 +914,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc) writel(portsc, &dbc->regs->portsc); dbc->state = DS_ENABLED; xhci_dbc_flush_requests(dbc); - + xhci_dbc_reinit_ep_rings(dbc); return EVT_DISC; } From edcbe06453ddfde21f6aa763f7cab655f26133cc Mon Sep
17 00:00:00 2001 From: Mathias Nyman Date: Tue, 2 Sep 2025 13:53:06 +0300 Subject: [PATCH 0958/3206] xhci: fix memory leak regression when freeing xhci vdev devices depth first Suspend-resume cycle test revealed a memory leak in 6.17-rc3. Turns out the slot_id race fix changes accidentally end up calling xhci_free_virt_device() with an incorrect vdev parameter. The vdev variable was reused for temporary purposes right before calling xhci_free_virt_device(). Fix this by passing the correct vdev parameter. The slot_id race fix that caused this regression was targeted for stable, so this needs to be applied there as well. Fixes: 2eb03376151b ("usb: xhci: Fix slot_id resource race conflict") Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/linux-usb/20250829181354.4450-1-00107082@163.com Suggested-by: Michal Pecio Suggested-by: David Wang <00107082@163.com> Cc: stable@vger.kernel.org Tested-by: David Wang <00107082@163.com> Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250902105306.877476-4-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 81eaad87a3d9d0..c4a6544aa10751 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -962,7 +962,7 @@ static void xhci_free_virt_devices_depth_first(struct xhci_hcd *xhci, int slot_i out: /* we are now at a leaf device */ xhci_debugfs_remove_slot(xhci, slot_id); - xhci_free_virt_device(xhci, vdev, slot_id); + xhci_free_virt_device(xhci, xhci->devs[slot_id], slot_id); } int xhci_alloc_virt_device(struct xhci_hcd *xhci, int slot_id, From 8d63c83d8eb922f6c316320f50c82fa88d099bea Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 25 Aug 2025 12:00:22 -0400 Subject: [PATCH 0959/3206] USB: gadget: dummy-hcd: Fix locking bug in RT-enabled kernels Yunseong Kim and the syzbot fuzzer both reported a problem in RT-enabled kernels caused by the way dummy-hcd mixes interrupt management and spin-locking. The pattern was: local_irq_save(flags); spin_lock(&dum->lock); ... spin_unlock(&dum->lock); ... // calls usb_gadget_giveback_request() local_irq_restore(flags); The code was written this way because usb_gadget_giveback_request() needs to be called with interrupts disabled and the private lock not held. While this pattern works fine in non-RT kernels, it's not good when RT is enabled. RT kernels handle spinlocks much like mutexes; in particular, spin_lock() may sleep. But sleeping is not allowed while local interrupts are disabled. To fix the problem, rewrite the code to conform to the pattern used elsewhere in dummy-hcd and other UDC drivers: spin_lock_irqsave(&dum->lock, flags); ... spin_unlock(&dum->lock); usb_gadget_giveback_request(...); spin_lock(&dum->lock); ... spin_unlock_irqrestore(&dum->lock, flags); This approach satisfies the RT requirements.
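A minimal sketch of the RT-safe pattern, assuming a hypothetical UDC with a private spinlock protecting its request queue (the actual change is in the diff below):

	static void my_complete(struct my_udc *dum, struct usb_ep *ep,
				struct usb_request *req)
	{
		unsigned long flags;

		spin_lock_irqsave(&dum->lock, flags);
		/* unlink the request from the queue under the lock */
		spin_unlock(&dum->lock);		/* drop only the lock */
		usb_gadget_giveback_request(ep, req);	/* runs the completion */
		spin_lock(&dum->lock);			/* re-take for further work */
		spin_unlock_irqrestore(&dum->lock, flags);
	}

On non-RT kernels this keeps local interrupts disabled across the callback exactly as before, since spin_unlock() does not restore them; on RT kernels the sleeping spin_lock() is never taken with interrupts hard-disabled.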
Signed-off-by: Alan Stern Cc: stable Fixes: b4dbda1a22d2 ("USB: dummy-hcd: disable interrupts during req->complete") Reported-by: Yunseong Kim Closes: Reported-by: syzbot+8baacc4139f12fa77909@syzkaller.appspotmail.com Closes: Tested-by: syzbot+8baacc4139f12fa77909@syzkaller.appspotmail.com CC: Sebastian Andrzej Siewior CC: stable@vger.kernel.org Reviewed-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/bb192ae2-4eee-48ee-981f-3efdbbd0d8f0@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index 21dbfb0b3baca8..1cefca660773c4 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -765,8 +765,7 @@ static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req) if (!dum->driver) return -ESHUTDOWN; - local_irq_save(flags); - spin_lock(&dum->lock); + spin_lock_irqsave(&dum->lock, flags); list_for_each_entry(iter, &ep->queue, queue) { if (&iter->req != _req) continue; @@ -776,15 +775,16 @@ static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req) retval = 0; break; } - spin_unlock(&dum->lock); if (retval == 0) { dev_dbg(udc_dev(dum), "dequeued req %p from %s, len %d buf %p\n", req, _ep->name, _req->length, _req->buf); + spin_unlock(&dum->lock); usb_gadget_giveback_request(_ep, _req); + spin_lock(&dum->lock); } - local_irq_restore(flags); + spin_unlock_irqrestore(&dum->lock, flags); return retval; } From f34bfcc77b18375a87091c289c2eb53c249787b4 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Thu, 21 Aug 2025 20:37:57 +0000 Subject: [PATCH 0960/3206] usb: typec: tcpm: properly deliver cable vdms to altmode drivers tcpm_handle_vdm_request delivers messages to the partner altmode or the cable altmode depending on the SVDM response type, which is incorrect. The partner or cable should be chosen based on the received message type instead. Also add this filter to ADEV_NOTIFY_USB_AND_QUEUE_VDM, which is used when the Enter Mode command is responded to by a NAK on SOP or SOP' and when the Exit Mode command is responded to by an ACK on SOP. 
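The core of the fix, repeated for each affected ADEV_* case in the diff below, is to dispatch on the SOP type the message was received on:

	if (rx_sop_type == TCPC_TX_SOP_PRIME)
		/* received on SOP': deliver to the cable plug's altmode */
		typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt);
	else
		/* received on SOP: deliver to the port partner's altmode */
		typec_altmode_vdm(adev, p[0], &p[1], cnt);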
Fixes: 7e7877c55eb1 ("usb: typec: tcpm: add alt mode enter/exit/vdm support for sop'") Cc: stable@vger.kernel.org Signed-off-by: RD Babiera Reviewed-by: Badhri Jagan Sridharan Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250821203759.1720841-2-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 1f6fdfaa34bf12..b2a568a5bc9b0b 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -2426,17 +2426,21 @@ static void tcpm_handle_vdm_request(struct tcpm_port *port, case ADEV_NONE: break; case ADEV_NOTIFY_USB_AND_QUEUE_VDM: - WARN_ON(typec_altmode_notify(adev, TYPEC_STATE_USB, NULL)); - typec_altmode_vdm(adev, p[0], &p[1], cnt); + if (rx_sop_type == TCPC_TX_SOP_PRIME) { + typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt); + } else { + WARN_ON(typec_altmode_notify(adev, TYPEC_STATE_USB, NULL)); + typec_altmode_vdm(adev, p[0], &p[1], cnt); + } break; case ADEV_QUEUE_VDM: - if (response_tx_sop_type == TCPC_TX_SOP_PRIME) + if (rx_sop_type == TCPC_TX_SOP_PRIME) typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt); else typec_altmode_vdm(adev, p[0], &p[1], cnt); break; case ADEV_QUEUE_VDM_SEND_EXIT_MODE_ON_FAIL: - if (response_tx_sop_type == TCPC_TX_SOP_PRIME) { + if (rx_sop_type == TCPC_TX_SOP_PRIME) { if (typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt)) { int svdm_version = typec_get_cable_svdm_version( From 21d8525d2e061cde034277d518411b02eac764e2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 4 Sep 2025 17:39:24 +0200 Subject: [PATCH 0961/3206] usb: gadget: midi2: Fix missing UMP group attributes initialization The gadget card driver forgot to call snd_ump_update_group_attrs() after adding FBs, and this leaves the UMP group attributes uninitialized. As a result, an -ENODEV error is returned when opening a legacy rawmidi device as an inactive group. This patch adds the missing call to address the behavior above. Fixes: 8b645922b223 ("usb: gadget: Add support for USB MIDI 2.0 function driver") Cc: stable Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20250904153932.13589-1-tiwai@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_midi2.c b/drivers/usb/gadget/function/f_midi2.c index 0a800ba53816a8..b351f9ae0fc508 100644 --- a/drivers/usb/gadget/function/f_midi2.c +++ b/drivers/usb/gadget/function/f_midi2.c @@ -1599,6 +1599,7 @@ static int f_midi2_create_card(struct f_midi2 *midi2) strscpy(fb->info.name, ump_fb_name(b), sizeof(fb->info.name)); } + snd_ump_update_group_attrs(ump); } for (i = 0; i < midi2->num_eps; i++) { From 116e79c679a1530cf833d0ff3007061d7a716bd9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 5 Sep 2025 15:32:34 +0200 Subject: [PATCH 0962/3206] usb: gadget: midi2: Fix MIDI2 IN EP max packet size The EP-IN of MIDI2 (altset 1) wasn't initialized in f_midi2_create_usb_configs() as it's an INT EP, unlike the other BULK EPs. But this rather leaves the max packet size unchanged no matter which speed is used, resulting in very slow access. And the wMaxPacketSize values set there look legit for INT EPs, so let's initialize the MIDI2 EP-IN there to achieve the equivalent speed as well.
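Condensed, the post-fix behavior is that the IN descriptors now track the OUT descriptors at every speed. Roughly (a sketch only; the 64-byte full-speed value is the USB full-speed maximum rather than something this patch sets explicitly):

	u16 mps = (speed == USB_SPEED_SUPER) ? 1024 :
		  (speed == USB_SPEED_HIGH)  ?  512 : 64;

	for (i = 0; i < midi2->num_eps; i++) {
		midi2_midi2_ep_out_desc[i].wMaxPacketSize = cpu_to_le16(mps);
		midi2_midi2_ep_in_desc[i].wMaxPacketSize = cpu_to_le16(mps);
	}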
Fixes: 8b645922b223 ("usb: gadget: Add support for USB MIDI 2.0 function driver") Cc: stable Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20250905133240.20966-1-tiwai@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi2.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_midi2.c b/drivers/usb/gadget/function/f_midi2.c index b351f9ae0fc508..de16b02d857e07 100644 --- a/drivers/usb/gadget/function/f_midi2.c +++ b/drivers/usb/gadget/function/f_midi2.c @@ -1737,9 +1737,12 @@ static int f_midi2_create_usb_configs(struct f_midi2 *midi2, case USB_SPEED_HIGH: midi2_midi1_ep_out_desc.wMaxPacketSize = cpu_to_le16(512); midi2_midi1_ep_in_desc.wMaxPacketSize = cpu_to_le16(512); - for (i = 0; i < midi2->num_eps; i++) + for (i = 0; i < midi2->num_eps; i++) { midi2_midi2_ep_out_desc[i].wMaxPacketSize = cpu_to_le16(512); + midi2_midi2_ep_in_desc[i].wMaxPacketSize = + cpu_to_le16(512); + } fallthrough; case USB_SPEED_FULL: midi1_in_eps = midi2_midi1_ep_in_descs; @@ -1748,9 +1751,12 @@ static int f_midi2_create_usb_configs(struct f_midi2 *midi2, case USB_SPEED_SUPER: midi2_midi1_ep_out_desc.wMaxPacketSize = cpu_to_le16(1024); midi2_midi1_ep_in_desc.wMaxPacketSize = cpu_to_le16(1024); - for (i = 0; i < midi2->num_eps; i++) + for (i = 0; i < midi2->num_eps; i++) { midi2_midi2_ep_out_desc[i].wMaxPacketSize = cpu_to_le16(1024); + midi2_midi2_ep_in_desc[i].wMaxPacketSize = + cpu_to_le16(1024); + } midi1_in_eps = midi2_midi1_ep_in_ss_descs; midi1_out_eps = midi2_midi1_ep_out_ss_descs; break; From b5e3277c0f1c3439dd02b58997c06201d0ee8dbf Mon Sep 17 00:00:00 2001 From: Harshit Shah Date: Tue, 2 Sep 2025 12:16:29 -0700 Subject: [PATCH 0963/3206] serial: xilinx_uartps: read reg size from DTS The current implementation uses `CDNS_UART_REGISTER_SPACE(0x1000)` for request_mem_region() and ioremap() in the cdns_uart_request_port() API. The cadence/xilinx IP has its register space defined from offset 0x0 to 0x48, and the IP documentation also mentions that the register map is defined as [6:0]. So the upper region may or may not be used, depending on the IP integration. In the Axiado AX3000 SoC, two UART instances are defined 0x100 apart. That creates issues in the neighboring instance due to overlapping addresses. Since this address space is already defined in the devicetree, use it when requesting the register space.
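A condensed sketch of the resulting flow, pieced together from the diff that follows (not the complete driver code):

	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);

	port->mapbase = res->start;
	port->mapsize = resource_size(res);	/* e.g. 0x100 per instance on AX3000 */

	/* later, in cdns_uart_request_port(): */
	if (!request_mem_region(port->mapbase, port->mapsize, CDNS_UART_NAME))
		return -ENOMEM;
	port->membase = ioremap(port->mapbase, port->mapsize);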
Fixes: 1f7055779001 ("arm64: dts: axiado: Add initial support for AX3000 SoC and eval board") Acked-by: Michal Simek Signed-off-by: Harshit Shah Link: https://lore.kernel.org/r/20250902-xilinx-uartps-reg-size-v3-1-d11cfa7258e3@axiado.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/xilinx_uartps.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index fe457bf1e15bb4..a66b44d21fba25 100644 --- a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -33,7 +33,6 @@ #define CDNS_UART_MINOR 0 /* works best with devtmpfs */ #define CDNS_UART_NR_PORTS 16 #define CDNS_UART_FIFO_SIZE 64 /* FIFO size */ -#define CDNS_UART_REGISTER_SPACE 0x1000 #define TX_TIMEOUT 500000 /* Rx Trigger level */ @@ -1098,15 +1097,15 @@ static int cdns_uart_verify_port(struct uart_port *port, */ static int cdns_uart_request_port(struct uart_port *port) { - if (!request_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE, + if (!request_mem_region(port->mapbase, port->mapsize, CDNS_UART_NAME)) { return -ENOMEM; } - port->membase = ioremap(port->mapbase, CDNS_UART_REGISTER_SPACE); + port->membase = ioremap(port->mapbase, port->mapsize); if (!port->membase) { dev_err(port->dev, "Unable to map registers\n"); - release_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE); + release_mem_region(port->mapbase, port->mapsize); return -ENOMEM; } return 0; @@ -1121,7 +1120,7 @@ static int cdns_uart_request_port(struct uart_port *port) */ static void cdns_uart_release_port(struct uart_port *port) { - release_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE); + release_mem_region(port->mapbase, port->mapsize); iounmap(port->membase); port->membase = NULL; } @@ -1780,6 +1779,7 @@ static int cdns_uart_probe(struct platform_device *pdev) * and triggers invocation of the config_port() entry point. */ port->mapbase = res->start; + port->mapsize = resource_size(res); port->irq = irq; port->dev = &pdev->dev; port->uartclk = clk_get_rate(cdns_uart_data->uartclk); From ab1396af7595e7d49a3850481b24d7fe7cbdfd31 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 5 Sep 2025 22:06:18 -0700 Subject: [PATCH 0964/3206] trace/fgraph: Fix error handling Commit edede7a6dcd7 ("trace/fgraph: Fix the warning caused by missing unregister notifier") added a call to unregister the PM notifier if register_ftrace_graph() failed. It does so unconditionally. However, the PM notifier is only registered with the first call to register_ftrace_graph(). If the first registration was successful and a subsequent registration failed, the notifier is now unregistered even if ftrace graphs are still registered. Fix the problem by only unregistering the PM notifier during error handling if there are no active fgraph registrations. 
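Concretely: the first successful register_ftrace_graph() registers the notifier; if a later registration fails, the old error path unregistered it even though the first user was still active. The fix reduces to one extra condition in the error path (see the diff below):

	ftrace_graph_active--;
	/* the notifier is shared by all fgraph users; drop it only with the last */
	if (!ftrace_graph_active)
		unregister_pm_notifier(&ftrace_suspend_notifier);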
Fixes: edede7a6dcd7 ("trace/fgraph: Fix the warning caused by missing unregister notifier") Closes: https://lore.kernel.org/all/63b0ba5a-a928-438e-84f9-93028dd72e54@roeck-us.net/ Cc: Ye Weihua Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250906050618.2634078-1-linux@roeck-us.net Signed-off-by: Guenter Roeck Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 2a42c1036ea87c..1e3b32b1e82cd5 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1397,7 +1397,8 @@ int register_ftrace_graph(struct fgraph_ops *gops) ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); - unregister_pm_notifier(&ftrace_suspend_notifier); + if (!ftrace_graph_active) + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } From c1628c00c4351dd0727ef7f670694f68d9e663d8 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Sat, 6 Sep 2025 11:56:10 +0800 Subject: [PATCH 0965/3206] tracing/osnoise: Fix null-ptr-deref in bitmap_parselist() A crash was observed with the following output: BUG: kernel NULL pointer dereference, address: 0000000000000010 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 2 UID: 0 PID: 92 Comm: osnoise_cpus Not tainted 6.17.0-rc4-00201-gd69eb204c255 #138 PREEMPT(voluntary) RIP: 0010:bitmap_parselist+0x53/0x3e0 Call Trace: osnoise_cpus_write+0x7a/0x190 vfs_write+0xf8/0x410 ? do_sys_openat2+0x88/0xd0 ksys_write+0x60/0xd0 do_syscall_64+0xa4/0x260 entry_SYSCALL_64_after_hwframe+0x77/0x7f This issue can be reproduced by the code below: fd=open("/sys/kernel/debug/tracing/osnoise/cpus", O_WRONLY); write(fd, "0-2", 0); When the user passes 'count=0' to osnoise_cpus_write(), kmalloc() will return ZERO_SIZE_PTR (16) and cpulist_parse() treats it as a normal value, which triggers the null pointer dereference. Add a check for the parameter 'count'. Cc: Cc: Cc: Link: https://lore.kernel.org/20250906035610.3880282-1-wangliang74@huawei.com Fixes: 17f89102fe23 ("tracing/osnoise: Allow arbitrarily long CPU string") Signed-off-by: Wang Liang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index fd259da0aa6456..337bc0eb5d71bf 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2322,6 +2322,9 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, int running, err; char *buf __free(kfree) = NULL; + if (count < 1) + return 0; + buf = kmalloc(count, GFP_KERNEL); if (!buf) return -ENOMEM; From 5f9efb6b7667043527d377421af2070cc0aa2ecd Mon Sep 17 00:00:00 2001 From: Julien Massot Date: Fri, 5 Sep 2025 13:51:58 +0200 Subject: [PATCH 0966/3206] Input: mtk-pmic-keys - MT6359 has a specific release irq Support for MT6359 PMIC keys has been added recently. However, the key release event is not properly handled: only key press events are generated, leaving key states stuck in "pressed". This patch ensures that both key press and key release events are properly emitted by handling the release logic correctly. Introduce a 'key_release_irq' member in 'mtk_pmic_regs' to identify the devices that have a separate irq for the release event.
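With the new member, the probe-time lookup becomes data driven instead of keying off a single compatible string (the relevant lines from the diff below):

	/* variant data declares whether a separate release irq exists */
	if (mtk_pmic_regs->key_release_irq)
		keys->keys[index].irq_r =
			platform_get_irq_byname(pdev, irqnames_r[index]);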
Fixes: bc25e6bf032e ("Input: mtk-pmic-keys - add support for MT6359 PMIC keys") Signed-off-by: Julien Massot Link: https://lore.kernel.org/r/20250905-radxa-nio-12-l-gpio-v3-1-40f11377fb55@collabora.com Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/mtk-pmic-keys.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/input/keyboard/mtk-pmic-keys.c b/drivers/input/keyboard/mtk-pmic-keys.c index 50e2e792c91d26..c78d9f6d97c4f7 100644 --- a/drivers/input/keyboard/mtk-pmic-keys.c +++ b/drivers/input/keyboard/mtk-pmic-keys.c @@ -55,6 +55,7 @@ struct mtk_pmic_regs { const struct mtk_pmic_keys_regs keys_regs[MTK_PMIC_MAX_KEY_COUNT]; u32 pmic_rst_reg; u32 rst_lprst_mask; /* Long-press reset timeout bitmask */ + bool key_release_irq; }; static const struct mtk_pmic_regs mt6397_regs = { @@ -116,6 +117,7 @@ static const struct mtk_pmic_regs mt6358_regs = { MTK_PMIC_HOMEKEY_RST), .pmic_rst_reg = MT6358_TOP_RST_MISC, .rst_lprst_mask = MTK_PMIC_RST_DU_MASK, + .key_release_irq = true, }; static const struct mtk_pmic_regs mt6359_regs = { @@ -129,6 +131,7 @@ static const struct mtk_pmic_regs mt6359_regs = { MTK_PMIC_HOMEKEY_RST), .pmic_rst_reg = MT6359_TOP_RST_MISC, .rst_lprst_mask = MTK_PMIC_RST_DU_MASK, + .key_release_irq = true, }; struct mtk_pmic_keys_info { @@ -368,7 +371,7 @@ static int mtk_pmic_keys_probe(struct platform_device *pdev) if (keys->keys[index].irq < 0) return keys->keys[index].irq; - if (of_device_is_compatible(node, "mediatek,mt6358-keys")) { + if (mtk_pmic_regs->key_release_irq) { keys->keys[index].irq_r = platform_get_irq_byname(pdev, irqnames_r[index]); From 2b2d4c744e1ab8b8d41c5c2ccf889f5441e50105 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Wed, 27 Aug 2025 19:40:21 +0800 Subject: [PATCH 0967/3206] drivers: base: fix "publically"->"publicly" Trivial fix to spelling mistake in comment text. Signed-off-by: Xichao Zhao Link: https://lore.kernel.org/r/20250827114021.476668-1-zhao.xichao@vivo.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index efc575a00edda9..ccaebb8ac656a7 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -325,7 +325,7 @@ static void cpu_device_release(struct device *dev) * This is an empty function to prevent the driver core from spitting a * warning at us. Yes, I know this is directly opposite of what the * documentation for the driver core and kobjects say, and the author - * of this code has already been publically ridiculed for doing + * of this code has already been publicly ridiculed for doing * something as foolish as this. However, at this point in time, it is * the only way to handle the issue of statically allocated cpu * devices. The different architectures will have their cpu device From a86537ad21c7ba587241b0dcdd186f894a69fac7 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Fri, 29 Aug 2025 22:58:57 +0200 Subject: [PATCH 0968/3206] driver core: get_dev_from_fwnode(): document potential race Commit 9a4681a485ee ("driver core: Export get_dev_from_fwnode()") made get_dev_from_fwnode() publicly available, but didn't document the guarantees a caller must uphold: get_dev_from_fwnode() obtains a reference count from the device pointer stored in a struct fwnode_handle. While having its own reference count, struct fwnode_handle does not keep a reference count of the device it has a pointer to. 
Consequently, a caller must guarantee that it is impossible that the last device reference is dropped and the device is released concurrently while calling get_dev_from_fwnode(), otherwise this is a potential UAF and hence a bug. Thus, document this potential race condition for get_dev_from_fwnode(). Cc: Ulf Hansson Cc: Saravana Kannan Signed-off-by: Danilo Krummrich Link: https://lore.kernel.org/r/20250829205911.33142-1-dakr@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/base/core.c b/drivers/base/core.c index d22d6b23e75898..cd806be435cdd0 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -5278,6 +5278,25 @@ void device_set_node(struct device *dev, struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(device_set_node); +/** + * get_dev_from_fwnode - Obtain a reference count of the struct device the + * struct fwnode_handle is associated with. + * @fwnode: The pointer to the struct fwnode_handle to obtain the struct device + * reference count of. + * + * This function obtains a reference count of the device the device pointer + * embedded in the struct fwnode_handle points to. + * + * Note that the struct device pointer embedded in struct fwnode_handle does + * *not* have a reference count of the struct device itself. + * + * Hence, it is a UAF (and thus a bug) to call this function if the caller can't + * guarantee that the last reference count of the corresponding struct device is + * not dropped concurrently. + * + * This is possible since struct fwnode_handle has its own reference count and + * hence can out-live the struct device it is associated with. + */ struct device *get_dev_from_fwnode(struct fwnode_handle *fwnode) { return get_device((fwnode)->dev); From 716cec5fc92f6d4090a54ddb01042bce02b3c771 Mon Sep 17 00:00:00 2001 From: Gil Fine Date: Sun, 31 Aug 2025 22:49:30 +0300 Subject: [PATCH 0969/3206] driver core: Fix order of the kernel-doc parameters Fix the order of the kernel-doc parameters in device_find_child() and device_for_each_child*() functions to match the actual functions signature. No functional changes. Signed-off-by: Gil Fine Link: https://lore.kernel.org/r/20250831194930.2063390-1-gil.fine@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index cd806be435cdd0..fa8093119602db 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3994,8 +3994,8 @@ const char *device_get_devnode(const struct device *dev, /** * device_for_each_child - device child iterator. * @parent: parent struct device. - * @fn: function to be called for each device. * @data: data for the callback. + * @fn: function to be called for each device. * * Iterate over @parent's child devices, and call @fn for each, * passing it @data. @@ -4024,8 +4024,8 @@ EXPORT_SYMBOL_GPL(device_for_each_child); /** * device_for_each_child_reverse - device child iterator in reversed order. * @parent: parent struct device. - * @fn: function to be called for each device. * @data: data for the callback. + * @fn: function to be called for each device. * * Iterate over @parent's child devices, and call @fn for each, * passing it @data. @@ -4055,8 +4055,8 @@ EXPORT_SYMBOL_GPL(device_for_each_child_reverse); * device_for_each_child_reverse_from - device child iterator in reversed order. * @parent: parent struct device. 
* @from: optional starting point in child list - * @fn: function to be called for each device. * @data: data for the callback. + * @fn: function to be called for each device. * * Iterate over @parent's child devices, starting at @from, and call @fn * for each, passing it @data. This helper is identical to @@ -4089,8 +4089,8 @@ EXPORT_SYMBOL_GPL(device_for_each_child_reverse_from); /** * device_find_child - device iterator for locating a particular device. * @parent: parent struct device - * @match: Callback function to check device * @data: Data to pass to match function + * @match: Callback function to check device * * This is similar to the device_for_each_child() function above, but it * returns a reference to a device that is 'found' for later use, as From eca710386972f2a72708b0b2a615eb150d218e18 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 27 Aug 2025 13:05:41 +0300 Subject: [PATCH 0970/3206] driver core: auxiliary bus: Drop dev_pm_domain_detach() call Starting with commit f99508074e78 ("PM: domains: Detach on device_unbind_cleanup()"), there is no longer a need to call dev_pm_domain_detach() in the bus remove function. The device_unbind_cleanup() function now handles this to avoid invoking devres cleanup handlers while the PM domain is powered off, which could otherwise lead to failures as described in the above-mentioned commit. Drop the explicit dev_pm_domain_detach() call and rely instead on the flags passed to dev_pm_domain_attach() to power off the domain. Signed-off-by: Claudiu Beznea Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20250827100541.926350-1-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/auxiliary.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index 12ffdd8437567f..f65ba30d0617ff 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -217,17 +217,14 @@ static int auxiliary_bus_probe(struct device *dev) struct auxiliary_device *auxdev = to_auxiliary_dev(dev); int ret; - ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); + ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON | + PD_FLAG_DETACH_POWER_OFF); if (ret) { dev_warn(dev, "Failed to attach to PM Domain : %d\n", ret); return ret; } - ret = auxdrv->probe(auxdev, auxiliary_match_id(auxdrv->id_table, auxdev)); - if (ret) - dev_pm_domain_detach(dev, true); - - return ret; + return auxdrv->probe(auxdev, auxiliary_match_id(auxdrv->id_table, auxdev)); } static void auxiliary_bus_remove(struct device *dev) @@ -237,7 +234,6 @@ static void auxiliary_bus_remove(struct device *dev) if (auxdrv->remove) auxdrv->remove(auxdev); - dev_pm_domain_detach(dev, true); } static void auxiliary_bus_shutdown(struct device *dev) From 3c9ba2777d6c86025e1ba4186dc5cd930e40ec5f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 22 Aug 2025 07:07:14 +0000 Subject: [PATCH 0971/3206] kernfs: Fix UAF in polling when open file is released A use-after-free (UAF) vulnerability was identified in the PSI (Pressure Stall Information) monitoring mechanism: BUG: KASAN: slab-use-after-free in psi_trigger_poll+0x3c/0x140 Read of size 8 at addr ffff3de3d50bd308 by task systemd/1 psi_trigger_poll+0x3c/0x140 cgroup_pressure_poll+0x70/0xa0 cgroup_file_poll+0x8c/0x100 kernfs_fop_poll+0x11c/0x1c0 ep_item_poll.isra.0+0x188/0x2c0 Allocated by task 1: cgroup_file_open+0x88/0x388 kernfs_fop_open+0x73c/0xaf0 do_dentry_open+0x5fc/0x1200 vfs_open+0xa0/0x3f0 do_open+0x7e8/0xd08 
path_openat+0x2fc/0x6b0 do_filp_open+0x174/0x368 Freed by task 8462: cgroup_file_release+0x130/0x1f8 kernfs_drain_open_files+0x17c/0x440 kernfs_drain+0x2dc/0x360 kernfs_show+0x1b8/0x288 cgroup_file_show+0x150/0x268 cgroup_pressure_write+0x1dc/0x340 cgroup_file_write+0x274/0x548 Reproduction Steps: 1. Open test/cpu.pressure and establish epoll monitoring 2. Disable monitoring: echo 0 > test/cgroup.pressure 3. Re-enable monitoring: echo 1 > test/cgroup.pressure The race condition occurs because: 1. When cgroup.pressure is disabled (echo 0 > cgroup.pressure), it: - Releases PSI triggers via cgroup_file_release() - Frees of->priv through kernfs_drain_open_files() 2. While epoll still holds reference to the file and continues polling 3. Re-enabling (echo 1 > cgroup.pressure) accesses freed of->priv epolling disable/enable cgroup.pressure fd=open(cpu.pressure) while(1) ... epoll_wait kernfs_fop_poll kernfs_get_active = true echo 0 > cgroup.pressure ... cgroup_file_show kernfs_show // inactive kn kernfs_drain_open_files cft->release(of); kfree(ctx); ... kernfs_get_active = false echo 1 > cgroup.pressure kernfs_show kernfs_activate_one(kn); kernfs_fop_poll kernfs_get_active = true cgroup_file_poll psi_trigger_poll // UAF ... end: close(fd) To address this issue, introduce kernfs_get_active_of() for kernfs open files to obtain active references. This function will fail if the open file has been released. Replace kernfs_get_active() with kernfs_get_active_of() to prevent further operations on released file descriptors. Fixes: 34f26a15611a ("sched/psi: Per-cgroup PSI accounting disable/re-enable interface") Cc: stable Reported-by: Zhang Zhaotian Signed-off-by: Chen Ridong Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250822070715.1565236-2-chenridong@huaweicloud.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 58 +++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index a6c692cac61659..9adf36e6364b7d 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) !list_empty(&of->list)); } +/* Get active reference to kernfs node for an open file */ +static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of) +{ + /* Skip if file was already released */ + if (unlikely(of->released)) + return NULL; + + if (!kernfs_get_active(of->kn)) + return NULL; + + return of; +} + +static void kernfs_put_active_of(struct kernfs_open_file *of) +{ + return kernfs_put_active(of->kn); +} + /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * @@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) if (ops->seq_stop) ops->seq_stop(sf, v); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) @@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); @@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; @@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) else len = -EINVAL; - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len < 0) @@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; @@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) else len = -EINVAL; - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len > 0) @@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma) if (!of->vm_ops) return; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return; if (of->vm_ops->open) of->vm_ops->open(vma); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) @@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = 0; @@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) else file_update_time(file); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, if (!of->vm_ops) return -EINVAL; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) mutex_lock(&of->mutex); rc = -ENODEV; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) goto out_unlock; ops = kernfs_ops(of->kn); @@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) } vma->vm_ops = &kernfs_vm_ops; out_put: - kernfs_put_active(of->kn); + kernfs_put_active_of(of); out_unlock: mutex_unlock(&of->mutex); @@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; - if (!kernfs_get_active(kn)) + if (!kernfs_get_active_of(of)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) @@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) else ret = kernfs_generic_poll(of, wait); - kernfs_put_active(kn); + kernfs_put_active_of(of); return ret; } @@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); return -ENODEV; } @@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) else ret = generic_file_llseek(file, offset, whence); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); return ret; } From 4c48aed6dfcd32ea23e52adc1072405a62facf46 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Wed, 3 Sep 2025 19:37:22 +0800 Subject: [PATCH 0972/3206] driver core: auxiliary bus: Optimize logic of auxiliary_match_id() auxiliary_match_id() repeatedly calculates variable @match_size in the for loop, however, the variable is fixed actually, so it is enough to only calculate the variable once. Besides, the function should return directly if name of the @auxdev does not include '.', but it still iterates over the ID table. Additionally, statement 'dev_name(&auxdev->dev)' is fixed, but may be evaluated more than 3 times. Optimize logic of the function by: - Move the logic calculating the variable out of the for loop - Return NULL directly if @p == NULL - Give the statement an dedicated local variable @auxdev_name Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250903-fix_auxbus-v2-1-3eae8374fd65@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/auxiliary.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index f65ba30d0617ff..04bdbff4dbe53f 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -171,17 +171,18 @@ static const struct auxiliary_device_id *auxiliary_match_id(const struct auxiliary_device_id *id, const struct auxiliary_device *auxdev) { - for (; id->name[0]; id++) { - const char *p = strrchr(dev_name(&auxdev->dev), '.'); - int match_size; + const char *auxdev_name = dev_name(&auxdev->dev); + const char *p = strrchr(auxdev_name, '.'); + int match_size; - if (!p) - continue; - match_size = p - dev_name(&auxdev->dev); + if (!p) + return NULL; + match_size = p - auxdev_name; + for (; id->name[0]; id++) { /* use dev_name(&auxdev->dev) prefix before last '.' char to match to */ if (strlen(id->name) == match_size && - !strncmp(dev_name(&auxdev->dev), id->name, match_size)) + !strncmp(auxdev_name, id->name, match_size)) return id; } return NULL; From 980927603c4b4fa499771bf36f445808d6859a61 Mon Sep 17 00:00:00 2001 From: "Darshan R." Date: Wed, 23 Jul 2025 07:06:59 +0000 Subject: [PATCH 0973/3206] power: supply: gpio-charger: Clean up spacing for better readability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed some minor style issues reported by checkpatch.pl. Mainly adjusted the spacing around operators and type casts to match the kernel coding conventions. For example: - Changed `gpios[ndescs-i-1]` to `gpios[ndescs - i - 1]` - Added space in `(u32*)` to make it `(u32 *)` - Cleaned up spacing in a `for` loop No functional changes — just making the code easier to read and consistent with the rest of the kernel. Signed-off-by: Darshan R. 
Signed-off-by: Sebastian Reichel --- drivers/power/supply/gpio-charger.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/power/supply/gpio-charger.c b/drivers/power/supply/gpio-charger.c index 1b2da9b5fb6541..2504190eba82e6 100644 --- a/drivers/power/supply/gpio-charger.c +++ b/drivers/power/supply/gpio-charger.c @@ -79,7 +79,8 @@ static int set_charge_current_limit(struct gpio_charger *gpio_charger, int val) for (i = 0; i < ndescs; i++) { bool val = (mapping.gpiodata >> i) & 1; - gpiod_set_value_cansleep(gpios[ndescs-i-1], val); + + gpiod_set_value_cansleep(gpios[ndescs - i - 1], val); } gpio_charger->charge_current_limit = mapping.limit_ua; @@ -226,14 +227,14 @@ static int init_charge_current_limit(struct device *dev, gpio_charger->current_limit_map_size = len / 2; len = device_property_read_u32_array(dev, "charge-current-limit-mapping", - (u32*) gpio_charger->current_limit_map, len); + (u32 *) gpio_charger->current_limit_map, len); if (len < 0) return len; set_def_limit = !device_property_read_u32(dev, "charge-current-limit-default-microamp", &def_limit); - for (i=0; i < gpio_charger->current_limit_map_size; i++) { + for (i = 0; i < gpio_charger->current_limit_map_size; i++) { if (gpio_charger->current_limit_map[i].limit_ua > cur_limit) { dev_err(dev, "charge-current-limit-mapping not sorted by current in descending order\n"); return -EINVAL; From d3684397ea9ba2edf02be0aa2b4dcab3bd74c503 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Aug 2025 19:29:55 -0400 Subject: [PATCH 0974/3206] nfs/localio: avoid bouncing LOCALIO if nfs_client_is_local() Previously nfs_local_probe() was made to disable and then attempt to re-enable LOCALIO (via LOCALIO protocol handshake) if/when it was called and LOCALIO already enabled. Vague memory for _why_ this was the case is that this was useful if/when a local NFS server were to be restarted with a local NFS client connected to it. But as it happens this causes an absurd amount of LOCALIO flapping which has a side-effect of too much IO being needlessly sent to NFSD (using RPC over the loopback network interface). This is the definition of "serious performance loss" (that negates the point of having LOCALIO). So remove this mis-optimization for re-enabling LOCALIO if/when an NFS server is restarted (which is an extremely rare thing to do). Will revisit testing that scenario again but in the meantime this patch restores the full benefit of LOCALIO. 
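For reference, the probe path after this change reduces to an early return
when LOCALIO is already enabled. A simplified sketch of the resulting logic
(not the verbatim function; the exact change is in the diff below):

    static void nfs_local_probe(struct nfs_client *clp)
    {
            ...
            /* Already enabled: leave it alone rather than flapping. */
            if (nfs_client_is_local(clp))
                    return;

            if (!nfs_uuid_begin(&clp->cl_uuid))
                    return;
            ...
    }

Callers that really do need a fresh handshake, such as the error paths in
__nfs_local_open_fh(), now disable LOCALIO explicitly via
nfs_localio_disable_client() before re-probing.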
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index bdb82a19136aaa..97abf62f109d2e 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -180,10 +180,8 @@ static void nfs_local_probe(struct nfs_client *clp) return; } - if (nfs_client_is_local(clp)) { - /* If already enabled, disable and re-enable */ - nfs_localio_disable_client(clp); - } + if (nfs_client_is_local(clp)) + return; if (!nfs_uuid_begin(&clp->cl_uuid)) return; @@ -244,7 +242,8 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, case -ENOMEM: case -ENXIO: case -ENOENT: - /* Revalidate localio, will disable if unsupported */ + /* Revalidate localio */ + nfs_localio_disable_client(clp); nfs_local_probe(clp); } } From 5a46d2339a5ae268ede53a221f20433d8ea4f2f9 Mon Sep 17 00:00:00 2001 From: Tigran Mkrtchyan Date: Thu, 28 Aug 2025 16:51:00 +0200 Subject: [PATCH 0975/3206] flexfiles/pNFS: fix NULL checks on result of ff_layout_choose_ds_for_read Recent commit f06bedfa62d5 ("pNFS/flexfiles: don't attempt pnfs on fatal DS errors") has changed the error return type of ff_layout_choose_ds_for_read() from NULL to an error pointer. However, not all code paths have been updated to match the change. Thus, some non-NULL checks will accept error pointers as a valid return value. Reported-by: Dan Carpenter Suggested-by: Dan Carpenter Fixes: f06bedfa62d5 ("pNFS/flexfiles: don't attempt pnfs on fatal DS errors") Signed-off-by: Tigran Mkrtchyan Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 8dc921d835388e..f8ab7b4e09e7e2 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -773,8 +773,11 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, continue; if (check_device && - nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) + nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) { + // reinitialize the error state in case if this is the last iteration + ds = ERR_PTR(-EINVAL); continue; + } *best_idx = idx; break; @@ -804,7 +807,7 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, struct nfs4_pnfs_ds *ds; ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); - if (ds) + if (!IS_ERR(ds)) return ds; return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); } @@ -818,7 +821,7 @@ ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, best_idx); - if (ds || !pgio->pg_mirror_idx) + if (!IS_ERR(ds) || !pgio->pg_mirror_idx) return ds; return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); } @@ -868,7 +871,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, req->wb_nio = 0; ds = ff_layout_get_ds_for_read(pgio, &ds_idx); - if (!ds) { + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; pnfs_generic_pg_cleanup(pgio); @@ -1072,11 +1075,13 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; + struct nfs4_pnfs_ds *ds; - if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx)) - ff_layout_send_layouterror(hdr->lseg); - else + ds = 
ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx); + if (IS_ERR(ds)) pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); + else + ff_layout_send_layouterror(hdr->lseg); pnfs_read_resend_pnfs(hdr, new_idx); } From b1817b18ff20e69f5accdccefaf78bf5454bede2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Sep 2025 18:46:16 -0400 Subject: [PATCH 0976/3206] NFS: Protect against 'eof page pollution' This commit fixes the failing xfstest 'generic/363'. When the user mmaps() an area that extends beyond the end of file, and proceeds to write data into the folio that straddles that eof, we're required to discard that folio data if the user calls some function that extends the file length. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 33 +++++++++++++++++++++++++++++++++ fs/nfs/inode.c | 9 +++++++-- fs/nfs/internal.h | 2 ++ fs/nfs/nfs42proc.c | 14 +++++++++++--- fs/nfs/nfstrace.h | 1 + 5 files changed, 54 insertions(+), 5 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 86e36c630f09ea..a3105f944a0ee3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -280,6 +281,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL_GPL(nfs_file_fsync); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to) +{ + struct folio *folio; + + if (from >= to) + return; + + folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + + if (folio_test_uptodate(folio)) { + loff_t fpos = folio_pos(folio); + size_t offset = from - fpos; + size_t end = folio_size(folio); + + if (to - fpos < end) + end = to - fpos; + folio_zero_segment(folio, offset, end); + trace_nfs_size_truncate_folio(mapping->host, to); + } + + folio_unlock(folio); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(nfs_truncate_last_folio); + /* * Decide whether a read/modify/write cycle may be more efficient * then a modify/write/read cycle when writing to a page in the @@ -356,6 +388,7 @@ static int nfs_write_begin(const struct kiocb *iocb, dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); + nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); fgp |= fgf_set_order(len); start: diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae42308..0b141feacc525b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -716,6 +716,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; + loff_t oldsize = i_size_read(inode); int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -731,7 +732,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (error) return error; - if (attr->ia_size == i_size_read(inode)) + if (attr->ia_size == oldsize) attr->ia_valid &= ~ATTR_SIZE; } @@ -777,8 +778,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); - if (error == 0) + if (error == 0) { + if (attr->ia_valid & ATTR_SIZE) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + attr->ia_size); error = nfs_refresh_inode(inode, fattr); + } nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 74d712b584238d..1433ae13dba045 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -437,6 +437,8 @@ int nfs_file_release(struct 
inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 01c01f45358b7c..4420b8740e2ff7 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -137,6 +137,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) @@ -145,7 +146,11 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == -EOPNOTSUPP) + + if (err == 0) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + else if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | NFS_CAP_ZERO_RANGE); @@ -183,6 +188,7 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) @@ -191,9 +197,11 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == 0) + if (err == 0) { + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); truncate_pagecache_range(inode, offset, (offset + len) -1); - if (err == -EOPNOTSUPP) + } else if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; inode_unlock(inode); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 96b1323318c2f5..627115179795fc 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -272,6 +272,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class, TP_ARGS(inode, new_size)) DEFINE_NFS_UPDATE_SIZE_EVENT(truncate); +DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio); DEFINE_NFS_UPDATE_SIZE_EVENT(wcc); DEFINE_NFS_UPDATE_SIZE_EVENT(update); DEFINE_NFS_UPDATE_SIZE_EVENT(grow); From b2036bb65114c01caf4a1afe553026e081703c8c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Sep 2025 10:25:35 -0400 Subject: [PATCH 0977/3206] NFSv4.2: Protect copy offload and clone against 'eof page pollution' The NFSv4.2 copy offload and clone functions can also end up extending the size of the destination file, so they too need to call nfs_truncate_last_folio(). Reported-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 4420b8740e2ff7..e2fea37c53484b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -362,22 +362,27 @@ static int process_copy_commit(struct file *dst, loff_t pos_dst, /** * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload - * @inode: pointer to destination inode + * @file: pointer to destination file * @pos: destination offset * @len: copy length + * @oldsize: length of the file prior to clone/copy * * Punch a hole in the inode page cache, so that the NFS client will * know to retrieve new data. 
* Update the file size if necessary, and then mark the inode as having * invalid cached values for change attribute, ctime, mtime and space used. */ -static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len) +static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len, + loff_t oldsize) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; loff_t newsize = pos + len; loff_t end = newsize - 1; - WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_SHIFT, end >> PAGE_SHIFT)); + nfs_truncate_last_folio(mapping, oldsize, pos); + WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT)); spin_lock(&inode->i_lock); if (newsize > i_size_read(inode)) @@ -410,6 +415,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_server *src_server = NFS_SERVER(src_inode); loff_t pos_src = args->src_pos; loff_t pos_dst = args->dst_pos; + loff_t oldsize_dst = i_size_read(dst_inode); size_t count = args->count; ssize_t status; @@ -483,7 +489,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, goto out; } - nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count); + nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst); nfs_invalidate_atime(src_inode); status = res->write_res.count; out: @@ -1250,6 +1256,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct nfs42_clone_res res = { .server = server, }; + loff_t oldsize_dst = i_size_read(dst_inode); int status; msg->rpc_argp = &args; @@ -1284,7 +1291,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, /* a zero-length count means clone to EOF in src */ if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; - nfs42_copy_dest_done(dst_inode, dst_offset, count); + nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); } From 9eb90f435415c7da4800974ed943e39b5578ee7f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 5 Sep 2025 12:06:23 -0400 Subject: [PATCH 0978/3206] NFS: Serialise O_DIRECT i/o and truncate() Ensure that all O_DIRECT reads and writes are complete, and prevent the initiation of new i/o until the setattr operation that will truncate the file is complete. 
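The mechanism is a small helper, nfs_file_block_o_direct(), that drains
in-flight direct i/o and flips the inode back to buffered mode. A simplified
sketch of the intended ordering in nfs_setattr(), assuming the exclusive
inode->i_rwsem the helper requires is already held:

    /* Write all dirty data */
    if (S_ISREG(inode->i_mode)) {
            nfs_file_block_o_direct(NFS_I(inode)); /* waits for O_DIRECT */
            nfs_sync_inode(inode);                 /* flushes buffered i/o */
    }
    /* only now issue the SETATTR that truncates the file */

inode_dio_wait() inside the helper sleeps until all outstanding direct i/o
on the inode has completed, so the truncate cannot overlap a direct read or
write.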
Fixes: a5864c999de6 ("NFS: Do not serialise O_DIRECT reads and writes") Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 +++- fs/nfs/internal.h | 10 ++++++++++ fs/nfs/io.c | 13 ++----------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0b141feacc525b..49df9debb1a691 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -768,8 +768,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + nfs_file_block_o_direct(NFS_I(inode)); nfs_sync_inode(inode); + } fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) { diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1433ae13dba045..c0a44f389f8f42 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -532,6 +532,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } +/* Must be called with exclusively locked inode->i_rwsem */ +static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(&nfsi->vfs_inode); + } +} + + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 3388faf2acb9f5..d275b0a250bf3b 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -14,15 +14,6 @@ #include "internal.h" -/* Call with exclusively locked inode->i_rwsem */ -static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) -{ - if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { - clear_bit(NFS_INO_ODIRECT, &nfsi->flags); - inode_dio_wait(inode); - } -} - /** * nfs_start_io_read - declare the file is being used for buffered reads * @inode: file inode @@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (err) return err; - nfs_block_o_direct(nfsi, inode); + nfs_file_block_o_direct(nfsi); downgrade_write(&inode->i_rwsem); return 0; @@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (!err) - nfs_block_o_direct(NFS_I(inode), inode); + nfs_file_block_o_direct(NFS_I(inode)); return err; } From b93128f29733af5d427a335978a19884c2c230e2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 5 Sep 2025 12:11:17 -0400 Subject: [PATCH 0979/3206] NFSv4.2: Serialise O_DIRECT i/o and fallocate() Ensure that all O_DIRECT reads and writes complete before calling fallocate so that we don't race w.r.t. attribute updates. 
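For background, inode_dio_wait() works because every direct i/o path
brackets itself with an elevated i_dio_count; a rough sketch of the generic
VFS pattern this fix relies on (the NFS specifics live in fs/nfs/direct.c):

    inode_dio_begin(inode);  /* raise i_dio_count before submitting */
    /* ... direct i/o in flight ... */
    inode_dio_end(inode);    /* drop the count and wake any waiter */

    /* elsewhere, under exclusive inode->i_rwsem: */
    inode_dio_wait(inode);   /* sleeps until i_dio_count reaches zero */

Calling nfs_file_block_o_direct() before nfs_sync_inode() in
nfs42_proc_fallocate() therefore guarantees no direct i/o is still in
flight when the fallocate RPC is sent.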
Fixes: 99f237832243 ("NFSv4.2: Always flush out writes in nfs42_proc_fallocate()") Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index e2fea37c53484b..1a169372ca16e1 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, exception.inode = inode; exception.state = lock->open_context->state; + nfs_file_block_o_direct(NFS_I(inode)); err = nfs_sync_inode(inode); if (err) goto out; From c80ebeba1198eac8811ab0dba36ecc13d51e4438 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Sep 2025 10:40:24 -0400 Subject: [PATCH 0980/3206] NFSv4.2: Serialise O_DIRECT i/o and clone range Ensure that all O_DIRECT reads and writes complete before cloning a file range, so that both the source and destination are up to date. Fixes: a5864c999de6 ("NFS: Do not serialise O_DIRECT reads and writes") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1d6b5f4230c9b2..c9a0d1e420c6cb 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -278,9 +278,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, lock_two_nondirectories(src_inode, dst_inode); /* flush all pending writes on both src and dst so that server * has the latest data */ + nfs_file_block_o_direct(NFS_I(src_inode)); ret = nfs_sync_inode(src_inode); if (ret) goto out_unlock; + nfs_file_block_o_direct(NFS_I(dst_inode)); ret = nfs_sync_inode(dst_inode); if (ret) goto out_unlock; From ca247c89900ae90207f4d321e260cd93b7c7d104 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Sep 2025 10:54:13 -0400 Subject: [PATCH 0981/3206] NFSv4.2: Serialise O_DIRECT i/o and copy range Ensure that all O_DIRECT reads and writes complete before copying a file range, so that the destination is up to date. Fixes: a5864c999de6 ("NFS: Do not serialise O_DIRECT reads and writes") Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 1a169372ca16e1..6a0b5871ba3b09 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -445,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, return status; } + nfs_file_block_o_direct(NFS_I(dst_inode)); status = nfs_sync_inode(dst_inode); if (status) return status; From b7b8574225e9d2b5f1fb5483886ab797892f43b5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Sep 2025 11:48:57 -0400 Subject: [PATCH 0982/3206] NFS: nfs_invalidate_folio() must observe the offset and size arguments If we're truncating part of the folio, then we need to write out the data on the part that is not covered by the cancellation. 
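In other words, the invalidation callback now distinguishes partial from
full invalidation. A sketch of the resulting logic (the complete hunk is in
the diff below):

    static void nfs_invalidate_folio(struct folio *folio, size_t offset,
                                     size_t length)
    {
            ...
            if (offset != 0 || length < folio_size(folio))
                    nfs_wb_folio(inode, folio);        /* write out survivors */
            else
                    nfs_wb_folio_cancel(inode, folio); /* whole folio discarded */
            ...
    }

Previously the partial case returned early and did neither, so dirty data
on the surviving part of the folio was never written out by this path.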
Fixes: d47992f86b30 ("mm: change invalidatepage prototype to accept length") Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 7 ++++--- fs/nfs/write.c | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a3105f944a0ee3..8059ece82468d0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -475,10 +475,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset, dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", folio->index, offset, length); - if (offset != 0 || length < folio_size(folio)) - return; /* Cancel any unstarted writes on this page */ - nfs_wb_folio_cancel(inode, folio); + if (offset != 0 || length < folio_size(folio)) + nfs_wb_folio(inode, folio); + else + nfs_wb_folio_cancel(inode, folio); folio_wait_private_2(folio); /* [DEPRECATED] */ trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 8b7c0473796755..e359fbcdc8a095 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2045,6 +2045,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) * release it */ nfs_inode_remove_request(req); nfs_unlock_and_release_request(req); + folio_cancel_dirty(folio); } return ret; From c12b6a7b12a13ccd3aece6be09345c1944e18d3e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Sep 2025 20:11:03 -0400 Subject: [PATCH 0983/3206] NFS: Fix the marking of the folio as up to date Since all callers of nfs_page_group_covers_page() have already ensured that there is only one group member, all that is required is to check that the entire folio contains dirty data. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 52 +++++--------------------------------------------- 1 file changed, 5 insertions(+), 47 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e359fbcdc8a095..647c53d1418ae6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -237,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error) } /* - * nfs_page_group_search_locked - * @head - head request of page group - * @page_offset - offset into page + * nfs_page_covers_folio + * @req: struct nfs_page * - * Search page group with head @head to find a request that contains the - * page offset @page_offset. - * - * Returns a pointer to the first matching nfs request, or NULL if no - * match is found. - * - * Must be called with the page group lock held - */ -static struct nfs_page * -nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) -{ - struct nfs_page *req; - - req = head; - do { - if (page_offset >= req->wb_pgbase && - page_offset < (req->wb_pgbase + req->wb_bytes)) - return req; - - req = req->wb_this_page; - } while (req != head); - - return NULL; -} - -/* - * nfs_page_group_covers_page - * @head - head request of page group - * - * Return true if the page group with head @head covers the whole page, - * returns false otherwise + * Return true if the request covers the whole folio. 
+ * Note that the caller should ensure all subrequests have been joined */ static bool nfs_page_group_covers_page(struct nfs_page *req) { unsigned int len = nfs_folio_length(nfs_page_to_folio(req)); - struct nfs_page *tmp; - unsigned int pos = 0; - - nfs_page_group_lock(req); - for (;;) { - tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (!tmp) - break; - pos = tmp->wb_pgbase + tmp->wb_bytes; - } - - nfs_page_group_unlock(req); - return pos >= len; + return req->wb_pgbase == 0 && req->wb_bytes == len; } /* We can set the PG_uptodate flag if we see that a write request From 199cd9e8d14bc14bdbd1fa3031ce26dac9781507 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Sep 2025 09:49:33 -0400 Subject: [PATCH 0984/3206] Revert "SUNRPC: Don't allow waiting for exiting tasks" This reverts commit 14e41b16e8cb677bb440dca2edba8b041646c742. This patch breaks the LTP acct02 test, so let's revert and look for a better solution. Reported-by: Mark Brown Reported-by: Harshvardhan Jha Link: https://lore.kernel.org/linux-nfs/7d4d57b0-39a3-49f1-8ada-60364743e3b4@sirena.org.uk/ Cc: stable@vger.kernel.org # 6.15.x Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 73bc39281ef5f5..9b45fbdc90cabe 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,8 +276,6 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (unlikely(current->flags & PF_EXITING)) - return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; From 9559d2fffd4f9b892165eed48198a0e5cb8504e6 Mon Sep 17 00:00:00 2001 From: Justin Worrell Date: Thu, 4 Sep 2025 16:09:57 -0500 Subject: [PATCH 0985/3206] SUNRPC: call xs_sock_process_cmsg for all cmsg xs_sock_recv_cmsg was failing to call xs_sock_process_cmsg for any cmsg type other than TLS_RECORD_TYPE_ALERT (TLS_RECORD_TYPE_DATA, and other values not handled.) Based on my reading of the previous commit (cc5d5908: sunrpc: fix client side handling of tls alerts), it looks like only iov_iter_revert should be conditional on TLS_RECORD_TYPE_ALERT (but that other cmsg types should still call xs_sock_process_cmsg). On my machine, I was unable to connect (over mtls) to an NFS share hosted on FreeBSD. With this patch applied, I am able to mount the share again. 
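The resulting receive path, roughly (only the revert is now conditional on
the record type):

    ret = sock_recvmsg(sock, &msg, flags);
    if (ret > 0) {
            if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT)
                    iov_iter_revert(&msg.msg_iter, ret);
            ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg,
                                       -EAGAIN);
    }

so TLS_RECORD_TYPE_DATA and any other record types also flow through
xs_sock_process_cmsg() instead of bypassing it.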
Fixes: cc5d59081fa2 ("sunrpc: fix client side handling of tls alerts") Signed-off-by: Justin Worrell Reviewed-and-tested-by: Scott Mayhew Link: https://lore.kernel.org/r/20250904211038.12874-3-jworrell@gmail.com Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c5f7bbf5775ff8..3aa987e7f0724d 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -407,9 +407,9 @@ xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, alert_kvec.iov_len); ret = sock_recvmsg(sock, &msg, flags); - if (ret > 0 && - tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { - iov_iter_revert(&msg.msg_iter, ret); + if (ret > 0) { + if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) + iov_iter_revert(&msg.msg_iter, ret); ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, -EAGAIN); } From 5afce048a9fa6de350110c7078e69b59f5cb3eb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Garc=C3=ADa?= Date: Thu, 24 Jul 2025 09:41:33 +0200 Subject: [PATCH 0986/3206] power: supply: bq2415x: replace deprecated strcpy() with strscpy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit strcpy() is deprecated for NUL-terminated strings. Replace it with strscpy() for revstr (local fixed-size buffer). Signed-off-by: Miguel García Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq2415x_charger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/power/supply/bq2415x_charger.c b/drivers/power/supply/bq2415x_charger.c index 917c26ee56bc9f..843957548fb9bc 100644 --- a/drivers/power/supply/bq2415x_charger.c +++ b/drivers/power/supply/bq2415x_charger.c @@ -1516,7 +1516,7 @@ static int bq2415x_power_supply_init(struct bq2415x_device *bq) ret = bq2415x_detect_revision(bq); if (ret < 0) - strcpy(revstr, "unknown"); + strscpy(revstr, "unknown", sizeof(revstr)); else sprintf(revstr, "1.%d", ret); From 32f350d58544e2529dc8798275684e97f0a2df6f Mon Sep 17 00:00:00 2001 From: Waqar Hameed Date: Tue, 5 Aug 2025 11:33:35 +0200 Subject: [PATCH 0987/3206] power: supply: Remove error prints for devm_add_action_or_reset() When `devm_add_action_or_reset()` fails, it is due to a failed memory allocation and will thus return `-ENOMEM`. `dev_err_probe()` doesn't do anything when error is `-ENOMEM`. Therefore, remove the useless call to `dev_err_probe()` when `devm_add_action_or_reset()` fails, and just return the value instead. 
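As noted above, dev_err_probe() prints nothing for -ENOMEM (the allocator
has already complained loudly), so the pattern shrinks from

    ret = devm_add_action_or_reset(dev, destroy_lock, &lock);
    if (ret)
            return dev_err_probe(dev, ret, "Failed to init lock\n");

to

    ret = devm_add_action_or_reset(dev, destroy_lock, &lock);
    if (ret)
            return ret;

where destroy_lock and lock stand in for the driver-specific cleanup
callback and its data.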
Signed-off-by: Waqar Hameed Signed-off-by: Sebastian Reichel --- drivers/power/supply/mt6370-charger.c | 4 ++-- drivers/power/supply/rt9467-charger.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/power/supply/mt6370-charger.c b/drivers/power/supply/mt6370-charger.c index 98579998b300db..29510af4e59533 100644 --- a/drivers/power/supply/mt6370-charger.c +++ b/drivers/power/supply/mt6370-charger.c @@ -898,7 +898,7 @@ static int mt6370_chg_probe(struct platform_device *pdev) ret = devm_add_action_or_reset(dev, mt6370_chg_destroy_attach_lock, &priv->attach_lock); if (ret) - return dev_err_probe(dev, ret, "Failed to init attach lock\n"); + return ret; priv->attach = MT6370_ATTACH_STAT_DETACH; @@ -909,7 +909,7 @@ static int mt6370_chg_probe(struct platform_device *pdev) ret = devm_add_action_or_reset(dev, mt6370_chg_destroy_wq, priv->wq); if (ret) - return dev_err_probe(dev, ret, "Failed to init wq\n"); + return ret; ret = devm_work_autocancel(dev, &priv->bc12_work, mt6370_chg_bc12_work_func); if (ret) diff --git a/drivers/power/supply/rt9467-charger.c b/drivers/power/supply/rt9467-charger.c index e9aba9ad393c9c..e2ff9c4609ef1b 100644 --- a/drivers/power/supply/rt9467-charger.c +++ b/drivers/power/supply/rt9467-charger.c @@ -1218,25 +1218,25 @@ static int rt9467_charger_probe(struct i2c_client *i2c) ret = devm_add_action_or_reset(dev, rt9467_chg_destroy_adc_lock, &data->adc_lock); if (ret) - return dev_err_probe(dev, ret, "Failed to init ADC lock\n"); + return ret; mutex_init(&data->attach_lock); ret = devm_add_action_or_reset(dev, rt9467_chg_destroy_attach_lock, &data->attach_lock); if (ret) - return dev_err_probe(dev, ret, "Failed to init attach lock\n"); + return ret; mutex_init(&data->ichg_ieoc_lock); ret = devm_add_action_or_reset(dev, rt9467_chg_destroy_ichg_ieoc_lock, &data->ichg_ieoc_lock); if (ret) - return dev_err_probe(dev, ret, "Failed to init ICHG/IEOC lock\n"); + return ret; init_completion(&data->aicl_done); ret = devm_add_action_or_reset(dev, rt9467_chg_complete_aicl_done, &data->aicl_done); if (ret) - return dev_err_probe(dev, ret, "Failed to init AICL done completion\n"); + return ret; ret = rt9467_do_charger_init(data); if (ret) From cb03556acf83b235dfb2e9f86e14f5e5b8a5f1e7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 7 Aug 2025 13:13:49 +0100 Subject: [PATCH 0988/3206] power: supply: 88pm860x: make fsm_state array static const, simplify usage Don't populate the read-only array fsm_state on the stack at run time, instead make it static const, this reduces the object code size as the data is placed on the data segment and this removes the need to have code to set the array up on each call. Note that making the size of the strings to a more optimal 11 bytes long does not seem to reduce the overall size. Making the array an array of pointers to the strings increases the code size due to the dereferencing overhead. Simplify the array access with &fsm_state[info->state][0] with the simpler expression fsm_state[info->state] to clean up the code. Original: text data bss dec hex filename 22884 8272 64 31220 79f4 drivers/power/supply/88pm860x_charger.o Patched: text data bss dec hex filename 22695 8368 64 31127 7997 drivers/power/supply/88pm860x_charger.o Difference: text data bss dec -189 +96 0 -93 Reduction of 93 bytes total. 
gcc version 14.2.0 (x86-64) Signed-off-by: Colin Ian King Signed-off-by: Sebastian Reichel --- drivers/power/supply/88pm860x_charger.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/power/supply/88pm860x_charger.c b/drivers/power/supply/88pm860x_charger.c index 2b9fcb7e71d79e..8d99c6ff72edf6 100644 --- a/drivers/power/supply/88pm860x_charger.c +++ b/drivers/power/supply/88pm860x_charger.c @@ -284,8 +284,8 @@ static int set_charging_fsm(struct pm860x_charger_info *info) { struct power_supply *psy; union power_supply_propval data; - unsigned char fsm_state[][16] = { "init", "discharge", "precharge", - "fastcharge", + static const unsigned char fsm_state[][16] = { + "init", "discharge", "precharge", "fastcharge", }; int ret; int vbatt; @@ -313,7 +313,7 @@ static int set_charging_fsm(struct pm860x_charger_info *info) dev_dbg(info->dev, "Entering FSM:%s, Charger:%s, Battery:%s, " "Allowed:%d\n", - &fsm_state[info->state][0], + fsm_state[info->state], (info->online) ? "online" : "N/A", (info->present) ? "present" : "N/A", info->allowed); dev_dbg(info->dev, "set_charging_fsm:vbatt:%d(mV)\n", vbatt); @@ -385,7 +385,7 @@ static int set_charging_fsm(struct pm860x_charger_info *info) } dev_dbg(info->dev, "Out FSM:%s, Charger:%s, Battery:%s, Allowed:%d\n", - &fsm_state[info->state][0], + fsm_state[info->state], (info->online) ? "online" : "N/A", (info->present) ? "present" : "N/A", info->allowed); mutex_unlock(&info->lock); From fee0904441325d83e7578ca457ec65a9d3f21264 Mon Sep 17 00:00:00 2001 From: Christopher Ruehl Date: Mon, 11 Aug 2025 17:22:09 +0200 Subject: [PATCH 0989/3206] power: supply: qcom_battmgr: add OOI chemistry The ASUS S15 xElite model report the Li-ion battery with an OOI, hence this update the detection and return the appropriate type. Signed-off-by: Christopher Ruehl Reviewed-by: Dmitry Baryshkov Signed-off-by: Sebastian Reichel --- drivers/power/supply/qcom_battmgr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index 99808ea9851f6a..fdb2d1b883fc58 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -982,7 +982,8 @@ static void qcom_battmgr_sc8280xp_strcpy(char *dest, const char *src) static unsigned int qcom_battmgr_sc8280xp_parse_technology(const char *chemistry) { - if (!strncmp(chemistry, "LIO", BATTMGR_CHEMISTRY_LEN)) + if ((!strncmp(chemistry, "LIO", BATTMGR_CHEMISTRY_LEN)) || + (!strncmp(chemistry, "OOI", BATTMGR_CHEMISTRY_LEN))) return POWER_SUPPLY_TECHNOLOGY_LION; if (!strncmp(chemistry, "LIP", BATTMGR_CHEMISTRY_LEN)) return POWER_SUPPLY_TECHNOLOGY_LIPO; From 15a84d15a677533afa55010f1e2052a27fcd854d Mon Sep 17 00:00:00 2001 From: ChiYuan Huang Date: Thu, 14 Aug 2025 15:27:41 +0800 Subject: [PATCH 0990/3206] power: supply: rt9467: Add properties for VBUS and IBUS reading Since there's the existing ADC function, add properties 'VOLTAGE_NOW' and 'CURRENT_NOW' to report the current VBUS and IBUS value, respectively. 
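The new properties simply route through the existing ADC helper; the
relevant get_property cases end up as:

    case POWER_SUPPLY_PROP_VOLTAGE_NOW:
            return rt9467_get_adc(data, RT9467_ADC_VBUS_DIV5, &val->intval);
    case POWER_SUPPLY_PROP_CURRENT_NOW:
            return rt9467_get_adc(data, RT9467_ADC_IBUS, &val->intval);

Userspace can then read the values through the standard power-supply sysfs
attributes voltage_now and current_now.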
Signed-off-by: ChiYuan Huang Signed-off-by: Sebastian Reichel --- drivers/power/supply/rt9467-charger.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/power/supply/rt9467-charger.c b/drivers/power/supply/rt9467-charger.c index e2ff9c4609ef1b..32e7c7620b91e8 100644 --- a/drivers/power/supply/rt9467-charger.c +++ b/drivers/power/supply/rt9467-charger.c @@ -633,7 +633,9 @@ static int rt9467_psy_set_ieoc(struct rt9467_chg_data *data, int microamp) static const enum power_supply_property rt9467_chg_properties[] = { POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_ONLINE, + POWER_SUPPLY_PROP_VOLTAGE_NOW, POWER_SUPPLY_PROP_CURRENT_MAX, + POWER_SUPPLY_PROP_CURRENT_NOW, POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT, POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX, POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE, @@ -656,6 +658,8 @@ static int rt9467_psy_get_property(struct power_supply *psy, return rt9467_psy_get_status(data, &val->intval); case POWER_SUPPLY_PROP_ONLINE: return regmap_field_read(data->rm_field[F_PWR_RDY], &val->intval); + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + return rt9467_get_adc(data, RT9467_ADC_VBUS_DIV5, &val->intval); case POWER_SUPPLY_PROP_CURRENT_MAX: mutex_lock(&data->attach_lock); if (data->psy_usb_type == POWER_SUPPLY_USB_TYPE_UNKNOWN || @@ -665,6 +669,8 @@ static int rt9467_psy_get_property(struct power_supply *psy, val->intval = 1500000; mutex_unlock(&data->attach_lock); return 0; + case POWER_SUPPLY_PROP_CURRENT_NOW: + return rt9467_get_adc(data, RT9467_ADC_IBUS, &val->intval); case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: mutex_lock(&data->ichg_ieoc_lock); val->intval = data->ichg_ua; From d48d4e4f141b38944da3b0a9c21ce6828ec31d83 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Thu, 14 Aug 2025 22:36:53 +0800 Subject: [PATCH 0991/3206] power: supply: use max() to improve code Use max() to reduce the code in cw_battery_get_property() and improve its readability. Signed-off-by: Qianfeng Rong Signed-off-by: Sebastian Reichel --- drivers/power/supply/cw2015_battery.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/power/supply/cw2015_battery.c b/drivers/power/supply/cw2015_battery.c index f63c3c41045155..afc607fee5c96f 100644 --- a/drivers/power/supply/cw2015_battery.c +++ b/drivers/power/supply/cw2015_battery.c @@ -506,10 +506,7 @@ static int cw_battery_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_CHARGE_FULL: case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN: - if (cw_bat->battery->charge_full_design_uah > 0) - val->intval = cw_bat->battery->charge_full_design_uah; - else - val->intval = 0; + val->intval = max(cw_bat->battery->charge_full_design_uah, 0); break; case POWER_SUPPLY_PROP_CHARGE_NOW: From ee289d3abef9f6fd4b6b4c3af1fb6e566ba94ebb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:12 -0700 Subject: [PATCH 0992/3206] crypto: hisilicon/hpre - Remove unused curve25519 kpp support Curve25519 is used only via the library API, not the crypto_kpp API. In preparation for removing the unused crypto_kpp API for Curve25519, remove the unused "hpre-curve25519" kpp algorithm. 
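For context, the only way an in-kernel consumer could have reached this
driver code is through the kpp interface, roughly:

    struct crypto_kpp *tfm;

    tfm = crypto_alloc_kpp("curve25519", 0, 0);
    if (IS_ERR(tfm))
            return PTR_ERR(tfm);
    /* crypto_kpp_set_secret(), kpp_request setup, ... */

No such consumer exists; all users call the Curve25519 library functions
directly, and the library path never dispatches to offload drivers such as
hpre.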
Cc: Longfang Liu Cc: Zhiqi Song Link: https://lore.kernel.org/r/20250906213523.84915-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- drivers/crypto/hisilicon/Kconfig | 1 - drivers/crypto/hisilicon/hpre/hpre_crypto.c | 403 +------------------- 2 files changed, 4 insertions(+), 400 deletions(-) diff --git a/drivers/crypto/hisilicon/Kconfig b/drivers/crypto/hisilicon/Kconfig index 4137a8bf131f0c..4835bdebdbb381 100644 --- a/drivers/crypto/hisilicon/Kconfig +++ b/drivers/crypto/hisilicon/Kconfig @@ -69,7 +69,6 @@ config CRYPTO_DEV_HISI_HPRE select CRYPTO_DEV_HISI_QM select CRYPTO_DH select CRYPTO_RSA - select CRYPTO_CURVE25519 select CRYPTO_ECDH help Support for HiSilicon HPRE(High Performance RSA Engine) diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c index 1550c3818383ab..21ccf879f70c55 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c +++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 HiSilicon Limited. */ #include -#include #include #include #include @@ -106,16 +105,6 @@ struct hpre_ecdh_ctx { dma_addr_t dma_g; }; -struct hpre_curve25519_ctx { - /* low address: p->a->k */ - unsigned char *p; - dma_addr_t dma_p; - - /* gx coordinate */ - unsigned char *g; - dma_addr_t dma_g; -}; - struct hpre_ctx { struct hisi_qp *qp; struct device *dev; @@ -129,7 +118,6 @@ struct hpre_ctx { struct hpre_rsa_ctx rsa; struct hpre_dh_ctx dh; struct hpre_ecdh_ctx ecdh; - struct hpre_curve25519_ctx curve25519; }; /* for ecc algorithms */ unsigned int curve_id; @@ -146,7 +134,6 @@ struct hpre_asym_request { struct akcipher_request *rsa; struct kpp_request *dh; struct kpp_request *ecdh; - struct kpp_request *curve25519; } areq; int err; int req_id; @@ -1214,8 +1201,7 @@ static void hpre_key_to_big_end(u8 *data, int len) } } -static void hpre_ecc_clear_ctx(struct hpre_ctx *ctx, bool is_clear_all, - bool is_ecdh) +static void hpre_ecc_clear_ctx(struct hpre_ctx *ctx, bool is_clear_all) { struct device *dev = ctx->dev; unsigned int sz = ctx->key_sz; @@ -1224,17 +1210,11 @@ static void hpre_ecc_clear_ctx(struct hpre_ctx *ctx, bool is_clear_all, if (is_clear_all) hisi_qm_stop_qp(ctx->qp); - if (is_ecdh && ctx->ecdh.p) { + if (ctx->ecdh.p) { /* ecdh: p->a->k->b */ memzero_explicit(ctx->ecdh.p + shift, sz); dma_free_coherent(dev, sz << 3, ctx->ecdh.p, ctx->ecdh.dma_p); ctx->ecdh.p = NULL; - } else if (!is_ecdh && ctx->curve25519.p) { - /* curve25519: p->a->k */ - memzero_explicit(ctx->curve25519.p + shift, sz); - dma_free_coherent(dev, sz << 2, ctx->curve25519.p, - ctx->curve25519.dma_p); - ctx->curve25519.p = NULL; } hpre_ctx_clear(ctx, is_clear_all); @@ -1432,7 +1412,7 @@ static int hpre_ecdh_set_secret(struct crypto_kpp *tfm, const void *buf, return -EINVAL; } - hpre_ecc_clear_ctx(ctx, false, true); + hpre_ecc_clear_ctx(ctx, false); ret = hpre_ecdh_set_param(ctx, ¶ms); if (ret < 0) { @@ -1683,337 +1663,7 @@ static void hpre_ecdh_exit_tfm(struct crypto_kpp *tfm) { struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - hpre_ecc_clear_ctx(ctx, true, true); -} - -static void hpre_curve25519_fill_curve(struct hpre_ctx *ctx, const void *buf, - unsigned int len) -{ - u8 secret[CURVE25519_KEY_SIZE] = { 0 }; - unsigned int sz = ctx->key_sz; - const struct ecc_curve *curve; - unsigned int shift = sz << 1; - void *p; - - /* - * The key from 'buf' is in little-endian, we should preprocess it as - * the description in rfc7748: "k[0] &= 248, k[31] &= 127, k[31] |= 64", - * then convert it to big endian. 
Only in this way, the result can be - * the same as the software curve-25519 that exists in crypto. - */ - memcpy(secret, buf, len); - curve25519_clamp_secret(secret); - hpre_key_to_big_end(secret, CURVE25519_KEY_SIZE); - - p = ctx->curve25519.p + sz - len; - - curve = ecc_get_curve25519(); - - /* fill curve parameters */ - fill_curve_param(p, curve->p, len, curve->g.ndigits); - fill_curve_param(p + sz, curve->a, len, curve->g.ndigits); - memcpy(p + shift, secret, len); - fill_curve_param(p + shift + sz, curve->g.x, len, curve->g.ndigits); - memzero_explicit(secret, CURVE25519_KEY_SIZE); -} - -static int hpre_curve25519_set_param(struct hpre_ctx *ctx, const void *buf, - unsigned int len) -{ - struct device *dev = ctx->dev; - unsigned int sz = ctx->key_sz; - unsigned int shift = sz << 1; - - /* p->a->k->gx */ - if (!ctx->curve25519.p) { - ctx->curve25519.p = dma_alloc_coherent(dev, sz << 2, - &ctx->curve25519.dma_p, - GFP_KERNEL); - if (!ctx->curve25519.p) - return -ENOMEM; - } - - ctx->curve25519.g = ctx->curve25519.p + shift + sz; - ctx->curve25519.dma_g = ctx->curve25519.dma_p + shift + sz; - - hpre_curve25519_fill_curve(ctx, buf, len); - - return 0; -} - -static int hpre_curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, - unsigned int len) -{ - struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - struct device *dev = ctx->dev; - int ret = -EINVAL; - - if (len != CURVE25519_KEY_SIZE || - !crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) { - dev_err(dev, "key is null or key len is not 32bytes!\n"); - return ret; - } - - /* Free old secret if any */ - hpre_ecc_clear_ctx(ctx, false, false); - - ctx->key_sz = CURVE25519_KEY_SIZE; - ret = hpre_curve25519_set_param(ctx, buf, CURVE25519_KEY_SIZE); - if (ret) { - dev_err(dev, "failed to set curve25519 param, ret = %d!\n", ret); - hpre_ecc_clear_ctx(ctx, false, false); - return ret; - } - - return 0; -} - -static void hpre_curve25519_hw_data_clr_all(struct hpre_ctx *ctx, - struct hpre_asym_request *req, - struct scatterlist *dst, - struct scatterlist *src) -{ - struct device *dev = ctx->dev; - struct hpre_sqe *sqe = &req->req; - dma_addr_t dma; - - dma = le64_to_cpu(sqe->in); - if (unlikely(dma_mapping_error(dev, dma))) - return; - - if (src && req->src) - dma_free_coherent(dev, ctx->key_sz, req->src, dma); - - dma = le64_to_cpu(sqe->out); - if (unlikely(dma_mapping_error(dev, dma))) - return; - - if (req->dst) - dma_free_coherent(dev, ctx->key_sz, req->dst, dma); - if (dst) - dma_unmap_single(dev, dma, ctx->key_sz, DMA_FROM_DEVICE); -} - -static void hpre_curve25519_cb(struct hpre_ctx *ctx, void *resp) -{ - struct hpre_dfx *dfx = ctx->hpre->debug.dfx; - struct hpre_asym_request *req = NULL; - struct kpp_request *areq; - u64 overtime_thrhld; - int ret; - - ret = hpre_alg_res_post_hf(ctx, resp, (void **)&req); - areq = req->areq.curve25519; - areq->dst_len = ctx->key_sz; - - overtime_thrhld = atomic64_read(&dfx[HPRE_OVERTIME_THRHLD].value); - if (overtime_thrhld && hpre_is_bd_timeout(req, overtime_thrhld)) - atomic64_inc(&dfx[HPRE_OVER_THRHLD_CNT].value); - - /* Do unmap before data processing */ - hpre_curve25519_hw_data_clr_all(ctx, req, areq->dst, areq->src); - - hpre_key_to_big_end(sg_virt(areq->dst), CURVE25519_KEY_SIZE); - - kpp_request_complete(areq, ret); - - atomic64_inc(&dfx[HPRE_RECV_CNT].value); -} - -static int hpre_curve25519_msg_request_set(struct hpre_ctx *ctx, - struct kpp_request *req) -{ - struct hpre_asym_request *h_req; - struct hpre_sqe *msg; - int req_id; - void *tmp; - - if (unlikely(req->dst_len < 
ctx->key_sz)) { - req->dst_len = ctx->key_sz; - return -EINVAL; - } - - tmp = kpp_request_ctx(req); - h_req = PTR_ALIGN(tmp, hpre_align_sz()); - h_req->cb = hpre_curve25519_cb; - h_req->areq.curve25519 = req; - msg = &h_req->req; - memset(msg, 0, sizeof(*msg)); - msg->in = cpu_to_le64(DMA_MAPPING_ERROR); - msg->out = cpu_to_le64(DMA_MAPPING_ERROR); - msg->key = cpu_to_le64(ctx->curve25519.dma_p); - - msg->dw0 |= cpu_to_le32(0x1U << HPRE_SQE_DONE_SHIFT); - msg->task_len1 = (ctx->key_sz >> HPRE_BITS_2_BYTES_SHIFT) - 1; - h_req->ctx = ctx; - - req_id = hpre_add_req_to_ctx(h_req); - if (req_id < 0) - return -EBUSY; - - msg->tag = cpu_to_le16((u16)req_id); - return 0; -} - -static void hpre_curve25519_src_modulo_p(u8 *ptr) -{ - int i; - - for (i = 0; i < CURVE25519_KEY_SIZE - 1; i++) - ptr[i] = 0; - - /* The modulus is ptr's last byte minus '0xed'(last byte of p) */ - ptr[i] -= 0xed; -} - -static int hpre_curve25519_src_init(struct hpre_asym_request *hpre_req, - struct scatterlist *data, unsigned int len) -{ - struct hpre_sqe *msg = &hpre_req->req; - struct hpre_ctx *ctx = hpre_req->ctx; - struct device *dev = ctx->dev; - u8 p[CURVE25519_KEY_SIZE] = { 0 }; - const struct ecc_curve *curve; - dma_addr_t dma = 0; - u8 *ptr; - - if (len != CURVE25519_KEY_SIZE) { - dev_err(dev, "sourc_data len is not 32bytes, len = %u!\n", len); - return -EINVAL; - } - - ptr = dma_alloc_coherent(dev, ctx->key_sz, &dma, GFP_KERNEL); - if (unlikely(!ptr)) - return -ENOMEM; - - scatterwalk_map_and_copy(ptr, data, 0, len, 0); - - if (!crypto_memneq(ptr, curve25519_null_point, CURVE25519_KEY_SIZE)) { - dev_err(dev, "gx is null!\n"); - goto err; - } - - /* - * Src_data(gx) is in little-endian order, MSB in the final byte should - * be masked as described in RFC7748, then transform it to big-endian - * form, then hisi_hpre can use the data. - */ - ptr[31] &= 0x7f; - hpre_key_to_big_end(ptr, CURVE25519_KEY_SIZE); - - curve = ecc_get_curve25519(); - - fill_curve_param(p, curve->p, CURVE25519_KEY_SIZE, curve->g.ndigits); - - /* - * When src_data equals (2^255 - 19) ~ (2^255 - 1), it is out of p, - * we get its modulus to p, and then use it. 
- */ - if (memcmp(ptr, p, ctx->key_sz) == 0) { - dev_err(dev, "gx is p!\n"); - goto err; - } else if (memcmp(ptr, p, ctx->key_sz) > 0) { - hpre_curve25519_src_modulo_p(ptr); - } - - hpre_req->src = ptr; - msg->in = cpu_to_le64(dma); - return 0; - -err: - dma_free_coherent(dev, ctx->key_sz, ptr, dma); - return -EINVAL; -} - -static int hpre_curve25519_dst_init(struct hpre_asym_request *hpre_req, - struct scatterlist *data, unsigned int len) -{ - struct hpre_sqe *msg = &hpre_req->req; - struct hpre_ctx *ctx = hpre_req->ctx; - struct device *dev = ctx->dev; - dma_addr_t dma; - - if (!data || !sg_is_last(data) || len != ctx->key_sz) { - dev_err(dev, "data or data length is illegal!\n"); - return -EINVAL; - } - - hpre_req->dst = NULL; - dma = dma_map_single(dev, sg_virt(data), len, DMA_FROM_DEVICE); - if (unlikely(dma_mapping_error(dev, dma))) { - dev_err(dev, "dma map data err!\n"); - return -ENOMEM; - } - - msg->out = cpu_to_le64(dma); - return 0; -} - -static int hpre_curve25519_compute_value(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - struct device *dev = ctx->dev; - void *tmp = kpp_request_ctx(req); - struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, hpre_align_sz()); - struct hpre_sqe *msg = &hpre_req->req; - int ret; - - ret = hpre_curve25519_msg_request_set(ctx, req); - if (unlikely(ret)) { - dev_err(dev, "failed to set curve25519 request, ret = %d!\n", ret); - return ret; - } - - if (req->src) { - ret = hpre_curve25519_src_init(hpre_req, req->src, req->src_len); - if (unlikely(ret)) { - dev_err(dev, "failed to init src data, ret = %d!\n", - ret); - goto clear_all; - } - } else { - msg->in = cpu_to_le64(ctx->curve25519.dma_g); - } - - ret = hpre_curve25519_dst_init(hpre_req, req->dst, req->dst_len); - if (unlikely(ret)) { - dev_err(dev, "failed to init dst data, ret = %d!\n", ret); - goto clear_all; - } - - msg->dw0 = cpu_to_le32(le32_to_cpu(msg->dw0) | HPRE_ALG_CURVE25519_MUL); - ret = hpre_send(ctx, msg); - if (likely(!ret)) - return -EINPROGRESS; - -clear_all: - hpre_rm_req_from_ctx(hpre_req); - hpre_curve25519_hw_data_clr_all(ctx, hpre_req, req->dst, req->src); - return ret; -} - -static unsigned int hpre_curve25519_max_size(struct crypto_kpp *tfm) -{ - struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - - return ctx->key_sz; -} - -static int hpre_curve25519_init_tfm(struct crypto_kpp *tfm) -{ - struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - - kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + hpre_align_pd()); - - return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE); -} - -static void hpre_curve25519_exit_tfm(struct crypto_kpp *tfm) -{ - struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - - hpre_ecc_clear_ctx(ctx, true, false); + hpre_ecc_clear_ctx(ctx, true); } static struct akcipher_alg rsa = { @@ -2095,22 +1745,6 @@ static struct kpp_alg ecdh_curves[] = { } }; -static struct kpp_alg curve25519_alg = { - .set_secret = hpre_curve25519_set_secret, - .generate_public_key = hpre_curve25519_compute_value, - .compute_shared_secret = hpre_curve25519_compute_value, - .max_size = hpre_curve25519_max_size, - .init = hpre_curve25519_init_tfm, - .exit = hpre_curve25519_exit_tfm, - .base = { - .cra_ctxsize = sizeof(struct hpre_ctx), - .cra_priority = HPRE_CRYPTO_ALG_PRI, - .cra_name = "curve25519", - .cra_driver_name = "hpre-curve25519", - .cra_module = THIS_MODULE, - }, -}; - static int hpre_register_rsa(struct hisi_qm *qm) { int ret; @@ -2192,28 +1826,6 @@ static void hpre_unregister_ecdh(struct hisi_qm *qm) 
crypto_unregister_kpp(&ecdh_curves[i]); } -static int hpre_register_x25519(struct hisi_qm *qm) -{ - int ret; - - if (!hpre_check_alg_support(qm, HPRE_DRV_X25519_MASK_CAP)) - return 0; - - ret = crypto_register_kpp(&curve25519_alg); - if (ret) - dev_err(&qm->pdev->dev, "failed to register x25519 (%d)!\n", ret); - - return ret; -} - -static void hpre_unregister_x25519(struct hisi_qm *qm) -{ - if (!hpre_check_alg_support(qm, HPRE_DRV_X25519_MASK_CAP)) - return; - - crypto_unregister_kpp(&curve25519_alg); -} - int hpre_algs_register(struct hisi_qm *qm) { int ret = 0; @@ -2236,17 +1848,11 @@ int hpre_algs_register(struct hisi_qm *qm) if (ret) goto unreg_dh; - ret = hpre_register_x25519(qm); - if (ret) - goto unreg_ecdh; - hpre_available_devs++; mutex_unlock(&hpre_algs_lock); return ret; -unreg_ecdh: - hpre_unregister_ecdh(qm); unreg_dh: hpre_unregister_dh(qm); unreg_rsa: @@ -2262,7 +1868,6 @@ void hpre_algs_unregister(struct hisi_qm *qm) if (--hpre_available_devs) goto unlock; - hpre_unregister_x25519(qm); hpre_unregister_ecdh(qm); hpre_unregister_dh(qm); hpre_unregister_rsa(qm); From 11efae10263b88eab7eb81b9c942237109c4a6c8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:13 -0700 Subject: [PATCH 0993/3206] crypto: arm/curve25519 - Remove unused kpp support Curve25519 is used only via the library API, not the crypto_kpp API. In preparation for removing the unused crypto_kpp API for Curve25519, remove the unused "curve25519-neon" kpp algorithm. Note that the underlying NEON optimized Curve25519 code remains fully supported and accessible via the library API. It's also worth noting that even if the kpp support for Curve25519 comes back later, there is no need for arch-specific kpp glue code like this, as a single kpp algorithm that wraps the library API is sufficient. 
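What survives in the glue is the library entry point, which keeps using the
have_neon static key that module init still enables. Roughly (matching the
existing glue code, not anything new):

    void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
                         const u8 scalar[CURVE25519_KEY_SIZE],
                         const u8 point[CURVE25519_KEY_SIZE])
    {
            if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
                    kernel_neon_begin();
                    curve25519_neon(out, scalar, point);
                    kernel_neon_end();
            } else {
                    curve25519_generic(out, scalar, point);
            }
    }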
Link: https://lore.kernel.org/r/20250906213523.84915-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/crypto/Kconfig | 1 - arch/arm/crypto/curve25519-glue.c | 77 +------------------------------ 2 files changed, 1 insertion(+), 77 deletions(-) diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 1e5f3cdf691c4f..97718d86f6007e 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -5,7 +5,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (arm)" config CRYPTO_CURVE25519_NEON tristate depends on KERNEL_MODE_NEON - select CRYPTO_KPP select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 default CRYPTO_LIB_CURVE25519_INTERNAL diff --git a/arch/arm/crypto/curve25519-glue.c b/arch/arm/crypto/curve25519-glue.c index e7b87e09dd99f4..3076020d8fbeb6 100644 --- a/arch/arm/crypto/curve25519-glue.c +++ b/arch/arm/crypto/curve25519-glue.c @@ -10,13 +10,11 @@ #include #include #include -#include #include #include #include #include #include -#include #include asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE], @@ -46,92 +44,19 @@ void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], } EXPORT_SYMBOL(curve25519_base_arch); -static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, - unsigned int len) -{ - u8 *secret = kpp_tfm_ctx(tfm); - - if (!len) - curve25519_generate_secret(secret); - else if (len == CURVE25519_KEY_SIZE && - crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) - memcpy(secret, buf, CURVE25519_KEY_SIZE); - else - return -EINVAL; - return 0; -} - -static int curve25519_compute_value(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 public_key[CURVE25519_KEY_SIZE]; - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - u8 const *bp; - - if (req->src) { - copied = sg_copy_to_buffer(req->src, - sg_nents_for_len(req->src, - CURVE25519_KEY_SIZE), - public_key, CURVE25519_KEY_SIZE); - if (copied != CURVE25519_KEY_SIZE) - return -EINVAL; - bp = public_key; - } else { - bp = curve25519_base_point; - } - - curve25519_arch(buf, secret, bp); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static unsigned int curve25519_max_size(struct crypto_kpp *tfm) -{ - return CURVE25519_KEY_SIZE; -} - -static struct kpp_alg curve25519_alg = { - .base.cra_name = "curve25519", - .base.cra_driver_name = "curve25519-neon", - .base.cra_priority = 200, - .base.cra_module = THIS_MODULE, - .base.cra_ctxsize = CURVE25519_KEY_SIZE, - - .set_secret = curve25519_set_secret, - .generate_public_key = curve25519_compute_value, - .compute_shared_secret = curve25519_compute_value, - .max_size = curve25519_max_size, -}; - static int __init arm_curve25519_init(void) { - if (elf_hwcap & HWCAP_NEON) { + if (elf_hwcap & HWCAP_NEON) static_branch_enable(&have_neon); - return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? 
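The library interface these removals standardize on is small. A minimal
sketch of a caller, assuming <crypto/curve25519.h> (example_x25519 is a
hypothetical illustration, not in-tree code):

    #include <crypto/curve25519.h>

    static int example_x25519(const u8 their_pub[CURVE25519_KEY_SIZE],
                              u8 shared[CURVE25519_KEY_SIZE])
    {
            u8 secret[CURVE25519_KEY_SIZE], pub[CURVE25519_KEY_SIZE];

            curve25519_generate_secret(secret);     /* random clamped scalar */
            if (!curve25519_generate_public(pub, secret))
                    return -EINVAL;
            if (!curve25519(shared, secret, their_pub))
                    return -EINVAL;                 /* all-zero shared secret */
            return 0;
    }

The boolean returns reject the all-zero output produced by low-order peer
points, which is why in-tree callers check them.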
- crypto_register_kpp(&curve25519_alg) : 0; - } return 0; } static void __exit arm_curve25519_exit(void) { - if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON) - crypto_unregister_kpp(&curve25519_alg); } module_init(arm_curve25519_init); module_exit(arm_curve25519_exit); -MODULE_ALIAS_CRYPTO("curve25519"); -MODULE_ALIAS_CRYPTO("curve25519-neon"); MODULE_DESCRIPTION("Public key crypto: Curve25519 (NEON-accelerated)"); MODULE_LICENSE("GPL v2"); From 8c44847e2e56a412040c0c70c44d57f6eafc67b3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:14 -0700 Subject: [PATCH 0994/3206] crypto: powerpc/curve25519 - Remove unused kpp support Curve25519 is used only via the library API, not the crypto_kpp API. In preparation for removing the unused crypto_kpp API for Curve25519, remove the unused "curve25519-ppc64le" kpp algorithm. Note that the underlying PowerPC optimized Curve25519 code remains fully supported and accessible via the library API. It's also worth noting that even if the kpp support for Curve25519 comes back later, there is no need for arch-specific kpp glue code like this, as a single kpp algorithm that wraps the library API is sufficient. Link: https://lore.kernel.org/r/20250906213523.84915-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/powerpc/crypto/Kconfig | 1 - arch/powerpc/crypto/curve25519-ppc64le-core.c | 105 ------------------ 2 files changed, 106 deletions(-) diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index f4b779c7352de9..6106a219da6afe 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -5,7 +5,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (powerpc)" config CRYPTO_CURVE25519_PPC64 tristate depends on PPC64 && CPU_LITTLE_ENDIAN - select CRYPTO_KPP select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 default CRYPTO_LIB_CURVE25519_INTERNAL diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c index f7810be0b292b7..6eb18ee19cad3b 100644 --- a/arch/powerpc/crypto/curve25519-ppc64le-core.c +++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c @@ -8,13 +8,11 @@ */ #include -#include #include #include #include #include -#include #include #include @@ -192,109 +190,6 @@ void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], } EXPORT_SYMBOL(curve25519_base_arch); -static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, - unsigned int len) -{ - u8 *secret = kpp_tfm_ctx(tfm); - - if (!len) - curve25519_generate_secret(secret); - else if (len == CURVE25519_KEY_SIZE && - crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) - memcpy(secret, buf, CURVE25519_KEY_SIZE); - else - return -EINVAL; - return 0; -} - -static int curve25519_generate_public_key(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - - if (req->src) - return -EINVAL; - - curve25519_base_arch(buf, secret); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static int curve25519_compute_shared_secret(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 public_key[CURVE25519_KEY_SIZE]; - u8 buf[CURVE25519_KEY_SIZE]; - int copied, 
nbytes; - - if (!req->src) - return -EINVAL; - - copied = sg_copy_to_buffer(req->src, - sg_nents_for_len(req->src, - CURVE25519_KEY_SIZE), - public_key, CURVE25519_KEY_SIZE); - if (copied != CURVE25519_KEY_SIZE) - return -EINVAL; - - curve25519_arch(buf, secret, public_key); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static unsigned int curve25519_max_size(struct crypto_kpp *tfm) -{ - return CURVE25519_KEY_SIZE; -} - -static struct kpp_alg curve25519_alg = { - .base.cra_name = "curve25519", - .base.cra_driver_name = "curve25519-ppc64le", - .base.cra_priority = 200, - .base.cra_module = THIS_MODULE, - .base.cra_ctxsize = CURVE25519_KEY_SIZE, - - .set_secret = curve25519_set_secret, - .generate_public_key = curve25519_generate_public_key, - .compute_shared_secret = curve25519_compute_shared_secret, - .max_size = curve25519_max_size, -}; - - -static int __init curve25519_mod_init(void) -{ - return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? - crypto_register_kpp(&curve25519_alg) : 0; -} - -static void __exit curve25519_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_KPP)) - crypto_unregister_kpp(&curve25519_alg); -} - -module_init(curve25519_mod_init); -module_exit(curve25519_mod_exit); - -MODULE_ALIAS_CRYPTO("curve25519"); -MODULE_ALIAS_CRYPTO("curve25519-ppc64le"); MODULE_DESCRIPTION("PPC64le Curve25519 scalar multiplication with 51 bits limbs"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Danny Tsen "); From de3ea8e1c55774a2ef116942984815257cbb01a3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:15 -0700 Subject: [PATCH 0995/3206] crypto: x86/curve25519 - Remove unused kpp support Curve25519 is used only via the library API, not the crypto_kpp API. In preparation for removing the unused crypto_kpp API for Curve25519, remove the unused "curve25519-x86" kpp algorithm. Note that the underlying x86_64 optimized Curve25519 code remains fully supported and accessible via the library API. It's also worth noting that even if the kpp support for Curve25519 comes back later, there is no need for arch-specific kpp glue code like this, as a single kpp algorithm that wraps the library API is sufficient. 
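To make the last point concrete: the library entry point itself already selects the arch implementation, so no per-arch kpp glue is required. A simplified sketch of the dispatch in <crypto/curve25519.h> follows; the config symbol and helper names are the ones referenced by the Kconfig and glue code in this series, though the real header may differ in detail:

/*
 * Simplified sketch of how the library entry point routes to the
 * arch-optimized code when one is available.
 */
static inline bool curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
			      const u8 secret[CURVE25519_KEY_SIZE],
			      const u8 basepoint[CURVE25519_KEY_SIZE])
{
	if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
		curve25519_arch(mypublic, secret, basepoint);
	else
		curve25519_generic(mypublic, secret, basepoint);

	/* Reject the all-zero point, per RFC 7748. */
	return crypto_memneq(mypublic, curve25519_null_point,
			     CURVE25519_KEY_SIZE);
}

A single kpp algorithm wrapping curve25519() would therefore pick up the x86 ADX code (and the other arch implementations) automatically.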
Link: https://lore.kernel.org/r/20250906213523.84915-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/x86/crypto/Kconfig | 1 - arch/x86/crypto/curve25519-x86_64.c | 98 +---------------------------- 2 files changed, 1 insertion(+), 98 deletions(-) diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 94016c60561e28..6a895a571b00eb 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -5,7 +5,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)" config CRYPTO_CURVE25519_X86 tristate depends on 64BIT - select CRYPTO_KPP select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 default CRYPTO_LIB_CURVE25519_INTERNAL diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index d587f05c3c8c36..ab91368284a473 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -5,14 +5,12 @@ */ #include -#include #include #include #include #include #include -#include #include #include @@ -1613,114 +1611,20 @@ void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], } EXPORT_SYMBOL(curve25519_base_arch); -static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, - unsigned int len) -{ - u8 *secret = kpp_tfm_ctx(tfm); - - if (!len) - curve25519_generate_secret(secret); - else if (len == CURVE25519_KEY_SIZE && - crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) - memcpy(secret, buf, CURVE25519_KEY_SIZE); - else - return -EINVAL; - return 0; -} - -static int curve25519_generate_public_key(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - - if (req->src) - return -EINVAL; - - curve25519_base_arch(buf, secret); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static int curve25519_compute_shared_secret(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 public_key[CURVE25519_KEY_SIZE]; - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - - if (!req->src) - return -EINVAL; - - copied = sg_copy_to_buffer(req->src, - sg_nents_for_len(req->src, - CURVE25519_KEY_SIZE), - public_key, CURVE25519_KEY_SIZE); - if (copied != CURVE25519_KEY_SIZE) - return -EINVAL; - - curve25519_arch(buf, secret, public_key); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static unsigned int curve25519_max_size(struct crypto_kpp *tfm) -{ - return CURVE25519_KEY_SIZE; -} - -static struct kpp_alg curve25519_alg = { - .base.cra_name = "curve25519", - .base.cra_driver_name = "curve25519-x86", - .base.cra_priority = 200, - .base.cra_module = THIS_MODULE, - .base.cra_ctxsize = CURVE25519_KEY_SIZE, - - .set_secret = curve25519_set_secret, - .generate_public_key = curve25519_generate_public_key, - .compute_shared_secret = curve25519_compute_shared_secret, - .max_size = curve25519_max_size, -}; - - static int __init curve25519_mod_init(void) { if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) static_branch_enable(&curve25519_use_bmi2_adx); - else - return 0; - return 
IS_REACHABLE(CONFIG_CRYPTO_KPP) ? - crypto_register_kpp(&curve25519_alg) : 0; + return 0; } static void __exit curve25519_mod_exit(void) { - if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && - static_branch_likely(&curve25519_use_bmi2_adx)) - crypto_unregister_kpp(&curve25519_alg); } module_init(curve25519_mod_init); module_exit(curve25519_mod_exit); -MODULE_ALIAS_CRYPTO("curve25519"); -MODULE_ALIAS_CRYPTO("curve25519-x86"); MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jason A. Donenfeld "); From 77611cd221477700cacd7061baad7e84ebeadd06 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:16 -0700 Subject: [PATCH 0996/3206] crypto: testmgr - Remove curve25519 kpp tests Curve25519 is used only via the library API, not the crypto_kpp API. In preparation for removing the unused crypto_kpp API for Curve25519, remove the tests for the "curve25519" kpp from crypto/testmgr.c. Note that these tests just duplicated lib/crypto/curve25519-selftest.c, which uses the same list of test vectors. So they didn't really provide any additional value. Link: https://lore.kernel.org/r/20250906213523.84915-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/testmgr.c | 6 - crypto/testmgr.h | 1225 ---------------------------------------------- 2 files changed, 1231 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 781445f5f56a61..9dca41e7ee7381 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4641,12 +4641,6 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .cipher = __VECS(sm4_cts_tv_template) } - }, { - .alg = "curve25519", - .test = alg_test_kpp, - .suite = { - .kpp = __VECS(curve25519_tv_template) - } }, { .alg = "deflate", .test = alg_test_comp, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 32d099ac9e7378..2682312272824b 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -3798,1231 +3798,6 @@ static const struct kpp_testvec ffdhe8192_dh_tv_template[] __maybe_unused = { }, }; -static const struct kpp_testvec curve25519_tv_template[] = { -{ - .secret = (u8[32]){ 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, - 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45, - 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, - 0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a }, - .b_public = (u8[32]){ 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4, - 0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37, - 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d, - 0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f }, - .expected_ss = (u8[32]){ 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, - 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, - 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, - 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, - 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6, - 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, - 0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb }, - .b_public = (u8[32]){ 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54, - 0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a, - 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4, - 0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a }, - .expected_ss = (u8[32]){ 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, - 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, - 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, - 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, - .secret_size = 32, - .b_public_size = 32, 
- .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 1 }, - .b_public = (u8[32]){ 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64, - 0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d, - 0x0b, 0x95, 0x48, 0xdc, 0x0c, 0xd8, 0x19, 0x98, - 0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 1 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f, - 0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d, - 0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x08, 0xed, 0xe3, - 0x0b, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, - 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, - 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, - 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 }, - .b_public = (u8[32]){ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, - 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, - 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, - 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, - .expected_ss = (u8[32]){ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, - 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, - 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, - 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0x0a, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x0a, 0x00, 0xfb, 0x9f }, - .expected_ss = (u8[32]){ 0x77, 0x52, 0xb6, 0x18, 0xc1, 0x2d, 0x48, 0xd2, - 0xc6, 0x93, 0x46, 0x83, 0x81, 0x7c, 0xc6, 0x57, - 0xf3, 0x31, 0x03, 0x19, 0x49, 0x48, 0x20, 0x05, - 0x42, 0x2b, 0x4e, 0xae, 0x8d, 0x1d, 0x43, 0x23 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 0x8e, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x06 }, - .expected_ss = (u8[32]){ 0x5a, 0xdf, 0xaa, 0x25, 0x86, 0x8e, 0x32, 0x3d, - 0xae, 0x49, 0x62, 0xc1, 0x01, 0x5c, 0xb3, 0x12, - 0xe1, 0xc5, 0xc7, 0x9e, 0x95, 0x3f, 0x03, 0x99, - 0xb0, 0xba, 0x16, 0x22, 0xf3, 0xb6, 0xf7, 0x0c }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - normal case */ -{ - .secret = (u8[32]){ 0x48, 0x52, 0x83, 0x4d, 0x9d, 0x6b, 0x77, 0xda, - 0xde, 0xab, 0xaa, 0xf2, 0xe1, 0x1d, 0xca, 
0x66, - 0xd1, 0x9f, 0xe7, 0x49, 0x93, 0xa7, 0xbe, 0xc3, - 0x6c, 0x6e, 0x16, 0xa0, 0x98, 0x3f, 0xea, 0xba }, - .b_public = (u8[32]){ 0x9c, 0x64, 0x7d, 0x9a, 0xe5, 0x89, 0xb9, 0xf5, - 0x8f, 0xdc, 0x3c, 0xa4, 0x94, 0x7e, 0xfb, 0xc9, - 0x15, 0xc4, 0xb2, 0xe0, 0x8e, 0x74, 0x4a, 0x0e, - 0xdf, 0x46, 0x9d, 0xac, 0x59, 0xc8, 0xf8, 0x5a }, - .expected_ss = (u8[32]){ 0x87, 0xb7, 0xf2, 0x12, 0xb6, 0x27, 0xf7, 0xa5, - 0x4c, 0xa5, 0xe0, 0xbc, 0xda, 0xdd, 0xd5, 0x38, - 0x9d, 0x9d, 0xe6, 0x15, 0x6c, 0xdb, 0xcf, 0x8e, - 0xbe, 0x14, 0xff, 0xbc, 0xfb, 0x43, 0x65, 0x51 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key on twist */ -{ - .secret = (u8[32]){ 0x58, 0x8c, 0x06, 0x1a, 0x50, 0x80, 0x4a, 0xc4, - 0x88, 0xad, 0x77, 0x4a, 0xc7, 0x16, 0xc3, 0xf5, - 0xba, 0x71, 0x4b, 0x27, 0x12, 0xe0, 0x48, 0x49, - 0x13, 0x79, 0xa5, 0x00, 0x21, 0x19, 0x98, 0xa8 }, - .b_public = (u8[32]){ 0x63, 0xaa, 0x40, 0xc6, 0xe3, 0x83, 0x46, 0xc5, - 0xca, 0xf2, 0x3a, 0x6d, 0xf0, 0xa5, 0xe6, 0xc8, - 0x08, 0x89, 0xa0, 0x86, 0x47, 0xe5, 0x51, 0xb3, - 0x56, 0x34, 0x49, 0xbe, 0xfc, 0xfc, 0x97, 0x33 }, - .expected_ss = (u8[32]){ 0xb1, 0xa7, 0x07, 0x51, 0x94, 0x95, 0xff, 0xff, - 0xb2, 0x98, 0xff, 0x94, 0x17, 0x16, 0xb0, 0x6d, - 0xfa, 0xb8, 0x7c, 0xf8, 0xd9, 0x11, 0x23, 0xfe, - 0x2b, 0xe9, 0xa2, 0x33, 0xdd, 0xa2, 0x22, 0x12 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key on twist */ -{ - .secret = (u8[32]){ 0xb0, 0x5b, 0xfd, 0x32, 0xe5, 0x53, 0x25, 0xd9, - 0xfd, 0x64, 0x8c, 0xb3, 0x02, 0x84, 0x80, 0x39, - 0x00, 0x0b, 0x39, 0x0e, 0x44, 0xd5, 0x21, 0xe5, - 0x8a, 0xab, 0x3b, 0x29, 0xa6, 0x96, 0x0b, 0xa8 }, - .b_public = (u8[32]){ 0x0f, 0x83, 0xc3, 0x6f, 0xde, 0xd9, 0xd3, 0x2f, - 0xad, 0xf4, 0xef, 0xa3, 0xae, 0x93, 0xa9, 0x0b, - 0xb5, 0xcf, 0xa6, 0x68, 0x93, 0xbc, 0x41, 0x2c, - 0x43, 0xfa, 0x72, 0x87, 0xdb, 0xb9, 0x97, 0x79 }, - .expected_ss = (u8[32]){ 0x67, 0xdd, 0x4a, 0x6e, 0x16, 0x55, 0x33, 0x53, - 0x4c, 0x0e, 0x3f, 0x17, 0x2e, 0x4a, 0xb8, 0x57, - 0x6b, 0xca, 0x92, 0x3a, 0x5f, 0x07, 0xb2, 0xc0, - 0x69, 0xb4, 0xc3, 0x10, 0xff, 0x2e, 0x93, 0x5b }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key on twist */ -{ - .secret = (u8[32]){ 0x70, 0xe3, 0x4b, 0xcb, 0xe1, 0xf4, 0x7f, 0xbc, - 0x0f, 0xdd, 0xfd, 0x7c, 0x1e, 0x1a, 0xa5, 0x3d, - 0x57, 0xbf, 0xe0, 0xf6, 0x6d, 0x24, 0x30, 0x67, - 0xb4, 0x24, 0xbb, 0x62, 0x10, 0xbe, 0xd1, 0x9c }, - .b_public = (u8[32]){ 0x0b, 0x82, 0x11, 0xa2, 0xb6, 0x04, 0x90, 0x97, - 0xf6, 0x87, 0x1c, 0x6c, 0x05, 0x2d, 0x3c, 0x5f, - 0xc1, 0xba, 0x17, 0xda, 0x9e, 0x32, 0xae, 0x45, - 0x84, 0x03, 0xb0, 0x5b, 0xb2, 0x83, 0x09, 0x2a }, - .expected_ss = (u8[32]){ 0x4a, 0x06, 0x38, 0xcf, 0xaa, 0x9e, 0xf1, 0x93, - 0x3b, 0x47, 0xf8, 0x93, 0x92, 0x96, 0xa6, 0xb2, - 0x5b, 0xe5, 0x41, 0xef, 0x7f, 0x70, 0xe8, 0x44, - 0xc0, 0xbc, 0xc0, 0x0b, 0x13, 0x4d, 0xe6, 0x4a }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key on twist */ -{ - .secret = (u8[32]){ 0x68, 0xc1, 0xf3, 0xa6, 0x53, 0xa4, 0xcd, 0xb1, - 0xd3, 0x7b, 0xba, 0x94, 0x73, 0x8f, 0x8b, 0x95, - 0x7a, 0x57, 0xbe, 0xb2, 0x4d, 0x64, 0x6e, 0x99, - 0x4d, 0xc2, 0x9a, 0x27, 0x6a, 0xad, 0x45, 0x8d }, - .b_public = (u8[32]){ 0x34, 0x3a, 0xc2, 0x0a, 0x3b, 0x9c, 0x6a, 0x27, - 0xb1, 0x00, 0x81, 0x76, 0x50, 0x9a, 0xd3, 0x07, - 0x35, 0x85, 0x6e, 0xc1, 0xc8, 0xd8, 0xfc, 0xae, - 0x13, 0x91, 0x2d, 0x08, 0xd1, 0x52, 0xf4, 0x6c }, - .expected_ss = (u8[32]){ 0x39, 0x94, 0x91, 0xfc, 
0xe8, 0xdf, 0xab, 0x73, - 0xb4, 0xf9, 0xf6, 0x11, 0xde, 0x8e, 0xa0, 0xb2, - 0x7b, 0x28, 0xf8, 0x59, 0x94, 0x25, 0x0b, 0x0f, - 0x47, 0x5d, 0x58, 0x5d, 0x04, 0x2a, 0xc2, 0x07 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key on twist */ -{ - .secret = (u8[32]){ 0xd8, 0x77, 0xb2, 0x6d, 0x06, 0xdf, 0xf9, 0xd9, - 0xf7, 0xfd, 0x4c, 0x5b, 0x37, 0x69, 0xf8, 0xcd, - 0xd5, 0xb3, 0x05, 0x16, 0xa5, 0xab, 0x80, 0x6b, - 0xe3, 0x24, 0xff, 0x3e, 0xb6, 0x9e, 0xa0, 0xb2 }, - .b_public = (u8[32]){ 0xfa, 0x69, 0x5f, 0xc7, 0xbe, 0x8d, 0x1b, 0xe5, - 0xbf, 0x70, 0x48, 0x98, 0xf3, 0x88, 0xc4, 0x52, - 0xba, 0xfd, 0xd3, 0xb8, 0xea, 0xe8, 0x05, 0xf8, - 0x68, 0x1a, 0x8d, 0x15, 0xc2, 0xd4, 0xe1, 0x42 }, - .expected_ss = (u8[32]){ 0x2c, 0x4f, 0xe1, 0x1d, 0x49, 0x0a, 0x53, 0x86, - 0x17, 0x76, 0xb1, 0x3b, 0x43, 0x54, 0xab, 0xd4, - 0xcf, 0x5a, 0x97, 0x69, 0x9d, 0xb6, 0xe6, 0xc6, - 0x8c, 0x16, 0x26, 0xd0, 0x76, 0x62, 0xf7, 0x58 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0x38, 0xdd, 0xe9, 0xf3, 0xe7, 0xb7, 0x99, 0x04, - 0x5f, 0x9a, 0xc3, 0x79, 0x3d, 0x4a, 0x92, 0x77, - 0xda, 0xde, 0xad, 0xc4, 0x1b, 0xec, 0x02, 0x90, - 0xf8, 0x1f, 0x74, 0x4f, 0x73, 0x77, 0x5f, 0x84 }, - .b_public = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x9a, 0x2c, 0xfe, 0x84, 0xff, 0x9c, 0x4a, 0x97, - 0x39, 0x62, 0x5c, 0xae, 0x4a, 0x3b, 0x82, 0xa9, - 0x06, 0x87, 0x7a, 0x44, 0x19, 0x46, 0xf8, 0xd7, - 0xb3, 0xd7, 0x95, 0xfe, 0x8f, 0x5d, 0x16, 0x39 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0x98, 0x57, 0xa9, 0x14, 0xe3, 0xc2, 0x90, 0x36, - 0xfd, 0x9a, 0x44, 0x2b, 0xa5, 0x26, 0xb5, 0xcd, - 0xcd, 0xf2, 0x82, 0x16, 0x15, 0x3e, 0x63, 0x6c, - 0x10, 0x67, 0x7a, 0xca, 0xb6, 0xbd, 0x6a, 0xa5 }, - .b_public = (u8[32]){ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x4d, 0xa4, 0xe0, 0xaa, 0x07, 0x2c, 0x23, 0x2e, - 0xe2, 0xf0, 0xfa, 0x4e, 0x51, 0x9a, 0xe5, 0x0b, - 0x52, 0xc1, 0xed, 0xd0, 0x8a, 0x53, 0x4d, 0x4e, - 0xf3, 0x46, 0xc2, 0xe1, 0x06, 0xd2, 0x1d, 0x60 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0x48, 0xe2, 0x13, 0x0d, 0x72, 0x33, 0x05, 0xed, - 0x05, 0xe6, 0xe5, 0x89, 0x4d, 0x39, 0x8a, 0x5e, - 0x33, 0x36, 0x7a, 0x8c, 0x6a, 0xac, 0x8f, 0xcd, - 0xf0, 0xa8, 0x8e, 0x4b, 0x42, 0x82, 0x0d, 0xb7 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0x03, 0x00, 0x00, 0xf8, 0xff, - 0xff, 0x1f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 0x00, - 0x00, 0xf0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x9e, 0xd1, 0x0c, 0x53, 0x74, 0x7f, 0x64, 0x7f, - 0x82, 0xf4, 0x51, 0x25, 0xd3, 0xde, 0x15, 0xa1, - 0xe6, 0xb8, 0x24, 0x49, 0x6a, 0xb4, 0x04, 0x10, - 0xff, 0xcc, 0x3c, 0xfe, 0x95, 0x76, 0x0f, 0x3b }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0x28, 0xf4, 0x10, 0x11, 0x69, 0x18, 0x51, 0xb3, - 0xa6, 0x2b, 0x64, 0x15, 
0x53, 0xb3, 0x0d, 0x0d, - 0xfd, 0xdc, 0xb8, 0xff, 0xfc, 0xf5, 0x37, 0x00, - 0xa7, 0xbe, 0x2f, 0x6a, 0x87, 0x2e, 0x9f, 0xb0 }, - .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0xfc, 0xff, 0xff, 0x07, 0x00, - 0x00, 0xe0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0xf8, 0xff, - 0xff, 0x0f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0xcf, 0x72, 0xb4, 0xaa, 0x6a, 0xa1, 0xc9, 0xf8, - 0x94, 0xf4, 0x16, 0x5b, 0x86, 0x10, 0x9a, 0xa4, - 0x68, 0x51, 0x76, 0x48, 0xe1, 0xf0, 0xcc, 0x70, - 0xe1, 0xab, 0x08, 0x46, 0x01, 0x76, 0x50, 0x6b }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0x18, 0xa9, 0x3b, 0x64, 0x99, 0xb9, 0xf6, 0xb3, - 0x22, 0x5c, 0xa0, 0x2f, 0xef, 0x41, 0x0e, 0x0a, - 0xde, 0xc2, 0x35, 0x32, 0x32, 0x1d, 0x2d, 0x8e, - 0xf1, 0xa6, 0xd6, 0x02, 0xa8, 0xc6, 0x5b, 0x83 }, - .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x5d, 0x50, 0xb6, 0x28, 0x36, 0xbb, 0x69, 0x57, - 0x94, 0x10, 0x38, 0x6c, 0xf7, 0xbb, 0x81, 0x1c, - 0x14, 0xbf, 0x85, 0xb1, 0xc7, 0xb1, 0x7e, 0x59, - 0x24, 0xc7, 0xff, 0xea, 0x91, 0xef, 0x9e, 0x12 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0xc0, 0x1d, 0x13, 0x05, 0xa1, 0x33, 0x8a, 0x1f, - 0xca, 0xc2, 0xba, 0x7e, 0x2e, 0x03, 0x2b, 0x42, - 0x7e, 0x0b, 0x04, 0x90, 0x31, 0x65, 0xac, 0xa9, - 0x57, 0xd8, 0xd0, 0x55, 0x3d, 0x87, 0x17, 0xb0 }, - .b_public = (u8[32]){ 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x19, 0x23, 0x0e, 0xb1, 0x48, 0xd5, 0xd6, 0x7c, - 0x3c, 0x22, 0xab, 0x1d, 0xae, 0xff, 0x80, 0xa5, - 0x7e, 0xae, 0x42, 0x65, 0xce, 0x28, 0x72, 0x65, - 0x7b, 0x2c, 0x80, 0x99, 0xfc, 0x69, 0x8e, 0x50 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0x38, 0x6f, 0x7f, 0x16, 0xc5, 0x07, 0x31, 0xd6, - 0x4f, 0x82, 0xe6, 0xa1, 0x70, 0xb1, 0x42, 0xa4, - 0xe3, 0x4f, 0x31, 0xfd, 0x77, 0x68, 0xfc, 0xb8, - 0x90, 0x29, 0x25, 0xe7, 0xd1, 0xe2, 0x1a, 0xbe }, - .b_public = (u8[32]){ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x0f, 0xca, 0xb5, 0xd8, 0x42, 0xa0, 0x78, 0xd7, - 0xa7, 0x1f, 0xc5, 0x9b, 0x57, 0xbf, 0xb4, 0xca, - 0x0b, 0xe6, 0x87, 0x3b, 0x49, 0xdc, 0xdb, 0x9f, - 0x44, 0xe1, 0x4a, 0xe8, 0xfb, 0xdf, 0xa5, 0x42 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0xe0, 0x23, 0xa2, 0x89, 0xbd, 0x5e, 0x90, 0xfa, - 0x28, 0x04, 0xdd, 0xc0, 0x19, 0xa0, 0x5e, 0xf3, - 0xe7, 0x9d, 0x43, 0x4b, 0xb6, 0xea, 0x2f, 0x52, - 0x2e, 0xcb, 0x64, 0x3a, 0x75, 0x29, 0x6e, 0x95 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 
0x54, 0xce, 0x8f, 0x22, 0x75, 0xc0, 0x77, 0xe3, - 0xb1, 0x30, 0x6a, 0x39, 0x39, 0xc5, 0xe0, 0x3e, - 0xef, 0x6b, 0xbb, 0x88, 0x06, 0x05, 0x44, 0x75, - 0x8d, 0x9f, 0xef, 0x59, 0xb0, 0xbc, 0x3e, 0x4f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0x68, 0xf0, 0x10, 0xd6, 0x2e, 0xe8, 0xd9, 0x26, - 0x05, 0x3a, 0x36, 0x1c, 0x3a, 0x75, 0xc6, 0xea, - 0x4e, 0xbd, 0xc8, 0x60, 0x6a, 0xb2, 0x85, 0x00, - 0x3a, 0x6f, 0x8f, 0x40, 0x76, 0xb0, 0x1e, 0x83 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 }, - .expected_ss = (u8[32]){ 0xf1, 0x36, 0x77, 0x5c, 0x5b, 0xeb, 0x0a, 0xf8, - 0x11, 0x0a, 0xf1, 0x0b, 0x20, 0x37, 0x23, 0x32, - 0x04, 0x3c, 0xab, 0x75, 0x24, 0x19, 0x67, 0x87, - 0x75, 0xa2, 0x23, 0xdf, 0x57, 0xc9, 0xd3, 0x0d }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0x58, 0xeb, 0xcb, 0x35, 0xb0, 0xf8, 0x84, 0x5c, - 0xaf, 0x1e, 0xc6, 0x30, 0xf9, 0x65, 0x76, 0xb6, - 0x2c, 0x4b, 0x7b, 0x6c, 0x36, 0xb2, 0x9d, 0xeb, - 0x2c, 0xb0, 0x08, 0x46, 0x51, 0x75, 0x5c, 0x96 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xfb, 0xff, - 0xff, 0xdf, 0xff, 0xff, 0xdf, 0xff, 0xff, 0xff, - 0xfe, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xf7, 0xff, - 0xff, 0xf7, 0xff, 0xff, 0xbf, 0xff, 0xff, 0x3f }, - .expected_ss = (u8[32]){ 0xbf, 0x9a, 0xff, 0xd0, 0x6b, 0x84, 0x40, 0x85, - 0x58, 0x64, 0x60, 0x96, 0x2e, 0xf2, 0x14, 0x6f, - 0xf3, 0xd4, 0x53, 0x3d, 0x94, 0x44, 0xaa, 0xb0, - 0x06, 0xeb, 0x88, 0xcc, 0x30, 0x54, 0x40, 0x7d }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0x18, 0x8c, 0x4b, 0xc5, 0xb9, 0xc4, 0x4b, 0x38, - 0xbb, 0x65, 0x8b, 0x9b, 0x2a, 0xe8, 0x2d, 0x5b, - 0x01, 0x01, 0x5e, 0x09, 0x31, 0x84, 0xb1, 0x7c, - 0xb7, 0x86, 0x35, 0x03, 0xa7, 0x83, 0xe1, 0xbb }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .expected_ss = (u8[32]){ 0xd4, 0x80, 0xde, 0x04, 0xf6, 0x99, 0xcb, 0x3b, - 0xe0, 0x68, 0x4a, 0x9c, 0xc2, 0xe3, 0x12, 0x81, - 0xea, 0x0b, 0xc5, 0xa9, 0xdc, 0xc1, 0x57, 0xd3, - 0xd2, 0x01, 0x58, 0xd4, 0x6c, 0xa5, 0x24, 0x6d }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0xe0, 0x6c, 0x11, 0xbb, 0x2e, 0x13, 0xce, 0x3d, - 0xc7, 0x67, 0x3f, 0x67, 0xf5, 0x48, 0x22, 0x42, - 0x90, 0x94, 0x23, 0xa9, 0xae, 0x95, 0xee, 0x98, - 0x6a, 0x98, 0x8d, 0x98, 0xfa, 0xee, 0x23, 0xa2 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, - 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, - 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, - 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x4c, 0x44, 0x01, 0xcc, 0xe6, 0xb5, 0x1e, 0x4c, - 0xb1, 0x8f, 0x27, 0x90, 0x24, 0x6c, 0x9b, 0xf9, - 0x14, 0xdb, 0x66, 0x77, 0x50, 0xa1, 0xcb, 0x89, - 0x06, 0x90, 0x92, 0xaf, 0x07, 0x29, 0x22, 0x76 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0xc0, 0x65, 0x8c, 
0x46, 0xdd, 0xe1, 0x81, 0x29, - 0x29, 0x38, 0x77, 0x53, 0x5b, 0x11, 0x62, 0xb6, - 0xf9, 0xf5, 0x41, 0x4a, 0x23, 0xcf, 0x4d, 0x2c, - 0xbc, 0x14, 0x0a, 0x4d, 0x99, 0xda, 0x2b, 0x8f }, - .b_public = (u8[32]){ 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x57, 0x8b, 0xa8, 0xcc, 0x2d, 0xbd, 0xc5, 0x75, - 0xaf, 0xcf, 0x9d, 0xf2, 0xb3, 0xee, 0x61, 0x89, - 0xf5, 0x33, 0x7d, 0x68, 0x54, 0xc7, 0x9b, 0x4c, - 0xe1, 0x65, 0xea, 0x12, 0x29, 0x3b, 0x3a, 0x0f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xf0, 0x1e, 0x48, 0xda, 0xfa, 0xc9, 0xd7, 0xbc, - 0xf5, 0x89, 0xcb, 0xc3, 0x82, 0xc8, 0x78, 0xd1, - 0x8b, 0xda, 0x35, 0x50, 0x58, 0x9f, 0xfb, 0x5d, - 0x50, 0xb5, 0x23, 0xbe, 0xbe, 0x32, 0x9d, 0xae }, - .b_public = (u8[32]){ 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0xbd, 0x36, 0xa0, 0x79, 0x0e, 0xb8, 0x83, 0x09, - 0x8c, 0x98, 0x8b, 0x21, 0x78, 0x67, 0x73, 0xde, - 0x0b, 0x3a, 0x4d, 0xf1, 0x62, 0x28, 0x2c, 0xf1, - 0x10, 0xde, 0x18, 0xdd, 0x48, 0x4c, 0xe7, 0x4b }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x28, 0x87, 0x96, 0xbc, 0x5a, 0xff, 0x4b, 0x81, - 0xa3, 0x75, 0x01, 0x75, 0x7b, 0xc0, 0x75, 0x3a, - 0x3c, 0x21, 0x96, 0x47, 0x90, 0xd3, 0x86, 0x99, - 0x30, 0x8d, 0xeb, 0xc1, 0x7a, 0x6e, 0xaf, 0x8d }, - .b_public = (u8[32]){ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0xb4, 0xe0, 0xdd, 0x76, 0xda, 0x7b, 0x07, 0x17, - 0x28, 0xb6, 0x1f, 0x85, 0x67, 0x71, 0xaa, 0x35, - 0x6e, 0x57, 0xed, 0xa7, 0x8a, 0x5b, 0x16, 0x55, - 0xcc, 0x38, 0x20, 0xfb, 0x5f, 0x85, 0x4c, 0x5c }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x98, 0xdf, 0x84, 0x5f, 0x66, 0x51, 0xbf, 0x11, - 0x38, 0x22, 0x1f, 0x11, 0x90, 0x41, 0xf7, 0x2b, - 0x6d, 0xbc, 0x3c, 0x4a, 0xce, 0x71, 0x43, 0xd9, - 0x9f, 0xd5, 0x5a, 0xd8, 0x67, 0x48, 0x0d, 0xa8 }, - .b_public = (u8[32]){ 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x6f, 0xdf, 0x6c, 0x37, 0x61, 0x1d, 0xbd, 0x53, - 0x04, 0xdc, 0x0f, 0x2e, 0xb7, 0xc9, 0x51, 0x7e, - 0xb3, 0xc5, 0x0e, 0x12, 0xfd, 0x05, 0x0a, 0xc6, - 0xde, 0xc2, 0x70, 0x71, 0xd4, 0xbf, 0xc0, 0x34 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xf0, 0x94, 0x98, 0xe4, 0x6f, 0x02, 0xf8, 0x78, - 0x82, 0x9e, 0x78, 0xb8, 0x03, 0xd3, 0x16, 0xa2, - 0xed, 0x69, 0x5d, 0x04, 0x98, 0xa0, 0x8a, 0xbd, - 0xf8, 0x27, 0x69, 0x30, 0xe2, 0x4e, 0xdc, 0xb0 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f 
}, - .expected_ss = (u8[32]){ 0x4c, 0x8f, 0xc4, 0xb1, 0xc6, 0xab, 0x88, 0xfb, - 0x21, 0xf1, 0x8f, 0x6d, 0x4c, 0x81, 0x02, 0x40, - 0xd4, 0xe9, 0x46, 0x51, 0xba, 0x44, 0xf7, 0xa2, - 0xc8, 0x63, 0xce, 0xc7, 0xdc, 0x56, 0x60, 0x2d }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x18, 0x13, 0xc1, 0x0a, 0x5c, 0x7f, 0x21, 0xf9, - 0x6e, 0x17, 0xf2, 0x88, 0xc0, 0xcc, 0x37, 0x60, - 0x7c, 0x04, 0xc5, 0xf5, 0xae, 0xa2, 0xdb, 0x13, - 0x4f, 0x9e, 0x2f, 0xfc, 0x66, 0xbd, 0x9d, 0xb8 }, - .b_public = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .expected_ss = (u8[32]){ 0x1c, 0xd0, 0xb2, 0x82, 0x67, 0xdc, 0x54, 0x1c, - 0x64, 0x2d, 0x6d, 0x7d, 0xca, 0x44, 0xa8, 0xb3, - 0x8a, 0x63, 0x73, 0x6e, 0xef, 0x5c, 0x4e, 0x65, - 0x01, 0xff, 0xbb, 0xb1, 0x78, 0x0c, 0x03, 0x3c }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x78, 0x57, 0xfb, 0x80, 0x86, 0x53, 0x64, 0x5a, - 0x0b, 0xeb, 0x13, 0x8a, 0x64, 0xf5, 0xf4, 0xd7, - 0x33, 0xa4, 0x5e, 0xa8, 0x4c, 0x3c, 0xda, 0x11, - 0xa9, 0xc0, 0x6f, 0x7e, 0x71, 0x39, 0x14, 0x9e }, - .b_public = (u8[32]){ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .expected_ss = (u8[32]){ 0x87, 0x55, 0xbe, 0x01, 0xc6, 0x0a, 0x7e, 0x82, - 0x5c, 0xff, 0x3e, 0x0e, 0x78, 0xcb, 0x3a, 0xa4, - 0x33, 0x38, 0x61, 0x51, 0x6a, 0xa5, 0x9b, 0x1c, - 0x51, 0xa8, 0xb2, 0xa5, 0x43, 0xdf, 0xa8, 0x22 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xe0, 0x3a, 0xa8, 0x42, 0xe2, 0xab, 0xc5, 0x6e, - 0x81, 0xe8, 0x7b, 0x8b, 0x9f, 0x41, 0x7b, 0x2a, - 0x1e, 0x59, 0x13, 0xc7, 0x23, 0xee, 0xd2, 0x8d, - 0x75, 0x2f, 0x8d, 0x47, 0xa5, 0x9f, 0x49, 0x8f }, - .b_public = (u8[32]){ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .expected_ss = (u8[32]){ 0x54, 0xc9, 0xa1, 0xed, 0x95, 0xe5, 0x46, 0xd2, - 0x78, 0x22, 0xa3, 0x60, 0x93, 0x1d, 0xda, 0x60, - 0xa1, 0xdf, 0x04, 0x9d, 0xa6, 0xf9, 0x04, 0x25, - 0x3c, 0x06, 0x12, 0xbb, 0xdc, 0x08, 0x74, 0x76 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xf8, 0xf7, 0x07, 0xb7, 0x99, 0x9b, 0x18, 0xcb, - 0x0d, 0x6b, 0x96, 0x12, 0x4f, 0x20, 0x45, 0x97, - 0x2c, 0xa2, 0x74, 0xbf, 0xc1, 0x54, 0xad, 0x0c, - 0x87, 0x03, 0x8c, 0x24, 0xc6, 0xd0, 0xd4, 0xb2 }, - .b_public = (u8[32]){ 0xda, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0xcc, 0x1f, 0x40, 0xd7, 0x43, 0xcd, 0xc2, 0x23, - 0x0e, 0x10, 0x43, 0xda, 0xba, 0x8b, 0x75, 0xe8, - 0x10, 0xf1, 0xfb, 0xab, 0x7f, 0x25, 0x52, 0x69, - 0xbd, 0x9e, 0xbb, 0x29, 0xe6, 0xbf, 0x49, 0x4f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xa0, 0x34, 0xf6, 0x84, 0xfa, 0x63, 
0x1e, 0x1a, - 0x34, 0x81, 0x18, 0xc1, 0xce, 0x4c, 0x98, 0x23, - 0x1f, 0x2d, 0x9e, 0xec, 0x9b, 0xa5, 0x36, 0x5b, - 0x4a, 0x05, 0xd6, 0x9a, 0x78, 0x5b, 0x07, 0x96 }, - .b_public = (u8[32]){ 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x54, 0x99, 0x8e, 0xe4, 0x3a, 0x5b, 0x00, 0x7b, - 0xf4, 0x99, 0xf0, 0x78, 0xe7, 0x36, 0x52, 0x44, - 0x00, 0xa8, 0xb5, 0xc7, 0xe9, 0xb9, 0xb4, 0x37, - 0x71, 0x74, 0x8c, 0x7c, 0xdf, 0x88, 0x04, 0x12 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x30, 0xb6, 0xc6, 0xa0, 0xf2, 0xff, 0xa6, 0x80, - 0x76, 0x8f, 0x99, 0x2b, 0xa8, 0x9e, 0x15, 0x2d, - 0x5b, 0xc9, 0x89, 0x3d, 0x38, 0xc9, 0x11, 0x9b, - 0xe4, 0xf7, 0x67, 0xbf, 0xab, 0x6e, 0x0c, 0xa5 }, - .b_public = (u8[32]){ 0xdc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0xea, 0xd9, 0xb3, 0x8e, 0xfd, 0xd7, 0x23, 0x63, - 0x79, 0x34, 0xe5, 0x5a, 0xb7, 0x17, 0xa7, 0xae, - 0x09, 0xeb, 0x86, 0xa2, 0x1d, 0xc3, 0x6a, 0x3f, - 0xee, 0xb8, 0x8b, 0x75, 0x9e, 0x39, 0x1e, 0x09 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x90, 0x1b, 0x9d, 0xcf, 0x88, 0x1e, 0x01, 0xe0, - 0x27, 0x57, 0x50, 0x35, 0xd4, 0x0b, 0x43, 0xbd, - 0xc1, 0xc5, 0x24, 0x2e, 0x03, 0x08, 0x47, 0x49, - 0x5b, 0x0c, 0x72, 0x86, 0x46, 0x9b, 0x65, 0x91 }, - .b_public = (u8[32]){ 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x60, 0x2f, 0xf4, 0x07, 0x89, 0xb5, 0x4b, 0x41, - 0x80, 0x59, 0x15, 0xfe, 0x2a, 0x62, 0x21, 0xf0, - 0x7a, 0x50, 0xff, 0xc2, 0xc3, 0xfc, 0x94, 0xcf, - 0x61, 0xf1, 0x3d, 0x79, 0x04, 0xe8, 0x8e, 0x0e }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x80, 0x46, 0x67, 0x7c, 0x28, 0xfd, 0x82, 0xc9, - 0xa1, 0xbd, 0xb7, 0x1a, 0x1a, 0x1a, 0x34, 0xfa, - 0xba, 0x12, 0x25, 0xe2, 0x50, 0x7f, 0xe3, 0xf5, - 0x4d, 0x10, 0xbd, 0x5b, 0x0d, 0x86, 0x5f, 0x8e }, - .b_public = (u8[32]){ 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0xe0, 0x0a, 0xe8, 0xb1, 0x43, 0x47, 0x12, 0x47, - 0xba, 0x24, 0xf1, 0x2c, 0x88, 0x55, 0x36, 0xc3, - 0xcb, 0x98, 0x1b, 0x58, 0xe1, 0xe5, 0x6b, 0x2b, - 0xaf, 0x35, 0xc1, 0x2a, 0xe1, 0xf7, 0x9c, 0x26 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x60, 0x2f, 0x7e, 0x2f, 0x68, 0xa8, 0x46, 0xb8, - 0x2c, 0xc2, 0x69, 0xb1, 0xd4, 0x8e, 0x93, 0x98, - 0x86, 0xae, 0x54, 0xfd, 0x63, 0x6c, 0x1f, 0xe0, - 0x74, 0xd7, 0x10, 0x12, 0x7d, 0x47, 0x24, 0x91 }, - .b_public = (u8[32]){ 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss 
= (u8[32]){ 0x98, 0xcb, 0x9b, 0x50, 0xdd, 0x3f, 0xc2, 0xb0, - 0xd4, 0xf2, 0xd2, 0xbf, 0x7c, 0x5c, 0xfd, 0xd1, - 0x0c, 0x8f, 0xcd, 0x31, 0xfc, 0x40, 0xaf, 0x1a, - 0xd4, 0x4f, 0x47, 0xc1, 0x31, 0x37, 0x63, 0x62 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x60, 0x88, 0x7b, 0x3d, 0xc7, 0x24, 0x43, 0x02, - 0x6e, 0xbe, 0xdb, 0xbb, 0xb7, 0x06, 0x65, 0xf4, - 0x2b, 0x87, 0xad, 0xd1, 0x44, 0x0e, 0x77, 0x68, - 0xfb, 0xd7, 0xe8, 0xe2, 0xce, 0x5f, 0x63, 0x9d }, - .b_public = (u8[32]){ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x38, 0xd6, 0x30, 0x4c, 0x4a, 0x7e, 0x6d, 0x9f, - 0x79, 0x59, 0x33, 0x4f, 0xb5, 0x24, 0x5b, 0xd2, - 0xc7, 0x54, 0x52, 0x5d, 0x4c, 0x91, 0xdb, 0x95, - 0x02, 0x06, 0x92, 0x62, 0x34, 0xc1, 0xf6, 0x33 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x78, 0xd3, 0x1d, 0xfa, 0x85, 0x44, 0x97, 0xd7, - 0x2d, 0x8d, 0xef, 0x8a, 0x1b, 0x7f, 0xb0, 0x06, - 0xce, 0xc2, 0xd8, 0xc4, 0x92, 0x46, 0x47, 0xc9, - 0x38, 0x14, 0xae, 0x56, 0xfa, 0xed, 0xa4, 0x95 }, - .b_public = (u8[32]){ 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x78, 0x6c, 0xd5, 0x49, 0x96, 0xf0, 0x14, 0xa5, - 0xa0, 0x31, 0xec, 0x14, 0xdb, 0x81, 0x2e, 0xd0, - 0x83, 0x55, 0x06, 0x1f, 0xdb, 0x5d, 0xe6, 0x80, - 0xa8, 0x00, 0xac, 0x52, 0x1f, 0x31, 0x8e, 0x23 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xc0, 0x4c, 0x5b, 0xae, 0xfa, 0x83, 0x02, 0xdd, - 0xde, 0xd6, 0xa4, 0xbb, 0x95, 0x77, 0x61, 0xb4, - 0xeb, 0x97, 0xae, 0xfa, 0x4f, 0xc3, 0xb8, 0x04, - 0x30, 0x85, 0xf9, 0x6a, 0x56, 0x59, 0xb3, 0xa5 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x29, 0xae, 0x8b, 0xc7, 0x3e, 0x9b, 0x10, 0xa0, - 0x8b, 0x4f, 0x68, 0x1c, 0x43, 0xc3, 0xe0, 0xac, - 0x1a, 0x17, 0x1d, 0x31, 0xb3, 0x8f, 0x1a, 0x48, - 0xef, 0xba, 0x29, 0xae, 0x63, 0x9e, 0xa1, 0x34 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - RFC 7748 */ -{ - .secret = (u8[32]){ 0xa0, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, - 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, - 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, - 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0x44 }, - .b_public = (u8[32]){ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, - 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, - 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, - 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, - .expected_ss = (u8[32]){ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, - 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, - 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, - 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - RFC 7748 */ -{ - .secret = (u8[32]){ 0x48, 0x66, 0xe9, 0xd4, 0xd1, 0xb4, 0x67, 0x3c, - 0x5a, 0xd2, 0x26, 
0x91, 0x95, 0x7d, 0x6a, 0xf5, - 0xc1, 0x1b, 0x64, 0x21, 0xe0, 0xea, 0x01, 0xd4, - 0x2c, 0xa4, 0x16, 0x9e, 0x79, 0x18, 0xba, 0x4d }, - .b_public = (u8[32]){ 0xe5, 0x21, 0x0f, 0x12, 0x78, 0x68, 0x11, 0xd3, - 0xf4, 0xb7, 0x95, 0x9d, 0x05, 0x38, 0xae, 0x2c, - 0x31, 0xdb, 0xe7, 0x10, 0x6f, 0xc0, 0x3c, 0x3e, - 0xfc, 0x4c, 0xd5, 0x49, 0xc7, 0x15, 0xa4, 0x13 }, - .expected_ss = (u8[32]){ 0x95, 0xcb, 0xde, 0x94, 0x76, 0xe8, 0x90, 0x7d, - 0x7a, 0xad, 0xe4, 0x5c, 0xb4, 0xb8, 0x73, 0xf8, - 0x8b, 0x59, 0x5a, 0x68, 0x79, 0x9f, 0xa1, 0x52, - 0xe6, 0xf8, 0xf7, 0x64, 0x7a, 0xac, 0x79, 0x57 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x0a, 0xb4, 0xe7, 0x63, 0x80, 0xd8, 0x4d, 0xde, - 0x4f, 0x68, 0x33, 0xc5, 0x8f, 0x2a, 0x9f, 0xb8, - 0xf8, 0x3b, 0xb0, 0x16, 0x9b, 0x17, 0x2b, 0xe4, - 0xb6, 0xe0, 0x59, 0x28, 0x87, 0x74, 0x1a, 0x36 }, - .expected_ss = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x89, 0xe1, 0x0d, 0x57, 0x01, 0xb4, 0x33, 0x7d, - 0x2d, 0x03, 0x21, 0x81, 0x53, 0x8b, 0x10, 0x64, - 0xbd, 0x40, 0x84, 0x40, 0x1c, 0xec, 0xa1, 0xfd, - 0x12, 0x66, 0x3a, 0x19, 0x59, 0x38, 0x80, 0x00 }, - .expected_ss = (u8[32]){ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x2b, 0x55, 0xd3, 0xaa, 0x4a, 0x8f, 0x80, 0xc8, - 0xc0, 0xb2, 0xae, 0x5f, 0x93, 0x3e, 0x85, 0xaf, - 0x49, 0xbe, 0xac, 0x36, 0xc2, 0xfa, 0x73, 0x94, - 0xba, 0xb7, 0x6c, 0x89, 0x33, 0xf8, 0xf8, 0x1d }, - .expected_ss = (u8[32]){ 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x63, 0xe5, 0xb1, 0xfe, 0x96, 0x01, 0xfe, 0x84, - 0x38, 0x5d, 0x88, 0x66, 0xb0, 0x42, 0x12, 0x62, - 0xf7, 0x8f, 0xbf, 0xa5, 0xaf, 0xf9, 0x58, 0x5e, - 0x62, 0x66, 0x79, 0xb1, 0x85, 0x47, 0xd9, 0x59 
}, - .expected_ss = (u8[32]){ 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0xe4, 0x28, 0xf3, 0xda, 0xc1, 0x78, 0x09, 0xf8, - 0x27, 0xa5, 0x22, 0xce, 0x32, 0x35, 0x50, 0x58, - 0xd0, 0x73, 0x69, 0x36, 0x4a, 0xa7, 0x89, 0x02, - 0xee, 0x10, 0x13, 0x9b, 0x9f, 0x9d, 0xd6, 0x53 }, - .expected_ss = (u8[32]){ 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0xb3, 0xb5, 0x0e, 0x3e, 0xd3, 0xa4, 0x07, 0xb9, - 0x5d, 0xe9, 0x42, 0xef, 0x74, 0x57, 0x5b, 0x5a, - 0xb8, 0xa1, 0x0c, 0x09, 0xee, 0x10, 0x35, 0x44, - 0xd6, 0x0b, 0xdf, 0xed, 0x81, 0x38, 0xab, 0x2b }, - .expected_ss = (u8[32]){ 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x21, 0x3f, 0xff, 0xe9, 0x3d, 0x5e, 0xa8, 0xcd, - 0x24, 0x2e, 0x46, 0x28, 0x44, 0x02, 0x99, 0x22, - 0xc4, 0x3c, 0x77, 0xc9, 0xe3, 0xe4, 0x2f, 0x56, - 0x2f, 0x48, 0x5d, 0x24, 0xc5, 0x01, 0xa2, 0x0b }, - .expected_ss = (u8[32]){ 0xf3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x91, 0xb2, 0x32, 0xa1, 0x78, 0xb3, 0xcd, 0x53, - 0x09, 0x32, 0x44, 0x1e, 0x61, 0x39, 0x41, 0x8f, - 0x72, 0x17, 0x22, 0x92, 0xf1, 0xda, 0x4c, 0x18, - 0x34, 0xfc, 0x5e, 0xbf, 0xef, 0xb5, 0x1e, 0x3f }, - .expected_ss = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ 
-{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x04, 0x5c, 0x6e, 0x11, 0xc5, 0xd3, 0x32, 0x55, - 0x6c, 0x78, 0x22, 0xfe, 0x94, 0xeb, 0xf8, 0x9b, - 0x56, 0xa3, 0x87, 0x8d, 0xc2, 0x7c, 0xa0, 0x79, - 0x10, 0x30, 0x58, 0x84, 0x9f, 0xab, 0xcb, 0x4f }, - .expected_ss = (u8[32]){ 0xe5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x1c, 0xa2, 0x19, 0x0b, 0x71, 0x16, 0x35, 0x39, - 0x06, 0x3c, 0x35, 0x77, 0x3b, 0xda, 0x0c, 0x9c, - 0x92, 0x8e, 0x91, 0x36, 0xf0, 0x62, 0x0a, 0xeb, - 0x09, 0x3f, 0x09, 0x91, 0x97, 0xb7, 0xf7, 0x4e }, - .expected_ss = (u8[32]){ 0xe3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0xf7, 0x6e, 0x90, 0x10, 0xac, 0x33, 0xc5, 0x04, - 0x3b, 0x2d, 0x3b, 0x76, 0xa8, 0x42, 0x17, 0x10, - 0x00, 0xc4, 0x91, 0x62, 0x22, 0xe9, 0xe8, 0x58, - 0x97, 0xa0, 0xae, 0xc7, 0xf6, 0x35, 0x0b, 0x3c }, - .expected_ss = (u8[32]){ 0xdd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0xbb, 0x72, 0x68, 0x8d, 0x8f, 0x8a, 0xa7, 0xa3, - 0x9c, 0xd6, 0x06, 0x0c, 0xd5, 0xc8, 0x09, 0x3c, - 0xde, 0xc6, 0xfe, 0x34, 0x19, 0x37, 0xc3, 0x88, - 0x6a, 0x99, 0x34, 0x6c, 0xd0, 0x7f, 0xaa, 0x55 }, - .expected_ss = (u8[32]){ 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x88, 0xfd, 0xde, 0xa1, 0x93, 0x39, 0x1c, 0x6a, - 0x59, 0x33, 0xef, 0x9b, 0x71, 0x90, 0x15, 0x49, - 0x44, 
0x72, 0x05, 0xaa, 0xe9, 0xda, 0x92, 0x8a, - 0x6b, 0x91, 0xa3, 0x52, 0xba, 0x10, 0xf4, 0x1f }, - .expected_ss = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x30, 0x3b, 0x39, 0x2f, 0x15, 0x31, 0x16, 0xca, - 0xd9, 0xcc, 0x68, 0x2a, 0x00, 0xcc, 0xc4, 0x4c, - 0x95, 0xff, 0x0d, 0x3b, 0xbe, 0x56, 0x8b, 0xeb, - 0x6c, 0x4e, 0x73, 0x9b, 0xaf, 0xdc, 0x2c, 0x68 }, - .expected_ss = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - checking for overflow */ -{ - .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .b_public = (u8[32]){ 0xfd, 0x30, 0x0a, 0xeb, 0x40, 0xe1, 0xfa, 0x58, - 0x25, 0x18, 0x41, 0x2b, 0x49, 0xb2, 0x08, 0xa7, - 0x84, 0x2b, 0x1e, 0x1f, 0x05, 0x6a, 0x04, 0x01, - 0x78, 0xea, 0x41, 0x41, 0x53, 0x4f, 0x65, 0x2d }, - .expected_ss = (u8[32]){ 0xb7, 0x34, 0x10, 0x5d, 0xc2, 0x57, 0x58, 0x5d, - 0x73, 0xb5, 0x66, 0xcc, 0xb7, 0x6f, 0x06, 0x27, - 0x95, 0xcc, 0xbe, 0xc8, 0x91, 0x28, 0xe5, 0x2b, - 0x02, 0xf3, 0xe5, 0x96, 0x39, 0xf1, 0x3c, 0x46 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - checking for overflow */ -{ - .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .b_public = (u8[32]){ 0xc8, 0xef, 0x79, 0xb5, 0x14, 0xd7, 0x68, 0x26, - 0x77, 0xbc, 0x79, 0x31, 0xe0, 0x6e, 0xe5, 0xc2, - 0x7c, 0x9b, 0x39, 0x2b, 0x4a, 0xe9, 0x48, 0x44, - 0x73, 0xf5, 0x54, 0xe6, 0x67, 0x8e, 0xcc, 0x2e }, - .expected_ss = (u8[32]){ 0x64, 0x7a, 0x46, 0xb6, 0xfc, 0x3f, 0x40, 0xd6, - 0x21, 0x41, 0xee, 0x3c, 0xee, 0x70, 0x6b, 0x4d, - 0x7a, 0x92, 0x71, 0x59, 0x3a, 0x7b, 0x14, 0x3e, - 0x8e, 0x2e, 0x22, 0x79, 0x88, 0x3e, 0x45, 0x50 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - checking for overflow */ -{ - .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .b_public = (u8[32]){ 0x64, 0xae, 0xac, 0x25, 0x04, 0x14, 0x48, 0x61, - 0x53, 0x2b, 0x7b, 0xbc, 0xb6, 0xc8, 0x7d, 0x67, - 0xdd, 0x4c, 0x1f, 0x07, 0xeb, 0xc2, 0xe0, 0x6e, - 0xff, 0xb9, 0x5a, 0xec, 0xc6, 0x17, 0x0b, 0x2c }, - .expected_ss = (u8[32]){ 0x4f, 0xf0, 0x3d, 0x5f, 0xb4, 0x3c, 0xd8, 0x65, - 0x7a, 0x3c, 0xf3, 0x7c, 0x13, 0x8c, 0xad, 0xce, - 0xcc, 0xe5, 0x09, 0xe4, 0xeb, 0xa0, 0x89, 0xd0, - 0xef, 0x40, 0xb4, 0xe4, 0xfb, 0x94, 0x61, 0x55 }, - .secret_size = 32, - .b_public_size = 32, - 
.expected_ss_size = 32, - -}, -/* wycheproof - checking for overflow */ -{ - .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .b_public = (u8[32]){ 0xbf, 0x68, 0xe3, 0x5e, 0x9b, 0xdb, 0x7e, 0xee, - 0x1b, 0x50, 0x57, 0x02, 0x21, 0x86, 0x0f, 0x5d, - 0xcd, 0xad, 0x8a, 0xcb, 0xab, 0x03, 0x1b, 0x14, - 0x97, 0x4c, 0xc4, 0x90, 0x13, 0xc4, 0x98, 0x31 }, - .expected_ss = (u8[32]){ 0x21, 0xce, 0xe5, 0x2e, 0xfd, 0xbc, 0x81, 0x2e, - 0x1d, 0x02, 0x1a, 0x4a, 0xf1, 0xe1, 0xd8, 0xbc, - 0x4d, 0xb3, 0xc4, 0x00, 0xe4, 0xd2, 0xa2, 0xc5, - 0x6a, 0x39, 0x26, 0xdb, 0x4d, 0x99, 0xc6, 0x5b }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - checking for overflow */ -{ - .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .b_public = (u8[32]){ 0x53, 0x47, 0xc4, 0x91, 0x33, 0x1a, 0x64, 0xb4, - 0x3d, 0xdc, 0x68, 0x30, 0x34, 0xe6, 0x77, 0xf5, - 0x3d, 0xc3, 0x2b, 0x52, 0xa5, 0x2a, 0x57, 0x7c, - 0x15, 0xa8, 0x3b, 0xf2, 0x98, 0xe9, 0x9f, 0x19 }, - .expected_ss = (u8[32]){ 0x18, 0xcb, 0x89, 0xe4, 0xe2, 0x0c, 0x0c, 0x2b, - 0xd3, 0x24, 0x30, 0x52, 0x45, 0x26, 0x6c, 0x93, - 0x27, 0x69, 0x0b, 0xbe, 0x79, 0xac, 0xb8, 0x8f, - 0x5b, 0x8f, 0xb3, 0xf7, 0x4e, 0xca, 0x3e, 0x52 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - private key == -1 (mod order) */ -{ - .secret = (u8[32]){ 0xa0, 0x23, 0xcd, 0xd0, 0x83, 0xef, 0x5b, 0xb8, - 0x2f, 0x10, 0xd6, 0x2e, 0x59, 0xe1, 0x5a, 0x68, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50 }, - .b_public = (u8[32]){ 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e, - 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57, - 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f, - 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 }, - .expected_ss = (u8[32]){ 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e, - 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57, - 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f, - 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - private key == 1 (mod order) on twist */ -{ - .secret = (u8[32]){ 0x58, 0x08, 0x3d, 0xd2, 0x61, 0xad, 0x91, 0xef, - 0xf9, 0x52, 0x32, 0x2e, 0xc8, 0x24, 0xc6, 0x82, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x5f }, - .b_public = (u8[32]){ 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f, - 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6, - 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64, - 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 }, - .expected_ss = (u8[32]){ 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f, - 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6, - 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64, - 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -} -}; - static const struct kpp_testvec ecdh_p192_tv_template[] = { { .secret = From 09e7652ddb688488a2954f1168b7f40e037694a4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:17 -0700 Subject: [PATCH 0997/3206] crypto: curve25519 - Remove unused kpp support Curve25519 has both a library API and a 
crypto_kpp API. However, the crypto_kpp API for Curve25519 had no users outside crypto/testmgr.c. I.e., no non-test code ever passed "curve25519" to crypto_alloc_kpp(). Remove this unused code. We'll instead focus on the Curve25519 library API (<crypto/curve25519.h>), which is a simpler and easier-to-use API and is the API that is actually being used. Acked-by: Geert Uytterhoeven # m68k Link: https://lore.kernel.org/r/20250906213523.84915-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/m68k/configs/amiga_defconfig | 1 - arch/m68k/configs/apollo_defconfig | 1 - arch/m68k/configs/atari_defconfig | 1 - arch/m68k/configs/bvme6000_defconfig | 1 - arch/m68k/configs/hp300_defconfig | 1 - arch/m68k/configs/mac_defconfig | 1 - arch/m68k/configs/multi_defconfig | 1 - arch/m68k/configs/mvme147_defconfig | 1 - arch/m68k/configs/mvme16x_defconfig | 1 - arch/m68k/configs/q40_defconfig | 1 - arch/m68k/configs/sun3_defconfig | 1 - arch/m68k/configs/sun3x_defconfig | 1 - arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - crypto/Kconfig | 8 --- crypto/Makefile | 1 - crypto/curve25519-generic.c | 91 ---------------------------- 17 files changed, 114 deletions(-) delete mode 100644 crypto/curve25519-generic.c diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 5171bb183967b9..24e7314ae4d3c1 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -560,7 +560,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 16f343ae48c675..4f13c2fa2097e1 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -517,7 +517,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index c08788728ea962..95ef2c838141d2 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -537,7 +537,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 962497e7c53fd6..d0aca54485f2d9 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -509,7 +509,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index ec28650189e406..faa5ec07cf9e53 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -519,7 +519,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 0afb3ad180dee3..313a52341dafae 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -536,7 +536,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index b311e953995d6d..44cc3461f56950 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -623,7 +623,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index f4e6224f137f99..0394fd631679e1 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -509,7 +509,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 498e167222f18c..d8ad11b7054c42 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -510,7 +510,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 8c6b1eef853423..bdc090d0c0ad09 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -526,7 +526,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index c34648f299efb9..05324e30c65f54 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -507,7 +507,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 73810d14660f21..a1a3fb24fb7b0b 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -507,7 +507,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 5e616bc988ac35..1c54348b812da2 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -761,7 +761,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_ARIA=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 094599cdaf4d9b..90a9dad7f80154 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -745,7 +745,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_ARIA=m diff --git a/crypto/Kconfig b/crypto/Kconfig index 09e8fb6ee0813f..a04595f9d0ca4b 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -344,14 +344,6 @@ config CRYPTO_ECRDSA One of the Russian cryptographic 
standard algorithms (called GOST algorithms). Only signature verification is implemented. -config CRYPTO_CURVE25519 - tristate "Curve25519" - select CRYPTO_KPP - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 elliptic curve (RFC7748) - endmenu menu "Block ciphers" diff --git a/crypto/Makefile b/crypto/Makefile index 6c5d59369dacc6..e430e6e99b6a24 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -182,7 +182,6 @@ obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o obj-$(CONFIG_CRYPTO_ECC) += ecc.o obj-$(CONFIG_CRYPTO_ESSIV) += essiv.o -obj-$(CONFIG_CRYPTO_CURVE25519) += curve25519-generic.o ecdh_generic-y += ecdh.o ecdh_generic-y += ecdh_helper.o diff --git a/crypto/curve25519-generic.c b/crypto/curve25519-generic.c deleted file mode 100644 index f3e56e73c66ca2..00000000000000 --- a/crypto/curve25519-generic.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include -#include -#include - -static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, - unsigned int len) -{ - u8 *secret = kpp_tfm_ctx(tfm); - - if (!len) - curve25519_generate_secret(secret); - else if (len == CURVE25519_KEY_SIZE && - crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) - memcpy(secret, buf, CURVE25519_KEY_SIZE); - else - return -EINVAL; - return 0; -} - -static int curve25519_compute_value(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 public_key[CURVE25519_KEY_SIZE]; - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - u8 const *bp; - - if (req->src) { - copied = sg_copy_to_buffer(req->src, - sg_nents_for_len(req->src, - CURVE25519_KEY_SIZE), - public_key, CURVE25519_KEY_SIZE); - if (copied != CURVE25519_KEY_SIZE) - return -EINVAL; - bp = public_key; - } else { - bp = curve25519_base_point; - } - - curve25519_generic(buf, secret, bp); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static unsigned int curve25519_max_size(struct crypto_kpp *tfm) -{ - return CURVE25519_KEY_SIZE; -} - -static struct kpp_alg curve25519_alg = { - .base.cra_name = "curve25519", - .base.cra_driver_name = "curve25519-generic", - .base.cra_priority = 100, - .base.cra_module = THIS_MODULE, - .base.cra_ctxsize = CURVE25519_KEY_SIZE, - - .set_secret = curve25519_set_secret, - .generate_public_key = curve25519_compute_value, - .compute_shared_secret = curve25519_compute_value, - .max_size = curve25519_max_size, -}; - -static int __init curve25519_init(void) -{ - return crypto_register_kpp(&curve25519_alg); -} - -static void __exit curve25519_exit(void) -{ - crypto_unregister_kpp(&curve25519_alg); -} - -module_init(curve25519_init); -module_exit(curve25519_exit); - -MODULE_ALIAS_CRYPTO("curve25519"); -MODULE_ALIAS_CRYPTO("curve25519-generic"); -MODULE_DESCRIPTION("Curve25519 elliptic curve (RFC7748)"); -MODULE_LICENSE("GPL"); From c3a49515225e44b2593839a4b3fec70c39dc0c89 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Wed, 20 Aug 2025 12:04:41 +0800 Subject: [PATCH 0998/3206] power: supply: Remove the use of dev_err_probe() The dev_err_probe() doesn't do anything when error is '-ENOMEM'. Therefore, remove the useless call to dev_err_probe(), and just return the value instead. 
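To make the pattern concrete, here is a minimal sketch with a hypothetical foo_probe(); it is not code from this series. An allocation failure already triggers the allocator's own OOM warning, so a bare -ENOMEM return suffices, while dev_err_probe() remains the right tool for errors that may be -EPROBE_DEFER or that genuinely warrant a message.

```c
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

/* Hypothetical probe body illustrating the change. */
static int foo_probe(struct device *dev)
{
	struct workqueue_struct *wq;

	wq = create_singlethread_workqueue(dev_name(dev));
	if (!wq)
		return -ENOMEM; /* was: dev_err_probe(dev, -ENOMEM, "..."); */

	/* ... rest of probe elided ... */
	return 0;
}
```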
Signed-off-by: Xichao Zhao Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 2 +- drivers/power/supply/mt6370-charger.c | 3 +-- drivers/power/supply/sbs-manager.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index 329b430d0e5065..59090703cc7a9f 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -546,7 +546,7 @@ static int max77705_charger_probe(struct i2c_client *i2c) chg->wqueue = create_singlethread_workqueue(dev_name(dev)); if (!chg->wqueue) - return dev_err_probe(dev, -ENOMEM, "failed to create workqueue\n"); + return -ENOMEM; ret = devm_work_autocancel(dev, &chg->chgin_work, max77705_chgin_isr_work); if (ret) { diff --git a/drivers/power/supply/mt6370-charger.c b/drivers/power/supply/mt6370-charger.c index 29510af4e59533..eb3bcf81f74166 100644 --- a/drivers/power/supply/mt6370-charger.c +++ b/drivers/power/supply/mt6370-charger.c @@ -904,8 +904,7 @@ static int mt6370_chg_probe(struct platform_device *pdev) priv->wq = create_singlethread_workqueue(dev_name(priv->dev)); if (!priv->wq) - return dev_err_probe(dev, -ENOMEM, - "Failed to create workqueue\n"); + return -ENOMEM; ret = devm_add_action_or_reset(dev, mt6370_chg_destroy_wq, priv->wq); if (ret) diff --git a/drivers/power/supply/sbs-manager.c b/drivers/power/supply/sbs-manager.c index 869729dfcd664c..6fe526222f7f4d 100644 --- a/drivers/power/supply/sbs-manager.c +++ b/drivers/power/supply/sbs-manager.c @@ -348,7 +348,7 @@ static int sbsm_probe(struct i2c_client *client) data->muxc = i2c_mux_alloc(adapter, dev, SBSM_MAX_BATS, 0, I2C_MUX_LOCKED, &sbsm_select, NULL); if (!data->muxc) - return dev_err_probe(dev, -ENOMEM, "failed to alloc i2c mux\n"); + return -ENOMEM; data->muxc->priv = data; ret = devm_add_action_or_reset(dev, sbsm_del_mux_adapter, data); From def5612170a8c6c4c6a3ea5bd6c3cfc8de6ba4b1 Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Mon, 18 Aug 2025 20:32:59 +0800 Subject: [PATCH 0999/3206] power: supply: cw2015: Fix an alignment coding style issue Fix the checkpatch warning: CHECK: Alignment should match open parenthesis Fixes: 0cb172a4918e ("power: supply: cw2015: Use device managed API to simplify the code") Signed-off-by: Andy Yan Signed-off-by: Sebastian Reichel --- drivers/power/supply/cw2015_battery.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/power/supply/cw2015_battery.c b/drivers/power/supply/cw2015_battery.c index afc607fee5c96f..2263d5d3448fdf 100644 --- a/drivers/power/supply/cw2015_battery.c +++ b/drivers/power/supply/cw2015_battery.c @@ -699,8 +699,7 @@ static int cw_bat_probe(struct i2c_client *client) if (!cw_bat->battery_workqueue) return -ENOMEM; - devm_delayed_work_autocancel(&client->dev, - &cw_bat->battery_delay_work, cw_bat_work); + devm_delayed_work_autocancel(&client->dev, &cw_bat->battery_delay_work, cw_bat_work); queue_delayed_work(cw_bat->battery_workqueue, &cw_bat->battery_delay_work, msecs_to_jiffies(10)); return 0; From afc4e4a5f122183b38095daba2264123cc86d8ab Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:18 -0700 Subject: [PATCH 1000/3206] lib/crypto: tests: Migrate Curve25519 self-test to KUnit Move the Curve25519 test from an ad-hoc self-test to a KUnit test. Generally keep the same test logic for now, just translated to KUnit. There's one exception, which is that I dropped the incomplete test of curve25519_generic().
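For readers new to KUnit, the shape of such a conversion, reduced to a made-up example_suite rather than the real Curve25519 code (which follows in the diff below): each pr_err()-plus-flag check becomes a KUNIT_EXPECT_*() call, and kunit_test_suite() replaces the hand-rolled entry point.

```c
#include <kunit/test.h>
#include <linux/types.h>

static void example_test(struct kunit *test)
{
	u8 out[32] = {};

	/* Run the operation under test into out[], then assert on it. */
	KUNIT_EXPECT_EQ(test, out[0], 0);
}

static struct kunit_case example_test_cases[] = {
	KUNIT_CASE(example_test),
	{},
};

static struct kunit_suite example_test_suite = {
	.name = "example",
	.test_cases = example_test_cases,
};
kunit_test_suite(example_test_suite);
```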
The approach I'm taking to cover the different implementations with the KUnit tests is to just rely on booting kernels in QEMU with different '-cpu' options, rather than try to make the tests (incompletely) test multiple implementations on one CPU. This way, both the test and the library API are simpler. This commit makes the file lib/crypto/curve25519.c no longer needed, as its only purpose was to call the self-test. However, keep it for now, since a later commit will add code to it again. Temporarily omit the default value of CRYPTO_SELFTESTS that the other lib/crypto/ KUnit tests have. It would cause a recursive kconfig dependency, since the Curve25519 code is still entangled with CRYPTO. A later commit will fix that. Link: https://lore.kernel.org/r/20250906213523.84915-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/curve25519.h | 2 - lib/crypto/Makefile | 1 - lib/crypto/curve25519.c | 3 - lib/crypto/tests/Kconfig | 9 +++ lib/crypto/tests/Makefile | 1 + .../curve25519_kunit.c} | 73 +++++++++++-------- 6 files changed, 52 insertions(+), 37 deletions(-) rename lib/crypto/{curve25519-selftest.c => tests/curve25519_kunit.c} (97%) diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h index ece6a9b5fafc88..4e6dc840b1592c 100644 --- a/include/crypto/curve25519.h +++ b/include/crypto/curve25519.h @@ -28,8 +28,6 @@ void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE]); -bool curve25519_selftest(void); - static inline bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index ad27c5bf99e11c..6c3be971ace096 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -87,7 +87,6 @@ endif obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o libcurve25519-y += curve25519.o -libcurve25519-$(CONFIG_CRYPTO_SELFTESTS) += curve25519-selftest.o obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o libdes-y := des.o diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c index 6850b76a80c9e3..25f16777865bfc 100644 --- a/lib/crypto/curve25519.c +++ b/lib/crypto/curve25519.c @@ -15,9 +15,6 @@ static int __init curve25519_init(void) { - if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) && - WARN_ON(!curve25519_selftest())) - return -ENODEV; return 0; } diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig index fd341aa12f1579..eaca60d3e0a364 100644 --- a/lib/crypto/tests/Kconfig +++ b/lib/crypto/tests/Kconfig @@ -10,6 +10,15 @@ config CRYPTO_LIB_BLAKE2S_KUNIT_TEST help KUnit tests for the BLAKE2s cryptographic hash function. +config CRYPTO_LIB_CURVE25519_KUNIT_TEST + tristate "KUnit tests for Curve25519" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_CURVE25519 + help + KUnit tests for the Curve25519 Diffie-Hellman function. 
+ config CRYPTO_LIB_MD5_KUNIT_TEST tristate "KUnit tests for MD5" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile index be7de929af2cc7..a71fad19922baa 100644 --- a/lib/crypto/tests/Makefile +++ b/lib/crypto/tests/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-or-later obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o +obj-$(CONFIG_CRYPTO_LIB_CURVE25519_KUNIT_TEST) += curve25519_kunit.o obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o diff --git a/lib/crypto/curve25519-selftest.c b/lib/crypto/tests/curve25519_kunit.c similarity index 97% rename from lib/crypto/curve25519-selftest.c rename to lib/crypto/tests/curve25519_kunit.c index c85e85381e7884..68eab75d40dcd5 100644 --- a/lib/crypto/curve25519-selftest.c +++ b/lib/crypto/tests/curve25519_kunit.c @@ -4,6 +4,7 @@ */ #include +#include struct curve25519_test_vector { u8 private[CURVE25519_KEY_SIZE]; @@ -11,7 +12,7 @@ struct curve25519_test_vector { u8 result[CURVE25519_KEY_SIZE]; bool valid; }; -static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = { +static const struct curve25519_test_vector curve25519_test_vectors[] = { { .private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45, @@ -1280,42 +1281,52 @@ static const struct curve25519_test_vector curve25519_test_vectors[] __initconst } }; -bool __init curve25519_selftest(void) +static void test_curve25519(struct kunit *test) { - bool success = true, ret, ret2; - size_t i = 0, j; - u8 in[CURVE25519_KEY_SIZE]; - u8 out[CURVE25519_KEY_SIZE], out2[CURVE25519_KEY_SIZE], - out3[CURVE25519_KEY_SIZE]; + for (size_t i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) { + const struct curve25519_test_vector *vec = + &curve25519_test_vectors[i]; + u8 out[CURVE25519_KEY_SIZE] = {}; + bool ret; - for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) { - memset(out, 0, CURVE25519_KEY_SIZE); - ret = curve25519(out, curve25519_test_vectors[i].private, - curve25519_test_vectors[i].public); - if (ret != curve25519_test_vectors[i].valid || - memcmp(out, curve25519_test_vectors[i].result, - CURVE25519_KEY_SIZE)) { - pr_err("curve25519 self-test %zu: FAIL\n", i + 1); - success = false; - } + ret = curve25519(out, vec->private, vec->public); + KUNIT_EXPECT_EQ_MSG(test, ret, vec->valid, + "Wrong return value with test vector %zu", + i); + KUNIT_EXPECT_MEMEQ_MSG(test, out, vec->result, sizeof(out), + "Wrong output with test vector %zu", i); } +} + +static void test_curve25519_basepoint(struct kunit *test) +{ + for (size_t i = 0; i < 5; ++i) { + u8 in[CURVE25519_KEY_SIZE]; + u8 out[CURVE25519_KEY_SIZE]; + u8 out2[CURVE25519_KEY_SIZE]; + bool ret, ret2; - for (i = 0; i < 5; ++i) { get_random_bytes(in, sizeof(in)); ret = curve25519_generate_public(out, in); ret2 = curve25519(out2, in, (u8[CURVE25519_KEY_SIZE]){ 9 }); - curve25519_generic(out3, in, (u8[CURVE25519_KEY_SIZE]){ 9 }); - if (ret != ret2 || - memcmp(out, out2, CURVE25519_KEY_SIZE) || - memcmp(out, out3, CURVE25519_KEY_SIZE)) { - pr_err("curve25519 basepoint self-test %zu: FAIL: input - 0x", - i + 1); - for (j = CURVE25519_KEY_SIZE; j-- > 0;) - printk(KERN_CONT "%02x", in[j]); - printk(KERN_CONT "\n"); - success = false; - } + KUNIT_EXPECT_EQ_MSG(test, ret, ret2, + "in=%*phN", CURVE25519_KEY_SIZE, in); + KUNIT_EXPECT_MEMEQ_MSG(test, out, out2, CURVE25519_KEY_SIZE, + 
"in=%*phN", CURVE25519_KEY_SIZE, in); } - - return success; } + +static struct kunit_case curve25519_test_cases[] = { + KUNIT_CASE(test_curve25519), + KUNIT_CASE(test_curve25519_basepoint), + {}, +}; + +static struct kunit_suite curve25519_test_suite = { + .name = "curve25519", + .test_cases = curve25519_test_cases, +}; +kunit_test_suite(curve25519_test_suite); + +MODULE_DESCRIPTION("KUnit tests for Curve25519"); +MODULE_LICENSE("GPL"); From 643d79e531cc99da0dc5502dfccb6b3b305c65f3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:19 -0700 Subject: [PATCH 1001/3206] lib/crypto: tests: Add Curve25519 benchmark Add a benchmark to curve25519_kunit. This brings it in line with the other crypto KUnit tests and provides an easy way to measure performance. Link: https://lore.kernel.org/r/20250906213523.84915-9-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/tests/curve25519_kunit.c | 33 ++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/lib/crypto/tests/curve25519_kunit.c b/lib/crypto/tests/curve25519_kunit.c index 68eab75d40dcd5..248d05f66b35c1 100644 --- a/lib/crypto/tests/curve25519_kunit.c +++ b/lib/crypto/tests/curve25519_kunit.c @@ -5,6 +5,7 @@ #include #include +#include struct curve25519_test_vector { u8 private[CURVE25519_KEY_SIZE]; @@ -1316,9 +1317,39 @@ static void test_curve25519_basepoint(struct kunit *test) } } +static void benchmark_curve25519(struct kunit *test) +{ + const u8 *private = curve25519_test_vectors[0].private; + const u8 *public = curve25519_test_vectors[0].public; + const size_t warmup_niter = 5000; + const size_t benchmark_niter = 1024; + u8 out[CURVE25519_KEY_SIZE]; + bool ok = true; + u64 t; + + if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK)) + kunit_skip(test, "not enabled"); + + /* Warm-up */ + for (size_t i = 0; i < warmup_niter; i++) + ok &= curve25519(out, private, public); + + /* Benchmark */ + preempt_disable(); + t = ktime_get_ns(); + for (size_t i = 0; i < benchmark_niter; i++) + ok &= curve25519(out, private, public); + t = ktime_get_ns() - t; + preempt_enable(); + KUNIT_EXPECT_TRUE(test, ok); + kunit_info(test, "%llu ops/s", + div64_u64((u64)benchmark_niter * NSEC_PER_SEC, t ?: 1)); +} + static struct kunit_case curve25519_test_cases[] = { KUNIT_CASE(test_curve25519), KUNIT_CASE(test_curve25519_basepoint), + KUNIT_CASE(benchmark_curve25519), {}, }; @@ -1328,5 +1359,5 @@ static struct kunit_suite curve25519_test_suite = { }; kunit_test_suite(curve25519_test_suite); -MODULE_DESCRIPTION("KUnit tests for Curve25519"); +MODULE_DESCRIPTION("KUnit tests and benchmark for Curve25519"); MODULE_LICENSE("GPL"); From 8c06b330e8f79834924305362227e38e4e2469ae Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:20 -0700 Subject: [PATCH 1002/3206] lib/crypto: curve25519: Move a couple functions out-of-line Move curve25519() and curve25519_generate_public() from curve25519.h to curve25519.c. There's no good reason for them to be inline. 
Link: https://lore.kernel.org/r/20250906213523.84915-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/curve25519.h | 28 +++------------------------- lib/crypto/curve25519.c | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h index 4e6dc840b1592c..78aa5f28c847f2 100644 --- a/include/crypto/curve25519.h +++ b/include/crypto/curve25519.h @@ -6,7 +6,6 @@ #ifndef CURVE25519_H #define CURVE25519_H -#include // For crypto_memneq. #include #include @@ -28,33 +27,12 @@ void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE]); -static inline bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) - curve25519_arch(mypublic, secret, basepoint); - else - curve25519_generic(mypublic, secret, basepoint); - return crypto_memneq(mypublic, curve25519_null_point, - CURVE25519_KEY_SIZE); -} + const u8 basepoint[CURVE25519_KEY_SIZE]); -static inline bool -__must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) -{ - if (unlikely(!crypto_memneq(secret, curve25519_null_point, - CURVE25519_KEY_SIZE))) - return false; - - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) - curve25519_base_arch(pub, secret); - else - curve25519_generic(pub, secret, curve25519_base_point); - return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE); -} +bool __must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]); static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE]) { diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c index 25f16777865bfc..1b786389d714a0 100644 --- a/lib/crypto/curve25519.c +++ b/lib/crypto/curve25519.c @@ -10,8 +10,40 @@ */ #include -#include +#include +#include #include +#include + +bool __must_check +curve25519(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) + curve25519_arch(mypublic, secret, basepoint); + else + curve25519_generic(mypublic, secret, basepoint); + return crypto_memneq(mypublic, curve25519_null_point, + CURVE25519_KEY_SIZE); +} +EXPORT_SYMBOL(curve25519); + +bool __must_check +curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + if (unlikely(!crypto_memneq(secret, curve25519_null_point, + CURVE25519_KEY_SIZE))) + return false; + + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) + curve25519_base_arch(pub, secret); + else + curve25519_generic(pub, secret, curve25519_base_point); + return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE); +} +EXPORT_SYMBOL(curve25519_generate_public); static int __init curve25519_init(void) { From 68546e5632c0b982663af575ae12cc5d81facc91 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:21 -0700 Subject: [PATCH 1003/3206] lib/crypto: curve25519: Consolidate into single module Reorganize the Curve25519 library code: - Build a single libcurve25519 module, instead of up to three modules: libcurve25519, libcurve25519-generic, and an arch-specific module. 
- Move the arch-specific Curve25519 code from arch/$(SRCARCH)/crypto/ to lib/crypto/$(SRCARCH)/. Centralize the build rules into lib/crypto/Makefile and lib/crypto/Kconfig. - Include the arch-specific code directly in lib/crypto/curve25519.c via a header, rather than using a separate .c file. - Eliminate the entanglement with CRYPTO. CRYPTO_LIB_CURVE25519 no longer selects CRYPTO, and the arch-specific Curve25519 code no longer depends on CRYPTO. This brings Curve25519 in line with the latest conventions for lib/crypto/, used by other algorithms. The exception is that I kept the generic code in separate translation units for now. (Some of the function names collide between the x86 and generic Curve25519 code. And the Curve25519 functions are very long anyway, so inlining doesn't matter as much for Curve25519 as it does for some other algorithms.) Link: https://lore.kernel.org/r/20250906213523.84915-11-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/crypto/Kconfig | 12 ----- arch/arm/crypto/Makefile | 2 - arch/powerpc/crypto/Kconfig | 12 ----- arch/powerpc/crypto/Makefile | 2 - arch/x86/crypto/Kconfig | 12 ----- arch/x86/crypto/Makefile | 5 -- include/crypto/curve25519.h | 10 ---- lib/crypto/Kconfig | 37 +++++--------- lib/crypto/Makefile | 26 +++++++--- .../crypto/arm}/curve25519-core.S | 0 .../crypto/arm/curve25519.h | 31 +++--------- lib/crypto/curve25519-generic.c | 25 ---------- lib/crypto/curve25519.c | 50 ++++++++++++------- .../crypto/powerpc}/curve25519-ppc64le_asm.S | 0 .../crypto/powerpc/curve25519.h | 19 ++----- .../crypto/x86/curve25519.h | 31 +++--------- 16 files changed, 86 insertions(+), 188 deletions(-) rename {arch/arm/crypto => lib/crypto/arm}/curve25519-core.S (100%) rename arch/arm/crypto/curve25519-glue.c => lib/crypto/arm/curve25519.h (58%) delete mode 100644 lib/crypto/curve25519-generic.c rename {arch/powerpc/crypto => lib/crypto/powerpc}/curve25519-ppc64le_asm.S (100%) rename arch/powerpc/crypto/curve25519-ppc64le-core.c => lib/crypto/powerpc/curve25519.h (88%) rename arch/x86/crypto/curve25519-x86_64.c => lib/crypto/x86/curve25519.h (98%) diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 97718d86f6007e..c436eec22d86ca 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -2,18 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (arm)" -config CRYPTO_CURVE25519_NEON - tristate - depends on KERNEL_MODE_NEON - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CURVE25519 - default CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 algorithm - - Architecture: arm with - - NEON (Advanced SIMD) extensions - config CRYPTO_GHASH_ARM_CE tristate "Hash functions: GHASH (PMULL/NEON/ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 4f23999ae17dfe..6346a73effc06a 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -7,7 +7,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o -obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o @@ -18,4 +17,3 @@ blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o 
-curve25519-neon-y := curve25519-core.o curve25519-glue.o diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 6106a219da6afe..662aed46f9c795 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -2,18 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (powerpc)" -config CRYPTO_CURVE25519_PPC64 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CURVE25519 - default CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 algorithm - - Architecture: PowerPC64 - - Little-endian - config CRYPTO_AES_PPC_SPE tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)" depends on SPE diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 9eb59dce67f367..5960e5300db71e 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -8,12 +8,10 @@ obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o -obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o -curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) override flavour := linux-ppc64le diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 6a895a571b00eb..d9c6fc78cf3324 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -2,18 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)" -config CRYPTO_CURVE25519_X86 - tristate - depends on 64BIT - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CURVE25519 - default CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 algorithm - - Architecture: x86_64 using: - - ADX (large integer arithmetic) - config CRYPTO_AES_NI_INTEL tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)" select CRYPTO_AEAD diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index d402963d6b579a..dfba7e5e88ea69 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -62,8 +62,6 @@ nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o -obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o - obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o @@ -81,6 +79,3 @@ aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o - -# Disable GCOV in odd or sensitive code -GCOV_PROFILE_curve25519-x86_64.o := n diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h index 78aa5f28c847f2..db63a5577c0045 100644 --- a/include/crypto/curve25519.h +++ b/include/crypto/curve25519.h @@ -13,20 +13,10 @@ enum curve25519_lengths { CURVE25519_KEY_SIZE = 32 }; -extern const u8 curve25519_null_point[]; -extern const u8 curve25519_base_point[]; - void curve25519_generic(u8 out[CURVE25519_KEY_SIZE], const u8 scalar[CURVE25519_KEY_SIZE], const u8 point[CURVE25519_KEY_SIZE]); -void 
curve25519_arch(u8 out[CURVE25519_KEY_SIZE], - const u8 scalar[CURVE25519_KEY_SIZE], - const u8 point[CURVE25519_KEY_SIZE]); - -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]); - bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], const u8 basepoint[CURVE25519_KEY_SIZE]); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 37d85e0c9b979b..eea17e36a22bed 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -54,35 +54,24 @@ config CRYPTO_LIB_CHACHA_ARCH default y if S390 default y if X86_64 -config CRYPTO_ARCH_HAVE_LIB_CURVE25519 - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the Curve25519 library interface, - either builtin or as a module. - -config CRYPTO_LIB_CURVE25519_GENERIC +config CRYPTO_LIB_CURVE25519 tristate select CRYPTO_LIB_UTILS help - This symbol can be depended upon by arch implementations of the - Curve25519 library interface that require the generic code as a - fallback, e.g., for SIMD implementations. If no arch specific - implementation is enabled, this implementation serves the users - of CRYPTO_LIB_CURVE25519. + The Curve25519 library functions. Select this if your module uses any + of the functions from <crypto/curve25519.h>. -config CRYPTO_LIB_CURVE25519_INTERNAL - tristate - select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n +config CRYPTO_LIB_CURVE25519_ARCH + bool + depends on CRYPTO_LIB_CURVE25519 && !UML && !KMSAN + default y if ARM && KERNEL_MODE_NEON + default y if PPC64 && CPU_LITTLE_ENDIAN + default y if X86_64 -config CRYPTO_LIB_CURVE25519 - tristate - select CRYPTO - select CRYPTO_LIB_CURVE25519_INTERNAL - help - Enable the Curve25519 library interface. This interface may be - fulfilled by either the generic implementation or an arch-specific - one, if one is available and enabled.
+config CRYPTO_LIB_CURVE25519_GENERIC + bool + depends on CRYPTO_LIB_CURVE25519 + default y if !CRYPTO_LIB_CURVE25519_ARCH || ARM || X86_64 config CRYPTO_LIB_DES tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 6c3be971ace096..bded351aeacef2 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -76,17 +76,31 @@ obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o libchacha20poly1305-y += chacha20poly1305.o libchacha20poly1305-$(CONFIG_CRYPTO_SELFTESTS) += chacha20poly1305-selftest.o -obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519-generic.o -libcurve25519-generic-y := curve25519-fiat32.o -libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o -libcurve25519-generic-y += curve25519-generic.o +################################################################################ + +obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o +libcurve25519-y := curve25519.o + +# Disable GCOV in odd or sensitive code +GCOV_PROFILE_curve25519.o := n + +ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y) +libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-hacl64.o +else +libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-fiat32.o +endif # clang versions prior to 18 may blow out the stack with KASAN ifeq ($(call clang-min-version, 180000),) KASAN_SANITIZE_curve25519-hacl64.o := n endif -obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o -libcurve25519-y += curve25519.o +ifeq ($(CONFIG_CRYPTO_LIB_CURVE25519_ARCH),y) +CFLAGS_curve25519.o += -I$(src)/$(SRCARCH) +libcurve25519-$(CONFIG_ARM) += arm/curve25519-core.o +libcurve25519-$(CONFIG_PPC) += powerpc/curve25519-ppc64le_asm.o +endif + +################################################################################ obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o libdes-y := des.o diff --git a/arch/arm/crypto/curve25519-core.S b/lib/crypto/arm/curve25519-core.S similarity index 100% rename from arch/arm/crypto/curve25519-core.S rename to lib/crypto/arm/curve25519-core.S diff --git a/arch/arm/crypto/curve25519-glue.c b/lib/crypto/arm/curve25519.h similarity index 58% rename from arch/arm/crypto/curve25519-glue.c rename to lib/crypto/arm/curve25519.h index 3076020d8fbeb6..f6d66494eb8f88 100644 --- a/arch/arm/crypto/curve25519-glue.c +++ b/lib/crypto/arm/curve25519.h @@ -12,10 +12,7 @@ #include #include #include -#include -#include #include -#include asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], @@ -23,9 +20,9 @@ asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE], static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], - const u8 scalar[CURVE25519_KEY_SIZE], - const u8 point[CURVE25519_KEY_SIZE]) +static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], + const u8 scalar[CURVE25519_KEY_SIZE], + const u8 point[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&have_neon) && crypto_simd_usable()) { kernel_neon_begin(); @@ -35,28 +32,16 @@ void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], curve25519_generic(out, scalar, point); } } -EXPORT_SYMBOL(curve25519_arch); -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) { - return curve25519_arch(pub, secret, curve25519_base_point); + curve25519_arch(pub, secret, curve25519_base_point); } -EXPORT_SYMBOL(curve25519_base_arch); -static int __init 
arm_curve25519_init(void) +#define curve25519_mod_init_arch curve25519_mod_init_arch +static void curve25519_mod_init_arch(void) { if (elf_hwcap & HWCAP_NEON) static_branch_enable(&have_neon); - return 0; } - -static void __exit arm_curve25519_exit(void) -{ -} - -module_init(arm_curve25519_init); -module_exit(arm_curve25519_exit); - -MODULE_DESCRIPTION("Public key crypto: Curve25519 (NEON-accelerated)"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/curve25519-generic.c b/lib/crypto/curve25519-generic.c deleted file mode 100644 index f8aa70c9f55988..00000000000000 --- a/lib/crypto/curve25519-generic.c +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - * - * This is an implementation of the Curve25519 ECDH algorithm, using either - * a 32-bit implementation or a 64-bit implementation with 128-bit integers, - * depending on what is supported by the target compiler. - * - * Information: https://cr.yp.to/ecdh.html - */ - -#include -#include -#include - -const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 }; -const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 }; - -EXPORT_SYMBOL(curve25519_null_point); -EXPORT_SYMBOL(curve25519_base_point); -EXPORT_SYMBOL(curve25519_generic); - -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Curve25519 scalar multiplication"); -MODULE_AUTHOR("Jason A. Donenfeld "); diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c index 1b786389d714a0..01e265dfbcd904 100644 --- a/lib/crypto/curve25519.c +++ b/lib/crypto/curve25519.c @@ -2,8 +2,9 @@ /* * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. * - * This is an implementation of the Curve25519 ECDH algorithm, using either - * a 32-bit implementation or a 64-bit implementation with 128-bit integers, + * This is an implementation of the Curve25519 ECDH algorithm, using either an + * architecture-optimized implementation or a generic implementation. The + * generic implementation is either 32-bit, or 64-bit with 128-bit integers, * depending on what is supported by the target compiler. 
* * Information: https://cr.yp.to/ecdh.html @@ -15,15 +16,32 @@ #include #include +static const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 }; +static const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 }; + +#ifdef CONFIG_CRYPTO_LIB_CURVE25519_ARCH +#include "curve25519.h" /* $(SRCARCH)/curve25519.h */ +#else +static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + curve25519_generic(mypublic, secret, basepoint); +} + +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + curve25519_generic(pub, secret, curve25519_base_point); +} +#endif + bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], const u8 basepoint[CURVE25519_KEY_SIZE]) { - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) - curve25519_arch(mypublic, secret, basepoint); - else - curve25519_generic(mypublic, secret, basepoint); + curve25519_arch(mypublic, secret, basepoint); return crypto_memneq(mypublic, curve25519_null_point, CURVE25519_KEY_SIZE); } @@ -36,27 +54,25 @@ curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], if (unlikely(!crypto_memneq(secret, curve25519_null_point, CURVE25519_KEY_SIZE))) return false; - - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) - curve25519_base_arch(pub, secret); - else - curve25519_generic(pub, secret, curve25519_base_point); + curve25519_base_arch(pub, secret); return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE); } EXPORT_SYMBOL(curve25519_generate_public); -static int __init curve25519_init(void) +#ifdef curve25519_mod_init_arch +static int __init curve25519_mod_init(void) { + curve25519_mod_init_arch(); return 0; } +subsys_initcall(curve25519_mod_init); -static void __exit curve25519_exit(void) +static void __exit curve25519_mod_exit(void) { } - -module_init(curve25519_init); -module_exit(curve25519_exit); +module_exit(curve25519_mod_exit); +#endif MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Curve25519 scalar multiplication"); +MODULE_DESCRIPTION("Curve25519 algorithm"); MODULE_AUTHOR("Jason A. 
Donenfeld "); diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/lib/crypto/powerpc/curve25519-ppc64le_asm.S similarity index 100% rename from arch/powerpc/crypto/curve25519-ppc64le_asm.S rename to lib/crypto/powerpc/curve25519-ppc64le_asm.S diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/lib/crypto/powerpc/curve25519.h similarity index 88% rename from arch/powerpc/crypto/curve25519-ppc64le-core.c rename to lib/crypto/powerpc/curve25519.h index 6eb18ee19cad3b..dee6234c48e92a 100644 --- a/arch/powerpc/crypto/curve25519-ppc64le-core.c +++ b/lib/crypto/powerpc/curve25519.h @@ -7,12 +7,9 @@ * - Algorithm 1 Scalar multiplication of a variable point */ -#include - #include #include #include -#include #include #include @@ -175,21 +172,15 @@ static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32], fe51_tobytes(out, x2); } -void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]) +static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) { curve25519_fe51(mypublic, secret, basepoint); } -EXPORT_SYMBOL(curve25519_arch); -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) { curve25519_fe51(pub, secret, curve25519_base_point); } -EXPORT_SYMBOL(curve25519_base_arch); - -MODULE_DESCRIPTION("PPC64le Curve25519 scalar multiplication with 51 bits limbs"); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Danny Tsen "); diff --git a/arch/x86/crypto/curve25519-x86_64.c b/lib/crypto/x86/curve25519.h similarity index 98% rename from arch/x86/crypto/curve25519-x86_64.c rename to lib/crypto/x86/curve25519.h index ab91368284a473..5c0b8408852dee 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/lib/crypto/x86/curve25519.h @@ -4,13 +4,9 @@ * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation */ -#include - -#include #include #include #include -#include #include #include @@ -1590,41 +1586,28 @@ static void curve25519_ever64_base(u8 *out, const u8 *priv) static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx); -void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]) +static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&curve25519_use_bmi2_adx)) curve25519_ever64(mypublic, secret, basepoint); else curve25519_generic(mypublic, secret, basepoint); } -EXPORT_SYMBOL(curve25519_arch); -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&curve25519_use_bmi2_adx)) curve25519_ever64_base(pub, secret); else curve25519_generic(pub, secret, curve25519_base_point); } -EXPORT_SYMBOL(curve25519_base_arch); -static int __init curve25519_mod_init(void) +#define curve25519_mod_init_arch curve25519_mod_init_arch +static void curve25519_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) static_branch_enable(&curve25519_use_bmi2_adx); - return 0; -} - -static void __exit curve25519_mod_exit(void) -{ } - -module_init(curve25519_mod_init); 
-module_exit(curve25519_mod_exit); - -MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized"); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Jason A. Donenfeld "); From cb2d6b132a44a140aed3562ef932754d39ddccf3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:22 -0700 Subject: [PATCH 1004/3206] lib/crypto: tests: Enable Curve25519 test when CRYPTO_SELFTESTS Now that the Curve25519 library has been disentangled from CRYPTO, adding CRYPTO_SELFTESTS as a default value of CRYPTO_LIB_CURVE25519_KUNIT_TEST no longer causes a recursive kconfig dependency. Do this, which makes this option consistent with the other crypto KUnit test options in the same file. Link: https://lore.kernel.org/r/20250906213523.84915-12-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/tests/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig index eaca60d3e0a364..578af717e13a7c 100644 --- a/lib/crypto/tests/Kconfig +++ b/lib/crypto/tests/Kconfig @@ -13,7 +13,7 @@ config CRYPTO_LIB_CURVE25519_KUNIT_TEST tristate "KUnit tests for Curve25519" if !KUNIT_ALL_TESTS depends on KUNIT - default KUNIT_ALL_TESTS + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS select CRYPTO_LIB_BENCHMARK_VISIBLE select CRYPTO_LIB_CURVE25519 help From ea9da67e2add7bd5f1e4b38dc2404480e711f4d8 Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Thu, 21 Nov 2024 16:13:25 +0900 Subject: [PATCH 1005/3206] ARM: dts: socfpga: sodia: Fix mdio bus probe and PHY address On the SoCFPGA/Sodia board, the mdio bus cannot be probed, so the PHY cannot be found and the network device does not work. ``` stmmaceth ff702000.ethernet eth0: __stmmac_open: Cannot attach to PHY (error: -19) ``` To probe the mdio bus, add "snps,dwmac-mdio" as the compatible string of the mdio bus. Also, the PHY on this board is connected at address 4, so change the PHY address to 4. Cc: stable@vger.kernel.org # 6.3+ Signed-off-by: Nobuhiro Iwamatsu Signed-off-by: Dinh Nguyen --- arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts b/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts index ce0d6514eeb571..e4794ccb8e413f 100644 --- a/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts +++ b/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts @@ -66,8 +66,10 @@ mdio0 { #address-cells = <1>; #size-cells = <0>; - phy0: ethernet-phy@0 { - reg = <0>; + compatible = "snps,dwmac-mdio"; + + phy0: ethernet-phy@4 { + reg = <4>; rxd0-skew-ps = <0>; rxd1-skew-ps = <0>; rxd2-skew-ps = <0>; From 68e1e908cb7682db9fb7f79907f9352435a81c0f Mon Sep 17 00:00:00 2001 From: Neill Kapron Date: Thu, 28 Aug 2025 17:03:15 +0000 Subject: [PATCH 1006/3206] selinux: enable per-file labeling for functionfs This patch adds support for genfscon per-file labeling of functionfs files as well as support for userspace to apply labels after new functionfs endpoints are created. This allows for separate labels and therefore access control on a per-endpoint basis. An example use case would be for the default endpoint EP0 used as a restricted control endpoint, and additional usb endpoints to be used by other more permissive domains.
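From userspace, once policy permits it, relabeling an endpoint could look like the sketch below; the mount path and target context are invented for illustration and are not defined by this patch.

```c
#include <selinux/selinux.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical endpoint node and hypothetical target context. */
	if (setfilecon("/dev/usb-ffs/ffs/ep1",
		       "u:object_r:ffs_data_ep:s0") < 0) {
		perror("setfilecon");
		return 1;
	}
	return 0;
}
```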
It should be noted that if there are multiple functionfs mounts on a system, genfs file labels will apply to all mounts, and therefore will not likely be as useful as the userspace relabeling portion of this patch - the addition to selinux_is_genfs_special_handling(). This patch introduces the functionfs_seclabel policycap to maintain existing functionfs genfscon behavior unless explicitly enabled. Signed-off-by: Neill Kapron Acked-by: Stephen Smalley [PM: trim changelog, apply boolean logic fixup] Signed-off-by: Paul Moore --- security/selinux/hooks.c | 8 ++++++-- security/selinux/include/policycap.h | 1 + security/selinux/include/policycap_names.h | 1 + security/selinux/include/security.h | 6 ++++++ 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index e474cd7398effd..0e47b4bb8d4082 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -476,7 +476,9 @@ static int selinux_is_genfs_special_handling(struct super_block *sb) !strcmp(sb->s_type->name, "rootfs") || (selinux_policycap_cgroupseclabel() && (!strcmp(sb->s_type->name, "cgroup") || - !strcmp(sb->s_type->name, "cgroup2"))); + !strcmp(sb->s_type->name, "cgroup2"))) || + (selinux_policycap_functionfs_seclabel() && + !strcmp(sb->s_type->name, "functionfs")); } static int selinux_is_sblabel_mnt(struct super_block *sb) @@ -741,7 +743,9 @@ static int selinux_set_mnt_opts(struct super_block *sb, !strcmp(sb->s_type->name, "binder") || !strcmp(sb->s_type->name, "bpf") || !strcmp(sb->s_type->name, "pstore") || - !strcmp(sb->s_type->name, "securityfs")) + !strcmp(sb->s_type->name, "securityfs") || + (selinux_policycap_functionfs_seclabel() && + !strcmp(sb->s_type->name, "functionfs"))) sbsec->flags |= SE_SBGENFS; if (!strcmp(sb->s_type->name, "sysfs") || diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h index 7405154e6c42c1..135a969f873cad 100644 --- a/security/selinux/include/policycap.h +++ b/security/selinux/include/policycap.h @@ -17,6 +17,7 @@ enum { POLICYDB_CAP_NETLINK_XPERM, POLICYDB_CAP_NETIF_WILDCARD, POLICYDB_CAP_GENFS_SECLABEL_WILDCARD, + POLICYDB_CAP_FUNCTIONFS_SECLABEL, __POLICYDB_CAP_MAX }; #define POLICYDB_CAP_MAX (__POLICYDB_CAP_MAX - 1) diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h index d8962fcf2ff900..ff888288765171 100644 --- a/security/selinux/include/policycap_names.h +++ b/security/selinux/include/policycap_names.h @@ -20,6 +20,7 @@ const char *const selinux_policycap_names[__POLICYDB_CAP_MAX] = { "netlink_xperm", "netif_wildcard", "genfs_seclabel_wildcard", + "functionfs_seclabel", }; /* clang-format on */ diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 7f19972f7922e9..0f954a40d3fc74 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -203,6 +203,12 @@ static inline bool selinux_policycap_netlink_xperm(void) selinux_state.policycap[POLICYDB_CAP_NETLINK_XPERM]); } +static inline bool selinux_policycap_functionfs_seclabel(void) +{ + return READ_ONCE( + selinux_state.policycap[POLICYDB_CAP_FUNCTIONFS_SECLABEL]); +} + struct selinux_policy_convert_data; struct selinux_load_state { From 41194ee115685ec2dbbb80e20d1090cc3a4c64d8 Mon Sep 17 00:00:00 2001 From: Marcin Juszkiewicz Date: Fri, 5 Sep 2025 21:01:51 +0200 Subject: [PATCH 1007/3206] arm64: dts: rockchip: Add vcc supply for SPI Flash on NanoPC-T6 FriendlyELEC NanoPC-T6 LTS schematics shows VCC_1V8_S3 being 
used to power SPI NOR chip. This fixes the following kernel message: spi-nor spi5.0: supply vcc not found, using dummy regulator Signed-off-by: Marcin Juszkiewicz Link: https://lore.kernel.org/r/20250905-nanopc-t6-spi-nor-v2-1-098b476d9509@juszkiewicz.com.pl Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi index 3d8b6f0c554188..69833a0a94d0f1 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi @@ -731,6 +731,7 @@ spi-max-frequency = <104000000>; spi-rx-bus-width = <4>; spi-tx-bus-width = <1>; + vcc-supply = <&vcc_1v8_s3>; }; }; From 0f860eef417df93eb0ae70bbfa8d26cb7e29244d Mon Sep 17 00:00:00 2001 From: Jimmy Hon Date: Thu, 4 Sep 2025 03:01:50 +0000 Subject: [PATCH 1008/3206] arm64: dts: rockchip: Fix the headphone detection on the orangepi 5 The logic of the headphone detect pin seems to be inverted, with this change headphones actually output sound when plugged in. Does not need workaround of using pin-switches to enable output. Verified by checking /sys/kernel/debug/gpio. Fixes: ae46756faff8 ("arm64: dts: rockchip: analog audio on Orange Pi 5") Signed-off-by: Jimmy Hon Link: https://lore.kernel.org/r/20250904030150.986042-1-honyuenkwun@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi index 4fedc50cce8c86..11940c77f2bd01 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi @@ -42,9 +42,8 @@ simple-audio-card,bitclock-master = <&masterdai>; simple-audio-card,format = "i2s"; simple-audio-card,frame-master = <&masterdai>; - simple-audio-card,hp-det-gpios = <&gpio1 RK_PD5 GPIO_ACTIVE_LOW>; + simple-audio-card,hp-det-gpios = <&gpio1 RK_PD5 GPIO_ACTIVE_HIGH>; simple-audio-card,mclk-fs = <256>; - simple-audio-card,pin-switches = "Headphones"; simple-audio-card,routing = "Headphones", "LOUT1", "Headphones", "ROUT1", From 8a7c11af8e59e66d1e962fa863a77e59ef293ea7 Mon Sep 17 00:00:00 2001 From: Shankari Anand Date: Tue, 19 Aug 2025 00:40:34 +0530 Subject: [PATCH 1009/3206] rust: sync: Update ARef and AlwaysRefCounted imports from sync::aref Update the in-file reference of sync/aref.rs to import `ARef` and `AlwaysRefCounted` from `sync::aref` instead of `types`. This aligns with the ongoing effort to move `ARef` and `AlwaysRefCounted` to sync. 
Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/linux/issues/1173 Signed-off-by: Shankari Anand Reviewed-by: Benno Lossin Signed-off-by: Miguel Ojeda --- rust/kernel/sync/aref.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/sync/aref.rs b/rust/kernel/sync/aref.rs index dbd77bb68617ca..c53d42cb073485 100644 --- a/rust/kernel/sync/aref.rs +++ b/rust/kernel/sync/aref.rs @@ -97,7 +97,7 @@ impl ARef { /// /// ``` /// use core::ptr::NonNull; - /// use kernel::types::{ARef, AlwaysRefCounted}; + /// use kernel::sync::aref::{ARef, AlwaysRefCounted}; /// /// struct Empty {} /// From 208d7f788e84e80992d7b1c82ff17b620eb1371e Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 30 Jul 2025 15:07:14 +0200 Subject: [PATCH 1010/3206] rust: block: fix `srctree/` links This `srctree/` link pointed to a file with an underscore, but the header used a dash instead. Thus fix it. This cleans a future warning that will check our `srctree/` links. Cc: stable@vger.kernel.org Fixes: 3253aba3408a ("rust: block: introduce `kernel::block::mq` module") Reviewed-by: Daniel Almeida Signed-off-by: Miguel Ojeda --- rust/kernel/block/mq/gen_disk.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index cd54cd64ea8878..e1af0fa302a372 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -3,7 +3,7 @@ //! Generic disk abstraction. //! //! C header: [`include/linux/blkdev.h`](srctree/include/linux/blkdev.h) -//! C header: [`include/linux/blk_mq.h`](srctree/include/linux/blk_mq.h) +//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h) use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet}; use crate::{bindings, error::from_err_ptr, error::Result, sync::Arc}; From c2783c7cfefd55b1a5be781679cbee5191c0fd87 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 30 Jul 2025 15:07:15 +0200 Subject: [PATCH 1011/3206] rust: drm: fix `srctree/` links These `srctree/` links pointed inside `linux/`, but they are directly under `drm/`. Thus fix them. This cleans a future warning that will check our `srctree/` links. Cc: stable@vger.kernel.org Fixes: a98a73be9ee9 ("rust: drm: file: Add File abstraction") Fixes: c284d3e42338 ("rust: drm: gem: Add GEM object abstraction") Fixes: 07c9016085f9 ("rust: drm: add driver abstractions") Fixes: 1e4b8896c0f3 ("rust: drm: add device abstraction") Fixes: 9a69570682b1 ("rust: drm: ioctl: Add DRM ioctl abstraction") Acked-by: Danilo Krummrich Reviewed-by: Daniel Almeida Signed-off-by: Miguel Ojeda --- rust/kernel/drm/device.rs | 2 +- rust/kernel/drm/driver.rs | 2 +- rust/kernel/drm/file.rs | 2 +- rust/kernel/drm/gem/mod.rs | 2 +- rust/kernel/drm/ioctl.rs | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/rust/kernel/drm/device.rs b/rust/kernel/drm/device.rs index d29c477e89a87d..f8f1db5eeb0f6f 100644 --- a/rust/kernel/drm/device.rs +++ b/rust/kernel/drm/device.rs @@ -2,7 +2,7 @@ //! DRM device. //! -//! C header: [`include/linux/drm/drm_device.h`](srctree/include/linux/drm/drm_device.h) +//! C header: [`include/drm/drm_device.h`](srctree/include/drm/drm_device.h) use crate::{ alloc::allocator::Kmalloc, diff --git a/rust/kernel/drm/driver.rs b/rust/kernel/drm/driver.rs index fe7e8d06961aa5..d2dad77274c4ca 100644 --- a/rust/kernel/drm/driver.rs +++ b/rust/kernel/drm/driver.rs @@ -2,7 +2,7 @@ //! DRM driver core. //! -//! 
C header: [`include/linux/drm/drm_drv.h`](srctree/include/linux/drm/drm_drv.h) +//! C header: [`include/drm/drm_drv.h`](srctree/include/drm/drm_drv.h) use crate::{ bindings, device, devres, drm, diff --git a/rust/kernel/drm/file.rs b/rust/kernel/drm/file.rs index e8789c9110d654..8c46f8d519516a 100644 --- a/rust/kernel/drm/file.rs +++ b/rust/kernel/drm/file.rs @@ -2,7 +2,7 @@ //! DRM File objects. //! -//! C header: [`include/linux/drm/drm_file.h`](srctree/include/linux/drm/drm_file.h) +//! C header: [`include/drm/drm_file.h`](srctree/include/drm/drm_file.h) use crate::{bindings, drm, error::Result, prelude::*, types::Opaque}; use core::marker::PhantomData; diff --git a/rust/kernel/drm/gem/mod.rs b/rust/kernel/drm/gem/mod.rs index b71821cfb5eaa0..b9f3248876baa3 100644 --- a/rust/kernel/drm/gem/mod.rs +++ b/rust/kernel/drm/gem/mod.rs @@ -2,7 +2,7 @@ //! DRM GEM API //! -//! C header: [`include/linux/drm/drm_gem.h`](srctree/include/linux/drm/drm_gem.h) +//! C header: [`include/drm/drm_gem.h`](srctree/include/drm/drm_gem.h) use crate::{ alloc::flags::*, diff --git a/rust/kernel/drm/ioctl.rs b/rust/kernel/drm/ioctl.rs index fdec01c371687c..8431cdcd3ae0ef 100644 --- a/rust/kernel/drm/ioctl.rs +++ b/rust/kernel/drm/ioctl.rs @@ -2,7 +2,7 @@ //! DRM IOCTL definitions. //! -//! C header: [`include/linux/drm/drm_ioctl.h`](srctree/include/linux/drm/drm_ioctl.h) +//! C header: [`include/drm/drm_ioctl.h`](srctree/include/drm/drm_ioctl.h) use crate::ioctl; From 9e3bbbf5f316aee4b9508db70d558a3f5532dcc0 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 30 Jul 2025 15:07:16 +0200 Subject: [PATCH 1012/3206] rust: warn if `srctree/` links do not exist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `srctree/` links may point to nonexistent files, e.g. due to renames that missed to update the files or simply because of typos. Since they can be easily checked for validity, do so and print a warning in the file does not exist. This found the following cases already in-tree: warning: srctree/ link to include/linux/blk_mq.h does not exist warning: srctree/ link to include/linux/drm/drm_gem.h does not exist warning: srctree/ link to include/linux/drm/drm_drv.h does not exist warning: srctree/ link to include/linux/drm/drm_ioctl.h does not exist warning: srctree/ link to include/linux/drm/drm_file.h does not exist warning: srctree/ link to include/linux/drm/drm_device.h does not exist Inspired-by: Onur Özkan Link: https://lore.kernel.org/rust-for-linux/CANiq72=xCYA7z7_rxpzzKkkhJs6m7L_xEaLMuArVn3ZAcyeHdA@mail.gmail.com/ Reviewed-by: Onur Özkan Reviewed-by: Daniel Almeida Tested-by: Daniel Almeida Signed-off-by: Miguel Ojeda --- rust/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rust/Makefile b/rust/Makefile index 62a98c731cc6df..29c941024e6f90 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -98,6 +98,12 @@ quiet_cmd_rustdoc = RUSTDOC $(if $(rustdoc_host),H, ) $< # and then retouch the generated files. rustdoc: rustdoc-core rustdoc-macros rustdoc-compiler_builtins \ rustdoc-kernel rustdoc-pin_init + $(Q)grep -Ehro ' Date: Thu, 31 Jul 2025 23:41:15 +0300 Subject: [PATCH 1013/3206] rust: error: add C header links MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The error codes come from several headers. Thus, add the other header links. Signed-off-by: Onur Özkan Reviewed-by: Daniel Almeida [ Sorted headers. Added line breaks. Reworded commit message. 
- Miguel ] Signed-off-by: Miguel Ojeda --- rust/kernel/error.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs index 67da2d118e656f..db14da97672262 100644 --- a/rust/kernel/error.rs +++ b/rust/kernel/error.rs @@ -2,7 +2,9 @@ //! Kernel errors. //! -//! C header: [`include/uapi/asm-generic/errno-base.h`](srctree/include/uapi/asm-generic/errno-base.h) +//! C header: [`include/uapi/asm-generic/errno-base.h`](srctree/include/uapi/asm-generic/errno-base.h)\ +//! C header: [`include/uapi/asm-generic/errno.h`](srctree/include/uapi/asm-generic/errno.h)\ +//! C header: [`include/linux/errno.h`](srctree/include/linux/errno.h) use crate::{ alloc::{layout::LayoutError, AllocError}, From 72b04a8af7fb11e0f12e52985543d7e699fd4406 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 1 Aug 2025 18:17:52 +0200 Subject: [PATCH 1014/3206] rust: prelude: re-export `core::mem::{align,size}_of{,_val}` Rust 1.80.0 added: align_of align_of_val size_of size_of_val from `core::mem` to the prelude [1]. For similar reasons, and to minimize potential confusion when code may work in later versions but not in our current minimum, add it to our prelude too. Link: https://github.com/rust-lang/rust/pull/123168 [1] Link: https://lore.kernel.org/rust-for-linux/CANiq72kOLYR2A95o0ji2mDmEqOKh9e9_60zZKmgF=vZmsW6DRg@mail.gmail.com/ [2] Reviewed-by: Alice Ryhl Reviewed-by: Alexandre Courbot Reviewed-by: Andreas Hindborg Reviewed-by: Daniel Almeida Reviewed-by: Benno Lossin Signed-off-by: Miguel Ojeda --- rust/kernel/prelude.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rust/kernel/prelude.rs b/rust/kernel/prelude.rs index 25fe97aafd02a1..198d09a31449d8 100644 --- a/rust/kernel/prelude.rs +++ b/rust/kernel/prelude.rs @@ -12,7 +12,10 @@ //! ``` #[doc(no_inline)] -pub use core::pin::Pin; +pub use core::{ + mem::{align_of, align_of_val, size_of, size_of_val}, + pin::Pin, +}; pub use ::ffi::{ c_char, c_int, c_long, c_longlong, c_schar, c_short, c_uchar, c_uint, c_ulong, c_ulonglong, From 4710b47988fc028e1eb92b73192cb1ec2a508c65 Mon Sep 17 00:00:00 2001 From: Shankari Anand Date: Mon, 18 Aug 2025 18:55:51 +0530 Subject: [PATCH 1015/3206] rust: task: update ARef and AlwaysRefCounted imports from sync::aref Update call sites in `task.rs` to import `ARef` and `AlwaysRefCounted` from `sync::aref` instead of `types`. This aligns with the ongoing effort to move `ARef` and `AlwaysRefCounted` to sync. Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/linux/issues/1173 Signed-off-by: Shankari Anand Reviewed-by: Benno Lossin Signed-off-by: Miguel Ojeda --- rust/kernel/task.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 7d0935bc325cb8..49fad6de06740a 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -9,7 +9,8 @@ use crate::{ ffi::{c_int, c_long, c_uint}, mm::MmWithUser, pid_namespace::PidNamespace, - types::{ARef, NotThreadSafe, Opaque}, + sync::aref::ARef, + types::{NotThreadSafe, Opaque}, }; use core::{ cmp::{Eq, PartialEq}, @@ -76,7 +77,7 @@ macro_rules! current { /// incremented when creating `State` and decremented when it is dropped: /// /// ``` -/// use kernel::{task::Task, types::ARef}; +/// use kernel::{task::Task, sync::aref::ARef}; /// /// struct State { /// creator: ARef, @@ -347,7 +348,7 @@ impl CurrentTask { } // SAFETY: The type invariants guarantee that `Task` is always refcounted. 
-unsafe impl crate::types::AlwaysRefCounted for Task { +unsafe impl crate::sync::aref::AlwaysRefCounted for Task { #[inline] fn inc_ref(&self) { // SAFETY: The existence of a shared reference means that the refcount is nonzero. From f116af2eb51ed9df24911537fda32a033f1c58da Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Tue, 29 Jul 2025 00:15:43 +0000 Subject: [PATCH 1016/3206] hwmon: (k10temp) Add thermal support for AMD Family 1Ah-based models Add thermal info support for newer AMD Family 1Ah-based models. Signed-off-by: Avadhut Naik Link: https://lore.kernel.org/r/20250729001644.257645-1-avadhut.naik@amd.com Signed-off-by: Guenter Roeck --- drivers/hwmon/k10temp.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index babf2413d666f7..2f90a2e9ad4960 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -84,6 +84,13 @@ static DEFINE_MUTEX(nb_smu_ind_mutex); */ #define AMD_I3255_STR "3255" +/* + * PCI Device IDs for AMD's Family 1Ah-based SOCs. + * Defining locally as IDs are not shared. + */ +#define PCI_DEVICE_ID_AMD_1AH_M50H_DF_F3 0x12cb +#define PCI_DEVICE_ID_AMD_1AH_M90H_DF_F3 0x127b + struct k10temp_data { struct pci_dev *pdev; void (*read_htcreg)(struct pci_dev *pdev, u32 *regval); @@ -556,7 +563,9 @@ static const struct pci_device_id k10temp_id_table[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M50H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M90H_DF_F3) }, { PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; From 25b2c02e5b1f834b405a7f3fa6854115c8245d4c Mon Sep 17 00:00:00 2001 From: Lucas Yunkyu Lee Date: Mon, 28 Jul 2025 22:49:08 +0200 Subject: [PATCH 1017/3206] hwmon: (asus-ec-sensors) Add STRIX B850-I GAMING WIFI Add support for the STRIX B850-I GAMING WIFI Signed-off-by: Lucas Yunkyu Lee Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250728205133.15487-2-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index de2f2985f06f89..558755f9fdd59a 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -25,6 +25,7 @@ Supported boards: * ROG MAXIMUS Z690 FORMULA * ROG STRIX B550-E GAMING * ROG STRIX B550-I GAMING + * ROG STRIX B850-I GAMING WIFI * ROG STRIX X570-E GAMING * ROG STRIX X570-E GAMING WIFI II * ROG STRIX X570-F GAMING diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 4ac554731e98a7..4d8b887b8a554b 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -495,6 +495,13 @@ static const struct ec_board_info board_info_strix_b550_i_gaming = { .family = family_amd_500_series, }; +static const struct ec_board_info board_info_strix_b850_i_gaming_wifi = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_800_series, +}; + static const struct ec_board_info board_info_strix_x570_e_gaming = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | @@ -628,6 +635,8 @@ static const struct 
dmi_system_id dmi_table[] = { &board_info_strix_b550_e_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B550-I GAMING", &board_info_strix_b550_i_gaming), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B850-I GAMING WIFI", + &board_info_strix_b850_i_gaming_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-E GAMING", &board_info_strix_x570_e_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-E GAMING WIFI II", From 32afccb263e48164e4f1596bdf4a9a67468ba330 Mon Sep 17 00:00:00 2001 From: Dylan Tackoor Date: Mon, 28 Jul 2025 22:49:09 +0200 Subject: [PATCH 1018/3206] hwmon: (asus-ec-sensors) Add B650E-I Add support for ROG STRIX B650E-I GAMING WIFI. Signed-off-by: Dylan Tackoor Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250728205133.15487-3-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index 558755f9fdd59a..1e8274dba35fac 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -25,6 +25,7 @@ Supported boards: * ROG MAXIMUS Z690 FORMULA * ROG STRIX B550-E GAMING * ROG STRIX B550-I GAMING + * ROG STRIX B650E-I GAMING WIFI * ROG STRIX B850-I GAMING WIFI * ROG STRIX X570-E GAMING * ROG STRIX X570-E GAMING WIFI II diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 4d8b887b8a554b..0b19d148f65d15 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -495,6 +495,13 @@ static const struct ec_board_info board_info_strix_b550_i_gaming = { .family = family_amd_500_series, }; +static const struct ec_board_info board_info_strix_b650e_i_gaming = { + .sensors = SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR | + SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_IN_CPU_CORE, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_600_series, +}; + static const struct ec_board_info board_info_strix_b850_i_gaming_wifi = { .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | SENSOR_TEMP_MB | SENSOR_TEMP_VRM, @@ -635,6 +642,8 @@ static const struct dmi_system_id dmi_table[] = { &board_info_strix_b550_e_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B550-I GAMING", &board_info_strix_b550_i_gaming), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B650E-I GAMING WIFI", + &board_info_strix_b650e_i_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B850-I GAMING WIFI", &board_info_strix_b850_i_gaming_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-E GAMING", From 0183cb21b8a87e7499a0ef3b94a7df43e266fa44 Mon Sep 17 00:00:00 2001 From: Nicholas Flintham Date: Mon, 28 Jul 2025 22:49:10 +0200 Subject: [PATCH 1019/3206] hwmon: (asus-ec-sensors) Add ROG STRIX Z790E GAMING WIFI II Add support for the ROG STRIX Z790E GAMING WIFI II board. 
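For readers who do not know this driver, the EC_SENSOR() initializers in the hunk below are easier to follow with the parameters decoded. Judging from the entries elsewhere in this series (the argument names here are an inference, not taken from the driver source), each entry is (label, hwmon type, value width in bytes, EC bank, register index):

```
/* Annotated sketch of one entry matching the pattern in this patch:
 * a 2-byte fan-speed reading at EC bank 0x00, register index 0xb0. */
[ec_sensor_fan_cpu_opt] =
	EC_SENSOR("CPU_Opt", /* sysfs label           */
		  hwmon_fan, /* hwmon sensor type     */
		  2,         /* value width in bytes  */
		  0x00,      /* EC register bank      */
		  0xb0),     /* index within the bank */
```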
Signed-off-by: Nicholas Flintham Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250728205133.15487-4-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index 1e8274dba35fac..da9a00111d1c54 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -34,6 +34,7 @@ Supported boards: * ROG STRIX Z390-F GAMING * ROG STRIX Z490-F GAMING * ROG STRIX Z690-A GAMING WIFI D4 + * ROG STRIX Z790-E GAMING WIFI II * ROG ZENITH II EXTREME * ROG ZENITH II EXTREME ALPHA * TUF GAMING X670E PLUS diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 0b19d148f65d15..b9543eda2522be 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -56,6 +56,8 @@ static char *mutex_path_override; #define ASUS_HW_ACCESS_MUTEX_RMTW_ASMX "\\RMTW.ASMX" +#define ASUS_HW_ACCESS_MUTEX_SB_PC00_LPCB_SIO1_MUT0 "\\_SB.PC00.LPCB.SIO1.MUT0" + #define ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0 "\\_SB_.PCI0.SBRG.SIO1.MUT0" #define MAX_IDENTICAL_BOARD_VARIATIONS 3 @@ -168,7 +170,8 @@ enum board_family { family_amd_800_series, family_intel_300_series, family_intel_400_series, - family_intel_600_series + family_intel_600_series, + family_intel_700_series }; /* @@ -323,6 +326,14 @@ static const struct ec_sensor_info sensors_family_intel_600[] = { EC_SENSOR("Water_Block_In", hwmon_temp, 1, 0x01, 0x02), }; +static const struct ec_sensor_info sensors_family_intel_700[] = { + [ec_sensor_temp_t_sensor] = + EC_SENSOR("T_Sensor", hwmon_temp, 1, 0x01, 0x09), + [ec_sensor_temp_vrm] = EC_SENSOR("VRM", hwmon_temp, 1, 0x00, 0x33), + [ec_sensor_fan_cpu_opt] = + EC_SENSOR("CPU_Opt", hwmon_fan, 2, 0x00, 0xb0), +}; + /* Shortcuts for common combinations */ #define SENSOR_SET_TEMP_CHIPSET_CPU_MB \ (SENSOR_TEMP_CHIPSET | SENSOR_TEMP_CPU | SENSOR_TEMP_MB) @@ -568,6 +579,13 @@ static const struct ec_board_info board_info_strix_z690_a_gaming_wifi_d4 = { .family = family_intel_600_series, }; +static const struct ec_board_info board_info_strix_z790_e_gaming_wifi_ii = { + .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | + SENSOR_FAN_CPU_OPT, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PC00_LPCB_SIO1_MUT0, + .family = family_intel_700_series, +}; + static const struct ec_board_info board_info_zenith_ii_extreme = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | @@ -660,6 +678,8 @@ static const struct dmi_system_id dmi_table[] = { &board_info_strix_z490_f_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z690-A GAMING WIFI D4", &board_info_strix_z690_a_gaming_wifi_d4), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z790-E GAMING WIFI II", + &board_info_strix_z790_e_gaming_wifi_ii), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG ZENITH II EXTREME", &board_info_zenith_ii_extreme), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG ZENITH II EXTREME ALPHA", @@ -1142,6 +1162,9 @@ static int asus_ec_probe(struct platform_device *pdev) case family_intel_600_series: ec_data->sensors_info = sensors_family_intel_600; break; + case family_intel_700_series: + ec_data->sensors_info = sensors_family_intel_700; + break; default: dev_err(dev, "Unknown board family: %d", ec_data->board_info->family); From 6a9b2fb8411e1bc9345f048c8061cfa96f45151f Mon Sep 17 00:00:00 2001 
From: Lakshay Piplani Date: Mon, 28 Jul 2025 09:49:12 +0530 Subject: [PATCH 1020/3206] dt-bindings: hwmon: (lm75) Add binding for NXP P3T1750 Add "nxp,p3t1750" to the lm75 compatible list. Signed-off-by: Lakshay Piplani Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250728041913.3754236-1-lakshay.piplani@nxp.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/lm75.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/hwmon/lm75.yaml b/Documentation/devicetree/bindings/hwmon/lm75.yaml index c38255243f5729..ecdd09a032e512 100644 --- a/Documentation/devicetree/bindings/hwmon/lm75.yaml +++ b/Documentation/devicetree/bindings/hwmon/lm75.yaml @@ -28,6 +28,7 @@ properties: - maxim,max31725 - maxim,max31726 - maxim,mcp980x + - nxp,p3t1750 - nxp,p3t1755 - nxp,pct2075 - st,stds75 From 83b3354a4ad9a784a3335a2c303c693f521d1e47 Mon Sep 17 00:00:00 2001 From: Lakshay Piplani Date: Mon, 28 Jul 2025 09:49:13 +0530 Subject: [PATCH 1021/3206] hwmon: (lm75) Add NXP P3T1750 support Add support for lm75 compatible NXP P3T1750 temperature sensor. Signed-off-by: Lakshay Piplani Link: https://lore.kernel.org/r/20250728041913.3754236-2-lakshay.piplani@nxp.com [groeck: Fixed alphabetic order for new chip entries] Signed-off-by: Guenter Roeck --- Documentation/hwmon/lm75.rst | 6 ++++-- drivers/hwmon/lm75.c | 13 +++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/Documentation/hwmon/lm75.rst b/Documentation/hwmon/lm75.rst index c6a54bbca3c51c..908b3a9df06e82 100644 --- a/Documentation/hwmon/lm75.rst +++ b/Documentation/hwmon/lm75.rst @@ -121,9 +121,9 @@ Supported chips: https://www.ti.com/product/TMP1075 - * NXP LM75B, P3T1755, PCT2075 + * NXP LM75B, P3T1750, P3T1755, PCT2075 - Prefix: 'lm75b', 'p3t1755', 'pct2075' + Prefix: 'lm75b', 'p3t1750', 'p3t1755', 'pct2075' Addresses scanned: none @@ -131,6 +131,8 @@ Supported chips: https://www.nxp.com/docs/en/data-sheet/LM75B.pdf + https://www.nxp.com/docs/en/data-sheet/P3T1750DP.pdf + https://www.nxp.com/docs/en/data-sheet/P3T1755.pdf https://www.nxp.com/docs/en/data-sheet/PCT2075.pdf diff --git a/drivers/hwmon/lm75.c b/drivers/hwmon/lm75.c index 9b4875e2fd8d84..3c23b6e8e1bf5c 100644 --- a/drivers/hwmon/lm75.c +++ b/drivers/hwmon/lm75.c @@ -39,6 +39,7 @@ enum lm75_type { /* keep sorted in alphabetical order */ max6626, max31725, mcp980x, + p3t1750, p3t1755, pct2075, stds75, @@ -222,6 +223,13 @@ static const struct lm75_params device_params[] = { .default_resolution = 9, .default_sample_time = MSEC_PER_SEC / 18, }, + [p3t1750] = { + .clr_mask = 1 << 1 | 1 << 7, /* disable SMBAlert and one-shot */ + .default_resolution = 12, + .default_sample_time = 55, + .num_sample_times = 4, + .sample_times = (unsigned int []){ 28, 55, 110, 220 }, + }, [p3t1755] = { .clr_mask = 1 << 1 | 1 << 7, /* disable SMBAlert and one-shot */ .default_resolution = 12, @@ -805,6 +813,7 @@ static const struct i2c_device_id lm75_i2c_ids[] = { { "max31725", max31725, }, { "max31726", max31725, }, { "mcp980x", mcp980x, }, + { "p3t1750", p3t1750, }, { "p3t1755", p3t1755, }, { "pct2075", pct2075, }, { "stds75", stds75, }, @@ -916,6 +925,10 @@ static const struct of_device_id __maybe_unused lm75_of_match[] = { .compatible = "maxim,mcp980x", .data = (void *)mcp980x }, + { + .compatible = "nxp,p3t1750", + .data = (void *)p3t1750 + }, { .compatible = "nxp,p3t1755", .data = (void *)p3t1755 From fec40c4837a95ae511d9e1b709943b6c243db4d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Runar=20Gr=C3=B8n=C3=A5s?= Date: Fri, 
1 Aug 2025 21:50:08 +0200 Subject: [PATCH 1022/3206] hwmon: (asus-ec-sensors) Add X670E-I GAMING WIFI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for ROG STRIX X670E-I GAMING WIFI Signed-off-by: Runar Grønås Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250801195020.11106-1-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index da9a00111d1c54..49f6cac63d1946 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -31,6 +31,7 @@ Supported boards: * ROG STRIX X570-E GAMING WIFI II * ROG STRIX X570-F GAMING * ROG STRIX X570-I GAMING + * ROG STRIX X670E-I GAMING WIFI * ROG STRIX Z390-F GAMING * ROG STRIX Z490-F GAMING * ROG STRIX Z690-A GAMING WIFI D4 diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index b9543eda2522be..33c5fcb0a09ec9 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -553,6 +553,13 @@ static const struct ec_board_info board_info_strix_x570_i_gaming = { .family = family_amd_500_series, }; +static const struct ec_board_info board_info_strix_x670e_i_gaming_wifi = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_600_series, +}; + static const struct ec_board_info board_info_strix_z390_f_gaming = { .sensors = SENSOR_TEMP_CHIPSET | SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR | @@ -672,6 +679,8 @@ static const struct dmi_system_id dmi_table[] = { &board_info_strix_x570_f_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-I GAMING", &board_info_strix_x570_i_gaming), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X670E-I GAMING WIFI", + &board_info_strix_x670e_i_gaming_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z390-F GAMING", &board_info_strix_z390_f_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z490-F GAMING", From 15c8317366908c3b5a22573483a3dbbefba3fc38 Mon Sep 17 00:00:00 2001 From: Jamie Vickery Date: Sat, 2 Aug 2025 15:09:02 +0200 Subject: [PATCH 1023/3206] hwmon: (asus-ec-sensors) Add Z790-I GAMING WIFI Add support for the ROG STRIX Z790-I GAMING WIFI board Signed-off-by: Jamie Vickery Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250802130912.175543-1-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index 49f6cac63d1946..bedddb6bf9e1c4 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -36,6 +36,7 @@ Supported boards: * ROG STRIX Z490-F GAMING * ROG STRIX Z690-A GAMING WIFI D4 * ROG STRIX Z790-E GAMING WIFI II + * ROG STRIX Z790-I GAMING WIFI * ROG ZENITH II EXTREME * ROG ZENITH II EXTREME ALPHA * TUF GAMING X670E PLUS diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 33c5fcb0a09ec9..e2f7b8705cb13c 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -329,6 +329,8 @@ static const struct ec_sensor_info sensors_family_intel_600[] = { static const struct ec_sensor_info 
sensors_family_intel_700[] = { [ec_sensor_temp_t_sensor] = EC_SENSOR("T_Sensor", hwmon_temp, 1, 0x01, 0x09), + [ec_sensor_temp_t_sensor_2] = + EC_SENSOR("T_Sensor 2", hwmon_temp, 1, 0x01, 0x05), [ec_sensor_temp_vrm] = EC_SENSOR("VRM", hwmon_temp, 1, 0x00, 0x33), [ec_sensor_fan_cpu_opt] = EC_SENSOR("CPU_Opt", hwmon_fan, 2, 0x00, 0xb0), @@ -593,6 +595,13 @@ static const struct ec_board_info board_info_strix_z790_e_gaming_wifi_ii = { .family = family_intel_700_series, }; +static const struct ec_board_info board_info_strix_z790_i_gaming_wifi = { + .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_T_SENSOR_2 | + SENSOR_TEMP_VRM, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PC00_LPCB_SIO1_MUT0, + .family = family_intel_700_series, +}; + static const struct ec_board_info board_info_zenith_ii_extreme = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | @@ -689,6 +698,8 @@ static const struct dmi_system_id dmi_table[] = { &board_info_strix_z690_a_gaming_wifi_d4), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z790-E GAMING WIFI II", &board_info_strix_z790_e_gaming_wifi_ii), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z790-I GAMING WIFI", + &board_info_strix_z790_i_gaming_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG ZENITH II EXTREME", &board_info_zenith_ii_extreme), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG ZENITH II EXTREME ALPHA", From 3aa72cf03924d04c8d20f8b319df8f73550dd26c Mon Sep 17 00:00:00 2001 From: Eugene Shalygin Date: Tue, 5 Aug 2025 22:31:48 +0200 Subject: [PATCH 1024/3206] hwmon: (asus-ec-sensors) Narrow lock for X870E-CREATOR WIFI Use mutex from the SIO device rather than the global lock. Signed-off-by: Eugene Shalygin Fixes: 3e538b52157b ("hwmon: (asus-ec-sensors) add ProArt X870E-CREATOR WIFI") Link: https://lore.kernel.org/r/20250805203157.18446-1-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/asus-ec-sensors.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index e2f7b8705cb13c..83047c3263d35c 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -409,7 +409,7 @@ static const struct ec_board_info board_info_pro_art_x870E_creator_wifi = { .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | SENSOR_TEMP_MB | SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CPU_OPT, - .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0, .family = family_amd_800_series, }; From ceb562d109ca6ae3ca77d0367ddd374473acbb2d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 8 Aug 2025 10:38:07 -0700 Subject: [PATCH 1025/3206] MAINTAINERS: Mark coretemp driver as orphaned This maintainer's email no longer works. Remove it from MAINTAINERS. Also mark the driver as Orphaned for now. 
Signed-off-by: Dave Hansen Cc: Jean Delvare Cc: Guenter Roeck Cc: linux-hwmon@vger.kernel.org Link: https://lore.kernel.org/r/20250808173807.96D134EA@davehans-spike.ostc.intel.com Signed-off-by: Guenter Roeck --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cd7ff55b5d3217..b69c0271951c14 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6282,9 +6282,8 @@ F: tools/testing/selftests/cgroup/test_kmem.c F: tools/testing/selftests/cgroup/test_memcontrol.c CORETEMP HARDWARE MONITORING DRIVER -M: Fenghua Yu L: linux-hwmon@vger.kernel.org -S: Maintained +S: Orphan F: Documentation/hwmon/coretemp.rst F: drivers/hwmon/coretemp.c From 3331e5469219d92ea8e9a85fe4fbf679b6aac672 Mon Sep 17 00:00:00 2001 From: ChiShih Tsai Date: Thu, 7 Aug 2025 06:37:23 +0800 Subject: [PATCH 1026/3206] dt-bindings: hwmon: adm1275: add sq24905c support Add support for sq24905c Hot-Swap Controller and Digital Power Monitor. Acked-by: Krzysztof Kozlowski Signed-off-by: ChiShih Tsai Link: https://lore.kernel.org/r/20250806223724.1207-2-tomtsai764@gmail.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml b/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml index ddb72857c84641..d6a7517f2a50c4 100644 --- a/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml +++ b/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml @@ -18,6 +18,13 @@ description: | Datasheets: https://www.analog.com/en/products/adm1294.html + The SQ24905C is also a Hot-swap controller compatibility to the ADM1278, + the PMBUS_MFR_MODEL is MC09C + + Datasheets: + https://www.silergy.com/ + download/downloadFile?id=5669&type=product&ftype=note + properties: compatible: enum: @@ -30,6 +37,7 @@ properties: - adi,adm1281 - adi,adm1293 - adi,adm1294 + - silergy,mc09c reg: maxItems: 1 @@ -96,6 +104,7 @@ allOf: - adi,adm1281 - adi,adm1293 - adi,adm1294 + - silergy,mc09c then: properties: adi,volt-curr-sample-average: From 1ba272bfdf55a32c1240a629fd8c98eaaa3f8351 Mon Sep 17 00:00:00 2001 From: ChiShih Tsai Date: Thu, 7 Aug 2025 06:37:24 +0800 Subject: [PATCH 1027/3206] hwmon: (pmbus/adm1275) add sq24905c support Add support for sq24905c which is similar to adm1275 and other chips of the series. Signed-off-by: ChiShih Tsai Link: https://lore.kernel.org/r/20250806223724.1207-3-tomtsai764@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/adm1275.rst | 24 ++++++++++++++++-------- drivers/hwmon/pmbus/Kconfig | 3 ++- drivers/hwmon/pmbus/adm1275.c | 11 ++++++++--- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/Documentation/hwmon/adm1275.rst b/Documentation/hwmon/adm1275.rst index 57bd7a8505589f..cf923f20fa523b 100644 --- a/Documentation/hwmon/adm1275.rst +++ b/Documentation/hwmon/adm1275.rst @@ -67,6 +67,14 @@ Supported chips: Datasheet: https://www.analog.com/media/en/technical-documentation/data-sheets/ADM1293_1294.pdf + * Silergy SQ24905C + + Prefix: 'mc09c' + + Addresses scanned: - + + Datasheet: https://www.silergy.com/download/downloadFile?id=5669&type=product&ftype=note + Author: Guenter Roeck @@ -74,14 +82,14 @@ Description ----------- This driver supports hardware monitoring for Analog Devices ADM1075, ADM1272, -ADM1273, ADM1275, ADM1276, ADM1278, ADM1281, ADM1293, and ADM1294 Hot-Swap -Controller and Digital Power Monitors. 
+ADM1273, ADM1275, ADM1276, ADM1278, ADM1281, ADM1293, ADM1294, and SQ24905C +Hot-Swap Controller and Digital Power Monitors. -ADM1075, ADM1272, ADM1273, ADM1275, ADM1276, ADM1278, ADM1281, ADM1293, and -ADM1294 are hot-swap controllers that allow a circuit board to be removed from -or inserted into a live backplane. They also feature current and voltage -readback via an integrated 12 bit analog-to-digital converter (ADC), accessed -using a PMBus interface. +ADM1075, ADM1272, ADM1273, ADM1275, ADM1276, ADM1278, ADM1281, ADM1293, +ADM1294 and SQ24905C are hot-swap controllers that allow a circuit board to be +removed from or inserted into a live backplane. They also feature current and +voltage readback via an integrated 12 bit analog-to-digital converter (ADC), +accessed using a PMBus interface. The driver is a client driver to the core PMBus driver. Please see Documentation/hwmon/pmbus.rst for details on PMBus client drivers. @@ -160,5 +168,5 @@ temp1_highest Highest observed temperature. temp1_reset_history Write any value to reset history. Temperature attributes are supported on ADM1272, - ADM1273, ADM1278, and ADM1281. + ADM1273, ADM1278, ADM1281 and SQ24905C. ======================= ======================================================= diff --git a/drivers/hwmon/pmbus/Kconfig b/drivers/hwmon/pmbus/Kconfig index 55e492452ce811..77add0c6ee53a4 100644 --- a/drivers/hwmon/pmbus/Kconfig +++ b/drivers/hwmon/pmbus/Kconfig @@ -52,7 +52,8 @@ config SENSORS_ADM1275 help If you say yes here you get hardware monitoring support for Analog Devices ADM1075, ADM1272, ADM1273, ADM1275, ADM1276, ADM1278, ADM1281, - ADM1293, and ADM1294 Hot-Swap Controller and Digital Power Monitors. + ADM1293, ADM1294 and SQ24905C Hot-Swap Controller and + Digital Power Monitors. This driver can also be built as a module. If so, the module will be called adm1275. 
diff --git a/drivers/hwmon/pmbus/adm1275.c b/drivers/hwmon/pmbus/adm1275.c index 7d175baa5de2fb..bc2a6a07dc3e2e 100644 --- a/drivers/hwmon/pmbus/adm1275.c +++ b/drivers/hwmon/pmbus/adm1275.c @@ -18,7 +18,8 @@ #include #include "pmbus.h" -enum chips { adm1075, adm1272, adm1273, adm1275, adm1276, adm1278, adm1281, adm1293, adm1294 }; +enum chips { adm1075, adm1272, adm1273, adm1275, adm1276, adm1278, adm1281, + adm1293, adm1294, sq24905c }; #define ADM1275_MFR_STATUS_IOUT_WARN2 BIT(0) #define ADM1293_MFR_STATUS_VAUX_UV_WARN BIT(5) @@ -486,6 +487,7 @@ static const struct i2c_device_id adm1275_id[] = { { "adm1281", adm1281 }, { "adm1293", adm1293 }, { "adm1294", adm1294 }, + { "mc09c", sq24905c }, { } }; MODULE_DEVICE_TABLE(i2c, adm1275_id); @@ -532,7 +534,8 @@ static int adm1275_probe(struct i2c_client *client) dev_err(&client->dev, "Failed to read Manufacturer ID\n"); return ret; } - if (ret != 3 || strncmp(block_buffer, "ADI", 3)) { + if ((ret != 3 || strncmp(block_buffer, "ADI", 3)) && + (ret != 2 || strncmp(block_buffer, "SY", 2))) { dev_err(&client->dev, "Unsupported Manufacturer ID\n"); return -ENODEV; } @@ -558,7 +561,8 @@ static int adm1275_probe(struct i2c_client *client) if (mid->driver_data == adm1272 || mid->driver_data == adm1273 || mid->driver_data == adm1278 || mid->driver_data == adm1281 || - mid->driver_data == adm1293 || mid->driver_data == adm1294) + mid->driver_data == adm1293 || mid->driver_data == adm1294 || + mid->driver_data == sq24905c) config_read_fn = i2c_smbus_read_word_data; else config_read_fn = i2c_smbus_read_byte_data; @@ -708,6 +712,7 @@ static int adm1275_probe(struct i2c_client *client) break; case adm1278: case adm1281: + case sq24905c: data->have_vout = true; data->have_pin_max = true; data->have_temp_max = true; From 71e5262374e7ccc7e692fc005273f1fe2f74830e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 27 Aug 2025 11:13:44 +0200 Subject: [PATCH 1028/3206] hwmon: Remove Jean Delvare from maintainers I haven't been active in maintaining the hwmon subsystem in the last decade, so I think it's about time to admit that I do not have the time for this duty and update the MAINTAINERS file to reflect that. I would like to thank Guenter Roeck for taking over and doing an excellent work for so many years. Signed-off-by: Jean Delvare Link: https://lore.kernel.org/r/20250827111344.0debba2a@endymion Signed-off-by: Guenter Roeck --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b69c0271951c14..b392d34deb6272 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10717,7 +10717,6 @@ W: http://www.kernel.org/pub/linux/kernel/people/fseidel/hdaps/ F: drivers/platform/x86/hdaps.c HARDWARE MONITORING -M: Jean Delvare M: Guenter Roeck L: linux-hwmon@vger.kernel.org S: Maintained From a6461d2039fac16e7965bf91742f592986803194 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Wed, 20 Aug 2025 21:15:08 +0800 Subject: [PATCH 1029/3206] hwmon: (ltc4282) remove the use of dev_err_probe() Logging messages that show some type of "out of memory" error are generally unnecessary as there is a generic message and a stack dump done by the memory subsystem. These messages generally increase kernel size without much added value[1]. The dev_err_probe() doesn't do anything when error is '-ENOMEM'. Therefore, remove the useless call to dev_err_probe(), and just return the value instead. 
[1]: https://lore.kernel.org/lkml/1402419340.30479.18.camel@joe-AO725/ Signed-off-by: Liao Yuanhong Link: https://lore.kernel.org/r/20250820131509.502007-1-liaoyuanhong@vivo.com Signed-off-by: Guenter Roeck --- drivers/hwmon/ltc4282.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/hwmon/ltc4282.c b/drivers/hwmon/ltc4282.c index dbb30abcd343f3..1d664a2d7b3cb9 100644 --- a/drivers/hwmon/ltc4282.c +++ b/drivers/hwmon/ltc4282.c @@ -1693,8 +1693,7 @@ static int ltc4282_probe(struct i2c_client *i2c) st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL); if (!st) - return dev_err_probe(dev, -ENOMEM, - "Failed to allocate memory\n"); + return -ENOMEM; st->map = devm_regmap_init_i2c(i2c, <c4282_regmap_config); if (IS_ERR(st->map)) From fd1a9a68e64fc41bffd8bfbf987a11c53dda4cf2 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Thu, 14 Aug 2025 10:04:41 +0200 Subject: [PATCH 1030/3206] dt-bindings: hwmon: convert lantiq-cputemp to yaml Convert the Lantiq cpu temperature sensor bindings to yaml format. Signed-off-by: Aleksander Jan Bajkowski Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250814080708.3054732-1-olek2@wp.pl Signed-off-by: Guenter Roeck --- .../bindings/hwmon/lantiq,cputemp.yaml | 30 +++++++++++++++++++ .../devicetree/bindings/hwmon/ltq-cputemp.txt | 10 ------- 2 files changed, 30 insertions(+), 10 deletions(-) create mode 100644 Documentation/devicetree/bindings/hwmon/lantiq,cputemp.yaml delete mode 100644 Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt diff --git a/Documentation/devicetree/bindings/hwmon/lantiq,cputemp.yaml b/Documentation/devicetree/bindings/hwmon/lantiq,cputemp.yaml new file mode 100644 index 00000000000000..9419b481ff35b1 --- /dev/null +++ b/Documentation/devicetree/bindings/hwmon/lantiq,cputemp.yaml @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/hwmon/lantiq,cputemp.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Lantiq cpu temperature sensor + +maintainers: + - Florian Eckert + +properties: + compatible: + const: lantiq,cputemp + + reg: + maxItems: 1 + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + cputemp@103040 { + compatible = "lantiq,cputemp"; + reg = <0x103040 0x4>; + }; diff --git a/Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt b/Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt deleted file mode 100644 index 473b34c876dd32..00000000000000 --- a/Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt +++ /dev/null @@ -1,10 +0,0 @@ -Lantiq cpu temperature sensor - -Requires node properties: -- compatible value : - "lantiq,cputemp" - -Example: - cputemp@0 { - compatible = "lantiq,cputemp"; - }; From e5d1e313d7b6272d6dfda983906d99f97ad9062b Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Sun, 24 Aug 2025 02:04:41 +0800 Subject: [PATCH 1031/3206] hwmon: (k10temp) Add device ID for Strix Halo The device ID of Strix Halo Data Fabric Function 3 has been in the tree since commit 0e640f0a47d8 ("x86/amd_nb: Add new PCI IDs for AMD family 0x1a"), but is somehow missing from k10temp_id_table. Add it so that it works out of the box. Tested on Beelink GTR9 Pro Mini PC. 
Signed-off-by: Rong Zhang Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250823180443.85512-1-i@rong.moe Signed-off-by: Guenter Roeck --- drivers/hwmon/k10temp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index 2f90a2e9ad4960..b98d5ec72c4ff1 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -565,6 +565,7 @@ static const struct pci_device_id k10temp_id_table[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M50H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M90H_DF_F3) }, { PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} From f86e7a69ca9a4a9c2147d2bbff4dd7e025f80713 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Aug 2025 21:07:28 +0200 Subject: [PATCH 1032/3206] hwmon: (sch56xx-common) don't print superfluous errors The watchdog core will handle error messages already. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250813190728.3682-2-wsa+renesas@sang-engineering.com Signed-off-by: Guenter Roeck --- drivers/hwmon/sch56xx-common.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/hwmon/sch56xx-common.c b/drivers/hwmon/sch56xx-common.c index 71941b1bb57328..98e075e54e9dee 100644 --- a/drivers/hwmon/sch56xx-common.c +++ b/drivers/hwmon/sch56xx-common.c @@ -544,10 +544,8 @@ void sch56xx_watchdog_register(struct device *parent, u16 addr, u32 revision, watchdog_set_drvdata(&data->wddev, data); err = devm_watchdog_register_device(parent, &data->wddev); - if (err) { - pr_err("Registering watchdog chardev: %d\n", err); + if (err) devm_kfree(parent, data); - } } EXPORT_SYMBOL(sch56xx_watchdog_register); From 43c056ac85b60232861005765153707f1b0354b6 Mon Sep 17 00:00:00 2001 From: David Ober Date: Thu, 7 Aug 2025 06:32:28 -0400 Subject: [PATCH 1033/3206] hwmon: (lenovo-ec-sensors) Update P8 supprt This fixes differences for the P8 system that was initially set to the same thermal values as the P7, also adds in the PSU sensor for all of the supported systems Signed-off-by: David Ober Signed-off-by: David Ober Link: https://lore.kernel.org/r/20250807103228.10465-1-dober6023@gmail.com [groeck: Update subject] Signed-off-by: Guenter Roeck --- drivers/hwmon/lenovo-ec-sensors.c | 34 +++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/drivers/hwmon/lenovo-ec-sensors.c b/drivers/hwmon/lenovo-ec-sensors.c index 143fb79713f7d3..8681bbf6665b1e 100644 --- a/drivers/hwmon/lenovo-ec-sensors.c +++ b/drivers/hwmon/lenovo-ec-sensors.c @@ -66,7 +66,7 @@ enum systems { LENOVO_P8, }; -static int px_temp_map[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +static int px_temp_map[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31, 32}; static const char * const lenovo_px_ec_temp_label[] = { "CPU1", @@ -84,9 +84,29 @@ static const char * const lenovo_px_ec_temp_label[] = { "PCI_Z3", "PCI_Z4", "AMB", + "PSU1", + "PSU2", }; -static int gen_temp_map[] = {0, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +static int p8_temp_map[] = {0, 1, 2, 8, 9, 13, 14, 15, 16, 17, 19, 20, 33}; + +static const char * const lenovo_p8_ec_temp_label[] = { + "CPU1", + "CPU_DIMM_BANK1", + "CPU_DIMM_BANK2", + "M2_Z2R", + "M2_Z3R", + "DIMM_RIGHT", + "DIMM_LEFT", + "PCI_Z1", + "PCI_Z2", + "PCI_Z3", + "AMB", + "REAR_VR", + "PSU", +}; + +static int gen_temp_map[] = {0, 2, 3, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 31}; static const char * const lenovo_gen_ec_temp_label[] = { "CPU1", @@ -101,6 +121,7 @@ static const char * const lenovo_gen_ec_temp_label[] = { "PCI_Z3", "PCI_Z4", "AMB", + "PSU", }; static int px_fan_map[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; @@ -293,6 +314,8 @@ static const struct hwmon_channel_info *lenovo_ec_hwmon_info_px[] = { HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL), HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MAX, @@ -327,6 +350,7 @@ static const struct hwmon_channel_info *lenovo_ec_hwmon_info_p8[] = { HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL), HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MAX, @@ -359,6 +383,7 @@ static const struct hwmon_channel_info *lenovo_ec_hwmon_info_p7[] = { HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL), HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MAX, @@ -388,6 +413,7 @@ static const struct hwmon_channel_info *lenovo_ec_hwmon_info_p5[] = { HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL), HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MAX, @@ -545,9 +571,9 @@ static int lenovo_ec_probe(struct platform_device *pdev) break; case 3: ec_data->fan_labels = p8_ec_fan_label; - ec_data->temp_labels = lenovo_gen_ec_temp_label; + ec_data->temp_labels = lenovo_p8_ec_temp_label; ec_data->fan_map = p8_fan_map; - ec_data->temp_map = gen_temp_map; + ec_data->temp_map = p8_temp_map; lenovo_ec_chip_info.info = lenovo_ec_hwmon_info_p8; break; default: From d9d61f1da35038793156c04bb13f0a1350709121 Mon Sep 17 00:00:00 2001 From: Chuande Chen Date: Thu, 14 Aug 2025 13:39:40 +0800 Subject: [PATCH 1034/3206] hwmon: (sbtsi_temp) AMD CPU extended temperature range support Many AMD CPUs can support this feature now. We would get a wrong CPU DIE temperature if don't consider this. In low-temperature environments, the CPU die temperature can drop below zero. So many platforms would like to make extended temperature range as their default configuration. Default temperature range (0C to 255.875C). Extended temperature range (-49C to +206.875C). Ref Doc: AMD V3000 PPR (Doc ID #56558). Signed-off-by: Chuande Chen Link: https://lore.kernel.org/r/20250814053940.96764-1-chenchuande@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/sbtsi_temp.c | 46 +++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/drivers/hwmon/sbtsi_temp.c b/drivers/hwmon/sbtsi_temp.c index 3c839f56c46038..a6c439e376ff7c 100644 --- a/drivers/hwmon/sbtsi_temp.c +++ b/drivers/hwmon/sbtsi_temp.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * SB-TSI registers only support SMBus byte data access. "_INT" registers are @@ -29,8 +30,22 @@ #define SBTSI_REG_TEMP_HIGH_DEC 0x13 /* RW */ #define SBTSI_REG_TEMP_LOW_DEC 0x14 /* RW */ +/* + * Bit for reporting value with temperature measurement range. + * bit == 0: Use default temperature range (0C to 255.875C). + * bit == 1: Use extended temperature range (-49C to +206.875C). 
+ */ +#define SBTSI_CONFIG_EXT_RANGE_SHIFT 2 +/* + * ReadOrder bit specifies the reading order of integer and decimal part of + * CPU temperature for atomic reads. If bit == 0, reading integer part triggers + * latching of the decimal part, so integer part should be read first. + * If bit == 1, read order should be reversed. + */ #define SBTSI_CONFIG_READ_ORDER_SHIFT 5 +#define SBTSI_TEMP_EXT_RANGE_ADJ 49000 + #define SBTSI_TEMP_MIN 0 #define SBTSI_TEMP_MAX 255875 @@ -38,6 +53,8 @@ struct sbtsi_data { struct i2c_client *client; struct mutex lock; + bool ext_range_mode; + bool read_order; }; /* @@ -74,23 +91,11 @@ static int sbtsi_read(struct device *dev, enum hwmon_sensor_types type, { struct sbtsi_data *data = dev_get_drvdata(dev); s32 temp_int, temp_dec; - int err; switch (attr) { case hwmon_temp_input: - /* - * ReadOrder bit specifies the reading order of integer and - * decimal part of CPU temp for atomic reads. If bit == 0, - * reading integer part triggers latching of the decimal part, - * so integer part should be read first. If bit == 1, read - * order should be reversed. - */ - err = i2c_smbus_read_byte_data(data->client, SBTSI_REG_CONFIG); - if (err < 0) - return err; - mutex_lock(&data->lock); - if (err & BIT(SBTSI_CONFIG_READ_ORDER_SHIFT)) { + if (data->read_order) { temp_dec = i2c_smbus_read_byte_data(data->client, SBTSI_REG_TEMP_DEC); temp_int = i2c_smbus_read_byte_data(data->client, SBTSI_REG_TEMP_INT); } else { @@ -122,6 +127,8 @@ static int sbtsi_read(struct device *dev, enum hwmon_sensor_types type, return temp_dec; *val = sbtsi_reg_to_mc(temp_int, temp_dec); + if (data->ext_range_mode) + *val -= SBTSI_TEMP_EXT_RANGE_ADJ; return 0; } @@ -146,6 +153,8 @@ static int sbtsi_write(struct device *dev, enum hwmon_sensor_types type, return -EINVAL; } + if (data->ext_range_mode) + val += SBTSI_TEMP_EXT_RANGE_ADJ; val = clamp_val(val, SBTSI_TEMP_MIN, SBTSI_TEMP_MAX); sbtsi_mc_to_reg(val, &temp_int, &temp_dec); @@ -203,6 +212,7 @@ static int sbtsi_probe(struct i2c_client *client) struct device *dev = &client->dev; struct device *hwmon_dev; struct sbtsi_data *data; + int err; data = devm_kzalloc(dev, sizeof(struct sbtsi_data), GFP_KERNEL); if (!data) @@ -211,8 +221,14 @@ static int sbtsi_probe(struct i2c_client *client) data->client = client; mutex_init(&data->lock); - hwmon_dev = devm_hwmon_device_register_with_info(dev, client->name, data, &sbtsi_chip_info, - NULL); + err = i2c_smbus_read_byte_data(data->client, SBTSI_REG_CONFIG); + if (err < 0) + return err; + data->ext_range_mode = FIELD_GET(BIT(SBTSI_CONFIG_EXT_RANGE_SHIFT), err); + data->read_order = FIELD_GET(BIT(SBTSI_CONFIG_READ_ORDER_SHIFT), err); + + hwmon_dev = devm_hwmon_device_register_with_info(dev, client->name, data, + &sbtsi_chip_info, NULL); return PTR_ERR_OR_ZERO(hwmon_dev); } From 5473fccb809a4cb6777572d420c03446d96101b0 Mon Sep 17 00:00:00 2001 From: Grant Peltier Date: Wed, 27 Aug 2025 17:44:19 -0500 Subject: [PATCH 1035/3206] dt-bindings: hwmon: (pmbus/isl68137) add RAA228244 and RAA228246 support Add device type support for raa228244 and raa228246. 
Signed-off-by: Grant Peltier Acked-by: Conor Dooley Link: https://lore.kernel.org/r/c0c6e99e51b6fd4c5dbab02e02e4d81abe31f085.1756331945.git.grantpeltier93@gmail.com Signed-off-by: Guenter Roeck --- .../devicetree/bindings/hwmon/pmbus/isil,isl68137.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/hwmon/pmbus/isil,isl68137.yaml b/Documentation/devicetree/bindings/hwmon/pmbus/isil,isl68137.yaml index 3dc7f15484d287..ae23a05375cb83 100644 --- a/Documentation/devicetree/bindings/hwmon/pmbus/isil,isl68137.yaml +++ b/Documentation/devicetree/bindings/hwmon/pmbus/isil,isl68137.yaml @@ -54,6 +54,8 @@ properties: - renesas,raa228004 - renesas,raa228006 - renesas,raa228228 + - renesas,raa228244 + - renesas,raa228246 - renesas,raa229001 - renesas,raa229004 - renesas,raa229621 From 2190ad55a601da0bf78ae9dc14b26e82a1c4c959 Mon Sep 17 00:00:00 2001 From: Grant Peltier Date: Wed, 27 Aug 2025 17:41:36 -0500 Subject: [PATCH 1036/3206] hwmon: (pmbus/isl68137) add support for Renesas RAA228244 and RAA228246 The RAA228244 and RAA228246 are both recently released digital dual-output multiphase PWM controllers. Signed-off-by: Grant Peltier Link: https://lore.kernel.org/r/70bb08e291bd57722b1b0edf732cd0017714ef07.1756331945.git.grantpeltier93@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/isl68137.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/hwmon/pmbus/isl68137.c b/drivers/hwmon/pmbus/isl68137.c index c52c55d2e7f48d..52cf62e45a86f1 100644 --- a/drivers/hwmon/pmbus/isl68137.c +++ b/drivers/hwmon/pmbus/isl68137.c @@ -61,6 +61,8 @@ enum chips { raa228004, raa228006, raa228228, + raa228244, + raa228246, raa229001, raa229004, raa229621, @@ -464,6 +466,8 @@ static const struct i2c_device_id raa_dmpvr_id[] = { {"raa228004", raa_dmpvr2_hv}, {"raa228006", raa_dmpvr2_hv}, {"raa228228", raa_dmpvr2_2rail_nontc}, + {"raa228244", raa_dmpvr2_2rail_nontc}, + {"raa228246", raa_dmpvr2_2rail_nontc}, {"raa229001", raa_dmpvr2_2rail}, {"raa229004", raa_dmpvr2_2rail}, {"raa229621", raa_dmpvr2_2rail}, @@ -512,6 +516,8 @@ static const struct of_device_id isl68137_of_match[] = { { .compatible = "renesas,raa228004", .data = (void *)raa_dmpvr2_hv }, { .compatible = "renesas,raa228006", .data = (void *)raa_dmpvr2_hv }, { .compatible = "renesas,raa228228", .data = (void *)raa_dmpvr2_2rail_nontc }, + { .compatible = "renesas,raa228244", .data = (void *)raa_dmpvr2_2rail_nontc }, + { .compatible = "renesas,raa228246", .data = (void *)raa_dmpvr2_2rail_nontc }, { .compatible = "renesas,raa229001", .data = (void *)raa_dmpvr2_2rail }, { .compatible = "renesas,raa229004", .data = (void *)raa_dmpvr2_2rail }, { .compatible = "renesas,raa229621", .data = (void *)raa_dmpvr2_2rail }, From 3d5fcffcdf6266ddf070110eb7d8a6387fd9d291 Mon Sep 17 00:00:00 2001 From: Grant Peltier Date: Wed, 27 Aug 2025 17:42:47 -0500 Subject: [PATCH 1037/3206] docs: hwmon: add RAA228244 and RAA228246 info to isl68137 documentation The Renesas RAA228244 and RAA228246 are recently released digital multiphase controllers. 
Signed-off-by: Grant Peltier Link: https://lore.kernel.org/r/ddeaf4d2fd1f9c85302ee9b5bf16cfaecf9b89ad.1756331945.git.grantpeltier93@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/isl68137.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/hwmon/isl68137.rst b/Documentation/hwmon/isl68137.rst index 0e71b22047f897..5bc029c98383d3 100644 --- a/Documentation/hwmon/isl68137.rst +++ b/Documentation/hwmon/isl68137.rst @@ -374,6 +374,26 @@ Supported chips: Publicly available (after August 2020 launch) at the Renesas website + * Renesas RAA228244 + + Prefix: 'raa228244' + + Addresses scanned: - + + Datasheet: + + Provided by Renesas upon request and NDA + + * Renesas RAA228246 + + Prefix: 'raa228246' + + Addresses scanned: - + + Datasheet: + + Provided by Renesas upon request and NDA + * Renesas RAA229001 Prefix: 'raa229001' From 468a20df2ba62ab55feabc4e4306e70824bcb26c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 28 Aug 2025 13:17:29 -0700 Subject: [PATCH 1038/3206] hwmon: (coretemp) Replace x86_model checks with VFM ones Intel CPUs have been using Family 6 for a while. The Family-model checks in the coretemp driver implicitly assume Family 6. With the upcoming Family 18 and 19 models, some of these checks fall apart. While reading the temperature target MSR, cpu_has_tjmax() performs model checks only to determine if a device warning should be printed. Instead of expanding the checks, get rid of the function and print the warning once unconditionally if the MSR read fails. The checks aren't worth preventing a single line warning to dmesg. Update the rest of the x86_model checks with VFM ones to make them more robust. This automatically covers the upcoming Family 18 and 19 as well as any future extended families. Add a code comment to reflect that none of the CPUs in Family 5 or Family 15 set X86_FEATURE_DTHERM. The VFM checks do not impact these CPUs since the driver does not load on them. 
Signed-off-by: Dave Hansen Signed-off-by: Sohil Mehta Reviewed-by: Dave Hansen Link: https://lore.kernel.org/r/20250828201729.1145420-1-sohil.mehta@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 76 +++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 1b9203b20d7099..ad79db5a183e05 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -122,29 +122,29 @@ static const struct tjmax tjmax_table[] = { }; struct tjmax_model { - u8 model; - u8 mask; + u32 vfm; + u8 stepping_mask; int tjmax; }; #define ANY 0xff static const struct tjmax_model tjmax_model_table[] = { - { 0x1c, 10, 100000 }, /* D4xx, K4xx, N4xx, D5xx, K5xx, N5xx */ - { 0x1c, ANY, 90000 }, /* Z5xx, N2xx, possibly others - * Note: Also matches 230 and 330, - * which are covered by tjmax_table - */ - { 0x26, ANY, 90000 }, /* Atom Tunnel Creek (Exx), Lincroft (Z6xx) - * Note: TjMax for E6xxT is 110C, but CPU type - * is undetectable by software - */ - { 0x27, ANY, 90000 }, /* Atom Medfield (Z2460) */ - { 0x35, ANY, 90000 }, /* Atom Clover Trail/Cloverview (Z27x0) */ - { 0x36, ANY, 100000 }, /* Atom Cedar Trail/Cedarview (N2xxx, D2xxx) - * Also matches S12x0 (stepping 9), covered by - * PCI table - */ + { INTEL_ATOM_BONNELL, 10, 100000 }, /* D4xx, K4xx, N4xx, D5xx, K5xx, N5xx */ + { INTEL_ATOM_BONNELL, ANY, 90000 }, /* Z5xx, N2xx, possibly others + * Note: Also matches 230 and 330, + * which are covered by tjmax_table + */ + { INTEL_ATOM_BONNELL_MID, ANY, 90000 }, /* Atom Tunnel Creek (Exx), Lincroft (Z6xx) + * Note: TjMax for E6xxT is 110C, but CPU type + * is undetectable by software + */ + { INTEL_ATOM_SALTWELL_MID, ANY, 90000 }, /* Atom Medfield (Z2460) */ + { INTEL_ATOM_SALTWELL_TABLET, ANY, 90000 }, /* Atom Clover Trail/Cloverview (Z27x0) */ + { INTEL_ATOM_SALTWELL, ANY, 100000 }, /* Atom Cedar Trail/Cedarview (N2xxx, D2xxx) + * Also matches S12x0 (stepping 9), covered by + * PCI table + */ }; static bool is_pkg_temp_data(struct temp_data *tdata) @@ -180,6 +180,11 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) } pci_dev_put(host_bridge); + /* + * This is literally looking for "CPU XXX" in the model string. + * Not checking it against the model as well. Just purely a + * string search. 
+ */ for (i = 0; i < ARRAY_SIZE(tjmax_table); i++) { if (strstr(c->x86_model_id, tjmax_table[i].id)) return tjmax_table[i].tjmax; @@ -187,17 +192,18 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) for (i = 0; i < ARRAY_SIZE(tjmax_model_table); i++) { const struct tjmax_model *tm = &tjmax_model_table[i]; - if (c->x86_model == tm->model && - (tm->mask == ANY || c->x86_stepping == tm->mask)) + if (c->x86_vfm == tm->vfm && + (tm->stepping_mask == ANY || + tm->stepping_mask == c->x86_stepping)) return tm->tjmax; } /* Early chips have no MSR for TjMax */ - if (c->x86_model == 0xf && c->x86_stepping < 4) + if (c->x86_vfm == INTEL_CORE2_MEROM && c->x86_stepping < 4) usemsr_ee = 0; - if (c->x86_model > 0xe && usemsr_ee) { + if (c->x86_vfm > INTEL_CORE_YONAH && usemsr_ee) { u8 platform_id; /* @@ -211,7 +217,8 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) "Unable to access MSR 0x17, assuming desktop" " CPU\n"); usemsr_ee = 0; - } else if (c->x86_model < 0x17 && !(eax & 0x10000000)) { + } else if (c->x86_vfm < INTEL_CORE2_PENRYN && + !(eax & 0x10000000)) { /* * Trust bit 28 up to Penryn, I could not find any * documentation on that; if you happen to know @@ -226,7 +233,7 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) * Mobile Penryn CPU seems to be platform ID 7 or 5 * (guesswork) */ - if (c->x86_model == 0x17 && + if (c->x86_vfm == INTEL_CORE2_PENRYN && (platform_id == 5 || platform_id == 7)) { /* * If MSR EE bit is set, set it to 90 degrees C, @@ -258,18 +265,6 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) return tjmax; } -static bool cpu_has_tjmax(struct cpuinfo_x86 *c) -{ - u8 model = c->x86_model; - - return model > 0xe && - model != 0x1c && - model != 0x26 && - model != 0x27 && - model != 0x35 && - model != 0x36; -} - static int get_tjmax(struct temp_data *tdata, struct device *dev) { struct cpuinfo_x86 *c = &cpu_data(tdata->cpu); @@ -287,8 +282,7 @@ static int get_tjmax(struct temp_data *tdata, struct device *dev) */ err = rdmsr_safe_on_cpu(tdata->cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx); if (err) { - if (cpu_has_tjmax(c)) - dev_warn(dev, "Unable to read TjMax from CPU %u\n", tdata->cpu); + dev_warn_once(dev, "Unable to read TjMax from CPU %u\n", tdata->cpu); } else { val = (eax >> 16) & 0xff; if (val) @@ -460,7 +454,7 @@ static int chk_ucode_version(unsigned int cpu) * Readings might stop update when processor visited too deep sleep, * fixed for stepping D0 (6EC). */ - if (c->x86_model == 0xe && c->x86_stepping < 0xc && c->microcode < 0x39) { + if (c->x86_vfm == INTEL_CORE_YONAH && c->x86_stepping < 0xc && c->microcode < 0x39) { pr_err("Errata AE18 not fixed, update BIOS or microcode of the CPU!\n"); return -ENODEV; } @@ -580,7 +574,7 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, * MSR_IA32_TEMPERATURE_TARGET register. Atoms don't have the register * at all. */ - if (c->x86_model > 0xe && c->x86_model != 0x1c) + if (c->x86_vfm > INTEL_CORE_YONAH && c->x86_vfm != INTEL_ATOM_BONNELL) if (get_ttarget(tdata, &pdev->dev) >= 0) tdata->attr_size++; @@ -793,7 +787,9 @@ static int __init coretemp_init(void) /* * CPUID.06H.EAX[0] indicates whether the CPU has thermal * sensors. We check this bit only, all the early CPUs - * without thermal sensors will be filtered out. + * without thermal sensors will be filtered out. This + * includes all the Family 5 and Family 15 (Pentium 4) + * models, since they never set the CPUID bit. 
*/ if (!x86_match_cpu(coretemp_ids)) return -ENODEV; From d5c42cb4583c59e24262f0227fe9a0bf35a661e1 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sat, 30 Aug 2025 17:51:05 +0800 Subject: [PATCH 1039/3206] hwmon: (nct6775) Use int type to store negative error codes Change the 'ret' variable from u32 to int in nct6775_asuswmi_read() to store negative error codes or zero. Storing negative error codes in an unsigned type doesn't cause an issue at runtime but can be confusing. Additionally, assigning negative error codes to an unsigned type may trigger a GCC warning when the -Wsign-conversion flag is enabled. No effect on runtime. Signed-off-by: Qianfeng Rong Link: https://lore.kernel.org/r/20250830095105.3271-1-rongqianfeng@vivo.com Signed-off-by: Guenter Roeck --- drivers/hwmon/nct6775-platform.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/nct6775-platform.c b/drivers/hwmon/nct6775-platform.c index 0a040364b5127e..407945d2cd6a80 100644 --- a/drivers/hwmon/nct6775-platform.c +++ b/drivers/hwmon/nct6775-platform.c @@ -167,7 +167,8 @@ static inline int nct6775_asuswmi_write(u8 bank, u8 reg, u8 val) static inline int nct6775_asuswmi_read(u8 bank, u8 reg, u8 *val) { - u32 ret, tmp = 0; + u32 tmp = 0; + int ret; ret = nct6775_asuswmi_evaluate_method(ASUSWMI_METHODID_RHWM, bank, reg, 0, &tmp); From 9d4388466768c65691e6d554e89a0e57528c9edb Mon Sep 17 00:00:00 2001 From: Michael Tandy Date: Sat, 30 Aug 2025 14:00:59 +0200 Subject: [PATCH 1040/3206] hwmon: (asus-ec-sensors) add Pro WS WRX90E-SAGE SE Add support for the Pro WS WRX90E-SAGE SE. Signed-off-by: Michael Tandy Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250830120121.738223-1-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 42 +++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index bedddb6bf9e1c4..12c6b5b266bb6b 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -9,6 +9,7 @@ Supported boards: * PRIME X570-PRO * PRIME X670E-PRO WIFI * Pro WS X570-ACE + * Pro WS WRX90E-SAGE SE * ProArt X570-CREATOR WIFI * ProArt X670E-CREATOR WIFI * ProArt X870E-CREATOR WIFI diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 83047c3263d35c..09a751b44ee2e6 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -117,10 +117,18 @@ enum ec_sensors { ec_sensor_fan_cpu_opt, /* VRM heat sink fan [RPM] */ ec_sensor_fan_vrm_hs, + /* VRM east heat sink fan [RPM] */ + ec_sensor_fan_vrme_hs, + /* VRM west heat sink fan [RPM] */ + ec_sensor_fan_vrmw_hs, /* Chipset fan [RPM] */ ec_sensor_fan_chipset, /* Water flow sensor reading [RPM] */ ec_sensor_fan_water_flow, + /* USB4 fan [RPM] */ + ec_sensor_fan_usb4, + /* M.2 fan [RPM] */ + ec_sensor_fan_m2, /* CPU current [A] */ ec_sensor_curr_cpu, /* "Water_In" temperature sensor reading [℃] */ @@ -150,8 +158,12 @@ enum ec_sensors { #define SENSOR_IN_CPU_CORE BIT(ec_sensor_in_cpu_core) #define SENSOR_FAN_CPU_OPT BIT(ec_sensor_fan_cpu_opt) #define SENSOR_FAN_VRM_HS BIT(ec_sensor_fan_vrm_hs) +#define SENSOR_FAN_VRME_HS BIT(ec_sensor_fan_vrme_hs) +#define SENSOR_FAN_VRMW_HS BIT(ec_sensor_fan_vrmw_hs) #define SENSOR_FAN_CHIPSET BIT(ec_sensor_fan_chipset) #define SENSOR_FAN_WATER_FLOW BIT(ec_sensor_fan_water_flow) +#define SENSOR_FAN_USB4 BIT(ec_sensor_fan_usb4) +#define SENSOR_FAN_M2
BIT(ec_sensor_fan_m2) #define SENSOR_CURR_CPU BIT(ec_sensor_curr_cpu) #define SENSOR_TEMP_WATER_IN BIT(ec_sensor_temp_water_in) #define SENSOR_TEMP_WATER_OUT BIT(ec_sensor_temp_water_out) @@ -168,6 +180,7 @@ enum board_family { family_amd_500_series, family_amd_600_series, family_amd_800_series, + family_amd_wrx_90, family_intel_300_series, family_intel_400_series, family_intel_600_series, @@ -278,6 +291,21 @@ static const struct ec_sensor_info sensors_family_amd_800[] = { EC_SENSOR("CPU_Opt", hwmon_fan, 2, 0x00, 0xb0), }; +static const struct ec_sensor_info sensors_family_amd_wrx_90[] = { + [ec_sensor_temp_cpu_package] = + EC_SENSOR("CPU Package", hwmon_temp, 1, 0x00, 0x31), + [ec_sensor_fan_cpu_opt] = + EC_SENSOR("CPU_Opt", hwmon_fan, 2, 0x00, 0xb0), + [ec_sensor_fan_vrmw_hs] = + EC_SENSOR("VRMW HS", hwmon_fan, 2, 0x00, 0xb4), + [ec_sensor_fan_usb4] = EC_SENSOR("USB4", hwmon_fan, 2, 0x00, 0xb6), + [ec_sensor_fan_vrme_hs] = + EC_SENSOR("VRME HS", hwmon_fan, 2, 0x00, 0xbc), + [ec_sensor_fan_m2] = EC_SENSOR("M.2", hwmon_fan, 2, 0x00, 0xbe), + [ec_sensor_temp_t_sensor] = + EC_SENSOR("T_Sensor", hwmon_temp, 1, 0x01, 0x04), +}; + static const struct ec_sensor_info sensors_family_intel_300[] = { [ec_sensor_temp_chipset] = EC_SENSOR("Chipset", hwmon_temp, 1, 0x00, 0x3a), @@ -421,6 +449,15 @@ static const struct ec_board_info board_info_pro_art_b550_creator = { .family = family_amd_500_series, }; +static const struct ec_board_info board_info_pro_ws_wrx90e_sage_se = { + /* Board also has a nct6798 with 7 more fans and temperatures */ + .sensors = SENSOR_TEMP_CPU_PACKAGE | SENSOR_TEMP_T_SENSOR | + SENSOR_FAN_CPU_OPT | SENSOR_FAN_USB4 | SENSOR_FAN_M2 | + SENSOR_FAN_VRME_HS | SENSOR_FAN_VRMW_HS, + .mutex_path = ASUS_HW_ACCESS_MUTEX_RMTW_ASMX, + .family = family_amd_wrx_90, +}; + static const struct ec_board_info board_info_pro_ws_x570_ace = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CHIPSET | @@ -650,6 +687,8 @@ static const struct dmi_system_id dmi_table[] = { &board_info_pro_art_x870E_creator_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt B550-CREATOR", &board_info_pro_art_b550_creator), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("Pro WS WRX90E-SAGE SE", + &board_info_pro_ws_wrx90e_sage_se), DMI_EXACT_MATCH_ASUS_BOARD_NAME("Pro WS X570-ACE", &board_info_pro_ws_x570_ace), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII DARK HERO", @@ -1173,6 +1212,9 @@ static int asus_ec_probe(struct platform_device *pdev) case family_amd_800_series: ec_data->sensors_info = sensors_family_amd_800; break; + case family_amd_wrx_90: + ec_data->sensors_info = sensors_family_amd_wrx_90; + break; case family_intel_300_series: ec_data->sensors_info = sensors_family_intel_300; break; From aa52d636641bc9b38b033ab3dd1c6c4e059a863d Mon Sep 17 00:00:00 2001 From: Eugene Shalygin Date: Sat, 30 Aug 2025 15:12:15 +0200 Subject: [PATCH 1041/3206] hwmon: (asus-ec-sensors) refine config description Remove the outdated mention of the supported motherboard families and add a hint about which sensor readings are available via the module.
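The EC_SENSOR() entries in the WRX90E table above give each sensor a bank, a starting register, and a byte count; multi-byte values such as the 2-byte fan readings are stored most significant byte first. The driver uses the kernel's unaligned big-endian helpers for this; a hand-rolled equivalent, for illustration only:

        /* Sketch: assemble a big-endian EC sensor value of 1..4 bytes. */
        static u32 example_ec_value(const u8 *bytes, int size)
        {
                u32 value = 0;

                while (size--)
                        value = (value << 8) | *bytes++;
                return value;
        }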
Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250830131224.748481-1-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/Kconfig | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 9d28fcf7cd2a6f..c53c041c32ef70 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -2673,9 +2673,10 @@ config SENSORS_ASUS_EC depends on ACPI_EC help If you say yes here you get support for the ACPI embedded controller - hardware monitoring interface found in ASUS motherboards. The driver - currently supports B550/X570 boards, although other ASUS boards might - provide this monitoring interface as well. + hardware monitoring interface found in some ASUS motherboards. This + interface exposes sensors such as water flow and water temperature, + optional fans, and additional temperature sensors (T_Sensor, chipset + temperatures). This driver can also be built as a module. If so, the module will be called asus_ec_sensors. From 402dfbe7ef6bd5d13341a970c47282c7d60cf9a0 Mon Sep 17 00:00:00 2001 From: Debanil Chowdhury Date: Sun, 31 Aug 2025 04:45:54 +0000 Subject: [PATCH 1042/3206] hwmon: crps: Fix typos in crps.rst documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changed a misspelling in crps.rst documentation: "Critial" → "Critical". Found using the codespell tool. Signed-off-by: Debanil Chowdhury Link: https://lore.kernel.org/r/20250831045710.6009-1-kerneldev@debanilchowdhury.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/crps.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/hwmon/crps.rst b/Documentation/hwmon/crps.rst index 87380b4965580d..d42ea59d2dae16 100644 --- a/Documentation/hwmon/crps.rst +++ b/Documentation/hwmon/crps.rst @@ -43,7 +43,7 @@ curr1_label "iin" curr1_input Measured input current curr1_max Maximum input current curr1_max_alarm Input maximum current high alarm -curr1_crit Critial high input current +curr1_crit Critical high input current curr1_crit_alarm Input critical current high alarm curr1_rated_max Maximum rated input current @@ -51,7 +51,7 @@ curr2_label "iout1" curr2_input Measured output current curr2_max Maximum output current curr2_max_alarm Output maximum current high alarm -curr2_crit Critial high output current +curr2_crit Critical high output current curr2_crit_alarm Output critical current high alarm curr2_rated_max Maximum rated output current From fa1ab48bfe97c86278ddf7fb535bc597e64fd210 Mon Sep 17 00:00:00 2001 From: Flaviu Nistor Date: Mon, 25 Aug 2025 21:02:43 +0300 Subject: [PATCH 1043/3206] dt-bindings: hwmon: tmp102: Add label property Add support for an optional label property similar to other hwmon devices. On boards with multiple TMP102 sensors, this allows assigning a distinct name to each instance.
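Once the driver side lands (next patch), the property surfaces as a temp1_label sysfs attribute. A small userspace sketch of how it would be consumed; the hwmon0 path is a placeholder, since the index varies per system:

        #include <stdio.h>

        int main(void)
        {
                char label[64];
                FILE *f = fopen("/sys/class/hwmon/hwmon0/temp1_label", "r");

                if (f && fgets(label, sizeof(label), f))
                        printf("tmp102 label: %s", label);      /* e.g. "ambient" */
                if (f)
                        fclose(f);
                return 0;
        }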
Signed-off-by: Flaviu Nistor Reviewed-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250825180248.1943607-1-flaviu.nistor@gmail.com [groeck: Dropped unnecessary "|" after "description:"] Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml b/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml index 4c89448eba0dc0..96b2e4969f78a1 100644 --- a/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml +++ b/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml @@ -20,6 +20,10 @@ properties: reg: maxItems: 1 + label: + description: + A descriptive name for this channel, like "ambient" or "psu". + "#thermal-sensor-cells": const: 1 @@ -45,6 +49,7 @@ examples: reg = <0x48>; interrupt-parent = <&gpio7>; interrupts = <16 IRQ_TYPE_LEVEL_LOW>; + label = "somelabel"; vcc-supply = <&supply>; #thermal-sensor-cells = <1>; }; From d41f80bd43e6f613078060f63ad60f98e4773b56 Mon Sep 17 00:00:00 2001 From: Flaviu Nistor Date: Mon, 25 Aug 2025 21:02:44 +0300 Subject: [PATCH 1044/3206] hwmon: tmp102: Add support for label Add support for label sysfs attribute similar to other hwmon devices. This is particularly useful for systems with multiple sensors on the same board, where identifying individual sensors is much easier since labels can be defined via device tree. Signed-off-by: Flaviu Nistor Link: https://lore.kernel.org/r/20250825180248.1943607-2-flaviu.nistor@gmail.com [groeck: Fixed continuation line alignment] Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp102.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/tmp102.c b/drivers/hwmon/tmp102.c index a02daa496c9c44..376e0eac8cc1c9 100644 --- a/drivers/hwmon/tmp102.c +++ b/drivers/hwmon/tmp102.c @@ -53,6 +53,7 @@ #define CONVERSION_TIME_MS 35 /* in milli-seconds */ struct tmp102 { + const char *label; struct regmap *regmap; u16 config_orig; unsigned long ready_time; @@ -70,6 +71,16 @@ static inline u16 tmp102_mC_to_reg(int val) return (val * 128) / 1000; } +static int tmp102_read_string(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, const char **str) +{ + struct tmp102 *tmp102 = dev_get_drvdata(dev); + + *str = tmp102->label; + + return 0; +} + static int tmp102_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *temp) { @@ -128,12 +139,18 @@ static int tmp102_write(struct device *dev, enum hwmon_sensor_types type, static umode_t tmp102_is_visible(const void *data, enum hwmon_sensor_types type, u32 attr, int channel) { + const struct tmp102 *tmp102 = data; + if (type != hwmon_temp) return 0; switch (attr) { case hwmon_temp_input: return 0444; + case hwmon_temp_label: + if (tmp102->label) + return 0444; + return 0; case hwmon_temp_max_hyst: case hwmon_temp_max: return 0644; @@ -146,12 +163,13 @@ static const struct hwmon_channel_info * const tmp102_info[] = { HWMON_CHANNEL_INFO(chip, HWMON_C_REGISTER_TZ), HWMON_CHANNEL_INFO(temp, - HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MAX_HYST), + HWMON_T_INPUT | HWMON_T_LABEL | HWMON_T_MAX | HWMON_T_MAX_HYST), NULL }; static const struct hwmon_ops tmp102_hwmon_ops = { .is_visible = tmp102_is_visible, + .read_string = tmp102_read_string, .read = tmp102_read, .write = tmp102_write, }; @@ -213,6 +231,8 @@ static int tmp102_probe(struct i2c_client *client) if (!tmp102) return -ENOMEM; + of_property_read_string(dev->of_node, "label", &tmp102->label); + 
i2c_set_clientdata(client, tmp102); tmp102->regmap = devm_regmap_init_i2c(client, &tmp102_regmap_config); From 58639dfde0c29434e902a0efe213b35ede35604e Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Fri, 29 Aug 2025 15:05:09 +1200 Subject: [PATCH 1045/3206] dt-bindings: hwmon: ti,ina2xx: Add INA780 device Add a compatible string for the INA780 device. Signed-off-by: Chris Packham Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250829030512.1179998-2-chris.packham@alliedtelesis.co.nz Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml index fa68b99ef2e292..980ecba6d8117d 100644 --- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml +++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml @@ -32,6 +32,7 @@ properties: - ti,ina237 - ti,ina238 - ti,ina260 + - ti,ina780 reg: maxItems: 1 From f19617d1478aa595b5e2439847d34f3ac414c836 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 1 Sep 2025 08:59:02 -0700 Subject: [PATCH 1046/3206] dt-bindings: hwmon: ti,ina2xx: Update details for various chips ti,maximum-expected-current-microamp, ti,shunt-gain, and shunt-resistor properties are not supported on all chips described in this bindings file. Update the bindings accordingly. Cc: Chris Packham Signed-off-by: Guenter Roeck Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250901155902.2667063-1-linux@roeck-us.net Signed-off-by: Guenter Roeck --- .../devicetree/bindings/hwmon/ti,ina2xx.yaml | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml index 980ecba6d8117d..8b491be9c49dbc 100644 --- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml +++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml @@ -115,10 +115,39 @@ allOf: - ti,ina237 - ti,ina238 - ti,ina260 + - ti,ina780 then: properties: ti,maximum-expected-current-microamp: false + - if: + properties: + compatible: + contains: + enum: + - silergy,sy24655 + - ti,ina209 + - ti,ina219 + - ti,ina220 + - ti,ina226 + - ti,ina230 + - ti,ina231 + - ti,ina260 + - ti,ina780 + then: + properties: + ti,shunt-gain: false + + - if: + properties: + compatible: + contains: + enum: + - ti,ina780 + then: + properties: + shunt-resistor: false + unevaluatedProperties: false examples: From 34c61c198d06b83c1a9da88005e3f4eec85447da Mon Sep 17 00:00:00 2001 From: Tom Ingleby Date: Tue, 2 Sep 2025 20:17:56 -0700 Subject: [PATCH 1047/3206] hwmon: (asus-ec-sensors) add ROG STRIX Z690-E GAMING WIFI Add support for the ASUS ROG STRIX Z690-E GAMING WIFI Signed-off-by: Tom Ingleby Link: https://lore.kernel.org/r/20250903031800.4173-1-tom@ewsting.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index 12c6b5b266bb6b..e5052159ffa0da 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -36,6 +36,7 @@ Supported boards: * ROG STRIX Z390-F GAMING * ROG STRIX Z490-F GAMING * ROG STRIX Z690-A GAMING WIFI D4 + * ROG STRIX Z690-E GAMING WIFI * ROG STRIX Z790-E GAMING WIFI II * ROG STRIX Z790-I GAMING WIFI * ROG ZENITH II EXTREME diff --git 
a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 09a751b44ee2e6..728e21fddae891 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -625,6 +625,12 @@ static const struct ec_board_info board_info_strix_z690_a_gaming_wifi_d4 = { .family = family_intel_600_series, }; +static const struct ec_board_info board_info_strix_z690_e_gaming_wifi = { + .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM, + .mutex_path = ASUS_HW_ACCESS_MUTEX_RMTW_ASMX, + .family = family_intel_600_series, +}; + static const struct ec_board_info board_info_strix_z790_e_gaming_wifi_ii = { .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | SENSOR_FAN_CPU_OPT, @@ -735,6 +741,8 @@ static const struct dmi_system_id dmi_table[] = { &board_info_strix_z490_f_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z690-A GAMING WIFI D4", &board_info_strix_z690_a_gaming_wifi_d4), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z690-E GAMING WIFI", + &board_info_strix_z690_e_gaming_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z790-E GAMING WIFI II", &board_info_strix_z790_e_gaming_wifi_ii), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z790-I GAMING WIFI", From 5529bc1a2ff047ca94a64d91ec3ed52b01946837 Mon Sep 17 00:00:00 2001 From: Michael Tandy Date: Wed, 3 Sep 2025 20:47:36 +0200 Subject: [PATCH 1048/3206] hwmon: (asus-ec-sensors) sort declarations Sort all the declarations in the source file. Contributors are asked to insert new entries keeping alphabetical order, but the existing ones were not completely sorted. Signed-off-by: Michael Tandy Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250903184753.5876-1-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/asus-ec-sensors.c | 170 ++++++++++++++++---------------- 1 file changed, 85 insertions(+), 85 deletions(-) diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 728e21fddae891..ecb058e8755bd9 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -384,6 +384,52 @@ struct ec_board_info { enum board_family family; }; +static const struct ec_board_info board_info_crosshair_viii_dark_hero = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | + SENSOR_TEMP_T_SENSOR | + SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | + SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW | + SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, + .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, + .family = family_amd_500_series, +}; + +static const struct ec_board_info board_info_crosshair_viii_hero = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | + SENSOR_TEMP_T_SENSOR | + SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | + SENSOR_FAN_CPU_OPT | SENSOR_FAN_CHIPSET | + SENSOR_FAN_WATER_FLOW | SENSOR_CURR_CPU | + SENSOR_IN_CPU_CORE, + .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, + .family = family_amd_500_series, +}; + +static const struct ec_board_info board_info_crosshair_viii_impact = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | + SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | + SENSOR_FAN_CHIPSET | SENSOR_CURR_CPU | + SENSOR_IN_CPU_CORE, + .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, + .family = family_amd_500_series, +}; + +static const struct ec_board_info board_info_crosshair_x670e_gene = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_T_SENSOR | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_600_series, +}; + +static const struct ec_board_info board_info_crosshair_x670e_hero = { + .sensors = SENSOR_TEMP_CPU | 
SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM | + SENSOR_SET_TEMP_WATER, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_600_series, +}; + static const struct ec_board_info board_info_maximus_vi_hero = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | @@ -393,6 +439,22 @@ static const struct ec_board_info board_info_maximus_vi_hero = { .family = family_intel_300_series, }; +static const struct ec_board_info board_info_maximus_xi_hero = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | + SENSOR_TEMP_T_SENSOR | + SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | + SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW, + .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, + .family = family_intel_300_series, +}; + +static const struct ec_board_info board_info_maximus_z690_formula = { + .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | + SENSOR_SET_TEMP_WATER | SENSOR_FAN_WATER_FLOW, + .mutex_path = ASUS_HW_ACCESS_MUTEX_RMTW_ASMX, + .family = family_intel_600_series, +}; + static const struct ec_board_info board_info_prime_x470_pro = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | @@ -417,6 +479,14 @@ static const struct ec_board_info board_info_prime_x670e_pro_wifi = { .family = family_amd_600_series, }; +static const struct ec_board_info board_info_pro_art_b550_creator = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | + SENSOR_TEMP_T_SENSOR | + SENSOR_FAN_CPU_OPT, + .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, + .family = family_amd_500_series, +}; + static const struct ec_board_info board_info_pro_art_x570_creator_wifi = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CPU_OPT | @@ -441,14 +511,6 @@ static const struct ec_board_info board_info_pro_art_x870E_creator_wifi = { .family = family_amd_800_series, }; -static const struct ec_board_info board_info_pro_art_b550_creator = { - .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | - SENSOR_TEMP_T_SENSOR | - SENSOR_FAN_CPU_OPT, - .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, - .family = family_amd_500_series, -}; - static const struct ec_board_info board_info_pro_ws_wrx90e_sage_se = { /* Board also has a nct6798 with 7 more fans and temperatures */ .sensors = SENSOR_TEMP_CPU_PACKAGE | SENSOR_TEMP_T_SENSOR | @@ -466,68 +528,6 @@ static const struct ec_board_info board_info_pro_ws_x570_ace = { .family = family_amd_500_series, }; -static const struct ec_board_info board_info_crosshair_x670e_hero = { - .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | - SENSOR_TEMP_MB | SENSOR_TEMP_VRM | - SENSOR_SET_TEMP_WATER, - .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, - .family = family_amd_600_series, -}; - -static const struct ec_board_info board_info_crosshair_x670e_gene = { - .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | - SENSOR_TEMP_T_SENSOR | - SENSOR_TEMP_MB | SENSOR_TEMP_VRM, - .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, - .family = family_amd_600_series, -}; - -static const struct ec_board_info board_info_crosshair_viii_dark_hero = { - .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | - SENSOR_TEMP_T_SENSOR | - SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | - SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW | - SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, - .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, - .family = family_amd_500_series, -}; - -static const struct ec_board_info board_info_crosshair_viii_hero = { - .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | - SENSOR_TEMP_T_SENSOR | - SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | - SENSOR_FAN_CPU_OPT | SENSOR_FAN_CHIPSET 
| - SENSOR_FAN_WATER_FLOW | SENSOR_CURR_CPU | - SENSOR_IN_CPU_CORE, - .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, - .family = family_amd_500_series, -}; - -static const struct ec_board_info board_info_maximus_xi_hero = { - .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | - SENSOR_TEMP_T_SENSOR | - SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | - SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW, - .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, - .family = family_intel_300_series, -}; - -static const struct ec_board_info board_info_maximus_z690_formula = { - .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | - SENSOR_SET_TEMP_WATER | SENSOR_FAN_WATER_FLOW, - .mutex_path = ASUS_HW_ACCESS_MUTEX_RMTW_ASMX, - .family = family_intel_600_series, -}; - -static const struct ec_board_info board_info_crosshair_viii_impact = { - .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | - SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | - SENSOR_FAN_CHIPSET | SENSOR_CURR_CPU | - SENSOR_IN_CPU_CORE, - .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, - .family = family_amd_500_series, -}; - static const struct ec_board_info board_info_strix_b550_e_gaming = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | @@ -645,6 +645,15 @@ static const struct ec_board_info board_info_strix_z790_i_gaming_wifi = { .family = family_intel_700_series, }; +static const struct ec_board_info board_info_tuf_gaming_x670e_plus = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM | + SENSOR_TEMP_WATER_IN | SENSOR_TEMP_WATER_OUT | + SENSOR_FAN_CPU_OPT, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_600_series, +}; + static const struct ec_board_info board_info_zenith_ii_extreme = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | @@ -657,15 +666,6 @@ static const struct ec_board_info board_info_zenith_ii_extreme = { .family = family_amd_500_series, }; -static const struct ec_board_info board_info_tuf_gaming_x670e_plus = { - .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | - SENSOR_TEMP_MB | SENSOR_TEMP_VRM | - SENSOR_TEMP_WATER_IN | SENSOR_TEMP_WATER_OUT | - SENSOR_FAN_CPU_OPT, - .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, - .family = family_amd_600_series, -}; - #define DMI_EXACT_MATCH_ASUS_BOARD_NAME(name, board_info) \ { \ .matches = { \ @@ -685,14 +685,14 @@ static const struct dmi_system_id dmi_table[] = { &board_info_prime_x570_pro), DMI_EXACT_MATCH_ASUS_BOARD_NAME("PRIME X670E-PRO WIFI", &board_info_prime_x670e_pro_wifi), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt B550-CREATOR", + &board_info_pro_art_b550_creator), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt X570-CREATOR WIFI", &board_info_pro_art_x570_creator_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt X670E-CREATOR WIFI", &board_info_pro_art_x670E_creator_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt X870E-CREATOR WIFI", &board_info_pro_art_x870E_creator_wifi), - DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt B550-CREATOR", - &board_info_pro_art_b550_creator), DMI_EXACT_MATCH_ASUS_BOARD_NAME("Pro WS WRX90E-SAGE SE", &board_info_pro_ws_wrx90e_sage_se), DMI_EXACT_MATCH_ASUS_BOARD_NAME("Pro WS X570-ACE", @@ -705,18 +705,18 @@ static const struct dmi_system_id dmi_table[] = { &board_info_crosshair_viii_hero), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII HERO (WI-FI)", &board_info_crosshair_viii_hero), - DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR X670E HERO", - &board_info_crosshair_x670e_hero), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII IMPACT", + 
&board_info_crosshair_viii_impact), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR X670E GENE", &board_info_crosshair_x670e_gene), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR X670E HERO", + &board_info_crosshair_x670e_hero), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG MAXIMUS XI HERO", &board_info_maximus_xi_hero), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG MAXIMUS XI HERO (WI-FI)", &board_info_maximus_xi_hero), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG MAXIMUS Z690 FORMULA", &board_info_maximus_z690_formula), - DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII IMPACT", - &board_info_crosshair_viii_impact), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B550-E GAMING", &board_info_strix_b550_e_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B550-I GAMING", From 8702c8f53d93929a804cd16015e229cb0770b4ec Mon Sep 17 00:00:00 2001 From: Eugene Shalygin Date: Wed, 3 Sep 2025 21:17:26 +0200 Subject: [PATCH 1049/3206] hwmon: (asus-ec-sensors) add PRIME Z270-A MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the PRIME Z270-A board. Tested-by: Jan Philipp Groß Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250903191736.14451-1-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 27 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index e5052159ffa0da..baf9eba5957c52 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -8,6 +8,7 @@ Supported boards: * PRIME X470-PRO * PRIME X570-PRO * PRIME X670E-PRO WIFI + * PRIME Z270-A * Pro WS X570-ACE * Pro WS WRX90E-SAGE SE * ProArt X570-CREATOR WIFI diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index ecb058e8755bd9..f580ff2e212f2f 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -60,6 +60,8 @@ static char *mutex_path_override; #define ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0 "\\_SB_.PCI0.SBRG.SIO1.MUT0" +#define ASUS_HW_ACCESS_MUTEX_SB_PCI0_LPCB_SIO1_MUT0 "\\_SB_.PCI0.LPCB.SIO1.MUT0" + #define MAX_IDENTICAL_BOARD_VARIATIONS 3 /* Moniker for the ACPI global lock (':' is not allowed in ASL identifiers) */ @@ -181,6 +183,7 @@ enum board_family { family_amd_600_series, family_amd_800_series, family_amd_wrx_90, + family_intel_200_series, family_intel_300_series, family_intel_400_series, family_intel_600_series, @@ -306,6 +309,18 @@ static const struct ec_sensor_info sensors_family_amd_wrx_90[] = { EC_SENSOR("T_Sensor", hwmon_temp, 1, 0x01, 0x04), }; +static const struct ec_sensor_info sensors_family_intel_200[] = { + [ec_sensor_temp_chipset] = + EC_SENSOR("Chipset", hwmon_temp, 1, 0x00, 0x3a), + [ec_sensor_temp_cpu] = EC_SENSOR("CPU", hwmon_temp, 1, 0x00, 0x3b), + [ec_sensor_temp_mb] = + EC_SENSOR("Motherboard", hwmon_temp, 1, 0x00, 0x3c), + [ec_sensor_temp_t_sensor] = + EC_SENSOR("T_Sensor", hwmon_temp, 1, 0x00, 0x3d), + [ec_sensor_fan_cpu_opt] = + EC_SENSOR("CPU_Opt", hwmon_fan, 2, 0x00, 0xbc), +}; + static const struct ec_sensor_info sensors_family_intel_300[] = { [ec_sensor_temp_chipset] = EC_SENSOR("Chipset", hwmon_temp, 1, 0x00, 0x3a), @@ -479,6 +494,13 @@ static const struct ec_board_info board_info_prime_x670e_pro_wifi = { .family = family_amd_600_series, }; +static const struct ec_board_info board_info_prime_z270_a = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | + 
SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CPU_OPT, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_LPCB_SIO1_MUT0, + .family = family_intel_200_series, +}; + static const struct ec_board_info board_info_pro_art_b550_creator = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | @@ -685,6 +707,8 @@ static const struct dmi_system_id dmi_table[] = { &board_info_prime_x570_pro), DMI_EXACT_MATCH_ASUS_BOARD_NAME("PRIME X670E-PRO WIFI", &board_info_prime_x670e_pro_wifi), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("PRIME Z270-A", + &board_info_prime_z270_a), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt B550-CREATOR", &board_info_pro_art_b550_creator), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt X570-CREATOR WIFI", @@ -1223,6 +1247,9 @@ static int asus_ec_probe(struct platform_device *pdev) case family_amd_wrx_90: ec_data->sensors_info = sensors_family_amd_wrx_90; break; + case family_intel_200_series: + ec_data->sensors_info = sensors_family_intel_200; + break; case family_intel_300_series: ec_data->sensors_info = sensors_family_intel_300; break; From bd48b5a4e8d3e1ac2c1ba1e18ba1350ea2c9eb10 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 4 Sep 2025 22:21:09 +0200 Subject: [PATCH 1050/3206] dt-bindings: hwmon: pwm-fan: Document after-shutdown fan settings Document the fan-shutdown-percent property, used to describe the fan RPM, in percent, to set during shutdown. This is used to keep the fan running at a fixed RPM after the kernel has shut down, which is useful on hardware that keeps heating itself even after the kernel has shut down, for example from some sort of management core. Signed-off-by: Marek Vasut Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250904202157.170600-1-marek.vasut+renesas@mailbox.org Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/pwm-fan.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml b/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml index 8b4ed5ee962fb5..a84cc3a4cfdcaf 100644 --- a/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml +++ b/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml @@ -31,6 +31,15 @@ properties: it must be self resetting edge interrupts. maxItems: 1 + fan-shutdown-percent: + description: + Fan RPM in percent set during shutdown. This is used to keep the fan + running at a fixed RPM after the kernel has shut down, which is useful + on hardware that keeps heating itself even after the kernel has shut + down, for example from some sort of management core. + minimum: 0 + maximum: 100 + fan-stop-to-start-percent: description: Minimum fan RPM in percent to start when stopped. From da0a3cc73a2bffe8faef6227a74e5a2de2e79a0c Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 4 Sep 2025 22:21:10 +0200 Subject: [PATCH 1051/3206] hwmon: (pwm-fan) Implement after-shutdown fan settings Add the fan-shutdown-percent property, used to describe the fan RPM, in percent, to set during shutdown. This is used to keep the fan running at a fixed RPM after the kernel has shut down, which is useful on hardware that keeps heating itself even after the kernel has shut down, for example from some sort of management core. The current behavior of pwm-fan is to unconditionally stop the fan on shutdown, which is not always the safe and correct thing to do, so let the hardware description include the expected behavior.
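The conversion in the diff below first clamps the percentage, then scales it onto the driver's 0..255 PWM range; 40 percent, for example, becomes (40 * 255) / 100 = 102. The same arithmetic as a standalone sketch:

        static u8 shutdown_percent_to_pwm(unsigned int percent)
        {
                if (percent > 100)      /* mirrors the driver's clamp() */
                        percent = 100;
                return (percent * 255) / 100;
        }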
Signed-off-by: Marek Vasut Tested-by: Wolfram Sang Link: https://lore.kernel.org/r/20250904202157.170600-2-marek.vasut+renesas@mailbox.org Signed-off-by: Guenter Roeck --- drivers/hwmon/pwm-fan.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/pwm-fan.c b/drivers/hwmon/pwm-fan.c index d0fe53451bdf8b..37269db2de84fb 100644 --- a/drivers/hwmon/pwm-fan.c +++ b/drivers/hwmon/pwm-fan.c @@ -64,6 +64,7 @@ struct pwm_fan_ctx { u64 pwm_duty_cycle_from_stopped; u32 pwm_usec_from_stopped; + u8 pwm_shutdown; }; /* This handler assumes self resetting edge triggered interrupt. */ @@ -484,9 +485,14 @@ static void pwm_fan_cleanup(void *__ctx) struct pwm_fan_ctx *ctx = __ctx; timer_delete_sync(&ctx->rpm_timer); - /* Switch off everything */ - ctx->enable_mode = pwm_disable_reg_disable; - pwm_fan_power_off(ctx, true); + if (ctx->pwm_shutdown) { + ctx->enable_mode = pwm_enable_reg_enable; + __set_pwm(ctx, ctx->pwm_shutdown); + } else { + /* Switch off everything */ + ctx->enable_mode = pwm_disable_reg_disable; + pwm_fan_power_off(ctx, true); + } } static int pwm_fan_probe(struct platform_device *pdev) @@ -498,6 +504,7 @@ static int pwm_fan_probe(struct platform_device *pdev) int ret; const struct hwmon_channel_info **channels; u32 initial_pwm, pwm_min_from_stopped = 0; + u32 pwm_shutdown_percent = 0; u32 *fan_channel_config; int channel_count = 1; /* We always have a PWM channel. */ int i; @@ -648,6 +655,11 @@ static int pwm_fan_probe(struct platform_device *pdev) channels[1] = &ctx->fan_channel; } + ret = device_property_read_u32(dev, "fan-shutdown-percent", + &pwm_shutdown_percent); + if (!ret && pwm_shutdown_percent) + ctx->pwm_shutdown = (clamp(pwm_shutdown_percent, 0, 100) * 255) / 100; + ret = device_property_read_u32(dev, "fan-stop-to-start-percent", &pwm_min_from_stopped); if (!ret && pwm_min_from_stopped) { From 0bcd01f757bc06471c82a137eafee281ef1b6e38 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 28 Aug 2024 21:56:57 -0700 Subject: [PATCH 1052/3206] hwmon: Introduce 64-bit energy attribute support Many chips require 64-bit variables to display the accumulated energy, even more so since the energy units are micro-Joule. Add new sensor type "energy64" to support reporting the chip energy as 64-bit values. Changing the entire hardware monitoring API is not feasible, and it is only really necessary to support reading 64-bit values for the "energyX_input" attribute. For this reason, keep the API as-is and use type casts on both ends to pass 64-bit pointers when reading the accumulated energy. On the write side (which is only useful for the energyX_enable attribute), keep passing the written value as long. 
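On the driver side this scheme means a .read callback that handles hwmon_energy64 receives a long * that really points at an s64, and casts it back before storing the value. A hedged sketch; read_energy_uj() is a hypothetical helper, not part of the hwmon API:

        static int example_read(struct device *dev, enum hwmon_sensor_types type,
                                u32 attr, int channel, long *val)
        {
                if (type == hwmon_energy64 && attr == hwmon_energy_input) {
                        s64 *val64 = (s64 *)val;        /* core passed a 64-bit slot */

                        *val64 = read_energy_uj(dev);   /* hypothetical, in uJ */
                        return 0;
                }
                return -EOPNOTSUPP;
        }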
Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- Documentation/hwmon/hwmon-kernel-api.rst | 3 +++ drivers/hwmon/hwmon.c | 14 ++++++++++---- include/linux/hwmon.h | 1 + include/trace/events/hwmon.h | 10 +++++----- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/Documentation/hwmon/hwmon-kernel-api.rst b/Documentation/hwmon/hwmon-kernel-api.rst index e47fc757e63ed2..037b69c23cb593 100644 --- a/Documentation/hwmon/hwmon-kernel-api.rst +++ b/Documentation/hwmon/hwmon-kernel-api.rst @@ -159,6 +159,7 @@ It contains following fields: hwmon_curr Current sensor hwmon_power Power sensor hwmon_energy Energy sensor + hwmon_energy64 Energy sensor, reported as 64-bit signed value hwmon_humidity Humidity sensor hwmon_fan Fan speed sensor hwmon_pwm PWM control @@ -288,6 +289,8 @@ Parameters: The sensor channel number. val: Pointer to attribute value. + For hwmon_energy64, `val` is passed as `long *` but needs + a typecast to `s64 *`. Return value: 0 on success, a negative error number otherwise. diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 1688c210888abf..2e17f3a4c59bdc 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -426,18 +426,22 @@ static ssize_t hwmon_attr_show(struct device *dev, struct device_attribute *devattr, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + s64 val64; long val; int ret; ret = hattr->ops->read(dev, hattr->type, hattr->attr, hattr->index, - &val); + (hattr->type == hwmon_energy64) ? (long *)&val64 : &val); if (ret < 0) return ret; + if (hattr->type != hwmon_energy64) + val64 = val; + trace_hwmon_attr_show(hattr->index + hwmon_attr_base(hattr->type), - hattr->name, val); + hattr->name, val64); - return sprintf(buf, "%ld\n", val); + return sprintf(buf, "%lld\n", val64); } static ssize_t hwmon_attr_show_string(struct device *dev, @@ -478,7 +482,7 @@ static ssize_t hwmon_attr_store(struct device *dev, return ret; trace_hwmon_attr_store(hattr->index + hwmon_attr_base(hattr->type), - hattr->name, val); + hattr->name, (s64)val); return count; } @@ -734,6 +738,7 @@ static const char * const *__templates[] = { [hwmon_curr] = hwmon_curr_attr_templates, [hwmon_power] = hwmon_power_attr_templates, [hwmon_energy] = hwmon_energy_attr_templates, + [hwmon_energy64] = hwmon_energy_attr_templates, [hwmon_humidity] = hwmon_humidity_attr_templates, [hwmon_fan] = hwmon_fan_attr_templates, [hwmon_pwm] = hwmon_pwm_attr_templates, @@ -747,6 +752,7 @@ static const int __templates_size[] = { [hwmon_curr] = ARRAY_SIZE(hwmon_curr_attr_templates), [hwmon_power] = ARRAY_SIZE(hwmon_power_attr_templates), [hwmon_energy] = ARRAY_SIZE(hwmon_energy_attr_templates), + [hwmon_energy64] = ARRAY_SIZE(hwmon_energy_attr_templates), [hwmon_humidity] = ARRAY_SIZE(hwmon_humidity_attr_templates), [hwmon_fan] = ARRAY_SIZE(hwmon_fan_attr_templates), [hwmon_pwm] = ARRAY_SIZE(hwmon_pwm_attr_templates), diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 3a63dff62d0361..886fc90b2d2536 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -24,6 +24,7 @@ enum hwmon_sensor_types { hwmon_curr, hwmon_power, hwmon_energy, + hwmon_energy64, hwmon_humidity, hwmon_fan, hwmon_pwm, diff --git a/include/trace/events/hwmon.h b/include/trace/events/hwmon.h index d1ff560cd9b561..3865098f21f140 100644 --- a/include/trace/events/hwmon.h +++ b/include/trace/events/hwmon.h @@ -9,14 +9,14 @@ DECLARE_EVENT_CLASS(hwmon_attr_class, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int
index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val), TP_STRUCT__entry( __field(int, index) __string(attr_name, attr_name) - __field(long, val) + __field(long long, val) ), TP_fast_assign( @@ -25,20 +25,20 @@ DECLARE_EVENT_CLASS(hwmon_attr_class, __entry->val = val; ), - TP_printk("index=%d, attr_name=%s, val=%ld", + TP_printk("index=%d, attr_name=%s, val=%lld", __entry->index, __get_str(attr_name), __entry->val) ); DEFINE_EVENT(hwmon_attr_class, hwmon_attr_show, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val) ); DEFINE_EVENT(hwmon_attr_class, hwmon_attr_store, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val) ); From 9a8113a5c6eb23aaaafa9bde043b76a22a3a4e44 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 31 Aug 2025 14:53:58 -0700 Subject: [PATCH 1053/3206] hwmon: (ina238) Drop platform data support There are no in-tree users of ina2xx platform data. Drop support for it. The driver already supports device properties which can be used as alternative if needed. Also remove reference to the non-existing shunt_resistor sysfs attribute from the driver documentation. Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- Documentation/hwmon/ina238.rst | 8 ++++---- drivers/hwmon/ina238.c | 8 ++------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/Documentation/hwmon/ina238.rst b/Documentation/hwmon/ina238.rst index 9a24da4786a43f..9b830e37c986b4 100644 --- a/Documentation/hwmon/ina238.rst +++ b/Documentation/hwmon/ina238.rst @@ -29,10 +29,10 @@ The INA238 is a current shunt, power and temperature monitor with an I2C interface. It includes a number of programmable functions including alerts, conversion rate, sample averaging and selectable shunt voltage accuracy. -The shunt value in micro-ohms can be set via platform data or device tree at -compile-time or via the shunt_resistor attribute in sysfs at run-time. Please -refer to the Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml for bindings -if the device tree is used. +The shunt value in micro-ohms can be set via device properties, either from +platform code or from device tree data. Please refer to +Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml for bindings if +device tree is used. 
Sysfs entries ------------- diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 59a2c8889fa2b1..22e2b862fb3363 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -16,8 +16,6 @@ #include #include -#include - /* INA238 register definitions */ #define INA238_CONFIG 0x0 #define INA238_ADC_CONFIG 0x1 @@ -745,7 +743,6 @@ ATTRIBUTE_GROUPS(ina238); static int ina238_probe(struct i2c_client *client) { - struct ina2xx_platform_data *pdata = dev_get_platdata(&client->dev); struct device *dev = &client->dev; struct device *hwmon_dev; struct ina238_data *data; @@ -772,9 +769,8 @@ static int ina238_probe(struct i2c_client *client) } /* load shunt value */ - data->rshunt = INA238_RSHUNT_DEFAULT; - if (device_property_read_u32(dev, "shunt-resistor", &data->rshunt) < 0 && pdata) - data->rshunt = pdata->shunt_uohms; + if (device_property_read_u32(dev, "shunt-resistor", &data->rshunt) < 0) + data->rshunt = INA238_RSHUNT_DEFAULT; if (data->rshunt == 0) { dev_err(dev, "invalid shunt resister value %u\n", data->rshunt); return -EINVAL; From f2711a19651f28b426cf78792822a37f7641f43c Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 5 Sep 2025 13:20:04 -0700 Subject: [PATCH 1054/3206] hwmon: (ina238) Update documentation and Kconfig entry Update driver documentation and Kconfig entry to list all chips supported by the driver. Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- Documentation/hwmon/ina238.rst | 39 ++++++++++++++++++++++++-------- drivers/hwmon/Kconfig | 9 ++++---- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/Documentation/hwmon/ina238.rst b/Documentation/hwmon/ina238.rst index 9b830e37c986b4..1ac4e2c419ac1e 100644 --- a/Documentation/hwmon/ina238.rst +++ b/Documentation/hwmon/ina238.rst @@ -5,6 +5,24 @@ Kernel driver ina238 Supported chips: + * Texas Instruments INA228 + + Prefix: 'ina228' + + Addresses: I2C 0x40 - 0x4f + + Datasheet: + https://www.ti.com/lit/gpn/ina228 + + * Texas Instruments INA237 + + Prefix: 'ina237' + + Addresses: I2C 0x40 - 0x4f + + Datasheet: + https://www.ti.com/lit/gpn/ina237 + * Texas Instruments INA238 Prefix: 'ina238' @@ -34,6 +52,13 @@ platform code or from device tree data. Please refer to Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml for bindings if device tree is used. +INA237 is a functionally equivalent variant of INA238 with slightly +different accuracy. INA228 is another variant of INA238 with higher ADC +resolution. This chip also reports the energy. + +SQ52206 is a mostly compatible chip from Silergy. It reports the energy +as well as the peak power consumption.
+ Sysfs entries ------------- @@ -53,19 +78,15 @@ in1_max_alarm Maximum shunt voltage alarm power1_input Power measurement (uW) power1_max Maximum power threshold (uW) power1_max_alarm Maximum power alarm +power1_input_highest Peak Power (uW) + (SQ52206 only) curr1_input Current measurement (mA) +energy1_input Energy measurement (uJ) + (SQ52206 and INA228 only) + temp1_input Die temperature measurement (mC) temp1_max Maximum die temperature threshold (mC) temp1_max_alarm Maximum die temperature alarm ======================= ======================================================= - -Additional sysfs entries for sq52206 ------------------------------------- - -======================= ======================================================= -energy1_input Energy measurement (uJ) - -power1_input_highest Peak Power (uW) -======================= ======================================================= diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index c53c041c32ef70..6bdce991f5f01a 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -2252,13 +2252,14 @@ config SENSORS_INA2XX will be called ina2xx. config SENSORS_INA238 - tristate "Texas Instruments INA238" + tristate "Texas Instruments INA238 and compatibles" depends on I2C select REGMAP_I2C help - If you say yes here you get support for the INA238 power monitor - chip. This driver supports voltage, current, power and temperature - measurements as well as alarm configuration. + If you say yes here you get support for INA228, INA237, INA238, and + SQ52206 power monitor chips. This driver supports voltage, current, + power, energy, and temperature measurements as well as alarm + configuration. This driver can also be built as a module. If so, the module will be called ina238. From 8640f9ab1015741e22dff2dd1d6665ad024d5534 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 1 Sep 2025 11:01:53 -0700 Subject: [PATCH 1055/3206] hwmon: (ina238) Drop pointless power attribute check on attribute writes There is only a single writeable power attribute, and it is very unlikely that the chips supported by this driver will ever require another one. That means checking for that attribute during runtime is unnecessary. Drop the check. Rename the write function from ina238_write_power() to ina238_write_power_max() to reflect that a single attribute is written. No functional change intended. Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- drivers/hwmon/ina238.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 22e2b862fb3363..23195dead74f6c 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -503,14 +503,11 @@ static int ina238_read_power(struct device *dev, u32 attr, long *val) return 0; } -static int ina238_write_power(struct device *dev, u32 attr, long val) +static int ina238_write_power_max(struct device *dev, long val) { struct ina238_data *data = dev_get_drvdata(dev); long regval; - if (attr != hwmon_power_max) - return -EOPNOTSUPP; - /* * Unsigned postive values. Compared against the 24-bit power register, * lower
Same conversion to/from uW as POWER @@ -628,7 +625,7 @@ static int ina238_write(struct device *dev, enum hwmon_sensor_types type, err = ina238_write_in(dev, attr, channel, val); break; case hwmon_power: - err = ina238_write_power(dev, attr, val); + err = ina238_write_power_max(dev, val); break; case hwmon_temp: err = ina238_write_temp(dev, attr, val); From 40a5da1ec101476dd6abb9cfa181f1b50dc24ad0 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 30 Aug 2025 17:38:51 -0700 Subject: [PATCH 1056/3206] hwmon: (ina238) Rework and simplify temperature calculations The temperature register is 16 bit wide for all chips. The decimal point is at the same location (bit 7 = 1 degree C). That means we can use the resolution to calculate temperatures. Do that to simplify the code. There is only a single writeable temperature attribute, and it is very unlikely that the chips supported by this driver will ever require another one. That means checking for that attribute in the write function is unnecessary. Drop the check. Rename the write function from ina238_write_temp() to ina238_write_temp_max() to reflect that a single attribute is written. Also extend the accepted temperature value range to the range supported by the chip registers. Limiting the accepted value range to the temperature range supported by the chip would make it impossible to read an out-of-range limit from the chip and to write the same value back into it. This is undesirable, especially since the maximum temperature register does contain the maximum register value after a chip reset, not the temperature limit supported by the chip. Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- drivers/hwmon/ina238.c | 52 +++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 23195dead74f6c..e386a0f83fbb7d 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -103,10 +103,7 @@ #define INA238_SHUNT_VOLTAGE_LSB 5 /* 5 uV/lsb */ #define INA238_BUS_VOLTAGE_LSB 3125 /* 3.125 mV/lsb */ -#define INA238_DIE_TEMP_LSB 1250000 /* 125.0000 mC/lsb */ #define SQ52206_BUS_VOLTAGE_LSB 3750 /* 3.75 mV/lsb */ -#define SQ52206_DIE_TEMP_LSB 78125 /* 7.8125 mC/lsb */ -#define INA228_DIE_TEMP_LSB 78125 /* 7.8125 mC/lsb */ static const struct regmap_config ina238_regmap_config = { .max_register = INA238_REGISTERS, @@ -120,11 +117,10 @@ struct ina238_config { bool has_20bit_voltage_current; /* vshunt, vbus and current are 20-bit fields */ bool has_power_highest; /* chip detection power peak */ bool has_energy; /* chip detection energy */ - u8 temp_shift; /* fixed parameters for temp calculate */ + u8 temp_resolution; /* temperature register resolution in bit */ u32 power_calculate_factor; /* fixed parameters for power calculate */ u16 config_default; /* Power-on default state */ int bus_voltage_lsb; /* use for temperature calculate, uV/lsb */ - int temp_lsb; /* use for temperature calculate */ }; struct ina238_data { @@ -141,41 +137,37 @@ static const struct ina238_config ina238_config[] = { .has_20bit_voltage_current = false, .has_energy = false, .has_power_highest = false, - .temp_shift = 4, .power_calculate_factor = 20, .config_default = INA238_CONFIG_DEFAULT, .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, - .temp_lsb = INA238_DIE_TEMP_LSB, + .temp_resolution = 12, }, [ina237] = { .has_20bit_voltage_current = false, .has_energy = false, .has_power_highest = false, - .temp_shift = 4, .power_calculate_factor 
= 20, .config_default = INA238_CONFIG_DEFAULT, .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, - .temp_lsb = INA238_DIE_TEMP_LSB, + .temp_resolution = 12, }, [sq52206] = { .has_20bit_voltage_current = false, .has_energy = true, .has_power_highest = true, - .temp_shift = 0, .power_calculate_factor = 24, .config_default = SQ52206_CONFIG_DEFAULT, .bus_voltage_lsb = SQ52206_BUS_VOLTAGE_LSB, - .temp_lsb = SQ52206_DIE_TEMP_LSB, + .temp_resolution = 16, }, [ina228] = { .has_20bit_voltage_current = true, .has_energy = true, .has_power_highest = false, - .temp_shift = 0, .power_calculate_factor = 20, .config_default = INA238_CONFIG_DEFAULT, .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, - .temp_lsb = INA228_DIE_TEMP_LSB, + .temp_resolution = 16, }, }; @@ -522,6 +514,11 @@ static int ina238_write_power_max(struct device *dev, long val) return regmap_write(data->regmap, INA238_POWER_LIMIT, regval); } +static int ina238_temp_from_reg(s16 regval, u8 resolution) +{ + return ((regval >> (16 - resolution)) * 1000) >> (resolution - 9); +} + static int ina238_read_temp(struct device *dev, u32 attr, long *val) { struct ina238_data *data = dev_get_drvdata(dev); @@ -533,17 +530,14 @@ static int ina238_read_temp(struct device *dev, u32 attr, long *val) err = regmap_read(data->regmap, INA238_DIE_TEMP, ®val); if (err) return err; - /* Signed, result in mC */ - *val = div_s64(((s64)((s16)regval) >> data->config->temp_shift) * - (s64)data->config->temp_lsb, 10000); + *val = ina238_temp_from_reg(regval, data->config->temp_resolution); break; case hwmon_temp_max: err = regmap_read(data->regmap, INA238_TEMP_LIMIT, ®val); if (err) return err; /* Signed, result in mC */ - *val = div_s64(((s64)((s16)regval) >> data->config->temp_shift) * - (s64)data->config->temp_lsb, 10000); + *val = ina238_temp_from_reg(regval, data->config->temp_resolution); break; case hwmon_temp_max_alarm: err = regmap_read(data->regmap, INA238_DIAG_ALERT, ®val); @@ -559,19 +553,21 @@ static int ina238_read_temp(struct device *dev, u32 attr, long *val) return 0; } -static int ina238_write_temp(struct device *dev, u32 attr, long val) +static u16 ina238_temp_to_reg(long val, u8 resolution) { - struct ina238_data *data = dev_get_drvdata(dev); - int regval; + int fraction = 1000 - DIV_ROUND_CLOSEST(1000, BIT(resolution - 9)); - if (attr != hwmon_temp_max) - return -EOPNOTSUPP; + val = clamp_val(val, -255000 - fraction, 255000 + fraction); + + return (DIV_ROUND_CLOSEST(val << (resolution - 9), 1000) << (16 - resolution)) & 0xffff; +} - /* Signed */ - val = clamp_val(val, -40000, 125000); - regval = div_s64(val * 10000, data->config->temp_lsb) << data->config->temp_shift; - regval = clamp_val(regval, S16_MIN, S16_MAX) & (0xffff << data->config->temp_shift); +static int ina238_write_temp_max(struct device *dev, long val) +{ + struct ina238_data *data = dev_get_drvdata(dev); + int regval; + regval = ina238_temp_to_reg(val, data->config->temp_resolution); return regmap_write(data->regmap, INA238_TEMP_LIMIT, regval); } @@ -628,7 +624,7 @@ static int ina238_write(struct device *dev, enum hwmon_sensor_types type, err = ina238_write_power_max(dev, val); break; case hwmon_temp: - err = ina238_write_temp(dev, attr, val); + err = ina238_write_temp_max(dev, val); break; default: err = -EOPNOTSUPP; From 4a4fcd611295af96af51574b31f9e19e7505f965 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 28 Aug 2025 17:44:17 -0700 Subject: [PATCH 1057/3206] hwmon: (ina238) Pre-calculate current, power, and energy LSB Current, power, and energy LSB do not change during runtime, so 
we can pre-calculate the respective values. The power LSB can be derived from the current LSB using the equation in the datasheets. Similarly, the energy LSB can be derived from the power LSB. Also add support for chips with built-in shunt resistor by providing a chip-specific configuration parameter for the current LSB. The relationship of current -> power -> energy LSB values in those chips is the same as in chips with an external shunt resistor, so configuration parameters for power and energy LSB are not needed. Use ROUND_CLOSEST functions instead of divide operations to reduce rounding errors. Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- drivers/hwmon/ina238.c | 47 ++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index e386a0f83fbb7d..316a7dc720f5e0 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -118,9 +118,10 @@ struct ina238_config { bool has_power_highest; /* chip detection power peak */ bool has_energy; /* chip detection energy */ u8 temp_resolution; /* temperature register resolution in bit */ - u32 power_calculate_factor; /* fixed parameters for power calculate */ + u32 power_calculate_factor; /* fixed parameter for power calculation, from datasheet */ u16 config_default; /* Power-on default state */ int bus_voltage_lsb; /* use for temperature calculate, uV/lsb */ + int current_lsb; /* current LSB, in uA */ }; struct ina238_data { @@ -130,6 +131,9 @@ struct ina238_data { struct regmap *regmap; u32 rshunt; int gain; + int current_lsb; /* current LSB, in uA */ + int power_lsb; /* power LSB, in uW */ + int energy_lsb; /* energy LSB, in uJ */ }; static const struct ina238_config ina238_config[] = { @@ -422,9 +426,8 @@ static int ina238_read_current(struct device *dev, u32 attr, long *val) regval = (s16)regval; } - /* Signed register, fixed 1mA current lsb. result in mA */ - *val = div_s64((s64)regval * INA238_FIXED_SHUNT * data->gain, - data->rshunt * 4); + /* Signed register. Result in mA */ + *val = DIV_S64_ROUND_CLOSEST((s64)regval * data->current_lsb, 1000); /* Account for 4 bit offset */ if (data->config->has_20bit_voltage_current) @@ -450,9 +453,7 @@ static int ina238_read_power(struct device *dev, u32 attr, long *val) if (err) return err; - /* Fixed 1mA lsb, scaled by 1000000 to have result in uW */ - power = div_u64(regval * 1000ULL * INA238_FIXED_SHUNT * data->gain * - data->config->power_calculate_factor, 4 * 100 * data->rshunt); + power = (long long)regval * data->power_lsb; /* Clamp value to maximum value of long */ *val = clamp_val(power, 0, LONG_MAX); break; @@ -461,9 +462,7 @@ static int ina238_read_power(struct device *dev, u32 attr, long *val) if (err) return err; - /* Fixed 1mA lsb, scaled by 1000000 to have result in uW */ - power = div_u64(regval * 1000ULL * INA238_FIXED_SHUNT * data->gain * - data->config->power_calculate_factor, 4 * 100 * data->rshunt); + power = (long long)regval * data->power_lsb; /* Clamp value to maximum value of long */ *val = clamp_val(power, 0, LONG_MAX); break; @@ -476,8 +475,7 @@ static int ina238_read_power(struct device *dev, u32 attr, long *val) * Truncated 24-bit compare register, lower 8-bits are * truncated. Same conversion to/from uW as POWER register.
*/ - power = div_u64((regval << 8) * 1000ULL * INA238_FIXED_SHUNT * data->gain * - data->config->power_calculate_factor, 4 * 100 * data->rshunt); + power = ((long long)regval << 8) * data->power_lsb; /* Clamp value to maximum value of long */ *val = clamp_val(power, 0, LONG_MAX); break; @@ -498,7 +496,6 @@ static int ina238_read_power(struct device *dev, u32 attr, long *val) static int ina238_write_power_max(struct device *dev, long val) { struct ina238_data *data = dev_get_drvdata(dev); - long regval; /* * Unsigned postive values. Compared against the 24-bit power register, @@ -506,12 +503,11 @@ static int ina238_write_power_max(struct device *dev, long val) * register. * The first clamp_val() is to establish a baseline to avoid overflows. */ - regval = clamp_val(val, 0, LONG_MAX / 2); - regval = div_u64(regval * 4 * 100 * data->rshunt, data->config->power_calculate_factor * - 1000ULL * INA238_FIXED_SHUNT * data->gain); - regval = clamp_val(regval >> 8, 0, U16_MAX); + val = clamp_val(val, 0, LONG_MAX / 2); + val = DIV_ROUND_CLOSEST(val, data->power_lsb); + val = clamp_val(val >> 8, 0, U16_MAX); - return regmap_write(data->regmap, INA238_POWER_LIMIT, regval); + return regmap_write(data->regmap, INA238_POWER_LIMIT, val); } static int ina238_temp_from_reg(s16 regval, u8 resolution) @@ -584,8 +580,7 @@ static ssize_t energy1_input_show(struct device *dev, return ret; /* result in uJ */ - energy = div_u64(regval * INA238_FIXED_SHUNT * data->gain * 16 * 10 * - data->config->power_calculate_factor, 4 * data->rshunt); + energy = regval * data->energy_lsb; return sysfs_emit(buf, "%llu\n", energy); } @@ -817,6 +812,18 @@ static int ina238_probe(struct i2c_client *client) return -ENODEV; } + if (data->config->current_lsb) + data->current_lsb = data->config->current_lsb; + else + data->current_lsb = DIV_U64_ROUND_CLOSEST(250ULL * INA238_FIXED_SHUNT * data->gain, + data->rshunt); + + data->power_lsb = DIV_ROUND_CLOSEST(data->current_lsb * + data->config->power_calculate_factor, + 100); + + data->energy_lsb = data->power_lsb * 16; + hwmon_dev = devm_hwmon_device_register_with_info(dev, client->name, data, &ina238_chip_info, data->config->has_energy ? From 7e420b6a274206f339fe3617943ba1ef9dd1fcb0 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 30 Aug 2025 21:48:29 -0700 Subject: [PATCH 1058/3206] hwmon: (ina238) Simplify voltage register accesses Calculate voltage LSB values in the probe function and use throughout the code. Use a single function to read all voltages, independently of the register width. Use the pre-calculated LSB values to convert register values to voltages and do not rely on runtime chip specific code. Use ROUND_CLOSEST functions instead of divide operations to reduce rounding errors. 
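For illustration, a minimal sketch of the conversion this enables, assuming the nV-based LSB constants introduced below (the helper name is hypothetical and not part of the driver):

	/*
	 * Sketch only: with the shunt LSB stored as 5000 nV and the bus
	 * LSB as 3125000 nV, a single rounded division converts any
	 * voltage register reading to mV, e.g. a bus reading of
	 * 0x2000 * 3125000 nV -> 25600 mV.
	 */
	static long example_reg_to_mv(s16 regval, u32 lsb_nv)
	{
		return DIV_S64_ROUND_CLOSEST((s64)regval * lsb_nv, 1000000);
	}
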
Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- drivers/hwmon/ina238.c | 161 ++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 108 deletions(-) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 316a7dc720f5e0..ae27ae2582f23a 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -101,9 +101,11 @@ #define INA238_CALIBRATION_VALUE 16384 #define INA238_FIXED_SHUNT 20000 -#define INA238_SHUNT_VOLTAGE_LSB 5 /* 5 uV/lsb */ -#define INA238_BUS_VOLTAGE_LSB 3125 /* 3.125 mV/lsb */ -#define SQ52206_BUS_VOLTAGE_LSB 3750 /* 3.75 mV/lsb */ +#define INA238_SHUNT_VOLTAGE_LSB 5000 /* 5 uV/lsb, in nV */ +#define INA238_BUS_VOLTAGE_LSB 3125000 /* 3.125 mV/lsb, in nV */ +#define SQ52206_BUS_VOLTAGE_LSB 3750000 /* 3.75 mV/lsb, in nV */ + +#define NUNIT_PER_MUNIT 1000000 /* n[AV] -> m[AV] */ static const struct regmap_config ina238_regmap_config = { .max_register = INA238_REGISTERS, @@ -118,9 +120,9 @@ struct ina238_config { bool has_power_highest; /* chip detection power peak */ bool has_energy; /* chip detection energy */ u8 temp_resolution; /* temperature register resolution in bit */ - u32 power_calculate_factor; /* fixed parameter for power calculation, from datasheet */ u16 config_default; /* Power-on default state */ - int bus_voltage_lsb; /* use for temperature calculate, uV/lsb */ + u32 power_calculate_factor; /* fixed parameter for power calculation, from datasheet */ + u32 bus_voltage_lsb; /* bus voltage LSB, in nV */ int current_lsb; /* current LSB, in uA */ }; @@ -131,6 +133,7 @@ struct ina238_data { struct regmap *regmap; u32 rshunt; int gain; + u32 voltage_lsb[2]; /* shunt, bus voltage LSB, in nV */ int current_lsb; /* current LSB, in uA */ int power_lsb; /* power LSB, in uW */ int energy_lsb; /* energy LSB, in uJ */ @@ -226,45 +229,28 @@ static int ina238_read_field_s20(const struct i2c_client *client, u8 reg, s32 *v return 0; } -static int ina228_read_shunt_voltage(struct device *dev, u32 attr, int channel, - long *val) -{ - struct ina238_data *data = dev_get_drvdata(dev); - int regval; - int err; - - err = ina238_read_field_s20(data->client, INA238_SHUNT_VOLTAGE, ®val); - if (err) - return err; - - /* - * gain of 1 -> LSB / 4 - * This field has 16 bit on ina238. ina228 adds another 4 bits of - * precision. ina238 conversion factors can still be applied when - * dividing by 16. - */ - *val = (regval * INA238_SHUNT_VOLTAGE_LSB) * data->gain / (1000 * 4) / 16; - return 0; -} - -static int ina228_read_bus_voltage(struct device *dev, u32 attr, int channel, - long *val) +static int ina228_read_voltage(struct ina238_data *data, int channel, long *val) { - struct ina238_data *data = dev_get_drvdata(dev); - int regval; - int err; + int reg = channel ? INA238_BUS_VOLTAGE : INA238_SHUNT_VOLTAGE; + u32 lsb = data->voltage_lsb[channel]; + u32 factor = NUNIT_PER_MUNIT; + int err, regval; - err = ina238_read_field_s20(data->client, INA238_BUS_VOLTAGE, ®val); - if (err) - return err; + if (data->config->has_20bit_voltage_current) { + err = ina238_read_field_s20(data->client, reg, ®val); + if (err) + return err; + /* Adjust accuracy: LSB in units of 500 pV */ + lsb /= 8; + factor *= 2; + } else { + err = regmap_read(data->regmap, reg, ®val); + if (err) + return err; + regval = (s16)regval; + } - /* - * gain of 1 -> LSB / 4 - * This field has 16 bit on ina238. ina228 adds another 4 bits of - * precision. ina238 conversion factors can still be applied when - * dividing by 16. 
- */ - *val = (regval * data->config->bus_voltage_lsb) / 1000 / 16; + *val = DIV_S64_ROUND_CLOSEST((s64)regval * lsb, factor); return 0; } @@ -272,18 +258,16 @@ static int ina238_read_in(struct device *dev, u32 attr, int channel, long *val) { struct ina238_data *data = dev_get_drvdata(dev); - int reg, mask; + int reg, mask = 0; int regval; int err; + if (attr == hwmon_in_input) + return ina228_read_voltage(data, channel, val); + switch (channel) { case 0: switch (attr) { - case hwmon_in_input: - if (data->config->has_20bit_voltage_current) - return ina228_read_shunt_voltage(dev, attr, channel, val); - reg = INA238_SHUNT_VOLTAGE; - break; case hwmon_in_max: reg = INA238_SHUNT_OVER_VOLTAGE; break; @@ -304,11 +288,6 @@ static int ina238_read_in(struct device *dev, u32 attr, int channel, break; case 1: switch (attr) { - case hwmon_in_input: - if (data->config->has_20bit_voltage_current) - return ina228_read_bus_voltage(dev, attr, channel, val); - reg = INA238_BUS_VOLTAGE; - break; case hwmon_in_max: reg = INA238_BUS_OVER_VOLTAGE; break; @@ -335,72 +314,35 @@ static int ina238_read_in(struct device *dev, u32 attr, int channel, if (err < 0) return err; - switch (attr) { - case hwmon_in_input: - case hwmon_in_max: - case hwmon_in_min: - /* signed register, value in mV */ - regval = (s16)regval; - if (channel == 0) - /* gain of 1 -> LSB / 4 */ - *val = (regval * INA238_SHUNT_VOLTAGE_LSB) * - data->gain / (1000 * 4); - else - *val = (regval * data->config->bus_voltage_lsb) / 1000; - break; - case hwmon_in_max_alarm: - case hwmon_in_min_alarm: + if (mask) *val = !!(regval & mask); - break; - } + else + *val = DIV_S64_ROUND_CLOSEST((s64)(s16)regval * data->voltage_lsb[channel], + NUNIT_PER_MUNIT); return 0; } -static int ina238_write_in(struct device *dev, u32 attr, int channel, - long val) +static int ina238_write_in(struct device *dev, u32 attr, int channel, long val) { struct ina238_data *data = dev_get_drvdata(dev); + static const int low_limits[2] = {-164, 0}; + static const int high_limits[2] = {164, 150000}; + static const u8 low_regs[2] = {INA238_SHUNT_UNDER_VOLTAGE, INA238_BUS_UNDER_VOLTAGE}; + static const u8 high_regs[2] = {INA238_SHUNT_OVER_VOLTAGE, INA238_BUS_OVER_VOLTAGE}; int regval; - if (attr != hwmon_in_max && attr != hwmon_in_min) - return -EOPNOTSUPP; - - /* convert decimal to register value */ - switch (channel) { - case 0: - /* signed value, clamp to max range +/-163 mV */ - regval = clamp_val(val, -163, 163); - regval = (regval * 1000 * 4) / - (INA238_SHUNT_VOLTAGE_LSB * data->gain); - regval = clamp_val(regval, S16_MIN, S16_MAX) & 0xffff; - - switch (attr) { - case hwmon_in_max: - return regmap_write(data->regmap, - INA238_SHUNT_OVER_VOLTAGE, regval); - case hwmon_in_min: - return regmap_write(data->regmap, - INA238_SHUNT_UNDER_VOLTAGE, regval); - default: - return -EOPNOTSUPP; - } - case 1: - /* signed value, positive values only. 
Clamp to max 102.396 V */ - regval = clamp_val(val, 0, 102396); - regval = (regval * 1000) / data->config->bus_voltage_lsb; - regval = clamp_val(regval, 0, S16_MAX); + /* Initial clamp to avoid overflows */ + val = clamp_val(val, low_limits[channel], high_limits[channel]); + val = DIV_S64_ROUND_CLOSEST((s64)val * NUNIT_PER_MUNIT, data->voltage_lsb[channel]); + /* Final clamp to register limits */ + regval = clamp_val(val, S16_MIN, S16_MAX) & 0xffff; - switch (attr) { - case hwmon_in_max: - return regmap_write(data->regmap, - INA238_BUS_OVER_VOLTAGE, regval); - case hwmon_in_min: - return regmap_write(data->regmap, - INA238_BUS_UNDER_VOLTAGE, regval); - default: - return -EOPNOTSUPP; - } + switch (attr) { + case hwmon_in_min: + return regmap_write(data->regmap, low_regs[channel], regval); + case hwmon_in_max: + return regmap_write(data->regmap, high_regs[channel], regval); default: return -EOPNOTSUPP; } @@ -812,6 +754,9 @@ static int ina238_probe(struct i2c_client *client) return -ENODEV; } + data->voltage_lsb[0] = INA238_SHUNT_VOLTAGE_LSB * data->gain / 4; + data->voltage_lsb[1] = data->config->bus_voltage_lsb; + if (data->config->current_lsb) data->current_lsb = data->config->current_lsb; else From bcac89eebefbcf886c2b971de4bf026073e0ee8e Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 3 Sep 2025 16:02:48 -0700 Subject: [PATCH 1059/3206] hwmon: (ina238) Improve current dynamic range The best possible dynamic range for current measurements is achieved if the shunt register value matches the current register value. Adjust the calibration register as well as fixed and default shunt resistor values accordingly to achieve this range. Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- drivers/hwmon/ina238.c | 51 ++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index ae27ae2582f23a..c04481a8c6434a 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -51,7 +51,7 @@ #define INA238_REGISTERS 0x20 -#define INA238_RSHUNT_DEFAULT 10000 /* uOhm */ +#define INA238_RSHUNT_DEFAULT 2500 /* uOhm */ /* Default configuration of device on reset. */ #define INA238_CONFIG_DEFAULT 0 @@ -67,39 +67,26 @@ * relative to the shunt resistor value within the driver. This is similar to * how the ina2xx driver handles current/power scaling. * - * The end result of this is that increasing shunt values (from a fixed 20 mOhm - * shunt) increase the effective current/power accuracy whilst limiting the - * range and decreasing shunt values decrease the effective accuracy but - * increase the range. + * To achieve the best possible dynamic range, the value of the shunt voltage + * register should match the value of the current register. With that, the shunt + * voltage of 0x7fff = 163,835 uV matches the maximum current of 0x7fff = 32,767 mA, + * and no accuracy is lost. Experiments with a real chip show that this is + * achieved by setting the SHUNT_CAL register to a value of 0x1000 = 4,096. + * Per datasheet, + * SHUNT_CAL = 819.2 x 10^6 x CURRENT_LSB x Rshunt + * = 819,200,000 x CURRENT_LSB x Rshunt + * With SHUNT_CAL set to 4,096, we get + * CURRENT_LSB = 4,096 / (819,200,000 x Rshunt) + * Assuming an Rshunt value of 5 mOhm, we get + * CURRENT_LSB = 4,096 / (819,200,000 x 0.005) = 1mA + * and thus a dynamic range of 1mA ... 32,767mA, which is sufficient for most + * applications. The actual dynamic range is of course determined by the actual + * shunt resistor value.
* - * The value of the Current register is calculated given the following: - * Current (A) = (shunt voltage register * 5) * calibration / 81920 - * - * The maximum shunt voltage is 163.835 mV (0x7fff, ADC_RANGE = 0, gain = 4). - * With the maximum current value of 0x7fff and a fixed shunt value results in - * a calibration value of 16384 (0x4000). - * - * 0x7fff = (0x7fff * 5) * calibration / 81920 - * calibration = 0x4000 - * - * Equivalent calibration is applied for the Power register (maximum value for - * bus voltage is 102396.875 mV, 0x7fff), where the maximum power that can - * occur is ~16776192 uW (register value 0x147a8): - * - * This scaling means the resulting values for Current and Power registers need - * to be scaled by the difference between the fixed shunt resistor and the - * actual shunt resistor: - * - * shunt = 0x4000 / (819.2 * 10^6) / 0.001 = 20000 uOhms (with 1mA/lsb) - * - * Current (mA) = register value * 20000 / rshunt / 4 * gain - * Power (mW) = 0.2 * register value * 20000 / rshunt / 4 * gain - * (Specific for SQ52206) - * Power (mW) = 0.24 * register value * 20000 / rshunt / 4 * gain - * Energy (uJ) = 16 * 0.24 * register value * 20000 / rshunt / 4 * gain * 1000 + * Power and energy values are scaled accordingly. */ -#define INA238_CALIBRATION_VALUE 16384 -#define INA238_FIXED_SHUNT 20000 +#define INA238_CALIBRATION_VALUE 4096 +#define INA238_FIXED_SHUNT 5000 #define INA238_SHUNT_VOLTAGE_LSB 5000 /* 5 uV/lsb, in nV */ #define INA238_BUS_VOLTAGE_LSB 3125000 /* 3.125 mV/lsb, in nV */ From b7bce92f2890f6002bd3155c0510bd28b6ead4e1 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 3 Sep 2025 16:08:03 -0700 Subject: [PATCH 1060/3206] hwmon: (ina238) Stop using the shunt voltage register Since the value of the current register and the value of the shunt register now match each other, it is no longer necessary to read the shunt voltage register in the first place. Read the current register instead and use it to calculate the shunt voltage. Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- drivers/hwmon/ina238.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index c04481a8c6434a..9dc94eccb7506e 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -218,7 +218,7 @@ static int ina238_read_field_s20(const struct i2c_client *client, u8 reg, s32 *v static int ina228_read_voltage(struct ina238_data *data, int channel, long *val) { - int reg = channel ? INA238_BUS_VOLTAGE : INA238_SHUNT_VOLTAGE; + int reg = channel ? INA238_BUS_VOLTAGE : INA238_CURRENT; u32 lsb = data->voltage_lsb[channel]; u32 factor = NUNIT_PER_MUNIT; int err, regval; From a1d5f8ecb934066220b4cb1b8ba831a8726bf52f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 31 Aug 2025 09:25:16 -0700 Subject: [PATCH 1061/3206] hwmon: (ina238) Add support for current limits Since the shunt voltage register and the current register now report the same values, use the shunt voltage limit registers to report and adjust current limits, using the same LSB as the LSB used for the actual current register. Handle current register accuracy differences in separate function to improve code readability. 
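For illustration, a minimal sketch of the limit conversion, assuming a current LSB of 1000 uA (the helper name is hypothetical; it mirrors the scaling and clamping added below):

	/*
	 * Sketch only: with current_lsb = 1000 uA, a 5000 mA limit maps
	 * to register value 5000; the result is clamped to the signed
	 * 16-bit range of the shunt voltage limit registers.
	 */
	static int example_curr_limit_to_reg(long val_ma, int current_lsb_ua)
	{
		long raw = DIV_ROUND_CLOSEST(val_ma * 1000, current_lsb_ua);

		return clamp_val(raw, S16_MIN, S16_MAX) & 0xffff;
	}
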
Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- Documentation/hwmon/ina238.rst | 4 ++ drivers/hwmon/ina238.c | 105 ++++++++++++++++++++++++++------- 2 files changed, 87 insertions(+), 22 deletions(-) diff --git a/Documentation/hwmon/ina238.rst b/Documentation/hwmon/ina238.rst index 1ac4e2c419ac1e..3c7db4a47056a6 100644 --- a/Documentation/hwmon/ina238.rst +++ b/Documentation/hwmon/ina238.rst @@ -82,6 +82,10 @@ power1_input_highest Peak Power (uW) (SQ52206 only) curr1_input Current measurement (mA) +curr1_min Minimum current threshold (mA) +curr1_min_alarm Minimum current alarm +curr1_max Maximum current threshold (mA) +curr1_max_alarm Maximum current alarm energy1_input Energy measurement (uJ) (SQ52206 and INA237 only) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 9dc94eccb7506e..97f12efcaef49d 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -335,40 +335,92 @@ static int ina238_write_in(struct device *dev, u32 attr, int channel, long val) } } -static int ina238_read_current(struct device *dev, u32 attr, long *val) +static int __ina238_read_curr(struct ina238_data *data, long *val) +{ + u32 lsb = data->current_lsb; + int err, regval; + + if (data->config->has_20bit_voltage_current) { + err = ina238_read_field_s20(data->client, INA238_CURRENT, ®val); + if (err) + return err; + lsb /= 16; /* Adjust accuracy */ + } else { + err = regmap_read(data->regmap, INA238_CURRENT, ®val); + if (err) + return err; + regval = (s16)regval; + } + + *val = DIV_S64_ROUND_CLOSEST((s64)regval * lsb, 1000); + return 0; +} + +static int ina238_read_curr(struct device *dev, u32 attr, long *val) { struct ina238_data *data = dev_get_drvdata(dev); + int reg, mask = 0; int regval; int err; - switch (attr) { - case hwmon_curr_input: - if (data->config->has_20bit_voltage_current) { - err = ina238_read_field_s20(data->client, INA238_CURRENT, ®val); - if (err) - return err; - } else { - err = regmap_read(data->regmap, INA238_CURRENT, ®val); - if (err < 0) - return err; - /* sign-extend */ - regval = (s16)regval; - } + if (attr == hwmon_curr_input) + return __ina238_read_curr(data, val); - /* Signed register. 
Result in mA */ - *val = DIV_S64_ROUND_CLOSEST((s64)regval * data->current_lsb, 1000); - - /* Account for 4 bit offset */ - if (data->config->has_20bit_voltage_current) - *val /= 16; + switch (attr) { + case hwmon_curr_min: + reg = INA238_SHUNT_UNDER_VOLTAGE; + break; + case hwmon_curr_min_alarm: + reg = INA238_DIAG_ALERT; + mask = INA238_DIAG_ALERT_SHNTUL; + break; + case hwmon_curr_max: + reg = INA238_SHUNT_OVER_VOLTAGE; + break; + case hwmon_curr_max_alarm: + reg = INA238_DIAG_ALERT; + mask = INA238_DIAG_ALERT_SHNTOL; break; default: return -EOPNOTSUPP; } + err = regmap_read(data->regmap, reg, ®val); + if (err < 0) + return err; + + if (mask) + *val = !!(regval & mask); + else + *val = DIV_S64_ROUND_CLOSEST((s64)(s16)regval * data->current_lsb, 1000); + return 0; } +static int ina238_write_curr(struct device *dev, u32 attr, long val) +{ + struct ina238_data *data = dev_get_drvdata(dev); + int regval; + + /* Set baseline range to avoid over/underflows */ + val = clamp_val(val, -1000000, 1000000); + /* Scale */ + val = DIV_ROUND_CLOSEST(val * 1000, data->current_lsb); + /* Clamp to register size */ + regval = clamp_val(val, S16_MIN, S16_MAX) & 0xffff; + + switch (attr) { + case hwmon_curr_min: + return regmap_write(data->regmap, INA238_SHUNT_UNDER_VOLTAGE, + regval); + case hwmon_curr_max: + return regmap_write(data->regmap, INA238_SHUNT_OVER_VOLTAGE, + regval); + default: + return -EOPNOTSUPP; + } +} + static int ina238_read_power(struct device *dev, u32 attr, long *val) { struct ina238_data *data = dev_get_drvdata(dev); @@ -521,7 +573,7 @@ static int ina238_read(struct device *dev, enum hwmon_sensor_types type, case hwmon_in: return ina238_read_in(dev, attr, channel, val); case hwmon_curr: - return ina238_read_current(dev, attr, val); + return ina238_read_curr(dev, attr, val); case hwmon_power: return ina238_read_power(dev, attr, val); case hwmon_temp: @@ -544,6 +596,9 @@ static int ina238_write(struct device *dev, enum hwmon_sensor_types type, case hwmon_in: err = ina238_write_in(dev, attr, channel, val); break; + case hwmon_curr: + err = ina238_write_curr(dev, attr, val); + break; case hwmon_power: err = ina238_write_power_max(dev, val); break; @@ -582,7 +637,12 @@ static umode_t ina238_is_visible(const void *drvdata, case hwmon_curr: switch (attr) { case hwmon_curr_input: + case hwmon_curr_max_alarm: + case hwmon_curr_min_alarm: return 0444; + case hwmon_curr_max: + case hwmon_curr_min: + return 0644; default: return 0; } @@ -627,7 +687,8 @@ static const struct hwmon_channel_info * const ina238_info[] = { INA238_HWMON_IN_CONFIG), HWMON_CHANNEL_INFO(curr, /* 0: current through shunt */ - HWMON_C_INPUT), + HWMON_C_INPUT | HWMON_C_MIN | HWMON_C_MIN_ALARM | + HWMON_C_MAX | HWMON_C_MAX_ALARM), HWMON_CHANNEL_INFO(power, /* 0: power */ HWMON_P_INPUT | HWMON_P_MAX | From e7702d72c3e3fcc6017d29bba420deb04da3e1d6 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 31 Aug 2025 09:34:13 -0700 Subject: [PATCH 1062/3206] hwmon: (ina238) Order chip information alphabetically Order chip type enum and chip configuration data alphabetically to simplify adding support for additional chips. No functional change. 
Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- drivers/hwmon/ina238.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 97f12efcaef49d..4681325f58f03c 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -100,7 +100,7 @@ static const struct regmap_config ina238_regmap_config = { .val_bits = 16, }; -enum ina238_ids { ina238, ina237, sq52206, ina228 }; +enum ina238_ids { ina228, ina237, ina238, sq52206 }; struct ina238_config { bool has_20bit_voltage_current; /* vshunt, vbus and current are 20-bit fields */ @@ -127,7 +127,16 @@ struct ina238_data { }; static const struct ina238_config ina238_config[] = { - [ina238] = { + [ina228] = { + .has_20bit_voltage_current = true, + .has_energy = true, + .has_power_highest = false, + .power_calculate_factor = 20, + .config_default = INA238_CONFIG_DEFAULT, + .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, + .temp_resolution = 16, + }, + [ina237] = { .has_20bit_voltage_current = false, .has_energy = false, .has_power_highest = false, @@ -136,7 +145,7 @@ static const struct ina238_config ina238_config[] = { .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, .temp_resolution = 12, }, - [ina237] = { + [ina238] = { .has_20bit_voltage_current = false, .has_energy = false, .has_power_highest = false, @@ -154,15 +163,6 @@ static const struct ina238_config ina238_config[] = { .bus_voltage_lsb = SQ52206_BUS_VOLTAGE_LSB, .temp_resolution = 16, }, - [ina228] = { - .has_20bit_voltage_current = true, - .has_energy = true, - .has_power_highest = false, - .power_calculate_factor = 20, - .config_default = INA238_CONFIG_DEFAULT, - .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, - .temp_resolution = 16, - }, }; static int ina238_read_reg24(const struct i2c_client *client, u8 reg, u32 *val) From cc67b875c9e40ef7628219a37447c477d5324dc1 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 31 Aug 2025 15:06:05 -0700 Subject: [PATCH 1063/3206] hwmon: (ina238) Use the energy64 attribute type to report the energy Use the energy64 attribute type instead of locally defined sysfs attributes to report the accumulated energy. No functional change intended. 
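A quick worked check of why a 64-bit value is needed here (the energy_lsb figure is an assumption taken from the default scaling derived earlier in this series):

	/*
	 * The energy accumulator is 40 bits wide (read via
	 * ina238_read_reg40). With an assumed energy_lsb of 6400 uJ,
	 * the maximum reading is (2^40 - 1) * 6400 uJ ~= 7.0e15 uJ,
	 * which overflows a 32-bit long and therefore needs the
	 * s64-based energy64 attribute type.
	 */
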
Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- drivers/hwmon/ina238.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 4681325f58f03c..4d5b383b25215f 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -548,22 +548,19 @@ static int ina238_write_temp_max(struct device *dev, long val) return regmap_write(data->regmap, INA238_TEMP_LIMIT, regval); } -static ssize_t energy1_input_show(struct device *dev, - struct device_attribute *da, char *buf) +static int ina238_read_energy(struct device *dev, s64 *energy) { struct ina238_data *data = dev_get_drvdata(dev); - int ret; u64 regval; - u64 energy; + int ret; ret = ina238_read_reg40(data->client, SQ52206_ENERGY, &regval); if (ret) return ret; /* result in uJ */ - energy = regval * data->energy_lsb; - - return sysfs_emit(buf, "%llu\n", energy); + *energy = regval * data->energy_lsb; + return 0; } static int ina238_read(struct device *dev, enum hwmon_sensor_types type, @@ -576,6 +573,8 @@ return ina238_read_curr(dev, attr, val); case hwmon_power: return ina238_read_power(dev, attr, val); + case hwmon_energy64: + return ina238_read_energy(dev, (s64 *)val); case hwmon_temp: return ina238_read_temp(dev, attr, val); default: @@ -620,6 +619,7 @@ { const struct ina238_data *data = drvdata; bool has_power_highest = data->config->has_power_highest; + bool has_energy = data->config->has_energy; switch (type) { case hwmon_in: @@ -660,6 +660,11 @@ default: return 0; } + case hwmon_energy64: + /* hwmon_energy_input */ + if (has_energy) + return 0444; + return 0; case hwmon_temp: switch (attr) { case hwmon_temp_input: @@ -693,6 +698,8 @@ static const struct hwmon_channel_info * const ina238_info[] = { /* 0: power */ HWMON_P_INPUT | HWMON_P_MAX | HWMON_P_MAX_ALARM | HWMON_P_INPUT_HIGHEST), + HWMON_CHANNEL_INFO(energy64, + HWMON_E_INPUT), HWMON_CHANNEL_INFO(temp, /* 0: die temperature */ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MAX_ALARM), @@ -710,15 +717,6 @@ static const struct hwmon_chip_info ina238_chip_info = { .info = ina238_info, }; -/* energy attributes are 5 bytes wide so we need u64 */ -static DEVICE_ATTR_RO(energy1_input); - -static struct attribute *ina238_attrs[] = { - &dev_attr_energy1_input.attr, - NULL, -}; -ATTRIBUTE_GROUPS(ina238); - static int ina238_probe(struct i2c_client *client) { struct device *dev = &client->dev; @@ -818,9 +816,7 @@ data->energy_lsb = data->power_lsb * 16; hwmon_dev = devm_hwmon_device_register_with_info(dev, client->name, data, - &ina238_chip_info, - data->config->has_energy ? - ina238_groups : NULL); + &ina238_chip_info, NULL); if (IS_ERR(hwmon_dev)) return PTR_ERR(hwmon_dev); From 807e315bf9509f4d57eca687c9732ce16c3ef843 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 1 Sep 2025 08:37:15 -0700 Subject: [PATCH 1064/3206] hwmon: (ina238) Support active-high alert polarity All chips supported by this driver support configurable active-high alert polarity. This is already documented in the devicetree description. Add support for it to the driver.
Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- drivers/hwmon/ina238.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 4d5b383b25215f..24e396c69ae250 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -60,6 +60,7 @@ #define INA238_ADC_CONFIG_DEFAULT 0xfb6a /* Configure alerts to be based on averaged value (SLOWALERT) */ #define INA238_DIAG_ALERT_DEFAULT 0x2000 +#define INA238_DIAG_ALERT_APOL BIT(12) /* * This driver uses a fixed calibration value in order to scale current/power * based on a fixed shunt resistor value. This allows for conversion within the @@ -793,8 +794,11 @@ static int ina238_probe(struct i2c_client *client) } /* Setup alert/alarm configuration */ - ret = regmap_write(data->regmap, INA238_DIAG_ALERT, - INA238_DIAG_ALERT_DEFAULT); + config = INA238_DIAG_ALERT_DEFAULT; + if (device_property_read_bool(dev, "ti,alert-polarity-active-high")) + config |= INA238_DIAG_ALERT_APOL; + + ret = regmap_write(data->regmap, INA238_DIAG_ALERT, config); if (ret < 0) { dev_err(dev, "error configuring the device: %d\n", ret); return -ENODEV; From d153106bd4fd63eb8d3557277a3d4ee5dbbed3b0 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 31 Aug 2025 09:39:53 -0700 Subject: [PATCH 1065/3206] hwmon: (ina238) Only configure calibration and shunt registers if needed Prepare for supporting chips with internal shunt resistor by only setting calibration and shunt resistor registers if no current LSB is configured. Do not display a log message during probe if a chip does not have shunt and gain registers since those would otherwise display 0, and a message just indicating that the driver was loaded would be just noise. 
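For reference, a worked example of the derived scaling, assuming the driver defaults at this point in the series (illustrative numbers only, taken from the constants in the file):

	/*
	 * Assumed: rshunt = 2500 uOhm (default), gain = 4,
	 * INA238_FIXED_SHUNT = 5000, power_calculate_factor = 20.
	 *
	 *   current_lsb = 250 * 5000 * 4 / 2500 = 2000 uA/LSB
	 *   power_lsb   = 2000 * 20 / 100       =  400 uW/LSB
	 *   energy_lsb  = 400 * 16              = 6400 uJ/LSB
	 */
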
Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- drivers/hwmon/ina238.c | 82 +++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 24e396c69ae250..da5b43184dd1e2 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -745,32 +745,48 @@ static int ina238_probe(struct i2c_client *client) return PTR_ERR(data->regmap); } - /* load shunt value */ - if (device_property_read_u32(dev, "shunt-resistor", &data->rshunt) < 0) - data->rshunt = INA238_RSHUNT_DEFAULT; - if (data->rshunt == 0) { - dev_err(dev, "invalid shunt resister value %u\n", data->rshunt); - return -EINVAL; - } - - /* load shunt gain value */ - if (device_property_read_u32(dev, "ti,shunt-gain", &data->gain) < 0) - data->gain = 4; /* Default of ADCRANGE = 0 */ - if (data->gain != 1 && data->gain != 2 && data->gain != 4) { - dev_err(dev, "invalid shunt gain value %u\n", data->gain); - return -EINVAL; - } - /* Setup CONFIG register */ config = data->config->config_default; - if (chip == sq52206) { - if (data->gain == 1) - config |= SQ52206_CONFIG_ADCRANGE_HIGH; /* ADCRANGE = 10/11 is /1 */ - else if (data->gain == 2) - config |= SQ52206_CONFIG_ADCRANGE_LOW; /* ADCRANGE = 01 is /2 */ - } else if (data->gain == 1) { - config |= INA238_CONFIG_ADCRANGE; /* ADCRANGE = 1 is /1 */ + if (data->config->current_lsb) { + data->voltage_lsb[0] = INA238_SHUNT_VOLTAGE_LSB; + data->current_lsb = data->config->current_lsb; + } else { + /* load shunt value */ + if (device_property_read_u32(dev, "shunt-resistor", &data->rshunt) < 0) + data->rshunt = INA238_RSHUNT_DEFAULT; + if (data->rshunt == 0) { + dev_err(dev, "invalid shunt resistor value %u\n", data->rshunt); + return -EINVAL; + } + + /* load shunt gain value */ + if (device_property_read_u32(dev, "ti,shunt-gain", &data->gain) < 0) + data->gain = 4; /* Default of ADCRANGE = 0 */ + if (data->gain != 1 && data->gain != 2 && data->gain != 4) { + dev_err(dev, "invalid shunt gain value %u\n", data->gain); + return -EINVAL; + } + + /* Setup SHUNT_CALIBRATION register with fixed value */ + ret = regmap_write(data->regmap, INA238_SHUNT_CALIBRATION, + INA238_CALIBRATION_VALUE); + if (ret < 0) { + dev_err(dev, "error configuring the device: %d\n", ret); + return -ENODEV; + } + if (chip == sq52206) { + if (data->gain == 1) /* ADCRANGE = 10/11 is /1 */ + config |= SQ52206_CONFIG_ADCRANGE_HIGH; + else if (data->gain == 2) /* ADCRANGE = 01 is /2 */ + config |= SQ52206_CONFIG_ADCRANGE_LOW; + } else if (data->gain == 1) { /* ADCRANGE = 1 is /1 */ + config |= INA238_CONFIG_ADCRANGE; + } + data->voltage_lsb[0] = INA238_SHUNT_VOLTAGE_LSB * data->gain / 4; + data->current_lsb = DIV_U64_ROUND_CLOSEST(250ULL * INA238_FIXED_SHUNT * data->gain, + data->rshunt); } + ret = regmap_write(data->regmap, INA238_CONFIG, config); if (ret < 0) { dev_err(dev, "error configuring the device: %d\n", ret); return -ENODEV; @@ -785,14 +801,6 @@ return -ENODEV; } - /* Setup SHUNT_CALIBRATION register with fixed value */ - ret = regmap_write(data->regmap, INA238_SHUNT_CALIBRATION, - INA238_CALIBRATION_VALUE); - if (ret < 0) { - dev_err(dev, "error configuring the device: %d\n", ret); - return -ENODEV; - } - /* Setup alert/alarm configuration */ config = INA238_DIAG_ALERT_DEFAULT; if (device_property_read_bool(dev, "ti,alert-polarity-active-high")) @@ -804,15 +812,8 @@ config |= INA238_DIAG_ALERT_APOL; ret = regmap_write(data->regmap, INA238_DIAG_ALERT, config); if (ret < 0) { dev_err(dev, "error configuring the device: %d\n", ret); return -ENODEV; } -
data->voltage_lsb[0] = INA238_SHUNT_VOLTAGE_LSB * data->gain / 4; data->voltage_lsb[1] = data->config->bus_voltage_lsb; - if (data->config->current_lsb) - data->current_lsb = data->config->current_lsb; - else - data->current_lsb = DIV_U64_ROUND_CLOSEST(250ULL * INA238_FIXED_SHUNT * data->gain, - data->rshunt); - data->power_lsb = DIV_ROUND_CLOSEST(data->current_lsb * data->config->power_calculate_factor, 100); @@ -824,8 +825,9 @@ static int ina238_probe(struct i2c_client *client) if (IS_ERR(hwmon_dev)) return PTR_ERR(hwmon_dev); - dev_info(dev, "power monitor %s (Rshunt = %u uOhm, gain = %u)\n", - client->name, data->rshunt, data->gain); + if (data->rshunt) + dev_info(dev, "power monitor %s (Rshunt = %u uOhm, gain = %u)\n", + client->name, data->rshunt, data->gain); return 0; } From 248fd3b96d08c2a5736ca8395470544f23fe1f5c Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 31 Aug 2025 09:39:53 -0700 Subject: [PATCH 1066/3206] hwmon: (ina238) Add support for INA780 INA780 is similar to the other chips in the series, but does not support the shunt voltage register. Shunt voltage limit registers have been renamed to current limit registers, but are otherwise identical. While the chip does not directly report the shunt voltage, report it anyway by calculating its value from the current register. Cc: Chris Packham Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- Documentation/hwmon/ina238.rst | 10 +++++++++- drivers/hwmon/Kconfig | 6 +++--- drivers/hwmon/ina238.c | 17 ++++++++++++++++- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/Documentation/hwmon/ina238.rst b/Documentation/hwmon/ina238.rst index 3c7db4a47056a6..72276096182150 100644 --- a/Documentation/hwmon/ina238.rst +++ b/Documentation/hwmon/ina238.rst @@ -32,6 +32,11 @@ Supported chips: Datasheet: https://www.ti.com/lit/gpn/ina238 + * Texas Instruments INA780 + + Datasheet: + https://www.ti.com/product/ina780a + * Silergy SQ52206 @@ -56,6 +61,9 @@ INA237 is a functionally equivalent variant of INA238 with slightly different accuracy. INA228 is another variant of INA238 with higher ADC resolution. This chip also reports the energy. +INA780 is a variant of the chip series with built-in shunt resistor. +It also reports the energy. + SQ52206 is a mostly compatible chip from Sylergy. It reports the energy as well as the peak power consumption. @@ -88,7 +96,7 @@ curr1_max Maximum current threshold (mA) curr1_max_alarm Maximum current alarm energy1_input Energy measurement (uJ) - (SQ52206 and INA237 only) + (SQ52206, INA228, and INA780 only) temp1_input Die temperature measurement (mC) temp1_max Maximum die temperature threshold (mC) diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 6bdce991f5f01a..c044c26fbd114d 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -2256,9 +2256,9 @@ config SENSORS_INA238 depends on I2C select REGMAP_I2C help - If you say yes here you get support for INA228, INA237, INA238, and - SQ52206 power monitor chips. This driver supports voltage, current, - power, energy, and temperature measurements as well as alarm + If you say yes here you get support for INA228, INA237, INA238, + INA780, and SQ52206 power monitor chips. This driver supports voltage, + current, power, energy, and temperature measurements as well as alarm configuration. This driver can also be built as a module.
If so, the module will be called ina238. diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index da5b43184dd1e2..98255619adeb72 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -101,7 +101,7 @@ static const struct regmap_config ina238_regmap_config = { .val_bits = 16, }; -enum ina238_ids { ina228, ina237, ina238, sq52206 }; +enum ina238_ids { ina228, ina237, ina238, ina780, sq52206 }; struct ina238_config { bool has_20bit_voltage_current; /* vshunt, vbus and current are 20-bit fields */ @@ -155,6 +155,16 @@ static const struct ina238_config ina238_config[] = { .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, .temp_resolution = 12, }, + [ina780] = { + .has_20bit_voltage_current = false, + .has_energy = true, + .has_power_highest = false, + .power_calculate_factor = 20, + .config_default = INA238_CONFIG_DEFAULT, + .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, + .temp_resolution = 12, + .current_lsb = 2400, + }, [sq52206] = { .has_20bit_voltage_current = false, .has_energy = true, @@ -836,6 +846,7 @@ static const struct i2c_device_id ina238_id[] = { { "ina228", ina228 }, { "ina237", ina237 }, { "ina238", ina238 }, + { "ina780", ina780 }, { "sq52206", sq52206 }, { } }; @@ -854,6 +865,10 @@ static const struct of_device_id __maybe_unused ina238_of_match[] = { .compatible = "ti,ina238", .data = (void *)ina238 }, + { + .compatible = "ti,ina780", + .data = (void *)ina780 + }, { .compatible = "silergy,sq52206", .data = (void *)sq52206 From 7942ca9a475115b9668e53c2d9cfb57a51d62e04 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 1 Sep 2025 09:02:07 -0700 Subject: [PATCH 1067/3206] dt-bindings: hwmon: ti,ina2xx: Add INA700 Add a compatible string for INA700. The chip is register compatible with INA780 but implements different ADC ranges and thus needs a separate compatible entry. Cc: Christian Kahr Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Reviewed-by: Krzysztof Kozlowski Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml index 8b491be9c49dbc..d3cde89366866b 100644 --- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml +++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml @@ -32,6 +32,7 @@ properties: - ti,ina237 - ti,ina238 - ti,ina260 + - ti,ina700 - ti,ina780 reg: @@ -115,6 +116,7 @@ allOf: - ti,ina237 - ti,ina238 - ti,ina260 + - ti,ina700 - ti,ina780 then: properties: @@ -133,6 +135,7 @@ - ti,ina230 - ti,ina231 - ti,ina260 + - ti,ina700 - ti,ina780 then: properties: @@ -143,6 +146,7 @@ compatible: contains: enum: + - ti,ina700 - ti,ina780 then: properties: From a1d5f8ecb934066220b4cb1b8ba831a8726bf52f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 31 Aug 2025 17:54:16 -0700 Subject: [PATCH 1068/3206] hwmon: (ina238) Add support for INA700 INA700 is register compatible with INA780 but has different current, power, and energy LSB values. While the chip does not directly report the shunt voltage, report it anyway by calculating its value from the current register.
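To make the derived reading concrete, a worked example with illustrative numbers, assuming the 480 uA/LSB current LSB added below and the fixed 5 uV/LSB shunt scale used for parts with a built-in shunt:

	/*
	 * Illustration only: a CURRENT register reading of 1000 is
	 * reported as 1000 * 480 uA = 480 mA on curr1_input, and the
	 * derived in1_input shunt voltage as 1000 * 5 uV = 5 mV.
	 */
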
Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Cc: Christian Kahr Signed-off-by: Guenter Roeck --- Documentation/hwmon/ina238.rst | 9 +++++++-- drivers/hwmon/Kconfig | 6 +++--- drivers/hwmon/ina238.c | 17 ++++++++++++++++- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/Documentation/hwmon/ina238.rst b/Documentation/hwmon/ina238.rst index 72276096182150..43950d1ec551f7 100644 --- a/Documentation/hwmon/ina238.rst +++ b/Documentation/hwmon/ina238.rst @@ -32,6 +32,11 @@ Supported chips: Datasheet: https://www.ti.com/lit/gpn/ina238 + * Texas Instruments INA700 + + Datasheet: + https://www.ti.com/product/ina700 + * Texas Instruments INA780 Datasheet: @@ -61,8 +66,8 @@ INA237 is a functionally equivalent variant of INA238 with slightly different accuracy. INA228 is another variant of INA238 with higher ADC resolution. This chip also reports the energy. -INA780 is a variant of the chip series with built-in shunt resistor. -It also reports the energy. +INA700 and INA780 are variants of the chip series with built-in shunt resistor. +They also report the energy. SQ52206 is a mostly compatible chip from Sylergy. It reports the energy as well as the peak power consumption. diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index c044c26fbd114d..8a41275ca518b1 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -2257,9 +2257,9 @@ config SENSORS_INA238 select REGMAP_I2C help If you say yes here you get support for INA228, INA237, INA238, - INA780, and SQ52206 power monitor chips. This driver supports voltage, - current, power, energy, and temperature measurements as well as alarm - configuration. + INA700, INA780, and SQ52206 power monitor chips. This driver supports + voltage, current, power, energy, and temperature measurements as well + as alarm configuration. This driver can also be built as a module. If so, the module will be called ina238. 
diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 98255619adeb72..356d19b7675cac 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -101,7 +101,7 @@ static const struct regmap_config ina238_regmap_config = { .val_bits = 16, }; -enum ina238_ids { ina228, ina237, ina238, ina780, sq52206 }; +enum ina238_ids { ina228, ina237, ina238, ina700, ina780, sq52206 }; struct ina238_config { bool has_20bit_voltage_current; /* vshunt, vbus and current are 20-bit fields */ @@ -155,6 +155,16 @@ static const struct ina238_config ina238_config[] = { .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, .temp_resolution = 12, }, + [ina700] = { + .has_20bit_voltage_current = false, + .has_energy = true, + .has_power_highest = false, + .power_calculate_factor = 20, + .config_default = INA238_CONFIG_DEFAULT, + .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, + .temp_resolution = 12, + .current_lsb = 480, + }, [ina780] = { .has_20bit_voltage_current = false, .has_energy = true, @@ -846,6 +856,7 @@ static const struct i2c_device_id ina238_id[] = { { "ina228", ina228 }, { "ina237", ina237 }, { "ina238", ina238 }, + { "ina700", ina700 }, { "ina780", ina780 }, { "sq52206", sq52206 }, { } @@ -865,6 +876,10 @@ static const struct of_device_id __maybe_unused ina238_of_match[] = { .compatible = "ti,ina238", .data = (void *)ina238 }, + { + .compatible = "ti,ina700", + .data = (void *)ina700 + }, { .compatible = "ti,ina780", .data = (void *)ina780 From 55cb81254333ab90985a9e861d33c6f8e1cf0118 Mon Sep 17 00:00:00 2001 From: Maciej Zonski Date: Sat, 6 Sep 2025 18:17:37 +0200 Subject: [PATCH 1069/3206] hwmon: (asus-ec-sensors) add ROG STRIX X870-I GAMING WIFI Add support for ROG STRIX X870-I GAMING WIFI Signed-off-by: Maciej Zonski Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250906161748.219567-1-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index baf9eba5957c52..6a9c5f30e016a3 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -34,6 +34,7 @@ Supported boards: * ROG STRIX X570-F GAMING * ROG STRIX X570-I GAMING * ROG STRIX X670E-I GAMING WIFI + * ROG STRIX X870-I GAMING WIFI * ROG STRIX Z390-F GAMING * ROG STRIX Z490-F GAMING * ROG STRIX Z690-A GAMING WIFI D4 diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index f580ff2e212f2f..32d4dd26aa84da 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -621,6 +621,13 @@ static const struct ec_board_info board_info_strix_x670e_i_gaming_wifi = { .family = family_amd_600_series, }; +static const struct ec_board_info board_info_strix_x870_i_gaming_wifi = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0, + .family = family_amd_800_series, +}; + static const struct ec_board_info board_info_strix_z390_f_gaming = { .sensors = SENSOR_TEMP_CHIPSET | SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR | @@ -759,6 +766,8 @@ static const struct dmi_system_id dmi_table[] = { &board_info_strix_x570_i_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X670E-I GAMING WIFI", &board_info_strix_x670e_i_gaming_wifi), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X870-I GAMING WIFI", + &board_info_strix_x870_i_gaming_wifi), 
DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z390-F GAMING", &board_info_strix_z390_f_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z490-F GAMING", From 552e369db339c4d9f29a5569ec2d661cb4353f40 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 7 Sep 2025 12:20:20 +0200 Subject: [PATCH 1070/3206] hwmon: (nzxt-smart2) Use devm_mutex_init() Use devm_mutex_init() instead of hand-writing it. This saves some LoC, improves readability and saves some space in the generated .o file. Before: ====== text data bss dec hex filename 25878 11329 128 37335 91d7 drivers/hwmon/nzxt-smart2.o After: ===== text data bss dec hex filename 25551 11257 128 36936 9048 drivers/hwmon/nzxt-smart2.o Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/f51fac0871ec7dbe4e28447ee4f774d028a53426.1757240403.git.christophe.jaillet@wanadoo.fr Signed-off-by: Guenter Roeck --- drivers/hwmon/nzxt-smart2.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/hwmon/nzxt-smart2.c b/drivers/hwmon/nzxt-smart2.c index c2d1173f42fefb..58ef9fa0184be4 100644 --- a/drivers/hwmon/nzxt-smart2.c +++ b/drivers/hwmon/nzxt-smart2.c @@ -721,11 +721,6 @@ static int __maybe_unused nzxt_smart2_hid_reset_resume(struct hid_device *hdev) return init_device(drvdata, drvdata->update_interval); } -static void mutex_fini(void *lock) -{ - mutex_destroy(lock); -} - static int nzxt_smart2_hid_probe(struct hid_device *hdev, const struct hid_device_id *id) { @@ -741,8 +736,7 @@ static int nzxt_smart2_hid_probe(struct hid_device *hdev, init_waitqueue_head(&drvdata->wq); - mutex_init(&drvdata->mutex); - ret = devm_add_action_or_reset(&hdev->dev, mutex_fini, &drvdata->mutex); + ret = devm_mutex_init(&hdev->dev, &drvdata->mutex); if (ret) return ret; From c97c66e04c2294d59827835e6fd46e0e4a5e2c06 Mon Sep 17 00:00:00 2001 From: Wensheng Wang Date: Tue, 5 Aug 2025 18:20:18 +0800 Subject: [PATCH 1071/3206] dt-bindings: hwmon: Add MPS mp2869,mp29608,mp29612,mp29816 and mp29502 Add support for MPS mp2869/mp2869a,mp29608/mp29608a,mp29612/mp29612a, mp29816/mp29816a/mp29816b/mp29816c and mp29502 controller. Acked-by: Rob Herring (Arm) Signed-off-by: Wensheng Wang Link: https://lore.kernel.org/r/20250805102020.749850-1-wenswang@yeah.net Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/trivial-devices.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index f3dd18681aa6f8..0e6ba6e12a639a 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -293,10 +293,20 @@ properties: - mps,mp2856 # Monolithic Power Systems Inc. multi-phase controller mp2857 - mps,mp2857 + # Monolithic Power Systems Inc. multi-phase controller mp2869 + - mps,mp2869 # Monolithic Power Systems Inc. multi-phase controller mp2888 - mps,mp2888 # Monolithic Power Systems Inc. multi-phase controller mp2891 - mps,mp2891 + # Monolithic Power Systems Inc. multi-phase controller mp29502 + - mps,mp29502 + # Monolithic Power Systems Inc. multi-phase controller mp29608 + - mps,mp29608 + # Monolithic Power Systems Inc. multi-phase controller mp29612 + - mps,mp29612 + # Monolithic Power Systems Inc. multi-phase controller mp29816 + - mps,mp29816 # Monolithic Power Systems Inc. multi-phase controller mp2993 - mps,mp2993 # Monolithic Power Systems Inc. 
hot-swap protection device From a3a2923aaf7f2cf3aaa4649bddee2f936751825f Mon Sep 17 00:00:00 2001 From: Wensheng Wang Date: Tue, 5 Aug 2025 18:20:19 +0800 Subject: [PATCH 1072/3206] hwmon: add MP2869,MP29608,MP29612 and MP29816 series driver Add support for MPS VR mp2869/mp2869a, mp29608/mp29608a, mp29612/mp29612a and mp29816/mp29816a/mp29816b/mp29816c controllers. This driver exposes telemetry readings and supports reading and writing limit values. Signed-off-by: Wensheng Wang Link: https://lore.kernel.org/r/20250805102020.749850-2-wenswang@yeah.net Signed-off-by: Guenter Roeck --- Documentation/hwmon/index.rst | 1 + Documentation/hwmon/mp2869.rst | 167 +++++++++ MAINTAINERS | 7 + drivers/hwmon/pmbus/Kconfig | 9 + drivers/hwmon/pmbus/Makefile | 1 + drivers/hwmon/pmbus/mp2869.c | 659 +++++++++++++++++++++++++++++++++ 6 files changed, 844 insertions(+) create mode 100644 Documentation/hwmon/mp2869.rst create mode 100644 drivers/hwmon/pmbus/mp2869.c diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index d292a86ac5da90..36303148dc43b2 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -173,6 +173,7 @@ Hardware Monitoring Kernel Drivers menf21bmc mlxreg-fan mp2856 + mp2869 mp2888 mp2891 mp2975 diff --git a/Documentation/hwmon/mp2869.rst b/Documentation/hwmon/mp2869.rst new file mode 100644 index 00000000000000..2d9d65fc86b6a1 --- /dev/null +++ b/Documentation/hwmon/mp2869.rst @@ -0,0 +1,167 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Kernel driver mp2869 +==================== + +Supported chips: + + * MPS mp2869 + + Prefix: 'mp2869' + + * MPS mp29608 + + Prefix: 'mp29608' + + * MPS mp29612 + + Prefix: 'mp29612' + + * MPS mp29816 + + Prefix: 'mp29816' + +Author: + + Wensheng Wang + +Description +----------- + +This driver implements support for Monolithic Power Systems, Inc. (MPS) +MP2869 Dual Loop Digital Multi-phase Controller. + +Device compliant with: + +- PMBus rev 1.3 interface.
+ +The driver exports the following attributes via the 'sysfs' files +for input voltage: + +**in1_input** + +**in1_label** + +**in1_crit** + +**in1_crit_alarm** + +**in1_lcrit** + +**in1_lcrit_alarm** + +**in1_min** + +**in1_min_alarm** + +The driver provides the following attributes for output voltage: + +**in2_input** + +**in2_label** + +**in2_crit** + +**in2_crit_alarm** + +**in2_lcrit** + +**in2_lcrit_alarm** + +**in3_input** + +**in3_label** + +**in3_crit** + +**in3_crit_alarm** + +**in3_lcrit** + +**in3_lcrit_alarm** + +The driver provides the following attributes for input current: + +**curr1_input** + +**curr1_label** + +**curr2_input** + +**curr2_label** + +The driver provides the following attributes for output current: + +**curr3_input** + +**curr3_label** + +**curr3_crit** + +**curr3_crit_alarm** + +**curr3_max** + +**curr3_max_alarm** + +**curr4_input** + +**curr4_label** + +**curr4_crit** + +**curr4_crit_alarm** + +**curr4_max** + +**curr4_max_alarm** + +The driver provides the following attributes for input power: + +**power1_input** + +**power1_label** + +**power2_input** + +**power2_label** + +The driver provides the following attributes for output power: + +**power3_input** + +**power3_label** + +**power3_max** + +**power3_max_alarm** + +**power4_input** + +**power4_label** + +**power4_max** + +**power4_max_alarm** + +The driver provides the following attributes for temperature: + +**temp1_input** + +**temp1_crit** + +**temp1_crit_alarm** + +**temp1_max** + +**temp1_max_alarm** + +**temp2_input** + +**temp2_crit** + +**temp2_crit_alarm** + +**temp2_max** + +**temp2_max_alarm** diff --git a/MAINTAINERS b/MAINTAINERS index b392d34deb6272..2f99cd97d62844 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17175,6 +17175,13 @@ S: Maintained F: Documentation/devicetree/bindings/leds/backlight/mps,mp3309c.yaml F: drivers/video/backlight/mp3309c.c +MPS MP2869 DRIVER +M: Wensheng Wang +L: linux-hwmon@vger.kernel.org +S: Maintained +F: Documentation/hwmon/mp2869.rst +F: drivers/hwmon/pmbus/mp2869.c + MPS MP2891 DRIVER M: Noah Wang L: linux-hwmon@vger.kernel.org diff --git a/drivers/hwmon/pmbus/Kconfig b/drivers/hwmon/pmbus/Kconfig index 77add0c6ee53a4..c5b2c5e9573760 100644 --- a/drivers/hwmon/pmbus/Kconfig +++ b/drivers/hwmon/pmbus/Kconfig @@ -374,6 +374,15 @@ config SENSORS_MP2856 This driver can also be built as a module. If so, the module will be called mp2856. +config SENSORS_MP2869 + tristate "MPS MP2869" + help + If you say yes here you get hardware monitoring support for MPS + MP2869 Dual Loop Digital Multi-Phase Controller. + + This driver can also be built as a module. If so, the module will + be called mp2869.
+ config SENSORS_MP2888 tristate "MPS MP2888" help diff --git a/drivers/hwmon/pmbus/Makefile b/drivers/hwmon/pmbus/Makefile index 29cd8a3317d29f..6177047414ee8a 100644 --- a/drivers/hwmon/pmbus/Makefile +++ b/drivers/hwmon/pmbus/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_SENSORS_MAX31785) += max31785.o obj-$(CONFIG_SENSORS_MAX34440) += max34440.o obj-$(CONFIG_SENSORS_MAX8688) += max8688.o obj-$(CONFIG_SENSORS_MP2856) += mp2856.o +obj-$(CONFIG_SENSORS_MP2869) += mp2869.o obj-$(CONFIG_SENSORS_MP2888) += mp2888.o obj-$(CONFIG_SENSORS_MP2891) += mp2891.o obj-$(CONFIG_SENSORS_MP2975) += mp2975.o diff --git a/drivers/hwmon/pmbus/mp2869.c b/drivers/hwmon/pmbus/mp2869.c new file mode 100644 index 00000000000000..cc69a1e91dfe8a --- /dev/null +++ b/drivers/hwmon/pmbus/mp2869.c @@ -0,0 +1,659 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Hardware monitoring driver for MPS Multi-phase Digital VR Controllers (MP2869) + */ + +#include <linux/bitfield.h> +#include <linux/i2c.h> +#include <linux/module.h> +#include <linux/of_device.h> +#include "pmbus.h" + +/* + * Vendor-specific registers: the registers MFR_SVI3_IOUT_PRT(0x67), + * READ_PIN_EST(0x94) and READ_IIN_EST(0x95) redefine the standard + * PMBus registers. The MFR_VOUT_LOOP_CTRL(0x29) is used to identify + * the vout scale and the MFR_SVI3_IOUT_PRT(0x67) is used to identify + * the iout scale. The READ_PIN_EST(0x94) is used to read input power + * per rail. The MP2869 does not have the standard READ_IIN register (0x89); + * the iin telemetry can be obtained through the vendor-redefined + * register READ_IIN_EST(0x95). + */ +#define MFR_SVI3_IOUT_PRT 0x67 +#define MFR_READ_PIN_EST 0x94 +#define MFR_READ_IIN_EST 0x95 +#define MFR_TSNS_FLT_SET 0xBB + +#define MP2869_VIN_OV_FAULT_GAIN 4 +#define MP2869_READ_VOUT_DIV 1024 +#define MP2869_READ_IOUT_DIV 32 +#define MP2869_OVUV_LIMIT_SCALE 10 +#define MP2869_OVUV_DELTA_SCALE 50 +#define MP2869_TEMP_LIMIT_OFFSET 40 +#define MP2869_IOUT_LIMIT_UNIT 8 +#define MP2869_POUT_OP_GAIN 2 + +#define MP2869_PAGE_NUM 2 + +#define MP2869_RAIL1_FUNC (PMBUS_HAVE_VIN | PMBUS_HAVE_VOUT | \ + PMBUS_HAVE_IOUT | PMBUS_HAVE_POUT | \ + PMBUS_HAVE_TEMP | PMBUS_HAVE_PIN | \ + PMBUS_HAVE_IIN | \ + PMBUS_HAVE_STATUS_VOUT | \ + PMBUS_HAVE_STATUS_IOUT | \ + PMBUS_HAVE_STATUS_TEMP | \ + PMBUS_HAVE_STATUS_INPUT) + +#define MP2869_RAIL2_FUNC (PMBUS_HAVE_VOUT | PMBUS_HAVE_IOUT | \ + PMBUS_HAVE_POUT | PMBUS_HAVE_TEMP | \ + PMBUS_HAVE_PIN | PMBUS_HAVE_IIN | \ + PMBUS_HAVE_STATUS_VOUT | \ + PMBUS_HAVE_STATUS_IOUT | \ + PMBUS_HAVE_STATUS_TEMP | \ + PMBUS_HAVE_STATUS_INPUT) + +struct mp2869_data { + struct pmbus_driver_info info; + bool mfr_thwn_flt_en; + int vout_scale[MP2869_PAGE_NUM]; + int iout_scale[MP2869_PAGE_NUM]; +}; + +static const int mp2869_vout_scale[8] = {6400, 5120, 2560, 2048, 1024, + 4, 2, 1}; +static const int mp2869_iout_scale[8] = {32, 1, 2, 4, 8, 16, 32, 64}; + +#define to_mp2869_data(x) container_of(x, struct mp2869_data, info) + +static u16 mp2869_reg2data_linear11(u16 word) +{ + s16 exponent; + s32 mantissa; + s64 val; + + exponent = ((s16)word) >> 11; + mantissa = ((s16)((word & 0x7ff) << 5)) >> 5; + val = mantissa; + + if (exponent >= 0) + val <<= exponent; + else + val >>= -exponent; + + return val; +} + +static int +mp2869_identify_thwn_flt(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_TSNS_FLT_SET); + if (ret < 0) + return ret; + + data->mfr_thwn_flt_en =
FIELD_GET(GENMASK(13, 13), ret); + + return 0; +} + +static int +mp2869_identify_vout_scale(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, PMBUS_VOUT_SCALE_LOOP); + if (ret < 0) + return ret; + + /* + * The output voltage is equal to the READ_VOUT(0x8B) register value multiplied + * by vout_scale. + * Obtain the vout scale from the register PMBUS_VOUT_SCALE_LOOP, bits 12-10 + * PMBUS_VOUT_SCALE_LOOP[12:10]: + * 000b - 6.25mV/LSB, 001b - 5mV/LSB, 010b - 2.5mV/LSB, 011b - 2mV/LSB + * 100b - 1mV/LSB, 101b - (1/256)mV/LSB, 110b - (1/512)mV/LSB, + * 111b - (1/1024)mV/LSB + */ + data->vout_scale[page] = mp2869_vout_scale[FIELD_GET(GENMASK(12, 10), ret)]; + + return 0; +} + +static int +mp2869_identify_iout_scale(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_SVI3_IOUT_PRT); + if (ret < 0) + return ret; + + /* + * The output current is equal to the READ_IOUT(0x8C) register value + * multiplied by iout_scale. + * Obtain iout_scale from the register MFR_SVI3_IOUT_PRT[2:0]. + * The value is selected as below: + * 000b - 1A/LSB, 001b - (1/32)A/LSB, 010b - (1/16)A/LSB, + * 011b - (1/8)A/LSB, 100b - (1/4)A/LSB, 101b - (1/2)A/LSB + * 110b - 1A/LSB, 111b - 2A/LSB + */ + data->iout_scale[page] = mp2869_iout_scale[FIELD_GET(GENMASK(2, 0), ret)]; + + return 0; +} + +static int mp2869_read_byte_data(struct i2c_client *client, int page, int reg) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + switch (reg) { + case PMBUS_VOUT_MODE: + /* + * The calculation of vout in this driver is based on direct format. + * As a result, the format of vout is enforced to direct. + */ + ret = PB_VOUT_MODE_DIRECT; + break; + case PMBUS_STATUS_BYTE: + /* + * If the tsns digital fault is enabled, the TEMPERATURE flag + * of PMBUS_STATUS_BYTE should come from STATUS_MFR_SPECIFIC + * register bit1. + */ + if (!data->mfr_thwn_flt_en) + return -ENODATA; + + ret = pmbus_read_byte_data(client, page, reg); + if (ret < 0) + return ret; + + ret = (ret & ~GENMASK(2, 2)) | + FIELD_PREP(GENMASK(2, 2), + FIELD_GET(GENMASK(1, 1), + pmbus_read_byte_data(client, page, + PMBUS_STATUS_MFR_SPECIFIC))); + break; + case PMBUS_STATUS_TEMPERATURE: + /* + * If the tsns digital fault is enabled, the OT Fault and OT Warning + * flags of PMBUS_STATUS_TEMPERATURE should come from STATUS_MFR_SPECIFIC + * register bit1.
+ */ + if (!data->mfr_thwn_flt_en) + return -ENODATA; + + ret = pmbus_read_byte_data(client, page, reg); + if (ret < 0) + return ret; + + ret = (ret & ~GENMASK(7, 6)) | + FIELD_PREP(GENMASK(6, 6), + FIELD_GET(GENMASK(1, 1), + pmbus_read_byte_data(client, page, + PMBUS_STATUS_MFR_SPECIFIC))) | + FIELD_PREP(GENMASK(7, 7), + FIELD_GET(GENMASK(1, 1), + pmbus_read_byte_data(client, page, + PMBUS_STATUS_MFR_SPECIFIC))); + break; + default: + ret = -ENODATA; + break; + } + + return ret; +} + +static int mp2869_read_word_data(struct i2c_client *client, int page, int phase, + int reg) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + switch (reg) { + case PMBUS_STATUS_WORD: + /* + * If the tsns digital fault is enabled, the OT Fault flag + * of PMBUS_STATUS_WORD should come from STATUS_MFR_SPECIFIC + * register bit1. + */ + if (!data->mfr_thwn_flt_en) + return -ENODATA; + + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = (ret & ~GENMASK(2, 2)) | + FIELD_PREP(GENMASK(2, 2), + FIELD_GET(GENMASK(1, 1), + pmbus_read_byte_data(client, page, + PMBUS_STATUS_MFR_SPECIFIC))); + break; + case PMBUS_READ_VIN: + /* + * The MP2869 PMBUS_READ_VIN[10:0] is the vin value, and the vin scale is + * 31.25mV/LSB. The vin scale is also set to 31.25mV/LSB (using the r/m/b scale) + * in the MP2869 pmbus_driver_info struct, so the word data bit0-bit10 can be + * returned to the pmbus core directly. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(10, 0), ret); + break; + case PMBUS_READ_IIN: + /* + * The MP2869 redefines the standard 0x95 register as iin telemetry + * per rail. + */ + ret = pmbus_read_word_data(client, page, phase, MFR_READ_IIN_EST); + if (ret < 0) + return ret; + + break; + case PMBUS_READ_PIN: + /* + * The MP2869 redefines the standard 0x94 register as pin telemetry + * per rail. The MP2869 MFR_READ_PIN_EST register is in linear11 format, + * but the pin scale is set to 1W/LSB (using the r/m/b scale). As a result, + * the pin value read from the MP2869 should be converted to W before + * returning the result to the pmbus core. + */ + ret = pmbus_read_word_data(client, page, phase, MFR_READ_PIN_EST); + if (ret < 0) + return ret; + + ret = mp2869_reg2data_linear11(ret); + break; + case PMBUS_READ_VOUT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(11, 0)) * data->vout_scale[page], + MP2869_READ_VOUT_DIV); + break; + case PMBUS_READ_IOUT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(10, 0)) * data->iout_scale[page], + MP2869_READ_IOUT_DIV); + break; + case PMBUS_READ_POUT: + /* + * The MP2869 PMBUS_READ_POUT register is in linear11 format, but the pout + * scale is set to 1W/LSB (using the r/m/b scale). As a result, the pout + * value read from the MP2869 should be converted to W before returning + * the result to the pmbus + * core.
+ */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = mp2869_reg2data_linear11(ret); + break; + case PMBUS_READ_TEMPERATURE_1: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(10, 0), ret); + break; + case PMBUS_VOUT_OV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + if (FIELD_GET(GENMASK(12, 9), ret)) + ret = FIELD_GET(GENMASK(8, 0), ret) * MP2869_OVUV_LIMIT_SCALE + + (FIELD_GET(GENMASK(12, 9), ret) + 1) * MP2869_OVUV_DELTA_SCALE; + else + ret = FIELD_GET(GENMASK(8, 0), ret) * MP2869_OVUV_LIMIT_SCALE; + break; + case PMBUS_VOUT_UV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + if (FIELD_GET(GENMASK(12, 9), ret)) + ret = FIELD_GET(GENMASK(8, 0), ret) * MP2869_OVUV_LIMIT_SCALE - + (FIELD_GET(GENMASK(12, 9), ret) + 1) * MP2869_OVUV_DELTA_SCALE; + else + ret = FIELD_GET(GENMASK(8, 0), ret) * MP2869_OVUV_LIMIT_SCALE; + break; + case PMBUS_OT_FAULT_LIMIT: + case PMBUS_OT_WARN_LIMIT: + /* + * The scale of the MP2869 PMBUS_OT_FAULT_LIMIT and PMBUS_OT_WARN_LIMIT + * is 1°C/LSB, and they have a 40°C offset. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = (ret & GENMASK(7, 0)) - MP2869_TEMP_LIMIT_OFFSET; + break; + case PMBUS_VIN_OV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = (ret & GENMASK(7, 0)) * MP2869_VIN_OV_FAULT_GAIN; + break; + case PMBUS_VIN_UV_WARN_LIMIT: + case PMBUS_VIN_UV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(9, 0), ret); + break; + case PMBUS_IOUT_OC_FAULT_LIMIT: + case PMBUS_IOUT_OC_WARN_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(7, 0)) * data->iout_scale[page] * + MP2869_IOUT_LIMIT_UNIT, MP2869_READ_IOUT_DIV); + break; + case PMBUS_POUT_OP_WARN_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = (ret & GENMASK(7, 0)) * MP2869_POUT_OP_GAIN; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mp2869_write_word_data(struct i2c_client *client, int page, int reg, + u16 word) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + switch (reg) { + case PMBUS_VOUT_UV_FAULT_LIMIT: + /* + * The MP2869 PMBUS_VOUT_UV_FAULT_LIMIT[8:0] is the limit value, + * and bit9-bit15 should not be changed. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + if (FIELD_GET(GENMASK(12, 9), ret)) + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(8, 0)) | + FIELD_PREP(GENMASK(8, 0), + DIV_ROUND_CLOSEST(word + + (FIELD_GET(GENMASK(12, 9), + ret) + 1) * + MP2869_OVUV_DELTA_SCALE, + MP2869_OVUV_LIMIT_SCALE))); + else + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(8, 0)) | + FIELD_PREP(GENMASK(8, 0), + DIV_ROUND_CLOSEST(word, + MP2869_OVUV_LIMIT_SCALE))); + break; + case PMBUS_VOUT_OV_FAULT_LIMIT: + /* + * The MP2869 PMBUS_VOUT_OV_FAULT_LIMIT[8:0] is the limit value, + * and bit9-bit15 should not be changed.
+ */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + if (FIELD_GET(GENMASK(12, 9), ret)) + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(8, 0)) | + FIELD_PREP(GENMASK(8, 0), + DIV_ROUND_CLOSEST(word - + (FIELD_GET(GENMASK(12, 9), + ret) + 1) * + MP2869_OVUV_DELTA_SCALE, + MP2869_OVUV_LIMIT_SCALE))); + else + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(8, 0)) | + FIELD_PREP(GENMASK(8, 0), + DIV_ROUND_CLOSEST(word, + MP2869_OVUV_LIMIT_SCALE))); + break; + case PMBUS_OT_FAULT_LIMIT: + case PMBUS_OT_WARN_LIMIT: + /* + * If the tsns digital fault is enabled, the PMBUS_OT_FAULT_LIMIT and + * PMBUS_OT_WARN_LIMIT cannot be written. + */ + if (data->mfr_thwn_flt_en) + return -EINVAL; + + /* + * The scale of the MP2869 PMBUS_OT_FAULT_LIMIT and PMBUS_OT_WARN_LIMIT + * has a 40°C offset. Bit0-bit7 is the limit value, and bit8-bit15 + * should not be changed. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(7, 0)) | + FIELD_PREP(GENMASK(7, 0), + word + MP2869_TEMP_LIMIT_OFFSET)); + break; + case PMBUS_VIN_OV_FAULT_LIMIT: + /* + * The MP2869 PMBUS_VIN_OV_FAULT_LIMIT[7:0] is the limit value, and bit8-bit15 + * should not be changed. The scale of PMBUS_VIN_OV_FAULT_LIMIT is 125mV/LSB, + * but the vin scale is set to 31.25mV/LSB (using the r/m/b scale), so the word + * data should be divided by MP2869_VIN_OV_FAULT_GAIN (4). + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(7, 0)) | + FIELD_PREP(GENMASK(7, 0), + DIV_ROUND_CLOSEST(word, + MP2869_VIN_OV_FAULT_GAIN))); + break; + case PMBUS_VIN_UV_WARN_LIMIT: + case PMBUS_VIN_UV_FAULT_LIMIT: + /* + * The PMBUS_VIN_UV_LIMIT[9:0] is the limit value, and bit10-bit15 should + * not be changed. The scale of PMBUS_VIN_UV_LIMIT is 31.25mV/LSB, and the + * vin scale is set to 31.25mV/LSB (using the r/m/b scale), so the word data + * can be written directly. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(9, 0)) | + FIELD_PREP(GENMASK(9, 0), + word)); + break; + case PMBUS_IOUT_OC_FAULT_LIMIT: + case PMBUS_IOUT_OC_WARN_LIMIT: + ret = pmbus_write_word_data(client, page, reg, + DIV_ROUND_CLOSEST(word * MP2869_READ_IOUT_DIV, + MP2869_IOUT_LIMIT_UNIT * + data->iout_scale[page])); + break; + case PMBUS_POUT_OP_WARN_LIMIT: + /* + * The POUT_OP_WARN_LIMIT[11:0] is the limit value, and bit12-bit15 should + * not be changed. The scale of POUT_OP_WARN_LIMIT is 2W/LSB. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(11, 0)) | + FIELD_PREP(GENMASK(11, 0), + DIV_ROUND_CLOSEST(word, + MP2869_POUT_OP_GAIN))); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mp2869_identify(struct i2c_client *client, struct pmbus_driver_info *info) +{ + int ret; + + /* Identify whether the tsns digital fault is enabled */ + ret = mp2869_identify_thwn_flt(client, info, 1); + if (ret < 0) + return 0; + + /* Identify vout scale for rail1. */ + ret = mp2869_identify_vout_scale(client, info, 0); + if (ret < 0) + return ret; + + /* Identify vout scale for rail2.
*/ + ret = mp2869_identify_vout_scale(client, info, 1); + if (ret < 0) + return ret; + + /* Identify iout scale for rail 1. */ + ret = mp2869_identify_iout_scale(client, info, 0); + if (ret < 0) + return ret; + + /* Identify iout scale for rail 2. */ + return mp2869_identify_iout_scale(client, info, 1); +} + +static const struct pmbus_driver_info mp2869_info = { + .pages = MP2869_PAGE_NUM, + .format[PSC_VOLTAGE_IN] = direct, + .format[PSC_CURRENT_IN] = linear, + .format[PSC_CURRENT_OUT] = direct, + .format[PSC_TEMPERATURE] = direct, + .format[PSC_POWER] = direct, + .format[PSC_VOLTAGE_OUT] = direct, + + .m[PSC_VOLTAGE_IN] = 32, + .R[PSC_VOLTAGE_IN] = 0, + .b[PSC_VOLTAGE_IN] = 0, + + .m[PSC_VOLTAGE_OUT] = 1, + .R[PSC_VOLTAGE_OUT] = 3, + .b[PSC_VOLTAGE_OUT] = 0, + + .m[PSC_CURRENT_OUT] = 1, + .R[PSC_CURRENT_OUT] = 0, + .b[PSC_CURRENT_OUT] = 0, + + .m[PSC_TEMPERATURE] = 1, + .R[PSC_TEMPERATURE] = 0, + .b[PSC_TEMPERATURE] = 0, + + .m[PSC_POWER] = 1, + .R[PSC_POWER] = 0, + .b[PSC_POWER] = 0, + + .func[0] = MP2869_RAIL1_FUNC, + .func[1] = MP2869_RAIL2_FUNC, + .read_word_data = mp2869_read_word_data, + .write_word_data = mp2869_write_word_data, + .read_byte_data = mp2869_read_byte_data, + .identify = mp2869_identify, +}; + +static int mp2869_probe(struct i2c_client *client) +{ + struct pmbus_driver_info *info; + struct mp2869_data *data; + + data = devm_kzalloc(&client->dev, sizeof(struct mp2869_data), + GFP_KERNEL); + if (!data) + return -ENOMEM; + + memcpy(&data->info, &mp2869_info, sizeof(*info)); + info = &data->info; + + return pmbus_do_probe(client, info); +} + +static const struct i2c_device_id mp2869_id[] = { + {"mp2869", 0}, + {"mp29608", 1}, + {"mp29612", 2}, + {"mp29816", 3}, + {} +}; +MODULE_DEVICE_TABLE(i2c, mp2869_id); + +static const struct of_device_id __maybe_unused mp2869_of_match[] = { + {.compatible = "mps,mp2869", .data = (void *)0}, + {.compatible = "mps,mp29608", .data = (void *)1}, + {.compatible = "mps,mp29612", .data = (void *)2}, + {.compatible = "mps,mp29816", .data = (void *)3}, + {} +}; +MODULE_DEVICE_TABLE(of, mp2869_of_match); + +static struct i2c_driver mp2869_driver = { + .driver = { + .name = "mp2869", + .of_match_table = mp2869_of_match, + }, + .probe = mp2869_probe, + .id_table = mp2869_id, +}; + +module_i2c_driver(mp2869_driver); + +MODULE_AUTHOR("Wensheng Wang <wenswang@yeah.net>"); +MODULE_DESCRIPTION("PMBus driver for MPS MP2869"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS("PMBUS"); From ada2e4091d20735537fab67d970ffdbb8e9f7672 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:20:58 +0900 Subject: [PATCH 1073/3206] firewire: ohci: use kcalloc() variant for array allocation When allocating the lists of isochronous context structures, a kzalloc() variant of the managed device API is used, while a kcalloc() variant is available for array allocations. This commit replaces these calls with devm_kcalloc().
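For reference, a minimal sketch of the resulting pattern (the helper name and shape are illustrative, not the literal driver code): devm_kcalloc() takes the element count and the element size separately and fails the allocation cleanly if the multiplication would overflow, which an open-coded sizeof() * n computation cannot detect.

#include <linux/device.h>        /* devm_kcalloc() */
#include <linux/slab.h>          /* GFP_KERNEL */

/* Allocate n zeroed elements whose lifetime is tied to the device. */
static int alloc_iso_contexts(struct device *dev, struct iso_context **list,
                              unsigned int n)
{
        *list = devm_kcalloc(dev, n, sizeof(**list), GFP_KERNEL);
        if (!*list)
                return -ENOMEM;

        return 0;
}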
Link: https://lore.kernel.org/r/20250908012108.514698-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/ohci.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 40851b12061513..030aed5453a17d 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -3482,7 +3482,6 @@ static int pci_probe(struct pci_dev *dev, u32 bus_options, max_receive, link_speed, version; u64 guid; int i, flags, irq, err; - size_t size; if (dev->vendor == PCI_VENDOR_ID_PINNACLE_SYSTEMS) { dev_err(&dev->dev, "Pinnacle MovieBoard is not yet supported\n"); @@ -3576,8 +3575,7 @@ static int pci_probe(struct pci_dev *dev, reg_write(ohci, OHCI1394_IsoRecvIntMaskClear, ~0); ohci->ir_context_mask = ohci->ir_context_support; ohci->n_ir = hweight32(ohci->ir_context_mask); - size = sizeof(struct iso_context) * ohci->n_ir; - ohci->ir_context_list = devm_kzalloc(&dev->dev, size, GFP_KERNEL); + ohci->ir_context_list = devm_kcalloc(&dev->dev, ohci->n_ir, sizeof(struct iso_context), GFP_KERNEL); if (!ohci->ir_context_list) return -ENOMEM; @@ -3591,8 +3589,7 @@ static int pci_probe(struct pci_dev *dev, reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, ~0); ohci->it_context_mask = ohci->it_context_support; ohci->n_it = hweight32(ohci->it_context_mask); - size = sizeof(struct iso_context) * ohci->n_it; - ohci->it_context_list = devm_kzalloc(&dev->dev, size, GFP_KERNEL); + ohci->it_context_list = devm_kcalloc(&dev->dev, ohci->n_it, sizeof(struct iso_context), GFP_KERNEL); if (!ohci->it_context_list) return -ENOMEM; From cce436aafc2abad691fdd37de63ec8a4490b42ce Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:20:59 +0900 Subject: [PATCH 1074/3206] firewire: core: utilize cleanup function to release workqueue in error path The helper macro retain_and_null_ptr(), introduced by commit 092d00ead733 ("cleanup: Provide retain_and_null_ptr()") in the v6.16 kernel, is useful in error paths to release resources held in structure members. This commit uses this relatively new macro. Link: https://lore.kernel.org/r/20250908012108.514698-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 40 ++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index aae774e7a5c3eb..d128c7a8bf5fd9 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -570,9 +570,13 @@ void fw_card_initialize(struct fw_card *card, } EXPORT_SYMBOL(fw_card_initialize); +DEFINE_FREE(workqueue_destroy, struct workqueue_struct *, if (_T) destroy_workqueue(_T)) + int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid, unsigned int supported_isoc_contexts) { + struct workqueue_struct *isoc_wq __free(workqueue_destroy) = NULL; + struct workqueue_struct *async_wq __free(workqueue_destroy) = NULL; int ret; // This workqueue should be: @@ -587,10 +591,10 @@ int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid, // * == WQ_SYSFS Parameters are available via sysfs. // * max_active == n_it + n_ir A hardIRQ could notify events for multiple isochronous // contexts if they are scheduled to the same cycle.
- card->isoc_wq = alloc_workqueue("firewire-isoc-card%u", - WQ_UNBOUND | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS, - supported_isoc_contexts, card->index); - if (!card->isoc_wq) + isoc_wq = alloc_workqueue("firewire-isoc-card%u", + WQ_UNBOUND | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS, + supported_isoc_contexts, card->index); + if (!isoc_wq) return -ENOMEM; // This workqueue should be: @@ -602,14 +606,14 @@ int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid, // * == WQ_SYSFS Parameters are available via sysfs. // * max_active == 4 A hardIRQ could notify events for a pair of requests and // response AR/AT contexts. - card->async_wq = alloc_workqueue("firewire-async-card%u", - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS, - 4, card->index); - if (!card->async_wq) { - ret = -ENOMEM; - goto err_isoc; - } + async_wq = alloc_workqueue("firewire-async-card%u", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS, + 4, card->index); + if (!async_wq) + return -ENOMEM; + card->isoc_wq = isoc_wq; + card->async_wq = async_wq; card->max_receive = max_receive; card->link_speed = link_speed; card->guid = guid; @@ -617,18 +621,18 @@ int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid, scoped_guard(mutex, &card_mutex) { generate_config_rom(card, tmp_config_rom); ret = card->driver->enable(card, tmp_config_rom, config_rom_length); - if (ret < 0) - goto err_async; + if (ret < 0) { + card->isoc_wq = NULL; + card->async_wq = NULL; + return ret; + } + retain_and_null_ptr(isoc_wq); + retain_and_null_ptr(async_wq); list_add_tail(&card->link, &card_list); } return 0; -err_async: - destroy_workqueue(card->async_wq); -err_isoc: - destroy_workqueue(card->isoc_wq); - return ret; } EXPORT_SYMBOL(fw_card_add); From b0c9bfbab925ac6385d4d06a134fd89cadf771fe Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:21:00 +0900 Subject: [PATCH 1075/3206] firewire: ohci: use return value from fw_node_get() The programming pattern of referring to a node right after increasing its reference count is supported by fw_node_get() returning its argument. This commit simplifies the call site accordingly. Link: https://lore.kernel.org/r/20250908012108.514698-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index d128c7a8bf5fd9..41902dcc10a015 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -306,8 +306,7 @@ static void bm_work(struct work_struct *work) generation = card->generation; - root_node = card->root_node; - fw_node_get(root_node); + root_node = fw_node_get(card->root_node); root_device = root_node->data; root_device_is_running = root_device && atomic_read(&root_device->state) == FW_DEVICE_RUNNING; From a2bbb8602dc29a8a3fe4f0377c7b820fba384697 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:21:01 +0900 Subject: [PATCH 1076/3206] firewire: core: add helper functions to access fw_device data in fw_node structure The data member in the fw_node structure is an opaque pointer, while nowadays it is just used to refer to the fw_device associated with the fw_node. This commit redefines the opaque pointer as a pointer to the fw_device structure, and adds some helper functions to set/get it.
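Condensed to its essentials (a sketch with unrelated members elided, drawn from the diff below), the change swaps an untyped pointer for a typed member plus trivial accessors, so the compiler now checks every producer and consumer of the pointer:

struct fw_device;

struct fw_node {
        /* ... topology members elided ... */
        struct fw_device *device;        /* was: void *data */
};

/* Readers no longer depend on an implicit cast from void *... */
static inline struct fw_device *fw_node_get_device(struct fw_node *node)
{
        return node->device;
}

/* ...and writers cannot store anything but a struct fw_device *. */
static inline void fw_node_set_device(struct fw_node *node,
                                      struct fw_device *device)
{
        node->device = device;
}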
Link: https://lore.kernel.org/r/20250908012108.514698-5-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 4 ++-- drivers/firewire/core-device.c | 18 +++++++++--------- drivers/firewire/core.h | 14 ++++++++++++-- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 41902dcc10a015..4a4210cda571db 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -307,12 +307,12 @@ static void bm_work(struct work_struct *work) generation = card->generation; root_node = fw_node_get(card->root_node); - root_device = root_node->data; + root_device = fw_node_get_device(root_node); root_device_is_running = root_device && atomic_read(&root_device->state) == FW_DEVICE_RUNNING; root_device_is_cmc = root_device && root_device->cmc; - irm_device = card->irm_node->data; + irm_device = fw_node_get_device(card->irm_node); irm_is_1394_1995_only = irm_device && irm_device->config_rom && (irm_device->config_rom[2] & 0x000000f0) == 0; diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index aeacd4cfd6944e..6a04a00146942d 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -887,7 +887,7 @@ static void fw_device_release(struct device *dev) * bus manager work looks at this node. */ scoped_guard(spinlock_irqsave, &card->lock) - device->node->data = NULL; + fw_node_set_device(device->node, NULL); fw_node_put(device->node); kfree(device->config_rom); @@ -1007,7 +1007,7 @@ static void fw_device_init(struct work_struct *work) int ret; /* - * All failure paths here set node->data to NULL, so that we + * All failure paths here call fw_node_set_device(node, NULL), so that we * don't try to do device_for_each_child() on a kfree()'d * device. */ @@ -1051,9 +1051,9 @@ static void fw_device_init(struct work_struct *work) struct fw_node *obsolete_node = reused->node; device->node = obsolete_node; - device->node->data = device; + fw_node_set_device(device->node, device); reused->node = current_node; - reused->node->data = reused; + fw_node_set_device(reused->node, reused); reused->max_speed = device->max_speed; reused->node_id = current_node->node_id; @@ -1292,7 +1292,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) * FW_NODE_UPDATED callbacks can update the node_id * and generation for the device. */ - node->data = device; + fw_node_set_device(node, device); /* * Many devices are slow to respond after bus resets, @@ -1307,7 +1307,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) case FW_NODE_INITIATED_RESET: case FW_NODE_LINK_ON: - device = node->data; + device = fw_node_get_device(node); if (device == NULL) goto create; @@ -1324,7 +1324,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) break; case FW_NODE_UPDATED: - device = node->data; + device = fw_node_get_device(node); if (device == NULL) break; @@ -1339,7 +1339,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) case FW_NODE_DESTROYED: case FW_NODE_LINK_OFF: - if (!node->data) + if (!fw_node_get_device(node)) break; /* @@ -1354,7 +1354,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) * the device in shutdown state to have that code fail * to create the device. 
*/ - device = node->data; + device = fw_node_get_device(node); if (atomic_xchg(&device->state, FW_DEVICE_GONE) == FW_DEVICE_RUNNING) { device->workfn = fw_device_shutdown; diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 9b298af1cac0b8..083e39034c378e 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -194,8 +194,8 @@ struct fw_node { /* For serializing node topology into a list. */ struct list_head link; - /* Upper layer specific data. */ - void *data; + // The device when already associated, else NULL. + struct fw_device *device; struct fw_node *ports[] __counted_by(port_count); }; @@ -219,6 +219,16 @@ static inline void fw_node_put(struct fw_node *node) kref_put(&node->kref, release_node); } +static inline struct fw_device *fw_node_get_device(struct fw_node *node) +{ + return node->device; +} + +static inline void fw_node_set_device(struct fw_node *node, struct fw_device *device) +{ + node->device = device; +} + void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, int self_id_count, u32 *self_ids, bool bm_abdicate); void fw_destroy_nodes(struct fw_card *card); From 25feb1a96e21ae37790c109ea2c675061ba11794 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:21:02 +0900 Subject: [PATCH 1077/3206] firewire: core: use cleanup function in bm_work In the bm_work() function, the references to fw_card and fw_node are released at the very end, which is achieved by using goto statements. The kernel cleanup framework is available for exactly this case. This commit uses the framework to remove those statements. Link: https://lore.kernel.org/r/20250908012108.514698-6-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 4a4210cda571db..5bd89ddf501899 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -280,14 +280,17 @@ void fw_schedule_bm_work(struct fw_card *card, unsigned long delay) fw_card_put(card); } +DEFINE_FREE(node_unref, struct fw_node *, if (_T) fw_node_put(_T)) +DEFINE_FREE(card_unref, struct fw_card *, if (_T) fw_card_put(_T)) + static void bm_work(struct work_struct *work) { static const char gap_count_table[] = { 63, 5, 7, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 35, 37, 40 }; - struct fw_card *card = from_work(card, work, bm_work.work); + struct fw_card *card __free(card_unref) = from_work(card, work, bm_work.work); struct fw_device *root_device, *irm_device; - struct fw_node *root_node; + struct fw_node *root_node __free(node_unref) = NULL; int root_id, new_root_id, irm_id, bm_id, local_id; int gap_count, generation, grace, rcode; bool do_reset = false; @@ -297,11 +300,13 @@ static void bm_work(struct work_struct *work) bool keep_this_irm; __be32 transaction_data[2]; + lockdep_assert_held(&card->lock); + spin_lock_irq(&card->lock); if (card->local_node == NULL) { spin_unlock_irq(&card->lock); - goto out_put_card; + return; } generation = card->generation; @@ -366,9 +371,9 @@ static void bm_work(struct work_struct *work) CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, transaction_data, 8); + // Another bus reset, BM work has been rescheduled. if (rcode == RCODE_GENERATION) - /* Another bus reset, BM work has been rescheduled. */ - goto out; + return; bm_id = be32_to_cpu(transaction_data[0]); @@ -382,8 +387,7 @@ static void bm_work(struct work_struct *work) /* Somebody else is BM. Only act as IRM.
*/ if (local_id == irm_id) allocate_broadcast_channel(card, generation); - - goto out; + return; } if (rcode == RCODE_SEND_ERROR) { /* * that the problem has gone away by then. */ fw_schedule_bm_work(card, DIV_ROUND_UP(HZ, 8)); - goto out; + return; } spin_lock_irq(&card->lock); @@ -417,7 +421,7 @@ */ spin_unlock_irq(&card->lock); fw_schedule_bm_work(card, DIV_ROUND_UP(HZ, 8)); - goto out; + return; } /* @@ -455,7 +459,7 @@ * and let's try again once that's done. */ spin_unlock_irq(&card->lock); - goto out; + return; } else if (root_device_is_cmc) { /* * We will send out a force root packet for this @@ -512,7 +516,7 @@ */ reset_bus(card, card->gap_count != 0); /* Will allocate broadcast channel after the reset. */ - goto out; + return; } if (root_device_is_cmc) { @@ -525,16 +529,11 @@ CSR_REGISTER_BASE + CSR_STATE_SET, transaction_data, 4); if (rcode == RCODE_GENERATION) - goto out; + return; } if (local_id == irm_id) allocate_broadcast_channel(card, generation); - - out: - fw_node_put(root_node); - out_put_card: - fw_card_put(card); } void fw_card_initialize(struct fw_card *card, From b70a5f33381f7815f4a579f7b9de33f276c9d8f9 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:21:03 +0900 Subject: [PATCH 1078/3206] firewire: ohci: localize transaction data and rcode per condition branch The function-local variable transaction_data in the bm_work() function is used only conditionally. In such a case, a variable local to the branch is often preferable. This commit applies this idea. Link: https://lore.kernel.org/r/20250908012108.514698-7-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 5bd89ddf501899..b98797e4f1d4b1 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -292,13 +292,12 @@ static void bm_work(struct work_struct *work) struct fw_card *card __free(card_unref) = from_work(card, work, bm_work.work); struct fw_device *root_device, *irm_device; struct fw_node *root_node __free(node_unref) = NULL; int root_id, new_root_id, irm_id, bm_id, local_id; - int gap_count, generation, grace, rcode; + int gap_count, generation, grace; bool do_reset = false; bool root_device_is_running; bool root_device_is_cmc; bool irm_is_1394_1995_only; bool keep_this_irm; - __be32 transaction_data[2]; @@ -346,6 +345,11 @@ static void bm_work(struct work_struct *work) * gap count. That could well save a reset in the * next generation. */ + __be32 data[2] = { + cpu_to_be32(0x3f), + cpu_to_be32(local_id), + }; + int rcode; if (!card->irm_node->link_on) { new_root_id = local_id; @@ -361,21 +365,18 @@ static void bm_work(struct work_struct *work) goto pick_me; } - transaction_data[0] = cpu_to_be32(0x3f); - transaction_data[1] = cpu_to_be32(local_id); - spin_unlock_irq(&card->lock); rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, irm_id, generation, SCODE_100, CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, - transaction_data, 8); + data, sizeof(data)); // Another bus reset, BM work has been rescheduled.
if (rcode == RCODE_GENERATION) return; - bm_id = be32_to_cpu(transaction_data[0]); + bm_id = be32_to_cpu(data[0]); scoped_guard(spinlock_irq, &card->lock) { if (rcode == RCODE_COMPLETE && generation == card->generation) @@ -523,11 +524,11 @@ /* * Make sure that the cycle master sends cycle start packets. */ - transaction_data[0] = cpu_to_be32(CSR_STATE_BIT_CMSTR); - rcode = fw_run_transaction(card, TCODE_WRITE_QUADLET_REQUEST, + __be32 data = cpu_to_be32(CSR_STATE_BIT_CMSTR); + int rcode = fw_run_transaction(card, TCODE_WRITE_QUADLET_REQUEST, root_id, generation, SCODE_100, CSR_REGISTER_BASE + CSR_STATE_SET, - transaction_data, 4); + &data, sizeof(data)); if (rcode == RCODE_GENERATION) return; } From 7dc12e84eff7f934e2456a858ad23d3743c69578 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:21:04 +0900 Subject: [PATCH 1079/3206] firewire: core: code refactoring to evaluate transaction result to CSR_BUS_MANAGER_ID The work in bm_work() should be done after acquiring the spin lock of fw_card, but for an asynchronous transaction the lock has to be released temporarily while waiting for the event. Commit 27310d561622 ("firewire: core: use guard macro to maintain properties of fw_card") applied scoped_guard() to the bm_work() function; however, it makes the control flow hard to follow. This commit refactors the spin lock acquisition after the transaction. Link: https://lore.kernel.org/r/20250908012108.514698-8-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index b98797e4f1d4b1..e1a7a151b109d3 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -291,7 +291,7 @@ static void bm_work(struct work_struct *work) struct fw_card *card __free(card_unref) = from_work(card, work, bm_work.work); struct fw_device *root_device, *irm_device; struct fw_node *root_node __free(node_unref) = NULL; - int root_id, new_root_id, irm_id, bm_id, local_id; + int root_id, new_root_id, irm_id, local_id; int gap_count, generation, grace; bool do_reset = false; @@ -376,19 +376,22 @@ static void bm_work(struct work_struct *work) if (rcode == RCODE_GENERATION) return; - bm_id = be32_to_cpu(data[0]); + spin_lock_irq(&card->lock); - scoped_guard(spinlock_irq, &card->lock) { - if (rcode == RCODE_COMPLETE && generation == card->generation) - card->bm_node_id = - bm_id == 0x3f ? local_id : 0xffc0 | bm_id; - } + if (rcode == RCODE_COMPLETE) { + int bm_id = be32_to_cpu(data[0]); - if (rcode == RCODE_COMPLETE && bm_id != 0x3f) { - /* Somebody else is BM. Only act as IRM. */ - if (local_id == irm_id) - allocate_broadcast_channel(card, generation); - return; + if (generation == card->generation) + card->bm_node_id = bm_id == 0x3f ? local_id : 0xffc0 | bm_id; + + if (bm_id != 0x3f) { + spin_unlock_irq(&card->lock); + + // Somebody else is BM. Only act as IRM. + if (local_id == irm_id) + allocate_broadcast_channel(card, generation); + return; + } } if (rcode == RCODE_SEND_ERROR) { /* * some local problem. Let's try again later and hope * that the problem has gone away by then.
*/ + spin_unlock_irq(&card->lock); fw_schedule_bm_work(card, DIV_ROUND_UP(HZ, 8)); return; } - spin_lock_irq(&card->lock); - if (rcode != RCODE_COMPLETE && !keep_this_irm) { /* * The lock request failed, maybe the IRM From 8c2d2fcd6b7934c7d694148b542760144a2a1b5a Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:21:05 +0900 Subject: [PATCH 1080/3206] firewire: core: refer fw_card member to initiate bus reset under acquiring lock The gap_count member of the fw_card structure is referenced when initiating a bus reset. This reference is done without holding the lock, which is not good. This commit takes the reference while holding the lock, with additional code refactoring. Link: https://lore.kernel.org/r/20250908012108.514698-9-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 52 ++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index e1a7a151b109d3..630e229c9cc24e 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -292,7 +292,7 @@ static void bm_work(struct work_struct *work) struct fw_device *root_device, *irm_device; struct fw_node *root_node __free(node_unref) = NULL; int root_id, new_root_id, irm_id, local_id; - int gap_count, generation, grace; + int expected_gap_count, generation, grace; bool do_reset = false; bool root_device_is_running; bool root_device_is_cmc; @@ -485,9 +485,9 @@ */ if (!card->beta_repeaters_present && root_node->max_hops < ARRAY_SIZE(gap_count_table)) - gap_count = gap_count_table[root_node->max_hops]; + expected_gap_count = gap_count_table[root_node->max_hops]; else - gap_count = 63; + expected_gap_count = 63; /* * Finally, figure out if we should do a reset or not. If we have * done less than 5 resets with the same physical topology and we * have either a new root or a new gap count setting, let's do it. */ - if (card->bm_retries++ < 5 && - (card->gap_count != gap_count || new_root_id != root_id)) + if (card->bm_retries++ < 5 && (card->gap_count != expected_gap_count || new_root_id != root_id)) do_reset = true; - spin_unlock_irq(&card->lock); - if (do_reset) { + int card_gap_count = card->gap_count; + + spin_unlock_irq(&card->lock); + fw_notice(card, "phy config: new root=%x, gap_count=%d\n", - new_root_id, gap_count); - fw_send_phy_config(card, new_root_id, generation, gap_count); + new_root_id, expected_gap_count); + fw_send_phy_config(card, new_root_id, generation, expected_gap_count); /* * Where possible, use a short bus reset to minimize * disruption to isochronous transfers. But in the event * of a catastrophic cycle master failure, use a long * bus reset. * * In the event of a gap count inconsistency, the PHY * may treat it as two, causing a gap count inconsistency * again. Using a long bus reset prevents this. */ - reset_bus(card, card->gap_count != 0); + reset_bus(card, card_gap_count != 0); /* Will allocate broadcast channel after the reset. */ - return; - } + } else { + spin_unlock_irq(&card->lock); - if (root_device_is_cmc) { - /* - * Make sure that the cycle master sends cycle start packets. - */ - __be32 data = cpu_to_be32(CSR_STATE_BIT_CMSTR); - int rcode = fw_run_transaction(card, TCODE_WRITE_QUADLET_REQUEST, - root_id, generation, SCODE_100, - CSR_REGISTER_BASE + CSR_STATE_SET, - &data, sizeof(data)); - if (rcode == RCODE_GENERATION) - return; - } + if (root_device_is_cmc) { + // Make sure that the cycle master sends cycle start packets.
+ __be32 data = cpu_to_be32(CSR_STATE_BIT_CMSTR); + int rcode = fw_run_transaction(card, TCODE_WRITE_QUADLET_REQUEST, + root_id, generation, SCODE_100, + CSR_REGISTER_BASE + CSR_STATE_SET, + &data, sizeof(data)); + if (rcode == RCODE_GENERATION) + return; + } - if (local_id == irm_id) - allocate_broadcast_channel(card, generation); + if (local_id == irm_id) + allocate_broadcast_channel(card, generation); + } } void fw_card_initialize(struct fw_card *card, From ca17601b15d12bc8c435a4068fa2f907501d9305 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:21:06 +0900 Subject: [PATCH 1081/3206] firewire: core: code refactoring to detect both IEEE 1394:1995 IRM and Canon MV5i The detection of an IEEE 1394:1995-only IRM and of the Canon MV5i is only required within some of the condition branches. In this case, these checks can be encapsulated within those branches. This commit refactors the checks. Link: https://lore.kernel.org/r/20250908012108.514698-10-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 630e229c9cc24e..99aa98f195baf8 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -289,15 +289,13 @@ static void bm_work(struct work_struct *work) 63, 5, 7, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 35, 37, 40 }; struct fw_card *card __free(card_unref) = from_work(card, work, bm_work.work); - struct fw_device *root_device, *irm_device; + struct fw_device *root_device; struct fw_node *root_node __free(node_unref) = NULL; int root_id, new_root_id, irm_id, local_id; int expected_gap_count, generation, grace; bool do_reset = false; bool root_device_is_running; bool root_device_is_cmc; - bool irm_is_1394_1995_only; - bool keep_this_irm; lockdep_assert_held(&card->lock); @@ -316,14 +314,6 @@ static void bm_work(struct work_struct *work) atomic_read(&root_device->state) == FW_DEVICE_RUNNING; root_device_is_cmc = root_device && root_device->cmc; - irm_device = fw_node_get_device(card->irm_node); - irm_is_1394_1995_only = irm_device && irm_device->config_rom && - (irm_device->config_rom[2] & 0x000000f0) == 0; - - /* Canon MV5i works unreliably if it is not root node. */ - keep_this_irm = irm_device && irm_device->config_rom && - irm_device->config_rom[3] >> 8 == CANON_OUI; - root_id = root_node->node_id; irm_id = card->irm_node->node_id; local_id = card->local_node->node_id; @@ -349,6 +339,9 @@ static void bm_work(struct work_struct *work) cpu_to_be32(0x3f), cpu_to_be32(local_id), }; + struct fw_device *irm_device = fw_node_get_device(card->irm_node); + bool irm_is_1394_1995_only = false; + bool keep_this_irm = false; int rcode; if (!card->irm_node->link_on) { new_root_id = local_id; @@ -358,6 +351,13 @@ goto pick_me; } + if (irm_device && irm_device->config_rom) { + irm_is_1394_1995_only = (irm_device->config_rom[2] & 0x000000f0) == 0; + + // Canon MV5i works unreliably if it is not root node.
+ keep_this_irm = irm_device->config_rom[3] >> 8 == CANON_OUI; + } + if (irm_is_1394_1995_only && !keep_this_irm) { new_root_id = local_id; fw_notice(card, "%s, making local node (%02x) root\n", From cae2d92cdcae3f2f4510feb631661e86a26da55e Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:21:07 +0900 Subject: [PATCH 1082/3206] firewire: core: code refactoring to investigate root node for bus manager In the middle of the bm_work() function, both the value of gap_count and the state of the root node are investigated. The current implementation is not in good shape, since the investigation is laid out flat. This commit refactors the investigation into two large branches. Link: https://lore.kernel.org/r/20250908012108.514698-11-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 56 +++++++++++++++++------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 99aa98f195baf8..b430a70a7eeb05 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -294,7 +294,6 @@ static void bm_work(struct work_struct *work) int root_id, new_root_id, irm_id, local_id; int expected_gap_count, generation, grace; bool do_reset = false; - bool root_device_is_running; bool root_device_is_cmc; lockdep_assert_held(&card->lock); @@ -310,8 +309,6 @@ static void bm_work(struct work_struct *work) root_node = fw_node_get(card->root_node); root_device = fw_node_get_device(root_node); - root_device_is_running = root_device && - atomic_read(&root_device->state) == FW_DEVICE_RUNNING; root_device_is_cmc = root_device && root_device->cmc; root_id = root_node->node_id; @@ -450,34 +447,35 @@ * is inconsistent, so bypass the 5-reset limit. */ card->bm_retries = 0; - } else if (root_device == NULL) { - /* - * Either link_on is false, or we failed to read the - * config rom. In either case, pick another root. - */ - new_root_id = local_id; - } else if (!root_device_is_running) { - /* - * If we haven't probed this device yet, bail out now - * and let's try again once that's done. - */ - spin_unlock_irq(&card->lock); - return; - } else if (root_device_is_cmc) { - /* - * We will send out a force root packet for this - * node as part of the gap count optimization. - */ - new_root_id = root_id; } else { - /* - * Current root has an active link layer and we - * successfully read the config rom, but it's not - * cycle master capable. - */ - new_root_id = local_id; - } + // Now investigate root node. + struct fw_device *root_device = fw_node_get_device(root_node); + + if (root_device == NULL) { + // Either link_on is false, or we failed to read the + // config rom. In either case, pick another root. + new_root_id = local_id; + } else { + bool root_device_is_running = + atomic_read(&root_device->state) == FW_DEVICE_RUNNING; + if (!root_device_is_running) { + // If we haven't probed this device yet, bail out now + // and let's try again once that's done. + spin_unlock_irq(&card->lock); + return; + } else if (root_device->cmc) { + // We will send out a force root packet for this + // node as part of the gap count optimization. + new_root_id = root_id; + } else { + // Current root has an active link layer and we + // successfully read the config rom, but it's not + // cycle master capable. + new_root_id = local_id; + } + } + } pick_me: /* * Pick a gap count from 1394a table E-1.
The table doesn't cover From a4bac55d99d37976209e2fc2c32bd3dfc86b0447 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 8 Sep 2025 10:21:08 +0900 Subject: [PATCH 1083/3206] firewire: core: code refactoring whether root node is cycle master capable The check of the root node's cycle master capability is currently needed in just one condition branch. In such a case, the required variable should live within that branch. This commit does just that. Link: https://lore.kernel.org/r/20250908012108.514698-12-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index b430a70a7eeb05..474d8066e090ba 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -289,12 +289,10 @@ static void bm_work(struct work_struct *work) 63, 5, 7, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 35, 37, 40 }; struct fw_card *card __free(card_unref) = from_work(card, work, bm_work.work); - struct fw_device *root_device; struct fw_node *root_node __free(node_unref) = NULL; int root_id, new_root_id, irm_id, local_id; int expected_gap_count, generation, grace; bool do_reset = false; - lockdep_assert_held(&card->lock); @@ -308,8 +306,6 @@ static void bm_work(struct work_struct *work) generation = card->generation; root_node = fw_node_get(card->root_node); - root_device = fw_node_get_device(root_node); - root_device_is_cmc = root_device && root_device->cmc; root_id = root_node->node_id; irm_id = card->irm_node->node_id; @@ -519,9 +515,11 @@ static void bm_work(struct work_struct *work) reset_bus(card, card_gap_count != 0); /* Will allocate broadcast channel after the reset. */ } else { + struct fw_device *root_device = fw_node_get_device(root_node); + spin_unlock_irq(&card->lock); - if (root_device_is_cmc) { + if (root_device && root_device->cmc) { // Make sure that the cycle master sends cycle start packets. __be32 data = cpu_to_be32(CSR_STATE_BIT_CMSTR); int rcode = fw_run_transaction(card, TCODE_WRITE_QUADLET_REQUEST, root_id, generation, SCODE_100, CSR_REGISTER_BASE + CSR_STATE_SET, &data, sizeof(data)); From b23c22a8d715e6f681381592db377aaabc1a2178 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:05 +0200 Subject: [PATCH 1084/3206] gpio: ixp4xx: allow building the module with COMPILE_TEST enabled Increase build coverage by allowing the module to be built with COMPILE_TEST=y. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-1-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 6fd904e29c3e74..0fd5b09c499ac8 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -408,8 +408,7 @@ config GPIO_IMX_SCU config GPIO_IXP4XX bool "Intel IXP4xx GPIO" - depends on ARCH_IXP4XX - depends on OF + depends on (ARCH_IXP4XX && OF) || COMPILE_TEST select GPIO_GENERIC select GPIOLIB_IRQCHIP select IRQ_DOMAIN_HIERARCHY From bd9bfafae0239f9d7187116dace6e0d80d27b678 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:06 +0200 Subject: [PATCH 1085/3206] gpio: ixp4xx: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h.
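Besides switching the state container from struct gpio_chip to struct gpio_generic_chip, the conversion below replaces open-coded bgpio_lock handling with the cleanup.h guard forms. A condensed before/after sketch of that locking change (register accesses elided, details as in the diff that follows):

/* Before: manual lock/unlock bracketing with a flags variable. */
unsigned long flags;

raw_spin_lock_irqsave(&g->gc.bgpio_lock, flags);
/* ... read-modify-write the GPIO registers ... */
raw_spin_unlock_irqrestore(&g->gc.bgpio_lock, flags);

/* After: the scoped guard drops the lock automatically on every exit
 * path of the block, so the flags variable and unlock calls go away.
 */
scoped_guard(gpio_generic_lock_irqsave, &g->chip) {
        /* ... read-modify-write the GPIO registers ... */
}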
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-2-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ixp4xx.c | 70 ++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/drivers/gpio/gpio-ixp4xx.c b/drivers/gpio/gpio-ixp4xx.c index 28a8a6a8f05fee..0cf10d0ba16ef7 100644 --- a/drivers/gpio/gpio-ixp4xx.c +++ b/drivers/gpio/gpio-ixp4xx.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -53,14 +54,14 @@ /** * struct ixp4xx_gpio - IXP4 GPIO state container + * @chip: generic GPIO chip for this instance * @dev: containing device for this instance - * @gc: gpiochip for this instance * @base: remapped I/O-memory base * @irq_edge: Each bit represents an IRQ: 1: edge-triggered, * 0: level triggered */ struct ixp4xx_gpio { - struct gpio_chip gc; + struct gpio_generic_chip chip; struct device *dev; void __iomem *base; unsigned long long irq_edge; @@ -100,7 +101,6 @@ static int ixp4xx_gpio_irq_set_type(struct irq_data *d, unsigned int type) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct ixp4xx_gpio *g = gpiochip_get_data(gc); int line = d->hwirq; - unsigned long flags; u32 int_style; u32 int_reg; u32 val; @@ -144,26 +144,24 @@ static int ixp4xx_gpio_irq_set_type(struct irq_data *d, unsigned int type) int_reg = IXP4XX_REG_GPIT1; } - raw_spin_lock_irqsave(&g->gc.bgpio_lock, flags); - - /* Clear the style for the appropriate pin */ - val = __raw_readl(g->base + int_reg); - val &= ~(IXP4XX_GPIO_STYLE_MASK << (line * IXP4XX_GPIO_STYLE_SIZE)); - __raw_writel(val, g->base + int_reg); - - __raw_writel(BIT(line), g->base + IXP4XX_REG_GPIS); + scoped_guard(gpio_generic_lock_irqsave, &g->chip) { + /* Clear the style for the appropriate pin */ + val = __raw_readl(g->base + int_reg); + val &= ~(IXP4XX_GPIO_STYLE_MASK << (line * IXP4XX_GPIO_STYLE_SIZE)); + __raw_writel(val, g->base + int_reg); - /* Set the new style */ - val = __raw_readl(g->base + int_reg); - val |= (int_style << (line * IXP4XX_GPIO_STYLE_SIZE)); - __raw_writel(val, g->base + int_reg); + __raw_writel(BIT(line), g->base + IXP4XX_REG_GPIS); - /* Force-configure this line as an input */ - val = __raw_readl(g->base + IXP4XX_REG_GPOE); - val |= BIT(d->hwirq); - __raw_writel(val, g->base + IXP4XX_REG_GPOE); + /* Set the new style */ + val = __raw_readl(g->base + int_reg); + val |= (int_style << (line * IXP4XX_GPIO_STYLE_SIZE)); + __raw_writel(val, g->base + int_reg); - raw_spin_unlock_irqrestore(&g->gc.bgpio_lock, flags); + /* Force-configure this line as an input */ + val = __raw_readl(g->base + IXP4XX_REG_GPOE); + val |= BIT(d->hwirq); + __raw_writel(val, g->base + IXP4XX_REG_GPOE); + } /* This parent only accept level high (asserted) */ return irq_chip_set_type_parent(d, IRQ_TYPE_LEVEL_HIGH); @@ -206,6 +204,7 @@ static int ixp4xx_gpio_child_to_parent_hwirq(struct gpio_chip *gc, static int ixp4xx_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; unsigned long flags; struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; @@ -295,30 +294,33 @@ static int ixp4xx_gpio_probe(struct platform_device *pdev) flags = 0; #endif + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = g->base + IXP4XX_REG_GPIN, + .set = g->base + IXP4XX_REG_GPOUT, + .dirin = g->base + IXP4XX_REG_GPOE, + .flags = flags, + }; + /* Populate and register gpio chip */ - ret = bgpio_init(&g->gc, dev, 4, - g->base + IXP4XX_REG_GPIN, - g->base + IXP4XX_REG_GPOUT, - NULL, 
- NULL, - g->base + IXP4XX_REG_GPOE, - flags); + ret = gpio_generic_chip_init(&g->chip, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; } - g->gc.ngpio = 16; - g->gc.label = "IXP4XX_GPIO_CHIP"; + g->chip.gc.ngpio = 16; + g->chip.gc.label = "IXP4XX_GPIO_CHIP"; /* * TODO: when we have migrated to device tree and all GPIOs * are fetched using phandles, set this to -1 to get rid of * the fixed gpiochip base. */ - g->gc.base = 0; - g->gc.parent = &pdev->dev; - g->gc.owner = THIS_MODULE; + g->chip.gc.base = 0; + g->chip.gc.parent = &pdev->dev; + g->chip.gc.owner = THIS_MODULE; - girq = &g->gc.irq; + girq = &g->chip.gc.irq; gpio_irq_chip_set_chip(girq, &ixp4xx_gpio_irqchip); girq->fwnode = dev_fwnode(dev); girq->parent_domain = parent; @@ -326,7 +328,7 @@ static int ixp4xx_gpio_probe(struct platform_device *pdev) girq->handler = handle_bad_irq; girq->default_type = IRQ_TYPE_NONE; - ret = devm_gpiochip_add_data(dev, &g->gc, g); + ret = devm_gpiochip_add_data(dev, &g->chip.gc, g); if (ret) { dev_err(dev, "failed to add SoC gpiochip\n"); return ret; From f21c10649acc15802e942ca6ae78cb76657a0639 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:07 +0200 Subject: [PATCH 1086/3206] gpio: idt3243x: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-3-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-idt3243x.c | 45 ++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/drivers/gpio/gpio-idt3243x.c b/drivers/gpio/gpio-idt3243x.c index 535f255144556e..232a621ba086ef 100644 --- a/drivers/gpio/gpio-idt3243x.c +++ b/drivers/gpio/gpio-idt3243x.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -18,7 +19,7 @@ #define IDT_GPIO_ISTAT 0x0C struct idt_gpio_ctrl { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *pic; void __iomem *gpio; u32 mask_cache; @@ -50,14 +51,13 @@ static int idt_gpio_irq_set_type(struct irq_data *d, unsigned int flow_type) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct idt_gpio_ctrl *ctrl = gpiochip_get_data(gc); unsigned int sense = flow_type & IRQ_TYPE_SENSE_MASK; - unsigned long flags; u32 ilevel; /* hardware only supports level triggered */ if (sense == IRQ_TYPE_NONE || (sense & IRQ_TYPE_EDGE_BOTH)) return -EINVAL; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&ctrl->chip); ilevel = readl(ctrl->gpio + IDT_GPIO_ILEVEL); if (sense & IRQ_TYPE_LEVEL_HIGH) @@ -68,7 +68,6 @@ static int idt_gpio_irq_set_type(struct irq_data *d, unsigned int flow_type) writel(ilevel, ctrl->gpio + IDT_GPIO_ILEVEL); irq_set_handler_locked(d, handle_level_irq); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); return 0; } @@ -84,14 +83,11 @@ static void idt_gpio_mask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct idt_gpio_ctrl *ctrl = gpiochip_get_data(gc); - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - - ctrl->mask_cache |= BIT(d->hwirq); - writel(ctrl->mask_cache, ctrl->pic + IDT_PIC_IRQ_MASK); - - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &ctrl->chip) { + ctrl->mask_cache |= BIT(d->hwirq); + writel(ctrl->mask_cache, ctrl->pic + IDT_PIC_IRQ_MASK); + } gpiochip_disable_irq(gc, irqd_to_hwirq(d)); } @@ 
-100,15 +96,13 @@ static void idt_gpio_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct idt_gpio_ctrl *ctrl = gpiochip_get_data(gc); - unsigned long flags; gpiochip_enable_irq(gc, irqd_to_hwirq(d)); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + + guard(gpio_generic_lock_irqsave)(&ctrl->chip); ctrl->mask_cache &= ~BIT(d->hwirq); writel(ctrl->mask_cache, ctrl->pic + IDT_PIC_IRQ_MASK); - - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static int idt_gpio_irq_init_hw(struct gpio_chip *gc) @@ -134,6 +128,7 @@ static const struct irq_chip idt_gpio_irqchip = { static int idt_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct gpio_irq_chip *girq; struct idt_gpio_ctrl *ctrl; @@ -150,18 +145,24 @@ static int idt_gpio_probe(struct platform_device *pdev) if (IS_ERR(ctrl->gpio)) return PTR_ERR(ctrl->gpio); - ctrl->gc.parent = dev; + ctrl->chip.gc.parent = dev; + + config = (typeof(config)){ + .dev = &pdev->dev, + .sz = 4, + .dat = ctrl->gpio + IDT_GPIO_DATA, + .dirout = ctrl->gpio + IDT_GPIO_DIR, + }; - ret = bgpio_init(&ctrl->gc, &pdev->dev, 4, ctrl->gpio + IDT_GPIO_DATA, - NULL, NULL, ctrl->gpio + IDT_GPIO_DIR, NULL, 0); + ret = gpio_generic_chip_init(&ctrl->chip, &config); if (ret) { - dev_err(dev, "bgpio_init failed\n"); + dev_err(dev, "failed to initialize the generic GPIO chip\n"); return ret; } ret = device_property_read_u32(dev, "ngpios", &ngpios); if (!ret) - ctrl->gc.ngpio = ngpios; + ctrl->chip.gc.ngpio = ngpios; if (device_property_read_bool(dev, "interrupt-controller")) { ctrl->pic = devm_platform_ioremap_resource_byname(pdev, "pic"); @@ -172,7 +173,7 @@ static int idt_gpio_probe(struct platform_device *pdev) if (parent_irq < 0) return parent_irq; - girq = &ctrl->gc.irq; + girq = &ctrl->chip.gc.irq; gpio_irq_chip_set_chip(girq, &idt_gpio_irqchip); girq->init_hw = idt_gpio_irq_init_hw; girq->parent_handler = idt_gpio_dispatch; @@ -188,7 +189,7 @@ static int idt_gpio_probe(struct platform_device *pdev) girq->handler = handle_bad_irq; } - return devm_gpiochip_add_data(&pdev->dev, &ctrl->gc, ctrl); + return devm_gpiochip_add_data(&pdev->dev, &ctrl->chip.gc, ctrl); } static const struct of_device_id idt_gpio_of_match[] = { From 59b82bedbfe7452996a7163a9a19ff1239fb86c1 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:08 +0200 Subject: [PATCH 1087/3206] gpio: blzp1600: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
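The conversion follows the same pattern across this series: instead of passing every register to bgpio_init() (with NULLs for the ones a controller lacks), the driver fills a struct gpio_generic_chip_config and hands it to gpio_generic_chip_init(). A minimal sketch of the idiom; struct foo and the FOO_* register offsets are placeholders, not taken from this driver:

  struct foo {
          void __iomem *base;
          struct gpio_generic_chip chip;  /* embeds struct gpio_chip gc */
  };

  static int foo_gpio_init(struct foo *priv, struct platform_device *pdev)
  {
          struct gpio_generic_chip_config config = {
                  .dev = &pdev->dev,
                  .sz = 4,                        /* register width in bytes */
                  .dat = priv->base + FOO_DATA,   /* pin state */
                  .set = priv->base + FOO_SET,    /* optional: set bits */
                  .clr = priv->base + FOO_CLR,    /* optional: clear bits */
                  .dirout = priv->base + FOO_DIR, /* 1 = output */
          };

          return gpio_generic_chip_init(&priv->chip, &config);
  }

Registers a controller does not have are simply omitted from the config, and the embedded chip is registered afterwards through devm_gpiochip_add_data(dev, &priv->chip.gc, priv).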
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-4-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-blzp1600.c | 39 ++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/drivers/gpio/gpio-blzp1600.c b/drivers/gpio/gpio-blzp1600.c index 055cb296ae5475..bfb35d59fa561c 100644 --- a/drivers/gpio/gpio-blzp1600.c +++ b/drivers/gpio/gpio-blzp1600.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -36,7 +37,7 @@ struct blzp1600_gpio { void __iomem *base; - struct gpio_chip gc; + struct gpio_generic_chip gen_gc; int irq; }; @@ -76,7 +77,7 @@ static void blzp1600_gpio_irq_mask(struct irq_data *d) { struct blzp1600_gpio *chip = get_blzp1600_gpio_from_irq_data(d); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); blzp1600_gpio_rmw(chip->base + GPIO_IM_REG, BIT(d->hwirq), 1); } @@ -84,7 +85,7 @@ static void blzp1600_gpio_irq_unmask(struct irq_data *d) { struct blzp1600_gpio *chip = get_blzp1600_gpio_from_irq_data(d); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); blzp1600_gpio_rmw(chip->base + GPIO_IM_REG, BIT(d->hwirq), 0); } @@ -99,9 +100,9 @@ static void blzp1600_gpio_irq_enable(struct irq_data *d) { struct blzp1600_gpio *chip = get_blzp1600_gpio_from_irq_data(d); - gpiochip_enable_irq(&chip->gc, irqd_to_hwirq(d)); + gpiochip_enable_irq(&chip->gen_gc.gc, irqd_to_hwirq(d)); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); blzp1600_gpio_rmw(chip->base + GPIO_DIR_REG, BIT(d->hwirq), 0); blzp1600_gpio_rmw(chip->base + GPIO_IEN_REG, BIT(d->hwirq), 1); } @@ -110,9 +111,9 @@ static void blzp1600_gpio_irq_disable(struct irq_data *d) { struct blzp1600_gpio *chip = get_blzp1600_gpio_from_irq_data(d); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); blzp1600_gpio_rmw(chip->base + GPIO_IEN_REG, BIT(d->hwirq), 0); - gpiochip_disable_irq(&chip->gc, irqd_to_hwirq(d)); + gpiochip_disable_irq(&chip->gen_gc.gc, irqd_to_hwirq(d)); } static int blzp1600_gpio_irq_set_type(struct irq_data *d, u32 type) @@ -121,7 +122,7 @@ static int blzp1600_gpio_irq_set_type(struct irq_data *d, u32 type) u32 edge_level, single_both, fall_rise; int mask = BIT(d->hwirq); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); edge_level = blzp1600_gpio_read(chip, GPIO_IS_REG); single_both = blzp1600_gpio_read(chip, GPIO_IBE_REG); fall_rise = blzp1600_gpio_read(chip, GPIO_IEV_REG); @@ -186,8 +187,8 @@ static void blzp1600_gpio_irqhandler(struct irq_desc *desc) chained_irq_enter(irqchip, desc); irq_status = blzp1600_gpio_read(gpio, GPIO_RIS_REG); - for_each_set_bit(hwirq, &irq_status, gpio->gc.ngpio) - generic_handle_domain_irq(gpio->gc.irq.domain, hwirq); + for_each_set_bit(hwirq, &irq_status, gpio->gen_gc.gc.ngpio) + generic_handle_domain_irq(gpio->gen_gc.gc.irq.domain, hwirq); chained_irq_exit(irqchip, desc); } @@ -197,7 +198,7 @@ static int blzp1600_gpio_set_debounce(struct gpio_chip *gc, unsigned int offset, { struct blzp1600_gpio *chip = gpiochip_get_data(gc); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); blzp1600_gpio_rmw(chip->base + GPIO_DB_REG, BIT(offset), debounce); return 0; @@ -216,6 +217,7 @@ static int blzp1600_gpio_set_config(struct gpio_chip *gc, 
unsigned int offset, u static int blzp1600_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct blzp1600_gpio *chip; struct gpio_chip *gc; int ret; @@ -228,14 +230,21 @@ static int blzp1600_gpio_probe(struct platform_device *pdev) if (IS_ERR(chip->base)) return PTR_ERR(chip->base); - ret = bgpio_init(&chip->gc, &pdev->dev, 4, chip->base + GPIO_IDATA_REG, - chip->base + GPIO_SET_REG, chip->base + GPIO_CLR_REG, - chip->base + GPIO_DIR_REG, NULL, 0); + config = (typeof(config)){ + .dev = &pdev->dev, + .sz = 4, + .dat = chip->base + GPIO_IDATA_REG, + .set = chip->base + GPIO_SET_REG, + .clr = chip->base + GPIO_CLR_REG, + .dirout = chip->base + GPIO_DIR_REG, + }; + + ret = gpio_generic_chip_init(&chip->gen_gc, &config); if (ret) return dev_err_probe(&pdev->dev, ret, "Failed to register generic gpio\n"); /* configure the gpio chip */ - gc = &chip->gc; + gc = &chip->gen_gc.gc; gc->set_config = blzp1600_gpio_set_config; if (device_property_present(&pdev->dev, "interrupt-controller")) { From 76e61b03d1228914928eb57e55dcb2c62b6caa0b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:09 +0200 Subject: [PATCH 1088/3206] gpio: tb10x: order includes alphabetically For better readability and easier maintenance, order the includes alphabetically. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-5-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-tb10x.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpio/gpio-tb10x.c b/drivers/gpio/gpio-tb10x.c index 1869ee7f9423ef..356d0a82e25f29 100644 --- a/drivers/gpio/gpio-tb10x.c +++ b/drivers/gpio/gpio-tb10x.c @@ -7,20 +7,20 @@ * Christian Ruppert */ -#include -#include -#include +#include #include -#include -#include -#include #include #include +#include +#include +#include +#include #include #include -#include -#include #include +#include +#include +#include #define TB10X_GPIO_DIR_IN (0x00000000) #define TB10X_GPIO_DIR_OUT (0x00000001) From 682fbb18e14cbb47dd30d2c65053be9d8f39a23c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:10 +0200 Subject: [PATCH 1089/3206] gpio: tb10x: allow building the module with COMPILE_TEST=y Increase build coverage by allowing the module to be built with COMPILE_TEST=y. We need an actual prompt entry in this case so add it. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-6-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 0fd5b09c499ac8..2fb77eff3b1f2e 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -734,7 +734,8 @@ config GPIO_TANGIER If built as a module its name will be gpio-tangier. config GPIO_TB10X - bool + bool "Abilis Systems TB10x GPIO controller" + depends on ARC_PLAT_TB10X || COMPILE_TEST select GPIO_GENERIC select GENERIC_IRQ_CHIP select OF_GPIO From 8bbe11bb2fa378016a7e764c6207f6f5360cc979 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:11 +0200 Subject: [PATCH 1090/3206] gpio: tb10x: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
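The conversion also replaces the open-coded raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore() pairs on gc->bgpio_lock with a scope-based guard on the generic chip, so the lock is dropped automatically on every exit path. Roughly, with foo_* as placeholder names, a read-modify-write helper ends up shaped like this:

  static void foo_set_bits(struct foo *priv, unsigned int offs,
                           u32 mask, u32 val)
  {
          u32 r;

          guard(gpio_generic_lock_irqsave)(&priv->chip);

          r = readl(priv->base + offs);
          r = (r & ~mask) | (val & mask);
          writel(r, priv->base + offs);
  }       /* the guard releases the lock here */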
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-7-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-tb10x.c | 60 +++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/gpio/gpio-tb10x.c b/drivers/gpio/gpio-tb10x.c index 356d0a82e25f29..f20b6654b86555 100644 --- a/drivers/gpio/gpio-tb10x.c +++ b/drivers/gpio/gpio-tb10x.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -20,7 +21,6 @@ #include #include #include -#include #define TB10X_GPIO_DIR_IN (0x00000000) #define TB10X_GPIO_DIR_OUT (0x00000001) @@ -36,13 +36,13 @@ * @base: register base address * @domain: IRQ domain of GPIO generated interrupts managed by this controller * @irq: Interrupt line of parent interrupt controller - * @gc: gpio_chip structure associated to this GPIO controller + * @chip: Generic GPIO chip structure associated with this GPIO controller */ struct tb10x_gpio { void __iomem *base; struct irq_domain *domain; int irq; - struct gpio_chip gc; + struct gpio_generic_chip chip; }; static inline u32 tb10x_reg_read(struct tb10x_gpio *gpio, unsigned int offs) @@ -60,16 +60,13 @@ static inline void tb10x_set_bits(struct tb10x_gpio *gpio, unsigned int offs, u32 mask, u32 val) { u32 r; - unsigned long flags; - raw_spin_lock_irqsave(&gpio->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&gpio->chip); r = tb10x_reg_read(gpio, offs); r = (r & ~mask) | (val & mask); tb10x_reg_write(gpio, offs, r); - - raw_spin_unlock_irqrestore(&gpio->gc.bgpio_lock, flags); } static int tb10x_gpio_to_irq(struct gpio_chip *chip, unsigned offset) @@ -107,6 +104,7 @@ static irqreturn_t tb10x_gpio_irq_cascade(int irq, void *data) static int tb10x_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct tb10x_gpio *tb10x_gpio; struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; @@ -127,9 +125,9 @@ static int tb10x_gpio_probe(struct platform_device *pdev) if (IS_ERR(tb10x_gpio->base)) return PTR_ERR(tb10x_gpio->base); - tb10x_gpio->gc.label = + tb10x_gpio->chip.gc.label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", pdev->dev.of_node); - if (!tb10x_gpio->gc.label) + if (!tb10x_gpio->chip.gc.label) return -ENOMEM; /* @@ -137,29 +135,30 @@ static int tb10x_gpio_probe(struct platform_device *pdev) * the lines, no special set or clear registers and a data direction register * wher 1 means "output". */ - ret = bgpio_init(&tb10x_gpio->gc, dev, 4, - tb10x_gpio->base + OFFSET_TO_REG_DATA, - NULL, - NULL, - tb10x_gpio->base + OFFSET_TO_REG_DDR, - NULL, - 0); + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = tb10x_gpio->base + OFFSET_TO_REG_DATA, + .dirout = tb10x_gpio->base + OFFSET_TO_REG_DDR, + }; + + ret = gpio_generic_chip_init(&tb10x_gpio->chip, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; } - tb10x_gpio->gc.base = -1; - tb10x_gpio->gc.parent = dev; - tb10x_gpio->gc.owner = THIS_MODULE; + tb10x_gpio->chip.gc.base = -1; + tb10x_gpio->chip.gc.parent = dev; + tb10x_gpio->chip.gc.owner = THIS_MODULE; /* - * ngpio is set by bgpio_init() but we override it, this .request() - * callback also overrides the one set up by generic GPIO. + * ngpio is set by gpio_generic_chip_init() but we override it, this + * .request() callback also overrides the one set up by generic GPIO. 
*/ - tb10x_gpio->gc.ngpio = ngpio; - tb10x_gpio->gc.request = gpiochip_generic_request; - tb10x_gpio->gc.free = gpiochip_generic_free; + tb10x_gpio->chip.gc.ngpio = ngpio; + tb10x_gpio->chip.gc.request = gpiochip_generic_request; + tb10x_gpio->chip.gc.free = gpiochip_generic_free; - ret = devm_gpiochip_add_data(dev, &tb10x_gpio->gc, tb10x_gpio); + ret = devm_gpiochip_add_data(dev, &tb10x_gpio->chip.gc, tb10x_gpio); if (ret < 0) { dev_err(dev, "Could not add gpiochip.\n"); return ret; @@ -174,7 +173,7 @@ static int tb10x_gpio_probe(struct platform_device *pdev) if (ret < 0) return ret; - tb10x_gpio->gc.to_irq = tb10x_gpio_to_irq; + tb10x_gpio->chip.gc.to_irq = tb10x_gpio_to_irq; tb10x_gpio->irq = ret; ret = devm_request_irq(dev, ret, tb10x_gpio_irq_cascade, @@ -183,14 +182,15 @@ static int tb10x_gpio_probe(struct platform_device *pdev) if (ret != 0) return ret; - tb10x_gpio->domain = irq_domain_create_linear(dev_fwnode(dev), tb10x_gpio->gc.ngpio, + tb10x_gpio->domain = irq_domain_create_linear(dev_fwnode(dev), + tb10x_gpio->chip.gc.ngpio, &irq_generic_chip_ops, NULL); if (!tb10x_gpio->domain) { return -ENOMEM; } ret = irq_alloc_domain_generic_chips(tb10x_gpio->domain, - tb10x_gpio->gc.ngpio, 1, tb10x_gpio->gc.label, + tb10x_gpio->chip.gc.ngpio, 1, tb10x_gpio->chip.gc.label, handle_edge_irq, IRQ_NOREQUEST, IRQ_NOPROBE, IRQ_GC_INIT_MASK_CACHE); if (ret) @@ -218,9 +218,9 @@ static void tb10x_gpio_remove(struct platform_device *pdev) { struct tb10x_gpio *tb10x_gpio = platform_get_drvdata(pdev); - if (tb10x_gpio->gc.to_irq) { + if (tb10x_gpio->chip.gc.to_irq) { irq_remove_generic_chip(tb10x_gpio->domain->gc->gc[0], - BIT(tb10x_gpio->gc.ngpio) - 1, 0, 0); + BIT(tb10x_gpio->chip.gc.ngpio) - 1, 0, 0); kfree(tb10x_gpio->domain->gc); irq_domain_remove(tb10x_gpio->domain); } From fff086ebc554b8f7942e3839a758cb8e8dfc945f Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:12 +0200 Subject: [PATCH 1091/3206] gpio: mlxbf: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-8-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mlxbf.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/gpio/gpio-mlxbf.c b/drivers/gpio/gpio-mlxbf.c index 1fa9973f55b96a..843f40496be7b7 100644 --- a/drivers/gpio/gpio-mlxbf.c +++ b/drivers/gpio/gpio-mlxbf.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,7 @@ struct mlxbf_gpio_context_save_regs { /* Device state structure. 
*/ struct mlxbf_gpio_state { - struct gpio_chip gc; + struct gpio_generic_chip chip; /* Memory Address */ void __iomem *base; @@ -49,6 +50,7 @@ struct mlxbf_gpio_state { static int mlxbf_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct mlxbf_gpio_state *gs; struct device *dev = &pdev->dev; struct gpio_chip *gc; @@ -62,21 +64,24 @@ static int mlxbf_gpio_probe(struct platform_device *pdev) if (IS_ERR(gs->base)) return PTR_ERR(gs->base); - gc = &gs->gc; - ret = bgpio_init(gc, dev, 8, - gs->base + MLXBF_GPIO_PIN_STATE, - NULL, - NULL, - gs->base + MLXBF_GPIO_PIN_DIR_O, - gs->base + MLXBF_GPIO_PIN_DIR_I, - 0); + gc = &gs->chip.gc; + + config = (typeof(config)){ + .dev = dev, + .sz = 8, + .dat = gs->base + MLXBF_GPIO_PIN_STATE, + .dirout = gs->base + MLXBF_GPIO_PIN_DIR_O, + .dirin = gs->base + MLXBF_GPIO_PIN_DIR_I, + }; + + ret = gpio_generic_chip_init(&gs->chip, &config); if (ret) return -ENODEV; gc->owner = THIS_MODULE; gc->ngpio = MLXBF_GPIO_NR; - ret = devm_gpiochip_add_data(dev, &gs->gc, gs); + ret = devm_gpiochip_add_data(dev, &gs->chip.gc, gs); if (ret) { dev_err(&pdev->dev, "Failed adding memory mapped gpiochip\n"); return ret; From 74dcb9473054954fcd8b91f7155aba5af3e4d555 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:13 +0200 Subject: [PATCH 1092/3206] gpio: ep93xx: allow building the module with COMPILE_TEST enabled Increase build coverage by allowing the module to be built with COMPILE_TEST=y. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-9-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 2fb77eff3b1f2e..08e1fc13195452 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -303,7 +303,7 @@ config GPIO_EN7523 config GPIO_EP93XX def_bool y - depends on ARCH_EP93XX + depends on ARCH_EP93XX || COMPILE_TEST select GPIO_GENERIC select GPIOLIB_IRQCHIP From a685ac653958f7249e7503d69f4f3fabb0642f45 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:14 +0200 Subject: [PATCH 1093/3206] gpio: ep93xx: order includes alphabetically For better readability and easier maintenance, order the includes alphabetically. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-10-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ep93xx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpio-ep93xx.c b/drivers/gpio/gpio-ep93xx.c index 58d2464c07bc36..08e5ae8bf4d1a8 100644 --- a/drivers/gpio/gpio-ep93xx.c +++ b/drivers/gpio/gpio-ep93xx.c @@ -9,16 +9,16 @@ * linux/arch/arm/mach-ep93xx/core.c */ +#include +#include #include -#include -#include #include #include #include -#include -#include -#include +#include +#include #include +#include struct ep93xx_gpio_irq_chip { void __iomem *base; From ab61c8b6138f4987cd1cbe605d6cc698b4902aea Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:15 +0200 Subject: [PATCH 1094/3206] gpio: ep93xx: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
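One wrinkle of embedding struct gpio_generic_chip instead of a bare struct gpio_chip is that the old one-step container_of() from the gpio_chip pointer no longer works: the gpio_chip now sits one level deeper. to_gpio_generic_chip() recovers the enclosing generic chip first, after which a regular container_of() reaches the driver structure. A sketch with a placeholder struct foo that embeds the generic chip as 'chip':

  static struct foo *to_foo(struct gpio_chip *gc)
  {
          /* gc lives inside foo::chip.gc; unwind both layers */
          return container_of(to_gpio_generic_chip(gc), struct foo, chip);
  }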
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-11-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ep93xx.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpio-ep93xx.c b/drivers/gpio/gpio-ep93xx.c index 08e5ae8bf4d1a8..c6c8170813331b 100644 --- a/drivers/gpio/gpio-ep93xx.c +++ b/drivers/gpio/gpio-ep93xx.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -31,11 +32,14 @@ struct ep93xx_gpio_irq_chip { struct ep93xx_gpio_chip { void __iomem *base; - struct gpio_chip gc; + struct gpio_generic_chip chip; struct ep93xx_gpio_irq_chip *eic; }; -#define to_ep93xx_gpio_chip(x) container_of(x, struct ep93xx_gpio_chip, gc) +static struct ep93xx_gpio_chip *to_ep93xx_gpio_chip(struct gpio_chip *gc) +{ + return container_of(to_gpio_generic_chip(gc), struct ep93xx_gpio_chip, chip); +} static struct ep93xx_gpio_irq_chip *to_ep93xx_gpio_irq_chip(struct gpio_chip *gc) { @@ -267,7 +271,7 @@ static const struct irq_chip gpio_eic_irq_chip = { static int ep93xx_setup_irqs(struct platform_device *pdev, struct ep93xx_gpio_chip *egc) { - struct gpio_chip *gc = &egc->gc; + struct gpio_chip *gc = &egc->chip.gc; struct device *dev = &pdev->dev; struct gpio_irq_chip *girq = &gc->irq; int ret, irq, i; @@ -327,6 +331,7 @@ static int ep93xx_setup_irqs(struct platform_device *pdev, static int ep93xx_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct ep93xx_gpio_chip *egc; struct gpio_chip *gc; void __iomem *data; @@ -345,8 +350,16 @@ static int ep93xx_gpio_probe(struct platform_device *pdev) if (IS_ERR(dir)) return PTR_ERR(dir); - gc = &egc->gc; - ret = bgpio_init(gc, &pdev->dev, 1, data, NULL, NULL, dir, NULL, 0); + gc = &egc->chip.gc; + + config = (typeof(config)){ + .dev = &pdev->dev, + .sz = 1, + .dat = data, + .dirout = dir, + }; + + ret = gpio_generic_chip_init(&egc->chip, &config); if (ret) return dev_err_probe(&pdev->dev, ret, "unable to init generic GPIO\n"); From f3c19e70eb897544160c4b96b9ad6b7d921c9fac Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Sep 2025 10:00:16 +0200 Subject: [PATCH 1095/3206] gpio: mlxbf3: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
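Where only part of a function needs the lock, or the locked region has early error returns, scoped_guard() is used instead: it confines the critical section to a block and still unlocks on any way out of it, including a direct return. A reduced sketch of the shape the .irq_set_type() path takes below; priv and the register programming are placeholders:

  static int foo_irq_set_type(struct irq_data *d, unsigned int type)
  {
          struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
          struct foo *priv = gpiochip_get_data(gc);

          scoped_guard(gpio_generic_lock_irqsave, &priv->chip) {
                  switch (type & IRQ_TYPE_SENSE_MASK) {
                  case IRQ_TYPE_EDGE_RISING:
                          /* program the rising-edge enable register */
                          break;
                  default:
                          return -EINVAL; /* lock is released first */
                  }
          }

          irq_set_handler_locked(d, handle_edge_irq);

          return 0;
  }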
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250903-gpio-mmio-gpio-conv-part3-v1-12-ff346509f408@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mlxbf3.c | 103 +++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/drivers/gpio/gpio-mlxbf3.c b/drivers/gpio/gpio-mlxbf3.c index ed29b07d16c190..c812011bdbe65a 100644 --- a/drivers/gpio/gpio-mlxbf3.c +++ b/drivers/gpio/gpio-mlxbf3.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -42,7 +43,7 @@ #define MLXBF_GPIO_CLR_ALL_INTS GENMASK(31, 0) struct mlxbf3_gpio_context { - struct gpio_chip gc; + struct gpio_generic_chip chip; /* YU GPIO block address */ void __iomem *gpio_set_io; @@ -58,18 +59,17 @@ static void mlxbf3_gpio_irq_enable(struct irq_data *irqd) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf3_gpio_context *gs = gpiochip_get_data(gc); irq_hw_number_t offset = irqd_to_hwirq(irqd); - unsigned long flags; u32 val; gpiochip_enable_irq(gc, offset); - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&gs->chip); + writel(BIT(offset), gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_CLRCAUSE); val = readl(gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); val |= BIT(offset); writel(val, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); } static void mlxbf3_gpio_irq_disable(struct irq_data *irqd) @@ -77,16 +77,15 @@ static void mlxbf3_gpio_irq_disable(struct irq_data *irqd) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf3_gpio_context *gs = gpiochip_get_data(gc); irq_hw_number_t offset = irqd_to_hwirq(irqd); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); - val = readl(gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); - val &= ~BIT(offset); - writel(val, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); + scoped_guard(gpio_generic_lock_irqsave, &gs->chip) { + val = readl(gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); + val &= ~BIT(offset); + writel(val, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); - writel(BIT(offset), gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_CLRCAUSE); - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); + writel(BIT(offset), gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_CLRCAUSE); + } gpiochip_disable_irq(gc, offset); } @@ -94,7 +93,7 @@ static void mlxbf3_gpio_irq_disable(struct irq_data *irqd) static irqreturn_t mlxbf3_gpio_irq_handler(int irq, void *ptr) { struct mlxbf3_gpio_context *gs = ptr; - struct gpio_chip *gc = &gs->gc; + struct gpio_chip *gc = &gs->chip.gc; unsigned long pending; u32 level; @@ -113,37 +112,33 @@ mlxbf3_gpio_irq_set_type(struct irq_data *irqd, unsigned int type) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf3_gpio_context *gs = gpiochip_get_data(gc); irq_hw_number_t offset = irqd_to_hwirq(irqd); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); - - switch (type & IRQ_TYPE_SENSE_MASK) { - case IRQ_TYPE_EDGE_BOTH: - val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); - val |= BIT(offset); - writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); - val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); - val |= BIT(offset); - writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); - break; - case IRQ_TYPE_EDGE_RISING: - val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); - val |= BIT(offset); - writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); - break; - case 
IRQ_TYPE_EDGE_FALLING: - val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); - val |= BIT(offset); - writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); - break; - default: - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); - return -EINVAL; + scoped_guard(gpio_generic_lock_irqsave, &gs->chip) { + switch (type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_BOTH: + val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); + val |= BIT(offset); + writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); + val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); + val |= BIT(offset); + writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); + break; + case IRQ_TYPE_EDGE_RISING: + val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); + val |= BIT(offset); + writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); + break; + case IRQ_TYPE_EDGE_FALLING: + val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); + val |= BIT(offset); + writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); + break; + default: + return -EINVAL; + } } - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); - irq_set_handler_locked(irqd, handle_edge_irq); return 0; @@ -186,6 +181,7 @@ static int mlxbf3_gpio_add_pin_ranges(struct gpio_chip *chip) static int mlxbf3_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct mlxbf3_gpio_context *gs; struct gpio_irq_chip *girq; @@ -211,16 +207,23 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) gs->gpio_clr_io = devm_platform_ioremap_resource(pdev, 3); if (IS_ERR(gs->gpio_clr_io)) return PTR_ERR(gs->gpio_clr_io); - gc = &gs->gc; - - ret = bgpio_init(gc, dev, 4, - gs->gpio_io + MLXBF_GPIO_READ_DATA_IN, - gs->gpio_set_io + MLXBF_GPIO_FW_DATA_OUT_SET, - gs->gpio_clr_io + MLXBF_GPIO_FW_DATA_OUT_CLEAR, - gs->gpio_set_io + MLXBF_GPIO_FW_OUTPUT_ENABLE_SET, - gs->gpio_clr_io + MLXBF_GPIO_FW_OUTPUT_ENABLE_CLEAR, 0); + gc = &gs->chip.gc; + + config = (typeof(config)){ + .dev = dev, + .sz = 4, + .dat = gs->gpio_io + MLXBF_GPIO_READ_DATA_IN, + .set = gs->gpio_set_io + MLXBF_GPIO_FW_DATA_OUT_SET, + .clr = gs->gpio_clr_io + MLXBF_GPIO_FW_DATA_OUT_CLEAR, + .dirout = gs->gpio_set_io + MLXBF_GPIO_FW_OUTPUT_ENABLE_SET, + .dirin = gs->gpio_clr_io + MLXBF_GPIO_FW_OUTPUT_ENABLE_CLEAR, + }; + + ret = gpio_generic_chip_init(&gs->chip, &config); if (ret) - return dev_err_probe(dev, ret, "%s: bgpio_init() failed", __func__); + return dev_err_probe(dev, ret, + "%s: failed to initialize the generic GPIO chip", + __func__); gc->request = gpiochip_generic_request; gc->free = gpiochip_generic_free; @@ -229,7 +232,7 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) irq = platform_get_irq_optional(pdev, 0); if (irq >= 0) { - girq = &gs->gc.irq; + girq = &gs->chip.gc.irq; gpio_irq_chip_set_chip(girq, &gpio_mlxbf3_irqchip); girq->default_type = IRQ_TYPE_NONE; /* This will let us handle the parent IRQ in the driver */ @@ -250,7 +253,7 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, gs); - ret = devm_gpiochip_add_data(dev, &gs->gc, gs); + ret = devm_gpiochip_add_data(dev, gc, gs); if (ret) dev_err_probe(dev, ret, "Failed adding memory mapped gpiochip\n"); From d3e7efad8fbaf0c2d6f039ae074a20c3aa89bd12 Mon Sep 17 00:00:00 2001 From: Akhilesh Patil Date: Thu, 4 Sep 2025 20:46:09 +0530 Subject: [PATCH 1096/3206] gpio: Kconfig: Update help for GPIO_PCA953X Update help description with supported ICs from gpio-pca953x.c Include missing IC names. 
Signed-off-by: Akhilesh Patil Link: https://lore.kernel.org/r/aLmtOWjAWPtWe/gH@bhairav-test.ee.iitb.ac.in Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 08e1fc13195452..a4b7b530f22598 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1194,14 +1194,18 @@ config GPIO_PCA953X 4 bits: pca9536, pca9537 8 bits: max7310, max7315, pca6107, pca9534, pca9538, pca9554, - pca9556, pca9557, pca9574, tca6408, tca9554, xra1202 + pca9556, pca9557, pca9574, tca6408, tca9554, xra1202, + pcal6408, pcal9554b, tca9538 16 bits: max7312, max7313, pca9535, pca9539, pca9555, pca9575, - tca6416 + tca6416, pca6416, pcal6416, pcal9535, pcal9555a, max7318, + tca9539 - 24 bits: tca6424 + 18 bits: tca6418 - 40 bits: pca9505, pca9698 + 24 bits: tca6424, pcal6524 + + 40 bits: pca9505, pca9698, pca9506 config GPIO_PCA953X_IRQ bool "Interrupt controller support for PCA953x" From 084d01a173f5f41afd326b1dfe73085972530ca7 Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Thu, 4 Sep 2025 01:34:36 +0000 Subject: [PATCH 1097/3206] dt-bindings: gpio: loongson: Document GPIO controller of LS2K0300 SoC Loongson-2K0300 ships a GPIO controller whose input/output control logic is similar to previous generation of SoCs. Additionally, it acts as an interrupt-controller supporting both level and edge interrupts and has a distinct reset signal. Describe its compatible in devicetree. We enlarge the maximum value of ngpios to 128, since the controller technically supports at most 128 pins, although only 106 are routed out of the package. Properties for interrupt-controllers and resets are introduced and limited as LS2K0300 only. Signed-off-by: Yao Zi Reviewed-by: Krzysztof Kozlowski Reviewed-by: Huacai Chen Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250904013438.2405-2-ziyao@disroot.org Signed-off-by: Bartosz Golaszewski --- .../bindings/gpio/loongson,ls-gpio.yaml | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml b/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml index b68159600e2bd8..69852444df239e 100644 --- a/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml @@ -14,6 +14,7 @@ properties: oneOf: - enum: - loongson,ls2k-gpio + - loongson,ls2k0300-gpio - loongson,ls2k0500-gpio0 - loongson,ls2k0500-gpio1 - loongson,ls2k2000-gpio0 @@ -36,7 +37,7 @@ properties: ngpios: minimum: 1 - maximum: 64 + maximum: 128 "#gpio-cells": const: 2 @@ -49,6 +50,14 @@ properties: minItems: 1 maxItems: 64 + "#interrupt-cells": + const: 2 + + interrupt-controller: true + + resets: + maxItems: 1 + required: - compatible - reg @@ -58,6 +67,23 @@ required: - gpio-ranges - interrupts +allOf: + - if: + properties: + compatible: + contains: + const: loongson,ls2k0300-gpio + then: + required: + - "#interrupt-cells" + - interrupt-controller + - resets + else: + properties: + "#interrupts-cells": false + interrupt-controller: false + resets: false + additionalProperties: false examples: From 03c146cb6cd14fdab2d2c7ab1b4e8035b54df8cc Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Thu, 4 Sep 2025 01:34:37 +0000 Subject: [PATCH 1098/3206] gpio: loongson-64bit: Add support for Loongson-2K0300 SoC This controller's input and output logic is similar to previous generations of SoCs. 
Additionally, it's capable of interrupt masking, and could be configured to detect levels and edges, and is supplied with a distinct reset signal. The interrupt functionality is implemented through an irqchip, whose operations are written with previous generation SoCs in mind and could be reused. Since all Loongson SoCs with similar interrupt capability (LS2K1500, LS2K2000) support byte-control mode, these operations are for byte-control mode only for simplicity. Signed-off-by: Yao Zi Reviewed-by: Huacai Chen Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250904013438.2405-3-ziyao@disroot.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 1 + drivers/gpio/gpio-loongson-64bit.c | 189 +++++++++++++++++++++++++++-- 2 files changed, 183 insertions(+), 7 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index a4b7b530f22598..31f8bab4b09df1 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -436,6 +436,7 @@ config GPIO_LOONGSON_64BIT depends on LOONGARCH || COMPILE_TEST depends on OF_GPIO select GPIO_GENERIC + select GPIOLIB_IRQCHIP help Say yes here to support the GPIO functionality of a number of Loongson series of chips. The Loongson GPIO controller supports diff --git a/drivers/gpio/gpio-loongson-64bit.c b/drivers/gpio/gpio-loongson-64bit.c index 482e64ba9b4209..f84f8c53724943 100644 --- a/drivers/gpio/gpio-loongson-64bit.c +++ b/drivers/gpio/gpio-loongson-64bit.c @@ -7,6 +7,8 @@ #include #include +#include +#include #include #include #include @@ -14,6 +16,7 @@ #include #include #include +#include #include enum loongson_gpio_mode { @@ -28,6 +31,14 @@ struct loongson_gpio_chip_data { unsigned int out_offset; unsigned int in_offset; unsigned int inten_offset; + unsigned int intpol_offset; + unsigned int intedge_offset; + unsigned int intclr_offset; + unsigned int intsts_offset; + unsigned int intdual_offset; + unsigned int intr_num; + irq_flow_handler_t irq_handler; + const struct irq_chip *girqchip; }; struct loongson_gpio_chip { @@ -137,7 +148,140 @@ static int loongson_gpio_to_irq(struct gpio_chip *chip, unsigned int offset) return platform_get_irq(pdev, offset); } -static int loongson_gpio_init(struct device *dev, struct loongson_gpio_chip *lgpio, +static void loongson_gpio_irq_ack(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct loongson_gpio_chip *lgpio = to_loongson_gpio_chip(chip); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + writeb(0x1, lgpio->reg_base + lgpio->chip_data->intclr_offset + hwirq); +} + +static void loongson_gpio_irq_mask(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct loongson_gpio_chip *lgpio = to_loongson_gpio_chip(chip); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + writeb(0x0, lgpio->reg_base + lgpio->chip_data->inten_offset + hwirq); +} + +static void loongson_gpio_irq_unmask(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct loongson_gpio_chip *lgpio = to_loongson_gpio_chip(chip); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + writeb(0x1, lgpio->reg_base + lgpio->chip_data->inten_offset + hwirq); +} + +static int loongson_gpio_irq_set_type(struct irq_data *data, unsigned int type) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct loongson_gpio_chip *lgpio = to_loongson_gpio_chip(chip); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + u8 pol = 0, edge = 0, dual = 0; + + if ((type & IRQ_TYPE_SENSE_MASK) == IRQ_TYPE_EDGE_BOTH) { + 
edge = 1; + dual = 1; + irq_set_handler_locked(data, handle_edge_irq); + } else { + switch (type) { + case IRQ_TYPE_LEVEL_HIGH: + pol = 1; + fallthrough; + case IRQ_TYPE_LEVEL_LOW: + irq_set_handler_locked(data, handle_level_irq); + break; + + case IRQ_TYPE_EDGE_RISING: + pol = 1; + fallthrough; + case IRQ_TYPE_EDGE_FALLING: + edge = 1; + irq_set_handler_locked(data, handle_edge_irq); + break; + + default: + return -EINVAL; + }; + } + + writeb(pol, lgpio->reg_base + lgpio->chip_data->intpol_offset + hwirq); + writeb(edge, lgpio->reg_base + lgpio->chip_data->intedge_offset + hwirq); + writeb(dual, lgpio->reg_base + lgpio->chip_data->intdual_offset + hwirq); + + return 0; +} + +static void loongson_gpio_ls2k0300_irq_handler(struct irq_desc *desc) +{ + struct loongson_gpio_chip *lgpio = irq_desc_get_handler_data(desc); + struct irq_chip *girqchip = irq_desc_get_chip(desc); + int i; + + chained_irq_enter(girqchip, desc); + + for (i = 0; i < lgpio->chip.gc.ngpio; i++) { + /* + * For the GPIO controller of LS2K0300, interrupts status bits + * may be wrongly set even if the corresponding interrupt is + * disabled. Thus interrupt enable bits are checked along with + * status bits to detect interrupts reliably. + */ + if (readb(lgpio->reg_base + lgpio->chip_data->intsts_offset + i) && + readb(lgpio->reg_base + lgpio->chip_data->inten_offset + i)) + generic_handle_domain_irq(lgpio->chip.gc.irq.domain, i); + } + + chained_irq_exit(girqchip, desc); +} + +static const struct irq_chip loongson_gpio_ls2k0300_irqchip = { + .irq_ack = loongson_gpio_irq_ack, + .irq_mask = loongson_gpio_irq_mask, + .irq_unmask = loongson_gpio_irq_unmask, + .irq_set_type = loongson_gpio_irq_set_type, + .flags = IRQCHIP_IMMUTABLE | IRQCHIP_SKIP_SET_WAKE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + +static int loongson_gpio_init_irqchip(struct platform_device *pdev, + struct loongson_gpio_chip *lgpio) +{ + const struct loongson_gpio_chip_data *data = lgpio->chip_data; + struct gpio_chip *chip = &lgpio->chip.gc; + int i; + + chip->irq.default_type = IRQ_TYPE_NONE; + chip->irq.handler = handle_bad_irq; + chip->irq.parent_handler = data->irq_handler; + chip->irq.parent_handler_data = lgpio; + gpio_irq_chip_set_chip(&chip->irq, data->girqchip); + + chip->irq.num_parents = data->intr_num; + chip->irq.parents = devm_kcalloc(&pdev->dev, data->intr_num, + sizeof(*chip->irq.parents), GFP_KERNEL); + if (!chip->parent) + return -ENOMEM; + + for (i = 0; i < data->intr_num; i++) { + chip->irq.parents[i] = platform_get_irq(pdev, i); + if (chip->irq.parents[i] < 0) + return dev_err_probe(&pdev->dev, chip->irq.parents[i], + "failed to get IRQ %d\n", i); + } + + for (i = 0; i < data->intr_num; i++) { + writeb(0x0, lgpio->reg_base + data->inten_offset + i); + writeb(0x1, lgpio->reg_base + data->intclr_offset + i); + } + + return 0; +} + +static int loongson_gpio_init(struct platform_device *pdev, struct loongson_gpio_chip *lgpio, void __iomem *reg_base) { struct gpio_generic_chip_config config; @@ -146,7 +290,7 @@ static int loongson_gpio_init(struct device *dev, struct loongson_gpio_chip *lgp lgpio->reg_base = reg_base; if (lgpio->chip_data->mode == BIT_CTRL_MODE) { config = (typeof(config)){ - .dev = dev, + .dev = &pdev->dev, .sz = 8, .dat = lgpio->reg_base + lgpio->chip_data->in_offset, .set = lgpio->reg_base + lgpio->chip_data->out_offset, @@ -155,7 +299,7 @@ static int loongson_gpio_init(struct device *dev, struct loongson_gpio_chip *lgp ret = gpio_generic_chip_init(&lgpio->chip, &config); if (ret) { - dev_err(dev, "unable to init generic 
GPIO\n"); + dev_err(&pdev->dev, "unable to init generic GPIO\n"); return ret; } } else { @@ -164,16 +308,21 @@ static int loongson_gpio_init(struct device *dev, struct loongson_gpio_chip *lgp lgpio->chip.gc.get_direction = loongson_gpio_get_direction; lgpio->chip.gc.direction_output = loongson_gpio_direction_output; lgpio->chip.gc.set = loongson_gpio_set; - lgpio->chip.gc.parent = dev; + lgpio->chip.gc.parent = &pdev->dev; spin_lock_init(&lgpio->lock); } lgpio->chip.gc.label = lgpio->chip_data->label; lgpio->chip.gc.can_sleep = false; - if (lgpio->chip_data->inten_offset) + if (lgpio->chip_data->girqchip) { + ret = loongson_gpio_init_irqchip(pdev, lgpio); + if (ret) + return dev_err_probe(&pdev->dev, ret, "failed to initialize irqchip\n"); + } else if (lgpio->chip_data->inten_offset) { lgpio->chip.gc.to_irq = loongson_gpio_to_irq; + } - return devm_gpiochip_add_data(dev, &lgpio->chip.gc, lgpio); + return devm_gpiochip_add_data(&pdev->dev, &lgpio->chip.gc, lgpio); } static int loongson_gpio_probe(struct platform_device *pdev) @@ -181,6 +330,7 @@ static int loongson_gpio_probe(struct platform_device *pdev) void __iomem *reg_base; struct loongson_gpio_chip *lgpio; struct device *dev = &pdev->dev; + struct reset_control *rst; lgpio = devm_kzalloc(dev, sizeof(*lgpio), GFP_KERNEL); if (!lgpio) @@ -192,7 +342,11 @@ static int loongson_gpio_probe(struct platform_device *pdev) if (IS_ERR(reg_base)) return PTR_ERR(reg_base); - return loongson_gpio_init(dev, lgpio, reg_base); + rst = devm_reset_control_get_optional_exclusive_deasserted(&pdev->dev, NULL); + if (IS_ERR(rst)) + return dev_err_probe(&pdev->dev, PTR_ERR(rst), "failed to get reset control\n"); + + return loongson_gpio_init(pdev, lgpio, reg_base); } static const struct loongson_gpio_chip_data loongson_gpio_ls2k_data = { @@ -204,6 +358,23 @@ static const struct loongson_gpio_chip_data loongson_gpio_ls2k_data = { .inten_offset = 0x30, }; +static const struct loongson_gpio_chip_data loongson_gpio_ls2k0300_data = { + .label = "ls2k0300_gpio", + .mode = BYTE_CTRL_MODE, + .conf_offset = 0x800, + .in_offset = 0xa00, + .out_offset = 0x900, + .inten_offset = 0xb00, + .intpol_offset = 0xc00, + .intedge_offset = 0xd00, + .intclr_offset = 0xe00, + .intsts_offset = 0xf00, + .intdual_offset = 0xf80, + .intr_num = 7, + .irq_handler = loongson_gpio_ls2k0300_irq_handler, + .girqchip = &loongson_gpio_ls2k0300_irqchip, +}; + static const struct loongson_gpio_chip_data loongson_gpio_ls2k0500_data0 = { .label = "ls2k0500_gpio", .mode = BIT_CTRL_MODE, @@ -300,6 +471,10 @@ static const struct of_device_id loongson_gpio_of_match[] = { .compatible = "loongson,ls2k-gpio", .data = &loongson_gpio_ls2k_data, }, + { + .compatible = "loongson,ls2k0300-gpio", + .data = &loongson_gpio_ls2k0300_data, + }, { .compatible = "loongson,ls2k0500-gpio0", .data = &loongson_gpio_ls2k0500_data0, From 71d2893a235bf3b95baccead27b3d47f2f2cdc4c Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Mon, 8 Sep 2025 06:27:27 +0800 Subject: [PATCH 1099/3206] ALSA: hda/tas2781: Fix the order of TAS2781 calibrated-data A bug reported by one of my customers that the order of TAS2781 calibrated-data is incorrect, the correct way is to move R0_Low and insert it between R0 and InvR0. 
Fixes: 4fe238513407 ("ALSA: hda/tas2781: Move and unified the calibrated-data getting function for SPI and I2C into the tas2781_hda lib") Signed-off-by: Shenghao Ding Link: https://patch.msgid.link/20250907222728.988-1-shenghao-ding@ti.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/side-codecs/tas2781_hda.c | 25 +++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/sound/hda/codecs/side-codecs/tas2781_hda.c b/sound/hda/codecs/side-codecs/tas2781_hda.c index f46d2e06c64f34..536940c78f001d 100644 --- a/sound/hda/codecs/side-codecs/tas2781_hda.c +++ b/sound/hda/codecs/side-codecs/tas2781_hda.c @@ -33,6 +33,23 @@ const efi_guid_t tasdev_fct_efi_guid[] = { }; EXPORT_SYMBOL_NS_GPL(tasdev_fct_efi_guid, "SND_HDA_SCODEC_TAS2781"); +/* + * The order of calibrated-data writing function is a bit different from the + * order in UEFI. Here is the conversion to match the order of calibrated-data + * writing function. + */ +static void cali_cnv(unsigned char *data, unsigned int base, int offset) +{ + struct cali_reg reg_data; + + memcpy(®_data, &data[base], sizeof(reg_data)); + /* the data order has to be swapped between r0_low_reg and inv0_reg */ + swap(reg_data.r0_low_reg, reg_data.invr0_reg); + + cpu_to_be32_array((__force __be32 *)(data + offset + 1), + (u32 *)®_data, TASDEV_CALIB_N); +} + static void tas2781_apply_calib(struct tasdevice_priv *p) { struct calidata *cali_data = &p->cali_data; @@ -103,8 +120,7 @@ static void tas2781_apply_calib(struct tasdevice_priv *p) data[l] = k; oft++; - for (i = 0; i < TASDEV_CALIB_N * 4; i++) - data[l + i + 1] = data[4 * oft + i]; + cali_cnv(data, 4 * oft, l); k++; } } @@ -130,9 +146,8 @@ static void tas2781_apply_calib(struct tasdevice_priv *p) for (j = p->ndev - 1; j >= 0; j--) { l = j * (cali_data->cali_dat_sz_per_dev + 1); - for (i = TASDEV_CALIB_N * 4; i > 0 ; i--) - data[l + i] = data[p->index * 5 + i]; - data[l+i] = j; + cali_cnv(data, cali_data->cali_dat_sz_per_dev * j, l); + data[l] = j; } } From 474014cdec1758e1802082b94043189e198c58a4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 7 Sep 2025 15:25:38 +0200 Subject: [PATCH 1100/3206] gpio: pisosr: Use devm_mutex_init() Use devm_mutex_init() instead of hand-writing it. This saves some LoC, improves readability and saves some space in the generated .o file. 
Before: ====== text data bss dec hex filename 8431 1808 192 10431 28bf drivers/gpio/gpio-pisosr.o After: ===== text data bss dec hex filename 8112 1736 192 10040 2738 drivers/gpio/gpio-pisosr.o Signed-off-by: Christophe JAILLET Acked-by: Andrew Davis Link: https://lore.kernel.org/r/01910ebdaba7d8d0cdc4ac60eb70da8e29cb85f1.1757251512.git.christophe.jaillet@wanadoo.fr Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pisosr.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/gpio/gpio-pisosr.c b/drivers/gpio/gpio-pisosr.c index a69b74866a1344..7ec6a46ed60071 100644 --- a/drivers/gpio/gpio-pisosr.c +++ b/drivers/gpio/gpio-pisosr.c @@ -108,11 +108,6 @@ static const struct gpio_chip template_chip = { .can_sleep = true, }; -static void pisosr_mutex_destroy(void *lock) -{ - mutex_destroy(lock); -} - static int pisosr_gpio_probe(struct spi_device *spi) { struct device *dev = &spi->dev; @@ -139,8 +134,7 @@ static int pisosr_gpio_probe(struct spi_device *spi) return dev_err_probe(dev, PTR_ERR(gpio->load_gpio), "Unable to allocate load GPIO\n"); - mutex_init(&gpio->lock); - ret = devm_add_action_or_reset(dev, pisosr_mutex_destroy, &gpio->lock); + ret = devm_mutex_init(dev, &gpio->lock); if (ret) return ret; From cba4262a19afae21665ee242b3404bcede5a94d7 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 1 Sep 2025 17:04:15 +0000 Subject: [PATCH 1101/3206] x86/cpu/topology: Always try cpu_parse_topology_ext() on AMD/Hygon Support for parsing the topology on AMD/Hygon processors using CPUID leaf 0xb was added in 3986a0a805e6 ("x86/CPU/AMD: Derive CPU topology from CPUID function 0xB when available"). In an effort to keep all the topology parsing bits in one place, this commit also introduced a pseudo dependency on the TOPOEXT feature to parse the CPUID leaf 0xb. The TOPOEXT feature (CPUID 0x80000001 ECX[22]) advertises the support for Cache Properties leaf 0x8000001d and the CPUID leaf 0x8000001e EAX for "Extended APIC ID" however support for 0xb was introduced alongside the x2APIC support not only on AMD [1], but also historically on x86 [2]. Similar to 0xb, the support for extended CPU topology leaf 0x80000026 too does not depend on the TOPOEXT feature. The support for these leaves is expected to be confirmed by ensuring leaf <= {extended_}cpuid_level and then parsing the level 0 of the respective leaf to confirm EBX[15:0] (LogProcAtThisLevel) is non-zero as stated in the definition of "CPUID_Fn0000000B_EAX_x00 [Extended Topology Enumeration] (Core::X86::Cpuid::ExtTopEnumEax0)" in Processor Programming Reference (PPR) for AMD Family 19h Model 01h Rev B1 Vol1 [3] Sec. 2.1.15.1 "CPUID Instruction Functions". This has not been a problem on baremetal platforms since support for TOPOEXT (Fam 0x15 and later) predates the support for CPUID leaf 0xb (Fam 0x17[Zen2] and later), however, for AMD guests on QEMU, the "x2apic" feature can be enabled independent of the "topoext" feature where QEMU expects topology and the initial APICID to be parsed using the CPUID leaf 0xb (especially when number of cores > 255) which is populated independent of the "topoext" feature flag. Unconditionally call cpu_parse_topology_ext() on AMD and Hygon processors to first parse the topology using the XTOPOLOGY leaves (0x80000026 / 0xb) before using the TOPOEXT leaf (0x8000001e). While at it, break down the single large comment in parse_topology_amd() to better highlight the purpose of each CPUID leaf. 
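For reference, the presence check described above boils down to something like the following sketch (not code from this patch; cpuid_count() and boot_cpu_data are the usual x86 helpers):

  static bool leaf_0xb_implemented(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (boot_cpu_data.cpuid_level < 0x0b)
                  return false;

          cpuid_count(0x0b, 0, &eax, &ebx, &ecx, &edx);

          /* EBX[15:0] is LogProcAtThisLevel; zero means not implemented */
          return (ebx & 0xffff) != 0;
  }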
Fixes: 3986a0a805e6 ("x86/CPU/AMD: Derive CPU topology from CPUID function 0xB when available") Suggested-by: Naveen N Rao (AMD) Signed-off-by: K Prateek Nayak Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org # Only v6.9 and above; depends on x86 topology rewrite Link: https://lore.kernel.org/lkml/1529686927-7665-1-git-send-email-suravee.suthikulpanit@amd.com/ [1] Link: https://lore.kernel.org/lkml/20080818181435.523309000@linux-os.sc.intel.com/ [2] Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 [3] --- arch/x86/kernel/cpu/topology_amd.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 827dd0dbb6e9d2..c79ebbb639cbff 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -175,27 +175,30 @@ static void topoext_fixup(struct topo_scan *tscan) static void parse_topology_amd(struct topo_scan *tscan) { - bool has_topoext = false; - /* - * If the extended topology leaf 0x8000_001e is available - * try to get SMT, CORE, TILE, and DIE shifts from extended + * Try to get SMT, CORE, TILE, and DIE shifts from extended * CPUID leaf 0x8000_0026 on supported processors first. If * extended CPUID leaf 0x8000_0026 is not supported, try to - * get SMT and CORE shift from leaf 0xb first, then try to - * get the CORE shift from leaf 0x8000_0008. + * get SMT and CORE shift from leaf 0xb. If either leaf is + * available, cpu_parse_topology_ext() will return true. */ - if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) - has_topoext = cpu_parse_topology_ext(tscan); + bool has_xtopology = cpu_parse_topology_ext(tscan); if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); - if (!has_topoext && !parse_8000_0008(tscan)) + /* + * If XTOPOLOGY leaves (0x26/0xb) are not available, try to + * get the CORE shift from leaf 0x8000_0008 first. + */ + if (!has_xtopology && !parse_8000_0008(tscan)) return; - /* Prefer leaf 0x8000001e if available */ - if (parse_8000_001e(tscan, has_topoext)) + /* + * Prefer leaf 0x8000001e if available to get the SMT shift and + * the initial APIC ID if XTOPOLOGY leaves are not available. + */ + if (parse_8000_001e(tscan, has_xtopology)) return; /* Try the NODEID MSR */ From 30247fc3254a4502465ab809d49e2167ace5a6f2 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Sun, 17 Aug 2025 15:30:19 +0100 Subject: [PATCH 1102/3206] pinctrl: renesas: rzg2l: Fix OEN resume Writing to the PFC_OEN register is controlled by the write protect register (PWPR). Currently the OEN register write in resume() is done without enabling write access in PWPR leading to incorrect operation. 
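Concretely, on SoCs that protect OEN behind PWPR (hwcfg->oen_pwpr_lock), the restore has to be bracketed by a PWPR unlock/re-lock. A condensed sketch of the sequence the fix adds, with the oen_pwpr_lock conditionals dropped for clarity:

  spin_lock_irqsave(&pctrl->lock, flags);

  pwpr = readb(pctrl->base + regs->pwpr);
  writeb(pwpr | PWPR_REGWE_B, pctrl->base + regs->pwpr); /* allow OEN writes */

  writeb(cache->oen, pctrl->base + regs->oen);

  writeb(pwpr & ~PWPR_REGWE_B, pctrl->base + regs->pwpr); /* write-protect again */

  spin_unlock_irqrestore(&pctrl->lock, flags);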
Fixes: cd39805be85b ("pinctrl: renesas: rzg2l: Unify OEN handling across RZ/{G2L,V2H,V2N}") Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven [geert: Move spin_*lock*() calls inside if-statements] Link: https://lore.kernel.org/20250817143024.165471-2-biju.das.jz@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index b182b3b8a542a2..f72814dab38ce3 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -3165,6 +3165,8 @@ static int rzg2l_pinctrl_resume_noirq(struct device *dev) const struct rzg2l_hwcfg *hwcfg = pctrl->data->hwcfg; const struct rzg2l_register_offsets *regs = &hwcfg->regs; struct rzg2l_pinctrl_reg_cache *cache = pctrl->cache; + unsigned long flags; + u8 pwpr; int ret; if (!atomic_read(&pctrl->wakeup_path)) { @@ -3174,7 +3176,16 @@ static int rzg2l_pinctrl_resume_noirq(struct device *dev) } writeb(cache->qspi, pctrl->base + QSPI); + if (pctrl->data->hwcfg->oen_pwpr_lock) { + spin_lock_irqsave(&pctrl->lock, flags); + pwpr = readb(pctrl->base + regs->pwpr); + writeb(pwpr | PWPR_REGWE_B, pctrl->base + regs->pwpr); + } writeb(cache->oen, pctrl->base + pctrl->data->hwcfg->regs.oen); + if (pctrl->data->hwcfg->oen_pwpr_lock) { + writeb(pwpr & ~PWPR_REGWE_B, pctrl->base + regs->pwpr); + spin_unlock_irqrestore(&pctrl->lock, flags); + } for (u8 i = 0; i < 2; i++) { if (regs->sd_ch) writeb(cache->sd_ch[i], pctrl->base + SD_CH(regs->sd_ch, i)); From 837afa592c6234be82acb5d23e0a39e9befdaa85 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 19 Aug 2025 09:40:20 +0100 Subject: [PATCH 1103/3206] pinctrl: renesas: rzg2l: Add suspend/resume support for Schmitt control registers Renesas RZ/G3E supports a power-saving mode where power to most of the SoC components is lost, including the PIN controller. Save and restore the Schmitt control register contents to ensure the functionality is preserved after a suspend/resume cycle. 
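The patch hooks SMT into the driver's existing register-cache walk. Per port, the accessor amounts to the sketch below, assuming RZG2L_PCTRL_REG_ACCESS32() reads into the cache on suspend and writes back on resume (which is how the other cached registers are handled):

  if (suspend)
          cache->smt[port] = readl(pctrl->base + SMT(off));
  else
          writel(cache->smt[port], pctrl->base + SMT(off));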
Signed-off-by: Biju Das Reviewed-by: Claudiu Beznea Tested-by: Claudiu Beznea # on RZ/G3S Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250819084022.20512-1-biju.das.jz@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index f72814dab38ce3..7b8219f2006399 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -321,6 +321,7 @@ struct rzg2l_pinctrl_pin_settings { * @iolh: IOLH registers cache * @pupd: PUPD registers cache * @ien: IEN registers cache + * @smt: SMT registers cache * @sd_ch: SD_CH registers cache * @eth_poc: ET_POC registers cache * @oen: Output Enable register cache @@ -334,6 +335,7 @@ struct rzg2l_pinctrl_reg_cache { u32 *iolh[2]; u32 *ien[2]; u32 *pupd[2]; + u32 *smt; u8 sd_ch[2]; u8 eth_poc[2]; u8 oen; @@ -2704,6 +2706,10 @@ static int rzg2l_pinctrl_reg_cache_alloc(struct rzg2l_pinctrl *pctrl) if (!cache->pfc) return -ENOMEM; + cache->smt = devm_kcalloc(pctrl->dev, nports, sizeof(*cache->smt), GFP_KERNEL); + if (!cache->smt) + return -ENOMEM; + for (u8 i = 0; i < 2; i++) { u32 n_dedicated_pins = pctrl->data->n_dedicated_pins; @@ -2965,7 +2971,7 @@ static void rzg2l_pinctrl_pm_setup_regs(struct rzg2l_pinctrl *pctrl, bool suspen struct rzg2l_pinctrl_reg_cache *cache = pctrl->cache; for (u32 port = 0; port < nports; port++) { - bool has_iolh, has_ien, has_pupd; + bool has_iolh, has_ien, has_pupd, has_smt; u32 off, caps; u8 pincnt; u64 cfg; @@ -2978,6 +2984,7 @@ static void rzg2l_pinctrl_pm_setup_regs(struct rzg2l_pinctrl *pctrl, bool suspen has_iolh = !!(caps & (PIN_CFG_IOLH_A | PIN_CFG_IOLH_B | PIN_CFG_IOLH_C)); has_ien = !!(caps & PIN_CFG_IEN); has_pupd = !!(caps & PIN_CFG_PUPD); + has_smt = !!(caps & PIN_CFG_SMT); if (suspend) RZG2L_PCTRL_REG_ACCESS32(suspend, pctrl->base + PFC(off), cache->pfc[port]); @@ -3016,6 +3023,9 @@ static void rzg2l_pinctrl_pm_setup_regs(struct rzg2l_pinctrl *pctrl, bool suspen cache->ien[1][port]); } } + + if (has_smt) + RZG2L_PCTRL_REG_ACCESS32(suspend, pctrl->base + SMT(off), cache->smt[port]); } } From 9f062fc5b0ff44550088912ab89f9da40226a826 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sun, 31 Aug 2025 16:49:58 +0800 Subject: [PATCH 1104/3206] pinctrl: renesas: Use int type to store negative error codes Change the 'ret' variable in sh_pfc_pinconf_group_set() from unsigned int to int, as it needs to store either negative error codes or zero returned by sh_pfc_pinconf_set(). No effect on runtime. 
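The old type is only harmless because the value round-trips through the function's int return type; it would still defeat any direct sign check, for example:

  unsigned int ret = -EINVAL;     /* stored as 0xffffffea */

  if (ret < 0)                    /* always false: ret is unsigned */
          pr_err("unreachable\n");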
Signed-off-by: Qianfeng Rong Fixes: d0593c363f04ccc4 ("pinctrl: sh-pfc: Propagate errors on group config") Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250831084958.431913-4-rongqianfeng@vivo.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/renesas/pinctrl.c b/drivers/pinctrl/renesas/pinctrl.c index 29d16c9c1bd194..3a742f74ecd1dc 100644 --- a/drivers/pinctrl/renesas/pinctrl.c +++ b/drivers/pinctrl/renesas/pinctrl.c @@ -726,7 +726,8 @@ static int sh_pfc_pinconf_group_set(struct pinctrl_dev *pctldev, unsigned group, struct sh_pfc_pinctrl *pmx = pinctrl_dev_get_drvdata(pctldev); const unsigned int *pins; unsigned int num_pins; - unsigned int i, ret; + unsigned int i; + int ret; pins = pmx->pfc->info->groups[group].pins; num_pins = pmx->pfc->info->groups[group].nr_pins; From 512bf60226c621893729605121a396240e84a5d4 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 5 Sep 2025 00:27:44 +0200 Subject: [PATCH 1105/3206] pinctrl: renesas: r8a779g0: Fix trivial typo in SoC type comment Fix SoC type comment in the PFC table file, replace R8A779A0 likely copy-paste error with R8A779G0. No functional change. Signed-off-by: Marek Vasut Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250904222806.193260-1-marek.vasut+renesas@mailbox.org Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pfc-r8a779g0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/renesas/pfc-r8a779g0.c b/drivers/pinctrl/renesas/pfc-r8a779g0.c index cae3e65534997d..218c5eff9b67fe 100644 --- a/drivers/pinctrl/renesas/pfc-r8a779g0.c +++ b/drivers/pinctrl/renesas/pfc-r8a779g0.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * R8A779A0 processor support - PFC hardware block. + * R8A779G0 processor support - PFC hardware block. * * Copyright (C) 2021 Renesas Electronics Corp. * From 3c847e17225aa9481fc3f6c948f5c718dea526f1 Mon Sep 17 00:00:00 2001 From: Benno Lossin Date: Thu, 14 Aug 2025 11:30:28 +0200 Subject: [PATCH 1106/3206] rust: add `pin-init` as a dependency to `bindings` and `uapi` This allows `bindings` and `uapi` to implement `Zeroable` and use other items from pin-init. 
Co-developed-by: Miguel Ojeda Signed-off-by: Miguel Ojeda Link: https://rust-for-linux.zulipchat.com/#narrow/channel/291565-Help/topic/Zeroable.20trait.20for.20C.20structs/near/510264158 Signed-off-by: Benno Lossin Reviewed-by: Alice Ryhl Signed-off-by: Miguel Ojeda --- rust/Makefile | 14 ++++++++------ scripts/generate_rust_analyzer.py | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/rust/Makefile b/rust/Makefile index 29c941024e6f90..23c7ae905bd2f0 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -199,12 +199,12 @@ rusttestlib-kernel: $(src)/kernel/lib.rs rusttestlib-bindings rusttestlib-uapi \ $(obj)/bindings.o FORCE +$(call if_changed,rustc_test_library) -rusttestlib-bindings: private rustc_target_flags = --extern ffi -rusttestlib-bindings: $(src)/bindings/lib.rs rusttestlib-ffi FORCE +rusttestlib-bindings: private rustc_target_flags = --extern ffi --extern pin_init +rusttestlib-bindings: $(src)/bindings/lib.rs rusttestlib-ffi rusttestlib-pin_init FORCE +$(call if_changed,rustc_test_library) -rusttestlib-uapi: private rustc_target_flags = --extern ffi -rusttestlib-uapi: $(src)/uapi/lib.rs rusttestlib-ffi FORCE +rusttestlib-uapi: private rustc_target_flags = --extern ffi --extern pin_init +rusttestlib-uapi: $(src)/uapi/lib.rs rusttestlib-ffi rusttestlib-pin_init FORCE +$(call if_changed,rustc_test_library) quiet_cmd_rustdoc_test = RUSTDOC T $< @@ -530,17 +530,19 @@ $(obj)/ffi.o: private skip_gendwarfksyms = 1 $(obj)/ffi.o: $(src)/ffi.rs $(obj)/compiler_builtins.o FORCE +$(call if_changed_rule,rustc_library) -$(obj)/bindings.o: private rustc_target_flags = --extern ffi +$(obj)/bindings.o: private rustc_target_flags = --extern ffi --extern pin_init $(obj)/bindings.o: $(src)/bindings/lib.rs \ $(obj)/ffi.o \ + $(obj)/pin_init.o \ $(obj)/bindings/bindings_generated.rs \ $(obj)/bindings/bindings_helpers_generated.rs FORCE +$(call if_changed_rule,rustc_library) -$(obj)/uapi.o: private rustc_target_flags = --extern ffi +$(obj)/uapi.o: private rustc_target_flags = --extern ffi --extern pin_init $(obj)/uapi.o: private skip_gendwarfksyms = 1 $(obj)/uapi.o: $(src)/uapi/lib.rs \ $(obj)/ffi.o \ + $(obj)/pin_init.o \ $(obj)/uapi/uapi_generated.rs FORCE +$(call if_changed_rule,rustc_library) diff --git a/scripts/generate_rust_analyzer.py b/scripts/generate_rust_analyzer.py index 7c3ea2b55041f8..fc27f0cca752d3 100755 --- a/scripts/generate_rust_analyzer.py +++ b/scripts/generate_rust_analyzer.py @@ -139,8 +139,8 @@ def append_crate_with_generated( "exclude_dirs": [], } - append_crate_with_generated("bindings", ["core", "ffi"]) - append_crate_with_generated("uapi", ["core", "ffi"]) + append_crate_with_generated("bindings", ["core", "ffi", "pin_init"]) + append_crate_with_generated("uapi", ["core", "ffi", "pin_init"]) append_crate_with_generated("kernel", ["core", "macros", "build_error", "pin_init", "ffi", "bindings", "uapi"]) def is_root_crate(build_file, target): From 4846300ba8f9b725594cc2e77785057f536b50c1 Mon Sep 17 00:00:00 2001 From: Benno Lossin Date: Thu, 14 Aug 2025 11:30:29 +0200 Subject: [PATCH 1107/3206] rust: derive `Zeroable` for all structs & unions generated by bindgen where possible Using the `--with-derive-custom-{struct,union}` option of bindgen, add `#[derive(MaybeZeroable)]` to every struct & union. This makes those types implement `Zeroable` if all their fields implement it. Sadly bindgen doesn't add custom derives to the `__BindgenBitfieldUnit` struct. So manually implement `Zeroable` for that. 
Signed-off-by: Benno Lossin
Reviewed-by: Alice Ryhl
[ Formatted comment. - Miguel ]
Signed-off-by: Miguel Ojeda
---
 rust/bindgen_parameters | 4 ++++
 rust/bindings/lib.rs    | 8 ++++++++
 rust/uapi/lib.rs        | 2 ++
 3 files changed, 14 insertions(+)

diff --git a/rust/bindgen_parameters b/rust/bindgen_parameters
index 02b371b98b3912..e13c6f9dd17b22 100644
--- a/rust/bindgen_parameters
+++ b/rust/bindgen_parameters
@@ -35,3 +35,7 @@
 # recognized, block generation of the non-helper constants.
 --blocklist-item ARCH_SLAB_MINALIGN
 --blocklist-item ARCH_KMALLOC_MINALIGN
+
+# Structs should implement `Zeroable` when all of their fields do.
+--with-derive-custom-struct .*=MaybeZeroable
+--with-derive-custom-union .*=MaybeZeroable
diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs
index 474cc98c48a323..0c57cf9b4004f1 100644
--- a/rust/bindings/lib.rs
+++ b/rust/bindings/lib.rs
@@ -31,11 +31,19 @@
 #[allow(clippy::undocumented_unsafe_blocks)]
 #[cfg_attr(CONFIG_RUSTC_HAS_UNNECESSARY_TRANSMUTES, allow(unnecessary_transmutes))]
 mod bindings_raw {
+    use pin_init::{MaybeZeroable, Zeroable};
+
     // Manual definition for blocklisted types.
     type __kernel_size_t = usize;
     type __kernel_ssize_t = isize;
     type __kernel_ptrdiff_t = isize;
 
+    // `bindgen` doesn't automatically do this, see
+    //
+    //
+    // SAFETY: `__BindgenBitfieldUnit` is a newtype around `Storage`.
+    unsafe impl<Storage> Zeroable for __BindgenBitfieldUnit<Storage> where Storage: Zeroable {}
+
     // Use glob import here to expose all helpers.
     // Symbols defined within the module will take precedence to the glob import.
     pub use super::bindings_helper::*;
diff --git a/rust/uapi/lib.rs b/rust/uapi/lib.rs
index 31c2f713313fe3..1d5fd9efb93e9d 100644
--- a/rust/uapi/lib.rs
+++ b/rust/uapi/lib.rs
@@ -34,4 +34,6 @@ type __kernel_size_t = usize;
 type __kernel_ssize_t = isize;
 type __kernel_ptrdiff_t = isize;
 
+use pin_init::MaybeZeroable;
+
 include!(concat!(env!("OBJTREE"), "/rust/uapi/uapi_generated.rs"));

From 4fa9f72d657a76546849068b508a2c82983f8945 Mon Sep 17 00:00:00 2001
From: Benno Lossin
Date: Thu, 14 Aug 2025 11:30:38 +0200
Subject: [PATCH 1108/3206] rust: cpufreq: replace
 `MaybeUninit::zeroed().assume_init()` with `pin_init::zeroed()`

All types in `bindings` implement `Zeroable` if they can, so use
`pin_init::zeroed` instead of relying on `unsafe` code.

If this ends up not compiling in the future, something in bindgen or on
the C side changed and is most likely incorrect.

Signed-off-by: Benno Lossin
Acked-by: Viresh Kumar
Signed-off-by: Miguel Ojeda
---
 rust/kernel/cpufreq.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs
index afc15e72a7c37a..be2dffbdb54628 100644
--- a/rust/kernel/cpufreq.rs
+++ b/rust/kernel/cpufreq.rs
@@ -27,7 +27,6 @@ use crate::clk::Clk;
 use core::{
     cell::UnsafeCell,
     marker::PhantomData,
-    mem::MaybeUninit,
     ops::{Deref, DerefMut},
     pin::Pin,
     ptr,
@@ -1013,8 +1012,7 @@ impl Registration {
             } else {
                 None
             },
-            // SAFETY: All zeros is a valid value for `bindings::cpufreq_driver`.
- ..unsafe { MaybeUninit::zeroed().assume_init() } + ..pin_init::zeroed() }; const fn copy_name(name: &'static CStr) -> [c_char; CPUFREQ_NAME_LEN] { From decd50c19f2ccce3cbe34f6f59d04b252119329d Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sun, 31 Aug 2025 16:49:56 +0800 Subject: [PATCH 1109/3206] pinctrl: armada-37xx: Use int type to store negative error codes In armada_37xx_gpio_direction_output(), the 'ret' variable might store the negative error codes returned by regmap_update_bits(), and in armada_37xx_edge_both_irq_swap_pol(), the 'ret' variable directly stores -1, so the type of the 'ret' variable needs to be changed to int in both cases. No effect on runtime. Signed-off-by: Qianfeng Rong Reviewed-by: Andrew Lunn Signed-off-by: Linus Walleij --- drivers/pinctrl/mvebu/pinctrl-armada-37xx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c index 881df5e08f6176..81dfbd5e7f0711 100644 --- a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c +++ b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c @@ -420,7 +420,8 @@ static int armada_37xx_gpio_direction_output(struct gpio_chip *chip, struct armada_37xx_pinctrl *info = gpiochip_get_data(chip); unsigned int en_offset = offset; unsigned int reg = OUTPUT_VAL; - unsigned int mask, val, ret; + unsigned int mask, val; + int ret; armada_37xx_update_reg(®, &offset); mask = BIT(offset); @@ -634,8 +635,9 @@ static int armada_37xx_edge_both_irq_swap_pol(struct armada_37xx_pinctrl *info, { u32 reg_idx = pin_idx / GPIO_PER_REG; u32 bit_num = pin_idx % GPIO_PER_REG; - u32 p, l, ret; unsigned long flags; + u32 p, l; + int ret; regmap_read(info->regmap, INPUT_VAL + 4*reg_idx, &l); From 0fd0f79aec237dd0a18ffdd63fcbf0015c7a3012 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sun, 31 Aug 2025 16:49:57 +0800 Subject: [PATCH 1110/3206] pinctrl: ma35: Use int type to store negative error codes Change the 'ret' variable in ma35_pinctrl_parse_functions() from u32 to int, as it needs to store either negative error codes or zero returned by ma35_pinctrl_parse_groups(). No effect on runtime. Signed-off-by: Qianfeng Rong Signed-off-by: Linus Walleij --- drivers/pinctrl/nuvoton/pinctrl-ma35.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/nuvoton/pinctrl-ma35.c b/drivers/pinctrl/nuvoton/pinctrl-ma35.c index 54652bfbe6ac48..cdad01d68a37e3 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-ma35.c +++ b/drivers/pinctrl/nuvoton/pinctrl-ma35.c @@ -1038,7 +1038,8 @@ static int ma35_pinctrl_parse_functions(struct fwnode_handle *fwnode, struct ma3 struct group_desc *grp; static u32 grp_index; const char **groups; - u32 ret, i = 0; + u32 i = 0; + int ret; dev_dbg(npctl->dev, "parse function(%d): %s\n", index, np->name); From 4002ee98c022d671ecc1e4a84029e9ae7d8a5603 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:10 +0200 Subject: [PATCH 1111/3206] pinctrl: check the return value of pinmux_ops::get_function_name() While the API contract in docs doesn't specify it explicitly, the generic implementation of the get_function_name() callback from struct pinmux_ops - pinmux_generic_get_function_name() - can fail and return NULL. This is already checked in pinmux_check_ops() so add a similar check in pinmux_func_name_to_selector() instead of passing the returned pointer right down to strcmp() where the NULL can get dereferenced. This is normal operation when adding new pinfunctions. 
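[ To make the failure mode concrete, here is a minimal userspace model of the name lookup; the sparse table, names and helper are hypothetical and only illustrate why strcmp() on a NULL function name crashes. ]

#include <stdio.h>
#include <string.h>

/* Sparse table: selector 1 has no function registered yet. */
static const char *names[] = { "i2c", NULL, "uart" };

/* Simplified model of pinmux_func_name_to_selector(). */
static int name_to_selector(const char *function)
{
	for (unsigned int sel = 0; sel < 3; sel++) {
		const char *fname = names[sel];

		/* Without the NULL check, strcmp(function, NULL)
		 * dereferences a NULL pointer. */
		if (fname && !strcmp(function, fname))
			return sel;
	}

	return -1;
}

int main(void)
{
	printf("uart -> %d\n", name_to_selector("uart"));
	printf("spi  -> %d\n", name_to_selector("spi"));
	return 0;
}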
Cc: stable@vger.kernel.org
Tested-by: Neil Armstrong
Signed-off-by: Bartosz Golaszewski
Signed-off-by: Linus Walleij
---
 drivers/pinctrl/pinmux.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c
index 79814758a08457..07a478b2c48740 100644
--- a/drivers/pinctrl/pinmux.c
+++ b/drivers/pinctrl/pinmux.c
@@ -337,7 +337,7 @@ static int pinmux_func_name_to_selector(struct pinctrl_dev *pctldev,
 	while (selector < nfuncs) {
 		const char *fname = ops->get_function_name(pctldev, selector);
 
-		if (!strcmp(function, fname))
+		if (fname && !strcmp(function, fname))
 			return selector;
 
 		selector++;

From d364d2ad07873dc4991b2a631a8536597272418b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski
Date: Tue, 2 Sep 2025 13:59:11 +0200
Subject: [PATCH 1112/3206] devres: provide devm_kmemdup_const()

Provide a function similar to devm_kstrdup_const() but for copying blocks
of memory that are likely to be placed in .rodata.

Reviewed-by: Andy Shevchenko
Acked-by: Greg Kroah-Hartman
Tested-by: Neil Armstrong
Signed-off-by: Bartosz Golaszewski
Signed-off-by: Linus Walleij
---
 drivers/base/devres.c         | 21 +++++++++++++++++++++
 include/linux/device/devres.h |  2 ++
 2 files changed, 23 insertions(+)

diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index ff55e1bcfa3005..c948c88d395607 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -1117,6 +1117,27 @@ void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp)
 }
 EXPORT_SYMBOL_GPL(devm_kmemdup);
 
+/**
+ * devm_kmemdup_const - conditionally duplicate and manage a region of memory
+ *
+ * @dev: Device this memory belongs to
+ * @src: memory region to duplicate
+ * @len: memory region length
+ * @gfp: GFP mask to use
+ *
+ * Return: source address if it is in .rodata or the return value of
+ * devm_kmemdup() to which the function falls back otherwise.
+ */
+const void *
+devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp)
+{
+	if (is_kernel_rodata((unsigned long)src))
+		return src;
+
+	return devm_kmemdup(dev, src, len, gfp);
+}
+EXPORT_SYMBOL_GPL(devm_kmemdup_const);
+
 struct pages_devres {
 	unsigned long addr;
 	unsigned int order;
diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h
index ae696d10faff44..8c5f57e0d61349 100644
--- a/include/linux/device/devres.h
+++ b/include/linux/device/devres.h
@@ -80,6 +80,8 @@ void devm_kfree(struct device *dev, const void *p);
 void * __realloc_size(3)
 devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp);
+const void *
+devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp);
 
 static inline void *devm_kmemdup_array(struct device *dev, const void *src,
 				       size_t n, size_t size, gfp_t flags)
 {

From 77377744c29da2198a1d54b222263c64fe7b2f41 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski
Date: Tue, 2 Sep 2025 13:59:12 +0200
Subject: [PATCH 1113/3206] pinctrl: ingenic: use struct pinfunction instead of
 struct function_desc

struct function_desc is a wrapper around struct pinfunction with an
additional void *data pointer. This driver doesn't use the data pointer.
We're also working towards reducing the usage of struct function_desc in
pinctrl drivers - they should only be created by pinmux core and accessed
by drivers using pinmux_generic_get_function().

Replace the struct function_desc objects in this driver with smaller
struct pinfunction instances.
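[ The conversion pattern, condensed into a sketch with a hypothetical "uart0" function; PINCTRL_PINFUNCTION() is the public initializer from <linux/pinctrl/pinctrl.h> and struct function_desc lives in the drivers/pinctrl-internal "pinmux.h". ]

#include <linux/kernel.h>
#include <linux/pinctrl/pinctrl.h>
#include "pinmux.h"	/* struct function_desc */

static const char * const uart0_groups[] = { "uart0", };

/* Before: each entry wraps struct pinfunction plus an unused data pointer. */
static const struct function_desc old_funcs[] = {
	{
		.func = PINCTRL_PINFUNCTION("uart0", uart0_groups,
					    ARRAY_SIZE(uart0_groups)),
		.data = NULL,
	},
};

/* After: the bare description only; the core adds its own wrapper when the
 * function is registered with pinmux_generic_add_pinfunction(). */
static const struct pinfunction new_funcs[] = {
	PINCTRL_PINFUNCTION("uart0", uart0_groups, ARRAY_SIZE(uart0_groups)),
};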
Acked-by: Paul Cercueil Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-ingenic.c | 45 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index 2900513467fa4e..af9d2f3081859b 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -96,11 +96,8 @@ .data = (void *)func, \ } -#define INGENIC_PIN_FUNCTION(_name_, id) \ - { \ - .func = PINCTRL_PINFUNCTION(_name_, id##_groups, ARRAY_SIZE(id##_groups)), \ - .data = NULL, \ - } +#define INGENIC_PIN_FUNCTION(_name_, id) \ + PINCTRL_PINFUNCTION(_name_, id##_groups, ARRAY_SIZE(id##_groups)) enum jz_version { ID_JZ4730, @@ -128,7 +125,7 @@ struct ingenic_chip_info { const struct group_desc *groups; unsigned int num_groups; - const struct function_desc *functions; + const struct pinfunction *functions; unsigned int num_functions; const u32 *pull_ups, *pull_downs; @@ -263,7 +260,7 @@ static const char *jz4730_pwm1_groups[] = { "pwm1", }; static const char *jz4730_mii_groups[] = { "mii", }; static const char *jz4730_i2s_groups[] = { "i2s-data", "i2s-master", "i2s-slave", }; -static const struct function_desc jz4730_functions[] = { +static const struct pinfunction jz4730_functions[] = { INGENIC_PIN_FUNCTION("mmc", jz4730_mmc), INGENIC_PIN_FUNCTION("uart0", jz4730_uart0), INGENIC_PIN_FUNCTION("uart1", jz4730_uart1), @@ -370,7 +367,7 @@ static const char *jz4740_pwm5_groups[] = { "pwm5", }; static const char *jz4740_pwm6_groups[] = { "pwm6", }; static const char *jz4740_pwm7_groups[] = { "pwm7", }; -static const struct function_desc jz4740_functions[] = { +static const struct pinfunction jz4740_functions[] = { INGENIC_PIN_FUNCTION("mmc", jz4740_mmc), INGENIC_PIN_FUNCTION("uart0", jz4740_uart0), INGENIC_PIN_FUNCTION("uart1", jz4740_uart1), @@ -474,7 +471,7 @@ static const char *jz4725b_pwm3_groups[] = { "pwm3", }; static const char *jz4725b_pwm4_groups[] = { "pwm4", }; static const char *jz4725b_pwm5_groups[] = { "pwm5", }; -static const struct function_desc jz4725b_functions[] = { +static const struct pinfunction jz4725b_functions[] = { INGENIC_PIN_FUNCTION("mmc0", jz4725b_mmc0), INGENIC_PIN_FUNCTION("mmc1", jz4725b_mmc1), INGENIC_PIN_FUNCTION("uart", jz4725b_uart), @@ -606,7 +603,7 @@ static const char *jz4750_pwm3_groups[] = { "pwm3", }; static const char *jz4750_pwm4_groups[] = { "pwm4", }; static const char *jz4750_pwm5_groups[] = { "pwm5", }; -static const struct function_desc jz4750_functions[] = { +static const struct pinfunction jz4750_functions[] = { INGENIC_PIN_FUNCTION("uart0", jz4750_uart0), INGENIC_PIN_FUNCTION("uart1", jz4750_uart1), INGENIC_PIN_FUNCTION("uart2", jz4750_uart2), @@ -771,7 +768,7 @@ static const char *jz4755_pwm3_groups[] = { "pwm3", }; static const char *jz4755_pwm4_groups[] = { "pwm4", }; static const char *jz4755_pwm5_groups[] = { "pwm5", }; -static const struct function_desc jz4755_functions[] = { +static const struct pinfunction jz4755_functions[] = { INGENIC_PIN_FUNCTION("uart0", jz4755_uart0), INGENIC_PIN_FUNCTION("uart1", jz4755_uart1), INGENIC_PIN_FUNCTION("uart2", jz4755_uart2), @@ -1106,7 +1103,7 @@ static const char *jz4760_pwm6_groups[] = { "pwm6", }; static const char *jz4760_pwm7_groups[] = { "pwm7", }; static const char *jz4760_otg_groups[] = { "otg-vbus", }; -static const struct function_desc jz4760_functions[] = { +static const struct pinfunction jz4760_functions[] = { 
INGENIC_PIN_FUNCTION("uart0", jz4760_uart0), INGENIC_PIN_FUNCTION("uart1", jz4760_uart1), INGENIC_PIN_FUNCTION("uart2", jz4760_uart2), @@ -1444,7 +1441,7 @@ static const char *jz4770_pwm6_groups[] = { "pwm6", }; static const char *jz4770_pwm7_groups[] = { "pwm7", }; static const char *jz4770_mac_groups[] = { "mac-rmii", "mac-mii", }; -static const struct function_desc jz4770_functions[] = { +static const struct pinfunction jz4770_functions[] = { INGENIC_PIN_FUNCTION("uart0", jz4770_uart0), INGENIC_PIN_FUNCTION("uart1", jz4770_uart1), INGENIC_PIN_FUNCTION("uart2", jz4770_uart2), @@ -1723,7 +1720,7 @@ static const char *jz4775_mac_groups[] = { }; static const char *jz4775_otg_groups[] = { "otg-vbus", }; -static const struct function_desc jz4775_functions[] = { +static const struct pinfunction jz4775_functions[] = { INGENIC_PIN_FUNCTION("uart0", jz4775_uart0), INGENIC_PIN_FUNCTION("uart1", jz4775_uart1), INGENIC_PIN_FUNCTION("uart2", jz4775_uart2), @@ -1976,7 +1973,7 @@ static const char *jz4780_dmic_groups[] = { "dmic", }; static const char *jz4780_cim_groups[] = { "cim-data", }; static const char *jz4780_hdmi_ddc_groups[] = { "hdmi-ddc", }; -static const struct function_desc jz4780_functions[] = { +static const struct pinfunction jz4780_functions[] = { INGENIC_PIN_FUNCTION("uart0", jz4770_uart0), INGENIC_PIN_FUNCTION("uart1", jz4770_uart1), INGENIC_PIN_FUNCTION("uart2", jz4780_uart2), @@ -2211,7 +2208,7 @@ static const char *x1000_pwm3_groups[] = { "pwm3", }; static const char *x1000_pwm4_groups[] = { "pwm4", }; static const char *x1000_mac_groups[] = { "mac", }; -static const struct function_desc x1000_functions[] = { +static const struct pinfunction x1000_functions[] = { INGENIC_PIN_FUNCTION("uart0", x1000_uart0), INGENIC_PIN_FUNCTION("uart1", x1000_uart1), INGENIC_PIN_FUNCTION("uart2", x1000_uart2), @@ -2341,7 +2338,7 @@ static const char *x1500_pwm2_groups[] = { "pwm2", }; static const char *x1500_pwm3_groups[] = { "pwm3", }; static const char *x1500_pwm4_groups[] = { "pwm4", }; -static const struct function_desc x1500_functions[] = { +static const struct pinfunction x1500_functions[] = { INGENIC_PIN_FUNCTION("uart0", x1500_uart0), INGENIC_PIN_FUNCTION("uart1", x1500_uart1), INGENIC_PIN_FUNCTION("uart2", x1500_uart2), @@ -2562,7 +2559,7 @@ static const char * const x1600_pwm7_groups[] = { "pwm7-b10", "pwm7-b21", }; static const char * const x1600_mac_groups[] = { "mac", }; -static const struct function_desc x1600_functions[] = { +static const struct pinfunction x1600_functions[] = { INGENIC_PIN_FUNCTION("uart0", x1600_uart0), INGENIC_PIN_FUNCTION("uart1", x1600_uart1), INGENIC_PIN_FUNCTION("uart2", x1600_uart2), @@ -2779,7 +2776,7 @@ static const char *x1830_pwm6_groups[] = { "pwm6-c-17", "pwm6-c-27", }; static const char *x1830_pwm7_groups[] = { "pwm7-c-18", "pwm7-c-28", }; static const char *x1830_mac_groups[] = { "mac", }; -static const struct function_desc x1830_functions[] = { +static const struct pinfunction x1830_functions[] = { INGENIC_PIN_FUNCTION("uart0", x1830_uart0), INGENIC_PIN_FUNCTION("uart1", x1830_uart1), INGENIC_PIN_FUNCTION("sfc", x1830_sfc), @@ -3225,7 +3222,7 @@ static const char *x2000_mac0_groups[] = { "mac0-rmii", "mac0-rgmii", }; static const char *x2000_mac1_groups[] = { "mac1-rmii", "mac1-rgmii", }; static const char *x2000_otg_groups[] = { "otg-vbus", }; -static const struct function_desc x2000_functions[] = { +static const struct pinfunction x2000_functions[] = { INGENIC_PIN_FUNCTION("uart0", x2000_uart0), INGENIC_PIN_FUNCTION("uart1", x2000_uart1), 
INGENIC_PIN_FUNCTION("uart2", x2000_uart2), @@ -3449,7 +3446,7 @@ static const struct group_desc x2100_groups[] = { static const char *x2100_mac_groups[] = { "mac", }; -static const struct function_desc x2100_functions[] = { +static const struct pinfunction x2100_functions[] = { INGENIC_PIN_FUNCTION("uart0", x2000_uart0), INGENIC_PIN_FUNCTION("uart1", x2000_uart1), INGENIC_PIN_FUNCTION("uart2", x2000_uart2), @@ -4571,11 +4568,9 @@ static int __init ingenic_pinctrl_probe(struct platform_device *pdev) } for (i = 0; i < chip_info->num_functions; i++) { - const struct function_desc *function = &chip_info->functions[i]; - const struct pinfunction *func = &function->func; + const struct pinfunction *func = &chip_info->functions[i]; - err = pinmux_generic_add_pinfunction(jzpc->pctl, func, - function->data); + err = pinmux_generic_add_pinfunction(jzpc->pctl, func, NULL); if (err < 0) { dev_err(dev, "Failed to register function %s\n", func->name); return err; From 17d4f2a9e6cb224012d85fed52e9794a84fa501d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:13 +0200 Subject: [PATCH 1114/3206] pinctrl: airoha: replace struct function_desc with struct pinfunction struct function_desc is a wrapper around struct pinfunction with an additional void *data pointer. This driver doesn't use the data pointer. We're also working towards reducing the usage of struct function_desc in pinctrl drivers - they should only be created by pinmux core and accessed by drivers using pinmux_generic_get_function(). Replace the struct function_desc objects in this driver with smaller struct pinfunction instances. Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/pinctrl-airoha.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c index 5f1ec9e0de213d..eb6cd27955fbe4 100644 --- a/drivers/pinctrl/mediatek/pinctrl-airoha.c +++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c @@ -35,13 +35,8 @@ #define PINCTRL_FUNC_DESC(id) \ { \ - .desc = { \ - .func = { \ - .name = #id, \ - .groups = id##_groups, \ - .ngroups = ARRAY_SIZE(id##_groups), \ - } \ - }, \ + .desc = PINCTRL_PINFUNCTION(#id, id##_groups, \ + ARRAY_SIZE(id##_groups)), \ .groups = id##_func_group, \ .group_size = ARRAY_SIZE(id##_func_group), \ } @@ -334,7 +329,7 @@ struct airoha_pinctrl_func_group { }; struct airoha_pinctrl_func { - const struct function_desc desc; + const struct pinfunction desc; const struct airoha_pinctrl_func_group *groups; u8 group_size; }; @@ -2908,11 +2903,11 @@ static int airoha_pinctrl_probe(struct platform_device *pdev) func = &airoha_pinctrl_funcs[i]; err = pinmux_generic_add_pinfunction(pinctrl->ctrl, - &func->desc.func, + &func->desc, (void *)func); if (err < 0) { dev_err(dev, "Failed to register function %s\n", - func->desc.func.name); + func->desc.name); return err; } } From 7a24f1f5b2147f99d2ffa40dfe137c040cab1fb8 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:14 +0200 Subject: [PATCH 1115/3206] pinctrl: mediatek: mt7988: use PINCTRL_PIN_FUNCTION() We have a dedicated initializer macro for defining pin functions for mediatek drivers so use it here. 
Reviewed-by: Chen-Yu Tsai Reviewed-by: Andy Shevchenko Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/pinctrl-mt7988.c | 42 +++++++++-------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7988.c b/drivers/pinctrl/mediatek/pinctrl-mt7988.c index 68b4097792b883..55c8674d8d66f1 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7988.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7988.c @@ -1465,32 +1465,22 @@ static const char * const mt7988_usb_groups[] = { }; static const struct function_desc mt7988_functions[] = { - { { "audio", mt7988_audio_groups, ARRAY_SIZE(mt7988_audio_groups) }, - NULL }, - { { "jtag", mt7988_jtag_groups, ARRAY_SIZE(mt7988_jtag_groups) }, - NULL }, - { { "int_usxgmii", mt7988_int_usxgmii_groups, - ARRAY_SIZE(mt7988_int_usxgmii_groups) }, - NULL }, - { { "pwm", mt7988_pwm_groups, ARRAY_SIZE(mt7988_pwm_groups) }, NULL }, - { { "dfd", mt7988_dfd_groups, ARRAY_SIZE(mt7988_dfd_groups) }, NULL }, - { { "i2c", mt7988_i2c_groups, ARRAY_SIZE(mt7988_i2c_groups) }, NULL }, - { { "eth", mt7988_ethernet_groups, ARRAY_SIZE(mt7988_ethernet_groups) }, - NULL }, - { { "pcie", mt7988_pcie_groups, ARRAY_SIZE(mt7988_pcie_groups) }, - NULL }, - { { "pmic", mt7988_pmic_groups, ARRAY_SIZE(mt7988_pmic_groups) }, - NULL }, - { { "watchdog", mt7988_wdt_groups, ARRAY_SIZE(mt7988_wdt_groups) }, - NULL }, - { { "spi", mt7988_spi_groups, ARRAY_SIZE(mt7988_spi_groups) }, NULL }, - { { "flash", mt7988_flash_groups, ARRAY_SIZE(mt7988_flash_groups) }, - NULL }, - { { "uart", mt7988_uart_groups, ARRAY_SIZE(mt7988_uart_groups) }, - NULL }, - { { "udi", mt7988_udi_groups, ARRAY_SIZE(mt7988_udi_groups) }, NULL }, - { { "usb", mt7988_usb_groups, ARRAY_SIZE(mt7988_usb_groups) }, NULL }, - { { "led", mt7988_led_groups, ARRAY_SIZE(mt7988_led_groups) }, NULL }, + PINCTRL_PIN_FUNCTION("audio", mt7988_audio), + PINCTRL_PIN_FUNCTION("jtag", mt7988_jtag), + PINCTRL_PIN_FUNCTION("int_usxgmii", mt7988_int_usxgmii), + PINCTRL_PIN_FUNCTION("pwm", mt7988_pwm), + PINCTRL_PIN_FUNCTION("dfd", mt7988_dfd), + PINCTRL_PIN_FUNCTION("i2c", mt7988_i2c), + PINCTRL_PIN_FUNCTION("eth", mt7988_ethernet), + PINCTRL_PIN_FUNCTION("pcie", mt7988_pcie), + PINCTRL_PIN_FUNCTION("pmic", mt7988_pmic), + PINCTRL_PIN_FUNCTION("watchdog", mt7988_wdt), + PINCTRL_PIN_FUNCTION("spi", mt7988_spi), + PINCTRL_PIN_FUNCTION("flash", mt7988_flash), + PINCTRL_PIN_FUNCTION("uart", mt7988_uart), + PINCTRL_PIN_FUNCTION("udi", mt7988_udi), + PINCTRL_PIN_FUNCTION("usb", mt7988_usb), + PINCTRL_PIN_FUNCTION("led", mt7988_led), }; static const struct mtk_eint_hw mt7988_eint_hw = { From bd6f4a91401fc9847eef306a593df243c40c9d05 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:15 +0200 Subject: [PATCH 1116/3206] pinctrl: mediatek: moore: replace struct function_desc with struct pinfunction struct function_desc is a wrapper around struct pinfunction with an additional void *data pointer. This driver doesn't use the data pointer. We're also working towards reducing the usage of struct function_desc in pinctrl drivers - they should only be created by pinmux core and accessed by drivers using pinmux_generic_get_function(). Replace the struct function_desc objects in this driver with smaller struct pinfunction instances. 
Reviewed-by: Chen-Yu Tsai Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/pinctrl-moore.c | 6 ++---- drivers/pinctrl/mediatek/pinctrl-moore.h | 7 ++----- drivers/pinctrl/mediatek/pinctrl-mt7622.c | 2 +- drivers/pinctrl/mediatek/pinctrl-mt7623.c | 2 +- drivers/pinctrl/mediatek/pinctrl-mt7629.c | 2 +- drivers/pinctrl/mediatek/pinctrl-mt7981.c | 2 +- drivers/pinctrl/mediatek/pinctrl-mt7986.c | 2 +- drivers/pinctrl/mediatek/pinctrl-mt7988.c | 2 +- drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h | 2 +- 9 files changed, 11 insertions(+), 16 deletions(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.c b/drivers/pinctrl/mediatek/pinctrl-moore.c index 6e4f6c07a50932..982b9887521d03 100644 --- a/drivers/pinctrl/mediatek/pinctrl-moore.c +++ b/drivers/pinctrl/mediatek/pinctrl-moore.c @@ -622,11 +622,9 @@ static int mtk_build_functions(struct mtk_pinctrl *hw) int i, err; for (i = 0; i < hw->soc->nfuncs ; i++) { - const struct function_desc *function = hw->soc->funcs + i; - const struct pinfunction *func = &function->func; + const struct pinfunction *func = hw->soc->funcs + i; - err = pinmux_generic_add_pinfunction(hw->pctrl, func, - function->data); + err = pinmux_generic_add_pinfunction(hw->pctrl, func, NULL); if (err < 0) { dev_err(hw->dev, "Failed to register function %s\n", func->name); diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.h b/drivers/pinctrl/mediatek/pinctrl-moore.h index 229d19561e229c..fe1f087cacd044 100644 --- a/drivers/pinctrl/mediatek/pinctrl-moore.h +++ b/drivers/pinctrl/mediatek/pinctrl-moore.h @@ -43,11 +43,8 @@ .data = id##_funcs, \ } -#define PINCTRL_PIN_FUNCTION(_name_, id) \ - { \ - .func = PINCTRL_PINFUNCTION(_name_, id##_groups, ARRAY_SIZE(id##_groups)), \ - .data = NULL, \ - } +#define PINCTRL_PIN_FUNCTION(_name_, id) \ + PINCTRL_PINFUNCTION(_name_, id##_groups, ARRAY_SIZE(id##_groups)) int mtk_moore_pinctrl_probe(struct platform_device *pdev, const struct mtk_pin_soc *soc); diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7622.c b/drivers/pinctrl/mediatek/pinctrl-mt7622.c index 2dc1019910662a..d5777889448aab 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7622.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7622.c @@ -822,7 +822,7 @@ static const char *mt7622_uart_groups[] = { "uart0_0_tx_rx", "uart4_2_rts_cts",}; static const char *mt7622_wdt_groups[] = { "watchdog", }; -static const struct function_desc mt7622_functions[] = { +static const struct pinfunction mt7622_functions[] = { PINCTRL_PIN_FUNCTION("antsel", mt7622_antsel), PINCTRL_PIN_FUNCTION("emmc", mt7622_emmc), PINCTRL_PIN_FUNCTION("eth", mt7622_ethernet), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7623.c b/drivers/pinctrl/mediatek/pinctrl-mt7623.c index 3e59eada282527..69c06c2c0e21e4 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7623.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7623.c @@ -1340,7 +1340,7 @@ static const char *mt7623_uart_groups[] = { "uart0_0_txd_rxd", "uart3_rts_cts", }; static const char *mt7623_wdt_groups[] = { "watchdog_0", "watchdog_1", }; -static const struct function_desc mt7623_functions[] = { +static const struct pinfunction mt7623_functions[] = { PINCTRL_PIN_FUNCTION("audck", mt7623_aud_clk), PINCTRL_PIN_FUNCTION("disp", mt7623_disp_pwm), PINCTRL_PIN_FUNCTION("eth", mt7623_ethernet), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7629.c b/drivers/pinctrl/mediatek/pinctrl-mt7629.c index 98142e8c98011d..cc0694881ac9dc 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7629.c +++ 
b/drivers/pinctrl/mediatek/pinctrl-mt7629.c @@ -384,7 +384,7 @@ static const char *mt7629_wdt_groups[] = { "watchdog", }; static const char *mt7629_wifi_groups[] = { "wf0_5g", "wf0_2g", }; static const char *mt7629_flash_groups[] = { "snfi", "spi_nor" }; -static const struct function_desc mt7629_functions[] = { +static const struct pinfunction mt7629_functions[] = { PINCTRL_PIN_FUNCTION("eth", mt7629_ethernet), PINCTRL_PIN_FUNCTION("i2c", mt7629_i2c), PINCTRL_PIN_FUNCTION("led", mt7629_led), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7981.c b/drivers/pinctrl/mediatek/pinctrl-mt7981.c index 83092be5b614cc..6216c2e057f649 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7981.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7981.c @@ -977,7 +977,7 @@ static const char *mt7981_ethernet_groups[] = { "smi_mdc_mdio", "gbe_ext_mdc_mdi "wf0_mode1", "wf0_mode3", "mt7531_int", }; static const char *mt7981_ant_groups[] = { "ant_sel", }; -static const struct function_desc mt7981_functions[] = { +static const struct pinfunction mt7981_functions[] = { PINCTRL_PIN_FUNCTION("wa_aice", mt7981_wa_aice), PINCTRL_PIN_FUNCTION("dfd", mt7981_dfd), PINCTRL_PIN_FUNCTION("jtag", mt7981_jtag), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7986.c b/drivers/pinctrl/mediatek/pinctrl-mt7986.c index 5816b5fdb7ca91..2a762ade9c3550 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7986.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7986.c @@ -878,7 +878,7 @@ static const char *mt7986_uart_groups[] = { static const char *mt7986_wdt_groups[] = { "watchdog", }; static const char *mt7986_wf_groups[] = { "wf_2g", "wf_5g", "wf_dbdc", }; -static const struct function_desc mt7986_functions[] = { +static const struct pinfunction mt7986_functions[] = { PINCTRL_PIN_FUNCTION("audio", mt7986_audio), PINCTRL_PIN_FUNCTION("emmc", mt7986_emmc), PINCTRL_PIN_FUNCTION("eth", mt7986_ethernet), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7988.c b/drivers/pinctrl/mediatek/pinctrl-mt7988.c index 55c8674d8d66f1..9569e8c0cec15f 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7988.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7988.c @@ -1464,7 +1464,7 @@ static const char * const mt7988_usb_groups[] = { "drv_vbus_p1", }; -static const struct function_desc mt7988_functions[] = { +static const struct pinfunction mt7988_functions[] = { PINCTRL_PIN_FUNCTION("audio", mt7988_audio), PINCTRL_PIN_FUNCTION("jtag", mt7988_jtag), PINCTRL_PIN_FUNCTION("int_usxgmii", mt7988_int_usxgmii), diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h index 36d2898037dd04..fa7c0ed4934648 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h +++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h @@ -238,7 +238,7 @@ struct mtk_pin_soc { unsigned int npins; const struct group_desc *grps; unsigned int ngrps; - const struct function_desc *funcs; + const struct pinfunction *funcs; unsigned int nfuncs; const struct mtk_eint_regs *eint_regs; const struct mtk_eint_hw *eint_hw; From ea22f777fc43e55fb9f0d1a1f873cfab29219d8e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:16 +0200 Subject: [PATCH 1117/3206] pinctrl: imx: don't access the pin function radix tree directly The radix tree containing pin function descriptors should not be accessed directly by drivers. There are dedicated functions for it. I suppose this driver does it so that the memory containing the function description is not duplicated but we're going to address that shortly so convert it to using generic pinctrl APIs. 
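[ A minimal sketch of the registration flow the driver is being moved to, with a hypothetical "demo" function; pinmux_generic_add_pinfunction() is the drivers/pinctrl-internal helper used throughout this series, and it returns the new selector or a negative error code. ]

#include <linux/kernel.h>
#include <linux/pinctrl/pinctrl.h>
#include "pinmux.h"	/* generic pinmux helpers */

static const char * const demo_groups[] = { "grp0", "grp1", };

static const struct pinfunction demo_func =
	PINCTRL_PINFUNCTION("demo", demo_groups, ARRAY_SIZE(demo_groups));

static int demo_register_function(struct pinctrl_dev *pctl)
{
	int ret;

	/* The core stores the description itself; the driver never touches
	 * pctl->pin_function_tree directly. */
	ret = pinmux_generic_add_pinfunction(pctl, &demo_func, NULL);
	if (ret < 0)
		return ret;

	return 0;
}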
Tested-by: Neil Armstrong
Signed-off-by: Bartosz Golaszewski
Signed-off-by: Linus Walleij
---
 drivers/pinctrl/freescale/pinctrl-imx.c | 41 +++++++++++--------------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/drivers/pinctrl/freescale/pinctrl-imx.c b/drivers/pinctrl/freescale/pinctrl-imx.c
index 18de3132854045..c5b17c5ecfb5ee 100644
--- a/drivers/pinctrl/freescale/pinctrl-imx.c
+++ b/drivers/pinctrl/freescale/pinctrl-imx.c
@@ -580,33 +580,38 @@ static int imx_pinctrl_parse_functions(struct device_node *np,
 				       u32 index)
 {
 	struct pinctrl_dev *pctl = ipctl->pctl;
-	struct function_desc *func;
+	struct pinfunction *func;
 	struct group_desc *grp;
 	const char **group_names;
+	int ret;
 	u32 i;
 
 	dev_dbg(pctl->dev, "parse function(%d): %pOFn\n", index, np);
 
-	func = pinmux_generic_get_function(pctl, index);
+	func = devm_kzalloc(ipctl->dev, sizeof(*func), GFP_KERNEL);
 	if (!func)
-		return -EINVAL;
+		return -ENOMEM;
 
 	/* Initialise function */
-	func->func.name = np->name;
-	func->func.ngroups = of_get_child_count(np);
-	if (func->func.ngroups == 0) {
+	func->name = np->name;
+	func->ngroups = of_get_child_count(np);
+	if (func->ngroups == 0) {
 		dev_info(ipctl->dev, "no groups defined in %pOF\n", np);
 		return -EINVAL;
 	}
 
-	group_names = devm_kcalloc(ipctl->dev, func->func.ngroups,
-				   sizeof(*func->func.groups), GFP_KERNEL);
+	group_names = devm_kcalloc(ipctl->dev, func->ngroups,
+				   sizeof(*func->groups), GFP_KERNEL);
 	if (!group_names)
 		return -ENOMEM;
 	i = 0;
 	for_each_child_of_node_scoped(np, child)
 		group_names[i++] = child->name;
-	func->func.groups = group_names;
+	func->groups = group_names;
+
+	ret = pinmux_generic_add_pinfunction(pctl, func, NULL);
+	if (ret < 0)
+		return ret;
 
 	i = 0;
 	for_each_child_of_node_scoped(np, child) {
@@ -615,6 +620,10 @@ static int imx_pinctrl_parse_functions(struct device_node *np,
 			return -ENOMEM;
 
 		mutex_lock(&ipctl->mutex);
+		/*
+		 * FIXME: This should use pinctrl_generic_add_group() and not
+		 * access the private radix tree directly.
+		 */
 		radix_tree_insert(&pctl->pin_group_tree,
 				  ipctl->group_index++, grp);
 		mutex_unlock(&ipctl->mutex);
@@ -669,20 +678,6 @@ static int imx_pinctrl_probe_dt(struct platform_device *pdev,
 		}
 	}
 
-	for (i = 0; i < nfuncs; i++) {
-		struct function_desc *function;
-
-		function = devm_kzalloc(&pdev->dev, sizeof(*function),
-					GFP_KERNEL);
-		if (!function)
-			return -ENOMEM;
-
-		mutex_lock(&ipctl->mutex);
-		radix_tree_insert(&pctl->pin_function_tree, i, function);
-		mutex_unlock(&ipctl->mutex);
-	}
-	pctl->num_functions = nfuncs;
-
 	ipctl->group_index = 0;
 	if (flat_funcs) {
 		pctl->num_groups = of_get_child_count(np);

From aae7a2876c3b39d07aa7655ea082af8e7862f3a5 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski
Date: Tue, 2 Sep 2025 13:59:17 +0200
Subject: [PATCH 1118/3206] pinctrl: keembay: release allocated memory in
 detach path

Unlike all the other allocations in this driver, the memory for storing
the pin function descriptions allocated with kcalloc() and later resized
with krealloc() is never freed. Use devres like elsewhere to handle that.
While at it - replace krealloc() with the more suitable
devm_krealloc_array().

Note: the logic in this module is pretty convoluted and could probably
use some revisiting; we should be able to calculate the exact amount of
memory needed in advance, or even skip the allocation altogether and just
add each function to the radix tree separately.
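[ The devres pattern used above, reduced to a sketch with hypothetical sizing logic: both the worst-case allocation and the shrunk table are device-managed, so nothing needs an explicit free on the detach path. ]

#include <linux/device.h>
#include <linux/slab.h>

struct demo_entry {
	int val;
};

static struct demo_entry *demo_build_table(struct device *dev, size_t max,
					   size_t *nused)
{
	struct demo_entry *tbl, *shrunk;

	/* Worst-case allocation, released automatically on driver detach. */
	tbl = devm_kcalloc(dev, max, sizeof(*tbl), GFP_KERNEL);
	if (!tbl)
		return NULL;

	*nused = max / 2;	/* pretend only half the slots were filled */

	/* Shrink to the actual size, still under devres management. */
	shrunk = devm_krealloc_array(dev, tbl, *nused, sizeof(*shrunk),
				     GFP_KERNEL);
	if (!shrunk)
		return NULL;

	return shrunk;
}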
Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-keembay.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-keembay.c b/drivers/pinctrl/pinctrl-keembay.c index 60cf017498b32a..6aefcbc3130999 100644 --- a/drivers/pinctrl/pinctrl-keembay.c +++ b/drivers/pinctrl/pinctrl-keembay.c @@ -1603,7 +1603,8 @@ static int keembay_build_functions(struct keembay_pinctrl *kpc) * being part of 8 (hw maximum) globally unique muxes. */ kpc->nfuncs = 0; - keembay_funcs = kcalloc(kpc->npins * 8, sizeof(*keembay_funcs), GFP_KERNEL); + keembay_funcs = devm_kcalloc(kpc->dev, kpc->npins * 8, + sizeof(*keembay_funcs), GFP_KERNEL); if (!keembay_funcs) return -ENOMEM; @@ -1634,7 +1635,9 @@ static int keembay_build_functions(struct keembay_pinctrl *kpc) } /* Reallocate memory based on actual number of functions */ - new_funcs = krealloc(keembay_funcs, kpc->nfuncs * sizeof(*new_funcs), GFP_KERNEL); + new_funcs = devm_krealloc_array(kpc->dev, keembay_funcs, + kpc->nfuncs, sizeof(*new_funcs), + GFP_KERNEL); if (!new_funcs) { kfree(keembay_funcs); return -ENOMEM; From 071cdf5645e5b423735d34b86a001bf816707ec7 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:18 +0200 Subject: [PATCH 1119/3206] pinctrl: keembay: use a dedicated structure for the pinfunction description struct function_desc is a wrapper around struct pinfunction with an additional void *data pointer. We're working towards reducing the usage of struct function_desc in pinctrl drivers - they should only be created by pinmux core and accessed by drivers using pinmux_generic_get_function(). This driver uses the data pointer so in order to stop using struct function_desc, we need to provide an alternative that also wraps the mux mode which is passed to pinctrl core as user data. 
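[ The shape of that alternative, sketched with hypothetical names (the real struct keembay_pinfunction appears in the diff below): the driver embeds struct pinfunction and hands the extra mux byte to the core through the opaque data pointer. ]

#include <linux/kernel.h>
#include <linux/pinctrl/pinctrl.h>
#include "pinmux.h"

struct demo_pinfunction {
	struct pinfunction func;
	u8 mux_mode;		/* driver-private, passed as user data */
};

static const char * const demo_i2c_groups[] = { "i2c0_grp", };

static struct demo_pinfunction demo_func = {
	.func = PINCTRL_PINFUNCTION("i2c", demo_i2c_groups,
				    ARRAY_SIZE(demo_i2c_groups)),
	.mux_mode = 2,
};

static int demo_add_function(struct pinctrl_dev *pctl)
{
	/* The mux mode is retrieved later through
	 * pinmux_generic_get_function()->data. */
	return pinmux_generic_add_pinfunction(pctl, &demo_func.func,
					      &demo_func.mux_mode);
}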
Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-keembay.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/pinctrl-keembay.c b/drivers/pinctrl/pinctrl-keembay.c index 6aefcbc3130999..e78c8b3ec245aa 100644 --- a/drivers/pinctrl/pinctrl-keembay.c +++ b/drivers/pinctrl/pinctrl-keembay.c @@ -135,6 +135,11 @@ struct keembay_pin_soc { const struct pinctrl_pin_desc *pins; }; +struct keembay_pinfunction { + struct pinfunction func; + u8 mux_mode; +}; + static const struct pinctrl_pin_desc keembay_pins[] = { KEEMBAY_PIN_DESC(0, "GPIO0", KEEMBAY_MUX(0x0, "I2S0_M0"), @@ -1556,13 +1561,13 @@ static int keembay_pinctrl_reg(struct keembay_pinctrl *kpc, struct device *dev) } static int keembay_add_functions(struct keembay_pinctrl *kpc, - struct function_desc *functions) + struct keembay_pinfunction *functions) { unsigned int i; /* Assign the groups for each function */ for (i = 0; i < kpc->nfuncs; i++) { - struct function_desc *func = &functions[i]; + struct keembay_pinfunction *func = &functions[i]; const char **group_names; unsigned int grp_idx = 0; int j; @@ -1588,14 +1593,14 @@ static int keembay_add_functions(struct keembay_pinctrl *kpc, /* Add all functions */ for (i = 0; i < kpc->nfuncs; i++) pinmux_generic_add_pinfunction(kpc->pctrl, &functions[i].func, - functions[i].data); + &functions[i].mux_mode); return 0; } static int keembay_build_functions(struct keembay_pinctrl *kpc) { - struct function_desc *keembay_funcs, *new_funcs; + struct keembay_pinfunction *keembay_funcs, *new_funcs; int i; /* @@ -1614,7 +1619,7 @@ static int keembay_build_functions(struct keembay_pinctrl *kpc) struct keembay_mux_desc *mux; for (mux = pdesc->drv_data; mux->name; mux++) { - struct function_desc *fdesc; + struct keembay_pinfunction *fdesc; /* Check if we already have function for this mux */ for (fdesc = keembay_funcs; fdesc->func.name; fdesc++) { @@ -1628,7 +1633,7 @@ static int keembay_build_functions(struct keembay_pinctrl *kpc) if (!fdesc->func.name) { fdesc->func.name = mux->name; fdesc->func.ngroups = 1; - fdesc->data = &mux->mode; + fdesc->mux_mode = mux->mode; kpc->nfuncs++; } } From fbba4a9e368f6ec521043ae4035edb2b74ed12e4 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:19 +0200 Subject: [PATCH 1120/3206] pinctrl: constify pinmux_generic_get_function() With all users of struct function_desc limited to only accessing it using the dedicated function and never modifying it, we can now constify the return value of pinmux_generic_get_function() treewide. 
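[ From a driver's point of view the change looks like this: a sketch of a hypothetical set_mux callback under the constified API. Writing through the returned pointer is now rejected at compile time. ]

#include <linux/device.h>
#include <linux/errno.h>
#include <linux/pinctrl/pinctrl.h>
#include "core.h"	/* struct pinctrl_dev */
#include "pinmux.h"

static int demo_set_mux(struct pinctrl_dev *pctldev, unsigned int fsel,
			unsigned int gsel)
{
	const struct function_desc *func;	/* read-only view */

	func = pinmux_generic_get_function(pctldev, fsel);
	if (!func)
		return -EINVAL;

	/* func->func.name = "x"; would no longer compile */
	dev_dbg(pctldev->dev, "muxing function %s\n", func->func.name);

	/* ...program the hardware for (fsel, gsel) here... */
	return 0;
}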
Reviewed-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven # renesas Acked-by: Geert Uytterhoeven # renesas Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/freescale/pinctrl-imx.c | 2 +- drivers/pinctrl/mediatek/pinctrl-airoha.c | 2 +- drivers/pinctrl/mediatek/pinctrl-moore.c | 2 +- drivers/pinctrl/pinctrl-equilibrium.c | 2 +- drivers/pinctrl/pinctrl-ingenic.c | 2 +- drivers/pinctrl/pinctrl-keembay.c | 2 +- drivers/pinctrl/pinctrl-single.c | 4 ++-- drivers/pinctrl/pinmux.c | 4 ++-- drivers/pinctrl/pinmux.h | 4 ++-- drivers/pinctrl/renesas/pinctrl-rza1.c | 2 +- drivers/pinctrl/renesas/pinctrl-rza2.c | 2 +- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 2 +- drivers/pinctrl/renesas/pinctrl-rzv2m.c | 2 +- 13 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/pinctrl/freescale/pinctrl-imx.c b/drivers/pinctrl/freescale/pinctrl-imx.c index c5b17c5ecfb5ee..39c582a25d8fe9 100644 --- a/drivers/pinctrl/freescale/pinctrl-imx.c +++ b/drivers/pinctrl/freescale/pinctrl-imx.c @@ -245,7 +245,7 @@ static int imx_pmx_set(struct pinctrl_dev *pctldev, unsigned selector, { struct imx_pinctrl *ipctl = pinctrl_dev_get_drvdata(pctldev); const struct imx_pinctrl_soc_info *info = ipctl->info; - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; struct imx_pin *pin; unsigned int npins; diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c index eb6cd27955fbe4..2eed83f9f209ca 100644 --- a/drivers/pinctrl/mediatek/pinctrl-airoha.c +++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c @@ -2443,7 +2443,7 @@ static int airoha_pinmux_set_mux(struct pinctrl_dev *pctrl_dev, { struct airoha_pinctrl *pinctrl = pinctrl_dev_get_drvdata(pctrl_dev); const struct airoha_pinctrl_func *func; - struct function_desc *desc; + const struct function_desc *desc; struct group_desc *grp; int i; diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.c b/drivers/pinctrl/mediatek/pinctrl-moore.c index 982b9887521d03..17a08c73423fd0 100644 --- a/drivers/pinctrl/mediatek/pinctrl-moore.c +++ b/drivers/pinctrl/mediatek/pinctrl-moore.c @@ -43,7 +43,7 @@ static int mtk_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, unsigned int group) { struct mtk_pinctrl *hw = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; int i, err; diff --git a/drivers/pinctrl/pinctrl-equilibrium.c b/drivers/pinctrl/pinctrl-equilibrium.c index 01731d51a7c43e..7e655b0444b359 100644 --- a/drivers/pinctrl/pinctrl-equilibrium.c +++ b/drivers/pinctrl/pinctrl-equilibrium.c @@ -325,7 +325,7 @@ static int eqbr_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, unsigned int group) { struct eqbr_pinctrl_drv_data *pctl = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; unsigned int *pinmux; int i; diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index af9d2f3081859b..e13ef07850386e 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -4000,7 +4000,7 @@ static int ingenic_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, unsigned int group) { struct ingenic_pinctrl *jzpc = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; unsigned int i; uintptr_t mode; diff --git a/drivers/pinctrl/pinctrl-keembay.c 
b/drivers/pinctrl/pinctrl-keembay.c index e78c8b3ec245aa..30122ca90cbef1 100644 --- a/drivers/pinctrl/pinctrl-keembay.c +++ b/drivers/pinctrl/pinctrl-keembay.c @@ -935,7 +935,7 @@ static int keembay_set_mux(struct pinctrl_dev *pctldev, unsigned int fun_sel, unsigned int grp_sel) { struct keembay_pinctrl *kpc = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; unsigned int val; u8 pin_mode; diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 8aedee2720bcb9..6d580aa282ec98 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -336,7 +336,7 @@ static int pcs_get_function(struct pinctrl_dev *pctldev, unsigned pin, struct pcs_device *pcs = pinctrl_dev_get_drvdata(pctldev); struct pin_desc *pdesc = pin_desc_get(pctldev, pin); const struct pinctrl_setting_mux *setting; - struct function_desc *function; + const struct function_desc *function; unsigned fselector; /* If pin is not described in DTS & enabled, mux_setting is NULL. */ @@ -360,7 +360,7 @@ static int pcs_set_mux(struct pinctrl_dev *pctldev, unsigned fselector, unsigned group) { struct pcs_device *pcs; - struct function_desc *function; + const struct function_desc *function; struct pcs_function *func; int i; diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c index 07a478b2c48740..1529d7b6c6657e 100644 --- a/drivers/pinctrl/pinmux.c +++ b/drivers/pinctrl/pinmux.c @@ -847,8 +847,8 @@ EXPORT_SYMBOL_GPL(pinmux_generic_get_function_groups); * @pctldev: pin controller device * @selector: function number */ -struct function_desc *pinmux_generic_get_function(struct pinctrl_dev *pctldev, - unsigned int selector) +const struct function_desc * +pinmux_generic_get_function(struct pinctrl_dev *pctldev, unsigned int selector) { struct function_desc *function; diff --git a/drivers/pinctrl/pinmux.h b/drivers/pinctrl/pinmux.h index bdb5be1a636ead..549ab10f7afbda 100644 --- a/drivers/pinctrl/pinmux.h +++ b/drivers/pinctrl/pinmux.h @@ -152,8 +152,8 @@ int pinmux_generic_get_function_groups(struct pinctrl_dev *pctldev, const char * const **groups, unsigned int * const ngroups); -struct function_desc *pinmux_generic_get_function(struct pinctrl_dev *pctldev, - unsigned int selector); +const struct function_desc * +pinmux_generic_get_function(struct pinctrl_dev *pctldev, unsigned int selector); int pinmux_generic_add_function(struct pinctrl_dev *pctldev, const char *name, diff --git a/drivers/pinctrl/renesas/pinctrl-rza1.c b/drivers/pinctrl/renesas/pinctrl-rza1.c index 23812116ef4268..70f22e0ef307a0 100644 --- a/drivers/pinctrl/renesas/pinctrl-rza1.c +++ b/drivers/pinctrl/renesas/pinctrl-rza1.c @@ -1120,7 +1120,7 @@ static int rza1_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, { struct rza1_pinctrl *rza1_pctl = pinctrl_dev_get_drvdata(pctldev); struct rza1_mux_conf *mux_confs; - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; int i; diff --git a/drivers/pinctrl/renesas/pinctrl-rza2.c b/drivers/pinctrl/renesas/pinctrl-rza2.c index b78b5b4ec5afd9..29a9db19759906 100644 --- a/drivers/pinctrl/renesas/pinctrl-rza2.c +++ b/drivers/pinctrl/renesas/pinctrl-rza2.c @@ -442,7 +442,7 @@ static int rza2_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, unsigned int group) { struct rza2_pinctrl_priv *priv = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; unsigned int i, *psel_val; struct group_desc *grp; 
diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
index b182b3b8a542a2..6a4eac8ffe2e0d 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
@@ -574,7 +574,7 @@ static int rzg2l_pinctrl_set_mux(struct pinctrl_dev *pctldev,
 {
 	struct rzg2l_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
 	const struct rzg2l_hwcfg *hwcfg = pctrl->data->hwcfg;
-	struct function_desc *func;
+	const struct function_desc *func;
 	unsigned int i, *psel_val;
 	struct group_desc *group;
 	const unsigned int *pins;
diff --git a/drivers/pinctrl/renesas/pinctrl-rzv2m.c b/drivers/pinctrl/renesas/pinctrl-rzv2m.c
index daaa986d994dac..dce68f93d2d57f 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzv2m.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzv2m.c
@@ -162,7 +162,7 @@ static int rzv2m_pinctrl_set_mux(struct pinctrl_dev *pctldev,
 				 unsigned int group_selector)
 {
 	struct rzv2m_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
-	struct function_desc *func;
+	const struct function_desc *func;
 	unsigned int i, *psel_val;
 	struct group_desc *group;
 	const unsigned int *pins;

From d57b7979ea4c43655e8772b91cbac0f613419f4a Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski
Date: Tue, 2 Sep 2025 13:59:20 +0200
Subject: [PATCH 1121/3206] pinctrl: make struct pinfunction a pointer in
 struct function_desc

We currently duplicate the entire struct pinfunction object in
pinmux_generic_add_pinfunction(). While this is inevitable when the
arguments come in split through pinmux_generic_add_function(), users of
pinmux_generic_add_pinfunction() will typically pass addresses of
structures in .rodata, meaning we can try to avoid the duplication with
help from kmemdup_const().

To that end: don't wrap the entire struct pinfunction in struct
function_desc but rather just store the address.
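[ The saving comes from the conditional duplication helper added earlier in this series. A sketch of the two behaviours, with hypothetical names and error handling elided: ]

#include <linux/device.h>
#include <linux/pinctrl/pinctrl.h>
#include <linux/slab.h>

static const char * const demo_groups[] = { "grp0", };

/* Lives in .rodata, so devm_kmemdup_const() returns this very pointer. */
static const struct pinfunction rodata_func =
	PINCTRL_PINFUNCTION("demo", demo_groups, ARRAY_SIZE(demo_groups));

static void demo_dup(struct device *dev)
{
	struct pinfunction on_stack = rodata_func;	/* not in .rodata */
	const struct pinfunction *a, *b;

	a = devm_kmemdup_const(dev, &rodata_func, sizeof(rodata_func),
			       GFP_KERNEL);
	if (a == &rodata_func)
		dev_dbg(dev, "deduplicated: no copy made\n");

	/* Non-.rodata source: falls back to a real devm_kmemdup() copy. */
	b = devm_kmemdup_const(dev, &on_stack, sizeof(on_stack), GFP_KERNEL);
	if (b && b != &on_stack)
		dev_dbg(dev, "copied to devres-managed memory\n");
}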
Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/freescale/pinctrl-imx.c | 2 +- drivers/pinctrl/mediatek/pinctrl-airoha.c | 2 +- drivers/pinctrl/mediatek/pinctrl-moore.c | 2 +- drivers/pinctrl/pinctrl-ingenic.c | 2 +- drivers/pinctrl/pinmux.c | 18 ++++++++++++++---- drivers/pinctrl/pinmux.h | 2 +- 6 files changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/pinctrl/freescale/pinctrl-imx.c b/drivers/pinctrl/freescale/pinctrl-imx.c index 39c582a25d8fe9..731c58ad43eea9 100644 --- a/drivers/pinctrl/freescale/pinctrl-imx.c +++ b/drivers/pinctrl/freescale/pinctrl-imx.c @@ -266,7 +266,7 @@ static int imx_pmx_set(struct pinctrl_dev *pctldev, unsigned selector, npins = grp->grp.npins; dev_dbg(ipctl->dev, "enable function %s group %s\n", - func->func.name, grp->grp.name); + func->func->name, grp->grp.name); for (i = 0; i < npins; i++) { /* diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c index 2eed83f9f209ca..8fb3b65a1b775c 100644 --- a/drivers/pinctrl/mediatek/pinctrl-airoha.c +++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c @@ -2456,7 +2456,7 @@ static int airoha_pinmux_set_mux(struct pinctrl_dev *pctrl_dev, return -EINVAL; dev_dbg(pctrl_dev->dev, "enable function %s group %s\n", - desc->func.name, grp->grp.name); + desc->func->name, grp->grp.name); func = desc->data; for (i = 0; i < func->group_size; i++) { diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.c b/drivers/pinctrl/mediatek/pinctrl-moore.c index 17a08c73423fd0..11dc525eb3a2dc 100644 --- a/drivers/pinctrl/mediatek/pinctrl-moore.c +++ b/drivers/pinctrl/mediatek/pinctrl-moore.c @@ -56,7 +56,7 @@ static int mtk_pinmux_set_mux(struct pinctrl_dev *pctldev, return -EINVAL; dev_dbg(pctldev->dev, "enable function %s group %s\n", - func->func.name, grp->grp.name); + func->func->name, grp->grp.name); for (i = 0; i < grp->grp.npins; i++) { const struct mtk_pin_desc *desc; diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index e13ef07850386e..e5b24fab12e11e 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -4015,7 +4015,7 @@ static int ingenic_pinmux_set_mux(struct pinctrl_dev *pctldev, return -EINVAL; dev_dbg(pctldev->dev, "enable function %s group %s\n", - func->func.name, grp->grp.name); + func->func->name, grp->grp.name); mode = (uintptr_t)grp->data; if (mode <= 3) { diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c index 1529d7b6c6657e..07ec93f09334f8 100644 --- a/drivers/pinctrl/pinmux.c +++ b/drivers/pinctrl/pinmux.c @@ -810,7 +810,7 @@ pinmux_generic_get_function_name(struct pinctrl_dev *pctldev, if (!function) return NULL; - return function->func.name; + return function->func->name; } EXPORT_SYMBOL_GPL(pinmux_generic_get_function_name); @@ -835,8 +835,8 @@ int pinmux_generic_get_function_groups(struct pinctrl_dev *pctldev, __func__, selector); return -EINVAL; } - *groups = function->func.groups; - *ngroups = function->func.ngroups; + *groups = function->func->groups; + *ngroups = function->func->ngroups; return 0; } @@ -903,7 +903,17 @@ int pinmux_generic_add_pinfunction(struct pinctrl_dev *pctldev, if (!function) return -ENOMEM; - function->func = *func; + /* + * FIXME: It's generally a bad idea to use devres in subsystem core + * code - managed interfaces are aimed at drivers - but pinctrl already + * uses it all over the place so it's a larger piece of technical debt + * to fix. 
+ */ + function->func = devm_kmemdup_const(pctldev->dev, func, + sizeof(*func), GFP_KERNEL); + if (!function->func) + return -ENOMEM; + function->data = data; error = radix_tree_insert(&pctldev->pin_function_tree, selector, function); diff --git a/drivers/pinctrl/pinmux.h b/drivers/pinctrl/pinmux.h index 549ab10f7afbda..653684290666d7 100644 --- a/drivers/pinctrl/pinmux.h +++ b/drivers/pinctrl/pinmux.h @@ -137,7 +137,7 @@ static inline void pinmux_init_device_debugfs(struct dentry *devroot, * @data: pin controller driver specific data */ struct function_desc { - struct pinfunction func; + const struct pinfunction *func; void *data; }; From 6f6835beab99c770f1cc6de99b3e3b73c333dec7 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:21 +0200 Subject: [PATCH 1122/3206] pinctrl: qcom: use generic pin function helpers With the pinmux core no longer duplicating memory used to store the struct pinfunction objects in .rodata, we can now use the existing infrastructure for storing and looking up pin functions in qualcomm drivers. Remove hand-crafted callbacks. Reviewed-by: Konrad Dybcio Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/Kconfig | 1 + drivers/pinctrl/qcom/pinctrl-msm.c | 43 +++++++++--------------------- 2 files changed, 14 insertions(+), 30 deletions(-) diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig index dd9bbe8f3e11c3..f7594de4b1e9b9 100644 --- a/drivers/pinctrl/qcom/Kconfig +++ b/drivers/pinctrl/qcom/Kconfig @@ -8,6 +8,7 @@ config PINCTRL_MSM depends on OF select QCOM_SCM select PINMUX + select GENERIC_PINMUX_FUNCTIONS select PINCONF select GENERIC_PINCONF select GPIOLIB_IRQCHIP diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 83eb075b6bfa17..96e40c2342bded 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -31,6 +31,7 @@ #include "../core.h" #include "../pinconf.h" #include "../pinctrl-utils.h" +#include "../pinmux.h" #include "pinctrl-msm.h" @@ -150,33 +151,6 @@ static int msm_pinmux_request(struct pinctrl_dev *pctldev, unsigned offset) return gpiochip_line_is_valid(chip, offset) ? 
0 : -EINVAL;
 }
 
-static int msm_get_functions_count(struct pinctrl_dev *pctldev)
-{
-	struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
-
-	return pctrl->soc->nfunctions;
-}
-
-static const char *msm_get_function_name(struct pinctrl_dev *pctldev,
-					 unsigned function)
-{
-	struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
-
-	return pctrl->soc->functions[function].name;
-}
-
-static int msm_get_function_groups(struct pinctrl_dev *pctldev,
-				   unsigned function,
-				   const char * const **groups,
-				   unsigned * const num_groups)
-{
-	struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
-
-	*groups = pctrl->soc->functions[function].groups;
-	*num_groups = pctrl->soc->functions[function].ngroups;
-	return 0;
-}
-
 static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
 			      unsigned function,
 			      unsigned group)
@@ -288,9 +262,9 @@ static int msm_pinmux_request_gpio(struct pinctrl_dev *pctldev,
 
 static const struct pinmux_ops msm_pinmux_ops = {
 	.request		= msm_pinmux_request,
-	.get_functions_count	= msm_get_functions_count,
-	.get_function_name	= msm_get_function_name,
-	.get_function_groups	= msm_get_function_groups,
+	.get_functions_count	= pinmux_generic_get_function_count,
+	.get_function_name	= pinmux_generic_get_function_name,
+	.get_function_groups	= pinmux_generic_get_function_groups,
 	.gpio_request_enable	= msm_pinmux_request_gpio,
 	.set_mux		= msm_pinmux_set_mux,
 };
@@ -1552,6 +1526,7 @@ EXPORT_SYMBOL(msm_pinctrl_dev_pm_ops);
 int msm_pinctrl_probe(struct platform_device *pdev,
 		      const struct msm_pinctrl_soc_data *soc_data)
 {
+	const struct pinfunction *func;
 	struct msm_pinctrl *pctrl;
 	struct resource *res;
 	int ret;
@@ -1606,6 +1581,14 @@ int msm_pinctrl_probe(struct platform_device *pdev,
 		return PTR_ERR(pctrl->pctrl);
 	}
 
+	for (i = 0; i < soc_data->nfunctions; i++) {
+		func = &soc_data->functions[i];
+
+		ret = pinmux_generic_add_pinfunction(pctrl->pctrl, func, NULL);
+		if (ret < 0)
+			return ret;
+	}
+
 	ret = msm_gpio_init(pctrl);
 	if (ret)
 		return ret;

From 11aa02d6a9c222260490f952d041dec6d7f16a92 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski
Date: Tue, 2 Sep 2025 13:59:22 +0200
Subject: [PATCH 1123/3206] pinctrl: allow to mark pin functions as requestable
 GPIOs

The name of the pin function has no real meaning to pinctrl core and is
there only for human readability of device properties. Some pins are
muxed as GPIOs, but for "strict" pinmuxers it's impossible to request
them as GPIOs if they're bound to a device - even if their function name
explicitly says "gpio".

Add a new field to struct pinfunction that allows passing additional
flags to pinctrl core. While we could go with a boolean "is_gpio" field,
a flags field is more future-proof. If PINFUNCTION_FLAG_GPIO is set for a
given function, the pin muxed to it can be requested as a GPIO even on
strict pin controllers.

Add a new callback to struct pinmux_ops - function_is_gpio() - that
allows pinmux core to inspect a function and see if it's a GPIO one.
Provide a generic implementation of this callback.
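[ Putting the pieces together, a sketch of how a hypothetical strict driver would use the new flag and callback; PINCTRL_GPIO_PINFUNCTION() and pinmux_generic_function_is_gpio() are both defined in the diff below, and the demo function would be registered at probe time with pinmux_generic_add_pinfunction(). ]

#include <linux/kernel.h>
#include <linux/pinctrl/pinctrl.h>
#include <linux/pinctrl/pinmux.h>
#include "pinmux.h"

static const char * const demo_gpio_groups[] = { "gpio0_grp", };

/* Marked as a GPIO function: pins muxed to it stay requestable as GPIOs
 * even though the controller below is strict. */
static const struct pinfunction demo_gpio_func =
	PINCTRL_GPIO_PINFUNCTION("gpio", demo_gpio_groups,
				 ARRAY_SIZE(demo_gpio_groups));

static int demo_mux_apply(struct pinctrl_dev *pctldev, unsigned int fsel,
			  unsigned int gsel)
{
	return 0;	/* hardware programming elided */
}

static const struct pinmux_ops demo_pinmux_ops = {
	.get_functions_count	= pinmux_generic_get_function_count,
	.get_function_name	= pinmux_generic_get_function_name,
	.get_function_groups	= pinmux_generic_get_function_groups,
	.function_is_gpio	= pinmux_generic_function_is_gpio,
	.set_mux		= demo_mux_apply,
	.strict			= true,
};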
Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/pinmux.c | 46 ++++++++++++++++++++++++++++++--- drivers/pinctrl/pinmux.h | 3 +++ include/linux/pinctrl/pinctrl.h | 14 ++++++++++ include/linux/pinctrl/pinmux.h | 2 ++ 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c index 07ec93f09334f8..3a8dd184ba3d67 100644 --- a/drivers/pinctrl/pinmux.c +++ b/drivers/pinctrl/pinmux.c @@ -89,13 +89,20 @@ bool pinmux_can_be_used_for_gpio(struct pinctrl_dev *pctldev, unsigned int pin) { struct pin_desc *desc = pin_desc_get(pctldev, pin); const struct pinmux_ops *ops = pctldev->desc->pmxops; + const struct pinctrl_setting_mux *mux_setting; + bool func_is_gpio = false; /* Can't inspect pin, assume it can be used */ if (!desc || !ops) return true; + mux_setting = desc->mux_setting; + guard(mutex)(&desc->mux_lock); - if (ops->strict && desc->mux_usecount) + if (mux_setting && ops->function_is_gpio) + func_is_gpio = ops->function_is_gpio(pctldev, mux_setting->func); + + if (ops->strict && desc->mux_usecount && !func_is_gpio) return false; return !(ops->strict && !!desc->gpio_owner); @@ -116,7 +123,9 @@ static int pin_request(struct pinctrl_dev *pctldev, { struct pin_desc *desc; const struct pinmux_ops *ops = pctldev->desc->pmxops; + const struct pinctrl_setting_mux *mux_setting; int status = -EINVAL; + bool gpio_ok = false; desc = pin_desc_get(pctldev, pin); if (desc == NULL) { @@ -126,11 +135,21 @@ static int pin_request(struct pinctrl_dev *pctldev, goto out; } + mux_setting = desc->mux_setting; + dev_dbg(pctldev->dev, "request pin %d (%s) for %s\n", pin, desc->name, owner); scoped_guard(mutex, &desc->mux_lock) { - if ((!gpio_range || ops->strict) && + if (mux_setting) { + if (ops->function_is_gpio) + gpio_ok = ops->function_is_gpio(pctldev, + mux_setting->func); + } else { + gpio_ok = true; + } + + if ((!gpio_range || ops->strict) && !gpio_ok && desc->mux_usecount && strcmp(desc->mux_owner, owner)) { dev_err(pctldev->dev, "pin %s already requested by %s; cannot claim for %s\n", @@ -138,7 +157,7 @@ static int pin_request(struct pinctrl_dev *pctldev, goto out; } - if ((gpio_range || ops->strict) && desc->gpio_owner) { + if ((gpio_range || ops->strict) && !gpio_ok && desc->gpio_owner) { dev_err(pctldev->dev, "pin %s already requested by %s; cannot claim for %s\n", desc->name, desc->gpio_owner, owner); @@ -861,6 +880,27 @@ pinmux_generic_get_function(struct pinctrl_dev *pctldev, unsigned int selector) } EXPORT_SYMBOL_GPL(pinmux_generic_get_function); +/** + * pinmux_generic_function_is_gpio() - returns true if given function is a GPIO + * @pctldev: pin controller device + * @selector: function number + * + * Returns: + * True if given function is a GPIO, false otherwise. 
+ */
+bool pinmux_generic_function_is_gpio(struct pinctrl_dev *pctldev,
+				     unsigned int selector)
+{
+	struct function_desc *function;
+
+	function = radix_tree_lookup(&pctldev->pin_function_tree, selector);
+	if (!function)
+		return false;
+
+	return function->func->flags & PINFUNCTION_FLAG_GPIO;
+}
+EXPORT_SYMBOL_GPL(pinmux_generic_function_is_gpio);
+
 /** * pinmux_generic_add_function() - adds a function group * @pctldev: pin controller device
diff --git a/drivers/pinctrl/pinmux.h b/drivers/pinctrl/pinmux.h index 653684290666d7..4e826c1a5246cf 100644
--- a/drivers/pinctrl/pinmux.h
+++ b/drivers/pinctrl/pinmux.h
@@ -169,6 +169,9 @@ int pinmux_generic_remove_function(struct pinctrl_dev *pctldev, void pinmux_generic_free_functions(struct pinctrl_dev *pctldev);
+bool pinmux_generic_function_is_gpio(struct pinctrl_dev *pctldev,
+				     unsigned int selector);
+
 #else
 static inline void pinmux_generic_free_functions(struct pinctrl_dev *pctldev)
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index d138e18156452e..1a8084e2940537 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -11,6 +11,7 @@ #ifndef __LINUX_PINCTRL_PINCTRL_H #define __LINUX_PINCTRL_PINCTRL_H
+#include <linux/bits.h>
 #include <linux/types.h>
 struct device;
@@ -206,16 +207,20 @@ extern int pinctrl_get_group_pins(struct pinctrl_dev *pctldev, const char *pin_group, const unsigned int **pins, unsigned int *num_pins);
+#define PINFUNCTION_FLAG_GPIO	BIT(0)
+
 /** * struct pinfunction - Description about a function * @name: Name of the function * @groups: An array of groups for this function * @ngroups: Number of groups in @groups
+ * @flags: Additional pin function flags
 */
 struct pinfunction { const char *name; const char * const *groups; size_t ngroups;
+	unsigned long flags;
 };
 /* Convenience macro to define a single named pinfunction */
@@ -226,6 +231,15 @@ struct pinfunction { .ngroups = (_ngroups), \ }
+/* Same as PINCTRL_PINFUNCTION() but for the GPIO category of functions */
+#define PINCTRL_GPIO_PINFUNCTION(_name, _groups, _ngroups) \
+(struct pinfunction) { \
+		.name = (_name), \
+		.groups = (_groups), \
+		.ngroups = (_ngroups), \
+		.flags = PINFUNCTION_FLAG_GPIO, \
+	}
+
 #if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_PINCTRL) extern struct pinctrl_dev *of_pinctrl_get(struct device_node *np); #else
diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index d6f7b58d6ad0cc..6db6c3e1ccc224 100644
--- a/include/linux/pinctrl/pinmux.h
+++ b/include/linux/pinctrl/pinmux.h
@@ -66,6 +66,8 @@ struct pinmux_ops { unsigned int selector, const char * const **groups, unsigned int *num_groups);
+	bool (*function_is_gpio) (struct pinctrl_dev *pctldev,
+				  unsigned int selector);
 	int (*set_mux) (struct pinctrl_dev *pctldev, unsigned int func_selector, unsigned int group_selector);
 	int (*gpio_request_enable) (struct pinctrl_dev *pctldev,
From b65803da894ca494a985e4392e5539fb8f858a2d Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski
Date: Tue, 2 Sep 2025 13:59:23 +0200
Subject: [PATCH 1124/3206] pinctrl: qcom: add infrastructure for marking pin functions as GPIOs

Add a helper macro that wraps PINCTRL_GPIO_PINFUNCTION() for pinctrl-msm pin functions and assign the .function_is_gpio() callback in pinmux_ops.
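For illustration (editorial sketch, not part of the patch; the enum and tables are hypothetical), an SoC driver consumes the new macro like this. MSM_GPIO_PIN_FUNCTION(fname) expects the msm_mux_<fname> enumerator and the <fname>_groups[] array that pinctrl-msm SoC files already define per function:

	enum { msm_mux_gpio, msm_mux_uart0 };

	static const char * const gpio_groups[] = { "gpio0", "gpio1" };
	static const char * const uart0_groups[] = { "gpio1" };

	static const struct pinfunction demo_functions[] = {
		MSM_GPIO_PIN_FUNCTION(gpio),	/* carries PINFUNCTION_FLAG_GPIO */
		MSM_PIN_FUNCTION(uart0),	/* ordinary, strictly-muxed function */
	};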
Reviewed-by: Konrad Dybcio
Tested-by: Neil Armstrong
Signed-off-by: Bartosz Golaszewski
Signed-off-by: Linus Walleij
---
 drivers/pinctrl/qcom/pinctrl-msm.c | 1 +
 drivers/pinctrl/qcom/pinctrl-msm.h | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 96e40c2342bded..a5f69464827119 100644
--- a/drivers/pinctrl/qcom/pinctrl-msm.c
+++ b/drivers/pinctrl/qcom/pinctrl-msm.c
@@ -265,6 +265,7 @@ static const struct pinmux_ops msm_pinmux_ops = { .get_functions_count = pinmux_generic_get_function_count, .get_function_name = pinmux_generic_get_function_name, .get_function_groups = pinmux_generic_get_function_groups,
+	.function_is_gpio = pinmux_generic_function_is_gpio,
 	.gpio_request_enable = msm_pinmux_request_gpio,
 	.set_mux = msm_pinmux_set_mux,
 };
diff --git a/drivers/pinctrl/qcom/pinctrl-msm.h b/drivers/pinctrl/qcom/pinctrl-msm.h index d7dc0947bb1618..4625fa5320a95a 100644
--- a/drivers/pinctrl/qcom/pinctrl-msm.h
+++ b/drivers/pinctrl/qcom/pinctrl-msm.h
@@ -29,6 +29,11 @@ struct pinctrl_pin_desc; fname##_groups, \ ARRAY_SIZE(fname##_groups))
+#define MSM_GPIO_PIN_FUNCTION(fname) \
+	[msm_mux_##fname] = PINCTRL_GPIO_PINFUNCTION(#fname, \
+						     fname##_groups, \
+						     ARRAY_SIZE(fname##_groups))
+
 #define QCA_PIN_FUNCTION(fname) \
 	[qca_mux_##fname] = PINCTRL_PINFUNCTION(#fname, \
 						fname##_groups, \
 						ARRAY_SIZE(fname##_groups))
From 480dc195240422f78dae2edaa6666f85d56e2ade Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski
Date: Tue, 2 Sep 2025 13:59:24 +0200
Subject: [PATCH 1125/3206] pinctrl: qcom: mark the `gpio` and `egpio` pin functions as non-strict functions

Allow pins muxed to the "gpio" or "egpio" function to be requested as GPIOs even if pinmux_ops says the controller should be strict.

Reviewed-by: Konrad Dybcio
Tested-by: Neil Armstrong
Signed-off-by: Bartosz Golaszewski
Signed-off-by: Linus Walleij
---
 drivers/pinctrl/qcom/pinctrl-ipq5018.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-ipq5332.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-ipq5424.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-ipq6018.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-ipq8074.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-ipq9574.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-mdm9607.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-mdm9615.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-milos.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8226.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8660.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8909.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8916.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8917.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8953.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8960.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8976.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8994.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8996.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8998.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-msm8x74.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-qcm2290.c | 4 ++--
 drivers/pinctrl/qcom/pinctrl-qcs404.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-qcs615.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-qcs8300.c | 4 ++--
 drivers/pinctrl/qcom/pinctrl-qdu1000.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-sa8775p.c | 4 ++--
 drivers/pinctrl/qcom/pinctrl-sar2130p.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-sc7180.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-sc7280.c | 4 ++--
 drivers/pinctrl/qcom/pinctrl-sc8180x.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-sc8280xp.c | 4 ++--
 drivers/pinctrl/qcom/pinctrl-sdm660.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-sdm670.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-sdm845.c | 2 +-
 drivers/pinctrl/qcom/pinctrl-sdx55.c | 2 +-
drivers/pinctrl/qcom/pinctrl-sdx65.c | 2 +- drivers/pinctrl/qcom/pinctrl-sdx75.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm4450.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm6115.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm6125.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm6350.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm6375.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm7150.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm8150.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm8250.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm8350.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm8450.c | 4 ++-- drivers/pinctrl/qcom/pinctrl-sm8550.c | 2 +- drivers/pinctrl/qcom/pinctrl-sm8650.c | 4 ++-- drivers/pinctrl/qcom/pinctrl-sm8750.c | 4 ++-- drivers/pinctrl/qcom/pinctrl-x1e80100.c | 2 +- 52 files changed, 60 insertions(+), 60 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-ipq5018.c b/drivers/pinctrl/qcom/pinctrl-ipq5018.c index 10b99d5d8a11db..cbf34854f88265 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq5018.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq5018.c @@ -630,7 +630,7 @@ static const struct pinfunction ipq5018_functions[] = { MSM_PIN_FUNCTION(eud_gpio), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(led0), MSM_PIN_FUNCTION(led2), MSM_PIN_FUNCTION(mac0), diff --git a/drivers/pinctrl/qcom/pinctrl-ipq5332.c b/drivers/pinctrl/qcom/pinctrl-ipq5332.c index 1ac2fc09c11923..239cbe75f198d3 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq5332.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq5332.c @@ -692,7 +692,7 @@ static const struct pinfunction ipq5332_functions[] = { MSM_PIN_FUNCTION(dbg_out), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(lock_det), MSM_PIN_FUNCTION(mac0), MSM_PIN_FUNCTION(mac1), diff --git a/drivers/pinctrl/qcom/pinctrl-ipq5424.c b/drivers/pinctrl/qcom/pinctrl-ipq5424.c index 7ff1f8acc1a3a8..67b452a033d623 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq5424.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq5424.c @@ -641,7 +641,7 @@ static const struct pinfunction ipq5424_functions[] = { MSM_PIN_FUNCTION(dbg_out), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(i2c0_scl), MSM_PIN_FUNCTION(i2c0_sda), MSM_PIN_FUNCTION(i2c1_scl), diff --git a/drivers/pinctrl/qcom/pinctrl-ipq6018.c b/drivers/pinctrl/qcom/pinctrl-ipq6018.c index a4ba980252e187..be177fb0a92d95 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq6018.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq6018.c @@ -891,7 +891,7 @@ static const struct pinfunction ipq6018_functions[] = { MSM_PIN_FUNCTION(dbg_out), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(lpass_aud), MSM_PIN_FUNCTION(lpass_aud0), MSM_PIN_FUNCTION(lpass_aud1), diff --git a/drivers/pinctrl/qcom/pinctrl-ipq8074.c b/drivers/pinctrl/qcom/pinctrl-ipq8074.c index 482f13282fc2be..e94de90833140c 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq8074.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq8074.c @@ -838,7 +838,7 @@ static const struct pinfunction ipq8074_functions[] = { MSM_PIN_FUNCTION(dbg_out), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(ldo_en), MSM_PIN_FUNCTION(ldo_update), MSM_PIN_FUNCTION(led0), diff --git a/drivers/pinctrl/qcom/pinctrl-ipq9574.c b/drivers/pinctrl/qcom/pinctrl-ipq9574.c index 89c05d8eb55034..3ed093ea8eb907 100644 --- 
a/drivers/pinctrl/qcom/pinctrl-ipq9574.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq9574.c @@ -651,7 +651,7 @@ static const struct pinfunction ipq9574_functions[] = { MSM_PIN_FUNCTION(dwc_ddrphy), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(mac), MSM_PIN_FUNCTION(mdc), MSM_PIN_FUNCTION(mdio), diff --git a/drivers/pinctrl/qcom/pinctrl-mdm9607.c b/drivers/pinctrl/qcom/pinctrl-mdm9607.c index 3e18ba124fede9..cef330547ce78d 100644 --- a/drivers/pinctrl/qcom/pinctrl-mdm9607.c +++ b/drivers/pinctrl/qcom/pinctrl-mdm9607.c @@ -861,7 +861,7 @@ static const struct pinfunction mdm9607_functions[] = { MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), MSM_PIN_FUNCTION(gmac_mdio), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm0_tx), MSM_PIN_FUNCTION(lcd_rst), MSM_PIN_FUNCTION(ldo_en), diff --git a/drivers/pinctrl/qcom/pinctrl-mdm9615.c b/drivers/pinctrl/qcom/pinctrl-mdm9615.c index bea1ca3d1b7f84..729fe3d7e14efc 100644 --- a/drivers/pinctrl/qcom/pinctrl-mdm9615.c +++ b/drivers/pinctrl/qcom/pinctrl-mdm9615.c @@ -313,7 +313,7 @@ static const char * const cdc_mclk_groups[] = { }; static const struct pinfunction mdm9615_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsbi2_i2c), MSM_PIN_FUNCTION(gsbi3), MSM_PIN_FUNCTION(gsbi4), diff --git a/drivers/pinctrl/qcom/pinctrl-milos.c b/drivers/pinctrl/qcom/pinctrl-milos.c index d11a7bbcd73316..19abd5233a2c54 100644 --- a/drivers/pinctrl/qcom/pinctrl-milos.c +++ b/drivers/pinctrl/qcom/pinctrl-milos.c @@ -974,7 +974,7 @@ static const char *const wcn_sw_ctrl_groups[] = { }; static const struct pinfunction milos_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aoss_cti), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8226.c b/drivers/pinctrl/qcom/pinctrl-msm8226.c index f9a95734734081..a81aa092ef1240 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8226.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8226.c @@ -483,7 +483,7 @@ static const struct pinfunction msm8226_functions[] = { MSM_PIN_FUNCTION(cci_i2c0), MSM_PIN_FUNCTION(gp0_clk), MSM_PIN_FUNCTION(gp1_clk), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(sdc3), MSM_PIN_FUNCTION(wlan), }; diff --git a/drivers/pinctrl/qcom/pinctrl-msm8660.c b/drivers/pinctrl/qcom/pinctrl-msm8660.c index 4dbc19ffd80efc..5ded00396cd949 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8660.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8660.c @@ -714,7 +714,7 @@ static const char * const ebi2_groups[] = { }; static const struct pinfunction msm8660_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(cam_mclk), MSM_PIN_FUNCTION(dsub), MSM_PIN_FUNCTION(ext_gps), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8909.c b/drivers/pinctrl/qcom/pinctrl-msm8909.c index 0aa4f77b774f45..544a52fb8f3d6e 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8909.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8909.c @@ -696,7 +696,7 @@ static const struct pinfunction msm8909_functions[] = { MSM_PIN_FUNCTION(gcc_gp3_clk_a), MSM_PIN_FUNCTION(gcc_gp3_clk_b), MSM_PIN_FUNCTION(gcc_plltest), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm0_tx), MSM_PIN_FUNCTION(ldo_en), MSM_PIN_FUNCTION(ldo_update), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8916.c b/drivers/pinctrl/qcom/pinctrl-msm8916.c index 0dfc6dd33d58b2..b1b6934bb4b635 100644 --- 
a/drivers/pinctrl/qcom/pinctrl-msm8916.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8916.c @@ -743,7 +743,7 @@ static const struct pinfunction msm8916_functions[] = { MSM_PIN_FUNCTION(gcc_gp2_clk_b), MSM_PIN_FUNCTION(gcc_gp3_clk_a), MSM_PIN_FUNCTION(gcc_gp3_clk_b), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm0_tx0), MSM_PIN_FUNCTION(gsm0_tx1), MSM_PIN_FUNCTION(gsm1_tx0), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8917.c b/drivers/pinctrl/qcom/pinctrl-msm8917.c index 2e1a94ab18b219..f23d92d6615b89 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8917.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8917.c @@ -1302,7 +1302,7 @@ static const struct pinfunction msm8917_functions[] = { MSM_PIN_FUNCTION(gcc_gp3_clk_b), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm0_tx), MSM_PIN_FUNCTION(key_focus), MSM_PIN_FUNCTION(key_snapshot), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8953.c b/drivers/pinctrl/qcom/pinctrl-msm8953.c index 956383341a7a77..67db062fdf5628 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8953.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8953.c @@ -1533,7 +1533,7 @@ static const struct pinfunction msm8953_functions[] = { MSM_PIN_FUNCTION(gcc_gp3_clk_b), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm0_tx), MSM_PIN_FUNCTION(gsm1_tx), MSM_PIN_FUNCTION(gyro_int), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8960.c b/drivers/pinctrl/qcom/pinctrl-msm8960.c index a937ea867de709..2fb15208aba050 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8960.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8960.c @@ -974,7 +974,7 @@ static const struct pinfunction msm8960_functions[] = { MSM_PIN_FUNCTION(gp_pdm_1b), MSM_PIN_FUNCTION(gp_pdm_2a), MSM_PIN_FUNCTION(gp_pdm_2b), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsbi1), MSM_PIN_FUNCTION(gsbi1_spi_cs1_n), MSM_PIN_FUNCTION(gsbi1_spi_cs2a_n), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8976.c b/drivers/pinctrl/qcom/pinctrl-msm8976.c index 3bcb03387781f8..345539b9e696f0 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8976.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8976.c @@ -812,7 +812,7 @@ static const char * const ss_switch_groups[] = { }; static const struct pinfunction msm8976_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(blsp_spi1), MSM_PIN_FUNCTION(smb_int), MSM_PIN_FUNCTION(blsp_i2c1), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8994.c b/drivers/pinctrl/qcom/pinctrl-msm8994.c index 7a3b6cbccb687c..94e042d1f4b2a5 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8994.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8994.c @@ -1071,7 +1071,7 @@ static const struct pinfunction msm8994_functions[] = { MSM_PIN_FUNCTION(uim2), MSM_PIN_FUNCTION(uim3), MSM_PIN_FUNCTION(uim4), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), }; static const struct msm_pingroup msm8994_groups[] = { diff --git a/drivers/pinctrl/qcom/pinctrl-msm8996.c b/drivers/pinctrl/qcom/pinctrl-msm8996.c index d86d83106d3ba1..e5b55693d02377 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8996.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8996.c @@ -1532,7 +1532,7 @@ static const struct pinfunction msm8996_functions[] = { MSM_PIN_FUNCTION(gcc_gp2_clk_b), MSM_PIN_FUNCTION(gcc_gp3_clk_a), MSM_PIN_FUNCTION(gcc_gp3_clk_b), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm_tx), MSM_PIN_FUNCTION(hdmi_cec), MSM_PIN_FUNCTION(hdmi_ddc), 
diff --git a/drivers/pinctrl/qcom/pinctrl-msm8998.c b/drivers/pinctrl/qcom/pinctrl-msm8998.c index 1daee815888f54..b727593af34af9 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8998.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8998.c @@ -1160,7 +1160,7 @@ static const char * const mss_lte_groups[] = { }; static const struct pinfunction msm8998_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(adsp_ext), MSM_PIN_FUNCTION(agera_pll), MSM_PIN_FUNCTION(atest_char), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8x74.c b/drivers/pinctrl/qcom/pinctrl-msm8x74.c index 8253aa25775b24..202bec003e96f1 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8x74.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8x74.c @@ -778,7 +778,7 @@ static const char * const slimbus_groups[] = { "gpio70", "gpio71" }; static const char * const hsic_ctl_groups[] = { "hsic_strobe", "hsic_data" }; static const struct pinfunction msm8x74_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(cci_i2c0), MSM_PIN_FUNCTION(cci_i2c1), MSM_PIN_FUNCTION(uim1), diff --git a/drivers/pinctrl/qcom/pinctrl-qcm2290.c b/drivers/pinctrl/qcom/pinctrl-qcm2290.c index eeeec6434f6a68..38200957451e19 100644 --- a/drivers/pinctrl/qcom/pinctrl-qcm2290.c +++ b/drivers/pinctrl/qcom/pinctrl-qcm2290.c @@ -870,11 +870,11 @@ static const struct pinfunction qcm2290_functions[] = { MSM_PIN_FUNCTION(ddr_pxi1), MSM_PIN_FUNCTION(ddr_pxi2), MSM_PIN_FUNCTION(ddr_pxi3), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gp_pdm0), MSM_PIN_FUNCTION(gp_pdm1), MSM_PIN_FUNCTION(gp_pdm2), diff --git a/drivers/pinctrl/qcom/pinctrl-qcs404.c b/drivers/pinctrl/qcom/pinctrl-qcs404.c index 54e3b44353494e..0b8db2c7e58a9b 100644 --- a/drivers/pinctrl/qcom/pinctrl-qcs404.c +++ b/drivers/pinctrl/qcom/pinctrl-qcs404.c @@ -1296,7 +1296,7 @@ static const char * const i2s_3_ws_a_groups[] = { }; static const struct pinfunction qcs404_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(hdmi_tx), MSM_PIN_FUNCTION(hdmi_ddc), MSM_PIN_FUNCTION(blsp_uart_tx_a2), diff --git a/drivers/pinctrl/qcom/pinctrl-qcs615.c b/drivers/pinctrl/qcom/pinctrl-qcs615.c index 2a943bc46a6299..4dfa820d4e77ce 100644 --- a/drivers/pinctrl/qcom/pinctrl-qcs615.c +++ b/drivers/pinctrl/qcom/pinctrl-qcs615.c @@ -819,7 +819,7 @@ static const char *const wsa_data_groups[] = { }; static const struct pinfunction qcs615_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(adsp_ext), MSM_PIN_FUNCTION(agera_pll), MSM_PIN_FUNCTION(aoss_cti), diff --git a/drivers/pinctrl/qcom/pinctrl-qcs8300.c b/drivers/pinctrl/qcom/pinctrl-qcs8300.c index d6437e26392b60..f1af1a620684cd 100644 --- a/drivers/pinctrl/qcom/pinctrl-qcs8300.c +++ b/drivers/pinctrl/qcom/pinctrl-qcs8300.c @@ -929,7 +929,7 @@ static const char *const vsense_trigger_groups[] = { }; static const struct pinfunction qcs8300_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aoss_cti), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb2), @@ -949,7 +949,7 @@ static const struct pinfunction qcs8300_functions[] = { MSM_PIN_FUNCTION(edp0_hot), MSM_PIN_FUNCTION(edp0_lcd), MSM_PIN_FUNCTION(edp1_lcd), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(emac0_mcg0), MSM_PIN_FUNCTION(emac0_mcg1), 
MSM_PIN_FUNCTION(emac0_mcg2), diff --git a/drivers/pinctrl/qcom/pinctrl-qdu1000.c b/drivers/pinctrl/qcom/pinctrl-qdu1000.c index eacb89fa388850..7c535698a78004 100644 --- a/drivers/pinctrl/qcom/pinctrl-qdu1000.c +++ b/drivers/pinctrl/qcom/pinctrl-qdu1000.c @@ -904,7 +904,7 @@ static const char * const vsense_trigger_groups[] = { }; static const struct pinfunction qdu1000_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(cmo_pri), MSM_PIN_FUNCTION(si5518_int), MSM_PIN_FUNCTION(atest_char), diff --git a/drivers/pinctrl/qcom/pinctrl-sa8775p.c b/drivers/pinctrl/qcom/pinctrl-sa8775p.c index 1b62eb3e6620c9..53f28b9c49ba2d 100644 --- a/drivers/pinctrl/qcom/pinctrl-sa8775p.c +++ b/drivers/pinctrl/qcom/pinctrl-sa8775p.c @@ -1181,7 +1181,7 @@ static const char * const vsense_trigger_groups[] = { }; static const struct pinfunction sa8775p_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb2), MSM_PIN_FUNCTION(audio_ref), @@ -1217,7 +1217,7 @@ static const struct pinfunction sa8775p_functions[] = { MSM_PIN_FUNCTION(edp2_lcd), MSM_PIN_FUNCTION(edp3_hot), MSM_PIN_FUNCTION(edp3_lcd), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(emac0_mcg0), MSM_PIN_FUNCTION(emac0_mcg1), MSM_PIN_FUNCTION(emac0_mcg2), diff --git a/drivers/pinctrl/qcom/pinctrl-sar2130p.c b/drivers/pinctrl/qcom/pinctrl-sar2130p.c index 3dd1b5e5cfee48..4a53f4ee20418e 100644 --- a/drivers/pinctrl/qcom/pinctrl-sar2130p.c +++ b/drivers/pinctrl/qcom/pinctrl-sar2130p.c @@ -1128,7 +1128,7 @@ static const char * const vsense_trigger_groups[] = { }; static const struct pinfunction sar2130p_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(qup0), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), diff --git a/drivers/pinctrl/qcom/pinctrl-sc7180.c b/drivers/pinctrl/qcom/pinctrl-sc7180.c index c43fe10b71add7..3eae51472b1373 100644 --- a/drivers/pinctrl/qcom/pinctrl-sc7180.c +++ b/drivers/pinctrl/qcom/pinctrl-sc7180.c @@ -903,7 +903,7 @@ static const struct pinfunction sc7180_functions[] = { MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gp_pdm0), MSM_PIN_FUNCTION(gp_pdm1), MSM_PIN_FUNCTION(gp_pdm2), diff --git a/drivers/pinctrl/qcom/pinctrl-sc7280.c b/drivers/pinctrl/qcom/pinctrl-sc7280.c index 1b070e9d41f597..44e09608aad07a 100644 --- a/drivers/pinctrl/qcom/pinctrl-sc7280.c +++ b/drivers/pinctrl/qcom/pinctrl-sc7280.c @@ -1153,11 +1153,11 @@ static const struct pinfunction sc7280_functions[] = { MSM_PIN_FUNCTION(dp_lcd), MSM_PIN_FUNCTION(edp_hot), MSM_PIN_FUNCTION(edp_lcd), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(host2wlan_sol), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), diff --git a/drivers/pinctrl/qcom/pinctrl-sc8180x.c b/drivers/pinctrl/qcom/pinctrl-sc8180x.c index ae5134fdebd995..d9f9e3dd9dd176 100644 --- a/drivers/pinctrl/qcom/pinctrl-sc8180x.c +++ b/drivers/pinctrl/qcom/pinctrl-sc8180x.c @@ -1272,7 +1272,7 @@ static const struct pinfunction sc8180x_functions[] = { MSM_PIN_FUNCTION(gcc_gp3), MSM_PIN_FUNCTION(gcc_gp4), MSM_PIN_FUNCTION(gcc_gp5), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gps), MSM_PIN_FUNCTION(grfc), 
MSM_PIN_FUNCTION(hs1_mi2s), diff --git a/drivers/pinctrl/qcom/pinctrl-sc8280xp.c b/drivers/pinctrl/qcom/pinctrl-sc8280xp.c index 6ccd7e5648d420..cf8297e8b8f8c9 100644 --- a/drivers/pinctrl/qcom/pinctrl-sc8280xp.c +++ b/drivers/pinctrl/qcom/pinctrl-sc8280xp.c @@ -1506,7 +1506,7 @@ static const struct pinfunction sc8280xp_functions[] = { MSM_PIN_FUNCTION(edp2_lcd), MSM_PIN_FUNCTION(edp3_lcd), MSM_PIN_FUNCTION(edp_hot), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(emac0_dll), MSM_PIN_FUNCTION(emac0_mcg0), MSM_PIN_FUNCTION(emac0_mcg1), @@ -1527,7 +1527,7 @@ static const struct pinfunction sc8280xp_functions[] = { MSM_PIN_FUNCTION(gcc_gp3), MSM_PIN_FUNCTION(gcc_gp4), MSM_PIN_FUNCTION(gcc_gp5), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(hs1_mi2s), MSM_PIN_FUNCTION(hs2_mi2s), MSM_PIN_FUNCTION(hs3_mi2s), diff --git a/drivers/pinctrl/qcom/pinctrl-sdm660.c b/drivers/pinctrl/qcom/pinctrl-sdm660.c index 1a78288f1bc832..687d986de75c4d 100644 --- a/drivers/pinctrl/qcom/pinctrl-sdm660.c +++ b/drivers/pinctrl/qcom/pinctrl-sdm660.c @@ -1157,7 +1157,7 @@ static const struct pinfunction sdm660_functions[] = { MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gps_tx_a), MSM_PIN_FUNCTION(gps_tx_b), MSM_PIN_FUNCTION(gps_tx_c), diff --git a/drivers/pinctrl/qcom/pinctrl-sdm670.c b/drivers/pinctrl/qcom/pinctrl-sdm670.c index 0fe1fa94cd6da1..486b72edf7b4ec 100644 --- a/drivers/pinctrl/qcom/pinctrl-sdm670.c +++ b/drivers/pinctrl/qcom/pinctrl-sdm670.c @@ -991,7 +991,7 @@ static const char * const mss_lte_groups[] = { }; static const struct pinfunction sdm670_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(adsp_ext), MSM_PIN_FUNCTION(agera_pll), MSM_PIN_FUNCTION(atest_char), diff --git a/drivers/pinctrl/qcom/pinctrl-sdm845.c b/drivers/pinctrl/qcom/pinctrl-sdm845.c index 0446e291aa4831..4cf8575797a0f4 100644 --- a/drivers/pinctrl/qcom/pinctrl-sdm845.c +++ b/drivers/pinctrl/qcom/pinctrl-sdm845.c @@ -976,7 +976,7 @@ static const char * const tsif1_sync_groups[] = { }; static const struct pinfunction sdm845_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(adsp_ext), MSM_PIN_FUNCTION(agera_pll), MSM_PIN_FUNCTION(atest_char), diff --git a/drivers/pinctrl/qcom/pinctrl-sdx55.c b/drivers/pinctrl/qcom/pinctrl-sdx55.c index 2c17bf88914636..79a7010b73f187 100644 --- a/drivers/pinctrl/qcom/pinctrl-sdx55.c +++ b/drivers/pinctrl/qcom/pinctrl-sdx55.c @@ -796,7 +796,7 @@ static const struct pinfunction sdx55_functions[] = { MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), MSM_PIN_FUNCTION(gcc_plltest), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(i2s_mclk), MSM_PIN_FUNCTION(jitter_bist), MSM_PIN_FUNCTION(ldo_en), diff --git a/drivers/pinctrl/qcom/pinctrl-sdx65.c b/drivers/pinctrl/qcom/pinctrl-sdx65.c index 85b5c0206dbd19..cc8a99a6a91ed2 100644 --- a/drivers/pinctrl/qcom/pinctrl-sdx65.c +++ b/drivers/pinctrl/qcom/pinctrl-sdx65.c @@ -732,7 +732,7 @@ static const struct pinfunction sdx65_functions[] = { MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), MSM_PIN_FUNCTION(gcc_plltest), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(i2s_mclk), MSM_PIN_FUNCTION(jitter_bist), MSM_PIN_FUNCTION(ldo_en), diff --git a/drivers/pinctrl/qcom/pinctrl-sdx75.c b/drivers/pinctrl/qcom/pinctrl-sdx75.c index ab13a3a57a8307..4078d83d818c33 100644 
--- a/drivers/pinctrl/qcom/pinctrl-sdx75.c +++ b/drivers/pinctrl/qcom/pinctrl-sdx75.c @@ -852,7 +852,7 @@ static const struct pinfunction sdx75_functions[] = { MSM_PIN_FUNCTION(gcc_gp2_clk), MSM_PIN_FUNCTION(gcc_gp3_clk), MSM_PIN_FUNCTION(gcc_plltest), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(i2s_mclk), MSM_PIN_FUNCTION(jitter_bist), MSM_PIN_FUNCTION(ldo_en), diff --git a/drivers/pinctrl/qcom/pinctrl-sm4450.c b/drivers/pinctrl/qcom/pinctrl-sm4450.c index 1ecdf1ab4f275e..d51e271e336101 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm4450.c +++ b/drivers/pinctrl/qcom/pinctrl-sm4450.c @@ -722,7 +722,7 @@ static const char * const wlan1_adc_dtest1_groups[] = { }; static const struct pinfunction sm4450_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb0), MSM_PIN_FUNCTION(audio_ref_clk), diff --git a/drivers/pinctrl/qcom/pinctrl-sm6115.c b/drivers/pinctrl/qcom/pinctrl-sm6115.c index c273efa4399630..06700685ea2a38 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm6115.c +++ b/drivers/pinctrl/qcom/pinctrl-sm6115.c @@ -687,7 +687,7 @@ static const struct pinfunction sm6115_functions[] = { MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gp_pdm0), MSM_PIN_FUNCTION(gp_pdm1), MSM_PIN_FUNCTION(gp_pdm2), diff --git a/drivers/pinctrl/qcom/pinctrl-sm6125.c b/drivers/pinctrl/qcom/pinctrl-sm6125.c index 5092f20e0c1bde..5d3d1e402345eb 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm6125.c +++ b/drivers/pinctrl/qcom/pinctrl-sm6125.c @@ -943,7 +943,7 @@ static const char * const dmic1_data_groups[] = { static const struct pinfunction sm6125_functions[] = { MSM_PIN_FUNCTION(qup00), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(qdss), MSM_PIN_FUNCTION(qup01), MSM_PIN_FUNCTION(qup02), diff --git a/drivers/pinctrl/qcom/pinctrl-sm6350.c b/drivers/pinctrl/qcom/pinctrl-sm6350.c index ba4686c86c54b8..220fb582cac9fc 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm6350.c +++ b/drivers/pinctrl/qcom/pinctrl-sm6350.c @@ -1048,7 +1048,7 @@ static const struct pinfunction sm6350_functions[] = { MSM_PIN_FUNCTION(gp_pdm0), MSM_PIN_FUNCTION(gp_pdm1), MSM_PIN_FUNCTION(gp_pdm2), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gps_tx), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), diff --git a/drivers/pinctrl/qcom/pinctrl-sm6375.c b/drivers/pinctrl/qcom/pinctrl-sm6375.c index 49031571e65ee3..08b8ef6efaf097 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm6375.c +++ b/drivers/pinctrl/qcom/pinctrl-sm6375.c @@ -1172,7 +1172,7 @@ static const struct pinfunction sm6375_functions[] = { MSM_PIN_FUNCTION(gp_pdm0), MSM_PIN_FUNCTION(gp_pdm1), MSM_PIN_FUNCTION(gp_pdm2), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gps_tx), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), diff --git a/drivers/pinctrl/qcom/pinctrl-sm7150.c b/drivers/pinctrl/qcom/pinctrl-sm7150.c index 6e89966cd70e34..78dd8153a4d4e5 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm7150.c +++ b/drivers/pinctrl/qcom/pinctrl-sm7150.c @@ -960,7 +960,7 @@ static const char * const wsa_data_groups[] = { }; static const struct pinfunction sm7150_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(adsp_ext), MSM_PIN_FUNCTION(agera_pll), MSM_PIN_FUNCTION(aoss_cti), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8150.c b/drivers/pinctrl/qcom/pinctrl-sm8150.c index 
794ed99463f760..ad861cd66958c4 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8150.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8150.c @@ -1217,7 +1217,7 @@ static const struct pinfunction sm8150_functions[] = { MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(hs1_mi2s), MSM_PIN_FUNCTION(hs2_mi2s), MSM_PIN_FUNCTION(hs3_mi2s), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8250.c b/drivers/pinctrl/qcom/pinctrl-sm8250.c index fb6f005d64f53f..6021d9f6e407ef 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8250.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8250.c @@ -1021,7 +1021,7 @@ static const struct pinfunction sm8250_functions[] = { MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), MSM_PIN_FUNCTION(lpass_slimbus), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8350.c b/drivers/pinctrl/qcom/pinctrl-sm8350.c index c8a3f39ce6f1b8..99949b55202113 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8350.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8350.c @@ -1267,7 +1267,7 @@ static const struct pinfunction sm8350_functions[] = { MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), MSM_PIN_FUNCTION(lpass_slimbus), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8450.c b/drivers/pinctrl/qcom/pinctrl-sm8450.c index f2e52d5a0f9369..9889fc5dc2cd20 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8450.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8450.c @@ -1269,7 +1269,7 @@ static const char * const vsense_trigger_groups[] = { }; static const struct pinfunction sm8450_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aon_cam), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb), @@ -1291,7 +1291,7 @@ static const struct pinfunction sm8450_functions[] = { MSM_PIN_FUNCTION(ddr_pxi2), MSM_PIN_FUNCTION(ddr_pxi3), MSM_PIN_FUNCTION(dp_hot), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8550.c b/drivers/pinctrl/qcom/pinctrl-sm8550.c index 1b4496cb39eb46..10a62031fdfd04 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8550.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8550.c @@ -1340,7 +1340,7 @@ static const char *const vsense_trigger_mirnat_groups[] = { }; static const struct pinfunction sm8550_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aon_cci), MSM_PIN_FUNCTION(aoss_cti), MSM_PIN_FUNCTION(atest_char), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8650.c b/drivers/pinctrl/qcom/pinctrl-sm8650.c index 449a0077f4b106..e2ae038002060d 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8650.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8650.c @@ -1328,7 +1328,7 @@ static const char *const vsense_trigger_mirnat_groups[] = { }; static const struct pinfunction sm8650_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aoss_cti), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb), @@ -1359,7 +1359,7 @@ static const struct pinfunction sm8650_functions[] = { MSM_PIN_FUNCTION(ddr_pxi3), MSM_PIN_FUNCTION(do_not), MSM_PIN_FUNCTION(dp_hot), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), 
 MSM_PIN_FUNCTION(gcc_gp2),
 MSM_PIN_FUNCTION(gcc_gp3),
diff --git a/drivers/pinctrl/qcom/pinctrl-sm8750.c b/drivers/pinctrl/qcom/pinctrl-sm8750.c index 8516693d1db51d..6f92f176edd459 100644
--- a/drivers/pinctrl/qcom/pinctrl-sm8750.c
+++ b/drivers/pinctrl/qcom/pinctrl-sm8750.c
@@ -1290,7 +1290,7 @@ static const char *const wcn_sw_ctrl_groups[] = { }; static const struct pinfunction sm8750_functions[] = {
-	MSM_PIN_FUNCTION(gpio),
+	MSM_GPIO_PIN_FUNCTION(gpio),
 	MSM_PIN_FUNCTION(aoss_cti),
 	MSM_PIN_FUNCTION(atest_char),
 	MSM_PIN_FUNCTION(atest_usb),
@@ -1319,7 +1319,7 @@ static const struct pinfunction sm8750_functions[] = { MSM_PIN_FUNCTION(ddr_pxi2), MSM_PIN_FUNCTION(ddr_pxi3), MSM_PIN_FUNCTION(dp_hot),
-	MSM_PIN_FUNCTION(egpio),
+	MSM_GPIO_PIN_FUNCTION(egpio),
 	MSM_PIN_FUNCTION(gcc_gp1),
 	MSM_PIN_FUNCTION(gcc_gp2),
 	MSM_PIN_FUNCTION(gcc_gp3),
diff --git a/drivers/pinctrl/qcom/pinctrl-x1e80100.c b/drivers/pinctrl/qcom/pinctrl-x1e80100.c index d4b215f34c39bf..bb36f40b19fa53 100644
--- a/drivers/pinctrl/qcom/pinctrl-x1e80100.c
+++ b/drivers/pinctrl/qcom/pinctrl-x1e80100.c
@@ -1407,7 +1407,7 @@ static const char * const vsense_trigger_groups[] = { }; static const struct pinfunction x1e80100_functions[] = {
-	MSM_PIN_FUNCTION(gpio),
+	MSM_GPIO_PIN_FUNCTION(gpio),
 	MSM_PIN_FUNCTION(RESOUT_GPIO),
 	MSM_PIN_FUNCTION(aon_cci),
 	MSM_PIN_FUNCTION(aoss_cti),
From cc85cb96e2e4489826e372cde645b7823c435de0 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski
Date: Tue, 2 Sep 2025 13:59:25 +0200
Subject: [PATCH 1126/3206] pinctrl: qcom: make the pinmuxing strict

The strict flag in struct pinmux_ops disallows using the same pin both as a GPIO and for another function. Without it, a rogue user-space process with enough privileges (or even a buggy driver) can request a used pin as a GPIO and drive it, potentially confusing devices or even crashing the system.

Set it globally for all pinctrl-msm users.

Reviewed-by: Konrad Dybcio
Tested-by: Neil Armstrong
Signed-off-by: Bartosz Golaszewski
Signed-off-by: Linus Walleij
---
 drivers/pinctrl/qcom/pinctrl-msm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index a5f69464827119..1751d838ce95d6 100644
--- a/drivers/pinctrl/qcom/pinctrl-msm.c
+++ b/drivers/pinctrl/qcom/pinctrl-msm.c
@@ -268,6 +268,7 @@ static const struct pinmux_ops msm_pinmux_ops = { .function_is_gpio = pinmux_generic_function_is_gpio, .gpio_request_enable = msm_pinmux_request_gpio, .set_mux = msm_pinmux_set_mux,
+	.strict = true,
 };
 static int msm_config_reg(struct msm_pinctrl *pctrl,
From 07333899650a5d4c785d3257ee4cd278006b1070 Mon Sep 17 00:00:00 2001
From: "Ivan T. Ivanov"
Date: Thu, 28 Aug 2025 14:47:38 +0200
Subject: [PATCH 1127/3206] dt-bindings: pinctrl: Add support for Broadcom STB pin controller

The STB pin controller represents a family whose silicon instances are found, for example, on the BCM2712 SoC. In particular, on the Raspberry Pi 5, there are two separate instantiations of the same IP block which differ in the number of associated pins and in the pinmux functions for each of those pins. The -aon- variant stands for 'Always On'.

Depending on the revision of the BCM2712 (C0 or D0), the pin controller instance has slight differences in the register layout.

Signed-off-by: Ivan T. Ivanov
Signed-off-by: Andrea della Porta
Reviewed-by: Rob Herring (Arm)
[linusw: Dropped extraneous label and fixed whitespace]
Signed-off-by: Linus Walleij
---
 .../pinctrl/brcm,bcm2712c0-pinctrl.yaml | 137 ++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/pinctrl/brcm,bcm2712c0-pinctrl.yaml

diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2712c0-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2712c0-pinctrl.yaml new file mode 100644 index 00000000000000..ae6c13a746b9c6
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2712c0-pinctrl.yaml
@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pinctrl/brcm,bcm2712c0-pinctrl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom STB family pin controller
+
+maintainers:
+  - Ivan T. Ivanov
+  - A. della Porta
+
+description: >
+  Broadcom's STB family of memory-mapped pin controllers.
+
+  This includes the pin controllers inside the BCM2712 SoC, which
+  are instances of the STB family. The SoC has two silicon variants,
+  C0 and D0, which differ slightly in terms of register layout.
+
+  The -aon- (Always On) variant is the same IP block but differs
+  in the number of associated pins and in the pinmux functions
+  for each of those pins.
+
+allOf:
+  - $ref: pinctrl.yaml#
+
+properties:
+  compatible:
+    enum:
+      - brcm,bcm2712c0-pinctrl
+      - brcm,bcm2712c0-aon-pinctrl
+      - brcm,bcm2712d0-pinctrl
+      - brcm,bcm2712d0-aon-pinctrl
+
+  reg:
+    maxItems: 1
+
+patternProperties:
+  '-state$':
+    oneOf:
+      - $ref: '#/$defs/brcmstb-pinctrl-state'
+      - patternProperties:
+          '-pins$':
+            $ref: '#/$defs/brcmstb-pinctrl-state'
+        additionalProperties: false
+
+$defs:
+  brcmstb-pinctrl-state:
+    allOf:
+      - $ref: pincfg-node.yaml#
+      - $ref: pinmux-node.yaml#
+
+    description: >
+      Pin controller client devices use pin configuration subnodes (children
+      and grandchildren) for desired pin configuration.
+
+      Client device subnodes use the standard properties below.
+
+    properties:
+      pins:
+        description:
+          List of gpio pins affected by the properties specified in this
+          subnode (either this or "groups" must be specified).
+        items:
+          pattern: '^((aon_)?s?gpio[0-6]?[0-9])|(emmc_(clk|cmd|dat[0-7]|ds))$'
+
+      function:
+        description:
+          Specify the alternative function to be configured for the specified
+          pins.
+        enum: [ gpio, alt1, alt2, alt3, alt4, alt5, alt6, alt7, alt8,
+                aon_cpu_standbyb, aon_fp_4sec_resetb, aon_gpclk, aon_pwm,
+                arm_jtag, aud_fs_clk0, avs_pmu_bsc, bsc_m0, bsc_m1, bsc_m2,
+                bsc_m3, clk_observe, ctl_hdmi_5v, enet0, enet0_mii, enet0_rgmii,
+                ext_sc_clk, fl0, fl1, gpclk0, gpclk1, gpclk2, hdmi_tx0_auto_i2c,
+                hdmi_tx0_bsc, hdmi_tx1_auto_i2c, hdmi_tx1_bsc, i2s_in, i2s_out,
+                ir_in, mtsif, mtsif_alt, mtsif_alt1, pdm, pkt, pm_led_out, sc0,
+                sd0, sd2, sd_card_a, sd_card_b, sd_card_c, sd_card_d, sd_card_e,
+                sd_card_f, sd_card_g, spdif_out, spi_m, spi_s, sr_edm_sense, te0,
+                te1, tsio, uart0, uart1, uart2, usb_pwr, usb_vbus, uui, vc_i2c0,
+                vc_i2c3, vc_i2c4, vc_i2c5, vc_i2csl, vc_pcm, vc_pwm0, vc_pwm1,
+                vc_spi0, vc_spi3, vc_spi4, vc_spi5, vc_uart0, vc_uart2, vc_uart3,
+                vc_uart4 ]
+
+      bias-disable: true
+      bias-pull-down: true
+      bias-pull-up: true
+
+    required:
+      - pins
+
+    if:
+      properties:
+        pins:
+          not:
+            contains:
+              pattern: "^emmc_(clk|cmd|dat[0-7]|ds)$"
+    then:
+      required:
+        - function
+    else:
+      properties:
+        function: false
+
+    additionalProperties: false
+
+required:
+  - compatible
+  - reg
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    pinctrl@7d504100 {
+        compatible = "brcm,bcm2712c0-pinctrl";
+        reg = <0x7d504100 0x30>;
+
+        bt-shutdown-default-state {
+            function = "gpio";
+            pins = "gpio29";
+        };
+
+        uarta-default-state {
+            rts-tx-pins {
+                function = "uart0";
+                pins = "gpio24", "gpio26";
+                bias-disable;
+            };
+
+            cts-rx-pins {
+                function = "uart0";
+                pins = "gpio25", "gpio27";
+                bias-pull-up;
+            };
+        };
+    };
From 657cbf9b24ba9bd35acd86160f115bfd93038229 Mon Sep 17 00:00:00 2001
From: "Ivan T. Ivanov"
Date: Thu, 28 Aug 2025 14:47:39 +0200
Subject: [PATCH 1128/3206] pinctrl: bcm: Add STB family pin controller driver

This driver provides pin muxing and configuration functionality for the BCM2712 SoC used by the Raspberry Pi 5. According to [1], this chip is an instance of the one used in the Broadcom STB product line.

[1] https://lore.kernel.org/lkml/f6601f73-cb22-4ba3-88c5-241be8421fc3@broadcom.com/

Cc: Jonathan Bell
Cc: Phil Elwell
Signed-off-by: Ivan T. Ivanov
Reviewed-by: Phil Elwell
Signed-off-by: Andrea della Porta
[linusw: Enable also for ARCH_BCM2835]
Signed-off-by: Linus Walleij
---
 drivers/pinctrl/bcm/Kconfig | 12 +
 drivers/pinctrl/bcm/Kconfig.stb | 10 +
 drivers/pinctrl/bcm/Makefile | 2 +
 drivers/pinctrl/bcm/pinctrl-brcmstb-bcm2712.c | 747 ++++++++++++++++++
 drivers/pinctrl/bcm/pinctrl-brcmstb.c | 442 +++++++++++
 drivers/pinctrl/bcm/pinctrl-brcmstb.h | 93 +++
 6 files changed, 1306 insertions(+)
 create mode 100644 drivers/pinctrl/bcm/Kconfig.stb
 create mode 100644 drivers/pinctrl/bcm/pinctrl-brcmstb-bcm2712.c
 create mode 100644 drivers/pinctrl/bcm/pinctrl-brcmstb.c
 create mode 100644 drivers/pinctrl/bcm/pinctrl-brcmstb.h

diff --git a/drivers/pinctrl/bcm/Kconfig b/drivers/pinctrl/bcm/Kconfig index 35b51ce4298e25..096d0778427e27 100644
--- a/drivers/pinctrl/bcm/Kconfig
+++ b/drivers/pinctrl/bcm/Kconfig
@@ -106,6 +106,18 @@ config PINCTRL_BCM63268 help Say Y here to enable the Broadcom BCM63268 GPIO driver.
+config PINCTRL_BRCMSTB
+	tristate "Broadcom STB product line pin controller driver"
+	depends on OF && (ARCH_BCM2835 || ARCH_BRCMSTB || COMPILE_TEST)
+	select PINMUX
+	select PINCONF
+	select GENERIC_PINCONF
+	help
+	  Enable pin muxing and configuration functionality
+	  for Broadcom STB product line chipsets.
+ +source "drivers/pinctrl/bcm/Kconfig.stb" + config PINCTRL_IPROC_GPIO bool "Broadcom iProc GPIO (with PINCONF) driver" depends on OF_GPIO && (ARCH_BCM_IPROC || COMPILE_TEST) diff --git a/drivers/pinctrl/bcm/Kconfig.stb b/drivers/pinctrl/bcm/Kconfig.stb new file mode 100644 index 00000000000000..c1ba361e1d78b0 --- /dev/null +++ b/drivers/pinctrl/bcm/Kconfig.stb @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +if PINCTRL_BRCMSTB + +config PINCTRL_BCM2712 + tristate "BCM2712 SoC pin controller driver" + help + Driver for BCM2712 integrated pin controller, + commonly found on Raspberry Pi 5. + +endif diff --git a/drivers/pinctrl/bcm/Makefile b/drivers/pinctrl/bcm/Makefile index 82b868ec14716d..482d769b1a81da 100644 --- a/drivers/pinctrl/bcm/Makefile +++ b/drivers/pinctrl/bcm/Makefile @@ -11,6 +11,8 @@ obj-$(CONFIG_PINCTRL_BCM6358) += pinctrl-bcm6358.o obj-$(CONFIG_PINCTRL_BCM6362) += pinctrl-bcm6362.o obj-$(CONFIG_PINCTRL_BCM6368) += pinctrl-bcm6368.o obj-$(CONFIG_PINCTRL_BCM63268) += pinctrl-bcm63268.o +obj-$(CONFIG_PINCTRL_BRCMSTB) += pinctrl-brcmstb.o +obj-$(CONFIG_PINCTRL_BCM2712) += pinctrl-brcmstb-bcm2712.o obj-$(CONFIG_PINCTRL_IPROC_GPIO) += pinctrl-iproc-gpio.o obj-$(CONFIG_PINCTRL_CYGNUS_MUX) += pinctrl-cygnus-mux.o obj-$(CONFIG_PINCTRL_NS) += pinctrl-ns.o diff --git a/drivers/pinctrl/bcm/pinctrl-brcmstb-bcm2712.c b/drivers/pinctrl/bcm/pinctrl-brcmstb-bcm2712.c new file mode 100644 index 00000000000000..752b78e2c0d8ce --- /dev/null +++ b/drivers/pinctrl/bcm/pinctrl-brcmstb-bcm2712.c @@ -0,0 +1,747 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Driver for Broadcom brcmstb GPIO units (pinctrl only) + * + * Copyright (C) 2024-2025 Ivan T. Ivanov, Andrea della Porta + * Copyright (C) 2021-3 Raspberry Pi Ltd. + * Copyright (C) 2012 Chris Boot, Simon Arlott, Stephen Warren + * + * Based heavily on the BCM2835 GPIO & pinctrl driver, which was inspired by: + * pinctrl-nomadik.c, please see original file for copyright information + * pinctrl-tegra.c, please see original file for copyright information + */ + +#include +#include +#include "pinctrl-brcmstb.h" + +#define BRCMSTB_FSEL_COUNT 8 +#define BRCMSTB_FSEL_MASK 0xf + +#define BRCMSTB_PIN(i, f1, f2, f3, f4, f5, f6, f7, f8) \ + [i] = { \ + .funcs = (u8[]) { \ + func_##f1, \ + func_##f2, \ + func_##f3, \ + func_##f4, \ + func_##f5, \ + func_##f6, \ + func_##f7, \ + func_##f8, \ + }, \ + .n_funcs = BRCMSTB_FSEL_COUNT, \ + .func_mask = BRCMSTB_FSEL_MASK, \ + } + +enum bcm2712_funcs { + func_gpio, + func_alt1, + func_alt2, + func_alt3, + func_alt4, + func_alt5, + func_alt6, + func_alt7, + func_alt8, + func_aon_cpu_standbyb, + func_aon_fp_4sec_resetb, + func_aon_gpclk, + func_aon_pwm, + func_arm_jtag, + func_aud_fs_clk0, + func_avs_pmu_bsc, + func_bsc_m0, + func_bsc_m1, + func_bsc_m2, + func_bsc_m3, + func_clk_observe, + func_ctl_hdmi_5v, + func_enet0, + func_enet0_mii, + func_enet0_rgmii, + func_ext_sc_clk, + func_fl0, + func_fl1, + func_gpclk0, + func_gpclk1, + func_gpclk2, + func_hdmi_tx0_auto_i2c, + func_hdmi_tx0_bsc, + func_hdmi_tx1_auto_i2c, + func_hdmi_tx1_bsc, + func_i2s_in, + func_i2s_out, + func_ir_in, + func_mtsif, + func_mtsif_alt, + func_mtsif_alt1, + func_pdm, + func_pkt, + func_pm_led_out, + func_sc0, + func_sd0, + func_sd2, + func_sd_card_a, + func_sd_card_b, + func_sd_card_c, + func_sd_card_d, + func_sd_card_e, + func_sd_card_f, + func_sd_card_g, + func_spdif_out, + func_spi_m, + func_spi_s, + func_sr_edm_sense, + func_te0, + func_te1, + func_tsio, + func_uart0, + func_uart1, + func_uart2, + func_usb_pwr, + 
func_usb_vbus, + func_uui, + func_vc_i2c0, + func_vc_i2c3, + func_vc_i2c4, + func_vc_i2c5, + func_vc_i2csl, + func_vc_pcm, + func_vc_pwm0, + func_vc_pwm1, + func_vc_spi0, + func_vc_spi3, + func_vc_spi4, + func_vc_spi5, + func_vc_uart0, + func_vc_uart2, + func_vc_uart3, + func_vc_uart4, + func__, + func_count = func__ +}; + +static const struct pin_regs bcm2712_c0_gpio_pin_regs[] = { + GPIO_REGS(0, 0, 0, 7, 7), + GPIO_REGS(1, 0, 1, 7, 8), + GPIO_REGS(2, 0, 2, 7, 9), + GPIO_REGS(3, 0, 3, 7, 10), + GPIO_REGS(4, 0, 4, 7, 11), + GPIO_REGS(5, 0, 5, 7, 12), + GPIO_REGS(6, 0, 6, 7, 13), + GPIO_REGS(7, 0, 7, 7, 14), + GPIO_REGS(8, 1, 0, 8, 0), + GPIO_REGS(9, 1, 1, 8, 1), + GPIO_REGS(10, 1, 2, 8, 2), + GPIO_REGS(11, 1, 3, 8, 3), + GPIO_REGS(12, 1, 4, 8, 4), + GPIO_REGS(13, 1, 5, 8, 5), + GPIO_REGS(14, 1, 6, 8, 6), + GPIO_REGS(15, 1, 7, 8, 7), + GPIO_REGS(16, 2, 0, 8, 8), + GPIO_REGS(17, 2, 1, 8, 9), + GPIO_REGS(18, 2, 2, 8, 10), + GPIO_REGS(19, 2, 3, 8, 11), + GPIO_REGS(20, 2, 4, 8, 12), + GPIO_REGS(21, 2, 5, 8, 13), + GPIO_REGS(22, 2, 6, 8, 14), + GPIO_REGS(23, 2, 7, 9, 0), + GPIO_REGS(24, 3, 0, 9, 1), + GPIO_REGS(25, 3, 1, 9, 2), + GPIO_REGS(26, 3, 2, 9, 3), + GPIO_REGS(27, 3, 3, 9, 4), + GPIO_REGS(28, 3, 4, 9, 5), + GPIO_REGS(29, 3, 5, 9, 6), + GPIO_REGS(30, 3, 6, 9, 7), + GPIO_REGS(31, 3, 7, 9, 8), + GPIO_REGS(32, 4, 0, 9, 9), + GPIO_REGS(33, 4, 1, 9, 10), + GPIO_REGS(34, 4, 2, 9, 11), + GPIO_REGS(35, 4, 3, 9, 12), + GPIO_REGS(36, 4, 4, 9, 13), + GPIO_REGS(37, 4, 5, 9, 14), + GPIO_REGS(38, 4, 6, 10, 0), + GPIO_REGS(39, 4, 7, 10, 1), + GPIO_REGS(40, 5, 0, 10, 2), + GPIO_REGS(41, 5, 1, 10, 3), + GPIO_REGS(42, 5, 2, 10, 4), + GPIO_REGS(43, 5, 3, 10, 5), + GPIO_REGS(44, 5, 4, 10, 6), + GPIO_REGS(45, 5, 5, 10, 7), + GPIO_REGS(46, 5, 6, 10, 8), + GPIO_REGS(47, 5, 7, 10, 9), + GPIO_REGS(48, 6, 0, 10, 10), + GPIO_REGS(49, 6, 1, 10, 11), + GPIO_REGS(50, 6, 2, 10, 12), + GPIO_REGS(51, 6, 3, 10, 13), + GPIO_REGS(52, 6, 4, 10, 14), + GPIO_REGS(53, 6, 5, 11, 0), + EMMC_REGS(54, 11, 1), /* EMMC_CMD */ + EMMC_REGS(55, 11, 2), /* EMMC_DS */ + EMMC_REGS(56, 11, 3), /* EMMC_CLK */ + EMMC_REGS(57, 11, 4), /* EMMC_DAT0 */ + EMMC_REGS(58, 11, 5), /* EMMC_DAT1 */ + EMMC_REGS(59, 11, 6), /* EMMC_DAT2 */ + EMMC_REGS(60, 11, 7), /* EMMC_DAT3 */ + EMMC_REGS(61, 11, 8), /* EMMC_DAT4 */ + EMMC_REGS(62, 11, 9), /* EMMC_DAT5 */ + EMMC_REGS(63, 11, 10), /* EMMC_DAT6 */ + EMMC_REGS(64, 11, 11), /* EMMC_DAT7 */ +}; + +static struct pin_regs bcm2712_c0_aon_gpio_pin_regs[] = { + AON_GPIO_REGS(0, 3, 0, 6, 10), + AON_GPIO_REGS(1, 3, 1, 6, 11), + AON_GPIO_REGS(2, 3, 2, 6, 12), + AON_GPIO_REGS(3, 3, 3, 6, 13), + AON_GPIO_REGS(4, 3, 4, 6, 14), + AON_GPIO_REGS(5, 3, 5, 7, 0), + AON_GPIO_REGS(6, 3, 6, 7, 1), + AON_GPIO_REGS(7, 3, 7, 7, 2), + AON_GPIO_REGS(8, 4, 0, 7, 3), + AON_GPIO_REGS(9, 4, 1, 7, 4), + AON_GPIO_REGS(10, 4, 2, 7, 5), + AON_GPIO_REGS(11, 4, 3, 7, 6), + AON_GPIO_REGS(12, 4, 4, 7, 7), + AON_GPIO_REGS(13, 4, 5, 7, 8), + AON_GPIO_REGS(14, 4, 6, 7, 9), + AON_GPIO_REGS(15, 4, 7, 7, 10), + AON_GPIO_REGS(16, 5, 0, 7, 11), + AON_SGPIO_REGS(0, 0, 0), + AON_SGPIO_REGS(1, 0, 1), + AON_SGPIO_REGS(2, 0, 2), + AON_SGPIO_REGS(3, 0, 3), + AON_SGPIO_REGS(4, 1, 0), + AON_SGPIO_REGS(5, 2, 0), +}; + +static const struct pinctrl_pin_desc bcm2712_c0_gpio_pins[] = { + GPIO_PIN(0), + GPIO_PIN(1), + GPIO_PIN(2), + GPIO_PIN(3), + GPIO_PIN(4), + GPIO_PIN(5), + GPIO_PIN(6), + GPIO_PIN(7), + GPIO_PIN(8), + GPIO_PIN(9), + GPIO_PIN(10), + GPIO_PIN(11), + GPIO_PIN(12), + GPIO_PIN(13), + GPIO_PIN(14), + GPIO_PIN(15), + GPIO_PIN(16), + GPIO_PIN(17), + 
GPIO_PIN(18), + GPIO_PIN(19), + GPIO_PIN(20), + GPIO_PIN(21), + GPIO_PIN(22), + GPIO_PIN(23), + GPIO_PIN(24), + GPIO_PIN(25), + GPIO_PIN(26), + GPIO_PIN(27), + GPIO_PIN(28), + GPIO_PIN(29), + GPIO_PIN(30), + GPIO_PIN(31), + GPIO_PIN(32), + GPIO_PIN(33), + GPIO_PIN(34), + GPIO_PIN(35), + GPIO_PIN(36), + GPIO_PIN(37), + GPIO_PIN(38), + GPIO_PIN(39), + GPIO_PIN(40), + GPIO_PIN(41), + GPIO_PIN(42), + GPIO_PIN(43), + GPIO_PIN(44), + GPIO_PIN(45), + GPIO_PIN(46), + GPIO_PIN(47), + GPIO_PIN(48), + GPIO_PIN(49), + GPIO_PIN(50), + GPIO_PIN(51), + GPIO_PIN(52), + GPIO_PIN(53), + PINCTRL_PIN(54, "emmc_cmd"), + PINCTRL_PIN(55, "emmc_ds"), + PINCTRL_PIN(56, "emmc_clk"), + PINCTRL_PIN(57, "emmc_dat0"), + PINCTRL_PIN(58, "emmc_dat1"), + PINCTRL_PIN(59, "emmc_dat2"), + PINCTRL_PIN(60, "emmc_dat3"), + PINCTRL_PIN(61, "emmc_dat4"), + PINCTRL_PIN(62, "emmc_dat5"), + PINCTRL_PIN(63, "emmc_dat6"), + PINCTRL_PIN(64, "emmc_dat7"), +}; + +static struct pinctrl_pin_desc bcm2712_c0_aon_gpio_pins[] = { + AON_GPIO_PIN(0), AON_GPIO_PIN(1), AON_GPIO_PIN(2), AON_GPIO_PIN(3), + AON_GPIO_PIN(4), AON_GPIO_PIN(5), AON_GPIO_PIN(6), AON_GPIO_PIN(7), + AON_GPIO_PIN(8), AON_GPIO_PIN(9), AON_GPIO_PIN(10), AON_GPIO_PIN(11), + AON_GPIO_PIN(12), AON_GPIO_PIN(13), AON_GPIO_PIN(14), AON_GPIO_PIN(15), + AON_GPIO_PIN(16), AON_SGPIO_PIN(0), AON_SGPIO_PIN(1), AON_SGPIO_PIN(2), + AON_SGPIO_PIN(3), AON_SGPIO_PIN(4), AON_SGPIO_PIN(5), +}; + +static const struct pin_regs bcm2712_d0_gpio_pin_regs[] = { + GPIO_REGS(1, 0, 0, 4, 5), + GPIO_REGS(2, 0, 1, 4, 6), + GPIO_REGS(3, 0, 2, 4, 7), + GPIO_REGS(4, 0, 3, 4, 8), + GPIO_REGS(10, 0, 4, 4, 9), + GPIO_REGS(11, 0, 5, 4, 10), + GPIO_REGS(12, 0, 6, 4, 11), + GPIO_REGS(13, 0, 7, 4, 12), + GPIO_REGS(14, 1, 0, 4, 13), + GPIO_REGS(15, 1, 1, 4, 14), + GPIO_REGS(18, 1, 2, 5, 0), + GPIO_REGS(19, 1, 3, 5, 1), + GPIO_REGS(20, 1, 4, 5, 2), + GPIO_REGS(21, 1, 5, 5, 3), + GPIO_REGS(22, 1, 6, 5, 4), + GPIO_REGS(23, 1, 7, 5, 5), + GPIO_REGS(24, 2, 0, 5, 6), + GPIO_REGS(25, 2, 1, 5, 7), + GPIO_REGS(26, 2, 2, 5, 8), + GPIO_REGS(27, 2, 3, 5, 9), + GPIO_REGS(28, 2, 4, 5, 10), + GPIO_REGS(29, 2, 5, 5, 11), + GPIO_REGS(30, 2, 6, 5, 12), + GPIO_REGS(31, 2, 7, 5, 13), + GPIO_REGS(32, 3, 0, 5, 14), + GPIO_REGS(33, 3, 1, 6, 0), + GPIO_REGS(34, 3, 2, 6, 1), + GPIO_REGS(35, 3, 3, 6, 2), + EMMC_REGS(36, 6, 3), /* EMMC_CMD */ + EMMC_REGS(37, 6, 4), /* EMMC_DS */ + EMMC_REGS(38, 6, 5), /* EMMC_CLK */ + EMMC_REGS(39, 6, 6), /* EMMC_DAT0 */ + EMMC_REGS(40, 6, 7), /* EMMC_DAT1 */ + EMMC_REGS(41, 6, 8), /* EMMC_DAT2 */ + EMMC_REGS(42, 6, 9), /* EMMC_DAT3 */ + EMMC_REGS(43, 6, 10), /* EMMC_DAT4 */ + EMMC_REGS(44, 6, 11), /* EMMC_DAT5 */ + EMMC_REGS(45, 6, 12), /* EMMC_DAT6 */ + EMMC_REGS(46, 6, 13), /* EMMC_DAT7 */ +}; + +static struct pin_regs bcm2712_d0_aon_gpio_pin_regs[] = { + AON_GPIO_REGS(0, 3, 0, 5, 9), + AON_GPIO_REGS(1, 3, 1, 5, 10), + AON_GPIO_REGS(2, 3, 2, 5, 11), + AON_GPIO_REGS(3, 3, 3, 5, 12), + AON_GPIO_REGS(4, 3, 4, 5, 13), + AON_GPIO_REGS(5, 3, 5, 5, 14), + AON_GPIO_REGS(6, 3, 6, 6, 0), + AON_GPIO_REGS(8, 3, 7, 6, 1), + AON_GPIO_REGS(9, 4, 0, 6, 2), + AON_GPIO_REGS(12, 4, 1, 6, 3), + AON_GPIO_REGS(13, 4, 2, 6, 4), + AON_GPIO_REGS(14, 4, 3, 6, 5), + AON_SGPIO_REGS(0, 0, 0), + AON_SGPIO_REGS(1, 0, 1), + AON_SGPIO_REGS(2, 0, 2), + AON_SGPIO_REGS(3, 0, 3), + AON_SGPIO_REGS(4, 1, 0), + AON_SGPIO_REGS(5, 2, 0), +}; + +static const struct pinctrl_pin_desc bcm2712_d0_gpio_pins[] = { + GPIO_PIN(1), + GPIO_PIN(2), + GPIO_PIN(3), + GPIO_PIN(4), + GPIO_PIN(10), + GPIO_PIN(11), + GPIO_PIN(12), + GPIO_PIN(13), + GPIO_PIN(14), + 
GPIO_PIN(15), + GPIO_PIN(18), + GPIO_PIN(19), + GPIO_PIN(20), + GPIO_PIN(21), + GPIO_PIN(22), + GPIO_PIN(23), + GPIO_PIN(24), + GPIO_PIN(25), + GPIO_PIN(26), + GPIO_PIN(27), + GPIO_PIN(28), + GPIO_PIN(29), + GPIO_PIN(30), + GPIO_PIN(31), + GPIO_PIN(32), + GPIO_PIN(33), + GPIO_PIN(34), + GPIO_PIN(35), + PINCTRL_PIN(36, "emmc_cmd"), + PINCTRL_PIN(37, "emmc_ds"), + PINCTRL_PIN(38, "emmc_clk"), + PINCTRL_PIN(39, "emmc_dat0"), + PINCTRL_PIN(40, "emmc_dat1"), + PINCTRL_PIN(41, "emmc_dat2"), + PINCTRL_PIN(42, "emmc_dat3"), + PINCTRL_PIN(43, "emmc_dat4"), + PINCTRL_PIN(44, "emmc_dat5"), + PINCTRL_PIN(45, "emmc_dat6"), + PINCTRL_PIN(46, "emmc_dat7"), +}; + +static struct pinctrl_pin_desc bcm2712_d0_aon_gpio_pins[] = { + AON_GPIO_PIN(0), AON_GPIO_PIN(1), AON_GPIO_PIN(2), AON_GPIO_PIN(3), + AON_GPIO_PIN(4), AON_GPIO_PIN(5), AON_GPIO_PIN(6), AON_GPIO_PIN(8), + AON_GPIO_PIN(9), AON_GPIO_PIN(12), AON_GPIO_PIN(13), AON_GPIO_PIN(14), + AON_SGPIO_PIN(0), AON_SGPIO_PIN(1), AON_SGPIO_PIN(2), + AON_SGPIO_PIN(3), AON_SGPIO_PIN(4), AON_SGPIO_PIN(5), +}; + +static const char * const bcm2712_func_names[] = { + BRCMSTB_FUNC(gpio), + BRCMSTB_FUNC(alt1), + BRCMSTB_FUNC(alt2), + BRCMSTB_FUNC(alt3), + BRCMSTB_FUNC(alt4), + BRCMSTB_FUNC(alt5), + BRCMSTB_FUNC(alt6), + BRCMSTB_FUNC(alt7), + BRCMSTB_FUNC(alt8), + BRCMSTB_FUNC(aon_cpu_standbyb), + BRCMSTB_FUNC(aon_fp_4sec_resetb), + BRCMSTB_FUNC(aon_gpclk), + BRCMSTB_FUNC(aon_pwm), + BRCMSTB_FUNC(arm_jtag), + BRCMSTB_FUNC(aud_fs_clk0), + BRCMSTB_FUNC(avs_pmu_bsc), + BRCMSTB_FUNC(bsc_m0), + BRCMSTB_FUNC(bsc_m1), + BRCMSTB_FUNC(bsc_m2), + BRCMSTB_FUNC(bsc_m3), + BRCMSTB_FUNC(clk_observe), + BRCMSTB_FUNC(ctl_hdmi_5v), + BRCMSTB_FUNC(enet0), + BRCMSTB_FUNC(enet0_mii), + BRCMSTB_FUNC(enet0_rgmii), + BRCMSTB_FUNC(ext_sc_clk), + BRCMSTB_FUNC(fl0), + BRCMSTB_FUNC(fl1), + BRCMSTB_FUNC(gpclk0), + BRCMSTB_FUNC(gpclk1), + BRCMSTB_FUNC(gpclk2), + BRCMSTB_FUNC(hdmi_tx0_auto_i2c), + BRCMSTB_FUNC(hdmi_tx0_bsc), + BRCMSTB_FUNC(hdmi_tx1_auto_i2c), + BRCMSTB_FUNC(hdmi_tx1_bsc), + BRCMSTB_FUNC(i2s_in), + BRCMSTB_FUNC(i2s_out), + BRCMSTB_FUNC(ir_in), + BRCMSTB_FUNC(mtsif), + BRCMSTB_FUNC(mtsif_alt), + BRCMSTB_FUNC(mtsif_alt1), + BRCMSTB_FUNC(pdm), + BRCMSTB_FUNC(pkt), + BRCMSTB_FUNC(pm_led_out), + BRCMSTB_FUNC(sc0), + BRCMSTB_FUNC(sd0), + BRCMSTB_FUNC(sd2), + BRCMSTB_FUNC(sd_card_a), + BRCMSTB_FUNC(sd_card_b), + BRCMSTB_FUNC(sd_card_c), + BRCMSTB_FUNC(sd_card_d), + BRCMSTB_FUNC(sd_card_e), + BRCMSTB_FUNC(sd_card_f), + BRCMSTB_FUNC(sd_card_g), + BRCMSTB_FUNC(spdif_out), + BRCMSTB_FUNC(spi_m), + BRCMSTB_FUNC(spi_s), + BRCMSTB_FUNC(sr_edm_sense), + BRCMSTB_FUNC(te0), + BRCMSTB_FUNC(te1), + BRCMSTB_FUNC(tsio), + BRCMSTB_FUNC(uart0), + BRCMSTB_FUNC(uart1), + BRCMSTB_FUNC(uart2), + BRCMSTB_FUNC(usb_pwr), + BRCMSTB_FUNC(usb_vbus), + BRCMSTB_FUNC(uui), + BRCMSTB_FUNC(vc_i2c0), + BRCMSTB_FUNC(vc_i2c3), + BRCMSTB_FUNC(vc_i2c4), + BRCMSTB_FUNC(vc_i2c5), + BRCMSTB_FUNC(vc_i2csl), + BRCMSTB_FUNC(vc_pcm), + BRCMSTB_FUNC(vc_pwm0), + BRCMSTB_FUNC(vc_pwm1), + BRCMSTB_FUNC(vc_spi0), + BRCMSTB_FUNC(vc_spi3), + BRCMSTB_FUNC(vc_spi4), + BRCMSTB_FUNC(vc_spi5), + BRCMSTB_FUNC(vc_uart0), + BRCMSTB_FUNC(vc_uart2), + BRCMSTB_FUNC(vc_uart3), + BRCMSTB_FUNC(vc_uart4), +}; + +static const struct brcmstb_pin_funcs bcm2712_c0_aon_gpio_pin_funcs[] = { + BRCMSTB_PIN(0, ir_in, vc_spi0, vc_uart3, vc_i2c3, te0, vc_i2c0, _, _), + BRCMSTB_PIN(1, vc_pwm0, vc_spi0, vc_uart3, vc_i2c3, te1, aon_pwm, vc_i2c0, vc_pwm1), + BRCMSTB_PIN(2, vc_pwm0, vc_spi0, vc_uart3, ctl_hdmi_5v, fl0, aon_pwm, ir_in, vc_pwm1), + BRCMSTB_PIN(3, ir_in, 
vc_spi0, vc_uart3, aon_fp_4sec_resetb, fl1, sd_card_g, aon_gpclk, _), + BRCMSTB_PIN(4, gpclk0, vc_spi0, vc_i2csl, aon_gpclk, pm_led_out, aon_pwm, sd_card_g, vc_pwm0), + BRCMSTB_PIN(5, gpclk1, ir_in, vc_i2csl, clk_observe, aon_pwm, sd_card_g, vc_pwm0, _), + BRCMSTB_PIN(6, uart1, vc_uart4, gpclk2, ctl_hdmi_5v, vc_uart0, vc_spi3, _, _), + BRCMSTB_PIN(7, uart1, vc_uart4, gpclk0, aon_pwm, vc_uart0, vc_spi3, _, _), + BRCMSTB_PIN(8, uart1, vc_uart4, vc_i2csl, ctl_hdmi_5v, vc_uart0, vc_spi3, _, _), + BRCMSTB_PIN(9, uart1, vc_uart4, vc_i2csl, aon_pwm, vc_uart0, vc_spi3, _, _), + BRCMSTB_PIN(10, tsio, ctl_hdmi_5v, sc0, spdif_out, vc_spi5, usb_pwr, aon_gpclk, sd_card_f), + BRCMSTB_PIN(11, tsio, uart0, sc0, aud_fs_clk0, vc_spi5, usb_vbus, vc_uart2, sd_card_f), + BRCMSTB_PIN(12, tsio, uart0, vc_uart0, tsio, vc_spi5, usb_pwr, vc_uart2, sd_card_f), + BRCMSTB_PIN(13, bsc_m1, uart0, vc_uart0, uui, vc_spi5, arm_jtag, vc_uart2, vc_i2c3), + BRCMSTB_PIN(14, bsc_m1, uart0, vc_uart0, uui, vc_spi5, arm_jtag, vc_uart2, vc_i2c3), + BRCMSTB_PIN(15, ir_in, aon_fp_4sec_resetb, vc_uart0, pm_led_out, ctl_hdmi_5v, aon_pwm, aon_gpclk, _), + BRCMSTB_PIN(16, aon_cpu_standbyb, gpclk0, pm_led_out, ctl_hdmi_5v, vc_pwm0, usb_pwr, aud_fs_clk0, _), +}; + +static const struct brcmstb_pin_funcs bcm2712_c0_gpio_pin_funcs[] = { + BRCMSTB_PIN(0, bsc_m3, vc_i2c0, gpclk0, enet0, vc_pwm1, vc_spi0, ir_in, _), + BRCMSTB_PIN(1, bsc_m3, vc_i2c0, gpclk1, enet0, vc_pwm1, sr_edm_sense, vc_spi0, vc_uart3), + BRCMSTB_PIN(2, pdm, i2s_in, gpclk2, vc_spi4, pkt, vc_spi0, vc_uart3, _), + BRCMSTB_PIN(3, pdm, i2s_in, vc_spi4, pkt, vc_spi0, vc_uart3, _, _), + BRCMSTB_PIN(4, pdm, i2s_in, arm_jtag, vc_spi4, pkt, vc_spi0, vc_uart3, _), + BRCMSTB_PIN(5, pdm, vc_i2c3, arm_jtag, sd_card_e, vc_spi4, pkt, vc_pcm, vc_i2c5), + BRCMSTB_PIN(6, pdm, vc_i2c3, arm_jtag, sd_card_e, vc_spi4, pkt, vc_pcm, vc_i2c5), + BRCMSTB_PIN(7, i2s_out, spdif_out, arm_jtag, sd_card_e, vc_i2c3, enet0_rgmii, vc_pcm, vc_spi4), + BRCMSTB_PIN(8, i2s_out, aud_fs_clk0, arm_jtag, sd_card_e, vc_i2c3, enet0_mii, vc_pcm, vc_spi4), + BRCMSTB_PIN(9, i2s_out, aud_fs_clk0, arm_jtag, sd_card_e, enet0_mii, sd_card_c, vc_spi4, _), + BRCMSTB_PIN(10, bsc_m3, mtsif_alt1, i2s_in, i2s_out, vc_spi5, enet0_mii, sd_card_c, vc_spi4), + BRCMSTB_PIN(11, bsc_m3, mtsif_alt1, i2s_in, i2s_out, vc_spi5, enet0_mii, sd_card_c, vc_spi4), + BRCMSTB_PIN(12, spi_s, mtsif_alt1, i2s_in, i2s_out, vc_spi5, vc_i2csl, sd0, sd_card_d), + BRCMSTB_PIN(13, spi_s, mtsif_alt1, i2s_out, usb_vbus, vc_spi5, vc_i2csl, sd0, sd_card_d), + BRCMSTB_PIN(14, spi_s, vc_i2csl, enet0_rgmii, arm_jtag, vc_spi5, vc_pwm0, vc_i2c4, sd_card_d), + BRCMSTB_PIN(15, spi_s, vc_i2csl, vc_spi3, arm_jtag, vc_pwm0, vc_i2c4, gpclk0, _), + BRCMSTB_PIN(16, sd_card_b, i2s_out, vc_spi3, i2s_in, sd0, enet0_rgmii, gpclk1, _), + BRCMSTB_PIN(17, sd_card_b, i2s_out, vc_spi3, i2s_in, ext_sc_clk, sd0, enet0_rgmii, gpclk2), + BRCMSTB_PIN(18, sd_card_b, i2s_out, vc_spi3, i2s_in, sd0, enet0_rgmii, vc_pwm1, _), + BRCMSTB_PIN(19, sd_card_b, usb_pwr, vc_spi3, pkt, spdif_out, sd0, ir_in, vc_pwm1), + BRCMSTB_PIN(20, sd_card_b, uui, vc_uart0, arm_jtag, uart2, usb_pwr, vc_pcm, vc_uart4), + BRCMSTB_PIN(21, usb_pwr, uui, vc_uart0, arm_jtag, uart2, sd_card_b, vc_pcm, vc_uart4), + BRCMSTB_PIN(22, usb_pwr, enet0, vc_uart0, mtsif, uart2, usb_vbus, vc_pcm, vc_i2c5), + BRCMSTB_PIN(23, usb_vbus, enet0, vc_uart0, mtsif, uart2, i2s_out, vc_pcm, vc_i2c5), + BRCMSTB_PIN(24, mtsif, pkt, uart0, enet0_rgmii, enet0_rgmii, vc_i2c4, vc_uart3, _), + BRCMSTB_PIN(25, mtsif, pkt, sc0, uart0, enet0_rgmii, 
enet0_rgmii, vc_i2c4, vc_uart3), + BRCMSTB_PIN(26, mtsif, pkt, sc0, uart0, enet0_rgmii, vc_uart4, vc_spi5, _), + BRCMSTB_PIN(27, mtsif, pkt, sc0, uart0, enet0_rgmii, vc_uart4, vc_spi5, _), + BRCMSTB_PIN(28, mtsif, pkt, sc0, enet0_rgmii, vc_uart4, vc_spi5, _, _), + BRCMSTB_PIN(29, mtsif, pkt, sc0, enet0_rgmii, vc_uart4, vc_spi5, _, _), + BRCMSTB_PIN(30, mtsif, pkt, sc0, sd2, enet0_rgmii, gpclk0, vc_pwm0, _), + BRCMSTB_PIN(31, mtsif, pkt, sc0, sd2, enet0_rgmii, vc_spi3, vc_pwm0, _), + BRCMSTB_PIN(32, mtsif, pkt, sc0, sd2, enet0_rgmii, vc_spi3, vc_uart3, _), + BRCMSTB_PIN(33, mtsif, pkt, sd2, enet0_rgmii, vc_spi3, vc_uart3, _, _), + BRCMSTB_PIN(34, mtsif, pkt, ext_sc_clk, sd2, enet0_rgmii, vc_spi3, vc_i2c5, _), + BRCMSTB_PIN(35, mtsif, pkt, sd2, enet0_rgmii, vc_spi3, vc_i2c5, _, _), + BRCMSTB_PIN(36, sd0, mtsif, sc0, i2s_in, vc_uart3, vc_uart2, _, _), + BRCMSTB_PIN(37, sd0, mtsif, sc0, vc_spi0, i2s_in, vc_uart3, vc_uart2, _), + BRCMSTB_PIN(38, sd0, mtsif_alt, sc0, vc_spi0, i2s_in, vc_uart3, vc_uart2, _), + BRCMSTB_PIN(39, sd0, mtsif_alt, sc0, vc_spi0, vc_uart3, vc_uart2, _, _), + BRCMSTB_PIN(40, sd0, mtsif_alt, sc0, vc_spi0, bsc_m3, _, _, _), + BRCMSTB_PIN(41, sd0, mtsif_alt, sc0, vc_spi0, bsc_m3, _, _, _), + BRCMSTB_PIN(42, vc_spi0, mtsif_alt, vc_i2c0, sd_card_a, mtsif_alt1, arm_jtag, pdm, spi_m), + BRCMSTB_PIN(43, vc_spi0, mtsif_alt, vc_i2c0, sd_card_a, mtsif_alt1, arm_jtag, pdm, spi_m), + BRCMSTB_PIN(44, vc_spi0, mtsif_alt, enet0, sd_card_a, mtsif_alt1, arm_jtag, pdm, spi_m), + BRCMSTB_PIN(45, vc_spi0, mtsif_alt, enet0, sd_card_a, mtsif_alt1, arm_jtag, pdm, spi_m), + BRCMSTB_PIN(46, vc_spi0, mtsif_alt, sd_card_a, mtsif_alt1, arm_jtag, pdm, spi_m, _), + BRCMSTB_PIN(47, enet0, mtsif_alt, i2s_out, mtsif_alt1, arm_jtag, _, _, _), + BRCMSTB_PIN(48, sc0, usb_pwr, spdif_out, mtsif, _, _, _, _), + BRCMSTB_PIN(49, sc0, usb_pwr, aud_fs_clk0, mtsif, _, _, _, _), + BRCMSTB_PIN(50, sc0, usb_vbus, sc0, _, _, _, _, _), + BRCMSTB_PIN(51, sc0, enet0, sc0, sr_edm_sense, _, _, _, _), + BRCMSTB_PIN(52, sc0, enet0, vc_pwm1, _, _, _, _, _), + BRCMSTB_PIN(53, sc0, enet0_rgmii, ext_sc_clk, _, _, _, _, _), +}; + +static const struct brcmstb_pin_funcs bcm2712_d0_aon_gpio_pin_funcs[] = { + BRCMSTB_PIN(0, ir_in, vc_spi0, vc_uart0, vc_i2c3, uart0, vc_i2c0, _, _), + BRCMSTB_PIN(1, vc_pwm0, vc_spi0, vc_uart0, vc_i2c3, uart0, aon_pwm, vc_i2c0, vc_pwm1), + BRCMSTB_PIN(2, vc_pwm0, vc_spi0, vc_uart0, ctl_hdmi_5v, uart0, aon_pwm, ir_in, vc_pwm1), + BRCMSTB_PIN(3, ir_in, vc_spi0, vc_uart0, uart0, sd_card_g, aon_gpclk, _, _), + BRCMSTB_PIN(4, gpclk0, vc_spi0, pm_led_out, aon_pwm, sd_card_g, vc_pwm0, _, _), + BRCMSTB_PIN(5, gpclk1, ir_in, aon_pwm, sd_card_g, vc_pwm0, _, _, _), + BRCMSTB_PIN(6, uart1, vc_uart2, ctl_hdmi_5v, gpclk2, vc_spi3, _, _, _), + BRCMSTB_PIN(7, _, _, _, _, _, _, _, _), /* non-existent on D0 silicon */ + BRCMSTB_PIN(8, uart1, vc_uart2, ctl_hdmi_5v, vc_spi0, vc_spi3, _, _, _), + BRCMSTB_PIN(9, uart1, vc_uart2, vc_uart0, aon_pwm, vc_spi0, vc_uart2, vc_spi3, _), + BRCMSTB_PIN(10, _, _, _, _, _, _, _, _), /* non-existent on D0 silicon */ + BRCMSTB_PIN(11, _, _, _, _, _, _, _, _), /* non-existent on D0 silicon */ + BRCMSTB_PIN(12, uart1, vc_uart2, vc_uart0, vc_spi0, usb_pwr, vc_uart2, vc_spi3, _), + BRCMSTB_PIN(13, bsc_m1, vc_uart0, uui, vc_spi0, arm_jtag, vc_uart2, vc_i2c3, _), + BRCMSTB_PIN(14, bsc_m1, aon_gpclk, vc_uart0, uui, vc_spi0, arm_jtag, vc_uart2, vc_i2c3), +}; + +static const struct brcmstb_pin_funcs bcm2712_d0_gpio_pin_funcs[] = { + BRCMSTB_PIN(1, vc_i2c0, usb_pwr, gpclk0, sd_card_e, vc_spi3, 
sr_edm_sense, vc_spi0, vc_uart0), + BRCMSTB_PIN(2, vc_i2c0, usb_pwr, gpclk1, sd_card_e, vc_spi3, clk_observe, vc_spi0, vc_uart0), + BRCMSTB_PIN(3, vc_i2c3, usb_vbus, gpclk2, sd_card_e, vc_spi3, vc_spi0, vc_uart0, _), + BRCMSTB_PIN(4, vc_i2c3, vc_pwm1, vc_spi3, sd_card_e, vc_spi3, vc_spi0, vc_uart0, _), + BRCMSTB_PIN(10, bsc_m3, vc_pwm1, vc_spi3, sd_card_e, vc_spi3, gpclk0, _, _), + BRCMSTB_PIN(11, bsc_m3, vc_spi3, clk_observe, sd_card_c, gpclk1, _, _, _), + BRCMSTB_PIN(12, spi_s, vc_spi3, sd_card_c, sd_card_d, _, _, _, _), + BRCMSTB_PIN(13, spi_s, vc_spi3, sd_card_c, sd_card_d, _, _, _, _), + BRCMSTB_PIN(14, spi_s, uui, arm_jtag, vc_pwm0, vc_i2c0, sd_card_d, _, _), + BRCMSTB_PIN(15, spi_s, uui, arm_jtag, vc_pwm0, vc_i2c0, gpclk0, _, _), + BRCMSTB_PIN(18, sd_card_f, vc_pwm1, _, _, _, _, _, _), + BRCMSTB_PIN(19, sd_card_f, usb_pwr, vc_pwm1, _, _, _, _, _), + BRCMSTB_PIN(20, vc_i2c3, uui, vc_uart0, arm_jtag, vc_uart2, _, _, _), + BRCMSTB_PIN(21, vc_i2c3, uui, vc_uart0, arm_jtag, vc_uart2, _, _, _), + BRCMSTB_PIN(22, sd_card_f, vc_uart0, vc_i2c3, _, _, _, _, _), + BRCMSTB_PIN(23, vc_uart0, vc_i2c3, _, _, _, _, _, _), + BRCMSTB_PIN(24, sd_card_b, vc_spi0, arm_jtag, uart0, usb_pwr, vc_uart2, vc_uart0, _), + BRCMSTB_PIN(25, sd_card_b, vc_spi0, arm_jtag, uart0, usb_pwr, vc_uart2, vc_uart0, _), + BRCMSTB_PIN(26, sd_card_b, vc_spi0, arm_jtag, uart0, usb_vbus, vc_uart2, vc_spi0, _), + BRCMSTB_PIN(27, sd_card_b, vc_spi0, arm_jtag, uart0, vc_uart2, vc_spi0, _, _), + BRCMSTB_PIN(28, sd_card_b, vc_spi0, arm_jtag, vc_i2c0, vc_spi0, _, _, _), + BRCMSTB_PIN(29, arm_jtag, vc_i2c0, vc_spi0, _, _, _, _, _), + BRCMSTB_PIN(30, sd2, gpclk0, vc_pwm0, _, _, _, _, _), + BRCMSTB_PIN(31, sd2, vc_spi3, vc_pwm0, _, _, _, _, _), + BRCMSTB_PIN(32, sd2, vc_spi3, vc_uart3, _, _, _, _, _), + BRCMSTB_PIN(33, sd2, vc_spi3, vc_uart3, _, _, _, _, _), + BRCMSTB_PIN(34, sd2, vc_spi3, vc_i2c5, _, _, _, _, _), + BRCMSTB_PIN(35, sd2, vc_spi3, vc_i2c5, _, _, _, _, _), +}; + +static const struct pinctrl_desc bcm2712_c0_pinctrl_desc = { + .name = "pinctrl-bcm2712", + .pins = bcm2712_c0_gpio_pins, + .npins = ARRAY_SIZE(bcm2712_c0_gpio_pins), +}; + +static const struct pinctrl_desc bcm2712_c0_aon_pinctrl_desc = { + .name = "aon-pinctrl-bcm2712", + .pins = bcm2712_c0_aon_gpio_pins, + .npins = ARRAY_SIZE(bcm2712_c0_aon_gpio_pins), +}; + +static const struct pinctrl_desc bcm2712_d0_pinctrl_desc = { + .name = "pinctrl-bcm2712", + .pins = bcm2712_d0_gpio_pins, + .npins = ARRAY_SIZE(bcm2712_d0_gpio_pins), +}; + +static const struct pinctrl_desc bcm2712_d0_aon_pinctrl_desc = { + .name = "aon-pinctrl-bcm2712", + .pins = bcm2712_d0_aon_gpio_pins, + .npins = ARRAY_SIZE(bcm2712_d0_aon_gpio_pins), +}; + +static const struct pinctrl_gpio_range bcm2712_c0_pinctrl_gpio_range = { + .name = "pinctrl-bcm2712", + .npins = ARRAY_SIZE(bcm2712_c0_gpio_pins), +}; + +static const struct pinctrl_gpio_range bcm2712_c0_aon_pinctrl_gpio_range = { + .name = "aon-pinctrl-bcm2712", + .npins = ARRAY_SIZE(bcm2712_c0_aon_gpio_pins), +}; + +static const struct pinctrl_gpio_range bcm2712_d0_pinctrl_gpio_range = { + .name = "pinctrl-bcm2712", + .npins = ARRAY_SIZE(bcm2712_d0_gpio_pins), +}; + +static const struct pinctrl_gpio_range bcm2712_d0_aon_pinctrl_gpio_range = { + .name = "aon-pinctrl-bcm2712", + .npins = ARRAY_SIZE(bcm2712_d0_aon_gpio_pins), +}; + +static const struct brcmstb_pdata bcm2712_c0_pdata = { + .pctl_desc = &bcm2712_c0_pinctrl_desc, + .gpio_range = &bcm2712_c0_pinctrl_gpio_range, + .pin_regs = bcm2712_c0_gpio_pin_regs, + .pin_funcs = 
bcm2712_c0_gpio_pin_funcs, + .func_count = func_count, + .func_gpio = func_gpio, + .func_names = bcm2712_func_names, +}; + +static const struct brcmstb_pdata bcm2712_c0_aon_pdata = { + .pctl_desc = &bcm2712_c0_aon_pinctrl_desc, + .gpio_range = &bcm2712_c0_aon_pinctrl_gpio_range, + .pin_regs = bcm2712_c0_aon_gpio_pin_regs, + .pin_funcs = bcm2712_c0_aon_gpio_pin_funcs, + .func_count = func_count, + .func_gpio = func_gpio, + .func_names = bcm2712_func_names, +}; + +static const struct brcmstb_pdata bcm2712_d0_pdata = { + .pctl_desc = &bcm2712_d0_pinctrl_desc, + .gpio_range = &bcm2712_d0_pinctrl_gpio_range, + .pin_regs = bcm2712_d0_gpio_pin_regs, + .pin_funcs = bcm2712_d0_gpio_pin_funcs, + .func_count = func_count, + .func_gpio = func_gpio, + .func_names = bcm2712_func_names, +}; + +static const struct brcmstb_pdata bcm2712_d0_aon_pdata = { + .pctl_desc = &bcm2712_d0_aon_pinctrl_desc, + .gpio_range = &bcm2712_d0_aon_pinctrl_gpio_range, + .pin_regs = bcm2712_d0_aon_gpio_pin_regs, + .pin_funcs = bcm2712_d0_aon_gpio_pin_funcs, + .func_count = func_count, + .func_gpio = func_gpio, + .func_names = bcm2712_func_names, +}; + +static int bcm2712_pinctrl_probe(struct platform_device *pdev) +{ + return brcmstb_pinctrl_probe(pdev); +} + +static const struct of_device_id bcm2712_pinctrl_match[] = { + { + .compatible = "brcm,bcm2712c0-pinctrl", + .data = &bcm2712_c0_pdata + }, + { + .compatible = "brcm,bcm2712c0-aon-pinctrl", + .data = &bcm2712_c0_aon_pdata + }, + + { + .compatible = "brcm,bcm2712d0-pinctrl", + .data = &bcm2712_d0_pdata + }, + { + .compatible = "brcm,bcm2712d0-aon-pinctrl", + .data = &bcm2712_d0_aon_pdata + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, bcm2712_pinctrl_match); + +static struct platform_driver bcm2712_pinctrl_driver = { + .probe = bcm2712_pinctrl_probe, + .driver = { + .name = "pinctrl-bcm2712", + .of_match_table = bcm2712_pinctrl_match, + .suppress_bind_attrs = true, + }, +}; +module_platform_driver(bcm2712_pinctrl_driver); + +MODULE_AUTHOR("Phil Elwell"); +MODULE_AUTHOR("Jonathan Bell"); +MODULE_AUTHOR("Ivan T. Ivanov"); +MODULE_AUTHOR("Andrea della Porta"); +MODULE_DESCRIPTION("Broadcom BCM2712 pinctrl driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/pinctrl/bcm/pinctrl-brcmstb.c b/drivers/pinctrl/bcm/pinctrl-brcmstb.c new file mode 100644 index 00000000000000..f46b27155c3c40 --- /dev/null +++ b/drivers/pinctrl/bcm/pinctrl-brcmstb.c @@ -0,0 +1,442 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Driver for Broadcom brcmstb GPIO units (pinctrl only) + * + * Copyright (C) 2024-2025 Ivan T. Ivanov, Andrea della Porta + * Copyright (C) 2021-3 Raspberry Pi Ltd. 
+ * Copyright (C) 2012 Chris Boot, Simon Arlott, Stephen Warren + * + * Based heavily on the BCM2835 GPIO & pinctrl driver, which was inspired by: + * pinctrl-nomadik.c, please see original file for copyright information + * pinctrl-tegra.c, please see original file for copyright information + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pinctrl-brcmstb.h" + +#define BRCMSTB_PULL_NONE 0 +#define BRCMSTB_PULL_DOWN 1 +#define BRCMSTB_PULL_UP 2 +#define BRCMSTB_PULL_MASK 0x3 + +#define BIT_TO_REG(b) (((b) >> 5) << 2) +#define BIT_TO_SHIFT(b) ((b) & 0x1f) + +struct brcmstb_pinctrl { + struct device *dev; + void __iomem *base; + struct pinctrl_dev *pctl_dev; + struct pinctrl_desc pctl_desc; + const struct pin_regs *pin_regs; + const struct brcmstb_pin_funcs *pin_funcs; + const char * const *func_names; + unsigned int func_count; + unsigned int func_gpio; + const char *const *gpio_groups; + struct pinctrl_gpio_range gpio_range; + /* Protect FSEL registers */ + spinlock_t fsel_lock; +}; + +static unsigned int brcmstb_pinctrl_fsel_get(struct brcmstb_pinctrl *pc, + unsigned int pin) +{ + u32 bit = pc->pin_regs[pin].mux_bit; + unsigned int func; + int fsel; + u32 val; + + if (!bit) + return pc->func_gpio; + + bit &= ~MUX_BIT_VALID; + + val = readl(pc->base + BIT_TO_REG(bit)); + fsel = (val >> BIT_TO_SHIFT(bit)) & pc->pin_funcs[pin].func_mask; + func = pc->pin_funcs[pin].funcs[fsel]; + + if (func >= pc->func_count) + func = fsel; + + dev_dbg(pc->dev, "get %04x: %08x (%u => %s)\n", + BIT_TO_REG(bit), val, pin, + pc->func_names[func]); + + return func; +} + +static int brcmstb_pinctrl_fsel_set(struct brcmstb_pinctrl *pc, + unsigned int pin, unsigned int func) +{ + u32 bit = pc->pin_regs[pin].mux_bit, val, fsel_mask; + const u8 *pin_funcs; + int fsel; + int cur; + int i; + + if (!bit || func >= pc->func_count) + return -EINVAL; + + bit &= ~MUX_BIT_VALID; + + fsel = pc->pin_funcs[pin].n_funcs + 1; + fsel_mask = pc->pin_funcs[pin].func_mask; + + if (func >= fsel) { + /* Convert to an fsel number */ + pin_funcs = pc->pin_funcs[pin].funcs; + for (i = 1; i < fsel; i++) { + if (pin_funcs[i - 1] == func) { + fsel = i; + break; + } + } + } else { + fsel = func; + } + + if (fsel >= pc->pin_funcs[pin].n_funcs + 1) + return -EINVAL; + + guard(spinlock_irqsave)(&pc->fsel_lock); + + val = readl(pc->base + BIT_TO_REG(bit)); + cur = (val >> BIT_TO_SHIFT(bit)) & fsel_mask; + + dev_dbg(pc->dev, "read %04x: %08x (%u => %s)\n", + BIT_TO_REG(bit), val, pin, + pc->func_names[cur]); + + if (cur != fsel) { + val &= ~(fsel_mask << BIT_TO_SHIFT(bit)); + val |= fsel << BIT_TO_SHIFT(bit); + + dev_dbg(pc->dev, "write %04x: %08x (%u <= %s)\n", + BIT_TO_REG(bit), val, pin, + pc->func_names[fsel]); + writel(val, pc->base + BIT_TO_REG(bit)); + } + + return 0; +} + +static int brcmstb_pctl_get_groups_count(struct pinctrl_dev *pctldev) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return pc->pctl_desc.npins; +} + +static const char *brcmstb_pctl_get_group_name(struct pinctrl_dev *pctldev, + unsigned int selector) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return pc->gpio_groups[selector]; +} + +static int brcmstb_pctl_get_group_pins(struct pinctrl_dev *pctldev, + unsigned int selector, + const unsigned int **pins, + unsigned int *num_pins) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + *pins = &pc->pctl_desc.pins[selector].number; + *num_pins = 1; + + 
return 0; +} + +static void brcmstb_pctl_pin_dbg_show(struct pinctrl_dev *pctldev, + struct seq_file *s, unsigned int offset) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + unsigned int fsel = brcmstb_pinctrl_fsel_get(pc, offset); + const char *fname = pc->func_names[fsel]; + + seq_printf(s, "function %s", fname); +} + +static void brcmstb_pctl_dt_free_map(struct pinctrl_dev *pctldev, + struct pinctrl_map *maps, + unsigned int num_maps) +{ + int i; + + for (i = 0; i < num_maps; i++) + if (maps[i].type == PIN_MAP_TYPE_CONFIGS_PIN) + kfree(maps[i].data.configs.configs); + + kfree(maps); +} + +static const struct pinctrl_ops brcmstb_pctl_ops = { + .get_groups_count = brcmstb_pctl_get_groups_count, + .get_group_name = brcmstb_pctl_get_group_name, + .get_group_pins = brcmstb_pctl_get_group_pins, + .pin_dbg_show = brcmstb_pctl_pin_dbg_show, + .dt_node_to_map = pinconf_generic_dt_node_to_map_all, + .dt_free_map = brcmstb_pctl_dt_free_map, +}; + +static int brcmstb_pmx_free(struct pinctrl_dev *pctldev, unsigned int offset) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + /* disable by setting to GPIO */ + return brcmstb_pinctrl_fsel_set(pc, offset, pc->func_gpio); +} + +static int brcmstb_pmx_get_functions_count(struct pinctrl_dev *pctldev) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return pc->func_count; +} + +static const char *brcmstb_pmx_get_function_name(struct pinctrl_dev *pctldev, + unsigned int selector) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return (selector < pc->func_count) ? pc->func_names[selector] : NULL; +} + +static int brcmstb_pmx_get_function_groups(struct pinctrl_dev *pctldev, + unsigned int selector, + const char *const **groups, + unsigned *const num_groups) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + *groups = pc->gpio_groups; + *num_groups = pc->pctl_desc.npins; + + return 0; +} + +static int brcmstb_pmx_set(struct pinctrl_dev *pctldev, + unsigned int func_selector, + unsigned int group_selector) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + const struct pinctrl_desc *pctldesc = &pc->pctl_desc; + const struct pinctrl_pin_desc *pindesc; + + if (group_selector >= pctldesc->npins) + return -EINVAL; + + pindesc = &pctldesc->pins[group_selector]; + return brcmstb_pinctrl_fsel_set(pc, pindesc->number, func_selector); +} + +static int brcmstb_pmx_gpio_request_enable(struct pinctrl_dev *pctldev, + struct pinctrl_gpio_range *range, + unsigned int pin) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return brcmstb_pinctrl_fsel_set(pc, pin, pc->func_gpio); +} + +static void brcmstb_pmx_gpio_disable_free(struct pinctrl_dev *pctldev, + struct pinctrl_gpio_range *range, + unsigned int offset) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + /* disable by setting to GPIO */ + (void)brcmstb_pinctrl_fsel_set(pc, offset, pc->func_gpio); +} + +static bool brcmstb_pmx_function_is_gpio(struct pinctrl_dev *pctldev, + unsigned int selector) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return pc->func_gpio == selector; +} + +static const struct pinmux_ops brcmstb_pmx_ops = { + .free = brcmstb_pmx_free, + .get_functions_count = brcmstb_pmx_get_functions_count, + .get_function_name = brcmstb_pmx_get_function_name, + .get_function_groups = brcmstb_pmx_get_function_groups, + .set_mux = brcmstb_pmx_set, + .gpio_request_enable = brcmstb_pmx_gpio_request_enable, + 
.gpio_disable_free = brcmstb_pmx_gpio_disable_free, + .function_is_gpio = brcmstb_pmx_function_is_gpio, + .strict = true, +}; + +static unsigned int brcmstb_pull_config_get(struct brcmstb_pinctrl *pc, + unsigned int pin) +{ + u32 bit = pc->pin_regs[pin].pad_bit, val; + + if (bit == PAD_BIT_INVALID) + return BRCMSTB_PULL_NONE; + + val = readl(pc->base + BIT_TO_REG(bit)); + return (val >> BIT_TO_SHIFT(bit)) & BRCMSTB_PULL_MASK; +} + +static int brcmstb_pull_config_set(struct brcmstb_pinctrl *pc, + unsigned int pin, unsigned int arg) +{ + u32 bit = pc->pin_regs[pin].pad_bit, val; + + if (bit == PAD_BIT_INVALID) { + dev_warn(pc->dev, "Can't set pulls for %s\n", + pc->gpio_groups[pin]); + return -EINVAL; + } + + guard(spinlock_irqsave)(&pc->fsel_lock); + + val = readl(pc->base + BIT_TO_REG(bit)); + val &= ~(BRCMSTB_PULL_MASK << BIT_TO_SHIFT(bit)); + val |= (arg << BIT_TO_SHIFT(bit)); + writel(val, pc->base + BIT_TO_REG(bit)); + + return 0; +} + +static int brcmstb_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, + unsigned long *config) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + enum pin_config_param param = pinconf_to_config_param(*config); + u32 arg; + + switch (param) { + case PIN_CONFIG_BIAS_DISABLE: + arg = (brcmstb_pull_config_get(pc, pin) == BRCMSTB_PULL_NONE); + break; + case PIN_CONFIG_BIAS_PULL_DOWN: + arg = (brcmstb_pull_config_get(pc, pin) == BRCMSTB_PULL_DOWN); + break; + case PIN_CONFIG_BIAS_PULL_UP: + arg = (brcmstb_pull_config_get(pc, pin) == BRCMSTB_PULL_UP); + break; + default: + return -ENOTSUPP; + } + + *config = pinconf_to_config_packed(param, arg); + + return 0; +} + +static int brcmstb_pinconf_set(struct pinctrl_dev *pctldev, + unsigned int pin, unsigned long *configs, + unsigned int num_configs) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + int ret = 0; + u32 param; + int i; + + for (i = 0; i < num_configs; i++) { + param = pinconf_to_config_param(configs[i]); + + switch (param) { + case PIN_CONFIG_BIAS_DISABLE: + ret = brcmstb_pull_config_set(pc, pin, BRCMSTB_PULL_NONE); + break; + case PIN_CONFIG_BIAS_PULL_DOWN: + ret = brcmstb_pull_config_set(pc, pin, BRCMSTB_PULL_DOWN); + break; + case PIN_CONFIG_BIAS_PULL_UP: + ret = brcmstb_pull_config_set(pc, pin, BRCMSTB_PULL_UP); + break; + default: + return -ENOTSUPP; + } + } + + return ret; +} + +static const struct pinconf_ops brcmstb_pinconf_ops = { + .is_generic = true, + .pin_config_get = brcmstb_pinconf_get, + .pin_config_set = brcmstb_pinconf_set, +}; + +int brcmstb_pinctrl_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + const struct brcmstb_pdata *pdata; + struct brcmstb_pinctrl *pc; + const char **names; + int num_pins, i; + + pdata = of_device_get_match_data(dev); + + pc = devm_kzalloc(dev, sizeof(*pc), GFP_KERNEL); + if (!pc) + return -ENOMEM; + + platform_set_drvdata(pdev, pc); + pc->dev = dev; + spin_lock_init(&pc->fsel_lock); + + pc->base = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(pc->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(pc->base), + "Could not get IO memory\n"); + + pc->pctl_desc = *pdata->pctl_desc; + pc->pctl_desc.pctlops = &brcmstb_pctl_ops; + pc->pctl_desc.pmxops = &brcmstb_pmx_ops; + pc->pctl_desc.confops = &brcmstb_pinconf_ops; + pc->pctl_desc.owner = THIS_MODULE; + num_pins = pc->pctl_desc.npins; + names = devm_kmalloc_array(dev, num_pins, sizeof(const char *), + GFP_KERNEL); + if (!names) + return -ENOMEM; + + for (i = 0; i < num_pins; i++) + names[i] = 
pc->pctl_desc.pins[i].name; + + pc->gpio_groups = names; + pc->pin_regs = pdata->pin_regs; + pc->pin_funcs = pdata->pin_funcs; + pc->func_count = pdata->func_count; + pc->func_names = pdata->func_names; + + pc->pctl_dev = devm_pinctrl_register(dev, &pc->pctl_desc, pc); + if (IS_ERR(pc->pctl_dev)) + return dev_err_probe(&pdev->dev, PTR_ERR(pc->pctl_dev), + "Failed to register pinctrl device\n"); + + pc->gpio_range = *pdata->gpio_range; + pinctrl_add_gpio_range(pc->pctl_dev, &pc->gpio_range); + + return 0; +} +EXPORT_SYMBOL(brcmstb_pinctrl_probe); + +MODULE_AUTHOR("Phil Elwell"); +MODULE_AUTHOR("Jonathan Bell"); +MODULE_AUTHOR("Ivan T. Ivanov"); +MODULE_AUTHOR("Andrea della Porta"); +MODULE_DESCRIPTION("Broadcom brcmstb pinctrl driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/pinctrl/bcm/pinctrl-brcmstb.h b/drivers/pinctrl/bcm/pinctrl-brcmstb.h new file mode 100644 index 00000000000000..c3459103e05639 --- /dev/null +++ b/drivers/pinctrl/bcm/pinctrl-brcmstb.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Header for Broadcom brcmstb GPIO based drivers + * + * Copyright (C) 2024-2025 Ivan T. Ivanov, Andrea della Porta + * Copyright (C) 2021-3 Raspberry Pi Ltd. + * Copyright (C) 2012 Chris Boot, Simon Arlott, Stephen Warren + * + * Based heavily on the BCM2835 GPIO & pinctrl driver, which was inspired by: + * pinctrl-nomadik.c, please see original file for copyright information + * pinctrl-tegra.c, please see original file for copyright information + */ + +#ifndef __PINCTRL_BRCMSTB_H__ +#define __PINCTRL_BRCMSTB_H__ + +#include +#include + +#define BRCMSTB_FUNC(f) \ + [func_##f] = #f + +#define MUX_BIT_VALID 0x8000 +#define PAD_BIT_INVALID 0xffff + +#define MUX_BIT(muxreg, muxshift) \ + (MUX_BIT_VALID + ((muxreg) << 5) + ((muxshift) << 2)) +#define PAD_BIT(padreg, padshift) \ + (((padreg) << 5) + ((padshift) << 1)) + +#define GPIO_REGS(n, muxreg, muxshift, padreg, padshift) \ + [n] = { MUX_BIT(muxreg, muxshift), PAD_BIT(padreg, padshift) } + +#define EMMC_REGS(n, padreg, padshift) \ + [n] = { 0, PAD_BIT(padreg, padshift) } + +#define AON_GPIO_REGS(n, muxreg, muxshift, padreg, padshift) \ + GPIO_REGS(n, muxreg, muxshift, padreg, padshift) + +#define AON_SGPIO_REGS(n, muxreg, muxshift) \ + [(n) + 32] = { MUX_BIT(muxreg, muxshift), PAD_BIT_INVALID } + +#define GPIO_PIN(n) PINCTRL_PIN(n, "gpio" #n) +/** + * AON pins are in the Always-On power domain. SGPIOs are also 'Safe', + * being 5V tolerant (necessary for the HDMI I2C pins), and can be driven + * while the power is off. 
+ */ +#define AON_GPIO_PIN(n) PINCTRL_PIN(n, "aon_gpio" #n) +#define AON_SGPIO_PIN(n) PINCTRL_PIN((n) + 32, "aon_sgpio" #n) + +struct pin_regs { + u16 mux_bit; + u16 pad_bit; +}; + +/** + * struct brcmstb_pin_funcs - the primary/alternate functions available + * on a pin + * @func_mask: mask representing valid bits of the function selector + * in the registers + * @funcs: array of function identifiers + * @n_funcs: number of identifiers in the @funcs array above + */ +struct brcmstb_pin_funcs { + const u32 func_mask; + const u8 *funcs; + const unsigned int n_funcs; +}; + +/** + * struct brcmstb_pdata - specific data for a pinctrl chip implementation + * @pctl_desc: pin controller descriptor for this implementation + * @gpio_range: range of GPIOs served by this controller + * @pin_regs: array of register descriptors for each pin + * @pin_funcs: array of all assignable functions for each pin + * @func_count: total number of functions + * @func_gpio: which function number is GPIO (usually 0) + * @func_names: an array listing all function names + */ +struct brcmstb_pdata { + const struct pinctrl_desc *pctl_desc; + const struct pinctrl_gpio_range *gpio_range; + const struct pin_regs *pin_regs; + const struct brcmstb_pin_funcs *pin_funcs; + const unsigned int func_count; + const unsigned int func_gpio; + const char * const *func_names; +}; + +int brcmstb_pinctrl_probe(struct platform_device *pdev); + +#endif From a6a2f50ab1721343ee2d5d2be888709aa886e3aa Mon Sep 17 00:00:00 2001 From: Yulin Lu Date: Wed, 3 Sep 2025 17:19:15 +0800 Subject: [PATCH 1129/3206] pinctrl: eswin: Fix regulator error check and Kconfig dependency Smatch reported the following warning in eic7700_pinctrl_probe(): drivers/pinctrl/pinctrl-eic7700.c:638 eic7700_pinctrl_probe() warn: passing zero to 'PTR_ERR' The root cause is that devm_regulator_get() may return NULL when CONFIG_REGULATOR is disabled. In that case, the IS_ERR_OR_NULL() path calls PTR_ERR(NULL), which evaluates to 0, so a success code is passed where an error is expected. However, this driver cannot work without a regulator. To fix this: - Change the check from IS_ERR_OR_NULL() to IS_ERR() - Update Kconfig to explicitly select REGULATOR and REGULATOR_FIXED_VOLTAGE, ensuring that the regulator framework is always available. This resolves the Smatch warning and enforces the correct dependency. 
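For illustration, a minimal sketch of the failure mode (condensed from the probe path changed below; not the literal driver source):

	struct regulator *regulator;

	regulator = devm_regulator_get(dev, "vrgmii");
	/* With CONFIG_REGULATOR=n the stub may return NULL, so ... */
	if (IS_ERR_OR_NULL(regulator))
		/* ... PTR_ERR(NULL) == 0, dev_err_probe() returns 0, and
		 * probe "succeeds" without the regulator it depends on. */
		return dev_err_probe(dev, PTR_ERR(regulator),
				     "failed to get vrgmii regulator\n");

Checking IS_ERR() alone would treat a NULL stub return as a valid handle, which is why the Kconfig change is needed as well: selecting REGULATOR guarantees that a real regulator (or a real error pointer) is returned.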
Suggested-by: Dan Carpenter Fixes: 5b797bcc00ef ("pinctrl: eswin: Add EIC7700 pinctrl driver") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-gpio/aKRGiZ-fai0bv0tG@stanley.mountain/ Signed-off-by: Yulin Lu Signed-off-by: Linus Walleij --- drivers/pinctrl/Kconfig | 2 ++ drivers/pinctrl/pinctrl-eic7700.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index 6714aecce32587..a7dd60245db7b3 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -211,6 +211,8 @@ config PINCTRL_EIC7700 depends on ARCH_ESWIN || COMPILE_TEST select PINMUX select GENERIC_PINCONF + select REGULATOR + select REGULATOR_FIXED_VOLTAGE help This driver support for the pin controller in ESWIN's EIC7700 SoC, which supports pin multiplexing, pin configuration,and rgmii voltage diff --git a/drivers/pinctrl/pinctrl-eic7700.c b/drivers/pinctrl/pinctrl-eic7700.c index 4874b55323439a..ffcd0ec5c2dc6c 100644 --- a/drivers/pinctrl/pinctrl-eic7700.c +++ b/drivers/pinctrl/pinctrl-eic7700.c @@ -634,7 +634,7 @@ static int eic7700_pinctrl_probe(struct platform_device *pdev) return PTR_ERR(pc->base); regulator = devm_regulator_get(dev, "vrgmii"); - if (IS_ERR_OR_NULL(regulator)) { + if (IS_ERR(regulator)) { return dev_err_probe(dev, PTR_ERR(regulator), "failed to get vrgmii regulator\n"); } From 83d12f08ddfba09db414b611195af23fdeb70836 Mon Sep 17 00:00:00 2001 From: Hendrik Hamerlinck Date: Wed, 3 Sep 2025 12:01:04 +0200 Subject: [PATCH 1130/3206] pinctrl: spacemit: fix typo in PRI_TDI pin name The datasheet lists this signal as PRI_TDI, not PRI_DTI. Fix the pin name to match the documentation and JTAG naming convention (TDI = Test Data In). Signed-off-by: Hendrik Hamerlinck Reviewed-by: Yixun Lan Signed-off-by: Linus Walleij --- drivers/pinctrl/spacemit/pinctrl-k1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/spacemit/pinctrl-k1.c b/drivers/pinctrl/spacemit/pinctrl-k1.c index fb361f2acb54fb..33af9b5791c110 100644 --- a/drivers/pinctrl/spacemit/pinctrl-k1.c +++ b/drivers/pinctrl/spacemit/pinctrl-k1.c @@ -847,7 +847,7 @@ static const struct pinctrl_pin_desc k1_pin_desc[] = { PINCTRL_PIN(67, "GPIO_67"), PINCTRL_PIN(68, "GPIO_68"), PINCTRL_PIN(69, "GPIO_69"), - PINCTRL_PIN(70, "GPIO_70/PRI_DTI"), + PINCTRL_PIN(70, "GPIO_70/PRI_TDI"), PINCTRL_PIN(71, "GPIO_71/PRI_TMS"), PINCTRL_PIN(72, "GPIO_72/PRI_TCK"), PINCTRL_PIN(73, "GPIO_73/PRI_TDO"), From 6c3442b3b692769c9bced80da58e0130b9fa16cc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Sep 2025 21:58:54 +0300 Subject: [PATCH 1131/3206] pinctrl: keembay: fix double free in keembay_build_functions() This kfree() was accidentally left over when we converted to devm_ and it would lead to a double free. Delete it. 
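As a sketch of why the extra kfree() double-frees (simplified from the hunk below, and assuming keembay_funcs came from a devm_ allocation as described above): devm_krealloc_array() leaves the original devres-managed buffer untouched on failure, so devres still frees it automatically on detach.

	new_funcs = devm_krealloc_array(kpc->dev, keembay_funcs, kpc->nfuncs,
					sizeof(*new_funcs), GFP_KERNEL);
	if (!new_funcs)
		return -ENOMEM;	/* keembay_funcs is still owned by devres;
				 * a kfree() here frees it once now and
				 * devres frees it again at detach time */

	return keembay_add_functions(kpc, new_funcs);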
Fixes: 995bc9f4826e ("pinctrl: keembay: release allocated memory in detach path") Signed-off-by: Dan Carpenter Reviewed-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-keembay.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pinctrl/pinctrl-keembay.c b/drivers/pinctrl/pinctrl-keembay.c index 30122ca90cbef1..3241d3ae621917 100644 --- a/drivers/pinctrl/pinctrl-keembay.c +++ b/drivers/pinctrl/pinctrl-keembay.c @@ -1643,10 +1643,8 @@ static int keembay_build_functions(struct keembay_pinctrl *kpc) new_funcs = devm_krealloc_array(kpc->dev, keembay_funcs, kpc->nfuncs, sizeof(*new_funcs), GFP_KERNEL); - if (!new_funcs) { - kfree(keembay_funcs); + if (!new_funcs) return -ENOMEM; - } return keembay_add_functions(kpc, new_funcs); } From 203a83112e097a501fbe12722b6342787497efe0 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 5 Sep 2025 11:21:50 +0200 Subject: [PATCH 1132/3206] pinctrl: generic: rename PIN_CONFIG_OUTPUT to LEVEL This generic pin config property is confusingly named, so let's rename it to make things clearer. There are already drivers in the tree that use PIN_CONFIG_OUTPUT to *read* the value of an output driven pin, which is semantically confusing: are we then reading the setting of the output or the actual value/level that is put out on the pin? We already have PIN_CONFIG_OUTPUT_ENABLE that turns on driver buffers for output, so by logical conclusion this property can only be about the voltage level being driven, if it is to mean anything different. But if we read the pin, are we then reading the *setting* of the output value or the *actual* value we can see on the line? If the pin has not first been set into output mode with PIN_CONFIG_OUTPUT_ENABLE, but is instead in some input mode or tristate, what will reading this property actually return? Looking at the current users that read this property, it is clear that what we read is the logical level of the pin as 0 or 1 depending on whether it is low or high. Rename it to PIN_CONFIG_LEVEL so it is crystal clear that we set or read the voltage level of the pin and nothing else. 
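For pin controller drivers the rename is mechanical, as in this hypothetical sketch (the foo_* names are invented for illustration; only the case label changes, not the semantics):

	static int foo_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin,
				   unsigned long *configs, unsigned int num_configs)
	{
		unsigned int i;

		for (i = 0; i < num_configs; i++) {
			switch (pinconf_to_config_param(configs[i])) {
			case PIN_CONFIG_LEVEL:	/* was PIN_CONFIG_OUTPUT */
				/* drive the pin to the logic level given in
				 * the argument: 1 for high, 0 for low */
				foo_set_pin_level(pin,
					pinconf_to_config_argument(configs[i]));
				break;
			default:
				return -ENOTSUPP;
			}
		}

		return 0;
	}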
Acked-by: Sudeep Holla Signed-off-by: Linus Walleij --- Documentation/driver-api/pin-control.rst | 4 ++-- drivers/gpio/gpio-rockchip.c | 2 +- drivers/pinctrl/bcm/pinctrl-bcm2835.c | 6 +++--- drivers/pinctrl/cirrus/pinctrl-madera-core.c | 4 ++-- drivers/pinctrl/mediatek/pinctrl-airoha.c | 4 ++-- drivers/pinctrl/mediatek/pinctrl-moore.c | 2 +- drivers/pinctrl/mediatek/pinctrl-mtk-common.c | 2 +- drivers/pinctrl/mediatek/pinctrl-paris.c | 4 ++-- drivers/pinctrl/meson/pinctrl-amlogic-a4.c | 6 +++--- drivers/pinctrl/meson/pinctrl-meson.c | 6 +++--- drivers/pinctrl/nomadik/pinctrl-abx500.c | 6 +++--- drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c | 6 +++--- drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c | 6 +++--- drivers/pinctrl/pinconf-generic.c | 6 +++--- drivers/pinctrl/pinctrl-at91-pio4.c | 2 +- drivers/pinctrl/pinctrl-aw9523.c | 6 +++--- drivers/pinctrl/pinctrl-cy8c95x0.c | 2 +- drivers/pinctrl/pinctrl-ingenic.c | 4 ++-- drivers/pinctrl/pinctrl-k210.c | 2 +- drivers/pinctrl/pinctrl-microchip-sgpio.c | 4 ++-- drivers/pinctrl/pinctrl-ocelot.c | 4 ++-- drivers/pinctrl/pinctrl-pic32.c | 4 ++-- drivers/pinctrl/pinctrl-rk805.c | 4 ++-- drivers/pinctrl/pinctrl-rockchip.c | 6 +++--- drivers/pinctrl/pinctrl-rp1.c | 2 +- drivers/pinctrl/pinctrl-scmi.c | 2 +- drivers/pinctrl/pinctrl-stmfx.c | 4 ++-- drivers/pinctrl/pinctrl-sx150x.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-lpass-lpi.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-msm.c | 6 +++--- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-spmi-mpp.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c | 4 ++-- drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c | 4 ++-- drivers/pinctrl/renesas/pinctrl-rza1.c | 2 +- drivers/pinctrl/stm32/pinctrl-stm32.c | 2 +- drivers/pinctrl/sunplus/sppctl.c | 4 ++-- include/linux/pinctrl/pinconf-generic.h | 12 ++++++++---- sound/hda/codecs/side-codecs/cirrus_scodec_test.c | 2 +- 39 files changed, 91 insertions(+), 87 deletions(-) diff --git a/Documentation/driver-api/pin-control.rst b/Documentation/driver-api/pin-control.rst index 27ea1236307e84..8208924e513e1f 100644 --- a/Documentation/driver-api/pin-control.rst +++ b/Documentation/driver-api/pin-control.rst @@ -863,7 +863,7 @@ has to be handled by the ```` interface. Instead view this as a certain pin config setting. Look in e.g. ```` and you find this in the documentation: - PIN_CONFIG_OUTPUT: + PIN_CONFIG_LEVEL: this will configure the pin in output, use argument 1 to indicate high level, argument 0 to indicate low level. 
@@ -897,7 +897,7 @@ And your machine configuration may look like this: }; static unsigned long uart_sleep_mode[] = { - PIN_CONF_PACKED(PIN_CONFIG_OUTPUT, 0), + PIN_CONF_PACKED(PIN_CONFIG_LEVEL, 0), }; static struct pinctrl_map pinmap[] __initdata = { diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c index bcfc323a8315ef..47174eb3ba76fb 100644 --- a/drivers/gpio/gpio-rockchip.c +++ b/drivers/gpio/gpio-rockchip.c @@ -769,7 +769,7 @@ static int rockchip_gpio_probe(struct platform_device *pdev) list_del(&cfg->head); switch (cfg->param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = rockchip_gpio_direction_output(&bank->gpio_chip, cfg->pin, cfg->arg); if (ret) dev_warn(dev, "setting output pin %u to %u failed\n", cfg->pin, diff --git a/drivers/pinctrl/bcm/pinctrl-bcm2835.c b/drivers/pinctrl/bcm/pinctrl-bcm2835.c index 7dbf079739bcc1..c165674c5b4db0 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm2835.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm2835.c @@ -1023,7 +1023,7 @@ static int bcm2835_pinconf_get(struct pinctrl_dev *pctldev, /* No way to read back bias config in HW */ switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (fsel != BCM2835_FSEL_GPIO_OUT) return -EINVAL; @@ -1091,7 +1091,7 @@ static int bcm2835_pinconf_set(struct pinctrl_dev *pctldev, break; /* Set output-high or output-low */ - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bcm2835_gpio_set_bit(pc, arg ? GPSET0 : GPCLR0, pin); break; @@ -1202,7 +1202,7 @@ static int bcm2711_pinconf_set(struct pinctrl_dev *pctldev, break; /* Set output-high or output-low */ - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bcm2835_gpio_set_bit(pc, arg ? GPSET0 : GPCLR0, pin); break; diff --git a/drivers/pinctrl/cirrus/pinctrl-madera-core.c b/drivers/pinctrl/cirrus/pinctrl-madera-core.c index d19ef13224cca7..1d9481b1709178 100644 --- a/drivers/pinctrl/cirrus/pinctrl-madera-core.c +++ b/drivers/pinctrl/cirrus/pinctrl-madera-core.c @@ -804,7 +804,7 @@ static int madera_pin_conf_get(struct pinctrl_dev *pctldev, unsigned int pin, if (conf[0] & MADERA_GP1_IP_CFG_MASK) result = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if ((conf[1] & MADERA_GP1_DIR_MASK) && (conf[0] & MADERA_GP1_LVL_MASK)) result = 1; @@ -902,7 +902,7 @@ static int madera_pin_conf_set(struct pinctrl_dev *pctldev, unsigned int pin, mask[1] |= MADERA_GP1_DIR_MASK; conf[1] |= MADERA_GP1_DIR; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: val = pinconf_to_config_argument(*configs); mask[0] |= MADERA_GP1_LVL_MASK; if (val) diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c index 8fb3b65a1b775c..c1e736786f5c36 100644 --- a/drivers/pinctrl/mediatek/pinctrl-airoha.c +++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c @@ -2765,7 +2765,7 @@ static int airoha_pinconf_set(struct pinctrl_dev *pctrl_dev, break; case PIN_CONFIG_OUTPUT_ENABLE: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: { + case PIN_CONFIG_LEVEL: { bool input = param == PIN_CONFIG_INPUT_ENABLE; int err; @@ -2774,7 +2774,7 @@ static int airoha_pinconf_set(struct pinctrl_dev *pctrl_dev, if (err) return err; - if (param == PIN_CONFIG_OUTPUT) { + if (param == PIN_CONFIG_LEVEL) { err = airoha_pinconf_set_pin_value(pctrl_dev, pin, !!arg); if (err) diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.c b/drivers/pinctrl/mediatek/pinctrl-moore.c index 11dc525eb3a2dc..70f608347a5f68 100644 --- a/drivers/pinctrl/mediatek/pinctrl-moore.c +++ b/drivers/pinctrl/mediatek/pinctrl-moore.c @@ -332,7 +332,7 @@ static 
int mtk_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, goto err; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_DIR, MTK_OUTPUT); if (err) diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c index d10306024111c8..d6a46fe0cda891 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c +++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c @@ -384,7 +384,7 @@ static int mtk_pconf_parse_conf(struct pinctrl_dev *pctldev, mtk_pmx_gpio_set_direction(pctldev, NULL, pin, true); ret = mtk_pconf_set_ies_smt(pctl, pin, arg, param); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: mtk_gpio_set(pctl->chip, pin, arg); ret = mtk_pmx_gpio_set_direction(pctldev, NULL, pin, false); break; diff --git a/drivers/pinctrl/mediatek/pinctrl-paris.c b/drivers/pinctrl/mediatek/pinctrl-paris.c index 3e714554789d0e..6bf37d8085fae5 100644 --- a/drivers/pinctrl/mediatek/pinctrl-paris.c +++ b/drivers/pinctrl/mediatek/pinctrl-paris.c @@ -169,7 +169,7 @@ static int mtk_pinconf_get(struct pinctrl_dev *pctldev, if (!ret) err = -EINVAL; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_get_value(hw, desc, PINCTRL_PIN_REG_DIR, &ret); if (err) break; @@ -292,7 +292,7 @@ static int mtk_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, /* regard all non-zero value as enable */ err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_SR, !!arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_DO, arg); if (err) diff --git a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c index e34e984c2b38a8..d2b15f13356462 100644 --- a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c +++ b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c @@ -422,7 +422,7 @@ static int aml_pinconf_get(struct pinctrl_dev *pcdev, unsigned int pin, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = aml_pinconf_get_output(info, pin); if (ret <= 0) return -EINVAL; @@ -568,7 +568,7 @@ static int aml_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, switch (param) { case PIN_CONFIG_DRIVE_STRENGTH_UA: case PIN_CONFIG_OUTPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pinconf_to_config_argument(configs[i]); break; @@ -592,7 +592,7 @@ static int aml_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: ret = aml_pinconf_set_output(info, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = aml_pinconf_set_output_drive(info, pin, arg); break; default: diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c index 277e9c40490d71..18295b15ecd9dd 100644 --- a/drivers/pinctrl/meson/pinctrl-meson.c +++ b/drivers/pinctrl/meson/pinctrl-meson.c @@ -360,7 +360,7 @@ static int meson_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, switch (param) { case PIN_CONFIG_DRIVE_STRENGTH_UA: case PIN_CONFIG_OUTPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pinconf_to_config_argument(configs[i]); break; @@ -384,7 +384,7 @@ static int meson_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: ret = meson_pinconf_set_output(pc, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = meson_pinconf_set_output_drive(pc, pin, arg); break; default: @@ -502,7 +502,7 @@ static int meson_pinconf_get(struct pinctrl_dev *pcdev, 
unsigned int pin, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = meson_pinconf_get_output(pc, pin); if (ret <= 0) return -EINVAL; diff --git a/drivers/pinctrl/nomadik/pinctrl-abx500.c b/drivers/pinctrl/nomadik/pinctrl-abx500.c index 7b5f94d8cb23cd..fc7ebeda8440eb 100644 --- a/drivers/pinctrl/nomadik/pinctrl-abx500.c +++ b/drivers/pinctrl/nomadik/pinctrl-abx500.c @@ -860,8 +860,8 @@ static int abx500_pin_config_set(struct pinctrl_dev *pctldev, dev_dbg(chip->parent, "pin %d [%#lx]: %s %s\n", pin, configs[i], - (param == PIN_CONFIG_OUTPUT) ? "output " : "input", - (param == PIN_CONFIG_OUTPUT) ? + (param == PIN_CONFIG_LEVEL) ? "output " : "input", + (param == PIN_CONFIG_LEVEL) ? str_high_low(argument) : (argument ? "pull up" : "pull down")); @@ -907,7 +907,7 @@ static int abx500_pin_config_set(struct pinctrl_dev *pctldev, ret = abx500_gpio_direction_input(chip, offset); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = abx500_gpio_direction_output(chip, offset, argument); break; diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c index c2ca71ebb9736d..7f45c2897c3f27 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c @@ -1700,13 +1700,13 @@ static int npcm7xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, else if (param == PIN_CONFIG_BIAS_PULL_DOWN) rc = (!pu && pd); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: ie = ioread32(bank->base + NPCM7XX_GP_N_IEM) & pinmask; oe = ioread32(bank->base + NPCM7XX_GP_N_OE) & pinmask; if (param == PIN_CONFIG_INPUT_ENABLE) rc = (ie && !oe); - else if (param == PIN_CONFIG_OUTPUT) + else if (param == PIN_CONFIG_LEVEL) rc = (!ie && oe); break; case PIN_CONFIG_DRIVE_PUSH_PULL: @@ -1765,7 +1765,7 @@ static int npcm7xx_config_set_one(struct npcm7xx_pinctrl *npcm, iowrite32(gpio, bank->base + NPCM7XX_GP_N_OEC); bank->direction_input(&bank->chip.gc, pin % bank->chip.gc.ngpio); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bank->direction_output(&bank->chip.gc, pin % bank->chip.gc.ngpio, arg); iowrite32(gpio, bank->base + NPCM7XX_GP_N_OES); break; diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c index 0f155a685bbae7..920dd207792596 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c @@ -2187,13 +2187,13 @@ static int npcm8xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, else if (param == PIN_CONFIG_BIAS_PULL_DOWN) rc = !pu && pd; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: ie = ioread32(bank->base + NPCM8XX_GP_N_IEM) & pinmask; oe = ioread32(bank->base + NPCM8XX_GP_N_OE) & pinmask; if (param == PIN_CONFIG_INPUT_ENABLE) rc = (ie && !oe); - else if (param == PIN_CONFIG_OUTPUT) + else if (param == PIN_CONFIG_LEVEL) rc = (!ie && oe); break; case PIN_CONFIG_DRIVE_PUSH_PULL: @@ -2251,7 +2251,7 @@ static int npcm8xx_config_set_one(struct npcm8xx_pinctrl *npcm, iowrite32(gpio, bank->base + NPCM8XX_GP_N_OEC); bank->direction_input(&bank->chip.gc, pin % bank->chip.gc.ngpio); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bank->direction_output(&bank->chip.gc, pin % bank->chip.gc.ngpio, arg); iowrite32(gpio, bank->base + NPCM8XX_GP_N_OES); break; diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index d67838afb08574..5de6ff62c69bdb 100644 --- 
a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -48,7 +48,7 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT_ENABLE, "input schmitt enabled", NULL, false), PCONFDUMP(PIN_CONFIG_MODE_LOW_POWER, "pin low power", "mode", true), PCONFDUMP(PIN_CONFIG_OUTPUT_ENABLE, "output enabled", NULL, false), - PCONFDUMP(PIN_CONFIG_OUTPUT, "pin output", "level", true), + PCONFDUMP(PIN_CONFIG_LEVEL, "pin output", "level", true), PCONFDUMP(PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, "output impedance", "ohms", true), PCONFDUMP(PIN_CONFIG_POWER_SOURCE, "pin power source", "selector", true), PCONFDUMP(PIN_CONFIG_SLEEP_HARDWARE_STATE, "sleep hardware state", NULL, false), @@ -183,9 +183,9 @@ static const struct pinconf_generic_params dt_params[] = { { "low-power-enable", PIN_CONFIG_MODE_LOW_POWER, 1 }, { "output-disable", PIN_CONFIG_OUTPUT_ENABLE, 0 }, { "output-enable", PIN_CONFIG_OUTPUT_ENABLE, 1 }, - { "output-high", PIN_CONFIG_OUTPUT, 1, }, + { "output-high", PIN_CONFIG_LEVEL, 1, }, { "output-impedance-ohms", PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, 0 }, - { "output-low", PIN_CONFIG_OUTPUT, 0, }, + { "output-low", PIN_CONFIG_LEVEL, 0, }, { "power-source", PIN_CONFIG_POWER_SOURCE, 0 }, { "sleep-hardware-state", PIN_CONFIG_SLEEP_HARDWARE_STATE, 0 }, { "slew-rate", PIN_CONFIG_SLEW_RATE, 0 }, diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c index 35ea3414cb96d7..ec5351fc282e20 100644 --- a/drivers/pinctrl/pinctrl-at91-pio4.c +++ b/drivers/pinctrl/pinctrl-at91-pio4.c @@ -862,7 +862,7 @@ static int atmel_conf_pin_config_group_set(struct pinctrl_dev *pctldev, conf |= ATMEL_PIO_IFSCEN_MASK; } break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: conf |= ATMEL_PIO_DIR_MASK; bank = ATMEL_PIO_BANK(pin_id); pin = ATMEL_PIO_LINE(pin_id); diff --git a/drivers/pinctrl/pinctrl-aw9523.c b/drivers/pinctrl/pinctrl-aw9523.c index 890b83fddea3c5..479553a7921614 100644 --- a/drivers/pinctrl/pinctrl-aw9523.c +++ b/drivers/pinctrl/pinctrl-aw9523.c @@ -215,7 +215,7 @@ static int aw9523_pcfg_param_to_reg(enum pin_config_param pcp, int pin, u8 *r) case PIN_CONFIG_OUTPUT_ENABLE: reg = AW9523_REG_CONF_STATE(pin); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: reg = AW9523_REG_OUT_STATE(pin); break; default: @@ -249,7 +249,7 @@ static int aw9523_pconf_get(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: val &= BIT(regbit); break; case PIN_CONFIG_BIAS_PULL_DOWN: @@ -301,7 +301,7 @@ static int aw9523_pconf_set(struct pinctrl_dev *pctldev, unsigned int pin, goto end; switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* First, enable pin output */ rc = regmap_update_bits(awi->regmap, AW9523_REG_CONF_STATE(pin), diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index cf7f80497fdeaf..a4b04bf6d081f6 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -808,7 +808,7 @@ static int cy8c95x0_gpio_get_pincfg(struct cy8c95x0_pinctrl *chip, case PIN_CONFIG_MODE_PWM: reg = CY8C95X0_SELPWM; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: reg = CY8C95X0_OUTPUT; break; case PIN_CONFIG_OUTPUT_ENABLE: diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index e5b24fab12e11e..c7f14546de0528 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -4264,7 +4264,7 @@ 
static int ingenic_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_BIAS_PULL_UP: case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_INPUT_SCHMITT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_SLEW_RATE: continue; default: @@ -4305,7 +4305,7 @@ static int ingenic_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, ingenic_set_schmitt_trigger(jzpc, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = pinctrl_gpio_direction_output(jzpc->gc, pin - jzpc->gc->base); if (ret) diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c index 66c04120c29dec..ddd6d6bfd51399 100644 --- a/drivers/pinctrl/pinctrl-k210.c +++ b/drivers/pinctrl/pinctrl-k210.c @@ -551,7 +551,7 @@ static int k210_pinconf_set_param(struct pinctrl_dev *pctldev, else val &= ~K210_PC_ST; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: k210_pinmux_set_pin_function(pctldev, pin, K210_PCF_CONSTANT); val = readl(&pdata->fpioa->pins[pin]); val |= K210_PC_MODE_OUT; diff --git a/drivers/pinctrl/pinctrl-microchip-sgpio.c b/drivers/pinctrl/pinctrl-microchip-sgpio.c index 069cc665a4023a..b6363f3cdce94e 100644 --- a/drivers/pinctrl/pinctrl-microchip-sgpio.c +++ b/drivers/pinctrl/pinctrl-microchip-sgpio.c @@ -371,7 +371,7 @@ static int sgpio_pinconf_get(struct pinctrl_dev *pctldev, val = !bank->is_input; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (bank->is_input) return -EINVAL; val = sgpio_output_get(priv, &addr); @@ -402,7 +402,7 @@ static int sgpio_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, arg = pinconf_to_config_argument(configs[cfg]); switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (bank->is_input) return -EINVAL; err = sgpio_output_set(priv, &addr, arg); diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c index b82bf83fed25b6..70da3f37567a5c 100644 --- a/drivers/pinctrl/pinctrl-ocelot.c +++ b/drivers/pinctrl/pinctrl-ocelot.c @@ -1656,7 +1656,7 @@ static int ocelot_pinconf_get(struct pinctrl_dev *pctldev, return err; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = regmap_read(info->map, REG(OCELOT_GPIO_OUT, info, pin), &val); if (err) @@ -1735,7 +1735,7 @@ static int ocelot_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: p = pin % 32; if (arg) regmap_write(info->map, diff --git a/drivers/pinctrl/pinctrl-pic32.c b/drivers/pinctrl/pinctrl-pic32.c index 37c2bf752154d4..e8b481e87c7792 100644 --- a/drivers/pinctrl/pinctrl-pic32.c +++ b/drivers/pinctrl/pinctrl-pic32.c @@ -1905,7 +1905,7 @@ static int pic32_pinconf_get(struct pinctrl_dev *pctldev, unsigned pin, case PIN_CONFIG_INPUT_ENABLE: arg = !!(readl(bank->reg_base + TRIS_REG) & mask); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = !(readl(bank->reg_base + TRIS_REG) & mask); break; default: @@ -1960,7 +1960,7 @@ static int pic32_pinconf_set(struct pinctrl_dev *pctldev, unsigned pin, case PIN_CONFIG_INPUT_ENABLE: pic32_gpio_direction_input(&bank->gpio_chip, offset); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pic32_gpio_direction_output(&bank->gpio_chip, offset, arg); break; diff --git a/drivers/pinctrl/pinctrl-rk805.c b/drivers/pinctrl/pinctrl-rk805.c index 3acf770316c1aa..22f576337faa9c 100644 --- a/drivers/pinctrl/pinctrl-rk805.c +++ b/drivers/pinctrl/pinctrl-rk805.c @@ -541,7 +541,7 @@ static int rk805_pinconf_get(struct 
pinctrl_dev *pctldev, u32 arg = 0; switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: arg = rk805_gpio_get(&pci->gpio_chip, pin); break; @@ -568,7 +568,7 @@ static int rk805_pinconf_set(struct pinctrl_dev *pctldev, arg = pinconf_to_config_argument(configs[i]); switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rk805_gpio_set(&pci->gpio_chip, pin, arg); rk805_pmx_gpio_set_direction(pctldev, NULL, pin, false); break; diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 930c454e0cec7d..7a68a6237649c1 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -3272,7 +3272,7 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, param = pinconf_to_config_param(configs[i]); arg = pinconf_to_config_argument(configs[i]); - if (param == PIN_CONFIG_OUTPUT || param == PIN_CONFIG_INPUT_ENABLE) { + if (param == PIN_CONFIG_LEVEL || param == PIN_CONFIG_INPUT_ENABLE) { /* * Check for gpio driver not being probed yet. * The lock makes sure that either gpio-probe has completed @@ -3313,7 +3313,7 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, if (rc) return rc; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rc = rockchip_set_mux(bank, pin - bank->pin_base, RK_FUNC_GPIO); if (rc != RK_FUNC_GPIO) @@ -3392,7 +3392,7 @@ static int rockchip_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rc = rockchip_get_mux(bank, pin - bank->pin_base); if (rc != RK_FUNC_GPIO) return -EINVAL; diff --git a/drivers/pinctrl/pinctrl-rp1.c b/drivers/pinctrl/pinctrl-rp1.c index 605105aa0cbc7f..ffc2f0b460a641 100644 --- a/drivers/pinctrl/pinctrl-rp1.c +++ b/drivers/pinctrl/pinctrl-rp1.c @@ -1440,7 +1440,7 @@ static int rp1_pinconf_set(struct pinctrl_dev *pctldev, unsigned int offset, rp1_output_enable(pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rp1_set_value(pin, arg); rp1_set_dir(pin, RP1_DIR_OUTPUT); rp1_set_fsel(pin, RP1_FSEL_GPIO); diff --git a/drivers/pinctrl/pinctrl-scmi.c b/drivers/pinctrl/pinctrl-scmi.c index 383681041e4c05..d14528b9aa31ef 100644 --- a/drivers/pinctrl/pinctrl-scmi.c +++ b/drivers/pinctrl/pinctrl-scmi.c @@ -253,7 +253,7 @@ static int pinctrl_scmi_map_pinconf_type(enum pin_config_param param, case PIN_CONFIG_MODE_LOW_POWER: *type = SCMI_PIN_LOW_POWER_MODE; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: *type = SCMI_PIN_OUTPUT_VALUE; break; case PIN_CONFIG_OUTPUT_ENABLE: diff --git a/drivers/pinctrl/pinctrl-stmfx.c b/drivers/pinctrl/pinctrl-stmfx.c index c89b99003b7111..03ee13844b5073 100644 --- a/drivers/pinctrl/pinctrl-stmfx.c +++ b/drivers/pinctrl/pinctrl-stmfx.c @@ -267,7 +267,7 @@ static int stmfx_pinconf_get(struct pinctrl_dev *pctldev, if ((!dir && !type) || (dir && type)) arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (dir) return -EINVAL; @@ -334,7 +334,7 @@ static int stmfx_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, if (ret) return ret; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = stmfx_gpio_direction_output(&pctl->gpio_chip, pin, arg); if (ret) diff --git a/drivers/pinctrl/pinctrl-sx150x.c b/drivers/pinctrl/pinctrl-sx150x.c index b613568b42b73d..1d6760ffe809a2 100644 --- a/drivers/pinctrl/pinctrl-sx150x.c +++ b/drivers/pinctrl/pinctrl-sx150x.c @@ -611,7 +611,7 @@ static int sx150x_pinconf_get(struct pinctrl_dev *pctldev, unsigned 
int pin, if (sx150x_pin_is_oscio(pctl, pin)) { switch (param) { case PIN_CONFIG_DRIVE_PUSH_PULL: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = regmap_read(pctl->regmap, pctl->data->pri.x789.reg_clock, &data); @@ -705,7 +705,7 @@ static int sx150x_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, } break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = sx150x_gpio_get_direction(&pctl->gpio, pin); if (ret < 0) return ret; @@ -744,7 +744,7 @@ static int sx150x_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, arg = pinconf_to_config_argument(configs[i]); if (sx150x_pin_is_oscio(pctl, pin)) { - if (param == PIN_CONFIG_OUTPUT) { + if (param == PIN_CONFIG_LEVEL) { ret = sx150x_gpio_direction_output(&pctl->gpio, pin, arg); if (ret < 0) @@ -816,7 +816,7 @@ static int sx150x_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = sx150x_gpio_direction_output(&pctl->gpio, pin, arg); if (ret < 0) diff --git a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c index 54c77e0b96e91d..28eb92a2b249fe 100644 --- a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c +++ b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c @@ -174,7 +174,7 @@ static int lpi_config_get(struct pinctrl_dev *pctldev, arg = 1; break; case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (is_out) arg = 1; break; @@ -252,7 +252,7 @@ static int lpi_config_set(struct pinctrl_dev *pctldev, unsigned int group, case PIN_CONFIG_INPUT_ENABLE: output_enabled = false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: output_enabled = true; value = arg; break; @@ -314,7 +314,7 @@ static int lpi_gpio_direction_output(struct gpio_chip *chip, struct lpi_pinctrl *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return lpi_config_set(state->ctrl, pin, &config, 1); } @@ -332,7 +332,7 @@ static int lpi_gpio_set(struct gpio_chip *chip, unsigned int pin, int value) struct lpi_pinctrl *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return lpi_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 1751d838ce95d6..67525d542c5b7b 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -295,7 +295,7 @@ static int msm_config_reg(struct msm_pinctrl *pctrl, *bit = g->drv_bit; *mask = 7; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: case PIN_CONFIG_OUTPUT_ENABLE: *bit = g->oe_bit; @@ -385,7 +385,7 @@ static int msm_config_group_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_DRIVE_STRENGTH: arg = msm_regval_to_drive(arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* Pin is not output */ if (!arg) return -EINVAL; @@ -464,7 +464,7 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev, else arg = (arg / 2) - 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* set output value */ raw_spin_lock_irqsave(&pctrl->lock, flags); val = msm_readl_io(pctrl, g); diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index b7b15874e488a1..485b68cc93f8ed 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ 
b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -438,7 +438,7 @@ static int pmic_gpio_config_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_OUTPUT_ENABLE: arg = pad->output_enabled; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pad->out_value; break; case PMIC_GPIO_CONF_PULL_UP: @@ -530,7 +530,7 @@ static int pmic_gpio_config_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: pad->output_enabled = arg ? true : false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pad->output_enabled = true; pad->out_value = arg; break; @@ -737,7 +737,7 @@ static int pmic_gpio_direction_output(struct gpio_chip *chip, struct pmic_gpio_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return pmic_gpio_config_set(state->ctrl, pin, &config, 1); } @@ -769,7 +769,7 @@ static int pmic_gpio_set(struct gpio_chip *chip, unsigned int pin, int value) struct pmic_gpio_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return pmic_gpio_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c b/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c index 22d76b1013a313..64f8024a865cd3 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c @@ -370,7 +370,7 @@ static int pmic_mpp_config_get(struct pinctrl_dev *pctldev, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pad->out_value; break; case PMIC_MPP_CONF_DTEST_SELECTOR: @@ -447,7 +447,7 @@ static int pmic_mpp_config_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_INPUT_ENABLE: pad->input_enabled = arg ? 
true : false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pad->output_enabled = true; pad->out_value = arg; break; @@ -576,7 +576,7 @@ static int pmic_mpp_direction_output(struct gpio_chip *chip, struct pmic_mpp_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return pmic_mpp_config_set(state->ctrl, pin, &config, 1); } @@ -605,7 +605,7 @@ static int pmic_mpp_set(struct gpio_chip *chip, unsigned int pin, int value) struct pmic_mpp_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return pmic_mpp_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c index fb37b1c1acb41f..5c966d51eda7bb 100644 --- a/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c @@ -282,7 +282,7 @@ static int pm8xxx_pin_config_get(struct pinctrl_dev *pctldev, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (pin->mode & PM8XXX_GPIO_MODE_OUTPUT) arg = pin->output_value; else @@ -364,7 +364,7 @@ static int pm8xxx_pin_config_set(struct pinctrl_dev *pctldev, pin->mode = PM8XXX_GPIO_MODE_INPUT; banks |= BIT(0) | BIT(1); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pin->mode = PM8XXX_GPIO_MODE_OUTPUT; pin->output_value = !!arg; banks |= BIT(0) | BIT(1); diff --git a/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c b/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c index 6103849af042d6..7970fa6e15579e 100644 --- a/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c +++ b/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c @@ -337,7 +337,7 @@ static int pm8xxx_pin_config_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_INPUT_ENABLE: arg = pin->input; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pin->output_value; break; case PIN_CONFIG_POWER_SOURCE: @@ -392,7 +392,7 @@ static int pm8xxx_pin_config_set(struct pinctrl_dev *pctldev, case PIN_CONFIG_INPUT_ENABLE: pin->input = true; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pin->output = true; pin->output_value = !!arg; break; diff --git a/drivers/pinctrl/renesas/pinctrl-rza1.c b/drivers/pinctrl/renesas/pinctrl-rza1.c index 70f22e0ef307a0..f24e5915cbe4b4 100644 --- a/drivers/pinctrl/renesas/pinctrl-rza1.c +++ b/drivers/pinctrl/renesas/pinctrl-rza1.c @@ -933,7 +933,7 @@ static int rza1_parse_pinmux_node(struct rza1_pinctrl *rza1_pctl, case PIN_CONFIG_INPUT_ENABLE: pinmux_flags |= MUX_FLAGS_SWIO_INPUT; break; - case PIN_CONFIG_OUTPUT: /* for DT backwards compatibility */ + case PIN_CONFIG_LEVEL: /* for DT backwards compatibility */ case PIN_CONFIG_OUTPUT_ENABLE: pinmux_flags |= MUX_FLAGS_SWIO_OUTPUT; break; diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 823c8fe758e2c0..3ebb468de830db 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -1236,7 +1236,7 @@ static int stm32_pconf_parse_conf(struct pinctrl_dev *pctldev, case PIN_CONFIG_BIAS_PULL_DOWN: ret = stm32_pconf_set_bias(bank, offset, 2); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: __stm32_gpio_set(bank, offset, arg); ret = stm32_pmx_gpio_set_direction(pctldev, range, pin, false); break; diff --git a/drivers/pinctrl/sunplus/sppctl.c b/drivers/pinctrl/sunplus/sppctl.c index 
3e924aa86cc2fa..fabe7efaa837a4 100644 --- a/drivers/pinctrl/sunplus/sppctl.c +++ b/drivers/pinctrl/sunplus/sppctl.c @@ -488,7 +488,7 @@ static int sppctl_gpio_set_config(struct gpio_chip *chip, unsigned int offset, case PIN_CONFIG_INPUT_ENABLE: break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: return sppctl_gpio_direction_output(chip, offset, 0); case PIN_CONFIG_PERSIST_STATE: @@ -580,7 +580,7 @@ static int sppctl_pin_config_get(struct pinctrl_dev *pctldev, unsigned int pin, arg = 0; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (!sppctl_first_get(&pctl->spp_gchip->chip, pin)) return -EINVAL; if (!sppctl_master_get(&pctl->spp_gchip->chip, pin)) diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 1bcf071b860ebb..d9245ecec71dc6 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -88,9 +88,13 @@ struct pinctrl_map; * passed in the argument on a custom form, else just use argument 1 * to indicate low power mode, argument 0 turns low power mode off. * @PIN_CONFIG_MODE_PWM: this will configure the pin for PWM - * @PIN_CONFIG_OUTPUT: this will configure the pin as an output and drive a - * value on the line. Use argument 1 to indicate high level, argument 0 to - * indicate low level. (Please see Documentation/driver-api/pin-control.rst, + * @PIN_CONFIG_LEVEL: setting this will configure the pin as an output and + * drive a value on the line. Use argument 1 to indicate high level, + * argument 0 to indicate low level. Conversely, the value of the line + * can be read using this parameter, if and only if that value can be + * represented as a binary 0 or 1 where 0 indicates a low voltage level + * and 1 indicates a high voltage level. + * (Please see Documentation/driver-api/pin-control.rst, * section "GPIO mode pitfalls" for a discussion around this parameter.) * @PIN_CONFIG_OUTPUT_ENABLE: this will enable the pin's output mode * without driving a value there. For most platforms this reduces to @@ -137,7 +141,7 @@ enum pin_config_param { PIN_CONFIG_INPUT_SCHMITT_UV, PIN_CONFIG_MODE_LOW_POWER, PIN_CONFIG_MODE_PWM, - PIN_CONFIG_OUTPUT, + PIN_CONFIG_LEVEL, PIN_CONFIG_OUTPUT_ENABLE, PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, PIN_CONFIG_PERSIST_STATE, diff --git a/sound/hda/codecs/side-codecs/cirrus_scodec_test.c b/sound/hda/codecs/side-codecs/cirrus_scodec_test.c index 9ba14c09c07ffe..3cca750857b689 100644 --- a/sound/hda/codecs/side-codecs/cirrus_scodec_test.c +++ b/sound/hda/codecs/side-codecs/cirrus_scodec_test.c @@ -69,7 +69,7 @@ static int cirrus_scodec_test_gpio_set_config(struct gpio_chip *gc, unsigned long config) { switch (pinconf_to_config_param(config)) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_OUTPUT_ENABLE: return -EOPNOTSUPP; default: From b39e9199fbe1df6c8c510c52c24b5db35020e705 Mon Sep 17 00:00:00 2001 From: Sean Parker Date: Thu, 4 Sep 2025 10:04:26 -0700 Subject: [PATCH 1133/3206] pinctrl: qcom: sm8250: Add egpio support This mirrors the egpio support added to sc7280/sm8450/etc. This change is necessary for GPIOs 146-179 (34 GPIOs) to be used as normal GPIOs.
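For readers new to the egpio scheme, the standalone sketch below models what the two new bits do. The bit positions (egpio_present = 11, egpio_enable = 12) and the function index (.egpio_func = 9) are taken from this patch; the claim/hand-off direction mirrors what the sc7280/sm8450 support it references is understood to do, so treat it as an illustrative assumption rather than a copy of pinctrl-msm.c.

/* egpio_demo.c - toy model of the TLMM egpio handover */
#include <stdio.h>

#define BIT(n)        (1u << (n))
#define EGPIO_PRESENT 11	/* read-only: pin is egpio-capable (from this patch) */
#define EGPIO_ENABLE  12	/* set: the APSS TLMM owns the pin (assumed direction) */
#define MUX_SHIFT     2		/* .mux_bit */
#define MUX_MASK      (0xfu << MUX_SHIFT)
#define EGPIO_FUNC    9		/* .egpio_func added for sm8250 */

/* Compute the new ctl register value when muxing a function on one pin. */
static unsigned int apply_mux(unsigned int ctl, unsigned int func)
{
	if (func == EGPIO_FUNC) {
		/* Selecting "egpio" hands the pin over to the other subsystem. */
		if (ctl & BIT(EGPIO_PRESENT))
			ctl &= ~BIT(EGPIO_ENABLE);
	} else {
		ctl = (ctl & ~MUX_MASK) | (func << MUX_SHIFT);
		/* Claim the pin back so it behaves as a normal TLMM GPIO. */
		if (ctl & BIT(EGPIO_PRESENT))
			ctl |= BIT(EGPIO_ENABLE);
	}
	return ctl;
}

int main(void)
{
	unsigned int ctl = BIT(EGPIO_PRESENT);	/* gpio146-179 advertise egpio */

	printf("gpio function:  ctl=%#x\n", apply_mux(ctl, 0));
	printf("egpio function: ctl=%#x\n", apply_mux(ctl, EGPIO_FUNC));
	return 0;
}

In this model, muxing any ordinary function on an egpio-capable pin sets egpio_enable, which is exactly why the patch is needed for gpio146-179 to work as normal GPIOs.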
Signed-off-by: Sean Parker Reviewed-by: Konrad Dybcio Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-sm8250.c | 81 ++++++++++++++++----------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-sm8250.c b/drivers/pinctrl/qcom/pinctrl-sm8250.c index 6021d9f6e407ef..f05361f3100db5 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8250.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8250.c @@ -49,6 +49,8 @@ enum { .mux_bit = 2, \ .pull_bit = 0, \ .drv_bit = 6, \ + .egpio_enable = 12, \ + .egpio_present = 11, \ .oe_bit = 9, \ .in_bit = 0, \ .out_bit = 1, \ @@ -511,6 +513,7 @@ enum sm8250_functions { msm_mux_ddr_pxi2, msm_mux_ddr_pxi3, msm_mux_dp_hot, + msm_mux_egpio, msm_mux_dp_lcd, msm_mux_gcc_gp1, msm_mux_gcc_gp2, @@ -830,6 +833,14 @@ static const char * const gpio_groups[] = { "gpio171", "gpio172", "gpio173", "gpio174", "gpio175", "gpio176", "gpio177", "gpio178", "gpio179", }; +static const char * const egpio_groups[] = { + "gpio146", "gpio147", "gpio148", "gpio149", "gpio150", "gpio151", + "gpio152", "gpio153", "gpio154", "gpio155", "gpio156", "gpio157", + "gpio158", "gpio159", "gpio160", "gpio161", "gpio162", "gpio163", + "gpio164", "gpio165", "gpio166", "gpio167", "gpio168", "gpio169", + "gpio170", "gpio171", "gpio172", "gpio173", "gpio174", "gpio175", + "gpio176", "gpio177", "gpio178", "gpio179", +}; static const char * const qdss_cti_groups[] = { "gpio0", "gpio2", "gpio2", "gpio44", "gpio45", "gpio46", "gpio92", "gpio93", @@ -1018,6 +1029,7 @@ static const struct pinfunction sm8250_functions[] = { MSM_PIN_FUNCTION(ddr_pxi3), MSM_PIN_FUNCTION(dp_hot), MSM_PIN_FUNCTION(dp_lcd), + MSM_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), @@ -1265,40 +1277,40 @@ static const struct msm_pingroup sm8250_groups[] = { [143] = PINGROUP(143, WEST, lpass_slimbus, mi2s1_data0, ddr_bist, _, _, _, _, _, _), [144] = PINGROUP(144, WEST, lpass_slimbus, mi2s1_data1, ddr_bist, _, _, _, _, _, _), [145] = PINGROUP(145, WEST, lpass_slimbus, mi2s1_ws, _, _, _, _, _, _, _), - [146] = PINGROUP(146, WEST, _, _, _, _, _, _, _, _, _), - [147] = PINGROUP(147, WEST, _, _, _, _, _, _, _, _, _), - [148] = PINGROUP(148, WEST, _, _, _, _, _, _, _, _, _), - [149] = PINGROUP(149, WEST, _, _, _, _, _, _, _, _, _), - [150] = PINGROUP(150, WEST, _, _, _, _, _, _, _, _, _), - [151] = PINGROUP(151, WEST, _, _, _, _, _, _, _, _, _), - [152] = PINGROUP(152, WEST, _, _, _, _, _, _, _, _, _), - [153] = PINGROUP(153, WEST, _, _, _, _, _, _, _, _, _), - [154] = PINGROUP(154, WEST, _, _, _, _, _, _, _, _, _), - [155] = PINGROUP(155, WEST, _, _, _, _, _, _, _, _, _), - [156] = PINGROUP(156, WEST, _, _, _, _, _, _, _, _, _), - [157] = PINGROUP(157, WEST, _, _, _, _, _, _, _, _, _), - [158] = PINGROUP(158, WEST, _, _, _, _, _, _, _, _, _), - [159] = PINGROUP(159, WEST, cri_trng0, _, _, _, _, _, _, _, _), - [160] = PINGROUP(160, WEST, cri_trng1, qdss_gpio, _, _, _, _, _, _, _), - [161] = PINGROUP(161, WEST, cri_trng, qdss_gpio, _, _, _, _, _, _, _), - [162] = PINGROUP(162, WEST, sp_cmu, qdss_gpio, _, _, _, _, _, _, _), - [163] = PINGROUP(163, WEST, prng_rosc, qdss_gpio, _, _, _, _, _, _, _), - [164] = PINGROUP(164, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [165] = PINGROUP(165, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [166] = PINGROUP(166, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [167] = PINGROUP(167, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [168] = PINGROUP(168, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [169] = PINGROUP(169, WEST, 
qdss_gpio, _, _, _, _, _, _, _, _), - [170] = PINGROUP(170, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [171] = PINGROUP(171, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [172] = PINGROUP(172, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [173] = PINGROUP(173, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [174] = PINGROUP(174, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [175] = PINGROUP(175, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [176] = PINGROUP(176, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [177] = PINGROUP(177, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [178] = PINGROUP(178, WEST, _, _, _, _, _, _, _, _, _), - [179] = PINGROUP(179, WEST, _, _, _, _, _, _, _, _, _), + [146] = PINGROUP(146, WEST, _, _, _, _, _, _, _, _, egpio), + [147] = PINGROUP(147, WEST, _, _, _, _, _, _, _, _, egpio), + [148] = PINGROUP(148, WEST, _, _, _, _, _, _, _, _, egpio), + [149] = PINGROUP(149, WEST, _, _, _, _, _, _, _, _, egpio), + [150] = PINGROUP(150, WEST, _, _, _, _, _, _, _, _, egpio), + [151] = PINGROUP(151, WEST, _, _, _, _, _, _, _, _, egpio), + [152] = PINGROUP(152, WEST, _, _, _, _, _, _, _, _, egpio), + [153] = PINGROUP(153, WEST, _, _, _, _, _, _, _, _, egpio), + [154] = PINGROUP(154, WEST, _, _, _, _, _, _, _, _, egpio), + [155] = PINGROUP(155, WEST, _, _, _, _, _, _, _, _, egpio), + [156] = PINGROUP(156, WEST, _, _, _, _, _, _, _, _, egpio), + [157] = PINGROUP(157, WEST, _, _, _, _, _, _, _, _, egpio), + [158] = PINGROUP(158, WEST, _, _, _, _, _, _, _, _, egpio), + [159] = PINGROUP(159, WEST, cri_trng0, _, _, _, _, _, _, _, egpio), + [160] = PINGROUP(160, WEST, cri_trng1, qdss_gpio, _, _, _, _, _, _, egpio), + [161] = PINGROUP(161, WEST, cri_trng, qdss_gpio, _, _, _, _, _, _, egpio), + [162] = PINGROUP(162, WEST, sp_cmu, qdss_gpio, _, _, _, _, _, _, egpio), + [163] = PINGROUP(163, WEST, prng_rosc, qdss_gpio, _, _, _, _, _, _, egpio), + [164] = PINGROUP(164, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [165] = PINGROUP(165, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [166] = PINGROUP(166, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [167] = PINGROUP(167, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [168] = PINGROUP(168, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [169] = PINGROUP(169, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [170] = PINGROUP(170, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [171] = PINGROUP(171, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [172] = PINGROUP(172, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [173] = PINGROUP(173, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [174] = PINGROUP(174, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [175] = PINGROUP(175, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [176] = PINGROUP(176, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [177] = PINGROUP(177, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [178] = PINGROUP(178, WEST, _, _, _, _, _, _, _, _, egpio), + [179] = PINGROUP(179, WEST, _, _, _, _, _, _, _, _, egpio), [180] = UFS_RESET(ufs_reset, 0xb8000), [181] = SDC_PINGROUP(sdc2_clk, 0xb7000, 14, 6), [182] = SDC_PINGROUP(sdc2_cmd, 0xb7000, 11, 3), @@ -1333,6 +1345,7 @@ static const struct msm_pinctrl_soc_data sm8250_pinctrl = { .ntiles = ARRAY_SIZE(sm8250_tiles), .wakeirq_map = sm8250_pdc_map, .nwakeirq_map = ARRAY_SIZE(sm8250_pdc_map), + .egpio_func = 9, }; static int sm8250_pinctrl_probe(struct platform_device *pdev) From 3c0979c64481f07a6bf9c1775601845ac5fa57f3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Jul 2025 13:27:45 +0100 Subject: [PATCH 1134/3206] arm64/sme: Drop 
inaccurate documentation of streaming mode switches The SME ABI documentation contains an inaccurate description of the architectural streaming mode entry/exit behaviour; just remove it, since this is better documented by the architecture or with the rest of the documentation for the specific software interfaces concerned. Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- Documentation/arch/arm64/sme.rst | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst index 4cb38330e7046b..583f2ee9cb9775 100644 --- a/Documentation/arch/arm64/sme.rst +++ b/Documentation/arch/arm64/sme.rst @@ -81,17 +81,7 @@ The ZA matrix is square with each side having as many bytes as a streaming mode SVE vector. -3. Sharing of streaming and non-streaming mode SVE state ---------------------------------------------------------- - -It is implementation defined which if any parts of the SVE state are shared -between streaming and non-streaming modes. When switching between modes -via software interfaces such as ptrace if no register content is provided as -part of switching no state will be assumed to be shared and everything will -be zeroed. - - -4. System call behaviour +3. System call behaviour ------------------------- * On syscall PSTATE.ZA is preserved, if PSTATE.ZA==1 then the contents of the @@ -112,7 +102,7 @@ be zeroed. exceptions for execve() described in section 6. -5. Signal handling +4. Signal handling ------------------- * Signal handlers are invoked with PSTATE.SM=0, PSTATE.ZA=0, and TPIDR2_EL0=0. From 87ebcd8baebf93b9d763dba5ee31e8fda62daec6 Mon Sep 17 00:00:00 2001 From: Pankaj Patil Date: Fri, 5 Sep 2025 20:40:19 +0530 Subject: [PATCH 1135/3206] dt-bindings: pinctrl: qcom: Add Glymur pinctrl Add DeviceTree binding for the Glymur SoC TLMM block. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Pankaj Patil Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,glymur-tlmm.yaml | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/qcom,glymur-tlmm.yaml diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,glymur-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,glymur-tlmm.yaml new file mode 100644 index 00000000000000..d2b0cfeffb501e --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/qcom,glymur-tlmm.yaml @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/qcom,glymur-tlmm.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm Technologies, Inc. Glymur TLMM block + +maintainers: + - Bjorn Andersson + +description: + Top Level Mode Multiplexer pin controller in Qualcomm Glymur SoC. + +allOf: + - $ref: /schemas/pinctrl/qcom,tlmm-common.yaml# + +properties: + compatible: + const: qcom,glymur-tlmm + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + gpio-reserved-ranges: + minItems: 1 + maxItems: 125 + + gpio-line-names: + maxItems: 250 + +patternProperties: + "-state$": + oneOf: + - $ref: "#/$defs/qcom-glymur-tlmm-state" + - patternProperties: + "-pins$": + $ref: "#/$defs/qcom-glymur-tlmm-state" + additionalProperties: false + +$defs: + qcom-glymur-tlmm-state: + type: object + description: + Pinctrl node's client devices use subnodes for desired pin configuration. + Client device subnodes use the standard properties below.
+ $ref: qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state + unevaluatedProperties: false + + properties: + pins: + description: + List of gpio pins affected by the properties specified in this + subnode. + items: + oneOf: + - pattern: "^gpio([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9])$" + - enum: [ ufs_reset, sdc2_clk, sdc2_cmd, sdc2_data ] + minItems: 1 + maxItems: 36 + + function: + description: + Specify the alternative function to be configured for the specified + pins. + enum: [ gpio, resout_gpio_n, aoss_cti, asc_cci, atest_char, atest_usb, + audio_ext_mclk0, audio_ext_mclk1, audio_ref_clk, cam_asc_mclk4, + cam_mclk, cci_async_in, cci_i2c_scl, cci_i2c_sda, cci_timer, + cmu_rng, cri_trng, dbg_out_clk, ddr_bist_complete, + ddr_bist_fail, ddr_bist_start, ddr_bist_stop, ddr_pxi, + edp0_hot, edp0_lcd, edp1_lcd, egpio, eusb0_ac_en, eusb1_ac_en, + eusb2_ac_en, eusb3_ac_en, eusb5_ac_en, eusb6_ac_en, gcc_gp1, + gcc_gp2, gcc_gp3, host2wlan_sol, i2c0_s_scl, i2c0_s_sda, + i2s0_data, i2s0_sck, i2s0_ws, i2s1_data, i2s1_sck, i2s1_ws, + ibi_i3c, jitter_bist, mdp_vsync_out, mdp_vsync_e, mdp_vsync_p, + mdp_vsync_s, pcie3a_clk, pcie3a_rst_n, pcie3b_clk, + pcie4_clk_req_n, pcie5_clk_req_n, pcie6_clk_req_n, phase_flag, + pll_bist_sync, pll_clk_aux, pmc_oca_n, pmc_uva_n, prng_rosc, + qdss_cti, qdss_gpio, qspi, qup0_se0, qup0_se1, qup0_se2, + qup0_se3_l0, qup0_se3, qup0_se4, qup0_se5, qup0_se6, qup0_se7, + qup1_se0, qup1_se1, qup1_se2, qup1_se3, qup1_se4, qup1_se5, + qup1_se6, qup1_se7, qup2_se0, qup2_se1, qup2_se2, qup2_se3, + qup2_se4, qup2_se5, qup2_se6, qup2_se7, qup3_se0, qup3_se1, + sd_write_protect, sdc4_clk, sdc4_cmd, sdc4_data, smb_acok_n, + sys_throttle, tb_trig_sdc2, tb_trig_sdc4, tmess_prng, + tsense_pwm, tsense_therm, usb0_dp, usb0_phy_ps, usb0_sbrx, + usb0_sbtx, usb0_tmu, usb1_dbg, usb1_dp, usb1_phy_ps, usb1_sbrx, + usb1_sbtx, usb1_tmu, usb2_dp, usb2_phy_ps, usb2_sbrx, usb2_sbtx, + usb2_tmu, vsense_trigger_mirnat, wcn_sw, wcn_sw_ctrl ] + + required: + - pins + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + #include + tlmm: pinctrl@f100000 { + compatible = "qcom,glymur-tlmm"; + reg = <0x0f100000 0xf00000>; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + gpio-ranges = <&tlmm 0 0 249>; + wakeup-parent = <&pdc>; + gpio-reserved-ranges = <4 4>, <10 2>, <33 3>, <44 4>; + qup_uart21_default: qup-uart21-default-state { + tx-pins { + pins = "gpio86"; + function = "qup2_se5"; + drive-strength = <2>; + bias-disable; + }; + + rx-pins { + pins = "gpio87"; + function = "qup2_se5"; + drive-strength = <2>; + bias-disable; + }; + }; + }; +... From 87ebcd8baebf93b9d763dba5ee31e8fda62daec6 Mon Sep 17 00:00:00 2001 From: Pankaj Patil Date: Fri, 5 Sep 2025 20:40:20 +0530 Subject: [PATCH 1136/3206] pinctrl: qcom: Add glymur pinctrl driver Add TLMM pinctrl driver to support pin configuration with pinctrl framework for Glymur SoC. 
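A note on the register layout the new driver encodes: each TLMM pin group occupies its own 0x1000-byte window, and the PINGROUP() macro in the file below derives every per-pin register address from the GPIO index alone. The following standalone sketch reproduces that arithmetic; REG_SIZE and the offsets come straight from the macro, while the choice of gpio86 is only illustrative (it is the pin used in the binding example).

/* offsets_demo.c - the offset arithmetic behind PINGROUP() */
#include <stdio.h>

#define REG_SIZE 0x1000u	/* one register window per pin group */

struct tlmm_regs {
	unsigned int ctl;		/* mux, pull, drive, oe, egpio bits */
	unsigned int io;		/* input/output value */
	unsigned int intr_cfg;		/* interrupt detection/polarity */
	unsigned int intr_status;	/* interrupt status */
};

/* Mirror of the .ctl_reg/.io_reg/... initializers in PINGROUP(). */
static struct tlmm_regs tlmm_offsets(unsigned int gpio)
{
	struct tlmm_regs r = {
		.ctl         = REG_SIZE * gpio,
		.io          = 0x4 + REG_SIZE * gpio,
		.intr_cfg    = 0x8 + REG_SIZE * gpio,
		.intr_status = 0xc + REG_SIZE * gpio,
	};
	return r;
}

int main(void)
{
	struct tlmm_regs r = tlmm_offsets(86);

	printf("gpio86: ctl=%#x io=%#x intr_cfg=%#x intr_status=%#x\n",
	       r.ctl, r.io, r.intr_cfg, r.intr_status);
	return 0;
}

This stride-based scheme is what lets the table of 250 PINCTRL_PIN() entries below stand in for per-pin register maps.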
Reviewed-by: Bjorn Andersson Reviewed-by: Konrad Dybcio Signed-off-by: Pankaj Patil Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/Kconfig.msm | 10 + drivers/pinctrl/qcom/Makefile | 1 + drivers/pinctrl/qcom/pinctrl-glymur.c | 1777 +++++++++++++++++++++++++ 3 files changed, 1788 insertions(+) create mode 100644 drivers/pinctrl/qcom/pinctrl-glymur.c diff --git a/drivers/pinctrl/qcom/Kconfig.msm b/drivers/pinctrl/qcom/Kconfig.msm index 6dad942b00a35f..69a5b47adedc2a 100644 --- a/drivers/pinctrl/qcom/Kconfig.msm +++ b/drivers/pinctrl/qcom/Kconfig.msm @@ -15,6 +15,16 @@ config PINCTRL_APQ8084 This is the pinctrl, pinmux, pinconf and gpiolib driver for the Qualcomm TLMM block found in the Qualcomm APQ8084 platform. +config PINCTRL_GLYMUR + tristate "Qualcomm Technologies Inc Glymur pin controller driver" + depends on ARM64 || COMPILE_TEST + help + This is the pinctrl, pinmux, pinconf and gpiolib driver for the + Qualcomm Technologies Inc Top Level Mode Multiplexer (TLMM) + block found on the Qualcomm Technologies Inc Glymur platform. + Say Y here to compile statically, or M here to compile it as a module. + If unsure, say N. + config PINCTRL_IPQ4019 tristate "Qualcomm IPQ4019 pin controller driver" depends on ARM || COMPILE_TEST diff --git a/drivers/pinctrl/qcom/Makefile b/drivers/pinctrl/qcom/Makefile index 2acff520a285a4..46ab3486c3ffcd 100644 --- a/drivers/pinctrl/qcom/Makefile +++ b/drivers/pinctrl/qcom/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_PINCTRL_MSM) += pinctrl-msm.o obj-$(CONFIG_PINCTRL_APQ8064) += pinctrl-apq8064.o obj-$(CONFIG_PINCTRL_APQ8084) += pinctrl-apq8084.o +obj-$(CONFIG_PINCTRL_GLYMUR) += pinctrl-glymur.o obj-$(CONFIG_PINCTRL_IPQ4019) += pinctrl-ipq4019.o obj-$(CONFIG_PINCTRL_IPQ5018) += pinctrl-ipq5018.o obj-$(CONFIG_PINCTRL_IPQ8064) += pinctrl-ipq8064.o diff --git a/drivers/pinctrl/qcom/pinctrl-glymur.c b/drivers/pinctrl/qcom/pinctrl-glymur.c new file mode 100644 index 00000000000000..9913f98e953110 --- /dev/null +++ b/drivers/pinctrl/qcom/pinctrl-glymur.c @@ -0,0 +1,1777 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025 Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */ + +#include +#include +#include +#include +#include + +#include "pinctrl-msm.h" + +#define REG_SIZE 0x1000 +#define PINGROUP(id, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11) \ + { \ + .grp = PINCTRL_PINGROUP("gpio" #id, \ + gpio##id##_pins, \ + ARRAY_SIZE(gpio##id##_pins)), \ + .ctl_reg = REG_SIZE * id, \ + .io_reg = 0x4 + REG_SIZE * id, \ + .intr_cfg_reg = 0x8 + REG_SIZE * id, \ + .intr_status_reg = 0xc + REG_SIZE * id, \ + .intr_target_reg = 0x8 + REG_SIZE * id, \ + .mux_bit = 2, \ + .pull_bit = 0, \ + .drv_bit = 6, \ + .egpio_enable = 12, \ + .egpio_present = 11, \ + .oe_bit = 9, \ + .in_bit = 0, \ + .out_bit = 1, \ + .intr_enable_bit = 0, \ + .intr_status_bit = 0, \ + .intr_target_bit = 5, \ + .intr_target_kpss_val = 3, \ + .intr_raw_status_bit = 4, \ + .intr_polarity_bit = 1, \ + .intr_detection_bit = 2, \ + .intr_detection_width = 2, \ + .funcs = (int[]){ \ + msm_mux_gpio, /* gpio mode */ \ + msm_mux_##f1, \ + msm_mux_##f2, \ + msm_mux_##f3, \ + msm_mux_##f4, \ + msm_mux_##f5, \ + msm_mux_##f6, \ + msm_mux_##f7, \ + msm_mux_##f8, \ + msm_mux_##f9, \ + msm_mux_##f10, \ + msm_mux_##f11 /* egpio mode */ \ + }, \ + .nfuncs = 12, \ + } + +#define SDC_QDSD_PINGROUP(pg_name, ctl, pull, drv) \ + { \ + .grp = PINCTRL_PINGROUP(#pg_name, \ + pg_name##_pins, \ + ARRAY_SIZE(pg_name##_pins)), \ + .ctl_reg = ctl, \ + .io_reg = 0, \ + .intr_cfg_reg = 0, \ + .intr_status_reg = 0, \ + .intr_target_reg = 0, \ + .mux_bit = -1, \ + .pull_bit = pull, \ + .drv_bit = drv, \ + .oe_bit = -1, \ + .in_bit = -1, \ + .out_bit = -1, \ + .intr_enable_bit = -1, \ + .intr_status_bit = -1, \ + .intr_target_bit = -1, \ + .intr_raw_status_bit = -1, \ + .intr_polarity_bit = -1, \ + .intr_detection_bit = -1, \ + .intr_detection_width = -1, \ + } + +#define UFS_RESET(pg_name, ctl, io) \ + { \ + .grp = PINCTRL_PINGROUP(#pg_name, \ + pg_name##_pins, \ + ARRAY_SIZE(pg_name##_pins)), \ + .ctl_reg = ctl, \ + .io_reg = io, \ + .intr_cfg_reg = 0, \ + .intr_status_reg = 0, \ + .intr_target_reg = 0, \ + .mux_bit = -1, \ + .pull_bit = 3, \ + .drv_bit = 0, \ + .oe_bit = -1, \ + .in_bit = -1, \ + .out_bit = 0, \ + .intr_enable_bit = -1, \ + .intr_status_bit = -1, \ + .intr_target_bit = -1, \ + .intr_raw_status_bit = -1, \ + .intr_polarity_bit = -1, \ + .intr_detection_bit = -1, \ + .intr_detection_width = -1, \ + } + +static const struct pinctrl_pin_desc glymur_pins[] = { + PINCTRL_PIN(0, "GPIO_0"), + PINCTRL_PIN(1, "GPIO_1"), + PINCTRL_PIN(2, "GPIO_2"), + PINCTRL_PIN(3, "GPIO_3"), + PINCTRL_PIN(4, "GPIO_4"), + PINCTRL_PIN(5, "GPIO_5"), + PINCTRL_PIN(6, "GPIO_6"), + PINCTRL_PIN(7, "GPIO_7"), + PINCTRL_PIN(8, "GPIO_8"), + PINCTRL_PIN(9, "GPIO_9"), + PINCTRL_PIN(10, "GPIO_10"), + PINCTRL_PIN(11, "GPIO_11"), + PINCTRL_PIN(12, "GPIO_12"), + PINCTRL_PIN(13, "GPIO_13"), + PINCTRL_PIN(14, "GPIO_14"), + PINCTRL_PIN(15, "GPIO_15"), + PINCTRL_PIN(16, "GPIO_16"), + PINCTRL_PIN(17, "GPIO_17"), + PINCTRL_PIN(18, "GPIO_18"), + PINCTRL_PIN(19, "GPIO_19"), + PINCTRL_PIN(20, "GPIO_20"), + PINCTRL_PIN(21, "GPIO_21"), + PINCTRL_PIN(22, "GPIO_22"), + PINCTRL_PIN(23, "GPIO_23"), + PINCTRL_PIN(24, "GPIO_24"), + PINCTRL_PIN(25, "GPIO_25"), + PINCTRL_PIN(26, "GPIO_26"), + PINCTRL_PIN(27, "GPIO_27"), + PINCTRL_PIN(28, "GPIO_28"), + PINCTRL_PIN(29, "GPIO_29"), + PINCTRL_PIN(30, "GPIO_30"), + PINCTRL_PIN(31, "GPIO_31"), + PINCTRL_PIN(32, "GPIO_32"), + PINCTRL_PIN(33, "GPIO_33"), + PINCTRL_PIN(34, "GPIO_34"), + PINCTRL_PIN(35, "GPIO_35"), + PINCTRL_PIN(36, "GPIO_36"), + PINCTRL_PIN(37, "GPIO_37"), + PINCTRL_PIN(38, "GPIO_38"), + PINCTRL_PIN(39, "GPIO_39"), 
+ PINCTRL_PIN(40, "GPIO_40"), + PINCTRL_PIN(41, "GPIO_41"), + PINCTRL_PIN(42, "GPIO_42"), + PINCTRL_PIN(43, "GPIO_43"), + PINCTRL_PIN(44, "GPIO_44"), + PINCTRL_PIN(45, "GPIO_45"), + PINCTRL_PIN(46, "GPIO_46"), + PINCTRL_PIN(47, "GPIO_47"), + PINCTRL_PIN(48, "GPIO_48"), + PINCTRL_PIN(49, "GPIO_49"), + PINCTRL_PIN(50, "GPIO_50"), + PINCTRL_PIN(51, "GPIO_51"), + PINCTRL_PIN(52, "GPIO_52"), + PINCTRL_PIN(53, "GPIO_53"), + PINCTRL_PIN(54, "GPIO_54"), + PINCTRL_PIN(55, "GPIO_55"), + PINCTRL_PIN(56, "GPIO_56"), + PINCTRL_PIN(57, "GPIO_57"), + PINCTRL_PIN(58, "GPIO_58"), + PINCTRL_PIN(59, "GPIO_59"), + PINCTRL_PIN(60, "GPIO_60"), + PINCTRL_PIN(61, "GPIO_61"), + PINCTRL_PIN(62, "GPIO_62"), + PINCTRL_PIN(63, "GPIO_63"), + PINCTRL_PIN(64, "GPIO_64"), + PINCTRL_PIN(65, "GPIO_65"), + PINCTRL_PIN(66, "GPIO_66"), + PINCTRL_PIN(67, "GPIO_67"), + PINCTRL_PIN(68, "GPIO_68"), + PINCTRL_PIN(69, "GPIO_69"), + PINCTRL_PIN(70, "GPIO_70"), + PINCTRL_PIN(71, "GPIO_71"), + PINCTRL_PIN(72, "GPIO_72"), + PINCTRL_PIN(73, "GPIO_73"), + PINCTRL_PIN(74, "GPIO_74"), + PINCTRL_PIN(75, "GPIO_75"), + PINCTRL_PIN(76, "GPIO_76"), + PINCTRL_PIN(77, "GPIO_77"), + PINCTRL_PIN(78, "GPIO_78"), + PINCTRL_PIN(79, "GPIO_79"), + PINCTRL_PIN(80, "GPIO_80"), + PINCTRL_PIN(81, "GPIO_81"), + PINCTRL_PIN(82, "GPIO_82"), + PINCTRL_PIN(83, "GPIO_83"), + PINCTRL_PIN(84, "GPIO_84"), + PINCTRL_PIN(85, "GPIO_85"), + PINCTRL_PIN(86, "GPIO_86"), + PINCTRL_PIN(87, "GPIO_87"), + PINCTRL_PIN(88, "GPIO_88"), + PINCTRL_PIN(89, "GPIO_89"), + PINCTRL_PIN(90, "GPIO_90"), + PINCTRL_PIN(91, "GPIO_91"), + PINCTRL_PIN(92, "GPIO_92"), + PINCTRL_PIN(93, "GPIO_93"), + PINCTRL_PIN(94, "GPIO_94"), + PINCTRL_PIN(95, "GPIO_95"), + PINCTRL_PIN(96, "GPIO_96"), + PINCTRL_PIN(97, "GPIO_97"), + PINCTRL_PIN(98, "GPIO_98"), + PINCTRL_PIN(99, "GPIO_99"), + PINCTRL_PIN(100, "GPIO_100"), + PINCTRL_PIN(101, "GPIO_101"), + PINCTRL_PIN(102, "GPIO_102"), + PINCTRL_PIN(103, "GPIO_103"), + PINCTRL_PIN(104, "GPIO_104"), + PINCTRL_PIN(105, "GPIO_105"), + PINCTRL_PIN(106, "GPIO_106"), + PINCTRL_PIN(107, "GPIO_107"), + PINCTRL_PIN(108, "GPIO_108"), + PINCTRL_PIN(109, "GPIO_109"), + PINCTRL_PIN(110, "GPIO_110"), + PINCTRL_PIN(111, "GPIO_111"), + PINCTRL_PIN(112, "GPIO_112"), + PINCTRL_PIN(113, "GPIO_113"), + PINCTRL_PIN(114, "GPIO_114"), + PINCTRL_PIN(115, "GPIO_115"), + PINCTRL_PIN(116, "GPIO_116"), + PINCTRL_PIN(117, "GPIO_117"), + PINCTRL_PIN(118, "GPIO_118"), + PINCTRL_PIN(119, "GPIO_119"), + PINCTRL_PIN(120, "GPIO_120"), + PINCTRL_PIN(121, "GPIO_121"), + PINCTRL_PIN(122, "GPIO_122"), + PINCTRL_PIN(123, "GPIO_123"), + PINCTRL_PIN(124, "GPIO_124"), + PINCTRL_PIN(125, "GPIO_125"), + PINCTRL_PIN(126, "GPIO_126"), + PINCTRL_PIN(127, "GPIO_127"), + PINCTRL_PIN(128, "GPIO_128"), + PINCTRL_PIN(129, "GPIO_129"), + PINCTRL_PIN(130, "GPIO_130"), + PINCTRL_PIN(131, "GPIO_131"), + PINCTRL_PIN(132, "GPIO_132"), + PINCTRL_PIN(133, "GPIO_133"), + PINCTRL_PIN(134, "GPIO_134"), + PINCTRL_PIN(135, "GPIO_135"), + PINCTRL_PIN(136, "GPIO_136"), + PINCTRL_PIN(137, "GPIO_137"), + PINCTRL_PIN(138, "GPIO_138"), + PINCTRL_PIN(139, "GPIO_139"), + PINCTRL_PIN(140, "GPIO_140"), + PINCTRL_PIN(141, "GPIO_141"), + PINCTRL_PIN(142, "GPIO_142"), + PINCTRL_PIN(143, "GPIO_143"), + PINCTRL_PIN(144, "GPIO_144"), + PINCTRL_PIN(145, "GPIO_145"), + PINCTRL_PIN(146, "GPIO_146"), + PINCTRL_PIN(147, "GPIO_147"), + PINCTRL_PIN(148, "GPIO_148"), + PINCTRL_PIN(149, "GPIO_149"), + PINCTRL_PIN(150, "GPIO_150"), + PINCTRL_PIN(151, "GPIO_151"), + PINCTRL_PIN(152, "GPIO_152"), + PINCTRL_PIN(153, "GPIO_153"), + PINCTRL_PIN(154, 
"GPIO_154"), + PINCTRL_PIN(155, "GPIO_155"), + PINCTRL_PIN(156, "GPIO_156"), + PINCTRL_PIN(157, "GPIO_157"), + PINCTRL_PIN(158, "GPIO_158"), + PINCTRL_PIN(159, "GPIO_159"), + PINCTRL_PIN(160, "GPIO_160"), + PINCTRL_PIN(161, "GPIO_161"), + PINCTRL_PIN(162, "GPIO_162"), + PINCTRL_PIN(163, "GPIO_163"), + PINCTRL_PIN(164, "GPIO_164"), + PINCTRL_PIN(165, "GPIO_165"), + PINCTRL_PIN(166, "GPIO_166"), + PINCTRL_PIN(167, "GPIO_167"), + PINCTRL_PIN(168, "GPIO_168"), + PINCTRL_PIN(169, "GPIO_169"), + PINCTRL_PIN(170, "GPIO_170"), + PINCTRL_PIN(171, "GPIO_171"), + PINCTRL_PIN(172, "GPIO_172"), + PINCTRL_PIN(173, "GPIO_173"), + PINCTRL_PIN(174, "GPIO_174"), + PINCTRL_PIN(175, "GPIO_175"), + PINCTRL_PIN(176, "GPIO_176"), + PINCTRL_PIN(177, "GPIO_177"), + PINCTRL_PIN(178, "GPIO_178"), + PINCTRL_PIN(179, "GPIO_179"), + PINCTRL_PIN(180, "GPIO_180"), + PINCTRL_PIN(181, "GPIO_181"), + PINCTRL_PIN(182, "GPIO_182"), + PINCTRL_PIN(183, "GPIO_183"), + PINCTRL_PIN(184, "GPIO_184"), + PINCTRL_PIN(185, "GPIO_185"), + PINCTRL_PIN(186, "GPIO_186"), + PINCTRL_PIN(187, "GPIO_187"), + PINCTRL_PIN(188, "GPIO_188"), + PINCTRL_PIN(189, "GPIO_189"), + PINCTRL_PIN(190, "GPIO_190"), + PINCTRL_PIN(191, "GPIO_191"), + PINCTRL_PIN(192, "GPIO_192"), + PINCTRL_PIN(193, "GPIO_193"), + PINCTRL_PIN(194, "GPIO_194"), + PINCTRL_PIN(195, "GPIO_195"), + PINCTRL_PIN(196, "GPIO_196"), + PINCTRL_PIN(197, "GPIO_197"), + PINCTRL_PIN(198, "GPIO_198"), + PINCTRL_PIN(199, "GPIO_199"), + PINCTRL_PIN(200, "GPIO_200"), + PINCTRL_PIN(201, "GPIO_201"), + PINCTRL_PIN(202, "GPIO_202"), + PINCTRL_PIN(203, "GPIO_203"), + PINCTRL_PIN(204, "GPIO_204"), + PINCTRL_PIN(205, "GPIO_205"), + PINCTRL_PIN(206, "GPIO_206"), + PINCTRL_PIN(207, "GPIO_207"), + PINCTRL_PIN(208, "GPIO_208"), + PINCTRL_PIN(209, "GPIO_209"), + PINCTRL_PIN(210, "GPIO_210"), + PINCTRL_PIN(211, "GPIO_211"), + PINCTRL_PIN(212, "GPIO_212"), + PINCTRL_PIN(213, "GPIO_213"), + PINCTRL_PIN(214, "GPIO_214"), + PINCTRL_PIN(215, "GPIO_215"), + PINCTRL_PIN(216, "GPIO_216"), + PINCTRL_PIN(217, "GPIO_217"), + PINCTRL_PIN(218, "GPIO_218"), + PINCTRL_PIN(219, "GPIO_219"), + PINCTRL_PIN(220, "GPIO_220"), + PINCTRL_PIN(221, "GPIO_221"), + PINCTRL_PIN(222, "GPIO_222"), + PINCTRL_PIN(223, "GPIO_223"), + PINCTRL_PIN(224, "GPIO_224"), + PINCTRL_PIN(225, "GPIO_225"), + PINCTRL_PIN(226, "GPIO_226"), + PINCTRL_PIN(227, "GPIO_227"), + PINCTRL_PIN(228, "GPIO_228"), + PINCTRL_PIN(229, "GPIO_229"), + PINCTRL_PIN(230, "GPIO_230"), + PINCTRL_PIN(231, "GPIO_231"), + PINCTRL_PIN(232, "GPIO_232"), + PINCTRL_PIN(233, "GPIO_233"), + PINCTRL_PIN(234, "GPIO_234"), + PINCTRL_PIN(235, "GPIO_235"), + PINCTRL_PIN(236, "GPIO_236"), + PINCTRL_PIN(237, "GPIO_237"), + PINCTRL_PIN(238, "GPIO_238"), + PINCTRL_PIN(239, "GPIO_239"), + PINCTRL_PIN(240, "GPIO_240"), + PINCTRL_PIN(241, "GPIO_241"), + PINCTRL_PIN(242, "GPIO_242"), + PINCTRL_PIN(243, "GPIO_243"), + PINCTRL_PIN(244, "GPIO_244"), + PINCTRL_PIN(245, "GPIO_245"), + PINCTRL_PIN(246, "GPIO_246"), + PINCTRL_PIN(247, "GPIO_247"), + PINCTRL_PIN(248, "GPIO_248"), + PINCTRL_PIN(249, "GPIO_249"), +}; + +#define DECLARE_MSM_GPIO_PINS(pin) \ + static const unsigned int gpio##pin##_pins[] = { pin } +DECLARE_MSM_GPIO_PINS(0); +DECLARE_MSM_GPIO_PINS(1); +DECLARE_MSM_GPIO_PINS(2); +DECLARE_MSM_GPIO_PINS(3); +DECLARE_MSM_GPIO_PINS(4); +DECLARE_MSM_GPIO_PINS(5); +DECLARE_MSM_GPIO_PINS(6); +DECLARE_MSM_GPIO_PINS(7); +DECLARE_MSM_GPIO_PINS(8); +DECLARE_MSM_GPIO_PINS(9); +DECLARE_MSM_GPIO_PINS(10); +DECLARE_MSM_GPIO_PINS(11); +DECLARE_MSM_GPIO_PINS(12); +DECLARE_MSM_GPIO_PINS(13); 
+DECLARE_MSM_GPIO_PINS(14); +DECLARE_MSM_GPIO_PINS(15); +DECLARE_MSM_GPIO_PINS(16); +DECLARE_MSM_GPIO_PINS(17); +DECLARE_MSM_GPIO_PINS(18); +DECLARE_MSM_GPIO_PINS(19); +DECLARE_MSM_GPIO_PINS(20); +DECLARE_MSM_GPIO_PINS(21); +DECLARE_MSM_GPIO_PINS(22); +DECLARE_MSM_GPIO_PINS(23); +DECLARE_MSM_GPIO_PINS(24); +DECLARE_MSM_GPIO_PINS(25); +DECLARE_MSM_GPIO_PINS(26); +DECLARE_MSM_GPIO_PINS(27); +DECLARE_MSM_GPIO_PINS(28); +DECLARE_MSM_GPIO_PINS(29); +DECLARE_MSM_GPIO_PINS(30); +DECLARE_MSM_GPIO_PINS(31); +DECLARE_MSM_GPIO_PINS(32); +DECLARE_MSM_GPIO_PINS(33); +DECLARE_MSM_GPIO_PINS(34); +DECLARE_MSM_GPIO_PINS(35); +DECLARE_MSM_GPIO_PINS(36); +DECLARE_MSM_GPIO_PINS(37); +DECLARE_MSM_GPIO_PINS(38); +DECLARE_MSM_GPIO_PINS(39); +DECLARE_MSM_GPIO_PINS(40); +DECLARE_MSM_GPIO_PINS(41); +DECLARE_MSM_GPIO_PINS(42); +DECLARE_MSM_GPIO_PINS(43); +DECLARE_MSM_GPIO_PINS(44); +DECLARE_MSM_GPIO_PINS(45); +DECLARE_MSM_GPIO_PINS(46); +DECLARE_MSM_GPIO_PINS(47); +DECLARE_MSM_GPIO_PINS(48); +DECLARE_MSM_GPIO_PINS(49); +DECLARE_MSM_GPIO_PINS(50); +DECLARE_MSM_GPIO_PINS(51); +DECLARE_MSM_GPIO_PINS(52); +DECLARE_MSM_GPIO_PINS(53); +DECLARE_MSM_GPIO_PINS(54); +DECLARE_MSM_GPIO_PINS(55); +DECLARE_MSM_GPIO_PINS(56); +DECLARE_MSM_GPIO_PINS(57); +DECLARE_MSM_GPIO_PINS(58); +DECLARE_MSM_GPIO_PINS(59); +DECLARE_MSM_GPIO_PINS(60); +DECLARE_MSM_GPIO_PINS(61); +DECLARE_MSM_GPIO_PINS(62); +DECLARE_MSM_GPIO_PINS(63); +DECLARE_MSM_GPIO_PINS(64); +DECLARE_MSM_GPIO_PINS(65); +DECLARE_MSM_GPIO_PINS(66); +DECLARE_MSM_GPIO_PINS(67); +DECLARE_MSM_GPIO_PINS(68); +DECLARE_MSM_GPIO_PINS(69); +DECLARE_MSM_GPIO_PINS(70); +DECLARE_MSM_GPIO_PINS(71); +DECLARE_MSM_GPIO_PINS(72); +DECLARE_MSM_GPIO_PINS(73); +DECLARE_MSM_GPIO_PINS(74); +DECLARE_MSM_GPIO_PINS(75); +DECLARE_MSM_GPIO_PINS(76); +DECLARE_MSM_GPIO_PINS(77); +DECLARE_MSM_GPIO_PINS(78); +DECLARE_MSM_GPIO_PINS(79); +DECLARE_MSM_GPIO_PINS(80); +DECLARE_MSM_GPIO_PINS(81); +DECLARE_MSM_GPIO_PINS(82); +DECLARE_MSM_GPIO_PINS(83); +DECLARE_MSM_GPIO_PINS(84); +DECLARE_MSM_GPIO_PINS(85); +DECLARE_MSM_GPIO_PINS(86); +DECLARE_MSM_GPIO_PINS(87); +DECLARE_MSM_GPIO_PINS(88); +DECLARE_MSM_GPIO_PINS(89); +DECLARE_MSM_GPIO_PINS(90); +DECLARE_MSM_GPIO_PINS(91); +DECLARE_MSM_GPIO_PINS(92); +DECLARE_MSM_GPIO_PINS(93); +DECLARE_MSM_GPIO_PINS(94); +DECLARE_MSM_GPIO_PINS(95); +DECLARE_MSM_GPIO_PINS(96); +DECLARE_MSM_GPIO_PINS(97); +DECLARE_MSM_GPIO_PINS(98); +DECLARE_MSM_GPIO_PINS(99); +DECLARE_MSM_GPIO_PINS(100); +DECLARE_MSM_GPIO_PINS(101); +DECLARE_MSM_GPIO_PINS(102); +DECLARE_MSM_GPIO_PINS(103); +DECLARE_MSM_GPIO_PINS(104); +DECLARE_MSM_GPIO_PINS(105); +DECLARE_MSM_GPIO_PINS(106); +DECLARE_MSM_GPIO_PINS(107); +DECLARE_MSM_GPIO_PINS(108); +DECLARE_MSM_GPIO_PINS(109); +DECLARE_MSM_GPIO_PINS(110); +DECLARE_MSM_GPIO_PINS(111); +DECLARE_MSM_GPIO_PINS(112); +DECLARE_MSM_GPIO_PINS(113); +DECLARE_MSM_GPIO_PINS(114); +DECLARE_MSM_GPIO_PINS(115); +DECLARE_MSM_GPIO_PINS(116); +DECLARE_MSM_GPIO_PINS(117); +DECLARE_MSM_GPIO_PINS(118); +DECLARE_MSM_GPIO_PINS(119); +DECLARE_MSM_GPIO_PINS(120); +DECLARE_MSM_GPIO_PINS(121); +DECLARE_MSM_GPIO_PINS(122); +DECLARE_MSM_GPIO_PINS(123); +DECLARE_MSM_GPIO_PINS(124); +DECLARE_MSM_GPIO_PINS(125); +DECLARE_MSM_GPIO_PINS(126); +DECLARE_MSM_GPIO_PINS(127); +DECLARE_MSM_GPIO_PINS(128); +DECLARE_MSM_GPIO_PINS(129); +DECLARE_MSM_GPIO_PINS(130); +DECLARE_MSM_GPIO_PINS(131); +DECLARE_MSM_GPIO_PINS(132); +DECLARE_MSM_GPIO_PINS(133); +DECLARE_MSM_GPIO_PINS(134); +DECLARE_MSM_GPIO_PINS(135); +DECLARE_MSM_GPIO_PINS(136); +DECLARE_MSM_GPIO_PINS(137); +DECLARE_MSM_GPIO_PINS(138); 
+DECLARE_MSM_GPIO_PINS(139); +DECLARE_MSM_GPIO_PINS(140); +DECLARE_MSM_GPIO_PINS(141); +DECLARE_MSM_GPIO_PINS(142); +DECLARE_MSM_GPIO_PINS(143); +DECLARE_MSM_GPIO_PINS(144); +DECLARE_MSM_GPIO_PINS(145); +DECLARE_MSM_GPIO_PINS(146); +DECLARE_MSM_GPIO_PINS(147); +DECLARE_MSM_GPIO_PINS(148); +DECLARE_MSM_GPIO_PINS(149); +DECLARE_MSM_GPIO_PINS(150); +DECLARE_MSM_GPIO_PINS(151); +DECLARE_MSM_GPIO_PINS(152); +DECLARE_MSM_GPIO_PINS(153); +DECLARE_MSM_GPIO_PINS(154); +DECLARE_MSM_GPIO_PINS(155); +DECLARE_MSM_GPIO_PINS(156); +DECLARE_MSM_GPIO_PINS(157); +DECLARE_MSM_GPIO_PINS(158); +DECLARE_MSM_GPIO_PINS(159); +DECLARE_MSM_GPIO_PINS(160); +DECLARE_MSM_GPIO_PINS(161); +DECLARE_MSM_GPIO_PINS(162); +DECLARE_MSM_GPIO_PINS(163); +DECLARE_MSM_GPIO_PINS(164); +DECLARE_MSM_GPIO_PINS(165); +DECLARE_MSM_GPIO_PINS(166); +DECLARE_MSM_GPIO_PINS(167); +DECLARE_MSM_GPIO_PINS(168); +DECLARE_MSM_GPIO_PINS(169); +DECLARE_MSM_GPIO_PINS(170); +DECLARE_MSM_GPIO_PINS(171); +DECLARE_MSM_GPIO_PINS(172); +DECLARE_MSM_GPIO_PINS(173); +DECLARE_MSM_GPIO_PINS(174); +DECLARE_MSM_GPIO_PINS(175); +DECLARE_MSM_GPIO_PINS(176); +DECLARE_MSM_GPIO_PINS(177); +DECLARE_MSM_GPIO_PINS(178); +DECLARE_MSM_GPIO_PINS(179); +DECLARE_MSM_GPIO_PINS(180); +DECLARE_MSM_GPIO_PINS(181); +DECLARE_MSM_GPIO_PINS(182); +DECLARE_MSM_GPIO_PINS(183); +DECLARE_MSM_GPIO_PINS(184); +DECLARE_MSM_GPIO_PINS(185); +DECLARE_MSM_GPIO_PINS(186); +DECLARE_MSM_GPIO_PINS(187); +DECLARE_MSM_GPIO_PINS(188); +DECLARE_MSM_GPIO_PINS(189); +DECLARE_MSM_GPIO_PINS(190); +DECLARE_MSM_GPIO_PINS(191); +DECLARE_MSM_GPIO_PINS(192); +DECLARE_MSM_GPIO_PINS(193); +DECLARE_MSM_GPIO_PINS(194); +DECLARE_MSM_GPIO_PINS(195); +DECLARE_MSM_GPIO_PINS(196); +DECLARE_MSM_GPIO_PINS(197); +DECLARE_MSM_GPIO_PINS(198); +DECLARE_MSM_GPIO_PINS(199); +DECLARE_MSM_GPIO_PINS(200); +DECLARE_MSM_GPIO_PINS(201); +DECLARE_MSM_GPIO_PINS(202); +DECLARE_MSM_GPIO_PINS(203); +DECLARE_MSM_GPIO_PINS(204); +DECLARE_MSM_GPIO_PINS(205); +DECLARE_MSM_GPIO_PINS(206); +DECLARE_MSM_GPIO_PINS(207); +DECLARE_MSM_GPIO_PINS(208); +DECLARE_MSM_GPIO_PINS(209); +DECLARE_MSM_GPIO_PINS(210); +DECLARE_MSM_GPIO_PINS(211); +DECLARE_MSM_GPIO_PINS(212); +DECLARE_MSM_GPIO_PINS(213); +DECLARE_MSM_GPIO_PINS(214); +DECLARE_MSM_GPIO_PINS(215); +DECLARE_MSM_GPIO_PINS(216); +DECLARE_MSM_GPIO_PINS(217); +DECLARE_MSM_GPIO_PINS(218); +DECLARE_MSM_GPIO_PINS(219); +DECLARE_MSM_GPIO_PINS(220); +DECLARE_MSM_GPIO_PINS(221); +DECLARE_MSM_GPIO_PINS(222); +DECLARE_MSM_GPIO_PINS(223); +DECLARE_MSM_GPIO_PINS(224); +DECLARE_MSM_GPIO_PINS(225); +DECLARE_MSM_GPIO_PINS(226); +DECLARE_MSM_GPIO_PINS(227); +DECLARE_MSM_GPIO_PINS(228); +DECLARE_MSM_GPIO_PINS(229); +DECLARE_MSM_GPIO_PINS(230); +DECLARE_MSM_GPIO_PINS(231); +DECLARE_MSM_GPIO_PINS(232); +DECLARE_MSM_GPIO_PINS(233); +DECLARE_MSM_GPIO_PINS(234); +DECLARE_MSM_GPIO_PINS(235); +DECLARE_MSM_GPIO_PINS(236); +DECLARE_MSM_GPIO_PINS(237); +DECLARE_MSM_GPIO_PINS(238); +DECLARE_MSM_GPIO_PINS(239); +DECLARE_MSM_GPIO_PINS(240); +DECLARE_MSM_GPIO_PINS(241); +DECLARE_MSM_GPIO_PINS(242); +DECLARE_MSM_GPIO_PINS(243); +DECLARE_MSM_GPIO_PINS(244); +DECLARE_MSM_GPIO_PINS(245); +DECLARE_MSM_GPIO_PINS(246); +DECLARE_MSM_GPIO_PINS(247); +DECLARE_MSM_GPIO_PINS(248); +DECLARE_MSM_GPIO_PINS(249); + +static const unsigned int ufs_reset_pins[] = { 250 }; +static const unsigned int sdc2_clk_pins[] = { 251 }; +static const unsigned int sdc2_cmd_pins[] = { 252 }; +static const unsigned int sdc2_data_pins[] = { 253 }; + +enum glymur_functions { + msm_mux_gpio, + msm_mux_resout_gpio_n, + msm_mux_aoss_cti, + msm_mux_asc_cci, + 
msm_mux_atest_char, + msm_mux_atest_usb, + msm_mux_audio_ext_mclk0, + msm_mux_audio_ext_mclk1, + msm_mux_audio_ref_clk, + msm_mux_cam_asc_mclk4, + msm_mux_cam_mclk, + msm_mux_cci_async_in, + msm_mux_cci_i2c_scl, + msm_mux_cci_i2c_sda, + msm_mux_cci_timer, + msm_mux_cmu_rng, + msm_mux_cri_trng, + msm_mux_dbg_out_clk, + msm_mux_ddr_bist_complete, + msm_mux_ddr_bist_fail, + msm_mux_ddr_bist_start, + msm_mux_ddr_bist_stop, + msm_mux_ddr_pxi, + msm_mux_edp0_hot, + msm_mux_edp0_lcd, + msm_mux_edp1_lcd, + msm_mux_egpio, + msm_mux_eusb_ac_en, + msm_mux_gcc_gp1, + msm_mux_gcc_gp2, + msm_mux_gcc_gp3, + msm_mux_host2wlan_sol, + msm_mux_i2c0_s_scl, + msm_mux_i2c0_s_sda, + msm_mux_i2s0_data, + msm_mux_i2s0_sck, + msm_mux_i2s0_ws, + msm_mux_i2s1_data, + msm_mux_i2s1_sck, + msm_mux_i2s1_ws, + msm_mux_ibi_i3c, + msm_mux_jitter_bist, + msm_mux_mdp_vsync_out, + msm_mux_mdp_vsync_e, + msm_mux_mdp_vsync_p, + msm_mux_mdp_vsync_s, + msm_mux_pcie3a_clk, + msm_mux_pcie3a_rst_n, + msm_mux_pcie3b_clk, + msm_mux_pcie4_clk_req_n, + msm_mux_pcie5_clk_req_n, + msm_mux_pcie6_clk_req_n, + msm_mux_phase_flag, + msm_mux_pll_bist_sync, + msm_mux_pll_clk_aux, + msm_mux_pmc_oca_n, + msm_mux_pmc_uva_n, + msm_mux_prng_rosc, + msm_mux_qdss_cti, + msm_mux_qdss_gpio, + msm_mux_qspi0, + msm_mux_qup0_se0, + msm_mux_qup0_se1, + msm_mux_qup0_se2, + msm_mux_qup0_se3, + msm_mux_qup0_se4, + msm_mux_qup0_se5, + msm_mux_qup0_se6, + msm_mux_qup0_se7, + msm_mux_qup1_se0, + msm_mux_qup1_se1, + msm_mux_qup1_se2, + msm_mux_qup1_se3, + msm_mux_qup1_se4, + msm_mux_qup1_se5, + msm_mux_qup1_se6, + msm_mux_qup1_se7, + msm_mux_qup2_se0, + msm_mux_qup2_se1, + msm_mux_qup2_se2, + msm_mux_qup2_se3, + msm_mux_qup2_se4, + msm_mux_qup2_se5, + msm_mux_qup2_se6, + msm_mux_qup2_se7, + msm_mux_qup3_se0, + msm_mux_qup3_se1, + msm_mux_sd_write_protect, + msm_mux_sdc4_clk, + msm_mux_sdc4_cmd, + msm_mux_sdc4_data, + msm_mux_smb_acok_n, + msm_mux_sys_throttle, + msm_mux_tb_trig_sdc2, + msm_mux_tb_trig_sdc4, + msm_mux_tmess_prng, + msm_mux_tsense_pwm, + msm_mux_tsense_therm, + msm_mux_usb0_dp, + msm_mux_usb0_phy_ps, + msm_mux_usb0_sbrx, + msm_mux_usb0_sbtx, + msm_mux_usb0_tmu, + msm_mux_usb1_dbg, + msm_mux_usb1_dp, + msm_mux_usb1_phy_ps, + msm_mux_usb1_sbrx, + msm_mux_usb1_sbtx, + msm_mux_usb1_tmu, + msm_mux_usb2_dp, + msm_mux_usb2_phy_ps, + msm_mux_usb2_sbrx, + msm_mux_usb2_sbtx, + msm_mux_usb2_tmu, + msm_mux_vsense_trigger_mirnat, + msm_mux_wcn_sw, + msm_mux_wcn_sw_ctrl, + msm_mux__, +}; + +static const char *const gpio_groups[] = { + "gpio0", "gpio1", "gpio2", "gpio3", "gpio4", "gpio5", + "gpio6", "gpio7", "gpio8", "gpio9", "gpio10", "gpio11", + "gpio12", "gpio13", "gpio14", "gpio15", "gpio16", "gpio17", + "gpio18", "gpio19", "gpio20", "gpio21", "gpio22", "gpio23", + "gpio24", "gpio25", "gpio26", "gpio27", "gpio28", "gpio29", + "gpio30", "gpio31", "gpio32", "gpio33", "gpio34", "gpio35", + "gpio36", "gpio37", "gpio38", "gpio39", "gpio40", "gpio41", + "gpio42", "gpio43", "gpio44", "gpio45", "gpio46", "gpio47", + "gpio48", "gpio49", "gpio50", "gpio51", "gpio52", "gpio53", + "gpio54", "gpio55", "gpio56", "gpio57", "gpio58", "gpio59", + "gpio60", "gpio61", "gpio62", "gpio63", "gpio64", "gpio65", + "gpio66", "gpio67", "gpio68", "gpio69", "gpio70", "gpio71", + "gpio72", "gpio73", "gpio74", "gpio75", "gpio76", "gpio77", + "gpio78", "gpio79", "gpio80", "gpio81", "gpio82", "gpio83", + "gpio84", "gpio85", "gpio86", "gpio87", "gpio88", "gpio89", + "gpio90", "gpio91", "gpio92", "gpio93", "gpio94", "gpio95", + "gpio96", "gpio97", "gpio98", "gpio99", "gpio100", "gpio101", + 
"gpio102", "gpio103", "gpio104", "gpio105", "gpio106", "gpio107", + "gpio108", "gpio109", "gpio110", "gpio111", "gpio112", "gpio113", + "gpio114", "gpio115", "gpio116", "gpio117", "gpio118", "gpio119", + "gpio120", "gpio121", "gpio122", "gpio123", "gpio124", "gpio125", + "gpio126", "gpio127", "gpio128", "gpio129", "gpio130", "gpio131", + "gpio132", "gpio133", "gpio134", "gpio135", "gpio136", "gpio137", + "gpio138", "gpio139", "gpio140", "gpio141", "gpio142", "gpio143", + "gpio144", "gpio145", "gpio146", "gpio147", "gpio148", "gpio149", + "gpio150", "gpio151", "gpio152", "gpio153", "gpio154", "gpio155", + "gpio156", "gpio157", "gpio158", "gpio159", "gpio160", "gpio161", + "gpio162", "gpio163", "gpio164", "gpio165", "gpio166", "gpio167", + "gpio168", "gpio169", "gpio170", "gpio171", "gpio172", "gpio173", + "gpio174", "gpio175", "gpio176", "gpio177", "gpio178", "gpio179", + "gpio180", "gpio181", "gpio182", "gpio183", "gpio184", "gpio185", + "gpio186", "gpio187", "gpio188", "gpio189", "gpio190", "gpio191", + "gpio192", "gpio193", "gpio194", "gpio195", "gpio196", "gpio197", + "gpio198", "gpio199", "gpio200", "gpio201", "gpio202", "gpio203", + "gpio204", "gpio205", "gpio206", "gpio207", "gpio208", "gpio209", + "gpio210", "gpio211", "gpio212", "gpio213", "gpio214", "gpio215", + "gpio216", "gpio217", "gpio218", "gpio219", "gpio220", "gpio221", + "gpio222", "gpio223", "gpio224", "gpio225", "gpio226", "gpio227", + "gpio228", "gpio229", "gpio230", "gpio231", "gpio232", "gpio233", + "gpio234", "gpio235", "gpio236", "gpio237", "gpio238", "gpio239", + "gpio240", "gpio241", "gpio242", "gpio243", "gpio244", "gpio245", + "gpio246", "gpio247", "gpio248", "gpio249", +}; + +static const char *const resout_gpio_n_groups[] = { + "gpio160", +}; + +static const char *const aoss_cti_groups[] = { + "gpio60", + "gpio61", + "gpio62", + "gpio63", +}; + +static const char *const asc_cci_groups[] = { + "gpio235", + "gpio236", +}; + +static const char *const atest_char_groups[] = { + "gpio172", "gpio184", "gpio188", "gpio164", + "gpio163", +}; + +static const char *const atest_usb_groups[] = { + "gpio39", "gpio40", "gpio41", "gpio38", + "gpio44", "gpio45", "gpio42", "gpio43", + "gpio49", "gpio50", "gpio51", "gpio48", + "gpio54", "gpio55", "gpio52", "gpio53", + "gpio65", "gpio66", "gpio46", "gpio47", + "gpio72", "gpio73", "gpio80", "gpio81", +}; + +static const char *const audio_ext_mclk0_groups[] = { + "gpio134", +}; + +static const char *const audio_ext_mclk1_groups[] = { + "gpio142", +}; + +static const char *const audio_ref_clk_groups[] = { + "gpio142", +}; + +static const char *const cam_asc_mclk4_groups[] = { + "gpio100", +}; + +static const char *const cam_mclk_groups[] = { + "gpio96", + "gpio97", + "gpio98", + "gpio99", +}; + +static const char *const cci_async_in_groups[] = { + "gpio113", "gpio112", "gpio111", +}; + +static const char *const cci_i2c_scl_groups[] = { + "gpio102", "gpio104", "gpio106", +}; + +static const char *const cci_i2c_sda_groups[] = { + "gpio101", "gpio103", "gpio105", +}; + +static const char *const cci_timer_groups[] = { + "gpio109", "gpio110", "gpio111", "gpio112", + "gpio113", +}; + +static const char *const cmu_rng_groups[] = { + "gpio48", "gpio47", "gpio46", "gpio45", +}; + +static const char *const cri_trng_groups[] = { + "gpio173", +}; + +static const char *const dbg_out_clk_groups[] = { + "gpio51", +}; + +static const char *const ddr_bist_complete_groups[] = { + "gpio57", +}; + +static const char *const ddr_bist_fail_groups[] = { + "gpio56", +}; + +static const char *const 
ddr_bist_start_groups[] = { + "gpio54", +}; + +static const char *const ddr_bist_stop_groups[] = { + "gpio55", +}; + +static const char *const ddr_pxi_groups[] = { + "gpio38", "gpio39", "gpio40", "gpio41", + "gpio72", "gpio73", "gpio80", "gpio81", + "gpio42", "gpio43", "gpio44", "gpio45", + "gpio46", "gpio47", "gpio48", "gpio49", + "gpio50", "gpio51", "gpio52", "gpio53", + "gpio54", "gpio55", "gpio65", "gpio66", +}; + +static const char *const edp0_hot_groups[] = { + "gpio119", +}; + +static const char *const edp0_lcd_groups[] = { + "gpio120", +}; + +static const char *const edp1_lcd_groups[] = { + "gpio115", + "gpio119", +}; + +static const char *const egpio_groups[] = { + "gpio192", "gpio193", "gpio194", "gpio195", "gpio196", "gpio197", + "gpio198", "gpio199", "gpio200", "gpio201", "gpio202", "gpio203", + "gpio204", "gpio205", "gpio206", "gpio207", "gpio208", "gpio209", + "gpio210", "gpio211", "gpio212", "gpio213", "gpio214", "gpio215", + "gpio216", "gpio217", "gpio218", "gpio219", "gpio220", "gpio221", + "gpio222", "gpio223", "gpio224", "gpio225", "gpio226", "gpio227", + "gpio228", "gpio229", "gpio230", "gpio231", "gpio232", "gpio233", + "gpio234", "gpio235", "gpio236", "gpio237", "gpio238", "gpio239", + "gpio240", "gpio241", "gpio242", "gpio243", "gpio244", +}; + +static const char *const eusb_ac_en_groups[] = { + "gpio168", "gpio177", "gpio186", "gpio69", + "gpio187", "gpio178", +}; + +static const char *const gcc_gp1_groups[] = { + "gpio71", + "gpio72", +}; + +static const char *const gcc_gp2_groups[] = { + "gpio64", + "gpio73", +}; + +static const char *const gcc_gp3_groups[] = { + "gpio74", + "gpio82", +}; + +static const char *const host2wlan_sol_groups[] = { + "gpio118", +}; + +static const char *const i2c0_s_scl_groups[] = { + "gpio7", +}; + +static const char *const i2c0_s_sda_groups[] = { + "gpio6", +}; + +static const char *const i2s0_data_groups[] = { + "gpio136", "gpio137", +}; + +static const char *const i2s0_sck_groups[] = { + "gpio135", +}; + +static const char *const i2s0_ws_groups[] = { + "gpio138", +}; + +static const char *const i2s1_data_groups[] = { + "gpio140", "gpio142", +}; + +static const char *const i2s1_sck_groups[] = { + "gpio139", +}; + +static const char *const i2s1_ws_groups[] = { + "gpio141", +}; + +static const char *const ibi_i3c_groups[] = { + "gpio0", "gpio1", "gpio4", "gpio5", "gpio32", "gpio33", + "gpio36", "gpio37", "gpio64", "gpio65", "gpio68", "gpio69", +}; + +static const char *const jitter_bist_groups[] = { + "gpio52", +}; + +static const char *const mdp_vsync_out_groups[] = { + "gpio114", "gpio114", "gpio115", "gpio115", + "gpio109", "gpio110", "gpio111", "gpio112", + "gpio113", +}; + +static const char *const mdp_vsync_e_groups[] = { + "gpio106", +}; + +static const char *const mdp_vsync_p_groups[] = { + "gpio98", +}; + +static const char *const mdp_vsync_s_groups[] = { + "gpio105", +}; + +static const char *const pcie3a_clk_groups[] = { + "gpio144", +}; + +static const char *const pcie3a_rst_n_groups[] = { + "gpio143", +}; + +static const char *const pcie3b_clk_groups[] = { + "gpio156", +}; + +static const char *const pcie4_clk_req_n_groups[] = { + "gpio147", +}; + +static const char *const pcie5_clk_req_n_groups[] = { + "gpio153", +}; + +static const char *const pcie6_clk_req_n_groups[] = { + "gpio150", +}; + +static const char *const phase_flag_groups[] = { + "gpio6", "gpio7", "gpio16", "gpio17", + "gpio18", "gpio19", "gpio20", "gpio21", + "gpio22", "gpio23", "gpio24", "gpio25", + "gpio8", "gpio26", "gpio27", "gpio163", + "gpio164", 
"gpio188", "gpio184", "gpio172", + "gpio186", "gpio173", "gpio76", "gpio9", + "gpio77", "gpio78", "gpio10", "gpio11", + "gpio12", "gpio13", "gpio14", "gpio15", +}; + +static const char *const pll_bist_sync_groups[] = { + "gpio28", +}; + +static const char *const pll_clk_aux_groups[] = { + "gpio35", +}; + +static const char *const pmc_oca_n_groups[] = { + "gpio249", +}; + +static const char *const pmc_uva_n_groups[] = { + "gpio248", +}; + +static const char *const prng_rosc_groups[] = { + "gpio186", "gpio188", "gpio164", "gpio163", +}; + +static const char *const qdss_cti_groups[] = { + "gpio18", "gpio19", "gpio23", "gpio27", + "gpio161", "gpio162", "gpio215", "gpio217", +}; + +static const char *const qdss_gpio_groups[] = { + "gpio104", "gpio151", "gpio227", "gpio228", + "gpio96", "gpio219", "gpio97", "gpio220", + "gpio108", "gpio231", "gpio109", "gpio232", + "gpio110", "gpio233", "gpio111", "gpio234", + "gpio112", "gpio235", "gpio113", "gpio236", + "gpio149", "gpio221", "gpio99", "gpio222", + "gpio100", "gpio223", "gpio101", "gpio224", + "gpio102", "gpio225", "gpio103", "gpio226", + "gpio152", "gpio237", "gpio107", "gpio238", +}; + +static const char *const qspi0_groups[] = { + "gpio127", "gpio132", "gpio133", "gpio128", + "gpio129", "gpio130", "gpio131", +}; + +static const char *const qup0_se0_groups[] = { + "gpio0", "gpio1", "gpio2", "gpio3", +}; + +static const char *const qup0_se1_groups[] = { + "gpio4", "gpio5", "gpio6", "gpio7", +}; + +static const char *const qup0_se2_groups[] = { + "gpio8", "gpio9", "gpio10", "gpio11", + "gpio17", "gpio18", "gpio19", +}; + +static const char *const qup0_se3_groups[] = { + "gpio12", "gpio13", "gpio14", "gpio15", + "gpio21", "gpio22", "gpio23", +}; + +static const char *const qup0_se4_groups[] = { + "gpio16", "gpio17", "gpio18", "gpio19", +}; + +static const char *const qup0_se5_groups[] = { + "gpio20", "gpio21", "gpio22", "gpio23", +}; + +static const char *const qup0_se6_groups[] = { + "gpio6", "gpio7", "gpio4", "gpio5", +}; + +static const char *const qup0_se7_groups[] = { + "gpio14", "gpio15", "gpio12", "gpio13", +}; + +static const char *const qup1_se0_groups[] = { + "gpio32", "gpio33", "gpio34", "gpio35", +}; + +static const char *const qup1_se1_groups[] = { + "gpio36", "gpio37", "gpio38", "gpio39", +}; + +static const char *const qup1_se2_groups[] = { + "gpio40", "gpio41", "gpio42", "gpio43", + "gpio49", "gpio50", "gpio51", +}; + +static const char *const qup1_se3_groups[] = { + "gpio44", "gpio45", "gpio46", "gpio47", + "gpio33", "gpio34", "gpio35", +}; + +static const char *const qup1_se4_groups[] = { + "gpio48", "gpio49", "gpio50", "gpio51", +}; + +static const char *const qup1_se5_groups[] = { + "gpio52", "gpio53", "gpio54", "gpio55", +}; + +static const char *const qup1_se6_groups[] = { + "gpio56", "gpio57", "gpio58", "gpio59", +}; + +static const char *const qup1_se7_groups[] = { + "gpio54", "gpio55", "gpio52", "gpio53", +}; + +static const char *const qup2_se0_groups[] = { + "gpio64", "gpio65", "gpio66", "gpio67", +}; + +static const char *const qup2_se1_groups[] = { + "gpio68", "gpio69", "gpio70", "gpio71", +}; + +static const char *const qup2_se2_groups[] = { + "gpio72", "gpio73", "gpio74", "gpio75", + "gpio81", "gpio82", "gpio83", +}; + +static const char *const qup2_se3_groups[] = { + "gpio76", "gpio77", "gpio78", "gpio79", + "gpio65", "gpio66", "gpio67", +}; + +static const char *const qup2_se4_groups[] = { + "gpio80", "gpio81", "gpio82", "gpio83", +}; + +static const char *const qup2_se5_groups[] = { + "gpio84", "gpio85", 
"gpio86", "gpio87", +}; + +static const char *const qup2_se6_groups[] = { + "gpio88", "gpio89", "gpio90", "gpio91", +}; + +static const char *const qup2_se7_groups[] = { + "gpio80", "gpio81", "gpio82", "gpio83", +}; + +static const char *const qup3_se0_groups[] = { + "gpio128", "gpio129", "gpio127", "gpio132", + "gpio130", "gpio131", "gpio133", "gpio247", +}; + +static const char *const qup3_se1_groups[] = { + "gpio40", "gpio41", "gpio42", "gpio43", + "gpio49", "gpio50", "gpio51", "gpio48", +}; + +static const char *const sd_write_protect_groups[] = { + "gpio162", +}; + +static const char *const sdc4_clk_groups[] = { + "gpio127", +}; + +static const char *const sdc4_cmd_groups[] = { + "gpio132", +}; + +static const char *const sdc4_data_groups[] = { + "gpio128", + "gpio129", + "gpio130", + "gpio131", +}; + +static const char *const smb_acok_n_groups[] = { + "gpio245", +}; + +static const char *const sys_throttle_groups[] = { + "gpio39", + "gpio94", +}; + +static const char *const tb_trig_sdc2_groups[] = { + "gpio137", +}; + +static const char *const tb_trig_sdc4_groups[] = { + "gpio133", +}; + +static const char *const tmess_prng_groups[] = { + "gpio92", "gpio93", "gpio94", "gpio95", +}; + +static const char *const tsense_pwm_groups[] = { + "gpio28", "gpio29", "gpio30", "gpio31", + "gpio34", "gpio138", "gpio139", "gpio140", +}; + +static const char *const tsense_therm_groups[] = { + "gpio141", +}; + +static const char *const usb0_dp_groups[] = { + "gpio122", +}; + +static const char *const usb0_phy_ps_groups[] = { + "gpio121", +}; + +static const char *const usb0_sbrx_groups[] = { + "gpio163", +}; + +static const char *const usb0_sbtx_groups[] = { + "gpio164", + "gpio165", +}; + +static const char *const usb0_tmu_groups[] = { + "gpio98", +}; + +static const char *const usb1_dbg_groups[] = { + "gpio105", + "gpio106", +}; + +static const char *const usb1_dp_groups[] = { + "gpio124", +}; + +static const char *const usb1_phy_ps_groups[] = { + "gpio123", +}; + +static const char *const usb1_sbrx_groups[] = { + "gpio172", +}; + +static const char *const usb1_sbtx_groups[] = { + "gpio173", + "gpio174", +}; + +static const char *const usb1_tmu_groups[] = { + "gpio98", +}; + +static const char *const usb2_dp_groups[] = { + "gpio126", +}; + +static const char *const usb2_phy_ps_groups[] = { + "gpio125", +}; + +static const char *const usb2_sbrx_groups[] = { + "gpio181", +}; + +static const char *const usb2_sbtx_groups[] = { + "gpio182", + "gpio183", +}; + +static const char *const usb2_tmu_groups[] = { + "gpio98", +}; + +static const char *const vsense_trigger_mirnat_groups[] = { + "gpio38", +}; + +static const char *const wcn_sw_groups[] = { + "gpio221", +}; + +static const char *const wcn_sw_ctrl_groups[] = { + "gpio214", +}; + +static const struct pinfunction glymur_functions[] = { + MSM_PIN_FUNCTION(gpio), + MSM_PIN_FUNCTION(resout_gpio_n), + MSM_PIN_FUNCTION(aoss_cti), + MSM_PIN_FUNCTION(asc_cci), + MSM_PIN_FUNCTION(atest_char), + MSM_PIN_FUNCTION(atest_usb), + MSM_PIN_FUNCTION(audio_ext_mclk0), + MSM_PIN_FUNCTION(audio_ext_mclk1), + MSM_PIN_FUNCTION(audio_ref_clk), + MSM_PIN_FUNCTION(cam_asc_mclk4), + MSM_PIN_FUNCTION(cam_mclk), + MSM_PIN_FUNCTION(cci_async_in), + MSM_PIN_FUNCTION(cci_i2c_scl), + MSM_PIN_FUNCTION(cci_i2c_sda), + MSM_PIN_FUNCTION(cci_timer), + MSM_PIN_FUNCTION(cmu_rng), + MSM_PIN_FUNCTION(cri_trng), + MSM_PIN_FUNCTION(dbg_out_clk), + MSM_PIN_FUNCTION(ddr_bist_complete), + MSM_PIN_FUNCTION(ddr_bist_fail), + MSM_PIN_FUNCTION(ddr_bist_start), + MSM_PIN_FUNCTION(ddr_bist_stop), + 
MSM_PIN_FUNCTION(ddr_pxi), + MSM_PIN_FUNCTION(edp0_hot), + MSM_PIN_FUNCTION(edp0_lcd), + MSM_PIN_FUNCTION(edp1_lcd), + MSM_PIN_FUNCTION(egpio), + MSM_PIN_FUNCTION(eusb_ac_en), + MSM_PIN_FUNCTION(gcc_gp1), + MSM_PIN_FUNCTION(gcc_gp2), + MSM_PIN_FUNCTION(gcc_gp3), + MSM_PIN_FUNCTION(host2wlan_sol), + MSM_PIN_FUNCTION(i2c0_s_scl), + MSM_PIN_FUNCTION(i2c0_s_sda), + MSM_PIN_FUNCTION(i2s0_data), + MSM_PIN_FUNCTION(i2s0_sck), + MSM_PIN_FUNCTION(i2s0_ws), + MSM_PIN_FUNCTION(i2s1_data), + MSM_PIN_FUNCTION(i2s1_sck), + MSM_PIN_FUNCTION(i2s1_ws), + MSM_PIN_FUNCTION(ibi_i3c), + MSM_PIN_FUNCTION(jitter_bist), + MSM_PIN_FUNCTION(mdp_vsync_out), + MSM_PIN_FUNCTION(mdp_vsync_e), + MSM_PIN_FUNCTION(mdp_vsync_p), + MSM_PIN_FUNCTION(mdp_vsync_s), + MSM_PIN_FUNCTION(pcie3a_clk), + MSM_PIN_FUNCTION(pcie3a_rst_n), + MSM_PIN_FUNCTION(pcie3b_clk), + MSM_PIN_FUNCTION(pcie4_clk_req_n), + MSM_PIN_FUNCTION(pcie5_clk_req_n), + MSM_PIN_FUNCTION(pcie6_clk_req_n), + MSM_PIN_FUNCTION(phase_flag), + MSM_PIN_FUNCTION(pll_bist_sync), + MSM_PIN_FUNCTION(pll_clk_aux), + MSM_PIN_FUNCTION(pmc_oca_n), + MSM_PIN_FUNCTION(pmc_uva_n), + MSM_PIN_FUNCTION(prng_rosc), + MSM_PIN_FUNCTION(qdss_cti), + MSM_PIN_FUNCTION(qdss_gpio), + MSM_PIN_FUNCTION(qspi0), + MSM_PIN_FUNCTION(qup0_se0), + MSM_PIN_FUNCTION(qup0_se1), + MSM_PIN_FUNCTION(qup0_se2), + MSM_PIN_FUNCTION(qup0_se3), + MSM_PIN_FUNCTION(qup0_se4), + MSM_PIN_FUNCTION(qup0_se5), + MSM_PIN_FUNCTION(qup0_se6), + MSM_PIN_FUNCTION(qup0_se7), + MSM_PIN_FUNCTION(qup1_se0), + MSM_PIN_FUNCTION(qup1_se1), + MSM_PIN_FUNCTION(qup1_se2), + MSM_PIN_FUNCTION(qup1_se3), + MSM_PIN_FUNCTION(qup1_se4), + MSM_PIN_FUNCTION(qup1_se5), + MSM_PIN_FUNCTION(qup1_se6), + MSM_PIN_FUNCTION(qup1_se7), + MSM_PIN_FUNCTION(qup2_se0), + MSM_PIN_FUNCTION(qup2_se1), + MSM_PIN_FUNCTION(qup2_se2), + MSM_PIN_FUNCTION(qup2_se3), + MSM_PIN_FUNCTION(qup2_se4), + MSM_PIN_FUNCTION(qup2_se5), + MSM_PIN_FUNCTION(qup2_se6), + MSM_PIN_FUNCTION(qup2_se7), + MSM_PIN_FUNCTION(qup3_se0), + MSM_PIN_FUNCTION(qup3_se1), + MSM_PIN_FUNCTION(sd_write_protect), + MSM_PIN_FUNCTION(sdc4_clk), + MSM_PIN_FUNCTION(sdc4_cmd), + MSM_PIN_FUNCTION(sdc4_data), + MSM_PIN_FUNCTION(smb_acok_n), + MSM_PIN_FUNCTION(sys_throttle), + MSM_PIN_FUNCTION(tb_trig_sdc2), + MSM_PIN_FUNCTION(tb_trig_sdc4), + MSM_PIN_FUNCTION(tmess_prng), + MSM_PIN_FUNCTION(tsense_pwm), + MSM_PIN_FUNCTION(tsense_therm), + MSM_PIN_FUNCTION(usb0_dp), + MSM_PIN_FUNCTION(usb0_phy_ps), + MSM_PIN_FUNCTION(usb0_sbrx), + MSM_PIN_FUNCTION(usb0_sbtx), + MSM_PIN_FUNCTION(usb0_tmu), + MSM_PIN_FUNCTION(usb1_dbg), + MSM_PIN_FUNCTION(usb1_dp), + MSM_PIN_FUNCTION(usb1_phy_ps), + MSM_PIN_FUNCTION(usb1_sbrx), + MSM_PIN_FUNCTION(usb1_sbtx), + MSM_PIN_FUNCTION(usb1_tmu), + MSM_PIN_FUNCTION(usb2_dp), + MSM_PIN_FUNCTION(usb2_phy_ps), + MSM_PIN_FUNCTION(usb2_sbrx), + MSM_PIN_FUNCTION(usb2_sbtx), + MSM_PIN_FUNCTION(usb2_tmu), + MSM_PIN_FUNCTION(vsense_trigger_mirnat), + MSM_PIN_FUNCTION(wcn_sw), + MSM_PIN_FUNCTION(wcn_sw_ctrl), +}; + +static const struct msm_pingroup glymur_groups[] = { + [0] = PINGROUP(0, qup0_se0, ibi_i3c, _, _, _, _, _, _, _, _, _), + [1] = PINGROUP(1, qup0_se0, ibi_i3c, _, _, _, _, _, _, _, _, _), + [2] = PINGROUP(2, qup0_se0, _, _, _, _, _, _, _, _, _, _), + [3] = PINGROUP(3, qup0_se0, _, _, _, _, _, _, _, _, _, _), + [4] = PINGROUP(4, qup0_se1, qup0_se6, ibi_i3c, _, _, _, _, _, _, _, _), + [5] = PINGROUP(5, qup0_se1, qup0_se6, ibi_i3c, _, _, _, _, _, _, _, _), + [6] = PINGROUP(6, qup0_se1, qup0_se6, i2c0_s_sda, phase_flag, _, _, _, _, _, _, _), + [7] = PINGROUP(7, qup0_se1, qup0_se6, 
i2c0_s_scl, phase_flag, _, _, _, _, _, _, _), + [8] = PINGROUP(8, qup0_se2, phase_flag, _, _, _, _, _, _, _, _, _), + [9] = PINGROUP(9, qup0_se2, phase_flag, _, _, _, _, _, _, _, _, _), + [10] = PINGROUP(10, qup0_se2, phase_flag, _, _, _, _, _, _, _, _, _), + [11] = PINGROUP(11, qup0_se2, phase_flag, _, _, _, _, _, _, _, _, _), + [12] = PINGROUP(12, qup0_se3, qup0_se7, phase_flag, _, _, _, _, _, _, _, _), + [13] = PINGROUP(13, qup0_se3, qup0_se7, phase_flag, _, _, _, _, _, _, _, _), + [14] = PINGROUP(14, qup0_se3, qup0_se7, phase_flag, _, _, _, _, _, _, _, _), + [15] = PINGROUP(15, qup0_se3, qup0_se7, phase_flag, _, _, _, _, _, _, _, _), + [16] = PINGROUP(16, qup0_se4, phase_flag, _, _, _, _, _, _, _, _, _), + [17] = PINGROUP(17, qup0_se4, qup0_se2, phase_flag, _, _, _, _, _, _, _, _), + [18] = PINGROUP(18, qup0_se4, qup0_se2, phase_flag, _, qdss_cti, _, _, _, _, _, _), + [19] = PINGROUP(19, qup0_se4, qup0_se2, phase_flag, _, qdss_cti, _, _, _, _, _, _), + [20] = PINGROUP(20, qup0_se5, _, phase_flag, _, _, _, _, _, _, _, _), + [21] = PINGROUP(21, qup0_se5, qup0_se3, _, phase_flag, _, _, _, _, _, _, _), + [22] = PINGROUP(22, qup0_se5, qup0_se3, _, phase_flag, _, _, _, _, _, _, _), + [23] = PINGROUP(23, qup0_se5, qup0_se3, phase_flag, _, qdss_cti, _, _, _, _, _, _), + [24] = PINGROUP(24, phase_flag, _, _, _, _, _, _, _, _, _, _), + [25] = PINGROUP(25, phase_flag, _, _, _, _, _, _, _, _, _, _), + [26] = PINGROUP(26, phase_flag, _, _, _, _, _, _, _, _, _, _), + [27] = PINGROUP(27, phase_flag, _, qdss_cti, _, _, _, _, _, _, _, _), + [28] = PINGROUP(28, pll_bist_sync, tsense_pwm, _, _, _, _, _, _, _, _, _), + [29] = PINGROUP(29, tsense_pwm, _, _, _, _, _, _, _, _, _, _), + [30] = PINGROUP(30, tsense_pwm, _, _, _, _, _, _, _, _, _, _), + [31] = PINGROUP(31, tsense_pwm, _, _, _, _, _, _, _, _, _, _), + [32] = PINGROUP(32, qup1_se0, ibi_i3c, _, _, _, _, _, _, _, _, _), + [33] = PINGROUP(33, qup1_se0, ibi_i3c, qup1_se3, _, _, _, _, _, _, _, _), + [34] = PINGROUP(34, qup1_se0, qup1_se3, tsense_pwm, _, _, _, _, _, _, _, _), + [35] = PINGROUP(35, qup1_se0, qup1_se3, pll_clk_aux, _, _, _, _, _, _, _, _), + [36] = PINGROUP(36, qup1_se1, ibi_i3c, _, _, _, _, _, _, _, _, _), + [37] = PINGROUP(37, qup1_se1, ibi_i3c, _, _, _, _, _, _, _, _, _), + [38] = PINGROUP(38, qup1_se1, atest_usb, ddr_pxi, vsense_trigger_mirnat, _, _, _, _, + _, _, _), + [39] = PINGROUP(39, qup1_se1, sys_throttle, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [40] = PINGROUP(40, qup1_se2, qup3_se1, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [41] = PINGROUP(41, qup1_se2, qup3_se1, qup3_se0, atest_usb, ddr_pxi, _, _, _, _, + _, _), + [42] = PINGROUP(42, qup1_se2, qup3_se1, qup0_se1, atest_usb, ddr_pxi, _, _, _, _, + _, _), + [43] = PINGROUP(43, qup1_se2, qup3_se1, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [44] = PINGROUP(44, qup1_se3, _, atest_usb, ddr_pxi, _, _, _, _, _, _, _), + [45] = PINGROUP(45, qup1_se3, cmu_rng, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [46] = PINGROUP(46, qup1_se3, cmu_rng, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [47] = PINGROUP(47, qup1_se3, cmu_rng, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [48] = PINGROUP(48, qup1_se4, qup3_se1, cmu_rng, _, atest_usb, ddr_pxi, _, _, _, + _, _), + [49] = PINGROUP(49, qup1_se4, qup1_se2, qup3_se1, _, atest_usb, ddr_pxi, _, _, + _, _, _), + [50] = PINGROUP(50, qup1_se4, qup1_se2, qup3_se1, _, atest_usb, ddr_pxi, _, _, + _, _, _), + [51] = PINGROUP(51, qup1_se4, qup1_se2, qup3_se1, dbg_out_clk, atest_usb, + ddr_pxi, _, _, _, _, _), + [52] = PINGROUP(52, qup1_se5, 
qup1_se7, jitter_bist, atest_usb, ddr_pxi, _, _, _, + _, _, _), + [53] = PINGROUP(53, qup1_se5, qup1_se7, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [54] = PINGROUP(54, qup1_se5, qup1_se7, ddr_bist_start, atest_usb, ddr_pxi, _, _, + _, _, _, _), + [55] = PINGROUP(55, qup1_se5, qup1_se7, ddr_bist_stop, atest_usb, ddr_pxi, _, _, + _, _, _, _), + [56] = PINGROUP(56, qup1_se6, ddr_bist_fail, _, _, _, _, _, _, _, _, _), + [57] = PINGROUP(57, qup1_se6, ddr_bist_complete, _, _, _, _, _, _, _, _, _), + [58] = PINGROUP(58, qup1_se6, _, _, _, _, _, _, _, _, _, _), + [59] = PINGROUP(59, qup1_se6, _, _, _, _, _, _, _, _, _, _), + [60] = PINGROUP(60, aoss_cti, _, _, _, _, _, _, _, _, _, _), + [61] = PINGROUP(61, aoss_cti, _, _, _, _, _, _, _, _, _, _), + [62] = PINGROUP(62, aoss_cti, _, _, _, _, _, _, _, _, _, _), + [63] = PINGROUP(63, aoss_cti, _, _, _, _, _, _, _, _, _, _), + [64] = PINGROUP(64, qup2_se0, ibi_i3c, gcc_gp2, _, _, _, _, _, _, _, _), + [65] = PINGROUP(65, qup2_se0, qup2_se3, ibi_i3c, atest_usb, ddr_pxi, _, _, _, _, + _, _), + [66] = PINGROUP(66, qup2_se0, qup2_se3, atest_usb, ddr_pxi, _, _, _, _, _, _, _), + [67] = PINGROUP(67, qup2_se0, qup2_se3, _, _, _, _, _, _, _, _, _), + [68] = PINGROUP(68, qup2_se1, ibi_i3c, _, _, _, _, _, _, _, _, _), + [69] = PINGROUP(69, qup2_se1, ibi_i3c, _, _, _, _, _, _, _, _, _), + [70] = PINGROUP(70, qup2_se1, _, _, _, _, _, _, _, _, _, _), + [71] = PINGROUP(71, qup2_se1, gcc_gp1, _, _, _, _, _, _, _, _, _), + [72] = PINGROUP(72, qup2_se2, gcc_gp1, atest_usb, ddr_pxi, _, _, _, _, _, _, _), + [73] = PINGROUP(73, qup2_se2, gcc_gp2, atest_usb, ddr_pxi, _, _, _, _, _, _, _), + [74] = PINGROUP(74, qup2_se2, gcc_gp3, _, _, _, _, _, _, _, _, _), + [75] = PINGROUP(75, qup2_se2, _, _, _, _, _, _, _, _, _, _), + [76] = PINGROUP(76, qup2_se3, phase_flag, _, _, _, _, _, _, _, _, _), + [77] = PINGROUP(77, qup2_se3, phase_flag, _, _, _, _, _, _, _, _, _), + [78] = PINGROUP(78, qup2_se3, phase_flag, _, _, _, _, _, _, _, _, _), + [79] = PINGROUP(79, qup2_se3, _, _, _, _, _, _, _, _, _, _), + [80] = PINGROUP(80, qup2_se4, qup2_se7, atest_usb, ddr_pxi, _, _, _, _, _, _, _), + [81] = PINGROUP(81, qup2_se4, qup2_se2, qup2_se7, atest_usb, ddr_pxi, _, _, _, + _, _, _), + [82] = PINGROUP(82, qup2_se4, qup2_se2, qup2_se7, gcc_gp3, _, _, _, _, _, _, _), + [83] = PINGROUP(83, qup2_se4, qup2_se2, qup2_se7, _, _, _, _, _, _, _, _), + [84] = PINGROUP(84, qup2_se5, _, _, _, _, _, _, _, _, _, _), + [85] = PINGROUP(85, qup2_se5, _, _, _, _, _, _, _, _, _, _), + [86] = PINGROUP(86, qup2_se5, _, _, _, _, _, _, _, _, _, _), + [87] = PINGROUP(87, qup2_se5, _, _, _, _, _, _, _, _, _, _), + [88] = PINGROUP(88, qup2_se6, _, _, _, _, _, _, _, _, _, _), + [89] = PINGROUP(89, qup2_se6, _, _, _, _, _, _, _, _, _, _), + [90] = PINGROUP(90, qup2_se6, _, _, _, _, _, _, _, _, _, _), + [91] = PINGROUP(91, qup2_se6, _, _, _, _, _, _, _, _, _, _), + [92] = PINGROUP(92, tmess_prng, _, _, _, _, _, _, _, _, _, _), + [93] = PINGROUP(93, tmess_prng, _, _, _, _, _, _, _, _, _, _), + [94] = PINGROUP(94, sys_throttle, tmess_prng, _, _, _, _, _, _, _, _, _), + [95] = PINGROUP(95, tmess_prng, _, _, _, _, _, _, _, _, _, _), + [96] = PINGROUP(96, cam_mclk, qdss_gpio, _, _, _, _, _, _, _, _, _), + [97] = PINGROUP(97, cam_mclk, qdss_gpio, _, _, _, _, _, _, _, _, _), + [98] = PINGROUP(98, cam_mclk, mdp_vsync_p, usb0_tmu, usb1_tmu, usb2_tmu, _, _, _, _, _, _), + [99] = PINGROUP(99, cam_mclk, qdss_gpio, _, _, _, _, _, _, _, _, _), + [100] = PINGROUP(100, cam_asc_mclk4, qdss_gpio, _, _, _, _, _, _, _, _, _), + [101] = 
PINGROUP(101, cci_i2c_sda, qdss_gpio, _, _, _, _, _, _, _, _, _), + [102] = PINGROUP(102, cci_i2c_scl, qdss_gpio, _, _, _, _, _, _, _, _, _), + [103] = PINGROUP(103, cci_i2c_sda, qdss_gpio, _, _, _, _, _, _, _, _, _), + [104] = PINGROUP(104, cci_i2c_scl, qdss_gpio, _, _, _, _, _, _, _, _, _), + [105] = PINGROUP(105, cci_i2c_sda, mdp_vsync_s, usb1_dbg, _, _, _, _, _, _, _, _), + [106] = PINGROUP(106, cci_i2c_scl, mdp_vsync_e, usb1_dbg, _, _, _, _, _, _, _, _), + [107] = PINGROUP(107, qdss_gpio, _, _, _, _, _, _, _, _, _, _), + [108] = PINGROUP(108, qdss_gpio, _, _, _, _, _, _, _, _, _, _), + [109] = PINGROUP(109, cci_timer, mdp_vsync_out, qdss_gpio, _, _, _, _, _, _, _, _), + [110] = PINGROUP(110, cci_timer, mdp_vsync_out, qdss_gpio, _, _, _, _, _, _, _, _), + [111] = PINGROUP(111, cci_timer, cci_async_in, mdp_vsync_out, qdss_gpio, _, _, _, _, + _, _, _), + [112] = PINGROUP(112, cci_timer, cci_async_in, mdp_vsync_out, qdss_gpio, _, _, _, _, + _, _, _), + [113] = PINGROUP(113, cci_timer, cci_async_in, mdp_vsync_out, qdss_gpio, _, _, _, _, + _, _, _), + [114] = PINGROUP(114, mdp_vsync_out, mdp_vsync_out, _, _, _, _, _, _, _, _, _), + [115] = PINGROUP(115, mdp_vsync_out, mdp_vsync_out, edp1_lcd, _, _, _, _, _, _, _, _), + [116] = PINGROUP(116, _, _, _, _, _, _, _, _, _, _, _), + [117] = PINGROUP(117, _, _, _, _, _, _, _, _, _, _, _), + [118] = PINGROUP(118, host2wlan_sol, _, _, _, _, _, _, _, _, _, _), + [119] = PINGROUP(119, edp0_hot, edp1_lcd, _, _, _, _, _, _, _, _, _), + [120] = PINGROUP(120, edp0_lcd, _, _, _, _, _, _, _, _, _, _), + [121] = PINGROUP(121, usb0_phy_ps, _, _, _, _, _, _, _, _, _, _), + [122] = PINGROUP(122, usb0_dp, _, _, _, _, _, _, _, _, _, _), + [123] = PINGROUP(123, usb1_phy_ps, _, _, _, _, _, _, _, _, _, _), + [124] = PINGROUP(124, usb1_dp, _, _, _, _, _, _, _, _, _, _), + [125] = PINGROUP(125, usb2_phy_ps, _, _, _, _, _, _, _, _, _, _), + [126] = PINGROUP(126, usb2_dp, _, _, _, _, _, _, _, _, _, _), + [127] = PINGROUP(127, qspi0, sdc4_clk, qup3_se0, _, _, _, _, _, _, _, _), + [128] = PINGROUP(128, qspi0, sdc4_data, qup3_se0, _, _, _, _, _, _, _, _), + [129] = PINGROUP(129, qspi0, sdc4_data, qup3_se0, _, _, _, _, _, _, _, _), + [130] = PINGROUP(130, qspi0, sdc4_data, qup3_se0, _, _, _, _, _, _, _, _), + [131] = PINGROUP(131, qspi0, sdc4_data, qup3_se0, _, _, _, _, _, _, _, _), + [132] = PINGROUP(132, qspi0, sdc4_cmd, qup3_se0, _, _, _, _, _, _, _, _), + [133] = PINGROUP(133, qspi0, tb_trig_sdc4, qup3_se0, _, _, _, _, _, _, _, _), + [134] = PINGROUP(134, audio_ext_mclk0, _, _, _, _, _, _, _, _, _, _), + [135] = PINGROUP(135, i2s0_sck, _, _, _, _, _, _, _, _, _, _), + [136] = PINGROUP(136, i2s0_data, _, _, _, _, _, _, _, _, _, _), + [137] = PINGROUP(137, i2s0_data, tb_trig_sdc2, _, _, _, _, _, _, _, _, _), + [138] = PINGROUP(138, i2s0_ws, tsense_pwm, _, _, _, _, _, _, _, _, _), + [139] = PINGROUP(139, i2s1_sck, tsense_pwm, _, _, _, _, _, _, _, _, _), + [140] = PINGROUP(140, i2s1_data, tsense_pwm, _, _, _, _, _, _, _, _, _), + [141] = PINGROUP(141, i2s1_ws, tsense_therm, _, _, _, _, _, _, _, _, _), + [142] = PINGROUP(142, i2s1_data, audio_ext_mclk1, audio_ref_clk, _, _, _, _, _, _, _, _), + [143] = PINGROUP(143, pcie3a_rst_n, _, _, _, _, _, _, _, _, _, _), + [144] = PINGROUP(144, pcie3a_clk, _, _, _, _, _, _, _, _, _, _), + [145] = PINGROUP(145, _, _, _, _, _, _, _, _, _, _, _), + [146] = PINGROUP(146, _, _, _, _, _, _, _, _, _, _, _), + [147] = PINGROUP(147, pcie4_clk_req_n, _, _, _, _, _, _, _, _, _, _), + [148] = PINGROUP(148, _, _, _, _, _, _, _, _, _, _, _), + [149] 
= PINGROUP(149, qdss_gpio, _, _, _, _, _, _, _, _, _, _), + [150] = PINGROUP(150, pcie6_clk_req_n, _, _, _, _, _, _, _, _, _, _), + [151] = PINGROUP(151, qdss_gpio, _, _, _, _, _, _, _, _, _, _), + [152] = PINGROUP(152, qdss_gpio, _, _, _, _, _, _, _, _, _, _), + [153] = PINGROUP(153, pcie5_clk_req_n, _, _, _, _, _, _, _, _, _, _), + [154] = PINGROUP(154, _, _, _, _, _, _, _, _, _, _, _), + [155] = PINGROUP(155, _, _, _, _, _, _, _, _, _, _, _), + [156] = PINGROUP(156, pcie3b_clk, _, _, _, _, _, _, _, _, _, _), + [157] = PINGROUP(157, _, _, _, _, _, _, _, _, _, _, _), + [158] = PINGROUP(158, _, _, _, _, _, _, _, _, _, _, _), + [159] = PINGROUP(159, _, _, _, _, _, _, _, _, _, _, _), + [160] = PINGROUP(160, resout_gpio_n, _, _, _, _, _, _, _, _, _, _), + [161] = PINGROUP(161, qdss_cti, _, _, _, _, _, _, _, _, _, _), + [162] = PINGROUP(162, sd_write_protect, qdss_cti, _, _, _, _, _, _, _, _, _), + [163] = PINGROUP(163, usb0_sbrx, prng_rosc, phase_flag, _, atest_char, _, _, _, + _, _, _), + [164] = PINGROUP(164, usb0_sbtx, prng_rosc, phase_flag, _, atest_char, _, _, _, _, _, + _), + [165] = PINGROUP(165, usb0_sbtx, _, _, _, _, _, _, _, _, _, _), + [166] = PINGROUP(166, _, _, _, _, _, _, _, _, _, _, _), + [167] = PINGROUP(167, _, _, _, _, _, _, _, _, _, _, _), + [168] = PINGROUP(168, eusb_ac_en, _, _, _, _, _, _, _, _, _, _), + [169] = PINGROUP(169, eusb_ac_en, _, _, _, _, _, _, _, _, _, _), + [170] = PINGROUP(170, _, _, _, _, _, _, _, _, _, _, _), + [171] = PINGROUP(171, _, _, _, _, _, _, _, _, _, _, _), + [172] = PINGROUP(172, usb1_sbrx, phase_flag, _, atest_char, _, _, _, _, _, _, _), + [173] = PINGROUP(173, usb1_sbtx, cri_trng, phase_flag, _, _, _, _, _, _, _, _), + [174] = PINGROUP(174, usb1_sbtx, _, _, _, _, _, _, _, _, _, _), + [175] = PINGROUP(175, _, _, _, _, _, _, _, _, _, _, _), + [176] = PINGROUP(176, _, _, _, _, _, _, _, _, _, _, _), + [177] = PINGROUP(177, eusb_ac_en, _, _, _, _, _, _, _, _, _, _), + [178] = PINGROUP(178, eusb_ac_en, _, _, _, _, _, _, _, _, _, _), + [179] = PINGROUP(179, _, _, _, _, _, _, _, _, _, _, _), + [180] = PINGROUP(180, _, _, _, _, _, _, _, _, _, _, _), + [181] = PINGROUP(181, usb2_sbrx, _, _, _, _, _, _, _, _, _, _), + [182] = PINGROUP(182, usb2_sbtx, _, _, _, _, _, _, _, _, _, _), + [183] = PINGROUP(183, usb2_sbtx, _, _, _, _, _, _, _, _, _, _), + [184] = PINGROUP(184, phase_flag, _, atest_char, _, _, _, _, _, _, _, _), + [185] = PINGROUP(185, _, _, _, _, _, _, _, _, _, _, _), + [186] = PINGROUP(186, eusb_ac_en, prng_rosc, phase_flag, _, _, _, _, _, _, _, _), + [187] = PINGROUP(187, eusb_ac_en, _, _, _, _, _, _, _, _, _, _), + [188] = PINGROUP(188, prng_rosc, phase_flag, _, atest_char, _, _, _, _, _, _, _), + [189] = PINGROUP(189, _, _, _, _, _, _, _, _, _, _, _), + [190] = PINGROUP(190, _, _, _, _, _, _, _, _, _, _, _), + [191] = PINGROUP(191, _, _, _, _, _, _, _, _, _, _, _), + [192] = PINGROUP(192, _, _, _, _, _, _, _, _, _, _, egpio), + [193] = PINGROUP(193, _, _, _, _, _, _, _, _, _, _, egpio), + [194] = PINGROUP(194, _, _, _, _, _, _, _, _, _, _, egpio), + [195] = PINGROUP(195, _, _, _, _, _, _, _, _, _, _, egpio), + [196] = PINGROUP(196, _, _, _, _, _, _, _, _, _, _, egpio), + [197] = PINGROUP(197, _, _, _, _, _, _, _, _, _, _, egpio), + [198] = PINGROUP(198, _, _, _, _, _, _, _, _, _, _, egpio), + [199] = PINGROUP(199, _, _, _, _, _, _, _, _, _, _, egpio), + [200] = PINGROUP(200, _, _, _, _, _, _, _, _, _, _, egpio), + [201] = PINGROUP(201, _, _, _, _, _, _, _, _, _, _, egpio), + [202] = PINGROUP(202, _, _, _, _, _, _, _, _, _, _, egpio), + [203] 
= PINGROUP(203, _, _, _, _, _, _, _, _, _, _, egpio), + [204] = PINGROUP(204, _, _, _, _, _, _, _, _, _, _, egpio), + [205] = PINGROUP(205, _, _, _, _, _, _, _, _, _, _, egpio), + [206] = PINGROUP(206, _, _, _, _, _, _, _, _, _, _, egpio), + [207] = PINGROUP(207, _, _, _, _, _, _, _, _, _, _, egpio), + [208] = PINGROUP(208, _, _, _, _, _, _, _, _, _, _, egpio), + [209] = PINGROUP(209, _, _, _, _, _, _, _, _, _, _, egpio), + [210] = PINGROUP(210, _, _, _, _, _, _, _, _, _, _, egpio), + [211] = PINGROUP(211, _, _, _, _, _, _, _, _, _, _, egpio), + [212] = PINGROUP(212, _, _, _, _, _, _, _, _, _, _, egpio), + [213] = PINGROUP(213, _, _, _, _, _, _, _, _, _, _, egpio), + [214] = PINGROUP(214, wcn_sw_ctrl, _, _, _, _, _, _, _, _, _, egpio), + [215] = PINGROUP(215, _, qdss_cti, _, _, _, _, _, _, _, _, egpio), + [216] = PINGROUP(216, _, _, _, _, _, _, _, _, _, _, egpio), + [217] = PINGROUP(217, _, qdss_cti, _, _, _, _, _, _, _, _, egpio), + [218] = PINGROUP(218, _, _, _, _, _, _, _, _, _, _, egpio), + [219] = PINGROUP(219, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [220] = PINGROUP(220, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [221] = PINGROUP(221, wcn_sw, _, qdss_gpio, _, _, _, _, _, _, _, egpio), + [222] = PINGROUP(222, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [223] = PINGROUP(223, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [224] = PINGROUP(224, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [225] = PINGROUP(225, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [226] = PINGROUP(226, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [227] = PINGROUP(227, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [228] = PINGROUP(228, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [229] = PINGROUP(229, _, _, _, _, _, _, _, _, _, _, egpio), + [230] = PINGROUP(230, _, _, _, _, _, _, _, _, _, _, egpio), + [231] = PINGROUP(231, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [232] = PINGROUP(232, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [233] = PINGROUP(233, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [234] = PINGROUP(234, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [235] = PINGROUP(235, asc_cci, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [236] = PINGROUP(236, asc_cci, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [237] = PINGROUP(237, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [238] = PINGROUP(238, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [239] = PINGROUP(239, _, _, _, _, _, _, _, _, _, _, egpio), + [240] = PINGROUP(240, _, _, _, _, _, _, _, _, _, _, egpio), + [241] = PINGROUP(241, _, _, _, _, _, _, _, _, _, _, egpio), + [242] = PINGROUP(242, _, _, _, _, _, _, _, _, _, _, egpio), + [243] = PINGROUP(243, _, _, _, _, _, _, _, _, _, _, egpio), + [244] = PINGROUP(244, _, _, _, _, _, _, _, _, _, _, egpio), + [245] = PINGROUP(245, smb_acok_n, _, _, _, _, _, _, _, _, _, _), + [246] = PINGROUP(246, _, _, _, _, _, _, _, _, _, _, _), + [247] = PINGROUP(247, qup3_se0, _, _, _, _, _, _, _, _, _, _), + [248] = PINGROUP(248, pmc_uva_n, _, _, _, _, _, _, _, _, _, _), + [249] = PINGROUP(249, pmc_oca_n, _, _, _, _, _, _, _, _, _, _), + [250] = UFS_RESET(ufs_reset, 0x104004, 0x105000), + [251] = SDC_QDSD_PINGROUP(sdc2_clk, 0xff000, 14, 6), + [252] = SDC_QDSD_PINGROUP(sdc2_cmd, 0xff000, 11, 3), + [253] = SDC_QDSD_PINGROUP(sdc2_data, 0xff000, 9, 0), +}; + +static const struct msm_gpio_wakeirq_map glymur_pdc_map[] = { + { 0, 116 }, { 2, 114 }, { 3, 115 }, { 4, 175 }, { 5, 176 }, + { 7, 111 }, { 11, 129 }, { 13, 130 }, { 15, 112 }, { 19, 113 }, + { 23, 187 }, { 27, 188 }, { 28, 121 
}, { 29, 122 }, { 30, 136 },
+	{ 31, 203 }, { 32, 189 }, { 34, 174 }, { 35, 190 }, { 36, 191 },
+	{ 39, 124 }, { 43, 192 }, { 47, 193 }, { 51, 123 }, { 53, 133 },
+	{ 55, 125 }, { 59, 131 }, { 64, 134 }, { 65, 150 }, { 66, 186 },
+	{ 67, 132 }, { 68, 195 }, { 71, 135 }, { 75, 196 }, { 79, 197 },
+	{ 83, 198 }, { 84, 181 }, { 85, 199 }, { 87, 200 }, { 91, 201 },
+	{ 92, 182 }, { 93, 183 }, { 94, 184 }, { 95, 185 }, { 98, 202 },
+	{ 105, 157 }, { 113, 128 }, { 121, 117 }, { 123, 118 }, { 125, 119 },
+	{ 129, 120 }, { 131, 126 }, { 132, 160 }, { 133, 194 }, { 134, 127 },
+	{ 141, 137 }, { 143, 159 }, { 144, 138 }, { 145, 139 }, { 147, 140 },
+	{ 148, 141 }, { 150, 146 }, { 151, 147 }, { 153, 148 }, { 154, 144 },
+	{ 156, 149 }, { 157, 151 }, { 163, 142 }, { 172, 143 }, { 181, 145 },
+	{ 193, 161 }, { 196, 152 }, { 203, 177 }, { 208, 178 }, { 215, 162 },
+	{ 217, 153 }, { 220, 154 }, { 221, 155 }, { 228, 179 }, { 230, 180 },
+	{ 232, 206 }, { 234, 172 }, { 235, 173 }, { 242, 158 }, { 244, 156 },
+};
+
+static const struct msm_pinctrl_soc_data glymur_tlmm = {
+	.pins = glymur_pins,
+	.npins = ARRAY_SIZE(glymur_pins),
+	.functions = glymur_functions,
+	.nfunctions = ARRAY_SIZE(glymur_functions),
+	.groups = glymur_groups,
+	.ngroups = ARRAY_SIZE(glymur_groups),
+	.ngpios = 251,
+	.wakeirq_map = glymur_pdc_map,
+	.nwakeirq_map = ARRAY_SIZE(glymur_pdc_map),
+	.egpio_func = 11,
+};
+
+static const struct of_device_id glymur_tlmm_of_match[] = {
+	{ .compatible = "qcom,glymur-tlmm", .data = &glymur_tlmm },
+	{ }
+};
+
+static int glymur_tlmm_probe(struct platform_device *pdev)
+{
+	return msm_pinctrl_probe(pdev, &glymur_tlmm);
+}
+
+static struct platform_driver glymur_tlmm_driver = {
+	.driver = {
+		.name = "glymur-tlmm",
+		.of_match_table = glymur_tlmm_of_match,
+	},
+	.probe = glymur_tlmm_probe,
+};
+
+static int __init glymur_tlmm_init(void)
+{
+	return platform_driver_register(&glymur_tlmm_driver);
+}
+arch_initcall(glymur_tlmm_init);
+
+static void __exit glymur_tlmm_exit(void)
+{
+	platform_driver_unregister(&glymur_tlmm_driver);
+}
+module_exit(glymur_tlmm_exit);
+
+MODULE_DESCRIPTION("QTI GLYMUR TLMM driver");
+MODULE_LICENSE("GPL");
+MODULE_DEVICE_TABLE(of, glymur_tlmm_of_match);

From 6f9674aa69ad0178ca8fc6995942ba9848c324f4 Mon Sep 17 00:00:00 2001
From: Christian Marangi
Date: Mon, 8 Sep 2025 13:34:24 +0200
Subject: [PATCH 1137/3206] pinctrl: airoha: fix wrong PHY LED mux value for LED1 GPIO46

In all the MUX values for LED1 GPIO46 there is a copy-paste error where
the MUX value is set to LED0_MODE_MASK instead of LED1_MODE_MASK.

This wasn't noticed, as no board made use of the secondary PHY LED, but
according to the internal documentation the actual value should be
LED1_MODE_MASK, similar to the other GPIO entries.

Fix the wrong value to apply the correct MUX configuration.
Cc: stable@vger.kernel.org Fixes: 1c8ace2d0725 ("pinctrl: airoha: Add support for EN7581 SoC") Signed-off-by: Christian Marangi Acked-by: Lorenzo Bianconi Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/pinctrl-airoha.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c index 1b2f132d76f0af..ee5b9bab455258 100644 --- a/drivers/pinctrl/mediatek/pinctrl-airoha.c +++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c @@ -1752,8 +1752,8 @@ static const struct airoha_pinctrl_func_group phy1_led1_func_group[] = { .regmap[0] = { AIROHA_FUNC_MUX, REG_GPIO_2ND_I2C_MODE, - GPIO_LAN3_LED0_MODE_MASK, - GPIO_LAN3_LED0_MODE_MASK + GPIO_LAN3_LED1_MODE_MASK, + GPIO_LAN3_LED1_MODE_MASK }, .regmap[1] = { AIROHA_FUNC_MUX, @@ -1816,8 +1816,8 @@ static const struct airoha_pinctrl_func_group phy2_led1_func_group[] = { .regmap[0] = { AIROHA_FUNC_MUX, REG_GPIO_2ND_I2C_MODE, - GPIO_LAN3_LED0_MODE_MASK, - GPIO_LAN3_LED0_MODE_MASK + GPIO_LAN3_LED1_MODE_MASK, + GPIO_LAN3_LED1_MODE_MASK }, .regmap[1] = { AIROHA_FUNC_MUX, @@ -1880,8 +1880,8 @@ static const struct airoha_pinctrl_func_group phy3_led1_func_group[] = { .regmap[0] = { AIROHA_FUNC_MUX, REG_GPIO_2ND_I2C_MODE, - GPIO_LAN3_LED0_MODE_MASK, - GPIO_LAN3_LED0_MODE_MASK + GPIO_LAN3_LED1_MODE_MASK, + GPIO_LAN3_LED1_MODE_MASK }, .regmap[1] = { AIROHA_FUNC_MUX, @@ -1944,8 +1944,8 @@ static const struct airoha_pinctrl_func_group phy4_led1_func_group[] = { .regmap[0] = { AIROHA_FUNC_MUX, REG_GPIO_2ND_I2C_MODE, - GPIO_LAN3_LED0_MODE_MASK, - GPIO_LAN3_LED0_MODE_MASK + GPIO_LAN3_LED1_MODE_MASK, + GPIO_LAN3_LED1_MODE_MASK }, .regmap[1] = { AIROHA_FUNC_MUX, From 3ec600210849cf122606e24caab85f0b936cf63c Mon Sep 17 00:00:00 2001 From: Fabien Proriol Date: Mon, 7 Jul 2025 17:55:08 +0200 Subject: [PATCH 1138/3206] power: supply: sbs-charger: Support multiple devices If we have 2 instances of sbs-charger in the DTS, the driver probe for the second instance will fail: [ 8.012874] sbs-battery 18-000b: sbs-battery: battery gas gauge device registered [ 8.039094] sbs-charger 18-0009: ltc4100: smart charger device registered [ 8.112911] sbs-battery 20-000b: sbs-battery: battery gas gauge device registered [ 8.134533] sysfs: cannot create duplicate filename '/class/power_supply/sbs-charger' [ 8.143871] CPU: 3 PID: 295 Comm: systemd-udevd Tainted: G O 5.10.147 #22 [ 8.151974] Hardware name: ALE AMB (DT) [ 8.155828] Call trace: [ 8.158292] dump_backtrace+0x0/0x1d4 [ 8.161960] show_stack+0x18/0x6c [ 8.165280] dump_stack+0xcc/0x128 [ 8.168687] sysfs_warn_dup+0x60/0x7c [ 8.172353] sysfs_do_create_link_sd+0xf0/0x100 [ 8.176886] sysfs_create_link+0x20/0x40 [ 8.180816] device_add+0x270/0x7a4 [ 8.184311] __power_supply_register+0x304/0x560 [ 8.188930] devm_power_supply_register+0x54/0xa0 [ 8.193644] sbs_probe+0xc0/0x214 [sbs_charger] [ 8.198183] i2c_device_probe+0x2dc/0x2f4 [ 8.202196] really_probe+0xf0/0x510 [ 8.205774] driver_probe_device+0xfc/0x160 [ 8.209960] device_driver_attach+0xc0/0xcc [ 8.214146] __driver_attach+0xc0/0x170 [ 8.218002] bus_for_each_dev+0x74/0xd4 [ 8.221862] driver_attach+0x24/0x30 [ 8.225444] bus_add_driver+0x148/0x250 [ 8.229283] driver_register+0x78/0x130 [ 8.233140] i2c_register_driver+0x4c/0xe0 [ 8.237250] sbs_driver_init+0x20/0x1000 [sbs_charger] [ 8.242424] do_one_initcall+0x50/0x1b0 [ 8.242434] do_init_module+0x44/0x230 [ 8.242438] load_module+0x2200/0x27c0 [ 8.242442] __do_sys_finit_module+0xa8/0x11c [ 8.242447] 
__arm64_sys_finit_module+0x20/0x30
[ 8.242457] el0_svc_common.constprop.0+0x64/0x154
[ 8.242464] do_el0_svc+0x24/0x8c
[ 8.242474] el0_svc+0x10/0x20
[ 8.242481] el0_sync_handler+0x108/0x114
[ 8.242485] el0_sync+0x180/0x1c0
[ 8.243847] sbs-charger 20-0009: Failed to register power supply
[ 8.287934] sbs-charger: probe of 20-0009 failed with error -17

This is mainly because the "name" field of power_supply_desc is a
constant. This patch fixes the issue by reusing the same approach as
sbs-battery.

With this patch, the result is:
[ 7.819532] sbs-charger 18-0009: ltc4100: smart charger device registered
[ 7.825305] sbs-battery 18-000b: sbs-battery: battery gas gauge device registered
[ 7.887423] sbs-battery 20-000b: sbs-battery: battery gas gauge device registered
[ 7.893501] sbs-charger 20-0009: ltc4100: smart charger device registered

Signed-off-by: Fabien Proriol
Signed-off-by: Sebastian Reichel
---
 drivers/power/supply/sbs-charger.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/power/supply/sbs-charger.c b/drivers/power/supply/sbs-charger.c
index 27764123b929e2..7d5e676205805d 100644
--- a/drivers/power/supply/sbs-charger.c
+++ b/drivers/power/supply/sbs-charger.c
@@ -154,8 +154,7 @@ static const struct regmap_config sbs_regmap = {
 	.val_format_endian = REGMAP_ENDIAN_LITTLE, /* since based on SMBus */
 };
 
-static const struct power_supply_desc sbs_desc = {
-	.name = "sbs-charger",
+static const struct power_supply_desc sbs_default_desc = {
 	.type = POWER_SUPPLY_TYPE_MAINS,
 	.properties = sbs_properties,
 	.num_properties = ARRAY_SIZE(sbs_properties),
@@ -165,9 +164,20 @@ static const struct power_supply_desc sbs_desc = {
 static int sbs_probe(struct i2c_client *client)
 {
 	struct power_supply_config psy_cfg = {};
+	struct power_supply_desc *sbs_desc;
 	struct sbs_info *chip;
 	int ret, val;
 
+	sbs_desc = devm_kmemdup(&client->dev, &sbs_default_desc,
+				sizeof(*sbs_desc), GFP_KERNEL);
+	if (!sbs_desc)
+		return -ENOMEM;
+
+	sbs_desc->name = devm_kasprintf(&client->dev, GFP_KERNEL, "sbs-%s",
+					dev_name(&client->dev));
+	if (!sbs_desc->name)
+		return -ENOMEM;
+
 	chip = devm_kzalloc(&client->dev, sizeof(struct sbs_info), GFP_KERNEL);
 	if (!chip)
 		return -ENOMEM;
@@ -191,7 +201,7 @@ static int sbs_probe(struct i2c_client *client)
 		return dev_err_probe(&client->dev, ret, "Failed to get device status\n");
 	chip->last_state = val;
 
-	chip->power_supply = devm_power_supply_register(&client->dev, &sbs_desc, &psy_cfg);
+	chip->power_supply = devm_power_supply_register(&client->dev, sbs_desc, &psy_cfg);
 	if (IS_ERR(chip->power_supply))
 		return dev_err_probe(&client->dev, PTR_ERR(chip->power_supply),
 				     "Failed to register power supply\n");

From 90bad684e9ac5ae435c2715fab36f6799849e800 Mon Sep 17 00:00:00 2001
From: Wensheng Wang
Date: Tue, 5 Aug 2025 18:20:20 +0800
Subject: [PATCH 1139/3206] hwmon: add MP29502 driver

Add support for MPS VR controller mp29502. This driver exposes
telemetry and limit value readings and writes.
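As background, several of the chip's telemetry registers use the PMBus
Linear11 encoding, which the driver must decode before rescaling. A minimal
standalone sketch of that decode step (mirroring the shape of the
mp29502_reg2data_linear11() helper added below; the function name here is
illustrative only):

	#include <stdint.h>

	/*
	 * PMBus Linear11: a 5-bit signed exponent in bits [15:11] and
	 * an 11-bit signed mantissa in bits [10:0].
	 */
	static int32_t linear11_decode(uint16_t word)
	{
		int16_t exponent = (int16_t)word >> 11;	/* sign-extend exponent */
		int32_t mantissa = ((int16_t)((word & 0x7ff) << 5)) >> 5; /* sign-extend mantissa */

		return exponent >= 0 ? mantissa << exponent : mantissa >> -exponent;
	}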
Signed-off-by: Wensheng Wang Link: https://lore.kernel.org/r/20250805102020.749850-3-wenswang@yeah.net [groeck: Fixed document formatting] Signed-off-by: Guenter Roeck --- Documentation/hwmon/index.rst | 1 + Documentation/hwmon/mp29502.rst | 93 +++++ MAINTAINERS | 7 + drivers/hwmon/pmbus/Kconfig | 9 + drivers/hwmon/pmbus/Makefile | 1 + drivers/hwmon/pmbus/mp29502.c | 670 ++++++++++++++++++++++++++++++++ 6 files changed, 781 insertions(+) create mode 100644 Documentation/hwmon/mp29502.rst create mode 100644 drivers/hwmon/pmbus/mp29502.c diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index 36303148dc43b2..b4d26a6fa3a5b1 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -176,6 +176,7 @@ Hardware Monitoring Kernel Drivers mp2869 mp2888 mp2891 + mp29502 mp2975 mp2993 mp5023 diff --git a/Documentation/hwmon/mp29502.rst b/Documentation/hwmon/mp29502.rst new file mode 100644 index 00000000000000..893e741a6b71cd --- /dev/null +++ b/Documentation/hwmon/mp29502.rst @@ -0,0 +1,93 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Kernel driver mp29502 +===================== + +Supported chips: + + * MPS mp29502 + + Prefix: 'mp29502' + +Author: + + Wensheng Wang + +Description +----------- + +This driver implements support for Monolithic Power Systems, Inc. (MPS) +MP29502 Digital Multi-phase Controller. + +Device compliant with: + +- PMBus rev 1.3 interface. + +The driver exports the following attributes via the 'sysfs' files +for input voltage: + +**in1_input** + +**in1_label** + +**in1_crit** + +**in1_crit_alarm** + +The driver provides the following attributes for output voltage: + +**in2_input** + +**in2_label** + +**in2_crit** + +**in2_crit_alarm** + +**in2_lcrit** + +**in2_lcrit_alarm** + +The driver provides the following attributes for input current: + +**curr1_input** + +**curr1_label** + +The driver provides the following attributes for output current: + +**curr2_input** + +**curr2_label** + +**curr2_crit** + +**curr2_crit_alarm** + +**curr2_max** + +**curr2_max_alarm** + +The driver provides the following attributes for input power: + +**power1_input** + +**power1_label** + +The driver provides the following attributes for output power: + +**power2_input** + +**power2_label** + +The driver provides the following attributes for temperature: + +**temp1_input** + +**temp1_crit** + +**temp1_crit_alarm** + +**temp1_max** + +**temp1_max_alarm** diff --git a/MAINTAINERS b/MAINTAINERS index 2f99cd97d62844..4e6f41ed4f2cff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17189,6 +17189,13 @@ S: Maintained F: Documentation/hwmon/mp2891.rst F: drivers/hwmon/pmbus/mp2891.c +MPS MP29502 DRIVER +M: Wensheng Wang +L: linux-hwmon@vger.kernel.org +S: Maintained +F: Documentation/hwmon/mp29502.rst +F: drivers/hwmon/pmbus/mp29502.c + MPS MP2993 DRIVER M: Noah Wang L: linux-hwmon@vger.kernel.org diff --git a/drivers/hwmon/pmbus/Kconfig b/drivers/hwmon/pmbus/Kconfig index c5b2c5e9573760..da04ff6df28bd1 100644 --- a/drivers/hwmon/pmbus/Kconfig +++ b/drivers/hwmon/pmbus/Kconfig @@ -401,6 +401,15 @@ config SENSORS_MP2891 This driver can also be built as a module. If so, the module will be called mp2891. +config SENSORS_MP29502 + tristate "MPS MP29502" + help + If you say yes here you get hardware monitoring support for MPS + MP29502 Dual Loop Digital Multi-Phase Controller. + + This driver can also be built as a module. If so, the module will + be called mp29502. 
+
 config SENSORS_MP2975
 	tristate "MPS MP2975"
 	help
diff --git a/drivers/hwmon/pmbus/Makefile b/drivers/hwmon/pmbus/Makefile
index 6177047414ee8a..4c5ff3f32c5ecb 100644
--- a/drivers/hwmon/pmbus/Makefile
+++ b/drivers/hwmon/pmbus/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_SENSORS_MP2856) += mp2856.o
 obj-$(CONFIG_SENSORS_MP2869) += mp2869.o
 obj-$(CONFIG_SENSORS_MP2888) += mp2888.o
 obj-$(CONFIG_SENSORS_MP2891) += mp2891.o
+obj-$(CONFIG_SENSORS_MP29502) += mp29502.o
 obj-$(CONFIG_SENSORS_MP2975) += mp2975.o
 obj-$(CONFIG_SENSORS_MP2993) += mp2993.o
 obj-$(CONFIG_SENSORS_MP5023) += mp5023.o
diff --git a/drivers/hwmon/pmbus/mp29502.c b/drivers/hwmon/pmbus/mp29502.c
new file mode 100644
index 00000000000000..7241373f155770
--- /dev/null
+++ b/drivers/hwmon/pmbus/mp29502.c
@@ -0,0 +1,670 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Hardware monitoring driver for MPS Multi-phase Digital VR Controllers (MP29502)
+ */
+
+#include <linux/bitfield.h>
+#include <linux/i2c.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include "pmbus.h"
+
+#define MFR_VOUT_SCALE_LOOP	0x29
+#define MFR_SVI3_IOUT_PRT	0x67
+#define MFR_READ_PIN_EST	0x94
+#define MFR_READ_IIN_EST	0x95
+#define MFR_VOUT_PROT1		0x3D
+#define MFR_VOUT_PROT2		0x51
+#define MFR_SLOPE_CNT_SET	0xA8
+#define MFR_TSNS_FLT_SET	0xBB
+
+#define MP29502_VIN_OV_GAIN		4
+#define MP29502_TEMP_LIMIT_OFFSET	40
+#define MP29502_READ_VOUT_DIV		1024
+#define MP29502_READ_IOUT_DIV		32
+#define MP29502_IOUT_LIMIT_UINT		8
+#define MP29502_OVUV_LIMIT_SCALE	10
+#define MP28502_VOUT_OV_GAIN		512
+#define MP28502_VOUT_OV_SCALE		40
+#define MP29502_VOUT_UV_OFFSET		36
+#define MP29502_PIN_GAIN		2
+#define MP29502_IIN_DIV			2
+
+#define MP29502_PAGE_NUM		1
+
+#define MP29502_RAIL_FUNC	(PMBUS_HAVE_VIN | PMBUS_HAVE_VOUT | \
+				 PMBUS_HAVE_IOUT | PMBUS_HAVE_POUT | \
+				 PMBUS_HAVE_TEMP | PMBUS_HAVE_PIN | \
+				 PMBUS_HAVE_IIN | \
+				 PMBUS_HAVE_STATUS_VOUT | \
+				 PMBUS_HAVE_STATUS_IOUT | \
+				 PMBUS_HAVE_STATUS_TEMP | \
+				 PMBUS_HAVE_STATUS_INPUT)
+
+struct mp29502_data {
+	struct pmbus_driver_info info;
+	int vout_scale;
+	int vout_bottom_div;
+	int vout_top_div;
+	int ovp_div;
+	int iout_scale;
+};
+
+#define to_mp29502_data(x) container_of(x, struct mp29502_data, info)
+
+static u16 mp29502_reg2data_linear11(u16 word)
+{
+	s16 exponent;
+	s32 mantissa;
+	s64 val;
+
+	exponent = ((s16)word) >> 11;
+	mantissa = ((s16)((word & 0x7ff) << 5)) >> 5;
+	val = mantissa;
+
+	if (exponent >= 0)
+		val <<= exponent;
+	else
+		val >>= -exponent;
+
+	return val;
+}
+
+static int
+mp29502_identify_vout_scale(struct i2c_client *client, struct pmbus_driver_info *info,
+			    int page)
+{
+	struct mp29502_data *data = to_mp29502_data(info);
+	int ret;
+
+	ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page);
+	if (ret < 0)
+		return ret;
+
+	ret = i2c_smbus_read_word_data(client, MFR_VOUT_SCALE_LOOP);
+	if (ret < 0)
+		return ret;
+
+	switch (FIELD_GET(GENMASK(12, 10), ret)) {
+	case 0:
+		data->vout_scale = 6400;
+		break;
+	case 1:
+		data->vout_scale = 5120;
+		break;
+	case 2:
+		data->vout_scale = 2560;
+		break;
+	case 3:
+		data->vout_scale = 2048;
+		break;
+	case 4:
+		data->vout_scale = 1024;
+		break;
+	case 5:
+		data->vout_scale = 4;
+		break;
+	case 6:
+		data->vout_scale = 2;
+		break;
+	case 7:
+		data->vout_scale = 1;
+		break;
+	default:
+		data->vout_scale = 1;
+		break;
+	}
+
+	return 0;
+}
+
+static int
+mp29502_identify_vout_divider(struct i2c_client *client, struct pmbus_driver_info *info,
+			      int page)
+{
+	struct mp29502_data *data = to_mp29502_data(info);
+	int ret;
+
+	ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page);
+	if (ret < 0)
+		return ret;
+
+	
ret = i2c_smbus_read_word_data(client, MFR_VOUT_PROT1); + if (ret < 0) + return ret; + + data->vout_bottom_div = FIELD_GET(GENMASK(11, 0), ret); + + ret = i2c_smbus_read_word_data(client, MFR_VOUT_PROT2); + if (ret < 0) + return ret; + + data->vout_top_div = FIELD_GET(GENMASK(14, 0), ret); + + return 0; +} + +static int +mp29502_identify_ovp_divider(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp29502_data *data = to_mp29502_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_SLOPE_CNT_SET); + if (ret < 0) + return ret; + + data->ovp_div = FIELD_GET(GENMASK(9, 0), ret); + + return 0; +} + +static int +mp29502_identify_iout_scale(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp29502_data *data = to_mp29502_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_SVI3_IOUT_PRT); + if (ret < 0) + return ret; + + switch (ret & GENMASK(2, 0)) { + case 0: + case 6: + data->iout_scale = 32; + break; + case 1: + data->iout_scale = 1; + break; + case 2: + data->iout_scale = 2; + break; + case 3: + data->iout_scale = 4; + break; + case 4: + data->iout_scale = 8; + break; + case 5: + data->iout_scale = 16; + break; + default: + data->iout_scale = 64; + break; + } + + return 0; +} + +static int mp29502_read_vout_ov_limit(struct i2c_client *client, struct mp29502_data *data) +{ + int ret; + int ov_value; + + /* + * This is because the vout ov fault limit value comes from + * page1 MFR_TSNS_FLT_SET reg, and other telemetry and limit + * value comes from page0 reg. So the page should be set to + * 0 after the reading of vout ov limit. + */ + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 1); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_TSNS_FLT_SET); + if (ret < 0) + return ret; + + ov_value = DIV_ROUND_CLOSEST(FIELD_GET(GENMASK(12, 7), ret) * + MP28502_VOUT_OV_GAIN * MP28502_VOUT_OV_SCALE, + data->ovp_div); + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 0); + if (ret < 0) + return ret; + + return ov_value; +} + +static int mp29502_write_vout_ov_limit(struct i2c_client *client, u16 word, + struct mp29502_data *data) +{ + int ret; + + /* + * This is because the vout ov fault limit value comes from + * page1 MFR_TSNS_FLT_SET reg, and other telemetry and limit + * value comes from page0 reg. So the page should be set to + * 0 after the writing of vout ov limit. 
+ */ + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 1); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_TSNS_FLT_SET); + if (ret < 0) + return ret; + + ret = i2c_smbus_write_word_data(client, MFR_TSNS_FLT_SET, + (ret & ~GENMASK(12, 7)) | + FIELD_PREP(GENMASK(12, 7), + DIV_ROUND_CLOSEST(word * data->ovp_div, + MP28502_VOUT_OV_GAIN * MP28502_VOUT_OV_SCALE))); + + return i2c_smbus_write_byte_data(client, PMBUS_PAGE, 0); +} + +static int mp29502_read_byte_data(struct i2c_client *client, int page, int reg) +{ + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 0); + if (ret < 0) + return ret; + + switch (reg) { + case PMBUS_VOUT_MODE: + ret = PB_VOUT_MODE_DIRECT; + break; + default: + ret = -ENODATA; + break; + } + + return ret; +} + +static int mp29502_read_word_data(struct i2c_client *client, int page, + int phase, int reg) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct mp29502_data *data = to_mp29502_data(info); + int ret; + + switch (reg) { + case PMBUS_STATUS_WORD: + ret = -ENODATA; + break; + case PMBUS_READ_VIN: + /* + * The MP29502 PMBUS_READ_VIN[10:0] is the vin value, the vin scale is + * 125mV/LSB. And the vin scale is set to 125mV/Lsb(using r/m/b scale) + * in MP29502 pmbus_driver_info struct, so the word data bit0-bit10 can + * be returned to pmbus core directly. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(10, 0), ret); + break; + case PMBUS_READ_VOUT: + /* + * The MP29502 PMBUS_READ_VOUT[11:0] is the vout value, and vout + * value is calculated based on vout scale and vout divider. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(11, 0)) * + data->vout_scale * + (data->vout_bottom_div + + 4 * data->vout_top_div), + MP29502_READ_VOUT_DIV * + data->vout_bottom_div); + break; + case PMBUS_READ_IIN: + /* + * The MP29502 MFR_READ_IIN_EST register is linear11 format, and the + * exponent is not a constant value. But the iin scale is set to + * 1A/Lsb(using r/m/b scale). As a result, the iin read from MP29502 + * should be calculated to A, then return the result to pmbus core. + */ + ret = pmbus_read_word_data(client, page, phase, MFR_READ_IIN_EST); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST(mp29502_reg2data_linear11(ret), + MP29502_IIN_DIV); + break; + case PMBUS_READ_PIN: + /* + * The MP29502 MFR_READ_PIN_EST register is linear11 format, and the + * exponent is not a constant value. But the pin scale is set to + * 1W/Lsb(using r/m/b scale). As a result, the pout read from MP29502 + * should be calculated to W, then return the result to pmbus core. + */ + ret = pmbus_read_word_data(client, page, phase, MFR_READ_PIN_EST); + if (ret < 0) + return ret; + + ret = mp29502_reg2data_linear11(ret) * MP29502_PIN_GAIN; + break; + case PMBUS_READ_POUT: + /* + * The MP29502 PMBUS_READ_POUT register is linear11 format, and the + * exponent is not a constant value. But the pout scale is set to + * 1W/Lsb(using r/m/b scale). As a result, the pout read from MP29502 + * should be calculated to W, then return the result to pmbus core. + * And the pout is calculated based on vout divider. 
+ */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST(mp29502_reg2data_linear11(ret) * + (data->vout_bottom_div + + 4 * data->vout_top_div), + data->vout_bottom_div); + break; + case PMBUS_READ_IOUT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(10, 0)) * data->iout_scale, + MP29502_READ_IOUT_DIV); + break; + case PMBUS_READ_TEMPERATURE_1: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(10, 0), ret); + break; + case PMBUS_VIN_OV_FAULT_LIMIT: + /* + * The MP29502 PMBUS_VIN_OV_FAULT_LIMIT is 500mV/Lsb, but + * the vin scale is set to 125mV/Lsb(using r/m/b scale), + * so the word data should multiply by 4. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(7, 0), ret) * MP29502_VIN_OV_GAIN; + break; + case PMBUS_VIN_UV_WARN_LIMIT: + case PMBUS_VIN_UV_FAULT_LIMIT: + /* + * The MP29502 PMBUS_VIN_UV_WARN_LIMIT and PMBUS_VIN_UV_FAULT_LIMIT + * scale is 125mV/Lsb, but the vin scale is set to 125mV/Lsb(using + * r/m/b scale), so the word data bit0-bit9 can be returned to pmbus + * core directly. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(9, 0), ret); + break; + case PMBUS_VOUT_OV_FAULT_LIMIT: + /* + * The MP29502 vout ov fault limit value comes from + * page1 MFR_TSNS_FLT_SET[12:7]. + */ + ret = mp29502_read_vout_ov_limit(client, data); + if (ret < 0) + return ret; + + break; + case PMBUS_VOUT_UV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((FIELD_GET(GENMASK(8, 0), ret) * + MP29502_OVUV_LIMIT_SCALE - + MP29502_VOUT_UV_OFFSET) * + (data->vout_bottom_div + + 4 * data->vout_top_div), + data->vout_bottom_div); + break; + case PMBUS_IOUT_OC_FAULT_LIMIT: + case PMBUS_IOUT_OC_WARN_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(7, 0)) * + data->iout_scale * + MP29502_IOUT_LIMIT_UINT, + MP29502_READ_IOUT_DIV); + break; + case PMBUS_OT_FAULT_LIMIT: + case PMBUS_OT_WARN_LIMIT: + /* + * The scale of MP29502 PMBUS_OT_FAULT_LIMIT and PMBUS_OT_WARN_LIMIT + * is 1°C/LSB and they have 40°C offset. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = (ret & GENMASK(7, 0)) - MP29502_TEMP_LIMIT_OFFSET; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mp29502_write_word_data(struct i2c_client *client, int page, int reg, + u16 word) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct mp29502_data *data = to_mp29502_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 0); + if (ret < 0) + return ret; + + switch (reg) { + case PMBUS_VIN_OV_FAULT_LIMIT: + /* + * The PMBUS_VIN_OV_FAULT_LIMIT[7:0] is the limit value, + * and bit8-bit15 should not be changed. The scale of + * PMBUS_VIN_OV_FAULT_LIMIT is 500mV/Lsb, but the vin + * scale is set to 125mV/Lsb(using r/m/b scale), so + * the word data should divide by 4. 
+ */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(7, 0)) | + FIELD_PREP(GENMASK(7, 0), + DIV_ROUND_CLOSEST(word, + MP29502_VIN_OV_GAIN))); + break; + case PMBUS_VIN_UV_WARN_LIMIT: + case PMBUS_VIN_UV_FAULT_LIMIT: + /* + * The PMBUS_VIN_UV_WARN_LIMIT[9:0] and PMBUS_VIN_UV_FAULT_LIMIT[9:0] + * are the limit value, and bit10-bit15 should not be changed. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(9, 0)) | + FIELD_PREP(GENMASK(9, 0), + word)); + break; + case PMBUS_VOUT_OV_FAULT_LIMIT: + ret = mp29502_write_vout_ov_limit(client, word, data); + if (ret < 0) + return ret; + + break; + case PMBUS_VOUT_UV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(8, 0)) | + FIELD_PREP(GENMASK(8, 0), + DIV_ROUND_CLOSEST(word * + data->vout_bottom_div + + MP29502_VOUT_UV_OFFSET * + (data->vout_bottom_div + + 4 * data->vout_top_div), + MP29502_OVUV_LIMIT_SCALE * + (data->vout_bottom_div + + 4 * data->vout_top_div)))); + break; + case PMBUS_IOUT_OC_FAULT_LIMIT: + case PMBUS_IOUT_OC_WARN_LIMIT: + ret = pmbus_write_word_data(client, page, reg, + DIV_ROUND_CLOSEST(word * + MP29502_READ_IOUT_DIV, + MP29502_IOUT_LIMIT_UINT * + data->iout_scale)); + break; + case PMBUS_OT_FAULT_LIMIT: + case PMBUS_OT_WARN_LIMIT: + /* + * The PMBUS_OT_FAULT_LIMIT[7:0] and PMBUS_OT_WARN_LIMIT[7:0] + * are the limit value, and bit8-bit15 should not be changed. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(7, 0)) | + FIELD_PREP(GENMASK(7, 0), + word + MP29502_TEMP_LIMIT_OFFSET)); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mp29502_identify(struct i2c_client *client, struct pmbus_driver_info *info) +{ + int ret; + + /* Identify vout scale */ + ret = mp29502_identify_vout_scale(client, info, 0); + if (ret < 0) + return ret; + + /* Identify vout divider. */ + ret = mp29502_identify_vout_divider(client, info, 1); + if (ret < 0) + return ret; + + /* Identify ovp divider. 
*/
+	ret = mp29502_identify_ovp_divider(client, info, 1);
+	if (ret < 0)
+		return ret;
+
+	/* Identify iout scale */
+	return mp29502_identify_iout_scale(client, info, 0);
+}
+
+static const struct pmbus_driver_info mp29502_info = {
+	.pages = MP29502_PAGE_NUM,
+	.format[PSC_VOLTAGE_IN] = direct,
+	.format[PSC_TEMPERATURE] = direct,
+	.format[PSC_CURRENT_IN] = direct,
+	.format[PSC_CURRENT_OUT] = direct,
+	.format[PSC_VOLTAGE_OUT] = direct,
+	.format[PSC_POWER] = direct,
+
+	.m[PSC_VOLTAGE_IN] = 8,
+	.R[PSC_VOLTAGE_IN] = 0,
+	.b[PSC_VOLTAGE_IN] = 0,
+
+	.m[PSC_VOLTAGE_OUT] = 1,
+	.R[PSC_VOLTAGE_OUT] = 3,
+	.b[PSC_VOLTAGE_OUT] = 0,
+
+	.m[PSC_TEMPERATURE] = 1,
+	.R[PSC_TEMPERATURE] = 0,
+	.b[PSC_TEMPERATURE] = 0,
+
+	.m[PSC_CURRENT_IN] = 1,
+	.R[PSC_CURRENT_IN] = 0,
+	.b[PSC_CURRENT_IN] = 0,
+
+	.m[PSC_CURRENT_OUT] = 1,
+	.R[PSC_CURRENT_OUT] = 0,
+	.b[PSC_CURRENT_OUT] = 0,
+
+	.m[PSC_POWER] = 1,
+	.R[PSC_POWER] = 0,
+	.b[PSC_POWER] = 0,
+
+	.func[0] = MP29502_RAIL_FUNC,
+	.read_word_data = mp29502_read_word_data,
+	.read_byte_data = mp29502_read_byte_data,
+	.write_word_data = mp29502_write_word_data,
+	.identify = mp29502_identify,
+};
+
+static int mp29502_probe(struct i2c_client *client)
+{
+	struct pmbus_driver_info *info;
+	struct mp29502_data *data;
+
+	data = devm_kzalloc(&client->dev, sizeof(struct mp29502_data),
+			    GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	memcpy(&data->info, &mp29502_info, sizeof(*info));
+	info = &data->info;
+
+	return pmbus_do_probe(client, info);
+}
+
+static const struct i2c_device_id mp29502_id[] = {
+	{"mp29502", 0},
+	{}
+};
+MODULE_DEVICE_TABLE(i2c, mp29502_id);
+
+static const struct of_device_id __maybe_unused mp29502_of_match[] = {
+	{.compatible = "mps,mp29502"},
+	{}
+};
+MODULE_DEVICE_TABLE(of, mp29502_of_match);
+
+static struct i2c_driver mp29502_driver = {
+	.driver = {
+		.name = "mp29502",
+		.of_match_table = mp29502_of_match,
+	},
+	.probe = mp29502_probe,
+	.id_table = mp29502_id,
+};
+
+module_i2c_driver(mp29502_driver);
+
+MODULE_AUTHOR("Wensheng Wang <wenswang@yeah.net>");
+MODULE_DESCRIPTION("PMBus driver for MPS MP29502 device");
+MODULE_LICENSE("GPL");

From: Guenter Roeck
Date: Thu, 5 Jun 2025 16:23:57 -0700
Subject: [PATCH 1140/3206] hwmon: Serialize accesses in hwmon core

Implement locking in the hardware monitoring core for drivers using the
_with_info() API functions.

Most hardware monitoring drivers need to support locking to protect
against parallel accesses from userspace. With older API functions, such
locking had to be implemented in the driver code since sysfs attributes
were created by the driver. However, the _with_info() API creates sysfs
attributes in the hardware monitoring core. This makes it easy to move
the locking primitives into that code. This has the benefit of
simplifying driver code while at the same time reducing the risk of
incomplete or bad locking implementations in hardware monitoring
drivers.

While this means that all accesses are forced to be synchronized, this
has little if any practical impact since accesses are expected to be low
frequency and are typically synchronized from userspace anyway since
only a single process is accessing the data. On top of that, many
drivers use regmap, which also has its own locking scheme and already
serializes accesses.
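For drivers that still need explicit serialization outside the core's
callbacks, the new exported helpers can be used directly. A minimal sketch
(the "foo" chip structure, cached field, and interrupt wiring are
hypothetical; only hwmon_lock()/hwmon_unlock() come from this patch):

	#include <linux/hwmon.h>
	#include <linux/interrupt.h>

	/* Hypothetical driver state; hwmon_dev is the registered hwmon device. */
	struct foo_data {
		struct device *hwmon_dev;
		bool alarm_cached;
	};

	static irqreturn_t foo_alarm_irq(int irq, void *dev_id)
	{
		struct foo_data *data = dev_id;

		/*
		 * Take the same mutex the hwmon core now holds around the
		 * ->read()/->write() callbacks, so the handler cannot race
		 * with a concurrent sysfs access.
		 */
		hwmon_lock(data->hwmon_dev);
		data->alarm_cached = true;	/* state also consumed by ->read() */
		hwmon_unlock(data->hwmon_dev);

		return IRQ_HANDLED;
	}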
Signed-off-by: Guenter Roeck --- Documentation/hwmon/hwmon-kernel-api.rst | 10 ++++++ drivers/hwmon/hwmon.c | 42 ++++++++++++++++++++---- include/linux/hwmon.h | 3 ++ 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/Documentation/hwmon/hwmon-kernel-api.rst b/Documentation/hwmon/hwmon-kernel-api.rst index 037b69c23cb593..1d7f1397a82744 100644 --- a/Documentation/hwmon/hwmon-kernel-api.rst +++ b/Documentation/hwmon/hwmon-kernel-api.rst @@ -42,6 +42,9 @@ register/unregister functions:: char *devm_hwmon_sanitize_name(struct device *dev, const char *name); + void hwmon_lock(struct device *dev); + void hwmon_unlock(struct device *dev); + hwmon_device_register_with_info registers a hardware monitoring device. It creates the standard sysfs attributes in the hardware monitoring core, letting the driver focus on reading from and writing to the chip instead @@ -79,6 +82,13 @@ devm_hwmon_sanitize_name is the resource managed version of hwmon_sanitize_name; the memory will be freed automatically on device removal. +When using ``[devm_]hwmon_device_register_with_info()`` to register the +hardware monitoring device, accesses using the associated access functions +are serialised by the hardware monitoring core. If a driver needs locking +for other functions such as interrupt handlers or for attributes which are +fully implemented in the driver, hwmon_lock() and hwmon_unlock() can be used +to ensure that calls to those functions are serialized. + Using devm_hwmon_device_register_with_info() -------------------------------------------- diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 2e17f3a4c59bdc..0b4bdcd33c7b8d 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ struct hwmon_device { const char *label; struct device dev; const struct hwmon_chip_info *chip; + struct mutex lock; struct list_head tzdata; struct attribute_group group; const struct attribute_group **groups; @@ -165,6 +167,8 @@ static int hwmon_thermal_get_temp(struct thermal_zone_device *tz, int *temp) int ret; long t; + guard(mutex)(&hwdev->lock); + ret = hwdev->chip->ops->read(tdata->dev, hwmon_temp, hwmon_temp_input, tdata->index, &t); if (ret < 0) @@ -193,6 +197,8 @@ static int hwmon_thermal_set_trips(struct thermal_zone_device *tz, int low, int if (!info[i]) return 0; + guard(mutex)(&hwdev->lock); + if (info[i]->config[tdata->index] & HWMON_T_MIN) { err = chip->ops->write(tdata->dev, hwmon_temp, hwmon_temp_min, tdata->index, low); @@ -330,8 +336,6 @@ static int hwmon_attr_base(enum hwmon_sensor_types type) * attached to an i2c client device. */ -static DEFINE_MUTEX(hwmon_pec_mutex); - static int hwmon_match_device(struct device *dev, const void *data) { return dev->class == &hwmon_class; @@ -362,17 +366,16 @@ static ssize_t pec_store(struct device *dev, struct device_attribute *devattr, if (!hdev) return -ENODEV; - mutex_lock(&hwmon_pec_mutex); - /* * If there is no write function, we assume that chip specific * handling is not required. 
*/ hwdev = to_hwmon_device(hdev); + guard(mutex)(&hwdev->lock); if (hwdev->chip->ops->write) { err = hwdev->chip->ops->write(hdev, hwmon_chip, hwmon_chip_pec, 0, val); if (err && err != -EOPNOTSUPP) - goto unlock; + goto put; } if (!val) @@ -381,8 +384,7 @@ static ssize_t pec_store(struct device *dev, struct device_attribute *devattr, client->flags |= I2C_CLIENT_PEC; err = count; -unlock: - mutex_unlock(&hwmon_pec_mutex); +put: put_device(hdev); return err; @@ -426,10 +428,13 @@ static ssize_t hwmon_attr_show(struct device *dev, struct device_attribute *devattr, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); s64 val64; long val; int ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->read(dev, hattr->type, hattr->attr, hattr->index, (hattr->type == hwmon_energy64) ? (long *)&val64 : &val); if (ret < 0) @@ -449,10 +454,13 @@ static ssize_t hwmon_attr_show_string(struct device *dev, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); enum hwmon_sensor_types type = hattr->type; const char *s; int ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->read_string(dev, hattr->type, hattr->attr, hattr->index, &s); if (ret < 0) @@ -469,6 +477,7 @@ static ssize_t hwmon_attr_store(struct device *dev, const char *buf, size_t count) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); long val; int ret; @@ -476,6 +485,8 @@ static ssize_t hwmon_attr_store(struct device *dev, if (ret < 0) return ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->write(dev, hattr->type, hattr->attr, hattr->index, val); if (ret < 0) @@ -791,6 +802,22 @@ int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, } EXPORT_SYMBOL_GPL(hwmon_notify_event); +void hwmon_lock(struct device *dev) +{ + struct hwmon_device *hwdev = to_hwmon_device(dev); + + mutex_lock(&hwdev->lock); +} +EXPORT_SYMBOL_GPL(hwmon_lock); + +void hwmon_unlock(struct device *dev) +{ + struct hwmon_device *hwdev = to_hwmon_device(dev); + + mutex_unlock(&hwdev->lock); +} +EXPORT_SYMBOL_GPL(hwmon_unlock); + static int hwmon_num_channel_attrs(const struct hwmon_channel_info *info) { int i, n; @@ -951,6 +978,7 @@ __hwmon_device_register(struct device *dev, const char *name, void *drvdata, tdev = tdev->parent; hdev->of_node = tdev ? tdev->of_node : NULL; hwdev->chip = chip; + mutex_init(&hwdev->lock); dev_set_drvdata(hdev, drvdata); dev_set_name(hdev, HWMON_ID_FORMAT, id); err = device_register(hdev); diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 886fc90b2d2536..301a83afbd6636 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -492,6 +492,9 @@ int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, char *hwmon_sanitize_name(const char *name); char *devm_hwmon_sanitize_name(struct device *dev, const char *name); +void hwmon_lock(struct device *dev); +void hwmon_unlock(struct device *dev); + /** * hwmon_is_bad_char - Is the char invalid in a hwmon name * @ch: the char to be considered From 77b8e6fbf9848d651f5cb7508f18ad0971f3ffdb Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 8 Sep 2025 15:52:02 +0200 Subject: [PATCH 1141/3206] dm-integrity: limit MAX_TAG_SIZE to 255 MAX_TAG_SIZE was 0x1a8 and it may be truncated in the "bi->metadata_size = ic->tag_size" assignment. We need to limit it to 255. 
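For reference, the truncation is plain 8-bit wraparound: the commit message implies bi->metadata_size is an 8-bit field, so any tag size above 255 loses its high bits on assignment. A standalone illustration (userspace C, not kernel code):

```
#include <stdio.h>

int main(void)
{
	unsigned int tag_size = 0x1a8;		/* 424, legal under the old bound */
	unsigned char metadata_size = tag_size;	/* 8-bit assignment truncates */

	printf("%u\n", metadata_size);		/* prints 168 (0xa8): bit 8 is lost */
	return 0;
}
```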
Signed-off-by: Mikulas Patocka --- drivers/md/dm-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index efeee0a873c064..ab96b692e5a3eb 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -133,7 +133,7 @@ struct journal_sector { commit_id_t commit_id; }; -#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) +#define MAX_TAG_SIZE 255 #define METADATA_PADDING_SECTORS 8 From a679e5683d3eef22ca12514ff8784b2b914ebedc Mon Sep 17 00:00:00 2001 From: Bala-Vignesh-Reddy Date: Fri, 8 Aug 2025 13:38:30 +0530 Subject: [PATCH 1142/3206] selftests: arm64: Check fread return value in exec_target Fix -Wunused-result warning generated when compiled with gcc 13.3.0, by checking fread's return value and handling errors, preventing potential failures when reading from stdin. Fixes compiler warning: warning: ignoring return value of 'fread' declared with attribute 'warn_unused_result' [-Wunused-result] Fixes: 806a15b2545e ("kselftests/arm64: add PAuth test for whether exec() changes keys") Signed-off-by: Bala-Vignesh-Reddy Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/pauth/exec_target.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/pauth/exec_target.c b/tools/testing/selftests/arm64/pauth/exec_target.c index 4435600ca400dd..e597861b26d6bf 100644 --- a/tools/testing/selftests/arm64/pauth/exec_target.c +++ b/tools/testing/selftests/arm64/pauth/exec_target.c @@ -13,7 +13,12 @@ int main(void) unsigned long hwcaps; size_t val; - fread(&val, sizeof(size_t), 1, stdin); + size_t size = fread(&val, sizeof(size_t), 1, stdin); + + if (size != 1) { + fprintf(stderr, "Could not read input from stdin\n"); + return EXIT_FAILURE; + } /* don't try to execute illegal (unimplemented) instructions) caller * should have checked this and keep worker simple From 791d703baddd141dc7b80a69bcd237b6b79150ea Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 12 Aug 2025 15:37:00 +0100 Subject: [PATCH 1143/3206] kselftest/arm64: Log error codes in sve-ptrace Use ksft_perror() to report error codes from failing ptrace operations to make it easier to interpret logs when things go wrong. 
Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index b22303778fb0ae..4cba3bcff66002 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -95,19 +95,27 @@ static int do_child(void) static int get_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd) { struct iovec iov; + int ret; iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); - return ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_GETREGSET)"); + return ret; } static int set_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd) { struct iovec iov; + int ret; iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); - return ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + ret = ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_SETREGSET)"); + return ret; } static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, @@ -117,6 +125,7 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, void *p; size_t sz = sizeof *sve; struct iovec iov; + int ret; while (1) { if (*size < sz) { @@ -132,8 +141,11 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, iov.iov_base = *buf; iov.iov_len = sz; - if (ptrace(PTRACE_GETREGSET, pid, type->regset, &iov)) + ret = ptrace(PTRACE_GETREGSET, pid, type->regset, &iov); + if (ret) { + ksft_perror("ptrace(PTRACE_GETREGSET)"); goto error; + } sve = *buf; if (sve->size <= sz) @@ -152,10 +164,14 @@ static int set_sve(pid_t pid, const struct vec_type *type, const struct user_sve_header *sve) { struct iovec iov; + int ret; iov.iov_base = (void *)sve; iov.iov_len = sve->size; - return ptrace(PTRACE_SETREGSET, pid, type->regset, &iov); + ret = ptrace(PTRACE_SETREGSET, pid, type->regset, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_SETREGSET)"); + return ret; } /* Validate setting and getting the inherit flag */ From 50af02425afc72b1b47c4a0a0b9c9bdaa1a1b347 Mon Sep 17 00:00:00 2001 From: Bala-Vignesh-Reddy Date: Thu, 7 Aug 2025 17:12:29 +0530 Subject: [PATCH 1144/3206] selftests: arm64: Fix -Waddress warning in tpidr2 test Thanks to -Waddress, the compiler warns that the ksft_test_result() invocations in the arm64 tpidr2 selftest are always true. Oops. Fix the test by, err, actually running the test functions. 
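To see why the broken invocations always passed: without the call parentheses, the function name decays to a function pointer, which is never NULL, so the condition handed to ksft_test_result() is unconditionally true, which is exactly what -Waddress complains about. A contrived standalone illustration (not part of the selftest):

```
#include "kselftest.h"

static int always_fails(void)
{
	return 0;	/* the test logic says "fail" */
}

int main(void)
{
	ksft_print_header();
	ksft_set_plan(2);
	ksft_test_result(always_fails, "pointer\n");	/* non-NULL pointer: bogus PASS */
	ksft_test_result(always_fails(), "call\n");	/* runs the test: real FAIL */
	ksft_finished();
}
```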
Fixes: 6d80cb73131d ("kselftest/arm64: Convert tpidr2 test to use kselftest.h") Signed-off-by: Bala-Vignesh-Reddy Reviewed-by: Anshuman Khandual Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/tpidr2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index f58a9f89b952c4..4c89ab0f101018 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -227,10 +227,10 @@ int main(int argc, char **argv) ret = open("/proc/sys/abi/sme_default_vector_length", O_RDONLY, 0); if (ret >= 0) { ksft_test_result(default_value(), "default_value\n"); - ksft_test_result(write_read, "write_read\n"); - ksft_test_result(write_sleep_read, "write_sleep_read\n"); - ksft_test_result(write_fork_read, "write_fork_read\n"); - ksft_test_result(write_clone_read, "write_clone_read\n"); + ksft_test_result(write_read(), "write_read\n"); + ksft_test_result(write_sleep_read(), "write_sleep_read\n"); + ksft_test_result(write_fork_read(), "write_fork_read\n"); + ksft_test_result(write_clone_read(), "write_clone_read\n"); } else { ksft_print_msg("SME support not present\n"); From 740cdafd0d998903c1faeee921028a8a78698be5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:13:02 +0200 Subject: [PATCH 1145/3206] kselftest/arm64/gcs: Correctly check return value when disabling GCS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The return value was not assigned to 'ret', so the check afterwards does not do anything. Fixes: 3d37d4307e0f ("kselftest/arm64: Add very basic GCS test program") Signed-off-by: Thomas Weißschuh Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/gcs/basic-gcs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c index 54f9c888249d74..100d2a983155f7 100644 --- a/tools/testing/selftests/arm64/gcs/basic-gcs.c +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -410,7 +410,7 @@ int main(void) } /* One last test: disable GCS, we can do this one time */ - my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); + ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); if (ret != 0) ksft_print_msg("Failed to disable GCS: %d\n", ret); From a985fe638344492727528e52416211dda1c391d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:13:03 +0200 Subject: [PATCH 1146/3206] kselftest/arm64/gcs: Use nolibc's getauxval() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nolibc now does have getauxval(), use it. 
Signed-off-by: Thomas Weißschuh Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/gcs/basic-gcs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c index 100d2a983155f7..250977abc39886 100644 --- a/tools/testing/selftests/arm64/gcs/basic-gcs.c +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -10,6 +10,7 @@ #include #include +#include #include #include "kselftest.h" @@ -386,14 +387,13 @@ int main(void) ksft_print_header(); - /* - * We don't have getauxval() with nolibc so treat a failure to - * read GCS state as a lack of support and skip. - */ + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) + ksft_exit_skip("SKIP GCS not supported\n"); + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_skip("Failed to read GCS state: %d\n", ret); + ksft_exit_fail_msg("Failed to read GCS state: %d\n", ret); if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { gcs_mode = PR_SHADOW_STACK_ENABLE; From 3198780eaf37c071052edac109d99bff77e6ce5c Mon Sep 17 00:00:00 2001 From: Vivek Yadav Date: Sat, 23 Aug 2025 23:14:00 -0700 Subject: [PATCH 1147/3206] kselftest/arm64: Remove extra blank line Remove an unnecessary blank line to improve code style consistency. ``` [command] ./scripts/checkpatch.pl --strict -f [output] CHECK: Please don't use multiple blank lines CHECK: Blank lines aren't necessary before a close brace '}' ``` Signed-off-by: Vivek Yadav Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/hwcap.c | 1 - tools/testing/selftests/arm64/bti/assembler.h | 1 - tools/testing/selftests/arm64/fp/fp-ptrace.c | 1 - tools/testing/selftests/arm64/fp/vec-syscfg.c | 1 - tools/testing/selftests/arm64/fp/zt-ptrace.c | 1 - tools/testing/selftests/arm64/gcs/gcs-locking.c | 1 - 6 files changed, 6 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 002ec38a8bbbf1..27d4790c2f0c22 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -55,7 +55,6 @@ static void cmpbr_sigill(void) /* Not implemented, too complicated and unreliable anyway */ } - static void crc32_sigill(void) { /* CRC32W W0, W0, W1 */ diff --git a/tools/testing/selftests/arm64/bti/assembler.h b/tools/testing/selftests/arm64/bti/assembler.h index 04e7b72880ef99..141cdcbf0b8fac 100644 --- a/tools/testing/selftests/arm64/bti/assembler.h +++ b/tools/testing/selftests/arm64/bti/assembler.h @@ -14,7 +14,6 @@ #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) - .macro startfn name:req .globl \name \name: diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index 124bc883365e44..3dc195f977baf9 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1569,7 +1569,6 @@ static void run_sve_tests(void) &test_config); } } - } static void run_sme_tests(void) diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c index ea9c7d47790f9d..2d75d342eeb9d4 100644 --- a/tools/testing/selftests/arm64/fp/vec-syscfg.c +++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c @@ -690,7 +690,6 @@ static inline void smstop(void) asm volatile("msr S0_3_C4_C6_3, xzr"); } - /* * Verify we can change the SVE vector length while SME is active and * continue to 
use SME afterwards. diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c index 584b8d59b7ea1c..a7f34040fbf10b 100644 --- a/tools/testing/selftests/arm64/fp/zt-ptrace.c +++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c @@ -108,7 +108,6 @@ static int get_zt(pid_t pid, char zt[ZT_SIG_REG_BYTES]) return ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZT, &iov); } - static int set_zt(pid_t pid, const char zt[ZT_SIG_REG_BYTES]) { struct iovec iov; diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c index 989f75a491b757..1e6abb136ffd07 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-locking.c +++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c @@ -165,7 +165,6 @@ TEST_F(valid_modes, lock_enable_disable_others) ASSERT_EQ(ret, 0); ASSERT_EQ(mode, PR_SHADOW_STACK_ALL_MODES); - ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, variant->mode); ASSERT_EQ(ret, 0); From a940568ccde433a0d06aadd4735f7974fd2c59e1 Mon Sep 17 00:00:00 2001 From: Vivek Yadav Date: Sat, 23 Aug 2025 23:14:01 -0700 Subject: [PATCH 1148/3206] kselftest/arm64: Suppress warning and improve readability The comment was correct, but the `checkpatch` script flagged it with a warning as shown in the output section. The comment is slightly modified to improve readability, which also suppresses the warning. ``` [command] ./scripts/checkpatch.pl --strict -f tools/testing/selftests/arm64/fp/fp-stress.c [output] WARNING: Possible repeated word: 'on' ``` Signed-off-by: Vivek Yadav Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/fp-stress.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index 74e23208b94cab..3a0ae96cf9096a 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -105,8 +105,8 @@ static void child_start(struct child_data *child, const char *program) /* * Read from the startup pipe, there should be no data - * and we should block until it is closed. We just - * carry on on error since this isn't super critical. + * and we should block until it is closed. We just + * carry-on on error since this isn't super critical. */ ret = read(3, &i, sizeof(i)); if (ret < 0) From 62e8a9fbaad147b65bda9362c2d8a52a86a0bac3 Mon Sep 17 00:00:00 2001 From: Vivek Yadav Date: Sat, 23 Aug 2025 23:14:02 -0700 Subject: [PATCH 1149/3206] kselftest/arm64: Add parentheses around sizeof for clarity Added parentheses around sizeof to make the expression clearer and improve readability. This change has no functional impact. 
``` [command] ./scripts/checkpatch.pl tools/testing/selftests/arm64/fp/sve-ptrace.c [output] WARNING: sizeof *sve should be sizeof(*sve) ``` Signed-off-by: Vivek Yadav Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index 4cba3bcff66002..79bcc2369cdb73 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -123,7 +123,7 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, { struct user_sve_header *sve; void *p; - size_t sz = sizeof *sve; + size_t sz = sizeof(*sve); struct iovec iov; int ret; From 34c605fe53d49886d2741223b12950a33bdf2acf Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 26 Aug 2025 16:56:06 +0200 Subject: [PATCH 1150/3206] xen: rework xen_pv_domain() Rework xen_pv_domain() to no longer use the xen_domain_type variable, but the artificial X86_FEATURE_XENPV cpu feature. On non-x86 architectures xen_pv_domain() can be defined as "0". This has the advantage that a kernel not built with CONFIG_XEN_PV will be smaller due to dead code elimination. Set the X86_FEATURE_XENPV feature very early, as xen_pv_domain() is used rather early, too. Reviewed-by: Jason Andryuk Signed-off-by: Juergen Gross Message-ID: <20250826145608.10352-2-jgross@suse.com> --- arch/x86/xen/enlighten_pv.c | 2 +- include/xen/xen.h | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 26bbaf4b7330b4..4806cc28d7ca77 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -382,7 +382,6 @@ static bool __init xen_check_xsave(void) static void __init xen_init_capabilities(void) { - setup_force_cpu_cap(X86_FEATURE_XENPV); setup_clear_cpu_cap(X86_FEATURE_DCA); setup_clear_cpu_cap(X86_FEATURE_APERFMPERF); setup_clear_cpu_cap(X86_FEATURE_MTRR); @@ -1402,6 +1401,7 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) JMP32_INSN_SIZE); xen_domain_type = XEN_PV_DOMAIN; + setup_force_cpu_cap(X86_FEATURE_XENPV); xen_start_flags = xen_start_info->flags; /* Interrupts are guaranteed to be off initially. */ early_boot_irqs_disabled = true; diff --git a/include/xen/xen.h b/include/xen/xen.h index a1e5b3f18d69f9..61854e3f283776 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -22,8 +22,15 @@ extern bool xen_pvh; #define xen_pvh 0 #endif +#ifdef CONFIG_X86 +#include + +#define xen_pv_domain() (cpu_feature_enabled(X86_FEATURE_XENPV)) +#else +#define xen_pv_domain() 0 +#endif + #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) #define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) #define xen_pvh_domain() (xen_pvh) From 0f4283123fe1e6016296048d0fdcfce615047a13 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 26 Aug 2025 16:56:07 +0200 Subject: [PATCH 1151/3206] xen: replace XENFEAT_auto_translated_physmap with xen_pv_domain() Instead of testing the XENFEAT_auto_translated_physmap feature, just use !xen_pv_domain() which is equivalent. This has the advantage that a kernel not built with CONFIG_XEN_PV will be smaller due to dead code elimination. 
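The dead-code-elimination claim in both patches follows from xen_pv_domain() now folding to a compile-time constant when PV support is configured out: 0 directly on non-x86, and constant-false on x86 via cpu_feature_enabled()'s disabled-features mask when CONFIG_XEN_PV is unset. A sketch of the effect (pv_only_fixup() is a made-up function, not kernel code):

```
	/*
	 * With CONFIG_XEN_PV disabled, the condition is constant-false,
	 * so the branch, and (if nothing else references it)
	 * pv_only_fixup() itself, are dropped from the object file.
	 */
	if (xen_pv_domain())
		pv_only_fixup();
```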
Reviewed-by: Jason Andryuk Signed-off-by: Juergen Gross Message-ID: <20250826145608.10352-3-jgross@suse.com> --- arch/x86/include/asm/xen/page.h | 14 +++++++------- arch/x86/xen/mmu.c | 2 +- arch/x86/xen/p2m.c | 4 ++-- drivers/xen/balloon.c | 4 ++-- drivers/xen/gntdev.c | 2 +- drivers/xen/grant-table.c | 6 +++--- drivers/xen/privcmd.c | 14 ++++++-------- drivers/xen/unpopulated-alloc.c | 4 ++-- drivers/xen/xenbus/xenbus_client.c | 2 +- include/xen/grant_table.h | 4 ++-- include/xen/mem-reservation.h | 4 ++-- include/xen/xen-ops.h | 7 ++++--- 12 files changed, 33 insertions(+), 34 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 85e63d58c07463..59f642a94b9d90 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -12,9 +12,9 @@ #include #include +#include #include #include -#include /* Xen machine address */ typedef struct xmaddr { @@ -162,7 +162,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) * pfn_to_mfn. This will have to be removed when we figured * out which call. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return pfn; mfn = __pfn_to_mfn(pfn); @@ -175,7 +175,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) static inline int phys_to_machine_mapping_valid(unsigned long pfn) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return 1; return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY; @@ -210,7 +210,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) * gfn_to_pfn. This will have to be removed when we figure * out which call. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return mfn; pfn = mfn_to_pfn_no_overrides(mfn); @@ -242,7 +242,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) /* Pseudo-physical <-> Guest conversion */ static inline unsigned long pfn_to_gfn(unsigned long pfn) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return pfn; else return pfn_to_mfn(pfn); @@ -250,7 +250,7 @@ static inline unsigned long pfn_to_gfn(unsigned long pfn) static inline unsigned long gfn_to_pfn(unsigned long gfn) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return gfn; else return mfn_to_pfn(gfn); @@ -284,7 +284,7 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn) { unsigned long pfn; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return mfn; pfn = mfn_to_pfn(mfn); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c4c479373249be..3be45bf4bc7975 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int nr, struct page **pages) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return xen_xlate_unmap_gfn_range(vma, nr, pages); if (!pages) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 56914e21e30305..2dd12b61a230ad 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -686,7 +686,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, int i, ret = 0; pte_t *pte; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return 0; if (kmap_ops) { @@ -769,7 +769,7 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, { int i, ret = 0; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return 0; for (i = 0; i < count; i++) { 
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 2de37dcd75566f..49c3f992639435 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -302,7 +302,7 @@ static enum bp_state reserve_additional_memory(void) * are not restored since this region is now known not to * conflict with any devices. */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (xen_pv_domain()) { unsigned long pfn, i; pfn = PFN_DOWN(resource->start); @@ -626,7 +626,7 @@ int xen_alloc_ballooned_pages(unsigned int nr_pages, struct page **pages) */ BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (xen_pv_domain()) { ret = xen_alloc_p2m_entry(page_to_pfn(page)); if (ret < 0) goto out_undo; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 1f21607656182a..74491967f2ae21 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1183,7 +1183,7 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); + use_ptemod = xen_pv_domain(); err = misc_register(&gntdev_miscdev); if (err != 0) { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 04a6b470b15dfb..478d2ad725ac6b 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1449,7 +1449,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_feature(XENFEAT_auto_translated_physmap)) { + if (!xen_pv_domain()) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; @@ -1570,7 +1570,7 @@ static int gnttab_setup(void) if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { + if (!xen_pv_domain() && gnttab_shared.addr == NULL) { gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { pr_warn("gnttab share frames is not mapped!\n"); @@ -1588,7 +1588,7 @@ int gnttab_resume(void) int gnttab_suspend(void) { - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) gnttab_interface->unmap_frames(); return 0; } diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 13a10f3294a80d..f52a457b302d9c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -271,7 +271,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) struct mmap_gfn_state state; /* We only support privcmd_ioctl_mmap_batch for non-auto-translated. 
*/ - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return -ENOSYS; if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) @@ -353,7 +353,7 @@ static int mmap_batch_fn(void *data, int nr, void *state) struct page **cur_pages = NULL; int ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) cur_pages = &pages[st->index]; BUG_ON(nr < 0); @@ -535,7 +535,7 @@ static long privcmd_ioctl_mmap_batch( ret = -EINVAL; goto out_unlock; } - if (xen_feature(XENFEAT_auto_translated_physmap)) { + if (!xen_pv_domain()) { ret = alloc_empty_pages(vma, nr_pages); if (ret < 0) goto out_unlock; @@ -779,8 +779,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file, goto out; } - if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && - xen_feature(XENFEAT_auto_translated_physmap)) { + if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) { unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE); struct page **pages; unsigned int i; @@ -811,8 +810,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file, if (rc) goto out; - if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && - xen_feature(XENFEAT_auto_translated_physmap)) { + if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) { rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT); } else { unsigned int domid = @@ -1591,7 +1589,7 @@ static void privcmd_close(struct vm_area_struct *vma) int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT; int rc; - if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) + if (xen_pv_domain() || !numpgs || !pages) return; rc = xen_unmap_domain_gfn_range(vma, numgfns, pages); diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c index a39f2d36dd9cfc..d6fc2aefe2646b 100644 --- a/drivers/xen/unpopulated-alloc.c +++ b/drivers/xen/unpopulated-alloc.c @@ -105,7 +105,7 @@ static int fill_list(unsigned int nr_pages) * are not restored since this region is now known not to * conflict with any devices. 
*/ - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (xen_pv_domain()) { xen_pfn_t pfn = PFN_DOWN(res->start); for (i = 0; i < alloc_pages; i++) { @@ -184,7 +184,7 @@ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages) pages[i] = pg; #ifdef CONFIG_XEN_HAVE_PVMMU - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (xen_pv_domain()) { ret = xen_alloc_p2m_entry(page_to_pfn(pg)); if (ret < 0) { unsigned int j; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index e73ec225d4a61d..2dc874fb550665 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -955,7 +955,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { #ifdef CONFIG_XEN_PV - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) ring_ops = &ring_ops_pv; else #endif diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index e279be353e3f11..69ac6d80a006b7 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -164,7 +164,7 @@ gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr, { if (flags & GNTMAP_contains_pte) map->host_addr = addr; - else if (xen_feature(XENFEAT_auto_translated_physmap)) + else if (!xen_pv_domain()) map->host_addr = __pa(addr); else map->host_addr = addr; @@ -181,7 +181,7 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, { if (flags & GNTMAP_contains_pte) unmap->host_addr = addr; - else if (xen_feature(XENFEAT_auto_translated_physmap)) + else if (!xen_pv_domain()) unmap->host_addr = __pa(addr); else unmap->host_addr = addr; diff --git a/include/xen/mem-reservation.h b/include/xen/mem-reservation.h index a2ab516fcd2caf..3cbe3df0dfd4e9 100644 --- a/include/xen/mem-reservation.h +++ b/include/xen/mem-reservation.h @@ -39,7 +39,7 @@ static inline void xenmem_reservation_va_mapping_update(unsigned long count, xen_pfn_t *frames) { #ifdef CONFIG_XEN_HAVE_PVMMU - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) __xenmem_reservation_va_mapping_update(count, pages, frames); #endif } @@ -48,7 +48,7 @@ static inline void xenmem_reservation_va_mapping_reset(unsigned long count, struct page **pages) { #ifdef CONFIG_XEN_HAVE_PVMMU - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) __xenmem_reservation_va_mapping_reset(count, pages); #endif } diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 9e2a769b0d961a..496e6013c689f9 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -116,7 +117,7 @@ static inline int xen_remap_domain_gfn_array(struct vm_area_struct *vma, unsigned int domid, struct page **pages) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, prot, domid, pages); @@ -150,7 +151,7 @@ static inline int xen_remap_domain_mfn_array(struct vm_area_struct *vma, int nr, int *err_ptr, pgprot_t prot, unsigned int domid) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return -EOPNOTSUPP; return xen_remap_pfn(vma, addr, mfn, nr, err_ptr, prot, domid, @@ -175,7 +176,7 @@ static inline int xen_remap_domain_gfn_range(struct vm_area_struct *vma, pgprot_t prot, unsigned int domid, struct page **pages) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return -EOPNOTSUPP; return 
xen_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false); From 2ea7a5bcc4cfca817b3502b38f97885767730ed8 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 26 Aug 2025 16:56:08 +0200 Subject: [PATCH 1152/3206] drivers/xen/gntdev: use xen_pv_domain() instead of cached value Eliminate the use_ptemod variable by replacing its use cases with xen_pv_domain(). Instead of passing the xen_pv_domain() return value to gntdev_ioctl_dmabuf_exp_from_refs(), use xen_pv_domain() in that function. Reviewed-by: Jason Andryuk Signed-off-by: Juergen Gross Message-ID: <20250826145608.10352-4-jgross@suse.com> --- drivers/xen/gntdev-dmabuf.c | 7 +++---- drivers/xen/gntdev-dmabuf.h | 2 +- drivers/xen/gntdev.c | 33 ++++++++++++++------------------- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/drivers/xen/gntdev-dmabuf.c b/drivers/xen/gntdev-dmabuf.c index 82855105ab857f..550980dd3b0bc4 100644 --- a/drivers/xen/gntdev-dmabuf.c +++ b/drivers/xen/gntdev-dmabuf.c @@ -720,16 +720,15 @@ static void dmabuf_imp_release_all(struct gntdev_dmabuf_priv *priv) /* DMA buffer IOCTL support. */ -long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, int use_ptemod, +long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, struct ioctl_gntdev_dmabuf_exp_from_refs __user *u) { struct ioctl_gntdev_dmabuf_exp_from_refs op; u32 *refs; long ret; - if (use_ptemod) { - pr_debug("Cannot provide dma-buf: use_ptemode %d\n", - use_ptemod); + if (xen_pv_domain()) { + pr_debug("Cannot provide dma-buf in a PV domain\n"); return -EINVAL; } diff --git a/drivers/xen/gntdev-dmabuf.h b/drivers/xen/gntdev-dmabuf.h index 3d9b9cf9d5a16a..9adf96ac74d393 100644 --- a/drivers/xen/gntdev-dmabuf.h +++ b/drivers/xen/gntdev-dmabuf.h @@ -18,7 +18,7 @@ struct gntdev_dmabuf_priv *gntdev_dmabuf_init(struct file *filp); void gntdev_dmabuf_fini(struct gntdev_dmabuf_priv *priv); -long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, int use_ptemod, +long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, struct ioctl_gntdev_dmabuf_exp_from_refs __user *u); long gntdev_ioctl_dmabuf_exp_wait_released(struct gntdev_priv *priv, diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 74491967f2ae21..91ba5078c9d956 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -73,9 +73,6 @@ module_param(limit, uint, 0644); MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by one mapping request"); -/* True in PV mode, false otherwise */ -static int use_ptemod; - static void unmap_grant_pages(struct gntdev_grant_map *map, int offset, int pages); @@ -163,7 +160,7 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count, NULL == add->pages || NULL == add->being_removed) goto err; - if (use_ptemod) { + if (xen_pv_domain()) { add->kmap_ops = kvmalloc_array(count, sizeof(add->kmap_ops[0]), GFP_KERNEL); add->kunmap_ops = kvmalloc_array(count, sizeof(add->kunmap_ops[0]), @@ -211,7 +208,7 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count, add->grants[i].ref = INVALID_GRANT_REF; add->map_ops[i].handle = INVALID_GRANT_HANDLE; add->unmap_ops[i].handle = INVALID_GRANT_HANDLE; - if (use_ptemod) { + if (xen_pv_domain()) { add->kmap_ops[i].handle = INVALID_GRANT_HANDLE; add->kunmap_ops[i].handle = INVALID_GRANT_HANDLE; } @@ -268,7 +265,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map) if (!refcount_dec_and_test(&map->users)) return; - if (map->pages && !use_ptemod) { + if (map->pages && !xen_pv_domain()) { /* * 
Increment the reference count. This ensures that the * subsequent call to unmap_grant_pages() will not wind up @@ -298,7 +295,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map) */ } - if (use_ptemod && map->notifier_init) + if (xen_pv_domain() && map->notifier_init) mmu_interval_notifier_remove(&map->notifier); if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { @@ -334,7 +331,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) size_t alloced = 0; int i, err = 0; - if (!use_ptemod) { + if (!xen_pv_domain()) { /* Note: it could already be mapped */ if (map->map_ops[0].handle != INVALID_GRANT_HANDLE) return 0; @@ -389,7 +386,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) if (map->flags & GNTMAP_device_map) map->unmap_ops[i].dev_bus_addr = map->map_ops[i].dev_bus_addr; - if (use_ptemod) { + if (xen_pv_domain()) { if (map->kmap_ops[i].status == GNTST_okay) { alloced++; map->kunmap_ops[i].handle = map->kmap_ops[i].handle; @@ -421,7 +418,7 @@ static void __unmap_grant_pages_done(int result, map->unmap_ops[offset+i].handle, map->unmap_ops[offset+i].status); map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; - if (use_ptemod) { + if (xen_pv_domain()) { if (map->kunmap_ops[offset + i].status == GNTST_okay && map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE) successful_unmaps++; @@ -464,7 +461,7 @@ static void __unmap_grant_pages(struct gntdev_grant_map *map, int offset, } map->unmap_data.unmap_ops = map->unmap_ops + offset; - map->unmap_data.kunmap_ops = use_ptemod ? map->kunmap_ops + offset : NULL; + map->unmap_data.kunmap_ops = xen_pv_domain() ? map->kunmap_ops + offset : NULL; map->unmap_data.pages = map->pages + offset; map->unmap_data.count = pages; map->unmap_data.done = __unmap_grant_pages_done; @@ -1039,7 +1036,7 @@ static long gntdev_ioctl(struct file *flip, #ifdef CONFIG_XEN_GNTDEV_DMABUF case IOCTL_GNTDEV_DMABUF_EXP_FROM_REFS: - return gntdev_ioctl_dmabuf_exp_from_refs(priv, use_ptemod, ptr); + return gntdev_ioctl_dmabuf_exp_from_refs(priv, ptr); case IOCTL_GNTDEV_DMABUF_EXP_WAIT_RELEASED: return gntdev_ioctl_dmabuf_exp_wait_released(priv, ptr); @@ -1086,7 +1083,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP); - if (use_ptemod) + if (xen_pv_domain()) vm_flags_set(vma, VM_DONTCOPY); vma->vm_private_data = map; @@ -1102,7 +1099,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) map->pages_vm_start = vma->vm_start; - if (use_ptemod) { + if (xen_pv_domain()) { err = mmu_interval_notifier_insert_locked( &map->notifier, vma->vm_mm, vma->vm_start, vma->vm_end - vma->vm_start, &gntdev_mmu_ops); @@ -1113,7 +1110,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) } mutex_unlock(&priv->lock); - if (use_ptemod) { + if (xen_pv_domain()) { /* * gntdev takes the address of the PTE in find_grant_ptes() and * passes it to the hypervisor in gntdev_map_grant_pages(). 
The @@ -1139,7 +1136,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) if (err) goto out_put_map; - if (!use_ptemod) { + if (!xen_pv_domain()) { err = vm_map_pages_zero(vma, map->pages, map->count); if (err) goto out_put_map; @@ -1154,7 +1151,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) out_unlock_put: mutex_unlock(&priv->lock); out_put_map: - if (use_ptemod) + if (xen_pv_domain()) unmap_grant_pages(map, 0, map->count); gntdev_put_map(priv, map); return err; @@ -1183,8 +1180,6 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = xen_pv_domain(); - err = misc_register(&gntdev_miscdev); if (err != 0) { pr_err("Could not register gntdev device\n"); From 440cec4ca1c242d72e309a801995584a55af25c6 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 18 Jul 2025 18:50:58 +0530 Subject: [PATCH 1153/3206] drm/amdgpu: Wait for bootloader after PSPv11 reset Some PSPv11 SOCs take a longer time for PSP based mode-1 reset. Instead of checking for C2PMSG_33 status, add the callback wait_for_bootloader. Waiting for the bootloader to be back to a steady state is already part of the generic mode-1 reset flow. Increase the retry count for the bootloader wait and also fix the mask to prevent a fake pass. Fixes: 8345a71fc54b ("drm/amdgpu: Add more checks to PSP mailbox") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4531 Signed-off-by: Lijo Lazar Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 32f73741d6ee41fd5db8791c1163931e313d0fdc) --- drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index 6cc05d36e3594d..64b240b51f1aa7 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -149,12 +149,12 @@ static int psp_v11_0_wait_for_bootloader(struct psp_context *psp) int ret; int retry_loop; - for (retry_loop = 0; retry_loop < 10; retry_loop++) { + for (retry_loop = 0; retry_loop < 20; retry_loop++) { /* Wait for bootloader to signify that is ready having bit 31 of C2PMSG_35 set to 1 */ ret = psp_wait_for( psp, SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_35), - 0x80000000, 0x80000000, PSP_WAITREG_NOVERBOSE); + 0x80000000, 0x8000FFFF, PSP_WAITREG_NOVERBOSE); if (ret == 0) return 0; @@ -397,18 +397,6 @@ static int psp_v11_0_mode1_reset(struct psp_context *psp) msleep(500); - offset = SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_33); - - ret = psp_wait_for(psp, offset, MBOX_TOS_RESP_FLAG, MBOX_TOS_RESP_MASK, - 0); - - if (ret) { - DRM_INFO("psp mode 1 reset failed!\n"); - return -EINVAL; - } - - DRM_INFO("psp mode1 reset succeed \n"); - return 0; } @@ -665,7 +653,8 @@ static const struct psp_funcs psp_v11_0_funcs = { .ring_get_wptr = psp_v11_0_ring_get_wptr, .ring_set_wptr = psp_v11_0_ring_set_wptr, .load_usbc_pd_fw = psp_v11_0_load_usbc_pd_fw, - .read_usbc_pd_fw = psp_v11_0_read_usbc_pd_fw + .read_usbc_pd_fw = psp_v11_0_read_usbc_pd_fw, + .wait_for_bootloader = psp_v11_0_wait_for_bootloader }; void psp_v11_0_set_psp_funcs(struct psp_context *psp) From 8fd8ea2869cfafb3b1d6f95ff49561b13a73438d Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 19 Aug 2025 13:11:39 -0500 Subject: [PATCH 1154/3206] ipmi:msghandler:Change seq_lock to a mutex Dan Carpenter got a Smatch warning: drivers/char/ipmi/ipmi_msghandler.c:5265 ipmi_free_recv_msg() warn: sleeping in atomic context due to the recent rework of the IPMI 
driver's locking. I didn't realize vfree could block. But there is an easy solution to this, now that almost everything in the message handler runs in thread context. I wanted to spend the time earlier to see if seq_lock could be converted from a spinlock to a mutex, but I wanted the previous changes to go in and soak before I did that. So I went ahead and did the analysis and converting should work. And solve this problem. Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202503240244.LR7pOwyr-lkp@intel.com/ Fixes: 3be997d5a64a ("ipmi:msghandler: Remove srcu from the ipmi user structure") Cc: # 6.16 Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 63 ++++++++++++----------------- 1 file changed, 26 insertions(+), 37 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 8e9050f99e9eff..b78cc359534dff 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -464,7 +464,7 @@ struct ipmi_smi { * interface to match them up with their responses. A routine * is called periodically to time the items in this list. */ - spinlock_t seq_lock; + struct mutex seq_lock; struct seq_table seq_table[IPMI_IPMB_NUM_SEQ]; int curr_seq; @@ -1116,12 +1116,11 @@ static int intf_find_seq(struct ipmi_smi *intf, struct ipmi_recv_msg **recv_msg) { int rv = -ENODEV; - unsigned long flags; if (seq >= IPMI_IPMB_NUM_SEQ) return -EINVAL; - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); if (intf->seq_table[seq].inuse) { struct ipmi_recv_msg *msg = intf->seq_table[seq].recv_msg; @@ -1134,7 +1133,7 @@ static int intf_find_seq(struct ipmi_smi *intf, rv = 0; } } - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); return rv; } @@ -1145,14 +1144,13 @@ static int intf_start_seq_timer(struct ipmi_smi *intf, long msgid) { int rv = -ENODEV; - unsigned long flags; unsigned char seq; unsigned long seqid; GET_SEQ_FROM_MSGID(msgid, seq, seqid); - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); /* * We do this verification because the user can be deleted * while a message is outstanding. @@ -1163,7 +1161,7 @@ static int intf_start_seq_timer(struct ipmi_smi *intf, ent->timeout = ent->orig_timeout; rv = 0; } - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); return rv; } @@ -1174,7 +1172,6 @@ static int intf_err_seq(struct ipmi_smi *intf, unsigned int err) { int rv = -ENODEV; - unsigned long flags; unsigned char seq; unsigned long seqid; struct ipmi_recv_msg *msg = NULL; @@ -1182,7 +1179,7 @@ static int intf_err_seq(struct ipmi_smi *intf, GET_SEQ_FROM_MSGID(msgid, seq, seqid); - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); /* * We do this verification because the user can be deleted * while a message is outstanding. 
@@ -1196,7 +1193,7 @@ static int intf_err_seq(struct ipmi_smi *intf, msg = ent->recv_msg; rv = 0; } - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); if (msg) deliver_err_response(intf, msg, err); @@ -1209,7 +1206,6 @@ int ipmi_create_user(unsigned int if_num, void *handler_data, struct ipmi_user **user) { - unsigned long flags; struct ipmi_user *new_user = NULL; int rv = 0; struct ipmi_smi *intf; @@ -1277,9 +1273,9 @@ int ipmi_create_user(unsigned int if_num, new_user->gets_events = false; mutex_lock(&intf->users_mutex); - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); list_add(&new_user->link, &intf->users); - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); mutex_unlock(&intf->users_mutex); if (handler->ipmi_watchdog_pretimeout) @@ -1325,7 +1321,6 @@ static void _ipmi_destroy_user(struct ipmi_user *user) { struct ipmi_smi *intf = user->intf; int i; - unsigned long flags; struct cmd_rcvr *rcvr; struct cmd_rcvr *rcvrs = NULL; struct ipmi_recv_msg *msg, *msg2; @@ -1346,7 +1341,7 @@ static void _ipmi_destroy_user(struct ipmi_user *user) list_del(&user->link); atomic_dec(&intf->nr_users); - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); for (i = 0; i < IPMI_IPMB_NUM_SEQ; i++) { if (intf->seq_table[i].inuse && (intf->seq_table[i].recv_msg->user == user)) { @@ -1355,7 +1350,7 @@ static void _ipmi_destroy_user(struct ipmi_user *user) ipmi_free_recv_msg(intf->seq_table[i].recv_msg); } } - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); /* * Remove the user from the command receiver's table. First @@ -2026,10 +2021,7 @@ static int i_ipmi_req_ipmb(struct ipmi_smi *intf, */ smi_msg->user_data = recv_msg; } else { - /* It's a command, so get a sequence for it. */ - unsigned long flags; - - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); if (is_maintenance_mode_cmd(msg)) intf->ipmb_maintenance_mode_timeout = @@ -2087,7 +2079,7 @@ static int i_ipmi_req_ipmb(struct ipmi_smi *intf, * to be correct. */ out_err: - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); } return rv; @@ -2205,10 +2197,7 @@ static int i_ipmi_req_lan(struct ipmi_smi *intf, */ smi_msg->user_data = recv_msg; } else { - /* It's a command, so get a sequence for it. */ - unsigned long flags; - - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); /* * Create a sequence number with a 1 second @@ -2257,7 +2246,7 @@ static int i_ipmi_req_lan(struct ipmi_smi *intf, * to be correct. */ out_err: - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); } return rv; @@ -3575,7 +3564,7 @@ int ipmi_add_smi(struct module *owner, atomic_set(&intf->nr_users, 0); intf->handlers = handlers; intf->send_info = send_info; - spin_lock_init(&intf->seq_lock); + mutex_init(&intf->seq_lock); for (j = 0; j < IPMI_IPMB_NUM_SEQ; j++) { intf->seq_table[j].inuse = 0; intf->seq_table[j].seqid = 0; @@ -4529,9 +4518,10 @@ static int handle_one_recv_msg(struct ipmi_smi *intf, if (msg->rsp_size < 2) { /* Message is too small to be correct. 
*/ - dev_warn(intf->si_dev, - "BMC returned too small a message for netfn %x cmd %x, got %d bytes\n", - (msg->data[0] >> 2) | 1, msg->data[1], msg->rsp_size); + dev_warn_ratelimited(intf->si_dev, + "BMC returned too small a message for netfn %x cmd %x, got %d bytes\n", + (msg->data[0] >> 2) | 1, + msg->data[1], msg->rsp_size); return_unspecified: /* Generate an error response for the message. */ @@ -4951,8 +4941,7 @@ smi_from_recv_msg(struct ipmi_smi *intf, struct ipmi_recv_msg *recv_msg, static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent, struct list_head *timeouts, unsigned long timeout_period, - int slot, unsigned long *flags, - bool *need_timer) + int slot, bool *need_timer) { struct ipmi_recv_msg *msg; @@ -5004,7 +4993,7 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent, return; } - spin_unlock_irqrestore(&intf->seq_lock, *flags); + mutex_unlock(&intf->seq_lock); /* * Send the new message. We send with a zero @@ -5025,7 +5014,7 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent, } else ipmi_free_smi_msg(smi_msg); - spin_lock_irqsave(&intf->seq_lock, *flags); + mutex_lock(&intf->seq_lock); } } @@ -5052,7 +5041,7 @@ static bool ipmi_timeout_handler(struct ipmi_smi *intf, * list. */ INIT_LIST_HEAD(&timeouts); - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); if (intf->ipmb_maintenance_mode_timeout) { if (intf->ipmb_maintenance_mode_timeout <= timeout_period) intf->ipmb_maintenance_mode_timeout = 0; @@ -5062,8 +5051,8 @@ static bool ipmi_timeout_handler(struct ipmi_smi *intf, for (i = 0; i < IPMI_IPMB_NUM_SEQ; i++) check_msg_timeout(intf, &intf->seq_table[i], &timeouts, timeout_period, i, - &flags, &need_timer); - spin_unlock_irqrestore(&intf->seq_lock, flags); + &need_timer); + mutex_unlock(&intf->seq_lock); list_for_each_entry_safe(msg, msg2, &timeouts, link) deliver_err_response(intf, msg, IPMI_TIMEOUT_COMPLETION_CODE); From 5d09ee1bec870263f4ace439402ea840503b503b Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 11:03:13 -0500 Subject: [PATCH 1155/3206] Revert "ipmi: fix msg stack when IPMI is disconnected" This reverts commit c608966f3f9c2dca596967501d00753282b395fc. This patch has a subtle bug that can cause the IPMI driver to go into an infinite loop if the BMC misbehaves in a certain way. Apparently certain BMCs do misbehave this way because several reports have come in recently about this. 
Signed-off-by: Corey Minyard Tested-by: Eric Hagberg Cc: # 6.2 --- drivers/char/ipmi/ipmi_kcs_sm.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c index ecfcb50302f6ce..efda90dcf5b3d0 100644 --- a/drivers/char/ipmi/ipmi_kcs_sm.c +++ b/drivers/char/ipmi/ipmi_kcs_sm.c @@ -122,10 +122,10 @@ struct si_sm_data { unsigned long error0_timeout; }; -static unsigned int init_kcs_data_with_state(struct si_sm_data *kcs, - struct si_sm_io *io, enum kcs_states state) +static unsigned int init_kcs_data(struct si_sm_data *kcs, + struct si_sm_io *io) { - kcs->state = state; + kcs->state = KCS_IDLE; kcs->io = io; kcs->write_pos = 0; kcs->write_count = 0; @@ -140,12 +140,6 @@ static unsigned int init_kcs_data_with_state(struct si_sm_data *kcs, return 2; } -static unsigned int init_kcs_data(struct si_sm_data *kcs, - struct si_sm_io *io) -{ - return init_kcs_data_with_state(kcs, io, KCS_IDLE); -} - static inline unsigned char read_status(struct si_sm_data *kcs) { return kcs->io->inputb(kcs->io, 1); @@ -276,7 +270,7 @@ static int start_kcs_transaction(struct si_sm_data *kcs, unsigned char *data, if (size > MAX_KCS_WRITE_SIZE) return IPMI_REQ_LEN_EXCEEDED_ERR; - if (kcs->state != KCS_IDLE) { + if ((kcs->state != KCS_IDLE) && (kcs->state != KCS_HOSED)) { dev_warn(kcs->io->dev, "KCS in invalid state %d\n", kcs->state); return IPMI_NOT_IN_MY_STATE_ERR; } @@ -501,7 +495,7 @@ static enum si_sm_result kcs_event(struct si_sm_data *kcs, long time) } if (kcs->state == KCS_HOSED) { - init_kcs_data_with_state(kcs, kcs->io, KCS_ERROR0); + init_kcs_data(kcs, kcs->io); return SI_SM_HOSED; } From 14a41628c470f4aa069075cdcf6ec0138b6cf1da Mon Sep 17 00:00:00 2001 From: "Nikola Z. Ivanov" Date: Wed, 27 Aug 2025 00:49:13 +0300 Subject: [PATCH 1156/3206] selftests/arm64: Fix grammatical error in string literals Fix grammatical error in string-literal constructs related to memory allocation checks. In essence, change "Failed to allocated" to "Failed to allocate". Signed-off-by: Nikola Z. 
Ivanov Reviewed-by: Mark Brown Reviewed-by: Bagas Sanjaya Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/fp-stress.c | 2 +- tools/testing/selftests/arm64/fp/kernel-test.c | 4 ++-- tools/testing/selftests/arm64/gcs/gcs-stress.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index 3a0ae96cf9096a..9349aa630c8419 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -549,7 +549,7 @@ int main(int argc, char **argv) evs = calloc(tests, sizeof(*evs)); if (!evs) - ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + ksft_exit_fail_msg("Failed to allocate %d epoll events\n", tests); for (i = 0; i < cpus; i++) { diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c index e3cec3723ffa96..0c40007d128214 100644 --- a/tools/testing/selftests/arm64/fp/kernel-test.c +++ b/tools/testing/selftests/arm64/fp/kernel-test.c @@ -188,13 +188,13 @@ static bool create_socket(void) ref = malloc(digest_len); if (!ref) { - printf("Failed to allocated %d byte reference\n", digest_len); + printf("Failed to allocate %d byte reference\n", digest_len); return false; } digest = malloc(digest_len); if (!digest) { - printf("Failed to allocated %d byte digest\n", digest_len); + printf("Failed to allocate %d byte digest\n", digest_len); return false; } diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c index bbc7f4950c13ed..cf316d78ea97c7 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-stress.c +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -433,7 +433,7 @@ int main(int argc, char **argv) evs = calloc(tests, sizeof(*evs)); if (!evs) - ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + ksft_exit_fail_msg("Failed to allocate %d epoll events\n", tests); for (i = 0; i < gcs_threads; i++) From b52da4054ee0bf9ecb44996f2c83236ff50b3812 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 5 Sep 2025 11:33:39 -0500 Subject: [PATCH 1157/3206] ipmi: Rework user message limit handling The limit on the number of user messages had a number of issues: improper counting in some cases and a use-after-free. Restructure how this is all done to handle more in the receive message allocation routine, so all refcounting and user message limit counts are done in that routine. It's a lot cleaner and safer. 
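The shape this takes, sketched below under those assumptions (not the patch's exact code; the real callers are visible in the diff that follows): the receive-message allocator takes the destination user and becomes the single place that enforces the per-user message limit and takes the user reference.

```
/* Hypothetical sketch of the reworked allocator, for illustration. */
static struct ipmi_recv_msg *ipmi_alloc_recv_msg(struct ipmi_user *user)
{
	struct ipmi_recv_msg *rv;

	if (user && atomic_add_return(1, &user->nr_msgs) > max_msgs_per_user) {
		atomic_dec(&user->nr_msgs);
		return ERR_PTR(-EBUSY);
	}

	rv = kmalloc(sizeof(*rv), GFP_KERNEL);
	if (!rv) {
		if (user)
			atomic_dec(&user->nr_msgs);
		return ERR_PTR(-ENOMEM);
	}

	rv->user = user;
	if (user)
		kref_get(&user->refcount);	/* put when the message is freed */
	return rv;
}
```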
Reported-by: Gilles BULOZ Closes: https://lore.kernel.org/lkml/aLsw6G0GyqfpKs2S@mail.minyard.net/ Fixes: 8e76741c3d8b ("ipmi: Add a limit on the number of users that may use IPMI") Cc: # 4.19 Signed-off-by: Corey Minyard Tested-by: Gilles BULOZ --- drivers/char/ipmi/ipmi_msghandler.c | 420 +++++++++++++--------------- 1 file changed, 200 insertions(+), 220 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index b78cc359534dff..d2fbf2203bd21f 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -38,7 +38,9 @@ #define IPMI_DRIVER_VERSION "39.2" -static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void); +static struct ipmi_recv_msg *ipmi_alloc_recv_msg(struct ipmi_user *user); +static void ipmi_set_recv_msg_user(struct ipmi_recv_msg *msg, + struct ipmi_user *user); static int ipmi_init_msghandler(void); static void smi_work(struct work_struct *t); static void handle_new_recv_msgs(struct ipmi_smi *intf); @@ -955,7 +957,6 @@ static int deliver_response(struct ipmi_smi *intf, struct ipmi_recv_msg *msg) * risk. At this moment, simply skip it in that case. */ ipmi_free_recv_msg(msg); - atomic_dec(&msg->user->nr_msgs); } else { /* * Deliver it in smi_work. The message will hold a @@ -1611,8 +1612,7 @@ int ipmi_set_gets_events(struct ipmi_user *user, bool val) } list_for_each_entry_safe(msg, msg2, &msgs, link) { - msg->user = user; - kref_get(&user->refcount); + ipmi_set_recv_msg_user(msg, user); deliver_local_response(intf, msg); } } @@ -2277,22 +2277,15 @@ static int i_ipmi_request(struct ipmi_user *user, int run_to_completion = READ_ONCE(intf->run_to_completion); int rv = 0; - if (user) { - if (atomic_add_return(1, &user->nr_msgs) > max_msgs_per_user) { - /* Decrement will happen at the end of the routine. */ - rv = -EBUSY; - goto out; - } - } - - if (supplied_recv) + if (supplied_recv) { recv_msg = supplied_recv; - else { - recv_msg = ipmi_alloc_recv_msg(); - if (recv_msg == NULL) { - rv = -ENOMEM; - goto out; - } + recv_msg->user = user; + if (user) + atomic_inc(&user->nr_msgs); + } else { + recv_msg = ipmi_alloc_recv_msg(user); + if (IS_ERR(recv_msg)) + return PTR_ERR(recv_msg); } recv_msg->user_msg_data = user_msg_data; @@ -2303,8 +2296,7 @@ static int i_ipmi_request(struct ipmi_user *user, if (smi_msg == NULL) { if (!supplied_recv) ipmi_free_recv_msg(recv_msg); - rv = -ENOMEM; - goto out; + return -ENOMEM; } } @@ -2315,10 +2307,6 @@ static int i_ipmi_request(struct ipmi_user *user, goto out_err; } - recv_msg->user = user; - if (user) - /* The put happens when the message is freed. 
*/ - kref_get(&user->refcount); recv_msg->msgid = msgid; /* * Store the message to send in the receive message so timeout @@ -2347,8 +2335,10 @@ static int i_ipmi_request(struct ipmi_user *user, if (rv) { out_err: - ipmi_free_smi_msg(smi_msg); - ipmi_free_recv_msg(recv_msg); + if (!supplied_smi) + ipmi_free_smi_msg(smi_msg); + if (!supplied_recv) + ipmi_free_recv_msg(recv_msg); } else { dev_dbg(intf->si_dev, "Send: %*ph\n", smi_msg->data_size, smi_msg->data); @@ -2358,9 +2348,6 @@ static int i_ipmi_request(struct ipmi_user *user, if (!run_to_completion) mutex_unlock(&intf->users_mutex); -out: - if (rv && user) - atomic_dec(&user->nr_msgs); return rv; } @@ -3851,7 +3838,7 @@ static int handle_ipmb_get_msg_cmd(struct ipmi_smi *intf, unsigned char chan; struct ipmi_user *user = NULL; struct ipmi_ipmb_addr *ipmb_addr; - struct ipmi_recv_msg *recv_msg; + struct ipmi_recv_msg *recv_msg = NULL; if (msg->rsp_size < 10) { /* Message not big enough, just ignore it. */ @@ -3872,9 +3859,8 @@ static int handle_ipmb_get_msg_cmd(struct ipmi_smi *intf, rcvr = find_cmd_rcvr(intf, netfn, cmd, chan); if (rcvr) { user = rcvr->user; - kref_get(&user->refcount); - } else - user = NULL; + recv_msg = ipmi_alloc_recv_msg(user); + } rcu_read_unlock(); if (user == NULL) { @@ -3904,47 +3890,41 @@ static int handle_ipmb_get_msg_cmd(struct ipmi_smi *intf, * causes it to not be freed or queued. */ rv = -1; - } else { - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { - /* - * We couldn't allocate memory for the - * message, so requeue it for handling - * later. - */ - rv = 1; - kref_put(&user->refcount, free_ipmi_user); - } else { - /* Extract the source address from the data. */ - ipmb_addr = (struct ipmi_ipmb_addr *) &recv_msg->addr; - ipmb_addr->addr_type = IPMI_IPMB_ADDR_TYPE; - ipmb_addr->slave_addr = msg->rsp[6]; - ipmb_addr->lun = msg->rsp[7] & 3; - ipmb_addr->channel = msg->rsp[3] & 0xf; + } else if (!IS_ERR(recv_msg)) { + /* Extract the source address from the data. */ + ipmb_addr = (struct ipmi_ipmb_addr *) &recv_msg->addr; + ipmb_addr->addr_type = IPMI_IPMB_ADDR_TYPE; + ipmb_addr->slave_addr = msg->rsp[6]; + ipmb_addr->lun = msg->rsp[7] & 3; + ipmb_addr->channel = msg->rsp[3] & 0xf; - /* - * Extract the rest of the message information - * from the IPMB header. - */ - recv_msg->user = user; - recv_msg->recv_type = IPMI_CMD_RECV_TYPE; - recv_msg->msgid = msg->rsp[7] >> 2; - recv_msg->msg.netfn = msg->rsp[4] >> 2; - recv_msg->msg.cmd = msg->rsp[8]; - recv_msg->msg.data = recv_msg->msg_data; + /* + * Extract the rest of the message information + * from the IPMB header. + */ + recv_msg->recv_type = IPMI_CMD_RECV_TYPE; + recv_msg->msgid = msg->rsp[7] >> 2; + recv_msg->msg.netfn = msg->rsp[4] >> 2; + recv_msg->msg.cmd = msg->rsp[8]; + recv_msg->msg.data = recv_msg->msg_data; - /* - * We chop off 10, not 9 bytes because the checksum - * at the end also needs to be removed. - */ - recv_msg->msg.data_len = msg->rsp_size - 10; - memcpy(recv_msg->msg_data, &msg->rsp[9], - msg->rsp_size - 10); - if (deliver_response(intf, recv_msg)) - ipmi_inc_stat(intf, unhandled_commands); - else - ipmi_inc_stat(intf, handled_commands); - } + /* + * We chop off 10, not 9 bytes because the checksum + * at the end also needs to be removed. 
+ */ + recv_msg->msg.data_len = msg->rsp_size - 10; + memcpy(recv_msg->msg_data, &msg->rsp[9], + msg->rsp_size - 10); + if (deliver_response(intf, recv_msg)) + ipmi_inc_stat(intf, unhandled_commands); + else + ipmi_inc_stat(intf, handled_commands); + } else { + /* + * We couldn't allocate memory for the message, so + * requeue it for handling later. + */ + rv = 1; } return rv; @@ -3957,7 +3937,7 @@ static int handle_ipmb_direct_rcv_cmd(struct ipmi_smi *intf, int rv = 0; struct ipmi_user *user = NULL; struct ipmi_ipmb_direct_addr *daddr; - struct ipmi_recv_msg *recv_msg; + struct ipmi_recv_msg *recv_msg = NULL; unsigned char netfn = msg->rsp[0] >> 2; unsigned char cmd = msg->rsp[3]; @@ -3966,9 +3946,8 @@ static int handle_ipmb_direct_rcv_cmd(struct ipmi_smi *intf, rcvr = find_cmd_rcvr(intf, netfn, cmd, 0); if (rcvr) { user = rcvr->user; - kref_get(&user->refcount); - } else - user = NULL; + recv_msg = ipmi_alloc_recv_msg(user); + } rcu_read_unlock(); if (user == NULL) { @@ -3990,44 +3969,38 @@ static int handle_ipmb_direct_rcv_cmd(struct ipmi_smi *intf, * causes it to not be freed or queued. */ rv = -1; - } else { - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { - /* - * We couldn't allocate memory for the - * message, so requeue it for handling - * later. - */ - rv = 1; - kref_put(&user->refcount, free_ipmi_user); - } else { - /* Extract the source address from the data. */ - daddr = (struct ipmi_ipmb_direct_addr *)&recv_msg->addr; - daddr->addr_type = IPMI_IPMB_DIRECT_ADDR_TYPE; - daddr->channel = 0; - daddr->slave_addr = msg->rsp[1]; - daddr->rs_lun = msg->rsp[0] & 3; - daddr->rq_lun = msg->rsp[2] & 3; + } else if (!IS_ERR(recv_msg)) { + /* Extract the source address from the data. */ + daddr = (struct ipmi_ipmb_direct_addr *)&recv_msg->addr; + daddr->addr_type = IPMI_IPMB_DIRECT_ADDR_TYPE; + daddr->channel = 0; + daddr->slave_addr = msg->rsp[1]; + daddr->rs_lun = msg->rsp[0] & 3; + daddr->rq_lun = msg->rsp[2] & 3; - /* - * Extract the rest of the message information - * from the IPMB header. - */ - recv_msg->user = user; - recv_msg->recv_type = IPMI_CMD_RECV_TYPE; - recv_msg->msgid = (msg->rsp[2] >> 2); - recv_msg->msg.netfn = msg->rsp[0] >> 2; - recv_msg->msg.cmd = msg->rsp[3]; - recv_msg->msg.data = recv_msg->msg_data; - - recv_msg->msg.data_len = msg->rsp_size - 4; - memcpy(recv_msg->msg_data, msg->rsp + 4, - msg->rsp_size - 4); - if (deliver_response(intf, recv_msg)) - ipmi_inc_stat(intf, unhandled_commands); - else - ipmi_inc_stat(intf, handled_commands); - } + /* + * Extract the rest of the message information + * from the IPMB header. + */ + recv_msg->recv_type = IPMI_CMD_RECV_TYPE; + recv_msg->msgid = (msg->rsp[2] >> 2); + recv_msg->msg.netfn = msg->rsp[0] >> 2; + recv_msg->msg.cmd = msg->rsp[3]; + recv_msg->msg.data = recv_msg->msg_data; + + recv_msg->msg.data_len = msg->rsp_size - 4; + memcpy(recv_msg->msg_data, msg->rsp + 4, + msg->rsp_size - 4); + if (deliver_response(intf, recv_msg)) + ipmi_inc_stat(intf, unhandled_commands); + else + ipmi_inc_stat(intf, handled_commands); + } else { + /* + * We couldn't allocate memory for the message, so + * requeue it for handling later. + */ + rv = 1; } return rv; @@ -4141,7 +4114,7 @@ static int handle_lan_get_msg_cmd(struct ipmi_smi *intf, unsigned char chan; struct ipmi_user *user = NULL; struct ipmi_lan_addr *lan_addr; - struct ipmi_recv_msg *recv_msg; + struct ipmi_recv_msg *recv_msg = NULL; if (msg->rsp_size < 12) { /* Message not big enough, just ignore it. 
*/ @@ -4162,9 +4135,8 @@ static int handle_lan_get_msg_cmd(struct ipmi_smi *intf, rcvr = find_cmd_rcvr(intf, netfn, cmd, chan); if (rcvr) { user = rcvr->user; - kref_get(&user->refcount); - } else - user = NULL; + recv_msg = ipmi_alloc_recv_msg(user); + } rcu_read_unlock(); if (user == NULL) { @@ -4195,49 +4167,44 @@ static int handle_lan_get_msg_cmd(struct ipmi_smi *intf, * causes it to not be freed or queued. */ rv = -1; - } else { - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { - /* - * We couldn't allocate memory for the - * message, so requeue it for handling later. - */ - rv = 1; - kref_put(&user->refcount, free_ipmi_user); - } else { - /* Extract the source address from the data. */ - lan_addr = (struct ipmi_lan_addr *) &recv_msg->addr; - lan_addr->addr_type = IPMI_LAN_ADDR_TYPE; - lan_addr->session_handle = msg->rsp[4]; - lan_addr->remote_SWID = msg->rsp[8]; - lan_addr->local_SWID = msg->rsp[5]; - lan_addr->lun = msg->rsp[9] & 3; - lan_addr->channel = msg->rsp[3] & 0xf; - lan_addr->privilege = msg->rsp[3] >> 4; + } else if (!IS_ERR(recv_msg)) { + /* Extract the source address from the data. */ + lan_addr = (struct ipmi_lan_addr *) &recv_msg->addr; + lan_addr->addr_type = IPMI_LAN_ADDR_TYPE; + lan_addr->session_handle = msg->rsp[4]; + lan_addr->remote_SWID = msg->rsp[8]; + lan_addr->local_SWID = msg->rsp[5]; + lan_addr->lun = msg->rsp[9] & 3; + lan_addr->channel = msg->rsp[3] & 0xf; + lan_addr->privilege = msg->rsp[3] >> 4; - /* - * Extract the rest of the message information - * from the IPMB header. - */ - recv_msg->user = user; - recv_msg->recv_type = IPMI_CMD_RECV_TYPE; - recv_msg->msgid = msg->rsp[9] >> 2; - recv_msg->msg.netfn = msg->rsp[6] >> 2; - recv_msg->msg.cmd = msg->rsp[10]; - recv_msg->msg.data = recv_msg->msg_data; + /* + * Extract the rest of the message information + * from the IPMB header. + */ + recv_msg->recv_type = IPMI_CMD_RECV_TYPE; + recv_msg->msgid = msg->rsp[9] >> 2; + recv_msg->msg.netfn = msg->rsp[6] >> 2; + recv_msg->msg.cmd = msg->rsp[10]; + recv_msg->msg.data = recv_msg->msg_data; - /* - * We chop off 12, not 11 bytes because the checksum - * at the end also needs to be removed. - */ - recv_msg->msg.data_len = msg->rsp_size - 12; - memcpy(recv_msg->msg_data, &msg->rsp[11], - msg->rsp_size - 12); - if (deliver_response(intf, recv_msg)) - ipmi_inc_stat(intf, unhandled_commands); - else - ipmi_inc_stat(intf, handled_commands); - } + /* + * We chop off 12, not 11 bytes because the checksum + * at the end also needs to be removed. + */ + recv_msg->msg.data_len = msg->rsp_size - 12; + memcpy(recv_msg->msg_data, &msg->rsp[11], + msg->rsp_size - 12); + if (deliver_response(intf, recv_msg)) + ipmi_inc_stat(intf, unhandled_commands); + else + ipmi_inc_stat(intf, handled_commands); + } else { + /* + * We couldn't allocate memory for the message, so + * requeue it for handling later. 
+ */ + rv = 1; } return rv; @@ -4259,7 +4226,7 @@ static int handle_oem_get_msg_cmd(struct ipmi_smi *intf, unsigned char chan; struct ipmi_user *user = NULL; struct ipmi_system_interface_addr *smi_addr; - struct ipmi_recv_msg *recv_msg; + struct ipmi_recv_msg *recv_msg = NULL; /* * We expect the OEM SW to perform error checking @@ -4288,9 +4255,8 @@ static int handle_oem_get_msg_cmd(struct ipmi_smi *intf, rcvr = find_cmd_rcvr(intf, netfn, cmd, chan); if (rcvr) { user = rcvr->user; - kref_get(&user->refcount); - } else - user = NULL; + recv_msg = ipmi_alloc_recv_msg(user); + } rcu_read_unlock(); if (user == NULL) { @@ -4303,48 +4269,42 @@ static int handle_oem_get_msg_cmd(struct ipmi_smi *intf, */ rv = 0; - } else { - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { - /* - * We couldn't allocate memory for the - * message, so requeue it for handling - * later. - */ - rv = 1; - kref_put(&user->refcount, free_ipmi_user); - } else { - /* - * OEM Messages are expected to be delivered via - * the system interface to SMS software. We might - * need to visit this again depending on OEM - * requirements - */ - smi_addr = ((struct ipmi_system_interface_addr *) - &recv_msg->addr); - smi_addr->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; - smi_addr->channel = IPMI_BMC_CHANNEL; - smi_addr->lun = msg->rsp[0] & 3; - - recv_msg->user = user; - recv_msg->user_msg_data = NULL; - recv_msg->recv_type = IPMI_OEM_RECV_TYPE; - recv_msg->msg.netfn = msg->rsp[0] >> 2; - recv_msg->msg.cmd = msg->rsp[1]; - recv_msg->msg.data = recv_msg->msg_data; + } else if (!IS_ERR(recv_msg)) { + /* + * OEM Messages are expected to be delivered via + * the system interface to SMS software. We might + * need to visit this again depending on OEM + * requirements + */ + smi_addr = ((struct ipmi_system_interface_addr *) + &recv_msg->addr); + smi_addr->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; + smi_addr->channel = IPMI_BMC_CHANNEL; + smi_addr->lun = msg->rsp[0] & 3; + + recv_msg->user_msg_data = NULL; + recv_msg->recv_type = IPMI_OEM_RECV_TYPE; + recv_msg->msg.netfn = msg->rsp[0] >> 2; + recv_msg->msg.cmd = msg->rsp[1]; + recv_msg->msg.data = recv_msg->msg_data; - /* - * The message starts at byte 4 which follows the - * Channel Byte in the "GET MESSAGE" command - */ - recv_msg->msg.data_len = msg->rsp_size - 4; - memcpy(recv_msg->msg_data, &msg->rsp[4], - msg->rsp_size - 4); - if (deliver_response(intf, recv_msg)) - ipmi_inc_stat(intf, unhandled_commands); - else - ipmi_inc_stat(intf, handled_commands); - } + /* + * The message starts at byte 4 which follows the + * Channel Byte in the "GET MESSAGE" command + */ + recv_msg->msg.data_len = msg->rsp_size - 4; + memcpy(recv_msg->msg_data, &msg->rsp[4], + msg->rsp_size - 4); + if (deliver_response(intf, recv_msg)) + ipmi_inc_stat(intf, unhandled_commands); + else + ipmi_inc_stat(intf, handled_commands); + } else { + /* + * We couldn't allocate memory for the message, so + * requeue it for handling later. 
+ */ + rv = 1; } return rv; @@ -4402,8 +4362,8 @@ static int handle_read_event_rsp(struct ipmi_smi *intf, if (!user->gets_events) continue; - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { + recv_msg = ipmi_alloc_recv_msg(user); + if (IS_ERR(recv_msg)) { mutex_unlock(&intf->users_mutex); list_for_each_entry_safe(recv_msg, recv_msg2, &msgs, link) { @@ -4424,8 +4384,6 @@ static int handle_read_event_rsp(struct ipmi_smi *intf, deliver_count++; copy_event_into_recv_msg(recv_msg, msg); - recv_msg->user = user; - kref_get(&user->refcount); list_add_tail(&recv_msg->link, &msgs); } mutex_unlock(&intf->users_mutex); @@ -4441,8 +4399,8 @@ static int handle_read_event_rsp(struct ipmi_smi *intf, * No one to receive the message, put it in queue if there's * not already too many things in the queue. */ - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { + recv_msg = ipmi_alloc_recv_msg(NULL); + if (IS_ERR(recv_msg)) { /* * We couldn't allocate memory for the * message, so requeue it for handling @@ -4858,12 +4816,10 @@ static void smi_work(struct work_struct *t) list_del(&msg->link); - if (refcount_read(&user->destroyed) == 0) { + if (refcount_read(&user->destroyed) == 0) ipmi_free_recv_msg(msg); - } else { - atomic_dec(&user->nr_msgs); + else user->handler->ipmi_recv_hndl(msg, user->handler_data); - } } mutex_unlock(&intf->user_msgs_mutex); @@ -5179,27 +5135,51 @@ static void free_recv_msg(struct ipmi_recv_msg *msg) kfree(msg); } -static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void) +static struct ipmi_recv_msg *ipmi_alloc_recv_msg(struct ipmi_user *user) { struct ipmi_recv_msg *rv; + if (user) { + if (atomic_add_return(1, &user->nr_msgs) > max_msgs_per_user) { + atomic_dec(&user->nr_msgs); + return ERR_PTR(-EBUSY); + } + } + rv = kmalloc(sizeof(struct ipmi_recv_msg), GFP_ATOMIC); - if (rv) { - rv->user = NULL; - rv->done = free_recv_msg; - atomic_inc(&recv_msg_inuse_count); + if (!rv) { + if (user) + atomic_dec(&user->nr_msgs); + return ERR_PTR(-ENOMEM); } + + rv->user = user; + rv->done = free_recv_msg; + if (user) + kref_get(&user->refcount); + atomic_inc(&recv_msg_inuse_count); return rv; } void ipmi_free_recv_msg(struct ipmi_recv_msg *msg) { - if (msg->user && !oops_in_progress) + if (msg->user && !oops_in_progress) { + atomic_dec(&msg->user->nr_msgs); kref_put(&msg->user->refcount, free_ipmi_user); + } msg->done(msg); } EXPORT_SYMBOL(ipmi_free_recv_msg); +static void ipmi_set_recv_msg_user(struct ipmi_recv_msg *msg, + struct ipmi_user *user) +{ + WARN_ON_ONCE(msg->user); /* User should not be set. */ + msg->user = user; + atomic_inc(&user->nr_msgs); + kref_get(&user->refcount); +} + static atomic_t panic_done_count = ATOMIC_INIT(0); static void dummy_smi_done_handler(struct ipmi_smi_msg *msg) From 121f4a7e474393502cd71d63f71df7997cbc2f90 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 7 Aug 2025 08:28:51 -0500 Subject: [PATCH 1158/3206] dt-bindings: ipmi: aspeed,ast2400-kcs-bmc: Add missing "clocks" property The ASpeed kcs-bmc nodes have a "clocks" property which isn't documented. It looks like all the LPC child devices have the same clock source and some of the drivers manage their clock. Perhaps it is the parent device that should have the clock, but it's too late for that. 
Signed-off-by: Rob Herring (Arm) Message-ID: <20250807132852.3291305-1-robh@kernel.org> Signed-off-by: Corey Minyard --- .../devicetree/bindings/ipmi/aspeed,ast2400-kcs-bmc.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-kcs-bmc.yaml b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-kcs-bmc.yaml index 129e32c4c77411..610c7986320897 100644 --- a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-kcs-bmc.yaml +++ b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-kcs-bmc.yaml @@ -40,6 +40,9 @@ properties: - description: ODR register - description: STR register + clocks: + maxItems: 1 + aspeed,lpc-io-reg: $ref: /schemas/types.yaml#/definitions/uint32-array minItems: 1 From 80c4e1948908f725564997c78d10110d9b21a015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 09:56:45 +0200 Subject: [PATCH 1159/3206] arm64: vdso32: Stop suppressing warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These warnings don't seem to trigger anymore, probably due to the introduction of the vdso/ header namespace. Nowadays these suppressions only hide real problems. Re-enable the warnings. Signed-off-by: Thomas Weißschuh Signed-off-by: Will Deacon --- arch/arm64/kernel/vdso32/Makefile | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index f2dfdc7dc8185b..fd80123bc8e620 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -21,8 +21,6 @@ endif cc32-option = $(call try-run,\ $(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) -cc32-disable-warning = $(call try-run,\ - $(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) # We cannot use the global flags to compile the vDSO files, the main reason # being that the 32-bit compiler may be older than the main (64-bit) compiler @@ -74,16 +72,6 @@ VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) VDSO_CFLAGS += -Werror=date-time VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) -# The 32-bit compiler does not provide 128-bit integers, which are used in -# some headers that are indirectly included from the vDSO code. -# This hack makes the compiler happy and should trigger a warning/error if -# variables of such type are referenced. -VDSO_CFLAGS += -D__uint128_t='void*' -# Silence some warnings coming from headers that operate on long's -# (on GCC 4.8 or older, there is unfortunately no way to silence this warning) -VDSO_CFLAGS += $(call cc32-disable-warning,shift-count-overflow) -VDSO_CFLAGS += -Wno-int-to-pointer-cast # Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is # unreliable. ifeq ($(CONFIG_THUMB2_COMPAT_VDSO), y) From e5feb030d919f296257d628bae57d0494151dc24 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 7 Aug 2025 15:54:20 -0500 Subject: [PATCH 1160/3206] ipmi: Differentiate between reset and firmware update in maintenance This allows later changes to have different behaviour during a reset versus a firmware update.
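The shape of the change is a tri-state replacing the old boolean, with only the firmware state propagated to the lower-level driver. A minimal sketch, with the enum mirroring the defines added in the diff below (the helper is a simplified stand-in for maintenance_mode_update(), not the driver's code):

/* Tri-state replacing the old maintenance_mode_enable flag. */
enum maint_state {
	MAINT_STATE_OFF = 0,
	MAINT_STATE_FIRMWARE = 1,	/* firmware update in progress */
	MAINT_STATE_RESET = 2,		/* cold/warm reset in progress */
};

/* Lower-level drivers only care about firmware mode, since that is
 * what affects their timing; reset instead disables all commands for
 * a while at the message-handler level. */
static int lower_layer_maintenance_flag(enum maint_state state)
{
	return state == MAINT_STATE_FIRMWARE;
}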
Signed-off-by: Corey Minyard Tested-by: Frederick Lawler --- drivers/char/ipmi/ipmi_msghandler.c | 42 ++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index d2fbf2203bd21f..18266ba11bc15e 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -541,7 +541,11 @@ struct ipmi_smi { /* For handling of maintenance mode. */ int maintenance_mode; - bool maintenance_mode_enable; + +#define IPMI_MAINTENANCE_MODE_STATE_OFF 0 +#define IPMI_MAINTENANCE_MODE_STATE_FIRMWARE 1 +#define IPMI_MAINTENANCE_MODE_STATE_RESET 2 + int maintenance_mode_state; int auto_maintenance_timeout; spinlock_t maintenance_mode_lock; /* Used in a timer... */ @@ -1530,8 +1534,15 @@ EXPORT_SYMBOL(ipmi_get_maintenance_mode); static void maintenance_mode_update(struct ipmi_smi *intf) { if (intf->handlers->set_maintenance_mode) + /* + * Lower level drivers only care about firmware mode + * as it affects their timing. They don't care about + * reset, which disables all commands for a while. + */ intf->handlers->set_maintenance_mode( - intf->send_info, intf->maintenance_mode_enable); + intf->send_info, + (intf->maintenance_mode_state == + IPMI_MAINTENANCE_MODE_STATE_FIRMWARE)); } int ipmi_set_maintenance_mode(struct ipmi_user *user, int mode) @@ -1548,16 +1559,17 @@ int ipmi_set_maintenance_mode(struct ipmi_user *user, int mode) if (intf->maintenance_mode != mode) { switch (mode) { case IPMI_MAINTENANCE_MODE_AUTO: - intf->maintenance_mode_enable - = (intf->auto_maintenance_timeout > 0); + /* Just leave it alone. */ break; case IPMI_MAINTENANCE_MODE_OFF: - intf->maintenance_mode_enable = false; + intf->maintenance_mode_state = + IPMI_MAINTENANCE_MODE_STATE_OFF; break; case IPMI_MAINTENANCE_MODE_ON: - intf->maintenance_mode_enable = true; + intf->maintenance_mode_state = + IPMI_MAINTENANCE_MODE_STATE_FIRMWARE; break; default: @@ -1917,13 +1929,18 @@ static int i_ipmi_req_sysintf(struct ipmi_smi *intf, if (is_maintenance_mode_cmd(msg)) { unsigned long flags; + int newst; + + if (msg->netfn == IPMI_NETFN_FIRMWARE_REQUEST) + newst = IPMI_MAINTENANCE_MODE_STATE_FIRMWARE; + else + newst = IPMI_MAINTENANCE_MODE_STATE_RESET; spin_lock_irqsave(&intf->maintenance_mode_lock, flags); - intf->auto_maintenance_timeout - = maintenance_mode_timeout_ms; + intf->auto_maintenance_timeout = maintenance_mode_timeout_ms; if (!intf->maintenance_mode - && !intf->maintenance_mode_enable) { - intf->maintenance_mode_enable = true; + && intf->maintenance_mode_state < newst) { + intf->maintenance_mode_state = newst; maintenance_mode_update(intf); } spin_unlock_irqrestore(&intf->maintenance_mode_lock, @@ -5028,7 +5045,8 @@ static bool ipmi_timeout_handler(struct ipmi_smi *intf, -= timeout_period; if (!intf->maintenance_mode && (intf->auto_maintenance_timeout <= 0)) { - intf->maintenance_mode_enable = false; + intf->maintenance_mode_state = + IPMI_MAINTENANCE_MODE_STATE_OFF; maintenance_mode_update(intf); } } @@ -5044,7 +5062,7 @@ static bool ipmi_timeout_handler(struct ipmi_smi *intf, static void ipmi_request_event(struct ipmi_smi *intf) { /* No event requests when in maintenance mode. 
*/ - if (intf->maintenance_mode_enable) + if (intf->maintenance_mode_state) return; if (!intf->in_shutdown) From 281817dffe2855fc0dc07c205b03df630dc6ba38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 09:56:46 +0200 Subject: [PATCH 1161/3206] arm64: vdso32: Respect -Werror from kbuild MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compiler flags for the compat vDSO are built manually as they are not compatible with the ones from kbuild. CONFIG_WERROR is not respected. Explicitly inherit -Werror from kbuild. Signed-off-by: Thomas Weißschuh Signed-off-by: Will Deacon --- arch/arm64/kernel/vdso32/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index fd80123bc8e620..5de4deaf429920 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -61,6 +61,7 @@ VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 # KBUILD_CFLAGS from top-level Makefile VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ + $(filter -Werror,$(KBUILD_CPPFLAGS)) \ -Werror-implicit-function-declaration \ -Wno-format-security \ -std=gnu11 From 30f6c9d5451de72ebc52aed6c15031696c6c5bd3 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 7 Aug 2025 16:05:24 -0500 Subject: [PATCH 1162/3206] ipmi: Disable sysfs access and requests in maintenance mode If the driver goes into any maintenance mode, disable sysfs access until it is done. If the driver goes into reset maintenance mode, disable all messages until it is done. Signed-off-by: Corey Minyard Tested-by: Frederick Lawler --- drivers/char/ipmi/ipmi_msghandler.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 18266ba11bc15e..73d34085deae60 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -2319,6 +2319,11 @@ static int i_ipmi_request(struct ipmi_user *user, if (!run_to_completion) mutex_lock(&intf->users_mutex); + if (intf->maintenance_mode_state == IPMI_MAINTENANCE_MODE_STATE_RESET) { + /* No messages while the BMC is in reset. */ + rv = -EBUSY; + goto out_err; + } if (intf->in_shutdown) { rv = -ENODEV; goto out_err; @@ -2615,6 +2620,12 @@ static int __bmc_get_device_id(struct ipmi_smi *intf, struct bmc_device *bmc, (bmc->dyn_id_set && time_is_after_jiffies(bmc->dyn_id_expiry))) goto out_noprocessing; + /* Don't allow sysfs access when in maintenance mode. */ + if (intf->maintenance_mode_state) { + rv = -EBUSY; + goto out_noprocessing; + } + prev_guid_set = bmc->dyn_guid_set; __get_guid(intf); From 627118470fccc61d7763aa667fcab0a9476843f6 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 7 Aug 2025 17:55:08 -0500 Subject: [PATCH 1163/3206] ipmi: Add a maintenance mode sysfs file So you can see if it's in maintenance mode and see how long is left. 
Signed-off-by: Corey Minyard Tested-by: Frederick Lawler --- drivers/char/ipmi/ipmi_msghandler.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 73d34085deae60..a33d3e2b345e8f 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -434,6 +434,7 @@ struct ipmi_smi { atomic_t nr_users; struct device_attribute nr_users_devattr; struct device_attribute nr_msgs_devattr; + struct device_attribute maintenance_mode_devattr; /* Used for wake ups at startup. */ @@ -3521,6 +3522,19 @@ static ssize_t nr_msgs_show(struct device *dev, } static DEVICE_ATTR_RO(nr_msgs); +static ssize_t maintenance_mode_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipmi_smi *intf = container_of(attr, + struct ipmi_smi, + maintenance_mode_devattr); + + return sysfs_emit(buf, "%u %d\n", intf->maintenance_mode_state, + intf->auto_maintenance_timeout); +} +static DEVICE_ATTR_RO(maintenance_mode); + static void redo_bmc_reg(struct work_struct *work) { struct ipmi_smi *intf = container_of(work, struct ipmi_smi, @@ -3657,6 +3671,14 @@ int ipmi_add_smi(struct module *owner, goto out_err_bmc_reg; } + intf->maintenance_mode_devattr = dev_attr_maintenance_mode; + sysfs_attr_init(&intf->maintenance_mode_devattr.attr); + rv = device_create_file(intf->si_dev, &intf->maintenance_mode_devattr); + if (rv) { + device_remove_file(intf->si_dev, &intf->nr_users_devattr); + goto out_err_bmc_reg; + } + intf->intf_num = i; mutex_unlock(&ipmi_interfaces_mutex); @@ -3764,6 +3786,7 @@ void ipmi_unregister_smi(struct ipmi_smi *intf) if (intf->handlers->shutdown) intf->handlers->shutdown(intf->send_info); + device_remove_file(intf->si_dev, &intf->maintenance_mode_devattr); device_remove_file(intf->si_dev, &intf->nr_msgs_devattr); device_remove_file(intf->si_dev, &intf->nr_users_devattr); From bbfb8353cb26acb05b48380f2213d51938f11019 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 7 Aug 2025 17:55:44 -0500 Subject: [PATCH 1164/3206] ipmi: Set a timer for maintenance mode Now that maintenance mode rejects all messages, there's nothing to run time timer. Make sure the timer is running in maintenance mode. 
Signed-off-by: Corey Minyard Tested-by: Frederick Lawler --- drivers/char/ipmi/ipmi_msghandler.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index a33d3e2b345e8f..c6a5673b6f012c 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -52,6 +52,8 @@ static void intf_free(struct kref *ref); static bool initialized; static bool drvregistered; +static struct timer_list ipmi_timer; + /* Numbers in this enumerator should be mapped to ipmi_panic_event_str */ enum ipmi_panic_event_op { IPMI_SEND_PANIC_EVENT_NONE, @@ -1943,6 +1945,7 @@ static int i_ipmi_req_sysintf(struct ipmi_smi *intf, && intf->maintenance_mode_state < newst) { intf->maintenance_mode_state = newst; maintenance_mode_update(intf); + mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES); } spin_unlock_irqrestore(&intf->maintenance_mode_lock, flags); @@ -5081,6 +5084,7 @@ static bool ipmi_timeout_handler(struct ipmi_smi *intf, && (intf->auto_maintenance_timeout <= 0)) { intf->maintenance_mode_state = IPMI_MAINTENANCE_MODE_STATE_OFF; + intf->auto_maintenance_timeout = 0; maintenance_mode_update(intf); } } @@ -5103,8 +5107,6 @@ static void ipmi_request_event(struct ipmi_smi *intf) intf->handlers->request_events(intf->send_info); } -static struct timer_list ipmi_timer; - static atomic_t stop_operation; static void ipmi_timeout_work(struct work_struct *work) @@ -5128,6 +5130,8 @@ static void ipmi_timeout_work(struct work_struct *work) } need_timer = true; } + if (intf->maintenance_mode_state) + need_timer = true; need_timer |= ipmi_timeout_handler(intf, IPMI_TIMEOUT_TIME); } From 753bc23d8ffcf5a395a4eef90cf40c8e7dec51fb Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 13:11:55 -0500 Subject: [PATCH 1165/3206] ipmi:si: Merge some if statements Changes resulted in a silly looking piece of logic. Get rid of a goto and use if statements properly. Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 8b5524069c15a3..cd237988bf45d1 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -1096,14 +1096,11 @@ static void smi_timeout(struct timer_list *t) /* Running with interrupts, only do long timeouts. */ timeout = jiffies + SI_TIMEOUT_JIFFIES; smi_inc_stat(smi_info, long_timeouts); - goto do_mod_timer; - } - - /* - * If the state machine asks for a short delay, then shorten - * the timer timeout. - */ - if (smi_result == SI_SM_CALL_WITH_DELAY) { + } else if (smi_result == SI_SM_CALL_WITH_DELAY) { + /* + * If the state machine asks for a short delay, then shorten + * the timer timeout. + */ smi_inc_stat(smi_info, short_timeouts); timeout = jiffies + 1; } else { @@ -1111,7 +1108,6 @@ static void smi_timeout(struct timer_list *t) timeout = jiffies + SI_TIMEOUT_JIFFIES; } -do_mod_timer: if (smi_result != SI_SM_IDLE) smi_mod_timer(smi_info, timeout); else From abe4918a942e17f997f62b1e12bc8a15d8ff5f7e Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 13:25:33 -0500 Subject: [PATCH 1166/3206] ipmi:si: Move flags get start to its own function It's about to be used from another place, and this looks better, anyway. 
Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index cd237988bf45d1..7b2ba318c547e2 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -313,7 +313,7 @@ static void return_hosed_msg(struct smi_info *smi_info, int cCode) static enum si_sm_result start_next_msg(struct smi_info *smi_info) { - int rv; + int rv; if (!smi_info->waiting_msg) { smi_info->curr_msg = NULL; @@ -390,6 +390,17 @@ static void start_clear_flags(struct smi_info *smi_info) smi_info->si_state = SI_CLEARING_FLAGS; } +static void start_get_flags(struct smi_info *smi_info) +{ + unsigned char msg[2]; + + msg[0] = (IPMI_NETFN_APP_REQUEST << 2); + msg[1] = IPMI_GET_MSG_FLAGS_CMD; + + start_new_msg(smi_info, msg, 2); + smi_info->si_state = SI_GETTING_FLAGS; +} + static void start_getting_msg_queue(struct smi_info *smi_info) { smi_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2); @@ -817,11 +828,7 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, * interrupts work with the SMI, that's not really * possible. */ - msg[0] = (IPMI_NETFN_APP_REQUEST << 2); - msg[1] = IPMI_GET_MSG_FLAGS_CMD; - - start_new_msg(smi_info, msg, 2); - smi_info->si_state = SI_GETTING_FLAGS; + start_get_flags(smi_info); goto restart; } } From 9cf93a8fa9513c6d3cc65bdd50e05c1355cef322 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 14:04:24 -0500 Subject: [PATCH 1167/3206] ipmi: Allow an SMI sender to return an error Getting ready for handling when a BMC is non-responsive or broken, allow the sender operation to fail in an SMI. If it was a user-generated message it will return the error. The powernv code was already doing this internally, but the way it was written could result in deep stack descent if there were a lot of messages queued. Have its send return an error in this case. Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_ipmb.c | 4 ++-- drivers/char/ipmi/ipmi_msghandler.c | 16 +++++++++++++--- drivers/char/ipmi/ipmi_powernv.c | 17 +++++++++-------- drivers/char/ipmi/ipmi_si_intf.c | 8 +++----- drivers/char/ipmi/ipmi_ssif.c | 4 ++-- include/linux/ipmi_smi.h | 10 ++++++---- 6 files changed, 35 insertions(+), 24 deletions(-) diff --git a/drivers/char/ipmi/ipmi_ipmb.c b/drivers/char/ipmi/ipmi_ipmb.c index 6a4f279c7c1f53..3a51e58b248754 100644 --- a/drivers/char/ipmi/ipmi_ipmb.c +++ b/drivers/char/ipmi/ipmi_ipmb.c @@ -404,8 +404,7 @@ static void ipmi_ipmb_shutdown(void *send_info) ipmi_ipmb_stop_thread(iidev); } -static void ipmi_ipmb_sender(void *send_info, - struct ipmi_smi_msg *msg) +static int ipmi_ipmb_sender(void *send_info, struct ipmi_smi_msg *msg) { struct ipmi_ipmb_dev *iidev = send_info; unsigned long flags; @@ -417,6 +416,7 @@ static void ipmi_ipmb_sender(void *send_info, spin_unlock_irqrestore(&iidev->lock, flags); up(&iidev->wake_thread); + return IPMI_CC_NO_ERROR; } static void ipmi_ipmb_request_events(void *send_info) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index c6a5673b6f012c..a739c8822c9161 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -4803,6 +4803,7 @@ static void smi_work(struct work_struct *t) int run_to_completion = READ_ONCE(intf->run_to_completion); struct ipmi_smi_msg *newmsg = NULL; struct ipmi_recv_msg *msg, *msg2; + int cc; /* * Start the next message if available. 
@@ -4811,7 +4812,7 @@ static void smi_work(struct work_struct *t) * because the lower layer is allowed to hold locks while calling * message delivery. */ - +restart: if (!run_to_completion) spin_lock_irqsave(&intf->xmit_msgs_lock, flags); if (intf->curr_msg == NULL && !intf->in_shutdown) { @@ -4832,8 +4833,17 @@ static void smi_work(struct work_struct *t) if (!run_to_completion) spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags); - if (newmsg) - intf->handlers->sender(intf->send_info, newmsg); + if (newmsg) { + cc = intf->handlers->sender(intf->send_info, newmsg); + if (cc) { + if (newmsg->user_data) + deliver_err_response(intf, + newmsg->user_data, cc); + else + ipmi_free_smi_msg(newmsg); + goto restart; + } + } handle_new_recv_msgs(intf); diff --git a/drivers/char/ipmi/ipmi_powernv.c b/drivers/char/ipmi/ipmi_powernv.c index 4a2efafcd1f855..52a1130defe537 100644 --- a/drivers/char/ipmi/ipmi_powernv.c +++ b/drivers/char/ipmi/ipmi_powernv.c @@ -51,7 +51,7 @@ static void send_error_reply(struct ipmi_smi_powernv *smi, ipmi_smi_msg_received(smi->intf, msg); } -static void ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) +static int ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) { struct ipmi_smi_powernv *smi = send_info; struct opal_ipmi_msg *opal_msg; @@ -93,18 +93,19 @@ static void ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) smi->interface_id, opal_msg, size); rc = opal_ipmi_send(smi->interface_id, opal_msg, size); pr_devel("%s: -> %d\n", __func__, rc); - - if (!rc) { - smi->cur_msg = msg; - spin_unlock_irqrestore(&smi->msg_lock, flags); - return; + if (rc) { + comp = IPMI_ERR_UNSPECIFIED; + goto err_unlock; } - comp = IPMI_ERR_UNSPECIFIED; + smi->cur_msg = msg; + spin_unlock_irqrestore(&smi->msg_lock, flags); + return IPMI_CC_NO_ERROR; + err_unlock: spin_unlock_irqrestore(&smi->msg_lock, flags); err: - send_error_reply(smi, msg, comp); + return comp; } static int ipmi_powernv_recv(struct ipmi_smi_powernv *smi) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 7b2ba318c547e2..88d62ef71b1b66 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -809,8 +809,6 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, * this if there is not yet an upper layer to handle anything. */ if (si_sm_result == SI_SM_ATTN || smi_info->got_attn) { - unsigned char msg[2]; - if (smi_info->si_state != SI_NORMAL) { /* * We got an ATTN, but we are doing something else. @@ -907,8 +905,7 @@ static void flush_messages(void *send_info) } } -static void sender(void *send_info, - struct ipmi_smi_msg *msg) +static int sender(void *send_info, struct ipmi_smi_msg *msg) { struct smi_info *smi_info = send_info; unsigned long flags; @@ -921,7 +918,7 @@ static void sender(void *send_info, * layer will call flush_messages to clear it out. 
*/ smi_info->waiting_msg = msg; - return; + return IPMI_CC_NO_ERROR; } spin_lock_irqsave(&smi_info->si_lock, flags); @@ -936,6 +933,7 @@ static void sender(void *send_info, smi_info->waiting_msg = msg; check_start_timer_thread(smi_info); spin_unlock_irqrestore(&smi_info->si_lock, flags); + return IPMI_CC_NO_ERROR; } static void set_run_to_completion(void *send_info, bool i_run_to_completion) diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 1bc42830444dd0..1b63f7d2fcda5f 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -1068,8 +1068,7 @@ static void start_next_msg(struct ssif_info *ssif_info, unsigned long *flags) } } -static void sender(void *send_info, - struct ipmi_smi_msg *msg) +static int sender(void *send_info, struct ipmi_smi_msg *msg) { struct ssif_info *ssif_info = send_info; unsigned long oflags, *flags; @@ -1089,6 +1088,7 @@ static void sender(void *send_info, msg->data[0], msg->data[1], (long long)t.tv_sec, (long)t.tv_nsec / NSEC_PER_USEC); } + return IPMI_CC_NO_ERROR; } static int get_smi_info(void *send_info, struct ipmi_smi_info *data) diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 5d69820d8b027e..c2d975bbff60c8 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -109,8 +109,8 @@ struct ipmi_smi_msg { enum ipmi_smi_msg_type type; - long msgid; - void *user_data; + long msgid; + void *user_data; int data_size; unsigned char data[IPMI_MAX_MSG_LENGTH]; @@ -168,9 +168,11 @@ struct ipmi_smi_handlers { * are held when this is run. Message are delivered one at * a time by the message handler, a new message will not be * delivered until the previous message is returned. + * + * This can return an error if the SMI is not in a state where it + * can send a message. */ - void (*sender)(void *send_info, - struct ipmi_smi_msg *msg); + int (*sender)(void *send_info, struct ipmi_smi_msg *msg); /* * Called by the upper layer to request that we try to get From 3bc54ab3b9790ca92f197e9822e486665daa321c Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 14:09:11 -0500 Subject: [PATCH 1168/3206] ipmi: Rename "user_data" to "recv_msg" in an SMI message It's only used to hold the corresponding receive message, so fix the name to make that clear and the type so nothing else can be accidentally assigned to it. Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 26 +++++++++++++------------- include/linux/ipmi_smi.h | 3 ++- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index a739c8822c9161..a0b67a35a5f048 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -1959,7 +1959,7 @@ static int i_ipmi_req_sysintf(struct ipmi_smi *intf, smi_msg->data[0] = (msg->netfn << 2) | (smi_addr->lun & 0x3); smi_msg->data[1] = msg->cmd; smi_msg->msgid = msgid; - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; if (msg->data_len > 0) memcpy(&smi_msg->data[2], msg->data, msg->data_len); smi_msg->data_size = msg->data_len + 2; @@ -2040,7 +2040,7 @@ static int i_ipmi_req_ipmb(struct ipmi_smi *intf, * Save the receive message so we can use it * to deliver the response. 
*/ - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; } else { mutex_lock(&intf->seq_lock); @@ -2153,7 +2153,7 @@ static int i_ipmi_req_ipmb_direct(struct ipmi_smi *intf, memcpy(smi_msg->data + 4, msg->data, msg->data_len); smi_msg->data_size = msg->data_len + 4; - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; return 0; } @@ -2216,7 +2216,7 @@ static int i_ipmi_req_lan(struct ipmi_smi *intf, * Save the receive message so we can use it * to deliver the response. */ - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; } else { mutex_lock(&intf->seq_lock); @@ -4066,7 +4066,7 @@ static int handle_ipmb_direct_rcv_rsp(struct ipmi_smi *intf, struct ipmi_recv_msg *recv_msg; struct ipmi_ipmb_direct_addr *daddr; - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; if (recv_msg == NULL) { dev_warn(intf->si_dev, "IPMI direct message received with no owner. This could be because of a malformed message, or because of a hardware error. Contact your hardware vendor for assistance.\n"); @@ -4489,7 +4489,7 @@ static int handle_bmc_rsp(struct ipmi_smi *intf, struct ipmi_recv_msg *recv_msg; struct ipmi_system_interface_addr *smi_addr; - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; if (recv_msg == NULL) { dev_warn(intf->si_dev, "IPMI SMI message received with no owner. This could be because of a malformed message, or because of a hardware error. Contact your hardware vendor for assistance.\n"); @@ -4563,14 +4563,14 @@ static int handle_one_recv_msg(struct ipmi_smi *intf, } else if ((msg->data_size >= 2) && (msg->data[0] == (IPMI_NETFN_APP_REQUEST << 2)) && (msg->data[1] == IPMI_SEND_MSG_CMD) - && (msg->user_data == NULL)) { + && (msg->recv_msg == NULL)) { if (intf->in_shutdown || intf->run_to_completion) goto out; /* * This is the local response to a command send, start - * the timer for these. The user_data will not be + * the timer for these. The recv_msg will not be * NULL if this is a response send, and we will let * response sends just go through. */ @@ -4630,7 +4630,7 @@ static int handle_one_recv_msg(struct ipmi_smi *intf, requeue = handle_ipmb_direct_rcv_rsp(intf, msg); } else if ((msg->rsp[0] == ((IPMI_NETFN_APP_REQUEST|1) << 2)) && (msg->rsp[1] == IPMI_SEND_MSG_CMD) - && (msg->user_data != NULL)) { + && (msg->recv_msg != NULL)) { /* * It's a response to a response we sent. For this we * deliver a send message response to the user. 
@@ -4647,7 +4647,7 @@ static int handle_one_recv_msg(struct ipmi_smi *intf, cc = msg->rsp[2]; process_response_response: - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; requeue = 0; if (!recv_msg) @@ -4836,9 +4836,9 @@ static void smi_work(struct work_struct *t) if (newmsg) { cc = intf->handlers->sender(intf->send_info, newmsg); if (cc) { - if (newmsg->user_data) + if (newmsg->recv_msg) deliver_err_response(intf, - newmsg->user_data, cc); + newmsg->recv_msg, cc); else ipmi_free_smi_msg(newmsg); goto restart; @@ -5185,7 +5185,7 @@ struct ipmi_smi_msg *ipmi_alloc_smi_msg(void) rv = kmalloc(sizeof(struct ipmi_smi_msg), GFP_ATOMIC); if (rv) { rv->done = free_smi_msg; - rv->user_data = NULL; + rv->recv_msg = NULL; rv->type = IPMI_SMI_MSG_TYPE_NORMAL; atomic_inc(&smi_msg_inuse_count); } diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index c2d975bbff60c8..892e2d656e1e72 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -110,7 +110,8 @@ struct ipmi_smi_msg { enum ipmi_smi_msg_type type; long msgid; - void *user_data; + /* Response to this message, will be NULL if not from a user request. */ + struct ipmi_recv_msg *recv_msg; int data_size; unsigned char data[IPMI_MAX_MSG_LENGTH]; From bc3a9d217755f65c137f145600f23bf1d6c31ea9 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 14:56:50 -0500 Subject: [PATCH 1169/3206] ipmi:si: Gracefully handle if the BMC is non-functional If the BMC is not functional, the driver goes into an error state and starts a 1 second timer. When the timer times out, it will attempt a simple message. If the BMC interacts correctly, the driver will start accepting messages again. If not, it remains in error state. If the driver goes into error state, all messages current and pending will return with an error. This should more gracefully handle when the BMC becomes non-operational, as opposed to trying each messages individually and failing them. Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 88d62ef71b1b66..1c65275906b444 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -53,6 +53,7 @@ #define SI_TIMEOUT_JIFFIES (SI_TIMEOUT_TIME_USEC/SI_USEC_PER_JIFFY) #define SI_SHORT_TIMEOUT_USEC 250 /* .25ms when the SM request a short timeout */ +#define SI_TIMEOUT_HOSED (HZ) /* 1 second when in hosed state. */ enum si_intf_state { SI_NORMAL, @@ -61,7 +62,8 @@ enum si_intf_state { SI_CLEARING_FLAGS, SI_GETTING_MESSAGES, SI_CHECKING_ENABLES, - SI_SETTING_ENABLES + SI_SETTING_ENABLES, + SI_HOSED /* FIXME - add watchdog stuff. */ }; @@ -753,6 +755,8 @@ static void handle_transaction_done(struct smi_info *smi_info) } break; } + case SI_HOSED: /* Shouldn't happen. */ + break; } } @@ -767,6 +771,10 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, enum si_sm_result si_sm_result; restart: + if (smi_info->si_state == SI_HOSED) + /* Just in case, hosed state is only left from the timeout. */ + return SI_SM_HOSED; + /* * There used to be a loop here that waited a little while * (around 25us) before giving up. That turned out to be @@ -790,18 +798,20 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, /* * Do the before return_hosed_msg, because that - * releases the lock. + * releases the lock. We just disable operations for + * a while and retry in hosed state. 
*/ - smi_info->si_state = SI_NORMAL; + smi_info->si_state = SI_HOSED; if (smi_info->curr_msg != NULL) { /* * If we were handling a user message, format * a response to send to the upper layer to * tell it about the error. */ - return_hosed_msg(smi_info, IPMI_ERR_UNSPECIFIED); + return_hosed_msg(smi_info, IPMI_BUS_ERR); } - goto restart; + smi_mod_timer(smi_info, jiffies + SI_TIMEOUT_HOSED); + goto out; } /* @@ -899,7 +909,7 @@ static void flush_messages(void *send_info) * mode. This means we are single-threaded, no need for locks. */ result = smi_event_handler(smi_info, 0); - while (result != SI_SM_IDLE) { + while (result != SI_SM_IDLE && result != SI_SM_HOSED) { udelay(SI_SHORT_TIMEOUT_USEC); result = smi_event_handler(smi_info, SI_SHORT_TIMEOUT_USEC); } @@ -912,6 +922,9 @@ static int sender(void *send_info, struct ipmi_smi_msg *msg) debug_timestamp(smi_info, "Enqueue"); + if (smi_info->si_state == SI_HOSED) + return IPMI_BUS_ERR; + if (smi_info->run_to_completion) { /* * If we are running to completion, start it. Upper @@ -1092,6 +1105,10 @@ static void smi_timeout(struct timer_list *t) spin_lock_irqsave(&(smi_info->si_lock), flags); debug_timestamp(smi_info, "Timer"); + if (smi_info->si_state == SI_HOSED) + /* Try something to see if the BMC is now operational. */ + start_get_flags(smi_info); + jiffies_now = jiffies; time_diff = (((long)jiffies_now - (long)smi_info->last_timeout_jiffies) * SI_USEC_PER_JIFFY); From 6248c95eef941bcf987bdd7c62f5e47275b0dbba Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Aug 2025 15:26:06 +0200 Subject: [PATCH 1170/3206] spi: s3c64xx: Drop S3C2443 Samsung S3C24xx family of SoCs was removed from the Linux kernel in commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support"), in January 2023. There are no in-kernel users of remaining S3C24xx compatibles or platform data ID. Signed-off-by: Krzysztof Kozlowski Message-ID: <20250830132605.311115-3-krzysztof.kozlowski@linaro.org> Signed-off-by: Mark Brown --- drivers/spi/spi-s3c64xx.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c index 3a00f9e480c5c2..aab36c779c06a5 100644 --- a/drivers/spi/spi-s3c64xx.c +++ b/drivers/spi/spi-s3c64xx.c @@ -1506,16 +1506,6 @@ static const struct dev_pm_ops s3c64xx_spi_pm = { s3c64xx_spi_runtime_resume, NULL) }; -static const struct s3c64xx_spi_port_config s3c2443_spi_port_config = { - /* fifo_lvl_mask is deprecated. Use {rx, tx}_fifomask instead. */ - .fifo_lvl_mask = { 0x7f }, - /* rx_lvl_offset is deprecated. Use {rx, tx}_fifomask instead. */ - .rx_lvl_offset = 13, - .tx_st_done = 21, - .clk_div = 2, - .high_speed = true, -}; - static const struct s3c64xx_spi_port_config s3c6410_spi_port_config = { /* fifo_lvl_mask is deprecated. Use {rx, tx}_fifomask instead.
*/ .fifo_lvl_mask = { 0x7f, 0x7F }, @@ -1627,9 +1617,6 @@ static const struct s3c64xx_spi_port_config gs101_spi_port_config = { static const struct platform_device_id s3c64xx_spi_driver_ids[] = { { - .name = "s3c2443-spi", - .driver_data = (kernel_ulong_t)&s3c2443_spi_port_config, - }, { .name = "s3c6410-spi", .driver_data = (kernel_ulong_t)&s3c6410_spi_port_config, }, @@ -1641,9 +1628,6 @@ static const struct of_device_id s3c64xx_spi_dt_match[] = { { .compatible = "google,gs101-spi", .data = &gs101_spi_port_config, }, - { .compatible = "samsung,s3c2443-spi", - .data = &s3c2443_spi_port_config, - }, { .compatible = "samsung,s3c6410-spi", .data = &s3c6410_spi_port_config, }, From 2c625f0fe2db4e6a58877ce2318df3aa312eb791 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Aug 2025 15:26:07 +0200 Subject: [PATCH 1171/3206] spi: dt-bindings: samsung: Drop S3C2443 Samsung S3C24xx family of SoCs was removed from the Linux kernel in commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support"), in January 2023. There are no in-kernel users of remaining S3C24xx compatibles. Signed-off-by: Krzysztof Kozlowski Message-ID: <20250830132605.311115-4-krzysztof.kozlowski@linaro.org> Acked-by: Rob Herring (Arm) Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/samsung,spi.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/spi/samsung,spi.yaml b/Documentation/devicetree/bindings/spi/samsung,spi.yaml index fe298d47b1a905..1ce8b2770a4aaf 100644 --- a/Documentation/devicetree/bindings/spi/samsung,spi.yaml +++ b/Documentation/devicetree/bindings/spi/samsung,spi.yaml @@ -18,7 +18,6 @@ properties: oneOf: - enum: - google,gs101-spi - - samsung,s3c2443-spi # for S3C2443, S3C2416 and S3C2450 - samsung,s3c6410-spi - samsung,s5pv210-spi # for S5PV210 and S5PC110 - samsung,exynos4210-spi From c05d0b32eebadc8be6e53196e99c64cf2bed1d99 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Sat, 6 Sep 2025 11:09:13 +0200 Subject: [PATCH 1172/3206] regulator: sy7636a: fix lifecycle of power good gpio Attach the power good gpio to the regulator device devres instead of the parent device to fix problems if probe is run multiple times (rmmod/insmod or some deferral).
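The underlying devres rule: a devm_* resource lives until the device it was registered against is unbound. Requesting the GPIO against the parent meant the descriptor survived this driver's remove(), so a re-probe found it busy. A hedged sketch of the corrected scoping (the helper name is hypothetical; devm_gpiod_get() is the real consumer API):

#include <linux/err.h>
#include <linux/gpio/consumer.h>
#include <linux/platform_device.h>

/* Request against &pdev->dev (the regulator child), so the descriptor
 * is released on this driver's remove() and a later probe can claim
 * it again. Before the fix this used pdev->dev.parent. */
static int sy7636a_get_pgood(struct platform_device *pdev,
			     struct gpio_desc **gdp)
{
	*gdp = devm_gpiod_get(&pdev->dev, "epd-pwr-good", GPIOD_IN);
	if (IS_ERR(*gdp))
		return PTR_ERR(*gdp);
	return 0;
}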
Fixes: 8c485bedfb785 ("regulator: sy7636a: Initial commit") Signed-off-by: Andreas Kemnade Reviewed-by: Alistair Francis Reviewed-by: Peng Fan Message-ID: <20250906-sy7636-rsrc-v1-2-e2886a9763a7@kernel.org> Signed-off-by: Mark Brown --- drivers/regulator/sy7636a-regulator.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/regulator/sy7636a-regulator.c b/drivers/regulator/sy7636a-regulator.c index d1e7ba1fb3e1af..27e3d939b7bb9e 100644 --- a/drivers/regulator/sy7636a-regulator.c +++ b/drivers/regulator/sy7636a-regulator.c @@ -83,9 +83,11 @@ static int sy7636a_regulator_probe(struct platform_device *pdev) if (!regmap) return -EPROBE_DEFER; - gdp = devm_gpiod_get(pdev->dev.parent, "epd-pwr-good", GPIOD_IN); + device_set_of_node_from_dev(&pdev->dev, pdev->dev.parent); + + gdp = devm_gpiod_get(&pdev->dev, "epd-pwr-good", GPIOD_IN); if (IS_ERR(gdp)) { - dev_err(pdev->dev.parent, "Power good GPIO fault %ld\n", PTR_ERR(gdp)); + dev_err(&pdev->dev, "Power good GPIO fault %ld\n", PTR_ERR(gdp)); return PTR_ERR(gdp); } @@ -105,7 +107,6 @@ static int sy7636a_regulator_probe(struct platform_device *pdev) } config.dev = &pdev->dev; - config.dev->of_node = pdev->dev.parent->of_node; config.regmap = regmap; rdev = devm_regulator_register(&pdev->dev, &desc, &config); From 54e7bb6ade8acd3fb1c486c9f3e2c0dfdc18f84e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:23 -0700 Subject: [PATCH 1173/3206] wireguard: kconfig: simplify crypto kconfig selections Simplify the kconfig entry for WIREGUARD: - Drop the selections of the arch-optimized ChaCha20, Poly1305, BLAKE2s, and Curve25519 code. These options no longer exist, as lib/crypto/ now enables the arch-optimized code automatically. - Drop the selection of CRYPTO. This was needed only to make the arch-optimized options visible. lib/crypto/ now handles these options internally, without any dependency on CRYPTO. - Drop the dependency on !KMSAN. This was needed only to avoid selecting arch-optimized code that isn't compatible with KMSAN. lib/crypto/ now handles the !KMSAN dependencies internally. - Add a selection of CRYPTO_LIB_UTILS, since WireGuard directly calls crypto_memneq(). This gets selected indirectly by CRYPTO_LIB_CURVE25519 and CRYPTO_LIB_CHACHA20POLY1305 anyway, but it's best to make this dependency explicit. 
Link: https://lore.kernel.org/r/20250906213523.84915-13-ebiggers@kernel.org Signed-off-by: Eric Biggers --- drivers/net/Kconfig | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index b29628d46be9b3..ac12eaf11755dd 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -76,24 +76,11 @@ config WIREGUARD tristate "WireGuard secure network tunnel" depends on NET && INET depends on IPV6 || !IPV6 - depends on !KMSAN # KMSAN doesn't support the crypto configs below select NET_UDP_TUNNEL select DST_CACHE - select CRYPTO select CRYPTO_LIB_CURVE25519 select CRYPTO_LIB_CHACHA20POLY1305 - select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT - select CRYPTO_POLY1305_X86_64 if X86 && 64BIT - select CRYPTO_BLAKE2S_X86 if X86 && 64BIT - select CRYPTO_CURVE25519_X86 if X86 && 64BIT - select CRYPTO_CHACHA20_NEON if ARM || (ARM64 && KERNEL_MODE_NEON) - select CRYPTO_POLY1305_NEON if ARM64 && KERNEL_MODE_NEON - select CRYPTO_POLY1305_ARM if ARM - select CRYPTO_BLAKE2S_ARM if ARM - select CRYPTO_CURVE25519_NEON if ARM && KERNEL_MODE_NEON - select CRYPTO_CHACHA_MIPS if CPU_MIPS32_R2 - select CRYPTO_POLY1305_MIPS if MIPS - select CRYPTO_CHACHA_S390 if S390 + select CRYPTO_LIB_UTILS help WireGuard is a secure, fast, and easy to use replacement for IPSec that uses modern cryptography and clever networking tricks. It's From 4b66d18918f8e4d85e51974a9e3ce9abad5c7c3d Mon Sep 17 00:00:00 2001 From: Miaoqing Pan Date: Mon, 8 Sep 2025 09:50:25 +0800 Subject: [PATCH 1174/3206] wifi: ath12k: Fix missing station power save configuration Commit afbab6e4e88d ("wifi: ath12k: modify ath12k_mac_op_bss_info_changed() for MLO") replaced the bss_info_changed() callback with vif_cfg_changed() and link_info_changed() to support Multi-Link Operation (MLO). As a result, the station power save configuration is no longer correctly applied in ath12k_mac_bss_info_changed(). Move the handling of 'BSS_CHANGED_PS' into ath12k_mac_op_vif_cfg_changed() to align with the updated callback structure introduced for MLO, ensuring proper power-save behavior for station interfaces. 
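Reduced to its essentials, the relocated BSS_CHANGED_PS handling walks every active link and applies the vif-wide setting per link, since under MLO there is no single link to configure. The types below are simplified stand-ins for the ath12k structures, not the driver's code:

#include <stdbool.h>

#define MAX_LINKS 15	/* cf. IEEE80211_MLD_MAX_NUM_LINKS */

struct link_vif {
	bool active;	/* link has a radio assigned */
	bool ps;
};

struct vif {
	unsigned long links_map;	/* bit n set => link n in use */
	struct link_vif link[MAX_LINKS];
};

static void link_setup_ps(struct link_vif *arvif, bool ps)
{
	arvif->ps = ps;	/* the driver programs the WMI power-save here */
}

static void vif_cfg_ps_changed(struct vif *vif, bool ps)
{
	for (int id = 0; id < MAX_LINKS; id++) {
		if (!(vif->links_map & (1UL << id)) || !vif->link[id].active)
			continue;
		link_setup_ps(&vif->link[id], ps);
	}
}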
Tested-on: WCN7850 hw2.0 PCI WLAN.IOE_HMT.1.1-00011-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1 Fixes: afbab6e4e88d ("wifi: ath12k: modify ath12k_mac_op_bss_info_changed() for MLO") Signed-off-by: Miaoqing Pan Reviewed-by: Baochen Qiang Link: https://patch.msgid.link/20250908015025.1301398-1-miaoqing.pan@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 122 ++++++++++++++------------ 1 file changed, 67 insertions(+), 55 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index bd1ec3b2c08416..3a3965b79942d2 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4078,12 +4078,68 @@ static int ath12k_mac_fils_discovery(struct ath12k_link_vif *arvif, return ret; } +static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif) +{ + struct ath12k *ar = arvif->ar; + struct ieee80211_vif *vif = arvif->ahvif->vif; + struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf; + enum wmi_sta_powersave_param param; + struct ieee80211_bss_conf *info; + enum wmi_sta_ps_mode psmode; + int ret; + int timeout; + bool enable_ps; + + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + if (vif->type != NL80211_IFTYPE_STATION) + return; + + enable_ps = arvif->ahvif->ps; + if (enable_ps) { + psmode = WMI_STA_PS_MODE_ENABLED; + param = WMI_STA_PS_PARAM_INACTIVITY_TIME; + + timeout = conf->dynamic_ps_timeout; + if (timeout == 0) { + info = ath12k_mac_get_link_bss_conf(arvif); + if (!info) { + ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n", + vif->addr, arvif->link_id); + return; + } + + /* firmware doesn't like 0 */ + timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000; + } + + ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param, + timeout); + if (ret) { + ath12k_warn(ar->ab, "failed to set inactivity time for vdev %d: %i\n", + arvif->vdev_id, ret); + return; + } + } else { + psmode = WMI_STA_PS_MODE_DISABLED; + } + + ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %d psmode %s\n", + arvif->vdev_id, psmode ? 
"enable" : "disable"); + + ret = ath12k_wmi_pdev_set_ps_mode(ar, arvif->vdev_id, psmode); + if (ret) + ath12k_warn(ar->ab, "failed to set sta power save mode %d for vdev %d: %d\n", + psmode, arvif->vdev_id, ret); +} + static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 changed) { struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); unsigned long links = ahvif->links_map; + struct ieee80211_vif_cfg *vif_cfg; struct ieee80211_bss_conf *info; struct ath12k_link_vif *arvif; struct ieee80211_sta *sta; @@ -4147,61 +4203,24 @@ static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw, } } } -} - -static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif) -{ - struct ath12k *ar = arvif->ar; - struct ieee80211_vif *vif = arvif->ahvif->vif; - struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf; - enum wmi_sta_powersave_param param; - struct ieee80211_bss_conf *info; - enum wmi_sta_ps_mode psmode; - int ret; - int timeout; - bool enable_ps; - lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + if (changed & BSS_CHANGED_PS) { + links = ahvif->links_map; + vif_cfg = &vif->cfg; - if (vif->type != NL80211_IFTYPE_STATION) - return; + for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + if (!arvif || !arvif->ar) + continue; - enable_ps = arvif->ahvif->ps; - if (enable_ps) { - psmode = WMI_STA_PS_MODE_ENABLED; - param = WMI_STA_PS_PARAM_INACTIVITY_TIME; + ar = arvif->ar; - timeout = conf->dynamic_ps_timeout; - if (timeout == 0) { - info = ath12k_mac_get_link_bss_conf(arvif); - if (!info) { - ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n", - vif->addr, arvif->link_id); - return; + if (ar->ab->hw_params->supports_sta_ps) { + ahvif->ps = vif_cfg->ps; + ath12k_mac_vif_setup_ps(arvif); } - - /* firmware doesn't like 0 */ - timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000; } - - ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param, - timeout); - if (ret) { - ath12k_warn(ar->ab, "failed to set inactivity time for vdev %d: %i\n", - arvif->vdev_id, ret); - return; - } - } else { - psmode = WMI_STA_PS_MODE_DISABLED; } - - ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %d psmode %s\n", - arvif->vdev_id, psmode ? 
"enable" : "disable"); - - ret = ath12k_wmi_pdev_set_ps_mode(ar, arvif->vdev_id, psmode); - if (ret) - ath12k_warn(ar->ab, "failed to set sta power save mode %d for vdev %d: %d\n", - psmode, arvif->vdev_id, ret); } static bool ath12k_mac_supports_tpc(struct ath12k *ar, struct ath12k_vif *ahvif, @@ -4223,7 +4242,6 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, { struct ath12k_vif *ahvif = arvif->ahvif; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(ahvif); - struct ieee80211_vif_cfg *vif_cfg = &vif->cfg; struct cfg80211_chan_def def; u32 param_id, param_value; enum nl80211_band band; @@ -4510,12 +4528,6 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, } ath12k_mac_fils_discovery(arvif, info); - - if (changed & BSS_CHANGED_PS && - ar->ab->hw_params->supports_sta_ps) { - ahvif->ps = vif_cfg->ps; - ath12k_mac_vif_setup_ps(arvif); - } } static struct ath12k_vif_cache *ath12k_ahvif_get_link_cache(struct ath12k_vif *ahvif, From 82e2be57d544ff9ad4696c85600827b39be8ce9e Mon Sep 17 00:00:00 2001 From: Miaoqing Pan Date: Mon, 8 Sep 2025 09:51:39 +0800 Subject: [PATCH 1175/3206] wifi: ath12k: fix WMI TLV header misalignment When buf_len is not 4-byte aligned in ath12k_wmi_mgmt_send(), the firmware asserts and triggers a recovery. The following error messages are observed: ath12k_pci 0004:01:00.0: failed to submit WMI_MGMT_TX_SEND_CMDID cmd ath12k_pci 0004:01:00.0: failed to send mgmt frame: -108 ath12k_pci 0004:01:00.0: failed to tx mgmt frame, vdev_id 0 :-108 ath12k_pci 0004:01:00.0: waiting recovery start... This issue was observed when running 'iw wlanx set power_save off/on' in MLO station mode, which triggers the sending of an SMPS action frame with a length of 27 bytes to the AP. To resolve the misalignment, use buf_len_aligned instead of buf_len when constructing the WMI TLV header. Tested-on: WCN7850 hw2.0 PCI WLAN.IOE_HMT.1.1-00011-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1 Fixes: d889913205cf ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices") Signed-off-by: Miaoqing Pan Reviewed-by: Baochen Qiang Link: https://patch.msgid.link/20250908015139.1301437-1-miaoqing.pan@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 742ffeb48bce77..29dadedefdd27a 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -843,7 +843,7 @@ int ath12k_wmi_mgmt_send(struct ath12k_link_vif *arvif, u32 buf_id, cmd->tx_params_valid = 0; frame_tlv = (struct wmi_tlv *)(skb->data + sizeof(*cmd)); - frame_tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_BYTE, buf_len); + frame_tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_BYTE, buf_len_aligned); memcpy(frame_tlv->value, frame->data, buf_len); From b94bc4398beccd329a51052c5e7e84e4670dbcd9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:54:17 -0700 Subject: [PATCH 1176/3206] crypto: md5 - Implement export_core() and import_core() Since commit 9d7a0ab1c753 ("crypto: ahash - Handle partial blocks in API"), the recently-added export_core() and import_core() methods in struct shash_alg have effectively become mandatory (even though it is not tested or enforced), since legacy drivers that need a fallback depend on them. Make crypto/md5.c compatible with these legacy drivers by adding export_core() and import_core() methods to it. 
Fixes: ba8ee22a7f92 ("crypto: md5 - Wrap library and add HMAC support") Link: https://lore.kernel.org/r/20250906215417.89584-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/md5.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/crypto/md5.c b/crypto/md5.c index d05c53e6f3c2c5..c167d203c710ac 100644 --- a/crypto/md5.c +++ b/crypto/md5.c @@ -46,6 +46,18 @@ static int __crypto_md5_import(struct md5_ctx *ctx, const void *in) return 0; } +static int __crypto_md5_export_core(const struct md5_ctx *ctx, void *out) +{ + memcpy(out, ctx, offsetof(struct md5_ctx, buf)); + return 0; +} + +static int __crypto_md5_import_core(struct md5_ctx *ctx, const void *in) +{ + memcpy(ctx, in, offsetof(struct md5_ctx, buf)); + return 0; +} + const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = { 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04, 0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e, @@ -90,6 +102,16 @@ static int crypto_md5_import(struct shash_desc *desc, const void *in) return __crypto_md5_import(MD5_CTX(desc), in); } +static int crypto_md5_export_core(struct shash_desc *desc, void *out) +{ + return __crypto_md5_export_core(MD5_CTX(desc), out); +} + +static int crypto_md5_import_core(struct shash_desc *desc, const void *in) +{ + return __crypto_md5_import_core(MD5_CTX(desc), in); +} + #define HMAC_MD5_KEY(tfm) ((struct hmac_md5_key *)crypto_shash_ctx(tfm)) #define HMAC_MD5_CTX(desc) ((struct hmac_md5_ctx *)shash_desc_ctx(desc)) @@ -139,6 +161,19 @@ static int crypto_hmac_md5_import(struct shash_desc *desc, const void *in) return __crypto_md5_import(&ctx->hash_ctx, in); } +static int crypto_hmac_md5_export_core(struct shash_desc *desc, void *out) +{ + return __crypto_md5_export_core(&HMAC_MD5_CTX(desc)->hash_ctx, out); +} + +static int crypto_hmac_md5_import_core(struct shash_desc *desc, const void *in) +{ + struct hmac_md5_ctx *ctx = HMAC_MD5_CTX(desc); + + ctx->ostate = HMAC_MD5_KEY(desc->tfm)->ostate; + return __crypto_md5_import_core(&ctx->hash_ctx, in); +} + static struct shash_alg algs[] = { { .base.cra_name = "md5", @@ -153,6 +188,8 @@ static struct shash_alg algs[] = { .digest = crypto_md5_digest, .export = crypto_md5_export, .import = crypto_md5_import, + .export_core = crypto_md5_export_core, + .import_core = crypto_md5_import_core, .descsize = sizeof(struct md5_ctx), .statesize = MD5_SHASH_STATE_SIZE, }, @@ -171,6 +208,8 @@ static struct shash_alg algs[] = { .digest = crypto_hmac_md5_digest, .export = crypto_hmac_md5_export, .import = crypto_hmac_md5_import, + .export_core = crypto_hmac_md5_export_core, + .import_core = crypto_hmac_md5_import_core, .descsize = sizeof(struct hmac_md5_ctx), .statesize = MD5_SHASH_STATE_SIZE, }, From 93a83d044314b041ffe2bb1d43b8b0cea7f60921 Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Mon, 8 Sep 2025 14:08:10 +0800 Subject: [PATCH 1177/3206] selftests/bpf: Fix the issue where the error code is 0 The error message printed here reuses the previous err value, which results in it being printed as 0. When bpf_map__attach_struct_ops encounters an error, it uses libbpf_err_ptr(err) to set errno = -err and returns NULL. Therefore, using -errno fixes this issue.
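The convention the fix relies on, sketched (a simplification of libbpf's internal helper as described above, not a verbatim copy):

	/* On failure, libbpf attach helpers return NULL and expose the error
	 * through errno: */
	static void *libbpf_err_ptr_sketch(int err)
	{
		errno = -err;
		return NULL;
	}

A caller that sees a NULL link must therefore read -errno; a local err variable last set by an earlier, successful call still holds 0, which is exactly what the one-line diff below corrects.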
Fix before: run_subtest:FAIL:1019 bpf_map__attach_struct_ops failed for map pro_epilogue: err=0 Fix after: run_subtest:FAIL:1019 bpf_map__attach_struct_ops failed for map pro_epilogue: err=-9 Signed-off-by: Feng Yang Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250908060810.1054341-1-yangfeng59949@163.com --- tools/testing/selftests/bpf/test_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index a9388ac8835871..9f684d0dc5b4ba 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -1088,7 +1088,7 @@ void run_subtest(struct test_loader *tester, link = bpf_map__attach_struct_ops(map); if (!link) { PRINT_FAIL("bpf_map__attach_struct_ops failed for map %s: err=%d\n", - bpf_map__name(map), err); + bpf_map__name(map), -errno); goto tobj_cleanup; } links[links_cnt++] = link; From 34f86083a4e1887ada121d2b5c4f47cc038a3356 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 10:53:07 +0200 Subject: [PATCH 1178/3206] bpf: replace use of system_wq with system_percpu_wq Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_wq is a per-CPU workqueue, yet nothing in its name indicates that CPU affinity constraint, which is very often not required by users. Make it clear by adding a system_percpu_wq. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new per-cpu wq: if a user still sticks to the old name, a warning is printed along with a redirect to the new wq. This patch adds the new system_percpu_wq everywhere except in the mm, fs and net subsystems, which are handled in separate patches. The old wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/r/20250905085309.94596-2-marco.crivellari@suse.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- kernel/bpf/cpumap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9912c7b9a26604..dde605c9415a34 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,7 +27,7 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key); /* * cgroup bpf destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup bpf - * destruction work items don't end up filling up max_active of system_wq + * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock.
*/ static struct workqueue_struct *cgroup_bpf_destroy_wq; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b2b7b8ec2c2a1c..a9b347ccbec298 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -550,7 +550,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu))); if (old_rcpu) { INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free); - queue_rcu_work(system_wq, &old_rcpu->free_work); + queue_rcu_work(system_percpu_wq, &old_rcpu->free_work); } } From 0409819a002165e9471113598323eb72bba17a53 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 10:53:08 +0200 Subject: [PATCH 1179/3206] bpf: replace use of system_unbound_wq with system_dfl_wq Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_unbound_wq should be the default workqueue so as not to enforce locality constraints for random work whenever it's not required. Add system_dfl_wq to encourage its use when unbound work should be used. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new unbound wq: if a user still uses the old wq, a warning is printed along with a redirect to the new wq. The old system_unbound_wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/r/20250905085309.94596-3-marco.crivellari@suse.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 4 ++-- kernel/bpf/memalloc.c | 2 +- kernel/bpf/syscall.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 588bc7e3643679..1ef1e65bd7d0e9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1594,7 +1594,7 @@ void bpf_timer_cancel_and_free(void *val) * timer callback. */ if (this_cpu_read(hrtimer_running)) { - queue_work(system_unbound_wq, &t->cb.delete_work); + queue_work(system_dfl_wq, &t->cb.delete_work); return; } @@ -1607,7 +1607,7 @@ void bpf_timer_cancel_and_free(void *val) if (hrtimer_try_to_cancel(&t->timer) >= 0) kfree_rcu(t, cb.rcu); else - queue_work(system_unbound_wq, &t->cb.delete_work); + queue_work(system_dfl_wq, &t->cb.delete_work); } else { bpf_timer_delete_work(&t->cb.delete_work); } diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 889374722d0aa9..bd45dda9dc354c 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -736,7 +736,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) /* Defer barriers into worker to let the rest of map memory to be freed */ memset(ma, 0, sizeof(*ma)); INIT_WORK(&copy->work, free_mem_alloc_deferred); - queue_work(system_unbound_wq, &copy->work); + queue_work(system_dfl_wq, &copy->work); } void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0fbfa8532c392c..3f178a0f8eb12c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -905,7 +905,7 @@ static void bpf_map_free_in_work(struct bpf_map *map) /* Avoid spawning kworkers, since they all might contend * for the same mutex like slab_mutex.
*/ - queue_work(system_unbound_wq, &map->work); + queue_work(system_dfl_wq, &map->work); } static void bpf_map_free_rcu_gp(struct rcu_head *rcu) From a857210b104f5c186e883d0f2b0eb660349c1e54 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 10:53:09 +0200 Subject: [PATCH 1180/3206] bpf: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they're needed and reducing noise when CPUs are isolated. This patch adds a new WQ_PERCPU flag to explicitly request the per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn't explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/r/20250905085309.94596-4-marco.crivellari@suse.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index dde605c9415a34..248f517d66d048 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -34,7 +34,8 @@ static struct workqueue_struct *cgroup_bpf_destroy_wq; static int __init cgroup_bpf_wq_init(void) { - cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); + cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", + WQ_PERCPU, 1); if (!cgroup_bpf_destroy_wq) panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); return 0; From 5bad16482c2a7e788c042d98f3e97d3b2bbc8cc5 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 5 Sep 2025 21:16:30 +0200 Subject: [PATCH 1181/3206] regulator: dt-bindings: rpi-panel: Split 7" Raspberry Pi 720x1280 v2 binding The 5" and 7" Raspberry Pi 720x1280 Display 2 MCU is a bit more complex than the original Display 1 ATTINY88, and the binding is also a bit more demanding. Split the binding into a separate file and fill in the required gpio-controller, #gpio-cells and #pwm-cells properties, which must be present for the V2 MCU. Include a mention of the 5" panel in the description of Display 2, as the 5" panel uses the same MCU.
Fixes: 6d09c6e474bd ("regulator: dt-bindings: rpi-panel: Add regulator for 7" Raspberry Pi 720x1280") Signed-off-by: Marek Vasut Acked-by: Conor Dooley Message-ID: <20250905191637.147141-1-marek.vasut+renesas@mailbox.org> Signed-off-by: Mark Brown --- ...,7inch-touchscreen-panel-regulator-v2.yaml | 61 +++++++++++++++++++ ...ypi,7inch-touchscreen-panel-regulator.yaml | 7 +-- 2 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator-v2.yaml diff --git a/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator-v2.yaml b/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator-v2.yaml new file mode 100644 index 00000000000000..37b9ed371b67d3 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator-v2.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/raspberrypi,7inch-touchscreen-panel-regulator-v2.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: RaspberryPi 5" and 7" display V2 MCU-based regulator/backlight controller + +maintainers: + - Marek Vasut + +description: | + The RaspberryPi 5" and 7" display 2 has an MCU-based regulator, PWM + backlight and GPIO controller on the PCB, which is used to turn the + display unit on/off and control the backlight. + +allOf: + - $ref: regulator.yaml# + +properties: + compatible: + const: raspberrypi,touchscreen-panel-regulator-v2 + + reg: + maxItems: 1 + + gpio-controller: true + "#gpio-cells": + const: 2 + description: + The first cell is the pin number, and the second cell is used to + specify the gpio polarity (GPIO_ACTIVE_HIGH or GPIO_ACTIVE_LOW). + + "#pwm-cells": + const: 3 + description: See ../../pwm/pwm.yaml for description of the cell formats. + +additionalProperties: false + +required: + - compatible + - reg + - gpio-controller + - "#gpio-cells" + - "#pwm-cells" + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + regulator@45 { + compatible = "raspberrypi,touchscreen-panel-regulator-v2"; + reg = <0x45>; + gpio-controller; + #gpio-cells = <2>; + #pwm-cells = <3>; + }; + }; + +... diff --git a/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator.yaml b/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator.yaml index 18944d39d08fcb..41678400e63fa6 100644 --- a/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator.yaml @@ -12,17 +12,14 @@ maintainers: description: | The RaspberryPi 7" display has an ATTINY88-based regulator/backlight controller on the PCB, which is used to turn the display unit on/off - and control the backlight. The V2 supports 5" and 7" panels and also - offers PWM backlight control. + and control the backlight. 
allOf: - $ref: regulator.yaml# properties: compatible: - enum: - - raspberrypi,7inch-touchscreen-panel-regulator - - raspberrypi,touchscreen-panel-regulator-v2 + const: raspberrypi,7inch-touchscreen-panel-regulator reg: maxItems: 1 From 29c2f18d4f3fa444e3b8972b45d548cf521f8f15 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 29 Aug 2025 09:04:02 +0200 Subject: [PATCH 1182/3206] x86/xen: select HIBERNATE_CALLBACKS more directly The config XEN_SAVE_RESTORE's only purpose is to select HIBERNATE_CALLBACKS, when config XEN is set. The XEN config definition can simply select HIBERNATE_CALLBACKS, though, and the definition of XEN_SAVE_RESTORE can be dropped. So, remove this indirection through XEN_SAVE_RESTORE and select HIBERNATE_CALLBACKS directly. Also, drop the XEN_SAVE_RESTORE from the x86 xen config fragment. No functional change intended with this clean-up. Signed-off-by: Lukas Bulwahn Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Message-ID: <20250829070402.159390-1-lukas.bulwahn@redhat.com> --- arch/x86/configs/xen.config | 1 - arch/x86/xen/Kconfig | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config index d5d091e03bd34b..98b6952ba9d2e2 100644 --- a/arch/x86/configs/xen.config +++ b/arch/x86/configs/xen.config @@ -12,7 +12,6 @@ CONFIG_CPU_FREQ=y # x86 xen specific config options CONFIG_XEN_PVH=y -CONFIG_XEN_SAVE_RESTORE=y # CONFIG_XEN_DEBUG_FS is not set CONFIG_XEN_MCE_LOG=y CONFIG_XEN_ACPI_PROCESSOR=m diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 98d8a50d2aed4b..aa4040fd921560 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -8,6 +8,7 @@ config XEN depends on PARAVIRT select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR + select HIBERNATE_CALLBACKS depends on X86_64 || (X86_32 && X86_PAE) depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM) depends on X86_LOCAL_APIC && X86_TSC @@ -64,12 +65,6 @@ config XEN_PVHVM_GUEST help Support running as a Xen PVHVM guest. -config XEN_SAVE_RESTORE - bool - depends on XEN - select HIBERNATE_CALLBACKS - default y - config XEN_DEBUG_FS bool "Enable Xen debug and tuning parameters in debugfs" depends on XEN && DEBUG_FS From dd2fa82473453661d12723c46c9f43d9876a7efd Mon Sep 17 00:00:00 2001 From: Jonathan Curley Date: Mon, 8 Sep 2025 17:35:16 +0000 Subject: [PATCH 1183/3206] NFSv4/flexfiles: Fix layout merge mirror check. Typo in ff_lseg_match_mirrors makes the diff ineffective. This results in merge happening all the time. Merge happening all the time is problematic because it marks lsegs invalid. Marking lsegs invalid causes all outstanding IO to get restarted with EAGAIN and connections to get closed. Closing connections constantly triggers race conditions in the RDMA implementation... 
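The whole bug, condensed: both locals were derived from l1, so the mirror comparison always succeeded. The one-line diff below restores the intended check.

	const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
	const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);	/* was FF_LAYOUT_LSEG(l1) */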
Fixes: 660d1eb22301c ("pNFS/flexfile: Don't merge layout segments if the mirrors don't match") Signed-off-by: Jonathan Curley Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index f8ab7b4e09e7e2..9edb5f9b0c4e47 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -293,7 +293,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1, struct pnfs_layout_segment *l2) { const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1); - const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1); + const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2); u32 i; if (fl1->mirror_array_cnt != fl2->mirror_array_cnt) From cd4453c5e983cf1fd5757e9acb915adb1e4602b6 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Mon, 8 Sep 2025 02:46:58 +0000 Subject: [PATCH 1184/3206] tracing: Silence warning when chunk allocation fails in trace_pid_write Syzkaller triggered a fault injection warning: WARNING: CPU: 1 PID: 12326 at tracepoint_add_func+0xbfc/0xeb0 Modules linked in: CPU: 1 UID: 0 PID: 12326 Comm: syz.6.10325 Tainted: G U 6.14.0-rc5-syzkaller #0 Tainted: [U]=USER Hardware name: Google Compute Engine/Google Compute Engine RIP: 0010:tracepoint_add_func+0xbfc/0xeb0 kernel/tracepoint.c:294 Code: 09 fe ff 90 0f 0b 90 0f b6 74 24 43 31 ff 41 bc ea ff ff ff RSP: 0018:ffffc9000414fb48 EFLAGS: 00010283 RAX: 00000000000012a1 RBX: ffffffff8e240ae0 RCX: ffffc90014b78000 RDX: 0000000000080000 RSI: ffffffff81bbd78b RDI: 0000000000000001 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: ffffffffffffffef R13: 0000000000000000 R14: dffffc0000000000 R15: ffffffff81c264f0 FS: 00007f27217f66c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2e80dff8 CR3: 00000000268f8000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tracepoint_probe_register_prio+0xc0/0x110 kernel/tracepoint.c:464 register_trace_prio_sched_switch include/trace/events/sched.h:222 [inline] register_pid_events kernel/trace/trace_events.c:2354 [inline] event_pid_write.isra.0+0x439/0x7a0 kernel/trace/trace_events.c:2425 vfs_write+0x24c/0x1150 fs/read_write.c:677 ksys_write+0x12b/0x250 fs/read_write.c:731 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f We can reproduce the warning with the following steps: 1. echo 8 >> set_event_notrace_pid. This makes tr->filtered_pids own one pid and registers the sched_switch tracepoint. 2. echo ' ' >> set_event_pid, and perform fault injection during chunk allocation in trace_pid_list_alloc, so that a pid_list containing no pid is assigned to tr->filtered_pids. 3. echo ' ' >> set_event_pid. Now a NULL pid_list is assigned to tr->filtered_pids. 4. echo 9 >> set_event_pid, which triggers the double-register sched_switch tracepoint warning. The reason is that syzkaller injects a fault into the chunk allocation in trace_pid_list_alloc, causing a failure in trace_pid_list_set, which may trigger double registration of the same tracepoint. This only occurs when the system is about to crash, but to suppress this warning, let's add failure handling logic for trace_pid_list_set.
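The shape of the fix, condensed from the hunk below: the copy loop now bails out when trace_pid_list_set() cannot allocate a chunk, instead of silently continuing with a half-built list.

	ret = trace_pid_list_first(filtered_pids, &pid);
	while (!ret) {
		ret = trace_pid_list_set(pid_list, pid);
		if (ret < 0)
			goto out;	/* chunk allocation failed */

		ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
		nr_pids++;
	}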
Link: https://lore.kernel.org/20250908024658.2390398-1-pulehui@huaweicloud.com Fixes: 8d6e90983ade ("tracing: Create a sparse bitmask for pid filtering") Reported-by: syzbot+161412ccaeff20ce4dde@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67cb890e.050a0220.d8275.022e.GAE@google.com Signed-off-by: Pu Lehui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2f1ae6c0ee81e7..b3c94fbaf002ff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -834,7 +834,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { - trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } @@ -871,6 +874,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_clear(&parser); ret = 0; } + out: trace_parser_put(&parser); if (ret < 0) { From 94a28f9ba264b4b93cd1efa97992db59d74e8048 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Mon, 8 Sep 2025 10:54:49 -0500 Subject: [PATCH 1185/3206] hwmon: (sht21) Documentation cleanup Drop extra empty lines and organize sysfs entries in a table. Signed-off-by: Kurt Borja Link: https://lore.kernel.org/r/20250908-sht2x-v4-1-bc15f68af7de@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/sht21.rst | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/Documentation/hwmon/sht21.rst b/Documentation/hwmon/sht21.rst index 1bccc8e8aac8d3..9f66cd51b45dc4 100644 --- a/Documentation/hwmon/sht21.rst +++ b/Documentation/hwmon/sht21.rst @@ -13,8 +13,6 @@ Supported chips: https://www.sensirion.com/file/datasheet_sht21 - - * Sensirion SHT25 Prefix: 'sht25' @@ -25,8 +23,6 @@ Supported chips: https://www.sensirion.com/file/datasheet_sht25 - - Author: Urs Fleisch @@ -47,13 +43,11 @@ in the board setup code. sysfs-Interface --------------- -temp1_input - - temperature input - -humidity1_input - - humidity input -eic - - Electronic Identification Code +=================== ============================================================ +temp1_input Temperature input +humidity1_input Humidity input +eic Electronic Identification Code +=================== ============================================================ Notes ----- From a0cce093689859238f923faa83a6735b7e8613c4 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Mon, 8 Sep 2025 10:54:50 -0500 Subject: [PATCH 1186/3206] hwmon: (sht21) Add support for SHT20, SHT25 chips All sht2x chips share the same communication protocol so add support for them. 
Signed-off-by: Kurt Borja Link: https://lore.kernel.org/r/20250908-sht2x-v4-2-bc15f68af7de@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/sht21.rst | 10 ++++++++++ drivers/hwmon/Kconfig | 4 ++-- drivers/hwmon/sht21.c | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/Documentation/hwmon/sht21.rst b/Documentation/hwmon/sht21.rst index 9f66cd51b45dc4..d20e8a460ba6c7 100644 --- a/Documentation/hwmon/sht21.rst +++ b/Documentation/hwmon/sht21.rst @@ -3,6 +3,16 @@ Kernel driver sht21 Supported chips: + * Sensirion SHT20 + + Prefix: 'sht20' + + Addresses scanned: none + + Datasheet: Publicly available at the Sensirion website + + https://www.sensirion.com/file/datasheet_sht20 + * Sensirion SHT21 Prefix: 'sht21' diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 8a41275ca518b1..bf291ba7ca32ef 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -1930,8 +1930,8 @@ config SENSORS_SHT21 tristate "Sensiron humidity and temperature sensors. SHT21 and compat." depends on I2C help - If you say yes here you get support for the Sensiron SHT21, SHT25 - humidity and temperature sensors. + If you say yes here you get support for the Sensiron SHT20, SHT21, + SHT25 humidity and temperature sensors. This driver can also be built as a module. If so, the module will be called sht21. diff --git a/drivers/hwmon/sht21.c b/drivers/hwmon/sht21.c index 97327313529b46..97d71e3361e9d7 100644 --- a/drivers/hwmon/sht21.c +++ b/drivers/hwmon/sht21.c @@ -275,7 +275,9 @@ static int sht21_probe(struct i2c_client *client) /* Device ID table */ static const struct i2c_device_id sht21_id[] = { + { "sht20" }, { "sht21" }, + { "sht25" }, { } }; MODULE_DEVICE_TABLE(i2c, sht21_id); From 0ab88e2394392f475b8857ac82c0c987841217f8 Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Mon, 8 Sep 2025 10:25:44 +0800 Subject: [PATCH 1187/3206] hwmon: add GPD devices sensor driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sensors driver for GPD Handhelds that expose fan reading and control via hwmon sysfs. Shenzhen GPD Technology Co., Ltd. manufactures a series of handheld devices. This driver implements these functions through x86 port-mapped IO. Tested-by: Marcin Strągowski Tested-by: someone5678 Tested-by: Justin Weiss Tested-by: Antheas Kapenekakis Tested-by: command_block Tested-by: derjohn Tested-by: Crashdummyy Signed-off-by: Cryolitia PukNgae Link: https://lore.kernel.org/r/20250908-gpd_fan-v9-1-7b4506c03953@uniontech.com Signed-off-by: Guenter Roeck --- MAINTAINERS | 6 + drivers/hwmon/Kconfig | 10 + drivers/hwmon/Makefile | 1 + drivers/hwmon/gpd-fan.c | 715 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 732 insertions(+) create mode 100644 drivers/hwmon/gpd-fan.c diff --git a/MAINTAINERS b/MAINTAINERS index 4e6f41ed4f2cff..c21daede716db3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10421,6 +10421,12 @@ F: drivers/phy/samsung/phy-gs101-ufs.c F: include/dt-bindings/clock/google,gs101.h K: [gG]oogle.?[tT]ensor +GPD FAN DRIVER +M: Cryolitia PukNgae +L: linux-hwmon@vger.kernel.org +S: Maintained +F: drivers/hwmon/gpd-fan.c + GPD POCKET FAN DRIVER M: Hans de Goede L: platform-driver-x86@vger.kernel.org diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index bf291ba7ca32ef..d6769288a76e81 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -769,6 +769,16 @@ config SENSORS_GL520SM This driver can also be built as a module. If so, the module will be called gl520sm. 
+config SENSORS_GPD + tristate "GPD handhelds" + depends on X86 + help + If you say yes here you get support for fan readings and + control over GPD handheld devices. + + Can also be built as a module. In that case it will be + called gpd-fan. + config SENSORS_G760A tristate "GMT G760A" depends on I2C diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index cd8bc4752b4dbf..051981eb8a5089 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -88,6 +88,7 @@ obj-$(CONFIG_SENSORS_GIGABYTE_WATERFORCE) += gigabyte_waterforce.o obj-$(CONFIG_SENSORS_GL518SM) += gl518sm.o obj-$(CONFIG_SENSORS_GL520SM) += gl520sm.o obj-$(CONFIG_SENSORS_GSC) += gsc-hwmon.o +obj-$(CONFIG_SENSORS_GPD) += gpd-fan.o obj-$(CONFIG_SENSORS_GPIO_FAN) += gpio-fan.o obj-$(CONFIG_SENSORS_GXP_FAN_CTRL) += gxp-fan-ctrl.o obj-$(CONFIG_SENSORS_HIH6130) += hih6130.o diff --git a/drivers/hwmon/gpd-fan.c b/drivers/hwmon/gpd-fan.c new file mode 100644 index 00000000000000..e0b3b46e1bf12a --- /dev/null +++ b/drivers/hwmon/gpd-fan.c @@ -0,0 +1,715 @@ +// SPDX-License-Identifier: GPL-2.0+ + +/* Platform driver for GPD devices that expose fan control via hwmon sysfs. + * + * Fan control is provided via pwm interface in the range [0-255]. + * Each model has a different range in the EC, the written value is scaled to + * accommodate for that. + * + * Based on this repo: + * https://github.com/Cryolitia/gpd-fan-driver + * + * Copyright (c) 2024 Cryolitia PukNgae + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_NAME "gpdfan" +#define GPD_PWM_CTR_OFFSET 0x1841 + +static char *gpd_fan_board = ""; +module_param(gpd_fan_board, charp, 0444); + +// EC read/write locker, protecting a sequence of EC operations +static DEFINE_MUTEX(gpd_fan_sequence_lock); + +enum gpd_board { + win_mini, + win4_6800u, + win_max_2, + duo, +}; + +enum FAN_PWM_ENABLE { + DISABLE = 0, + MANUAL = 1, + AUTOMATIC = 2, +}; + +static struct { + enum FAN_PWM_ENABLE pwm_enable; + u8 pwm_value; + + const struct gpd_fan_drvdata *drvdata; +} gpd_driver_priv; + +struct gpd_fan_drvdata { + const char *board_name; // Board name for module param comparison + const enum gpd_board board; + + const u8 addr_port; + const u8 data_port; + const u16 manual_control_enable; + const u16 rpm_read; + const u16 pwm_write; + const u16 pwm_max; +}; + +static struct gpd_fan_drvdata gpd_win_mini_drvdata = { + .board_name = "win_mini", + .board = win_mini, + + .addr_port = 0x4E, + .data_port = 0x4F, + .manual_control_enable = 0x047A, + .rpm_read = 0x0478, + .pwm_write = 0x047A, + .pwm_max = 244, +}; + +static struct gpd_fan_drvdata gpd_duo_drvdata = { + .board_name = "duo", + .board = duo, + + .addr_port = 0x4E, + .data_port = 0x4F, + .manual_control_enable = 0x047A, + .rpm_read = 0x0478, + .pwm_write = 0x047A, + .pwm_max = 244, +}; + +static struct gpd_fan_drvdata gpd_win4_drvdata = { + .board_name = "win4", + .board = win4_6800u, + + .addr_port = 0x2E, + .data_port = 0x2F, + .manual_control_enable = 0xC311, + .rpm_read = 0xC880, + .pwm_write = 0xC311, + .pwm_max = 127, +}; + +static struct gpd_fan_drvdata gpd_wm2_drvdata = { + .board_name = "wm2", + .board = win_max_2, + + .addr_port = 0x4E, + .data_port = 0x4F, + .manual_control_enable = 0x0275, + .rpm_read = 0x0218, + .pwm_write = 0x1809, + .pwm_max = 184, +}; + +static const struct dmi_system_id dmi_table[] = { + { + // GPD Win Mini + // GPD Win Mini with AMD Ryzen 8840U + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1617-01") + }, + .driver_data 
= &gpd_win_mini_drvdata, + }, + { + // GPD Win Mini + // GPD Win Mini with AMD Ryzen HX370 + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1617-02") + }, + .driver_data = &gpd_win_mini_drvdata, + }, + { + // GPD Win Mini + // GPD Win Mini with AMD Ryzen HX370 + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1617-02-L") + }, + .driver_data = &gpd_win_mini_drvdata, + }, + { + // GPD Win 4 with AMD Ryzen 6800U + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1618-04"), + DMI_MATCH(DMI_BOARD_VERSION, "Default string"), + }, + .driver_data = &gpd_win4_drvdata, + }, + { + // GPD Win 4 with Ryzen 7840U + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1618-04"), + DMI_MATCH(DMI_BOARD_VERSION, "Ver. 1.0"), + }, + // Since 7840U, win4 uses the same drvdata as wm2 + .driver_data = &gpd_wm2_drvdata, + }, + { + // GPD Win 4 with Ryzen 7840U (another) + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1618-04"), + DMI_MATCH(DMI_BOARD_VERSION, "Ver.1.0"), + }, + .driver_data = &gpd_wm2_drvdata, + }, + { + // GPD Win Max 2 with Ryzen 6800U + // GPD Win Max 2 2023 with Ryzen 7840U + // GPD Win Max 2 2024 with Ryzen 8840U + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1619-04"), + }, + .driver_data = &gpd_wm2_drvdata, + }, + { + // GPD Win Max 2 with AMD Ryzen HX370 + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1619-05"), + }, + .driver_data = &gpd_wm2_drvdata, + }, + { + // GPD Duo + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1622-01"), + }, + .driver_data = &gpd_duo_drvdata, + }, + { + // GPD Duo (another) + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1622-01-L"), + }, + .driver_data = &gpd_duo_drvdata, + }, + { + // GPD Pocket 4 + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1628-04"), + }, + .driver_data = &gpd_win_mini_drvdata, + }, + { + // GPD Pocket 4 (another) + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1628-04-L"), + }, + .driver_data = &gpd_win_mini_drvdata, + }, + {} +}; + +static const struct gpd_fan_drvdata *gpd_module_drvdata[] = { + &gpd_win_mini_drvdata, &gpd_win4_drvdata, &gpd_wm2_drvdata, NULL +}; + +// Helper functions to handle EC read/write +static void gpd_ecram_read(u16 offset, u8 *val) +{ + u16 addr_port = gpd_driver_priv.drvdata->addr_port; + u16 data_port = gpd_driver_priv.drvdata->data_port; + + outb(0x2E, addr_port); + outb(0x11, data_port); + outb(0x2F, addr_port); + outb((u8)((offset >> 8) & 0xFF), data_port); + + outb(0x2E, addr_port); + outb(0x10, data_port); + outb(0x2F, addr_port); + outb((u8)(offset & 0xFF), data_port); + + outb(0x2E, addr_port); + outb(0x12, data_port); + outb(0x2F, addr_port); + *val = inb(data_port); +} + +static void gpd_ecram_write(u16 offset, u8 value) +{ + u16 addr_port = gpd_driver_priv.drvdata->addr_port; + u16 data_port = gpd_driver_priv.drvdata->data_port; + + outb(0x2E, addr_port); + outb(0x11, data_port); + outb(0x2F, addr_port); + outb((u8)((offset >> 8) & 0xFF), data_port); + + outb(0x2E, addr_port); + outb(0x10, data_port); + outb(0x2F, addr_port); + outb((u8)(offset & 0xFF), data_port); + + outb(0x2E, addr_port); + outb(0x12, data_port); + outb(0x2F, addr_port); + outb(value, data_port); +} + +static int gpd_generic_read_rpm(void) +{ + const struct 
gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + u8 high, low; + + gpd_ecram_read(drvdata->rpm_read, &high); + gpd_ecram_read(drvdata->rpm_read + 1, &low); + + return (u16)high << 8 | low; +} + +static void gpd_win4_init_ec(void) +{ + u8 chip_id, chip_ver; + + gpd_ecram_read(0x2000, &chip_id); + + if (chip_id == 0x55) { + gpd_ecram_read(0x1060, &chip_ver); + gpd_ecram_write(0x1060, chip_ver | 0x80); + } +} + +static int gpd_win4_read_rpm(void) +{ + int ret; + + ret = gpd_generic_read_rpm(); + + if (ret == 0) + // Re-init EC when speed is 0 + gpd_win4_init_ec(); + + return ret; +} + +static int gpd_wm2_read_rpm(void) +{ + for (u16 pwm_ctr_offset = GPD_PWM_CTR_OFFSET; + pwm_ctr_offset <= GPD_PWM_CTR_OFFSET + 2; pwm_ctr_offset++) { + u8 PWMCTR; + + gpd_ecram_read(pwm_ctr_offset, &PWMCTR); + + if (PWMCTR != 0xB8) + gpd_ecram_write(pwm_ctr_offset, 0xB8); + } + + return gpd_generic_read_rpm(); +} + +// Read value for fan1_input +static int gpd_read_rpm(void) +{ + switch (gpd_driver_priv.drvdata->board) { + case win_mini: + case duo: + return gpd_generic_read_rpm(); + case win4_6800u: + return gpd_win4_read_rpm(); + case win_max_2: + return gpd_wm2_read_rpm(); + } + + return 0; +} + +static int gpd_wm2_read_pwm(void) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + u8 var; + + gpd_ecram_read(drvdata->pwm_write, &var); + + // Match gpd_generic_write_pwm(u8) below + return DIV_ROUND_CLOSEST((var - 1) * 255, (drvdata->pwm_max - 1)); +} + +// Read value for pwm1 +static int gpd_read_pwm(void) +{ + switch (gpd_driver_priv.drvdata->board) { + case win_mini: + case duo: + case win4_6800u: + switch (gpd_driver_priv.pwm_enable) { + case DISABLE: + return 255; + case MANUAL: + return gpd_driver_priv.pwm_value; + case AUTOMATIC: + return -EOPNOTSUPP; + } + break; + case win_max_2: + return gpd_wm2_read_pwm(); + } + return 0; +} + +// PWM value's range in EC is 1 - pwm_max, cast 0 - 255 to it. 
+static inline u8 gpd_cast_pwm_range(u8 val) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + + return DIV_ROUND_CLOSEST(val * (drvdata->pwm_max - 1), 255) + 1; +} + +static void gpd_generic_write_pwm(u8 val) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + u8 pwm_reg; + + pwm_reg = gpd_cast_pwm_range(val); + gpd_ecram_write(drvdata->pwm_write, pwm_reg); +} + +static void gpd_duo_write_pwm(u8 val) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + u8 pwm_reg; + + pwm_reg = gpd_cast_pwm_range(val); + gpd_ecram_write(drvdata->pwm_write, pwm_reg); + gpd_ecram_write(drvdata->pwm_write + 1, pwm_reg); +} + +// Write value for pwm1 +static int gpd_write_pwm(u8 val) +{ + if (gpd_driver_priv.pwm_enable != MANUAL) + return -EPERM; + + switch (gpd_driver_priv.drvdata->board) { + case duo: + gpd_duo_write_pwm(val); + break; + case win_mini: + case win4_6800u: + case win_max_2: + gpd_generic_write_pwm(val); + break; + } + + return 0; +} + +static void gpd_win_mini_set_pwm_enable(enum FAN_PWM_ENABLE pwm_enable) +{ + switch (pwm_enable) { + case DISABLE: + gpd_generic_write_pwm(255); + break; + case MANUAL: + gpd_generic_write_pwm(gpd_driver_priv.pwm_value); + break; + case AUTOMATIC: + gpd_ecram_write(gpd_driver_priv.drvdata->pwm_write, 0); + break; + } +} + +static void gpd_duo_set_pwm_enable(enum FAN_PWM_ENABLE pwm_enable) +{ + switch (pwm_enable) { + case DISABLE: + gpd_duo_write_pwm(255); + break; + case MANUAL: + gpd_duo_write_pwm(gpd_driver_priv.pwm_value); + break; + case AUTOMATIC: + gpd_ecram_write(gpd_driver_priv.drvdata->pwm_write, 0); + break; + } +} + +static void gpd_wm2_set_pwm_enable(enum FAN_PWM_ENABLE enable) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + + switch (enable) { + case DISABLE: + gpd_generic_write_pwm(255); + gpd_ecram_write(drvdata->manual_control_enable, 1); + break; + case MANUAL: + gpd_generic_write_pwm(gpd_driver_priv.pwm_value); + gpd_ecram_write(drvdata->manual_control_enable, 1); + break; + case AUTOMATIC: + gpd_ecram_write(drvdata->manual_control_enable, 0); + break; + } +} + +// Write value for pwm1_enable +static void gpd_set_pwm_enable(enum FAN_PWM_ENABLE enable) +{ + if (enable == MANUAL) + // Set pwm_value to max firstly when switching to manual mode, in + // consideration of device safety. 
+ gpd_driver_priv.pwm_value = 255; + + switch (gpd_driver_priv.drvdata->board) { + case win_mini: + case win4_6800u: + gpd_win_mini_set_pwm_enable(enable); + break; + case duo: + gpd_duo_set_pwm_enable(enable); + break; + case win_max_2: + gpd_wm2_set_pwm_enable(enable); + break; + } +} + +static umode_t gpd_fan_hwmon_is_visible(__always_unused const void *drvdata, + enum hwmon_sensor_types type, u32 attr, + __always_unused int channel) +{ + if (type == hwmon_fan && attr == hwmon_fan_input) { + return 0444; + } else if (type == hwmon_pwm) { + switch (attr) { + case hwmon_pwm_enable: + case hwmon_pwm_input: + return 0644; + default: + return 0; + } + } + return 0; +} + +static int gpd_fan_hwmon_read(__always_unused struct device *dev, + enum hwmon_sensor_types type, u32 attr, + __always_unused int channel, long *val) +{ + int ret; + + ret = mutex_lock_interruptible(&gpd_fan_sequence_lock); + if (ret) + return ret; + + if (type == hwmon_fan) { + if (attr == hwmon_fan_input) { + ret = gpd_read_rpm(); + + if (ret < 0) + goto OUT; + + *val = ret; + ret = 0; + goto OUT; + } + } else if (type == hwmon_pwm) { + switch (attr) { + case hwmon_pwm_enable: + *val = gpd_driver_priv.pwm_enable; + ret = 0; + goto OUT; + case hwmon_pwm_input: + ret = gpd_read_pwm(); + + if (ret < 0) + goto OUT; + + *val = ret; + ret = 0; + goto OUT; + } + } + + ret = -EOPNOTSUPP; + +OUT: + mutex_unlock(&gpd_fan_sequence_lock); + return ret; +} + +static int gpd_fan_hwmon_write(__always_unused struct device *dev, + enum hwmon_sensor_types type, u32 attr, + __always_unused int channel, long val) +{ + int ret; + + ret = mutex_lock_interruptible(&gpd_fan_sequence_lock); + if (ret) + return ret; + + if (type == hwmon_pwm) { + switch (attr) { + case hwmon_pwm_enable: + if (!in_range(val, 0, 3)) { + ret = -EINVAL; + goto OUT; + } + + gpd_driver_priv.pwm_enable = val; + + gpd_set_pwm_enable(gpd_driver_priv.pwm_enable); + ret = 0; + goto OUT; + case hwmon_pwm_input: + if (!in_range(val, 0, 255)) { + ret = -ERANGE; + goto OUT; + } + + gpd_driver_priv.pwm_value = val; + + ret = gpd_write_pwm(val); + goto OUT; + } + } + + ret = -EOPNOTSUPP; + +OUT: + mutex_unlock(&gpd_fan_sequence_lock); + return ret; +} + +static const struct hwmon_ops gpd_fan_ops = { + .is_visible = gpd_fan_hwmon_is_visible, + .read = gpd_fan_hwmon_read, + .write = gpd_fan_hwmon_write, +}; + +static const struct hwmon_channel_info *gpd_fan_hwmon_channel_info[] = { + HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT), + HWMON_CHANNEL_INFO(pwm, HWMON_PWM_INPUT | HWMON_PWM_ENABLE), + NULL +}; + +static struct hwmon_chip_info gpd_fan_chip_info = { + .ops = &gpd_fan_ops, + .info = gpd_fan_hwmon_channel_info +}; + +static int gpd_fan_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + const struct resource *region; + const struct resource *res; + const struct device *hwdev; + + res = platform_get_resource(pdev, IORESOURCE_IO, 0); + if (IS_ERR(res)) + return dev_err_probe(dev, PTR_ERR(res), + "Failed to get platform resource\n"); + + region = devm_request_region(dev, res->start, + resource_size(res), DRIVER_NAME); + if (IS_ERR(region)) + return dev_err_probe(dev, PTR_ERR(region), + "Failed to request region\n"); + + hwdev = devm_hwmon_device_register_with_info(dev, + DRIVER_NAME, + NULL, + &gpd_fan_chip_info, + NULL); + if (IS_ERR(hwdev)) + return dev_err_probe(dev, PTR_ERR(region), + "Failed to register hwmon device\n"); + + return 0; +} + +static void gpd_fan_remove(__always_unused struct platform_device *pdev) +{ + gpd_driver_priv.pwm_enable = AUTOMATIC; + 
gpd_set_pwm_enable(AUTOMATIC); +} + +static struct platform_driver gpd_fan_driver = { + .probe = gpd_fan_probe, + .remove = gpd_fan_remove, + .driver = { + .name = KBUILD_MODNAME, + }, +}; + +static struct platform_device *gpd_fan_platform_device; + +static int __init gpd_fan_init(void) +{ + const struct gpd_fan_drvdata *match = NULL; + + for (const struct gpd_fan_drvdata **p = gpd_module_drvdata; *p; p++) { + if (strcmp(gpd_fan_board, (*p)->board_name) == 0) { + match = *p; + break; + } + } + + if (!match) { + const struct dmi_system_id *dmi_match = + dmi_first_match(dmi_table); + if (dmi_match) + match = dmi_match->driver_data; + } + + if (!match) + return -ENODEV; + + gpd_driver_priv.pwm_enable = AUTOMATIC; + gpd_driver_priv.pwm_value = 255; + gpd_driver_priv.drvdata = match; + + struct resource gpd_fan_resources[] = { + { + .start = match->addr_port, + .end = match->data_port, + .flags = IORESOURCE_IO, + }, + }; + + gpd_fan_platform_device = platform_create_bundle(&gpd_fan_driver, + gpd_fan_probe, + gpd_fan_resources, + 1, NULL, 0); + + if (IS_ERR(gpd_fan_platform_device)) { + pr_warn("Failed to create platform device\n"); + return PTR_ERR(gpd_fan_platform_device); + } + + return 0; +} + +static void __exit gpd_fan_exit(void) +{ + platform_device_unregister(gpd_fan_platform_device); + platform_driver_unregister(&gpd_fan_driver); +} + +MODULE_DEVICE_TABLE(dmi, dmi_table); + +module_init(gpd_fan_init); +module_exit(gpd_fan_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Cryolitia PukNgae "); +MODULE_DESCRIPTION("GPD Devices fan control driver"); From 69001f21ded78131b995af2900ef574b04a59ae2 Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Mon, 8 Sep 2025 10:25:45 +0800 Subject: [PATCH 1188/3206] hwmon: document: add gpd-fan Add GPD fan driver document Signed-off-by: Cryolitia PukNgae Link: https://lore.kernel.org/r/20250908-gpd_fan-v9-2-7b4506c03953@uniontech.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/gpd-fan.rst | 78 +++++++++++++++++++++++++++++++++ Documentation/hwmon/index.rst | 1 + MAINTAINERS | 1 + 3 files changed, 80 insertions(+) create mode 100644 Documentation/hwmon/gpd-fan.rst diff --git a/Documentation/hwmon/gpd-fan.rst b/Documentation/hwmon/gpd-fan.rst new file mode 100644 index 00000000000000..0b56b70e6264dd --- /dev/null +++ b/Documentation/hwmon/gpd-fan.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +Kernel driver gpd-fan +========================= + +Author: + - Cryolitia PukNgae + +Description +------------ + +Handheld devices from Shenzhen GPD Technology Co., Ltd. provide fan readings +and fan control through their embedded controllers. + +Supported devices +----------------- + +Currently the driver supports the following handhelds: + + - GPD Win Mini (7840U) + - GPD Win Mini (8840U) + - GPD Win Mini (HX370) + - GPD Pocket 4 + - GPD Duo + - GPD Win Max 2 (6800U) + - GPD Win Max 2 2023 (7840U) + - GPD Win Max 2 2024 (8840U) + - GPD Win Max 2 2025 (HX370) + - GPD Win 4 (6800U) + - GPD Win 4 (7840U) + +Module parameters +----------------- + +gpd_fan_board + Force specific which module quirk should be used. + Use it like "gpd_fan_board=wm2". 
+ + - wm2 + - GPD Win 4 (7840U) + - GPD Win Max 2 (6800U) + - GPD Win Max 2 2023 (7840U) + - GPD Win Max 2 2024 (8840U) + - GPD Win Max 2 2025 (HX370) + - win4 + - GPD Win 4 (6800U) + - win_mini + - GPD Win Mini (7840U) + - GPD Win Mini (8840U) + - GPD Win Mini (HX370) + - GPD Pocket 4 + - GPD Duo + +Sysfs entries +------------- + +The following attributes are supported: + +fan1_input + Read Only. Reads current fan RPM. + +pwm1_enable + Read/Write. Enable manual fan control. Write "0" to disable control and run + at full speed. Write "1" to set to manual, write "2" to let the EC control + decide fan speed. Read this attribute to see current status. + + NB:In consideration of the safety of the device, when setting to manual mode, + the pwm speed will be set to the maximum value (255) by default. You can set + a different value by writing pwm1 later. + +pwm1 + Read/Write. Read this attribute to see current duty cycle in the range + [0-255]. When pwm1_enable is set to "1" (manual) write any value in the + range [0-255] to set fan speed. + + NB: Many boards (except listed under wm2 above) don't support reading the + current pwm value in auto mode. That will just return EOPNOTSUPP. In manual + mode it will always return the real value. diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index b4d26a6fa3a5b1..1803a43f6216b2 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -82,6 +82,7 @@ Hardware Monitoring Kernel Drivers gigabyte_waterforce gsc-hwmon gl518sm + gpd-fan gxp-fan-ctrl hih6130 hp-wmi-sensors diff --git a/MAINTAINERS b/MAINTAINERS index c21daede716db3..5642fd647326b5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10425,6 +10425,7 @@ GPD FAN DRIVER M: Cryolitia PukNgae L: linux-hwmon@vger.kernel.org S: Maintained +F: Documentation/hwmon/gpd-fan.rst F: drivers/hwmon/gpd-fan.c GPD POCKET FAN DRIVER From b816265396daf1beb915e0ffbfd7f3906c2bf4a4 Mon Sep 17 00:00:00 2001 From: Klaus Kudielka Date: Sun, 7 Sep 2025 12:21:46 +0200 Subject: [PATCH 1189/3206] PCI: mvebu: Fix use of for_each_of_range() iterator 5da3d94a23c6 ("PCI: mvebu: Use for_each_of_range() iterator for parsing "ranges"") simplified code by using the for_each_of_range() iterator, but it broke PCI enumeration on Turris Omnia (and probably other mvebu targets). Issue #1: To determine range.flags, of_pci_range_parser_one() uses bus->get_flags(), which resolves to of_bus_pci_get_flags(), which already returns an IORESOURCE bit field, and NOT the original flags from the "ranges" resource. Then mvebu_get_tgt_attr() attempts the very same conversion again. Remove the misinterpretation of range.flags in mvebu_get_tgt_attr(), to restore the intended behavior. Issue #2: The driver needs target and attributes, which are encoded in the raw address values of the "/soc/pcie/ranges" resource. According to of_pci_range_parser_one(), the raw values are stored in range.bus_addr and range.parent_bus_addr, respectively. range.cpu_addr is a translated version of range.parent_bus_addr, and not relevant here. Use the correct range structure member, to extract target and attributes. This restores the intended behavior. 
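Both issues, condensed into the corrected loop body (field semantics as described above; the full diff follows):

	for_each_of_range(&parser, &range) {
		u32 slot = upper_32_bits(range.bus_addr);

		/* range.flags already carries IORESOURCE_* bits ... */
		if (slot == PCI_SLOT(devfn) &&
		    type == (range.flags & IORESOURCE_TYPE_BITS)) {
			/* ... and target/attribute live in the raw,
			 * untranslated parent bus address. */
			*tgt = (range.parent_bus_addr >> 56) & 0xFF;
			*attr = (range.parent_bus_addr >> 48) & 0xFF;
			return 0;
		}
	}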
Fixes: 5da3d94a23c6 ("PCI: mvebu: Use for_each_of_range() iterator for parsing "ranges"")
Reported-by: Jan Palus
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220479
Signed-off-by: Klaus Kudielka
Signed-off-by: Bjorn Helgaas
Tested-by: Tony Dinh
Tested-by: Jan Palus
Link: https://patch.msgid.link/20250907102303.29735-1-klaus.kudielka@gmail.com
---
 drivers/pci/controller/pci-mvebu.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/drivers/pci/controller/pci-mvebu.c b/drivers/pci/controller/pci-mvebu.c
index 755651f33811d7..a72aa57591c04e 100644
--- a/drivers/pci/controller/pci-mvebu.c
+++ b/drivers/pci/controller/pci-mvebu.c
@@ -1168,12 +1168,6 @@ static void __iomem *mvebu_pcie_map_registers(struct platform_device *pdev,
 	return devm_ioremap_resource(&pdev->dev, &port->regs);
 }
 
-#define DT_FLAGS_TO_TYPE(flags)       (((flags) >> 24) & 0x03)
-#define    DT_TYPE_IO                 0x1
-#define    DT_TYPE_MEM32              0x2
-#define DT_CPUADDR_TO_TARGET(cpuaddr) (((cpuaddr) >> 56) & 0xFF)
-#define DT_CPUADDR_TO_ATTR(cpuaddr)   (((cpuaddr) >> 48) & 0xFF)
-
 static int mvebu_get_tgt_attr(struct device_node *np, int devfn,
 			      unsigned long type,
 			      unsigned int *tgt,
@@ -1189,19 +1183,12 @@ static int mvebu_get_tgt_attr(struct device_node *np, int devfn,
 		return -EINVAL;
 
 	for_each_of_range(&parser, &range) {
-		unsigned long rtype;
 		u32 slot = upper_32_bits(range.bus_addr);
 
-		if (DT_FLAGS_TO_TYPE(range.flags) == DT_TYPE_IO)
-			rtype = IORESOURCE_IO;
-		else if (DT_FLAGS_TO_TYPE(range.flags) == DT_TYPE_MEM32)
-			rtype = IORESOURCE_MEM;
-		else
-			continue;
-
-		if (slot == PCI_SLOT(devfn) && type == rtype) {
-			*tgt = DT_CPUADDR_TO_TARGET(range.cpu_addr);
-			*attr = DT_CPUADDR_TO_ATTR(range.cpu_addr);
+		if (slot == PCI_SLOT(devfn) &&
+		    type == (range.flags & IORESOURCE_TYPE_BITS)) {
+			*tgt = (range.parent_bus_addr >> 56) & 0xFF;
+			*attr = (range.parent_bus_addr >> 48) & 0xFF;
 			return 0;
 		}
 	}

From a061e739d36220c002da8b2429d5f16f637eb59a Mon Sep 17 00:00:00 2001
From: Christian Marangi
Date: Mon, 8 Sep 2025 13:37:19 +0200
Subject: [PATCH 1190/3206] pinctrl: airoha: fix wrong MDIO function bitmasks

Further testing with an attached Aeonsemi PHY showed that the pinctrl
MDIO function applied the wrong bitmask. The error was probably caused
by the confusing documentation related to these bits.

Inspecting what the bootloader actually configures shows that
SGMII_MDIO_MODE is never set; instead, force-enable is set for the two
GPIOs (gpio 1-2) used as the MDC and MDIO pins.

The use of GPIO here might be confusing, but it just instructs the SoC
not to touch those two pins, and as Benjamin reported it is also an
erratum of the 7581. FORCE_GPIO_EN doesn't select the GPIO function for
them (that is configured by a different register); it really just
"enables" those lines.

Normally the SoC should autodetect this in hardware, but the AN7581
seems to have a problem with this and requires this workaround to
force-enable the two pins.

Applying this configuration makes any externally attached PHY work
correctly.
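The new regmap entry below boils down to setting two bits in the
force-enable register. A rough illustration, not part of the patch (the
offset is REG_FORCE_GPIO_EN from the patch; 'map' is a hypothetical
regmap handle):

	/* Force-enable the MDC/MDIO lines by setting bits 1 and 2
	 * (FORCE_GPIO_EN(1) | FORCE_GPIO_EN(2) == 0x6). */
	u32 mask = BIT(1) | BIT(2);

	regmap_update_bits(map, 0x0228, mask, mask);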
Cc: stable@vger.kernel.org Fixes: 1c8ace2d0725 ("pinctrl: airoha: Add support for EN7581 SoC") Signed-off-by: Christian Marangi Acked-by: Benjamin Larsson Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/pinctrl-airoha.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c index ee5b9bab455258..b405dfa2089135 100644 --- a/drivers/pinctrl/mediatek/pinctrl-airoha.c +++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c @@ -108,6 +108,9 @@ #define JTAG_UDI_EN_MASK BIT(4) #define JTAG_DFD_EN_MASK BIT(3) +#define REG_FORCE_GPIO_EN 0x0228 +#define FORCE_GPIO_EN(n) BIT(n) + /* LED MAP */ #define REG_LAN_LED0_MAPPING 0x027c #define REG_LAN_LED1_MAPPING 0x0280 @@ -718,17 +721,17 @@ static const struct airoha_pinctrl_func_group mdio_func_group[] = { { .name = "mdio", .regmap[0] = { - AIROHA_FUNC_MUX, - REG_GPIO_PON_MODE, - GPIO_SGMII_MDIO_MODE_MASK, - GPIO_SGMII_MDIO_MODE_MASK - }, - .regmap[1] = { AIROHA_FUNC_MUX, REG_GPIO_2ND_I2C_MODE, GPIO_MDC_IO_MASTER_MODE_MODE, GPIO_MDC_IO_MASTER_MODE_MODE }, + .regmap[1] = { + AIROHA_FUNC_MUX, + REG_FORCE_GPIO_EN, + FORCE_GPIO_EN(1) | FORCE_GPIO_EN(2), + FORCE_GPIO_EN(1) | FORCE_GPIO_EN(2) + }, .regmap_size = 2, }, }; From 00e58ff924b3a684b076f9512fe2753be87b50e1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 4 Sep 2025 22:28:41 -0700 Subject: [PATCH 1191/3206] PCI: Test for bit underflow in pcie_set_readrq() In preparation for the future commit ("bitops: Add __attribute_const__ to generic ffs()-family implementations"), which allows GCC's value range tracker to see past ffs(), GCC 8 on ARM thinks that it might be possible that "ffs(rq) - 8" used here: v = FIELD_PREP(PCI_EXP_DEVCTL_READRQ, ffs(rq) - 8); could wrap below 0, leading to a very large value, which would be out of range for the FIELD_PREP() usage: drivers/pci/pci.c: In function 'pcie_set_readrq': include/linux/compiler_types.h:572:38: error: call to '__compiletime_assert_471' declared with attribute error: FIELD_PREP: value too large for the field ... drivers/pci/pci.c:5896:6: note: in expansion of macro 'FIELD_PREP' v = FIELD_PREP(PCI_EXP_DEVCTL_READRQ, ffs(rq) - 8); ^~~~~~~~~~ If the result of the ffs() is bounds checked before being used in FIELD_PREP(), the value tracker seems happy again. 
:) Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/linux-pci/CA+G9fYuysVr6qT8bjF6f08WLyCJRG7aXAeSd2F7=zTaHHd7L+Q@mail.gmail.com/ Acked-by: Bjorn Helgaas Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250905052836.work.425-kees@kernel.org Signed-off-by: Kees Cook --- drivers/pci/pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b0f4d98036cddd..005b92e6585e91 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5932,6 +5932,7 @@ int pcie_set_readrq(struct pci_dev *dev, int rq) { u16 v; int ret; + unsigned int firstbit; struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus); if (rq < 128 || rq > 4096 || !is_power_of_2(rq)) @@ -5949,7 +5950,10 @@ int pcie_set_readrq(struct pci_dev *dev, int rq) rq = mps; } - v = FIELD_PREP(PCI_EXP_DEVCTL_READRQ, ffs(rq) - 8); + firstbit = ffs(rq); + if (firstbit < 8) + return -EINVAL; + v = FIELD_PREP(PCI_EXP_DEVCTL_READRQ, firstbit - 8); if (bridge->no_inc_mrrs) { int max_mrrs = pcie_get_readrq(dev); From b3a7bb71bfcd56c8266af8cf2a5dee3802e7a449 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:43:57 -0700 Subject: [PATCH 1192/3206] KUnit: Introduce ffs()-family tests Add KUnit tests for ffs()-family bit scanning functions: ffs(), __ffs(), fls(), __fls(), fls64(), __ffs64(), and ffz(). The tests validate mathematical relationships (e.g. ffs(x) == __ffs(x) + 1), and test zero values, power-of-2 patterns, maximum values, and sparse bit patterns. Build and run tested with everything I could reach with KUnit: $ ./tools/testing/kunit/kunit.py run --arch=x86_64 ffs $ ./tools/testing/kunit/kunit.py run --arch=i386 ffs $ ./tools/testing/kunit/kunit.py run --arch=arm64 --make_options "CROSS_COMPILE=aarch64-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=arm --make_options "CROSS_COMPILE=arm-linux-gnueabihf-" ffs $ ./tools/testing/kunit/kunit.py run --arch=powerpc ffs $ ./tools/testing/kunit/kunit.py run --arch=powerpc32 ffs $ ./tools/testing/kunit/kunit.py run --arch=powerpcle ffs $ ./tools/testing/kunit/kunit.py run --arch=riscv --make_options "CROSS_COMPILE=riscv64-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=riscv32 --make_options "CROSS_COMPILE=riscv64-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=s390 --make_options "CROSS_COMPILE=s390x-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=m68k ffs $ ./tools/testing/kunit/kunit.py run --arch=loongarch ffs $ ./tools/testing/kunit/kunit.py run --arch=mips --make_options "CROSS_COMPILE=mipsel-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=sparc --make_options "CROSS_COMPILE=sparc64-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=sparc64 --make_options "CROSS_COMPILE=sparc64-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=alpha --make_options "CROSS_COMPILE=alpha-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=sh --make_options "CROSS_COMPILE=sh4-linux-gnu-" ffs Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-1-kees@kernel.org Signed-off-by: Kees Cook --- lib/Kconfig.debug | 14 ++ lib/tests/Makefile | 1 + lib/tests/ffs_kunit.c | 526 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 541 insertions(+) create mode 100644 lib/tests/ffs_kunit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dc0e0c6ed075e9..2391872c7b3c16 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2479,6 +2479,20 @@ config STRING_HELPERS_KUNIT_TEST 
 	depends on KUNIT
 	default KUNIT_ALL_TESTS
 
+config FFS_KUNIT_TEST
+	tristate "KUnit test ffs-family functions at runtime" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  This builds KUnit tests for ffs-family bit manipulation functions
+	  including ffs(), __ffs(), fls(), __fls(), fls64(), and __ffs64().
+
+	  These tests validate mathematical correctness, edge case handling,
+	  and cross-architecture consistency of bit scanning functions.
+
+	  For more information on KUnit and unit tests in general,
+	  please refer to Documentation/dev-tools/kunit/.
+
 config TEST_KSTRTOX
 	tristate "Test kstrto*() family of functions at runtime"
 
diff --git a/lib/tests/Makefile b/lib/tests/Makefile
index fa6d728a8b5b15..f7460831cfdd4a 100644
--- a/lib/tests/Makefile
+++ b/lib/tests/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_BLACKHOLE_DEV_KUNIT_TEST) += blackhole_dev_kunit.o
 obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o
 obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o
 obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o
+obj-$(CONFIG_FFS_KUNIT_TEST) += ffs_kunit.o
 CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced)
 CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread)
 CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation)
diff --git a/lib/tests/ffs_kunit.c b/lib/tests/ffs_kunit.c
new file mode 100644
index 00000000000000..ed11456b116e46
--- /dev/null
+++ b/lib/tests/ffs_kunit.c
@@ -0,0 +1,526 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit tests for ffs()-family functions
+ */
+#include <kunit/test.h>
+#include <linux/bitops.h>
+
+/*
+ * Test data structures
+ */
+struct ffs_test_case {
+	unsigned long input;
+	int expected_ffs;	/* ffs() result (1-based) */
+	int expected_fls;	/* fls() result (1-based) */
+	const char *description;
+};
+
+struct ffs64_test_case {
+	u64 input;
+	int expected_fls64;	/* fls64() result (1-based) */
+	unsigned int expected_ffs64_0based;	/* __ffs64() result (0-based) */
+	const char *description;
+};
+
+/*
+ * Basic edge cases - core functionality validation
+ */
+static const struct ffs_test_case basic_test_cases[] = {
+	/* Zero case - special handling */
+	{0x00000000, 0, 0, "zero value"},
+
+	/* Single bit patterns - powers of 2 */
+	{0x00000001, 1, 1, "bit 0 set"},
+	{0x00000002, 2, 2, "bit 1 set"},
+	{0x00000004, 3, 3, "bit 2 set"},
+	{0x00000008, 4, 4, "bit 3 set"},
+	{0x00000010, 5, 5, "bit 4 set"},
+	{0x00000020, 6, 6, "bit 5 set"},
+	{0x00000040, 7, 7, "bit 6 set"},
+	{0x00000080, 8, 8, "bit 7 set"},
+	{0x00000100, 9, 9, "bit 8 set"},
+	{0x00008000, 16, 16, "bit 15 set"},
+	{0x00010000, 17, 17, "bit 16 set"},
+	{0x40000000, 31, 31, "bit 30 set"},
+	{0x80000000, 32, 32, "bit 31 set (sign bit)"},
+
+	/* Maximum values */
+	{0xFFFFFFFF, 1, 32, "all bits set"},
+
+	/* Multiple bit patterns */
+	{0x00000003, 1, 2, "bits 0-1 set"},
+	{0x00000007, 1, 3, "bits 0-2 set"},
+	{0x0000000F, 1, 4, "bits 0-3 set"},
+	{0x000000FF, 1, 8, "bits 0-7 set"},
+	{0x0000FFFF, 1, 16, "bits 0-15 set"},
+	{0x7FFFFFFF, 1, 31, "bits 0-30 set"},
+
+	/* Sparse patterns */
+	{0x00000101, 1, 9, "bits 0,8 set"},
+	{0x00001001, 1, 13, "bits 0,12 set"},
+	{0x80000001, 1, 32, "bits 0,31 set"},
+	{0x40000002, 2, 31, "bits 1,30 set"},
+};
+
+/*
+ * 64-bit test cases
+ */
+static const struct ffs64_test_case ffs64_test_cases[] = {
+	/* Zero case */
+	{0x0000000000000000ULL, 0, 0, "zero value"},
+
+	/* Single bit patterns */
+	{0x0000000000000001ULL, 1, 0, "bit 0 set"},
+	{0x0000000000000002ULL, 2, 1, "bit 1 set"},
+	{0x0000000000000004ULL, 3, 2, "bit 2 set"},
+	
{0x0000000000000008ULL, 4, 3, "bit 3 set"}, + {0x0000000000008000ULL, 16, 15, "bit 15 set"}, + {0x0000000000010000ULL, 17, 16, "bit 16 set"}, + {0x0000000080000000ULL, 32, 31, "bit 31 set"}, + {0x0000000100000000ULL, 33, 32, "bit 32 set"}, + {0x0000000200000000ULL, 34, 33, "bit 33 set"}, + {0x4000000000000000ULL, 63, 62, "bit 62 set"}, + {0x8000000000000000ULL, 64, 63, "bit 63 set (sign bit)"}, + + /* Maximum values */ + {0xFFFFFFFFFFFFFFFFULL, 64, 0, "all bits set"}, + + /* Cross 32-bit boundary patterns */ + {0x00000000FFFFFFFFULL, 32, 0, "lower 32 bits set"}, + {0xFFFFFFFF00000000ULL, 64, 32, "upper 32 bits set"}, + {0x8000000000000001ULL, 64, 0, "bits 0,63 set"}, + {0x4000000000000002ULL, 63, 1, "bits 1,62 set"}, + + /* Mixed patterns */ + {0x00000001FFFFFFFFULL, 33, 0, "bit 32 + lower 32 bits"}, + {0xFFFFFFFF80000000ULL, 64, 31, "upper 32 bits + bit 31"}, +}; + +/* + * Helper function to validate ffs results with detailed error messages + */ +static void validate_ffs_result(struct kunit *test, unsigned long input, + int actual, int expected, const char *func_name, + const char *description) +{ + KUNIT_EXPECT_EQ_MSG(test, actual, expected, + "%s(0x%08lx) [%s]: expected %d, got %d", + func_name, input, description, expected, actual); +} + +/* + * Helper function to validate 64-bit ffs results + */ +static void validate_ffs64_result(struct kunit *test, u64 input, + int actual, int expected, const char *func_name, + const char *description) +{ + KUNIT_EXPECT_EQ_MSG(test, actual, expected, + "%s(0x%016llx) [%s]: expected %d, got %d", + func_name, input, description, expected, actual); +} + +/* + * Helper function to validate mathematical relationships between functions + */ +static void validate_ffs_relationships(struct kunit *test, unsigned long input) +{ + int ffs_result; + int fls_result; + unsigned int ffs_0based; + unsigned int fls_0based; + + if (input == 0) { + /* Special case: zero input */ + KUNIT_EXPECT_EQ(test, ffs(input), 0); + KUNIT_EXPECT_EQ(test, fls(input), 0); + /* __ffs and __fls are undefined for 0, but often return specific values */ + return; + } + + ffs_result = ffs(input); + fls_result = fls(input); + ffs_0based = __ffs(input); + fls_0based = __fls(input); + + /* Relationship: ffs(x) == __ffs(x) + 1 for x != 0 */ + KUNIT_EXPECT_EQ_MSG(test, ffs_result, ffs_0based + 1, + "ffs(0x%08lx) != __ffs(0x%08lx) + 1: %d != %u + 1", + input, input, ffs_result, ffs_0based); + + /* Relationship: fls(x) == __fls(x) + 1 for x != 0 */ + KUNIT_EXPECT_EQ_MSG(test, fls_result, fls_0based + 1, + "fls(0x%08lx) != __fls(0x%08lx) + 1: %d != %u + 1", + input, input, fls_result, fls_0based); + + /* Range validation */ + KUNIT_EXPECT_GE(test, ffs_result, 1); + KUNIT_EXPECT_LE(test, ffs_result, BITS_PER_LONG); + KUNIT_EXPECT_GE(test, fls_result, 1); + KUNIT_EXPECT_LE(test, fls_result, BITS_PER_LONG); +} + +/* + * Helper function to validate 64-bit relationships + */ +static void validate_ffs64_relationships(struct kunit *test, u64 input) +{ + int fls64_result; + unsigned int ffs64_0based; + + if (input == 0) { + KUNIT_EXPECT_EQ(test, fls64(input), 0); + return; + } + + fls64_result = fls64(input); + ffs64_0based = __ffs64(input); + + /* Range validation */ + KUNIT_EXPECT_GE(test, fls64_result, 1); + KUNIT_EXPECT_LE(test, fls64_result, 64); + KUNIT_EXPECT_LT(test, ffs64_0based, 64); + + /* + * Relationships with 32-bit functions should hold for small values + * on all architectures. 
+ */ + if (input <= 0xFFFFFFFFULL) { + unsigned long input_32 = (unsigned long)input; + KUNIT_EXPECT_EQ_MSG(test, fls64(input), fls(input_32), + "fls64(0x%llx) != fls(0x%lx): %d != %d", + input, input_32, fls64(input), fls(input_32)); + + if (input != 0) { + KUNIT_EXPECT_EQ_MSG(test, __ffs64(input), __ffs(input_32), + "__ffs64(0x%llx) != __ffs(0x%lx): %lu != %lu", + input, input_32, + (unsigned long)__ffs64(input), + (unsigned long)__ffs(input_32)); + } + } +} + +/* + * Test basic correctness of all ffs-family functions + */ +static void ffs_basic_correctness_test(struct kunit *test) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) { + const struct ffs_test_case *tc = &basic_test_cases[i]; + + /* Test ffs() */ + validate_ffs_result(test, tc->input, ffs(tc->input), + tc->expected_ffs, "ffs", tc->description); + + /* Test fls() */ + validate_ffs_result(test, tc->input, fls(tc->input), + tc->expected_fls, "fls", tc->description); + + /* Test __ffs() - skip zero case as it's undefined */ + if (tc->input != 0) { + /* Calculate expected __ffs() result: __ffs(x) == ffs(x) - 1 */ + unsigned int expected_ffs_0based = tc->expected_ffs - 1; + validate_ffs_result(test, tc->input, __ffs(tc->input), + expected_ffs_0based, "__ffs", tc->description); + } + + /* Test __fls() - skip zero case as it's undefined */ + if (tc->input != 0) { + /* Calculate expected __fls() result: __fls(x) == fls(x) - 1 */ + unsigned int expected_fls_0based = tc->expected_fls - 1; + validate_ffs_result(test, tc->input, __fls(tc->input), + expected_fls_0based, "__fls", tc->description); + } + } +} + +/* + * Test 64-bit function correctness + */ +static void ffs64_correctness_test(struct kunit *test) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) { + const struct ffs64_test_case *tc = &ffs64_test_cases[i]; + + /* Test fls64() */ + validate_ffs64_result(test, tc->input, fls64(tc->input), + tc->expected_fls64, "fls64", tc->description); + + /* Test __ffs64() - skip zero case as it's undefined */ + if (tc->input != 0) { + validate_ffs64_result(test, tc->input, __ffs64(tc->input), + tc->expected_ffs64_0based, "__ffs64", + tc->description); + } + } +} + +/* + * Test mathematical relationships between functions + */ +static void ffs_mathematical_relationships_test(struct kunit *test) +{ + int i; + + /* Test basic cases */ + for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) { + validate_ffs_relationships(test, basic_test_cases[i].input); + } + + /* Test 64-bit cases */ + for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) { + validate_ffs64_relationships(test, ffs64_test_cases[i].input); + } +} + +/* + * Test edge cases and boundary conditions + */ +static void ffs_edge_cases_test(struct kunit *test) +{ + unsigned long test_patterns[] = { + /* Powers of 2 */ + 1UL, 2UL, 4UL, 8UL, 16UL, 32UL, 64UL, 128UL, + 256UL, 512UL, 1024UL, 2048UL, 4096UL, 8192UL, + + /* Powers of 2 minus 1 */ + 1UL, 3UL, 7UL, 15UL, 31UL, 63UL, 127UL, 255UL, + 511UL, 1023UL, 2047UL, 4095UL, 8191UL, + + /* Boundary values */ + 0x7FFFFFFFUL, /* Maximum positive 32-bit */ + 0x80000000UL, /* Minimum negative 32-bit */ + 0xFFFFFFFFUL, /* Maximum 32-bit unsigned */ + }; + int i; + + for (i = 0; i < ARRAY_SIZE(test_patterns); i++) { + validate_ffs_relationships(test, test_patterns[i]); + } +} + +/* + * Test 64-bit edge cases + */ +static void ffs64_edge_cases_test(struct kunit *test) +{ + u64 test_patterns_64[] = { + /* 64-bit powers of 2 */ + 0x0000000100000000ULL, /* 2^32 */ + 0x0000000200000000ULL, /* 2^33 */ + 0x0000000400000000ULL, 
/* 2^34 */ + 0x0000001000000000ULL, /* 2^36 */ + 0x0000010000000000ULL, /* 2^40 */ + 0x0001000000000000ULL, /* 2^48 */ + 0x0100000000000000ULL, /* 2^56 */ + 0x4000000000000000ULL, /* 2^62 */ + 0x8000000000000000ULL, /* 2^63 */ + + /* Cross-boundary patterns */ + 0x00000000FFFFFFFFULL, /* Lower 32 bits */ + 0xFFFFFFFF00000000ULL, /* Upper 32 bits */ + 0x7FFFFFFFFFFFFFFFULL, /* Maximum positive 64-bit */ + 0xFFFFFFFFFFFFFFFFULL, /* Maximum 64-bit unsigned */ + }; + int i; + + for (i = 0; i < ARRAY_SIZE(test_patterns_64); i++) { + validate_ffs64_relationships(test, test_patterns_64[i]); + } +} + +/* + * ffz() test data - Find First Zero bit test cases + */ +struct ffz_test_case { + unsigned long input; + unsigned long expected_ffz; + const char *description; +}; + +static const struct ffz_test_case ffz_test_cases[] = { + /* Zero bits in specific positions */ + {0xFFFFFFFE, 0, "bit 0 is zero"}, /* ...11111110 */ + {0xFFFFFFFD, 1, "bit 1 is zero"}, /* ...11111101 */ + {0xFFFFFFFB, 2, "bit 2 is zero"}, /* ...11111011 */ + {0xFFFFFFF7, 3, "bit 3 is zero"}, /* ...11110111 */ + {0xFFFFFFEF, 4, "bit 4 is zero"}, /* ...11101111 */ + {0xFFFFFFDF, 5, "bit 5 is zero"}, /* ...11011111 */ + {0xFFFFFFBF, 6, "bit 6 is zero"}, /* ...10111111 */ + {0xFFFFFF7F, 7, "bit 7 is zero"}, /* ...01111111 */ + {0xFFFFFEFF, 8, "bit 8 is zero"}, /* Gap in bit 8 */ + {0xFFFF7FFF, 15, "bit 15 is zero"}, /* Gap in bit 15 */ + {0xFFFEFFFF, 16, "bit 16 is zero"}, /* Gap in bit 16 */ + {0xBFFFFFFF, 30, "bit 30 is zero"}, /* Gap in bit 30 */ + {0x7FFFFFFF, 31, "bit 31 is zero"}, /* 01111111... */ + + /* Multiple zero patterns */ + {0xFFFFFFFC, 0, "bits 0-1 are zero"}, /* ...11111100 */ + {0xFFFFFFF8, 0, "bits 0-2 are zero"}, /* ...11111000 */ + {0xFFFFFFF0, 0, "bits 0-3 are zero"}, /* ...11110000 */ + {0xFFFFFF00, 0, "bits 0-7 are zero"}, /* ...00000000 */ + {0xFFFF0000, 0, "bits 0-15 are zero"}, /* Lower 16 bits zero */ + + /* All zeros (special case) */ + {0x00000000, 0, "all bits zero"}, + + /* Complex patterns */ + {0xFFFDFFFF, 17, "bit 17 is zero"}, /* Gap in bit 17 */ + {0xFFF7FFFF, 19, "bit 19 is zero"}, /* Gap in bit 19 */ + {0xF7FFFFFF, 27, "bit 27 is zero"}, /* Gap in bit 27 */ + {0xDFFFFFFF, 29, "bit 29 is zero"}, /* Gap in bit 29 */ +}; + +/* + * Test basic correctness of ffz() function + */ +static void ffz_basic_correctness_test(struct kunit *test) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) { + const struct ffz_test_case *tc = &ffz_test_cases[i]; + unsigned long result = ffz(tc->input); + + KUNIT_EXPECT_EQ_MSG(test, result, tc->expected_ffz, + "ffz(0x%08lx) [%s]: expected %lu, got %lu", + tc->input, tc->description, tc->expected_ffz, result); + } +} + +/* + * Test mathematical relationships between ffz() and other functions + */ +static void validate_ffz_relationships(struct kunit *test, unsigned long input) +{ + unsigned long ffz_result; + + if (input == 0) { + /* ffz(0) should return 0 (first zero bit is at position 0) */ + KUNIT_EXPECT_EQ(test, ffz(input), 0); + return; + } + + if (input == ~0UL) { + /* ffz(~0) is undefined (no zero bits) - just verify it doesn't crash */ + ffz_result = ffz(input); + /* Implementation-defined behavior, just ensure it completes */ + return; + } + + ffz_result = ffz(input); + + /* Range validation - result should be within valid bit range */ + KUNIT_EXPECT_LT(test, ffz_result, BITS_PER_LONG); + + /* Verify the bit at ffz_result position is actually zero */ + KUNIT_EXPECT_EQ_MSG(test, (input >> ffz_result) & 1, 0, + "ffz(0x%08lx) = %lu, but bit %lu is not 
zero", + input, ffz_result, ffz_result); + + /* Core relationship: if we set the ffz bit, ffz should find a different bit */ + if (ffz_result < BITS_PER_LONG - 1) { + unsigned long modified = input | (1UL << ffz_result); + if (modified != ~0UL) { /* Skip if all bits would be set */ + unsigned long new_ffz = ffz(modified); + KUNIT_EXPECT_NE_MSG(test, new_ffz, ffz_result, + "ffz(0x%08lx) = %lu, but setting that bit doesn't change ffz result", + input, ffz_result); + } + } +} + +static void ffz_mathematical_relationships_test(struct kunit *test) +{ + unsigned long test_patterns[] = { + /* Powers of 2 with one bit clear */ + 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7, + 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F, + + /* Multiple patterns */ + 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000, + 0x7FFFFFFF, 0x3FFFFFFF, 0x1FFFFFFF, 0x0FFFFFFF, + + /* Complex bit patterns */ + 0xAAAAAAAA, 0x55555555, 0xCCCCCCCC, 0x33333333, + 0xF0F0F0F0, 0x0F0F0F0F, 0xFF00FF00, 0x00FF00FF, + }; + int i; + + /* Test basic test cases */ + for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) { + validate_ffz_relationships(test, ffz_test_cases[i].input); + } + + /* Test additional patterns */ + for (i = 0; i < ARRAY_SIZE(test_patterns); i++) { + validate_ffz_relationships(test, test_patterns[i]); + } +} + +/* + * Test edge cases and boundary conditions for ffz() + */ +static void ffz_edge_cases_test(struct kunit *test) +{ + unsigned long edge_patterns[] = { + /* Boundary values */ + 0x00000000, /* All zeros */ + 0x80000000, /* Only MSB set */ + 0x00000001, /* Only LSB set */ + 0x7FFFFFFF, /* MSB clear */ + 0xFFFFFFFE, /* LSB clear */ + + /* Powers of 2 complement patterns (one zero bit each) */ + ~(1UL << 0), ~(1UL << 1), ~(1UL << 2), ~(1UL << 3), + ~(1UL << 4), ~(1UL << 8), ~(1UL << 16), ~(1UL << 31), + + /* Walking zero patterns */ + 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7, + 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F, + 0xFFFFFEFF, 0xFFFFFDFF, 0xFFFFFBFF, 0xFFFFF7FF, + + /* Multiple zeros */ + 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000, + 0xFF000000, 0xF0000000, 0x00000000, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(edge_patterns); i++) { + validate_ffz_relationships(test, edge_patterns[i]); + } +} + +/* + * KUnit test case definitions + */ +static struct kunit_case ffs_test_cases[] = { + KUNIT_CASE(ffs_basic_correctness_test), + KUNIT_CASE(ffs64_correctness_test), + KUNIT_CASE(ffs_mathematical_relationships_test), + KUNIT_CASE(ffs_edge_cases_test), + KUNIT_CASE(ffs64_edge_cases_test), + KUNIT_CASE(ffz_basic_correctness_test), + KUNIT_CASE(ffz_mathematical_relationships_test), + KUNIT_CASE(ffz_edge_cases_test), + KUNIT_CASE(ffs_attribute_const_test), + {} +}; + +/* + * KUnit test suite definition + */ +static struct kunit_suite ffs_test_suite = { + .name = "ffs", + .test_cases = ffs_test_cases, +}; + +kunit_test_suites(&ffs_test_suite); + +MODULE_DESCRIPTION("KUnit tests for ffs()-family functions"); +MODULE_LICENSE("GPL"); From 6606c8c7e81886565f5cbdb0c0ce82e280c2b229 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:43:58 -0700 Subject: [PATCH 1193/3206] bitops: Add __attribute_const__ to generic ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. 
Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to generic implementations of ffs(), __ffs(), fls(), and __fls() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested with x86_64 defconfig using GCC 14.2.0, which should validate the implementations when used by ARM, ARM64, LoongArch, Microblaze, NIOS2, and SPARC32 architectures. Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-2-kees@kernel.org Signed-off-by: Kees Cook --- include/asm-generic/bitops/__ffs.h | 2 +- include/asm-generic/bitops/__fls.h | 2 +- include/asm-generic/bitops/builtin-__ffs.h | 2 +- include/asm-generic/bitops/builtin-__fls.h | 2 +- include/asm-generic/bitops/builtin-fls.h | 2 +- include/asm-generic/bitops/ffs.h | 2 +- include/asm-generic/bitops/fls.h | 2 +- include/asm-generic/bitops/fls64.h | 4 ++-- include/linux/bitops.h | 2 +- lib/clz_ctz.c | 8 ++++---- 10 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/asm-generic/bitops/__ffs.h b/include/asm-generic/bitops/__ffs.h index 2d08c750c8a730..3a899c626fdc35 100644 --- a/include/asm-generic/bitops/__ffs.h +++ b/include/asm-generic/bitops/__ffs.h @@ -10,7 +10,7 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned int generic___ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned int generic___ffs(unsigned long word) { unsigned int num = 0; diff --git a/include/asm-generic/bitops/__fls.h b/include/asm-generic/bitops/__fls.h index e974ec932ec189..35f33780ca6c37 100644 --- a/include/asm-generic/bitops/__fls.h +++ b/include/asm-generic/bitops/__fls.h @@ -10,7 +10,7 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned int generic___fls(unsigned long word) +static __always_inline __attribute_const__ unsigned int generic___fls(unsigned long word) { unsigned int num = BITS_PER_LONG - 1; diff --git a/include/asm-generic/bitops/builtin-__ffs.h b/include/asm-generic/bitops/builtin-__ffs.h index cf4b3d33bf961e..d3c3f567045d46 100644 --- a/include/asm-generic/bitops/builtin-__ffs.h +++ b/include/asm-generic/bitops/builtin-__ffs.h @@ -8,7 +8,7 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned int __ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned int __ffs(unsigned long word) { return __builtin_ctzl(word); } diff --git a/include/asm-generic/bitops/builtin-__fls.h b/include/asm-generic/bitops/builtin-__fls.h index 6d72fc8a525953..7770c4f1bfcd49 100644 --- a/include/asm-generic/bitops/builtin-__fls.h +++ b/include/asm-generic/bitops/builtin-__fls.h @@ -8,7 +8,7 @@ * * Undefined if no set bit exists, so code should check against 0 first. 
*/ -static __always_inline unsigned int __fls(unsigned long word) +static __always_inline __attribute_const__ unsigned int __fls(unsigned long word) { return (sizeof(word) * 8) - 1 - __builtin_clzl(word); } diff --git a/include/asm-generic/bitops/builtin-fls.h b/include/asm-generic/bitops/builtin-fls.h index c8455cc28841af..be707da8c7cdd2 100644 --- a/include/asm-generic/bitops/builtin-fls.h +++ b/include/asm-generic/bitops/builtin-fls.h @@ -9,7 +9,7 @@ * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline __attribute_const__ int fls(unsigned int x) { return x ? sizeof(x) * 8 - __builtin_clz(x) : 0; } diff --git a/include/asm-generic/bitops/ffs.h b/include/asm-generic/bitops/ffs.h index 4c43f242daeb17..5ff2b7fbda6d8c 100644 --- a/include/asm-generic/bitops/ffs.h +++ b/include/asm-generic/bitops/ffs.h @@ -10,7 +10,7 @@ * the libc and compiler builtin ffs routines, therefore * differs in spirit from ffz (man ffs). */ -static inline int generic_ffs(int x) +static inline __attribute_const__ int generic_ffs(int x) { int r = 1; diff --git a/include/asm-generic/bitops/fls.h b/include/asm-generic/bitops/fls.h index 26f3ce1dd6e448..8eed3437edb95c 100644 --- a/include/asm-generic/bitops/fls.h +++ b/include/asm-generic/bitops/fls.h @@ -10,7 +10,7 @@ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int generic_fls(unsigned int x) +static __always_inline __attribute_const__ int generic_fls(unsigned int x) { int r = 32; diff --git a/include/asm-generic/bitops/fls64.h b/include/asm-generic/bitops/fls64.h index 866f2b2304ff63..b5f58dd261a370 100644 --- a/include/asm-generic/bitops/fls64.h +++ b/include/asm-generic/bitops/fls64.h @@ -16,7 +16,7 @@ * at position 64. */ #if BITS_PER_LONG == 32 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { __u32 h = x >> 32; if (h) @@ -24,7 +24,7 @@ static __always_inline int fls64(__u64 x) return fls(x); } #elif BITS_PER_LONG == 64 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { if (x == 0) return 0; diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 9be2d50da09a41..ea7898cc590396 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -267,7 +267,7 @@ static inline int parity8(u8 val) * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. 
*/ -static inline unsigned int __ffs64(u64 word) +static inline __attribute_const__ unsigned int __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c index fb8c0c5c2bd277..8778ec44bf630d 100644 --- a/lib/clz_ctz.c +++ b/lib/clz_ctz.c @@ -15,28 +15,28 @@ #include int __weak __ctzsi2(int val); -int __weak __ctzsi2(int val) +int __weak __attribute_const__ __ctzsi2(int val) { return __ffs(val); } EXPORT_SYMBOL(__ctzsi2); int __weak __clzsi2(int val); -int __weak __clzsi2(int val) +int __weak __attribute_const__ __clzsi2(int val) { return 32 - fls(val); } EXPORT_SYMBOL(__clzsi2); int __weak __clzdi2(u64 val); -int __weak __clzdi2(u64 val) +int __weak __attribute_const__ __clzdi2(u64 val) { return 64 - fls64(val); } EXPORT_SYMBOL(__clzdi2); int __weak __ctzdi2(u64 val); -int __weak __ctzdi2(u64 val) +int __weak __attribute_const__ __ctzdi2(u64 val) { return __ffs64(val); } From 4452a0dfc5bddf4df3a945d2e9ecb201d52d164a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:43:59 -0700 Subject: [PATCH 1194/3206] csky: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to C-SKY's implementations of ffs(), __ffs(), fls(), and __fls() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=csky defconfig with GCC csky-linux 15.1.0. 
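To make the rationale of this series concrete, here is a minimal
hosted-C sketch (illustrative only, not from any of these patches;
my_fls() and never_linked() are made-up names). With the const
attribute, GCC can fold the call at -O2, which is exactly what checks
built on dead-code elimination, such as BUILD_BUG_ON(), rely on:

	/* Mirrors the shape of the generic fls(): pure math, no side effects. */
	__attribute__((const))
	static int my_fls(unsigned int x)
	{
		return x ? 32 - __builtin_clz(x) : 0;
	}

	int main(void)
	{
		extern void never_linked(void);

		/* Folds to 32 at compile time; the branch is eliminated,
		 * so never_linked() is never referenced and linking succeeds. */
		if (my_fls(0x80000000u) != 32)
			never_linked();
		return 0;
	}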
Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-3-kees@kernel.org Signed-off-by: Kees Cook --- arch/csky/include/asm/bitops.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h index 72e1b2aa29a07d..80d67eee6e860b 100644 --- a/arch/csky/include/asm/bitops.h +++ b/arch/csky/include/asm/bitops.h @@ -9,7 +9,7 @@ /* * asm-generic/bitops/ffs.h */ -static inline int ffs(int x) +static inline __attribute_const__ int ffs(int x) { if (!x) return 0; @@ -26,7 +26,7 @@ static inline int ffs(int x) /* * asm-generic/bitops/__ffs.h */ -static __always_inline unsigned long __ffs(unsigned long x) +static __always_inline __attribute_const__ unsigned long __ffs(unsigned long x) { asm volatile ( "brev %0\n" @@ -39,7 +39,7 @@ static __always_inline unsigned long __ffs(unsigned long x) /* * asm-generic/bitops/fls.h */ -static __always_inline int fls(unsigned int x) +static __always_inline __attribute_const__ int fls(unsigned int x) { asm volatile( "ff1 %0\n" @@ -52,7 +52,7 @@ static __always_inline int fls(unsigned int x) /* * asm-generic/bitops/__fls.h */ -static __always_inline unsigned long __fls(unsigned long x) +static __always_inline __attribute_const__ unsigned long __fls(unsigned long x) { return fls(x) - 1; } From fca08b748d1773dd9743abc063379664986e276d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:00 -0700 Subject: [PATCH 1195/3206] x86: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to x86's implementations of variable__ffs(), variable_ffz(), __fls(), variable_ffs(), and fls() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=x86_64 defconfig with GCC gcc 14.2.0. 
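As a quick cross-check of the identities the ffs_kunit tests exercise,
the same relationships can be spelled out with compiler builtins (a
hosted-C sketch, not kernel code):

	#include <assert.h>

	int main(void)
	{
		unsigned int x = 0x40000002u;	/* bits 1 and 30 set */

		/* ffs() is 1-based, ctz (like __ffs()) is 0-based. */
		assert(__builtin_ffs(x) == __builtin_ctz(x) + 1);	/* 2 == 1 + 1 */
		/* fls(x) == 32 - clz(x) for nonzero x. */
		assert(32 - __builtin_clz(x) == 31);
		return 0;
	}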
Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-4-kees@kernel.org Signed-off-by: Kees Cook --- arch/x86/include/asm/bitops.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index eebbc8889e70f5..a835f891164d47 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -246,7 +246,7 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) variable_test_bit(nr, addr); } -static __always_inline unsigned long variable__ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned long variable__ffs(unsigned long word) { asm("tzcnt %1,%0" : "=r" (word) @@ -265,7 +265,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word) (unsigned long)__builtin_ctzl(word) : \ variable__ffs(word)) -static __always_inline unsigned long variable_ffz(unsigned long word) +static __always_inline __attribute_const__ unsigned long variable_ffz(unsigned long word) { return variable__ffs(~word); } @@ -287,7 +287,7 @@ static __always_inline unsigned long variable_ffz(unsigned long word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __fls(unsigned long word) +static __always_inline __attribute_const__ unsigned long __fls(unsigned long word) { if (__builtin_constant_p(word)) return BITS_PER_LONG - 1 - __builtin_clzl(word); @@ -301,7 +301,7 @@ static __always_inline unsigned long __fls(unsigned long word) #undef ADDR #ifdef __KERNEL__ -static __always_inline int variable_ffs(int x) +static __always_inline __attribute_const__ int variable_ffs(int x) { int r; @@ -355,7 +355,7 @@ static __always_inline int variable_ffs(int x) * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline __attribute_const__ int fls(unsigned int x) { int r; @@ -400,7 +400,7 @@ static __always_inline int fls(unsigned int x) * at position 64. */ #ifdef CONFIG_X86_64 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { int bitpos = -1; From 69057d3db759cc260aee4ab189b76ae91fbc18f9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:01 -0700 Subject: [PATCH 1196/3206] powerpc: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to PowerPC's implementations of fls() function. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=powerpc defconfig with GCC powerpc-linux-gnu 14.2.0. 
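For reference, the generic 32-bit fls64() that several of these
architectures fall back to composes two 32-bit scans; a standalone
sketch of the same shape, with illustrative names:

	static int my_fls(unsigned int x)
	{
		return x ? 32 - __builtin_clz(x) : 0;
	}

	static int my_fls64(unsigned long long x)
	{
		unsigned int h = x >> 32;

		/* A set bit in the high word dominates; otherwise scan the low word. */
		return h ? my_fls(h) + 32 : my_fls((unsigned int)x);
	}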
Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Madhavan Srinivasan Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-5-kees@kernel.org Signed-off-by: Kees Cook --- arch/powerpc/include/asm/bitops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 671ecc6711e366..0d0470cd5ac31d 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -276,7 +276,7 @@ static inline void arch___clear_bit_unlock(int nr, volatile unsigned long *addr) * fls: find last (most-significant) bit set. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline __attribute_const__ int fls(unsigned int x) { int lz; @@ -294,7 +294,7 @@ static __always_inline int fls(unsigned int x) * 32-bit fls calls. */ #ifdef CONFIG_PPC64 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { int lz; From 4251f58f620716163575d6b6f7e6a10d43fb5ca5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:02 -0700 Subject: [PATCH 1197/3206] sh: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to SH's implementations of __ffs() and ffz() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=sh defconfig with GCC sh4-linux-gnu 14.2.0. Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-6-kees@kernel.org Signed-off-by: Kees Cook --- arch/sh/include/asm/bitops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h index 10ceb0d6b5a997..aba3aa96a50e99 100644 --- a/arch/sh/include/asm/bitops.h +++ b/arch/sh/include/asm/bitops.h @@ -24,7 +24,7 @@ #include #endif -static inline unsigned long ffz(unsigned long word) +static inline unsigned long __attribute_const__ ffz(unsigned long word) { unsigned long result; @@ -44,7 +44,7 @@ static inline unsigned long ffz(unsigned long word) * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static inline __attribute_const__ unsigned long __ffs(unsigned long word) { unsigned long result; From a8d060ddeed52b4e4d0264b34353f025c421e4b9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:03 -0700 Subject: [PATCH 1198/3206] alpha: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. 
Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to Alpha's implementations of __ffs(), ffs(), fls64(), __fls(), fls(), and ffz() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=alpha defconfig with GCC alpha-linux-gnu 14.2.0. Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-7-kees@kernel.org Signed-off-by: Kees Cook --- arch/alpha/include/asm/bitops.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index 3e33621922c31b..76e4343c090f7d 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -328,7 +328,7 @@ static inline unsigned long ffz_b(unsigned long x) return sum; } -static inline unsigned long ffz(unsigned long word) +static inline unsigned long __attribute_const__ ffz(unsigned long word) { #if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67) /* Whee. EV67 can calculate it directly. */ @@ -348,7 +348,7 @@ static inline unsigned long ffz(unsigned long word) /* * __ffs = Find First set bit in word. Undefined if no set bit exists. */ -static inline unsigned long __ffs(unsigned long word) +static inline __attribute_const__ unsigned long __ffs(unsigned long word) { #if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67) /* Whee. EV67 can calculate it directly. */ @@ -373,7 +373,7 @@ static inline unsigned long __ffs(unsigned long word) * differs in spirit from the above __ffs. */ -static inline int ffs(int word) +static inline __attribute_const__ int ffs(int word) { int result = __ffs(word) + 1; return word ? result : 0; @@ -383,14 +383,14 @@ static inline int ffs(int word) * fls: find last bit set. */ #if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67) -static inline int fls64(unsigned long word) +static inline __attribute_const__ int fls64(unsigned long word) { return 64 - __kernel_ctlz(word); } #else extern const unsigned char __flsm1_tab[256]; -static inline int fls64(unsigned long x) +static inline __attribute_const__ int fls64(unsigned long x) { unsigned long t, a, r; @@ -403,12 +403,12 @@ static inline int fls64(unsigned long x) } #endif -static inline unsigned long __fls(unsigned long x) +static inline __attribute_const__ unsigned long __fls(unsigned long x) { return fls64(x) - 1; } -static inline int fls(unsigned int x) +static inline __attribute_const__ int fls(unsigned int x) { return fls64(x); } From 799776f3360d7505be25adb0d06e28b183c8ad70 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:04 -0700 Subject: [PATCH 1199/3206] hexagon: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. 
Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to Hexagon's implementations of fls(), ffs(), __ffs(), __fls(), and ffz() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=hexagon defconfig with Clang 21.0.0git (LLVM=1). Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-8-kees@kernel.org Signed-off-by: Kees Cook --- arch/hexagon/include/asm/bitops.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h index 160d8f37fa1a34..b23cb13833af1c 100644 --- a/arch/hexagon/include/asm/bitops.h +++ b/arch/hexagon/include/asm/bitops.h @@ -200,7 +200,7 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) * * Undefined if no zero exists, so code should check against ~0UL first. */ -static inline long ffz(int x) +static inline long __attribute_const__ ffz(int x) { int r; @@ -217,7 +217,7 @@ static inline long ffz(int x) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(unsigned int x) +static inline __attribute_const__ int fls(unsigned int x) { int r; @@ -238,7 +238,7 @@ static inline int fls(unsigned int x) * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ -static inline int ffs(int x) +static inline __attribute_const__ int ffs(int x) { int r; @@ -260,7 +260,7 @@ static inline int ffs(int x) * bits_per_long assumed to be 32 * numbering starts at 0 I think (instead of 1 like ffs) */ -static inline unsigned long __ffs(unsigned long word) +static inline __attribute_const__ unsigned long __ffs(unsigned long word) { int num; @@ -278,7 +278,7 @@ static inline unsigned long __ffs(unsigned long word) * Undefined if no set bit exists, so code should check against 0 first. * bits_per_long assumed to be 32 */ -static inline unsigned long __fls(unsigned long word) +static inline __attribute_const__ unsigned long __fls(unsigned long word) { int num; From c51c26e687a649df9a792f71759105e12e5d082f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:05 -0700 Subject: [PATCH 1200/3206] riscv: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to RISC-V's implementations of variable__ffs(), variable__fls(), and variable_ffs() functions. 
These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=riscv defconfig with GCC riscv64-linux-gnu 14.2.0. Link: https://github.com/KSPP/linux/issues/364 [1] Tested-by: Alexandre Ghiti Acked-by: Alexandre Ghiti Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-9-kees@kernel.org Signed-off-by: Kees Cook --- arch/riscv/include/asm/bitops.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index d59310f74c2ba7..77880677b06e03 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -45,7 +45,7 @@ #error "Unexpected BITS_PER_LONG" #endif -static __always_inline unsigned long variable__ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned long variable__ffs(unsigned long word) { asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) @@ -74,7 +74,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word) (unsigned long)__builtin_ctzl(word) : \ variable__ffs(word)) -static __always_inline unsigned long variable__fls(unsigned long word) +static __always_inline __attribute_const__ unsigned long variable__fls(unsigned long word) { asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) @@ -103,7 +103,7 @@ static __always_inline unsigned long variable__fls(unsigned long word) (unsigned long)(BITS_PER_LONG - 1 - __builtin_clzl(word)) : \ variable__fls(word)) -static __always_inline int variable_ffs(int x) +static __always_inline __attribute_const__ int variable_ffs(int x) { asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) From acfab97bef41324e72784fcfdb6ce7996a8bf548 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:06 -0700 Subject: [PATCH 1201/3206] openrisc: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to OpenRISC's implementations of ffs(), __ffs(), fls(), and __fls() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=openrisc defconfig with GCC or1k-linux 15.1.0. 
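Relatedly, ffz() reduces to __ffs() of the complement, the same
reduction the x86 patch earlier in the series uses for variable_ffz().
A one-line sketch (illustrative name; undefined when every bit is set,
matching the kernel's ffz() contract):

	static unsigned long my_ffz(unsigned long w)
	{
		return __builtin_ctzl(~w);	/* first zero bit, 0-based */
	}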
Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Stafford Horne Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-10-kees@kernel.org Signed-off-by: Kees Cook --- arch/openrisc/include/asm/bitops/__ffs.h | 2 +- arch/openrisc/include/asm/bitops/__fls.h | 2 +- arch/openrisc/include/asm/bitops/ffs.h | 2 +- arch/openrisc/include/asm/bitops/fls.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/openrisc/include/asm/bitops/__ffs.h b/arch/openrisc/include/asm/bitops/__ffs.h index 1e224b616fdf68..4827b66530b2bd 100644 --- a/arch/openrisc/include/asm/bitops/__ffs.h +++ b/arch/openrisc/include/asm/bitops/__ffs.h @@ -11,7 +11,7 @@ #ifdef CONFIG_OPENRISC_HAVE_INST_FF1 -static inline unsigned long __ffs(unsigned long x) +static inline __attribute_const__ unsigned long __ffs(unsigned long x) { int ret; diff --git a/arch/openrisc/include/asm/bitops/__fls.h b/arch/openrisc/include/asm/bitops/__fls.h index 9658446ad14102..637cc76fe4b7d6 100644 --- a/arch/openrisc/include/asm/bitops/__fls.h +++ b/arch/openrisc/include/asm/bitops/__fls.h @@ -11,7 +11,7 @@ #ifdef CONFIG_OPENRISC_HAVE_INST_FL1 -static inline unsigned long __fls(unsigned long x) +static inline __attribute_const__ unsigned long __fls(unsigned long x) { int ret; diff --git a/arch/openrisc/include/asm/bitops/ffs.h b/arch/openrisc/include/asm/bitops/ffs.h index b4c835d6bc8496..536a60ab9cc30f 100644 --- a/arch/openrisc/include/asm/bitops/ffs.h +++ b/arch/openrisc/include/asm/bitops/ffs.h @@ -10,7 +10,7 @@ #ifdef CONFIG_OPENRISC_HAVE_INST_FF1 -static inline int ffs(int x) +static inline __attribute_const__ int ffs(int x) { int ret; diff --git a/arch/openrisc/include/asm/bitops/fls.h b/arch/openrisc/include/asm/bitops/fls.h index 6b77f6556fb9c1..77da7639bb3e4f 100644 --- a/arch/openrisc/include/asm/bitops/fls.h +++ b/arch/openrisc/include/asm/bitops/fls.h @@ -11,7 +11,7 @@ #ifdef CONFIG_OPENRISC_HAVE_INST_FL1 -static inline int fls(unsigned int x) +static inline __attribute_const__ int fls(unsigned int x) { int ret; From 50c869a6cecabb522f157fa6cc41bca2eb8aecb4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:07 -0700 Subject: [PATCH 1202/3206] m68k: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to M68K's implementations of ffs(), __ffs(), fls(), __fls(), and ffz() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=m68k defconfig with GCC m68k-linux-gnu 14.2.0. 
Link: https://github.com/KSPP/linux/issues/364 [1] Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-11-kees@kernel.org Signed-off-by: Kees Cook --- arch/m68k/include/asm/bitops.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index 14c64a6f121762..139ec9289ff2d3 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -465,7 +465,7 @@ static inline int find_next_bit(const unsigned long *vaddr, int size, * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. */ -static inline unsigned long ffz(unsigned long word) +static inline unsigned long __attribute_const__ ffz(unsigned long word) { int res; @@ -488,7 +488,7 @@ static inline unsigned long ffz(unsigned long word) */ #if (defined(__mcfisaaplus__) || defined(__mcfisac__)) && \ !defined(CONFIG_M68000) -static inline unsigned long __ffs(unsigned long x) +static inline __attribute_const__ unsigned long __ffs(unsigned long x) { __asm__ __volatile__ ("bitrev %0; ff1 %0" : "=d" (x) @@ -496,7 +496,7 @@ static inline unsigned long __ffs(unsigned long x) return x; } -static inline int ffs(int x) +static inline __attribute_const__ int ffs(int x) { if (!x) return 0; @@ -518,7 +518,7 @@ static inline int ffs(int x) * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ -static inline int ffs(int x) +static inline __attribute_const__ int ffs(int x) { int cnt; @@ -528,7 +528,7 @@ static inline int ffs(int x) return 32 - cnt; } -static inline unsigned long __ffs(unsigned long x) +static inline __attribute_const__ unsigned long __ffs(unsigned long x) { return ffs(x) - 1; } @@ -536,7 +536,7 @@ static inline unsigned long __ffs(unsigned long x) /* * fls: find last bit set. */ -static inline int fls(unsigned int x) +static inline __attribute_const__ int fls(unsigned int x) { int cnt; @@ -546,7 +546,7 @@ static inline int fls(unsigned int x) return 32 - cnt; } -static inline unsigned long __fls(unsigned long x) +static inline __attribute_const__ unsigned long __fls(unsigned long x) { return fls(x) - 1; } From 32913fe7f71e72fb49033df35ec6388ff7b170ff Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:08 -0700 Subject: [PATCH 1203/3206] mips: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to MIPS's implementations of ffs(), __ffs(), fls(), and __fls() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=mips defconfig with GCC mipsel-linux-gnu 14.2.0. 
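The same annotations are what let value-range tracking vet arithmetic
like the ffs(rq) - 8 bound added to pcie_set_readrq() earlier in this
series; the encoding is easy to sanity-check in a hosted-C sketch:

	#include <assert.h>

	int main(void)
	{
		/* For power-of-2 rq in [128, 4096], ffs(rq) is 8..13,
		 * so ffs(rq) - 8 stays within the 3-bit READRQ field. */
		for (int rq = 128; rq <= 4096; rq <<= 1)
			assert(__builtin_ffs(rq) - 8 >= 0 &&
			       __builtin_ffs(rq) - 8 <= 5);
		return 0;
	}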
Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-12-kees@kernel.org Signed-off-by: Kees Cook --- arch/mips/include/asm/bitops.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index 89f73d1a4ea4e7..42f88452c920fb 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -327,7 +327,7 @@ static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long * * Return the bit position (0..63) of the most significant 1 bit in a word * Returns -1 if no 1 bit exists */ -static __always_inline unsigned long __fls(unsigned long word) +static __always_inline __attribute_const__ unsigned long __fls(unsigned long word) { int num; @@ -393,7 +393,7 @@ static __always_inline unsigned long __fls(unsigned long word) * Returns 0..SZLONG-1 * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned long __ffs(unsigned long word) { return __fls(word & -word); } @@ -405,7 +405,7 @@ static __always_inline unsigned long __ffs(unsigned long word) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(unsigned int x) +static inline __attribute_const__ int fls(unsigned int x) { int r; @@ -458,7 +458,7 @@ static inline int fls(unsigned int x) * the libc and compiler builtin ffs routines, therefore * differs in spirit from the below ffz (man ffs). */ -static inline int ffs(int word) +static inline __attribute_const__ int ffs(int word) { if (!word) return 0; From 28fc0972e392ed3cfa579f3e3659d615cbac647b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:09 -0700 Subject: [PATCH 1204/3206] parisc: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute_const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to PARISC's implementations of ffs(), __ffs(), and fls() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=parisc defconfig with GCC hppa-linux-gnu 14.2.0. Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Helge Deller Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-13-kees@kernel.org Signed-off-by: Kees Cook --- arch/parisc/include/asm/bitops.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index 0ec9cfc5131fc3..bd1280a8a5ece0 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -123,7 +123,7 @@ static __inline__ int test_and_change_bit(int nr, volatile unsigned long * addr) * cycles for each mispredicted branch.
*/ -static __inline__ unsigned long __ffs(unsigned long x) +static __inline__ __attribute_const__ unsigned long __ffs(unsigned long x) { unsigned long ret; @@ -161,7 +161,7 @@ static __inline__ unsigned long __ffs(unsigned long x) * This is defined the same way as the libc and compiler builtin * ffs routines, therefore differs in spirit from the above ffz (man ffs). */ -static __inline__ int ffs(int x) +static __inline__ __attribute_const__ int ffs(int x) { return x ? (__ffs((unsigned long)x) + 1) : 0; } @@ -171,7 +171,7 @@ static __inline__ int ffs(int x) * fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __inline__ int fls(unsigned int x) +static __inline__ __attribute_const__ int fls(unsigned int x) { int ret; if (!x) From b77fee88bfdfcba2f92c9de2ed1af793c96c46d8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:10 -0700 Subject: [PATCH 1205/3206] s390: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute_const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to S390's implementations of ffs(), __ffs(), fls(), and __fls() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=s390 defconfig with GCC s390x-linux-gnu 14.2.0. Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Heiko Carstens Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-14-kees@kernel.org Signed-off-by: Kees Cook --- arch/s390/include/asm/bitops.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index a5ca0a9476916c..fbcc3e1cc7769e 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -179,7 +179,7 @@ static inline unsigned char __flogr(unsigned long word) * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static inline __attribute_const__ unsigned long __ffs(unsigned long word) { return __flogr(-word & word) ^ (BITS_PER_LONG - 1); } @@ -191,7 +191,7 @@ static inline unsigned long __ffs(unsigned long word) * * This is defined the same way as the libc and * compiler builtin ffs routines (man ffs). */ -static inline int ffs(int word) +static inline __attribute_const__ int ffs(int word) { unsigned long mask = 2 * BITS_PER_LONG - 1; unsigned int val = (unsigned int)word; @@ -205,7 +205,7 @@ static inline int ffs(int word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static inline unsigned long __fls(unsigned long word) +static inline __attribute_const__ unsigned long __fls(unsigned long word) { return __flogr(word) ^ (BITS_PER_LONG - 1); } @@ -221,7 +221,7 @@ static inline unsigned long __fls(unsigned long word) * set bit if value is nonzero. The last (most significant) bit is * at position 64.
*/ -static inline int fls64(unsigned long word) +static inline __attribute_const__ int fls64(unsigned long word) { unsigned long mask = 2 * BITS_PER_LONG - 1; @@ -235,7 +235,7 @@ static inline int fls64(unsigned long word) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(unsigned int word) +static inline __attribute_const__ int fls(unsigned int word) { return fls64(word); } From 945fc9dbd8377e48246de3a5f0104ebe82399eb0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:11 -0700 Subject: [PATCH 1206/3206] xtensa: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute_const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to Xtensa's implementations of ffs(), __ffs(), fls(), __fls(), and ffz() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=xtensa defconfig with GCC xtensa-linux 15.1.0. Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-15-kees@kernel.org Signed-off-by: Kees Cook --- arch/xtensa/include/asm/bitops.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h index e02ec583338947..f7390b6761e1bb 100644 --- a/arch/xtensa/include/asm/bitops.h +++ b/arch/xtensa/include/asm/bitops.h @@ -37,7 +37,7 @@ static inline unsigned long __cntlz (unsigned long x) * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ -static inline int ffz(unsigned long x) +static inline int __attribute_const__ ffz(unsigned long x) { return 31 - __cntlz(~x & -~x); } @@ -46,7 +46,7 @@ static inline int ffz(unsigned long x) * __ffs: Find first bit set in word. Return 0 for bit 0 */ -static inline unsigned long __ffs(unsigned long x) +static inline __attribute_const__ unsigned long __ffs(unsigned long x) { return 31 - __cntlz(x & -x); } @@ -57,7 +57,7 @@ static inline unsigned long __ffs(unsigned long x) * differs in spirit from the above ffz (man ffs). */ -static inline int ffs(unsigned long x) +static inline __attribute_const__ int ffs(unsigned long x) { return 32 - __cntlz(x & -x); } @@ -67,7 +67,7 @@ static inline int ffs(unsigned long x) * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls (unsigned int x) +static inline __attribute_const__ int fls (unsigned int x) { return 32 - __cntlz(x); } @@ -78,7 +78,7 @@ static inline int fls (unsigned int x) * * Undefined if no set bit exists, so code should check against 0 first.
*/ -static inline unsigned long __fls(unsigned long word) +static inline __attribute_const__ unsigned long __fls(unsigned long word) { return 31 - __cntlz(word); } From 07008b9c1cb8cbeb1741c55729639836dccd4a8f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:12 -0700 Subject: [PATCH 1207/3206] sparc: Add __attribute_const__ to ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute_const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to sparc64's implementations of ffs(), __ffs(), fls(), and __fls() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested ARCH=sparc defconfig with GCC sparc64-linux-gnu 14.2.0. Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-16-kees@kernel.org Signed-off-by: Kees Cook --- arch/sparc/include/asm/bitops_64.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h index 005a8ae858f16c..2c7d33b3ec2ea5 100644 --- a/arch/sparc/include/asm/bitops_64.h +++ b/arch/sparc/include/asm/bitops_64.h @@ -23,8 +23,8 @@ void set_bit(unsigned long nr, volatile unsigned long *addr); void clear_bit(unsigned long nr, volatile unsigned long *addr); void change_bit(unsigned long nr, volatile unsigned long *addr); -int fls(unsigned int word); -int __fls(unsigned long word); +int __attribute_const__ fls(unsigned int word); +int __attribute_const__ __fls(unsigned long word); #include @@ -32,8 +32,8 @@ int __fls(unsigned long word); #ifdef __KERNEL__ -int ffs(int x); -unsigned long __ffs(unsigned long); +int __attribute_const__ ffs(int x); +unsigned long __attribute_const__ __ffs(unsigned long); #include #include From 95719dfa323709c06ec34cc96e73e0788e19934f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:44:13 -0700 Subject: [PATCH 1208/3206] KUnit: ffs: Validate all the __attribute_const__ annotations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute_const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Validate that all the __attribute_const__ annotations were applied for all architectures by reproducing the specific problem encountered in the original bug report.
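The detection idea, sketched from the wrappers in the diff below: call the function under test, then BUILD_BUG_ON() a property of a nearby initialized value that the compiler can only prove at compile time when the call is known to be side-effect-free:

    int init = 32;          /* mirrors init_##func in the wrappers */
    int result = func(6);   /* func must carry __attribute_const__ */

    /* Constant-foldable only if func(6) cannot have clobbered init. */
    BUILD_BUG_ON(init < 32);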
Build and run tested with everything I could reach with KUnit: $ ./tools/testing/kunit/kunit.py run --arch=x86_64 ffs $ ./tools/testing/kunit/kunit.py run --arch=i386 ffs $ ./tools/testing/kunit/kunit.py run --arch=arm64 --make_options "CROSS_COMPILE=aarch64-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=arm --make_options "CROSS_COMPILE=arm-linux-gnueabi-" ffs $ ./tools/testing/kunit/kunit.py run --arch=powerpc ffs $ ./tools/testing/kunit/kunit.py run --arch=powerpc32 ffs $ ./tools/testing/kunit/kunit.py run --arch=powerpcle ffs $ ./tools/testing/kunit/kunit.py run --arch=m68k ffs $ ./tools/testing/kunit/kunit.py run --arch=loongarch ffs $ ./tools/testing/kunit/kunit.py run --arch=s390 --make_options "CROSS_COMPILE=s390x-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=riscv --make_options "CROSS_COMPILE=riscv64-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=riscv32 --make_options "CROSS_COMPILE=riscv64-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=sparc --make_options "CROSS_COMPILE=sparc64-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=sparc64 --make_options "CROSS_COMPILE=sparc64-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=alpha --make_options "CROSS_COMPILE=alpha-linux-gnu-" ffs $ ./tools/testing/kunit/kunit.py run --arch=sh --make_options "CROSS_COMPILE=sh4-linux-gnu-" ffs Closes: https://github.com/KSPP/linux/issues/364 Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-17-kees@kernel.org Signed-off-by: Kees Cook --- lib/tests/ffs_kunit.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/lib/tests/ffs_kunit.c b/lib/tests/ffs_kunit.c index ed11456b116e46..9a329cdc09c247 100644 --- a/lib/tests/ffs_kunit.c +++ b/lib/tests/ffs_kunit.c @@ -496,6 +496,46 @@ static void ffz_edge_cases_test(struct kunit *test) } } +/* + * To have useful build error output, split the tests into separate + * functions so it's clear which are missing __attribute_const__. + */ +#define CREATE_WRAPPER(func) \ +static noinline bool build_test_##func(void) \ +{ \ + int init_##func = 32; \ + int result_##func = func(6); \ + \ + /* Does the static initializer vanish after calling func? */ \ + BUILD_BUG_ON(init_##func < 32); \ + \ + /* "Consume" the results so optimizer doesn't drop them. */ \ + barrier_data(&init_##func); \ + barrier_data(&result_##func); \ + \ + return true; \ +} +CREATE_WRAPPER(ffs) +CREATE_WRAPPER(fls) +CREATE_WRAPPER(__ffs) +CREATE_WRAPPER(__fls) +CREATE_WRAPPER(ffz) +#undef CREATE_WRAPPER + +/* + * Make sure that __attribute_const__ has been applied to all the + * functions. This is a regression test for: + * https://github.com/KSPP/linux/issues/364 + */ +static void ffs_attribute_const_test(struct kunit *test) +{ + KUNIT_EXPECT_TRUE(test, build_test_ffs()); + KUNIT_EXPECT_TRUE(test, build_test_fls()); + KUNIT_EXPECT_TRUE(test, build_test___ffs()); + KUNIT_EXPECT_TRUE(test, build_test___fls()); + KUNIT_EXPECT_TRUE(test, build_test_ffz()); +} + /* * KUnit test case definitions */ From 7d715345a86941b9e6c8e520b40078692baed4a4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 8 Sep 2025 21:35:28 +0200 Subject: [PATCH 1209/3206] power: supply: Use devm_mutex_init() Use devm_mutex_init() instead of hand-writing it. This saves some LoC, improves readability and saves some space in the generated .o file.
As an example: Before: ====== text data bss dec hex filename 35803 9352 384 45539 b1e3 drivers/power/supply/rt9467-charger.o After: ===== text data bss dec hex filename 34792 9008 384 44184 ac98 drivers/power/supply/rt9467-charger.o Signed-off-by: Christophe JAILLET Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq27xxx_battery.c | 11 +-------- drivers/power/supply/mt6370-charger.c | 11 +-------- drivers/power/supply/rt9467-charger.c | 33 +++----------------------- 3 files changed, 5 insertions(+), 50 deletions(-) diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index 93dcebbe114175..9f243005fab6ec 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -2224,13 +2224,6 @@ static void bq27xxx_external_power_changed(struct power_supply *psy) mod_delayed_work(system_wq, &di->work, HZ / 2); } -static void bq27xxx_battery_mutex_destroy(void *data) -{ - struct mutex *lock = data; - - mutex_destroy(lock); -} - int bq27xxx_battery_setup(struct bq27xxx_device_info *di) { struct power_supply_desc *psy_desc; @@ -2242,9 +2235,7 @@ int bq27xxx_battery_setup(struct bq27xxx_device_info *di) int ret; INIT_DELAYED_WORK(&di->work, bq27xxx_battery_poll); - mutex_init(&di->lock); - ret = devm_add_action_or_reset(di->dev, bq27xxx_battery_mutex_destroy, - &di->lock); + ret = devm_mutex_init(di->dev, &di->lock); if (ret) return ret; diff --git a/drivers/power/supply/mt6370-charger.c b/drivers/power/supply/mt6370-charger.c index eb3bcf81f74166..e6db961d5818d7 100644 --- a/drivers/power/supply/mt6370-charger.c +++ b/drivers/power/supply/mt6370-charger.c @@ -761,13 +761,6 @@ static int mt6370_chg_init_psy(struct mt6370_priv *priv) return PTR_ERR_OR_ZERO(priv->psy); } -static void mt6370_chg_destroy_attach_lock(void *data) -{ - struct mutex *attach_lock = data; - - mutex_destroy(attach_lock); -} - static void mt6370_chg_destroy_wq(void *data) { struct workqueue_struct *wq = data; @@ -894,9 +887,7 @@ static int mt6370_chg_probe(struct platform_device *pdev) if (ret) return dev_err_probe(dev, ret, "Failed to init psy\n"); - mutex_init(&priv->attach_lock); - ret = devm_add_action_or_reset(dev, mt6370_chg_destroy_attach_lock, - &priv->attach_lock); + ret = devm_mutex_init(dev, &priv->attach_lock); if (ret) return ret; diff --git a/drivers/power/supply/rt9467-charger.c b/drivers/power/supply/rt9467-charger.c index 32e7c7620b91e8..fe773dd8b404f3 100644 --- a/drivers/power/supply/rt9467-charger.c +++ b/drivers/power/supply/rt9467-charger.c @@ -1147,27 +1147,6 @@ static int rt9467_reset_chip(struct rt9467_chg_data *data) return regmap_field_write(data->rm_field[F_RST], 1); } -static void rt9467_chg_destroy_adc_lock(void *data) -{ - struct mutex *adc_lock = data; - - mutex_destroy(adc_lock); -} - -static void rt9467_chg_destroy_attach_lock(void *data) -{ - struct mutex *attach_lock = data; - - mutex_destroy(attach_lock); -} - -static void rt9467_chg_destroy_ichg_ieoc_lock(void *data) -{ - struct mutex *ichg_ieoc_lock = data; - - mutex_destroy(ichg_ieoc_lock); -} - static void rt9467_chg_complete_aicl_done(void *data) { struct completion *aicl_done = data; @@ -1220,21 +1199,15 @@ static int rt9467_charger_probe(struct i2c_client *i2c) if (ret) return dev_err_probe(dev, ret, "Failed to add irq chip\n"); - mutex_init(&data->adc_lock); - ret = devm_add_action_or_reset(dev, rt9467_chg_destroy_adc_lock, - &data->adc_lock); + ret = devm_mutex_init(dev, &data->adc_lock); if (ret) return ret; - mutex_init(&data->attach_lock); - ret = 
devm_add_action_or_reset(dev, rt9467_chg_destroy_attach_lock, - &data->attach_lock); + ret = devm_mutex_init(dev, &data->attach_lock); if (ret) return ret; - mutex_init(&data->ichg_ieoc_lock); - ret = devm_add_action_or_reset(dev, rt9467_chg_destroy_ichg_ieoc_lock, - &data->ichg_ieoc_lock); + ret = devm_mutex_init(dev, &data->ichg_ieoc_lock); if (ret) return ret; From c4a7748b551e5a06fe9a3862001192b1b5cfe195 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:06:40 +0200 Subject: [PATCH 1210/3206] power: supply: replace use of system_wq with system_percpu_wq Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_unbound_wq should be the default workqueue so as not to enforce locality constraints for random work whenever it's not required. Add system_dfl_wq to encourage its use when unbound work should be used. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new unbound wq: if a user still uses the old wq, a warning will be printed along with a redirect to the new one. The old system_unbound_wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/r/20250905090641.106297-2-marco.crivellari@suse.com [rebased patch to cover recent changes] Signed-off-by: Sebastian Reichel --- drivers/power/supply/adc-battery-helper.c | 6 +++--- drivers/power/supply/bq2415x_charger.c | 2 +- drivers/power/supply/bq24190_charger.c | 2 +- drivers/power/supply/bq27xxx_battery.c | 6 +++--- drivers/power/supply/rk817_charger.c | 6 +++--- drivers/power/supply/ucs1002_power.c | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/power/supply/adc-battery-helper.c b/drivers/power/supply/adc-battery-helper.c index 229b49fef8ad1c..6e0f5b6d73d7c1 100644 --- a/drivers/power/supply/adc-battery-helper.c +++ b/drivers/power/supply/adc-battery-helper.c @@ -181,7 +181,7 @@ static void adc_battery_helper_work(struct work_struct *work) help->intern_res_avg_mohm /= win_size; out: - queue_delayed_work(system_wq, &help->work, + queue_delayed_work(system_percpu_wq, &help->work, (help->poll_count <= INIT_POLL_COUNT) ?
INIT_POLL_TIME : POLL_TIME); @@ -251,7 +251,7 @@ void adc_battery_helper_external_power_changed(struct power_supply *psy) struct adc_battery_helper *help = power_supply_get_drvdata(psy); dev_dbg(help->psy->dev.parent, "external power changed\n"); - mod_delayed_work(system_wq, &help->work, SETTLE_TIME); + mod_delayed_work(system_percpu_wq, &help->work, SETTLE_TIME); } EXPORT_SYMBOL_GPL(adc_battery_helper_external_power_changed); @@ -260,7 +260,7 @@ static void adc_battery_helper_start_work(struct adc_battery_helper *help) help->poll_count = 0; help->ocv_avg_index = 0; - queue_delayed_work(system_wq, &help->work, 0); + queue_delayed_work(system_percpu_wq, &help->work, 0); flush_delayed_work(&help->work); } diff --git a/drivers/power/supply/bq2415x_charger.c b/drivers/power/supply/bq2415x_charger.c index 843957548fb9bc..b50a28b9dd3867 100644 --- a/drivers/power/supply/bq2415x_charger.c +++ b/drivers/power/supply/bq2415x_charger.c @@ -842,7 +842,7 @@ static int bq2415x_notifier_call(struct notifier_block *nb, if (bq->automode < 1) return NOTIFY_OK; - mod_delayed_work(system_wq, &bq->work, 0); + mod_delayed_work(system_percpu_wq, &bq->work, 0); return NOTIFY_OK; } diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index e1510c7fdab3bb..ed0ceae8d90b14 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -1467,7 +1467,7 @@ static void bq24190_charger_external_power_changed(struct power_supply *psy) * too low default 500mA iinlim. Delay setting the input-current-limit * for 300ms to avoid this. */ - queue_delayed_work(system_wq, &bdi->input_current_limit_work, + queue_delayed_work(system_percpu_wq, &bdi->input_current_limit_work, msecs_to_jiffies(300)); } diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index 9f243005fab6ec..3df95e0d4fa226 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -1127,7 +1127,7 @@ static int poll_interval_param_set(const char *val, const struct kernel_param *k mutex_lock(&bq27xxx_list_lock); list_for_each_entry(di, &bq27xxx_battery_devices, list) - mod_delayed_work(system_wq, &di->work, 0); + mod_delayed_work(system_percpu_wq, &di->work, 0); mutex_unlock(&bq27xxx_list_lock); return ret; @@ -1945,7 +1945,7 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di) di->last_update = jiffies; if (!di->removed && poll_interval > 0) - mod_delayed_work(system_wq, &di->work, poll_interval * HZ); + mod_delayed_work(system_percpu_wq, &di->work, poll_interval * HZ); } void bq27xxx_battery_update(struct bq27xxx_device_info *di) @@ -2221,7 +2221,7 @@ static void bq27xxx_external_power_changed(struct power_supply *psy) struct bq27xxx_device_info *di = power_supply_get_drvdata(psy); /* After charger plug in/out wait 0.5s for things to stabilize */ - mod_delayed_work(system_wq, &di->work, HZ / 2); + mod_delayed_work(system_percpu_wq, &di->work, HZ / 2); } int bq27xxx_battery_setup(struct bq27xxx_device_info *di) diff --git a/drivers/power/supply/rk817_charger.c b/drivers/power/supply/rk817_charger.c index 1251022eb052c1..9436c6bbf51fb4 100644 --- a/drivers/power/supply/rk817_charger.c +++ b/drivers/power/supply/rk817_charger.c @@ -1046,7 +1046,7 @@ static void rk817_charging_monitor(struct work_struct *work) rk817_read_props(charger); /* Run every 8 seconds like the BSP driver did. 
*/ - queue_delayed_work(system_wq, &charger->work, msecs_to_jiffies(8000)); + queue_delayed_work(system_percpu_wq, &charger->work, msecs_to_jiffies(8000)); } static void rk817_cleanup_node(void *data) @@ -1206,7 +1206,7 @@ static int rk817_charger_probe(struct platform_device *pdev) return ret; /* Force the first update immediately. */ - mod_delayed_work(system_wq, &charger->work, 0); + mod_delayed_work(system_percpu_wq, &charger->work, 0); return 0; } @@ -1226,7 +1226,7 @@ static int __maybe_unused rk817_resume(struct device *dev) struct rk817_charger *charger = dev_get_drvdata(dev); /* force an immediate update */ - mod_delayed_work(system_wq, &charger->work, 0); + mod_delayed_work(system_percpu_wq, &charger->work, 0); return 0; } diff --git a/drivers/power/supply/ucs1002_power.c b/drivers/power/supply/ucs1002_power.c index d32a7633f9e7d7..fe94435340de65 100644 --- a/drivers/power/supply/ucs1002_power.c +++ b/drivers/power/supply/ucs1002_power.c @@ -493,7 +493,7 @@ static irqreturn_t ucs1002_alert_irq(int irq, void *data) { struct ucs1002_info *info = data; - mod_delayed_work(system_wq, &info->health_poll, 0); + mod_delayed_work(system_percpu_wq, &info->health_poll, 0); return IRQ_HANDLED; } From cc2ec444e461b6ca2bc73cd7cbd06aaf15bdfa1a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:06:41 +0200 Subject: [PATCH 1211/3206] power: supply: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. Add a new WQ_PERCPU flag to explicitly request the use of the per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly.
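A hedged sketch of what callers now state explicitly (the workqueue names here are invented):

    struct workqueue_struct *wq;

    /* Work that benefits from CPU locality: request it explicitly. */
    wq = alloc_workqueue("my_percpu_wq", WQ_MEM_RECLAIM | WQ_PERCPU, 0);

    /* Work with no locality requirement: let the scheduler place the
     * workers; this is intended to become the implicit default. */
    wq = alloc_workqueue("my_unbound_wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);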
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/r/20250905090641.106297-3-marco.crivellari@suse.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/ab8500_btemp.c | 3 ++- drivers/power/supply/ipaq_micro_battery.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/ab8500_btemp.c b/drivers/power/supply/ab8500_btemp.c index b00c84fbc33c2a..e5202a7b6209b4 100644 --- a/drivers/power/supply/ab8500_btemp.c +++ b/drivers/power/supply/ab8500_btemp.c @@ -667,7 +667,8 @@ static int ab8500_btemp_bind(struct device *dev, struct device *master, /* Create a work queue for the btemp */ di->btemp_wq = - alloc_workqueue("ab8500_btemp_wq", WQ_MEM_RECLAIM, 0); + alloc_workqueue("ab8500_btemp_wq", WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (di->btemp_wq == NULL) { dev_err(dev, "failed to create work queue\n"); return -ENOMEM; diff --git a/drivers/power/supply/ipaq_micro_battery.c b/drivers/power/supply/ipaq_micro_battery.c index 7e0568a5353f16..ff8573a5ca6d09 100644 --- a/drivers/power/supply/ipaq_micro_battery.c +++ b/drivers/power/supply/ipaq_micro_battery.c @@ -232,7 +232,8 @@ static int micro_batt_probe(struct platform_device *pdev) return -ENOMEM; mb->micro = dev_get_drvdata(pdev->dev.parent); - mb->wq = alloc_workqueue("ipaq-battery-wq", WQ_MEM_RECLAIM, 0); + mb->wq = alloc_workqueue("ipaq-battery-wq", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!mb->wq) return -ENOMEM; From 1dbfb0363224f6da56f6655d596dc5097308d6f5 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Fri, 5 Sep 2025 06:57:27 -0700 Subject: [PATCH 1212/3206] genetlink: fix genl_bind() invoking bind() after -EPERM Per-family bind/unbind callbacks were introduced to allow families to track multicast group consumer presence, e.g. to start or stop producing events depending on listeners. However, in genl_bind() the bind() callback was invoked even if capability checks failed and ret was set to -EPERM. This means that callbacks could run on behalf of unauthorized callers while the syscall still returned failure to user space. Fix this by only invoking bind() after the "if (ret) break;" check, i.e. after permission checks have succeeded. Fixes: 3de21a8990d3 ("genetlink: Add per family bind/unbind callbacks") Signed-off-by: Alok Tiwari Link: https://patch.msgid.link/20250905135731.3026965-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- net/netlink/genetlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 104732d3454348..978c129c609501 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group) !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; + if (ret) + break; + if (family->bind) family->bind(i); From 674b34c4c770551e916ae707829c7faea4782d3a Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 5 Sep 2025 14:45:07 +0200 Subject: [PATCH 1213/3206] net: dsa: b53: fix ageing time for BCM53101 For some reason Broadcom decided that BCM53101 uses 0.5s increments for the ageing time register, but kept the field width the same [1]. Due to this, the actual ageing time was always half of what was configured. Fix this by adapting the limits and value calculation for BCM53101.
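To make the register arithmetic concrete (a worked example, not from the commit message), consider a requested ageing time of 300 seconds:

    /* Old code on BCM53101: wrote 300 ticks, but each tick is 0.5 s,
     * so the effective ageing time was 150 s - half the request. */
    atc = DIV_ROUND_CLOSEST(300000, 1000);  /* = 300 ticks */

    /* Fixed code: 600 ticks x 0.5 s = the requested 300 s. */
    atc = DIV_ROUND_CLOSEST(300000, 500);   /* = 600 ticks */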
So far it looks like this is the only chip with the increased tick speed: $ grep -l -r "Specifies the aging time in 0.5 seconds" cdk/PKG/chip | sort cdk/PKG/chip/bcm53101/bcm53101_a0_defs.h $ grep -l -r "Specifies the aging time in seconds" cdk/PKG/chip | sort cdk/PKG/chip/bcm53010/bcm53010_a0_defs.h cdk/PKG/chip/bcm53020/bcm53020_a0_defs.h cdk/PKG/chip/bcm53084/bcm53084_a0_defs.h cdk/PKG/chip/bcm53115/bcm53115_a0_defs.h cdk/PKG/chip/bcm53118/bcm53118_a0_defs.h cdk/PKG/chip/bcm53125/bcm53125_a0_defs.h cdk/PKG/chip/bcm53128/bcm53128_a0_defs.h cdk/PKG/chip/bcm53134/bcm53134_a0_defs.h cdk/PKG/chip/bcm53242/bcm53242_a0_defs.h cdk/PKG/chip/bcm53262/bcm53262_a0_defs.h cdk/PKG/chip/bcm53280/bcm53280_a0_defs.h cdk/PKG/chip/bcm53280/bcm53280_b0_defs.h cdk/PKG/chip/bcm53600/bcm53600_a0_defs.h cdk/PKG/chip/bcm89500/bcm89500_a0_defs.h [1] https://github.com/Broadcom/OpenMDK/blob/a5d3fc9b12af3eeb68f2ca0ce7ec4056cd14d6c2/cdk/PKG/chip/bcm53101/bcm53101_a0_defs.h#L28966 Fixes: e39d14a760c0 ("net: dsa: b53: implement setting ageing time") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250905124507.59186-1-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 829b1f087e9e0e..2f846381d5a762 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1273,9 +1273,15 @@ static int b53_setup(struct dsa_switch *ds) */ ds->untag_vlan_aware_bridge_pvid = true; - /* Ageing time is set in seconds */ - ds->ageing_time_min = 1 * 1000; - ds->ageing_time_max = AGE_TIME_MAX * 1000; + if (dev->chip_id == BCM53101_DEVICE_ID) { + /* BCM53101 uses 0.5 second increments */ + ds->ageing_time_min = 1 * 500; + ds->ageing_time_max = AGE_TIME_MAX * 500; + } else { + /* Everything else uses 1 second increments */ + ds->ageing_time_min = 1 * 1000; + ds->ageing_time_max = AGE_TIME_MAX * 1000; + } ret = b53_reset_switch(dev); if (ret) { @@ -2559,7 +2565,10 @@ int b53_set_ageing_time(struct dsa_switch *ds, unsigned int msecs) else reg = B53_AGING_TIME_CONTROL; - atc = DIV_ROUND_CLOSEST(msecs, 1000); + if (dev->chip_id == BCM53101_DEVICE_ID) + atc = DIV_ROUND_CLOSEST(msecs, 500); + else + atc = DIV_ROUND_CLOSEST(msecs, 1000); if (!is5325(dev) && !is5365(dev)) atc |= AGE_CHANGE; From 8625f5748fea960d2af4f3c3e9891ee8f6f80906 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 5 Sep 2025 13:12:33 +0200 Subject: [PATCH 1214/3206] net: bridge: Bounce invalid boolopts The bridge driver currently tolerates options that it does not recognize. Instead, it should bounce them. 
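A worked example of the rejection logic added by the hunk below (the request values are invented, and BR_BOOLOPT_MAX is assumed to be 3 here):

    unsigned long bitmap = BIT(1) | BIT(5);    /* bit 5 is unknown */

    /* find_next_bit(&bitmap, BITS_PER_LONG, 3) scans from bit 3 and
     * finds bit 5, so the check fails and -EINVAL is returned before
     * any option, even the valid bit 1, is toggled. */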
Fixes: a428afe82f98 ("net: bridge: add support for user-controlled bool options") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/e6fdca3b5a8d54183fbda075daffef38bdd7ddce.1757070067.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/bridge/br.c b/net/bridge/br.c index 1885d0c315f027..c683baa3847f17 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -324,6 +324,13 @@ int br_boolopt_multi_toggle(struct net_bridge *br, int err = 0; int opt_id; + opt_id = find_next_bit(&bitmap, BITS_PER_LONG, BR_BOOLOPT_MAX); + if (opt_id != BITS_PER_LONG) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d", + opt_id); + return -EINVAL; + } + for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); From d3b28612bc5500133260aaf36794a0a0c287d61b Mon Sep 17 00:00:00 2001 From: Jonas Rebmann Date: Fri, 5 Sep 2025 14:20:50 +0200 Subject: [PATCH 1215/3206] net: phy: NXP_TJA11XX: Update Kconfig with TJA1102 support Update the Kconfig description to indicate support for the TJA1102. Signed-off-by: Jonas Rebmann Link: https://patch.msgid.link/20250905-tja1102-kconfig-v1-1-a57e6ac4e264@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 28acc6392cfc89..392749aae54df4 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -361,7 +361,7 @@ config NXP_TJA11XX_PHY tristate "NXP TJA11xx PHYs support" depends on HWMON help - Currently supports the NXP TJA1100 and TJA1101 PHY. + Currently supports the NXP TJA1100, TJA1101 and TJA1102 PHYs. config NCN26000_PHY tristate "Onsemi 10BASE-T1S Ethernet PHY" From 01c7344e21c2140e72282d9d16d79a61f840fc20 Mon Sep 17 00:00:00 2001 From: Junjie Cao Date: Thu, 14 Aug 2025 14:06:05 +0800 Subject: [PATCH 1216/3206] lkdtm: fortify: Fix potential NULL dereference on kmalloc failure Add missing NULL pointer checks after kmalloc() calls in lkdtm_FORTIFY_STR_MEMBER() and lkdtm_FORTIFY_MEM_MEMBER() functions. Signed-off-by: Junjie Cao Link: https://lore.kernel.org/r/20250814060605.5264-1-junjie.cao@intel.com Signed-off-by: Kees Cook --- drivers/misc/lkdtm/fortify.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/misc/lkdtm/fortify.c b/drivers/misc/lkdtm/fortify.c index 0159276656780d..00ed2147113e69 100644 --- a/drivers/misc/lkdtm/fortify.c +++ b/drivers/misc/lkdtm/fortify.c @@ -44,6 +44,9 @@ static void lkdtm_FORTIFY_STR_MEMBER(void) char *src; src = kmalloc(size, GFP_KERNEL); + if (!src) + return; + strscpy(src, "over ten bytes", size); size = strlen(src) + 1; @@ -109,6 +112,9 @@ static void lkdtm_FORTIFY_MEM_MEMBER(void) char *src; src = kmalloc(size, GFP_KERNEL); + if (!src) + return; + strscpy(src, "over ten bytes", size); size = strlen(src) + 1; From 7989fdce69ec0a928e136477da2aa208a191fba2 Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Fri, 22 Aug 2025 15:55:16 -0700 Subject: [PATCH 1217/3206] percpu: fix race on alloc failed warning limit The 'allocation failed, ...' warning messages can cause unlimited log spam, contrary to the implementation's intent. The warn_limit variable is accessed without synchronization. If more than one thread enters the warning path at the same time, the variable can get decremented past 0. Once it becomes negative, the non-zero check will always return true, leading to unlimited log spam.
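The racy pattern, sketched (this is the shape of the code being removed in the diff below):

    static int warn_limit = 10;
    ...
    if (do_warn && warn_limit) {        /* two threads can both see 1 */
            pr_warn("allocation failed, ...");
            if (!--warn_limit)          /* both decrement: 1 -> 0 -> -1 */
                    pr_info("limit reached, disable warning\n");
    }
    /* Once warn_limit goes negative, the non-zero check above always
     * passes, so the warning is never disabled. */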
Use atomic operation to access warn_limit and change condition to test for non-negative (>= 0) - atomic_dec_if_positive will return -1 once warn_limit becomes 0. Continue to print disable message alongside the last warning. While the change cited in Fixes is only adjacent, the warning limit implementation was correct before it. Only non-atomic allocations were considered for warnings, and those happened to hold pcpu_alloc_mutex while accessing warn_limit. [vdumitrescu@nvidia.com: prevent warn_limit from going negative, per Christoph Lameter] Link: https://lkml.kernel.org/r/ee87cc59-2717-4dbb-8052-1d2692c5aaaa@nvidia.com Link: https://lkml.kernel.org/r/ab22061a-a62f-4429-945b-744e5cc4ba35@nvidia.com Fixes: f7d77dfc91f7 ("mm/percpu.c: print error message too if atomic alloc failed") Signed-off-by: Vlad Dumitrescu Reviewed-by: Baoquan He Cc: Christoph Lameter (Ampere) Cc: Dennis Zhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/percpu.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index a56f35dcc417e6..81462ce5866e16 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1734,7 +1734,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; - static int warn_limit = 10; + static atomic_t warn_limit = ATOMIC_INIT(10); struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; @@ -1904,13 +1904,17 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); - if (do_warn && warn_limit) { - pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", - size, align, is_atomic, err); - if (!is_atomic) - dump_stack(); - if (!--warn_limit) - pr_info("limit reached, disable warning\n"); + if (do_warn) { + int remaining = atomic_dec_if_positive(&warn_limit); + + if (remaining >= 0) { + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); + if (!is_atomic) + dump_stack(); + if (remaining == 0) + pr_info("limit reached, disable warning\n"); + } } if (is_atomic) { From 78d2d32f0b789d67cbe5cfea0c0714cb2446c37e Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Thu, 28 Aug 2025 14:26:56 +0000 Subject: [PATCH 1218/3206] mm/mremap: fix regression in vrm->new_addr check Commit 3215eaceca87 ("mm/mremap: refactor initial parameter sanity checks") moved the sanity check for vrm->new_addr from mremap_to() to check_mremap_params(). However, this caused a regression as vrm->new_addr is now checked even when MREMAP_FIXED and MREMAP_DONTUNMAP flags are not specified. In this case, vrm->new_addr can be garbage and create unexpected failures. Fix this by moving the new_addr check after the vrm_implies_new_addr() guard. This ensures that the new_addr is only checked when the user has specified one explicitly. Link: https://lkml.kernel.org/r/20250828142657.770502-1-cmllamas@google.com Fixes: 3215eaceca87 ("mm/mremap: refactor initial parameter sanity checks") Signed-off-by: Carlos Llamas Reviewed-by: Liam R. 
Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Carlos Llamas Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/mremap.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index e618a706aff5a6..35de0a7b910e08 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1774,15 +1774,18 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) if (!vrm->new_len) return -EINVAL; - /* Is the new length or address silly? */ - if (vrm->new_len > TASK_SIZE || - vrm->new_addr > TASK_SIZE - vrm->new_len) + /* Is the new length silly? */ + if (vrm->new_len > TASK_SIZE) return -EINVAL; /* Remainder of checks are for cases with specific new_addr. */ if (!vrm_implies_new_addr(vrm)) return 0; + /* Is the new address silly? */ + if (vrm->new_addr > TASK_SIZE - vrm->new_len) + return -EINVAL; + /* The new address must be page-aligned. */ if (offset_in_page(vrm->new_addr)) return -EINVAL; From d613f53c83ec47089c4e25859d5e8e0359f6f8da Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Aug 2025 10:46:18 +0800 Subject: [PATCH 1219/3206] mm/memory-failure: fix VM_BUG_ON_PAGE(PagePoisoned(page)) when unpoison memory When I did memory failure tests, below panic occurs: page dumped because: VM_BUG_ON_PAGE(PagePoisoned(page)) kernel BUG at include/linux/page-flags.h:616! Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 3 PID: 720 Comm: bash Not tainted 6.10.0-rc1-00195-g148743902568 #40 RIP: 0010:unpoison_memory+0x2f3/0x590 RSP: 0018:ffffa57fc8787d60 EFLAGS: 00000246 RAX: 0000000000000037 RBX: 0000000000000009 RCX: ffff9be25fcdc9c8 RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff9be25fcdc9c0 RBP: 0000000000300000 R08: ffffffffb4956f88 R09: 0000000000009ffb R10: 0000000000000284 R11: ffffffffb4926fa0 R12: ffffe6b00c000000 R13: ffff9bdb453dfd00 R14: 0000000000000000 R15: fffffffffffffffe FS: 00007f08f04e4740(0000) GS:ffff9be25fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000564787a30410 CR3: 000000010d4e2000 CR4: 00000000000006f0 Call Trace: unpoison_memory+0x2f3/0x590 simple_attr_write_xsigned.constprop.0.isra.0+0xb3/0x110 debugfs_attr_write+0x42/0x60 full_proxy_write+0x5b/0x80 vfs_write+0xd5/0x540 ksys_write+0x64/0xe0 do_syscall_64+0xb9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f08f0314887 RSP: 002b:00007ffece710078 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007f08f0314887 RDX: 0000000000000009 RSI: 0000564787a30410 RDI: 0000000000000001 RBP: 0000564787a30410 R08: 000000000000fefe R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000009 R13: 00007f08f041b780 R14: 00007f08f0417600 R15: 00007f08f0416a00 Modules linked in: hwpoison_inject ---[ end trace 0000000000000000 ]--- RIP: 0010:unpoison_memory+0x2f3/0x590 RSP: 0018:ffffa57fc8787d60 EFLAGS: 00000246 RAX: 0000000000000037 RBX: 0000000000000009 RCX: ffff9be25fcdc9c8 RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff9be25fcdc9c0 RBP: 0000000000300000 R08: ffffffffb4956f88 R09: 0000000000009ffb R10: 0000000000000284 R11: ffffffffb4926fa0 R12: ffffe6b00c000000 R13: ffff9bdb453dfd00 R14: 0000000000000000 R15: fffffffffffffffe FS: 00007f08f04e4740(0000) GS:ffff9be25fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000564787a30410 CR3: 000000010d4e2000 CR4: 00000000000006f0 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x31c00000 from 0xffffffff81000000 (relocation range: 
0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]--- The root cause is that unpoison_memory() tries to check the PG_HWPoison flags of an uninitialized page. So VM_BUG_ON_PAGE(PagePoisoned(page)) is triggered. This can be reproduced by below steps: 1.Offline memory block: echo offline > /sys/devices/system/memory/memory12/state 2.Get offlined memory pfn: page-types -b n -rlN 3.Write pfn to unpoison-pfn echo > /sys/kernel/debug/hwpoison/unpoison-pfn This scenario can be identified by pfn_to_online_page() returning NULL. And ZONE_DEVICE pages are never expected, so we can simply fail if pfn_to_online_page() == NULL to fix the bug. Link: https://lkml.kernel.org/r/20250828024618.1744895-1-linmiaohe@huawei.com Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") Signed-off-by: Miaohe Lin Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 10b3c281c2aee0..df6ee59527ddf5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2568,10 +2568,9 @@ int unpoison_memory(unsigned long pfn) static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if (!pfn_valid(pfn)) - return -ENXIO; - - p = pfn_to_page(pfn); + p = pfn_to_online_page(pfn); + if (!p) + return -EIO; folio = page_folio(p); mutex_lock(&mf_mutex); From 04100f775c2ea501927f508f17ad824ad1f23c8d Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Fri, 29 Aug 2025 10:18:15 -0500 Subject: [PATCH 1220/3206] ocfs2: fix recursive semaphore deadlock in fiemap call syzbot detected a OCFS2 hang due to a recursive semaphore on a FS_IOC_FIEMAP of the extent list on a specially crafted mmap file. 
context_switch kernel/sched/core.c:5357 [inline] __schedule+0x1798/0x4cc0 kernel/sched/core.c:6961 __schedule_loop kernel/sched/core.c:7043 [inline] schedule+0x165/0x360 kernel/sched/core.c:7058 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7115 rwsem_down_write_slowpath+0x872/0xfe0 kernel/locking/rwsem.c:1185 __down_write_common kernel/locking/rwsem.c:1317 [inline] __down_write kernel/locking/rwsem.c:1326 [inline] down_write+0x1ab/0x1f0 kernel/locking/rwsem.c:1591 ocfs2_page_mkwrite+0x2ff/0xc40 fs/ocfs2/mmap.c:142 do_page_mkwrite+0x14d/0x310 mm/memory.c:3361 wp_page_shared mm/memory.c:3762 [inline] do_wp_page+0x268d/0x5800 mm/memory.c:3981 handle_pte_fault mm/memory.c:6068 [inline] __handle_mm_fault+0x1033/0x5440 mm/memory.c:6195 handle_mm_fault+0x40a/0x8e0 mm/memory.c:6364 do_user_addr_fault+0x764/0x1390 arch/x86/mm/fault.c:1387 handle_page_fault arch/x86/mm/fault.c:1476 [inline] exc_page_fault+0x76/0xf0 arch/x86/mm/fault.c:1532 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623 RIP: 0010:copy_user_generic arch/x86/include/asm/uaccess_64.h:126 [inline] RIP: 0010:raw_copy_to_user arch/x86/include/asm/uaccess_64.h:147 [inline] RIP: 0010:_inline_copy_to_user include/linux/uaccess.h:197 [inline] RIP: 0010:_copy_to_user+0x85/0xb0 lib/usercopy.c:26 Code: e8 00 bc f7 fc 4d 39 fc 72 3d 4d 39 ec 77 38 e8 91 b9 f7 fc 4c 89 f7 89 de e8 47 25 5b fd 0f 01 cb 4c 89 ff 48 89 d9 4c 89 f6 a4 0f 1f 00 48 89 cb 0f 01 ca 48 89 d8 5b 41 5c 41 5d 41 5e 41 RSP: 0018:ffffc9000403f950 EFLAGS: 00050256 RAX: ffffffff84c7f101 RBX: 0000000000000038 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffffc9000403f9e0 RDI: 0000200000000060 RBP: ffffc9000403fa90 R08: ffffc9000403fa17 R09: 1ffff92000807f42 R10: dffffc0000000000 R11: fffff52000807f43 R12: 0000200000000098 R13: 00007ffffffff000 R14: ffffc9000403f9e0 R15: 0000200000000060 copy_to_user include/linux/uaccess.h:225 [inline] fiemap_fill_next_extent+0x1c0/0x390 fs/ioctl.c:145 ocfs2_fiemap+0x888/0xc90 fs/ocfs2/extent_map.c:806 ioctl_fiemap fs/ioctl.c:220 [inline] do_vfs_ioctl+0x1173/0x1430 fs/ioctl.c:532 __do_sys_ioctl fs/ioctl.c:596 [inline] __se_sys_ioctl+0x82/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5f13850fd9 RSP: 002b:00007ffe3b3518b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000200000000000 RCX: 00007f5f13850fd9 RDX: 0000200000000040 RSI: 00000000c020660b RDI: 0000000000000004 RBP: 6165627472616568 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe3b3518f0 R13: 00007ffe3b351b18 R14: 431bde82d7b634db R15: 00007f5f1389a03b ocfs2_fiemap() takes a read lock of the ip_alloc_sem semaphore (since v2.6.22-527-g7307de80510a) and calls fiemap_fill_next_extent() to read the extent list of this running mmap executable. The user-supplied buffer that holds the fiemap information page faults, calling ocfs2_page_mkwrite(), which will take a write lock (since v2.6.27-38-g00dc417fa3e7) of the same semaphore. This recursive semaphore acquisition holds filesystem locks and causes a hang of the filesystem. The ip_alloc_sem protects the inode extent list and size. Release the read semaphore before calling fiemap_fill_next_extent() in ocfs2_fiemap() and ocfs2_fiemap_inline(). This does an unnecessary semaphore lock/unlock on the last extent but simplifies the error path.
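The resulting pattern around the copy-out, sketched (simplified from the hunks below; the argument names are shortened):

    down_read(&oi->ip_alloc_sem);
    ...
    /* fiemap_fill_next_extent() copies to a user buffer and may page
     * fault; the fault path (ocfs2_page_mkwrite()) takes ip_alloc_sem
     * for write, so drop the read side across the call. */
    up_read(&oi->ip_alloc_sem);
    ret = fiemap_fill_next_extent(fieinfo, virt, phys, len, flags);
    down_read(&oi->ip_alloc_sem);
    if (ret)
            break;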
Link: https://lkml.kernel.org/r/61d1a62b-2631-4f12-81e2-cd689914360b@oracle.com Fixes: 00dc417fa3e7 ("ocfs2: fiemap support") Signed-off-by: Mark Tinguely Reported-by: syzbot+541dcc6ee768f77103e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=541dcc6ee768f77103e7 Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/extent_map.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 930150ed5db15f..ef147e8b327126 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -706,6 +706,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, * it not only handles the fiemap for inlined files, but also deals * with the fast symlink, cause they have no difference for extent * mapping per se. + * + * Must be called with ip_alloc_sem semaphore held. */ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, @@ -717,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, u64 phys; u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; struct ocfs2_inode_info *oi = OCFS2_I(inode); + lockdep_assert_held_read(&oi->ip_alloc_sem); di = (struct ocfs2_dinode *)di_bh->b_data; if (ocfs2_inode_is_fast_symlink(inode)) @@ -732,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) return ret; } @@ -802,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; - + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, len_bytes, fe_flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret) break; From 79357cd06d41d0f5a11b17d7c86176e395d10ef2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Sun, 31 Aug 2025 14:10:58 +0200 Subject: [PATCH 1221/3206] mm/vmalloc, mm/kasan: respect gfp mask in kasan_populate_vmalloc() kasan_populate_vmalloc() and its helpers ignore the caller's gfp_mask and always allocate memory using the hardcoded GFP_KERNEL flag. This makes them inconsistent with vmalloc(), which was recently extended to support GFP_NOFS and GFP_NOIO allocations. Page table allocations performed during shadow population also ignore the external gfp_mask. To preserve the intended semantics of GFP_NOFS and GFP_NOIO, wrap the apply_to_page_range() calls into the appropriate memalloc scope. xfs calls vmalloc with GFP_NOFS, so this bug could lead to deadlock. 
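The deadlock shape being avoided, as an illustrative sketch (the caller below is hypothetical):

    /* Filesystem path that holds fs locks; reclaim must not re-enter
     * the filesystem, hence GFP_NOFS. */
    buf = __vmalloc(size, GFP_NOFS);

    /* Before this fix, populating the KASAN shadow for that mapping
     * allocated with hardcoded GFP_KERNEL, so direct reclaim could
     * recurse into the filesystem and block on locks already held. */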
There was a report here https://lkml.kernel.org/r/686ea951.050a0220.385921.0016.GAE@google.com This patch: - Extends kasan_populate_vmalloc() and helpers to take gfp_mask; - Passes gfp_mask down to alloc_pages_bulk() and __get_free_page(); - Enforces GFP_NOFS/NOIO semantics with memalloc_*_save()/restore() around apply_to_page_range(); - Updates vmalloc.c and percpu allocator call sites accordingly. Link: https://lkml.kernel.org/r/20250831121058.92971-1-urezki@gmail.com Fixes: 451769ebb7e7 ("mm/vmalloc: alloc GFP_NO{FS,IO} for vmalloc") Signed-off-by: Uladzislau Rezki (Sony) Reported-by: syzbot+3470c9ffee63e4abafeb@syzkaller.appspotmail.com Reviewed-by: Andrey Ryabinin Cc: Baoquan He Cc: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- include/linux/kasan.h | 6 +++--- mm/kasan/shadow.c | 31 ++++++++++++++++++++++++------- mm/vmalloc.c | 8 ++++---- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 890011071f2b14..fe5ce9215821db 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -562,7 +562,7 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size); +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, @@ -574,7 +574,7 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, - unsigned long size) + unsigned long size, gfp_t gfp_mask) { return 0; } @@ -610,7 +610,7 @@ static __always_inline void kasan_poison_vmalloc(const void *start, static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, - unsigned long size) + unsigned long size, gfp_t gfp_mask) { return 0; } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index e2ceebf737ef7b..11d472a5c4e8d0 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -336,13 +336,13 @@ static void ___free_pages_bulk(struct page **pages, int nr_pages) } } -static int ___alloc_pages_bulk(struct page **pages, int nr_pages) +static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask) { unsigned long nr_populated, nr_total = nr_pages; struct page **page_array = pages; while (nr_pages) { - nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages); + nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages); if (!nr_populated) { ___free_pages_bulk(page_array, nr_total - nr_pages); return -ENOMEM; @@ -354,25 +354,42 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages) return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; + unsigned int flags; int ret = 0; - data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO); + data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO); if (!data.pages) return -ENOMEM; while (nr_total) { nr_pages = min(nr_total, PAGE_SIZE / 
sizeof(data.pages[0])); - ret = ___alloc_pages_bulk(data.pages, nr_pages); + ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask); if (ret) break; data.start = start; + + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, kasan_populate_vmalloc_pte, &data); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + ___free_pages_bulk(data.pages, nr_pages); if (ret) break; @@ -386,7 +403,7 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size) +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { unsigned long shadow_start, shadow_end; int ret; @@ -415,7 +432,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end); + ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask); if (ret) return ret; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dbcdceecae134..5edd536ba9d2a5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2026,6 +2026,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); + /* Only reclaim behaviour flags are relevant. */ + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; might_sleep(); /* @@ -2038,8 +2040,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, */ va = node_alloc(size, align, vstart, vend, &addr, &vn_id); if (!va) { - gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -2089,7 +2089,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); - ret = kasan_populate_vmalloc(addr, size); + ret = kasan_populate_vmalloc(addr, size, gfp_mask); if (ret) { free_vmap_area(va); return ERR_PTR(ret); @@ -4826,7 +4826,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { - if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) + if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL)) goto err_free_shadow; } From 3fac212fe489aa0dbe8d80a42a7809840ca7b0f9 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 2 Sep 2025 15:49:26 -0700 Subject: [PATCH 1222/3206] compiler-clang.h: define __SANITIZE_*__ macros only when undefined Clang 22 recently added support for defining __SANITIZE__ macros similar to GCC [1], which causes warnings (or errors with CONFIG_WERROR=y or W=e) with the existing defines that the kernel creates to emulate this behavior with existing clang versions. 
In file included from :3: In file included from include/linux/compiler_types.h:171: include/linux/compiler-clang.h:37:9: error: '__SANITIZE_THREAD__' macro redefined [-Werror,-Wmacro-redefined] 37 | #define __SANITIZE_THREAD__ | ^ :352:9: note: previous definition is here 352 | #define __SANITIZE_THREAD__ 1 | ^ Refactor compiler-clang.h to only define the sanitizer macros when they are undefined and adjust the rest of the code to use these macros for checking if the sanitizers are enabled, clearing up the warnings and allowing the kernel to easily drop these defines when the minimum supported version of LLVM for building the kernel becomes 22.0.0 or newer. Link: https://lkml.kernel.org/r/20250902-clang-update-sanitize-defines-v1-1-cf3702ca3d92@kernel.org Link: https://github.com/llvm/llvm-project/commit/568c23bbd3303518c5056d7f03444dae4fdc8a9c [1] Signed-off-by: Nathan Chancellor Reviewed-by: Justin Stitt Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Bill Wendling Cc: Dmitriy Vyukov Cc: Marco Elver Cc: Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index fa4ffe037bc77f..8720a0705900ca 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -18,23 +18,42 @@ #define KASAN_ABI_VERSION 5 /* + * Clang 22 added preprocessor macros to match GCC, in hopes of eventually + * dropping __has_feature support for sanitizers: + * https://github.com/llvm/llvm-project/commit/568c23bbd3303518c5056d7f03444dae4fdc8a9c + * Create these macros for older versions of clang so that it is easy to clean + * up once the minimum supported version of LLVM for building the kernel always + * creates these macros. + * * Note: Checking __has_feature(*_sanitizer) is only true if the feature is * enabled. Therefore it is not required to additionally check defined(CONFIG_*) * to avoid adding redundant attributes in other configurations. */ +#if __has_feature(address_sanitizer) && !defined(__SANITIZE_ADDRESS__) +#define __SANITIZE_ADDRESS__ +#endif +#if __has_feature(hwaddress_sanitizer) && !defined(__SANITIZE_HWADDRESS__) +#define __SANITIZE_HWADDRESS__ +#endif +#if __has_feature(thread_sanitizer) && !defined(__SANITIZE_THREAD__) +#define __SANITIZE_THREAD__ +#endif -#if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer) -/* Emulate GCC's __SANITIZE_ADDRESS__ flag */ +/* + * Treat __SANITIZE_HWADDRESS__ the same as __SANITIZE_ADDRESS__ in the kernel. 
+ */ +#ifdef __SANITIZE_HWADDRESS__ #define __SANITIZE_ADDRESS__ +#endif + +#ifdef __SANITIZE_ADDRESS__ #define __no_sanitize_address \ __attribute__((no_sanitize("address", "hwaddress"))) #else #define __no_sanitize_address #endif -#if __has_feature(thread_sanitizer) -/* emulate gcc's __SANITIZE_THREAD__ flag */ -#define __SANITIZE_THREAD__ +#ifdef __SANITIZE_THREAD__ #define __no_sanitize_thread \ __attribute__((no_sanitize("thread"))) #else From 0ce9398aa0830f15f92bbed73853f9861c3e74ff Mon Sep 17 00:00:00 2001 From: wangzijie Date: Thu, 4 Sep 2025 21:57:15 +0800 Subject: [PATCH 1223/3206] proc: fix type confusion in pde_set_flags() Commit 2ce3d282bd50 ("proc: fix missing pde_set_flags() for net proc files") missed a key part in the definition of proc_dir_entry: union { const struct proc_ops *proc_ops; const struct file_operations *proc_dir_ops; }; So dereferencing ->proc_ops as if it were always a proc_ops structure results in type confusion, and makes the NULL check for 'proc_ops' ineffective for proc directories. Add a !S_ISDIR(dp->mode) test before calling pde_set_flags() to fix it. Link: https://lkml.kernel.org/r/20250904135715.3972782-1-wangzijie1@honor.com Fixes: 2ce3d282bd50 ("proc: fix missing pde_set_flags() for net proc files") Signed-off-by: wangzijie Reported-by: Brad Spengler Closes: https://lore.kernel.org/all/20250903065758.3678537-1-wangzijie1@honor.com/ Cc: Alexey Dobriyan Cc: Al Viro Cc: Christian Brauner Cc: Jiri Slaby Cc: Stefano Brivio Cc: Signed-off-by: Andrew Morton --- fs/proc/generic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index bd0c099cfdd2a5..1762811122735f 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -393,7 +393,8 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, if (proc_alloc_inum(&dp->low_ino)) goto out_free_entry; - pde_set_flags(dp); + if (!S_ISDIR(dp->mode)) + pde_set_flags(dp); write_lock(&proc_subdir_lock); dp->parent = dir; From 3260a3f0828e06f5f13fac69fb1999a6d60d9cff Mon Sep 17 00:00:00 2001 From: Stanislav Fort Date: Fri, 5 Sep 2025 13:10:46 +0300 Subject: [PATCH 1224/3206] mm/damon/sysfs: fix use-after-free in state_show() state_show() reads kdamond->damon_ctx without holding damon_sysfs_lock. This allows a use-after-free race: CPU 0 CPU 1 ----- ----- state_show() damon_sysfs_turn_damon_on() ctx = kdamond->damon_ctx; mutex_lock(&damon_sysfs_lock); damon_destroy_ctx(kdamond->damon_ctx); kdamond->damon_ctx = NULL; mutex_unlock(&damon_sysfs_lock); damon_is_running(ctx); /* ctx is freed */ mutex_lock(&ctx->kdamond_lock); /* UAF */ (The race can also occur with damon_sysfs_kdamonds_rm_dirs() and damon_sysfs_kdamond_release(), which free or replace the context under damon_sysfs_lock.) Fix by taking damon_sysfs_lock before dereferencing the context, mirroring the locking used in pid_show(). The bug has existed since state_show() first accessed kdamond->damon_ctx.
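Condensed, the fixed pattern reads as follows (a sketch; mutex_trylock() presumably mirrors pid_show() so that a sysfs read fails fast with -EBUSY rather than blocking on a busy lock):

	if (!mutex_trylock(&damon_sysfs_lock))
		return -EBUSY;

	ctx = kdamond->damon_ctx;	/* load and use under the same lock */
	if (ctx)
		running = damon_is_running(ctx);

	mutex_unlock(&damon_sysfs_lock);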
Link: https://lkml.kernel.org/r/20250905101046.2288-1-disclosure@aisle.com Fixes: a61ea561c871 ("mm/damon/sysfs: link DAMON for virtual address spaces monitoring") Signed-off-by: Stanislav Fort Reported-by: Stanislav Fort Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6d2b0dab50cbaa..7b9254cadd5f87 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1260,14 +1260,18 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); - struct damon_ctx *ctx = kdamond->damon_ctx; - bool running; + struct damon_ctx *ctx; + bool running = false; - if (!ctx) - running = false; - else + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + + ctx = kdamond->damon_ctx; + if (ctx) running = damon_is_running(ctx); + mutex_unlock(&damon_sysfs_lock); + return sysfs_emit(buf, "%s\n", running ? damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]); From a68172d95c2845d2b5455b072b4ff51ba32650e9 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Fri, 5 Sep 2025 12:15:57 +0300 Subject: [PATCH 1225/3206] MAINTAINERS: add tree entry to numa memblocks and emulation block Link: https://lkml.kernel.org/r/20250905091557.3529937-1-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Vlastimil Babka Acked-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Cc: David Hildenbrand Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6dcfbd11efef87..fbdbf7c012a032 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16127,6 +16127,7 @@ M: Andrew Morton M: Mike Rapoport L: linux-mm@kvack.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git F: include/linux/numa_memblks.h F: mm/numa.c F: mm/numa_emulation.c From 08df2d7dd4ab2db8a172d824cda7872d5eca460a Mon Sep 17 00:00:00 2001 From: Jason Andryuk Date: Wed, 27 Aug 2025 20:36:01 -0400 Subject: [PATCH 1226/3206] xen/events: Cleanup find_virq() return codes rc is overwritten by the evtchn_status hypercall in each iteration, so the return value will be whatever the last iteration produced. This could incorrectly return success even if the event channel was not found. Change to an explicit -ENOENT when the virq is not found, and return 0 on a successful match.
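Reduced to its essence, the bug is a loop status variable doubling as the function's result (query_status() and is_match() are stand-ins for the hypercall and the field checks):

	int rc = -ENOENT;

	for (port = 0; port < max; port++) {
		rc = query_status(port, &status);	/* overwrites rc each pass */
		if (rc < 0)
			continue;
		if (is_match(&status)) {
			*evtchn = port;
			break;		/* rc happens to be 0 here */
		}
	}
	return rc;	/* status of the last query, not found/not-found */

If the final hypercall succeeds but no port matches, the stale rc of 0 is returned as success.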
Fixes: 62cc5fc7b2e0 ("xen/pv-on-hvm kexec: rebind virqs to existing eventchannel ports") Cc: stable@vger.kernel.org Signed-off-by: Jason Andryuk Reviewed-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Message-ID: <20250828003604.8949-2-jason.andryuk@amd.com> --- drivers/xen/events/events_base.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 41309d38f78c3c..374231d84e4f77 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1318,10 +1318,11 @@ static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn) { struct evtchn_status status; evtchn_port_t port; - int rc = -ENOENT; memset(&status, 0, sizeof(status)); for (port = 0; port < xen_evtchn_max_channels(); port++) { + int rc; + status.dom = DOMID_SELF; status.port = port; rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); @@ -1331,10 +1332,10 @@ static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn) continue; if (status.u.virq == virq && status.vcpu == xen_vcpu_nr(cpu)) { *evtchn = port; - break; + return 0; } } - return rc; + return -ENOENT; } /** From 07ce121d93a5e5fb2440a24da3dbf408fcee978e Mon Sep 17 00:00:00 2001 From: Jason Andryuk Date: Wed, 27 Aug 2025 20:36:02 -0400 Subject: [PATCH 1227/3206] xen/events: Return -EEXIST for bound VIRQs Change find_virq() to return -EEXIST when a VIRQ is bound to a different CPU than the one passed in. With that, remove the BUG_ON() from bind_virq_to_irq() to propagate the error upwards. Some VIRQs are per-cpu, but others are per-domain or global. Those must be bound to CPU0 and can then migrate elsewhere. The lookup for per-domain and global VIRQs will probably fail once they have migrated off CPU 0, especially when the current CPU is used for the lookup. This now returns -EEXIST instead of BUG_ON(). A second call to bind a per-domain or global VIRQ is not expected, but make it non-fatal to avoid trying to look up the irq, since we don't know which per_cpu(virq_to_irq) it will be in. Cc: stable@vger.kernel.org Signed-off-by: Jason Andryuk Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Message-ID: <20250828003604.8949-3-jason.andryuk@amd.com> --- drivers/xen/events/events_base.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 374231d84e4f77..b060b5a95f458d 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1314,10 +1314,12 @@ int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev, } EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi); -static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn) +static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn, + bool percpu) { struct evtchn_status status; evtchn_port_t port; + bool exists = false; memset(&status, 0, sizeof(status)); for (port = 0; port < xen_evtchn_max_channels(); port++) { @@ -1330,12 +1332,16 @@ static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn) continue; if (status.status != EVTCHNSTAT_virq) continue; - if (status.u.virq == virq && status.vcpu == xen_vcpu_nr(cpu)) { + if (status.u.virq != virq) + continue; + if (status.vcpu == xen_vcpu_nr(cpu)) { *evtchn = port; return 0; + } else if (!percpu) { + exists = true; } } - return -ENOENT; + return exists ?
-EEXIST : -ENOENT; } /** @@ -1382,8 +1388,11 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) evtchn = bind_virq.port; else { if (ret == -EEXIST) - ret = find_virq(virq, cpu, &evtchn); - BUG_ON(ret < 0); + ret = find_virq(virq, cpu, &evtchn, percpu); + if (ret) { + __unbind_from_irq(info, info->irq); + goto out; + } } ret = xen_irq_info_virq_setup(info, cpu, evtchn, virq); From 3fcc8e146935415d69ffabb5df40ecf50e106131 Mon Sep 17 00:00:00 2001 From: Jason Andryuk Date: Wed, 27 Aug 2025 20:36:03 -0400 Subject: [PATCH 1228/3206] xen/events: Update virq_to_irq on migration VIRQs come in 3 flavors: per-VCPU, per-domain, and global, and the VIRQs are tracked in per-cpu virq_to_irq arrays. Per-domain and global VIRQs must be bound on CPU 0, and bind_virq_to_irq() sets the per_cpu virq_to_irq at registration time. Later, the interrupt can migrate, and info->cpu is updated. When calling __unbind_from_irq(), the per-cpu virq_to_irq is cleared for a different cpu. If bind_virq_to_irq() is called again with CPU 0, the stale irq is returned. There won't be any irq_info for the irq, so things break. Make xen_rebind_evtchn_to_cpu() update the per_cpu virq_to_irq mappings to keep them up to date with the current cpu. This ensures the correct virq_to_irq is cleared in __unbind_from_irq(). Fixes: e46cdb66c8fc ("xen: event channels") Cc: stable@vger.kernel.org Signed-off-by: Jason Andryuk Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Message-ID: <20250828003604.8949-4-jason.andryuk@amd.com> --- drivers/xen/events/events_base.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index b060b5a95f458d..9478fae014e50f 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1797,9 +1797,20 @@ static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu) * virq or IPI channel, which don't actually need to be rebound. Ignore * it, but don't do the xenlinux-level rebind in that case. */ - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) { + int old_cpu = info->cpu; + bind_evtchn_to_cpu(info, tcpu, false); + if (info->type == IRQT_VIRQ) { + int virq = info->u.virq; + int irq = per_cpu(virq_to_irq, old_cpu)[virq]; + + per_cpu(virq_to_irq, old_cpu)[virq] = -1; + per_cpu(virq_to_irq, tcpu)[virq] = irq; + } + } + do_unmask(info, EVT_MASK_REASON_TEMPORARY); return 0; From f770c3d858687252f1270265ba152d5c622e793f Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 4 Sep 2025 15:11:09 +0200 Subject: [PATCH 1229/3206] xen/manage: Fix suspend error path The device power management API has the following asymmetry: * dpm_suspend_start() does not clean up on failure (it requires a call to dpm_resume_end()) * dpm_suspend_end() does clean up on failure (it does not require a call to dpm_resume_start()) The asymmetry was introduced by commit d8f3de0d2412 ("Suspend-related patches for 2.6.27") in June 2008: It removed a call to device_resume() from device_suspend() (which was later renamed to dpm_suspend_start()). When Xen began using the device power management API in May 2008 with commit 0e91398f2a5d ("xen: implement save/restore"), the asymmetry did not yet exist. But since it was introduced, a call to dpm_resume_end() is missing in the error path of dpm_suspend_start(). Fix it.
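In sketch form, the pairing rules that follow from this asymmetry (label name as in do_suspend() after the fix):

	err = dpm_suspend_start(PMSG_FREEZE);
	if (err)
		goto out_resume_end;	/* start() does NOT undo itself */

	err = dpm_suspend_end(PMSG_FREEZE);
	if (err)
		goto out_resume_end;	/* end() already undid itself, but the
					 * earlier start() still needs unwinding */
	...
out_resume_end:
	dpm_resume_end(PMSG_THAW);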
Fixes: d8f3de0d2412 ("Suspend-related patches for 2.6.27") Signed-off-by: Lukas Wunner Cc: stable@vger.kernel.org # v2.6.27 Reviewed-by: "Rafael J. Wysocki (Intel)" Signed-off-by: Juergen Gross Message-ID: <22453676d1ddcebbe81641bb68ddf587fee7e21e.1756990799.git.lukas@wunner.de> --- drivers/xen/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 841afa4933c7a6..1f5a7a42fc3278 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -110,7 +110,7 @@ static void do_suspend(void) err = dpm_suspend_start(PMSG_FREEZE); if (err) { pr_err("%s: dpm_suspend_start %d\n", __func__, err); - goto out_thaw; + goto out_resume_end; } printk(KERN_DEBUG "suspending xenstore...\n"); @@ -150,6 +150,7 @@ static void do_suspend(void) else xs_suspend_cancel(); +out_resume_end: dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE); out_thaw: From cfa7b7659757f8d0fc4914429efa90d0d2577dd7 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 5 Sep 2025 13:41:49 +0300 Subject: [PATCH 1230/3206] drm/i915/power: fix size for for_each_set_bit() in abox iteration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit for_each_set_bit() expects size to be in bits, not bytes. The abox mask iteration uses bytes, but it works by coincidence, because the local variable holding the mask is unsigned long, and the mask only ever has bit 2 as the highest bit. Using a smaller type could lead to subtle and very hard to track bugs. Fixes: 62afef2811e4 ("drm/i915/rkl: RKL uses ABOX0 for pixel transfers") Cc: Ville Syrjälä Cc: Matt Roper Cc: stable@vger.kernel.org # v5.9+ Reviewed-by: Matt Roper Link: https://lore.kernel.org/r/20250905104149.1144751-1-jani.nikula@intel.com Signed-off-by: Jani Nikula (cherry picked from commit 7ea3baa6efe4bb93d11e1c0e6528b1468d7debf6) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_display_power.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c index 273054c2232524..c92f3e73622886 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power.c +++ b/drivers/gpu/drm/i915/display/intel_display_power.c @@ -1172,7 +1172,7 @@ static void icl_mbus_init(struct intel_display *display) if (DISPLAY_VER(display) == 12) abox_regs |= BIT(0); - for_each_set_bit(i, &abox_regs, sizeof(abox_regs)) + for_each_set_bit(i, &abox_regs, BITS_PER_TYPE(abox_regs)) intel_de_rmw(display, MBUS_ABOX_CTL(i), mask, val); } @@ -1629,11 +1629,11 @@ static void tgl_bw_buddy_init(struct intel_display *display) if (table[config].page_mask == 0) { drm_dbg_kms(display->drm, "Unknown memory configuration; disabling address buddy logic.\n"); - for_each_set_bit(i, &abox_mask, sizeof(abox_mask)) + for_each_set_bit(i, &abox_mask, BITS_PER_TYPE(abox_mask)) intel_de_write(display, BW_BUDDY_CTL(i), BW_BUDDY_DISABLE); } else { - for_each_set_bit(i, &abox_mask, sizeof(abox_mask)) { + for_each_set_bit(i, &abox_mask, BITS_PER_TYPE(abox_mask)) { intel_de_write(display, BW_BUDDY_PAGE_MASK(i), table[config].page_mask); From de4da7bd5c5172f6362b5994f541d830119840dc Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 21 Aug 2025 17:23:09 +0200 Subject: [PATCH 1231/3206] KVM: s390: Fix access to unavailable adapter indicator pages during postcopy When you run a KVM guest with vhost-net and migrate that guest to another host, and you immediately enable postcopy after starting the migration, 
there is a big chance that the network connection of the guest won't work anymore on the destination side after the migration. With a debug kernel v6.16.0, there is also a call trace that looks like this: FAULT_FLAG_ALLOW_RETRY missing 881 CPU: 6 UID: 0 PID: 549 Comm: kworker/6:2 Kdump: loaded Not tainted 6.16.0 #56 NONE Hardware name: IBM 3931 LA1 400 (LPAR) Workqueue: events irqfd_inject [kvm] Call Trace: [<00003173cbecc634>] dump_stack_lvl+0x104/0x168 [<00003173cca69588>] handle_userfault+0xde8/0x1310 [<00003173cc756f0c>] handle_pte_fault+0x4fc/0x760 [<00003173cc759212>] __handle_mm_fault+0x452/0xa00 [<00003173cc7599ba>] handle_mm_fault+0x1fa/0x6a0 [<00003173cc73409a>] __get_user_pages+0x4aa/0xba0 [<00003173cc7349e8>] get_user_pages_remote+0x258/0x770 [<000031734be6f052>] get_map_page+0xe2/0x190 [kvm] [<000031734be6f910>] adapter_indicators_set+0x50/0x4a0 [kvm] [<000031734be7f674>] set_adapter_int+0xc4/0x170 [kvm] [<000031734be2f268>] kvm_set_irq+0x228/0x3f0 [kvm] [<000031734be27000>] irqfd_inject+0xd0/0x150 [kvm] [<00003173cc00c9ec>] process_one_work+0x87c/0x1490 [<00003173cc00dda6>] worker_thread+0x7a6/0x1010 [<00003173cc02dc36>] kthread+0x3b6/0x710 [<00003173cbed2f0c>] __ret_from_fork+0xdc/0x7f0 [<00003173cdd737ca>] ret_from_fork+0xa/0x30 3 locks held by kworker/6:2/549: #0: 00000000800bc958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x7ee/0x1490 #1: 000030f3d527fbd0 ((work_completion)(&irqfd->inject)){+.+.}-{0:0}, at: process_one_work+0x81c/0x1490 #2: 00000000f99862b0 (&mm->mmap_lock){++++}-{3:3}, at: get_map_page+0xa8/0x190 [kvm] The "FAULT_FLAG_ALLOW_RETRY missing" indicates that handle_userfaultfd() saw a page fault request without ALLOW_RETRY flag set, hence userfaultfd cannot remotely resolve it (because the caller was asking for an immediate resolution, aka, FAULT_FLAG_NOWAIT, while remote faults can take time). With that, get_map_page() failed and the irq was lost. We should not be strictly in an atomic environment here and the worker should be sleepable (the call is done during an ioctl from userspace), so we can allow adapter_indicators_set() to just sleep waiting for the remote fault instead. 
Link: https://issues.redhat.com/browse/RHEL-42486 Signed-off-by: Peter Xu [thuth: Assembled patch description and fixed some cosmetical issues] Signed-off-by: Thomas Huth Reviewed-by: Claudio Imbrenda Acked-by: Janosch Frank Fixes: f65470661f36 ("KVM: s390/interrupt: do not pin adapter interrupt pages") [frankja: Added fixes tag] Signed-off-by: Janosch Frank --- arch/s390/kvm/interrupt.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 2a92a8b9e4c2f7..9384572ffa7b7a 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2778,12 +2778,19 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) static struct page *get_map_page(struct kvm *kvm, u64 uaddr) { + struct mm_struct *mm = kvm->mm; struct page *page = NULL; + int locked = 1; + + if (mmget_not_zero(mm)) { + mmap_read_lock(mm); + get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, + &page, &locked); + if (locked) + mmap_read_unlock(mm); + mmput(mm); + } - mmap_read_lock(kvm->mm); - get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, - &page, NULL); - mmap_read_unlock(kvm->mm); return page; } From 185d903064f8680c2de43514c94fddc0d75ed00a Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 25 Aug 2025 17:18:30 +0200 Subject: [PATCH 1232/3206] KVM: s390: Fix incorrect usage of mmu_notifier_register() If mmu_notifier_register() fails, for example because a signal was pending, the mmu_notifier will not be registered. But when the VM gets destroyed, it will get unregistered anyway and that will cause one extra mmdrop(), which will eventually cause the mm of the process to be freed too early, and cause a use-after free. This bug happens rarely, and only when secure guests are involved. The solution is to check the return value of mmu_notifier_register() and return it to the caller (ultimately it will be propagated all the way to userspace). In case of -EINTR, userspace will try again. Fixes: ca2fd0609b5d ("KVM: s390: pv: add mmu_notifier") Signed-off-by: Claudio Imbrenda Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Signed-off-by: Janosch Frank --- arch/s390/kvm/pv.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 25ede8354514f2..6ba5a0305e25bc 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -624,6 +624,17 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) int cc, ret; u16 dummy; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + /* The notifier will be unregistered when the VM is destroyed */ + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + ret = mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + if (ret) { + kvm->arch.pv.mmu_notifier.ops = NULL; + return ret; + } + } + ret = kvm_s390_pv_alloc_vm(kvm); if (ret) return ret; @@ -659,11 +670,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return -EIO; } kvm->arch.gmap->guest_handle = uvcb.guest_handle; - /* Add the notifier only once. 
No races because we hold kvm->lock */ - if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { - kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; - mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); - } return 0; } From 5f9df945d4e862979b50e4ecaba3dc81fb06e8ed Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 25 Aug 2025 17:18:31 +0200 Subject: [PATCH 1233/3206] KVM: s390: Fix FOLL_*/FAULT_FLAG_* confusion Pass the right type of flag to vcpu_dat_fault_handler(); it expects a FOLL_* flag (in particular FOLL_WRITE), but FAULT_FLAG_WRITE is passed instead. This still works because they happen to have the same integer value, but it's a mistake, thus the fix. Signed-off-by: Claudio Imbrenda Fixes: 05066cafa925 ("s390/mm/fault: Handle guest-related program interrupts in KVM") Acked-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bf6fa8b9ca7328..6d51aa5f66bee1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4864,12 +4864,12 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) * @vcpu: the vCPU whose gmap is to be fixed up * @gfn: the guest frame number used for memslots (including fake memslots) * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps - * @flags: FOLL_* flags + * @foll: FOLL_* flags * * Return: 0 on success, < 0 in case of error. * Context: The mm lock must not be held before calling. May sleep. */ -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags) +int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll) { struct kvm_memory_slot *slot; unsigned int fault_flags; @@ -4883,13 +4883,13 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return vcpu_post_run_addressing_exception(vcpu); - fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; + fault_flags = foll & FOLL_WRITE ? 
FAULT_FLAG_WRITE : 0; if (vcpu->arch.gmap->pfault_enabled) - flags |= FOLL_NOWAIT; + foll |= FOLL_NOWAIT; vmaddr = __gfn_to_hva_memslot(slot, gfn); try_again: - pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page); + pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page); /* Access outside memory, inject addressing exception */ if (is_noslot_pfn(pfn)) @@ -4905,7 +4905,7 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u return 0; vcpu->stat.pfault_sync++; /* Could not setup async pfault, try again synchronously */ - flags &= ~FOLL_NOWAIT; + foll &= ~FOLL_NOWAIT; goto try_again; } /* Any other error */ @@ -4925,7 +4925,7 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u return rc; } -static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags) +static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll) { unsigned long gaddr_tmp; gfn_t gfn; @@ -4950,18 +4950,18 @@ static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, un } gfn = gpa_to_gfn(gaddr_tmp); } - return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags); + return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll); } static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) { - unsigned int flags = 0; + unsigned int foll = 0; unsigned long gaddr; int rc; gaddr = current->thread.gmap_teid.addr * PAGE_SIZE; if (kvm_s390_cur_gmap_fault_is_write()) - flags = FAULT_FLAG_WRITE; + foll = FOLL_WRITE; switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) { case 0: @@ -5003,7 +5003,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) send_sig(SIGSEGV, current, 0); if (rc != -ENXIO) break; - flags = FAULT_FLAG_WRITE; + foll = FOLL_WRITE; fallthrough; case PGM_PROTECTION: case PGM_SEGMENT_TRANSLATION: @@ -5013,7 +5013,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) case PGM_REGION_SECOND_TRANS: case PGM_REGION_THIRD_TRANS: kvm_s390_assert_primary_as(vcpu); - return vcpu_dat_fault_handler(vcpu, gaddr, flags); + return vcpu_dat_fault_handler(vcpu, gaddr, foll); default: KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx", current->thread.gmap_int_code, current->thread.gmap_teid.val); From 9f15e0f9ef514b8e1a80707931f6d07362e8ebc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 12 Aug 2025 07:39:02 +0200 Subject: [PATCH 1234/3206] selftests: vDSO: Fix -Wunitialized in powerpc VDSO_CALL() wrapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The _rval register variable is meant to be an output operand of the asm statement but is instead used as input operand. clang 20.1 notices this and triggers -Wuninitialized warnings: tools/testing/selftests/timers/auxclock.c:154:10: error: variable '_rval' is uninitialized when used here [-Werror,-Wuninitialized] 154 | return VDSO_CALL(self->vdso_clock_gettime64, 2, clockid, ts); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ tools/testing/selftests/timers/../vDSO/vdso_call.h:59:10: note: expanded from macro 'VDSO_CALL' 59 | : "r" (_rval) \ | ^~~~~ tools/testing/selftests/timers/auxclock.c:154:10: note: variable '_rval' is declared here tools/testing/selftests/timers/../vDSO/vdso_call.h:47:2: note: expanded from macro 'VDSO_CALL' 47 | register long _rval asm ("r3"); \ | ^ It seems the list of input and output operands have been switched around. 
However as the argument registers are not always initialized they can not be marked as pure inputs as that would trigger -Wuninitialized warnings. Adding _rval as another input and output operand does also not work as it would collide with the existing _r3 variable. Instead reuse _r3 for both the argument and the return value. Fixes: 6eda706a535c ("selftests: vDSO: fix the way vDSO functions are called for powerpc") Reported-by: kernel test robot Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Reviewed-by: Christophe Leroy Link: https://lore.kernel.org/all/20250812-vdso-tests-fixes-v2-1-90f499dd35f8@linutronix.de Closes: https://lore.kernel.org/oe-kbuild-all/202506180223.BOOk5jDK-lkp@intel.com/ --- tools/testing/selftests/vDSO/vdso_call.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/vDSO/vdso_call.h b/tools/testing/selftests/vDSO/vdso_call.h index bb237d771051bd..e7205584cbdca5 100644 --- a/tools/testing/selftests/vDSO/vdso_call.h +++ b/tools/testing/selftests/vDSO/vdso_call.h @@ -44,7 +44,6 @@ register long _r6 asm ("r6"); \ register long _r7 asm ("r7"); \ register long _r8 asm ("r8"); \ - register long _rval asm ("r3"); \ \ LOADARGS_##nr(fn, args); \ \ @@ -54,13 +53,13 @@ " bns+ 1f\n" \ " neg 3, 3\n" \ "1:" \ - : "+r" (_r0), "=r" (_r3), "+r" (_r4), "+r" (_r5), \ + : "+r" (_r0), "+r" (_r3), "+r" (_r4), "+r" (_r5), \ "+r" (_r6), "+r" (_r7), "+r" (_r8) \ - : "r" (_rval) \ + : \ : "r9", "r10", "r11", "r12", "cr0", "cr1", "cr5", \ "cr6", "cr7", "xer", "lr", "ctr", "memory" \ ); \ - _rval; \ + _r3; \ }) #else From 4b59a9f7628fd82b24f2148f62cf327a84d26555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 12 Aug 2025 07:39:03 +0200 Subject: [PATCH 1235/3206] selftests: vDSO: vdso_test_abi: Correctly skip whole test with missing vDSO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If AT_SYSINFO_EHDR is missing the whole test needs to be skipped. Currently this results in the following output: TAP version 13 1..16 # AT_SYSINFO_EHDR is not present! This output is incorrect, as "1..16" still requires the subtest lines to be printed, which isn't done however. Switch to the correct skipping functions, so the output now correctly indicates that no subtests are being run: TAP version 13 1..0 # SKIP AT_SYSINFO_EHDR is not present! 
Fixes: 693f5ca08ca0 ("kselftest: Extend vDSO selftest") Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250812-vdso-tests-fixes-v2-2-90f499dd35f8@linutronix.de --- tools/testing/selftests/vDSO/vdso_test_abi.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index a54424e2336f45..67cbfc56e4e1b0 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -182,12 +182,11 @@ int main(int argc, char **argv) unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); ksft_print_header(); - ksft_set_plan(VDSO_TEST_PLAN); - if (!sysinfo_ehdr) { - ksft_print_msg("AT_SYSINFO_EHDR is not present!\n"); - return KSFT_SKIP; - } + if (!sysinfo_ehdr) + ksft_exit_skip("AT_SYSINFO_EHDR is not present!\n"); + + ksft_set_plan(VDSO_TEST_PLAN); version = versions[VDSO_VERSION]; name = (const char **)&names[VDSO_NAMES]; From 3afe371d322cdebc6338bbc121cca6944b6c7f2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 12 Aug 2025 07:39:04 +0200 Subject: [PATCH 1236/3206] selftests: vDSO: vdso_test_abi: Use ksft_finished() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing logic is just an open-coded ksft_finished(). Replace it with the real thing. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250812-vdso-tests-fixes-v2-3-90f499dd35f8@linutronix.de --- tools/testing/selftests/vDSO/vdso_test_abi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index 67cbfc56e4e1b0..b989167ec2472d 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -231,6 +231,5 @@ int main(int argc, char **argv) vdso_test_time(); - ksft_print_cnts(); - return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; + ksft_finished(); } From d7516f25a90c554d25847634884179d8be2782c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 12 Aug 2025 07:39:05 +0200 Subject: [PATCH 1237/3206] selftests: vDSO: vdso_test_abi: Drop clock availability tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test uses the kselftest.h framework and declares in its testplan to always execute 16 testcases. If any of the clockids were not available, the testplan would not be satisfied anymore and the test would fail. Apparently that never happened, so the clockids are always available. Remove the pointless checks. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250812-vdso-tests-fixes-v2-4-90f499dd35f8@linutronix.de --- tools/testing/selftests/vDSO/vdso_test_abi.c | 24 -------------------- 1 file changed, 24 deletions(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index b989167ec2472d..d236dd8305e1e3 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -197,37 +197,13 @@ int main(int argc, char **argv) vdso_test_gettimeofday(); -#if _POSIX_TIMERS > 0 - -#ifdef CLOCK_REALTIME vdso_test_clock(CLOCK_REALTIME); -#endif - -#ifdef CLOCK_BOOTTIME vdso_test_clock(CLOCK_BOOTTIME); -#endif - -#ifdef CLOCK_TAI vdso_test_clock(CLOCK_TAI); -#endif - -#ifdef CLOCK_REALTIME_COARSE vdso_test_clock(CLOCK_REALTIME_COARSE); -#endif - -#ifdef CLOCK_MONOTONIC vdso_test_clock(CLOCK_MONOTONIC); -#endif - -#ifdef CLOCK_MONOTONIC_RAW vdso_test_clock(CLOCK_MONOTONIC_RAW); -#endif - -#ifdef CLOCK_MONOTONIC_COARSE vdso_test_clock(CLOCK_MONOTONIC_COARSE); -#endif - -#endif vdso_test_time(); From 74b408ff06c39238edd66350cd0d4057787fc853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 12 Aug 2025 07:39:06 +0200 Subject: [PATCH 1238/3206] selftests: vDSO: vdso_test_abi: Use explicit indices for name array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The array relies on the numeric values of the clock IDs. When reading the code it is not obvious that the order is correct. Make the code easier to read by using explicit indices. While at it make the array static. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250812-vdso-tests-fixes-v2-5-90f499dd35f8@linutronix.de --- tools/testing/selftests/vDSO/vdso_test_abi.c | 26 ++++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index d236dd8305e1e3..a9a65f0deef3c3 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -31,19 +31,19 @@ typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); typedef time_t (*vdso_time_t)(time_t *t); -const char *vdso_clock_name[12] = { - "CLOCK_REALTIME", - "CLOCK_MONOTONIC", - "CLOCK_PROCESS_CPUTIME_ID", - "CLOCK_THREAD_CPUTIME_ID", - "CLOCK_MONOTONIC_RAW", - "CLOCK_REALTIME_COARSE", - "CLOCK_MONOTONIC_COARSE", - "CLOCK_BOOTTIME", - "CLOCK_REALTIME_ALARM", - "CLOCK_BOOTTIME_ALARM", - "CLOCK_SGI_CYCLE", - "CLOCK_TAI", +static const char * const vdso_clock_name[] = { + [CLOCK_REALTIME] = "CLOCK_REALTIME", + [CLOCK_MONOTONIC] = "CLOCK_MONOTONIC", + [CLOCK_PROCESS_CPUTIME_ID] = "CLOCK_PROCESS_CPUTIME_ID", + [CLOCK_THREAD_CPUTIME_ID] = "CLOCK_THREAD_CPUTIME_ID", + [CLOCK_MONOTONIC_RAW] = "CLOCK_MONOTONIC_RAW", + [CLOCK_REALTIME_COARSE] = "CLOCK_REALTIME_COARSE", + [CLOCK_MONOTONIC_COARSE] = "CLOCK_MONOTONIC_COARSE", + [CLOCK_BOOTTIME] = "CLOCK_BOOTTIME", + [CLOCK_REALTIME_ALARM] = "CLOCK_REALTIME_ALARM", + [CLOCK_BOOTTIME_ALARM] = "CLOCK_BOOTTIME_ALARM", + [10 /* CLOCK_SGI_CYCLE */] = "CLOCK_SGI_CYCLE", + [CLOCK_TAI] = "CLOCK_TAI", }; static void vdso_test_gettimeofday(void) From 7b87dbf9d8465ad437cd7efacc26215b2c189607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 12 
Aug 2025 07:39:07 +0200 Subject: [PATCH 1239/3206] selftests: vDSO: vdso_test_abi: Test CPUTIME clocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The structure is already there anyways, so test the CPUTIME clocks, too. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250812-vdso-tests-fixes-v2-6-90f499dd35f8@linutronix.de --- tools/testing/selftests/vDSO/vdso_test_abi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index a9a65f0deef3c3..c25f09998b82d7 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -175,7 +175,7 @@ static inline void vdso_test_clock(clockid_t clock_id) vdso_test_clock_getres(clock_id); } -#define VDSO_TEST_PLAN 16 +#define VDSO_TEST_PLAN 20 int main(int argc, char **argv) { @@ -204,6 +204,8 @@ int main(int argc, char **argv) vdso_test_clock(CLOCK_MONOTONIC); vdso_test_clock(CLOCK_MONOTONIC_RAW); vdso_test_clock(CLOCK_MONOTONIC_COARSE); + vdso_test_clock(CLOCK_PROCESS_CPUTIME_ID); + vdso_test_clock(CLOCK_THREAD_CPUTIME_ID); vdso_test_time(); From 7262aa781fea89cc388335e9c5c6701d8bb13d42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 12 Aug 2025 07:39:08 +0200 Subject: [PATCH 1240/3206] selftests: vDSO: vdso_test_abi: Add tests for clock_gettime64() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be y2038-safe, 32-bit userspace needs to explicitly call the 64-bit safe time APIs. For this the 32-bit vDSOs contains a clock_gettime() variant which always uses 64-bit time types. Also test this vDSO function. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250812-vdso-tests-fixes-v2-7-90f499dd35f8@linutronix.de --- tools/testing/selftests/vDSO/vdso_test_abi.c | 37 +++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index c25f09998b82d7..238d609a457a28 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -26,8 +26,15 @@ static const char *version; static const char **name; +/* The same as struct __kernel_timespec */ +struct vdso_timespec64 { + uint64_t tv_sec; + uint64_t tv_nsec; +}; + typedef long (*vdso_gettimeofday_t)(struct timeval *tv, struct timezone *tz); typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); +typedef long (*vdso_clock_gettime64_t)(clockid_t clk_id, struct vdso_timespec64 *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); typedef time_t (*vdso_time_t)(time_t *t); @@ -70,6 +77,33 @@ static void vdso_test_gettimeofday(void) } } +static void vdso_test_clock_gettime64(clockid_t clk_id) +{ + /* Find clock_gettime64. 
*/ + vdso_clock_gettime64_t vdso_clock_gettime64 = + (vdso_clock_gettime64_t)vdso_sym(version, name[5]); + + if (!vdso_clock_gettime64) { + ksft_print_msg("Couldn't find %s\n", name[5]); + ksft_test_result_skip("%s %s\n", name[5], + vdso_clock_name[clk_id]); + return; + } + + struct vdso_timespec64 ts; + long ret = VDSO_CALL(vdso_clock_gettime64, 2, clk_id, &ts); + + if (ret == 0) { + ksft_print_msg("The time is %lld.%06lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + ksft_test_result_pass("%s %s\n", name[5], + vdso_clock_name[clk_id]); + } else { + ksft_test_result_fail("%s %s\n", name[5], + vdso_clock_name[clk_id]); + } +} + static void vdso_test_clock_gettime(clockid_t clk_id) { /* Find clock_gettime. */ @@ -171,11 +205,12 @@ static inline void vdso_test_clock(clockid_t clock_id) ksft_print_msg("clock_id: %s\n", vdso_clock_name[clock_id]); vdso_test_clock_gettime(clock_id); + vdso_test_clock_gettime64(clock_id); vdso_test_clock_getres(clock_id); } -#define VDSO_TEST_PLAN 20 +#define VDSO_TEST_PLAN 29 int main(int argc, char **argv) { From e82bf7570d5ca3667a9038a3c5a42d451b949d89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 12 Aug 2025 07:39:09 +0200 Subject: [PATCH 1241/3206] selftests: vDSO: Drop vdso_test_clock_getres MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vdso_test_abi provides the exact same functionality, properly uses kselftest.h and explicitly calls into the vDSO without relying on the libc. Drop the pointless testcase. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250812-vdso-tests-fixes-v2-8-90f499dd35f8@linutronix.de --- tools/testing/selftests/vDSO/.gitignore | 1 - tools/testing/selftests/vDSO/Makefile | 2 - .../selftests/vDSO/vdso_test_clock_getres.c | 123 ------------------ 3 files changed, 126 deletions(-) delete mode 100644 tools/testing/selftests/vDSO/vdso_test_clock_getres.c diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index 30d5c8f0e5c7df..ba322a353aff14 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only vdso_test vdso_test_abi -vdso_test_clock_getres vdso_test_correctness vdso_test_gettimeofday vdso_test_getcpu diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 918a2caa070ebc..e361aca22a74dc 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -4,7 +4,6 @@ include ../../../scripts/Makefile.arch TEST_GEN_PROGS := vdso_test_gettimeofday TEST_GEN_PROGS += vdso_test_getcpu TEST_GEN_PROGS += vdso_test_abi -TEST_GEN_PROGS += vdso_test_clock_getres ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) TEST_GEN_PROGS += vdso_standalone_test_x86 endif @@ -29,7 +28,6 @@ CFLAGS_NOLIBC := -nostdlib -nostdinc -ffreestanding -fno-asynchronous-unwind-tab $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_test_abi: parse_vdso.c vdso_test_abi.c -$(OUTPUT)/vdso_test_clock_getres: vdso_test_clock_getres.c $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c | headers $(OUTPUT)/vdso_standalone_test_x86: CFLAGS:=$(CFLAGS_NOLIBC) $(CFLAGS) diff --git a/tools/testing/selftests/vDSO/vdso_test_clock_getres.c b/tools/testing/selftests/vDSO/vdso_test_clock_getres.c deleted file mode 100644 index 
b5d5f59f725a70..00000000000000 --- a/tools/testing/selftests/vDSO/vdso_test_clock_getres.c +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note -/* - * vdso_clock_getres.c: Sample code to test clock_getres. - * Copyright (c) 2019 Arm Ltd. - * - * Compile with: - * gcc -std=gnu99 vdso_clock_getres.c - * - * Tested on ARM, ARM64, MIPS32, x86 (32-bit and 64-bit), - * Power (32-bit and 64-bit), S390x (32-bit and 64-bit). - * Might work on other architectures. - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -static long syscall_clock_getres(clockid_t _clkid, struct timespec *_ts) -{ - long ret; - - ret = syscall(SYS_clock_getres, _clkid, _ts); - - return ret; -} - -const char *vdso_clock_name[12] = { - "CLOCK_REALTIME", - "CLOCK_MONOTONIC", - "CLOCK_PROCESS_CPUTIME_ID", - "CLOCK_THREAD_CPUTIME_ID", - "CLOCK_MONOTONIC_RAW", - "CLOCK_REALTIME_COARSE", - "CLOCK_MONOTONIC_COARSE", - "CLOCK_BOOTTIME", - "CLOCK_REALTIME_ALARM", - "CLOCK_BOOTTIME_ALARM", - "CLOCK_SGI_CYCLE", - "CLOCK_TAI", -}; - -/* - * This function calls clock_getres in vdso and by system call - * with different values for clock_id. - * - * Example of output: - * - * clock_id: CLOCK_REALTIME [PASS] - * clock_id: CLOCK_BOOTTIME [PASS] - * clock_id: CLOCK_TAI [PASS] - * clock_id: CLOCK_REALTIME_COARSE [PASS] - * clock_id: CLOCK_MONOTONIC [PASS] - * clock_id: CLOCK_MONOTONIC_RAW [PASS] - * clock_id: CLOCK_MONOTONIC_COARSE [PASS] - */ -static inline int vdso_test_clock(unsigned int clock_id) -{ - struct timespec x, y; - - printf("clock_id: %s", vdso_clock_name[clock_id]); - clock_getres(clock_id, &x); - syscall_clock_getres(clock_id, &y); - - if ((x.tv_sec != y.tv_sec) || (x.tv_nsec != y.tv_nsec)) { - printf(" [FAIL]\n"); - return KSFT_FAIL; - } - - printf(" [PASS]\n"); - return KSFT_PASS; -} - -int main(int argc, char **argv) -{ - int ret = 0; - -#if _POSIX_TIMERS > 0 - -#ifdef CLOCK_REALTIME - ret += vdso_test_clock(CLOCK_REALTIME); -#endif - -#ifdef CLOCK_BOOTTIME - ret += vdso_test_clock(CLOCK_BOOTTIME); -#endif - -#ifdef CLOCK_TAI - ret += vdso_test_clock(CLOCK_TAI); -#endif - -#ifdef CLOCK_REALTIME_COARSE - ret += vdso_test_clock(CLOCK_REALTIME_COARSE); -#endif - -#ifdef CLOCK_MONOTONIC - ret += vdso_test_clock(CLOCK_MONOTONIC); -#endif - -#ifdef CLOCK_MONOTONIC_RAW - ret += vdso_test_clock(CLOCK_MONOTONIC_RAW); -#endif - -#ifdef CLOCK_MONOTONIC_COARSE - ret += vdso_test_clock(CLOCK_MONOTONIC_COARSE); -#endif - -#endif - if (ret > 0) - return KSFT_FAIL; - - return KSFT_PASS; -} From bf59028ea8d42e8d10bb3d847c9982488ee9e3a0 Mon Sep 17 00:00:00 2001 From: Oscar Maes Date: Tue, 2 Sep 2025 17:02:40 +0200 Subject: [PATCH 1242/3206] selftests: net: add test for destination in broadcast packets Add test to check the broadcast ethernet destination field is set correctly. This test sends a broadcast ping, captures it using tcpdump and ensures that all bits of the 6 octet ethernet destination address are correctly set by examining the output capture file. 
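For reference, the ethernet destination is extracted from tcpdump's link-level header output, which looks roughly like the following (the exact line is illustrative); field 3 is the destination MAC with a trailing comma, which the awk expression strips:

	$ tcpdump -r "${CAPFILE}" -tnne
	a2:5e:77:01:23:45 > ff:ff:ff:ff:ff:ff, ethertype IPv4 (0x0800), length 98: 192.168.0.1 > 255.255.255.255: ICMP echo request ...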
Co-developed-by: Brett A C Sheffield Signed-off-by: Brett A C Sheffield Signed-off-by: Oscar Maes Link: https://patch.msgid.link/20250902150240.4272-1-oscmaes92@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/broadcast_ether_dst.sh | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100755 tools/testing/selftests/net/broadcast_ether_dst.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index c7e03e1d6f63bb..2b31d4a93ad7f7 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -116,6 +116,7 @@ TEST_PROGS += skf_net_off.sh TEST_GEN_FILES += skf_net_off TEST_GEN_FILES += tfo TEST_PROGS += tfo_passive.sh +TEST_PROGS += broadcast_ether_dst.sh TEST_PROGS += broadcast_pmtu.sh TEST_PROGS += ipv6_force_forwarding.sh diff --git a/tools/testing/selftests/net/broadcast_ether_dst.sh b/tools/testing/selftests/net/broadcast_ether_dst.sh new file mode 100755 index 00000000000000..334a7eca8a803b --- /dev/null +++ b/tools/testing/selftests/net/broadcast_ether_dst.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Author: Brett A C Sheffield +# Author: Oscar Maes +# +# Ensure destination ethernet field is correctly set for +# broadcast packets + +source lib.sh + +CLIENT_IP4="192.168.0.1" +GW_IP4="192.168.0.2" + +setup() { + setup_ns CLIENT_NS SERVER_NS + + ip -net "${SERVER_NS}" link add link1 type veth \ + peer name link0 netns "${CLIENT_NS}" + + ip -net "${CLIENT_NS}" link set link0 up + ip -net "${CLIENT_NS}" addr add "${CLIENT_IP4}"/24 dev link0 + + ip -net "${SERVER_NS}" link set link1 up + + ip -net "${CLIENT_NS}" route add default via "${GW_IP4}" + ip netns exec "${CLIENT_NS}" arp -s "${GW_IP4}" 00:11:22:33:44:55 +} + +cleanup() { + rm -f "${CAPFILE}" "${OUTPUT}" + ip -net "${SERVER_NS}" link del link1 + cleanup_ns "${CLIENT_NS}" "${SERVER_NS}" +} + +test_broadcast_ether_dst() { + local rc=0 + CAPFILE=$(mktemp -u cap.XXXXXXXXXX) + OUTPUT=$(mktemp -u out.XXXXXXXXXX) + + echo "Testing ethernet broadcast destination" + + # start tcpdump listening for icmp + # tcpdump will exit after receiving a single packet + # timeout will kill tcpdump if it is still running after 2s + timeout 2s ip netns exec "${CLIENT_NS}" \ + tcpdump -i link0 -c 1 -w "${CAPFILE}" icmp &> "${OUTPUT}" & + pid=$! + slowwait 1 grep -qs "listening" "${OUTPUT}" + + # send broadcast ping + ip netns exec "${CLIENT_NS}" \ + ping -W0.01 -c1 -b 255.255.255.255 &> /dev/null + + # wait for tcpdump for exit after receiving packet + wait "${pid}" + + # compare ethernet destination field to ff:ff:ff:ff:ff:ff + ether_dst=$(tcpdump -r "${CAPFILE}" -tnne 2>/dev/null | \ + awk '{sub(/,/,"",$3); print $3}') + if [[ "${ether_dst}" == "ff:ff:ff:ff:ff:ff" ]]; then + echo "[ OK ]" + rc="${ksft_pass}" + else + echo "[FAIL] expected dst ether addr to be ff:ff:ff:ff:ff:ff," \ + "got ${ether_dst}" + rc="${ksft_fail}" + fi + + return "${rc}" +} + +if [ ! -x "$(command -v tcpdump)" ]; then + echo "SKIP: Could not run test without tcpdump tool" + exit "${ksft_skip}" +fi + +trap cleanup EXIT + +setup +test_broadcast_ether_dst + +exit $? From 52bdd69671b63d5bdac80ed7fdfbad44e915710b Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 9 Sep 2025 14:59:13 +0800 Subject: [PATCH 1243/3206] gpio: loongson-64bit: Remove unneeded semicolon Remove unnecessary semicolons reported by Coccinelle/coccicheck and the semantic patch at scripts/coccinelle/misc/semicolon.cocci. 
Signed-off-by: Chen Ni Link: https://lore.kernel.org/r/20250909065913.4011133-1-nichen@iscas.ac.cn Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-loongson-64bit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-loongson-64bit.c b/drivers/gpio/gpio-loongson-64bit.c index f84f8c53724943..5f87ce5c86cf1d 100644 --- a/drivers/gpio/gpio-loongson-64bit.c +++ b/drivers/gpio/gpio-loongson-64bit.c @@ -205,7 +205,7 @@ static int loongson_gpio_irq_set_type(struct irq_data *data, unsigned int type) default: return -EINVAL; - }; + } } writeb(pol, lgpio->reg_base + lgpio->chip_data->intpol_offset + hwirq); From 5a5c48e870ed8d8aa9349be625c72f57bde45a4f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Sep 2025 10:27:43 +0200 Subject: [PATCH 1244/3206] irqchip/gic-v5: Delete a stray tab Delete a stray tab that is indenting the code erroneously. [ lpieralisi: Reworded commit message ] Signed-off-by: Dan Carpenter Signed-off-by: Lorenzo Pieralisi Signed-off-by: Thomas Gleixner Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/all/20250908082745.113718-2-lpieralisi@kernel.org --- drivers/irqchip/irq-gic-v5-irs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c index f845415f914346..ad1435a858a4c9 100644 --- a/drivers/irqchip/irq-gic-v5-irs.c +++ b/drivers/irqchip/irq-gic-v5-irs.c @@ -568,7 +568,7 @@ static void __init gicv5_irs_init_bases(struct gicv5_irs_chip_data *irs_data, FIELD_PREP(GICV5_IRS_CR1_IST_RA, GICV5_NO_READ_ALLOC) | FIELD_PREP(GICV5_IRS_CR1_IC, GICV5_NON_CACHE) | FIELD_PREP(GICV5_IRS_CR1_OC, GICV5_NON_CACHE); - irs_data->flags |= IRS_FLAGS_NON_COHERENT; + irs_data->flags |= IRS_FLAGS_NON_COHERENT; } else { cr1 = FIELD_PREP(GICV5_IRS_CR1_VPED_WA, GICV5_WRITE_ALLOC) | FIELD_PREP(GICV5_IRS_CR1_VPED_RA, GICV5_READ_ALLOC) | From bfcd1fdaae92faa8cae880eb4c3aaaa60c54bf0d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Sep 2025 10:27:44 +0200 Subject: [PATCH 1245/3206] irqchip/gic-v5: Fix loop in gicv5_its_create_itt_two_level() cleanup path The "i" variable in gicv5_its_create_itt_two_level() needs to be signed, otherwise it can cause an infinite loop in the function's cleanup path.
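In reduced form, the problem looks like this (helper names invented for illustration):

	unsigned int i;

	for (i = 0; i < num; i++) {
		if (alloc_l2_table(i))
			goto out_free;
	}
	return 0;

out_free:
	/* with unsigned i, --i >= 0 is always true: 0 - 1 wraps to
	 * UINT_MAX, so the loop never terminates (and frees garbage) */
	while (--i >= 0)
		free_l2_table(i);

Declaring i as int makes the comparison signed and the loop terminate after freeing entry 0.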
[ lpieralisi: Reworded commit message ] Fixes: 57d72196dfc8 ("irqchip/gic-v5: Add GICv5 ITS support") Signed-off-by: Dan Carpenter Signed-off-by: Lorenzo Pieralisi Signed-off-by: Thomas Gleixner Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/all/20250908082745.113718-3-lpieralisi@kernel.org --- drivers/irqchip/irq-gic-v5-its.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c index 81d813cd8fbbe9..dcdf8bc0864c5e 100644 --- a/drivers/irqchip/irq-gic-v5-its.c +++ b/drivers/irqchip/irq-gic-v5-its.c @@ -191,9 +191,9 @@ static int gicv5_its_create_itt_two_level(struct gicv5_its_chip_data *its, unsigned int num_events) { unsigned int l1_bits, l2_bits, span, events_per_l2_table; - unsigned int i, complete_tables, final_span, num_ents; + unsigned int complete_tables, final_span, num_ents; __le64 *itt_l1, *itt_l2, **l2ptrs; - int ret; + int i, ret; u64 val; ret = gicv5_its_l2sz_to_l2_bits(itt_l2sz); From a186120c780e21e4cfd186a925e34f718e30de88 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Sep 2025 10:27:45 +0200 Subject: [PATCH 1246/3206] irqchip/gic-v5: Fix error handling in gicv5_its_irq_domain_alloc() Code in gicv5_its_irq_domain_alloc() has two issues: - it checks the wrong return value/variable when calling gicv5_alloc_lpi() - The cleanup code does not take previous loop iterations into account Fix both issues at once by adding the right gicv5_alloc_lpi() variable check and by reworking the function cleanup code to take into account current and previous iterations. [ lpieralisi: Reworded commit message ] Fixes: 57d72196dfc8 ("irqchip/gic-v5: Add GICv5 ITS support") Signed-off-by: Dan Carpenter Signed-off-by: Lorenzo Pieralisi Signed-off-by: Thomas Gleixner Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/all/20250908082745.113718-4-lpieralisi@kernel.org --- drivers/irqchip/irq-gic-v5-its.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c index dcdf8bc0864c5e..554485f0be1fbf 100644 --- a/drivers/irqchip/irq-gic-v5-its.c +++ b/drivers/irqchip/irq-gic-v5-its.c @@ -947,15 +947,18 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi device_id = its_dev->device_id; for (i = 0; i < nr_irqs; i++) { - lpi = gicv5_alloc_lpi(); + ret = gicv5_alloc_lpi(); if (ret < 0) { pr_debug("Failed to find free LPI!\n"); - goto out_eventid; + goto out_free_irqs; } + lpi = ret; ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi); - if (ret) - goto out_free_lpi; + if (ret) { + gicv5_free_lpi(lpi); + goto out_free_irqs; + } /* * Store eventid and deviceid into the hwirq for later use. 
@@ -975,8 +978,13 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi return 0; -out_free_lpi: - gicv5_free_lpi(lpi); +out_free_irqs: + while (--i >= 0) { + irqd = irq_domain_get_irq_data(domain, virq + i); + gicv5_free_lpi(irqd->parent_data->hwirq); + irq_domain_reset_irq_data(irqd); + irq_domain_free_irqs_parent(domain, virq + i, 1); + } out_eventid: gicv5_its_free_eventid(its_dev, event_id_base, nr_irqs); return ret; From 26a9f90b6101ea2c9d6f02802cf6d85108104b90 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 8 Sep 2025 13:04:18 -0700 Subject: [PATCH 1247/3206] objtool: Ignore __pi___cfi_ prefixed symbols When building with CONFIG_CFI_CLANG=y after the recent series to separate the x86 startup code, there are objtool warnings along the lines of: vmlinux.o: warning: objtool: __pi___cfi_startup_64_load_idt() falls through to next function __pi_startup_64_load_idt() vmlinux.o: warning: objtool: __pi___cfi_startup_64_setup_gdt_idt() falls through to next function __pi_startup_64_setup_gdt_idt() vmlinux.o: warning: objtool: __pi___cfi___startup_64() falls through to next function __pi___startup_64() As the comment in validate_branch() states, this is expected, so ignore these symbols in the same way that __cfi_ and __pfx_ symbols are already ignored for the rest of the kernel. Fixes: 7b38dec3c5af ("x86/boot: Create a confined code area for startup code") Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov (AMD) Acked-by: Ard Biesheuvel --- tools/objtool/check.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d0d20666e87275..093fcd01dd6e06 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3565,6 +3565,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6) || !strncmp(func->name, "__pfx_", 6) || + !strncmp(func->name, "__pi___cfi_", 11) || !strncmp(func->name, "__pi___pfx_", 11)) return 0; From 86cd4301c285b5e48f6bb0c1b9eb3bfcaf7315a6 Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Mon, 8 Sep 2025 09:18:09 +0800 Subject: [PATCH 1248/3206] irqchip/aspeed-scu-ic: Refactor driver to support variant-based initialization The SCU IC driver handles each AST2600 instance with separate initialization functions and hardcoded register definitions, which is inflexible and creates duplicated code. Consolidate the implementation by introducing a variant-based structure, selected via compatible string, and use a unified init path and MMIO access via of_iomap(). This simplifies the code and prepares for upcoming SoCs like AST2700, which require split register handling. 
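The shape of the consolidation, as a hedged sketch (simplified names, not the final driver code): per-instance parameters move into a table keyed by the DT compatible string, and a single init path selects the matching entry.

	struct variant {
		const char *compatible;
		unsigned int num_irqs;
	};

	static const struct variant variants[] = {
		{ "vendor,soc-a-ic", 7 },	/* placeholder compatibles */
		{ "vendor,soc-b-ic", 2 },
	};

	static const struct variant *find_variant(struct device_node *np)
	{
		for (int i = 0; i < ARRAY_SIZE(variants); i++)
			if (of_device_is_compatible(np, variants[i].compatible))
				return &variants[i];
		return NULL;	/* unknown compatible: init bails out */
	}

Supporting a new SoC then only needs a new table entry instead of another dedicated init function.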
[ tglx: Cleaned up coding style and massaged change log ] Signed-off-by: Ryan Chen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250908011812.1033858-2-ryan_chen@aspeedtech.com --- drivers/irqchip/irq-aspeed-scu-ic.c | 157 +++++++++++----------------- 1 file changed, 63 insertions(+), 94 deletions(-) diff --git a/drivers/irqchip/irq-aspeed-scu-ic.c b/drivers/irqchip/irq-aspeed-scu-ic.c index 1c7045467c4860..9c1fbdd1ce6579 100644 --- a/drivers/irqchip/irq-aspeed-scu-ic.c +++ b/drivers/irqchip/irq-aspeed-scu-ic.c @@ -7,55 +7,56 @@ */ #include +#include #include #include #include #include -#include +#include #include -#include -#define ASPEED_SCU_IC_REG 0x018 -#define ASPEED_SCU_IC_SHIFT 0 -#define ASPEED_SCU_IC_ENABLE GENMASK(15, ASPEED_SCU_IC_SHIFT) -#define ASPEED_SCU_IC_NUM_IRQS 7 #define ASPEED_SCU_IC_STATUS GENMASK(28, 16) #define ASPEED_SCU_IC_STATUS_SHIFT 16 -#define ASPEED_AST2600_SCU_IC0_REG 0x560 -#define ASPEED_AST2600_SCU_IC0_SHIFT 0 -#define ASPEED_AST2600_SCU_IC0_ENABLE \ - GENMASK(5, ASPEED_AST2600_SCU_IC0_SHIFT) -#define ASPEED_AST2600_SCU_IC0_NUM_IRQS 6 +struct aspeed_scu_ic_variant { + const char *compatible; + unsigned long irq_enable; + unsigned long irq_shift; + unsigned int num_irqs; +}; + +#define SCU_VARIANT(_compat, _shift, _enable, _num) { \ + .compatible = _compat, \ + .irq_shift = _shift, \ + .irq_enable = _enable, \ + .num_irqs = _num, \ +} -#define ASPEED_AST2600_SCU_IC1_REG 0x570 -#define ASPEED_AST2600_SCU_IC1_SHIFT 4 -#define ASPEED_AST2600_SCU_IC1_ENABLE \ - GENMASK(5, ASPEED_AST2600_SCU_IC1_SHIFT) -#define ASPEED_AST2600_SCU_IC1_NUM_IRQS 2 +static const struct aspeed_scu_ic_variant scu_ic_variants[] __initconst = { + SCU_VARIANT("aspeed,ast2400-scu-ic", 0, GENMASK(15, 0), 7), + SCU_VARIANT("aspeed,ast2500-scu-ic", 0, GENMASK(15, 0), 7), + SCU_VARIANT("aspeed,ast2600-scu-ic0", 0, GENMASK(5, 0), 6), + SCU_VARIANT("aspeed,ast2600-scu-ic1", 4, GENMASK(5, 4), 2), +}; struct aspeed_scu_ic { - unsigned long irq_enable; - unsigned long irq_shift; - unsigned int num_irqs; - unsigned int reg; - struct regmap *scu; - struct irq_domain *irq_domain; + unsigned long irq_enable; + unsigned long irq_shift; + unsigned int num_irqs; + void __iomem *base; + struct irq_domain *irq_domain; }; static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) { - unsigned int sts; - unsigned long bit; - unsigned long enabled; - unsigned long max; - unsigned long status; struct aspeed_scu_ic *scu_ic = irq_desc_get_handler_data(desc); struct irq_chip *chip = irq_desc_get_chip(desc); - unsigned int mask = scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT; + unsigned long bit, enabled, max, status; + unsigned int sts, mask; chained_irq_enter(chip, desc); + mask = scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT; /* * The SCU IC has just one register to control its operation and read * status. The interrupt enable bits occupy the lower 16 bits of the @@ -66,7 +67,7 @@ static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) * shifting the status down to get the mapping and then back up to * clear the bit. 
*/ - regmap_read(scu_ic->scu, scu_ic->reg, &sts); + sts = readl(scu_ic->base); enabled = sts & scu_ic->irq_enable; status = (sts >> ASPEED_SCU_IC_STATUS_SHIFT) & enabled; @@ -74,11 +75,9 @@ static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) max = scu_ic->num_irqs + bit; for_each_set_bit_from(bit, &status, max) { - generic_handle_domain_irq(scu_ic->irq_domain, - bit - scu_ic->irq_shift); - - regmap_write_bits(scu_ic->scu, scu_ic->reg, mask, - BIT(bit + ASPEED_SCU_IC_STATUS_SHIFT)); + generic_handle_domain_irq(scu_ic->irq_domain, bit - scu_ic->irq_shift); + writel((readl(scu_ic->base) & ~mask) | BIT(bit + ASPEED_SCU_IC_STATUS_SHIFT), + scu_ic->base); } chained_irq_exit(chip, desc); @@ -87,30 +86,29 @@ static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) static void aspeed_scu_ic_irq_mask(struct irq_data *data) { struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); - unsigned int mask = BIT(data->hwirq + scu_ic->irq_shift) | - (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); + unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); + unsigned int mask = bit | (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); /* * Status bits are cleared by writing 1. In order to prevent the mask * operation from clearing the status bits, they should be under the * mask and written with 0. */ - regmap_update_bits(scu_ic->scu, scu_ic->reg, mask, 0); + writel(readl(scu_ic->base) & ~mask, scu_ic->base); } static void aspeed_scu_ic_irq_unmask(struct irq_data *data) { struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); - unsigned int mask = bit | - (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); + unsigned int mask = bit | (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); /* * Status bits are cleared by writing 1. In order to prevent the unmask * operation from clearing the status bits, they should be under the * mask and written with 0. 
*/ - regmap_update_bits(scu_ic->scu, scu_ic->reg, mask, bit); + writel((readl(scu_ic->base) & ~mask) | bit, scu_ic->base); } static int aspeed_scu_ic_irq_set_affinity(struct irq_data *data, @@ -143,21 +141,15 @@ static const struct irq_domain_ops aspeed_scu_ic_domain_ops = { static int aspeed_scu_ic_of_init_common(struct aspeed_scu_ic *scu_ic, struct device_node *node) { - int irq; - int rc = 0; + int irq, rc = 0; - if (!node->parent) { - rc = -ENODEV; + scu_ic->base = of_iomap(node, 0); + if (IS_ERR(scu_ic->base)) { + rc = PTR_ERR(scu_ic->base); goto err; } - - scu_ic->scu = syscon_node_to_regmap(node->parent); - if (IS_ERR(scu_ic->scu)) { - rc = PTR_ERR(scu_ic->scu); - goto err; - } - regmap_write_bits(scu_ic->scu, scu_ic->reg, ASPEED_SCU_IC_STATUS, ASPEED_SCU_IC_STATUS); - regmap_write_bits(scu_ic->scu, scu_ic->reg, ASPEED_SCU_IC_ENABLE, 0); + writel(ASPEED_SCU_IC_STATUS, scu_ic->base); + writel(0, scu_ic->base); irq = irq_of_parse_and_map(node, 0); if (!irq) { @@ -166,8 +158,7 @@ static int aspeed_scu_ic_of_init_common(struct aspeed_scu_ic *scu_ic, } scu_ic->irq_domain = irq_domain_create_linear(of_fwnode_handle(node), scu_ic->num_irqs, - &aspeed_scu_ic_domain_ops, - scu_ic); + &aspeed_scu_ic_domain_ops, scu_ic); if (!scu_ic->irq_domain) { rc = -ENOMEM; goto err; @@ -180,61 +171,39 @@ static int aspeed_scu_ic_of_init_common(struct aspeed_scu_ic *scu_ic, err: kfree(scu_ic); - return rc; } -static int __init aspeed_scu_ic_of_init(struct device_node *node, - struct device_node *parent) +static const struct aspeed_scu_ic_variant *aspeed_scu_ic_find_variant(struct device_node *np) { - struct aspeed_scu_ic *scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); - - if (!scu_ic) - return -ENOMEM; - - scu_ic->irq_enable = ASPEED_SCU_IC_ENABLE; - scu_ic->irq_shift = ASPEED_SCU_IC_SHIFT; - scu_ic->num_irqs = ASPEED_SCU_IC_NUM_IRQS; - scu_ic->reg = ASPEED_SCU_IC_REG; - - return aspeed_scu_ic_of_init_common(scu_ic, node); + for (int i = 0; i < ARRAY_SIZE(scu_ic_variants); i++) { + if (of_device_is_compatible(np, scu_ic_variants[i].compatible)) + return &scu_ic_variants[i]; + } + return NULL; } -static int __init aspeed_ast2600_scu_ic0_of_init(struct device_node *node, - struct device_node *parent) +static int __init aspeed_scu_ic_of_init(struct device_node *node, struct device_node *parent) { - struct aspeed_scu_ic *scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); - - if (!scu_ic) - return -ENOMEM; - - scu_ic->irq_enable = ASPEED_AST2600_SCU_IC0_ENABLE; - scu_ic->irq_shift = ASPEED_AST2600_SCU_IC0_SHIFT; - scu_ic->num_irqs = ASPEED_AST2600_SCU_IC0_NUM_IRQS; - scu_ic->reg = ASPEED_AST2600_SCU_IC0_REG; - - return aspeed_scu_ic_of_init_common(scu_ic, node); -} + const struct aspeed_scu_ic_variant *variant; + struct aspeed_scu_ic *scu_ic; -static int __init aspeed_ast2600_scu_ic1_of_init(struct device_node *node, - struct device_node *parent) -{ - struct aspeed_scu_ic *scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); + variant = aspeed_scu_ic_find_variant(node); + if (!variant) + return -ENODEV; + scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); if (!scu_ic) return -ENOMEM; - scu_ic->irq_enable = ASPEED_AST2600_SCU_IC1_ENABLE; - scu_ic->irq_shift = ASPEED_AST2600_SCU_IC1_SHIFT; - scu_ic->num_irqs = ASPEED_AST2600_SCU_IC1_NUM_IRQS; - scu_ic->reg = ASPEED_AST2600_SCU_IC1_REG; + scu_ic->irq_enable = variant->irq_enable; + scu_ic->irq_shift = variant->irq_shift; + scu_ic->num_irqs = variant->num_irqs; return aspeed_scu_ic_of_init_common(scu_ic, node); } IRQCHIP_DECLARE(ast2400_scu_ic, "aspeed,ast2400-scu-ic", 
aspeed_scu_ic_of_init); IRQCHIP_DECLARE(ast2500_scu_ic, "aspeed,ast2500-scu-ic", aspeed_scu_ic_of_init); -IRQCHIP_DECLARE(ast2600_scu_ic0, "aspeed,ast2600-scu-ic0", - aspeed_ast2600_scu_ic0_of_init); -IRQCHIP_DECLARE(ast2600_scu_ic1, "aspeed,ast2600-scu-ic1", - aspeed_ast2600_scu_ic1_of_init); +IRQCHIP_DECLARE(ast2600_scu_ic0, "aspeed,ast2600-scu-ic0", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2600_scu_ic1, "aspeed,ast2600-scu-ic1", aspeed_scu_ic_of_init); From 23fc2a41a2c67d622d242b670a10ce8f76a7b68e Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Mon, 8 Sep 2025 09:18:10 +0800 Subject: [PATCH 1249/3206] dt-bindings: mfd: aspeed: Add AST2700 SCU compatibles Add SCU interrupt controller compatible strings for the AST2700 SoC: scu-ic0 to 3. This extends the MFD binding to support AST2700-based platforms. Signed-off-by: Ryan Chen Signed-off-by: Thomas Gleixner Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/all/20250908011812.1033858-3-ryan_chen@aspeedtech.com --- Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml b/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml index 5eccd10d95ce5d..67be6d095fe4f6 100644 --- a/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml +++ b/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml @@ -75,6 +75,10 @@ patternProperties: - aspeed,ast2500-scu-ic - aspeed,ast2600-scu-ic0 - aspeed,ast2600-scu-ic1 + - aspeed,ast2700-scu-ic0 + - aspeed,ast2700-scu-ic1 + - aspeed,ast2700-scu-ic2 + - aspeed,ast2700-scu-ic3 '^silicon-id@[0-9a-f]+$': description: Unique hardware silicon identifiers within the SoC From ed7240444e82aaaa2245a3cc9b040e4db894a665 Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Mon, 8 Sep 2025 09:18:11 +0800 Subject: [PATCH 1250/3206] dt-bindings: interrupt-controller: aspeed: Add AST2700 SCU IC compatibles Add compatible strings for the four SCU interrupt controller instances on the AST2700 SoC (scu-ic0 to 3), following the multi-instance model used on AST2600. Also define interrupt indices in the binding header. 
Signed-off-by: Ryan Chen Signed-off-by: Thomas Gleixner Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/all/20250908011812.1033858-4-ryan_chen@aspeedtech.com --- .../aspeed,ast2500-scu-ic.yaml | 6 +++++- .../interrupt-controller/aspeed-scu-ic.h | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml b/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml index d5287a2bf866bb..d998a9d69b91f4 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml @@ -5,7 +5,7 @@ $id: http://devicetree.org/schemas/interrupt-controller/aspeed,ast2500-scu-ic.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Aspeed AST25XX and AST26XX SCU Interrupt Controller +title: Aspeed AST25XX, AST26XX, AST27XX SCU Interrupt Controller maintainers: - Eddie James @@ -16,6 +16,10 @@ properties: - aspeed,ast2500-scu-ic - aspeed,ast2600-scu-ic0 - aspeed,ast2600-scu-ic1 + - aspeed,ast2700-scu-ic0 + - aspeed,ast2700-scu-ic1 + - aspeed,ast2700-scu-ic2 + - aspeed,ast2700-scu-ic3 reg: maxItems: 1 diff --git a/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h b/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h index f315d5a7f5eef7..7dd04424afcce8 100644 --- a/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h +++ b/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h @@ -20,4 +20,18 @@ #define ASPEED_AST2600_SCU_IC1_LPC_RESET_LO_TO_HI 0 #define ASPEED_AST2600_SCU_IC1_LPC_RESET_HI_TO_LO 1 +#define ASPEED_AST2700_SCU_IC0_PCIE_PERST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC0_PCIE_PERST_HI_TO_LO 2 + +#define ASPEED_AST2700_SCU_IC1_PCIE_RCRST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC1_PCIE_RCRST_HI_TO_LO 2 + +#define ASPEED_AST2700_SCU_IC2_PCIE_PERST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC2_PCIE_PERST_HI_TO_LO 2 +#define ASPEED_AST2700_SCU_IC2_LPC_RESET_LO_TO_HI 1 +#define ASPEED_AST2700_SCU_IC2_LPC_RESET_HI_TO_LO 0 + +#define ASPEED_AST2700_SCU_IC3_LPC_RESET_LO_TO_HI 1 +#define ASPEED_AST2700_SCU_IC3_LPC_RESET_HI_TO_LO 0 + #endif /* _DT_BINDINGS_INTERRUPT_CONTROLLER_ASPEED_SCU_IC_H_ */ From b2a0c13f8b4fc3f6c8b279fdc4395a5fa57dda5d Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Mon, 8 Sep 2025 09:18:12 +0800 Subject: [PATCH 1251/3206] irqchip/aspeed-scu-ic: Add support for AST2700 SCU interrupt controllers AST2700 continues the multi-instance SCU interrupt controller model introduced in the AST2600, with four independent interrupt domains (scu-ic0 to 3). Unlike earlier generations which combine interrupt enable and status bits into a single register, AST2700 separates these into distinct IER and ISR registers. Support for this layout is implemented by using register offsets and separate chained IRQ handlers. The variant table is extended to cover AST2700 IC instances, enabling shared initialization logic while preserving support for previous SoCs. 
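To make the layout difference concrete, a rough sketch of the two ack paths (simplified, not the driver verbatim; val, enable, hwirq, base and isr_offset are placeholders). In the combined layout the enable bits share a register with write-1-to-clear status bits, so an ack must mask the enables out of the read-modify-write; in the split layout the status register stands alone.

	/* Combined (AST24xx/25xx/26xx): enables in bits 15:0, W1C status
	 * in bits 28:16 of the same register. */
	val = readl(base);
	writel((val & ~(enable << 16)) | BIT(hwirq + 16), base);

	/* Split (AST2700): dedicated ISR, plain write-1-to-clear. */
	writel(BIT(hwirq), base + isr_offset);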
[ tglx: Simplified the logic and cleaned up coding style ] Signed-off-by: Ryan Chen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250908011812.1033858-5-ryan_chen@aspeedtech.com --- drivers/irqchip/irq-aspeed-scu-ic.c | 119 ++++++++++++++++++++++++---- 1 file changed, 102 insertions(+), 17 deletions(-) diff --git a/drivers/irqchip/irq-aspeed-scu-ic.c b/drivers/irqchip/irq-aspeed-scu-ic.c index 9c1fbdd1ce6579..5584e0f82cce83 100644 --- a/drivers/irqchip/irq-aspeed-scu-ic.c +++ b/drivers/irqchip/irq-aspeed-scu-ic.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Aspeed AST24XX, AST25XX, and AST26XX SCU Interrupt Controller + * Aspeed AST24XX, AST25XX, AST26XX, and AST27XX SCU Interrupt Controller * Copyright 2019 IBM Corporation * * Eddie James @@ -17,26 +17,35 @@ #define ASPEED_SCU_IC_STATUS GENMASK(28, 16) #define ASPEED_SCU_IC_STATUS_SHIFT 16 +#define AST2700_SCU_IC_STATUS GENMASK(15, 0) struct aspeed_scu_ic_variant { const char *compatible; unsigned long irq_enable; unsigned long irq_shift; unsigned int num_irqs; + unsigned long ier; + unsigned long isr; }; -#define SCU_VARIANT(_compat, _shift, _enable, _num) { \ +#define SCU_VARIANT(_compat, _shift, _enable, _num, _ier, _isr) { \ .compatible = _compat, \ .irq_shift = _shift, \ .irq_enable = _enable, \ .num_irqs = _num, \ + .ier = _ier, \ + .isr = _isr, \ } static const struct aspeed_scu_ic_variant scu_ic_variants[] __initconst = { - SCU_VARIANT("aspeed,ast2400-scu-ic", 0, GENMASK(15, 0), 7), - SCU_VARIANT("aspeed,ast2500-scu-ic", 0, GENMASK(15, 0), 7), - SCU_VARIANT("aspeed,ast2600-scu-ic0", 0, GENMASK(5, 0), 6), - SCU_VARIANT("aspeed,ast2600-scu-ic1", 4, GENMASK(5, 4), 2), + SCU_VARIANT("aspeed,ast2400-scu-ic", 0, GENMASK(15, 0), 7, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2500-scu-ic", 0, GENMASK(15, 0), 7, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2600-scu-ic0", 0, GENMASK(5, 0), 6, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2600-scu-ic1", 4, GENMASK(5, 4), 2, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2700-scu-ic0", 0, GENMASK(3, 0), 4, 0x00, 0x04), + SCU_VARIANT("aspeed,ast2700-scu-ic1", 0, GENMASK(3, 0), 4, 0x00, 0x04), + SCU_VARIANT("aspeed,ast2700-scu-ic2", 0, GENMASK(3, 0), 4, 0x04, 0x00), + SCU_VARIANT("aspeed,ast2700-scu-ic3", 0, GENMASK(1, 0), 2, 0x04, 0x00), }; struct aspeed_scu_ic { @@ -45,9 +54,16 @@ struct aspeed_scu_ic { unsigned int num_irqs; void __iomem *base; struct irq_domain *irq_domain; + unsigned long ier; + unsigned long isr; }; -static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) +static inline bool scu_has_split_isr(struct aspeed_scu_ic *scu) +{ + return scu->ier != scu->isr; +} + +static void aspeed_scu_ic_irq_handler_combined(struct irq_desc *desc) { struct aspeed_scu_ic *scu_ic = irq_desc_get_handler_data(desc); struct irq_chip *chip = irq_desc_get_chip(desc); @@ -83,7 +99,34 @@ static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) chained_irq_exit(chip, desc); } -static void aspeed_scu_ic_irq_mask(struct irq_data *data) +static void aspeed_scu_ic_irq_handler_split(struct irq_desc *desc) +{ + struct aspeed_scu_ic *scu_ic = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned long bit, enabled, max, status; + unsigned int sts, mask; + + chained_irq_enter(chip, desc); + + mask = scu_ic->irq_enable; + sts = readl(scu_ic->base + scu_ic->isr); + enabled = sts & scu_ic->irq_enable; + sts = readl(scu_ic->base + scu_ic->isr); + status = sts & enabled; + + bit = scu_ic->irq_shift; + max = scu_ic->num_irqs + bit; + + 
for_each_set_bit_from(bit, &status, max) { + generic_handle_domain_irq(scu_ic->irq_domain, bit - scu_ic->irq_shift); + /* Clear interrupt */ + writel(BIT(bit), scu_ic->base + scu_ic->isr); + } + + chained_irq_exit(chip, desc); +} + +static void aspeed_scu_ic_irq_mask_combined(struct irq_data *data) { struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); @@ -97,7 +140,7 @@ static void aspeed_scu_ic_irq_mask(struct irq_data *data) writel(readl(scu_ic->base) & ~mask, scu_ic->base); } -static void aspeed_scu_ic_irq_unmask(struct irq_data *data) +static void aspeed_scu_ic_irq_unmask_combined(struct irq_data *data) { struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); @@ -111,6 +154,22 @@ static void aspeed_scu_ic_irq_unmask(struct irq_data *data) writel((readl(scu_ic->base) & ~mask) | bit, scu_ic->base); } +static void aspeed_scu_ic_irq_mask_split(struct irq_data *data) +{ + struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); + unsigned int mask = BIT(data->hwirq + scu_ic->irq_shift); + + writel(readl(scu_ic->base) & ~mask, scu_ic->base + scu_ic->ier); +} + +static void aspeed_scu_ic_irq_unmask_split(struct irq_data *data) +{ + struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); + unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); + + writel(readl(scu_ic->base) | bit, scu_ic->base + scu_ic->ier); +} + static int aspeed_scu_ic_irq_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force) @@ -118,17 +177,29 @@ static int aspeed_scu_ic_irq_set_affinity(struct irq_data *data, return -EINVAL; } -static struct irq_chip aspeed_scu_ic_chip = { +static struct irq_chip aspeed_scu_ic_chip_combined = { .name = "aspeed-scu-ic", - .irq_mask = aspeed_scu_ic_irq_mask, - .irq_unmask = aspeed_scu_ic_irq_unmask, - .irq_set_affinity = aspeed_scu_ic_irq_set_affinity, + .irq_mask = aspeed_scu_ic_irq_mask_combined, + .irq_unmask = aspeed_scu_ic_irq_unmask_combined, + .irq_set_affinity = aspeed_scu_ic_irq_set_affinity, +}; + +static struct irq_chip aspeed_scu_ic_chip_split = { + .name = "ast2700-scu-ic", + .irq_mask = aspeed_scu_ic_irq_mask_split, + .irq_unmask = aspeed_scu_ic_irq_unmask_split, + .irq_set_affinity = aspeed_scu_ic_irq_set_affinity, }; static int aspeed_scu_ic_map(struct irq_domain *domain, unsigned int irq, irq_hw_number_t hwirq) { - irq_set_chip_and_handler(irq, &aspeed_scu_ic_chip, handle_level_irq); + struct aspeed_scu_ic *scu_ic = domain->host_data; + + if (scu_has_split_isr(scu_ic)) + irq_set_chip_and_handler(irq, &aspeed_scu_ic_chip_split, handle_level_irq); + else + irq_set_chip_and_handler(irq, &aspeed_scu_ic_chip_combined, handle_level_irq); irq_set_chip_data(irq, domain->host_data); return 0; @@ -148,8 +219,14 @@ static int aspeed_scu_ic_of_init_common(struct aspeed_scu_ic *scu_ic, rc = PTR_ERR(scu_ic->base); goto err; } - writel(ASPEED_SCU_IC_STATUS, scu_ic->base); - writel(0, scu_ic->base); + + if (scu_has_split_isr(scu_ic)) { + writel(AST2700_SCU_IC_STATUS, scu_ic->base + scu_ic->isr); + writel(0, scu_ic->base + scu_ic->ier); + } else { + writel(ASPEED_SCU_IC_STATUS, scu_ic->base); + writel(0, scu_ic->base); + } irq = irq_of_parse_and_map(node, 0); if (!irq) { @@ -164,7 +241,9 @@ static int aspeed_scu_ic_of_init_common(struct aspeed_scu_ic *scu_ic, goto err; } - irq_set_chained_handler_and_data(irq, aspeed_scu_ic_irq_handler, + irq_set_chained_handler_and_data(irq, scu_has_split_isr(scu_ic) ? 
+ aspeed_scu_ic_irq_handler_split : + aspeed_scu_ic_irq_handler_combined, scu_ic); return 0; @@ -199,6 +278,8 @@ static int __init aspeed_scu_ic_of_init(struct device_node *node, struct device_ scu_ic->irq_enable = variant->irq_enable; scu_ic->irq_shift = variant->irq_shift; scu_ic->num_irqs = variant->num_irqs; + scu_ic->ier = variant->ier; + scu_ic->isr = variant->isr; return aspeed_scu_ic_of_init_common(scu_ic, node); } @@ -207,3 +288,7 @@ IRQCHIP_DECLARE(ast2400_scu_ic, "aspeed,ast2400-scu-ic", aspeed_scu_ic_of_init); IRQCHIP_DECLARE(ast2500_scu_ic, "aspeed,ast2500-scu-ic", aspeed_scu_ic_of_init); IRQCHIP_DECLARE(ast2600_scu_ic0, "aspeed,ast2600-scu-ic0", aspeed_scu_ic_of_init); IRQCHIP_DECLARE(ast2600_scu_ic1, "aspeed,ast2600-scu-ic1", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic0, "aspeed,ast2700-scu-ic0", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic1, "aspeed,ast2700-scu-ic1", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic2, "aspeed,ast2700-scu-ic2", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic3, "aspeed,ast2700-scu-ic3", aspeed_scu_ic_of_init); From 24fb08dcc40f52cf5f95d2cdaa0c26e33a2f4285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 15:28:08 +0200 Subject: [PATCH 1252/3206] posix-timers: Avoid direct access to hrtimer clockbase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The field timer->base->get_time is a private implementation detail and should not be accessed outside of the hrtimer core. Switch to the equivalent helpers. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-1-3ae822e5bfbd@linutronix.de --- kernel/time/posix-timers.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 8b582174b1f949..2741f3725de48e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -299,8 +299,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), - timr->it_interval); + timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); hrtimer_restart(timer); } @@ -825,7 +824,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) - expires = ktime_add_safe(expires, timer->base->get_time()); + expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); if (!sigev_none) From 5f531fe9cb489bf63f71f6e5eee6420c57fbc049 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 15:28:09 +0200 Subject: [PATCH 1253/3206] timers/itimer: Avoid direct access to hrtimer clockbase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The field timer->base->get_time is a private implementation detail and should not be accessed outside of the hrtimer core. Switch to the equivalent helper. 
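The conversion is mechanical; as a sketch of the equivalence (relying on the helper definitions visible later in this series):

	/* before: reaches into the clock base directly */
	hrtimer_forward(tmr, tmr->base->get_time(), interval);

	/* after: same semantics, base access hidden in the hrtimer core */
	hrtimer_forward_now(tmr, interval);

hrtimer_forward_now() is defined as hrtimer_forward(timer, hrtimer_cb_get_time(timer), interval), so behavior is unchanged while callers stop depending on hrtimer internals.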
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-2-3ae822e5bfbd@linutronix.de --- kernel/time/itimer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 876d389b2e219a..7c6110e964e7ec 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -163,8 +163,7 @@ void posixtimer_rearm_itimer(struct task_struct *tsk) struct hrtimer *tmr = &tsk->signal->real_timer; if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) { - hrtimer_forward(tmr, tmr->base->get_time(), - tsk->signal->it_real_incr); + hrtimer_forward_now(tmr, tsk->signal->it_real_incr); hrtimer_restart(tmr); } } From b68b7f3e9b50747b88ba211080d27310430c928b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 15:28:10 +0200 Subject: [PATCH 1254/3206] sched/core: Avoid direct access to hrtimer clockbase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The field timer->base->get_time is a private implementation detail and should not be accessed outside of the hrtimer core. Switch to the equivalent helper. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-3-3ae822e5bfbd@linutronix.de --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4cc..4dc12838ad4fdf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -917,7 +917,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); + rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta); if (rq == this_rq()) __hrtick_restart(rq); From e8875795160b484f691938ce07a381e77821f947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 15:28:11 +0200 Subject: [PATCH 1255/3206] lib: test_objpool: Avoid direct access to hrtimer clockbase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The field timer->base->get_time is a private implementation detail and should not be accessed outside of the hrtimer core. Switch to the equivalent helper. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-4-3ae822e5bfbd@linutronix.de --- lib/test_objpool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_objpool.c b/lib/test_objpool.c index 8f688187fa8727..6a34a7582fdbf5 100644 --- a/lib/test_objpool.c +++ b/lib/test_objpool.c @@ -164,7 +164,7 @@ static enum hrtimer_restart ot_hrtimer_handler(struct hrtimer *hrt) /* do bulk-testings for objects pop/push */ item->worker(item, 1); - hrtimer_forward(hrt, hrt->base->get_time(), item->hrtcycle); + hrtimer_forward_now(hrt, item->hrtcycle); return HRTIMER_RESTART; } From 645e0644300b0dac3cf31527e93e51172455749a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 15:28:12 +0200 Subject: [PATCH 1256/3206] ALSA: hrtimer: Avoid direct access to hrtimer clockbase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The field timer->base->get_time is a private implementation detail and should not be accessed outside of the hrtimer core. Switch to the equivalent helper. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Takashi Iwai Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-5-3ae822e5bfbd@linutronix.de --- sound/core/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/hrtimer.c b/sound/core/hrtimer.c index c364bd126ac8b3..2d5f4d47071f79 100644 --- a/sound/core/hrtimer.c +++ b/sound/core/hrtimer.c @@ -44,7 +44,7 @@ static enum hrtimer_restart snd_hrtimer_callback(struct hrtimer *hrt) } /* calculate the drift */ - delta = ktime_sub(hrt->base->get_time(), hrtimer_get_expires(hrt)); + delta = ktime_sub(hrtimer_cb_get_time(hrt), hrtimer_get_expires(hrt)); if (delta > 0) ticks += ktime_divns(delta, ticks * resolution); From 44d7fb8acdf833a97669afeeed40cf27eb1dfc24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 15:28:13 +0200 Subject: [PATCH 1257/3206] media: pwm-ir-tx: Avoid direct access to hrtimer clockbase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The field timer->base->get_time is a private implementation detail and should not be accessed outside of the hrtimer core. Switch to an equivalent higher-level helper. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Sean Young Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-6-3ae822e5bfbd@linutronix.de --- drivers/media/rc/pwm-ir-tx.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/media/rc/pwm-ir-tx.c b/drivers/media/rc/pwm-ir-tx.c index 84533fdd61aa6c..047472dc9244af 100644 --- a/drivers/media/rc/pwm-ir-tx.c +++ b/drivers/media/rc/pwm-ir-tx.c @@ -117,7 +117,6 @@ static int pwm_ir_tx_atomic(struct rc_dev *dev, unsigned int *txbuf, static enum hrtimer_restart pwm_ir_timer(struct hrtimer *timer) { struct pwm_ir *pwm_ir = container_of(timer, struct pwm_ir, timer); - ktime_t now; /* * If we happen to hit an odd latency spike, loop through the @@ -139,9 +138,7 @@ static enum hrtimer_restart pwm_ir_timer(struct hrtimer *timer) hrtimer_add_expires_ns(timer, ns); pwm_ir->txbuf_index++; - - now = timer->base->get_time(); - } while (hrtimer_get_expires_tv64(timer) < now); + } while (hrtimer_expires_remaining(timer) > 0); return HRTIMER_RESTART; } From cdea7cdae26995992a766443e1ea862923f2443d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 15:28:14 +0200 Subject: [PATCH 1258/3206] hrtimer: Use hrtimer_cb_get_time() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various other helpers contain open-coded implementations of hrtimer_cb_get_time(). This prevents refactoring the implementation. Reuse the existing helper. For this to work, move hrtimer_cb_get_time() a bit up in the file and also make its argument 'const'. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-7-3ae822e5bfbd@linutronix.de --- include/linux/hrtimer.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1ef867bb8c44b0..e655502b14e6c4 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -154,14 +154,14 @@ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) return ktime_to_ns(timer->node.expires); } -static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) +static inline ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) { - return ktime_sub(timer->node.expires, timer->base->get_time()); + return timer->base->get_time(); } -static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) +static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return timer->base->get_time(); + return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) @@ -200,8 +200,7 @@ __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { - return __hrtimer_expires_remaining_adjusted(timer, - timer->base->get_time()); + return __hrtimer_expires_remaining_adjusted(timer, hrtimer_cb_get_time(timer)); } #ifdef CONFIG_TIMERFD @@ -363,7 +362,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { - return hrtimer_forward(timer, timer->base->get_time(), interval); + return hrtimer_forward(timer, hrtimer_cb_get_time(timer), interval); } /* Precise sleep: */ From 
009eb5da29a91016e3ebb988e6401e79411be7a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 15:28:15 +0200 Subject: [PATCH 1259/3206] hrtimer: Remove hrtimer_clock_base:: Get_time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get_time() callbacks always need to match the bases clockid. Instead of maintaining that association twice in hrtimer_bases, use a helper. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-8-3ae822e5bfbd@linutronix.de --- include/linux/hrtimer.h | 5 +---- include/linux/hrtimer_defs.h | 2 -- kernel/time/hrtimer.c | 34 +++++++++++++++++++++++++--------- kernel/time/timer_list.c | 2 -- scripts/gdb/linux/timerlist.py | 2 -- 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index e655502b14e6c4..2cf1bf65b22578 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -154,10 +154,7 @@ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) return ktime_to_ns(timer->node.expires); } -static inline ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) -{ - return timer->base->get_time(); -} +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 84a5045f80f36f..aa49ffa130e57f 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -41,7 +41,6 @@ * @seq: seqcount around __run_hrtimer * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers - * @get_time: function to retrieve the current time of the clock * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { @@ -51,7 +50,6 @@ struct hrtimer_clock_base { seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; - ktime_t (*get_time)(void); ktime_t offset; } __hrtimer_clock_base_align; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c0a..fedd1d793f6cdf 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -59,6 +59,7 @@ #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) static void retrigger_next_event(void *arg); +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); /* * The timer bases: @@ -76,42 +77,34 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, }, .csd = CSD_INIT(retrigger_next_event, NULL) @@ -1253,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, remove_hrtimer(timer, 
base, true, force_local); if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, base->get_time()); + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); tim = hrtimer_update_lowres(timer, tim, mode); @@ -1588,6 +1581,29 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) } } +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) +{ + switch (clock_id) { + case CLOCK_MONOTONIC: + return ktime_get(); + case CLOCK_REALTIME: + return ktime_get_real(); + case CLOCK_BOOTTIME: + return ktime_get_boottime(); + case CLOCK_TAI: + return ktime_get_clocktai(); + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return ktime_get(); + } +} + +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) +{ + return __hrtimer_cb_get_time(timer->base->clockid); +} +EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); + static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index b03d0ada646950..488e47e96e93f9 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -102,8 +102,6 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - - SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py index 98445671fe8389..ccc24d30de8063 100644 --- a/scripts/gdb/linux/timerlist.py +++ b/scripts/gdb/linux/timerlist.py @@ -56,8 +56,6 @@ def print_base(base): text += " .index: {}\n".format(base['index']) text += " .resolution: {} nsecs\n".format(constants.LX_hrtimer_resolution) - - text += " .get_time: {}\n".format(base['get_time']) if constants.LX_CONFIG_HIGH_RES_TIMERS: text += " .offset: {} nsecs\n".format(base['offset']) text += "active timers:\n" From 3c3af563b31766f67106c7549ad084a08ef613f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 15:28:16 +0200 Subject: [PATCH 1260/3206] hrtimer: Reorder branches in hrtimer_clockid_to_base() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Align the ordering to the one used for hrtimer_bases. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-9-3ae822e5bfbd@linutronix.de --- kernel/time/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fedd1d793f6cdf..f383df28c53259 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1567,10 +1567,10 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { switch (clock_id) { - case CLOCK_REALTIME: - return HRTIMER_BASE_REALTIME; case CLOCK_MONOTONIC: return HRTIMER_BASE_MONOTONIC; + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; case CLOCK_BOOTTIME: return HRTIMER_BASE_BOOTTIME; case CLOCK_TAI: From d2e1b84c5141ff2ad465279acfc3cf943c960b78 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 8 Sep 2025 15:15:51 -0700 Subject: [PATCH 1261/3206] fs/resctrl: Eliminate false positive lockdep warning when reading SNC counters Running resctrl_tests on an SNC-2 system with lockdep debugging enabled triggers several warnings with the following trace: WARNING: CPU: 0 PID: 1914 at kernel/cpu.c:528 lockdep_assert_cpus_held ... Call Trace: __mon_event_count ? __lock_acquire ? __pfx___mon_event_count mon_event_count ? __pfx_smp_mon_event_count smp_mon_event_count smp_call_on_cpu_callback get_cpu_cacheinfo_level() called from __mon_event_count() requires the CPU hotplug lock to be held. The hotplug lock is indeed held during this time, as confirmed by the lockdep_assert_cpus_held() within mon_event_read() that calls mon_event_count() via IPI, but the lockdep tracking is not able to follow the IPI. Fetching fresh CPU cache information via get_cpu_cacheinfo_level() from __mon_event_count() was added to support the fix for the issue where resctrl inappropriately maintained links to L3 cache information that would become stale when the associated CPU goes offline. Keep the cacheinfo ID in struct rdt_mon_domain to ensure that resctrl does not maintain stale cache information while CPUs can go offline. Return to using a pointer to the L3 cache information (struct cacheinfo) in struct rmid_read, rmid_read::ci. Initialize rmid_read::ci before the IPI where it is used. The CPU hotplug lock is held across rmid_read::ci initialization and use to ensure that it points to accurate cache information.
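A condensed sketch of why lockdep complains (illustrative, not the resctrl code verbatim; fn and arg are placeholders): the hotplug lock really is held by the caller, but the assertion runs on the remote CPU, where lockdep has no record of it.

	cpus_read_lock();
	/*
	 * fn() runs on the target CPU. If it calls
	 * lockdep_assert_cpus_held() there, lockdep cannot see the
	 * caller's hold and reports a false positive.
	 */
	smp_call_on_cpu(cpu, fn, arg, true);
	cpus_read_unlock();

Initializing rmid_read::ci under the caller's hotplug lock avoids calling get_cpu_cacheinfo_level(), and thereby the assertion, in that remote context.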
Fixes: 594902c986e2 ("x86,fs/resctrl: Remove inappropriate references to cacheinfo in the resctrl subsystem") Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov (AMD) --- fs/resctrl/ctrlmondata.c | 2 +- fs/resctrl/internal.h | 4 ++-- fs/resctrl/monitor.c | 6 ++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index d98e0d2de09fd0..3c39cfacb25183 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -625,11 +625,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) */ list_for_each_entry(d, &r->mon_domains, hdr.list) { if (d->ci_id == domid) { - rr.ci_id = d->ci_id; cpu = cpumask_any(&d->hdr.cpu_mask); ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) continue; + rr.ci = ci; mon_event_read(&rr, r, NULL, rdtgrp, &ci->shared_cpu_map, evtid, false); goto checkresult; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 0a1eedba2b03ad..9a8cf6f11151d9 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -98,7 +98,7 @@ struct mon_data { * domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains. + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -112,7 +112,7 @@ struct rmid_read { struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; - unsigned int ci_id; + struct cacheinfo *ci; int err; u64 val; void *arch_mon_ctx; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index f5637855c3acac..7326c28a7908f3 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -361,7 +361,6 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); struct rdt_mon_domain *d; - struct cacheinfo *ci; struct mbm_state *m; int err, ret; u64 tval = 0; @@ -389,8 +388,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* Summing domains that share a cache, must be on a CPU for that cache. */ - ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!ci || ci->id != rr->ci_id) + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) return -EINVAL; /* @@ -402,7 +400,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ ret = -EINVAL; list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci_id != rr->ci_id) + if (d->ci_id != rr->ci->id) continue; err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, rr->evtid, &tval, rr->arch_mon_ctx); From 2682e7a317504a9d81cbb397249d4299e84dfadd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 9 Sep 2025 12:17:34 +0300 Subject: [PATCH 1262/3206] wifi: iwlwifi: fix 130/1030 configs The 130/1030 devices are really derivatives of 6030, with some small differences not pertaining to the MAC, so they must use the 6030 MAC config. 
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220472 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220517 Fixes: 35ac275ebe0c ("wifi: iwlwifi: cfg: finish config split") Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250909121728.8e4911f12528.I3aa7194012a4b584fbd5ddaa3a77e483280f1de4@changeid Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index f9e2095d649050..7e56e4ff764295 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -124,13 +124,13 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct pci_device_id iwl_hw_card_ids[] = { {IWL_PCI_DEVICE(0x0082, 0x1304, iwl6005_mac_cfg)},/* low 5GHz active */ {IWL_PCI_DEVICE(0x0082, 0x1305, iwl6005_mac_cfg)},/* high 5GHz active */ -/* 6x30 Series */ - {IWL_PCI_DEVICE(0x008A, 0x5305, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008A, 0x5307, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008A, 0x5325, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008A, 0x5327, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008B, 0x5315, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008B, 0x5317, iwl1000_mac_cfg)}, +/* 1030/6x30 Series */ + {IWL_PCI_DEVICE(0x008A, 0x5305, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008A, 0x5307, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008A, 0x5325, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008A, 0x5327, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008B, 0x5315, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008B, 0x5317, iwl6030_mac_cfg)}, {IWL_PCI_DEVICE(0x0090, 0x5211, iwl6030_mac_cfg)}, {IWL_PCI_DEVICE(0x0090, 0x5215, iwl6030_mac_cfg)}, {IWL_PCI_DEVICE(0x0090, 0x5216, iwl6030_mac_cfg)}, @@ -181,12 +181,12 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct pci_device_id iwl_hw_card_ids[] = { {IWL_PCI_DEVICE(0x08AE, 0x1027, iwl1000_mac_cfg)}, /* 130 Series WiFi */ - {IWL_PCI_DEVICE(0x0896, 0x5005, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0896, 0x5007, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0897, 0x5015, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0897, 0x5017, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0896, 0x5025, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0896, 0x5027, iwl1000_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5005, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5007, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0897, 0x5015, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0897, 0x5017, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5025, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5027, iwl6030_mac_cfg)}, /* 2x00 Series */ {IWL_PCI_DEVICE(0x0890, 0x4022, iwl2000_mac_cfg)}, From 15f519e9f883b316d86e2bb6b767a023aafd9d83 Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Tue, 12 Aug 2025 09:57:38 +0000 Subject: [PATCH 1263/3206] ceph: fix race condition validating r_parent before applying state Add validation to ensure the cached parent directory inode matches the directory info in MDS replies. This prevents client-side race conditions where concurrent operations (e.g. rename) cause r_parent to become stale between request initiation and reply processing, which could lead to applying state changes to incorrect directory inodes. [ idryomov: folded a kerneldoc fixup and a follow-up fix from Alex to move CEPH_CAP_PIN reference when r_parent is updated: When the parent directory lock is not held, req->r_parent can become stale and is updated to point to the correct inode. 
However, the associated CEPH_CAP_PIN reference was not being adjusted. The CEPH_CAP_PIN is a reference on an inode that is tracked for accounting purposes. Moving this pin is important to keep the accounting balanced. When the pin was not moved from the old parent to the new one, it created two problems: The reference on the old, stale parent was never released, causing a reference leak. A reference for the new parent was never acquired, creating the risk of a reference underflow later in ceph_mdsc_release_request(). This patch corrects the logic by releasing the pin from the old parent and acquiring it for the new parent when r_parent is switched. This ensures reference accounting stays balanced. ] Cc: stable@vger.kernel.org Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 14 ++-- fs/ceph/dir.c | 17 ++--- fs/ceph/file.c | 24 +++--- fs/ceph/inode.c | 7 +- fs/ceph/mds_client.c | 172 ++++++++++++++++++++++++++----------------- fs/ceph/mds_client.h | 18 ++++- 6 files changed, 145 insertions(+), 107 deletions(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fdd404fc81124d..f3fe786b4143d4 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; - int pathlen = 0; - u64 pathbase; char *path; mutex_lock(&mdsc->mutex); @@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p) if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); } else if (req->r_dentry) { - path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_dentry->d_lock); @@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_dentry, path ? path : ""); spin_unlock(&req->r_dentry->d_lock); - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); @@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p) } if (req->r_old_dentry) { - path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_old_dentry->d_lock); @@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_old_dentry, path ? 
path : ""); spin_unlock(&req->r_old_dentry->d_lock); - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8478e7e75df66c..32973c62c1a230 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, /* If op failed, mark everyone involved for errors */ if (result) { - int pathlen = 0; - u64 base = 0; - char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, - &base, 0); + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); /* mark error on parent + clear complete */ mapping_set_error(req->r_parent->i_mapping, result); @@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, mapping_set_error(req->r_old_inode->i_mapping, result); pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<>" : path, result); - ceph_mdsc_free_path(path, pathlen); + path_info.vino.ino, IS_ERR(path) ? "<>" : path, result); + ceph_mdsc_free_path_info(&path_info); } out: iput(req->r_old_inode); @@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) int err = -EROFS; int op; char *path; - int pathlen; - u64 pathbase; if (ceph_snap(dir) == CEPH_SNAPDIR) { /* rmdir .snap/foo is RMSNAP */ @@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) if (!dn) { try_async = false; } else { - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); if (IS_ERR(path)) { try_async = false; err = 0; } else { err = ceph_mds_check_access(mdsc, path, MAY_WRITE); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dn); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c02f100f8552bb..978acd3d4b329c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file) int flags, fmode, wanted; struct dentry *dentry; char *path; - int pathlen; - u64 pathbase; bool do_sync = false; int mask = MAY_READ; @@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file) if (!dentry) { do_sync = true; } else { - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); if (IS_ERR(path)) { do_sync = true; err = 0; } else { err = ceph_mds_check_access(mdsc, path, mask); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dentry); /* For none EACCES cases will let the MDS do the mds auth check */ @@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, mapping_set_error(req->r_parent->i_mapping, result); if (result) { - int pathlen = 0; - u64 base = 0; - char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, - &base, 0); + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); pr_warn_client(cl, "async create failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<>" : path, result); - ceph_mdsc_free_path(path, pathlen); + path_info.vino.ino, IS_ERR(path) ? 
"<>" : path, result); + ceph_mdsc_free_path_info(&path_info); ceph_dir_clear_complete(req->r_parent); if (!d_unhashed(dentry)) @@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, int mask; int err; char *path; - int pathlen; - u64 pathbase; doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n", dir, ceph_vinop(dir), dentry, dentry, @@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (!dn) { try_async = false; } else { - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); if (IS_ERR(path)) { try_async = false; err = 0; @@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, mask |= MAY_WRITE; err = ceph_mds_check_access(mdsc, path, mask); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dn); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fc543075b827a9..8ac89ce6435c9b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2487,22 +2487,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, int truncate_retry = 20; /* The RMW will take around 50ms */ struct dentry *dentry; char *path; - int pathlen; - u64 pathbase; bool do_sync = false; dentry = d_find_alias(inode); if (!dentry) { do_sync = true; } else { - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); if (IS_ERR(path)) { do_sync = true; err = 0; } else { err = ceph_mds_check_access(mdsc, path, MAY_WRITE); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dentry); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0f497c39ff8246..3bc72b47fe4d42 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2681,8 +2681,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) * ceph_mdsc_build_path - build a path string to a given dentry * @mdsc: mds client * @dentry: dentry to which path should be built - * @plen: returned length of string - * @pbase: returned base inode number + * @path_info: output path, length, base ino+snap, and freepath ownership flag * @for_wire: is this path going to be sent to the MDS? * * Build a string that represents the path to the dentry. 
This is mostly called @@ -2700,7 +2699,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) * foo/.snap/bar -> foo//bar */ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, - int *plen, u64 *pbase, int for_wire) + struct ceph_path_info *path_info, int for_wire) { struct ceph_client *cl = mdsc->fsc->client; struct dentry *cur; @@ -2810,16 +2809,28 @@ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); } - *pbase = base; - *plen = PATH_MAX - 1 - pos; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + + path_info->vino.ino = base; + path_info->pathlen = PATH_MAX - 1 - pos; + path_info->path = path + pos; + path_info->freepath = true; + + /* Set snap from dentry if available */ + if (d_inode(dentry)) + path_info->vino.snap = ceph_snap(d_inode(dentry)); + else + path_info->vino.snap = CEPH_NOSNAP; + doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), - base, *plen, path + pos); + base, PATH_MAX - 1 - pos, path + pos); return path + pos; } static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry, - struct inode *dir, const char **ppath, int *ppathlen, - u64 *pino, bool *pfreepath, bool parent_locked) + struct inode *dir, struct ceph_path_info *path_info, + bool parent_locked) { char *path; @@ -2828,41 +2839,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry dir = d_inode_rcu(dentry->d_parent); if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && !IS_ENCRYPTED(dir)) { - *pino = ceph_ino(dir); + path_info->vino.ino = ceph_ino(dir); + path_info->vino.snap = ceph_snap(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + path_info->path = dentry->d_name.name; + path_info->pathlen = dentry->d_name.len; + path_info->freepath = false; return 0; } rcu_read_unlock(); - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = true; + /* + * ceph_mdsc_build_path already fills path_info, including snap handling. + */ return 0; } -static int build_inode_path(struct inode *inode, - const char **ppath, int *ppathlen, u64 *pino, - bool *pfreepath) +static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct dentry *dentry; char *path; if (ceph_snap(inode) == CEPH_NOSNAP) { - *pino = ceph_ino(inode); - *ppathlen = 0; + path_info->vino.ino = ceph_ino(inode); + path_info->vino.snap = ceph_snap(inode); + path_info->pathlen = 0; + path_info->freepath = false; return 0; } dentry = d_find_alias(inode); - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); dput(dentry); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = true; + /* + * ceph_mdsc_build_path already fills path_info, including snap from dentry. + * Override with inode's snap since that's what this function is for. 
+ */ + path_info->vino.snap = ceph_snap(inode); return 0; } @@ -2872,26 +2889,32 @@ static int build_inode_path(struct inode *inode, */ static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, - const char *rpath, u64 rino, const char **ppath, - int *pathlen, u64 *ino, bool *freepath, + const char *rpath, u64 rino, + struct ceph_path_info *path_info, bool parent_locked) { struct ceph_client *cl = mdsc->fsc->client; int r = 0; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + if (rinode) { - r = build_inode_path(rinode, ppath, pathlen, ino, freepath); + r = build_inode_path(rinode, path_info); doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino, - freepath, parent_locked); - doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); + r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked); + doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino, + path_info->pathlen, path_info->path); } else if (rpath || rino) { - *ino = rino; - *ppath = rpath; - *pathlen = rpath ? strlen(rpath) : 0; - doutc(cl, " path %.*s\n", *pathlen, rpath); + path_info->vino.ino = rino; + path_info->vino.snap = CEPH_NOSNAP; + path_info->path = rpath; + path_info->pathlen = rpath ? strlen(rpath) : 0; + path_info->freepath = false; + + doutc(cl, " path %.*s\n", path_info->pathlen, rpath); } return r; @@ -2968,11 +2991,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_request_head_legacy *lhead; - const char *path1 = NULL; - const char *path2 = NULL; - u64 ino1 = 0, ino2 = 0; - int pathlen1 = 0, pathlen2 = 0; - bool freepath1 = false, freepath2 = false; + struct ceph_path_info path_info1 = {0}; + struct ceph_path_info path_info2 = {0}; struct dentry *old_dentry = NULL; int len; u16 releases; @@ -2982,25 +3002,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, u16 request_head_version = mds_supported_head_version(session); kuid_t caller_fsuid = req->r_cred->fsuid; kgid_t caller_fsgid = req->r_cred->fsgid; + bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, - req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1, - test_bit(CEPH_MDS_R_PARENT_LOCKED, - &req->r_req_flags)); + req->r_parent, req->r_path1, req->r_ino1.ino, + &path_info1, parent_locked); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* + * When the parent directory's i_rwsem is *not* locked, req->r_parent may + * have become stale (e.g. after a concurrent rename) between the time the + * dentry was looked up and now. If we detect that the stored r_parent + * does not match the inode number we just encoded for the request, switch + * to the correct inode so that the MDS receives a valid parent reference. 
+ */ + if (!parent_locked && req->r_parent && path_info1.vino.ino && + ceph_ino(req->r_parent) != path_info1.vino.ino) { + struct inode *old_parent = req->r_parent; + struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL); + if (!IS_ERR(correct_dir)) { + WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n", + ceph_ino(old_parent), path_info1.vino.ino); + /* + * Transfer CEPH_CAP_PIN from the old parent to the new one. + * The pin was taken earlier in ceph_mdsc_submit_request(). + */ + ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN); + iput(old_parent); + req->r_parent = correct_dir; + ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + } + } + /* If r_old_dentry is set, then assume that its parent is locked */ if (req->r_old_dentry && !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) old_dentry = req->r_old_dentry; ret = set_request_path_attr(mdsc, NULL, old_dentry, - req->r_old_dentry_dir, - req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2, true); + req->r_old_dentry_dir, + req->r_path2, req->r_ino2.ino, + &path_info2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; @@ -3031,7 +3075,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, /* filepaths */ len += 2 * (1 + sizeof(u32) + sizeof(u64)); - len += pathlen1 + pathlen2; + len += path_info1.pathlen + path_info2.pathlen; /* cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -3039,9 +3083,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, !!req->r_old_inode_drop + !!req->r_old_dentry_drop); if (req->r_dentry_drop) - len += pathlen1; + len += path_info1.pathlen; if (req->r_old_dentry_drop) - len += pathlen2; + len += path_info2.pathlen; /* MClientRequest tail */ @@ -3154,8 +3198,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, lhead->ino = cpu_to_le64(req->r_deleg_ino); lhead->args = req->r_args; - ceph_encode_filepath(&p, end, ino1, path1); - ceph_encode_filepath(&p, end, ino2, path2); + ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path); + ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path); /* make note of release offset, in case we need to replay */ req->r_request_release_offset = p - msg->front.iov_base; @@ -3218,11 +3262,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.data_off = cpu_to_le16(0); out_free2: - if (freepath2) - ceph_mdsc_free_path((char *)path2, pathlen2); + ceph_mdsc_free_path_info(&path_info2); out_free1: - if (freepath1) - ceph_mdsc_free_path((char *)path1, pathlen1); + ceph_mdsc_free_path_info(&path_info1); out: return msg; out_err: @@ -4579,24 +4621,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; struct ceph_cap *cap; - char *path; - int pathlen = 0, err; - u64 pathbase; + struct ceph_path_info path_info = {0}; + int err; u64 snap_follows; dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, recon_state->msg_version >= 2); dput(dentry); if (IS_ERR(path)) { err = PTR_ERR(path); goto out_err; } - } else { - path = NULL; - pathbase = 0; } spin_lock(&ci->i_ceph_lock); @@ -4629,7 +4667,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, 
void *arg) rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.pathbase = cpu_to_le64(path_info.vino.ino); rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { @@ -4644,7 +4682,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) ts = inode_get_atime(inode); ceph_encode_timespec64(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v1.pathbase = cpu_to_le64(pathbase); + rec.v1.pathbase = cpu_to_le64(path_info.vino.ino); } if (list_empty(&ci->i_cap_snaps)) { @@ -4706,7 +4744,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) sizeof(struct ceph_filelock); rec.v2.flock_len = cpu_to_le32(struct_len); - struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); + struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2); if (struct_v >= 2) struct_len += sizeof(u64); /* snap_follows */ @@ -4730,7 +4768,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, struct_len); } - ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); ceph_locks_to_pagelist(flocks, pagelist, num_fcntl_locks, num_flock_locks); @@ -4741,17 +4779,17 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) } else { err = ceph_pagelist_reserve(pagelist, sizeof(u64) + sizeof(u32) + - pathlen + sizeof(rec.v1)); + path_info.pathlen + sizeof(rec.v1)); if (err) goto out_err; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); - ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); } out_err: - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); if (!err) recon_state->nr_caps++; return err; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3e2a6fa7c19aab..0428a5eaf28c65 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); -static inline void ceph_mdsc_free_path(char *path, int len) +/* + * Structure to group path-related output parameters for build_*_path functions + */ +struct ceph_path_info { + const char *path; + int pathlen; + struct ceph_vino vino; + bool freepath; +}; + +static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info) { - if (!IS_ERR_OR_NULL(path)) - __putname(path - (PATH_MAX - 1 - len)); + if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path)) + __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen)); } extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, - struct dentry *dentry, int *plen, u64 *base, + struct dentry *dentry, struct ceph_path_info *path_info, int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); From bec324f33d1ed346394b2eee25bf6dbf3511f727 Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Tue, 12 Aug 2025 09:57:39 +0000 Subject: [PATCH 1264/3206] ceph: fix race condition where r_parent becomes stale before sending message When the parent directory's 
i_rwsem is not locked, req->r_parent may become stale due to concurrent operations (e.g. rename) between dentry lookup and message creation. Validate that r_parent matches the encoded parent inode and update to the correct inode if a mismatch is detected. [ idryomov: folded a follow-up fix from Alex to drop extra reference from ceph_get_reply_dir() in ceph_fill_trace(): ceph_get_reply_dir() may return a different, referenced inode when r_parent is stale and the parent directory lock is not held. ceph_fill_trace() used that inode but failed to drop the reference when it differed from req->r_parent, leaking an inode reference. Keep the directory inode in a local variable and iput() it at function end if it does not match req->r_parent. ] Cc: stable@vger.kernel.org Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 81 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 12 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8ac89ce6435c9b..f67025465de0da 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } +/* + * Check if the parent inode matches the vino from directory reply info + */ +static inline bool ceph_vino_matches_parent(struct inode *parent, + struct ceph_vino vino) +{ + return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap; +} + +/* + * Validate that the directory inode referenced by @req->r_parent matches the + * inode number and snapshot id contained in the reply's directory record. If + * they do not match – which can theoretically happen if the parent dentry was + * moved between the time the request was issued and the reply arrived – fall + * back to looking up the correct inode in the inode cache. + * + * A reference is *always* returned. Callers that receive a different inode + * than the original @parent are responsible for dropping the extra reference + * once the reply has been processed. + */ +static struct inode *ceph_get_reply_dir(struct super_block *sb, + struct inode *parent, + struct ceph_mds_reply_info_parsed *rinfo) +{ + struct ceph_vino vino; + + if (unlikely(!rinfo->diri.in)) + return parent; /* nothing to compare against */ + + /* If we didn't have a cached parent inode to begin with, just bail out. */ + if (!parent) + return NULL; + + vino.ino = le64_to_cpu(rinfo->diri.in->ino); + vino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + if (likely(ceph_vino_matches_parent(parent, vino))) + return parent; /* matches – use the original reference */ + + /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. 
*/ + WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n", + ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap); + + return ceph_get_inode(sb, vino, NULL); +} + /** * ceph_new_inode - allocate a new inode in advance of an expected create * @dir: parent directory for new inode @@ -1523,6 +1569,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) struct ceph_vino tvino, dvino; struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_client *cl = fsc->client; + struct inode *parent_dir = NULL; int err = 0; doutc(cl, "%p is_dentry %d is_target %d\n", req, @@ -1536,10 +1583,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) } if (rinfo->head->is_dentry) { - struct inode *dir = req->r_parent; - - if (dir) { - err = ceph_fill_inode(dir, NULL, &rinfo->diri, + /* + * r_parent may be stale, in cases when R_PARENT_LOCKED is not set, + * so we need to get the correct inode + */ + parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo); + if (unlikely(IS_ERR(parent_dir))) { + err = PTR_ERR(parent_dir); + goto done; + } + if (parent_dir) { + err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, rinfo->dirfrag, session, -1, &req->r_caps_reservation); if (err < 0) @@ -1548,14 +1602,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) WARN_ON_ONCE(1); } - if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && + if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { bool is_nokey = false; struct qstr dname; struct dentry *dn, *parent; struct fscrypt_str oname = FSTR_INIT(NULL, 0); - struct ceph_fname fname = { .dir = dir, + struct ceph_fname fname = { .dir = parent_dir, .name = rinfo->dname, .ctext = rinfo->altname, .name_len = rinfo->dname_len, @@ -1564,10 +1618,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) BUG_ON(!rinfo->head->is_target); BUG_ON(req->r_dentry); - parent = d_find_any_alias(dir); + parent = d_find_any_alias(parent_dir); BUG_ON(!parent); - err = ceph_fname_alloc_buffer(dir, &oname); + err = ceph_fname_alloc_buffer(parent_dir, &oname); if (err < 0) { dput(parent); goto done; @@ -1576,7 +1630,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); if (err < 0) { dput(parent); - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); goto done; } dname.name = oname.name; @@ -1595,7 +1649,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) dname.len, dname.name, dn); if (!dn) { dput(parent); - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); err = -ENOMEM; goto done; } @@ -1610,12 +1664,12 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) ceph_snap(d_inode(dn)) != tvino.snap)) { doutc(cl, " dn %p points to wrong inode %p\n", dn, d_inode(dn)); - ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(parent_dir); d_delete(dn); dput(dn); goto retry_lookup; } - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); req->r_dentry = dn; dput(parent); @@ -1794,6 +1848,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) &dvino, ptvino); } done: + /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */ + if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != 
req->r_parent)) + iput(parent_dir); doutc(cl, "done err=%d\n", err); return err; } From cce7c15faaac79b532a07ed6ab8332280ad83762 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 27 Aug 2025 20:17:08 +0200 Subject: [PATCH 1265/3206] ceph: always call ceph_shift_unused_folios_left() The function ceph_process_folio_batch() sets folio_batch entries to NULL, which is an illegal state. Before folio_batch_release() crashes due to this API violation, the function ceph_shift_unused_folios_left() is supposed to remove those NULLs from the array. However, since commit ce80b76dd327 ("ceph: introduce ceph_process_folio_batch() method"), this shifting doesn't happen anymore because the "for" loop got moved to ceph_process_folio_batch(), and now the `i` variable that remains in ceph_writepages_start() doesn't get incremented anymore, making the shifting effectively unreachable much of the time. Later, commit 1551ec61dc55 ("ceph: introduce ceph_submit_write() method") added more preconditions for doing the shift, replacing the `i` check (with something that is still just as broken): - if ceph_process_folio_batch() fails, shifting never happens - if ceph_move_dirty_page_in_page_array() was never called (because ceph_process_folio_batch() has returned early for some of various reasons), shifting never happens - if `processed_in_fbatch` is zero (because ceph_process_folio_batch() has returned early for some of the reasons mentioned above or because ceph_move_dirty_page_in_page_array() has failed), shifting never happens Since those two commits, any problem in ceph_process_folio_batch() could crash the kernel, e.g. this way: BUG: kernel NULL pointer dereference, address: 0000000000000034 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: Oops: 0002 [#1] SMP NOPTI CPU: 172 UID: 0 PID: 2342707 Comm: kworker/u778:8 Not tainted 6.15.10-cm4all1-es #714 NONE Hardware name: Dell Inc. PowerEdge R7615/0G9DHV, BIOS 1.6.10 12/08/2023 Workqueue: writeback wb_workfn (flush-ceph-1) RIP: 0010:folios_put_refs+0x85/0x140 Code: 83 c5 01 39 e8 7e 76 48 63 c5 49 8b 5c c4 08 b8 01 00 00 00 4d 85 ed 74 05 41 8b 44 ad 00 48 8b 15 b0 > RSP: 0018:ffffb880af8db778 EFLAGS: 00010207 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000003 RDX: ffffe377cc3b0000 RSI: 0000000000000000 RDI: ffffb880af8db8c0 RBP: 0000000000000000 R08: 000000000000007d R09: 000000000102b86f R10: 0000000000000001 R11: 00000000000000ac R12: ffffb880af8db8c0 R13: 0000000000000000 R14: 0000000000000000 R15: ffff9bd262c97000 FS: 0000000000000000(0000) GS:ffff9c8efc303000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000034 CR3: 0000000160958004 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: ceph_writepages_start+0xeb9/0x1410 The crash can be reproduced easily by changing the ceph_check_page_before_write() return value to `-E2BIG`. (Interestingly, the crash happens only if `huge_zero_folio` has already been allocated; without `huge_zero_folio`, is_huge_zero_folio(NULL) returns true and folios_put_refs() skips NULL entries instead of dereferencing them. That makes reproducing the bug somewhat unreliable. See https://lore.kernel.org/20250826231626.218675-1-max.kellermann@ionos.com for a discussion of this detail.) My suggestion is to move the ceph_shift_unused_folios_left() to right after ceph_process_folio_batch() to ensure it always gets called to fix up the illegal folio_batch state. 
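[ editor's note: a minimal, self-contained sketch, for illustration only, of the left-shift compaction that ceph_shift_unused_folios_left() is described as performing; the function and parameter names here are simplified stand-ins, not the kernel's definitions:

        /* Compact NULL holes out of a pointer batch, preserving order. */
        static unsigned int shift_unused_left(void **entries, unsigned int nr)
        {
                unsigned int src, dst = 0;

                for (src = 0; src < nr; src++) {
                        if (entries[src])
                                entries[dst++] = entries[src];
                }
                return dst;     /* new number of valid entries */
        }

  Running such a compaction unconditionally after the producer step, as this patch does for the real helper, keeps the batch free of NULLs no matter which early-return path the producer took. ]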
Cc: stable@vger.kernel.org Fixes: ce80b76dd327 ("ceph: introduce ceph_process_folio_batch() method") Link: https://lore.kernel.org/ceph-devel/aK4v548CId5GIKG1@swift.blarg.de/ Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8b202d789e9350..8bc66b45dade6f 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1687,6 +1687,7 @@ static int ceph_writepages_start(struct address_space *mapping, process_folio_batch: rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); if (rc) goto release_folios; @@ -1695,8 +1696,6 @@ static int ceph_writepages_start(struct address_space *mapping, goto release_folios; if (ceph_wbc.processed_in_fbatch) { - ceph_shift_unused_folios_left(&ceph_wbc.fbatch); - if (folio_batch_count(&ceph_wbc.fbatch) == 0 && ceph_wbc.locked_pages < ceph_wbc.max_pages) { doutc(cl, "reached end fbatch, trying for more\n"); From 249e0a47cdb46bb9eae65511c569044bd8698d7d Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 28 Aug 2025 13:15:52 +0200 Subject: [PATCH 1266/3206] ceph: fix crash after fscrypt_encrypt_pagecache_blocks() error The function move_dirty_folio_in_page_array() was created by commit ce80b76dd327 ("ceph: introduce ceph_process_folio_batch() method") by moving code from ceph_writepages_start() to this function. This new function is supposed to return an error code which is checked by the caller (now ceph_process_folio_batch()), and on error, the caller invokes redirty_page_for_writepage() and then breaks from the loop. However, the refactoring commit has gone wrong and, by accident, it always returns 0 (= success) because it first NULLs the pointer and then returns PTR_ERR(NULL), which is always 0. This means errors are silently ignored, leaving NULL entries in the page array, which may later crash the kernel. The simple solution is to call PTR_ERR() before clearing the pointer. Cc: stable@vger.kernel.org Fixes: ce80b76dd327 ("ceph: introduce ceph_process_folio_batch() method") Link: https://lore.kernel.org/ceph-devel/aK4v548CId5GIKG1@swift.blarg.de/ Signed-off-by: Max Kellermann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8bc66b45dade6f..322ed268f14aa9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1264,7 +1264,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, 0, gfp_flags); if (IS_ERR(pages[index])) { - if (PTR_ERR(pages[index]) == -EINVAL) { + int err = PTR_ERR(pages[index]); + + if (err == -EINVAL) { pr_err_client(cl, "inode->i_blkbits=%hhu\n", inode->i_blkbits); } @@ -1273,7 +1275,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, BUG_ON(ceph_wbc->locked_pages == 0); pages[index] = NULL; - return PTR_ERR(pages[index]); + return err; } } else { pages[index] = &folio->page; From e3c674db356c4303804b2415e7c2b11776cdd8c3 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 4 Sep 2025 14:53:50 +0200 Subject: [PATCH 1267/3206] tunnels: reset the GSO metadata before reusing the skb If a GSO skb is sent through a Geneve tunnel and if Geneve options are added, the split GSO skb might not fit in the MTU anymore and an ICMP frag needed packet can be generated.
In such a case the ICMP packet might go through the segmentation logic (and be dropped) later if it reaches a path where the GSO status is checked and segmentation is required. This is especially true when an OvS bridge is used with a Geneve tunnel attached to it. The following set of actions could lead to the ICMP packet being wrongfully segmented:
1. An skb is constructed by the TCP layer (e.g. gso_type SKB_GSO_TCPV4, segs >= 2).
2. The skb hits the OvS bridge where Geneve options are added by an OvS action before being sent through the tunnel.
3. When the skb is transmitted in the tunnel, the split skb does not fit anymore in the MTU and iptunnel_pmtud_build_icmp is called to generate an ICMP fragmentation needed packet. This is done by reusing the original (GSO!) skb. The GSO metadata is not cleared.
4. The ICMP packet being sent back hits the OvS bridge again and because skb_is_gso returns true, it goes through queue_gso_packets...
5. ...where __skb_gso_segment is called. The skb is then dropped.
6. Note that in the above example on re-transmission the skb won't be a GSO one as it would be segmented (len > MSS) and the ICMP packet should go through.
Fix this by resetting the GSO information before reusing an skb in iptunnel_pmtud_build_icmp and iptunnel_pmtud_build_icmpv6. Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Reported-by: Adrian Moreno Signed-off-by: Antoine Tenart Reviewed-by: Stefano Brivio Link: https://patch.msgid.link/20250904125351.159740-1-atenart@kernel.org Signed-off-by: Paolo Abeni --- net/ipv4/ip_tunnel_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index cc9915543637df..2e61ac1371289a 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -206,6 +206,9 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); @@ -300,6 +303,9 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); From fe2a449a45b13df1562419e0104b4777b6ea5248 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sat, 6 Sep 2025 14:49:51 +0800 Subject: [PATCH 1268/3206] tick: Do not set device to detached state in tick_shutdown() tick_shutdown() sets the state of the clockevent device to detached first and then invokes clockevents_exchange_device(), which in turn invokes clockevents_switch_state(). But clockevents_switch_state() returns without invoking the device shutdown callback as the device is already in the detached state. As a consequence the timer device is not shut down when a CPU goes offline. tick_shutdown() does this because it was originally invoked on an online CPU and not on the outgoing CPU. It therefore could not access the clockevent device of the already offlined CPU and just set the state. Since commit 3b1596a21fbf tick_shutdown() is called on the outgoing CPU, so the hardware device can be accessed.
Remove the state set before calling clockevents_exchange_device(), so that the subsequent clockevents_switch_state() handles the state transition and invokes the shutdown callback of the clockevent device. [ tglx: Massaged change log ] Fixes: 3b1596a21fbf ("clockevents: Shutdown and unregister current clockevents at CPUHP_AP_TICK_DYING") Signed-off-by: Bibo Mao Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20250906064952.3749122-2-maobibo@loongson.cn --- kernel/time/clockevents.c | 2 +- kernel/time/tick-common.c | 16 +++++----------- kernel/time/tick-internal.h | 2 +- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f3e831f62906f1..a59bc75ab7c5b4 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -633,7 +633,7 @@ void tick_offline_cpu(unsigned int cpu) raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); - tick_shutdown(cpu); + tick_shutdown(); /* * Unregister the clock event devices which were diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 9a3859443c042c..7e33d3f2e889b1 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -411,24 +411,18 @@ int tick_cpu_dying(unsigned int dying_cpu) } /* - * Shutdown an event device on a given cpu: + * Shutdown an event device on the outgoing CPU: * - * This is called on a life CPU, when a CPU is dead. So we cannot - * access the hardware device itself. - * We just set the mode and remove it from the lists. + * Called by the dying CPU during teardown, with clockevents_lock held + * and interrupts disabled. */ -void tick_shutdown(unsigned int cpu) +void tick_shutdown(void) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; if (dev) { - /* - * Prevent that the clock events layer tries to call - * the set mode function! - */ - clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index faac36de35b9ef..4e4f7bbe2a64bc 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -26,7 +26,7 @@ extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); extern void tick_offline_cpu(unsigned int cpu); -extern void tick_shutdown(unsigned int cpu); +extern void tick_shutdown(void); extern void tick_suspend(void); extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, From d55dd6f5e47dd85ab2086b62f7a747d3f8be0fa0 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sat, 6 Sep 2025 14:49:52 +0800 Subject: [PATCH 1269/3206] LoongArch: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. 
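[ editor's note: a minimal sketch, for illustration only, of why the DETACHED pre-set removed by the previous patch suppressed the shutdown callback, and hence why the explicit arch-level shutdown call removed here became redundant. A switch-state helper that returns early when the device already reports the requested state never reaches the hardware hook; the types and names below are simplified stand-ins, not the kernel's clockevents implementation:

        enum ce_state { CE_DETACHED, CE_SHUTDOWN, CE_PERIODIC, CE_ONESHOT };

        struct clock_event {
                enum ce_state state;
                void (*shutdown)(struct clock_event *ce);
        };

        static void switch_state(struct clock_event *ce, enum ce_state to)
        {
                if (ce->state == to)    /* already there, nothing to do */
                        return;
                if ((to == CE_SHUTDOWN || to == CE_DETACHED) && ce->shutdown)
                        ce->shutdown(ce);       /* quiesce the hardware */
                ce->state = to;
        }

  Pre-setting ce->state to CE_DETACHED before requesting the detached state made the early return fire, so the timer hardware was never quiesced; with the pre-set gone, the transition runs and the core shuts the device down itself. ]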
Signed-off-by: Bibo Mao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250906064952.3749122-3-maobibo@loongson.cn --- arch/loongarch/kernel/time.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index f3092f2de8b501..6fb92cc1a4c92a 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -112,8 +112,6 @@ static int arch_timer_starting(unsigned int cpu) static int arch_timer_dying(unsigned int cpu) { - constant_set_state_shutdown(this_cpu_ptr(&constant_clockevent_device)); - /* Clear Timer Interrupt */ write_csr_tintclear(CSR_TINTCLR_TI); From 690aa09b1845c0d5c3c29dabd50a9d0488c97c48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Tue, 9 Sep 2025 11:28:29 +0200 Subject: [PATCH 1270/3206] ASoC: Intel: catpt: Expose correct bit depth to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the wrong bit depth is exposed in hw params, causing clipped volume during playback. Expose the correct parameters. Fixes: a126750fc865 ("ASoC: Intel: catpt: PCM operations") Reported-by: Andy Shevchenko Tested-by: Andy Shevchenko Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Message-ID: <20250909092829.375953-1-amadeuszx.slawinski@linux.intel.com> Signed-off-by: Mark Brown --- sound/soc/intel/catpt/pcm.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/sound/soc/intel/catpt/pcm.c b/sound/soc/intel/catpt/pcm.c index 46acb7fdc547d8..bf734c69c4e095 100644 --- a/sound/soc/intel/catpt/pcm.c +++ b/sound/soc/intel/catpt/pcm.c @@ -568,8 +568,9 @@ static const struct snd_pcm_hardware catpt_pcm_hardware = { SNDRV_PCM_INFO_RESUME | SNDRV_PCM_INFO_NO_PERIOD_WAKEUP, .formats = SNDRV_PCM_FMTBIT_S16_LE | - SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, .period_bytes_min = PAGE_SIZE, .period_bytes_max = CATPT_BUFFER_MAX_SIZE / CATPT_PCM_PERIODS_MIN, .periods_min = CATPT_PCM_PERIODS_MIN, @@ -698,14 +699,18 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_48000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, .capture = { .stream_name = "Analog Capture", .channels_min = 2, .channels_max = 4, .rates = SNDRV_PCM_RATE_48000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { @@ -717,7 +722,9 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_8000_192000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { @@ -729,7 +736,9 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_8000_192000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { @@ -741,7 +750,9 @@
static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_48000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { From 89ace3acf013eebb299290f41b4ac34bf6c62a10 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 9 Sep 2025 12:30:34 +0100 Subject: [PATCH 1271/3206] ASoC: cs-amp-lib: Rename defines for Cirrus Logic EFI Rename: CS_AMP_CAL_NAME => CIRRUS_LOGIC_CALIBRATION_EFI_NAME CS_AMP_CAL_GUID => CIRRUS_LOGIC_CALIBRATION_EFI_GUID This is to clarify that these are specific to Cirrus Logic, especially the GUID. As defined by the UEFI specification the GUID is a vendor identifier, not an EFI variable identifier. There has been some misunderstanding of the purpose of these, which has led to the Cirrus Logic GUID value being copied and used for other vendors' EFI variables. It is rather strange to have data from another vendor marked with the vendor GUID for Cirrus Logic. Signed-off-by: Richard Fitzgerald Message-ID: <20250909113039.922065-2-rf@opensource.cirrus.com> Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index 808e67c90f7c72..b1530e7c75e897 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -16,11 +16,10 @@ #include #include -#define CS_AMP_CAL_GUID \ +#define CIRRUS_LOGIC_CALIBRATION_EFI_NAME L"CirrusSmartAmpCalibrationData" +#define CIRRUS_LOGIC_CALIBRATION_EFI_GUID \ EFI_GUID(0x02f9af02, 0x7734, 0x4233, 0xb4, 0x3d, 0x93, 0xfe, 0x5a, 0xa3, 0x5d, 0xb3) -#define CS_AMP_CAL_NAME L"CirrusSmartAmpCalibrationData" - static int cs_amp_write_cal_coeff(struct cs_dsp *dsp, const struct cirrus_amp_cal_controls *controls, const char *ctl_name, u32 val) @@ -124,7 +123,9 @@ static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev) int ret; /* Get real size of UEFI variable */ - status = cs_amp_get_efi_variable(CS_AMP_CAL_NAME, &CS_AMP_CAL_GUID, &data_size, NULL); + status = cs_amp_get_efi_variable(CIRRUS_LOGIC_CALIBRATION_EFI_NAME, + &CIRRUS_LOGIC_CALIBRATION_EFI_GUID, + &data_size, NULL); if (status != EFI_BUFFER_TOO_SMALL) return ERR_PTR(-ENOENT); @@ -138,7 +139,9 @@ static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev) if (!data) return ERR_PTR(-ENOMEM); - status = cs_amp_get_efi_variable(CS_AMP_CAL_NAME, &CS_AMP_CAL_GUID, &data_size, data); + status = cs_amp_get_efi_variable(CIRRUS_LOGIC_CALIBRATION_EFI_NAME, + &CIRRUS_LOGIC_CALIBRATION_EFI_GUID, + &data_size, data); if (status != EFI_SUCCESS) { ret = -EINVAL; goto err; From 267b9cdee522d03f95acf7c77de91056a4e004b3 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 9 Sep 2025 12:30:35 +0100 Subject: [PATCH 1272/3206] ASoC: cs-amp-lib: Add handling for Lenovo and HP UEFI speaker ID Add handling of the Lenovo-specific and HP-specific EFI variables for speaker ID. Future Lenovo and HP models will not give the codec driver access to the speaker detect GPIO. Instead, the BIOS will read the GPIO and create an EFI variable with a value indicating the state of the GPIO. The Lenovo and HP EFI variables are both defined to have only two valid values. But the variable name, GUID and values are different. This adds a new exported function cs_amp_get_vendor_spkid().
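[ editor's note: a distilled, userspace-style restatement of the table-driven lookup the diff below adds, for illustration only. The variable names and byte values come from the patch (Lenovo "SdwSpeaker" with 0xd0/0xd1, HP "HPSpeakerID" with 0x30/0x31); the struct and function names are simplified stand-ins:

        struct spkid_map {
                const char *efi_var;            /* vendor EFI variable name */
                unsigned char values[2];        /* byte value -> speaker ID 0/1 */
        };

        static const struct spkid_map maps[] = {
                { "SdwSpeaker",  { 0xd0, 0xd1 } },      /* Lenovo */
                { "HPSpeakerID", { 0x30, 0x31 } },      /* HP */
        };

        /* Return 0 or 1 for a recognized byte, -1 for anything else. */
        static int spkid_from_byte(const struct spkid_map *m, unsigned char b)
        {
                int i;

                for (i = 0; i < 2; i++) {
                        if (m->values[i] == b)
                                return i;
                }
                return -1;
        }

  Each vendor entry is tried in turn; a byte outside the two defined values is reported as an error rather than being guessed at. ]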
Signed-off-by: Richard Fitzgerald Message-ID: <20250909113039.922065-3-rf@opensource.cirrus.com> Signed-off-by: Mark Brown --- include/sound/cs-amp-lib.h | 1 + sound/soc/codecs/cs-amp-lib.c | 101 ++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h index 5459c221badfed..43a87a39110c0c 100644 --- a/include/sound/cs-amp-lib.h +++ b/include/sound/cs-amp-lib.h @@ -49,6 +49,7 @@ int cs_amp_write_cal_coeffs(struct cs_dsp *dsp, const struct cirrus_amp_cal_data *data); int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_index, struct cirrus_amp_cal_data *out_data); +int cs_amp_get_vendor_spkid(struct device *dev); struct cs_amp_test_hooks { efi_status_t (*get_efi_variable)(efi_char16_t *name, diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index b1530e7c75e897..9b51d056d863b7 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -20,6 +20,14 @@ #define CIRRUS_LOGIC_CALIBRATION_EFI_GUID \ EFI_GUID(0x02f9af02, 0x7734, 0x4233, 0xb4, 0x3d, 0x93, 0xfe, 0x5a, 0xa3, 0x5d, 0xb3) +#define LENOVO_SPEAKER_ID_EFI_NAME L"SdwSpeaker" +#define LENOVO_SPEAKER_ID_EFI_GUID \ + EFI_GUID(0x48df970e, 0xe27f, 0x460a, 0xb5, 0x86, 0x77, 0x19, 0x80, 0x1d, 0x92, 0x82) + +#define HP_SPEAKER_ID_EFI_NAME L"HPSpeakerID" +#define HP_SPEAKER_ID_EFI_GUID \ + EFI_GUID(0xc49593a4, 0xd099, 0x419b, 0xa2, 0xc3, 0x67, 0xe9, 0x80, 0xe6, 0x1d, 0x1e) + static int cs_amp_write_cal_coeff(struct cs_dsp *dsp, const struct cirrus_amp_cal_controls *controls, const char *ctl_name, u32 val) @@ -114,6 +122,24 @@ static efi_status_t cs_amp_get_efi_variable(efi_char16_t *name, return EFI_NOT_FOUND; } +static int cs_amp_convert_efi_status(efi_status_t status) +{ + switch (status) { + case EFI_SUCCESS: + return 0; + case EFI_NOT_FOUND: + return -ENOENT; + case EFI_BUFFER_TOO_SMALL: + return -EFBIG; + case EFI_UNSUPPORTED: + case EFI_ACCESS_DENIED: + case EFI_SECURITY_VIOLATION: + return -EACCES; + default: + return -EIO; + } +} + static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev) { struct cirrus_amp_efi_data *efi_data; @@ -276,6 +302,81 @@ int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_ } EXPORT_SYMBOL_NS_GPL(cs_amp_get_efi_calibration_data, "SND_SOC_CS_AMP_LIB"); +struct cs_amp_spkid_efi { + efi_char16_t *name; + efi_guid_t *guid; + u8 values[2]; +}; + +static int cs_amp_get_efi_byte_spkid(struct device *dev, const struct cs_amp_spkid_efi *info) +{ + efi_status_t status; + unsigned long size; + u8 spkid; + int i, ret; + + size = sizeof(spkid); + status = cs_amp_get_efi_variable(info->name, info->guid, &size, &spkid); + ret = cs_amp_convert_efi_status(status); + if (ret < 0) + return ret; + + if (size == 0) + return -ENOENT; + + for (i = 0; i < ARRAY_SIZE(info->values); i++) { + if (info->values[i] == spkid) + return i; + } + + dev_err(dev, "EFI speaker ID bad value %#x\n", spkid); + + return -EINVAL; +} + +static const struct cs_amp_spkid_efi cs_amp_spkid_byte_types[] = { + { + .name = LENOVO_SPEAKER_ID_EFI_NAME, + .guid = &LENOVO_SPEAKER_ID_EFI_GUID, + .values = { 0xd0, 0xd1 }, + }, + { + .name = HP_SPEAKER_ID_EFI_NAME, + .guid = &HP_SPEAKER_ID_EFI_GUID, + .values = { 0x30, 0x31 }, + }, +}; + +/** + * cs_amp_get_vendor_spkid - get a speaker ID from vendor-specific storage + * @dev: pointer to struct device + * + * Known vendor-specific methods of speaker ID are checked and if one is + * found its speaker ID value is 
returned. + * + * Return: >=0 is a valid speaker ID. -ENOENT if a vendor-specific method + * was not found. -EACCES if the vendor-specific storage could not + * be read. Other error values indicate that the data from the + * vendor-specific storage was found but could not be understood. + */ +int cs_amp_get_vendor_spkid(struct device *dev) +{ + int i, ret; + + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE) && + !IS_ENABLED(CONFIG_SND_SOC_CS_AMP_LIB_TEST)) + return -ENOENT; + + for (i = 0; i < ARRAY_SIZE(cs_amp_spkid_byte_types); i++) { + ret = cs_amp_get_efi_byte_spkid(dev, &cs_amp_spkid_byte_types[i]); + if (ret != -ENOENT) + return ret; + } + + return -ENOENT; +} +EXPORT_SYMBOL_NS_GPL(cs_amp_get_vendor_spkid, "SND_SOC_CS_AMP_LIB"); + static const struct cs_amp_test_hooks cs_amp_test_hook_ptrs = { .get_efi_variable = cs_amp_get_efi_variable, .write_cal_coeff = cs_amp_write_cal_coeff, From 59255cfa4a0aa3484739ea58d67e02651424ffe3 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 9 Sep 2025 12:30:36 +0100 Subject: [PATCH 1273/3206] ASoC: cs35l56: Check for vendor-specific speaker ID value Call cs_amp_get_vendor_spkid() to get a vendor-specific speaker ID value, if one exists. The speaker ID is used to load an appropriate set of firmware files for the speakers, and is usually read from a GPIO. Some manufacturers are instead using a custom UEFI variable for the speaker ID. Signed-off-by: Richard Fitzgerald Message-ID: <20250909113039.922065-4-rf@opensource.cirrus.com> Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56-shared.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index 850fcf38599681..95d018ecb95386 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -1054,7 +1054,17 @@ int cs35l56_get_speaker_id(struct cs35l56_base *cs35l56_base) u32 speaker_id; int i, ret; - /* Attempt to read the speaker type from a device property first */ + /* Check for vendor-specific speaker ID method */ + ret = cs_amp_get_vendor_spkid(cs35l56_base->dev); + if (ret >= 0) { + dev_dbg(cs35l56_base->dev, "Vendor Speaker ID = %d\n", ret); + return ret; + } else if (ret != -ENOENT) { + dev_err(cs35l56_base->dev, "Error getting vendor Speaker ID: %d\n", ret); + return ret; + } + + /* Attempt to read the speaker type from a device property */ ret = device_property_read_u32(cs35l56_base->dev, "cirrus,speaker-id", &speaker_id); if (!ret) { dev_dbg(cs35l56_base->dev, "Speaker ID = %d\n", speaker_id); From 7a4e5f4c663319f1e0c990ca62e3bb8d3d215c93 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 9 Sep 2025 12:30:37 +0100 Subject: [PATCH 1274/3206] ASoC: cs-amp-lib-test: Add tests for cs_amp_get_vendor_spkid() Add test cases for cs_amp_get_vendor_spkid() for the Lenovo speaker ID UEFI variable.
This is a simple set of cases:
- EFI variable is not found
- EFI variable size is larger than the expected 1 byte
- EFI variable has invalid value 0
- Lenovo UEFI variable valid values are interpreted correctly
- HP UEFI variable valid values are interpreted correctly
Signed-off-by: Richard Fitzgerald Message-ID: <20250909113039.922065-5-rf@opensource.cirrus.com> Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib-test.c | 196 +++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/sound/soc/codecs/cs-amp-lib-test.c b/sound/soc/codecs/cs-amp-lib-test.c index f53650128fc3d2..e7492afa041e92 100644 --- a/sound/soc/codecs/cs-amp-lib-test.c +++ b/sound/soc/codecs/cs-amp-lib-test.c @@ -19,6 +19,14 @@ #include #include +#define LENOVO_SPEAKER_ID_EFI_NAME L"SdwSpeaker" +#define LENOVO_SPEAKER_ID_EFI_GUID \ + EFI_GUID(0x48df970e, 0xe27f, 0x460a, 0xb5, 0x86, 0x77, 0x19, 0x80, 0x1d, 0x92, 0x82) + +#define HP_SPEAKER_ID_EFI_NAME L"HPSpeakerID" +#define HP_SPEAKER_ID_EFI_GUID \ + EFI_GUID(0xc49593a4, 0xd099, 0x419b, 0xa2, 0xc3, 0x67, 0xe9, 0x80, 0xe6, 0x1d, 0x1e) + KUNIT_DEFINE_ACTION_WRAPPER(faux_device_destroy_wrapper, faux_device_destroy, struct faux_device *) @@ -642,6 +650,185 @@ static void cs_amp_lib_test_write_cal_data_test(struct kunit *test) KUNIT_EXPECT_EQ(test, entry->value, data.calStatus); } +static void cs_amp_lib_test_spkid_lenovo_not_present(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_none); + + KUNIT_EXPECT_EQ(test, -ENOENT, cs_amp_get_vendor_spkid(dev)); +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_d0(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + struct kunit *test = kunit_get_current_test(); + + if (efi_guidcmp(*guid, LENOVO_SPEAKER_ID_EFI_GUID) || + memcmp(name, LENOVO_SPEAKER_ID_EFI_NAME, sizeof(LENOVO_SPEAKER_ID_EFI_NAME))) + return EFI_NOT_FOUND; + + KUNIT_ASSERT_EQ(test, *size, 1); + *size = 1; + *(u8 *)buf = 0xd0; + + return EFI_SUCCESS; +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_d1(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + struct kunit *test = kunit_get_current_test(); + + if (efi_guidcmp(*guid, LENOVO_SPEAKER_ID_EFI_GUID) || + memcmp(name, LENOVO_SPEAKER_ID_EFI_NAME, sizeof(LENOVO_SPEAKER_ID_EFI_NAME))) + return EFI_NOT_FOUND; + + KUNIT_ASSERT_EQ(test, *size, 1); + *size = 1; + *(u8 *)buf = 0xd1; + + return EFI_SUCCESS; +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_00(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + struct kunit *test = kunit_get_current_test(); + + KUNIT_ASSERT_EQ(test, 0, efi_guidcmp(*guid, LENOVO_SPEAKER_ID_EFI_GUID)); + KUNIT_ASSERT_EQ(test, *size, 1); + *size = 1; + *(u8 *)buf = 0; + + return EFI_SUCCESS; +} + +static void cs_amp_lib_test_spkid_lenovo_d0(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_lenovo_d0); + + KUNIT_EXPECT_EQ(test, 0, cs_amp_get_vendor_spkid(dev)); +} + +static void cs_amp_lib_test_spkid_lenovo_d1(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test,
cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_lenovo_d1); + + KUNIT_EXPECT_EQ(test, 1, cs_amp_get_vendor_spkid(dev)); +} + +static void cs_amp_lib_test_spkid_lenovo_illegal(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_lenovo_00); + + KUNIT_EXPECT_LT(test, cs_amp_get_vendor_spkid(dev), 0); +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_buf_too_small(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + return EFI_BUFFER_TOO_SMALL; +} + +static void cs_amp_lib_test_spkid_lenovo_oversize(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_buf_too_small); + + KUNIT_EXPECT_LT(test, cs_amp_get_vendor_spkid(dev), 0); +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_hp_30(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + struct kunit *test = kunit_get_current_test(); + + if (efi_guidcmp(*guid, HP_SPEAKER_ID_EFI_GUID) || + memcmp(name, HP_SPEAKER_ID_EFI_NAME, sizeof(HP_SPEAKER_ID_EFI_NAME))) + return EFI_NOT_FOUND; + + KUNIT_ASSERT_EQ(test, *size, 1); + *size = 1; + *(u8 *)buf = 0x30; + + return EFI_SUCCESS; +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_hp_31(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + struct kunit *test = kunit_get_current_test(); + + if (efi_guidcmp(*guid, HP_SPEAKER_ID_EFI_GUID) || + memcmp(name, HP_SPEAKER_ID_EFI_NAME, sizeof(HP_SPEAKER_ID_EFI_NAME))) + return EFI_NOT_FOUND; + + KUNIT_ASSERT_EQ(test, *size, 1); + *size = 1; + *(u8 *)buf = 0x31; + + return EFI_SUCCESS; +} + +static void cs_amp_lib_test_spkid_hp_30(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_hp_30); + + KUNIT_EXPECT_EQ(test, 0, cs_amp_get_vendor_spkid(dev)); +} + +static void cs_amp_lib_test_spkid_hp_31(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_hp_31); + + KUNIT_EXPECT_EQ(test, 1, cs_amp_get_vendor_spkid(dev)); +} + static int cs_amp_lib_test_case_init(struct kunit *test) { struct cs_amp_lib_test_priv *priv; @@ -737,6 +924,15 @@ static struct kunit_case cs_amp_lib_test_cases[] = { /* Tests for writing calibration data */ KUNIT_CASE(cs_amp_lib_test_write_cal_data_test), + /* Test cases for speaker ID */ + KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_not_present), + KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_d0), + KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_d1), + KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_illegal), + KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_oversize), + KUNIT_CASE(cs_amp_lib_test_spkid_hp_30), + KUNIT_CASE(cs_amp_lib_test_spkid_hp_31), + { } /* terminator */ }; From b78dd64208a83f66fc21951bd8758281a7d445a5 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 9 Sep 2025 12:30:38 +0100 Subject: [PATCH 1275/3206] ASoC: cs-amp-lib: Add HP-specific EFI variable for calibration data Search for an HP-specific EFI variable for calibration 
before falling back to the normal Cirrus Logic EFI variable. Future HP models will use an HP-defined EFI variable for storage of amp calibration data. The content is the same as the normal Cirrus Logic EFI variable. The first step in cs_amp_get_cal_efi_buffer() is to get the size of the EFI variable, so this has been made a loop that walks through an array of possible variables. A small change is needed to the KUnit test, which is included in this patch. Originally the cs_amp_lib_test_get_efi_variable() hook function asserted that the passed name and GUID matched the Cirrus Logic EFI variable. Obviously this will fail because the code now tries the HP definition first. The function has been changed to return EFI_NOT_FOUND instead, which emulates the normal behaviour of trying to get the HP variable on a system that has the Cirrus variable. Signed-off-by: Richard Fitzgerald Message-ID: <20250909113039.922065-6-rf@opensource.cirrus.com> Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib-test.c | 5 ++-- sound/soc/codecs/cs-amp-lib.c | 37 ++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/sound/soc/codecs/cs-amp-lib-test.c b/sound/soc/codecs/cs-amp-lib-test.c index e7492afa041e92..c090498cbf7820 100644 --- a/sound/soc/codecs/cs-amp-lib-test.c +++ b/sound/soc/codecs/cs-amp-lib-test.c @@ -204,8 +204,9 @@ static efi_status_t cs_amp_lib_test_get_efi_variable(efi_char16_t *name, KUNIT_EXPECT_NOT_ERR_OR_NULL(test, guid); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, size); - KUNIT_EXPECT_MEMEQ(test, name, expected_name, sizeof(expected_name)); - KUNIT_EXPECT_MEMEQ(test, guid, &expected_guid, sizeof(expected_guid)); + if (memcmp(name, expected_name, sizeof(expected_name)) || + efi_guidcmp(*guid, expected_guid)) + return -EFI_NOT_FOUND; if (!buf) { *size = priv->cal_blob->size; diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index 9b51d056d863b7..8434d5196107e9 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -28,6 +28,24 @@ #define HP_SPEAKER_ID_EFI_GUID \ EFI_GUID(0xc49593a4, 0xd099, 0x419b, 0xa2, 0xc3, 0x67, 0xe9, 0x80, 0xe6, 0x1d, 0x1e) +#define HP_CALIBRATION_EFI_NAME L"SmartAmpCalibrationData" +#define HP_CALIBRATION_EFI_GUID \ + EFI_GUID(0x53559579, 0x8753, 0x4f5c, 0x91, 0x30, 0xe8, 0x2a, 0xcf, 0xb8, 0xd8, 0x93) + +static const struct cs_amp_lib_cal_efivar { + efi_char16_t *name; + efi_guid_t *guid; +} cs_amp_lib_cal_efivars[] = { + { + .name = HP_CALIBRATION_EFI_NAME, + .guid = &HP_CALIBRATION_EFI_GUID, + }, + { + .name = CIRRUS_LOGIC_CALIBRATION_EFI_NAME, + .guid = &CIRRUS_LOGIC_CALIBRATION_EFI_GUID, + }, +}; + static int cs_amp_write_cal_coeff(struct cs_dsp *dsp, const struct cirrus_amp_cal_controls *controls, const char *ctl_name, u32 val) @@ -146,12 +164,17 @@ static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev) unsigned long data_size = 0; u8 *data; efi_status_t status; - int ret; + int i, ret; + + /* Find EFI variable and get size */ + for (i = 0; i < ARRAY_SIZE(cs_amp_lib_cal_efivars); i++) { + status = cs_amp_get_efi_variable(cs_amp_lib_cal_efivars[i].name, + cs_amp_lib_cal_efivars[i].guid, + &data_size, NULL); + if (status == EFI_BUFFER_TOO_SMALL) + break; + } - /* Get real size of UEFI variable */ - status = cs_amp_get_efi_variable(CIRRUS_LOGIC_CALIBRATION_EFI_NAME, - &CIRRUS_LOGIC_CALIBRATION_EFI_GUID, - &data_size, NULL); if (status != EFI_BUFFER_TOO_SMALL) return ERR_PTR(-ENOENT); @@ -165,8 +188,8 @@ static struct cirrus_amp_efi_data 
*cs_amp_get_cal_efi_buffer(struct device *dev) if (!data) return ERR_PTR(-ENOMEM); - status = cs_amp_get_efi_variable(CIRRUS_LOGIC_CALIBRATION_EFI_NAME, - &CIRRUS_LOGIC_CALIBRATION_EFI_GUID, + status = cs_amp_get_efi_variable(cs_amp_lib_cal_efivars[i].name, + cs_amp_lib_cal_efivars[i].guid, &data_size, data); if (status != EFI_SUCCESS) { ret = -EINVAL; From e5b4ad2183f7ab18aaf7c73a120d17241ee58e97 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 9 Sep 2025 12:30:39 +0100 Subject: [PATCH 1276/3206] ASoC: cs-amp-lib-test: Add test for getting cal data from HP EFI Add a test case that cs_amp_get_efi_calibration_data() returns data when it is in the HP-specific EFI variable. This uses a redirected implementation of cs_amp_get_efi_variable() that only returns data if the passed name and GUID match the HP EFI variable. A simple test case installs this function hook and then checks that calibration data is returned. Signed-off-by: Richard Fitzgerald Message-ID: <20250909113039.922065-7-rf@opensource.cirrus.com> Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib-test.c | 51 ++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/sound/soc/codecs/cs-amp-lib-test.c b/sound/soc/codecs/cs-amp-lib-test.c index c090498cbf7820..2fde8430933830 100644 --- a/sound/soc/codecs/cs-amp-lib-test.c +++ b/sound/soc/codecs/cs-amp-lib-test.c @@ -220,6 +220,56 @@ static efi_status_t cs_amp_lib_test_get_efi_variable(efi_char16_t *name, return EFI_SUCCESS; } +static efi_status_t cs_amp_lib_test_get_hp_cal_efi_variable(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + static const efi_char16_t expected_name[] = L"SmartAmpCalibrationData"; + static const efi_guid_t expected_guid = + EFI_GUID(0x53559579, 0x8753, 0x4f5c, 0x91, 0x30, 0xe8, 0x2a, 0xcf, 0xb8, 0xd8, 0x93); + struct kunit *test = kunit_get_current_test(); + struct cs_amp_lib_test_priv *priv = test->priv; + + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, name); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, guid); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, size); + + if (memcmp(name, expected_name, sizeof(expected_name)) || + efi_guidcmp(*guid, expected_guid)) + return -EFI_NOT_FOUND; + + if (!buf) { + *size = priv->cal_blob->size; + return EFI_BUFFER_TOO_SMALL; + } + + KUNIT_ASSERT_GE_MSG(test, ksize(buf), priv->cal_blob->size, "Buffer to small"); + + memcpy(buf, priv->cal_blob, priv->cal_blob->size); + + return EFI_SUCCESS; +} + +/* Get cal data block from HP variable. */ +static void cs_amp_lib_test_get_hp_efi_cal(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct cirrus_amp_cal_data result_data; + int ret; + + cs_amp_lib_test_init_dummy_cal_blob(test, 2); + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_hp_cal_efi_variable); + + ret = cs_amp_get_efi_calibration_data(&priv->amp_dev->dev, 0, 0, &result_data); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_MEMEQ(test, &result_data, &priv->cal_blob->data[0], sizeof(result_data)); +} + /* Get cal data block for a given amp, matched by target UID. 
*/ static void cs_amp_lib_test_get_efi_cal_by_uid_test(struct kunit *test) { @@ -910,6 +960,7 @@ static struct kunit_case cs_amp_lib_test_cases[] = { KUNIT_CASE(cs_amp_lib_test_get_efi_cal_no_uid_index_not_found_test), KUNIT_CASE(cs_amp_lib_test_get_efi_cal_no_uid_no_index_test), KUNIT_CASE(cs_amp_lib_test_get_efi_cal_zero_not_matched_test), + KUNIT_CASE(cs_amp_lib_test_get_hp_efi_cal), KUNIT_CASE_PARAM(cs_amp_lib_test_get_efi_cal_by_uid_test, cs_amp_lib_test_get_cal_gen_params), KUNIT_CASE_PARAM(cs_amp_lib_test_get_efi_cal_by_index_unchecked_test, From e895f8e29119c8c966ea794af9e9100b10becb88 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Tue, 5 Aug 2025 16:10:25 +0800 Subject: [PATCH 1277/3206] hrtimers: Unconditionally update target CPU base after offline timer migration When testing softirq based hrtimers on an ARM32 board, with high resolution mode and NOHZ inactive, softirq based hrtimers fail to expire after being moved away from an offline CPU: CPU0 CPU1 hrtimer_start(..., HRTIMER_MODE_SOFT); cpu_down(CPU1) ... hrtimers_cpu_dying() // Migrate timers to CPU0 smp_call_function_single(CPU0, retrigger_next_event); retrigger_next_event() if (!highres && !nohz) return; As retrigger_next_event() is a NOOP when both high resolution timers and NOHZ are inactive, CPU0's hrtimer_cpu_base::softirq_expires_next is not updated and the migrated softirq timers never expire unless there is a softirq based hrtimer queued on CPU0 later. Fix this by removing the hrtimer_hres_active() and tick_nohz_active() check in retrigger_next_event(), which enforces a full update of the CPU base. As this is not a fast path, the extra cost does not matter. [ tglx: Massaged change log ] Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Co-developed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Xiongfeng Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250805081025.54235-1-wangxiongfeng2@huawei.com --- kernel/time/hrtimer.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c0a..e8c479329282f9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -787,10 +787,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -2295,11 +2295,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. - */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); From b9aa93aa5185aee76c4c7a5ba4432b4d0d15f797 Mon Sep 17 00:00:00 2001 From: Jiri Wiesner Date: Thu, 31 Jul 2025 18:18:37 +0200 Subject: [PATCH 1278/3206] clocksource: Print durations for sync check unconditionally A typical set of messages that gets printed as a result of the clocksource watchdog finding the TSC unstable usually does not contain messages indicating CPUs being ahead of or behind the CPU from which the check is carried out. 
That fact suggests that the TSC does not experience time skew between CPUs (if the clocksource.verify_n_cpus parameter is set to a negative value) but quantitative information is missing. The cs_nsec_max value printed by the "CPU %d check durations" message actually provides a worst case estimate of the time skew. If all CPUs have been checked, the cs_nsec_max value multiplied by 2 is the maximum possible time skew between the TSCs of any two CPUs on the system. The worst case estimate is derived from two boundary cases: 1. No time is consumed to execute instructions between csnow_begin and csnow_mid while all the cs_nsec_max time is consumed by the code between csnow_mid and csnow_end. In this case, the maximum undetectable time skew of a CPU being ahead would be cs_nsec_max. 2. All the cs_nsec_max time is consumed to execute instructions between csnow_begin and csnow_mid while no time is consumed by the code between csnow_mid and csnow_end. In this case, the maximum undetectable time skew of a CPU being behind would be cs_nsec_max. The worst case estimate assumes a system experiencing a corner case consisting of the two boundary cases. Always print the "CPU %d check durations" message so that the maximum possible time skew measured by the TSC sync check can be compared to the time skew measured by the clocksource watchdog. Signed-off-by: Jiri Wiesner Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Link: https://lore.kernel.org/all/aIuXXfdITXdI0lLp@incl --- kernel/time/clocksource.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 0aef0e349e49c5..3edb01db3aa146 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -407,9 +407,8 @@ void clocksource_verify_percpu(struct clocksource *cs) if (!cpumask_empty(&cpus_behind)) pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", cpumask_pr_args(&cpus_behind), testcpu, cs->name); - if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind)) - pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", - testcpu, cs_nsec_min, cs_nsec_max, cs->name); + pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", + testcpu, cs_nsec_min, cs_nsec_max, cs->name); } EXPORT_SYMBOL_GPL(clocksource_verify_percpu); From 641427d5bf90af0625081bf27555418b101274cd Mon Sep 17 00:00:00 2001 From: Alex Tran Date: Wed, 3 Sep 2025 20:17:09 -0700 Subject: [PATCH 1279/3206] docs: networking: can: change bcm_msg_head frames member to support flexible array The documentation of the 'bcm_msg_head' struct does not match how it is defined in 'bcm.h'. Changed the frames member to a flexible array, matching the definition in the header file. 
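To make the new layout concrete, here is a minimal userspace sketch of allocating such a flexible-array message; alloc_bcm_msg() is a hypothetical helper for illustration, not part of the UAPI:

#include <stdlib.h>
#include <linux/can/bcm.h>

/* With a flexible array member, sizeof(struct bcm_msg_head) covers only
 * the fixed header; storage for the frames is appended explicitly. */
static struct bcm_msg_head *alloc_bcm_msg(unsigned int nframes)
{
	struct bcm_msg_head *msg;

	msg = calloc(1, sizeof(*msg) + nframes * sizeof(struct can_frame));
	if (!msg)
		return NULL;

	msg->nframes = nframes;
	return msg;
}
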
See commit 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays with flexible-array members") Signed-off-by: Alex Tran Acked-by: Oliver Hartkopp Link: https://patch.msgid.link/20250904031709.1426895-1-alex.t.tran@gmail.com Fixes: 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays with flexible-array members") Link: https://bugzilla.kernel.org/show_bug.cgi?id=217783 Signed-off-by: Marc Kleine-Budde --- Documentation/networking/can.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst index bc1b585355f7ad..7650c4b5be5f18 100644 --- a/Documentation/networking/can.rst +++ b/Documentation/networking/can.rst @@ -742,7 +742,7 @@ The broadcast manager sends responses to user space in the same form: struct timeval ival1, ival2; /* count and subsequent interval */ canid_t can_id; /* unique can_id for task */ __u32 nframes; /* number of can_frames following */ - struct can_frame frames[0]; + struct can_frame frames[]; }; The aligned payload 'frames' uses the same basic CAN frame structure defined From 7e5969a4d3e794993c9ca8d4026cf31a34b32b30 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Mon, 8 Sep 2025 10:54:51 -0500 Subject: [PATCH 1280/3206] dt-bindings: trivial-devices: Add sht2x sensors Add sensirion,sht2x trivial sensors. Signed-off-by: Kurt Borja Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250908-sht2x-v4-3-bc15f68af7de@gmail.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/trivial-devices.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index 0e6ba6e12a639a..edaba5cd8434fa 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -372,6 +372,9 @@ properties: # Sensirion low power multi-pixel gas sensor with I2C interface - sensirion,sgpc3 # Sensirion temperature & humidity sensor with I2C interface + - sensirion,sht20 + - sensirion,sht21 + - sensirion,sht25 - sensirion,sht4x # Sensortek 3 axis accelerometer - sensortek,stk8312 From 393de14673d60384f8f014b75fe679a69c9110e9 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Mon, 8 Sep 2025 10:54:52 -0500 Subject: [PATCH 1281/3206] hwmon: (sht21) Add devicetree support Add DT support for sht2x chips. Signed-off-by: Kurt Borja Link: https://lore.kernel.org/r/20250908-sht2x-v4-4-bc15f68af7de@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/sht21.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/sht21.c b/drivers/hwmon/sht21.c index 97d71e3361e9d7..627d35070a420a 100644 --- a/drivers/hwmon/sht21.c +++ b/drivers/hwmon/sht21.c @@ -282,8 +282,19 @@ static const struct i2c_device_id sht21_id[] = { }; MODULE_DEVICE_TABLE(i2c, sht21_id); +static const struct of_device_id sht21_of_match[] = { + { .compatible = "sensirion,sht20" }, + { .compatible = "sensirion,sht21" }, + { .compatible = "sensirion,sht25" }, + { } +}; +MODULE_DEVICE_TABLE(of, sht21_of_match); + static struct i2c_driver sht21_driver = { - .driver.name = "sht21", + .driver = { + .name = "sht21", + .of_match_table = sht21_of_match, + }, .probe = sht21_probe, .id_table = sht21_id, }; From 80038a758b7fc0cdb6987532cbbf3f75b13e0826 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Tue, 9 Sep 2025 10:02:49 +0200 Subject: [PATCH 1282/3206] hwmon: sy7636a: add alias Add module alias to have it autoloaded. 
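For background, the autoload path this enables: when the platform device named "sy7636a-temperature" is registered (here, by an MFD cell), the driver core emits a MODALIAS=platform:sy7636a-temperature uevent, and MODULE_ALIAS() records that same string in modules.alias so userspace modprobe resolves it to this module. The added line is the whole mechanism:

/* Matches the "sy7636a-temperature" platform device name, so the
 * MODALIAS uevent maps straight to this module. */
MODULE_ALIAS("platform:sy7636a-temperature");
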
Signed-off-by: Andreas Kemnade Link: https://lore.kernel.org/r/20250909080249.30656-1-andreas@kemnade.info Signed-off-by: Guenter Roeck --- drivers/hwmon/sy7636a-hwmon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hwmon/sy7636a-hwmon.c b/drivers/hwmon/sy7636a-hwmon.c index ed110884786b48..a12fc0ce70e76e 100644 --- a/drivers/hwmon/sy7636a-hwmon.c +++ b/drivers/hwmon/sy7636a-hwmon.c @@ -104,3 +104,4 @@ module_platform_driver(sy7636a_sensor_driver); MODULE_DESCRIPTION("SY7636A sensor driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:sy7636a-temperature"); From f09c1d63e895e1b45248a75656a41df2e8102874 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 3 Sep 2025 16:04:46 +0200 Subject: [PATCH 1283/3206] irqchip/msi-lib: Honor the MSI_FLAG_PCI_MSI_MASK_PARENT flag For systems that implement interrupt masking at the interrupt controller level, the MSI library offers MSI_FLAG_PCI_MSI_MASK_PARENT. It indicates that it isn't enough to only unmask the interrupt at the PCI device level, but that the interrupt controller must also be involved. However, the way this is currently done is less than optimal, as the masking/unmasking is done on both sides, always. It would be far cheaper to unmask both at the start of times, and then only deal with the interrupt controller mask, which is cheaper than a round-trip to the PCI endpoint. Now that the PCI/MSI layer implements irq_startup() and irq_shutdown() callbacks, which [un]mask at the PCI level and honor the request to [un]mask the parent, this can be trivially done. Overwrite the irq_mask/unmask() callbacks of the device domain interrupt chip with irq_[un]mask_parent() when the parent domain asks for it. [ tglx: Adopted to the PCI/MSI changes ] Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Acked-by: Marc Zyngier Link: https://lore.kernel.org/all/20250903135433.380783272@linutronix.de --- drivers/irqchip/irq-msi-lib.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/irqchip/irq-msi-lib.c b/drivers/irqchip/irq-msi-lib.c index 908944009c210b..d5eefc3d72155c 100644 --- a/drivers/irqchip/irq-msi-lib.c +++ b/drivers/irqchip/irq-msi-lib.c @@ -112,6 +112,20 @@ bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain, */ if (!chip->irq_set_affinity && !(info->flags & MSI_FLAG_NO_AFFINITY)) chip->irq_set_affinity = msi_domain_set_affinity; + + /* + * If the parent domain insists on being in charge of masking, obey + * blindly. The interrupt is un-masked at the PCI level on startup + * and masked on shutdown to prevent rogue interrupts after the + * driver freed the interrupt. Not masking it at the PCI level + * speeds up operation for disable/enable_irq() as it avoids + * getting all the way out to the PCI device. + */ + if (info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT) { + chip->irq_mask = irq_chip_mask_parent; + chip->irq_unmask = irq_chip_unmask_parent; + } + return true; } EXPORT_SYMBOL_GPL(msi_lib_init_dev_msi_info); From ba9d484ed3578705fcd24795b800e8e4364afb8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2025 16:04:48 +0200 Subject: [PATCH 1284/3206] PCI/MSI: Remove the conditional parent [un]mask logic Now that msi_lib_init_dev_msi_info() overwrites the irq_[un]mask() callbacks when the MSI_FLAG_PCI_MSI_MASK_PARENT flag is set by the parent domain, the conditional [un]mask logic is obsolete. Remove it. 
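For context, the parent helpers that msi-lib now installs are plain one-level delegations up the irqdomain hierarchy, which is why they avoid the round-trip to the PCI endpoint. A simplified sketch (the in-tree helpers live in kernel/irq/chip.c; names are suffixed here to mark them as illustrations):

/* Forward mask/unmask to the parent domain's irqchip, e.g. the MSI
 * controller, instead of touching the PCI device's mask bits. */
static void irq_chip_mask_parent_sketch(struct irq_data *data)
{
	data = data->parent_data;
	data->chip->irq_mask(data);
}

static void irq_chip_unmask_parent_sketch(struct irq_data *data)
{
	data = data->parent_data;
	data->chip->irq_unmask(data);
}
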
Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Acked-by: Bjorn Helgaas Acked-by: Marc Zyngier Link: https://lore.kernel.org/all/20250903135433.444329373@linutronix.de --- drivers/pci/msi/irqdomain.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index b11b7f63f0d6fc..dfb61f152702bd 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -170,22 +170,6 @@ static unsigned int cond_startup_parent(struct irq_data *data) return 0; } -static __always_inline void cond_mask_parent(struct irq_data *data) -{ - struct msi_domain_info *info = data->domain->host_data; - - if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) - irq_chip_mask_parent(data); -} - -static __always_inline void cond_unmask_parent(struct irq_data *data) -{ - struct msi_domain_info *info = data->domain->host_data; - - if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) - irq_chip_unmask_parent(data); -} - static void pci_irq_shutdown_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); @@ -208,14 +192,12 @@ static void pci_irq_mask_msi(struct irq_data *data) struct msi_desc *desc = irq_data_get_msi_desc(data); pci_msi_mask(desc, BIT(data->irq - desc->irq)); - cond_mask_parent(data); } static void pci_irq_unmask_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); - cond_unmask_parent(data); pci_msi_unmask(desc, BIT(data->irq - desc->irq)); } @@ -268,12 +250,10 @@ static unsigned int pci_irq_startup_msix(struct irq_data *data) static void pci_irq_mask_msix(struct irq_data *data) { pci_msix_mask(irq_data_get_msi_desc(data)); - cond_mask_parent(data); } static void pci_irq_unmask_msix(struct irq_data *data) { - cond_unmask_parent(data); pci_msix_unmask(irq_data_get_msi_desc(data)); } From f5507aefc9114ced49d1ee527f63ea12ff5d7751 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 4 Sep 2025 13:40:29 +0200 Subject: [PATCH 1285/3206] s390/debug: Replace kmalloc() + copy_from_user() with memdup_user_nul() Replace kmalloc() followed by copy_from_user() with memdup_user_nul() to improve and simplify debug_get_user_string(). Remove the manual NUL-termination. No functional changes intended. Signed-off-by: Thorsten Blum Reviewed-by: Niklas Schnelle Signed-off-by: Alexander Gordeev --- arch/s390/kernel/debug.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index c62100dc62c8df..6a26f202441d3a 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -1416,18 +1416,12 @@ static inline char *debug_get_user_string(const char __user *user_buf, { char *buffer; - buffer = kmalloc(user_len + 1, GFP_KERNEL); - if (!buffer) - return ERR_PTR(-ENOMEM); - if (copy_from_user(buffer, user_buf, user_len) != 0) { - kfree(buffer); - return ERR_PTR(-EFAULT); - } + buffer = memdup_user_nul(user_buf, user_len); + if (IS_ERR(buffer)) + return buffer; /* got the string, now strip linefeed. */ if (buffer[user_len - 1] == '\n') buffer[user_len - 1] = 0; - else - buffer[user_len] = 0; return buffer; } From 5450abb0dea4f9fb432dea2ca92ea7a9bd25650b Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 5 Sep 2025 13:02:23 +0200 Subject: [PATCH 1286/3206] s390/hmcdrv: Replace kmalloc() + copy_from_user() with memdup_user_nul() Replace kmalloc() followed by copy_from_user() with memdup_user_nul() to improve and simplify hmcdrv_dev_write(). Remove the manual NUL-termination. 
No functional changes intended. Signed-off-by: Thorsten Blum Signed-off-by: Alexander Gordeev --- drivers/s390/char/hmcdrv_dev.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/s390/char/hmcdrv_dev.c b/drivers/s390/char/hmcdrv_dev.c index e069dd6858995e..b26fcf6849f2ad 100644 --- a/drivers/s390/char/hmcdrv_dev.c +++ b/drivers/s390/char/hmcdrv_dev.c @@ -244,24 +244,17 @@ static ssize_t hmcdrv_dev_write(struct file *fp, const char __user *ubuf, size_t len, loff_t *pos) { ssize_t retlen; + void *pdata; pr_debug("writing file '/dev/%pD' at pos. %lld with length %zd\n", fp, (long long) *pos, len); if (!fp->private_data) { /* first expect a cmd write */ - fp->private_data = kmalloc(len + 1, GFP_KERNEL); - - if (!fp->private_data) - return -ENOMEM; - - if (!copy_from_user(fp->private_data, ubuf, len)) { - ((char *)fp->private_data)[len] = '\0'; - return len; - } - - kfree(fp->private_data); - fp->private_data = NULL; - return -EFAULT; + pdata = memdup_user_nul(ubuf, len); + if (IS_ERR(pdata)) + return PTR_ERR(pdata); + fp->private_data = pdata; + return len; } retlen = hmcdrv_dev_transfer((char *) fp->private_data, From 29e0b471ccbd674d20d4bbddea1a51e7105212c5 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Sat, 6 Sep 2025 00:29:55 +0530 Subject: [PATCH 1287/3206] spi: cadence-quadspi: Flush posted register writes before INDAC access cqspi_indirect_read_execute() and cqspi_indirect_write_execute() first set the enable bit in the APB region and then start reading/writing to the AHB region. On TI K3 SoCs these regions lie on different endpoints. This means that the order of the two operations is not guaranteed, and they might be reordered at the interconnect level. It is possible for the AHB write to be executed before the APB write to enable the indirect controller, causing the transaction to be invalid and the write to error out. Read back the APB region write before accessing the AHB region to make sure the write got flushed and the race condition is eliminated. Fixes: 140623410536 ("mtd: spi-nor: Add driver for Cadence Quad SPI Flash Controller") CC: stable@vger.kernel.org Reviewed-by: Pratyush Yadav Signed-off-by: Pratyush Yadav Signed-off-by: Santhosh Kumar K Message-ID: <20250905185958.3575037-2-s-k6@ti.com> Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 9bf823348cd30d..eaf9a0f522d50c 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -764,6 +764,7 @@ static int cqspi_indirect_read_execute(struct cqspi_flash_pdata *f_pdata, reinit_completion(&cqspi->transfer_complete); writel(CQSPI_REG_INDIRECTRD_START_MASK, reg_base + CQSPI_REG_INDIRECTRD); + readl(reg_base + CQSPI_REG_INDIRECTRD); /* Flush posted write. */ while (remaining > 0) { if (use_irq && @@ -1090,6 +1091,8 @@ static int cqspi_indirect_write_execute(struct cqspi_flash_pdata *f_pdata, reinit_completion(&cqspi->transfer_complete); writel(CQSPI_REG_INDIRECTWR_START_MASK, reg_base + CQSPI_REG_INDIRECTWR); + readl(reg_base + CQSPI_REG_INDIRECTWR); /* Flush posted write. 
*/ + + /* * As per 66AK2G02 TRM SPRUHY8F section 11.15.5.3 Indirect Access * Controller programming sequence, couple of cycles of From 1ad55767e77a853c98752ed1e33b68049a243bd7 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Sat, 6 Sep 2025 00:29:56 +0530 Subject: [PATCH 1288/3206] spi: cadence-quadspi: Flush posted register writes before DAC access cqspi_read_setup() and cqspi_write_setup() program the address width as the last step in the setup. This is likely to be immediately followed by a DAC region read/write. On TI K3 SoCs the DAC region is on a different endpoint from the register region. This means that the order of the two operations is not guaranteed, and they might be reordered at the interconnect level. It is possible that the DAC read/write goes through before the address width update does. In this situation, if the previous command used a different address width, the OSPI command is sent with the wrong number of address bytes, resulting in an invalid command and undefined behavior. Read back the size register to make sure the write gets flushed before accessing the DAC region. Fixes: 140623410536 ("mtd: spi-nor: Add driver for Cadence Quad SPI Flash Controller") CC: stable@vger.kernel.org Reviewed-by: Pratyush Yadav Signed-off-by: Pratyush Yadav Signed-off-by: Santhosh Kumar K Message-ID: <20250905185958.3575037-3-s-k6@ti.com> Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index eaf9a0f522d50c..447a32a08a934b 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -719,6 +719,7 @@ static int cqspi_read_setup(struct cqspi_flash_pdata *f_pdata, reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK; reg |= (op->addr.nbytes - 1); writel(reg, reg_base + CQSPI_REG_SIZE); + readl(reg_base + CQSPI_REG_SIZE); /* Flush posted write. */ return 0; } @@ -1063,6 +1064,7 @@ static int cqspi_write_setup(struct cqspi_flash_pdata *f_pdata, reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK; reg |= (op->addr.nbytes - 1); writel(reg, reg_base + CQSPI_REG_SIZE); + readl(reg_base + CQSPI_REG_SIZE); /* Flush posted write. */ return 0; } From 858d4d9e0a9d6b64160ef3c824f428c9742172c4 Mon Sep 17 00:00:00 2001 From: Santhosh Kumar K Date: Sat, 6 Sep 2025 00:29:57 +0530 Subject: [PATCH 1289/3206] spi: cadence-quadspi: Fix cqspi_setup_flash() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'max_cs' variable stores the largest chip select number. It should only be updated when the current 'cs' is greater than the existing 'max_cs'. So, fix the condition accordingly. Also, return failure if there are no flash devices declared. 
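To see the failure mode the fix addresses, here is a standalone sketch contrasting the two update rules, assuming a controller with four chip selects and flash nodes at cs 0 and cs 2 (hypothetical values, not the driver code):

#include <stdio.h>

int main(void)
{
	const int num_chipselect = 4;
	const int cs_list[] = { 0, 2 };
	int old_max = num_chipselect - 1;	/* old code: starts high, only shrinks */
	int new_max = -1;			/* fixed code: starts low, tracks the maximum */

	for (unsigned int i = 0; i < sizeof(cs_list) / sizeof(cs_list[0]); i++) {
		if (cs_list[i] < old_max)	/* buggy condition */
			old_max = cs_list[i];
		if (cs_list[i] > new_max)	/* fixed behaviour, as max_t() does */
			new_max = cs_list[i];
	}

	/* Prints "old=1 new=3": the old rule loses the flash at cs 2. */
	printf("old=%d new=%d\n", old_max + 1, new_max + 1);
	return 0;
}
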
Fixes: 0f3841a5e115 ("spi: cadence-qspi: report correct number of chip-select") CC: stable@vger.kernel.org Reviewed-by: Pratyush Yadav Reviewed-by: Théo Lebrun Signed-off-by: Santhosh Kumar K Message-ID: <20250905185958.3575037-4-s-k6@ti.com> Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 447a32a08a934b..6627a3059ea359 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -1722,12 +1722,10 @@ static const struct spi_controller_mem_caps cqspi_mem_caps = { static int cqspi_setup_flash(struct cqspi_st *cqspi) { - unsigned int max_cs = cqspi->num_chipselect - 1; struct platform_device *pdev = cqspi->pdev; struct device *dev = &pdev->dev; struct cqspi_flash_pdata *f_pdata; - unsigned int cs; - int ret; + int ret, cs, max_cs = -1; /* Get flash device data */ for_each_available_child_of_node_scoped(dev->of_node, np) { @@ -1740,10 +1738,10 @@ static int cqspi_setup_flash(struct cqspi_st *cqspi) if (cs >= cqspi->num_chipselect) { dev_err(dev, "Chip select %d out of range.\n", cs); return -EINVAL; - } else if (cs < max_cs) { - max_cs = cs; } + max_cs = max_t(int, cs, max_cs); + f_pdata = &cqspi->f_pdata[cs]; f_pdata->cqspi = cqspi; f_pdata->cs = cs; @@ -1753,6 +1751,11 @@ static int cqspi_setup_flash(struct cqspi_st *cqspi) return ret; } + if (max_cs < 0) { + dev_err(dev, "No flash device declared\n"); + return -ENODEV; + } + cqspi->num_chipselect = max_cs + 1; return 0; } From d9e33b38c89f4cf8c32b8481dbcf3a6cdbba4595 Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Sat, 6 Sep 2025 00:29:58 +0530 Subject: [PATCH 1290/3206] spi: cadence-quadspi: Use BIT() macros where possible Convert a few open-coded bit shifts to the BIT() macro for better readability. No functional changes intended. Signed-off-by: Vignesh Raghavendra Signed-off-by: Santhosh Kumar K Message-ID: <20250905185958.3575037-5-s-k6@ti.com> Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 6627a3059ea359..af253b86f1ab26 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -335,7 +335,7 @@ static bool cqspi_is_idle(struct cqspi_st *cqspi) { u32 reg = readl(cqspi->iobase + CQSPI_REG_CONFIG); - return reg & (1UL << CQSPI_REG_CONFIG_IDLE_LSB); + return reg & BIT(CQSPI_REG_CONFIG_IDLE_LSB); } static u32 cqspi_get_rd_sram_level(struct cqspi_st *cqspi) @@ -571,7 +571,7 @@ static int cqspi_command_read(struct cqspi_flash_pdata *f_pdata, reg |= (dummy_clk & CQSPI_REG_CMDCTRL_DUMMY_MASK) << CQSPI_REG_CMDCTRL_DUMMY_LSB; - reg |= (0x1 << CQSPI_REG_CMDCTRL_RD_EN_LSB); + reg |= BIT(CQSPI_REG_CMDCTRL_RD_EN_LSB); /* 0 means 1 byte. 
reg |= (((n_rx - 1) & CQSPI_REG_CMDCTRL_RD_BYTES_MASK) @@ -579,7 +579,7 @@ /* setup ADDR BIT field */ if (op->addr.nbytes) { - reg |= (0x1 << CQSPI_REG_CMDCTRL_ADDR_EN_LSB); + reg |= BIT(CQSPI_REG_CMDCTRL_ADDR_EN_LSB); reg |= ((op->addr.nbytes - 1) & CQSPI_REG_CMDCTRL_ADD_BYTES_MASK) << CQSPI_REG_CMDCTRL_ADD_BYTES_LSB; @@ -646,7 +646,7 @@ static int cqspi_command_write(struct cqspi_flash_pdata *f_pdata, reg = opcode << CQSPI_REG_CMDCTRL_OPCODE_LSB; if (op->addr.nbytes) { - reg |= (0x1 << CQSPI_REG_CMDCTRL_ADDR_EN_LSB); + reg |= BIT(CQSPI_REG_CMDCTRL_ADDR_EN_LSB); reg |= ((op->addr.nbytes - 1) & CQSPI_REG_CMDCTRL_ADD_BYTES_MASK) << CQSPI_REG_CMDCTRL_ADD_BYTES_LSB; @@ -655,7 +655,7 @@ } if (n_tx) { - reg |= (0x1 << CQSPI_REG_CMDCTRL_WR_EN_LSB); + reg |= BIT(CQSPI_REG_CMDCTRL_WR_EN_LSB); reg |= ((n_tx - 1) & CQSPI_REG_CMDCTRL_WR_BYTES_MASK) << CQSPI_REG_CMDCTRL_WR_BYTES_LSB; data = 0; @@ -1191,7 +1191,7 @@ static void cqspi_chipselect(struct cqspi_flash_pdata *f_pdata) * CS2 to 4b'1011 * CS3 to 4b'0111 */ - chip_select = 0xF & ~(1 << chip_select); + chip_select = 0xF & ~BIT(chip_select); } reg &= ~(CQSPI_REG_CONFIG_CHIPSELECT_MASK @@ -1277,9 +1277,9 @@ static void cqspi_readdata_capture(struct cqspi_st *cqspi, reg = readl(reg_base + CQSPI_REG_READCAPTURE); if (bypass) - reg |= (1 << CQSPI_REG_READCAPTURE_BYPASS_LSB); + reg |= BIT(CQSPI_REG_READCAPTURE_BYPASS_LSB); else - reg &= ~(1 << CQSPI_REG_READCAPTURE_BYPASS_LSB); + reg &= ~BIT(CQSPI_REG_READCAPTURE_BYPASS_LSB); reg &= ~(CQSPI_REG_READCAPTURE_DELAY_MASK << CQSPI_REG_READCAPTURE_DELAY_LSB); From 5f1af203ef964e7f7bf9d32716dfa5f332cc6f09 Mon Sep 17 00:00:00 2001 From: Mohammad Rafi Shaik Date: Mon, 8 Sep 2025 11:06:29 +0530 Subject: [PATCH 1291/3206] ASoC: qcom: audioreach: Fix lpaif_type configuration for the I2S interface Fix missing lpaif_type configuration for the I2S interface. The proper lpaif interface type is required to allow the DSP to vote for the appropriate clock setting for the I2S interface. Fixes: 25ab80db6b133 ("ASoC: qdsp6: audioreach: add module configuration command helpers") Cc: stable@vger.kernel.org Reviewed-by: Srinivas Kandagatla Signed-off-by: Mohammad Rafi Shaik Message-ID: <20250908053631.70978-2-mohammad.rafi.shaik@oss.qualcomm.com> Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/audioreach.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/qcom/qdsp6/audioreach.c b/sound/soc/qcom/qdsp6/audioreach.c index 4ebaaf736fb98a..3f5eed5afce55e 100644 --- a/sound/soc/qcom/qdsp6/audioreach.c +++ b/sound/soc/qcom/qdsp6/audioreach.c @@ -971,6 +971,7 @@ static int audioreach_i2s_set_media_format(struct q6apm_graph *graph, param_data->param_id = PARAM_ID_I2S_INTF_CFG; param_data->param_size = ic_sz - APM_MODULE_PARAM_DATA_SIZE; + intf_cfg->cfg.lpaif_type = module->hw_interface_type; intf_cfg->cfg.intf_idx = module->hw_interface_idx; intf_cfg->cfg.sd_line_idx = module->sd_line_idx; From 33b55b94bca904ca25a9585e3cd43d15f0467969 Mon Sep 17 00:00:00 2001 From: Mohammad Rafi Shaik Date: Mon, 8 Sep 2025 11:06:30 +0530 Subject: [PATCH 1292/3206] ASoC: qcom: q6apm-lpass-dais: Fix missing set_fmt DAI op for I2S The q6i2s_set_fmt() function was defined but never linked into the I2S DAI operations, resulting in DAI format settings being ignored during stream setup. This change fixes the issue by properly linking the .set_fmt handler within the DAI ops. 
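Why the format was silently dropped: the ASoC core forwards a format request to the DAI only when the op is populated. A simplified sketch of the core helper's behaviour (illustrative, not the exact soc-dai.c source):

/* With no .set_fmt in the ops table there is nothing to call, and the
 * -ENOTSUPP result is tolerated by most callers, so the requested
 * format never reaches the hardware. */
static int snd_soc_dai_set_fmt_sketch(struct snd_soc_dai *dai, unsigned int fmt)
{
	if (dai->driver->ops && dai->driver->ops->set_fmt)
		return dai->driver->ops->set_fmt(dai, fmt);

	return -ENOTSUPP;
}
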
Fixes: 30ad723b93ade ("ASoC: qdsp6: audioreach: add q6apm lpass dai support") Cc: stable@vger.kernel.org Reviewed-by: Srinivas Kandagatla Signed-off-by: Mohammad Rafi Shaik Message-ID: <20250908053631.70978-3-mohammad.rafi.shaik@oss.qualcomm.com> Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/q6apm-lpass-dais.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c index a0d90462fd6a38..36c034819b38ff 100644 --- a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c +++ b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c @@ -260,6 +260,7 @@ static const struct snd_soc_dai_ops q6i2s_ops = { .shutdown = q6apm_lpass_dai_shutdown, .set_channel_map = q6dma_set_channel_map, .hw_params = q6dma_hw_params, + .set_fmt = q6i2s_set_fmt, }; static const struct snd_soc_dai_ops q6hdmi_ops = { From 596e8ba2faf0d2beb9bb68801622fa6461918c1d Mon Sep 17 00:00:00 2001 From: Mohammad Rafi Shaik Date: Mon, 8 Sep 2025 11:06:31 +0530 Subject: [PATCH 1293/3206] ASoC: qcom: sc8280xp: Enable DAI format configuration for MI2S interfaces Add support for configuring the DAI format on MI2S interfaces. This enhancement allows setting the appropriate bit clock and frame clock polarity, ensuring correct audio data transmission over MI2S. Reviewed-by: Srinivas Kandagatla Signed-off-by: Mohammad Rafi Shaik Rule: add Link: https://lore.kernel.org/stable/20250908053631.70978-4-mohammad.rafi.shaik%40oss.qualcomm.com Message-ID: <20250908053631.70978-4-mohammad.rafi.shaik@oss.qualcomm.com> Signed-off-by: Mark Brown --- sound/soc/qcom/sc8280xp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/soc/qcom/sc8280xp.c b/sound/soc/qcom/sc8280xp.c index 73f9f82c4e2581..3067b95bcdbbba 100644 --- a/sound/soc/qcom/sc8280xp.c +++ b/sound/soc/qcom/sc8280xp.c @@ -32,6 +32,10 @@ static int sc8280xp_snd_init(struct snd_soc_pcm_runtime *rtd) int dp_pcm_id = 0; switch (cpu_dai->id) { + case PRIMARY_MI2S_RX...QUATERNARY_MI2S_TX: + case QUINARY_MI2S_RX...QUINARY_MI2S_TX: + snd_soc_dai_set_fmt(cpu_dai, SND_SOC_DAIFMT_BP_FP); + break; case WSA_CODEC_DMA_RX_0: case WSA_CODEC_DMA_RX_1: /* From f5c32370dba668c171c73684f489a3ea0b9503c5 Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Thu, 4 Sep 2025 15:13:51 -0400 Subject: [PATCH 1294/3206] drm/amd/display: Disable DPCD Probe Quirk Disable the DPCD probe quirk for native AUX. 
Signed-off-by: Fangzhi Zuo Reviewed-by: Imre Deak Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4500 Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250904191351.746707-1-Jerry.Zuo@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit c5f4fb40584ee591da9fa090c6f265d11cbb1acf) Cc: stable@vger.kernel.org # 6.16.y: 5281cbe0b55a Cc: stable@vger.kernel.org # 6.16.y: 0b4aa85e8981 Cc: stable@vger.kernel.org # 6.16.y: b87ed522b364 Cc: stable@vger.kernel.org # 6.16.y --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 7187d5aedf0a50..77a9d2c7d31856 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -809,6 +809,7 @@ void amdgpu_dm_initialize_dp_connector(struct amdgpu_display_manager *dm, drm_dp_aux_init(&aconnector->dm_dp_aux.aux); drm_dp_cec_register_connector(&aconnector->dm_dp_aux.aux, &aconnector->base); + drm_dp_dpcd_set_probe(&aconnector->dm_dp_aux.aux, false); if (aconnector->base.connector_type == DRM_MODE_CONNECTOR_eDP) return; From f85981327a90c51e76f60e073cb6648b2f167226 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Tue, 9 Sep 2025 20:47:04 +0800 Subject: [PATCH 1295/3206] selftests/bpf: Fix incorrect array size calculation The loop in bench_sockmap_prog_destroy() has two issues: 1. Using 'sizeof(ctx.fds)' as the loop bound results in the number of bytes, not the number of file descriptors, causing the loop to iterate far more times than intended. 2. The condition 'ctx.fds[0] > 0' incorrectly checks only the first fd for all iterations, potentially leaving file descriptors unclosed. Change it to 'ctx.fds[i] > 0' to check each fd properly. These fixes ensure correct cleanup of all file descriptors when the benchmark exits. Reported-by: Dan Carpenter Signed-off-by: Jiayuan Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250909124721.191555-1-jiayuan.chen@linux.dev Closes: https://lore.kernel.org/bpf/aLqfWuRR9R_KTe5e@stanley.mountain/ --- tools/testing/selftests/bpf/benchs/bench_sockmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/benchs/bench_sockmap.c b/tools/testing/selftests/bpf/benchs/bench_sockmap.c index 8ebf563a67a2b5..cfc072aa7fff7e 100644 --- a/tools/testing/selftests/bpf/benchs/bench_sockmap.c +++ b/tools/testing/selftests/bpf/benchs/bench_sockmap.c @@ -10,6 +10,7 @@ #include #include "bench.h" #include "bench_sockmap_prog.skel.h" +#include "bpf_util.h" #define FILE_SIZE (128 * 1024) #define DATA_REPEAT_SIZE 10 @@ -124,8 +125,8 @@ static void bench_sockmap_prog_destroy(void) { int i; - for (i = 0; i < sizeof(ctx.fds); i++) { - if (ctx.fds[0] > 0) + for (i = 0; i < ARRAY_SIZE(ctx.fds); i++) { + if (ctx.fds[i] > 0) close(ctx.fds[i]); } From 70f0b051f82d0234ade2f6753f72a2610048db3b Mon Sep 17 00:00:00 2001 From: Ovidiu Bunea Date: Mon, 25 Aug 2025 14:45:33 -0400 Subject: [PATCH 1296/3206] drm/amd/display: Correct sequences and delays for DCN35 PG & RCG [why] The current PG & RCG programming in driver has some gaps and incorrect sequences. [how] Added delays after ungating clocks to allow ramp up, increased polling to allow more time for power up, and removed the incorrect sequences. 
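The [how] above boils down to two defensive patterns: wait for a clock to ramp after ungating, and poll a status register with a generous timeout after requesting a power-state change. A generic, self-contained sketch with illustrative names (the driver itself uses the DC REG_UPDATE/REG_WAIT macros, as the diff below shows):

#include <linux/io.h>
#include <linux/delay.h>
#include <linux/errno.h>

/* Request the power-state change, then poll the status field rather
 * than assuming the transition completes instantly. */
static int pg_set_and_wait(void __iomem *cfg, void __iomem *status,
			   u32 request, u32 expected, u32 mask)
{
	int i;

	writel(request, cfg);
	for (i = 0; i < 10000; i++) {
		if ((readl(status) & mask) == expected)
			return 0;
		udelay(1);
	}

	return -ETIMEDOUT;
}
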
Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Charlene Liu Signed-off-by: Ovidiu Bunea Signed-off-by: Wayne Lin Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 1bde5584e297921f45911ae874b0175dce5ed4b5) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dc.h | 1 + .../amd/display/dc/dccg/dcn35/dcn35_dccg.c | 74 +++++------ .../amd/display/dc/hwss/dcn35/dcn35_hwseq.c | 115 +++--------------- .../amd/display/dc/hwss/dcn35/dcn35_init.c | 3 - .../amd/display/dc/hwss/dcn351/dcn351_init.c | 3 - .../gpu/drm/amd/display/dc/inc/hw/pg_cntl.h | 1 + .../amd/display/dc/pg/dcn35/dcn35_pg_cntl.c | 78 +++++++----- 7 files changed, 111 insertions(+), 164 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 59c07756130d5a..f24e1da68269e4 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1145,6 +1145,7 @@ struct dc_debug_options { bool enable_hblank_borrow; bool force_subvp_df_throttle; uint32_t acpi_transition_bitmasks[MAX_PIPES]; + bool enable_pg_cntl_debug_logs; }; diff --git a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c index 58c84f555c0fb8..0ce9489ac6b728 100644 --- a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c +++ b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c @@ -133,30 +133,34 @@ enum dsc_clk_source { }; -static void dccg35_set_dsc_clk_rcg(struct dccg *dccg, int inst, bool enable) +static void dccg35_set_dsc_clk_rcg(struct dccg *dccg, int inst, bool allow_rcg) { struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg); - if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dsc && enable) + if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dsc && allow_rcg) return; switch (inst) { case 0: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 1: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 2: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 3: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; default: BREAK_TO_DEBUGGER(); return; } + + /* Wait for clock to ramp */ + if (!allow_rcg) + udelay(10); } static void dccg35_set_symclk32_se_rcg( @@ -385,35 +389,34 @@ static void dccg35_set_dtbclk_p_rcg(struct dccg *dccg, int inst, bool enable) } } -static void dccg35_set_dppclk_rcg(struct dccg *dccg, - int inst, bool enable) +static void dccg35_set_dppclk_rcg(struct dccg *dccg, int inst, bool allow_rcg) { - struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg); - - if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && enable) + if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && allow_rcg) return; switch (inst) { case 0: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 1: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, allow_rcg ? 
0 : 1); break; case 2: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 3: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; default: BREAK_TO_DEBUGGER(); break; } - //DC_LOG_DEBUG("%s: inst(%d) DPPCLK rcg_disable: %d\n", __func__, inst, enable ? 0 : 1); + /* Wait for clock to ramp */ + if (!allow_rcg) + udelay(10); } static void dccg35_set_dpstreamclk_rcg( @@ -1177,32 +1180,34 @@ static void dccg35_update_dpp_dto(struct dccg *dccg, int dpp_inst, } static void dccg35_set_dppclk_root_clock_gating(struct dccg *dccg, - uint32_t dpp_inst, uint32_t enable) + uint32_t dpp_inst, uint32_t disallow_rcg) { struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg); - if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp) + if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && !disallow_rcg) return; switch (dpp_inst) { case 0: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, disallow_rcg); break; case 1: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, disallow_rcg); break; case 2: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, disallow_rcg); break; case 3: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, disallow_rcg); break; default: break; } - //DC_LOG_DEBUG("%s: dpp_inst(%d) rcg: %d\n", __func__, dpp_inst, enable); + /* Wait for clock to ramp */ + if (disallow_rcg) + udelay(10); } static void dccg35_get_pixel_rate_div( @@ -1782,8 +1787,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) //Disable DTO switch (inst) { case 0: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK0_DTO_PARAM, DSCCLK0_DTO_PHASE, 0, @@ -1791,8 +1795,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK0_EN, 1); break; case 1: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK1_DTO_PARAM, DSCCLK1_DTO_PHASE, 0, @@ -1800,8 +1803,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK1_EN, 1); break; case 2: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK2_DTO_PARAM, DSCCLK2_DTO_PHASE, 0, @@ -1809,8 +1811,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK2_EN, 1); break; case 3: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK3_DTO_PARAM, DSCCLK3_DTO_PHASE, 0, @@ -1821,6 +1822,9 @@ static void 
dccg35_enable_dscclk(struct dccg *dccg, int inst) BREAK_TO_DEBUGGER(); return; } + + /* Wait for clock to ramp */ + udelay(10); } static void dccg35_disable_dscclk(struct dccg *dccg, @@ -1864,6 +1868,9 @@ static void dccg35_disable_dscclk(struct dccg *dccg, default: return; } + + /* Wait for clock ramp */ + udelay(10); } static void dccg35_enable_symclk_se(struct dccg *dccg, uint32_t stream_enc_inst, uint32_t link_enc_inst) @@ -2349,10 +2356,7 @@ static void dccg35_disable_symclk_se_cb( void dccg35_root_gate_disable_control(struct dccg *dccg, uint32_t pipe_idx, uint32_t disable_clock_gating) { - - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dpp) { - dccg35_set_dppclk_root_clock_gating(dccg, pipe_idx, disable_clock_gating); - } + dccg35_set_dppclk_root_clock_gating(dccg, pipe_idx, disable_clock_gating); } static const struct dccg_funcs dccg35_funcs_new = { diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index a267f574b61937..764eff6a4ec6b7 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -113,6 +113,14 @@ static void enable_memory_low_power(struct dc *dc) } #endif +static void print_pg_status(struct dc *dc, const char *debug_func, const char *debug_log) +{ + if (dc->debug.enable_pg_cntl_debug_logs && dc->res_pool->pg_cntl) { + if (dc->res_pool->pg_cntl->funcs->print_pg_status) + dc->res_pool->pg_cntl->funcs->print_pg_status(dc->res_pool->pg_cntl, debug_func, debug_log); + } +} + void dcn35_set_dmu_fgcg(struct dce_hwseq *hws, bool enable) { REG_UPDATE_3(DMU_CLK_CNTL, @@ -137,6 +145,8 @@ void dcn35_init_hw(struct dc *dc) uint32_t user_level = MAX_BACKLIGHT_LEVEL; int i; + print_pg_status(dc, __func__, ": start"); + if (dc->clk_mgr && dc->clk_mgr->funcs->init_clocks) dc->clk_mgr->funcs->init_clocks(dc->clk_mgr); @@ -200,10 +210,7 @@ void dcn35_init_hw(struct dc *dc) /* we want to turn off all dp displays before doing detection */ dc->link_srv->blank_all_dp_displays(dc); -/* - if (hws->funcs.enable_power_gating_plane) - hws->funcs.enable_power_gating_plane(dc->hwseq, true); -*/ + if (res_pool->hubbub && res_pool->hubbub->funcs->dchubbub_init) res_pool->hubbub->funcs->dchubbub_init(dc->res_pool->hubbub); /* If taking control over from VBIOS, we may want to optimize our first @@ -236,6 +243,8 @@ void dcn35_init_hw(struct dc *dc) } hws->funcs.init_pipes(dc, dc->current_state); + print_pg_status(dc, __func__, ": after init_pipes"); + if (dc->res_pool->hubbub->funcs->allow_self_refresh_control && !dc->res_pool->hubbub->ctx->dc->debug.disable_stutter) dc->res_pool->hubbub->funcs->allow_self_refresh_control(dc->res_pool->hubbub, @@ -312,6 +321,7 @@ void dcn35_init_hw(struct dc *dc) if (dc->res_pool->pg_cntl->funcs->init_pg_status) dc->res_pool->pg_cntl->funcs->init_pg_status(dc->res_pool->pg_cntl); } + print_pg_status(dc, __func__, ": after init_pg_status"); } static void update_dsc_on_stream(struct pipe_ctx *pipe_ctx, bool enable) @@ -500,97 +510,6 @@ void dcn35_physymclk_root_clock_control(struct dce_hwseq *hws, unsigned int phy_ } } -void dcn35_dsc_pg_control( - struct dce_hwseq *hws, - unsigned int dsc_inst, - bool power_on) -{ - uint32_t power_gate = power_on ? 0 : 1; - uint32_t pwr_status = power_on ? 
0 : 2; - uint32_t org_ip_request_cntl = 0; - - if (hws->ctx->dc->debug.disable_dsc_power_gate) - return; - if (hws->ctx->dc->debug.ignore_pg) - return; - REG_GET(DC_IP_REQUEST_CNTL, IP_REQUEST_EN, &org_ip_request_cntl); - if (org_ip_request_cntl == 0) - REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 1); - - switch (dsc_inst) { - case 0: /* DSC0 */ - REG_UPDATE(DOMAIN16_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN16_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - case 1: /* DSC1 */ - REG_UPDATE(DOMAIN17_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN17_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - case 2: /* DSC2 */ - REG_UPDATE(DOMAIN18_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN18_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - case 3: /* DSC3 */ - REG_UPDATE(DOMAIN19_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN19_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - default: - BREAK_TO_DEBUGGER(); - break; - } - - if (org_ip_request_cntl == 0) - REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 0); -} - -void dcn35_enable_power_gating_plane(struct dce_hwseq *hws, bool enable) -{ - bool force_on = true; /* disable power gating */ - uint32_t org_ip_request_cntl = 0; - - if (hws->ctx->dc->debug.disable_hubp_power_gate) - return; - if (hws->ctx->dc->debug.ignore_pg) - return; - REG_GET(DC_IP_REQUEST_CNTL, IP_REQUEST_EN, &org_ip_request_cntl); - if (org_ip_request_cntl == 0) - REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 1); - /* DCHUBP0/1/2/3/4/5 */ - REG_UPDATE(DOMAIN0_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN2_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - /* DPP0/1/2/3/4/5 */ - REG_UPDATE(DOMAIN1_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN3_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - - force_on = true; /* disable power gating */ - if (enable && !hws->ctx->dc->debug.disable_dsc_power_gate) - force_on = false; - - /* DCS0/1/2/3/4 */ - REG_UPDATE(DOMAIN16_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN17_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN18_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN19_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - - -} - /* In headless boot cases, DIG may be turned * on which causes HW/SW discrepancies. 
* To avoid this, power down hardware on boot @@ -1453,6 +1372,8 @@ void dcn35_prepare_bandwidth( } dcn20_prepare_bandwidth(dc, context); + + print_pg_status(dc, __func__, ": after rcg and power up"); } void dcn35_optimize_bandwidth( @@ -1461,6 +1382,8 @@ void dcn35_optimize_bandwidth( { struct pg_block_update pg_update_state; + print_pg_status(dc, __func__, ": before rcg and power up"); + dcn20_optimize_bandwidth(dc, context); if (dc->hwss.calc_blocks_to_gate) { @@ -1472,6 +1395,8 @@ void dcn35_optimize_bandwidth( if (dc->hwss.root_clock_control) dc->hwss.root_clock_control(dc, &pg_update_state, false); } + + print_pg_status(dc, __func__, ": after rcg and power up"); } void dcn35_set_drr(struct pipe_ctx **pipe_ctx, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c index a3ccf805bd16ae..aefb7c47374158 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c @@ -115,7 +115,6 @@ static const struct hw_sequencer_funcs dcn35_funcs = { .exit_optimized_pwr_state = dcn21_exit_optimized_pwr_state, .update_visual_confirm_color = dcn10_update_visual_confirm_color, .apply_idle_power_optimizations = dcn35_apply_idle_power_optimizations, - .update_dsc_pg = dcn32_update_dsc_pg, .calc_blocks_to_gate = dcn35_calc_blocks_to_gate, .calc_blocks_to_ungate = dcn35_calc_blocks_to_ungate, .hw_block_power_up = dcn35_hw_block_power_up, @@ -150,7 +149,6 @@ static const struct hwseq_private_funcs dcn35_private_funcs = { .plane_atomic_disable = dcn35_plane_atomic_disable, //.plane_atomic_disable = dcn20_plane_atomic_disable,/*todo*/ //.hubp_pg_control = dcn35_hubp_pg_control, - .enable_power_gating_plane = dcn35_enable_power_gating_plane, .dpp_root_clock_control = dcn35_dpp_root_clock_control, .dpstream_root_clock_control = dcn35_dpstream_root_clock_control, .physymclk_root_clock_control = dcn35_physymclk_root_clock_control, @@ -165,7 +163,6 @@ static const struct hwseq_private_funcs dcn35_private_funcs = { .calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values, .resync_fifo_dccg_dio = dcn314_resync_fifo_dccg_dio, .is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy, - .dsc_pg_control = dcn35_dsc_pg_control, .dsc_pg_status = dcn32_dsc_pg_status, .enable_plane = dcn35_enable_plane, .wait_for_pipe_update_if_needed = dcn10_wait_for_pipe_update_if_needed, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c index 58f2be2a326b89..a580a55695c3b0 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c @@ -114,7 +114,6 @@ static const struct hw_sequencer_funcs dcn351_funcs = { .exit_optimized_pwr_state = dcn21_exit_optimized_pwr_state, .update_visual_confirm_color = dcn10_update_visual_confirm_color, .apply_idle_power_optimizations = dcn35_apply_idle_power_optimizations, - .update_dsc_pg = dcn32_update_dsc_pg, .calc_blocks_to_gate = dcn351_calc_blocks_to_gate, .calc_blocks_to_ungate = dcn351_calc_blocks_to_ungate, .hw_block_power_up = dcn351_hw_block_power_up, @@ -145,7 +144,6 @@ static const struct hwseq_private_funcs dcn351_private_funcs = { .plane_atomic_disable = dcn35_plane_atomic_disable, //.plane_atomic_disable = dcn20_plane_atomic_disable,/*todo*/ //.hubp_pg_control = dcn35_hubp_pg_control, - .enable_power_gating_plane = dcn35_enable_power_gating_plane, .dpp_root_clock_control = 
dcn35_dpp_root_clock_control, .dpstream_root_clock_control = dcn35_dpstream_root_clock_control, .physymclk_root_clock_control = dcn35_physymclk_root_clock_control, @@ -159,7 +157,6 @@ static const struct hwseq_private_funcs dcn351_private_funcs = { .setup_hpo_hw_control = dcn35_setup_hpo_hw_control, .calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values, .is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy, - .dsc_pg_control = dcn35_dsc_pg_control, .dsc_pg_status = dcn32_dsc_pg_status, .enable_plane = dcn35_enable_plane, .wait_for_pipe_update_if_needed = dcn10_wait_for_pipe_update_if_needed, diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h b/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h index 44f86cc2d1d686..227e3f8d7e5f56 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h @@ -49,6 +49,7 @@ struct pg_cntl_funcs { void (*mem_pg_control)(struct pg_cntl *pg_cntl, bool power_on); void (*dio_pg_control)(struct pg_cntl *pg_cntl, bool power_on); void (*init_pg_status)(struct pg_cntl *pg_cntl); + void (*print_pg_status)(struct pg_cntl *pg_cntl, const char *debug_func, const char *debug_log); }; #endif //__DC_PG_CNTL_H__ diff --git a/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c b/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c index af21c0a27f8657..72bd43f9bbe288 100644 --- a/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c +++ b/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c @@ -79,16 +79,12 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo uint32_t power_gate = power_on ? 0 : 1; uint32_t pwr_status = power_on ? 0 : 2; uint32_t org_ip_request_cntl = 0; - bool block_enabled; - - /*need to enable dscclk regardless DSC_PG*/ - if (pg_cntl->ctx->dc->res_pool->dccg->funcs->enable_dsc && power_on) - pg_cntl->ctx->dc->res_pool->dccg->funcs->enable_dsc( - pg_cntl->ctx->dc->res_pool->dccg, dsc_inst); + bool block_enabled = false; + bool skip_pg = pg_cntl->ctx->dc->debug.ignore_pg || + pg_cntl->ctx->dc->debug.disable_dsc_power_gate || + pg_cntl->ctx->dc->idle_optimizations_allowed; - if (pg_cntl->ctx->dc->debug.ignore_pg || - pg_cntl->ctx->dc->debug.disable_dsc_power_gate || - pg_cntl->ctx->dc->idle_optimizations_allowed) + if (skip_pg && !power_on) return; block_enabled = pg_cntl35_dsc_pg_status(pg_cntl, dsc_inst); @@ -111,7 +107,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN16_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; case 1: /* DSC1 */ REG_UPDATE(DOMAIN17_PG_CONFIG, @@ -119,7 +115,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN17_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; case 2: /* DSC2 */ REG_UPDATE(DOMAIN18_PG_CONFIG, @@ -127,7 +123,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN18_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; case 3: /* DSC3 */ REG_UPDATE(DOMAIN19_PG_CONFIG, @@ -135,7 +131,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN19_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; default: BREAK_TO_DEBUGGER(); @@ -144,12 +140,6 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo if (dsc_inst < MAX_PIPES) 
pg_cntl->pg_pipe_res_enable[PG_DSC][dsc_inst] = power_on; - - if (pg_cntl->ctx->dc->res_pool->dccg->funcs->disable_dsc && !power_on) { - /*this is to disable dscclk*/ - pg_cntl->ctx->dc->res_pool->dccg->funcs->disable_dsc( - pg_cntl->ctx->dc->res_pool->dccg, dsc_inst); - } } static bool pg_cntl35_hubp_dpp_pg_status(struct pg_cntl *pg_cntl, unsigned int hubp_dpp_inst) @@ -189,11 +179,12 @@ void pg_cntl35_hubp_dpp_pg_control(struct pg_cntl *pg_cntl, unsigned int hubp_dp uint32_t pwr_status = power_on ? 0 : 2; uint32_t org_ip_request_cntl; bool block_enabled; + bool skip_pg = pg_cntl->ctx->dc->debug.ignore_pg || + pg_cntl->ctx->dc->debug.disable_hubp_power_gate || + pg_cntl->ctx->dc->debug.disable_dpp_power_gate || + pg_cntl->ctx->dc->idle_optimizations_allowed; - if (pg_cntl->ctx->dc->debug.ignore_pg || - pg_cntl->ctx->dc->debug.disable_hubp_power_gate || - pg_cntl->ctx->dc->debug.disable_dpp_power_gate || - pg_cntl->ctx->dc->idle_optimizations_allowed) + if (skip_pg && !power_on) return; block_enabled = pg_cntl35_hubp_dpp_pg_status(pg_cntl, hubp_dpp_inst); @@ -213,22 +204,22 @@ void pg_cntl35_hubp_dpp_pg_control(struct pg_cntl *pg_cntl, unsigned int hubp_dp case 0: /* DPP0 & HUBP0 */ REG_UPDATE(DOMAIN0_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN0_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN0_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; case 1: /* DPP1 & HUBP1 */ REG_UPDATE(DOMAIN1_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN1_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN1_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; case 2: /* DPP2 & HUBP2 */ REG_UPDATE(DOMAIN2_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN2_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN2_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; case 3: /* DPP3 & HUBP3 */ REG_UPDATE(DOMAIN3_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN3_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN3_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; default: BREAK_TO_DEBUGGER(); @@ -501,6 +492,36 @@ void pg_cntl35_init_pg_status(struct pg_cntl *pg_cntl) pg_cntl->pg_res_enable[PG_DWB] = block_enabled; } +static void pg_cntl35_print_pg_status(struct pg_cntl *pg_cntl, const char *debug_func, const char *debug_log) +{ + int i = 0; + bool block_enabled = false; + + DC_LOG_DEBUG("%s: %s", debug_func, debug_log); + + DC_LOG_DEBUG("PG_CNTL status:\n"); + + block_enabled = pg_cntl35_io_clk_status(pg_cntl); + DC_LOG_DEBUG("ONO0=%d (DCCG, DIO, DCIO)\n", block_enabled ? 1 : 0); + + block_enabled = pg_cntl35_mem_status(pg_cntl); + DC_LOG_DEBUG("ONO1=%d (DCHUBBUB, DCHVM, DCHUBBUBMEM)\n", block_enabled ? 1 : 0); + + block_enabled = pg_cntl35_plane_otg_status(pg_cntl); + DC_LOG_DEBUG("ONO2=%d (MPC, OPP, OPTC, DWB)\n", block_enabled ? 1 : 0); + + block_enabled = pg_cntl35_hpo_pg_status(pg_cntl); + DC_LOG_DEBUG("ONO3=%d (HPO)\n", block_enabled ? 1 : 0); + + for (i = 0; i < pg_cntl->ctx->dc->res_pool->pipe_count; i++) { + block_enabled = pg_cntl35_hubp_dpp_pg_status(pg_cntl, i); + DC_LOG_DEBUG("ONO%d=%d (DCHUBP%d, DPP%d)\n", 4 + i * 2, block_enabled ? 1 : 0, i, i); + + block_enabled = pg_cntl35_dsc_pg_status(pg_cntl, i); + DC_LOG_DEBUG("ONO%d=%d (DSC%d)\n", 5 + i * 2, block_enabled ? 
1 : 0, i); + } +} + static const struct pg_cntl_funcs pg_cntl35_funcs = { .init_pg_status = pg_cntl35_init_pg_status, .dsc_pg_control = pg_cntl35_dsc_pg_control, @@ -511,7 +532,8 @@ static const struct pg_cntl_funcs pg_cntl35_funcs = { .mpcc_pg_control = pg_cntl35_mpcc_pg_control, .opp_pg_control = pg_cntl35_opp_pg_control, .optc_pg_control = pg_cntl35_optc_pg_control, - .dwb_pg_control = pg_cntl35_dwb_pg_control + .dwb_pg_control = pg_cntl35_dwb_pg_control, + .print_pg_status = pg_cntl35_print_pg_status }; struct pg_cntl *pg_cntl35_create( From 60f71f0db7b12f303789ef59949e38ee5838ee8b Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Fri, 5 Sep 2025 10:36:27 -0500 Subject: [PATCH 1297/3206] drm/amd/display: Drop dm_prepare_suspend() and dm_complete() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] dm_prepare_suspend() was added in commit 50e0bae34fa6b ("drm/amd/display: Add and use new dm_prepare_suspend() callback") to allow display to turn off earlier in the suspend sequence. This caused a regression where HDMI audio sometimes didn't work properly after resume unless audio was playing during suspend. [How] Drop the dm_prepare_suspend() callback. All code in it will still run during dm_suspend(). Also drop the unnecessary dm_complete() callback. dm_complete() was used for failed prepare and also for any case of successful resume. The code in it already runs in dm_resume(). This change will increase the time that the display stays turned on during the suspend sequence. The compositor can turn it off sooner if desired. Cc: Harry Wentland Reported-by: Przemysław Kopa Closes: https://lore.kernel.org/amd-gfx/1cea0d56-7739-4ad9-bf8e-c9330faea2bb@kernel.org/T/#m383d9c08397043a271b36c32b64bb80e524e4b0f Reported-by: Kalvin Closes: https://github.com/alsa-project/alsa-lib/issues/465 Closes: https://gitlab.freedesktop.org/pipewire/pipewire/-/issues/4809 Fixes: 50e0bae34fa6b ("drm/amd/display: Add and use new dm_prepare_suspend() callback") Signed-off-by: Mario Limonciello Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 2fd653b9bb5aacec5d4c421ab290905898fe85a2) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7808a647a306c2..352b3dcd0e0e14 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3127,25 +3127,6 @@ static void dm_destroy_cached_state(struct amdgpu_device *adev) dm->cached_state = NULL; } -static void dm_complete(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - dm_destroy_cached_state(adev); -} - -static int dm_prepare_suspend(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - if (amdgpu_in_reset(adev)) - return 0; - - WARN_ON(adev->dm.cached_state); - - return dm_cache_state(adev); -} - static int dm_suspend(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; @@ -3571,10 +3552,8 @@ static const struct amd_ip_funcs amdgpu_dm_funcs = { .early_fini = amdgpu_dm_early_fini, .hw_init = dm_hw_init, .hw_fini = dm_hw_fini, - .prepare_suspend = dm_prepare_suspend, .suspend = dm_suspend, .resume = dm_resume, - .complete = dm_complete, .is_idle = dm_is_idle, .wait_for_idle = dm_wait_for_idle, .check_soft_reset = dm_check_soft_reset, From
1dfd2864a1c4909147663e5a27c055f50f7c2796 Mon Sep 17 00:00:00 2001 From: Geoffrey McRae Date: Thu, 28 Aug 2025 22:26:22 +1000 Subject: [PATCH 1298/3206] drm/amd/display: remove oem i2c adapter on finish Fixes a bug where unbinding of the GPU would leave the oem i2c adapter registered, resulting in a null pointer dereference when applications try to access the invalid device. Fixes: 3d5470c97314 ("drm/amd/display/dm: add support for OEM i2c bus") Cc: Harry Wentland Reviewed-by: Alex Deucher Signed-off-by: Geoffrey McRae Signed-off-by: Alex Deucher (cherry picked from commit 89923fb7ead4fdd37b78dd49962d9bb5892403e6) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 352b3dcd0e0e14..4e86370ae70599 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2913,6 +2913,17 @@ static int dm_oem_i2c_hw_init(struct amdgpu_device *adev) return 0; } +static void dm_oem_i2c_hw_fini(struct amdgpu_device *adev) +{ + struct amdgpu_display_manager *dm = &adev->dm; + + if (dm->oem_i2c) { + i2c_del_adapter(&dm->oem_i2c->base); + kfree(dm->oem_i2c); + dm->oem_i2c = NULL; + } +} + /** * dm_hw_init() - Initialize DC device * @ip_block: Pointer to the amdgpu_ip_block for this hw instance. @@ -2963,7 +2974,7 @@ static int dm_hw_fini(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; - kfree(adev->dm.oem_i2c); + dm_oem_i2c_hw_fini(adev); amdgpu_dm_hpd_fini(adev); From ce42a3b581a9db10765eb835840b04dbe7972135 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Mon, 25 Aug 2025 09:50:49 -0400 Subject: [PATCH 1299/3206] drm/amdkfd: fix p2p links bug in topology When creating p2p links, KFD needs to check the XGMI link for two conditions, hive_id and is_sharing_enabled, but the is_sharing_enabled check is missing, so add it to fix the error. Signed-off-by: Eric Huang Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 36cc7d13178d901982da7a122c883861d98da624) --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 4ec73f33535ebf..720b20e842ba43 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1587,7 +1587,8 @@ static int kfd_dev_create_p2p_links(void) break; if (!dev->gpu || !dev->gpu->adev || (dev->gpu->kfd->hive_id && - dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id)) + dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id && + amdgpu_xgmi_get_is_sharing_enabled(dev->gpu->adev, new_dev->gpu->adev))) goto next; /* check if node(s) is/are peer accessible in one direction or bi-direction */ From 53503556273a5ead8b75534085e2dcb46e96f883 Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Wed, 20 Aug 2025 16:10:51 +0800 Subject: [PATCH 1300/3206] amd/amdkfd: correct mem limit calculation for small APUs The current mem limit check leaks some GTT memory (reserved_for_pt + reserved_for_ras + adev->vram_pin_size) for small APUs. Since carveout VRAM is tunable on APUs, there are three cases regarding the carveout VRAM size relative to GTT: 1. 0 < carveout < gtt (apu_prefer_gtt = true, is_app_apu = false); 2. carveout > gtt / 2 (apu_prefer_gtt = false, is_app_apu = false); 3.
0 = carveout (apu_prefer_gtt = true, is_app_apu = true). It doesn't make sense to apply the limitation below in case 1 (the default case, with a small carveout) because the expression below mixes carveout and gtt values: adev->kfd.vram_used[xcp_id] + vram_needed > vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(&adev->vram_pin_size) gtt: kfd.vram_used, vram_needed, vram_size carveout: reserved_for_pt, reserved_for_ras, adev->vram_pin_size In case 1, VRAM allocation will go to the gtt domain, so skip the VRAM check since the ttm_mem_limit check already covers this allocation. Signed-off-by: Yifan Zhang Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit fa7c99f04f6dd299388e9282812b14e95558ac8e) --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 260165bbe3736d..b16cce7c22c373 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -213,19 +213,35 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, spin_lock(&kfd_mem_limit.mem_limit_lock); if (kfd_mem_limit.system_mem_used + system_mem_needed > - kfd_mem_limit.max_system_mem_limit) + kfd_mem_limit.max_system_mem_limit) { pr_debug("Set no_system_mem_limit=1 if using shared memory\n"); + if (!no_system_mem_limit) { + ret = -ENOMEM; + goto release; + } + } - if ((kfd_mem_limit.system_mem_used + system_mem_needed > - kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) || - (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > - kfd_mem_limit.max_ttm_mem_limit) || - (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed > - vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(&adev->vram_pin_size))) { + if (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > + kfd_mem_limit.max_ttm_mem_limit) { ret = -ENOMEM; goto release; } + /*if is_app_apu is false and apu_prefer_gtt is true, it is an APU with + * carve out < gtt. In that case, VRAM allocation will go to gtt domain, skip + * VRAM check since ttm_mem_limit check already cover this allocation + */ + + if (adev && xcp_id >= 0 && (!adev->apu_prefer_gtt || adev->gmc.is_app_apu)) { + uint64_t vram_available = + vram_size - reserved_for_pt - reserved_for_ras - + atomic64_read(&adev->vram_pin_size); + if (adev->kfd.vram_used[xcp_id] + vram_needed > vram_available) { + ret = -ENOMEM; + goto release; + } + } + /* Update memory accounting by decreasing available system * memory, TTM memory and GPU memory as computed above */ @@ -1626,11 +1642,15 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, uint64_t vram_available, system_mem_available, ttm_mem_available; spin_lock(&kfd_mem_limit.mem_limit_lock); - vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id) - - adev->kfd.vram_used_aligned[xcp_id] - - atomic64_read(&adev->vram_pin_size) - - reserved_for_pt - - reserved_for_ras; + if (adev->apu_prefer_gtt && !adev->gmc.is_app_apu) + vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id) + - adev->kfd.vram_used_aligned[xcp_id]; + else + vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id) + - adev->kfd.vram_used_aligned[xcp_id] + - atomic64_read(&adev->vram_pin_size) + - reserved_for_pt + - reserved_for_ras; if (adev->apu_prefer_gtt) { system_mem_available = no_system_mem_limit ?
From 75871a525a596ff4d16c4aebc0018f8d0923c9b1 Mon Sep 17 00:00:00 2001 From: Tianyu Xu Date: Tue, 12 Aug 2025 21:10:56 +0800 Subject: [PATCH 1301/3206] igb: Fix NULL pointer dereference in ethtool loopback test The igb driver currently causes a NULL pointer dereference when executing the ethtool loopback test. This occurs because there is no associated q_vector for the test ring when it is set up, as interrupts are typically not added to the test rings. Since commit 5ef44b3cb43b removed the napi_id assignment in __xdp_rxq_info_reg(), there is no longer a need to pass a napi_id to it. Therefore, simply use 0 as the last parameter. Fixes: 2c6196013f84 ("igb: Add AF_XDP zero-copy Rx support") Reviewed-by: Aleksandr Loktionov Reviewed-by: Joe Damato Signed-off-by: Tianyu Xu Reviewed-by: Paul Menzel Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index a9a7a94ae61e93..453deb6d14b312 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -4453,8 +4453,7 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring) if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) xdp_rxq_info_unreg(&rx_ring->xdp_rxq); res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->queue_index, - rx_ring->q_vector->napi.napi_id); + rx_ring->queue_index, 0); if (res < 0) { dev_err(dev, "Failed to register xdp_rxq index %u\n", rx_ring->queue_index); From d709f178abca22a4d3642513df29afe4323a594b Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Fri, 15 Aug 2025 15:26:31 +0900 Subject: [PATCH 1302/3206] igb: fix link test skipping when interface is admin down The igb driver incorrectly skips the link test when the network interface is admin down (if_running == false), causing the test to always report PASS regardless of the actual physical link state. This behavior is inconsistent with other drivers (e.g. i40e, ice, ixgbe, etc.) which correctly test the physical link state regardless of admin state. Remove the if_running check to ensure link test always reflects the physical link state. 
Fixes: 8d420a1b3ea6 ("igb: correct link test not being run when link is down") Signed-off-by: Kohei Enju Reviewed-by: Paul Menzel Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_ethtool.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 92ef33459aec78..7b8f32c5169aa3 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -2081,11 +2081,8 @@ static void igb_diag_test(struct net_device *netdev, } else { dev_info(&adapter->pdev->dev, "online testing starting\n"); - /* PHY is powered down when interface is down */ - if (if_running && igb_link_test(adapter, &data[TEST_LINK])) + if (igb_link_test(adapter, &data[TEST_LINK])) eth_test->flags |= ETH_TEST_FL_FAILED; - else - data[TEST_LINK] = 0; /* Online tests aren't run; pass by default */ data[TEST_REG] = 0; From 915470e1b44e71d1dd07ee067276f003c3521ee3 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Mon, 18 Aug 2025 17:39:03 +0200 Subject: [PATCH 1303/3206] i40e: fix IRQ freeing in i40e_vsi_request_irq_msix error path If request_irq() in i40e_vsi_request_irq_msix() fails in an iteration later than the first, the error path wants to free the IRQs requested so far. However, it uses the wrong dev_id argument for free_irq(), so it does not free the IRQs correctly and instead triggers the warning: Trying to free already-free IRQ 173 WARNING: CPU: 25 PID: 1091 at kernel/irq/manage.c:1829 __free_irq+0x192/0x2c0 Modules linked in: i40e(+) [...] CPU: 25 UID: 0 PID: 1091 Comm: NetworkManager Not tainted 6.17.0-rc1+ #1 PREEMPT(lazy) Hardware name: [...] RIP: 0010:__free_irq+0x192/0x2c0 [...] Call Trace: free_irq+0x32/0x70 i40e_vsi_request_irq_msix.cold+0x63/0x8b [i40e] i40e_vsi_request_irq+0x79/0x80 [i40e] i40e_vsi_open+0x21f/0x2f0 [i40e] i40e_open+0x63/0x130 [i40e] __dev_open+0xfc/0x210 __dev_change_flags+0x1fc/0x240 netif_change_flags+0x27/0x70 do_setlink.isra.0+0x341/0xc70 rtnl_newlink+0x468/0x860 rtnetlink_rcv_msg+0x375/0x450 netlink_rcv_skb+0x5c/0x110 netlink_unicast+0x288/0x3c0 netlink_sendmsg+0x20d/0x430 ____sys_sendmsg+0x3a2/0x3d0 ___sys_sendmsg+0x99/0xe0 __sys_sendmsg+0x8a/0xf0 do_syscall_64+0x82/0x2c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e [...] ---[ end trace 0000000000000000 ]--- Use the same dev_id for free_irq() as for request_irq(). I tested this with inserting code to fail intentionally. 
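For background, free_irq() identifies the handler to remove by the (irq, dev_id) pair, so it must be given exactly the cookie that was passed to request_irq(). A minimal, hypothetical sketch of the pairing rule (handler and names are illustrative, not taken from this driver):

	#include <linux/interrupt.h>

	static irqreturn_t demo_handler(int irq, void *dev_id)
	{
		/* dev_id is whatever cookie was passed to request_irq() */
		return IRQ_HANDLED;
	}

	static int demo_setup(unsigned int irq_num, void *q_vector)
	{
		int err;

		/* register with a per-vector cookie */
		err = request_irq(irq_num, demo_handler, 0, "demo-TxRx", q_vector);
		if (err)
			return err;

		/* release with the very same cookie; passing the address of the
		 * array slot instead makes the kernel warn that the IRQ is
		 * already free */
		free_irq(irq_num, q_vector);
		return 0;
	}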
Fixes: 493fb30011b3 ("i40e: Move q_vectors from pointer to array to array of pointers") Signed-off-by: Michal Schmidt Reviewed-by: Aleksandr Loktionov Reviewed-by: Subbaraya Sundeep Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index b83f823e49177c..dd21d93d39dd54 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -4156,7 +4156,7 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename) irq_num = pf->msix_entries[base + vector].vector; irq_set_affinity_notifier(irq_num, NULL); irq_update_affinity_hint(irq_num, NULL); - free_irq(irq_num, &vsi->q_vectors[vector]); + free_irq(irq_num, vsi->q_vectors[vector]); } return err; } From 7934fdc25ad642ab3dbc16d734ab58638520ea60 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 4 Sep 2025 12:35:21 +0200 Subject: [PATCH 1304/3206] drm/xe/configfs: Don't touch survivability_mode on fini This is a user controlled configfs attribute, we should not modify that outside the configfs attr.store() implementation. Fixes: bc417e54e24b ("drm/xe: Enable configfs support for survivability mode") Signed-off-by: Michal Wajdeczko Cc: Lucas De Marchi Cc: Riana Tauro Reviewed-by: Stuart Summers Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250904103521.7130-1-michal.wajdeczko@intel.com (cherry picked from commit 079a5c83dbd23db7a6eed8f558cf75e264d8a17b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_survivability_mode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 41705f5d52e3a3..8f7b0add2364f8 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -41,6 +41,8 @@ * * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode * + * It is the responsibility of the user to clear the mode once firmware flash is complete. + * * Refer :ref:`xe_configfs` for more details on how to use configfs * * Survivability mode is indicated by the below admin-only readable sysfs which provides additional @@ -147,7 +149,6 @@ static void xe_survivability_mode_fini(void *arg) struct pci_dev *pdev = to_pci_dev(xe->drm.dev); struct device *dev = &pdev->dev; - xe_configfs_clear_survivability_mode(pdev); sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); } From 5c87fee3c96ce898ad681552404a66c7605193c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Thu, 4 Sep 2025 18:07:13 +0200 Subject: [PATCH 1305/3206] drm/xe: Attempt to bring bos back to VRAM after eviction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VRAM+TT bos that are evicted from VRAM to TT may remain in TT also after a revalidation following eviction or suspend. This manifests itself as applications becoming sluggish after buffer objects get evicted or after a resume from suspend or hibernation. If the bo supports placement in both VRAM and TT, and we are on DGFX, mark the TT placement as fallback. This means that it is tried only after VRAM + eviction. This flaw has probably been present since the xe module was upstreamed but use a Fixes: commit below where backporting is likely to be simple. 
For earlier versions we need to open-code the fallback algorithm in the driver. v2: - Remove check for dgfx. (Matthew Auld) - Update the xe_dma_buf kunit test for the new strategy (CI) - Allow dma-buf to pin in current placement (CI) - Make xe_bo_validate() for pinned bos a NOP. Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/5995 Fixes: a78a8da51b36 ("drm/ttm: replace busy placement with flags v6") Cc: Matthew Brost Cc: Matthew Auld Cc: # v6.9+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Link: https://lore.kernel.org/r/20250904160715.2613-2-thomas.hellstrom@linux.intel.com (cherry picked from commit cb3d7b3b46b799c96b54f8e8fe36794a55a77f0b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/tests/xe_bo.c | 2 +- drivers/gpu/drm/xe/tests/xe_dma_buf.c | 10 +--------- drivers/gpu/drm/xe/xe_bo.c | 16 ++++++++++++---- drivers/gpu/drm/xe/xe_bo.h | 2 +- drivers/gpu/drm/xe/xe_dma_buf.c | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c b/drivers/gpu/drm/xe/tests/xe_bo.c index bb469096d072b5..7b40cc8be1c9c2 100644 --- a/drivers/gpu/drm/xe/tests/xe_bo.c +++ b/drivers/gpu/drm/xe/tests/xe_bo.c @@ -236,7 +236,7 @@ static int evict_test_run_tile(struct xe_device *xe, struct xe_tile *tile, struc } xe_bo_lock(external, false); - err = xe_bo_pin_external(external); + err = xe_bo_pin_external(external, false); xe_bo_unlock(external); if (err) { KUNIT_FAIL(test, "external bo pin err=%pe\n", diff --git a/drivers/gpu/drm/xe/tests/xe_dma_buf.c b/drivers/gpu/drm/xe/tests/xe_dma_buf.c index c53f67ce4b0aa2..121f17c112ec6a 100644 --- a/drivers/gpu/drm/xe/tests/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/tests/xe_dma_buf.c @@ -89,15 +89,7 @@ static void check_residency(struct kunit *test, struct xe_bo *exported, return; } - /* - * If on different devices, the exporter is kept in system if - * possible, saving a migration step as the transfer is just - * likely as fast from system memory. - */ - if (params->mem_mask & XE_BO_FLAG_SYSTEM) - KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, XE_PL_TT)); - else - KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type)); + KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type)); if (params->force_different_devices) KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(imported, XE_PL_TT)); diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 9954bb458ce12d..bae7ff2e59276c 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -186,6 +186,8 @@ static void try_add_system(struct xe_device *xe, struct xe_bo *bo, bo->placements[*c] = (struct ttm_place) { .mem_type = XE_PL_TT, + .flags = (bo_flags & XE_BO_FLAG_VRAM_MASK) ? + TTM_PL_FLAG_FALLBACK : 0, }; *c += 1; } @@ -2269,6 +2271,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res) /** * xe_bo_pin_external - pin an external BO * @bo: buffer object to be pinned + * @in_place: Pin in current placement, don't attempt to migrate. * * Pin an external (not tied to a VM, can be exported via dma-buf / prime FD) * BO. Unique call compared to xe_bo_pin as this function has it own set of @@ -2276,7 +2279,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res) * * Returns 0 for success, negative error code otherwise.
*/ -int xe_bo_pin_external(struct xe_bo *bo) +int xe_bo_pin_external(struct xe_bo *bo, bool in_place) { struct xe_device *xe = xe_bo_device(bo); int err; @@ -2285,9 +2288,11 @@ int xe_bo_pin_external(struct xe_bo *bo) xe_assert(xe, xe_bo_is_user(bo)); if (!xe_bo_is_pinned(bo)) { - err = xe_bo_validate(bo, NULL, false); - if (err) - return err; + if (!in_place) { + err = xe_bo_validate(bo, NULL, false); + if (err) + return err; + } spin_lock(&xe->pinned.lock); list_add_tail(&bo->pinned_link, &xe->pinned.late.external); @@ -2440,6 +2445,9 @@ int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict) }; int ret; + if (xe_bo_is_pinned(bo)) + return 0; + if (vm) { lockdep_assert_held(&vm->lock); xe_vm_assert_held(vm); diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h index 02e8cde4c6b201..9ce94d25201562 100644 --- a/drivers/gpu/drm/xe/xe_bo.h +++ b/drivers/gpu/drm/xe/xe_bo.h @@ -198,7 +198,7 @@ static inline void xe_bo_unlock_vm_held(struct xe_bo *bo) } } -int xe_bo_pin_external(struct xe_bo *bo); +int xe_bo_pin_external(struct xe_bo *bo, bool in_place); int xe_bo_pin(struct xe_bo *bo); void xe_bo_unpin_external(struct xe_bo *bo); void xe_bo_unpin(struct xe_bo *bo); diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index 346f857f38374f..af64baf872ef7b 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -72,7 +72,7 @@ static int xe_dma_buf_pin(struct dma_buf_attachment *attach) return ret; } - ret = xe_bo_pin_external(bo); + ret = xe_bo_pin_external(bo, true); xe_assert(xe, !ret); return 0; From d84820309ed34cc412ce76ecfa9471dae7d7d144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Thu, 4 Sep 2025 18:07:14 +0200 Subject: [PATCH 1306/3206] drm/xe: Allow the pm notifier to continue on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its actions are opportunistic anyway and will be completed on device suspend. Marking as a fix to simplify backporting of the fix that follows in the series. v2: - Keep the runtime pm reference over suspend / hibernate and document why. (Matt Auld, Rodrigo Vivi): Fixes: c6a4d46ec1d7 ("drm/xe: evict user memory in PM notifier") Cc: Matthew Auld Cc: Rodrigo Vivi Cc: # v6.16+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Link: https://lore.kernel.org/r/20250904160715.2613-3-thomas.hellstrom@linux.intel.com (cherry picked from commit ebd546fdffddfcaeab08afdd68ec93052c8fa740) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_pm.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index e279b47ba03bf6..c9d5f0a21c48a1 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -301,17 +301,17 @@ static int xe_pm_notifier_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: xe_pm_runtime_get(xe); err = xe_bo_evict_all_user(xe); - if (err) { + if (err) drm_dbg(&xe->drm, "Notifier evict user failed (%d)\n", err); - xe_pm_runtime_put(xe); - break; - } err = xe_bo_notifier_prepare_all_pinned(xe); - if (err) { + if (err) drm_dbg(&xe->drm, "Notifier prepare pin failed (%d)\n", err); - xe_pm_runtime_put(xe); - } + /* + * Keep the runtime pm reference until post hibernation / post suspend to + * avoid a runtime suspend interfering with evicted objects or backup + * allocations. 
+ */ break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: @@ -320,9 +320,6 @@ static int xe_pm_notifier_callback(struct notifier_block *nb, break; } - if (err) - return NOTIFY_BAD; - return NOTIFY_DONE; } From eb5723a75104605b7d2207a7d598e314166fbef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Thu, 4 Sep 2025 18:07:15 +0200 Subject: [PATCH 1307/3206] drm/xe: Block exec and rebind worker while evicting for suspend / hibernate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the xe pm_notifier evicts for suspend / hibernate, there might be racing tasks trying to re-validate again. This can lead to suspend taking excessive time or get stuck in a live-lock. This behaviour becomes much worse with the fix that actually makes re-validation bring back bos to VRAM rather than letting them remain in TT. Prevent that by having exec and the rebind worker waiting for a completion that is set to block by the pm_notifier before suspend and is signaled by the pm_notifier after resume / wakeup. It's probably still possible to craft malicious applications that block suspending. More work is pending to fix that. v3: - Avoid wait_for_completion() in the kernel worker since it could potentially cause work item flushes from freezable processes to wait forever. Instead terminate the rebind workers if needed and re-launch at resume. (Matt Auld) v4: - Fix some bad naming and leftover debug printouts. - Fix kerneldoc. - Use drmm_mutex_init() for the xe->rebind_resume_lock (Matt Auld). - Rework the interface of xe_vm_rebind_resume_worker (Matt Auld). Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4288 Fixes: c6a4d46ec1d7 ("drm/xe: evict user memory in PM notifier") Cc: Matthew Auld Cc: Rodrigo Vivi Cc: # v6.16+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Link: https://lore.kernel.org/r/20250904160715.2613-4-thomas.hellstrom@linux.intel.com (cherry picked from commit 599334572a5a99111015fbbd5152ce4dedc2f8b7) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device_types.h | 6 ++++ drivers/gpu/drm/xe/xe_exec.c | 9 ++++++ drivers/gpu/drm/xe/xe_pm.c | 25 +++++++++++++++++ drivers/gpu/drm/xe/xe_vm.c | 42 +++++++++++++++++++++++++++- drivers/gpu/drm/xe/xe_vm.h | 2 ++ drivers/gpu/drm/xe/xe_vm_types.h | 5 ++++ 6 files changed, 88 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index d4d2c6854790ca..7ceb0c90f3914c 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -553,6 +553,12 @@ struct xe_device { /** @pm_notifier: Our PM notifier to perform actions in response to various PM events. */ struct notifier_block pm_notifier; + /** @pm_block: Completion to block validating tasks on suspend / hibernate prepare */ + struct completion pm_block; + /** @rebind_resume_list: List of wq items to kick on resume. 
*/ + struct list_head rebind_resume_list; + /** @rebind_resume_lock: Lock to protect the rebind_resume_list */ + struct mutex rebind_resume_lock; /** @pmt: Support the PMT driver callback interface */ struct { diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 44364c042ad72d..374c831e691b2b 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -237,6 +237,15 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) goto err_unlock_list; } + /* + * It's OK to block interruptible here with the vm lock held, since + * on task freezing during suspend / hibernate, the call will + * return -ERESTARTSYS and the IOCTL will be rerun. + */ + err = wait_for_completion_interruptible(&xe->pm_block); + if (err) + goto err_unlock_list; + vm_exec.vm = &vm->gpuvm; vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT; if (xe_vm_in_lr_mode(vm)) { diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index c9d5f0a21c48a1..bb9b6ecad2afcd 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -24,6 +24,7 @@ #include "xe_pcode.h" #include "xe_pxp.h" #include "xe_trace.h" +#include "xe_vm.h" #include "xe_wa.h" /** @@ -290,6 +291,19 @@ static u32 vram_threshold_value(struct xe_device *xe) return DEFAULT_VRAM_THRESHOLD; } +static void xe_pm_wake_rebind_workers(struct xe_device *xe) +{ + struct xe_vm *vm, *next; + + mutex_lock(&xe->rebind_resume_lock); + list_for_each_entry_safe(vm, next, &xe->rebind_resume_list, + preempt.pm_activate_link) { + list_del_init(&vm->preempt.pm_activate_link); + xe_vm_resume_rebind_worker(vm); + } + mutex_unlock(&xe->rebind_resume_lock); +} + static int xe_pm_notifier_callback(struct notifier_block *nb, unsigned long action, void *data) { @@ -299,6 +313,7 @@ static int xe_pm_notifier_callback(struct notifier_block *nb, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: + reinit_completion(&xe->pm_block); xe_pm_runtime_get(xe); err = xe_bo_evict_all_user(xe); if (err) @@ -315,6 +330,8 @@ static int xe_pm_notifier_callback(struct notifier_block *nb, break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: + complete_all(&xe->pm_block); + xe_pm_wake_rebind_workers(xe); xe_bo_notifier_unprepare_all_pinned(xe); xe_pm_runtime_put(xe); break; @@ -341,6 +358,14 @@ int xe_pm_init(struct xe_device *xe) if (err) return err; + err = drmm_mutex_init(&xe->drm, &xe->rebind_resume_lock); + if (err) + goto err_unregister; + + init_completion(&xe->pm_block); + complete_all(&xe->pm_block); + INIT_LIST_HEAD(&xe->rebind_resume_list); + /* For now suspend/resume is only allowed with GuC */ if (!xe_device_uc_enabled(xe)) return 0; diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index d60c4b1153043c..dc4f61e56579cb 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -393,6 +393,9 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec) list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind, &vm->rebind_list); + if (!try_wait_for_completion(&vm->xe->pm_block)) + return -EAGAIN; + ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false); if (ret) return ret; @@ -479,6 +482,33 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm, return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues); } +static bool vm_suspend_rebind_worker(struct xe_vm *vm) +{ + struct xe_device *xe = vm->xe; + bool ret = false; + + mutex_lock(&xe->rebind_resume_lock); + if 
(!try_wait_for_completion(&vm->xe->pm_block)) { + ret = true; + list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list); + } + mutex_unlock(&xe->rebind_resume_lock); + + return ret; +} + +/** + * xe_vm_resume_rebind_worker() - Resume the rebind worker. + * @vm: The vm whose preempt worker to resume. + * + * Resume a preempt worker that was previously suspended by + * vm_suspend_rebind_worker(). + */ +void xe_vm_resume_rebind_worker(struct xe_vm *vm) +{ + queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work); +} + static void preempt_rebind_work_func(struct work_struct *w) { struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work); @@ -502,6 +532,11 @@ static void preempt_rebind_work_func(struct work_struct *w) } retry: + if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) { + up_write(&vm->lock); + return; + } + if (xe_vm_userptr_check_repin(vm)) { err = xe_vm_userptr_pin(vm); if (err) @@ -1714,6 +1749,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef) if (flags & XE_VM_FLAG_LR_MODE) { INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func); xe_pm_runtime_get_noresume(xe); + INIT_LIST_HEAD(&vm->preempt.pm_activate_link); } if (flags & XE_VM_FLAG_FAULT_MODE) { @@ -1895,8 +1931,12 @@ void xe_vm_close_and_put(struct xe_vm *vm) xe_assert(xe, !vm->preempt.num_exec_queues); xe_vm_close(vm); - if (xe_vm_in_preempt_fence_mode(vm)) + if (xe_vm_in_preempt_fence_mode(vm)) { + mutex_lock(&xe->rebind_resume_lock); + list_del_init(&vm->preempt.pm_activate_link); + mutex_unlock(&xe->rebind_resume_lock); flush_work(&vm->preempt.rebind_work); + } if (xe_vm_in_fault_mode(vm)) xe_svm_close(vm); diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h index 2ecb417c19a280..82b1127958071f 100644 --- a/drivers/gpu/drm/xe/xe_vm.h +++ b/drivers/gpu/drm/xe/xe_vm.h @@ -273,6 +273,8 @@ struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo, struct xe_exec_queue *q, u64 addr, enum xe_cache_level cache_lvl); +void xe_vm_resume_rebind_worker(struct xe_vm *vm); + /** * xe_vm_resv() - Return's the vm's reservation object * @vm: The vm diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 8a07feef503bad..6058cf739388bc 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -293,6 +293,11 @@ struct xe_vm { * BOs */ struct work_struct rebind_work; + /** + * @preempt.pm_activate_link: Link to list of rebind workers to be + * kicked on resume. + */ + struct list_head pm_activate_link; } preempt; /** @um: unified memory state */ From fd99415ec8a80866e5e19f7835876e7b4f294946 Mon Sep 17 00:00:00 2001 From: Julia Filipchuk Date: Wed, 3 Sep 2025 12:00:38 -0700 Subject: [PATCH 1308/3206] drm/xe: Extend Wa_13011645652 to PTL-H, WCL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expand workaround to additional graphics architectures. 
Cc: Vinay Belgaumkar Cc: Stuart Summers Cc: Daniele Ceraolo Spurio Cc: Lucas De Marchi Cc: Thomas Hellström Cc: Rodrigo Vivi Cc: intel-xe@lists.freedesktop.org Cc: # v6.17+ Signed-off-by: Julia Filipchuk Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250903190122.1028373-2-julia.filipchuk@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 6fc957185e1691bb6dfa4193698a229db537c2a2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_wa_oob.rules | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules index e990f20eccfe3c..710f4423726c99 100644 --- a/drivers/gpu/drm/xe/xe_wa_oob.rules +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules @@ -30,7 +30,8 @@ 16022287689 GRAPHICS_VERSION(2001) GRAPHICS_VERSION(2004) 13011645652 GRAPHICS_VERSION(2004) - GRAPHICS_VERSION(3001) + GRAPHICS_VERSION_RANGE(3000, 3001) + GRAPHICS_VERSION(3003) 14022293748 GRAPHICS_VERSION_RANGE(2001, 2002) GRAPHICS_VERSION(2004) GRAPHICS_VERSION_RANGE(3000, 3001) From 503f1c72c31bbee21e669a08cf65c49e96d42755 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 27 Aug 2025 14:17:36 -0700 Subject: [PATCH 1309/3206] i40e: fix Jumbo Frame support after iPXE boot The i40e hardware has multiple hardware settings which define the Maximum Frame Size (MFS) of the physical port. The firmware has an AdminQ command (0x0603) to configure the MFS, but the i40e Linux driver never issues this command. In most cases this is no problem, as the NVM default value has the device configured for its maximum value of 9728. Unfortunately, recent versions of the iPXE intelxl driver now issue the 0x0603 Set Mac Config command, modifying the MFS and reducing it from its default value of 9728. This occurred as part of iPXE commit 6871a7de705b ("[intelxl] Use admin queue to set port MAC address and maximum frame size"), a prerequisite change for supporting the E800 series hardware in iPXE. Both the E700 and E800 firmware support the AdminQ command, and the iPXE code shares much of the logic between the two device drivers. The ice E800 Linux driver already issues the 0x0603 Set Mac Config command early during probe, and is thus unaffected by the iPXE change. Since commit 3a2c6ced90e1 ("i40e: Add a check to see if MFS is set"), the i40e driver does check the I40E_PRTGL_SAH register, but it only logs a warning message if its value is below the 9728 default. This register also only covers received packets and not transmitted packets. A warning can inform system administrators, but does not correct the issue. No interactions from userspace cause the driver to write to PRTGL_SAH or issue the 0x0603 AdminQ command. Only a GLOBR reset will restore the value to its default value. There is no obvious method to trigger a GLOBR reset from user space. To fix this, introduce the i40e_aq_set_mac_config() function, similar to the one from the ice driver. Call this during early probe to ensure that the device configuration matches driver expectation. Unlike E800, the E700 firmware also has a bit to control whether the MAC should append CRC data. It is on by default, but setting a 0 to this bit would disable CRC. The i40e implementation must set this bit to ensure CRC will be appended by the MAC. In addition to the AQ command, instead of just checking the I40E_PRTGL_SAH register, update its value to the 9728 default and write it back. 
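The write-back described above is a plain read-modify-write of a single register field. A hedged sketch of the pattern (FIELD_GET()/FIELD_PREP() are from <linux/bitfield.h>; rd32()/wr32() and the register names are the driver's, the conditional is hypothetical):

	#include <linux/bitfield.h>

	static void demo_clamp_mfs(struct i40e_hw *hw)
	{
		u32 val = rd32(hw, I40E_PRTGL_SAH);

		/* force the MFS field back to the 9728-byte default (0x2600) */
		if (FIELD_GET(I40E_PRTGL_SAH_MFS_MASK, val) != MAX_FRAME_SIZE_DEFAULT) {
			val &= ~I40E_PRTGL_SAH_MFS_MASK;
			val |= FIELD_PREP(I40E_PRTGL_SAH_MFS_MASK,
					  MAX_FRAME_SIZE_DEFAULT);
			wr32(hw, I40E_PRTGL_SAH, val);
		}
	}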
This ensures that the hardware is in the expected state, regardless of whether the iPXE (or any other early boot driver) has modified this state. This is a better user experience, as we now fix the issues with larger MTU instead of merely warning. It also aligns with the way the ice E800 series driver works. A final note: The Fixes tag provided here is not strictly accurate. The issue occurs as a result of an external entity (the iPXE intelxl driver), and this is not a regression specifically caused by the mentioned change. However, I believe the original change to just warn about PRTGL_SAH being too low was an insufficient fix. Fixes: 3a2c6ced90e1 ("i40e: Add a check to see if MFS is set") Link: https://github.com/ipxe/ipxe/commit/6871a7de705b6f6a4046f0d19da9bcd689c3bc8e Signed-off-by: Jacob Keller Signed-off-by: Aleksandr Loktionov Reviewed-by: Michal Schmidt Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/i40e/i40e_adminq_cmd.h | 1 + drivers/net/ethernet/intel/i40e/i40e_common.c | 34 +++++++++++++++++++ drivers/net/ethernet/intel/i40e/i40e_main.c | 16 +++++---- .../net/ethernet/intel/i40e/i40e_prototype.h | 2 ++ 4 files changed, 47 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index 76d872b91a383d..cc02a85ad42bae 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -1561,6 +1561,7 @@ I40E_CHECK_CMD_LENGTH(i40e_aq_set_phy_config); struct i40e_aq_set_mac_config { __le16 max_frame_size; u8 params; +#define I40E_AQ_SET_MAC_CONFIG_CRC_EN BIT(2) u8 tx_timer_priority; /* bitmap */ __le16 tx_timer_value; __le16 fc_refresh_threshold; diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 270e7e8cf9cfd1..59f5c1e810eb05 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1189,6 +1189,40 @@ int i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures, return status; } +/** + * i40e_aq_set_mac_config - Configure MAC settings + * @hw: pointer to the hw struct + * @max_frame_size: Maximum Frame Size to be supported by the port + * @cmd_details: pointer to command details structure or NULL + * + * Set MAC configuration (0x0603). Note that max_frame_size must be greater + * than zero. + * + * Return: 0 on success, or a negative error code on failure. 
+ */ +int i40e_aq_set_mac_config(struct i40e_hw *hw, u16 max_frame_size, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_set_mac_config *cmd; + struct libie_aq_desc desc; + + cmd = libie_aq_raw(&desc); + + if (max_frame_size == 0) + return -EINVAL; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_set_mac_config); + + cmd->max_frame_size = cpu_to_le16(max_frame_size); + cmd->params = I40E_AQ_SET_MAC_CONFIG_CRC_EN; + +#define I40E_AQ_SET_MAC_CONFIG_FC_DEFAULT_THRESHOLD 0x7FFF + cmd->fc_refresh_threshold = + cpu_to_le16(I40E_AQ_SET_MAC_CONFIG_FC_DEFAULT_THRESHOLD); + + return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); +} + /** * i40e_aq_clear_pxe_mode * @hw: pointer to the hw struct diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index dd21d93d39dd54..b14019d44b5811 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16045,13 +16045,17 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev_dbg(&pf->pdev->dev, "get supported phy types ret = %pe last_status = %s\n", ERR_PTR(err), libie_aq_str(pf->hw.aq.asq_last_status)); - /* make sure the MFS hasn't been set lower than the default */ #define MAX_FRAME_SIZE_DEFAULT 0x2600 - val = FIELD_GET(I40E_PRTGL_SAH_MFS_MASK, - rd32(&pf->hw, I40E_PRTGL_SAH)); - if (val < MAX_FRAME_SIZE_DEFAULT) - dev_warn(&pdev->dev, "MFS for port %x (%d) has been set below the default (%d)\n", - pf->hw.port, val, MAX_FRAME_SIZE_DEFAULT); + + err = i40e_aq_set_mac_config(hw, MAX_FRAME_SIZE_DEFAULT, NULL); + if (err) + dev_warn(&pdev->dev, "set mac config ret = %pe last_status = %s\n", + ERR_PTR(err), libie_aq_str(pf->hw.aq.asq_last_status)); + + /* Make sure the MFS is set to the expected value */ + val = rd32(hw, I40E_PRTGL_SAH); + FIELD_MODIFY(I40E_PRTGL_SAH_MFS_MASK, &val, MAX_FRAME_SIZE_DEFAULT); + wr32(hw, I40E_PRTGL_SAH, val); /* Add a filter to drop all Flow control frames from any VSI from being * transmitted. 
By doing so we stop a malicious VF from sending out diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index aef5de53ce3bb8..26bb7bffe36101 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -98,6 +98,8 @@ int i40e_aq_set_mac_loopback(struct i40e_hw *hw, struct i40e_asq_cmd_details *cmd_details); int i40e_aq_set_phy_int_mask(struct i40e_hw *hw, u16 mask, struct i40e_asq_cmd_details *cmd_details); +int i40e_aq_set_mac_config(struct i40e_hw *hw, u16 max_frame_size, + struct i40e_asq_cmd_details *cmd_details); int i40e_aq_clear_pxe_mode(struct i40e_hw *hw, struct i40e_asq_cmd_details *cmd_details); int i40e_aq_set_link_restart_an(struct i40e_hw *hw, From bea3e1d4467bcf292c8e54f080353d556d355e26 Mon Sep 17 00:00:00 2001 From: Kang Chen Date: Tue, 9 Sep 2025 11:13:16 +0800 Subject: [PATCH 1310/3206] hfsplus: fix slab-out-of-bounds read in hfsplus_uni2asc() BUG: KASAN: slab-out-of-bounds in hfsplus_uni2asc+0xa71/0xb90 fs/hfsplus/unicode.c:186 Read of size 2 at addr ffff8880289ef218 by task syz.6.248/14290 CPU: 0 UID: 0 PID: 14290 Comm: syz.6.248 Not tainted 6.16.4 #1 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1b0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x5f0 mm/kasan/report.c:482 kasan_report+0xca/0x100 mm/kasan/report.c:595 hfsplus_uni2asc+0xa71/0xb90 fs/hfsplus/unicode.c:186 hfsplus_listxattr+0x5b6/0xbd0 fs/hfsplus/xattr.c:738 vfs_listxattr+0xbe/0x140 fs/xattr.c:493 listxattr+0xee/0x190 fs/xattr.c:924 filename_listxattr fs/xattr.c:958 [inline] path_listxattrat+0x143/0x360 fs/xattr.c:988 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcb/0x4c0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fe0e9fae16d Code: 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fe0eae67f98 EFLAGS: 00000246 ORIG_RAX: 00000000000000c3 RAX: ffffffffffffffda RBX: 00007fe0ea205fa0 RCX: 00007fe0e9fae16d RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000200000000000 RBP: 00007fe0ea0480f0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fe0ea206038 R14: 00007fe0ea205fa0 R15: 00007fe0eae48000 Allocated by task 14290: kasan_save_stack+0x24/0x50 mm/kasan/common.c:47 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4333 [inline] __kmalloc_noprof+0x219/0x540 mm/slub.c:4345 kmalloc_noprof include/linux/slab.h:909 [inline] hfsplus_find_init+0x95/0x1f0 fs/hfsplus/bfind.c:21 hfsplus_listxattr+0x331/0xbd0 fs/hfsplus/xattr.c:697 vfs_listxattr+0xbe/0x140 fs/xattr.c:493 listxattr+0xee/0x190 fs/xattr.c:924 filename_listxattr fs/xattr.c:958 [inline] path_listxattrat+0x143/0x360 fs/xattr.c:988 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcb/0x4c0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f When hfsplus_uni2asc is called from hfsplus_listxattr, it actually passes in a struct hfsplus_attr_unistr*. 
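For reference, simplified sketches of the two on-disk string types involved (modeled on fs/hfsplus/hfsplus_raw.h; the exact bounds of 255 and 127 code units are stated here as an assumption):

	/* catalog names: up to HFSPLUS_MAX_STRLEN (255) code units */
	struct hfsplus_unistr {
		__be16 length;
		hfsplus_unichr unicode[255];
	} __packed;

	/* extended-attribute names: only HFSPLUS_ATTR_MAX_STRLEN (127) code units */
	struct hfsplus_attr_unistr {
		__be16 length;
		hfsplus_unichr unicode[127];
	} __packed;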
The size of the corresponding structure is different from that of hfsplus_unistr, so the previous fix (94458781aee6) is insufficient. The pointer into the unicode buffer still goes beyond the allocated memory. This patch introduces two wrapper functions, hfsplus_uni2asc_xattr_str and hfsplus_uni2asc_str, to process the two unicode buffers, struct hfsplus_attr_unistr* and struct hfsplus_unistr* respectively. When the ustrlen value is bigger than the allocated memory size, the ustrlen value is limited to a safe size. Fixes: 94458781aee6 ("hfsplus: fix slab-out-of-bounds read in hfsplus_uni2asc()") Signed-off-by: Kang Chen Reviewed-by: Viacheslav Dubeyko Signed-off-by: Viacheslav Dubeyko Link: https://lore.kernel.org/r/20250909031316.1647094-1-k.chen@smail.nju.edu.cn Signed-off-by: Viacheslav Dubeyko --- fs/hfsplus/dir.c | 2 +- fs/hfsplus/hfsplus_fs.h | 8 ++++++-- fs/hfsplus/unicode.c | 24 +++++++++++++++++++----- fs/hfsplus/xattr.c | 6 +++--- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 876bbb80fb4dce..1b3e27a0d5e038 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -204,7 +204,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) fd.entrylength); type = be16_to_cpu(entry.type); len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN; - err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); + err = hfsplus_uni2asc_str(sb, &fd.key->cat.name, strbuf, &len); if (err) goto out; if (type == HFSPLUS_FOLDER) { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 49965cd4526124..9dd18de0bc891b 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -521,8 +521,12 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2); int hfsplus_strcmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2); -int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, - char *astr, int *len_p); +int hfsplus_uni2asc_str(struct super_block *sb, + const struct hfsplus_unistr *ustr, char *astr, + int *len_p); +int hfsplus_uni2asc_xattr_str(struct super_block *sb, + const struct hfsplus_attr_unistr *ustr, + char *astr, int *len_p); int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len); int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 36b6cf2a3abba4..862ba27f1628a8 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -119,9 +119,8 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) return NULL; } -int hfsplus_uni2asc(struct super_block *sb, - const struct hfsplus_unistr *ustr, - char *astr, int *len_p) +static int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, + int max_len, char *astr, int *len_p) { const hfsplus_unichr *ip; struct nls_table *nls = HFSPLUS_SB(sb)->nls; @@ -134,8 +133,8 @@ int hfsplus_uni2asc(struct super_block *sb, ip = ustr->unicode; ustrlen = be16_to_cpu(ustr->length); - if (ustrlen > HFSPLUS_MAX_STRLEN) { - ustrlen = HFSPLUS_MAX_STRLEN; + if (ustrlen > max_len) { + ustrlen = max_len; pr_err("invalid length %u has been corrected to %d\n", be16_to_cpu(ustr->length), ustrlen); } @@ -256,6 +255,21 @@ int hfsplus_uni2asc(struct super_block *sb, return res; } +inline int hfsplus_uni2asc_str(struct super_block *sb, + const struct hfsplus_unistr *ustr, char *astr, + int *len_p) +{ + return hfsplus_uni2asc(sb, ustr, HFSPLUS_MAX_STRLEN, astr,
len_p); +} + +inline int hfsplus_uni2asc_xattr_str(struct super_block *sb, + const struct hfsplus_attr_unistr *ustr, + char *astr, int *len_p) +{ + return hfsplus_uni2asc(sb, (const struct hfsplus_unistr *)ustr, + HFSPLUS_ATTR_MAX_STRLEN, astr, len_p); +} + /* * Convert one or more ASCII characters into a single unicode character. * Returns the number of ASCII characters corresponding to the unicode char. diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 18dc3d254d218c..c951fa9835aa12 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -735,9 +735,9 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) goto end_listxattr; xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN; - if (hfsplus_uni2asc(inode->i_sb, - (const struct hfsplus_unistr *)&fd.key->attr.key_name, - strbuf, &xattr_name_len)) { + if (hfsplus_uni2asc_xattr_str(inode->i_sb, + &fd.key->attr.key_name, strbuf, + &xattr_name_len)) { pr_err("unicode conversion failed\n"); res = -EIO; goto end_listxattr; From c33c43f71bda362b292a6e57ac41b64342dc87b3 Mon Sep 17 00:00:00 2001 From: Ming Wang Date: Tue, 9 Sep 2025 20:58:40 +0800 Subject: [PATCH 1311/3206] irqchip/loongson-pch-lpc: Use legacy domain for PCH-LPC IRQ controller On certain Loongson platforms, drivers attempting to request a legacy ISA IRQ directly via request_irq() (e.g., IRQ 4) may fail. The virtual IRQ descriptor is not fully initialized and lacks a valid irqchip. This issue does not affect ACPI-enumerated devices described in DSDT, as their interrupts are properly mapped via the GSI translation path. This indicates the LPC irqdomain itself is functional but is not correctly handling direct VIRQ-to-HWIRQ mappings. The root cause is the use of irq_domain_create_linear(). This API sets up a domain for dynamic, on-demand mapping, typically triggered by a GSI request. It does not pre-populate the mappings for the legacy VIRQ range (0-15). Consequently, if no ACPI device claims a specific GSI (e.g., GSI 4), the corresponding VIRQ (e.g., VIRQ 4) is never mapped to the LPC domain. A direct call to request_irq(4, ...) then fails because the kernel cannot resolve this VIRQ to a hardware interrupt managed by the LPC controller. The PCH-LPC interrupt controller is an i8259-compatible legacy device that requires a deterministic, static 1-to-1 mapping for IRQs 0-15 to support legacy drivers. Fix this by replacing irq_domain_create_linear() with irq_domain_create_legacy(). This API is specifically designed for such controllers. It establishes the required static 1-to-1 VIRQ-to-HWIRQ mapping for the entire legacy range (0-15) immediately upon domain creation. This ensures that any VIRQ in this range is always resolvable, making direct calls to request_irq() for legacy IRQs function correctly. Signed-off-by: Ming Wang Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-loongson-pch-lpc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-loongson-pch-lpc.c b/drivers/irqchip/irq-loongson-pch-lpc.c index 2d4c3ec128b8f2..912bf50a5c7ca7 100644 --- a/drivers/irqchip/irq-loongson-pch-lpc.c +++ b/drivers/irqchip/irq-loongson-pch-lpc.c @@ -200,8 +200,13 @@ int __init pch_lpc_acpi_init(struct irq_domain *parent, goto iounmap_base; } - priv->lpc_domain = irq_domain_create_linear(irq_handle, LPC_COUNT, - &pch_lpc_domain_ops, priv); + /* + * The LPC interrupt controller is a legacy i8259-compatible device, + * which requires a static 1:1 mapping for IRQs 0-15. 
+	 * Use irq_domain_create_legacy to establish this static mapping early.
+	 */
+	priv->lpc_domain = irq_domain_create_legacy(irq_handle, LPC_COUNT, 0, 0,
+						    &pch_lpc_domain_ops, priv);
 	if (!priv->lpc_domain) {
 		pr_err("Failed to create IRQ domain\n");
 		goto free_irq_handle;

From 7838fb5f119191403560eca2e23613380c0e425e Mon Sep 17 00:00:00 2001
From: Alex Deucher
Date: Thu, 4 Sep 2025 12:35:05 -0400
Subject: [PATCH 1312/3206] drm/amdgpu: fix a memory leak in fence cleanup when
 unloading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit b61badd20b44 ("drm/amdgpu: fix usage slab after free") reordered
when amdgpu_fence_driver_sw_fini() was called. After that patch,
amdgpu_fence_driver_sw_fini() effectively became a no-op, as the sched
entities were never freed because the ring pointers were already set to
NULL. Remove the NULL setting.

Reported-by: Lin.Cao
Cc: Vitaly Prosyak
Cc: Christian König
Fixes: b61badd20b44 ("drm/amdgpu: fix usage slab after free")
Reviewed-by: Christian König
Signed-off-by: Alex Deucher
(cherry picked from commit a525fa37aac36c4591cc8b07ae8957862415fbd5)
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 6379bb25bf5ce3..486c3646710cc4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -421,8 +421,6 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring)
 	dma_fence_put(ring->vmid_wait);
 	ring->vmid_wait = NULL;
 	ring->me = 0;
-
-	ring->adev->rings[ring->idx] = NULL;
 }
 
 /**

From 1d66c3f2b8c0b5c51f3f4fe29b362c9851190c5a Mon Sep 17 00:00:00 2001
From: Alex Deucher
Date: Wed, 3 Sep 2025 09:11:12 -0400
Subject: [PATCH 1313/3206] drm/amd/display: use udelay rather than fsleep

This function can be called from an atomic context so we can't use
fsleep().

Fixes: 01f60348d8fb ("drm/amd/display: Fix 'failed to blank crtc!'")
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4549
Cc: Wen Chen
Cc: Fangzhi Zuo
Cc: Nicholas Kazlauskas
Cc: Harry Wentland
Reviewed-by: Harry Wentland
Signed-off-by: Alex Deucher
(cherry picked from commit 27e4dc2c0543fd1808cc52bd888ee1e0533c4a2e)
---
 drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
index 3207addbd4ebb3..b7c2d3095b25c5 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
@@ -955,7 +955,7 @@ enum dc_status dcn20_enable_stream_timing(
 		return DC_ERROR_UNEXPECTED;
 	}
 
-	fsleep(stream->timing.v_total * (stream->timing.h_total * 10000u / stream->timing.pix_clk_100hz));
+	udelay(stream->timing.v_total * (stream->timing.h_total * 10000u / stream->timing.pix_clk_100hz));
 
 	params.vertical_total_min = stream->adjust.v_total_min;
 	params.vertical_total_max = stream->adjust.v_total_max;

From 857ccfc19f9be1269716f3d681650c1bd149a656 Mon Sep 17 00:00:00 2001
From: Pratap Nirujogi
Date: Wed, 3 Sep 2025 16:00:24 -0400
Subject: [PATCH 1314/3206] drm/amd/amdgpu: Declare isp firmware binary file

Declare isp firmware file isp_4_1_1.bin required by isp4.1.1 device.
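For context, MODULE_FIRMWARE() only records the firmware name in the
module's metadata so that tools such as depmod and initramfs generators
know to bundle the binary; the load itself still happens at runtime via
request_firmware(). A minimal sketch of the pairing (illustrative names,
not taken from this driver):

  #include <linux/firmware.h>
  #include <linux/module.h>

  /* Advertise the blob so packaging tools can pick it up. */
  MODULE_FIRMWARE("vendor/example_fw.bin");

  static int example_load_fw(struct device *dev)
  {
  	const struct firmware *fw;
  	int ret;

  	/* Looks the file up under /lib/firmware at runtime. */
  	ret = request_firmware(&fw, "vendor/example_fw.bin", dev);
  	if (ret)
  		return ret;
  	/* ... upload fw->data (fw->size bytes) to the device ... */
  	release_firmware(fw);
  	return 0;
  }

Without the declaration, the file can be missing from the initramfs at
early boot even though the runtime request_firmware() call is correct.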
Suggested-by: Alexey Zagorodnikov
Reviewed-by: Mario Limonciello
Signed-off-by: Pratap Nirujogi
Signed-off-by: Alex Deucher
(cherry picked from commit d97b74a833eba1f4f69f67198fd98ef036c0e5f9)
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c b/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c
index a887df52041407..4258d3e0b706c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c
@@ -29,6 +29,8 @@
 #include "amdgpu.h"
 #include "isp_v4_1_1.h"
 
+MODULE_FIRMWARE("amdgpu/isp_4_1_1.bin");
+
 #define ISP_PERFORMANCE_STATE_LOW	0
 #define ISP_PERFORMANCE_STATE_HIGH	1

From 2b10cb58d7a3fd621ec9b2ba765a092e562ef998 Mon Sep 17 00:00:00 2001
From: David Rosca
Date: Mon, 18 Aug 2025 09:06:58 +0200
Subject: [PATCH 1315/3206] drm/amdgpu/vcn4: Fix IB parsing with multiple
 engine info packages

There can be multiple engine info packages in one IB, and the first one
may be a common engine, not decode/encode. We need to parse the entire
IB instead of stopping after finding the first engine info.

Signed-off-by: David Rosca
Reviewed-by: Leo Liu
Signed-off-by: Alex Deucher
(cherry picked from commit dc8f9f0f45166a6b37864e7a031c726981d6e5fc)
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 52 +++++++++++----------------
 1 file changed, 21 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 1924e075b66f41..311fb595bd6930 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -1907,22 +1907,16 @@ static int vcn_v4_0_dec_msg(struct amdgpu_cs_parser *p, struct amdgpu_job *job,
 
 #define RADEON_VCN_ENGINE_TYPE_ENCODE			(0x00000002)
 #define RADEON_VCN_ENGINE_TYPE_DECODE			(0x00000003)
-
 #define RADEON_VCN_ENGINE_INFO				(0x30000001)
-#define RADEON_VCN_ENGINE_INFO_MAX_OFFSET		16
-
 #define RENCODE_ENCODE_STANDARD_AV1			2
 #define RENCODE_IB_PARAM_SESSION_INIT			0x00000003
-#define RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET	64
 
-/* return the offset in ib if id is found, -1 otherwise
- * to speed up the searching we only search upto max_offset
- */
-static int vcn_v4_0_enc_find_ib_param(struct amdgpu_ib *ib, uint32_t id, int max_offset)
+/* return the offset in ib if id is found, -1 otherwise */
+static int vcn_v4_0_enc_find_ib_param(struct amdgpu_ib *ib, uint32_t id, int start)
 {
 	int i;
 
-	for (i = 0; i < ib->length_dw && i < max_offset && ib->ptr[i] >= 8; i += ib->ptr[i]/4) {
+	for (i = start; i < ib->length_dw && ib->ptr[i] >= 8; i += ib->ptr[i] / 4) {
 		if (ib->ptr[i + 1] == id)
 			return i;
 	}
@@ -1937,33 +1931,29 @@ static int vcn_v4_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p,
 	struct amdgpu_vcn_decode_buffer *decode_buffer;
 	uint64_t addr;
 	uint32_t val;
-	int idx;
+	int idx = 0, sidx;
 
 	/* The first instance can decode anything */
 	if (!ring->me)
 		return 0;
 
-	/* RADEON_VCN_ENGINE_INFO is at the top of ib block */
-	idx = vcn_v4_0_enc_find_ib_param(ib, RADEON_VCN_ENGINE_INFO,
-			RADEON_VCN_ENGINE_INFO_MAX_OFFSET);
-	if (idx < 0) /* engine info is missing */
-		return 0;
-
-	val = amdgpu_ib_get_value(ib, idx + 2); /* RADEON_VCN_ENGINE_TYPE */
-	if (val == RADEON_VCN_ENGINE_TYPE_DECODE) {
-		decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[idx + 6];
-
-		if (!(decode_buffer->valid_buf_flag & 0x1))
-			return 0;
-
-		addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 |
-			decode_buffer->msg_buffer_address_lo;
-		return vcn_v4_0_dec_msg(p, job, addr);
-	}
else if (val == RADEON_VCN_ENGINE_TYPE_ENCODE) { - idx = vcn_v4_0_enc_find_ib_param(ib, RENCODE_IB_PARAM_SESSION_INIT, - RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET); - if (idx >= 0 && ib->ptr[idx + 2] == RENCODE_ENCODE_STANDARD_AV1) - return vcn_v4_0_limit_sched(p, job); + while ((idx = vcn_v4_0_enc_find_ib_param(ib, RADEON_VCN_ENGINE_INFO, idx)) >= 0) { + val = amdgpu_ib_get_value(ib, idx + 2); /* RADEON_VCN_ENGINE_TYPE */ + if (val == RADEON_VCN_ENGINE_TYPE_DECODE) { + decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[idx + 6]; + + if (!(decode_buffer->valid_buf_flag & 0x1)) + return 0; + + addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 | + decode_buffer->msg_buffer_address_lo; + return vcn_v4_0_dec_msg(p, job, addr); + } else if (val == RADEON_VCN_ENGINE_TYPE_ENCODE) { + sidx = vcn_v4_0_enc_find_ib_param(ib, RENCODE_IB_PARAM_SESSION_INIT, idx); + if (sidx >= 0 && ib->ptr[sidx + 2] == RENCODE_ENCODE_STANDARD_AV1) + return vcn_v4_0_limit_sched(p, job); + } + idx += ib->ptr[idx] / 4; } return 0; } From 3318f2d20ce48849855df5e190813826d0bc3653 Mon Sep 17 00:00:00 2001 From: David Rosca Date: Mon, 18 Aug 2025 09:18:37 +0200 Subject: [PATCH 1316/3206] drm/amdgpu/vcn: Allow limiting ctx to instance 0 for AV1 at any time There is no reason to require this to happen on first submitted IB only. We need to wait for the queue to be idle, but it can be done at any time (including when there are multiple video sessions active). Signed-off-by: David Rosca Reviewed-by: Leo Liu Signed-off-by: Alex Deucher (cherry picked from commit 8908fdce0634a623404e9923ed2f536101a39db5) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 12 ++++++++---- drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c index 4b8f4407047fc0..2811226b0ea5dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c @@ -1888,15 +1888,19 @@ static int vcn_v3_0_limit_sched(struct amdgpu_cs_parser *p, struct amdgpu_job *job) { struct drm_gpu_scheduler **scheds; - - /* The create msg must be in the first IB submitted */ - if (atomic_read(&job->base.entity->fence_seq)) - return -EINVAL; + struct dma_fence *fence; /* if VCN0 is harvested, we can't support AV1 */ if (p->adev->vcn.harvest_config & AMDGPU_VCN_HARVEST_VCN0) return -EINVAL; + /* wait for all jobs to finish before switching to instance 0 */ + fence = amdgpu_ctx_get_fence(p->ctx, job->base.entity, ~0ull); + if (fence) { + dma_fence_wait(fence, false); + dma_fence_put(fence); + } + scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_DEC] [AMDGPU_RING_PRIO_DEFAULT].sched; drm_sched_entity_modify_sched(job->base.entity, scheds, 1); diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index 311fb595bd6930..706f3b2f484f7c 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -1808,15 +1808,19 @@ static int vcn_v4_0_limit_sched(struct amdgpu_cs_parser *p, struct amdgpu_job *job) { struct drm_gpu_scheduler **scheds; - - /* The create msg must be in the first IB submitted */ - if (atomic_read(&job->base.entity->fence_seq)) - return -EINVAL; + struct dma_fence *fence; /* if VCN0 is harvested, we can't support AV1 */ if (p->adev->vcn.harvest_config & AMDGPU_VCN_HARVEST_VCN0) return -EINVAL; + /* wait for all jobs to finish before switching to instance 0 */ + fence = amdgpu_ctx_get_fence(p->ctx, 
job->base.entity, ~0ull);
+	if (fence) {
+		dma_fence_wait(fence, false);
+		dma_fence_put(fence);
+	}
+
 	scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_ENC]
 		[AMDGPU_RING_PRIO_0].sched;
 	drm_sched_entity_modify_sched(job->base.entity, scheds, 1);

From f9bb6ffa7f5ad0f8ee0f53fc4a10655872ee4a14 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 29 Aug 2025 16:36:56 +0200
Subject: [PATCH 1317/3206] bpf: Fix out-of-bounds dynptr write in
 bpf_crypto_crypt

Stanislav reported that in bpf_crypto_crypt() the destination dynptr's
size is not validated to be at least as large as the source dynptr's
size before calling into the crypto backend with 'len = src_len'. This
can result in an OOB write when the destination is smaller than the
source.

Concretely, in the mentioned function, psrc and pdst are both linear
buffers fetched from each dynptr:

  psrc = __bpf_dynptr_data(src, src_len);
  [...]
  pdst = __bpf_dynptr_data_rw(dst, dst_len);
  [...]
  err = decrypt ?
        ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv) :
        ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv);

The crypto backend expects pdst to be large enough to hold src_len
bytes of output. Add an additional src_len > dst_len check and bail
out if that is the case. Note that these kfuncs are accessible under
root privileges only.

Fixes: 3e1c6f35409f ("bpf: make common crypto API for TC/XDP programs")
Reported-by: Stanislav Fort
Signed-off-by: Daniel Borkmann
Cc: Vadim Fedorenko
Reviewed-by: Vadim Fedorenko
Link: https://lore.kernel.org/r/20250829143657.318524-1-daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/crypto.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
index 94854cd9c4cc32..83c4d9943084b9 100644
--- a/kernel/bpf/crypto.c
+++ b/kernel/bpf/crypto.c
@@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
 	siv_len = siv ? __bpf_dynptr_size(siv) : 0;
 	src_len = __bpf_dynptr_size(src);
 	dst_len = __bpf_dynptr_size(dst);
-	if (!src_len || !dst_len)
+	if (!src_len || !dst_len || src_len > dst_len)
 		return -EINVAL;
 
 	if (siv_len != ctx->siv_len)

From 3aa9b9a165d5e9afc7d8b5dbcd508810c05c8e89 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 29 Aug 2025 16:36:57 +0200
Subject: [PATCH 1318/3206] selftests/bpf: Extend crypto_sanity selftest with
 invalid dst buffer

Small cleanup and test extension to probe the bpf_crypto_{encrypt,decrypt}()
kfuncs when a bad dst buffer is passed in, to assert that an error is
returned. Also, encrypt_sanity() and skb_crypto_setup() were explicit to
set the global status variable to zero before any test, so do the same
for decrypt_sanity(). Do not explicitly zero the on-stack err before
bpf_crypto_ctx_create() given the kfunc is expected to do it internally
for the success case.

Before kernel fix:

  # ./vmtest.sh -- ./test_progs -t crypto
  [...]
  [ 1.531200] bpf_testmod: loading out-of-tree module taints kernel.
[ 1.533388] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel #87/1 crypto_basic/crypto_release:OK #87/2 crypto_basic/crypto_acquire:OK #87 crypto_basic:OK test_crypto_sanity:PASS:skel open 0 nsec test_crypto_sanity:PASS:ip netns add crypto_sanity_ns 0 nsec test_crypto_sanity:PASS:ip -net crypto_sanity_ns -6 addr add face::1/128 dev lo nodad 0 nsec test_crypto_sanity:PASS:ip -net crypto_sanity_ns link set dev lo up 0 nsec test_crypto_sanity:PASS:open_netns 0 nsec test_crypto_sanity:PASS:AF_ALG init fail 0 nsec test_crypto_sanity:PASS:if_nametoindex lo 0 nsec test_crypto_sanity:PASS:skb_crypto_setup fd 0 nsec test_crypto_sanity:PASS:skb_crypto_setup 0 nsec test_crypto_sanity:PASS:skb_crypto_setup retval 0 nsec test_crypto_sanity:PASS:skb_crypto_setup status 0 nsec test_crypto_sanity:PASS:create qdisc hook 0 nsec test_crypto_sanity:PASS:make_sockaddr 0 nsec test_crypto_sanity:PASS:attach encrypt filter 0 nsec test_crypto_sanity:PASS:encrypt socket 0 nsec test_crypto_sanity:PASS:encrypt send 0 nsec test_crypto_sanity:FAIL:encrypt status unexpected error: -5 (errno 95) #88 crypto_sanity:FAIL Summary: 1/2 PASSED, 0 SKIPPED, 1 FAILED After kernel fix: # ./vmtest.sh -- ./test_progs -t crypto [...] [ 1.540963] bpf_testmod: loading out-of-tree module taints kernel. [ 1.542404] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel #87/1 crypto_basic/crypto_release:OK #87/2 crypto_basic/crypto_acquire:OK #87 crypto_basic:OK #88 crypto_sanity:OK Summary: 2/2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Cc: Vadim Fedorenko Link: https://lore.kernel.org/r/20250829143657.318524-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/crypto_sanity.c | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c index 645be6cddf36cd..dfd8a258f14a88 100644 --- a/tools/testing/selftests/bpf/progs/crypto_sanity.c +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -14,7 +14,7 @@ unsigned char key[256] = {}; u16 udp_test_port = 7777; u32 authsize, key_len; char algo[128] = {}; -char dst[16] = {}; +char dst[16] = {}, dst_bad[8] = {}; int status; static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) @@ -59,10 +59,9 @@ int skb_crypto_setup(void *ctx) .authsize = authsize, }; struct bpf_crypto_ctx *cctx; - int err = 0; + int err; status = 0; - if (key_len > 256) { status = -EINVAL; return 0; @@ -70,8 +69,8 @@ int skb_crypto_setup(void *ctx) __builtin_memcpy(¶ms.algo, algo, sizeof(algo)); __builtin_memcpy(¶ms.key, key, sizeof(key)); - cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); if (!cctx) { status = err; return 0; @@ -80,7 +79,6 @@ int skb_crypto_setup(void *ctx) err = crypto_ctx_insert(cctx); if (err && err != -EEXIST) status = err; - return 0; } @@ -92,6 +90,7 @@ int decrypt_sanity(struct __sk_buff *skb) struct bpf_dynptr psrc, pdst; int err; + status = 0; err = skb_dynptr_validate(skb, &psrc); if (err < 0) { status = err; @@ -110,13 +109,23 @@ int decrypt_sanity(struct __sk_buff *skb) return TC_ACT_SHOT; } - /* dst is a global variable to make testing part easier to check. In real - * production code, a percpu map should be used to store the result. + /* Check also bad case where the dst buffer is smaller than the + * skb's linear section. 
+	 */
+	bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst);
+	status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL);
+	if (!status)
+		status = -EIO;
+	if (status != -EINVAL)
+		goto err;
+
+	/* dst is a global variable to make testing part easier to check.
+	 * In real production code, a percpu map should be used to store
+	 * the result.
 	 */
 	bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst);
-	status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL);
-
+err:
 	return TC_ACT_SHOT;
 }
 
@@ -129,7 +138,6 @@ int encrypt_sanity(struct __sk_buff *skb)
 	int err;
 
 	status = 0;
-
 	err = skb_dynptr_validate(skb, &psrc);
 	if (err < 0) {
 		status = err;
@@ -148,13 +156,23 @@ int encrypt_sanity(struct __sk_buff *skb)
 		return TC_ACT_SHOT;
 	}
 
-	/* dst is a global variable to make testing part easier to check. In real
-	 * production code, a percpu map should be used to store the result.
+	/* Check also bad case where the dst buffer is smaller than the
+	 * skb's linear section.
+	 */
+	bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst);
+	status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL);
+	if (!status)
+		status = -EIO;
+	if (status != -EINVAL)
+		goto err;
+
+	/* dst is a global variable to make testing part easier to check.
+	 * In real production code, a percpu map should be used to store
+	 * the result.
 	 */
 	bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst);
-	status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL);
-
+err:
 	return TC_ACT_SHOT;
 }

From 7edfc024708258d75f65fadffd7e5f6ac46810b6 Mon Sep 17 00:00:00 2001
From: Rong Tao
Date: Sat, 30 Aug 2025 00:31:58 +0800
Subject: [PATCH 1319/3206] bpf: Fix bpf_strnstr() to handle suffix match cases
 better

bpf_strnstr() should not treat the ending '\0' of s2 as a matching
character if the parameter 'len' is equal to the string length of s2,
for example:

  1. bpf_strnstr("openat", "open", 4) = -ENOENT
  2. bpf_strnstr("openat", "open", 5) = 0

This patch makes (1) return 0, fixing not just the `len == strlen(s2)`
case but also the more general case where s2 is a suffix of the first
len characters of s1.

Fixes: e91370550f1f ("bpf: Add kfuncs for read-only string operations")
Signed-off-by: Rong Tao
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/tencent_17DC57B9D16BC443837021BEACE84B7C1507@qq.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/helpers.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6b4877e85a68c9..b9b0c5fe33f61a 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3664,10 +3664,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
 
 	guard(pagefault)();
 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
-		for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) {
+		for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
 			__get_kernel_nofault(&c2, s2__ign + j, char, err_out);
 			if (c2 == '\0')
 				return i;
+			/*
+			 * We allow reading an extra byte from s2 (note the
+			 * `i + j <= len` above) to cover the case when s2 is
+			 * a suffix of the first len chars of s1.
+			 */
+			if (i + j == len)
+				break;
 			__get_kernel_nofault(&c1, s1__ign + j, char, err_out);
 			if (c1 == '\0')
 				return -ENOENT;

From 5d40c038c879eeb910039adeaf6102e1c4dda807 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich
Date: Fri, 29 Aug 2025 04:53:57 +0200
Subject: [PATCH 1320/3206] selftests/bpf: Fix "expression result unused"
 warnings with icecc

icecc is a compiler wrapper that distributes compile jobs over a build
farm [1]. It works by sending toolchain binaries and preprocessed source
code to remote machines.
Unfortunately using it with BPF selftests causes build failures due to a clang bug [2]. The problem is that clang suppresses the -Wunused-value warning if the unused expression comes from a macro expansion. Since icecc compiles preprocessed source code, this information is not available. This leads to -Wunused-value false positives. obj_new_no_struct() and obj_new_acq() use the bpf_obj_new() macro and discard the result. arena_spin_lock_slowpath() uses two macros that produce values and ignores the results. Add (void) casts to explicitly indicate that this is intentional and suppress the warning. An alternative solution is to change the macros to not produce values. This would work today for the arena_spin_lock_slowpath() issue, but in the future there may appear users who need them. Another potential solution is to replace these macros with functions. Unfortunately this would not work, because these macros work with unknown types and control flow. [1] https://github.com/icecc/icecream [2] https://github.com/llvm/llvm-project/issues/142614 Signed-off-by: Ilya Leoshkevich Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250829030017.102615-2-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h | 4 ++-- tools/testing/selftests/bpf/progs/linked_list_fail.c | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h index d67466c1ff7754..f90531cf3ee59e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h +++ b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h @@ -302,7 +302,7 @@ int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val * barriers. */ if (val & _Q_LOCKED_MASK) - smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + (void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); /* * take ownership and clear the pending bit. @@ -380,7 +380,7 @@ int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val /* Link @node into the waitqueue. 
*/ WRITE_ONCE(prev->next, node); - arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + (void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); /* * While waiting for the MCS lock, the next pointer may have diff --git a/tools/testing/selftests/bpf/progs/linked_list_fail.c b/tools/testing/selftests/bpf/progs/linked_list_fail.c index 6438982b928bdc..ddd26d1a083f72 100644 --- a/tools/testing/selftests/bpf/progs/linked_list_fail.c +++ b/tools/testing/selftests/bpf/progs/linked_list_fail.c @@ -226,8 +226,7 @@ int obj_new_no_composite(void *ctx) SEC("?tc") int obj_new_no_struct(void *ctx) { - - bpf_obj_new(union { int data; unsigned udata; }); + (void)bpf_obj_new(union { int data; unsigned udata; }); return 0; } @@ -252,7 +251,7 @@ int new_null_ret(void *ctx) SEC("?tc") int obj_new_acq(void *ctx) { - bpf_obj_new(struct foo); + (void)bpf_obj_new(struct foo); return 0; } From 6624fb2f3382271953f951d46f2ea30415a0917e Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sat, 30 Aug 2025 00:32:13 +0800 Subject: [PATCH 1321/3206] selftests/bpf: Add tests for bpf_strnstr Add tests for bpf_strnstr(): bpf_strnstr("", "", 0) = 0 bpf_strnstr("hello world", "hello", 5) = 0 bpf_strnstr(str, "hello", 4) = -ENOENT bpf_strnstr("", "a", 0) = -ENOENT Signed-off-by: Rong Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/tencent_2ED218F8082565C95D86A804BDDA8DBA200A@qq.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/string_kfuncs_success.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index 46697f3818789a..a47690174e0ec4 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -30,8 +30,12 @@ __test(2) int test_strcspn(void *ctx) { return bpf_strcspn(str, "lo"); } __test(6) int test_strstr_found(void *ctx) { return bpf_strstr(str, "world"); } __test(-ENOENT) int test_strstr_notfound(void *ctx) { return bpf_strstr(str, "hi"); } __test(0) int test_strstr_empty(void *ctx) { return bpf_strstr(str, ""); } -__test(0) int test_strnstr_found(void *ctx) { return bpf_strnstr(str, "hello", 6); } -__test(-ENOENT) int test_strnstr_notfound(void *ctx) { return bpf_strnstr(str, "hi", 10); } +__test(0) int test_strnstr_found1(void *ctx) { return bpf_strnstr("", "", 0); } +__test(0) int test_strnstr_found2(void *ctx) { return bpf_strnstr(str, "hello", 5); } +__test(0) int test_strnstr_found3(void *ctx) { return bpf_strnstr(str, "hello", 6); } +__test(-ENOENT) int test_strnstr_notfound1(void *ctx) { return bpf_strnstr(str, "hi", 10); } +__test(-ENOENT) int test_strnstr_notfound2(void *ctx) { return bpf_strnstr(str, "hello", 4); } +__test(-ENOENT) int test_strnstr_notfound3(void *ctx) { return bpf_strnstr("", "a", 0); } __test(0) int test_strnstr_empty(void *ctx) { return bpf_strnstr(str, "", 1); } char _license[] SEC("license") = "GPL"; From 6c6f5c19e67c89e974ef0dd8601804eaa4ae868d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 9 Sep 2025 10:16:38 -0700 Subject: [PATCH 1322/3206] bpf: Update the list of BPF selftests maintainers Unfortunately Mykola won't participate in BPF selftests maintenance anymore. Remove the entry on his behalf. 
Acked-by: Mykola Lysenko
Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250909171638.2417272-1-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index fe168477caa457..6056ad6f1afa86 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4682,7 +4682,6 @@ F: security/bpf/
 BPF [SELFTESTS] (Test Runners & Infrastructure)
 M: Andrii Nakryiko
 M: Eduard Zingerman
-R: Mykola Lysenko
 L: bpf@vger.kernel.org
 S: Maintained
 F: tools/testing/selftests/bpf/

From 30f241fcf52aaaef7ac16e66530faa11be78a865 Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski
Date: Thu, 4 Sep 2025 21:49:07 +0200
Subject: [PATCH 1323/3206] xsk: Fix immature cq descriptor production

Eryk reported an issue that I have put under Closes: tag, related to
umem addrs being prematurely produced onto the pool's completion queue.
Let us make the skb's destructor responsible for producing all addrs
that a given skb used.

The commit in the Fixes tag introduced the buggy behavior; it was not
broken from day 1, but rather when xsk multi-buffer support got
introduced.

In order to mitigate the performance impact as much as possible, mimic
the linear and frag parts within the skb by storing the first address
from the XSK descriptor at sk_buff::destructor_arg. For fragments,
store them at ::cb via a list. The nodes that go onto the list are
allocated via kmem_cache. xsk_destruct_skb() consumes the address
stored at ::destructor_arg and optionally walks the list from ::cb if
the count of descriptors associated with this particular skb is bigger
than 1.

The previous approach, where a whole array for storing UMEM addresses
from XSK descriptors was pre-allocated during first fragment
processing, yielded too big a performance regression for 64b traffic.
With the current approach the impact is much reduced in my tests, and
for jumbo frames I observed traffic being slower by at most 9%.

Magnus suggested special-casing this way of processing for
XDP_SHARED_UMEM, so that we would identify it during bind and set
different hooks for the 'backpressure mechanism' on the CQ and for the
skb destructor, but given that the results looked promising on my side
I decided to have a single data path for XSK generic Tx. I suppose
other auxiliary stuff would have to land as well in order to make it
work.
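For illustration, the bookkeeping described above amounts to roughly
the following (a simplified sketch with types and error handling
abbreviated; the diff below is the authoritative version):

  /* Per-skb bookkeeping, kept in skb->cb. */
  struct xsk_addr_node {
  	u64 addr;
  	struct list_head addr_node;
  };

  struct xsk_addr_head {
  	u32 num_descs;
  	struct list_head addrs_list;
  };
  #define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))

  /* First descriptor: stash its addr in shinfo's destructor_arg. */
  skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)desc->addr;

  /* Subsequent fragments: queue their addrs on the cb list. */
  node = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
  node->addr = desc->addr;
  list_add_tail(&node->addr_node, &XSKCB(skb)->addrs_list);

Only the destructor then produces these addrs onto the completion
queue, so no CQ entry becomes visible to userspace before the skb is
actually consumed.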
Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path") Reported-by: Eryk Kubanski Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/ Acked-by: Stanislav Fomichev Signed-off-by: Maciej Fijalkowski Tested-by: Jason Xing Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20250904194907.2342177-1-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 113 ++++++++++++++++++++++++++++++++++++++------ net/xdp/xsk_queue.h | 12 +++++ 2 files changed, 111 insertions(+), 14 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9c3acecc14b1ae..72e34bd2d925c0 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,6 +36,20 @@ #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET 32 +struct xsk_addr_node { + u64 addr; + struct list_head addr_node; +}; + +struct xsk_addr_head { + u32 num_descs; + struct list_head addrs_list; +}; + +static struct kmem_cache *xsk_tx_generic_cache; + +#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb)) + void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) @@ -532,24 +546,43 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) { unsigned long flags; int ret; spin_lock_irqsave(&pool->cq_lock, flags); - ret = xskq_prod_reserve_addr(pool->cq, addr); + ret = xskq_prod_reserve(pool->cq); spin_unlock_irqrestore(&pool->cq_lock, flags); return ret; } -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) +static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, + struct sk_buff *skb) { + struct xsk_addr_node *pos, *tmp; + u32 descs_processed = 0; unsigned long flags; + u32 idx; spin_lock_irqsave(&pool->cq_lock, flags); - xskq_prod_submit_n(pool->cq, n); + idx = xskq_get_prod(pool->cq); + + xskq_prod_write_addr(pool->cq, idx, + (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg); + descs_processed++; + + if (unlikely(XSKCB(skb)->num_descs > 1)) { + list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { + xskq_prod_write_addr(pool->cq, idx + descs_processed, + pos->addr); + descs_processed++; + list_del(&pos->addr_node); + kmem_cache_free(xsk_tx_generic_cache, pos); + } + } + xskq_prod_submit_n(pool->cq, descs_processed); spin_unlock_irqrestore(&pool->cq_lock, flags); } @@ -562,9 +595,14 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) spin_unlock_irqrestore(&pool->cq_lock, flags); } +static void xsk_inc_num_desc(struct sk_buff *skb) +{ + XSKCB(skb)->num_descs++; +} + static u32 xsk_get_num_desc(struct sk_buff *skb) { - return skb ? 
(long)skb_shinfo(skb)->destructor_arg : 0; + return XSKCB(skb)->num_descs; } static void xsk_destruct_skb(struct sk_buff *skb) @@ -576,23 +614,33 @@ static void xsk_destruct_skb(struct sk_buff *skb) *compl->tx_timestamp = ktime_get_tai_fast_ns(); } - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); + xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); sock_wfree(skb); } -static void xsk_set_destructor_arg(struct sk_buff *skb) +static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr) { - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; - - skb_shinfo(skb)->destructor_arg = (void *)num; + BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb)); + INIT_LIST_HEAD(&XSKCB(skb)->addrs_list); + XSKCB(skb)->num_descs = 0; + skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr; } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); + u32 num_descs = xsk_get_num_desc(skb); + struct xsk_addr_node *pos, *tmp; + + if (unlikely(num_descs > 1)) { + list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { + list_del(&pos->addr_node); + kmem_cache_free(xsk_tx_generic_cache, pos); + } + } skb->destructor = sock_wfree; - xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); + xsk_cq_cancel_locked(xs->pool, num_descs); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; @@ -609,6 +657,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; + struct xsk_addr_node *xsk_addr; struct sk_buff *skb = xs->skb; struct page *page; void *buffer; @@ -623,6 +672,19 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, return ERR_PTR(err); skb_reserve(skb, hr); + + xsk_set_destructor_arg(skb, desc->addr); + } else { + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (!xsk_addr) + return ERR_PTR(-ENOMEM); + + /* in case of -EOVERFLOW that could happen below, + * xsk_consume_skb() will release this node as whole skb + * would be dropped, which implies freeing all list elements + */ + xsk_addr->addr = desc->addr; + list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } addr = desc->addr; @@ -694,8 +756,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) goto free_err; + + xsk_set_destructor_arg(skb, desc->addr); } else { int nr_frags = skb_shinfo(skb)->nr_frags; + struct xsk_addr_node *xsk_addr; struct page *page; u8 *vaddr; @@ -710,12 +775,22 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, goto free_err; } + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (!xsk_addr) { + __free_page(page); + err = -ENOMEM; + goto free_err; + } + vaddr = kmap_local_page(page); memcpy(vaddr, buffer, len); kunmap_local(vaddr); skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); + + xsk_addr->addr = desc->addr; + list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } if (first_frag && desc->options & XDP_TX_METADATA) { @@ -759,7 +834,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); - xsk_set_destructor_arg(skb); + xsk_inc_num_desc(skb); return skb; @@ -769,7 +844,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, if (err == -EOVERFLOW) { /* Drop 
the packet */ - xsk_set_destructor_arg(xs->skb); + xsk_inc_num_desc(xs->skb); xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } else { @@ -812,7 +887,7 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr); + err = xsk_cq_reserve_locked(xs->pool); if (err) { err = -EAGAIN; goto out; @@ -1815,8 +1890,18 @@ static int __init xsk_init(void) if (err) goto out_pernet; + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", + sizeof(struct xsk_addr_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!xsk_tx_generic_cache) { + err = -ENOMEM; + goto out_unreg_notif; + } + return 0; +out_unreg_notif: + unregister_netdevice_notifier(&xsk_netdev_notifier); out_pernet: unregister_pernet_subsys(&xsk_net_ops); out_sk: diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 46d87e961ad6d3..f16f390370dc43 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q) /* Functions for producers */ +static inline u32 xskq_get_prod(struct xsk_queue *q) +{ + return READ_ONCE(q->ring->producer); +} + static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); @@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + ring->desc[idx & q->ring_mask] = addr; +} + static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, u32 nb_entries) { From 58b4aa53606f75c7d6e6cd4710da4d4399e8f667 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 29 Aug 2025 21:22:41 +0200 Subject: [PATCH 1324/3206] rust: error: improve `Error::from_errno` documentation This constructor is public since commit 5ed147473458 ("rust: error: make conversion functions public"), and we will refer to it from the documentation of `to_result` in a later commit. Thus improve its documentation, including adding examples. Reviewed-by: Alexandre Courbot Reviewed-by: Benno Lossin Signed-off-by: Miguel Ojeda --- rust/kernel/error.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs index db14da97672262..edd576b7dfe407 100644 --- a/rust/kernel/error.rs +++ b/rust/kernel/error.rs @@ -103,8 +103,23 @@ pub struct Error(NonZeroI32); impl Error { /// Creates an [`Error`] from a kernel error code. /// - /// It is a bug to pass an out-of-range `errno`. `EINVAL` would - /// be returned in such a case. + /// `errno` must be within error code range (i.e. `>= -MAX_ERRNO && < 0`). + /// + /// It is a bug to pass an out-of-range `errno`. [`code::EINVAL`] is returned in such a case. 
+ /// + /// # Examples + /// + /// ``` + /// assert_eq!(Error::from_errno(-1), EPERM); + /// assert_eq!(Error::from_errno(-2), ENOENT); + /// ``` + /// + /// The following calls are considered a bug: + /// + /// ``` + /// assert_eq!(Error::from_errno(0), EINVAL); + /// assert_eq!(Error::from_errno(-1000000), EINVAL); + /// ``` pub fn from_errno(errno: crate::ffi::c_int) -> Error { if let Some(error) = Self::try_from_errno(errno) { error From 099381a08db3539c6aab6616c94d7950d74fcd2d Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 29 Aug 2025 21:22:42 +0200 Subject: [PATCH 1325/3206] rust: error: improve `to_result` documentation Core functions like `to_result` should have good documentation. Thus improve it, including adding an example of how to perform early returns with it. Reviewed-by: Benno Lossin Reviewed-by: Alexandre Courbot Signed-off-by: Miguel Ojeda --- rust/kernel/error.rs | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs index edd576b7dfe407..1c0e0e241daa91 100644 --- a/rust/kernel/error.rs +++ b/rust/kernel/error.rs @@ -392,8 +392,43 @@ impl From for Error { /// [Rust documentation]: https://doc.rust-lang.org/book/ch09-02-recoverable-errors-with-result.html pub type Result = core::result::Result; -/// Converts an integer as returned by a C kernel function to an error if it's negative, and -/// `Ok(())` otherwise. +/// Converts an integer as returned by a C kernel function to a [`Result`]. +/// +/// If the integer is negative, an [`Err`] with an [`Error`] as given by [`Error::from_errno`] is +/// returned. This means the integer must be `>= -MAX_ERRNO`. +/// +/// Otherwise, it returns [`Ok`]. +/// +/// It is a bug to pass an out-of-range negative integer. `Err(EINVAL)` is returned in such a case. +/// +/// # Examples +/// +/// This function may be used to easily perform early returns with the [`?`] operator when working +/// with C APIs within Rust abstractions: +/// +/// ``` +/// # use kernel::error::to_result; +/// # mod bindings { +/// # #![expect(clippy::missing_safety_doc)] +/// # use kernel::prelude::*; +/// # pub(super) unsafe fn f1() -> c_int { 0 } +/// # pub(super) unsafe fn f2() -> c_int { EINVAL.to_errno() } +/// # } +/// fn f() -> Result { +/// // SAFETY: ... +/// to_result(unsafe { bindings::f1() })?; +/// +/// // SAFETY: ... +/// to_result(unsafe { bindings::f2() })?; +/// +/// // ... +/// +/// Ok(()) +/// } +/// # assert_eq!(f(), Err(EINVAL)); +/// ``` +/// +/// [`?`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#the-question-mark-operator pub fn to_result(err: crate::ffi::c_int) -> Result { if err < 0 { Err(Error::from_errno(err)) From 0d80e7f951be1bdd08d328fd87694be0d6e8aaa8 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 9 Sep 2025 18:49:59 +0000 Subject: [PATCH 1326/3206] rqspinlock: Choose trylock fallback for NMI waiters Currently, out of all 3 types of waiters in the rqspinlock slow path (i.e., pending bit waiter, wait queue head waiter, and wait queue non-head waiter), only the pending bit waiter and wait queue head waiters apply deadlock checks and a timeout on their waiting loop. The assumption here was that the wait queue head's forward progress would be sufficient to identify cases where the lock owner or pending bit waiter is stuck, and non-head waiters relying on the head waiter would prove to be sufficient for their own forward progress. 
However, the head waiter itself can be preempted by a non-head waiter
for the same lock (AA) or a different lock (ABBA) in a manner that
impedes its forward progress. In such a case, non-head waiters not
performing deadlock and timeout checks becomes insufficient, and the
system can enter a state of lockup.

This is typically not a concern with non-NMI lock acquisitions, as lock
holders which run in different contexts (IRQ, non-IRQ) use "irqsave"
variants of the lock APIs, which naturally excludes such lock holders
from preempting one another on the same CPU.

It might seem likely that a similar case may occur for rqspinlock when
programs are attached to contention tracepoints (begin, end); however,
these tracepoints either precede the enqueue into the wait queue or
follow it, and therefore cannot be used to preempt a head waiter's
waiting loop. We must still be careful against nested kprobe and fentry
programs that may attach to the middle of the head's waiting loop to
stall forward progress and invoke another rqspinlock acquisition that
proceeds as a non-head waiter. To this end, drop CC_FLAGS_FTRACE from
the rqspinlock.o object file.

For now, this issue is resolved by falling back to a repeated trylock
on the lock word from NMI context, while performing the deadlock checks
to break out early in case forward progress is impossible, and using
the timeout as a final fallback. A more involved fix to terminate the
queue when such a condition occurs will be made as a follow up. A
selftest to stress this aspect of nested NMI/non-NMI locking attempts
will be added in a subsequent patch to the bpf-next tree when this fix
lands and trees are synchronized.

Reported-by: Josef Bacik
Fixes: 164c246571e9 ("rqspinlock: Protect waiters in queue from stalls")
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20250909184959.3509085-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/Makefile     | 1 +
 kernel/bpf/rqspinlock.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 269c04a2466400..f6cf8c2af5f7b3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -62,3 +62,4 @@ CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 5ab354d55d8295..a00561b1d3e515 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -471,7 +471,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 	 * any MCS node. This is not the most elegant solution, but is
 	 * simple enough.
*/ - if (unlikely(idx >= _Q_MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) { lockevent_inc(lock_no_node); RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); while (!queued_spin_trylock(lock)) { From df0cb5cb50bd54d3cd4d0d83417ceec6a66404aa Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 9 Sep 2025 22:46:14 +0800 Subject: [PATCH 1327/3206] bpf: Allow fall back to interpreter for programs with stack size <= 512 OpenWRT users reported regression on ARMv6 devices after updating to latest HEAD, where tcpdump filter: tcpdump "not ether host 3c37121a2b3c and not ether host 184ecbca2a3a \ and not ether host 14130b4d3f47 and not ether host f0f61cf440b7 \ and not ether host a84b4dedf471 and not ether host d022be17e1d7 \ and not ether host 5c497967208b and not ether host 706655784d5b" fails with warning: "Kernel filter failed: No error information" when using config: # CONFIG_BPF_JIT_ALWAYS_ON is not set CONFIG_BPF_JIT_DEFAULT_ON=y The issue arises because commits: 1. "bpf: Fix array bounds error with may_goto" changed default runtime to __bpf_prog_ret0_warn when jit_requested = 1 2. "bpf: Avoid __bpf_prog_ret0_warn when jit fails" returns error when jit_requested = 1 but jit fails This change restores interpreter fallback capability for BPF programs with stack size <= 512 bytes when jit fails. Reported-by: Felix Fietkau Closes: https://lore.kernel.org/bpf/2e267b4b-0540-45d8-9310-e127bf95fc63@nbd.name/ Fixes: 6ebc5030e0c5 ("bpf: Fix array bounds error with may_goto") Signed-off-by: KaFai Wan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250909144614.2991253-1-kafai.wan@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f8ac77d08ca72b..e4568d44e82790 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2366,8 +2366,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, or interpreter is being used when - * prog->jit_requested is not 0, so warn about it! + * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; @@ -2468,8 +2467,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return ret; } -static void bpf_prog_select_func(struct bpf_prog *fp) +static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { + bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; @@ -2478,15 +2478,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ - if (!fp->jit_requested && - !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; + select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif + return select_interpreter; } /** @@ -2505,7 +2506,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. 
*/ - bool jit_needed = fp->jit_requested; + bool jit_needed = false; if (fp->bpf_func) goto finalize; @@ -2514,7 +2515,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) bpf_prog_has_kfunc_call(fp)) jit_needed = true; - bpf_prog_select_func(fp); + if (!bpf_prog_select_interpreter(fp)) + jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during From 6d78b4473cdb08b74662355a9e8510bde09c511e Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 9 Sep 2025 09:52:20 +0000 Subject: [PATCH 1328/3206] bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init() Currently, calling bpf_map_kmalloc_node() from __bpf_async_init() can cause various locking issues; see the following stack trace (edited for style) as one example: ... [10.011566] do_raw_spin_lock.cold [10.011570] try_to_wake_up (5) double-acquiring the same [10.011575] kick_pool rq_lock, causing a hardlockup [10.011579] __queue_work [10.011582] queue_work_on [10.011585] kernfs_notify [10.011589] cgroup_file_notify [10.011593] try_charge_memcg (4) memcg accounting raises an [10.011597] obj_cgroup_charge_pages MEMCG_MAX event [10.011599] obj_cgroup_charge_account [10.011600] __memcg_slab_post_alloc_hook [10.011603] __kmalloc_node_noprof ... [10.011611] bpf_map_kmalloc_node [10.011612] __bpf_async_init [10.011615] bpf_timer_init (3) BPF calls bpf_timer_init() [10.011617] bpf_prog_xxxxxxxxxxxxxxxx_fcg_runnable [10.011619] bpf__sched_ext_ops_runnable [10.011620] enqueue_task_scx (2) BPF runs with rq_lock held [10.011622] enqueue_task [10.011626] ttwu_do_activate [10.011629] sched_ttwu_pending (1) grabs rq_lock ... The above was reproduced on bpf-next (b338cf849ec8) by modifying ./tools/sched_ext/scx_flatcg.bpf.c to call bpf_timer_init() during ops.runnable(), and hacking the memcg accounting code a bit to make a bpf_timer_init() call more likely to raise an MEMCG_MAX event. We have also run into other similar variants (both internally and on bpf-next), including double-acquiring cgroup_file_kn_lock, the same worker_pool::lock, etc. As suggested by Shakeel, fix this by using __GFP_HIGH instead of GFP_ATOMIC in __bpf_async_init(), so that e.g. if try_charge_memcg() raises an MEMCG_MAX event, we call __memcg_memory_event() with @allow_spinning=false and avoid calling cgroup_file_notify() there. Depends on mm patch "memcg: skip cgroup_file_notify if spinning is not allowed": https://lore.kernel.org/bpf/20250905201606.66198-1-shakeel.butt@linux.dev/ v0 approach s/bpf_map_kmalloc_node/bpf_mem_alloc/ https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/ v1 approach: https://lore.kernel.org/bpf/20250905234547.862249-1-yepeilin@google.com/ Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Suggested-by: Shakeel Butt Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/20250909095222.2121438-1-yepeilin@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b9b0c5fe33f61a..8af62cb243d9ec 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1274,8 +1274,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* allocate hrtimer via map_kmalloc to use memcg accounting */ - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + /* Allocate via bpf_map_kmalloc_node() for memcg accounting. 
Until + * kmalloc_nolock() is available, avoid locking issues by using + * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). + */ + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; From 90f7c100d2dd99d5cd5be950d553edd2647e6cc8 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sat, 6 Sep 2025 21:19:29 -0300 Subject: [PATCH 1329/3206] smb: client: fix compound alignment with encryption The encryption layer can't handle the padding iovs, so flatten the compound request into a single buffer with required padding to prevent the server from dropping the connection when finding unaligned compound requests. Fixes: bc925c1216f0 ("smb: client: improve compound padding in encryption") Signed-off-by: Paulo Alcantara (Red Hat) Reviewed-by: David Howells Cc: linux-cifs@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 94b1d7a395d50a..9f9955d274cf65 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2640,13 +2640,35 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } /* SMB headers in a compound are 8 byte aligned. */ - if (!IS_ALIGNED(len, 8)) { - num_padding = 8 - (len & 7); + if (IS_ALIGNED(len, 8)) + goto out; + + num_padding = 8 - (len & 7); + if (smb3_encryption_required(tcon)) { + int i; + + /* + * Flatten request into a single buffer with required padding as + * the encryption layer can't handle the padding iovs. + */ + for (i = 1; i < rqst->rq_nvec; i++) { + memcpy(rqst->rq_iov[0].iov_base + + rqst->rq_iov[0].iov_len, + rqst->rq_iov[i].iov_base, + rqst->rq_iov[i].iov_len); + rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len; + } + memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len, + 0, num_padding); + rqst->rq_iov[0].iov_len += num_padding; + rqst->rq_nvec = 1; + } else { rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding; rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding; rqst->rq_nvec++; - len += num_padding; } + len += num_padding; +out: shdr->NextCommand = cpu_to_le32(len); } From e0d1c55501d377163eb57feed863777ed1c973ad Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sun, 7 Sep 2025 21:44:01 +0100 Subject: [PATCH 1330/3206] net: phy: fix phy_uses_state_machine() The blamed commit changed the conditions which phylib uses to stop and start the state machine in the suspend and resume paths, and while improving it, has caused two issues. The original code used this test: phydev->attached_dev && phydev->adjust_link and if true, the paths would handle the PHY state machine. This test evaluates true for normal drivers that are using phylib directly while the PHY is attached to the network device, but false in all other cases, which include the following cases: - when the PHY has never been attached to a network device. - when the PHY has been detached from a network device (as phy_detach() sets phydev->attached_dev to NULL, phy_disconnect() calls phy_detach() and additionally sets phydev->adjust_link NULL.) - when phylink is using the driver (as phydev->adjust_link is NULL.) Only the third case was incorrect, and the blamed commit attempted to fix this by changing this test to (simplified for brevity, see phy_uses_state_machine()): phydev->phy_link_change == phy_link_change ? 
phydev->attached_dev && phydev->adjust_link : true However, this also incorrectly evaluates true in the first two cases. Fix the first case by ensuring that phy_uses_state_machine() returns false when phydev->phy_link_change is NULL. Fix the second case by ensuring that phydev->phy_link_change is set to NULL when phy_detach() is called. Reported-by: Xu Yang Link: https://lore.kernel.org/r/20250806082931.3289134-1-xu.yang_2@nxp.com Fixes: fc75ea20ffb4 ("net: phy: allow MDIO bus PM ops to start/stop state machine for phylink-controlled PHY") Signed-off-by: Russell King (Oracle) Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1uvMEz-00000003Aoe-3qWe@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 7556aa3dd7eeba..c82c1997147bfe 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -287,8 +287,7 @@ static bool phy_uses_state_machine(struct phy_device *phydev) if (phydev->phy_link_change == phy_link_change) return phydev->attached_dev && phydev->adjust_link; - /* phydev->phy_link_change is implicitly phylink_phy_change() */ - return true; + return !!phydev->phy_link_change; } static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) @@ -1864,6 +1863,8 @@ void phy_detach(struct phy_device *phydev) phydev->attached_dev = NULL; phy_link_topo_del_phy(dev, phydev); } + + phydev->phy_link_change = NULL; phydev->phylink = NULL; if (!phydev->is_on_sfp_module) From c5ea3065586d790ea5193a679b85585173d59866 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 7 Sep 2025 21:24:06 -0300 Subject: [PATCH 1331/3206] smb: client: fix data loss due to broken rename(2) Rename of open files in SMB2+ has been broken for a very long time, resulting in data loss as the CIFS client would fail the rename(2) call with -ENOENT and then removing the target file. Fix this by implementing ->rename_pending_delete() for SMB2+, which will rename busy files to random filenames (e.g. silly rename) during unlink(2) or rename(2), and then marking them to delete-on-close. Besides, introduce a FIND_WR_NO_PENDING_DELETE flag to prevent open(2) from reusing open handles that had been marked as delete pending. Handle it in cifs_get_readable_path() as well. Reported-by: Jean-Baptiste Denis Closes: https://marc.info/?i=16aeb380-30d4-4551-9134-4e7d1dc833c0@pasteur.fr Reviewed-by: David Howells Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Cc: Frank Sorenson Cc: Olga Kornievskaia Cc: Benjamin Coddington Cc: Scott Mayhew Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 13 ++- fs/smb/client/file.c | 18 +++- fs/smb/client/inode.c | 86 ++++++++++++---- fs/smb/client/smb2glob.h | 3 +- fs/smb/client/smb2inode.c | 204 ++++++++++++++++++++++++++++++-------- fs/smb/client/smb2ops.c | 4 + fs/smb/client/smb2proto.h | 3 + fs/smb/client/trace.h | 9 +- 8 files changed, 268 insertions(+), 72 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 1e64a4fb6af037..0fae95cf81c434 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -87,7 +87,7 @@ #define SMB_INTERFACE_POLL_INTERVAL 600 /* maximum number of PDUs in one compound */ -#define MAX_COMPOUND 7 +#define MAX_COMPOUND 10 /* * Default number of credits to keep available for SMB3. 
@@ -1882,9 +1882,12 @@ static inline bool is_replayable_error(int error) /* cifs_get_writable_file() flags */ -#define FIND_WR_ANY 0 -#define FIND_WR_FSUID_ONLY 1 -#define FIND_WR_WITH_DELETE 2 +enum cifs_writable_file_flags { + FIND_WR_ANY = 0U, + FIND_WR_FSUID_ONLY = (1U << 0), + FIND_WR_WITH_DELETE = (1U << 1), + FIND_WR_NO_PENDING_DELETE = (1U << 2), +}; #define MID_FREE 0 #define MID_REQUEST_ALLOCATED 1 @@ -2343,6 +2346,8 @@ struct smb2_compound_vars { struct kvec qi_iov; struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec unlink_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec rename_iov[SMB2_SET_INFO_IOV_SIZE]; struct kvec close_iov; struct smb2_file_rename_info_hdr rename_info; struct smb2_file_link_info_hdr link_info; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 186e061068be3a..cb907e18cc3589 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -998,7 +998,10 @@ int cifs_open(struct inode *inode, struct file *file) /* Get the cached handle as SMB2 close is deferred */ if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { - rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile); + rc = cifs_get_writable_path(tcon, full_path, + FIND_WR_FSUID_ONLY | + FIND_WR_NO_PENDING_DELETE, + &cfile); } else { rc = cifs_get_readable_path(tcon, full_path, &cfile); } @@ -2530,6 +2533,9 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, continue; if (with_delete && !(open_file->fid.access & DELETE)) continue; + if ((flags & FIND_WR_NO_PENDING_DELETE) && + open_file->status_file_deleted) + continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ @@ -2647,6 +2653,16 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, spin_unlock(&tcon->open_file_lock); free_dentry_path(page); *ret_file = find_readable_file(cinode, 0); + if (*ret_file) { + spin_lock(&cinode->open_file_lock); + if ((*ret_file)->status_file_deleted) { + spin_unlock(&cinode->open_file_lock); + cifsFileInfo_put(*ret_file); + *ret_file = NULL; + } else { + spin_unlock(&cinode->open_file_lock); + } + } return *ret_file ? 0 : -ENOENT; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index fe453a4b3dc831..11d442e8b3d622 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1931,7 +1931,7 @@ cifs_drop_nlink(struct inode *inode) * but will return the EACCES to the caller. Note that the VFS does not call * unlink on negative dentries currently. 
*/ -int cifs_unlink(struct inode *dir, struct dentry *dentry) +static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyrename) { int rc = 0; unsigned int xid; @@ -2003,7 +2003,11 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto psx_del_no_retry; } - rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); + if (sillyrename || (server->vals->protocol_id > SMB10_PROT_ID && + d_is_positive(dentry) && d_count(dentry) > 2)) + rc = -EBUSY; + else + rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); psx_del_no_retry: if (!rc) { @@ -2071,6 +2075,11 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) return rc; } +int cifs_unlink(struct inode *dir, struct dentry *dentry) +{ + return __cifs_unlink(dir, dentry, false); +} + static int cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, const char *full_path, struct cifs_sb_info *cifs_sb, @@ -2358,14 +2367,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb); cifs_put_tlink(tlink); + cifsInode = CIFS_I(d_inode(direntry)); + if (!rc) { + set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags); spin_lock(&d_inode(direntry)->i_lock); i_size_write(d_inode(direntry), 0); clear_nlink(d_inode(direntry)); spin_unlock(&d_inode(direntry)->i_lock); } - cifsInode = CIFS_I(d_inode(direntry)); /* force revalidate to go get info when needed */ cifsInode->time = 0; @@ -2458,8 +2469,11 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ do_rename_exit: - if (rc == 0) + if (rc == 0) { d_move(from_dentry, to_dentry); + /* Force a new lookup */ + d_drop(from_dentry); + } cifs_put_tlink(tlink); return rc; } @@ -2470,6 +2484,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, struct dentry *target_dentry, unsigned int flags) { const char *from_name, *to_name; + struct TCP_Server_Info *server; void *page1, *page2; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; @@ -2505,6 +2520,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; page1 = alloc_dentry_path(); page2 = alloc_dentry_path(); @@ -2591,19 +2607,53 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, unlink_target: #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - - /* Try unlinking the target dentry if it's not negative */ - if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) { - if (d_is_dir(target_dentry)) - tmprc = cifs_rmdir(target_dir, target_dentry); - else - tmprc = cifs_unlink(target_dir, target_dentry); - if (tmprc) - goto cifs_rename_exit; - rc = cifs_do_rename(xid, source_dentry, from_name, - target_dentry, to_name); - if (!rc) - rehash = false; + if (d_really_is_positive(target_dentry)) { + if (!rc) { + struct inode *inode = d_inode(target_dentry); + /* + * Samba and ksmbd servers allow renaming a target + * directory that is open, so make sure to update + * ->i_nlink and then mark it as delete pending. 
+ */ + if (S_ISDIR(inode->i_mode)) { + drop_cached_dir_by_name(xid, tcon, to_name, cifs_sb); + spin_lock(&inode->i_lock); + i_size_write(inode, 0); + clear_nlink(inode); + spin_unlock(&inode->i_lock); + set_bit(CIFS_INO_DELETE_PENDING, &CIFS_I(inode)->flags); + CIFS_I(inode)->time = 0; /* force reval */ + inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + } + } else if (rc == -EACCES || rc == -EEXIST) { + /* + * Rename failed, possibly due to a busy target. + * Retry it by unliking the target first. + */ + if (d_is_dir(target_dentry)) { + tmprc = cifs_rmdir(target_dir, target_dentry); + } else { + tmprc = __cifs_unlink(target_dir, target_dentry, + server->vals->protocol_id > SMB10_PROT_ID); + } + if (tmprc) { + /* + * Some servers will return STATUS_ACCESS_DENIED + * or STATUS_DIRECTORY_NOT_EMPTY when failing to + * rename a non-empty directory. Make sure to + * propagate the appropriate error back to + * userspace. + */ + if (tmprc == -EEXIST || tmprc == -ENOTEMPTY) + rc = tmprc; + goto cifs_rename_exit; + } + rc = cifs_do_rename(xid, source_dentry, from_name, + target_dentry, to_name); + if (!rc) + rehash = false; + } } /* force revalidate to go get info when needed */ @@ -2629,6 +2679,8 @@ cifs_dentry_needs_reval(struct dentry *dentry) struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct cached_fid *cfid = NULL; + if (test_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags)) + return false; if (cifs_i->time == 0) return true; diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h index 224495322a05da..e56e4d402f1382 100644 --- a/fs/smb/client/smb2glob.h +++ b/fs/smb/client/smb2glob.h @@ -30,10 +30,9 @@ enum smb2_compound_ops { SMB2_OP_QUERY_DIR, SMB2_OP_MKDIR, SMB2_OP_RENAME, - SMB2_OP_DELETE, SMB2_OP_HARDLINK, SMB2_OP_SET_EOF, - SMB2_OP_RMDIR, + SMB2_OP_UNLINK, SMB2_OP_POSIX_QUERY_INFO, SMB2_OP_SET_REPARSE, SMB2_OP_GET_REPARSE, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 31c13fb5b85b62..7cadc8ca4f55b7 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -346,9 +346,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_posix_query_info_compound_enter(xid, tcon->tid, ses->Suid, full_path); break; - case SMB2_OP_DELETE: - trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path); - break; case SMB2_OP_MKDIR: /* * Directories are created through parameters in the @@ -356,23 +353,40 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, */ trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path); break; - case SMB2_OP_RMDIR: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; + case SMB2_OP_UNLINK: + rqst[num_rqst].rq_iov = vars->unlink_iov; rqst[num_rqst].rq_nvec = 1; size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ data[0] = &delete_pending[0]; - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_DISPOSITION_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (rc) + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); 
+ smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); - trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path); + } + num_rqst++; + trace_smb3_unlink_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_SET_EOF: rqst[num_rqst].rq_iov = &vars->si_iov[0]; @@ -442,7 +456,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, ses->Suid, full_path); break; case SMB2_OP_RENAME: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_iov = vars->rename_iov; rqst[num_rqst].rq_nvec = 2; len = in_iov[i].iov_len; @@ -732,19 +746,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_posix_query_info_compound_done(xid, tcon->tid, ses->Suid); break; - case SMB2_OP_DELETE: - if (rc) - trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc); - else { - /* - * If dentry (hence, inode) is NULL, lease break is going to - * take care of degrading leases on handles for deleted files. - */ - if (inode) - cifs_mark_open_handles_for_deleted_file(inode, full_path); - trace_smb3_delete_done(xid, tcon->tid, ses->Suid); - } - break; case SMB2_OP_MKDIR: if (rc) trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc); @@ -765,11 +766,11 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_rename_done(xid, tcon->tid, ses->Suid); SMB2_set_info_free(&rqst[num_rqst++]); break; - case SMB2_OP_RMDIR: - if (rc) - trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc); + case SMB2_OP_UNLINK: + if (!rc) + trace_smb3_unlink_done(xid, tcon->tid, ses->Suid); else - trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid); + trace_smb3_unlink_err(xid, tcon->tid, ses->Suid, rc); SMB2_set_info_free(&rqst[num_rqst++]); break; case SMB2_OP_SET_EOF: @@ -1166,7 +1167,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE); return smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, NULL, - &(int){SMB2_OP_RMDIR}, 1, + &(int){SMB2_OP_UNLINK}, 1, NULL, NULL, NULL, NULL); } @@ -1175,20 +1176,29 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb, struct dentry *dentry) { struct cifs_open_parms oparms; + struct inode *inode = NULL; + int rc; - oparms = CIFS_OPARMS(cifs_sb, tcon, name, - DELETE, FILE_OPEN, - CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE); - int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, dentry); + if (dentry) + inode = d_inode(dentry); + + oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE, + FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE); + rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, + NULL, &(int){SMB2_OP_UNLINK}, + 1, NULL, NULL, NULL, dentry); if (rc == -EINVAL) { cifs_dbg(FYI, "invalid lease key, resending request without lease"); rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, NULL); + NULL, &(int){SMB2_OP_UNLINK}, + 1, NULL, NULL, NULL, NULL); } + /* + * If dentry (hence, inode) is NULL, lease break is going to + * take care of degrading leases on handles for deleted files. 
+ */ + if (!rc && inode) + cifs_mark_open_handles_for_deleted_file(inode, name); return rc; } @@ -1441,3 +1451,113 @@ int smb2_query_reparse_point(const unsigned int xid, cifs_free_open_info(&data); return rc; } + +static inline __le16 *utf16_smb2_path(struct cifs_sb_info *cifs_sb, + const char *name, size_t namelen) +{ + int len; + + if (*name == '\\' || + (cifs_sb_master_tlink(cifs_sb) && + cifs_sb_master_tcon(cifs_sb)->posix_extensions && *name == '/')) + name++; + return cifs_strndup_to_utf16(name, namelen, &len, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); +} + +int smb2_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(d_inode(dentry)->i_sb); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry)); + __le16 *utf16_path __free(kfree) = NULL; + __u32 co = file_create_options(dentry); + int cmds[] = { + SMB2_OP_SET_INFO, + SMB2_OP_RENAME, + SMB2_OP_UNLINK, + }; + const int num_cmds = ARRAY_SIZE(cmds); + char *to_name __free(kfree) = NULL; + __u32 attrs = cinode->cifsAttrs; + struct cifs_open_parms oparms; + static atomic_t sillycounter; + struct cifsFileInfo *cfile; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct kvec iov[2]; + const char *ppath; + void *page; + size_t len; + int rc; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + page = alloc_dentry_path(); + + ppath = build_path_from_dentry(dentry->d_parent, page); + if (IS_ERR(ppath)) { + rc = PTR_ERR(ppath); + goto out; + } + + len = strlen(ppath) + strlen("/.__smb1234") + 1; + to_name = kmalloc(len, GFP_KERNEL); + if (!to_name) { + rc = -ENOMEM; + goto out; + } + + scnprintf(to_name, len, "%s%c.__smb%04X", ppath, CIFS_DIR_SEP(cifs_sb), + atomic_inc_return(&sillycounter) & 0xffff); + + utf16_path = utf16_smb2_path(cifs_sb, to_name, len); + if (!utf16_path) { + rc = -ENOMEM; + goto out; + } + + drop_cached_dir_by_name(xid, tcon, full_path, cifs_sb); + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, + DELETE | FILE_WRITE_ATTRIBUTES, + FILE_OPEN, co, ACL_NO_MODE); + + attrs &= ~ATTR_READONLY; + if (!attrs) + attrs = ATTR_NORMAL; + if (d_inode(dentry)->i_nlink <= 1) + attrs |= ATTR_HIDDEN; + iov[0].iov_base = &(FILE_BASIC_INFO) { + .Attributes = cpu_to_le32(attrs), + }; + iov[0].iov_len = sizeof(FILE_BASIC_INFO); + iov[1].iov_base = utf16_path; + iov[1].iov_len = sizeof(*utf16_path) * UniStrlen((wchar_t *)utf16_path); + + cifs_get_writable_path(tcon, full_path, FIND_WR_WITH_DELETE, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov, + cmds, num_cmds, cfile, NULL, NULL, dentry); + if (rc == -EINVAL) { + cifs_dbg(FYI, "invalid lease key, resending request without lease\n"); + cifs_get_writable_path(tcon, full_path, + FIND_WR_WITH_DELETE, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov, + cmds, num_cmds, cfile, NULL, NULL, NULL); + } + if (!rc) { + set_bit(CIFS_INO_DELETE_PENDING, &cinode->flags); + } else { + cifs_tcon_dbg(FYI, "%s: failed to rename '%s' to '%s': %d\n", + __func__, full_path, to_name, rc); + rc = -EIO; + } +out: + cifs_put_tlink(tlink); + free_dentry_path(page); + return rc; +} diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 9f9955d274cf65..e586f3f4b5c937 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -5398,6 +5398,7 @@ struct smb_version_operations smb20_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, 
.is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ @@ -5503,6 +5504,7 @@ struct smb_version_operations smb21_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; struct smb_version_operations smb30_operations = { @@ -5619,6 +5621,7 @@ struct smb_version_operations smb30_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; struct smb_version_operations smb311_operations = { @@ -5735,6 +5738,7 @@ struct smb_version_operations smb311_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 6e805ece6a7b19..b3f1398c9f7906 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -317,5 +317,8 @@ int posix_info_sid_size(const void *beg, const void *end); int smb2_make_nfs_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); +int smb2_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid); #endif /* _SMB2PROTO_H */ diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index fe0e075bc63c3c..fd650e2afc7629 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -669,13 +669,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); -DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(unlink_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter); -DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter); @@ -710,13 +709,12 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); -DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(unlink_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done); -DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done); @@ -756,14 
+754,13 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); -DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(unlink_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); -DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err); From 686cab5a18e443e1d5f2abb17bed45837836425f Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 7 Sep 2025 11:08:21 +0300 Subject: [PATCH 1332/3206] net: dev_ioctl: take ops lock in hwtstamp lower paths ndo hwtstamp callbacks are expected to run under the per-device ops lock. Make the lower get/set paths consistent with the rest of ndo invocations. Kernel log: WARNING: CPU: 13 PID: 51364 at ./include/net/netdev_lock.h:70 __netdev_update_features+0x4bd/0xe60 ... RIP: 0010:__netdev_update_features+0x4bd/0xe60 ... Call Trace: netdev_update_features+0x1f/0x60 mlx5_hwtstamp_set+0x181/0x290 [mlx5_core] mlx5e_hwtstamp_set+0x19/0x30 [mlx5_core] dev_set_hwtstamp_phylib+0x9f/0x220 dev_set_hwtstamp_phylib+0x9f/0x220 dev_set_hwtstamp+0x13d/0x240 dev_ioctl+0x12f/0x4b0 sock_ioctl+0x171/0x370 __x64_sys_ioctl+0x3f7/0x900 ? __sys_setsockopt+0x69/0xb0 do_syscall_64+0x6f/0x2e0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 ... .... ---[ end trace 0000000000000000 ]--- Note that the mlx5_hwtstamp_set and mlx5e_hwtstamp_set functions shown in the trace come from an in progress patch converting the legacy ioctl to ndo_hwtstamp_get/set and are not present in mainline. 
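For reference, the locked path in the trace above can be driven directly from userspace via the timestamping ioctl. A minimal sketch (illustration only, not part of the patch; "eth0" is a placeholder interface name):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/net_tstamp.h>
#include <linux/sockios.h>

int main(void)
{
        struct hwtstamp_config cfg;
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&cfg, 0, sizeof(cfg));
        cfg.tx_type = HWTSTAMP_TX_ON;
        cfg.rx_filter = HWTSTAMP_FILTER_ALL;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
        ifr.ifr_data = (char *)&cfg;

        /* Ends up in dev_ioctl() -> dev_set_hwtstamp() in the kernel. */
        if (ioctl(fd, SIOCSHWTSTAMP, &ifr) < 0)
                perror("SIOCSHWTSTAMP");

        close(fd);
        return 0;
}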
Fixes: ffb7ed19ac0a ("net: hold netdev instance lock during ioctl operations") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Link: https://patch.msgid.link/20250907080821.2353388-1-cjubran@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/dev_ioctl.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9c0ad7f4b5d810..ad54b12d4b4c80 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -464,8 +464,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_get) - return dev_get_hwtstamp_phylib(dev, kernel_cfg); + if (ops->ndo_hwtstamp_get) { + int err; + + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); @@ -481,8 +488,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_set) - return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + if (ops->ndo_hwtstamp_set) { + int err; + + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); From 0f82c3ba66c6b2e3cde0f255156a753b108ee9dc Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 8 Sep 2025 10:36:14 -0700 Subject: [PATCH 1333/3206] macsec: sync features on RTM_NEWLINK Syzkaller managed to lock the lower device via ETHTOOL_SFEATURES: netdev_lock include/linux/netdevice.h:2761 [inline] netdev_lock_ops include/net/netdev_lock.h:42 [inline] netdev_sync_lower_features net/core/dev.c:10649 [inline] __netdev_update_features+0xcb1/0x1be0 net/core/dev.c:10819 netdev_update_features+0x6d/0xe0 net/core/dev.c:10876 macsec_notify+0x2f5/0x660 drivers/net/macsec.c:4533 notifier_call_chain+0x1b3/0x3e0 kernel/notifier.c:85 call_netdevice_notifiers_extack net/core/dev.c:2267 [inline] call_netdevice_notifiers net/core/dev.c:2281 [inline] netdev_features_change+0x85/0xc0 net/core/dev.c:1570 __dev_ethtool net/ethtool/ioctl.c:3469 [inline] dev_ethtool+0x1536/0x19b0 net/ethtool/ioctl.c:3502 dev_ioctl+0x392/0x1150 net/core/dev_ioctl.c:759 It happens because lower features are out of sync with the upper: __dev_ethtool (real_dev) netdev_lock_ops(real_dev) ETHTOOL_SFEATURES __netdev_features_change netdev_sync_upper_features disable LRO on the lower if (old_features != dev->features) netdev_features_change fires NETDEV_FEAT_CHANGE macsec_notify NETDEV_FEAT_CHANGE netdev_update_features (for each macsec dev) netdev_sync_lower_features if (upper_features != lower_features) netdev_lock_ops(lower) # lower == real_dev stuck ... netdev_unlock_ops(real_dev) Per commit af5f54b0ef9e ("net: Lock lower level devices when updating features"), we elide the lock/unlock when the upper and lower features are synced. Makes sure the lower (real_dev) has proper features after the macsec link has been created. This makes sure we never hit the situation where we need to sync upper flags to the lower. 
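The one-line fix works because of how feature syncing elides the lower lock. A simplified model of that elision (an assumption for illustration; the real logic lives in netdev_sync_lower_features() in net/core/dev.c and differs in detail):

static void sync_lower_features_sketch(struct net_device *lower,
                                       netdev_features_t wanted)
{
        /* Already in sync: no lock is taken, so no deadlock is possible. */
        if (!((lower->features ^ wanted) & NETIF_F_UPPER_DISABLES))
                return;

        /* Deadlocks if "lower" is the real_dev already locked by the
         * ethtool path in the trace above. */
        netdev_lock_ops(lower);
        lower->wanted_features = wanted;
        __netdev_update_features(lower);
        netdev_unlock_ops(lower);
}

Calling netdev_update_features() right after the macsec link is created makes the upper and lower feature sets start out in sync, so the NETDEV_FEAT_CHANGE path never reaches the locking branch for real_dev.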
Reported-by: syzbot+7e0f89fb6cae5d002de0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7e0f89fb6cae5d002de0 Fixes: 7e4d784f5810 ("net: hold netdev instance lock during rtnetlink operations") Signed-off-by: Stanislav Fomichev Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250908173614.3358264-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 01329fe7451a12..0eca96eeed58ab 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4286,6 +4286,7 @@ static int macsec_newlink(struct net_device *dev, if (err < 0) goto del_dev; + netdev_update_features(dev); netif_stacked_transfer_operstate(real_dev, dev); linkwatch_fire_event(dev); From 648de37416b301f046f62f1b65715c7fa8ebaa67 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Mon, 8 Sep 2025 11:16:01 -0700 Subject: [PATCH 1334/3206] mptcp: sockopt: make sync_socket_options propagate SOCK_KEEPOPEN Users reported a scenario where MPTCP connections that were configured with SO_KEEPALIVE prior to connect would fail to enable their keepalives if MPTCP fell back to TCP mode. Investigation showed that this affects keepalives for any connection where sync_socket_options is called on a socket that is in the closed or listening state. Joins are handled properly. For connects, sync_socket_options is called when the socket is still in the closed state. The tcp_set_keepalive() function does not act on sockets that are closed or listening, hence keepalive is not immediately enabled. Since the SO_KEEPOPEN flag is absent, it is not enabled later in the connect sequence via tcp_finish_connect. Setting the keepalive via sockopt after connect does work, but would not address any subsequently created flows. Fortunately, the fix here is straightforward: set SOCK_KEEPOPEN on the subflow when calling sync_socket_options. The fix was validated both by using tcpdump to observe keepalive packets not being sent before the fix, and being sent after the fix. It was also possible to observe via ss that the keepalive timer was not enabled on these sockets before the fix, but was enabled afterwards.
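The affected usage pattern as a minimal sketch (illustration only; the address and port are placeholders):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

int main(void)
{
        int one = 1;
        struct sockaddr_in sa = { .sin_family = AF_INET };
        int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

        sa.sin_port = htons(8080);
        inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);

        /* Set while the socket is still closed: the case fixed here. */
        setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));

        if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                perror("connect");

        close(fd);
        return 0;
}

Before the fix, a fallback to plain TCP after this connect() would leave the keepalive timer disarmed even though SO_KEEPALIVE was requested.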
Fixes: 1b3e7ede1365 ("mptcp: setsockopt: handle SO_KEEPALIVE and SO_PRIORITY") Cc: stable@vger.kernel.org Signed-off-by: Krister Johansen Reviewed-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/aL8dYfPZrwedCIh9@templeofstupid.com Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2c267aff95bec9..2abe6f1e994004 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1532,13 +1532,12 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; + bool keep_open; - if (ssk->sk_prot->keepalive) { - if (sock_flag(sk, SOCK_KEEPOPEN)) - ssk->sk_prot->keepalive(ssk, 1); - else - ssk->sk_prot->keepalive(ssk, 0); - } + keep_open = sock_flag(sk, SOCK_KEEPOPEN); + if (ssk->sk_prot->keepalive) + ssk->sk_prot->keepalive(ssk, keep_open); + sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open); ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; From 7094b84863e5832cb1cd9c4b9d648904775b6bd9 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 8 Sep 2025 23:27:27 +0200 Subject: [PATCH 1335/3206] netlink: specs: mptcp: fix if-idx attribute type This attribute is used as a signed number in the code in pm_netlink.c: nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) The specs should then reflect that. Note that other 'if-idx' attributes from the same .yaml file use a signed number as well. Fixes: bc8aeb2045e2 ("Documentation: netlink: add a YAML spec for mptcp") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250908-net-mptcp-misc-fixes-6-17-rc5-v1-1-5f2168a66079@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index 02f1ddcfbf1cfd..d15335684ec3d6 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -256,7 +256,7 @@ attribute-sets: type: u32 - name: if-idx - type: u32 + type: s32 - name: reset-reason type: u32 From 6f021e95d0828edc8ed104a294594c2f9569383a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 8 Sep 2025 23:27:28 +0200 Subject: [PATCH 1336/3206] doc: mptcp: net.mptcp.pm_type is deprecated The net.mptcp.pm_type sysctl knob has been deprecated in v6.15, net.mptcp.path_manager should be used instead. Adapt the section about path managers to suggest using the new sysctl knob instead of the deprecated one. Fixes: 595c26d122d1 ("mptcp: sysctl: set path manager by name") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250908-net-mptcp-misc-fixes-6-17-rc5-v1-2-5f2168a66079@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/mptcp.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/mptcp.rst b/Documentation/networking/mptcp.rst index 17f2bab6116447..2e31038d646205 100644 --- a/Documentation/networking/mptcp.rst +++ b/Documentation/networking/mptcp.rst @@ -60,10 +60,10 @@ address announcements. 
Typically, it is the client side that initiates subflows, and the server side that announces additional addresses via the ``ADD_ADDR`` and ``REMOVE_ADDR`` options. -Path managers are controlled by the ``net.mptcp.pm_type`` sysctl knob -- see -mptcp-sysctl.rst. There are two types: the in-kernel one (type ``0``) where the -same rules are applied for all the connections (see: ``ip mptcp``) ; and the -userspace one (type ``1``), controlled by a userspace daemon (i.e. `mptcpd +Path managers are controlled by the ``net.mptcp.path_manager`` sysctl knob -- +see mptcp-sysctl.rst. There are two types: the in-kernel one (``kernel``) where +the same rules are applied for all the connections (see: ``ip mptcp``) ; and the +userspace one (``userspace``), controlled by a userspace daemon (i.e. `mptcpd `_) where different rules can be applied for each connection. The path managers can be controlled via a Netlink API; see netlink_spec/mptcp_pm.rst. From ef1bd93b3b924086088b7818d9e5d89ede944f1f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 8 Sep 2025 23:27:29 +0200 Subject: [PATCH 1337/3206] selftests: mptcp: shellcheck: support v0.11.0 This v0.11.0 version introduces SC2329: Warn when (non-escaping) functions are never invoked. However, similar to SC2317, ShellCheck is currently unable to figure out functions that are invoked via trap, or indirectly, when calling functions via variables. It is therefore necessary to disable this new SC2329. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250908-net-mptcp-misc-fixes-6-17-rc5-v1-3-5f2168a66079@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/diag.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 2 +- tools/testing/selftests/net/mptcp/pm_netlink.sh | 5 +++-- tools/testing/selftests/net/mptcp/simult_flows.sh | 2 +- tools/testing/selftests/net/mptcp/userspace_pm.sh | 2 +- 7 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 7a3cb4c09e450f..d847ff1737c30c 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -28,7 +28,7 @@ flush_pids() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGKILL &>/dev/null diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 5e3c56253274a1..c2ab9f7f0d2133 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -134,7 +134,7 @@ ns4="" TEST_GROUP="" # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "$cin_disconnect" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 82cae37d9c2026..7fd555b123b900 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -8,7 +8,7 @@ # ShellCheck incorrectly believes that most of the code here is unreachable # because it's invoked by variable name, see how the "tests" array is used -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 .
"$(dirname "${0}")/mptcp_lib.sh" diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 418a903c3a4d39..f01989be6e9b3d 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -95,7 +95,7 @@ init() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { mptcp_lib_ns_exit "${ns1}" "${ns2}" "${ns_sbox}" diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index ac7ec6f9402376..ec6a8758819194 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -32,7 +32,7 @@ ns1="" err=$(mktemp) # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "${err}" @@ -70,8 +70,9 @@ format_endpoints() { mptcp_lib_pm_nl_format_endpoints "${@}" } +# This function is invoked indirectly +#shellcheck disable=SC2317,SC2329 get_endpoint() { - # shellcheck disable=SC2317 # invoked indirectly mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}" } diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 2329c2f8519b7c..1903e8e84a3151 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -35,7 +35,7 @@ usage() { } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "$cout" "$sout" diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 333064b0b5ac03..970c329735ff14 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -94,7 +94,7 @@ test_fail() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { print_title "Cleanup" From 5cb782ff3c62c837e4984b6ae9f5d9a423cd5088 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Sun, 7 Sep 2025 12:40:16 -0700 Subject: [PATCH 1338/3206] scsi: ufs: mcq: Fix memory allocation checks for SQE and CQE Previous checks incorrectly tested the DMA addresses (dma_handle) for NULL. Since dma_alloc_coherent() returns the CPU (virtual) address, the NULL check should be performed on the *_base_addr pointer to correctly detect allocation failures. Update the checks to validate sqe_base_addr and cqe_base_addr instead of sqe_dma_addr and cqe_dma_addr. Fixes: 4682abfae2eb ("scsi: ufs: core: mcq: Allocate memory for MCQ mode") Signed-off-by: Alok Tiwari Reviewed-by: Alim Akhtar Reviewed-by: Manivannan Sadhasivam Reviewed-by: Peter Wang Signed-off-by: Martin K. 
Petersen --- drivers/ufs/core/ufs-mcq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 1e50675772febb..cc88aaa106da30 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -243,7 +243,7 @@ int ufshcd_mcq_memory_alloc(struct ufs_hba *hba) hwq->sqe_base_addr = dmam_alloc_coherent(hba->dev, utrdl_size, &hwq->sqe_dma_addr, GFP_KERNEL); - if (!hwq->sqe_dma_addr) { + if (!hwq->sqe_base_addr) { dev_err(hba->dev, "SQE allocation failed\n"); return -ENOMEM; } @@ -252,7 +252,7 @@ int ufshcd_mcq_memory_alloc(struct ufs_hba *hba) hwq->cqe_base_addr = dmam_alloc_coherent(hba->dev, cqe_size, &hwq->cqe_dma_addr, GFP_KERNEL); - if (!hwq->cqe_dma_addr) { + if (!hwq->cqe_base_addr) { dev_err(hba->dev, "CQE allocation failed\n"); return -ENOMEM; } From 181993bb0d626cf88cc803f4356ce5c5abe86278 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Wed, 10 Sep 2025 13:33:40 +0800 Subject: [PATCH 1339/3206] erofs: fix runtime warning on truncate_folio_batch_exceptionals() Commit 0e2f80afcfa6("fs/dax: ensure all pages are idle prior to filesystem unmount") introduced the WARN_ON_ONCE to capture whether the filesystem has removed all DAX entries or not and applied the fix to xfs and ext4. Apply the missed fix on erofs to fix the runtime warning: [ 5.266254] ------------[ cut here ]------------ [ 5.266274] WARNING: CPU: 6 PID: 3109 at mm/truncate.c:89 truncate_folio_batch_exceptionals+0xff/0x260 [ 5.266294] Modules linked in: [ 5.266999] CPU: 6 UID: 0 PID: 3109 Comm: umount Tainted: G S 6.16.0+ #6 PREEMPT(voluntary) [ 5.267012] Tainted: [S]=CPU_OUT_OF_SPEC [ 5.267017] Hardware name: Dell Inc. OptiPlex 5000/05WXFV, BIOS 1.5.1 08/24/2022 [ 5.267024] RIP: 0010:truncate_folio_batch_exceptionals+0xff/0x260 [ 5.267076] Code: 00 00 41 39 df 7f 11 eb 78 83 c3 01 49 83 c4 08 41 39 df 74 6c 48 63 f3 48 83 fe 1f 0f 83 3c 01 00 00 43 f6 44 26 08 01 74 df <0f> 0b 4a 8b 34 22 4c 89 ef 48 89 55 90 e8 ff 54 1f 00 48 8b 55 90 [ 5.267083] RSP: 0018:ffffc900013f36c8 EFLAGS: 00010202 [ 5.267095] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 5.267101] RDX: ffffc900013f3790 RSI: 0000000000000000 RDI: ffff8882a1407898 [ 5.267108] RBP: ffffc900013f3740 R08: 0000000000000000 R09: 0000000000000000 [ 5.267113] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 5.267119] R13: ffff8882a1407ab8 R14: ffffc900013f3888 R15: 0000000000000001 [ 5.267125] FS: 00007aaa8b437800(0000) GS:ffff88850025b000(0000) knlGS:0000000000000000 [ 5.267132] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 5.267138] CR2: 00007aaa8b3aac10 CR3: 000000024f764000 CR4: 0000000000f52ef0 [ 5.267144] PKRU: 55555554 [ 5.267150] Call Trace: [ 5.267154] [ 5.267181] truncate_inode_pages_range+0x118/0x5e0 [ 5.267193] ? save_trace+0x54/0x390 [ 5.267296] truncate_inode_pages_final+0x43/0x60 [ 5.267309] evict+0x2a4/0x2c0 [ 5.267339] dispose_list+0x39/0x80 [ 5.267352] evict_inodes+0x150/0x1b0 [ 5.267376] generic_shutdown_super+0x41/0x180 [ 5.267390] kill_block_super+0x1b/0x50 [ 5.267402] erofs_kill_sb+0x81/0x90 [erofs] [ 5.267436] deactivate_locked_super+0x32/0xb0 [ 5.267450] deactivate_super+0x46/0x60 [ 5.267460] cleanup_mnt+0xc3/0x170 [ 5.267475] __cleanup_mnt+0x12/0x20 [ 5.267485] task_work_run+0x5d/0xb0 [ 5.267499] exit_to_user_mode_loop+0x144/0x170 [ 5.267512] do_syscall_64+0x2b9/0x7c0 [ 5.267523] ? __lock_acquire+0x665/0x2ce0 [ 5.267535] ? __lock_acquire+0x665/0x2ce0 [ 5.267560] ? lock_acquire+0xcd/0x300 [ 5.267573] ? 
find_held_lock+0x31/0x90 [ 5.267582] ? mntput_no_expire+0x97/0x4e0 [ 5.267606] ? mntput_no_expire+0xa1/0x4e0 [ 5.267625] ? mntput+0x24/0x50 [ 5.267634] ? path_put+0x1e/0x30 [ 5.267647] ? do_faccessat+0x120/0x2f0 [ 5.267677] ? do_syscall_64+0x1a2/0x7c0 [ 5.267686] ? from_kgid_munged+0x17/0x30 [ 5.267703] ? from_kuid_munged+0x13/0x30 [ 5.267711] ? __do_sys_getuid+0x3d/0x50 [ 5.267724] ? do_syscall_64+0x1a2/0x7c0 [ 5.267732] ? irqentry_exit+0x77/0xb0 [ 5.267743] ? clear_bhb_loop+0x30/0x80 [ 5.267752] ? clear_bhb_loop+0x30/0x80 [ 5.267765] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 5.267772] RIP: 0033:0x7aaa8b32a9fb [ 5.267781] Code: c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 f3 0f 1e fa 31 f6 e9 05 00 00 00 0f 1f 44 00 00 f3 0f 1e fa b8 a6 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 e9 83 0d 00 f7 d8 [ 5.267787] RSP: 002b:00007ffd7c4c9468 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 5.267796] RAX: 0000000000000000 RBX: 00005a61592a8b00 RCX: 00007aaa8b32a9fb [ 5.267802] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00005a61592b2080 [ 5.267806] RBP: 00007ffd7c4c9540 R08: 00007aaa8b403b20 R09: 0000000000000020 [ 5.267812] R10: 0000000000000001 R11: 0000000000000246 R12: 00005a61592a8c00 [ 5.267817] R13: 0000000000000000 R14: 00005a61592b2080 R15: 00005a61592a8f10 [ 5.267849] [ 5.267854] irq event stamp: 4721 [ 5.267859] hardirqs last enabled at (4727): [] __up_console_sem+0x90/0xa0 [ 5.267873] hardirqs last disabled at (4732): [] __up_console_sem+0x75/0xa0 [ 5.267884] softirqs last enabled at (3044): [] kernel_fpu_end+0x53/0x70 [ 5.267895] softirqs last disabled at (3042): [] kernel_fpu_begin_mask+0xc4/0x120 [ 5.267905] ---[ end trace 0000000000000000 ]--- Fixes: bde708f1a65d ("fs/dax: always remove DAX page-cache entries when breaking layouts") Signed-off-by: Yuezhang Mo Reviewed-by: Friendy Su Reviewed-by: Daniel Palmer Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 1b529ace4db0ee..db13b40a78e07d 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -1018,10 +1018,22 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static void erofs_evict_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + dax_break_layout_final(inode); +#endif + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); +} + const struct super_operations erofs_sops = { .put_super = erofs_put_super, .alloc_inode = erofs_alloc_inode, .free_inode = erofs_free_inode, + .evict_inode = erofs_evict_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, }; From 8cc71fc3b82b51e155fbe20876b1aa17a315ac4c Mon Sep 17 00:00:00 2001 From: Nithyanantham Paramasivam Date: Fri, 5 Sep 2025 18:18:00 +0530 Subject: [PATCH 1340/3206] wifi: cfg80211: Fix "no buffer space available" error in nl80211_get_station() for MLO Currently, nl80211_get_station() allocates a fixed buffer size using NLMSG_DEFAULT_SIZE. In multi-link scenarios - particularly when the number of links exceeds two - this buffer size is often insufficient to accommodate complete station statistics, resulting in "no buffer space available" errors. To address this, modify nl80211_get_station() to return only accumulated station statistics and exclude per link stats. Pass a new flag (link_stats) to nl80211_send_station() to control the inclusion of per link statistics. 
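To picture the failure mode, a simplified sketch of the fill function (an assumption about its shape for illustration, not the exact nl80211 code):

static int put_sta_stats_sketch(struct sk_buff *msg,
                                struct station_info *sinfo)
{
        /*
         * msg was allocated once with nlmsg_new(NLMSG_DEFAULT_SIZE, ...);
         * every attribute draws from that fixed budget.  With more than
         * two links the per-link nests no longer fit, the first nla_put()
         * that overflows fails, and userspace sees -ENOBUFS ("no buffer
         * space available").
         */
        if (nla_put_u32(msg, NL80211_ATTR_GENERATION, sinfo->generation))
                return -EMSGSIZE;
        /* ... dozens more attributes, plus one nest per link ... */
        return 0;
}

Dump requests, by contrast, can spread entries across several messages, which is why the per link data can safely stay in the dump path.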
This allows retaining detailed output with per link data in dump commands, while excluding it from other commands where it is not needed. This change modifies the handling of per link stats introduced in commit 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") to enable them only for nl80211_dump_station(). Apply the same fix to cfg80211_del_sta_sinfo() by skipping per link stats to avoid buffer issues. cfg80211_new_sta() doesn't include stats and is therefore not impacted. Fixes: 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") Signed-off-by: Nithyanantham Paramasivam Link: https://patch.msgid.link/20250905124800.1448493-1-nithyanantham.paramasivam@oss.qualcomm.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893e6..f2f7424e930cc1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7062,7 +7062,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg80211_registered_device *rdev, struct net_device *dev, - const u8 *mac_addr, struct station_info *sinfo) + const u8 *mac_addr, struct station_info *sinfo, + bool link_stats) { void *hdr; struct nlattr *sinfoattr, *bss_param; @@ -7283,7 +7284,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, goto nla_put_failure; } - if (sinfo->valid_links) { + if (link_stats && sinfo->valid_links) { links = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); if (!links) goto nla_put_failure; @@ -7574,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev->netdev, mac_addr, - &sinfo) < 0) + &sinfo, true) < 0) goto out; sta_idx++; @@ -7635,7 +7636,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, info->snd_portid, info->snd_seq, 0, - rdev, dev, mac_addr, &sinfo) < 0) { + rdev, dev, mac_addr, &sinfo, false) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -19680,7 +19681,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, return; if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } @@ -19710,7 +19711,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, } if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } From 4c91b0ee35db07ae017dce067c64364c7e95faae Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 9 Sep 2025 20:03:56 +0100 Subject: [PATCH 1341/3206] gpio: loongson-64bit: Fix a less than zero check on an unsigned int struct field Currently the error check from the call to platform_get_irq is always false because an unsigned int chip->irq.parents[i] is being used to perform the less than zero error check. Fix this by using the int variable ret to perform the check.
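The bug class is easy to demonstrate in isolation; a standalone sketch (not the driver code):

#include <stdio.h>

int main(void)
{
        /* As if platform_get_irq() had returned -EINVAL. */
        unsigned int parent = -22;

        if (parent < 0)         /* always false for an unsigned type */
                printf("error caught\n");
        else
                printf("error missed, parent=%u\n", parent);

        return 0;
}

Compilers typically flag this with -Wtype-limits, but only when that warning is enabled; storing the result in a plain int first, as the fix does, removes the problem.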
Fixes: 03c146cb6cd1 ("gpio: loongson-64bit: Add support for Loongson-2K0300 SoC") Signed-off-by: Colin Ian King Reviewed-by: Huacai Chen Reviewed-by: Yao Zi Link: https://lore.kernel.org/r/20250909190356.870000-1-colin.i.king@gmail.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-loongson-64bit.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-loongson-64bit.c b/drivers/gpio/gpio-loongson-64bit.c index 5f87ce5c86cf1d..2120e908c634e3 100644 --- a/drivers/gpio/gpio-loongson-64bit.c +++ b/drivers/gpio/gpio-loongson-64bit.c @@ -267,10 +267,13 @@ static int loongson_gpio_init_irqchip(struct platform_device *pdev, return -ENOMEM; for (i = 0; i < data->intr_num; i++) { - chip->irq.parents[i] = platform_get_irq(pdev, i); - if (chip->irq.parents[i] < 0) - return dev_err_probe(&pdev->dev, chip->irq.parents[i], + int ret; + + ret = platform_get_irq(pdev, i); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, "failed to get IRQ %d\n", i); + chip->irq.parents[i] = ret; } for (i = 0; i < data->intr_num; i++) { From 860b21c31d16f99b8c37b77993682f7bc8c211d7 Mon Sep 17 00:00:00 2001 From: Geonha Lee Date: Thu, 4 Sep 2025 00:04:21 +0900 Subject: [PATCH 1342/3206] KVM: arm64: nv: fix VNCR TLB ASID match logic for non-Global entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_vncr_tlb_lookup() is supposed to return true when the cached VNCR TLB entry is valid for the current context. For non-Global entries, that means the entry’s ASID must match the current ASID. The current code returns true when the ASIDs do *not* match, which inverts the logic. This is a potential vulnerability: - Valid entries are ignored and we fall back to kvm_translate_vncr(), hurting performance. - Mismatched entries are treated as permission faults (-EPERM) instead of triggering a fresh translation. - This can also cause stale translations to be (wrongly) considered valid across address spaces. Flip the predicate so non-Global entries only hit when ASIDs match. Special credit to Team 0xB6 for reporting: DongHa Lee, Gyujeong Jin, Daehyeon Ko, Geonha Lee, Hyungyu Oh, and Jaewon Yang. Signed-off-by: Geonha Lee Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250903150421.90752-1-w1nsom3gna@korea.ac.kr Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 77db81bae86f9b..24eab94d7d7f4c 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1276,7 +1276,7 @@ static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu) !(tcr & TCR_ASID16)) asid &= GENMASK(7, 0); - return asid != vt->wr.asid; + return asid == vt->wr.asid; } return true; From efad60e4605721b829a49bcaa6afc517a80a7247 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 2 Sep 2025 14:08:32 +0100 Subject: [PATCH 1343/3206] KVM: arm64: Initialize PMSCR_EL1 when in VHE According to the pseudocode for StatisticalProfilingEnabled() from Arm DDI0487L.b, PMSCR_EL1 controls profiling at EL1 and EL0: - PMSCR_EL1.E1SPE controls profiling at EL1. - PMSCR_EL1.E0SPE controls profiling at EL0 if HCR_EL2.TGE=0. These two fields reset to UNKNOWN values. When KVM runs in VHE mode and profiling is enabled in the host, before entering a guest, KVM does not touch any of the SPE registers, leaving the buffer enabled, and it clears HCR_EL2.TGE. 
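A rough C rendering of those two controls (an assumption based on the description above and the architecture manual, where E0SPE is PMSCR_EL1[0] and E1SPE is PMSCR_EL1[1]; this is not kernel code):

#define PMSCR_EL1_E0SPE (1UL << 0)      /* profiling at EL0, if HCR_EL2.TGE == 0 */
#define PMSCR_EL1_E1SPE (1UL << 1)      /* profiling at EL1 */

static int spe_profiling_el1(unsigned long pmscr_el1)
{
        return !!(pmscr_el1 & PMSCR_EL1_E1SPE);
}

static int spe_profiling_el0(unsigned long pmscr_el1, int hcr_tge)
{
        return !hcr_tge && !!(pmscr_el1 & PMSCR_EL1_E0SPE);
}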
As a result, depending on the reset value for the E1SPE and E0SPE fields, KVM might unintentionally profile a guest. Make the behaviour consistent and predictable by clearing PMSCR_EL1 when KVM initialises the host debug configuration. Note that this is not a problem for nVHE, because KVM clears PMSCR_EL1.{E1SPE,E0SPE} before entering the guest. Signed-off-by: Alexandru Elisei Link: https://lore.kernel.org/r/20250902130833.338216-2-alexandru.elisei@arm.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 4 +++- arch/arm64/kvm/debug.c | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 2b07f0a27a7d85..0ee4f6fa3a172d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1369,6 +1369,7 @@ static inline bool kvm_system_needs_idmapped_vectors(void) } void kvm_init_host_debug_data(void); +void kvm_debug_init_vhe(void); void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu); void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5bf101c869c9ab..bd6b6a620a09ca 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2113,8 +2113,10 @@ static void cpu_hyp_init_features(void) { cpu_set_hyp_vector(); - if (is_kernel_in_hyp_mode()) + if (is_kernel_in_hyp_mode()) { kvm_timer_init_vhe(); + kvm_debug_init_vhe(); + } if (vgic_present) kvm_vgic_init_cpu_hardware(); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 381382c19fe474..fee6e882490ab2 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -96,6 +96,13 @@ void kvm_init_host_debug_data(void) } } +void kvm_debug_init_vhe(void) +{ + /* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values. */ + if (SYS_FIELD_GET(ID_AA64DFR0_EL1, PMSVer, read_sysreg(id_aa64dfr0_el1))) + write_sysreg_el1(0, SYS_PMSCR); +} + /* * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host * has taken over MDSCR_EL1. From da2e743419cb5f4ee88cd66c4363951b444207cf Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 2 Sep 2025 14:08:33 +0100 Subject: [PATCH 1344/3206] KVM: arm64: VHE: Save and restore host MDCR_EL2 value correctly Prior to commit 75a5fbaf6623 ("KVM: arm64: Compute MDCR_EL2 at vcpu_load()"), host MDCR_EL2 was saved correctly: kvm_arch_vcpu_load() kvm_vcpu_load_debug() /* Doesn't touch hardware MDCR_EL2. */ kvm_vcpu_load_vhe() __activate_traps_common() /* Saves host MDCR_EL2. */ *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2) /* Writes VCPU MDCR_EL2. */ write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2) The MDCR_EL2 value saved previously was restored in kvm_arch_vcpu_put() -> kvm_vcpu_put_vhe(). After the aforementioned commit, host MDCR_EL2 is never saved: kvm_arch_vcpu_load() kvm_vcpu_load_debug() /* Writes VCPU MDCR_EL2 */ kvm_vcpu_load_vhe() __activate_traps_common() /* Saves **VCPU** MDCR_EL2. */ *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2) /* Writes VCPU MDCR_EL2 a second time. */ write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2) kvm_arch_vcpu_put() -> kvm_vcpu_put_vhe() then restores the VCPU MDCR_EL2 value. Also VCPU's MDCR_EL2 value gets written to hardware twice now. Fix this by saving the host MDCR_EL2 in kvm_arch_vcpu_load() before it gets overwritten by the VCPU's MDCR_EL2 value, and restore it on VCPU put. 
Signed-off-by: Alexandru Elisei Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250902130833.338216-3-alexandru.elisei@arm.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/debug.c | 6 ++++++ arch/arm64/kvm/hyp/include/hyp/switch.h | 5 ----- arch/arm64/kvm/hyp/nvhe/switch.c | 6 ++++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index fee6e882490ab2..e027d9c32b0d30 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -145,6 +145,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) /* Must be called before kvm_vcpu_load_vhe() */ KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm); + if (has_vhe()) + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); + /* * Determine which of the possible debug states we're in: * @@ -191,6 +194,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu) { + if (has_vhe()) + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) return; diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 84ec4e100fbb98..b6682202edf3c6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -431,9 +431,6 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); } - *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); - write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - if (cpus_have_final_cap(ARM64_HAS_HCX)) { u64 hcrx = vcpu->arch.hcrx_el2; if (is_nested_ctxt(vcpu)) { @@ -454,8 +451,6 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); - write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); - write_sysreg(0, hstr_el2); if (system_supports_pmuv3()) { write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index ccd575d5f6dec9..d3b9ec8a7c2834 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -50,6 +50,10 @@ extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); static void __activate_traps(struct kvm_vcpu *vcpu) { ___activate_traps(vcpu, vcpu->arch.hcr_el2); + + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); + __activate_traps_common(vcpu); __activate_cptr_traps(vcpu); @@ -93,6 +97,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) isb(); } + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + __deactivate_traps_common(vcpu); write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2); From 7d6ca84aa985fc940f5544ed7feedb1b4a82b96b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 5 Sep 2025 03:05:26 -0700 Subject: [PATCH 1345/3206] KVM: arm64: vgic: Drop stale comment on IRQ active state While LPIs lack an active state, KVM unconditionally folds the active state from the LR into the vgic_irq struct meaning this field cannot be 'creatively' reused for something else. Drop the misleading comment to reflect this. 
Link: https://lore.kernel.org/r/20250905100531.282980-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- include/kvm/arm_vgic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 404883c7af6e83..9f8a116925ca35 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -139,7 +139,7 @@ struct vgic_irq { bool pending_latch; /* The pending latch state used to calculate * the pending state for both level * and edge triggered IRQs. */ - bool active; /* not used for LPIs */ + bool active; bool enabled; bool hw; /* Tied to HW IRQ */ struct kref refcount; /* Used for LPIs */ From 3a08a6ca7c373198c84e2a8c025c395ee966ff8a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 5 Sep 2025 03:05:27 -0700 Subject: [PATCH 1346/3206] KVM: arm64: vgic-v3: Use bare refcount for VGIC LPIs KVM's use of krefs to manage LPIs isn't adding much, especially considering vgic_irq_release() is a noop due to the lack of sufficient context. Switch to using a regular refcount in anticipation of adding a meaningful release concept for LPIs. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250905100531.282980-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-debug.c | 2 +- arch/arm64/kvm/vgic/vgic-init.c | 4 ++-- arch/arm64/kvm/vgic/vgic-its.c | 8 ++++---- arch/arm64/kvm/vgic/vgic-v4.c | 2 +- arch/arm64/kvm/vgic/vgic.c | 17 ++++------------- arch/arm64/kvm/vgic/vgic.h | 8 ++++---- include/kvm/arm_vgic.h | 4 ++-- 7 files changed, 18 insertions(+), 27 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index 2684f273d9e17a..4c1209261b65d4 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -69,7 +69,7 @@ static int iter_mark_lpis(struct kvm *kvm) int nr_lpis = 0; xa_for_each(&dist->lpi_xa, intid, irq) { - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) continue; xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 1e680ad6e86359..4c777136ea5ffc 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -208,7 +208,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) raw_spin_lock_init(&irq->irq_lock); irq->vcpu = NULL; irq->target_vcpu = vcpu0; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 0); switch (dist->vgic_model) { case KVM_DEV_TYPE_ARM_VGIC_V2: irq->targets = 0; @@ -277,7 +277,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) irq->intid = i; irq->vcpu = NULL; irq->target_vcpu = vcpu; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 0); if (vgic_irq_is_sgi(i)) { /* SGIs */ irq->enabled = 1; diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 7368c13f16b729..f162206adb482a 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -99,7 +99,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, raw_spin_lock_init(&irq->irq_lock); irq->config = VGIC_CONFIG_EDGE; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 1); irq->intid = intid; irq->target_vcpu = vcpu; irq->group = 1; @@ -111,7 +111,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, * check that we don't add a second list entry with the same LPI. 
*/ oldirq = xa_load(&dist->lpi_xa, intid); - if (vgic_try_get_irq_kref(oldirq)) { + if (vgic_try_get_irq_ref(oldirq)) { /* Someone was faster with adding this LPI, lets use that. */ kfree(irq); irq = oldirq; @@ -547,7 +547,7 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, rcu_read_lock(); irq = xa_load(&its->translation_cache, cache_key); - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) irq = NULL; rcu_read_unlock(); @@ -571,7 +571,7 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, * its_lock, as the ITE (and the reference it holds) cannot be freed. */ lockdep_assert_held(&its->its_lock); - vgic_get_irq_kref(irq); + vgic_get_irq_ref(irq); old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 4d9343d2b0b155..548aec9d5a7280 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -518,7 +518,7 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq) if (!irq->hw || irq->host_irq != host_irq) continue; - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) return NULL; return irq; diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index f5148b38120ad7..a1d6fab895c453 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -71,7 +71,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) rcu_read_lock(); irq = xa_load(&dist->lpi_xa, intid); - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) irq = NULL; rcu_read_unlock(); @@ -114,15 +114,6 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid) return vgic_get_irq(vcpu->kvm, intid); } -/* - * We can't do anything in here, because we lack the kvm pointer to - * lock and remove the item from the lpi_list. So we keep this function - * empty and use the return value of kref_put() to trigger the freeing. - */ -static void vgic_irq_release(struct kref *ref) -{ -} - void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; @@ -131,7 +122,7 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) if (irq->intid < VGIC_MIN_LPI) return; - if (!kref_put(&irq->refcount, vgic_irq_release)) + if (!refcount_dec_and_test(&irq->refcount)) return; xa_lock_irqsave(&dist->lpi_xa, flags); @@ -399,7 +390,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, * now in the ap_list. This is safe as the caller must already hold a * reference on the irq. */ - vgic_get_irq_kref(irq); + vgic_get_irq_ref(irq); list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); irq->vcpu = vcpu; @@ -657,7 +648,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) /* * This vgic_put_irq call matches the - * vgic_get_irq_kref in vgic_queue_irq_unlock, + * vgic_get_irq_ref in vgic_queue_irq_unlock, * where we added the LPI to the ap_list. As * we remove the irq from the list, we drop * also drop the refcount. 
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index de1c1d3261c396..ac5f9c5d2b9800 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -267,7 +267,7 @@ void vgic_v2_put(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); -static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) +static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq) { if (!irq) return false; @@ -275,12 +275,12 @@ static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) if (irq->intid < VGIC_MIN_LPI) return true; - return kref_get_unless_zero(&irq->refcount); + return refcount_inc_not_zero(&irq->refcount); } -static inline void vgic_get_irq_kref(struct vgic_irq *irq) +static inline void vgic_get_irq_ref(struct vgic_irq *irq) { - WARN_ON_ONCE(!vgic_try_get_irq_kref(irq)); + WARN_ON_ONCE(!vgic_try_get_irq_ref(irq)); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9f8a116925ca35..640555ff5b5497 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -142,7 +142,7 @@ struct vgic_irq { bool active; bool enabled; bool hw; /* Tied to HW IRQ */ - struct kref refcount; /* Used for LPIs */ + refcount_t refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ union { From 0a4aedf2bd3031ce83eb31f2cec8905938082b24 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 5 Sep 2025 03:05:28 -0700 Subject: [PATCH 1347/3206] KVM: arm64: Spin off release helper from vgic_put_irq() Spin off the release implementation from vgic_put_irq() to prepare for a more involved fix for lock ordering such that it may be unnested from raw spinlocks. This has the minor functional change of doing call_rcu() behind the xa_lock although it shouldn't be consequential. 
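Reduced to its essentials, the split looks like this (a sketch using the names from the diff below, not a drop-in replacement):

	unsigned long flags;

	if (__vgic_put_irq(kvm, irq)) {
		/* Refcount hit zero: erase and free under the xarray lock. */
		xa_lock_irqsave(&dist->lpi_xa, flags);
		vgic_release_lpi_locked(dist, irq);	/* __xa_erase() + kfree_rcu() */
		xa_unlock_irqrestore(&dist->lpi_xa, flags);
	}

Keeping the decrement (__vgic_put_irq()) separate from the release (vgic_release_lpi_locked()) is what later allows callers to defer the release to a context where taking the xa_lock is legal.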
Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250905100531.282980-4-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index a1d6fab895c453..ec4d70936a5bd9 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -114,22 +114,32 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid) return vgic_get_irq(vcpu->kvm, intid); } +static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq) +{ + lockdep_assert_held(&dist->lpi_xa.xa_lock); + __xa_erase(&dist->lpi_xa, irq->intid); + kfree_rcu(irq, rcu); +} + +static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) +{ + if (irq->intid < VGIC_MIN_LPI) + return false; + + return refcount_dec_and_test(&irq->refcount); +} + void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; unsigned long flags; - if (irq->intid < VGIC_MIN_LPI) - return; - - if (!refcount_dec_and_test(&irq->refcount)) + if (!__vgic_put_irq(kvm, irq)) return; xa_lock_irqsave(&dist->lpi_xa, flags); - __xa_erase(&dist->lpi_xa, irq->intid); + vgic_release_lpi_locked(dist, irq); xa_unlock_irqrestore(&dist->lpi_xa, flags); - - kfree_rcu(irq, rcu); } void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) From d54594accf732d17891d276aa1f545ef25606555 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 5 Sep 2025 03:05:29 -0700 Subject: [PATCH 1348/3206] KVM: arm64: vgic-v3: Erase LPIs from xarray outside of raw spinlocks syzkaller has caught us red-handed once more, this time nesting regular spinlocks behind raw spinlocks: ============================= [ BUG: Invalid wait context ] 6.16.0-rc3-syzkaller-g7b8346bd9fce #0 Not tainted ----------------------------- syz.0.29/3743 is trying to lock: a3ff80008e2e9e18 (&xa->xa_lock#20){....}-{3:3}, at: vgic_put_irq+0xb4/0x190 arch/arm64/kvm/vgic/vgic.c:137 other info that might help us debug this: context-{5:5} 3 locks held by syz.0.29/3743: #0: a3ff80008e2e90a8 (&kvm->slots_lock){+.+.}-{4:4}, at: kvm_vgic_destroy+0x50/0x624 arch/arm64/kvm/vgic/vgic-init.c:499 #1: a3ff80008e2e9fa0 (&kvm->arch.config_lock){+.+.}-{4:4}, at: kvm_vgic_destroy+0x5c/0x624 arch/arm64/kvm/vgic/vgic-init.c:500 #2: 58f0000021be1428 (&vgic_cpu->ap_list_lock){....}-{2:2}, at: vgic_flush_pending_lpis+0x3c/0x31c arch/arm64/kvm/vgic/vgic.c:150 stack backtrace: CPU: 0 UID: 0 PID: 3743 Comm: syz.0.29 Not tainted 6.16.0-rc3-syzkaller-g7b8346bd9fce #0 PREEMPT Hardware name: linux,dummy-virt (DT) Call trace: show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:466 (C) __dump_stack+0x30/0x40 lib/dump_stack.c:94 dump_stack_lvl+0xd8/0x12c lib/dump_stack.c:120 dump_stack+0x1c/0x28 lib/dump_stack.c:129 print_lock_invalid_wait_context kernel/locking/lockdep.c:4833 [inline] check_wait_context kernel/locking/lockdep.c:4905 [inline] __lock_acquire+0x978/0x299c kernel/locking/lockdep.c:5190 lock_acquire+0x14c/0x2e0 kernel/locking/lockdep.c:5871 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x5c/0x7c kernel/locking/spinlock.c:162 vgic_put_irq+0xb4/0x190 arch/arm64/kvm/vgic/vgic.c:137 vgic_flush_pending_lpis+0x24c/0x31c arch/arm64/kvm/vgic/vgic.c:158 __kvm_vgic_vcpu_destroy+0x44/0x500 arch/arm64/kvm/vgic/vgic-init.c:455 kvm_vgic_destroy+0x100/0x624 arch/arm64/kvm/vgic/vgic-init.c:505 kvm_arch_destroy_vm+0x80/0x138 arch/arm64/kvm/arm.c:244 
kvm_destroy_vm virt/kvm/kvm_main.c:1308 [inline] kvm_put_kvm+0x800/0xff8 virt/kvm/kvm_main.c:1344 kvm_vm_release+0x58/0x78 virt/kvm/kvm_main.c:1367 __fput+0x4ac/0x980 fs/file_table.c:465 ____fput+0x20/0x58 fs/file_table.c:493 task_work_run+0x1bc/0x254 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] do_notify_resume+0x1b4/0x270 arch/arm64/kernel/entry-common.c:151 exit_to_user_mode_prepare arch/arm64/kernel/entry-common.c:169 [inline] exit_to_user_mode arch/arm64/kernel/entry-common.c:178 [inline] el0_svc+0xb4/0x160 arch/arm64/kernel/entry-common.c:768 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:786 el0t_64_sync+0x198/0x19c arch/arm64/kernel/entry.S:600 This is of course no good, but is at odds with how LPI refcounts are managed. Solve the locking mess by deferring the release of unreferenced LPIs after the ap_list_lock is released. Mark these to-be-released LPIs specially to avoid racing with vgic_put_irq() and causing a double-free. Since references can only be taken on LPIs with a nonzero refcount, extending the lifetime of freed LPIs is still safe. Reviewed-by: Marc Zyngier Reported-by: syzbot+cef594105ac7e60c6d93@syzkaller.appspotmail.com Closes: https://lore.kernel.org/kvmarm/68acd0d9.a00a0220.33401d.048b.GAE@google.com/ Link: https://lore.kernel.org/r/20250905100531.282980-5-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 41 ++++++++++++++++++++++++++++++++++---- include/kvm/arm_vgic.h | 3 +++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index ec4d70936a5bd9..480391a0dcadb7 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -28,8 +28,8 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * kvm->arch.config_lock (mutex) * its->cmd_lock (mutex) * its->its_lock (mutex) - * vgic_cpu->ap_list_lock must be taken with IRQs disabled - * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_cpu->ap_list_lock must be taken with IRQs disabled * vgic_irq->irq_lock must be taken with IRQs disabled * * As the ap_list_lock might be taken from the timer interrupt handler, @@ -129,6 +129,15 @@ static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) return refcount_dec_and_test(&irq->refcount); } +static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq) +{ + if (!__vgic_put_irq(kvm, irq)) + return false; + + irq->pending_release = true; + return true; +} + void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; @@ -142,10 +151,27 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) xa_unlock_irqrestore(&dist->lpi_xa, flags); } +static void vgic_release_deleted_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long flags, intid; + struct vgic_irq *irq; + + xa_lock_irqsave(&dist->lpi_xa, flags); + + xa_for_each(&dist->lpi_xa, intid, irq) { + if (irq->pending_release) + vgic_release_lpi_locked(dist, irq); + } + + xa_unlock_irqrestore(&dist->lpi_xa, flags); +} + void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + bool deleted = false; unsigned long flags; raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); @@ -156,11 +182,14 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) list_del(&irq->ap_list); 
irq->vcpu = NULL; raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); + deleted |= vgic_put_irq_norelease(vcpu->kvm, irq); } } raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + + if (deleted) + vgic_release_deleted_lpis(vcpu->kvm); } void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) @@ -631,6 +660,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + bool deleted_lpis = false; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); @@ -663,7 +693,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) * we remove the irq from the list, we drop * also drop the refcount. */ - vgic_put_irq(vcpu->kvm, irq); + deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq); continue; } @@ -726,6 +756,9 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) } raw_spin_unlock(&vgic_cpu->ap_list_lock); + + if (unlikely(deleted_lpis)) + vgic_release_deleted_lpis(vcpu->kvm); } static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 640555ff5b5497..4000ff16f2957e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -140,6 +140,9 @@ struct vgic_irq { * the pending state for both level * and edge triggered IRQs. */ bool active; + bool pending_release; /* Used for LPIs only, unreferenced IRQ + * pending a release */ + bool enabled; bool hw; /* Tied to HW IRQ */ refcount_t refcount; /* Used for LPIs */ From 982f31bbb5b0adc79a9126c0f970e7801c6e8342 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 5 Sep 2025 03:05:30 -0700 Subject: [PATCH 1349/3206] KVM: arm64: vgic-v3: Don't require IRQs be disabled for LPI xarray lock Now that releases of LPIs have been unnested from the ap_list_lock there are no xarray writers that exist in a context where IRQs are already disabled. As such we can relax the locking to the non-IRQ disabling spinlock to guard the LPI xarray. Note that there are still readers of the LPI xarray where IRQs are disabled however the readers rely on RCU protection instead of the lock. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250905100531.282980-6-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-init.c | 2 +- arch/arm64/kvm/vgic/vgic-its.c | 7 +++---- arch/arm64/kvm/vgic/vgic.c | 13 ++++++------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 4c777136ea5ffc..4c3c0d82e47601 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); + xa_init(&dist->lpi_xa); } /* CREATION */ diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index f162206adb482a..ce3e3ed3f29f04 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -78,7 +78,6 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq; - unsigned long flags; int ret; /* In this case there is no put, since we keep the reference. 
*/ @@ -89,7 +88,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, if (!irq) return ERR_PTR(-ENOMEM); - ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); + ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); if (ret) { kfree(irq); return ERR_PTR(ret); @@ -104,7 +103,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, irq->target_vcpu = vcpu; irq->group = 1; - xa_lock_irqsave(&dist->lpi_xa, flags); + xa_lock(&dist->lpi_xa); /* * There could be a race with another vgic_add_lpi(), so we need to @@ -126,7 +125,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, } out_unlock: - xa_unlock_irqrestore(&dist->lpi_xa, flags); + xa_unlock(&dist->lpi_xa); if (ret) return ERR_PTR(ret); diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 480391a0dcadb7..a21b482844ce84 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -28,7 +28,7 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * kvm->arch.config_lock (mutex) * its->cmd_lock (mutex) * its->its_lock (mutex) - * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_dist->lpi_xa.xa_lock * vgic_cpu->ap_list_lock must be taken with IRQs disabled * vgic_irq->irq_lock must be taken with IRQs disabled * @@ -141,30 +141,29 @@ static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long flags; if (!__vgic_put_irq(kvm, irq)) return; - xa_lock_irqsave(&dist->lpi_xa, flags); + xa_lock(&dist->lpi_xa); vgic_release_lpi_locked(dist, irq); - xa_unlock_irqrestore(&dist->lpi_xa, flags); + xa_unlock(&dist->lpi_xa); } static void vgic_release_deleted_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long flags, intid; + unsigned long intid; struct vgic_irq *irq; - xa_lock_irqsave(&dist->lpi_xa, flags); + xa_lock(&dist->lpi_xa); xa_for_each(&dist->lpi_xa, intid, irq) { if (irq->pending_release) vgic_release_lpi_locked(dist, irq); } - xa_unlock_irqrestore(&dist->lpi_xa, flags); + xa_unlock(&dist->lpi_xa); } void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) From 13bba09beb5ffa1a4f307c48576c09d5c69f4c31 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 5 Sep 2025 03:05:31 -0700 Subject: [PATCH 1350/3206] KVM: arm64: vgic-v3: Indicate vgic_put_irq() may take LPI xarray lock The release path on LPIs is quite rare, meaning it can be difficult to find lock ordering bugs on the LPI xarray's spinlock. Tell lockdep that vgic_put_irq() might acquire the xa_lock to make unsafe patterns more obvious. 
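The annotation follows a generic lockdep pattern (a sketch, not the vgic code itself): declare on the common path the lock that only the rare branch takes, so the lock ordering is checked on every call instead of only when the refcount actually drops to zero:

	might_lock(&xa->xa_lock);
	if (refcount_dec_and_test(&ref)) {
		xa_lock(&xa->xa_lock);		/* rare release path */
		/* erase and free the object */
		xa_unlock(&xa->xa_lock);
	}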
Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250905100531.282980-7-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index a21b482844ce84..3b247041a1300c 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -142,6 +142,9 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; + if (irq->intid >= VGIC_MIN_LPI) + might_lock(&dist->lpi_xa.xa_lock); + if (!__vgic_put_irq(kvm, irq)) return; From ebb2d8fd81b82c8a57f88add118108b1c4408670 Mon Sep 17 00:00:00 2001 From: Dongha Lee Date: Sat, 6 Sep 2025 13:07:24 +0900 Subject: [PATCH 1351/3206] KVM: arm64: nv: Fix incorrect VNCR invalidation range calculation The code for invalidating VNCR entries in both kvm_invalidate_vncr_ipa() and invalidate_vncr_va() incorrectly uses a bitwise AND with `(size - 1)` instead of `~(size - 1)` to align the start address. This results in masking the address bits instead of aligning them down to the start of the block. This bug may cause stale VNCR TLB entries to remain valid even after a TLBI or MMU notifier, leading to incorrect memory translation and unexpected guest behavior. Credit to Team 0xB6 in bob14: DongHa Lee, Gyujeong Jin, Daehyeon Ko, Geonha Lee, Hyungyu Oh, and Jaewon Yang. Reviewed-by: Marc Zyngier Signed-off-by: Dongha Lee Link: https://lore.kernel.org/r/20250906040724.72960-1-p@sswd.pw Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 24eab94d7d7f4c..50d559248a1f0f 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -847,7 +847,7 @@ static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end) ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, vt->wr.level)); - ipa_start = vt->wr.pa & (ipa_size - 1); + ipa_start = vt->wr.pa & ~(ipa_size - 1); ipa_end = ipa_start + ipa_size; if (ipa_end <= start || ipa_start >= end) @@ -887,7 +887,7 @@ static void invalidate_vncr_va(struct kvm *kvm, va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, vt->wr.level)); - va_start = vt->gva & (va_size - 1); + va_start = vt->gva & ~(va_size - 1); va_end = va_start + va_size; switch (scope->type) { From 2dc720e606319eae6990c31b7cc8fd188f442ce4 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 8 Sep 2025 17:35:57 +0100 Subject: [PATCH 1352/3206] KVM: arm64: Fix parameter ordering for VBAR_EL1 assignment The __vcpu_assign_sys_reg() helper expects the register ID as the second argument and the value to be assigned as the third. However, the existing code was passing these parameters in the incorrect order. Fix the function call to properly read the live value of VBAR_EL1 from the guest and update the vCPU value immediately before pending the exception. This ensures the vCPU's value is the same as the guest's and that the exception will be handled at the correct address upon resuming the guest. 
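For contrast, the swapped call and the corrected one (this is exactly the one-line change below):

	/* before: value passed where the register ID belongs */
	__vcpu_assign_sys_reg(vcpu, read_sysreg_el1(SYS_VBAR), VBAR_EL1);
	/* after: register ID second, value third */
	__vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR));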
Fixes: 798eb5978700 ("KVM: arm64: Sync protected guest VBAR_EL1 on injecting an undef exception") Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20250908163557.2419780-1-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 71d2fc97f0046a..82da9b03692d45 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -253,7 +253,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); - __vcpu_assign_sys_reg(vcpu, read_sysreg_el1(SYS_VBAR), VBAR_EL1); + __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR)); kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); From 51d165e92a701012a11e726217a5c51e367563e4 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chang Date: Mon, 8 Sep 2025 14:48:06 +0800 Subject: [PATCH 1353/3206] KVM: arm64: Remove stage 2 read fault check In the non-NV case, read permission is always granted when mapping stage-2, so checking for it doesn't bring much. On the other hand, shadow stage-2 for NV guests could potentially have non-readable mappings when we align the permissions with those that L1 set for L2, we shouldn't be checking for read faults in this case either. So just remove this check. Suggested-by: Oliver Upton Suggested-by: Marc Zyngier Signed-off-by: Wei-Lin Chang Link: https://lore.kernel.org/r/20250908064806.4093081-1-r09922117@csie.ntu.edu.tw Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0f4271458a079e..e78ffcb4bfe900 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1545,11 +1545,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); VM_BUG_ON(write_fault && exec_fault); - if (fault_is_perm && !write_fault && !exec_fault) { - kvm_err("Unexpected L2 read permission error\n"); - return -EFAULT; - } - if (!is_protected_kvm_enabled()) memcache = &vcpu->arch.mmu_page_cache; else From c04f17412991af9471629023017bf969ea19e60f Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Mon, 8 Sep 2025 11:04:11 -0700 Subject: [PATCH 1354/3206] KVM: arm64: vgic: fix incorrect spinlock API usage The function vgic_flush_lr_state() is calling _raw_spin_unlock() instead of the proper raw_spin_unlock(). _raw_spin_unlock() is an internal low-level API and should not be used directly; using raw_spin_unlock() ensures proper locking semantics in the vgic code. Fixes: 8fa3adb8c6be ("KVM: arm/arm64: vgic: Make vgic_irq->irq_lock a raw_spinlock") Signed-off-by: Alok Tiwari Acked-by: Marc Zyngier Message-ID: <20250908180413.3655546-1-alok.a.tiwari@oracle.com> Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 3b247041a1300c..6dd5a10081e27e 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -854,7 +854,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) * the AP list has been sorted already. 
*/ if (multi_sgi && irq->priority > prio) { - _raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock(&irq->irq_lock); break; } From 8822e8be86d40410ddd2ac8ff44f3050c9ecf9c6 Mon Sep 17 00:00:00 2001 From: aprilgrimoire Date: Sun, 7 Sep 2025 09:06:11 +0000 Subject: [PATCH 1355/3206] platform/x86/amd/pmc: Add MECHREVO Yilong15Pro to spurious_8042 list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The firmware of Mechrevo Yilong15Pro emits a spurious keyboard interrupt on events including closing the lid. When a user closes the lid on an already suspended system this causes the system to wake up. Add Mechrevo Yilong15Pro Series (GM5HG7A) to the list of quirk spurious_8042 to work around this issue. Link: https://lore.kernel.org/linux-pm/6ww4uu6Gl4F5n6VY5dl1ufASfKzs4DhMxAN8BuqUpCoqU3PQukVSVSBCl_lKIzkQ-S8kt1acPd58eyolhkWN32lMLFj4ViI0Tdu2jwhnYZ8=@proton.me/ Signed-off-by: April Grimoire Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/IvSc_IN5Pa0wRXElTk_fEl-cTpMZxg6TCQk_7aRUkTd9vJUp_ZeC0NdXZ0z6Tn7B-XiqqqQvCH65lq6FqhuECBMEYWcHQmWm1Jo7Br8kpeg=@proton.me Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc-quirks.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index 18fb44139de251..4d0a38e06f0830 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -239,6 +239,14 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_BOARD_NAME, "WUJIE14-GX4HRXL"), } }, + { + .ident = "MECHREVO Yilong15Pro Series GM5HG7A", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "MECHREVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "Yilong15Pro Series GM5HG7A"), + } + }, /* https://bugzilla.kernel.org/show_bug.cgi?id=220116 */ { .ident = "PCSpecialist Lafite Pro V 14M", From fba9d5448bd45b0ff7199c47023e9308ea4f1730 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Thu, 4 Sep 2025 15:22:51 +0200 Subject: [PATCH 1356/3206] platform/x86: oxpec: Add support for OneXPlayer X1Pro EVA-02 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is a special edition of X1Pro with a different color. Signed-off-by: Antheas Kapenekakis Link: https://patch.msgid.link/20250904132252.3041613-1-lkml@antheas.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/oxpec.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/platform/x86/oxpec.c b/drivers/platform/x86/oxpec.c index eb076bb4099bed..4f540a9932fe1d 100644 --- a/drivers/platform/x86/oxpec.c +++ b/drivers/platform/x86/oxpec.c @@ -306,6 +306,13 @@ static const struct dmi_system_id dmi_table[] = { }, .driver_data = (void *)oxp_x1, }, + { + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ONE-NETBOOK"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "ONEXPLAYER X1Pro EVA-02"), + }, + .driver_data = (void *)oxp_x1, + }, {}, }; From d857d09fb653f081f5730e5549fce397513b0ef9 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Thu, 4 Sep 2025 15:22:52 +0200 Subject: [PATCH 1357/3206] platform/x86: oxpec: Add support for AOKZOE A1X MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Very similar to OneXFly devices. Uses the same registers. 
Signed-off-by: Antheas Kapenekakis Link: https://patch.msgid.link/20250904132252.3041613-2-lkml@antheas.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/oxpec.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/platform/x86/oxpec.c b/drivers/platform/x86/oxpec.c index 4f540a9932fe1d..54377b282ff885 100644 --- a/drivers/platform/x86/oxpec.c +++ b/drivers/platform/x86/oxpec.c @@ -124,6 +124,13 @@ static const struct dmi_system_id dmi_table[] = { }, .driver_data = (void *)aok_zoe_a1, }, + { + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "AOKZOE"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "AOKZOE A1X"), + }, + .driver_data = (void *)oxp_fly, + }, { .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "AYANEO"), From c45601306aa5831c3e59158f95b8e34f27e9ea09 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sat, 26 Jul 2025 14:29:30 +0200 Subject: [PATCH 1358/3206] um: Don't mark stack executable On one of my machines UML failed to start after enabling SELinux, because SELinux's execmod rule denies executable pages on a modified file mapping. Historically UML marks its stack rwx. AFAICT, these days this is no longer needed, so let's remove PROT_EXEC. Signed-off-by: Richard Weinberger Signed-off-by: Johannes Berg --- arch/um/os-Linux/util.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 4193e04d7e4a7f..e3ad71a0d13c41 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -20,8 +20,7 @@ void stack_protections(unsigned long address) { - if (mprotect((void *) address, UM_THREAD_SIZE, - PROT_READ | PROT_WRITE | PROT_EXEC) < 0) + if (mprotect((void *) address, UM_THREAD_SIZE, PROT_READ | PROT_WRITE) < 0) panic("protecting stack failed, errno = %d", errno); } From 7ebf70cf181651fe3f2e44e95e7e5073d594c9c0 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 28 Aug 2025 15:00:51 +0800 Subject: [PATCH 1359/3206] um: virtio_uml: Fix use-after-free after put_device in probe When register_virtio_device() fails in virtio_uml_probe(), the code sets vu_dev->registered = 1 even though the device was not successfully registered. This can lead to a use-after-free or other issues. Fixes: 04e5b1fb0183 ("um: virtio: Remove device on disconnect") Signed-off-by: Miaoqian Lin Signed-off-by: Johannes Berg --- arch/um/drivers/virtio_uml.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index ad8d78fb1d9aaf..de7867ae220d0c 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -1250,10 +1250,12 @@ static int virtio_uml_probe(struct platform_device *pdev) device_set_wakeup_capable(&vu_dev->vdev.dev, true); rc = register_virtio_device(&vu_dev->vdev); - if (rc) + if (rc) { put_device(&vu_dev->vdev.dev); + return rc; + } vu_dev->registered = 1; - return rc; + return 0; error_init: os_close_file(vu_dev->sock); From df447a3b4a4b961c9979b4b3ffb74317394b9b40 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 1 Sep 2025 08:27:15 +0800 Subject: [PATCH 1360/3206] um: Fix FD copy size in os_rcv_fd_msg() When copying FDs, the copy size should not include the control message header (cmsghdr). Fix it.
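For reference, the usual way to size an SCM_RIGHTS payload (a generic userspace sketch, not the UML code itself): cmsg_len includes the aligned cmsghdr, so the header must be subtracted before copying the fd array:

	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
	size_t payload = cmsg->cmsg_len - CMSG_LEN(0);	/* data bytes only */
	unsigned int nfds = payload / sizeof(int);
	memcpy(fds, CMSG_DATA(cmsg), payload);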
Fixes: 5cde6096a4dd ("um: generalize os_rcv_fd") Signed-off-by: Tiwei Bie Signed-off-by: Johannes Berg --- arch/um/os-Linux/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index 617886d1fb1e91..21f0e50fb1df95 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -535,7 +535,7 @@ ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, cmsg->cmsg_type != SCM_RIGHTS) return n; - memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len); + memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len - CMSG_LEN(0)); return n; } From 448097bbd3836d2ee46fa6eabd18661e9a3c8be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Lessard?= Date: Tue, 2 Sep 2025 15:04:39 -0400 Subject: [PATCH 1361/3206] device property: Add scoped fwnode child node iterators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add scoped versions of fwnode child node iterators that automatically handle reference counting cleanup using the __free() attribute: - fwnode_for_each_child_node_scoped() - fwnode_for_each_available_child_node_scoped() These macros follow the same pattern as existing scoped iterators in the kernel, ensuring fwnode references are automatically released when the iterator variable goes out of scope. This prevents resource leaks and eliminates the need for manual cleanup in error paths. The implementation mirrors the non-scoped variants but uses __free(fwnode_handle) for automatic resource management, providing a safer and more convenient interface for drivers iterating over firmware node children. Signed-off-by: Jean-François Lessard Acked-by: Danilo Krummrich Reviewed-by: Andy Shevchenko Reviewed-by: Sakari Ailus Signed-off-by: Wolfram Sang --- include/linux/property.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/property.h b/include/linux/property.h index 82f0cb3abd1e22..862e208133f360 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -176,6 +176,16 @@ struct fwnode_handle *fwnode_get_next_available_child_node( for (child = fwnode_get_next_available_child_node(fwnode, NULL); child;\ child = fwnode_get_next_available_child_node(fwnode, child)) +#define fwnode_for_each_child_node_scoped(fwnode, child) \ + for (struct fwnode_handle *child __free(fwnode_handle) = \ + fwnode_get_next_child_node(fwnode, NULL); \ + child; child = fwnode_get_next_child_node(fwnode, child)) + +#define fwnode_for_each_available_child_node_scoped(fwnode, child) \ + for (struct fwnode_handle *child __free(fwnode_handle) = \ + fwnode_get_next_available_child_node(fwnode, NULL); \ + child; child = fwnode_get_next_available_child_node(fwnode, child)) + struct fwnode_handle *device_get_next_child_node(const struct device *dev, struct fwnode_handle *child); From 4de37a48b6b58faaded9eb765047cf0d8785ea18 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 29 Aug 2025 11:03:44 +0200 Subject: [PATCH 1362/3206] drm/mediatek: fix potential OF node use-after-free The for_each_child_of_node() helper drops the reference it takes to each node as it iterates over children and an explicit of_node_put() is only needed when exiting the loop early. Drop the recently introduced bogus additional reference count decrement at each iteration that could potentially lead to a use-after-free. 
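The iterator's refcount contract, in sketch form (found_match() is a hypothetical predicate): a reference is held only for the current child and dropped when advancing, so a manual put belongs on early exits alone:

	for_each_child_of_node(parent, child) {
		if (found_match(child)) {
			of_node_put(child);	/* leaving early: drop it ourselves */
			break;
		}
		/* no of_node_put() here -- the next iteration drops it */
	}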
Fixes: 1f403699c40f ("drm/mediatek: Fix device/node reference count leaks in mtk_drm_get_all_drm_priv") Cc: Ma Ke Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Reviewed-by: CK Hu Reviewed-by: AngeloGioacchino Del Regno Link: https://patchwork.kernel.org/project/dri-devel/patch/20250829090345.21075-2-johan@kernel.org/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index f8a817689e1626..76719eb5db09c0 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -387,11 +387,11 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) of_id = of_match_node(mtk_drm_of_ids, node); if (!of_id) - goto next_put_node; + continue; pdev = of_find_device_by_node(node); if (!pdev) - goto next_put_node; + continue; drm_dev = device_find_child(&pdev->dev, NULL, mtk_drm_match); if (!drm_dev) @@ -417,11 +417,10 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) next_put_device_pdev_dev: put_device(&pdev->dev); -next_put_node: - of_node_put(node); - - if (cnt == MAX_CRTC) + if (cnt == MAX_CRTC) { + of_node_put(node); break; + } } if (drm_priv->data->mmsys_dev_num == cnt) { From d32bb69523b86c85fc91d4c8a6d8313639ac1c1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Lessard?= Date: Tue, 2 Sep 2025 15:04:40 -0400 Subject: [PATCH 1363/3206] i2c: core: Use fwnode_for_each_child_node_scoped() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the manual __free(fwnode_handle) iterator declaration with the new scoped iterator macro for cleaner, less error-prone code. This eliminates the need for explicit iterator variable declaration with the cleanup attribute, making the code more consistent with other scoped iterator usage patterns in the kernel. Signed-off-by: Jean-François Lessard Reviewed-by: Andy Shevchenko Reviewed-by: Sakari Ailus Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-slave.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/i2c-core-slave.c b/drivers/i2c/i2c-core-slave.c index 7ee6b992b835f5..02ca55c2246bcf 100644 --- a/drivers/i2c/i2c-core-slave.c +++ b/drivers/i2c/i2c-core-slave.c @@ -112,10 +112,9 @@ bool i2c_detect_slave_mode(struct device *dev) struct fwnode_handle *fwnode = dev_fwnode(dev); if (is_of_node(fwnode)) { - struct fwnode_handle *child __free(fwnode_handle) = NULL; u32 reg; - fwnode_for_each_child_node(fwnode, child) { + fwnode_for_each_child_node_scoped(fwnode, child) { fwnode_property_read_u32(child, "reg", ®); if (reg & I2C_OWN_SLAVE_ADDRESS) return true; From 9ba2556cef1df746fad4d691c8290e235b23c7d1 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 29 Aug 2025 11:03:45 +0200 Subject: [PATCH 1364/3206] drm/mediatek: clean up driver data initialisation The platform and drm devices are only used to look up the drm device and its driver data respectively when initialising the driver data during bind(). Drop the reference counts as soon as they have been used to make the code more readable. Note that the crtc count is never incremented on lookup failures. 
Signed-off-by: Johan Hovold Reviewed-by: CK Hu Reviewed-by: AngeloGioacchino Del Regno Link: https://patchwork.kernel.org/project/dri-devel/patch/20250829090345.21075-3-johan@kernel.org/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index 76719eb5db09c0..eb5537f0ac90d8 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -394,12 +394,14 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) continue; drm_dev = device_find_child(&pdev->dev, NULL, mtk_drm_match); + put_device(&pdev->dev); if (!drm_dev) - goto next_put_device_pdev_dev; + continue; temp_drm_priv = dev_get_drvdata(drm_dev); + put_device(drm_dev); if (!temp_drm_priv) - goto next_put_device_drm_dev; + continue; if (temp_drm_priv->data->main_len) all_drm_priv[CRTC_MAIN] = temp_drm_priv; @@ -411,12 +413,6 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) if (temp_drm_priv->mtk_drm_bound) cnt++; -next_put_device_drm_dev: - put_device(drm_dev); - -next_put_device_pdev_dev: - put_device(&pdev->dev); - if (cnt == MAX_CRTC) { of_node_put(node); break; From a5d7a8ab4b21747173a2f8f0ebf71d72692793c3 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 9 Sep 2025 20:58:17 +0100 Subject: [PATCH 1365/3206] riscv: dts: allwinner: rename devterm i2c-gpio node to comply with binding The i2c controller binding does not permit the node name to contain "gpio", resulting in two warnings: i2c-gpio-0 (i2c-gpio): $nodename:0: 'i2c-gpio-0' does not match '^i2c(@.+|-[a-z0-9]+)?$' i2c-gpio-0 (i2c-gpio): Unevaluated properties are not allowed ('#address-cells', '#size-cells', 'adc@54' were unexpected) Drop the "gpio" part of the node name to satisfy dtbs_check. Signed-off-by: Conor Dooley Link: https://patch.msgid.link/20250909-frown-wrinkle-f16df243a970@spud Signed-off-by: Chen-Yu Tsai --- arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts b/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts index bc5c84f227622e..5f2e5cc3e3d555 100644 --- a/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts +++ b/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts @@ -17,7 +17,7 @@ #cooling-cells = <2>; }; - i2c-gpio-0 { + i2c-0 { compatible = "i2c-gpio"; sda-gpios = <&pio 3 14 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>; /* PD14/GPIO44 */ scl-gpios = <&pio 3 15 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>; /* PD15/GPIO45 */ From 9c600589e14f5fc01b8be9a5d0ad1f094b8b304b Mon Sep 17 00:00:00 2001 From: James Guan Date: Wed, 10 Sep 2025 19:19:29 +0800 Subject: [PATCH 1366/3206] wifi: virt_wifi: Fix page fault on connect This patch prevents a page fault in __cfg80211_connect_result()[1] when connecting a virt_wifi device, while ensuring that virt_wifi can connect properly.
[1] https://lore.kernel.org/linux-wireless/20250909063213.1055024-1-guan_yufei@163.com/ Closes: https://lore.kernel.org/linux-wireless/20250909063213.1055024-1-guan_yufei@163.com/ Signed-off-by: James Guan Link: https://patch.msgid.link/20250910111929.137049-1-guan_yufei@163.com [remove irrelevant network-manager instructions] Signed-off-by: Johannes Berg --- drivers/net/wireless/virtual/virt_wifi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/virtual/virt_wifi.c b/drivers/net/wireless/virtual/virt_wifi.c index 1fffeff2190ca8..4eae89376feb55 100644 --- a/drivers/net/wireless/virtual/virt_wifi.c +++ b/drivers/net/wireless/virtual/virt_wifi.c @@ -277,7 +277,9 @@ static void virt_wifi_connect_complete(struct work_struct *work) priv->is_connected = true; /* Schedules an event that acquires the rtnl lock. */ - cfg80211_connect_result(priv->upperdev, requested_bss, NULL, 0, NULL, 0, + cfg80211_connect_result(priv->upperdev, + priv->is_connected ? fake_router_bssid : NULL, + NULL, 0, NULL, 0, status, GFP_KERNEL); netif_carrier_on(priv->upperdev); } From 0467d6c99d1c64210ee8c9621cd63b12301cab2e Mon Sep 17 00:00:00 2001 From: Feng Chen Date: Wed, 10 Sep 2025 18:18:25 +0800 Subject: [PATCH 1367/3206] spi: dt-bindings: add Amlogic A113L2 SFC The Flash Controller is derived by adding an SPI path to the original raw NAND controller. This controller supports two modes: raw mode and SPI mode. The raw mode has already been implemented in the community, and the SPI mode is described here. Add bindings for Amlogic A113L2 SPI Flash Controller. Signed-off-by: Feng Chen Reviewed-by: Conor Dooley Signed-off-by: Xianwei Zhao Link: https://patch.msgid.link/20250910-spifc-v6-1-1574aa9baebd@amlogic.com Signed-off-by: Mark Brown --- .../bindings/spi/amlogic,a4-spifc.yaml | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 Documentation/devicetree/bindings/spi/amlogic,a4-spifc.yaml diff --git a/Documentation/devicetree/bindings/spi/amlogic,a4-spifc.yaml b/Documentation/devicetree/bindings/spi/amlogic,a4-spifc.yaml new file mode 100644 index 00000000000000..b4cef838bcd40b --- /dev/null +++ b/Documentation/devicetree/bindings/spi/amlogic,a4-spifc.yaml @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2025 Amlogic, Inc. All rights reserved +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/spi/amlogic,a4-spifc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: SPI flash controller for Amlogic ARM SoCs + +maintainers: + - Liang Yang + - Feng Chen + - Xianwei Zhao + +description: + The Amlogic SPI flash controller is an extended version of the Amlogic NAND + flash controller. It supports SPI Nor Flash and SPI NAND Flash(where the Host + ECC HW engine could be enabled). + +allOf: + - $ref: /schemas/spi/spi-controller.yaml# + +properties: + compatible: + const: amlogic,a4-spifc + + reg: + maxItems: 1 + + clocks: + items: + - description: clock apb gate + - description: clock used for the controller + + clock-names: + items: + - const: gate + - const: core + + interrupts: + maxItems: 1 + + amlogic,rx-adj: + description: + Number of clock cycles by which sampling is delayed. 
+ $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + +required: + - compatible + - reg + - clocks + - clock-names + +unevaluatedProperties: false + +examples: + - | + sfc0: spi@fe08d000 { + compatible = "amlogic,a4-spifc"; + reg = <0xfe08d000 0x800>; + clocks = <&clkc_periphs 31>, + <&clkc_periphs 102>; + clock-names = "gate", "core"; + + pinctrl-0 = <&spiflash_default>; + pinctrl-names = "default"; + + #address-cells = <1>; + #size-cells = <0>; + + flash@0 { + compatible = "spi-nand"; + reg = <0>; + #address-cells = <1>; + #size-cells = <1>; + nand-ecc-engine = <&sfc0>; + nand-ecc-strength = <8>; + nand-ecc-step-size = <512>; + }; + }; From 4670db6f32e9379f5ab6c9bb2a6787cd9b9230a9 Mon Sep 17 00:00:00 2001 From: Feng Chen Date: Wed, 10 Sep 2025 18:18:26 +0800 Subject: [PATCH 1368/3206] spi: amlogic: add driver for Amlogic SPI Flash Controller This driver provides support for the SPI mode of the Amlogic Flash Controller. It supports both SPI NOR flash and SPI NAND flash. For SPI NAND, the Host ECC hardware engine can be enabled. The controller implements the SPI-MEM interface and does not support generic SPI. Signed-off-by: Feng Chen Signed-off-by: Xianwei Zhao Link: https://patch.msgid.link/20250910-spifc-v6-2-1574aa9baebd@amlogic.com Signed-off-by: Mark Brown --- drivers/spi/Kconfig | 10 + drivers/spi/Makefile | 1 + drivers/spi/spi-amlogic-spifc-a4.c | 1222 ++++++++++++++++++++++++++++ 3 files changed, 1233 insertions(+) create mode 100644 drivers/spi/spi-amlogic-spifc-a4.c diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 891729c9c5642a..b44b0b5e5cdcb9 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -99,6 +99,16 @@ config SPI_AMLOGIC_SPIFC_A1 This enables master mode support for the SPIFC (SPI flash controller) available in Amlogic A1 (A113L SoC). +config SPI_AMLOGIC_SPIFC_A4 + tristate "Amlogic A4 SPI Flash controller" + depends on ARCH_MESON || COMPILE_TEST + select REGMAP_MMIO + help + This enables SPI mode on the NAND Flash Controller of Amlogic + ARM SoCs. It supports SPI Nor Flash and SPI NAND Flash (Could + enable Host ECC HW engine). The controller implements the + SPI-MEM interface, it doesn't support generic SPI. + config SPI_AMLOGIC_SPISG tristate "Amlogic SPISG controller" depends on COMMON_CLK diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile index 062c85989c8c96..eefaeca0974566 100644 --- a/drivers/spi/Makefile +++ b/drivers/spi/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SPI_ALTERA) += spi-altera-platform.o obj-$(CONFIG_SPI_ALTERA_CORE) += spi-altera-core.o obj-$(CONFIG_SPI_ALTERA_DFL) += spi-altera-dfl.o obj-$(CONFIG_SPI_AMLOGIC_SPIFC_A1) += spi-amlogic-spifc-a1.o +obj-$(CONFIG_SPI_AMLOGIC_SPIFC_A4) += spi-amlogic-spifc-a4.o obj-$(CONFIG_SPI_AMLOGIC_SPISG) += spi-amlogic-spisg.o obj-$(CONFIG_SPI_APPLE) += spi-apple.o obj-$(CONFIG_SPI_AR934X) += spi-ar934x.o diff --git a/drivers/spi/spi-amlogic-spifc-a4.c b/drivers/spi/spi-amlogic-spifc-a4.c new file mode 100644 index 00000000000000..4ca8e82fdc67db --- /dev/null +++ b/drivers/spi/spi-amlogic-spifc-a4.c @@ -0,0 +1,1222 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR MIT) +/* + * Copyright (C) 2025 Amlogic, Inc. 
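Since the controller speaks SPI-MEM rather than generic SPI, it services spi_mem_op templates such as the following (an illustrative 1-1-4 read from cache; opcode 0x6b matches the driver's SPIFLASH_RD_QUAD define below, while the address/dummy layout is a typical SPI NAND example rather than something mandated by this driver):

	struct spi_mem_op op =
		SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1),		/* quad-output read */
			   SPI_MEM_OP_ADDR(2, column, 1),
			   SPI_MEM_OP_DUMMY(1, 1),
			   SPI_MEM_OP_DATA_IN(len, buf, 4));	/* data on 4 lanes */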
All rights reserved + * + * Driver for the SPI Mode of Amlogic Flash Controller + * Authors: + * Liang Yang + * Feng Chen + * Xianwei Zhao + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SFC_CMD 0x00 +#define SFC_CFG 0x04 +#define SFC_DADR 0x08 +#define SFC_IADR 0x0c +#define SFC_BUF 0x10 +#define SFC_INFO 0x14 +#define SFC_DC 0x18 +#define SFC_ADR 0x1c +#define SFC_DL 0x20 +#define SFC_DH 0x24 +#define SFC_CADR 0x28 +#define SFC_SADR 0x2c +#define SFC_RX_IDX 0x34 +#define SFC_RX_DAT 0x38 +#define SFC_SPI_CFG 0x40 + +/* settings in SFC_CMD */ + +/* 4 bits support 4 chip select, high false, low select but spi support 2*/ +#define CHIP_SELECT_MASK GENMASK(13, 10) +#define CS_NONE 0xf +#define CS_0 0xe +#define CS_1 0xd + +#define CLE (0x5 << 14) +#define ALE (0x6 << 14) +#define DWR (0x4 << 14) +#define DRD (0x8 << 14) +#define DUMMY (0xb << 14) +#define IDLE (0xc << 14) +#define IDLE_CYCLE_MASK GENMASK(9, 0) +#define EXT_CYCLE_MASK GENMASK(9, 0) + +#define OP_M2N ((0 << 17) | (2 << 20)) +#define OP_N2M ((1 << 17) | (2 << 20)) +#define OP_STS ((3 << 17) | (2 << 20)) +#define OP_ADL ((0 << 16) | (3 << 20)) +#define OP_ADH ((1 << 16) | (3 << 20)) +#define OP_AIL ((2 << 16) | (3 << 20)) +#define OP_AIH ((3 << 16) | (3 << 20)) +#define OP_ASL ((4 << 16) | (3 << 20)) +#define OP_ASH ((5 << 16) | (3 << 20)) +#define OP_SEED ((8 << 16) | (3 << 20)) +#define SEED_MASK GENMASK(14, 0) +#define ENABLE_RANDOM BIT(19) + +#define CMD_COMMAND(cs_sel, cmd) (CLE | ((cs_sel) << 10) | (cmd)) +#define CMD_ADDR(cs_sel, addr) (ALE | ((cs_sel) << 10) | (addr)) +#define CMD_DUMMY(cs_sel, cyc) (DUMMY | ((cs_sel) << 10) | ((cyc) & EXT_CYCLE_MASK)) +#define CMD_IDLE(cs_sel, cyc) (IDLE | ((cs_sel) << 10) | ((cyc) & IDLE_CYCLE_MASK)) +#define CMD_MEM2NAND(bch, pages) (OP_M2N | ((bch) << 14) | (pages)) +#define CMD_NAND2MEM(bch, pages) (OP_N2M | ((bch) << 14) | (pages)) +#define CMD_DATA_ADDRL(addr) (OP_ADL | ((addr) & 0xffff)) +#define CMD_DATA_ADDRH(addr) (OP_ADH | (((addr) >> 16) & 0xffff)) +#define CMD_INFO_ADDRL(addr) (OP_AIL | ((addr) & 0xffff)) +#define CMD_INFO_ADDRH(addr) (OP_AIH | (((addr) >> 16) & 0xffff)) +#define CMD_SEED(seed) (OP_SEED | ((seed) & SEED_MASK)) + +#define GET_CMD_SIZE(x) (((x) >> 22) & GENMASK(4, 0)) + +#define DEFAULT_PULLUP_CYCLE 2 +#define CS_SETUP_CYCLE 1 +#define CS_HOLD_CYCLE 2 +#define DEFAULT_BUS_CYCLE 4 + +#define RAW_SIZE GENMASK(13, 0) +#define RAW_SIZE_BW 14 + +#define DMA_ADDR_ALIGN 8 + +/* Bit fields in SFC_SPI_CFG */ +#define SPI_MODE_EN BIT(31) +#define RAW_EXT_SIZE GENMASK(29, 18) +#define ADDR_LANE GENMASK(17, 16) +#define CPOL BIT(15) +#define CPHA BIT(14) +#define EN_HOLD BIT(13) +#define EN_WP BIT(12) +#define TXADJ GENMASK(11, 8) +#define RXADJ GENMASK(7, 4) +#define CMD_LANE GENMASK(3, 2) +#define DATA_LANE GENMASK(1, 0) +#define LANE_MAX 0x3 + +/* raw ext size[25:14] + raw size[13:0] */ +#define RAW_MAX_RW_SIZE_MASK GENMASK(25, 0) + +/* Ecc fields */ +#define ECC_COMPLETE BIT(31) +#define ECC_UNCORRECTABLE 0x3f +#define ECC_ERR_CNT(x) (((x) >> 24) & 0x3f) +#define ECC_ZERO_CNT(x) (((x) >> 16) & 0x3f) + +#define ECC_BCH8_512 1 +#define ECC_BCH8_1K 2 +#define ECC_BCH8_PARITY_BYTES 14 +#define ECC_BCH8_USER_BYTES 2 +#define ECC_BCH8_INFO_BYTES (ECC_BCH8_USER_BYTES + ECC_BCH8_PARITY_BYTES) +#define ECC_BCH8_STRENGTH 8 +#define ECC_BCH8_DEFAULT_STEP 512 +#define ECC_DEFAULT_BCH_MODE ECC_BCH8_512 +#define ECC_PER_INFO_BYTE 8 +#define ECC_PATTERN 0x5a +#define 
ECC_BCH_MAX_SECT_SIZE 63 +/* soft flags for sfc */ +#define SFC_HWECC BIT(0) +#define SFC_DATA_RANDOM BIT(1) +#define SFC_DATA_ONLY BIT(2) +#define SFC_OOB_ONLY BIT(3) +#define SFC_DATA_OOB BIT(4) +#define SFC_AUTO_OOB BIT(5) +#define SFC_RAW_RW BIT(6) +#define SFC_XFER_MDOE_MASK GENMASK(6, 2) + +#define SFC_DATABUF_SIZE 8192 +#define SFC_INFOBUF_SIZE 256 +#define SFC_BUF_SIZE (SFC_DATABUF_SIZE + SFC_INFOBUF_SIZE) + +/* !!! PCB and SPI-NAND chip limitations */ +#define SFC_MAX_FREQUENCY (250 * 1000 * 1000) +#define SFC_MIN_FREQUENCY (4 * 1000 * 1000) +#define SFC_BUS_DEFAULT_CLK 40000000 +#define SFC_MAX_CS_NUM 2 + +/* SPI-FLASH R/W operation cmd */ +#define SPIFLASH_RD_OCTALIO 0xcb +#define SPIFLASH_RD_OCTAL 0x8b +#define SPIFLASH_RD_QUADIO 0xeb +#define SPIFLASH_RD_QUAD 0x6b +#define SPIFLASH_RD_DUALIO 0xbb +#define SPIFLASH_RD_DUAL 0x3b +#define SPIFLASH_RD_FAST 0x0b +#define SPIFLASH_RD 0x03 +#define SPIFLASH_WR_OCTALIO 0xC2 +#define SPIFLASH_WR_OCTAL 0x82 +#define SPIFLASH_WR_QUAD 0x32 +#define SPIFLASH_WR 0x02 +#define SPIFLASH_UP_QUAD 0x34 +#define SPIFLASH_UP 0x84 + +struct aml_sfc_ecc_cfg { + u32 stepsize; + u32 nsteps; + u32 strength; + u32 oobsize; + u32 bch; +}; + +struct aml_ecc_stats { + u32 corrected; + u32 bitflips; + u32 failed; +}; + +struct aml_sfc_caps { + struct aml_sfc_ecc_cfg *ecc_caps; + u32 num_ecc_caps; +}; + +struct aml_sfc { + struct device *dev; + struct clk *gate_clk; + struct clk *core_clk; + struct spi_controller *ctrl; + struct regmap *regmap_base; + const struct aml_sfc_caps *caps; + struct nand_ecc_engine ecc_eng; + struct aml_ecc_stats ecc_stats; + dma_addr_t daddr; + dma_addr_t iaddr; + u32 info_bytes; + u32 bus_rate; + u32 flags; + u32 rx_adj; + u32 cs_sel; + u8 *data_buf; + __le64 *info_buf; + u8 *priv; +}; + +#define AML_ECC_DATA(sz, s, b) { .stepsize = (sz), .strength = (s), .bch = (b) } + +static struct aml_sfc_ecc_cfg aml_a113l2_ecc_caps[] = { + AML_ECC_DATA(512, 8, ECC_BCH8_512), + AML_ECC_DATA(1024, 8, ECC_BCH8_1K), +}; + +static const struct aml_sfc_caps aml_a113l2_sfc_caps = { + .ecc_caps = aml_a113l2_ecc_caps, + .num_ecc_caps = ARRAY_SIZE(aml_a113l2_ecc_caps) +}; + +static struct aml_sfc *nand_to_aml_sfc(struct nand_device *nand) +{ + struct nand_ecc_engine *eng = nand->ecc.engine; + + return container_of(eng, struct aml_sfc, ecc_eng); +} + +static inline void *aml_sfc_to_ecc_ctx(struct aml_sfc *sfc) +{ + return sfc->priv; +} + +static int aml_sfc_wait_cmd_finish(struct aml_sfc *sfc, u64 timeout_ms) +{ + u32 cmd_size = 0; + int ret; + + /* + * The SPINAND flash controller employs a two-stage pipeline: + * 1) command prefetch; 2) command execution. + * + * All commands are stored in the FIFO, with one prefetched for execution. + * + * There are cases where the FIFO is detected as empty, yet a command may + * still be in execution and a prefetched command pending execution. + * + * So, send two idle commands to ensure all previous commands have + * been executed. + */ + regmap_write(sfc->regmap_base, SFC_CMD, CMD_IDLE(sfc->cs_sel, 0)); + regmap_write(sfc->regmap_base, SFC_CMD, CMD_IDLE(sfc->cs_sel, 0)); + + /* Wait for the FIFO to empty. 
*/ + ret = regmap_read_poll_timeout(sfc->regmap_base, SFC_CMD, cmd_size, + !GET_CMD_SIZE(cmd_size), + 10, timeout_ms * 1000); + if (ret) + dev_err(sfc->dev, "wait for empty CMD FIFO time out\n"); + + return ret; +} + +static int aml_sfc_pre_transfer(struct aml_sfc *sfc, u32 idle_cycle, u32 cs2clk_cycle) +{ + int ret; + + ret = regmap_write(sfc->regmap_base, SFC_CMD, CMD_IDLE(CS_NONE, idle_cycle)); + if (ret) + return ret; + + return regmap_write(sfc->regmap_base, SFC_CMD, CMD_IDLE(sfc->cs_sel, cs2clk_cycle)); +} + +static int aml_sfc_end_transfer(struct aml_sfc *sfc, u32 clk2cs_cycle) +{ + int ret; + + ret = regmap_write(sfc->regmap_base, SFC_CMD, CMD_IDLE(sfc->cs_sel, clk2cs_cycle)); + if (ret) + return ret; + + return aml_sfc_wait_cmd_finish(sfc, 0); +} + +static int aml_sfc_set_bus_width(struct aml_sfc *sfc, u8 buswidth, u32 mask) +{ + int i; + u32 conf = 0; + + for (i = 0; i <= LANE_MAX; i++) { + if (buswidth == 1 << i) { + conf = i << __bf_shf(mask); + return regmap_update_bits(sfc->regmap_base, SFC_SPI_CFG, + mask, conf); + } + } + + return 0; +} + +static int aml_sfc_send_cmd(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + int i, ret; + u8 val; + + ret = aml_sfc_set_bus_width(sfc, op->cmd.buswidth, CMD_LANE); + if (ret) + return ret; + + for (i = 0; i < op->cmd.nbytes; i++) { + val = (op->cmd.opcode >> ((op->cmd.nbytes - i - 1) * 8)) & 0xff; + ret = regmap_write(sfc->regmap_base, SFC_CMD, CMD_COMMAND(sfc->cs_sel, val)); + if (ret) + return ret; + } + + return 0; +} + +static int aml_sfc_send_addr(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + int i, ret; + u8 val; + + ret = aml_sfc_set_bus_width(sfc, op->addr.buswidth, ADDR_LANE); + if (ret) + return ret; + + for (i = 0; i < op->addr.nbytes; i++) { + val = (op->addr.val >> ((op->addr.nbytes - i - 1) * 8)) & 0xff; + + ret = regmap_write(sfc->regmap_base, SFC_CMD, CMD_ADDR(sfc->cs_sel, val)); + if (ret) + return ret; + } + + return 0; +} + +static bool aml_sfc_is_xio_op(const struct spi_mem_op *op) +{ + switch (op->cmd.opcode) { + case SPIFLASH_RD_OCTALIO: + case SPIFLASH_RD_QUADIO: + case SPIFLASH_RD_DUALIO: + return true; + default: + break; + } + + return false; +} + +static int aml_sfc_send_cmd_addr_dummy(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + u32 dummy_cycle, cmd; + int ret; + + ret = aml_sfc_send_cmd(sfc, op); + if (ret) + return ret; + + ret = aml_sfc_send_addr(sfc, op); + if (ret) + return ret; + + if (op->dummy.nbytes) { + /* Dummy buswidth configuration is not supported */ + if (aml_sfc_is_xio_op(op)) + dummy_cycle = op->dummy.nbytes * 8 / op->data.buswidth; + else + dummy_cycle = op->dummy.nbytes * 8; + cmd = CMD_DUMMY(sfc->cs_sel, dummy_cycle - 1); + return regmap_write(sfc->regmap_base, SFC_CMD, cmd); + } + + return 0; +} + +static bool aml_sfc_is_snand_hwecc_page_op(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + switch (op->cmd.opcode) { + /* SPINAND read from cache cmd */ + case SPIFLASH_RD_QUADIO: + case SPIFLASH_RD_QUAD: + case SPIFLASH_RD_DUALIO: + case SPIFLASH_RD_DUAL: + case SPIFLASH_RD_FAST: + case SPIFLASH_RD: + /* SPINAND write to cache cmd */ + case SPIFLASH_WR_QUAD: + case SPIFLASH_WR: + case SPIFLASH_UP_QUAD: + case SPIFLASH_UP: + if (sfc->flags & SFC_HWECC) + return true; + else + return false; + default: + break; + } + + return false; +} + +static int aml_sfc_dma_buffer_setup(struct aml_sfc *sfc, void *databuf, + int datalen, void *infobuf, int infolen, + enum dma_data_direction dir) +{ + u32 cmd = 0; + int ret; + + sfc->daddr = dma_map_single(sfc->dev, databuf, 
datalen, dir);
+	ret = dma_mapping_error(sfc->dev, sfc->daddr);
+	if (ret) {
+		dev_err(sfc->dev, "DMA mapping error\n");
+		/* Mapping failed, so there is nothing to unmap yet. */
+		return ret;
+	}
+
+	cmd = CMD_DATA_ADDRL(sfc->daddr);
+	ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd);
+	if (ret)
+		goto out_map_data;
+
+	cmd = CMD_DATA_ADDRH(sfc->daddr);
+	ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd);
+	if (ret)
+		goto out_map_data;
+
+	if (infobuf) {
+		sfc->iaddr = dma_map_single(sfc->dev, infobuf, infolen, dir);
+		ret = dma_mapping_error(sfc->dev, sfc->iaddr);
+		if (ret) {
+			dev_err(sfc->dev, "DMA mapping error\n");
+			/* out_map_data unmaps the data buffer for us */
+			goto out_map_data;
+		}
+
+		sfc->info_bytes = infolen;
+		cmd = CMD_INFO_ADDRL(sfc->iaddr);
+		ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd);
+		if (ret)
+			goto out_map_info;
+
+		cmd = CMD_INFO_ADDRH(sfc->iaddr);
+		ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd);
+		if (ret)
+			goto out_map_info;
+	}
+
+	return 0;
+
+out_map_info:
+	dma_unmap_single(sfc->dev, sfc->iaddr, infolen, dir);
+	sfc->info_bytes = 0;
+out_map_data:
+	dma_unmap_single(sfc->dev, sfc->daddr, datalen, dir);
+
+	return ret;
+}
+
+static void aml_sfc_dma_buffer_release(struct aml_sfc *sfc,
+				       int datalen, int infolen,
+				       enum dma_data_direction dir)
+{
+	dma_unmap_single(sfc->dev, sfc->daddr, datalen, dir);
+	if (infolen) {
+		dma_unmap_single(sfc->dev, sfc->iaddr, infolen, dir);
+		sfc->info_bytes = 0;
+	}
+}
+
+static bool aml_sfc_dma_buffer_is_safe(const void *buffer)
+{
+	if ((uintptr_t)buffer % DMA_ADDR_ALIGN)
+		return false;
+
+	if (virt_addr_valid(buffer))
+		return true;
+
+	return false;
+}
+
+static void *aml_get_dma_safe_input_buf(const struct spi_mem_op *op)
+{
+	if (aml_sfc_dma_buffer_is_safe(op->data.buf.in))
+		return op->data.buf.in;
+
+	return kzalloc(op->data.nbytes, GFP_KERNEL);
+}
+
+static void aml_sfc_put_dma_safe_input_buf(const struct spi_mem_op *op, void *buf)
+{
+	if (WARN_ON(op->data.dir != SPI_MEM_DATA_IN) || WARN_ON(!buf))
+		return;
+
+	if (buf == op->data.buf.in)
+		return;
+
+	memcpy(op->data.buf.in, buf, op->data.nbytes);
+	kfree(buf);
+}
+
+static void *aml_sfc_get_dma_safe_output_buf(const struct spi_mem_op *op)
+{
+	if (aml_sfc_dma_buffer_is_safe(op->data.buf.out))
+		return (void *)op->data.buf.out;
+
+	return kmemdup(op->data.buf.out, op->data.nbytes, GFP_KERNEL);
+}
+
+static void aml_sfc_put_dma_safe_output_buf(const struct spi_mem_op *op, const void *buf)
+{
+	if (WARN_ON(op->data.dir != SPI_MEM_DATA_OUT) || WARN_ON(!buf))
+		return;
+
+	if (buf != op->data.buf.out)
+		kfree(buf);
+}
+
+static u64 aml_sfc_cal_timeout_cycle(struct aml_sfc *sfc, const struct spi_mem_op *op)
+{
+	u64 ms;
+
+	/* For each byte we wait for (8 cycles / buswidth) of the SPI clock. */
+	ms = 8 * MSEC_PER_SEC * op->data.nbytes / op->data.buswidth;
+	do_div(ms, sfc->bus_rate / DEFAULT_BUS_CYCLE);
+
+	/*
+	 * Double the value and add a 200 ms tolerance to compensate for
+	 * the impact of specific CS hold time, CS setup time sequences,
+	 * controller burst gaps, and other related timing variations.
+	 */
+	ms += ms + 200;
+
+	if (ms > UINT_MAX)
+		ms = UINT_MAX;
+
+	return ms;
+}
+
+static void aml_sfc_check_ecc_pages_valid(struct aml_sfc *sfc, bool raw)
+{
+	struct aml_sfc_ecc_cfg *ecc_cfg;
+	__le64 *info;
+	int ret;
+
+	info = sfc->info_buf;
+	ecc_cfg = aml_sfc_to_ecc_ctx(sfc);
+	info += raw ?
0 : ecc_cfg->nsteps - 1; + + do { + usleep_range(10, 15); + /* info is updated by nfc dma engine*/ + smp_rmb(); + dma_sync_single_for_cpu(sfc->dev, sfc->iaddr, sfc->info_bytes, + DMA_FROM_DEVICE); + ret = le64_to_cpu(*info) & ECC_COMPLETE; + } while (!ret); +} + +static int aml_sfc_raw_io_op(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + void *buf = NULL; + int ret; + bool is_datain = false; + u32 cmd = 0, conf; + u64 timeout_ms; + + if (!op->data.nbytes) + goto end_xfer; + + conf = (op->data.nbytes >> RAW_SIZE_BW) << __bf_shf(RAW_EXT_SIZE); + ret = regmap_update_bits(sfc->regmap_base, SFC_SPI_CFG, RAW_EXT_SIZE, conf); + if (ret) + goto err_out; + + if (op->data.dir == SPI_MEM_DATA_IN) { + is_datain = true; + + buf = aml_get_dma_safe_input_buf(op); + if (!buf) { + ret = -ENOMEM; + goto err_out; + } + + cmd |= CMD_NAND2MEM(0, (op->data.nbytes & RAW_SIZE)); + } else if (op->data.dir == SPI_MEM_DATA_OUT) { + is_datain = false; + + buf = aml_sfc_get_dma_safe_output_buf(op); + if (!buf) { + ret = -ENOMEM; + goto err_out; + } + + cmd |= CMD_MEM2NAND(0, (op->data.nbytes & RAW_SIZE)); + } else { + goto end_xfer; + } + + ret = aml_sfc_dma_buffer_setup(sfc, buf, op->data.nbytes, + is_datain ? sfc->info_buf : NULL, + is_datain ? ECC_PER_INFO_BYTE : 0, + is_datain ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + if (ret) + goto err_out; + + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); + if (ret) + goto err_out; + + timeout_ms = aml_sfc_cal_timeout_cycle(sfc, op); + ret = aml_sfc_wait_cmd_finish(sfc, timeout_ms); + if (ret) + goto err_out; + + if (is_datain) + aml_sfc_check_ecc_pages_valid(sfc, 1); + + if (op->data.dir == SPI_MEM_DATA_IN) + aml_sfc_put_dma_safe_input_buf(op, buf); + else if (op->data.dir == SPI_MEM_DATA_OUT) + aml_sfc_put_dma_safe_output_buf(op, buf); + + aml_sfc_dma_buffer_release(sfc, op->data.nbytes, + is_datain ? ECC_PER_INFO_BYTE : 0, + is_datain ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + +end_xfer: + return aml_sfc_end_transfer(sfc, CS_HOLD_CYCLE); + +err_out: + return ret; +} + +static void aml_sfc_set_user_byte(struct aml_sfc *sfc, __le64 *info_buf, u8 *oob_buf, bool auto_oob) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + __le64 *info; + int i, count, step_size; + + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + step_size = auto_oob ? 
ECC_BCH8_INFO_BYTES : ECC_BCH8_USER_BYTES; + + for (i = 0, count = 0; i < ecc_cfg->nsteps; i++, count += step_size) { + info = &info_buf[i]; + *info &= cpu_to_le64(~0xffff); + *info |= cpu_to_le64((oob_buf[count + 1] << 8) + oob_buf[count]); + } +} + +static void aml_sfc_get_user_byte(struct aml_sfc *sfc, __le64 *info_buf, u8 *oob_buf) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + __le64 *info; + int i, count; + + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + for (i = 0, count = 0; i < ecc_cfg->nsteps; i++, count += ECC_BCH8_INFO_BYTES) { + info = &info_buf[i]; + oob_buf[count] = le64_to_cpu(*info); + oob_buf[count + 1] = le64_to_cpu(*info) >> 8; + } +} + +static int aml_sfc_check_hwecc_status(struct aml_sfc *sfc, __le64 *info_buf) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + __le64 *info; + u32 i, max_bitflips = 0, per_sector_bitflips = 0; + + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + sfc->ecc_stats.failed = 0; + sfc->ecc_stats.bitflips = 0; + sfc->ecc_stats.corrected = 0; + + for (i = 0, info = info_buf; i < ecc_cfg->nsteps; i++, info++) { + if (ECC_ERR_CNT(le64_to_cpu(*info)) != ECC_UNCORRECTABLE) { + per_sector_bitflips = ECC_ERR_CNT(le64_to_cpu(*info)); + max_bitflips = max_t(u32, max_bitflips, per_sector_bitflips); + sfc->ecc_stats.corrected += per_sector_bitflips; + continue; + } + + return -EBADMSG; + } + + return max_bitflips; +} + +static int aml_sfc_read_page_hwecc(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + int ret, data_len, info_len; + u32 page_size, cmd = 0; + u64 timeout_ms; + + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + page_size = ecc_cfg->stepsize * ecc_cfg->nsteps; + data_len = page_size + ecc_cfg->oobsize; + info_len = ecc_cfg->nsteps * ECC_PER_INFO_BYTE; + + ret = aml_sfc_dma_buffer_setup(sfc, sfc->data_buf, data_len, + sfc->info_buf, info_len, DMA_FROM_DEVICE); + if (ret) + goto err_out; + + cmd |= CMD_NAND2MEM(ecc_cfg->bch, ecc_cfg->nsteps); + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); + if (ret) + goto err_out; + + timeout_ms = aml_sfc_cal_timeout_cycle(sfc, op); + ret = aml_sfc_wait_cmd_finish(sfc, timeout_ms); + if (ret) + goto err_out; + + aml_sfc_check_ecc_pages_valid(sfc, 0); + aml_sfc_dma_buffer_release(sfc, data_len, info_len, DMA_FROM_DEVICE); + + /* check ecc status here */ + ret = aml_sfc_check_hwecc_status(sfc, sfc->info_buf); + if (ret < 0) + sfc->ecc_stats.failed++; + else + sfc->ecc_stats.bitflips = ret; + + if (sfc->flags & SFC_DATA_ONLY) { + memcpy(op->data.buf.in, sfc->data_buf, page_size); + } else if (sfc->flags & SFC_OOB_ONLY) { + aml_sfc_get_user_byte(sfc, sfc->info_buf, op->data.buf.in); + } else if (sfc->flags & SFC_DATA_OOB) { + memcpy(op->data.buf.in, sfc->data_buf, page_size); + aml_sfc_get_user_byte(sfc, sfc->info_buf, op->data.buf.in + page_size); + } + + return aml_sfc_end_transfer(sfc, CS_HOLD_CYCLE); + +err_out: + return ret; +} + +static int aml_sfc_write_page_hwecc(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + int ret, data_len, info_len; + u32 page_size, cmd = 0; + u64 timeout_ms; + + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + page_size = ecc_cfg->stepsize * ecc_cfg->nsteps; + data_len = page_size + ecc_cfg->oobsize; + info_len = ecc_cfg->nsteps * ECC_PER_INFO_BYTE; + + memset(sfc->info_buf, ECC_PATTERN, ecc_cfg->oobsize); + memcpy(sfc->data_buf, op->data.buf.out, page_size); + + if (!(sfc->flags & SFC_DATA_ONLY)) { + if (sfc->flags & SFC_AUTO_OOB) + aml_sfc_set_user_byte(sfc, sfc->info_buf, + (u8 *)op->data.buf.out + page_size, 1); + else + 
aml_sfc_set_user_byte(sfc, sfc->info_buf, + (u8 *)op->data.buf.out + page_size, 0); + } + + ret = aml_sfc_dma_buffer_setup(sfc, sfc->data_buf, data_len, + sfc->info_buf, info_len, DMA_TO_DEVICE); + if (ret) + goto err_out; + + cmd |= CMD_MEM2NAND(ecc_cfg->bch, ecc_cfg->nsteps); + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); + if (ret) + goto err_out; + + timeout_ms = aml_sfc_cal_timeout_cycle(sfc, op); + + ret = aml_sfc_wait_cmd_finish(sfc, timeout_ms); + if (ret) + goto err_out; + + aml_sfc_dma_buffer_release(sfc, data_len, info_len, DMA_TO_DEVICE); + + return aml_sfc_end_transfer(sfc, CS_HOLD_CYCLE); + +err_out: + return ret; +} + +static int aml_sfc_exec_op(struct spi_mem *mem, const struct spi_mem_op *op) +{ + struct aml_sfc *sfc; + struct spi_device *spi; + struct aml_sfc_ecc_cfg *ecc_cfg; + int ret; + + sfc = spi_controller_get_devdata(mem->spi->controller); + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + spi = mem->spi; + sfc->cs_sel = spi->chip_select[0] ? CS_1 : CS_0; + + dev_dbg(sfc->dev, "cmd:0x%02x - addr:%08llX@%d:%u - dummy:%d:%u - data:%d:%u", + op->cmd.opcode, op->addr.val, op->addr.buswidth, op->addr.nbytes, + op->dummy.buswidth, op->dummy.nbytes, op->data.buswidth, op->data.nbytes); + + ret = aml_sfc_pre_transfer(sfc, DEFAULT_PULLUP_CYCLE, CS_SETUP_CYCLE); + if (ret) + return ret; + + ret = aml_sfc_send_cmd_addr_dummy(sfc, op); + if (ret) + return ret; + + ret = aml_sfc_set_bus_width(sfc, op->data.buswidth, DATA_LANE); + if (ret) + return ret; + + if (aml_sfc_is_snand_hwecc_page_op(sfc, op) && + ecc_cfg && !(sfc->flags & SFC_RAW_RW)) { + if (op->data.dir == SPI_MEM_DATA_IN) + return aml_sfc_read_page_hwecc(sfc, op); + else + return aml_sfc_write_page_hwecc(sfc, op); + } + + return aml_sfc_raw_io_op(sfc, op); +} + +static int aml_sfc_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op) +{ + struct aml_sfc *sfc; + struct aml_sfc_ecc_cfg *ecc_cfg; + + sfc = spi_controller_get_devdata(mem->spi->controller); + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + if (aml_sfc_is_snand_hwecc_page_op(sfc, op) && ecc_cfg) { + if (op->data.nbytes > ecc_cfg->stepsize * ECC_BCH_MAX_SECT_SIZE) + return -EOPNOTSUPP; + } else if (op->data.nbytes & ~RAW_MAX_RW_SIZE_MASK) { + return -EOPNOTSUPP; + } + + return 0; +} + +static const struct spi_controller_mem_ops aml_sfc_mem_ops = { + .adjust_op_size = aml_sfc_adjust_op_size, + .exec_op = aml_sfc_exec_op, +}; + +static int aml_sfc_layout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + struct nand_device *nand = mtd_to_nanddev(mtd); + + if (section >= nand->ecc.ctx.nsteps) + return -ERANGE; + + oobregion->offset = ECC_BCH8_USER_BYTES + (section * ECC_BCH8_INFO_BYTES); + oobregion->length = ECC_BCH8_PARITY_BYTES; + + return 0; +} + +static int aml_sfc_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + struct nand_device *nand = mtd_to_nanddev(mtd); + + if (section >= nand->ecc.ctx.nsteps) + return -ERANGE; + + oobregion->offset = section * ECC_BCH8_INFO_BYTES; + oobregion->length = ECC_BCH8_USER_BYTES; + + return 0; +} + +static const struct mtd_ooblayout_ops aml_sfc_ooblayout_ops = { + .ecc = aml_sfc_layout_ecc, + .free = aml_sfc_ooblayout_free, +}; + +static int aml_spi_settings(struct aml_sfc *sfc, struct spi_device *spi) +{ + u32 conf = 0; + + if (spi->mode & SPI_CPHA) + conf |= CPHA; + + if (spi->mode & SPI_CPOL) + conf |= CPOL; + + conf |= FIELD_PREP(RXADJ, sfc->rx_adj); + conf |= EN_HOLD | EN_WP; + return regmap_update_bits(sfc->regmap_base, SFC_SPI_CFG, + CPHA | CPOL | RXADJ 
| + EN_HOLD | EN_WP, conf); +} + +static int aml_set_spi_clk(struct aml_sfc *sfc, struct spi_device *spi) +{ + u32 speed_hz; + int ret; + + if (spi->max_speed_hz > SFC_MAX_FREQUENCY) + speed_hz = SFC_MAX_FREQUENCY; + else if (!spi->max_speed_hz) + speed_hz = SFC_BUS_DEFAULT_CLK; + else if (spi->max_speed_hz < SFC_MIN_FREQUENCY) + speed_hz = SFC_MIN_FREQUENCY; + else + speed_hz = spi->max_speed_hz; + + /* The SPI clock is generated by dividing the bus clock by four by default. */ + ret = regmap_write(sfc->regmap_base, SFC_CFG, (DEFAULT_BUS_CYCLE - 1)); + if (ret) { + dev_err(sfc->dev, "failed to set bus cycle\n"); + return ret; + } + + return clk_set_rate(sfc->core_clk, speed_hz * DEFAULT_BUS_CYCLE); +} + +static int aml_sfc_setup(struct spi_device *spi) +{ + struct aml_sfc *sfc; + int ret; + + sfc = spi_controller_get_devdata(spi->controller); + ret = aml_spi_settings(sfc, spi); + if (ret) + return ret; + + ret = aml_set_spi_clk(sfc, spi); + if (ret) + return ret; + + sfc->bus_rate = clk_get_rate(sfc->core_clk); + + return 0; +} + +static int aml_sfc_ecc_init_ctx(struct nand_device *nand) +{ + struct mtd_info *mtd = nanddev_to_mtd(nand); + struct aml_sfc *sfc = nand_to_aml_sfc(nand); + struct aml_sfc_ecc_cfg *ecc_cfg; + const struct aml_sfc_caps *caps = sfc->caps; + struct aml_sfc_ecc_cfg *ecc_caps = caps->ecc_caps; + int i, ecc_strength, ecc_step_size; + + ecc_step_size = nand->ecc.user_conf.step_size; + ecc_strength = nand->ecc.user_conf.strength; + + for (i = 0; i < caps->num_ecc_caps; i++) { + if (ecc_caps[i].stepsize == ecc_step_size) { + nand->ecc.ctx.conf.step_size = ecc_step_size; + nand->ecc.ctx.conf.flags |= BIT(ecc_caps[i].bch); + } + + if (ecc_caps[i].strength == ecc_strength) + nand->ecc.ctx.conf.strength = ecc_strength; + } + + if (!nand->ecc.ctx.conf.step_size) { + nand->ecc.ctx.conf.step_size = ECC_BCH8_DEFAULT_STEP; + nand->ecc.ctx.conf.flags |= BIT(ECC_DEFAULT_BCH_MODE); + } + + if (!nand->ecc.ctx.conf.strength) + nand->ecc.ctx.conf.strength = ECC_BCH8_STRENGTH; + + nand->ecc.ctx.nsteps = nand->memorg.pagesize / nand->ecc.ctx.conf.step_size; + nand->ecc.ctx.total = nand->ecc.ctx.nsteps * ECC_BCH8_PARITY_BYTES; + + /* Verify the page size and OOB size against the SFC requirements. */ + if ((nand->memorg.pagesize % nand->ecc.ctx.conf.step_size) || + (nand->memorg.oobsize < (nand->ecc.ctx.total + + nand->ecc.ctx.nsteps * ECC_BCH8_USER_BYTES))) + return -EOPNOTSUPP; + + nand->ecc.ctx.conf.engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST; + + ecc_cfg = kzalloc(sizeof(*ecc_cfg), GFP_KERNEL); + if (!ecc_cfg) + return -ENOMEM; + + ecc_cfg->stepsize = nand->ecc.ctx.conf.step_size; + ecc_cfg->nsteps = nand->ecc.ctx.nsteps; + ecc_cfg->strength = nand->ecc.ctx.conf.strength; + ecc_cfg->oobsize = nand->memorg.oobsize; + ecc_cfg->bch = nand->ecc.ctx.conf.flags & BIT(ECC_DEFAULT_BCH_MODE) ? 
1 : 2; + + nand->ecc.ctx.priv = ecc_cfg; + sfc->priv = (void *)ecc_cfg; + mtd_set_ooblayout(mtd, &aml_sfc_ooblayout_ops); + + sfc->flags |= SFC_HWECC; + + return 0; +} + +static void aml_sfc_ecc_cleanup_ctx(struct nand_device *nand) +{ + struct aml_sfc *sfc = nand_to_aml_sfc(nand); + + sfc->flags &= ~(SFC_HWECC); + kfree(nand->ecc.ctx.priv); + sfc->priv = NULL; +} + +static int aml_sfc_ecc_prepare_io_req(struct nand_device *nand, + struct nand_page_io_req *req) +{ + struct aml_sfc *sfc = nand_to_aml_sfc(nand); + struct spinand_device *spinand = nand_to_spinand(nand); + + sfc->flags &= ~SFC_XFER_MDOE_MASK; + + if (req->datalen && !req->ooblen) + sfc->flags |= SFC_DATA_ONLY; + else if (!req->datalen && req->ooblen) + sfc->flags |= SFC_OOB_ONLY; + else if (req->datalen && req->ooblen) + sfc->flags |= SFC_DATA_OOB; + + if (req->mode == MTD_OPS_RAW) + sfc->flags |= SFC_RAW_RW; + else if (req->mode == MTD_OPS_AUTO_OOB) + sfc->flags |= SFC_AUTO_OOB; + + memset(spinand->oobbuf, 0xff, nanddev_per_page_oobsize(nand)); + + return 0; +} + +static int aml_sfc_ecc_finish_io_req(struct nand_device *nand, + struct nand_page_io_req *req) +{ + struct aml_sfc *sfc = nand_to_aml_sfc(nand); + struct mtd_info *mtd = nanddev_to_mtd(nand); + + if (req->mode == MTD_OPS_RAW || req->type == NAND_PAGE_WRITE) + return 0; + + if (sfc->ecc_stats.failed) + mtd->ecc_stats.failed++; + + mtd->ecc_stats.corrected += sfc->ecc_stats.corrected; + + return sfc->ecc_stats.failed ? -EBADMSG : sfc->ecc_stats.bitflips; +} + +static const struct spi_controller_mem_caps aml_sfc_mem_caps = { + .ecc = true, +}; + +static const struct nand_ecc_engine_ops aml_sfc_ecc_engine_ops = { + .init_ctx = aml_sfc_ecc_init_ctx, + .cleanup_ctx = aml_sfc_ecc_cleanup_ctx, + .prepare_io_req = aml_sfc_ecc_prepare_io_req, + .finish_io_req = aml_sfc_ecc_finish_io_req, +}; + +static int aml_sfc_clk_init(struct aml_sfc *sfc) +{ + sfc->gate_clk = devm_clk_get_enabled(sfc->dev, "gate"); + if (IS_ERR(sfc->gate_clk)) { + dev_err(sfc->dev, "unable to enable gate clk\n"); + return PTR_ERR(sfc->gate_clk); + } + + sfc->core_clk = devm_clk_get_enabled(sfc->dev, "core"); + if (IS_ERR(sfc->core_clk)) { + dev_err(sfc->dev, "unable to enable core clk\n"); + return PTR_ERR(sfc->core_clk); + } + + return clk_set_rate(sfc->core_clk, SFC_BUS_DEFAULT_CLK); +} + +static int aml_sfc_disable_clk(struct aml_sfc *sfc) +{ + clk_disable_unprepare(sfc->core_clk); + clk_disable_unprepare(sfc->gate_clk); + + return 0; +} + +static int aml_sfc_probe(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + struct device *dev = &pdev->dev; + struct spi_controller *ctrl; + struct aml_sfc *sfc; + void __iomem *reg_base; + int ret; + u32 val = 0; + + const struct regmap_config core_config = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .max_register = SFC_SPI_CFG, + }; + + ctrl = devm_spi_alloc_host(dev, sizeof(*sfc)); + if (!ctrl) + return -ENOMEM; + platform_set_drvdata(pdev, ctrl); + + sfc = spi_controller_get_devdata(ctrl); + sfc->dev = dev; + sfc->ctrl = ctrl; + + sfc->caps = of_device_get_match_data(dev); + if (!sfc->caps) + return dev_err_probe(dev, -ENODEV, "failed to get device data\n"); + + reg_base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(reg_base)) + return PTR_ERR(reg_base); + + sfc->regmap_base = devm_regmap_init_mmio(dev, reg_base, &core_config); + if (IS_ERR(sfc->regmap_base)) + return dev_err_probe(dev, PTR_ERR(sfc->regmap_base), + "failed to init sfc base regmap\n"); + + sfc->data_buf = devm_kzalloc(dev, SFC_BUF_SIZE, 
GFP_KERNEL); + if (!sfc->data_buf) + return -ENOMEM; + sfc->info_buf = (__le64 *)(sfc->data_buf + SFC_DATABUF_SIZE); + + ret = aml_sfc_clk_init(sfc); + if (ret) + return dev_err_probe(dev, ret, "failed to initialize SFC clock\n"); + + /* Enable Amlogic flash controller spi mode */ + ret = regmap_write(sfc->regmap_base, SFC_SPI_CFG, SPI_MODE_EN); + if (ret) { + dev_err(dev, "failed to enable SPI mode\n"); + goto err_out; + } + + ret = dma_set_mask(sfc->dev, DMA_BIT_MASK(32)); + if (ret) { + dev_err(sfc->dev, "failed to set dma mask\n"); + goto err_out; + } + + sfc->ecc_eng.dev = &pdev->dev; + sfc->ecc_eng.integration = NAND_ECC_ENGINE_INTEGRATION_PIPELINED; + sfc->ecc_eng.ops = &aml_sfc_ecc_engine_ops; + sfc->ecc_eng.priv = sfc; + + ret = nand_ecc_register_on_host_hw_engine(&sfc->ecc_eng); + if (ret) { + dev_err(&pdev->dev, "failed to register Aml host ecc engine.\n"); + goto err_out; + } + + ret = of_property_read_u32(np, "amlogic,rx-adj", &val); + if (!ret) + sfc->rx_adj = val; + + ctrl->dev.of_node = np; + ctrl->mem_ops = &aml_sfc_mem_ops; + ctrl->mem_caps = &aml_sfc_mem_caps; + ctrl->setup = aml_sfc_setup; + ctrl->mode_bits = SPI_TX_QUAD | SPI_TX_DUAL | SPI_RX_QUAD | + SPI_RX_DUAL | SPI_TX_OCTAL | SPI_RX_OCTAL; + ctrl->max_speed_hz = SFC_MAX_FREQUENCY; + ctrl->min_speed_hz = SFC_MIN_FREQUENCY; + ctrl->num_chipselect = SFC_MAX_CS_NUM; + + ret = devm_spi_register_controller(dev, ctrl); + if (ret) + goto err_out; + + return 0; + +err_out: + aml_sfc_disable_clk(sfc); + + return ret; +} + +static void aml_sfc_remove(struct platform_device *pdev) +{ + struct spi_controller *ctlr = platform_get_drvdata(pdev); + struct aml_sfc *sfc = spi_controller_get_devdata(ctlr); + + aml_sfc_disable_clk(sfc); +} + +static const struct of_device_id aml_sfc_of_match[] = { + { + .compatible = "amlogic,a4-spifc", + .data = &aml_a113l2_sfc_caps + }, + {}, +}; +MODULE_DEVICE_TABLE(of, aml_sfc_of_match); + +static struct platform_driver aml_sfc_driver = { + .driver = { + .name = "aml_sfc", + .of_match_table = aml_sfc_of_match, + }, + .probe = aml_sfc_probe, + .remove = aml_sfc_remove, +}; +module_platform_driver(aml_sfc_driver); + +MODULE_DESCRIPTION("Amlogic SPI Flash Controller driver"); +MODULE_AUTHOR("Feng Chen "); +MODULE_LICENSE("Dual MIT/GPL"); From 6a129b2ca5c533aec89fbeb58470811cc4102642 Mon Sep 17 00:00:00 2001 From: Xianwei Zhao Date: Wed, 10 Sep 2025 18:18:27 +0800 Subject: [PATCH 1369/3206] MAINTAINERS: Add an entry for Amlogic spifc driver Add Amlogic spi flash controller entry to MAINTAINERS to clarify the maintainers. 
Signed-off-by: Xianwei Zhao Link: https://patch.msgid.link/20250910-spifc-v6-3-1574aa9baebd@amlogic.com Signed-off-by: Mark Brown --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index cd7ff55b5d3217..e874e2d47e87d9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1318,6 +1318,16 @@ S: Maintained F: Documentation/devicetree/bindings/rtc/amlogic,a4-rtc.yaml F: drivers/rtc/rtc-amlogic-a4.c +AMLOGIC SPIFC DRIVER +M: Liang Yang +M: Feng Chen +M: Xianwei Zhao +L: linux-amlogic@lists.infradead.org +L: linux-spi@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/spi/amlogic,a4-spifc.yaml +F: drivers/spi/spi-amlogic-spifc-a4.c + AMLOGIC SPISG DRIVER M: Sunny Luo M: Xianwei Zhao From 18d676ac0cce3e3aaa06f29553e53134e30c5f61 Mon Sep 17 00:00:00 2001 From: Nickolay Goppen Date: Mon, 8 Sep 2025 15:28:44 +0300 Subject: [PATCH 1370/3206] pinctrl: qcom: lpass-lpi: Add ability to use custom pin offsets By default pin_offset is calculated by formula: LPI_TLMM_REG_OFFSET * pin_id. However not all platforms are using this pin_offset formula (e.g. SDM660 LPASS LPI uses a predefined array of offsets [1]), so extend lpi_pingroup struct with pin_offset field, introduce extended LPI_PINGROUP_OFFSET macro with pin_offet field and introduce LPI_FLAG_USE_PREDEFINED_PIN_OFFSET flag. This adds an ability to use predefined offset for pin if it exists. [1] https://git.codelinaro.org/clo/la/kernel/msm-4.4/-/blob/LA.UM.7.2.c27-07400-sdm660.0/drivers/pinctrl/qcom/pinctrl-lpi.c#L107 Reviewed-by: Konrad Dybcio Signed-off-by: Nickolay Goppen Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-lpass-lpi.c | 18 ++++++++++++++++-- drivers/pinctrl/qcom/pinctrl-lpass-lpi.h | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c index 28eb92a2b249fe..1c97ec44aa5ff7 100644 --- a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c +++ b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c @@ -41,13 +41,27 @@ struct lpi_pinctrl { static int lpi_gpio_read(struct lpi_pinctrl *state, unsigned int pin, unsigned int addr) { - return ioread32(state->tlmm_base + LPI_TLMM_REG_OFFSET * pin + addr); + u32 pin_offset; + + if (state->data->flags & LPI_FLAG_USE_PREDEFINED_PIN_OFFSET) + pin_offset = state->data->groups[pin].pin_offset; + else + pin_offset = LPI_TLMM_REG_OFFSET * pin; + + return ioread32(state->tlmm_base + pin_offset + addr); } static int lpi_gpio_write(struct lpi_pinctrl *state, unsigned int pin, unsigned int addr, unsigned int val) { - iowrite32(val, state->tlmm_base + LPI_TLMM_REG_OFFSET * pin + addr); + u32 pin_offset; + + if (state->data->flags & LPI_FLAG_USE_PREDEFINED_PIN_OFFSET) + pin_offset = state->data->groups[pin].pin_offset; + else + pin_offset = LPI_TLMM_REG_OFFSET * pin; + + iowrite32(val, state->tlmm_base + pin_offset + addr); return 0; } diff --git a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.h b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.h index a9b2f65c1ebe0f..f4836849286134 100644 --- a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.h +++ b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.h @@ -55,6 +55,22 @@ struct pinctrl_pin_desc; LPI_MUX_##f4, \ }, \ .nfuncs = 5, \ + .pin_offset = 0, \ + } + +#define LPI_PINGROUP_OFFSET(id, soff, f1, f2, f3, f4, poff) \ + { \ + .pin = id, \ + .slew_offset = soff, \ + .funcs = (int[]){ \ + LPI_MUX_gpio, \ + LPI_MUX_##f1, \ + LPI_MUX_##f2, \ + LPI_MUX_##f3, \ + LPI_MUX_##f4, \ + }, \ + .nfuncs = 5, \ + .pin_offset = poff, \ } /* @@ 
-62,6 +78,7 @@ struct pinctrl_pin_desc; * pin configuration. */ #define LPI_FLAG_SLEW_RATE_SAME_REG BIT(0) +#define LPI_FLAG_USE_PREDEFINED_PIN_OFFSET BIT(1) struct lpi_pingroup { unsigned int pin; @@ -69,6 +86,7 @@ struct lpi_pingroup { int slew_offset; unsigned int *funcs; unsigned int nfuncs; + unsigned int pin_offset; }; struct lpi_function { From 350027a3ef3f30f68a06d87e72f9bcc0dba49fb8 Mon Sep 17 00:00:00 2001 From: Nickolay Goppen Date: Mon, 8 Sep 2025 15:28:45 +0300 Subject: [PATCH 1371/3206] dt-bindings: pinctrl: qcom: Add SDM660 LPI pinctrl Add bindings for pin controller in SDM660 Low Power Audio SubSystem (LPASS). Reviewed-by: Krzysztof Kozlowski Co-developed-by: Richard Acayan Signed-off-by: Richard Acayan Signed-off-by: Nickolay Goppen Signed-off-by: Linus Walleij --- .../qcom,sdm660-lpass-lpi-pinctrl.yaml | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/qcom,sdm660-lpass-lpi-pinctrl.yaml diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-lpass-lpi-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-lpass-lpi-pinctrl.yaml new file mode 100644 index 00000000000000..409e5a4d4da9c6 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-lpass-lpi-pinctrl.yaml @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/qcom,sdm660-lpass-lpi-pinctrl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm SDM660 SoC LPASS LPI TLMM + +maintainers: + - Nickolay Goppen + +description: + Top Level Mode Multiplexer pin controller in the Low Power Audio SubSystem + (LPASS) Low Power Island (LPI) of Qualcomm SDM660 SoC. + +properties: + compatible: + const: qcom,sdm660-lpass-lpi-pinctrl + + reg: + items: + - description: LPASS LPI TLMM Control and Status registers + +patternProperties: + "-state$": + oneOf: + - $ref: "#/$defs/qcom-sdm660-lpass-state" + - patternProperties: + "-pins$": + $ref: "#/$defs/qcom-sdm660-lpass-state" + additionalProperties: false + +$defs: + qcom-sdm660-lpass-state: + type: object + description: + Pinctrl node's client devices use subnodes for desired pin configuration. + Client device subnodes use below standard properties. + $ref: qcom,lpass-lpi-common.yaml#/$defs/qcom-tlmm-state + unevaluatedProperties: false + + properties: + pins: + description: + List of gpio pins affected by the properties specified in this + subnode. + items: + pattern: "^gpio([0-9]|[1-2][0-9]|3[0-1])$" + + function: + enum: [ gpio, comp_rx, dmic1_clk, dmic1_data, dmic2_clk, dmic2_data, + mclk0, pdm_tx, pdm_clk, pdm_rx, pdm_sync ] + description: + Specify the alternative function to be configured for the specified + pins. 
+ +allOf: + - $ref: qcom,lpass-lpi-common.yaml# + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + lpi_tlmm: pinctrl@15070000 { + compatible = "qcom,sdm660-lpass-lpi-pinctrl"; + reg = <0x15070000 0x20000>; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&lpi_tlmm 0 0 32>; + + cdc_pdm_default: cdc-pdm-default-state { + clk-pins { + pins = "gpio18"; + function = "pdm_clk"; + drive-strength = <8>; + output-high; + }; + + sync-pins{ + pins = "gpio19"; + function = "pdm_sync"; + drive-strength = <4>; + output-high; + }; + + tx-pins { + pins = "gpio20"; + function = "pdm_tx"; + drive-strength = <8>; + }; + + rx-pins { + pins = "gpio21", "gpio23", "gpio25"; + function = "pdm_rx"; + drive-strength = <4>; + output-high; + }; + }; + + cdc_comp_default: cdc-comp-default-state { + pins = "gpio22", "gpio24"; + function = "comp_rx"; + drive-strength = <8>; + }; + }; From 5e302106099efc8a309d92ad84cb37d35d6c3775 Mon Sep 17 00:00:00 2001 From: Richard Acayan Date: Mon, 8 Sep 2025 15:28:46 +0300 Subject: [PATCH 1372/3206] pinctrl: qcom: Add SDM660 LPASS LPI TLMM The Snapdragon 660 has a Low-Power Island (LPI) TLMM for configuring pins related to audio. Add the driver for this. Also, this driver uses predefined pin_offsets for each pin taken from downstream driver, which does not follow the usual 0x1000 distance between pins and uses an array with predefined offsets that do not follow any regular pattern [1]. [1] https://git.codelinaro.org/clo/la/kernel/msm-4.4/-/blob/LA.UM.7.2.c27-07400-sdm660.0/drivers/pinctrl/qcom/pinctrl-lpi.c#L107 Reviewed-by: Konrad Dybcio Signed-off-by: Richard Acayan Co-developed-by: Nickolay Goppen Signed-off-by: Nickolay Goppen Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/Kconfig | 10 ++ drivers/pinctrl/qcom/Makefile | 1 + .../pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c | 160 ++++++++++++++++++ 3 files changed, 171 insertions(+) create mode 100644 drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig index f7594de4b1e9b9..c480e8b7850329 100644 --- a/drivers/pinctrl/qcom/Kconfig +++ b/drivers/pinctrl/qcom/Kconfig @@ -69,6 +69,16 @@ config PINCTRL_SC7280_LPASS_LPI Qualcomm Technologies Inc LPASS (Low Power Audio SubSystem) LPI (Low Power Island) found on the Qualcomm Technologies Inc SC7280 platform. +config PINCTRL_SDM660_LPASS_LPI + tristate "Qualcomm Technologies Inc SDM660 LPASS LPI pin controller driver" + depends on GPIOLIB + depends on ARM64 || COMPILE_TEST + depends on PINCTRL_LPASS_LPI + help + This is the pinctrl, pinmux, pinconf and gpiolib driver for the + Qualcomm Technologies Inc LPASS (Low Power Audio SubSystem) LPI + (Low Power Island) found on the Qualcomm Technologies Inc SDM660 platform. 
+ config PINCTRL_SM4250_LPASS_LPI tristate "Qualcomm Technologies Inc SM4250 LPASS LPI pin controller driver" depends on ARM64 || COMPILE_TEST diff --git a/drivers/pinctrl/qcom/Makefile b/drivers/pinctrl/qcom/Makefile index 46ab3486c3ffcd..567d3051e760dd 100644 --- a/drivers/pinctrl/qcom/Makefile +++ b/drivers/pinctrl/qcom/Makefile @@ -45,6 +45,7 @@ obj-$(CONFIG_PINCTRL_SC7280_LPASS_LPI) += pinctrl-sc7280-lpass-lpi.o obj-$(CONFIG_PINCTRL_SC8180X) += pinctrl-sc8180x.o obj-$(CONFIG_PINCTRL_SC8280XP) += pinctrl-sc8280xp.o obj-$(CONFIG_PINCTRL_SDM660) += pinctrl-sdm660.o +obj-$(CONFIG_PINCTRL_SDM660_LPASS_LPI) += pinctrl-sdm660-lpass-lpi.o obj-$(CONFIG_PINCTRL_SDM670) += pinctrl-sdm670.o obj-$(CONFIG_PINCTRL_SDM845) += pinctrl-sdm845.o obj-$(CONFIG_PINCTRL_SDX55) += pinctrl-sdx55.o diff --git a/drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c new file mode 100644 index 00000000000000..d93af5f0e8d301 --- /dev/null +++ b/drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This driver is solely based on the limited information in downstream code. + * Any verification with schematics would be greatly appreciated. + * + * Copyright (c) 2023, Richard Acayan. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include "pinctrl-lpass-lpi.h" + +enum lpass_lpi_functions { + LPI_MUX_comp_rx, + LPI_MUX_dmic1_clk, + LPI_MUX_dmic1_data, + LPI_MUX_dmic2_clk, + LPI_MUX_dmic2_data, + LPI_MUX_mclk0, + LPI_MUX_pdm_tx, + LPI_MUX_pdm_clk, + LPI_MUX_pdm_rx, + LPI_MUX_pdm_sync, + + LPI_MUX_gpio, + LPI_MUX__, +}; + +static const struct pinctrl_pin_desc sdm660_lpi_pinctrl_pins[] = { + PINCTRL_PIN(0, "gpio0"), + PINCTRL_PIN(1, "gpio1"), + PINCTRL_PIN(2, "gpio2"), + PINCTRL_PIN(3, "gpio3"), + PINCTRL_PIN(4, "gpio4"), + PINCTRL_PIN(5, "gpio5"), + PINCTRL_PIN(6, "gpio6"), + PINCTRL_PIN(7, "gpio7"), + PINCTRL_PIN(8, "gpio8"), + PINCTRL_PIN(9, "gpio9"), + PINCTRL_PIN(10, "gpio10"), + PINCTRL_PIN(11, "gpio11"), + PINCTRL_PIN(12, "gpio12"), + PINCTRL_PIN(13, "gpio13"), + PINCTRL_PIN(14, "gpio14"), + PINCTRL_PIN(15, "gpio15"), + PINCTRL_PIN(16, "gpio16"), + PINCTRL_PIN(17, "gpio17"), + PINCTRL_PIN(18, "gpio18"), + PINCTRL_PIN(19, "gpio19"), + PINCTRL_PIN(20, "gpio20"), + PINCTRL_PIN(21, "gpio21"), + PINCTRL_PIN(22, "gpio22"), + PINCTRL_PIN(23, "gpio23"), + PINCTRL_PIN(24, "gpio24"), + PINCTRL_PIN(25, "gpio25"), + PINCTRL_PIN(26, "gpio26"), + PINCTRL_PIN(27, "gpio27"), + PINCTRL_PIN(28, "gpio28"), + PINCTRL_PIN(29, "gpio29"), + PINCTRL_PIN(30, "gpio30"), + PINCTRL_PIN(31, "gpio31"), +}; + +static const char * const comp_rx_groups[] = { "gpio22", "gpio24" }; +static const char * const dmic1_clk_groups[] = { "gpio26" }; +static const char * const dmic1_data_groups[] = { "gpio27" }; +static const char * const dmic2_clk_groups[] = { "gpio28" }; +static const char * const dmic2_data_groups[] = { "gpio29" }; +static const char * const mclk0_groups[] = { "gpio18" }; +static const char * const pdm_tx_groups[] = { "gpio20" }; +static const char * const pdm_clk_groups[] = { "gpio18" }; +static const char * const pdm_rx_groups[] = { "gpio21", "gpio23", "gpio25" }; +static const char * const pdm_sync_groups[] = { "gpio19" }; + +const struct lpi_pingroup sdm660_lpi_pinctrl_groups[] = { + LPI_PINGROUP_OFFSET(0, LPI_NO_SLEW, _, _, _, _, 0x0000), + LPI_PINGROUP_OFFSET(1, LPI_NO_SLEW, _, _, _, _, 0x1000), + LPI_PINGROUP_OFFSET(2, LPI_NO_SLEW, _, _, _, _, 0x2000), + LPI_PINGROUP_OFFSET(3, LPI_NO_SLEW, _, 
_, _, _, 0x2010), + LPI_PINGROUP_OFFSET(4, LPI_NO_SLEW, _, _, _, _, 0x3000), + LPI_PINGROUP_OFFSET(5, LPI_NO_SLEW, _, _, _, _, 0x3010), + LPI_PINGROUP_OFFSET(6, LPI_NO_SLEW, _, _, _, _, 0x4000), + LPI_PINGROUP_OFFSET(7, LPI_NO_SLEW, _, _, _, _, 0x4010), + LPI_PINGROUP_OFFSET(8, LPI_NO_SLEW, _, _, _, _, 0x5000), + LPI_PINGROUP_OFFSET(9, LPI_NO_SLEW, _, _, _, _, 0x5010), + LPI_PINGROUP_OFFSET(10, LPI_NO_SLEW, _, _, _, _, 0x5020), + LPI_PINGROUP_OFFSET(11, LPI_NO_SLEW, _, _, _, _, 0x5030), + LPI_PINGROUP_OFFSET(12, LPI_NO_SLEW, _, _, _, _, 0x6000), + LPI_PINGROUP_OFFSET(13, LPI_NO_SLEW, _, _, _, _, 0x6010), + LPI_PINGROUP_OFFSET(14, LPI_NO_SLEW, _, _, _, _, 0x7000), + LPI_PINGROUP_OFFSET(15, LPI_NO_SLEW, _, _, _, _, 0x7010), + LPI_PINGROUP_OFFSET(16, LPI_NO_SLEW, _, _, _, _, 0x5040), + LPI_PINGROUP_OFFSET(17, LPI_NO_SLEW, _, _, _, _, 0x5050), + + LPI_PINGROUP_OFFSET(18, LPI_NO_SLEW, pdm_clk, mclk0, _, _, 0x8000), + LPI_PINGROUP_OFFSET(19, LPI_NO_SLEW, pdm_sync, _, _, _, 0x8010), + LPI_PINGROUP_OFFSET(20, LPI_NO_SLEW, pdm_tx, _, _, _, 0x8020), + LPI_PINGROUP_OFFSET(21, LPI_NO_SLEW, pdm_rx, _, _, _, 0x8030), + LPI_PINGROUP_OFFSET(22, LPI_NO_SLEW, comp_rx, _, _, _, 0x8040), + LPI_PINGROUP_OFFSET(23, LPI_NO_SLEW, pdm_rx, _, _, _, 0x8050), + LPI_PINGROUP_OFFSET(24, LPI_NO_SLEW, comp_rx, _, _, _, 0x8060), + LPI_PINGROUP_OFFSET(25, LPI_NO_SLEW, pdm_rx, _, _, _, 0x8070), + LPI_PINGROUP_OFFSET(26, LPI_NO_SLEW, dmic1_clk, _, _, _, 0x9000), + LPI_PINGROUP_OFFSET(27, LPI_NO_SLEW, dmic1_data, _, _, _, 0x9010), + LPI_PINGROUP_OFFSET(28, LPI_NO_SLEW, dmic2_clk, _, _, _, 0xa000), + LPI_PINGROUP_OFFSET(29, LPI_NO_SLEW, dmic2_data, _, _, _, 0xa010), + + LPI_PINGROUP_OFFSET(30, LPI_NO_SLEW, _, _, _, _, 0xb000), + LPI_PINGROUP_OFFSET(31, LPI_NO_SLEW, _, _, _, _, 0xb010), +}; + +const struct lpi_function sdm660_lpi_pinctrl_functions[] = { + LPI_FUNCTION(comp_rx), + LPI_FUNCTION(dmic1_clk), + LPI_FUNCTION(dmic1_data), + LPI_FUNCTION(dmic2_clk), + LPI_FUNCTION(dmic2_data), + LPI_FUNCTION(mclk0), + LPI_FUNCTION(pdm_tx), + LPI_FUNCTION(pdm_clk), + LPI_FUNCTION(pdm_rx), + LPI_FUNCTION(pdm_sync), +}; + +static const struct lpi_pinctrl_variant_data sdm660_lpi_pinctrl_data = { + .pins = sdm660_lpi_pinctrl_pins, + .npins = ARRAY_SIZE(sdm660_lpi_pinctrl_pins), + .groups = sdm660_lpi_pinctrl_groups, + .ngroups = ARRAY_SIZE(sdm660_lpi_pinctrl_groups), + .functions = sdm660_lpi_pinctrl_functions, + .nfunctions = ARRAY_SIZE(sdm660_lpi_pinctrl_functions), + .flags = LPI_FLAG_SLEW_RATE_SAME_REG | LPI_FLAG_USE_PREDEFINED_PIN_OFFSET +}; + +static const struct of_device_id sdm660_lpi_pinctrl_of_match[] = { + { + .compatible = "qcom,sdm660-lpass-lpi-pinctrl", + .data = &sdm660_lpi_pinctrl_data, + }, + { } +}; +MODULE_DEVICE_TABLE(of, sdm660_lpi_pinctrl_of_match); + +static struct platform_driver sdm660_lpi_pinctrl_driver = { + .driver = { + .name = "qcom-sdm660-lpass-lpi-pinctrl", + .of_match_table = sdm660_lpi_pinctrl_of_match, + }, + .probe = lpi_pinctrl_probe, + .remove = lpi_pinctrl_remove, +}; +module_platform_driver(sdm660_lpi_pinctrl_driver); + +MODULE_AUTHOR("Richard Acayan "); +MODULE_DESCRIPTION("QTI SDM660 LPI GPIO pin control driver"); +MODULE_LICENSE("GPL"); From 87b90cee22d8658a69c0fbd43633839b75f8f05f Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 2 Sep 2025 21:02:22 +0200 Subject: [PATCH 1373/3206] MAINTAINERS: drm-misc: fix X: entries for nova/nouveau Nouveau patches usually flow through the drm-misc tree, while nova (and nova-core) are maintained through a dedicated driver tree and soon through 
drm-rust. Hence, fix up the corresponding X: entries to list nova instead of nouveau.

Reported-by: Maxime Ripard
Closes: https://lore.kernel.org/dri-devel/enuksb2qk5wyrilz3l2vnog45lghgmplrav5to6pd5k5owi36h@pxdq6y5dpgpt/
Acked-by: Maxime Ripard
Link: https://lore.kernel.org/r/20250902190247.435340-1-dakr@kernel.org
Signed-off-by: Danilo Krummrich
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index f94e115a8d32b0..ed7e06ca1d05be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8091,7 +8091,7 @@ X: drivers/gpu/drm/i915/
 X: drivers/gpu/drm/kmb/
 X: drivers/gpu/drm/mediatek/
 X: drivers/gpu/drm/msm/
-X: drivers/gpu/drm/nouveau/
+X: drivers/gpu/drm/nova/
 X: drivers/gpu/drm/radeon/
 X: drivers/gpu/drm/tegra/
 X: drivers/gpu/drm/xe/

From a3967baad4d533dc254c31e0d221e51c8d223d58 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Tue, 9 Sep 2025 23:26:12 +0000
Subject: [PATCH 1374/3206] tcp_bpf: Call sk_msg_free() when tcp_bpf_send_verdict() fails to allocate psock->cork.

syzbot reported the splat below. [0]

The repro does the following:

  1. Load a sk_msg prog that calls bpf_msg_cork_bytes(msg, cork_bytes)
  2. Attach the prog to a SOCKMAP
  3. Add a socket to the SOCKMAP
  4. Activate fault injection
  5. Send data less than cork_bytes

At 5., the data is carried over to the next sendmsg() as it is smaller
than the cork_bytes specified by bpf_msg_cork_bytes().

Then, tcp_bpf_send_verdict() tries to allocate psock->cork to hold the
data, but this fails silently due to fault injection + __GFP_NOWARN.

If the allocation fails, we need to revert the sk->sk_forward_alloc
change done by sk_msg_alloc(). Let's call sk_msg_free() when
tcp_bpf_send_verdict() fails to allocate psock->cork.

"*copied" also needs to be updated so that a proper error can be
returned to the caller of sendmsg() when the allocation of psock->cork
fails. Nothing has been corked so far, so this patch simply sets
"*copied" to 0.
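In outline, the fixed error path in tcp_bpf_send_verdict() behaves like
the condensed sketch below (surrounding function elided; the full hunk
follows after the splat):

	if (!psock->cork) {
		psock->cork = kzalloc(sizeof(*psock->cork),
				      GFP_ATOMIC | __GFP_NOWARN);
		if (!psock->cork) {
			/* Undo the sk_forward_alloc charge taken by
			 * sk_msg_alloc() and report zero progress,
			 * since nothing has been corked yet.
			 */
			sk_msg_free(sk, msg);
			*copied = 0;
			return -ENOMEM;
		}
	}
	memcpy(psock->cork, msg, sizeof(*msg));
	return 0;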
[0]: WARNING: net/ipv4/af_inet.c:156 at inet_sock_destruct+0x623/0x730 net/ipv4/af_inet.c:156, CPU#1: syz-executor/5983 Modules linked in: CPU: 1 UID: 0 PID: 5983 Comm: syz-executor Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:inet_sock_destruct+0x623/0x730 net/ipv4/af_inet.c:156 Code: 0f 0b 90 e9 62 fe ff ff e8 7a db b5 f7 90 0f 0b 90 e9 95 fe ff ff e8 6c db b5 f7 90 0f 0b 90 e9 bb fe ff ff e8 5e db b5 f7 90 <0f> 0b 90 e9 e1 fe ff ff 89 f9 80 e1 07 80 c1 03 38 c1 0f 8c 9f fc RSP: 0018:ffffc90000a08b48 EFLAGS: 00010246 RAX: ffffffff8a09d0b2 RBX: dffffc0000000000 RCX: ffff888024a23c80 RDX: 0000000000000100 RSI: 0000000000000fff RDI: 0000000000000000 RBP: 0000000000000fff R08: ffff88807e07c627 R09: 1ffff1100fc0f8c4 R10: dffffc0000000000 R11: ffffed100fc0f8c5 R12: ffff88807e07c380 R13: dffffc0000000000 R14: ffff88807e07c60c R15: 1ffff1100fc0f872 FS: 00005555604c4500(0000) GS:ffff888125af1000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005555604df5c8 CR3: 0000000032b06000 CR4: 00000000003526f0 Call Trace: __sk_destruct+0x86/0x660 net/core/sock.c:2339 rcu_do_batch kernel/rcu/tree.c:2605 [inline] rcu_core+0xca8/0x1770 kernel/rcu/tree.c:2861 handle_softirqs+0x286/0x870 kernel/softirq.c:579 __do_softirq kernel/softirq.c:613 [inline] invoke_softirq kernel/softirq.c:453 [inline] __irq_exit_rcu+0xca/0x1f0 kernel/softirq.c:680 irq_exit_rcu+0x9/0x30 kernel/softirq.c:696 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1052 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1052 Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Reported-by: syzbot+4cabd1d2fa917a456db8@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68c0b6b5.050a0220.3c6139.0013.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250909232623.4151337-1-kuniyu@google.com --- net/ipv4/tcp_bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ba581785adb4b3..a268e1595b22aa 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -408,8 +408,11 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, if (!psock->cork) { psock->cork = kzalloc(sizeof(*psock->cork), GFP_ATOMIC | __GFP_NOWARN); - if (!psock->cork) + if (!psock->cork) { + sk_msg_free(sk, msg); + *copied = 0; return -ENOMEM; + } } memcpy(psock->cork, msg, sizeof(*msg)); return 0; From d013ebc3499fd87cb9dee1dafd0c58aeb05c27c1 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 10 Sep 2025 16:56:06 +0200 Subject: [PATCH 1375/3206] selftests: can: enable CONFIG_CAN_VCAN as a module A proper kernel configuration for running kselftest can be obtained with: $ yes | make kselftest-merge Build of 'vcan' driver is currently missing, while the other required knobs are already there because of net/link_netns.py [1]. Add a config file in selftests/net/can to store the minimum set of kconfig needed for CAN selftests. 
[1] https://patch.msgid.link/20250219125039.18024-14-shaw.leon@gmail.com Fixes: 77442ffa83e8 ("selftests: can: Import tst-filter from can-tests") Reviewed-by: Vincent Mailhol Signed-off-by: Davide Caratti Link: https://patch.msgid.link/fa4c0ea262ec529f25e5f5aa9269d84764c67321.1757516009.git.dcaratti@redhat.com Signed-off-by: Marc Kleine-Budde --- tools/testing/selftests/net/can/config | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/testing/selftests/net/can/config diff --git a/tools/testing/selftests/net/can/config b/tools/testing/selftests/net/can/config new file mode 100644 index 00000000000000..188f7979667097 --- /dev/null +++ b/tools/testing/selftests/net/can/config @@ -0,0 +1,3 @@ +CONFIG_CAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_VCAN=m From 7fcbe5b2c6a4b5407bf2241fdb71e0a390f6ab9a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 25 Aug 2025 23:07:24 +0900 Subject: [PATCH 1376/3206] can: j1939: implement NETDEV_UNREGISTER notification handler syzbot is reporting unregister_netdevice: waiting for vcan0 to become free. Usage count = 2 problem, for j1939 protocol did not have NETDEV_UNREGISTER notification handler for undoing changes made by j1939_sk_bind(). Commit 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") expects that a call to j1939_priv_put() can be unconditionally delayed until j1939_sk_sock_destruct() is called. But we need to call j1939_priv_put() against an extra ref held by j1939_sk_bind() call (as a part of undoing changes made by j1939_sk_bind()) as soon as NETDEV_UNREGISTER notification fires (i.e. before j1939_sk_sock_destruct() is called via j1939_sk_release()). Otherwise, the extra ref on "struct j1939_priv" held by j1939_sk_bind() call prevents "struct net_device" from dropping the usage count to 1; making it impossible for unregister_netdevice() to continue. 
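The shape of the fix, condensed from the diff below (the locking and
the rescan loop inside j1939_sk_netdev_event_unregister() are elided
here; see the full hunks for the details):

	/* net/can/j1939/main.c: the notifier now also reacts to
	 * NETDEV_UNREGISTER
	 */
	static int j1939_netdev_notify(struct notifier_block *nb,
				       unsigned long msg, void *data)
	{
		...
		case NETDEV_UNREGISTER:
			/* Drop the extra "priv" reference taken by
			 * j1939_sk_bind() now, instead of waiting for
			 * j1939_sk_sock_destruct(), so the netdev usage
			 * count can drop and unregistration can finish.
			 */
			j1939_sk_netdev_event_unregister(priv);
			break;
		...
	}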
Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=881d65229ca4f9ae8c84 Tested-by: syzbot Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Fixes: 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/ac9db9a4-6c30-416e-8b94-96e6559d55b2@I-love.SAKURA.ne.jp [mkl: remove space in front of label] Signed-off-by: Marc Kleine-Budde --- net/can/j1939/j1939-priv.h | 1 + net/can/j1939/main.c | 3 +++ net/can/j1939/socket.c | 49 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 31a93cae5111b5..81f58924b4acd7 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -212,6 +212,7 @@ void j1939_priv_get(struct j1939_priv *priv); /* notify/alert all j1939 sockets bound to ifindex */ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv); +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv); int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk); void j1939_tp_init(struct j1939_priv *priv); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 7e8a20f2fc42b5..3706a872ecafdb 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -377,6 +377,9 @@ static int j1939_netdev_notify(struct notifier_block *nb, j1939_sk_netdev_event_netdown(priv); j1939_ecu_unmap_all(priv); break; + case NETDEV_UNREGISTER: + j1939_sk_netdev_event_unregister(priv); + break; } j1939_priv_put(priv); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 3d8b588822f9d2..70ebc861ea2a93 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -1300,6 +1300,55 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) read_unlock_bh(&priv->j1939_socks_lock); } +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) +{ + struct sock *sk; + struct j1939_sock *jsk; + bool wait_rcu = false; + +rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ + read_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + /* Skip if j1939_jsk_add() is not called on this socket. */ + if (!(jsk->state & J1939_SOCK_BOUND)) + continue; + sk = &jsk->sk; + sock_hold(sk); + read_unlock_bh(&priv->j1939_socks_lock); + /* Check if j1939_jsk_del() is not yet called on this socket after holding + * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call + * j1939_jsk_del() with socket's lock held. + */ + lock_sock(sk); + if (jsk->state & J1939_SOCK_BOUND) { + /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del(). + * Make this socket no longer bound, by pretending as if j1939_sk_bind() + * dropped old references but did not get new references. + */ + j1939_jsk_del(priv, jsk); + j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); + j1939_netdev_stop(priv); + /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from + * calling the corresponding j1939_priv_put(). + * + * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after + * an RCU grace period. But since the caller is holding a ref on this + * "priv", we can defer synchronize_rcu() until immediately before + * the caller calls j1939_priv_put(). 
+ */ + j1939_priv_put(priv); + jsk->priv = NULL; + wait_rcu = true; + } + release_sock(sk); + sock_put(sk); + goto rescan; + } + read_unlock_bh(&priv->j1939_socks_lock); + if (wait_rcu) + synchronize_rcu(); +} + static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { From f214744c8a27c3c1da6b538c232da22cd027530e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 24 Aug 2025 19:30:09 +0900 Subject: [PATCH 1377/3206] can: j1939: j1939_sk_bind(): call j1939_priv_put() immediately when j1939_local_ecu_get() failed Commit 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") expects that a call to j1939_priv_put() can be unconditionally delayed until j1939_sk_sock_destruct() is called. But a refcount leak will happen when j1939_sk_bind() is called again after j1939_local_ecu_get() from previous j1939_sk_bind() call returned an error. We need to call j1939_priv_put() before j1939_sk_bind() returns an error. Fixes: 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/4f49a1bc-a528-42ad-86c0-187268ab6535@I-love.SAKURA.ne.jp Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 70ebc861ea2a93..88e7160d424896 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -521,6 +521,9 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); + jsk->priv = NULL; + synchronize_rcu(); + j1939_priv_put(priv); goto out_release_sock; } From 06e02da29f6f1a45fc07bd60c7eaf172dc21e334 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 24 Aug 2025 19:27:40 +0900 Subject: [PATCH 1378/3206] can: j1939: j1939_local_ecu_get(): undo increment when j1939_local_ecu_get() fails Since j1939_sk_bind() and j1939_sk_release() call j1939_local_ecu_put() when J1939_SOCK_BOUND was already set, but the error handling path for j1939_sk_bind() will not set J1939_SOCK_BOUND when j1939_local_ecu_get() fails, j1939_local_ecu_get() needs to undo priv->ents[sa].nusers++ when j1939_local_ecu_get() returns an error. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/e7f80046-4ff7-4ce2-8ad8-7c3c678a42c9@I-love.SAKURA.ne.jp Signed-off-by: Marc Kleine-Budde --- net/can/j1939/bus.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 39844f14eed862..797719cb227ec5 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -290,8 +290,11 @@ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa) if (!ecu) ecu = j1939_ecu_create_locked(priv, name); err = PTR_ERR_OR_ZERO(ecu); - if (err) + if (err) { + if (j1939_address_is_unicast(sa)) + priv->ents[sa].nusers--; goto done; + } ecu->nusers++; /* TODO: do we care if ecu->addr != sa? */ From ef79f00be72bd81d2e1e6f060d83cf7e425deee4 Mon Sep 17 00:00:00 2001 From: Anssi Hannula Date: Fri, 22 Aug 2025 12:50:02 +0300 Subject: [PATCH 1379/3206] can: xilinx_can: xcan_write_frame(): fix use-after-free of transmitted SKB can_put_echo_skb() takes ownership of the SKB and it may be freed during or after the call. 
However, xilinx_can xcan_write_frame() keeps using SKB after the call. Fix that by only calling can_put_echo_skb() after the code is done touching the SKB. The tx_lock is held for the entire xcan_write_frame() execution and also on the can_get_echo_skb() side so the order of operations does not matter. An earlier fix commit 3d3c817c3a40 ("can: xilinx_can: Fix usage of skb memory") did not move the can_put_echo_skb() call far enough. Signed-off-by: Anssi Hannula Fixes: 1598efe57b3e ("can: xilinx_can: refactor code in preparation for CAN FD support") Link: https://patch.msgid.link/20250822095002.168389-1-anssi.hannula@bitwise.fi [mkl: add "commit" in front of sha1 in patch description] [mkl: fix indention] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/xilinx_can.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 81baec8eb1e5da..a25a3ca62c12e3 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -690,14 +690,6 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb, dlc |= XCAN_DLCR_EDL_MASK; } - if (!(priv->devtype.flags & XCAN_FLAG_TX_MAILBOXES) && - (priv->devtype.flags & XCAN_FLAG_TXFEMP)) - can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max, 0); - else - can_put_echo_skb(skb, ndev, 0, 0); - - priv->tx_head++; - priv->write_reg(priv, XCAN_FRAME_ID_OFFSET(frame_offset), id); /* If the CAN frame is RTR frame this write triggers transmission * (not on CAN FD) @@ -730,6 +722,14 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb, data[1]); } } + + if (!(priv->devtype.flags & XCAN_FLAG_TX_MAILBOXES) && + (priv->devtype.flags & XCAN_FLAG_TXFEMP)) + can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max, 0); + else + can_put_echo_skb(skb, ndev, 0, 0); + + priv->tx_head++; } /** From 5c793afa07da6d2d4595f6c73a2a543a471bb055 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 14 Aug 2025 13:26:37 +0200 Subject: [PATCH 1380/3206] can: rcar_can: rcar_can_resume(): fix s2ram with PSCI On R-Car Gen3 using PSCI, s2ram powers down the SoC. After resume, the CAN interface no longer works, until it is brought down and up again. Fix this by calling rcar_can_start() from the PM resume callback, to fully initialize the controller instead of just restarting it. 
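With the change, the resume path reduces to the following shape
(condensed from the diff below; the clock error handling above this
hunk is unchanged and elided here):

	static int rcar_can_resume(struct device *dev)
	{
		struct net_device *ndev = dev_get_drvdata(dev);
		...
		/* The SoC may have been powered down across s2ram, so
		 * run the full controller start sequence instead of
		 * only clearing the SLPM and CANM bits.
		 */
		rcar_can_start(ndev);

		netif_device_attach(ndev);
		netif_start_queue(ndev);
		...
	}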
Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/699b2f7fcb60b31b6f976a37f08ce99c5ffccb31.1755165227.git.geert+renesas@glider.be Signed-off-by: Marc Kleine-Budde --- drivers/net/can/rcar/rcar_can.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index 64e664f5adcc4a..87c134bcd48db5 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -861,7 +861,6 @@ static int rcar_can_resume(struct device *dev) { struct net_device *ndev = dev_get_drvdata(dev); struct rcar_can_priv *priv = netdev_priv(ndev); - u16 ctlr; int err; if (!netif_running(ndev)) @@ -873,12 +872,7 @@ static int rcar_can_resume(struct device *dev) return err; } - ctlr = readw(&priv->regs->ctlr); - ctlr &= ~RCAR_CAN_CTLR_SLPM; - writew(ctlr, &priv->regs->ctlr); - ctlr &= ~RCAR_CAN_CTLR_CANM; - writew(ctlr, &priv->regs->ctlr); - priv->can.state = CAN_STATE_ERROR_ACTIVE; + rcar_can_start(ndev); netif_device_attach(ndev); netif_start_queue(ndev); From 8d73829b78ca1a0e6eb93380f3bf5193d58c281c Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 10 Sep 2025 17:19:28 +0200 Subject: [PATCH 1381/3206] x86/startup/sev: Document the CPUID flow in the boot #VC handler Document the CPUID reading the different SEV guest types do - the SNP one which relies on the presence of a CPUID table and the SEV-ES one, which reads the CPUID supplied by the hypervisor. The intent being to clarify the two back-to-back, similar CPUID invocations. No functional changes. [ bp: Turn into a proper patch. ] Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/fbb24767-0e06-d1d6-36e0-1757d98aca66@amd.com --- arch/x86/boot/startup/sev-shared.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 08cc1568d8af27..4e22ffd735168b 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -458,6 +458,13 @@ void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) leaf.fn = fn; leaf.subfn = subfn; + /* + * If SNP is active, then snp_cpuid() uses the CPUID table to obtain the + * CPUID values (with possible HV interaction during post-processing of + * the values). But if SNP is not active (no CPUID table present), then + * snp_cpuid() returns -EOPNOTSUPP so that an SEV-ES guest can call the + * HV to obtain the CPUID information. + */ ret = snp_cpuid(snp_cpuid_hv_msr, NULL, &leaf); if (!ret) goto cpuid_done; @@ -465,6 +472,10 @@ void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) if (ret != -EOPNOTSUPP) goto fail; + /* + * This is reached by a SEV-ES guest and needs to invoke the HV for + * the CPUID data. + */ if (__sev_cpuid_hv_msr(&leaf)) goto fail; From 25fbbaf515acd13399589bd5ee6de5f35740cef2 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Sun, 31 Aug 2025 01:08:56 +0800 Subject: [PATCH 1382/3206] clk: sunxi-ng: mp: Fix dual-divider clock rate readback When dual-divider clock support was introduced, the P divider offset was left out of the .recalc_rate readback function. This causes the clock rate to become bogus or even zero (possibly due to the P divider being 1, leading to a divide-by-zero). Fix this by incorporating the P divider offset into the calculation. 
Fixes: 45717804b75e ("clk: sunxi-ng: mp: introduce dual-divider clock") Reviewed-by: Andre Przywara Link: https://patch.msgid.link/20250830170901.1996227-4-wens@kernel.org Signed-off-by: Chen-Yu Tsai --- drivers/clk/sunxi-ng/ccu_mp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/sunxi-ng/ccu_mp.c b/drivers/clk/sunxi-ng/ccu_mp.c index 354c981943b6f8..4221b1888b38da 100644 --- a/drivers/clk/sunxi-ng/ccu_mp.c +++ b/drivers/clk/sunxi-ng/ccu_mp.c @@ -185,7 +185,7 @@ static unsigned long ccu_mp_recalc_rate(struct clk_hw *hw, p &= (1 << cmp->p.width) - 1; if (cmp->common.features & CCU_FEATURE_DUAL_DIV) - rate = (parent_rate / p) / m; + rate = (parent_rate / (p + cmp->p.offset)) / m; else rate = (parent_rate >> p) / m; From 7f201ca18c825592e392596a2fca2374dd2a4dfe Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Thu, 4 Sep 2025 21:13:52 +0000 Subject: [PATCH 1383/3206] rust: debugfs: Add initial support for directories Adds a `debugfs::Dir` type that can be used to create and remove DebugFS directories. The `Dir` handle automatically cleans up the directory on `Drop`. Signed-off-by: Matthew Maurer Tested-by: Dirk Behme Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250904-debugfs-rust-v11-1-7d12a165685a@google.com Signed-off-by: Danilo Krummrich --- MAINTAINERS | 2 + rust/bindings/bindings_helper.h | 1 + rust/kernel/debugfs.rs | 82 +++++++++++++++++++++++++++++++++ rust/kernel/debugfs/entry.rs | 61 ++++++++++++++++++++++++ rust/kernel/lib.rs | 1 + 5 files changed, 147 insertions(+) create mode 100644 rust/kernel/debugfs.rs create mode 100644 rust/kernel/debugfs/entry.rs diff --git a/MAINTAINERS b/MAINTAINERS index b4528c1ce4453b..4dc5d124bb14c0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7487,6 +7487,8 @@ F: include/linux/kobj* F: include/linux/property.h F: include/linux/sysfs.h F: lib/kobj* +F: rust/kernel/debugfs.rs +F: rust/kernel/debugfs/ F: rust/kernel/device.rs F: rust/kernel/device/ F: rust/kernel/device_id.rs diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 69a975da829f0c..18bd6ab1fd1592 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include diff --git a/rust/kernel/debugfs.rs b/rust/kernel/debugfs.rs new file mode 100644 index 00000000000000..65be71600b8eda --- /dev/null +++ b/rust/kernel/debugfs.rs @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Google LLC. + +//! DebugFS Abstraction +//! +//! C header: [`include/linux/debugfs.h`](srctree/include/linux/debugfs.h) + +// When DebugFS is disabled, many parameters are dead. Linting for this isn't helpful. +#![cfg_attr(not(CONFIG_DEBUG_FS), allow(unused_variables))] + +#[cfg(CONFIG_DEBUG_FS)] +use crate::prelude::*; +use crate::str::CStr; +#[cfg(CONFIG_DEBUG_FS)] +use crate::sync::Arc; + +#[cfg(CONFIG_DEBUG_FS)] +mod entry; +#[cfg(CONFIG_DEBUG_FS)] +use entry::Entry; + +/// Owning handle to a DebugFS directory. +/// +/// The directory in the filesystem represented by [`Dir`] will be removed when handle has been +/// dropped *and* all children have been removed. +// If we have a parent, we hold a reference to it in the `Entry`. This prevents the `dentry` +// we point to from being cleaned up if our parent `Dir`/`Entry` is dropped before us. +// +// The `None` option indicates that the `Arc` could not be allocated, so our children would not be +// able to refer to us. In this case, we need to silently fail. 
All future child directories/files +// will silently fail as well. +#[derive(Clone)] +pub struct Dir(#[cfg(CONFIG_DEBUG_FS)] Option>); + +impl Dir { + /// Create a new directory in DebugFS. If `parent` is [`None`], it will be created at the root. + fn create(name: &CStr, parent: Option<&Dir>) -> Self { + #[cfg(CONFIG_DEBUG_FS)] + { + let parent_entry = match parent { + // If the parent couldn't be allocated, just early-return + Some(Dir(None)) => return Self(None), + Some(Dir(Some(entry))) => Some(entry.clone()), + None => None, + }; + Self( + // If Arc creation fails, the `Entry` will be dropped, so the directory will be + // cleaned up. + Arc::new(Entry::dynamic_dir(name, parent_entry), GFP_KERNEL).ok(), + ) + } + #[cfg(not(CONFIG_DEBUG_FS))] + Self() + } + + /// Create a new directory in DebugFS at the root. + /// + /// # Examples + /// + /// ``` + /// # use kernel::c_str; + /// # use kernel::debugfs::Dir; + /// let debugfs = Dir::new(c_str!("parent")); + /// ``` + pub fn new(name: &CStr) -> Self { + Dir::create(name, None) + } + + /// Creates a subdirectory within this directory. + /// + /// # Examples + /// + /// ``` + /// # use kernel::c_str; + /// # use kernel::debugfs::Dir; + /// let parent = Dir::new(c_str!("parent")); + /// let child = parent.subdir(c_str!("child")); + /// ``` + pub fn subdir(&self, name: &CStr) -> Self { + Dir::create(name, Some(self)) + } +} diff --git a/rust/kernel/debugfs/entry.rs b/rust/kernel/debugfs/entry.rs new file mode 100644 index 00000000000000..d2fba0e65e20e9 --- /dev/null +++ b/rust/kernel/debugfs/entry.rs @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Google LLC. + +use crate::str::CStr; +use crate::sync::Arc; + +/// Owning handle to a DebugFS entry. +/// +/// # Invariants +/// +/// The wrapped pointer will always be `NULL`, an error, or an owned DebugFS `dentry`. +pub(crate) struct Entry { + entry: *mut bindings::dentry, + // If we were created with an owning parent, this is the keep-alive + _parent: Option>, +} + +// SAFETY: [`Entry`] is just a `dentry` under the hood, which the API promises can be transferred +// between threads. +unsafe impl Send for Entry {} + +// SAFETY: All the C functions we call on the `dentry` pointer are threadsafe. +unsafe impl Sync for Entry {} + +impl Entry { + pub(crate) fn dynamic_dir(name: &CStr, parent: Option>) -> Self { + let parent_ptr = match &parent { + Some(entry) => entry.as_ptr(), + None => core::ptr::null_mut(), + }; + // SAFETY: The invariants of this function's arguments ensure the safety of this call. + // * `name` is a valid C string by the invariants of `&CStr`. + // * `parent_ptr` is either `NULL` (if `parent` is `None`), or a pointer to a valid + // `dentry` by our invariant. `debugfs_create_dir` handles `NULL` pointers correctly. + let entry = unsafe { bindings::debugfs_create_dir(name.as_char_ptr(), parent_ptr) }; + + Entry { + entry, + _parent: parent, + } + } + + /// Returns the pointer representation of the DebugFS directory. + /// + /// # Guarantees + /// + /// Due to the type invariant, the value returned from this function will always be an error + /// code, NULL, or a live DebugFS directory. If it is live, it will remain live at least as + /// long as this entry lives. + pub(crate) fn as_ptr(&self) -> *mut bindings::dentry { + self.entry + } +} + +impl Drop for Entry { + fn drop(&mut self) { + // SAFETY: `debugfs_remove` can take `NULL`, error values, and legal DebugFS dentries. + // `as_ptr` guarantees that the pointer is of this form. 
+ unsafe { bindings::debugfs_remove(self.as_ptr()) } + } +} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 953a6cba501cbe..8a7c62951a1dec 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -76,6 +76,7 @@ pub mod cpu; pub mod cpufreq; pub mod cpumask; pub mod cred; +pub mod debugfs; pub mod device; pub mod device_id; pub mod devres; From 5e40b591cb46c0379d5406fa5548c9b2a3801353 Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Thu, 4 Sep 2025 21:13:53 +0000 Subject: [PATCH 1384/3206] rust: debugfs: Add support for read-only files Extends the `debugfs` API to support creating read-only files. This is done via the `Dir::read_only_file` method, which takes a data object that implements the `Writer` trait. The file's content is generated by the `Writer` implementation, and the file is automatically removed when the returned `File` handle is dropped. Signed-off-by: Matthew Maurer Tested-by: Dirk Behme Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250904-debugfs-rust-v11-2-7d12a165685a@google.com [ Fixup build failure when CONFIG_DEBUGFS=n. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/debugfs.rs | 148 +++++++++++++++++++++++++++++++- rust/kernel/debugfs/entry.rs | 42 +++++++++ rust/kernel/debugfs/file_ops.rs | 128 +++++++++++++++++++++++++++ rust/kernel/debugfs/traits.rs | 33 +++++++ 4 files changed, 350 insertions(+), 1 deletion(-) create mode 100644 rust/kernel/debugfs/file_ops.rs create mode 100644 rust/kernel/debugfs/traits.rs diff --git a/rust/kernel/debugfs.rs b/rust/kernel/debugfs.rs index 65be71600b8eda..988aacdadbfa29 100644 --- a/rust/kernel/debugfs.rs +++ b/rust/kernel/debugfs.rs @@ -8,12 +8,18 @@ // When DebugFS is disabled, many parameters are dead. Linting for this isn't helpful. #![cfg_attr(not(CONFIG_DEBUG_FS), allow(unused_variables))] -#[cfg(CONFIG_DEBUG_FS)] use crate::prelude::*; use crate::str::CStr; #[cfg(CONFIG_DEBUG_FS)] use crate::sync::Arc; +use core::marker::PhantomPinned; +use core::ops::Deref; + +mod traits; +pub use traits::Writer; +mod file_ops; +use file_ops::{FileOps, ReadFile}; #[cfg(CONFIG_DEBUG_FS)] mod entry; #[cfg(CONFIG_DEBUG_FS)] @@ -53,6 +59,34 @@ impl Dir { Self() } + /// Creates a DebugFS file which will own the data produced by the initializer provided in + /// `data`. + fn create_file<'a, T, E: 'a>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + file_ops: &'static FileOps, + ) -> impl PinInit, E> + 'a + where + T: Sync + 'static, + { + let scope = Scope::::new(data, move |data| { + #[cfg(CONFIG_DEBUG_FS)] + if let Some(parent) = &self.0 { + // SAFETY: Because data derives from a scope, and our entry will be dropped before + // the data is dropped, it is guaranteed to outlive the entry we return. + unsafe { Entry::dynamic_file(name, parent.clone(), data, file_ops) } + } else { + Entry::empty() + } + }); + try_pin_init! { + File { + scope <- scope + } ? E + } + } + /// Create a new directory in DebugFS at the root. /// /// # Examples @@ -79,4 +113,116 @@ impl Dir { pub fn subdir(&self, name: &CStr) -> Self { Dir::create(name, Some(self)) } + + /// Creates a read-only file in this directory. + /// + /// The file's contents are produced by invoking [`Writer::write`] on the value initialized by + /// `data`. 
+ /// + /// # Examples + /// + /// ``` + /// # use kernel::c_str; + /// # use kernel::debugfs::Dir; + /// # use kernel::prelude::*; + /// # let dir = Dir::new(c_str!("my_debugfs_dir")); + /// let file = KBox::pin_init(dir.read_only_file(c_str!("foo"), 200), GFP_KERNEL)?; + /// // "my_debugfs_dir/foo" now contains the number 200. + /// // The file is removed when `file` is dropped. + /// # Ok::<(), Error>(()) + /// ``` + pub fn read_only_file<'a, T, E: 'a>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + ) -> impl PinInit, E> + 'a + where + T: Writer + Send + Sync + 'static, + { + let file_ops = &>::FILE_OPS; + self.create_file(name, data, file_ops) + } +} + +#[pin_data] +/// Handle to a DebugFS scope, which ensures that attached `data` will outlive the provided +/// [`Entry`] without moving. +/// Currently, this is used to back [`File`] so that its `read` and/or `write` implementations +/// can assume that their backing data is still alive. +struct Scope { + // This order is load-bearing for drops - `_entry` must be dropped before `data`. + #[cfg(CONFIG_DEBUG_FS)] + _entry: Entry, + #[pin] + data: T, + // Even if `T` is `Unpin`, we still can't allow it to be moved. + #[pin] + _pin: PhantomPinned, +} + +#[pin_data] +/// Handle to a DebugFS file, owning its backing data. +/// +/// When dropped, the DebugFS file will be removed and the attached data will be dropped. +pub struct File { + #[pin] + scope: Scope, +} + +#[cfg(not(CONFIG_DEBUG_FS))] +impl<'b, T: 'b> Scope { + fn new(data: impl PinInit + 'b, init: F) -> impl PinInit + 'b + where + F: for<'a> FnOnce(&'a T) + 'b, + { + try_pin_init! { + Self { + data <- data, + _pin: PhantomPinned + } ? E + } + .pin_chain(|scope| { + init(&scope.data); + Ok(()) + }) + } +} + +#[cfg(CONFIG_DEBUG_FS)] +impl<'b, T: 'b> Scope { + fn entry_mut(self: Pin<&mut Self>) -> &mut Entry { + // SAFETY: _entry is not structurally pinned. + unsafe { &mut Pin::into_inner_unchecked(self)._entry } + } + + fn new(data: impl PinInit + 'b, init: F) -> impl PinInit + 'b + where + F: for<'a> FnOnce(&'a T) -> Entry + 'b, + { + try_pin_init! { + Self { + _entry: Entry::empty(), + data <- data, + _pin: PhantomPinned + } ? E + } + .pin_chain(|scope| { + *scope.entry_mut() = init(&scope.data); + Ok(()) + }) + } +} + +impl Deref for Scope { + type Target = T; + fn deref(&self) -> &T { + &self.data + } +} + +impl Deref for File { + type Target = T; + fn deref(&self) -> &T { + &self.scope + } } diff --git a/rust/kernel/debugfs/entry.rs b/rust/kernel/debugfs/entry.rs index d2fba0e65e20e9..227fa50b7a79ae 100644 --- a/rust/kernel/debugfs/entry.rs +++ b/rust/kernel/debugfs/entry.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2025 Google LLC. +use crate::debugfs::file_ops::FileOps; +use crate::ffi::c_void; use crate::str::CStr; use crate::sync::Arc; @@ -40,6 +42,46 @@ impl Entry { } } + /// # Safety + /// + /// * `data` must outlive the returned `Entry`. + pub(crate) unsafe fn dynamic_file( + name: &CStr, + parent: Arc, + data: &T, + file_ops: &'static FileOps, + ) -> Self { + // SAFETY: The invariants of this function's arguments ensure the safety of this call. + // * `name` is a valid C string by the invariants of `&CStr`. + // * `parent.as_ptr()` is a pointer to a valid `dentry` by invariant. + // * The caller guarantees that `data` will outlive the returned `Entry`. + // * The guarantees on `FileOps` assert the vtable will be compatible with the data we have + // provided. 
+ let entry = unsafe { + bindings::debugfs_create_file_full( + name.as_char_ptr(), + file_ops.mode(), + parent.as_ptr(), + core::ptr::from_ref(data) as *mut c_void, + core::ptr::null(), + &**file_ops, + ) + }; + + Entry { + entry, + _parent: Some(parent), + } + } + + /// Constructs a placeholder DebugFS [`Entry`]. + pub(crate) fn empty() -> Self { + Self { + entry: core::ptr::null_mut(), + _parent: None, + } + } + /// Returns the pointer representation of the DebugFS directory. /// /// # Guarantees diff --git a/rust/kernel/debugfs/file_ops.rs b/rust/kernel/debugfs/file_ops.rs new file mode 100644 index 00000000000000..c2fbef96580eaa --- /dev/null +++ b/rust/kernel/debugfs/file_ops.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Google LLC. + +use super::Writer; +use crate::prelude::*; +use crate::seq_file::SeqFile; +use crate::seq_print; +use core::fmt::{Display, Formatter, Result}; +use core::marker::PhantomData; + +#[cfg(CONFIG_DEBUG_FS)] +use core::ops::Deref; + +/// # Invariant +/// +/// `FileOps` will always contain an `operations` which is safe to use for a file backed +/// off an inode which has a pointer to a `T` in its private data that is safe to convert +/// into a reference. +pub(super) struct FileOps { + #[cfg(CONFIG_DEBUG_FS)] + operations: bindings::file_operations, + #[cfg(CONFIG_DEBUG_FS)] + mode: u16, + _phantom: PhantomData, +} + +impl FileOps { + /// # Safety + /// + /// The caller asserts that the provided `operations` is safe to use for a file whose + /// inode has a pointer to `T` in its private data that is safe to convert into a reference. + const unsafe fn new(operations: bindings::file_operations, mode: u16) -> Self { + Self { + #[cfg(CONFIG_DEBUG_FS)] + operations, + #[cfg(CONFIG_DEBUG_FS)] + mode, + _phantom: PhantomData, + } + } + + #[cfg(CONFIG_DEBUG_FS)] + pub(crate) const fn mode(&self) -> u16 { + self.mode + } +} + +#[cfg(CONFIG_DEBUG_FS)] +impl Deref for FileOps { + type Target = bindings::file_operations; + + fn deref(&self) -> &Self::Target { + &self.operations + } +} + +struct WriterAdapter(T); + +impl<'a, T: Writer> Display for WriterAdapter<&'a T> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + self.0.write(f) + } +} + +/// Implements `open` for `file_operations` via `single_open` to fill out a `seq_file`. +/// +/// # Safety +/// +/// * `inode`'s private pointer must point to a value of type `T` which will outlive the `inode` +/// and will not have any unique references alias it during the call. +/// * `file` must point to a live, not-yet-initialized file object. +unsafe extern "C" fn writer_open( + inode: *mut bindings::inode, + file: *mut bindings::file, +) -> c_int { + // SAFETY: The caller ensures that `inode` is a valid pointer. + let data = unsafe { (*inode).i_private }; + // SAFETY: + // * `file` is acceptable by caller precondition. + // * `print_act` will be called on a `seq_file` with private data set to the third argument, + // so we meet its safety requirements. + // * The `data` pointer passed in the third argument is a valid `T` pointer that outlives + // this call by caller preconditions. + unsafe { bindings::single_open(file, Some(writer_act::), data) } +} + +/// Prints private data stashed in a seq_file to that seq file. +/// +/// # Safety +/// +/// `seq` must point to a live `seq_file` whose private data is a valid pointer to a `T` which may +/// not have any unique references alias it during the call. 
+unsafe extern "C" fn writer_act( + seq: *mut bindings::seq_file, + _: *mut c_void, +) -> c_int { + // SAFETY: By caller precondition, this pointer is valid pointer to a `T`, and + // there are not and will not be any unique references until we are done. + let data = unsafe { &*((*seq).private.cast::()) }; + // SAFETY: By caller precondition, `seq_file` points to a live `seq_file`, so we can lift + // it. + let seq_file = unsafe { SeqFile::from_raw(seq) }; + seq_print!(seq_file, "{}", WriterAdapter(data)); + 0 +} + +// Work around lack of generic const items. +pub(crate) trait ReadFile { + const FILE_OPS: FileOps; +} + +impl ReadFile for T { + const FILE_OPS: FileOps = { + let operations = bindings::file_operations { + read: Some(bindings::seq_read), + llseek: Some(bindings::seq_lseek), + release: Some(bindings::single_release), + open: Some(writer_open::), + // SAFETY: `file_operations` supports zeroes in all fields. + ..unsafe { core::mem::zeroed() } + }; + // SAFETY: `operations` is all stock `seq_file` implementations except for `writer_open`. + // `open`'s only requirement beyond what is provided to all open functions is that the + // inode's data pointer must point to a `T` that will outlive it, which matches the + // `FileOps` requirements. + unsafe { FileOps::new(operations, 0o400) } + }; +} diff --git a/rust/kernel/debugfs/traits.rs b/rust/kernel/debugfs/traits.rs new file mode 100644 index 00000000000000..0e6e461324de42 --- /dev/null +++ b/rust/kernel/debugfs/traits.rs @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Google LLC. + +//! Traits for rendering or updating values exported to DebugFS. + +use crate::sync::Mutex; +use core::fmt::{self, Debug, Formatter}; + +/// A trait for types that can be written into a string. +/// +/// This works very similarly to `Debug`, and is automatically implemented if `Debug` is +/// implemented for a type. It is also implemented for any writable type inside a `Mutex`. +/// +/// The derived implementation of `Debug` [may +/// change](https://doc.rust-lang.org/std/fmt/trait.Debug.html#stability) +/// between Rust versions, so if stability is key for your use case, please implement `Writer` +/// explicitly instead. +pub trait Writer { + /// Formats the value using the given formatter. + fn write(&self, f: &mut Formatter<'_>) -> fmt::Result; +} + +impl Writer for Mutex { + fn write(&self, f: &mut Formatter<'_>) -> fmt::Result { + self.lock().write(f) + } +} + +impl Writer for T { + fn write(&self, f: &mut Formatter<'_>) -> fmt::Result { + writeln!(f, "{self:?}") + } +} From 839dc1d15b9ba5318ed145b20efffcfa91c02a3d Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Thu, 4 Sep 2025 21:13:54 +0000 Subject: [PATCH 1385/3206] rust: debugfs: Add support for writable files Extends the `debugfs` API to support creating writable files. This is done via the `Dir::write_only_file` and `Dir::read_write_file` methods, which take a data object that implements the `Reader` trait. Signed-off-by: Matthew Maurer Tested-by: Dirk Behme Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250904-debugfs-rust-v11-3-7d12a165685a@google.com [ Fix up Result<()> -> Result. 
- Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/debugfs.rs | 37 ++++++++++- rust/kernel/debugfs/file_ops.rs | 113 +++++++++++++++++++++++++++++++- rust/kernel/debugfs/traits.rs | 69 +++++++++++++++++++ 3 files changed, 216 insertions(+), 3 deletions(-) diff --git a/rust/kernel/debugfs.rs b/rust/kernel/debugfs.rs index 988aacdadbfa29..5d77e0cd393ce4 100644 --- a/rust/kernel/debugfs.rs +++ b/rust/kernel/debugfs.rs @@ -16,10 +16,10 @@ use core::marker::PhantomPinned; use core::ops::Deref; mod traits; -pub use traits::Writer; +pub use traits::{Reader, Writer}; mod file_ops; -use file_ops::{FileOps, ReadFile}; +use file_ops::{FileOps, ReadFile, ReadWriteFile, WriteFile}; #[cfg(CONFIG_DEBUG_FS)] mod entry; #[cfg(CONFIG_DEBUG_FS)] @@ -142,6 +142,39 @@ impl Dir { let file_ops = &>::FILE_OPS; self.create_file(name, data, file_ops) } + + /// Creates a read-write file in this directory. + /// + /// Reading the file uses the [`Writer`] implementation. + /// Writing to the file uses the [`Reader`] implementation. + pub fn read_write_file<'a, T, E: 'a>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + ) -> impl PinInit, E> + 'a + where + T: Writer + Reader + Send + Sync + 'static, + { + let file_ops = &>::FILE_OPS; + self.create_file(name, data, file_ops) + } + + /// Creates a write-only file in this directory. + /// + /// The file owns its backing data. Writing to the file uses the [`Reader`] + /// implementation. + /// + /// The file is removed when the returned [`File`] is dropped. + pub fn write_only_file<'a, T, E: 'a>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + ) -> impl PinInit, E> + 'a + where + T: Reader + Send + Sync + 'static, + { + self.create_file(name, data, &T::FILE_OPS) + } } #[pin_data] diff --git a/rust/kernel/debugfs/file_ops.rs b/rust/kernel/debugfs/file_ops.rs index c2fbef96580eaa..2060c8d14d8345 100644 --- a/rust/kernel/debugfs/file_ops.rs +++ b/rust/kernel/debugfs/file_ops.rs @@ -1,10 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2025 Google LLC. -use super::Writer; +use super::{Reader, Writer}; use crate::prelude::*; use crate::seq_file::SeqFile; use crate::seq_print; +use crate::uaccess::UserSlice; use core::fmt::{Display, Formatter, Result}; use core::marker::PhantomData; @@ -126,3 +127,113 @@ impl ReadFile for T { unsafe { FileOps::new(operations, 0o400) } }; } + +fn read(data: &T, buf: *const c_char, count: usize) -> isize { + let mut reader = UserSlice::new(UserPtr::from_ptr(buf as *mut c_void), count).reader(); + + if let Err(e) = data.read_from_slice(&mut reader) { + return e.to_errno() as isize; + } + + count as isize +} + +/// # Safety +/// +/// `file` must be a valid pointer to a `file` struct. +/// The `private_data` of the file must contain a valid pointer to a `seq_file` whose +/// `private` data in turn points to a `T` that implements `Reader`. +/// `buf` must be a valid user-space buffer. +pub(crate) unsafe extern "C" fn write( + file: *mut bindings::file, + buf: *const c_char, + count: usize, + _ppos: *mut bindings::loff_t, +) -> isize { + // SAFETY: The file was opened with `single_open`, which sets `private_data` to a `seq_file`. + let seq = unsafe { &mut *((*file).private_data.cast::()) }; + // SAFETY: By caller precondition, this pointer is live and points to a value of type `T`. + let data = unsafe { &*(seq.private as *const T) }; + read(data, buf, count) +} + +// A trait to get the file operations for a type. 
+pub(crate) trait ReadWriteFile { + const FILE_OPS: FileOps; +} + +impl ReadWriteFile for T { + const FILE_OPS: FileOps = { + let operations = bindings::file_operations { + open: Some(writer_open::), + read: Some(bindings::seq_read), + write: Some(write::), + llseek: Some(bindings::seq_lseek), + release: Some(bindings::single_release), + // SAFETY: `file_operations` supports zeroes in all fields. + ..unsafe { core::mem::zeroed() } + }; + // SAFETY: `operations` is all stock `seq_file` implementations except for `writer_open` + // and `write`. + // `writer_open`'s only requirement beyond what is provided to all open functions is that + // the inode's data pointer must point to a `T` that will outlive it, which matches the + // `FileOps` requirements. + // `write` only requires that the file's private data pointer points to `seq_file` + // which points to a `T` that will outlive it, which matches what `writer_open` + // provides. + unsafe { FileOps::new(operations, 0o600) } + }; +} + +/// # Safety +/// +/// `inode` must be a valid pointer to an `inode` struct. +/// `file` must be a valid pointer to a `file` struct. +unsafe extern "C" fn write_only_open( + inode: *mut bindings::inode, + file: *mut bindings::file, +) -> c_int { + // SAFETY: The caller ensures that `inode` and `file` are valid pointers. + unsafe { (*file).private_data = (*inode).i_private }; + 0 +} + +/// # Safety +/// +/// * `file` must be a valid pointer to a `file` struct. +/// * The `private_data` of the file must contain a valid pointer to a `T` that implements +/// `Reader`. +/// * `buf` must be a valid user-space buffer. +pub(crate) unsafe extern "C" fn write_only_write( + file: *mut bindings::file, + buf: *const c_char, + count: usize, + _ppos: *mut bindings::loff_t, +) -> isize { + // SAFETY: The caller ensures that `file` is a valid pointer and that `private_data` holds a + // valid pointer to `T`. + let data = unsafe { &*((*file).private_data as *const T) }; + read(data, buf, count) +} + +pub(crate) trait WriteFile { + const FILE_OPS: FileOps; +} + +impl WriteFile for T { + const FILE_OPS: FileOps = { + let operations = bindings::file_operations { + open: Some(write_only_open), + write: Some(write_only_write::), + llseek: Some(bindings::noop_llseek), + // SAFETY: `file_operations` supports zeroes in all fields. + ..unsafe { core::mem::zeroed() } + }; + // SAFETY: + // * `write_only_open` populates the file private data with the inode private data + // * `write_only_write`'s only requirement is that the private data of the file point to + // a `T` and be legal to convert to a shared reference, which `write_only_open` + // satisfies. + unsafe { FileOps::new(operations, 0o200) } + }; +} diff --git a/rust/kernel/debugfs/traits.rs b/rust/kernel/debugfs/traits.rs index 0e6e461324de42..ab009eb254b321 100644 --- a/rust/kernel/debugfs/traits.rs +++ b/rust/kernel/debugfs/traits.rs @@ -3,8 +3,15 @@ //! Traits for rendering or updating values exported to DebugFS. +use crate::prelude::*; use crate::sync::Mutex; +use crate::uaccess::UserSliceReader; use core::fmt::{self, Debug, Formatter}; +use core::str::FromStr; +use core::sync::atomic::{ + AtomicI16, AtomicI32, AtomicI64, AtomicI8, AtomicIsize, AtomicU16, AtomicU32, AtomicU64, + AtomicU8, AtomicUsize, Ordering, +}; /// A trait for types that can be written into a string. /// @@ -31,3 +38,65 @@ impl Writer for T { writeln!(f, "{self:?}") } } + +/// A trait for types that can be updated from a user slice. 
+/// +/// This works similarly to `FromStr`, but operates on a `UserSliceReader` rather than a &str. +/// +/// It is automatically implemented for all atomic integers, or any type that implements `FromStr` +/// wrapped in a `Mutex`. +pub trait Reader { + /// Updates the value from the given user slice. + fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result; +} + +impl Reader for Mutex { + fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result { + let mut buf = [0u8; 128]; + if reader.len() > buf.len() { + return Err(EINVAL); + } + let n = reader.len(); + reader.read_slice(&mut buf[..n])?; + + let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?; + let val = s.trim().parse::().map_err(|_| EINVAL)?; + *self.lock() = val; + Ok(()) + } +} + +macro_rules! impl_reader_for_atomic { + ($(($atomic_type:ty, $int_type:ty)),*) => { + $( + impl Reader for $atomic_type { + fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result { + let mut buf = [0u8; 21]; // Enough for a 64-bit number. + if reader.len() > buf.len() { + return Err(EINVAL); + } + let n = reader.len(); + reader.read_slice(&mut buf[..n])?; + + let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?; + let val = s.trim().parse::<$int_type>().map_err(|_| EINVAL)?; + self.store(val, Ordering::Relaxed); + Ok(()) + } + } + )* + }; +} + +impl_reader_for_atomic!( + (AtomicI16, i16), + (AtomicI32, i32), + (AtomicI64, i64), + (AtomicI8, i8), + (AtomicIsize, isize), + (AtomicU16, u16), + (AtomicU32, u32), + (AtomicU64, u64), + (AtomicU8, u8), + (AtomicUsize, usize) +); From 40ecc49466c8b7f9518c5fbbcfb24cb7e26c36c7 Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Thu, 4 Sep 2025 21:13:55 +0000 Subject: [PATCH 1386/3206] rust: debugfs: Add support for callback-based files Extends the `debugfs` API to support creating files with content generated and updated by callbacks. This is done via the `read_callback_file`, `write_callback_file`, and `read_write_callback_file` methods. These methods allow for more flexible file definition, either because the type already has a `Writer` or `Reader` method that doesn't do what you'd like, or because you cannot implement it (e.g. because it's a type defined in another crate or a primitive type). Signed-off-by: Matthew Maurer Tested-by: Dirk Behme Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250904-debugfs-rust-v11-4-7d12a165685a@google.com [ Fix up Result<(), Error> -> Result. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/debugfs.rs | 89 +++++++++++++++++ rust/kernel/debugfs/callback_adapters.rs | 122 +++++++++++++++++++++++ rust/kernel/debugfs/file_ops.rs | 8 ++ 3 files changed, 219 insertions(+) create mode 100644 rust/kernel/debugfs/callback_adapters.rs diff --git a/rust/kernel/debugfs.rs b/rust/kernel/debugfs.rs index 5d77e0cd393ce4..34c991d138a7cd 100644 --- a/rust/kernel/debugfs.rs +++ b/rust/kernel/debugfs.rs @@ -12,12 +12,16 @@ use crate::prelude::*; use crate::str::CStr; #[cfg(CONFIG_DEBUG_FS)] use crate::sync::Arc; +use crate::uaccess::UserSliceReader; +use core::fmt; use core::marker::PhantomPinned; use core::ops::Deref; mod traits; pub use traits::{Reader, Writer}; +mod callback_adapters; +use callback_adapters::{FormatAdapter, NoWriter, WritableAdapter}; mod file_ops; use file_ops::{FileOps, ReadFile, ReadWriteFile, WriteFile}; #[cfg(CONFIG_DEBUG_FS)] @@ -143,6 +147,46 @@ impl Dir { self.create_file(name, data, file_ops) } + /// Creates a read-only file in this directory, with contents from a callback. 
+ /// + /// `f` must be a function item or a non-capturing closure. + /// This is statically asserted and not a safety requirement. + /// + /// # Examples + /// + /// ``` + /// # use core::sync::atomic::{AtomicU32, Ordering}; + /// # use kernel::c_str; + /// # use kernel::debugfs::Dir; + /// # use kernel::prelude::*; + /// # let dir = Dir::new(c_str!("foo")); + /// let file = KBox::pin_init( + /// dir.read_callback_file(c_str!("bar"), + /// AtomicU32::new(3), + /// &|val, f| { + /// let out = val.load(Ordering::Relaxed); + /// writeln!(f, "{out:#010x}") + /// }), + /// GFP_KERNEL)?; + /// // Reading "foo/bar" will show "0x00000003". + /// file.store(10, Ordering::Relaxed); + /// // Reading "foo/bar" will now show "0x0000000a". + /// # Ok::<(), Error>(()) + /// ``` + pub fn read_callback_file<'a, T, E: 'a, F>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + _f: &'static F, + ) -> impl PinInit, E> + 'a + where + T: Send + Sync + 'static, + F: Fn(&T, &mut fmt::Formatter<'_>) -> fmt::Result + Send + Sync, + { + let file_ops = >::FILE_OPS.adapt(); + self.create_file(name, data, file_ops) + } + /// Creates a read-write file in this directory. /// /// Reading the file uses the [`Writer`] implementation. @@ -159,6 +203,31 @@ impl Dir { self.create_file(name, data, file_ops) } + /// Creates a read-write file in this directory, with logic from callbacks. + /// + /// Reading from the file is handled by `f`. Writing to the file is handled by `w`. + /// + /// `f` and `w` must be function items or non-capturing closures. + /// This is statically asserted and not a safety requirement. + pub fn read_write_callback_file<'a, T, E: 'a, F, W>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + _f: &'static F, + _w: &'static W, + ) -> impl PinInit, E> + 'a + where + T: Send + Sync + 'static, + F: Fn(&T, &mut fmt::Formatter<'_>) -> fmt::Result + Send + Sync, + W: Fn(&T, &mut UserSliceReader) -> Result + Send + Sync, + { + let file_ops = + , W> as file_ops::ReadWriteFile<_>>::FILE_OPS + .adapt() + .adapt(); + self.create_file(name, data, file_ops) + } + /// Creates a write-only file in this directory. /// /// The file owns its backing data. Writing to the file uses the [`Reader`] @@ -175,6 +244,26 @@ impl Dir { { self.create_file(name, data, &T::FILE_OPS) } + + /// Creates a write-only file in this directory, with write logic from a callback. + /// + /// `w` must be a function item or a non-capturing closure. + /// This is statically asserted and not a safety requirement. + pub fn write_callback_file<'a, T, E: 'a, W>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + _w: &'static W, + ) -> impl PinInit, E> + 'a + where + T: Send + Sync + 'static, + W: Fn(&T, &mut UserSliceReader) -> Result + Send + Sync, + { + let file_ops = , W> as WriteFile<_>>::FILE_OPS + .adapt() + .adapt(); + self.create_file(name, data, file_ops) + } } #[pin_data] diff --git a/rust/kernel/debugfs/callback_adapters.rs b/rust/kernel/debugfs/callback_adapters.rs new file mode 100644 index 00000000000000..6c024230f676d5 --- /dev/null +++ b/rust/kernel/debugfs/callback_adapters.rs @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Google LLC. + +//! Adapters which allow the user to supply a write or read implementation as a value rather +//! than a trait implementation. If provided, it will override the trait implementation. 
+ +use super::{Reader, Writer}; +use crate::prelude::*; +use crate::uaccess::UserSliceReader; +use core::fmt; +use core::fmt::Formatter; +use core::marker::PhantomData; +use core::ops::Deref; + +/// # Safety +/// +/// To implement this trait, it must be safe to cast a `&Self` to a `&Inner`. +/// It is intended for use in unstacking adapters out of `FileOps` backings. +pub(crate) unsafe trait Adapter { + type Inner; +} + +/// Adapter to implement `Reader` via a callback with the same representation as `T`. +/// +/// * Layer it on top of `WriterAdapter` if you want to add a custom callback for `write`. +/// * Layer it on top of `NoWriter` to pass through any support present on the underlying type. +/// +/// # Invariants +/// +/// If an instance for `WritableAdapter<_, W>` is constructed, `W` is inhabited. +#[repr(transparent)] +pub(crate) struct WritableAdapter { + inner: D, + _writer: PhantomData, +} + +// SAFETY: Stripping off the adapter only removes constraints +unsafe impl Adapter for WritableAdapter { + type Inner = D; +} + +impl Writer for WritableAdapter { + fn write(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.inner.write(fmt) + } +} + +impl Reader for WritableAdapter +where + W: Fn(&D::Target, &mut UserSliceReader) -> Result + Send + Sync + 'static, +{ + fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result { + // SAFETY: WritableAdapter<_, W> can only be constructed if W is inhabited + let w: &W = unsafe { materialize_zst() }; + w(self.inner.deref(), reader) + } +} + +/// Adapter to implement `Writer` via a callback with the same representation as `T`. +/// +/// # Invariants +/// +/// If an instance for `FormatAdapter<_, F>` is constructed, `F` is inhabited. +#[repr(transparent)] +pub(crate) struct FormatAdapter { + inner: D, + _formatter: PhantomData, +} + +impl Deref for FormatAdapter { + type Target = D; + fn deref(&self) -> &D { + &self.inner + } +} + +impl Writer for FormatAdapter +where + F: Fn(&D, &mut Formatter<'_>) -> fmt::Result + 'static, +{ + fn write(&self, fmt: &mut Formatter<'_>) -> fmt::Result { + // SAFETY: FormatAdapter<_, F> can only be constructed if F is inhabited + let f: &F = unsafe { materialize_zst() }; + f(&self.inner, fmt) + } +} + +// SAFETY: Stripping off the adapter only removes constraints +unsafe impl Adapter for FormatAdapter { + type Inner = D; +} + +#[repr(transparent)] +pub(crate) struct NoWriter { + inner: D, +} + +// SAFETY: Stripping off the adapter only removes constraints +unsafe impl Adapter for NoWriter { + type Inner = D; +} + +impl Deref for NoWriter { + type Target = D; + fn deref(&self) -> &D { + &self.inner + } +} + +/// For types with a unique value, produce a static reference to it. +/// +/// # Safety +/// +/// The caller asserts that F is inhabited +unsafe fn materialize_zst() -> &'static F { + const { assert!(core::mem::size_of::() == 0) }; + let zst_dangle: core::ptr::NonNull = core::ptr::NonNull::dangling(); + // SAFETY: While the pointer is dangling, it is a dangling pointer to a ZST, based on the + // assertion above. The type is also inhabited, by the caller's assertion. This means + // we can materialize it. + unsafe { zst_dangle.as_ref() } +} diff --git a/rust/kernel/debugfs/file_ops.rs b/rust/kernel/debugfs/file_ops.rs index 2060c8d14d8345..50fead17b6f31f 100644 --- a/rust/kernel/debugfs/file_ops.rs +++ b/rust/kernel/debugfs/file_ops.rs @@ -2,6 +2,7 @@ // Copyright (C) 2025 Google LLC. 
use super::{Reader, Writer}; +use crate::debugfs::callback_adapters::Adapter; use crate::prelude::*; use crate::seq_file::SeqFile; use crate::seq_print; @@ -46,6 +47,13 @@ impl FileOps { } } +impl FileOps { + pub(super) const fn adapt(&self) -> &FileOps { + // SAFETY: `Adapter` asserts that `T` can be legally cast to `T::Inner`. + unsafe { core::mem::transmute(self) } + } +} + #[cfg(CONFIG_DEBUG_FS)] impl Deref for FileOps { type Target = bindings::file_operations; From 6f227d21377c8b86bcacb266ee1f72bc937f4598 Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Thu, 4 Sep 2025 21:13:56 +0000 Subject: [PATCH 1387/3206] samples: rust: Add debugfs sample driver Adds a new sample driver that demonstrates the debugfs APIs. The driver creates a directory in debugfs and populates it with a few files: - A read-only file that displays a fwnode property. - A read-write file that exposes an atomic counter. - A read-write file that exposes a custom struct. This sample serves as a basic example of how to use the `debugfs::Dir` and `debugfs::File` APIs to create and manage debugfs entries. Signed-off-by: Matthew Maurer Tested-by: Dirk Behme Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250904-debugfs-rust-v11-5-7d12a165685a@google.com [ Change ACPI ID "LNUXDEBF" to "LNUXBEEF". - Danilo ] Signed-off-by: Danilo Krummrich --- MAINTAINERS | 1 + samples/rust/Kconfig | 11 +++ samples/rust/Makefile | 1 + samples/rust/rust_debugfs.rs | 151 +++++++++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+) create mode 100644 samples/rust/rust_debugfs.rs diff --git a/MAINTAINERS b/MAINTAINERS index 4dc5d124bb14c0..4c471251a304f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7496,6 +7496,7 @@ F: rust/kernel/devres.rs F: rust/kernel/driver.rs F: rust/kernel/faux.rs F: rust/kernel/platform.rs +F: samples/rust/rust_debugfs.rs F: samples/rust/rust_driver_platform.rs F: samples/rust/rust_driver_faux.rs diff --git a/samples/rust/Kconfig b/samples/rust/Kconfig index 7f7371a004ee0a..01101db41ae31b 100644 --- a/samples/rust/Kconfig +++ b/samples/rust/Kconfig @@ -62,6 +62,17 @@ config SAMPLE_RUST_DMA If unsure, say N. +config SAMPLE_RUST_DEBUGFS + tristate "DebugFS Test Module" + depends on DEBUG_FS + help + This option builds the Rust DebugFS Test module sample. + + To compile this as a module, choose M here: + the module will be called rust_debugfs. + + If unsure, say N. + config SAMPLE_RUST_DRIVER_PCI tristate "PCI Driver" depends on PCI diff --git a/samples/rust/Makefile b/samples/rust/Makefile index bd2faad63b4f3b..61276222a99f8c 100644 --- a/samples/rust/Makefile +++ b/samples/rust/Makefile @@ -4,6 +4,7 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_SAMPLE_RUST_MINIMAL) += rust_minimal.o obj-$(CONFIG_SAMPLE_RUST_MISC_DEVICE) += rust_misc_device.o obj-$(CONFIG_SAMPLE_RUST_PRINT) += rust_print.o +obj-$(CONFIG_SAMPLE_RUST_DEBUGFS) += rust_debugfs.o obj-$(CONFIG_SAMPLE_RUST_DMA) += rust_dma.o obj-$(CONFIG_SAMPLE_RUST_DRIVER_PCI) += rust_driver_pci.o obj-$(CONFIG_SAMPLE_RUST_DRIVER_PLATFORM) += rust_driver_platform.o diff --git a/samples/rust/rust_debugfs.rs b/samples/rust/rust_debugfs.rs new file mode 100644 index 00000000000000..82b61a15a34b95 --- /dev/null +++ b/samples/rust/rust_debugfs.rs @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Sample DebugFS exporting platform driver +//! +//! To successfully probe this driver with ACPI, use an ssdt that looks like +//! +//! ```dsl +//! 
DefinitionBlock ("", "SSDT", 2, "TEST", "VIRTACPI", 0x00000001) +//! { +//! Scope (\_SB) +//! { +//! Device (T432) +//! { +//! Name (_HID, "LNUXBEEF") // ACPI hardware ID to match +//! Name (_UID, 1) +//! Name (_STA, 0x0F) // Device present, enabled +//! Name (_DSD, Package () { // Sample attribute +//! ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), +//! Package() { +//! Package(2) {"compatible", "sample-debugfs"} +//! } +//! }) +//! Name (_CRS, ResourceTemplate () +//! { +//! Memory32Fixed (ReadWrite, 0xFED00000, 0x1000) +//! }) +//! } +//! } +//! } +//! ``` + +use core::str::FromStr; +use core::sync::atomic::AtomicUsize; +use core::sync::atomic::Ordering; +use kernel::c_str; +use kernel::debugfs::{Dir, File}; +use kernel::new_mutex; +use kernel::prelude::*; +use kernel::sync::Mutex; + +use kernel::{acpi, device::Core, of, platform, str::CString, types::ARef}; + +kernel::module_platform_driver! { + type: RustDebugFs, + name: "rust_debugfs", + authors: ["Matthew Maurer"], + description: "Rust DebugFS usage sample", + license: "GPL", +} + +#[pin_data] +struct RustDebugFs { + pdev: ARef, + // As we only hold these for drop effect (to remove the directory/files) we have a leading + // underscore to indicate to the compiler that we don't expect to use this field directly. + _debugfs: Dir, + #[pin] + _compatible: File, + #[pin] + counter: File, + #[pin] + inner: File>, +} + +#[derive(Debug)] +struct Inner { + x: u32, + y: u32, +} + +impl FromStr for Inner { + type Err = Error; + fn from_str(s: &str) -> Result { + let mut parts = s.split_whitespace(); + let x = parts + .next() + .ok_or(EINVAL)? + .parse::() + .map_err(|_| EINVAL)?; + let y = parts + .next() + .ok_or(EINVAL)? + .parse::() + .map_err(|_| EINVAL)?; + if parts.next().is_some() { + return Err(EINVAL); + } + Ok(Inner { x, y }) + } +} + +kernel::acpi_device_table!( + ACPI_TABLE, + MODULE_ACPI_TABLE, + ::IdInfo, + [(acpi::DeviceId::new(c_str!("LNUXBEEF")), ())] +); + +impl platform::Driver for RustDebugFs { + type IdInfo = (); + const OF_ID_TABLE: Option> = None; + const ACPI_ID_TABLE: Option> = Some(&ACPI_TABLE); + + fn probe( + pdev: &platform::Device, + _info: Option<&Self::IdInfo>, + ) -> Result>> { + let result = KBox::try_pin_init(RustDebugFs::new(pdev), GFP_KERNEL)?; + // We can still mutate fields through the files which are atomic or mutexed: + result.counter.store(91, Ordering::Relaxed); + { + let mut guard = result.inner.lock(); + guard.x = guard.y; + guard.y = 42; + } + Ok(result) + } +} + +impl RustDebugFs { + fn build_counter(dir: &Dir) -> impl PinInit> + '_ { + dir.read_write_file(c_str!("counter"), AtomicUsize::new(0)) + } + + fn build_inner(dir: &Dir) -> impl PinInit>> + '_ { + dir.read_write_file(c_str!("pair"), new_mutex!(Inner { x: 3, y: 10 })) + } + + fn new(pdev: &platform::Device) -> impl PinInit + '_ { + let debugfs = Dir::new(c_str!("sample_debugfs")); + let dev = pdev.as_ref(); + + try_pin_init! { + Self { + _compatible <- debugfs.read_only_file( + c_str!("compatible"), + dev.fwnode() + .ok_or(ENOENT)? 
+ .property_read::(c_str!("compatible")) + .required_by(dev)?, + ), + counter <- Self::build_counter(&debugfs), + inner <- Self::build_inner(&debugfs), + _debugfs: debugfs, + pdev: pdev.into(), + } + } + } +} From 5f0942581dd0218c4449dac0639cf362e943c302 Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Thu, 4 Sep 2025 21:13:57 +0000 Subject: [PATCH 1388/3206] rust: debugfs: Add support for scoped directories Introduces the concept of a `ScopedDir`, which allows for the creation of debugfs directories and files that are tied to the lifetime of a particular data structure. This ensures that debugfs entries do not outlive the data they refer to. The new `Dir::scope` method creates a new directory that is owned by a `Scope` handle. All files and subdirectories created within this scope are automatically cleaned up when the `Scope` is dropped. Signed-off-by: Matthew Maurer Tested-by: Dirk Behme Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250904-debugfs-rust-v11-6-7d12a165685a@google.com [ Fix up Result<(), Error> -> Result; fix spurious backtick in doc-comment. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/debugfs.rs | 262 +++++++++++++++++++++++++++++++++-- rust/kernel/debugfs/entry.rs | 73 +++++++++- 2 files changed, 320 insertions(+), 15 deletions(-) diff --git a/rust/kernel/debugfs.rs b/rust/kernel/debugfs.rs index 34c991d138a7cd..381c23b3dd839b 100644 --- a/rust/kernel/debugfs.rs +++ b/rust/kernel/debugfs.rs @@ -14,7 +14,10 @@ use crate::str::CStr; use crate::sync::Arc; use crate::uaccess::UserSliceReader; use core::fmt; +use core::marker::PhantomData; use core::marker::PhantomPinned; +#[cfg(CONFIG_DEBUG_FS)] +use core::mem::ManuallyDrop; use core::ops::Deref; mod traits; @@ -40,7 +43,7 @@ use entry::Entry; // able to refer to us. In this case, we need to silently fail. All future child directories/files // will silently fail as well. #[derive(Clone)] -pub struct Dir(#[cfg(CONFIG_DEBUG_FS)] Option>); +pub struct Dir(#[cfg(CONFIG_DEBUG_FS)] Option>>); impl Dir { /// Create a new directory in DebugFS. If `parent` is [`None`], it will be created at the root. @@ -264,17 +267,67 @@ impl Dir { .adapt(); self.create_file(name, data, file_ops) } + + // While this function is safe, it is intentionally not public because it's a bit of a + // footgun. + // + // Unless you also extract the `entry` later and schedule it for `Drop` at the appropriate + // time, a `ScopedDir` with a `Dir` parent will never be deleted. + fn scoped_dir<'data>(&self, name: &CStr) -> ScopedDir<'data, 'static> { + #[cfg(CONFIG_DEBUG_FS)] + { + let parent_entry = match &self.0 { + None => return ScopedDir::empty(), + Some(entry) => entry.clone(), + }; + ScopedDir { + entry: ManuallyDrop::new(Entry::dynamic_dir(name, Some(parent_entry))), + _phantom: PhantomData, + } + } + #[cfg(not(CONFIG_DEBUG_FS))] + ScopedDir::empty() + } + + /// Creates a new scope, which is a directory associated with some data `T`. + /// + /// The created directory will be a subdirectory of `self`. The `init` closure is called to + /// populate the directory with files and subdirectories. These files can reference the data + /// stored in the scope. + /// + /// The entire directory tree created within the scope will be removed when the returned + /// `Scope` handle is dropped. 
+ pub fn scope<'a, T: 'a, E: 'a, F>( + &'a self, + data: impl PinInit + 'a, + name: &'a CStr, + init: F, + ) -> impl PinInit, E> + 'a + where + F: for<'data, 'dir> FnOnce(&'data T, &'dir ScopedDir<'data, 'dir>) + 'a, + { + Scope::new(data, |data| { + let scoped = self.scoped_dir(name); + init(data, &scoped); + scoped.into_entry() + }) + } } #[pin_data] -/// Handle to a DebugFS scope, which ensures that attached `data` will outlive the provided -/// [`Entry`] without moving. -/// Currently, this is used to back [`File`] so that its `read` and/or `write` implementations -/// can assume that their backing data is still alive. -struct Scope { +/// Handle to a DebugFS scope, which ensures that attached `data` will outlive the DebugFS entry +/// without moving. +/// +/// This is internally used to back [`File`], and used in the API to represent the attachment +/// of a directory lifetime to a data structure which may be jointly accessed by a number of +/// different files. +/// +/// When dropped, a `Scope` will remove all directories and files in the filesystem backed by the +/// attached data structure prior to releasing the attached data. +pub struct Scope { // This order is load-bearing for drops - `_entry` must be dropped before `data`. #[cfg(CONFIG_DEBUG_FS)] - _entry: Entry, + _entry: Entry<'static>, #[pin] data: T, // Even if `T` is `Unpin`, we still can't allow it to be moved. @@ -312,14 +365,14 @@ impl<'b, T: 'b> Scope { #[cfg(CONFIG_DEBUG_FS)] impl<'b, T: 'b> Scope { - fn entry_mut(self: Pin<&mut Self>) -> &mut Entry { + fn entry_mut(self: Pin<&mut Self>) -> &mut Entry<'static> { // SAFETY: _entry is not structurally pinned. unsafe { &mut Pin::into_inner_unchecked(self)._entry } } fn new(data: impl PinInit + 'b, init: F) -> impl PinInit + 'b where - F: for<'a> FnOnce(&'a T) -> Entry + 'b, + F: for<'a> FnOnce(&'a T) -> Entry<'static> + 'b, { try_pin_init! { Self { @@ -335,6 +388,31 @@ impl<'b, T: 'b> Scope { } } +impl<'a, T: 'a> Scope { + /// Creates a new scope, which is a directory at the root of the debugfs filesystem, + /// associated with some data `T`. + /// + /// The `init` closure is called to populate the directory with files and subdirectories. These + /// files can reference the data stored in the scope. + /// + /// The entire directory tree created within the scope will be removed when the returned + /// `Scope` handle is dropped. + pub fn dir( + data: impl PinInit + 'a, + name: &'a CStr, + init: F, + ) -> impl PinInit + 'a + where + F: for<'data, 'dir> FnOnce(&'data T, &'dir ScopedDir<'data, 'dir>) + 'a, + { + Scope::new(data, |data| { + let scoped = ScopedDir::new(name); + init(data, &scoped); + scoped.into_entry() + }) + } +} + impl Deref for Scope { type Target = T; fn deref(&self) -> &T { @@ -348,3 +426,169 @@ impl Deref for File { &self.scope } } + +/// A handle to a directory which will live at most `'dir`, accessing data that will live for at +/// least `'data`. +/// +/// Dropping a ScopedDir will not delete or clean it up, this is expected to occur through dropping +/// the `Scope` that created it. +pub struct ScopedDir<'data, 'dir> { + #[cfg(CONFIG_DEBUG_FS)] + entry: ManuallyDrop>, + _phantom: PhantomData &'dir ()>, +} + +impl<'data, 'dir> ScopedDir<'data, 'dir> { + /// Creates a subdirectory inside this `ScopedDir`. + /// + /// The returned directory handle cannot outlive this one. 
+ pub fn dir<'dir2>(&'dir2 self, name: &CStr) -> ScopedDir<'data, 'dir2> { + #[cfg(not(CONFIG_DEBUG_FS))] + let _ = name; + ScopedDir { + #[cfg(CONFIG_DEBUG_FS)] + entry: ManuallyDrop::new(Entry::dir(name, Some(&*self.entry))), + _phantom: PhantomData, + } + } + + fn create_file(&self, name: &CStr, data: &'data T, vtable: &'static FileOps) { + #[cfg(CONFIG_DEBUG_FS)] + core::mem::forget(Entry::file(name, &self.entry, data, vtable)); + } + + /// Creates a read-only file in this directory. + /// + /// The file's contents are produced by invoking [`Writer::write`]. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. + pub fn read_only_file(&self, name: &CStr, data: &'data T) { + self.create_file(name, data, &T::FILE_OPS) + } + + /// Creates a read-only file in this directory, with contents from a callback. + /// + /// The file contents are generated by calling `f` with `data`. + /// + /// + /// `f` must be a function item or a non-capturing closure. + /// This is statically asserted and not a safety requirement. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. + pub fn read_callback_file(&self, name: &CStr, data: &'data T, _f: &'static F) + where + T: Send + Sync + 'static, + F: Fn(&T, &mut fmt::Formatter<'_>) -> fmt::Result + Send + Sync, + { + let vtable = as ReadFile<_>>::FILE_OPS.adapt(); + self.create_file(name, data, vtable) + } + + /// Creates a read-write file in this directory. + /// + /// Reading the file uses the [`Writer`] implementation on `data`. Writing to the file uses + /// the [`Reader`] implementation on `data`. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. + pub fn read_write_file( + &self, + name: &CStr, + data: &'data T, + ) { + let vtable = &>::FILE_OPS; + self.create_file(name, data, vtable) + } + + /// Creates a read-write file in this directory, with logic from callbacks. + /// + /// Reading from the file is handled by `f`. Writing to the file is handled by `w`. + /// + /// `f` and `w` must be function items or non-capturing closures. + /// This is statically asserted and not a safety requirement. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. + pub fn read_write_callback_file( + &self, + name: &CStr, + data: &'data T, + _f: &'static F, + _w: &'static W, + ) where + T: Send + Sync + 'static, + F: Fn(&T, &mut fmt::Formatter<'_>) -> fmt::Result + Send + Sync, + W: Fn(&T, &mut UserSliceReader) -> Result + Send + Sync, + { + let vtable = , W> as ReadWriteFile<_>>::FILE_OPS + .adapt() + .adapt(); + self.create_file(name, data, vtable) + } + + /// Creates a write-only file in this directory. + /// + /// Writing to the file uses the [`Reader`] implementation on `data`. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. + pub fn write_only_file(&self, name: &CStr, data: &'data T) { + let vtable = &>::FILE_OPS; + self.create_file(name, data, vtable) + } + + /// Creates a write-only file in this directory, with write logic from a callback. 
+    ///
+    /// Writing to the file is handled by `w`.
+    ///
+    /// `w` must be a function item or a non-capturing closure.
+    /// This is statically asserted and not a safety requirement.
+    ///
+    /// This function does not produce an owning handle to the file. The created
+    /// file is removed when the [`Scope`] that this directory belongs
+    /// to is dropped.
+    pub fn write_only_callback_file<T, W>(&self, name: &CStr, data: &'data T, _w: &'static W)
+    where
+        T: Send + Sync + 'static,
+        W: Fn(&T, &mut UserSliceReader) -> Result + Send + Sync,
+    {
+        let vtable = &<WritableAdapter<NoWriter<T>, W> as WriteFile<_>>::FILE_OPS
+            .adapt()
+            .adapt();
+        self.create_file(name, data, vtable)
+    }
+
+    fn empty() -> Self {
+        ScopedDir {
+            #[cfg(CONFIG_DEBUG_FS)]
+            entry: ManuallyDrop::new(Entry::empty()),
+            _phantom: PhantomData,
+        }
+    }
+    #[cfg(CONFIG_DEBUG_FS)]
+    fn into_entry(self) -> Entry<'dir> {
+        ManuallyDrop::into_inner(self.entry)
+    }
+    #[cfg(not(CONFIG_DEBUG_FS))]
+    fn into_entry(self) {}
+}
+
+impl<'data> ScopedDir<'data, 'static> {
+    // This is safe, but intentionally not exported due to footgun status. A ScopedDir with no
+    // parent will never be released by default, and needs to have its entry extracted and used
+    // somewhere.
+    fn new(name: &CStr) -> ScopedDir<'data, 'static> {
+        ScopedDir {
+            #[cfg(CONFIG_DEBUG_FS)]
+            entry: ManuallyDrop::new(Entry::dir(name, None)),
+            _phantom: PhantomData,
+        }
+    }
+}
diff --git a/rust/kernel/debugfs/entry.rs b/rust/kernel/debugfs/entry.rs
index 227fa50b7a79ae..f99402cd3ba0ca 100644
--- a/rust/kernel/debugfs/entry.rs
+++ b/rust/kernel/debugfs/entry.rs
@@ -5,26 +5,29 @@
 use crate::debugfs::file_ops::FileOps;
 use crate::ffi::c_void;
 use crate::str::CStr;
 use crate::sync::Arc;
+use core::marker::PhantomData;
 
 /// Owning handle to a DebugFS entry.
 ///
 /// # Invariants
 ///
 /// The wrapped pointer will always be `NULL`, an error, or an owned DebugFS `dentry`.
-pub(crate) struct Entry {
+pub(crate) struct Entry<'a> {
     entry: *mut bindings::dentry,
     // If we were created with an owning parent, this is the keep-alive
-    _parent: Option<Arc<Entry>>,
+    _parent: Option<Arc<Entry<'static>>>,
+    // If we were created with a non-owning parent, this prevents us from outliving it
+    _phantom: PhantomData<&'a ()>,
 }
 
 // SAFETY: [`Entry`] is just a `dentry` under the hood, which the API promises can be transferred
 // between threads.
-unsafe impl Send for Entry {}
+unsafe impl Send for Entry<'_> {}
 
 // SAFETY: All the C functions we call on the `dentry` pointer are threadsafe.
-unsafe impl Sync for Entry {}
+unsafe impl Sync for Entry<'_> {}
 
-impl Entry {
+impl Entry<'static> {
     pub(crate) fn dynamic_dir(name: &CStr, parent: Option<Arc<Entry<'static>>>) -> Self {
         let parent_ptr = match &parent {
             Some(entry) => entry.as_ptr(),
@@ -39,6 +42,7 @@ impl Entry {
         Entry {
             entry,
             _parent: parent,
+            _phantom: PhantomData,
         }
     }
 
@@ -71,14 +75,71 @@ impl Entry {
         Entry {
             entry,
             _parent: Some(parent),
+            _phantom: PhantomData,
         }
     }
+}
+
+impl<'a> Entry<'a> {
+    pub(crate) fn dir(name: &CStr, parent: Option<&'a Entry<'_>>) -> Self {
+        let parent_ptr = match &parent {
+            Some(entry) => entry.as_ptr(),
+            None => core::ptr::null_mut(),
+        };
+        // SAFETY: The invariants of this function's arguments ensure the safety of this call.
+        // * `name` is a valid C string by the invariants of `&CStr`.
+        // * `parent_ptr` is either `NULL` (if `parent` is `None`), or a pointer to a valid
+        //   `dentry` (because `parent` is a valid reference to an `Entry`). The lifetime `'a`
+        //   ensures that the parent outlives this entry.
+        let entry = unsafe { bindings::debugfs_create_dir(name.as_char_ptr(), parent_ptr) };
+
+        Entry {
+            entry,
+            _parent: None,
+            _phantom: PhantomData,
+        }
+    }
+
+    pub(crate) fn file<T>(
+        name: &CStr,
+        parent: &'a Entry<'_>,
+        data: &'a T,
+        file_ops: &FileOps<T>,
+    ) -> Self {
+        // SAFETY: The invariants of this function's arguments ensure the safety of this call.
+        // * `name` is a valid C string by the invariants of `&CStr`.
+        // * `parent.as_ptr()` is a pointer to a valid `dentry` because we have `&'a Entry`.
+        // * `data` is a valid pointer to `T` for lifetime `'a`.
+        // * The returned `Entry` has lifetime `'a`, so it cannot outlive `parent` or `data`.
+        // * The caller guarantees that `file_ops` is compatible with `data`.
+        // * The guarantees on `FileOps` assert the vtable will be compatible with the data we
+        //   have provided.
+        let entry = unsafe {
+            bindings::debugfs_create_file_full(
+                name.as_char_ptr(),
+                file_ops.mode(),
+                parent.as_ptr(),
+                core::ptr::from_ref(data) as *mut c_void,
+                core::ptr::null(),
+                &**file_ops,
+            )
+        };
+
+        Entry {
+            entry,
+            _parent: None,
+            _phantom: PhantomData,
+        }
+    }
+}
 
+impl Entry<'_> {
     /// Constructs a placeholder DebugFS [`Entry`].
     pub(crate) fn empty() -> Self {
         Self {
             entry: core::ptr::null_mut(),
             _parent: None,
+            _phantom: PhantomData,
         }
     }
 
@@ -94,7 +155,7 @@ impl Entry {
     }
 }
 
-impl Drop for Entry {
+impl Drop for Entry<'_> {
     fn drop(&mut self) {
         // SAFETY: `debugfs_remove` can take `NULL`, error values, and legal DebugFS dentries.
         // `as_ptr` guarantees that the pointer is of this form.

From a1ffc8ad3165fa1cf6a60c6a4b4e00dfd6603cf2 Mon Sep 17 00:00:00 2001
From: Yi Tao
Date: Wed, 10 Sep 2025 14:59:33 +0800
Subject: [PATCH 1389/3206] cgroup: refactor the cgroup_attach_lock code to
 make it clearer

Dynamic cgroup migration involving threadgroup locks can be in one of
two states: no lock held, or the global lock held. Explicitly declare
the different lock modes to make the code easier to understand and to
facilitate future extensions of the lock modes.

Signed-off-by: Yi Tao
Signed-off-by: Tejun Heo
---
 include/linux/cgroup-defs.h     |  8 ++++
 kernel/cgroup/cgroup-internal.h |  9 +++--
 kernel/cgroup/cgroup-v1.c       | 14 +++----
 kernel/cgroup/cgroup.c          | 67 ++++++++++++++++++++++++---------
 4 files changed, 69 insertions(+), 29 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 92ed6d18266d13..ff3c7d0e3e0178 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -140,6 +140,14 @@ enum {
 	__CFTYPE_ADDED = (1 << 18),
 };
 
+enum cgroup_attach_lock_mode {
+	/* Default */
+	CGRP_ATTACH_LOCK_GLOBAL,
+
+	/* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */
+	CGRP_ATTACH_LOCK_NONE,
+};
+
 /*
  * cgroup_file is the handle for a file instance created in a cgroup which
  * is used, for example, to generate file changed notifications.
This can diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b14e61c64a3473..a6d6f30b6f657d 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,12 +249,13 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(bool lock_threadgroup); -void cgroup_attach_unlock(bool lock_threadgroup); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) + enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task, bool locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0a23b65de013fc..852ebe7ca3a1bb 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -69,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); for_each_root(root) { struct cgroup *from_cgrp; @@ -81,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); cgroup_unlock(); return retval; @@ -118,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -154,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); cgroup_unlock(); return ret; } @@ -503,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; - bool locked; + enum cgroup_attach_lock_mode lock_mode; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -532,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0607c5d0923782..0994eeaa2f6941 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2482,7 +2482,7 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * @lock_mode: whether to down_write cgroup_threadgroup_rwsem * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. 
However, some ->attach() @@ -2503,21 +2503,39 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. */ -void cgroup_attach_lock(bool lock_threadgroup) +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode) { cpus_read_lock(); - if (lock_threadgroup) + + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + * @lock_mode: whether to up_write cgroup_threadgroup_rwsem */ -void cgroup_attach_unlock(bool lock_threadgroup) +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode) { - if (lock_threadgroup) + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } + cpus_read_unlock(); } @@ -2991,7 +3009,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *threadgroup_locked) + enum cgroup_attach_lock_mode *lock_mode) { struct task_struct *tsk; pid_t pid; @@ -3008,8 +3026,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); - *threadgroup_locked = pid || threadgroup; - cgroup_attach_lock(*threadgroup_locked); + + if (pid || threadgroup) + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + *lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(*lock_mode); rcu_read_lock(); if (pid) { @@ -3040,19 +3063,20 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, goto out_unlock_rcu; out_unlock_threadgroup: - cgroup_attach_unlock(*threadgroup_locked); - *threadgroup_locked = false; + cgroup_attach_unlock(*lock_mode); + *lock_mode = CGRP_ATTACH_LOCK_NONE; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) { /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - cgroup_attach_unlock(threadgroup_locked); + cgroup_attach_unlock(lock_mode); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3104,6 +3128,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + enum cgroup_attach_lock_mode lock_mode; bool has_tasks; int ret; @@ -3135,7 +3160,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. 
*/ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - cgroup_attach_lock(has_tasks); + + if (has_tasks) + lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(lock_mode); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3156,7 +3187,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(has_tasks); + cgroup_attach_unlock(lock_mode); return ret; } @@ -5279,13 +5310,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool threadgroup_locked; + enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5311,7 +5342,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, threadgroup_locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); From 477abc2ec889a9dd3eb4ae0adbf6408a569bf2b6 Mon Sep 17 00:00:00 2001 From: Yi Tao Date: Wed, 10 Sep 2025 14:59:34 +0800 Subject: [PATCH 1390/3206] cgroup: relocate cgroup_attach_lock within cgroup_procs_write_start Later patches will introduce a new parameter `task` to cgroup_attach_lock, thus adjusting the position of cgroup_attach_lock within cgroup_procs_write_start. Between obtaining the threadgroup leader via PID and acquiring the cgroup attach lock, the threadgroup leader may change, which could lead to incorrect cgroup migration. Therefore, after acquiring the cgroup attach lock, we check whether the threadgroup leader has changed, and if so, retry the operation. tj: Minor comment adjustments. Signed-off-by: Yi Tao Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 58 +++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0994eeaa2f6941..a6b81b48bb7008 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3017,29 +3017,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - /* - * If we migrate a single thread, we don't care about threadgroup - * stability. If the thread is `current`, it won't exit(2) under our - * hands or change PID through exec(2). We exclude - * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write - * callers by cgroup_mutex. - * Therefore, we can skip the global lock. 
-	 */
-	lockdep_assert_held(&cgroup_mutex);
-
-	if (pid || threadgroup)
-		*lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
-	else
-		*lock_mode = CGRP_ATTACH_LOCK_NONE;
-
-	cgroup_attach_lock(*lock_mode);
-
+retry_find_task:
 	rcu_read_lock();
 	if (pid) {
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
 			tsk = ERR_PTR(-ESRCH);
-			goto out_unlock_threadgroup;
+			goto out_unlock_rcu;
 		}
 	} else {
 		tsk = current;
@@ -3040,15 +3040,43 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 	 */
 	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
 		tsk = ERR_PTR(-EINVAL);
-		goto out_unlock_threadgroup;
+		goto out_unlock_rcu;
 	}
 
 	get_task_struct(tsk);
-	goto out_unlock_rcu;
+	rcu_read_unlock();
+
+	/*
+	 * If we migrate a single thread, we don't care about threadgroup
+	 * stability. If the thread is `current`, it won't exit(2) under our
+	 * hands or change PID through exec(2). We exclude
+	 * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+	 * by cgroup_mutex. Therefore, we can skip the global lock.
+	 */
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (pid || threadgroup)
+		*lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+	else
+		*lock_mode = CGRP_ATTACH_LOCK_NONE;
+
+	cgroup_attach_lock(*lock_mode);
+
+	if (threadgroup) {
+		if (!thread_group_leader(tsk)) {
+			/*
+			 * A race with de_thread from another thread's exec()
+			 * may strip us of our leadership. If this happens,
+			 * throw this task away and try again.
+			 */
+			cgroup_attach_unlock(*lock_mode);
+			put_task_struct(tsk);
+			goto retry_find_task;
+		}
+	}
+
+	return tsk;
 
-out_unlock_threadgroup:
-	cgroup_attach_unlock(*lock_mode);
-	*lock_mode = CGRP_ATTACH_LOCK_NONE;
 out_unlock_rcu:
 	rcu_read_unlock();
 	return tsk;

From 0568f89d4fb82d98001baeb870e92f43cd1f7317 Mon Sep 17 00:00:00 2001
From: Yi Tao
Date: Wed, 10 Sep 2025 14:59:35 +0800
Subject: [PATCH 1391/3206] cgroup: replace global percpu_rwsem with per
 threadgroup rwsem when writing to cgroup.procs

The static usage pattern of creating a cgroup, enabling controllers,
and then seeding it with CLONE_INTO_CGROUP doesn't require write
locking cgroup_threadgroup_rwsem and thus doesn't benefit from this
patch. To avoid affecting other users, the per threadgroup rwsem is
only used when favordynmods is enabled.

As computer hardware advances, modern systems are typically equipped
with many CPU cores and large amounts of memory, enabling the
deployment of numerous applications. On such systems, container
creation and deletion become frequent operations, making cgroup
process migration no longer a cold path. This leads to noticeable
contention with common process operations such as fork, exec, and
exit.

To alleviate the contention between cgroup process migration and
operations like process fork, this patch changes the locking so that
writing a pid to cgroup.procs/threads takes the write lock on
signal_struct->group_rwsem instead of holding the global write lock.

Cgroup process migration has historically relied on
signal_struct->group_rwsem to protect thread group integrity. In
commit <1ed1328792ff> ("sched, cgroup: replace signal_struct->group_rwsem
with a global percpu_rwsem"), this was changed to a global
cgroup_threadgroup_rwsem. The advantage of using a global lock was
simplified handling of process group migrations. This patch retains
the use of the global lock for protecting process group migration,
while reducing contention by using the per threadgroup lock for
cgroup.procs/threads writes.
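In short, a write to cgroup.procs/threads now picks the cheapest mode
that still stabilizes its target. A condensed sketch of the selection
done in cgroup_procs_write_start() (the full hunks follow below):

	/* sketch, condensed from the cgroup_procs_write_start() hunk below */
	if (pid || threadgroup) {
		if (cgroup_enable_per_threadgroup_rwsem)
			*lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
		else
			*lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
	} else {
		/* single-thread move of current: no rwsem needed */
		*lock_mode = CGRP_ATTACH_LOCK_NONE;
	}
	cgroup_attach_lock(*lock_mode, tsk);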
The locking behavior is as follows: write cgroup.procs/threads | process fork,exec,exit | process group migration ------------------------------------------------------------------------------ cgroup_lock() | down_read(&g_rwsem) | cgroup_lock() down_write(&p_rwsem) | down_read(&p_rwsem) | down_write(&g_rwsem) critical section | critical section | critical section up_write(&p_rwsem) | up_read(&p_rwsem) | up_write(&g_rwsem) cgroup_unlock() | up_read(&g_rwsem) | cgroup_unlock() g_rwsem denotes cgroup_threadgroup_rwsem, p_rwsem denotes signal_struct->group_rwsem. This patch eliminates contention between cgroup migration and fork operations for threads that belong to different thread groups, thereby reducing the long-tail latency of cgroup migrations and lowering system load. With this patch, under heavy fork and exec interference, the long-tail latency of cgroup migration has been reduced from milliseconds to microseconds. Under heavy cgroup migration interference, the multi-CPU score of the spawn test case in UnixBench increased by 9%. tj: Update comment in cgroup_favor_dynmods() and switch WARN_ONCE() to pr_warn_once(). Signed-off-by: Yi Tao Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 17 +++++++- include/linux/sched/signal.h | 4 ++ init/init_task.c | 3 ++ kernel/cgroup/cgroup-internal.h | 6 ++- kernel/cgroup/cgroup-v1.c | 8 ++-- kernel/cgroup/cgroup.c | 73 ++++++++++++++++++++++++++------- kernel/fork.c | 4 ++ 7 files changed, 93 insertions(+), 22 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ff3c7d0e3e0178..93318fce31f3a8 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -91,6 +91,12 @@ enum { * cgroup_threadgroup_rwsem. This makes hot path operations such as * forks and exits into the slow path and more expensive. * + * Alleviate the contention between fork, exec, exit operations and + * writing to cgroup.procs by taking a per threadgroup rwsem instead of + * the global cgroup_threadgroup_rwsem. Fork and other operations + * from threads in different thread groups no longer contend with + * writing to cgroup.procs. + * * The static usage pattern of creating a cgroup, enabling controllers, * and then seeding it with CLONE_INTO_CGROUP doesn't require write * locking cgroup_threadgroup_rwsem and thus doesn't benefit from @@ -146,6 +152,9 @@ enum cgroup_attach_lock_mode { /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */ CGRP_ATTACH_LOCK_NONE, + + /* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */ + CGRP_ATTACH_LOCK_PER_THREADGROUP, }; /* @@ -846,6 +855,7 @@ struct cgroup_subsys { }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +extern bool cgroup_enable_per_threadgroup_rwsem; struct cgroup_of_peak { unsigned long value; @@ -857,11 +867,14 @@ struct cgroup_of_peak { * @tsk: target task * * Allows cgroup operations to synchronize against threadgroup changes - * using a percpu_rw_semaphore. + * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when + * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition. 
*/ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { percpu_down_read(&cgroup_threadgroup_rwsem); + if (cgroup_enable_per_threadgroup_rwsem) + down_read(&tsk->signal->cgroup_threadgroup_rwsem); } /** @@ -872,6 +885,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) */ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { + if (cgroup_enable_per_threadgroup_rwsem) + up_read(&tsk->signal->cgroup_threadgroup_rwsem); percpu_up_read(&cgroup_threadgroup_rwsem); } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1ef1edbaaf79a8..7d6449982822e5 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -226,6 +226,10 @@ struct signal_struct { struct tty_audit_buf *tty_audit_buf; #endif +#ifdef CONFIG_CGROUPS + struct rw_semaphore cgroup_threadgroup_rwsem; +#endif + /* * Thread is the potential origin of an oom condition; kill first on * oom diff --git a/init/init_task.c b/init/init_task.c index e557f622bd9061..a55e2189206fa4 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -27,6 +27,9 @@ static struct signal_struct init_signals = { }, .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, +#ifdef CONFIG_CGROUPS + .cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem), +#endif .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index a6d6f30b6f657d..22051b4f1ccbc0 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,8 +249,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode); -void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 852ebe7ca3a1bb..a9e029b570c8c8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -69,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); for_each_root(root) { struct cgroup *from_cgrp; @@ -81,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return retval; @@ -118,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -154,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - 
cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL);
+	cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
 	cgroup_unlock();
 	return ret;
 }

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a6b81b48bb7008..fed701df116707 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -239,6 +239,14 @@ static u16 have_canfork_callback __read_mostly;
 static bool have_favordynmods __ro_after_init =
 	IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
 
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
 	.ns.count = REFCOUNT_INIT(2),
@@ -1325,14 +1333,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
 {
 	bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
 
-	/* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+	/*
+	 * See the comment above the CGRP_ROOT_FAVOR_DYNMODS definition.
+	 * favordynmods can flip while a task is between
+	 * cgroup_threadgroup_change_begin() and end(), so down_write the
+	 * global cgroup_threadgroup_rwsem to synchronize them.
+	 *
+	 * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+	 * cgroup_threadgroup_rwsem doesn't exclude tasks between
+	 * cgroup_threadgroup_change_begin() and end() and thus it's unsafe to
+	 * turn off. As the scenario is unlikely, simply disallow disabling once
+	 * enabled and print out a warning.
+	 */
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	if (favor && !favoring) {
+		cgroup_enable_per_threadgroup_rwsem = true;
 		rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
 		root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
 	} else if (!favor && favoring) {
+		if (cgroup_enable_per_threadgroup_rwsem)
+			pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
 		rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
 		root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
 	}
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 }
 
 static int cgroup_init_root_id(struct cgroup_root *root)
@@ -2482,7 +2506,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
 
 /**
  * cgroup_attach_lock - Lock for ->attach()
- * @lock_mode: whether to down_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether and which rwsem to acquire
+ * @tsk: thread group to lock
  *
  * cgroup migration sometimes needs to stabilize threadgroups against forks and
 * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
@@ -2502,8 +2527,15 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
 * Resolve the situation by always acquiring cpus_read_lock() before optionally
 * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
 * CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take the per threadgroup rwsem to reduce
+ * overhead on dynamic cgroup modifications. See the comment above the
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * @tsk is not NULL only when writing to cgroup.procs.
 */
-void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+			struct task_struct *tsk)
 {
 	cpus_read_lock();
@@ -2513,6 +2545,9 @@ void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode)
 	case CGRP_ATTACH_LOCK_GLOBAL:
 		percpu_down_write(&cgroup_threadgroup_rwsem);
 		break;
+	case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+		down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+		break;
 	default:
 		pr_warn("cgroup: Unexpected attach lock mode.");
 		break;
@@ -2521,9 +2556,11 @@

 /**
  * cgroup_attach_unlock - Undo cgroup_attach_lock()
- * @lock_mode: whether to up_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether and which rwsem to release
+ * @tsk: thread group to unlock
  */
-void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode)
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+			  struct task_struct *tsk)
 {
 	switch (lock_mode) {
 	case CGRP_ATTACH_LOCK_NONE:
@@ -2531,6 +2568,9 @@
 	case CGRP_ATTACH_LOCK_GLOBAL:
 		percpu_up_write(&cgroup_threadgroup_rwsem);
 		break;
+	case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+		up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+		break;
 	default:
 		pr_warn("cgroup: Unexpected attach lock mode.");
 		break;
@@ -3042,7 +3082,6 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 		tsk = ERR_PTR(-EINVAL);
 		goto out_unlock_rcu;
 	}
-
 	get_task_struct(tsk);
 	rcu_read_unlock();
@@ -3055,12 +3094,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 	 */
 	lockdep_assert_held(&cgroup_mutex);
 
-	if (pid || threadgroup)
-		*lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
-	else
+	if (pid || threadgroup) {
+		if (cgroup_enable_per_threadgroup_rwsem)
+			*lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+		else
+			*lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+	} else {
 		*lock_mode = CGRP_ATTACH_LOCK_NONE;
+	}
 
-	cgroup_attach_lock(*lock_mode);
+	cgroup_attach_lock(*lock_mode, tsk);
 
 	if (threadgroup) {
 		if (!thread_group_leader(tsk)) {
 			/*
 			 * A race with de_thread from another thread's exec()
 			 * may strip us of our leadership. If this happens,
 			 * throw this task away and try again.
 			 */
-			cgroup_attach_unlock(*lock_mode);
+			cgroup_attach_unlock(*lock_mode, tsk);
 			put_task_struct(tsk);
 			goto retry_find_task;
 		}
 	}
@@ -3085,10 +3128,10 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 void cgroup_procs_write_finish(struct task_struct *task,
 			       enum cgroup_attach_lock_mode lock_mode)
 {
+	cgroup_attach_unlock(lock_mode, task);
+
 	/* release reference from cgroup_procs_write_start() */
 	put_task_struct(task);
-
-	cgroup_attach_unlock(lock_mode);
 }
 
 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -3178,7 +3221,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	else
 		lock_mode = CGRP_ATTACH_LOCK_NONE;
 
-	cgroup_attach_lock(lock_mode);
+	cgroup_attach_lock(lock_mode, NULL);
 
 	/* NULL dst indicates self on default hierarchy */
 	ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3199,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	ret = cgroup_migrate_execute(&mgctx);
 out_finish:
 	cgroup_migrate_finish(&mgctx);
-	cgroup_attach_unlock(lock_mode);
+	cgroup_attach_unlock(lock_mode, NULL);
 
 	return ret;
 }

diff --git a/kernel/fork.c b/kernel/fork.c
index c4ada32598bd5e..9a039867ecfd2a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1688,6 +1688,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->cgroup_threadgroup_rwsem);
+#endif
+
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;

From fc670ad5966f999b970b2767f55ce9e978e44d9c Mon Sep 17 00:00:00 2001
From: Oliver Upton
Date: Wed, 10 Sep 2025 11:09:28 -0700
Subject: [PATCH 1392/3206] Revert "KVM: arm64: Reschedule as needed when
 destroying the stage-2 page-tables"

This reverts commit e9abe311f35631a999fe38c86f26f0e48ffe46d5.

syzkaller has managed to tease out multiple bugs in this change and
fixing-forward didn't remedy the situation. Considering the
newly-introduced memory safety issues, the potential for scheduler
stalls doesn't seem that bad in comparison.

Link: https://lore.kernel.org/kvmarm/68c09802.050a0220.3c6139.000d.GAE@google.com/
Message-ID: <20250910180930.3679473-2-oliver.upton@linux.dev>
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/mmu.c | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index e78ffcb4bfe900..862bfa272f0a97 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -904,35 +904,11 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
 	return 0;
 }
 
-/*
- * Assume that @pgt is valid and unlinked from the KVM MMU to free the
- * page-table without taking the kvm_mmu_lock and without performing any
- * TLB invalidations.
- *
- * Also, the range of addresses can be large enough to cause need_resched
- * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
- * cond_resched() periodically to prevent hogging the CPU for a long time
- * and schedule something else, if required.
- */ -static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, - phys_addr_t end) -{ - u64 next; - - do { - next = stage2_range_addr_end(addr, end); - KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, - next - addr); - if (next != end) - cond_resched(); - } while (addr = next, addr != end); -} - static void kvm_stage2_destroy(struct kvm_pgtable *pgt) { unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); - stage2_destroy_range(pgt, 0, BIT(ia_bits)); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, 0, BIT(ia_bits)); KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); } From e6157256ee1a6a500da42556e059d4dec2ade871 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 10 Sep 2025 11:09:29 -0700 Subject: [PATCH 1393/3206] Revert "KVM: arm64: Split kvm_pgtable_stage2_destroy()" This reverts commit 0e89ca13ee5ff41b437bb2a003c0eaf34ea43555. The functional change that depended on this refactoring has been found to be quite problematic. Reverting the whole pile to start fresh when new fixes are available. Message-ID: <20250910180930.3679473-3-oliver.upton@linux.dev> Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 30 ---------------------------- arch/arm64/include/asm/kvm_pkvm.h | 4 +--- arch/arm64/kvm/hyp/pgtable.c | 25 ++++------------------- arch/arm64/kvm/mmu.c | 12 ++--------- arch/arm64/kvm/pkvm.c | 11 ++-------- 5 files changed, 9 insertions(+), 73 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 1246216616b518..2888b5d0375736 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -355,11 +355,6 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return pteref; } -static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) -{ - return pteref; -} - static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { /* @@ -389,11 +384,6 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); } -static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) -{ - return rcu_dereference_raw(pteref); -} - static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { if (walker->flags & KVM_PGTABLE_WALK_SHARED) @@ -561,26 +551,6 @@ static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2 */ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); -/** - * kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). - * @addr: Intermediate physical address at which to place the mapping. - * @size: Size of the mapping. - * - * The page-table is assumed to be unreachable by any hardware walkers prior - * to freeing and therefore no TLB invalidation is performed. - */ -void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, - u64 addr, u64 size); - -/** - * kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). - * - * It is assumed that the rest of the page-table is freed before this operation. - */ -void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); - /** * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure. * @mm_ops: Memory management callbacks. 
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 35f9d94780048c..ea58282f59bb4f 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -179,9 +179,7 @@ struct pkvm_mapping { int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops); -void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, - u64 addr, u64 size); -void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); +void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index c36f282a175dfc..c351b4abd5dbfb 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1551,38 +1551,21 @@ static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, return 0; } -void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, - u64 addr, u64 size) +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) { + size_t pgd_sz; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; - WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); -} - -void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) -{ - size_t pgd_sz; - + WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; - - /* - * Since the pgtable is unlinked at this point, and not shared with - * other walkers, safely deference pgd with kvm_dereference_pteref_raw() - */ - pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz); + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); pgt->pgd = NULL; } -void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) -{ - kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits)); - kvm_pgtable_stage2_destroy_pgd(pgt); -} - void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 862bfa272f0a97..7363942925038e 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -904,14 +904,6 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) return 0; } -static void kvm_stage2_destroy(struct kvm_pgtable *pgt) -{ - unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); - - KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, 0, BIT(ia_bits)); - KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); -} - /** * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure @@ -988,7 +980,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return 0; out_destroy_pgtable: - kvm_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); out_free_pgtable: kfree(pgt); return err; @@ -1089,7 +1081,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) write_unlock(&kvm->mmu_lock); if (pgt) { - kvm_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); kfree(pgt); } } diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 61827cf6fea4aa..fcd70bfe44fb8c 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -316,16 +316,9 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 
e return 0; } -void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, - u64 addr, u64 size) +void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) { - __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); -} - -void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) -{ - /* Expected to be called after all pKVM mappings have been released. */ - WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root)); + __pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL)); } int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, From 5e13f2c491a4100d208e77e92fe577fe3dbad6c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Sep 2025 14:45:21 +0200 Subject: [PATCH 1394/3206] netfilter: nft_set_bitmap: fix lockdep splat due to missing annotation Running new 'set_flush_add_atomic_bitmap' test case for nftables.git with CONFIG_PROVE_RCU_LIST=y yields: net/netfilter/nft_set_bitmap.c:231 RCU-list traversed in non-reader section!! rcu_scheduler_active = 2, debug_locks = 1 1 lock held by nft/4008: #0: ffff888147f79cd8 (&nft_net->commit_mutex){+.+.}-{4:4}, at: nf_tables_valid_genid+0x2f/0xd0 lockdep_rcu_suspicious+0x116/0x160 nft_bitmap_walk+0x22d/0x240 nf_tables_delsetelem+0x1010/0x1a00 .. This is a false positive, the list cannot be altered while the transaction mutex is held, so pass the relevant argument to the iterator. Fixes tag intentionally wrong; no point in picking this up if earlier false-positive-fixups were not applied. Fixes: 28b7a6b84c0a ("netfilter: nf_tables: avoid false-positive lockdep splats in set walker") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_bitmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index c24c922f895d87..8d3f040a904a2c 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -226,7 +226,8 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; From c4eaca2e1052adfd67bed0a36a9d4b8e515666e4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:18 +0200 Subject: [PATCH 1395/3206] netfilter: nft_set_pipapo: don't check genbit from packetpath lookups The pipapo set type is special in that it has two copies of its datastructure: one live copy containing only valid elements and one on-demand clone used during transaction where adds/deletes happen. This clone is not visible to the datapath. This is unlike all other set types in nftables, those all link new elements into their live hlist/tree. For those sets, the lookup functions must skip the new elements while the transaction is ongoing to ensure consistency. As the clone is shallow, removal does have an effect on the packet path: once the transaction enters the commit phase the 'gencursor' bit that determines which elements are active and which elements should be ignored (because they are no longer valid) is flipped. This causes the datapath lookup to ignore these elements if they are found during lookup. 
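Concretely, the check in question is the generation-mask test on the
datapath; condensed, the lookup side changes like this (the full hunks
appear in the diff below):

	/* before: skip elements not active in the current generation */
	u8 genmask = nft_genmask_cur(net);
	e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64());

	/* after: ignore the genbit; the clone/live swap performed by
	 * nft_pipapo_commit() already hides new elements until commit
	 */
	e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64());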
This opens up a small race window where pipapo has an inconsistent view
of the dataset from when the transaction-cpu flipped the genbit until
the transaction-cpu calls nft_pipapo_commit() to swap live/clone
pointers:

   cpu0                                  cpu1
                                         has added new elements to clone
                                         has marked elements as being
                                         inactive in new generation
   perform lookup in the set
                                         enters commit phase:
   I) increments the genbit
   A) observes new genbit
                                         removes elements from the clone so
                                         they won't be found anymore
   B) lookup in datastructure
      can't see new elements yet,
      but old elements are ignored
      -> Only matches elements that
         were not changed in the
         transaction
                                         II) calls nft_pipapo_commit(), clone
                                             and live pointers are swapped.
   C) New nft_lookup happening now
      will find matching elements.

Consider a packet matching range r1-r2:

cpu0 processes the following transaction:
1. remove r1-r2
2. add r1-r3

P is contained in both ranges. Therefore, cpu1 should always find a
match for P. Due to the above race, this is not the case:

cpu1 does find r1-r2, but then ignores it due to the genbit indicating
the range has been removed. At the same time, r1-r3 is not visible yet,
because it can only be found in the clone.

The situation persists for all lookups until after cpu0 hits II).

The fix is easy: Don't check the genbit from pipapo lookup functions.
This is possible because unlike the other set types, the new elements
are not reachable from the live copy of the dataset.

The clone/live pointer swap is enough to avoid matching on old elements
while at the same time all new elements are exposed in one go.

After this change, step B above returns a match in r1-r2.
This is fine: r1-r2 only becomes truly invalid the moment they get
freed. This happens after a synchronize_rcu() call, and the rcu read
lock is held via netfilter hook traversal (nf_hook_slow()).

Cc: Stefano Brivio
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Signed-off-by: Florian Westphal
---
 net/netfilter/nft_set_pipapo.c      | 20 ++++++++++++++++++--
 net/netfilter/nft_set_pipapo_avx2.c |  4 +---
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 9a10251228fd5d..793790d79d1384 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -510,6 +510,23 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
  *
  * This function is called from the data path. It will search for
  * an element matching the given key in the current active copy.
+ * Unlike other set types, this uses NFT_GENMASK_ANY instead of
+ * nft_genmask_cur().
+ *
+ * This is because new (future) elements are not reachable from
+ * priv->match, they get added to priv->clone instead.
+ * When the commit phase flips the generation bitmask, the
+ * 'now old' entries are skipped but without the 'now current'
+ * elements becoming visible. Using nft_genmask_cur() thus creates
+ * inconsistent state: matching old entries get skipped but the
+ * newly matching entries are unreachable.
+ *
+ * GENMASK will still find the 'now old' entries which ensures consistent
+ * priv->match view.
+ *
+ * nft_pipapo_commit swaps ->clone and ->match shortly after the
+ * genbit flip. As ->clone doesn't contain the old entries in the first
+ * place, lookup will only find the now-current ones.
 *
 * Return: nftables API extension pointer or NULL if no match.
*/ @@ -518,12 +535,11 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); - e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64()); + e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64()); return e ? &e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 2f090e253caf7c..c0884fa68c7980 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1152,7 +1152,6 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, struct nft_pipapo *priv = nft_set_priv(set); const struct nft_set_ext *ext = NULL; struct nft_pipapo_scratch *scratch; - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; @@ -1248,8 +1247,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, if (last) { const struct nft_set_ext *e = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(e) || - !nft_set_elem_active(e, genmask))) + if (unlikely(nft_set_elem_expired(e))) goto next_match; ext = e; From a60f7bf4a1524d8896b76ba89623080aebf44272 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:19 +0200 Subject: [PATCH 1396/3206] netfilter: nft_set_rbtree: continue traversal if element is inactive When the rbtree lookup function finds a match in the rbtree, it sets the range start interval to a potentially inactive element. Then, after tree lookup, if the matching element is inactive, it returns NULL and suppresses a matching result. This is wrong and leads to false negative matches when a transaction has already entered the commit phase. cpu0 cpu1 has added new elements to clone has marked elements as being inactive in new generation perform lookup in the set enters commit phase: I) increments the genbit A) observes new genbit B) finds matching range C) returns no match: found range invalid in new generation II) removes old elements from the tree C New nft_lookup happening now will find matching element, because it is no longer obscured by old, inactive one. Consider a packet matching range r1-r2: cpu0 processes following transaction: 1. remove r1-r2 2. add r1-r3 P is contained in both ranges. Therefore, cpu1 should always find a match for P. Due to above race, this is not the case: cpu1 does find r1-r2, but then ignores it due to the genbit indicating the range has been removed. It does NOT test for further matches. The situation persists for all lookups until after cpu0 hits II) after which r1-r3 range start node is tested for the first time. Move the "interval start is valid" check ahead so that tree traversal continues if the starting interval is not valid in this generation. Thanks to Stefan Hanreich for providing an initial reproducer for this bug. 
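Condensed, the fix moves the liveness check into the descent loop so
that an inactive or expired start node no longer terminates the search
(the full hunk follows below):

	/* remember a candidate interval start only if it is alive in this
	 * generation; otherwise keep walking to find an older start node
	 */
	if (nft_set_elem_active(&rbe->ext, genmask) &&
	    !nft_rbtree_elem_expired(rbe))
		interval = rbe;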
Reported-by: Stefan Hanreich Fixes: c1eda3c6394f ("netfilter: nft_rbtree: ignore inactive matching element with no descendants") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_rbtree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 938a257c069e23..b1f04168ec9377 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -77,7 +77,9 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; - interval = rbe; + if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_rbtree_elem_expired(rbe)) + interval = rbe; } else if (d > 0) parent = rcu_dereference_raw(parent->rb_right); else { @@ -102,8 +104,6 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, } if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) return &interval->ext; From 64102d9bbc3d41dac5188b8fba75b1344c438970 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:20 +0200 Subject: [PATCH 1397/3206] netfilter: nf_tables: place base_seq in struct net This will soon be read from packet path around same time as the gencursor. Both gencursor and base_seq get incremented almost at the same time, so it makes sense to place them in the same structure. This doesn't increase struct net size on 64bit due to padding. Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 1 - include/net/netns/nftables.h | 1 + net/netfilter/nf_tables_api.c | 65 ++++++++++++++++--------------- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 891e43a01bdc31..3faa80f5d8019d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1912,7 +1912,6 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; u64 tstamp; - unsigned int base_seq; unsigned int gc_seq; u8 validate_state; struct work_struct destroy_work; diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index cc8060c017d5fb..99dd166c5d07c3 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -3,6 +3,7 @@ #define _NETNS_NFTABLES_H_ struct netns_nftables { + unsigned int base_seq; u8 gencursor; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c1082de0965676..9518b50695ba1b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1131,11 +1131,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } -static __be16 nft_base_seq(const struct net *net) +static unsigned int nft_base_seq(const struct net *net) { - struct nftables_pernet *nft_net = nft_pernet(net); + return READ_ONCE(net->nft.base_seq); +} - return htons(nft_net->base_seq & 0xffff); +static __be16 nft_base_seq_be16(const struct net *net) +{ + return htons(nft_base_seq(net) & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -1155,7 +1158,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto 
nla_put_failure; @@ -1248,7 +1251,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -2030,7 +2033,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -2133,7 +2136,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -3671,7 +3674,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, - nft_base_seq(net)); + nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -3839,7 +3842,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -4050,7 +4053,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), (char *)nla_data(nla[NFTA_RULE_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); kfree(buf); @@ -4887,7 +4890,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), flags, ctx->family, NFNETLINK_V0, - nft_base_seq(ctx->net)); + nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -5032,7 +5035,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && @@ -6209,7 +6212,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && @@ -6238,7 +6241,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) seq = cb->nlh->nlmsg_seq; nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, - table->family, NFNETLINK_V0, nft_base_seq(net)); + table->family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -6331,7 +6334,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, - NFNETLINK_V0, nft_base_seq(ctx->net)); + NFNETLINK_V0, nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -6630,7 +6633,7 @@ static int nf_tables_getsetelem_reset(struct sk_buff *skb, 
} nelems++; } - audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); out_unlock: rcu_read_unlock(); @@ -8381,7 +8384,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -8446,7 +8449,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -8480,7 +8483,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) idx++; } if (ctx->reset && entries) - audit_log_obj_reset(table, nft_net->base_seq, entries); + audit_log_obj_reset(table, nft_base_seq(net), entries); if (rc < 0) break; } @@ -8649,7 +8652,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); @@ -8754,9 +8757,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { - struct nftables_pernet *nft_net = nft_pernet(net); char *buf = kasprintf(gfp, "%s:%u", - table->name, nft_net->base_seq); + table->name, nft_base_seq(net)); audit_log_nfcfg(buf, family, @@ -9442,7 +9444,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -9511,7 +9513,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -9696,17 +9698,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { - struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, - NFNETLINK_V0, nft_base_seq(net)); + NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -10968,11 +10969,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. 
 	 */
-	base_seq = READ_ONCE(nft_net->base_seq);
+	base_seq = nft_base_seq(net);
 	while (++base_seq == 0)
 		;
 
-	WRITE_ONCE(nft_net->base_seq, base_seq);
+	WRITE_ONCE(net->nft.base_seq, base_seq);
 
 	gc_seq = nft_gc_seq_begin(nft_net);
 
@@ -11181,7 +11182,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 
 	nft_commit_notify(net, NETLINK_CB(skb).portid);
 	nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
-	nf_tables_commit_audit_log(&adl, nft_net->base_seq);
+	nf_tables_commit_audit_log(&adl, nft_base_seq(net));
 
 	nft_gc_seq_end(nft_net, gc_seq);
 	nft_net->validate_state = NFT_VALIDATE_SKIP;
@@ -11506,7 +11507,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid)
 	mutex_lock(&nft_net->commit_mutex);
 	nft_net->tstamp = get_jiffies_64();
 
-	genid_ok = genid == 0 || nft_net->base_seq == genid;
+	genid_ok = genid == 0 || nft_base_seq(net) == genid;
 	if (!genid_ok)
 		mutex_unlock(&nft_net->commit_mutex);
 
@@ -12143,7 +12144,7 @@ static int __net_init nf_tables_init_net(struct net *net)
 	INIT_LIST_HEAD(&nft_net->module_list);
 	INIT_LIST_HEAD(&nft_net->notify_list);
 	mutex_init(&nft_net->commit_mutex);
-	nft_net->base_seq = 1;
+	net->nft.base_seq = 1;
 	nft_net->gc_seq = 0;
 	nft_net->validate_state = NFT_VALIDATE_SKIP;
 	INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work);

From 11fe5a82e53ac3581a80c88e0e35fb8a80e15f48 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Wed, 10 Sep 2025 10:02:21 +0200
Subject: [PATCH 1398/3206] netfilter: nf_tables: make nft_set_do_lookup
 available unconditionally

This function was added for retpoline mitigation and is replaced by a
static inline helper if mitigations are not enabled.

Enable this helper function unconditionally so the next patch can add a
lookup restart mechanism to fix possible false negatives while
transactions are in progress.

Adding lookup restarts in nft_lookup_eval doesn't work, as nft_objref
would then need the same copy-pasted loop.

This patch is separate to ease review of the actual bug fix.
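For readers new to the retpoline angle, here is a condensed sketch of the
dispatch pattern this helper implements (illustrative only; the real hunk
follows below). Comparing the ops pointer against known backends lets the
compiler emit direct calls, so CONFIG_MITIGATION_RETPOLINE kernels avoid a
slow indirect call on the hot path:

  static const struct nft_set_ext *
  lookup_dispatch(const struct net *net, const struct nft_set *set,
  		  const u32 *key)
  {
  	if (set->ops == &nft_set_hash_fast_type.ops)
  		return nft_hash_lookup_fast(net, set, key);
  	if (set->ops == &nft_set_bitmap_type.ops)
  		return nft_bitmap_lookup(net, set, key);
  	/* unknown backend: fall back to the indirect call */
  	return set->ops->lookup(net, set, key);
  }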
Suggested-by: Pablo Neira Ayuso
Signed-off-by: Florian Westphal
---
 include/net/netfilter/nf_tables_core.h | 10 ++--------
 net/netfilter/nft_lookup.c             | 17 ++++++++++++-----
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 6c2f483d9828dd..656e784714f3b5 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -109,17 +109,11 @@ nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
 const struct nft_set_ext *
 nft_hash_lookup(const struct net *net, const struct nft_set *set,
 		const u32 *key);
+#endif
+
 const struct nft_set_ext *
 nft_set_do_lookup(const struct net *net, const struct nft_set *set,
 		  const u32 *key);
-#else
-static inline const struct nft_set_ext *
-nft_set_do_lookup(const struct net *net, const struct nft_set *set,
-		  const u32 *key)
-{
-	return set->ops->lookup(net, set, key);
-}
-#endif
 
 /* called from nft_pipapo_avx2.c */
 const struct nft_set_ext *
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 40c602ffbcba72..2c6909bf1b407a 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -24,11 +24,11 @@ struct nft_lookup {
 	struct nft_set_binding		binding;
 };
 
-#ifdef CONFIG_MITIGATION_RETPOLINE
-const struct nft_set_ext *
-nft_set_do_lookup(const struct net *net, const struct nft_set *set,
-		  const u32 *key)
+static const struct nft_set_ext *
+__nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+		    const u32 *key)
 {
+#ifdef CONFIG_MITIGATION_RETPOLINE
 	if (set->ops == &nft_set_hash_fast_type.ops)
 		return nft_hash_lookup_fast(net, set, key);
 	if (set->ops == &nft_set_hash_type.ops)
@@ -51,10 +51,17 @@ nft_set_do_lookup(const struct net *net, const struct nft_set *set,
 		return nft_rbtree_lookup(net, set, key);
 
 	WARN_ON_ONCE(1);
+#endif
 	return set->ops->lookup(net, set, key);
 }
+
+const struct nft_set_ext *
+nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+		  const u32 *key)
+{
+	return __nft_set_do_lookup(net, set, key);
+}
 EXPORT_SYMBOL_GPL(nft_set_do_lookup);
-#endif
 
 void nft_lookup_eval(const struct nft_expr *expr,
 		     struct nft_regs *regs,

From b2f742c846cab9afc5953a5d8f17b54922dcc723 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Wed, 10 Sep 2025 10:02:22 +0200
Subject: [PATCH 1399/3206] netfilter: nf_tables: restart set lookup on
 base_seq change

The hash, hash_fast, rhash and bitwise sets may indicate no result even
though a matching element exists, during a short time window while
another cpu is finalizing the transaction.

This happens when the hash lookup/bitwise lookup function has picked up
the old genbit, right before it was toggled by nf_tables_commit(), but
then the same cpu managed to unlink the matching old element from the
hash table:

cpu0                                    cpu1
                                        has added new elements to clone
                                        has marked elements as being
                                        inactive in new generation
perform lookup in the set
                                        enters commit phase:
A) observes old genbit
                                        increments base_seq
                                        I) increments the genbit
                                        II) removes old element from the set
B) finds matching element
C) returns no match: found element
   is not valid in old generation

Next lookup observes new genbit and
finds matching e2.

Consider a packet matching element e1, e2.

cpu0 processes the following transaction:
1. remove e1
2. add e2, which has the same key as e1.

P matches both e1 and e2. Therefore, cpu1 should always find a match
for P. Due to the above race, this is not the case:

cpu1 observed the old genbit. e2 will not be considered once it is found.
The element e1 is not found anymore if cpu0 managed to unlink it from the
hlist before cpu1 found it during list traversal.

The situation only occurs for a brief time period; lookups happening
after I) observe the new genbit and return e2.

This problem exists in all set types except nft_set_pipapo, so fix it
once in nft_lookup rather than in each set's ops individually.

Sample the base sequence counter, which gets incremented right before the
genbit is changed.

Then, if no match is found, retry the lookup if the base sequence was
altered in between.

If the base sequence hasn't changed:
- No update took place: the no-match result is expected. This is the
  common case.
or:
- nf_tables_commit() hasn't progressed to the genbit update yet. Old
  elements were still visible and a no-match result is expected.
or:
- nf_tables_commit() updated the genbit: we picked up the new base_seq,
  so the lookup function also picked up the new genbit; a no-match
  result is expected.

If the old genbit was observed, then nft_lookup also picked up the old
base_seq: nft_lookup_should_retry() returns true and the lookup is
performed again in the new generation.

This problem was added when the unconditional synchronize_rcu() call that
followed the current/next generation bit toggle was removed.

Thanks to Pablo Neira Ayuso for reviewing an earlier version of this
patchset, and for suggesting re-use of the existing base_seq and the
placement of the restart loop in nft_set_do_lookup().

Fixes: 0cbc06b3faba ("netfilter: nf_tables: remove synchronize_rcu in commit phase")
Signed-off-by: Florian Westphal
---
 net/netfilter/nf_tables_api.c |  3 ++-
 net/netfilter/nft_lookup.c    | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9518b50695ba1b..c3c73411c40c4b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -10973,7 +10973,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 	while (++base_seq == 0)
 		;
 
-	WRITE_ONCE(net->nft.base_seq, base_seq);
+	/* pairs with smp_load_acquire in nft_lookup_eval */
+	smp_store_release(&net->nft.base_seq, base_seq);
 
 	gc_seq = nft_gc_seq_begin(nft_net);
 
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 2c6909bf1b407a..58c5b14889c474 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -55,11 +55,40 @@ __nft_set_do_lookup(const struct net *net, const struct nft_set *set,
 	return set->ops->lookup(net, set, key);
 }
 
+static unsigned int nft_base_seq(const struct net *net)
+{
+	/* pairs with smp_store_release() in nf_tables_commit() */
+	return smp_load_acquire(&net->nft.base_seq);
+}
+
+static bool nft_lookup_should_retry(const struct net *net, unsigned int seq)
+{
+	return unlikely(seq != nft_base_seq(net));
+}
+
 const struct nft_set_ext *
 nft_set_do_lookup(const struct net *net, const struct nft_set *set,
 		  const u32 *key)
 {
-	return __nft_set_do_lookup(net, set, key);
+	const struct nft_set_ext *ext;
+	unsigned int base_seq;
+
+	do {
+		base_seq = nft_base_seq(net);
+
+		ext = __nft_set_do_lookup(net, set, key);
+		if (ext)
+			break;
+		/* No match?  There is a small chance that lookup was
+		 * performed in the old generation, but nf_tables_commit()
+		 * already unlinked a (matching) element.
+		 *
+		 * We need to repeat the lookup to make sure that we didn't
+		 * miss a matching element in the new generation.
+ */ + } while (nft_lookup_should_retry(net, base_seq)); + + return ext; } EXPORT_SYMBOL_GPL(nft_set_do_lookup); From 37a9675e61a2a2a721a28043ffdf2c8ec81eba37 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Sep 2025 23:52:31 +0200 Subject: [PATCH 1400/3206] MAINTAINERS: add Phil as netfilter reviewer Phil has contributed to netfilter with features, fixes and patch reviews for a long time. Make this more formal and add Reviewer tag. Acked-by: Jozsef Kadlecsik Acked-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2df02e4374ed57..ba11421c33e5ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17480,6 +17480,7 @@ NETFILTER M: Pablo Neira Ayuso M: Jozsef Kadlecsik M: Florian Westphal +R: Phil Sutter L: netfilter-devel@vger.kernel.org L: coreteam@netfilter.org S: Maintained From d02e48830e3fce9701265f6c5a58d9bdaf906a76 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 25 Aug 2025 18:44:28 +0200 Subject: [PATCH 1401/3206] KVM: SVM: Sync TPR from LAPIC into VMCB::V_TPR even if AVIC is active Commit 3bbf3565f48c ("svm: Do not intercept CR8 when enable AVIC") inhibited pre-VMRUN sync of TPR from LAPIC into VMCB::V_TPR in sync_lapic_to_cr8() when AVIC is active. AVIC does automatically sync between these two fields, however it does so only on explicit guest writes to one of these fields, not on a bare VMRUN. This meant that when AVIC is enabled host changes to TPR in the LAPIC state might not get automatically copied into the V_TPR field of VMCB. This is especially true when it is the userspace setting LAPIC state via KVM_SET_LAPIC ioctl() since userspace does not have access to the guest VMCB. Practice shows that it is the V_TPR that is actually used by the AVIC to decide whether to issue pending interrupts to the CPU (not TPR in TASKPRI), so any leftover value in V_TPR will cause serious interrupt delivery issues in the guest when AVIC is enabled. Fix this issue by doing pre-VMRUN TPR sync from LAPIC into VMCB::V_TPR even when AVIC is enabled. Fixes: 3bbf3565f48c ("svm: Do not intercept CR8 when enable AVIC") Cc: stable@vger.kernel.org Signed-off-by: Maciej S. Szmigiero Reviewed-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/c231be64280b1461e854e1ce3595d70cde3a2e9d.1756139678.git.maciej.szmigiero@oracle.com [sean: tag for stable@] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9931c6c4bc62a..1bfebe40854f49 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4046,8 +4046,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (nested_svm_virtualize_tpr(vcpu) || - kvm_vcpu_apicv_active(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; cr8 = kvm_get_cr8(vcpu); From d4a5d397c7fb1ca967e0da202cac196e7324f4ea Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Thu, 4 Sep 2025 21:13:58 +0000 Subject: [PATCH 1402/3206] samples: rust: Add scoped debugfs sample driver Adds a new sample driver `rust_scoped_debugfs` that demonstrates the use of the scoped debugfs APIs. The driver creates a `control` directory with two write-only files, `create` and `remove`. Writing a name and a series of numbers to `create` will create a new subdirectory under a `dynamic` directory. This new subdirectory will contain files that expose the numbers as atomic values. 
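As a usage illustration, here is a hypothetical userspace snippet (the
debugfs path is assumed from the Dir::new() call in the driver source
below; the file names come from this commit message):

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
  	/* Creates dynamic/foo with files 0 and 1 holding 10 and 20. */
  	const char cmd[] = "foo 10 20";
  	int fd = open("/sys/kernel/debug/rust_scoped_debugfs/control/create",
  		      O_WRONLY);

  	if (fd < 0 || write(fd, cmd, sizeof(cmd) - 1) < 0)
  		perror("create");
  	if (fd >= 0)
  		close(fd);
  	return 0;
  }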
Writing a name to `remove` will remove the corresponding subdirectory from the `dynamic` directory. This sample serves as an example of how to use the `debugfs::Scope` and `debugfs::ScopedDir` APIs to create and manage debugfs entries that are tied to the lifetime of a data structure. Signed-off-by: Matthew Maurer Tested-by: Dirk Behme Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250904-debugfs-rust-v11-7-7d12a165685a@google.com [ Rename "scoped_debugfs" -> "debugfs_scoped", fix up Result<(), Error> -> Result. - Danilo ] Signed-off-by: Danilo Krummrich --- MAINTAINERS | 1 + samples/rust/Kconfig | 11 +++ samples/rust/Makefile | 1 + samples/rust/rust_debugfs_scoped.rs | 134 ++++++++++++++++++++++++++++ 4 files changed, 147 insertions(+) create mode 100644 samples/rust/rust_debugfs_scoped.rs diff --git a/MAINTAINERS b/MAINTAINERS index 4c471251a304f6..c01599d1727cd8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7497,6 +7497,7 @@ F: rust/kernel/driver.rs F: rust/kernel/faux.rs F: rust/kernel/platform.rs F: samples/rust/rust_debugfs.rs +F: samples/rust/rust_debugfs_scoped.rs F: samples/rust/rust_driver_platform.rs F: samples/rust/rust_driver_faux.rs diff --git a/samples/rust/Kconfig b/samples/rust/Kconfig index 01101db41ae31b..66360cdf048ff2 100644 --- a/samples/rust/Kconfig +++ b/samples/rust/Kconfig @@ -73,6 +73,17 @@ config SAMPLE_RUST_DEBUGFS If unsure, say N. +config SAMPLE_RUST_DEBUGFS_SCOPED + tristate "Scoped DebugFS Test Module" + depends on DEBUG_FS + help + This option builds the Rust Scoped DebugFS Test module sample. + + To compile this as a module, choose M here: + the module will be called rust_debugfs_scoped. + + If unsure, say N. + config SAMPLE_RUST_DRIVER_PCI tristate "PCI Driver" depends on PCI diff --git a/samples/rust/Makefile b/samples/rust/Makefile index 61276222a99f8c..69ca01497b589e 100644 --- a/samples/rust/Makefile +++ b/samples/rust/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_SAMPLE_RUST_MINIMAL) += rust_minimal.o obj-$(CONFIG_SAMPLE_RUST_MISC_DEVICE) += rust_misc_device.o obj-$(CONFIG_SAMPLE_RUST_PRINT) += rust_print.o obj-$(CONFIG_SAMPLE_RUST_DEBUGFS) += rust_debugfs.o +obj-$(CONFIG_SAMPLE_RUST_DEBUGFS_SCOPED) += rust_debugfs_scoped.o obj-$(CONFIG_SAMPLE_RUST_DMA) += rust_dma.o obj-$(CONFIG_SAMPLE_RUST_DRIVER_PCI) += rust_driver_pci.o obj-$(CONFIG_SAMPLE_RUST_DRIVER_PLATFORM) += rust_driver_platform.o diff --git a/samples/rust/rust_debugfs_scoped.rs b/samples/rust/rust_debugfs_scoped.rs new file mode 100644 index 00000000000000..b0c4e76b123eaa --- /dev/null +++ b/samples/rust/rust_debugfs_scoped.rs @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Sample DebugFS exporting platform driver that demonstrates the use of +//! `Scope::dir` to create a variety of files without the need to separately +//! track them all. + +use core::sync::atomic::AtomicUsize; +use kernel::debugfs::{Dir, Scope}; +use kernel::prelude::*; +use kernel::sync::Mutex; +use kernel::{c_str, new_mutex, str::CString}; + +module! 
{
+    type: RustScopedDebugFs,
+    name: "rust_debugfs_scoped",
+    authors: ["Matthew Maurer"],
+    description: "Rust Scoped DebugFS usage sample",
+    license: "GPL",
+}
+
+fn remove_file_write(
+    mod_data: &ModuleData,
+    reader: &mut kernel::uaccess::UserSliceReader,
+) -> Result {
+    let mut buf = [0u8; 128];
+    if reader.len() >= buf.len() {
+        return Err(EINVAL);
+    }
+    let n = reader.len();
+    reader.read_slice(&mut buf[..n])?;
+
+    let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?.trim();
+    let nul_idx = s.len();
+    buf[nul_idx] = 0;
+    let to_remove = CStr::from_bytes_with_nul(&buf[..nul_idx + 1]).map_err(|_| EINVAL)?;
+    mod_data
+        .devices
+        .lock()
+        .retain(|device| device.name.as_bytes() != to_remove.as_bytes());
+    Ok(())
+}
+
+fn create_file_write(
+    mod_data: &ModuleData,
+    reader: &mut kernel::uaccess::UserSliceReader,
+) -> Result {
+    let mut buf = [0u8; 128];
+    if reader.len() > buf.len() {
+        return Err(EINVAL);
+    }
+    let n = reader.len();
+    reader.read_slice(&mut buf[..n])?;
+
+    let mut nums = KVec::new();
+
+    let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?.trim();
+    let mut items = s.split_whitespace();
+    let name_str = items.next().ok_or(EINVAL)?;
+    let name = CString::try_from_fmt(fmt!("{name_str}"))?;
+    let file_name = CString::try_from_fmt(fmt!("{name_str}"))?;
+    for sub in items {
+        nums.push(
+            AtomicUsize::new(sub.parse().map_err(|_| EINVAL)?),
+            GFP_KERNEL,
+        )?;
+    }
+
+    let scope = KBox::pin_init(
+        mod_data
+            .device_dir
+            .scope(DeviceData { name, nums }, &file_name, |dev_data, dir| {
+                for (idx, val) in dev_data.nums.iter().enumerate() {
+                    let Ok(name) = CString::try_from_fmt(fmt!("{idx}")) else {
+                        return;
+                    };
+                    dir.read_write_file(&name, val);
+                }
+            }),
+        GFP_KERNEL,
+    )?;
+    (*mod_data.devices.lock()).push(scope, GFP_KERNEL)?;
+
+    Ok(())
+}
+
+struct RustScopedDebugFs {
+    _data: Pin<KBox<Scope<ModuleData>>>,
+}
+
+#[pin_data]
+struct ModuleData {
+    device_dir: Dir,
+    #[pin]
+    devices: Mutex<KVec<Pin<KBox<Scope<DeviceData>>>>>,
+}
+
+impl ModuleData {
+    fn init(device_dir: Dir) -> impl PinInit<Self> {
+        pin_init! {
+            Self {
+                device_dir: device_dir,
+                devices <- new_mutex!(KVec::new())
+            }
+        }
+    }
+}
+
+struct DeviceData {
+    name: CString,
+    nums: KVec<AtomicUsize>,
+}
+
+fn init_control(base_dir: &Dir, dyn_dirs: Dir) -> impl PinInit<Scope<ModuleData>> + '_ {
+    base_dir.scope(
+        ModuleData::init(dyn_dirs),
+        c_str!("control"),
+        |data, dir| {
+            dir.write_only_callback_file(c_str!("create"), data, &create_file_write);
+            dir.write_only_callback_file(c_str!("remove"), data, &remove_file_write);
+        },
+    )
+}
+
+impl kernel::Module for RustScopedDebugFs {
+    fn init(_module: &'static kernel::ThisModule) -> Result<Self> {
+        let base_dir = Dir::new(c_str!("rust_scoped_debugfs"));
+        let dyn_dirs = base_dir.subdir(c_str!("dynamic"));
+        Ok(Self {
+            _data: KBox::pin_init(init_control(&base_dir, dyn_dirs), GFP_KERNEL)?,
+        })
+    }
+}

From cdbc9836c7afadad68f374791738f118263c5371 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Thu, 3 Jul 2025 12:10:50 +0200
Subject: [PATCH 1403/3206] libceph: fix invalid accesses to
 ceph_connection_v1_info

There is a place where generic code in messenger.c is reading and
another place where it is writing to a con->v1 union member without
checking that the union member is active (i.e. msgr1 is in use). On
64-bit systems, con->v1.auth_retry overlaps with con->v2.out_iter, so
such a read is almost guaranteed to return a bogus value instead of 0
when msgr2 is in use. This ends up being fairly benign because the
side effect is just the invalidation of the authorizer and successive
fetching of new tickets.
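To make the read-side overlap concrete, a simplified model (real structs
are much larger; layout shown only for illustration):

  #include <stdio.h>

  struct fake_con {
  	union {
  		struct { unsigned long auth_retry; } v1; /* msgr1 state */
  		struct { void *out_iter; } v2;           /* msgr2 state */
  	};
  };

  int main(void)
  {
  	struct fake_con con = { .v2 = { .out_iter = &con } };

  	/* Reads v2's pointer bytes as a "retry count": bogus non-zero. */
  	printf("auth_retry = %#lx\n", con.v1.auth_retry);
  	return 0;
  }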
con->v1.connect_seq overlaps with con->v2.conn_bufs and the fact that
it's being written to can cause more serious consequences, but luckily
it's not something that happens often.

Cc: stable@vger.kernel.org
Fixes: cd1a677cad99 ("libceph, ceph: implement msgr2.1 protocol (crc and secure modes)")
Signed-off-by: Ilya Dryomov
Reviewed-by: Viacheslav Dubeyko
---
 net/ceph/messenger.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d1b5705dc0c648..9f6d860411cbd1 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1524,7 +1524,7 @@ static void con_fault_finish(struct ceph_connection *con)
 	 * in case we faulted due to authentication, invalidate our
 	 * current tickets so that we can get new ones.
 	 */
-	if (con->v1.auth_retry) {
+	if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) {
 		dout("auth_retry %d, invalidating\n", con->v1.auth_retry);
 		if (con->ops->invalidate_authorizer)
 			con->ops->invalidate_authorizer(con);
@@ -1714,9 +1714,10 @@ static void clear_standby(struct ceph_connection *con)
 {
 	/* come back from STANDBY? */
 	if (con->state == CEPH_CON_S_STANDBY) {
-		dout("clear_standby %p and ++connect_seq\n", con);
+		dout("clear_standby %p\n", con);
 		con->state = CEPH_CON_S_PREOPEN;
-		con->v1.connect_seq++;
+		if (!ceph_msgr2(from_msgr(con->msgr)))
+			con->v1.connect_seq++;
 		WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
 		WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
 	}

From e25ddfb388c8b7e5f20e3bf38d627fb485003781 Mon Sep 17 00:00:00 2001
From: Leon Hwang
Date: Wed, 10 Sep 2025 20:57:39 +0800
Subject: [PATCH 1404/3206] bpf: Reject bpf_timer for PREEMPT_RT

When CONFIG_PREEMPT_RT is enabled, the kernel warns when running the
timer selftests via './test_progs -t timer':

BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48

To avoid this warning, reject bpf_timer in the verifier when PREEMPT_RT
is enabled.

Signed-off-by: Leon Hwang
Link: https://lore.kernel.org/r/20250910125740.52172-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c89e2b1bc644b7..9fb1f957a09374 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8547,6 +8547,10 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
 		verifier_bug(env, "Two map pointers in a timer helper");
 		return -EFAULT;
 	}
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
+		return -EOPNOTSUPP;
+	}
 	meta->map_uid = reg->map_uid;
 	meta->map_ptr = map;
 	return 0;

From fbdd61c94bcb09b0c0eb0655917bf4193d07aac1 Mon Sep 17 00:00:00 2001
From: Leon Hwang
Date: Wed, 10 Sep 2025 20:57:40 +0800
Subject: [PATCH 1405/3206] selftests/bpf: Skip timer cases when bpf_timer is
 not supported

When CONFIG_PREEMPT_RT is enabled, the verifier rejects bpf_timer and
returns -EOPNOTSUPP. Therefore, skip the test cases when errno is
EOPNOTSUPP.
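For context, here is a minimal sketch (assumed for illustration, not
taken from the selftests) of the kind of program affected: any map value
embedding a struct bpf_timer now fails verification at the
bpf_timer_init() call on PREEMPT_RT, which is what open_and_load reports:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  struct elem {
  	struct bpf_timer timer;
  };

  struct {
  	__uint(type, BPF_MAP_TYPE_ARRAY);
  	__uint(max_entries, 1);
  	__type(key, int);
  	__type(value, struct elem);
  } timer_map SEC(".maps");

  SEC("fentry/bpf_fentry_test1")
  int start_timer(void *ctx)
  {
  	int key = 0;
  	struct elem *val = bpf_map_lookup_elem(&timer_map, &key);

  	if (val)
  		/* On PREEMPT_RT the verifier fails this with -EOPNOTSUPP. */
  		bpf_timer_init(&val->timer, &timer_map, 0 /* CLOCK_REALTIME */);
  	return 0;
  }

  char LICENSE[] SEC("license") = "GPL";

With bpf_timer rejected, the timer selftests then report: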
cd tools/testing/selftests/bpf
./test_progs -t timer
125     free_timer:SKIP
456     timer:SKIP
457/1   timer_crash/array:SKIP
457/2   timer_crash/hash:SKIP
457     timer_crash:SKIP
458     timer_lockup:SKIP
459     timer_mim:SKIP
Summary: 5/0 PASSED, 6 SKIPPED, 0 FAILED

Signed-off-by: Leon Hwang
Link: https://lore.kernel.org/r/20250910125740.52172-3-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov
---
 tools/testing/selftests/bpf/prog_tests/free_timer.c   | 4 ++++
 tools/testing/selftests/bpf/prog_tests/timer.c        | 4 ++++
 tools/testing/selftests/bpf/prog_tests/timer_crash.c  | 4 ++++
 tools/testing/selftests/bpf/prog_tests/timer_lockup.c | 4 ++++
 tools/testing/selftests/bpf/prog_tests/timer_mim.c    | 4 ++++
 5 files changed, 20 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/free_timer.c b/tools/testing/selftests/bpf/prog_tests/free_timer.c
index b7b77a6b29799c..0de8facca4c5bc 100644
--- a/tools/testing/selftests/bpf/prog_tests/free_timer.c
+++ b/tools/testing/selftests/bpf/prog_tests/free_timer.c
@@ -124,6 +124,10 @@ void test_free_timer(void)
 	int err;
 
 	skel = free_timer__open_and_load();
+	if (!skel && errno == EOPNOTSUPP) {
+		test__skip();
+		return;
+	}
 	if (!ASSERT_OK_PTR(skel, "open_load"))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c
index d66687f1ee6a8d..56f660ca567ba1 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer.c
@@ -86,6 +86,10 @@ void serial_test_timer(void)
 	int err;
 
 	timer_skel = timer__open_and_load();
+	if (!timer_skel && errno == EOPNOTSUPP) {
+		test__skip();
+		return;
+	}
 	if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load"))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/timer_crash.c b/tools/testing/selftests/bpf/prog_tests/timer_crash.c
index f74b82305da8c8..b841597c8a3a31 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer_crash.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer_crash.c
@@ -12,6 +12,10 @@ static void test_timer_crash_mode(int mode)
 	struct timer_crash *skel;
 
 	skel = timer_crash__open_and_load();
+	if (!skel && errno == EOPNOTSUPP) {
+		test__skip();
+		return;
+	}
 	if (!ASSERT_OK_PTR(skel, "timer_crash__open_and_load"))
 		return;
 	skel->bss->pid = getpid();
diff --git a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c
index 1a2f99596916fb..eb303fa1e09af9 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c
@@ -59,6 +59,10 @@ void test_timer_lockup(void)
 	}
 
 	skel = timer_lockup__open_and_load();
+	if (!skel && errno == EOPNOTSUPP) {
+		test__skip();
+		return;
+	}
 	if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load"))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/timer_mim.c b/tools/testing/selftests/bpf/prog_tests/timer_mim.c
index 9ff7843909e7d3..c930c7d7105b9f 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer_mim.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer_mim.c
@@ -65,6 +65,10 @@ void serial_test_timer_mim(void)
 		goto cleanup;
 
 	timer_skel = timer_mim__open_and_load();
+	if (!timer_skel && errno == EOPNOTSUPP) {
+		test__skip();
+		return;
+	}
 	if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load"))
 		goto cleanup;

From b87ecbc54f22382ace1cf41645e8652a4ce44d52 Mon Sep 17 00:00:00 2001
From: Daniel Almeida
Date: Wed, 10 Sep 2025 14:54:31 -0300
Subject: [PATCH 1406/3206] rust: regulator: remove Regulator<Dynamic>

After some experimenting and further discussion, it is starting to
look like Regulator<Dynamic> might be a footgun. It turns out that one
can get the same behavior by correctly using just Regulator<Enabled> and
Regulator<Disabled>, so there is no need to directly expose the manual
refcounting ability of Regulator<Dynamic> to clients.

Remove it while we do not have any other users.

Suggested-by: Danilo Krummrich
Reviewed-by: Alice Ryhl
Reviewed-by: Danilo Krummrich
Reviewed-by: Alexandre Courbot
Signed-off-by: Daniel Almeida
Link: https://patch.msgid.link/20250910-regulator-remove-dynamic-v3-1-07af4dfa97cc@collabora.com
Signed-off-by: Mark Brown
---
 rust/kernel/regulator.rs | 88 +---------------------------------------
 1 file changed, 1 insertion(+), 87 deletions(-)

diff --git a/rust/kernel/regulator.rs b/rust/kernel/regulator.rs
index 34bb24ec8d4d43..5ea2307f02df4a 100644
--- a/rust/kernel/regulator.rs
+++ b/rust/kernel/regulator.rs
@@ -30,7 +30,6 @@ mod private {
 
     impl Sealed for super::Enabled {}
     impl Sealed for super::Disabled {}
-    impl Sealed for super::Dynamic {}
 }
 
 /// A trait representing the different states a [`Regulator`] can be in.
@@ -50,13 +49,6 @@ pub struct Enabled;
 /// own an `enable` reference count, but the regulator may still be on.
 pub struct Disabled;
 
-/// A state that models the C API. The [`Regulator`] can be either enabled or
-/// disabled, and the user is in control of the reference count. This is also
-/// the default state.
-///
-/// Use [`Regulator::is_enabled`] to check the regulator's current state.
-pub struct Dynamic;
-
 impl RegulatorState for Enabled {
     const DISABLE_ON_DROP: bool = true;
 }
@@ -65,14 +57,9 @@ impl RegulatorState for Disabled {
     const DISABLE_ON_DROP: bool = false;
 }
 
-impl RegulatorState for Dynamic {
-    const DISABLE_ON_DROP: bool = false;
-}
-
 /// A trait that abstracts the ability to check if a [`Regulator`] is enabled.
 pub trait IsEnabled: RegulatorState {}
 impl IsEnabled for Disabled {}
-impl IsEnabled for Dynamic {}
 
 /// An error that can occur when trying to convert a [`Regulator`] between states.
 pub struct Error<State: RegulatorState> {
@@ -183,64 +170,13 @@ pub struct Error<State: RegulatorState> {
 /// }
 /// ```
 ///
-/// ## Using [`Regulator<Dynamic>`]
-///
-/// This example mimics the behavior of the C API, where the user is in
-/// control of the enabled reference count. This is useful for drivers that
-/// might call enable and disable to manage the `enable` reference count at
-/// runtime, perhaps as a result of `open()` and `close()` calls or whatever
-/// other driver-specific or subsystem-specific hooks.
-///
-/// ```
-/// # use kernel::prelude::*;
-/// # use kernel::c_str;
-/// # use kernel::device::Device;
-/// # use kernel::regulator::{Regulator, Dynamic};
-/// struct PrivateData {
-///     regulator: Regulator<Dynamic>,
-/// }
-///
-/// // A fictictious probe function that obtains a regulator and sets it up.
-/// fn probe(dev: &Device) -> Result<PrivateData> {
-///     // Obtain a reference to a (fictitious) regulator.
-///     let regulator = Regulator::<Dynamic>::get(dev, c_str!("vcc"))?;
-///
-///     Ok(PrivateData { regulator })
-/// }
-///
-/// // A fictictious function that indicates that the device is going to be used.
-/// fn open(dev: &Device, data: &PrivateData) -> Result {
-///     // Increase the `enabled` reference count.
-///     data.regulator.enable()?;
-///
-///     Ok(())
-/// }
-///
-/// fn close(dev: &Device, data: &PrivateData) -> Result {
-///     // Decrease the `enabled` reference count.
-///     data.regulator.disable()?;
-///
-///     Ok(())
-/// }
-///
-/// fn remove(dev: &Device, data: PrivateData) -> Result {
-///     // `PrivateData` is dropped here, which will drop the
-///     // `Regulator<Dynamic>` in turn.
-///     //
-///     // The reference that was obtained by `regulator_get()` will be
-///     // released, but it is up to the user to make sure that the number of calls
-///     // to `enable()` and `disabled()` are balanced before this point.
-///     Ok(())
-/// }
-/// ```
-///
 /// # Invariants
 ///
 /// - `inner` is a non-null wrapper over a pointer to a `struct
 ///   regulator` obtained from [`regulator_get()`].
 ///
 /// [`regulator_get()`]: https://docs.kernel.org/driver-api/regulator.html#c.regulator_get
-pub struct Regulator<State = Dynamic>
+pub struct Regulator<State>
 where
     State: RegulatorState,
 {
@@ -351,28 +287,6 @@ impl Regulator<Enabled> {
     }
 }
 
-impl Regulator<Dynamic> {
-    /// Obtains a [`Regulator`] instance from the system. The current state of
-    /// the regulator is unknown and it is up to the user to manage the enabled
-    /// reference count.
-    ///
-    /// This closely mimics the behavior of the C API and can be used to
-    /// dynamically manage the enabled reference count at runtime.
-    pub fn get(dev: &Device, name: &CStr) -> Result<Self> {
-        Regulator::get_internal(dev, name)
-    }
-
-    /// Increases the `enabled` reference count.
-    pub fn enable(&self) -> Result {
-        self.enable_internal()
-    }
-
-    /// Decreases the `enabled` reference count.
-    pub fn disable(&self) -> Result {
-        self.disable_internal()
-    }
-}
-
 impl<T: IsEnabled> Regulator<T> {
     /// Checks if the regulator is enabled.
     pub fn is_enabled(&self) -> bool {

From 2e0fd4583d0efcdc260e61a22666c8368f505353 Mon Sep 17 00:00:00 2001
From: Daniel Almeida
Date: Wed, 10 Sep 2025 14:54:32 -0300
Subject: [PATCH 1407/3206] rust: regulator: add devm_enable and
 devm_enable_optional

A lot of drivers only care about enabling the regulator for as long as
the underlying Device is bound. This can be easily observed due to the
extensive use of `devm_regulator_get_enable` and
`devm_regulator_get_enable_optional` throughout the kernel.

Therefore, make this helper available in Rust.

Also add an example noting how it should be the default API unless the
driver needs more fine-grained control over the regulator.

Suggested-by: Danilo Krummrich
Reviewed-by: Boqun Feng
Reviewed-by: Danilo Krummrich
Reviewed-by: Alexandre Courbot
Signed-off-by: Daniel Almeida
Link: https://patch.msgid.link/20250910-regulator-remove-dynamic-v3-2-07af4dfa97cc@collabora.com
Signed-off-by: Mark Brown
---
 rust/helpers/regulator.c | 10 +++++++
 rust/kernel/regulator.rs | 60 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/rust/helpers/regulator.c b/rust/helpers/regulator.c
index cd8b7ba648ee33..11bc332443bd06 100644
--- a/rust/helpers/regulator.c
+++ b/rust/helpers/regulator.c
@@ -40,4 +40,14 @@ int rust_helper_regulator_is_enabled(struct regulator *regulator)
 	return regulator_is_enabled(regulator);
 }
 
+int rust_helper_devm_regulator_get_enable(struct device *dev, const char *id)
+{
+	return devm_regulator_get_enable(dev, id);
+}
+
+int rust_helper_devm_regulator_get_enable_optional(struct device *dev, const char *id)
+{
+	return devm_regulator_get_enable_optional(dev, id);
+}
+
 #endif
diff --git a/rust/kernel/regulator.rs b/rust/kernel/regulator.rs
index 5ea2307f02df4a..b55a201e5029fd 100644
--- a/rust/kernel/regulator.rs
+++ b/rust/kernel/regulator.rs
@@ -18,7 +18,7 @@
 
 use crate::{
     bindings,
-    device::Device,
+    device::{Bound, Device},
     error::{from_err_ptr, to_result, Result},
     prelude::*,
 };
@@ -69,6 +69,41 @@ pub struct Error<State: RegulatorState> {
     /// The regulator that caused the error, so that the operation may be retried.
     pub regulator: Regulator<State>,
 }
+
+/// Obtains and enables a [`devres`]-managed regulator for a device.
+///
+/// This calls [`regulator_disable()`] and [`regulator_put()`] automatically on
+/// driver detach.
+///
+/// This API is identical to `devm_regulator_get_enable()`, and should be
+/// preferred over the [`Regulator<Enabled>`] API if the caller only
+/// cares about the regulator being enabled.
+///
+/// [`devres`]: https://docs.kernel.org/driver-api/driver-model/devres.html
+/// [`regulator_disable()`]: https://docs.kernel.org/driver-api/regulator.html#c.regulator_disable
+/// [`regulator_put()`]: https://docs.kernel.org/driver-api/regulator.html#c.regulator_put
+pub fn devm_enable(dev: &Device<Bound>, name: &CStr) -> Result {
+    // SAFETY: `dev` is a valid and bound device, while `name` is a valid C
+    // string.
+    to_result(unsafe { bindings::devm_regulator_get_enable(dev.as_raw(), name.as_ptr()) })
+}
+
+/// Same as [`devm_enable`], but calls `devm_regulator_get_enable_optional`
+/// instead.
+///
+/// This obtains and enables a [`devres`]-managed regulator for a device, but
+/// does not print a message nor provides a dummy if the regulator is not found.
+///
+/// This calls [`regulator_disable()`] and [`regulator_put()`] automatically on
+/// driver detach.
+///
+/// [`devres`]: https://docs.kernel.org/driver-api/driver-model/devres.html
+/// [`regulator_disable()`]: https://docs.kernel.org/driver-api/regulator.html#c.regulator_disable
+/// [`regulator_put()`]: https://docs.kernel.org/driver-api/regulator.html#c.regulator_put
+pub fn devm_enable_optional(dev: &Device<Bound>, name: &CStr) -> Result {
+    // SAFETY: `dev` is a valid and bound device, while `name` is a valid C
+    // string.
+    to_result(unsafe { bindings::devm_regulator_get_enable_optional(dev.as_raw(), name.as_ptr()) })
+}
 
 /// A `struct regulator` abstraction.
 ///
@@ -146,6 +181,29 @@ pub struct Error<State: RegulatorState> {
 /// }
 /// ```
 ///
+/// If a driver only cares about the regulator being on for as long as it is
+/// bound to a device, then it should use [`devm_enable`] or
+/// [`devm_enable_optional`]. This should be the default use-case unless more
+/// fine-grained control over the regulator's state is required.
+///
+/// [`devm_enable`]: crate::regulator::devm_enable
+/// [`devm_optional`]: crate::regulator::devm_enable_optional
+///
+/// ```
+/// # use kernel::prelude::*;
+/// # use kernel::c_str;
+/// # use kernel::device::{Bound, Device};
+/// # use kernel::regulator;
+/// fn enable(dev: &Device<Bound>) -> Result {
+///     // Obtain a reference to a (fictitious) regulator and enable it. This
+///     // call only returns whether the operation succeeded.
+///     regulator::devm_enable(dev, c_str!("vcc"))?;
+///
+///     // The regulator will be disabled and put when `dev` is unbound.
+///     Ok(())
+/// }
+/// ```
+///
 /// ## Disabling a regulator
 ///
 /// ```

From d2c773159327f4d2f6438acf1ae2ae9ac0ca46a9 Mon Sep 17 00:00:00 2001
From: Gerald Yang
Date: Tue, 9 Sep 2025 13:10:52 +0000
Subject: [PATCH 1408/3206] audit: fix skb leak when audit rate limit is
 exceeded

When configuring a small audit rate limit in /etc/audit/rules.d/audit.rules:

-a always,exit -F arch=b64 -S openat -S truncate -S ftruncate -F exit=-EACCES -F auid>=1000 -F auid!=4294967295 -k access -r 100

And then repeatedly triggering permission denied as a normal user:

while :; do cat /proc/1/environ; done

We can see the messages in kernel log:

[ 2531.862184] audit: rate limit exceeded

The unreclaimable slab objects start to leak quickly.
With kmemleak enabled, many call traces appear like:

unreferenced object 0xffff99144b13f600 (size 232):
  comm "cat", pid 1100, jiffies 4294739144
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace (crc 8540ec4f):
    kmemleak_alloc+0x4a/0x90
    kmem_cache_alloc_node+0x2ea/0x390
    __alloc_skb+0x174/0x1b0
    audit_log_start+0x198/0x3d0
    audit_log_proctitle+0x32/0x160
    audit_log_exit+0x6c6/0x780
    __audit_syscall_exit+0xee/0x140
    syscall_exit_work+0x12b/0x150
    syscall_exit_to_user_mode_prepare+0x39/0x80
    syscall_exit_to_user_mode+0x11/0x260
    do_syscall_64+0x8c/0x180
    entry_SYSCALL_64_after_hwframe+0x78/0x80

This shows that the skb allocated in audit_log_start() and queued onto
skb_list is never freed.

In audit_log_end(), each skb is dequeued from skb_list and passed to
__audit_log_end(). However, when the audit rate limit is exceeded,
__audit_log_end() simply prints "rate limit exceeded" and returns
without processing the skb. Since the skb is already removed from
skb_list, audit_buffer_free() cannot free it later, leading to a memory
leak.

Fix this by freeing the skb when the rate limit is exceeded.

Fixes: eb59d494eebd ("audit: add record for multiple task security contexts")
Signed-off-by: Gerald Yang
[PM: fixes tag, subj tweak]
Signed-off-by: Paul Moore
---
 kernel/audit.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/audit.c b/kernel/audit.c
index 7074838796485a..26a332ffb1b8d9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2616,8 +2616,10 @@ static void __audit_log_end(struct sk_buff *skb)
 
 		/* queue the netlink packet */
 		skb_queue_tail(&audit_queue, skb);
-	} else
+	} else {
 		audit_log_lost("rate limit exceeded");
+		kfree_skb(skb);
+	}
 }
 
 /**

From 6fef6ae764be8a77f61ad3b6937ba82fe8358045 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)"
Date: Sun, 7 Sep 2025 21:43:20 +0100
Subject: [PATCH 1409/3206] net: ethtool: fix wrong type used in struct
 kernel_ethtool_ts_info

In C, enumerated types do not have a defined size, apart from being
compatible with one of the standard types. This allows an ABI / compiler
to choose the type of an enum depending on the values it needs to store,
and storing larger values in it can lead to undefined behaviour.

The tx_type and rx_filters members of struct kernel_ethtool_ts_info are
defined as enumerated types, but are bit arrays, where each bit is
defined by the enumerated type. This means they typically store values
in excess of the maximum value of the enumerated type, in fact
(1 << max_value), and thus must not be declared using the enumerated
type.

Fix both of these to use u32, as per the corresponding __u32 UAPI type.
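A self-contained illustration of the distinction (names shortened; these
are not the kernel structures):

  #include <stdint.h>

  enum tx_types { TX_OFF, TX_ON, TX_ONESTEP_SYNC };	/* enumerators 0..2 */

  int main(void)
  {
  	/* A bit array built from the enumerators: correct in a u32. */
  	uint32_t tx_mask = (1u << TX_OFF) | (1u << TX_ON) |
  			   (1u << TX_ONESTEP_SYNC);	/* 0x7 */

  	/* Storing the mask in the enum type itself is the bug: with, say,
  	 * -fshort-enums the compiler may pick a type only wide enough for
  	 * 0..2, so masks using higher bits would be silently truncated.
  	 */
  	return tx_mask != 0x7;
  }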
Fixes: 2111375b85ad ("net: Add struct kernel_ethtool_ts_info") Signed-off-by: Russell King (Oracle) Reviewed-by: Kory Maincent Link: https://patch.msgid.link/E1uvMEK-00000003Amd-2pWR@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index de5bd76a400ca1..d7d757e72554e1 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -856,8 +856,8 @@ struct kernel_ethtool_ts_info { enum hwtstamp_provider_qualifier phc_qualifier; enum hwtstamp_source phc_source; int phc_phyindex; - enum hwtstamp_tx_types tx_types; - enum hwtstamp_rx_filters rx_filters; + u32 tx_types; + u32 rx_filters; }; /** From 5537a4679403423e0b49c95b619983a4583d69c5 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 8 Sep 2025 13:26:19 +0200 Subject: [PATCH 1410/3206] net: usb: asix: ax88772: drop phylink use in PM to avoid MDIO runtime PM wakeups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop phylink_{suspend,resume}() from ax88772 PM callbacks. MDIO bus accesses have their own runtime-PM handling and will try to wake the device if it is suspended. Such wake attempts must not happen from PM callbacks while the device PM lock is held. Since phylink {sus|re}sume may trigger MDIO, it must not be called in PM context. No extra phylink PM handling is required for this driver: - .ndo_open/.ndo_stop control the phylink start/stop lifecycle. - ethtool/phylib entry points run in process context, not PM. - phylink MAC ops program the MAC on link changes after resume. Fixes: e0bffe3e6894 ("net: asix: ax88772: migrate to phylink") Reported-by: Hubert Wiśniewski Cc: stable@vger.kernel.org Signed-off-by: Oleksij Rempel Tested-by: Hubert Wiśniewski Tested-by: Xu Yang Link: https://patch.msgid.link/20250908112619.2900723-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/asix_devices.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index 792ddda1ad493d..1e8f7089f5e8a2 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -607,15 +607,8 @@ static const struct net_device_ops ax88772_netdev_ops = { static void ax88772_suspend(struct usbnet *dev) { - struct asix_common_private *priv = dev->driver_priv; u16 medium; - if (netif_running(dev->net)) { - rtnl_lock(); - phylink_suspend(priv->phylink, false); - rtnl_unlock(); - } - /* Stop MAC operation */ medium = asix_read_medium_status(dev, 1); medium &= ~AX_MEDIUM_RE; @@ -644,12 +637,6 @@ static void ax88772_resume(struct usbnet *dev) for (i = 0; i < 3; i++) if (!priv->reset(dev, 1)) break; - - if (netif_running(dev->net)) { - rtnl_lock(); - phylink_resume(priv->phylink); - rtnl_unlock(); - } } static int asix_resume(struct usb_interface *intf) From b2461e20fa9ac18b1305bba5bc7e22ebf644ea01 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 25 Aug 2025 15:00:30 +0800 Subject: [PATCH 1411/3206] firmware: imx: Add stub functions for SCMI MISC API To ensure successful builds when CONFIG_IMX_SCMI_MISC_DRV is not enabled, this patch adds static inline stub implementations for the following functions: - scmi_imx_misc_ctrl_get() - scmi_imx_misc_ctrl_set() These stubs return -EOPNOTSUPP to indicate that the functionality is not supported in the current configuration. 
This avoids potential build or link errors in code that conditionally calls these functions based on feature availability. This patch also drops the changes in commit 540c830212ed ("firmware: imx: remove duplicate scmi_imx_misc_ctrl_get()"). The original change aimed to simplify the handling of optional features by removing conditional stubs. However, the use of conditional stubs is necessary when CONFIG_IMX_SCMI_MISC_DRV is n, while consumer driver is set to y. This is not a matter of preserving legacy patterns, but rather to ensure that there is no link error whether for module or built-in. Fixes: 0b4f8a68b292 ("firmware: imx: Add i.MX95 MISC driver") Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/sm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index d4212bc42b2c17..99c15bbb46aa83 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -26,8 +26,20 @@ #define SCMI_IMX94_CTRL_SAI3_MCLK 5U /*!< WAKE SAI3 MCLK */ #define SCMI_IMX94_CTRL_SAI4_MCLK 6U /*!< WAKE SAI4 MCLK */ +#if IS_ENABLED(CONFIG_IMX_SCMI_MISC_DRV) int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val); int scmi_imx_misc_ctrl_set(u32 id, u32 val); +#else +static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val) +{ + return -EOPNOTSUPP; +} +#endif int scmi_imx_cpu_start(u32 cpuid, bool start); int scmi_imx_cpu_started(u32 cpuid, bool *started); From 3fb91b5c86d0fb5ff6f65c30a4f20193166e22fe Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 25 Aug 2025 15:00:31 +0800 Subject: [PATCH 1412/3206] firmware: imx: Add stub functions for SCMI LMM API To ensure successful builds when CONFIG_IMX_SCMI_LMM_DRV is not enabled, this patch adds static inline stub implementations for the following functions: - scmi_imx_lmm_operation() - scmi_imx_lmm_info() - scmi_imx_lmm_reset_vector_set() These stubs return -EOPNOTSUPP to indicate that the functionality is not supported in the current configuration. This avoids potential build or link errors in code that conditionally calls these functions based on feature availability. 
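The common shape of all three stub patches, reduced to a sketch with
made-up names (CONFIG_FOO_DRV, foo_ctrl_get): the header always provides
a definition, so consumers link whether the provider is built in, modular,
or disabled:

  #include <linux/errno.h>
  #include <linux/types.h>

  #if IS_ENABLED(CONFIG_FOO_DRV)
  int foo_ctrl_get(u32 id, u32 *val);
  #else
  static inline int foo_ctrl_get(u32 id, u32 *val)
  {
  	/* provider not built; callers degrade gracefully */
  	return -EOPNOTSUPP;
  }
  #endif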
Fixes: 7242bbf418f0 ("firmware: imx: Add i.MX95 SCMI LMM driver") Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/sm.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index 99c15bbb46aa83..f2a72177bb37c1 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -56,7 +56,24 @@ enum scmi_imx_lmm_op { #define SCMI_IMX_LMM_OP_FORCEFUL 0 #define SCMI_IMX_LMM_OP_GRACEFUL BIT(0) +#if IS_ENABLED(CONFIG_IMX_SCMI_LMM_DRV) int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags); int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info); int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector); +#else +static inline int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector) +{ + return -EOPNOTSUPP; +} +#endif #endif From 222accf05fc42f68ae02065d9c1542c20315118b Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 25 Aug 2025 15:00:32 +0800 Subject: [PATCH 1413/3206] firmware: imx: Add stub functions for SCMI CPU API To ensure successful builds when CONFIG_IMX_SCMI_CPU_DRV is not enabled, this patch adds static inline stub implementations for the following functions: - scmi_imx_cpu_start() - scmi_imx_cpu_started() - scmi_imx_cpu_reset_vector_set() These stubs return -EOPNOTSUPP to indicate that the functionality is not supported in the current configuration. This avoids potential build or link errors in code that conditionally calls these functions based on feature availability. Fixes: 1055faa5d660 ("firmware: imx: Add i.MX95 SCMI CPU driver") Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/sm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index f2a72177bb37c1..a33b4502735675 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -41,10 +41,28 @@ static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val) } #endif +#if IS_ENABLED(CONFIG_IMX_SCMI_CPU_DRV) int scmi_imx_cpu_start(u32 cpuid, bool start); int scmi_imx_cpu_started(u32 cpuid, bool *started); int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, bool boot, bool resume); +#else +static inline int scmi_imx_cpu_start(u32 cpuid, bool start) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_cpu_started(u32 cpuid, bool *started) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, + bool boot, bool resume) +{ + return -EOPNOTSUPP; +} +#endif enum scmi_imx_lmm_op { SCMI_IMX_LMM_BOOT, From d79c3eb59780369e57fc9cd325c703e4f3c55210 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 26 Aug 2025 11:29:01 +0200 Subject: [PATCH 1414/3206] ARM: imx: Kconfig: Adjust select after renamed config option Commit 3f490a74a8a1 ("clocksource/drivers/vf-pit: Rename the VF PIT to NXP PIT") renames the config VF_PIT_TIMER to NXP_PIT_TIMER, but it misses adjusting a reference to VF_PIT_TIMER in arch/arm/mach-imx/Kconfig. Adjust the config reference to the new name. 
Fixes: 3f490a74a8a1 ("clocksource/drivers/vf-pit: Rename the VF PIT to NXP PIT") Signed-off-by: Lukas Bulwahn Signed-off-by: Shawn Guo --- arch/arm/mach-imx/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig index dc47b2312127fd..6ea1bd55acf8de 100644 --- a/arch/arm/mach-imx/Kconfig +++ b/arch/arm/mach-imx/Kconfig @@ -242,7 +242,7 @@ choice config VF_USE_PIT_TIMER bool "Use PIT timer" - select VF_PIT_TIMER + select NXP_PIT_TIMER help Use SoC Periodic Interrupt Timer (PIT) as clocksource From a50342f976d25aace73ff551845ce89406f48f35 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 5 Sep 2025 11:01:09 +0800 Subject: [PATCH 1415/3206] arm64: dts: imx8mp: Correct thermal sensor index The TMU has two temperature measurement sites located on the chip. The probe 0 is located inside of the ANAMIX, while the probe 1 is located near the ARM core. This has been confirmed by checking with HW design team and checking RTL code. So correct the {cpu,soc}-thermal sensor index. Fixes: 30cdd62dce6b ("arm64: dts: imx8mp: Add thermal zones support") Signed-off-by: Peng Fan Reviewed-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi index bb24dba7338ea0..d6d21e8498dcf9 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi @@ -298,7 +298,7 @@ cpu-thermal { polling-delay-passive = <250>; polling-delay = <2000>; - thermal-sensors = <&tmu 0>; + thermal-sensors = <&tmu 1>; trips { cpu_alert0: trip0 { temperature = <85000>; @@ -331,7 +331,7 @@ soc-thermal { polling-delay-passive = <250>; polling-delay = <2000>; - thermal-sensors = <&tmu 1>; + thermal-sensors = <&tmu 0>; trips { soc_alert0: trip0 { temperature = <85000>; From c3f8d13357deab1e04f8a52b499d6b9b704e578e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Sep 2025 15:11:21 +0200 Subject: [PATCH 1416/3206] wifi: nl80211: completely disable per-link stats for now After commit 8cc71fc3b82b ("wifi: cfg80211: Fix "no buffer space available" error in nl80211_get_station() for MLO"), the per-link data is only included in station dumps, where the size limit is somewhat less of an issue. However, it's still an issue, depending on how many links a station has and how much per-link data there is. Thus, for now, disable per-link statistics entirely. A complete fix will need to take this into account, make it opt-in by userspace, and change the dump format to be able to split a single station's data across multiple netlink dump messages, which all together is too much development for a fix. 
Fixes: 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message")
Signed-off-by: Johannes Berg
---
 net/wireless/nl80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f2f7424e930cc1..852573423e52d1 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7575,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
 				NETLINK_CB(cb->skb).portid,
 				cb->nlh->nlmsg_seq, NLM_F_MULTI,
 				rdev, wdev->netdev, mac_addr,
-				&sinfo, true) < 0)
+				&sinfo, false) < 0)
 			goto out;
 
 		sta_idx++;

From 8884c693991333ae065830554b9b0c96590b1bb2 Mon Sep 17 00:00:00 2001
From: Hangbin Liu
Date: Fri, 5 Sep 2025 09:15:31 +0000
Subject: [PATCH 1417/3206] hsr: use rtnl lock when iterating over ports

hsr_for_each_port is called in many places without holding the RCU read
lock, which may trigger warnings on debug kernels. Most of the callers
actually hold the rtnl lock. So add a new helper hsr_for_each_port_rtnl
to allow callers in suitable contexts to iterate ports safely without
explicit RCU locking.

This patch only fixes the callers that hold the rtnl lock. Other caller
issues will be fixed in later patches.

Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.")
Signed-off-by: Hangbin Liu
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250905091533.377443-2-liuhangbin@gmail.com
Signed-off-by: Paolo Abeni
---
 net/hsr/hsr_device.c | 18 +++++++++---------
 net/hsr/hsr_main.c   |  2 +-
 net/hsr/hsr_main.h   |  3 +++
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 88657255fec12b..bce7b4061ce08b 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -49,7 +49,7 @@ static bool hsr_check_carrier(struct hsr_port *master)
 
 	ASSERT_RTNL();
 
-	hsr_for_each_port(master->hsr, port) {
+	hsr_for_each_port_rtnl(master->hsr, port) {
 		if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) {
 			netif_carrier_on(master->dev);
 			return true;
@@ -105,7 +105,7 @@ int hsr_get_max_mtu(struct hsr_priv *hsr)
 	struct hsr_port *port;
 
 	mtu_max = ETH_DATA_LEN;
-	hsr_for_each_port(hsr, port)
+	hsr_for_each_port_rtnl(hsr, port)
 		if (port->type != HSR_PT_MASTER)
 			mtu_max = min(port->dev->mtu, mtu_max);
 
@@ -139,7 +139,7 @@ static int hsr_dev_open(struct net_device *dev)
 
 	hsr = netdev_priv(dev);
 
-	hsr_for_each_port(hsr, port) {
+	hsr_for_each_port_rtnl(hsr, port) {
 		if (port->type == HSR_PT_MASTER)
 			continue;
 		switch (port->type) {
@@ -172,7 +172,7 @@ static int hsr_dev_close(struct net_device *dev)
 	struct hsr_priv *hsr;
 
 	hsr = netdev_priv(dev);
-	hsr_for_each_port(hsr, port) {
+	hsr_for_each_port_rtnl(hsr, port) {
 		if (port->type == HSR_PT_MASTER)
 			continue;
 		switch (port->type) {
@@ -205,7 +205,7 @@ static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr,
 	 * may become enabled.
*/ features &= ~NETIF_F_ONE_FOR_ALL; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) features = netdev_increment_features(features, port->dev->features, mask); @@ -484,7 +484,7 @@ static void hsr_set_rx_mode(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -506,7 +506,7 @@ static void hsr_change_rx_flags(struct net_device *dev, int change) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -534,7 +534,7 @@ static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) continue; @@ -580,7 +580,7 @@ static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 192893c3f2ec73..ac1eb1db1a52b4 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -22,7 +22,7 @@ static bool hsr_slave_empty(struct hsr_priv *hsr) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) return false; return true; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 135ec5fce01967..33b0d2460c9bcd 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -224,6 +224,9 @@ struct hsr_priv { #define hsr_for_each_port(hsr, port) \ list_for_each_entry_rcu((port), &(hsr)->ports, port_list) +#define hsr_for_each_port_rtnl(hsr, port) \ + list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held()) + struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt); /* Caller must ensure skb is a valid HSR frame */ From 393c841fe4333cdd856d0ca37b066d72746cfaa6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:32 +0000 Subject: [PATCH 1418/3206] hsr: use hsr_for_each_port_rtnl in hsr_port_get_hsr hsr_port_get_hsr() iterates over ports using hsr_for_each_port(), but many of its callers do not hold the required RCU lock. Switch to hsr_for_each_port_rtnl(), since most callers already hold the rtnl lock. After review, all callers are covered by either the rtnl lock or the RCU lock, except hsr_dev_xmit(). Fix this by adding an RCU read lock there. 
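As a minimal sketch of the two valid calling contexts after this series (the counting functions are invented for illustration; the iterators and locking primitives are the ones used in the patches):

	/* Caller holds RTNL: the _rtnl iterator passes lockdep_rtnl_is_held()
	 * to RCU lockdep, so no rcu_read_lock() is needed here.
	 */
	static int count_slaves_rtnl(struct hsr_priv *hsr)
	{
		struct hsr_port *port;
		int n = 0;

		ASSERT_RTNL();
		hsr_for_each_port_rtnl(hsr, port)
			if (port->type != HSR_PT_MASTER)
				n++;
		return n;
	}

	/* Any other context must bracket the plain iterator with RCU. */
	static int count_slaves_rcu(struct hsr_priv *hsr)
	{
		struct hsr_port *port;
		int n = 0;

		rcu_read_lock();
		hsr_for_each_port(hsr, port)
			if (port->type != HSR_PT_MASTER)
				n++;
		rcu_read_unlock();
		return n;
	}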
Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-3-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 3 +++ net/hsr/hsr_main.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index bce7b4061ce08b..702da1f9aaa90e 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -226,6 +226,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; + rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (master) { skb->dev = master->dev; @@ -238,6 +239,8 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); } + rcu_read_unlock(); + return NETDEV_TX_OK; } diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index ac1eb1db1a52b4..bc94b07101d80e 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -134,7 +134,7 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type == pt) return port; return NULL; From 847748fc66d08a89135a74e29362a66ba4e3ab15 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:33 +0000 Subject: [PATCH 1419/3206] hsr: hold rcu and dev lock for hsr_get_port_ndev hsr_get_port_ndev calls hsr_for_each_port, which needs to be called under the rcu read lock. On the other hand, before returning the port device, we need to hold a reference on it to avoid a use-after-free in the caller. Suggested-by: Paolo Abeni Fixes: 9c10dd8eed74 ("net: hsr: Create and export hsr_get_port_ndev()") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-4-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 20 ++++++++++++++------ net/hsr/hsr_device.c | 7 ++++++- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index dadce6009791bc..e42d0fdefee128 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -654,7 +654,7 @@ static void icssg_prueth_hsr_fdb_add_del(struct prueth_emac *emac, static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) { - struct net_device *real_dev; + struct net_device *real_dev, *port_dev; struct prueth_emac *emac; u8 vlan_id, i; @@ -663,11 +663,15 @@ static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) if (is_hsr_master(real_dev)) { for (i = HSR_PT_SLAVE_A; i < HSR_PT_INTERLINK; i++) { - emac = netdev_priv(hsr_get_port_ndev(real_dev, i)); - if (!emac) + port_dev = hsr_get_port_ndev(real_dev, i); + emac = netdev_priv(port_dev); + if (!emac) { + dev_put(port_dev); return -EINVAL; + } icssg_prueth_hsr_fdb_add_del(emac, addr, vlan_id, true); + dev_put(port_dev); } } else { emac = netdev_priv(real_dev); @@ -679,7 +683,7 @@ static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) static int icssg_prueth_hsr_del_mcast(struct net_device *ndev, const u8 *addr) { - struct net_device *real_dev; + struct net_device *real_dev, *port_dev; struct prueth_emac *emac; u8 vlan_id, i; @@ -688,11 +692,15 @@ static int
icssg_prueth_hsr_del_mcast(struct net_device *ndev, const u8 *addr) if (is_hsr_master(real_dev)) { for (i = HSR_PT_SLAVE_A; i < HSR_PT_INTERLINK; i++) { - emac = netdev_priv(hsr_get_port_ndev(real_dev, i)); - if (!emac) + port_dev = hsr_get_port_ndev(real_dev, i); + emac = netdev_priv(port_dev); + if (!emac) { + dev_put(port_dev); return -EINVAL; + } icssg_prueth_hsr_fdb_add_del(emac, addr, vlan_id, false); + dev_put(port_dev); } } else { emac = netdev_priv(real_dev); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 702da1f9aaa90e..fbbc3ccf9df64b 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -675,9 +675,14 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev, struct hsr_priv *hsr = netdev_priv(ndev); struct hsr_port *port; + rcu_read_lock(); hsr_for_each_port(hsr, port) - if (port->type == pt) + if (port->type == pt) { + dev_hold(port->dev); + rcu_read_unlock(); return port->dev; + } + rcu_read_unlock(); return NULL; } EXPORT_SYMBOL(hsr_get_port_ndev); From cfffcf97997bd35f4a59e035523d1762568bdbad Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:30 +0000 Subject: [PATCH 1420/3206] x86/mce: Set CR4.MCE last during init Set the CR4.MCE bit as the last step during init. This brings the MCA init order closer to what is described in the x86 docs.

  x86 docs:
	AMD			Intel
				MCG_CTL
	MCA_CONFIG		MCG_EXT_CTL
	MCi_CTL			MCi_CTL
	MCG_CTL
	CR4.MCE			CR4.MCE

  Current Linux:
	AMD			Intel
	CR4.MCE			CR4.MCE
	MCG_CTL			MCG_CTL
	MCA_CONFIG		MCG_EXT_CTL
	MCi_CTL			MCi_CTL

  Updated Linux:
	AMD			Intel
	MCG_CTL			MCG_CTL
	MCA_CONFIG		MCG_EXT_CTL
	MCi_CTL			MCi_CTL
	CR4.MCE			CR4.MCE

The new init flow will match Intel's docs, but there will still be a mismatch for AMD regarding MCG_CTL. However, there is no known issue with this ordering, so leave it for now. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com --- arch/x86/kernel/cpu/mce/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 0326fbb83adc5d..9e31834b3542c3 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1850,8 +1850,6 @@ static void __mcheck_cpu_init_generic(void) { u64 cap; - cr4_set_bits(X86_CR4_MCE); - rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); @@ -2276,6 +2274,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_setup_timer(); + cr4_set_bits(X86_CR4_MCE); } /* @@ -2443,6 +2442,7 @@ static void mce_syscore_resume(void) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); __mcheck_cpu_init_prepare_banks(); + cr4_set_bits(X86_CR4_MCE); } static struct syscore_ops mce_syscore_ops = { @@ -2462,6 +2462,7 @@ static void mce_cpu_restart(void *data) __mcheck_cpu_init_generic(); __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_init_timer(); + cr4_set_bits(X86_CR4_MCE); } /* Reinit MCEs after user configuration changes */ From 669ce4984b729ad5b4c6249d4a8721ae52398bfb Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:31 +0000 Subject: [PATCH 1421/3206] x86/mce: Define BSP-only init Currently, MCA initialization is executed identically on each CPU as they are brought online. However, a number of MCA initialization tasks only need to be done once.
Define a function to collect all 'global' init tasks and call this from the BSP only. Start with CPU features. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Tested-by: Tony Luck Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com --- arch/x86/include/asm/mce.h | 2 ++ arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/cpu/mce/amd.c | 3 --- arch/x86/kernel/cpu/mce/core.c | 28 +++++++++++++++++++++------- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3224f3862dc86f..31e3cb550fb3f8 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -241,12 +241,14 @@ struct cper_ia_proc_ctx; #ifdef CONFIG_X86_MCE int mcheck_init(void); +void mca_bsp_init(struct cpuinfo_x86 *c); void mcheck_cpu_init(struct cpuinfo_x86 *c); void mcheck_cpu_clear(struct cpuinfo_x86 *c); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id); #else static inline int mcheck_init(void) { return 0; } +static inline void mca_bsp_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 34a054181c4dc4..8bbfde05f04fb1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1784,6 +1784,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_LA57); detect_nopl(); + mca_bsp_init(c); } void __init init_cpu_devs(void) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index aa13c9304ad892..3c6c19eb0a1807 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -653,9 +653,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; - mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); - mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 9e31834b3542c3..79f3dd7f7851a8 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1837,13 +1837,6 @@ static void __mcheck_cpu_cap_init(void) this_cpu_write(mce_num_banks, b); __mcheck_cpu_mce_banks_init(); - - /* Use accurate RIP reporting if available. */ - if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - mca_cfg.rip_msr = MSR_IA32_MCG_EIP; - - if (cap & MCG_SER_P) - mca_cfg.ser = 1; } static void __mcheck_cpu_init_generic(void) @@ -2240,6 +2233,27 @@ DEFINE_IDTENTRY_RAW(exc_machine_check) } #endif +void mca_bsp_init(struct cpuinfo_x86 *c) +{ + u64 cap; + + if (!mce_available(c)) + return; + + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + + rdmsrq(MSR_IA32_MCG_CAP, cap); + + /* Use accurate RIP reporting if available. 
*/ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: From c6e465b8d45a1bc717d196ee769ee5a9060de8e2 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:32 +0000 Subject: [PATCH 1422/3206] x86/mce: Define BSP-only SMCA init Currently, on AMD systems, MCA interrupt handler functions are set during CPU init. However, the functions only need to be set once for the whole system. Assign the handlers only during BSP init. Do so only for SMCA systems to maintain the old behavior for legacy systems. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 6 ++++++ arch/x86/kernel/cpu/mce/core.c | 3 +++ arch/x86/kernel/cpu/mce/internal.h | 2 ++ 3 files changed, 11 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 3c6c19eb0a1807..7345e24bf65810 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -684,6 +684,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +void smca_bsp_init(void) +{ + mce_threshold_vector = amd_threshold_interrupt; + deferred_error_int_vector = amd_deferred_error_interrupt; +} + /* * DRAM ECC errors are reported in the Northbridge (bank 4) with * Extended Error Code 8. diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 79f3dd7f7851a8..a8cb7ff53e3218 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2244,6 +2244,9 @@ void mca_bsp_init(struct cpuinfo_x86 *c) mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + if (mce_flags.smca) + smca_bsp_init(); + rdmsrq(MSR_IA32_MCG_CAP, cap); /* Use accurate RIP reporting if available. */ diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 64ac25b953603b..6cb2995f0ec1d7 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -294,12 +294,14 @@ static __always_inline void smca_extract_err_addr(struct mce *m) m->addr &= GENMASK_ULL(55, lsb); } +void smca_bsp_init(void); #else static inline void mce_threshold_create_device(unsigned int cpu) { } static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline bool amd_mce_usable_address(struct mce *m) { return false; } static inline void smca_extract_err_addr(struct mce *m) { } +static inline void smca_bsp_init(void) { } #endif #ifdef CONFIG_X86_ANCIENT_MCE From a46b2bbe1e36e7faab5010f68324b7d191c5c09f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:33 +0000 Subject: [PATCH 1423/3206] x86/mce: Do 'UNKNOWN' vendor check early The 'UNKNOWN' vendor check is handled as a quirk that is run on each online CPU. However, all CPUs are expected to have the same vendor. Move the 'UNKNOWN' vendor check to the BSP-only init so it is done early and once. Remove the unnecessary return value from the quirks check. 
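Reduced to a sketch (the helper names and the flag below are invented for illustration; the real entry points are mca_bsp_init() and mcheck_cpu_init() as in the diffs), the split this series establishes looks like:

	static bool subsystem_disabled;

	/* Runs once, early, on the boot CPU. */
	void subsystem_bsp_init(struct cpuinfo_x86 *c)
	{
		if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
			subsystem_disabled = true;	/* one decision for the whole system */
			return;
		}
		/* read global capabilities, set global feature flags ... */
	}

	/* Runs on every CPU as it is brought online. */
	void subsystem_cpu_init(struct cpuinfo_x86 *c)
	{
		if (subsystem_disabled)
			return;
		/* program this CPU's banks and timers ... */
	}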
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Tested-by: Tony Luck Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com --- arch/x86/kernel/cpu/mce/core.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index a8cb7ff53e3218..515942cbfeb568 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1977,14 +1977,11 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) } /* Add per CPU specific workarounds here */ -static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static void __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { struct mca_config *cfg = &mca_cfg; switch (c->x86_vendor) { - case X86_VENDOR_UNKNOWN: - pr_info("unknown CPU type - not enabling MCE support\n"); - return false; case X86_VENDOR_AMD: apply_quirks_amd(c); break; @@ -2000,8 +1997,6 @@ static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) cfg->monarch_timeout = 0; if (cfg->bootlog != 0) cfg->panic_timeout = 30; - - return true; } static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) @@ -2240,6 +2235,12 @@ void mca_bsp_init(struct cpuinfo_x86 *c) if (!mce_available(c)) return; + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + mca_cfg.disabled = 1; + pr_info("unknown CPU type - not enabling MCE support\n"); + return; + } + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); @@ -2274,10 +2275,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (!__mcheck_cpu_apply_quirks(c)) { - mca_cfg.disabled = 1; - return; - } + __mcheck_cpu_apply_quirks(c); if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; From 7eee1e92684507f64ec6a75fecbd27e37174b888 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:34 +0000 Subject: [PATCH 1424/3206] x86/mce: Separate global and per-CPU quirks Many quirks are global configuration settings and a handful apply to each CPU. Move the per-CPU quirks to vendor init to execute them on each online CPU. Set the global quirks during BSP-only init so they're only executed once and early. 
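For a concrete picture of what one per-CPU quirk does, the K8 GART quirk in the diff below clears a single bit in a bank's control mask before that mask is written to the MCi_CTL MSR. A standalone illustration of the mask arithmetic (plain userspace C; the bit number is taken from the quirk):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t bank4_ctl = ~0ULL;	/* banks default to "enable everything" */

		/* Clear bit 10 (GART TLB walk errors), which otherwise trips
		 * off incorrectly with the IOMMU & 3ware & Cerberus. */
		bank4_ctl &= ~(1ULL << 10);

		printf("bank 4 ctl: %#018llx\n", (unsigned long long)bank4_ctl);
		return 0;
	}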
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Tested-by: Tony Luck Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 24 ++++++++++ arch/x86/kernel/cpu/mce/core.c | 85 +++++++++------------------------ arch/x86/kernel/cpu/mce/intel.c | 18 +++++++ 3 files changed, 65 insertions(+), 62 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 7345e24bf65810..b8aed0ac765c91 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -646,6 +646,28 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) wrmsrq(MSR_K7_HWCR, hwcr); } +static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) + mce_banks[0].ctl = 0; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -653,6 +675,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; + amd_apply_cpu_quirks(c); + mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 515942cbfeb568..7fd86c8006ba14 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1807,8 +1807,9 @@ static void __mcheck_cpu_mce_banks_init(void) struct mce_bank *b = &mce_banks[i]; /* - * Init them all, __mcheck_cpu_apply_quirks() is going to apply - * the required vendor quirks before + * Init them all by default. + * + * The required vendor quirks will be applied before * __mcheck_cpu_init_prepare_banks() does the final bank setup. */ b->ctl = -1ULL; @@ -1880,20 +1881,8 @@ static void __mcheck_cpu_init_prepare_banks(void) } } -static void apply_quirks_amd(struct cpuinfo_x86 *c) +static void amd_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* * Lots of broken BIOS around that don't clear them @@ -1902,13 +1891,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) mca_cfg.bootlog = 0; } - /* - * Various K7s with broken bank 0 around. Always disable - * by default. - */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks)) - mce_banks[0].ctl = 0; - /* * overflow_recov is supported for F15h Models 00h-0fh * even though we don't have a CPUID bit for it. 
@@ -1920,25 +1902,12 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) mce_flags.zen_ifu_quirk = 1; } -static void apply_quirks_intel(struct cpuinfo_x86 *c) +static void intel_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - /* Older CPUs (prior to family 6) don't need quirks. */ if (c->x86_vfm < INTEL_PENTIUM_PRO) return; - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) - mce_banks[0].init = false; - /* * All newer Intel systems support MCE broadcasting. Enable * synchronization with a one second timeout. @@ -1964,7 +1933,7 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c) mce_flags.skx_repmov_quirk = 1; } -static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) +static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c) { /* * All newer Zhaoxin CPUs support MCE broadcasting. Enable @@ -1976,29 +1945,6 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) } } -/* Add per CPU specific workarounds here */ -static void __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) -{ - struct mca_config *cfg = &mca_cfg; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - apply_quirks_amd(c); - break; - case X86_VENDOR_INTEL: - apply_quirks_intel(c); - break; - case X86_VENDOR_ZHAOXIN: - apply_quirks_zhaoxin(c); - break; - } - - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = 0; - if (cfg->bootlog != 0) - cfg->panic_timeout = 30; -} - static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) @@ -2256,6 +2202,23 @@ void mca_bsp_init(struct cpuinfo_x86 *c) if (cap & MCG_SER_P) mca_cfg.ser = 1; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_apply_global_quirks(c); + break; + case X86_VENDOR_INTEL: + intel_apply_global_quirks(c); + break; + case X86_VENDOR_ZHAOXIN: + zhaoxin_apply_global_quirks(c); + break; + } + + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = 0; + if (mca_cfg.bootlog != 0) + mca_cfg.panic_timeout = 30; } /* @@ -2275,8 +2238,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - __mcheck_cpu_apply_quirks(c); - if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 9b149b9c410901..4655223ba5606f 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -468,8 +468,26 @@ static void intel_imc_init(struct cpuinfo_x86 *c) } } +static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + * + * Older CPUs (prior to family 6) can't reach this point and already + * return early due to the check of __mcheck_cpu_ancient_init(). 
+ */ + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) + this_cpu_ptr(mce_banks_array)[0].init = false; +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { + intel_apply_cpu_quirks(c); intel_init_cmci(); intel_init_lmce(); intel_imc_init(c); From 91af6842e9945d064401ed2d6e91539a619760d1 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:35 +0000 Subject: [PATCH 1425/3206] x86/mce: Move machine_check_poll() status checks to helper functions There are a number of generic and vendor-specific status checks in machine_check_poll(). These are used to determine if an error should be skipped. Move these into helper functions. Future vendor-specific checks will be added to the helpers. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Tested-by: Tony Luck Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com --- arch/x86/kernel/cpu/mce/core.c | 88 ++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7fd86c8006ba14..5dec0da6169e60 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -714,6 +714,52 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); +/* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ +static bool ser_should_log_poll_error(struct mce *m) +{ + /* Log "not enabled" (speculative) errors */ + if (!(m->status & MCI_STATUS_EN)) + return true; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) + return true; + + return false; +} + +static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + /* If this entry is not valid, ignore it. */ + if (!(m->status & MCI_STATUS_VAL)) + return false; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) + return true; + + if (mca_cfg.ser) + return ser_should_log_poll_error(m); + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + /* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. @@ -765,48 +811,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!mca_cfg.cmci_disabled) mce_track_storm(m); - /* If this entry is not valid, ignore it */ - if (!(m->status & MCI_STATUS_VAL)) + /* Verify that the error should be logged based on hardware conditions. */ + if (!should_log_poll_error(flags, &err)) continue; - /* - * If we are logging everything (at CPU online) or this - * is a corrected error, then we must log it. - */ - if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) - goto log_it; - - /* - * Newer Intel systems that support software error - * recovery need to make additional checks. Other - * CPUs should skip over uncorrected errors, but log - * everything else. 
- */ - if (!mca_cfg.ser) { - if (m->status & MCI_STATUS_UC) - continue; - goto log_it; - } - - /* Log "not enabled" (speculative) errors */ - if (!(m->status & MCI_STATUS_EN)) - goto log_it; - - /* - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") - * UC == 1 && PCC == 0 && S == 0 - */ - if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) - goto log_it; - - /* - * Skip anything else. Presumption is that our read of this - * bank is racing with a machine check. Leave the log alone - * for do_machine_check() to deal with it. - */ - continue; - -log_it: mce_read_aux(&err, i); m->severity = mce_severity(m, NULL, NULL, false); /* From 5c6f123c419b6e20f84ac1683089a52f449273aa Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:36 +0000 Subject: [PATCH 1426/3206] x86/mce: Add a clear_bank() helper Add a helper at the end of the MCA polling function to collect vendor and/or feature actions. Start with a basic skeleton for now. Actions for AMD thresholding and deferred errors will be added later. [ bp: Drop the obvious comment too. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 5 +++++ arch/x86/kernel/cpu/mce/core.c | 15 ++++++++++----- arch/x86/kernel/cpu/mce/internal.h | 3 +++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index b8aed0ac765c91..d6906442f49bf9 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -955,6 +955,11 @@ static void amd_threshold_interrupt(void) } } +void amd_clear_bank(struct mce *m) +{ + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Sysfs Interface */ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 5dec0da6169e60..460e90a1a0b172 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -423,7 +423,7 @@ noinstr u64 mce_rdmsrq(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrq(u32 msr, u64 v) +noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -760,6 +760,14 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) return true; } +static void clear_bank(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD) + return amd_clear_bank(m); + + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. @@ -831,10 +839,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_log(&err); clear_it: - /* - * Clear state for this bank. 
- */ - mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + clear_bank(m); } /* diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 6cb2995f0ec1d7..b0e00ec5cc8c9f 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -269,6 +269,7 @@ void mce_threshold_create_device(unsigned int cpu); void mce_threshold_remove_device(unsigned int cpu); extern bool amd_filter_mce(struct mce *m); bool amd_mce_usable_address(struct mce *m); +void amd_clear_bank(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -300,6 +301,7 @@ static inline void mce_threshold_create_device(unsigned int cpu) { } static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline bool amd_mce_usable_address(struct mce *m) { return false; } +static inline void amd_clear_bank(struct mce *m) { } static inline void smca_extract_err_addr(struct mce *m) { } static inline void smca_bsp_init(void) { } #endif @@ -319,6 +321,7 @@ static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif noinstr u64 mce_rdmsrq(u32 msr); +noinstr void mce_wrmsrq(u32 msr, u64 v); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { From 002ebddd695a53999550e241b71950f1aa0e1ac4 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:20 +0200 Subject: [PATCH 1427/3206] pmdomain: core: Restore behaviour for disabling unused PM domains Recent changes to genpd prevent those PM domains that are powered-on during initialization from being powered-off during the boot sequence. Based upon whether CONFIG_PM_GENERIC_DOMAINS_OF is set or not, genpd relies on the sync_state mechanism or genpd_power_off_unused() (which is a late_initcall_sync) to understand when it's okay to allow these PM domains to be powered-off. This new behaviour in genpd has led to problems on different platforms. Let's therefore restore the behaviour of genpd_power_off_unused(). Moreover, let's introduce GENPD_FLAG_NO_STAY_ON, to allow genpd OF providers to opt out of the new behaviour.
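For a genpd provider, opting out is a one-line affair; a minimal sketch (the provider function name is invented, while the flag and pm_genpd_init() are the real interfaces) of what the follow-up patches below do:

	static int example_pd_register(struct generic_pm_domain *genpd)
	{
		/* Allow power-off without waiting for ->sync_state(). */
		genpd->flags |= GENPD_FLAG_NO_STAY_ON;

		return pm_genpd_init(genpd, NULL, false /* is_off */);
	}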
Link: https://lore.kernel.org/all/20250701114733.636510-1-ulf.hansson@linaro.org/ Reported-by: Geert Uytterhoeven Link: https://lore.kernel.org/all/20250902-rk3576-lockup-regression-v1-1-c4a0c9daeb00@collabora.com/ Reported-by: Nicolas Frattaroli Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Tested-by: Heiko Stuebner Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 20 ++++++++++++++------ include/linux/pm_domain.h | 7 +++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index 0006ab3d078972..61c2277c9ce39f 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -187,6 +187,7 @@ static const struct genpd_lock_ops genpd_raw_spin_ops = { #define genpd_is_opp_table_fw(genpd) (genpd->flags & GENPD_FLAG_OPP_TABLE_FW) #define genpd_is_dev_name_fw(genpd) (genpd->flags & GENPD_FLAG_DEV_NAME_FW) #define genpd_is_no_sync_state(genpd) (genpd->flags & GENPD_FLAG_NO_SYNC_STATE) +#define genpd_is_no_stay_on(genpd) (genpd->flags & GENPD_FLAG_NO_STAY_ON) static inline bool irq_safe_dev_in_sleep_domain(struct device *dev, const struct generic_pm_domain *genpd) @@ -1357,7 +1358,6 @@ static int genpd_runtime_resume(struct device *dev) return ret; } -#ifndef CONFIG_PM_GENERIC_DOMAINS_OF static bool pd_ignore_unused; static int __init pd_ignore_unused_setup(char *__unused) { @@ -1382,9 +1382,6 @@ static int __init genpd_power_off_unused(void) mutex_lock(&gpd_list_lock); list_for_each_entry(genpd, &gpd_list, gpd_list_node) { - genpd_lock(genpd); - genpd->stay_on = false; - genpd_unlock(genpd); genpd_queue_power_off_work(genpd); } @@ -1393,7 +1390,6 @@ static int __init genpd_power_off_unused(void) return 0; } late_initcall_sync(genpd_power_off_unused); -#endif #ifdef CONFIG_PM_SLEEP @@ -2367,6 +2363,18 @@ static void genpd_lock_init(struct generic_pm_domain *genpd) } } +#ifdef CONFIG_PM_GENERIC_DOMAINS_OF +static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off) +{ + genpd->stay_on = !genpd_is_no_stay_on(genpd) && !is_off; +} +#else +static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off) +{ + genpd->stay_on = false; +} +#endif + /** * pm_genpd_init - Initialize a generic I/O PM domain object. * @genpd: PM domain object to initialize. @@ -2392,7 +2400,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd, INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); atomic_set(&genpd->sd_count, 0); genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON; - genpd->stay_on = !is_off; + genpd_set_stay_on(genpd, is_off); genpd->sync_state = GENPD_SYNC_STATE_OFF; genpd->device_count = 0; genpd->provider = NULL; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index c84edf217819b4..f67a2cb7d78148 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -115,6 +115,12 @@ struct dev_pm_domain_list { * genpd provider specific way, likely through a * parent device node. This flag makes genpd to * skip its internal support for this. + * + * GENPD_FLAG_NO_STAY_ON: For genpd OF providers a powered-on PM domain at + * initialization is prevented from being + * powered-off until the ->sync_state() callback is + * invoked. This flag informs genpd to allow a + * power-off without waiting for ->sync_state(). 
*/ #define GENPD_FLAG_PM_CLK (1U << 0) #define GENPD_FLAG_IRQ_SAFE (1U << 1) @@ -126,6 +132,7 @@ struct dev_pm_domain_list { #define GENPD_FLAG_OPP_TABLE_FW (1U << 7) #define GENPD_FLAG_DEV_NAME_FW (1U << 8) #define GENPD_FLAG_NO_SYNC_STATE (1U << 9) +#define GENPD_FLAG_NO_STAY_ON (1U << 10) enum gpd_status { GENPD_STATE_ON = 0, /* PM domain is on */ From 2bc12a8199a0f954d003faa4ad8d100a0b19d85a Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:21 +0200 Subject: [PATCH 1428/3206] pmdomain: rockchip: Fix regulator dependency with GENPD_FLAG_NO_STAY_ON The deferred regulator retrieval for Rockchip PM domains is causing some weird dependencies. More precisely, if the power-domain is powered-on from the HW perspective, its corresponding regulator must not be powered-off via regulator_init_complete(), which is a late_initcall_sync. Even platforms that don't have the domain-supply regulator specified for the power-domain provider may suffer from these problems. More precisely, things just happened to work before, because genpd_power_off_unused() (also a late_initcall_sync) managed to power-off the PM domain before regulator_init_complete() powered-off the regulator. Ideally this fragile dependency must be fixed properly for the Rockchip PM domains, but until then, let's fall back to the previous behaviour by using the GENPD_FLAG_NO_STAY_ON flag. Link: https://lore.kernel.org/all/20250902-rk3576-lockup-regression-v1-1-c4a0c9daeb00@collabora.com/ Reported-by: Nicolas Frattaroli Cc: Heiko Stuebner Cc: Sebastian Reichel Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Tested-by: Nicolas Frattaroli Tested-by: Heiko Stuebner Acked-by: Heiko Stuebner Signed-off-by: Ulf Hansson --- drivers/pmdomain/rockchip/pm-domains.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pmdomain/rockchip/pm-domains.c b/drivers/pmdomain/rockchip/pm-domains.c index 242570c505fb65..1955c6d453e4f6 100644 --- a/drivers/pmdomain/rockchip/pm-domains.c +++ b/drivers/pmdomain/rockchip/pm-domains.c @@ -865,7 +865,7 @@ static int rockchip_pm_add_one_domain(struct rockchip_pmu *pmu, pd->genpd.power_on = rockchip_pd_power_on; pd->genpd.attach_dev = rockchip_pd_attach_dev; pd->genpd.detach_dev = rockchip_pd_detach_dev; - pd->genpd.flags = GENPD_FLAG_PM_CLK; + pd->genpd.flags = GENPD_FLAG_PM_CLK | GENPD_FLAG_NO_STAY_ON; if (pd_info->active_wakeup) pd->genpd.flags |= GENPD_FLAG_ACTIVE_WAKEUP; pm_genpd_init(&pd->genpd, NULL, From 36bbaec460db0fb2f6d2641470b4a6f3891a492a Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:22 +0200 Subject: [PATCH 1429/3206] pmdomain: renesas: rcar-sysc: Don't keep unused PM domains powered-on The recent changes to genpd make a genpd OF provider that is powered-on at initialization stay powered-on until the ->sync_state() callback is invoked for it. This may not happen at all if we wait for a consumer device to be probed, leading to wasted energy. There are ways to enforce the ->sync_state() callback to be invoked, through sysfs or via the probe-defer-timeout, but none of them in their current form is a good fit for rcar-sysc PM domains. Let's therefore opt out of this behaviour of genpd for now, by using the GENPD_FLAG_NO_STAY_ON flag.
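For background, the ->sync_state() hand-off mentioned above is a driver-core callback; here is a hedged sketch (driver and function names invented) of how a provider driver would receive it once all of its consumers have probed:

	#include <linux/platform_device.h>

	static int example_provider_probe(struct platform_device *pdev)
	{
		return 0;	/* register the genpd domains here */
	}

	static void example_provider_sync_state(struct device *dev)
	{
		/* All consumers have probed: boot-time constraints such as
		 * "keep powered-on domains on" can be dropped now. */
	}

	static struct platform_driver example_provider_driver = {
		.probe = example_provider_probe,
		.driver = {
			.name = "example-genpd-provider",
			.sync_state = example_provider_sync_state,
		},
	};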
Link: https://lore.kernel.org/all/20250701114733.636510-1-ulf.hansson@linaro.org/ Reported-by: Geert Uytterhoeven Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/rcar-sysc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pmdomain/renesas/rcar-sysc.c b/drivers/pmdomain/renesas/rcar-sysc.c index 2d4161170c63d0..d8a8ffcde38d8b 100644 --- a/drivers/pmdomain/renesas/rcar-sysc.c +++ b/drivers/pmdomain/renesas/rcar-sysc.c @@ -241,6 +241,7 @@ static int __init rcar_sysc_pd_setup(struct rcar_sysc_pd *pd) } } + genpd->flags |= GENPD_FLAG_NO_STAY_ON; genpd->power_off = rcar_sysc_pd_power_off; genpd->power_on = rcar_sysc_pd_power_on; From d929e42deaf7e2d165ac702421d02eeb9d544c70 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:23 +0200 Subject: [PATCH 1430/3206] pmdomain: renesas: rcar-gen4-sysc: Don't keep unused PM domains powered-on The recent changes to genpd make a genpd OF provider that is powered-on at initialization stay powered-on until the ->sync_state() callback is invoked for it. This may not happen at all if we wait for a consumer device to be probed, leading to wasted energy. There are ways to enforce the ->sync_state() callback to be invoked, through sysfs or via the probe-defer-timeout, but none of them in their current form is a good fit for rcar-gen4-sysc PM domains. Let's therefore opt out of this behaviour of genpd for now, by using the GENPD_FLAG_NO_STAY_ON flag. Link: https://lore.kernel.org/all/20250701114733.636510-1-ulf.hansson@linaro.org/ Reported-by: Geert Uytterhoeven Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/rcar-gen4-sysc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pmdomain/renesas/rcar-gen4-sysc.c b/drivers/pmdomain/renesas/rcar-gen4-sysc.c index 5aa7fa1df8fe58..7434bf42d21562 100644 --- a/drivers/pmdomain/renesas/rcar-gen4-sysc.c +++ b/drivers/pmdomain/renesas/rcar-gen4-sysc.c @@ -251,6 +251,7 @@ static int __init rcar_gen4_sysc_pd_setup(struct rcar_gen4_sysc_pd *pd) genpd->detach_dev = cpg_mssr_detach_dev; } + genpd->flags |= GENPD_FLAG_NO_STAY_ON; genpd->power_off = rcar_gen4_sysc_pd_power_off; genpd->power_on = rcar_gen4_sysc_pd_power_on; From 303010f4658cb134eb27cee88026fb5d065a48cd Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:24 +0200 Subject: [PATCH 1431/3206] pmdomain: renesas: rmobile-sysc: Don't keep unused PM domains powered-on The recent changes to genpd make a genpd OF provider that is powered-on at initialization stay powered-on until the ->sync_state() callback is invoked for it. This may not happen at all if we wait for a consumer device to be probed, leading to wasted energy. There are ways to enforce the ->sync_state() callback to be invoked, through sysfs or via the probe-defer-timeout, but none of them in their current form is a good fit for rmobile-sysc PM domains. Let's therefore opt out of this behaviour of genpd for now, by using the GENPD_FLAG_NO_STAY_ON flag.
Link: https://lore.kernel.org/all/20250701114733.636510-1-ulf.hansson@linaro.org/ Reported-by: Geert Uytterhoeven Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/rmobile-sysc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pmdomain/renesas/rmobile-sysc.c b/drivers/pmdomain/renesas/rmobile-sysc.c index 8eedc9a1d82501..a6bf7295e909e7 100644 --- a/drivers/pmdomain/renesas/rmobile-sysc.c +++ b/drivers/pmdomain/renesas/rmobile-sysc.c @@ -100,7 +100,8 @@ static void rmobile_init_pm_domain(struct rmobile_pm_domain *rmobile_pd) struct generic_pm_domain *genpd = &rmobile_pd->genpd; struct dev_power_governor *gov = rmobile_pd->gov; - genpd->flags |= GENPD_FLAG_PM_CLK | GENPD_FLAG_ACTIVE_WAKEUP; + genpd->flags |= GENPD_FLAG_PM_CLK | GENPD_FLAG_ACTIVE_WAKEUP | + GENPD_FLAG_NO_STAY_ON; genpd->attach_dev = cpg_mstp_attach_dev; genpd->detach_dev = cpg_mstp_detach_dev; From f0addd325ef692c92c522a2ba4d9db13fc90e664 Mon Sep 17 00:00:00 2001 From: Alexander Kurz Date: Mon, 11 Aug 2025 06:43:58 +0000 Subject: [PATCH 1432/3206] mfd: input: rtc: mc13783: Remove deprecated mc13xxx_irq_ack() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mc13xxx_irq_ack() got deprecated and became dead code with commit 10f9edaeaa30 ("mfd: mc13xxx: Use regmap irq framework for interrupts"). It should be safe to remove it now. Signed-off-by: Alexander Kurz Acked-by: Alexandre Belloni Acked-by: Uwe Kleine-König Acked-by: Dmitry Torokhov # for input Link: https://lore.kernel.org/r/20250811064358.1659-1-akurz@blala.de Signed-off-by: Lee Jones --- drivers/input/misc/mc13783-pwrbutton.c | 1 - drivers/input/touchscreen/mc13783_ts.c | 4 ---- drivers/rtc/rtc-mc13xxx.c | 13 ------------- include/linux/mfd/mc13xxx.h | 6 ------ 4 files changed, 24 deletions(-) diff --git a/drivers/input/misc/mc13783-pwrbutton.c b/drivers/input/misc/mc13783-pwrbutton.c index 1c7faa9b7afe04..b83d762ae2e9f0 100644 --- a/drivers/input/misc/mc13783-pwrbutton.c +++ b/drivers/input/misc/mc13783-pwrbutton.c @@ -57,7 +57,6 @@ static irqreturn_t button_irq(int irq, void *_priv) struct mc13783_pwrb *priv = _priv; int val; - mc13xxx_irq_ack(priv->mc13783, irq); mc13xxx_reg_read(priv->mc13783, MC13783_REG_INTERRUPT_SENSE_1, &val); switch (irq) { diff --git a/drivers/input/touchscreen/mc13783_ts.c b/drivers/input/touchscreen/mc13783_ts.c index 33635da8507999..47b8da00027fd5 100644 --- a/drivers/input/touchscreen/mc13783_ts.c +++ b/drivers/input/touchscreen/mc13783_ts.c @@ -42,8 +42,6 @@ static irqreturn_t mc13783_ts_handler(int irq, void *data) { struct mc13783_ts_priv *priv = data; - mc13xxx_irq_ack(priv->mc13xxx, irq); - /* * Kick off reading coordinates. 
Note that if work happens already * be queued for future execution (it rearms itself) it will not @@ -137,8 +135,6 @@ static int mc13783_ts_open(struct input_dev *dev) mc13xxx_lock(priv->mc13xxx); - mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_TS); - ret = mc13xxx_irq_request(priv->mc13xxx, MC13XXX_IRQ_TS, mc13783_ts_handler, MC13783_TS_NAME, priv); if (ret) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index e7b87130e6248d..2494d13fd767e9 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -137,10 +137,6 @@ static int mc13xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) } if (!priv->valid) { - ret = mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_RTCRST); - if (unlikely(ret)) - goto out; - ret = mc13xxx_irq_unmask(priv->mc13xxx, MC13XXX_IRQ_RTCRST); } @@ -208,10 +204,6 @@ static int mc13xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) if (unlikely(ret)) goto out; - ret = mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_TODA); - if (unlikely(ret)) - goto out; - s1970 = rtc_tm_to_time64(&alarm->time); dev_dbg(dev, "%s: %s %lld\n", __func__, alarm->enabled ? "on" : "off", @@ -239,12 +231,9 @@ static int mc13xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) static irqreturn_t mc13xxx_rtc_alarm_handler(int irq, void *dev) { struct mc13xxx_rtc *priv = dev; - struct mc13xxx *mc13xxx = priv->mc13xxx; rtc_update_irq(priv->rtc, 1, RTC_IRQF | RTC_AF); - mc13xxx_irq_ack(mc13xxx, irq); - return IRQ_HANDLED; } @@ -293,8 +282,6 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) mc13xxx_lock(mc13xxx); - mc13xxx_irq_ack(mc13xxx, MC13XXX_IRQ_RTCRST); - ret = mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_RTCRST, mc13xxx_rtc_reset_handler, DRIVER_NAME, priv); if (ret) diff --git a/include/linux/mfd/mc13xxx.h b/include/linux/mfd/mc13xxx.h index f372926d5894fb..dd46fe424a8091 100644 --- a/include/linux/mfd/mc13xxx.h +++ b/include/linux/mfd/mc13xxx.h @@ -31,12 +31,6 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode, unsigned int channel, u8 ato, bool atox, unsigned int *sample); -/* Deprecated calls */ -static inline int mc13xxx_irq_ack(struct mc13xxx *mc13xxx, int irq) -{ - return 0; -} - static inline int mc13xxx_irq_request_nounmask(struct mc13xxx *mc13xxx, int irq, irq_handler_t handler, const char *name, void *dev) From 87c0881bd734a2cea12076716d5d9a050789d4b7 Mon Sep 17 00:00:00 2001 From: Stefan Kerkmann Date: Wed, 10 Sep 2025 11:34:05 +0200 Subject: [PATCH 1433/3206] ASoC: dt-bindings: ti,pcm1754: add binding documentation The Texas Instruments PCM1754 is a simple stereo DAC without any digital management interface, but soft mute, PCM input format, and 44.1 kHz digital de-emphasis can be configured via strapping pins. Only soft mute and PCM input format selection are currently exposed via optional GPIOs in the driver.
Signed-off-by: Stefan Kerkmann Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250910-v6-12-topic-pcm1754-v2-1-0917dbe73c65@pengutronix.de Signed-off-by: Mark Brown --- .../devicetree/bindings/sound/ti,pcm1754.yaml | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 Documentation/devicetree/bindings/sound/ti,pcm1754.yaml diff --git a/Documentation/devicetree/bindings/sound/ti,pcm1754.yaml b/Documentation/devicetree/bindings/sound/ti,pcm1754.yaml new file mode 100644 index 00000000000000..a757f737690c18 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/ti,pcm1754.yaml @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/ti,pcm1754.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Texas Instruments PCM1754 Stereo DAC + +description: + The PCM1754 is a simple stereo DAC that is controlled via hardware gpios. + +maintainers: + - Stefan Kerkmann + +allOf: + - $ref: dai-common.yaml# + +properties: + compatible: + enum: + - ti,pcm1754 + + vcc-supply: true + + '#sound-dai-cells': + const: 0 + + format-gpios: + maxItems: 1 + description: + GPIO used to select the PCM format + + mute-gpios: + maxItems: 1 + description: + GPIO used to mute all outputs + +required: + - compatible + - '#sound-dai-cells' + - vcc-supply + +additionalProperties: false + +examples: + - | + #include + codec { + compatible = "ti,pcm1754"; + #sound-dai-cells = <0>; + + vcc-supply = <&vcc_reg>; + mute-gpios = <&gpio 0 GPIO_ACTIVE_HIGH>; + format-gpios = <&gpio 1 GPIO_ACTIVE_HIGH>; + }; From 1217b573978482ae7d21dc5c0bf5aa5007b24f90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alvin=20=C5=A0ipraga?= Date: Wed, 10 Sep 2025 11:34:06 +0200 Subject: [PATCH 1434/3206] ASoC: codecs: pcm1754: add pcm1754 dac driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Texas Instruments PCM1754[1] is a simple stereo DAC without any digital management interface, but soft mute, PCM input format, and 44.1 kHz digital de-emphasis can be configured via strapping pins. Only soft mute and PCM input format selection are currently exposed via optional GPIOs in the driver.
[1]: https://www.ti.com/product/PCM1754 Signed-off-by: Alvin Šipraga Co-developed-by: Stefan Kerkmann Signed-off-by: Stefan Kerkmann Link: https://patch.msgid.link/20250910-v6-12-topic-pcm1754-v2-2-0917dbe73c65@pengutronix.de Signed-off-by: Mark Brown --- sound/soc/codecs/Kconfig | 5 + sound/soc/codecs/Makefile | 2 + sound/soc/codecs/pcm1754.c | 185 +++++++++++++++++++++++++++++++++++++ 3 files changed, 192 insertions(+) create mode 100644 sound/soc/codecs/pcm1754.c diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 6d7e4725d89cd3..423cc02f76bce6 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -177,6 +177,7 @@ config SND_SOC_ALL_CODECS imply SND_SOC_NAU8825 imply SND_SOC_HDMI_CODEC imply SND_SOC_PCM1681 + imply SND_SOC_PCM1754 imply SND_SOC_PCM1789_I2C imply SND_SOC_PCM179X_I2C imply SND_SOC_PCM179X_SPI @@ -1426,6 +1427,10 @@ config SND_SOC_PCM1681 tristate "Texas Instruments PCM1681 CODEC" depends on I2C +config SND_SOC_PCM1754 + tristate "Texas Instruments PCM1754 CODEC" + depends on GPIOLIB + config SND_SOC_PCM1789 tristate diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index a68c3d192a1b6c..af177e271dda8d 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -201,6 +201,7 @@ snd-soc-ntp8918-y := ntp8918.o snd-soc-ntpfw-y := ntpfw.o snd-soc-hdmi-codec-y := hdmi-codec.o snd-soc-pcm1681-y := pcm1681.o +snd-soc-pcm1754-y := pcm1754.o snd-soc-pcm1789-codec-y := pcm1789.o snd-soc-pcm1789-i2c-y := pcm1789-i2c.o snd-soc-pcm179x-codec-y := pcm179x.o @@ -621,6 +622,7 @@ obj-$(CONFIG_SND_SOC_NTP8918) += snd-soc-ntp8918.o obj-$(CONFIG_SND_SOC_NTPFW) += snd-soc-ntpfw.o obj-$(CONFIG_SND_SOC_HDMI_CODEC) += snd-soc-hdmi-codec.o obj-$(CONFIG_SND_SOC_PCM1681) += snd-soc-pcm1681.o +obj-$(CONFIG_SND_SOC_PCM1754) += snd-soc-pcm1754.o obj-$(CONFIG_SND_SOC_PCM179X) += snd-soc-pcm179x-codec.o obj-$(CONFIG_SND_SOC_PCM1789_I2C) += snd-soc-pcm1789-i2c.o obj-$(CONFIG_SND_SOC_PCM1789) += snd-soc-pcm1789-codec.o diff --git a/sound/soc/codecs/pcm1754.c b/sound/soc/codecs/pcm1754.c new file mode 100644 index 00000000000000..b68a528000be89 --- /dev/null +++ b/sound/soc/codecs/pcm1754.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PCM1754 DAC ASoC codec driver + * + * Copyright (c) 2022 Alvin Šipraga + * Copyright (c) 2025 Stefan Kerkmann + */ + +#include +#include +#include + +#include +#include + +struct pcm1754_priv { + unsigned int format; + struct gpio_desc *gpiod_mute; + struct gpio_desc *gpiod_format; +}; + +static int pcm1754_set_dai_fmt(struct snd_soc_dai *codec_dai, + unsigned int format) +{ + struct snd_soc_component *component = codec_dai->component; + struct pcm1754_priv *priv = snd_soc_component_get_drvdata(component); + + priv->format = format; + + return 0; +} + +static int pcm1754_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *codec_dai) +{ + struct snd_soc_component *component = codec_dai->component; + struct pcm1754_priv *priv = snd_soc_component_get_drvdata(component); + int format; + + switch (priv->format & SND_SOC_DAIFMT_FORMAT_MASK) { + case SND_SOC_DAIFMT_RIGHT_J: + switch (params_width(params)) { + case 16: + format = 1; + break; + default: + return -EINVAL; + } + break; + case SND_SOC_DAIFMT_I2S: + switch (params_width(params)) { + case 16: + fallthrough; + case 24: + format = 0; + break; + default: + return -EINVAL; + } + break; + default: + dev_err(component->dev, "Invalid DAI format\n"); + return -EINVAL; + } + + 
gpiod_set_value_cansleep(priv->gpiod_format, format); + + return 0; +} + +static int pcm1754_mute_stream(struct snd_soc_dai *dai, int mute, int stream) +{ + struct pcm1754_priv *priv = snd_soc_component_get_drvdata(dai->component); + + gpiod_set_value_cansleep(priv->gpiod_mute, mute); + + return 0; +} + +static const struct snd_soc_dai_ops pcm1754_dai_ops = { + .set_fmt = pcm1754_set_dai_fmt, + .hw_params = pcm1754_hw_params, + .mute_stream = pcm1754_mute_stream, +}; + +static const struct snd_soc_dai_driver pcm1754_dai = { + .name = "pcm1754", + .playback = { + .stream_name = "Playback", + .channels_min = 2, + .channels_max = 2, + .rates = SNDRV_PCM_RATE_CONTINUOUS, + .rate_min = 5000, + .rate_max = 200000, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE + }, + .ops = &pcm1754_dai_ops, +}; + +static const struct snd_soc_dapm_widget pcm1754_dapm_widgets[] = { + SND_SOC_DAPM_REGULATOR_SUPPLY("VCC", 0, 0), + + SND_SOC_DAPM_DAC("DAC1", "Channel 1 Playback", SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_DAC("DAC2", "Channel 2 Playback", SND_SOC_NOPM, 0, 0), + + SND_SOC_DAPM_OUTPUT("VOUTL"), + SND_SOC_DAPM_OUTPUT("VOUTR"), +}; + +static const struct snd_soc_dapm_route pcm1754_dapm_routes[] = { + { "DAC1", NULL, "Playback" }, + { "DAC2", NULL, "Playback" }, + + { "DAC1", NULL, "VCC" }, + { "DAC2", NULL, "VCC" }, + + { "VOUTL", NULL, "DAC1" }, + { "VOUTR", NULL, "DAC2" }, +}; + +static const struct snd_soc_component_driver soc_component_dev_pcm1754 = { + .dapm_widgets = pcm1754_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(pcm1754_dapm_widgets), + .dapm_routes = pcm1754_dapm_routes, + .num_dapm_routes = ARRAY_SIZE(pcm1754_dapm_routes), +}; + +static int pcm1754_probe(struct platform_device *pdev) +{ + struct pcm1754_priv *priv; + struct device *dev = &pdev->dev; + struct snd_soc_dai_driver *dai_drv; + int ret; + + dai_drv = devm_kmemdup(dev, &pcm1754_dai, sizeof(*dai_drv), GFP_KERNEL); + if (!dai_drv) + return -ENOMEM; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->gpiod_mute = devm_gpiod_get_optional(dev, "mute", GPIOD_OUT_HIGH); + if (IS_ERR(priv->gpiod_mute)) + return dev_err_probe(dev, PTR_ERR(priv->gpiod_mute), + "failed to get mute gpio"); + + priv->gpiod_format = devm_gpiod_get_optional(dev, "format", GPIOD_OUT_LOW); + if (IS_ERR(priv->gpiod_format)) + return dev_err_probe(dev, PTR_ERR(priv->gpiod_format), + "failed to get format gpio"); + + dev_set_drvdata(dev, priv); + + ret = devm_snd_soc_register_component( + &pdev->dev, &soc_component_dev_pcm1754, dai_drv, 1); + if (ret) + return dev_err_probe(dev, ret, "failed to register"); + + return 0; +} + +#ifdef CONFIG_OF +static const struct of_device_id pcm1754_of_match[] = { + { .compatible = "ti,pcm1754" }, + { } +}; +MODULE_DEVICE_TABLE(of, pcm1754_of_match); +#endif + +static struct platform_driver pcm1754_codec_driver = { + .driver = { + .name = "pcm1754-codec", + .of_match_table = of_match_ptr(pcm1754_of_match), + }, + .probe = pcm1754_probe, +}; + +module_platform_driver(pcm1754_codec_driver); + +MODULE_DESCRIPTION("ASoC PCM1754 driver"); +MODULE_AUTHOR("Alvin Šipraga "); +MODULE_AUTHOR("Stefan Kerkmann "); +MODULE_LICENSE("GPL"); From c7ec58c39b0252e6635dde55e5c708132ab25cfc Mon Sep 17 00:00:00 2001 From: Wasim Nazir Date: Mon, 8 Sep 2025 13:49:55 +0530 Subject: [PATCH 1435/3206] dt-bindings: eeprom: at24: Add compatible for Giantec GT24C256C The gt24c256c is another 24c256 compatible EEPROM, and does not follow the generic name matching, so add a separate compatible for it. 
This ensures accurate device-tree representation and enables proper kernel support for systems using this part. Acked-by: Krzysztof Kozlowski Signed-off-by: Wasim Nazir Link: https://lore.kernel.org/r/20250908-lemans-evk-bu-v4-5-5c319c696a7d@oss.qualcomm.com Signed-off-by: Bartosz Golaszewski --- Documentation/devicetree/bindings/eeprom/at24.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/eeprom/at24.yaml b/Documentation/devicetree/bindings/eeprom/at24.yaml index 0ac68646c07779..50af7ccf6e21ab 100644 --- a/Documentation/devicetree/bindings/eeprom/at24.yaml +++ b/Documentation/devicetree/bindings/eeprom/at24.yaml @@ -143,6 +143,7 @@ properties: - const: atmel,24c128 - items: - enum: + - giantec,gt24c256c - puya,p24c256c - const: atmel,24c256 - items: From 2c2529e470627a8de22ec366d2f4c3146fb3fe96 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 29 Aug 2025 10:51:41 +0100 Subject: [PATCH 1436/3206] arm64: sysreg: Fix and tidy up sysreg field definitions Fix the value of ID_PFR1_EL1.Security NSACR_RFR to be 0b0010, as per DDI0601/2025-06, which wasn't correctly set when introduced in commit 1224308075f1 ("arm64/sysreg: Convert ID_PFR1_EL1 to automatic generation"). While at it, remove redundant definitions of CPACR_EL12 and RCWSMASK_EL1 and fix some typos. Reviewed-by: Anshuman Khandual Acked-by: Marc Zyngier Signed-off-by: Fuad Tabba Acked-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 696ab1f32a6749..f1a012ee0db61c 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -31,7 +31,7 @@ # Mapping # EndSysreg -# Where multiple system regsiters are not VHE aliases but share a +# Where multiple system registers are not VHE aliases but share a # common layout, a SysregFields block can be used to describe the # shared layout: @@ -54,7 +54,7 @@ # # In general it is recommended that new enumeration items be named for the # feature that introduces them (eg, FEAT_LS64_ACCDATA introduces enumeration -# item ACCDATA) though it may be more taseful to do something else. +# item ACCDATA) though it may be more tasteful to do something else. Sysreg OSDTRRX_EL1 2 0 0 0 2 Res0 63:32 @@ -474,7 +474,7 @@ EndEnum Enum 7:4 Security 0b0000 NI 0b0001 EL3 - 0b0001 NSACR_RFR + 0b0010 NSACR_RFR EndEnum UnsignedEnum 3:0 ProgMod 0b0000 NI @@ -2528,10 +2528,6 @@ Field 17:16 ZEN Res0 15:0 EndSysreg -Sysreg CPACR_EL12 3 5 1 0 2 -Mapping CPACR_EL1 -EndSysreg - Sysreg CPACRALIAS_EL1 3 0 1 4 4 Mapping CPACR_EL1 EndSysreg @@ -2576,10 +2572,6 @@ Sysreg PFAR_EL12 3 5 6 0 5 Mapping PFAR_EL1 EndSysreg -Sysreg RCWSMASK_EL1 3 0 13 0 3 -Field 63:0 RCWSMASK -EndSysreg - Sysreg SCTLR2_EL1 3 0 1 0 3 Res0 63:13 Field 12 CPTM0 From f4d4ebc84995178273740f3e601e97fdefc561d2 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 29 Aug 2025 10:51:42 +0100 Subject: [PATCH 1437/3206] arm64: sysreg: Correct sign definitions for EIESB and DoubleLock The `ID_AA64MMFR4_EL1.EIESB` field, is an unsigned enumeration, but was incorrectly defined as a `SignedEnum` when introduced in commit cfc680bb04c5 ("arm64: sysreg: Add layout for ID_AA64MMFR4_EL1"). This is corrected to `UnsignedEnum`. Conversely, the `ID_AA64DFR0_EL1.DoubleLock` field, is a signed enumeration, but was incorrectly defined as an `UnsignedEnum`. 
This is corrected to `SignedEnum`, which wasn't correctly set when annotated as such in commit ad16d4cf0b4f ("arm64/sysreg: Initial unsigned annotations for ID registers"). Signed-off-by: Fuad Tabba Acked-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index f1a012ee0db61c..d396fa587ec182 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1693,7 +1693,7 @@ UnsignedEnum 43:40 TraceFilt 0b0000 NI 0b0001 IMP EndEnum -UnsignedEnum 39:36 DoubleLock +SignedEnum 39:36 DoubleLock 0b0000 IMP 0b1111 NI EndEnum @@ -2409,7 +2409,7 @@ UnsignedEnum 11:8 ASID2 0b0000 NI 0b0001 IMP EndEnum -SignedEnum 7:4 EIESB +UnsignedEnum 7:4 EIESB 0b0000 NI 0b0001 ToEL3 0b0010 ToELx From 382cbbe7fb2ae841a7e7b4c40a02f12afc803c69 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 29 Aug 2025 10:51:43 +0100 Subject: [PATCH 1438/3206] arm64: sysreg: Add validation checks to sysreg header generation script The gen_sysreg.awk script processes the system register specification in the sysreg text file to generate C macro definitions. The current script will silently accept certain errors in the specification file, leading to incorrect header generation. For example, a Sysreg or SysregFields can be accidentally duplicated, causing its macros to be emitted twice. An Enum can contain duplicate values for different items, which is architecturally incorrect. Add checks to catch these errors at build time. The script now tracks all seen Sysreg and SysregFields definitions and checks for duplicates. It also tracks values within each Enum block to ensure entries are unique. Acked-by: Marc Zyngier Signed-off-by: Fuad Tabba Acked-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/tools/gen-sysreg.awk | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk index f2a1732cb1f638..bbbb812603e8f0 100755 --- a/arch/arm64/tools/gen-sysreg.awk +++ b/arch/arm64/tools/gen-sysreg.awk @@ -122,6 +122,10 @@ $1 == "SysregFields" && block_current() == "Root" { res1 = "UL(0)" unkn = "UL(0)" + if (reg in defined_fields) + fatal("Duplicate SysregFields definition for " reg) + defined_fields[reg] = 1 + next_bit = 63 next @@ -162,6 +166,10 @@ $1 == "Sysreg" && block_current() == "Root" { res1 = "UL(0)" unkn = "UL(0)" + if (reg in defined_regs) + fatal("Duplicate Sysreg definition for " reg) + defined_regs[reg] = 1 + define("REG_" reg, "S" op0 "_" op1 "_C" crn "_C" crm "_" op2) define("SYS_" reg, "sys_reg(" op0 ", " op1 ", " crn ", " crm ", " op2 ")") @@ -284,6 +292,8 @@ $1 == "SignedEnum" && (block_current() == "Sysreg" || block_current() == "Sysreg define_field(reg, field, msb, lsb) define_field_sign(reg, field, "true") + delete seen_enum_vals + next } @@ -297,6 +307,8 @@ $1 == "UnsignedEnum" && (block_current() == "Sysreg" || block_current() == "Sysr define_field(reg, field, msb, lsb) define_field_sign(reg, field, "false") + delete seen_enum_vals + next } @@ -309,6 +321,8 @@ $1 == "Enum" && (block_current() == "Sysreg" || block_current() == "SysregFields define_field(reg, field, msb, lsb) + delete seen_enum_vals + next } @@ -320,6 +334,8 @@ $1 == "EndEnum" && block_current() == "Enum" { lsb = null print "" + delete seen_enum_vals + block_pop() next } @@ -329,6 +345,10 @@ $1 == "EndEnum" && block_current() == "Enum" { val = $1 name = $2 + if (val in seen_enum_vals) + fatal("Duplicate Enum value " val " 
for " name) + seen_enum_vals[val] = 1 + define(reg "_" field "_" name, "UL(" val ")") next } From 63a796558bc22ec699e4193d5c75534757ddf2e6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 11 Sep 2025 16:33:31 +0200 Subject: [PATCH 1439/3206] Revert "net: usb: asix: ax88772: drop phylink use in PM to avoid MDIO runtime PM wakeups" This reverts commit 5537a4679403 ("net: usb: asix: ax88772: drop phylink use in PM to avoid MDIO runtime PM wakeups"), it breaks operation of asix ethernet usb dongle after system suspend-resume cycle. Link: https://lore.kernel.org/all/b5ea8296-f981-445d-a09a-2f389d7f6fdd@samsung.com/ Fixes: 5537a4679403 ("net: usb: asix: ax88772: drop phylink use in PM to avoid MDIO runtime PM wakeups") Reported-by: Marek Szyprowski Acked-by: Jakub Kicinski Link: https://patch.msgid.link/2945b9dbadb8ee1fee058b19554a5cb14f1763c1.1757601118.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- drivers/net/usb/asix_devices.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index 1e8f7089f5e8a2..792ddda1ad493d 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -607,8 +607,15 @@ static const struct net_device_ops ax88772_netdev_ops = { static void ax88772_suspend(struct usbnet *dev) { + struct asix_common_private *priv = dev->driver_priv; u16 medium; + if (netif_running(dev->net)) { + rtnl_lock(); + phylink_suspend(priv->phylink, false); + rtnl_unlock(); + } + /* Stop MAC operation */ medium = asix_read_medium_status(dev, 1); medium &= ~AX_MEDIUM_RE; @@ -637,6 +644,12 @@ static void ax88772_resume(struct usbnet *dev) for (i = 0; i < 3; i++) if (!priv->reset(dev, 1)) break; + + if (netif_running(dev->net)) { + rtnl_lock(); + phylink_resume(priv->phylink); + rtnl_unlock(); + } } static int asix_resume(struct usb_interface *intf) From 76bc2203a46ef704a4cd8003986f6bd74139a367 Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Thu, 4 Sep 2025 11:05:26 -0500 Subject: [PATCH 1440/3206] dt-bindings: mfd: ti,bq25703a: Add TI BQ25703A Charger Document the Texas instruments BQ25703A series of charger managers/ buck/boost regulators. Signed-off-by: Chris Morgan Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250904160530.66178-2-macroalpha82@gmail.com Signed-off-by: Lee Jones --- .../devicetree/bindings/mfd/ti,bq25703a.yaml | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 Documentation/devicetree/bindings/mfd/ti,bq25703a.yaml diff --git a/Documentation/devicetree/bindings/mfd/ti,bq25703a.yaml b/Documentation/devicetree/bindings/mfd/ti,bq25703a.yaml new file mode 100644 index 00000000000000..ba14663c9266a5 --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/ti,bq25703a.yaml @@ -0,0 +1,117 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mfd/ti,bq25703a.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: BQ25703A Charger Manager/Buck/Boost Converter + +maintainers: + - Chris Morgan + +allOf: + - $ref: /schemas/power/supply/power-supply.yaml# + +properties: + compatible: + const: ti,bq25703a + + reg: + const: 0x6b + + input-current-limit-microamp: + description: + Maximum total input current allowed used for both charging and + powering the device. 
+ minimum: 50000 + maximum: 6400000 + default: 3250000 + + interrupts: + maxItems: 1 + + monitored-battery: + description: + A minimum of constant-charge-current-max-microamp, + constant-charge-voltage-max-microvolt, and + voltage-min-design-microvolt are required. + + regulators: + type: object + additionalProperties: false + description: + Boost converter regulator output of bq257xx. + + properties: + vbus: + type: object + $ref: /schemas/regulator/regulator.yaml + additionalProperties: false + + properties: + regulator-name: true + regulator-min-microamp: + minimum: 0 + maximum: 6350000 + regulator-max-microamp: + minimum: 0 + maximum: 6350000 + regulator-min-microvolt: + minimum: 4480000 + maximum: 20800000 + regulator-max-microvolt: + minimum: 4480000 + maximum: 20800000 + enable-gpios: + description: + The BQ25703 may require both a register write and a GPIO + toggle to enable the boost regulator. + + required: + - regulator-name + - regulator-min-microamp + - regulator-max-microamp + - regulator-min-microvolt + - regulator-max-microvolt + +unevaluatedProperties: false + +required: + - compatible + - reg + - input-current-limit-microamp + - monitored-battery + - power-supplies + +examples: + - | + #include + #include + #include + i2c { + #address-cells = <1>; + #size-cells = <0>; + + bq25703: charger@6b { + compatible = "ti,bq25703a"; + reg = <0x6b>; + input-current-limit-microamp = <5000000>; + interrupt-parent = <&gpio0>; + interrupts = ; + monitored-battery = <&battery>; + power-supplies = <&fusb302>; + + regulators { + usb_otg_vbus: vbus { + enable-gpios = <&gpio4 RK_PA6 GPIO_ACTIVE_HIGH>; + regulator-max-microamp = <960000>; + regulator-max-microvolt = <5088000>; + regulator-min-microamp = <512000>; + regulator-min-microvolt = <4992000>; + regulator-name = "usb_otg_vbus"; + }; + }; + }; + }; + +... From 3b1bbfb5fce3ca9fffc92ac1b053b0cfbb1f322b Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Thu, 4 Sep 2025 11:05:27 -0500 Subject: [PATCH 1441/3206] mfd: bq257xx: Add support for BQ25703A core driver The Texas Instruments BQ25703A is an integrated charger manager and boost converter. The MFD driver initializes the device for the regulator driver and power supply driver. Signed-off-by: Chris Morgan Link: https://lore.kernel.org/r/20250904160530.66178-3-macroalpha82@gmail.com Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 11 ++++ drivers/mfd/Makefile | 1 + drivers/mfd/bq257xx.c | 99 ++++++++++++++++++++++++++++++++++ include/linux/mfd/bq257xx.h | 104 ++++++++++++++++++++++++++++++++++++ 4 files changed, 215 insertions(+) create mode 100644 drivers/mfd/bq257xx.c create mode 100644 include/linux/mfd/bq257xx.h diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1e7..768417c973397e 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1641,6 +1641,17 @@ config MFD_TI_LMU LM36274. It consists of backlight, LED and regulator driver. It provides consistent device controls for lighting functions. +config MFD_BQ257XX + tristate "TI BQ257XX Buck/Boost Charge Controller" + depends on I2C + select MFD_CORE + select REGMAP_I2C + help + Support Texas Instruments BQ25703 Buck/Boost converter with + charge controller. It consists of regulators that provide + system voltage and OTG voltage, and a charger manager for + batteries containing one or more cells. 
+ config MFD_OMAP_USB_HOST bool "TI OMAP USBHS core and TLL driver" depends on USB_EHCI_HCD_OMAP || USB_OHCI_HCD_OMAP3 diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d16..3d700374a42d58 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_MFD_SM501) += sm501.o obj-$(CONFIG_ARCH_BCM2835) += bcm2835-pm.o obj-$(CONFIG_MFD_BCM590XX) += bcm590xx.o obj-$(CONFIG_MFD_BD9571MWV) += bd9571mwv.o +obj-$(CONFIG_MFD_BQ257XX) += bq257xx.o obj-$(CONFIG_MFD_CGBC) += cgbc-core.o obj-$(CONFIG_MFD_CROS_EC_DEV) += cros_ec_dev.o obj-$(CONFIG_MFD_CS42L43) += cs42l43.o diff --git a/drivers/mfd/bq257xx.c b/drivers/mfd/bq257xx.c new file mode 100644 index 00000000000000..e9d49dac0a1670 --- /dev/null +++ b/drivers/mfd/bq257xx.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BQ257XX Core Driver + * Copyright (C) 2025 Chris Morgan + */ + +#include +#include +#include +#include +#include + +static const struct regmap_range bq25703_readonly_reg_ranges[] = { + regmap_reg_range(BQ25703_CHARGER_STATUS, BQ25703_MANUFACT_DEV_ID), +}; + +static const struct regmap_access_table bq25703_writeable_regs = { + .no_ranges = bq25703_readonly_reg_ranges, + .n_no_ranges = ARRAY_SIZE(bq25703_readonly_reg_ranges), +}; + +static const struct regmap_range bq25703_volatile_reg_ranges[] = { + regmap_reg_range(BQ25703_CHARGE_OPTION_0, BQ25703_IIN_HOST), + regmap_reg_range(BQ25703_CHARGER_STATUS, BQ25703_ADC_OPTION), +}; + +static const struct regmap_access_table bq25703_volatile_regs = { + .yes_ranges = bq25703_volatile_reg_ranges, + .n_yes_ranges = ARRAY_SIZE(bq25703_volatile_reg_ranges), +}; + +static const struct regmap_config bq25703_regmap_config = { + .reg_bits = 8, + .val_bits = 16, + .max_register = BQ25703_ADC_OPTION, + .cache_type = REGCACHE_MAPLE, + .wr_table = &bq25703_writeable_regs, + .volatile_table = &bq25703_volatile_regs, + .val_format_endian = REGMAP_ENDIAN_LITTLE, +}; + +static const struct mfd_cell cells[] = { + MFD_CELL_NAME("bq257xx-regulator"), + MFD_CELL_NAME("bq257xx-charger"), +}; + +static int bq257xx_probe(struct i2c_client *client) +{ + struct bq257xx_device *ddata; + int ret; + + ddata = devm_kzalloc(&client->dev, sizeof(*ddata), GFP_KERNEL); + if (!ddata) + return -ENOMEM; + + ddata->client = client; + + ddata->regmap = devm_regmap_init_i2c(client, &bq25703_regmap_config); + if (IS_ERR(ddata->regmap)) { + return dev_err_probe(&client->dev, PTR_ERR(ddata->regmap), + "Failed to allocate register map\n"); + } + + i2c_set_clientdata(client, ddata); + + ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_AUTO, + cells, ARRAY_SIZE(cells), NULL, 0, NULL); + if (ret) + return dev_err_probe(&client->dev, ret, + "Failed to register child devices\n"); + + return ret; +} + +static const struct i2c_device_id bq257xx_i2c_ids[] = { + { "bq25703a" }, + {} +}; +MODULE_DEVICE_TABLE(i2c, bq257xx_i2c_ids); + +static const struct of_device_id bq257xx_of_match[] = { + { .compatible = "ti,bq25703a" }, + {} +}; +MODULE_DEVICE_TABLE(of, bq257xx_of_match); + +static struct i2c_driver bq257xx_driver = { + .driver = { + .name = "bq257xx", + .of_match_table = bq257xx_of_match, + }, + .probe = bq257xx_probe, + .id_table = bq257xx_i2c_ids, +}; +module_i2c_driver(bq257xx_driver); + +MODULE_DESCRIPTION("bq257xx buck/boost/charger driver"); +MODULE_AUTHOR("Chris Morgan "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/bq257xx.h b/include/linux/mfd/bq257xx.h new file mode 100644 index 00000000000000..1d6ddc7fb09fcb --- /dev/null +++ 
b/include/linux/mfd/bq257xx.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Register definitions for TI BQ257XX + * Copyright (C) 2020 Texas Instruments Incorporated - http://www.ti.com/ + */ + +#define BQ25703_CHARGE_OPTION_0 0x00 +#define BQ25703_CHARGE_CURRENT 0x02 +#define BQ25703_MAX_CHARGE_VOLT 0x04 +#define BQ25703_OTG_VOLT 0x06 +#define BQ25703_OTG_CURRENT 0x08 +#define BQ25703_INPUT_VOLTAGE 0x0a +#define BQ25703_MIN_VSYS 0x0c +#define BQ25703_IIN_HOST 0x0e +#define BQ25703_CHARGER_STATUS 0x20 +#define BQ25703_PROCHOT_STATUS 0x22 +#define BQ25703_IIN_DPM 0x24 +#define BQ25703_ADCIBAT_CHG 0x28 +#define BQ25703_ADCIINCMPIN 0x2a +#define BQ25703_ADCVSYSVBAT 0x2c +#define BQ25703_MANUFACT_DEV_ID 0x2e +#define BQ25703_CHARGE_OPTION_1 0x30 +#define BQ25703_CHARGE_OPTION_2 0x32 +#define BQ25703_CHARGE_OPTION_3 0x34 +#define BQ25703_ADC_OPTION 0x3a + +#define BQ25703_EN_LWPWR BIT(15) +#define BQ25703_WDTMR_ADJ_MASK GENMASK(14, 13) +#define BQ25703_WDTMR_DISABLE 0 +#define BQ25703_WDTMR_5_SEC 1 +#define BQ25703_WDTMR_88_SEC 2 +#define BQ25703_WDTMR_175_SEC 3 + +#define BQ25703_ICHG_MASK GENMASK(12, 6) +#define BQ25703_ICHG_STEP_UA 64000 +#define BQ25703_ICHG_MIN_UA 64000 +#define BQ25703_ICHG_MAX_UA 8128000 + +#define BQ25703_MAX_CHARGE_VOLT_MASK GENMASK(15, 4) +#define BQ25703_VBATREG_STEP_UV 16000 +#define BQ25703_VBATREG_MIN_UV 1024000 +#define BQ25703_VBATREG_MAX_UV 19200000 + +#define BQ25703_OTG_VOLT_MASK GENMASK(13, 6) +#define BQ25703_OTG_VOLT_STEP_UV 64000 +#define BQ25703_OTG_VOLT_MIN_UV 4480000 +#define BQ25703_OTG_VOLT_MAX_UV 20800000 +#define BQ25703_OTG_VOLT_NUM_VOLT 256 + +#define BQ25703_OTG_CUR_MASK GENMASK(14, 8) +#define BQ25703_OTG_CUR_STEP_UA 50000 +#define BQ25703_OTG_CUR_MAX_UA 6350000 + +#define BQ25703_MINVSYS_MASK GENMASK(13, 8) +#define BQ25703_MINVSYS_STEP_UV 256000 +#define BQ25703_MINVSYS_MIN_UV 1024000 +#define BQ25703_MINVSYS_MAX_UV 16128000 + +#define BQ25703_STS_AC_STAT BIT(15) +#define BQ25703_STS_IN_FCHRG BIT(10) +#define BQ25703_STS_IN_PCHRG BIT(9) +#define BQ25703_STS_FAULT_ACOV BIT(7) +#define BQ25703_STS_FAULT_BATOC BIT(6) +#define BQ25703_STS_FAULT_ACOC BIT(5) + +#define BQ25703_IINDPM_MASK GENMASK(14, 8) +#define BQ25703_IINDPM_STEP_UA 50000 +#define BQ25703_IINDPM_MIN_UA 50000 +#define BQ25703_IINDPM_MAX_UA 6400000 +#define BQ25703_IINDPM_DEFAULT_UA 3300000 +#define BQ25703_IINDPM_OFFSET_UA 50000 + +#define BQ25703_ADCIBAT_DISCHG_MASK GENMASK(6, 0) +#define BQ25703_ADCIBAT_CHG_MASK GENMASK(14, 8) +#define BQ25703_ADCIBAT_CHG_STEP_UA 64000 +#define BQ25703_ADCIBAT_DIS_STEP_UA 256000 + +#define BQ25703_ADCIIN GENMASK(15, 8) +#define BQ25703_ADCIINCMPIN_STEP 50000 + +#define BQ25703_ADCVSYS_MASK GENMASK(15, 8) +#define BQ25703_ADCVBAT_MASK GENMASK(7, 0) +#define BQ25703_ADCVSYSVBAT_OFFSET_UV 2880000 +#define BQ25703_ADCVSYSVBAT_STEP 64000 + +#define BQ25703_ADC_CH_MASK GENMASK(7, 0) +#define BQ25703_ADC_CONV_EN BIT(15) +#define BQ25703_ADC_START BIT(14) +#define BQ25703_ADC_FULL_SCALE BIT(13) +#define BQ25703_ADC_CMPIN_EN BIT(7) +#define BQ25703_ADC_VBUS_EN BIT(6) +#define BQ25703_ADC_PSYS_EN BIT(5) +#define BQ25703_ADC_IIN_EN BIT(4) +#define BQ25703_ADC_IDCHG_EN BIT(3) +#define BQ25703_ADC_ICHG_EN BIT(2) +#define BQ25703_ADC_VSYS_EN BIT(1) +#define BQ25703_ADC_VBAT_EN BIT(0) + +#define BQ25703_EN_OTG_MASK BIT(12) + +struct bq257xx_device { + struct i2c_client *client; + struct regmap *regmap; +}; From 1cc017b7f9c7b7cd86fdda4aee36b92d91cc2ad2 Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Thu, 4 Sep 2025 11:05:28 -0500 Subject: 
[PATCH 1442/3206] power: supply: bq257xx: Add support for BQ257XX charger Add support for the charger function of the BQ257XX. The device is capable of charging batteries with a layout of 1 to 4 cells in series. Signed-off-by: Chris Morgan Reviewed-by: Sebastian Reichel Link: https://lore.kernel.org/r/20250904160530.66178-4-macroalpha82@gmail.com Signed-off-by: Lee Jones --- drivers/power/supply/Kconfig | 7 + drivers/power/supply/Makefile | 1 + drivers/power/supply/bq257xx_charger.c | 755 +++++++++++++++++++++++++ 3 files changed, 763 insertions(+) create mode 100644 drivers/power/supply/bq257xx_charger.c diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig index 79ddb006e2dad6..11893c50c5d2cf 100644 --- a/drivers/power/supply/Kconfig +++ b/drivers/power/supply/Kconfig @@ -767,6 +767,13 @@ config CHARGER_BQ2515X rail, ADC for battery and system monitoring, and push-button controller. +config CHARGER_BQ257XX + tristate "TI BQ257XX battery charger family" + depends on MFD_BQ257XX + help + Say Y to enable support for the TI BQ257XX family of battery + charging integrated circuits. + config CHARGER_BQ25890 tristate "TI BQ25890 battery charger driver" depends on I2C diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile index f943c9150b326d..a7d91244bb8253 100644 --- a/drivers/power/supply/Makefile +++ b/drivers/power/supply/Makefile @@ -97,6 +97,7 @@ obj-$(CONFIG_CHARGER_BQ24190) += bq24190_charger.o obj-$(CONFIG_CHARGER_BQ24257) += bq24257_charger.o obj-$(CONFIG_CHARGER_BQ24735) += bq24735-charger.o obj-$(CONFIG_CHARGER_BQ2515X) += bq2515x_charger.o +obj-$(CONFIG_CHARGER_BQ257XX) += bq257xx_charger.o obj-$(CONFIG_CHARGER_BQ25890) += bq25890_charger.o obj-$(CONFIG_CHARGER_BQ25980) += bq25980_charger.o obj-$(CONFIG_CHARGER_BQ256XX) += bq256xx_charger.o diff --git a/drivers/power/supply/bq257xx_charger.c b/drivers/power/supply/bq257xx_charger.c new file mode 100644 index 00000000000000..02c7d8b61e82b6 --- /dev/null +++ b/drivers/power/supply/bq257xx_charger.c @@ -0,0 +1,755 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BQ257XX Battery Charger Driver + * Copyright (C) 2025 Chris Morgan + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Forward declaration of driver data. 
*/ +struct bq257xx_chg; + +/** + * struct bq257xx_chip_info - chip specific routines + * @bq257xx_hw_init: init function for hw + * @bq257xx_hw_shutdown: shutdown function for hw + * @bq257xx_get_state: get and update state of hardware + * @bq257xx_set_ichg: set maximum charge current (in uA) + * @bq257xx_set_vbatreg: set maximum charge voltage (in uV) + * @bq257xx_set_iindpm: set maximum input current (in uA) + */ +struct bq257xx_chip_info { + int (*bq257xx_hw_init)(struct bq257xx_chg *pdata); + void (*bq257xx_hw_shutdown)(struct bq257xx_chg *pdata); + int (*bq257xx_get_state)(struct bq257xx_chg *pdata); + int (*bq257xx_set_ichg)(struct bq257xx_chg *pdata, int ichg); + int (*bq257xx_set_vbatreg)(struct bq257xx_chg *pdata, int vbatreg); + int (*bq257xx_set_iindpm)(struct bq257xx_chg *pdata, int iindpm); +}; + +/** + * struct bq257xx_chg - driver data for charger + * @chip: hw specific functions + * @bq: parent MFD device + * @charger: power supply device + * @online: charger input is present + * @fast_charge: charger is in fast charge mode + * @pre_charge: charger is in pre-charge mode + * @ov_fault: charger reports over voltage fault + * @batoc_fault: charger reports battery over current fault + * @oc_fault: charger reports over current fault + * @usb_type: USB type reported from parent power supply + * @supplied: Status of parent power supply + * @iindpm_max: maximum input current limit (uA) + * @vbat_max: maximum charge voltage (uV) + * @ichg_max: maximum charge current (uA) + * @vsys_min: minimum system voltage (uV) + */ +struct bq257xx_chg { + const struct bq257xx_chip_info *chip; + struct bq257xx_device *bq; + struct power_supply *charger; + bool online; + bool fast_charge; + bool pre_charge; + bool ov_fault; + bool batoc_fault; + bool oc_fault; + int usb_type; + int supplied; + u32 iindpm_max; + u32 vbat_max; + u32 ichg_max; + u32 vsys_min; +}; + +/** + * bq25703_get_state() - Get the current state of the device + * @pdata: driver platform data + * + * Get the current state of the charger. Check if the charger is + * powered, what kind of charge state (if any) the device is in, + * and if there are any active faults. + * + * Return: Returns 0 on success, or error on failure to read device. + */ +static int bq25703_get_state(struct bq257xx_chg *pdata) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_CHARGER_STATUS, ®); + if (ret) + return ret; + + pdata->online = reg & BQ25703_STS_AC_STAT; + pdata->fast_charge = reg & BQ25703_STS_IN_FCHRG; + pdata->pre_charge = reg & BQ25703_STS_IN_PCHRG; + pdata->ov_fault = reg & BQ25703_STS_FAULT_ACOV; + pdata->batoc_fault = reg & BQ25703_STS_FAULT_BATOC; + pdata->oc_fault = reg & BQ25703_STS_FAULT_ACOC; + + return 0; +} + +/** + * bq25703_get_min_vsys() - Get the minimum system voltage + * @pdata: driver platform data + * @intval: value for minimum voltage + * + * Return: Returns 0 on success or error on failure to read. + */ +static int bq25703_get_min_vsys(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_MIN_VSYS, + ®); + if (ret) + return ret; + + reg = FIELD_GET(BQ25703_MINVSYS_MASK, reg); + *intval = (reg * BQ25703_MINVSYS_STEP_UV) + BQ25703_MINVSYS_MIN_UV; + + return ret; +} + +/** + * bq25703_set_min_vsys() - Set the minimum system voltage + * @pdata: driver platform data + * @vsys: voltage value to set in uV. 
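+ *
+ * Hypothetical example: vsys = 3600000 uV encodes as
+ * (3600000 - 1024000) / 256000 = 10, which the hardware decodes back
+ * to 10 * 256000 + 1024000 = 3584000 uV, i.e. requests round down to
+ * the 256 mV register step.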
+ * + * This function takes a requested minimum system voltage value, clamps + * it between the minimum supported value by the charger and a user + * defined minimum system value, and then writes the value to the + * appropriate register. + * + * Return: Returns 0 on success or error if an error occurs. + */ +static int bq25703_set_min_vsys(struct bq257xx_chg *pdata, int vsys) +{ + unsigned int reg; + int vsys_min = pdata->vsys_min; + + vsys = clamp(vsys, BQ25703_MINVSYS_MIN_UV, vsys_min); + reg = ((vsys - BQ25703_MINVSYS_MIN_UV) / BQ25703_MINVSYS_STEP_UV); + reg = FIELD_PREP(BQ25703_MINVSYS_MASK, reg); + + return regmap_write(pdata->bq->regmap, BQ25703_MIN_VSYS, + reg); +} + +/** + * bq25703_get_cur() - Get the reported current from the battery + * @pdata: driver platform data + * @intval: value of reported battery current + * + * Read the reported current from the battery. Since value is always + * positive set sign to negative if discharging. + * + * Return: Returns 0 on success or error if unable to read value. + */ +static int bq25703_get_cur(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_ADCIBAT_CHG, ®); + if (ret < 0) + return ret; + + if (pdata->online) + *intval = FIELD_GET(BQ25703_ADCIBAT_CHG_MASK, reg) * + BQ25703_ADCIBAT_CHG_STEP_UA; + else + *intval = -(FIELD_GET(BQ25703_ADCIBAT_DISCHG_MASK, reg) * + BQ25703_ADCIBAT_DIS_STEP_UA); + + return ret; +} + +/** + * bq25703_get_ichg_cur() - Get the maximum reported charge current + * @pdata: driver platform data + * @intval: value of maximum reported charge current + * + * Get the maximum reported charge current from the battery. + * + * Return: Returns 0 on success or error if unable to read value. + */ +static int bq25703_get_ichg_cur(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_CHARGE_CURRENT, ®); + if (ret) + return ret; + + *intval = FIELD_GET(BQ25703_ICHG_MASK, reg) * BQ25703_ICHG_STEP_UA; + + return ret; +} + +/** + * bq25703_set_ichg_cur() - Set the maximum charge current + * @pdata: driver platform data + * @ichg: current value to set in uA. + * + * This function takes a requested maximum charge current value, clamps + * it between the minimum supported value by the charger and a user + * defined maximum charging value, and then writes the value to the + * appropriate register. + * + * Return: Returns 0 on success or error if an error occurs. + */ +static int bq25703_set_ichg_cur(struct bq257xx_chg *pdata, int ichg) +{ + unsigned int reg; + int ichg_max = pdata->ichg_max; + + ichg = clamp(ichg, BQ25703_ICHG_MIN_UA, ichg_max); + reg = FIELD_PREP(BQ25703_ICHG_MASK, (ichg / BQ25703_ICHG_STEP_UA)); + + return regmap_write(pdata->bq->regmap, BQ25703_CHARGE_CURRENT, + reg); +} + +/** + * bq25703_get_chrg_volt() - Get the maximum set charge voltage + * @pdata: driver platform data + * @intval: maximum charge voltage value + * + * Return: Returns 0 on success or error if unable to read value. + */ +static int bq25703_get_chrg_volt(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_MAX_CHARGE_VOLT, + ®); + if (ret) + return ret; + + *intval = FIELD_GET(BQ25703_MAX_CHARGE_VOLT_MASK, reg) * + BQ25703_VBATREG_STEP_UV; + + return ret; +} + +/** + * bq25703_set_chrg_volt() - Set the maximum charge voltage + * @pdata: driver platform data + * @vbat: voltage value to set in uV. 
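+ *
+ * Hypothetical example: vbat = 8400000 uV (a typical 2-cell pack)
+ * encodes as 8400000 / 16000 = 525, an exact multiple of the 16 mV
+ * register step.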
+ * + * This function takes a requested maximum charge voltage value, clamps + * it between the minimum supported value by the charger and a user + * defined maximum charging value, and then writes the value to the + * appropriate register. + * + * Return: Returns 0 on success or error if an error occurs. + */ +static int bq25703_set_chrg_volt(struct bq257xx_chg *pdata, int vbat) +{ + unsigned int reg; + int vbat_max = pdata->vbat_max; + + vbat = clamp(vbat, BQ25703_VBATREG_MIN_UV, vbat_max); + + reg = FIELD_PREP(BQ25703_MAX_CHARGE_VOLT_MASK, + (vbat / BQ25703_VBATREG_STEP_UV)); + + return regmap_write(pdata->bq->regmap, BQ25703_MAX_CHARGE_VOLT, + reg); +} + +/** + * bq25703_get_iindpm() - Get the maximum set input current + * @pdata: driver platform data + * @intval: maximum input current value + * + * Read the actual input current limit from the device into intval. + * This can differ from the value programmed due to some autonomous + * functions that may be enabled (but are not currently). This is why + * there is a different register used. + * + * Return: Returns 0 on success or error if unable to read register + * value. + */ +static int bq25703_get_iindpm(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_IIN_DPM, ®); + if (ret) + return ret; + + reg = FIELD_GET(BQ25703_IINDPM_MASK, reg); + *intval = (reg * BQ25703_IINDPM_STEP_UA) + BQ25703_IINDPM_OFFSET_UA; + + return ret; +} + +/** + * bq25703_set_iindpm() - Set the maximum input current + * @pdata: driver platform data + * @iindpm: current value in uA. + * + * This function takes a requested maximum input current value, clamps + * it between the minimum supported value by the charger and a user + * defined maximum input value, and then writes the value to the + * appropriate register. + * + * Return: Returns 0 on success or error if an error occurs. + */ +static int bq25703_set_iindpm(struct bq257xx_chg *pdata, int iindpm) +{ + unsigned int reg; + int iindpm_max = pdata->iindpm_max; + + iindpm = clamp(iindpm, BQ25703_IINDPM_MIN_UA, iindpm_max); + + reg = ((iindpm - BQ25703_IINDPM_OFFSET_UA) / BQ25703_IINDPM_STEP_UA); + + return regmap_write(pdata->bq->regmap, BQ25703_IIN_HOST, + FIELD_PREP(BQ25703_IINDPM_MASK, reg)); +} + +/** + * bq25703_get_vbat() - Get the reported voltage from the battery + * @pdata: driver platform data + * @intval: value of reported battery voltage + * + * Read value of battery voltage into intval. + * + * Return: Returns 0 on success or error if unable to read value. + */ +static int bq25703_get_vbat(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_ADCVSYSVBAT, ®); + if (ret) + return ret; + + reg = FIELD_GET(BQ25703_ADCVBAT_MASK, reg); + *intval = (reg * BQ25703_ADCVSYSVBAT_STEP) + BQ25703_ADCVSYSVBAT_OFFSET_UV; + + return ret; +} + +/** + * bq25703_hw_init() - Set all the required registers to init the charger + * @pdata: driver platform data + * + * Initialize the BQ25703 by first disabling the watchdog timer (which + * shuts off the charger in the absence of periodic writes). Then, set + * the charge current, charge voltage, minimum system voltage, and + * input current limit. Disable low power mode to allow ADCs and + * interrupts. Enable the ADC, start the ADC, set the ADC scale to + * full, and enable each individual ADC channel. + * + * Return: Returns 0 on success or error code on error. 
+ */ +static int bq25703_hw_init(struct bq257xx_chg *pdata) +{ + struct regmap *regmap = pdata->bq->regmap; + int ret = 0; + + regmap_update_bits(regmap, BQ25703_CHARGE_OPTION_0, + BQ25703_WDTMR_ADJ_MASK, + FIELD_PREP(BQ25703_WDTMR_ADJ_MASK, + BQ25703_WDTMR_DISABLE)); + + ret = pdata->chip->bq257xx_set_ichg(pdata, pdata->ichg_max); + if (ret) + return ret; + + ret = pdata->chip->bq257xx_set_vbatreg(pdata, pdata->vbat_max); + if (ret) + return ret; + + ret = bq25703_set_min_vsys(pdata, pdata->vsys_min); + if (ret) + return ret; + + ret = pdata->chip->bq257xx_set_iindpm(pdata, pdata->iindpm_max); + if (ret) + return ret; + + /* Disable low power mode by writing 0 to the register. */ + regmap_update_bits(regmap, BQ25703_CHARGE_OPTION_0, + BQ25703_EN_LWPWR, 0); + + /* Enable the ADC. */ + regmap_update_bits(regmap, BQ25703_ADC_OPTION, + BQ25703_ADC_CONV_EN, BQ25703_ADC_CONV_EN); + + /* Start the ADC. */ + regmap_update_bits(regmap, BQ25703_ADC_OPTION, + BQ25703_ADC_START, BQ25703_ADC_START); + + /* Set the scale of the ADC. */ + regmap_update_bits(regmap, BQ25703_ADC_OPTION, + BQ25703_ADC_FULL_SCALE, BQ25703_ADC_FULL_SCALE); + + /* Enable each of the ADC channels available. */ + regmap_update_bits(regmap, BQ25703_ADC_OPTION, + BQ25703_ADC_CH_MASK, + (BQ25703_ADC_CMPIN_EN | BQ25703_ADC_VBUS_EN | + BQ25703_ADC_PSYS_EN | BQ25703_ADC_IIN_EN | + BQ25703_ADC_IDCHG_EN | BQ25703_ADC_ICHG_EN | + BQ25703_ADC_VSYS_EN | BQ25703_ADC_VBAT_EN)); + + return ret; +} + +/** + * bq25703_hw_shutdown() - Set registers for shutdown + * @pdata: driver platform data + * + * Enable low power mode for the device while in shutdown. + */ +static void bq25703_hw_shutdown(struct bq257xx_chg *pdata) +{ + regmap_update_bits(pdata->bq->regmap, BQ25703_CHARGE_OPTION_0, + BQ25703_EN_LWPWR, BQ25703_EN_LWPWR); +} + +static int bq257xx_set_charger_property(struct power_supply *psy, + enum power_supply_property prop, + const union power_supply_propval *val) +{ + struct bq257xx_chg *pdata = power_supply_get_drvdata(psy); + + switch (prop) { + case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: + return pdata->chip->bq257xx_set_iindpm(pdata, val->intval); + + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + return pdata->chip->bq257xx_set_vbatreg(pdata, val->intval); + + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + return pdata->chip->bq257xx_set_ichg(pdata, val->intval); + + default: + break; + } + + return -EINVAL; +} + +static int bq257xx_get_charger_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct bq257xx_chg *pdata = power_supply_get_drvdata(psy); + int ret = 0; + + ret = pdata->chip->bq257xx_get_state(pdata); + if (ret) + return ret; + + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + if (!pdata->online) + val->intval = POWER_SUPPLY_STATUS_DISCHARGING; + else if (pdata->fast_charge || pdata->pre_charge) + val->intval = POWER_SUPPLY_STATUS_CHARGING; + else + val->intval = POWER_SUPPLY_STATUS_NOT_CHARGING; + break; + + case POWER_SUPPLY_PROP_HEALTH: + if (pdata->ov_fault || pdata->batoc_fault) + val->intval = POWER_SUPPLY_HEALTH_OVERVOLTAGE; + else if (pdata->oc_fault) + val->intval = POWER_SUPPLY_HEALTH_OVERCURRENT; + else + val->intval = POWER_SUPPLY_HEALTH_GOOD; + break; + + case POWER_SUPPLY_PROP_MANUFACTURER: + val->strval = "Texas Instruments"; + break; + + case POWER_SUPPLY_PROP_ONLINE: + val->intval = pdata->online; + break; + + case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: + return bq25703_get_iindpm(pdata, &val->intval); + + case 
POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + return bq25703_get_chrg_volt(pdata, &val->intval); + + case POWER_SUPPLY_PROP_CURRENT_NOW: + return bq25703_get_cur(pdata, &val->intval); + + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + return bq25703_get_vbat(pdata, &val->intval); + + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + return bq25703_get_ichg_cur(pdata, &val->intval); + + case POWER_SUPPLY_PROP_VOLTAGE_MIN: + return bq25703_get_min_vsys(pdata, &val->intval); + + case POWER_SUPPLY_PROP_USB_TYPE: + val->intval = pdata->usb_type; + break; + + default: + return -EINVAL; + } + + return ret; +} + +static enum power_supply_property bq257xx_power_supply_props[] = { + POWER_SUPPLY_PROP_MANUFACTURER, + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_ONLINE, + POWER_SUPPLY_PROP_HEALTH, + POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, + POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX, + POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX, + POWER_SUPPLY_PROP_VOLTAGE_MIN, + POWER_SUPPLY_PROP_USB_TYPE, +}; + +static int bq257xx_property_is_writeable(struct power_supply *psy, + enum power_supply_property prop) +{ + switch (prop) { + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: + return true; + default: + return false; + } +} + +/** + * bq257xx_external_power_changed() - Handler for external power change + * @psy: Power supply data + * + * When the external power into the charger is changed, check the USB + * type so that it can be reported. Additionally, update the max input + * current and max charging current to the value reported if it is a + * USB PD charger, otherwise use the default value. Note that each time + * a charger is removed the max charge current register is erased, so + * it must be set again each time the input changes or the device will + * not charge. 
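+ *
+ * Hypothetical example: a PD supplier reporting a 9 V / 2 A contract
+ * through POWER_SUPPLY_PROP_CURRENT_MAX raises the input limit to
+ * 2000000 uA, while a non-PD supply keeps the iindpm_max default.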
+ */ +static void bq257xx_external_power_changed(struct power_supply *psy) +{ + struct bq257xx_chg *pdata = power_supply_get_drvdata(psy); + union power_supply_propval val; + int ret; + int imax = pdata->iindpm_max; + + pdata->chip->bq257xx_get_state(pdata); + + pdata->supplied = power_supply_am_i_supplied(pdata->charger); + if (pdata->supplied < 0) + return; + + if (pdata->supplied == 0) + goto out; + + ret = power_supply_get_property_from_supplier(psy, + POWER_SUPPLY_PROP_USB_TYPE, + &val); + if (ret) + return; + + pdata->usb_type = val.intval; + + if ((pdata->usb_type == POWER_SUPPLY_USB_TYPE_PD) || + (pdata->usb_type == POWER_SUPPLY_USB_TYPE_PD_DRP) || + (pdata->usb_type == POWER_SUPPLY_USB_TYPE_PD_PPS)) { + ret = power_supply_get_property_from_supplier(psy, + POWER_SUPPLY_PROP_CURRENT_MAX, + &val); + if (ret) + return; + + if (val.intval) + imax = val.intval; + } + + if (pdata->supplied) { + pdata->chip->bq257xx_set_ichg(pdata, pdata->ichg_max); + pdata->chip->bq257xx_set_iindpm(pdata, imax); + pdata->chip->bq257xx_set_vbatreg(pdata, pdata->vbat_max); + } + +out: + power_supply_changed(psy); +} + +static irqreturn_t bq257xx_irq_handler_thread(int irq, void *private) +{ + struct bq257xx_chg *pdata = private; + + bq257xx_external_power_changed(pdata->charger); + return IRQ_HANDLED; +} + +static const struct power_supply_desc bq257xx_power_supply_desc = { + .name = "bq257xx-charger", + .type = POWER_SUPPLY_TYPE_USB, + .usb_types = BIT(POWER_SUPPLY_USB_TYPE_C) | + BIT(POWER_SUPPLY_USB_TYPE_PD) | + BIT(POWER_SUPPLY_USB_TYPE_PD_DRP) | + BIT(POWER_SUPPLY_USB_TYPE_PD_PPS) | + BIT(POWER_SUPPLY_USB_TYPE_UNKNOWN), + .properties = bq257xx_power_supply_props, + .num_properties = ARRAY_SIZE(bq257xx_power_supply_props), + .get_property = bq257xx_get_charger_property, + .set_property = bq257xx_set_charger_property, + .property_is_writeable = bq257xx_property_is_writeable, + .external_power_changed = bq257xx_external_power_changed, +}; + +static const struct bq257xx_chip_info bq25703_chip_info = { + .bq257xx_hw_init = &bq25703_hw_init, + .bq257xx_hw_shutdown = &bq25703_hw_shutdown, + .bq257xx_get_state = &bq25703_get_state, + .bq257xx_set_ichg = &bq25703_set_ichg_cur, + .bq257xx_set_vbatreg = &bq25703_set_chrg_volt, + .bq257xx_set_iindpm = &bq25703_set_iindpm, +}; + +/** + * bq257xx_parse_dt() - Parse the device tree for required properties + * @pdata: driver platform data + * @psy_cfg: power supply config data + * @dev: device struct + * + * Read the device tree to identify the minimum system voltage, the + * maximum charge current, the maximum charge voltage, and the maximum + * input current. + * + * Return: Returns 0 on success or error code on error. 
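+ *
+ * A hypothetical monitored-battery node providing the three required
+ * properties could look like:
+ *
+ *   battery {
+ *           compatible = "simple-battery";
+ *           voltage-min-design-microvolt = <3000000>;
+ *           constant-charge-current-max-microamp = <2000000>;
+ *           constant-charge-voltage-max-microvolt = <4200000>;
+ *   };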
+ */
+static int bq257xx_parse_dt(struct bq257xx_chg *pdata,
+			    struct power_supply_config *psy_cfg, struct device *dev)
+{
+	struct power_supply_battery_info *bat_info;
+	int ret;
+
+	ret = power_supply_get_battery_info(pdata->charger,
+					    &bat_info);
+	if (ret)
+		return dev_err_probe(dev, ret,
+				     "Unable to get battery info\n");
+
+	if ((bat_info->voltage_min_design_uv <= 0) ||
+	    (bat_info->constant_charge_voltage_max_uv <= 0) ||
+	    (bat_info->constant_charge_current_max_ua <= 0))
+		return dev_err_probe(dev, -EINVAL,
+				     "Required bat info missing or invalid\n");
+
+	pdata->vsys_min = bat_info->voltage_min_design_uv;
+	pdata->vbat_max = bat_info->constant_charge_voltage_max_uv;
+	pdata->ichg_max = bat_info->constant_charge_current_max_ua;
+
+	power_supply_put_battery_info(pdata->charger, bat_info);
+
+	ret = device_property_read_u32(dev,
+				       "input-current-limit-microamp",
+				       &pdata->iindpm_max);
+	if (ret)
+		pdata->iindpm_max = BQ25703_IINDPM_DEFAULT_UA;
+
+	return 0;
+}
+
+static int bq257xx_charger_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct bq257xx_device *bq = dev_get_drvdata(pdev->dev.parent);
+	struct bq257xx_chg *pdata;
+	struct power_supply_config psy_cfg = { };
+	int ret;
+
+	device_set_of_node_from_dev(dev, pdev->dev.parent);
+
+	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+	if (!pdata)
+		return -ENOMEM;
+
+	pdata->bq = bq;
+	pdata->chip = &bq25703_chip_info;
+
+	platform_set_drvdata(pdev, pdata);
+
+	psy_cfg.drv_data = pdata;
+	psy_cfg.fwnode = dev_fwnode(dev);
+
+	pdata->charger = devm_power_supply_register(dev,
+						    &bq257xx_power_supply_desc,
+						    &psy_cfg);
+	if (IS_ERR(pdata->charger))
+		return dev_err_probe(dev, PTR_ERR(pdata->charger),
+				     "Power supply register charger failed\n");
+
+	ret = bq257xx_parse_dt(pdata, &psy_cfg, dev);
+	if (ret)
+		return ret;
+
+	ret = pdata->chip->bq257xx_hw_init(pdata);
+	if (ret)
+		return dev_err_probe(dev, ret, "Cannot initialize the charger\n");
+
+	platform_set_drvdata(pdev, pdata);
+
+	if (bq->client->irq) {
+		ret = devm_request_threaded_irq(dev, bq->client->irq, NULL,
+						bq257xx_irq_handler_thread,
+						IRQF_TRIGGER_RISING |
+						IRQF_TRIGGER_FALLING |
+						IRQF_ONESHOT,
+						dev_name(&bq->client->dev), pdata);
+		if (ret < 0)
+			dev_err_probe(dev, ret, "Charger get irq failed\n");
+	}
+
+	return ret;
+}
+
+static void bq257xx_charger_shutdown(struct platform_device *pdev)
+{
+	struct bq257xx_chg *pdata = platform_get_drvdata(pdev);
+
+	pdata->chip->bq257xx_hw_shutdown(pdata);
+}
+
+static struct platform_driver bq257xx_chg_driver = {
+	.driver = {
+		.name = "bq257xx-charger",
+	},
+	.probe = bq257xx_charger_probe,
+	.shutdown = bq257xx_charger_shutdown,
+};
+module_platform_driver(bq257xx_chg_driver);
+
+MODULE_DESCRIPTION("bq257xx charger driver");
+MODULE_AUTHOR("Chris Morgan ");
+MODULE_LICENSE("GPL");

From 788b8f6af60b22f78739fc758456b51b2b916dbd Mon Sep 17 00:00:00 2001
From: Jinjie Ruan
Date: Fri, 15 Aug 2025 11:06:26 +0800
Subject: [PATCH 1443/3206] arm64: ptrace: Replace interrupts_enabled() with
 regs_irqs_disabled()

The generic entry code expects architecture code to provide a
regs_irqs_disabled(regs) function, but arm64 does not have this and
provides interrupts_enabled(regs), which has the opposite polarity.

In preparation for moving arm64 over to the generic entry code,
replace arm64's interrupts_enabled() with regs_irqs_disabled() and
update its callers under arch/arm64.

For the moment, a definition of interrupts_enabled() is provided for
the GICv3 driver.
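For reference, the interim definition is just the inverse of the new
helper (as added in the ptrace.h hunk below):

| #define interrupts_enabled(regs)	(!regs_irqs_disabled(regs))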
Once arch/arm implement regs_irqs_disabled(), this can be removed. Delete the fast_interrupts_enabled() macro as it is unused and we don't want any new users to show up. No functional changes. Reviewed-by: Ada Couprie Diaz Acked-by: Mark Rutland Suggested-by: Mark Rutland Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/daifflags.h | 2 +- arch/arm64/include/asm/ptrace.h | 9 +++++---- arch/arm64/include/asm/xen/events.h | 2 +- arch/arm64/kernel/acpi.c | 2 +- arch/arm64/kernel/debug-monitors.c | 2 +- arch/arm64/kernel/entry-common.c | 4 ++-- arch/arm64/kernel/sdei.c | 2 +- 7 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index fbb5c99eb2f9d6..5fca480090434c 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -128,7 +128,7 @@ static inline void local_daif_inherit(struct pt_regs *regs) { unsigned long flags = regs->pstate & DAIF_MASK; - if (interrupts_enabled(regs)) + if (!regs_irqs_disabled(regs)) trace_hardirqs_on(); if (system_uses_irq_prio_masking()) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 47ff8654c5ec1e..8b915d4a9d4b32 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -214,11 +214,12 @@ static inline void forget_syscall(struct pt_regs *regs) (regs)->pmr == GIC_PRIO_IRQON : \ true) -#define interrupts_enabled(regs) \ - (!((regs)->pstate & PSR_I_BIT) && irqs_priority_unmasked(regs)) +static __always_inline bool regs_irqs_disabled(const struct pt_regs *regs) +{ + return (regs->pstate & PSR_I_BIT) || !irqs_priority_unmasked(regs); +} -#define fast_interrupts_enabled(regs) \ - (!((regs)->pstate & PSR_F_BIT)) +#define interrupts_enabled(regs) (!regs_irqs_disabled(regs)) static inline unsigned long user_stack_pointer(struct pt_regs *regs) { diff --git a/arch/arm64/include/asm/xen/events.h b/arch/arm64/include/asm/xen/events.h index 2788e95d0ff022..2977b5fe068dbe 100644 --- a/arch/arm64/include/asm/xen/events.h +++ b/arch/arm64/include/asm/xen/events.h @@ -14,7 +14,7 @@ enum ipi_vector { static inline int xen_irqs_disabled(struct pt_regs *regs) { - return !interrupts_enabled(regs); + return regs_irqs_disabled(regs); } #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 4d529ff7ba513a..3fbce0a9a0fe8f 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -407,7 +407,7 @@ int apei_claim_sea(struct pt_regs *regs) return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags()); if (regs) - return_to_irqs_enabled = interrupts_enabled(regs); + return_to_irqs_enabled = !regs_irqs_disabled(regs); /* * SEA can interrupt SError, mask it and describe this as an NMI so diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 110d9ff54174f7..85fc162a6f9be7 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -167,7 +167,7 @@ static void send_user_sigtrap(int si_code) if (WARN_ON(!user_mode(regs))) return; - if (interrupts_enabled(regs)) + if (!regs_irqs_disabled(regs)) local_irq_enable(); arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 2b0c5925502e70..8e798f46ad2828 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -74,7 +74,7 @@ static 
__always_inline void __exit_to_kernel_mode(struct pt_regs *regs) { lockdep_assert_irqs_disabled(); - if (interrupts_enabled(regs)) { + if (!regs_irqs_disabled(regs)) { if (regs->exit_rcu) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); @@ -662,7 +662,7 @@ static void noinstr el1_interrupt(struct pt_regs *regs, { write_sysreg(DAIF_PROCCTX_NOIRQ, daif); - if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && regs_irqs_disabled(regs)) __el1_pnmi(regs, handler); else __el1_irq(regs, handler); diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 6f24a0251e1837..95169f7b653198 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -243,7 +243,7 @@ unsigned long __kprobes do_sdei_event(struct pt_regs *regs, * If we interrupted the kernel with interrupts masked, we always go * back to wherever we came from. */ - if (mode == kernel_mode && !interrupts_enabled(regs)) + if (mode == kernel_mode && regs_irqs_disabled(regs)) return SDEI_EV_HANDLED; /* From ee776d68ba47cc8e2022f8c2218f1891a1244197 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:27 +0800 Subject: [PATCH 1444/3206] arm64: entry: Refactor the entry and exit for exceptions from EL1 The generic entry code uses irqentry_state_t to track lockdep and RCU state across exception entry and return. For historical reasons, arm64 embeds similar fields within its pt_regs structure. In preparation for moving arm64 over to the generic entry code, pull these fields out of arm64's pt_regs, and use a separate structure, matching the style of the generic entry code. No functional changes. Acked-by: Mark Rutland Reviewed-by: Ada Couprie Diaz Suggested-by: Mark Rutland Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/ptrace.h | 4 - arch/arm64/kernel/entry-common.c | 163 ++++++++++++++++++++----------- 2 files changed, 106 insertions(+), 61 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 8b915d4a9d4b32..65b053a24d8247 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -169,10 +169,6 @@ struct pt_regs { u64 sdei_ttbr1; struct frame_record_meta stackframe; - - /* Only valid for some EL1 exceptions. */ - u64 lockdep_hardirqs; - u64 exit_rcu; }; /* For correct stack alignment, pt_regs has to be a multiple of 16 bytes. */ diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 8e798f46ad2828..93c95fc51cc0b4 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -29,6 +29,13 @@ #include #include +typedef struct irqentry_state { + union { + bool exit_rcu; + bool lockdep; + }; +} arm64_irqentry_state_t; + /* * Handle IRQ/context state management when entering from kernel mode. * Before this function is called it is not safe to call regular kernel code, @@ -37,29 +44,37 @@ * This is intended to match the logic in irqentry_enter(), handling the kernel * mode transitions only. 
*/ -static __always_inline void __enter_from_kernel_mode(struct pt_regs *regs) +static __always_inline arm64_irqentry_state_t __enter_from_kernel_mode(struct pt_regs *regs) { - regs->exit_rcu = false; + arm64_irqentry_state_t state = { + .exit_rcu = false, + }; if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { lockdep_hardirqs_off(CALLER_ADDR0); ct_irq_enter(); trace_hardirqs_off_finish(); - regs->exit_rcu = true; - return; + state.exit_rcu = true; + return state; } lockdep_hardirqs_off(CALLER_ADDR0); rcu_irq_enter_check_tick(); trace_hardirqs_off_finish(); + + return state; } -static void noinstr enter_from_kernel_mode(struct pt_regs *regs) +static noinstr arm64_irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) { - __enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = __enter_from_kernel_mode(regs); mte_check_tfsr_entry(); mte_disable_tco_entry(current); + + return state; } /* @@ -70,12 +85,13 @@ static void noinstr enter_from_kernel_mode(struct pt_regs *regs) * This is intended to match the logic in irqentry_exit(), handling the kernel * mode transitions only, and with preemption handled elsewhere. */ -static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) +static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs, + arm64_irqentry_state_t state) { lockdep_assert_irqs_disabled(); if (!regs_irqs_disabled(regs)) { - if (regs->exit_rcu) { + if (state.exit_rcu) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); ct_irq_exit(); @@ -85,15 +101,16 @@ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) trace_hardirqs_on(); } else { - if (regs->exit_rcu) + if (state.exit_rcu) ct_irq_exit(); } } -static void noinstr exit_to_kernel_mode(struct pt_regs *regs) +static void noinstr exit_to_kernel_mode(struct pt_regs *regs, + arm64_irqentry_state_t state) { mte_check_tfsr_exit(); - __exit_to_kernel_mode(regs); + __exit_to_kernel_mode(regs, state); } /* @@ -194,9 +211,11 @@ asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) * mode. Before this function is called it is not safe to call regular kernel * code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_enter_nmi(struct pt_regs *regs) +static noinstr arm64_irqentry_state_t arm64_enter_nmi(struct pt_regs *regs) { - regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + arm64_irqentry_state_t state; + + state.lockdep = lockdep_hardirqs_enabled(); __nmi_enter(); lockdep_hardirqs_off(CALLER_ADDR0); @@ -205,6 +224,8 @@ static void noinstr arm64_enter_nmi(struct pt_regs *regs) trace_hardirqs_off_finish(); ftrace_nmi_enter(); + + return state; } /* @@ -212,19 +233,18 @@ static void noinstr arm64_enter_nmi(struct pt_regs *regs) * mode. After this function returns it is not safe to call regular kernel * code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_exit_nmi(struct pt_regs *regs) +static void noinstr arm64_exit_nmi(struct pt_regs *regs, + arm64_irqentry_state_t state) { - bool restore = regs->lockdep_hardirqs; - ftrace_nmi_exit(); - if (restore) { + if (state.lockdep) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); } ct_nmi_exit(); lockdep_hardirq_exit(); - if (restore) + if (state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); __nmi_exit(); } @@ -234,14 +254,18 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs) * kernel mode. 
Before this function is called it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) +static noinstr arm64_irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) { - regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + arm64_irqentry_state_t state; + + state.lockdep = lockdep_hardirqs_enabled(); lockdep_hardirqs_off(CALLER_ADDR0); ct_nmi_enter(); trace_hardirqs_off_finish(); + + return state; } /* @@ -249,17 +273,16 @@ static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) * kernel mode. After this function returns it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) +static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, + arm64_irqentry_state_t state) { - bool restore = regs->lockdep_hardirqs; - - if (restore) { + if (state.lockdep) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); } ct_nmi_exit(); - if (restore) + if (state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); } @@ -475,73 +498,87 @@ UNHANDLED(el1t, 64, error) static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + arm64_irqentry_state_t state; - enter_from_kernel_mode(regs); + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_mem_abort(far, esr, regs); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + arm64_irqentry_state_t state; - enter_from_kernel_mode(regs); + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_undef(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_bti(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_gcs(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_mops(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr) { - arm64_enter_el1_dbg(regs); + arm64_irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); do_breakpoint(esr, regs); debug_exception_exit(regs); - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_softstp(struct 
pt_regs *regs, unsigned long esr) { - arm64_enter_el1_dbg(regs); + arm64_irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); if (!cortex_a76_erratum_1463225_debug_handler(regs)) { debug_exception_enter(regs); /* @@ -554,37 +591,42 @@ static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr) do_el1_softstep(esr, regs); debug_exception_exit(regs); } - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr) { /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); + arm64_irqentry_state_t state; - arm64_enter_el1_dbg(regs); + state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); do_watchpoint(far, esr, regs); debug_exception_exit(regs); - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr) { - arm64_enter_el1_dbg(regs); + arm64_irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); do_el1_brk64(esr, regs); debug_exception_exit(regs); - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_fpac(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) @@ -639,15 +681,19 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) static __always_inline void __el1_pnmi(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - arm64_enter_nmi(regs); + arm64_irqentry_state_t state; + + state = arm64_enter_nmi(regs); do_interrupt_handler(regs, handler); - arm64_exit_nmi(regs); + arm64_exit_nmi(regs, state); } static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); irq_enter_rcu(); do_interrupt_handler(regs, handler); @@ -655,7 +701,7 @@ static __always_inline void __el1_irq(struct pt_regs *regs, arm64_preempt_schedule_irq(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) @@ -681,11 +727,12 @@ asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); + arm64_irqentry_state_t state; local_daif_restore(DAIF_ERRCTX); - arm64_enter_nmi(regs); + state = arm64_enter_nmi(regs); do_serror(regs, esr); - arm64_exit_nmi(regs); + arm64_exit_nmi(regs, state); } static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) @@ -997,12 +1044,13 @@ asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) static void noinstr __el0_error_handler_common(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); + arm64_irqentry_state_t state; enter_from_user_mode(regs); local_daif_restore(DAIF_ERRCTX); - arm64_enter_nmi(regs); + state = arm64_enter_nmi(regs); do_serror(regs, esr); - arm64_exit_nmi(regs); + arm64_exit_nmi(regs, state); local_daif_restore(DAIF_PROCCTX); exit_to_user_mode(regs); } @@ -1122,6 +1170,7 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) asmlinkage noinstr 
unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { + arm64_irqentry_state_t state; unsigned long ret; /* @@ -1146,9 +1195,9 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) else if (cpu_has_pan()) set_pstate_pan(0); - arm64_enter_nmi(regs); + state = arm64_enter_nmi(regs); ret = do_sdei_event(regs, arg); - arm64_exit_nmi(regs); + arm64_exit_nmi(regs, state); return ret; } From 77c1953946391e38c1e5120230f8df14f85219a7 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:28 +0800 Subject: [PATCH 1445/3206] arm64: entry: Rework arm64_preempt_schedule_irq() The generic entry code has the form: | raw_irqentry_exit_cond_resched() | { | if (!preempt_count()) { | ... | if (need_resched()) | preempt_schedule_irq(); | } | } In preparation for moving arm64 over to the generic entry code, align the structure of the arm64 code with raw_irqentry_exit_cond_resched() from the generic entry code. Reviewed-by: Ada Couprie Diaz Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 93c95fc51cc0b4..dd7903f371ad65 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -294,10 +294,10 @@ DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); #define need_irq_preemption() (IS_ENABLED(CONFIG_PREEMPTION)) #endif -static void __sched arm64_preempt_schedule_irq(void) +static inline bool arm64_preempt_schedule_irq(void) { if (!need_irq_preemption()) - return; + return false; /* * Note: thread_info::preempt_count includes both thread_info::count @@ -305,7 +305,7 @@ static inline bool arm64_preempt_schedule_irq(void) * preempt_count(). */ if (READ_ONCE(current_thread_info()->preempt_count) != 0) - return; + return false; /* * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC @@ -314,7 +314,7 @@ static inline bool arm64_preempt_schedule_irq(void) * DAIF we must have handled an NMI, so skip preemption. */ if (system_uses_irq_prio_masking() && read_sysreg(daif)) - return; + return false; /* * Preempting a task from an IRQ means we leave copies of PSTATE @@ -324,8 +324,10 @@ static inline bool arm64_preempt_schedule_irq(void) * Only allow a task to be preempted once cpufeatures have been * enabled. */ - if (system_capabilities_finalized()) - preempt_schedule_irq(); + if (!system_capabilities_finalized()) + return false; + + return true; } static void do_interrupt_handler(struct pt_regs *regs, @@ -699,7 +701,8 @@ static __always_inline void __el1_irq(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - arm64_preempt_schedule_irq(); + if (arm64_preempt_schedule_irq()) + preempt_schedule_irq(); exit_to_kernel_mode(regs, state); } From c74c44c6ae207e196c4c31c4a243abb0811a5974 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:29 +0800 Subject: [PATCH 1446/3206] arm64: entry: Use preempt_count() and need_resched() helper The generic entry code uses preempt_count() and need_resched() helpers to check if it should do preempt_schedule_irq(). Currently, arm64 uses its own check logic, that is "READ_ONCE(current_thread_info()->preempt_count) == 0", which is equivalent to "preempt_count() == 0 && need_resched()".
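That equivalence holds because arm64 packs the preempt count and an active-low need-resched flag into a single 64-bit thread_info field, so one 64-bit load reading zero implies both conditions at once. A user-space model of the layout (a sketch only: little-endian assumed, field names mirror arm64's asm/preempt.h, not the kernel code itself):

| #include <stdint.h>
| #include <stdio.h>
|
| /* Illustrative model of thread_info::preempt_count on arm64. */
| union preempt {
| 	uint64_t preempt_count;        /* read as one 64-bit word */
| 	struct {
| 		uint32_t count;        /* preempt_count() proper */
| 		uint32_t need_resched; /* active-low: 0 means resched pending */
| 	} preempt;
| };
|
| int main(void)
| {
| 	union preempt p = { .preempt_count = 0 };
|
| 	/* count == 0 and flag clear (resched pending): whole word is zero. */
| 	printf("%d\n", p.preempt_count == 0);	/* prints 1 */
|
| 	p.preempt.need_resched = 1;		/* no resched wanted */
| 	printf("%d\n", p.preempt_count == 0);	/* prints 0 */
| 	return 0;
| }

This is why the single READ_ONCE() above can be replaced by the two generic helpers without changing behaviour.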
In preparation for moving arm64 over to the generic entry code, use these helpers to replace arm64's own code and move it ahead. No functional changes. Reviewed-by: Ada Couprie Diaz Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index dd7903f371ad65..1ba1d40fa6a724 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -299,14 +299,6 @@ static inline bool arm64_preempt_schedule_irq(void) if (!need_irq_preemption()) return false; - /* - * Note: thread_info::preempt_count includes both thread_info::count - * and thread_info::need_resched, and is not equivalent to - * preempt_count(). - */ - if (READ_ONCE(current_thread_info()->preempt_count) != 0) - return false; - /* * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC * priority masking is used the GIC irqchip driver will clear DAIF.IF @@ -701,8 +693,10 @@ static __always_inline void __el1_irq(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - if (arm64_preempt_schedule_irq()) - preempt_schedule_irq(); + if (!preempt_count() && need_resched()) { + if (arm64_preempt_schedule_irq()) + preempt_schedule_irq(); + } exit_to_kernel_mode(regs, state); } From 3c973c51bfbaf356367afa46b94f9100a7d672f2 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:30 +0800 Subject: [PATCH 1447/3206] entry: Add arch_irqentry_exit_need_resched() for arm64 Compared to the generic entry code, ARM64 does additional checks when deciding to reschedule on return from interrupt. So introduce arch_irqentry_exit_need_resched() in the need_resched() condition of the generic raw_irqentry_exit_cond_resched(), with a NOP default. This will allow ARM64 to implement the architecture specific version for switching over to the generic entry code. Suggested-by: Ada Couprie Diaz Suggested-by: Mark Rutland Suggested-by: Kevin Brodsky Suggested-by: Thomas Gleixner Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- kernel/entry/common.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 408d28b5179df8..f62e1d1b2063ea 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -143,6 +143,20 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } +/** + * arch_irqentry_exit_need_resched - Architecture specific need resched function + * + * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed. + * Defaults return true. + * + * The main purpose is to permit arch to avoid preemption of a task from an IRQ. 
+ */ +static inline bool arch_irqentry_exit_need_resched(void); + +#ifndef arch_irqentry_exit_need_resched +static inline bool arch_irqentry_exit_need_resched(void) { return true; } +#endif + void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { @@ -150,7 +164,7 @@ void raw_irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (need_resched() && arch_irqentry_exit_need_resched()) preempt_schedule_irq(); } } From 64f4b8b15f1c3c9a4e416fc5b5b4dc354b78e75e Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:31 +0800 Subject: [PATCH 1448/3206] arm64: entry: Refactor preempt_schedule_irq() check code To align the structure of the code with irqentry_exit_cond_resched() from the generic entry code, hoist the need_irq_preemption() and IS_ENABLED() check earlier. And different preemption check functions are defined based on whether dynamic preemption is enabled. Signed-off-by: Jinjie Ruan Reviewed-by: Ada Couprie Diaz Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/preempt.h | 6 ++++++ arch/arm64/kernel/entry-common.c | 37 +++++++++++++++++++------------- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h index 0159b625cc7f0e..c2437ea0790f64 100644 --- a/arch/arm64/include/asm/preempt.h +++ b/arch/arm64/include/asm/preempt.h @@ -85,6 +85,7 @@ static inline bool should_resched(int preempt_offset) void preempt_schedule(void); void preempt_schedule_notrace(void); +void raw_irqentry_exit_cond_resched(void); #ifdef CONFIG_PREEMPT_DYNAMIC DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); @@ -92,13 +93,18 @@ void dynamic_preempt_schedule(void); #define __preempt_schedule() dynamic_preempt_schedule() void dynamic_preempt_schedule_notrace(void); #define __preempt_schedule_notrace() dynamic_preempt_schedule_notrace() +void dynamic_irqentry_exit_cond_resched(void); +#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() #else /* CONFIG_PREEMPT_DYNAMIC */ #define __preempt_schedule() preempt_schedule() #define __preempt_schedule_notrace() preempt_schedule_notrace() +#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() #endif /* CONFIG_PREEMPT_DYNAMIC */ +#else /* CONFIG_PREEMPTION */ +#define irqentry_exit_cond_resched() {} #endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 1ba1d40fa6a724..64066c643f97fe 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -286,19 +286,8 @@ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, lockdep_hardirqs_on(CALLER_ADDR0); } -#ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -#define need_irq_preemption() \ - (static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) -#else -#define need_irq_preemption() (IS_ENABLED(CONFIG_PREEMPTION)) -#endif - static inline bool arm64_preempt_schedule_irq(void) { - if (!need_irq_preemption()) - return false; - /* * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC * priority masking is used the GIC irqchip driver will clear DAIF.IF @@ -682,6 +671,26 @@ static __always_inline void __el1_pnmi(struct pt_regs *regs, arm64_exit_nmi(regs, state); } +#ifdef CONFIG_PREEMPTION +void raw_irqentry_exit_cond_resched(void) +{ + if (!preempt_count()) { 
+ if (need_resched() && arm64_preempt_schedule_irq()) + preempt_schedule_irq(); + } +} +#endif + +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) + return; + raw_irqentry_exit_cond_resched(); +} +#endif + static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { @@ -693,10 +702,8 @@ static __always_inline void __el1_irq(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - if (!preempt_count() && need_resched()) { - if (arm64_preempt_schedule_irq()) - preempt_schedule_irq(); - } + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); exit_to_kernel_mode(regs, state); } From 99eb057ccd675b2f0fc71a362553164c65c349a2 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:32 +0800 Subject: [PATCH 1449/3206] arm64: entry: Move arm64_preempt_schedule_irq() into __exit_to_kernel_mode() The arm64 entry code only preempts a kernel context upon a return from a regular IRQ exception. The generic entry code may preempt a kernel context for any exception return where irqentry_exit() is used, and so may preempt other exceptions such as faults. In preparation for moving arm64 over to the generic entry code, align arm64 with the generic behaviour by calling arm64_preempt_schedule_irq() from exit_to_kernel_mode(). To make this possible, arm64_preempt_schedule_irq() and dynamic/raw_irqentry_exit_cond_resched() are moved earlier in the file, with no changes. As Mark pointed out, this change will have the following 2 key impact: - " We'll preempt even without taking a "real" interrupt. That shouldn't result in preemption that wasn't possible before, but it does change the probability of preempting at certain points, and might have a performance impact, so probably warrants a benchmark." - " We will not preempt when taking interrupts from a region of kernel code where IRQs are enabled but RCU is not watching, matching the behaviour of the generic entry code. This has the potential to introduce livelock if we can ever have a screaming interrupt in such a region, so we'll need to go figure out whether that's actually a problem. Having this as a separate patch will make it easier to test/bisect for that specifically." Reviewed-by: Ada Couprie Diaz Suggested-by: Mark Rutland Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 96 ++++++++++++++++---------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 64066c643f97fe..f52067d17baf3d 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -77,6 +77,51 @@ static noinstr arm64_irqentry_state_t enter_from_kernel_mode(struct pt_regs *reg return state; } +static inline bool arm64_preempt_schedule_irq(void) +{ + /* + * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC + * priority masking is used the GIC irqchip driver will clear DAIF.IF + * using gic_arch_enable_irqs() for normal IRQs. If anything is set in + * DAIF we must have handled an NMI, so skip preemption. + */ + if (system_uses_irq_prio_masking() && read_sysreg(daif)) + return false; + + /* + * Preempting a task from an IRQ means we leave copies of PSTATE + * on the stack. 
cpufeature's enable calls may modify PSTATE, but + * resuming one of these preempted tasks would undo those changes. + * + * Only allow a task to be preempted once cpufeatures have been + * enabled. + */ + if (!system_capabilities_finalized()) + return false; + + return true; +} + +#ifdef CONFIG_PREEMPTION +void raw_irqentry_exit_cond_resched(void) +{ + if (!preempt_count()) { + if (need_resched() && arm64_preempt_schedule_irq()) + preempt_schedule_irq(); + } +} +#endif + +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) + return; + raw_irqentry_exit_cond_resched(); +} +#endif + /* * Handle IRQ/context state management when exiting to kernel mode. * After this function returns it is not safe to call regular kernel code, @@ -99,6 +144,9 @@ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs, return; } + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); + trace_hardirqs_on(); } else { if (state.exit_rcu) @@ -286,31 +334,6 @@ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, lockdep_hardirqs_on(CALLER_ADDR0); } -static inline bool arm64_preempt_schedule_irq(void) -{ - /* - * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC - * priority masking is used the GIC irqchip driver will clear DAIF.IF - * using gic_arch_enable_irqs() for normal IRQs. If anything is set in - * DAIF we must have handled an NMI, so skip preemption. - */ - if (system_uses_irq_prio_masking() && read_sysreg(daif)) - return false; - - /* - * Preempting a task from an IRQ means we leave copies of PSTATE - * on the stack. cpufeature's enable calls may modify PSTATE, but - * resuming one of these preempted tasks would undo those changes. - * - * Only allow a task to be preempted once cpufeatures have been - * enabled. - */ - if (!system_capabilities_finalized()) - return false; - - return true; -} - static void do_interrupt_handler(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { @@ -671,26 +694,6 @@ static __always_inline void __el1_pnmi(struct pt_regs *regs, arm64_exit_nmi(regs, state); } -#ifdef CONFIG_PREEMPTION -void raw_irqentry_exit_cond_resched(void) -{ - if (!preempt_count()) { - if (need_resched() && arm64_preempt_schedule_irq()) - preempt_schedule_irq(); - } -} -#endif - -#ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -void dynamic_irqentry_exit_cond_resched(void) -{ - if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) - return; - raw_irqentry_exit_cond_resched(); -} -#endif - static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { @@ -702,9 +705,6 @@ static __always_inline void __el1_irq(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - if (IS_ENABLED(CONFIG_PREEMPTION)) - irqentry_exit_cond_resched(); - exit_to_kernel_mode(regs, state); } static void noinstr el1_interrupt(struct pt_regs *regs, From b3cf07851b6c4aa8683557905cd898da9ae8c634 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:33 +0800 Subject: [PATCH 1450/3206] arm64: entry: Switch to generic IRQ entry Currently, x86, Riscv and Loongarch use the generic entry code, which makes maintainer's work easier and code more elegant. 
Start converting arm64 to use the generic entry infrastructure from kernel/entry/* by switching it to generic IRQ entry, which removes 100+ lines of duplicate code. arm64 will completely switch to generic entry in a later series. The changes are below: - Remove *enter_from/exit_to_kernel_mode(), and wrap with generic irqentry_enter/exit() as their code and functionality are almost identical. - Define ARCH_EXIT_TO_USER_MODE_WORK and implement arch_exit_to_user_mode_work() to check arm64-specific thread flags "_TIF_MTE_ASYNC_FAULT" and "_TIF_FOREIGN_FPSTATE". So also remove *enter_from/exit_to_user_mode(), and wrap with generic enter_from/exit_to_user_mode() because they are exactly the same. - Remove arm64_enter/exit_nmi() and use generic irqentry_nmi_enter/exit() because they're exactly the same, so the temporary arm64 version irqentry_state can also be removed. - Remove PREEMPT_DYNAMIC code, as generic irqentry_exit_cond_resched() has the same functionality. - Implement arch_irqentry_exit_need_resched() with arm64_preempt_schedule_irq() for arm64 which will allow arm64 to do its architecture specific checks. Tested-by: Ada Couprie Diaz Suggested-by: Ada Couprie Diaz Suggested-by: Mark Rutland Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/entry-common.h | 57 ++++ arch/arm64/include/asm/exception.h | 1 - arch/arm64/include/asm/preempt.h | 8 - arch/arm64/kernel/entry-common.c | 378 +++++++------------------- arch/arm64/kernel/signal.c | 3 +- 6 files changed, 156 insertions(+), 292 deletions(-) create mode 100644 arch/arm64/include/asm/entry-common.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64d..6bb60a0620ecf4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -151,6 +151,7 @@ config ARM64 select GENERIC_EARLY_IOREMAP select GENERIC_IDLE_POLL_SETUP select GENERIC_IOREMAP + select GENERIC_IRQ_ENTRY select GENERIC_IRQ_IPI select GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD select GENERIC_IRQ_PROBE diff --git a/arch/arm64/include/asm/entry-common.h b/arch/arm64/include/asm/entry-common.h new file mode 100644 index 00000000000000..cab8cd78f69385 --- /dev/null +++ b/arch/arm64/include/asm/entry-common.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_ARM64_ENTRY_COMMON_H +#define _ASM_ARM64_ENTRY_COMMON_H + +#include + +#include +#include +#include +#include +#include + +#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_MTE_ASYNC_FAULT | _TIF_FOREIGN_FPSTATE) + +static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, + unsigned long ti_work) +{ + if (ti_work & _TIF_MTE_ASYNC_FAULT) { + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + send_sig_fault(SIGSEGV, SEGV_MTEAERR, (void __user *)NULL, current); + } + + if (ti_work & _TIF_FOREIGN_FPSTATE) + fpsimd_restore_current_state(); +} + +#define arch_exit_to_user_mode_work arch_exit_to_user_mode_work + +static inline bool arch_irqentry_exit_need_resched(void) +{ + /* + * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC + * priority masking is used the GIC irqchip driver will clear DAIF.IF + * using gic_arch_enable_irqs() for normal IRQs. If anything is set in + * DAIF we must have handled an NMI, so skip preemption. + */ + if (system_uses_irq_prio_masking() && read_sysreg(daif)) + return false; + + /* + * Preempting a task from an IRQ means we leave copies of PSTATE + * on the stack. 
cpufeature's enable calls may modify PSTATE, but + * resuming one of these preempted tasks would undo those changes. + * + * Only allow a task to be preempted once cpufeatures have been + * enabled. + */ + if (!system_capabilities_finalized()) + return false; + + return true; +} + +#define arch_irqentry_exit_need_resched arch_irqentry_exit_need_resched + +#endif /* _ASM_ARM64_ENTRY_COMMON_H */ diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index e3874c4fc399e5..a2da3cb21c244a 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -89,7 +89,6 @@ void do_el1_fpac(struct pt_regs *regs, unsigned long esr); void do_el0_mops(struct pt_regs *regs, unsigned long esr); void do_el1_mops(struct pt_regs *regs, unsigned long esr); void do_serror(struct pt_regs *regs, unsigned long esr); -void do_signal(struct pt_regs *regs); void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far); #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h index c2437ea0790f64..932ea4b6204289 100644 --- a/arch/arm64/include/asm/preempt.h +++ b/arch/arm64/include/asm/preempt.h @@ -2,7 +2,6 @@ #ifndef __ASM_PREEMPT_H #define __ASM_PREEMPT_H -#include #include #define PREEMPT_NEED_RESCHED BIT(32) @@ -85,26 +84,19 @@ static inline bool should_resched(int preempt_offset) void preempt_schedule(void); void preempt_schedule_notrace(void); -void raw_irqentry_exit_cond_resched(void); #ifdef CONFIG_PREEMPT_DYNAMIC -DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); void dynamic_preempt_schedule(void); #define __preempt_schedule() dynamic_preempt_schedule() void dynamic_preempt_schedule_notrace(void); #define __preempt_schedule_notrace() dynamic_preempt_schedule_notrace() -void dynamic_irqentry_exit_cond_resched(void); -#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() #else /* CONFIG_PREEMPT_DYNAMIC */ #define __preempt_schedule() preempt_schedule() #define __preempt_schedule_notrace() preempt_schedule_notrace() -#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() #endif /* CONFIG_PREEMPT_DYNAMIC */ -#else /* CONFIG_PREEMPTION */ -#define irqentry_exit_cond_resched() {} #endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index f52067d17baf3d..f546a914f04174 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -29,13 +30,6 @@ #include #include -typedef struct irqentry_state { - union { - bool exit_rcu; - bool lockdep; - }; -} arm64_irqentry_state_t; - /* * Handle IRQ/context state management when entering from kernel mode. * Before this function is called it is not safe to call regular kernel code, @@ -44,31 +38,14 @@ typedef struct irqentry_state { * This is intended to match the logic in irqentry_enter(), handling the kernel * mode transitions only. 
*/ -static __always_inline arm64_irqentry_state_t __enter_from_kernel_mode(struct pt_regs *regs) +static __always_inline irqentry_state_t __enter_from_kernel_mode(struct pt_regs *regs) { - arm64_irqentry_state_t state = { - .exit_rcu = false, - }; - - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { - lockdep_hardirqs_off(CALLER_ADDR0); - ct_irq_enter(); - trace_hardirqs_off_finish(); - - state.exit_rcu = true; - return state; - } - - lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter_check_tick(); - trace_hardirqs_off_finish(); - - return state; + return irqentry_enter(regs); } -static noinstr arm64_irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) +static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = __enter_from_kernel_mode(regs); mte_check_tfsr_entry(); @@ -77,51 +54,6 @@ static noinstr arm64_irqentry_state_t enter_from_kernel_mode(struct pt_regs *reg return state; } -static inline bool arm64_preempt_schedule_irq(void) -{ - /* - * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC - * priority masking is used the GIC irqchip driver will clear DAIF.IF - * using gic_arch_enable_irqs() for normal IRQs. If anything is set in - * DAIF we must have handled an NMI, so skip preemption. - */ - if (system_uses_irq_prio_masking() && read_sysreg(daif)) - return false; - - /* - * Preempting a task from an IRQ means we leave copies of PSTATE - * on the stack. cpufeature's enable calls may modify PSTATE, but - * resuming one of these preempted tasks would undo those changes. - * - * Only allow a task to be preempted once cpufeatures have been - * enabled. - */ - if (!system_capabilities_finalized()) - return false; - - return true; -} - -#ifdef CONFIG_PREEMPTION -void raw_irqentry_exit_cond_resched(void) -{ - if (!preempt_count()) { - if (need_resched() && arm64_preempt_schedule_irq()) - preempt_schedule_irq(); - } -} -#endif - -#ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -void dynamic_irqentry_exit_cond_resched(void) -{ - if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) - return; - raw_irqentry_exit_cond_resched(); -} -#endif - /* * Handle IRQ/context state management when exiting to kernel mode. * After this function returns it is not safe to call regular kernel code, @@ -131,31 +63,13 @@ void dynamic_irqentry_exit_cond_resched(void) * mode transitions only, and with preemption handled elsewhere. */ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs, - arm64_irqentry_state_t state) -{ - lockdep_assert_irqs_disabled(); - - if (!regs_irqs_disabled(regs)) { - if (state.exit_rcu) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - ct_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); - return; - } - - if (IS_ENABLED(CONFIG_PREEMPTION)) - irqentry_exit_cond_resched(); - - trace_hardirqs_on(); - } else { - if (state.exit_rcu) - ct_irq_exit(); - } + irqentry_state_t state) +{ + irqentry_exit(regs, state); } static void noinstr exit_to_kernel_mode(struct pt_regs *regs, - arm64_irqentry_state_t state) + irqentry_state_t state) { mte_check_tfsr_exit(); __exit_to_kernel_mode(regs, state); @@ -166,18 +80,15 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs, * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. 
*/ -static __always_inline void __enter_from_user_mode(void) +static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { - lockdep_hardirqs_off(CALLER_ADDR0); - CT_WARN_ON(ct_state() != CT_STATE_USER); - user_exit_irqoff(); - trace_hardirqs_off_finish(); + enter_from_user_mode(regs); mte_disable_tco_entry(current); } -static __always_inline void enter_from_user_mode(struct pt_regs *regs) +static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) { - __enter_from_user_mode(); + __enter_from_user_mode(regs); } /* @@ -185,116 +96,19 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static __always_inline void __exit_to_user_mode(void) -{ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - user_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); -} -static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) +static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { - do { - local_irq_enable(); - - if (thread_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) - schedule(); - - if (thread_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (thread_flags & _TIF_MTE_ASYNC_FAULT) { - clear_thread_flag(TIF_MTE_ASYNC_FAULT); - send_sig_fault(SIGSEGV, SEGV_MTEAERR, - (void __user *)NULL, current); - } - - if (thread_flags & _TIF_PATCH_PENDING) - klp_update_patch_state(current); - - if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - do_signal(regs); - - if (thread_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); - - if (thread_flags & _TIF_FOREIGN_FPSTATE) - fpsimd_restore_current_state(); - - local_irq_disable(); - thread_flags = read_thread_flags(); - } while (thread_flags & _TIF_WORK_MASK); -} - -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) -{ - unsigned long flags; - local_irq_disable(); - - flags = read_thread_flags(); - if (unlikely(flags & _TIF_WORK_MASK)) - do_notify_resume(regs, flags); - - local_daif_mask(); - - lockdep_sys_exit(); -} - -static __always_inline void exit_to_user_mode(struct pt_regs *regs) -{ exit_to_user_mode_prepare(regs); + local_daif_mask(); mte_check_tfsr_exit(); - __exit_to_user_mode(); + exit_to_user_mode(); } asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) { - exit_to_user_mode(regs); -} - -/* - * Handle IRQ/context state management when entering an NMI from user/kernel - * mode. Before this function is called it is not safe to call regular kernel - * code, instrumentable code, or any code which may trigger an exception. - */ -static noinstr arm64_irqentry_state_t arm64_enter_nmi(struct pt_regs *regs) -{ - arm64_irqentry_state_t state; - - state.lockdep = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - ct_nmi_enter(); - - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); - - return state; -} - -/* - * Handle IRQ/context state management when exiting an NMI from user/kernel - * mode. After this function returns it is not safe to call regular kernel - * code, instrumentable code, or any code which may trigger an exception. 
- */ -static void noinstr arm64_exit_nmi(struct pt_regs *regs, - arm64_irqentry_state_t state) -{ - ftrace_nmi_exit(); - if (state.lockdep) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - } - - ct_nmi_exit(); - lockdep_hardirq_exit(); - if (state.lockdep) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); + arm64_exit_to_user_mode(regs); } /* @@ -302,9 +116,9 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs, * kernel mode. Before this function is called it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static noinstr arm64_irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) +static noinstr irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) { - arm64_irqentry_state_t state; + irqentry_state_t state; state.lockdep = lockdep_hardirqs_enabled(); @@ -322,7 +136,7 @@ static noinstr arm64_irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) * kernel code, instrumentable code, or any code which may trigger an exception. */ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, - arm64_irqentry_state_t state) + irqentry_state_t state) { if (state.lockdep) { trace_hardirqs_on_prepare(); @@ -353,7 +167,7 @@ extern void (*handle_arch_fiq)(struct pt_regs *); static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, unsigned long esr) { - arm64_enter_nmi(regs); + irqentry_nmi_enter(regs); console_verbose(); @@ -504,7 +318,7 @@ UNHANDLED(el1t, 64, error) static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -516,7 +330,7 @@ static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -527,7 +341,7 @@ static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -538,7 +352,7 @@ static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -549,7 +363,7 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -560,7 +374,7 @@ static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -571,7 +385,7 @@ static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); @@ -582,7 +396,7 @@ static void noinstr el1_breakpt(struct 
pt_regs *regs, unsigned long esr) static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = arm64_enter_el1_dbg(regs); if (!cortex_a76_erratum_1463225_debug_handler(regs)) { @@ -604,7 +418,7 @@ static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr) { /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); - arm64_irqentry_state_t state; + irqentry_state_t state; state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); @@ -615,7 +429,7 @@ static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr) static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); @@ -626,7 +440,7 @@ static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr) static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -687,17 +501,17 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) static __always_inline void __el1_pnmi(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - arm64_irqentry_state_t state; + irqentry_state_t state; - state = arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_interrupt_handler(regs, handler); - arm64_exit_nmi(regs, state); + irqentry_nmi_exit(regs, state); } static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); @@ -731,22 +545,22 @@ asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); - arm64_irqentry_state_t state; + irqentry_state_t state; local_daif_restore(DAIF_ERRCTX); - state = arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_serror(regs, esr); - arm64_exit_nmi(regs, state); + irqentry_nmi_exit(regs, state); } static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) @@ -761,50 +575,50 @@ static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(far)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sve_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + 
arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sme_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_exc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_sys(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) @@ -814,58 +628,58 @@ static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(instruction_pointer(regs))) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(regs->sp, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_undef(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_undef(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_bti(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_bti(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_mops(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_gcs(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); bad_el0_sync(regs, 0, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr) @@ -873,12 +687,12 @@ static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(regs->pc)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); debug_exception_enter(regs); do_breakpoint(esr, regs); debug_exception_exit(regs); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) @@ -886,7 +700,7 @@ static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(regs->pc)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); /* * After handling a breakpoint, we suspend the breakpoint * and use single-step to move to the next instruction. 
@@ -897,7 +711,7 @@ static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) local_daif_restore(DAIF_PROCCTX); do_el0_softstep(esr, regs); } - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr) @@ -905,39 +719,39 @@ static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr) /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); debug_exception_enter(regs); do_watchpoint(far, esr, regs); debug_exception_exit(regs); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_brk64(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_svc(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); fpsimd_syscall_exit(); } static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_fpac(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) @@ -1011,7 +825,7 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) static void noinstr el0_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); write_sysreg(DAIF_PROCCTX_NOIRQ, daif); @@ -1022,7 +836,7 @@ static void noinstr el0_interrupt(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr __el0_irq_handler_common(struct pt_regs *regs) @@ -1048,15 +862,15 @@ asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) static void noinstr __el0_error_handler_common(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); - arm64_irqentry_state_t state; + irqentry_state_t state; - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_ERRCTX); - state = arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_serror(regs, esr); - arm64_exit_nmi(regs, state); + irqentry_nmi_exit(regs, state); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) @@ -1067,27 +881,27 @@ asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) #ifdef CONFIG_COMPAT static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_cp15(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_svc_compat(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); do_el0_svc_compat(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr 
el0_bkpt32(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_bkpt32(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) @@ -1166,7 +980,7 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) unsigned long esr = read_sysreg(esr_el1); unsigned long far = read_sysreg(far_el1); - arm64_enter_nmi(regs); + irqentry_nmi_enter(regs); panic_bad_stack(regs, esr, far); } @@ -1174,7 +988,7 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) asmlinkage noinstr unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { - arm64_irqentry_state_t state; + irqentry_state_t state; unsigned long ret; /* @@ -1199,9 +1013,9 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) else if (cpu_has_pan()) set_pstate_pan(0); - state = arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); ret = do_sdei_event(regs, arg); - arm64_exit_nmi(regs, state); + irqentry_nmi_exit(regs, state); return ret; } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index db3f972f8cd973..1110eeb21f572d 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1576,7 +1577,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -void do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; From 981dd162b63578aee34b5c68795e246734b76d70 Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Thu, 4 Sep 2025 11:05:29 -0500 Subject: [PATCH 1451/3206] regulator: bq257xx: Add bq257xx boost regulator driver Add support for the boost regulator found in the Texas Instruments BQ25703. The boost regulator is capable of outputting between 4.48 and 20.8 volts and between 0 and 6.35 amps. Signed-off-by: Chris Morgan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20250904160530.66178-5-macroalpha82@gmail.com Signed-off-by: Lee Jones --- drivers/regulator/Kconfig | 8 ++ drivers/regulator/Makefile | 1 + drivers/regulator/bq257xx-regulator.c | 186 ++++++++++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 drivers/regulator/bq257xx-regulator.c diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index eaa6df1c9f8066..60b14472c59959 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -297,6 +297,14 @@ config REGULATOR_BD96801 This driver can also be built as a module. If so, the module will be called bd96801-regulator. +config REGULATOR_BQ257XX + tristate "TI BQ257XX regulator family" + depends on MFD_BQ257XX + depends on GPIOLIB || COMPILE_TEST + help + Say Y to enable support for the boost regulator function of + the BQ257XX family of charger circuits. 
+ config REGULATOR_CPCAP tristate "Motorola CPCAP regulator" depends on MFD_CPCAP diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index be98b29d6675d8..e9ab5945b3c838 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_REGULATOR_BD71828) += bd71828-regulator.o obj-$(CONFIG_REGULATOR_BD718XX) += bd718x7-regulator.o obj-$(CONFIG_REGULATOR_BD9571MWV) += bd9571mwv-regulator.o obj-$(CONFIG_REGULATOR_BD957XMUF) += bd9576-regulator.o +obj-$(CONFIG_REGULATOR_BQ257XX) += bq257xx-regulator.o obj-$(CONFIG_REGULATOR_DA903X) += da903x-regulator.o obj-$(CONFIG_REGULATOR_BD96801) += bd96801-regulator.o obj-$(CONFIG_REGULATOR_DA9052) += da9052-regulator.o diff --git a/drivers/regulator/bq257xx-regulator.c b/drivers/regulator/bq257xx-regulator.c new file mode 100644 index 00000000000000..fc1ccede446882 --- /dev/null +++ b/drivers/regulator/bq257xx-regulator.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BQ257XX Battery Charger Driver + * Copyright (C) 2025 Chris Morgan + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bq257xx_reg_data { + struct bq257xx_device *bq; + struct regulator_dev *bq257xx_reg; + struct gpio_desc *otg_en_gpio; + struct regulator_desc desc; +}; + +static int bq25703_vbus_get_cur_limit(struct regulator_dev *rdev) +{ + struct bq257xx_reg_data *pdata = rdev_get_drvdata(rdev); + int ret; + unsigned int reg; + + ret = regmap_read(pdata->bq->regmap, BQ25703_OTG_CURRENT, ®); + if (ret) + return ret; + return FIELD_GET(BQ25703_OTG_CUR_MASK, reg) * BQ25703_OTG_CUR_STEP_UA; +} + +/* + * Check if the minimum current and maximum current requested are + * sane values, then set the register accordingly. + */ +static int bq25703_vbus_set_cur_limit(struct regulator_dev *rdev, + int min_uA, int max_uA) +{ + struct bq257xx_reg_data *pdata = rdev_get_drvdata(rdev); + unsigned int reg; + + if ((min_uA > BQ25703_OTG_CUR_MAX_UA) || (max_uA < 0)) + return -EINVAL; + + reg = (max_uA / BQ25703_OTG_CUR_STEP_UA); + + /* Catch rounding errors since our step is 50000uA. 
*/ + if ((reg * BQ25703_OTG_CUR_STEP_UA) < min_uA) + return -EINVAL; + + return regmap_write(pdata->bq->regmap, BQ25703_OTG_CURRENT, + FIELD_PREP(BQ25703_OTG_CUR_MASK, reg)); +} + +static int bq25703_vbus_enable(struct regulator_dev *rdev) +{ + struct bq257xx_reg_data *pdata = rdev_get_drvdata(rdev); + + if (pdata->otg_en_gpio) + gpiod_set_value_cansleep(pdata->otg_en_gpio, 1); + return regulator_enable_regmap(rdev); +} + +static int bq25703_vbus_disable(struct regulator_dev *rdev) +{ + struct bq257xx_reg_data *pdata = rdev_get_drvdata(rdev); + + if (pdata->otg_en_gpio) + gpiod_set_value_cansleep(pdata->otg_en_gpio, 0); + return regulator_disable_regmap(rdev); +} + +static const struct regulator_ops bq25703_vbus_ops = { + .enable = bq25703_vbus_enable, + .disable = bq25703_vbus_disable, + .is_enabled = regulator_is_enabled_regmap, + .list_voltage = regulator_list_voltage_linear, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_current_limit = bq25703_vbus_get_cur_limit, + .set_current_limit = bq25703_vbus_set_cur_limit, +}; + +static const struct regulator_desc bq25703_vbus_desc = { + .name = "vbus", + .of_match = of_match_ptr("vbus"), + .regulators_node = of_match_ptr("regulators"), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + .ops = &bq25703_vbus_ops, + .min_uV = BQ25703_OTG_VOLT_MIN_UV, + .uV_step = BQ25703_OTG_VOLT_STEP_UV, + .n_voltages = BQ25703_OTG_VOLT_NUM_VOLT, + .enable_mask = BQ25703_EN_OTG_MASK, + .enable_reg = BQ25703_CHARGE_OPTION_3, + .enable_val = BQ25703_EN_OTG_MASK, + .disable_val = 0, + .vsel_reg = BQ25703_OTG_VOLT, + .vsel_mask = BQ25703_OTG_VOLT_MASK, +}; + +/* Get optional GPIO for OTG regulator enable. */ +static void bq257xx_reg_dt_parse_gpio(struct platform_device *pdev) +{ + struct device_node *child, *subchild; + struct bq257xx_reg_data *pdata = platform_get_drvdata(pdev); + + child = of_get_child_by_name(pdev->dev.of_node, + pdata->desc.regulators_node); + if (!child) + return; + + subchild = of_get_child_by_name(child, pdata->desc.of_match); + if (!subchild) + return; + + of_node_put(child); + + pdata->otg_en_gpio = devm_fwnode_gpiod_get_index(&pdev->dev, + of_fwnode_handle(subchild), + "enable", 0, + GPIOD_OUT_LOW, + pdata->desc.of_match); + + of_node_put(subchild); + + if (IS_ERR(pdata->otg_en_gpio)) { + dev_err(&pdev->dev, "Error getting enable gpio: %ld\n", + PTR_ERR(pdata->otg_en_gpio)); + return; + } +} + +static int bq257xx_regulator_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct bq257xx_device *bq = dev_get_drvdata(pdev->dev.parent); + struct bq257xx_reg_data *pdata; + struct device_node *np = dev->of_node; + struct regulator_config cfg = {}; + + pdev->dev.of_node = pdev->dev.parent->of_node; + pdev->dev.of_node_reused = true; + + pdata = devm_kzalloc(&pdev->dev, sizeof(struct bq257xx_reg_data), GFP_KERNEL); + if (!pdata) + return -ENOMEM; + + pdata->bq = bq; + pdata->desc = bq25703_vbus_desc; + + platform_set_drvdata(pdev, pdata); + bq257xx_reg_dt_parse_gpio(pdev); + + cfg.dev = &pdev->dev; + cfg.driver_data = pdata; + cfg.of_node = np; + cfg.regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!cfg.regmap) + return -ENODEV; + + pdata->bq257xx_reg = devm_regulator_register(dev, &pdata->desc, &cfg); + if (IS_ERR(pdata->bq257xx_reg)) { + return dev_err_probe(&pdev->dev, PTR_ERR(pdata->bq257xx_reg), + "error registering bq257xx regulator"); + } + + return 0; +} + +static struct platform_driver bq257xx_reg_driver = { + .driver = { + .name = 
"bq257xx-regulator", + }, + .probe = bq257xx_regulator_probe, +}; + +module_platform_driver(bq257xx_reg_driver); + +MODULE_DESCRIPTION("bq257xx regulator driver"); +MODULE_AUTHOR("Chris Morgan "); +MODULE_LICENSE("GPL"); From a377b1be3a0ed51ccabfe22908ebde065848313c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 4 Sep 2025 18:26:12 -0700 Subject: [PATCH 1452/3206] mfd: tps6594: Explicitly include bitfield.h After a recent change that started using FIELD_GET() in tps6594-core.c, there is an error when bitfield.h is not implicitly included, such as when building allmodconfig for ARCH=hexagon: drivers/mfd/tps6594-core.c:767:7: error: call to undeclared function 'FIELD_GET'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 767 | if (FIELD_GET(TPS65224_MASK_EN_PB_VSENSE_CONFIG, pwr_on) == TPS65224_EN_SEL_PB || | ^ Explicitly include bitfield.h to resolve the errors. Reported-by: kernel test robot Fixes: d766ca01c208 ("mfd: tps6594: Add power button functionality") Signed-off-by: Nathan Chancellor Reviewed-by: Michael Walle Closes: https://lore.kernel.org/oe-kbuild-all/202509032356.LGa5hygM-lkp@intel.com/ Link: https://lore.kernel.org/r/20250904-mfd-tps6594-core-fix-bitfield-h-v1-1-5d0f00cfe58f@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/tps6594-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/tps6594-core.c b/drivers/mfd/tps6594-core.c index 7127af7142f54b..8b26c412747279 100644 --- a/drivers/mfd/tps6594-core.c +++ b/drivers/mfd/tps6594-core.c @@ -10,6 +10,7 @@ * Copyright (C) 2023 BayLibre Incorporated - https://www.baylibre.com/ */ +#include #include #include #include From 96e048fa11d6aedf4add4c2f93a8d06445948056 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 7 Sep 2025 12:16:09 +0200 Subject: [PATCH 1453/3206] leds: is31fl319x: Use devm_mutex_init() Use devm_mutex_init() instead of hand-writing it. This saves some LoC, improves readability and saves some space in the generated .o file. 
Before: ====== text data bss dec hex filename 20011 6752 128 26891 690b drivers/leds/leds-is31fl319x.o After: ===== text data bss dec hex filename 19715 6680 128 26523 679b drivers/leds/leds-is31fl319x.o Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/267aba6eab12be67c297fcd52fcf45a0856338bb.1757240150.git.christophe.jaillet@wanadoo.fr Signed-off-by: Lee Jones --- drivers/leds/leds-is31fl319x.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/leds/leds-is31fl319x.c b/drivers/leds/leds-is31fl319x.c index 27bfab3da47984..e411cee06dabde 100644 --- a/drivers/leds/leds-is31fl319x.c +++ b/drivers/leds/leds-is31fl319x.c @@ -483,11 +483,6 @@ static inline int is31fl3196_db_to_gain(u32 dezibel) return dezibel / IS31FL3196_AUDIO_GAIN_DB_STEP; } -static void is31f1319x_mutex_destroy(void *lock) -{ - mutex_destroy(lock); -} - static int is31fl319x_probe(struct i2c_client *client) { struct is31fl319x_chip *is31; @@ -503,8 +498,7 @@ static int is31fl319x_probe(struct i2c_client *client) if (!is31) return -ENOMEM; - mutex_init(&is31->lock); - err = devm_add_action_or_reset(dev, is31f1319x_mutex_destroy, &is31->lock); + err = devm_mutex_init(dev, &is31->lock); if (err) return err; From b0035df56dcd210d735e90ecd16817693f1a2ed9 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Thu, 11 Sep 2025 15:11:31 +0800 Subject: [PATCH 1454/3206] ALSA: hda/tas2781: Fix a potential race condition that causes a NULL pointer in case no efi.get_variable exists A potential race condition reported by one of my customers leads to a NULL pointer dereference; the call to efi.get_variable should be guarded with efi_rt_services_supported() to ensure that the function exists. Fixes: 4fe238513407 ("ALSA: hda/tas2781: Move and unified the calibrated-data getting function for SPI and I2C into the tas2781_hda lib") Signed-off-by: Shenghao Ding Signed-off-by: Takashi Iwai --- sound/hda/codecs/side-codecs/tas2781_hda.c | 5 +++++ sound/hda/codecs/side-codecs/tas2781_hda_i2c.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/sound/hda/codecs/side-codecs/tas2781_hda.c b/sound/hda/codecs/side-codecs/tas2781_hda.c index 536940c78f001d..96e6d82dc69eb1 100644 --- a/sound/hda/codecs/side-codecs/tas2781_hda.c +++ b/sound/hda/codecs/side-codecs/tas2781_hda.c @@ -193,6 +193,11 @@ int tas2781_save_calibration(struct tas2781_hda *hda) efi_status_t status; int i; + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { + dev_err(p->dev, "%s: NO EFI FOUND!\n", __func__); + return -EINVAL; + } + if (hda->catlog_id < LENOVO) efi_guid = tasdev_fct_efi_guid[hda->catlog_id]; diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c index 45a70fbf620537..b5b7a1e82b75e6 100644 --- a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c +++ b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c @@ -315,6 +315,11 @@ static int tas2563_save_calibration(struct tas2781_hda *h) unsigned int attr; int ret, i, j, k; + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { + dev_err(p->dev, "%s: NO EFI FOUND!\n", __func__); + return -EINVAL; + } + cd->cali_dat_sz_per_dev = TAS2563_CAL_DATA_SIZE * TASDEV_CALIB_N; /* extra byte for each device is the device number */ From 67a529b7d3c50a56c162476509361f4fe11350dd Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 15 Aug 2025 12:40:02 -0500 Subject: [PATCH 1455/3206] include: adi-axi-common: add version check function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit Add a version check function for checking ADI AXI IP core versions. These cores use a semantic versioning scheme, so it is useful to have a version check function that can check the minor version to enable features in driver while maintaining backward compatibility. Signed-off-by: David Lechner Reviewed-by: Nuno Sá Link: https://patch.msgid.link/20250815-spi-axi-spi-enigne-improve-version-checks-v1-1-13bde357d5b6@baylibre.com Signed-off-by: Mark Brown --- include/linux/adi-axi-common.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/include/linux/adi-axi-common.h b/include/linux/adi-axi-common.h index f64f4ad4bedae3..37962ba530dfc1 100644 --- a/include/linux/adi-axi-common.h +++ b/include/linux/adi-axi-common.h @@ -8,6 +8,8 @@ * https://wiki.analog.com/resources/fpga/docs/hdl/regmap */ +#include + #ifndef ADI_AXI_COMMON_H_ #define ADI_AXI_COMMON_H_ @@ -21,6 +23,25 @@ #define ADI_AXI_PCORE_VER_MINOR(version) (((version) >> 8) & 0xff) #define ADI_AXI_PCORE_VER_PATCH(version) ((version) & 0xff) +/** + * adi_axi_pcore_ver_gteq() - check if a version is satisfied + * @version: the full version read from the hardware + * @major: the major version to compare against + * @minor: the minor version to compare against + * + * ADI AXI IP Cores use semantic versioning, so this can be used to check for + * feature availability. + * + * Return: true if the version is greater than or equal to the specified + * major and minor version, false otherwise. + */ +static inline bool adi_axi_pcore_ver_gteq(u32 version, u32 major, u32 minor) +{ + return ADI_AXI_PCORE_VER_MAJOR(version) > (major) || + (ADI_AXI_PCORE_VER_MAJOR(version) == (major) && + ADI_AXI_PCORE_VER_MINOR(version) >= (minor)); +} + #define ADI_AXI_INFO_FPGA_TECH(info) (((info) >> 24) & 0xff) #define ADI_AXI_INFO_FPGA_FAMILY(info) (((info) >> 16) & 0xff) #define ADI_AXI_INFO_FPGA_SPEED_GRADE(info) (((info) >> 8) & 0xff) From 30db1b21fa37a2f37c7f4d71864405a05e889833 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 15 Aug 2025 12:40:03 -0500 Subject: [PATCH 1456/3206] spi: axi-spi-engine: use adi_axi_pcore_ver_gteq() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make use of the adi_axi_pcore_ver_gteq() helper to make version checks more readable and robust against a major version bump. Signed-off-by: David Lechner Reviewed-by: Nuno Sá Link: https://patch.msgid.link/20250815-spi-axi-spi-enigne-improve-version-checks-v1-2-13bde357d5b6@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 512d53a8ef4d14..e06f412190fd24 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -1050,7 +1050,7 @@ static int spi_engine_probe(struct platform_device *pdev) return -ENODEV; } - if (ADI_AXI_PCORE_VER_MINOR(version) >= 1) { + if (adi_axi_pcore_ver_gteq(version, 1, 1)) { unsigned int sizes = readl(spi_engine->base + SPI_ENGINE_REG_OFFLOAD_MEM_ADDR_WIDTH); @@ -1064,7 +1064,7 @@ static int spi_engine_probe(struct platform_device *pdev) } /* IP v1.5 dropped the requirement for SYNC in offload messages. 
*/ - spi_engine->offload_requires_sync = ADI_AXI_PCORE_VER_MINOR(version) < 5; + spi_engine->offload_requires_sync = !adi_axi_pcore_ver_gteq(version, 1, 5); writel_relaxed(0x00, spi_engine->base + SPI_ENGINE_REG_RESET); writel_relaxed(0xff, spi_engine->base + SPI_ENGINE_REG_INT_PENDING); @@ -1091,15 +1091,12 @@ static int spi_engine_probe(struct platform_device *pdev) host->put_offload = spi_engine_put_offload; host->num_chipselect = 8; - /* Some features depend of the IP core version. */ - if (ADI_AXI_PCORE_VER_MAJOR(version) >= 1) { - if (ADI_AXI_PCORE_VER_MINOR(version) >= 2) { - host->mode_bits |= SPI_CS_HIGH; - host->setup = spi_engine_setup; - } - if (ADI_AXI_PCORE_VER_MINOR(version) >= 3) - host->mode_bits |= SPI_MOSI_IDLE_LOW | SPI_MOSI_IDLE_HIGH; + if (adi_axi_pcore_ver_gteq(version, 1, 2)) { + host->mode_bits |= SPI_CS_HIGH; + host->setup = spi_engine_setup; } + if (adi_axi_pcore_ver_gteq(version, 1, 3)) + host->mode_bits |= SPI_MOSI_IDLE_LOW | SPI_MOSI_IDLE_HIGH; if (host->max_speed_hz == 0) return dev_err_probe(&pdev->dev, -EINVAL, "spi_clk rate is 0"); From 1fcf686def19064a7b5cfaeb28c1f1a119900a2b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 12 Sep 2025 03:27:11 +0800 Subject: [PATCH 1457/3206] erofs: fix long xattr name prefix placement Currently, xattr name prefixes are forcibly placed into the packed inode if the fragments feature is enabled, and users have no option to put them in plain form directly on disk. This is inflexible. First, as mentioned above, users should be able to store unwrapped long xattr name prefixes unconditionally (COMPAT_PLAIN_XATTR_PFX). Second, since we now have the new metabox inode to store metadata, it should be used when available instead of the packed inode. Fixes: 414091322c63 ("erofs: implement metadata compression") Signed-off-by: Gao Xiang --- fs/erofs/erofs_fs.h | 8 +++++--- fs/erofs/internal.h | 1 + fs/erofs/xattr.c | 13 ++++++++++--- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 377ee12b8b9667..3d5738f80072ee 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -12,10 +12,12 @@ /* to allow for x86 boot sectors and other oddities. 
*/ #define EROFS_SUPER_OFFSET 1024 -#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 -#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 -#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 +#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 +#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 #define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008 +#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010 + /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 4ccc5f0ee8dfb9..9319c66e86c3c3 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -234,6 +234,7 @@ EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX) +EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX) static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid) { diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index eaa9efd766eea7..396536d9a86216 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -482,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2; struct erofs_xattr_prefix_item *pfs; int ret = 0, i, len; + bool plain = erofs_sb_has_plain_xattr_pfx(sbi); if (!sbi->xattr_prefix_count) return 0; @@ -490,9 +491,15 @@ int erofs_xattr_prefixes_init(struct super_block *sb) if (!pfs) return -ENOMEM; - if (sbi->packed_inode) - buf.mapping = sbi->packed_inode->i_mapping; - else + if (!plain) { + if (erofs_sb_has_metabox(sbi)) + (void)erofs_init_metabuf(&buf, sb, true); + else if (sbi->packed_inode) + buf.mapping = sbi->packed_inode->i_mapping; + else + plain = true; + } + if (plain) (void)erofs_init_metabuf(&buf, sb, false); for (i = 0; i < sbi->xattr_prefix_count; i++) { From 0460484244e1fa4a0cdbc9ed76c838fc528134d3 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 11 Sep 2025 14:58:00 +0000 Subject: [PATCH 1458/3206] bpf: arm64: simplify exception table handling BPF loads with BPF_PROBE_MEM(SX) can load from unsafe pointers, and the JIT adds an exception table entry for the JITed instruction, which allows the exception handler to set the destination register of the load to zero and continue execution from the next instruction. As all arm64 instructions are AARCH64_INSN_SIZE bytes in size, the exception handler can just increment the pc by AARCH64_INSN_SIZE without needing the exact address of the instruction following the faulting instruction. Simplify the exception table usage in the arm64 JIT by only saving the destination register in ex->fixup and dropping everything related to the fixup_offset. The fault handler is modified to add AARCH64_INSN_SIZE to the pc.
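As an illustration of what remains in ex->fixup after this simplification, here is a standalone sketch (the DEMO_ names are assumptions for illustration; the mask layout and the DONT_CLEAR convention mirror the patch, and arm64 kernel context is assumed for pt_regs and AARCH64_INSN_SIZE):

        #include <linux/bitfield.h>

        #define DEMO_FIXUP_REG_MASK     GENMASK(31, 27)
        #define DEMO_DONT_CLEAR         5 /* unused ARM64 register from BPF's POV */

        /* JIT side: only the destination register is recorded now. */
        static u32 demo_pack_fixup(int dst_reg, bool is_load)
        {
                if (!is_load)
                        dst_reg = DEMO_DONT_CLEAR;
                return FIELD_PREP(DEMO_FIXUP_REG_MASK, dst_reg);
        }

        /* Fault side: zero the register, then step over the fixed-size insn. */
        static void demo_handle_fault(struct pt_regs *regs, u32 fixup)
        {
                int dst_reg = FIELD_GET(DEMO_FIXUP_REG_MASK, fixup);

                if (dst_reg != DEMO_DONT_CLEAR)
                        regs->regs[dst_reg] = 0;
                regs->pc += AARCH64_INSN_SIZE;
        }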
Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song Acked-by: Kumar Kartikeya Dwivedi Acked-by: Xu Kuohai Link: https://lore.kernel.org/r/20250911145808.58042-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 25 +++----------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index f0b1cb2c3bc485..e6d1fdc1e6f52a 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1066,19 +1066,18 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic) emit(A64_RET(A64_LR), ctx); } -#define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) #define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) { - off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); if (dst_reg != DONT_CLEAR) regs->regs[dst_reg] = 0; - regs->pc = (unsigned long)&ex->fixup - offset; + /* Skip the faulting instruction */ + regs->pc += AARCH64_INSN_SIZE; return true; } @@ -1088,7 +1087,6 @@ static int add_exception_handler(const struct bpf_insn *insn, int dst_reg) { off_t ins_offset; - off_t fixup_offset; unsigned long pc; struct exception_table_entry *ex; @@ -1119,22 +1117,6 @@ static int add_exception_handler(const struct bpf_insn *insn, if (WARN_ON_ONCE(ins_offset >= 0 || ins_offset < INT_MIN)) return -ERANGE; - /* - * Since the extable follows the program, the fixup offset is always - * negative and limited to BPF_JIT_REGION_SIZE. Store a positive value - * to keep things simple, and put the destination register in the upper - * bits. We don't need to worry about buildtime or runtime sort - * modifying the upper bits because the table is already sorted, and - * isn't part of the main exception table. - * - * The fixup_offset is set to the next instruction from the instruction - * that may fault. The execution will jump to this after handling the - * fault. - */ - fixup_offset = (long)&ex->fixup - (pc + AARCH64_INSN_SIZE); - if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset)) - return -ERANGE; - /* * The offsets above have been calculated using the RO buffer but we * need to use the R/W buffer for writes. @@ -1147,8 +1129,7 @@ static int add_exception_handler(const struct bpf_insn *insn, if (BPF_CLASS(insn->code) != BPF_LDX) dst_reg = DONT_CLEAR; - ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) | - FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + ex->fixup = FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); ex->type = EX_TYPE_BPF; From 70f23546d246563da648baedbb0432ba1d6bb357 Mon Sep 17 00:00:00 2001 From: Johannes Nixdorf Date: Thu, 11 Sep 2025 14:58:01 +0000 Subject: [PATCH 1459/3206] bpf: core: introduce main_prog_aux for stream access BPF streams are only valid for main programs. To make it easier to access streams from subprogs, introduce main_prog_aux in struct bpf_prog_aux. prog->aux->main_prog_aux = prog->aux, for main programs, and prog->aux->main_prog_aux = main_prog->aux, for subprograms. Make bpf_prog_find_from_stack() use the added main_prog_aux to return the main prog when a subprog is found on the stack.
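A minimal sketch of the invariant this patch establishes (the demo helper is hypothetical, not part of the patch): any program, main or subprog, now resolves to its main program in one step.

        /*
         * For a main program aux->main_prog_aux == aux, so this is an
         * identity mapping; for a subprogram it yields the main prog.
         */
        static struct bpf_prog *demo_main_prog(struct bpf_prog *prog)
        {
                return prog->aux->main_prog_aux->prog;
        }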
Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250911145808.58042-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 6 +++--- kernel/bpf/verifier.c | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8f6e87f0f3a89d..d133171c4d2a9a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1633,6 +1633,7 @@ struct bpf_prog_aux { /* function name for valid attach_btf_id */ const char *attach_func_name; struct bpf_prog **func; + struct bpf_prog_aux *main_prog_aux; void *jit_data; /* JIT specific data. arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e6801945910672..1cda2589d4b35c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -120,6 +120,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->pages = size / PAGE_SIZE; fp->aux = aux; + fp->aux->main_prog_aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); @@ -3297,9 +3298,8 @@ static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) rcu_read_unlock(); if (!prog) return true; - if (bpf_is_subprog(prog)) - return true; - ctxp->prog = prog; + /* Make sure we return the main prog if we found a subprog */ + ctxp->prog = prog->aux->main_prog_aux->prog; return false; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b8b510a25b0315..17fe623400a5bc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21601,6 +21601,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; + func[i]->aux->main_prog_aux = prog->aux; for (j = 0; j < prog->aux->size_poke_tab; j++) { struct bpf_jit_poke_descriptor *poke; From 5c5240d020615f13331f4e2c559186125eddc7d3 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 11 Sep 2025 14:58:02 +0000 Subject: [PATCH 1460/3206] bpf: Report arena faults to BPF stderr Begin reporting arena page faults and the faulting address to the BPF program's stderr. This patch adds support in the arm64 and x86-64 JITs; support for other archs can be added later. The fault handlers receive the 32-bit address in the arena region, so the upper 32 bits of user_vm_start are added to it before printing the address. This is what the user would expect to see, as this is what bpf_printk() prints if you pass it an address returned by bpf_arena_alloc_pages(). Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250911145808.58042-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 60 +++++++++++++++++++++++++ arch/x86/net/bpf_jit_comp.c | 85 ++++++++++++++++++++++++++++++++--- include/linux/bpf.h | 6 +++ kernel/bpf/arena.c | 30 +++++++++++++ 4 files changed, 176 insertions(+), 5 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index e6d1fdc1e6f52a..008273a53e0469 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1066,6 +1066,30 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic) emit(A64_RET(A64_LR), ctx); } +/* + * Metadata encoding for exception handling in JITed code.
+ * + * Format of `fixup` field in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+-----------+----------+ + * | 31-27 | 26-22 | 21 | 20-16 | 15-0 | + * | | | | | | + * | FIXUP_REG | Unused | ARENA_ACC | ARENA_REG | OFFSET | + * +-----------+--------+-----------+-----------+----------+ + * + * - OFFSET (16 bits): Offset used to compute address for Load/Store instruction. + * - ARENA_REG (5 bits): Register that is used to calculate the address for load/store when + * accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * - FIXUP_REG (5 bits): Destination register for the load instruction (cleared on fault) or set to + * DONT_CLEAR if it is a store instruction. + */ + +#define BPF_FIXUP_OFFSET_MASK GENMASK(15, 0) +#define BPF_FIXUP_ARENA_REG_MASK GENMASK(20, 16) +#define BPF_ARENA_ACCESS BIT(21) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) #define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ @@ -1073,11 +1097,22 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) { int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); + s16 off = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); + int arena_reg = FIELD_GET(BPF_FIXUP_ARENA_REG_MASK, ex->fixup); + bool is_arena = !!(ex->fixup & BPF_ARENA_ACCESS); + bool is_write = (dst_reg == DONT_CLEAR); + unsigned long addr; + + if (is_arena) { + addr = regs->regs[arena_reg] + off; + bpf_prog_report_arena_violation(is_write, addr, regs->pc); + } if (dst_reg != DONT_CLEAR) regs->regs[dst_reg] = 0; /* Skip the faulting instruction */ regs->pc += AARCH64_INSN_SIZE; + return true; } @@ -1087,6 +1122,9 @@ static int add_exception_handler(const struct bpf_insn *insn, int dst_reg) { off_t ins_offset; + s16 off = insn->off; + bool is_arena; + int arena_reg; unsigned long pc; struct exception_table_entry *ex; @@ -1100,6 +1138,9 @@ static int add_exception_handler(const struct bpf_insn *insn, BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; + is_arena = (BPF_MODE(insn->code) == BPF_PROBE_MEM32) || + (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC); + if (!ctx->prog->aux->extable || WARN_ON_ONCE(ctx->exentry_idx >= ctx->prog->aux->num_exentries)) return -EINVAL; @@ -1131,6 +1172,25 @@ static int add_exception_handler(const struct bpf_insn *insn, ex->fixup = FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + if (is_arena) { + ex->fixup |= BPF_ARENA_ACCESS; + /* + * insn->src_reg/dst_reg holds the address in the arena region with upper 32-bits + * being zero because of a preceding addr_space_cast(r, 0x0, 0x1) instruction. + * This address is adjusted with the addition of arena_vm_start (see the + * implementation of BPF_PROBE_MEM32 and BPF_PROBE_ATOMIC) before being used for the + * memory access. Pass the reg holding the unmodified 32-bit address to + * ex_handler_bpf. 
+ */ + if (BPF_CLASS(insn->code) == BPF_LDX) + arena_reg = bpf2a64[insn->src_reg]; + else + arena_reg = bpf2a64[insn->dst_reg]; + + ex->fixup |= FIELD_PREP(BPF_FIXUP_OFFSET_MASK, off) | + FIELD_PREP(BPF_FIXUP_ARENA_REG_MASK, arena_reg); + } + ex->type = EX_TYPE_BPF; ctx->exentry_idx++; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7e3fca1646203c..8d34a9400a5e49 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1388,16 +1389,67 @@ static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size, return 0; } +/* + * Metadata encoding for exception handling in JITed code. + * + * Format of `fixup` and `data` fields in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+---------+----------+ + * | 31 | 30-24 | 23-16 | 15-8 | 7-0 | + * | | | | | | + * | ARENA_ACC | Unused | ARENA_REG | DST_REG | INSN_LEN | + * +-----------+--------+-----------+---------+----------+ + * + * - INSN_LEN (8 bits): Length of faulting insn (max x86 insn = 15 bytes (fits in 8 bits)). + * - DST_REG (8 bits): Offset of dst_reg from reg2pt_regs[] (max offset = 112 (fits in 8 bits)). + * This is set to DONT_CLEAR if the insn is a store. + * - ARENA_REG (8 bits): Offset of the register that is used to calculate the + * address for load/store when accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * + * Bit layout of `data` (32-bit): + * + * +--------------+--------+--------------+ + * | 31-16 | 15-8 | 7-0 | + * | | | | + * | ARENA_OFFSET | Unused | EX_TYPE_BPF | + * +--------------+--------+--------------+ + * + * - ARENA_OFFSET (16 bits): Offset used to calculate the address for load/store when + * accessing the arena region. + */ + #define DONT_CLEAR 1 +#define FIXUP_INSN_LEN_MASK GENMASK(7, 0) +#define FIXUP_REG_MASK GENMASK(15, 8) +#define FIXUP_ARENA_REG_MASK GENMASK(23, 16) +#define FIXUP_ARENA_ACCESS BIT(31) +#define DATA_ARENA_OFFSET_MASK GENMASK(31, 16) bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { - u32 reg = x->fixup >> 8; + u32 reg = FIELD_GET(FIXUP_REG_MASK, x->fixup); + u32 insn_len = FIELD_GET(FIXUP_INSN_LEN_MASK, x->fixup); + bool is_arena = !!(x->fixup & FIXUP_ARENA_ACCESS); + bool is_write = (reg == DONT_CLEAR); + unsigned long addr; + s16 off; + u32 arena_reg; + + if (is_arena) { + arena_reg = FIELD_GET(FIXUP_ARENA_REG_MASK, x->fixup); + off = FIELD_GET(DATA_ARENA_OFFSET_MASK, x->data); + addr = *(unsigned long *)((void *)regs + arena_reg) + off; + bpf_prog_report_arena_violation(is_write, addr, regs->ip); + } /* jump over faulting load and clear dest register */ if (reg != DONT_CLEAR) *(unsigned long *)((void *)regs + reg) = 0; - regs->ip += x->fixup & 0xff; + regs->ip += insn_len; + return true; } @@ -2070,6 +2122,7 @@ st: if (is_imm8(insn->off)) { struct exception_table_entry *ex; u8 *_insn = image + proglen + (start_of_ldx - temp); + u32 arena_reg, fixup_reg; s64 delta; if (!bpf_prog->aux->extable) @@ -2089,8 +2142,29 @@ st: if (is_imm8(insn->off)) ex->data = EX_TYPE_BPF; - ex->fixup = (prog - start_of_ldx) | - ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8); + /* + * src_reg/dst_reg holds the address in the arena region with upper + * 32-bits being zero because of a preceding addr_space_cast(r, + * 0x0, 0x1) instruction. 
This address is adjusted with the addition + * of arena_vm_start (see the implementation of BPF_PROBE_MEM32 and + * BPF_PROBE_ATOMIC) before being used for the memory access. Pass + * the reg holding the unmodified 32-bit address to + * ex_handler_bpf(). + */ + if (BPF_CLASS(insn->code) == BPF_LDX) { + arena_reg = reg2pt_regs[src_reg]; + fixup_reg = reg2pt_regs[dst_reg]; + } else { + arena_reg = reg2pt_regs[dst_reg]; + fixup_reg = DONT_CLEAR; + } + + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_ARENA_REG_MASK, arena_reg) | + FIELD_PREP(FIXUP_REG_MASK, fixup_reg); + ex->fixup |= FIXUP_ARENA_ACCESS; + + ex->data |= FIELD_PREP(DATA_ARENA_OFFSET_MASK, insn->off); } break; @@ -2208,7 +2282,8 @@ st: if (is_imm8(insn->off)) * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" * of 4 bytes will be ignored and rbx will be zero inited. */ - ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8); + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_REG_MASK, reg2pt_regs[dst_reg]); } break; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d133171c4d2a9a..41f776071ff517 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2881,6 +2881,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) @@ -3168,6 +3169,11 @@ static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { } + +static inline void bpf_prog_report_arena_violation(bool write, unsigned long addr, + unsigned long fault_ip) +{ +} #endif /* CONFIG_BPF_SYSCALL */ static __always_inline int diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 5b37753799d201..1074ac4459f2ca 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -633,3 +633,33 @@ static int __init kfunc_init(void) return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); + +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +{ + struct bpf_stream_stage ss; + struct bpf_prog *prog; + u64 user_vm_start; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it will not + * disappear while we are handling the fault. + */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(fault_ip); + rcu_read_unlock(); + if (!prog) + return; + + /* Use main prog for stream access */ + prog = prog->aux->main_prog_aux->prog; + + user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); + addr += clear_lo32(user_vm_start); + + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n", + write ? "WRITE" : "READ", addr); + bpf_stream_dump_stack(ss); + })); +} From 744eeb2b27c29a61ad13b6b6367636b233e32336 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 11 Sep 2025 14:58:03 +0000 Subject: [PATCH 1461/3206] selftests: bpf: introduce __stderr and __stdout Add __stderr and __stdout to validate the output of BPF streams for bpf selftests. 
Similar to __xlated, __jited, etc., __stderr/out can be used in the BPF progs to compare a string (regex supported) to the output in the bpf streams. Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250911145808.58042-5-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 10 +++ tools/testing/selftests/bpf/test_loader.c | 90 ++++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 72c2d72a245e5b..7905396c9cc4e2 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -35,6 +35,12 @@ * inside the brackets. * __msg_unpriv Same as __msg but for unprivileged mode. * + * __stderr Message expected to be found in bpf stderr stream. The + * same regex rules apply like __msg. + * __stderr_unpriv Same as __stderr but for unpriveleged mode. + * __stdout Same as __stderr but for stdout stream. + * __stdout_unpriv Same as __stdout but for unpriveleged mode. + * * __xlated Expect a line in a disassembly log after verifier applies rewrites. * Multiple __xlated attributes could be specified. * Regular expressions could be specified same way as in __msg. @@ -140,6 +146,10 @@ #define __caps_unpriv(caps) __attribute__((btf_decl_tag("comment:test_caps_unpriv=" EXPAND_QUOTE(caps)))) #define __load_if_JITed() __attribute__((btf_decl_tag("comment:load_mode=jited"))) #define __load_if_no_JITed() __attribute__((btf_decl_tag("comment:load_mode=no_jited"))) +#define __stderr(msg) __attribute__((btf_decl_tag("comment:test_expect_stderr=" XSTR(__COUNTER__) "=" msg))) +#define __stderr_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stderr_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __stdout(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout=" XSTR(__COUNTER__) "=" msg))) +#define __stdout_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout_unpriv=" XSTR(__COUNTER__) "=" msg))) /* Define common capabilities tested using __caps_unpriv */ #define CAP_NET_ADMIN 12 diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 9f684d0dc5b4ba..e065b467d5090e 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -38,6 +38,10 @@ #define TEST_TAG_JITED_PFX_UNPRIV "comment:test_jited_unpriv=" #define TEST_TAG_CAPS_UNPRIV "comment:test_caps_unpriv=" #define TEST_TAG_LOAD_MODE_PFX "comment:load_mode=" +#define TEST_TAG_EXPECT_STDERR_PFX "comment:test_expect_stderr=" +#define TEST_TAG_EXPECT_STDERR_PFX_UNPRIV "comment:test_expect_stderr_unpriv=" +#define TEST_TAG_EXPECT_STDOUT_PFX "comment:test_expect_stdout=" +#define TEST_TAG_EXPECT_STDOUT_PFX_UNPRIV "comment:test_expect_stdout_unpriv=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xbadcafe @@ -79,6 +83,8 @@ struct test_subspec { struct expected_msgs expect_msgs; struct expected_msgs expect_xlated; struct expected_msgs jited; + struct expected_msgs stderr; + struct expected_msgs stdout; int retval; bool execute; __u64 caps; @@ -139,6 +145,10 @@ static void free_test_spec(struct test_spec *spec) free_msgs(&spec->unpriv.expect_xlated); free_msgs(&spec->priv.jited); free_msgs(&spec->unpriv.jited); + free_msgs(&spec->unpriv.stderr); + free_msgs(&spec->priv.stderr); + free_msgs(&spec->unpriv.stdout); + free_msgs(&spec->priv.stdout); free(spec->priv.name); free(spec->unpriv.name); 
@@ -407,6 +417,10 @@ static int parse_test_spec(struct test_loader *tester, bool xlated_on_next_line = true; bool unpriv_jit_on_next_line; bool jit_on_next_line; + bool stderr_on_next_line = true; + bool unpriv_stderr_on_next_line = true; + bool stdout_on_next_line = true; + bool unpriv_stdout_on_next_line = true; bool collect_jit = false; int func_id, i, err = 0; u32 arch_mask = 0; @@ -598,6 +612,26 @@ static int parse_test_spec(struct test_loader *tester, err = -EINVAL; goto cleanup; } + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDERR_PFX))) { + err = push_disasm_msg(msg, &stderr_on_next_line, + &spec->priv.stderr); + if (err) + goto cleanup; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDERR_PFX_UNPRIV))) { + err = push_disasm_msg(msg, &unpriv_stderr_on_next_line, + &spec->unpriv.stderr); + if (err) + goto cleanup; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDOUT_PFX))) { + err = push_disasm_msg(msg, &stdout_on_next_line, + &spec->priv.stdout); + if (err) + goto cleanup; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDOUT_PFX_UNPRIV))) { + err = push_disasm_msg(msg, &unpriv_stdout_on_next_line, + &spec->unpriv.stdout); + if (err) + goto cleanup; } } @@ -651,6 +685,10 @@ static int parse_test_spec(struct test_loader *tester, clone_msgs(&spec->priv.expect_xlated, &spec->unpriv.expect_xlated); if (spec->unpriv.jited.cnt == 0) clone_msgs(&spec->priv.jited, &spec->unpriv.jited); + if (spec->unpriv.stderr.cnt == 0) + clone_msgs(&spec->priv.stderr, &spec->unpriv.stderr); + if (spec->unpriv.stdout.cnt == 0) + clone_msgs(&spec->priv.stdout, &spec->unpriv.stdout); } spec->valid = true; @@ -712,6 +750,20 @@ static void emit_jited(const char *jited, bool force) fprintf(stdout, "JITED:\n=============\n%s=============\n", jited); } +static void emit_stderr(const char *stderr, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "STDERR:\n=============\n%s=============\n", stderr); +} + +static void emit_stdout(const char *bpf_stdout, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "STDOUT:\n=============\n%s=============\n", bpf_stdout); +} + static void validate_msgs(char *log_buf, struct expected_msgs *msgs, void (*emit_fn)(const char *buf, bool force)) { @@ -934,6 +986,19 @@ static int get_xlated_program_text(int prog_fd, char *text, size_t text_sz) return err; } +/* Read the bpf stream corresponding to the stream_id */ +static int get_stream(int stream_id, int prog_fd, char *text, size_t text_sz) +{ + LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); + int ret; + + ret = bpf_prog_stream_read(prog_fd, stream_id, text, text_sz, &ropts); + ASSERT_GT(ret, 0, "stream read"); + text[ret] = '\0'; + + return ret; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -1108,6 +1173,31 @@ void run_subtest(struct test_loader *tester, PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval); goto tobj_cleanup; } + + if (subspec->stderr.cnt) { + err = get_stream(2, bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err <= 0) { + PRINT_FAIL("Unexpected retval from get_stream(): %d, errno = %d\n", + err, errno); + goto tobj_cleanup; + } + emit_stderr(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->stderr, emit_stderr); + } + + if (subspec->stdout.cnt) { + err = get_stream(1, bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err 
<= 0) { + PRINT_FAIL("Unexpected retval from get_stream(): %d, errno = %d\n", + err, errno); + goto tobj_cleanup; + } + emit_stdout(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->stdout, emit_stdout); + } + /* redo bpf_map__attach_struct_ops for each test */ while (links_cnt > 0) bpf_link__destroy(links[--links_cnt]); From edd03fcd7601ce41068c183875c2cd3471a49f9a Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 11 Sep 2025 14:58:04 +0000 Subject: [PATCH 1462/3206] selftests: bpf: use __stderr in stream error tests Start using __stderr directly in the bpf programs to test the reporting of may_goto timeout detection and spin_lock dead lock detection. Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250911145808.58042-6-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/stream.c | 82 ------------------- tools/testing/selftests/bpf/progs/stream.c | 17 ++++ 2 files changed, 17 insertions(+), 82 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/stream.c b/tools/testing/selftests/bpf/prog_tests/stream.c index 9d0e5d93edee73..6f8eac5ccb6555 100644 --- a/tools/testing/selftests/bpf/prog_tests/stream.c +++ b/tools/testing/selftests/bpf/prog_tests/stream.c @@ -2,7 +2,6 @@ /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ #include #include -#include #include "stream.skel.h" #include "stream_fail.skel.h" @@ -18,87 +17,6 @@ void test_stream_success(void) return; } -struct { - int prog_off; - const char *errstr; -} stream_error_arr[] = { - { - offsetof(struct stream, progs.stream_cond_break), - "ERROR: Timeout detected for may_goto instruction\n" - "CPU: [0-9]+ UID: 0 PID: [0-9]+ Comm: .*\n" - "Call trace:\n" - "([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" - "|[ \t]+[^\n]+\n)*", - }, - { - offsetof(struct stream, progs.stream_deadlock), - "ERROR: AA or ABBA deadlock detected for bpf_res_spin_lock\n" - "Attempted lock = (0x[0-9a-fA-F]+)\n" - "Total held locks = 1\n" - "Held lock\\[ 0\\] = \\1\n" // Lock address must match - "CPU: [0-9]+ UID: 0 PID: [0-9]+ Comm: .*\n" - "Call trace:\n" - "([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" - "|[ \t]+[^\n]+\n)*", - }, -}; - -static int match_regex(const char *pattern, const char *string) -{ - int err, rc; - regex_t re; - - err = regcomp(&re, pattern, REG_EXTENDED | REG_NEWLINE); - if (err) - return -1; - rc = regexec(&re, string, 0, NULL, 0); - regfree(&re); - return rc == 0 ? 
1 : 0; -} - -void test_stream_errors(void) -{ - LIBBPF_OPTS(bpf_test_run_opts, opts); - LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); - struct stream *skel; - int ret, prog_fd; - char buf[1024]; - - skel = stream__open_and_load(); - if (!ASSERT_OK_PTR(skel, "stream__open_and_load")) - return; - - for (int i = 0; i < ARRAY_SIZE(stream_error_arr); i++) { - struct bpf_program **prog; - - prog = (struct bpf_program **)(((char *)skel) + stream_error_arr[i].prog_off); - prog_fd = bpf_program__fd(*prog); - ret = bpf_prog_test_run_opts(prog_fd, &opts); - ASSERT_OK(ret, "ret"); - ASSERT_OK(opts.retval, "retval"); - -#if !defined(__x86_64__) && !defined(__s390x__) && !defined(__aarch64__) - ASSERT_TRUE(1, "Timed may_goto unsupported, skip."); - if (i == 0) { - ret = bpf_prog_stream_read(prog_fd, 2, buf, sizeof(buf), &ropts); - ASSERT_EQ(ret, 0, "stream read"); - continue; - } -#endif - - ret = bpf_prog_stream_read(prog_fd, BPF_STREAM_STDERR, buf, sizeof(buf), &ropts); - ASSERT_GT(ret, 0, "stream read"); - ASSERT_LE(ret, 1023, "len for buf"); - buf[ret] = '\0'; - - ret = match_regex(stream_error_arr[i].errstr, buf); - if (!ASSERT_TRUE(ret == 1, "regex match")) - fprintf(stderr, "Output from stream:\n%s\n", buf); - } - - stream__destroy(skel); -} - void test_stream_syscall(void) { LIBBPF_OPTS(bpf_test_run_opts, opts); diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c index 35790897dc879e..bb465dad824700 100644 --- a/tools/testing/selftests/bpf/progs/stream.c +++ b/tools/testing/selftests/bpf/progs/stream.c @@ -37,7 +37,15 @@ int stream_exhaust(void *ctx) } SEC("syscall") +__arch_x86_64 +__arch_arm64 +__arch_s390x __success __retval(0) +__stderr("ERROR: Timeout detected for may_goto instruction") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") int stream_cond_break(void *ctx) { while (can_loop) @@ -47,6 +55,15 @@ int stream_cond_break(void *ctx) SEC("syscall") __success __retval(0) +__stderr("ERROR: AA or ABBA deadlock detected for bpf_res_spin_lock") +__stderr("{{Attempted lock = (0x[0-9a-fA-F]+)\n" +"Total held locks = 1\n" +"Held lock\\[ 0\\] = \\1}}") +__stderr("...") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") int stream_deadlock(void *ctx) { struct bpf_res_spin_lock *lock, *nlock; From 86f2225065be5af2935f374f1a3abebc052c0868 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 11 Sep 2025 14:58:05 +0000 Subject: [PATCH 1463/3206] selftests/bpf: Add tests for arena fault reporting Add selftests for testing the reporting of arena page faults through BPF streams. Two new bpf programs are added that read and write to an unmapped arena address and the fault reporting is verified in the userspace through streams. 
The added bpf programs need to access the user_vm_start in struct bpf_arena. This is done by casting &arena to struct bpf_arena *, but barrier_var() is used on this ptr before accessing ptr->user_vm_start to stop GCC from issuing an out-of-bounds access warning due to the cast from the smaller map struct to the larger "struct bpf_arena". Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250911145808.58042-7-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/stream.c | 49 ++++++ tools/testing/selftests/bpf/progs/stream.c | 141 ++++++++++++++++++ 2 files changed, 190 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/stream.c b/tools/testing/selftests/bpf/prog_tests/stream.c index 6f8eac5ccb6555..c3cce5c292bdda 100644 --- a/tools/testing/selftests/bpf/prog_tests/stream.c +++ b/tools/testing/selftests/bpf/prog_tests/stream.c @@ -57,3 +57,52 @@ void test_stream_syscall(void) stream__destroy(skel); } + +static void test_address(struct bpf_program *prog, unsigned long *fault_addr_p) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); + int ret, prog_fd; + char fault_addr[64]; + char buf[1024]; + + prog_fd = bpf_program__fd(prog); + + ret = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(ret, "ret"); + ASSERT_OK(opts.retval, "retval"); + + sprintf(fault_addr, "0x%lx", *fault_addr_p); + + ret = bpf_prog_stream_read(prog_fd, BPF_STREAM_STDERR, buf, sizeof(buf), &ropts); + ASSERT_GT(ret, 0, "stream read"); + ASSERT_LE(ret, 1023, "len for buf"); + buf[ret] = '\0'; + + if (!ASSERT_HAS_SUBSTR(buf, fault_addr, "fault_addr")) { + fprintf(stderr, "Output from stream:\n%s\n", buf); + fprintf(stderr, "Fault Addr: %s\n", fault_addr); + } +} + +void test_stream_arena_fault_address(void) +{ + struct stream *skel; + +#if !defined(__x86_64__) && !defined(__aarch64__) + printf("%s:SKIP: arena fault reporting not supported\n", __func__); + test__skip(); + return; +#endif + + skel = stream__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stream__open_and_load")) + return; + + if (test__start_subtest("read_fault")) + test_address(skel->progs.stream_arena_read_fault, &skel->bss->fault_addr); + if (test__start_subtest("write_fault")) + test_address(skel->progs.stream_arena_write_fault, &skel->bss->fault_addr); + + stream__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c index bb465dad824700..4a5bd852f10c89 100644 --- a/tools/testing/selftests/bpf/progs/stream.c +++ b/tools/testing/selftests/bpf/progs/stream.c @@ -5,6 +5,7 @@ #include #include "bpf_misc.h" #include "bpf_experimental.h" +#include "bpf_arena_common.h" struct arr_elem { struct bpf_res_spin_lock lock; @@ -17,10 +18,29 @@ struct { __type(value, struct arr_elem); } arrmap SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1); /* number of pages */ +} arena SEC(".maps"); + +struct elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + #define ENOSPC 28 #define _STR "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" int size; +u64 fault_addr; +void *arena_ptr; SEC("syscall") __success __retval(0) @@ -93,4 +113,125 @@ int stream_syscall(void *ctx) return 0; } +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena WRITE access at
unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_write_fault(void *ctx) +{ + struct bpf_arena *ptr = (void *)&arena; + u64 user_vm_start; + + /* Prevent GCC bounds warning: casting &arena to struct bpf_arena * + * triggers bounds checking since the map definition is smaller than struct + * bpf_arena. barrier_var() makes the pointer opaque to GCC, preventing the + * bounds analysis + */ + barrier_var(ptr); + user_vm_start = ptr->user_vm_start; + fault_addr = user_vm_start + 0x7fff; + bpf_addr_space_cast(user_vm_start, 0, 1); + asm volatile ( + "r1 = %0;" + "r2 = 1;" + "*(u32 *)(r1 + 0x7fff) = r2;" + : + : "r" (user_vm_start) + : "r1", "r2" + ); + return 0; +} + +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena READ access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_read_fault(void *ctx) +{ + struct bpf_arena *ptr = (void *)&arena; + u64 user_vm_start; + + /* Prevent GCC bounds warning: casting &arena to struct bpf_arena * + * triggers bounds checking since the map definition is smaller than struct + * bpf_arena. barrier_var() makes the pointer opaque to GCC, preventing the + * bounds analysis + */ + barrier_var(ptr); + user_vm_start = ptr->user_vm_start; + fault_addr = user_vm_start + 0x7fff; + bpf_addr_space_cast(user_vm_start, 0, 1); + asm volatile ( + "r1 = %0;" + "r1 = *(u32 *)(r1 + 0x7fff);" + : + : "r" (user_vm_start) + : "r1" + ); + return 0; +} + +static __noinline void subprog(void) +{ + int __arena *addr = (int __arena *)0xdeadbeef; + + arena_ptr = &arena; + *addr = 1; +} + +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena WRITE access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_subprog_fault(void *ctx) +{ + subprog(); + return 0; +} + +static __noinline int timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + int __arena *addr = (int __arena *)0xdeadbeef; + + arena_ptr = &arena; + *addr = 1; + return 0; +} + +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena WRITE access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_callback_fault(void *ctx) +{ + struct bpf_timer *arr_timer; + + arr_timer = bpf_map_lookup_elem(&array, &(int){0}); + if (!arr_timer) + return 0; + bpf_timer_init(arr_timer, &array, 1); + bpf_timer_set_callback(arr_timer, timer_cb); + bpf_timer_start(arr_timer, 0, 0); + return 0; +} + char _license[] SEC("license") = "GPL"; From f6d2900f2806d584303db689d9e18f0443610514 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Wed, 10 Sep 2025 11:40:03 +0200 Subject: [PATCH 1464/3206] MAINTAINERS: Update the DMA Rust entry Update the DMA Rust maintainers entry in the following two aspects: (1) Change Abdiel's entry to 'Reviewer'. (2) Take patches through the driver-core tree. 
Abdiel won't do any more maintainer work on the DMA (or scatterlist) infrastructure, but he'd like to be kept in the loop, hence change his entry to 'R:'. Analogous to [1], the DMA (and scatterlist) helpers are closely coupled with the core device infrastructure and the device lifecycle, hence take patches through the driver-core tree by default. Cc: Abdiel Janulgue Link: https://lore.kernel.org/r/20250725202840.2251768-1-ojeda@kernel.org [1] Acked-by: Abdiel Janulgue Acked-by: Greg Kroah-Hartman Signed-off-by: Danilo Krummrich --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index fed6cd812d796a..281149d9b82134 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7239,15 +7239,15 @@ F: include/linux/swiotlb.h F: kernel/dma/ DMA MAPPING HELPERS DEVICE DRIVER API [RUST] -M: Abdiel Janulgue M: Danilo Krummrich +R: Abdiel Janulgue R: Daniel Almeida R: Robin Murphy R: Andreas Hindborg L: rust-for-linux@vger.kernel.org S: Supported W: https://rust-for-linux.com -T: git https://github.com/Rust-for-Linux/linux.git alloc-next +T: git git://git.kernel.org/pub/scm/linux/kernel/git/driver-core/driver-core.git F: rust/helpers/dma.c F: rust/kernel/dma.rs F: samples/rust/rust_dma.rs From 54d94c422fed9575b74167333c1757847a4e6899 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 24 Aug 2025 15:28:00 -0700 Subject: [PATCH 1465/3206] lsm: CONFIG_LSM can depend on CONFIG_SECURITY When CONFIG_SECURITY is not set, CONFIG_LSM (builtin_lsm_order) does not need to be visible and settable since builtin_lsm_order is defined in security.o, which is only built when CONFIG_SECURITY=y. So make CONFIG_LSM depend on CONFIG_SECURITY. Fixes: 13e735c0e953 ("LSM: Introduce CONFIG_LSM") Signed-off-by: Randy Dunlap [PM: subj tweak] Signed-off-by: Paul Moore --- security/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/security/Kconfig b/security/Kconfig index 4816fc74f81ebe..285f284dfcac44 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -269,6 +269,7 @@ endchoice config LSM string "Ordered list of enabled LSMs" + depends on SECURITY default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,ipe,bpf" if DEFAULT_SECURITY_SMACK default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,ipe,bpf" if DEFAULT_SECURITY_APPARMOR default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,ipe,bpf" if DEFAULT_SECURITY_TOMOYO From 136d8a6f73fee0686d163dca91fdffb35e25f092 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 12 Sep 2025 07:13:11 +0900 Subject: [PATCH 1466/3206] firewire: core: remove useless lockdep_assert_held() The bm_work work item should be scheduled after taking a reference on the fw_card. In commit 25feb1a96e21 ("firewire: core: use cleanup function in bm_work"), I misinterpreted this as the fw_card spinlock and wrongly inserted lockdep_assert_held(). This commit removes the useless line.
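To illustrate why the assertion was misplaced, here is a simplified sketch (demo types, not the firewire code): lockdep_assert_held() documents a lock the caller must already hold, so it cannot sit at the top of a handler that takes the lock itself.

        struct demo_card {
                spinlock_t lock;
                struct work_struct bm_work;
        };

        static void demo_bm_work(struct work_struct *work)
        {
                struct demo_card *card = container_of(work, struct demo_card,
                                                      bm_work);

                /* Wrong here: nothing guarantees card->lock is held on entry. */
                /* lockdep_assert_held(&card->lock); */

                spin_lock_irq(&card->lock);
                /* ... bus-manager work under the lock ... */
                spin_unlock_irq(&card->lock);
        }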
Fixes: 25feb1a96e21 ("firewire: core: use cleanup function in bm_work") Link: https://lore.kernel.org/r/20250911221312.678076-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 474d8066e090ba..32cf6b3344cd90 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -294,8 +294,6 @@ static void bm_work(struct work_struct *work) int expected_gap_count, generation, grace; bool do_reset = false; - lockdep_assert_held(&card->lock); - spin_lock_irq(&card->lock); if (card->local_node == NULL) { From 247981eecd3dd6ff51bd0a0223deba8af39c5498 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Wed, 10 Sep 2025 20:37:16 +0000 Subject: [PATCH 1467/3206] net: Use NAPI_* in test_bit when stopping napi kthread napi_stop_kthread waits for the NAPI_STATE_SCHED_THREADED to be unset before stopping the kthread. But it uses test_bit with the NAPIF_STATE_SCHED_THREADED and that might stop the kthread early before the flag is unset. Use the NAPI_* variant of the NAPI state bits in test_bit instead. Tested: ./tools/testing/selftests/net/nl_netdev.py TAP version 13 1..7 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check ok 3 nl_netdev.page_pool_check ok 4 nl_netdev.napi_list_check ok 5 nl_netdev.dev_set_threaded ok 6 nl_netdev.napi_set_threaded ok 7 nl_netdev.nsim_rxq_reset_down # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 ./tools/testing/selftests/drivers/net/napi_threaded.py TAP version 13 1..2 ok 1 napi_threaded.change_num_queues ok 2 napi_threaded.enable_dev_threaded_disable_napi_threaded # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 689883de94dd ("net: stop napi kthreads when THREADED napi is disabled") Signed-off-by: Samiullah Khawaja Link: https://patch.msgid.link/20250910203716.1016546-1-skhawaja@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 93a25d87b86b66..8d49b2198d072f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6965,7 +6965,7 @@ static void napi_stop_kthread(struct napi_struct *napi) * the kthread. */ while (true) { - if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state)) + if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) break; msleep(20); From 5577352b55833d0f4350eb5d62eda2df09e84922 Mon Sep 17 00:00:00 2001 From: Li Tian Date: Wed, 10 Sep 2025 08:37:32 +0800 Subject: [PATCH 1468/3206] net/mlx5: Not returning mlx5_link_info table when speed is unknown The mlx5e_link_info and mlx5e_ext_link_info tables have holes; e.g. Azure mlx5 reports PTYS 19. Do not return a table entry unless its speed is retrieved successfully.
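A sketch of the hole problem with demo types (the real tables live in the mlx5 driver): entries left out of a designated-initializer table are implicitly zeroed, so a zero speed marks a hole and must never be handed out.

        struct demo_link_info {
                u32 speed; /* 0 means hole: no link mode defined for this bit */
        };

        static const struct demo_link_info demo_table[32] = {
                [0]  = { .speed = 1000 },
                /* index 19 (e.g. PTYS 19 as seen on Azure) is a hole */
                [20] = { .speed = 100000 },
        };

        static const struct demo_link_info *demo_ptys2info(unsigned long proto_oper)
        {
                unsigned long i = find_first_bit(&proto_oper, ARRAY_SIZE(demo_table));

                if (i < ARRAY_SIZE(demo_table) && demo_table[i].speed)
                        return &demo_table[i];
                return NULL; /* unknown speed: never return a hole */
        }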
Fixes: 65a5d35571849 ("net/mlx5: Refactor link speed handling with mlx5_link_info struct") Suggested-by: Vitaly Kuznetsov Signed-off-by: Li Tian Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250910003732.5973-1-litian@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 2d7adf7444ba29..aa9f2b0a77d36f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -1170,7 +1170,11 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev, mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, force_legacy); i = find_first_bit(&temp, max_size); - if (i < max_size) + + /* mlx5e_link_info has holes. Check speed + * is not zero as indication of one. + */ + if (i < max_size && table[i].speed) return &table[i]; return NULL; From 2690cb089502b80b905f2abdafd1bf2d54e1abef Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 10 Sep 2025 17:48:25 +0300 Subject: [PATCH 1469/3206] dpaa2-switch: fix buffer pool seeding for control traffic Starting with commit c50e7475961c ("dpaa2-switch: Fix error checking in dpaa2_switch_seed_bp()"), the probing of a second DPSW object errors out like below. fsl_dpaa2_switch dpsw.1: fsl_mc_driver_probe failed: -12 fsl_dpaa2_switch dpsw.1: probe with driver fsl_dpaa2_switch failed with error -12 The aforementioned commit brought to the surface the fact that seeding buffers into the buffer pool destined for control traffic is not successful and an access violation recoverable error can be seen in the MC firmware log: [E, qbman_rec_isr:391, QBMAN] QBMAN recoverable event 0x1000000 This happens because the driver incorrectly used the ID of the DPBP object instead of the hardware buffer pool ID when trying to release buffers into it. This is because any DPSW object uses two buffer pools, one managed by the Linux driver and destined for control traffic packet buffers and the other one managed by the MC firmware and destined only for offloaded traffic. And since the buffer pool managed by the MC firmware does not have an external facing DPBP equivalent, any subsequent DPBP objects created after the first DPSW will have a DPBP id different to the underlying hardware buffer ID. The issue was not caught earlier because these two numbers can be identical when all DPBP objects are created before the DPSW objects are. This is the case when the DPL file is used to describe the entire DPAA2 object layout and objects are created at boot time and it's also true for the first DPSW being created dynamically using ls-addsw. Fix this by using the buffer pool ID instead of the DPBP id when releasing buffers into the pool. 
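A sketch of the two identifiers being confused, with a demo struct standing in for the real DPBP attributes: the object id names the DPBP management object, while bpid names the underlying hardware buffer pool, and only the latter is valid when releasing buffers.

        struct demo_dpbp_attrs {
                int id;   /* DPBP object id (management plane) */
                u16 bpid; /* hardware buffer pool id (data path) */
        };

        static void demo_setup_pool(u16 *ethsw_bpid,
                                    const struct demo_dpbp_attrs *attrs)
        {
                /*
                 * Buffers are released against the hardware pool id; storing
                 * attrs->id here only works when the two happen to coincide.
                 */
                *ethsw_bpid = attrs->bpid;
        }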
Fixes: 2877e4f7e189 ("staging: dpaa2-switch: setup buffer pool and RX path rings") Signed-off-by: Ioana Ciornei Link: https://patch.msgid.link/20250910144825.2416019-1-ioana.ciornei@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index 4643a338061820..b1e1ad9e4b48e6 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -2736,7 +2736,7 @@ static int dpaa2_switch_setup_dpbp(struct ethsw_core *ethsw) dev_err(dev, "dpsw_ctrl_if_set_pools() failed\n"); goto err_get_attr; } - ethsw->bpid = dpbp_attrs.id; + ethsw->bpid = dpbp_attrs.bpid; return 0; From 29341c6c18b8ad2a9a4a68a61be7e1272d842f21 Mon Sep 17 00:00:00 2001 From: Jihed Chaibi Date: Sat, 30 Aug 2025 22:37:50 +0200 Subject: [PATCH 1470/3206] ARM: dts: kirkwood: Fix sound DAI cells for OpenRD clients A previous commit changed the '#sound-dai-cells' property for the kirkwood audio controller from 1 to 0 in the kirkwood.dtsi file, but did not update the corresponding 'sound-dai' property in the kirkwood-openrd-client.dts file. This created a mismatch, causing a dtbs_check validation error where the dts provides one cell (<&audio0 0>) while the .dtsi expects zero. Remove the extraneous cell from the 'sound-dai' property to fix the schema validation warning and align with the updated binding. Fixes: e662e70fa419 ("arm: dts: kirkwood: fix error in #sound-dai-cells size") Signed-off-by: Jihed Chaibi Reviewed-by: Krzysztof Kozlowski Signed-off-by: Gregory CLEMENT --- arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts b/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts index d4e0b8150a84ce..cf26e2ceaaa074 100644 --- a/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts +++ b/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts @@ -38,7 +38,7 @@ simple-audio-card,mclk-fs = <256>; simple-audio-card,cpu { - sound-dai = <&audio0 0>; + sound-dai = <&audio0>; }; simple-audio-card,codec { From 7eee64e8be51f9ff0393b5bd0752a6e8f9252bf9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:25:47 +0200 Subject: [PATCH 1471/3206] gpio: use more common syntax for compound literals The (typeof(foo)) construct is unusual in the kernel; use a more typical syntax by explicitly spelling out the type.
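A side-by-side sketch of the two spellings with a demo struct: both build the same compound literal, but naming the type at the use site reads more naturally than inferring it with typeof() from the assigned variable.

        struct demo_config {
                struct device *dev;
                int sz;
        };

        static void demo_fill(struct demo_config *config, struct device *dev)
        {
                /* unusual: the literal's type is inferred from the variable */
                *config = (typeof(*config)){ .dev = dev, .sz = 4 };

                /* preferred: the type is spelled out explicitly */
                *config = (struct demo_config) { .dev = dev, .sz = 4 };
        }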
Link: https://lore.kernel.org/all/20250909-gpio-mmio-gpio-conv-part4-v1-13-9f723dc3524a@linaro.org/ Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250910-make-compound-literals-normal-again-v1-3-076ee7738a0b@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-amdpt.c | 2 +- drivers/gpio/gpio-blzp1600.c | 2 +- drivers/gpio/gpio-dwapb.c | 2 +- drivers/gpio/gpio-ep93xx.c | 2 +- drivers/gpio/gpio-ftgpio010.c | 2 +- drivers/gpio/gpio-ge.c | 2 +- drivers/gpio/gpio-grgpio.c | 2 +- drivers/gpio/gpio-hisi.c | 2 +- drivers/gpio/gpio-idt3243x.c | 2 +- drivers/gpio/gpio-ixp4xx.c | 2 +- drivers/gpio/gpio-loongson-64bit.c | 2 +- drivers/gpio/gpio-mlxbf.c | 2 +- drivers/gpio/gpio-mlxbf2.c | 2 +- drivers/gpio/gpio-mlxbf3.c | 2 +- drivers/gpio/gpio-mpc8xxx.c | 2 +- drivers/gpio/gpio-mxs.c | 2 +- drivers/gpio/gpio-rda.c | 2 +- drivers/gpio/gpio-realtek-otto.c | 2 +- drivers/gpio/gpio-tb10x.c | 2 +- drivers/gpio/gpio-ts4800.c | 2 +- drivers/gpio/gpio-vf610.c | 2 +- drivers/gpio/gpio-visconti.c | 2 +- drivers/gpio/gpio-xgene-sb.c | 2 +- drivers/gpio/gpio-xgs-iproc.c | 2 +- 24 files changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/gpio/gpio-amdpt.c b/drivers/gpio/gpio-amdpt.c index 0a9b870705b90b..bbaf42307bc3d7 100644 --- a/drivers/gpio/gpio-amdpt.c +++ b/drivers/gpio/gpio-amdpt.c @@ -88,7 +88,7 @@ static int pt_gpio_probe(struct platform_device *pdev) return PTR_ERR(pt_gpio->reg_base); } - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = pt_gpio->reg_base + PT_INPUTDATA_REG, diff --git a/drivers/gpio/gpio-blzp1600.c b/drivers/gpio/gpio-blzp1600.c index bfb35d59fa561c..0f8c826ba87612 100644 --- a/drivers/gpio/gpio-blzp1600.c +++ b/drivers/gpio/gpio-blzp1600.c @@ -230,7 +230,7 @@ static int blzp1600_gpio_probe(struct platform_device *pdev) if (IS_ERR(chip->base)) return PTR_ERR(chip->base); - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = &pdev->dev, .sz = 4, .dat = chip->base + GPIO_IDATA_REG, diff --git a/drivers/gpio/gpio-dwapb.c b/drivers/gpio/gpio-dwapb.c index 0fb781ca9da295..b42ff46d292bd8 100644 --- a/drivers/gpio/gpio-dwapb.c +++ b/drivers/gpio/gpio-dwapb.c @@ -525,7 +525,7 @@ static int dwapb_gpio_add_port(struct dwapb_gpio *gpio, set = gpio->regs + GPIO_SWPORTA_DR + pp->idx * GPIO_SWPORT_DR_STRIDE; dirout = gpio->regs + GPIO_SWPORTA_DDR + pp->idx * GPIO_SWPORT_DDR_STRIDE; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = gpio->dev, .sz = 4, .dat = dat, diff --git a/drivers/gpio/gpio-ep93xx.c b/drivers/gpio/gpio-ep93xx.c index c6c8170813331b..1f56e44ffc9a3c 100644 --- a/drivers/gpio/gpio-ep93xx.c +++ b/drivers/gpio/gpio-ep93xx.c @@ -352,7 +352,7 @@ static int ep93xx_gpio_probe(struct platform_device *pdev) gc = &egc->chip.gc; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = &pdev->dev, .sz = 1, .dat = data, diff --git a/drivers/gpio/gpio-ftgpio010.c b/drivers/gpio/gpio-ftgpio010.c index dfa2c9444960a3..11e6907c3b5401 100644 --- a/drivers/gpio/gpio-ftgpio010.c +++ b/drivers/gpio/gpio-ftgpio010.c @@ -264,7 +264,7 @@ static int ftgpio_gpio_probe(struct platform_device *pdev) */ return PTR_ERR(g->clk); - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = g->base + GPIO_DATA_IN, diff --git a/drivers/gpio/gpio-ge.c b/drivers/gpio/gpio-ge.c index a02dd322e0d4ce..b5cbf27b8f4422 100644 --- a/drivers/gpio/gpio-ge.c +++ 
b/drivers/gpio/gpio-ge.c @@ -67,7 +67,7 @@ static int __init gef_gpio_probe(struct platform_device *pdev) if (IS_ERR(regs)) return PTR_ERR(regs); - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = regs + GEF_GPIO_IN, diff --git a/drivers/gpio/gpio-grgpio.c b/drivers/gpio/gpio-grgpio.c index 3b77fad00749cd..5930f4c6f2b578 100644 --- a/drivers/gpio/gpio-grgpio.c +++ b/drivers/gpio/gpio-grgpio.c @@ -353,7 +353,7 @@ static int grgpio_probe(struct platform_device *ofdev) if (IS_ERR(regs)) return PTR_ERR(regs); - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = regs + GRGPIO_DATA, diff --git a/drivers/gpio/gpio-hisi.c b/drivers/gpio/gpio-hisi.c index 01a99ac613d94e..d8c4ab02ceaef7 100644 --- a/drivers/gpio/gpio-hisi.c +++ b/drivers/gpio/gpio-hisi.c @@ -292,7 +292,7 @@ static int hisi_gpio_probe(struct platform_device *pdev) hisi_gpio->dev = dev; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = hisi_gpio->dev, .sz = 4, .dat = hisi_gpio->reg_base + HISI_GPIO_EXT_PORT_WX, diff --git a/drivers/gpio/gpio-idt3243x.c b/drivers/gpio/gpio-idt3243x.c index 232a621ba086ef..56f1f1e57b6943 100644 --- a/drivers/gpio/gpio-idt3243x.c +++ b/drivers/gpio/gpio-idt3243x.c @@ -147,7 +147,7 @@ static int idt_gpio_probe(struct platform_device *pdev) ctrl->chip.gc.parent = dev; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = &pdev->dev, .sz = 4, .dat = ctrl->gpio + IDT_GPIO_DATA, diff --git a/drivers/gpio/gpio-ixp4xx.c b/drivers/gpio/gpio-ixp4xx.c index 0cf10d0ba16ef7..8a3b6b192288c8 100644 --- a/drivers/gpio/gpio-ixp4xx.c +++ b/drivers/gpio/gpio-ixp4xx.c @@ -294,7 +294,7 @@ static int ixp4xx_gpio_probe(struct platform_device *pdev) flags = 0; #endif - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = g->base + IXP4XX_REG_GPIN, diff --git a/drivers/gpio/gpio-loongson-64bit.c b/drivers/gpio/gpio-loongson-64bit.c index 2120e908c634e3..02f181cb219e99 100644 --- a/drivers/gpio/gpio-loongson-64bit.c +++ b/drivers/gpio/gpio-loongson-64bit.c @@ -292,7 +292,7 @@ static int loongson_gpio_init(struct platform_device *pdev, struct loongson_gpio lgpio->reg_base = reg_base; if (lgpio->chip_data->mode == BIT_CTRL_MODE) { - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = &pdev->dev, .sz = 8, .dat = lgpio->reg_base + lgpio->chip_data->in_offset, diff --git a/drivers/gpio/gpio-mlxbf.c b/drivers/gpio/gpio-mlxbf.c index 843f40496be7b7..a18fedbc463e67 100644 --- a/drivers/gpio/gpio-mlxbf.c +++ b/drivers/gpio/gpio-mlxbf.c @@ -66,7 +66,7 @@ static int mlxbf_gpio_probe(struct platform_device *pdev) gc = &gs->chip.gc; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 8, .dat = gs->base + MLXBF_GPIO_PIN_STATE, diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c index f99f66cd189ca7..7e3b526a6caae0 100644 --- a/drivers/gpio/gpio-mlxbf2.c +++ b/drivers/gpio/gpio-mlxbf2.c @@ -377,7 +377,7 @@ mlxbf2_gpio_probe(struct platform_device *pdev) gc = &gs->chip.gc; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = gs->gpio_io + YU_GPIO_DATAIN, diff --git a/drivers/gpio/gpio-mlxbf3.c b/drivers/gpio/gpio-mlxbf3.c index c812011bdbe65a..4770578269bae8 100644 --- a/drivers/gpio/gpio-mlxbf3.c +++ b/drivers/gpio/gpio-mlxbf3.c @@ -209,7 +209,7 @@ static int mlxbf3_gpio_probe(struct 
platform_device *pdev) return PTR_ERR(gs->gpio_clr_io); gc = &gs->chip.gc; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = gs->gpio_io + MLXBF_GPIO_READ_DATA_IN, diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index 38643fb813c562..dd2cd2cc6e6f29 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -345,7 +345,7 @@ static int mpc8xxx_probe(struct platform_device *pdev) gc = &mpc8xxx_gc->chip.gc; gc->parent = dev; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = mpc8xxx_gc->regs + GPIO_DAT, diff --git a/drivers/gpio/gpio-mxs.c b/drivers/gpio/gpio-mxs.c index af45d1b1af6e04..5635694bf9f448 100644 --- a/drivers/gpio/gpio-mxs.c +++ b/drivers/gpio/gpio-mxs.c @@ -321,7 +321,7 @@ static int mxs_gpio_probe(struct platform_device *pdev) irq_set_chained_handler_and_data(port->irq, mxs_gpio_irq_handler, port); - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = &pdev->dev, .sz = 4, .dat = port->base + PINCTRL_DIN(port), diff --git a/drivers/gpio/gpio-rda.c b/drivers/gpio/gpio-rda.c index bcd85a2237a532..fb479d13eb01a4 100644 --- a/drivers/gpio/gpio-rda.c +++ b/drivers/gpio/gpio-rda.c @@ -237,7 +237,7 @@ static int rda_gpio_probe(struct platform_device *pdev) spin_lock_init(&rda_gpio->lock); - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = rda_gpio->base + RDA_GPIO_VAL, diff --git a/drivers/gpio/gpio-realtek-otto.c b/drivers/gpio/gpio-realtek-otto.c index ab711422254e9e..37b4f73771e651 100644 --- a/drivers/gpio/gpio-realtek-otto.c +++ b/drivers/gpio/gpio-realtek-otto.c @@ -401,7 +401,7 @@ static int realtek_gpio_probe(struct platform_device *pdev) ctrl->line_imr_pos = realtek_gpio_line_imr_pos_swapped; } - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = ctrl->base + REALTEK_GPIO_REG_DATA, diff --git a/drivers/gpio/gpio-tb10x.c b/drivers/gpio/gpio-tb10x.c index f20b6654b86555..09a448ce3eec2f 100644 --- a/drivers/gpio/gpio-tb10x.c +++ b/drivers/gpio/gpio-tb10x.c @@ -135,7 +135,7 @@ static int tb10x_gpio_probe(struct platform_device *pdev) * the lines, no special set or clear registers and a data direction register * wher 1 means "output". 
*/ - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = tb10x_gpio->base + OFFSET_TO_REG_DATA, diff --git a/drivers/gpio/gpio-ts4800.c b/drivers/gpio/gpio-ts4800.c index 844347945e8e71..992ee231db9ff8 100644 --- a/drivers/gpio/gpio-ts4800.c +++ b/drivers/gpio/gpio-ts4800.c @@ -39,7 +39,7 @@ static int ts4800_gpio_probe(struct platform_device *pdev) else if (retval) return retval; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 2, .dat = base_addr + INPUT_REG_OFFSET, diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index fa7e322a834cc2..f3590db72b1412 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -305,7 +305,7 @@ static int vf610_gpio_probe(struct platform_device *pdev) if (port->sdata->have_paddr) flags |= BGPIOF_READ_OUTPUT_REG_SET; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = port->gpio_base + GPIO_PDIR, diff --git a/drivers/gpio/gpio-visconti.c b/drivers/gpio/gpio-visconti.c index cde1581a91033e..6d5d829634ad76 100644 --- a/drivers/gpio/gpio-visconti.c +++ b/drivers/gpio/gpio-visconti.c @@ -191,7 +191,7 @@ static int visconti_gpio_probe(struct platform_device *pdev) return -ENODEV; } - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = priv->base + GPIO_IDATA, diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c index c559a89aadf7a7..28ee3f7e91b921 100644 --- a/drivers/gpio/gpio-xgene-sb.c +++ b/drivers/gpio/gpio-xgene-sb.c @@ -265,7 +265,7 @@ static int xgene_gpio_sb_probe(struct platform_device *pdev) return -ENODEV; } - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = &pdev->dev, .sz = 4, .dat = regs + MPA_GPIO_IN_ADDR, diff --git a/drivers/gpio/gpio-xgs-iproc.c b/drivers/gpio/gpio-xgs-iproc.c index 9cffdedd31b1c3..77eb29dcc2171a 100644 --- a/drivers/gpio/gpio-xgs-iproc.c +++ b/drivers/gpio/gpio-xgs-iproc.c @@ -233,7 +233,7 @@ static int iproc_gpio_probe(struct platform_device *pdev) if (IS_ERR(chip->base)) return PTR_ERR(chip->base); - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = chip->base + IPROC_GPIO_CCA_DIN, From 571c65bb2f4d17198189cf8d161b96f32674642b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 9 Sep 2025 14:28:43 +0200 Subject: [PATCH 1472/3206] gpiolib: add a common prefix to GPIO descriptor flags While these flags are private within drivers/gpio/, when looking at the code, it's not really clear they are GPIO-specific. Since these are GPIO descriptor flags, prepend their names with a common "GPIOD" prefix. While at it: update the flags' docs: make spelling consistent, correct outdated information, etc. 
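The rename itself is mechanical; every call site changes only in the flag name, as in this representative pattern from gpiolib.c:

	/* before */
	if (test_bit(FLAG_ACTIVE_LOW, &desc->flags))
		value = !value;

	/* after */
	if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags))
		value = !value;

The bit numbers behind the macros are unchanged, so there is no functional difference, only a clearer namespace.
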
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250909-rename-gpio-flags-v1-1-bda208a40856@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-cdev.c | 90 +++++++++---------- drivers/gpio/gpiolib-of.c | 2 +- drivers/gpio/gpiolib-sysfs.c | 46 +++++----- drivers/gpio/gpiolib.c | 166 +++++++++++++++++------------------ drivers/gpio/gpiolib.h | 36 ++++---- 5 files changed, 170 insertions(+), 170 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index e6a289fa0f8fd5..175836467f216a 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -144,17 +144,17 @@ static void linehandle_flags_to_desc_flags(u32 lflags, unsigned long *flagsp) { unsigned long flags = READ_ONCE(*flagsp); - assign_bit(FLAG_ACTIVE_LOW, &flags, + assign_bit(GPIOD_FLAG_ACTIVE_LOW, &flags, lflags & GPIOHANDLE_REQUEST_ACTIVE_LOW); - assign_bit(FLAG_OPEN_DRAIN, &flags, + assign_bit(GPIOD_FLAG_OPEN_DRAIN, &flags, lflags & GPIOHANDLE_REQUEST_OPEN_DRAIN); - assign_bit(FLAG_OPEN_SOURCE, &flags, + assign_bit(GPIOD_FLAG_OPEN_SOURCE, &flags, lflags & GPIOHANDLE_REQUEST_OPEN_SOURCE); - assign_bit(FLAG_PULL_UP, &flags, + assign_bit(GPIOD_FLAG_PULL_UP, &flags, lflags & GPIOHANDLE_REQUEST_BIAS_PULL_UP); - assign_bit(FLAG_PULL_DOWN, &flags, + assign_bit(GPIOD_FLAG_PULL_DOWN, &flags, lflags & GPIOHANDLE_REQUEST_BIAS_PULL_DOWN); - assign_bit(FLAG_BIAS_DISABLE, &flags, + assign_bit(GPIOD_FLAG_BIAS_DISABLE, &flags, lflags & GPIOHANDLE_REQUEST_BIAS_DISABLE); WRITE_ONCE(*flagsp, flags); @@ -238,7 +238,7 @@ static long linehandle_ioctl(struct file *file, unsigned int cmd, * All line descriptors were created at once with the same * flags so just check if the first one is really output. */ - if (!test_bit(FLAG_IS_OUT, &lh->descs[0]->flags)) + if (!test_bit(GPIOD_FLAG_IS_OUT, &lh->descs[0]->flags)) return -EPERM; if (copy_from_user(&ghd, ip, sizeof(ghd))) @@ -599,10 +599,10 @@ static void linereq_put_event(struct linereq *lr, static u64 line_event_timestamp(struct line *line) { - if (test_bit(FLAG_EVENT_CLOCK_REALTIME, &line->desc->flags)) + if (test_bit(GPIOD_FLAG_EVENT_CLOCK_REALTIME, &line->desc->flags)) return ktime_get_real_ns(); else if (IS_ENABLED(CONFIG_HTE) && - test_bit(FLAG_EVENT_CLOCK_HTE, &line->desc->flags)) + test_bit(GPIOD_FLAG_EVENT_CLOCK_HTE, &line->desc->flags)) return line->timestamp_ns; return ktime_get_ns(); @@ -725,11 +725,11 @@ static int hte_edge_setup(struct line *line, u64 eflags) struct hte_ts_desc *hdesc = &line->hdesc; if (eflags & GPIO_V2_LINE_FLAG_EDGE_RISING) - flags |= test_bit(FLAG_ACTIVE_LOW, &line->desc->flags) ? + flags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &line->desc->flags) ? HTE_FALLING_EDGE_TS : HTE_RISING_EDGE_TS; if (eflags & GPIO_V2_LINE_FLAG_EDGE_FALLING) - flags |= test_bit(FLAG_ACTIVE_LOW, &line->desc->flags) ? + flags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &line->desc->flags) ? 
HTE_RISING_EDGE_TS : HTE_FALLING_EDGE_TS; @@ -831,7 +831,7 @@ static bool debounced_value(struct line *line) */ value = READ_ONCE(line->level); - if (test_bit(FLAG_ACTIVE_LOW, &line->desc->flags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &line->desc->flags)) value = !value; return value; @@ -939,7 +939,7 @@ static int debounce_setup(struct line *line, unsigned int debounce_period_us) return level; if (!(IS_ENABLED(CONFIG_HTE) && - test_bit(FLAG_EVENT_CLOCK_HTE, &line->desc->flags))) { + test_bit(GPIOD_FLAG_EVENT_CLOCK_HTE, &line->desc->flags))) { irq = gpiod_to_irq(line->desc); if (irq < 0) return -ENXIO; @@ -1061,10 +1061,10 @@ static int edge_detector_setup(struct line *line, return -ENXIO; if (eflags & GPIO_V2_LINE_FLAG_EDGE_RISING) - irqflags |= test_bit(FLAG_ACTIVE_LOW, &line->desc->flags) ? + irqflags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &line->desc->flags) ? IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING; if (eflags & GPIO_V2_LINE_FLAG_EDGE_FALLING) - irqflags |= test_bit(FLAG_ACTIVE_LOW, &line->desc->flags) ? + irqflags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &line->desc->flags) ? IRQF_TRIGGER_RISING : IRQF_TRIGGER_FALLING; irqflags |= IRQF_ONESHOT; @@ -1237,34 +1237,34 @@ static void gpio_v2_line_config_flags_to_desc_flags(u64 lflags, { unsigned long flags = READ_ONCE(*flagsp); - assign_bit(FLAG_ACTIVE_LOW, &flags, + assign_bit(GPIOD_FLAG_ACTIVE_LOW, &flags, lflags & GPIO_V2_LINE_FLAG_ACTIVE_LOW); if (lflags & GPIO_V2_LINE_FLAG_OUTPUT) - set_bit(FLAG_IS_OUT, &flags); + set_bit(GPIOD_FLAG_IS_OUT, &flags); else if (lflags & GPIO_V2_LINE_FLAG_INPUT) - clear_bit(FLAG_IS_OUT, &flags); + clear_bit(GPIOD_FLAG_IS_OUT, &flags); - assign_bit(FLAG_EDGE_RISING, &flags, + assign_bit(GPIOD_FLAG_EDGE_RISING, &flags, lflags & GPIO_V2_LINE_FLAG_EDGE_RISING); - assign_bit(FLAG_EDGE_FALLING, &flags, + assign_bit(GPIOD_FLAG_EDGE_FALLING, &flags, lflags & GPIO_V2_LINE_FLAG_EDGE_FALLING); - assign_bit(FLAG_OPEN_DRAIN, &flags, + assign_bit(GPIOD_FLAG_OPEN_DRAIN, &flags, lflags & GPIO_V2_LINE_FLAG_OPEN_DRAIN); - assign_bit(FLAG_OPEN_SOURCE, &flags, + assign_bit(GPIOD_FLAG_OPEN_SOURCE, &flags, lflags & GPIO_V2_LINE_FLAG_OPEN_SOURCE); - assign_bit(FLAG_PULL_UP, &flags, + assign_bit(GPIOD_FLAG_PULL_UP, &flags, lflags & GPIO_V2_LINE_FLAG_BIAS_PULL_UP); - assign_bit(FLAG_PULL_DOWN, &flags, + assign_bit(GPIOD_FLAG_PULL_DOWN, &flags, lflags & GPIO_V2_LINE_FLAG_BIAS_PULL_DOWN); - assign_bit(FLAG_BIAS_DISABLE, &flags, + assign_bit(GPIOD_FLAG_BIAS_DISABLE, &flags, lflags & GPIO_V2_LINE_FLAG_BIAS_DISABLED); - assign_bit(FLAG_EVENT_CLOCK_REALTIME, &flags, + assign_bit(GPIOD_FLAG_EVENT_CLOCK_REALTIME, &flags, lflags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_REALTIME); - assign_bit(FLAG_EVENT_CLOCK_HTE, &flags, + assign_bit(GPIOD_FLAG_EVENT_CLOCK_HTE, &flags, lflags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE); WRITE_ONCE(*flagsp, flags); @@ -2115,10 +2115,10 @@ static int lineevent_create(struct gpio_device *gdev, void __user *ip) } if (eflags & GPIOEVENT_REQUEST_RISING_EDGE) - irqflags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ? + irqflags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) ? IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING; if (eflags & GPIOEVENT_REQUEST_FALLING_EDGE) - irqflags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ? + irqflags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) ? 
IRQF_TRIGGER_RISING : IRQF_TRIGGER_FALLING; irqflags |= IRQF_ONESHOT; @@ -2253,7 +2253,7 @@ static void gpio_desc_to_lineinfo(struct gpio_desc *desc, scoped_guard(srcu, &desc->gdev->desc_srcu) { label = gpiod_get_label(desc); - if (label && test_bit(FLAG_REQUESTED, &dflags)) + if (label && test_bit(GPIOD_FLAG_REQUESTED, &dflags)) strscpy(info->consumer, label, sizeof(info->consumer)); } @@ -2270,10 +2270,10 @@ static void gpio_desc_to_lineinfo(struct gpio_desc *desc, * The definitive test that a line is available to userspace is to * request it. */ - if (test_bit(FLAG_REQUESTED, &dflags) || - test_bit(FLAG_IS_HOGGED, &dflags) || - test_bit(FLAG_EXPORT, &dflags) || - test_bit(FLAG_SYSFS, &dflags) || + if (test_bit(GPIOD_FLAG_REQUESTED, &dflags) || + test_bit(GPIOD_FLAG_IS_HOGGED, &dflags) || + test_bit(GPIOD_FLAG_EXPORT, &dflags) || + test_bit(GPIOD_FLAG_SYSFS, &dflags) || !gpiochip_line_is_valid(guard.gc, info->offset)) { info->flags |= GPIO_V2_LINE_FLAG_USED; } else if (!atomic) { @@ -2281,34 +2281,34 @@ static void gpio_desc_to_lineinfo(struct gpio_desc *desc, info->flags |= GPIO_V2_LINE_FLAG_USED; } - if (test_bit(FLAG_IS_OUT, &dflags)) + if (test_bit(GPIOD_FLAG_IS_OUT, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_OUTPUT; else info->flags |= GPIO_V2_LINE_FLAG_INPUT; - if (test_bit(FLAG_ACTIVE_LOW, &dflags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_ACTIVE_LOW; - if (test_bit(FLAG_OPEN_DRAIN, &dflags)) + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_OPEN_DRAIN; - if (test_bit(FLAG_OPEN_SOURCE, &dflags)) + if (test_bit(GPIOD_FLAG_OPEN_SOURCE, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_OPEN_SOURCE; - if (test_bit(FLAG_BIAS_DISABLE, &dflags)) + if (test_bit(GPIOD_FLAG_BIAS_DISABLE, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_BIAS_DISABLED; - if (test_bit(FLAG_PULL_DOWN, &dflags)) + if (test_bit(GPIOD_FLAG_PULL_DOWN, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_BIAS_PULL_DOWN; - if (test_bit(FLAG_PULL_UP, &dflags)) + if (test_bit(GPIOD_FLAG_PULL_UP, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_BIAS_PULL_UP; - if (test_bit(FLAG_EDGE_RISING, &dflags)) + if (test_bit(GPIOD_FLAG_EDGE_RISING, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_EDGE_RISING; - if (test_bit(FLAG_EDGE_FALLING, &dflags)) + if (test_bit(GPIOD_FLAG_EDGE_FALLING, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_EDGE_FALLING; - if (test_bit(FLAG_EVENT_CLOCK_REALTIME, &dflags)) + if (test_bit(GPIOD_FLAG_EVENT_CLOCK_REALTIME, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_EVENT_CLOCK_REALTIME; - else if (test_bit(FLAG_EVENT_CLOCK_HTE, &dflags)) + else if (test_bit(GPIOD_FLAG_EVENT_CLOCK_HTE, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE; debounce_period_us = READ_ONCE(desc->debounce_period_us); diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 37ab78243faba2..fad4edf9cc5c0c 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -878,7 +878,7 @@ static void of_gpiochip_remove_hog(struct gpio_chip *chip, { struct gpio_desc *desc; - for_each_gpio_desc_with_flag(chip, desc, FLAG_IS_HOGGED) + for_each_gpio_desc_with_flag(chip, desc, GPIOD_FLAG_IS_HOGGED) if (READ_ONCE(desc->hog) == hog) gpiochip_free_own_desc(desc); } diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c index b64106f1cb7b90..9a849245b35880 100644 --- a/drivers/gpio/gpiolib-sysfs.c +++ b/drivers/gpio/gpiolib-sysfs.c @@ -131,7 +131,7 @@ static ssize_t direction_show(struct device *dev, scoped_guard(mutex, &data->mutex) { 
gpiod_get_direction(desc); - value = !!test_bit(FLAG_IS_OUT, &desc->flags); + value = !!test_bit(GPIOD_FLAG_IS_OUT, &desc->flags); } return sysfs_emit(buf, "%s\n", value ? "out" : "in"); @@ -226,14 +226,14 @@ static int gpio_sysfs_request_irq(struct gpiod_data *data, unsigned char flags) irq_flags = IRQF_SHARED; if (flags & GPIO_IRQF_TRIGGER_FALLING) { - irq_flags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ? + irq_flags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) ? IRQF_TRIGGER_RISING : IRQF_TRIGGER_FALLING; - set_bit(FLAG_EDGE_FALLING, &desc->flags); + set_bit(GPIOD_FLAG_EDGE_FALLING, &desc->flags); } if (flags & GPIO_IRQF_TRIGGER_RISING) { - irq_flags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ? + irq_flags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) ? IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING; - set_bit(FLAG_EDGE_RISING, &desc->flags); + set_bit(GPIOD_FLAG_EDGE_RISING, &desc->flags); } /* @@ -260,8 +260,8 @@ static int gpio_sysfs_request_irq(struct gpiod_data *data, unsigned char flags) err_unlock: gpiochip_unlock_as_irq(guard.gc, gpio_chip_hwgpio(desc)); err_clr_bits: - clear_bit(FLAG_EDGE_RISING, &desc->flags); - clear_bit(FLAG_EDGE_FALLING, &desc->flags); + clear_bit(GPIOD_FLAG_EDGE_RISING, &desc->flags); + clear_bit(GPIOD_FLAG_EDGE_FALLING, &desc->flags); return ret; } @@ -281,8 +281,8 @@ static void gpio_sysfs_free_irq(struct gpiod_data *data) data->irq_flags = 0; free_irq(data->irq, data); gpiochip_unlock_as_irq(guard.gc, gpio_chip_hwgpio(desc)); - clear_bit(FLAG_EDGE_RISING, &desc->flags); - clear_bit(FLAG_EDGE_FALLING, &desc->flags); + clear_bit(GPIOD_FLAG_EDGE_RISING, &desc->flags); + clear_bit(GPIOD_FLAG_EDGE_FALLING, &desc->flags); } static const char *const trigger_names[] = { @@ -347,10 +347,10 @@ static int gpio_sysfs_set_active_low(struct gpiod_data *data, int value) struct gpio_desc *desc = data->desc; int status = 0; - if (!!test_bit(FLAG_ACTIVE_LOW, &desc->flags) == !!value) + if (!!test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) == !!value) return 0; - assign_bit(FLAG_ACTIVE_LOW, &desc->flags, value); + assign_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags, value); /* reconfigure poll(2) support if enabled on one edge only */ if (flags == GPIO_IRQF_TRIGGER_FALLING || @@ -373,7 +373,7 @@ static ssize_t active_low_show(struct device *dev, int value; scoped_guard(mutex, &data->mutex) - value = !!test_bit(FLAG_ACTIVE_LOW, &desc->flags); + value = !!test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags); return sysfs_emit(buf, "%d\n", value); } @@ -418,7 +418,7 @@ static umode_t gpio_is_visible(struct kobject *kobj, struct attribute *attr, mode = 0; if (!data->direction_can_change && - test_bit(FLAG_IS_OUT, &data->desc->flags)) + test_bit(GPIOD_FLAG_IS_OUT, &data->desc->flags)) mode = 0; #endif /* CONFIG_GPIO_SYSFS_LEGACY */ } @@ -486,7 +486,7 @@ static int export_gpio_desc(struct gpio_desc *desc) } /* - * No extra locking here; FLAG_SYSFS just signifies that the + * No extra locking here; GPIOD_FLAG_SYSFS just signifies that the * request and export were done by on behalf of userspace, so * they may be undone on its behalf too. 
*/ @@ -505,7 +505,7 @@ static int export_gpio_desc(struct gpio_desc *desc) if (ret < 0) { gpiod_free(desc); } else { - set_bit(FLAG_SYSFS, &desc->flags); + set_bit(GPIOD_FLAG_SYSFS, &desc->flags); gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_REQUESTED); } @@ -515,11 +515,11 @@ static int export_gpio_desc(struct gpio_desc *desc) static int unexport_gpio_desc(struct gpio_desc *desc) { /* - * No extra locking here; FLAG_SYSFS just signifies that the + * No extra locking here; GPIOD_FLAG_SYSFS just signifies that the * request and export were done by on behalf of userspace, so * they may be undone on its behalf too. */ - if (!test_and_clear_bit(FLAG_SYSFS, &desc->flags)) + if (!test_and_clear_bit(GPIOD_FLAG_SYSFS, &desc->flags)) return -EINVAL; gpiod_unexport(desc); @@ -748,14 +748,14 @@ int gpiod_export(struct gpio_desc *desc, bool direction_may_change) if (!guard.gc) return -ENODEV; - if (test_and_set_bit(FLAG_EXPORT, &desc->flags)) + if (test_and_set_bit(GPIOD_FLAG_EXPORT, &desc->flags)) return -EPERM; gdev = desc->gdev; guard(mutex)(&sysfs_lock); - if (!test_bit(FLAG_REQUESTED, &desc->flags)) { + if (!test_bit(GPIOD_FLAG_REQUESTED, &desc->flags)) { gpiod_dbg(desc, "%s: unavailable (not requested)\n", __func__); status = -EPERM; goto err_clear_bit; @@ -866,7 +866,7 @@ int gpiod_export(struct gpio_desc *desc, bool direction_may_change) #endif /* CONFIG_GPIO_SYSFS_LEGACY */ kfree(desc_data); err_clear_bit: - clear_bit(FLAG_EXPORT, &desc->flags); + clear_bit(GPIOD_FLAG_EXPORT, &desc->flags); gpiod_dbg(desc, "%s: status %d\n", __func__, status); return status; } @@ -937,7 +937,7 @@ void gpiod_unexport(struct gpio_desc *desc) } scoped_guard(mutex, &sysfs_lock) { - if (!test_bit(FLAG_EXPORT, &desc->flags)) + if (!test_bit(GPIOD_FLAG_EXPORT, &desc->flags)) return; gdev = gpiod_to_gpio_device(desc); @@ -956,7 +956,7 @@ void gpiod_unexport(struct gpio_desc *desc) return; list_del(&desc_data->list); - clear_bit(FLAG_EXPORT, &desc->flags); + clear_bit(GPIOD_FLAG_EXPORT, &desc->flags); #if IS_ENABLED(CONFIG_GPIO_SYSFS_LEGACY) sysfs_put(desc_data->value_kn); device_unregister(desc_data->dev); @@ -1073,7 +1073,7 @@ void gpiochip_sysfs_unregister(struct gpio_device *gdev) return; /* unregister gpiod class devices owned by sysfs */ - for_each_gpio_desc_with_flag(chip, desc, FLAG_SYSFS) { + for_each_gpio_desc_with_flag(chip, desc, GPIOD_FLAG_SYSFS) { gpiod_unexport(desc); gpiod_free(desc); } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 98d2fa60249056..01bdf8fad7cff6 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -127,10 +127,10 @@ const char *gpiod_get_label(struct gpio_desc *desc) label = srcu_dereference_check(desc->label, &desc->gdev->desc_srcu, srcu_read_lock_held(&desc->gdev->desc_srcu)); - if (test_bit(FLAG_USED_AS_IRQ, &flags)) + if (test_bit(GPIOD_FLAG_USED_AS_IRQ, &flags)) return label ? label->str : "interrupt"; - if (!test_bit(FLAG_REQUESTED, &flags)) + if (!test_bit(GPIOD_FLAG_REQUESTED, &flags)) return NULL; return label ? label->str : NULL; @@ -450,8 +450,8 @@ int gpiod_get_direction(struct gpio_desc *desc) * Open drain emulation using input mode may incorrectly report * input here, fix that up. 
*/ - if (test_bit(FLAG_OPEN_DRAIN, &flags) && - test_bit(FLAG_IS_OUT, &flags)) + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &flags) && + test_bit(GPIOD_FLAG_IS_OUT, &flags)) return 0; if (!guard.gc->get_direction) @@ -468,7 +468,7 @@ int gpiod_get_direction(struct gpio_desc *desc) if (ret > 0) ret = 1; - assign_bit(FLAG_IS_OUT, &flags, !ret); + assign_bit(GPIOD_FLAG_IS_OUT, &flags, !ret); WRITE_ONCE(desc->flags, flags); return ret; @@ -846,7 +846,7 @@ static void gpiochip_free_remaining_irqs(struct gpio_chip *gc) { struct gpio_desc *desc; - for_each_gpio_desc_with_flag(gc, desc, FLAG_USED_AS_IRQ) + for_each_gpio_desc_with_flag(gc, desc, GPIOD_FLAG_USED_AS_IRQ) gpiod_free_irqs(desc); } @@ -1169,10 +1169,10 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, * lock here. */ if (gc->get_direction && gpiochip_line_is_valid(gc, desc_index)) - assign_bit(FLAG_IS_OUT, &desc->flags, + assign_bit(GPIOD_FLAG_IS_OUT, &desc->flags, !gc->get_direction(gc, desc_index)); else - assign_bit(FLAG_IS_OUT, + assign_bit(GPIOD_FLAG_IS_OUT, &desc->flags, !gc->direction_input); } @@ -2449,7 +2449,7 @@ static int gpiod_request_commit(struct gpio_desc *desc, const char *label) if (!guard.gc) return -ENODEV; - if (test_and_set_bit(FLAG_REQUESTED, &desc->flags)) + if (test_and_set_bit(GPIOD_FLAG_REQUESTED, &desc->flags)) return -EBUSY; offset = gpio_chip_hwgpio(desc); @@ -2478,7 +2478,7 @@ static int gpiod_request_commit(struct gpio_desc *desc, const char *label) return 0; out_clear_bit: - clear_bit(FLAG_REQUESTED, &desc->flags); + clear_bit(GPIOD_FLAG_REQUESTED, &desc->flags); return ret; } @@ -2512,20 +2512,20 @@ static void gpiod_free_commit(struct gpio_desc *desc) flags = READ_ONCE(desc->flags); - if (guard.gc && test_bit(FLAG_REQUESTED, &flags)) { + if (guard.gc && test_bit(GPIOD_FLAG_REQUESTED, &flags)) { if (guard.gc->free) guard.gc->free(guard.gc, gpio_chip_hwgpio(desc)); - clear_bit(FLAG_ACTIVE_LOW, &flags); - clear_bit(FLAG_REQUESTED, &flags); - clear_bit(FLAG_OPEN_DRAIN, &flags); - clear_bit(FLAG_OPEN_SOURCE, &flags); - clear_bit(FLAG_PULL_UP, &flags); - clear_bit(FLAG_PULL_DOWN, &flags); - clear_bit(FLAG_BIAS_DISABLE, &flags); - clear_bit(FLAG_EDGE_RISING, &flags); - clear_bit(FLAG_EDGE_FALLING, &flags); - clear_bit(FLAG_IS_HOGGED, &flags); + clear_bit(GPIOD_FLAG_ACTIVE_LOW, &flags); + clear_bit(GPIOD_FLAG_REQUESTED, &flags); + clear_bit(GPIOD_FLAG_OPEN_DRAIN, &flags); + clear_bit(GPIOD_FLAG_OPEN_SOURCE, &flags); + clear_bit(GPIOD_FLAG_PULL_UP, &flags); + clear_bit(GPIOD_FLAG_PULL_DOWN, &flags); + clear_bit(GPIOD_FLAG_BIAS_DISABLE, &flags); + clear_bit(GPIOD_FLAG_EDGE_RISING, &flags); + clear_bit(GPIOD_FLAG_EDGE_FALLING, &flags); + clear_bit(GPIOD_FLAG_IS_HOGGED, &flags); #ifdef CONFIG_OF_DYNAMIC WRITE_ONCE(desc->hog, NULL); #endif @@ -2568,7 +2568,7 @@ char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset) if (IS_ERR(desc)) return NULL; - if (!test_bit(FLAG_REQUESTED, &desc->flags)) + if (!test_bit(GPIOD_FLAG_REQUESTED, &desc->flags)) return NULL; guard(srcu)(&desc->gdev->desc_srcu); @@ -2736,11 +2736,11 @@ static int gpio_set_bias(struct gpio_desc *desc) flags = READ_ONCE(desc->flags); - if (test_bit(FLAG_BIAS_DISABLE, &flags)) + if (test_bit(GPIOD_FLAG_BIAS_DISABLE, &flags)) bias = PIN_CONFIG_BIAS_DISABLE; - else if (test_bit(FLAG_PULL_UP, &flags)) + else if (test_bit(GPIOD_FLAG_PULL_UP, &flags)) bias = PIN_CONFIG_BIAS_PULL_UP; - else if (test_bit(FLAG_PULL_DOWN, &flags)) + else if (test_bit(GPIOD_FLAG_PULL_DOWN, &flags)) bias = PIN_CONFIG_BIAS_PULL_DOWN; else return 0; @@ 
-2882,7 +2882,7 @@ int gpiod_direction_input_nonotify(struct gpio_desc *desc) } } if (ret == 0) { - clear_bit(FLAG_IS_OUT, &desc->flags); + clear_bit(GPIOD_FLAG_IS_OUT, &desc->flags); ret = gpio_set_bias(desc); } @@ -2955,7 +2955,7 @@ static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) } if (!ret) - set_bit(FLAG_IS_OUT, &desc->flags); + set_bit(GPIOD_FLAG_IS_OUT, &desc->flags); trace_gpio_value(desc_to_gpio(desc), 0, val); trace_gpio_direction(desc_to_gpio(desc), 0, ret); return ret; @@ -3021,21 +3021,21 @@ int gpiod_direction_output_nonotify(struct gpio_desc *desc, int value) flags = READ_ONCE(desc->flags); - if (test_bit(FLAG_ACTIVE_LOW, &flags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &flags)) value = !value; else value = !!value; /* GPIOs used for enabled IRQs shall not be set as output */ - if (test_bit(FLAG_USED_AS_IRQ, &flags) && - test_bit(FLAG_IRQ_IS_ENABLED, &flags)) { + if (test_bit(GPIOD_FLAG_USED_AS_IRQ, &flags) && + test_bit(GPIOD_FLAG_IRQ_IS_ENABLED, &flags)) { gpiod_err(desc, "%s: tried to set a GPIO tied to an IRQ as output\n", __func__); return -EIO; } - if (test_bit(FLAG_OPEN_DRAIN, &flags)) { + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &flags)) { /* First see if we can enable open drain in hardware */ ret = gpio_set_config(desc, PIN_CONFIG_DRIVE_OPEN_DRAIN); if (!ret) @@ -3043,7 +3043,7 @@ int gpiod_direction_output_nonotify(struct gpio_desc *desc, int value) /* Emulate open drain by not actively driving the line high */ if (value) goto set_output_flag; - } else if (test_bit(FLAG_OPEN_SOURCE, &flags)) { + } else if (test_bit(GPIOD_FLAG_OPEN_SOURCE, &flags)) { ret = gpio_set_config(desc, PIN_CONFIG_DRIVE_OPEN_SOURCE); if (!ret) goto set_output_value; @@ -3070,7 +3070,7 @@ int gpiod_direction_output_nonotify(struct gpio_desc *desc, int value) * set the IS_OUT flag or otherwise we won't be able to set the line * value anymore. */ - set_bit(FLAG_IS_OUT, &desc->flags); + set_bit(GPIOD_FLAG_IS_OUT, &desc->flags); return 0; } @@ -3210,10 +3210,10 @@ int gpiod_set_transitory(struct gpio_desc *desc, bool transitory) { VALIDATE_DESC(desc); /* - * Handle FLAG_TRANSITORY first, enabling queries to gpiolib for + * Handle GPIOD_FLAG_TRANSITORY first, enabling queries to gpiolib for * persistence state. 
*/ - assign_bit(FLAG_TRANSITORY, &desc->flags, transitory); + assign_bit(GPIOD_FLAG_TRANSITORY, &desc->flags, transitory); /* If the driver supports it, set the persistence state now */ return gpio_set_config_with_argument_optional(desc, @@ -3231,7 +3231,7 @@ int gpiod_set_transitory(struct gpio_desc *desc, bool transitory) int gpiod_is_active_low(const struct gpio_desc *desc) { VALIDATE_DESC(desc); - return test_bit(FLAG_ACTIVE_LOW, &desc->flags); + return test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags); } EXPORT_SYMBOL_GPL(gpiod_is_active_low); @@ -3242,7 +3242,7 @@ EXPORT_SYMBOL_GPL(gpiod_is_active_low); void gpiod_toggle_active_low(struct gpio_desc *desc) { VALIDATE_DESC_VOID(desc); - change_bit(FLAG_ACTIVE_LOW, &desc->flags); + change_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags); gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_CONFIG); } EXPORT_SYMBOL_GPL(gpiod_toggle_active_low); @@ -3448,7 +3448,7 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep, int hwgpio = gpio_chip_hwgpio(desc); int value = test_bit(hwgpio, bits); - if (!raw && test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + if (!raw && test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags)) value = !value; __assign_bit(j, value_bitmap, value); trace_gpio_value(desc_to_gpio(desc), 1, value); @@ -3510,7 +3510,7 @@ int gpiod_get_value(const struct gpio_desc *desc) if (value < 0) return value; - if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags)) value = !value; return value; @@ -3593,7 +3593,7 @@ static int gpio_set_open_drain_value_commit(struct gpio_desc *desc, bool value) } else { ret = gpiochip_direction_output(guard.gc, offset, 0); if (!ret) - set_bit(FLAG_IS_OUT, &desc->flags); + set_bit(GPIOD_FLAG_IS_OUT, &desc->flags); } trace_gpio_direction(desc_to_gpio(desc), value, ret); if (ret < 0) @@ -3620,7 +3620,7 @@ static int gpio_set_open_source_value_commit(struct gpio_desc *desc, bool value) if (value) { ret = gpiochip_direction_output(guard.gc, offset, 1); if (!ret) - set_bit(FLAG_IS_OUT, &desc->flags); + set_bit(GPIOD_FLAG_IS_OUT, &desc->flags); } else { ret = gpiochip_direction_input(guard.gc, offset); } @@ -3635,7 +3635,7 @@ static int gpio_set_open_source_value_commit(struct gpio_desc *desc, bool value) static int gpiod_set_raw_value_commit(struct gpio_desc *desc, bool value) { - if (unlikely(!test_bit(FLAG_IS_OUT, &desc->flags))) + if (unlikely(!test_bit(GPIOD_FLAG_IS_OUT, &desc->flags))) return -EPERM; CLASS(gpio_chip_guard, guard)(desc); @@ -3705,7 +3705,7 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep, WARN_ON(array_info->gdev->can_sleep); for (i = 0; i < array_size; i++) { - if (unlikely(!test_bit(FLAG_IS_OUT, + if (unlikely(!test_bit(GPIOD_FLAG_IS_OUT, &desc_array[i]->flags))) return -EPERM; } @@ -3769,7 +3769,7 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep, int hwgpio = gpio_chip_hwgpio(desc); int value = test_bit(i, value_bitmap); - if (unlikely(!test_bit(FLAG_IS_OUT, &desc->flags))) + if (unlikely(!test_bit(GPIOD_FLAG_IS_OUT, &desc->flags))) return -EPERM; /* @@ -3779,16 +3779,16 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep, */ if (!raw && !(array_info && test_bit(i, array_info->invert_mask)) && - test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags)) value = !value; trace_gpio_value(desc_to_gpio(desc), 0, value); /* * collect all normal outputs belonging to the same chip * open drain and open source outputs are set individually */ - if (test_bit(FLAG_OPEN_DRAIN, &desc->flags) && !raw) { + 
if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags) && !raw) { gpio_set_open_drain_value_commit(desc, value); - } else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags) && !raw) { + } else if (test_bit(GPIOD_FLAG_OPEN_SOURCE, &desc->flags) && !raw) { gpio_set_open_source_value_commit(desc, value); } else { __set_bit(hwgpio, mask); @@ -3854,12 +3854,12 @@ EXPORT_SYMBOL_GPL(gpiod_set_raw_value); */ static int gpiod_set_value_nocheck(struct gpio_desc *desc, int value) { - if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags)) value = !value; - if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags)) return gpio_set_open_drain_value_commit(desc, value); - else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) + else if (test_bit(GPIOD_FLAG_OPEN_SOURCE, &desc->flags)) return gpio_set_open_source_value_commit(desc, value); return gpiod_set_raw_value_commit(desc, value); @@ -4063,16 +4063,16 @@ int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset) } /* To be valid for IRQ the line needs to be input or open drain */ - if (test_bit(FLAG_IS_OUT, &desc->flags) && - !test_bit(FLAG_OPEN_DRAIN, &desc->flags)) { + if (test_bit(GPIOD_FLAG_IS_OUT, &desc->flags) && + !test_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags)) { chip_err(gc, "%s: tried to flag a GPIO set as output for IRQ\n", __func__); return -EIO; } - set_bit(FLAG_USED_AS_IRQ, &desc->flags); - set_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); + set_bit(GPIOD_FLAG_USED_AS_IRQ, &desc->flags); + set_bit(GPIOD_FLAG_IRQ_IS_ENABLED, &desc->flags); return 0; } @@ -4094,8 +4094,8 @@ void gpiochip_unlock_as_irq(struct gpio_chip *gc, unsigned int offset) if (IS_ERR(desc)) return; - clear_bit(FLAG_USED_AS_IRQ, &desc->flags); - clear_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); + clear_bit(GPIOD_FLAG_USED_AS_IRQ, &desc->flags); + clear_bit(GPIOD_FLAG_IRQ_IS_ENABLED, &desc->flags); } EXPORT_SYMBOL_GPL(gpiochip_unlock_as_irq); @@ -4104,8 +4104,8 @@ void gpiochip_disable_irq(struct gpio_chip *gc, unsigned int offset) struct gpio_desc *desc = gpiochip_get_desc(gc, offset); if (!IS_ERR(desc) && - !WARN_ON(!test_bit(FLAG_USED_AS_IRQ, &desc->flags))) - clear_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); + !WARN_ON(!test_bit(GPIOD_FLAG_USED_AS_IRQ, &desc->flags))) + clear_bit(GPIOD_FLAG_IRQ_IS_ENABLED, &desc->flags); } EXPORT_SYMBOL_GPL(gpiochip_disable_irq); @@ -4114,14 +4114,14 @@ void gpiochip_enable_irq(struct gpio_chip *gc, unsigned int offset) struct gpio_desc *desc = gpiochip_get_desc(gc, offset); if (!IS_ERR(desc) && - !WARN_ON(!test_bit(FLAG_USED_AS_IRQ, &desc->flags))) { + !WARN_ON(!test_bit(GPIOD_FLAG_USED_AS_IRQ, &desc->flags))) { /* * We must not be output when using IRQ UNLESS we are * open drain. 
*/ - WARN_ON(test_bit(FLAG_IS_OUT, &desc->flags) && - !test_bit(FLAG_OPEN_DRAIN, &desc->flags)); - set_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); + WARN_ON(test_bit(GPIOD_FLAG_IS_OUT, &desc->flags) && + !test_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags)); + set_bit(GPIOD_FLAG_IRQ_IS_ENABLED, &desc->flags); } } EXPORT_SYMBOL_GPL(gpiochip_enable_irq); @@ -4131,7 +4131,7 @@ bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset) if (offset >= gc->ngpio) return false; - return test_bit(FLAG_USED_AS_IRQ, &gc->gpiodev->descs[offset].flags); + return test_bit(GPIOD_FLAG_USED_AS_IRQ, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_irq); @@ -4164,7 +4164,7 @@ bool gpiochip_line_is_open_drain(struct gpio_chip *gc, unsigned int offset) if (offset >= gc->ngpio) return false; - return test_bit(FLAG_OPEN_DRAIN, &gc->gpiodev->descs[offset].flags); + return test_bit(GPIOD_FLAG_OPEN_DRAIN, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_open_drain); @@ -4173,7 +4173,7 @@ bool gpiochip_line_is_open_source(struct gpio_chip *gc, unsigned int offset) if (offset >= gc->ngpio) return false; - return test_bit(FLAG_OPEN_SOURCE, &gc->gpiodev->descs[offset].flags); + return test_bit(GPIOD_FLAG_OPEN_SOURCE, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_open_source); @@ -4182,7 +4182,7 @@ bool gpiochip_line_is_persistent(struct gpio_chip *gc, unsigned int offset) if (offset >= gc->ngpio) return false; - return !test_bit(FLAG_TRANSITORY, &gc->gpiodev->descs[offset].flags); + return !test_bit(GPIOD_FLAG_TRANSITORY, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_persistent); @@ -4224,7 +4224,7 @@ int gpiod_get_value_cansleep(const struct gpio_desc *desc) if (value < 0) return value; - if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags)) value = !value; return value; @@ -4806,10 +4806,10 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, int ret; if (lflags & GPIO_ACTIVE_LOW) - set_bit(FLAG_ACTIVE_LOW, &desc->flags); + set_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags); if (lflags & GPIO_OPEN_DRAIN) - set_bit(FLAG_OPEN_DRAIN, &desc->flags); + set_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags); else if (dflags & GPIOD_FLAGS_BIT_OPEN_DRAIN) { /* * This enforces open drain mode from the consumer side. @@ -4817,13 +4817,13 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, * should *REALLY* have specified them as open drain in the * first place, so print a little warning here. 
*/ - set_bit(FLAG_OPEN_DRAIN, &desc->flags); + set_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags); gpiod_warn(desc, "enforced open drain please flag it properly in DT/ACPI DSDT/board file\n"); } if (lflags & GPIO_OPEN_SOURCE) - set_bit(FLAG_OPEN_SOURCE, &desc->flags); + set_bit(GPIOD_FLAG_OPEN_SOURCE, &desc->flags); if (((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DOWN)) || ((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DISABLE)) || @@ -4834,11 +4834,11 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, } if (lflags & GPIO_PULL_UP) - set_bit(FLAG_PULL_UP, &desc->flags); + set_bit(GPIOD_FLAG_PULL_UP, &desc->flags); else if (lflags & GPIO_PULL_DOWN) - set_bit(FLAG_PULL_DOWN, &desc->flags); + set_bit(GPIOD_FLAG_PULL_DOWN, &desc->flags); else if (lflags & GPIO_PULL_DISABLE) - set_bit(FLAG_BIAS_DISABLE, &desc->flags); + set_bit(GPIOD_FLAG_BIAS_DISABLE, &desc->flags); ret = gpiod_set_transitory(desc, (lflags & GPIO_TRANSITORY)); if (ret < 0) @@ -4943,7 +4943,7 @@ int gpiod_hog(struct gpio_desc *desc, const char *name, if (!guard.gc) return -ENODEV; - if (test_and_set_bit(FLAG_IS_HOGGED, &desc->flags)) + if (test_and_set_bit(GPIOD_FLAG_IS_HOGGED, &desc->flags)) return 0; hwnum = gpio_chip_hwgpio(desc); @@ -4951,7 +4951,7 @@ int gpiod_hog(struct gpio_desc *desc, const char *name, local_desc = gpiochip_request_own_desc(guard.gc, hwnum, name, lflags, dflags); if (IS_ERR(local_desc)) { - clear_bit(FLAG_IS_HOGGED, &desc->flags); + clear_bit(GPIOD_FLAG_IS_HOGGED, &desc->flags); ret = PTR_ERR(local_desc); pr_err("requesting hog GPIO %s (chip %s, offset %d) failed, %d\n", name, gdev->label, hwnum, ret); @@ -4974,7 +4974,7 @@ static void gpiochip_free_hogs(struct gpio_chip *gc) { struct gpio_desc *desc; - for_each_gpio_desc_with_flag(gc, desc, FLAG_IS_HOGGED) + for_each_gpio_desc_with_flag(gc, desc, GPIOD_FLAG_IS_HOGGED) gpiochip_free_own_desc(desc); } @@ -5089,8 +5089,8 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, } else { dflags = READ_ONCE(desc->flags); /* Exclude open drain or open source from fast output */ - if (test_bit(FLAG_OPEN_DRAIN, &dflags) || - test_bit(FLAG_OPEN_SOURCE, &dflags)) + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &dflags) || + test_bit(GPIOD_FLAG_OPEN_SOURCE, &dflags)) __clear_bit(descs->ndescs, array_info->set_mask); /* Identify 'fast' pins which require invertion */ @@ -5248,12 +5248,12 @@ static void gpiolib_dbg_show(struct seq_file *s, struct gpio_device *gdev) for_each_gpio_desc(gc, desc) { guard(srcu)(&desc->gdev->desc_srcu); flags = READ_ONCE(desc->flags); - is_irq = test_bit(FLAG_USED_AS_IRQ, &flags); - if (is_irq || test_bit(FLAG_REQUESTED, &flags)) { + is_irq = test_bit(GPIOD_FLAG_USED_AS_IRQ, &flags); + if (is_irq || test_bit(GPIOD_FLAG_REQUESTED, &flags)) { gpiod_get_direction(desc); - is_out = test_bit(FLAG_IS_OUT, &flags); + is_out = test_bit(GPIOD_FLAG_IS_OUT, &flags); value = gpio_chip_get_value(gc, desc); - active_low = test_bit(FLAG_ACTIVE_LOW, &flags); + active_low = test_bit(GPIOD_FLAG_ACTIVE_LOW, &flags); seq_printf(s, " gpio-%-3u (%-20.20s|%-20.20s) %s %s %s%s\n", gpio, desc->name ?: "", gpiod_get_label(desc), is_out ? 
"out" : "in ", diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 9b74738a9ca5b1..2a003a7311e7ac 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -186,24 +186,24 @@ struct gpio_desc { struct gpio_device *gdev; unsigned long flags; /* flag symbols are bit numbers */ -#define FLAG_REQUESTED 0 -#define FLAG_IS_OUT 1 -#define FLAG_EXPORT 2 /* protected by sysfs_lock */ -#define FLAG_SYSFS 3 /* exported via /sys/class/gpio/control */ -#define FLAG_ACTIVE_LOW 6 /* value has active low */ -#define FLAG_OPEN_DRAIN 7 /* Gpio is open drain type */ -#define FLAG_OPEN_SOURCE 8 /* Gpio is open source type */ -#define FLAG_USED_AS_IRQ 9 /* GPIO is connected to an IRQ */ -#define FLAG_IRQ_IS_ENABLED 10 /* GPIO is connected to an enabled IRQ */ -#define FLAG_IS_HOGGED 11 /* GPIO is hogged */ -#define FLAG_TRANSITORY 12 /* GPIO may lose value in sleep or reset */ -#define FLAG_PULL_UP 13 /* GPIO has pull up enabled */ -#define FLAG_PULL_DOWN 14 /* GPIO has pull down enabled */ -#define FLAG_BIAS_DISABLE 15 /* GPIO has pull disabled */ -#define FLAG_EDGE_RISING 16 /* GPIO CDEV detects rising edge events */ -#define FLAG_EDGE_FALLING 17 /* GPIO CDEV detects falling edge events */ -#define FLAG_EVENT_CLOCK_REALTIME 18 /* GPIO CDEV reports REALTIME timestamps in events */ -#define FLAG_EVENT_CLOCK_HTE 19 /* GPIO CDEV reports hardware timestamps in events */ +#define GPIOD_FLAG_REQUESTED 0 /* GPIO is in use */ +#define GPIOD_FLAG_IS_OUT 1 /* GPIO is in output mode */ +#define GPIOD_FLAG_EXPORT 2 /* GPIO is exported to user-space */ +#define GPIOD_FLAG_SYSFS 3 /* GPIO is exported via /sys/class/gpio */ +#define GPIOD_FLAG_ACTIVE_LOW 6 /* GPIO is active-low */ +#define GPIOD_FLAG_OPEN_DRAIN 7 /* GPIO is open drain type */ +#define GPIOD_FLAG_OPEN_SOURCE 8 /* GPIO is open source type */ +#define GPIOD_FLAG_USED_AS_IRQ 9 /* GPIO is connected to an IRQ */ +#define GPIOD_FLAG_IRQ_IS_ENABLED 10 /* GPIO is connected to an enabled IRQ */ +#define GPIOD_FLAG_IS_HOGGED 11 /* GPIO is hogged */ +#define GPIOD_FLAG_TRANSITORY 12 /* GPIO may lose value in sleep or reset */ +#define GPIOD_FLAG_PULL_UP 13 /* GPIO has pull up enabled */ +#define GPIOD_FLAG_PULL_DOWN 14 /* GPIO has pull down enabled */ +#define GPIOD_FLAG_BIAS_DISABLE 15 /* GPIO has pull disabled */ +#define GPIOD_FLAG_EDGE_RISING 16 /* GPIO CDEV detects rising edge events */ +#define GPIOD_FLAG_EDGE_FALLING 17 /* GPIO CDEV detects falling edge events */ +#define GPIOD_FLAG_EVENT_CLOCK_REALTIME 18 /* GPIO CDEV reports REALTIME timestamps in events */ +#define GPIOD_FLAG_EVENT_CLOCK_HTE 19 /* GPIO CDEV reports hardware timestamps in events */ /* Connection label */ struct gpio_desc_label __rcu *label; From 80d7319c7a2a9865dc730422ec7227bfcc92e6bb Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:37 +0200 Subject: [PATCH 1473/3206] gpio: loongson1: allow building the module with COMPILE_TEST enabled Increase build coverage by allowing the module to be built with COMPILE_TEST=y. 
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-1-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 31f8bab4b09df1..09cb144f076661 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -885,7 +885,7 @@ config GPIO_ZYNQMP_MODEPIN config GPIO_LOONGSON1 tristate "Loongson1 GPIO support" - depends on MACH_LOONGSON32 + depends on MACH_LOONGSON32 || COMPILE_TEST select GPIO_GENERIC help Say Y or M here to support GPIO on Loongson1 SoCs. From 116eadc92b4c47277d660271eac1efd4afd33121 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:38 +0200 Subject: [PATCH 1474/3206] gpio: loongson1: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-2-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-loongson1.c | 40 ++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/gpio/gpio-loongson1.c b/drivers/gpio/gpio-loongson1.c index 6ca3b969db4df2..9750a7a1750817 100644 --- a/drivers/gpio/gpio-loongson1.c +++ b/drivers/gpio/gpio-loongson1.c @@ -5,10 +5,11 @@ * Copyright (C) 2015-2023 Keguang Zhang */ +#include #include #include +#include #include -#include /* Loongson 1 GPIO Register Definitions */ #define GPIO_CFG 0x0 @@ -17,19 +18,18 @@ #define GPIO_OUTPUT 0x30 struct ls1x_gpio_chip { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *reg_base; }; static int ls1x_gpio_request(struct gpio_chip *gc, unsigned int offset) { struct ls1x_gpio_chip *ls1x_gc = gpiochip_get_data(gc); - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&ls1x_gc->chip); + __raw_writel(__raw_readl(ls1x_gc->reg_base + GPIO_CFG) | BIT(offset), ls1x_gc->reg_base + GPIO_CFG); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); return 0; } @@ -37,16 +37,16 @@ static int ls1x_gpio_request(struct gpio_chip *gc, unsigned int offset) static void ls1x_gpio_free(struct gpio_chip *gc, unsigned int offset) { struct ls1x_gpio_chip *ls1x_gc = gpiochip_get_data(gc); - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&ls1x_gc->chip); + __raw_writel(__raw_readl(ls1x_gc->reg_base + GPIO_CFG) & ~BIT(offset), ls1x_gc->reg_base + GPIO_CFG); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static int ls1x_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct ls1x_gpio_chip *ls1x_gc; int ret; @@ -59,29 +59,35 @@ static int ls1x_gpio_probe(struct platform_device *pdev) if (IS_ERR(ls1x_gc->reg_base)) return PTR_ERR(ls1x_gc->reg_base); - ret = bgpio_init(&ls1x_gc->gc, dev, 4, ls1x_gc->reg_base + GPIO_DATA, - ls1x_gc->reg_base + GPIO_OUTPUT, NULL, - NULL, ls1x_gc->reg_base + GPIO_DIR, 0); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = ls1x_gc->reg_base + GPIO_DATA, + .set = ls1x_gc->reg_base + GPIO_OUTPUT, + .dirin = ls1x_gc->reg_base + GPIO_DIR, + }; + + ret = gpio_generic_chip_init(&ls1x_gc->chip, &config); if (ret) goto err; - ls1x_gc->gc.owner = THIS_MODULE; - ls1x_gc->gc.request = ls1x_gpio_request; - ls1x_gc->gc.free = ls1x_gpio_free; + 
ls1x_gc->chip.gc.owner = THIS_MODULE; + ls1x_gc->chip.gc.request = ls1x_gpio_request; + ls1x_gc->chip.gc.free = ls1x_gpio_free; /* * Clear ngpio to let gpiolib get the correct number * by reading ngpios property */ - ls1x_gc->gc.ngpio = 0; + ls1x_gc->chip.gc.ngpio = 0; - ret = devm_gpiochip_add_data(dev, &ls1x_gc->gc, ls1x_gc); + ret = devm_gpiochip_add_data(dev, &ls1x_gc->chip.gc, ls1x_gc); if (ret) goto err; platform_set_drvdata(pdev, ls1x_gc); dev_info(dev, "GPIO controller registered with %d pins\n", - ls1x_gc->gc.ngpio); + ls1x_gc->chip.gc.ngpio); return 0; err: From 43dffacf6be98fb31aa7790d693adc29276461f0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:39 +0200 Subject: [PATCH 1475/3206] gpio: hlwd: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-3-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-hlwd.c | 105 ++++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 51 deletions(-) diff --git a/drivers/gpio/gpio-hlwd.c b/drivers/gpio/gpio-hlwd.c index 0580f6712bea9a..a395f87436ac4d 100644 --- a/drivers/gpio/gpio-hlwd.c +++ b/drivers/gpio/gpio-hlwd.c @@ -6,6 +6,7 @@ // Nintendo Wii (Hollywood) GPIO driver #include +#include #include #include #include @@ -48,7 +49,7 @@ #define HW_GPIO_OWNER 0x3c struct hlwd_gpio { - struct gpio_chip gpioc; + struct gpio_generic_chip gpioc; struct device *dev; void __iomem *regs; int irq; @@ -61,45 +62,44 @@ static void hlwd_gpio_irqhandler(struct irq_desc *desc) struct hlwd_gpio *hlwd = gpiochip_get_data(irq_desc_get_handler_data(desc)); struct irq_chip *chip = irq_desc_get_chip(desc); - unsigned long flags; unsigned long pending; int hwirq; u32 emulated_pending; - raw_spin_lock_irqsave(&hlwd->gpioc.bgpio_lock, flags); - pending = ioread32be(hlwd->regs + HW_GPIOB_INTFLAG); - pending &= ioread32be(hlwd->regs + HW_GPIOB_INTMASK); + scoped_guard(gpio_generic_lock_irqsave, &hlwd->gpioc) { + pending = ioread32be(hlwd->regs + HW_GPIOB_INTFLAG); + pending &= ioread32be(hlwd->regs + HW_GPIOB_INTMASK); - /* Treat interrupts due to edge trigger emulation separately */ - emulated_pending = hlwd->edge_emulation & pending; - pending &= ~emulated_pending; - if (emulated_pending) { - u32 level, rising, falling; + /* Treat interrupts due to edge trigger emulation separately */ + emulated_pending = hlwd->edge_emulation & pending; + pending &= ~emulated_pending; + if (emulated_pending) { + u32 level, rising, falling; - level = ioread32be(hlwd->regs + HW_GPIOB_INTLVL); - rising = level & emulated_pending; - falling = ~level & emulated_pending; + level = ioread32be(hlwd->regs + HW_GPIOB_INTLVL); + rising = level & emulated_pending; + falling = ~level & emulated_pending; - /* Invert the levels */ - iowrite32be(level ^ emulated_pending, - hlwd->regs + HW_GPIOB_INTLVL); + /* Invert the levels */ + iowrite32be(level ^ emulated_pending, + hlwd->regs + HW_GPIOB_INTLVL); - /* Ack all emulated-edge interrupts */ - iowrite32be(emulated_pending, hlwd->regs + HW_GPIOB_INTFLAG); + /* Ack all emulated-edge interrupts */ + iowrite32be(emulated_pending, hlwd->regs + HW_GPIOB_INTFLAG); - /* Signal interrupts only on the correct edge */ - rising &= hlwd->rising_edge; - falling &= hlwd->falling_edge; + /* Signal interrupts only on the correct edge */ + rising &= hlwd->rising_edge; + falling &= hlwd->falling_edge; - /* Mark emulated interrupts 
as pending */ - pending |= rising | falling; + /* Mark emulated interrupts as pending */ + pending |= rising | falling; + } } - raw_spin_unlock_irqrestore(&hlwd->gpioc.bgpio_lock, flags); chained_irq_enter(chip, desc); for_each_set_bit(hwirq, &pending, 32) - generic_handle_domain_irq(hlwd->gpioc.irq.domain, hwirq); + generic_handle_domain_irq(hlwd->gpioc.gc.irq.domain, hwirq); chained_irq_exit(chip, desc); } @@ -116,30 +116,29 @@ static void hlwd_gpio_irq_mask(struct irq_data *data) { struct hlwd_gpio *hlwd = gpiochip_get_data(irq_data_get_irq_chip_data(data)); - unsigned long flags; u32 mask; - raw_spin_lock_irqsave(&hlwd->gpioc.bgpio_lock, flags); - mask = ioread32be(hlwd->regs + HW_GPIOB_INTMASK); - mask &= ~BIT(data->hwirq); - iowrite32be(mask, hlwd->regs + HW_GPIOB_INTMASK); - raw_spin_unlock_irqrestore(&hlwd->gpioc.bgpio_lock, flags); - gpiochip_disable_irq(&hlwd->gpioc, irqd_to_hwirq(data)); + scoped_guard(gpio_generic_lock_irqsave, &hlwd->gpioc) { + mask = ioread32be(hlwd->regs + HW_GPIOB_INTMASK); + mask &= ~BIT(data->hwirq); + iowrite32be(mask, hlwd->regs + HW_GPIOB_INTMASK); + } + gpiochip_disable_irq(&hlwd->gpioc.gc, irqd_to_hwirq(data)); } static void hlwd_gpio_irq_unmask(struct irq_data *data) { struct hlwd_gpio *hlwd = gpiochip_get_data(irq_data_get_irq_chip_data(data)); - unsigned long flags; u32 mask; - gpiochip_enable_irq(&hlwd->gpioc, irqd_to_hwirq(data)); - raw_spin_lock_irqsave(&hlwd->gpioc.bgpio_lock, flags); + gpiochip_enable_irq(&hlwd->gpioc.gc, irqd_to_hwirq(data)); + + guard(gpio_generic_lock_irqsave)(&hlwd->gpioc); + mask = ioread32be(hlwd->regs + HW_GPIOB_INTMASK); mask |= BIT(data->hwirq); iowrite32be(mask, hlwd->regs + HW_GPIOB_INTMASK); - raw_spin_unlock_irqrestore(&hlwd->gpioc.bgpio_lock, flags); } static void hlwd_gpio_irq_enable(struct irq_data *data) @@ -173,10 +172,9 @@ static int hlwd_gpio_irq_set_type(struct irq_data *data, unsigned int flow_type) { struct hlwd_gpio *hlwd = gpiochip_get_data(irq_data_get_irq_chip_data(data)); - unsigned long flags; u32 level; - raw_spin_lock_irqsave(&hlwd->gpioc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&hlwd->gpioc); hlwd->edge_emulation &= ~BIT(data->hwirq); @@ -197,11 +195,9 @@ static int hlwd_gpio_irq_set_type(struct irq_data *data, unsigned int flow_type) hlwd_gpio_irq_setup_emulation(hlwd, data->hwirq, flow_type); break; default: - raw_spin_unlock_irqrestore(&hlwd->gpioc.bgpio_lock, flags); return -EINVAL; } - raw_spin_unlock_irqrestore(&hlwd->gpioc.bgpio_lock, flags); return 0; } @@ -225,6 +221,7 @@ static const struct irq_chip hlwd_gpio_irq_chip = { static int hlwd_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct hlwd_gpio *hlwd; u32 ngpios; int res; @@ -244,25 +241,31 @@ static int hlwd_gpio_probe(struct platform_device *pdev) * systems where the AHBPROT memory firewall hasn't been configured to * permit PPC access to HW_GPIO_*. * - * Note that this has to happen before bgpio_init reads the - * HW_GPIOB_OUT and HW_GPIOB_DIR, because otherwise it reads the wrong - * values. + * Note that this has to happen before gpio_generic_chip_init() reads + * the HW_GPIOB_OUT and HW_GPIOB_DIR, because otherwise it reads the + * wrong values. 
*/ iowrite32be(0xffffffff, hlwd->regs + HW_GPIO_OWNER); - res = bgpio_init(&hlwd->gpioc, &pdev->dev, 4, - hlwd->regs + HW_GPIOB_IN, hlwd->regs + HW_GPIOB_OUT, - NULL, hlwd->regs + HW_GPIOB_DIR, NULL, - BGPIOF_BIG_ENDIAN_BYTE_ORDER); + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 4, + .dat = hlwd->regs + HW_GPIOB_IN, + .set = hlwd->regs + HW_GPIOB_OUT, + .dirout = hlwd->regs + HW_GPIOB_DIR, + .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + }; + + res = gpio_generic_chip_init(&hlwd->gpioc, &config); if (res < 0) { - dev_warn(&pdev->dev, "bgpio_init failed: %d\n", res); + dev_warn(&pdev->dev, "failed to initialize generic GPIO chip: %d\n", res); return res; } res = of_property_read_u32(pdev->dev.of_node, "ngpios", &ngpios); if (res) ngpios = 32; - hlwd->gpioc.ngpio = ngpios; + hlwd->gpioc.gc.ngpio = ngpios; /* Mask and ack all interrupts */ iowrite32be(0, hlwd->regs + HW_GPIOB_INTMASK); @@ -282,7 +285,7 @@ static int hlwd_gpio_probe(struct platform_device *pdev) return hlwd->irq; } - girq = &hlwd->gpioc.irq; + girq = &hlwd->gpioc.gc.irq; gpio_irq_chip_set_chip(girq, &hlwd_gpio_irq_chip); girq->parent_handler = hlwd_gpio_irqhandler; girq->num_parents = 1; @@ -296,7 +299,7 @@ static int hlwd_gpio_probe(struct platform_device *pdev) girq->handler = handle_level_irq; } - return devm_gpiochip_add_data(&pdev->dev, &hlwd->gpioc, hlwd); + return devm_gpiochip_add_data(&pdev->dev, &hlwd->gpioc.gc, hlwd); } static const struct of_device_id hlwd_gpio_match[] = { From 551a097118391018ddc4079cbcec6fe4e7d64bc5 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:40 +0200 Subject: [PATCH 1476/3206] gpio: ath79: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
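The same pattern repeats across this series, so as a rough sketch of the
conversion (the example_gpio driver and EXAMPLE_REG_* offsets below are
illustrative, not taken from any real driver): the gpio_chip is no longer
declared directly but embedded in a struct gpio_generic_chip, the positional
bgpio_init() arguments become a designated-initializer config, and gpiolib
registration takes the embedded .gc member:

    struct example_gpio {
            struct gpio_generic_chip chip;  /* embeds struct gpio_chip as .gc */
            void __iomem *base;
    };

    static int example_gpio_probe(struct platform_device *pdev)
    {
            struct gpio_generic_chip_config config;
            struct example_gpio *priv;
            int ret;

            priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
            if (!priv)
                    return -ENOMEM;

            priv->base = devm_platform_ioremap_resource(pdev, 0);
            if (IS_ERR(priv->base))
                    return PTR_ERR(priv->base);

            config = (struct gpio_generic_chip_config) {
                    .dev = &pdev->dev,
                    .sz = 4,        /* register width in bytes */
                    .dat = priv->base + EXAMPLE_REG_IN,
                    .set = priv->base + EXAMPLE_REG_SET,
                    .clr = priv->base + EXAMPLE_REG_CLEAR,
            };

            ret = gpio_generic_chip_init(&priv->chip, &config);
            if (ret)
                    return ret;

            return devm_gpiochip_add_data(&pdev->dev, &priv->chip.gc, priv);
    }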
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-4-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ath79.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/drivers/gpio/gpio-ath79.c b/drivers/gpio/gpio-ath79.c index de4cc12e5e0399..8879f23f1871ed 100644 --- a/drivers/gpio/gpio-ath79.c +++ b/drivers/gpio/gpio-ath79.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -28,7 +29,7 @@ #define AR71XX_GPIO_REG_INT_MASK 0x24 struct ath79_gpio_ctrl { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *base; raw_spinlock_t lock; unsigned long both_edges; @@ -37,8 +38,9 @@ struct ath79_gpio_ctrl { static struct ath79_gpio_ctrl *irq_data_to_ath79_gpio(struct irq_data *data) { struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); - return container_of(gc, struct ath79_gpio_ctrl, gc); + return container_of(gen_gc, struct ath79_gpio_ctrl, chip); } static u32 ath79_gpio_read(struct ath79_gpio_ctrl *ctrl, unsigned reg) @@ -72,7 +74,7 @@ static void ath79_gpio_irq_unmask(struct irq_data *data) u32 mask = BIT(irqd_to_hwirq(data)); unsigned long flags; - gpiochip_enable_irq(&ctrl->gc, irqd_to_hwirq(data)); + gpiochip_enable_irq(&ctrl->chip.gc, irqd_to_hwirq(data)); raw_spin_lock_irqsave(&ctrl->lock, flags); ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, mask); raw_spin_unlock_irqrestore(&ctrl->lock, flags); @@ -87,7 +89,7 @@ static void ath79_gpio_irq_mask(struct irq_data *data) raw_spin_lock_irqsave(&ctrl->lock, flags); ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, 0); raw_spin_unlock_irqrestore(&ctrl->lock, flags); - gpiochip_disable_irq(&ctrl->gc, irqd_to_hwirq(data)); + gpiochip_disable_irq(&ctrl->chip.gc, irqd_to_hwirq(data)); } static void ath79_gpio_irq_enable(struct irq_data *data) @@ -187,8 +189,9 @@ static void ath79_gpio_irq_handler(struct irq_desc *desc) { struct gpio_chip *gc = irq_desc_get_handler_data(desc); struct irq_chip *irqchip = irq_desc_get_chip(desc); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct ath79_gpio_ctrl *ctrl = - container_of(gc, struct ath79_gpio_ctrl, gc); + container_of(gen_gc, struct ath79_gpio_ctrl, chip); unsigned long flags, pending; u32 both_edges, state; int irq; @@ -224,6 +227,7 @@ MODULE_DEVICE_TABLE(of, ath79_gpio_of_match); static int ath79_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct ath79_gpio_ctrl *ctrl; struct gpio_irq_chip *girq; @@ -253,21 +257,26 @@ static int ath79_gpio_probe(struct platform_device *pdev) return PTR_ERR(ctrl->base); raw_spin_lock_init(&ctrl->lock); - err = bgpio_init(&ctrl->gc, dev, 4, - ctrl->base + AR71XX_GPIO_REG_IN, - ctrl->base + AR71XX_GPIO_REG_SET, - ctrl->base + AR71XX_GPIO_REG_CLEAR, - oe_inverted ? NULL : ctrl->base + AR71XX_GPIO_REG_OE, - oe_inverted ? ctrl->base + AR71XX_GPIO_REG_OE : NULL, - 0); + + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = ctrl->base + AR71XX_GPIO_REG_IN, + .set = ctrl->base + AR71XX_GPIO_REG_SET, + .clr = ctrl->base + AR71XX_GPIO_REG_CLEAR, + .dirout = oe_inverted ? NULL : ctrl->base + AR71XX_GPIO_REG_OE, + .dirin = oe_inverted ? 
ctrl->base + AR71XX_GPIO_REG_OE : NULL, + }; + + err = gpio_generic_chip_init(&ctrl->chip, &config); if (err) { - dev_err(dev, "bgpio_init failed\n"); + dev_err(dev, "failed to initialize generic GPIO chip\n"); return err; } /* Optional interrupt setup */ if (device_property_read_bool(dev, "interrupt-controller")) { - girq = &ctrl->gc.irq; + girq = &ctrl->chip.gc.irq; gpio_irq_chip_set_chip(girq, &ath79_gpio_irqchip); girq->parent_handler = ath79_gpio_irq_handler; girq->num_parents = 1; @@ -280,7 +289,7 @@ static int ath79_gpio_probe(struct platform_device *pdev) girq->handler = handle_simple_irq; } - return devm_gpiochip_add_data(dev, &ctrl->gc, ctrl); + return devm_gpiochip_add_data(dev, &ctrl->chip.gc, ctrl); } static struct platform_driver ath79_gpio_driver = { From e7a3a1be11d7e786924ed7af3b3411def2e46f21 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:41 +0200 Subject: [PATCH 1477/3206] gpio: ath79: use the generic GPIO chip lock for IRQ handling This driver uses its own raw spinlock in interrupt routines while the generic GPIO chip callbacks use a separate one. This is, of course, racy so use the fact that the lock in generic GPIO chip is also a raw spinlock and convert the interrupt handling functions in this module to using the provided generic GPIO chip locking API. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-5-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ath79.c | 51 +++++++++++++++------------------------ 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/drivers/gpio/gpio-ath79.c b/drivers/gpio/gpio-ath79.c index 8879f23f1871ed..2ad9f6ac66362f 100644 --- a/drivers/gpio/gpio-ath79.c +++ b/drivers/gpio/gpio-ath79.c @@ -31,7 +31,6 @@ struct ath79_gpio_ctrl { struct gpio_generic_chip chip; void __iomem *base; - raw_spinlock_t lock; unsigned long both_edges; }; @@ -72,23 +71,22 @@ static void ath79_gpio_irq_unmask(struct irq_data *data) { struct ath79_gpio_ctrl *ctrl = irq_data_to_ath79_gpio(data); u32 mask = BIT(irqd_to_hwirq(data)); - unsigned long flags; gpiochip_enable_irq(&ctrl->chip.gc, irqd_to_hwirq(data)); - raw_spin_lock_irqsave(&ctrl->lock, flags); + + guard(gpio_generic_lock_irqsave)(&ctrl->chip); + ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, mask); - raw_spin_unlock_irqrestore(&ctrl->lock, flags); } static void ath79_gpio_irq_mask(struct irq_data *data) { struct ath79_gpio_ctrl *ctrl = irq_data_to_ath79_gpio(data); u32 mask = BIT(irqd_to_hwirq(data)); - unsigned long flags; - raw_spin_lock_irqsave(&ctrl->lock, flags); - ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, 0); - raw_spin_unlock_irqrestore(&ctrl->lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &ctrl->chip) + ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, 0); + gpiochip_disable_irq(&ctrl->chip.gc, irqd_to_hwirq(data)); } @@ -96,24 +94,20 @@ static void ath79_gpio_irq_enable(struct irq_data *data) { struct ath79_gpio_ctrl *ctrl = irq_data_to_ath79_gpio(data); u32 mask = BIT(irqd_to_hwirq(data)); - unsigned long flags; - raw_spin_lock_irqsave(&ctrl->lock, flags); + guard(gpio_generic_lock_irqsave)(&ctrl->chip); ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_ENABLE, mask, mask); ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, mask); - raw_spin_unlock_irqrestore(&ctrl->lock, flags); } static void ath79_gpio_irq_disable(struct irq_data *data) { struct ath79_gpio_ctrl *ctrl = irq_data_to_ath79_gpio(data); u32 mask = 
BIT(irqd_to_hwirq(data)); - unsigned long flags; - raw_spin_lock_irqsave(&ctrl->lock, flags); + guard(gpio_generic_lock_irqsave)(&ctrl->chip); ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, 0); ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_ENABLE, mask, 0); - raw_spin_unlock_irqrestore(&ctrl->lock, flags); } static int ath79_gpio_irq_set_type(struct irq_data *data, @@ -122,7 +116,6 @@ static int ath79_gpio_irq_set_type(struct irq_data *data, struct ath79_gpio_ctrl *ctrl = irq_data_to_ath79_gpio(data); u32 mask = BIT(irqd_to_hwirq(data)); u32 type = 0, polarity = 0; - unsigned long flags; bool disabled; switch (flow_type) { @@ -144,7 +137,7 @@ static int ath79_gpio_irq_set_type(struct irq_data *data, return -EINVAL; } - raw_spin_lock_irqsave(&ctrl->lock, flags); + guard(gpio_generic_lock_irqsave)(&ctrl->chip); if (flow_type == IRQ_TYPE_EDGE_BOTH) { ctrl->both_edges |= mask; @@ -169,8 +162,6 @@ static int ath79_gpio_irq_set_type(struct irq_data *data, ath79_gpio_update_bits( ctrl, AR71XX_GPIO_REG_INT_ENABLE, mask, mask); - raw_spin_unlock_irqrestore(&ctrl->lock, flags); - return 0; } @@ -192,26 +183,24 @@ static void ath79_gpio_irq_handler(struct irq_desc *desc) struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct ath79_gpio_ctrl *ctrl = container_of(gen_gc, struct ath79_gpio_ctrl, chip); - unsigned long flags, pending; + unsigned long pending; u32 both_edges, state; int irq; chained_irq_enter(irqchip, desc); - raw_spin_lock_irqsave(&ctrl->lock, flags); - - pending = ath79_gpio_read(ctrl, AR71XX_GPIO_REG_INT_PENDING); + scoped_guard(gpio_generic_lock_irqsave, &ctrl->chip) { + pending = ath79_gpio_read(ctrl, AR71XX_GPIO_REG_INT_PENDING); - /* Update the polarity of the both edges irqs */ - both_edges = ctrl->both_edges & pending; - if (both_edges) { - state = ath79_gpio_read(ctrl, AR71XX_GPIO_REG_IN); - ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_POLARITY, - both_edges, ~state); + /* Update the polarity of the both edges irqs */ + both_edges = ctrl->both_edges & pending; + if (both_edges) { + state = ath79_gpio_read(ctrl, AR71XX_GPIO_REG_IN); + ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_POLARITY, + both_edges, ~state); + } } - raw_spin_unlock_irqrestore(&ctrl->lock, flags); - for_each_set_bit(irq, &pending, gc->ngpio) generic_handle_domain_irq(gc->irq.domain, irq); @@ -256,8 +245,6 @@ static int ath79_gpio_probe(struct platform_device *pdev) if (IS_ERR(ctrl->base)) return PTR_ERR(ctrl->base); - raw_spin_lock_init(&ctrl->lock); - config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, From 36f30f7ffc4b98dbd49deec8599cf810e7006cdf Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:42 +0200 Subject: [PATCH 1478/3206] gpio: xgene-sb: use generic GPIO chip register read and write APIs The conversion to using the modernized generic GPIO chip API was incomplete without also converting the direct calls to write/read_reg() callbacks. Use the provided wrappers from linux/gpio/generic.h. 
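As a usage note, the wrappers operate on the containing generic chip, which
any callback holding only a plain gpio_chip pointer can recover with
to_gpio_generic_chip(). A minimal read-modify-write sketch, with the function
name and the reg/bit parameters being illustrative:

    static void example_set_bit(struct gpio_chip *gc, void __iomem *reg,
                                unsigned int bit, int val)
    {
            struct gpio_generic_chip *chip = to_gpio_generic_chip(gc);
            u32 data;

            /* Same access width and endianness the chip was set up with. */
            data = gpio_generic_read_reg(chip, reg);
            if (val)
                    data |= BIT(bit);
            else
                    data &= ~BIT(bit);
            gpio_generic_write_reg(chip, reg, data);
    }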
Fixes: 38d98a822c14 ("gpio: xgene-sb: use new generic GPIO chip API") Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-6-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-xgene-sb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c index 28ee3f7e91b921..661259f026e191 100644 --- a/drivers/gpio/gpio-xgene-sb.c +++ b/drivers/gpio/gpio-xgene-sb.c @@ -63,14 +63,15 @@ struct xgene_gpio_sb { static void xgene_gpio_set_bit(struct gpio_chip *gc, void __iomem *reg, u32 gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); u32 data; - data = gc->read_reg(reg); + data = gpio_generic_read_reg(chip, reg); if (val) data |= GPIO_MASK(gpio); else data &= ~GPIO_MASK(gpio); - gc->write_reg(reg, data); + gpio_generic_write_reg(chip, reg, data); } static int xgene_gpio_sb_irq_set_type(struct irq_data *d, unsigned int type) From e8bd2a6a5059043a9f13a0723acd48c1291a55ff Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:43 +0200 Subject: [PATCH 1479/3206] gpio: brcmstb: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Florian Fainelli Tested-by: Florian Fainelli Acked-by: Doug Berger Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-7-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-brcmstb.c | 116 +++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c index e29a9589b3ccbd..be3ff916e134a6 100644 --- a/drivers/gpio/gpio-brcmstb.c +++ b/drivers/gpio/gpio-brcmstb.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -37,7 +38,7 @@ enum gio_reg_index { struct brcmstb_gpio_bank { struct list_head node; int id; - struct gpio_chip gc; + struct gpio_generic_chip chip; struct brcmstb_gpio_priv *parent_priv; u32 width; u32 wake_active; @@ -72,19 +73,18 @@ __brcmstb_gpio_get_active_irqs(struct brcmstb_gpio_bank *bank) { void __iomem *reg_base = bank->parent_priv->reg_base; - return bank->gc.read_reg(reg_base + GIO_STAT(bank->id)) & - bank->gc.read_reg(reg_base + GIO_MASK(bank->id)); + return gpio_generic_read_reg(&bank->chip, reg_base + GIO_STAT(bank->id)) & + gpio_generic_read_reg(&bank->chip, reg_base + GIO_MASK(bank->id)); } static unsigned long brcmstb_gpio_get_active_irqs(struct brcmstb_gpio_bank *bank) { unsigned long status; - unsigned long flags; - raw_spin_lock_irqsave(&bank->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&bank->chip); + status = __brcmstb_gpio_get_active_irqs(bank); - raw_spin_unlock_irqrestore(&bank->gc.bgpio_lock, flags); return status; } @@ -92,26 +92,26 @@ brcmstb_gpio_get_active_irqs(struct brcmstb_gpio_bank *bank) static int brcmstb_gpio_hwirq_to_offset(irq_hw_number_t hwirq, struct brcmstb_gpio_bank *bank) { - return hwirq - bank->gc.offset; + return hwirq - bank->chip.gc.offset; } static void brcmstb_gpio_set_imask(struct brcmstb_gpio_bank *bank, unsigned int hwirq, bool enable) { - struct gpio_chip *gc = &bank->gc; struct brcmstb_gpio_priv *priv = bank->parent_priv; u32 mask = BIT(brcmstb_gpio_hwirq_to_offset(hwirq, bank)); u32 imask; - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - imask = gc->read_reg(priv->reg_base + GIO_MASK(bank->id)); + 
guard(gpio_generic_lock_irqsave)(&bank->chip); + + imask = gpio_generic_read_reg(&bank->chip, + priv->reg_base + GIO_MASK(bank->id)); if (enable) imask |= mask; else imask &= ~mask; - gc->write_reg(priv->reg_base + GIO_MASK(bank->id), imask); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_MASK(bank->id), imask); } static int brcmstb_gpio_to_irq(struct gpio_chip *gc, unsigned offset) @@ -150,7 +150,8 @@ static void brcmstb_gpio_irq_ack(struct irq_data *d) struct brcmstb_gpio_priv *priv = bank->parent_priv; u32 mask = BIT(brcmstb_gpio_hwirq_to_offset(d->hwirq, bank)); - gc->write_reg(priv->reg_base + GIO_STAT(bank->id), mask); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_STAT(bank->id), mask); } static int brcmstb_gpio_irq_set_type(struct irq_data *d, unsigned int type) @@ -162,7 +163,6 @@ static int brcmstb_gpio_irq_set_type(struct irq_data *d, unsigned int type) u32 edge_insensitive, iedge_insensitive; u32 edge_config, iedge_config; u32 level, ilevel; - unsigned long flags; switch (type) { case IRQ_TYPE_LEVEL_LOW: @@ -194,23 +194,25 @@ static int brcmstb_gpio_irq_set_type(struct irq_data *d, unsigned int type) return -EINVAL; } - raw_spin_lock_irqsave(&bank->gc.bgpio_lock, flags); - - iedge_config = bank->gc.read_reg(priv->reg_base + - GIO_EC(bank->id)) & ~mask; - iedge_insensitive = bank->gc.read_reg(priv->reg_base + - GIO_EI(bank->id)) & ~mask; - ilevel = bank->gc.read_reg(priv->reg_base + - GIO_LEVEL(bank->id)) & ~mask; - - bank->gc.write_reg(priv->reg_base + GIO_EC(bank->id), - iedge_config | edge_config); - bank->gc.write_reg(priv->reg_base + GIO_EI(bank->id), - iedge_insensitive | edge_insensitive); - bank->gc.write_reg(priv->reg_base + GIO_LEVEL(bank->id), - ilevel | level); + guard(gpio_generic_lock_irqsave)(&bank->chip); + + iedge_config = gpio_generic_read_reg(&bank->chip, + priv->reg_base + GIO_EC(bank->id)) & ~mask; + iedge_insensitive = gpio_generic_read_reg(&bank->chip, + priv->reg_base + GIO_EI(bank->id)) & ~mask; + ilevel = gpio_generic_read_reg(&bank->chip, + priv->reg_base + GIO_LEVEL(bank->id)) & ~mask; + + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_EC(bank->id), + iedge_config | edge_config); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_EI(bank->id), + iedge_insensitive | edge_insensitive); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_LEVEL(bank->id), + ilevel | level); - raw_spin_unlock_irqrestore(&bank->gc.bgpio_lock, flags); return 0; } @@ -263,7 +265,7 @@ static void brcmstb_gpio_irq_bank_handler(struct brcmstb_gpio_bank *bank) { struct brcmstb_gpio_priv *priv = bank->parent_priv; struct irq_domain *domain = priv->irq_domain; - int hwbase = bank->gc.offset; + int hwbase = bank->chip.gc.offset; unsigned long status; while ((status = brcmstb_gpio_get_active_irqs(bank))) { @@ -303,7 +305,7 @@ static struct brcmstb_gpio_bank *brcmstb_gpio_hwirq_to_bank( /* banks are in descending order */ list_for_each_entry_reverse(bank, &priv->bank_list, node) { - i += bank->gc.ngpio; + i += bank->chip.gc.ngpio; if (hwirq < i) return bank; } @@ -332,7 +334,7 @@ static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq, dev_dbg(&pdev->dev, "Mapping irq %d for gpio line %d (bank %d)\n", irq, (int)hwirq, bank->id); - ret = irq_set_chip_data(irq, &bank->gc); + ret = irq_set_chip_data(irq, &bank->chip.gc); if (ret < 0) return ret; irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class, @@ -394,7 +396,7 @@ static void brcmstb_gpio_remove(struct 
platform_device *pdev) * more important to actually perform all of the steps. */ list_for_each_entry(bank, &priv->bank_list, node) - gpiochip_remove(&bank->gc); + gpiochip_remove(&bank->chip.gc); } static int brcmstb_gpio_of_xlate(struct gpio_chip *gc, @@ -412,7 +414,7 @@ static int brcmstb_gpio_of_xlate(struct gpio_chip *gc, if (WARN_ON(gpiospec->args_count < gc->of_gpio_n_cells)) return -EINVAL; - offset = gpiospec->args[0] - bank->gc.offset; + offset = gpiospec->args[0] - bank->chip.gc.offset; if (offset >= gc->ngpio || offset < 0) return -EINVAL; @@ -493,19 +495,17 @@ static int brcmstb_gpio_irq_setup(struct platform_device *pdev, static void brcmstb_gpio_bank_save(struct brcmstb_gpio_priv *priv, struct brcmstb_gpio_bank *bank) { - struct gpio_chip *gc = &bank->gc; unsigned int i; for (i = 0; i < GIO_REG_STAT; i++) - bank->saved_regs[i] = gc->read_reg(priv->reg_base + - GIO_BANK_OFF(bank->id, i)); + bank->saved_regs[i] = gpio_generic_read_reg(&bank->chip, + priv->reg_base + GIO_BANK_OFF(bank->id, i)); } static void brcmstb_gpio_quiesce(struct device *dev, bool save) { struct brcmstb_gpio_priv *priv = dev_get_drvdata(dev); struct brcmstb_gpio_bank *bank; - struct gpio_chip *gc; u32 imask; /* disable non-wake interrupt */ @@ -513,8 +513,6 @@ static void brcmstb_gpio_quiesce(struct device *dev, bool save) disable_irq(priv->parent_irq); list_for_each_entry(bank, &priv->bank_list, node) { - gc = &bank->gc; - if (save) brcmstb_gpio_bank_save(priv, bank); @@ -523,8 +521,9 @@ static void brcmstb_gpio_quiesce(struct device *dev, bool save) imask = bank->wake_active; else imask = 0; - gc->write_reg(priv->reg_base + GIO_MASK(bank->id), - imask); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_MASK(bank->id), + imask); } } @@ -538,12 +537,12 @@ static void brcmstb_gpio_shutdown(struct platform_device *pdev) static void brcmstb_gpio_bank_restore(struct brcmstb_gpio_priv *priv, struct brcmstb_gpio_bank *bank) { - struct gpio_chip *gc = &bank->gc; unsigned int i; for (i = 0; i < GIO_REG_STAT; i++) - gc->write_reg(priv->reg_base + GIO_BANK_OFF(bank->id, i), - bank->saved_regs[i]); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_BANK_OFF(bank->id, i), + bank->saved_regs[i]); } static int brcmstb_gpio_suspend(struct device *dev) @@ -585,6 +584,7 @@ static const struct dev_pm_ops brcmstb_gpio_pm_ops = { static int brcmstb_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; void __iomem *reg_base; @@ -665,17 +665,24 @@ static int brcmstb_gpio_probe(struct platform_device *pdev) bank->width = bank_width; } + gc = &bank->chip.gc; + /* * Regs are 4 bytes wide, have data reg, no set/clear regs, * and direction bits have 0 = output and 1 = input */ - gc = &bank->gc; - err = bgpio_init(gc, dev, 4, - reg_base + GIO_DATA(bank->id), - NULL, NULL, NULL, - reg_base + GIO_IODIR(bank->id), flags); + + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = reg_base + GIO_DATA(bank->id), + .dirin = reg_base + GIO_IODIR(bank->id), + .flags = flags, + }; + + err = gpio_generic_chip_init(&bank->chip, &config); if (err) { - dev_err(dev, "bgpio_init() failed\n"); + dev_err(dev, "failed to initialize generic GPIO chip\n"); goto fail; } @@ -700,7 +707,8 @@ static int brcmstb_gpio_probe(struct platform_device *pdev) * be retained from S5 cold boot */ need_wakeup_event |= !!__brcmstb_gpio_get_active_irqs(bank); - gc->write_reg(reg_base + GIO_MASK(bank->id), 0); + 
gpio_generic_write_reg(&bank->chip, + reg_base + GIO_MASK(bank->id), 0); err = gpiochip_add_data(gc, bank); if (err) { From 80fd7e96d669d729d9e01bfa3e2b60ea6b500e20 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:44 +0200 Subject: [PATCH 1480/3206] gpio: mt7621: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-8-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mt7621.c | 51 ++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/drivers/gpio/gpio-mt7621.c b/drivers/gpio/gpio-mt7621.c index 93facbebb80efa..e56812a1721151 100644 --- a/drivers/gpio/gpio-mt7621.c +++ b/drivers/gpio/gpio-mt7621.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -30,7 +31,7 @@ struct mtk_gc { struct irq_chip irq_chip; - struct gpio_chip chip; + struct gpio_generic_chip chip; spinlock_t lock; int bank; u32 rising; @@ -59,27 +60,29 @@ struct mtk { static inline struct mtk_gc * to_mediatek_gpio(struct gpio_chip *chip) { - return container_of(chip, struct mtk_gc, chip); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(chip); + + return container_of(gen_gc, struct mtk_gc, chip); } static inline void mtk_gpio_w32(struct mtk_gc *rg, u32 offset, u32 val) { - struct gpio_chip *gc = &rg->chip; + struct gpio_chip *gc = &rg->chip.gc; struct mtk *mtk = gpiochip_get_data(gc); offset = (rg->bank * GPIO_BANK_STRIDE) + offset; - gc->write_reg(mtk->base + offset, val); + gpio_generic_write_reg(&rg->chip, mtk->base + offset, val); } static inline u32 mtk_gpio_r32(struct mtk_gc *rg, u32 offset) { - struct gpio_chip *gc = &rg->chip; + struct gpio_chip *gc = &rg->chip.gc; struct mtk *mtk = gpiochip_get_data(gc); offset = (rg->bank * GPIO_BANK_STRIDE) + offset; - return gc->read_reg(mtk->base + offset); + return gpio_generic_read_reg(&rg->chip, mtk->base + offset); } static irqreturn_t @@ -220,6 +223,7 @@ static const struct irq_chip mt7621_irq_chip = { static int mediatek_gpio_bank_probe(struct device *dev, int bank) { + struct gpio_generic_chip_config config; struct mtk *mtk = dev_get_drvdata(dev); struct mtk_gc *rg; void __iomem *dat, *set, *ctrl, *diro; @@ -236,21 +240,30 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) ctrl = mtk->base + GPIO_REG_DCLR + (rg->bank * GPIO_BANK_STRIDE); diro = mtk->base + GPIO_REG_CTRL + (rg->bank * GPIO_BANK_STRIDE); - ret = bgpio_init(&rg->chip, dev, 4, dat, set, ctrl, diro, NULL, - BGPIOF_NO_SET_ON_INPUT); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = dat, + .set = set, + .clr = ctrl, + .dirout = diro, + .flags = BGPIOF_NO_SET_ON_INPUT, + }; + + ret = gpio_generic_chip_init(&rg->chip, &config); if (ret) { - dev_err(dev, "bgpio_init() failed\n"); + dev_err(dev, "failed to initialize generic GPIO chip\n"); return ret; } - rg->chip.of_gpio_n_cells = 2; - rg->chip.of_xlate = mediatek_gpio_xlate; - rg->chip.label = devm_kasprintf(dev, GFP_KERNEL, "%s-bank%d", + rg->chip.gc.of_gpio_n_cells = 2; + rg->chip.gc.of_xlate = mediatek_gpio_xlate; + rg->chip.gc.label = devm_kasprintf(dev, GFP_KERNEL, "%s-bank%d", dev_name(dev), bank); - if (!rg->chip.label) + if (!rg->chip.gc.label) return -ENOMEM; - rg->chip.offset = bank * MTK_BANK_WIDTH; + rg->chip.gc.offset = bank * MTK_BANK_WIDTH; if (mtk->gpio_irq) { struct gpio_irq_chip *girq; @@ -261,7 +274,7 @@ 
mediatek_gpio_bank_probe(struct device *dev, int bank) */ ret = devm_request_irq(dev, mtk->gpio_irq, mediatek_gpio_irq_handler, IRQF_SHARED, - rg->chip.label, &rg->chip); + rg->chip.gc.label, &rg->chip.gc); if (ret) { dev_err(dev, "Error requesting IRQ %d: %d\n", @@ -269,7 +282,7 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) return ret; } - girq = &rg->chip.irq; + girq = &rg->chip.gc.irq; gpio_irq_chip_set_chip(girq, &mt7621_irq_chip); /* This will let us handle the parent IRQ in the driver */ girq->parent_handler = NULL; @@ -279,17 +292,17 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) girq->handler = handle_simple_irq; } - ret = devm_gpiochip_add_data(dev, &rg->chip, mtk); + ret = devm_gpiochip_add_data(dev, &rg->chip.gc, mtk); if (ret < 0) { dev_err(dev, "Could not register gpio %d, ret=%d\n", - rg->chip.ngpio, ret); + rg->chip.gc.ngpio, ret); return ret; } /* set polarity to low for all gpios */ mtk_gpio_w32(rg, GPIO_REG_POL, 0); - dev_info(dev, "registering %d gpios\n", rg->chip.ngpio); + dev_info(dev, "registering %d gpios\n", rg->chip.gc.ngpio); return 0; } From 2c1f22fa54fcbf8fbd9c03f5d341c73ef36c6d27 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:45 +0200 Subject: [PATCH 1481/3206] gpio: mt7621: use the generic GPIO chip lock for IRQ handling This driver uses its own spinlock in interrupt routines while the generic GPIO chip callbacks use a separate one. This is, of course, racy so use the fact that the lock in generic GPIO chip is also a spinlock and convert the interrupt handling functions in this module to using the provided generic GPIO chip locking API. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-9-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mt7621.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/gpio/gpio-mt7621.c b/drivers/gpio/gpio-mt7621.c index e56812a1721151..e7bb9b2cd6cf32 100644 --- a/drivers/gpio/gpio-mt7621.c +++ b/drivers/gpio/gpio-mt7621.c @@ -11,7 +11,6 @@ #include #include #include -#include #define MTK_BANK_CNT 3 #define MTK_BANK_WIDTH 32 @@ -32,7 +31,6 @@ struct mtk_gc { struct irq_chip irq_chip; struct gpio_generic_chip chip; - spinlock_t lock; int bank; u32 rising; u32 falling; @@ -111,12 +109,12 @@ mediatek_gpio_irq_unmask(struct irq_data *d) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct mtk_gc *rg = to_mediatek_gpio(gc); int pin = d->hwirq; - unsigned long flags; u32 rise, fall, high, low; gpiochip_enable_irq(gc, d->hwirq); - spin_lock_irqsave(&rg->lock, flags); + guard(gpio_generic_lock_irqsave)(&rg->chip); + rise = mtk_gpio_r32(rg, GPIO_REG_REDGE); fall = mtk_gpio_r32(rg, GPIO_REG_FEDGE); high = mtk_gpio_r32(rg, GPIO_REG_HLVL); @@ -125,7 +123,6 @@ mediatek_gpio_irq_unmask(struct irq_data *d) mtk_gpio_w32(rg, GPIO_REG_FEDGE, fall | (BIT(pin) & rg->falling)); mtk_gpio_w32(rg, GPIO_REG_HLVL, high | (BIT(pin) & rg->hlevel)); mtk_gpio_w32(rg, GPIO_REG_LLVL, low | (BIT(pin) & rg->llevel)); - spin_unlock_irqrestore(&rg->lock, flags); } static void @@ -134,19 +131,18 @@ mediatek_gpio_irq_mask(struct irq_data *d) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct mtk_gc *rg = to_mediatek_gpio(gc); int pin = d->hwirq; - unsigned long flags; u32 rise, fall, high, low; - spin_lock_irqsave(&rg->lock, flags); - rise = mtk_gpio_r32(rg, GPIO_REG_REDGE); - fall = mtk_gpio_r32(rg, GPIO_REG_FEDGE); - high = mtk_gpio_r32(rg, GPIO_REG_HLVL); - low = 
mtk_gpio_r32(rg, GPIO_REG_LLVL); - mtk_gpio_w32(rg, GPIO_REG_FEDGE, fall & ~BIT(pin)); - mtk_gpio_w32(rg, GPIO_REG_REDGE, rise & ~BIT(pin)); - mtk_gpio_w32(rg, GPIO_REG_HLVL, high & ~BIT(pin)); - mtk_gpio_w32(rg, GPIO_REG_LLVL, low & ~BIT(pin)); - spin_unlock_irqrestore(&rg->lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &rg->chip) { + rise = mtk_gpio_r32(rg, GPIO_REG_REDGE); + fall = mtk_gpio_r32(rg, GPIO_REG_FEDGE); + high = mtk_gpio_r32(rg, GPIO_REG_HLVL); + low = mtk_gpio_r32(rg, GPIO_REG_LLVL); + mtk_gpio_w32(rg, GPIO_REG_FEDGE, fall & ~BIT(pin)); + mtk_gpio_w32(rg, GPIO_REG_REDGE, rise & ~BIT(pin)); + mtk_gpio_w32(rg, GPIO_REG_HLVL, high & ~BIT(pin)); + mtk_gpio_w32(rg, GPIO_REG_LLVL, low & ~BIT(pin)); + } gpiochip_disable_irq(gc, d->hwirq); } @@ -232,7 +228,6 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) rg = &mtk->gc_map[bank]; memset(rg, 0, sizeof(*rg)); - spin_lock_init(&rg->lock); rg->bank = bank; dat = mtk->base + GPIO_REG_DATA + (rg->bank * GPIO_BANK_STRIDE); From b24489af4500720d8ad57c55111d90e762133c50 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:46 +0200 Subject: [PATCH 1482/3206] gpio: menz127: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-10-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-menz127.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/gpio/gpio-menz127.c b/drivers/gpio/gpio-menz127.c index ebe5da4933bce7..da2bf9381cc43c 100644 --- a/drivers/gpio/gpio-menz127.c +++ b/drivers/gpio/gpio-menz127.c @@ -12,6 +12,7 @@ #include #include #include +#include #define MEN_Z127_CTRL 0x00 #define MEN_Z127_PSR 0x04 @@ -30,7 +31,7 @@ (db <= MEN_Z127_DB_MAX_US)) struct men_z127_gpio { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *reg_base; struct resource *mem; }; @@ -64,7 +65,7 @@ static int men_z127_debounce(struct gpio_chip *gc, unsigned gpio, debounce /= 50; } - raw_spin_lock(&gc->bgpio_lock); + guard(gpio_generic_lock)(&priv->chip); db_en = readl(priv->reg_base + MEN_Z127_DBER); @@ -79,8 +80,6 @@ static int men_z127_debounce(struct gpio_chip *gc, unsigned gpio, writel(db_en, priv->reg_base + MEN_Z127_DBER); writel(db_cnt, priv->reg_base + GPIO_TO_DBCNT_REG(gpio)); - raw_spin_unlock(&gc->bgpio_lock); - return 0; } @@ -91,7 +90,8 @@ static int men_z127_set_single_ended(struct gpio_chip *gc, struct men_z127_gpio *priv = gpiochip_get_data(gc); u32 od_en; - raw_spin_lock(&gc->bgpio_lock); + guard(gpio_generic_lock)(&priv->chip); + od_en = readl(priv->reg_base + MEN_Z127_ODER); if (param == PIN_CONFIG_DRIVE_OPEN_DRAIN) @@ -101,7 +101,6 @@ static int men_z127_set_single_ended(struct gpio_chip *gc, od_en &= ~BIT(offset); writel(od_en, priv->reg_base + MEN_Z127_ODER); - raw_spin_unlock(&gc->bgpio_lock); return 0; } @@ -137,6 +136,7 @@ static void men_z127_release_mem(void *data) static int men_z127_probe(struct mcb_device *mdev, const struct mcb_device_id *id) { + struct gpio_generic_chip_config config; struct men_z127_gpio *men_z127_gpio; struct device *dev = &mdev->dev; int ret; @@ -163,18 +163,21 @@ static int men_z127_probe(struct mcb_device *mdev, mcb_set_drvdata(mdev, men_z127_gpio); - ret = bgpio_init(&men_z127_gpio->gc, &mdev->dev, 4, - men_z127_gpio->reg_base + MEN_Z127_PSR, - men_z127_gpio->reg_base + MEN_Z127_CTRL, - NULL, - 
men_z127_gpio->reg_base + MEN_Z127_GPIODR, - NULL, 0); + config = (struct gpio_generic_chip_config) { + .dev = &mdev->dev, + .sz = 4, + .dat = men_z127_gpio->reg_base + MEN_Z127_PSR, + .set = men_z127_gpio->reg_base + MEN_Z127_CTRL, + .dirout = men_z127_gpio->reg_base + MEN_Z127_GPIODR, + }; + + ret = gpio_generic_chip_init(&men_z127_gpio->chip, &config); if (ret) return ret; - men_z127_gpio->gc.set_config = men_z127_set_config; + men_z127_gpio->chip.gc.set_config = men_z127_set_config; - ret = devm_gpiochip_add_data(dev, &men_z127_gpio->gc, men_z127_gpio); + ret = devm_gpiochip_add_data(dev, &men_z127_gpio->chip.gc, men_z127_gpio); if (ret) return dev_err_probe(dev, ret, "failed to register MEN 16Z127 GPIO controller"); From 8e1c8ccc1df8b802a7a1b4beadbd8b87fff1c3b3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:47 +0200 Subject: [PATCH 1483/3206] gpio: sifive: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Samuel Holland Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-11-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sifive.c | 73 ++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 98ef975c44d9a6..2ced87ffd3bbf2 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,7 @@ struct sifive_gpio { void __iomem *base; - struct gpio_chip gc; + struct gpio_generic_chip gen_gc; struct regmap *regs; unsigned long irq_state; unsigned int trigger[SIFIVE_GPIO_MAX]; @@ -41,10 +42,10 @@ struct sifive_gpio { static void sifive_gpio_set_ie(struct sifive_gpio *chip, unsigned int offset) { - unsigned long flags; unsigned int trigger; - raw_spin_lock_irqsave(&chip->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); + trigger = (chip->irq_state & BIT(offset)) ? chip->trigger[offset] : 0; regmap_update_bits(chip->regs, SIFIVE_GPIO_RISE_IE, BIT(offset), (trigger & IRQ_TYPE_EDGE_RISING) ? BIT(offset) : 0); @@ -54,7 +55,6 @@ static void sifive_gpio_set_ie(struct sifive_gpio *chip, unsigned int offset) (trigger & IRQ_TYPE_LEVEL_HIGH) ? BIT(offset) : 0); regmap_update_bits(chip->regs, SIFIVE_GPIO_LOW_IE, BIT(offset), (trigger & IRQ_TYPE_LEVEL_LOW) ? 
BIT(offset) : 0); - raw_spin_unlock_irqrestore(&chip->gc.bgpio_lock, flags); } static int sifive_gpio_irq_set_type(struct irq_data *d, unsigned int trigger) @@ -72,13 +72,12 @@ static int sifive_gpio_irq_set_type(struct irq_data *d, unsigned int trigger) } static void sifive_gpio_irq_enable(struct irq_data *d) -{ + { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct sifive_gpio *chip = gpiochip_get_data(gc); irq_hw_number_t hwirq = irqd_to_hwirq(d); int offset = hwirq % SIFIVE_GPIO_MAX; u32 bit = BIT(offset); - unsigned long flags; gpiochip_enable_irq(gc, hwirq); irq_chip_enable_parent(d); @@ -86,13 +85,13 @@ static void sifive_gpio_irq_enable(struct irq_data *d) /* Switch to input */ gc->direction_input(gc, offset); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - /* Clear any sticky pending interrupts */ - regmap_write(chip->regs, SIFIVE_GPIO_RISE_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_FALL_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_HIGH_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_LOW_IP, bit); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &chip->gen_gc) { + /* Clear any sticky pending interrupts */ + regmap_write(chip->regs, SIFIVE_GPIO_RISE_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_FALL_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_HIGH_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_LOW_IP, bit); + } /* Enable interrupts */ assign_bit(offset, &chip->irq_state, 1); @@ -118,15 +117,14 @@ static void sifive_gpio_irq_eoi(struct irq_data *d) struct sifive_gpio *chip = gpiochip_get_data(gc); int offset = irqd_to_hwirq(d) % SIFIVE_GPIO_MAX; u32 bit = BIT(offset); - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - /* Clear all pending interrupts */ - regmap_write(chip->regs, SIFIVE_GPIO_RISE_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_FALL_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_HIGH_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_LOW_IP, bit); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &chip->gen_gc) { + /* Clear all pending interrupts */ + regmap_write(chip->regs, SIFIVE_GPIO_RISE_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_FALL_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_HIGH_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_LOW_IP, bit); + } irq_chip_eoi_parent(d); } @@ -179,6 +177,7 @@ static const struct regmap_config sifive_gpio_regmap_config = { static int sifive_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct irq_domain *parent; struct gpio_irq_chip *girq; @@ -217,13 +216,17 @@ static int sifive_gpio_probe(struct platform_device *pdev) */ parent = irq_get_irq_data(chip->irq_number[0])->domain; - ret = bgpio_init(&chip->gc, dev, 4, - chip->base + SIFIVE_GPIO_INPUT_VAL, - chip->base + SIFIVE_GPIO_OUTPUT_VAL, - NULL, - chip->base + SIFIVE_GPIO_OUTPUT_EN, - chip->base + SIFIVE_GPIO_INPUT_EN, - BGPIOF_READ_OUTPUT_REG_SET); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = chip->base + SIFIVE_GPIO_INPUT_VAL, + .set = chip->base + SIFIVE_GPIO_OUTPUT_VAL, + .dirout = chip->base + SIFIVE_GPIO_OUTPUT_EN, + .dirin = chip->base + SIFIVE_GPIO_INPUT_EN, + .flags = BGPIOF_READ_OUTPUT_REG_SET, + }; + + ret = gpio_generic_chip_init(&chip->gen_gc, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; @@ -236,12 +239,12 @@ static int sifive_gpio_probe(struct platform_device 
*pdev) regmap_write(chip->regs, SIFIVE_GPIO_LOW_IE, 0); chip->irq_state = 0; - chip->gc.base = -1; - chip->gc.ngpio = ngpio; - chip->gc.label = dev_name(dev); - chip->gc.parent = dev; - chip->gc.owner = THIS_MODULE; - girq = &chip->gc.irq; + chip->gen_gc.gc.base = -1; + chip->gen_gc.gc.ngpio = ngpio; + chip->gen_gc.gc.label = dev_name(dev); + chip->gen_gc.gc.parent = dev; + chip->gen_gc.gc.owner = THIS_MODULE; + girq = &chip->gen_gc.gc.irq; gpio_irq_chip_set_chip(girq, &sifive_gpio_irqchip); girq->fwnode = dev_fwnode(dev); girq->parent_domain = parent; @@ -249,7 +252,7 @@ static int sifive_gpio_probe(struct platform_device *pdev) girq->handler = handle_bad_irq; girq->default_type = IRQ_TYPE_NONE; - return gpiochip_add_data(&chip->gc, chip); + return gpiochip_add_data(&chip->gen_gc.gc, chip); } static const struct of_device_id sifive_gpio_match[] = { From 063411108de622a26b36487a711903443b0e864b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:48 +0200 Subject: [PATCH 1484/3206] gpio: spacemit-k1: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Yixun Lan Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-12-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-spacemit-k1.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/gpio/gpio-spacemit-k1.c b/drivers/gpio/gpio-spacemit-k1.c index 3cc75c701ec401..a0af23f732819b 100644 --- a/drivers/gpio/gpio-spacemit-k1.c +++ b/drivers/gpio/gpio-spacemit-k1.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -38,7 +39,7 @@ struct spacemit_gpio; struct spacemit_gpio_bank { - struct gpio_chip gc; + struct gpio_generic_chip chip; struct spacemit_gpio *sg; void __iomem *base; u32 irq_mask; @@ -72,7 +73,7 @@ static irqreturn_t spacemit_gpio_irq_handler(int irq, void *dev_id) return IRQ_NONE; for_each_set_bit(n, &pending, BITS_PER_LONG) - handle_nested_irq(irq_find_mapping(gb->gc.irq.domain, n)); + handle_nested_irq(irq_find_mapping(gb->chip.gc.irq.domain, n)); return IRQ_HANDLED; } @@ -143,7 +144,7 @@ static void spacemit_gpio_irq_print_chip(struct irq_data *data, struct seq_file { struct spacemit_gpio_bank *gb = irq_data_get_irq_chip_data(data); - seq_printf(p, "%s-%d", dev_name(gb->gc.parent), spacemit_gpio_bank_index(gb)); + seq_printf(p, "%s-%d", dev_name(gb->chip.gc.parent), spacemit_gpio_bank_index(gb)); } static struct irq_chip spacemit_gpio_chip = { @@ -165,7 +166,7 @@ static bool spacemit_of_node_instance_match(struct gpio_chip *gc, unsigned int i if (i >= SPACEMIT_NR_BANKS) return false; - return (gc == &sg->sgb[i].gc); + return (gc == &sg->sgb[i].chip.gc); } static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, @@ -173,7 +174,8 @@ static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, int index, int irq) { struct spacemit_gpio_bank *gb = &sg->sgb[index]; - struct gpio_chip *gc = &gb->gc; + struct gpio_generic_chip_config config; + struct gpio_chip *gc = &gb->chip.gc; struct device *dev = sg->dev; struct gpio_irq_chip *girq; void __iomem *dat, *set, *clr, *dirin, *dirout; @@ -187,9 +189,19 @@ static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, dirin = gb->base + SPACEMIT_GCDR; dirout = gb->base + SPACEMIT_GSDR; + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = dat, + .set = set, + .clr = clr, + .dirout = dirout, + .dirin = dirin, + 
.flags = BGPIOF_UNREADABLE_REG_SET | BGPIOF_UNREADABLE_REG_DIR, + }; + /* This registers 32 GPIO lines per bank */ - ret = bgpio_init(gc, dev, 4, dat, set, clr, dirout, dirin, - BGPIOF_UNREADABLE_REG_SET | BGPIOF_UNREADABLE_REG_DIR); + ret = gpio_generic_chip_init(&gb->chip, &config); if (ret) return dev_err_probe(dev, ret, "failed to init gpio chip\n"); @@ -221,7 +233,7 @@ static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, ret = devm_request_threaded_irq(dev, irq, NULL, spacemit_gpio_irq_handler, IRQF_ONESHOT | IRQF_SHARED, - gb->gc.label, gb); + gb->chip.gc.label, gb); if (ret < 0) return dev_err_probe(dev, ret, "failed to register IRQ\n"); From ae9a52990b2cd62e0555adad92d8fe9e431d1bac Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:49 +0200 Subject: [PATCH 1485/3206] gpio: sodaville: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-13-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sodaville.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/gpio/gpio-sodaville.c b/drivers/gpio/gpio-sodaville.c index abd13c79ace09d..37c1338377295f 100644 --- a/drivers/gpio/gpio-sodaville.c +++ b/drivers/gpio/gpio-sodaville.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -39,7 +40,7 @@ struct sdv_gpio_chip_data { void __iomem *gpio_pub_base; struct irq_domain *id; struct irq_chip_generic *gc; - struct gpio_chip chip; + struct gpio_generic_chip gen_gc; }; static int sdv_gpio_pub_set_type(struct irq_data *d, unsigned int type) @@ -180,6 +181,7 @@ static int sdv_register_irqsupport(struct sdv_gpio_chip_data *sd, static int sdv_gpio_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) { + struct gpio_generic_chip_config config; struct sdv_gpio_chip_data *sd; int ret; u32 mux_val; @@ -206,15 +208,21 @@ static int sdv_gpio_probe(struct pci_dev *pdev, if (!ret) writel(mux_val, sd->gpio_pub_base + GPMUXCTL); - ret = bgpio_init(&sd->chip, &pdev->dev, 4, - sd->gpio_pub_base + GPINR, sd->gpio_pub_base + GPOUTR, - NULL, sd->gpio_pub_base + GPOER, NULL, 0); + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 4, + .dat = sd->gpio_pub_base + GPINR, + .set = sd->gpio_pub_base + GPOUTR, + .dirout = sd->gpio_pub_base + GPOER, + }; + + ret = gpio_generic_chip_init(&sd->gen_gc, &config); if (ret) return ret; - sd->chip.ngpio = SDV_NUM_PUB_GPIOS; + sd->gen_gc.gc.ngpio = SDV_NUM_PUB_GPIOS; - ret = devm_gpiochip_add_data(&pdev->dev, &sd->chip, sd); + ret = devm_gpiochip_add_data(&pdev->dev, &sd->gen_gc.gc, sd); if (ret < 0) { dev_err(&pdev->dev, "gpiochip_add() failed.\n"); return ret; From e43e94fa19cf058c4e465fcdbc2f521123058ea6 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:50 +0200 Subject: [PATCH 1486/3206] gpio: mmio: use new generic GPIO chip API Convert the driver to using the new generic GPIO chip interfaces from linux/gpio/generic.h. 
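Several conversions in this series (hlwd, ath79, mt7621, brcmstb, sifive)
also replace open-coded raw_spin_lock_irqsave()/unlock pairs with the generic
chip's lock guards, which release the lock automatically. A minimal sketch of
the two forms, assuming an illustrative example_gpio driver with
example_read()/example_update_bits() register helpers:

    /* The lock is held until the end of the function scope. */
    static void example_irq_mask(struct example_gpio *priv, u32 mask)
    {
            guard(gpio_generic_lock_irqsave)(&priv->chip);

            example_update_bits(priv, EXAMPLE_REG_INT_MASK, mask, 0);
    }

    /* Explicitly delimited critical section. */
    static void example_irq_handler(struct example_gpio *priv)
    {
            unsigned long pending;

            scoped_guard(gpio_generic_lock_irqsave, &priv->chip)
                    pending = example_read(priv, EXAMPLE_REG_INT_PENDING);

            /* Dispatch "pending" with the lock already released. */
    }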
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-14-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mmio.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index 79e1be149c9484..b4f0ab0daaeb11 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -57,6 +57,7 @@ o ` ~~~~\___/~~~~ ` controller in FPGA is ,.` #include #include +#include #include "gpiolib.h" @@ -737,6 +738,8 @@ MODULE_DEVICE_TABLE(of, bgpio_of_match); static int bgpio_pdev_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; + struct gpio_generic_chip *gen_gc; struct device *dev = &pdev->dev; struct resource *r; void __iomem *dat; @@ -748,7 +751,6 @@ static int bgpio_pdev_probe(struct platform_device *pdev) unsigned long flags = 0; unsigned int base; int err; - struct gpio_chip *gc; const char *label; r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dat"); @@ -777,8 +779,8 @@ static int bgpio_pdev_probe(struct platform_device *pdev) if (IS_ERR(dirin)) return PTR_ERR(dirin); - gc = devm_kzalloc(&pdev->dev, sizeof(*gc), GFP_KERNEL); - if (!gc) + gen_gc = devm_kzalloc(&pdev->dev, sizeof(*gen_gc), GFP_KERNEL); + if (!gen_gc) return -ENOMEM; if (device_is_big_endian(dev)) @@ -787,13 +789,24 @@ static int bgpio_pdev_probe(struct platform_device *pdev) if (device_property_read_bool(dev, "no-output")) flags |= BGPIOF_NO_OUTPUT; - err = bgpio_init(gc, dev, sz, dat, set, clr, dirout, dirin, flags); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = sz, + .dat = dat, + .set = set, + .clr = clr, + .dirout = dirout, + .dirin = dirin, + .flags = flags, + }; + + err = gpio_generic_chip_init(gen_gc, &config); if (err) return err; err = device_property_read_string(dev, "label", &label); if (!err) - gc->label = label; + gen_gc->gc.label = label; /* * This property *must not* be used in device-tree sources, it's only @@ -801,11 +814,11 @@ static int bgpio_pdev_probe(struct platform_device *pdev) */ err = device_property_read_u32(dev, "gpio-mmio,base", &base); if (!err && base <= INT_MAX) - gc->base = base; + gen_gc->gc.base = base; - platform_set_drvdata(pdev, gc); + platform_set_drvdata(pdev, &gen_gc->gc); - return devm_gpiochip_add_data(&pdev->dev, gc, NULL); + return devm_gpiochip_add_data(&pdev->dev, &gen_gc->gc, NULL); } static const struct platform_device_id bgpio_id_table[] = { From 9b90afa6d613b66ec4e74ae75f9bfa5baf386ecd Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:51 +0200 Subject: [PATCH 1487/3206] gpio: move gpio-mmio-specific fields out of struct gpio_chip With all users of bgpio_init() converted to using the modernized generic GPIO chip API, we can now move the gpio-mmio-specific fields out of struct gpio_chip and into the dedicated struct gpio_generic_chip. To that end: adjust the gpio-mmio driver to the new layout, update the docs, etc. The changes in gpio-mlxbf2.c and gpio-mpc8xxx.c are here and not in their respective conversion commits because the former passes the address of the generic chip's lock to the __releases() annotation and we cannot really hide it while gpio-mpc8xxx.c accesses the shadow registers in a driver-specific workaround and there's no reason to make them available in a public API. Also: drop the relevant task from TODO as it's now done. 
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-15-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/TODO | 5 - drivers/gpio/gpio-mlxbf2.c | 2 +- drivers/gpio/gpio-mmio.c | 321 ++++++++++++++++++----------------- drivers/gpio/gpio-mpc8xxx.c | 5 +- include/linux/gpio/driver.h | 44 ----- include/linux/gpio/generic.h | 67 +++++--- 6 files changed, 211 insertions(+), 233 deletions(-) diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index b797499e627ee9..8ed74e05903a97 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -131,11 +131,6 @@ Work items: helpers (x86 inb()/outb()) and convert port-mapped I/O drivers to use this with dry-coding and sending to maintainers to test -- Move the MMIO GPIO specific fields out of struct gpio_chip into a - dedicated structure. Currently every GPIO chip has them if gpio-mmio is - enabled in Kconfig even if it itself doesn't register with the helper - library. - ------------------------------------------------------------------------------- Generic regmap GPIO diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c index 7e3b526a6caae0..abffce3894fc44 100644 --- a/drivers/gpio/gpio-mlxbf2.c +++ b/drivers/gpio/gpio-mlxbf2.c @@ -156,7 +156,7 @@ static int mlxbf2_gpio_lock_acquire(struct mlxbf2_gpio_context *gs) * Release the YU arm_gpio_lock after changing the direction mode. */ static void mlxbf2_gpio_lock_release(struct mlxbf2_gpio_context *gs) - __releases(&gs->chip.gc.bgpio_lock) + __releases(&gs->chip.lock) __releases(yu_arm_gpio_lock_param.lock) { writel(YU_ARM_GPIO_LOCK_RELEASE, yu_arm_gpio_lock_param.io); diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index b4f0ab0daaeb11..a3df14d672a92a 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -125,20 +125,23 @@ static unsigned long bgpio_read32be(void __iomem *reg) static unsigned long bgpio_line2mask(struct gpio_chip *gc, unsigned int line) { - if (gc->be_bits) - return BIT(gc->bgpio_bits - 1 - line); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (chip->be_bits) + return BIT(chip->bits - 1 - line); return BIT(line); } static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long pinmask = bgpio_line2mask(gc, gpio); - bool dir = !!(gc->bgpio_dir & pinmask); + bool dir = !!(chip->sdir & pinmask); if (dir) - return !!(gc->read_reg(gc->reg_set) & pinmask); - else - return !!(gc->read_reg(gc->reg_dat) & pinmask); + return !!(chip->read_reg(chip->reg_set) & pinmask); + + return !!(chip->read_reg(chip->reg_dat) & pinmask); } /* @@ -148,26 +151,28 @@ static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio) static int bgpio_get_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - unsigned long get_mask = 0; - unsigned long set_mask = 0; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long get_mask = 0, set_mask = 0; /* Make sure we first clear any bits that are zero when we read the register */ *bits &= ~*mask; - set_mask = *mask & gc->bgpio_dir; - get_mask = *mask & ~gc->bgpio_dir; + set_mask = *mask & chip->sdir; + get_mask = *mask & ~chip->sdir; if (set_mask) - *bits |= gc->read_reg(gc->reg_set) & set_mask; + *bits |= chip->read_reg(chip->reg_set) & set_mask; if (get_mask) - *bits |= gc->read_reg(gc->reg_dat) & get_mask; + *bits |= chip->read_reg(chip->reg_dat) & get_mask; return 0; } static int bgpio_get(struct 
gpio_chip *gc, unsigned int gpio) { - return !!(gc->read_reg(gc->reg_dat) & bgpio_line2mask(gc, gpio)); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + return !!(chip->read_reg(chip->reg_dat) & bgpio_line2mask(gc, gpio)); } /* @@ -176,9 +181,11 @@ static int bgpio_get(struct gpio_chip *gc, unsigned int gpio) static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + /* Make sure we first clear any bits that are zero when we read the register */ *bits &= ~*mask; - *bits |= gc->read_reg(gc->reg_dat) & *mask; + *bits |= chip->read_reg(chip->reg_dat) & *mask; return 0; } @@ -188,6 +195,7 @@ static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long readmask = 0; unsigned long val; int bit; @@ -200,7 +208,7 @@ static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask, readmask |= bgpio_line2mask(gc, bit); /* Read the register */ - val = gc->read_reg(gc->reg_dat) & readmask; + val = chip->read_reg(chip->reg_dat) & readmask; /* * Mirror the result into the "bits" result, this will give line 0 @@ -219,19 +227,20 @@ static int bgpio_set_none(struct gpio_chip *gc, unsigned int gpio, int val) static int bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long mask = bgpio_line2mask(gc, gpio); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); if (val) - gc->bgpio_data |= mask; + chip->sdata |= mask; else - gc->bgpio_data &= ~mask; + chip->sdata &= ~mask; - gc->write_reg(gc->reg_dat, gc->bgpio_data); + chip->write_reg(chip->reg_dat, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return 0; } @@ -239,31 +248,32 @@ static int bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int bgpio_set_with_clear(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long mask = bgpio_line2mask(gc, gpio); if (val) - gc->write_reg(gc->reg_set, mask); + chip->write_reg(chip->reg_set, mask); else - gc->write_reg(gc->reg_clr, mask); + chip->write_reg(chip->reg_clr, mask); return 0; } static int bgpio_set_set(struct gpio_chip *gc, unsigned int gpio, int val) { - unsigned long mask = bgpio_line2mask(gc, gpio); - unsigned long flags; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long mask = bgpio_line2mask(gc, gpio), flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); if (val) - gc->bgpio_data |= mask; + chip->sdata |= mask; else - gc->bgpio_data &= ~mask; + chip->sdata &= ~mask; - gc->write_reg(gc->reg_set, gc->bgpio_data); + chip->write_reg(chip->reg_set, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return 0; } @@ -273,12 +283,13 @@ static void bgpio_multiple_get_masks(struct gpio_chip *gc, unsigned long *set_mask, unsigned long *clear_mask) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); int i; *set_mask = 0; *clear_mask = 0; - for_each_set_bit(i, mask, gc->bgpio_bits) { + for_each_set_bit(i, mask, chip->bits) { if (test_bit(i, bits)) *set_mask |= 
bgpio_line2mask(gc, i); else @@ -291,25 +302,27 @@ static void bgpio_set_multiple_single_reg(struct gpio_chip *gc, unsigned long *bits, void __iomem *reg) { - unsigned long flags; - unsigned long set_mask, clear_mask; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long flags, set_mask, clear_mask; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); bgpio_multiple_get_masks(gc, mask, bits, &set_mask, &clear_mask); - gc->bgpio_data |= set_mask; - gc->bgpio_data &= ~clear_mask; + chip->sdata |= set_mask; + chip->sdata &= ~clear_mask; - gc->write_reg(reg, gc->bgpio_data); + chip->write_reg(reg, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); } static int bgpio_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - bgpio_set_multiple_single_reg(gc, mask, bits, gc->reg_dat); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + bgpio_set_multiple_single_reg(gc, mask, bits, chip->reg_dat); return 0; } @@ -317,7 +330,9 @@ static int bgpio_set_multiple(struct gpio_chip *gc, unsigned long *mask, static int bgpio_set_multiple_set(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - bgpio_set_multiple_single_reg(gc, mask, bits, gc->reg_set); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + bgpio_set_multiple_single_reg(gc, mask, bits, chip->reg_set); return 0; } @@ -326,21 +341,24 @@ static int bgpio_set_multiple_with_clear(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long set_mask, clear_mask; bgpio_multiple_get_masks(gc, mask, bits, &set_mask, &clear_mask); if (set_mask) - gc->write_reg(gc->reg_set, set_mask); + chip->write_reg(chip->reg_set, set_mask); if (clear_mask) - gc->write_reg(gc->reg_clr, clear_mask); + chip->write_reg(chip->reg_clr, clear_mask); return 0; } static int bgpio_dir_return(struct gpio_chip *gc, unsigned int gpio, bool dir_out) { - if (!gc->bgpio_pinctrl) + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (!chip->pinctrl) return 0; if (dir_out) @@ -375,39 +393,42 @@ static int bgpio_simple_dir_out(struct gpio_chip *gc, unsigned int gpio, static int bgpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); - gc->bgpio_dir &= ~bgpio_line2mask(gc, gpio); + chip->sdir &= ~bgpio_line2mask(gc, gpio); - if (gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); - if (gc->reg_dir_out) - gc->write_reg(gc->reg_dir_out, gc->bgpio_dir); + if (chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); + if (chip->reg_dir_out) + chip->write_reg(chip->reg_dir_out, chip->sdir); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return bgpio_dir_return(gc, gpio, false); } static int bgpio_get_dir(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + /* Return 0 if output, 1 if input */ - if (gc->bgpio_dir_unreadable) { - if (gc->bgpio_dir & bgpio_line2mask(gc, gpio)) + if (chip->dir_unreadable) { + if (chip->sdir & bgpio_line2mask(gc, gpio)) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; } - if (gc->reg_dir_out) { - if (gc->read_reg(gc->reg_dir_out) & bgpio_line2mask(gc, gpio)) + 
if (chip->reg_dir_out) { + if (chip->read_reg(chip->reg_dir_out) & bgpio_line2mask(gc, gpio)) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; } - if (gc->reg_dir_in) - if (!(gc->read_reg(gc->reg_dir_in) & bgpio_line2mask(gc, gpio))) + if (chip->reg_dir_in) + if (!(chip->read_reg(chip->reg_dir_in) & bgpio_line2mask(gc, gpio))) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; @@ -415,18 +436,19 @@ static int bgpio_get_dir(struct gpio_chip *gc, unsigned int gpio) static void bgpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); - gc->bgpio_dir |= bgpio_line2mask(gc, gpio); + chip->sdir |= bgpio_line2mask(gc, gpio); - if (gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); - if (gc->reg_dir_out) - gc->write_reg(gc->reg_dir_out, gc->bgpio_dir); + if (chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); + if (chip->reg_dir_out) + chip->write_reg(chip->reg_dir_out, chip->sdir); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); } static int bgpio_dir_out_dir_first(struct gpio_chip *gc, unsigned int gpio, @@ -446,31 +468,30 @@ static int bgpio_dir_out_val_first(struct gpio_chip *gc, unsigned int gpio, } static int bgpio_setup_accessors(struct device *dev, - struct gpio_chip *gc, + struct gpio_generic_chip *chip, bool byte_be) { - - switch (gc->bgpio_bits) { + switch (chip->bits) { case 8: - gc->read_reg = bgpio_read8; - gc->write_reg = bgpio_write8; + chip->read_reg = bgpio_read8; + chip->write_reg = bgpio_write8; break; case 16: if (byte_be) { - gc->read_reg = bgpio_read16be; - gc->write_reg = bgpio_write16be; + chip->read_reg = bgpio_read16be; + chip->write_reg = bgpio_write16be; } else { - gc->read_reg = bgpio_read16; - gc->write_reg = bgpio_write16; + chip->read_reg = bgpio_read16; + chip->write_reg = bgpio_write16; } break; case 32: if (byte_be) { - gc->read_reg = bgpio_read32be; - gc->write_reg = bgpio_write32be; + chip->read_reg = bgpio_read32be; + chip->write_reg = bgpio_write32be; } else { - gc->read_reg = bgpio_read32; - gc->write_reg = bgpio_write32; + chip->read_reg = bgpio_read32; + chip->write_reg = bgpio_write32; } break; #if BITS_PER_LONG >= 64 @@ -480,13 +501,13 @@ static int bgpio_setup_accessors(struct device *dev, "64 bit big endian byte order unsupported\n"); return -EINVAL; } else { - gc->read_reg = bgpio_read64; - gc->write_reg = bgpio_write64; + chip->read_reg = bgpio_read64; + chip->write_reg = bgpio_write64; } break; #endif /* BITS_PER_LONG >= 64 */ default: - dev_err(dev, "unsupported data width %u bits\n", gc->bgpio_bits); + dev_err(dev, "unsupported data width %u bits\n", chip->bits); return -EINVAL; } @@ -515,27 +536,25 @@ static int bgpio_setup_accessors(struct device *dev, * - an input direction register (named "dirin") where a 1 bit indicates * the GPIO is an input. 
*/ -static int bgpio_setup_io(struct gpio_chip *gc, - void __iomem *dat, - void __iomem *set, - void __iomem *clr, - unsigned long flags) +static int bgpio_setup_io(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { + struct gpio_chip *gc = &chip->gc; - gc->reg_dat = dat; - if (!gc->reg_dat) + chip->reg_dat = cfg->dat; + if (!chip->reg_dat) return -EINVAL; - if (set && clr) { - gc->reg_set = set; - gc->reg_clr = clr; + if (cfg->set && cfg->clr) { + chip->reg_set = cfg->set; + chip->reg_clr = cfg->clr; gc->set = bgpio_set_with_clear; gc->set_multiple = bgpio_set_multiple_with_clear; - } else if (set && !clr) { - gc->reg_set = set; + } else if (cfg->set && !cfg->clr) { + chip->reg_set = cfg->set; gc->set = bgpio_set_set; gc->set_multiple = bgpio_set_multiple_set; - } else if (flags & BGPIOF_NO_OUTPUT) { + } else if (cfg->flags & BGPIOF_NO_OUTPUT) { gc->set = bgpio_set_none; gc->set_multiple = NULL; } else { @@ -543,10 +562,10 @@ static int bgpio_setup_io(struct gpio_chip *gc, gc->set_multiple = bgpio_set_multiple; } - if (!(flags & BGPIOF_UNREADABLE_REG_SET) && - (flags & BGPIOF_READ_OUTPUT_REG_SET)) { + if (!(cfg->flags & BGPIOF_UNREADABLE_REG_SET) && + (cfg->flags & BGPIOF_READ_OUTPUT_REG_SET)) { gc->get = bgpio_get_set; - if (!gc->be_bits) + if (!chip->be_bits) gc->get_multiple = bgpio_get_set_multiple; /* * We deliberately avoid assigning the ->get_multiple() call @@ -557,7 +576,7 @@ static int bgpio_setup_io(struct gpio_chip *gc, */ } else { gc->get = bgpio_get; - if (gc->be_bits) + if (chip->be_bits) gc->get_multiple = bgpio_get_multiple_be; else gc->get_multiple = bgpio_get_multiple; @@ -566,27 +585,27 @@ static int bgpio_setup_io(struct gpio_chip *gc, return 0; } -static int bgpio_setup_direction(struct gpio_chip *gc, - void __iomem *dirout, - void __iomem *dirin, - unsigned long flags) +static int bgpio_setup_direction(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { - if (dirout || dirin) { - gc->reg_dir_out = dirout; - gc->reg_dir_in = dirin; - if (flags & BGPIOF_NO_SET_ON_INPUT) + struct gpio_chip *gc = &chip->gc; + + if (cfg->dirout || cfg->dirin) { + chip->reg_dir_out = cfg->dirout; + chip->reg_dir_in = cfg->dirin; + if (cfg->flags & BGPIOF_NO_SET_ON_INPUT) gc->direction_output = bgpio_dir_out_dir_first; else gc->direction_output = bgpio_dir_out_val_first; gc->direction_input = bgpio_dir_in; gc->get_direction = bgpio_get_dir; } else { - if (flags & BGPIOF_NO_OUTPUT) + if (cfg->flags & BGPIOF_NO_OUTPUT) gc->direction_output = bgpio_dir_out_err; else gc->direction_output = bgpio_simple_dir_out; - if (flags & BGPIOF_NO_INPUT) + if (cfg->flags & BGPIOF_NO_INPUT) gc->direction_input = bgpio_dir_in_err; else gc->direction_input = bgpio_simple_dir_in; @@ -595,117 +614,101 @@ static int bgpio_setup_direction(struct gpio_chip *gc, return 0; } -static int bgpio_request(struct gpio_chip *chip, unsigned gpio_pin) +static int bgpio_request(struct gpio_chip *gc, unsigned int gpio_pin) { - if (gpio_pin >= chip->ngpio) + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (gpio_pin >= gc->ngpio) return -EINVAL; - if (chip->bgpio_pinctrl) - return gpiochip_generic_request(chip, gpio_pin); + if (chip->pinctrl) + return gpiochip_generic_request(gc, gpio_pin); return 0; } /** - * bgpio_init() - Initialize generic GPIO accessor functions - * @gc: the GPIO chip to set up - * @dev: the parent device of the new GPIO chip (compulsory) - * @sz: the size (width) of the MMIO registers in bytes, typically 1, 2 or 4 - * @dat: MMIO 
address for the register to READ the value of the GPIO lines, it - * is expected that a 1 in the corresponding bit in this register means the - * line is asserted - * @set: MMIO address for the register to SET the value of the GPIO lines, it is - * expected that we write the line with 1 in this register to drive the GPIO line - * high. - * @clr: MMIO address for the register to CLEAR the value of the GPIO lines, it is - * expected that we write the line with 1 in this register to drive the GPIO line - * low. It is allowed to leave this address as NULL, in that case the SET register - * will be assumed to also clear the GPIO lines, by actively writing the line - * with 0. - * @dirout: MMIO address for the register to set the line as OUTPUT. It is assumed - * that setting a line to 1 in this register will turn that line into an - * output line. Conversely, setting the line to 0 will turn that line into - * an input. - * @dirin: MMIO address for the register to set this line as INPUT. It is assumed - * that setting a line to 1 in this register will turn that line into an - * input line. Conversely, setting the line to 0 will turn that line into - * an output. - * @flags: Different flags that will affect the behaviour of the device, such as - * endianness etc. + * gpio_generic_chip_init() - Initialize a generic GPIO chip. + * @chip: Generic GPIO chip to set up. + * @cfg: Generic GPIO chip configuration. + * + * Returns 0 on success, negative error number on failure. */ -int bgpio_init(struct gpio_chip *gc, struct device *dev, - unsigned long sz, void __iomem *dat, void __iomem *set, - void __iomem *clr, void __iomem *dirout, void __iomem *dirin, - unsigned long flags) +int gpio_generic_chip_init(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { + struct gpio_chip *gc = &chip->gc; + unsigned long flags = cfg->flags; + struct device *dev = cfg->dev; int ret; - if (!is_power_of_2(sz)) + if (!is_power_of_2(cfg->sz)) return -EINVAL; - gc->bgpio_bits = sz * 8; - if (gc->bgpio_bits > BITS_PER_LONG) + chip->bits = cfg->sz * 8; + if (chip->bits > BITS_PER_LONG) return -EINVAL; - raw_spin_lock_init(&gc->bgpio_lock); + raw_spin_lock_init(&chip->lock); gc->parent = dev; gc->label = dev_name(dev); gc->base = -1; gc->request = bgpio_request; - gc->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); + chip->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); ret = gpiochip_get_ngpios(gc, dev); if (ret) - gc->ngpio = gc->bgpio_bits; + gc->ngpio = chip->bits; - ret = bgpio_setup_io(gc, dat, set, clr, flags); + ret = bgpio_setup_io(chip, cfg); if (ret) return ret; - ret = bgpio_setup_accessors(dev, gc, flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); + ret = bgpio_setup_accessors(dev, chip, + flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); if (ret) return ret; - ret = bgpio_setup_direction(gc, dirout, dirin, flags); + ret = bgpio_setup_direction(chip, cfg); if (ret) return ret; if (flags & BGPIOF_PINCTRL_BACKEND) { - gc->bgpio_pinctrl = true; + chip->pinctrl = true; /* Currently this callback is only used for pincontrol */ gc->free = gpiochip_generic_free; } - gc->bgpio_data = gc->read_reg(gc->reg_dat); + chip->sdata = chip->read_reg(chip->reg_dat); if (gc->set == bgpio_set_set && !(flags & BGPIOF_UNREADABLE_REG_SET)) - gc->bgpio_data = gc->read_reg(gc->reg_set); + chip->sdata = chip->read_reg(chip->reg_set); if (flags & BGPIOF_UNREADABLE_REG_DIR) - gc->bgpio_dir_unreadable = true; + chip->dir_unreadable = true; /* * Inspect hardware to find initial direction setting. 
*/ - if ((gc->reg_dir_out || gc->reg_dir_in) && + if ((chip->reg_dir_out || chip->reg_dir_in) && !(flags & BGPIOF_UNREADABLE_REG_DIR)) { - if (gc->reg_dir_out) - gc->bgpio_dir = gc->read_reg(gc->reg_dir_out); - else if (gc->reg_dir_in) - gc->bgpio_dir = ~gc->read_reg(gc->reg_dir_in); + if (chip->reg_dir_out) + chip->sdir = chip->read_reg(chip->reg_dir_out); + else if (chip->reg_dir_in) + chip->sdir = ~chip->read_reg(chip->reg_dir_in); /* * If we have two direction registers, synchronise * input setting to output setting, the library * can not handle a line being input and output at * the same time. */ - if (gc->reg_dir_out && gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); + if (chip->reg_dir_out && chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); } return ret; } -EXPORT_SYMBOL_GPL(bgpio_init); +EXPORT_SYMBOL_GPL(gpio_generic_chip_init); #if IS_ENABLED(CONFIG_GPIO_GENERIC_PLATFORM) diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index dd2cd2cc6e6f29..a2a83afb41bbb9 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -71,7 +71,7 @@ static int mpc8572_gpio_get(struct gpio_chip *gc, unsigned int gpio) mpc8xxx_gc->regs + GPIO_DIR); val = gpio_generic_read_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_DAT) & ~out_mask; - out_shadow = gc->bgpio_data & out_mask; + out_shadow = mpc8xxx_gc->chip.sdata & out_mask; return !!((val | out_shadow) & mpc_pin2mask(gpio)); } @@ -399,7 +399,8 @@ static int mpc8xxx_probe(struct platform_device *pdev) gpio_generic_write_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_IBE, 0xffffffff); /* Also, latch state of GPIOs configured as output by bootloader. */ - gc->bgpio_data = gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->chip.sdata = + gpio_generic_read_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_DAT) & gpio_generic_read_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_DIR); diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 9fcd4a988081f7..9b14fd20f13eee 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -388,28 +388,6 @@ struct gpio_irq_chip { * implies that if the chip supports IRQs, these IRQs need to be threaded * as the chip access may sleep when e.g. reading out the IRQ status * registers. - * @read_reg: reader function for generic GPIO - * @write_reg: writer function for generic GPIO - * @be_bits: if the generic GPIO has big endian bit order (bit 31 is representing - * line 0, bit 30 is line 1 ... bit 0 is line 31) this is set to true by the - * generic GPIO core. It is for internal housekeeping only. - * @reg_dat: data (in) register for generic GPIO - * @reg_set: output set register (out=high) for generic GPIO - * @reg_clr: output clear register (out=low) for generic GPIO - * @reg_dir_out: direction out setting register for generic GPIO - * @reg_dir_in: direction in setting register for generic GPIO - * @bgpio_dir_unreadable: indicates that the direction register(s) cannot - * be read and we need to rely on out internal state tracking. - * @bgpio_pinctrl: the generic GPIO uses a pin control backend. - * @bgpio_bits: number of register bits used for a generic GPIO i.e. - * * 8 - * @bgpio_lock: used to lock chip->bgpio_data. Also, this is needed to keep - * shadowed and real data registers writes together. - * @bgpio_data: shadowed data register for generic GPIO to clear/set bits - * safely. - * @bgpio_dir: shadowed direction register for generic GPIO to clear/set - * direction safely. 
A "1" in this word means the line is set as - * output. * * A gpio_chip can help platforms abstract various sources of GPIOs so * they can all be accessed through a common programming interface. @@ -475,23 +453,6 @@ struct gpio_chip { const char *const *names; bool can_sleep; -#if IS_ENABLED(CONFIG_GPIO_GENERIC) - unsigned long (*read_reg)(void __iomem *reg); - void (*write_reg)(void __iomem *reg, unsigned long data); - bool be_bits; - void __iomem *reg_dat; - void __iomem *reg_set; - void __iomem *reg_clr; - void __iomem *reg_dir_out; - void __iomem *reg_dir_in; - bool bgpio_dir_unreadable; - bool bgpio_pinctrl; - int bgpio_bits; - raw_spinlock_t bgpio_lock; - unsigned long bgpio_data; - unsigned long bgpio_dir; -#endif /* CONFIG_GPIO_GENERIC */ - #ifdef CONFIG_GPIOLIB_IRQCHIP /* * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib @@ -723,11 +684,6 @@ int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -int bgpio_init(struct gpio_chip *gc, struct device *dev, - unsigned long sz, void __iomem *dat, void __iomem *set, - void __iomem *clr, void __iomem *dirout, void __iomem *dirin, - unsigned long flags); - #define BGPIOF_BIG_ENDIAN BIT(0) #define BGPIOF_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ #define BGPIOF_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index 4c0626b53ec903..162430d96660e9 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -50,9 +50,44 @@ struct gpio_generic_chip_config { * struct gpio_generic_chip - Generic GPIO chip implementation. * @gc: The underlying struct gpio_chip object, implementing low-level GPIO * chip routines. + * @read_reg: reader function for generic GPIO + * @write_reg: writer function for generic GPIO + * @be_bits: if the generic GPIO has big endian bit order (bit 31 is + * representing line 0, bit 30 is line 1 ... bit 0 is line 31) this + * is set to true by the generic GPIO core. It is for internal + * housekeeping only. + * @reg_dat: data (in) register for generic GPIO + * @reg_set: output set register (out=high) for generic GPIO + * @reg_clr: output clear register (out=low) for generic GPIO + * @reg_dir_out: direction out setting register for generic GPIO + * @reg_dir_in: direction in setting register for generic GPIO + * @dir_unreadable: indicates that the direction register(s) cannot be read and + * we need to rely on out internal state tracking. + * @pinctrl: the generic GPIO uses a pin control backend. + * @bits: number of register bits used for a generic GPIO + * i.e. * 8 + * @lock: used to lock chip->sdata. Also, this is needed to keep + * shadowed and real data registers writes together. + * @sdata: shadowed data register for generic GPIO to clear/set bits safely. + * @sdir: shadowed direction register for generic GPIO to clear/set direction + * safely. A "1" in this word means the line is set as output. 
*/ struct gpio_generic_chip { struct gpio_chip gc; + unsigned long (*read_reg)(void __iomem *reg); + void (*write_reg)(void __iomem *reg, unsigned long data); + bool be_bits; + void __iomem *reg_dat; + void __iomem *reg_set; + void __iomem *reg_clr; + void __iomem *reg_dir_out; + void __iomem *reg_dir_in; + bool dir_unreadable; + bool pinctrl; + int bits; + raw_spinlock_t lock; + unsigned long sdata; + unsigned long sdir; }; static inline struct gpio_generic_chip * @@ -61,20 +96,8 @@ to_gpio_generic_chip(struct gpio_chip *gc) return container_of(gc, struct gpio_generic_chip, gc); } -/** - * gpio_generic_chip_init() - Initialize a generic GPIO chip. - * @chip: Generic GPIO chip to set up. - * @cfg: Generic GPIO chip configuration. - * - * Returns 0 on success, negative error number on failure. - */ -static inline int -gpio_generic_chip_init(struct gpio_generic_chip *chip, - const struct gpio_generic_chip_config *cfg) -{ - return bgpio_init(&chip->gc, cfg->dev, cfg->sz, cfg->dat, cfg->set, - cfg->clr, cfg->dirout, cfg->dirin, cfg->flags); -} +int gpio_generic_chip_init(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg); /** * gpio_generic_chip_set() - Set the GPIO line value of the generic GPIO chip. @@ -110,10 +133,10 @@ gpio_generic_chip_set(struct gpio_generic_chip *chip, unsigned int offset, static inline unsigned long gpio_generic_read_reg(struct gpio_generic_chip *chip, void __iomem *reg) { - if (WARN_ON(!chip->gc.read_reg)) + if (WARN_ON(!chip->read_reg)) return 0; - return chip->gc.read_reg(reg); + return chip->read_reg(reg); } /** @@ -125,23 +148,23 @@ gpio_generic_read_reg(struct gpio_generic_chip *chip, void __iomem *reg) static inline void gpio_generic_write_reg(struct gpio_generic_chip *chip, void __iomem *reg, unsigned long val) { - if (WARN_ON(!chip->gc.write_reg)) + if (WARN_ON(!chip->write_reg)) return; - chip->gc.write_reg(reg, val); + chip->write_reg(reg, val); } #define gpio_generic_chip_lock(gen_gc) \ - raw_spin_lock(&(gen_gc)->gc.bgpio_lock) + raw_spin_lock(&(gen_gc)->lock) #define gpio_generic_chip_unlock(gen_gc) \ - raw_spin_unlock(&(gen_gc)->gc.bgpio_lock) + raw_spin_unlock(&(gen_gc)->lock) #define gpio_generic_chip_lock_irqsave(gen_gc, flags) \ - raw_spin_lock_irqsave(&(gen_gc)->gc.bgpio_lock, flags) + raw_spin_lock_irqsave(&(gen_gc)->lock, flags) #define gpio_generic_chip_unlock_irqrestore(gen_gc, flags) \ - raw_spin_unlock_irqrestore(&(gen_gc)->gc.bgpio_lock, flags) + raw_spin_unlock_irqrestore(&(gen_gc)->lock, flags) DEFINE_LOCK_GUARD_1(gpio_generic_lock, struct gpio_generic_chip, From 60c6273131330a57b3eb55233d3706f48e4c4b3e Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 25 Jul 2025 15:09:00 +0200 Subject: [PATCH 1488/3206] gfs2: Remove unused GIF_FREE_VFS_INODE flag Since commit 38552ff676f0 ("gfs2: Fix and clean up create / evict interaction"), the GIF_FREE_VFS_INODE flag isn't used anymore. 
Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/incore.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index d4ad82f47eeea4..e3a8a28acb9982 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -376,7 +376,6 @@ struct gfs2_glock { enum { GIF_QD_LOCKED = 1, GIF_SW_PAGED = 3, - GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, }; From 37b1c0f120b78f804f588ce78cfbd5a2df352a33 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 25 Jul 2025 23:41:37 +0200 Subject: [PATCH 1489/3206] gfs2: Remove unused sd_withdraw_wait field This field was introduced in commit 601ef0d52e96 ("gfs2: Force withdraw to replay journals and wait for it to finish") and never used. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/incore.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e3a8a28acb9982..78a00239e384fb 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -822,7 +822,6 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; wait_queue_head_t sd_log_flush_wait; int sd_log_error; /* First log error */ - wait_queue_head_t sd_withdraw_wait; unsigned int sd_log_tail; unsigned int sd_log_flush_tail; From aa94ad9ab230d08741e6630a20fd1296b52c1009 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 29 Jul 2025 12:36:38 +0100 Subject: [PATCH 1490/3206] gfs2: Remove space before newline There is an extraneous space before a newline in a fs_err message. Remove it Signed-off-by: Colin Ian King Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b6fd1cb17de7ba..09af1fe1250384 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -733,7 +733,7 @@ __acquires(&gl->gl_lockref.lock) */ if (ret) { if (cmpxchg(&sdp->sd_log_error, 0, ret)) { - fs_err(sdp, "Error %d syncing glock \n", ret); + fs_err(sdp, "Error %d syncing glock\n", ret); gfs2_dump_glock(NULL, gl, true); } spin_lock(&gl->gl_lockref.lock); From 2309a01351e56446f43c89e200d643647d47e739 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 8 Jul 2025 19:13:32 +0200 Subject: [PATCH 1491/3206] gfs2: do_xmote cleanup Check for asynchronous completion and clear the GLF_PENDING_REPLY flag earlier in do_xmote(). This will make future changes more readable. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 09af1fe1250384..f1383e9445bec8 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -816,6 +816,12 @@ __acquires(&gl->gl_lockref.lock) ret = ls->ls_ops->lm_lock(gl, target, lck_flags); spin_lock(&gl->gl_lockref.lock); + if (!ret) { + /* The operation will be completed asynchronously. */ + return; + } + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); + if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && target == LM_ST_UNLOCKED && test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { @@ -823,14 +829,10 @@ __acquires(&gl->gl_lockref.lock) * The lockspace has been released and the lock has * been unlocked implicitly. */ - } else if (ret) { + } else { fs_err(sdp, "lm_lock ret %d\n", ret); target = gl->gl_state | LM_OUT_ERROR; - } else { - /* The operation will be completed asynchronously. */ - return; } - clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); } /* Complete the operation now. 
*/ From 4250e683de69a637b93f7c7bda7818b36b1cf32e Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 2 Aug 2025 23:41:24 +0200 Subject: [PATCH 1492/3206] gfs2: Simplify refcounting in do_xmote In do_xmote(), take the additional glock references close to where those references are needed. This will simplify the next commit. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f1383e9445bec8..8a7f947883cdfc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -760,7 +760,6 @@ __acquires(&gl->gl_lockref.lock) spin_lock(&gl->gl_lockref.lock); skip_inval: - gl->gl_lockref.count++; /* * Check for an error encountered since we called go_sync and go_inval. * If so, we can't withdraw from the glock code because the withdraw @@ -803,6 +802,7 @@ __acquires(&gl->gl_lockref.lock) if (!test_bit(GLF_CANCELING, &gl->gl_flags)) clear_bit(GLF_LOCK, &gl->gl_flags); clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + gl->gl_lockref.count++; gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); return; } else { @@ -818,6 +818,7 @@ __acquires(&gl->gl_lockref.lock) if (!ret) { /* The operation will be completed asynchronously. */ + gl->gl_lockref.count++; return; } clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); @@ -837,6 +838,7 @@ __acquires(&gl->gl_lockref.lock) /* Complete the operation now. */ finish_xmote(gl, target); + gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); } From 418c854759341cdf687240bd064de09ce38b92c5 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 8 Jul 2025 22:07:17 +0200 Subject: [PATCH 1493/3206] gfs2: Partially revert "gfs2: do_xmote fixes" When the lm_lock hook which calls dlm_lock() returns an error, do_xmote() previously reported the error to the syslog ("lm_lock ret %d\n") but otherwise ignored it during withdraws. Commit 9947a06d29c0 ("gfs2: do_xmote fixes") changed that to pass the error on to the glock layer, but the error would then only result in an unconditional BUG() in finish_xmote(), which doesn't help. Instead, revert to the previous behavior. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8a7f947883cdfc..5bdb11de5b132b 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -832,7 +832,8 @@ __acquires(&gl->gl_lockref.lock) */ } else { fs_err(sdp, "lm_lock ret %d\n", ret); - target = gl->gl_state | LM_OUT_ERROR; + GLOCK_BUG_ON(gl, !gfs2_withdrawing_or_withdrawn(sdp)); + return; } } From 6e4224082696940d57c9129ed3aeb5e69904a5b2 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 5 Aug 2025 19:51:47 +0200 Subject: [PATCH 1494/3206] gfs2: Turn gfs2_withdraw into a void function Since commit 601ef0d52e96 ("gfs2: Force withdraw to replay journals and wait for it to finish"), gfs2_withdraw() always returns -1, so turn it into a straight void function. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/util.c | 21 ++++++++------------- fs/gfs2/util.h | 2 +- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 24864a66074b2a..020dd21ad51e2b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -309,7 +309,7 @@ void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...) 
va_end(args); } -int gfs2_withdraw(struct gfs2_sbd *sdp) +void gfs2_withdraw(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; const struct lm_lockops *lm = ls->ls_ops; @@ -322,7 +322,7 @@ int gfs2_withdraw(struct gfs2_sbd *sdp) wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG, TASK_UNINTERRUPTIBLE); - return -1; + return; } new = old | BIT(SDF_WITHDRAWN) | BIT(SDF_WITHDRAW_IN_PROG); } while (unlikely(!try_cmpxchg(&sdp->sd_flags, &old, new))); @@ -350,8 +350,6 @@ int gfs2_withdraw(struct gfs2_sbd *sdp) if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname); - - return -1; } /* @@ -481,16 +479,14 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line) { - int me; - gfs2_lm(sdp, "fatal: invalid metadata block - " "bh = %llu (bad magic number), " "function = %s, file = %s, line = %u\n", (unsigned long long)bh->b_blocknr, function, file, line); - me = gfs2_withdraw(sdp); - return (me) ? -1 : -2; + gfs2_withdraw(sdp); + return -1; } /* @@ -503,16 +499,14 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, u16 type, u16 t, const char *function, char *file, unsigned int line) { - int me; - gfs2_lm(sdp, "fatal: invalid metadata block - " "bh = %llu (type: exp=%u, found=%u), " "function = %s, file = %s, line = %u\n", (unsigned long long)bh->b_blocknr, type, t, function, file, line); - me = gfs2_withdraw(sdp); - return (me) ? -1 : -2; + gfs2_withdraw(sdp); + return -1; } /* @@ -528,7 +522,8 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, "fatal: I/O error - " "function = %s, file = %s, line = %u\n", function, file, line); - return gfs2_withdraw(sdp); + gfs2_withdraw(sdp); + return -1; } /* diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 27d03b64102418..2a8a1fac611afe 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -228,6 +228,6 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) __printf(2, 3) void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...); -int gfs2_withdraw(struct gfs2_sbd *sdp); +void gfs2_withdraw(struct gfs2_sbd *sdp); #endif /* __UTIL_DOT_H__ */ From 13c0004168633845fba328edf2cdec0de0292307 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 5 Aug 2025 20:08:35 +0200 Subject: [PATCH 1495/3206] gfs2: Sanitize gfs2_meta_check, gfs2_metatype_check, gfs2_io_error Change those functions to either return a useful value, or nothing at all. 
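To illustrate, a minimal sketch of the calling convention this patch moves to (report_bad_magic() and check_magic() are made-up names for illustration only, not the functions touched here; gfs2_lm() and gfs2_withdraw() are the real helpers):

	/* Reporting helper: log the problem and withdraw; returns nothing. */
	static void report_bad_magic(struct gfs2_sbd *sdp, u64 blkno)
	{
		gfs2_lm(sdp, "fatal: invalid metadata block %llu\n",
			(unsigned long long)blkno);
		gfs2_withdraw(sdp);
	}

	/* Caller: picks a meaningful error code instead of relaying -1/-2. */
	static int check_magic(struct gfs2_sbd *sdp, struct buffer_head *bh)
	{
		struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;

		if (unlikely(be32_to_cpu(mh->mh_magic) != GFS2_MAGIC)) {
			report_bad_magic(sdp, bh->b_blocknr);
			return -EIO;
		}
		return 0;
	}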
Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/util.c | 23 ++++++++--------------- fs/gfs2/util.h | 34 +++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 020dd21ad51e2b..56412f63f3bb9b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -471,13 +471,11 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, /* * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw - * Returns: -1 if this call withdrew the machine, - * -2 if it was already withdrawn */ -int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - const char *function, char *file, - unsigned int line) +void gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, + unsigned int line) { gfs2_lm(sdp, "fatal: invalid metadata block - " @@ -486,18 +484,15 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, (unsigned long long)bh->b_blocknr, function, file, line); gfs2_withdraw(sdp); - return -1; } /* * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw - * Returns: -1 if this call withdrew the machine, - * -2 if it was already withdrawn */ -int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - u16 type, u16 t, const char *function, - char *file, unsigned int line) +void gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, const char *function, + char *file, unsigned int line) { gfs2_lm(sdp, "fatal: invalid metadata block - " @@ -506,7 +501,6 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, (unsigned long long)bh->b_blocknr, type, t, function, file, line); gfs2_withdraw(sdp); - return -1; } /* @@ -515,15 +509,14 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, * 0 if it was already withdrawn */ -int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, - unsigned int line) +void gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, + unsigned int line) { gfs2_lm(sdp, "fatal: I/O error - " "function = %s, file = %s, line = %u\n", function, file, line); gfs2_withdraw(sdp); - return -1; } /* diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 2a8a1fac611afe..da0373b1e82b9e 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -91,9 +91,9 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, gfs2_consist_rgrpd_i((rgd), __func__, __FILE__, __LINE__) -int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - const char *function, - char *file, unsigned int line); +void gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, + char *file, unsigned int line); static inline int gfs2_meta_check(struct gfs2_sbd *sdp, struct buffer_head *bh) @@ -108,10 +108,10 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp, return 0; } -int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - u16 type, u16 t, - const char *function, - char *file, unsigned int line); +void gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, + const char *function, + char *file, unsigned int line); static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, struct buffer_head *bh, @@ -122,12 +122,16 @@ static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; u32 magic = be32_to_cpu(mh->mh_magic); u16 t = be32_to_cpu(mh->mh_type); 
- if (unlikely(magic != GFS2_MAGIC)) - return gfs2_meta_check_ii(sdp, bh, function, - file, line); - if (unlikely(t != type)) - return gfs2_metatype_check_ii(sdp, bh, type, t, function, - file, line); + if (unlikely(magic != GFS2_MAGIC)) { + gfs2_meta_check_ii(sdp, bh, function, + file, line); + return -EIO; + } + if (unlikely(t != type)) { + gfs2_metatype_check_ii(sdp, bh, type, t, function, + file, line); + return -EIO; + } return 0; } @@ -144,8 +148,8 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, } -int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, - char *file, unsigned int line); +void gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, + char *file, unsigned int line); int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, bool verbose); From cd718046646593ced5ad98f9cde22aaf2a2eb8f2 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 5 Aug 2025 01:58:47 +0200 Subject: [PATCH 1496/3206] gfs2: Do not use atomic operations unnecessarily The GLF_DEMOTE_IN_PROGRESS and GLF_LOCK flags and the glock refcount are all protected by the glock spin lock, so there is no need for atomic operations / barriers here. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 5bdb11de5b132b..1ced38b9a5a25d 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -646,8 +646,10 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) } /* Fast path - we got what we asked for */ - if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { + clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_demote_wake(gl); + } if (gl->gl_state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { int rv; @@ -891,14 +893,12 @@ __acquires(&gl->gl_lockref.lock) out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_atomic(); gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); return; out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_atomic(); } /** From fd70ab7155c4b92a9747d42c02791a0793ab9c66 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 7 Aug 2025 10:21:33 +0200 Subject: [PATCH 1497/3206] gfs2: Further sanitize lock_dlm.c The gl_req field and GLF_BLOCKING flag are only relevant to gdlm_lock(), its callback gdlm_ast(), and their helpers, so set and clear them inside lock_dlm.c. Also, the LM_FLAG_ANY flag is relevant to gdlm_lock(), but do_xmote() doesn't pass that flag down to gdlm_lock() as it should. Fix that by passing down all the flags. In addition, document the effect of the LM_FLAG_ANY flag on locks held in EX mode locally. 
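For context, a minimal caller-side sketch of an LM_FLAG_ANY request (illustrative only: gfs2_holder_init(), gfs2_glock_nq() and gfs2_glock_dq_uninit() are the existing glock API, but example_read_any() and its surrounding logic are made up):

	static int example_read_any(struct gfs2_glock *gl)
	{
		struct gfs2_holder gh;
		int error;

		/*
		 * Ask for SHARED, but accept any compatible granted mode;
		 * per the glock.h comment added below, a lock already held
		 * in EX mode locally also satisfies such a request.
		 */
		gfs2_holder_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
		error = gfs2_glock_nq(&gh);
		if (error)
			return error;
		/* ... read under whichever mode was granted ... */
		gfs2_glock_dq_uninit(&gh);
		return 0;
	}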
Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 6 ------ fs/gfs2/glock.h | 4 ++++ fs/gfs2/lock_dlm.c | 26 ++++++++++++++++++-------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1ced38b9a5a25d..5958910154158e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -717,12 +717,6 @@ __acquires(&gl->gl_lockref.lock) return; do_error(gl, 0); /* Fail queued try locks */ } - gl->gl_req = target; - set_bit(GLF_BLOCKING, &gl->gl_flags); - if ((gl->gl_req == LM_ST_UNLOCKED) || - (gl->gl_state == LM_ST_EXCLUSIVE) || - (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) - clear_bit(GLF_BLOCKING, &gl->gl_flags); if (!glops->go_inval && !glops->go_sync) goto skip_inval; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 9339a3bff6eeb1..d041b922b45e3b 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -68,6 +68,10 @@ enum { * also be granted in SHARED. The preferred state is whichever is compatible * with other granted locks, or the specified state if no other locks exist. * + * In addition, when a lock is already held in EX mode locally, a SHARED or + * DEFERRED mode request with the LM_FLAG_ANY flag set will be granted. + * (The LM_FLAG_ANY flag is only used for SHARED mode requests currently.) + * * LM_FLAG_NODE_SCOPE * This holder agrees to share the lock within this node. In other words, * the glock is held in EX mode according to DLM, but local holders on the diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index cee5d199d2d870..5daaeaaaf18dd8 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -58,6 +58,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, /** * gfs2_update_reply_times - Update locking statistics * @gl: The glock to update + * @blocking: The operation may have been blocking * * This assumes that gl->gl_dstamp has been set earlier. * @@ -72,12 +73,12 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, * TRY_1CB flags are set are classified as non-blocking. All * other DLM requests are counted as (potentially) blocking. */ -static inline void gfs2_update_reply_times(struct gfs2_glock *gl) +static inline void gfs2_update_reply_times(struct gfs2_glock *gl, + bool blocking) { struct gfs2_pcpu_lkstats *lks; const unsigned gltype = gl->gl_name.ln_type; - unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ? - GFS2_LKS_SRTTB : GFS2_LKS_SRTT; + unsigned index = blocking ? GFS2_LKS_SRTTB : GFS2_LKS_SRTT; s64 rtt; preempt_disable(); @@ -119,14 +120,18 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl) static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; + bool blocking; unsigned ret; + blocking = test_bit(GLF_BLOCKING, &gl->gl_flags); + gfs2_update_reply_times(gl, blocking); + clear_bit(GLF_BLOCKING, &gl->gl_flags); + /* If the glock is dead, we only react to a dlm_unlock() reply.
*/ if (__lockref_is_dead(&gl->gl_lockref) && gl->gl_lksb.sb_status != -DLM_EUNLOCK) return; - gfs2_update_reply_times(gl); BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr) @@ -241,7 +246,7 @@ static bool down_conversion(int cur, int req) } static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, - const int cur, const int req) + const int req, bool blocking) { u32 lkf = 0; @@ -274,7 +279,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, * "upward" lock conversions or else DLM will reject the * request as invalid. */ - if (!down_conversion(cur, req)) + if (blocking) lkf |= DLM_LKF_QUECVT; } @@ -294,14 +299,20 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, unsigned int flags) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; + bool blocking; int cur, req; u32 lkf; char strname[GDLM_STRNAME_BYTES] = ""; int error; + gl->gl_req = req_state; cur = make_mode(gl->gl_name.ln_sbd, gl->gl_state); req = make_mode(gl->gl_name.ln_sbd, req_state); - lkf = make_flags(gl, flags, cur, req); + blocking = !down_conversion(cur, req) && + !(flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)); + lkf = make_flags(gl, flags, req, blocking); + if (blocking) + set_bit(GLF_BLOCKING, &gl->gl_flags); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); if (test_bit(GLF_INITIAL, &gl->gl_flags)) { @@ -341,7 +352,6 @@ static void gdlm_put_lock(struct gfs2_glock *gl) return; } - clear_bit(GLF_BLOCKING, &gl->gl_flags); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); From 2b813a72880dcc90ab5997e2307f961b5a8650bd Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 7 Aug 2025 10:21:33 +0200 Subject: [PATCH 1498/3206] gfs2: Remove DLM_LKF_ALTCW / DLM_LKF_ALTPR code Commit 6802e3400ff45 ("[GFS2] Clean up the glock core") stopped passing the LM_FLAG_ANY flag down to gdlm_lock() (then gfs2_lm_lock()). Since then, gfs2 effectively hasn't been using dlm's DLM_LKF_ALTCW / DLM_LKF_ALTPR flags, but the code still suggests that it does. Recent testing shows that those flags don't even work reliably anymore, so instead of fixing code that hasn't been used since 2008, remove it. In addition, clean up how the flags are passed to [gd]lm_lock(). Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 4 +--- fs/gfs2/lock_dlm.c | 17 ----------------- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 5958910154158e..6294a9f3631890 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -695,14 +695,12 @@ __acquires(&gl->gl_lockref.lock) const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; - unsigned int lck_flags = (unsigned int)(gh ? 
gh->gh_flags : 0); int ret; if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && gh && !(gh->gh_flags & LM_FLAG_NOEXP)) goto skip_inval; - lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP); GLOCK_BUG_ON(gl, gl->gl_state == target); GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && @@ -809,7 +807,7 @@ __acquires(&gl->gl_lockref.lock) if (ls->ls_ops->lm_lock) { set_bit(GLF_PENDING_REPLY, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); - ret = ls->ls_ops->lm_lock(gl, target, lck_flags); + ret = ls->ls_ops->lm_lock(gl, target, gh ? gh->gh_flags : 0); spin_lock(&gl->gl_lockref.lock); if (!ret) { diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 5daaeaaaf18dd8..427150a206cfe3 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -162,14 +162,6 @@ static void gdlm_ast(void *arg) } ret = gl->gl_req; - if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { - if (gl->gl_req == LM_ST_SHARED) - ret = LM_ST_DEFERRED; - else if (gl->gl_req == LM_ST_DEFERRED) - ret = LM_ST_SHARED; - else - BUG(); - } /* * The GLF_INITIAL flag is initially set for new glocks. Upon the @@ -261,15 +253,6 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, lkf |= DLM_LKF_NOQUEUEBAST; } - if (gfs_flags & LM_FLAG_ANY) { - if (req == DLM_LOCK_PR) - lkf |= DLM_LKF_ALTCW; - else if (req == DLM_LOCK_CW) - lkf |= DLM_LKF_ALTPR; - else - BUG(); - } - if (!test_bit(GLF_INITIAL, &gl->gl_flags)) { lkf |= DLM_LKF_CONVERT; From 0c23e24164d83086e75581b0cf930f4e161636d6 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 8 Aug 2025 22:31:59 +0200 Subject: [PATCH 1499/3206] gfs2: Fix LM_FLAG_TRY* logic in add_to_queue The logic in add_to_queue() for determining whether a LM_FLAG_TRY or LM_FLAG_TRY_1CB holder should be queued does not make any sense: we are interested in whether or not the new operation will block behind an existing or future holder in the queue, but the current code checks for ongoing locking or ->go_inval() operations, which has little to do with that. Replace that code with something more sensible, remove the incorrect add_to_queue() function annotations, remove the similarly misguided do_error(gl, 0) call in do_xmote(), and add a missing comment to the same call in do_promote(). Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6294a9f3631890..e754214fdb1cfc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -502,7 +502,7 @@ static bool do_promote(struct gfs2_glock *gl) */ if (list_is_first(&gh->gh_list, &gl->gl_holders)) return false; - do_error(gl, 0); + do_error(gl, 0); /* Fail queued try locks */ break; } set_bit(HIF_HOLDER, &gh->gh_iflags); @@ -713,7 +713,6 @@ __acquires(&gl->gl_lockref.lock) if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) return; - do_error(gl, 0); /* Fail queued try locks */ } if (!glops->go_inval && !glops->go_sync) goto skip_inval; @@ -1459,6 +1458,24 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
va_end(args); } +static bool gfs2_should_queue_trylock(struct gfs2_glock *gl, + struct gfs2_holder *gh) +{ + struct gfs2_holder *current_gh, *gh2; + + current_gh = find_first_holder(gl); + if (current_gh && !may_grant(gl, current_gh, gh)) + return false; + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + if (!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + return false; + } + return true; +} + static inline bool pid_is_meaningful(const struct gfs2_holder *gh) { if (!(gh->gh_flags & GL_NOPID)) @@ -1477,27 +1494,20 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh) */ static inline void add_to_queue(struct gfs2_holder *gh) -__releases(&gl->gl_lockref.lock) -__acquires(&gl->gl_lockref.lock) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_holder *gh2; - int try_futile = 0; GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) GLOCK_BUG_ON(gl, true); - if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { - if (test_bit(GLF_LOCK, &gl->gl_flags)) { - struct gfs2_holder *current_gh; - - current_gh = find_first_holder(gl); - try_futile = !may_grant(gl, current_gh, gh); - } - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) - goto fail; + if ((gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && + !gfs2_should_queue_trylock(gl, gh)) { + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { @@ -1509,15 +1519,6 @@ __acquires(&gl->gl_lockref.lock) continue; goto trap_recursive; } - list_for_each_entry(gh2, &gl->gl_holders, gh_list) { - if (try_futile && - !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { -fail: - gh->gh_error = GLR_TRYFAILED; - gfs2_holder_wake(gh); - return; - } - } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); From 9b54770b68ae793a3a8d378be4cda2bb7be6c8cc Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 8 Aug 2025 23:18:45 +0200 Subject: [PATCH 1500/3206] gfs2: Remove duplicate check in do_xmote In do_xmote(), remove the duplicate check for the ->go_sync and ->go_inval glock operations. They are either both defined, or none of them are. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e754214fdb1cfc..15a90ab8a979ca 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -714,25 +714,24 @@ __acquires(&gl->gl_lockref.lock) &gl->gl_flags)) return; } - if (!glops->go_inval && !glops->go_sync) + if (!glops->go_inval || !glops->go_sync) goto skip_inval; spin_unlock(&gl->gl_lockref.lock); - if (glops->go_sync) { - ret = glops->go_sync(gl); - /* If we had a problem syncing (due to io errors or whatever, - * we should not invalidate the metadata or tell dlm to - * release the glock to other nodes. - */ - if (ret) { - if (cmpxchg(&sdp->sd_log_error, 0, ret)) { - fs_err(sdp, "Error %d syncing glock\n", ret); - gfs2_dump_glock(NULL, gl, true); - } - spin_lock(&gl->gl_lockref.lock); - goto skip_inval; + ret = glops->go_sync(gl); + /* If we had a problem syncing (due to io errors or whatever, + * we should not invalidate the metadata or tell dlm to + * release the glock to other nodes. 
+ */ + if (ret) { + if (cmpxchg(&sdp->sd_log_error, 0, ret)) { + fs_err(sdp, "Error %d syncing glock\n", ret); + gfs2_dump_glock(NULL, gl, true); } + spin_lock(&gl->gl_lockref.lock); + goto skip_inval; } + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { /* * The call to go_sync should have cleared out the ail list. From 061df28b82af6b22fb5fa529a8f2ef00474ee004 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 2 Aug 2025 23:57:24 +0200 Subject: [PATCH 1501/3206] gfs2: Fix GLF_INVALIDATE_IN_PROGRESS flag clearing in do_xmote Commit 865cc3e9cc0b ("gfs2: fix a deadlock on withdraw-during-mount") added a statement to do_xmote() to clear the GLF_INVALIDATE_IN_PROGRESS flag a second time after it has already been cleared. Fix that. Fixes: 865cc3e9cc0b ("gfs2: fix a deadlock on withdraw-during-mount") Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 15a90ab8a979ca..5361a2641cdbd7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -797,8 +797,6 @@ __acquires(&gl->gl_lockref.lock) gl->gl_lockref.count++; gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); return; - } else { - clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } } From bddb53b776fb7ce81dfba7c24884d9f2c0c68e50 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 8 Aug 2025 23:26:12 +0200 Subject: [PATCH 1502/3206] gfs2: Get rid of GLF_INVALIDATE_IN_PROGRESS Get rid of the GLF_INVALIDATE_IN_PROGRESS flag: it was originally used to indicate to add_to_queue() that the ->go_sync() and ->go_inval() operations were in progress, but as we have established in commit "gfs2: Fix LM_FLAG_TRY* logic in add_to_queue", add_to_queue() has no need to know. Commit d99724c3c36a describes a race in which GLF_INVALIDATE_IN_PROGRESS is used to serialize two processes which are both in do_xmote() at the same time. That analysis is wrong: the serialization happens via the GLF_LOCK flag, which ensures that at most one glock operation can be active at any time. Fixes: d99724c3c36a ("gfs2: Close timing window with GLF_INVALIDATE_IN_PROGRESS") Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 16 +--------------- fs/gfs2/incore.h | 1 - fs/gfs2/trace_gfs2.h | 1 - 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 5361a2641cdbd7..e8b630a58da8f8 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -703,17 +703,6 @@ __acquires(&gl->gl_lockref.lock) GLOCK_BUG_ON(gl, gl->gl_state == target); GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); - if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && - glops->go_inval) { - /* - * If another process is already doing the invalidate, let that - * finish first. The glock state machine will get back to this - * holder again later. - */ - if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, - &gl->gl_flags)) - return; - } if (!glops->go_inval || !glops->go_sync) goto skip_inval; @@ -732,7 +721,7 @@ __acquires(&gl->gl_lockref.lock) goto skip_inval; } - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { + if (target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) { /* * The call to go_sync should have cleared out the ail list. * If there are still items, we have a problem. We ought to @@ -747,7 +736,6 @@ __acquires(&gl->gl_lockref.lock) gfs2_dump_glock(NULL, gl, true); } glops->go_inval(gl, target == LM_ST_DEFERRED ?
0 : DIO_METADATA); - clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } spin_lock(&gl->gl_lockref.lock); @@ -2316,8 +2304,6 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'y'; if (test_bit(GLF_LFLUSH, gflags)) *p++ = 'f'; - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) - *p++ = 'i'; if (test_bit(GLF_PENDING_REPLY, gflags)) *p++ = 'R'; if (test_bit(GLF_HAVE_REPLY, gflags)) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 78a00239e384fb..cd8a3f469291ff 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -319,7 +319,6 @@ enum { GLF_DEMOTE_IN_PROGRESS = 5, GLF_DIRTY = 6, GLF_LFLUSH = 7, - GLF_INVALIDATE_IN_PROGRESS = 8, GLF_HAVE_REPLY = 9, GLF_INITIAL = 10, GLF_HAVE_FROZEN_REPLY = 11, diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 26036ffc3f338e..1c2507a273180b 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -52,7 +52,6 @@ {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \ {(1UL << GLF_DIRTY), "y" }, \ {(1UL << GLF_LFLUSH), "f" }, \ - {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ {(1UL << GLF_PENDING_REPLY), "R" }, \ {(1UL << GLF_HAVE_REPLY), "r" }, \ {(1UL << GLF_INITIAL), "a" }, \ From 2045364497dbb16663df0267d6f733d964f22866 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 11 Aug 2025 22:34:50 +0200 Subject: [PATCH 1503/3206] gfs2: Simplify do_promote While not immediately obvious, do_promote() returns whether or not there are any active holders in the queue. But the function description is confusing, and this information is easy to come by for callers anyway, so turn do_promote() into a void function. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e8b630a58da8f8..38f320e13a36e3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -481,11 +481,9 @@ int gfs2_instantiate(struct gfs2_holder *gh) /** * do_promote - promote as many requests as possible on the current queue * @gl: The glock - * - * Returns true on success (i.e., progress was made or there are no waiters). */ -static bool do_promote(struct gfs2_glock *gl) +static void do_promote(struct gfs2_glock *gl) { struct gfs2_holder *gh, *current_gh; @@ -496,13 +494,10 @@ static bool do_promote(struct gfs2_glock *gl) if (!may_grant(gl, current_gh, gh)) { /* * If we get here, it means we may not grant this - * holder for some reason. If this holder is at the - * head of the list, it means we have a blocked holder - * at the head, so return false. + * holder for some reason. */ - if (list_is_first(&gh->gh_list, &gl->gl_holders)) - return false; - do_error(gl, 0); /* Fail queued try locks */ + if (current_gh) + do_error(gl, 0); /* Fail queued try locks */ break; } set_bit(HIF_HOLDER, &gh->gh_iflags); @@ -511,7 +506,6 @@ static bool do_promote(struct gfs2_glock *gl) if (!current_gh) current_gh = gh; } - return true; } /** @@ -855,7 +849,8 @@ __acquires(&gl->gl_lockref.lock) } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); - if (do_promote(gl)) + do_promote(gl); + if (find_first_holder(gl)) goto out_unlock; gh = find_first_waiter(gl); if (!gh) From cd493dcf4f827700b66d1adb43f20d3838759beb Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 11 Aug 2025 21:35:23 +0200 Subject: [PATCH 1504/3206] gfs2: run_queue cleanup Transform the code in run_queue() to make it more readable. No change in functionality. 
Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 38f320e13a36e3..1eba27bb5f3cf7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -835,8 +835,12 @@ __acquires(&gl->gl_lockref.lock) /* While a demote is in progress, the GLF_LOCK flag must be set. */ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); - if (test_bit(GLF_DEMOTE, &gl->gl_flags) && - gl->gl_demote_state != gl->gl_state) { + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { + if (gl->gl_demote_state == gl->gl_state) { + gfs2_demote_wake(gl); + goto promote; + } + if (find_first_holder(gl)) goto out_unlock; if (nonblock) @@ -846,22 +850,21 @@ __acquires(&gl->gl_lockref.lock) gl->gl_target = gl->gl_demote_state; do_xmote(gl, NULL, gl->gl_target); return; - } else { - if (test_bit(GLF_DEMOTE, &gl->gl_flags)) - gfs2_demote_wake(gl); - do_promote(gl); - if (find_first_holder(gl)) - goto out_unlock; - gh = find_first_waiter(gl); - if (!gh) - goto out_unlock; - gl->gl_target = gh->gh_state; - if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) - do_error(gl, 0); /* Fail queued try locks */ - do_xmote(gl, gh, gl->gl_target); - return; } +promote: + do_promote(gl); + if (find_first_holder(gl)) + goto out_unlock; + gh = find_first_waiter(gl); + if (!gh) + goto out_unlock; + gl->gl_target = gh->gh_state; + if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + do_error(gl, 0); /* Fail queued try locks */ + do_xmote(gl, gh, gl->gl_target); + return; + out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); gl->gl_lockref.count++; From 47faf937da43df9a84e65e5a880cf813e764971e Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 11 Aug 2025 21:38:23 +0200 Subject: [PATCH 1505/3206] gfs2: Minor run_queue fixes Provide a better description of why the GLF_DEMOTE_IN_PROGRESS flag cannot be set. Function do_xmote() may block, so make sure it isn't called when nonblock is true. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/glock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1eba27bb5f3cf7..747ee7ca44e2fe 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -832,7 +832,12 @@ __acquires(&gl->gl_lockref.lock) return; set_bit(GLF_LOCK, &gl->gl_flags); - /* While a demote is in progress, the GLF_LOCK flag must be set. */ + /* + * The GLF_DEMOTE_IN_PROGRESS flag is only set intermittently during + * locking operations. We have just started a locking operation by + * setting the GLF_LOCK flag, so the GLF_DEMOTE_IN_PROGRESS flag must + * be cleared. + */ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { @@ -859,6 +864,8 @@ __acquires(&gl->gl_lockref.lock) gh = find_first_waiter(gl); if (!gh) goto out_unlock; + if (nonblock) + goto out_sched; gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ From 6ab26555c9ffef96c56ca16356e55ac5ab61ec93 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 7 Jul 2025 14:31:41 +0200 Subject: [PATCH 1506/3206] gfs2: Add proper lockspace locking GFS2 has been calling functions like dlm_lock() even after the lockspace that these functions operate on has been released with dlm_release_lockspace(). 
It has always assumed that those functions would return -EINVAL in that case, but that was never guaranteed, and it certainly is no longer the case since commit 4db41bf4f04f ("dlm: remove ls_local_handle from struct dlm_ls"). To fix that, add proper lockspace locking. Fixes: 3e11e5304150 ("GFS2: ignore unlock failures after withdraw") Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/file.c | 23 +++++++++++++++-------- fs/gfs2/glock.c | 5 ++--- fs/gfs2/incore.h | 2 ++ fs/gfs2/lock_dlm.c | 46 +++++++++++++++++++++++++++++++++++++--------- 4 files changed, 56 insertions(+), 20 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 72d95185a39f61..bc67fa058c8459 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1442,6 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int ret; if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; @@ -1450,14 +1451,20 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) locks_lock_file_wait(file, fl); return -EIO; } - if (cmd == F_CANCELLK) - return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (IS_GETLK(cmd)) - return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (lock_is_unlock(fl)) - return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); - else - return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); + down_read(&ls->ls_sem); + ret = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + if (cmd == F_CANCELLK) + ret = dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (IS_GETLK(cmd)) + ret = dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (lock_is_unlock(fl)) + ret = dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + ret = dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); + } + up_read(&ls->ls_sem); + return ret; } static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 747ee7ca44e2fe..b677c0e6b9ab30 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -795,9 +795,8 @@ __acquires(&gl->gl_lockref.lock) } clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); - if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && - target == LM_ST_UNLOCKED && - test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { + if (ret == -ENODEV && gl->gl_target == LM_ST_UNLOCKED && + target == LM_ST_UNLOCKED) { /* * The lockspace has been released and the lock has * been unlocked implicitly. 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index cd8a3f469291ff..5a0ea416cfdae9 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -656,6 +656,8 @@ struct lm_lockstruct { struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ char *ls_lvb_bits; + struct rw_semaphore ls_sem; + spinlock_t ls_recover_spin; /* protects following fields */ unsigned long ls_recover_flags; /* DFL_ */ uint32_t ls_recover_mount; /* gen in first recover_done cb */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 427150a206cfe3..3c7db20ce56429 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -312,8 +312,13 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, */ again: - error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, - GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + } + up_read(&ls->ls_sem); if (error == -EBUSY) { msleep(20); goto again; @@ -362,8 +367,13 @@ static void gdlm_put_lock(struct gfs2_glock *gl) flags |= DLM_LKF_VALBLK; again: - error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags, - NULL, gl); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags, + NULL, gl); + } + up_read(&ls->ls_sem); if (error == -EBUSY) { msleep(20); goto again; @@ -379,7 +389,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl) static void gdlm_cancel(struct gfs2_glock *gl) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; - dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); + + down_read(&ls->ls_sem); + if (likely(ls->ls_dlm != NULL)) { + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); + } + up_read(&ls->ls_sem); } /* @@ -560,7 +575,11 @@ static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; - error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + up_read(&ls->ls_sem); if (error) { fs_err(sdp, "%s lkid %x error %d\n", name, lksb->sb_lkid, error); @@ -587,9 +606,14 @@ static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, memset(strname, 0, GDLM_STRNAME_BYTES); snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); - error = dlm_lock(ls->ls_dlm, mode, lksb, flags, - strname, GDLM_STRNAME_BYTES - 1, - 0, sync_wait_cb, ls, NULL); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + } + up_read(&ls->ls_sem); if (error) { fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", name, lksb->sb_lkid, flags, mode, error); @@ -1316,6 +1340,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) */ INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + ls->ls_dlm = NULL; spin_lock_init(&ls->ls_recover_spin); ls->ls_recover_flags = 0; ls->ls_recover_mount = 0; @@ -1350,6 +1375,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) * create/join lockspace */ + init_rwsem(&ls->ls_sem); error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, &gdlm_lockspace_ops, sdp, 
&ops_result, &ls->ls_dlm); @@ -1429,10 +1455,12 @@ static void gdlm_unmount(struct gfs2_sbd *sdp) /* mounted_lock and control_lock will be purged in dlm recovery */ release: + down_write(&ls->ls_sem); if (ls->ls_dlm) { dlm_release_lockspace(ls->ls_dlm, 2); ls->ls_dlm = NULL; } + up_write(&ls->ls_sem); free_recover_size(ls); } From 28c4d9bc0708956c1a736a9e49fee71b65deee81 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 6 Aug 2025 23:34:03 +0200 Subject: [PATCH 1507/3206] gfs2: Fix unlikely race in gdlm_put_lock In gdlm_put_lock(), there is a small window of time in which the DFL_UNMOUNT flag has been set but the lockspace hasn't been released, yet. In that window, dlm may still call gdlm_ast() and gdlm_bast(). To prevent it from dereferencing freed glock objects, only free the glock if the lockspace has actually been released. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/lock_dlm.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 3c7db20ce56429..ae058b1f9435dc 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -344,12 +344,6 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); - /* don't want to call dlm if we've unmounted the lock protocol */ - if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { - gfs2_glock_free(gl); - return; - } - /* * When the lockspace is released, all remaining glocks will be * unlocked automatically. This is more efficient than unlocking them @@ -379,6 +373,11 @@ static void gdlm_put_lock(struct gfs2_glock *gl) goto again; } + if (error == -ENODEV) { + gfs2_glock_free(gl); + return; + } + if (error) { fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n", gl->gl_name.ln_type, From 41bab90bbfdc55228b8697d960839a4abb5016d4 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 6 Jun 2025 09:55:02 -0700 Subject: [PATCH 1508/3206] x86/its: Move ITS indirect branch thunks to .text..__x86.indirect_thunk The ITS mitigation includes both indirect branch thunks and return thunks. Both are currently placed in .text..__x86.return_thunk, which is appropriate for the latter but not the former. For consistency with other mitigations, move the indirect branch thunks to .text..__x86.indirect_thunk. Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Pawan Gupta --- arch/x86/lib/retpoline.S | 75 +++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index d78d769a02bd39..f513d33b6d3704 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -15,7 +15,6 @@ .section .text..__x86.indirect_thunk - .macro POLINE reg ANNOTATE_INTRA_FUNCTION_CALL call .Ldo_rop_\@ @@ -73,6 +72,7 @@ SYM_CODE_END(__x86_indirect_thunk_array) #undef GEN #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING + .macro CALL_THUNK reg .align RETPOLINE_THUNK_SIZE @@ -126,7 +126,45 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) #define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg) #include #undef GEN -#endif + +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ + +#ifdef CONFIG_MITIGATION_ITS + +.macro ITS_THUNK reg + +/* + * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) + * that complete the fineibt_paranoid caller sequence. 
+ */ +1: .byte 0xea +SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + jne 1b +SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + jmp *%\reg + int3 + .align 32, 0xcc /* fill to the end of the line */ + .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ +.endm + +/* ITS mitigation requires thunks be aligned to upper half of cacheline */ +.align 64, 0xcc +.skip 29, 0xcc + +#define GEN(reg) ITS_THUNK reg +#include +#undef GEN + + .align 64, 0xcc +SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) +SYM_CODE_END(__x86_indirect_its_thunk_array) + +#endif /* CONFIG_MITIGATION_ITS */ #ifdef CONFIG_MITIGATION_RETHUNK @@ -370,39 +408,6 @@ SYM_FUNC_END(call_depth_return_thunk) #ifdef CONFIG_MITIGATION_ITS -.macro ITS_THUNK reg - -/* - * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) - * that complete the fineibt_paranoid caller sequence. - */ -1: .byte 0xea -SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_UNDEFINED - ANNOTATE_NOENDBR - jne 1b -SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_UNDEFINED - ANNOTATE_NOENDBR - ANNOTATE_RETPOLINE_SAFE - jmp *%\reg - int3 - .align 32, 0xcc /* fill to the end of the line */ - .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ -.endm - -/* ITS mitigation requires thunks be aligned to upper half of cacheline */ -.align 64, 0xcc -.skip 29, 0xcc - -#define GEN(reg) ITS_THUNK reg -#include -#undef GEN - - .align 64, 0xcc -SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) -SYM_CODE_END(__x86_indirect_its_thunk_array) - .align 64, 0xcc .skip 32, 0xcc SYM_CODE_START(its_return_thunk) From 70784d2e0453f8c90fadbf7b67c452ce784f1478 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ahelenia=20Ziemia=C5=84ska?= Date: Wed, 4 Jun 2025 14:18:46 +0200 Subject: [PATCH 1509/3206] microblaze: fix typos in Kconfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit optimalize -> optimize, these configs turn the functions on instead of allowing them to be turned on, consistent pluralisation Signed-off-by: Ahelenia Ziemiańska Link: https://lore.kernel.org/r/2pg4pexvl2guyww56tnjrt3hjsb6bqtccmpkzt42sqz3igcq56@tarta.nabijaczleweli.xyz Signed-off-by: Michal Simek --- arch/microblaze/Kconfig.platform | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/microblaze/Kconfig.platform b/arch/microblaze/Kconfig.platform index 7795f90dad8680..9cf9007ed69ace 100644 --- a/arch/microblaze/Kconfig.platform +++ b/arch/microblaze/Kconfig.platform @@ -8,10 +8,10 @@ menu "Platform options" config OPT_LIB_FUNCTION - bool "Optimalized lib function" + bool "Optimized lib function" default y help - Allows turn on optimalized library function (memcpy and memmove). + Turns on optimized library functions (memcpy and memmove). They are optimized by using word alignment. This will work fine if both source and destination are aligned on the same boundary. However, if they are aligned on different boundaries @@ -19,13 +19,13 @@ config OPT_LIB_FUNCTION on MicroBlaze systems without a barrel shifter. 
config OPT_LIB_ASM
- bool "Optimalized lib function ASM"
+ bool "Optimized lib function ASM"
 depends on OPT_LIB_FUNCTION && (XILINX_MICROBLAZE0_USE_BARREL = 1)
 depends on CPU_BIG_ENDIAN
 default n
 help
- Allows turn on optimalized library function (memcpy and memmove).
- Function are written in asm code.
+ Turns on optimized library functions (memcpy and memmove).
+ They are written in assembly.

# Definitions for MICROBLAZE0 comment "Definitions for MICROBLAZE0"

From f0bff4e437960ac45c3078582c00f323b1339f96 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 14 Mar 2025 08:09:48 +0100 Subject: [PATCH 1510/3206] microblaze: Replace __ASSEMBLY__ with __ASSEMBLER__ in uapi headers

__ASSEMBLY__ is only defined by the kernel's Makefile, so it is not really useful for uapi headers (unless the userspace Makefile defines it, too). Let's switch to __ASSEMBLER__, which gets set automatically by the compiler when compiling assembly code.

This is a completely mechanical patch (done with a simple "sed -i" statement).

Cc: Michal Simek Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20250314071013.1575167-18-thuth@redhat.com Signed-off-by: Michal Simek --- arch/microblaze/include/uapi/asm/ptrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/microblaze/include/uapi/asm/ptrace.h b/arch/microblaze/include/uapi/asm/ptrace.h index 46dd94cb78021f..8039957a1a9cd6 100644 --- a/arch/microblaze/include/uapi/asm/ptrace.h +++ b/arch/microblaze/include/uapi/asm/ptrace.h @@ -10,7 +10,7 @@ #ifndef _UAPI_ASM_MICROBLAZE_PTRACE_H #define _UAPI_ASM_MICROBLAZE_PTRACE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef unsigned long microblaze_reg_t; @@ -68,6 +68,6 @@ struct pt_regs { #endif /* __KERNEL */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_MICROBLAZE_PTRACE_H */

From 438f7cd41765dbe8830009a37f90145b7f05ed39 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 14 Mar 2025 08:09:49 +0100 Subject: [PATCH 1511/3206] microblaze: Replace __ASSEMBLY__ with __ASSEMBLER__ in non-uapi headers

While the GCC and Clang compilers already define __ASSEMBLER__ automatically when compiling assembly code, __ASSEMBLY__ is a macro that only gets defined by the Makefiles in the kernel. This can be very confusing when switching between userspace and kernelspace coding, or when dealing with uapi headers that should rather use __ASSEMBLER__ instead. So let's standardize on the __ASSEMBLER__ macro that is now provided by the compilers.

This is a completely mechanical patch (done with a simple "sed -i" statement). 
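For illustration, the conversion for one directory could have looked like the snippet below; the exact invocation is an assumption, since the commit message does not record it:

	# Hypothetical reconstruction of the mechanical conversion; the
	# real command used by the author is not recorded in the message.
	sed -i 's/__ASSEMBLY__/__ASSEMBLER__/g' arch/microblaze/include/asm/*.h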
Cc: Michal Simek Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20250314071013.1575167-19-thuth@redhat.com Signed-off-by: Michal Simek --- arch/microblaze/include/asm/asm-compat.h | 2 +- arch/microblaze/include/asm/current.h | 4 ++-- arch/microblaze/include/asm/entry.h | 4 ++-- arch/microblaze/include/asm/exceptions.h | 4 ++-- arch/microblaze/include/asm/fixmap.h | 4 ++-- arch/microblaze/include/asm/ftrace.h | 2 +- arch/microblaze/include/asm/kgdb.h | 4 ++-- arch/microblaze/include/asm/mmu.h | 4 ++-- arch/microblaze/include/asm/page.h | 8 ++++---- arch/microblaze/include/asm/pgtable.h | 18 +++++++++--------- arch/microblaze/include/asm/processor.h | 8 ++++---- arch/microblaze/include/asm/ptrace.h | 4 ++-- arch/microblaze/include/asm/sections.h | 4 ++-- arch/microblaze/include/asm/setup.h | 4 ++-- arch/microblaze/include/asm/thread_info.h | 4 ++-- arch/microblaze/include/asm/unistd.h | 4 ++-- .../microblaze/include/asm/xilinx_mb_manager.h | 4 ++-- 17 files changed, 43 insertions(+), 43 deletions(-) diff --git a/arch/microblaze/include/asm/asm-compat.h b/arch/microblaze/include/asm/asm-compat.h index c05259ce2d2c2d..9f046147623197 100644 --- a/arch/microblaze/include/asm/asm-compat.h +++ b/arch/microblaze/include/asm/asm-compat.h @@ -4,7 +4,7 @@ #include -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define stringify_in_c(...) __VA_ARGS__ # define ASM_CONST(x) x #else diff --git a/arch/microblaze/include/asm/current.h b/arch/microblaze/include/asm/current.h index a4bb45be30e69f..099e69f32bf97a 100644 --- a/arch/microblaze/include/asm/current.h +++ b/arch/microblaze/include/asm/current.h @@ -14,13 +14,13 @@ * but check asm/microblaze/kernel/entry.S to be sure. */ #define CURRENT_TASK r31 -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ /* * Dedicate r31 to keeping the current task pointer */ register struct task_struct *current asm("r31"); # define get_current() current -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_CURRENT_H */ diff --git a/arch/microblaze/include/asm/entry.h b/arch/microblaze/include/asm/entry.h index 6c42bed4116628..9efadf12397ca8 100644 --- a/arch/microblaze/include/asm/entry.h +++ b/arch/microblaze/include/asm/entry.h @@ -21,7 +21,7 @@ #define PER_CPU(var) var -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ DECLARE_PER_CPU(unsigned int, KSP); /* Saved kernel stack pointer */ DECLARE_PER_CPU(unsigned int, KM); /* Kernel/user mode */ DECLARE_PER_CPU(unsigned int, ENTRY_SP); /* Saved SP on kernel entry */ @@ -29,6 +29,6 @@ DECLARE_PER_CPU(unsigned int, R11_SAVE); /* Temp variable for entry */ DECLARE_PER_CPU(unsigned int, CURRENT_SAVE); /* Saved current pointer */ extern asmlinkage void do_notify_resume(struct pt_regs *regs, int in_syscall); -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_ENTRY_H */ diff --git a/arch/microblaze/include/asm/exceptions.h b/arch/microblaze/include/asm/exceptions.h index 967f175173e141..c4591e4f7175bb 100644 --- a/arch/microblaze/include/asm/exceptions.h +++ b/arch/microblaze/include/asm/exceptions.h @@ -11,7 +11,7 @@ #define _ASM_MICROBLAZE_EXCEPTIONS_H #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Macros to enable and disable HW exceptions in the MSR */ /* Define MSR enable bit for HW exceptions */ @@ -64,6 +64,6 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig); void die(const char *str, struct pt_regs *fp, long err); void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr); -#endif /*__ASSEMBLY__ 
*/ +#endif /*__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_MICROBLAZE_EXCEPTIONS_H */ diff --git a/arch/microblaze/include/asm/fixmap.h b/arch/microblaze/include/asm/fixmap.h index e6e9288bff7613..f9797849e4d436 100644 --- a/arch/microblaze/include/asm/fixmap.h +++ b/arch/microblaze/include/asm/fixmap.h @@ -15,7 +15,7 @@ #ifndef _ASM_FIXMAP_H #define _ASM_FIXMAP_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #ifdef CONFIG_HIGHMEM @@ -62,5 +62,5 @@ extern void __set_fixmap(enum fixed_addresses idx, #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/microblaze/include/asm/ftrace.h b/arch/microblaze/include/asm/ftrace.h index 4ca38b92a3a209..27c1bafb669c36 100644 --- a/arch/microblaze/include/asm/ftrace.h +++ b/arch/microblaze/include/asm/ftrace.h @@ -7,7 +7,7 @@ #define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 8 /* sizeof mcount call */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void _mcount(void); extern void ftrace_call_graph(void); void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr); diff --git a/arch/microblaze/include/asm/kgdb.h b/arch/microblaze/include/asm/kgdb.h index 8dc5ebb07fd5a6..321c3c8bfcf27f 100644 --- a/arch/microblaze/include/asm/kgdb.h +++ b/arch/microblaze/include/asm/kgdb.h @@ -3,7 +3,7 @@ #ifndef __MICROBLAZE_KGDB_H__ #define __MICROBLAZE_KGDB_H__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define CACHE_FLUSH_IS_SAFE 1 #define BUFMAX 2048 @@ -27,6 +27,6 @@ static inline void arch_kgdb_breakpoint(void) struct pt_regs; asmlinkage void microblaze_kgdb_break(struct pt_regs *regs); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __MICROBLAZE_KGDB_H__ */ #endif /* __KERNEL__ */ diff --git a/arch/microblaze/include/asm/mmu.h b/arch/microblaze/include/asm/mmu.h index b928a87c00766a..7262dc4da3385e 100644 --- a/arch/microblaze/include/asm/mmu.h +++ b/arch/microblaze/include/asm/mmu.h @@ -9,7 +9,7 @@ #define _ASM_MICROBLAZE_MMU_H # ifdef __KERNEL__ -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ /* Default "unsigned long" context */ typedef unsigned long mm_context_t; @@ -56,7 +56,7 @@ extern void _tlbia(void); /* invalidate all TLB entries */ * mapping has to increase tlb_skip size. */ extern u32 tlb_skip; -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ /* * The MicroBlaze processor has a TLB architecture identical to PPC-40x. The diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index 90fc9c81debda7..90ac9f34b4b492 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -25,7 +25,7 @@ #define PTE_SHIFT (PAGE_SHIFT - 2) /* 1024 ptes per page */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * PAGE_OFFSET -- the first address of the first page of memory. With MMU @@ -100,7 +100,7 @@ extern int page_is_ram(unsigned long pfn); # define page_to_virt(page) __va(page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ /* Convert between virtual and physical address for MMU. */ /* Handle MicroBlaze processor with virtual memory. 
*/ @@ -113,7 +113,7 @@ extern int page_is_ram(unsigned long pfn); #define tovirt(rd, rs) \ addik rd, rs, (CONFIG_KERNEL_START - CONFIG_KERNEL_BASE_ADDR) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ # define __pa(x) __virt_to_phys((unsigned long)(x)) # define __va(x) ((void *)__phys_to_virt((unsigned long)(x))) @@ -130,7 +130,7 @@ static inline const void *pfn_to_virt(unsigned long pfn) #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define TOPHYS(addr) __virt_to_phys(addr) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index bae1abfa6f6b86..a60e8d89510267 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -10,14 +10,14 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern int mem_init_done; #endif #include #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -39,7 +39,7 @@ extern pte_t *va_to_pte(unsigned long address); #define VMALLOC_START (CONFIG_KERNEL_START + CONFIG_LOWMEM_SIZE) #define VMALLOC_END ioremap_bot -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Macro to mark a page protection value as "uncacheable". @@ -208,7 +208,7 @@ extern pte_t *va_to_pte(unsigned long address); * Also, write permissions imply read permissions. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -216,7 +216,7 @@ extern pte_t *va_to_pte(unsigned long address); extern unsigned long empty_zero_page[1024]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define pte_none(pte) ((pte_val(pte) & ~_PTE_NONE_MASK) == 0) #define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT) @@ -237,7 +237,7 @@ extern unsigned long empty_zero_page[1024]; #define pfn_pte(pfn, prot) \ __pte(((pte_basic_t)(pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * The following only work if pte_present() is true. * Undefined behaviour if not.. @@ -436,13 +436,13 @@ extern int mem_init_done; asmlinkage void __init mmu_init(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern unsigned long ioremap_bot, ioremap_base; void setup_memory(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_PGTABLE_H */ diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 4e193c7550dfa2..d59bdfffca7cc0 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -14,7 +14,7 @@ #include #include -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ /* from kernel/cpu/mb.c */ extern const struct seq_operations cpuinfo_op; @@ -29,7 +29,7 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long usp); extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ /* * This is used to define STACK_TOP, and with MMU it must be below @@ -45,7 +45,7 @@ extern void ret_from_kernel_thread(void); # define THREAD_KSP 0 -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ /* If you change this, you must change the associated assembly-languages * constants defined below, THREAD_*. 
@@ -88,5 +88,5 @@ unsigned long __get_wchan(struct task_struct *p); extern struct dentry *of_debugfs_root; #endif -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_PROCESSOR_H */ diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h index bfcb89df5e26fc..17982292a64fdf 100644 --- a/arch/microblaze/include/asm/ptrace.h +++ b/arch/microblaze/include/asm/ptrace.h @@ -7,7 +7,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define kernel_mode(regs) ((regs)->pt_mode) #define user_mode(regs) (!kernel_mode(regs)) @@ -20,5 +20,5 @@ static inline long regs_return_value(struct pt_regs *regs) return regs->r3; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_PTRACE_H */ diff --git a/arch/microblaze/include/asm/sections.h b/arch/microblaze/include/asm/sections.h index a9311ad84a67fc..f5008f5e7a5c12 100644 --- a/arch/microblaze/include/asm/sections.h +++ b/arch/microblaze/include/asm/sections.h @@ -10,11 +10,11 @@ #include -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ extern char _ssbss[], _esbss[]; extern unsigned long __ivt_start[], __ivt_end[]; extern u32 _fdt_start[], _fdt_end[]; -# endif /* !__ASSEMBLY__ */ +# endif /* !__ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_SECTIONS_H */ diff --git a/arch/microblaze/include/asm/setup.h b/arch/microblaze/include/asm/setup.h index bf2600f7595932..837ed0bbae4b5b 100644 --- a/arch/microblaze/include/asm/setup.h +++ b/arch/microblaze/include/asm/setup.h @@ -9,7 +9,7 @@ #include -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ extern char cmd_line[COMMAND_LINE_SIZE]; extern char *klimit; @@ -25,5 +25,5 @@ void machine_shutdown(void); void machine_halt(void); void machine_power_off(void); -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_SETUP_H */ diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index a0ddd2a36fb94b..0153f7c2717c98 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -13,7 +13,7 @@ #define THREAD_SIZE (1 << THREAD_SHIFT) #define THREAD_SIZE_ORDER 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ # include # include @@ -86,7 +86,7 @@ static inline struct thread_info *current_thread_info(void) } /* thread information allocation */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * thread information flags diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index cfe3f888b432b0..fedda9908aa94e 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -8,7 +8,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* #define __ARCH_WANT_OLD_READDIR */ /* #define __ARCH_WANT_OLD_STAT */ @@ -33,6 +33,6 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_FORK -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_UNISTD_H */ diff --git a/arch/microblaze/include/asm/xilinx_mb_manager.h b/arch/microblaze/include/asm/xilinx_mb_manager.h index 7b6995722b0c0a..121a3224882b2c 100644 --- a/arch/microblaze/include/asm/xilinx_mb_manager.h +++ b/arch/microblaze/include/asm/xilinx_mb_manager.h @@ -5,7 +5,7 @@ #ifndef _XILINX_MB_MANAGER_H #define _XILINX_MB_MANAGER_H -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ #include @@ -21,7 +21,7 @@ void xmb_manager_register(uintptr_t phys_baseaddr, u32 cr_val, void *priv, void (*reset_callback)(void *data)); asmlinkage void xmb_inject_err(void); -# endif /* __ASSEMBLY__ 
*/ +#endif /* __ASSEMBLER__ */ /* Error injection offset */ #define XMB_INJECT_ERR_OFFSET 0x200

From 9dfec4a51df9cf0dcc23cb4ac6fc314bf9e999d0 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Wed, 10 Sep 2025 15:58:47 +0800 Subject: [PATCH 1512/3206] USB: core: remove the move buf action

The buffer size of sysfs is fixed at PAGE_SIZE, and the page offset of the buf parameter of sysfs_emit_at() must be 0, so there is no need to manually manage the buf pointer offset.

Fixes: 711d41ab4a0e ("usb: core: Use sysfs_emit_at() when showing dynamic IDs") Reported-by: syzbot+b6445765657b5855e869@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b6445765657b5855e869 Tested-by: syzbot+b6445765657b5855e869@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Link: https://lore.kernel.org/r/tencent_B32D6D8C9450EBFEEE5ACC2C7B0E6C402D0A@qq.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index c3177034b779eb..f441958b0ef45a 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -119,11 +119,11 @@ ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf) guard(mutex)(&usb_dynids_lock); list_for_each_entry(dynid, &dynids->list, node) if (dynid->id.bInterfaceClass != 0) - count += sysfs_emit_at(&buf[count], count, "%04x %04x %02x\n", + count += sysfs_emit_at(buf, count, "%04x %04x %02x\n", dynid->id.idVendor, dynid->id.idProduct, dynid->id.bInterfaceClass); else - count += sysfs_emit_at(&buf[count], count, "%04x %04x\n", + count += sysfs_emit_at(buf, count, "%04x %04x\n", dynid->id.idVendor, dynid->id.idProduct); return count; }

From 8ab2f1c35669bff7d7ed1bb16bf5cc989b3e2e17 Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Tue, 26 Aug 2025 09:58:08 +0200 Subject: [PATCH 1513/3206] mmc: mvsdio: Fix dma_unmap_sg() nents value

The dma_unmap_sg() function should be called with the same nents as dma_map_sg(), not with the value the map function returned.

Fixes: 236caa7cc351 ("mmc: SDIO driver for Marvell SoCs") Signed-off-by: Thomas Fourier Reviewed-by: Linus Walleij Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/mvsdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index a9e6277789ba69..79df2fa89a3fdd 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -292,7 +292,7 @@ static u32 mvsd_finish_data(struct mvsd_host *host, struct mmc_data *data, host->pio_ptr = NULL; host->pio_size = 0; } else { - dma_unmap_sg(mmc_dev(host->mmc), data->sg, host->sg_frags, + dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, mmc_get_dma_dir(data)); }

From c656c99c118b44bb360b0718fadfbf3754fd44d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Wed, 3 Sep 2025 12:51:39 +0200 Subject: [PATCH 1514/3206] dt-bindings: mmc: sdhci-pxa: Add minItems to pinctrl-names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Some older boards only require a default pinctrl. Add a minItems property so these don't cause dt-validate warnings. 
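For illustration only, a board node that needs just the default pinctrl state then validates cleanly; the node name, address and pin labels below are hypothetical:

	/* Hypothetical board snippet: with minItems: 1, a single "default"
	 * entry in pinctrl-names now passes dt-validate. */
	mmc@d4280000 {
		compatible = "mrvl,pxav3-mmc";
		reg = <0xd4280000 0x800>;
		pinctrl-names = "default";
		pinctrl-0 = <&mmc0_pins>;
	};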
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509030625.PBgLIAwG-lkp@intel.com/ Signed-off-by: Duje Mihanović Acked-by: Rob Herring (Arm) Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml b/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml index fba1cc50fdf07c..186ce8ff4626a1 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml +++ b/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml @@ -44,6 +44,7 @@ allOf: items: - const: default - const: state_cmd_gpio + minItems: 1 pinctrl-1: description: @@ -61,6 +62,7 @@ allOf: items: - const: default - const: state_uhs + minItems: 1 pinctrl-1: description: From 00637d92cb1e00e2f5c68a8504053c2ac41ac125 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 4 Sep 2025 17:35:43 +0200 Subject: [PATCH 1515/3206] mmc: sh_mmcif: Remove dummy PM resume callback Unassigned system sleep callbacks were always treated the same as dummy callbacks that just return zero. Signed-off-by: Geert Uytterhoeven Reviewed-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mmcif.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 413c34585c90d5..bf899c8e38f517 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -1579,12 +1579,7 @@ static int sh_mmcif_suspend(struct device *dev) return 0; } -static int sh_mmcif_resume(struct device *dev) -{ - return 0; -} - -static DEFINE_SIMPLE_DEV_PM_OPS(sh_mmcif_dev_pm_ops, sh_mmcif_suspend, sh_mmcif_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(sh_mmcif_dev_pm_ops, sh_mmcif_suspend, NULL); static struct platform_driver sh_mmcif_driver = { .probe = sh_mmcif_probe, From 8c2467dea393622123f66b38fa4ca695670eb656 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 12 Sep 2025 11:23:52 +0200 Subject: [PATCH 1516/3206] pinctrl: Simplify printks with pOF format Print full device node name with %pOF format, so the code will be a bit simpler. 
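As a small before/after illustration (schematic, mirroring the hunks below):

	/* Before: format the node path by hand */
	dev_err(pctl->dev, "unable to find group for node %s\n",
		of_node_full_name(np));
	/* After: let the %pOF specifier print the device node path */
	dev_err(pctl->dev, "unable to find group for node %pOF\n", np);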
Signed-off-by: Krzysztof Kozlowski Signed-off-by: Linus Walleij --- drivers/pinctrl/sprd/pinctrl-sprd.c | 9 +++------ drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c | 7 +++---- 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/pinctrl/sprd/pinctrl-sprd.c b/drivers/pinctrl/sprd/pinctrl-sprd.c index c4a1d99dfed043..16cf9d15f24707 100644 --- a/drivers/pinctrl/sprd/pinctrl-sprd.c +++ b/drivers/pinctrl/sprd/pinctrl-sprd.c @@ -258,8 +258,7 @@ static int sprd_dt_node_to_map(struct pinctrl_dev *pctldev, grp = sprd_pinctrl_find_group_by_name(pctl, np->name); if (!grp) { - dev_err(pctl->dev, "unable to find group for node %s\n", - of_node_full_name(np)); + dev_err(pctl->dev, "unable to find group for node %pOF\n", np); return -EINVAL; } @@ -276,16 +275,14 @@ static int sprd_dt_node_to_map(struct pinctrl_dev *pctldev, if (ret < 0) { if (ret != -EINVAL) dev_err(pctl->dev, - "%s: could not parse property function\n", - of_node_full_name(np)); + "%pOF: could not parse property function\n", np); function = NULL; } ret = pinconf_generic_parse_dt_config(np, pctldev, &configs, &num_configs); if (ret < 0) { - dev_err(pctl->dev, "%s: could not parse node property\n", - of_node_full_name(np)); + dev_err(pctl->dev, "%pOF: could not parse node property\n", np); return ret; } diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c b/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c index 5f13315ebff340..50a16f3bd13161 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c +++ b/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c @@ -274,8 +274,7 @@ static void fill_pin_function(struct device *dev, struct device_node *node, if (!strcmp(pins[pin].pin.name, name)) break; if (pin == npins) { - dev_warn(dev, "%s: cannot find pin %s\n", - of_node_full_name(node), name); + dev_warn(dev, "%pOF: cannot find pin %s\n", node, name); index++; continue; } @@ -283,8 +282,8 @@ static void fill_pin_function(struct device *dev, struct device_node *node, /* Read the associated mux value. */ muxval = sunxi_pinctrl_dt_read_pinmux(node, index); if (muxval == INVALID_MUX) { - dev_warn(dev, "%s: invalid mux value for pin %s\n", - of_node_full_name(node), name); + dev_warn(dev, "%pOF: invalid mux value for pin %s\n", + node, name); index++; continue; }

From 0b5fe1c4ab3c2960b8b978122eba810f37f87aee Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Mon, 8 Sep 2025 11:34:54 +0300 Subject: [PATCH 1517/3206] pmdomain: ti-sci: Set PD on/off state according to the HW state

At the moment the driver sets the power state of all the PDs it creates to off, regardless of the actual HW state. This has two drawbacks:

1) The kernel cannot disable unused PDs automatically for power saving, as it thinks they are off already

2) A more specific case (but perhaps applicable to other scenarios also): a bootloader-enabled splash screen cannot be kept on the screen.

The issue in 2) is that the driver framework automatically enables the device's PD before calling probe() and disables it after probe(). This means that when the display subsystem (DSS) driver probes, but e.g. fails due to deferred probing, the DSS PD gets turned off and the driver cannot do anything to affect that.

Solving 2) requires more changes to actually keep the PD on during boot, but a prerequisite for it is to have the correct power state for the PD.

The downside of this patch is that it takes time to call the 'is_on' op, and we need to call it for each PD. In my tests with AM62 SK, using defconfig, I see an increase from ~3.5ms to ~7ms. 
However, the added feature is valuable, so in my opinion it's worth it. The performance could probably be improved with a new firmware API which returns the power states of all the PDs.

There's also a related HW issue at play here: if the DSS IP is enabled and active, and its PD is turned off without first disabling the DSS display outputs, the DSS IP will hang and cause the kernel to halt if and when the DSS driver accesses the DSS registers the next time.

With the current upstream kernel, with this patch applied, this means that if the bootloader enables the display, and the DSS driver is compiled as a module, the kernel will at some point disable unused PDs, including the DSS PD. When the DSS module is later loaded, it will hang the kernel. The same issue is already there, even without this patch, as the DSS driver may hit deferred probing, which causes the PD to be turned off, leading to a kernel halt when the DSS driver is probed again. This issue has been made quite rare with some arrangements in the DSS driver's probe, but it's still there.

With a recent change from Ulf (e.g. commit 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync")), the sync state mechanism comes to the rescue. It will keep the power domains enabled until the drivers have probed, or the sync-state is triggered via some other mechanism (e.g. manually by the boot scripts).

Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: Tomi Valkeinen Signed-off-by: Ulf Hansson --- drivers/pmdomain/ti/ti_sci_pm_domains.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/pmdomain/ti/ti_sci_pm_domains.c b/drivers/pmdomain/ti/ti_sci_pm_domains.c index 82df7e44250bb6..e5d1934f78d9ee 100644 --- a/drivers/pmdomain/ti/ti_sci_pm_domains.c +++ b/drivers/pmdomain/ti/ti_sci_pm_domains.c @@ -200,6 +200,23 @@ static bool ti_sci_pm_idx_exists(struct ti_sci_genpd_provider *pd_provider, u32 return false; } +static bool ti_sci_pm_pd_is_on(struct ti_sci_genpd_provider *pd_provider, + int pd_idx) +{ + bool is_on; + int ret; + + if (!pd_provider->ti_sci->ops.dev_ops.is_on) + return false; + + ret = pd_provider->ti_sci->ops.dev_ops.is_on(pd_provider->ti_sci, + pd_idx, NULL, &is_on); + if (ret) + return false; + + return is_on; +} + static int ti_sci_pm_domain_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -231,6 +248,8 @@ static int ti_sci_pm_domain_probe(struct platform_device *pdev) index, &args)) { if (args.args_count >= 1 && args.np == dev->of_node) { + bool is_on; + of_node_put(args.np); if (args.args[0] > max_id) { max_id = args.args[0]; @@ -264,7 +283,10 @@ static int ti_sci_pm_domain_probe(struct platform_device *pdev) pd_provider->ti_sci->ops.pm_ops.set_latency_constraint) pd->pd.domain.ops.suspend = ti_sci_pd_suspend; - pm_genpd_init(&pd->pd, NULL, true); + is_on = ti_sci_pm_pd_is_on(pd_provider, + pd->idx); + + pm_genpd_init(&pd->pd, NULL, !is_on); list_add(&pd->node, &pd_provider->pd_list); } else {

From ce55f63137919a1d943b0d883a6ac3744108af0f Mon Sep 17 00:00:00 2001 From: Monish Chunara Date: Mon, 8 Sep 2025 13:49:51 +0530 Subject: [PATCH 1518/3206] dt-bindings: mmc: sdhci-msm: Document the Lemans compatible

Add the MSM SDHCI compatible name to support both eMMC and SD card for Lemans, which uses 'sa8775p' as the fallback SoC. Ensure the new compatible string matches existing Lemans-compatible formats without introducing a new naming convention.

The SDHCI controller on Lemans is based on MSM SDHCI v5 IP. 
Hence, document the compatible with "qcom,sdhci-msm-v5" as the fallback.

Signed-off-by: Monish Chunara Acked-by: Krzysztof Kozlowski Signed-off-by: Wasim Nazir Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/sdhci-msm.yaml | 1 + 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml b/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml index 22d1f50c3fd1a0..594bd174ff211e 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml +++ b/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml @@ -48,6 +48,7 @@ properties: - qcom,qcs615-sdhci - qcom,qcs8300-sdhci - qcom,qdu1000-sdhci + - qcom,sa8775p-sdhci - qcom,sar2130p-sdhci - qcom,sc7180-sdhci - qcom,sc7280-sdhci

From 08b68ca543ee9d5a8d2dc406165e4887dd8f170b Mon Sep 17 00:00:00 2001 From: Sarthak Garg Date: Mon, 8 Sep 2025 16:11:19 +0530 Subject: [PATCH 1519/3206] mmc: sdhci-msm: Enable tuning for SDR50 mode for SD card

For Qualcomm SoCs which need a level shifter for the SD card, extra delay is seen on the receiver data path. To compensate for this delay, enable tuning for SDR50 mode on targets which have a level shifter.

The SDHCI_SDR50_NEEDS_TUNING cap will be set for targets with a level shifter on Qualcomm SoCs.

Signed-off-by: Sarthak Garg Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)

diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index d2906bf6e598c3..4e5edbf2fc9b6f 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -81,6 +81,7 @@ #define CORE_IO_PAD_PWR_SWITCH_EN BIT(15) #define CORE_IO_PAD_PWR_SWITCH BIT(16) #define CORE_HC_SELECT_IN_EN BIT(18) +#define CORE_HC_SELECT_IN_SDR50 (4 << 19) #define CORE_HC_SELECT_IN_HS400 (6 << 19) #define CORE_HC_SELECT_IN_MASK (7 << 19) @@ -1133,6 +1134,10 @@ static bool sdhci_msm_is_tuning_needed(struct sdhci_host *host) { struct mmc_ios *ios = &host->mmc->ios; + if (ios->timing == MMC_TIMING_UHS_SDR50 && + host->flags & SDHCI_SDR50_NEEDS_TUNING) + return true; + /* * Tuning is required for SDR104, HS200 and HS400 cards and * if clock frequency is greater than 100MHz in these modes. @@ -1201,6 +1206,8 @@ static int sdhci_msm_execute_tuning(struct mmc_host *mmc, u32 opcode) struct mmc_ios ios = host->mmc->ios; struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); + const struct sdhci_msm_offset *msm_offset = msm_host->offset; + u32 config; if (!sdhci_msm_is_tuning_needed(host)) { msm_host->use_cdr = false; @@ -1217,6 +1224,14 @@ static int sdhci_msm_execute_tuning(struct mmc_host *mmc, u32 opcode) */ msm_host->tuning_done = 0; + if (ios.timing == MMC_TIMING_UHS_SDR50 && + host->flags & SDHCI_SDR50_NEEDS_TUNING) { + config = readl_relaxed(host->ioaddr + msm_offset->core_vendor_spec); + config &= ~CORE_HC_SELECT_IN_MASK; + config |= CORE_HC_SELECT_IN_EN | CORE_HC_SELECT_IN_SDR50; + writel_relaxed(config, host->ioaddr + msm_offset->core_vendor_spec); + } + /* * For HS400 tuning in HS200 timing requires: * - select MCLK/2 in VENDOR_SPEC

From 98682689a1a443067e1981d9eed8c6ddcef8cdf3 Mon Sep 17 00:00:00 2001 From: Sarthak Garg Date: Mon, 8 Sep 2025 16:11:20 +0530 Subject: [PATCH 1520/3206] dt-bindings: mmc: controller: Add max-sd-hs-hz property

Introduce a new optional device tree property max-sd-hs-hz to limit the maximum frequency (in Hz) used for SD cards operating in High-Speed (HS) mode due to board electrical limitations. 
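A minimal, hypothetical usage sketch (the controller label and the 25 MHz cap are made-up values for illustration):

	&sdhc_1 {
		/* This board cannot sustain the default 50 MHz in HS mode */
		max-sd-hs-hz = <25000000>;
	};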
Signed-off-by: Sarthak Garg Reviewed-by: Krzysztof Kozlowski Signed-off-by: Ulf Hansson --- .../devicetree/bindings/mmc/mmc-controller-common.yaml | 8 ++++++++ 1 file changed, 8 insertions(+)

diff --git a/Documentation/devicetree/bindings/mmc/mmc-controller-common.yaml b/Documentation/devicetree/bindings/mmc/mmc-controller-common.yaml index 9a72354397591d..7414d5522dfe8a 100644 --- a/Documentation/devicetree/bindings/mmc/mmc-controller-common.yaml +++ b/Documentation/devicetree/bindings/mmc/mmc-controller-common.yaml @@ -93,6 +93,14 @@ properties: minimum: 400000 maximum: 384000000 + max-sd-hs-hz: + description: | + Maximum frequency (in Hz) to be used for SD cards operating in + High-Speed (HS) mode. + minimum: 400000 + maximum: 50000000 + default: 50000000 + disable-wp: $ref: /schemas/types.yaml#/definitions/flag description:

From f338529ca9279e3bea392cb53cec8bd292909cb1 Mon Sep 17 00:00:00 2001 From: Sarthak Garg Date: Mon, 8 Sep 2025 16:11:21 +0530 Subject: [PATCH 1521/3206] mmc: core: Parse and use the new max-sd-hs-hz DT property

Introduce a new device tree flag to cap the maximum High-Speed (HS) mode frequency for SD cards, accommodating board-specific electrical limitations that cannot support the default 50 MHz HS frequency.

Signed-off-by: Sarthak Garg Signed-off-by: Ulf Hansson --- drivers/mmc/core/host.c | 2 ++ drivers/mmc/core/sd.c | 2 +- include/linux/mmc/host.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 5f0ec23aeff553..88c95dbfd9cfd5 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -302,6 +302,8 @@ int mmc_of_parse(struct mmc_host *host) /* f_max is obtained from the optional "max-frequency" property */ device_property_read_u32(dev, "max-frequency", &host->f_max); + device_property_read_u32(dev, "max-sd-hs-hz", &host->max_sd_hs_hz); + /* * Configure CD and WP pins. They are both by default active low to * match the SDHCI spec. If GPIOs are provided for CD and / or WP, the diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index ec02067f03c5c5..67cd6300482976 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -359,7 +359,7 @@ static int mmc_read_switch(struct mmc_card *card) } if (status[13] & SD_MODE_HIGH_SPEED) - card->sw_caps.hs_max_dtr = HIGH_SPEED_MAX_DTR; + card->sw_caps.hs_max_dtr = card->host->max_sd_hs_hz ?: HIGH_SPEED_MAX_DTR; if (card->scr.sda_spec3) { card->sw_caps.sd3_bus_mode = status[13]; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e0d935a4ac1d5a..e0e2c265e5d101 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -576,6 +576,7 @@ struct mmc_host { int hsq_depth; u32 err_stats[MMC_ERR_MAX]; + u32 max_sd_hs_hz; unsigned long private[] ____cacheline_aligned; };

From d3021e6aa11fecdafa85038a037c04d5bfeda9d5 Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Thu, 11 Sep 2025 20:28:04 +0200 Subject: [PATCH 1522/3206] arm64: dts: marvell: cn913x-solidrun: fix sata ports status

Commit "arm64: dts: marvell: only enable complete sata nodes" changed armada-cp11x.dtsi to disable all sata ports by default. The author missed some dts which relied on the dtsi enabling all ports, and just disabled unused ones instead.

Update the dts for SolidRun cn913x-based boards to enable the available ports, rather than disabling the unavailable one.

Further, according to the dt bindings, the serdes phys are to be specified in the port node, not the controller node. 
Move those phys properties accordingly in clearfog base/pro/solidwan.

Fixes: 30023876aef4 ("arm64: dts: marvell: only enable complete sata nodes") Cc: stable@vger.kernel.org Signed-off-by: Josua Mayer Signed-off-by: Gregory CLEMENT --- arch/arm64/boot/dts/marvell/cn9130-cf.dtsi | 7 ++++--- arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts | 6 ++++-- arch/arm64/boot/dts/marvell/cn9132-clearfog.dts | 6 ++---- 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi b/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi index ad0ab34b66028c..bd42bfbe408bbe 100644 --- a/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi +++ b/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi @@ -152,11 +152,12 @@ /* SRDS #0 - SATA on M.2 connector */ &cp0_sata0 { - phys = <&cp0_comphy0 1>; status = "okay"; - /* only port 1 is available */ - /delete-node/ sata-port@0; + sata-port@1 { + phys = <&cp0_comphy0 1>; + status = "okay"; + }; }; /* microSD */ diff --git a/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts b/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts index 47234d0858dd21..338853d3b179bb 100644 --- a/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts +++ b/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts @@ -563,11 +563,13 @@ /* SRDS #1 - SATA on M.2 (J44) */ &cp1_sata0 { - phys = <&cp1_comphy1 0>; status = "okay"; /* only port 0 is available */ - /delete-node/ sata-port@1; + sata-port@0 { + phys = <&cp1_comphy1 0>; + status = "okay"; + }; }; &cp1_syscon0 { diff --git a/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts b/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts index 0f53745a6fa0d8..115c55d73786e2 100644 --- a/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts +++ b/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts @@ -512,10 +512,9 @@ status = "okay"; /* only port 1 is available */ - /delete-node/ sata-port@0; - sata-port@1 { phys = <&cp1_comphy3 1>; + status = "okay"; }; }; @@ -631,9 +630,8 @@ status = "okay"; /* only port 1 is available */ - /delete-node/ sata-port@0; - sata-port@1 { + status = "okay"; phys = <&cp2_comphy3 1>; }; };

From 48b51799a5461707705454568453618cdd7307f4 Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Thu, 11 Sep 2025 20:28:05 +0200 Subject: [PATCH 1523/3206] arm64: dts: marvell: cn9132-clearfog: disable eMMC high-speed modes

Similar to MacchiatoBIN, the high-speed modes are unstable on the CN9132 CEX-7 module, leading to failed transactions under normal use. Disable all high-speed modes including UHS.

Additionally, add the no-sdio and non-removable properties as appropriate for eMMC.

Fixes: e9ff907f4076 ("arm64: dts: add description for solidrun cn9132 cex7 module and clearfog board") Cc: stable@vger.kernel.org Signed-off-by: Josua Mayer Signed-off-by: Gregory CLEMENT --- arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+)

diff --git a/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi b/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi index afc041c1c448c3..bb2bb47fd77c12 100644 --- a/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi +++ b/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi @@ -137,6 +137,14 @@ pinctrl-0 = <&ap_mmc0_pins>; pinctrl-names = "default"; vqmmc-supply = <&v_1_8>; + /* + * Not stable in HS modes - phy needs "more calibration", so disable + * UHS (by preventing voltage switch), SDR104, SDR50 and DDR50 modes. 
+ */ + no-1-8-v; + no-sd; + no-sdio; + non-removable; status = "okay"; };

From 794a066688038df46c01e177cc6faebded0acba4 Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Thu, 11 Sep 2025 20:28:06 +0200 Subject: [PATCH 1524/3206] arm64: dts: marvell: cn9132-clearfog: fix multi-lane pci x2 and x4 ports

The mvebu-comphy driver does not currently know how to pass correct lane-count to ATF while configuring the serdes lanes. This causes the system to hard reset during reconfiguration if a pci card is present and has established a link in the bootloader.

Remove the comphy handles from the respective pci nodes to avoid runtime reconfiguration, relying solely on the bootloader configuration and thereby avoiding the hard reset. When the bootloader has configured the lanes correctly, the pci ports are functional under Linux.

This issue may be addressed in the comphy driver at a future point.

Fixes: e9ff907f4076 ("arm64: dts: add description for solidrun cn9132 cex7 module and clearfog board") Cc: stable@vger.kernel.org Signed-off-by: Josua Mayer Signed-off-by: Gregory CLEMENT --- arch/arm64/boot/dts/marvell/cn9132-clearfog.dts | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts b/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts index 115c55d73786e2..6f237d3542b910 100644 --- a/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts +++ b/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts @@ -413,7 +413,13 @@ /* SRDS #0,#1,#2,#3 - PCIe */ &cp0_pcie0 { num-lanes = <4>; - phys = <&cp0_comphy0 0>, <&cp0_comphy1 0>, <&cp0_comphy2 0>, <&cp0_comphy3 0>; + /* + * The mvebu-comphy driver does not currently know how to pass correct + * lane-count to ATF while configuring the serdes lanes. + * Rely on bootloader configuration only. + * + * phys = <&cp0_comphy0 0>, <&cp0_comphy1 0>, <&cp0_comphy2 0>, <&cp0_comphy3 0>; + */ status = "okay"; }; @@ -475,7 +481,13 @@ /* SRDS #0,#1 - PCIe */ &cp1_pcie0 { num-lanes = <2>; - phys = <&cp1_comphy0 0>, <&cp1_comphy1 0>; + /* + * The mvebu-comphy driver does not currently know how to pass correct + * lane-count to ATF while configuring the serdes lanes. + * Rely on bootloader configuration only. + * + * phys = <&cp1_comphy0 0>, <&cp1_comphy1 0>; + */ status = "okay"; };

From 7b7e71683b4ccbe0dbd7d434707623327e852f20 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Thu, 11 Sep 2025 10:40:20 +0800 Subject: [PATCH 1525/3206] mmc: sdhci: Move the code related to setting the clock from sdhci_set_ios_common() into sdhci_set_ios()

sdhci_set_clock() is called in both sdhci_set_ios_common() and __sdhci_uhs2_set_ios(). According to Section 3.13.2 "Card Interface Detection Sequence" of the SD Host Controller Standard Specification Version 7.00, the SD clock is supplied after power is supplied, so we only need the one call in __sdhci_uhs2_set_ios().

Let's move the code related to setting the clock from sdhci_set_ios_common() into sdhci_set_ios() and modify the parameters passed to sdhci_set_clock() in __sdhci_uhs2_set_ios(). 
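Schematically (a simplified sketch, not the literal code), the UHS-II path then supplies power first and the clock afterwards, matching the spec's ordering:

	/* Sketch of the tail of __sdhci_uhs2_set_ios() after this change;
	 * vendor set_power hooks and error handling are elided. */
	sdhci_uhs2_set_power(host, ios->power_mode, ios->vdd);	/* power first */
	sdhci_set_clock(host, ios->clock);			/* then the SD clock */
	host->clock = ios->clock;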
Fixes: 10c8298a052b ("mmc: sdhci-uhs2: add set_ios()") Cc: stable@vger.kernel.org # v6.13+ Signed-off-by: Ben Chuang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-uhs2.c | 3 ++- drivers/mmc/host/sdhci.c | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/drivers/mmc/host/sdhci-uhs2.c b/drivers/mmc/host/sdhci-uhs2.c index 0efeb9d0c3765a..18fb6ee5b96a98 100644 --- a/drivers/mmc/host/sdhci-uhs2.c +++ b/drivers/mmc/host/sdhci-uhs2.c @@ -295,7 +295,8 @@ static void __sdhci_uhs2_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) else sdhci_uhs2_set_power(host, ios->power_mode, ios->vdd); - sdhci_set_clock(host, host->clock); + sdhci_set_clock(host, ios->clock); + host->clock = ios->clock; } static int sdhci_uhs2_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 3a17821efa5ca9..ac7e11f37af71f 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2367,23 +2367,6 @@ void sdhci_set_ios_common(struct mmc_host *mmc, struct mmc_ios *ios) (ios->power_mode == MMC_POWER_UP) && !(host->quirks2 & SDHCI_QUIRK2_PRESET_VALUE_BROKEN)) sdhci_enable_preset_value(host, false); - - if (!ios->clock || ios->clock != host->clock) { - host->ops->set_clock(host, ios->clock); - host->clock = ios->clock; - - if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK && - host->clock) { - host->timeout_clk = mmc->actual_clock ? - mmc->actual_clock / 1000 : - host->clock / 1000; - mmc->max_busy_timeout = - host->ops->get_max_timeout_count ? - host->ops->get_max_timeout_count(host) : - 1 << 27; - mmc->max_busy_timeout /= host->timeout_clk; - } - } } EXPORT_SYMBOL_GPL(sdhci_set_ios_common); @@ -2410,6 +2393,23 @@ void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) sdhci_set_ios_common(mmc, ios); + if (!ios->clock || ios->clock != host->clock) { + host->ops->set_clock(host, ios->clock); + host->clock = ios->clock; + + if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK && + host->clock) { + host->timeout_clk = mmc->actual_clock ? + mmc->actual_clock / 1000 : + host->clock / 1000; + mmc->max_busy_timeout = + host->ops->get_max_timeout_count ? + host->ops->get_max_timeout_count(host) : + 1 << 27; + mmc->max_busy_timeout /= host->timeout_clk; + } + } + if (host->ops->set_power) host->ops->set_power(host, ios->power_mode, ios->vdd); else From 09c2b628f6403ad467fc73326a50020590603871 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Thu, 11 Sep 2025 10:41:01 +0800 Subject: [PATCH 1526/3206] mmc: sdhci-uhs2: Fix calling incorrect sdhci_set_clock() function Fix calling incorrect sdhci_set_clock() in __sdhci_uhs2_set_ios() when the vendor defines its own sdhci_set_clock(). 
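For context, a schematic sketch of why dispatching through the ops table matters: the vendor driver fills in host->ops, so the indirect call picks up its override:

	/* Schematic only: honors e.g. a vendor-specific set_clock() */
	host->ops->set_clock(host, ios->clock);
	/* whereas a direct sdhci_set_clock(host, ios->clock) call would
	 * always run the generic implementation */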
Fixes: 10c8298a052b ("mmc: sdhci-uhs2: add set_ios()") Cc: stable@vger.kernel.org # v6.13+ Signed-off-by: Ben Chuang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-uhs2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-uhs2.c b/drivers/mmc/host/sdhci-uhs2.c index 18fb6ee5b96a98..c459a08d01da52 100644 --- a/drivers/mmc/host/sdhci-uhs2.c +++ b/drivers/mmc/host/sdhci-uhs2.c @@ -295,7 +295,7 @@ static void __sdhci_uhs2_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) else sdhci_uhs2_set_power(host, ios->power_mode, ios->vdd); - sdhci_set_clock(host, ios->clock); + host->ops->set_clock(host, ios->clock); host->clock = ios->clock; } From 77a436c93d10d68201bfd4941d1ca3230dfd1f40 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Thu, 11 Sep 2025 10:42:42 +0800 Subject: [PATCH 1527/3206] mmc: sdhci-pci-gli: GL9767: Fix initializing the UHS-II interface during a power-on According to the power structure of IC hardware design for UHS-II interface, reset control and timing must be added to the initialization process of powering on the UHS-II interface. Fixes: 27dd3b82557a ("mmc: sdhci-pci-gli: enable UHS-II mode for GL9767") Cc: stable@vger.kernel.org # v6.13+ Signed-off-by: Ben Chuang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-gli.c | 68 +++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c index 3a1de477e9af8d..b0f91cc9e40e43 100644 --- a/drivers/mmc/host/sdhci-pci-gli.c +++ b/drivers/mmc/host/sdhci-pci-gli.c @@ -283,6 +283,8 @@ #define PCIE_GLI_9767_UHS2_CTL2_ZC_VALUE 0xb #define PCIE_GLI_9767_UHS2_CTL2_ZC_CTL BIT(6) #define PCIE_GLI_9767_UHS2_CTL2_ZC_CTL_VALUE 0x1 +#define PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN BIT(13) +#define PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE BIT(14) #define GLI_MAX_TUNING_LOOP 40 @@ -1179,6 +1181,65 @@ static void gl9767_set_low_power_negotiation(struct pci_dev *pdev, bool enable) gl9767_vhs_read(pdev); } +static void sdhci_gl9767_uhs2_phy_reset(struct sdhci_host *host, bool assert) +{ + struct sdhci_pci_slot *slot = sdhci_priv(host); + struct pci_dev *pdev = slot->chip->pdev; + u32 value, set, clr; + + if (assert) { + /* Assert reset, set RESETN and clean RESETN_VALUE */ + set = PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN; + clr = PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE; + } else { + /* De-assert reset, clean RESETN and set RESETN_VALUE */ + set = PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE; + clr = PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN; + } + + gl9767_vhs_write(pdev); + pci_read_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, &value); + value |= set; + pci_write_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, value); + value &= ~clr; + pci_write_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, value); + gl9767_vhs_read(pdev); +} + +static void __gl9767_uhs2_set_power(struct sdhci_host *host, unsigned char mode, unsigned short vdd) +{ + u8 pwr = 0; + + if (mode != MMC_POWER_OFF) { + pwr = sdhci_get_vdd_value(vdd); + if (!pwr) + WARN(1, "%s: Invalid vdd %#x\n", + mmc_hostname(host->mmc), vdd); + pwr |= SDHCI_VDD2_POWER_180; + } + + if (host->pwr == pwr) + return; + + host->pwr = pwr; + + if (pwr == 0) { + sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); + } else { + sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); + + pwr |= SDHCI_POWER_ON; + sdhci_writeb(host, pwr & 0xf, SDHCI_POWER_CONTROL); + usleep_range(5000, 6250); + + /* Assert reset */ + 
sdhci_gl9767_uhs2_phy_reset(host, true); + pwr |= SDHCI_VDD2_POWER_ON; + sdhci_writeb(host, pwr, SDHCI_POWER_CONTROL); + usleep_range(5000, 6250); + } +} + static void sdhci_gl9767_set_clock(struct sdhci_host *host, unsigned int clock) { struct sdhci_pci_slot *slot = sdhci_priv(host); @@ -1205,6 +1266,11 @@ static void sdhci_gl9767_set_clock(struct sdhci_host *host, unsigned int clock) } sdhci_enable_clk(host, clk); + + if (mmc_card_uhs2(host->mmc)) + /* De-assert reset */ + sdhci_gl9767_uhs2_phy_reset(host, false); + gl9767_set_low_power_negotiation(pdev, true); } @@ -1476,7 +1542,7 @@ static void sdhci_gl9767_set_power(struct sdhci_host *host, unsigned char mode, gl9767_vhs_read(pdev); sdhci_gli_overcurrent_event_enable(host, false); - sdhci_uhs2_set_power(host, mode, vdd); + __gl9767_uhs2_set_power(host, mode, vdd); sdhci_gli_overcurrent_event_enable(host, true); } else { gl9767_vhs_write(pdev); From 072755cca7e743c28a273fcb69b0e826109473d7 Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Thu, 11 Sep 2025 23:06:05 +0200 Subject: [PATCH 1528/3206] mmc: core: Fix variable shadowing in mmc_route_rpmb_frames() Rename the inner 'frm' variable to 'resp_frm' in the write path of mmc_route_rpmb_frames() to avoid shadowing the outer 'frm' variable. The function declares 'frm' at function scope pointing to the request frame, but then redeclares another 'frm' variable inside the write block pointing to the response frame. This shadowing makes the code confusing and error-prone. Using 'resp_frm' for the response frame makes the distinction clear and improves code readability. Fixes: 7852028a35f0 ("mmc: block: register RPMB partition with the RPMB subsystem") Reviewed-by: Avri Altman Reviewed-by: Jens Wiklander Signed-off-by: Bean Huo Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 8fd9891462054d..e14820cde25291 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2930,15 +2930,15 @@ static int mmc_route_rpmb_frames(struct device *dev, u8 *req, return -ENOMEM; if (write) { - struct rpmb_frame *frm = (struct rpmb_frame *)resp; + struct rpmb_frame *resp_frm = (struct rpmb_frame *)resp; /* Send write request frame(s) */ set_idata(idata[0], MMC_WRITE_MULTIPLE_BLOCK, 1 | MMC_CMD23_ARG_REL_WR, req, req_len); /* Send result request frame */ - memset(frm, 0, sizeof(*frm)); - frm->req_resp = cpu_to_be16(RPMB_RESULT_READ); + memset(resp_frm, 0, sizeof(*resp_frm)); + resp_frm->req_resp = cpu_to_be16(RPMB_RESULT_READ); set_idata(idata[1], MMC_WRITE_MULTIPLE_BLOCK, 1, resp, resp_len); From ce7d8a3e19e7f62060b31d401339b81458587ce2 Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Thu, 11 Sep 2025 23:06:06 +0200 Subject: [PATCH 1529/3206] mmc: core: Improve RPMB frame handling code Introduce RPMB_FRAME_SIZE, CHECK_SIZE_NEQ(), and CHECK_SIZE_ALIGNED() macros to replace repetitive sizeof(struct rpmb_frame) checks in mmc_route_rpmb_frames(). 
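A usage sketch of the helpers (their definitions are in the diff below); the write-data case accepts a multi-frame request but exactly one response frame:

	/* the request must hold at least one full frame */
	if (req_len < RPMB_FRAME_SIZE)
		return -EINVAL;

	case RPMB_WRITE_DATA:
		/* multi-frame request allowed, response is exactly one frame */
		if (!CHECK_SIZE_ALIGNED(req_len) || CHECK_SIZE_NEQ(resp_len))
			return -EINVAL;
		write = true;
		break;
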
Signed-off-by: Bean Huo Reviewed-by: Avri Altman Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index e14820cde25291..a74e75df93b061 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -121,6 +121,10 @@ struct rpmb_frame { #define RPMB_READ_DATA 0x4 /* Read data from RPMB partition */ #define RPMB_RESULT_READ 0x5 /* Read result request (Internal) */ +#define RPMB_FRAME_SIZE sizeof(struct rpmb_frame) +#define CHECK_SIZE_NEQ(val) ((val) != sizeof(struct rpmb_frame)) +#define CHECK_SIZE_ALIGNED(val) IS_ALIGNED((val), sizeof(struct rpmb_frame)) + static DEFINE_MUTEX(block_mutex); /* @@ -2858,12 +2862,12 @@ static void set_idata(struct mmc_blk_ioc_data *idata, u32 opcode, * The size of an RPMB frame must match what's expected by the * hardware. */ - BUILD_BUG_ON(sizeof(struct rpmb_frame) != 512); + static_assert(!CHECK_SIZE_NEQ(512), "RPMB frame size must be 512 bytes"); idata->ic.opcode = opcode; idata->ic.flags = MMC_RSP_R1 | MMC_CMD_ADTC; idata->ic.write_flag = write_flag; - idata->ic.blksz = sizeof(struct rpmb_frame); + idata->ic.blksz = RPMB_FRAME_SIZE; idata->ic.blocks = buf_bytes / idata->ic.blksz; idata->buf = buf; idata->buf_bytes = buf_bytes; @@ -2887,32 +2891,28 @@ static int mmc_route_rpmb_frames(struct device *dev, u8 *req, if (IS_ERR(md->queue.card)) return PTR_ERR(md->queue.card); - if (req_len < sizeof(*frm)) + if (req_len < RPMB_FRAME_SIZE) return -EINVAL; req_type = be16_to_cpu(frm->req_resp); switch (req_type) { case RPMB_PROGRAM_KEY: - if (req_len != sizeof(struct rpmb_frame) || - resp_len != sizeof(struct rpmb_frame)) + if (CHECK_SIZE_NEQ(req_len) || CHECK_SIZE_NEQ(resp_len)) return -EINVAL; write = true; break; case RPMB_GET_WRITE_COUNTER: - if (req_len != sizeof(struct rpmb_frame) || - resp_len != sizeof(struct rpmb_frame)) + if (CHECK_SIZE_NEQ(req_len) || CHECK_SIZE_NEQ(resp_len)) return -EINVAL; write = false; break; case RPMB_WRITE_DATA: - if (req_len % sizeof(struct rpmb_frame) || - resp_len != sizeof(struct rpmb_frame)) + if (!CHECK_SIZE_ALIGNED(req_len) || CHECK_SIZE_NEQ(resp_len)) return -EINVAL; write = true; break; case RPMB_READ_DATA: - if (req_len != sizeof(struct rpmb_frame) || - resp_len % sizeof(struct rpmb_frame)) + if (CHECK_SIZE_NEQ(req_len) || !CHECK_SIZE_ALIGNED(resp_len)) return -EINVAL; write = false; break; @@ -2920,10 +2920,8 @@ static int mmc_route_rpmb_frames(struct device *dev, u8 *req, return -EINVAL; } - if (write) - cmd_count = 3; - else - cmd_count = 2; + /* Write operations require 3 commands, read operations require 2 */ + cmd_count = write ? 3 : 2; idata = alloc_idata(rpmb, cmd_count); if (!idata) @@ -2937,7 +2935,7 @@ static int mmc_route_rpmb_frames(struct device *dev, u8 *req, 1 | MMC_CMD23_ARG_REL_WR, req, req_len); /* Send result request frame */ - memset(resp_frm, 0, sizeof(*resp_frm)); + memset(resp_frm, 0, RPMB_FRAME_SIZE); resp_frm->req_resp = cpu_to_be16(RPMB_RESULT_READ); set_idata(idata[1], MMC_WRITE_MULTIPLE_BLOCK, 1, resp, resp_len); From 510f05bb73c68809efcd54ee6339f82bfbe83782 Mon Sep 17 00:00:00 2001 From: Xinpeng Sun Date: Thu, 28 Aug 2025 10:09:58 +0800 Subject: [PATCH 1530/3206] HID: intel-thc-hid: intel-quicki2c: Add WCL Device IDs Add THC I2C WildcatLake device IDs. 
Signed-off-by: Xinpeng Sun Reviewed-by: Even Xu Signed-off-by: Jiri Kosina --- drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c | 2 ++ drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c b/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c index 854926b3cfd455..a2643ae790d6e4 100644 --- a/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c +++ b/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c @@ -997,6 +997,8 @@ static const struct pci_device_id quicki2c_pci_tbl[] = { { PCI_DEVICE_DATA(INTEL, THC_PTL_H_DEVICE_ID_I2C_PORT2, &ptl_ddata) }, { PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_I2C_PORT1, &ptl_ddata) }, { PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_I2C_PORT2, &ptl_ddata) }, + { PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_I2C_PORT1, &ptl_ddata) }, + { PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_I2C_PORT2, &ptl_ddata) }, { } }; MODULE_DEVICE_TABLE(pci, quicki2c_pci_tbl); diff --git a/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h b/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h index d412eafcf9ea48..4e60a7de4727d1 100644 --- a/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h +++ b/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h @@ -13,6 +13,8 @@ #define PCI_DEVICE_ID_INTEL_THC_PTL_H_DEVICE_ID_I2C_PORT2 0xE34A #define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_I2C_PORT1 0xE448 #define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_I2C_PORT2 0xE44A +#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_I2C_PORT1 0x4D48 +#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_I2C_PORT2 0x4D4A /* Packet size value, the unit is 16 bytes */ #define MAX_PACKET_SIZE_VALUE_LNL 256 From cc54ed51c761728f6933cca889b684ed7fbaaf07 Mon Sep 17 00:00:00 2001 From: Xinpeng Sun Date: Thu, 28 Aug 2025 10:09:59 +0800 Subject: [PATCH 1531/3206] HID: intel-thc-hid: intel-quickspi: Add WCL Device IDs Add THC SPI WildcatLake device IDs. 
Signed-off-by: Xinpeng Sun Reviewed-by: Even Xu Signed-off-by: Jiri Kosina --- drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c | 2 ++ drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c b/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c index 5e5f179dd11300..84314989dc5346 100644 --- a/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c +++ b/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c @@ -976,6 +976,8 @@ static const struct pci_device_id quickspi_pci_tbl[] = { {PCI_DEVICE_DATA(INTEL, THC_PTL_H_DEVICE_ID_SPI_PORT2, &ptl), }, {PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_SPI_PORT1, &ptl), }, {PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_SPI_PORT2, &ptl), }, + {PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_SPI_PORT1, &ptl), }, + {PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_SPI_PORT2, &ptl), }, {} }; MODULE_DEVICE_TABLE(pci, quickspi_pci_tbl); diff --git a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h index 6fdf674b21c5a6..f3532d866749ca 100644 --- a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h +++ b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h @@ -19,6 +19,8 @@ #define PCI_DEVICE_ID_INTEL_THC_PTL_H_DEVICE_ID_SPI_PORT2 0xE34B #define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_SPI_PORT1 0xE449 #define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_SPI_PORT2 0xE44B +#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_SPI_PORT1 0x4D49 +#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_SPI_PORT2 0x4D4B /* HIDSPI special ACPI parameters DSM methods */ #define ACPI_QUICKSPI_REVISION_NUM 2 From 8599049a96850ac96a64c0c6e5cfdec5b94d9207 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Mon, 1 Sep 2025 12:20:07 +0200 Subject: [PATCH 1532/3206] HID: lenovo: Use KEY_PERFORMANCE instead of ACPI's platform_profile Commit 84c9d2a968c82 ("HID: lenovo: Support for ThinkPad-X12-TAB-1/2 Kbd Fn keys") added a dependency on ACPI's platform_profile. This should not be done for generic USB devices as this prevents using the devices on non ACPI devices like Apple silicon Macs and other non-ACPI arm64 systems. An attempt to allow using platform_profile on non-ACPI systems was rejected in [1] and instead platform_profile was made to fail during init in commit dd133162c9cf ("ACPI: platform_profile: Avoid initializing on non-ACPI platforms"). So remove the broken dependency and instead let's user space handle this keycode by sending the new KEY_PERFORMANCE. Stable backport depends on commit 89c5214639294 ("Input: add keycode for performance mode key"). 
[1]: https://lore.kernel.org/linux-acpi/CAJZ5v0icRdTSToaKbdf=MdRin4NyB2MstUVaQo8VR6-n7DkVMQ@mail.gmail.com/ Cc: regressions@lists.linux.dev Cc: stable@vger.kernel.org Fixes: 84c9d2a968c82 ("HID: lenovo: Support for ThinkPad-X12-TAB-1/2 Kbd Fn keys") Signed-off-by: Janne Grunau Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 2 -- drivers/hid/hid-lenovo.c | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 79997553d8f987..b934523593d95d 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -597,8 +597,6 @@ config HID_LED config HID_LENOVO tristate "Lenovo / Thinkpad devices" - depends on ACPI - select ACPI_PLATFORM_PROFILE select NEW_LEDS select LEDS_CLASS help diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index b3121fa7a72d73..654879814f97aa 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -32,8 +32,6 @@ #include #include -#include - #include "hid-ids.h" /* Userspace expects F20 for mic-mute KEY_MICMUTE does not work */ @@ -734,7 +732,7 @@ static int lenovo_raw_event_TP_X12_tab(struct hid_device *hdev, u32 raw_data) report_key_event(input, KEY_RFKILL); return 1; } - platform_profile_cycle(); + report_key_event(input, KEY_PERFORMANCE); return 1; case TP_X12_RAW_HOTKEY_FN_F10: /* TAB1 has PICKUP Phone and TAB2 use Snipping tool*/ From 2a5e76b9a0efc44807ff0e6b141649fac65a55ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Szymanski?= Date: Thu, 4 Sep 2025 18:42:07 +0200 Subject: [PATCH 1533/3206] HID: cp2112: fix setter callbacks return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 6485543488a6 ("HID: cp2112: use new line value setter callbacks"), setting a GPIO value always fails with error -EBADE. That's because the returned value by the setter callbacks is the returned value by the hid_hw_raw_request() function which is the number of bytes sent on success or a negative value on error. The function gpiochip_set() returns -EBADE if the setter callbacks return a value > 0. Fix this by making the setter callbacks return 0 on success or a negative value on error. While at it, use the returned value by cp2112_gpio_set_unlocked() in the direction_output callback. Fixes: 6485543488a6 ("HID: cp2112: use new line value setter callbacks") Signed-off-by: Sébastien Szymanski Reviewed-by: Bartosz Golaszewski Signed-off-by: Jiri Kosina --- drivers/hid/hid-cp2112.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/hid/hid-cp2112.c b/drivers/hid/hid-cp2112.c index 234fa82eab0795..b5f2b6356f512a 100644 --- a/drivers/hid/hid-cp2112.c +++ b/drivers/hid/hid-cp2112.c @@ -229,10 +229,12 @@ static int cp2112_gpio_set_unlocked(struct cp2112_device *dev, ret = hid_hw_raw_request(hdev, CP2112_GPIO_SET, buf, CP2112_GPIO_SET_LENGTH, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); - if (ret < 0) + if (ret != CP2112_GPIO_SET_LENGTH) { hid_err(hdev, "error setting GPIO values: %d\n", ret); + return ret < 0 ? ret : -EIO; + } - return ret; + return 0; } static int cp2112_gpio_set(struct gpio_chip *chip, unsigned int offset, @@ -309,9 +311,7 @@ static int cp2112_gpio_direction_output(struct gpio_chip *chip, * Set gpio value when output direction is already set, * as specified in AN495, Rev. 0.2, cpt. 
4.4 */ - cp2112_gpio_set_unlocked(dev, offset, value); - - return 0; + return cp2112_gpio_set_unlocked(dev, offset, value); } static int cp2112_hid_get(struct hid_device *hdev, unsigned char report_number, From 831f70a5b93bd2d9e858ced2c12fab5766ede5e7 Mon Sep 17 00:00:00 2001 From: Amit Chaudhari Date: Tue, 19 Aug 2025 17:49:19 -0400 Subject: [PATCH 1534/3206] HID: asus: add support for missing PX series fn keys Add support for missing hotkey keycodes affecting Asus PX13 and PX16 families so userspace can use them. Signed-off-by: Amit Chaudhari Signed-off-by: Jiri Kosina --- drivers/hid/hid-asus.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index d27dcfb2b9e4e1..8db9d4e7c3b0b2 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -974,7 +974,10 @@ static int asus_input_mapping(struct hid_device *hdev, case 0xc4: asus_map_key_clear(KEY_KBDILLUMUP); break; case 0xc5: asus_map_key_clear(KEY_KBDILLUMDOWN); break; case 0xc7: asus_map_key_clear(KEY_KBDILLUMTOGGLE); break; + case 0x4e: asus_map_key_clear(KEY_FN_ESC); break; + case 0x7e: asus_map_key_clear(KEY_EMOJI_PICKER); break; + case 0x8b: asus_map_key_clear(KEY_PROG1); break; /* ProArt Creator Hub key */ case 0x6b: asus_map_key_clear(KEY_F21); break; /* ASUS touchpad toggle */ case 0x38: asus_map_key_clear(KEY_PROG1); break; /* ROG key */ case 0xba: asus_map_key_clear(KEY_PROG2); break; /* Fn+C ASUS Splendid */ From 5799d5d8a6c877f03ad5b5a640977053be45059a Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Fri, 12 Sep 2025 10:24:28 -0500 Subject: [PATCH 1535/3206] x86/bugs: Add attack vector controls for VMSCAPE Use attack vector controls to select whether VMSCAPE requires mitigation, similar to other bugs. Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) --- .../admin-guide/hw-vuln/attack_vector_controls.rst | 1 + arch/x86/kernel/cpu/bugs.c | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst index 5964901d66e317..d0bdbd81dcf9f2 100644 --- a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst +++ b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst @@ -218,6 +218,7 @@ SRSO X X X X SSB X TAA X X X X * (Note 2) TSA X X X X +VMSCAPE X =============== ============== ============ ============= ============== ============ ======== Notes: diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 36dcfc5105be9a..e817bbae01591b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -434,6 +434,9 @@ static bool __init should_mitigate_vuln(unsigned int bug) case X86_BUG_SPEC_STORE_BYPASS: return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER); + case X86_BUG_VMSCAPE: + return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST); + default: WARN(1, "Unknown bug %x\n", bug); return false; @@ -3304,15 +3307,18 @@ early_param("vmscape", vmscape_parse_cmdline); static void __init vmscape_select_mitigation(void) { - if (cpu_mitigations_off() || - !boot_cpu_has_bug(X86_BUG_VMSCAPE) || + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) || !boot_cpu_has(X86_FEATURE_IBPB)) { vmscape_mitigation = VMSCAPE_MITIGATION_NONE; return; } - if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) - vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_VMSCAPE)) + vmscape_mitigation = 
VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + else + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } } static void __init vmscape_update_mitigation(void) From 3e403c2faad9398fb70313f7cc0303c38f1ade9c Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Thu, 11 Sep 2025 23:21:01 +0200 Subject: [PATCH 1536/3206] dt-bindings: i2c: qcom-cci: Document QCM2290 compatible The CCI on QCM2290 is the interface for controlling camera sensor over I2C. It requires only two clocks. Signed-off-by: Loic Poulain Reviewed-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/qcom,i2c-cci.yaml | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml index 73144473b9b24e..83b13370ff6c12 100644 --- a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml +++ b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml @@ -25,6 +25,7 @@ properties: - items: - enum: + - qcom,qcm2290-cci - qcom,sc7280-cci - qcom,sc8280xp-cci - qcom,sdm670-cci @@ -44,11 +45,11 @@ properties: const: 0 clocks: - minItems: 3 + minItems: 2 maxItems: 6 clock-names: - minItems: 3 + minItems: 2 maxItems: 6 interrupts: @@ -113,6 +114,7 @@ allOf: then: properties: clocks: + minItems: 3 maxItems: 3 clock-names: items: @@ -120,6 +122,22 @@ allOf: - const: cci_ahb - const: cci + - if: + properties: + compatible: + contains: + enum: + - qcom,qcm2290-cci + then: + properties: + clocks: + minItems: 2 + maxItems: 2 + clock-names: + items: + - const: ahb + - const: cci + - if: properties: compatible: From c62859e13712334bd17d46ee0c1b26e8673d2961 Mon Sep 17 00:00:00 2001 From: Wenmeng Liu Date: Fri, 12 Sep 2025 23:19:25 +0800 Subject: [PATCH 1537/3206] dt-bindings: i2c: qcom-cci: Document sa8775p compatible Add the sa8775p CCI device string compatible. Acked-by: Rob Herring (Arm) Reviewed-by: Bryan O'Donoghue Signed-off-by: Wenmeng Liu Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml index 83b13370ff6c12..7456783d1f8ef5 100644 --- a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml +++ b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml @@ -26,6 +26,7 @@ properties: - items: - enum: - qcom,qcm2290-cci + - qcom,sa8775p-cci - qcom,sc7280-cci - qcom,sc8280xp-cci - qcom,sdm670-cci @@ -241,6 +242,7 @@ allOf: compatible: contains: enum: + - qcom,sa8775p-cci - qcom,sm8550-cci - qcom,sm8650-cci - qcom,x1e80100-cci From 947e2d6414b3118bab920f1b0c27b4301dc5a361 Mon Sep 17 00:00:00 2001 From: Denzeel Oliva Date: Sun, 7 Sep 2025 22:13:38 +0000 Subject: [PATCH 1538/3206] dt-bindings: i2c: exynos5: Add exynos990-hsi2c compatible Add samsung,exynos990-hsi2c dedicated compatible for representing I2C of Exynos990 SoC. 
Reviewed-by: Krzysztof Kozlowski Signed-off-by: Denzeel Oliva Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml b/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml index 7ae8c7b1d0067e..6fc03281a8f844 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml +++ b/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml @@ -38,6 +38,7 @@ properties: - google,gs101-hsi2c - samsung,exynos2200-hsi2c - samsung,exynos850-hsi2c + - samsung,exynos990-hsi2c - const: samsung,exynosautov9-hsi2c - const: samsung,exynos5-hsi2c # Exynos5250 and Exynos5420 deprecated: true From f77a13df57e803fe574dd29c5454088fcc6f3285 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Thu, 28 Aug 2025 16:01:29 +0200 Subject: [PATCH 1539/3206] dt-bindings: i2c: apple,i2c: Add apple,t6020-i2c compatible After discussion with the devicetree maintainers we agreed to not extend lists with the generic compatible "apple,i2c" anymore [1]. Use "apple,t8103-i2c" as fallback compatible as it is the SoC the driver and bindings were written for. This block is compatible with t8103, so just add the new per-SoC compatible using apple,t8103-i2c as base. [1]: https://lore.kernel.org/asahi/12ab93b7-1fc2-4ce0-926e-c8141cfe81bf@kernel.org/ Signed-off-by: Janne Grunau Acked-by: Andi Shyti Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/apple,i2c.yaml | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml index fed3e1b8c43f67..500a965bdb7a84 100644 --- a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml @@ -20,17 +20,22 @@ allOf: properties: compatible: - items: - - enum: - - apple,s5l8960x-i2c - - apple,t7000-i2c - - apple,s8000-i2c - - apple,t8010-i2c - - apple,t8015-i2c - - apple,t8103-i2c - - apple,t8112-i2c - - apple,t6000-i2c - - const: apple,i2c + oneOf: + - items: + - const: apple,t6020-i2c + - const: apple,t8103-i2c + - items: + - enum: + # Do not add additional SoC to this list. + - apple,s5l8960x-i2c + - apple,t7000-i2c + - apple,s8000-i2c + - apple,t8010-i2c + - apple,t8015-i2c + - apple,t8103-i2c + - apple,t8112-i2c + - apple,t6000-i2c + - const: apple,i2c reg: maxItems: 1 From 5b7c90ef646d175e8c85a6d31ce3794789bbb925 Mon Sep 17 00:00:00 2001 From: Mukesh Kumar Savaliya Date: Wed, 27 Aug 2025 00:27:18 +0530 Subject: [PATCH 1540/3206] MAINTAINERS: Update email address for Qualcomm's I2C GENI maintainers Update email address to @oss.qualcomm.com for both the maintainers from qualcomm, Viken dadhani and Mukesh Kumar Savaliya. 
Signed-off-by: Mukesh Kumar Savaliya Signed-off-by: Wolfram Sang --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cd7ff55b5d3217..967de62878b3dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20774,8 +20774,8 @@ S: Supported F: drivers/dma/qcom/hidma* QUALCOMM I2C QCOM GENI DRIVER -M: Mukesh Kumar Savaliya -M: Viken Dadhaniya +M: Mukesh Kumar Savaliya +M: Viken Dadhaniya L: linux-i2c@vger.kernel.org L: linux-arm-msm@vger.kernel.org S: Maintained From c90fa5493f7a96370c6774b4197d974b003fad7f Mon Sep 17 00:00:00 2001 From: Manikanta Guntupalli Date: Thu, 31 Jul 2025 18:25:35 +0530 Subject: [PATCH 1541/3206] i2c: mux: pca9541: Use I2C adapter timeout value for arbitration timeout Remove existing arbitration timeout macros and use I2C adapter timeout value for arbitration timeout and forceful bus ownership. I2C adapter timeout can be configurable from user space, so using it for arbitration timeout helps in configuring the arbitration timeout from user space depending on the use case. Signed-off-by: Manikanta Guntupalli Signed-off-by: Wolfram Sang --- drivers/i2c/muxes/i2c-mux-pca9541.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index 8663c8a7c26936..3d8002caf7031b 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -63,10 +63,6 @@ #define mybus(x) (!((x) & MYBUS) || ((x) & MYBUS) == MYBUS) #define busoff(x) (!((x) & BUSON) || ((x) & BUSON) == BUSON) -/* arbitration timeouts, in jiffies */ -#define ARB_TIMEOUT (HZ / 8) /* 125 ms until forcing bus ownership */ -#define ARB2_TIMEOUT (HZ / 4) /* 250 ms until acquisition failure */ - /* arbitration retry delays, in us */ #define SELECT_DELAY_SHORT 50 #define SELECT_DELAY_LONG 1000 @@ -229,6 +225,9 @@ static int pca9541_arbitrate(struct i2c_client *client) */ data->select_timeout = SELECT_DELAY_LONG; if (time_is_before_eq_jiffies(data->arb_timeout)) { + dev_warn(&client->dev, + "Arbitration timeout on I2C bus, forcing bus ownership\n"); + /* Time is up, take the bus and reset it. */ pca9541_reg_write(client, PCA9541_CONTROL, @@ -251,10 +250,10 @@ static int pca9541_select_chan(struct i2c_mux_core *muxc, u32 chan) struct pca9541 *data = i2c_mux_priv(muxc); struct i2c_client *client = data->client; int ret; - unsigned long timeout = jiffies + ARB2_TIMEOUT; + unsigned long timeout = jiffies + (2 * client->adapter->timeout); /* give up after this time */ - data->arb_timeout = jiffies + ARB_TIMEOUT; + data->arb_timeout = jiffies + client->adapter->timeout; /* force bus ownership after this time */ do { @@ -267,6 +266,7 @@ static int pca9541_select_chan(struct i2c_mux_core *muxc, u32 chan) else msleep(data->select_timeout / 1000); } while (time_is_after_eq_jiffies(timeout)); + dev_warn(&client->dev, "Failed to acquire I2C bus, timed out\n"); return -ETIMEDOUT; } From eddfe53b0d84b02802432bf783ccd53b095dd10a Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 27 Aug 2025 13:10:42 +0300 Subject: [PATCH 1542/3206] i2c: core: Drop dev_pm_domain_detach() call Starting with commit f99508074e78 ("PM: domains: Detach on device_unbind_cleanup()"), there is no longer a need to call dev_pm_domain_detach() in the bus remove function. The device_unbind_cleanup() function now handles this to avoid invoking devres cleanup handlers while the PM domain is powered off, which could otherwise lead to failures as described in the above-mentioned commit. 
Drop the explicit dev_pm_domain_detach() call and rely instead on the flags passed to dev_pm_domain_attach() to power off the domain. Signed-off-by: Claudiu Beznea Reviewed-by: Ulf Hansson Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index ecca8c006b0203..ae7e9c8b65a65c 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -573,7 +573,8 @@ static int i2c_device_probe(struct device *dev) goto err_clear_wakeup_irq; do_power_on = !i2c_acpi_waive_d0_probe(dev); - status = dev_pm_domain_attach(&client->dev, do_power_on ? PD_FLAG_ATTACH_POWER_ON : 0); + status = dev_pm_domain_attach(&client->dev, PD_FLAG_DETACH_POWER_OFF | + (do_power_on ? PD_FLAG_ATTACH_POWER_ON : 0)); if (status) goto err_clear_wakeup_irq; @@ -581,7 +582,7 @@ static int i2c_device_probe(struct device *dev) GFP_KERNEL); if (!client->devres_group_id) { status = -ENOMEM; - goto err_detach_pm_domain; + goto err_clear_wakeup_irq; } client->debugfs = debugfs_create_dir(dev_name(&client->dev), @@ -608,8 +609,6 @@ static int i2c_device_probe(struct device *dev) err_release_driver_resources: debugfs_remove_recursive(client->debugfs); devres_release_group(&client->dev, client->devres_group_id); -err_detach_pm_domain: - dev_pm_domain_detach(&client->dev, do_power_on); err_clear_wakeup_irq: dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); @@ -636,8 +635,6 @@ static void i2c_device_remove(struct device *dev) devres_release_group(&client->dev, client->devres_group_id); - dev_pm_domain_detach(&client->dev, true); - dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); From 69329daf16af7d0ec189a760f28ea6a1ffb7f23a Mon Sep 17 00:00:00 2001 From: Akhil R Date: Mon, 18 Aug 2025 10:03:44 +0530 Subject: [PATCH 1543/3206] dt-bindings: i2c: nvidia,tegra20-i2c: Add Tegra256 I2C compatible Add compatible for Tegra256 I2C controllers. Tegra256 consists of 8 generic Tegra I2C controllers similar to previous generations. The parent clock frequency is different in these controllers and hence the timing parameter values are different from the previous ones. Signed-off-by: Akhil R Acked-by: Rob Herring (Arm) Acked-by: Thierry Reding Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml b/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml index 6b6f6762d122f9..32c3b69ccf3420 100644 --- a/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml @@ -80,6 +80,11 @@ properties: support for 64 KiB transactions whereas earlier chips supported no more than 4 KiB per transactions. const: nvidia,tegra194-i2c + - description: | + Tegra256 has 8 generic I2C controllers. The controllers are similar to + the previous generations, but have a different parent clock and hence + the timing parameters are configured differently. + const: nvidia,tegra256-i2c reg: maxItems: 1 @@ -186,6 +191,7 @@ allOf: contains: enum: - nvidia,tegra194-i2c + - nvidia,tegra256-i2c then: required: - resets From 6e3cb25e62f2081f19057f1abe62c014b8e814de Mon Sep 17 00:00:00 2001 From: Akhil R Date: Mon, 18 Aug 2025 10:03:45 +0530 Subject: [PATCH 1544/3206] i2c: tegra: Add Tegra256 support Add compatible and the hardware struct for Tegra256. 
Tegra256 controllers use a different parent clock. Hence the timing parameters are different from the previous generations to meet the expected frequencies. Signed-off-by: Akhil R Acked-by: Thierry Reding Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-tegra.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 4eb31b913c1a7d..e533460bccc39e 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -1649,7 +1649,33 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { .has_interface_timing_reg = true, }; +static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { + .has_continue_xfer_support = true, + .has_per_pkt_xfer_complete_irq = true, + .clk_divisor_hs_mode = 7, + .clk_divisor_std_mode = 0x7a, + .clk_divisor_fast_mode = 0x40, + .clk_divisor_fast_plus_mode = 0x19, + .has_config_load_reg = true, + .has_multi_master_mode = true, + .has_slcg_override_reg = true, + .has_mst_fifo = true, + .has_mst_reset = true, + .quirks = &tegra194_i2c_quirks, + .supports_bus_clear = true, + .has_apb_dma = false, + .tlow_std_mode = 0x8, + .thigh_std_mode = 0x7, + .tlow_fast_fastplus_mode = 0x3, + .thigh_fast_fastplus_mode = 0x3, + .setup_hold_time_std_mode = 0x08080808, + .setup_hold_time_fast_fast_plus_mode = 0x02020202, + .setup_hold_time_hs_mode = 0x090909, + .has_interface_timing_reg = true, +}; + static const struct of_device_id tegra_i2c_of_match[] = { + { .compatible = "nvidia,tegra256-i2c", .data = &tegra256_i2c_hw, }, { .compatible = "nvidia,tegra194-i2c", .data = &tegra194_i2c_hw, }, { .compatible = "nvidia,tegra186-i2c", .data = &tegra186_i2c_hw, }, #if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) From 201825fb4278accb6ace42915566c22391a0900d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 11 Sep 2025 15:43:15 +0100 Subject: [PATCH 1545/3206] net: ethtool: handle EOPNOTSUPP from ethtool get_ts_info() method Network drivers sometimes return -EOPNOTSUPP from their get_ts_info() method, and this should not cause the reporting of PHY timestamping information to be prohibited. Handle this error code, and also arrange for ethtool_net_get_ts_info_by_phc() to return -EOPNOTSUPP when the method is not implemented. This allows e.g. PHYs connected to DSA switches which support timestamping to report their timestamping capabilities. 
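A sketch of the resulting dispatch (the actual hunk is in the diff below): both "netdev has no get_ts_info method" and "netdev declines" now fall through to the PHY lookup.

	err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc);
	if (err == -ENODEV || err == -EOPNOTSUPP) {
		/* MAC has no answer: let an attached PHY report its caps */
		phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc);
		...
	}
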
Fixes: b9e3f7dc9ed9 ("net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology") Signed-off-by: Russell King (Oracle) Reviewed-by: Kory Maincent Link: https://patch.msgid.link/E1uwiW3-00000004jRF-3CnC@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- net/ethtool/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 4f58648a27ad63..92e6a681c797ec 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -905,7 +905,7 @@ int ethtool_net_get_ts_info_by_phc(struct net_device *dev, int err; if (!ops->get_ts_info) - return -ENODEV; + return -EOPNOTSUPP; /* Does ptp comes from netdev */ ethtool_init_tsinfo(info); @@ -973,7 +973,7 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev, int err; err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc); - if (err == -ENODEV) { + if (err == -ENODEV || err == -EOPNOTSUPP) { struct phy_device *phy; phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); From a5edf3550f4260504b7e0ab3d40d13ffe924b773 Mon Sep 17 00:00:00 2001 From: hupu Date: Wed, 10 Sep 2025 16:16:55 +0800 Subject: [PATCH 1546/3206] perf subcmd: avoid crash in exclude_cmds when excludes is empty When cross-compiling the perf tool for ARM64, `perf help` may crash with the following assertion failure: help.c:122: exclude_cmds: Assertion `cmds->names[ci] == NULL' failed. This happens when the perf binary is not named exactly "perf" or when multiple "perf-*" binaries exist in the same directory. In such cases, the `excludes` command list can be empty, which leads to the final assertion in exclude_cmds() being triggered. Add a simple guard at the beginning of exclude_cmds() to return early if excludes->cnt is zero, preventing the crash. Signed-off-by: hupu Reported-by: Guilherme Amadio Reviewed-by: Namhyung Kim Link: https://lore.kernel.org/r/20250909094953.106706-1-amadio@gentoo.org Signed-off-by: Namhyung Kim --- tools/lib/subcmd/help.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index 9ef569492560ef..ddaeb4eb3e2497 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -75,6 +75,9 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) size_t ci, cj, ei; int cmp; + if (!excludes->cnt) + return; + ci = cj = ei = 0; while (ci < cmds->cnt && ei < excludes->cnt) { cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); From 7947ad15614ce897f47ce8ae123b82445d1861d0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 12 Sep 2025 17:01:29 -0700 Subject: [PATCH 1547/3206] perf lock: Provide a host_env for session new When "perf lock con" is run in a live mode, with no data file, a host environment must be provided. Testing missed this as a failing assert was creating the 1 line of expected stderr output. $ sudo perf lock con -ab true perf: util/session.c:195: __perf_session__new: Assertion `host_env != NULL' failed. 
Aborted Fixes: 525a599badeeafba ("perf env: Remove global perf_env") Signed-off-by: Ian Rogers Signed-off-by: Namhyung Kim --- tools/perf/builtin-lock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index fd49703021fd26..078634461df270 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -2009,6 +2009,7 @@ static int __cmd_contention(int argc, const char **argv) .owner = show_lock_owner, .cgroups = RB_ROOT, }; + struct perf_env host_env; lockhash_table = calloc(LOCKHASH_SIZE, sizeof(*lockhash_table)); if (!lockhash_table) @@ -2024,7 +2025,10 @@ static int __cmd_contention(int argc, const char **argv) eops.mmap = perf_event__process_mmap; eops.tracing_data = perf_event__process_tracing_data; - session = perf_session__new(use_bpf ? NULL : &data, &eops); + perf_env__init(&host_env); + session = __perf_session__new(use_bpf ? NULL : &data, &eops, + /*trace_event_repipe=*/false, &host_env); + if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); err = PTR_ERR(session); @@ -2142,6 +2146,7 @@ static int __cmd_contention(int argc, const char **argv) evlist__delete(con.evlist); lock_contention_finish(&con); perf_session__delete(session); + perf_env__exit(&host_env); zfree(&lockhash_table); return err; } From 46834d90a9a13549264b9581067d8f746b4b36cc Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sat, 6 Sep 2025 14:21:45 +0200 Subject: [PATCH 1548/3206] crypto: ccp - Always pass in an error pointer to __sev_platform_shutdown_locked() When 9770b428b1a2 ("crypto: ccp - Move dev_info/err messages for SEV/SNP init and shutdown") moved the error messages dumping so that they don't need to be issued by the callers, it missed the case where __sev_firmware_shutdown() calls __sev_platform_shutdown_locked() with a NULL argument which leads to a NULL ptr deref on the shutdown path, during suspend to disk: #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 0 UID: 0 PID: 983 Comm: hib.sh Not tainted 6.17.0-rc4+ #1 PREEMPT(voluntary) Hardware name: Supermicro Super Server/H12SSL-i, BIOS 2.5 09/08/2022 RIP: 0010:__sev_platform_shutdown_locked.cold+0x0/0x21 [ccp] That rIP is: 00000000000006fd <__sev_platform_shutdown_locked.cold>: 6fd: 8b 13 mov (%rbx),%edx 6ff: 48 8b 7d 00 mov 0x0(%rbp),%rdi 703: 89 c1 mov %eax,%ecx Code: 74 05 31 ff 41 89 3f 49 8b 3e 89 ea 48 c7 c6 a0 8e 54 a0 41 bf 92 ff ff ff e8 e5 2e 09 e1 c6 05 2a d4 38 00 01 e9 26 af ff ff <8b> 13 48 8b 7d 00 89 c1 48 c7 c6 18 90 54 a0 89 44 24 04 e8 c1 2e RSP: 0018:ffffc90005467d00 EFLAGS: 00010282 RAX: 00000000ffffff92 RBX: 0000000000000000 RCX: 0000000000000000 ^^^^^^^^^^^^^^^^ and %rbx is nice and clean. Call Trace: __sev_firmware_shutdown.isra.0 sev_dev_destroy psp_dev_destroy sp_destroy pci_device_shutdown device_shutdown kernel_power_off hibernate.cold state_store kernfs_fop_write_iter vfs_write ksys_write do_syscall_64 entry_SYSCALL_64_after_hwframe Pass in a pointer to the function-local error var in the caller. With that addressed, suspending the ccp shows the error properly at least: ccp 0000:47:00.1: sev command 0x2 timed out, disabling PSP ccp 0000:47:00.1: SEV: failed to SHUTDOWN error 0x0, rc -110 SEV-SNP: Leaking PFN range 0x146800-0x146a00 SEV-SNP: PFN 0x146800 unassigned, dumping non-zero entries in 2M PFN region: [0x146800 - 0x146a00] ... 
ccp 0000:47:00.1: SEV-SNP firmware shutdown failed, rc -16, error 0x0 ACPI: PM: Preparing to enter system sleep state S5 kvm: exiting hardware virtualization reboot: Power down Btw, this driver is crying to be cleaned up to pass in a proper I/O struct which can be used to store information between the different functions, otherwise stuff like that will happen in the future again. Fixes: 9770b428b1a2 ("crypto: ccp - Move dev_info/err messages for SEV/SNP init and shutdown") Signed-off-by: Borislav Petkov (AMD) Cc: Reviewed-by: Ashish Kalra Acked-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index e058ba02779296..9f5ccc1720cbc1 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2430,7 +2430,7 @@ static void __sev_firmware_shutdown(struct sev_device *sev, bool panic) { int error; - __sev_platform_shutdown_locked(NULL); + __sev_platform_shutdown_locked(&error); if (sev_es_tmr) { /* From a0c17ed907ac3326cf3c9d6007ea222a746f5cc2 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 11 Sep 2025 13:14:06 +0000 Subject: [PATCH 1549/3206] iommu/amd: Fix alias device DTE setting Commit 7bea695ada0 restructured DTE flag handling but inadvertently changed the alias device configuration logic. This may cause incorrect DTE settings for certain devices. Add alias flag check before calling set_dev_entry_from_acpi(). Also move the device iteration loop inside the alias check to restrict execution to cases where alias devices are present. Fixes: 7bea695ada0 ("iommu/amd: Introduce struct ivhd_dte_flags to store persistent DTE flags") Cc: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 6795326a6524b8..ba9e582a8bbe5d 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1455,12 +1455,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, PCI_FUNC(e->devid)); devid = e->devid; - for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) + if (alias) { + for (dev_i = devid_start; dev_i <= devid; ++dev_i) pci_seg->alias_table[dev_i] = devid_to; + set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags); } set_dev_entry_from_acpi_range(iommu, devid_start, devid, flags, ext_flags); - set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags); break; case IVHD_DEV_SPECIAL: { u8 handle, type; From 07c24945cafc810bcce3ea6c336a6495e813b47f Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 12 Sep 2025 20:06:50 +0700 Subject: [PATCH 1550/3206] Revert "drm: Add directive to format code in comment" Commit 6cc44e9618f03f ("drm: Add directive to format code in comment") fixes original Sphinx indentation warning as introduced in 471920ce25d50b ("drm/gpuvm: Add locking helpers"), by means of using code-block:: directive. It semantically conflicts with earlier bb324f85f72284 ("drm/gpuvm: Wrap drm_gpuvm_sm_map_exec_lock() expected usage in literal code block") that did the same using double colon syntax instead. These duplicated literal code block directives causes the original warnings not being fixed. Revert 6cc44e9618f03f to keep things rolling without these warnings. 
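Roughly, these are the two forms that ended up stacked on the same comment (either one alone opens a single literal block in reStructuredText; nesting the directive inside an already-open literal block is what kept the warnings alive):

	/**
	 * The expected usage is::	<- double-colon shorthand (bb324f85f722)
	 *
	 * .. code-block:: c		<- explicit directive (6cc44e9618f0)
	 *
	 *	vm_bind {
	 *		struct drm_exec exec;
	 */
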
Fixes: 6cc44e9618f0 ("drm: Add directive to format code in comment") Fixes: 471920ce25d5 ("drm/gpuvm: Add locking helpers") Signed-off-by: Bagas Sanjaya Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Danilo Krummrich --- drivers/gpu/drm/drm_gpuvm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c index 60b672d3fd8353..8b8a3ca8d7fc1d 100644 --- a/drivers/gpu/drm/drm_gpuvm.c +++ b/drivers/gpu/drm/drm_gpuvm.c @@ -2432,8 +2432,6 @@ static const struct drm_gpuvm_ops lock_ops = { * * The expected usage is: * - * .. code-block:: c - * * vm_bind { * struct drm_exec exec; * From 91bf158f8cdf6fd344d3035a13ac746d5846de33 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sat, 13 Sep 2025 19:57:36 +0900 Subject: [PATCH 1551/3206] firewire: core: use macro expression for gap count mismatch The gap_count field is assigned to zero when mismatch is detected. In such case, the macro expression is preferable since it is easy to understand the situation. This commit applies the idea. Link: https://lore.kernel.org/r/20250913105737.778038-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 2 +- drivers/firewire/core-topology.c | 2 +- drivers/firewire/core.h | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 32cf6b3344cd90..bf2e7f55b83e60 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -425,7 +425,7 @@ static void bm_work(struct work_struct *work) */ card->bm_generation = generation; - if (card->gap_count == 0) { + if (card->gap_count == GAP_COUNT_MISMATCHED) { /* * If self IDs have inconsistent gap counts, do a * bus reset ASAP. The config rom read might never diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 74a6aa7d8cc92c..5f8fb1201d8016 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -241,7 +241,7 @@ static struct fw_node *build_tree(struct fw_card *card, const u32 *sid, int self // If PHYs report different gap counts, set an invalid count which will force a gap // count reconfiguration and a reset. if (phy_packet_self_id_zero_get_gap_count(self_id_sequence[0]) != gap_count) - gap_count = 0; + gap_count = GAP_COUNT_MISMATCHED; update_hop_count(node); diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 083e39034c378e..79eb57fd5812ac 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -27,6 +27,9 @@ struct fw_packet; /* -card */ +// This is the arbitrary value we use to indicate a mismatched gap count. +#define GAP_COUNT_MISMATCHED 0 + extern __printf(2, 3) void fw_err(const struct fw_card *card, const char *fmt, ...); extern __printf(2, 3) From 2ba08d1bad79cc8d9c82f529adc01f27118e0ca7 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sat, 13 Sep 2025 19:57:37 +0900 Subject: [PATCH 1552/3206] firewire: core: use macro expression for not-registered state of BUS_MANAGER_ID The value of BUS_MANAGER_ID register has 0x3f when no node_id is registered. Current implementation uses hard-coded numeric literal but in the case the macro expression is preferable since it is easy to distinguish the state from node ID mask. This commit applies the idea. 
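A sketch of the election this register implements (values as in the diff below): the lock request proposes the local node ID against the "nobody registered yet" reset value, and a compare-swap whose returned old value still reads 0x3f means the swap went through and this node is the bus manager.

	#define BUS_MANAGER_ID_NOT_REGISTERED	0x3f	/* reset value: no BM yet */

	__be32 lock_data[2] = {
		cpu_to_be32(BUS_MANAGER_ID_NOT_REGISTERED),	/* expected old value */
		cpu_to_be32(local_id),				/* proposed new manager */
	};
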
Link: https://lore.kernel.org/r/20250913105737.778038-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 12 ++++++++---- drivers/firewire/core.h | 3 +++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index bf2e7f55b83e60..adb90161c4c6ac 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -327,7 +327,7 @@ static void bm_work(struct work_struct *work) * next generation. */ __be32 data[2] = { - cpu_to_be32(0x3f), + cpu_to_be32(BUS_MANAGER_ID_NOT_REGISTERED), cpu_to_be32(local_id), }; struct fw_device *irm_device = fw_node_get_device(card->irm_node); @@ -372,10 +372,14 @@ static void bm_work(struct work_struct *work) if (rcode == RCODE_COMPLETE) { int bm_id = be32_to_cpu(data[0]); - if (generation == card->generation) - card->bm_node_id = bm_id == 0x3f ? local_id : 0xffc0 | bm_id; + if (generation == card->generation) { + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) + card->bm_node_id = 0xffc0 & bm_id; + else + card->bm_node_id = local_id; + } - if (bm_id != 0x3f) { + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) { spin_unlock_irq(&card->lock); // Somebody else is BM. Only act as IRM. diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 79eb57fd5812ac..9e68ebf0673d73 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -170,6 +170,9 @@ static inline void fw_iso_context_init_work(struct fw_iso_context *ctx, work_fun /* -topology */ +// The initial value of BUS_MANAGER_ID register, to express nothing registered. +#define BUS_MANAGER_ID_NOT_REGISTERED 0x3f + enum { FW_NODE_CREATED, FW_NODE_UPDATED, From 44a375e8aaff360944606b3bb2df89f1376cbaf3 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 1 Sep 2025 09:53:04 +0100 Subject: [PATCH 1553/3206] ARM: 9458/1: module: Ensure the override of module_arch_freeing_init() The function module_arch_freeing_init() defined in arch/arm/kernel/module.c is supposed to override a weak function of the same name defined in kernel/module/main.c. However, the ARM override is also marked as weak, which means that selecting the correct function unnecessarily depends on the order in which object files with both functions are passed to the linker. Although it happens to be correct at the moment, the proper pattern is to make the ARM override a strong definition. Fixes: cdcb07e45a91 ("ARM: 8975/1: module: fix handling of unwind init sections") Signed-off-by: Petr Pavlu Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index da488d92e7a00d..55ca3fcd37e860 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -484,7 +484,7 @@ module_arch_cleanup(struct module *mod) #endif } -void __weak module_arch_freeing_init(struct module *mod) +void module_arch_freeing_init(struct module *mod) { #ifdef CONFIG_ARM_UNWIND struct unwind_table *init = mod->arch.init_table; From 6eb350a2233100a283f882c023e5ad426d0ed63b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Aug 2025 17:02:30 +0200 Subject: [PATCH 1554/3206] rseq: Protect event mask against membarrier IPI rseq_need_restart() reads and clears task::rseq_event_mask with preemption disabled to guard against the scheduler. But membarrier() uses an IPI and sets the PREEMPT bit in the event mask from the IPI, which leaves that RMW operation unprotected. 
Use guard(irq) if CONFIG_MEMBARRIER is enabled to fix that. Fixes: 2a36ab717e8f ("rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ") Signed-off-by: Thomas Gleixner Reviewed-by: Boqun Feng Reviewed-by: Mathieu Desnoyers Cc: stable@vger.kernel.org --- include/linux/rseq.h | 11 ++++++++--- kernel/rseq.c | 10 +++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bc8af3eb559876..1fbeb61babeb8b 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,6 +7,12 @@ #include #include +#ifdef CONFIG_MEMBARRIER +# define RSEQ_EVENT_GUARD irq +#else +# define RSEQ_EVENT_GUARD preempt +#endif + /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. @@ -41,9 +47,8 @@ static inline void rseq_handle_notify_resume(struct ksignal *ksig, static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) + __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); rseq_handle_notify_resume(ksig, regs); } diff --git a/kernel/rseq.c b/kernel/rseq.c index b7a1ec327e8117..2452b7366b00e9 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -342,12 +342,12 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) /* * Load and clear event mask atomically with respect to - * scheduler preemption. + * scheduler preemption and membarrier IPIs. */ - preempt_disable(); - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) { + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + } return !!event_mask; } From a001cd248ab244633c5fabe4f7c707e13fc1d1cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Aug 2025 15:29:44 -0700 Subject: [PATCH 1555/3206] rseq/selftests: Use weak symbol reference, not definition, to link with glibc Add "extern" to the glibc-defined weak rseq symbols to convert the rseq selftest's usage from weak symbol definitions to weak symbol _references_. Effectively re-defining the glibc symbols wreaks havoc when building with -fno-common, e.g. generates segfaults when running multi-threaded programs, as dynamically linked applications end up with multiple versions of the symbols. Building with -fcommon, which until recently has the been the default for GCC and clang, papers over the bug by allowing the linker to resolve the weak/tentative definition to glibc's "real" definition. Note, the symbol itself (or rather its address), not the value of the symbol, is set to 0/NULL for unresolved weak symbol references, as the symbol doesn't exist and thus can't have a value. Check for a NULL rseq size pointer to handle the scenario where the test is statically linked against a libc that doesn't support rseq in any capacity. 
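The one-word difference, sketched: without "extern" the declaration is a tentative definition that can shadow glibc's symbol, while with it the symbol resolves to glibc's copy when present and the symbol's address is NULL otherwise, hence the added pointer check.

	__weak unsigned int __rseq_size;	/* weak definition: a second copy */
	extern __weak unsigned int __rseq_size;	/* weak reference: glibc's, or NULL */

	if (!libc_rseq_size_p || !*libc_rseq_size_p)
		/* symbol absent or zero: fall back to dlsym(RTLD_NEXT, ...) */
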
Fixes: 3bcbc20942db ("selftests/rseq: Play nice with binaries statically linked against glibc 2.35+") Reported-by: Thomas Gleixner Suggested-by: Florian Weimer Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Cc: stable@vger.kernel.org Closes: https://lore.kernel.org/all/87frdoybk4.ffs@tglx --- tools/testing/selftests/rseq/rseq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index 663a9cef1952f0..dcac5cbe793370 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -40,9 +40,9 @@ * Define weak versions to play nice with binaries that are statically linked * against a libc that doesn't support registering its own rseq. */ -__weak ptrdiff_t __rseq_offset; -__weak unsigned int __rseq_size; -__weak unsigned int __rseq_flags; +extern __weak ptrdiff_t __rseq_offset; +extern __weak unsigned int __rseq_size; +extern __weak unsigned int __rseq_flags; static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset; static const unsigned int *libc_rseq_size_p = &__rseq_size; @@ -209,7 +209,7 @@ void rseq_init(void) * libc not having registered a restartable sequence. Try to find the * symbols if that's the case. */ - if (!*libc_rseq_size_p) { + if (!libc_rseq_size_p || !*libc_rseq_size_p) { libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset"); libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size"); libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags"); From 98c6d259319ecf6e8d027abd3f14b81324b8c0ad Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:15:03 -0700 Subject: [PATCH 1556/3206] mm/gup: check ref_count instead of lru before migration Patch series "mm: better GUP pin lru_add_drain_all()", v2. Series of lru_add_drain_all()-related patches, arising from recent mm/gup migration report from Will Deacon. This patch (of 5): Will Deacon reports:- When taking a longterm GUP pin via pin_user_pages(), __gup_longterm_locked() tries to migrate target folios that should not be longterm pinned, for example because they reside in a CMA region or movable zone. This is done by first pinning all of the target folios anyway, collecting all of the longterm-unpinnable target folios into a list, dropping the pins that were just taken and finally handing the list off to migrate_pages() for the actual migration. It is critically important that no unexpected references are held on the folios being migrated, otherwise the migration will fail and pin_user_pages() will return -ENOMEM to its caller. Unfortunately, it is relatively easy to observe migration failures when running pKVM (which uses pin_user_pages() on crosvm's virtual address space to resolve stage-2 page faults from the guest) on a 6.15-based Pixel 6 device and this results in the VM terminating prematurely. In the failure case, 'crosvm' has called mlock(MLOCK_ONFAULT) on its mapping of guest memory prior to the pinning. Subsequently, when pin_user_pages() walks the page-table, the relevant 'pte' is not present and so the faulting logic allocates a new folio, mlocks it with mlock_folio() and maps it in the page-table. Since commit 2fbb0c10d1e8 ("mm/munlock: mlock_page() munlock_page() batch by pagevec"), mlock/munlock operations on a folio (formerly page), are deferred. 
For example, mlock_folio() takes an additional reference on the target folio before placing it into a per-cpu 'folio_batch' for later processing by mlock_folio_batch(), which drops the refcount once the operation is complete. Processing of the batches is coupled with the LRU batch logic and can be forcefully drained with lru_add_drain_all() but as long as a folio remains unprocessed on the batch, its refcount will be elevated. This deferred batching therefore interacts poorly with the pKVM pinning scenario as we can find ourselves in a situation where the migration code fails to migrate a folio due to the elevated refcount from the pending mlock operation. Hugh Dickins adds:- !folio_test_lru() has never been a very reliable way to tell if an lru_add_drain_all() is worth calling, to remove LRU cache references to make the folio migratable: the LRU flag may be set even while the folio is held with an extra reference in a per-CPU LRU cache. 5.18 commit 2fbb0c10d1e8 may have made it more unreliable. Then 6.11 commit 33dfe9204f29 ("mm/gup: clear the LRU flag of a page before adding to LRU batch") tried to make it reliable, by moving LRU flag clearing; but missed the mlock/munlock batches, so still unreliable as reported. And it turns out to be difficult to extend 33dfe9204f29's LRU flag clearing to the mlock/munlock batches: if they do benefit from batching, mlock/munlock cannot be so effective when easily suppressed while !LRU. Instead, switch to an expected ref_count check, which was more reliable all along: some more false positives (unhelpful drains) than before, and never a guarantee that the folio will prove migratable, but better. Note on PG_private_2: ceph and nfs are still using the deprecated PG_private_2 flag, with the aid of netfs and filemap support functions. Although it is consistently matched by an increment of folio ref_count, folio_expected_ref_count() intentionally does not recognize it, and ceph folio migration currently depends on that for PG_private_2 folios to be rejected. New references to the deprecated flag are discouraged, so do not add it into the collect_longterm_unpinnable_folios() calculation: but longterm pinning of transiently PG_private_2 ceph and nfs folios (an uncommon case) may invoke a redundant lru_add_drain_all(). And this makes easy the backport to earlier releases: up to and including 6.12, btrfs also used PG_private_2, but without a ref_count increment. Note for stable backports: requires 6.16 commit 86ebd50224c0 ("mm: add folio_expected_ref_count() for reference count calculation"). 
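To make the new heuristic concrete, a hedged sketch of the check being introduced (the helper name here is hypothetical; folio_expected_ref_count() is the 6.16 helper noted above):

  /* The folio *may* be sitting in a per-CPU LRU or mlock batch iff
   * something beyond the expected holders (mappings, swap/private
   * data, ...) plus our own freshly-taken GUP pin holds a reference.
   * More reliable than !folio_test_lru(), though still no guarantee
   * that a drain will make the folio migratable. */
  static bool may_have_batched_ref(struct folio *folio)
  {
          /* "+ 1" accounts for the reference the pin itself took */
          return folio_ref_count(folio) !=
                 folio_expected_ref_count(folio) + 1;
  }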
Link: https://lkml.kernel.org/r/41395944-b0e3-c3ac-d648-8ddd70451d28@google.com Link: https://lkml.kernel.org/r/bd1f314a-fca1-8f19-cac0-b936c9614557@google.com Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Signed-off-by: Hugh Dickins Reported-by: Will Deacon Closes: https://lore.kernel.org/linux-mm/20250815101858.24352-1-will@kernel.org/ Acked-by: Kiryl Shutsemau Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index adffe663594dc6..82aec6443c0afb 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2307,7 +2307,8 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (!folio_test_lru(folio) && drain_allow) { + if (drain_allow && folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); drain_allow = false; } From a09a8a1fbb374e0053b97306da9dbc05bd384685 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:16:53 -0700 Subject: [PATCH 1557/3206] mm/gup: local lru_add_drain() to avoid lru_add_drain_all() In many cases, if collect_longterm_unpinnable_folios() does need to drain the LRU cache to release a reference, the cache in question is on this same CPU, and much more efficiently drained by a preliminary local lru_add_drain(), than the later cross-CPU lru_add_drain_all(). Marked for stable, to counter the increase in lru_add_drain_all()s from "mm/gup: check ref_count instead of lru before migration". Note for clean backports: can take 6.16 commit a03db236aebf ("gup: optimize longterm pin_user_pages() for large folio") first. 
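Condensed, the escalation implemented by the diff below looks like this (may_have_batched_ref() is a hypothetical stand-in for the ref-count comparison from the previous patch):

  static void drain_if_needed(struct folio *folio, int *drained)
  {
          /* Cheap first: in many cases the stale reference lives in
           * this CPU's own caches. */
          if (*drained == 0 && may_have_batched_ref(folio)) {
                  lru_add_drain();        /* this CPU only */
                  *drained = 1;
          }
          /* Only if a reference still remains, pay for the cross-CPU
           * drain, and at most once per collection pass. */
          if (*drained == 1 && may_have_batched_ref(folio)) {
                  lru_add_drain_all();    /* IPIs every CPU */
                  *drained = 2;
          }
  }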
Link: https://lkml.kernel.org/r/66f2751f-283e-816d-9530-765db7edc465@google.com Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 82aec6443c0afb..b47066a54f5239 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2287,8 +2287,8 @@ static unsigned long collect_longterm_unpinnable_folios( struct pages_or_folios *pofs) { unsigned long collected = 0; - bool drain_allow = true; struct folio *folio; + int drained = 0; long i = 0; for (folio = pofs_get_folio(pofs, i); folio; @@ -2307,10 +2307,17 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (drain_allow && folio_ref_count(folio) != - folio_expected_ref_count(folio) + 1) { + if (drained == 0 && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { + lru_add_drain(); + drained = 1; + } + if (drained == 1 && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); - drain_allow = false; + drained = 2; } if (!folio_isolate_lru(folio)) From afb99e9f500485160f34b8cad6d3763ada3e80e8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:19:17 -0700 Subject: [PATCH 1558/3206] mm: revert "mm/gup: clear the LRU flag of a page before adding to LRU batch" This reverts commit 33dfe9204f29: now that collect_longterm_unpinnable_folios() is checking ref_count instead of lru, and mlock/munlock do not participate in the revised LRU flag clearing, those changes are misleading, and enlarge the window during which mlock/munlock may miss an mlock_count update. It is possible (I'd hesitate to claim probable) that the greater likelihood of missed mlock_count updates would explain the "Realtime threads delayed due to kcompactd0" observed on 6.12 in the Link below. If that is the case, this reversion will help; but a complete solution needs also a further patch, beyond the scope of this series. Included some 80-column cleanup around folio_batch_add_and_move(). The role of folio_test_clear_lru() (before taking per-memcg lru_lock) is questionable since 6.13 removed mem_cgroup_move_account() etc; but perhaps there are still some races which need it - not examined here. 
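As a reading aid for the mm/swap.c hunks below, a hedged paraphrase of where the revert moves the LRU-flag gate back to: it is now tested and cleared while draining a batch, not when adding to it.

  static void batch_move_sketch(struct folio_batch *fbatch, move_fn_t move_fn)
  {
          int i;

          for (i = 0; i < folio_batch_count(fbatch); i++) {
                  struct folio *folio = fbatch->folios[i];

                  /* Skip folios whose LRU flag is already clear:
                   * another path owns them at this moment.  lru_add
                   * is the exception, since folios being added are
                   * not on the LRU yet. */
                  if (move_fn != lru_add && !folio_test_clear_lru(folio))
                          continue;
                  /* ... relock the lruvec, then move_fn(lruvec, folio) ... */
          }
  }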
Link: https://lore.kernel.org/linux-mm/DU0PR01MB10385345F7153F334100981888259A@DU0PR01MB10385.eurprd01.prod.exchangelabs.com/ Link: https://lkml.kernel.org/r/05905d7b-ed14-68b1-79d8-bdec30367eba@google.com Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/swap.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 3632dd061bebb0..6ae2d5680574be 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -164,6 +164,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; + /* block memcg migration while the folio moves between lru */ + if (move_fn != lru_add && !folio_test_clear_lru(folio)) + continue; + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); @@ -176,14 +180,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, - struct folio *folio, move_fn_t move_fn, - bool on_lru, bool disable_irq) + struct folio *folio, move_fn_t move_fn, bool disable_irq) { unsigned long flags; - if (on_lru && !folio_test_clear_lru(folio)) - return; - folio_get(folio); if (disable_irq) @@ -191,8 +191,8 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, else local_lock(&cpu_fbatches.lock); - if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || - lru_cache_disabled()) + if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || + folio_test_large(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) @@ -201,13 +201,13 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, local_unlock(&cpu_fbatches.lock); } -#define folio_batch_add_and_move(folio, op, on_lru) \ - __folio_batch_add_and_move( \ - &cpu_fbatches.op, \ - folio, \ - op, \ - on_lru, \ - offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \ +#define folio_batch_add_and_move(folio, op) \ + __folio_batch_add_and_move( \ + &cpu_fbatches.op, \ + folio, \ + op, \ + offsetof(struct cpu_fbatches, op) >= \ + offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) @@ -231,10 +231,10 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || - folio_test_unevictable(folio)) + folio_test_unevictable(folio) || !folio_test_lru(folio)) return; - folio_batch_add_and_move(folio, lru_move_tail, true); + folio_batch_add_and_move(folio, lru_move_tail); } void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, @@ -328,10 +328,11 @@ static void folio_activate_drain(int cpu) void folio_activate(struct folio *folio) { - if (folio_test_active(folio) || folio_test_unevictable(folio)) + if (folio_test_active(folio) || folio_test_unevictable(folio) || + !folio_test_lru(folio)) return; - folio_batch_add_and_move(folio, lru_activate, 
true); + folio_batch_add_and_move(folio, lru_activate); } #else @@ -507,7 +508,7 @@ void folio_add_lru(struct folio *folio) lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); - folio_batch_add_and_move(folio, lru_add, false); + folio_batch_add_and_move(folio, lru_add); } EXPORT_SYMBOL(folio_add_lru); @@ -685,13 +686,13 @@ void lru_add_drain_cpu(int cpu) void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; - folio_batch_add_and_move(folio, lru_deactivate_file, true); + folio_batch_add_and_move(folio, lru_deactivate_file); } /* @@ -704,13 +705,13 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; - folio_batch_add_and_move(folio, lru_deactivate, true); + folio_batch_add_and_move(folio, lru_deactivate); } /** @@ -723,10 +724,11 @@ void folio_deactivate(struct folio *folio) void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || + !folio_test_lru(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; - folio_batch_add_and_move(folio, lru_lazyfree, true); + folio_batch_add_and_move(folio, lru_lazyfree); } void lru_add_drain(void) From 8d79ed36bfc83d0583ab72216b7980340478cdfb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:21:12 -0700 Subject: [PATCH 1559/3206] mm: revert "mm: vmscan.c: fix OOM on swap stress test" This reverts commit 0885ef470560: that was a fix to the reverted 33dfe9204f29b415bbc0abb1a50642d1ba94f5e9. Link: https://lkml.kernel.org/r/aa0e9d67-fbcd-9d79-88a1-641dfbe1d9d1@google.com Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a48aec8bfd9251..674999999cd067 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4507,7 +4507,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (!folio_test_lru(folio) || zone > sc->reclaim_idx) { + if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; From 2da6de30e60dd9bb14600eff1cc99df2fa2ddae3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:23:15 -0700 Subject: [PATCH 1560/3206] mm: folio_may_be_lru_cached() unless folio_test_large() mm/swap.c and mm/mlock.c agree to drain any per-CPU batch as soon as a large folio is added: so collect_longterm_unpinnable_folios() just wastes effort when calling lru_add_drain[_all]() on a large folio. 
But although there is good reason not to batch up PMD-sized folios, we might well benefit from batching a small number of low-order mTHPs (though unclear how that "small number" limitation will be implemented). So ask if folio_may_be_lru_cached() rather than !folio_test_large(), to insulate those particular checks from future change. Name preferred to "folio_is_batchable" because large folios can well be put on a batch: it's just the per-CPU LRU caches, drained much later, which need care. Marked for stable, to counter the increase in lru_add_drain_all()s from "mm/gup: check ref_count instead of lru before migration". Link: https://lkml.kernel.org/r/57d2eaf8-3607-f318-e0c5-be02dce61ad0@google.com Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Signed-off-by: Hugh Dickins Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 ++++++++++ mm/gup.c | 4 ++-- mm/mlock.c | 6 +++--- mm/swap.c | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe6ed2cc3fdfb..7012a0f758d84a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -385,6 +385,16 @@ void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); +static inline bool folio_may_be_lru_cached(struct folio *folio) +{ + /* + * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. + * Holding small numbers of low-order mTHP folios in per-CPU LRU cache + * will be sensible, but nobody has implemented and tested that yet. 
+ */ + return !folio_test_large(folio); + extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) diff --git a/mm/gup.c b/mm/gup.c index b47066a54f5239..0bc4d140fc07fb 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2307,13 +2307,13 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (drained == 0 && + if (drained == 0 && folio_may_be_lru_cached(folio) && folio_ref_count(folio) != folio_expected_ref_count(folio) + 1) { lru_add_drain(); drained = 1; } - if (drained == 1 && + if (drained == 1 && folio_may_be_lru_cached(folio) && folio_ref_count(folio) != folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); diff --git a/mm/mlock.c b/mm/mlock.c index a1d93ad33c6db5..bb0776f5ef7cad 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio) */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } diff --git a/mm/swap.c b/mm/swap.c index 6ae2d5680574be..b74ebe865dd92a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -192,7 +192,7 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, local_lock(&cpu_fbatches.lock); if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) From e6a0deb6fa5b0fc134ee2aa127d1cfc9456d8445 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 13:15:12 -0700 Subject: [PATCH 1561/3206] mm/damon/core: introduce damon_call_control->dealloc_on_cancel Patch series "mm/damon/sysfs: fix refresh_ms control overwriting on multi-kdamonds usages". The automatic essential DAMON/DAMOS status update feature of the DAMON sysfs interface (refresh_ms) is broken [1] for the multiple DAMON contexts (kdamonds) use case, since it uses a single global damon_call_control object for all created DAMON contexts. The fields of the object, particularly the list field, are over-written by the contexts, which causes unexpected results including user-space hangups and kernel crashes [2]. Fix it by extending damon_call_control for the use case and updating the DAMON sysfs interface to use a per-context, dynamically allocated damon_call_control object. This patch (of 2): When damon_call_control->repeat is set, damon_call() is executed asynchronously, and is eventually canceled when the kdamond finishes. If the damon_call_control object is dynamically allocated, finding the place to deallocate the object is difficult. Introduce a new damon_call_control field, namely dealloc_on_cancel, to ask the kdamond to deallocate such dynamically allocated objects when they are canceled.
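A sketch of the calling convention the new field enables (my_repeat_fn and my_data are hypothetical; the next patch in the series uses exactly this shape for refresh_ms handling):

  static int start_repeating_call(struct damon_ctx *ctx)
  {
          struct damon_call_control *control;

          control = kmalloc(sizeof(*control), GFP_KERNEL);
          if (!control)
                  return -ENOMEM;
          control->fn = my_repeat_fn;             /* hypothetical callback */
          control->data = my_data;
          control->repeat = true;
          control->dealloc_on_cancel = true;      /* kdamond kfree()s it */
          damon_call(ctx, control);
          /* No kfree() here: ownership passes to the kdamond, which
           * frees the object when the repeating call is canceled. */
          return 0;
  }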
Link: https://lkml.kernel.org/r/20250908201513.60802-3-sj@kernel.org Link: https://lkml.kernel.org/r/20250908201513.60802-2-sj@kernel.org Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work") Signed-off-by: SeongJae Park Cc: Yunjeong Mun Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index f13664c62ddda5..9e62b2a85538da 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -636,6 +636,7 @@ struct damon_operations { * @data: Data that will be passed to @fn. * @repeat: Repeat invocations. * @return_code: Return code from @fn invocation. + * @dealloc_on_cancel: De-allocate when canceled. * * Control damon_call(), which requests specific kdamond to invoke a given * function. Refer to damon_call() for more details. @@ -645,6 +646,7 @@ struct damon_call_control { void *data; bool repeat; int return_code; + bool dealloc_on_cancel; /* private: internal use only */ /* informs if the kdamond finished handling of the request */ struct completion completion; diff --git a/mm/damon/core.c b/mm/damon/core.c index c2e0b469fd4327..08065b36397224 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2479,10 +2479,14 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) mutex_lock(&ctx->call_controls_lock); list_del(&control->list); mutex_unlock(&ctx->call_controls_lock); - if (!control->repeat) + if (!control->repeat) { complete(&control->completion); - else + } else if (control->canceled && control->dealloc_on_cancel) { + kfree(control); + continue; + } else { list_add(&control->list, &repeat_controls); + } } control = list_first_entry_or_null(&repeat_controls, struct damon_call_control, list); From 04a06b139ec08aa63d7377f6d3e5218f8ddb1c5d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 13:15:13 -0700 Subject: [PATCH 1562/3206] mm/damon/sysfs: use dynamically allocated repeat mode damon_call_control The DAMON sysfs interface is using a single global repeat mode damon_call_control variable for refresh_ms handling, for all DAMON contexts. As a result, when there is more than one context, the single global damon_call_control is unexpectedly over-written (corrupted). Particularly the ->list field is overwritten by the multiple contexts and this can cause a user hangup, and/or a kernel crash. Fix it by using a dynamically allocated damon_call_control object per DAMON context.
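To make the failure mode concrete, a hedged sketch of the shape being removed: a single static control owns exactly one list node, so linking it into a second kdamond's call-control list corrupts the first (shared_control and broken_usage() are illustrative names):

  static struct damon_call_control shared_control = {
          .fn = damon_sysfs_repeat_call_fn,
          .repeat = true,
  };

  static void broken_usage(struct damon_ctx *ctx_a, struct damon_ctx *ctx_b)
  {
          damon_call(ctx_a, &shared_control);  /* linked into ctx_a's list */
          damon_call(ctx_b, &shared_control);  /* the same ->list node is
                                                * re-linked: ctx_a's list
                                                * is now corrupted */
  }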
Link: https://lkml.kernel.org/r/20250908201513.60802-3-sj@kernel.org Link: https://lore.kernel.org/20250904011738.930-1-yunjeong.mun@sk.com [1] Link: https://lore.kernel.org/20250905035411.39501-1-sj@kernel.org [2] Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work") Signed-off-by: SeongJae Park Reported-by: Yunjeong Mun Closes: https://lore.kernel.org/20250904011738.930-1-yunjeong.mun@sk.com Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 7b9254cadd5f87..c96c2154128faa 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1534,14 +1534,10 @@ static int damon_sysfs_repeat_call_fn(void *data) return 0; } -static struct damon_call_control damon_sysfs_repeat_call_control = { - .fn = damon_sysfs_repeat_call_fn, - .repeat = true, -}; - static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) { struct damon_ctx *ctx; + struct damon_call_control *repeat_call_control; int err; if (damon_sysfs_kdamond_running(kdamond)) @@ -1554,18 +1550,29 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) damon_destroy_ctx(kdamond->damon_ctx); kdamond->damon_ctx = NULL; + repeat_call_control = kmalloc(sizeof(*repeat_call_control), + GFP_KERNEL); + if (!repeat_call_control) + return -ENOMEM; + ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); - if (IS_ERR(ctx)) + if (IS_ERR(ctx)) { + kfree(repeat_call_control); return PTR_ERR(ctx); + } err = damon_start(&ctx, 1, false); if (err) { + kfree(repeat_call_control); damon_destroy_ctx(ctx); return err; } kdamond->damon_ctx = ctx; - damon_sysfs_repeat_call_control.data = kdamond; - damon_call(ctx, &damon_sysfs_repeat_call_control); + repeat_call_control->fn = damon_sysfs_repeat_call_fn; + repeat_call_control->data = kdamond; + repeat_call_control->repeat = true; + repeat_call_control->dealloc_on_cancel = true; + damon_call(ctx, repeat_call_control); return err; } From 615cd3705d204680f4ae8d0ad0dec8b778dc2753 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 8 Sep 2025 20:49:59 +0100 Subject: [PATCH 1563/3206] MAINTAINERS: add Jann Horn as rmap reviewer Jann has been an excellent contributor in all areas of memory management, and has demonstrated great expertise in the reverse mapping. It's therefore appropriate for him to become a reviewer. Link: https://lkml.kernel.org/r/20250908194959.820913-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Harry Yoo Acked-by: SeongJae Park Acked-by: Vlastimil Babka Acked-by: Liam R. Howlett Cc: Jann Horn Cc: Rik van Riel Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index fbdbf7c012a032..b71b8c6813aab9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16195,6 +16195,7 @@ R: Rik van Riel R: Liam R. Howlett R: Vlastimil Babka R: Harry Yoo +R: Jann Horn L: linux-mm@kvack.org S: Maintained F: include/linux/rmap.h From 72291a5a0ead8686e3cbfd35baa7d7b1cfdbf6ea Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 8 Sep 2025 18:48:57 +0800 Subject: [PATCH 1564/3206] MAINTAINERS: add Lance Yang as a THP reviewer I've been actively digging into the MM/THP subsystem for over a year now, and there's a real interest in contributing more and getting further involved. 
Well, missing out on any more cool THP things is really a pain ;) Link: https://lkml.kernel.org/r/20250908104857.35397-1-lance.yang@linux.dev Signed-off-by: Lance Yang Acked-by: David Hildenbrand Acked-by: Lorenzo Stoakes Acked-by: Zi Yan Acked-by: Barry Song Acked-by: Baolin Wang Cc: Liam R. Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b71b8c6813aab9..03b433441836e2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16240,6 +16240,7 @@ R: Nico Pache R: Ryan Roberts R: Dev Jain R: Barry Song +R: Lance Yang L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org From f826edeb888c5a8bd1b6e95ae6a50b0db2b21902 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 19:22:36 -0700 Subject: [PATCH 1565/3206] samples/damon/wsse: avoid starting DAMON before initialization Patch series "samples/damon: fix boot time enable handling fixup merge mistakes". The first three patches of the patch series "mm/damon: fix misc bugs in DAMON modules" [1] tried to fix boot time DAMON sample module enabling issues. The issue is that the modules can crash if they are enabled before DAMON is initialized, e.g. via boot time parameter options. The three patches fixed this by avoiding starting DAMON before the module initialization phase. However, probably due to a mistake during a merge, only half of the change was merged, and the part that avoids starting DAMON before the module is initialized was missed. So the problem is not solved, and the modules can still crash if enabled before DAMON is initialized. Fix this by applying the unmerged parts again. Note that the broken commits are merged into 6.17-rc1, but were also backported to relevant stable kernels. So this series also needs to be merged into the stable kernels. Hence Cc-ing stable@. This patch (of 3): Commit 0ed1165c3727 ("samples/damon/wsse: fix boot time enable handling") somehow incompletely applies the original patch [2]. It is missing the part that avoids starting DAMON before module initialization. Probably a mistake happened during a merge. Fix it by applying the missed part again. Link: https://lkml.kernel.org/r/20250909022238.2989-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250909022238.2989-2-sj@kernel.org Link: https://lkml.kernel.org/r/20250706193207.39810-1-sj@kernel.org [1] Link: https://lore.kernel.org/20250706193207.39810-2-sj@kernel.org [2] Fixes: 0ed1165c3727 ("samples/damon/wsse: fix boot time enable handling") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- samples/damon/wsse.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/samples/damon/wsse.c b/samples/damon/wsse.c index da052023b09907..21eaf15f987d47 100644 --- a/samples/damon/wsse.c +++ b/samples/damon/wsse.c @@ -118,6 +118,9 @@ static int damon_sample_wsse_enable_store( return 0; if (enabled) { + if (!init_called) + return 0; + err = damon_sample_wsse_start(); if (err) enabled = false; From e6b733ca2f99e968d696c2e812c8eb8e090bf37b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 19:22:37 -0700 Subject: [PATCH 1566/3206] samples/damon/prcl: avoid starting DAMON before initialization Commit 2780505ec2b4 ("samples/damon/prcl: fix boot time enable crash") somehow incompletely applies the original patch [1]. It is missing the part that avoids starting DAMON before module initialization. Probably a mistake happened during a merge. Fix it by applying the missed part again.
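A composite sketch of the guard being restored in all three samples (damon_sample_start() stands in for the per-sample starter; init_called is set at the end of each sample's module_init()):

  static bool init_called;
  static bool enabled;

  static int damon_sample_enable_store(const char *val,
                                       const struct kernel_param *kp)
  {
          int err = param_set_bool(val, kp);

          if (err)
                  return err;
          if (enabled) {
                  /* Boot-time parameter writes run before module_init():
                   * remember the request, but defer the actual start,
                   * which would crash while DAMON is uninitialized. */
                  if (!init_called)
                          return 0;
                  err = damon_sample_start();     /* hypothetical starter */
                  if (err)
                          enabled = false;
          }
          return err;
  }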
Link: https://lkml.kernel.org/r/20250909022238.2989-3-sj@kernel.org Link: https://lore.kernel.org/20250706193207.39810-3-sj@kernel.org [1] Fixes: 2780505ec2b4 ("samples/damon/prcl: fix boot time enable crash") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- samples/damon/prcl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c index 1b839c06a612f2..0226652f94d55e 100644 --- a/samples/damon/prcl.c +++ b/samples/damon/prcl.c @@ -137,6 +137,9 @@ static int damon_sample_prcl_enable_store( if (enabled == is_enabled) return 0; + if (!init_called) + return 0; + if (enabled) { err = damon_sample_prcl_start(); if (err) From c62cff40481c037307a13becbda795f7afdcfebd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 19:22:38 -0700 Subject: [PATCH 1567/3206] samples/damon/mtier: avoid starting DAMON before initialization Commit 964314344eab ("samples/damon/mtier: support boot time enable setup") somehow incompletely applies the original patch [1]. It is missing the part that avoids starting DAMON before module initialization. Probably a mistake happened during a merge. Fix it by applying the missed part again. Link: https://lkml.kernel.org/r/20250909022238.2989-4-sj@kernel.org Link: https://lore.kernel.org/20250706193207.39810-4-sj@kernel.org [1] Fixes: 964314344eab ("samples/damon/mtier: support boot time enable setup") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- samples/damon/mtier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/samples/damon/mtier.c b/samples/damon/mtier.c index 7ebd352138e4f2..beaf36657deacc 100644 --- a/samples/damon/mtier.c +++ b/samples/damon/mtier.c @@ -208,6 +208,9 @@ static int damon_sample_mtier_enable_store( if (enabled == is_enabled) return 0; + if (!init_called) + return 0; + if (enabled) { err = damon_sample_mtier_start(); if (err) From 025e87f8ea2ae3a28bf1fe2b052bfa412c27ed4a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 6 Sep 2025 23:43:34 +0900 Subject: [PATCH 1568/3206] nilfs2: fix CFI failure when accessing /sys/fs/nilfs2/features/* When accessing one of the files under /sys/fs/nilfs2/features when CONFIG_CFI_CLANG is enabled, there is a CFI violation: CFI failure at kobj_attr_show+0x59/0x80 (target: nilfs_feature_revision_show+0x0/0x30; expected type: 0xfc392c4d) ... Call Trace: sysfs_kf_seq_show+0x2a6/0x390 ? __cfi_kobj_attr_show+0x10/0x10 kernfs_seq_show+0x104/0x15b seq_read_iter+0x580/0xe2b ... When the kobject of the kset for /sys/fs/nilfs2 is initialized, its ktype is set to kset_ktype, which has a ->sysfs_ops of kobj_sysfs_ops. When nilfs_feature_attr_group is added to that kobject via sysfs_create_group(), the kernfs_ops of each file is sysfs_file_kfops_rw, which will call sysfs_kf_seq_show() when ->seq_show() is called. sysfs_kf_seq_show() in turn calls kobj_attr_show() through ->sysfs_ops->show(). kobj_attr_show() casts the provided attribute out to a 'struct kobj_attribute' via container_of() and calls ->show(), resulting in the CFI violation since neither nilfs_feature_revision_show() nor nilfs_feature_README_show() match the prototype of ->show() in 'struct kobj_attribute'. Resolve the CFI violation by adjusting the second parameter in nilfs_feature_{revision,README}_show() from 'struct attribute' to 'struct kobj_attribute' to match the expected prototype.
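For readers unfamiliar with the kCFI failure mode, a simplified paraphrase of what kobj_attr_show() does (this function body is a sketch, not the exact kernel source):

  static ssize_t kobj_attr_show_sketch(struct kobject *kobj,
                                       struct attribute *attr, char *buf)
  {
          struct kobj_attribute *kattr =
                  container_of(attr, struct kobj_attribute, attr);

          if (!kattr->show)
                  return -EIO;
          /* Indirect call: with CONFIG_CFI_CLANG the callee must have
           * exactly the type
           *   ssize_t (*)(struct kobject *, struct kobj_attribute *, char *);
           * a handler declared with 'struct attribute *' as its second
           * parameter has a different type hash, so the call traps. */
          return kattr->show(kobj, kattr, buf);
  }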
Link: https://lkml.kernel.org/r/20250906144410.22511-1-konishi.ryusuke@gmail.com Fixes: aebe17f68444 ("nilfs2: add /sys/fs/nilfs2/features group") Signed-off-by: Nathan Chancellor Signed-off-by: Ryusuke Konishi Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202509021646.bc78d9ef-lkp@intel.com/ Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/sysfs.c | 4 ++-- fs/nilfs2/sysfs.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 14868a3dd592ca..bc52afbfc5c739 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) ************************************************************************/ static ssize_t nilfs_feature_revision_show(struct kobject *kobj, - struct attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d.%d\n", NILFS_CURRENT_REV, NILFS_MINOR_REV); @@ -1087,7 +1087,7 @@ static const char features_readme_str[] = "(1) revision\n\tshow current revision of NILFS file system driver.\n"; static ssize_t nilfs_feature_README_show(struct kobject *kobj, - struct attribute *attr, + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, features_readme_str); diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h index 78a87a016928b7..d370cd5cce3f5d 100644 --- a/fs/nilfs2/sysfs.h +++ b/fs/nilfs2/sysfs.h @@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups { struct completion sg_segments_kobj_unregister; }; -#define NILFS_COMMON_ATTR_STRUCT(name) \ +#define NILFS_KOBJ_ATTR_STRUCT(name) \ struct nilfs_##name##_attr { \ struct attribute attr; \ - ssize_t (*show)(struct kobject *, struct attribute *, \ + ssize_t (*show)(struct kobject *, struct kobj_attribute *, \ char *); \ - ssize_t (*store)(struct kobject *, struct attribute *, \ + ssize_t (*store)(struct kobject *, struct kobj_attribute *, \ const char *, size_t); \ } -NILFS_COMMON_ATTR_STRUCT(feature); +NILFS_KOBJ_ATTR_STRUCT(feature); #define NILFS_DEV_ATTR_STRUCT(name) \ struct nilfs_##name##_attr { \ From 30989f67650cbf8dc763f7c22e3a210f70a8d7d0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 10 Sep 2025 16:25:27 +0200 Subject: [PATCH 1569/3206] MAINTAINERS: Input: Drop melfas-mip4 section Emails to the sole melfas-mip4 driver maintainer bounce: 550 No such user here (connected from melfas.com) so clearly this is not a supported driver anymore. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250910142526.105286-2-krzysztof.kozlowski@linaro.org Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 7 ------- 1 file changed, 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ecca2a4f96fdea..6d6dd9f1316c8a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15523,13 +15523,6 @@ S: Supported W: http://www.melexis.com F: drivers/iio/temperature/mlx90635.c -MELFAS MIP4 TOUCHSCREEN DRIVER -M: Sangwon Jee -S: Supported -W: http://www.melfas.com -F: Documentation/devicetree/bindings/input/touchscreen/melfas_mip4.txt -F: drivers/input/touchscreen/melfas_mip4.c - MELLANOX BLUEFIELD I2C DRIVER M: Khalil Blaiech M: Asmaa Mnebhi From 918856986014142271a70a334d300994b9c41720 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Thu, 28 Aug 2025 08:35:57 +0000 Subject: [PATCH 1570/3206] platform/chrome: Centralize cros_ec_device allocation Introduce a helper function, cros_ec_device_alloc(), to centralize the allocation of the struct cros_ec_device. Convert all protocol device drivers to use this new function. 
This is a preparatory step for separating common initialization logic out of device drivers' probe() and cros_ec_register(). Link: https://lore.kernel.org/r/20250828083601.856083-2-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec.c | 12 ++++++++++++ drivers/platform/chrome/cros_ec.h | 3 +++ drivers/platform/chrome/cros_ec_i2c.c | 4 ++-- drivers/platform/chrome/cros_ec_ishtp.c | 2 +- drivers/platform/chrome/cros_ec_lpc.c | 2 +- drivers/platform/chrome/cros_ec_rpmsg.c | 2 +- drivers/platform/chrome/cros_ec_spi.c | 2 +- drivers/platform/chrome/cros_ec_uart.c | 2 +- 8 files changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c index b1730362e399c7..25283a148ab9c0 100644 --- a/drivers/platform/chrome/cros_ec.c +++ b/drivers/platform/chrome/cros_ec.c @@ -30,6 +30,18 @@ static struct cros_ec_platform pd_p = { .cmd_offset = EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX), }; +struct cros_ec_device *cros_ec_device_alloc(struct device *dev) +{ + struct cros_ec_device *ec_dev; + + ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + if (!ec_dev) + return NULL; + + return ec_dev; +} +EXPORT_SYMBOL(cros_ec_device_alloc); + /** * cros_ec_irq_handler() - top half part of the interrupt handler * @irq: IRQ id diff --git a/drivers/platform/chrome/cros_ec.h b/drivers/platform/chrome/cros_ec.h index 6b95f1e0bace3d..cd4643bc536710 100644 --- a/drivers/platform/chrome/cros_ec.h +++ b/drivers/platform/chrome/cros_ec.h @@ -11,6 +11,9 @@ #include struct cros_ec_device; +struct device; + +struct cros_ec_device *cros_ec_device_alloc(struct device *dev); int cros_ec_register(struct cros_ec_device *ec_dev); void cros_ec_unregister(struct cros_ec_device *ec_dev); diff --git a/drivers/platform/chrome/cros_ec_i2c.c b/drivers/platform/chrome/cros_ec_i2c.c index 38af97cdaab229..ee3c5130ec3f7f 100644 --- a/drivers/platform/chrome/cros_ec_i2c.c +++ b/drivers/platform/chrome/cros_ec_i2c.c @@ -289,10 +289,10 @@ static int cros_ec_cmd_xfer_i2c(struct cros_ec_device *ec_dev, static int cros_ec_i2c_probe(struct i2c_client *client) { struct device *dev = &client->dev; - struct cros_ec_device *ec_dev = NULL; + struct cros_ec_device *ec_dev; int err; - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; diff --git a/drivers/platform/chrome/cros_ec_ishtp.c b/drivers/platform/chrome/cros_ec_ishtp.c index 7e7190b30cbb97..c102a796670c64 100644 --- a/drivers/platform/chrome/cros_ec_ishtp.c +++ b/drivers/platform/chrome/cros_ec_ishtp.c @@ -543,7 +543,7 @@ static int cros_ec_dev_init(struct ishtp_cl_data *client_data) struct cros_ec_device *ec_dev; struct device *dev = cl_data_to_dev(client_data); - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c index 7d9a78289c9626..30fa89b81666cf 100644 --- a/drivers/platform/chrome/cros_ec_lpc.c +++ b/drivers/platform/chrome/cros_ec_lpc.c @@ -637,7 +637,7 @@ static int cros_ec_lpc_probe(struct platform_device *pdev) } } - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; diff --git a/drivers/platform/chrome/cros_ec_rpmsg.c b/drivers/platform/chrome/cros_ec_rpmsg.c index bc2666491db1f1..9ac2b923db6ded 100644 --- a/drivers/platform/chrome/cros_ec_rpmsg.c +++ 
b/drivers/platform/chrome/cros_ec_rpmsg.c @@ -216,7 +216,7 @@ static int cros_ec_rpmsg_probe(struct rpmsg_device *rpdev) struct cros_ec_device *ec_dev; int ret; - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c index 8ca0f854e7ac56..c778300a4145d4 100644 --- a/drivers/platform/chrome/cros_ec_spi.c +++ b/drivers/platform/chrome/cros_ec_spi.c @@ -749,7 +749,7 @@ static int cros_ec_spi_probe(struct spi_device *spi) if (ec_spi == NULL) return -ENOMEM; ec_spi->spi = spi; - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c index 19c179d49c90df..1a7511b1bbe378 100644 --- a/drivers/platform/chrome/cros_ec_uart.c +++ b/drivers/platform/chrome/cros_ec_uart.c @@ -259,7 +259,7 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) if (!ec_uart) return -ENOMEM; - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; From e19ceeb1c0f63e3e15b197c5f34797134b51ba0e Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Thu, 28 Aug 2025 08:35:58 +0000 Subject: [PATCH 1571/3206] platform/chrome: Centralize common cros_ec_device initialization Move the common initialization from protocol device drivers into central cros_ec_device_alloc(). This removes duplicated code from each driver's probe function. The buffer sizes are now calculated once, using the maximum possible overhead required by any of the transport protocols, ensuring the allocated buffers are sufficient for all cases. 
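The heart of the change, paraphrased as a helper for clarity (the real code inlines this in cros_ec_device_alloc(); the overhead comments follow the table added to cros_ec_proto.h below):

  static void cros_ec_size_buffers(struct cros_ec_device *ec_dev)
  {
          /* Worst-case protocol payload plus the largest per-transport
           * overhead, so one allocation fits every transport. */
          ec_dev->din_size = sizeof(struct ec_host_response) +
                             sizeof(struct ec_response_get_protocol_info) +
                             EC_MAX_RESPONSE_OVERHEAD;  /* 32: SPI preamble */
          ec_dev->dout_size = sizeof(struct ec_host_request) +
                              sizeof(struct ec_params_rwsig_action) +
                              EC_MAX_REQUEST_OVERHEAD;  /* now 4: ISHTP header */
  }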
Link: https://lore.kernel.org/r/20250828083601.856083-3-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec.c | 9 +++++++++ drivers/platform/chrome/cros_ec_i2c.c | 5 ----- drivers/platform/chrome/cros_ec_ishtp.c | 4 ---- drivers/platform/chrome/cros_ec_lpc.c | 4 ---- drivers/platform/chrome/cros_ec_rpmsg.c | 4 ---- drivers/platform/chrome/cros_ec_spi.c | 5 ----- drivers/platform/chrome/cros_ec_uart.c | 4 ---- include/linux/platform_data/cros_ec_proto.h | 14 ++++++++++---- 8 files changed, 19 insertions(+), 30 deletions(-) diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c index 25283a148ab9c0..da049068b6e9e1 100644 --- a/drivers/platform/chrome/cros_ec.c +++ b/drivers/platform/chrome/cros_ec.c @@ -38,6 +38,15 @@ struct cros_ec_device *cros_ec_device_alloc(struct device *dev) if (!ec_dev) return NULL; + ec_dev->din_size = sizeof(struct ec_host_response) + + sizeof(struct ec_response_get_protocol_info) + + EC_MAX_RESPONSE_OVERHEAD; + ec_dev->dout_size = sizeof(struct ec_host_request) + + sizeof(struct ec_params_rwsig_action) + + EC_MAX_REQUEST_OVERHEAD; + + ec_dev->dev = dev; + return ec_dev; } EXPORT_SYMBOL(cros_ec_device_alloc); diff --git a/drivers/platform/chrome/cros_ec_i2c.c b/drivers/platform/chrome/cros_ec_i2c.c index ee3c5130ec3f7f..def1144a077ea6 100644 --- a/drivers/platform/chrome/cros_ec_i2c.c +++ b/drivers/platform/chrome/cros_ec_i2c.c @@ -297,16 +297,11 @@ static int cros_ec_i2c_probe(struct i2c_client *client) return -ENOMEM; i2c_set_clientdata(client, ec_dev); - ec_dev->dev = dev; ec_dev->priv = client; ec_dev->irq = client->irq; ec_dev->cmd_xfer = cros_ec_cmd_xfer_i2c; ec_dev->pkt_xfer = cros_ec_pkt_xfer_i2c; ec_dev->phys_name = client->adapter->name; - ec_dev->din_size = sizeof(struct ec_host_response_i2c) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request_i2c) + - sizeof(struct ec_params_rwsig_action); err = cros_ec_register(ec_dev); if (err) { diff --git a/drivers/platform/chrome/cros_ec_ishtp.c b/drivers/platform/chrome/cros_ec_ishtp.c index c102a796670c64..4e74e702c5a24d 100644 --- a/drivers/platform/chrome/cros_ec_ishtp.c +++ b/drivers/platform/chrome/cros_ec_ishtp.c @@ -550,14 +550,10 @@ static int cros_ec_dev_init(struct ishtp_cl_data *client_data) client_data->ec_dev = ec_dev; dev->driver_data = ec_dev; - ec_dev->dev = dev; ec_dev->priv = client_data->cros_ish_cl; ec_dev->cmd_xfer = NULL; ec_dev->pkt_xfer = cros_ec_pkt_xfer_ish; ec_dev->phys_name = dev_name(dev); - ec_dev->din_size = sizeof(struct cros_ish_in_msg) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct cros_ish_out_msg) + sizeof(struct ec_params_rwsig_action); return cros_ec_register(ec_dev); } diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c index 30fa89b81666cf..78cfff80cdeafe 100644 --- a/drivers/platform/chrome/cros_ec_lpc.c +++ b/drivers/platform/chrome/cros_ec_lpc.c @@ -642,14 +642,10 @@ static int cros_ec_lpc_probe(struct platform_device *pdev) return -ENOMEM; platform_set_drvdata(pdev, ec_dev); - ec_dev->dev = dev; ec_dev->phys_name = dev_name(dev); ec_dev->cmd_xfer = cros_ec_cmd_xfer_lpc; ec_dev->pkt_xfer = cros_ec_pkt_xfer_lpc; ec_dev->cmd_readmem = cros_ec_lpc_readmem; - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); ec_dev->priv = ec_lpc; /* diff --git 
a/drivers/platform/chrome/cros_ec_rpmsg.c b/drivers/platform/chrome/cros_ec_rpmsg.c index 9ac2b923db6ded..09bd9e49464e62 100644 --- a/drivers/platform/chrome/cros_ec_rpmsg.c +++ b/drivers/platform/chrome/cros_ec_rpmsg.c @@ -224,14 +224,10 @@ static int cros_ec_rpmsg_probe(struct rpmsg_device *rpdev) if (!ec_rpmsg) return -ENOMEM; - ec_dev->dev = dev; ec_dev->priv = ec_rpmsg; ec_dev->cmd_xfer = cros_ec_cmd_xfer_rpmsg; ec_dev->pkt_xfer = cros_ec_pkt_xfer_rpmsg; ec_dev->phys_name = dev_name(&rpdev->dev); - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); dev_set_drvdata(dev, ec_dev); ec_rpmsg->rpdev = rpdev; diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c index c778300a4145d4..28fa82f8cb07e0 100644 --- a/drivers/platform/chrome/cros_ec_spi.c +++ b/drivers/platform/chrome/cros_ec_spi.c @@ -757,16 +757,11 @@ static int cros_ec_spi_probe(struct spi_device *spi) cros_ec_spi_dt_probe(ec_spi, dev); spi_set_drvdata(spi, ec_dev); - ec_dev->dev = dev; ec_dev->priv = ec_spi; ec_dev->irq = spi->irq; ec_dev->cmd_xfer = cros_ec_cmd_xfer_spi; ec_dev->pkt_xfer = cros_ec_pkt_xfer_spi; ec_dev->phys_name = dev_name(&ec_spi->spi->dev); - ec_dev->din_size = EC_MSG_PREAMBLE_COUNT + - sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); ec_spi->last_transfer_ns = ktime_get_ns(); diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c index 1a7511b1bbe378..d5b37414ff1245 100644 --- a/drivers/platform/chrome/cros_ec_uart.c +++ b/drivers/platform/chrome/cros_ec_uart.c @@ -276,14 +276,10 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) /* Initialize ec_dev for cros_ec */ ec_dev->phys_name = dev_name(dev); - ec_dev->dev = dev; ec_dev->priv = ec_uart; ec_dev->irq = ec_uart->irq; ec_dev->cmd_xfer = NULL; ec_dev->pkt_xfer = cros_ec_uart_pkt_xfer; - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); serdev_device_set_client_ops(serdev, &cros_ec_uart_client_ops); diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 3ec24f445c29cd..4d96cffbb9e32c 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -33,12 +33,18 @@ /* * Max bus-specific overhead incurred by request/responses. - * I2C requires 1 additional byte for requests. - * I2C requires 2 additional bytes for responses. - * SPI requires up to 32 additional bytes for responses. + * + * Request: + * - I2C requires 1 byte (see struct ec_host_request_i2c). + * - ISHTP requires 4 bytes (see struct cros_ish_out_msg). + * + * Response: + * - I2C requires 2 bytes (see struct ec_host_response_i2c). + * - ISHTP requires 4 bytes (see struct cros_ish_in_msg). + * - SPI requires 32 bytes (see EC_MSG_PREAMBLE_COUNT). 
*/ #define EC_PROTO_VERSION_UNKNOWN 0 -#define EC_MAX_REQUEST_OVERHEAD 1 +#define EC_MAX_REQUEST_OVERHEAD 4 #define EC_MAX_RESPONSE_OVERHEAD 32 /* From 7a79b0bfd8b3995a39d25bffcf57273635c0e542 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Thu, 28 Aug 2025 08:35:59 +0000 Subject: [PATCH 1572/3206] platform/chrome: cros_ec: Separate initialization from cros_ec_register() Move the initialization of the `struct cros_ec_device` from cros_ec_register() into cros_ec_device_alloc(). This decouples device initialization from registration. By doing so, the per-device lock is now available immediately after allocation, allowing it to be used safely even before the device is fully registered. Link: https://lore.kernel.org/r/20250828083601.856083-4-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec.c | 57 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c index da049068b6e9e1..61bcef8741db59 100644 --- a/drivers/platform/chrome/cros_ec.c +++ b/drivers/platform/chrome/cros_ec.c @@ -30,6 +30,14 @@ static struct cros_ec_platform pd_p = { .cmd_offset = EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX), }; +static void cros_ec_device_free(void *data) +{ + struct cros_ec_device *ec_dev = data; + + mutex_destroy(&ec_dev->lock); + lockdep_unregister_key(&ec_dev->lockdep_key); +} + struct cros_ec_device *cros_ec_device_alloc(struct device *dev) { struct cros_ec_device *ec_dev; @@ -45,7 +53,28 @@ struct cros_ec_device *cros_ec_device_alloc(struct device *dev) sizeof(struct ec_params_rwsig_action) + EC_MAX_REQUEST_OVERHEAD; + ec_dev->din = devm_kzalloc(dev, ec_dev->din_size, GFP_KERNEL); + if (!ec_dev->din) + return NULL; + + ec_dev->dout = devm_kzalloc(dev, ec_dev->dout_size, GFP_KERNEL); + if (!ec_dev->dout) + return NULL; + ec_dev->dev = dev; + ec_dev->max_response = sizeof(struct ec_response_get_protocol_info); + ec_dev->max_request = sizeof(struct ec_params_rwsig_action); + ec_dev->suspend_timeout_ms = EC_HOST_SLEEP_TIMEOUT_DEFAULT; + + BLOCKING_INIT_NOTIFIER_HEAD(&ec_dev->event_notifier); + BLOCKING_INIT_NOTIFIER_HEAD(&ec_dev->panic_notifier); + + lockdep_register_key(&ec_dev->lockdep_key); + mutex_init(&ec_dev->lock); + lockdep_set_class(&ec_dev->lock, &ec_dev->lockdep_key); + + if (devm_add_action_or_reset(dev, cros_ec_device_free, ec_dev)) + return NULL; return ec_dev; } @@ -200,29 +229,7 @@ static int cros_ec_ready_event(struct notifier_block *nb, int cros_ec_register(struct cros_ec_device *ec_dev) { struct device *dev = ec_dev->dev; - int err = 0; - - BLOCKING_INIT_NOTIFIER_HEAD(&ec_dev->event_notifier); - BLOCKING_INIT_NOTIFIER_HEAD(&ec_dev->panic_notifier); - - ec_dev->max_request = sizeof(struct ec_params_hello); - ec_dev->max_response = sizeof(struct ec_response_get_protocol_info); - ec_dev->max_passthru = 0; - ec_dev->ec = NULL; - ec_dev->pd = NULL; - ec_dev->suspend_timeout_ms = EC_HOST_SLEEP_TIMEOUT_DEFAULT; - - ec_dev->din = devm_kzalloc(dev, ec_dev->din_size, GFP_KERNEL); - if (!ec_dev->din) - return -ENOMEM; - - ec_dev->dout = devm_kzalloc(dev, ec_dev->dout_size, GFP_KERNEL); - if (!ec_dev->dout) - return -ENOMEM; - - lockdep_register_key(&ec_dev->lockdep_key); - mutex_init(&ec_dev->lock); - lockdep_set_class(&ec_dev->lock, &ec_dev->lockdep_key); + int err; /* Send RWSIG continue to jump to RW for devices using RWSIG. 
*/ err = cros_ec_rwsig_continue(ec_dev); @@ -322,8 +329,6 @@ int cros_ec_register(struct cros_ec_device *ec_dev) exit: platform_device_unregister(ec_dev->ec); platform_device_unregister(ec_dev->pd); - mutex_destroy(&ec_dev->lock); - lockdep_unregister_key(&ec_dev->lockdep_key); return err; } EXPORT_SYMBOL(cros_ec_register); @@ -343,8 +348,6 @@ void cros_ec_unregister(struct cros_ec_device *ec_dev) &ec_dev->notifier_ready); platform_device_unregister(ec_dev->pd); platform_device_unregister(ec_dev->ec); - mutex_destroy(&ec_dev->lock); - lockdep_unregister_key(&ec_dev->lockdep_key); } EXPORT_SYMBOL(cros_ec_unregister); From 56cb557279d70397cefb497e0f06bdd6fd685f8e Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Thu, 28 Aug 2025 08:36:00 +0000 Subject: [PATCH 1573/3206] platform/chrome: cros_ec: Add a flag to track registration state Introduce a `registered` flag to the `struct cros_ec_device` to allow callers to determine if the device has been fully registered and is ready for use. This is a preparatory step to prevent race conditions where other drivers might try to access the device before it is fully registered or after it has been unregistered. Link: https://lore.kernel.org/r/20250828083601.856083-5-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec.c | 7 +++++++ drivers/platform/chrome/cros_ec_proto.c | 15 +++++++++++++++ include/linux/platform_data/cros_ec_proto.h | 4 ++++ 3 files changed, 26 insertions(+) diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c index 61bcef8741db59..1da79e3d215bfa 100644 --- a/drivers/platform/chrome/cros_ec.c +++ b/drivers/platform/chrome/cros_ec.c @@ -9,6 +9,7 @@ * battery charging and regulator control, firmware update. */ +#include #include #include #include @@ -316,6 +317,9 @@ int cros_ec_register(struct cros_ec_device *ec_dev) goto exit; } + scoped_guard(mutex, &ec_dev->lock) + ec_dev->registered = true; + dev_info(dev, "Chrome EC device registered\n"); /* @@ -343,6 +347,9 @@ EXPORT_SYMBOL(cros_ec_register); */ void cros_ec_unregister(struct cros_ec_device *ec_dev) { + scoped_guard(mutex, &ec_dev->lock) + ec_dev->registered = false; + if (ec_dev->mkbp_event_supported) blocking_notifier_chain_unregister(&ec_dev->event_notifier, &ec_dev->notifier_ready); diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index 3e94a0a82173db..1d8d9168ec1aab 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -3,6 +3,7 @@ // // Copyright (C) 2015 Google, Inc +#include #include #include #include @@ -1153,5 +1154,19 @@ int cros_ec_get_cmd_versions(struct cros_ec_device *ec_dev, u16 cmd) } EXPORT_SYMBOL_GPL(cros_ec_get_cmd_versions); +/** + * cros_ec_device_registered - Return if the ec_dev is registered. + * + * @ec_dev: EC device + * + * Return: true if registered. Otherwise, false. + */ +bool cros_ec_device_registered(struct cros_ec_device *ec_dev) +{ + guard(mutex)(&ec_dev->lock); + return ec_dev->registered; +} +EXPORT_SYMBOL_GPL(cros_ec_device_registered); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ChromeOS EC communication protocol helpers"); diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 4d96cffbb9e32c..de14923720a53f 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -128,6 +128,7 @@ struct cros_ec_command { * @dout_size: Size of dout buffer to allocate (zero to use static dout). 
* @wake_enabled: True if this device can wake the system from sleep. * @suspended: True if this device had been suspended. + * @registered: True if this device had been registered. * @cmd_xfer: Send command to EC and get response. * Returns the number of bytes received if the communication * succeeded, but that doesn't mean the EC was happy with the @@ -186,6 +187,7 @@ struct cros_ec_device { int dout_size; bool wake_enabled; bool suspended; + bool registered; int (*cmd_xfer)(struct cros_ec_device *ec, struct cros_ec_command *msg); int (*pkt_xfer)(struct cros_ec_device *ec, @@ -278,6 +280,8 @@ int cros_ec_cmd_readmem(struct cros_ec_device *ec_dev, u8 offset, u8 size, void int cros_ec_get_cmd_versions(struct cros_ec_device *ec_dev, u16 cmd); +bool cros_ec_device_registered(struct cros_ec_device *ec_dev); + /** * cros_ec_get_time_ns() - Return time in ns. * From 48633acccf38d706d7b368400647bb9db9caf1ae Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Thu, 28 Aug 2025 08:36:01 +0000 Subject: [PATCH 1574/3206] Input: cros_ec_keyb - Defer probe until parent EC device is registered The `cros_ec_keyb` driver can be probed before the cros_ec_device has completed the registration. This creates a race condition where `cros_ec_keyb` might access uninitialized data. Fix this by calling `cros_ec_device_registered()` to check the parent's status. If the device is not yet ready, return -EPROBE_DEFER to ensure the probe is retried later. Acked-by: Dmitry Torokhov Link: https://lore.kernel.org/r/20250828083601.856083-6-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/input/keyboard/cros_ec_keyb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c index c1e53d87c8a759..f7209c8ebbccd1 100644 --- a/drivers/input/keyboard/cros_ec_keyb.c +++ b/drivers/input/keyboard/cros_ec_keyb.c @@ -705,6 +705,12 @@ static int cros_ec_keyb_probe(struct platform_device *pdev) ec = dev_get_drvdata(pdev->dev.parent); if (!ec) return -EPROBE_DEFER; + /* + * Even if the cros_ec_device pointer is available, still need to check + * if the device is fully registered before using it. + */ + if (!cros_ec_device_registered(ec)) + return -EPROBE_DEFER; ckdev = devm_kzalloc(dev, sizeof(*ckdev), GFP_KERNEL); if (!ckdev) From ec8f26092e525ee3cb1a56d455109fd40bfff3ef Mon Sep 17 00:00:00 2001 From: Donald Menig Date: Sun, 14 Sep 2025 09:43:33 +0200 Subject: [PATCH 1575/3206] ALSA: hda/realtek: Add ALC295 Dell TAS2781 I2C fixup This patch adds a new fixup for the ALC295 codec on some Dell laptops that use the TAS2781 I2C amplifier. The fixup correctly initializes the amplifier and pins, allowing sound to work on all speakers of these devices. The fixup chain is added to the relevant quirk entries for Dell Polaris models. 
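For readers unfamiliar with HDA fixup chaining, the entry added below has this shape (example_fixups is an illustrative array name): .chained = true makes the framework also apply the .chain_id fixup after .v.func runs, so the TAS2781 amplifier gets initialized and the existing dual-speaker pin setup still happens.

  static const struct hda_fixup example_fixups[] = {
          [ALC295_FIXUP_DELL_TAS2781_I2C] = {
                  .type = HDA_FIXUP_FUNC,
                  .v.func = tas2781_fixup_tias_i2c,  /* init the I2C amp */
                  .chained = true,
                  .chain_id = ALC289_FIXUP_DUAL_SPK, /* then run this one */
          },
  };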
[ adjusted for 6.17 kernel code by tiwai ] Fixes: 1e9c708dc3ae ("ALSA: hda/tas2781: Add new quirk for Lenovo, ASUS, Dell projects") Link: https://bugzilla.suse.com/show_bug.cgi?id=1249575 Signed-off-by: Donald Menig Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 85bb8c4d3b1704..59cc01079fee79 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -3702,6 +3702,7 @@ enum { ALC236_FIXUP_DELL_DUAL_CODECS, ALC287_FIXUP_CS35L41_I2C_2_THINKPAD_ACPI, ALC287_FIXUP_TAS2781_I2C, + ALC295_FIXUP_DELL_TAS2781_I2C, ALC245_FIXUP_TAS2781_SPI_2, ALC287_FIXUP_TXNW2781_I2C, ALC287_FIXUP_YOGA7_14ARB7_I2C, @@ -5167,6 +5168,12 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc294_fixup_gx502_hp, }, + [ALC295_FIXUP_DELL_TAS2781_I2C] = { + .type = HDA_FIXUP_FUNC, + .v.func = tas2781_fixup_tias_i2c, + .chained = true, + .chain_id = ALC289_FIXUP_DUAL_SPK + }, [ALC294_FIXUP_ASUS_GU502_PINS] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -6289,8 +6296,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0c1e, "Dell Precision 3540", ALC236_FIXUP_DELL_DUAL_CODECS), SND_PCI_QUIRK(0x1028, 0x0c28, "Dell Inspiron 16 Plus 7630", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS), SND_PCI_QUIRK(0x1028, 0x0c4d, "Dell", ALC287_FIXUP_CS35L41_I2C_4), - SND_PCI_QUIRK(0x1028, 0x0c94, "Dell Polaris 3 metal", ALC287_FIXUP_TAS2781_I2C), - SND_PCI_QUIRK(0x1028, 0x0c96, "Dell Polaris 2in1", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x1028, 0x0c94, "Dell Polaris 3 metal", ALC295_FIXUP_DELL_TAS2781_I2C), + SND_PCI_QUIRK(0x1028, 0x0c96, "Dell Polaris 2in1", ALC295_FIXUP_DELL_TAS2781_I2C), SND_PCI_QUIRK(0x1028, 0x0cbd, "Dell Oasis 13 CS MTL-U", ALC289_FIXUP_DELL_CS35L41_SPI_2), SND_PCI_QUIRK(0x1028, 0x0cbe, "Dell Oasis 13 2-IN-1 MTL-U", ALC289_FIXUP_DELL_CS35L41_SPI_2), SND_PCI_QUIRK(0x1028, 0x0cbf, "Dell Oasis 13 Low Weight MTU-L", ALC289_FIXUP_DELL_CS35L41_SPI_2), From 3b5eba544a8af6826209d34daf6197b79b1333bf Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sun, 4 Feb 2024 10:29:39 -0300 Subject: [PATCH 1576/3206] perf: make pmu_bus const Now that the driver core can properly handle constant struct bus_type, move the pmu_bus variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: "Ricardo B. 
Marliere" Acked-by: Mark Rutland Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240204-bus_cleanup-events-v1-1-c779d1639c3a@marliere.net Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 872122e074e5fe..155bb686c8ceba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12216,7 +12216,7 @@ static const struct attribute_group *pmu_dev_groups[] = { }; static int pmu_bus_running; -static struct bus_type pmu_bus = { +static const struct bus_type pmu_bus = { .name = "event_source", .dev_groups = pmu_dev_groups, }; From 2e7bba08923ebc675b1f0e0e0959e68e53047838 Mon Sep 17 00:00:00 2001 From: Anderson Nascimento Date: Thu, 11 Sep 2025 20:07:44 -0300 Subject: [PATCH 1577/3206] net/tcp: Fix a NULL pointer dereference when using TCP-AO with TCP_REPAIR A NULL pointer dereference can occur in tcp_ao_finish_connect() during a connect() system call on a socket with a TCP-AO key added and TCP_REPAIR enabled. The function is called with skb being NULL and attempts to dereference it on tcp_hdr(skb)->seq without a prior skb validation. Fix this by checking if skb is NULL before dereferencing it. The commentary is taken from bpf_skops_established(), which is also called in the same flow. Unlike the function being patched, bpf_skops_established() validates the skb before dereferencing it. int main(void){ struct sockaddr_in sockaddr; struct tcp_ao_add tcp_ao; int sk; int one = 1; memset(&sockaddr,'\0',sizeof(sockaddr)); memset(&tcp_ao,'\0',sizeof(tcp_ao)); sk = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); sockaddr.sin_family = AF_INET; memcpy(tcp_ao.alg_name,"cmac(aes128)",12); memcpy(tcp_ao.key,"ABCDEFGHABCDEFGH",16); tcp_ao.keylen = 16; memcpy(&tcp_ao.addr,&sockaddr,sizeof(sockaddr)); setsockopt(sk, IPPROTO_TCP, TCP_AO_ADD_KEY, &tcp_ao, sizeof(tcp_ao)); setsockopt(sk, IPPROTO_TCP, TCP_REPAIR, &one, sizeof(one)); sockaddr.sin_family = AF_INET; sockaddr.sin_port = htobe16(123); inet_aton("127.0.0.1", &sockaddr.sin_addr); connect(sk,(struct sockaddr *)&sockaddr,sizeof(sockaddr)); return 0; } $ gcc tcp-ao-nullptr.c -o tcp-ao-nullptr -Wall $ unshare -Urn BUG: kernel NULL pointer dereference, address: 00000000000000b6 PGD 1f648d067 P4D 1f648d067 PUD 1982e8067 PMD 0 Oops: Oops: 0000 [#1] SMP NOPTI Hardware name: VMware, Inc. 
VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 RIP: 0010:tcp_ao_finish_connect (net/ipv4/tcp_ao.c:1182) Fixes: 7c2ffaf21bd6 ("net/tcp: Calculate TCP-AO traffic keys") Signed-off-by: Anderson Nascimento Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911230743.2551-3-anderson@allelesecurity.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ao.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index bbb8d5f0eae7d3..3338b6cc85c487 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1178,7 +1178,9 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) if (!ao) return; - WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); ao->rcv_sne = 0; hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) From 70d99623d5c11e1a9bcc564b8fbad6fa916913d8 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 12 Sep 2025 11:33:31 +0200 Subject: [PATCH 1578/3206] dpll: fix clock quality level reporting The DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EPRC is not reported via netlink due to a bug in dpll_msg_add_clock_quality_level(). The usage of DPLL_CLOCK_QUALITY_LEVEL_MAX for both DECLARE_BITMAP() and for_each_set_bit() is not correct because these macros require the bitmap size, not the highest valid bit in the bitmap. Use the correct bitmap size to fix this issue. Fixes: a1afb959add1 ("dpll: add clock quality level attribute and op") Signed-off-by: Ivan Vecera Reviewed-by: Arkadiusz Kubalewski Link: https://patch.msgid.link/20250912093331.862333-1-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- drivers/dpll/dpll_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 036f21cac0a918..0a852011653c4c 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -211,8 +211,8 @@ static int dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll, struct netlink_ext_ack *extack) { + DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX + 1) = { 0 }; const struct dpll_device_ops *ops = dpll_device_ops(dpll); - DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) = { 0 }; enum dpll_clock_quality_level ql; int ret; @@ -221,7 +221,7 @@ dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll, ret = ops->clock_quality_level_get(dpll, dpll_priv(dpll), qls, extack); if (ret) return ret; - for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) + for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX + 1) if (nla_put_u32(msg, DPLL_A_CLOCK_QUALITY_LEVEL, ql)) return -EMSGSIZE; From 64863f4ca4945bdb62ce2b30823f39ea9fe95415 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 11 Sep 2025 23:58:16 +0100 Subject: [PATCH 1579/3206] rxrpc: Fix unhandled errors in rxgk_verify_packet_integrity() rxgk_verify_packet_integrity() may get more errors than just -EPROTO from rxgk_verify_mic_skb(). Pretty much anything other than -ENOMEM constitutes an unrecoverable error. In the case of -ENOMEM, we can just drop the packet and wait for a retransmission. The same happens with rxgk_decrypt_skb() and its callers. Fix rxgk_decrypt_skb() and rxgk_verify_mic_skb() to return a greater variety of abort codes and fix their callers to abort the connection on any error apart from -ENOMEM.
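As a sketch, the resulting caller pattern for both the verify and decrypt paths looks like this (condensed from the diff below):

    ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac);
    if (ret < 0) {
        if (ret != -ENOMEM)
            /* Unrecoverable: abort the connection with the code in 'ac'. */
            rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto);
        /* -ENOMEM: just drop the packet and wait for a retransmission. */
        goto error;
    }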
Also preclear the variables used to hold the abort code returned from rxgk_decrypt_skb() or rxgk_verify_mic_skb() to eliminate uninitialised variable warnings. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Reported-by: Dan Carpenter Closes: https://lists.infradead.org/pipermail/linux-afs/2025-April/009739.html Closes: https://lists.infradead.org/pipermail/linux-afs/2025-April/009740.html Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Reviewed-by: Simon Horman Link: https://patch.msgid.link/2038804.1757631496@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk.c | 18 ++++++++++-------- net/rxrpc/rxgk_app.c | 10 ++++++---- net/rxrpc/rxgk_common.h | 14 ++++++++++++-- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 1e19c605bcc829..dce5a3d8a964f8 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -475,7 +475,7 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, struct krb5_buffer metadata; unsigned int offset = sp->offset, len = sp->len; size_t data_offset = 0, data_len = len; - u32 ac; + u32 ac = 0; int ret = -ENOMEM; _enter(""); @@ -499,9 +499,10 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, skb, &offset, &len, &ac); kfree(hdr); - if (ret == -EPROTO) { - rxrpc_abort_eproto(call, skb, ac, - rxgk_abort_1_verify_mic_eproto); + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); } else { sp->offset = offset; sp->len = len; @@ -524,15 +525,16 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, struct rxgk_header hdr; unsigned int offset = sp->offset, len = sp->len; int ret; - u32 ac; + u32 ac = 0; _enter(""); ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); - if (ret == -EPROTO) - rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); - if (ret < 0) + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); goto error; + } if (len < sizeof(hdr)) { ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index b94b77a1c31780..df684b5a853141 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -187,7 +187,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, struct key *server_key; unsigned int ticket_offset, ticket_len; u32 kvno, enctype; - int ret, ec; + int ret, ec = 0; struct { __be32 kvno; @@ -236,9 +236,11 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, &ticket_offset, &ticket_len, &ec); crypto_free_aead(token_enc); token_enc = NULL; - if (ret < 0) - return rxrpc_abort_conn(conn, skb, ec, ret, - rxgk_abort_resp_tok_dec); + if (ret < 0) { + if (ret != -ENOMEM) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + } ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, ticket_len, _key); diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index 7370a56559853f..80164d89e19c03 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -88,11 +88,16 @@ int rxgk_decrypt_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch. 
*/ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */ default: + *_error_code = RXGK_INCONSISTENCY; break; } @@ -127,11 +132,16 @@ int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */ default: + *_error_code = RXGK_INCONSISTENCY; break; } From 2429a197648178cd4dc930a9d87c13c547460564 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Sep 2025 00:06:17 +0100 Subject: [PATCH 1580/3206] rxrpc: Fix untrusted unsigned subtract Fix the following Smatch static checker warning: net/rxrpc/rxgk_app.c:65 rxgk_yfs_decode_ticket() warn: untrusted unsigned subtract. 'ticket_len - 10 * 4' by prechecking the length of what we're trying to extract in two places in the token and decoding for a response packet. Also use sizeof() on the struct we're extracting rather specifying the size numerically to be consistent with the other related statements. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Reported-by: Dan Carpenter Closes: https://lists.infradead.org/pipermail/linux-afs/2025-September/010135.html Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Reviewed-by: Simon Horman Link: https://patch.msgid.link/2039268.1757631977@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk_app.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index df684b5a853141..30275cb5ba3e25 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -54,6 +54,10 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, _enter(""); + if (ticket_len < 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + /* Get the session key length */ ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); if (ret < 0) @@ -195,22 +199,23 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, __be32 token_len; } container; + if (token_len < sizeof(container)) + goto short_packet; + /* Decode the RXGK_TokenContainer object. This tells us which server * key we should be using. We can then fetch the key, get the secret * and set up the crypto to extract the token. */ if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + goto short_packet; kvno = ntohl(container.kvno); enctype = ntohl(container.enctype); ticket_len = ntohl(container.token_len); ticket_offset = token_offset + sizeof(container); - if (xdr_round_up(ticket_len) > token_len - 3 * 4) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + if (xdr_round_up(ticket_len) > token_len - sizeof(container)) + goto short_packet; _debug("KVNO %u", kvno); _debug("ENC %u", enctype); @@ -285,4 +290,8 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, * also come out this way if the ticket decryption fails. 
*/ return ret; + +short_packet: + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); } From af82e857df5dd883a4867bcaf5dde041e57a4e33 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Thu, 11 Sep 2025 18:36:10 -0400 Subject: [PATCH 1581/3206] octeon_ep: Validate the VF ID Add a helper to validate the VF ID and use it in the VF ndo ops to prevent accessing out-of-range entries. Without this check, users can run commands such as: # ip link show dev enp135s0 2: enp135s0: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000 link/ether 00:00:00:01:01:00 brd ff:ff:ff:ff:ff:ff vf 0 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff, spoof checking on, link-state enable, trust off vf 1 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff, spoof checking on, link-state enable, trust off # ip link set dev enp135s0 vf 4 mac 00:00:00:00:00:14 # echo $? 0 even though VF 4 does not exist, which results in silent success instead of returning an error. Fixes: 8a241ef9b9b8 ("octeon_ep: add ndo ops for VFs in PF driver") Signed-off-by: Kamal Heib Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250911223610.1803144-1-kheib@redhat.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeon_ep/octep_main.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index 24499bb36c0057..bcea3fc26a8c7d 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -1124,11 +1124,24 @@ static int octep_set_features(struct net_device *dev, netdev_features_t features return err; } +static bool octep_is_vf_valid(struct octep_device *oct, int vf) +{ + if (vf >= CFG_GET_ACTIVE_VFS(oct->conf)) { + netdev_err(oct->netdev, "Invalid VF ID %d\n", vf); + return false; + } + + return true; +} + static int octep_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi) { struct octep_device *oct = netdev_priv(dev); + if (!octep_is_vf_valid(oct, vf)) + return -EINVAL; + ivi->vf = vf; ether_addr_copy(ivi->mac, oct->vf_info[vf].mac_addr); ivi->spoofchk = true; @@ -1143,6 +1156,9 @@ static int octep_set_vf_mac(struct net_device *dev, int vf, u8 *mac) struct octep_device *oct = netdev_priv(dev); int err; + if (!octep_is_vf_valid(oct, vf)) + return -EINVAL; + if (!is_valid_ether_addr(mac)) { dev_err(&oct->pdev->dev, "Invalid MAC Address %pM\n", mac); return -EADDRNOTAVAIL; From f83ec76bf285bea5727f478a68b894f5543ca76e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Sep 2025 14:21:14 -0700 Subject: [PATCH 1582/3206] Linux 6.17-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cf37b94078211c..9771619ac596c8 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Baby Opossum Posse # *DOCUMENTATION* From 56c0a2a9ddc2f5b5078c5fb0f81ab76bbc3d4c37 Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Wed, 10 Sep 2025 16:29:16 +1000 Subject: [PATCH 1583/3206] qed: Don't collect too many protection override GRC elements In the protection override dump path, the firmware can return far too many GRC elements, resulting in attempting to write past the end of the previously-kmalloc'ed dump buffer. 
This will result in a kernel panic with reason: BUG: unable to handle kernel paging request at ADDRESS where "ADDRESS" is just past the end of the protection override dump buffer. The start address of the buffer is: p_hwfn->cdev->dbg_features[DBG_FEATURE_PROTECTION_OVERRIDE].dump_buf and the size of the buffer is buf_size in the same data structure. The panic can be arrived at from either the qede Ethernet driver path: [exception RIP: qed_grc_dump_addr_range+0x108] qed_protection_override_dump at ffffffffc02662ed [qed] qed_dbg_protection_override_dump at ffffffffc0267792 [qed] qed_dbg_feature at ffffffffc026aa8f [qed] qed_dbg_all_data at ffffffffc026b211 [qed] qed_fw_fatal_reporter_dump at ffffffffc027298a [qed] devlink_health_do_dump at ffffffff82497f61 devlink_health_report at ffffffff8249cf29 qed_report_fatal_error at ffffffffc0272baf [qed] qede_sp_task at ffffffffc045ed32 [qede] process_one_work at ffffffff81d19783 or the qedf storage driver path: [exception RIP: qed_grc_dump_addr_range+0x108] qed_protection_override_dump at ffffffffc068b2ed [qed] qed_dbg_protection_override_dump at ffffffffc068c792 [qed] qed_dbg_feature at ffffffffc068fa8f [qed] qed_dbg_all_data at ffffffffc0690211 [qed] qed_fw_fatal_reporter_dump at ffffffffc069798a [qed] devlink_health_do_dump at ffffffff8aa95e51 devlink_health_report at ffffffff8aa9ae19 qed_report_fatal_error at ffffffffc0697baf [qed] qed_hw_err_notify at ffffffffc06d32d7 [qed] qed_spq_post at ffffffffc06b1011 [qed] qed_fcoe_destroy_conn at ffffffffc06b2e91 [qed] qedf_cleanup_fcport at ffffffffc05e7597 [qedf] qedf_rport_event_handler at ffffffffc05e7bf7 [qedf] fc_rport_work at ffffffffc02da715 [libfc] process_one_work at ffffffff8a319663 Resolve this by clamping the firmware's return value to the maximum number of legal elements the firmware should return. Fixes: d52c89f120de8 ("qed*: Utilize FW 8.37.2.0") Signed-off-by: Jamie Bainbridge Link: https://patch.msgid.link/f8e1182934aa274c18d0682a12dbaf347595469c.1757485536.git.jamie.bainbridge@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 9c3d3dd2f84753..1f0cea3cae92f5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -4462,10 +4462,11 @@ static enum dbg_status qed_protection_override_dump(struct qed_hwfn *p_hwfn, goto out; } - /* Add override window info to buffer */ + /* Add override window info to buffer, preventing buffer overflow */ override_window_dwords = - qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * - PROTECTION_OVERRIDE_ELEMENT_DWORDS; + min(qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * + PROTECTION_OVERRIDE_ELEMENT_DWORDS, + PROTECTION_OVERRIDE_DEPTH_DWORDS); if (override_window_dwords) { addr = BYTES_TO_DWORDS(GRC_REG_PROTECTION_OVERRIDE_WINDOW); offset += qed_grc_dump_addr_range(p_hwfn, From a9888628cb2c768202a4530e2816da1889cc3165 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Sep 2025 18:54:15 +0200 Subject: [PATCH 1584/3206] net: dst_metadata: fix IP_DF bit not extracted from tunnel headers Both OVS and TC flower allow extracting and matching on the DF bit of the outer IP header via OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT in the OVS_KEY_ATTR_TUNNEL and TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT in the TCA_FLOWER_KEY_ENC_FLAGS respectively. 
Flow dissector extracts this information as FLOW_DIS_F_TUNNEL_DONT_FRAGMENT from the tunnel info key. However, the IP_TUNNEL_DONT_FRAGMENT_BIT in the tunnel key is never actually set, because the tunneling code doesn't actually extract it from the IP header. OAM and CRIT_OPT are extracted by the tunnel implementation code; the same code also sets the KEY flag, if present. The UDP tunnel core takes care of setting the CSUM flag if the checksum is present in the UDP header, but DONT_FRAGMENT is not handled at any layer. Fix that by checking the bit and setting the corresponding flag while populating the tunnel info in the IP layer where it belongs. Not using __assign_bit as we don't really need to clear the bit in a just-initialized field. It also doesn't seem like using __assign_bit will make the code look better. Clearly, users didn't rely on this functionality for anything very important until now. The reason why this doesn't break OVS logic is that it only matches on what the kernel previously parsed out, and if the kernel consistently reports this bit as zero, OVS will only match on it being zero, which sort of works. But it is still a bug that the uAPI reports and allows matching on a field that is not actually checked in the packet. And this is causing misleading -df reporting in OVS datapath flows, while the tunnel traffic actually has the bit set in most cases. This may also cause issues if hardware properly implements support for tunnel flag matching, as it will disagree with the implementation in the software path of TC flower. Fixes: 7d5437c709de ("openvswitch: Add tunneling interface.") Fixes: 1d17568e74de ("net/sched: cls_flower: add support for matching tunnel control flags") Signed-off-by: Ilya Maximets Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250909165440.229890-2-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/net/dst_metadata.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4160731dcb6e3a..1fc2fb03ce3f9a 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -3,6 +3,7 @@ #define __NET_DST_METADATA_H 1 #include +#include #include #include #include @@ -220,9 +221,15 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, int md_size) { const struct iphdr *iph = ip_hdr(skb); + struct metadata_dst *tun_dst; + + tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, flags, tunnel_id, md_size); - return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, flags, tunnel_id, md_size); + if (tun_dst && (iph->frag_off & htons(IP_DF))) + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + tun_dst->u.tun_info.key.tun_flags); + return tun_dst; } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, From 6cafb93c1f2aec7875f7a024a53e15f0683df699 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Sep 2025 18:54:16 +0200 Subject: [PATCH 1585/3206] selftests: openvswitch: add a simple test for tunnel metadata This test ensures that upon receiving decapsulated packets from a tunnel interface in openvswitch, the tunnel metadata fields are properly populated. This partially covers interoperability of the kernel tunnel ports and openvswitch tunnels (LWT) and parsing and formatting of the tunnel metadata fields of the openvswitch netlink uAPI.
Doing so, this test also ensures that fields and flags are properly extracted during decapsulation by the tunnel core code, serving as a regression test for the previously fixed issue with the DF bit not being extracted from the outer IP header. The ovs-dpctl.py script already supports all that is necessary for the tunnel ports for this test, so we only need to adjust the ovs_add_if() function to pass the '-t' port type argument in order to be able to create tunnel ports in the openvswitch datapath. Reviewed-by: Aaron Conole Signed-off-by: Ilya Maximets Link: https://patch.msgid.link/20250909165440.229890-3-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- .../selftests/net/openvswitch/openvswitch.sh | 88 +++++++++++++++++-- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index 3c8d3455d8e7f1..b327d3061ed53a 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -25,6 +25,7 @@ tests=" nat_related_v4 ip4-nat-related: ICMP related matches work with SNAT netlink_checks ovsnl: validate netlink attrs and settings upcall_interfaces ovs: test the upcall interfaces + tunnel_metadata ovs: test extraction of tunnel metadata drop_reason drop: test drop reasons are emitted psample psample: Sampling packets with psample" @@ -113,13 +114,13 @@ ovs_add_dp () { } ovs_add_if () { - info "Adding IF to DP: br:$2 if:$3" - if [ "$4" != "-u" ]; then - ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if "$2" "$3" \ - || return 1 + info "Adding IF to DP: br:$3 if:$4 ($2)" + if [ "$5" != "-u" ]; then + ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if \ + -t "$2" "$3" "$4" || return 1 else python3 $ovs_base/ovs-dpctl.py add-if \ - -u "$2" "$3" >$ovs_dir/$3.out 2>$ovs_dir/$3.err & + -u -t "$2" "$3" "$4" >$ovs_dir/$4.out 2>$ovs_dir/$4.err & pid=$! 
on_exit "ovs_sbx $1 kill -TERM $pid 2>/dev/null" fi @@ -166,9 +167,9 @@ ovs_add_netns_and_veths () { fi if [ "$7" != "-u" ]; then - ovs_add_if "$1" "$2" "$4" || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" || return 1 else - ovs_add_if "$1" "$2" "$4" -u || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" -u || return 1 fi if [ $TRACING -eq 1 ]; then @@ -756,6 +757,79 @@ test_upcall_interfaces() { return 0 } +ovs_add_kernel_tunnel() { + local sbxname=$1; shift + local ns=$1; shift + local tnl_type=$1; shift + local name=$1; shift + local addr=$1; shift + + info "setting up kernel ${tnl_type} tunnel ${name}" + ovs_sbx "${sbxname}" ip -netns ${ns} link add dev ${name} type ${tnl_type} $* || return 1 + on_exit "ovs_sbx ${sbxname} ip -netns ${ns} link del ${name} >/dev/null 2>&1" + ovs_sbx "${sbxname}" ip -netns ${ns} addr add dev ${name} ${addr} || return 1 + ovs_sbx "${sbxname}" ip -netns ${ns} link set dev ${name} mtu 1450 up || return 1 +} + +test_tunnel_metadata() { + which arping >/dev/null 2>&1 || return $ksft_skip + + sbxname="test_tunnel_metadata" + sbx_add "${sbxname}" || return 1 + + info "setting up new DP" + ovs_add_dp "${sbxname}" tdp0 -V 2:1 || return 1 + + ovs_add_netns_and_veths "${sbxname}" tdp0 tns left0 l0 \ + 172.31.110.1/24 || return 1 + + info "removing veth interface from openvswitch and setting IP" + ovs_del_if "${sbxname}" tdp0 left0 || return 1 + ovs_sbx "${sbxname}" ip addr add 172.31.110.2/24 dev left0 || return 1 + ovs_sbx "${sbxname}" ip link set left0 up || return 1 + + info "setting up tunnel port in openvswitch" + ovs_add_if "${sbxname}" "vxlan" tdp0 ovs-vxlan0 -u || return 1 + on_exit "ovs_sbx ${sbxname} ip link del ovs-vxlan0" + ovs_wait ip link show ovs-vxlan0 &>/dev/null || return 1 + ovs_sbx "${sbxname}" ip link set ovs-vxlan0 up || return 1 + + configs=$(echo ' + 1 172.31.221.1/24 1155332 32 set udpcsum flags\(df\|csum\) + 2 172.31.222.1/24 1234567 45 set noudpcsum flags\(df\) + 3 172.31.223.1/24 1020304 23 unset udpcsum flags\(csum\) + 4 172.31.224.1/24 1357986 15 unset noudpcsum' | sed '/^$/d') + + while read -r i addr id ttl df csum flags; do + ovs_add_kernel_tunnel "${sbxname}" tns vxlan vxlan${i} ${addr} \ + remote 172.31.110.2 id ${id} dstport 4789 \ + ttl ${ttl} df ${df} ${csum} || return 1 + done <<< "${configs}" + + ovs_wait grep -q 'listening on upcall packet handler' \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + + info "sending arping" + for i in 1 2 3 4; do + ovs_sbx "${sbxname}" ip netns exec tns \ + arping -I vxlan${i} 172.31.22${i}.2 -c 1 \ + >${ovs_dir}/arping.stdout 2>${ovs_dir}/arping.stderr + done + + info "checking that received decapsulated packets carry correct metadata" + while read -r i addr id ttl df csum flags; do + arp_hdr="arp\\(sip=172.31.22${i}.1,tip=172.31.22${i}.2,op=1,sha=" + addrs="src=172.31.110.1,dst=172.31.110.2" + ports="tp_src=[0-9]*,tp_dst=4789" + tnl_md="tunnel\\(tun_id=${id},${addrs},ttl=${ttl},${ports},${flags}\\)" + + ovs_sbx "${sbxname}" grep -qE "MISS upcall.*${tnl_md}.*${arp_hdr}" \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + done <<< "${configs}" + + return 0 +} + run_test() { ( tname="$1" From 67ff56cecc8701665ec137e5f151a7a7b2c37329 Mon Sep 17 00:00:00 2001 From: Ritvik Gupta Date: Mon, 11 Aug 2025 06:49:58 +0530 Subject: [PATCH 1586/3206] rust: kernel: cpu: mark `CpuId::current()` inline When building the kernel using llvm-20.1.7-rust-1.89.0-x86_64, this symbol is generated: $ llvm-nm --demangle vmlinux | grep CpuId ffffffff84c77450 T ::current However, this Rust symbol is a trivial wrapper around 
`raw_smp_processor_id` function. It doesn't make sense to go through a trivial wrapper for such functions, so mark it inline. After applying this patch, the above command will produce no output. Suggested-by: Alice Ryhl Link: https://github.com/Rust-for-Linux/linux/issues/1145 Signed-off-by: Ritvik Gupta Reviewed-by: Boqun Feng Reviewed-by: Alice Ryhl Signed-off-by: Miguel Ojeda --- rust/kernel/cpu.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/kernel/cpu.rs b/rust/kernel/cpu.rs index 5de730c8d81722..cb6c0338ef5a61 100644 --- a/rust/kernel/cpu.rs +++ b/rust/kernel/cpu.rs @@ -109,6 +109,7 @@ impl CpuId { /// unexpectedly due to preemption or CPU migration. It should only be /// used when the context ensures that the task remains on the same CPU /// or the users could use a stale (yet valid) CPU ID. + #[inline] pub fn current() -> Self { // SAFETY: raw_smp_processor_id() always returns a valid CPU ID. unsafe { Self::from_u32_unchecked(bindings::raw_smp_processor_id()) } From a15d12c24fa790533c8c133e84a8bf23777a7a43 Mon Sep 17 00:00:00 2001 From: Benno Lossin Date: Tue, 22 Jul 2025 14:14:38 +0200 Subject: [PATCH 1587/3206] rust: sync: extend module documentation of aref Commit 07dad44aa9a9 ("rust: kernel: move ARef and AlwaysRefCounted to sync::aref") moved `ARef` and `AlwaysRefCounted` into their own module. In that process only a short, single-line description of the module was added. Extend the description by explaining what is meant by "internal reference counting", the two items in the trait & the difference from `Arc`. Signed-off-by: Benno Lossin Reviewed-by: Alexandre Courbot Signed-off-by: Miguel Ojeda --- rust/kernel/sync/aref.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/rust/kernel/sync/aref.rs b/rust/kernel/sync/aref.rs index c53d42cb073485..0d24a0432015d0 100644 --- a/rust/kernel/sync/aref.rs +++ b/rust/kernel/sync/aref.rs @@ -1,6 +1,21 @@ // SPDX-License-Identifier: GPL-2.0 //! Internal reference counting support. +//! +//! Many C types already have their own reference counting mechanism (e.g. by storing a +//! `refcount_t`). This module provides support for directly using their internal reference count +//! from Rust; instead of making users have to use an additional Rust-reference count in the form of +//! [`Arc`]. +//! +//! The smart pointer [`ARef<T>`] acts similarly to [`Arc<T>`] in that it holds a refcount on the +//! underlying object, but this refcount is internal to the object. It essentially is a Rust +//! implementation of the `get_` and `put_` pattern used in C for reference counting. +//! +//! To make use of [`ARef<MyType>`], `MyType` needs to implement [`AlwaysRefCounted`]. It is a trait +//! for accessing the internal reference count of an object of the `MyType` type. +//! +//! [`Arc`]: crate::sync::Arc +//! [`Arc<T>`]: crate::sync::Arc use core::{marker::PhantomData, mem::ManuallyDrop, ops::Deref, ptr::NonNull}; From bf87a41b85d67695a04b422499b77748fe845945 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 24 Jul 2025 10:20:05 -0700 Subject: [PATCH 1588/3206] rust: list: Add an example for `ListLinksSelfPtr` usage It appears that the support for `ListLinksSelfPtr` is dead code at the moment [1]. Although some tests were added in [2] for an impl of `ListItem` using a `ListLinksSelfPtr` field, we could still use more examples demonstrating and testing the usage of `ListLinksSelfPtr`. Hence add an example similar to the `ListLinks` usage. The example is mostly based on Alice's usage in the binder driver [3].
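Condensed, the pattern the new doctest demonstrates is the following (a sketch only; the complete, compilable example is in the diff below):

    // The intrusive links use the self-pointer variant, so the list can
    // hold `dyn Foo` trait objects of differing concrete types.
    #[pin_data]
    struct DTWrap<T: ?Sized> {
        #[pin]
        links: ListLinksSelfPtr<DTWrap<dyn Foo>>,
        value: T,
    }
    // A `List<DTWrap<dyn Foo>>` then accepts nodes wrapping any `T: Foo`.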
Link: https://lore.kernel.org/rust-for-linux/20250719183649.596051-1-ojeda@kernel.org/ [1] Link: https://lore.kernel.org/rust-for-linux/20250709-list-no-offset-v4-5-a429e75840a9@gmail.com/ [2] Link: https://lore.kernel.org/rust-for-linux/20231101-rust-binder-v1-4-08ba9197f637@google.com/ [3] Signed-off-by: Boqun Feng Reviewed-by: Alice Ryhl [ Fixed typo. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/kernel/list.rs | 120 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/rust/kernel/list.rs b/rust/kernel/list.rs index 44e5219cfcbcbb..7355bbac16a7fe 100644 --- a/rust/kernel/list.rs +++ b/rust/kernel/list.rs @@ -38,6 +38,8 @@ pub use self::arc_field::{define_list_arc_field_getter, ListArcField}; /// /// # Examples /// +/// Use [`ListLinks`] as the type of the intrusive field. +/// /// ``` /// use kernel::list::*; /// @@ -140,6 +142,124 @@ pub use self::arc_field::{define_list_arc_field_getter, ListArcField}; /// } /// # Result::<(), Error>::Ok(()) /// ``` +/// +/// Use [`ListLinksSelfPtr`] as the type of the intrusive field. This allows a list of trait object +/// type. +/// +/// ``` +/// use kernel::list::*; +/// +/// trait Foo { +/// fn foo(&self) -> (&'static str, i32); +/// } +/// +/// #[pin_data] +/// struct DTWrap<T: ?Sized> { +/// #[pin] +/// links: ListLinksSelfPtr<DTWrap<dyn Foo>>, +/// value: T, +/// } +/// +/// impl<T> DTWrap<T> { +/// fn new(value: T) -> Result<ListArc<Self>> { +/// ListArc::pin_init(try_pin_init!(Self { +/// value, +/// links <- ListLinksSelfPtr::new(), +/// }), GFP_KERNEL) +/// } +/// } +/// +/// impl_list_arc_safe! { +/// impl{T: ?Sized} ListArcSafe<0> for DTWrap<T> { untracked; } +/// } +/// impl_list_item! { +/// impl ListItem<0> for DTWrap<dyn Foo> { using ListLinksSelfPtr { self.links }; } +/// } +/// +/// // Create a new empty list. +/// let mut list = List::<DTWrap<dyn Foo>>::new(); +/// { +/// assert!(list.is_empty()); +/// } +/// +/// struct A(i32); +/// // `A` returns the inner value for `foo`. +/// impl Foo for A { fn foo(&self) -> (&'static str, i32) { ("a", self.0) } } +/// +/// struct B; +/// // `B` always returns 42. +/// impl Foo for B { fn foo(&self) -> (&'static str, i32) { ("b", 42) } } +/// +/// // Insert 3 elements using `push_back()`. +/// list.push_back(DTWrap::new(A(15))?); +/// list.push_back(DTWrap::new(A(32))?); +/// list.push_back(DTWrap::new(B)?); +/// +/// // Iterate over the list to verify the nodes were inserted correctly. +/// // [A(15), A(32), B] +/// { +/// let mut iter = list.iter(); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("a", 15)); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("a", 32)); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("b", 42)); +/// assert!(iter.next().is_none()); +/// +/// // Verify the length of the list. +/// assert_eq!(list.iter().count(), 3); +/// } +/// +/// // Pop the items from the list using `pop_back()` and verify the content. +/// { +/// assert_eq!(list.pop_back().ok_or(EINVAL)?.value.foo(), ("b", 42)); +/// assert_eq!(list.pop_back().ok_or(EINVAL)?.value.foo(), ("a", 32)); +/// assert_eq!(list.pop_back().ok_or(EINVAL)?.value.foo(), ("a", 15)); +/// } +/// +/// // Insert 3 elements using `push_front()`. +/// list.push_front(DTWrap::new(A(15))?); +/// list.push_front(DTWrap::new(A(32))?); +/// list.push_front(DTWrap::new(B)?); +/// +/// // Iterate over the list to verify the nodes were inserted correctly. +/// // [B, A(32), A(15)] +/// { +/// let mut iter = list.iter(); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("b", 42)); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("a", 32)); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("a", 15)); +/// assert!(iter.next().is_none()); +/// +/// // Verify the length of the list. +/// assert_eq!(list.iter().count(), 3); +/// } +/// +/// // Pop the items from the list using `pop_front()` and verify the content. +/// { +/// assert_eq!(list.pop_back().ok_or(EINVAL)?.value.foo(), ("a", 15)); +/// assert_eq!(list.pop_back().ok_or(EINVAL)?.value.foo(), ("a", 32)); +/// } +/// +/// // Push `list2` to `list` through `push_all_back()`. +/// // list: [B] +/// // list2: [B, A(25)] +/// { +/// let mut list2 = List::<DTWrap<dyn Foo>>::new(); +/// list2.push_back(DTWrap::new(B)?); +/// list2.push_back(DTWrap::new(A(25))?); +/// +/// list.push_all_back(&mut list2); +/// +/// // list: [B, B, A(25)] +/// // list2: [] +/// let mut iter = list.iter(); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("b", 42)); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("b", 42)); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("a", 25)); +/// assert!(iter.next().is_none()); +/// assert!(list2.is_empty()); +/// } +/// # Result::<(), Error>::Ok(()) +/// ``` pub struct List<T: ?Sized + ListItem<ID>, const ID: u64 = 0> { first: *mut ListLinksFields, _ty: PhantomData<ListArc<T, ID>>, From 379b870c28c6a615a101df7986eba70fea99eff7 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 15 Sep 2025 11:42:31 +0900 Subject: [PATCH 1589/3206] firewire: core: use helper macros instead of direct access to HZ There are some macros available to convert usecs, msecs, and secs into a jiffies count. Link: https://lore.kernel.org/r/20250915024232.851955-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 17 +++++++---------- drivers/firewire/core-cdev.c | 4 ++-- drivers/firewire/core-device.c | 6 +++--- drivers/firewire/core-transaction.c | 4 ++-- drivers/firewire/core.h | 2 ++ 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index adb90161c4c6ac..2541e8bb4b75a6 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -229,8 +229,7 @@ void fw_schedule_bus_reset(struct fw_card *card, bool delayed, bool short_reset) /* Use an arbitrary short delay to combine multiple reset requests. */ fw_card_get(card); - if (!queue_delayed_work(fw_workqueue, &card->br_work, - delayed ? DIV_ROUND_UP(HZ, 100) : 0)) + if (!queue_delayed_work(fw_workqueue, &card->br_work, delayed ? msecs_to_jiffies(10) : 0)) fw_card_put(card); } EXPORT_SYMBOL(fw_schedule_bus_reset); @@ -241,10 +240,10 @@ static void br_work(struct work_struct *work) /* Delay for 2s after last reset per IEEE 1394 clause 8.2.1.
*/ if (card->reset_jiffies != 0 && - time_before64(get_jiffies_64(), card->reset_jiffies + 2 * HZ)) { + time_before64(get_jiffies_64(), card->reset_jiffies + secs_to_jiffies(2))) { trace_bus_reset_postpone(card->index, card->generation, card->br_short); - if (!queue_delayed_work(fw_workqueue, &card->br_work, 2 * HZ)) + if (!queue_delayed_work(fw_workqueue, &card->br_work, secs_to_jiffies(2))) fw_card_put(card); return; } @@ -309,8 +308,7 @@ static void bm_work(struct work_struct *work) irm_id = card->irm_node->node_id; local_id = card->local_node->node_id; - grace = time_after64(get_jiffies_64(), - card->reset_jiffies + DIV_ROUND_UP(HZ, 8)); + grace = time_after64(get_jiffies_64(), card->reset_jiffies + msecs_to_jiffies(125)); if ((is_next_generation(generation, card->bm_generation) && !card->bm_abdicate) || @@ -396,7 +394,7 @@ static void bm_work(struct work_struct *work) * that the problem has gone away by then. */ spin_unlock_irq(&card->lock); - fw_schedule_bm_work(card, DIV_ROUND_UP(HZ, 8)); + fw_schedule_bm_work(card, msecs_to_jiffies(125)); return; } @@ -418,7 +416,7 @@ static void bm_work(struct work_struct *work) * bus reset is less than 125ms ago. Reschedule this job. */ spin_unlock_irq(&card->lock); - fw_schedule_bm_work(card, DIV_ROUND_UP(HZ, 8)); + fw_schedule_bm_work(card, msecs_to_jiffies(125)); return; } @@ -551,8 +549,7 @@ void fw_card_initialize(struct fw_card *card, card->split_timeout_hi = DEFAULT_SPLIT_TIMEOUT / 8000; card->split_timeout_lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; card->split_timeout_cycles = DEFAULT_SPLIT_TIMEOUT; - card->split_timeout_jiffies = - DIV_ROUND_UP(DEFAULT_SPLIT_TIMEOUT * HZ, 8000); + card->split_timeout_jiffies = isoc_cycles_to_jiffies(DEFAULT_SPLIT_TIMEOUT); card->color = 0; card->broadcast_channel = BROADCAST_CHANNEL_INITIAL; diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 78b10c6ef7fec7..9e90c79becdb57 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1324,8 +1324,8 @@ static void iso_resource_work(struct work_struct *work) todo = r->todo; // Allow 1000ms grace period for other reallocations. if (todo == ISO_RES_ALLOC && - time_before64(get_jiffies_64(), client->device->card->reset_jiffies + HZ)) { - schedule_iso_resource(r, DIV_ROUND_UP(HZ, 3)); + time_before64(get_jiffies_64(), client->device->card->reset_jiffies + secs_to_jiffies(1))) { + schedule_iso_resource(r, msecs_to_jiffies(333)); skip = true; } else { // We could be called twice within the same generation. 
diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 6a04a00146942d..7d5821cd9b3715 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -847,9 +847,9 @@ static void fw_schedule_device_work(struct fw_device *device, */ #define MAX_RETRIES 10 -#define RETRY_DELAY (3 * HZ) -#define INITIAL_DELAY (HZ / 2) -#define SHUTDOWN_DELAY (2 * HZ) +#define RETRY_DELAY secs_to_jiffies(3) +#define INITIAL_DELAY msecs_to_jiffies(500) +#define SHUTDOWN_DELAY secs_to_jiffies(2) static void fw_device_shutdown(struct work_struct *work) { diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 1d1c2d8f85aec8..623e1d9bd107f9 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -458,7 +458,7 @@ static struct fw_packet phy_config_packet = { void fw_send_phy_config(struct fw_card *card, int node_id, int generation, int gap_count) { - long timeout = DIV_ROUND_UP(HZ, 10); + long timeout = msecs_to_jiffies(100); u32 data = 0; phy_packet_set_packet_identifier(&data, PHY_PACKET_PACKET_IDENTIFIER_PHY_CONFIG); @@ -1220,7 +1220,7 @@ static void update_split_timeout(struct fw_card *card) cycles = clamp(cycles, 800u, 3u * 8000u); card->split_timeout_cycles = cycles; - card->split_timeout_jiffies = DIV_ROUND_UP(cycles * HZ, 8000); + card->split_timeout_jiffies = isoc_cycles_to_jiffies(cycles); } static void handle_registers(struct fw_card *card, struct fw_request *request, diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 9e68ebf0673d73..7f2ca93406cedf 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -30,6 +30,8 @@ struct fw_packet; // This is the arbitrary value we use to indicate a mismatched gap count. #define GAP_COUNT_MISMATCHED 0 +#define isoc_cycles_to_jiffies(cycles) usecs_to_jiffies(cycles * USEC_PER_SEC / 8000) + extern __printf(2, 3) void fw_err(const struct fw_card *card, const char *fmt, ...); extern __printf(2, 3) From 931383f161c066ac5fda12035540498931739842 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 15 Sep 2025 11:42:32 +0900 Subject: [PATCH 1590/3206] firewire: core: use helper macro to compare against current jiffies The pattern of calling either time_before64() or time_after64() with get_jiffies_64() can be replaced with the corresponding helper macros. Link: https://lore.kernel.org/r/20250915024232.851955-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 4 ++-- drivers/firewire/core-cdev.c | 2 +- drivers/firewire/core-device.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 2541e8bb4b75a6..b5e01a71114571 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -240,7 +240,7 @@ static void br_work(struct work_struct *work) /* Delay for 2s after last reset per IEEE 1394 clause 8.2.1. 
*/ if (card->reset_jiffies != 0 && - time_before64(get_jiffies_64(), card->reset_jiffies + secs_to_jiffies(2))) { + time_is_after_jiffies64(card->reset_jiffies + secs_to_jiffies(2))) { trace_bus_reset_postpone(card->index, card->generation, card->br_short); if (!queue_delayed_work(fw_workqueue, &card->br_work, secs_to_jiffies(2))) @@ -308,7 +308,7 @@ static void bm_work(struct work_struct *work) irm_id = card->irm_node->node_id; local_id = card->local_node->node_id; - grace = time_after64(get_jiffies_64(), card->reset_jiffies + msecs_to_jiffies(125)); + grace = time_is_before_jiffies64(card->reset_jiffies + msecs_to_jiffies(125)); if ((is_next_generation(generation, card->bm_generation) && !card->bm_abdicate) || diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 9e90c79becdb57..1be8f5eb3e17dc 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1324,7 +1324,7 @@ static void iso_resource_work(struct work_struct *work) todo = r->todo; // Allow 1000ms grace period for other reallocations. if (todo == ISO_RES_ALLOC && - time_before64(get_jiffies_64(), client->device->card->reset_jiffies + secs_to_jiffies(1))) { + time_is_after_jiffies64(client->device->card->reset_jiffies + secs_to_jiffies(1))) { schedule_iso_resource(r, msecs_to_jiffies(333)); skip = true; } else { diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 7d5821cd9b3715..457a0da024a761 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -855,8 +855,7 @@ static void fw_device_shutdown(struct work_struct *work) { struct fw_device *device = from_work(device, work, work.work); - if (time_before64(get_jiffies_64(), - device->card->reset_jiffies + SHUTDOWN_DELAY) + if (time_is_after_jiffies64(device->card->reset_jiffies + SHUTDOWN_DELAY) && !list_empty(&device->card->link)) { fw_schedule_device_work(device, SHUTDOWN_DELAY); return; From d162694037215fe25f1487999c58d70df809a2fd Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 4 Sep 2025 13:06:41 +0200 Subject: [PATCH 1591/3206] smb: server: let smb_direct_writev() respect SMB_DIRECT_MAX_SEND_SGES We should not use more sges for ib_post_send() than we told the rdma device in rdma_create_qp()! Otherwise ib_post_send() will return -EINVAL, so we disconnect the connection. Or with the current siw.ko we'll get 0 from ib_post_send(), but will never ever get a completion for the request. I've already sent a fix for siw.ko... So we need to make sure smb_direct_writev() limits the number of vectors we pass to individual smb_direct_post_send_data() calls, so that we don't go over the queue pair limits. Commit 621433b7e25d ("ksmbd: smbd: relax the count of sges required") was very strange and I guess only needed because SMB_DIRECT_MAX_SEND_SGES was 8 at that time. It basically removed the check that the rdma device is able to handle the number of sges we try to use. While the real problem was added by commit ddbdc861e37c ("ksmbd: smbd: introduce read/write credits for RDMA read/write") as it used the minimum of device->attrs.max_send_sge and device->attrs.max_sge_rd, with the problem that device->attrs.max_sge_rd is always 1 for iWarp. And that limitation should only apply to RDMA Read operations. For now we keep that limitation for RDMA Write operations too; fixing that is a task for another day, as it's not really required for a bug fix.
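In pseudo-C, the budget the sender now has to respect per work request is roughly the following (a simplified sketch, not the exact ksmbd code):

    /* what we told the device in rdma_create_qp() */
    if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES)
        return -EINVAL;                 /* device cannot honour our QP config */

    /* one SGE is reserved for the smbdirect header, and each page of
     * payload currently consumes one more SGE */
    max_payload_sges = SMB_DIRECT_MAX_SEND_SGES - 1;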
Commit 2b4eeeaa9061 ("ksmbd: decrease the number of SMB3 smbdirect server SGEs") lowered SMB_DIRECT_MAX_SEND_SGES to 6, which is also used by our client code. And that client code enforces device->attrs.max_send_sge >= 6 since commit d2e81f92e5b7 ("Decrease the number of SMB3 smbdirect client SGEs") and (briefly looking) only the i40w driver provides only 3, see I40IW_MAX_WQ_FRAGMENT_COUNT. But currently we'd require 4 anyway, so that would not work anyway, but now it fails early. Cc: Steve French Cc: Tom Talpey Cc: Hyunchul Lee Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: linux-rdma@vger.kernel.org Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Fixes: ddbdc861e37c ("ksmbd: smbd: introduce read/write credits for RDMA read/write") Fixes: 621433b7e25d ("ksmbd: smbd: relax the count of sges required") Fixes: 2b4eeeaa9061 ("ksmbd: decrease the number of SMB3 smbdirect server SGEs") Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 157 ++++++++++++++++++++++----------- 1 file changed, 107 insertions(+), 50 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 5466aa8c39b1cd..cc4322bfa1d611 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1209,78 +1209,130 @@ static int smb_direct_writev(struct ksmbd_transport *t, bool need_invalidate, unsigned int remote_key) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); - int remaining_data_length; - int start, i, j; - int max_iov_size = st->max_send_size - + size_t remaining_data_length; + size_t iov_idx; + size_t iov_ofs; + size_t max_iov_size = st->max_send_size - sizeof(struct smb_direct_data_transfer); int ret; - struct kvec vec; struct smb_direct_send_ctx send_ctx; + int error = 0; if (st->status != SMB_DIRECT_CS_CONNECTED) return -ENOTCONN; //FIXME: skip RFC1002 header.. + if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4)) + return -EINVAL; buflen -= 4; + iov_idx = 1; + iov_ofs = 0; remaining_data_length = buflen; ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key); - start = i = 1; - buflen = 0; - while (true) { - buflen += iov[i].iov_len; - if (buflen > max_iov_size) { - if (i > start) { - remaining_data_length -= - (buflen - iov[i].iov_len); - ret = smb_direct_post_send_data(st, &send_ctx, - &iov[start], i - start, - remaining_data_length); - if (ret) + while (remaining_data_length) { + struct kvec vecs[SMB_DIRECT_MAX_SEND_SGES - 1]; /* minus smbdirect hdr */ + size_t possible_bytes = max_iov_size; + size_t possible_vecs; + size_t bytes = 0; + size_t nvecs = 0; + + /* + * For the last message remaining_data_length should be + * have been 0 already! + */ + if (WARN_ON_ONCE(iov_idx >= niovs)) { + error = -EINVAL; + goto done; + } + + /* + * We have 2 factors which limit the arguments we pass + * to smb_direct_post_send_data(): + * + * 1. The number of supported sges for the send, + * while one is reserved for the smbdirect header. + * And we currently need one SGE per page. + * 2. The number of negotiated payload bytes per send. 
+ */ + possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx); + + while (iov_idx < niovs && possible_vecs && possible_bytes) { + struct kvec *v = &vecs[nvecs]; + int page_count; + + v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs; + v->iov_len = min_t(size_t, + iov[iov_idx].iov_len - iov_ofs, + possible_bytes); + page_count = get_buf_page_count(v->iov_base, v->iov_len); + if (page_count > possible_vecs) { + /* + * If the number of pages in the buffer + * is to much (because we currently require + * one SGE per page), we need to limit the + * length. + * + * We know possible_vecs is at least 1, + * so we always keep the first page. + * + * We need to calculate the number extra + * pages (epages) we can also keep. + * + * We calculate the number of bytes in the + * first page (fplen), this should never be + * larger than v->iov_len because page_count is + * at least 2, but adding a limitation feels + * better. + * + * Then we calculate the number of bytes (elen) + * we can keep for the extra pages. + */ + size_t epages = possible_vecs - 1; + size_t fpofs = offset_in_page(v->iov_base); + size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len); + size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE); + + v->iov_len = fplen + elen; + page_count = get_buf_page_count(v->iov_base, v->iov_len); + if (WARN_ON_ONCE(page_count > possible_vecs)) { + /* + * Something went wrong in the above + * logic... + */ + error = -EINVAL; goto done; - } else { - /* iov[start] is too big, break it */ - int nvec = (buflen + max_iov_size - 1) / - max_iov_size; - - for (j = 0; j < nvec; j++) { - vec.iov_base = - (char *)iov[start].iov_base + - j * max_iov_size; - vec.iov_len = - min_t(int, max_iov_size, - buflen - max_iov_size * j); - remaining_data_length -= vec.iov_len; - ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1, - remaining_data_length); - if (ret) - goto done; } - i++; - if (i == niovs) - break; } - start = i; - buflen = 0; - } else { - i++; - if (i == niovs) { - /* send out all remaining vecs */ - remaining_data_length -= buflen; - ret = smb_direct_post_send_data(st, &send_ctx, - &iov[start], i - start, - remaining_data_length); - if (ret) - goto done; - break; + possible_vecs -= page_count; + nvecs += 1; + possible_bytes -= v->iov_len; + bytes += v->iov_len; + + iov_ofs += v->iov_len; + if (iov_ofs >= iov[iov_idx].iov_len) { + iov_idx += 1; + iov_ofs = 0; } } + + remaining_data_length -= bytes; + + ret = smb_direct_post_send_data(st, &send_ctx, + vecs, nvecs, + remaining_data_length); + if (unlikely(ret)) { + error = ret; + goto done; + } } done: ret = smb_direct_flush_send_list(st, &send_ctx, true); + if (unlikely(!ret && error)) + ret = error; /* * As an optimization, we don't wait for individual I/O to finish @@ -1744,6 +1796,11 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } + if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) { + pr_err("warning: device max_send_sge = %d too small\n", + device->attrs.max_send_sge); + return -EINVAL; + } if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { pr_err("warning: device max_recv_sge = %d too small\n", device->attrs.max_recv_sge); @@ -1767,7 +1824,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_wr = max_send_wrs; cap->max_recv_wr = t->recv_credit_max; - cap->max_send_sge = max_sge_per_wr; + cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; cap->max_rdma_ctxs = 
t->max_rw_credits; From 5282491fc49d5614ac6ddcd012e5743eecb6a67c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 10 Sep 2025 11:22:52 +0900 Subject: [PATCH 1592/3206] ksmbd: smbdirect: validate data_offset and data_length field of smb_direct_data_transfer If the data_offset and data_length fields of the smb_direct_data_transfer struct are invalid, an out-of-bounds access could happen. This patch validates the data_offset and data_length fields in recv_done(). Cc: stable@vger.kernel.org Fixes: 2ea086e35c3d ("ksmbd: add buffer validation for smb direct") Reviewed-by: Stefan Metzmacher Reported-by: Luigino Camastra, Aisle Research Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index cc4322bfa1d611..d52f3757827631 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -554,7 +554,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) case SMB_DIRECT_MSG_DATA_TRANSFER: { struct smb_direct_data_transfer *data_transfer = (struct smb_direct_data_transfer *)recvmsg->packet; - unsigned int data_length; + unsigned int data_offset, data_length; int avail_recvmsg_count, receive_credits; if (wc->byte_len < @@ -565,14 +565,15 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } data_length = le32_to_cpu(data_transfer->data_length); - if (data_length) { - if (wc->byte_len < sizeof(struct smb_direct_data_transfer) + - (u64)data_length) { - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); - return; - } + data_offset = le32_to_cpu(data_transfer->data_offset); + if (wc->byte_len < data_offset || + wc->byte_len < (u64)data_offset + data_length) { + put_recvmsg(t, recvmsg); + smb_direct_disconnect_rdma_connection(t); + return; + } + if (data_length) { if (t->full_packet_received) recvmsg->first_segment = true; From e1868ba37fd27c6a68e31565402b154beaa65df0 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 11 Sep 2025 10:05:23 +0900 Subject: [PATCH 1593/3206] ksmbd: smbdirect: verify remaining_data_length respects max_fragmented_recv_size This is inspired by the check for data_offset + data_length.
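The essence of both validations is overflow-safe bounds checking on untrusted 32-bit fields; as a sketch (simplified from the diffs):

    /* Widen to u64 before adding so a crafted offset/length pair cannot
     * wrap around 32 bits and slip past the comparison. */
    if (wc->byte_len < data_offset ||
        (u64)wc->byte_len < (u64)data_offset + data_length)
        goto disconnect;    /* malformed: would read out of bounds */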
Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: stable@vger.kernel.org Fixes: 2ea086e35c3d ("ksmbd: add buffer validation for smb direct") Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index d52f3757827631..6550bd9f002c27 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -554,7 +554,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) case SMB_DIRECT_MSG_DATA_TRANSFER: { struct smb_direct_data_transfer *data_transfer = (struct smb_direct_data_transfer *)recvmsg->packet; - unsigned int data_offset, data_length; + u32 remaining_data_length, data_offset, data_length; int avail_recvmsg_count, receive_credits; if (wc->byte_len < @@ -564,6 +564,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) return; } + remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); data_length = le32_to_cpu(data_transfer->data_length); data_offset = le32_to_cpu(data_transfer->data_offset); if (wc->byte_len < data_offset || @@ -572,6 +573,14 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(t); return; } + if (remaining_data_length > t->max_fragmented_recv_size || + data_length > t->max_fragmented_recv_size || + (u64)remaining_data_length + (u64)data_length > + (u64)t->max_fragmented_recv_size) { + put_recvmsg(t, recvmsg); + smb_direct_disconnect_rdma_connection(t); + return; + } if (data_length) { if (t->full_packet_received) From b62fd63ade7cb573b114972ef8f9fa505be8d74a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 3 Sep 2025 16:53:21 +0100 Subject: [PATCH 1594/3206] btrfs: fix invalid extref key setup when replaying dentry The offset for an extref item's key is not the object ID of the parent dir, otherwise we would not need the extref item and would use plain ref items. Instead the offset is the result of a hash computation that uses the object ID of the parent dir and the name associated to the entry. So fix this by setting the key offset at replay_one_name() to be the result of calling btrfs_extref_hash(). Fixes: 725af92a6251 ("btrfs: Open-code name_in_log_ref in replay_one_name") Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7d5d90845ca985..7a63afedd01e6e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1964,7 +1964,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = key->objectid; + search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len); ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); if (ret < 0) { goto out; From 5b8d2964754102323ca24495ba94892426284e3a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 5 Sep 2025 15:54:43 +0200 Subject: [PATCH 1595/3206] btrfs: zoned: fix incorrect ASSERT in btrfs_zoned_reserve_data_reloc_bg() When moving a block-group to the dedicated data relocation space-info in btrfs_zoned_reserve_data_reloc_bg() it is asserted that the newly created block group for data relocation does not contain any zone_unusable bytes. 
But on disks with zone_capacity < zone_size, the difference between
zone_size and zone_capacity is accounted as zone_unusable.

Instead of asserting that the block group does not contain any
zone_unusable bytes, remove them from the block group's total size.

Reported-by: Yi Zhang
Link: https://lore.kernel.org/linux-block/CAHj4cs8-cS2E+-xQ-d2Bj6vMJZ+CwT_cbdWBTju4BV35LsvEYw@mail.gmail.com/
Fixes: daa0fde322350 ("btrfs: zoned: fix data relocation block group reservation")
Reviewed-by: Naohiro Aota
Tested-by: Yi Zhang
Signed-off-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/zoned.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index ea662036f4413f..efc2a81f50e551 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2582,9 +2582,9 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
 	spin_lock(&space_info->lock);
 	space_info->total_bytes -= bg->length;
 	space_info->disk_total -= bg->length * factor;
+	space_info->disk_total -= bg->zone_unusable;
 	/* There is no allocation ever happened. */
 	ASSERT(bg->used == 0);
-	ASSERT(bg->zone_unusable == 0);
 	/* No super block in a block group on the zoned setup. */
 	ASSERT(bg->bytes_super == 0);
 	spin_unlock(&space_info->lock);

From 8679d2687c351824d08cf1f0e86f3b65f22a00fe Mon Sep 17 00:00:00 2001
From: austinchang
Date: Thu, 11 Sep 2025 06:06:29 +0000
Subject: [PATCH 1596/3206] btrfs: initialize inode::file_extent_tree after i_mode has been set

btrfs_init_file_extent_tree() uses S_ISREG() to determine if the file is
a regular file. At the beginning of btrfs_read_locked_inode(), i_mode
hasn't been read from the inode item yet, so the file_extent_tree is
never used on volumes without NO_HOLES.

Fix this by calling btrfs_init_file_extent_tree() after i_mode is
initialized in btrfs_read_locked_inode().
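The general shape of the bug is worth spelling out: an init helper that
branches on a field must not run before that field is populated. A toy,
compilable illustration (hypothetical names, not the btrfs code):

	#include <stdio.h>

	struct toy_inode {
		unsigned int mode;	/* 0 until read from the item */
		int has_extent_tree;
	};

	/* Same masks as S_IFMT/S_IFREG. */
	#define TOY_ISREG(m) (((m) & 0xF000) == 0x8000)

	static void toy_init_extent_tree(struct toy_inode *inode)
	{
		/* A zeroed mode looks like "not a regular file". */
		if (TOY_ISREG(inode->mode))
			inode->has_extent_tree = 1;
	}

	int main(void)
	{
		struct toy_inode inode = { 0 };

		toy_init_extent_tree(&inode);	/* too early: no-op */
		inode.mode = 0x8000;		/* mode now known */
		printf("before fix: %d\n", inode.has_extent_tree);

		toy_init_extent_tree(&inode);	/* after the fix */
		printf("after fix: %d\n", inode.has_extent_tree);
		return 0;
	}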
Fixes: 3d7db6e8bd22e6 ("btrfs: don't allocate file extent tree for non regular files") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Filipe Manana Signed-off-by: austinchang Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 3 --- fs/btrfs/inode.c | 11 +++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0f8d8e275143b2..c0c1ddd46b67a2 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1843,7 +1843,6 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; struct inode *vfs_inode = &inode->vfs_inode; @@ -1864,8 +1863,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e7218e78bff456..18db1053cdf087 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3885,10 +3885,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(inode); - if (ret) - goto out; - ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; @@ -3920,8 +3916,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); @@ -3953,6 +3947,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path btrfs_set_inode_mapping_order(inode); cache_index: + ret = btrfs_init_file_extent_tree(inode); + if (ret) + goto out; + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any From 80eb65ccf6f72dc37b972583fe71cd8a50ff7e51 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 8 Sep 2025 12:51:11 +0100 Subject: [PATCH 1597/3206] btrfs: annotate block group access with data_race() when sorting for reclaim When sorting the block group list for reclaim we are using a block group's used bytes counter without taking the block group's spinlock, so we can race with a concurrent task updating it (at btrfs_update_block_group()), which makes tools like KCSAN unhappy and report a race. 
Since the sorting is not strictly needed from a functional perspective and such races should rarely cause any ordering changes (only load/store tearing could cause them), not to mention that after the sorting the ordering may no longer be accurate due to concurrent allocations and deallocations of extents in a block group, annotate the accesses to the used counter with data_race() to silence KCSAN and similar tools. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9bf282d2453c02..499a9edf0ca315 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, bg1 = list_entry(a, struct btrfs_block_group, bg_list); bg2 = list_entry(b, struct btrfs_block_group, bg_list); - return bg1->used > bg2->used; + /* + * Some other task may be updating the ->used field concurrently, but it + * is not serious if we get a stale value or load/store tearing issues, + * as sorting the list of block groups to reclaim is not critical and an + * occasional imperfect order is ok. So silence KCSAN and avoid the + * overhead of locking or any other synchronization. + */ + return data_race(bg1->used > bg2->used); } static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) From 03ee64b5e525c40e9bc723885c6b0b9c6188b55b Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Mon, 4 Aug 2025 12:45:19 -0700 Subject: [PATCH 1598/3206] rv: Support systems with time64-only syscalls Some systems (like 32-bit RISC-V) only have the 64-bit time_t versions of syscalls. So handle the 32-bit time_t version of those being undefined. Fixes: f74f8bb246cf ("rv: Add rtapp_sleep monitor") Closes: https://lore.kernel.org/oe-kbuild-all/202508160204.SsFyNfo6-lkp@intel.com Signed-off-by: Palmer Dabbelt Acked-by: Nam Cao Link: https://lore.kernel.org/r/20250804194518.97620-2-palmer@dabbelt.com Signed-off-by: Gabriele Monaco --- kernel/trace/rv/monitors/sleep/sleep.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c index eea447b069071e..c1347da69e9dec 100644 --- a/kernel/trace/rv/monitors/sleep/sleep.c +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -127,7 +127,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) mon = ltl_get_monitor(current); switch (id) { +#ifdef __NR_clock_nanosleep case __NR_clock_nanosleep: +#endif #ifdef __NR_clock_nanosleep_time64 case __NR_clock_nanosleep_time64: #endif @@ -138,7 +140,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); break; +#ifdef __NR_futex case __NR_futex: +#endif #ifdef __NR_futex_time64 case __NR_futex_time64: #endif From de090d1ccae1e191af4beb92964591c6e4f31f28 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 6 Aug 2025 14:09:11 +0200 Subject: [PATCH 1599/3206] rv: Fix wrong type cast in enabled_monitors_next() Argument 'p' of enabled_monitors_next() is not a pointer to struct rv_monitor, it is actually a pointer to the list_head inside struct rv_monitor. Therefore it is wrong to cast 'p' to struct rv_monitor *. This wrong type cast has been there since the beginning. But it still worked because the list_head was the first field in struct rv_monitor_def. 
This is no longer true since commit 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") moved the list_head, and this wrong type cast became a functional problem. Properly use container_of() instead. Fixes: 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") Signed-off-by: Nam Cao Reviewed-by: Gabriele Monaco Link: https://lore.kernel.org/r/20250806120911.989365-1-namcao@linutronix.de Signed-off-by: Gabriele Monaco --- kernel/trace/rv/rv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 1482e91c39f402..b341445b8fbdf1 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -495,7 +495,7 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor *mon = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); (*pos)++; From 3afaff7a0ce97457c8ab46862f2c06603a89962e Mon Sep 17 00:00:00 2001 From: Akhilesh Patil Date: Mon, 11 Aug 2025 17:42:53 +0530 Subject: [PATCH 1600/3206] include/linux/rv.h: remove redundant include file Remove redundant include to clean up the code. Move all unique include files inside CONFIG_RV as they are only needed when CONFIG_RV is enabled. Arrange include files alphabetically. Fixes: 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") [1] Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202507312017.oyD08TL5-lkp@intel.com/ Signed-off-by: Akhilesh Patil Reviewed-by: Gabriele Monaco Link: https://lore.kernel.org/r/aJneRbHGlNFg7lr9@bhairav-test.ee.iitb.ac.in Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/rv.h b/include/linux/rv.h index 14410a42faefd0..9520aab34bcbe3 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -7,16 +7,14 @@ #ifndef _LINUX_RV_H #define _LINUX_RV_H -#include -#include - #define MAX_DA_NAME_LEN 32 #define MAX_DA_RETRY_RACING_EVENTS 3 #ifdef CONFIG_RV +#include #include +#include #include -#include /* * Deterministic automaton per-object variables. From 9b5096761c184b3923ae45c5e82da31005a765c7 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Wed, 3 Sep 2025 14:51:12 +0800 Subject: [PATCH 1601/3206] rv: Fix missing mutex unlock in rv_register_monitor() If create_monitor_dir() fails, the function returns directly without releasing rv_interface_lock. This leaves the mutex locked and causes subsequent monitor registration attempts to deadlock. Fix it by making the error path jump to out_unlock, ensuring that the mutex is always released before returning. 
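The fix restores the usual kernel error-handling shape: take the lock once
and funnel every exit through a single unlock label. A minimal,
self-contained sketch of that pattern (illustrative names, using pthreads
rather than the kernel mutex API):

	#include <pthread.h>

	static pthread_mutex_t interface_lock = PTHREAD_MUTEX_INITIALIZER;

	static int do_register(int fail)
	{
		int retval = 0;

		pthread_mutex_lock(&interface_lock);

		if (fail) {
			retval = -1;
			goto out_unlock; /* never return with lock held */
		}

		/* ... more registration work under the lock ... */

	out_unlock:
		pthread_mutex_unlock(&interface_lock);
		return retval;
	}

	int main(void)
	{
		return do_register(0);
	}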
Fixes: 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor")
Signed-off-by: Zhen Ni
Reviewed-by: Gabriele Monaco
Reviewed-by: Nam Cao
Link: https://lore.kernel.org/r/20250903065112.1878330-1-zhen.ni@easystack.cn
Signed-off-by: Gabriele Monaco
---
 kernel/trace/rv/rv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index b341445b8fbdf1..48338520376f90 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -805,7 +805,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
 
 	retval = create_monitor_dir(monitor, parent);
 	if (retval)
-		return retval;
+		goto out_unlock;
 
 	/* keep children close to the parent for easier visualisation */
 	if (parent)

From 19c839a98c731169f06d32e7c9e00c78a0086ebe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Szymanski?=
Date: Fri, 12 Sep 2025 22:18:50 +0200
Subject: [PATCH 1602/3206] gpiolib: acpi: initialize acpi_gpio_info struct
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 7c010d463372 ("gpiolib: acpi: Make sure we fill struct
acpi_gpio_info"), uninitialized acpi_gpio_info structs are passed to
__acpi_find_gpio(), and later in the call stack info->quirks is used in
acpi_populate_gpio_lookup(). This breaks the i2c_hid_acpi driver:

[   58.122916] i2c_hid_acpi i2c-UNIW0001:00: HID over i2c has not been provided an Int IRQ
[   58.123097] i2c_hid_acpi i2c-UNIW0001:00: probe with driver i2c_hid_acpi failed with error -22

Fix this by initializing the acpi_gpio_info structs passed to
__acpi_find_gpio().

Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220388
Fixes: 7c010d463372 ("gpiolib: acpi: Make sure we fill struct acpi_gpio_info")
Signed-off-by: Sébastien Szymanski
Tested-by: Hans de Goede
Reviewed-by: Hans de Goede
Acked-by: Mika Westerberg
Tested-By: Calvin Owens
Cc: stable@vger.kernel.org
Signed-off-by: Andy Shevchenko
---
 drivers/gpio/gpiolib-acpi-core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpio/gpiolib-acpi-core.c b/drivers/gpio/gpiolib-acpi-core.c
index e81a29902bb274..284e762d92c4d4 100644
--- a/drivers/gpio/gpiolib-acpi-core.c
+++ b/drivers/gpio/gpiolib-acpi-core.c
@@ -942,7 +942,7 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode,
 {
 	struct acpi_device *adev = to_acpi_device_node(fwnode);
 	bool can_fallback = acpi_can_fallback_to_crs(adev, con_id);
-	struct acpi_gpio_info info;
+	struct acpi_gpio_info info = {};
 	struct gpio_desc *desc;
 	int ret;
 
@@ -999,7 +999,7 @@ int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id,
 	int ret;
 
 	for (i = 0, idx = 0; idx <= index; i++) {
-		struct acpi_gpio_info info;
+		struct acpi_gpio_info info = {};
 		struct gpio_desc *desc;
 
 		/* Ignore -EPROBE_DEFER, it only matters if idx matches */

From f205ed23f0687ea8b14c140e0af9643eed683691 Mon Sep 17 00:00:00 2001
From: Bou-Saan Che
Date: Sun, 14 Sep 2025 22:15:37 +0300
Subject: [PATCH 1603/3206] ALSA: hda: cs35l41: Support Lenovo Thinkbook 13x Gen 5

This laptop does not contain _DSD, so it needs to be supported using the
configuration table.
Signed-off-by: Bou-Saan Che Signed-off-by: Takashi Iwai --- sound/hda/codecs/side-codecs/cs35l41_hda_property.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/hda/codecs/side-codecs/cs35l41_hda_property.c b/sound/hda/codecs/side-codecs/cs35l41_hda_property.c index d8249d997c2a0b..16d5ea77192f04 100644 --- a/sound/hda/codecs/side-codecs/cs35l41_hda_property.c +++ b/sound/hda/codecs/side-codecs/cs35l41_hda_property.c @@ -135,6 +135,8 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "17AA38C8", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, { "17AA38F9", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, { "17AA38FA", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, + { "17AA3929", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, + { "17AA392B", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, {} }; @@ -558,6 +560,8 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "17AA38C8", generic_dsd_config }, { "CSC3551", "17AA38F9", generic_dsd_config }, { "CSC3551", "17AA38FA", generic_dsd_config }, + { "CSC3551", "17AA3929", generic_dsd_config }, + { "CSC3551", "17AA392B", generic_dsd_config }, {} }; From c1d31894d892a88454eee6530c99f3c4fcbc9397 Mon Sep 17 00:00:00 2001 From: Bou-Saan Che Date: Sun, 14 Sep 2025 22:17:38 +0300 Subject: [PATCH 1604/3206] ALSA: hda/realtek: Support Lenovo Thinkbook 13x Gen 5 The laptop does not contain valid _DSD for these amps, so requires entries into the CS35L41 configuration table to function correctly. Signed-off-by: Bou-Saan Che Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 59cc01079fee79..c86ba6ae4ef8c4 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7100,6 +7100,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3913, "Lenovo 145", ALC236_FIXUP_LENOVO_INV_DMIC), SND_PCI_QUIRK(0x17aa, 0x391f, "Yoga S990-16 pro Quad YC Quad", ALC287_FIXUP_TXNW2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3920, "Yoga S990-16 pro Quad VECO Quad", ALC287_FIXUP_TXNW2781_I2C), + SND_PCI_QUIRK(0x17aa, 0x3929, "Thinkbook 13x Gen 5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), + SND_PCI_QUIRK(0x17aa, 0x392b, "Thinkbook 13x Gen 5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo B50-70", ALC269_FIXUP_DMIC_THINKPAD_ACPI), SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_PCM_44K), From d99c203034989c3652cc8e7330ba5750fc74c04f Mon Sep 17 00:00:00 2001 From: Bou-Saan Che Date: Sun, 14 Sep 2025 22:17:50 +0300 Subject: [PATCH 1605/3206] ALSA: hda/realtek: Fix volume control on Lenovo Thinkbook 13x Gen 4 The issue was caused by incorrect configuration in the driver, which prevented proper volume control on certain systems. 
Signed-off-by: Bou-Saan Che Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index c86ba6ae4ef8c4..618ec00d3f9897 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7078,8 +7078,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x38be, "Yoga S980-14.5 proX YC Dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38bf, "Yoga S980-14.5 proX LX Dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38c3, "Y980 DUAL", ALC287_FIXUP_TAS2781_I2C), - SND_PCI_QUIRK(0x17aa, 0x38c7, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4), - SND_PCI_QUIRK(0x17aa, 0x38c8, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4), + SND_PCI_QUIRK(0x17aa, 0x38c7, "Thinkbook 13x Gen 4", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), + SND_PCI_QUIRK(0x17aa, 0x38c8, "Thinkbook 13x Gen 4", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), SND_PCI_QUIRK(0x17aa, 0x38cb, "Y790 YG DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38cd, "Y790 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38d2, "Lenovo Yoga 9 14IMH9", ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN), From c29287bb32bc72a037c34d84a160060a6122b7ed Mon Sep 17 00:00:00 2001 From: Jihed Chaibi Date: Sun, 14 Sep 2025 22:59:45 +0200 Subject: [PATCH 1606/3206] ALSA: asihpi: Simplify error handling in PCM substream setup Refactor error handling in the PCM substream setup to combine redundant checks and improve code readability. Free the dpcm structure and return appropriate error codes (-EBUSY for HPI_ERROR_OBJ_ALREADY_OPEN, -EIO for other errors) in a single block. Signed-off-by: Jihed Chaibi Signed-off-by: Takashi Iwai --- sound/pci/asihpi/asihpi.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sound/pci/asihpi/asihpi.c b/sound/pci/asihpi/asihpi.c index 8419f2b6e5891e..fd0a67b772d1f0 100644 --- a/sound/pci/asihpi/asihpi.c +++ b/sound/pci/asihpi/asihpi.c @@ -982,12 +982,12 @@ static int snd_card_asihpi_playback_open(struct snd_pcm_substream *substream) err = hpi_outstream_open(card->hpi->adapter->index, substream->number, &dpcm->h_stream); hpi_handle_error(err); - if (err) + if (err) { kfree(dpcm); - if (err == HPI_ERROR_OBJ_ALREADY_OPEN) - return -EBUSY; - if (err) + if (err == HPI_ERROR_OBJ_ALREADY_OPEN) + return -EBUSY; return -EIO; + } /*? also check ASI5000 samplerate source If external, only support external rate. @@ -1156,12 +1156,12 @@ static int snd_card_asihpi_capture_open(struct snd_pcm_substream *substream) err = hpi_handle_error( hpi_instream_open(card->hpi->adapter->index, substream->number, &dpcm->h_stream)); - if (err) + if (err) { kfree(dpcm); - if (err == HPI_ERROR_OBJ_ALREADY_OPEN) - return -EBUSY; - if (err) + if (err == HPI_ERROR_OBJ_ALREADY_OPEN) + return -EBUSY; return -EIO; + } timer_setup(&dpcm->timer, snd_card_asihpi_timer_function, 0); dpcm->substream = substream; From fdd7c7e0d2ab3987882c570612d4622f437292c7 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 4 Sep 2025 21:41:28 -0700 Subject: [PATCH 1607/3206] rust: Introduce atomic API helpers In order to support LKMM atomics in Rust, add rust_helper_* for atomic APIs. These helpers ensure the implementation of LKMM atomics in Rust is the same as in C. This could save the maintenance burden of having two similar atomic implementations in asm. 
Originally-by: Mark Rutland Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/all/20250719030827.61357-2-boqun.feng@gmail.com/ --- rust/helpers/atomic.c | 1040 +++++++++++++++++++++ rust/helpers/helpers.c | 1 + scripts/atomic/gen-atomics.sh | 1 + scripts/atomic/gen-rust-atomic-helpers.sh | 67 ++ 4 files changed, 1109 insertions(+) create mode 100644 rust/helpers/atomic.c create mode 100755 scripts/atomic/gen-rust-atomic-helpers.sh diff --git a/rust/helpers/atomic.c b/rust/helpers/atomic.c new file mode 100644 index 00000000000000..cf06b7ef9a1c55 --- /dev/null +++ b/rust/helpers/atomic.c @@ -0,0 +1,1040 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Generated by scripts/atomic/gen-rust-atomic-helpers.sh +// DO NOT MODIFY THIS FILE DIRECTLY + +/* + * This file provides helpers for the various atomic functions for Rust. + */ +#ifndef _RUST_ATOMIC_API_H +#define _RUST_ATOMIC_API_H + +#include + +// TODO: Remove this after INLINE_HELPERS support is added. +#ifndef __rust_helper +#define __rust_helper +#endif + +__rust_helper int +rust_helper_atomic_read(const atomic_t *v) +{ + return atomic_read(v); +} + +__rust_helper int +rust_helper_atomic_read_acquire(const atomic_t *v) +{ + return atomic_read_acquire(v); +} + +__rust_helper void +rust_helper_atomic_set(atomic_t *v, int i) +{ + atomic_set(v, i); +} + +__rust_helper void +rust_helper_atomic_set_release(atomic_t *v, int i) +{ + atomic_set_release(v, i); +} + +__rust_helper void +rust_helper_atomic_add(int i, atomic_t *v) +{ + atomic_add(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return(int i, atomic_t *v) +{ + return atomic_add_return(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return_acquire(int i, atomic_t *v) +{ + return atomic_add_return_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return_release(int i, atomic_t *v) +{ + return atomic_add_return_release(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return_relaxed(int i, atomic_t *v) +{ + return atomic_add_return_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add(int i, atomic_t *v) +{ + return atomic_fetch_add(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_acquire(int i, atomic_t *v) +{ + return atomic_fetch_add_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_release(int i, atomic_t *v) +{ + return atomic_fetch_add_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_add_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_sub(int i, atomic_t *v) +{ + atomic_sub(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return(int i, atomic_t *v) +{ + return atomic_sub_return(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return_acquire(int i, atomic_t *v) +{ + return atomic_sub_return_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return_release(int i, atomic_t *v) +{ + return atomic_sub_return_release(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return_relaxed(int i, atomic_t *v) +{ + return atomic_sub_return_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub(int i, atomic_t *v) +{ + return atomic_fetch_sub(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub_acquire(int i, atomic_t *v) +{ + return atomic_fetch_sub_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub_release(int i, atomic_t *v) +{ + return atomic_fetch_sub_release(i, v); +} + +__rust_helper int 
+rust_helper_atomic_fetch_sub_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_sub_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_inc(atomic_t *v) +{ + atomic_inc(v); +} + +__rust_helper int +rust_helper_atomic_inc_return(atomic_t *v) +{ + return atomic_inc_return(v); +} + +__rust_helper int +rust_helper_atomic_inc_return_acquire(atomic_t *v) +{ + return atomic_inc_return_acquire(v); +} + +__rust_helper int +rust_helper_atomic_inc_return_release(atomic_t *v) +{ + return atomic_inc_return_release(v); +} + +__rust_helper int +rust_helper_atomic_inc_return_relaxed(atomic_t *v) +{ + return atomic_inc_return_relaxed(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc(atomic_t *v) +{ + return atomic_fetch_inc(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc_acquire(atomic_t *v) +{ + return atomic_fetch_inc_acquire(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc_release(atomic_t *v) +{ + return atomic_fetch_inc_release(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc_relaxed(atomic_t *v) +{ + return atomic_fetch_inc_relaxed(v); +} + +__rust_helper void +rust_helper_atomic_dec(atomic_t *v) +{ + atomic_dec(v); +} + +__rust_helper int +rust_helper_atomic_dec_return(atomic_t *v) +{ + return atomic_dec_return(v); +} + +__rust_helper int +rust_helper_atomic_dec_return_acquire(atomic_t *v) +{ + return atomic_dec_return_acquire(v); +} + +__rust_helper int +rust_helper_atomic_dec_return_release(atomic_t *v) +{ + return atomic_dec_return_release(v); +} + +__rust_helper int +rust_helper_atomic_dec_return_relaxed(atomic_t *v) +{ + return atomic_dec_return_relaxed(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec(atomic_t *v) +{ + return atomic_fetch_dec(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec_acquire(atomic_t *v) +{ + return atomic_fetch_dec_acquire(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec_release(atomic_t *v) +{ + return atomic_fetch_dec_release(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec_relaxed(atomic_t *v) +{ + return atomic_fetch_dec_relaxed(v); +} + +__rust_helper void +rust_helper_atomic_and(int i, atomic_t *v) +{ + atomic_and(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and(int i, atomic_t *v) +{ + return atomic_fetch_and(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and_acquire(int i, atomic_t *v) +{ + return atomic_fetch_and_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and_release(int i, atomic_t *v) +{ + return atomic_fetch_and_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_and_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_andnot(int i, atomic_t *v) +{ + atomic_andnot(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot(int i, atomic_t *v) +{ + return atomic_fetch_andnot(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot_acquire(int i, atomic_t *v) +{ + return atomic_fetch_andnot_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot_release(int i, atomic_t *v) +{ + return atomic_fetch_andnot_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_andnot_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_or(int i, atomic_t *v) +{ + atomic_or(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or(int i, atomic_t *v) +{ + return atomic_fetch_or(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or_acquire(int i, atomic_t *v) +{ + return 
atomic_fetch_or_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or_release(int i, atomic_t *v) +{ + return atomic_fetch_or_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_or_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_xor(int i, atomic_t *v) +{ + atomic_xor(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor(int i, atomic_t *v) +{ + return atomic_fetch_xor(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor_acquire(int i, atomic_t *v) +{ + return atomic_fetch_xor_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor_release(int i, atomic_t *v) +{ + return atomic_fetch_xor_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_xor_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_xchg(atomic_t *v, int new) +{ + return atomic_xchg(v, new); +} + +__rust_helper int +rust_helper_atomic_xchg_acquire(atomic_t *v, int new) +{ + return atomic_xchg_acquire(v, new); +} + +__rust_helper int +rust_helper_atomic_xchg_release(atomic_t *v, int new) +{ + return atomic_xchg_release(v, new); +} + +__rust_helper int +rust_helper_atomic_xchg_relaxed(atomic_t *v, int new) +{ + return atomic_xchg_relaxed(v, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg(v, old, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg_acquire(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg_acquire(v, old, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg_release(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg_release(v, old, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg_acquire(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg_release(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_sub_and_test(int i, atomic_t *v) +{ + return atomic_sub_and_test(i, v); +} + +__rust_helper bool +rust_helper_atomic_dec_and_test(atomic_t *v) +{ + return atomic_dec_and_test(v); +} + +__rust_helper bool +rust_helper_atomic_inc_and_test(atomic_t *v) +{ + return atomic_inc_and_test(v); +} + +__rust_helper bool +rust_helper_atomic_add_negative(int i, atomic_t *v) +{ + return atomic_add_negative(i, v); +} + +__rust_helper bool +rust_helper_atomic_add_negative_acquire(int i, atomic_t *v) +{ + return atomic_add_negative_acquire(i, v); +} + +__rust_helper bool +rust_helper_atomic_add_negative_release(int i, atomic_t *v) +{ + return atomic_add_negative_release(i, v); +} + +__rust_helper bool +rust_helper_atomic_add_negative_relaxed(int i, atomic_t *v) +{ + return atomic_add_negative_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_unless(atomic_t *v, int a, int u) +{ + return atomic_fetch_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic_add_unless(atomic_t *v, int a, int u) +{ + return atomic_add_unless(v, a, u); 
+} + +__rust_helper bool +rust_helper_atomic_inc_not_zero(atomic_t *v) +{ + return atomic_inc_not_zero(v); +} + +__rust_helper bool +rust_helper_atomic_inc_unless_negative(atomic_t *v) +{ + return atomic_inc_unless_negative(v); +} + +__rust_helper bool +rust_helper_atomic_dec_unless_positive(atomic_t *v) +{ + return atomic_dec_unless_positive(v); +} + +__rust_helper int +rust_helper_atomic_dec_if_positive(atomic_t *v) +{ + return atomic_dec_if_positive(v); +} + +__rust_helper s64 +rust_helper_atomic64_read(const atomic64_t *v) +{ + return atomic64_read(v); +} + +__rust_helper s64 +rust_helper_atomic64_read_acquire(const atomic64_t *v) +{ + return atomic64_read_acquire(v); +} + +__rust_helper void +rust_helper_atomic64_set(atomic64_t *v, s64 i) +{ + atomic64_set(v, i); +} + +__rust_helper void +rust_helper_atomic64_set_release(atomic64_t *v, s64 i) +{ + atomic64_set_release(v, i); +} + +__rust_helper void +rust_helper_atomic64_add(s64 i, atomic64_t *v) +{ + atomic64_add(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return(s64 i, atomic64_t *v) +{ + return atomic64_add_return(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return_acquire(s64 i, atomic64_t *v) +{ + return atomic64_add_return_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return_release(s64 i, atomic64_t *v) +{ + return atomic64_add_return_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_add_return_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_sub(s64 i, atomic64_t *v) +{ + atomic64_sub(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return(s64 i, atomic64_t *v) +{ + return atomic64_sub_return(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return_acquire(s64 i, atomic64_t *v) +{ + return atomic64_sub_return_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return_release(s64 i, atomic64_t *v) +{ + return atomic64_sub_return_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_sub_return_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_inc(atomic64_t *v) +{ + atomic64_inc(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return(atomic64_t *v) +{ + return atomic64_inc_return(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return_acquire(atomic64_t *v) +{ + return atomic64_inc_return_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return_release(atomic64_t *v) 
+{ + return atomic64_inc_return_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return_relaxed(atomic64_t *v) +{ + return atomic64_inc_return_relaxed(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc(atomic64_t *v) +{ + return atomic64_fetch_inc(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc_acquire(atomic64_t *v) +{ + return atomic64_fetch_inc_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc_release(atomic64_t *v) +{ + return atomic64_fetch_inc_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc_relaxed(atomic64_t *v) +{ + return atomic64_fetch_inc_relaxed(v); +} + +__rust_helper void +rust_helper_atomic64_dec(atomic64_t *v) +{ + atomic64_dec(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_return(atomic64_t *v) +{ + return atomic64_dec_return(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_return_acquire(atomic64_t *v) +{ + return atomic64_dec_return_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_return_release(atomic64_t *v) +{ + return atomic64_dec_return_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_return_relaxed(atomic64_t *v) +{ + return atomic64_dec_return_relaxed(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec(atomic64_t *v) +{ + return atomic64_fetch_dec(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec_acquire(atomic64_t *v) +{ + return atomic64_fetch_dec_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec_release(atomic64_t *v) +{ + return atomic64_fetch_dec_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec_relaxed(atomic64_t *v) +{ + return atomic64_fetch_dec_relaxed(v); +} + +__rust_helper void +rust_helper_atomic64_and(s64 i, atomic64_t *v) +{ + atomic64_and(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_andnot(s64 i, atomic64_t *v) +{ + atomic64_andnot(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_or(s64 i, atomic64_t *v) +{ + atomic64_or(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or_relaxed(i, v); +} + +__rust_helper void 
+rust_helper_atomic64_xor(s64 i, atomic64_t *v) +{ + atomic64_xor(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_xchg(atomic64_t *v, s64 new) +{ + return atomic64_xchg(v, new); +} + +__rust_helper s64 +rust_helper_atomic64_xchg_acquire(atomic64_t *v, s64 new) +{ + return atomic64_xchg_acquire(v, new); +} + +__rust_helper s64 +rust_helper_atomic64_xchg_release(atomic64_t *v, s64 new) +{ + return atomic64_xchg_release(v, new); +} + +__rust_helper s64 +rust_helper_atomic64_xchg_relaxed(atomic64_t *v, s64 new) +{ + return atomic64_xchg_relaxed(v, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg(v, old, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg_acquire(v, old, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg_release(v, old, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg_acquire(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg_release(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_sub_and_test(s64 i, atomic64_t *v) +{ + return atomic64_sub_and_test(i, v); +} + +__rust_helper bool +rust_helper_atomic64_dec_and_test(atomic64_t *v) +{ + return atomic64_dec_and_test(v); +} + +__rust_helper bool +rust_helper_atomic64_inc_and_test(atomic64_t *v) +{ + return atomic64_inc_and_test(v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative(s64 i, atomic64_t *v) +{ + return atomic64_add_negative(i, v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative_acquire(s64 i, atomic64_t *v) +{ + return atomic64_add_negative_acquire(i, v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative_release(s64 i, atomic64_t *v) +{ + return atomic64_add_negative_release(i, v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_add_negative_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) +{ + return atomic64_fetch_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) +{ + return atomic64_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic64_inc_not_zero(atomic64_t *v) +{ + return atomic64_inc_not_zero(v); +} + +__rust_helper bool 
+rust_helper_atomic64_inc_unless_negative(atomic64_t *v) +{ + return atomic64_inc_unless_negative(v); +} + +__rust_helper bool +rust_helper_atomic64_dec_unless_positive(atomic64_t *v) +{ + return atomic64_dec_unless_positive(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_if_positive(atomic64_t *v) +{ + return atomic64_dec_if_positive(v); +} + +#endif /* _RUST_ATOMIC_API_H */ +// 615a0e0c98b5973a47fe4fa65e92935051ca00ed diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 7cf7fe95e41dd5..7053f9245759a9 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -7,6 +7,7 @@ * Sorted alphabetically. */ +#include "atomic.c" #include "auxiliary.c" #include "blk.c" #include "bug.c" diff --git a/scripts/atomic/gen-atomics.sh b/scripts/atomic/gen-atomics.sh index 5b98a83076932f..02508d0d6fe45e 100755 --- a/scripts/atomic/gen-atomics.sh +++ b/scripts/atomic/gen-atomics.sh @@ -11,6 +11,7 @@ cat < ${LINUXDIR}/include/${header} diff --git a/scripts/atomic/gen-rust-atomic-helpers.sh b/scripts/atomic/gen-rust-atomic-helpers.sh new file mode 100755 index 00000000000000..45b1e100ed7c63 --- /dev/null +++ b/scripts/atomic/gen-rust-atomic-helpers.sh @@ -0,0 +1,67 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +ATOMICDIR=$(dirname $0) + +. ${ATOMICDIR}/atomic-tbl.sh + +#gen_proto_order_variant(meta, pfx, name, sfx, order, atomic, int, arg...) +gen_proto_order_variant() +{ + local meta="$1"; shift + local pfx="$1"; shift + local name="$1"; shift + local sfx="$1"; shift + local order="$1"; shift + local atomic="$1"; shift + local int="$1"; shift + + local atomicname="${atomic}_${pfx}${name}${sfx}${order}" + + local ret="$(gen_ret_type "${meta}" "${int}")" + local params="$(gen_params "${int}" "${atomic}" "$@")" + local args="$(gen_args "$@")" + local retstmt="$(gen_ret_stmt "${meta}")" + +cat < + +// TODO: Remove this after INLINE_HELPERS support is added. +#ifndef __rust_helper +#define __rust_helper +#endif + +EOF + +grep '^[a-z]' "$1" | while read name meta args; do + gen_proto "${meta}" "${name}" "atomic" "int" ${args} +done + +grep '^[a-z]' "$1" | while read name meta args; do + gen_proto "${meta}" "${name}" "atomic64" "s64" ${args} +done + +cat < Date: Thu, 4 Sep 2025 21:41:29 -0700 Subject: [PATCH 1608/3206] rust: sync: Add basic atomic operation mapping framework Preparation for generic atomic implementation. To unify the implementation of a generic method over `i32` and `i64`, the C side atomic methods need to be grouped so that in a generic method, they can be referred as ::, otherwise their parameters and return value are different between `i32` and `i64`, which would require using `transmute()` to unify the type into a `T`. Introduce `AtomicImpl` to represent a basic type in Rust that has the direct mapping to an atomic implementation from C. Use a sealed trait to restrict `AtomicImpl` to only support `i32` and `i64` for now. Further, different methods are put into different `*Ops` trait groups, and this is for the future when smaller types like `i8`/`i16` are supported but only with a limited set of API (e.g. only set(), load(), xchg() and cmpxchg(), no add() or sub() etc). While the atomic mod is introduced, documentation is also added for memory models and data races. Also bump my role to the maintainer of ATOMIC INFRASTRUCTURE to reflect my responsibility on the Rust atomic mod. 
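The grouping trick itself is plain Rust: a sealed trait lets one generic
method resolve to either the 32-bit or the 64-bit C helper at compile time,
with no transmute. A standalone sketch of that shape (illustrative names;
plain pointer reads stand in for the generated helpers):

	mod private {
	    pub trait Sealed {}
	    impl Sealed for i32 {}
	    impl Sealed for i64 {}
	}

	trait AtomicImpl: private::Sealed + Copy {
	    fn atomic_read(ptr: *mut Self) -> Self;
	}

	impl AtomicImpl for i32 {
	    fn atomic_read(ptr: *mut Self) -> Self {
	        // Would call rust_helper_atomic_read() in the kernel.
	        unsafe { ptr.read() }
	    }
	}

	impl AtomicImpl for i64 {
	    fn atomic_read(ptr: *mut Self) -> Self {
	        // Would call rust_helper_atomic64_read() in the kernel.
	        unsafe { ptr.read() }
	    }
	}

	// `T::atomic_read(...)` picks the width-specific implementation,
	// so one generic body serves both `i32` and `i64`.
	fn load<T: AtomicImpl>(v: &mut T) -> T {
	    T::atomic_read(core::ptr::from_mut(v))
	}

	fn main() {
	    let (mut a, mut b) = (7_i32, 9_i64);
	    println!("{} {}", load(&mut a), load(&mut b));
	}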
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Reviewed-by: Elle Rhumsaa Link: https://lore.kernel.org/all/20250719030827.61357-3-boqun.feng@gmail.com/ --- MAINTAINERS | 4 +- rust/kernel/sync.rs | 1 + rust/kernel/sync/atomic.rs | 22 +++ rust/kernel/sync/atomic/internal.rs | 265 ++++++++++++++++++++++++++++ 4 files changed, 291 insertions(+), 1 deletion(-) create mode 100644 rust/kernel/sync/atomic.rs create mode 100644 rust/kernel/sync/atomic/internal.rs diff --git a/MAINTAINERS b/MAINTAINERS index cd7ff55b5d3217..538778b81fd4cc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3990,7 +3990,7 @@ F: drivers/input/touchscreen/atmel_mxt_ts.c ATOMIC INFRASTRUCTURE M: Will Deacon M: Peter Zijlstra -R: Boqun Feng +M: Boqun Feng R: Mark Rutland L: linux-kernel@vger.kernel.org S: Maintained @@ -3999,6 +3999,8 @@ F: arch/*/include/asm/atomic*.h F: include/*/atomic*.h F: include/linux/refcount.h F: scripts/atomic/ +F: rust/kernel/sync/atomic.rs +F: rust/kernel/sync/atomic/ ATTO EXPRESSSAS SAS/SATA RAID SCSI DRIVER M: Bradley Grove diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index 00f9b558a3ade1..7e962e5429d2e9 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -11,6 +11,7 @@ use pin_init; mod arc; pub mod aref; +pub mod atomic; pub mod completion; mod condvar; pub mod lock; diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs new file mode 100644 index 00000000000000..b9f2f478007326 --- /dev/null +++ b/rust/kernel/sync/atomic.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Atomic primitives. +//! +//! These primitives have the same semantics as their C counterparts: and the precise definitions of +//! semantics can be found at [`LKMM`]. Note that Linux Kernel Memory (Consistency) Model is the +//! only model for Rust code in kernel, and Rust's own atomics should be avoided. +//! +//! # Data races +//! +//! [`LKMM`] atomics have different rules regarding data races: +//! +//! - A normal write from C side is treated as an atomic write if +//! CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=y. +//! - Mixed-size atomic accesses don't cause data races. +//! +//! [`LKMM`]: srctree/tools/memory-model/ + +#[allow(dead_code, unreachable_pub)] +mod internal; + +pub use internal::AtomicImpl; diff --git a/rust/kernel/sync/atomic/internal.rs b/rust/kernel/sync/atomic/internal.rs new file mode 100644 index 00000000000000..6fdd8e59f45be8 --- /dev/null +++ b/rust/kernel/sync/atomic/internal.rs @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Atomic internal implementations. +//! +//! Provides 1:1 mapping to the C atomic operations. + +use crate::bindings; +use crate::macros::paste; +use core::cell::UnsafeCell; + +mod private { + /// Sealed trait marker to disable customized impls on atomic implementation traits. + pub trait Sealed {} +} + +// `i32` and `i64` are only supported atomic implementations. +impl private::Sealed for i32 {} +impl private::Sealed for i64 {} + +/// A marker trait for types that implement atomic operations with C side primitives. +/// +/// This trait is sealed, and only types that have directly mapping to the C side atomics should +/// impl this: +/// +/// - `i32` maps to `atomic_t`. +/// - `i64` maps to `atomic64_t`. +pub trait AtomicImpl: Sized + Send + Copy + private::Sealed { + /// The type of the delta in arithmetic or logical operations. + /// + /// For example, in `atomic_add(ptr, v)`, it's the type of `v`. 
Usually it's the same type of + /// [`Self`], but it may be different for the atomic pointer type. + type Delta; +} + +// `atomic_t` implements atomic operations on `i32`. +impl AtomicImpl for i32 { + type Delta = Self; +} + +// `atomic64_t` implements atomic operations on `i64`. +impl AtomicImpl for i64 { + type Delta = Self; +} + +/// Atomic representation. +#[repr(transparent)] +pub struct AtomicRepr(UnsafeCell); + +impl AtomicRepr { + /// Creates a new atomic representation `T`. + pub const fn new(v: T) -> Self { + Self(UnsafeCell::new(v)) + } + + /// Returns a pointer to the underlying `T`. + /// + /// # Guarantees + /// + /// The returned pointer is valid and properly aligned (i.e. aligned to [`align_of::()`]). + pub const fn as_ptr(&self) -> *mut T { + // GUARANTEE: `self.0` is an `UnsafeCell`, therefore the pointer returned by `.get()` + // must be valid and properly aligned. + self.0.get() + } +} + +// This macro generates the function signature with given argument list and return type. +macro_rules! declare_atomic_method { + ( + $(#[doc=$doc:expr])* + $func:ident($($arg:ident : $arg_type:ty),*) $(-> $ret:ty)? + ) => { + paste!( + $(#[doc = $doc])* + fn [< atomic_ $func >]($($arg: $arg_type,)*) $(-> $ret)?; + ); + }; + ( + $(#[doc=$doc:expr])* + $func:ident [$variant:ident $($rest:ident)*]($($arg_sig:tt)*) $(-> $ret:ty)? + ) => { + paste!( + declare_atomic_method!( + $(#[doc = $doc])* + [< $func _ $variant >]($($arg_sig)*) $(-> $ret)? + ); + ); + + declare_atomic_method!( + $(#[doc = $doc])* + $func [$($rest)*]($($arg_sig)*) $(-> $ret)? + ); + }; + ( + $(#[doc=$doc:expr])* + $func:ident []($($arg_sig:tt)*) $(-> $ret:ty)? + ) => { + declare_atomic_method!( + $(#[doc = $doc])* + $func($($arg_sig)*) $(-> $ret)? + ); + } +} + +// This macro generates the function implementation with given argument list and return type, and it +// will replace "call(...)" expression with "$ctype _ $func" to call the real C function. +macro_rules! impl_atomic_method { + ( + ($ctype:ident) $func:ident($($arg:ident: $arg_type:ty),*) $(-> $ret:ty)? { + $unsafe:tt { call($($c_arg:expr),*) } + } + ) => { + paste!( + #[inline(always)] + fn [< atomic_ $func >]($($arg: $arg_type,)*) $(-> $ret)? { + // TODO: Ideally we want to use the SAFETY comments written at the macro invocation + // (e.g. in `declare_and_impl_atomic_methods!()`, however, since SAFETY comments + // are just comments, and they are not passed to macros as tokens, therefore we + // cannot use them here. One potential improvement is that if we support using + // attributes as an alternative for SAFETY comments, then we can use that for macro + // generating code. + // + // SAFETY: specified on macro invocation. + $unsafe { bindings::[< $ctype _ $func >]($($c_arg,)*) } + } + ); + }; + ( + ($ctype:ident) $func:ident[$variant:ident $($rest:ident)*]($($arg_sig:tt)*) $(-> $ret:ty)? { + $unsafe:tt { call($($arg:tt)*) } + } + ) => { + paste!( + impl_atomic_method!( + ($ctype) [< $func _ $variant >]($($arg_sig)*) $( -> $ret)? { + $unsafe { call($($arg)*) } + } + ); + ); + impl_atomic_method!( + ($ctype) $func [$($rest)*]($($arg_sig)*) $( -> $ret)? { + $unsafe { call($($arg)*) } + } + ); + }; + ( + ($ctype:ident) $func:ident[]($($arg_sig:tt)*) $( -> $ret:ty)? { + $unsafe:tt { call($($arg:tt)*) } + } + ) => { + impl_atomic_method!( + ($ctype) $func($($arg_sig)*) $(-> $ret)? { + $unsafe { call($($arg)*) } + } + ); + } +} + +// Delcares $ops trait with methods and implements the trait for `i32` and `i64`. +macro_rules! 
declare_and_impl_atomic_methods { + ($(#[$attr:meta])* $pub:vis trait $ops:ident { + $( + $(#[doc=$doc:expr])* + fn $func:ident [$($variant:ident),*]($($arg_sig:tt)*) $( -> $ret:ty)? { + $unsafe:tt { bindings::#call($($arg:tt)*) } + } + )* + }) => { + $(#[$attr])* + $pub trait $ops: AtomicImpl { + $( + declare_atomic_method!( + $(#[doc=$doc])* + $func[$($variant)*]($($arg_sig)*) $(-> $ret)? + ); + )* + } + + impl $ops for i32 { + $( + impl_atomic_method!( + (atomic) $func[$($variant)*]($($arg_sig)*) $(-> $ret)? { + $unsafe { call($($arg)*) } + } + ); + )* + } + + impl $ops for i64 { + $( + impl_atomic_method!( + (atomic64) $func[$($variant)*]($($arg_sig)*) $(-> $ret)? { + $unsafe { call($($arg)*) } + } + ); + )* + } + } +} + +declare_and_impl_atomic_methods!( + /// Basic atomic operations + pub trait AtomicBasicOps { + /// Atomic read (load). + fn read[acquire](a: &AtomicRepr) -> Self { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast()) } + } + + /// Atomic set (store). + fn set[release](a: &AtomicRepr, v: Self) { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast(), v) } + } + } +); + +declare_and_impl_atomic_methods!( + /// Exchange and compare-and-exchange atomic operations + pub trait AtomicExchangeOps { + /// Atomic exchange. + /// + /// Atomically updates `*a` to `v` and returns the old value. + fn xchg[acquire, release, relaxed](a: &AtomicRepr, v: Self) -> Self { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast(), v) } + } + + /// Atomic compare and exchange. + /// + /// If `*a` == `*old`, atomically updates `*a` to `new`. Otherwise, `*a` is not + /// modified, `*old` is updated to the current value of `*a`. + /// + /// Return `true` if the update of `*a` occurred, `false` otherwise. + fn try_cmpxchg[acquire, release, relaxed]( + a: &AtomicRepr, old: &mut Self, new: Self + ) -> bool { + // SAFETY: `a.as_ptr()` is valid and properly aligned. `core::ptr::from_mut(old)` + // is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast(), core::ptr::from_mut(old), new) } + } + } +); + +declare_and_impl_atomic_methods!( + /// Atomic arithmetic operations + pub trait AtomicArithmeticOps { + /// Atomic add (wrapping). + /// + /// Atomically updates `*a` to `(*a).wrapping_add(v)`. + fn add[](a: &AtomicRepr, v: Self::Delta) { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(v, a.as_ptr().cast()) } + } + + /// Atomic fetch and add (wrapping). + /// + /// Atomically updates `*a` to `(*a).wrapping_add(v)`, and returns the value of `*a` + /// before the update. + fn fetch_add[acquire, release, relaxed](a: &AtomicRepr, v: Self::Delta) -> Self { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(v, a.as_ptr().cast()) } + } + } +); From b638c9bc471030ebd898b57c5bf7c96f6d70cda4 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 4 Sep 2025 21:41:30 -0700 Subject: [PATCH 1609/3206] rust: sync: atomic: Add ordering annotation types Preparation for atomic primitives. Instead of a suffix like _acquire, a method parameter along with the corresponding generic parameter will be used to specify the ordering of an atomic operations. For example, atomic load() can be defined as: impl Atomic { pub fn load(&self, _o: O) -> T { ... 
    }
}

and acquire users would do:

    let r = x.load(Acquire);

relaxed users:

    let r = x.load(Relaxed);

doing the following:

    let r = x.load(Release);

will cause a compiler error.

Compared to suffixes, it's easier to tell what ordering variants an operation
has, and it also makes it easier to unify the implementation of all ordering
variants in one method via generics.

The `TYPE` associated const is for generic functions to pick up the particular
implementation specified by an ordering annotation.

Signed-off-by: Boqun Feng
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Alice Ryhl
Reviewed-by: Benno Lossin
Reviewed-by: Elle Rhumsaa
Link: https://lore.kernel.org/all/20250719030827.61357-4-boqun.feng@gmail.com/
---
 rust/kernel/sync/atomic.rs          |   2 +
 rust/kernel/sync/atomic/ordering.rs | 104 ++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 rust/kernel/sync/atomic/ordering.rs

diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs
index b9f2f478007326..2302e6d51fe2b1 100644
--- a/rust/kernel/sync/atomic.rs
+++ b/rust/kernel/sync/atomic.rs
@@ -18,5 +18,7 @@
 #[allow(dead_code, unreachable_pub)]
 mod internal;
+pub mod ordering;
 
 pub use internal::AtomicImpl;
+pub use ordering::{Acquire, Full, Relaxed, Release};
diff --git a/rust/kernel/sync/atomic/ordering.rs b/rust/kernel/sync/atomic/ordering.rs
new file mode 100644
index 00000000000000..3f103aa8db99b1
--- /dev/null
+++ b/rust/kernel/sync/atomic/ordering.rs
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Memory orderings.
+//!
+//! The semantics of these orderings follow the [`LKMM`] definitions and rules.
+//!
+//! - [`Acquire`] provides ordering between the load part of the annotated operation and all the
+//!   following memory accesses, and if there is a store part, the store part has the [`Relaxed`]
+//!   ordering.
+//! - [`Release`] provides ordering between all the preceding memory accesses and the store part of
+//!   the annotated operation, and if there is a load part, the load part has the [`Relaxed`]
+//!   ordering.
+//! - [`Full`] means "fully-ordered", that is:
+//!   - It provides ordering between all the preceding memory accesses and the annotated operation.
+//!   - It provides ordering between the annotated operation and all the following memory accesses.
+//!   - It provides ordering between all the preceding memory accesses and all the following memory
+//!     accesses.
+//!   - All the orderings are the same strength as a full memory barrier (i.e. `smp_mb()`).
+//! - [`Relaxed`] provides no ordering except the dependency orderings. Dependency orderings are
+//!   described in "DEPENDENCY RELATIONS" in [`LKMM`]'s [`explanation`].
+//!
+//! [`LKMM`]: srctree/tools/memory-model/
+//! [`explanation`]: srctree/tools/memory-model/Documentation/explanation.txt
+
+/// The annotation type for relaxed memory ordering, for the description of relaxed memory
+/// ordering, see [module-level documentation].
+///
+/// [module-level documentation]: crate::sync::atomic::ordering
+pub struct Relaxed;
+
+/// The annotation type for acquire memory ordering, for the description of acquire memory
+/// ordering, see [module-level documentation].
+///
+/// [module-level documentation]: crate::sync::atomic::ordering
+pub struct Acquire;
+
+/// The annotation type for release memory ordering, for the description of release memory
+/// ordering, see [module-level documentation].
+/// +/// [module-level documentation]: crate::sync::atomic::ordering +pub struct Release; + +/// The annotation type for fully-ordered memory ordering, for the description fully-ordered memory +/// ordering, see [module-level documentation]. +/// +/// [module-level documentation]: crate::sync::atomic::ordering +pub struct Full; + +/// Describes the exact memory ordering. +#[doc(hidden)] +pub enum OrderingType { + /// Relaxed ordering. + Relaxed, + /// Acquire ordering. + Acquire, + /// Release ordering. + Release, + /// Fully-ordered. + Full, +} + +mod internal { + /// Sealed trait, can be only implemented inside atomic mod. + pub trait Sealed {} + + impl Sealed for super::Relaxed {} + impl Sealed for super::Acquire {} + impl Sealed for super::Release {} + impl Sealed for super::Full {} +} + +/// The trait bound for annotating operations that support any ordering. +pub trait Ordering: internal::Sealed { + /// Describes the exact memory ordering. + const TYPE: OrderingType; +} + +impl Ordering for Relaxed { + const TYPE: OrderingType = OrderingType::Relaxed; +} + +impl Ordering for Acquire { + const TYPE: OrderingType = OrderingType::Acquire; +} + +impl Ordering for Release { + const TYPE: OrderingType = OrderingType::Release; +} + +impl Ordering for Full { + const TYPE: OrderingType = OrderingType::Full; +} + +/// The trait bound for operations that only support acquire or relaxed ordering. +pub trait AcquireOrRelaxed: Ordering {} + +impl AcquireOrRelaxed for Acquire {} +impl AcquireOrRelaxed for Relaxed {} + +/// The trait bound for operations that only support release or relaxed ordering. +pub trait ReleaseOrRelaxed: Ordering {} + +impl ReleaseOrRelaxed for Release {} +impl ReleaseOrRelaxed for Relaxed {} From 29c32c405e53605dfd24054a4460516f7f6e3938 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 4 Sep 2025 21:41:31 -0700 Subject: [PATCH 1610/3206] rust: sync: atomic: Add generic atomics To provide using LKMM atomics for Rust code, a generic `Atomic` is added, currently `T` needs to be Send + Copy because these are the straightforward usages and all basic types support this. Implement `AtomicType` for `i32` and `i64`, and so far only basic operations load() and store() are introduced. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Reviewed-by: Elle Rhumsaa Link: https://lore.kernel.org/all/20250719030827.61357-5-boqun.feng@gmail.com/ --- rust/kernel/sync/atomic.rs | 271 +++++++++++++++++++++++++++ rust/kernel/sync/atomic/predefine.rs | 15 ++ 2 files changed, 286 insertions(+) create mode 100644 rust/kernel/sync/atomic/predefine.rs diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs index 2302e6d51fe2b1..ea5782b6ee9591 100644 --- a/rust/kernel/sync/atomic.rs +++ b/rust/kernel/sync/atomic.rs @@ -19,6 +19,277 @@ #[allow(dead_code, unreachable_pub)] mod internal; pub mod ordering; +mod predefine; pub use internal::AtomicImpl; pub use ordering::{Acquire, Full, Relaxed, Release}; + +use crate::build_error; +use internal::{AtomicBasicOps, AtomicRepr}; +use ordering::OrderingType; + +/// A memory location which can be safely modified from multiple execution contexts. +/// +/// This has the same size, alignment and bit validity as the underlying type `T`. And it disables +/// niche optimization for the same reason as [`UnsafeCell`]. 
+/// +/// The atomic operations are implemented in a way that is fully compatible with the [Linux Kernel +/// Memory (Consistency) Model][LKMM], hence they should be modeled as the corresponding +/// [`LKMM`][LKMM] atomic primitives. With the help of [`Atomic::from_ptr()`] and +/// [`Atomic::as_ptr()`], this provides a way to interact with [C-side atomic operations] +/// (including those without the `atomic` prefix, e.g. `READ_ONCE()`, `WRITE_ONCE()`, +/// `smp_load_acquire()` and `smp_store_release()`). +/// +/// # Invariants +/// +/// `self.0` is a valid `T`. +/// +/// [`UnsafeCell`]: core::cell::UnsafeCell +/// [LKMM]: srctree/tools/memory-model/ +/// [C-side atomic operations]: srctree/Documentation/atomic_t.txt +#[repr(transparent)] +pub struct Atomic(AtomicRepr); + +// SAFETY: `Atomic` is safe to share among execution contexts because all accesses are atomic. +unsafe impl Sync for Atomic {} + +/// Types that support basic atomic operations. +/// +/// # Round-trip transmutability +/// +/// `T` is round-trip transmutable to `U` if and only if both of these properties hold: +/// +/// - Any valid bit pattern for `T` is also a valid bit pattern for `U`. +/// - Transmuting (e.g. using [`transmute()`]) a value of type `T` to `U` and then to `T` again +/// yields a value that is in all aspects equivalent to the original value. +/// +/// # Safety +/// +/// - [`Self`] must have the same size and alignment as [`Self::Repr`]. +/// - [`Self`] must be [round-trip transmutable] to [`Self::Repr`]. +/// +/// Note that this is more relaxed than requiring the bi-directional transmutability (i.e. +/// [`transmute()`] is always sound between `U` and `T`) because of the support for atomic +/// variables over unit-only enums, see [Examples]. +/// +/// # Limitations +/// +/// Because C primitives are used to implement the atomic operations, and a C function requires a +/// valid object of a type to operate on (i.e. no `MaybeUninit<_>`), hence at the Rust <-> C +/// surface, only types with all the bits initialized can be passed. As a result, types like `(u8, +/// u16)` (padding bytes are uninitialized) are currently not supported. +/// +/// # Examples +/// +/// A unit-only enum that implements [`AtomicType`]: +/// +/// ``` +/// use kernel::sync::atomic::{AtomicType, Atomic, Relaxed}; +/// +/// #[derive(Clone, Copy, PartialEq, Eq)] +/// #[repr(i32)] +/// enum State { +/// Uninit = 0, +/// Working = 1, +/// Done = 2, +/// }; +/// +/// // SAFETY: `State` and `i32` has the same size and alignment, and it's round-trip +/// // transmutable to `i32`. +/// unsafe impl AtomicType for State { +/// type Repr = i32; +/// } +/// +/// let s = Atomic::new(State::Uninit); +/// +/// assert_eq!(State::Uninit, s.load(Relaxed)); +/// ``` +/// [`transmute()`]: core::mem::transmute +/// [round-trip transmutable]: AtomicType#round-trip-transmutability +/// [Examples]: AtomicType#examples +pub unsafe trait AtomicType: Sized + Send + Copy { + /// The backing atomic implementation type. + type Repr: AtomicImpl; +} + +#[inline(always)] +const fn into_repr(v: T) -> T::Repr { + // SAFETY: Per the safety requirement of `AtomicType`, `T` is round-trip transmutable to + // `T::Repr`, therefore the transmute operation is sound. + unsafe { core::mem::transmute_copy(&v) } +} + +/// # Safety +/// +/// `r` must be a valid bit pattern of `T`. +#[inline(always)] +const unsafe fn from_repr(r: T::Repr) -> T { + // SAFETY: Per the safety requirement of the function, the transmute operation is sound. 
+ unsafe { core::mem::transmute_copy(&r) } +} + +impl Atomic { + /// Creates a new atomic `T`. + pub const fn new(v: T) -> Self { + // INVARIANT: Per the safety requirement of `AtomicType`, `into_repr(v)` is a valid `T`. + Self(AtomicRepr::new(into_repr(v))) + } + + /// Creates a reference to an atomic `T` from a pointer of `T`. + /// + /// This usually is used when communicating with C side or manipulating a C struct, see + /// examples below. + /// + /// # Safety + /// + /// - `ptr` is aligned to `align_of::()`. + /// - `ptr` is valid for reads and writes for `'a`. + /// - For the duration of `'a`, other accesses to `*ptr` must not cause data races (defined + /// by [`LKMM`]) against atomic operations on the returned reference. Note that if all other + /// accesses are atomic, then this safety requirement is trivially fulfilled. + /// + /// [`LKMM`]: srctree/tools/memory-model + /// + /// # Examples + /// + /// Using [`Atomic::from_ptr()`] combined with [`Atomic::load()`] or [`Atomic::store()`] can + /// achieve the same functionality as `READ_ONCE()`/`smp_load_acquire()` or + /// `WRITE_ONCE()`/`smp_store_release()` in C side: + /// + /// ``` + /// # use kernel::types::Opaque; + /// use kernel::sync::atomic::{Atomic, Relaxed, Release}; + /// + /// // Assume there is a C struct `foo`. + /// mod cbindings { + /// #[repr(C)] + /// pub(crate) struct foo { + /// pub(crate) a: i32, + /// pub(crate) b: i32 + /// } + /// } + /// + /// let tmp = Opaque::new(cbindings::foo { a: 1, b: 2 }); + /// + /// // struct foo *foo_ptr = ..; + /// let foo_ptr = tmp.get(); + /// + /// // SAFETY: `foo_ptr` is valid, and `.a` is in bounds. + /// let foo_a_ptr = unsafe { &raw mut (*foo_ptr).a }; + /// + /// // a = READ_ONCE(foo_ptr->a); + /// // + /// // SAFETY: `foo_a_ptr` is valid for read, and all other accesses on it is atomic, so no + /// // data race. + /// let a = unsafe { Atomic::from_ptr(foo_a_ptr) }.load(Relaxed); + /// # assert_eq!(a, 1); + /// + /// // smp_store_release(&foo_ptr->a, 2); + /// // + /// // SAFETY: `foo_a_ptr` is valid for writes, and all other accesses on it is atomic, so + /// // no data race. + /// unsafe { Atomic::from_ptr(foo_a_ptr) }.store(2, Release); + /// ``` + pub unsafe fn from_ptr<'a>(ptr: *mut T) -> &'a Self + where + T: Sync, + { + // CAST: `T` and `Atomic` have the same size, alignment and bit validity. + // SAFETY: Per function safety requirement, `ptr` is a valid pointer and the object will + // live long enough. It's safe to return a `&Atomic` because function safety requirement + // guarantees other accesses won't cause data races. + unsafe { &*ptr.cast::() } + } + + /// Returns a pointer to the underlying atomic `T`. + /// + /// Note that use of the return pointer must not cause data races defined by [`LKMM`]. + /// + /// # Guarantees + /// + /// The returned pointer is valid and properly aligned (i.e. aligned to [`align_of::()`]). + /// + /// [`LKMM`]: srctree/tools/memory-model + /// [`align_of::()`]: core::mem::align_of + pub const fn as_ptr(&self) -> *mut T { + // GUARANTEE: Per the function guarantee of `AtomicRepr::as_ptr()`, the `self.0.as_ptr()` + // must be a valid and properly aligned pointer for `T::Repr`, and per the safety guarantee + // of `AtomicType`, it's a valid and properly aligned pointer of `T`. + self.0.as_ptr().cast() + } + + /// Returns a mutable reference to the underlying atomic `T`. + /// + /// This is safe because the mutable reference of the atomic `T` guarantees exclusive access. 
+ pub fn get_mut(&mut self) -> &mut T { + // CAST: `T` and `T::Repr` has the same size and alignment per the safety requirement of + // `AtomicType`, and per the type invariants `self.0` is a valid `T`, therefore the casting + // result is a valid pointer of `T`. + // SAFETY: The pointer is valid per the CAST comment above, and the mutable reference + // guarantees exclusive access. + unsafe { &mut *self.0.as_ptr().cast() } + } +} + +impl Atomic +where + T::Repr: AtomicBasicOps, +{ + /// Loads the value from the atomic `T`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Relaxed}; + /// + /// let x = Atomic::new(42i32); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// let x = Atomic::new(42i64); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// ``` + #[doc(alias("atomic_read", "atomic64_read"))] + #[inline(always)] + pub fn load(&self, _: Ordering) -> T { + let v = { + match Ordering::TYPE { + OrderingType::Relaxed => T::Repr::atomic_read(&self.0), + OrderingType::Acquire => T::Repr::atomic_read_acquire(&self.0), + _ => build_error!("Wrong ordering"), + } + }; + + // SAFETY: `v` comes from reading `self.0`, which is a valid `T` per the type invariants. + unsafe { from_repr(v) } + } + + /// Stores a value to the atomic `T`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Relaxed}; + /// + /// let x = Atomic::new(42i32); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// x.store(43, Relaxed); + /// + /// assert_eq!(43, x.load(Relaxed)); + /// ``` + #[doc(alias("atomic_set", "atomic64_set"))] + #[inline(always)] + pub fn store(&self, v: T, _: Ordering) { + let v = into_repr(v); + + // INVARIANT: `v` is a valid `T`, and is stored to `self.0` by `atomic_set*()`. + match Ordering::TYPE { + OrderingType::Relaxed => T::Repr::atomic_set(&self.0, v), + OrderingType::Release => T::Repr::atomic_set_release(&self.0, v), + _ => build_error!("Wrong ordering"), + } + } +} diff --git a/rust/kernel/sync/atomic/predefine.rs b/rust/kernel/sync/atomic/predefine.rs new file mode 100644 index 00000000000000..33356deee9524a --- /dev/null +++ b/rust/kernel/sync/atomic/predefine.rs @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Pre-defined atomic types + +// SAFETY: `i32` has the same size and alignment with itself, and is round-trip transmutable to +// itself. +unsafe impl super::AtomicType for i32 { + type Repr = i32; +} + +// SAFETY: `i64` has the same size and alignment with itself, and is round-trip transmutable to +// itself. +unsafe impl super::AtomicType for i64 { + type Repr = i64; +} From b606a532c01380ac282fd0a0858e3e8ea01ad904 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 4 Sep 2025 21:41:32 -0700 Subject: [PATCH 1611/3206] rust: sync: atomic: Add atomic {cmp,}xchg operations xchg() and cmpxchg() are basic operations on atomic. Provide these based on C APIs. Note that cmpxchg() use the similar function signature as compare_exchange() in Rust std: returning a `Result`, `Ok(old)` means the operation succeeds and `Err(old)` means the operation fails. 
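As an illustration only (a sketch for this changelog, not code added by the
patch; it assumes the `Atomic` API introduced earlier in this series), a
caller that retries until it wins the race can be written against the
`Result` convention like this:

    let x = Atomic::new(0i32);
    let mut old = x.load(Relaxed);

    // Retry until `old` is atomically replaced by `old + 1`.
    while let Err(cur) = x.cmpxchg(old, old + 1, Full) {
        // Another context updated the value first; retry with what it stored.
        old = cur;
    }

Losing a round of cmpxchg hands the current value back in `Err(..)`, so the
retry loop never needs an extra load.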
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Reviewed-by: Elle Rhumsaa Link: https://lore.kernel.org/all/20250719030827.61357-6-boqun.feng@gmail.com/ --- rust/kernel/sync/atomic.rs | 168 ++++++++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 1 deletion(-) diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs index ea5782b6ee9591..4c32d12dc61e68 100644 --- a/rust/kernel/sync/atomic.rs +++ b/rust/kernel/sync/atomic.rs @@ -25,7 +25,7 @@ pub use internal::AtomicImpl; pub use ordering::{Acquire, Full, Relaxed, Release}; use crate::build_error; -use internal::{AtomicBasicOps, AtomicRepr}; +use internal::{AtomicBasicOps, AtomicExchangeOps, AtomicRepr}; use ordering::OrderingType; /// A memory location which can be safely modified from multiple execution contexts. @@ -293,3 +293,169 @@ where } } } + +impl Atomic +where + T::Repr: AtomicExchangeOps, +{ + /// Atomic exchange. + /// + /// Atomically updates `*self` to `v` and returns the old value of `*self`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Acquire, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.xchg(52, Acquire)); + /// assert_eq!(52, x.load(Relaxed)); + /// ``` + #[doc(alias("atomic_xchg", "atomic64_xchg", "swap"))] + #[inline(always)] + pub fn xchg(&self, v: T, _: Ordering) -> T { + let v = into_repr(v); + + // INVARIANT: `self.0` is a valid `T` after `atomic_xchg*()` because `v` is transmutable to + // `T`. + let ret = { + match Ordering::TYPE { + OrderingType::Full => T::Repr::atomic_xchg(&self.0, v), + OrderingType::Acquire => T::Repr::atomic_xchg_acquire(&self.0, v), + OrderingType::Release => T::Repr::atomic_xchg_release(&self.0, v), + OrderingType::Relaxed => T::Repr::atomic_xchg_relaxed(&self.0, v), + } + }; + + // SAFETY: `ret` comes from reading `*self`, which is a valid `T` per type invariants. + unsafe { from_repr(ret) } + } + + /// Atomic compare and exchange. + /// + /// If `*self` == `old`, atomically updates `*self` to `new`. Otherwise, `*self` is not + /// modified. + /// + /// Compare: The comparison is done via the byte level comparison between `*self` and `old`. + /// + /// Ordering: When succeeds, provides the corresponding ordering as the `Ordering` type + /// parameter indicates, and a failed one doesn't provide any ordering, the load part of a + /// failed cmpxchg is a [`Relaxed`] load. + /// + /// Returns `Ok(value)` if cmpxchg succeeds, and `value` is guaranteed to be equal to `old`, + /// otherwise returns `Err(value)`, and `value` is the current value of `*self`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Full, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// // Checks whether cmpxchg succeeded. + /// let success = x.cmpxchg(52, 64, Relaxed).is_ok(); + /// # assert!(!success); + /// + /// // Checks whether cmpxchg failed. + /// let failure = x.cmpxchg(52, 64, Relaxed).is_err(); + /// # assert!(failure); + /// + /// // Uses the old value if failed, probably re-try cmpxchg. + /// match x.cmpxchg(52, 64, Relaxed) { + /// Ok(_) => { }, + /// Err(old) => { + /// // do something with `old`. + /// # assert_eq!(old, 42); + /// } + /// } + /// + /// // Uses the latest value regardlessly, same as atomic_cmpxchg() in C. 
+    /// let latest = x.cmpxchg(42, 64, Full).unwrap_or_else(|old| old);
+    /// # assert_eq!(42, latest);
+    /// assert_eq!(64, x.load(Relaxed));
+    /// ```
+    ///
+    /// [`Relaxed`]: ordering::Relaxed
+    #[doc(alias(
+        "atomic_cmpxchg",
+        "atomic64_cmpxchg",
+        "atomic_try_cmpxchg",
+        "atomic64_try_cmpxchg",
+        "compare_exchange"
+    ))]
+    #[inline(always)]
+    pub fn cmpxchg<Ordering: ordering::Ordering>(
+        &self,
+        mut old: T,
+        new: T,
+        o: Ordering,
+    ) -> Result<T, T> {
+        // Note on code generation:
+        //
+        // try_cmpxchg() is used to implement cmpxchg(), and if the helper functions are inlined,
+        // the compiler is able to figure out that the branch is not needed if the users don't
+        // care about whether the operation succeeds or not. One exception is on x86, due to
+        // commit 44fe84459faf ("locking/atomic: Fix atomic_try_cmpxchg() semantics"), the
+        // atomic_try_cmpxchg() on x86 has a branch even if the caller doesn't care about the
+        // success of cmpxchg and only wants to use the old value. For example, for code like:
+        //
+        //     let latest = x.cmpxchg(42, 64, Full).unwrap_or_else(|old| old);
+        //
+        // It will still generate code:
+        //
+        //     movl     $0x40, %ecx
+        //     movl     $0x34, %eax
+        //     lock
+        //     cmpxchgl %ecx, 0x4(%rsp)
+        //     jne      1f
+        //     2:
+        //     ...
+        //     1:  movl %eax, %ecx
+        //         jmp 2b
+        //
+        // This might be "fixed" by introducing a try_cmpxchg_exclusive() that knows the "*old"
+        // location in the C function is always safe to write.
+        if self.try_cmpxchg(&mut old, new, o) {
+            Ok(old)
+        } else {
+            Err(old)
+        }
+    }
+
+    /// Atomic compare and exchange and returns whether the operation succeeds.
+    ///
+    /// If `*self` == `old`, atomically updates `*self` to `new`. Otherwise, `*self` is not
+    /// modified, and `*old` is updated to the current value of `*self`.
+    ///
+    /// The "Compare" and "Ordering" parts are the same as [`Atomic::cmpxchg()`].
+    ///
+    /// Returns `true` if the cmpxchg succeeds, otherwise returns `false`.
+    #[inline(always)]
+    fn try_cmpxchg<Ordering: ordering::Ordering>(&self, old: &mut T, new: T, _: Ordering) -> bool {
+        let mut tmp = into_repr(*old);
+        let new = into_repr(new);
+
+        // INVARIANT: `self.0` is a valid `T` after `atomic_try_cmpxchg*()` because `new` is
+        // transmutable to `T`.
+        let ret = {
+            match Ordering::TYPE {
+                OrderingType::Full => T::Repr::atomic_try_cmpxchg(&self.0, &mut tmp, new),
+                OrderingType::Acquire => {
+                    T::Repr::atomic_try_cmpxchg_acquire(&self.0, &mut tmp, new)
+                }
+                OrderingType::Release => {
+                    T::Repr::atomic_try_cmpxchg_release(&self.0, &mut tmp, new)
+                }
+                OrderingType::Relaxed => {
+                    T::Repr::atomic_try_cmpxchg_relaxed(&self.0, &mut tmp, new)
+                }
+            }
+        };
+
+        // SAFETY: `tmp` comes from reading `*self`, which is a valid `T` per type invariants.
+        *old = unsafe { from_repr(tmp) };
+
+        ret
+    }
+}

From d132054360baf0d127a463bbf853e43dd6eb0dd9 Mon Sep 17 00:00:00 2001
From: Boqun Feng
Date: Thu, 4 Sep 2025 21:41:33 -0700
Subject: [PATCH 1612/3206] rust: sync: atomic: Add the framework of arithmetic operations

One important set of atomic operations is the arithmetic operations,
i.e. add(), sub(), fetch_add(), add_return(), etc. However, it may not
make sense for all the types that implement `AtomicType` to have
arithmetic operations, for example a `Foo(u32)` may not have a
reasonable add() or sub(), plus subword types (`u8` and `u16`)
currently don't have atomic arithmetic operations even on the C side
and might not have them in the future in Rust (because they are usually
suboptimal on a few architectures). Therefore the plan is to add a few
subtraits of `AtomicType` describing which types have and can do atomic
arithmetic operations.
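As a concrete illustration (a sketch for this changelog, not code from the
patch; it reuses the `State` enum from the `AtomicType` examples earlier in
this series), a unit-only enum can soundly implement `AtomicType`, but it
must not get arithmetic operations, because a wrapping add on the
representation can produce a bit pattern that is not a valid value of the
enum:

    #[derive(Clone, Copy)]
    #[repr(i32)]
    enum State {
        Uninit = 0,
        Working = 1,
        Done = 2,
    }

    // SAFETY: `State` and `i32` have the same size and alignment, and
    // `State` is round-trip transmutable to `i32`.
    unsafe impl AtomicType for State {
        type Repr = i32;
    }

    // No arithmetic subtrait is implemented: `Done as i32 + Done as i32`
    // is 4, which is not a valid `State`, so an atomic add could violate
    // the `AtomicType` safety requirements.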
One trait `AtomicAdd` is added, and only add() and fetch_add() are added. The rest will be added in the future. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Reviewed-by: Elle Rhumsaa Link: https://lore.kernel.org/all/20250719030827.61357-7-boqun.feng@gmail.com/ --- rust/kernel/sync/atomic.rs | 94 +++++++++++++++++++++++++++- rust/kernel/sync/atomic/predefine.rs | 14 +++++ 2 files changed, 106 insertions(+), 2 deletions(-) diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs index 4c32d12dc61e68..016a6bcaf0807b 100644 --- a/rust/kernel/sync/atomic.rs +++ b/rust/kernel/sync/atomic.rs @@ -16,7 +16,6 @@ //! //! [`LKMM`]: srctree/tools/memory-model/ -#[allow(dead_code, unreachable_pub)] mod internal; pub mod ordering; mod predefine; @@ -25,7 +24,7 @@ pub use internal::AtomicImpl; pub use ordering::{Acquire, Full, Relaxed, Release}; use crate::build_error; -use internal::{AtomicBasicOps, AtomicExchangeOps, AtomicRepr}; +use internal::{AtomicArithmeticOps, AtomicBasicOps, AtomicExchangeOps, AtomicRepr}; use ordering::OrderingType; /// A memory location which can be safely modified from multiple execution contexts. @@ -112,6 +111,19 @@ pub unsafe trait AtomicType: Sized + Send + Copy { type Repr: AtomicImpl; } +/// Types that support atomic add operations. +/// +/// # Safety +/// +// TODO: Properly defines `wrapping_add` in the following comment. +/// `wrapping_add` any value of type `Self::Repr::Delta` obtained by [`Self::rhs_into_delta()`] to +/// any value of type `Self::Repr` obtained through transmuting a value of type `Self` to must +/// yield a value with a bit pattern also valid for `Self`. +pub unsafe trait AtomicAdd: AtomicType { + /// Converts `Rhs` into the `Delta` type of the atomic implementation. + fn rhs_into_delta(rhs: Rhs) -> ::Delta; +} + #[inline(always)] const fn into_repr(v: T) -> T::Repr { // SAFETY: Per the safety requirement of `AtomicType`, `T` is round-trip transmutable to @@ -459,3 +471,81 @@ where ret } } + +impl Atomic +where + T::Repr: AtomicArithmeticOps, +{ + /// Atomic add. + /// + /// Atomically updates `*self` to `(*self).wrapping_add(v)`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// x.add(12, Relaxed); + /// + /// assert_eq!(54, x.load(Relaxed)); + /// ``` + #[inline(always)] + pub fn add(&self, v: Rhs, _: ordering::Relaxed) + where + T: AtomicAdd, + { + let v = T::rhs_into_delta(v); + + // INVARIANT: `self.0` is a valid `T` after `atomic_add()` due to safety requirement of + // `AtomicAdd`. + T::Repr::atomic_add(&self.0, v); + } + + /// Atomic fetch and add. + /// + /// Atomically updates `*self` to `(*self).wrapping_add(v)`, and returns the value of `*self` + /// before the update. 
+ /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Acquire, Full, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// assert_eq!(54, { x.fetch_add(12, Acquire); x.load(Relaxed) }); + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// assert_eq!(54, { x.fetch_add(12, Full); x.load(Relaxed) } ); + /// ``` + #[inline(always)] + pub fn fetch_add(&self, v: Rhs, _: Ordering) -> T + where + T: AtomicAdd, + { + let v = T::rhs_into_delta(v); + + // INVARIANT: `self.0` is a valid `T` after `atomic_fetch_add*()` due to safety requirement + // of `AtomicAdd`. + let ret = { + match Ordering::TYPE { + OrderingType::Full => T::Repr::atomic_fetch_add(&self.0, v), + OrderingType::Acquire => T::Repr::atomic_fetch_add_acquire(&self.0, v), + OrderingType::Release => T::Repr::atomic_fetch_add_release(&self.0, v), + OrderingType::Relaxed => T::Repr::atomic_fetch_add_relaxed(&self.0, v), + } + }; + + // SAFETY: `ret` comes from reading `self.0`, which is a valid `T` per type invariants. + unsafe { from_repr(ret) } + } +} diff --git a/rust/kernel/sync/atomic/predefine.rs b/rust/kernel/sync/atomic/predefine.rs index 33356deee9524a..a6e5883be7cb43 100644 --- a/rust/kernel/sync/atomic/predefine.rs +++ b/rust/kernel/sync/atomic/predefine.rs @@ -8,8 +8,22 @@ unsafe impl super::AtomicType for i32 { type Repr = i32; } +// SAFETY: The wrapping add result of two `i32`s is a valid `i32`. +unsafe impl super::AtomicAdd for i32 { + fn rhs_into_delta(rhs: i32) -> i32 { + rhs + } +} + // SAFETY: `i64` has the same size and alignment with itself, and is round-trip transmutable to // itself. unsafe impl super::AtomicType for i64 { type Repr = i64; } + +// SAFETY: The wrapping add result of two `i64`s is a valid `i64`. +unsafe impl super::AtomicAdd for i64 { + fn rhs_into_delta(rhs: i64) -> i64 { + rhs + } +} From d6df37ba918198c0a7f55734c20512431770c4b3 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 4 Sep 2025 21:41:34 -0700 Subject: [PATCH 1613/3206] rust: sync: atomic: Add Atomic Add generic atomic support for basic unsigned types that have an `AtomicImpl` with the same size and alignment. Unit tests are added including Atomic and Atomic. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alice Ryhl Reviewed-by: Andreas Hindborg Reviewed-by: Benno Lossin Reviewed-by: Elle Rhumsaa Link: https://lore.kernel.org/all/20250719030827.61357-8-boqun.feng@gmail.com/ --- rust/kernel/sync/atomic/predefine.rs | 95 ++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/rust/kernel/sync/atomic/predefine.rs b/rust/kernel/sync/atomic/predefine.rs index a6e5883be7cb43..d0875812f6adde 100644 --- a/rust/kernel/sync/atomic/predefine.rs +++ b/rust/kernel/sync/atomic/predefine.rs @@ -27,3 +27,98 @@ unsafe impl super::AtomicAdd for i64 { rhs } } + +// SAFETY: `u32` and `i32` has the same size and alignment, and `u32` is round-trip transmutable to +// `i32`. +unsafe impl super::AtomicType for u32 { + type Repr = i32; +} + +// SAFETY: The wrapping add result of two `i32`s is a valid `u32`. +unsafe impl super::AtomicAdd for u32 { + fn rhs_into_delta(rhs: u32) -> i32 { + rhs as i32 + } +} + +// SAFETY: `u64` and `i64` has the same size and alignment, and `u64` is round-trip transmutable to +// `i64`. +unsafe impl super::AtomicType for u64 { + type Repr = i64; +} + +// SAFETY: The wrapping add result of two `i64`s is a valid `u64`. 
+unsafe impl super::AtomicAdd for u64 { + fn rhs_into_delta(rhs: u64) -> i64 { + rhs as i64 + } +} + +use crate::macros::kunit_tests; + +#[kunit_tests(rust_atomics)] +mod tests { + use super::super::*; + + // Call $fn($val) with each $type of $val. + macro_rules! for_each_type { + ($val:literal in [$($type:ty),*] $fn:expr) => { + $({ + let v: $type = $val; + + $fn(v); + })* + } + } + + #[test] + fn atomic_basic_tests() { + for_each_type!(42 in [i32, i64, u32, u64] |v| { + let x = Atomic::new(v); + + assert_eq!(v, x.load(Relaxed)); + }); + } + + #[test] + fn atomic_xchg_tests() { + for_each_type!(42 in [i32, i64, u32, u64] |v| { + let x = Atomic::new(v); + + let old = v; + let new = v + 1; + + assert_eq!(old, x.xchg(new, Full)); + assert_eq!(new, x.load(Relaxed)); + }); + } + + #[test] + fn atomic_cmpxchg_tests() { + for_each_type!(42 in [i32, i64, u32, u64] |v| { + let x = Atomic::new(v); + + let old = v; + let new = v + 1; + + assert_eq!(Err(old), x.cmpxchg(new, new, Full)); + assert_eq!(old, x.load(Relaxed)); + assert_eq!(Ok(old), x.cmpxchg(old, new, Relaxed)); + assert_eq!(new, x.load(Relaxed)); + }); + } + + #[test] + fn atomic_arithmetic_tests() { + for_each_type!(42 in [i32, i64, u32, u64] |v| { + let x = Atomic::new(v); + + assert_eq!(v, x.fetch_add(12, Full)); + assert_eq!(v + 12, x.load(Relaxed)); + + x.add(13, Relaxed); + + assert_eq!(v + 25, x.load(Relaxed)); + }); + } +} From 84c6d36bcaf98a8c0533cd334a624d536d20879b Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 4 Sep 2025 21:41:35 -0700 Subject: [PATCH 1614/3206] rust: sync: atomic: Add Atomic<{usize,isize}> Add generic atomic support for `usize` and `isize`. Note that instead of mapping directly to `atomic_long_t`, the represention type (`AtomicType::Repr`) is selected based on CONFIG_64BIT. This reduces the necessity of creating `atomic_long_*` helpers, which could save the binary size of kernel if inline helpers are not available. To do so, an internal type `isize_atomic_repr` is defined, it's `i32` in 32bit kernel and `i64` in 64bit kernel. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alice Ryhl Reviewed-by: Andreas Hindborg Reviewed-by: Benno Lossin Reviewed-by: Elle Rhumsaa Link: https://lore.kernel.org/all/20250719030827.61357-9-boqun.feng@gmail.com/ --- rust/kernel/sync/atomic/predefine.rs | 53 +++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/rust/kernel/sync/atomic/predefine.rs b/rust/kernel/sync/atomic/predefine.rs index d0875812f6adde..45a17985cda45e 100644 --- a/rust/kernel/sync/atomic/predefine.rs +++ b/rust/kernel/sync/atomic/predefine.rs @@ -2,6 +2,9 @@ //! Pre-defined atomic types +use crate::static_assert; +use core::mem::{align_of, size_of}; + // SAFETY: `i32` has the same size and alignment with itself, and is round-trip transmutable to // itself. unsafe impl super::AtomicType for i32 { @@ -28,6 +31,35 @@ unsafe impl super::AtomicAdd for i64 { } } +// Defines an internal type that always maps to the integer type which has the same size alignment +// as `isize` and `usize`, and `isize` and `usize` are always bi-directional transmutable to +// `isize_atomic_repr`, which also always implements `AtomicImpl`. +#[allow(non_camel_case_types)] +#[cfg(not(CONFIG_64BIT))] +type isize_atomic_repr = i32; +#[allow(non_camel_case_types)] +#[cfg(CONFIG_64BIT)] +type isize_atomic_repr = i64; + +// Ensure size and alignment requirements are checked. 
+static_assert!(size_of::() == size_of::()); +static_assert!(align_of::() == align_of::()); +static_assert!(size_of::() == size_of::()); +static_assert!(align_of::() == align_of::()); + +// SAFETY: `isize` has the same size and alignment with `isize_atomic_repr`, and is round-trip +// transmutable to `isize_atomic_repr`. +unsafe impl super::AtomicType for isize { + type Repr = isize_atomic_repr; +} + +// SAFETY: The wrapping add result of two `isize_atomic_repr`s is a valid `usize`. +unsafe impl super::AtomicAdd for isize { + fn rhs_into_delta(rhs: isize) -> isize_atomic_repr { + rhs as isize_atomic_repr + } +} + // SAFETY: `u32` and `i32` has the same size and alignment, and `u32` is round-trip transmutable to // `i32`. unsafe impl super::AtomicType for u32 { @@ -54,6 +86,19 @@ unsafe impl super::AtomicAdd for u64 { } } +// SAFETY: `usize` has the same size and alignment with `isize_atomic_repr`, and is round-trip +// transmutable to `isize_atomic_repr`. +unsafe impl super::AtomicType for usize { + type Repr = isize_atomic_repr; +} + +// SAFETY: The wrapping add result of two `isize_atomic_repr`s is a valid `usize`. +unsafe impl super::AtomicAdd for usize { + fn rhs_into_delta(rhs: usize) -> isize_atomic_repr { + rhs as isize_atomic_repr + } +} + use crate::macros::kunit_tests; #[kunit_tests(rust_atomics)] @@ -73,7 +118,7 @@ mod tests { #[test] fn atomic_basic_tests() { - for_each_type!(42 in [i32, i64, u32, u64] |v| { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { let x = Atomic::new(v); assert_eq!(v, x.load(Relaxed)); @@ -82,7 +127,7 @@ mod tests { #[test] fn atomic_xchg_tests() { - for_each_type!(42 in [i32, i64, u32, u64] |v| { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { let x = Atomic::new(v); let old = v; @@ -95,7 +140,7 @@ mod tests { #[test] fn atomic_cmpxchg_tests() { - for_each_type!(42 in [i32, i64, u32, u64] |v| { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { let x = Atomic::new(v); let old = v; @@ -110,7 +155,7 @@ mod tests { #[test] fn atomic_arithmetic_tests() { - for_each_type!(42 in [i32, i64, u32, u64] |v| { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { let x = Atomic::new(v); assert_eq!(v, x.fetch_add(12, Full)); From d9ea5a41cef80dc8103f4114b072b27364f2e06a Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 4 Sep 2025 21:41:36 -0700 Subject: [PATCH 1615/3206] rust: sync: Add memory barriers Memory barriers are building blocks for concurrent code, hence provide a minimal set of them. The compiler barrier, barrier(), is implemented in inline asm instead of using core::sync::atomic::compiler_fence() because memory models are different: kernel's atomics are implemented in inline asm therefore the compiler barrier should be implemented in inline asm as well. Also it's currently only public to the kernel crate until there's a reasonable driver usage. 
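A usage sketch for this changelog (not part of the patch; it assumes the
`Atomic<i32>` API from earlier in this series): the classic message-passing
pattern pairs smp_wmb() on the writer side with smp_rmb() on the reader side:

    static DATA: Atomic<i32> = Atomic::new(0);
    static READY: Atomic<i32> = Atomic::new(0);

    // Writer.
    DATA.store(42, Relaxed);
    smp_wmb(); // Orders the DATA store before the READY store.
    READY.store(1, Relaxed);

    // Reader.
    if READY.load(Relaxed) == 1 {
        smp_rmb(); // Pairs with smp_wmb(): orders the READY load before the DATA load.
        assert_eq!(42, DATA.load(Relaxed));
    }

If the reader observes READY == 1, the paired barriers guarantee it also
observes DATA == 42.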
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alice Ryhl Reviewed-by: Elle Rhumsaa Link: https://lore.kernel.org/all/20250719030827.61357-10-boqun.feng@gmail.com/ --- rust/helpers/barrier.c | 18 +++++++++++ rust/helpers/helpers.c | 1 + rust/kernel/sync.rs | 1 + rust/kernel/sync/barrier.rs | 61 +++++++++++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+) create mode 100644 rust/helpers/barrier.c create mode 100644 rust/kernel/sync/barrier.rs diff --git a/rust/helpers/barrier.c b/rust/helpers/barrier.c new file mode 100644 index 00000000000000..cdf28ce8e51167 --- /dev/null +++ b/rust/helpers/barrier.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +void rust_helper_smp_mb(void) +{ + smp_mb(); +} + +void rust_helper_smp_wmb(void) +{ + smp_wmb(); +} + +void rust_helper_smp_rmb(void) +{ + smp_rmb(); +} diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 7053f9245759a9..85ad14b8192501 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -9,6 +9,7 @@ #include "atomic.c" #include "auxiliary.c" +#include "barrier.c" #include "blk.c" #include "bug.c" #include "build_assert.c" diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index 7e962e5429d2e9..bf8943c88a891a 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -12,6 +12,7 @@ use pin_init; mod arc; pub mod aref; pub mod atomic; +pub mod barrier; pub mod completion; mod condvar; pub mod lock; diff --git a/rust/kernel/sync/barrier.rs b/rust/kernel/sync/barrier.rs new file mode 100644 index 00000000000000..8f2d435fcd9444 --- /dev/null +++ b/rust/kernel/sync/barrier.rs @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Memory barriers. +//! +//! These primitives have the same semantics as their C counterparts: and the precise definitions +//! of semantics can be found at [`LKMM`]. +//! +//! [`LKMM`]: srctree/tools/memory-model/ + +/// A compiler barrier. +/// +/// A barrier that prevents compiler from reordering memory accesses across the barrier. +#[inline(always)] +pub(crate) fn barrier() { + // By default, Rust inline asms are treated as being able to access any memory or flags, hence + // it suffices as a compiler barrier. + // + // SAFETY: An empty asm block. + unsafe { core::arch::asm!("") }; +} + +/// A full memory barrier. +/// +/// A barrier that prevents compiler and CPU from reordering memory accesses across the barrier. +#[inline(always)] +pub fn smp_mb() { + if cfg!(CONFIG_SMP) { + // SAFETY: `smp_mb()` is safe to call. + unsafe { bindings::smp_mb() }; + } else { + barrier(); + } +} + +/// A write-write memory barrier. +/// +/// A barrier that prevents compiler and CPU from reordering memory write accesses across the +/// barrier. +#[inline(always)] +pub fn smp_wmb() { + if cfg!(CONFIG_SMP) { + // SAFETY: `smp_wmb()` is safe to call. + unsafe { bindings::smp_wmb() }; + } else { + barrier(); + } +} + +/// A read-read memory barrier. +/// +/// A barrier that prevents compiler and CPU from reordering memory read accesses across the +/// barrier. +#[inline(always)] +pub fn smp_rmb() { + if cfg!(CONFIG_SMP) { + // SAFETY: `smp_rmb()` is safe to call. + unsafe { bindings::smp_rmb() }; + } else { + barrier(); + } +} From bb38f35b35f9de0cebc4d62ea73482454e38cef3 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Thu, 4 Sep 2025 21:41:37 -0700 Subject: [PATCH 1616/3206] rust: implement `kernel::sync::Refcount` This is a wrapping layer of `include/linux/refcount.h`. 
Currently the kernel refcount has already been used in `Arc`, however it calls into FFI directly. [boqun: Add the missing <> for the link in comment] Signed-off-by: Gary Guo Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alice Ryhl Reviewed-by: Boqun Feng Reviewed-by: Fiona Behrens Reviewed-by: Benno Lossin Reviewed-by: Elle Rhumsaa Link: https://lore.kernel.org/r/20250723233312.3304339-2-gary@kernel.org --- rust/helpers/refcount.c | 10 ++++ rust/kernel/sync.rs | 2 + rust/kernel/sync/refcount.rs | 98 ++++++++++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100644 rust/kernel/sync/refcount.rs diff --git a/rust/helpers/refcount.c b/rust/helpers/refcount.c index d6adbd2e45a185..d175898ad7b81e 100644 --- a/rust/helpers/refcount.c +++ b/rust/helpers/refcount.c @@ -7,11 +7,21 @@ refcount_t rust_helper_REFCOUNT_INIT(int n) return (refcount_t)REFCOUNT_INIT(n); } +void rust_helper_refcount_set(refcount_t *r, int n) +{ + refcount_set(r, n); +} + void rust_helper_refcount_inc(refcount_t *r) { refcount_inc(r); } +void rust_helper_refcount_dec(refcount_t *r) +{ + refcount_dec(r); +} + bool rust_helper_refcount_dec_and_test(refcount_t *r) { return refcount_dec_and_test(r); diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index bf8943c88a891a..cf5b638a097d99 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -19,6 +19,7 @@ pub mod lock; mod locked_by; pub mod poll; pub mod rcu; +mod refcount; pub use arc::{Arc, ArcBorrow, UniqueArc}; pub use completion::Completion; @@ -27,6 +28,7 @@ pub use lock::global::{global_lock, GlobalGuard, GlobalLock, GlobalLockBackend, pub use lock::mutex::{new_mutex, Mutex, MutexGuard}; pub use lock::spinlock::{new_spinlock, SpinLock, SpinLockGuard}; pub use locked_by::LockedBy; +pub use refcount::Refcount; /// Represents a lockdep class. It's a wrapper around C's `lock_class_key`. #[repr(transparent)] diff --git a/rust/kernel/sync/refcount.rs b/rust/kernel/sync/refcount.rs new file mode 100644 index 00000000000000..cc1a80ae7ae9a4 --- /dev/null +++ b/rust/kernel/sync/refcount.rs @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Atomic reference counting. +//! +//! C header: [`include/linux/refcount.h`](srctree/include/linux/refcount.h) + +use crate::build_assert; +use crate::types::Opaque; + +/// Atomic reference counter. +/// +/// This type is conceptually an atomic integer, but provides saturation semantics compared to +/// normal atomic integers. Values in the negative range when viewed as a signed integer are +/// saturation (bad) values. For details about the saturation semantics, please refer to top of +/// [`include/linux/refcount.h`](srctree/include/linux/refcount.h). +/// +/// Wraps the kernel's C `refcount_t`. +#[repr(transparent)] +pub struct Refcount(Opaque); + +impl Refcount { + /// Construct a new [`Refcount`] from an initial value. + /// + /// The initial value should be non-saturated. + #[inline] + pub fn new(value: i32) -> Self { + build_assert!(value >= 0, "initial value saturated"); + // SAFETY: There are no safety requirements for this FFI call. + Self(Opaque::new(unsafe { bindings::REFCOUNT_INIT(value) })) + } + + #[inline] + fn as_ptr(&self) -> *mut bindings::refcount_t { + self.0.get() + } + + /// Set a refcount's value. + #[inline] + pub fn set(&self, value: i32) { + // SAFETY: `self.as_ptr()` is valid. + unsafe { bindings::refcount_set(self.as_ptr(), value) } + } + + /// Increment a refcount. + /// + /// It will saturate if overflows and `WARN`. 
It will also `WARN` if the refcount is 0, as this + /// represents a possible use-after-free condition. + /// + /// Provides no memory ordering, it is assumed that caller already has a reference on the + /// object. + #[inline] + pub fn inc(&self) { + // SAFETY: self is valid. + unsafe { bindings::refcount_inc(self.as_ptr()) } + } + + /// Decrement a refcount. + /// + /// It will `WARN` on underflow and fail to decrement when saturated. + /// + /// Provides release memory ordering, such that prior loads and stores are done + /// before. + #[inline] + pub fn dec(&self) { + // SAFETY: `self.as_ptr()` is valid. + unsafe { bindings::refcount_dec(self.as_ptr()) } + } + + /// Decrement a refcount and test if it is 0. + /// + /// It will `WARN` on underflow and fail to decrement when saturated. + /// + /// Provides release memory ordering, such that prior loads and stores are done + /// before, and provides an acquire ordering on success such that memory deallocation + /// must come after. + /// + /// Returns true if the resulting refcount is 0, false otherwise. + /// + /// # Notes + /// + /// A common pattern of using `Refcount` is to free memory when the reference count reaches + /// zero. This means that the reference to `Refcount` could become invalid after calling this + /// function. This is fine as long as the reference to `Refcount` is no longer used when this + /// function returns `false`. It is not necessary to use raw pointers in this scenario, see + /// . + #[inline] + #[must_use = "use `dec` instead if you do not need to test if it is 0"] + pub fn dec_and_test(&self) -> bool { + // SAFETY: `self.as_ptr()` is valid. + unsafe { bindings::refcount_dec_and_test(self.as_ptr()) } + } +} + +// SAFETY: `refcount_t` is thread-safe. +unsafe impl Send for Refcount {} + +// SAFETY: `refcount_t` is thread-safe. +unsafe impl Sync for Refcount {} From 7487645f0b2d1a30590bafa7a977dc6661006d4f Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Thu, 4 Sep 2025 21:41:38 -0700 Subject: [PATCH 1617/3206] rust: make `Arc::into_unique_or_drop` associated function Make `Arc::into_unique_or_drop` to become a mere associated function instead of a method (i.e. removing the `self` receiver). It's a general convention for Rust smart pointers to avoid having methods defined on them, because if the pointee type has a method of the same name, then it is shadowed. This is normally for avoiding semver breakage, which isn't an issue for kernel codebase, but it's still generally a good practice to follow this rule, so that `ptr.foo()` would always be calling a method on the pointee type. Signed-off-by: Gary Guo Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Benno Lossin Reviewed-by: Alexandre Courbot Reviewed-by: Elle Rhumsaa Link: https://lore.kernel.org/r/20250723233312.3304339-3-gary@kernel.org --- rust/kernel/sync/arc.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs index 63a66761d0c7d7..4ee155b43b2dc5 100644 --- a/rust/kernel/sync/arc.rs +++ b/rust/kernel/sync/arc.rs @@ -321,7 +321,7 @@ impl Arc { /// use kernel::sync::{Arc, UniqueArc}; /// /// let arc = Arc::new(42, GFP_KERNEL)?; - /// let unique_arc = arc.into_unique_or_drop(); + /// let unique_arc = Arc::into_unique_or_drop(arc); /// /// // The above conversion should succeed since refcount of `arc` is 1. 
/// assert!(unique_arc.is_some()); @@ -337,18 +337,18 @@ impl Arc { /// let arc = Arc::new(42, GFP_KERNEL)?; /// let another = arc.clone(); /// - /// let unique_arc = arc.into_unique_or_drop(); + /// let unique_arc = Arc::into_unique_or_drop(arc); /// /// // The above conversion should fail since refcount of `arc` is >1. /// assert!(unique_arc.is_none()); /// /// # Ok::<(), Error>(()) /// ``` - pub fn into_unique_or_drop(self) -> Option>> { + pub fn into_unique_or_drop(this: Self) -> Option>> { // We will manually manage the refcount in this method, so we disable the destructor. - let me = ManuallyDrop::new(self); + let this = ManuallyDrop::new(this); // SAFETY: We own a refcount, so the pointer is still valid. - let refcount = unsafe { me.ptr.as_ref() }.refcount.get(); + let refcount = unsafe { this.ptr.as_ref() }.refcount.get(); // If the refcount reaches a non-zero value, then we have destroyed this `Arc` and will // return without further touching the `Arc`. If the refcount reaches zero, then there are @@ -365,7 +365,7 @@ impl Arc { // must pin the `UniqueArc` because the values was previously in an `Arc`, and they pin // their values. Some(Pin::from(UniqueArc { - inner: ManuallyDrop::into_inner(me), + inner: ManuallyDrop::into_inner(this), })) } else { None From 076acb647c1f448177d8b3b0e4f33de959713d7d Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Thu, 4 Sep 2025 21:41:39 -0700 Subject: [PATCH 1618/3206] rust: convert `Arc` to use `Refcount` With `Refcount` type created, `Arc` can use `Refcount` instead of calling into FFI directly. Signed-off-by: Gary Guo Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexandre Courbot Reviewed-by: Benno Lossin Reviewed-by: Elle Rhumsaa Link: https://lore.kernel.org/r/20250723233312.3304339-4-gary@kernel.org --- rust/kernel/sync/arc.rs | 45 +++++++++++++---------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs index 4ee155b43b2dc5..9298993ea7d8b3 100644 --- a/rust/kernel/sync/arc.rs +++ b/rust/kernel/sync/arc.rs @@ -8,7 +8,7 @@ //! threads. //! //! It is different from the standard library's [`Arc`] in a few ways: -//! 1. It is backed by the kernel's `refcount_t` type. +//! 1. It is backed by the kernel's [`Refcount`] type. //! 2. It does not support weak references, which allows it to be half the size. //! 3. It saturates the reference count instead of aborting when it goes over a threshold. //! 4. It does not provide a `get_mut` method, so the ref counted object is pinned. @@ -18,11 +18,11 @@ use crate::{ alloc::{AllocError, Flags, KBox}, - bindings, ffi::c_void, init::InPlaceInit, + sync::Refcount, try_init, - types::{ForeignOwnable, Opaque}, + types::ForeignOwnable, }; use core::{ alloc::Layout, @@ -145,7 +145,7 @@ pub struct Arc { #[pin_data] #[repr(C)] struct ArcInner { - refcount: Opaque, + refcount: Refcount, data: T, } @@ -157,7 +157,7 @@ impl ArcInner { /// `ptr` must have been returned by a previous call to [`Arc::into_raw`], and the `Arc` must /// not yet have been destroyed. unsafe fn container_of(ptr: *const T) -> NonNull> { - let refcount_layout = Layout::new::(); + let refcount_layout = Layout::new::(); // SAFETY: The caller guarantees that the pointer is valid. 
let val_layout = Layout::for_value(unsafe { &*ptr }); // SAFETY: We're computing the layout of a real struct that existed when compiling this @@ -229,8 +229,7 @@ impl Arc { pub fn new(contents: T, flags: Flags) -> Result { // INVARIANT: The refcount is initialised to a non-zero value. let value = ArcInner { - // SAFETY: There are no safety requirements for this FFI call. - refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }), + refcount: Refcount::new(1), data: contents, }; @@ -348,18 +347,13 @@ impl Arc { // We will manually manage the refcount in this method, so we disable the destructor. let this = ManuallyDrop::new(this); // SAFETY: We own a refcount, so the pointer is still valid. - let refcount = unsafe { this.ptr.as_ref() }.refcount.get(); + let refcount = unsafe { &this.ptr.as_ref().refcount }; // If the refcount reaches a non-zero value, then we have destroyed this `Arc` and will // return without further touching the `Arc`. If the refcount reaches zero, then there are // no other arcs, and we can create a `UniqueArc`. - // - // SAFETY: We own a refcount, so the pointer is not dangling. - let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) }; - if is_zero { - // SAFETY: We have exclusive access to the arc, so we can perform unsynchronized - // accesses to the refcount. - unsafe { core::ptr::write(refcount, bindings::REFCOUNT_INIT(1)) }; + if refcount.dec_and_test() { + refcount.set(1); // INVARIANT: We own the only refcount to this arc, so we may create a `UniqueArc`. We // must pin the `UniqueArc` because the values was previously in an `Arc`, and they pin @@ -456,14 +450,10 @@ impl Borrow for Arc { impl Clone for Arc { fn clone(&self) -> Self { - // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is - // safe to dereference it. - let refcount = unsafe { self.ptr.as_ref() }.refcount.get(); - - // INVARIANT: C `refcount_inc` saturates the refcount, so it cannot overflow to zero. + // INVARIANT: `Refcount` saturates the refcount, so it cannot overflow to zero. // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is // safe to increment the refcount. - unsafe { bindings::refcount_inc(refcount) }; + unsafe { self.ptr.as_ref() }.refcount.inc(); // SAFETY: We just incremented the refcount. This increment is now owned by the new `Arc`. unsafe { Self::from_inner(self.ptr) } @@ -472,16 +462,10 @@ impl Clone for Arc { impl Drop for Arc { fn drop(&mut self) { - // SAFETY: By the type invariant, there is necessarily a reference to the object. We cannot - // touch `refcount` after it's decremented to a non-zero value because another thread/CPU - // may concurrently decrement it to zero and free it. It is ok to have a raw pointer to - // freed/invalid memory as long as it is never dereferenced. - let refcount = unsafe { self.ptr.as_ref() }.refcount.get(); - // INVARIANT: If the refcount reaches zero, there are no other instances of `Arc`, and // this instance is being dropped, so the broken invariant is not observable. - // SAFETY: Also by the type invariant, we are allowed to decrement the refcount. - let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) }; + // SAFETY: By the type invariant, there is necessarily a reference to the object. + let is_zero = unsafe { self.ptr.as_ref() }.refcount.dec_and_test(); if is_zero { // The count reached zero, we must free the memory. // @@ -775,8 +759,7 @@ impl UniqueArc { // INVARIANT: The refcount is initialised to a non-zero value. 
let inner = KBox::try_init::( try_init!(ArcInner { - // SAFETY: There are no safety requirements for this FFI call. - refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }), + refcount: Refcount::new(1), data <- pin_init::uninit::(), }? AllocError), flags, From a307bf1db5448eccd72a1d7857f7661c6330d5ad Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Thu, 4 Sep 2025 21:41:40 -0700 Subject: [PATCH 1619/3206] rust: block: convert `block::mq` to use `Refcount` Currently there's a custom reference counting in `block::mq`, which uses `AtomicU64` Rust atomics, and this type doesn't exist on some 32-bit architectures. We cannot just change it to use 32-bit atomics, because doing so will make it vulnerable to refcount overflow. So switch it to use the kernel refcount `kernel::sync::Refcount` instead. There is an operation needed by `block::mq`, atomically decreasing refcount from 2 to 0, which is not available through refcount.h, so I exposed `Refcount::as_atomic` which allows accessing the refcount directly. [boqun: Adopt the LKMM atomic API] Signed-off-by: Gary Guo Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Benno Lossin Reviewed-by: Elle Rhumsaa Acked-by: Andreas Hindborg Tested-by: David Gow Link: https://lore.kernel.org/r/20250723233312.3304339-5-gary@kernel.org --- rust/kernel/block/mq/operations.rs | 7 +-- rust/kernel/block/mq/request.rs | 73 +++++++++--------------------- rust/kernel/sync/refcount.rs | 15 ++++++ 3 files changed, 40 insertions(+), 55 deletions(-) diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs index c2b98f507bcbd4..c0f95a9419c4e2 100644 --- a/rust/kernel/block/mq/operations.rs +++ b/rust/kernel/block/mq/operations.rs @@ -10,9 +10,10 @@ use crate::{ block::mq::Request, error::{from_result, Result}, prelude::*, + sync::Refcount, types::ARef, }; -use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering}; +use core::marker::PhantomData; /// Implement this trait to interface blk-mq as block devices. /// @@ -78,7 +79,7 @@ impl OperationsVTable { let request = unsafe { &*(*bd).rq.cast::>() }; // One refcount for the ARef, one for being in flight - request.wrapper_ref().refcount().store(2, Ordering::Relaxed); + request.wrapper_ref().refcount().set(2); // SAFETY: // - We own a refcount that we took above. We pass that to `ARef`. @@ -187,7 +188,7 @@ impl OperationsVTable { // SAFETY: The refcount field is allocated but not initialized, so // it is valid for writes. - unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(AtomicU64::new(0)) }; + unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(Refcount::new(0)) }; Ok(0) }) diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs index fefd394f064a71..f62a376dc31399 100644 --- a/rust/kernel/block/mq/request.rs +++ b/rust/kernel/block/mq/request.rs @@ -8,13 +8,10 @@ use crate::{ bindings, block::mq::Operations, error::Result, + sync::{atomic::Relaxed, Refcount}, types::{ARef, AlwaysRefCounted, Opaque}, }; -use core::{ - marker::PhantomData, - ptr::NonNull, - sync::atomic::{AtomicU64, Ordering}, -}; +use core::{marker::PhantomData, ptr::NonNull}; /// A wrapper around a blk-mq [`struct request`]. This represents an IO request. /// @@ -37,6 +34,9 @@ use core::{ /// We need to track 3 and 4 to ensure that it is safe to end the request and hand /// back ownership to the block layer. 
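A sketch of that 2 -> 0 transition (an illustration for this changelog; the
helper name is made up, and it assumes the `Atomic` cmpxchg API from earlier
in this series):

    // Hand ownership back only if the count atomically drops from 2 to 0.
    fn try_hand_back(rc: &Refcount) -> bool {
        rc.as_atomic().cmpxchg(2, 0, Relaxed).is_ok()
    }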
/// +/// Note that the driver can still obtain a new `ARef` even if there are no `ARef`s in existence by +/// using `tag_to_rq`, hence the need to distinguish B and C. +/// /// The states are tracked through the private `refcount` field of /// `RequestDataWrapper`. This structure lives in the private data area of the C /// [`struct request`]. @@ -98,13 +98,16 @@ impl Request { /// /// [`struct request`]: srctree/include/linux/blk-mq.h fn try_set_end(this: ARef) -> Result<*mut bindings::request, ARef> { - // We can race with `TagSet::tag_to_rq` - if let Err(_old) = this.wrapper_ref().refcount().compare_exchange( - 2, - 0, - Ordering::Relaxed, - Ordering::Relaxed, - ) { + // To hand back the ownership, we need the current refcount to be 2. + // Since we can race with `TagSet::tag_to_rq`, this needs to atomically reduce + // refcount to 0. `Refcount` does not provide a way to do this, so use the underlying + // atomics directly. + if let Err(_old) = this + .wrapper_ref() + .refcount() + .as_atomic() + .cmpxchg(2, 0, Relaxed) + { return Err(this); } @@ -173,13 +176,13 @@ pub(crate) struct RequestDataWrapper { /// - 0: The request is owned by C block layer. /// - 1: The request is owned by Rust abstractions but there are no [`ARef`] references to it. /// - 2+: There are [`ARef`] references to the request. - refcount: AtomicU64, + refcount: Refcount, } impl RequestDataWrapper { /// Return a reference to the refcount of the request that is embedding /// `self`. - pub(crate) fn refcount(&self) -> &AtomicU64 { + pub(crate) fn refcount(&self) -> &Refcount { &self.refcount } @@ -189,7 +192,7 @@ impl RequestDataWrapper { /// # Safety /// /// - `this` must point to a live allocation of at least the size of `Self`. - pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut AtomicU64 { + pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut Refcount { // SAFETY: Because of the safety requirements of this function, the // field projection is safe. unsafe { &raw mut (*this).refcount } @@ -205,47 +208,13 @@ unsafe impl Send for Request {} // mutate `self` are internally synchronized` unsafe impl Sync for Request {} -/// Store the result of `op(target.load())` in target, returning new value of -/// target. -fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 { - let old = target.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| Some(op(x))); - - // SAFETY: Because the operation passed to `fetch_update` above always - // return `Some`, `old` will always be `Ok`. - let old = unsafe { old.unwrap_unchecked() }; - - op(old) -} - -/// Store the result of `op(target.load)` in `target` if `target.load() != -/// pred`, returning [`true`] if the target was updated. -fn atomic_relaxed_op_unless(target: &AtomicU64, op: impl Fn(u64) -> u64, pred: u64) -> bool { - target - .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| { - if x == pred { - None - } else { - Some(op(x)) - } - }) - .is_ok() -} - // SAFETY: All instances of `Request` are reference counted. This // implementation of `AlwaysRefCounted` ensure that increments to the ref count // keeps the object alive in memory at least until a matching reference count // decrement is executed.
unsafe impl AlwaysRefCounted for Request { fn inc_ref(&self) { - let refcount = &self.wrapper_ref().refcount(); - - #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))] - let updated = atomic_relaxed_op_unless(refcount, |x| x + 1, 0); - - #[cfg(CONFIG_DEBUG_MISC)] - if !updated { - panic!("Request refcount zero on clone") - } + self.wrapper_ref().refcount().inc(); } unsafe fn dec_ref(obj: core::ptr::NonNull) { @@ -257,10 +226,10 @@ unsafe impl AlwaysRefCounted for Request { let refcount = unsafe { &*RequestDataWrapper::refcount_ptr(wrapper_ptr) }; #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))] - let new_refcount = atomic_relaxed_op_return(refcount, |x| x - 1); + let is_zero = refcount.dec_and_test(); #[cfg(CONFIG_DEBUG_MISC)] - if new_refcount == 0 { + if is_zero { panic!("Request reached refcount zero in Rust abstractions"); } } diff --git a/rust/kernel/sync/refcount.rs b/rust/kernel/sync/refcount.rs index cc1a80ae7ae9a4..19236a5bccdeb0 100644 --- a/rust/kernel/sync/refcount.rs +++ b/rust/kernel/sync/refcount.rs @@ -5,6 +5,7 @@ //! C header: [`include/linux/refcount.h`](srctree/include/linux/refcount.h) use crate::build_assert; +use crate::sync::atomic::Atomic; use crate::types::Opaque; /// Atomic reference counter. @@ -34,6 +35,20 @@ impl Refcount { self.0.get() } + /// Get the underlying atomic counter that backs the refcount. + /// + /// NOTE: Usage of this function is discouraged as it can circumvent the protections offered by + /// `refcount.h`. If there is no way to achieve the result using APIs in `refcount.h`, then + /// this function can be used. Otherwise consider adding a binding for the required API. + #[inline] + pub fn as_atomic(&self) -> &Atomic { + let ptr = self.0.get().cast(); + // SAFETY: `refcount_t` is a transparent wrapper of `atomic_t`, which is an atomic 32-bit + // integer that is layout-wise compatible with `Atomic`. All values are valid for + // `refcount_t`, despite some of the values being considered saturated and "bad". + unsafe { &*ptr } + } + /// Set a refcount's value. #[inline] pub fn set(&self, value: i32) { From 17d9f8eaa87d40a2ff66598875a43363e37a909b Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Thu, 4 Sep 2025 21:41:41 -0700 Subject: [PATCH 1620/3206] MAINTAINERS: update atomic infrastructure entry to include Rust I would like to help review atomic related patches, especially Rust related ones, hence add myself as a reviewer.
Suggested-by: Boqun Feng Signed-off-by: Gary Guo Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Elle Rhumsaa Acked-by: Boqun Feng Link: https://lore.kernel.org/r/20250723233312.3304339-6-gary@kernel.org --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 538778b81fd4cc..2f346abc84ab06 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3992,6 +3992,7 @@ M: Will Deacon M: Peter Zijlstra M: Boqun Feng R: Mark Rutland +R: Gary Guo L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/atomic_*.txt @@ -4001,6 +4002,7 @@ F: include/linux/refcount.h F: scripts/atomic/ F: rust/kernel/sync/atomic.rs F: rust/kernel/sync/atomic/ +F: rust/kernel/sync/refcount.rs ATTO EXPRESSSAS SAS/SATA RAID SCSI DRIVER M: Bradley Grove From fe8d238e646e16cc431b7a5899f8dda690258ee9 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 10 Sep 2025 17:50:41 +0800 Subject: [PATCH 1621/3206] sched/fair: Propagate load for throttled cfs_rq Before the task-based throttle model, load propagation would stop at a throttled cfs_rq and the propagation would happen at unthrottle time via update_load_avg(). Now that there is no update_load_avg() on unthrottle for a throttled cfs_rq and all load tracking is done by task-related operations, let the propagation happen immediately. While at it, add a comment to explain why cfs_rqs that are not affected by throttle have to be added to leaf cfs_rq list in propagate_entity_cfs_rq() per my understanding of commit 0258bdfaff5b ("sched/fair: Fix unfairness caused by missing load decay"). Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou --- kernel/sched/fair.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df8dc389af8e14..f993de30e14669 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5729,6 +5729,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled; +} + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { @@ -6721,6 +6726,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return 0; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return false; +} + static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return 0; @@ -13151,10 +13161,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - return; - - if (!throttled_hierarchy(cfs_rq)) + /* + * If a task gets attached to this cfs_rq and, before being queued, + * gets migrated to another CPU due to reasons like an affinity + * change, make sure this cfs_rq stays on the leaf cfs_rq list to + * have the removed load decayed, or it can cause a fairness problem.
+ */ + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ @@ -13165,10 +13178,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); - if (cfs_rq_throttled(cfs_rq)) - break; - - if (!throttled_hierarchy(cfs_rq)) + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } } From fcd394866e3db344cbe0bb485d7e3f741ac07245 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 10 Sep 2025 17:50:42 +0800 Subject: [PATCH 1622/3206] sched/fair: update_cfs_group() for throttled cfs_rqs With the task-based throttle model, tasks in a throttled hierarchy are allowed to continue to run if they are running in kernel mode. For this reason, the PELT clock is not stopped for these cfs_rqs in a throttled hierarchy when they still have tasks running or queued. Since the PELT clock is not stopped, whether to allow update_cfs_group() to do its job for cfs_rqs which are in a throttled hierarchy but still have tasks running/queued is a question. The good side is that continuing to run update_cfs_group() keeps these cfs_rq entities' weight up to date, and that up-to-date weight can be useful to derive an accurate load for the CPU as well as to ensure fairness if multiple tasks of different cgroups are running on the same CPU. OTOH, as Benjamin Segall pointed out: when unthrottle comes around, the most likely correct distribution is the distribution we had at the time of throttle. In reality, either way may not matter that much if tasks in a throttled hierarchy don't run in kernel mode for too long. But in case that happens, letting these cfs_rq entities have an up-to-date weight seems a good thing to do. Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/fair.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f993de30e14669..58f5349d37256b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se) if (!gcfs_rq || !gcfs_rq->load.weight) return; - if (throttled_hierarchy(gcfs_rq)) - return; - shares = calc_group_shares(gcfs_rq); if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); From 253b3f587241967a97a971e23b1e2a7d74244fad Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 10 Sep 2025 17:50:43 +0800 Subject: [PATCH 1623/3206] sched/fair: Do not special case tasks in throttled hierarchy With the introduction of the task-based throttle model, a task in a throttled hierarchy is allowed to continue to run till it gets throttled on its ret2user path. For this reason, remove those throttled_hierarchy() checks in the following functions so that those tasks can get their turn as normal tasks: dequeue_entities(), check_preempt_wakeup_fair() and yield_to_task_fair(). The benefit of doing it this way is: if those tasks get the chance to run earlier and if they hold any kernel resources, they can release those resources earlier. The downside is, if they don't hold any kernel resources, all they can do is throttle themselves on their way back to user space, so the favor of letting them run seems not that useful, and for check_preempt_wakeup_fair(), that favor may be bad for curr.
K Prateek Nayak pointed out that prio_changed_fair() can send a throttled task to check_preempt_wakeup_fair(); further tests showed that the affinity change path from move_queued_task() can also send a throttled task to check_preempt_wakeup_fair(), which is why task_is_throttled() is checked in that function. Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 58f5349d37256b..3dbdfaa6974776 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7081,7 +7081,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + if (task_sleep && se) set_next_buddy(se); break; } @@ -8735,7 +8735,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + if (task_is_throttled(p)) return; if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { @@ -9009,8 +9009,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - /* throttled hierarchies are not runnable */ - if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + /* !se->on_rq also covers throttled task */ + if (!se->on_rq) return false; /* Tell the scheduler that we'd really like se to run next. */ From 0d4eaf8caf8cd633b23e949e2996b420052c2d45 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 12 Sep 2025 11:44:28 +0800 Subject: [PATCH 1624/3206] sched/fair: Do not balance task to a throttled cfs_rq When doing load balancing and the target cfs_rq is in a throttled hierarchy, whether to allow balancing there is a question. The good side of allowing balancing is: if the target CPU is idle or less loaded and the task being balanced is holding some kernel resources, then it seems a good idea to balance the task there and let the task get the CPU earlier and release kernel resources sooner. The bad part is, if the task is not holding any kernel resources, then the balance seems not that useful. While theoretically it's debatable, a performance test[0] which involves 200 cgroups, where each cgroup runs hackbench (20 senders, 20 receivers) in pipe mode, showed a performance degradation on AMD Genoa when allowing load balancing to a throttled cfs_rq. Analysis[1] showed hackbench doesn't like task migration across the LLC boundary. For this reason, add a check in can_migrate_task() to forbid balancing to a cfs_rq that is in a throttled hierarchy. This reduced task migration a lot and restored performance.
[0]: https://lore.kernel.org/lkml/20250822110701.GB289@bytedance/ [1]: https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/ Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak --- kernel/sched/fair.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3dbdfaa6974776..18a30ae35441a4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5737,6 +5737,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttle_count; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); +} + static inline bool task_is_throttled(struct task_struct *p) { return cfs_bandwidth_used() && p->throttled; @@ -6733,6 +6738,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return 0; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return 0; +} + #ifdef CONFIG_FAIR_GROUP_SCHED void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -9369,14 +9379,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) delayed dequeued unless we migrate load, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU, or - * 5) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled) + * 2) target cfs_rq is in throttled hierarchy, or + * 3) cannot be migrated to this CPU due to cpus_ptr, or + * 4) running (obviously), or + * 5) are cache-hot on their current CPU, or + * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled) */ if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; + if (lb_throttled_hierarchy(p, env->dst_cpu)) + return 0; + /* * We want to prioritize the migration of eligible tasks. * For ineligible tasks we soft-limit them and only allow From 043439ad1a23cd3f65628310d1f5a06e61f8b431 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 9 Sep 2025 01:43:53 -0700 Subject: [PATCH 1625/3206] powerpc/pseries: Define papr-hvpipe ioctl PowerPC FW introduced HVPIPE RTAS calls such as ibm,send-hvpipe-msg and ibm,receive-hvpipe-msg for user space to exchange information with different sources such as Hardware Management Consoles (HMC). HVPIPE_IOC_CREATE_HANDLE is defined to use the /dev/papr-hvpipe interface for the ibm,send-hvpipe-msg and ibm,receive-hvpipe-msg RTAS calls. Also define papr_hvpipe_hdr, which will be added to the payload that is passed between the kernel and user space.
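For illustration only, a minimal user-space sketch of creating a per-source handle with this ioctl. The 0x02000000 value is a placeholder for a real HMC source ID (the source ID format is described later in this series); everything else follows the UAPI header added by this patch:

	#include <stdint.h>
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <asm/papr-hvpipe.h>

	int main(void)
	{
		uint32_t srcID = 0x02000000;	/* placeholder HMC source ID */
		int devfd, fd;

		devfd = open("/dev/papr-hvpipe", O_RDWR);
		if (devfd < 0) {
			perror("open");
			return 1;
		}

		/* On success, returns a new fd dedicated to this source. */
		fd = ioctl(devfd, PAPR_HVPIPE_IOC_CREATE_HANDLE, &srcID);
		if (fd < 0) {
			perror("PAPR_HVPIPE_IOC_CREATE_HANDLE");
			return 1;
		}

		/* fd is then used for write()/poll()/read() against this source. */
		close(fd);
		close(devfd);
		return 0;
	}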
Signed-off-by: Haren Myneni Tested-by: Shashank MS Reviewed-by: Mahesh Salgaonkar Reviewed-by: Tyrel Datwyler Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250909084402.1488456-2-haren@linux.ibm.com --- .../userspace-api/ioctl/ioctl-number.rst | 2 ++ arch/powerpc/include/uapi/asm/papr-hvpipe.h | 33 +++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 arch/powerpc/include/uapi/asm/papr-hvpipe.h diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 406a9f4d08694e..7c527a01d1cf5a 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -374,6 +374,8 @@ Code Seq# Include File Comments 0xB2 08 arch/powerpc/include/uapi/asm/papr-physical-attestation.h powerpc/pseries Physical Attestation API +0xB2 09 arch/powerpc/include/uapi/asm/papr-hvpipe.h powerpc/pseries HVPIPE API + 0xB3 00 linux/mmc/ioctl.h 0xB4 00-0F linux/gpio.h 0xB5 00-0F uapi/linux/rpmsg.h diff --git a/arch/powerpc/include/uapi/asm/papr-hvpipe.h b/arch/powerpc/include/uapi/asm/papr-hvpipe.h new file mode 100644 index 00000000000000..459a7bb0e6c9c6 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/papr-hvpipe.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_PAPR_HVPIPE_H_ +#define _UAPI_PAPR_HVPIPE_H_ + +#include <asm/ioctl.h> +#include <asm/papr-miscdev.h> +#include <linux/types.h> + +/* + * This header is included in the payload passed between the OS + * and user space. + * flags: the OS notifies user space whether the hvpipe is + * closed or the buffer has a payload. + */ +struct papr_hvpipe_hdr { + __u8 version; + __u8 reserved[3]; + __u32 flags; + __u8 reserved2[40]; +}; + +/* + * ioctl for /dev/papr-hvpipe + */ +#define PAPR_HVPIPE_IOC_CREATE_HANDLE _IOW(PAPR_MISCDEV_IOC_ID, 9, __u32) + +/* + * hvpipe_hdr flags used for read() + */ +#define HVPIPE_MSG_AVAILABLE 0x01 /* Payload is available */ +#define HVPIPE_LOST_CONNECTION 0x02 /* Pipe connection is closed/unavailable */ + +#endif /* _UAPI_PAPR_HVPIPE_H_ */ From 26b4fcecea05b8927b17c9ace132914ffcb4d97e Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 9 Sep 2025 01:43:54 -0700 Subject: [PATCH 1626/3206] powerpc/pseries: Define HVPIPE specific macros Define HVPIPE-specific macros which are needed to support the ibm,send-hvpipe-msg and ibm,receive-hvpipe-msg RTAS calls and are used to handle HVPIPE message events.
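The event masks touched by this patch follow PAPR's MSB-0 bit numbering, where the "set bit N" comments in the diff below count from the most significant bit of a 32-bit word. A purely illustrative compile-time check of that convention (not part of the patch):

	#include <stdint.h>

	/* MSB-0 numbering: bit 0 is 0x80000000, bit N is 1 << (31 - N). */
	#define RTAS_EVENT_BIT(n)	(UINT32_C(1) << (31 - (n)))

	_Static_assert(RTAS_EVENT_BIT(1) == 0x40000000, "EPOW warning, set bit 1");
	_Static_assert(RTAS_EVENT_BIT(5) == 0x04000000, "HVPIPE msg events, set bit 5");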
Signed-off-by: Haren Myneni Tested-by: Shashank MS Reviewed-by: Mahesh Salgaonkar Reviewed-by: Tyrel Datwyler Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250909084402.1488456-3-haren@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 9 +++++++++ arch/powerpc/kernel/rtas.c | 24 ++++++++++++++++++++++++ arch/powerpc/kernel/rtasd.c | 2 ++ 3 files changed, 35 insertions(+) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 75fa0293c508e6..d046bbd5017d3c 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -68,9 +68,11 @@ enum rtas_function_index { RTAS_FNIDX__IBM_READ_PCI_CONFIG, RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE, RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2, + RTAS_FNIDX__IBM_RECEIVE_HVPIPE_MSG, RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW, RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW, RTAS_FNIDX__IBM_SCAN_LOG_DUMP, + RTAS_FNIDX__IBM_SEND_HVPIPE_MSG, RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR, RTAS_FNIDX__IBM_SET_EEH_OPTION, RTAS_FNIDX__IBM_SET_SLOT_RESET, @@ -163,9 +165,11 @@ typedef struct { #define RTAS_FN_IBM_READ_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__IBM_READ_PCI_CONFIG) #define RTAS_FN_IBM_READ_SLOT_RESET_STATE rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE) #define RTAS_FN_IBM_READ_SLOT_RESET_STATE2 rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2) +#define RTAS_FN_IBM_RECEIVE_HVPIPE_MSG rtas_fn_handle(RTAS_FNIDX__IBM_RECEIVE_HVPIPE_MSG) #define RTAS_FN_IBM_REMOVE_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW) #define RTAS_FN_IBM_RESET_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW) #define RTAS_FN_IBM_SCAN_LOG_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_SCAN_LOG_DUMP) +#define RTAS_FN_IBM_SEND_HVPIPE_MSG rtas_fn_handle(RTAS_FNIDX__IBM_SEND_HVPIPE_MSG) #define RTAS_FN_IBM_SET_DYNAMIC_INDICATOR rtas_fn_handle(RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR) #define RTAS_FN_IBM_SET_EEH_OPTION rtas_fn_handle(RTAS_FNIDX__IBM_SET_EEH_OPTION) #define RTAS_FN_IBM_SET_SLOT_RESET rtas_fn_handle(RTAS_FNIDX__IBM_SET_SLOT_RESET) @@ -217,6 +221,7 @@ typedef struct { #define RTAS_HARDWARE_ERROR -1 /* Hardware or other unspecified error. */ #define RTAS_BUSY -2 /* Retry immediately. */ #define RTAS_INVALID_PARAMETER -3 /* Invalid indicator/domain/sensor etc. */ +#define RTAS_FUNC_NOT_SUPPORTED -5 /* Function not supported */ #define RTAS_UNEXPECTED_STATE_CHANGE -7 /* Seems limited to EEH and slot reset. */ #define RTAS_EXTENDED_DELAY_MIN 9900 /* Retry after delaying for ~1ms. */ #define RTAS_EXTENDED_DELAY_MAX 9905 /* Retry after delaying for ~100s. 
*/ @@ -233,6 +238,7 @@ typedef struct { #define RTAS_EPOW_WARNING 0x40000000 /* set bit 1 */ #define RTAS_HOTPLUG_EVENTS 0x10000000 /* set bit 3 */ #define RTAS_IO_EVENTS 0x08000000 /* set bit 4 */ +#define RTAS_HVPIPE_MSG_EVENTS 0x04000000 /* set bit 5 */ #define RTAS_EVENT_SCAN_ALL_EVENTS 0xffffffff /* RTAS event severity */ @@ -282,6 +288,7 @@ typedef struct { #define RTAS_TYPE_DEALLOC 0xE3 #define RTAS_TYPE_DUMP 0xE4 #define RTAS_TYPE_HOTPLUG 0xE5 +#define RTAS_TYPE_HVPIPE 0xE6 /* I don't add PowerMGM events right now, this is a different topic */ #define RTAS_TYPE_PMGM_POWER_SW_ON 0x60 #define RTAS_TYPE_PMGM_POWER_SW_OFF 0x61 @@ -374,6 +381,7 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log) #define PSERIES_ELOG_SECT_ID_HMC_ID (('H' << 8) | 'M') #define PSERIES_ELOG_SECT_ID_EPOW (('E' << 8) | 'P') #define PSERIES_ELOG_SECT_ID_IO_EVENT (('I' << 8) | 'E') +#define PSERIES_ELOG_SECT_ID_HVPIPE_EVENT (('P' << 8) | 'E') #define PSERIES_ELOG_SECT_ID_MANUFACT_INFO (('M' << 8) | 'I') #define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H') #define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D') @@ -519,6 +527,7 @@ extern struct mutex rtas_ibm_get_indices_lock; extern struct mutex rtas_ibm_set_dynamic_indicator_lock; extern struct mutex rtas_ibm_get_dynamic_sensor_state_lock; extern struct mutex rtas_ibm_physical_attestation_lock; +extern struct mutex rtas_ibm_send_hvpipe_msg_lock; #define GLOBAL_INTERRUPT_QUEUE 9005 diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index e61245c4468e53..8d81c1e7a8db13 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -98,6 +98,8 @@ DEFINE_MUTEX(rtas_ibm_get_vpd_lock); DEFINE_MUTEX(rtas_ibm_get_indices_lock); DEFINE_MUTEX(rtas_ibm_set_dynamic_indicator_lock); DEFINE_MUTEX(rtas_ibm_get_dynamic_sensor_state_lock); +DEFINE_MUTEX(rtas_ibm_receive_hvpipe_msg_lock); +DEFINE_MUTEX(rtas_ibm_send_hvpipe_msg_lock); static struct rtas_function rtas_function_table[] __ro_after_init = { [RTAS_FNIDX__CHECK_EXCEPTION] = { @@ -373,6 +375,17 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { [RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2] = { .name = "ibm,read-slot-reset-state2", }, + [RTAS_FNIDX__IBM_RECEIVE_HVPIPE_MSG] = { + .name = "ibm,receive-hvpipe-msg", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = 1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * PAPR+ v2.13 R1–7.3.32.1 + */ + .lock = &rtas_ibm_receive_hvpipe_msg_lock, + }, [RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW] = { .name = "ibm,remove-pe-dma-window", }, @@ -391,6 +404,17 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { .buf_idx2 = -1, .size_idx2 = -1, }, }, + [RTAS_FNIDX__IBM_SEND_HVPIPE_MSG] = { + .name = "ibm,send-hvpipe-msg", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * PAPR+ v2.13 R1–7.3.32.2 + */ + .lock = &rtas_ibm_send_hvpipe_msg_lock, + }, [RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR] = { .name = "ibm,set-dynamic-indicator", .filter = &(const struct rtas_filter) { diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 9bba469239fcd0..6336ec9aedd0af 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -89,6 +89,8 @@ static char *rtas_event_type(int type) return "Platform Resource Reassignment Event"; case RTAS_TYPE_HOTPLUG: return "Hotplug Event"; + case RTAS_TYPE_HVPIPE: + return "Hypervisor Pipe Notification event"; } return rtas_type[0]; From
814ef095f12c9fa142043ee689500f3a41bb6dab Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 9 Sep 2025 01:43:55 -0700 Subject: [PATCH 1627/3206] powerpc/pseries: Add papr-hvpipe char driver for HVPIPE interfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hypervisor provides ibm,send-hvpipe-msg and ibm,receive-hvpipe-msg RTAS calls which can be used by the partition to communicate through an inband hypervisor channel with different external sources such as Hardware Management Console (HMC). The information exchanged, whether it be messages, raw or formatted data, etc., is only known to the applications in the OS and the source (HMC). This patch adds the papr-hvpipe character driver and provides the standard interfaces such as open / ioctl / read / write to user space for exchanging information with the HMC using the send/receive HVPIPE RTAS functions. PAPR (7.3.32 Hypervisor Pipe Information Exchange) defines the HVPIPE usage: - The hypervisor has one HVPIPE per partition for all sources. - The OS can determine this feature’s availability by detecting the “ibm,hypervisor-pipe-capable” property in the /rtas node of the device tree. - Each source is represented by a source ID which is used in the send / recv HVPIPE RTAS calls. (Ex: the source ID is the target for the payload in the send RTAS call). - The return status of ibm,send-hvpipe-msg can be considered as confirmation that the payload was delivered. - The return status of ibm,receive-hvpipe-msg can be considered as an ACK to the source. - The hypervisor generates an hvpipe message event interrupt when the partition has a payload to receive. Provide the interfaces to user space with the /dev/papr-hvpipe character device using the following programming model: int devfd = open("/dev/papr-hvpipe") int fd = ioctl(devfd, PAPR_HVPIPE_IOC_CREATE_HANDLE, &srcID); - User space is restricted so that only one process at a time can access a given source ID. char *buf = malloc(size); - size should be 4K and the buffer contains the header and the payload. length = write(fd, buf, size); - The OS issues the ibm,send-hvpipe-msg RTAS call and returns the RTAS status to user space. ret = poll(fd,...) - The HVPIPE event message IRQ wakes up any waiting FDs. length = read(fd, buf, size); - The OS issues ibm,receive-hvpipe-msg to receive the payload from the hypervisor. release(fd); - The OS issues ibm,receive-hvpipe-msg if any payload is pending so that the pipe is not blocked. The actual implementation of these calls is added in the next patches.
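To make the programming model above concrete, a hedged user-space sketch of one send/receive round trip (illustrative only; hvpipe_transact() is not part of the driver, and error and size handling are trimmed):

	#include <poll.h>
	#include <string.h>
	#include <unistd.h>
	#include <asm/papr-hvpipe.h>

	#define HVPIPE_BUF_SIZE 4096	/* header + payload in one 4K buffer */

	/* Send one message (msglen <= the max payload) and wait for the reply
	 * on an fd created with PAPR_HVPIPE_IOC_CREATE_HANDLE. */
	static int hvpipe_transact(int fd, const void *msg, size_t msglen)
	{
		char buf[HVPIPE_BUF_SIZE];
		struct papr_hvpipe_hdr *hdr = (struct papr_hvpipe_hdr *)buf;
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		memset(buf, 0, sizeof(buf));
		memcpy(buf + sizeof(*hdr), msg, msglen);

		/* write() issues ibm,send-hvpipe-msg under the covers. */
		if (write(fd, buf, sizeof(*hdr) + msglen) < 0)
			return -1;

		/* The HVPIPE event message IRQ wakes up waiting FDs. */
		if (poll(&pfd, 1, -1) <= 0)
			return -1;

		/* read() issues ibm,receive-hvpipe-msg and fills the header. */
		if (read(fd, buf, sizeof(buf)) < 0)
			return -1;

		if (hdr->flags & HVPIPE_LOST_CONNECTION)
			return -1;	/* pipe closed: caller should close(fd) */

		return 0;	/* payload is at buf + sizeof(*hdr) */
	}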
Signed-off-by: Haren Myneni Tested-by: Shashank MS Reviewed-by: Mahesh Salgaonkar Reviewed-by: Tyrel Datwyler Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250909084402.1488456-4-haren@linux.ibm.com --- arch/powerpc/platforms/pseries/Makefile | 1 + arch/powerpc/platforms/pseries/papr-hvpipe.c | 285 +++++++++++++++++++ arch/powerpc/platforms/pseries/papr-hvpipe.h | 14 + 3 files changed, 300 insertions(+) create mode 100644 arch/powerpc/platforms/pseries/papr-hvpipe.c create mode 100644 arch/powerpc/platforms/pseries/papr-hvpipe.h diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 57222678bb3f9f..931ebaa474c81e 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -5,6 +5,7 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \ of_helpers.o rtas-work-area.o papr-sysparm.o \ papr-rtas-common.o papr-vpd.o papr-indices.o \ papr-platform-dump.o papr-phy-attest.o \ + papr-hvpipe.o \ setup.o iommu.o event_sources.o ras.o \ firmware.o power.o dlpar.o mobility.o rng.o \ pci.o pci_dlpar.o eeh_pseries.o msi.o \ diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c new file mode 100644 index 00000000000000..aca905189cf264 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "papr-hvpipe: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pseries.h" +#include "papr-hvpipe.h" + +static DEFINE_SPINLOCK(hvpipe_src_list_lock); +static LIST_HEAD(hvpipe_src_list); + +/* + * New PowerPC FW provides support for partitions and various + * sources (Ex: remote hardware management console (HMC)) to + * exchange information through an inband hypervisor channel + * called HVPIPE. Only HMCs are supported right now; + * partitions can communicate with multiple HMCs and each + * source is represented by a source ID. + * + * FW introduces send HVPIPE and recv HVPIPE RTAS calls for + * partitions to send and receive payloads respectively. + * + * These RTAS functions have the following requirements + * / limitations: + * - One hvpipe per partition for all sources. + * - Assume the return status of send HVPIPE as delivered to source + * - Assume the return status of recv HVPIPE as ACK to source + * - The hypervisor generates an HVPIPE event message when the + * payload is ready for the partition. The hypervisor will not + * deliver another event until the partition reads the previous + * payload, which means the pipe is blocked for all sources. + * + * Linux implementation: + * Follow similar interfaces to those the OS has for other RTAS calls. + * ex: /dev/papr-indices, /dev/papr-vpd, etc. + * - /dev/papr-hvpipe is available for the user space. + * - devfd = open("/dev/papr-hvpipe", ..) + * - fd = ioctl(devfd, PAPR_HVPIPE_IOC_CREATE_HANDLE, &srcID) - for each source + * - write(fd, buf, size) --> Issue the send HVPIPE RTAS call and + * return size for success or the corresponding error for the RTAS + * return code for failure. + * - poll(fd,..) -> wake up the FD if a payload is available to read. + * The HVPIPE event message handler wakes up FDs based on the + * source ID in the event message. + * - read(fd, buf, size) --> Issue the recv HVPIPE RTAS call and + * return size for success or the corresponding error for the RTAS + * return code for failure.
+ */ + +static struct hvpipe_source_info *hvpipe_find_source(u32 srcID) +{ + struct hvpipe_source_info *src_info; + + list_for_each_entry(src_info, &hvpipe_src_list, list) + if (src_info->srcID == srcID) + return src_info; + + return NULL; +} + +/* + * papr_hvpipe_handle_write - Issue the send HVPIPE RTAS call and + * return the RTAS status to user space + */ +static ssize_t papr_hvpipe_handle_write(struct file *file, + const char __user *buf, size_t size, loff_t *off) +{ + struct hvpipe_source_info *src_info = file->private_data; + + if (!src_info) + return -EIO; + + return 0; +} + +/* + * papr_hvpipe_handle_read - If the payload for the specific + * source is pending in the hypervisor, issue the recv HVPIPE RTAS + * call and return the payload to user space. + * + * When a payload is available for the partition, the + * hypervisor sends an HVPIPE event with the source ID + * and the event handler wakes up the FD(s) that are waiting. + */ +static ssize_t papr_hvpipe_handle_read(struct file *file, + char __user *buf, size_t size, loff_t *off) +{ + + struct hvpipe_source_info *src_info = file->private_data; + + if (!src_info) + return -EIO; + + return 0; +} + +/* + * User space waits for a payload to arrive. + * The hypervisor sends an HVPIPE event message to the partition + * when the payload is available. The event handler wakes up the + * FD based on the source ID in the message event. + */ +static __poll_t papr_hvpipe_handle_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct hvpipe_source_info *src_info = filp->private_data; + + if (!src_info) + return POLLNVAL; + + return 0; +} + +static int papr_hvpipe_handle_release(struct inode *inode, + struct file *file) +{ + struct hvpipe_source_info *src_info; + + /* + * Hold the lock, remove the source from src_list, reset the + * hvpipe status and release the lock to prevent any race + * with the message event IRQ. + */ + spin_lock(&hvpipe_src_list_lock); + src_info = file->private_data; + list_del(&src_info->list); + file->private_data = NULL; + spin_unlock(&hvpipe_src_list_lock); + kfree(src_info); + return 0; +} + +static const struct file_operations papr_hvpipe_handle_ops = { + .read = papr_hvpipe_handle_read, + .write = papr_hvpipe_handle_write, + .release = papr_hvpipe_handle_release, + .poll = papr_hvpipe_handle_poll, +}; + +static int papr_hvpipe_dev_create_handle(u32 srcID) +{ + struct hvpipe_source_info *src_info; + struct file *file; + long err; + int fd; + + spin_lock(&hvpipe_src_list_lock); + /* + * Do not allow more than one process to communicate with + * each source. + */ + src_info = hvpipe_find_source(srcID); + if (src_info) { + spin_unlock(&hvpipe_src_list_lock); + pr_err("pid(%d) is already using the source(%d)\n", + src_info->tsk->pid, srcID); + return -EALREADY; + } + spin_unlock(&hvpipe_src_list_lock); + + src_info = kzalloc(sizeof(*src_info), GFP_KERNEL_ACCOUNT); + if (!src_info) + return -ENOMEM; + + src_info->srcID = srcID; + src_info->tsk = current; + init_waitqueue_head(&src_info->recv_wqh); + + fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = fd; + goto free_buf; + } + + file = anon_inode_getfile("[papr-hvpipe]", + &papr_hvpipe_handle_ops, (void *)src_info, + O_RDWR); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + spin_lock(&hvpipe_src_list_lock); + /* + * If two processes are executing ioctl() for the same + * source ID concurrently, prevent the second process from + * acquiring an FD.
+ */ + if (hvpipe_find_source(srcID)) { + spin_unlock(&hvpipe_src_list_lock); + err = -EALREADY; + goto free_file; + } + list_add(&src_info->list, &hvpipe_src_list); + spin_unlock(&hvpipe_src_list_lock); + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); +free_buf: + kfree(src_info); + return err; +} + +/* + * Top-level ioctl handler for /dev/papr-hvpipe + * + * Use a separate FD for each source (ex: HMC). So ioctl() is called + * with a source ID and returns an FD. + */ +static long papr_hvpipe_dev_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + u32 __user *argp = (void __user *)arg; + u32 srcID; + long ret; + + if (get_user(srcID, argp)) + return -EFAULT; + + /* + * Support only HMC source right now + */ + if (!(srcID & HVPIPE_HMC_ID_MASK)) + return -EINVAL; + + switch (ioctl) { + case PAPR_HVPIPE_IOC_CREATE_HANDLE: + ret = papr_hvpipe_dev_create_handle(srcID); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + return ret; +} + +static const struct file_operations papr_hvpipe_ops = { + .unlocked_ioctl = papr_hvpipe_dev_ioctl, +}; + +static struct miscdevice papr_hvpipe_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "papr-hvpipe", + .fops = &papr_hvpipe_ops, +}; + +static int __init papr_hvpipe_init(void) +{ + int ret; + + if (!of_find_property(rtas.dev, "ibm,hypervisor-pipe-capable", + NULL)) + return -ENODEV; + + if (!rtas_function_implemented(RTAS_FN_IBM_SEND_HVPIPE_MSG) || + !rtas_function_implemented(RTAS_FN_IBM_RECEIVE_HVPIPE_MSG)) + return -ENODEV; + + ret = misc_register(&papr_hvpipe_dev); + if (ret) { + pr_err("misc-dev registration failed %d\n", ret); + return ret; + } + + return 0; +} +machine_device_initcall(pseries, papr_hvpipe_init); diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.h b/arch/powerpc/platforms/pseries/papr-hvpipe.h new file mode 100644 index 00000000000000..3b6f9e73791388 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _PAPR_HVPIPE_H +#define _PAPR_HVPIPE_H + +#define HVPIPE_HMC_ID_MASK 0x02000000 /*02-HMC,00-reserved and HMC ID */ + +struct hvpipe_source_info { + struct list_head list; /* list of sources */ + u32 srcID; + wait_queue_head_t recv_wqh; /* wake up poll() waitq */ + struct task_struct *tsk; +}; + +#endif /* _PAPR_HVPIPE_H */ From 56dbc6678bbb9c011bea91c4a0774a9464ab99a7 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 9 Sep 2025 01:43:56 -0700 Subject: [PATCH 1628/3206] powerpc/pseries: Send payload with ibm,send-hvpipe-msg RTAS The ibm,send-hvpipe-msg RTAS call is used to send data to the source (Ex: Hardware Management Console) over the hypervisor pipe. A maximum data length of 4048 bytes is supported with this RTAS call right now. User space uses write() to send this payload, which invokes this RTAS call. The write then returns the buffer length (including the papr_hvpipe_hdr length) to user space on success, or an error based on the RTAS return code on failure. The ibm,send-hvpipe-msg call takes the source ID as the target and the buffer in the form of a buffer list. The buffer list itself is held in a 4K work area, and the number of 4K buffer work areas depends on the number of buffers. The format is as follows:

  Length of buffer list in bytes
  Address of 4K buffer 1
  Length of 4K buffer 1 used
  ...
  Address of 4K buffer n
  Length of 4K buffer n used

Only one buffer is used right now because the max payload size is 4048 bytes. writev() can be used in the future when more than one buffer is supported.
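As a small illustrative sketch (not the kernel's code; the names are made up), this is how a one-entry buffer list would be laid out in the 4K work area, matching the format above with big-endian fields:

	#include <endian.h>
	#include <stdint.h>

	/* Encode a one-entry buffer list into the buffer-list work area.
	 * 'list' points at the 4K work area; 'buf_phys'/'len' describe the
	 * single 4K payload buffer. */
	static void hvpipe_encode_buflist(uint64_t *list, uint64_t buf_phys,
					  uint64_t len)
	{
		/* Header: total length in bytes of the header word plus the
		 * single (address, length) pair, i.e. 3 * 8 = 24. */
		list[0] = htobe64(3 * sizeof(uint64_t));
		list[1] = htobe64(buf_phys);	/* address of 4K buffer 1 */
		list[2] = htobe64(len);		/* length of 4K buffer 1 used */
	}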
Signed-off-by: Haren Myneni Tested-by: Shashank MS Reviewed-by: Mahesh Salgaonkar Reviewed-by: Tyrel Datwyler Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250909084402.1488456-5-haren@linux.ibm.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 118 ++++++++++++++++++- arch/powerpc/platforms/pseries/papr-hvpipe.h | 7 ++ 2 files changed, 124 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index aca905189cf264..9f2bac71bab11b 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "pseries.h" #include "papr-hvpipe.h" @@ -59,6 +60,51 @@ static LIST_HEAD(hvpipe_src_list); * return code for failure. */ +/* + * ibm,send-hvpipe-msg RTAS call + * @area: Caller-provided work area buffer to send. + * @srcID: Target source for the send pipe message. + */ +static int rtas_ibm_send_hvpipe_msg(struct rtas_work_area *area, u32 srcID) +{ + const s32 token = rtas_function_token(RTAS_FN_IBM_SEND_HVPIPE_MSG); + s32 fwrc; + int ret; + + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + do { + fwrc = rtas_call(token, 2, 1, NULL, srcID, + rtas_work_area_phys(area)); + + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case RTAS_SUCCESS: + ret = 0; + break; + case RTAS_HARDWARE_ERROR: + ret = -EIO; + break; + case RTAS_INVALID_PARAMETER: + ret = -EINVAL; + break; + case RTAS_HVPIPE_CLOSED: + ret = -EPIPE; + break; + case RTAS_FUNC_NOT_SUPPORTED: + ret = -EOPNOTSUPP; + break; + default: + ret = -EIO; + pr_err_ratelimited("unexpected ibm,send-hvpipe-msg status %d\n", fwrc); + break; + } + + return ret; +} + static struct hvpipe_source_info *hvpipe_find_source(u32 srcID) { struct hvpipe_source_info *src_info; @@ -78,11 +124,81 @@ static ssize_t papr_hvpipe_handle_write(struct file *file, const char __user *buf, size_t size, loff_t *off) { struct hvpipe_source_info *src_info = file->private_data; + struct rtas_work_area *work_area, *work_buf; + unsigned long ret, len; + __be64 *area_be; if (!src_info) return -EIO; - return 0; + /* + * The send HVPIPE RTAS call is used to send a payload to the + * specific source, with the source ID and the payload as a + * buffer list as input parameters. Each entry in the buffer + * list contains the address/length pair of a buffer. + * + * The buffer list format is as follows: + * + * Header (length of address/length pairs and the header length) + * Address of 4K buffer 1 + * Length of 4K buffer 1 used + * ... + * Address of 4K buffer n + * Length of 4K buffer n used + * + * See PAPR 7.3.32.2 ibm,send-hvpipe-msg + * + * Even though the format can support a max 1MB payload, the + * hypervisor supports only a 4048-byte payload at present, + * and also just one address/length entry. + * + * A writev() interface can be added in the future when the + * hypervisor supports multiple buffer list entries.
+ */ + /* HVPIPE_MAX_WRITE_BUFFER_SIZE = 4048 bytes */ + if ((size > (HVPIPE_HDR_LEN + HVPIPE_MAX_WRITE_BUFFER_SIZE)) || + (size <= HVPIPE_HDR_LEN)) + return -EINVAL; + + /* + * The length of the (address, length) pair plus the length of the header + */ + len = (2 * sizeof(u64)) + sizeof(u64); + size -= HVPIPE_HDR_LEN; + buf += HVPIPE_HDR_LEN; + mutex_lock(&rtas_ibm_send_hvpipe_msg_lock); + work_area = rtas_work_area_alloc(SZ_4K); + if (!work_area) { + ret = -ENOMEM; + goto out; + } + area_be = (__be64 *)rtas_work_area_raw_buf(work_area); + /* header */ + area_be[0] = cpu_to_be64(len); + + work_buf = rtas_work_area_alloc(SZ_4K); + if (!work_buf) { + ret = -ENOMEM; + goto out_work; + } + /* First buffer address */ + area_be[1] = cpu_to_be64(rtas_work_area_phys(work_buf)); + /* First buffer address length */ + area_be[2] = cpu_to_be64(size); + + if (!copy_from_user(rtas_work_area_raw_buf(work_buf), buf, size)) { + ret = rtas_ibm_send_hvpipe_msg(work_area, src_info->srcID); + if (!ret) + ret = size + HVPIPE_HDR_LEN; + } else + ret = -EFAULT; + + rtas_work_area_free(work_buf); +out_work: + rtas_work_area_free(work_area); +out: + mutex_unlock(&rtas_ibm_send_hvpipe_msg_lock); + return ret; } /* diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.h b/arch/powerpc/platforms/pseries/papr-hvpipe.h index 3b6f9e73791388..6f98da4ec45fef 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.h +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.h @@ -3,6 +3,13 @@ #define _PAPR_HVPIPE_H #define HVPIPE_HMC_ID_MASK 0x02000000 /*02-HMC,00-reserved and HMC ID */ +#define HVPIPE_MAX_WRITE_BUFFER_SIZE 4048 +/* + * hvpipe specific RTAS return values + */ +#define RTAS_HVPIPE_CLOSED -4 + +#define HVPIPE_HDR_LEN sizeof(struct papr_hvpipe_hdr) struct hvpipe_source_info { struct list_head list; /* list of sources */ From cebdb522fd3edd1fe05f7b4a74a27da7dd0f8d86 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 9 Sep 2025 01:43:57 -0700 Subject: [PATCH 1629/3206] powerpc/pseries: Receive payload with ibm,receive-hvpipe-msg RTAS The ibm,receive-hvpipe-msg RTAS call is used to receive data from the source (Ex: Hardware Management Console) over the hypervisor pipe. The hypervisor will signal the OS via a Hypervisor Pipe Event external interrupt when data is available to be received from the pipe; the event message has the source ID and the message type, such as payload available or pipe closed, for the specific source. The hypervisor will not generate another interrupt for the next payload until the partition reads the previous payload. This means the hvpipe is blocked and will not deliver other events for any source. A maximum data length of 4048 bytes is supported with this RTAS call right now. User space uses read() to receive data from the HMC; the kernel issues the ibm,receive-hvpipe-msg RTAS call and returns the buffer length (including the papr_hvpipe_hdr length) to user space on success, or an error based on the RTAS return code on failure. If the message indicates that the pipe was closed, the kernel just returns the papr_hvpipe_hdr with flags = HVPIPE_LOST_CONNECTION and expects user space to close the FD for the corresponding source. The ibm,receive-hvpipe-msg RTAS call passes the buffer and returns the source ID from which the payload was received, and the payload length.
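A hedged user-space sketch of consuming these read() semantics (illustrative only; hvpipe_recv() is not part of the driver):

	#include <unistd.h>
	#include <asm/papr-hvpipe.h>

	/* The header always comes first; its flags say whether a payload
	 * follows or the pipe was closed. Returns the payload length,
	 * 0 on pipe close, -1 on error. */
	static ssize_t hvpipe_recv(int fd, char *buf, size_t size)
	{
		struct papr_hvpipe_hdr *hdr = (struct papr_hvpipe_hdr *)buf;
		ssize_t len = read(fd, buf, size);

		if (len < (ssize_t)sizeof(*hdr))
			return -1;

		if (hdr->flags & HVPIPE_LOST_CONNECTION)
			return 0;	/* hypervisor closed the pipe: close(fd) */

		/* HVPIPE_MSG_AVAILABLE: payload follows the header. */
		return len - (ssize_t)sizeof(*hdr);
	}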
Signed-off-by: Haren Myneni Tested-by: Shashank MS Reviewed-by: Mahesh Salgaonkar Reviewed-by: Tyrel Datwyler Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250909084402.1488456-6-haren@linux.ibm.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 164 ++++++++++++++++++- arch/powerpc/platforms/pseries/papr-hvpipe.h | 1 + 2 files changed, 163 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 9f2bac71bab11b..87deb429b331ff 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -60,6 +60,54 @@ static LIST_HEAD(hvpipe_src_list); * return code for failure. */ +/* + * ibm,receive-hvpipe-msg RTAS call. + * @area: Caller-provided work area buffer for results. + * @srcID: Source ID returned by the RTAS call. + * @bytesw: Bytes written by RTAS call to @area. + */ +static int rtas_ibm_receive_hvpipe_msg(struct rtas_work_area *area, + u32 *srcID, u32 *bytesw) +{ + const s32 token = rtas_function_token(RTAS_FN_IBM_RECEIVE_HVPIPE_MSG); + u32 rets[2]; + s32 fwrc; + int ret; + + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + do { + fwrc = rtas_call(token, 2, 3, rets, + rtas_work_area_phys(area), + rtas_work_area_size(area)); + + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case RTAS_SUCCESS: + *srcID = rets[0]; + *bytesw = rets[1]; + ret = 0; + break; + case RTAS_HARDWARE_ERROR: + ret = -EIO; + break; + case RTAS_INVALID_PARAMETER: + ret = -EINVAL; + break; + case RTAS_FUNC_NOT_SUPPORTED: + ret = -EOPNOTSUPP; + break; + default: + ret = -EIO; + pr_err_ratelimited("unexpected ibm,receive-hvpipe-msg status %d\n", fwrc); + break; + } + + return ret; +} + /* * ibm,send-hvpipe-msg RTAS call * @area: Caller-provided work area buffer to send. @@ -116,9 +164,60 @@ static struct hvpipe_source_info *hvpipe_find_source(u32 srcID) return NULL; } +/* + * hvpipe_rtas_recv_msg - Collect the receive buffer with the recv + * HVPIPE RTAS call. Called from read(), and with a NULL buf when a + * pending payload just has to be drained. + * @buf: User-specified buffer into which the payload returned by the + * recv HVPIPE RTAS call is copied. + * @size: Size of the buffer the user passed. + */ +static int hvpipe_rtas_recv_msg(char __user *buf, int size) +{ + struct rtas_work_area *work_area; + u32 srcID, bytes_written; + int ret; + + work_area = rtas_work_area_alloc(SZ_4K); + if (!work_area) { + pr_err("Could not allocate RTAS buffer for recv pipe\n"); + return -ENOMEM; + } + + ret = rtas_ibm_receive_hvpipe_msg(work_area, &srcID, + &bytes_written); + if (!ret) { + /* + * The recv HVPIPE RTAS call was successful. + * When releasing the FD, or when no one is waiting on + * the specific source, this function is called with a + * NULL buf just to drain the payload so that the pipe + * is not blocked. + */ + if (buf) { + if (size < bytes_written) { + pr_err("Received the payload size = %d, but the buffer size = %d\n", + bytes_written, size); + bytes_written = size; + } + ret = copy_to_user(buf, + rtas_work_area_raw_buf(work_area), + bytes_written); + if (!ret) + ret = bytes_written; + } + } else { + pr_err("ibm,receive-hvpipe-msg failed with %d\n", + ret); + } + + rtas_work_area_free(work_area); + return ret; +} + /* * papr_hvpipe_handle_write - Issue send HVPIPE RTAS and return - * the RTAS status to the user space + * the size (payload + HVPIPE_HDR_LEN) for RTAS success.
+ * Otherwise returns the status of RTAS to the user space */ static ssize_t papr_hvpipe_handle_write(struct file *file, const char __user *buf, size_t size, loff_t *off) @@ -215,11 +314,72 @@ static ssize_t papr_hvpipe_handle_read(struct file *file, { struct hvpipe_source_info *src_info = file->private_data; + struct papr_hvpipe_hdr hdr; + long ret; if (!src_info) return -EIO; - return 0; + /* + * Max payload is 4048 (HVPIPE_MAX_WRITE_BUFFER_SIZE) + */ + if ((size > (HVPIPE_HDR_LEN + HVPIPE_MAX_WRITE_BUFFER_SIZE)) || + (size < HVPIPE_HDR_LEN)) + return -EINVAL; + + /* + * No payload is available to receive and the source pipe + * is not closed. + */ + if (!src_info->hvpipe_status) + return 0; + + hdr.version = 0; + hdr.flags = 0; + + /* + * In case the hvpipe has a payload and the hypervisor has + * also closed the pipe to the source, retrieve the payload + * and return it to user space first, and then notify user + * space about the hvpipe close in the next read(). + */ + if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) + hdr.flags = HVPIPE_MSG_AVAILABLE; + else if (src_info->hvpipe_status & HVPIPE_LOST_CONNECTION) + hdr.flags = HVPIPE_LOST_CONNECTION; + else + /* + * Should not be here without one of the above + * flags set + */ + return -EIO; + + ret = copy_to_user(buf, &hdr, HVPIPE_HDR_LEN); + if (ret) + return ret; + + /* + * The message event has a payload, so get the payload with + * the recv HVPIPE RTAS call. + */ + if (hdr.flags & HVPIPE_MSG_AVAILABLE) { + ret = hvpipe_rtas_recv_msg(buf + HVPIPE_HDR_LEN, + size - HVPIPE_HDR_LEN); + if (ret > 0) { + src_info->hvpipe_status &= ~HVPIPE_MSG_AVAILABLE; + ret += HVPIPE_HDR_LEN; + } + } else if (hdr.flags & HVPIPE_LOST_CONNECTION) { + /* + * The hypervisor is closing the pipe for the specific + * source. So notify user space. + */ + src_info->hvpipe_status &= ~HVPIPE_LOST_CONNECTION; + ret = HVPIPE_HDR_LEN; + } + + return ret; } /* diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.h b/arch/powerpc/platforms/pseries/papr-hvpipe.h index 6f98da4ec45fef..125658e6b59634 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.h +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.h @@ -14,6 +14,7 @@ struct hvpipe_source_info { struct list_head list; /* list of sources */ u32 srcID; + u32 hvpipe_status; wait_queue_head_t recv_wqh; /* wake up poll() waitq */ struct task_struct *tsk; }; From da24fb99a1b5cc842b9446f67f6bcda36b49817f Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 9 Sep 2025 01:43:58 -0700 Subject: [PATCH 1630/3206] powerpc/pseries: Wakeup hvpipe FD when the payload is pending User space polls on the wait_queue for a payload from the specific source. The hypervisor interrupts the OS when the pipe status for the specific source changes, such as a payload becoming available for the partition or the pipe to the source being closed. The OS retrieves the HVPIPE event message with the check-exception RTAS call; the event message contains the source ID and the pipe status. It then wakes up all FDs waiting on the wait_queue so that user space can read the payload, or close the FD if the pipe to the source in the hypervisor is closed. The hypervisor assigns one pipe per partition for all sources. Hence, issue ibm,receive-hvpipe-msg to read any pending payload during release() before closing the FD so that the pipe to the partition will not be blocked.
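A hedged user-space sketch of the polling side described above (illustrative only; hvpipe_wait_any() is not part of the driver):

	#include <poll.h>

	/* Wait on several per-source hvpipe fds: POLLIN means a payload or a
	 * close notification is pending for that source. Returns the index
	 * of the first ready fd, or -1 on error. */
	static int hvpipe_wait_any(struct pollfd *pfds, int nfds)
	{
		int i;

		for (i = 0; i < nfds; i++) {
			pfds[i].events = POLLIN;
			pfds[i].revents = 0;
		}

		/* Woken when the HVPIPE event IRQ handler wakes the waitqueue. */
		if (poll(pfds, nfds, -1) <= 0)
			return -1;

		for (i = 0; i < nfds; i++)
			if (pfds[i].revents & POLLIN)
				return i;	/* read() this fd next */

		return -1;
	}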
Signed-off-by: Haren Myneni Tested-by: Shashank MS Reviewed-by: Mahesh Salgaonkar Reviewed-by: Tyrel Datwyler Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250909084402.1488456-7-haren@linux.ibm.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 28 +++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 87deb429b331ff..4a1444a0d331da 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -396,6 +396,21 @@ static __poll_t papr_hvpipe_handle_poll(struct file *filp, if (!src_info) return POLLNVAL; + /* + * If the hvpipe already has a pending payload, return so that + * user space can issue read(). + */ + if (src_info->hvpipe_status) + return POLLIN | POLLRDNORM; + + /* + * Wait for the message event; + * hvpipe_event_interrupt() wakes up this wait_queue. + */ + poll_wait(filp, &src_info->recv_wqh, wait); + if (src_info->hvpipe_status) + return POLLIN | POLLRDNORM; + return 0; } @@ -413,7 +428,18 @@ static int papr_hvpipe_handle_release(struct inode *inode, src_info = file->private_data; list_del(&src_info->list); file->private_data = NULL; - spin_unlock(&hvpipe_src_list_lock); + /* + * If the pipe for this specific source has any pending + * payload, issue the recv HVPIPE RTAS call so that the pipe + * will not be blocked. + */ + if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) { + src_info->hvpipe_status = 0; + spin_unlock(&hvpipe_src_list_lock); + hvpipe_rtas_recv_msg(NULL, 0); + } else + spin_unlock(&hvpipe_src_list_lock); + kfree(src_info); return 0; } From b48b6cc8c655d8cdcf5124ba9901b74c8f759668 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 9 Sep 2025 01:43:59 -0700 Subject: [PATCH 1631/3206] powerpc/pseries: Enable HVPIPE event message interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hypervisor signals the OS via a Hypervisor Pipe Event external interrupt when data is available to be received from the pipe. Then the OS should call RTAS check-exception and provide the input Event Mask as defined for ‘ibm,hvpipe-msg-events’. In response, check-exception will return an event log containing a Pipe Events message. This message contains the source ID for which the message is intended and the pipe status, such as whether a payload is pending in the hypervisor or the pipe to the source is closed. If there is any user space process waiting in the wait_queue for the payload from this source ID, wake up that process, which can issue read() to obtain the payload with the ibm,receive-hvpipe-msg RTAS call, or close the FD if the pipe to the source is closed. The hypervisor has one pipe per partition for all sources and it will not deliver another hvpipe event message until the partition reads the payload for the previous hvpipe event. So if the source ID is not found in the source list, issue a dummy ibm,receive-hvpipe-msg RTAS call so that the pipe will not be blocked. Register the hvpipe event source interrupt based on entries from the /proc/device-tree//event-sources/ibm,hvpipe-msg-events property.
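For illustration, hedged helpers for the source ID format documented in this patch's papr-hvpipe.h (0xCCRRQQQQ: CC = source type, 0x02 for HMC; RR reserved; QQQQ = source index); these names are made up, not the kernel's:

	#include <stdint.h>

	static inline int hvpipe_src_is_hmc(uint32_t srcID)
	{
		return (srcID >> 24) == 0x02;	/* CC byte: 0x02 means HMC */
	}

	static inline uint16_t hvpipe_src_index(uint32_t srcID)
	{
		return (uint16_t)(srcID & 0xffff);	/* QQQQ: source index */
	}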
Signed-off-by: Haren Myneni Tested-by: Shashank MS Reviewed-by: Mahesh Salgaonkar Reviewed-by: Tyrel Datwyler Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250909084402.1488456-8-haren@linux.ibm.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 146 ++++++++++++++++++- arch/powerpc/platforms/pseries/papr-hvpipe.h | 14 ++ 2 files changed, 155 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 4a1444a0d331da..eb312594b1f02b 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -22,6 +22,11 @@ static DEFINE_SPINLOCK(hvpipe_src_list_lock); static LIST_HEAD(hvpipe_src_list); +static unsigned char hvpipe_ras_buf[RTAS_ERROR_LOG_MAX]; +static struct workqueue_struct *papr_hvpipe_wq; +static struct work_struct *papr_hvpipe_work; +static int hvpipe_check_exception_token; + /* * New PowerPC FW provides support for partitions and various * sources (Ex: remote hardware management console (HMC)) to @@ -554,6 +559,117 @@ static long papr_hvpipe_dev_ioctl(struct file *filp, unsigned int ioctl, return ret; } +/* + * papr_hvpipe_work_fn - called to issue the recv HVPIPE RTAS call for + * sources that are not monitored by user space so that the pipe + * will not be blocked. + */ +static void papr_hvpipe_work_fn(struct work_struct *work) +{ + hvpipe_rtas_recv_msg(NULL, 0); +} + +/* + * HVPIPE event message IRQ handler. + * The hypervisor raises the event IRQ if the partition has a payload + * and generates another event only after the payload is read with + * the recv HVPIPE RTAS call. + */ +static irqreturn_t hvpipe_event_interrupt(int irq, void *dev_id) +{ + struct hvpipe_event_buf *hvpipe_event; + struct pseries_errorlog *pseries_log; + struct hvpipe_source_info *src_info; + struct rtas_error_log *elog; + int rc; + + rc = rtas_call(hvpipe_check_exception_token, 6, 1, NULL, + RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), + RTAS_HVPIPE_MSG_EVENTS, 1, __pa(&hvpipe_ras_buf), + rtas_get_error_log_max()); + + if (rc != 0) { + pr_err_ratelimited("unexpected hvpipe-event-notification failed %d\n", rc); + return IRQ_HANDLED; + } + + elog = (struct rtas_error_log *)hvpipe_ras_buf; + if (unlikely(rtas_error_type(elog) != RTAS_TYPE_HVPIPE)) { + pr_warn_ratelimited("Unexpected event type %d\n", + rtas_error_type(elog)); + return IRQ_HANDLED; + } + + pseries_log = get_pseries_errorlog(elog, + PSERIES_ELOG_SECT_ID_HVPIPE_EVENT); + hvpipe_event = (struct hvpipe_event_buf *)pseries_log->data; + + /* + * The hypervisor notifies the partition when a payload is + * available to read with the recv HVPIPE RTAS call and will + * not send another event for any source until the previous + * payload is read. This means the pipe is blocked in the + * hypervisor until the payload is read. + * + * If a process is waiting on this source, update hvpipe_status + * and wake up the corresponding FD. Hold the lock while doing + * so; the lock is needed in case the user space process is in + * release() instead of poll(), so that release() reads the + * payload to unblock the pipe before closing the FD. + * + * Otherwise (no user process is waiting for the payload), + * issue the recv HVPIPE RTAS call (papr_hvpipe_work_fn()) + * to unblock the pipe.
+ */ + spin_lock(&hvpipe_src_list_lock); + src_info = hvpipe_find_source(be32_to_cpu(hvpipe_event->srcID)); + if (src_info) { + u32 flags = 0; + + if (hvpipe_event->event_type & HVPIPE_LOST_CONNECTION) + flags = HVPIPE_LOST_CONNECTION; + else if (hvpipe_event->event_type & HVPIPE_MSG_AVAILABLE) + flags = HVPIPE_MSG_AVAILABLE; + + src_info->hvpipe_status |= flags; + wake_up(&src_info->recv_wqh); + spin_unlock(&hvpipe_src_list_lock); + } else { + spin_unlock(&hvpipe_src_list_lock); + /* + * user space is not waiting on this source. So + * execute receive pipe RTAS so that pipe will not + * be blocked. + */ + if (hvpipe_event->event_type & HVPIPE_MSG_AVAILABLE) + queue_work(papr_hvpipe_wq, papr_hvpipe_work); + } + + return IRQ_HANDLED; +} + +static int __init enable_hvpipe_IRQ(void) +{ + struct device_node *np; + + hvpipe_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION); + if (hvpipe_check_exception_token == RTAS_UNKNOWN_SERVICE) + return -ENODEV; + + /* hvpipe events */ + np = of_find_node_by_path("/event-sources/ibm,hvpipe-msg-events"); + if (np != NULL) { + request_event_sources_irqs(np, hvpipe_event_interrupt, + "HPIPE_EVENT"); + of_node_put(np); + } else { + pr_err("Can not enable hvpipe event IRQ\n"); + return -ENODEV; + } + + return 0; +} + static const struct file_operations papr_hvpipe_ops = { .unlocked_ioctl = papr_hvpipe_dev_ioctl, }; @@ -576,12 +692,32 @@ static int __init papr_hvpipe_init(void) !rtas_function_implemented(RTAS_FN_IBM_RECEIVE_HVPIPE_MSG)) return -ENODEV; - ret = misc_register(&papr_hvpipe_dev); - if (ret) { - pr_err("misc-dev registration failed %d\n", ret); - return ret; + papr_hvpipe_work = kzalloc(sizeof(struct work_struct), GFP_ATOMIC); + if (!papr_hvpipe_work) + return -ENOMEM; + + INIT_WORK(papr_hvpipe_work, papr_hvpipe_work_fn); + + papr_hvpipe_wq = alloc_ordered_workqueue("papr hvpipe workqueue", 0); + if (!papr_hvpipe_wq) { + ret = -ENOMEM; + goto out; } - return 0; + ret = enable_hvpipe_IRQ(); + if (!ret) + ret = misc_register(&papr_hvpipe_dev); + + if (!ret) { + pr_info("hvpipe feature is enabled\n"); + return 0; + } + + pr_err("hvpipe feature is not enabled %d\n", ret); + destroy_workqueue(papr_hvpipe_wq); +out: + kfree(papr_hvpipe_work); + papr_hvpipe_work = NULL; + return ret; } machine_device_initcall(pseries, papr_hvpipe_init); diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.h b/arch/powerpc/platforms/pseries/papr-hvpipe.h index 125658e6b59634..aab7f77e087dd2 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.h +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.h @@ -19,4 +19,18 @@ struct hvpipe_source_info { struct task_struct *tsk; }; +/* + * Source ID Format 0xCCRRQQQQ + * CC = indicating value is source type (ex: 0x02 for HMC) + * RR = 0x00 (reserved) + * QQQQ = 0x0000 – 0xFFFF indicating the source index indetifier + */ +struct hvpipe_event_buf { + __be32 srcID; /* Source ID */ + u8 event_type; /* 0x01 for hvpipe message available */ + /* from specified src ID */ + /* 0x02 for loss of pipe connection */ + /* with specified src ID */ +}; + #endif /* _PAPR_HVPIPE_H */ From 39a08a4f94980518ef2eca3c6c6b61094c99f1af Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 9 Sep 2025 01:44:00 -0700 Subject: [PATCH 1632/3206] powerpc/pseries: Enable hvpipe with ibm,set-system-parameter RTAS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The partition uses “Hypervisor Pipe OS Enablement Notification” system parameter token (value = 64) to enable / disable hvpipe in the hypervisor. 
Once hvpipe is enabled, the hypervisor notifies the OS if a payload is pending for that partition from any source. This system parameter token takes 1 byte of data, with 1 = Enable and 0 = Disable. Enable hvpipe in the hypervisor with the ibm,set-system-parameter RTAS call after registering the hvpipe event source interrupt. Signed-off-by: Haren Myneni Tested-by: Shashank MS Reviewed-by: Mahesh Salgaonkar Reviewed-by: Tyrel Datwyler Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250909084402.1488456-9-haren@linux.ibm.com --- arch/powerpc/include/asm/papr-sysparm.h | 1 + arch/powerpc/platforms/pseries/papr-hvpipe.c | 34 ++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/papr-sysparm.h b/arch/powerpc/include/asm/papr-sysparm.h index c3cd5b131033eb..a3b5a0d05db6fc 100644 --- a/arch/powerpc/include/asm/papr-sysparm.h +++ b/arch/powerpc/include/asm/papr-sysparm.h @@ -21,6 +21,7 @@ typedef struct { #define PAPR_SYSPARM_COOP_MEM_OVERCOMMIT_ATTRS mk_papr_sysparm(44) #define PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRS mk_papr_sysparm(50) #define PAPR_SYSPARM_LPAR_NAME mk_papr_sysparm(55) +#define PAPR_SYSPARM_HVPIPE_ENABLE mk_papr_sysparm(64) /** * struct papr_sysparm_buf - RTAS work area layout for system parameter functions. diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index eb312594b1f02b..65368dbb134b68 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "pseries.h" #include "papr-hvpipe.h" @@ -648,6 +649,32 @@ static irqreturn_t hvpipe_event_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +/* + * Enable hvpipe by system parameter set with parameter + * token = 64 and with 1 byte buffer data: + * 0 = hvpipe not in use/disable + * 1 = hvpipe in use/enable + */ +static int set_hvpipe_sys_param(u8 val) +{ + struct papr_sysparm_buf *buf; + int ret; + + buf = papr_sysparm_buf_alloc(); + if (!buf) + return -ENOMEM; + + buf->len = cpu_to_be16(1); + buf->val[0] = val; + ret = papr_sysparm_set(PAPR_SYSPARM_HVPIPE_ENABLE, buf); + if (ret) + pr_err("Can not enable hvpipe %d\n", ret); + + papr_sysparm_buf_free(buf); + + return ret; +} + static int __init enable_hvpipe_IRQ(void) { struct device_node *np; @@ -705,8 +732,11 @@ static int __init papr_hvpipe_init(void) } ret = enable_hvpipe_IRQ(); - if (!ret) - ret = misc_register(&papr_hvpipe_dev); + if (!ret) { + ret = set_hvpipe_sys_param(1); + if (!ret) + ret = misc_register(&papr_hvpipe_dev); + } if (!ret) { pr_info("hvpipe feature is enabled\n"); From 6d84f85151bbd062d36bda6daf37a73945b471c9 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 9 Sep 2025 01:44:01 -0700 Subject: [PATCH 1633/3206] powerpc/pseries: HVPIPE changes to support migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hypervisor assigns one pipe per partition for all sources and assigns a new pipe after migration. Also, the partition ID that a source uses as its target ID may change after the migration. So disable hvpipe during the SUSPEND event with ‘hvpipe enable’ system parameter value = 0 and enable it after migration during the RESUME event with ‘hvpipe enable’ system parameter value = 1. User space calls such as ioctl() / read() / write() / poll() return -ENXIO between the SUSPEND and RESUME events.
The user space process can close FD and reestablish connection with new FD after migration if needed (Example: source IDs are changed). Signed-off-by: Haren Myneni Tested-by: Shashank MS Reviewed-by: Mahesh Salgaonkar Reviewed-by: Tyrel Datwyler Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250909084402.1488456-10-haren@linux.ibm.com --- arch/powerpc/platforms/pseries/mobility.c | 3 + arch/powerpc/platforms/pseries/papr-hvpipe.c | 65 ++++++++++++++++++++ arch/powerpc/platforms/pseries/papr-hvpipe.h | 6 ++ 3 files changed, 74 insertions(+) diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 62bd8e2d5d4c0b..95fe802ccdfdb4 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -28,6 +28,7 @@ #include #include "pseries.h" #include "vas.h" /* vas_migration_handler() */ +#include "papr-hvpipe.h" /* hvpipe_migration_handler() */ #include "../../kernel/cacheinfo.h" static struct kobject *mobility_kobj; @@ -744,6 +745,7 @@ static int pseries_migrate_partition(u64 handle) * by closing VAS windows at the beginning of this function. */ vas_migration_handler(VAS_SUSPEND); + hvpipe_migration_handler(HVPIPE_SUSPEND); ret = wait_for_vasi_session_suspending(handle); if (ret) @@ -770,6 +772,7 @@ static int pseries_migrate_partition(u64 handle) out: vas_migration_handler(VAS_RESUME); + hvpipe_migration_handler(HVPIPE_RESUME); return ret; } diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 65368dbb134b68..21a2f447c43fdc 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -27,6 +27,7 @@ static unsigned char hvpipe_ras_buf[RTAS_ERROR_LOG_MAX]; static struct workqueue_struct *papr_hvpipe_wq; static struct work_struct *papr_hvpipe_work; static int hvpipe_check_exception_token; +static bool hvpipe_feature; /* * New PowerPC FW provides support for partitions and various @@ -233,6 +234,12 @@ static ssize_t papr_hvpipe_handle_write(struct file *file, unsigned long ret, len; __be64 *area_be; + /* + * Return -ENXIO during migration + */ + if (!hvpipe_feature) + return -ENXIO; + if (!src_info) return -EIO; @@ -323,6 +330,12 @@ static ssize_t papr_hvpipe_handle_read(struct file *file, struct papr_hvpipe_hdr hdr; long ret; + /* + * Return -ENXIO during migration + */ + if (!hvpipe_feature) + return -ENXIO; + if (!src_info) return -EIO; @@ -399,6 +412,13 @@ static __poll_t papr_hvpipe_handle_poll(struct file *filp, { struct hvpipe_source_info *src_info = filp->private_data; + /* + * HVPIPE is disabled during SUSPEND and enabled after migration. + * So return POLLRDHUP during migration + */ + if (!hvpipe_feature) + return POLLRDHUP; + if (!src_info) return POLLNVAL; @@ -539,6 +559,12 @@ static long papr_hvpipe_dev_ioctl(struct file *filp, unsigned int ioctl, u32 srcID; long ret; + /* + * Return -ENXIO during migration + */ + if (!hvpipe_feature) + return -ENXIO; + if (get_user(srcID, argp)) return -EFAULT; @@ -697,6 +723,44 @@ static int __init enable_hvpipe_IRQ(void) return 0; } +void hvpipe_migration_handler(int action) +{ + pr_info("hvpipe migration event %d\n", action); + + /* + * HVPIPE is not used (Failed to create /dev/papr-hvpipe). + * So nothing to do for migration. + */ + if (!papr_hvpipe_work) + return; + + switch (action) { + case HVPIPE_SUSPEND: + if (hvpipe_feature) { + /* + * Disable hvpipe_feature to the user space. + * It will be enabled with RESUME event. 
+ */ + hvpipe_feature = false; + /* + * set system parameter hvpipe 'disable' + */ + set_hvpipe_sys_param(0); + } + break; + case HVPIPE_RESUME: + /* + * set system parameter hvpipe 'enable' + */ + if (!set_hvpipe_sys_param(1)) + hvpipe_feature = true; + else + pr_err("hvpipe is not enabled after migration\n"); + + break; + } +} + static const struct file_operations papr_hvpipe_ops = { .unlocked_ioctl = papr_hvpipe_dev_ioctl, }; @@ -740,6 +804,7 @@ static int __init papr_hvpipe_init(void) if (!ret) { pr_info("hvpipe feature is enabled\n"); + hvpipe_feature = true; return 0; } diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.h b/arch/powerpc/platforms/pseries/papr-hvpipe.h index aab7f77e087dd2..c343f4230865c0 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.h +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.h @@ -11,6 +11,11 @@ #define HVPIPE_HDR_LEN sizeof(struct papr_hvpipe_hdr) +enum hvpipe_migrate_action { + HVPIPE_SUSPEND, + HVPIPE_RESUME, +}; + struct hvpipe_source_info { struct list_head list; /* list of sources */ u32 srcID; @@ -33,4 +38,5 @@ struct hvpipe_event_buf { /* with specified src ID */ }; +void hvpipe_migration_handler(int action); #endif /* _PAPR_HVPIPE_H */ From d9e46de4bf5c5f987075afd5f240bb2a8a5d71ed Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 16 Aug 2025 18:33:26 +0200 Subject: [PATCH 1634/3206] powerpc/8xx: Remove left-over instruction and comments in DataStoreTLBMiss handler Commit ac9f97ff8b32 ("powerpc/8xx: Inconditionally use task PGDIR in DTLB misses") removed the test that needed the value in SPRN_MD_EPN but failed to remove the read. Remove it. And remove related comments, including the very same comment in InstructionTLBMiss that should have been removed by commit 33c527522f39 ("powerpc/8xx: Inconditionally use task PGDIR in ITLB misses"). Also update the comment about the absence of a second level table, which has been handled implicitly since commit 5ddb75cee5af ("powerpc/8xx: remove tests on PGDIR entry validity"). Fixes: ac9f97ff8b32 ("powerpc/8xx: Inconditionally use task PGDIR in DTLB misses") Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/5811c8d1d6187f280ad140d6c0ad6010e41eeaeb.1755361995.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_8xx.S | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 78942fd6b4b97c..393e19ee13222d 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -162,7 +162,7 @@ instruction_counter: * For the MPC8xx, this is a software tablewalk to load the instruction * TLB. The task switch loads the M_TWB register with the pointer to the first * level table. - * If we discover there is no second level table (value is zero) or if there + * If there is no second level table (value is zero) or if there * is an invalid pte, we load that into the TLB, which causes another fault * into the TLB Error interrupt where we can handle such problems. * We have to use the MD_xxx registers for the tablewalk because the @@ -183,9 +183,6 @@ instruction_counter: mtspr SPRN_SPRG_SCRATCH2, r10 mtspr SPRN_M_TW, r11 - /* If we are faulting a kernel address, we have to use the * kernel page tables.
- */ mfspr r10, SPRN_SRR0 /* Get effective address of fault */ INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11) mtspr SPRN_MD_EPN, r10 @@ -228,10 +225,6 @@ instruction_counter: mtspr SPRN_SPRG_SCRATCH2, r10 mtspr SPRN_M_TW, r11 - /* If we are faulting a kernel address, we have to use the - * kernel page tables. - */ - mfspr r10, SPRN_MD_EPN mfspr r10, SPRN_M_TWB /* Get level 1 table */ lwz r11, 0(r10) /* Get level 1 entry */ From f2863371f017eb03c230addc253783fa4c7e90f5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Aug 2025 08:30:18 +0200 Subject: [PATCH 1635/3206] powerpc/603: Really copy kernel PGD entries into all PGDIRs Commit 82ef440f9a38 ("powerpc/603: Copy kernel PGD entries into all PGDIRs and preallocate execmem page tables") was supposed to extend to powerpc 603 the copy of kernel PGD entries into all PGDIRs implemented in a previous patch on the 8xx. But 603 is book3s/32 and uses a duplicate of pgd_alloc() defined in another header. So really do the copy at the correct place for the 603. Fixes: 82ef440f9a38 ("powerpc/603: Copy kernel PGD entries into all PGDIRs and preallocate execmem page tables") Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/752ab7514cae089a2dd7cc0f3d5e35849f76adb9.1755757797.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 10 ++++++++-- arch/powerpc/include/asm/nohash/pgalloc.h | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index dd4eb306317581..f4390704d5ba29 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -7,8 +7,14 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), - pgtable_gfp_flags(mm, GFP_KERNEL)); + pgd_t *pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); + +#ifdef CONFIG_PPC_BOOK3S_603 + memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, + (MAX_PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); +#endif + return pgd; } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h index bb5f3e8ea912df..4ef780b291bc31 100644 --- a/arch/powerpc/include/asm/nohash/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/pgalloc.h @@ -22,7 +22,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t *pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), pgtable_gfp_flags(mm, GFP_KERNEL)); -#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_BOOK3S_603) +#ifdef CONFIG_PPC_8xx memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (MAX_PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); #endif From a38108a23ab558b834d71d542d32c05ab0fb64d4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Sep 2025 10:30:52 +0300 Subject: [PATCH 1636/3206] wifi: iwlwifi: pcie: fix byte count table for some devices In my previous fix for this condition, I erroneously listed 9000 instead of 7000 family, when 7000/8000 were already using iwlmvm. Thus the condition ended up wrong, causing the issue I had fixed for older devices to suddenly appear on 7000/8000 family devices. Correct the condition accordingly. 
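The effect of the bound is easiest to see on the ordered family constants; a small sketch (enum values hypothetical and abbreviated, only their ordering matters):

/*
 * Sketch: the device-family constants are ordered, so a lower
 * bound of 9000 wrongly excluded the 7000/8000 families from the
 * byte-count rounding below.
 */
enum family { FAM_7000, FAM_8000, FAM_9000, FAM_22000, FAM_AX210 };

static unsigned int byte_cnt(enum family f, unsigned int len)
{
	if (f >= FAM_7000 && f < FAM_AX210)	/* was: f >= FAM_9000 */
		return DIV_ROUND_UP(len, 4);
	return len;
}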
Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/r/20250909165811.10729-1-00107082@163.com/ Fixes: 586e3cb33ba6 ("wifi: iwlwifi: fix byte count table for old devices") Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250915102743.777aaafbcc6c.I84404edfdfbf400501f6fb06def5b86c501da198@changeid Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c index d912e709a92cbf..bb03dad4a3006b 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c @@ -2092,7 +2092,7 @@ static void iwl_txq_gen1_update_byte_cnt_tbl(struct iwl_trans *trans, break; } - if (trans->mac_cfg->device_family >= IWL_DEVICE_FAMILY_9000 && + if (trans->mac_cfg->device_family >= IWL_DEVICE_FAMILY_7000 && trans->mac_cfg->device_family < IWL_DEVICE_FAMILY_AX210) len = DIV_ROUND_UP(len, 4); From 91d8a53db2199eefc73ecf3682e0665ea6895696 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 10 Sep 2025 17:22:13 +0200 Subject: [PATCH 1637/3206] xfrm: fix offloading of cross-family tunnels Xiumei reported a regression in IPsec offload tests over xfrmi, where the traffic for IPv6 over IPv4 tunnels is processed in SW instead of going through crypto offload, after commit cc18f482e8b6 ("xfrm: provide common xdo_dev_offload_ok callback implementation"). Commit cc18f482e8b6 added a generic version of existing checks attempting to prevent packets with IPv4 options or IPv6 extension headers from being sent to HW that doesn't support offloading such packets. The check mistakenly uses x->props.family (the outer family) to determine the inner packet's family and verify if options/extensions are present. In the case of IPv6 over IPv4, the check compares some of the traffic class bits to the expected no-options ihl value (5). The original check was introduced in commit 2ac9cfe78223 ("net/mlx5e: IPSec, Add Innova IPSec offload TX data path"), and then duplicated in the other drivers. Before commit cc18f482e8b6, the loose check (ihl > 5) passed because those traffic class bits were not set to a value that triggered the no-offload codepath. Packets with options/extension headers that should have been handled in SW went through the offload path, and were likely dropped by the NIC or incorrectly processed. Since commit cc18f482e8b6, the check is now strict (ihl != 5), and in a basic setup (no traffic class configured), all packets go through the no-offload codepath. 
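A small sketch of the corrected selection (illustrative only; it assumes, as the commit message describes, that the inner headers are still what the skb carries when the check runs):

/*
 * In tunnel mode the skb still holds the *inner* headers when
 * xfrm_dev_offload_ok() runs, so the options/extension-header
 * checks must key off the inner family, not the outer one.
 */
static sa_family_t offload_check_family(const struct xfrm_state *x)
{
	/*
	 * x->props.family is the outer, post-encapsulation family
	 * (AF_INET for an IPv6-over-IPv4 tunnel), while the headers
	 * present in the skb are those of the inner packet (AF_INET6
	 * in that case).
	 */
	return x->inner_mode.family;
}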
The commits that introduced the incorrect family checks in each driver are: 2ac9cfe78223 ("net/mlx5e: IPSec, Add Innova IPSec offload TX data path") 8362ea16f69f ("crypto: chcr - ESN for Inline IPSec Tx") 859a497fe80c ("nfp: implement xfrm callbacks and expose ipsec offload feature to upper layer") 32188be805d0 ("cn10k-ipsec: Allow ipsec crypto offload for skb with SA") [ixgbe/ixgbevf commits are ignored, as that HW does not support tunnel mode, thus no cross-family setups are possible] Fixes: cc18f482e8b6 ("xfrm: provide common xdo_dev_offload_ok callback implementation") Reported-by: Xiumei Mu Signed-off-by: Sabrina Dubroca Reviewed-by: Leon Romanovsky Reviewed-by: Zhu Yanjun Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index c7a1f080d2de3a..44b9de6e4e7788 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -438,7 +438,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->props.mode == XFRM_MODE_TUNNEL; - switch (x->props.family) { + switch (x->inner_mode.family) { case AF_INET: /* Check for IPv4 options */ if (ip_hdr(skb)->ihl != 5) From 75604e9a5b60707722028947d6dc6bdacb42282e Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Sat, 16 Aug 2025 18:49:05 +0800 Subject: [PATCH 1638/3206] pwm: loongson: Fix LOONGSON_PWM_FREQ_DEFAULT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the 7A1000 and 7A2000 user manual, the clock frequency of their PWM controllers is 50 MHz, not 50 kHz. Fixes: 2b62c89448dd ("pwm: Add Loongson PWM controller support") Signed-off-by: Xi Ruoyao Reviewed-by: Binbin Zhou Reviewed-by: Huacai Chen Link: https://lore.kernel.org/r/20250816104904.4779-2-xry111@xry111.site Cc: stable@vger.kernel.org Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-loongson.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-loongson.c b/drivers/pwm/pwm-loongson.c index 1ba16168cbb408..31a57edecfd0ba 100644 --- a/drivers/pwm/pwm-loongson.c +++ b/drivers/pwm/pwm-loongson.c @@ -49,7 +49,7 @@ #define LOONGSON_PWM_CTRL_REG_DZONE BIT(10) /* Anti-dead Zone Enable Bit */ /* default input clk frequency for the ACPI case */ -#define LOONGSON_PWM_FREQ_DEFAULT 50000 /* Hz */ +#define LOONGSON_PWM_FREQ_DEFAULT 50000000 /* Hz */ struct pwm_loongson_ddata { struct clk *clk; From 3a4b9d027e4061766f618292df91760ea64a1fcc Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 19 Aug 2025 19:42:24 +0800 Subject: [PATCH 1639/3206] pwm: berlin: Fix wrong register in suspend/resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'enable' register should be BERLIN_PWM_EN rather than BERLIN_PWM_ENABLE; otherwise the driver accesses the wrong address and there will be a CPU exception followed by a kernel panic during suspend/resume.
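The two names make the mixup easy; a sketch of the register layout (offset and bit values are an assumption for illustration, not copied from the driver):

/*
 * Assumed layout: BERLIN_PWM_EN is the offset of the enable
 * register, BERLIN_PWM_ENABLE the enable bit *within* it. Passing
 * the bit mask where a register offset is expected makes
 * berlin_pwm_readl()/berlin_pwm_writel() touch the wrong address.
 */
#define BERLIN_PWM_EN		0x0	/* register offset */
#define  BERLIN_PWM_ENABLE	BIT(0)	/* bit inside BERLIN_PWM_EN */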
Fixes: bbf0722c1c66 ("pwm: berlin: Add suspend/resume support") Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20250819114224.31825-1-jszhang@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-berlin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-berlin.c b/drivers/pwm/pwm-berlin.c index 831aed228cafcb..858d369913742c 100644 --- a/drivers/pwm/pwm-berlin.c +++ b/drivers/pwm/pwm-berlin.c @@ -234,7 +234,7 @@ static int berlin_pwm_suspend(struct device *dev) for (i = 0; i < chip->npwm; i++) { struct berlin_pwm_channel *channel = &bpc->channel[i]; - channel->enable = berlin_pwm_readl(bpc, i, BERLIN_PWM_ENABLE); + channel->enable = berlin_pwm_readl(bpc, i, BERLIN_PWM_EN); channel->ctrl = berlin_pwm_readl(bpc, i, BERLIN_PWM_CONTROL); channel->duty = berlin_pwm_readl(bpc, i, BERLIN_PWM_DUTY); channel->tcnt = berlin_pwm_readl(bpc, i, BERLIN_PWM_TCNT); @@ -262,7 +262,7 @@ static int berlin_pwm_resume(struct device *dev) berlin_pwm_writel(bpc, i, channel->ctrl, BERLIN_PWM_CONTROL); berlin_pwm_writel(bpc, i, channel->duty, BERLIN_PWM_DUTY); berlin_pwm_writel(bpc, i, channel->tcnt, BERLIN_PWM_TCNT); - berlin_pwm_writel(bpc, i, channel->enable, BERLIN_PWM_ENABLE); + berlin_pwm_writel(bpc, i, channel->enable, BERLIN_PWM_EN); } return 0; From afe872274edc7da46719a2029bfa4eab142b15f6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 2 Sep 2025 14:03:48 +0100 Subject: [PATCH 1640/3206] pwm: Fix incorrect variable used in error message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dev_err message reports the incorrect return value ret_tohw; it should report the value in ret_fromhw. Fix this by using ret_fromhw instead of ret_tohw. Fixes: 6c5126c6406d ("pwm: Provide new consumer API functions for waveforms") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20250902130348.2630053-1-colin.i.king@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 0d66376a83ec35..bff0057d87a42b 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -276,7 +276,7 @@ int pwm_round_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform * if (IS_ENABLED(CONFIG_PWM_DEBUG) && ret_fromhw > 0) dev_err(&chip->dev, "Unexpected return value from __pwm_round_waveform_fromhw: requested %llu/%llu [+%llu], return value %d\n", - wf_req.duty_length_ns, wf_req.period_length_ns, wf_req.duty_offset_ns, ret_tohw); + wf_req.duty_length_ns, wf_req.period_length_ns, wf_req.duty_offset_ns, ret_fromhw); if (IS_ENABLED(CONFIG_PWM_DEBUG) && (ret_tohw == 0) != pwm_check_rounding(&wf_req, wf)) From 21a5e91fda50fc662ce1a12bd0aae9d103455b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 11 Aug 2025 18:00:59 +0200 Subject: [PATCH 1641/3206] pwm: tiehrpwm: Don't drop runtime PM reference in .free() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pwm driver calls pm_runtime_get_sync() when the hardware becomes enabled and pm_runtime_put_sync() when it becomes disabled. The PWM's state is kept when a consumer goes away, so the call to pm_runtime_put_sync() in the .free() callback is unbalanced, resulting in a non-functional device and a reference underflow for the second consumer.
The easiest fix for that issue is to just not drop the runtime PM reference in .free(), so do that. Fixes: 19891b20e7c2 ("pwm: pwm-tiehrpwm: PWM driver support for EHRPWM") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/bbb089c4b5650cc1f7b25cf582d817543fd25384.1754927682.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiehrpwm.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index 0125e73b98dfb4..5e674a7bbf3bec 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -391,11 +391,6 @@ static void ehrpwm_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); - if (pwm_is_enabled(pwm)) { - dev_warn(pwmchip_parent(chip), "Removing PWM device without disabling\n"); - pm_runtime_put_sync(pwmchip_parent(chip)); - } - /* set period value to zero on free */ pc->period_cycles[pwm->hwpwm] = 0; } From 878dbfc12cc52b17d79d205560c0fafcf5332b13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 11 Aug 2025 18:01:00 +0200 Subject: [PATCH 1642/3206] pwm: tiehrpwm: Make code comment in .free() more useful MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of explaining trivia to everyone who can read C, describe the higher-level effect of setting pc->period_cycles[pwm->hwpwm] to zero. Fixes: 01b2d4536f02 ("pwm: pwm-tiehrpwm: Fix conflicting channel period setting") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/4c38dd119a77d7017115318a3f2c50bde62a6f21.1754927682.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiehrpwm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index 5e674a7bbf3bec..a94b1e387b924d 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -391,7 +391,7 @@ static void ehrpwm_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); - /* set period value to zero on free */ + /* Don't let a pwm without consumer block requests to the other channel */ pc->period_cycles[pwm->hwpwm] = 0; } From bc7ce5bfc504eea9eac0eb0215017b9fcfc62c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 11 Aug 2025 18:01:01 +0200 Subject: [PATCH 1643/3206] pwm: tiehrpwm: Fix various off-by-one errors in duty-cycle calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Up-Count Mode the timer is reset to zero one tick after it reaches TBPRD, so the period length is (TBPRD + 1) * T_TBCLK. This matches both the documentation and measurements. So the value written to TBPRD has to be one less than the calculated period_cycles value. A complication here is that for a 100% relative duty-cycle the value written to the CMPx register has to be TBPRD + 1, which might overflow if TBPRD is 0xffff. To handle that, the calculation of the AQCTLx register has to be moved to ehrpwm_pwm_config() and the edge at CTR = CMPx has to be skipped. Additionally, the AQCTL_PRD register field has to be 0 because that defines the hardware's action when the maximal counter value is reached, which is (as above) one clock tick before the period's end. The period start edge has to happen when the counter is reset and so is defined in the AQCTL_ZRO field.
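A worked example with an assumed TBCLK period of 10 ns makes the off-by-one concrete:

/*
 * requested period = 80 ns, T_TBCLK = 10 ns
 *   -> period_cycles = 8
 *   -> period = (TBPRD + 1) * T_TBCLK, so TBPRD = 8 - 1 = 7
 *
 * A 100% duty-cycle then needs CMPx = TBPRD + 1 = 8. With the
 * maximal TBPRD of 0xffff that value (0x10000) no longer fits into
 * the 16-bit CMPx register, hence the CTR = CMPx edge is skipped
 * entirely in that case instead of being programmed.
 */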
Fixes: 19891b20e7c2 ("pwm: pwm-tiehrpwm: PWM driver support for EHRPWM") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/dc818c69b7cf05109ecda9ee6b0043a22de757c1.1754927682.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiehrpwm.c | 143 +++++++++++++++---------------------- 1 file changed, 58 insertions(+), 85 deletions(-) diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index a94b1e387b924d..a23e48b8523dbf 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -36,7 +36,7 @@ #define CLKDIV_MAX 7 #define HSPCLKDIV_MAX 7 -#define PERIOD_MAX 0xFFFF +#define PERIOD_MAX 0x10000 /* compare module registers */ #define CMPA 0x12 @@ -65,14 +65,10 @@ #define AQCTL_ZRO_FRCHIGH BIT(1) #define AQCTL_ZRO_FRCTOGGLE (BIT(1) | BIT(0)) -#define AQCTL_CHANA_POLNORMAL (AQCTL_CAU_FRCLOW | AQCTL_PRD_FRCHIGH | \ - AQCTL_ZRO_FRCHIGH) -#define AQCTL_CHANA_POLINVERSED (AQCTL_CAU_FRCHIGH | AQCTL_PRD_FRCLOW | \ - AQCTL_ZRO_FRCLOW) -#define AQCTL_CHANB_POLNORMAL (AQCTL_CBU_FRCLOW | AQCTL_PRD_FRCHIGH | \ - AQCTL_ZRO_FRCHIGH) -#define AQCTL_CHANB_POLINVERSED (AQCTL_CBU_FRCHIGH | AQCTL_PRD_FRCLOW | \ - AQCTL_ZRO_FRCLOW) +#define AQCTL_CHANA_POLNORMAL (AQCTL_CAU_FRCLOW | AQCTL_ZRO_FRCHIGH) +#define AQCTL_CHANA_POLINVERSED (AQCTL_CAU_FRCHIGH | AQCTL_ZRO_FRCLOW) +#define AQCTL_CHANB_POLNORMAL (AQCTL_CBU_FRCLOW | AQCTL_ZRO_FRCHIGH) +#define AQCTL_CHANB_POLINVERSED (AQCTL_CBU_FRCHIGH | AQCTL_ZRO_FRCLOW) #define AQSFRC_RLDCSF_MASK (BIT(7) | BIT(6)) #define AQSFRC_RLDCSF_ZRO 0 @@ -108,7 +104,6 @@ struct ehrpwm_pwm_chip { unsigned long clk_rate; void __iomem *mmio_base; unsigned long period_cycles[NUM_PWM_CHANNEL]; - enum pwm_polarity polarity[NUM_PWM_CHANNEL]; struct clk *tbclk; struct ehrpwm_context ctx; }; @@ -177,51 +172,20 @@ static int set_prescale_div(unsigned long rqst_prescaler, u16 *prescale_div, return 1; } -static void configure_polarity(struct ehrpwm_pwm_chip *pc, int chan) -{ - u16 aqctl_val, aqctl_mask; - unsigned int aqctl_reg; - - /* - * Configure PWM output to HIGH/LOW level on counter - * reaches compare register value and LOW/HIGH level - * on counter value reaches period register value and - * zero value on counter - */ - if (chan == 1) { - aqctl_reg = AQCTLB; - aqctl_mask = AQCTL_CBU_MASK; - - if (pc->polarity[chan] == PWM_POLARITY_INVERSED) - aqctl_val = AQCTL_CHANB_POLINVERSED; - else - aqctl_val = AQCTL_CHANB_POLNORMAL; - } else { - aqctl_reg = AQCTLA; - aqctl_mask = AQCTL_CAU_MASK; - - if (pc->polarity[chan] == PWM_POLARITY_INVERSED) - aqctl_val = AQCTL_CHANA_POLINVERSED; - else - aqctl_val = AQCTL_CHANA_POLNORMAL; - } - - aqctl_mask |= AQCTL_PRD_MASK | AQCTL_ZRO_MASK; - ehrpwm_modify(pc->mmio_base, aqctl_reg, aqctl_mask, aqctl_val); -} - /* * period_ns = 10^9 * (ps_divval * period_cycles) / PWM_CLK_RATE * duty_ns = 10^9 * (ps_divval * duty_cycles) / PWM_CLK_RATE */ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - u64 duty_ns, u64 period_ns) + u64 duty_ns, u64 period_ns, enum pwm_polarity polarity) { struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); u32 period_cycles, duty_cycles; u16 ps_divval, tb_divval; unsigned int i, cmp_reg; unsigned long long c; + u16 aqctl_val, aqctl_mask; + unsigned int aqctl_reg; if (period_ns > NSEC_PER_SEC) return -ERANGE; @@ -231,15 +195,10 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, do_div(c, NSEC_PER_SEC); period_cycles = (unsigned long)c; - if (period_cycles < 1) { - period_cycles = 1; - duty_cycles = 1; - } 
else { - c = pc->clk_rate; - c = c * duty_ns; - do_div(c, NSEC_PER_SEC); - duty_cycles = (unsigned long)c; - } + c = pc->clk_rate; + c = c * duty_ns; + do_div(c, NSEC_PER_SEC); + duty_cycles = (unsigned long)c; /* * Period values should be same for multiple PWM channels as IP uses @@ -271,46 +230,67 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return -EINVAL; } - pm_runtime_get_sync(pwmchip_parent(chip)); - - /* Update clock prescaler values */ - ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CLKDIV_MASK, tb_divval); - /* Update period & duty cycle with presacler division */ period_cycles = period_cycles / ps_divval; duty_cycles = duty_cycles / ps_divval; - /* Configure shadow loading on Period register */ - ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_PRDLD_MASK, TBCTL_PRDLD_SHDW); + if (period_cycles < 1) + period_cycles = 1; - ehrpwm_write(pc->mmio_base, TBPRD, period_cycles); + pm_runtime_get_sync(pwmchip_parent(chip)); - /* Configure ehrpwm counter for up-count mode */ - ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CTRMODE_MASK, - TBCTL_CTRMODE_UP); + /* Update clock prescaler values */ + ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CLKDIV_MASK, tb_divval); - if (pwm->hwpwm == 1) + if (pwm->hwpwm == 1) { /* Channel 1 configured with compare B register */ cmp_reg = CMPB; - else + + aqctl_reg = AQCTLB; + aqctl_mask = AQCTL_CBU_MASK; + + if (polarity == PWM_POLARITY_INVERSED) + aqctl_val = AQCTL_CHANB_POLINVERSED; + else + aqctl_val = AQCTL_CHANB_POLNORMAL; + + /* if duty_cycle is big, don't toggle on CBU */ + if (duty_cycles > period_cycles) + aqctl_val &= ~AQCTL_CBU_MASK; + + } else { /* Channel 0 configured with compare A register */ cmp_reg = CMPA; - ehrpwm_write(pc->mmio_base, cmp_reg, duty_cycles); + aqctl_reg = AQCTLA; + aqctl_mask = AQCTL_CAU_MASK; - pm_runtime_put_sync(pwmchip_parent(chip)); + if (polarity == PWM_POLARITY_INVERSED) + aqctl_val = AQCTL_CHANA_POLINVERSED; + else + aqctl_val = AQCTL_CHANA_POLNORMAL; - return 0; -} + /* if duty_cycle is big, don't toggle on CAU */ + if (duty_cycles > period_cycles) + aqctl_val &= ~AQCTL_CAU_MASK; + } -static int ehrpwm_pwm_set_polarity(struct pwm_chip *chip, - struct pwm_device *pwm, - enum pwm_polarity polarity) -{ - struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); + aqctl_mask |= AQCTL_PRD_MASK | AQCTL_ZRO_MASK; + ehrpwm_modify(pc->mmio_base, aqctl_reg, aqctl_mask, aqctl_val); + + /* Configure shadow loading on Period register */ + ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_PRDLD_MASK, TBCTL_PRDLD_SHDW); + + ehrpwm_write(pc->mmio_base, TBPRD, period_cycles - 1); + + /* Configure ehrpwm counter for up-count mode */ + ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CTRMODE_MASK, + TBCTL_CTRMODE_UP); + + if (!(duty_cycles > period_cycles)) + ehrpwm_write(pc->mmio_base, cmp_reg, duty_cycles); - /* Configuration of polarity in hardware delayed, do at enable */ - pc->polarity[pwm->hwpwm] = polarity; + pm_runtime_put_sync(pwmchip_parent(chip)); return 0; } @@ -339,9 +319,6 @@ static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) ehrpwm_modify(pc->mmio_base, AQCSFRC, aqcsfrc_mask, aqcsfrc_val); - /* Channels polarity can be configured from action qualifier module */ - configure_polarity(pc, pwm->hwpwm); - /* Enable TBCLK */ ret = clk_enable(pc->tbclk); if (ret) { @@ -406,10 +383,6 @@ static int ehrpwm_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, ehrpwm_pwm_disable(chip, pwm); enabled = false; } - - err = ehrpwm_pwm_set_polarity(chip, pwm, state->polarity); - if (err) - return err; } if 
(!state->enabled) { @@ -418,7 +391,7 @@ static int ehrpwm_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } - err = ehrpwm_pwm_config(chip, pwm, state->duty_cycle, state->period); + err = ehrpwm_pwm_config(chip, pwm, state->duty_cycle, state->period, state->polarity); if (err) return err; From 00f83f0e07e44e2f1fb94b223e77ab7b18ee2d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 11 Aug 2025 18:01:02 +0200 Subject: [PATCH 1644/3206] pwm: tiehrpwm: Fix corner case in clock divisor calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function set_prescale_div() is responsible for calculating the clock divisor settings so that the input clock rate is divided down far enough that the required period length is at most 0x10000 clock ticks. If period_cycles is an integer multiple of 0x10000, the divisor period_cycles / 0x10000 is good enough. So round up in the calculation of the required divisor and compare it using >= instead of >. Fixes: 19891b20e7c2 ("pwm: pwm-tiehrpwm: PWM driver support for EHRPWM") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/85488616d7bfcd9c32717651d0be7e330e761b9c.1754927682.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiehrpwm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index a23e48b8523dbf..7a86cb090f76f1 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -161,7 +161,7 @@ static int set_prescale_div(unsigned long rqst_prescaler, u16 *prescale_div, *prescale_div = (1 << clkdiv) * (hspclkdiv ? (hspclkdiv * 2) : 1); - if (*prescale_div > rqst_prescaler) { + if (*prescale_div >= rqst_prescaler) { *tb_clk_div = (clkdiv << TBCTL_CLKDIV_SHIFT) | (hspclkdiv << TBCTL_HSPCLKDIV_SHIFT); return 0; @@ -224,7 +224,7 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, pc->period_cycles[pwm->hwpwm] = period_cycles; /* Configure clock prescaler to support Low frequency PWM wave */ - if (set_prescale_div(period_cycles/PERIOD_MAX, &ps_divval, + if (set_prescale_div(DIV_ROUND_UP(period_cycles, PERIOD_MAX), &ps_divval, &tb_divval)) { dev_err(pwmchip_parent(chip), "Unsupported values\n"); return -EINVAL; From 09cbe54681241ac67ca595743d29f7da85363928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 27 May 2025 22:58:22 +0200 Subject: [PATCH 1645/3206] dt-bindings: timer: renesas,rz-mtu3: Use #pwm-cells = <3> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the goal of unifying all PWM bindings to use #pwm-cells = <3>, update the renesas,rz-mtu3 binding accordingly. Keep <2> documented as a deprecated value at least until the in-tree device trees are fixed.
Signed-off-by: Uwe Kleine-König Acked-by: Rob Herring (Arm) Reviewed-by: Biju Das Link: https://lore.kernel.org/r/20250527205823.377785-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- .../devicetree/bindings/timer/renesas,rz-mtu3.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml b/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml index 3931054b42fb97..3ad10c5b66ba54 100644 --- a/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml +++ b/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml @@ -221,7 +221,10 @@ properties: maxItems: 1 "#pwm-cells": - const: 2 + oneOf: + - const: 2 + deprecated: true + - const: 3 required: - compatible @@ -299,5 +302,5 @@ examples: clocks = <&cpg CPG_MOD R9A07G044_MTU_X_MCK_MTU3>; power-domains = <&cpg>; resets = <&cpg R9A07G044_MTU_X_PRESET_MTU3>; - #pwm-cells = <2>; + #pwm-cells = <3>; }; From 5364e70b013c204088dfcd888a5517a81f0b1836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 8 Jul 2025 19:18:00 +0200 Subject: [PATCH 1646/3206] pwm: Disable PWM_DEBUG check for disabled states MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a PWM is requested to be disabled, the result is unspecified, the only intention is to save some power. So skip all checks in this case. All but two checks already only triggered for states with .enabled = true. The first resulted in some false positive diagnostics, the other checked for a condition that depending on hardware might not be implementable. Similar if the lowlevel driver disabled the hardware this might be a valid reaction and with .enabled = false all other state parameters are unreliable, so skip further tests in this case, too. All later usages of .enabled can be assumed to yield true, and so several if conditions can be simplified. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/16d29212b09b66c286c1232b1ab0ec0f8d510aae.1751994988.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index bff0057d87a42b..d5d2dfbe4adecf 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -496,6 +496,13 @@ static void pwm_apply_debug(struct pwm_device *pwm, if (!chip->ops->get_state) return; + /* + * If a disabled PWM was requested the result is unspecified, so nothing + * to check. + */ + if (!state->enabled) + return; + /* * *state was just applied. Read out the hardware state and do some * checks. @@ -507,16 +514,23 @@ static void pwm_apply_debug(struct pwm_device *pwm, /* If that failed there isn't much to debug */ return; + /* + * If the PWM was disabled that's maybe strange but there is nothing + * that can be sensibly checked then. So return early. + */ + if (!s1.enabled) + return; + /* * The lowlevel driver either ignored .polarity (which is a bug) or as * best effort inverted .polarity and fixed .duty_cycle respectively. * Undo this inversion and fixup for further tests. 
*/ - if (s1.enabled && s1.polarity != state->polarity) { + if (s1.polarity != state->polarity) { s2.polarity = state->polarity; s2.duty_cycle = s1.period - s1.duty_cycle; s2.period = s1.period; - s2.enabled = s1.enabled; + s2.enabled = true; } else { s2 = s1; } @@ -525,8 +539,7 @@ state->duty_cycle < state->period) dev_warn(pwmchip_parent(chip), ".apply ignored .polarity\n"); - if (state->enabled && s2.enabled && - last->polarity == state->polarity && + if (last->polarity == state->polarity && last->period > s2.period && last->period <= state->period) dev_warn(pwmchip_parent(chip), @@ -537,13 +550,12 @@ * Rounding period up is fine only if duty_cycle is 0 then, because a * flat line doesn't have a characteristic period. */ - if (state->enabled && s2.enabled && state->period < s2.period && s2.duty_cycle) + if (state->period < s2.period && s2.duty_cycle) dev_warn(pwmchip_parent(chip), ".apply is supposed to round down period (requested: %llu, applied: %llu)\n", state->period, s2.period); - if (state->enabled && - last->polarity == state->polarity && + if (last->polarity == state->polarity && last->period == s2.period && last->duty_cycle > s2.duty_cycle && last->duty_cycle <= state->duty_cycle) @@ -553,16 +565,12 @@ s2.duty_cycle, s2.period, last->duty_cycle, last->period); - if (state->enabled && s2.enabled && state->duty_cycle < s2.duty_cycle) + if (state->duty_cycle < s2.duty_cycle) dev_warn(pwmchip_parent(chip), ".apply is supposed to round down duty_cycle (requested: %llu/%llu, applied: %llu/%llu)\n", state->duty_cycle, state->period, s2.duty_cycle, s2.period); - if (!state->enabled && s2.enabled && s2.duty_cycle > 0) - dev_warn(pwmchip_parent(chip), - "requested disabled, but yielded enabled with duty > 0\n"); - /* reapply the state that the driver reported being configured. */ err = chip->ops->apply(chip, pwm, &s1); trace_pwm_apply(pwm, &s1, err); From b871d093f1caebeb39b12136a33e7529b8fdea50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 8 Jul 2025 19:18:01 +0200 Subject: [PATCH 1647/3206] pwm: Check actual period and duty_cycle for ignored polarity test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a lowlevel driver configures the wrong polarity, that might (historically) be OK if the emitted signal has a 100% relative duty_cycle, as that just corresponds to rounding the duty_cycle down to 0, which is an allowed thing for a lowlevel driver to do.
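The equivalence behind the relaxed check, with illustrative numbers:

/*
 * A constant signal can be described two ways:
 *
 *   normal polarity,   duty_cycle = period (100%)
 *   inversed polarity, duty_cycle = 0      (  0%)
 *
 * A driver that cannot invert may emit the first for an inversed
 * request; that is just the allowed rounding of duty_cycle down
 * to 0. Testing the actual values (s2.duty_cycle < s2.period)
 * instead of the requested ones keeps this legal rounding from
 * being flagged as ".apply ignored .polarity".
 */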
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/bc511c0250ea2f6390e4209ab1ea9c08a3c18612.1751994988.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index d5d2dfbe4adecf..2570ad6a7f59d0 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -536,7 +536,7 @@ static void pwm_apply_debug(struct pwm_device *pwm, } if (s2.polarity != state->polarity && - state->duty_cycle < state->period) + s2.duty_cycle < s2.period) dev_warn(pwmchip_parent(chip), ".apply ignored .polarity\n"); if (last->polarity == state->polarity && From e7c9b66b106989aeb17b167f5bbea9a108d26c0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 17 Jul 2025 17:11:16 +0200 Subject: [PATCH 1648/3206] pwm: Provide a gpio device for waveform drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A PWM is a more general concept than an output-only GPIO. When using duty_length = period_length the PWM looks like an active GPIO, with duty_length = 0 like an inactive GPIO. With the waveform abstraction there is enough control over the configuration to ensure that PWMs that cannot generate a constant signal at both levels error out. The pwm-pca9685 driver already provides a gpio chip. When this driver is converted to the waveform callbacks, the gpio part can just be dropped. Signed-off-by: Uwe Kleine-König Reviewed-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20250717151117.1828585-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/Kconfig | 9 ++++++ drivers/pwm/core.c | 72 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/pwm.h | 3 ++ 3 files changed, 84 insertions(+) diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index f00ce973dddf65..2aa24608a8dfca 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -38,6 +38,15 @@ config PWM_DEBUG It is expected to introduce some runtime overhead and diagnostic output to the kernel log, so only enable while working on a driver. +config PWM_PROVIDE_GPIO + bool "Provide a GPIO chip for each PWM chip" + depends on GPIOLIB + help + Most PWMs can emit both a constant active high and a constant active + low signal and so they can be used as GPIO. Say Y here to let each + PWM chip provide a GPIO chip and so be easily plugged into consumers + that know how to handle GPIOs but not PWMs. 
+ config PWM_AB8500 tristate "AB8500 PWM support" depends on AB8500_CORE && ARCH_U8500 diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 2570ad6a7f59d0..ea2ccf42e81441 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -2391,6 +2391,51 @@ static const struct file_operations pwm_cdev_fileops = { static dev_t pwm_devt; +static int pwm_gpio_request(struct gpio_chip *gc, unsigned int offset) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + struct pwm_device *pwm; + + pwm = pwm_request_from_chip(chip, offset, "pwm-gpio"); + if (IS_ERR(pwm)) + return PTR_ERR(pwm); + + return 0; +} + +static void pwm_gpio_free(struct gpio_chip *gc, unsigned int offset) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + + pwm_put(&chip->pwms[offset]); +} + +static int pwm_gpio_get_direction(struct gpio_chip *gc, unsigned int offset) +{ + return GPIO_LINE_DIRECTION_OUT; +} + +static int pwm_gpio_set(struct gpio_chip *gc, unsigned int offset, int value) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + struct pwm_device *pwm = &chip->pwms[offset]; + int ret; + struct pwm_waveform wf = { + .period_length_ns = 1, + }; + + ret = pwm_round_waveform_might_sleep(pwm, &wf); + if (ret < 0) + return ret; + + if (value) + wf.duty_length_ns = wf.period_length_ns; + else + wf.duty_length_ns = 0; + + return pwm_set_waveform_might_sleep(pwm, &wf, true); +} + /** * __pwmchip_add() - register a new PWM chip * @chip: the PWM chip to add @@ -2457,9 +2502,33 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner) if (ret) goto err_device_add; + if (IS_ENABLED(CONFIG_PWM_PROVIDE_GPIO) && chip->ops->write_waveform) { + struct device *parent = pwmchip_parent(chip); + + chip->gpio = (typeof(chip->gpio)){ + .label = dev_name(parent), + .parent = parent, + .request = pwm_gpio_request, + .free = pwm_gpio_free, + .get_direction = pwm_gpio_get_direction, + .set = pwm_gpio_set, + .base = -1, + .ngpio = chip->npwm, + .can_sleep = true, + }; + + ret = gpiochip_add_data(&chip->gpio, chip); + if (ret) + goto err_gpiochip_add; + } + return 0; +err_gpiochip_add: + + cdev_device_del(&chip->cdev, &chip->dev); err_device_add: + scoped_guard(pwmchip, chip) chip->operational = false; @@ -2480,6 +2549,9 @@ EXPORT_SYMBOL_GPL(__pwmchip_add); */ void pwmchip_remove(struct pwm_chip *chip) { + if (IS_ENABLED(CONFIG_PWM_PROVIDE_GPIO) && chip->ops->write_waveform) + gpiochip_remove(&chip->gpio); + pwmchip_sysfs_unexport(chip); scoped_guard(mutex, &pwm_lock) { diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 8cafc483db53ad..549ac4aaad59ba 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -321,6 +322,7 @@ struct pwm_ops { * @npwm: number of PWMs controlled by this chip * @of_xlate: request a PWM device given a device tree PWM specifier * @atomic: can the driver's ->apply() be called in atomic context + * @gpio: &struct gpio_chip to operate this PWM chip's lines as GPO * @uses_pwmchip_alloc: signals if pwmchip_allow was used to allocate this chip * @operational: signals if the chip can be used (or is already deregistered) * @nonatomic_lock: mutex for nonatomic chips @@ -340,6 +342,7 @@ struct pwm_chip { bool atomic; /* only used internally by the PWM framework */ + struct gpio_chip gpio; bool uses_pwmchip_alloc; bool operational; union { From a2f77ae4778d1eef2d273683ccc09fa971111c8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 30 Jul 2025 10:02:20 +0200 Subject: [PATCH 1649/3206] pwm: tiecap: 
Document behaviour of hardware disable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to David Lechner[1], disabling a tiecap PWM makes the PWM pin an input. The reported problem is fixed in commit deaeeda2051f ("backlight: pwm_bl: Don't rely on a disabled PWM emiting inactive state"). Document the behaviour in the driver for future reference. [1] https://lore.kernel.org/linux-pwm/39a472c0-ba24-de7b-8783-a16a71b172cd@lechnology.com Signed-off-by: Uwe Kleine-König Reviewed-by: David Lechner Link: https://lore.kernel.org/r/20250730080219.183181-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiecap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c index d91b2bdc88fce2..67cc5e8bdb0ef8 100644 --- a/drivers/pwm/pwm-tiecap.c +++ b/drivers/pwm/pwm-tiecap.c @@ -3,6 +3,10 @@ * ECAP PWM driver * * Copyright (C) 2012 Texas Instruments, Inc. - https://www.ti.com/ + * + * Hardware properties: + * - On disable the PWM pin becomes an input, so the behaviour depends on + * external wiring. */ #include From c95ab56a7ad6e6ae2be93591287fd9b25027fe38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 25 Jul 2025 17:45:05 +0200 Subject: [PATCH 1650/3206] pwm: mediatek: Simplify representation of channel offsets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The general register layout contains some per-chip registers starting at offset 0 and then, at a higher address, n nearly identical and equidistant blocks for the registers of the n channels. This allows representing the offsets of per-channel registers as $base + i * $width instead of listing all (or too many) offsets explicitly in an array. So for a small additional effort in pwm_mediatek_writel() the three arrays with the channel offsets can be dropped.
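For example, with the v2 parameters introduced below the computation reproduces the old lookup table; a sketch:

/*
 * chanreg_base = 0x80 and chanreg_width = 0x40 yield 0x80, 0xc0,
 * 0x100, ..., 0x240 for channels 0..7 -- exactly the old
 * mtk_pwm_reg_offset_v2 entries.
 */
static inline void __iomem *pwm_mediatek_chan_reg(void __iomem *regs,
		unsigned int base, unsigned int width,
		unsigned int num, unsigned int offset)
{
	return regs + base + num * width + offset;
}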
The size changes according to bloat-o-meter are: add/remove: 0/3 grow/shrink: 1/0 up/down: 12/-96 (-84) Function old new delta pwm_mediatek_apply 696 708 +12 mtk_pwm_reg_offset_v3 32 - -32 mtk_pwm_reg_offset_v2 32 - -32 mtk_pwm_reg_offset_v1 32 - -32 Total: Before=5347, After=5263, chg -1.57% Signed-off-by: Uwe Kleine-König Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250725154506.2610172-11-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-mediatek.c | 58 ++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index e4b595fc5a5e04..c4169ae614bcbf 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -38,7 +38,8 @@ struct pwm_mediatek_of_data { unsigned int num_pwms; bool pwm45_fixup; u16 pwm_ck_26m_sel_reg; - const unsigned int *reg_offset; + unsigned int chanreg_base; + unsigned int chanreg_width; }; /** @@ -57,19 +58,6 @@ struct pwm_mediatek_chip { const struct pwm_mediatek_of_data *soc; }; -static const unsigned int mtk_pwm_reg_offset_v1[] = { - 0x0010, 0x0050, 0x0090, 0x00d0, 0x0110, 0x0150, 0x0190, 0x0220 -}; - -static const unsigned int mtk_pwm_reg_offset_v2[] = { - 0x0080, 0x00c0, 0x0100, 0x0140, 0x0180, 0x01c0, 0x0200, 0x0240 -}; - -/* PWM IP Version 3.0.2 */ -static const unsigned int mtk_pwm_reg_offset_v3[] = { - 0x0100, 0x0200, 0x0300, 0x0400, 0x0500, 0x0600, 0x0700, 0x0800 -}; - static inline struct pwm_mediatek_chip * to_pwm_mediatek_chip(struct pwm_chip *chip) { @@ -118,7 +106,8 @@ static inline void pwm_mediatek_writel(struct pwm_mediatek_chip *chip, unsigned int num, unsigned int offset, u32 value) { - writel(value, chip->regs + chip->soc->reg_offset[num] + offset); + writel(value, chip->regs + chip->soc->chanreg_base + + num * chip->soc->chanreg_width + offset); } static void pwm_mediatek_enable(struct pwm_chip *chip, struct pwm_device *pwm) @@ -303,86 +292,99 @@ static int pwm_mediatek_probe(struct platform_device *pdev) static const struct pwm_mediatek_of_data mt2712_pwm_data = { .num_pwms = 8, .pwm45_fixup = false, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt6795_pwm_data = { .num_pwms = 7, .pwm45_fixup = false, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7622_pwm_data = { .num_pwms = 6, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7623_pwm_data = { .num_pwms = 5, .pwm45_fixup = true, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7628_pwm_data = { .num_pwms = 4, .pwm45_fixup = true, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7629_pwm_data = { .num_pwms = 1, .pwm45_fixup = false, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7981_pwm_data = { .num_pwms = 3, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v2, + .chanreg_base = 0x80, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7986_pwm_data = { .num_pwms = 2, .pwm45_fixup = false, 
.pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7988_pwm_data = { .num_pwms = 8, .pwm45_fixup = false, - .reg_offset = mtk_pwm_reg_offset_v2, + .chanreg_base = 0x80, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt8183_pwm_data = { .num_pwms = 4, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt8365_pwm_data = { .num_pwms = 3, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt8516_pwm_data = { .num_pwms = 5, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt6991_pwm_data = { .num_pwms = 4, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL_V3, - .reg_offset = mtk_pwm_reg_offset_v3, + .chanreg_base = 0x100, + .chanreg_width = 0x100, }; static const struct of_device_id pwm_mediatek_of_match[] = { From 88863c9d81bb043bd02f70be6cb629494cac4225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 25 Jul 2025 17:45:06 +0200 Subject: [PATCH 1651/3206] pwm: mediatek: Introduce and use a few more register defines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using a magic constant for bound checking, derive the numbers from appropriate register defines. Signed-off-by: Uwe Kleine-König Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250725154506.2610172-12-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-mediatek.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index c4169ae614bcbf..c592ff9b7ed9fd 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -7,6 +7,7 @@ * */ +#include #include #include #include @@ -21,19 +22,19 @@ /* PWM registers and bits definitions */ #define PWMCON 0x00 +#define PWMCON_CLKDIV GENMASK(2, 0) #define PWMHDUR 0x04 #define PWMLDUR 0x08 #define PWMGDUR 0x0c #define PWMWAVENUM 0x28 #define PWMDWIDTH 0x2c +#define PWMDWIDTH_PERIOD GENMASK(12, 0) #define PWM45DWIDTH_FIXUP 0x30 #define PWMTHRES 0x30 #define PWM45THRES_FIXUP 0x34 #define PWM_CK_26M_SEL_V3 0x74 #define PWM_CK_26M_SEL 0x210 -#define PWM_CLK_DIV_MAX 7 - struct pwm_mediatek_of_data { unsigned int num_pwms; bool pwm45_fixup; @@ -162,14 +163,14 @@ static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, if (!cnt_period) return -EINVAL; - while (cnt_period > 8192) { + while (cnt_period - 1 > FIELD_MAX(PWMDWIDTH_PERIOD)) { resolution *= 2; clkdiv++; cnt_period = DIV_ROUND_CLOSEST_ULL((u64)period_ns * 1000, resolution); } - if (clkdiv > PWM_CLK_DIV_MAX) { + if (clkdiv > FIELD_MAX(PWMCON_CLKDIV)) { dev_err(pwmchip_parent(chip), "period of %d ns not supported\n", period_ns); ret = -EINVAL; goto out; From f43e1280731c2a6bbd2d9517fd6b726d6ebe6641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 25 Jul 2025 17:45:07 +0200 Subject: [PATCH 1652/3206] pwm: mediatek: Rework parameters for clk helper function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Convert pwm_mediatek_clk_enable() and pwm_mediatek_clk_disable() to take lower level parameters. This enables these functions to be used in the next commit when there is no valid pwm_chip and pwm_device yet. Signed-off-by: Uwe Kleine-König Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250725154506.2610172-13-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-mediatek.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index c592ff9b7ed9fd..7fe003bcc74d7b 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -65,10 +65,9 @@ to_pwm_mediatek_chip(struct pwm_chip *chip) return pwmchip_get_drvdata(chip); } -static int pwm_mediatek_clk_enable(struct pwm_chip *chip, - struct pwm_device *pwm) +static int pwm_mediatek_clk_enable(struct pwm_mediatek_chip *pc, + unsigned int hwpwm) { - struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); int ret; ret = clk_prepare_enable(pc->clk_top); @@ -79,7 +78,7 @@ static int pwm_mediatek_clk_enable(struct pwm_chip *chip, if (ret < 0) goto disable_clk_top; - ret = clk_prepare_enable(pc->clk_pwms[pwm->hwpwm]); + ret = clk_prepare_enable(pc->clk_pwms[hwpwm]); if (ret < 0) goto disable_clk_main; @@ -93,12 +92,10 @@ static int pwm_mediatek_clk_enable(struct pwm_chip *chip, return ret; } -static void pwm_mediatek_clk_disable(struct pwm_chip *chip, - struct pwm_device *pwm) +static void pwm_mediatek_clk_disable(struct pwm_mediatek_chip *pc, + unsigned int hwpwm) { - struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); - - clk_disable_unprepare(pc->clk_pwms[pwm->hwpwm]); + clk_disable_unprepare(pc->clk_pwms[hwpwm]); clk_disable_unprepare(pc->clk_main); clk_disable_unprepare(pc->clk_top); } @@ -141,7 +138,7 @@ static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, u64 resolution; int ret; - ret = pwm_mediatek_clk_enable(chip, pwm); + ret = pwm_mediatek_clk_enable(pc, pwm->hwpwm); if (ret < 0) return ret; @@ -198,7 +195,7 @@ static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, } out: - pwm_mediatek_clk_disable(chip, pwm); + pwm_mediatek_clk_disable(pc, pwm->hwpwm); return ret; } @@ -206,6 +203,7 @@ static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, static int pwm_mediatek_apply(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_state *state) { + struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); int err; if (state->polarity != PWM_POLARITY_NORMAL) @@ -214,7 +212,7 @@ static int pwm_mediatek_apply(struct pwm_chip *chip, struct pwm_device *pwm, if (!state->enabled) { if (pwm->state.enabled) { pwm_mediatek_disable(chip, pwm); - pwm_mediatek_clk_disable(chip, pwm); + pwm_mediatek_clk_disable(pc, pwm->hwpwm); } return 0; @@ -225,7 +223,7 @@ static int pwm_mediatek_apply(struct pwm_chip *chip, struct pwm_device *pwm, return err; if (!pwm->state.enabled) - err = pwm_mediatek_clk_enable(chip, pwm); + err = pwm_mediatek_clk_enable(pc, pwm->hwpwm); return err; } From a911f15745fd4a93381b52acea3083e7e9fd135a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 25 Jul 2025 17:45:08 +0200 Subject: [PATCH 1653/3206] pwm: mediatek: Initialize clks when the hardware is enabled at probe time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a PWM is already configured by the bootloader (e.g. 
to power a backlight), the clk enable count must be increased to keep clock usage balanced. So check which PWMs are enabled during probe and enable the respective clocks. Signed-off-by: Uwe Kleine-König Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250725154506.2610172-14-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-mediatek.c | 47 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 7fe003bcc74d7b..faa0205d4a0dc5 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -232,6 +232,49 @@ static const struct pwm_ops pwm_mediatek_ops = { .apply = pwm_mediatek_apply, }; +static int pwm_mediatek_init_used_clks(struct pwm_mediatek_chip *pc) +{ + const struct pwm_mediatek_of_data *soc = pc->soc; + unsigned int hwpwm; + u32 enabled, handled = 0; + int ret; + + ret = clk_prepare_enable(pc->clk_top); + if (ret) + return ret; + + ret = clk_prepare_enable(pc->clk_main); + if (ret) + goto err_enable_main; + + enabled = readl(pc->regs) & GENMASK(soc->num_pwms - 1, 0); + + while (enabled & ~handled) { + hwpwm = ilog2(enabled & ~handled); + + ret = pwm_mediatek_clk_enable(pc, hwpwm); + if (ret) { + while (handled) { + hwpwm = ilog2(handled); + + pwm_mediatek_clk_disable(pc, hwpwm); + handled &= ~BIT(hwpwm); + } + + break; + } + + handled |= BIT(hwpwm); + } + + clk_disable_unprepare(pc->clk_main); +err_enable_main: + + clk_disable_unprepare(pc->clk_top); + + return ret; +} + static int pwm_mediatek_probe(struct platform_device *pdev) { struct pwm_chip *chip; @@ -279,6 +322,10 @@ static int pwm_mediatek_probe(struct platform_device *pdev) "Failed to get %s clock\n", name); } + ret = pwm_mediatek_init_used_clks(pc); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to initialize used clocks\n"); + chip->ops = &pwm_mediatek_ops; ret = devm_pwmchip_add(&pdev->dev, chip); From edd6a37e06f381af9a05c813a822ac8528da0fca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 25 Jul 2025 17:45:09 +0200 Subject: [PATCH 1654/3206] pwm: mediatek: Implement .get_state() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The registers can be read out just fine on an MT8365. On the assumption that this works on all supported devices, a .get_state() callback can be implemented. This enables consumers to make use of pwm_get_state_hw() and improves the usefulness of /sys/kernel/debug/pwm.
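As a worked example (illustrative numbers, not taken from a datasheet): with a 26 MHz channel clock, PWMCON_CLKDIV = 1 and a PWMDWIDTH value of 1300, the callback computes

	period = 1300 * NSEC_PER_SEC * 2^1 / 26000000 = 100000 ns

i.e. a 10 kHz output, matching the DIV_ROUND_UP_ULL() expression in the diff below.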
Signed-off-by: Uwe Kleine-König Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250725154506.2610172-15-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-mediatek.c | 70 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index faa0205d4a0dc5..2a5323e9fa50c8 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -31,6 +31,7 @@ #define PWMDWIDTH_PERIOD GENMASK(12, 0) #define PWM45DWIDTH_FIXUP 0x30 #define PWMTHRES 0x30 +#define PWMTHRES_DUTY GENMASK(12, 0) #define PWM45THRES_FIXUP 0x34 #define PWM_CK_26M_SEL_V3 0x74 #define PWM_CK_26M_SEL 0x210 @@ -108,6 +109,13 @@ static inline void pwm_mediatek_writel(struct pwm_mediatek_chip *chip, num * chip->soc->chanreg_width + offset); } +static inline u32 pwm_mediatek_readl(struct pwm_mediatek_chip *chip, + unsigned int num, unsigned int offset) +{ + return readl(chip->regs + chip->soc->chanreg_base + + num * chip->soc->chanreg_width + offset); +} + static void pwm_mediatek_enable(struct pwm_chip *chip, struct pwm_device *pwm) { struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); @@ -228,8 +236,70 @@ static int pwm_mediatek_apply(struct pwm_chip *chip, struct pwm_device *pwm, return err; } +static int pwm_mediatek_get_state(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); + int ret; + u32 enable; + u32 reg_width = PWMDWIDTH, reg_thres = PWMTHRES; + + if (pc->soc->pwm45_fixup && pwm->hwpwm > 2) { + /* + * PWM[4,5] has distinct offset for PWMDWIDTH and PWMTHRES + * from the other PWMs on MT7623. + */ + reg_width = PWM45DWIDTH_FIXUP; + reg_thres = PWM45THRES_FIXUP; + } + + ret = pwm_mediatek_clk_enable(pc, pwm->hwpwm); + if (ret < 0) + return ret; + + enable = readl(pc->regs); + if (enable & BIT(pwm->hwpwm)) { + u32 clkdiv, cnt_period, cnt_duty; + unsigned long clk_rate; + + clk_rate = clk_get_rate(pc->clk_pwms[pwm->hwpwm]); + if (!clk_rate) { + ret = -EINVAL; + goto out; + } + + state->enabled = true; + state->polarity = PWM_POLARITY_NORMAL; + + clkdiv = FIELD_GET(PWMCON_CLKDIV, + pwm_mediatek_readl(pc, pwm->hwpwm, PWMCON)); + cnt_period = FIELD_GET(PWMDWIDTH_PERIOD, + pwm_mediatek_readl(pc, pwm->hwpwm, reg_width)); + cnt_duty = FIELD_GET(PWMTHRES_DUTY, + pwm_mediatek_readl(pc, pwm->hwpwm, reg_thres)); + + /* + * cnt_period is a 13 bit value, NSEC_PER_SEC is 30 bits wide + * and clkdiv is less than 8, so the multiplication doesn't + * overflow an u64. + */ + state->period = + DIV_ROUND_UP_ULL((u64)cnt_period * NSEC_PER_SEC << clkdiv, clk_rate); + state->duty_cycle = + DIV_ROUND_UP_ULL((u64)cnt_duty * NSEC_PER_SEC << clkdiv, clk_rate); + } else { + state->enabled = false; + } + +out: + pwm_mediatek_clk_disable(pc, pwm->hwpwm); + + return ret; +} + static const struct pwm_ops pwm_mediatek_ops = { .apply = pwm_mediatek_apply, + .get_state = pwm_mediatek_get_state, }; static int pwm_mediatek_init_used_clks(struct pwm_mediatek_chip *pc) From 849b064c16977202298ac411f80c83ea047fe466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 25 Jul 2025 17:45:10 +0200 Subject: [PATCH 1655/3206] pwm: mediatek: Fix various issues in the .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit duty_cycle and period were silently cast from u64 to int losing relevant bits. 
Dividing by the result of a division (resolution) loses precision. clkdiv was determined using a loop while it can be done without one. Also, too-low period values were not caught. Address all these issues. Handling period and duty_cycle being u64 now requires a bit more care to prevent overflows, so mul_u64_u64_div_u64() is used. The changes implemented here also align the chosen hardware settings to match the usual PWM rules (i.e. round down instead of round-nearest), and so .apply() also matches .get_state(), silencing several warnings with PWM_DEBUG=y. While this probably doesn't result in problems, this aspect makes this change, though it might be considered a fix, unsuitable for backporting. Signed-off-by: Uwe Kleine-König Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250725154506.2610172-16-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-mediatek.c | 67 +++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 2a5323e9fa50c8..434ddc57f5dc3e 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -137,13 +137,13 @@ static void pwm_mediatek_disable(struct pwm_chip *chip, struct pwm_device *pwm) } static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) + u64 duty_ns, u64 period_ns) { struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); - u32 clkdiv = 0, cnt_period, cnt_duty, reg_width = PWMDWIDTH, - reg_thres = PWMTHRES; + u32 clkdiv, enable; + u32 reg_width = PWMDWIDTH, reg_thres = PWMTHRES; + u64 cnt_period, cnt_duty; unsigned long clk_rate; - u64 resolution; int ret; ret = pwm_mediatek_clk_enable(pc, pwm->hwpwm); @@ -151,7 +151,11 @@ static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, return ret; clk_rate = clk_get_rate(pc->clk_pwms[pwm->hwpwm]); - if (!clk_rate) { + /* + * With the clk running with not more than 1 GHz the calculations below + * won't overflow + */ + if (!clk_rate || clk_rate > 1000000000) { ret = -EINVAL; goto out; } @@ -160,27 +164,40 @@ static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, if (pc->soc->pwm_ck_26m_sel_reg) writel(0, pc->regs + pc->soc->pwm_ck_26m_sel_reg); - /* Using resolution in picosecond gets accuracy higher */ - resolution = (u64)NSEC_PER_SEC * 1000; - do_div(resolution, clk_rate); - - cnt_period = DIV_ROUND_CLOSEST_ULL((u64)period_ns * 1000, resolution); - if (!cnt_period) - return -EINVAL; + cnt_period = mul_u64_u64_div_u64(period_ns, clk_rate, NSEC_PER_SEC); + if (cnt_period == 0) { + ret = -ERANGE; + goto out; + } - while (cnt_period - 1 > FIELD_MAX(PWMDWIDTH_PERIOD)) { - resolution *= 2; - clkdiv++; - cnt_period = DIV_ROUND_CLOSEST_ULL((u64)period_ns * 1000, - resolution); + if (cnt_period > FIELD_MAX(PWMDWIDTH_PERIOD) + 1) { + if (cnt_period >= ((FIELD_MAX(PWMDWIDTH_PERIOD) + 1) << FIELD_MAX(PWMCON_CLKDIV))) { + clkdiv = FIELD_MAX(PWMCON_CLKDIV); + cnt_period = FIELD_MAX(PWMDWIDTH_PERIOD) + 1; + } else { + clkdiv = ilog2(cnt_period) - ilog2(FIELD_MAX(PWMDWIDTH_PERIOD)); + cnt_period >>= clkdiv; + } + } else { + clkdiv = 0; } - if (clkdiv > FIELD_MAX(PWMCON_CLKDIV)) { - dev_err(pwmchip_parent(chip), "period of %d ns not supported\n", period_ns); - ret = -EINVAL; - goto out; + cnt_duty = mul_u64_u64_div_u64(duty_ns, clk_rate, NSEC_PER_SEC) >> clkdiv; + if (cnt_duty > cnt_period) + cnt_duty = cnt_period; + + if (cnt_duty) { + cnt_duty -= 1; +
enable = BIT(pwm->hwpwm); + } else { + enable = 0; } + cnt_period -= 1; + + dev_dbg(&chip->dev, "pwm#%u: %lld/%lld @%lu -> CON: %x, PERIOD: %llx, DUTY: %llx\n", + pwm->hwpwm, duty_ns, period_ns, clk_rate, clkdiv, cnt_period, cnt_duty); + if (pc->soc->pwm45_fixup && pwm->hwpwm > 2) { /* * PWM[4,5] has distinct offset for PWMDWIDTH and PWMTHRES @@ -190,13 +207,11 @@ static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, reg_thres = PWM45THRES_FIXUP; } - cnt_duty = DIV_ROUND_CLOSEST_ULL((u64)duty_ns * 1000, resolution); - pwm_mediatek_writel(pc, pwm->hwpwm, PWMCON, BIT(15) | clkdiv); - pwm_mediatek_writel(pc, pwm->hwpwm, reg_width, cnt_period - 1); + pwm_mediatek_writel(pc, pwm->hwpwm, reg_width, cnt_period); - if (cnt_duty) { - pwm_mediatek_writel(pc, pwm->hwpwm, reg_thres, cnt_duty - 1); + if (enable) { + pwm_mediatek_writel(pc, pwm->hwpwm, reg_thres, cnt_duty); pwm_mediatek_enable(chip, pwm); } else { pwm_mediatek_disable(chip, pwm); From ed5902a2464834656f94f1c23fa61f99ea38f328 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 25 Jul 2025 17:45:11 +0200 Subject: [PATCH 1656/3206] pwm: mediatek: Lock and cache clock rate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This simplifies error handling and reduces the amount of clk_get_rate() calls. While touching the clk handling also allocate the clock array as part of driver data and lock the clock rate to ensure that the output doesn't change unexpectedly. Signed-off-by: Uwe Kleine-König Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250725154506.2610172-17-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-mediatek.c | 63 +++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 434ddc57f5dc3e..4291072a13a7f6 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -49,15 +49,18 @@ struct pwm_mediatek_of_data { * @regs: base address of PWM chip * @clk_top: the top clock generator * @clk_main: the clock used by PWM core - * @clk_pwms: the clock used by each PWM channel * @soc: pointer to chip's platform data + * @clk_pwms: the clock and clkrate used by each PWM channel */ struct pwm_mediatek_chip { void __iomem *regs; struct clk *clk_top; struct clk *clk_main; - struct clk **clk_pwms; const struct pwm_mediatek_of_data *soc; + struct { + struct clk *clk; + unsigned long rate; + } clk_pwms[]; }; static inline struct pwm_mediatek_chip * @@ -79,12 +82,28 @@ static int pwm_mediatek_clk_enable(struct pwm_mediatek_chip *pc, if (ret < 0) goto disable_clk_top; - ret = clk_prepare_enable(pc->clk_pwms[hwpwm]); + ret = clk_prepare_enable(pc->clk_pwms[hwpwm].clk); if (ret < 0) goto disable_clk_main; + if (!pc->clk_pwms[hwpwm].rate) { + pc->clk_pwms[hwpwm].rate = clk_get_rate(pc->clk_pwms[hwpwm].clk); + + /* + * With the clk running with not more than 1 GHz the + * calculations in .apply() won't overflow. 
+ */ + if (!pc->clk_pwms[hwpwm].rate || + pc->clk_pwms[hwpwm].rate > 1000000000) { + ret = -EINVAL; + goto disable_clk_hwpwm; + } + } + return 0; +disable_clk_hwpwm: + clk_disable_unprepare(pc->clk_pwms[hwpwm].clk); disable_clk_main: clk_disable_unprepare(pc->clk_main); disable_clk_top: @@ -96,7 +115,7 @@ static int pwm_mediatek_clk_enable(struct pwm_mediatek_chip *pc, static void pwm_mediatek_clk_disable(struct pwm_mediatek_chip *pc, unsigned int hwpwm) { - clk_disable_unprepare(pc->clk_pwms[hwpwm]); + clk_disable_unprepare(pc->clk_pwms[hwpwm].clk); clk_disable_unprepare(pc->clk_main); clk_disable_unprepare(pc->clk_top); } @@ -150,15 +169,7 @@ static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, if (ret < 0) return ret; - clk_rate = clk_get_rate(pc->clk_pwms[pwm->hwpwm]); - /* - * With the clk running with not more than 1 GHz the calculations below - * won't overflow - */ - if (!clk_rate || clk_rate > 1000000000) { - ret = -EINVAL; - goto out; - } + clk_rate = pc->clk_pwms[pwm->hwpwm].rate; /* Make sure we use the bus clock and not the 26MHz clock */ if (pc->soc->pwm_ck_26m_sel_reg) @@ -277,11 +288,7 @@ static int pwm_mediatek_get_state(struct pwm_chip *chip, struct pwm_device *pwm, u32 clkdiv, cnt_period, cnt_duty; unsigned long clk_rate; - clk_rate = clk_get_rate(pc->clk_pwms[pwm->hwpwm]); - if (!clk_rate) { - ret = -EINVAL; - goto out; - } + clk_rate = pc->clk_pwms[pwm->hwpwm].rate; state->enabled = true; state->polarity = PWM_POLARITY_NORMAL; @@ -306,7 +313,6 @@ static int pwm_mediatek_get_state(struct pwm_chip *chip, struct pwm_device *pwm, state->enabled = false; } -out: pwm_mediatek_clk_disable(pc, pwm->hwpwm); return ret; @@ -370,7 +376,8 @@ static int pwm_mediatek_probe(struct platform_device *pdev) soc = of_device_get_match_data(&pdev->dev); - chip = devm_pwmchip_alloc(&pdev->dev, soc->num_pwms, sizeof(*pc)); + chip = devm_pwmchip_alloc(&pdev->dev, soc->num_pwms, + sizeof(*pc) + soc->num_pwms * sizeof(*pc->clk_pwms)); if (IS_ERR(chip)) return PTR_ERR(chip); pc = to_pwm_mediatek_chip(chip); @@ -381,11 +388,6 @@ static int pwm_mediatek_probe(struct platform_device *pdev) if (IS_ERR(pc->regs)) return PTR_ERR(pc->regs); - pc->clk_pwms = devm_kmalloc_array(&pdev->dev, soc->num_pwms, - sizeof(*pc->clk_pwms), GFP_KERNEL); - if (!pc->clk_pwms) - return -ENOMEM; - pc->clk_top = devm_clk_get(&pdev->dev, "top"); if (IS_ERR(pc->clk_top)) return dev_err_probe(&pdev->dev, PTR_ERR(pc->clk_top), @@ -401,10 +403,15 @@ static int pwm_mediatek_probe(struct platform_device *pdev) snprintf(name, sizeof(name), "pwm%d", i + 1); - pc->clk_pwms[i] = devm_clk_get(&pdev->dev, name); - if (IS_ERR(pc->clk_pwms[i])) - return dev_err_probe(&pdev->dev, PTR_ERR(pc->clk_pwms[i]), + pc->clk_pwms[i].clk = devm_clk_get(&pdev->dev, name); + if (IS_ERR(pc->clk_pwms[i].clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(pc->clk_pwms[i].clk), "Failed to get %s clock\n", name); + + ret = devm_clk_rate_exclusive_get(&pdev->dev, pc->clk_pwms[i].clk); + if (ret) + return dev_err_probe(&pdev->dev, ret, + "Failed to lock clock rate for %s\n", name); } ret = pwm_mediatek_init_used_clks(pc); From 3513752cfe6fc1cfb0d99b3b4c554eb56e8bf8a1 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 12 Aug 2025 22:00:35 +0200 Subject: [PATCH 1657/3206] dt-bindings: pwm: fsl,vf610-ftm-pwm: Add compatible for s32g2 and s32g3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The S32G2 and S32G3 have a FlexTimer (FTM) available which is the same as the one found on the Vybrid 
Family and the i.MX8. Add the compatibles in the bindings. Signed-off-by: Daniel Lezcano Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250812200036.3432917-2-daniel.lezcano@linaro.org Signed-off-by: Uwe Kleine-König --- .../devicetree/bindings/pwm/fsl,vf610-ftm-pwm.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pwm/fsl,vf610-ftm-pwm.yaml b/Documentation/devicetree/bindings/pwm/fsl,vf610-ftm-pwm.yaml index 7f9f72d95e7a32..c7a10180208e03 100644 --- a/Documentation/devicetree/bindings/pwm/fsl,vf610-ftm-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/fsl,vf610-ftm-pwm.yaml @@ -26,9 +26,14 @@ maintainers: properties: compatible: - enum: - - fsl,vf610-ftm-pwm - - fsl,imx8qm-ftm-pwm + oneOf: + - enum: + - fsl,vf610-ftm-pwm + - fsl,imx8qm-ftm-pwm + - nxp,s32g2-ftm-pwm + - items: + - const: nxp,s32g3-ftm-pwm + - const: nxp,s32g2-ftm-pwm reg: maxItems: 1 From d8af3812b1e8b3b02bdac2f74eda1463540edd61 Mon Sep 17 00:00:00 2001 From: Ghennadi Procopciuc Date: Tue, 12 Aug 2025 22:00:36 +0200 Subject: [PATCH 1658/3206] pwm: Add the S32G support in the Freescale FTM driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Automotive S32G2 and S32G3 platforms include two FTM timers for PWM. Each FTM has 6 PWM channels. The current Freescale FTM driver supports the i.MX8 and the Vybrid Family FTM IP. The FTM IP found on the S32G platforms is almost identical except for the number of channels and the register mapping. These changes allow the driver to deal with a different number of channels and to cope with the holes in the s32gx register memory map across suspend / resume. The fault register does not exist on the s32gx, and at resume time the whole mapping is written back, leading to a kernel crash: /* restore all registers from cache */ regcache_cache_only(fpc->regmap, false); regcache_sync(fpc->regmap); The regmap callbacks 'writeable_reg()' and 'readable_reg()' will skip the address corresponding to a register which is not present. Tested on an s32g274-rdb2 (J5 PWM pin output) with signal visualization on an oscilloscope. Signed-off-by: Ghennadi Procopciuc Co-developed-by: Daniel Lezcano Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250812200036.3432917-3-daniel.lezcano@linaro.org Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-fsl-ftm.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c index 6683931872fc74..35406b2e1925e8 100644 --- a/drivers/pwm/pwm-fsl-ftm.c +++ b/drivers/pwm/pwm-fsl-ftm.c @@ -3,6 +3,7 @@ * Freescale FlexTimer Module (FTM) PWM Driver * * Copyright 2012-2013 Freescale Semiconductor, Inc.
+ * Copyright 2020-2025 NXP */ #include @@ -30,6 +31,8 @@ enum fsl_pwm_clk { struct fsl_ftm_soc { bool has_enable_bits; + bool has_flt_reg; + unsigned int npwm; }; struct fsl_pwm_periodcfg { @@ -374,6 +377,20 @@ static bool fsl_pwm_volatile_reg(struct device *dev, unsigned int reg) return false; } +static bool fsl_pwm_is_reg(struct device *dev, unsigned int reg) +{ + struct pwm_chip *chip = dev_get_drvdata(dev); + struct fsl_pwm_chip *fpc = to_fsl_chip(chip); + + if (reg >= FTM_CSC(fpc->soc->npwm) && reg < FTM_CNTIN) + return false; + + if ((reg == FTM_FLTCTRL || reg == FTM_FLTPOL) && !fpc->soc->has_flt_reg) + return false; + + return true; +} + static const struct regmap_config fsl_pwm_regmap_config = { .reg_bits = 32, .reg_stride = 4, @@ -382,21 +399,24 @@ static const struct regmap_config fsl_pwm_regmap_config = { .max_register = FTM_PWMLOAD, .volatile_reg = fsl_pwm_volatile_reg, .cache_type = REGCACHE_FLAT, + .writeable_reg = fsl_pwm_is_reg, + .readable_reg = fsl_pwm_is_reg, }; static int fsl_pwm_probe(struct platform_device *pdev) { + const struct fsl_ftm_soc *soc = of_device_get_match_data(&pdev->dev); struct pwm_chip *chip; struct fsl_pwm_chip *fpc; void __iomem *base; int ret; - chip = devm_pwmchip_alloc(&pdev->dev, 8, sizeof(*fpc)); + chip = devm_pwmchip_alloc(&pdev->dev, soc->npwm, sizeof(*fpc)); if (IS_ERR(chip)) return PTR_ERR(chip); fpc = to_fsl_chip(chip); - fpc->soc = of_device_get_match_data(&pdev->dev); + fpc->soc = soc; base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(base)) @@ -512,15 +532,26 @@ static const struct dev_pm_ops fsl_pwm_pm_ops = { static const struct fsl_ftm_soc vf610_ftm_pwm = { .has_enable_bits = false, + .has_flt_reg = true, + .npwm = 8, }; static const struct fsl_ftm_soc imx8qm_ftm_pwm = { .has_enable_bits = true, + .has_flt_reg = true, + .npwm = 8, +}; + +static const struct fsl_ftm_soc s32g2_ftm_pwm = { + .has_enable_bits = true, + .has_flt_reg = false, + .npwm = 6, }; static const struct of_device_id fsl_pwm_dt_ids[] = { { .compatible = "fsl,vf610-ftm-pwm", .data = &vf610_ftm_pwm }, { .compatible = "fsl,imx8qm-ftm-pwm", .data = &imx8qm_ftm_pwm }, + { .compatible = "nxp,s32g2-ftm-pwm", .data = &s32g2_ftm_pwm }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, fsl_pwm_dt_ids); From ca478d8a4b6d342e7df95bcba842301027a3b490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 29 Jul 2025 12:36:00 +0200 Subject: [PATCH 1659/3206] pwm: pca9685: Don't disable hardware in .free() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's the responsibility of the consumer to disable the hardware before it's released. And there are use cases where it's beneficial to keep the PWM on, e.g. to keep a backlight on before kexec()ing into a new kernel. Even if it would be considered right to disable on pwm_put(), this should be done in the core and not each individual driver. So drop the hardware access in .free(). 
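The consumer-side pattern this expects is (a sketch, not part of the diff; "mypwm" is a placeholder):

	pwm_disable(mypwm);	/* explicit, only if the output really must stop */
	pwm_put(mypwm);		/* releasing no longer touches the hardware */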
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/1ee1a514aeb5f0effafa2d6ec91bc54130895cd9.1753784092.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-pca9685.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index 9ce75704a15f89..de5fe86b4f3351 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -497,7 +497,6 @@ static void pca9685_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) struct pca9685 *pca = to_pca(chip); mutex_lock(&pca->lock); - pca9685_pwm_set_duty(chip, pwm->hwpwm, 0); clear_bit(pwm->hwpwm, pca->pwms_enabled); mutex_unlock(&pca->lock); From de5855613263b426ee697dd30224322f2e634dec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 29 Jul 2025 12:36:01 +0200 Subject: [PATCH 1660/3206] pwm: pca9685: Use bulk write to atomicially update registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The output of a PWM channel is configured by four register values. Write them in a single i2c transaction to ensure glitch free updates. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/bfa8c0267c9ec059d0d77f146998d564654c75ca.1753784092.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-pca9685.c | 46 ++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index de5fe86b4f3351..b22c6da20ebc3b 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -61,6 +61,8 @@ #define MODE1_SUB2 BIT(2) #define MODE1_SUB1 BIT(3) #define MODE1_SLEEP BIT(4) +#define MODE1_AI BIT(5) + #define MODE2_INVRT BIT(4) #define MODE2_OUTDRV BIT(2) @@ -131,6 +133,19 @@ static int pca9685_write_reg(struct pwm_chip *chip, unsigned int reg, unsigned i return err; } +static int pca9685_write_4reg(struct pwm_chip *chip, unsigned int reg, u8 val[4]) +{ + struct pca9685 *pca = to_pca(chip); + struct device *dev = pwmchip_parent(chip); + int err; + + err = regmap_bulk_write(pca->regmap, reg, val, 4); + if (err) + dev_err(dev, "regmap_write to register 0x%x failed: %pe\n", reg, ERR_PTR(err)); + + return err; +} + /* Helper function to set the duty cycle ratio to duty/4096 (e.g. 
duty=2048 -> 50%) */ static void pca9685_pwm_set_duty(struct pwm_chip *chip, int channel, unsigned int duty) { @@ -143,12 +158,10 @@ static void pca9685_pwm_set_duty(struct pwm_chip *chip, int channel, unsigned in return; } else if (duty >= PCA9685_COUNTER_RANGE) { /* Set the full ON bit and clear the full OFF bit */ - pca9685_write_reg(chip, REG_ON_H(channel), LED_FULL); - pca9685_write_reg(chip, REG_OFF_H(channel), 0); + pca9685_write_4reg(chip, REG_ON_L(channel), (u8[4]){ 0, LED_FULL, 0, 0 }); return; } - if (pwm->state.usage_power && channel < PCA9685_MAXCHAN) { /* * If usage_power is set, the pca9685 driver will phase shift @@ -163,12 +176,9 @@ static void pca9685_pwm_set_duty(struct pwm_chip *chip, int channel, unsigned in off = (on + duty) % PCA9685_COUNTER_RANGE; - /* Set ON time (clears full ON bit) */ - pca9685_write_reg(chip, REG_ON_L(channel), on & 0xff); - pca9685_write_reg(chip, REG_ON_H(channel), (on >> 8) & 0xf); - /* Set OFF time (clears full OFF bit) */ - pca9685_write_reg(chip, REG_OFF_L(channel), off & 0xff); - pca9685_write_reg(chip, REG_OFF_H(channel), (off >> 8) & 0xf); + /* implicitly clear full ON and full OFF bit */ + pca9685_write_4reg(chip, REG_ON_L(channel), + (u8[4]){ on & 0xff, (on >> 8) & 0xf, off & 0xff, (off >> 8) & 0xf }); } static unsigned int pca9685_pwm_get_duty(struct pwm_chip *chip, int channel) @@ -543,9 +553,8 @@ static int pca9685_pwm_probe(struct i2c_client *client) mutex_init(&pca->lock); - ret = pca9685_read_reg(chip, PCA9685_MODE2, ®); - if (ret) - return ret; + /* clear MODE2_OCH */ + reg = 0; if (device_property_read_bool(&client->dev, "invert")) reg |= MODE2_INVRT; @@ -561,16 +570,19 @@ static int pca9685_pwm_probe(struct i2c_client *client) if (ret) return ret; - /* Disable all LED ALLCALL and SUBx addresses to avoid bus collisions */ + /* + * Disable all LED ALLCALL and SUBx addresses to avoid bus collisions, + * enable Auto-Increment. + */ pca9685_read_reg(chip, PCA9685_MODE1, ®); reg &= ~(MODE1_ALLCALL | MODE1_SUB1 | MODE1_SUB2 | MODE1_SUB3); + reg |= MODE1_AI; pca9685_write_reg(chip, PCA9685_MODE1, reg); /* Reset OFF/ON registers to POR default */ - pca9685_write_reg(chip, PCA9685_ALL_LED_OFF_L, 0); - pca9685_write_reg(chip, PCA9685_ALL_LED_OFF_H, LED_FULL); - pca9685_write_reg(chip, PCA9685_ALL_LED_ON_L, 0); - pca9685_write_reg(chip, PCA9685_ALL_LED_ON_H, LED_FULL); + ret = pca9685_write_4reg(chip, PCA9685_ALL_LED_ON_L, (u8[]){ 0, LED_FULL, 0, LED_FULL }); + if (ret < 0) + return dev_err_probe(&client->dev, ret, "Failed to reset ON/OFF registers\n"); chip->ops = &pca9685_pwm_ops; From 3d4c42172380d287f118a14458ea9b418bbebefc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 29 Jul 2025 12:36:02 +0200 Subject: [PATCH 1661/3206] pwm: pca9685: Make use of register caching in regmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This essentially only caches the PRESCALE register because the per channel registers are affected by the ALL configuration that is used by the virtual pwm #16. The PRESCALE register is read often so caching it saves quite some i2c transfers. 
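With PCA9685_PRESCALE cacheable, a read such as

	regmap_read(pca->regmap, PCA9685_PRESCALE, &prescale);

is served from the regmap cache after the first access instead of causing an i2c transfer (standard regmap behaviour for non-volatile registers).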
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/dc25361908ad1dd790f108599bc9dbcc752288a5.1753784092.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-pca9685.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index b22c6da20ebc3b..882fcb05175692 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -521,11 +521,36 @@ static const struct pwm_ops pca9685_pwm_ops = { .free = pca9685_pwm_free, }; +static bool pca9685_readable_reg(struct device *dev, unsigned int reg) +{ + /* The ALL_LED registers are readable but read as zero */ + return reg <= REG_OFF_H(15) || reg >= PCA9685_PRESCALE; +} + +static bool pca9685_writeable_reg(struct device *dev, unsigned int reg) +{ + return reg <= REG_OFF_H(15) || reg >= PCA9685_ALL_LED_ON_L; +} + +static bool pca9685_volatile_reg(struct device *dev, unsigned int reg) +{ + /* + * Writing to an ALL_LED register affects all LEDi registers, so they + * are not cachable. :-\ + */ + return reg < PCA9685_PRESCALE; +} + static const struct regmap_config pca9685_regmap_i2c_config = { .reg_bits = 8, .val_bits = 8, + + .readable_reg = pca9685_readable_reg, + .writeable_reg = pca9685_writeable_reg, + .volatile_reg = pca9685_volatile_reg, + .max_register = PCA9685_NUMREGS, - .cache_type = REGCACHE_NONE, + .cache_type = REGCACHE_MAPLE, }; static int pca9685_pwm_probe(struct i2c_client *client) From 42f18ae36f3f262683739fed5fff9342a6954914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 29 Jul 2025 12:36:03 +0200 Subject: [PATCH 1662/3206] pwm: pca9685: Drop GPIO support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functionality will be restored after the driver is converted to the waveform API as the pwm core optionally provides a gpio chip for all pwm chips that support the waveform API. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/d975376fce9640c90ddc868e3722adeb83fff279.1753784092.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-pca9685.c | 156 -------------------------------------- 1 file changed, 156 deletions(-) diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index 882fcb05175692..3f04defd3718bc 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -26,7 +26,6 @@ * that is enabled is allowed to change the prescale register. * PWM channels requested afterwards must use a period that results in the same * prescale setting as the one set by the first requested channel. - * GPIOs do not count as enabled PWMs as they are not using the prescaler. 
*/ #define PCA9685_MODE1 0x00 @@ -80,10 +79,6 @@ struct pca9685 { struct regmap *regmap; struct mutex lock; DECLARE_BITMAP(pwms_enabled, PCA9685_MAXCHAN + 1); -#if IS_ENABLED(CONFIG_GPIOLIB) - struct gpio_chip gpio; - DECLARE_BITMAP(pwms_inuse, PCA9685_MAXCHAN + 1); -#endif }; static inline struct pca9685 *to_pca(struct pwm_chip *chip) @@ -217,147 +212,6 @@ static unsigned int pca9685_pwm_get_duty(struct pwm_chip *chip, int channel) return (off - on) & (PCA9685_COUNTER_RANGE - 1); } -#if IS_ENABLED(CONFIG_GPIOLIB) -static bool pca9685_pwm_test_and_set_inuse(struct pca9685 *pca, int pwm_idx) -{ - bool is_inuse; - - mutex_lock(&pca->lock); - if (pwm_idx >= PCA9685_MAXCHAN) { - /* - * "All LEDs" channel: - * pretend already in use if any of the PWMs are requested - */ - if (!bitmap_empty(pca->pwms_inuse, PCA9685_MAXCHAN)) { - is_inuse = true; - goto out; - } - } else { - /* - * Regular channel: - * pretend already in use if the "all LEDs" channel is requested - */ - if (test_bit(PCA9685_MAXCHAN, pca->pwms_inuse)) { - is_inuse = true; - goto out; - } - } - is_inuse = test_and_set_bit(pwm_idx, pca->pwms_inuse); -out: - mutex_unlock(&pca->lock); - return is_inuse; -} - -static void pca9685_pwm_clear_inuse(struct pca9685 *pca, int pwm_idx) -{ - mutex_lock(&pca->lock); - clear_bit(pwm_idx, pca->pwms_inuse); - mutex_unlock(&pca->lock); -} - -static int pca9685_pwm_gpio_request(struct gpio_chip *gpio, unsigned int offset) -{ - struct pwm_chip *chip = gpiochip_get_data(gpio); - struct pca9685 *pca = to_pca(chip); - - if (pca9685_pwm_test_and_set_inuse(pca, offset)) - return -EBUSY; - pm_runtime_get_sync(pwmchip_parent(chip)); - return 0; -} - -static int pca9685_pwm_gpio_get(struct gpio_chip *gpio, unsigned int offset) -{ - struct pwm_chip *chip = gpiochip_get_data(gpio); - - return pca9685_pwm_get_duty(chip, offset) != 0; -} - -static int pca9685_pwm_gpio_set(struct gpio_chip *gpio, unsigned int offset, - int value) -{ - struct pwm_chip *chip = gpiochip_get_data(gpio); - - pca9685_pwm_set_duty(chip, offset, value ? PCA9685_COUNTER_RANGE : 0); - - return 0; -} - -static void pca9685_pwm_gpio_free(struct gpio_chip *gpio, unsigned int offset) -{ - struct pwm_chip *chip = gpiochip_get_data(gpio); - struct pca9685 *pca = to_pca(chip); - - pca9685_pwm_set_duty(chip, offset, 0); - pm_runtime_put(pwmchip_parent(chip)); - pca9685_pwm_clear_inuse(pca, offset); -} - -static int pca9685_pwm_gpio_get_direction(struct gpio_chip *chip, - unsigned int offset) -{ - /* Always out */ - return GPIO_LINE_DIRECTION_OUT; -} - -static int pca9685_pwm_gpio_direction_input(struct gpio_chip *gpio, - unsigned int offset) -{ - return -EINVAL; -} - -static int pca9685_pwm_gpio_direction_output(struct gpio_chip *gpio, - unsigned int offset, int value) -{ - pca9685_pwm_gpio_set(gpio, offset, value); - - return 0; -} - -/* - * The PCA9685 has a bit for turning the PWM output full off or on. Some - * boards like Intel Galileo actually uses these as normal GPIOs so we - * expose a GPIO chip here which can exclusively take over the underlying - * PWM channel. 
- */ -static int pca9685_pwm_gpio_probe(struct pwm_chip *chip) -{ - struct pca9685 *pca = to_pca(chip); - struct device *dev = pwmchip_parent(chip); - - pca->gpio.label = dev_name(dev); - pca->gpio.parent = dev; - pca->gpio.request = pca9685_pwm_gpio_request; - pca->gpio.free = pca9685_pwm_gpio_free; - pca->gpio.get_direction = pca9685_pwm_gpio_get_direction; - pca->gpio.direction_input = pca9685_pwm_gpio_direction_input; - pca->gpio.direction_output = pca9685_pwm_gpio_direction_output; - pca->gpio.get = pca9685_pwm_gpio_get; - pca->gpio.set = pca9685_pwm_gpio_set; - pca->gpio.base = -1; - pca->gpio.ngpio = PCA9685_MAXCHAN; - pca->gpio.can_sleep = true; - - return devm_gpiochip_add_data(dev, &pca->gpio, chip); -} -#else -static inline bool pca9685_pwm_test_and_set_inuse(struct pca9685 *pca, - int pwm_idx) -{ - return false; -} - -static inline void -pca9685_pwm_clear_inuse(struct pca9685 *pca, int pwm_idx) -{ -} - -static inline int pca9685_pwm_gpio_probe(struct pwm_chip *chip) -{ - return 0; -} -#endif - static void pca9685_set_sleep_mode(struct pwm_chip *chip, bool enable) { struct device *dev = pwmchip_parent(chip); @@ -487,9 +341,6 @@ static int pca9685_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) { struct pca9685 *pca = to_pca(chip); - if (pca9685_pwm_test_and_set_inuse(pca, pwm->hwpwm)) - return -EBUSY; - if (pwm->hwpwm < PCA9685_MAXCHAN) { /* PWMs - except the "all LEDs" channel - default to enabled */ mutex_lock(&pca->lock); @@ -511,7 +362,6 @@ static void pca9685_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) mutex_unlock(&pca->lock); pm_runtime_put(pwmchip_parent(chip)); - pca9685_pwm_clear_inuse(pca, pwm->hwpwm); } static const struct pwm_ops pca9685_pwm_ops = { @@ -615,12 +465,6 @@ static int pca9685_pwm_probe(struct i2c_client *client) if (ret < 0) return ret; - ret = pca9685_pwm_gpio_probe(chip); - if (ret < 0) { - pwmchip_remove(chip); - return ret; - } - pm_runtime_enable(&client->dev); if (pm_runtime_enabled(&client->dev)) { From ce1116446098e183720ac529217889c88b964e0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 29 Jul 2025 12:36:04 +0200 Subject: [PATCH 1663/3206] pwm: pca9586: Convert to waveform API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows to expose the duty_offset feature that the chip supports, and so also emit inverted polarity waveforms. The conversion from a waveform to hardware settings (and vice versa) is aligned to the usual rounding rules silencing warnings with PWM_DEBUG. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/1927d115ae6797858e6c4537971dacf1d563854f.1753784092.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-pca9685.c | 323 ++++++++++++++++++++------------------ 1 file changed, 173 insertions(+), 150 deletions(-) diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index 3f04defd3718bc..107bebec3546ed 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -49,7 +49,14 @@ #define PCA9685_PRESCALE_MAX 0xFF /* => min. frequency of 24 Hz */ #define PCA9685_COUNTER_RANGE 4096 -#define PCA9685_OSC_CLOCK_MHZ 25 /* Internal oscillator with 25 MHz */ +#define PCA9685_OSC_CLOCK_HZ 25000000 /* Internal oscillator with 25 MHz */ + +/* + * The time value of one counter tick. Note that NSEC_PER_SEC is an integer + * multiple of PCA9685_OSC_CLOCK_HZ, so there is no rounding involved and we're + * not loosing precision due to the early division. 
+ */ +#define PCA9685_QUANTUM_NS(_prescale) ((NSEC_PER_SEC / PCA9685_OSC_CLOCK_HZ) * (_prescale + 1)) #define PCA9685_NUMREGS 0xFF #define PCA9685_MAXCHAN 0x10 @@ -141,200 +148,213 @@ static int pca9685_write_4reg(struct pwm_chip *chip, unsigned int reg, u8 val[4] return err; } -/* Helper function to set the duty cycle ratio to duty/4096 (e.g. duty=2048 -> 50%) */ -static void pca9685_pwm_set_duty(struct pwm_chip *chip, int channel, unsigned int duty) +static int pca9685_set_sleep_mode(struct pwm_chip *chip, bool enable) { - struct pwm_device *pwm = &chip->pwms[channel]; - unsigned int on, off; - - if (duty == 0) { - /* Set the full OFF bit, which has the highest precedence */ - pca9685_write_reg(chip, REG_OFF_H(channel), LED_FULL); - return; - } else if (duty >= PCA9685_COUNTER_RANGE) { - /* Set the full ON bit and clear the full OFF bit */ - pca9685_write_4reg(chip, REG_ON_L(channel), (u8[4]){ 0, LED_FULL, 0, 0 }); - return; - } + struct pca9685 *pca = to_pca(chip); + int err; - if (pwm->state.usage_power && channel < PCA9685_MAXCHAN) { - /* - * If usage_power is set, the pca9685 driver will phase shift - * the individual channels relative to their channel number. - * This improves EMI because the enabled channels no longer - * turn on at the same time, while still maintaining the - * configured duty cycle / power output. - */ - on = channel * PCA9685_COUNTER_RANGE / PCA9685_MAXCHAN; - } else - on = 0; + err = regmap_update_bits(pca->regmap, PCA9685_MODE1, + MODE1_SLEEP, enable ? MODE1_SLEEP : 0); + if (err) + return err; - off = (on + duty) % PCA9685_COUNTER_RANGE; + if (!enable) { + /* Wait 500us for the oscillator to be back up */ + udelay(500); + } - /* implicitly clear full ON and full OFF bit */ - pca9685_write_4reg(chip, REG_ON_L(channel), - (u8[4]){ on & 0xff, (on >> 8) & 0xf, off & 0xff, (off >> 8) & 0xf }); + return 0; } -static unsigned int pca9685_pwm_get_duty(struct pwm_chip *chip, int channel) +struct pca9685_waveform { + u8 onoff[4]; + u8 prescale; +}; + +static int pca9685_round_waveform_tohw(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_waveform *wf, void *_wfhw) { - struct pwm_device *pwm = &chip->pwms[channel]; - unsigned int off = 0, on = 0, val = 0; + struct pca9685_waveform *wfhw = _wfhw; + struct pca9685 *pca = to_pca(chip); + unsigned int best_prescale; + u8 prescale; + unsigned int period_ns, duty; + int ret_tohw = 0; - if (WARN_ON(channel >= PCA9685_MAXCHAN)) { - /* HW does not support reading state of "all LEDs" channel */ - return 0; - } + if (!wf->period_length_ns) { + *wfhw = (typeof(*wfhw)){ + .onoff = { 0, 0, 0, LED_FULL, }, + .prescale = 0, + }; + + dev_dbg(&chip->dev, "pwm#%u: %lld/%lld [+%lld] -> [%hhx %hhx %hhx %hhx] PSC:%hhx\n", + pwm->hwpwm, wf->duty_length_ns, wf->period_length_ns, wf->duty_offset_ns, + wfhw->onoff[0], wfhw->onoff[1], wfhw->onoff[2], wfhw->onoff[3], wfhw->prescale); - pca9685_read_reg(chip, LED_N_OFF_H(channel), &off); - if (off & LED_FULL) { - /* Full OFF bit is set */ return 0; } - pca9685_read_reg(chip, LED_N_ON_H(channel), &on); - if (on & LED_FULL) { - /* Full ON bit is set */ - return PCA9685_COUNTER_RANGE; + if (wf->period_length_ns >= PCA9685_COUNTER_RANGE * PCA9685_QUANTUM_NS(255)) { + best_prescale = 255; + } else if (wf->period_length_ns < PCA9685_COUNTER_RANGE * PCA9685_QUANTUM_NS(3)) { + best_prescale = 3; + ret_tohw = 1; + } else { + best_prescale = (unsigned int)wf->period_length_ns / (PCA9685_COUNTER_RANGE * (NSEC_PER_SEC / PCA9685_OSC_CLOCK_HZ)) - 1; } - pca9685_read_reg(chip, LED_N_OFF_L(channel), 
&val); - off = ((off & 0xf) << 8) | (val & 0xff); - if (!pwm->state.usage_power) - return off; + guard(mutex)(&pca->lock); - /* Read ON register to calculate duty cycle of staggered output */ - if (pca9685_read_reg(chip, LED_N_ON_L(channel), &val)) { - /* Reset val to 0 in case reading LED_N_ON_L failed */ - val = 0; - } - on = ((on & 0xf) << 8) | (val & 0xff); - return (off - on) & (PCA9685_COUNTER_RANGE - 1); -} + if (!pca9685_prescaler_can_change(pca, pwm->hwpwm)) { + unsigned int current_prescale; + int ret; -static void pca9685_set_sleep_mode(struct pwm_chip *chip, bool enable) -{ - struct device *dev = pwmchip_parent(chip); - struct pca9685 *pca = to_pca(chip); - int err = regmap_update_bits(pca->regmap, PCA9685_MODE1, - MODE1_SLEEP, enable ? MODE1_SLEEP : 0); - if (err) { - dev_err(dev, "regmap_update_bits of register 0x%x failed: %pe\n", - PCA9685_MODE1, ERR_PTR(err)); - return; + ret = regmap_read(pca->regmap, PCA9685_PRESCALE, ¤t_prescale); + if (ret) + return ret; + + if (current_prescale > best_prescale) + ret_tohw = 1; + + prescale = current_prescale; + } else { + prescale = best_prescale; } - if (!enable) { - /* Wait 500us for the oscillator to be back up */ - udelay(500); + period_ns = PCA9685_COUNTER_RANGE * PCA9685_QUANTUM_NS(prescale); + + duty = (unsigned)min_t(u64, wf->duty_length_ns, period_ns) / PCA9685_QUANTUM_NS(prescale); + + if (duty < PCA9685_COUNTER_RANGE) { + unsigned int on, off; + + on = (unsigned)min_t(u64, wf->duty_offset_ns, period_ns) / PCA9685_QUANTUM_NS(prescale); + off = (on + duty) % PCA9685_COUNTER_RANGE; + + /* + * With a zero duty cycle, it doesn't matter if period was + * rounded up + */ + if (!duty) + ret_tohw = 0; + + *wfhw = (typeof(*wfhw)){ + .onoff = { on & 0xff, (on >> 8) & 0xf, off & 0xff, (off >> 8) & 0xf }, + .prescale = prescale, + }; + } else { + *wfhw = (typeof(*wfhw)){ + .onoff = { 0, LED_FULL, 0, 0, }, + .prescale = prescale, + }; } + + dev_dbg(&chip->dev, "pwm#%u: %lld/%lld [+%lld] -> %s[%hhx %hhx %hhx %hhx] PSC:%hhx\n", + pwm->hwpwm, wf->duty_length_ns, wf->period_length_ns, wf->duty_offset_ns, + ret_tohw ? 
"#" : "", wfhw->onoff[0], wfhw->onoff[1], wfhw->onoff[2], wfhw->onoff[3], wfhw->prescale); + + return ret_tohw; } -static int __pca9685_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - const struct pwm_state *state) +static int pca9685_round_waveform_fromhw(struct pwm_chip *chip, struct pwm_device *pwm, + const void *_wfhw, struct pwm_waveform *wf) { + const struct pca9685_waveform *wfhw = _wfhw; struct pca9685 *pca = to_pca(chip); - unsigned long long duty, prescale; - unsigned int val = 0; - - if (state->polarity != PWM_POLARITY_NORMAL) - return -EINVAL; - - prescale = DIV_ROUND_CLOSEST_ULL(PCA9685_OSC_CLOCK_MHZ * state->period, - PCA9685_COUNTER_RANGE * 1000) - 1; - if (prescale < PCA9685_PRESCALE_MIN || prescale > PCA9685_PRESCALE_MAX) { - dev_err(pwmchip_parent(chip), "pwm not changed: period out of bounds!\n"); - return -EINVAL; - } + unsigned int prescale; - if (!state->enabled) { - pca9685_pwm_set_duty(chip, pwm->hwpwm, 0); - return 0; - } + if (wfhw->prescale) + prescale = wfhw->prescale; + else + scoped_guard(mutex, &pca->lock) { + int ret; - pca9685_read_reg(chip, PCA9685_PRESCALE, &val); - if (prescale != val) { - if (!pca9685_prescaler_can_change(pca, pwm->hwpwm)) { - dev_err(pwmchip_parent(chip), - "pwm not changed: periods of enabled pwms must match!\n"); - return -EBUSY; + ret = regmap_read(pca->regmap, PCA9685_PRESCALE, &prescale); + if (ret) + return ret; } - /* - * Putting the chip briefly into SLEEP mode - * at this point won't interfere with the - * pm_runtime framework, because the pm_runtime - * state is guaranteed active here. - */ - /* Put chip into sleep mode */ - pca9685_set_sleep_mode(chip, true); + wf->period_length_ns = PCA9685_COUNTER_RANGE * PCA9685_QUANTUM_NS(prescale); - /* Change the chip-wide output frequency */ - pca9685_write_reg(chip, PCA9685_PRESCALE, prescale); + if (wfhw->onoff[3] & LED_FULL) { + wf->duty_length_ns = 0; + wf->duty_offset_ns = 0; + } else if (wfhw->onoff[1] & LED_FULL) { + wf->duty_length_ns = wf->period_length_ns; + wf->duty_offset_ns = 0; + } else { + unsigned int on = wfhw->onoff[0] | (wfhw->onoff[1] & 0xf) << 8; + unsigned int off = wfhw->onoff[2] | (wfhw->onoff[3] & 0xf) << 8; - /* Wake the chip up */ - pca9685_set_sleep_mode(chip, false); + wf->duty_length_ns = (off - on) % PCA9685_COUNTER_RANGE * PCA9685_QUANTUM_NS(prescale); + wf->duty_offset_ns = on * PCA9685_QUANTUM_NS(prescale); } - duty = PCA9685_COUNTER_RANGE * state->duty_cycle; - duty = DIV_ROUND_UP_ULL(duty, state->period); - pca9685_pwm_set_duty(chip, pwm->hwpwm, duty); + dev_dbg(&chip->dev, "pwm#%u: [%hhx %hhx %hhx %hhx] PSC:%hhx -> %lld/%lld [+%lld]\n", + pwm->hwpwm, + wfhw->onoff[0], wfhw->onoff[1], wfhw->onoff[2], wfhw->onoff[3], wfhw->prescale, + wf->duty_length_ns, wf->period_length_ns, wf->duty_offset_ns); + return 0; } -static int pca9685_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - const struct pwm_state *state) +static int pca9685_read_waveform(struct pwm_chip *chip, struct pwm_device *pwm, void *_wfhw) { + struct pca9685_waveform *wfhw = _wfhw; struct pca9685 *pca = to_pca(chip); + unsigned int prescale; int ret; - mutex_lock(&pca->lock); - ret = __pca9685_pwm_apply(chip, pwm, state); - if (ret == 0) { - if (state->enabled) - set_bit(pwm->hwpwm, pca->pwms_enabled); - else - clear_bit(pwm->hwpwm, pca->pwms_enabled); - } - mutex_unlock(&pca->lock); + guard(mutex)(&pca->lock); - return ret; + ret = regmap_bulk_read(pca->regmap, REG_ON_L(pwm->hwpwm), &wfhw->onoff, 4); + if (ret) + return ret; + + ret = regmap_read(pca->regmap, 
PCA9685_PRESCALE, &prescale); + if (ret) + return ret; + + wfhw->prescale = prescale; + + return 0; } -static int pca9685_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) +static int pca9685_write_waveform(struct pwm_chip *chip, struct pwm_device *pwm, const void *_wfhw) { - unsigned long long duty; - unsigned int val = 0; + const struct pca9685_waveform *wfhw = _wfhw; + struct pca9685 *pca = to_pca(chip); + unsigned int current_prescale; + int ret; - /* Calculate (chip-wide) period from prescale value */ - pca9685_read_reg(chip, PCA9685_PRESCALE, &val); - /* - * PCA9685_OSC_CLOCK_MHZ is 25, i.e. an integer divider of 1000. - * The following calculation is therefore only a multiplication - * and we are not losing precision. - */ - state->period = (PCA9685_COUNTER_RANGE * 1000 / PCA9685_OSC_CLOCK_MHZ) * - (val + 1); + guard(mutex)(&pca->lock); - /* The (per-channel) polarity is fixed */ - state->polarity = PWM_POLARITY_NORMAL; + if (wfhw->prescale) { + ret = regmap_read(pca->regmap, PCA9685_PRESCALE, ¤t_prescale); + if (ret) + return ret; - if (pwm->hwpwm >= PCA9685_MAXCHAN) { - /* - * The "all LEDs" channel does not support HW readout - * Return 0 and disabled for backwards compatibility - */ - state->duty_cycle = 0; - state->enabled = false; - return 0; - } + if (current_prescale != wfhw->prescale) { + if (!pca9685_prescaler_can_change(pca, pwm->hwpwm)) + return -EBUSY; - state->enabled = true; - duty = pca9685_pwm_get_duty(chip, pwm->hwpwm); - state->duty_cycle = DIV_ROUND_DOWN_ULL(duty * state->period, PCA9685_COUNTER_RANGE); + /* Put chip into sleep mode */ + ret = pca9685_set_sleep_mode(chip, true); + if (ret) + return ret; - return 0; + /* Change the chip-wide output frequency */ + ret = regmap_write(pca->regmap, PCA9685_PRESCALE, wfhw->prescale); + if (ret) + return ret; + + /* Wake the chip up */ + ret = pca9685_set_sleep_mode(chip, false); + if (ret) + return ret; + } + } + + return regmap_bulk_write(pca->regmap, REG_ON_L(pwm->hwpwm), &wfhw->onoff, 4); } static int pca9685_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) @@ -365,8 +385,11 @@ static void pca9685_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) } static const struct pwm_ops pca9685_pwm_ops = { - .apply = pca9685_pwm_apply, - .get_state = pca9685_pwm_get_state, + .sizeof_wfhw = sizeof(struct pca9685_waveform), + .round_waveform_tohw = pca9685_round_waveform_tohw, + .round_waveform_fromhw = pca9685_round_waveform_fromhw, + .read_waveform = pca9685_read_waveform, + .write_waveform = pca9685_write_waveform, .request = pca9685_pwm_request, .free = pca9685_pwm_free, }; From efedb508591e231b47b23ce6b353c81eeb3b9a84 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 21 Aug 2025 10:31:11 +0200 Subject: [PATCH 1664/3206] dt-bindings: pwm: nxp,lpc1850-sct-pwm: Minor whitespace cleanup in example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DTS code coding style expects exactly one space around '=' character. 
Signed-off-by: Krzysztof Kozlowski Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250821083110.46420-2-krzysztof.kozlowski@linaro.org Signed-off-by: Uwe Kleine-König --- Documentation/devicetree/bindings/pwm/nxp,lpc1850-sct-pwm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pwm/nxp,lpc1850-sct-pwm.yaml b/Documentation/devicetree/bindings/pwm/nxp,lpc1850-sct-pwm.yaml index ffda0123878eda..920e0413d4312b 100644 --- a/Documentation/devicetree/bindings/pwm/nxp,lpc1850-sct-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/nxp,lpc1850-sct-pwm.yaml @@ -48,7 +48,7 @@ examples: pwm@40000000 { compatible = "nxp,lpc1850-sct-pwm"; reg = <0x40000000 0x1000>; - clocks =<&ccu1 CLK_CPU_SCT>; + clocks = <&ccu1 CLK_CPU_SCT>; clock-names = "pwm"; #pwm-cells = <3>; }; From d322a0e01d9ecc91881d7a6ad388a37a1a81471f Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Thu, 28 Aug 2025 16:01:39 +0200 Subject: [PATCH 1665/3206] dt-bindings: pwm: apple,s5l-fpwm: Add t6020-fpwm compatible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PWM controller on Apple's M2 Pro/Max SoCs behaves in the same way as on previous M1 and M2 SoCs. Add its per SoC compatible. At the same time fix the order of existing entries. The sort order logic is having SoC numeric code families in release order, and SoCs within each family in release order: - t8xxx (Apple HxxP/G series, "phone"/"tablet" chips) - t8103 (Apple H13G/M1) - t8112 (Apple H14G/M2) - t6xxx (Apple HxxJ series, "desktop" chips) - t6000/t6001/t6002 (Apple H13J(S/C/D) / M1 Pro/Max/Ultra) - t6020/t6021/t6022 (Apple H14J(S/C/D) / M2 Pro/Max/Ultra) Note that SoCs of the t600[0-2] / t602[0-2] family share the t6000 / t6020 compatible where the hardware is 100% compatible, which is usually the case in this highly related set of SoCs. Signed-off-by: Janne Grunau Reviewed-by: Neal Gompa Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250828-dt-apple-t6020-v1-20-507ba4c4b98e@jannau.net Signed-off-by: Uwe Kleine-König --- Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml b/Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml index 142157bff0cd85..04519b0c581d0e 100644 --- a/Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml +++ b/Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml @@ -17,8 +17,9 @@ properties: items: - enum: - apple,t8103-fpwm - - apple,t6000-fpwm - apple,t8112-fpwm + - apple,t6000-fpwm + - apple,t6020-fpwm - const: apple,s5l-fpwm reg: From ebd524a3ac3a172aa26f99d20d4d00d57da9a875 Mon Sep 17 00:00:00 2001 From: Ivaylo Ivanov Date: Sun, 14 Sep 2025 16:20:33 +0300 Subject: [PATCH 1666/3206] dt-bindings: pwm: samsung: add exynos8890 compatible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add samsung,exynos8890-pwm compatible string to binding document. 
Signed-off-by: Ivaylo Ivanov Link: https://lore.kernel.org/r/20250914132033.2622886-1-ivo.ivanov.ivanov1@gmail.com Signed-off-by: Uwe Kleine-König --- Documentation/devicetree/bindings/pwm/pwm-samsung.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pwm/pwm-samsung.yaml b/Documentation/devicetree/bindings/pwm/pwm-samsung.yaml index 17a2b927af3370..97acbdec39f102 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-samsung.yaml +++ b/Documentation/devicetree/bindings/pwm/pwm-samsung.yaml @@ -31,6 +31,7 @@ properties: - enum: - samsung,exynos5433-pwm - samsung,exynos7-pwm + - samsung,exynos8890-pwm - samsung,exynosautov9-pwm - samsung,exynosautov920-pwm - tesla,fsd-pwm From 8f2689f194b8d1bff41150ae316abdfccf191309 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 12 Aug 2025 23:35:41 +0900 Subject: [PATCH 1667/3206] pwm: cros-ec: Avoid -Wflex-array-member-not-at-end warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Use the new TRAILING_OVERLAP() helper to fix the following warnings: drivers/pwm/pwm-cros-ec.c:53:40: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] drivers/pwm/pwm-cros-ec.c:87:40: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] This helper creates a union between a flexible-array member (FAM) and a set of members that would otherwise follow it. This overlays the trailing members onto the FAM while preserving the original memory layout. Signed-off-by: Gustavo A. R. Silva Reviewed-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/aJtRPZpc-Lv-C6zD@kspp Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-cros-ec.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c index 189301dc395e25..67cfa17f58e0d5 100644 --- a/drivers/pwm/pwm-cros-ec.c +++ b/drivers/pwm/pwm-cros-ec.c @@ -49,10 +49,9 @@ static int cros_ec_pwm_set_duty(struct cros_ec_pwm_device *ec_pwm, u8 index, u16 duty) { struct cros_ec_device *ec = ec_pwm->ec; - struct { - struct cros_ec_command msg; + TRAILING_OVERLAP(struct cros_ec_command, msg, data, struct ec_params_pwm_set_duty params; - } __packed buf; + ) __packed buf; struct ec_params_pwm_set_duty *params = &buf.params; struct cros_ec_command *msg = &buf.msg; int ret; @@ -83,13 +82,12 @@ static int cros_ec_pwm_set_duty(struct cros_ec_pwm_device *ec_pwm, u8 index, static int cros_ec_pwm_get_duty(struct cros_ec_device *ec, bool use_pwm_type, u8 index) { - struct { - struct cros_ec_command msg; + TRAILING_OVERLAP(struct cros_ec_command, msg, data, union { struct ec_params_pwm_get_duty params; struct ec_response_pwm_get_duty resp; }; - } __packed buf; + ) __packed buf; struct ec_params_pwm_get_duty *params = &buf.params; struct ec_response_pwm_get_duty *resp = &buf.resp; struct cros_ec_command *msg = &buf.msg; From 0e58f6a7dd689c73d67e6a2164b46d4618f2698a Mon Sep 17 00:00:00 2001 From: Shaopeng Tan Date: Mon, 23 Jun 2025 16:50:50 +0900 Subject: [PATCH 1668/3206] fs/resctrl: Optimize code in rdt_get_tree() schemata_list_destroy() has to be called if schemata_list_create() fails. 
rdt_get_tree() calls schemata_list_destroy() in two different ways: directly if schemata_list_create() itself fails and on the exit path via the out_schemata_free goto label. Remove schemata_list_destroy() call on schemata_list_create() failure. Use existing out_schemata_free goto label instead. Signed-off-by: Shaopeng Tan Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Reviewed-by: James Morse Reviewed-by: Koba Ko Link: https://lore.kernel.org/20250623075051.3610592-1-tan.shaopeng@jp.fujitsu.com --- fs/resctrl/rdtgroup.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 77d08229d85502..5f0b7cfa1cc2ae 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2608,10 +2608,8 @@ static int rdt_get_tree(struct fs_context *fc) goto out_root; ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } + if (ret) + goto out_schemata_free; ret = closid_init(); if (ret) @@ -2683,7 +2681,6 @@ static int rdt_get_tree(struct fs_context *fc) closid_exit(); out_schemata_free: schemata_list_destroy(); -out_ctx: rdt_disable_ctx(); out_root: rdtgroup_destroy_root(); From 09f37134464cc03baf5cb8eab2d99db27ee73217 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:00 -0500 Subject: [PATCH 1669/3206] x86,fs/resctrl: Consolidate monitor event descriptions There are currently only three monitor events, all associated with the RDT_RESOURCE_L3 resource. Growing support for additional events will be easier with some restructuring to have a single point in file system code where all attributes of all events are defined. Place all event descriptions into an array mon_event_all[]. Doing this has the beneficial side effect of removing the need for rdt_resource::evt_list. Add resctrl_event_id::QOS_FIRST_EVENT for a lower bound on range checks for event ids and as the starting index to scan mon_event_all[]. Drop the code that builds evt_list and change the two places where the list is scanned to scan mon_event_all[] instead using a new helper macro for_each_mon_event(). Architecture code now informs file system code which events are available with resctrl_enable_mon_event(). 
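As a rough sketch of the resulting flow (simplified from the hunks below,
not an exact copy of them):

  /* Architecture code, at feature detection time: */
  if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
          resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID);

  /* File system code, scanning every known event: */
  struct mon_evt *mevt;

  for_each_mon_event(mevt) {
          if (mevt->rid != r->rid || !mevt->enabled)
                  continue;
          /* ... use mevt->name, mevt->evtid ... */
  }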
Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/core.c | 12 ++++-- fs/resctrl/internal.h | 13 ++++-- fs/resctrl/monitor.c | 63 +++++++++++++++--------------- fs/resctrl/rdtgroup.c | 11 +++--- include/linux/resctrl.h | 4 +- include/linux/resctrl_types.h | 12 ++++-- 6 files changed, 66 insertions(+), 49 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 187d527ef73b6e..7fcae25874fea1 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -864,12 +864,18 @@ static __init bool get_rdt_mon_resources(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + } if (!rdt_mon_features) return false; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9a8cf6f11151d9..7a57366d1abce7 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -52,19 +52,26 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) } /** - * struct mon_evt - Entry in the event list of a resource + * struct mon_evt - Properties of a monitor event * @evtid: event id + * @rid: resource id for this event * @name: name of the event * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list + * @enabled: true if the event is enabled */ struct mon_evt { enum resctrl_event_id evtid; + enum resctrl_res_level rid; char *name; bool configurable; - struct list_head list; + bool enabled; }; +extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; + +#define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ + mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 7326c28a7908f3..d5bf3e0334b690 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -842,38 +842,39 @@ static void dom_data_exit(struct rdt_resource *r) mutex_unlock(&rdtgroup_mutex); } -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, -}; - -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, -}; - -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, -}; - /* - * Initialize the event list for the resource. - * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. + * All available events. 
Architecture code marks the ones that + * are supported by a system using resctrl_enable_mon_event() + * to set .enabled. */ -static void l3_mon_evt_init(struct rdt_resource *r) +struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { + [QOS_L3_OCCUP_EVENT_ID] = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_TOTAL_EVENT_ID] = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_LOCAL_EVENT_ID] = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, +}; + +void resctrl_enable_mon_event(enum resctrl_event_id eventid) { - INIT_LIST_HEAD(&r->evt_list); + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) + return; + if (mon_event_all[eventid].enabled) { + pr_warn("Duplicate enable for event %d\n", eventid); + return; + } - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); + mon_event_all[eventid].enabled = true; } /** @@ -900,15 +901,13 @@ int resctrl_mon_resource_init(void) if (ret) return ret; - l3_mon_evt_init(r); - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 5f0b7cfa1cc2ae..ab943a5907c5c6 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1152,7 +1152,9 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, struct rdt_resource *r = rdt_kn_parent_priv(of->kn); struct mon_evt *mevt; - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; seq_printf(seq, "%s\n", mevt->name); if (mevt->configurable) seq_printf(seq, "%s_config\n", mevt->name); @@ -3054,10 +3056,9 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, struct mon_evt *mevt; int ret, domid; - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; domid = do_sum ? d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 6fb4894b8cfd1f..2944042bd84c5d 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -269,7 +269,6 @@ enum resctrl_schema_fmt { * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. - * @evt_list: List of monitoring events * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth * monitoring events can be configured. 
* @cdp_capable: Is the CDP feature available on this resource @@ -287,7 +286,6 @@ struct rdt_resource { struct list_head mon_domains; char *name; enum resctrl_schema_fmt schema_fmt; - struct list_head evt_list; unsigned int mbm_cfg_mask; bool cdp_capable; }; @@ -372,6 +370,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +void resctrl_enable_mon_event(enum resctrl_event_id eventid); + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); /** diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index a25fb9c4070d3c..2dadbc54e4b352 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,11 +34,15 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) -/* - * Event IDs, the values match those used to program IA32_QM_EVTSEL before - * reading IA32_QM_CTR on RDT systems. - */ +/* Event IDs */ enum resctrl_event_id { + /* Must match value of first event below */ + QOS_FIRST_EVENT = 0x01, + + /* + * These values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems. + */ QOS_L3_OCCUP_EVENT_ID = 0x01, QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, From d257cc2e5c8bb8236cb161360d6c0529fd442712 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:01 -0500 Subject: [PATCH 1670/3206] x86,fs/resctrl: Replace architecture event enabled checks The resctrl file system now has complete knowledge of the status of every event. So there is no need for per-event function calls to check. Replace each of the resctrl_arch_is_{event}enabled() calls with resctrl_is_mon_event_enabled(QOS_{EVENT}). No functional change. 
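The mapping is mechanical:

  resctrl_arch_is_llc_occupancy_enabled() -> resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)
  resctrl_arch_is_mbm_total_enabled()     -> resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)
  resctrl_arch_is_mbm_local_enabled()     -> resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)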
Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/include/asm/resctrl.h | 15 --------------- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- arch/x86/kernel/cpu/resctrl/monitor.c | 4 ++-- fs/resctrl/ctrlmondata.c | 4 ++-- fs/resctrl/monitor.c | 16 +++++++++++----- fs/resctrl/rdtgroup.c | 18 +++++++++--------- include/linux/resctrl.h | 2 ++ 7 files changed, 28 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index feb93b50e990ac..b1dd5d6b87db4e 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -84,21 +84,6 @@ static inline void resctrl_arch_disable_mon(void) static_branch_dec_cpuslocked(&rdt_enable_key); } -static inline bool resctrl_arch_is_llc_occupancy_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_total_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_local_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); -} - /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7fcae25874fea1..1a319ce9328c70 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -402,13 +402,13 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { size_t tsize; - if (resctrl_arch_is_mbm_total_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { tsize = sizeof(*hw_dom->arch_mbm_total); hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_total) return -ENOMEM; } - if (resctrl_arch_is_mbm_local_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { tsize = sizeof(*hw_dom->arch_mbm_local); hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_local) { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c261558276cdd4..61d38517e2bf4a 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -207,11 +207,11 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) memset(hw_dom->arch_mbm_total, 0, sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) memset(hw_dom->arch_mbm_local, 0, sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); } diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 3c39cfacb25183..42b281b3852ff8 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -473,12 +473,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, rdt_last_cmd_clear(); if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; else ret = -EINVAL; } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) + if 
(resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; else ret = -EINVAL; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index d5bf3e0334b690..b0b1dcc367955c 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -336,7 +336,7 @@ void free_rmid(u32 closid, u32 rmid) entry = __rmid_entry(idx); - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); else list_add_tail(&entry->list, &rmid_free_lru); @@ -635,10 +635,10 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. */ - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); } @@ -877,6 +877,12 @@ void resctrl_enable_mon_event(enum resctrl_event_id eventid) mon_event_all[eventid].enabled = true; } +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) +{ + return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS && + mon_event_all[eventid].enabled; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. * @@ -912,9 +918,9 @@ int resctrl_mon_resource_init(void) RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) + else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index ab943a5907c5c6..2ca8e66c0d2042 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -123,8 +123,8 @@ void rdt_staged_configs_clear(void) static bool resctrl_is_mbm_enabled(void) { - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); + return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) || + resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } static bool resctrl_is_mbm_event(int e) @@ -196,7 +196,7 @@ static int closid_alloc(void) lockdep_assert_held(&rdtgroup_mutex); if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { + resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { cleanest_closid = resctrl_find_cleanest_closid(); if (cleanest_closid < 0) return cleanest_closid; @@ -4048,7 +4048,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. 
There is no way to know @@ -4084,12 +4084,12 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; - if (resctrl_arch_is_llc_occupancy_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_arch_is_mbm_total_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { tsize = sizeof(*d->mbm_total); d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_total) { @@ -4097,7 +4097,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain return -ENOMEM; } } - if (resctrl_arch_is_mbm_local_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { tsize = sizeof(*d->mbm_local); d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_local) { @@ -4142,7 +4142,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) RESCTRL_PICK_ANY_CPU); } - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); /* @@ -4217,7 +4217,7 @@ void resctrl_offline_cpu(unsigned int cpu) cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0, cpu); } - if (resctrl_arch_is_llc_occupancy_enabled() && + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && cpu == d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); cqm_setup_limbo_handler(d, 0, cpu); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 2944042bd84c5d..40aba6b5d4f080 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -372,6 +372,8 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); void resctrl_enable_mon_event(enum resctrl_event_id eventid); +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); /** From 63cc9811aa874e6fab671599ba93a989f4f93a5d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:02 -0500 Subject: [PATCH 1671/3206] x86/resctrl: Remove the rdt_mon_features global variable rdt_mon_features is used as a bitmask of enabled monitor events. A monitor event's status is now maintained in mon_evt::enabled with all monitor events' mon_evt structures found in the filesystem's mon_event_all[] array. Remove the remaining uses of rdt_mon_features. 
Signed-off-by: Tony Luck
Signed-off-by: Babu Moger
Signed-off-by: Borislav Petkov (AMD)
Reviewed-by: Reinette Chatre
Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com
---
 arch/x86/include/asm/resctrl.h        | 1 -
 arch/x86/kernel/cpu/resctrl/core.c    | 9 +++++----
 arch/x86/kernel/cpu/resctrl/monitor.c | 5 -----
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index b1dd5d6b87db4e..575f8408a9e7c6 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -44,7 +44,6 @@ DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);
 
 extern bool rdt_alloc_capable;
 extern bool rdt_mon_capable;
-extern unsigned int rdt_mon_features;
 
 DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
 DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 1a319ce9328c70..5d14f9a14eda54 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -863,21 +863,22 @@ static __init bool get_rdt_alloc_resources(void)
 static __init bool get_rdt_mon_resources(void)
 {
         struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+        bool ret = false;
 
         if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
                 resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID);
-                rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
+                ret = true;
         }
         if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
                 resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
-                rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
+                ret = true;
         }
         if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
                 resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
-                rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
+                ret = true;
         }
-        if (!rdt_mon_features)
+        if (!ret)
                 return false;
 
         return !rdt_get_mon_l3_config(r);
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 61d38517e2bf4a..07f8ab097cbe14 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -31,11 +31,6 @@
  */
 bool rdt_mon_capable;
 
-/*
- * Global to indicate which monitoring events are enabled.
- */
-unsigned int rdt_mon_features;
-
 #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
 
 static int snc_nodes_per_l3_cache = 1;

From 83b039877310ae1eb614eef17b780df1e10d9fb5 Mon Sep 17 00:00:00 2001
From: Tony Luck
Date: Fri, 5 Sep 2025 16:34:02 -0500
Subject: [PATCH 1672/3206] x86,fs/resctrl: Prepare for more monitor events

There's a rule in computer programming that objects appear zero, once, or
many times, so code accordingly. There are two MBM events, and resctrl is
coded with a lot of:

  if (local) do one thing
  if (total) do a different thing

Change the rdt_mon_domain and rdt_hw_mon_domain structures to hold arrays
of pointers to per-event data instead of explicit fields for total and
local bandwidth. Simplify by coding for many events using loops over
those that are enabled.

Move resctrl_is_mbm_event() to <linux/resctrl.h> so it can be used more
widely. Also provide a for_each_mbm_event_id() helper macro.

Clean up variable names in the functions touched to consistently use
"eventid" for variables of type enum resctrl_event_id.
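Concretely, with the event ID values from resctrl_types.h (total = 0x02,
local = 0x03), the new helpers work out to:

  QOS_NUM_L3_MBM_EVENTS = 0x03 - 0x02 + 1 = 2   /* two per-event slots  */
  MBM_STATE_IDX(QOS_L3_MBM_TOTAL_EVENT_ID) = 0  /* "total" state array  */
  MBM_STATE_IDX(QOS_L3_MBM_LOCAL_EVENT_ID) = 1  /* "local" state array  */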
Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/core.c | 40 +++++++++++---------- arch/x86/kernel/cpu/resctrl/internal.h | 8 ++--- arch/x86/kernel/cpu/resctrl/monitor.c | 36 +++++++++---------- fs/resctrl/monitor.c | 13 ++++--- fs/resctrl/rdtgroup.c | 50 +++++++++++++------------- include/linux/resctrl.h | 23 +++++++++--- include/linux/resctrl_types.h | 3 ++ 7 files changed, 96 insertions(+), 77 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 5d14f9a14eda54..fbf019c1ff11bf 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -365,8 +365,10 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { - kfree(hw_dom->arch_mbm_total); - kfree(hw_dom->arch_mbm_local); + int idx; + + for_each_mbm_idx(idx) + kfree(hw_dom->arch_mbm_states[idx]); kfree(hw_dom); } @@ -400,25 +402,27 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * */ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { - size_t tsize; - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { - tsize = sizeof(*hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_total) - return -ENOMEM; - } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { - tsize = sizeof(*hw_dom->arch_mbm_local); - hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_local) { - kfree(hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = NULL; - return -ENOMEM; - } + size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + for_each_mbm_idx(idx) { + kfree(hw_dom->arch_mbm_states[idx]); + hw_dom->arch_mbm_states[idx] = NULL; + } + + return -ENOMEM; } static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5e3c41b3643737..58dca892a5df22 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -54,15 +54,15 @@ struct rdt_hw_ctrl_domain { * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share * a resource for a monitor function * @d_resctrl: Properties exposed to the resctrl file system - * @arch_mbm_total: arch private state for MBM total bandwidth - * @arch_mbm_local: arch private state for MBM local bandwidth + * @arch_mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct arch_mbm_state + * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. 
*/ struct rdt_hw_mon_domain { struct rdt_mon_domain d_resctrl; - struct arch_mbm_state *arch_mbm_total; - struct arch_mbm_state *arch_mbm_local; + struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 07f8ab097cbe14..f01db2034d08df 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -161,18 +161,14 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do u32 rmid, enum resctrl_event_id eventid) { - switch (eventid) { - case QOS_L3_OCCUP_EVENT_ID: - return NULL; - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &hw_dom->arch_mbm_total[rmid]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &hw_dom->arch_mbm_local[rmid]; - default: - /* Never expect to get here */ - WARN_ON_ONCE(1); + struct arch_mbm_state *state; + + if (!resctrl_is_mbm_event(eventid)) return NULL; - } + + state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; + + return state ? &state[rmid] : NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -201,14 +197,16 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) - memset(hw_dom->arch_mbm_total, 0, - sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) - memset(hw_dom->arch_mbm_local, 0, - sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + memset(hw_dom->arch_mbm_states[idx], 0, + sizeof(*hw_dom->arch_mbm_states[0]) * r->num_rmid); + } } static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b0b1dcc367955c..e0dfa5fb969e4f 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -346,15 +346,14 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *state; - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: + if (!resctrl_is_mbm_event(evtid)) return NULL; - } + + state = d->mbm_states[MBM_STATE_IDX(evtid)]; + + return state ? &state[idx] : NULL; } static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 2ca8e66c0d2042..a6047e9345cdab 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -127,12 +127,6 @@ static bool resctrl_is_mbm_enabled(void) resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - /* * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap * of free CLOSIDs. 
@@ -4020,9 +4014,13 @@ static void rdtgroup_setup_default(void) static void domain_destroy_mon_state(struct rdt_mon_domain *d) { + int idx; + bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } } void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4082,32 +4080,34 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; + size_t tsize = sizeof(*d->mbm_states[0]); + enum resctrl_event_id eventid; + int idx; if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } - } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + bitmap_free(d->rmid_busy_llc); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } + + return -ENOMEM; } int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 40aba6b5d4f080..478d7a935ca36e 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -161,8 +161,9 @@ struct rdt_ctrl_domain { * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold - * @mbm_total: saved state for MBM total bandwidth - * @mbm_local: saved state for MBM local bandwidth + * @mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct mbm_state + * indexed by RMID on x86 or combined CLOSID, RMID on Arm. 
* @mbm_over: worker to periodically read MBM h/w counters * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters @@ -172,8 +173,7 @@ struct rdt_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; + struct mbm_state *mbm_states[QOS_NUM_L3_MBM_EVENTS]; struct delayed_work mbm_over; struct delayed_work cqm_limbo; int mbm_work_cpu; @@ -376,6 +376,21 @@ bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); +static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) +{ + return (eventid >= QOS_L3_MBM_TOTAL_EVENT_ID && + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/* Iterate over all memory bandwidth events */ +#define for_each_mbm_event_id(eventid) \ + for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID; eventid++) + +/* Iterate over memory bandwidth arrays in domain structures */ +#define for_each_mbm_idx(idx) \ + for (idx = 0; idx < QOS_NUM_L3_MBM_EVENTS; idx++) + /** * resctrl_arch_mon_event_config_write() - Write the config for an event. * @config_info: struct resctrl_mon_config_info describing the resource, domain diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index 2dadbc54e4b352..d98351663c2c64 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -51,4 +51,7 @@ enum resctrl_event_id { QOS_NUM_EVENTS, }; +#define QOS_NUM_L3_MBM_EVENTS (QOS_L3_MBM_LOCAL_EVENT_ID - QOS_L3_MBM_TOTAL_EVENT_ID + 1) +#define MBM_STATE_IDX(evt) ((evt) - QOS_L3_MBM_TOTAL_EVENT_ID) + #endif /* __LINUX_RESCTRL_TYPES_H */ From 1ebe8f7e782523e62cd1fa8237f7afba5d1dae83 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Sun, 31 Aug 2025 22:43:57 +0100 Subject: [PATCH 1673/3206] PM: EM: Fix late boot with holes in CPU topology Commit e3f1164fc9ee ("PM: EM: Support late CPUs booting and capacity adjustment") added a mechanism to handle CPUs that come up late by retrying when any of the `cpufreq_cpu_get()` call fails. However, if there are holes in the CPU topology (offline CPUs, e.g. nosmt), the first missing CPU causes the loop to break, preventing subsequent online CPUs from being updated. Instead of aborting on the first missing CPU policy, loop through all and retry if any were missing. Fixes: e3f1164fc9ee ("PM: EM: Support late CPUs booting and capacity adjustment") Suggested-by: Kenneth Crudup Reported-by: Kenneth Crudup Link: https://lore.kernel.org/linux-pm/40212796-734c-4140-8a85-854f72b8144d@panix.com/ Cc: 6.9+ # 6.9+ Signed-off-by: Christian Loehle Link: https://patch.msgid.link/20250831214357.2020076-1-christian.loehle@arm.com [ rjw: Drop the new pr_debug() message which is not very useful ] Signed-off-by: Rafael J. 
Wysocki
---
 kernel/power/energy_model.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 8df55397414a12..5f17d2e8e95420 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -799,7 +799,7 @@ void em_adjust_cpu_capacity(unsigned int cpu)
 static void em_check_capacity_update(void)
 {
         cpumask_var_t cpu_done_mask;
-        int cpu;
+        int cpu, failed_cpus = 0;
 
         if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
                 pr_warn("no free memory\n");
@@ -817,10 +817,8 @@ static void em_check_capacity_update(void)
 
                 policy = cpufreq_cpu_get(cpu);
                 if (!policy) {
-                        pr_debug("Accessing cpu%d policy failed\n", cpu);
-                        schedule_delayed_work(&em_update_work,
-                                              msecs_to_jiffies(1000));
-                        break;
+                        failed_cpus++;
+                        continue;
                 }
 
                 cpufreq_cpu_put(policy);
@@ -835,6 +833,9 @@ static void em_check_capacity_update(void)
                 em_adjust_new_capacity(cpu, dev, pd);
         }
 
+        if (failed_cpus)
+                schedule_delayed_work(&em_update_work, msecs_to_jiffies(1000));
+
         free_cpumask_var(cpu_done_mask);
 }

From e19c06219985f2beb9d71959d80f56e318abf744 Mon Sep 17 00:00:00 2001
From: Babu Moger
Date: Fri, 5 Sep 2025 16:34:04 -0500
Subject: [PATCH 1674/3206] x86/cpufeatures: Add support for Assignable
 Bandwidth Monitoring Counters (ABMC)

Users can create as many monitor groups as there are RMIDs supported by
the hardware. However, the bandwidth monitoring feature on AMD only
guarantees that RMIDs currently assigned to a processor will be tracked
by hardware. The counters of any other RMIDs which are no longer being
tracked will be reset to zero. The MBM event counters return
"Unavailable" for the RMIDs that are not tracked by hardware. So only a
limited number of groups can deliver guaranteed monitoring numbers. With
ever-changing configurations there is no way to know definitively which
of these groups are being tracked at any particular time. Users do not
have the option to monitor a group or set of groups for a certain period
of time without worrying about the RMID being reset in between.

The ABMC feature allows users to assign a hardware counter to an RMID,
event pair and monitor bandwidth usage as long as it is assigned. The
hardware continues to track the assigned counter until it is explicitly
unassigned by the user. There is no need to worry about counters being
reset during this period. Additionally, the user can specify the type of
memory transactions (e.g., reads, writes) for the counter to track.

Without ABMC enabled, monitoring works in the current mode, without the
assignment option.

The Linux resctrl subsystem provides an interface that allows monitoring
of up to two memory bandwidth events per group, selected from a
combination of available total and local events. When ABMC is enabled,
two events will be assigned to each group by default, in line with the
current interface design. Users will also have the option to configure
which types of memory transactions are counted by these events.

Due to the limited number of counters (32), users may quickly exhaust
them. If the system runs out of assignable ABMC counters, the kernel
will report an error. In such cases, users will need to unassign one or
more active counters to free up counters for new assignments.

resctrl will provide options to assign or unassign events through the
group-specific interface file.

The feature is detected via CPUID_Fn80000020_EBX_x00 bit 5: ABMC
(Assignable Bandwidth Monitoring Counters).
The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). [ bp: Massage commit message, fixup enumeration due to VMSCAPE ] Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 751ca35386b0ef..b2a562217d3ffc 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -496,6 +496,7 @@ #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ +#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ /* * BUG word(s) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 6b868afb26c319..4cee6213d66738 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -51,6 +51,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 }, { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 }, { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 }, { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, From bebf57bf054b561a62f3440142b2eddab2b0bbff Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:05 -0500 Subject: [PATCH 1675/3206] x86/resctrl: Add ABMC feature in the command line options Add a kernel command-line parameter to enable or disable the exposure of the ABMC (Assignable Bandwidth Monitoring Counters) hardware feature to resctrl. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/filesystems/resctrl.rst | 1 + arch/x86/kernel/cpu/resctrl/core.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5a7a83c411e9c5..889e68e83682d1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6155,7 +6155,7 @@ rdt= [HW,X86,RDT] Turn on/off individual RDT features. List is: cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, - mba, smba, bmec. + mba, smba, bmec, abmc. E.g. 
to turn on cmt and turn off mba use: rdt=cmt,!mba diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index c7949dd44f2f3a..c97fd77a107dc6 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -26,6 +26,7 @@ MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local" MBA (Memory Bandwidth Allocation) "mba" SMBA (Slow Memory Bandwidth Allocation) "" BMEC (Bandwidth Monitoring Event Configuration) "" +ABMC (Assignable Bandwidth Monitoring Counters) "" =============================================== ================================ Historically, new features were made visible by default in /proc/cpuinfo. This diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index fbf019c1ff11bf..b07b12a058862b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -711,6 +711,7 @@ enum { RDT_FLAG_MBA, RDT_FLAG_SMBA, RDT_FLAG_BMEC, + RDT_FLAG_ABMC, }; #define RDT_OPT(idx, n, f) \ @@ -736,6 +737,7 @@ static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), + RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) From 5ad68c8f965fed78c61f2ac7aea933f06bb50032 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:06 -0500 Subject: [PATCH 1676/3206] x86,fs/resctrl: Consolidate monitoring related data from rdt_resource The cache allocation and memory bandwidth allocation feature properties are consolidated into struct resctrl_cache and struct resctrl_membw respectively. In preparation for more monitoring properties that will clobber the existing resource struct more, re-organize the monitoring specific properties to also be in a separate structure. Also convert "bandwidth sources" terminology to "memory transactions" to have consistency within resctrl for related monitoring features. [ bp: Massage commit message. ] Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- arch/x86/kernel/cpu/resctrl/monitor.c | 10 +++++----- fs/resctrl/rdtgroup.c | 6 +++--- include/linux/resctrl.h | 18 +++++++++++++----- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b07b12a058862b..267e9206a99926 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -107,7 +107,7 @@ u32 resctrl_arch_system_num_rmid_idx(void) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* RMID are independent numbers for x86. 
num_rmid_idx == num_rmid */ - return r->num_rmid; + return r->mon.num_rmid; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) @@ -541,7 +541,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { mon_domain_free(hw_dom); return; } diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f01db2034d08df..2558b1bdef8b54 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -130,7 +130,7 @@ static int logical_rmid_to_physical_rmid(int cpu, int lrmid) if (snc_nodes_per_l3_cache == 1) return lrmid; - return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; } static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) @@ -205,7 +205,7 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * continue; idx = MBM_STATE_IDX(eventid); memset(hw_dom->arch_mbm_states[idx], 0, - sizeof(*hw_dom->arch_mbm_states[0]) * r->num_rmid); + sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); } } @@ -344,7 +344,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; - r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; + r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -359,7 +359,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - threshold = resctrl_rmid_realloc_limit / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; /* * Because num_rmid may not be a power of two, round the value @@ -373,7 +373,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } r->mon_capable = true; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index a6047e9345cdab..b6ab107049939a 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1135,7 +1135,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->num_rmid); + seq_printf(seq, "%d\n", r->mon.num_rmid); return 0; } @@ -1731,9 +1731,9 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) } /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { + if ((val & r->mon.mbm_cfg_mask) != val) { rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); + r->mon.mbm_cfg_mask); return -EINVAL; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 478d7a935ca36e..fe2af6cb96d4b4 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -255,38 +255,46 @@ enum resctrl_schema_fmt { RESCTRL_SCHEMA_RANGE, }; +/** + * struct resctrl_mon - Monitoring related data of a resctrl resource. 
+ * @num_rmid: Number of RMIDs available.
+ * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth
+ *                monitoring events can be configured.
+ */
+struct resctrl_mon {
+        int num_rmid;
+        unsigned int mbm_cfg_mask;
+};
+
 /**
  * struct rdt_resource - attributes of a resctrl resource
  * @rid: The index of the resource
  * @alloc_capable: Is allocation available on this machine
  * @mon_capable: Is monitor feature available on this machine
- * @num_rmid: Number of RMIDs available
  * @ctrl_scope: Scope of this resource for control functions
  * @mon_scope: Scope of this resource for monitor functions
  * @cache: Cache allocation related data
  * @membw: If the component has bandwidth controls, their properties.
+ * @mon: Monitoring related data.
  * @ctrl_domains: RCU list of all control domains for this resource
  * @mon_domains: RCU list of all monitor domains for this resource
  * @name: Name to use in "schemata" file.
  * @schema_fmt: Which format string and parser is used for this schema.
- * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth
- *                monitoring events can be configured.
  * @cdp_capable: Is the CDP feature available on this resource
  */
 struct rdt_resource {
         int rid;
         bool alloc_capable;
         bool mon_capable;
-        int num_rmid;
         enum resctrl_scope ctrl_scope;
         enum resctrl_scope mon_scope;
         struct resctrl_cache cache;
         struct resctrl_membw membw;
+        struct resctrl_mon mon;
         struct list_head ctrl_domains;
         struct list_head mon_domains;
         char *name;
         enum resctrl_schema_fmt schema_fmt;
-        unsigned int mbm_cfg_mask;
         bool cdp_capable;
 };

From 13390861b426e936db20d675804a5b405622bc79 Mon Sep 17 00:00:00 2001
From: Babu Moger
Date: Fri, 5 Sep 2025 16:34:07 -0500
Subject: [PATCH 1677/3206] x86,fs/resctrl: Detect Assignable Bandwidth
 Monitoring feature details

ABMC feature details are reported via CPUID Fn8000_0020_EBX_x5:

  Bits   Description
  15:0   MAX_ABMC: Maximum Supported Assignable Bandwidth Monitoring
         Counter ID + 1

The ABMC feature details are documented in APM [1] available from [2].

[1] AMD64 Architecture Programmer's Manual Volume 2: System Programming
    Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable
    Bandwidth Monitoring (ABMC).

Detect the feature and the number of assignable counters supported. For
backward compatibility, upon detecting the assignable counter feature,
enable the mbm_total_bytes and mbm_local_bytes events that users are
familiar with from the original L3 MBM support.

Signed-off-by: Babu Moger
Signed-off-by: Borislav Petkov (AMD)
Reviewed-by: Reinette Chatre
Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com
Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2]
---
 arch/x86/kernel/cpu/resctrl/core.c    |  7 +++++--
 arch/x86/kernel/cpu/resctrl/monitor.c | 11 ++++++++---
 fs/resctrl/monitor.c                  |  7 +++++++
 include/linux/resctrl.h               |  4 ++++
 4 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 267e9206a99926..2e68aa02ad3f4c 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -883,6 +883,8 @@ static __init bool get_rdt_mon_resources(void)
                 resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
                 ret = true;
         }
+        if (rdt_cpu_has(X86_FEATURE_ABMC))
+                ret = true;
 
         if (!ret)
                 return false;
@@ -978,7 +980,7 @@ static enum cpuhp_state rdt_online;
 
 /* Runs once on the BSP during boot.
*/ void resctrl_cpu_detect(struct cpuinfo_x86 *c) { - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; c->x86_cache_mbm_width_offset = -1; @@ -990,7 +992,8 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) || + cpu_has(c, X86_FEATURE_ABMC)) { u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2558b1bdef8b54..0a695ce68f462d 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -339,6 +339,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; + u32 eax, ebx, ecx, edx; snc_nodes_per_l3_cache = snc_get_config(); @@ -368,14 +369,18 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); - if (rdt_cpu_has(X86_FEATURE_BMEC)) { - u32 eax, ebx, ecx, edx; - + if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } + if (rdt_cpu_has(X86_FEATURE_ABMC)) { + r->mon.mbm_cntr_assignable = true; + cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); + r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; + } + r->mon_capable = true; return 0; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index e0dfa5fb969e4f..b578451de2b50b 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -922,6 +922,13 @@ int resctrl_mon_resource_init(void) else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + if (r->mon.mbm_cntr_assignable) { + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + } + return 0; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index fe2af6cb96d4b4..eb80cc233be431 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -260,10 +260,14 @@ enum resctrl_schema_fmt { * @num_rmid: Number of RMIDs available. * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth * monitoring events can be configured. + * @num_mbm_cntrs: Number of assignable counters. + * @mbm_cntr_assignable:Is system capable of supporting counter assignment? */ struct resctrl_mon { int num_rmid; unsigned int mbm_cfg_mask; + int num_mbm_cntrs; + bool mbm_cntr_assignable; }; /** From faebbc58cde9d8f6050ac152c34c88195ed4abaa Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:08 -0500 Subject: [PATCH 1678/3206] x86/resctrl: Add support to enable/disable AMD ABMC feature Add the functionality to enable/disable the AMD ABMC feature. The AMD ABMC feature is enabled by setting enabled bit(0) in the L3_QOS_EXT_CFG MSR. When the state of ABMC is changed, the MSR needs to be updated on all the logical processors in the QOS Domain. Hardware counters will reset when ABMC state is changed. 
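In terms of the MSR interface, the per-CPU toggle used below amounts to
(MSR_IA32_L3_QOS_EXT_CFG is 0xc00003ff, ABMC_ENABLE_BIT is bit 0):

  if (enable)
          msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
  else
          msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);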
[ bp: Massage commit message. ] Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/resctrl/internal.h | 5 +++ arch/x86/kernel/cpu/resctrl/monitor.c | 45 ++++++++++++++++++++++++++ include/linux/resctrl.h | 20 ++++++++++++ 4 files changed, 71 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa1410..e4945e5c92ef80 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1223,6 +1223,7 @@ /* - AMD: */ #define MSR_IA32_MBA_BW_BASE 0xc0000200 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff #define MSR_IA32_EVT_CFG_BASE 0xc0000400 /* AMD-V MSRs */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 58dca892a5df22..a79a487e639c02 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,9 @@ struct arch_mbm_state { u64 prev_msr; }; +/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ +#define ABMC_ENABLE_BIT 0 + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -102,6 +105,7 @@ struct msr_param { * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource + * @mbm_cntr_assign_enabled: ABMC feature is enabled * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -115,6 +119,7 @@ struct rdt_hw_resource { unsigned int mon_scale; unsigned int mbm_width; bool cdp_enabled; + bool mbm_cntr_assign_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 0a695ce68f462d..cce35a0ad455de 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -399,3 +399,48 @@ void __init intel_rdt_mbm_apply_quirk(void) mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; mbm_cf = mbm_cf_table[cf_index].cf; } + +static void resctrl_abmc_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); +} + +/* + * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs + * associated with all monitor domains. 
+ */ +static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd, + &enable, 1); + resctrl_arch_reset_rmid_all(r, d); + } +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (r->mon.mbm_cntr_assignable && + hw_res->mbm_cntr_assign_enabled != enable) { + _resctrl_abmc_enable(r, enable); + hw_res->mbm_cntr_assign_enabled = enable; + } + + return 0; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; +} diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index eb80cc233be431..919806122c5095 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -445,6 +445,26 @@ static inline u32 resctrl_get_config_index(u32 closid, bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l); int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); +/** + * resctrl_arch_mbm_cntr_assign_enabled() - Check if MBM counter assignment + * mode is enabled. + * @r: Pointer to the resource structure. + * + * Return: + * true if the assignment mode is enabled, false otherwise. + */ +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); + +/** + * resctrl_arch_mbm_cntr_assign_set() - Configure the MBM counter assignment mode. + * @r: Pointer to the resource structure. + * @enable: Set to true to enable, false to disable the assignment mode. + * + * Return: + * 0 on success, < 0 on error. + */ +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. From 3b497c3f4f0427d940ec5c8600e840c8adc5cfbf Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:09 -0500 Subject: [PATCH 1679/3206] fs/resctrl: Introduce the interface to display monitoring modes Introduce the resctrl file "mbm_assign_mode" to list the supported counter assignment modes. The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Each event within a resctrl group can be assigned independently in this mode. On AMD systems "mbm_event" mode is backed by the ABMC (Assignable Bandwidth Monitoring Counters) hardware feature and is enabled by default. The "default" mode is the existing mode that works without explicit counter assignment, instead relying on dynamic counter assignment by hardware. Hardware may decline to dedicate a counter, resulting in monitoring data reads returning "Unavailable". Provide an interface to display the monitor modes on the system. $ cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode [mbm_event] default Add IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED) check to support Arm64. On x86, CONFIG_RESCTRL_ASSIGN_FIXED is not defined. On Arm64, it will be defined when the "mbm_event" mode is supported. Add IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED) check early to ensure the user interface remains compatible with upcoming Arm64 support. IS_ENABLED() safely evaluates to 0 when the configuration is not defined.
As a result, for MPAM, the display would be either: [default] or [mbm_event] Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 31 +++++++++++++++++++++++++++ fs/resctrl/internal.h | 4 ++++ fs/resctrl/monitor.c | 30 ++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 9 +++++++- 4 files changed, 73 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index c97fd77a107dc6..b692829fec5f6b 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -257,6 +257,37 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/mbm_local_bytes_config 0=0x30;1=0x30;3=0x15;4=0x15 +"mbm_assign_mode": + The supported counter assignment modes. The enclosed brackets indicate which mode + is enabled. + :: + + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + [mbm_event] + default + + "mbm_event": + + mbm_event mode allows users to assign a hardware counter to an RMID, event + pair and monitor the bandwidth usage as long as it is assigned. The hardware + continues to track the assigned counter until it is explicitly unassigned by + the user. Each event within a resctrl group can be assigned independently. + + In this mode, a monitoring event can only accumulate data while it is backed + by a hardware counter. Use "mbm_L3_assignments" found in each CTRL_MON and MON + group to specify which of the events should have a counter assigned. The number + of counters available is described in the "num_mbm_cntrs" file. Changing the + mode may cause all counters on the resource to reset. + + "default": + + In default mode, resctrl assumes there is a hardware counter for each + event within every CTRL_MON and MON group. On AMD platforms, it is + recommended to use the mbm_event mode, if supported, to prevent reset of MBM + events between reads resulting from hardware re-allocating counters. This can + result in misleading values or display "Unavailable" if no counter is assigned + to the event. 
+ "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 7a57366d1abce7..78aeb7ea38af26 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -382,6 +382,10 @@ bool closid_allocated(unsigned int closid); int resctrl_find_cleanest_closid(void); +void *rdt_kn_parent_priv(struct kernfs_node *kn); + +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b578451de2b50b..96231d517e711d 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -882,6 +882,36 @@ bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) mon_event_all[eventid].enabled; } +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool enabled; + + mutex_lock(&rdtgroup_mutex); + enabled = resctrl_arch_mbm_cntr_assign_enabled(r); + + if (r->mon.mbm_cntr_assignable) { + if (enabled) + seq_puts(s, "[mbm_event]\n"); + else + seq_puts(s, "[default]\n"); + + if (!IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED)) { + if (enabled) + seq_puts(s, "default\n"); + else + seq_puts(s, "mbm_event\n"); + } + } else { + seq_puts(s, "[default]\n"); + } + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. * diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index b6ab107049939a..144585a8599670 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -975,7 +975,7 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, return 0; } -static void *rdt_kn_parent_priv(struct kernfs_node *kn) +void *rdt_kn_parent_priv(struct kernfs_node *kn) { /* * The parent pointer is only valid within RCU section since it can be @@ -1911,6 +1911,13 @@ static struct rftype res_common_files[] = { .seq_show = mbm_local_bytes_config_show, .write = mbm_local_bytes_config_write, }, + { + .name = "mbm_assign_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_mode_show, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, + }, { .name = "cpus", .mode = 0644, From 8c793336eaf8893a29626155d74615fe9f03e7f2 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:10 -0500 Subject: [PATCH 1680/3206] fs/resctrl: Add resctrl file to display number of assignable counters The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Create 'num_mbm_cntrs' resctrl file that displays the number of counters supported in each domain. 'num_mbm_cntrs' is only visible to user space when the system supports "mbm_event" mode. 
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 11 +++++++++++ fs/resctrl/internal.h | 2 ++ fs/resctrl/monitor.c | 26 ++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 6 ++++++ 4 files changed, 45 insertions(+) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index b692829fec5f6b..4eb27530be6ffe 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -288,6 +288,17 @@ with the following files: result in misleading values or display "Unavailable" if no counter is assigned to the event. +"num_mbm_cntrs": + The maximum number of counters (total of available and assigned counters) in + each domain when the system supports mbm_event mode. + + For example, on a system with maximum of 32 memory bandwidth monitoring + counters in each of its L3 domains: + :: + + # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs + 0=32;1=32 + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 78aeb7ea38af26..7a12187eced821 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -386,6 +386,8 @@ void *rdt_kn_parent_priv(struct kernfs_node *kn); int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 96231d517e711d..112979e9c44453 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -912,6 +912,30 @@ int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, return 0; } +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + seq_printf(s, "%d=%d", dom->hdr.id, r->mon.num_mbm_cntrs); + sep = true; + } + seq_putc(s, '\n'); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return 0; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. 
* @@ -957,6 +981,8 @@ int resctrl_mon_resource_init(void) resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + resctrl_file_fflags_init("num_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 144585a8599670..9d95d01da3f9d3 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1836,6 +1836,12 @@ static struct rftype res_common_files[] = { .seq_show = rdt_default_ctrl_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, + { + .name = "num_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_num_mbm_cntrs_show, + }, { .name = "min_cbm_bits", .mode = 0444, From 4d32c24a74f2c12ff440d381ba01de574f6631ce Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:11 -0500 Subject: [PATCH 1681/3206] fs/resctrl: Introduce mbm_cntr_cfg to track assignable counters per domain The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Counters are assigned/unassigned at monitoring domain level. Manage a monitoring domain's hardware counters using a per monitoring domain array of struct mbm_cntr_cfg that is indexed by the hardware counter ID. A hardware counter's configuration contains the MBM event ID and points to the monitoring group that it is assigned to, with a NULL pointer meaning that the hardware counter is available for assignment. There is no direct way to determine which hardware counters are assigned to a particular monitoring group. Check every entry of every hardware counter configuration array in every monitoring domain to query which MBM events of a monitoring group are tracked by hardware. Such queries are acceptable because the number of assignable counters is very small (32 to 64). Suggested-by: Peter Newman Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- fs/resctrl/rdtgroup.c | 8 ++++++++ include/linux/resctrl.h | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 9d95d01da3f9d3..61f7b68f22730d 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -4029,6 +4029,7 @@ static void domain_destroy_mon_state(struct rdt_mon_domain *d) { int idx; + kfree(d->cntr_cfg); bitmap_free(d->rmid_busy_llc); for_each_mbm_idx(idx) { kfree(d->mbm_states[idx]); @@ -4112,6 +4113,13 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain goto cleanup; } + if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) { + tsize = sizeof(*d->cntr_cfg); + d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL); + if (!d->cntr_cfg) + goto cleanup; + } + return 0; cleanup: bitmap_free(d->rmid_busy_llc); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 919806122c5095..e013caba664147 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -156,6 +156,18 @@ struct rdt_ctrl_domain { u32 *mbps_val; }; +/** + * struct mbm_cntr_cfg - Assignable counter configuration. + * @evtid: MBM event to which the counter is assigned. Only valid + * if @rdtgrp is not NULL. + * @rdtgrp: resctrl group assigned to the counter.
NULL if the + * counter is free. + */ +struct mbm_cntr_cfg { + enum resctrl_event_id evtid; + struct rdtgroup *rdtgrp; +}; + /** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types @@ -168,6 +180,8 @@ struct rdt_ctrl_domain { * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters * @cqm_work_cpu: worker CPU for CQM h/w counters + * @cntr_cfg: array of assignable counters' configuration (indexed + * by counter ID) */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; @@ -178,6 +192,7 @@ struct rdt_mon_domain { struct delayed_work cqm_limbo; int mbm_work_cpu; int cqm_work_cpu; + struct mbm_cntr_cfg *cntr_cfg; }; /** From 16ff6b038fb3b64aa033efdc95d673239610e1a6 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:12 -0500 Subject: [PATCH 1682/3206] fs/resctrl: Introduce interface to display number of free MBM counters Introduce the "available_mbm_cntrs" resctrl file to display the number of counters available for assignment in each domain when "mbm_event" mode is enabled. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 11 +++++++ fs/resctrl/internal.h | 3 ++ fs/resctrl/monitor.c | 44 +++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 6 ++++ 4 files changed, 64 insertions(+) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 4eb27530be6ffe..446736dbd97f19 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -299,6 +299,17 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs 0=32;1=32 +"available_mbm_cntrs": + The number of counters available for assignment in each domain when mbm_event + mode is enabled on the system. 
+ + For example, on a system with 30 available [hardware] assignable counters + in each of its L3 domains: + :: + + # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs + 0=30;1=30 + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 7a12187eced821..4f372e80bf3738 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -388,6 +388,9 @@ int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, + void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 112979e9c44453..1fa82a62b2e570 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -936,6 +936,48 @@ int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, return 0; } +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + u32 cntrs, i; + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + cntrs = 0; + for (i = 0; i < r->mon.num_mbm_cntrs; i++) { + if (!dom->cntr_cfg[i].rdtgrp) + cntrs++; + } + + seq_printf(s, "%d=%u", dom->hdr.id, cntrs); + sep = true; + } + seq_putc(s, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. * @@ -983,6 +1025,8 @@ int resctrl_mon_resource_init(void) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("available_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 61f7b68f22730d..2e1d0a2703da4f 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1822,6 +1822,12 @@ static struct rftype res_common_files[] = { .seq_show = rdt_mon_features_show, .fflags = RFTYPE_MON_INFO, }, + { + .name = "available_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_available_mbm_cntrs_show, + }, { .name = "num_rmids", .mode = 0444, From 84ecefb766748916099f5b7444a973a623611d63 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:13 -0500 Subject: [PATCH 1683/3206] x86/resctrl: Add data structures and definitions for ABMC assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ABMC feature allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. The ABMC feature implements an MSR L3_QOS_ABMC_CFG (C000_03FDh). ABMC counter assignment is done by setting the counter id, bandwidth source (RMID) and bandwidth configuration. 
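Editor's worked example (not from the patch), using the field layout spelled out in the table that follows: assigning counter 5 to RMID 3 with counting enabled, and a hypothetical BwType mask of 0x7FFF, composes the MSR value as

	  (1ULL << 63)		/* CfgEn  = 1 */
	| (1ULL << 62)		/* CtrEn  = 1 */
	| (5ULL << 48)		/* CtrID  = 5 */
	| (3ULL << 32)		/* BwSrc  = 3, an RMID since IsCOS = 0 */
	| 0x7FFFULL		/* BwType */
	= 0xC005000300007FFF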
Attempts to read or write the MSR when ABMC is not enabled will result in a #GP(0) exception. Introduce the data structures and definitions for MSR L3_QOS_ABMC_CFG (0xC000_03FD):

=========================================================================
Bits    Mnemonic   Description                        Access   Reset
                                                      Type     Value
=========================================================================
63      CfgEn      Configuration Enable               R/W      0
62      CtrEn      Enable/disable counting            R/W      0
61:53   –          Reserved                           MBZ      0
52:48   CtrID      Counter Identifier                 R/W      0
47      IsCOS      BwSrc field is a CLOSID            R/W      0
                   (not an RMID)
46:44   –          Reserved                           MBZ      0
43:32   BwSrc      Bandwidth Source                   R/W      0
                   (RMID or CLOSID)
31:0    BwType     Bandwidth configuration            R/W      0
                   tracked by the CtrID
=========================================================================

The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). [ bp: Touchups. ] Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/resctrl/internal.h | 36 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e4945e5c92ef80..e3a445b43d7d6c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1223,6 +1223,7 @@ /* - AMD: */ #define MSR_IA32_MBA_BW_BASE 0xc0000200 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_L3_QOS_ABMC_CFG 0xc00003fd #define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff #define MSR_IA32_EVT_CFG_BASE 0xc0000400 /* AMD-V MSRs */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index a79a487e639c02..0444fea49b11f8 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -164,6 +164,42 @@ union cpuid_0x10_x_edx { unsigned int full; }; +/* + * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG. + * + * @bw_type : Event configuration that represents the memory + * transactions being tracked by the @cntr_id. + * @bw_src : Bandwidth source (RMID or CLOSID). + * @reserved1 : Reserved. + * @is_clos : @bw_src field is a CLOSID (not an RMID). + * @cntr_id : Counter identifier. + * @reserved : Reserved. + * @cntr_en : Counting enable bit. + * @cfg_en : Configuration enable bit. + * + * Configuration and counting: + * Counter can be configured across multiple writes to MSR. Configuration + * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the + * configuration is applied. + * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not + * count events. + * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start + * counting events.
+ */ +union l3_qos_abmc_cfg { + struct { + unsigned long bw_type :32, + bw_src :12, + reserved1: 3, + is_clos : 1, + cntr_id : 5, + reserved : 9, + cntr_en : 1, + cfg_en : 1; + } split; + unsigned long full; +}; + void rdt_ctrl_update(void *arg); int rdt_get_mon_l3_config(struct rdt_resource *r); From ebebda853633de389ba2c6737f8ca38405713e90 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:14 -0500 Subject: [PATCH 1684/3206] fs/resctrl: Introduce event configuration field in struct mon_evt When supported, mbm_event counter assignment mode allows the user to configure events to track specific types of memory transactions. Introduce an evt_cfg field in struct mon_evt to define the type of memory transactions tracked by a monitoring event. Also add a helper function to get the evt_cfg value. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- fs/resctrl/internal.h | 5 +++++ fs/resctrl/monitor.c | 10 ++++++++++ include/linux/resctrl.h | 2 ++ 3 files changed, 17 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 4f372e80bf3738..1cddfff007a24f 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -56,6 +56,10 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * @evtid: event id * @rid: resource id for this event * @name: name of the event + * @evt_cfg: Event configuration value that represents the + * memory transactions (e.g., READS_TO_LOCAL_MEM, + * READS_TO_REMOTE_MEM) being tracked by @evtid. + * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable * @enabled: true if the event is enabled */ @@ -63,6 +67,7 @@ struct mon_evt { enum resctrl_event_id evtid; enum resctrl_res_level rid; char *name; + u32 evt_cfg; bool configurable; bool enabled; }; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 1fa82a62b2e570..f714e7baaea650 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -882,6 +882,11 @@ bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) mon_event_all[eventid].enabled; } +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) +{ + return mon_event_all[evtid].evt_cfg; +} + int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { @@ -1023,6 +1028,11 @@ int resctrl_mon_resource_init(void) resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("available_mbm_cntrs", diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index e013caba664147..87daa4ca312ddf 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -409,6 +409,8 @@ static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); } +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id eventid); + /* Iterate over all memory bandwidth events */ #define for_each_mbm_event_id(eventid) \ for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ From 3104fc98cba139e55107df6c2e60593423a5a799 Mon Sep 17 00:00:00 2001 From: Zhe Qiao Date: 
Fri, 12 Sep 2025 21:52:39 +0200 Subject: [PATCH 1685/3206] ACPICA: Modify variable definition position To prevent potential undefined risks. Link: https://github.com/acpica/acpica/commit/efc4d8ab Signed-off-by: Zhe Qiao Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/psopinfo.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/acpica/psopinfo.c b/drivers/acpi/acpica/psopinfo.c index 1c8044ffcb97c5..5168b4a0467072 100644 --- a/drivers/acpi/acpica/psopinfo.c +++ b/drivers/acpi/acpica/psopinfo.c @@ -35,7 +35,7 @@ static const u8 acpi_gbl_argument_count[] = const struct acpi_opcode_info *acpi_ps_get_opcode_info(u16 opcode) { #ifdef ACPI_DEBUG_OUTPUT - const char *opcode_name = "Unknown AML opcode"; + #endif ACPI_FUNCTION_NAME(ps_get_opcode_info); @@ -62,6 +62,8 @@ const struct acpi_opcode_info *acpi_ps_get_opcode_info(u16 opcode) #if defined ACPI_ASL_COMPILER && defined ACPI_DEBUG_OUTPUT #include "asldefine.h" + const char *opcode_name = "Unknown AML opcode"; + switch (opcode) { case AML_RAW_DATA_BYTE: opcode_name = "-Raw Data Byte-"; @@ -102,11 +104,11 @@ const struct acpi_opcode_info *acpi_ps_get_opcode_info(u16 opcode) default: break; } -#endif /* Unknown AML opcode */ ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "%s [%4.4X]\n", opcode_name, opcode)); +#endif return (&acpi_gbl_aml_op_info[_UNK]); } From 2926d3753aa1d839df8a6a349c713c1ebcbf8087 Mon Sep 17 00:00:00 2001 From: Zhe Qiao Date: Fri, 12 Sep 2025 21:53:30 +0200 Subject: [PATCH 1686/3206] ACPICA: Remove redundant "#ifdef" definitions Link: https://github.com/acpica/acpica/commit/204776a3 Signed-off-by: Zhe Qiao Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/psopinfo.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/acpi/acpica/psopinfo.c b/drivers/acpi/acpica/psopinfo.c index 5168b4a0467072..ca867d9407f9c0 100644 --- a/drivers/acpi/acpica/psopinfo.c +++ b/drivers/acpi/acpica/psopinfo.c @@ -34,9 +34,6 @@ static const u8 acpi_gbl_argument_count[] = const struct acpi_opcode_info *acpi_ps_get_opcode_info(u16 opcode) { -#ifdef ACPI_DEBUG_OUTPUT - -#endif ACPI_FUNCTION_NAME(ps_get_opcode_info); From 9d2f57fee5a0bf5b3f7ef690ba90d8d8ba1c0c96 Mon Sep 17 00:00:00 2001 From: Zhe Qiao Date: Fri, 12 Sep 2025 21:54:07 +0200 Subject: [PATCH 1687/3206] ACPICA: Change the compilation conditions To prevent the risk of undefined variables. Link: https://github.com/acpica/acpica/commit/9f86d4c9 Signed-off-by: Zhe Qiao Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpica/psopinfo.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/acpica/psopinfo.c b/drivers/acpi/acpica/psopinfo.c index ca867d9407f9c0..532ea307a67548 100644 --- a/drivers/acpi/acpica/psopinfo.c +++ b/drivers/acpi/acpica/psopinfo.c @@ -34,6 +34,9 @@ static const u8 acpi_gbl_argument_count[] = const struct acpi_opcode_info *acpi_ps_get_opcode_info(u16 opcode) { +#if defined ACPI_ASL_COMPILER && defined ACPI_DEBUG_OUTPUT + const char *opcode_name = "Unknown AML opcode"; +#endif ACPI_FUNCTION_NAME(ps_get_opcode_info); @@ -59,8 +62,6 @@ const struct acpi_opcode_info *acpi_ps_get_opcode_info(u16 opcode) #if defined ACPI_ASL_COMPILER && defined ACPI_DEBUG_OUTPUT #include "asldefine.h" - const char *opcode_name = "Unknown AML opcode"; - switch (opcode) { case AML_RAW_DATA_BYTE: opcode_name = "-Raw Data Byte-"; From feb8ae81b2378b75a99c81d315602ac8918ed382 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 12 Sep 2025 21:54:53 +0200 Subject: [PATCH 1688/3206] ACPICA: Allow to skip Global Lock initialization Introduce acpi_gbl_use_global_lock, which allows to skip the Global Lock initialization. This is useful for systems without Global Lock (such as loong_arch), so as to avoid error messages during boot phase: ACPI Error: Could not enable global_lock event (20240827/evxfevnt-182) ACPI Error: No response from Global Lock hardware, disabling lock (20240827/evglock-59) Link: https://github.com/acpica/acpica/commit/463cb0fe Signed-off-by: Huacai Chen Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evglock.c | 4 ++++ include/acpi/acpixf.h | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/drivers/acpi/acpica/evglock.c b/drivers/acpi/acpica/evglock.c index fa3e0d00d1ca96..df2a4ab0e0da9d 100644 --- a/drivers/acpi/acpica/evglock.c +++ b/drivers/acpi/acpica/evglock.c @@ -42,6 +42,10 @@ acpi_status acpi_ev_init_global_lock_handler(void) return_ACPI_STATUS(AE_OK); } + if (!acpi_gbl_use_global_lock) { + return_ACPI_STATUS(AE_OK); + } + /* Attempt installation of the global lock handler */ status = acpi_install_fixed_event_handler(ACPI_EVENT_GLOBAL, diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index b49396aa405812..97c25ae8a36e3d 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -213,6 +213,12 @@ ACPI_INIT_GLOBAL(u8, acpi_gbl_osi_data, 0); */ ACPI_INIT_GLOBAL(u8, acpi_gbl_reduced_hardware, FALSE); +/* + * ACPI Global Lock is mainly used for systems with SMM, so no-SMM systems + * (such as loong_arch) may not have and not use Global Lock. + */ +ACPI_INIT_GLOBAL(u8, acpi_gbl_use_global_lock, TRUE); + /* * Maximum timeout for While() loop iterations before forced method abort. * This mechanism is intended to prevent infinite loops during interpreter From 12fd607554c4efb4856959f0e5823f541bc0e701 Mon Sep 17 00:00:00 2001 From: Ahmed Salem Date: Fri, 12 Sep 2025 21:55:35 +0200 Subject: [PATCH 1689/3206] ACPICA: Apply ACPI_NONSTRING Add ACPI_NONSTRING for destination char arrays without a terminating NUL character. This is a follow-up to commit 2b82118845e0 ("ACPICA: Apply ACPI_NONSTRING") where a few more destination arrays were missed. Link: https://github.com/acpica/acpica/commit/f359e5ed Fixes: 2b82118845e0 ("ACPICA: Apply ACPI_NONSTRING") Signed-off-by: Ahmed Salem Signed-off-by: Rafael J. 
Wysocki --- include/acpi/actbl.h | 2 +- tools/power/acpi/os_specific/service_layers/oslinuxtbl.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index 243097a3da6360..8a67d4ea6e3feb 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -73,7 +73,7 @@ struct acpi_table_header { char oem_id[ACPI_OEM_ID_SIZE] ACPI_NONSTRING; /* ASCII OEM identification */ char oem_table_id[ACPI_OEM_TABLE_ID_SIZE] ACPI_NONSTRING; /* ASCII OEM table identification */ u32 oem_revision; /* OEM revision number */ - char asl_compiler_id[ACPI_NAMESEG_SIZE]; /* ASCII ASL compiler vendor ID */ + char asl_compiler_id[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; /* ASCII ASL compiler vendor ID */ u32 asl_compiler_revision; /* ASL compiler version */ }; diff --git a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c index 9741e7503591c1..de93067a5da320 100644 --- a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c +++ b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c @@ -995,7 +995,7 @@ static acpi_status osl_list_customized_tables(char *directory) { void *table_dir; u32 instance; - char temp_name[ACPI_NAMESEG_SIZE]; + char temp_name[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; char *filename; acpi_status status = AE_OK; @@ -1312,7 +1312,7 @@ osl_get_customized_table(char *pathname, { void *table_dir; u32 current_instance = 0; - char temp_name[ACPI_NAMESEG_SIZE]; + char temp_name[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; char table_filename[PATH_MAX]; char *filename; acpi_status status; From 4b020eff6633af45fd714ff08cec95341da834ad Mon Sep 17 00:00:00 2001 From: Ahmed Salem Date: Fri, 12 Sep 2025 21:56:18 +0200 Subject: [PATCH 1690/3206] ACPICA: iASL: Fix printing CDAT table header When disassembling CDAT AML, the lack of CDAT's common ACPI table header should be taken into consideration when printing the table's actual header. As the table header lacks fields like Signature, header information is mangled and incorrect: # $ iasl -d -ds CDAT cdat.aml # Input file cdat.aml, Length 0xE4 (228) bytes # ACPI: ? 0x0000000000000000 000C01 (v00 ? 00180000 ?? 23456789) Both Signature and Length are printed incorrectly, and the remaining fields are not applicable for this special table. To fix this, verify acpi_gbl_CDAT is set (<-ds CDAT> via command line) and Header->Signature is an invalid nameseg (due to the presence of non-ASCII char(s)), then print only the explicitly-passed signature (ACPI_SIG_CDAT) and the table length as cast with a pointer to the struct acpi_table_cdat structure. Link: https://github.com/acpica/acpica/commit/2b5586b4 Signed-off-by: Ahmed Salem Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpica/tbprint.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/acpi/acpica/tbprint.c b/drivers/acpi/acpica/tbprint.c index fd64460a2e2603..049f6c2f1e321d 100644 --- a/drivers/acpi/acpica/tbprint.c +++ b/drivers/acpi/acpica/tbprint.c @@ -121,6 +121,14 @@ acpi_tb_print_table_header(acpi_physical_address address, ACPI_CAST_PTR(struct acpi_table_rsdp, header)->revision, local_header.oem_id)); + } else if (acpi_gbl_CDAT && !acpi_ut_valid_nameseg(header->signature)) { + + /* CDAT does not use the common ACPI table header */ + + ACPI_INFO(("%-4.4s 0x%8.8X%8.8X %06X", + ACPI_SIG_CDAT, ACPI_FORMAT_UINT64(address), + ACPI_CAST_PTR(struct acpi_table_cdat, + header)->length)); } else { /* Standard ACPI table with full common header */ From 16ae95800b1cc46c0d69d8d90c9c7be488421a40 Mon Sep 17 00:00:00 2001 From: Ahmed Salem Date: Fri, 12 Sep 2025 21:58:25 +0200 Subject: [PATCH 1691/3206] ACPICA: acpidump: drop ACPI_NONSTRING attribute from file_name Partially revert commit 70662db73d54 ("ACPICA: Apply ACPI_NONSTRING in more places") as I've yet again incorrectly applied the ACPI_NONSTRING attribute where it is not needed. A warning was initially reported by Collin Funk [1], and further review by Jiri Slaby [2] highlighted another issue related to the same commit. Drop the ACPI_NONSTRING attribute to fix the issue. Fixes: 70662db73d54 ("ACPICA: Apply ACPI_NONSTRING in more places") Link: https://lore.kernel.org/all/87ecvpcypw.fsf@gmail.com [1] Link: https://lore.kernel.org/all/5c210121-c9b8-4458-b1ad-0da24732ac72@kernel.org [2] Link: https://github.com/acpica/acpica/commit/a6ee09ca Reported-by: Collin Funk Signed-off-by: Ahmed Salem Cc: 6.16+ # 6.16+ Signed-off-by: Rafael J. Wysocki --- tools/power/acpi/tools/acpidump/apfiles.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/acpi/tools/acpidump/apfiles.c b/tools/power/acpi/tools/acpidump/apfiles.c index 75db0091e2758a..d6b8a201480b75 100644 --- a/tools/power/acpi/tools/acpidump/apfiles.c +++ b/tools/power/acpi/tools/acpidump/apfiles.c @@ -103,7 +103,7 @@ int ap_open_output_file(char *pathname) int ap_write_to_binary_file(struct acpi_table_header *table, u32 instance) { - char filename[ACPI_NAMESEG_SIZE + 16] ACPI_NONSTRING; + char filename[ACPI_NAMESEG_SIZE + 16]; char instance_str[16]; ACPI_FILE file; acpi_size actual; From 22c65572eff14a6e9546a9dbaa333619eb5505ab Mon Sep 17 00:00:00 2001 From: Ahmed Salem Date: Fri, 12 Sep 2025 21:59:17 +0200 Subject: [PATCH 1692/3206] ACPICA: Debugger: drop ACPI_NONSTRING attribute from name_seg ACPICA commit 4623b3369f3aa1ec5229d461e91c514510a96912 Partially revert commit 70662db73d54 ("ACPICA: Apply ACPI_NONSTRING in more places") as I've yet again incorrectly applied the ACPI_NONSTRING attribute where it is not needed. A warning was initially reported by Collin Funk [1], and further review by Jiri Slaby [2] highlighted another issue related to the same commit. Drop the ACPI_NONSTRING attribute to fix the issue. Fixes: 70662db73d54 ("ACPICA: Apply ACPI_NONSTRING in more places") Link: https://lore.kernel.org/all/87ecvpcypw.fsf@gmail.com [1] Link: https://lore.kernel.org/all/5c210121-c9b8-4458-b1ad-0da24732ac72@kernel.org [2] Link: https://github.com/acpica/acpica/commit/4623b336 Reported-by: Jiri Slaby Signed-off-by: Ahmed Salem Cc: 6.16+ # 6.16+ Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpica/acdebug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/acpica/acdebug.h b/drivers/acpi/acpica/acdebug.h index fe6d38b43c9a5c..91241bd6917a43 100644 --- a/drivers/acpi/acpica/acdebug.h +++ b/drivers/acpi/acpica/acdebug.h @@ -37,7 +37,7 @@ struct acpi_db_argument_info { struct acpi_db_execute_walk { u32 count; u32 max_count; - char name_seg[ACPI_NAMESEG_SIZE + 1] ACPI_NONSTRING; + char name_seg[ACPI_NAMESEG_SIZE + 1]; }; #define PARAM_LIST(pl) pl From e9dff11a7a50fcef23fe3e8314fafae6d5641826 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 12 Sep 2025 22:00:17 +0200 Subject: [PATCH 1693/3206] ACPICA: dispatcher: Use acpi_ds_clear_operands() in acpi_ds_call_control_method() When deleting the previous walkstate operand stack acpi_ds_call_control_method() was deleting obj_desc->Method.param_count operands. But Method.param_count does not necessarily match this_walk_state->num_operands, it may be either less or more. After correcting the for loop to check `i < this_walk_state->num_operands` the code is identical to acpi_ds_clear_operands(), so just outright replace the code with acpi_ds_clear_operands() to fix this. Link: https://github.com/acpica/acpica/commit/53fc0220 Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/dsmethod.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/acpi/acpica/dsmethod.c b/drivers/acpi/acpica/dsmethod.c index fef6fb29ece4d6..e707a703680264 100644 --- a/drivers/acpi/acpica/dsmethod.c +++ b/drivers/acpi/acpica/dsmethod.c @@ -546,14 +546,7 @@ acpi_ds_call_control_method(struct acpi_thread_state *thread, * Delete the operands on the previous walkstate operand stack * (they were copied to new objects) */ - for (i = 0; i < obj_desc->method.param_count; i++) { - acpi_ut_remove_reference(this_walk_state->operands[i]); - this_walk_state->operands[i] = NULL; - } - - /* Clear the operand stack */ - - this_walk_state->num_operands = 0; + acpi_ds_clear_operands(this_walk_state); ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "**** Begin nested execution of [%4.4s] **** WalkState=%p\n", From 761dc71c6020d6aa68666e96373342d49a7e9d0a Mon Sep 17 00:00:00 2001 From: Saket Dumbre Date: Fri, 12 Sep 2025 22:01:04 +0200 Subject: [PATCH 1694/3206] ACPICA: Update dsmethod.c to get rid of unused variable warning All the 3 major C compilers (MSVC, GCC, LLVM/Clang) warn about the unused variable i after the removal of its usage by PR #1031 addressing Issue #1027 Link: https://github.com/acpica/acpica/commit/6d235320 Signed-off-by: Saket Dumbre Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/dsmethod.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/acpi/acpica/dsmethod.c b/drivers/acpi/acpica/dsmethod.c index e707a703680264..b2f756b7078d34 100644 --- a/drivers/acpi/acpica/dsmethod.c +++ b/drivers/acpi/acpica/dsmethod.c @@ -462,7 +462,6 @@ acpi_ds_call_control_method(struct acpi_thread_state *thread, struct acpi_walk_state *next_walk_state = NULL; union acpi_operand_object *obj_desc; struct acpi_evaluate_info *info; - u32 i; ACPI_FUNCTION_TRACE_PTR(ds_call_control_method, this_walk_state); From e2c80b3c23782d809b4e15da7d8a0135a8d350b5 Mon Sep 17 00:00:00 2001 From: Saket Dumbre Date: Fri, 12 Sep 2025 22:01:52 +0200 Subject: [PATCH 1695/3206] ACPICA: Print error messages for too few or too many arguments Fix Issue #1027 by displaying error messages when there are too few or too many arguments in the caller vs the definition of an ASL/AML method. 
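For example (editor's illustration, not from the commit): calling a control method defined as Method (MTH0, 2) with one argument now fails with AE_AML_TOO_FEW_ARGUMENTS instead of the generic AE_AML_UNINITIALIZED_ARG, and calling it with three arguments now fails with AE_AML_TOO_MANY_ARGUMENTS where the mismatch previously went undetected.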
Link: https://github.com/acpica/acpica/commit/cbc243e4 Reported-by: Peter Williams Signed-off-by: Rafael J. Wysocki Tested-by: Hans de Goede Signed-off-by: Saket Dumbre --- drivers/acpi/acpica/dsmethod.c | 11 +++++++++-- include/acpi/acexcep.h | 10 ++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/acpica/dsmethod.c b/drivers/acpi/acpica/dsmethod.c index b2f756b7078d34..45ec32e81903ab 100644 --- a/drivers/acpi/acpica/dsmethod.c +++ b/drivers/acpi/acpica/dsmethod.c @@ -483,10 +483,17 @@ acpi_ds_call_control_method(struct acpi_thread_state *thread, } if (this_walk_state->num_operands < obj_desc->method.param_count) { - ACPI_ERROR((AE_INFO, "Missing argument for method [%4.4s]", + ACPI_ERROR((AE_INFO, "Missing argument(s) for method [%4.4s]", acpi_ut_get_node_name(method_node))); - return_ACPI_STATUS(AE_AML_UNINITIALIZED_ARG); + return_ACPI_STATUS(AE_AML_TOO_FEW_ARGUMENTS); + } + + else if (this_walk_state->num_operands > obj_desc->method.param_count) { + ACPI_ERROR((AE_INFO, "Too many arguments for method [%4.4s]", + acpi_ut_get_node_name(method_node))); + + return_ACPI_STATUS(AE_AML_TOO_MANY_ARGUMENTS); } /* Init for new method, possibly wait on method mutex */ diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index 53c98f5fe3c361..a2db36d18419a1 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -173,8 +173,10 @@ struct acpi_exception_info { #define AE_AML_TARGET_TYPE EXCEP_AML (0x0023) #define AE_AML_PROTOCOL EXCEP_AML (0x0024) #define AE_AML_BUFFER_LENGTH EXCEP_AML (0x0025) +#define AE_AML_TOO_FEW_ARGUMENTS EXCEP_AML (0x0026) +#define AE_AML_TOO_MANY_ARGUMENTS EXCEP_AML (0x0027) -#define AE_CODE_AML_MAX 0x0025 +#define AE_CODE_AML_MAX 0x0027 /* * Internal exceptions used for control @@ -353,7 +355,11 @@ static const struct acpi_exception_info acpi_gbl_exception_names_aml[] = { "A target operand of an incorrect type was encountered"), EXCEP_TXT("AE_AML_PROTOCOL", "Violation of a fixed ACPI protocol"), EXCEP_TXT("AE_AML_BUFFER_LENGTH", - "The length of the buffer is invalid/incorrect") + "The length of the buffer is invalid/incorrect"), + EXCEP_TXT("AE_AML_TOO_FEW_ARGUMENTS", + "There are fewer than expected method arguments"), + EXCEP_TXT("AE_AML_TOO_MANY_ARGUMENTS", + "There are too many arguments for this method") }; static const struct acpi_exception_info acpi_gbl_exception_names_ctrl[] = { From b0fb6891b8ad55cb5767edf94351ffe4f0b9a404 Mon Sep 17 00:00:00 2001 From: Saket Dumbre Date: Fri, 12 Sep 2025 22:02:32 +0200 Subject: [PATCH 1696/3206] ACPICA: Update version to 20250807 Link: https://github.com/acpica/acpica/commit/0845a773 Signed-off-by: Saket Dumbre Signed-off-by: Rafael J. Wysocki --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 97c25ae8a36e3d..e65a2afe92504b 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -12,7 +12,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20250404 +#define ACPI_CA_VERSION 0x20250807 #include #include From 8ca944fea4d6d9019e01f2d6f6e766f315a9d73f Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 12 Sep 2025 22:03:16 +0200 Subject: [PATCH 1697/3206] ACPICA: Fix largest possible resource descriptor index ACPI_RESOURCE_NAME_LARGE_MAX should be equal to the last actually used resource descriptor index (ACPI_RESOURCE_NAME_CLOCK_INPUT). 
Otherwise 'resource_index' in 'acpi_ut_validate_resource()' may be clamped incorrectly and resulting value may issue an out-of-bounds access for 'acpi_gbl_resource_types' array. Compile tested only. Fixes: 520d4a0ee5b6 ("ACPICA: add support for ClockInput resource (v6.5)") Link: https://github.com/acpica/acpica/commit/cf00116c Link: https://marc.info/?l=linux-acpi&m=175449676131260&w=2 Signed-off-by: Dmitry Antipov Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/aclocal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index 0c41f0097e8d71..f98640086f4ef3 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -1141,7 +1141,7 @@ struct acpi_port_info { #define ACPI_RESOURCE_NAME_PIN_GROUP_FUNCTION 0x91 #define ACPI_RESOURCE_NAME_PIN_GROUP_CONFIG 0x92 #define ACPI_RESOURCE_NAME_CLOCK_INPUT 0x93 -#define ACPI_RESOURCE_NAME_LARGE_MAX 0x94 +#define ACPI_RESOURCE_NAME_LARGE_MAX 0x93 /***************************************************************************** * From eddf12041ddd8c40289015f42285c5ac614b5c30 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 12 Sep 2025 22:04:04 +0200 Subject: [PATCH 1698/3206] ACPICA: CEDT: Add Back-Invalidate restriction to CXL Window This is added in newer version (3.0+) of the CXL Spec to support the HDM-DB coherency model. Link: https://github.com/acpica/acpica/commit/a6886da1 Signed-off-by: Davidlohr Bueso Signed-off-by: Rafael J. Wysocki --- include/acpi/actbl1.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 99fd1588ff3822..0b4c332df25c3e 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -565,6 +565,7 @@ struct acpi_cedt_cfmws_target_element { #define ACPI_CEDT_CFMWS_RESTRICT_VOLATILE (1<<2) #define ACPI_CEDT_CFMWS_RESTRICT_PMEM (1<<3) #define ACPI_CEDT_CFMWS_RESTRICT_FIXED (1<<4) +#define ACPI_CEDT_CFMWS_RESTRICT_BI (1<<5) /* 2: CXL XOR Interleave Math Structure */ From 81f92cff6d423f86ed7cbc9d7fa967c6c92f2fa8 Mon Sep 17 00:00:00 2001 From: Saket Dumbre Date: Fri, 12 Sep 2025 22:05:01 +0200 Subject: [PATCH 1699/3206] ACPICA: ACPI_TYPE_ANY does not include the package type So add it to the list of acceptable Arg3 types for _DSM. Link: https://github.com/acpica/acpica/commit/6eb81e7c Signed-off-by: Saket Dumbre Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/acpredef.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h index 76c5ed02e91645..da2c45880cc7e9 100644 --- a/drivers/acpi/acpica/acpredef.h +++ b/drivers/acpi/acpica/acpredef.h @@ -450,7 +450,8 @@ const union acpi_predefined_info acpi_gbl_predefined_methods[] = { {{"_DSM", METHOD_4ARGS(ACPI_TYPE_BUFFER, ACPI_TYPE_INTEGER, ACPI_TYPE_INTEGER, - ACPI_TYPE_ANY) | ARG_COUNT_IS_MINIMUM, + ACPI_TYPE_ANY | ACPI_TYPE_PACKAGE) | + ARG_COUNT_IS_MINIMUM, METHOD_RETURNS(ACPI_RTYPE_ALL)}}, /* Must return a value, but it can be of any type */ {{"_DSS", METHOD_1ARGS(ACPI_TYPE_INTEGER), From 54ba9071a04b0d05086826c442c2de5830fce5bf Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Fri, 12 Sep 2025 22:06:08 +0200 Subject: [PATCH 1700/3206] ACPICA: acpidump: fix return values in ap_is_valid_checksum() The function ap_is_valid_checksum() has a boolean name suggesting it should return TRUE/FALSE, but incorrectly returns AE_OK on success and has no explicit return on failure, leading to undefined behavior. 
Fix by returning proper values: - FALSE when checksum validation fails - TRUE when checksum validation succeeds Link: https://github.com/acpica/acpica/commit/479ba862 Signed-off-by: Kaushlendra Kumar Signed-off-by: Rafael J. Wysocki --- tools/power/acpi/tools/acpidump/apdump.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/power/acpi/tools/acpidump/apdump.c b/tools/power/acpi/tools/acpidump/apdump.c index bf30143efbdcb1..7a6223aa703c39 100644 --- a/tools/power/acpi/tools/acpidump/apdump.c +++ b/tools/power/acpi/tools/acpidump/apdump.c @@ -86,9 +86,10 @@ u8 ap_is_valid_checksum(struct acpi_table_header *table) if (ACPI_FAILURE(status)) { fprintf(stderr, "%4.4s: Warning: wrong checksum in table\n", table->signature); + return (FALSE); } - return (AE_OK); + return (TRUE); } /****************************************************************************** From f7a4fb22312646329ba21bc58958fd83fb9fc15d Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:15 -0500 Subject: [PATCH 1701/3206] x86,fs/resctrl: Implement resctrl_arch_config_cntr() to assign a counter with ABMC The ABMC feature allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Implement an x86 architecture-specific handler to configure a counter. This architecture specific handler is called by resctrl fs when a counter is assigned or unassigned as well as when an already assigned counter's configuration should be updated. Configure counters by writing to the L3_QOS_ABMC_CFG MSR, specifying the counter ID, bandwidth source (RMID), and event configuration. The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] --- arch/x86/kernel/cpu/resctrl/monitor.c | 36 +++++++++++++++++++++++++++ include/linux/resctrl.h | 19 ++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index cce35a0ad455de..ed295a6c5e6677 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -444,3 +444,39 @@ bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; } + +static void resctrl_abmc_config_one_amd(void *info) +{ + union l3_qos_abmc_cfg *abmc_cfg = info; + + wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full); +} + +/* + * Send an IPI to the domain to assign the counter to RMID, event pair. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + union l3_qos_abmc_cfg abmc_cfg = { 0 }; + struct arch_mbm_state *am; + + abmc_cfg.split.cfg_en = 1; + abmc_cfg.split.cntr_en = assign ? 
1 : 0; + abmc_cfg.split.cntr_id = cntr_id; + abmc_cfg.split.bw_src = rmid; + if (assign) + abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid); + + smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1); + + /* + * The hardware counter is reset (because cfg_en == 1) so there is no + * need to record initial non-zero counts. + */ + am = get_arch_mbm_state(hw_dom, rmid, evtid); + if (am) + memset(am, 0, sizeof(*am)); +} diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 87daa4ca312ddf..50e38445183a86 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -594,6 +594,25 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * */ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); +/** + * resctrl_arch_config_cntr() - Configure the counter with its new RMID + * and event details. + * @r: Resource structure. + * @d: The domain in which counter with ID @cntr_id should be configured. + * @evtid: Monitoring event type (e.g., QOS_L3_MBM_TOTAL_EVENT_ID + * or QOS_L3_MBM_LOCAL_EVENT_ID). + * @rmid: RMID. + * @closid: CLOSID. + * @cntr_id: Counter ID to configure. + * @assign: True to assign the counter or update an existing assignment, + * false to unassign the counter. + * + * This can be called from any CPU. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; From bd85310efd71b9e7809e1b95fe7a60fde42e62db Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:16 -0500 Subject: [PATCH 1702/3206] fs/resctrl: Add the functionality to assign MBM events When supported, "mbm_event" counter assignment mode offers "num_mbm_cntrs" number of counters that can be assigned to RMID, event pairs and monitor bandwidth usage as long as it is assigned. Add the functionality to allocate and assign a counter to an RMID, event pair in the domain. Also, add the helper rdtgroup_assign_cntrs() to assign counters in the group. Log the error message "Failed to allocate counter for <event> in domain <domain id>" in /sys/fs/resctrl/info/last_cmd_status if all the counters are in use. Exit on the first failure when assigning counters across all the domains. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- fs/resctrl/internal.h | 2 + fs/resctrl/monitor.c | 156 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 1cddfff007a24f..762705d7eb8db3 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -396,6 +396,8 @@ int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index f714e7baaea650..106e9bdb8a9d83 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -356,6 +356,55 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, return state ? &state[idx] : NULL; } +/* + * mbm_cntr_get() - Return the counter ID for the matching @evtid and @rdtgrp.
+ * + * Return: + * Valid counter ID on success, or -ENOENT on failure. + */ +static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + if (!r->mon.mbm_cntr_assignable) + return -ENOENT; + + if (!resctrl_is_mbm_event(evtid)) + return -ENOENT; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (d->cntr_cfg[cntr_id].rdtgrp == rdtgrp && + d->cntr_cfg[cntr_id].evtid == evtid) + return cntr_id; + } + + return -ENOENT; +} + +/* + * mbm_cntr_alloc() - Initialize and return a new counter ID in the domain @d. + * Caller must ensure that the specified event is not assigned already. + * + * Return: + * Valid counter ID on success, or -ENOSPC on failure. + */ +static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (!d->cntr_cfg[cntr_id].rdtgrp) { + d->cntr_cfg[cntr_id].rdtgrp = rdtgrp; + d->cntr_cfg[cntr_id].evtid = evtid; + return cntr_id; + } + } + + return -ENOSPC; +} + static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); @@ -887,6 +936,113 @@ u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) return mon_event_all[evtid].evt_cfg; } +/* + * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID + * pair in the domain. + * + * Assign the counter if @assign is true else unassign the counter. Reset the + * associated non-architectural state. + */ +static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct mbm_state *m; + + resctrl_arch_config_cntr(r, d, evtid, rmid, closid, cntr_id, assign); + + m = get_mbm_state(d, closid, rmid, evtid); + if (m) + memset(m, 0, sizeof(*m)); +} + +/* + * rdtgroup_alloc_assign_cntr() - Allocate a counter ID and assign it to the event + * pointed to by @mevt and the resctrl group @rdtgrp within the domain @d. + * + * Return: + * 0 on success, < 0 on failure. + */ +static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + /* No action required if the counter is assigned already. */ + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + if (cntr_id >= 0) + return 0; + + cntr_id = mbm_cntr_alloc(r, d, rdtgrp, mevt->evtid); + if (cntr_id < 0) { + rdt_last_cmd_printf("Failed to allocate counter for %s in domain %d\n", + mevt->name, d->hdr.id); + return cntr_id; + } + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, true); + + return 0; +} + +/* + * rdtgroup_assign_cntr_event() - Assign a hardware counter for the event in + * @mevt to the resctrl group @rdtgrp. Assign counters to all domains if @d is + * NULL; otherwise, assign the counter to the specified domain @d. + * + * If all counters in a domain are already in use, rdtgroup_alloc_assign_cntr() + * will fail. The assignment process will abort at the first failure encountered + * during domain traversal, which may result in the event being only partially + * assigned. + * + * Return: + * 0 on success, < 0 on failure. 
+ */ +static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + int ret = 0; + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + if (ret) + return ret; + } + } else { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + } + + return ret; +} + +/* + * rdtgroup_assign_cntrs() - Assign counters to MBM events. Called when + * a new group is created. + * + * Each group can accommodate two counters per domain: one for the total + * event and one for the local event. Assignments may fail due to the limited + * number of counters. However, it is not necessary to fail the group creation + * and thus no failure is returned. Users have the option to modify the + * counter assignments after the group has been created. + */ +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { From aab2c5088cdb26e80d51ffbe72d24ab23fa1533e Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:17 -0500 Subject: [PATCH 1703/3206] fs/resctrl: Add the functionality to unassign MBM events The "mbm_event" counter assignment mode offers "num_mbm_cntrs" number of counters that can be assigned to RMID, event pairs and monitor bandwidth usage as long as it is assigned. If all the counters are in use, the kernel logs the error message "Failed to allocate counter for <event> in domain <domain id>" in /sys/fs/resctrl/info/last_cmd_status when a new assignment is requested. To make space for a new assignment, users must unassign an already assigned counter and retry the assignment. Add the functionality to unassign and free the counters in the domain. Also, add the helper rdtgroup_unassign_cntrs() to unassign counters in the group. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- fs/resctrl/internal.h | 2 ++ fs/resctrl/monitor.c | 66 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 762705d7eb8db3..c6b66d4a6a375f 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -398,6 +398,8 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_fil void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 106e9bdb8a9d83..2ed29ae831a4c6 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -405,6 +405,14 @@ static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, return -ENOSPC; } +/* + * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d.
+ */ +static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) +{ + memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); +} + static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); @@ -1043,6 +1051,64 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); } +/* + * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration + * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. + */ +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + + /* If there is no cntr_id assigned, nothing to do */ + if (cntr_id < 0) + return; + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false); + + mbm_cntr_free(d, cntr_id); +} + +/* + * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with + * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign + * the counters from all the domains if @d is NULL else unassign from @d. + */ +static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } else { + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } +} + +/* + * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events. + * Called when a group is deleted. + */ +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { From bc53eea6c2a1dea152a0073a2f2814b697ad197e Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:18 -0500 Subject: [PATCH 1704/3206] fs/resctrl: Pass struct rdtgroup instead of individual members Reading monitoring data for a monitoring group requires both the RMID and CLOSID. The RMID and CLOSID are members of struct rdtgroup but passed separately to several functions involved in retrieving event data. When "mbm_event" counter assignment mode is enabled, a counter ID is required to read event data. The counter ID is obtained through mbm_cntr_get(), which expects a struct rdtgroup pointer. Provide a pointer to the struct rdtgroup as parameter to functions involved in retrieving event data to simplify access to RMID, CLOSID, and counter ID. 
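As an illustrative sketch of the calling convention this enables (not part of the patch; it combines mbm_cntr_get() from earlier in this series with the resctrl_arch_cntr_read() interface added by a later patch, and the helper name read_group_event() is hypothetical), a reader only needs the group pointer and derives everything else from it:

static int read_group_event(struct rdt_resource *r, struct rdt_mon_domain *d,
			    struct rdtgroup *rdtgrp, enum resctrl_event_id evtid,
			    u64 *val)
{
	/* The counter ID lookup needs the group itself ... */
	int cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid);

	if (cntr_id < 0)
		return cntr_id;

	/* ... and CLOSID/RMID are reached through the same pointer. */
	return resctrl_arch_cntr_read(r, d, rdtgrp->closid, rdtgrp->mon.rmid,
				      cntr_id, evtid, val);
}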
Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- fs/resctrl/monitor.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 2ed29ae831a4c6..c815153cad074c 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -413,9 +413,11 @@ static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); } -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) +static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { int cpu = smp_processor_id(); + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct rdt_mon_domain *d; struct mbm_state *m; int err, ret; @@ -475,8 +477,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. + * @rdtgrp: resctrl group associated with the CLOSID and RMID to identify + * the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * * Supporting function to calculate the memory bandwidth @@ -484,9 +486,11 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. */ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { u64 cur_bw, bytes, cur_bytes; + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct mbm_state *m; m = get_mbm_state(rr->d, closid, rmid, rr->evtid); @@ -515,7 +519,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp, rr); /* * For Ctrl groups read data from child monitor groups and @@ -526,8 +530,7 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) + if (__mon_event_count(entry, rr) == 0) ret = 0; } } @@ -658,7 +661,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) } static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id evtid) + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; @@ -672,30 +675,30 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * return; } - __mon_event_count(closid, rmid, &rr); + __mon_event_count(rdtgrp, &rr); /* * If the software controller is enabled, compute the * bandwidth for this event id. */ if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); + mbm_bw_count(rdtgrp, &rr); resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) + struct rdtgroup *rdtgrp) { /* * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. 
*/ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID); if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID); } /* @@ -768,11 +771,11 @@ void mbm_handle_overflow(struct work_struct *work) d = container_of(work, struct rdt_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); + mbm_update(r, d, prgrp); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); + mbm_update(r, d, crgrp); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); From 862314fd1f93d96eddb0559a807c66cb1f6ee520 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:19 -0500 Subject: [PATCH 1705/3206] fs/resctrl: Introduce counter ID read, reset calls in mbm_event mode When supported, "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor the bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Introduce the architecture calls resctrl_arch_cntr_read() and resctrl_arch_reset_cntr() to read and reset event counters when "mbm_event" mode is supported. Function names match existing resctrl_arch_rmid_read() and resctrl_arch_reset_rmid(). Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- include/linux/resctrl.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 50e38445183a86..04152654827d4f 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -613,6 +613,44 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign); +/** + * resctrl_arch_cntr_read() - Read the event data corresponding to the counter ID + * assigned to the RMID, event pair for this resource + * and domain. + * @r: Resource that the counter should be read from. + * @d: Domain that the counter should be read from. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to read. + * @eventid: The MBM event to which @cntr_id is assigned. + * @val: Result of the counter read in bytes. + * + * Called on a CPU that belongs to domain @d when "mbm_event" mode is enabled. + * Called from a non-migrateable process context via smp_call_on_cpu() unless all + * CPUs are nohz_full, in which case it is called via IPI (smp_call_function_any()). + * + * Return: + * 0 on success, or -EIO, -EINVAL etc on error. + */ +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val); + +/** + * resctrl_arch_reset_cntr() - Reset any private state associated with counter ID. + * @r: The domain's resource. + * @d: The counter ID's domain. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to reset. 
+ * @eventid: The MBM event to which @cntr_id is assigned. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; From 7c9ac605e202c4668e441fc8146a993577131ca1 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:20 -0500 Subject: [PATCH 1706/3206] x86/resctrl: Refactor resctrl_arch_rmid_read() resctrl_arch_rmid_read() adjusts the value obtained from MSR_IA32_QM_CTR to account for the overflow for MBM events and apply counter scaling for all the events. This logic is common to both reading an RMID and reading a hardware counter directly. Refactor the hardware value adjustment logic into get_corrected_val() to prepare for support of reading a hardware counter. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/monitor.c | 38 ++++++++++++++++----------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index ed295a6c5e6677..1f77fd58e707e9 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -217,24 +217,13 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 unused, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *ignored) +static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 msr_val) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; - u64 msr_val, chunks; - u32 prmid; - int ret; - - resctrl_arch_rmid_read_context_check(); - - prmid = logical_rmid_to_physical_rmid(cpu, rmid); - ret = __rmid_read_phys(prmid, eventid, &msr_val); - if (ret) - return ret; + u64 chunks; am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { @@ -246,7 +235,26 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, chunks = msr_val; } - *val = chunks * hw_res->mon_scale; + return chunks * hw_res->mon_scale; +} + +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, enum resctrl_event_id eventid, + u64 *val, void *ignored) +{ + int cpu = cpumask_any(&d->hdr.cpu_mask); + u64 msr_val; + u32 prmid; + int ret; + + resctrl_arch_rmid_read_context_check(); + + prmid = logical_rmid_to_physical_rmid(cpu, rmid); + ret = __rmid_read_phys(prmid, eventid, &msr_val); + if (ret) + return ret; + + *val = get_corrected_val(r, d, rmid, eventid, msr_val); return 0; } From 2a65b72c1603a74f35228acbb8de2ecff9c13efe Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:21 -0500 Subject: [PATCH 1707/3206] x86/resctrl: Implement resctrl_arch_reset_cntr() and resctrl_arch_cntr_read() System software reads resctrl event data for a particular resource by writing the RMID and Event Identifier (EvtID) to the QM_EVTSEL register and then reading the event data from the QM_CTR register. 
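As an illustrative sketch of that baseline sequence (the helper name qm_evtsel_read() is hypothetical; the in-tree implementation is __rmid_read_phys() in arch/x86/kernel/cpu/resctrl/monitor.c, whose error handling this mirrors):

static int qm_evtsel_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/* EvtID goes in the low 32 bits of QM_EVTSEL, the RMID in the high bits. */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
	rdmsrl(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}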
In ABMC mode, the event data of a specific counter ID is read by setting the following fields: QM_EVTSEL.ExtendedEvtID = 1, QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID to the desired counter ID. Reading the QM_CTR then returns the contents of the specified counter ID. RMID_VAL_ERROR bit is set if the counter configuration is invalid, or if an invalid counter ID is set in the QM_EVTSEL.RMID field. RMID_VAL_UNAVAIL bit is set if the counter data is unavailable. Introduce resctrl_arch_reset_cntr() and resctrl_arch_cntr_read() to reset and read event data for a specific counter. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/internal.h | 6 +++ arch/x86/kernel/cpu/resctrl/monitor.c | 69 ++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 0444fea49b11f8..e5edddb290c9d3 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -40,6 +40,12 @@ struct arch_mbm_state { /* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ #define ABMC_ENABLE_BIT 0 +/* + * Qos Event Identifiers. + */ +#define ABMC_EXTENDED_EVT_ID BIT(31) +#define ABMC_EVT_ID BIT(0) + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 1f77fd58e707e9..0b3c199e9e016d 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -259,6 +259,75 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, return 0; } +static int __cntr_id_read(u32 cntr_id, u64 *val) +{ + u64 msr_val; + + /* + * QM_EVTSEL Register definition: + * ======================================================= + * Bits Mnemonic Description + * ======================================================= + * 63:44 -- Reserved + * 43:32 RMID RMID or counter ID in ABMC mode + * when reading an MBM event + * 31 ExtendedEvtID Extended Event Identifier + * 30:8 -- Reserved + * 7:0 EvtID Event Identifier + * ======================================================= + * The contents of a specific counter can be read by setting the + * following fields in QM_EVTSEL.ExtendedEvtID(=1) and + * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID + * to the desired counter ID. Reading the QM_CTR then returns the + * contents of the specified counter. The RMID_VAL_ERROR bit is set + * if the counter configuration is invalid, or if an invalid counter + * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit + * is set if the counter data is unavailable. + */ + wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id); + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + *val = msr_val; + return 0; +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct arch_mbm_state *am; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + memset(am, 0, sizeof(*am)); + + /* Record any initial, non-zero count value. 
*/ + __cntr_id_read(cntr_id, &am->prev_msr); + } +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + u64 msr_val; + int ret; + + ret = __cntr_id_read(cntr_id, &msr_val); + if (ret) + return ret; + + *val = get_corrected_val(r, d, rmid, eventid, msr_val); + + return 0; +} + /* * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 * which indicates that RMIDs are configured in legacy mode. From 84973249011fda3ff292f83439a062fec81ef982 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Mon, 15 Sep 2025 10:42:19 +0100 Subject: [PATCH 1708/3206] ALSA: serial-generic: remove shared static buffer If multiple instances of this driver are instantiated and try to send concurrently then the single static buffer in snd_serial_generic_tx_work() will cause corruption in the data output. Move the buffer into the per-instance driver data to avoid this. Signed-off-by: John Keeping Signed-off-by: Takashi Iwai --- sound/drivers/serial-generic.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sound/drivers/serial-generic.c b/sound/drivers/serial-generic.c index 21ae053c057671..766206c6ca75a8 100644 --- a/sound/drivers/serial-generic.c +++ b/sound/drivers/serial-generic.c @@ -37,6 +37,8 @@ MODULE_LICENSE("GPL"); #define SERIAL_TX_STATE_ACTIVE 1 #define SERIAL_TX_STATE_WAKEUP 2 +#define INTERNAL_BUF_SIZE 256 + struct snd_serial_generic { struct serdev_device *serdev; @@ -51,6 +53,7 @@ struct snd_serial_generic { struct work_struct tx_work; unsigned long tx_state; + char tx_buf[INTERNAL_BUF_SIZE]; }; static void snd_serial_generic_tx_wakeup(struct snd_serial_generic *drvdata) @@ -61,11 +64,8 @@ static void snd_serial_generic_tx_wakeup(struct snd_serial_generic *drvdata) schedule_work(&drvdata->tx_work); } -#define INTERNAL_BUF_SIZE 256 - static void snd_serial_generic_tx_work(struct work_struct *work) { - static char buf[INTERNAL_BUF_SIZE]; int num_bytes; struct snd_serial_generic *drvdata = container_of(work, struct snd_serial_generic, tx_work); @@ -78,8 +78,10 @@ static void snd_serial_generic_tx_work(struct work_struct *work) if (!test_bit(SERIAL_MODE_OUTPUT_OPEN, &drvdata->filemode)) break; - num_bytes = snd_rawmidi_transmit_peek(substream, buf, INTERNAL_BUF_SIZE); - num_bytes = serdev_device_write_buf(drvdata->serdev, buf, num_bytes); + num_bytes = snd_rawmidi_transmit_peek(substream, drvdata->tx_buf, + INTERNAL_BUF_SIZE); + num_bytes = serdev_device_write_buf(drvdata->serdev, drvdata->tx_buf, + num_bytes); if (!num_bytes) break; From 159f36cd4de7718779fd0b232de5137b4ffd2d1e Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:22 -0500 Subject: [PATCH 1709/3206] fs/resctrl: Support counter read/reset with mbm_event assignment mode When "mbm_event" counter assignment mode is enabled, the architecture requires a counter ID to read the event data. Introduce an is_mbm_cntr field in struct rmid_read to indicate whether counter assignment mode is in use. Update the logic to call resctrl_arch_cntr_read() and resctrl_arch_reset_cntr() when the assignment mode is active. Report 'Unassigned' in case the user attempts to read an event without assigning a hardware counter.
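For example, with a hypothetical monitor group "g1" on a system where the mode is enabled and no counter is assigned to the event:

  $ cat /sys/fs/resctrl/mon_groups/g1/mon_data/mon_L3_00/mbm_total_bytes
  Unassigned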
Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 6 ++++ fs/resctrl/ctrlmondata.c | 22 ++++++++++--- fs/resctrl/internal.h | 3 ++ fs/resctrl/monitor.c | 47 ++++++++++++++++++++------- 4 files changed, 62 insertions(+), 16 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 446736dbd97f19..4c24c5f3f4c196 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -434,6 +434,12 @@ When monitoring is enabled all MON groups will also contain: for the L3 cache they occupy). These are named "mon_sub_L3_YY" where "YY" is the node number. + When the 'mbm_event' counter assignment mode is enabled, reading + an MBM event of a MON group returns 'Unassigned' if no hardware + counter is assigned to it. For CTRL_MON groups, 'Unassigned' is + returned if the MBM event does not have an assigned counter in the + CTRL_MON group nor in any of its associated MON groups. + "mon_hw_id": Available only with debug option. The identifier used by hardware for the monitor group. On x86 this is the RMID. diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 42b281b3852ff8..0d0ef54fc4de1f 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -563,10 +563,15 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->r = r; rr->d = d; rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r) && + resctrl_is_mbm_event(evtid)) { + rr->is_mbm_cntr = true; + } else { + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } } cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); @@ -582,7 +587,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + if (rr->arch_mon_ctx) + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) @@ -653,10 +659,16 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) checkresult: + /* + * -ENOENT is a special case, set only when "mbm_event" counter assignment + * mode is enabled and no counter has been assigned. + */ if (rr.err == -EIO) seq_puts(m, "Error\n"); else if (rr.err == -EINVAL) seq_puts(m, "Unavailable\n"); + else if (rr.err == -ENOENT) + seq_puts(m, "Unassigned\n"); else seq_printf(m, "%llu\n", rr.val); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index c6b66d4a6a375f..2f1f2efe2f40f1 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -111,6 +111,8 @@ struct mon_data { * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and it + * is an MBM event. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. 
@@ -125,6 +127,7 @@ struct rmid_read { enum resctrl_event_id evtid; bool first; struct cacheinfo *ci; + bool is_mbm_cntr; int err; u64 val; void *arch_mon_ctx; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index c815153cad074c..55327056596ee2 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -419,12 +419,24 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; struct rdt_mon_domain *d; + int cntr_id = -ENOENT; struct mbm_state *m; int err, ret; u64 tval = 0; + if (rr->is_mbm_cntr) { + cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid); + if (cntr_id < 0) { + rr->err = -ENOENT; + return -EINVAL; + } + } + if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + if (rr->is_mbm_cntr) + resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid); + else + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); @@ -435,8 +447,12 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) /* Reading a single domain, must be on a CPU in that domain. */ if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -460,8 +476,12 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { if (d->ci_id != rr->ci->id) continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -668,11 +688,15 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * rr.r = r; rr.d = d; rr.evtid = evtid; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r)) { + rr.is_mbm_cntr = true; + } else { + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } } __mon_event_count(rdtgrp, &rr); @@ -684,7 +708,8 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * if (is_mba_sc(NULL)) mbm_bw_count(rdtgrp, &rr); - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); + if (rr.arch_mon_ctx) + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, From ea274cbeaf8f0667267b347e3f84797439cdab4e Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:23 -0500 Subject: [PATCH 1710/3206] fs/resctrl: Add event configuration directory under info/L3_MON/ The "mbm_event" counter assignment mode allows the user to assign a hardware counter to an RMID, event pair 
and monitor the bandwidth as long as it is assigned. The user can specify the memory transaction(s) for the counter to track. When this mode is supported, the /sys/fs/resctrl/info/L3_MON/event_configs directory contains a sub-directory for each MBM event that can be assigned to a counter. The MBM event sub-directory contains a file named "event_filter" that is used to view and modify which memory transactions the MBM event is configured with. Create /sys/fs/resctrl/info/L3_MON/event_configs directory on resctrl mount and pre-populate it with directories for the two existing MBM events: mbm_total_bytes and mbm_local_bytes. Create the "event_filter" file within each MBM event directory with the needed *show() that displays the memory transactions with which the MBM event is configured. Example: $ mount -t resctrl resctrl /sys/fs/resctrl $ cd /sys/fs/resctrl/ $ cat info/L3_MON/event_configs/mbm_total_bytes/event_filter local_reads,remote_reads,local_non_temporal_writes, remote_non_temporal_writes,local_reads_slow_memory, remote_reads_slow_memory,dirty_victim_writes_all $ cat info/L3_MON/event_configs/mbm_local_bytes/event_filter local_reads,local_non_temporal_writes,local_reads_slow_memory Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 33 +++++++++++++++ fs/resctrl/internal.h | 4 ++ fs/resctrl/monitor.c | 56 +++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 59 ++++++++++++++++++++++++++- include/linux/resctrl_types.h | 3 ++ 5 files changed, 153 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 4c24c5f3f4c196..ddd95f1472e67d 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -310,6 +310,39 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs 0=30;1=30 +"event_configs": + Directory that exists when "mbm_event" counter assignment mode is supported. + Contains a sub-directory for each MBM event that can be assigned to a counter. + + Two MBM events are supported by default: mbm_local_bytes and mbm_total_bytes. + Each MBM event's sub-directory contains a file named "event_filter" that is + used to view and modify which memory transactions the MBM event is configured + with. The file is accessible only when "mbm_event" counter assignment mode is + enabled. 
+ + List of memory transaction types supported: + + ========================== ======================================================== + Name Description + ========================== ======================================================== + dirty_victim_writes_all Dirty Victims from the QOS domain to all types of memory + remote_reads_slow_memory Reads to slow memory in the non-local NUMA domain + local_reads_slow_memory Reads to slow memory in the local NUMA domain + remote_non_temporal_writes Non-temporal writes to non-local NUMA domain + local_non_temporal_writes Non-temporal writes to local NUMA domain + remote_reads Reads to memory in the non-local NUMA domain + local_reads Reads to memory in the local NUMA domain + ========================== ======================================================== + + For example:: + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes, + local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 2f1f2efe2f40f1..9bf2e2fd5c19a2 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -241,6 +241,8 @@ struct rdtgroup { #define RFTYPE_DEBUG BIT(10) +#define RFTYPE_ASSIGN_CONFIG BIT(11) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) @@ -403,6 +405,8 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 55327056596ee2..7179f9865a48bd 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -972,6 +972,61 @@ u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) return mon_event_all[evtid].evt_cfg; } +/** + * struct mbm_transaction - Memory transaction an MBM event can be configured with. + * @name: Name of memory transaction (read, write ...). + * @val: The bit (eg. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to + * represent the memory transaction within an event's configuration. + */ +struct mbm_transaction { + char name[32]; + u32 val; +}; + +/* Decoded values for each type of memory transaction. 
*/ +static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = { + {"local_reads", READS_TO_LOCAL_MEM}, + {"remote_reads", READS_TO_REMOTE_MEM}, + {"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM}, + {"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM}, + {"local_reads_slow_memory", READS_TO_LOCAL_S_MEM}, + {"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM}, + {"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM}, +}; + +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + bool sep = false; + int ret = 0, i; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (mevt->evt_cfg & mbm_transactions[i].val) { + if (sep) + seq_putc(seq, ','); + seq_printf(seq, "%s", mbm_transactions[i].name); + sep = true; + } + } + seq_putc(seq, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + /* * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID * pair in the domain. @@ -1287,6 +1342,7 @@ int resctrl_mon_resource_init(void) RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 2e1d0a2703da4f..8f0c403e3fb5f9 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1923,6 +1923,12 @@ static struct rftype res_common_files[] = { .seq_show = mbm_local_bytes_config_show, .write = mbm_local_bytes_config_write, }, + { + .name = "event_filter", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = event_filter_show, + }, { .name = "mbm_assign_mode", .mode = 0444, @@ -2183,10 +2189,48 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } +static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn) +{ + struct kernfs_node *kn_subdir, *kn_subdir2; + struct mon_evt *mevt; + int ret; + + kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt); + if (IS_ERR(kn_subdir2)) { + ret = PTR_ERR(kn_subdir2); + goto out; + } + + ret = rdtgroup_kn_set_ugid(kn_subdir2); + if (ret) + goto out; + + ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG); + if (ret) + break; + } + +out: + return ret; +} + static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; + struct rdt_resource *r; int ret; kn_subdir = kernfs_create_dir(kn_info, name, @@ -2199,8 +2243,19 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name, return ret; ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); + if (ret) + return ret; + + if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) { + r = priv; + if (r->mon.mbm_cntr_assignable) { + ret = 
resctrl_mkdir_event_configs(r, kn_subdir); + if (ret) + return ret; + } + } + + kernfs_activate(kn_subdir); return ret; } diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index d98351663c2c64..acfe07860b346c 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,6 +34,9 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) +/* Number of memory transactions that an MBM event can be configured with */ +#define NUM_MBM_TRANSACTIONS 7 + /* Event IDs */ enum resctrl_event_id { /* Must match value of first event below */ From f9ae5913d47cda67481a4f54cc3273d3d1d00a01 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:24 -0500 Subject: [PATCH 1711/3206] fs/resctrl: Provide interface to update the event configurations When "mbm_event" counter assignment mode is enabled, users can modify the event configuration by writing to the 'event_filter' resctrl file. The event configurations for mbm_event mode are located in /sys/fs/resctrl/info/L3_MON/event_configs/. Update the assignments of all CTRL_MON and MON resource groups when the event configuration is modified. Example: $ mount -t resctrl resctrl /sys/fs/resctrl $ cd /sys/fs/resctrl/ $ cat info/L3_MON/event_configs/mbm_local_bytes/event_filter local_reads,local_non_temporal_writes,local_reads_slow_memory $ echo "local_reads,local_non_temporal_writes" > info/L3_MON/event_configs/mbm_total_bytes/event_filter $ cat info/L3_MON/event_configs/mbm_total_bytes/event_filter local_reads,local_non_temporal_writes Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 12 +++ fs/resctrl/internal.h | 3 + fs/resctrl/monitor.c | 114 ++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 3 +- 4 files changed, 131 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index ddd95f1472e67d..2e840ef26f6829 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -343,6 +343,18 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter local_reads,local_non_temporal_writes,local_reads_slow_memory + Modify the event configuration by writing to the "event_filter" file within + the "event_configs" directory. The read/write "event_filter" file contains the + configuration of the event that reflects which memory transactions are counted by it. 
+ + For example:: + + # echo "local_reads, local_non_temporal_writes" > + /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,local_non_temporal_writes + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9bf2e2fd5c19a2..90d3e4ab335b6b 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -407,6 +407,9 @@ void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 7179f9865a48bd..ccb9726bba545d 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1192,6 +1192,120 @@ void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); } +static int resctrl_parse_mem_transactions(char *tok, u32 *val) +{ + u32 temp_val = 0; + char *evt_str; + bool found; + int i; + +next_config: + if (!tok || tok[0] == '\0') { + *val = temp_val; + return 0; + } + + /* Start processing the strings for each memory transaction type */ + evt_str = strim(strsep(&tok, ",")); + found = false; + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (!strcmp(mbm_transactions[i].name, evt_str)) { + temp_val |= mbm_transactions[i].val; + found = true; + break; + } + } + + if (!found) { + rdt_last_cmd_printf("Invalid memory transaction type %s\n", evt_str); + return -EINVAL; + } + + goto next_config; +} + +/* + * rdtgroup_update_cntr_event - Update the counter assignments for the event + * in a group. + * @r: Resource to which update needs to be done. + * @rdtgrp: Resctrl group. + * @evtid: MBM monitor event. + */ +static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp, + enum resctrl_event_id evtid) +{ + struct rdt_mon_domain *d; + int cntr_id; + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid); + if (cntr_id >= 0) + rdtgroup_assign_cntr(r, d, evtid, rdtgrp->mon.rmid, + rdtgrp->closid, cntr_id, true); + } +} + +/* + * resctrl_update_cntr_allrdtgrp - Update the counter assignments for the event + * for all the groups. + * @mevt MBM Monitor event. + */ +static void resctrl_update_cntr_allrdtgrp(struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + struct rdtgroup *prgrp, *crgrp; + + /* + * Find all the groups where the event is assigned and update the + * configuration of existing assignments. 
+ */ + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + rdtgroup_update_cntr_event(r, prgrp, mevt->evtid); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + rdtgroup_update_cntr_event(r, crgrp, mevt->evtid); + } +} + +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + u32 evt_cfg = 0; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + ret = resctrl_parse_mem_transactions(buf, &evt_cfg); + if (!ret && mevt->evt_cfg != evt_cfg) { + mevt->evt_cfg = evt_cfg; + resctrl_update_cntr_allrdtgrp(mevt); + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 8f0c403e3fb5f9..e90bc808fe5327 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1925,9 +1925,10 @@ static struct rftype res_common_files[] = { }, { .name = "event_filter", - .mode = 0444, + .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = event_filter_show, + .write = event_filter_write, }, { .name = "mbm_assign_mode", From ac1df9bb0ba3ae94137fb494cd9efc598f65d826 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:25 -0500 Subject: [PATCH 1712/3206] fs/resctrl: Introduce mbm_assign_on_mkdir to enable assignments on mkdir The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor the bandwidth as long as it is assigned. Introduce a user-configurable option that determines if a counter will automatically be assigned to an RMID, event pair when its associated monitor group is created via mkdir. Accessible when "mbm_event" counter assignment mode is enabled. Suggested-by: Peter Newman Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 20 ++++++++++ fs/resctrl/internal.h | 6 +++ fs/resctrl/monitor.c | 53 +++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 7 ++++ include/linux/resctrl.h | 3 ++ 5 files changed, 89 insertions(+) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 2e840ef26f6829..1de815b3a07b45 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -355,6 +355,26 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter local_reads,local_non_temporal_writes +"mbm_assign_on_mkdir": + Exists when "mbm_event" counter assignment mode is supported. Accessible + only when "mbm_event" counter assignment mode is enabled. + + Determines if a counter will automatically be assigned to an RMID, MBM event + pair when its associated monitor group is created via mkdir. Enabled by default + on boot, also when switched from "default" mode to "mbm_event" counter assignment + mode. 
Users can disable this capability by writing to the interface. + + "0": + Auto assignment is disabled. + "1": + Auto assignment is enabled. + + Example:: + + # echo 0 > /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + 0 + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 90d3e4ab335b6b..66c677c1b85881 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -410,6 +410,12 @@ int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index ccb9726bba545d..deca9535fbbb8e 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1027,6 +1027,57 @@ int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v return ret; } +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s, + void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool value; + int ret; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + r->mon.mbm_assign_on_mkdir = value; + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + /* * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID * pair in the domain. 
@@ -1457,6 +1508,8 @@ int resctrl_mon_resource_init(void) resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); + resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | + RFTYPE_RES_CACHE); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index e90bc808fe5327..c7ea42c2a3c24e 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1808,6 +1808,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_last_cmd_status_show, .fflags = RFTYPE_TOP_INFO, }, + { + .name = "mbm_assign_on_mkdir", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_on_mkdir_show, + .write = resctrl_mbm_assign_on_mkdir_write, + }, { .name = "num_closids", .mode = 0444, diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 04152654827d4f..a7d92718b653f5 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -277,12 +277,15 @@ enum resctrl_schema_fmt { * monitoring events can be configured. * @num_mbm_cntrs: Number of assignable counters. * @mbm_cntr_assignable:Is system capable of supporting counter assignment? + * @mbm_assign_on_mkdir:True if counters should automatically be assigned to MBM + * events of monitor groups created via mkdir. */ struct resctrl_mon { int num_rmid; unsigned int mbm_cfg_mask; int num_mbm_cntrs; bool mbm_cntr_assignable; + bool mbm_assign_on_mkdir; }; /** From ef712fe97ec575657abb12d76837867dd8b8a0ed Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:26 -0500 Subject: [PATCH 1713/3206] fs/resctrl: Auto assign counters on mkdir and clean up on group removal Resctrl provides a user-configurable option mbm_assign_on_mkdir that determines if a counter will automatically be assigned to an RMID, event pair when its associated monitor group is created via mkdir. Enable mbm_assign_on_mkdir by default to automatically assign counters to the two default events (MBM total and MBM local) of a new monitoring group created via mkdir. This maintains backward compatibility with original resctrl support for these two events. Unassign and free counters belonging to a monitoring group when the group is deleted. Monitor group creation does not fail if a counter cannot be assigned to one or both events. There may be limited counters and users have the flexibility to modify counter assignments at a later time. Log the error message "Failed to allocate counter for <event> in domain <domain id>" in /sys/fs/resctrl/info/last_cmd_status when a new monitoring group is created but counter assignment failed.
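For example, with a hypothetical monitor group "g1" (assuming the mode is enabled; counter IDs and counts depend on the system):

  $ cat /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir
  1
  $ mkdir /sys/fs/resctrl/mon_groups/g1     # counters assigned to both MBM events
  $ rmdir /sys/fs/resctrl/mon_groups/g1     # those counters unassigned and freed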
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- fs/resctrl/monitor.c | 4 +++- fs/resctrl/rdtgroup.c | 22 ++++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index deca9535fbbb8e..9cb334136d210a 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1231,7 +1231,8 @@ void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) || + !r->mon.mbm_assign_on_mkdir) return; if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) @@ -1503,6 +1504,7 @@ int resctrl_mon_resource_init(void) (READS_TO_LOCAL_MEM | READS_TO_LOCAL_S_MEM | NON_TEMP_WRITE_TO_LOCAL_MEM); + r->mon.mbm_assign_on_mkdir = true; resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("available_mbm_cntrs", diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index c7ea42c2a3c24e..48f98146c09962 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2713,6 +2713,8 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_info; + rdtgroup_assign_cntrs(&rdtgroup_default); + ret = mkdir_mondata_all(rdtgroup_default.kn, &rdtgroup_default, &kn_mondata); if (ret < 0) @@ -2751,8 +2753,10 @@ static int rdt_get_tree(struct fs_context *fc) if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(&rdtgroup_default); kernfs_remove(kn_mongrp); + } out_info: kernfs_remove(kn_info); out_closid_exit: @@ -2897,6 +2901,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + rdtgroup_unassign_cntrs(sentry); free_rmid(sentry->closid, sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); @@ -2937,6 +2942,8 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); @@ -3021,6 +3028,7 @@ static void resctrl_fs_teardown(void) return; rmdir_all_sub(); + rdtgroup_unassign_cntrs(&rdtgroup_default); mon_put_kn_priv(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; @@ -3501,9 +3509,12 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) } rdtgrp->mon.rmid = ret; + rdtgroup_assign_cntrs(rdtgrp); + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); if (ret) { rdt_last_cmd_puts("kernfs subdir error\n"); + rdtgroup_unassign_cntrs(rdtgrp); free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -3513,8 +3524,10 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) { - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(rgrp); free_rmid(rgrp->closid, rgrp->mon.rmid); + } } /* @@ -3790,6 +3803,9 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; + + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 
/*
@@ -3837,6 +3853,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
 	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
 	update_closid_rmid(tmpmask, NULL);
 
+	rdtgroup_unassign_cntrs(rdtgrp);
+
 	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
 	closid_free(rdtgrp->closid);

From cba8222880b88d96f3ed6c8a115e335e552b83a1 Mon Sep 17 00:00:00 2001
From: Babu Moger
Date: Fri, 5 Sep 2025 16:34:27 -0500
Subject: [PATCH 1714/3206] fs/resctrl: Introduce mbm_L3_assignments to list
 assignments in a group

Introduce the mbm_L3_assignments resctrl file associated with CTRL_MON and
MON resource groups to display the counter assignment states of the
resource group when "mbm_event" counter assignment mode is enabled.

Display the list in the following format:

  <Event>:<Domain ID>=<Assignment state>;<Domain ID>=<Assignment state>

Event: A valid MBM event listed in /sys/fs/resctrl/info/L3_MON/event_configs
directory.

Domain ID: A valid domain ID.

The assignment state can be one of the following:

  _ : No counter assigned.

  e : Counter assigned exclusively.

Example:

To list the assignment states for the default group

  $ cd /sys/fs/resctrl
  $ cat /sys/fs/resctrl/mbm_L3_assignments
  mbm_total_bytes:0=e;1=e
  mbm_local_bytes:0=e;1=e

Signed-off-by: Babu Moger
Signed-off-by: Borislav Petkov (AMD)
Reviewed-by: Reinette Chatre
Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com
---
 Documentation/filesystems/resctrl.rst | 31 +++++++++++++++++
 fs/resctrl/internal.h                 |  2 ++
 fs/resctrl/monitor.c                  | 49 +++++++++++++++++++++++++++
 fs/resctrl/rdtgroup.c                 |  6 ++++
 4 files changed, 88 insertions(+)

diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst
index 1de815b3a07b45..a2b7240b0818b5 100644
--- a/Documentation/filesystems/resctrl.rst
+++ b/Documentation/filesystems/resctrl.rst
@@ -509,6 +509,37 @@ When monitoring is enabled all MON groups will also contain:
 	Available only with debug option. The identifier used by hardware
 	for the monitor group. On x86 this is the RMID.
 
+When monitoring is enabled all MON groups may also contain:
+
+"mbm_L3_assignments":
+	Exists when "mbm_event" counter assignment mode is supported and lists the
+	counter assignment states of the group.
+
+	The assignment list is displayed in the following format:
+
+	<Event>:<Domain ID>=<Assignment state>;<Domain ID>=<Assignment state>
+
+	Event: A valid MBM event in the
+	/sys/fs/resctrl/info/L3_MON/event_configs directory.
+
+	Domain ID: A valid domain ID.
+
+	Assignment states:
+
+	_ : No counter assigned.
+
+	e : Counter assigned exclusively.
+
+	Example:
+
+	To display the counter assignment states for the default group.
+	::
+
+	  # cd /sys/fs/resctrl
+	  # cat /sys/fs/resctrl/mbm_L3_assignments
+	  mbm_total_bytes:0=e;1=e
+	  mbm_local_bytes:0=e;1=e
+
 When the "mba_MBps" mount option is used all CTRL_MON groups will also contain:
 
 "mba_MBps_event":
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 66c677c1b85881..43297b36f5ddaf 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -416,6 +416,8 @@ ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf
 			size_t nbytes, loff_t off);
 
+int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v);
+
 #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
 
 int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index 9cb334136d210a..b692a0ef17107d 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -1454,6 +1454,54 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of,
 	return ret;
 }
 
+int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v)
+{
+	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+	struct rdt_mon_domain *d;
+	struct rdtgroup *rdtgrp;
+	struct mon_evt *mevt;
+	int ret = 0;
+	bool sep;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	rdt_last_cmd_clear();
+	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+		rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	for_each_mon_event(mevt) {
+		if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid))
+			continue;
+
+		sep = false;
+		seq_printf(s, "%s:", mevt->name);
+		list_for_each_entry(d, &r->mon_domains, hdr.list) {
+			if (sep)
+				seq_putc(s, ';');
+
+			if (mbm_cntr_get(r, d, rdtgrp, mevt->evtid) < 0)
+				seq_printf(s, "%d=_", d->hdr.id);
+			else
+				seq_printf(s, "%d=e", d->hdr.id);
+
+			sep = true;
+		}
+		seq_putc(s, '\n');
+	}
+
+out_unlock:
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret;
+}
+
 /**
  * resctrl_mon_resource_init() - Initialise global monitoring structures.
  *
@@ -1512,6 +1560,7 @@ int resctrl_mon_resource_init(void)
 		resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG);
 		resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO |
 					 RFTYPE_RES_CACHE);
+		resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE);
 	}
 
 	return 0;
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 48f98146c09962..519aa6acef5b24 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -1937,6 +1937,12 @@ static struct rftype res_common_files[] = {
 		.seq_show	= event_filter_show,
 		.write		= event_filter_write,
 	},
+	{
+		.name		= "mbm_L3_assignments",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= mbm_L3_assignments_show,
+	},
 	{
 		.name		= "mbm_assign_mode",
 		.mode		= 0444,

From 88bee79640aea6192f98c8faae30e0013453479d Mon Sep 17 00:00:00 2001
From: Babu Moger
Date: Fri, 5 Sep 2025 16:34:28 -0500
Subject: [PATCH 1715/3206] fs/resctrl: Introduce the interface to modify
 assignments in a group

Enable the mbm_L3_assignments resctrl file to be used to modify counter
assignments of CTRL_MON and MON groups when the "mbm_event" counter
assignment mode is enabled.

Process the assignment modifications in the following format:

  <Event>:<Domain ID>=<Assignment state>;<Domain ID>=<Assignment state>

Event: A valid MBM event in the
/sys/fs/resctrl/info/L3_MON/event_configs directory.

Domain ID: A valid domain ID. When writing, '*' applies the changes to
all domains.
Assignment states: _ : Unassign a counter. e : Assign a counter exclusively. Examples: $ cd /sys/fs/resctrl $ cat /sys/fs/resctrl/mbm_L3_assignments mbm_total_bytes:0=e;1=e mbm_local_bytes:0=e;1=e To unassign the counter associated with the mbm_total_bytes event on domain 0: $ echo "mbm_total_bytes:0=_" > mbm_L3_assignments $ cat /sys/fs/resctrl/mbm_L3_assignments mbm_total_bytes:0=_;1=e mbm_local_bytes:0=e;1=e To unassign the counter associated with the mbm_total_bytes event on all the domains: $ echo "mbm_total_bytes:*=_" > mbm_L3_assignments $ cat /sys/fs/resctrl/mbm_L3_assignments mbm_total_bytes:0=_;1=_ mbm_local_bytes:0=e;1=e Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 151 +++++++++++++++++++++++++- fs/resctrl/internal.h | 3 + fs/resctrl/monitor.c | 139 ++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 3 +- 4 files changed, 294 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index a2b7240b0818b5..f60f6a96cb6b81 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -522,7 +522,8 @@ When monitoring is enabled all MON groups may also contain: Event: A valid MBM event in the /sys/fs/resctrl/info/L3_MON/event_configs directory. - Domain ID: A valid domain ID. + Domain ID: A valid domain ID. When writing, '*' applies the changes + to all the domains. Assignment states: @@ -540,6 +541,35 @@ When monitoring is enabled all MON groups may also contain: mbm_total_bytes:0=e;1=e mbm_local_bytes:0=e;1=e + Assignments can be modified by writing to the interface. + + Examples: + + To unassign the counter associated with the mbm_total_bytes event on domain 0: + :: + + # echo "mbm_total_bytes:0=_" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=_;1=e + mbm_local_bytes:0=e;1=e + + To unassign the counter associated with the mbm_total_bytes event on all the domains: + :: + + # echo "mbm_total_bytes:*=_" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=_;1=_ + mbm_local_bytes:0=e;1=e + + To assign a counter associated with the mbm_total_bytes event on all domains in + exclusive mode: + :: + + # echo "mbm_total_bytes:*=e" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + When the "mba_MBps" mount option is used all CTRL_MON groups will also contain: "mba_MBps_event": @@ -1585,6 +1615,125 @@ View the llc occupancy snapshot:: # cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy 11234000 + +Examples on working with mbm_assign_mode +======================================== + +a. Check if MBM counter assignment mode is supported. +:: + + # mount -t resctrl resctrl /sys/fs/resctrl/ + + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + [mbm_event] + default + +The "mbm_event" mode is detected and enabled. + +b. Check how many assignable counters are supported. +:: + + # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs + 0=32;1=32 + +c. Check how many assignable counters are available for assignment in each domain. +:: + + # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs + 0=30;1=30 + +d. To list the default group's assign states. +:: + + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + +e. 
To unassign the counter associated with the mbm_total_bytes event on domain 0.
+::
+
+  # echo "mbm_total_bytes:0=_" > /sys/fs/resctrl/mbm_L3_assignments
+  # cat /sys/fs/resctrl/mbm_L3_assignments
+  mbm_total_bytes:0=_;1=e
+  mbm_local_bytes:0=e;1=e
+
+f. To unassign the counter associated with the mbm_total_bytes event on all domains.
+::
+
+  # echo "mbm_total_bytes:*=_" > /sys/fs/resctrl/mbm_L3_assignments
+  # cat /sys/fs/resctrl/mbm_L3_assignments
+  mbm_total_bytes:0=_;1=_
+  mbm_local_bytes:0=e;1=e
+
+g. To assign a counter associated with the mbm_total_bytes event on all domains in
+exclusive mode.
+::
+
+  # echo "mbm_total_bytes:*=e" > /sys/fs/resctrl/mbm_L3_assignments
+  # cat /sys/fs/resctrl/mbm_L3_assignments
+  mbm_total_bytes:0=e;1=e
+  mbm_local_bytes:0=e;1=e
+
+h. Read the events mbm_total_bytes and mbm_local_bytes of the default group. There is
+no change in reading the events with the assignment.
+::
+
+  # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_total_bytes
+  779247936
+  # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_total_bytes
+  562324232
+  # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes
+  212122123
+  # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes
+  121212144
+
+i. Check the event configurations.
+::
+
+  # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter
+  local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes,
+  local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all
+
+  # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter
+  local_reads,local_non_temporal_writes,local_reads_slow_memory
+
+j. Change the event configuration for mbm_local_bytes.
+::
+
+  # echo "local_reads, local_non_temporal_writes, local_reads_slow_memory, remote_reads" >
+  /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter
+
+  # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter
+  local_reads,local_non_temporal_writes,local_reads_slow_memory,remote_reads
+
+k. Now read the local events again. The first read may come back with "Unavailable"
+status. The subsequent read of mbm_local_bytes will display the current value.
+::
+
+  # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes
+  Unavailable
+  # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes
+  2252323
+  # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes
+  Unavailable
+  # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes
+  1566565
+
+l. Users have the option to go back to 'default' mbm_assign_mode if required. This can be
+done using the following command. Note that switching the mbm_assign_mode may reset all
+the MBM counters (and thus all MBM events) of all the resctrl groups.
+::
+
+  # echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+  # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+  mbm_event
+  [default]
+
+m. Unmount the resctrl filesystem.
+:: + + # umount /sys/fs/resctrl/ + Intel RDT Errata ================ diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 43297b36f5ddaf..c69b1da80d3f9e 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -418,6 +418,9 @@ ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b692a0ef17107d..f388dbcdbdcd63 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1502,6 +1502,145 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, voi return ret; } +/* + * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching + * event name. + */ +static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *name) +{ + struct mon_evt *mevt; + + for_each_mon_event(mevt) { + if (mevt->rid == r->rid && mevt->enabled && + resctrl_is_mbm_event(mevt->evtid) && + !strcmp(mevt->name, name)) + return mevt; + } + + return NULL; +} + +static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int ret = 0; + + if (!assign || strlen(assign) != 1) + return -EINVAL; + + switch (*assign) { + case 'e': + ret = rdtgroup_assign_cntr_event(d, rdtgrp, mevt); + break; + case '_': + rdtgroup_unassign_cntr_event(d, rdtgrp, mevt); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp, + char *event, char *tok) +{ + struct rdt_mon_domain *d; + unsigned long dom_id = 0; + char *dom_str, *id_str; + struct mon_evt *mevt; + int ret; + + mevt = mbm_get_mon_event_by_name(r, event); + if (!mevt) { + rdt_last_cmd_printf("Invalid event %s\n", event); + return -ENOENT; + } + +next: + if (!tok || tok[0] == '\0') + return 0; + + /* Start processing the strings for each domain */ + dom_str = strim(strsep(&tok, ";")); + + id_str = strsep(&dom_str, "="); + + /* Check for domain id '*' which means all domains */ + if (id_str && *id_str == '*') { + ret = rdtgroup_modify_assign_state(dom_str, NULL, rdtgrp, mevt); + if (ret) + rdt_last_cmd_printf("Assign operation '%s:*=%s' failed\n", + event, dom_str); + return ret; + } else if (!id_str || kstrtoul(id_str, 10, &dom_id)) { + rdt_last_cmd_puts("Missing domain id\n"); + return -EINVAL; + } + + /* Verify if the dom_id is valid */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->hdr.id == dom_id) { + ret = rdtgroup_modify_assign_state(dom_str, d, rdtgrp, mevt); + if (ret) { + rdt_last_cmd_printf("Assign operation '%s:%ld=%s' failed\n", + event, dom_id, dom_str); + return ret; + } + goto next; + } + } + + rdt_last_cmd_printf("Invalid domain id %ld\n", dom_id); + return -EINVAL; +} + +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdtgroup *rdtgrp; + char *token, *event; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + 
return -ENOENT;
+	}
+	rdt_last_cmd_clear();
+
+	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+		rdt_last_cmd_puts("mbm_event mode is not enabled\n");
+		rdtgroup_kn_unlock(of->kn);
+		return -EINVAL;
+	}
+
+	while ((token = strsep(&buf, "\n")) != NULL) {
+		/*
+		 * The write command follows the following format:
+		 * "<Event>:<Domain ID>=<Assignment state>"
+		 * Extract the event name first.
+		 */
+		event = strsep(&token, ":");
+
+		ret = resctrl_parse_mbm_assignment(r, rdtgrp, event, token);
+		if (ret)
+			break;
+	}
+
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret ?: nbytes;
+}
+
 /**
  * resctrl_mon_resource_init() - Initialise global monitoring structures.
  *
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 519aa6acef5b24..bd4a115ffea188 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -1939,9 +1939,10 @@ static struct rftype res_common_files[] = {
 	},
 	{
 		.name		= "mbm_L3_assignments",
-		.mode		= 0444,
+		.mode		= 0644,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= mbm_L3_assignments_show,
+		.write		= mbm_L3_assignments_write,
 	},
 	{
 		.name		= "mbm_assign_mode",

From 9f0209b857d2fc99c2a32bff1d7bfd54b314c29c Mon Sep 17 00:00:00 2001
From: Babu Moger
Date: Fri, 5 Sep 2025 16:34:29 -0500
Subject: [PATCH 1716/3206] fs/resctrl: Disable BMEC event configuration when
 mbm_event mode is enabled

The BMEC (Bandwidth Monitoring Event Configuration) feature enables
per-domain event configuration. With BMEC the MBM events are configured
using the mbm_total_bytes_config or mbm_local_bytes_config files in
/sys/fs/resctrl/info/L3_MON/ and the per-domain event configuration affects
all monitor resource groups.

The mbm_event counter assignment mode enables counters to be assigned to
RMID (i.e. a monitor resource group), event pairs, with potentially unique
event configurations associated with every counter.

There may be systems that support both BMEC and mbm_event counter
assignment mode, but resctrl supporting both concurrently will present a
conflicting interface to the user with both per-domain and per RMID, event
configurations active at the same time. The mbm_event counter assignment
provides most flexibility to user space and aligns with Arm's counter
support.

On systems that support both, disable BMEC event configuration when
mbm_event mode is enabled by hiding the mbm_total_bytes_config and
mbm_local_bytes_config files. Ensure mon_features always displays accurate
information about monitor features.

Suggested-by: Reinette Chatre
Signed-off-by: Babu Moger
Signed-off-by: Borislav Petkov (AMD)
Reviewed-by: Reinette Chatre
Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com
---
 fs/resctrl/rdtgroup.c | 47 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index bd4a115ffea188..0c404a159d4558 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -1150,7 +1150,8 @@ static int rdt_mon_features_show(struct kernfs_open_file *of,
 		if (mevt->rid != r->rid || !mevt->enabled)
 			continue;
 		seq_printf(seq, "%s\n", mevt->name);
-		if (mevt->configurable)
+		if (mevt->configurable &&
+		    !resctrl_arch_mbm_cntr_assign_enabled(r))
 			seq_printf(seq, "%s_config\n", mevt->name);
 	}
 
@@ -1799,6 +1800,44 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
 	return ret ?: nbytes;
 }
 
+/*
+ * resctrl_bmec_files_show() — Controls the visibility of BMEC-related resctrl
+ * files. When @show is true, the files are displayed; when false, the files
+ * are hidden.
+ * Don't treat kernfs_find_and_get failure as an error, since this function may
+ * be called regardless of whether BMEC is supported or the event is enabled.
+ */
+static void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn,
+				    bool show)
+{
+	struct kernfs_node *kn_config, *mon_kn = NULL;
+	char name[32];
+
+	if (!l3_mon_kn) {
+		sprintf(name, "%s_MON", r->name);
+		mon_kn = kernfs_find_and_get(kn_info, name);
+		if (!mon_kn)
+			return;
+		l3_mon_kn = mon_kn;
+	}
+
+	kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_total_bytes_config");
+	if (kn_config) {
+		kernfs_show(kn_config, show);
+		kernfs_put(kn_config);
+	}
+
+	kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_local_bytes_config");
+	if (kn_config) {
+		kernfs_show(kn_config, show);
+		kernfs_put(kn_config);
+	}
+
+	/* Release the reference only if it was acquired */
+	if (mon_kn)
+		kernfs_put(mon_kn);
+}
+
 /* rdtgroup information files for one cache resource. */
 static struct rftype res_common_files[] = {
 	{
@@ -2267,6 +2306,12 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
 			ret = resctrl_mkdir_event_configs(r, kn_subdir);
 			if (ret)
 				return ret;
+			/*
+			 * Hide BMEC related files if mbm_event mode
+			 * is enabled.
+			 */
+			if (resctrl_arch_mbm_cntr_assign_enabled(r))
+				resctrl_bmec_files_show(r, kn_subdir, false);
 		}
 	}

From 8004ea01cf6338298e0c6ab055bc3ec659ce381b Mon Sep 17 00:00:00 2001
From: Babu Moger
Date: Fri, 5 Sep 2025 16:34:30 -0500
Subject: [PATCH 1717/3206] fs/resctrl: Introduce the interface to switch
 between monitor modes

The resctrl subsystem can support two monitoring modes, "mbm_event" or
"default".

In "mbm_event" mode, a monitoring event can only accumulate data while it
is backed by a hardware counter. In "default" mode, resctrl assumes there
is a hardware counter for each event within every CTRL_MON and MON group.

Introduce the mbm_assign_mode resctrl file to switch between the mbm_event
and default modes.

Example:

To list the MBM monitor modes supported:
  $ cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
  [mbm_event]
  default

To enable the "mbm_event" counter assignment mode:
  $ echo "mbm_event" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode

To enable the "default" monitoring mode:
  $ echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode

Reset MBM event counters automatically as part of changing the mode. Clear
both architectural and non-architectural event states to prevent overflow
conditions during the next event read. Clear assignable counter
configuration on all the domains. Also, enable auto assignment when
switching to "mbm_event" mode.

Signed-off-by: Babu Moger
Signed-off-by: Borislav Petkov (AMD)
Reviewed-by: Reinette Chatre
Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com
---
 Documentation/filesystems/resctrl.rst |  22 +++++-
 fs/resctrl/internal.h                 |   6 ++
 fs/resctrl/monitor.c                  | 100 ++++++++++++++++++++++++++
 fs/resctrl/rdtgroup.c                 |   7 +-
 4 files changed, 131 insertions(+), 4 deletions(-)

diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst
index f60f6a96cb6b81..006d23af66e19f 100644
--- a/Documentation/filesystems/resctrl.rst
+++ b/Documentation/filesystems/resctrl.rst
@@ -259,7 +259,8 @@ with the following files:
 
 "mbm_assign_mode":
 	The supported counter assignment modes. The enclosed brackets indicate which mode
-	is enabled.
+	is enabled. The MBM events associated with counters may reset when "mbm_assign_mode"
+	is changed.
	::

	  # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
@@ -279,6 +280,15 @@ with the following files:
 	of counters available is described in the "num_mbm_cntrs" file. Changing
 	the mode may cause all counters on the resource to reset.
 
+	Moving to mbm_event counter assignment mode requires users to assign the counters
+	to the events. Otherwise, the MBM event counters will return 'Unassigned' when read.
+
+	The mode is beneficial for AMD platforms that support more CTRL_MON
+	and MON groups than available hardware counters. By default, this
+	feature is enabled on AMD platforms with the ABMC (Assignable Bandwidth
+	Monitoring Counters) capability, ensuring counters remain assigned even
+	when the corresponding RMID is not actively used by any processor.
+
 	"default":
 
 	In default mode, resctrl assumes there is a hardware counter for each
@@ -288,6 +298,16 @@ with the following files:
 	result in misleading values or display "Unavailable" if no counter is
 	assigned to the event.
 
+	* To enable "mbm_event" counter assignment mode:
+	  ::
+
+	  # echo "mbm_event" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+
+	* To enable "default" monitoring mode:
+	  ::
+
+	  # echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+
 "num_mbm_cntrs":
 	The maximum number of counters (total of available and assigned counters) in
 	each domain when the system supports mbm_event mode.
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index c69b1da80d3f9e..cf1fd82dc5a99e 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -396,6 +396,12 @@ void *rdt_kn_parent_priv(struct kernfs_node *kn);
 int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v);
 
+ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf,
+				      size_t nbytes, loff_t off);
+
+void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn,
+			     bool show);
+
 int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v);
 
 int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s,
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index f388dbcdbdcd63..50c24460d992ab 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -1078,6 +1078,33 @@ ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf
 	return ret ?: nbytes;
 }
 
+/*
+ * mbm_cntr_free_all() - Clear all the counter ID configuration details in the
+ * domain @d. Called when mbm_assign_mode is changed.
+ */
+static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+	memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs);
+}
+
+/*
+ * resctrl_reset_rmid_all() - Reset all non-architectural state for all the
+ * supported RMIDs.
+ */
+static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+	enum resctrl_event_id evt;
+	int idx;
+
+	for_each_mbm_event_id(evt) {
+		if (!resctrl_is_mon_event_enabled(evt))
+			continue;
+		idx = MBM_STATE_IDX(evt);
+		memset(d->mbm_states[idx], 0, sizeof(*d->mbm_states[0]) * idx_limit);
+	}
+}
+
 /*
  * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID
  * pair in the domain.
@@ -1388,6 +1415,79 @@ int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of,
 	return 0;
 }
 
+ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf,
+				      size_t nbytes, loff_t off)
+{
+	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+	struct rdt_mon_domain *d;
+	int ret = 0;
+	bool enable;
+
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+
+	buf[nbytes - 1] = '\0';
+
+	cpus_read_lock();
+	mutex_lock(&rdtgroup_mutex);
+
+	rdt_last_cmd_clear();
+
+	if (!strcmp(buf, "default")) {
+		enable = 0;
+	} else if (!strcmp(buf, "mbm_event")) {
+		if (r->mon.mbm_cntr_assignable) {
+			enable = 1;
+		} else {
+			ret = -EINVAL;
+			rdt_last_cmd_puts("mbm_event mode is not supported\n");
+			goto out_unlock;
+		}
+	} else {
+		ret = -EINVAL;
+		rdt_last_cmd_puts("Unsupported assign mode\n");
+		goto out_unlock;
+	}
+
+	if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) {
+		ret = resctrl_arch_mbm_cntr_assign_set(r, enable);
+		if (ret)
+			goto out_unlock;
+
+		/* Update the visibility of BMEC related files */
+		resctrl_bmec_files_show(r, NULL, !enable);
+
+		/*
+		 * Initialize the default memory transaction values for
+		 * total and local events.
+		 */
+		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+			mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
+		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+			mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
+									   (READS_TO_LOCAL_MEM |
+									    READS_TO_LOCAL_S_MEM |
+									    NON_TEMP_WRITE_TO_LOCAL_MEM);
+		/* Enable auto assignment when switching to "mbm_event" mode */
+		if (enable)
+			r->mon.mbm_assign_on_mkdir = true;
+		/*
+		 * Reset all the non-architectural RMID state and assignable counters.
+		 */
+		list_for_each_entry(d, &r->mon_domains, hdr.list) {
+			mbm_cntr_free_all(r, d);
+			resctrl_reset_rmid_all(r, d);
+		}
+	}
+
+out_unlock:
+	mutex_unlock(&rdtgroup_mutex);
+	cpus_read_unlock();
+
+	return ret ?: nbytes;
+}
+
 int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of,
 			       struct seq_file *s, void *v)
 {
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 0c404a159d4558..0320360cd7a6eb 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -1807,8 +1807,8 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
  * Don't treat kernfs_find_and_get failure as an error, since this function may
  * be called regardless of whether BMEC is supported or the event is enabled.
  */
-static void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn,
-				    bool show)
+void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn,
+			     bool show)
 {
 	struct kernfs_node *kn_config, *mon_kn = NULL;
 	char name[32];
@@ -1985,9 +1985,10 @@ static struct rftype res_common_files[] = {
 	},
 	{
 		.name		= "mbm_assign_mode",
-		.mode		= 0444,
+		.mode		= 0644,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= resctrl_mbm_assign_mode_show,
+		.write		= resctrl_mbm_assign_mode_write,
 		.fflags		= RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
 	},
 	{

From 0f1576e43adc62756879a240e66e89ce386b6eb9 Mon Sep 17 00:00:00 2001
From: Babu Moger
Date: Fri, 5 Sep 2025 16:34:31 -0500
Subject: [PATCH 1718/3206] x86/resctrl: Configure mbm_event mode if supported

Configure mbm_event mode on AMD platforms. On AMD platforms, it is
recommended to use the mbm_event mode, if supported, to prevent the
hardware from resetting counters between reads.
Such resets can result in misleading values or in reads returning
"Unavailable" if no counter is assigned to the event.

Enable mbm_event mode, known as ABMC (Assignable Bandwidth Monitoring
Counters) on AMD, by default if the system supports it.

Update ABMC across all logical processors within the resctrl domain to
ensure proper functionality.

Signed-off-by: Babu Moger
Signed-off-by: Borislav Petkov (AMD)
Reviewed-by: Reinette Chatre
Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com
---
 arch/x86/kernel/cpu/resctrl/core.c     | 7 +++++++
 arch/x86/kernel/cpu/resctrl/internal.h | 1 +
 arch/x86/kernel/cpu/resctrl/monitor.c  | 8 ++++++++
 3 files changed, 16 insertions(+)

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 2e68aa02ad3f4c..06ca5a30140c2f 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -520,6 +520,9 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
 		d = container_of(hdr, struct rdt_mon_domain, hdr);
 
 		cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+		/* Update the mbm_assign_mode state for the CPU if supported */
+		if (r->mon.mbm_cntr_assignable)
+			resctrl_arch_mbm_cntr_assign_set_one(r);
 		return;
 	}
 
@@ -539,6 +542,10 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
 	d->ci_id = ci->id;
 	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
 
+	/* Update the mbm_assign_mode state for the CPU if supported */
+	if (r->mon.mbm_cntr_assignable)
+		resctrl_arch_mbm_cntr_assign_set_one(r);
+
 	arch_mon_domain_online(r, d);
 
 	if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index e5edddb290c9d3..9f4c2f0aaf5c80 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -215,5 +215,6 @@ bool rdt_cpu_has(int flag);
 void __init intel_rdt_mbm_apply_quirk(void);
 
 void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r);
 
 #endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 0b3c199e9e016d..c8945610d45550 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -456,6 +456,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
 		r->mon.mbm_cntr_assignable = true;
 		cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
 		r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
+		hw_res->mbm_cntr_assign_enabled = true;
 	}
 
 	r->mon_capable = true;
@@ -557,3 +558,10 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
 	if (am)
 		memset(am, 0, sizeof(*am));
 }
+
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
+{
+	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+	resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
+}

From d79bab8a48bfcf5495f72d10bf609478a4a3b916 Mon Sep 17 00:00:00 2001
From: Babu Moger
Date: Fri, 5 Sep 2025 16:34:32 -0500
Subject: [PATCH 1719/3206] MAINTAINERS: resctrl: Add myself as reviewer

I have been contributing to resctrl for some time now and I would like to
help with code reviews as well.
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Acked-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f6206963efbf0b..bee4e104728aeb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21175,6 +21175,7 @@ M: Tony Luck M: Reinette Chatre R: Dave Martin R: James Morse +R: Babu Moger L: linux-kernel@vger.kernel.org S: Supported F: Documentation/filesystems/resctrl.rst From 5337609a314828aa2474ac359db615f475c4a4d2 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 12 Sep 2025 10:27:38 -0400 Subject: [PATCH 1720/3206] powerpc/ftrace: ensure ftrace record ops are always set for NOPs When an ftrace call site is converted to a NOP, its corresponding dyn_ftrace record should have its ftrace_ops pointer set to ftrace_nop_ops. Correct the powerpc implementation to ensure the ftrace_rec_set_nop_ops() helper is called on all successful NOP initialization paths. This ensures all ftrace records are consistent before being handled by the ftrace core. Fixes: eec37961a56a ("powerpc64/ftrace: Move ftrace sequence out of line") Suggested-by: Naveen N Rao Signed-off-by: Joe Lawrence Acked-by: Naveen N Rao (AMD) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250912142740.3581368-2-joe.lawrence@redhat.com --- arch/powerpc/kernel/trace/ftrace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 6dca92d5a6e822..841d077e28251a 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -488,8 +488,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) return ret; /* Set up out-of-line stub */ - if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) - return ftrace_init_ool_stub(mod, rec); + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) { + ret = ftrace_init_ool_stub(mod, rec); + goto out; + } /* Nop-out the ftrace location */ new = ppc_inst(PPC_RAW_NOP()); @@ -520,6 +522,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) return -EINVAL; } +out: + if (!ret) + ret = ftrace_rec_set_nop_ops(rec); + return ret; } From f6b4df37ebfeb47e50e27780500d2d06b4d211bd Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 12 Sep 2025 10:27:39 -0400 Subject: [PATCH 1721/3206] powerpc64/modules: correctly iterate over stubs in setup_ftrace_ool_stubs CONFIG_PPC_FTRACE_OUT_OF_LINE introduced setup_ftrace_ool_stubs() to extend the ppc64le module .stubs section with an array of ftrace_ool_stub structures for each patchable function. Fix its ppc64_stub_entry stub reservation loop to properly write across all of the num_stubs used and not just the first entry. 
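A simplified sketch of the defect and the fix (illustrative only, not the
kernel sources; patch_u32(), PPC_RAW_NOP() and the stub layout are as used
by the code below):

	/* Before: every iteration patched the same first entry, because
	 * the loop index was never applied to the stub pointer. */
	for (i = 0; i < num_stubs; i++)
		patch_u32((void *)&stub->funcdata, PPC_RAW_NOP());

	/* After: each of the num_stubs reserved entries is nopped out. */
	for (i = 0; i < num_stubs; i++)
		patch_u32((void *)&stub[i].funcdata, PPC_RAW_NOP());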
Fixes: eec37961a56a ("powerpc64/ftrace: Move ftrace sequence out of line") Signed-off-by: Joe Lawrence Acked-by: Naveen N Rao (AMD) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250912142740.3581368-3-joe.lawrence@redhat.com --- arch/powerpc/kernel/module_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 126bf3b06ab7e2..0e45cac4de76b6 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -1139,7 +1139,7 @@ static int setup_ftrace_ool_stubs(const Elf64_Shdr *sechdrs, unsigned long addr, /* reserve stubs */ for (i = 0; i < num_stubs; i++) - if (patch_u32((void *)&stub->funcdata, PPC_RAW_NOP())) + if (patch_u32((void *)&stub[i].funcdata, PPC_RAW_NOP())) return -1; #endif From b137312fbf2dd1edc39acf7e8e6e8ac0a6ad72c0 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 12 Sep 2025 10:27:40 -0400 Subject: [PATCH 1722/3206] powerpc64/modules: replace stub allocation sentinel with an explicit counter The logic for allocating ppc64_stub_entry trampolines in the .stubs section relies on an inline sentinel, where a NULL .funcdata member indicates an available slot. While preceding commits fixed the initialization bugs that led to ftrace stub corruption, the sentinel-based approach remains fragile: it depends on an implicit convention between subsystems modifying different struct types in the same memory area. Replace the sentinel with an explicit counter, module->arch.num_stubs. Instead of iterating through memory to find a NULL marker, the module loader uses this counter as the boundary for the next free slot. This simplifies the allocation code, hardens it against future changes to stub structures, and removes the need for an extra relocation slot previously reserved to terminate the sentinel search. Signed-off-by: Joe Lawrence Acked-by: Naveen N Rao (AMD) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250912142740.3581368-4-joe.lawrence@redhat.com --- arch/powerpc/include/asm/module.h | 1 + arch/powerpc/kernel/module_64.c | 26 ++++++++------------------ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index e1ee5026ac4af4..864e22deaa2cd2 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -27,6 +27,7 @@ struct ppc_plt_entry { struct mod_arch_specific { #ifdef __powerpc64__ unsigned int stubs_section; /* Index of stubs section in module */ + unsigned int stub_count; /* Number of stubs used */ #ifdef CONFIG_PPC_KERNEL_PCREL unsigned int got_section; /* What section is the GOT? */ unsigned int pcpu_section; /* .data..percpu section */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 0e45cac4de76b6..2a44bc8e2439f2 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -209,8 +209,7 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, char *secstrings, struct module *me) { - /* One extra reloc so it's always 0-addr terminated */ - unsigned long relocs = 1; + unsigned long relocs = 0; unsigned i; /* Every relocated section... */ @@ -705,7 +704,7 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, /* Find this stub, or if that fails, the next avail. 
entry */ stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; - for (i = 0; stub_func_addr(stubs[i].funcdata); i++) { + for (i = 0; i < me->arch.stub_count; i++) { if (WARN_ON(i >= num_stubs)) return 0; @@ -716,6 +715,7 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, if (!create_stub(sechdrs, &stubs[i], addr, me, name)) return 0; + me->arch.stub_count++; return (unsigned long)&stubs[i]; } @@ -1118,29 +1118,19 @@ int module_trampoline_target(struct module *mod, unsigned long addr, static int setup_ftrace_ool_stubs(const Elf64_Shdr *sechdrs, unsigned long addr, struct module *me) { #ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE - unsigned int i, total_stubs, num_stubs; + unsigned int total_stubs, num_stubs; struct ppc64_stub_entry *stub; total_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stub); num_stubs = roundup(me->arch.ool_stub_count * sizeof(struct ftrace_ool_stub), sizeof(struct ppc64_stub_entry)) / sizeof(struct ppc64_stub_entry); - /* Find the next available entry */ - stub = (void *)sechdrs[me->arch.stubs_section].sh_addr; - for (i = 0; stub_func_addr(stub[i].funcdata); i++) - if (WARN_ON(i >= total_stubs)) - return -1; - - if (WARN_ON(i + num_stubs > total_stubs)) + if (WARN_ON(me->arch.stub_count + num_stubs > total_stubs)) return -1; - stub += i; - me->arch.ool_stubs = (struct ftrace_ool_stub *)stub; - - /* reserve stubs */ - for (i = 0; i < num_stubs; i++) - if (patch_u32((void *)&stub[i].funcdata, PPC_RAW_NOP())) - return -1; + stub = (void *)sechdrs[me->arch.stubs_section].sh_addr; + me->arch.ool_stubs = (struct ftrace_ool_stub *)(stub + me->arch.stub_count); + me->arch.stub_count += num_stubs; #endif return 0; From 6798668ab19557520876d0f7dfabca827ee8b524 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Thu, 4 Sep 2025 10:51:18 +0000 Subject: [PATCH 1723/3206] riscv, bpf: Remove duplicated bpf_flush_icache() The bpf_flush_icache() is done by bpf_arch_text_copy() already. Remove the duplicated one in arch_prepare_bpf_trampoline(). Signed-off-by: Hengqi Chen Signed-off-by: Daniel Borkmann Reviewed-by: Pu Lehui Link: https://lore.kernel.org/bpf/20250904105119.21861-1-hengqi.chen@gmail.com --- arch/riscv/net/bpf_jit_comp64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 2f7188e0340a5c..886b647dc206fb 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1269,7 +1269,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, goto out; } - bpf_flush_icache(ro_image, ro_image_end); out: kvfree(image); return ret < 0 ? ret : size; From fd2e08128944a7679e753f920e9eda72057e427c Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Mon, 8 Sep 2025 01:24:48 +0000 Subject: [PATCH 1724/3206] riscv, bpf: Sign extend struct ops return values properly The ns_bpf_qdisc selftest triggers a kernel panic: Unable to handle kernel paging request at virtual address ffffffffa38dbf58 Current test_progs pgtable: 4K pagesize, 57-bit VAs, pgdp=0x00000001109cc000 [ffffffffa38dbf58] pgd=000000011fffd801, p4d=000000011fffd401, pud=000000011fffd001, pmd=0000000000000000 Oops [#1] Modules linked in: bpf_testmod(OE) xt_conntrack nls_iso8859_1 [...] 
[last unloaded: bpf_testmod(OE)]
CPU: 1 UID: 0 PID: 23584 Comm: test_progs Tainted: G W OE 6.17.0-rc1-g2465bb83e0b4 #1 NONE
Tainted: [W]=WARN, [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
Hardware name: Unknown Unknown Product/Unknown Product, BIOS 2024.01+dfsg-1ubuntu5.1 01/01/2024
epc : __qdisc_run+0x82/0x6f0
 ra : __qdisc_run+0x6e/0x6f0
epc : ffffffff80bd5c7a ra : ffffffff80bd5c66 sp : ff2000000eecb550
 gp : ffffffff82472098 tp : ff60000096895940 t0 : ffffffff8001f180
 t1 : ffffffff801e1664 t2 : 0000000000000000 s0 : ff2000000eecb5d0
 s1 : ff60000093a6a600 a0 : ffffffffa38dbee8 a1 : 0000000000000001
 a2 : ff2000000eecb510 a3 : 0000000000000001 a4 : 0000000000000000
 a5 : 0000000000000010 a6 : 0000000000000000 a7 : 0000000000735049
 s2 : ffffffffa38dbee8 s3 : 0000000000000040 s4 : ff6000008bcda000
 s5 : 0000000000000008 s6 : ff60000093a6a680 s7 : ff60000093a6a6f0
 s8 : ff60000093a6a6ac s9 : ff60000093140000 s10: 0000000000000000
 s11: ff2000000eecb9d0 t3 : 0000000000000000 t4 : 0000000000ff0000
 t5 : 0000000000000000 t6 : ff60000093a6a8b6
status: 0000000200000120 badaddr: ffffffffa38dbf58 cause: 000000000000000d
[] __qdisc_run+0x82/0x6f0
[] __dev_queue_xmit+0x4c0/0x1128
[] neigh_resolve_output+0xd0/0x170
[] ip6_finish_output2+0x226/0x6c8
[] ip6_finish_output+0x10c/0x2a0
[] ip6_output+0x5e/0x178
[] ip6_xmit+0x29a/0x608
[] inet6_csk_xmit+0xe6/0x140
[] __tcp_transmit_skb+0x45c/0xaa8
[] tcp_connect+0x9ce/0xd10
[] tcp_v6_connect+0x4ac/0x5e8
[] __inet_stream_connect+0xd8/0x318
[] inet_stream_connect+0x3e/0x68
[] __sys_connect_file+0x50/0x88
[] __sys_connect+0x96/0xc8
[] __riscv_sys_connect+0x20/0x30
[] do_trap_ecall_u+0x256/0x378
[] handle_exception+0x14a/0x156
Code: 892a 0363 1205 489c 8bc1 c7e5 2d03 084a 2703 080a (2783) 0709
---[ end trace 0000000000000000 ]---

The bpf_fifo_dequeue prog returns a skb, which is a pointer. The pointer is
treated as a 32-bit value and sign-extended to 64 bits in the epilogue. This
behavior is right for most BPF prog types but wrong for struct ops, whose
return values must conform to the RISC-V ABI. So let's sign extend struct
ops return values according to the function model and the RISC-V ABI([0]).
[0]: https://riscv.org/wp-content/uploads/2024/12/riscv-calling.pdf

Fixes: 25ad10658dc1 ("riscv, bpf: Adapt bpf trampoline to optimized riscv ftrace framework")
Signed-off-by: Hengqi Chen
Signed-off-by: Daniel Borkmann
Tested-by: Pu Lehui
Reviewed-by: Pu Lehui
Link: https://lore.kernel.org/bpf/20250908012448.1695-1-hengqi.chen@gmail.com
---
 arch/riscv/net/bpf_jit_comp64.c | 42 ++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 886b647dc206fb..14d7aab61fcb36 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -711,6 +711,39 @@ static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn,
 	return 0;
 }
 
+/*
+ * Sign-extend the register if necessary
+ */
+static int sign_extend(u8 rd, u8 rs, u8 sz, bool sign, struct rv_jit_context *ctx)
+{
+	if (!sign && (sz == 1 || sz == 2)) {
+		if (rd != rs)
+			emit_mv(rd, rs, ctx);
+		return 0;
+	}
+
+	switch (sz) {
+	case 1:
+		emit_sextb(rd, rs, ctx);
+		break;
+	case 2:
+		emit_sexth(rd, rs, ctx);
+		break;
+	case 4:
+		emit_sextw(rd, rs, ctx);
+		break;
+	case 8:
+		if (rd != rs)
+			emit_mv(rd, rs, ctx);
+		break;
+	default:
+		pr_err("bpf-jit: invalid size %d for sign_extend\n", sz);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 #define BPF_FIXUP_OFFSET_MASK	GENMASK(26, 0)
 #define BPF_FIXUP_REG_MASK	GENMASK(31, 27)
 #define REG_DONT_CLEAR_MARKER	0 /* RV_REG_ZERO unused in pt_regmap */
@@ -1175,8 +1208,15 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 		restore_args(min_t(int, nr_arg_slots, RV_MAX_REG_ARGS), args_off, ctx);
 
 	if (save_ret) {
-		emit_ld(RV_REG_A0, -retval_off, RV_REG_FP, ctx);
 		emit_ld(regmap[BPF_REG_0], -(retval_off - 8), RV_REG_FP, ctx);
+		if (is_struct_ops) {
+			ret = sign_extend(RV_REG_A0, regmap[BPF_REG_0], m->ret_size,
+					  m->ret_flags & BTF_FMODEL_SIGNED_ARG, ctx);
+			if (ret)
+				goto out;
+		} else {
+			emit_ld(RV_REG_A0, -retval_off, RV_REG_FP, ctx);
+		}
 	}
 
 	emit_ld(RV_REG_S1, -sreg_off, RV_REG_FP, ctx);

From 3c17001b21b9f168c957ced9384abe969019b609 Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Fri, 12 Sep 2025 13:52:24 +0200
Subject: [PATCH 1725/3206] pidfs: validate extensible ioctls

Validate extensible ioctls more strictly than we do now.

Reviewed-by: Aleksa Sarai
Reviewed-by: Jan Kara
Signed-off-by: Christian Brauner
---
 fs/pidfs.c         |  2 +-
 include/linux/fs.h | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/fs/pidfs.c b/fs/pidfs.c
index edc35522d75c49..0a5083b9cce5b8 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd)
 		 * erroneously mistook the file descriptor for a pidfd.
 		 * This is not perfect but will catch most cases.
		 */
-		return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO));
+		return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
 	}
 
 	return false;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d7ab4f96d7051f..2f2edc53bf3c85 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -4023,4 +4023,18 @@ static inline bool vfs_empty_path(int dfd, const char __user *path)
 
 int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
 
+static inline bool extensible_ioctl_valid(unsigned int cmd_a,
+					  unsigned int cmd_b, size_t min_size)
+{
+	if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b))
+		return false;
+	if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b))
+		return false;
+	if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b))
+		return false;
+	if (_IOC_SIZE(cmd_a) < min_size)
+		return false;
+	return true;
+}
+
 #endif /* _LINUX_FS_H */

From 4d906371d1f9fc9ce47b2c8f37444680246557bc Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Fri, 12 Sep 2025 13:52:25 +0200
Subject: [PATCH 1726/3206] nsfs: drop tautological ioctl() check

This check does absolutely nothing. So drop it.

Signed-off-by: Christian Brauner
---
 fs/nsfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 59aa801347a7de..ce51041bb62435 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -163,7 +163,7 @@ static bool nsfs_ioctl_valid(unsigned int cmd)
 	case NS_GET_TGID_FROM_PIDNS:
 	case NS_GET_PID_IN_PIDNS:
 	case NS_GET_TGID_IN_PIDNS:
-		return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+		return true;
 	}
 
 	/* Extensible ioctls require some extra handling. */

From f8527a29f4619f74bc30a9845ea87abb9a6faa1e Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Fri, 12 Sep 2025 13:52:26 +0200
Subject: [PATCH 1727/3206] nsfs: validate extensible ioctls

Validate extensible ioctls more strictly than we do now.

Reviewed-by: Jan Kara
Signed-off-by: Christian Brauner
---
 fs/nsfs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index ce51041bb62435..d016d36272d49e 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -169,9 +169,11 @@ static bool nsfs_ioctl_valid(unsigned int cmd)
 	/* Extensible ioctls require some extra handling. */
 	switch (_IOC_NR(cmd)) {
 	case _IOC_NR(NS_MNT_GET_INFO):
+		return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0);
 	case _IOC_NR(NS_MNT_GET_NEXT):
+		return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0);
 	case _IOC_NR(NS_MNT_GET_PREV):
-		return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+		return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0);
 	}
 
 	return false;

From d3aeb6d97b22272bb4783c6d4309d81bb0a4527c Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Fri, 5 Sep 2025 22:57:29 +0200
Subject: [PATCH 1728/3206] uprobes/x86: Return error from uprobe syscall when
 not called from trampoline

Currently the uprobe syscall handles all errors by forcing SIGILL on the
current process. As suggested by Andrii, it'd be helpful for uprobe
syscall detection to return an error value for the !in_uprobe_trampoline
check. This way we can just call the uprobe syscall and, based on the
return value, find out whether the kernel has it.
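A minimal user-space feature probe built on this behavior could look as
follows (a sketch; __NR_uprobe is assumed to be defined for the target
architecture, as the selftest below does):

	#include <errno.h>
	#include <stdbool.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static bool kernel_has_uprobe_syscall(void)
	{
		/* Outside a uprobe trampoline the syscall now fails with
		 * ENXIO instead of delivering SIGILL. */
		return syscall(__NR_uprobe) == -1 && errno == ENXIO;
	}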
Suggested-by: Andrii Nakryiko
Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Andrii Nakryiko
Acked-by: Oleg Nesterov
---
 arch/x86/kernel/uprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 0a8c0a4a54233a..845aeaf36b8d2f 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -810,7 +810,7 @@ SYSCALL_DEFINE0(uprobe)
 
 	/* Allow execution only from uprobe trampolines. */
 	if (!in_uprobe_trampoline(regs->ip))
-		goto sigill;
+		return -ENXIO;
 
 	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
 	if (err)

From 6d48436560e91be858158e227f21aab71698814e Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Fri, 5 Sep 2025 22:57:30 +0200
Subject: [PATCH 1729/3206] selftests/bpf: Fix uprobe_sigill test for uprobe
 syscall error value

The uprobe syscall now returns the -ENXIO error when called outside the
uprobe trampoline. Fix the current sigill test to reflect that and rename
it to uprobe_error.

Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Andrii Nakryiko
---
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 34 ++++---------
 1 file changed, 6 insertions(+), 28 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 5da0b49eeacaed..6d75ede16e7ced 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -757,34 +757,12 @@ static void test_uprobe_race(void)
 #define __NR_uprobe 336
 #endif
 
-static void test_uprobe_sigill(void)
+static void test_uprobe_error(void)
 {
-	int status, err, pid;
+	long err = syscall(__NR_uprobe);
 
-	pid = fork();
-	if (!ASSERT_GE(pid, 0, "fork"))
-		return;
-	/* child */
-	if (pid == 0) {
-		asm volatile (
-			"pushq %rax\n"
-			"pushq %rcx\n"
-			"pushq %r11\n"
-			"movq $" __stringify(__NR_uprobe) ", %rax\n"
-			"syscall\n"
-			"popq %r11\n"
-			"popq %rcx\n"
-			"retq\n"
-		);
-		exit(0);
-	}
-
-	err = waitpid(pid, &status, 0);
-	ASSERT_EQ(err, pid, "waitpid");
-
-	/* verify the child got killed with SIGILL */
-	ASSERT_EQ(WIFSIGNALED(status), 1, "WIFSIGNALED");
-	ASSERT_EQ(WTERMSIG(status), SIGILL, "WTERMSIG");
+	ASSERT_EQ(err, -1, "error");
+	ASSERT_EQ(errno, ENXIO, "errno");
 }
 
 static void __test_uprobe_syscall(void)
@@ -805,8 +783,8 @@ static void __test_uprobe_syscall(void)
 		test_uprobe_usdt();
 	if (test__start_subtest("uprobe_race"))
 		test_uprobe_race();
-	if (test__start_subtest("uprobe_sigill"))
-		test_uprobe_sigill();
+	if (test__start_subtest("uprobe_error"))
+		test_uprobe_error();
 	if (test__start_subtest("uprobe_regs_equal"))
 		test_uprobe_regs_equal(false);
 	if (test__start_subtest("regs_change"))

From 8b184c34806e5da4d4847fabd3faeff38b47e70a Mon Sep 17 00:00:00 2001
From: Bard Liao
Date: Mon, 15 Sep 2025 10:48:53 +0800
Subject: [PATCH 1730/3206] ASoC: Intel: hda-sdw-bpt: set persistent_buffer
 false
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The persistent_buffer argument is false when hda_cl_prepare() is called.
We should use the same value when hda_cl_cleanup() is called.
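Sketched schematically (the elided arguments are placeholders; only the
cleanup call below is taken verbatim from the driver):

	/* Both calls must pass the same persistent_buffer value. */
	hda_cl_prepare(..., /* persistent_buffer */ false, ...);
	...
	ret = hda_cl_cleanup(sdev->dev, dmab_bdl, false, sdw_bpt_stream);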
Fixes: 5d5cb86fb46ea ("ASoC: SOF: Intel: hda-sdw-bpt: add helpers for SoundWire BPT DMA") Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20250915024853.1153518-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-sdw-bpt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sof/intel/hda-sdw-bpt.c b/sound/soc/sof/intel/hda-sdw-bpt.c index 1327f1cad0bcd9..ff5abccf0d88b6 100644 --- a/sound/soc/sof/intel/hda-sdw-bpt.c +++ b/sound/soc/sof/intel/hda-sdw-bpt.c @@ -150,7 +150,7 @@ static int hda_sdw_bpt_dma_deprepare(struct device *dev, struct hdac_ext_stream u32 mask; int ret; - ret = hda_cl_cleanup(sdev->dev, dmab_bdl, true, sdw_bpt_stream); + ret = hda_cl_cleanup(sdev->dev, dmab_bdl, false, sdw_bpt_stream); if (ret < 0) { dev_err(sdev->dev, "%s: SDW BPT DMA cleanup failed\n", __func__); From 18dda9eb9e11b2aeec73cbe2a56ab2f862841ba4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 13 Sep 2025 21:16:11 +0100 Subject: [PATCH 1731/3206] spi: amlogic: Fix error checking on regmap_write call Currently a call to regmap_write is not being error checked because the return check is performed on the variable ret, which is not assigned the return value from the regmap_write call. Fix this by adding the missing assignment. Fixes: 4670db6f32e9 ("spi: amlogic: add driver for Amlogic SPI Flash Controller") Signed-off-by: Colin Ian King Link: https://patch.msgid.link/20250913201612.1338217-1-colin.i.king@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-amlogic-spifc-a4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-amlogic-spifc-a4.c b/drivers/spi/spi-amlogic-spifc-a4.c index 4ca8e82fdc67db..4338d00e56a6e8 100644 --- a/drivers/spi/spi-amlogic-spifc-a4.c +++ b/drivers/spi/spi-amlogic-spifc-a4.c @@ -420,7 +420,7 @@ static int aml_sfc_dma_buffer_setup(struct aml_sfc *sfc, void *databuf, goto out_map_data; cmd = CMD_DATA_ADDRH(sfc->daddr); - regmap_write(sfc->regmap_base, SFC_CMD, cmd); + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); if (ret) goto out_map_data; From c7a321e4e90e1bd072697bc050b9426e04cffc6a Mon Sep 17 00:00:00 2001 From: Mohammad Rafi Shaik Date: Sun, 14 Sep 2025 18:45:49 +0530 Subject: [PATCH 1732/3206] ASoC: qcom: sc8280xp: Fix sound card driver name match data for QCS8275 The QCS8275 board is based on Qualcomm's QCS8300 SoC family, and all supported firmware files are located in the qcs8300 directory. The sound topology and ALSA UCM configuration files have also been migrated from the qcs8275 directory to the actual SoC qcs8300 directory in linux-firmware. With the current setup, the sound topology fails to load, resulting in sound card registration failure. This patch updates the driver match data to use the correct driver name qcs8300 for the qcs8275-sndcard, ensuring that the sound card driver correctly loads the sound topology and ALSA UCM configuration files from the qcs8300 directory.
Fixes: 34d340d48e595 ("ASoC: qcom: sc8280xp: Add support for QCS8275") Cc: stable@vger.kernel.org Signed-off-by: Mohammad Rafi Shaik Reviewed-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250914131549.1198740-1-mohammad.rafi.shaik@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/qcom/sc8280xp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/qcom/sc8280xp.c b/sound/soc/qcom/sc8280xp.c index 73f9f82c4e2581..db48168b7d3f5a 100644 --- a/sound/soc/qcom/sc8280xp.c +++ b/sound/soc/qcom/sc8280xp.c @@ -186,7 +186,7 @@ static int sc8280xp_platform_probe(struct platform_device *pdev) static const struct of_device_id snd_sc8280xp_dt_match[] = { {.compatible = "qcom,qcm6490-idp-sndcard", "qcm6490"}, {.compatible = "qcom,qcs6490-rb3gen2-sndcard", "qcs6490"}, - {.compatible = "qcom,qcs8275-sndcard", "qcs8275"}, + {.compatible = "qcom,qcs8275-sndcard", "qcs8300"}, {.compatible = "qcom,qcs9075-sndcard", "qcs9075"}, {.compatible = "qcom,qcs9100-sndcard", "qcs9100"}, {.compatible = "qcom,sc8280xp-sndcard", "sc8280xp"}, From 73caf2bcf3f0a3843c091b78b0711e356f8a2ef9 Mon Sep 17 00:00:00 2001 From: Mac Chiang Date: Mon, 15 Sep 2025 10:54:56 +0800 Subject: [PATCH 1733/3206] ASoC: Intel: sof_sdw: use PRODUCT_FAMILY for Fatcat series PRODUCT_NAME is machine-specific. Use PRODUCT_FAMILY to ensure the machine quirk is applied with consistent audio configurations across Fatcat series products. Signed-off-by: Mac Chiang Reviewed-by: Kai Vehmanen Signed-off-by: Bard Liao Link: https://patch.msgid.link/20250915025456.1154200-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index f997b2dc221b2b..28f03a5f29f741 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -761,7 +761,7 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { .callback = sof_sdw_quirk_cb, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Google"), - DMI_MATCH(DMI_PRODUCT_NAME, "Fatcat"), + DMI_MATCH(DMI_PRODUCT_FAMILY, "Google_Fatcat"), }, .driver_data = (void *)(SOC_SDW_PCH_DMIC | SOF_BT_OFFLOAD_SSP(2) | From d7871f400cad1da376f1d7724209a1c49226c456 Mon Sep 17 00:00:00 2001 From: Venkata Prasad Potturu Date: Wed, 10 Sep 2025 22:43:59 +0530 Subject: [PATCH 1734/3206] ASoC: amd: acp: Fix incorrect retrieval of acp_chip_info Use dev_get_drvdata(dev->parent) instead of dev_get_platdata(dev) to correctly obtain acp_chip_info members in the acp I2S driver. Previously, some members were not updated properly due to incorrect data access, which could potentially lead to null pointer dereferences. This issue was missed in the earlier commit ("ASoC: amd: acp: Fix NULL pointer deref in acp_i2s_set_tdm_slot"), which only addressed set_tdm_slot(). This change ensures that all relevant functions correctly retrieve acp_chip_info, preventing further null pointer dereference issues.
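For context, a sketch of the pattern being relied on, under the assumption (implied by this fix) that the parent ACP platform driver publishes its chip info as driver data on its own device; the probe function name is hypothetical and struct acp_chip_info is the driver's own type:

	#include <linux/device.h>
	#include <linux/platform_device.h>
	#include <linux/slab.h>

	/* In the parent ACP driver (hypothetical probe): */
	static int acp_parent_probe(struct platform_device *pdev)
	{
		struct acp_chip_info *chip;

		chip = devm_kzalloc(&pdev->dev, sizeof(*chip), GFP_KERNEL);
		if (!chip)
			return -ENOMEM;

		/* This is what dev_get_drvdata(dev->parent) returns in the
		 * child I2S component; dev_get_platdata(dev) would instead
		 * read the (here absent) platform data and yield a stale or
		 * NULL pointer.
		 */
		dev_set_drvdata(&pdev->dev, chip);
		return 0;
	}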
Fixes: e3933683b25e ("ASoC: amd: acp: Remove redundant acp_dev_data structure") Signed-off-by: Venkata Prasad Potturu Reviewed-by: Cezary Rojewski Link: https://patch.msgid.link/20250910171419.3682468-1-venkataprasad.potturu@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-i2s.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sound/soc/amd/acp/acp-i2s.c b/sound/soc/amd/acp/acp-i2s.c index 617690362ad75c..4ba0a66981ea9d 100644 --- a/sound/soc/amd/acp/acp-i2s.c +++ b/sound/soc/amd/acp/acp-i2s.c @@ -73,7 +73,7 @@ static int acp_i2s_set_fmt(struct snd_soc_dai *cpu_dai, unsigned int fmt) { struct device *dev = cpu_dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); int mode; mode = fmt & SND_SOC_DAIFMT_FORMAT_MASK; @@ -199,7 +199,7 @@ static int acp_i2s_hwparams(struct snd_pcm_substream *substream, struct snd_pcm_ u32 reg_val, fmt_reg, tdm_fmt; u32 lrclk_div_val, bclk_div_val; - chip = dev_get_platdata(dev); + chip = dev_get_drvdata(dev->parent); rsrc = chip->rsrc; /* These values are as per Hardware Spec */ @@ -386,7 +386,7 @@ static int acp_i2s_trigger(struct snd_pcm_substream *substream, int cmd, struct { struct acp_stream *stream = substream->runtime->private_data; struct device *dev = dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); struct acp_resource *rsrc = chip->rsrc; u32 val, period_bytes, reg_val, ier_val, water_val, buf_size, buf_reg; @@ -516,14 +516,13 @@ static int acp_i2s_trigger(struct snd_pcm_substream *substream, int cmd, struct static int acp_i2s_prepare(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { struct device *dev = dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); struct acp_resource *rsrc = chip->rsrc; struct acp_stream *stream = substream->runtime->private_data; u32 reg_dma_size = 0, reg_fifo_size = 0, reg_fifo_addr = 0; u32 phy_addr = 0, acp_fifo_addr = 0, ext_int_ctrl; unsigned int dir = substream->stream; - chip = dev_get_platdata(dev); switch (dai->driver->id) { case I2S_SP_INSTANCE: if (dir == SNDRV_PCM_STREAM_PLAYBACK) { @@ -632,7 +631,7 @@ static int acp_i2s_startup(struct snd_pcm_substream *substream, struct snd_soc_d { struct acp_stream *stream = substream->runtime->private_data; struct device *dev = dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); struct acp_resource *rsrc = chip->rsrc; unsigned int dir = substream->stream; unsigned int irq_bit = 0; From cc648f4dde2ffbb74b9c1626e3eaaac89c6fbe16 Mon Sep 17 00:00:00 2001 From: Balamurugan C Date: Mon, 15 Sep 2025 10:56:54 +0800 Subject: [PATCH 1735/3206] ASoC: Intel: PTL: Add entry for HDMI-In capture support to non-I2S codec boards. Add HDMI-In capture support for the PTL products which don't have an onboard I2S codec but still need to support HDMI-In capture via I2S and audio playback through an HDMI/DP monitor.
Signed-off-by: Balamurugan C Signed-off-by: Bard Liao Link: https://patch.msgid.link/20250915025655.1154279-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_ssp_amp.c | 6 ++++++ sound/soc/intel/common/soc-acpi-intel-ptl-match.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/sound/soc/intel/boards/sof_ssp_amp.c b/sound/soc/intel/boards/sof_ssp_amp.c index 48ee5353bdf1c4..729c0cd7c19c25 100644 --- a/sound/soc/intel/boards/sof_ssp_amp.c +++ b/sound/soc/intel/boards/sof_ssp_amp.c @@ -216,6 +216,12 @@ static const struct platform_device_id board_ids[] = { /* SSP 0 and SSP 2 are used for HDMI IN */ SOF_HDMI_PLAYBACK_PRESENT), }, + { + .name = "ptl_lt6911_hdmi_ssp", + .driver_data = (kernel_ulong_t)(SOF_SSP_MASK_HDMI_CAPTURE(0x5) | + /* SSP 0 and SSP 2 are used for HDMI IN */ + SOF_HDMI_PLAYBACK_PRESENT), + }, { } }; MODULE_DEVICE_TABLE(platform, board_ids); diff --git a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c index e292701dfcfe5b..3c8b10e21ceb83 100644 --- a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c @@ -61,6 +61,12 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_ptl_machines[] = { SND_SOC_ACPI_TPLG_INTEL_SSP_MSB | SND_SOC_ACPI_TPLG_INTEL_DMIC_NUMBER, }, + /* place amp-only boards in the end of table */ + { + .id = "INTC10B0", + .drv_name = "ptl_lt6911_hdmi_ssp", + .sof_tplg_filename = "sof-ptl-hdmi-ssp02.tplg", + }, {}, }; EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_ptl_machines); From 66d938e89e940e512f4c3deac938ecef399c13f9 Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Fri, 5 Sep 2025 09:59:25 +0800 Subject: [PATCH 1736/3206] netfs: Prevent duplicate unlocking The folio lock has been released here, so there is no need to jump to error_folio_unlock to release it again. Reported-by: syzbot+b73c7d94a151e2ee1e9b@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b73c7d94a151e2ee1e9b Signed-off-by: Lizhi Xu Acked-by: David Howells Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index f27ea5099a6813..09394ac2c180d3 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -347,7 +347,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, folio_put(folio); ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1); if (ret < 0) - goto error_folio_unlock; + goto out; continue; copied: From fa1d117162aa820a8f73a31ccab85bde6c84725b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 16 Aug 2025 11:06:32 +0100 Subject: [PATCH 1737/3206] x86/cpu: Detect FreeBSD Bhyve hypervisor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Detect the Bhyve hypervisor and enable 15-bit MSI support if available. Detecting Bhyve used to be a purely cosmetic issue of the kernel printing 'Hypervisor detected: Bhyve' at boot time. But FreeBSD 15.0 will support¹ the 15-bit MSI enlightenment to support more than 255 vCPUs (http://david.woodhou.se/ExtDestId.pdf) which means there's now actually some functional reason to do so. ¹ https://github.com/freebsd/freebsd-src/commit/313a68ea20b4 [ bp: Massage, move tail comment on top. ] Signed-off-by: David Woodhouse Signed-off-by: Borislav Petkov (AMD) Acked-by: Ahmed S. Darwish
Link: https://lore.kernel.org/03802f6f7f5b5cf8c5e8adfe123c397ca8e21093.camel@infradead.org --- arch/x86/Kconfig | 9 +++++ arch/x86/include/asm/hypervisor.h | 2 + arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/bhyve.c | 66 +++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/hypervisor.c | 3 ++ 5 files changed, 81 insertions(+) create mode 100644 arch/x86/kernel/cpu/bhyve.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52c8910ba2efdd..2be5a50f00c292 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -879,6 +879,15 @@ config ACRN_GUEST IOT with small footprint and real-time features. More details can be found in https://projectacrn.org/. +config BHYVE_GUEST + bool "Bhyve (BSD Hypervisor) Guest support" + depends on X86_64 + help + This option allows Linux to recognise when it is running as a + guest in the Bhyve hypervisor, and to support more than 255 vCPUs when + doing so. More details about Bhyve can be found at https://bhyve.org + and https://wiki.freebsd.org/bhyve/. + config INTEL_TDX_GUEST bool "Intel TDX (Trust Domain Extensions) - Guest Support" depends on X86_64 && CPU_SUP_INTEL diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index e41cbf2ec41d20..9ad86a7d13f6d7 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -30,6 +30,7 @@ enum x86_hypervisor_type { X86_HYPER_KVM, X86_HYPER_JAILHOUSE, X86_HYPER_ACRN, + X86_HYPER_BHYVE, }; #ifdef CONFIG_HYPERVISOR_GUEST @@ -64,6 +65,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_pv; extern const struct hypervisor_x86 x86_hyper_kvm; extern const struct hypervisor_x86 x86_hyper_jailhouse; extern const struct hypervisor_x86 x86_hyper_acrn; +extern const struct hypervisor_x86 x86_hyper_bhyve; extern struct hypervisor_x86 x86_hyper_xen_hvm; extern bool nopv; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1e26179ff18c4a..2f8a58ef690ec7 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_BHYVE_GUEST) += bhyve.o obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c new file mode 100644 index 00000000000000..f1a8ca3dd1ed86 --- /dev/null +++ b/arch/x86/kernel/cpu/bhyve.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FreeBSD Bhyve guest enlightenments + * + * Copyright © 2025 Amazon.com, Inc. or its affiliates.
+ * + * Author: David Woodhouse + */ + +#include +#include +#include +#include + +static uint32_t bhyve_cpuid_base; +static uint32_t bhyve_cpuid_max; + +#define BHYVE_SIGNATURE "bhyve bhyve " + +#define CPUID_BHYVE_FEATURES 0x40000001 + +/* Features advertised in CPUID_BHYVE_FEATURES %eax */ + +/* MSI Extended Dest ID */ +#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) + +static uint32_t __init bhyve_detect(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return 0; + + bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0); + if (!bhyve_cpuid_base) + return 0; + + bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base); + return bhyve_cpuid_max; +} + +static uint32_t bhyve_features(void) +{ + unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES; + + if (bhyve_cpuid_max < cpuid_leaf) + return 0; + + return cpuid_eax(cpuid_leaf); +} + +static bool __init bhyve_ext_dest_id(void) +{ + return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID); +} + +static bool __init bhyve_x2apic_available(void) +{ + return true; +} + +const struct hypervisor_x86 x86_hyper_bhyve __refconst = { + .name = "Bhyve", + .detect = bhyve_detect, + .init.init_platform = x86_init_noop, + .init.x2apic_available = bhyve_x2apic_available, + .init.msi_ext_dest_id = bhyve_ext_dest_id, +}; diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 553bfbfc3a1b5a..f3e9219845e858 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -45,6 +45,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_ACRN_GUEST &x86_hyper_acrn, #endif +#ifdef CONFIG_BHYVE_GUEST + &x86_hyper_bhyve, +#endif }; enum x86_hypervisor_type x86_hyper_type; From 013e484dbd687a9174acf8f4450217bdb86ad788 Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Tue, 19 Aug 2025 15:39:51 +0000 Subject: [PATCH 1738/3206] drm/xe/tile: Release kobject for the failure path Call kobject_put() for the failure path to release the kobject. v2: remove extra newline.
(Matt) Fixes: e3d0839aa501 ("drm/xe/tile: Abort driver load for sysfs creation failure") Cc: Himal Prasad Ghimiray Reviewed-by: Matthew Brost Signed-off-by: Shuicheng Lin Link: https://lore.kernel.org/r/20250819153950.2973344-2-shuicheng.lin@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit b98775bca99511cc22ab459a2de646cd2fa7241f) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_tile_sysfs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_tile_sysfs.c b/drivers/gpu/drm/xe/xe_tile_sysfs.c index b804234a655160..9e1236a9ec6734 100644 --- a/drivers/gpu/drm/xe/xe_tile_sysfs.c +++ b/drivers/gpu/drm/xe/xe_tile_sysfs.c @@ -44,16 +44,18 @@ int xe_tile_sysfs_init(struct xe_tile *tile) kt->tile = tile; err = kobject_add(&kt->base, &dev->kobj, "tile%d", tile->id); - if (err) { - kobject_put(&kt->base); - return err; - } + if (err) + goto err_object; tile->sysfs = &kt->base; err = xe_vram_freq_sysfs_init(tile); if (err) - return err; + goto err_object; return devm_add_action_or_reset(xe->drm.dev, tile_sysfs_fini, tile); + +err_object: + kobject_put(&kt->base); + return err; } From fef8b64e48e836344574b85132a1c317f4260022 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 11 Sep 2025 00:24:39 +0200 Subject: [PATCH 1739/3206] drm/xe/pf: Drop rounddown_pow_of_two fair LMEM limitation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This effectively reverts commit 4c3fe5eae46b ("drm/xe/pf: Limit fair VF LMEM provisioning") since we don't need it any more after non-contig VRAM allocations were fixed. This allows larger LMEM auto-provisioning for VFs, so instead of: [ ] GT0: PF: LMEM available(14096M) fair(1 x 8192M) [ ] GT0: PF: VF1 provisioned with 8589934592 (8.00 GiB) LMEM or [ ] GT0: PF: LMEM available(14096M) fair(2 x 4096M) [ ] GT0: PF: VF1..VF2 provisioned with 4294967296 (4.00 GiB) LMEM we may get: [ ] GT0: PF: LMEM available(14096M) fair(1 x 14096M) [ ] GT0: PF: VF1 provisioned with 14780727296 (13.8 GiB) LMEM and [ ] GT0: PF: LMEM available(14096M) fair(2 x 7048M) [ ] GT0: PF: VF1..VF2 provisioned with 7390363648 (6.88 GiB) LMEM Fixes: 1e32ffbc9dc8 ("drm/xe/sriov: support non-contig VRAM provisioning") Signed-off-by: Michal Wajdeczko Reviewed-by: Piotr Piórkowski Link: https://lore.kernel.org/r/20250910222439.32869-1-michal.wajdeczko@intel.com (cherry picked from commit 95c1cfa306087142989bff34ea0e05dcd95ddc58) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 494909f74eb22c..d84831a03610db 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -1632,7 +1632,6 @@ static u64 pf_estimate_fair_lmem(struct xe_gt *gt, unsigned int num_vfs) u64 fair; fair = div_u64(available, num_vfs); - fair = rounddown_pow_of_two(fair); /* XXX: ttm_vram_mgr & drm_buddy limitation */ fair = ALIGN_DOWN(fair, alignment); #ifdef MAX_FAIR_LMEM fair = min_t(u64, MAX_FAIR_LMEM, fair); From 796667c9dc94ef9431d4ac620021527b273080e1 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 9 Sep 2025 09:55:06 +0200 Subject: [PATCH 1740/3206] fs/proc/namespaces: make ns_entries const Global variables that are never modified should be "const" so that they live in the .rodata section instead of the .data section of the kernel, gaining the protection of the kernel's strict memory permissions
as described in Documentation/security/self-protection.rst Signed-off-by: Max Kellermann Signed-off-by: Christian Brauner --- fs/proc/namespaces.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 4403a2e20c165d..ea2b597fd92cdb 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -12,7 +12,7 @@ #include "internal.h" -static const struct proc_ns_operations *ns_entries[] = { +static const struct proc_ns_operations *const ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif @@ -117,7 +117,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry, static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { struct task_struct *task = get_proc_task(file_inode(file)); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; if (!task) return -ENOENT; @@ -151,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct task_struct *task = get_proc_task(dir); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; unsigned int len = dentry->d_name.len; struct dentry *res = ERR_PTR(-ENOENT); From af67f4c1cd0730810521ef4bc194c21ac3683d97 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 9 Sep 2025 09:54:58 +0200 Subject: [PATCH 1741/3206] fs: use the switch statement in init_special_inode() Similar to may_open(). No functional changes. Signed-off-by: Mateusz Guzik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index cc0f717a140d3e..7b81d4a101b8c7 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2518,21 +2518,28 @@ void __init inode_init(void) void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; - if (S_ISCHR(mode)) { + switch (inode->i_mode & S_IFMT) { + case S_IFCHR: inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; - } else if (S_ISBLK(mode)) { + break; + case S_IFBLK: if (IS_ENABLED(CONFIG_BLOCK)) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; - } else if (S_ISFIFO(mode)) + break; + case S_IFIFO: inode->i_fop = &pipefifo_fops; - else if (S_ISSOCK(mode)) - ; /* leave it no_open_fops */ - else + break; + case S_IFSOCK: + /* leave it no_open_fops */ + break; + default: printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" " inode %s:%lu\n", mode, inode->i_sb->s_id, inode->i_ino); + break; + } } EXPORT_SYMBOL(init_special_inode); From 84f1766bdba5a6394618a45b34a320f336b02d95 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 11 Sep 2025 11:49:08 +0200 Subject: [PATCH 1742/3206] initrd: Fix unused variable warning in rd_load_image() on s390 The local variables 'rotator' and 'rotate' (used for the progress indicator) aren't used on s390. Building the kernel with W=1 generates the following warning: init/do_mounts_rd.c:192:17: warning: variable 'rotate' set but not used [-Wunused-but-set-variable] 192 | unsigned short rotate = 0; | ^ 1 warning generated. Remove the preprocessor directives and use the IS_ENABLED(CONFIG_S390) macro instead, allowing the compiler to optimize away unused variables and avoid the warning on s390. 
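For background, IS_ENABLED() expands to a compile-time 0 or 1 (see include/linux/kconfig.h), so the branch is still parsed and its variables count as used, while the dead code is discarded by the optimizer; a minimal sketch using an arbitrary example option:

	#include <linux/kconfig.h>

	static int rotate_example(void)
	{
		int counter = 0;	/* always referenced: no "set but not used" */

		if (IS_ENABLED(CONFIG_PRINTK))	/* constant 0 or 1 */
			counter++;	/* eliminated at compile time when =n */
		return counter;
	}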
Signed-off-by: Thorsten Blum Signed-off-by: Christian Brauner --- init/do_mounts_rd.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index ac021ae6e6fa78..3cbdf43a42dff5 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -191,9 +191,7 @@ int __init rd_load_image(char *from) char *buf = NULL; unsigned short rotate = 0; decompress_fn decompressor = NULL; -#if !defined(CONFIG_S390) char rotator[4] = { '|' , '/' , '-' , '\\' }; -#endif out_file = filp_open("/dev/ram", O_RDWR, 0); if (IS_ERR(out_file)) @@ -255,12 +253,10 @@ int __init rd_load_image(char *from) } kernel_read(in_file, buf, BLOCK_SIZE, &in_pos); kernel_write(out_file, buf, BLOCK_SIZE, &out_pos); -#if !defined(CONFIG_S390) - if (!(i % 16)) { + if (!IS_ENABLED(CONFIG_S390) && !(i % 16)) { pr_cont("%c\b", rotator[rotate & 0x3]); rotate++; } -#endif } pr_cont("done.\n"); From ac1440de1ba39fbd68da7fbcc4b1885685aa69e1 Mon Sep 17 00:00:00 2001 From: Dishank Jogi Date: Mon, 28 Jul 2025 07:14:52 +0000 Subject: [PATCH 1743/3206] zorro: Remove extra whitespace in macro definitions Clean up the formatting of the MANUF() and PRODUCT() macro definitions in 'drivers/zorro/names.c' by removing unneeded spaces. No functional changes. Signed-off-by: Dishank Jogi Reviewed-by: Geert Uytterhoeven Link: https://patch.msgid.link/20250728071452.35171-1-jogidishank503@gmail.com Signed-off-by: Geert Uytterhoeven --- drivers/zorro/names.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/zorro/names.c b/drivers/zorro/names.c index 077114ccc84073..b44f90989a66c5 100644 --- a/drivers/zorro/names.c +++ b/drivers/zorro/names.c @@ -36,21 +36,21 @@ struct zorro_manuf_info { * real memory.. Parse the same file multiple times * to get all the info. */ -#define MANUF( manuf, name ) static char __manufstr_##manuf[] __initdata = name; +#define MANUF(manuf, name) static char __manufstr_##manuf[] __initdata = name; #define ENDMANUF() -#define PRODUCT( manuf, prod, name ) static char __prodstr_##manuf##prod[] __initdata = name; +#define PRODUCT(manuf, prod, name) static char __prodstr_##manuf##prod[] __initdata = name; #include "devlist.h" -#define MANUF( manuf, name ) static struct zorro_prod_info __prods_##manuf[] __initdata = { +#define MANUF(manuf, name) static struct zorro_prod_info __prods_##manuf[] __initdata = { #define ENDMANUF() }; -#define PRODUCT( manuf, prod, name ) { 0x##prod, 0, __prodstr_##manuf##prod }, +#define PRODUCT(manuf, prod, name) { 0x##prod, 0, __prodstr_##manuf##prod }, #include "devlist.h" static struct zorro_manuf_info __initdata zorro_manuf_list[] = { -#define MANUF( manuf, name ) { 0x##manuf, ARRAY_SIZE(__prods_##manuf), __manufstr_##manuf, __prods_##manuf }, +#define MANUF(manuf, name) { 0x##manuf, ARRAY_SIZE(__prods_##manuf), __manufstr_##manuf, __prods_##manuf }, #define ENDMANUF() -#define PRODUCT( manuf, prod, name ) +#define PRODUCT(manuf, prod, name) #include "devlist.h" }; From 6d5674090543b89aac0c177d67e5fb32ddc53804 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 10 Sep 2025 17:16:13 +0200 Subject: [PATCH 1744/3206] m68k: bitops: Fix find_*_bit() signatures The function signatures of the m68k-optimized implementations of the find_{first,next}_{,zero_}bit() helpers do not match the generic variants. Fix this by changing all non-pointer inputs and outputs to "unsigned long", and updating a few local variables. 
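For reference, the generic prototypes that the m68k overrides must now match (the word-based helpers as declared in include/linux/find.h in current kernels) take and return unsigned long throughout:

	unsigned long find_first_bit(const unsigned long *addr, unsigned long size);
	unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
				    unsigned long offset);
	unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size);
	unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
					 unsigned long offset);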
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509092305.ncd9mzaZ-lkp@intel.com/ Signed-off-by: Geert Uytterhoeven Acked-by: "Yury Norov (NVIDIA)" Link: https://patch.msgid.link/de6919554fbb4cd1427155c6bafbac8a9df822c8.1757517135.git.geert@linux-m68k.org --- arch/m68k/include/asm/bitops.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index 14c64a6f121762..50ec92651d5a5f 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -350,12 +350,12 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, #include #else -static inline int find_first_zero_bit(const unsigned long *vaddr, - unsigned size) +static inline unsigned long find_first_zero_bit(const unsigned long *vaddr, + unsigned long size) { const unsigned long *p = vaddr; - int res = 32; - unsigned int words; + unsigned long res = 32; + unsigned long words; unsigned long num; if (!size) @@ -376,8 +376,9 @@ static inline int find_first_zero_bit(const unsigned long *vaddr, } #define find_first_zero_bit find_first_zero_bit -static inline int find_next_zero_bit(const unsigned long *vaddr, int size, - int offset) +static inline unsigned long find_next_zero_bit(const unsigned long *vaddr, + unsigned long size, + unsigned long offset) { const unsigned long *p = vaddr + (offset >> 5); int bit = offset & 31UL, res; @@ -406,11 +407,12 @@ static inline int find_next_zero_bit(const unsigned long *vaddr, int size, } #define find_next_zero_bit find_next_zero_bit -static inline int find_first_bit(const unsigned long *vaddr, unsigned size) +static inline unsigned long find_first_bit(const unsigned long *vaddr, + unsigned long size) { const unsigned long *p = vaddr; - int res = 32; - unsigned int words; + unsigned long res = 32; + unsigned long words; unsigned long num; if (!size) @@ -431,8 +433,9 @@ static inline int find_first_bit(const unsigned long *vaddr, unsigned size) } #define find_first_bit find_first_bit -static inline int find_next_bit(const unsigned long *vaddr, int size, - int offset) +static inline unsigned long find_next_bit(const unsigned long *vaddr, + unsigned long size, + unsigned long offset) { const unsigned long *p = vaddr + (offset >> 5); int bit = offset & 31UL, res; From 695f2c0c25f3c844f9dd177d20c1e8afc55df1ac Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Aug 2025 11:30:06 +0200 Subject: [PATCH 1745/3206] m68k: defconfig: Update defconfigs for v6.17-rc1 - Enable Netfilter legacy tables support, - Drop CONFIG_IP_NF_FILTER=m, CONFIG_IP_NF_MANGLE=m, CONFIG_IP6_NF_FILTER=m, and CONFIG_IP6_NF_MANGLE=m (auto-modular since commit 9fce66583f06c212 ("netfilter: Exclude LEGACY TABLES on PREEMPT_RT.")), - Enable legacy EBTABLES support (no longer auto-selected since commit 9fce66583f06c212 ("netfilter: Exclude LEGACY TABLES on PREEMPT_RT.")), - Drop CONFIG_CDROM_PKTCDVD=m (removed in commit 1cea5180f2f812c4 ("block: remove pktcdvd driver")), - Move CONFIG_CRC_BENCHMARK=y (moved in commit 89a51591405e09a8 ("lib/crc: Move files into lib/crc/")). 
Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/012bc96a01eef989b39eedbe84591bd50c022e57.1754904412.git.geert@linux-m68k.org --- arch/m68k/configs/amiga_defconfig | 9 +++------ arch/m68k/configs/apollo_defconfig | 9 +++------ arch/m68k/configs/atari_defconfig | 9 +++------ arch/m68k/configs/bvme6000_defconfig | 9 +++------ arch/m68k/configs/hp300_defconfig | 9 +++------ arch/m68k/configs/mac_defconfig | 9 +++------ arch/m68k/configs/multi_defconfig | 9 +++------ arch/m68k/configs/mvme147_defconfig | 9 +++------ arch/m68k/configs/mvme16x_defconfig | 9 +++------ arch/m68k/configs/q40_defconfig | 9 +++------ arch/m68k/configs/sun3_defconfig | 9 +++------ arch/m68k/configs/sun3x_defconfig | 9 +++------ 12 files changed, 36 insertions(+), 72 deletions(-) diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 5171bb183967b9..44bbeb07b36a42 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -125,6 +125,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -206,14 +207,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -233,10 +232,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -245,6 +242,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -309,7 +307,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -605,6 +602,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -636,7 +634,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 16f343ae48c675..9068daac1b7c5a 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -121,6 +121,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -202,14 +203,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m 
CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -229,10 +228,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -241,6 +238,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -299,7 +297,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -562,6 +559,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -593,7 +591,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index c08788728ea962..3e824864f6d778 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -128,6 +128,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -209,14 +210,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -236,10 +235,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -248,6 +245,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -310,7 +308,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -582,6 +579,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -613,7 +611,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/bvme6000_defconfig 
b/arch/m68k/configs/bvme6000_defconfig index 962497e7c53fd6..9d032e4ce145ea 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -118,6 +118,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -199,14 +200,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -226,10 +225,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -238,6 +235,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -296,7 +294,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -554,6 +551,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -585,7 +583,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index ec28650189e406..94d0d74994f54d 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -120,6 +120,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -201,14 +202,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -228,10 +227,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -240,6 +237,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -298,7 +296,6 @@ CONFIG_BLK_DEV_LOOP=y 
CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -564,6 +561,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -595,7 +593,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 0afb3ad180dee3..f72c5374628d30 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -119,6 +119,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -200,14 +201,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -227,10 +226,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -239,6 +236,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -298,7 +296,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -581,6 +578,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -612,7 +610,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index b311e953995d6d..e008dabfa2d3f3 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -139,6 +139,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -220,14 +221,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m 
CONFIG_IP_NF_RAW=m @@ -247,10 +246,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -259,6 +256,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -327,7 +325,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -668,6 +665,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -699,7 +697,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index f4e6224f137f99..dd2b340336af19 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -117,6 +117,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -198,14 +199,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -225,10 +224,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -237,6 +234,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -295,7 +293,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -554,6 +551,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -585,7 +583,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 498e167222f18c..cf584e961b1d7d 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -118,6 
+118,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -199,14 +200,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -226,10 +225,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -238,6 +235,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -296,7 +294,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -555,6 +552,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -586,7 +584,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 8c6b1eef853423..081bcbc9eed0eb 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -119,6 +119,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -200,14 +201,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -227,10 +226,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -239,6 +236,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -300,7 +298,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -571,6 +568,7 @@ CONFIG_CRYPTO_USER_API_RNG=m 
CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -602,7 +600,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index c34648f299efb9..43a88b2c694d8c 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -114,6 +114,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -195,14 +196,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -222,10 +221,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -234,6 +231,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -292,7 +290,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -552,6 +549,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -582,7 +580,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 73810d14660f21..6190478f85bbf1 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -115,6 +115,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -196,14 +197,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -223,10 +222,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m 
CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -235,6 +232,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -293,7 +291,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -552,6 +549,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -583,7 +581,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m From cde560f98a9b6e64dd675f6bd10137cc8243a32a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 11 Sep 2025 08:56:41 +0200 Subject: [PATCH 1746/3206] fs: expand dump_inode() This adds the fs name and a few fields from struct inode: i_mode, i_opflags, i_flags, i_state and i_count. All values are printed raw, with no attempt to pretty-print anything. Compile tested on i386 and runtime tested on amd64. Sample output: [ 23.121281] VFS_WARN_ON_INODE("crap") encountered for inode ffff9a1a83ce3660 fs pipefs mode 10600 opflags 0x4 flags 0x0 state 0x38 count 0 Signed-off-by: Mateusz Guzik --- fs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 833de5457a065b..bf750376020665 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2935,10 +2935,18 @@ EXPORT_SYMBOL(mode_strip_sgid); * * TODO: add a proper inode dumping routine, this is a stub to get debug off the * ground. + * + * TODO: handle getting to fs type with get_kernel_nofault()? + * See dump_mapping() above. */ void dump_inode(struct inode *inode, const char *reason) { - pr_warn("%s encountered for inode %px", reason, inode); + struct super_block *sb = inode->i_sb; + + pr_warn("%s encountered for inode %px\n" "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", + reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, + inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); From e60625e7ce10f696690d9e36a9f32fc7c1897f79 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 12 Sep 2025 08:52:57 +0200 Subject: [PATCH 1747/3206] initramfs: Use struct_size() helper to improve dir_add() Use struct_size() to calculate the number of bytes to allocate for a new directory entry. No functional changes.
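For background, struct_size() (from include/linux/overflow.h) computes the same size as the open-coded expression, but saturates to SIZE_MAX if the arithmetic overflows, so kmalloc() fails cleanly instead of returning an undersized buffer; roughly:

	#include <linux/overflow.h>

	/* struct_size(de, name, nlen) evaluates to
	 *	sizeof(*de) + nlen * sizeof(de->name[0])
	 * where name is the trailing flexible array member (element size 1
	 * here), with the multiply and add checked for overflow.
	 */
	de = kmalloc(struct_size(de, name, nlen), GFP_KERNEL);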
Signed-off-by: Thorsten Blum Signed-off-by: Christian Brauner --- init/initramfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index 097673b97784db..c607f4d681ef69 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "do_mounts.h" #include "initramfs_internal.h" @@ -152,7 +153,7 @@ static void __init dir_add(const char *name, size_t nlen, time64_t mtime) { struct dir_entry *de; - de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); + de = kmalloc(struct_size(de, name, nlen), GFP_KERNEL); if (!de) panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); From beb022ef9263d0c5b1a994fa3ab5d3244a01aaa1 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 12 Sep 2025 09:46:52 +0200 Subject: [PATCH 1748/3206] initrd: Use str_plural() in rd_load_image() Add the local variable 'nr_disks' and replace the manual ternary "s" pluralization with the standardized str_plural() helper function. Use pr_notice() instead of printk(KERN_NOTICE) to silence a checkpatch warning. No functional changes intended. Signed-off-by: Thorsten Blum Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- init/do_mounts_rd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 3cbdf43a42dff5..19d9f33dcacf85 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "do_mounts.h" @@ -186,7 +187,7 @@ static unsigned long nr_blocks(struct file *file) int __init rd_load_image(char *from) { int res = 0; - unsigned long rd_blocks, devblocks; + unsigned long rd_blocks, devblocks, nr_disks; int nblocks, i; char *buf = NULL; unsigned short rotate = 0; @@ -242,8 +243,9 @@ int __init rd_load_image(char *from) goto done; } - printk(KERN_NOTICE "RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ", - nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : ""); + nr_disks = (nblocks - 1) / devblocks + 1; + pr_notice("RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ", + nblocks, nr_disks, str_plural(nr_disks)); for (i = 0; i < nblocks; i++) { if (i && (i % devblocks == 0)) { pr_cont("done disk #1.\n"); From afd77d2050c35aee0d51ab7fb5b36a0fcabd4eee Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 12 Sep 2025 08:47:24 +0200 Subject: [PATCH 1749/3206] initramfs: Replace strcpy() with strscpy() in find_link() strcpy() is deprecated; use strscpy() instead. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Thorsten Blum Reviewed-by: Justin Stitt Signed-off-by: Christian Brauner --- init/initramfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index c607f4d681ef69..6ddbfb17fb8f1b 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -109,7 +109,7 @@ static char __init *find_link(int major, int minor, int ino, q->minor = minor; q->ino = ino; q->mode = mode; - strcpy(q->name, name); + strscpy(q->name, name); q->next = NULL; *p = q; hardlink_seen = true; From 74792608606a525a0e0df7e8d48acd8000561389 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 15 Sep 2025 09:11:05 +0200 Subject: [PATCH 1750/3206] init: INITRAMFS_PRESERVE_MTIME should depend on BLK_DEV_INITRD INITRAMFS_PRESERVE_MTIME is only used in init/initramfs.c and init/initramfs_test.c. 
Hence add a dependency on BLK_DEV_INITRD, to prevent asking the user about this feature when configuring a kernel without initramfs support.
Fixes: 1274aea127b2e8c9 ("initramfs: add INITRAMFS_PRESERVE_MTIME Kconfig option") Signed-off-by: Geert Uytterhoeven Reviewed-by: Martin Wilck Reviewed-by: David Disseldorp Signed-off-by: Christian Brauner --- init/Kconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/init/Kconfig b/init/Kconfig index 83632025121937..096726e9c95c58 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1497,6 +1497,7 @@ config BOOT_CONFIG_EMBED_FILE config INITRAMFS_PRESERVE_MTIME bool "Preserve cpio archive mtimes in initramfs" + depends on BLK_DEV_INITRD default y help Each entry in an initramfs cpio archive carries an mtime value. When
From 54fd6bd42e7bd351802ff1d193a2e33e4bfb1836 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 8 Sep 2025 17:26:45 +0530 Subject: [PATCH 1751/3206] cdx: Split mcdi.h and reorganize headers
Move bitfield.h and mcdi.h from the CDX controller directory to include/linux/cdx to make them accessible to other drivers. As part of this refactoring, split mcdi.h into two headers: - mcdi.h: retains interface-level declarations - mcdid.h: contains internal definitions and macros
This is in preparation for the VersalNET EDAC driver, which relies on them.
Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Acked-by: Nikhil Agarwal Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com --- drivers/cdx/controller/cdx_controller.c | 2 +- drivers/cdx/controller/cdx_rpmsg.c | 2 +- drivers/cdx/controller/mcdi.c | 5 +- drivers/cdx/controller/mcdi_functions.c | 1 - drivers/cdx/controller/mcdi_functions.h | 3 +- drivers/cdx/controller/mcdid.h | 63 +++++++++++++++++++ .../linux/cdx}/bitfield.h | 0 .../controller => include/linux/cdx}/mcdi.h | 52 +-------------- 8 files changed, 71 insertions(+), 57 deletions(-) create mode 100644 drivers/cdx/controller/mcdid.h rename {drivers/cdx/controller => include/linux/cdx}/bitfield.h (100%) rename {drivers/cdx/controller => include/linux/cdx}/mcdi.h (74%)
diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c index fca83141e3e66e..3f8b9041babf5b 100644 --- a/drivers/cdx/controller/cdx_controller.c +++ b/drivers/cdx/controller/cdx_controller.c @@ -14,7 +14,7 @@ #include "cdx_controller.h" #include "../cdx.h" #include "mcdi_functions.h" -#include "mcdi.h" +#include "mcdid.h" static unsigned int cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) {
diff --git a/drivers/cdx/controller/cdx_rpmsg.c b/drivers/cdx/controller/cdx_rpmsg.c index 04b578a0be17c2..d4f763323aacca 100644 --- a/drivers/cdx/controller/cdx_rpmsg.c +++ b/drivers/cdx/controller/cdx_rpmsg.c @@ -15,7 +15,7 @@ #include "../cdx.h" #include "cdx_controller.h" #include "mcdi_functions.h" -#include "mcdi.h" +#include "mcdid.h" static struct rpmsg_device_id cdx_rpmsg_id_table[] = { { .name = "mcdi_ipc" },
diff --git a/drivers/cdx/controller/mcdi.c b/drivers/cdx/controller/mcdi.c index e760f8d347cc19..90bf9f7c257b71 100644 --- a/drivers/cdx/controller/mcdi.c +++ b/drivers/cdx/controller/mcdi.c @@ -23,9 +23,10 @@ #include #include #include +#include -#include "bitfield.h" -#include "mcdi.h" +#include +#include "mcdid.h" static void cdx_mcdi_cancel_cmd(struct cdx_mcdi *cdx, struct cdx_mcdi_cmd *cmd); static void cdx_mcdi_wait_for_cleanup(struct cdx_mcdi *cdx);
diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c index
885c69e6ebe5b6..8ae2d99be81e5b 100644 --- a/drivers/cdx/controller/mcdi_functions.c +++ b/drivers/cdx/controller/mcdi_functions.c @@ -5,7 +5,6 @@ #include -#include "mcdi.h" #include "mcdi_functions.h" int cdx_mcdi_get_num_buses(struct cdx_mcdi *cdx) diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h index b9942affdc6b2d..57fd1bae706b96 100644 --- a/drivers/cdx/controller/mcdi_functions.h +++ b/drivers/cdx/controller/mcdi_functions.h @@ -8,7 +8,8 @@ #ifndef CDX_MCDI_FUNCTIONS_H #define CDX_MCDI_FUNCTIONS_H -#include "mcdi.h" +#include +#include "mcdid.h" #include "../cdx.h" /** diff --git a/drivers/cdx/controller/mcdid.h b/drivers/cdx/controller/mcdid.h new file mode 100644 index 00000000000000..7fc29f099265e7 --- /dev/null +++ b/drivers/cdx/controller/mcdid.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2008-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2025, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_MCDID_H +#define CDX_MCDID_H + +#include +#include +#include + +#include "mc_cdx_pcol.h" + +#ifdef DEBUG +#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x) +#define CDX_WARN_ON_PARANOID(x) WARN_ON(x) +#else +#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0) +#define CDX_WARN_ON_PARANOID(x) do {} while (0) +#endif + +#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX) + +static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx) +{ + return cdx->mcdi ? &cdx->mcdi->iface : NULL; +} + +int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd, + const struct cdx_dword *inbuf, size_t inlen, + cdx_mcdi_async_completer *complete, + unsigned long cookie); +int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, + unsigned int timeout_jiffies); + +/* + * We expect that 16- and 32-bit fields in MCDI requests and responses + * are appropriately aligned, but 64-bit fields are only + * 32-bit-aligned. 
+ */ +#define MCDI_BYTE(_buf, _field) \ + ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \ + *MCDI_PTR(_buf, _field)) +#define MCDI_WORD(_buf, _field) \ + ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \ + le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field))) +#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \ + CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \ + MC_CMD_ ## _name1, _value1) +#define MCDI_SET_QWORD(_buf, _field, _value) \ + do { \ + CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \ + CDX_DWORD, (u32)(_value)); \ + CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \ + CDX_DWORD, (u64)(_value) >> 32); \ + } while (0) +#define MCDI_QWORD(_buf, _field) \ + (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \ + (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32) + +#endif /* CDX_MCDID_H */ diff --git a/drivers/cdx/controller/bitfield.h b/include/linux/cdx/bitfield.h similarity index 100% rename from drivers/cdx/controller/bitfield.h rename to include/linux/cdx/bitfield.h diff --git a/drivers/cdx/controller/mcdi.h b/include/linux/cdx/mcdi.h similarity index 74% rename from drivers/cdx/controller/mcdi.h rename to include/linux/cdx/mcdi.h index 54a65e9760aeee..46e3f63b062a37 100644 --- a/drivers/cdx/controller/mcdi.h +++ b/include/linux/cdx/mcdi.h @@ -11,16 +11,7 @@ #include #include -#include "bitfield.h" -#include "mc_cdx_pcol.h" - -#ifdef DEBUG -#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x) -#define CDX_WARN_ON_PARANOID(x) WARN_ON(x) -#else -#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0) -#define CDX_WARN_ON_PARANOID(x) do {} while (0) -#endif +#include "linux/cdx/bitfield.h" /** * enum cdx_mcdi_mode - MCDI transaction mode @@ -36,8 +27,6 @@ enum cdx_mcdi_mode { #define MCDI_RPC_LONG_TIMEOU (60 * HZ) #define MCDI_RPC_POST_RST_TIME (10 * HZ) -#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX) - /** * enum cdx_mcdi_cmd_state - State for an individual MCDI command * @MCDI_STATE_QUEUED: Command not started and is waiting to run. @@ -180,25 +169,6 @@ struct cdx_mcdi_data { u32 fn_flags; }; -static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx) -{ - return cdx->mcdi ? 
&cdx->mcdi->iface : NULL; -} - -int cdx_mcdi_init(struct cdx_mcdi *cdx); -void cdx_mcdi_finish(struct cdx_mcdi *cdx); - -void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len); -int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, - const struct cdx_dword *inbuf, size_t inlen, - struct cdx_dword *outbuf, size_t outlen, size_t *outlen_actual); -int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd, - const struct cdx_dword *inbuf, size_t inlen, - cdx_mcdi_async_completer *complete, - unsigned long cookie); -int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, - unsigned int timeout_jiffies); - /* * We expect that 16- and 32-bit fields in MCDI requests and responses * are appropriately aligned, but 64-bit fields are only @@ -215,28 +185,8 @@ int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, #define _MCDI_DWORD(_buf, _field) \ ((_buf) + (_MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, 4) >> 2)) -#define MCDI_BYTE(_buf, _field) \ - ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \ - *MCDI_PTR(_buf, _field)) -#define MCDI_WORD(_buf, _field) \ - ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \ - le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field))) #define MCDI_SET_DWORD(_buf, _field, _value) \ CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), CDX_DWORD, _value) #define MCDI_DWORD(_buf, _field) \ CDX_DWORD_FIELD(*_MCDI_DWORD(_buf, _field), CDX_DWORD) -#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \ - CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \ - MC_CMD_ ## _name1, _value1) -#define MCDI_SET_QWORD(_buf, _field, _value) \ - do { \ - CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \ - CDX_DWORD, (u32)(_value)); \ - CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \ - CDX_DWORD, (u64)(_value) >> 32); \ - } while (0) -#define MCDI_QWORD(_buf, _field) \ - (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \ - (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32) - #endif /* CDX_MCDI_H */
From f99b3917789d83ea89b24b722d784956f8289f45 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 15 Sep 2025 14:57:29 +0200 Subject: [PATCH 1752/3206] fs: rename generic_delete_inode() and generic_drop_inode()
generic_delete_inode() is rather misleading for what the routine is doing. inode_just_drop() should be much clearer.
The new naming is inconsistent with generic_drop_inode(), so rename that one as well with inode_ as the prefix.
No functional changes.
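As a hedged illustration of the resulting semantics (a sketch derived from the definitions touched below, not code taken from this patch): inode_just_drop() makes the final iput() always evict the inode, while inode_generic_drop() keeps cached inodes alive unless they are unlinked or unhashed.

/* A filesystem that never wants inodes cached opts in explicitly: */
static const struct super_operations nocache_sops = {
	.statfs     = simple_statfs,
	.drop_inode = inode_just_drop,	/* always evict on final iput() */
};

Leaving .drop_inode NULL gets the default behaviour, equivalent to inode_generic_drop(), i.e. evict only when !inode->i_nlink || inode_unhashed(inode).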
Signed-off-by: Mateusz Guzik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 4 ++-- Documentation/filesystems/vfs.rst | 4 ++-- block/bdev.c | 2 +- drivers/dax/super.c | 2 +- drivers/misc/ibmasm/ibmasmfs.c | 2 +- drivers/usb/gadget/function/f_fs.c | 2 +- drivers/usb/gadget/legacy/inode.c | 2 +- fs/9p/vfs_super.c | 2 +- fs/afs/inode.c | 4 ++-- fs/btrfs/inode.c | 2 +- fs/ceph/super.c | 2 +- fs/configfs/mount.c | 2 +- fs/efivarfs/super.c | 2 +- fs/ext4/super.c | 2 +- fs/f2fs/super.c | 2 +- fs/fuse/inode.c | 2 +- fs/gfs2/super.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/inode.c | 6 +++--- fs/kernfs/mount.c | 2 +- fs/nfs/inode.c | 2 +- fs/ocfs2/dlmfs/dlmfs.c | 2 +- fs/orangefs/super.c | 2 +- fs/overlayfs/super.c | 2 +- fs/pidfs.c | 2 +- fs/proc/inode.c | 2 +- fs/pstore/inode.c | 2 +- fs/ramfs/inode.c | 2 +- fs/smb/client/cifsfs.c | 2 +- fs/ubifs/super.c | 2 +- fs/xfs/xfs_super.c | 2 +- include/linux/fs.h | 4 ++-- kernel/bpf/inode.c | 2 +- mm/shmem.c | 2 +- 34 files changed, 40 insertions(+), 40 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 85f590254f0750..b5db45c0094cac 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -340,8 +340,8 @@ of those. Caller makes sure async writeback cannot be running for the inode whil ->drop_inode() returns int now; it's called on final iput() with inode->i_lock held and it returns true if filesystems wants the inode to be -dropped. As before, generic_drop_inode() is still the default and it's been -updated appropriately. generic_delete_inode() is also alive and it consists +dropped. As before, inode_generic_drop() is still the default and it's been +updated appropriately. inode_just_drop() is also alive and it consists simply of return 1. Note that all actual eviction work is done by caller after ->drop_inode() returns. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 486a9163347478..7a314eee6305fd 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -327,11 +327,11 @@ or bottom half). inode->i_lock spinlock held. This method should be either NULL (normal UNIX filesystem - semantics) or "generic_delete_inode" (for filesystems that do + semantics) or "inode_just_drop" (for filesystems that do not want to cache inodes - causing "delete_inode" to always be called regardless of the value of i_nlink) - The "generic_delete_inode()" behavior is equivalent to the old + The "inode_just_drop()" behavior is equivalent to the old practice of using "force_delete" in the put_inode() case, but does not have the races that the "force_delete()" approach had. 
diff --git a/block/bdev.c b/block/bdev.c index b77ddd12dc0634..810707cca9703e 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -412,7 +412,7 @@ static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = bdev_evict_inode, }; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 54c480e874cb30..d7714d8afb0fa0 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -388,7 +388,7 @@ static const struct super_operations dax_sops = { .alloc_inode = dax_alloc_inode, .destroy_inode = dax_destroy_inode, .free_inode = dax_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static int dax_init_fs_context(struct fs_context *fc) diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index c44de892a61ec4..5372ed2a363ecd 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -94,7 +94,7 @@ static int ibmasmfs_init_fs_context(struct fs_context *fc) static const struct super_operations ibmasmfs_s_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct file_operations *ibmasmfs_dir_ops = &simple_dir_operations; diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 08a251df20c438..5246fa6af3d61c 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1891,7 +1891,7 @@ static struct dentry *ffs_sb_create_file(struct super_block *sb, /* Super block */ static const struct super_operations ffs_sb_operations = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; struct ffs_sb_fill_data { diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index b51e132b0cd2a7..13c3da49348c59 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -2011,7 +2011,7 @@ gadgetfs_create_file (struct super_block *sb, char const *name, static const struct super_operations gadget_fs_operations = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static int diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 795c6388744cdb..1581ebac5bb423 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) - return generic_drop_inode(inode); + return inode_generic_drop(inode); /* * in case of non cached mode always drop the * inode because we want the inode attribute diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e9538e91f8484d..e1cb17b8579139 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode) _enter(""); if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) - return generic_delete_inode(inode); + return inode_just_drop(inode); else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b77dd22b8cdbe3..2081654dea5c65 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7953,7 +7953,7 @@ int btrfs_drop_inode(struct inode *inode) if (btrfs_root_refs(&root->root_item) == 0) return 1; else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } static void init_once(void *foo) diff 
--git a/fs/ceph/super.c b/fs/ceph/super.c index c3eb651862c555..70dc9467f6a0ec 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 740f18b60c9d0d..456c4a2efb5329 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode) static const struct super_operations configfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .free_inode = configfs_free_inode, }; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index c4a13991135689..6f131910be9dd7 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb); static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .alloc_inode = efivarfs_alloc_inode, .free_inode = efivarfs_free_inode, .show_options = efivarfs_show_options, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7d39da7e733b1..848e059d58cf1f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) static int ext4_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e16c4e2830c298..63cf73409da69a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1768,7 +1768,7 @@ static int f2fs_drop_inode(struct inode *inode) trace_f2fs_drop_inode(inode, 0); return 0; } - ret = generic_drop_inode(inode); + ret = inode_generic_drop(inode); if (!ret) ret = fscrypt_drop_inode(inode); trace_f2fs_drop_inode(inode, ret); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ecb869e895ab1d..63390575b6fd40 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1209,7 +1209,7 @@ static const struct super_operations fuse_super_operations = { .free_inode = fuse_free_inode, .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b42e2110084b6c..644b2d1e72769e 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(SDF_EVICTING, &sdp->sd_flags)) return 1; - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /** diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 01e516175bcd72..1e1acf5775ab5f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations hostfs_sbops = { .alloc_inode = hostfs_alloc_inode, .free_inode = hostfs_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = hostfs_evict_inode, .statfs = hostfs_statfs, .show_options = hostfs_show_options, diff --git a/fs/inode.c b/fs/inode.c 
index 7b81d4a101b8c7..a66c02123bca60 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1838,11 +1838,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, EXPORT_SYMBOL(insert_inode_locked4); -int generic_delete_inode(struct inode *inode) +int inode_just_drop(struct inode *inode) { return 1; } -EXPORT_SYMBOL(generic_delete_inode); +EXPORT_SYMBOL(inode_just_drop); /* * Called when we're dropping the last reference @@ -1866,7 +1866,7 @@ static void iput_final(struct inode *inode) if (op->drop_inode) drop = op->drop_inode(inode); else - drop = generic_drop_inode(inode); + drop = inode_generic_drop(inode); if (!drop && !(inode->i_state & I_DONTCACHE) && diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e384a69fbece7b..76eaf64b9d9e08 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) const struct super_operations kernfs_sops = { .statfs = kernfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae42308..cbd8a7f9c61704 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid) int nfs_drop_inode(struct inode *inode) { - return NFS_STALE(inode) || generic_drop_inode(inode); + return NFS_STALE(inode) || inode_generic_drop(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 5130ec44e5e158..807e2b758a5ce1 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = { .alloc_inode = dlmfs_alloc_inode, .free_inode = dlmfs_free_inode, .evict_inode = dlmfs_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct inode_operations dlmfs_file_inode_operations = { diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index f3da840758e742..b46100a4f52935 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = { .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .write_inode = orangefs_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .statfs = orangefs_statfs, .show_options = orangefs_show_options, }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index df85a76597e910..bd3d7ba8fb95b5 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -280,7 +280,7 @@ static const struct super_operations ovl_super_operations = { .alloc_inode = ovl_alloc_inode, .free_inode = ovl_free_inode, .destroy_inode = ovl_destroy_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = ovl_put_super, .sync_fs = ovl_sync_fs, .statfs = ovl_statfs, diff --git a/fs/pidfs.c b/fs/pidfs.c index edc35522d75c49..3865272178a863 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode) } static const struct super_operations pidfs_sops = { - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 129490151be147..d9b7ef1223437d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) const struct 
super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1a2e1185426ca1..b4e55c90f8dc23 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc) static const struct super_operations pstore_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pstore_evict_inode, .show_options = pstore_show_options, }; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index f8874c3b8c1e95..41f9995da7cab0 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations ramfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = ramfs_show_options, }; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 3bd85ab2deb192..b4075385a6493c 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode) /* no serverino => unconditional eviction */ return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || - generic_drop_inode(inode); + inode_generic_drop(inode); } static const struct super_operations cifs_super_ops = { diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b20686085e..733fd1e5a9a2de 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) static int ubifs_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a770d..a05ff68748dc09 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -778,7 +778,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode); + return inode_generic_drop(inode); } STATIC void diff --git a/include/linux/fs.h b/include/linux/fs.h index 4daf9b30a64117..724b9af67f35a5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3312,8 +3312,8 @@ extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); -extern int generic_delete_inode(struct inode *inode); -static inline int generic_drop_inode(struct inode *inode) +extern int inode_just_drop(struct inode *inode); +static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392ae..6d021d18afa6d6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode) const struct super_operations bpf_super_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = bpf_show_options, .free_inode = bpf_free_inode, }; diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b6e..932727247c64b2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5341,7 +5341,7 @@ static const struct super_operations shmem_ops = { .get_dquots = 
shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count,
From 8b0d03129b6165bbf8c9494897489c6da6fadd58 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 8 Sep 2025 17:26:46 +0530 Subject: [PATCH 1753/3206] cdx: Export Symbols for MCDI RPC and Initialization
The cdx_mcdi_init(), cdx_mcdi_process_cmd(), and cdx_mcdi_rpc() functions, which interact with the MCDI (Management Controller Direct Interface) framework, are needed by the VersalNET EDAC module. These functions facilitate communication between different hardware components by enabling command execution and status management.
Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Acked-by: Nikhil Agarwal Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com --- drivers/cdx/controller/mcdi.c | 38 +++++++++++++++++++++++++++++++++++ include/linux/cdx/mcdi.h | 7 +++++++ 2 files changed, 45 insertions(+)
diff --git a/drivers/cdx/controller/mcdi.c b/drivers/cdx/controller/mcdi.c index 90bf9f7c257b71..2e82ffc18d89c5 100644 --- a/drivers/cdx/controller/mcdi.c +++ b/drivers/cdx/controller/mcdi.c @@ -100,6 +100,19 @@ static unsigned long cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd return cdx->mcdi_ops->mcdi_rpc_timeout(cdx, cmd); } +/** + * cdx_mcdi_init - Initialize MCDI (Management Controller Driver Interface) state + * @cdx: Handle to the CDX MCDI structure + * + * This function allocates and initializes internal MCDI structures and resources + * for the CDX device, including the workqueue, locking primitives, and command + * tracking mechanisms. It sets the initial operating mode and prepares the device + * for MCDI operations. + * + * Return: + * * 0 - on success + * * -ENOMEM - if memory allocation or workqueue creation fails + */ int cdx_mcdi_init(struct cdx_mcdi *cdx) { struct cdx_mcdi_iface *mcdi; @@ -129,7 +142,16 @@ int cdx_mcdi_init(struct cdx_mcdi *cdx) fail: return rc; } +EXPORT_SYMBOL_GPL(cdx_mcdi_init); +/** + * cdx_mcdi_finish - Cleanup MCDI (Management Controller Driver Interface) state + * @cdx: Handle to the CDX MCDI structure + * + * This function is responsible for cleaning up the MCDI (Management Controller Driver Interface) + * resources associated with a cdx_mcdi structure. Also destroys the mcdi workqueue. + * + */ void cdx_mcdi_finish(struct cdx_mcdi *cdx) { struct cdx_mcdi_iface *mcdi; @@ -144,6 +166,7 @@ void cdx_mcdi_finish(struct cdx_mcdi *cdx) kfree(cdx->mcdi); cdx->mcdi = NULL; } +EXPORT_SYMBOL_GPL(cdx_mcdi_finish); static bool cdx_mcdi_flushed(struct cdx_mcdi_iface *mcdi, bool ignore_cleanups) { @@ -554,6 +577,19 @@ static void cdx_mcdi_start_or_queue(struct cdx_mcdi_iface *mcdi, cdx_mcdi_cmd_start_or_queue(mcdi, cmd); } +/** + * cdx_mcdi_process_cmd - Process an incoming MCDI response + * @cdx: Handle to the CDX MCDI structure + * @outbuf: Pointer to the response buffer received from the management controller + * @len: Length of the response buffer in bytes + * + * This function handles a response from the management controller. It locates the + * corresponding command using the sequence number embedded in the header, + * completes the command if it is still pending, and initiates any necessary cleanup. + * + * The function assumes that the response buffer is well-formed and at least one + * dword in size.
+ */ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len) { struct cdx_mcdi_iface *mcdi; @@ -591,6 +627,7 @@ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int le cdx_mcdi_process_cleanup_list(mcdi->cdx, &cleanup_list); } +EXPORT_SYMBOL_GPL(cdx_mcdi_process_cmd); static void cdx_mcdi_cmd_work(struct work_struct *context) { @@ -758,6 +795,7 @@ int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, return cdx_mcdi_rpc_sync(cdx, cmd, inbuf, inlen, outbuf, outlen, outlen_actual, false); } +EXPORT_SYMBOL_GPL(cdx_mcdi_rpc); /** * cdx_mcdi_rpc_async - Schedule an MCDI command to run asynchronously diff --git a/include/linux/cdx/mcdi.h b/include/linux/cdx/mcdi.h index 46e3f63b062a37..74075305cba48c 100644 --- a/include/linux/cdx/mcdi.h +++ b/include/linux/cdx/mcdi.h @@ -169,6 +169,13 @@ struct cdx_mcdi_data { u32 fn_flags; }; +void cdx_mcdi_finish(struct cdx_mcdi *cdx); +int cdx_mcdi_init(struct cdx_mcdi *cdx); +void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len); +int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, + const struct cdx_dword *inbuf, size_t inlen, + struct cdx_dword *outbuf, size_t outlen, size_t *outlen_actual); + /* * We expect that 16- and 32-bit fields in MCDI requests and responses * are appropriately aligned, but 64-bit fields are only From 36e74c95638296b1f272d9d011d435ce62d8f11c Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 8 Sep 2025 17:26:47 +0530 Subject: [PATCH 1754/3206] RAS: Export log_non_standard_event() to drivers The function log_non_standard_event() is responsible for logging platform-specific or vendor-defined RAS (Reliability, Availability, and Serviceability) events. Currently, this function is only available within the RAS subsystem, preventing external modules from leveraging its capabilities. Export it to drivers to log non-standard RAS events via EDAC. Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com --- drivers/ras/ras.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index a6e4792a1b2e92..ac0e132ccc3eb9 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -51,6 +51,7 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, { trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len); } +EXPORT_SYMBOL_GPL(log_non_standard_event); void log_arm_hw_error(struct cper_sec_proc_arm *err) { From 8d978222e87c5ea50f912141b18421882f89492a Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 8 Sep 2025 17:26:48 +0530 Subject: [PATCH 1755/3206] dt-bindings: memory-controllers: Add support for Versal NET EDAC Add device tree bindings for AMD Versal NET EDAC for DDR controller. 
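A driver consuming this binding would typically resolve the amd,rproc phandle at probe time. A hedged sketch of that lookup (hypothetical helper name, not code from the driver added later in this series):

#include <linux/of.h>
#include <linux/platform_device.h>

static int demo_get_rproc_node(struct platform_device *pdev)
{
	struct device_node *rproc_np;

	/* "amd,rproc" points at the remoteproc node that runs the RPU firmware */
	rproc_np = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0);
	if (!rproc_np)
		return -EINVAL;

	/* ... use the node to locate the rproc and set up messaging, then: */
	of_node_put(rproc_np);
	return 0;
}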
Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com --- .../xlnx,versal-net-ddrmc5.yaml | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml
diff --git a/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml new file mode 100644 index 00000000000000..479288567d0b0c --- /dev/null +++ b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/memory-controllers/xlnx,versal-net-ddrmc5.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Xilinx Versal NET Memory Controller + +maintainers: + - Shubhrajyoti Datta + +description: + The integrated DDR Memory Controllers (DDRMCs) support both DDR5 and LPDDR5 + compact and extended memory interfaces. Versal NET DDR memory controller + has an optional ECC support which correct single bit ECC errors and detect + double bit ECC errors. It also has support for reporting other errors like + MMCM (Mixed-Mode Clock Manager) errors and General software errors. + +properties: + compatible: + const: xlnx,versal-net-ddrmc5 + + amd,rproc: + $ref: /schemas/types.yaml#/definitions/phandle + description: + phandle to the remoteproc_r5 rproc node using which APU interacts + with remote processor. APU primarily communicates with the RPU for + accessing the DDRMC address space and getting error notification. + +required: + - compatible + - amd,rproc + +additionalProperties: false + +examples: + - | + memory-controller { + compatible = "xlnx,versal-net-ddrmc5"; + amd,rproc = <&remoteproc_r5>; + };
From d5fe2fec6c40dda03df8cc9b4a97de0b7e39f984 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 8 Sep 2025 17:26:49 +0530 Subject: [PATCH 1756/3206] EDAC: Add a driver for the AMD Versal NET DDR controller
Add a driver for the AMD Versal NET DDR memory controller, which supports single bit error correction and double bit error detection, and reports other system errors from various IP subsystems (e.g., RPU, NOCs, HNICX, PL).
The driver listens for notifications from the NMC (Network Management Controller) using RPMsg (Remote Processor Messaging). The RPMsg channel used for this communication is named "error_edac". Upon receipt of a notification, the driver sends a RAS event trace.
[ bp: - Fixup title - Rewrite commit message - Fixup Kconfig text - Zap unused defines and align them - Simplify rpmsg_cb() considerably - Drop silly double-brackets in conditionals - Use proper void * type in mcdi_request() - Do not clear chinfo in rpmsg_probe() unnecessarily - Fix indentation - Do a proper err unwind path in init_versalnet() - Redo the error unwind path in mc_probe() properly - Fix the ordering in mc_remove() ] Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com Link: https://lore.kernel.org/r/20250703173105.GLaGa-WQCESDNsqygm@fat_crate.local --- MAINTAINERS | 7 + drivers/edac/Kconfig | 8 + drivers/edac/Makefile | 1 + drivers/edac/versalnet_edac.c | 958 ++++++++++++++++++++++++++++++ include/linux/cdx/edac_cdx_pcol.h | 28 + 5 files changed, 1002 insertions(+) create mode 100644 drivers/edac/versalnet_edac.c create mode 100644 include/linux/cdx/edac_cdx_pcol.h diff --git a/MAINTAINERS b/MAINTAINERS index ac20587a3e987a..23254e89a1f887 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27631,6 +27631,13 @@ S: Maintained F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml F: drivers/edac/versal_edac.c +XILINX VERSALNET EDAC DRIVER +M: Shubhrajyoti Datta +S: Maintained +F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml +F: drivers/edac/versalnet_edac.c +F: include/linux/cdx/edac_cdx_pcol.h + XILINX WATCHDOG DRIVER M: Srinivas Neeli R: Shubhrajyoti Datta diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index b824472208c48a..39352b9b7a7e45 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -584,4 +584,12 @@ config EDAC_CORTEX_A72 The detected and reported errors are from reading CPU/L2 memory error syndrome registers. +config EDAC_VERSALNET + tristate "AMD VersalNET DDR Controller" + depends on CDX_CONTROLLER && ARCH_ZYNQMP + help + Support for single bit error correction, double bit error detection + and other system errors from various IP subsystems like RPU, NOCs, + HNICX, PL on the AMD Versal NET DDR memory controller. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index c1736f264320cc..1c14796410a3e2 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -88,4 +88,5 @@ obj-$(CONFIG_EDAC_NPCM) += npcm_edac.o obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o obj-$(CONFIG_EDAC_VERSAL) += versal_edac.o obj-$(CONFIG_EDAC_LOONGSON) += loongson_edac.o +obj-$(CONFIG_EDAC_VERSALNET) += versalnet_edac.o obj-$(CONFIG_EDAC_CORTEX_A72) += a72_edac.o diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c new file mode 100644 index 00000000000000..66714fffa59182 --- /dev/null +++ b/drivers/edac/versalnet_edac.c @@ -0,0 +1,958 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Versal NET memory controller driver + * Copyright (C) 2025 Advanced Micro Devices, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "edac_module.h" + +/* Granularity of reported error in bytes */ +#define MC5_ERR_GRAIN 1 +#define MC_GET_DDR_CONFIG_IN_LEN 4 + +#define MC5_IRQ_CE_MASK GENMASK(18, 15) +#define MC5_IRQ_UE_MASK GENMASK(14, 11) + +#define MC5_RANK_1_MASK GENMASK(11, 6) +#define MASK_24 GENMASK(29, 24) +#define MASK_0 GENMASK(5, 0) + +#define MC5_LRANK_1_MASK GENMASK(11, 6) +#define MC5_LRANK_2_MASK GENMASK(17, 12) +#define MC5_BANK1_MASK GENMASK(11, 6) +#define MC5_GRP_0_MASK GENMASK(17, 12) +#define MC5_GRP_1_MASK GENMASK(23, 18) + +#define MC5_REGHI_ROW 7 +#define MC5_EACHBIT 1 +#define MC5_ERR_TYPE_CE 0 +#define MC5_ERR_TYPE_UE 1 +#define MC5_HIGH_MEM_EN BIT(20) +#define MC5_MEM_MASK GENMASK(19, 0) +#define MC5_X16_BASE 256 +#define MC5_X16_ECC 32 +#define MC5_X16_SIZE (MC5_X16_BASE + MC5_X16_ECC) +#define MC5_X32_SIZE 576 +#define MC5_HIMEM_BASE (256 * SZ_1M) +#define MC5_ILC_HIMEM_EN BIT(28) +#define MC5_ILC_MEM GENMASK(27, 0) +#define MC5_INTERLEAVE_SEL GENMASK(3, 0) +#define MC5_BUS_WIDTH_MASK GENMASK(19, 18) +#define MC5_NUM_CHANS_MASK BIT(17) +#define MC5_RANK_MASK GENMASK(15, 14) + +#define ERROR_LEVEL 2 +#define ERROR_ID 3 +#define TOTAL_ERR_LENGTH 5 +#define MSG_ERR_OFFSET 8 +#define MSG_ERR_LENGTH 9 +#define ERROR_DATA 10 +#define MCDI_RESPONSE 0xFF + +#define REG_MAX 152 +#define ADEC_MAX 152 +#define NUM_CONTROLLERS 8 +#define REGS_PER_CONTROLLER 19 +#define ADEC_NUM 19 +#define BUFFER_SZ 80 + +#define XDDR5_BUS_WIDTH_64 0 +#define XDDR5_BUS_WIDTH_32 1 +#define XDDR5_BUS_WIDTH_16 2 + +/** + * struct ecc_error_info - ECC error log information. + * @burstpos: Burst position. + * @lrank: Logical Rank number. + * @rank: Rank number. + * @group: Group number. + * @bank: Bank number. + * @col: Column number. + * @row: Row number. + * @rowhi: Row number higher bits. + * @i: Combined ECC error vector containing encoded values of burst position, + * rank, bank, column, and row information. + */ +union ecc_error_info { + struct { + u32 burstpos:3; + u32 lrank:4; + u32 rank:2; + u32 group:3; + u32 bank:2; + u32 col:11; + u32 row:7; + u32 rowhi; + }; + u64 i; +} __packed; + +/* Row and column bit positions in the address decoder (ADEC) registers. */ +union row_col_mapping { + struct { + u32 row0:6; + u32 row1:6; + u32 row2:6; + u32 row3:6; + u32 row4:6; + u32 reserved:2; + }; + struct { + u32 col1:6; + u32 col2:6; + u32 col3:6; + u32 col4:6; + u32 col5:6; + u32 reservedcol:2; + }; + u32 i; +} __packed; + +/** + * struct ecc_status - ECC status information to report. + * @ceinfo: Correctable errors. + * @ueinfo: Uncorrected errors. + * @channel: Channel number. + * @error_type: Error type. + */ +struct ecc_status { + union ecc_error_info ceinfo[2]; + union ecc_error_info ueinfo[2]; + u8 channel; + u8 error_type; +}; + +/** + * struct mc_priv - DDR memory controller private instance data. + * @message: Buffer for framing the event specific info. + * @stat: ECC status information. + * @error_id: The error id. + * @error_level: The error level. + * @dwidth: Width of data bus excluding ECC bits. + * @part_len: The support of the message received. + * @regs: The registers sent on the rpmsg. + * @adec: Address decode registers. + * @mci: Memory controller interface. + * @ept: rpmsg endpoint. + * @mcdi: The mcdi handle. 
+ */ +struct mc_priv { + char message[256]; + struct ecc_status stat; + u32 error_id; + u32 error_level; + u32 dwidth; + u32 part_len; + u32 regs[REG_MAX]; + u32 adec[ADEC_MAX]; + struct mem_ctl_info *mci[NUM_CONTROLLERS]; + struct rpmsg_endpoint *ept; + struct cdx_mcdi *mcdi; +}; + +/* + * Address decoder (ADEC) registers to match the order in which the register + * information is received from the firmware. + */ +enum adec_info { + CONF = 0, + ADEC0, + ADEC1, + ADEC2, + ADEC3, + ADEC4, + ADEC5, + ADEC6, + ADEC7, + ADEC8, + ADEC9, + ADEC10, + ADEC11, + ADEC12, + ADEC13, + ADEC14, + ADEC15, + ADEC16, + ADECILC, +}; + +enum reg_info { + ISR = 0, + IMR, + ECCR0_ERR_STATUS, + ECCR0_ADDR_LO, + ECCR0_ADDR_HI, + ECCR0_DATA_LO, + ECCR0_DATA_HI, + ECCR0_PAR, + ECCR1_ERR_STATUS, + ECCR1_ADDR_LO, + ECCR1_ADDR_HI, + ECCR1_DATA_LO, + ECCR1_DATA_HI, + ECCR1_PAR, + XMPU_ERR, + XMPU_ERR_ADDR_L0, + XMPU_ERR_ADDR_HI, + XMPU_ERR_AXI_ID, + ADEC_CHK_ERR_LOG, +}; + +static bool get_ddr_info(u32 *error_data, struct mc_priv *priv) +{ + u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr; + struct ecc_status *p; + + isr = error_data[ISR]; + + if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK))) + return false; + + eccr0_val = error_data[ECCR0_ERR_STATUS]; + eccr1_val = error_data[ECCR1_ERR_STATUS]; + + if (!eccr0_val && !eccr1_val) + return false; + + p = &priv->stat; + + if (!eccr0_val) + p->channel = 1; + else + p->channel = 0; + + reglo = error_data[ECCR0_ADDR_LO]; + reghi = error_data[ECCR0_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[0].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[0].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR0_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + reglo = error_data[ECCR1_ADDR_LO]; + reghi = error_data[ECCR1_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[1].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[1].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR1_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + return true; +} + +/** + * convert_to_physical - Convert @error_data to a physical address. + * @priv: DDR memory controller private instance data. + * @pinf: ECC error info structure. + * @controller: Controller number of the MC5 + * @error_data: the DDRMC5 ADEC address decoder register data + * + * Return: physical address of the DDR memory. 
+ */ +static unsigned long convert_to_physical(struct mc_priv *priv, + union ecc_error_info pinf, + int controller, int *error_data) +{ + u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset; + u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base; + unsigned long err_addr = 0, addr; + union row_col_mapping cols; + union row_col_mapping rows; + u32 col_bit_0; + + row = pinf.rowhi << MC5_REGHI_ROW | pinf.row; + offset = controller * ADEC_NUM; + + reg = error_data[ADEC6]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC7]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC8]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + + reg = error_data[ADEC9]; + rows.i = reg; + + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + + col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]); + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << col_bit_0; + + cols.i = error_data[ADEC10]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + cols.i = error_data[ADEC11]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + reg = error_data[ADEC12]; + err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0); + pinf.bank >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg); + pinf.bank >>= MC5_EACHBIT; + + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.group >>= MC5_EACHBIT; + + reg = error_data[ADEC4]; + err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0); + pinf.rank >>= MC5_EACHBIT; + err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg); + pinf.rank >>= MC5_EACHBIT; + + reg = error_data[ADEC5]; + err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg); + pinf.lrank >>= 
MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.lrank >>= MC5_EACHBIT; + + high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE; + interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL; + + high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK; + low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK; + reg = priv->adec[ADEC14 + offset]; + ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN); + ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M; + if (ilc_himem_en) + ilc_base_ctrl_add = ilcmem_base - high_mem_offset; + else + ilc_base_ctrl_add = ilcmem_base - low_mem_offset; + + if (priv->dwidth == DEV_X16) { + blk = err_addr / MC5_X16_SIZE; + rsh_req_addr = (blk << 8) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } else { + blk = err_addr / MC5_X32_SIZE; + rsh_req_addr = (blk << 9) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } + + if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base) + addr = err_addr - high_mem_offset; + else + addr = err_addr - low_mem_offset; + + return addr; +} + +/** + * handle_error - Handle errors. + * @priv: DDR memory controller private instance data. + * @stat: ECC status structure. + * @ctl_num: Controller number of the MC5 + * @error_data: the MC5 ADEC address decoder register data + * + * Handles ECC correctable and uncorrectable errors. + */ +static void handle_error(struct mc_priv *priv, struct ecc_status *stat, + int ctl_num, int *error_data) +{ + union ecc_error_info pinf; + struct mem_ctl_info *mci; + unsigned long pa; + phys_addr_t pfn; + int err; + + if (WARN_ON_ONCE(ctl_num > NUM_CONTROLLERS)) + return; + + mci = priv->mci[ctl_num]; + + if (stat->error_type == MC5_ERR_TYPE_CE) { + pinf = stat->ceinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s Controller %d Addr at %lx\n", + "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + } + + if (stat->error_type == MC5_ERR_TYPE_UE) { + pinf = stat->ueinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s controller %d Addr at %lx\n", + "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + pa = convert_to_physical(priv, pinf, ctl_num, error_data); + pfn = PHYS_PFN(pa); + + if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) { + err = memory_failure(pfn, MF_ACTION_REQUIRED); + if (err) + edac_dbg(2, "memory_failure() error: %d", err); + else + edac_dbg(2, "Poison page at PA 0x%lx\n", pa); + } + } +} + +static void mc_init(struct mem_ctl_info *mci, struct device *dev) +{ + struct mc_priv *priv = mci->pvt_info; + struct csrow_info *csi; + struct dimm_info *dimm; + u32 row; + int ch; + + /* Initialize controller capabilities and configuration */ + mci->mtype_cap = MEM_FLAG_DDR5; + mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; + mci->scrub_cap = SCRUB_HW_SRC; + mci->scrub_mode = SCRUB_NONE; + + mci->edac_cap = EDAC_FLAG_SECDED; + mci->ctl_name = "VersalNET DDR5"; + mci->dev_name = dev_name(dev); + mci->mod_name = "versalnet_edac"; + + edac_op_state = EDAC_OPSTATE_INT; + + for (row = 0; row < mci->nr_csrows; row++) { + csi = mci->csrows[row]; + for (ch = 0; ch < csi->nr_channels; ch++) { + 
dimm = csi->channels[ch]->dimm; + dimm->edac_mode = EDAC_SECDED; + dimm->mtype = MEM_DDR5; + dimm->grain = MC5_ERR_GRAIN; + dimm->dtype = priv->dwidth; + } + } +} + +#define to_mci(k) container_of(k, struct mem_ctl_info, dev) + +static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) +{ + return MCDI_RPC_TIMEOUT; +} + +static void mcdi_request(struct cdx_mcdi *cdx, + const struct cdx_dword *hdr, size_t hdr_len, + const struct cdx_dword *sdu, size_t sdu_len) +{ + void *send_buf; + int ret; + + send_buf = kzalloc(hdr_len + sdu_len, GFP_KERNEL); + if (!send_buf) + return; + + memcpy(send_buf, hdr, hdr_len); + memcpy(send_buf + hdr_len, sdu, sdu_len); + + ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len); + if (ret) + dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret); + + kfree(send_buf); +} + +static const struct cdx_mcdi_ops mcdi_ops = { + .mcdi_rpc_timeout = mcdi_rpc_timeout, + .mcdi_request = mcdi_request, +}; + +static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi) +{ + size_t outlen; + int ret; + + MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN); + MCDI_DECLARE_BUF(outbuf, BUFFER_SZ); + + MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index); + + ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), &outlen); + if (!ret) + memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG), + (ADEC_NUM * 4)); +} + +static int setup_mcdi(struct mc_priv *mc_priv) +{ + struct cdx_mcdi *amd_mcdi; + int ret, i; + + amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL); + if (!amd_mcdi) + return -ENOMEM; + + amd_mcdi->mcdi_ops = &mcdi_ops; + ret = cdx_mcdi_init(amd_mcdi); + if (ret) { + kfree(amd_mcdi); + return ret; + } + + amd_mcdi->ept = mc_priv->ept; + mc_priv->mcdi = amd_mcdi; + + for (i = 0; i < NUM_CONTROLLERS; i++) + get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi); + + return 0; +} + +static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2, + 0xb8, 0xb4, 0x45, 0x56, 0x2e, + 0x8c, 0x5b, 0xec); + +static int rpmsg_cb(struct rpmsg_device *rpdev, void *data, + int len, void *priv, u32 src) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + const guid_t *sec_type = &guid_null; + u32 length, offset, error_id; + u32 *result = (u32 *)data; + struct ecc_status *p; + int i, j, k, sec_sev; + const char *err_str; + u32 *adec_data; + + if (*(u8 *)data == MCDI_RESPONSE) { + cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len); + return 0; + } + + sec_sev = result[ERROR_LEVEL]; + error_id = result[ERROR_ID]; + length = result[MSG_ERR_LENGTH]; + offset = result[MSG_ERR_OFFSET]; + + if (result[TOTAL_ERR_LENGTH] > length) { + if (!mc_priv->part_len) + mc_priv->part_len = length; + else + mc_priv->part_len += length; + /* + * The data can come in 2 stretches. 
Construct the regs from the two + * messages; the offset indicates where in the regs buffer the data + * is to be placed + */ + for (i = 0 ; i < length; i++) { + k = offset + i; + j = ERROR_DATA + i; + mc_priv->regs[k] = result[j]; + } + if (mc_priv->part_len < result[TOTAL_ERR_LENGTH]) + return 0; + mc_priv->part_len = 0; + } + + mc_priv->error_id = error_id; + mc_priv->error_level = result[ERROR_LEVEL]; + + switch (error_id) { + case 5: err_str = "General Software Non-Correctable error"; break; + case 6: err_str = "CFU error"; break; + case 7: err_str = "CFRAME error"; break; + case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break; + case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break; + case 15: err_str = "MMCM error"; break; + case 16: err_str = "HNICX Correctable error"; break; + case 17: err_str = "HNICX Non-Correctable error"; break; + + case 18: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_CE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + case 19: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_UE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + + case 21: err_str = "GT Non-Correctable error"; break; + case 22: err_str = "PL Sysmon Correctable error"; break; + case 23: err_str = "PL Sysmon Non-Correctable error"; break; + case 111: err_str = "LPX unexpected dfx activation error"; break; + case 114: err_str = "INT_LPD Non-Correctable error"; break; + case 116: err_str = "INT_OCM Non-Correctable error"; break; + case 117: err_str = "INT_FPD Correctable error"; break; + case 118: err_str = "INT_FPD Non-Correctable error"; break; + case 120: err_str = "INT_IOU Non-Correctable error"; break; + case 123: err_str = "err_int_irq from APU GIC Distributor"; break; + case 124: err_str = "fault_int_irq from APU GIC Distributor"; break; + case 132 ... 139: err_str = "FPX SPLITTER error"; break; + case 140: err_str = "APU Cluster 0 error"; break; + case 141: err_str = "APU Cluster 1 error"; break; + case 142: err_str = "APU Cluster 2 error"; break; + case 143: err_str = "APU Cluster 3 error"; break; + case 145: err_str = "WWDT1 LPX error"; break; + case 147: err_str = "IPI error"; break; + case 152 ... 153: err_str = "AFIFS error"; break; + case 154 ... 155: err_str = "LPX glitch error"; break; + case 185 ... 186: err_str = "FPX AFIFS error"; break; + case 195 ... 199: err_str = "AFIFM error"; break; + case 108: err_str = "PSM Correctable error"; break; + case 59: err_str = "PMC correctable error"; break; + case 60: err_str = "PMC Uncorrectable error"; break; + case 43 ... 47: err_str = "PMC Sysmon error"; break; + case 163 ...
184: err_str = "RPU error"; break; + case 148: err_str = "OCM0 correctable error"; break; + case 149: err_str = "OCM1 correctable error"; break; + case 150: err_str = "OCM0 Un-correctable error"; break; + case 151: err_str = "OCM1 Un-correctable error"; break; + case 189: err_str = "PSX_CMN_3 PD block consolidated error"; break; + case 191: err_str = "FPD_INT_WRAP PD block consolidated error"; break; + case 232: err_str = "CRAM Un-Correctable error"; break; + default: err_str = "VERSAL_EDAC_ERR_ID: %d"; break; + } + + snprintf(mc_priv->message, + sizeof(mc_priv->message), + "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str); + + /* Convert to bytes */ + length = result[TOTAL_ERR_LENGTH] * 4; + log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message, + sec_sev, (void *)&result[ERROR_DATA], length); + + return 0; +} + +static struct rpmsg_device_id amd_rpmsg_id_table[] = { + { .name = "error_ipc" }, + { }, +}; +MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table); + +static int rpmsg_probe(struct rpmsg_device *rpdev) +{ + struct rpmsg_channel_info chinfo; + struct mc_priv *pg; + + pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data; + chinfo.src = RPMSG_ADDR_ANY; + chinfo.dst = rpdev->dst; + strscpy(chinfo.name, amd_rpmsg_id_table[0].name, + strlen(amd_rpmsg_id_table[0].name)); + + pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo); + if (!pg->ept) + return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n", + chinfo.name); + + dev_set_drvdata(&rpdev->dev, pg); + + return 0; +} + +static void rpmsg_remove(struct rpmsg_device *rpdev) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + + rpmsg_destroy_ept(mc_priv->ept); + dev_set_drvdata(&rpdev->dev, NULL); +} + +static struct rpmsg_driver amd_rpmsg_driver = { + .drv.name = KBUILD_MODNAME, + .probe = rpmsg_probe, + .remove = rpmsg_remove, + .callback = rpmsg_cb, + .id_table = amd_rpmsg_id_table, +}; + +static void versal_edac_release(struct device *dev) +{ + kfree(dev); +} + +static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev) +{ + u32 num_chans, rank, dwidth, config; + struct edac_mc_layer layers[2]; + struct mem_ctl_info *mci; + struct device *dev; + enum dev_type dt; + char *name; + int rc, i; + + for (i = 0; i < NUM_CONTROLLERS; i++) { + config = priv->adec[CONF + i * ADEC_NUM]; + num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config); + rank = 1 << FIELD_GET(MC5_RANK_MASK, config); + dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config); + + switch (dwidth) { + case XDDR5_BUS_WIDTH_16: + dt = DEV_X16; + break; + case XDDR5_BUS_WIDTH_32: + dt = DEV_X32; + break; + case XDDR5_BUS_WIDTH_64: + dt = DEV_X64; + break; + default: + dt = DEV_UNKNOWN; + } + + if (dt == DEV_UNKNOWN) + continue; + + /* Find the first enabled device and register that one. 
*/ + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = rank; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = num_chans; + layers[1].is_virt_csrow = false; + + rc = -ENOMEM; + mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers, + sizeof(struct mc_priv)); + if (!mci) { + edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i); + goto err_alloc; + } + + priv->mci[i] = mci; + priv->dwidth = dt; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev->release = versal_edac_release; + name = kmalloc(32, GFP_KERNEL); + sprintf(name, "versal-net-ddrmc5-edac-%d", i); + dev->init_name = name; + rc = device_register(dev); + if (rc) + goto err_alloc; + + mci->pdev = dev; + + platform_set_drvdata(pdev, priv); + + mc_init(mci, dev); + rc = edac_mc_add_mc(mci); + if (rc) { + edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i); + goto err_alloc; + } + } + return 0; + +err_alloc: + while (i--) { + mci = priv->mci[i]; + if (!mci) + continue; + + if (mci->pdev) { + device_unregister(mci->pdev); + edac_mc_del_mc(mci->pdev); + } + + edac_mc_free(mci); + } + + return rc; +} + +static void remove_versalnet(struct mc_priv *priv) +{ + struct mem_ctl_info *mci; + int i; + + for (i = 0; i < NUM_CONTROLLERS; i++) { + device_unregister(priv->mci[i]->pdev); + mci = edac_mc_del_mc(priv->mci[i]->pdev); + if (!mci) + return; + + edac_mc_free(mci); + } +} + +static int mc_probe(struct platform_device *pdev) +{ + struct device_node *r5_core_node; + struct mc_priv *priv; + struct rproc *rp; + int rc; + + r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0); + if (!r5_core_node) { + dev_err(&pdev->dev, "amd,rproc: invalid phandle\n"); + return -EINVAL; + } + + rp = rproc_get_by_phandle(r5_core_node->phandle); + if (!rp) + return -EPROBE_DEFER; + + rc = rproc_boot(rp); + if (rc) { + dev_err(&pdev->dev, "Failed to attach to remote processor\n"); + goto err_rproc_boot; + } + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + goto err_alloc; + + amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv; + + rc = register_rpmsg_driver(&amd_rpmsg_driver); + if (rc) { + edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc); + goto err_alloc; + } + + rc = setup_mcdi(priv); + if (rc) + goto err_unreg; + + priv->mcdi->r5_rproc = rp; + + rc = init_versalnet(priv, pdev); + if (rc) + goto err_init; + + return 0; + +err_init: + cdx_mcdi_finish(priv->mcdi); + +err_unreg: + unregister_rpmsg_driver(&amd_rpmsg_driver); + +err_alloc: + rproc_shutdown(rp); + +err_rproc_boot: + rproc_put(rp); + + return rc; +} + +static void mc_remove(struct platform_device *pdev) +{ + struct mc_priv *priv = platform_get_drvdata(pdev); + + unregister_rpmsg_driver(&amd_rpmsg_driver); + remove_versalnet(priv); + rproc_shutdown(priv->mcdi->r5_rproc); + cdx_mcdi_finish(priv->mcdi); +} + +static const struct of_device_id amd_edac_match[] = { + { .compatible = "xlnx,versal-net-ddrmc5", }, + {} +}; +MODULE_DEVICE_TABLE(of, amd_edac_match); + +static struct platform_driver amd_ddr_edac_mc_driver = { + .driver = { + .name = "versal-net-edac", + .of_match_table = amd_edac_match, + }, + .probe = mc_probe, + .remove = mc_remove, +}; + +module_platform_driver(amd_ddr_edac_mc_driver); + +MODULE_AUTHOR("AMD Inc"); +MODULE_DESCRIPTION("Versal NET EDAC driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/cdx/edac_cdx_pcol.h b/include/linux/cdx/edac_cdx_pcol.h new file mode 100644 index 00000000000000..749db33bb482d1 
--- /dev/null +++ b/include/linux/cdx/edac_cdx_pcol.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Driver for AMD network controllers and boards + * + * Copyright (C) 2021, Xilinx, Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef MC_CDX_PCOL_H +#define MC_CDX_PCOL_H +#include + +#define MC_CMD_EDAC_GET_DDR_CONFIG_OUT_WORD_LENGTH_LEN 4 +/* Number of registers for the DDR controller */ +#define MC_CMD_GET_DDR_CONFIG_OFST 4 +#define MC_CMD_GET_DDR_CONFIG_LEN 4 + +/***********************************/ +/* MC_CMD_EDAC_GET_DDR_CONFIG + * Provides detailed configuration for the DDR controller of the given index. + */ +#define MC_CMD_EDAC_GET_DDR_CONFIG 0x3 + +/* MC_CMD_EDAC_GET_DDR_CONFIG_IN msgrequest */ +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_OFST 0 +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_LEN 4 + +#endif /* MC_CDX_PCOL_H */ From b8cac8c98e85e977369a4c07b5eccd73fbc30cb9 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 10 Sep 2025 16:07:54 +0200 Subject: [PATCH 1757/3206] power: supply: intel_dc_ti_battery: Drop no longer relevant comment Drop the comment about not being able to use devm_iio_channel_get(). The code has actually already successfully been switched over to devm_iio_channel_get(). This is just a no longer applicable left-over comment, drop it. Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel --- drivers/power/supply/intel_dc_ti_battery.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/power/supply/intel_dc_ti_battery.c b/drivers/power/supply/intel_dc_ti_battery.c index 457d23b689e988..56b0c92e9d28a1 100644 --- a/drivers/power/supply/intel_dc_ti_battery.c +++ b/drivers/power/supply/intel_dc_ti_battery.c @@ -345,11 +345,6 @@ static int dc_ti_battery_probe(struct platform_device *pdev) chip->dev = dev; chip->regmap = pmic->regmap; - /* - * Note cannot use devm_iio_channel_get because ACPI systems lack - * the device<->channel maps which iio_channel_get will uses when passed - * a non NULL device pointer. - */ chip->vbat_channel = devm_iio_channel_get(dev, "VBAT"); if (IS_ERR(chip->vbat_channel)) { dev_dbg(dev, "devm_iio_channel_get() ret %ld\n", PTR_ERR(chip->vbat_channel)); From 8edb9e77119be3cdd930e71204ee48c4994b217b Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Mon, 15 Sep 2025 08:47:00 -0500 Subject: [PATCH 1758/3206] x86/bugs: Use early_param() for spectre_v2_user Most of the mitigations in bugs.c use early_param() to parse their command line options. Modify spectre_v2_user to use early_param() for consistency. Remove spec_v2_user_print_cond() because informing a user about their cmdline choice isn't very interesting and the chosen mitigation is already printed in spectre_v2_user_update_mitigation(). 
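For reference, the early_param() shape that this series of conversions follows looks roughly like the sketch below. The option name "example_opt" and its values are hypothetical, not the actual bugs.c code; the real handler for spectre_v2_user appears in the diff that follows.

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <linux/string.h>

static int example_opt __ro_after_init;

/* Runs during early boot, before the mitigation selection logic. */
static int __init example_opt_parse_cmdline(char *str)
{
	if (!str)
		return -EINVAL;	/* "example_opt" given without "=value" */

	if (!strcmp(str, "on"))
		example_opt = 1;
	else if (!strcmp(str, "off"))
		example_opt = 0;
	else
		pr_err("Ignoring unknown example_opt option (%s).\n", str);

	return 0;
}
early_param("example_opt", example_opt_parse_cmdline);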
Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Pawan Gupta Link: https://lore.kernel.org/r/20250819192200.2003074-2-david.kaplan@amd.com --- arch/x86/kernel/cpu/bugs.c | 68 +++++++++++++++----------------------- 1 file changed, 26 insertions(+), 42 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e817bbae01591b..a5072ec6e5c5e5 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1847,7 +1847,7 @@ enum spectre_v2_mitigation_cmd { static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; -enum spectre_v2_user_cmd { +enum spectre_v2_user_mitigation_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, @@ -1857,6 +1857,9 @@ enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_SECCOMP_IBPB, }; +static enum spectre_v2_user_mitigation_cmd spectre_v2_user_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; + static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", @@ -1865,50 +1868,31 @@ static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; -static const struct { - const char *option; - enum spectre_v2_user_cmd cmd; - bool secure; -} v2_user_options[] __initconst = { - { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, - { "off", SPECTRE_V2_USER_CMD_NONE, false }, - { "on", SPECTRE_V2_USER_CMD_FORCE, true }, - { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, - { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, - { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, - { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, -}; - -static void __init spec_v2_user_print_cond(const char *reason, bool secure) -{ - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("spectre_v2_user=%s forced on command line.\n", reason); -} - -static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) +static int __init spectre_v2_user_parse_cmdline(char *str) { - char arg[20]; - int ret, i; - - if (!IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) - return SPECTRE_V2_USER_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2_user", - arg, sizeof(arg)); - if (ret < 0) - return SPECTRE_V2_USER_CMD_AUTO; + if (!str) + return -EINVAL; - for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { - if (match_option(arg, ret, v2_user_options[i].option)) { - spec_v2_user_print_cond(v2_user_options[i].option, - v2_user_options[i].secure); - return v2_user_options[i].cmd; - } - } + if (!strcmp(str, "auto")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_AUTO; + else if (!strcmp(str, "off")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_NONE; + else if (!strcmp(str, "on")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_FORCE; + else if (!strcmp(str, "prctl")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL; + else if (!strcmp(str, "prctl,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL_IBPB; + else if (!strcmp(str, "seccomp")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP; + else if (!strcmp(str, "seccomp,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP_IBPB; + else + pr_err("Ignoring unknown spectre_v2_user option (%s).", str); - pr_err("Unknown user space protection option (%s). 
Switching to default\n", arg); - return SPECTRE_V2_USER_CMD_AUTO; + return 0; } +early_param("spectre_v2_user", spectre_v2_user_parse_cmdline); static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { @@ -1920,7 +1904,7 @@ static void __init spectre_v2_user_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - switch (spectre_v2_parse_user_cmdline()) { + switch (spectre_v2_user_cmd) { case SPECTRE_V2_USER_CMD_NONE: return; case SPECTRE_V2_USER_CMD_FORCE: From 9a9f8147ae7fd558d6f0d40b560ffbdd408768df Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Mon, 15 Sep 2025 08:47:01 -0500 Subject: [PATCH 1759/3206] x86/bugs: Use early_param() for spectre_v2 Most of the mitigations in bugs.c use early_param() for command line parsing. Rework the spectre_v2 and nospectre_v2 command line options to be consistent with the others. Remove spec_v2_print_cond() as informing the user of their cmdline choice isn't interesting. [ bp: Zap spectre_v2_check_cmd(). ] Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Pawan Gupta Link: https://lore.kernel.org/r/20250819192200.2003074-3-david.kaplan@amd.com --- arch/x86/kernel/cpu/bugs.c | 181 +++++++++++++++++-------------------- 1 file changed, 82 insertions(+), 99 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a5072ec6e5c5e5..c348f14c2b5a80 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1845,7 +1845,8 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; -static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; enum spectre_v2_user_mitigation_cmd { SPECTRE_V2_USER_CMD_NONE, @@ -2039,112 +2040,51 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; -static const struct { - const char *option; - enum spectre_v2_mitigation_cmd cmd; - bool secure; -} mitigation_options[] __initconst = { - { "off", SPECTRE_V2_CMD_NONE, false }, - { "on", SPECTRE_V2_CMD_FORCE, true }, - { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, - { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, - { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, - { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, - { "auto", SPECTRE_V2_CMD_AUTO, false }, - { "ibrs", SPECTRE_V2_CMD_IBRS, false }, -}; +static bool nospectre_v2 __ro_after_init; -static void __init spec_v2_print_cond(const char *reason, bool secure) +static int __init nospectre_v2_parse_cmdline(char *str) { - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("%s selected on command line.\n", reason); + nospectre_v2 = true; + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + return 0; } +early_param("nospectre_v2", nospectre_v2_parse_cmdline); -static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +static int __init spectre_v2_parse_cmdline(char *str) { - enum spectre_v2_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ?
SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) - return SPECTRE_V2_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); - if (ret < 0) - return cmd; - - for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { - if (!match_option(arg, ret, mitigation_options[i].option)) - continue; - cmd = mitigation_options[i].cmd; - break; - } - - if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_EIBRS || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && - !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (!str) + return -EINVAL; - if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { - pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (nospectre_v2) + return 0; - if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { - pr_err("%s selected but running as XenPV guest. 
Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (!strcmp(str, "off")) + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + else if (!strcmp(str, "on")) + spectre_v2_cmd = SPECTRE_V2_CMD_FORCE; + else if (!strcmp(str, "retpoline")) + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE; + else if (!strcmp(str, "retpoline,amd") || + !strcmp(str, "retpoline,lfence")) + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE; + else if (!strcmp(str, "retpoline,generic")) + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC; + else if (!strcmp(str, "eibrs")) + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS; + else if (!strcmp(str, "eibrs,lfence")) + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE; + else if (!strcmp(str, "eibrs,retpoline")) + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE; + else if (!strcmp(str, "auto")) + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + else if (!strcmp(str, "ibrs")) + spectre_v2_cmd = SPECTRE_V2_CMD_IBRS; + else + pr_err("Ignoring unknown spectre_v2 option (%s).", str); - spec_v2_print_cond(mitigation_options[i].option, - mitigation_options[i].secure); - return cmd; + return 0; } +early_param("spectre_v2", spectre_v2_parse_cmdline); static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { @@ -2332,7 +2272,50 @@ static void __init bhi_apply_mitigation(void) static void __init spectre_v2_select_mitigation(void) { - spectre_v2_cmd = spectre_v2_parse_cmdline(); + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE || + spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { + pr_err("RETPOLINE selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("EIBRS selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { + pr_err("IBRS selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("IBRS selected but not Intel CPU. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("IBRS selected but CPU doesn't have IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { + pr_err("IBRS selected but running as XenPV guest. 
Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) From 02ac6cc8c5a129013695c3eee48b65fb5ba669c0 Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Mon, 15 Sep 2025 08:47:02 -0500 Subject: [PATCH 1760/3206] x86/bugs: Simplify SSB cmdline parsing Simplify the SSB command line parsing by selecting a mitigation directly, as is done in most of the simpler vulnerabilities. Use early_param() instead of cmdline_find_option() for consistency with the other mitigation selections. Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Pawan Gupta Link: https://lore.kernel.org/r/20250819192200.2003074-4-david.kaplan@amd.com --- arch/x86/include/asm/nospec-branch.h | 1 + arch/x86/kernel/cpu/bugs.c | 120 +++++++++------------------ 2 files changed, 41 insertions(+), 80 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e29f82466f4323..08ed5a2e46a5fd 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -514,6 +514,7 @@ enum spectre_v2_user_mitigation { /* The Speculative Store Bypass disable variants */ enum ssb_mitigation { SPEC_STORE_BYPASS_NONE, + SPEC_STORE_BYPASS_AUTO, SPEC_STORE_BYPASS_DISABLE, SPEC_STORE_BYPASS_PRCTL, SPEC_STORE_BYPASS_SECCOMP, diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c348f14c2b5a80..570de55485df53 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2525,16 +2525,8 @@ static void update_mds_branch_idle(void) #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt -static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; - -/* The kernel command line selection */ -enum ssb_mitigation_cmd { - SPEC_STORE_BYPASS_CMD_NONE, - SPEC_STORE_BYPASS_CMD_AUTO, - SPEC_STORE_BYPASS_CMD_ON, - SPEC_STORE_BYPASS_CMD_PRCTL, - SPEC_STORE_BYPASS_CMD_SECCOMP, -}; +static enum ssb_mitigation ssb_mode __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SSB) ? SPEC_STORE_BYPASS_AUTO : SPEC_STORE_BYPASS_NONE; static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_NONE] = "Vulnerable", @@ -2543,94 +2535,61 @@ static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", }; -static const struct { - const char *option; - enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initconst = { - { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ - { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ - { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ - { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ - { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ -}; +static bool nossb __ro_after_init; -static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +static int __init nossb_parse_cmdline(char *str) { - enum ssb_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ? 
- SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || - cpu_mitigations_off()) { - return SPEC_STORE_BYPASS_CMD_NONE; - } else { - ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", - arg, sizeof(arg)); - if (ret < 0) - return cmd; + nossb = true; + ssb_mode = SPEC_STORE_BYPASS_NONE; + return 0; +} +early_param("nospec_store_bypass_disable", nossb_parse_cmdline); - for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { - if (!match_option(arg, ret, ssb_mitigation_options[i].option)) - continue; +static int __init ssb_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; - cmd = ssb_mitigation_options[i].cmd; - break; - } + if (nossb) + return 0; - if (i >= ARRAY_SIZE(ssb_mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - } + if (!strcmp(str, "auto")) + ssb_mode = SPEC_STORE_BYPASS_AUTO; + else if (!strcmp(str, "on")) + ssb_mode = SPEC_STORE_BYPASS_DISABLE; + else if (!strcmp(str, "off")) + ssb_mode = SPEC_STORE_BYPASS_NONE; + else if (!strcmp(str, "prctl")) + ssb_mode = SPEC_STORE_BYPASS_PRCTL; + else if (!strcmp(str, "seccomp")) + ssb_mode = IS_ENABLED(CONFIG_SECCOMP) ? + SPEC_STORE_BYPASS_SECCOMP : SPEC_STORE_BYPASS_PRCTL; + else + pr_err("Ignoring unknown spec_store_bypass_disable option (%s).\n", + str); - return cmd; + return 0; } +early_param("spec_store_bypass_disable", ssb_parse_cmdline); static void __init ssb_select_mitigation(void) { - enum ssb_mitigation_cmd cmd; - - if (!boot_cpu_has(X86_FEATURE_SSBD)) - goto out; - - cmd = ssb_parse_cmdline(); - if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && - (cmd == SPEC_STORE_BYPASS_CMD_NONE || - cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) { + ssb_mode = SPEC_STORE_BYPASS_NONE; return; + } - switch (cmd) { - case SPEC_STORE_BYPASS_CMD_SECCOMP: - /* - * Choose prctl+seccomp as the default mode if seccomp is - * enabled. - */ - if (IS_ENABLED(CONFIG_SECCOMP)) - ssb_mode = SPEC_STORE_BYPASS_SECCOMP; - else - ssb_mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_ON: - ssb_mode = SPEC_STORE_BYPASS_DISABLE; - break; - case SPEC_STORE_BYPASS_CMD_AUTO: + if (ssb_mode == SPEC_STORE_BYPASS_AUTO) { if (should_mitigate_vuln(X86_BUG_SPEC_STORE_BYPASS)) ssb_mode = SPEC_STORE_BYPASS_PRCTL; else ssb_mode = SPEC_STORE_BYPASS_NONE; - break; - case SPEC_STORE_BYPASS_CMD_PRCTL: - ssb_mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_NONE: - break; } -out: - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); + if (!boot_cpu_has(X86_FEATURE_SSBD)) + ssb_mode = SPEC_STORE_BYPASS_NONE; + + pr_info("%s\n", ssb_strings[ssb_mode]); } static void __init ssb_apply_mitigation(void) @@ -2846,6 +2805,7 @@ static int ssb_prctl_get(struct task_struct *task) return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: case SPEC_STORE_BYPASS_PRCTL: + case SPEC_STORE_BYPASS_AUTO: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ssb_noexec(task)) From 6131690df4adae33e01c0b51b9b78b3e8ed3b76f Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 29 Jul 2025 16:28:02 +0200 Subject: [PATCH 1761/3206] firmware: tegra: Do not warn on missing memory-region property The IPC shared memory can reside in system memory or SRAM. In the latter case the memory-region property is expected not to be present, so do not warn about it. 
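Generalized, the pattern is: treat -ENODEV from the reserved-memory lookup as "property absent" and stay silent, but still warn about real parse failures. A minimal sketch (the function name example_ipc_dram_init is hypothetical), assuming, as the fix below does, that -ENODEV is what of_reserved_mem_region_to_resource() returns when memory-region is missing:

#include <linux/device.h>
#include <linux/ioport.h>
#include <linux/of_reserved_mem.h>

static int example_ipc_dram_init(struct device *dev)
{
	struct resource res;
	int err;

	err = of_reserved_mem_region_to_resource(dev->of_node, 0, &res);
	if (err == -ENODEV)
		return err;	/* no memory-region: the IPC memory is in SRAM */
	if (err < 0) {
		dev_warn(dev, "failed to parse memory region: %d\n", err);
		return err;
	}

	/* ... map and use the DRAM region described by res ... */
	return 0;
}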
Reported-by: Jonathan Hunter Fixes: dbe4efea38d0 ("firmware: tegra: bpmp: Use of_reserved_mem_region_to_resource() for "memory-region"") Signed-off-by: Thierry Reding --- drivers/firmware/tegra/bpmp-tegra186.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/tegra/bpmp-tegra186.c b/drivers/firmware/tegra/bpmp-tegra186.c index 7cfc5fdfa49d50..64863db7a71576 100644 --- a/drivers/firmware/tegra/bpmp-tegra186.c +++ b/drivers/firmware/tegra/bpmp-tegra186.c @@ -198,7 +198,10 @@ static int tegra186_bpmp_dram_init(struct tegra_bpmp *bpmp) err = of_reserved_mem_region_to_resource(bpmp->dev->of_node, 0, &res); if (err < 0) { - dev_warn(bpmp->dev, "failed to parse memory region: %d\n", err); + if (err != -ENODEV) + dev_warn(bpmp->dev, + "failed to parse memory region: %d\n", err); + return err; } From 7f830e126dc357fc086905ce9730140fd4528d66 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 15 Sep 2025 11:04:12 -0500 Subject: [PATCH 1762/3206] x86/sev: Guard sev_evict_cache() with CONFIG_AMD_MEM_ENCRYPT The sev_evict_cache() is guest-related code and should be guarded by CONFIG_AMD_MEM_ENCRYPT, not CONFIG_KVM_AMD_SEV. CONFIG_AMD_MEM_ENCRYPT=y is required for a guest to run properly as an SEV-SNP guest, but a guest kernel built with CONFIG_KVM_AMD_SEV=n would get the stub function of sev_evict_cache() instead of the version that performs the actual eviction. Move the function declarations under the appropriate #ifdef. Fixes: 7b306dfa326f ("x86/sev: Evict cache lines during SNP memory validation") Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Cc: stable@kernel.org # 6.16.x Link: https://lore.kernel.org/r/70e38f2c4a549063de54052c9f64929705313526.1757708959.git.thomas.lendacky@amd.com --- arch/x86/include/asm/sev.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 02236962fdb108..465b19fd1a2db8 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -562,6 +562,24 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, extern struct ghcb *boot_ghcb; +static inline void sev_evict_cache(void *va, int npages) +{ + volatile u8 val __always_unused; + u8 *bytes = va; + int page_idx; + + /* + * For SEV guests, a read from the first/last cache-lines of a 4K page + * using the guest key is sufficient to cause a flush of all cache-lines + * associated with that 4K page without incurring all the overhead of a + * full CLFLUSH sequence. 
+ */ + for (page_idx = 0; page_idx < npages; page_idx++) { + val = bytes[page_idx * PAGE_SIZE]; + val = bytes[page_idx * PAGE_SIZE + PAGE_SIZE - 1]; + } +} + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define snp_vmpl 0 @@ -605,6 +623,7 @@ static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } +static inline void sev_evict_cache(void *va, int npages) {} #endif /* CONFIG_AMD_MEM_ENCRYPT */ @@ -619,24 +638,6 @@ int rmp_make_shared(u64 pfn, enum pg_level level); void snp_leak_pages(u64 pfn, unsigned int npages); void kdump_sev_callback(void); void snp_fixup_e820_tables(void); - -static inline void sev_evict_cache(void *va, int npages) -{ - volatile u8 val __always_unused; - u8 *bytes = va; - int page_idx; - - /* - * For SEV guests, a read from the first/last cache-lines of a 4K page - * using the guest key is sufficient to cause a flush of all cache-lines - * associated with that 4K page without incurring all the overhead of a - * full CLFLUSH sequence. - */ - for (page_idx = 0; page_idx < npages; page_idx++) { - val = bytes[page_idx * PAGE_SIZE]; - val = bytes[page_idx * PAGE_SIZE + PAGE_SIZE - 1]; - } -} #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_rmptable_init(void) { return -ENOSYS; } @@ -652,7 +653,6 @@ static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} static inline void kdump_sev_callback(void) { } static inline void snp_fixup_e820_tables(void) {} -static inline void sev_evict_cache(void *va, int npages) {} #endif #endif From cd4ea81be3eb94047ad023c631afd9bd6c295400 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 12 Sep 2025 02:06:09 +0200 Subject: [PATCH 1763/3206] io_uring/io-wq: fix `max_workers` breakage and `nr_workers` underflow Commit 88e6c42e40de ("io_uring/io-wq: add check free worker before create new worker") reused the variable `do_create` for something else, abusing it for the free worker check. This caused the value to effectively always be `true` at the time `nr_workers < max_workers` was checked, but it should really be `false`. This means the `max_workers` setting was ignored, and worse: if the limit had already been reached, incrementing `nr_workers` was skipped even though another worker would be created. When later lots of workers exit, the `nr_workers` field could easily underflow, making the problem worse because more and more workers would be created without incrementing `nr_workers`. The simple solution is to use a different variable for the free worker check instead of using one variable for two different things. 
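Distilled, the bug looks like the sketch below (hypothetical helper names derived from the description above, not the io-wq code itself); the actual fix follows in the diff.

/* Before: one bool answered two different questions. */
bool do_create = false;

do_create = !activate_free_worker(acct);	/* "was no free worker found?" */
if (!do_create)
	goto no_need_create;	/* a free worker already took the work */
/*
 * Here do_create is always true, so a later limit check of the form
 * "if (nr_workers < max_workers) { nr_workers++; do_create = true; }"
 * could no longer gate worker creation, and nr_workers was not
 * incremented for workers created past the limit.
 */

/* After: one variable per question; do_create keeps its one meaning. */
bool activated_free_worker, do_create = false;

activated_free_worker = activate_free_worker(acct);
if (activated_free_worker)
	goto no_need_create;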
Cc: stable@vger.kernel.org Fixes: 88e6c42e40de ("io_uring/io-wq: add check free worker before create new worker") Signed-off-by: Max Kellermann Reviewed-by: Fengnan Chang Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 17dfaa0395c46b..1d03b2fc4b2594 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -352,16 +352,16 @@ static void create_worker_cb(struct callback_head *cb) struct io_wq *wq; struct io_wq_acct *acct; - bool do_create = false; + bool activated_free_worker, do_create = false; worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; acct = worker->acct; rcu_read_lock(); - do_create = !io_acct_activate_free_worker(acct); + activated_free_worker = io_acct_activate_free_worker(acct); rcu_read_unlock(); - if (!do_create) + if (activated_free_worker) goto no_need_create; raw_spin_lock(&acct->workers_lock); From 20c9ccffccd61b37325a0519fb6d485caeecf7fa Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sun, 14 Sep 2025 11:18:08 -0700 Subject: [PATCH 1764/3206] perf maps: Ensure kmap is set up for all inserts __maps__fixup_overlap_and_insert may split or directly insert a map; when doing this, the map may need to have a kmap set up for the sake of the kmaps. The missing kmap setup fails the check_invariants test in maps, leads to later "Internal error" reports from map__kmap, and ultimately causes segfaults. Similar fixes were added in commit e0e4e0b8b7fa ("perf maps: Add missing map__set_kmap_maps() when replacing a kernel map") and commit 25d9c0301d36 ("perf maps: Set the kmaps for newly created/added kernel maps") but they missed cases. To try to reduce the risk of this, update the kmap directly following any manual insert. This identified another problem in maps__copy_from. Fixes: e0e4e0b8b7fa ("perf maps: Add missing map__set_kmap_maps() when replacing a kernel map") Fixes: 25d9c0301d36 ("perf maps: Set the kmaps for newly created/added kernel maps") Signed-off-by: Ian Rogers Signed-off-by: Namhyung Kim --- tools/perf/util/maps.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 85b2a93a59ac65..779f6230130af2 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -477,6 +477,7 @@ static int __maps__insert(struct maps *maps, struct map *new) } /* Insert the value at the end.
*/ maps_by_address[nr_maps] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) maps_by_name[nr_maps] = map__get(new); @@ -502,8 +503,6 @@ static int __maps__insert(struct maps *maps, struct map *new) if (map__end(new) < map__start(new)) RC_CHK_ACCESS(maps)->ends_broken = true; - map__set_kmap_maps(new, maps); - return 0; } @@ -891,6 +890,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) if (before) { map__put(maps_by_address[i]); maps_by_address[i] = before; + map__set_kmap_maps(before, maps); if (maps_by_name) { map__put(maps_by_name[ni]); @@ -918,6 +918,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) */ map__put(maps_by_address[i]); maps_by_address[i] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) { map__put(maps_by_name[ni]); @@ -942,14 +943,13 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) */ map__put(maps_by_address[i]); maps_by_address[i] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) { map__put(maps_by_name[ni]); maps_by_name[ni] = map__get(new); } - map__set_kmap_maps(new, maps); - check_invariants(maps); return err; } @@ -1019,6 +1019,7 @@ int maps__copy_from(struct maps *dest, struct maps *parent) err = unwind__prepare_access(dest, new, NULL); if (!err) { dest_maps_by_address[i] = new; + map__set_kmap_maps(new, dest); if (dest_maps_by_name) dest_maps_by_name[i] = map__get(new); RC_CHK_ACCESS(dest)->nr_maps = i + 1; From 32d376610bdfdcda39e0a74aac7c6c3b92f91098 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 15 Sep 2025 14:42:09 +0100 Subject: [PATCH 1765/3206] bpftool: Search for tracefs at /sys/kernel/tracing first With "bpftool prog tracelog", bpftool prints messages from the trace pipe. To do so, it first needs to find the tracefs mount point to open the pipe. Bpftool looks at a few "default" locations, including /sys/kernel/debug/tracing and /sys/kernel/tracing. Some of these locations, namely /tracing and /trace, are not standard. They are in the list because some users used to hardcode the tracing directory to short names; but we have no compelling reason to look at these locations. If we fail to find the tracefs at the default locations, we have an additional step to find it by parsing /proc/mounts anyway, so it's safe to remove these entries from the list of default locations to check. Additionally, Alexei reports that looking for the tracefs at /sys/kernel/debug/tracing may automatically mount the file system under that location, and generate a kernel log message telling that auto-mounting there is deprecated. To avoid this message, let's swap the order for checking the potential mount points: try /sys/kernel/tracing first, which should be the standard location nowadays. The kernel log message may still appear if the tracefs is not mounted on /sys/kernel/tracing when we run bpftool. 
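The /proc/mounts fallback mentioned above can be sketched with standard libc mntent parsing; this is an illustration of the approach, not bpftool's exact code:

#include <mntent.h>
#include <stdio.h>
#include <string.h>

/* Copy the mount point of the first tracefs mount into mnt (of size
 * len) and return 0, or return -1 if no tracefs mount is found.
 */
static int find_tracefs_mnt(char *mnt, size_t len)
{
	struct mntent *ent;
	FILE *fp;

	fp = setmntent("/proc/mounts", "r");
	if (!fp)
		return -1;

	while ((ent = getmntent(fp))) {
		if (!strcmp(ent->mnt_type, "tracefs")) {
			snprintf(mnt, len, "%s", ent->mnt_dir);
			endmntent(fp);
			return 0;
		}
	}

	endmntent(fp);
	return -1;
}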
Reported-by: Alexei Starovoitov Closes: https://lore.kernel.org/r/CAADnVQLcMi5YQhZKsU4z3S2uVUAGu_62C33G2Zx_ruG3uXa-Ug@mail.gmail.com/ Signed-off-by: Quentin Monnet Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250915134209.36568-1-qmo@kernel.org Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/tracelog.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tools/bpf/bpftool/tracelog.c b/tools/bpf/bpftool/tracelog.c index 31d806e3bdaaa9..573a8d99f009a4 100644 --- a/tools/bpf/bpftool/tracelog.c +++ b/tools/bpf/bpftool/tracelog.c @@ -57,10 +57,8 @@ find_tracefs_mnt_single(unsigned long magic, char *mnt, const char *mntpt) static bool get_tracefs_pipe(char *mnt) { static const char * const known_mnts[] = { - "/sys/kernel/debug/tracing", "/sys/kernel/tracing", - "/tracing", - "/trace", + "/sys/kernel/debug/tracing", }; const char *pipe_name = "/trace_pipe"; const char *fstype = "tracefs"; @@ -95,12 +93,7 @@ static bool get_tracefs_pipe(char *mnt) return false; p_info("could not find tracefs, attempting to mount it now"); - /* Most of the time, tracefs is automatically mounted by debugfs at - * /sys/kernel/debug/tracing when we try to access it. If we could not - * find it, it is likely that debugfs is not mounted. Let's give one - * attempt at mounting just tracefs at /sys/kernel/tracing. - */ - strcpy(mnt, known_mnts[1]); + strcpy(mnt, known_mnts[0]); if (mount_tracefs(mnt)) return false; From dcfd151d3277cc3bc73c94114e360556d47b992d Mon Sep 17 00:00:00 2001 From: Mallesh Koujalagi Date: Fri, 12 Sep 2025 17:04:58 +0530 Subject: [PATCH 1766/3206] drm/xe/hwmon: Remove type casting Refactor: eliminate type casts by using proper u32 declarations. v2: - Address review comments. (Karthik) v3: - Use the proper u32 type and drop cast. (Lucas De Marchi) - Modify variable when actually using u64 value. - Change r value to reg_value with u32 type. v4: - Remove newline between trailer and Signed-off-by. (Lucas De Marchi) - Change reg_val to val for more user-friendly logging. - Use mul_u32_u32 function since both values are u32. v5: - mul_u32_u32 function with shift. 
(Lucas De Marchi) Fixes: 7596d839f6228 ("drm/xe/hwmon: Add support to manage power limits though mailbox") Signed-off-by: Mallesh Koujalagi Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250912113458.2815172-1-mallesh.koujalagi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 4e1d3b5e6423dc841acd8691d75626b3d3b2b6a8) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_hwmon.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index c17ed1ae86493c..c5b63e10bb9113 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -286,7 +286,7 @@ static struct xe_reg xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg */ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *value) { - u64 reg_val = 0, min, max; + u32 reg_val = 0; struct xe_device *xe = hwmon->xe; struct xe_reg rapl_limit, pkg_power_sku; struct xe_mmio *mmio = xe_root_tile_mmio(xe); @@ -294,7 +294,7 @@ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channe mutex_lock(&hwmon->hwmon_lock); if (hwmon->xe->info.has_mbx_power_limits) { - xe_hwmon_pcode_read_power_limit(hwmon, attr, channel, (u32 *)®_val); + xe_hwmon_pcode_read_power_limit(hwmon, attr, channel, ®_val); } else { rapl_limit = xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, channel); pkg_power_sku = xe_hwmon_get_reg(hwmon, REG_PKG_POWER_SKU, channel); @@ -304,19 +304,21 @@ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channe /* Check if PL limits are disabled. */ if (!(reg_val & PWR_LIM_EN)) { *value = PL_DISABLE; - drm_info(&hwmon->xe->drm, "%s disabled for channel %d, val 0x%016llx\n", + drm_info(&hwmon->xe->drm, "%s disabled for channel %d, val 0x%08x\n", PWR_ATTR_TO_STR(attr), channel, reg_val); goto unlock; } reg_val = REG_FIELD_GET(PWR_LIM_VAL, reg_val); - *value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power); + *value = mul_u32_u32(reg_val, SF_POWER) >> hwmon->scl_shift_power; /* For platforms with mailbox power limit support clamping would be done by pcode. */ if (!hwmon->xe->info.has_mbx_power_limits) { - reg_val = xe_mmio_read64_2x32(mmio, pkg_power_sku); - min = REG_FIELD_GET(PKG_MIN_PWR, reg_val); - max = REG_FIELD_GET(PKG_MAX_PWR, reg_val); + u64 pkg_pwr, min, max; + + pkg_pwr = xe_mmio_read64_2x32(mmio, pkg_power_sku); + min = REG_FIELD_GET(PKG_MIN_PWR, pkg_pwr); + max = REG_FIELD_GET(PKG_MAX_PWR, pkg_pwr); min = mul_u64_u32_shr(min, SF_POWER, hwmon->scl_shift_power); max = mul_u64_u32_shr(max, SF_POWER, hwmon->scl_shift_power); if (min && max) @@ -493,8 +495,8 @@ xe_hwmon_power_max_interval_show(struct device *dev, struct device_attribute *at { struct xe_hwmon *hwmon = dev_get_drvdata(dev); struct xe_mmio *mmio = xe_root_tile_mmio(hwmon->xe); - u32 x, y, x_w = 2; /* 2 bits */ - u64 r, tau4, out; + u32 reg_val, x, y, x_w = 2; /* 2 bits */ + u64 tau4, out; int channel = (to_sensor_dev_attr(attr)->index % 2) ? CHANNEL_PKG : CHANNEL_CARD; u32 power_attr = (to_sensor_dev_attr(attr)->index > 1) ? 
PL2_HWMON_ATTR : PL1_HWMON_ATTR; @@ -505,23 +507,24 @@ xe_hwmon_power_max_interval_show(struct device *dev, struct device_attribute *at mutex_lock(&hwmon->hwmon_lock); if (hwmon->xe->info.has_mbx_power_limits) { - ret = xe_hwmon_pcode_read_power_limit(hwmon, power_attr, channel, (u32 *)&r); + ret = xe_hwmon_pcode_read_power_limit(hwmon, power_attr, channel, &reg_val); if (ret) { drm_err(&hwmon->xe->drm, - "power interval read fail, ch %d, attr %d, r 0%llx, ret %d\n", - channel, power_attr, r, ret); - r = 0; + "power interval read fail, ch %d, attr %d, val 0x%08x, ret %d\n", + channel, power_attr, reg_val, ret); + reg_val = 0; } } else { - r = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, channel)); + reg_val = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, + channel)); } mutex_unlock(&hwmon->hwmon_lock); xe_pm_runtime_put(hwmon->xe); - x = REG_FIELD_GET(PWR_LIM_TIME_X, r); - y = REG_FIELD_GET(PWR_LIM_TIME_Y, r); + x = REG_FIELD_GET(PWR_LIM_TIME_X, reg_val); + y = REG_FIELD_GET(PWR_LIM_TIME_Y, reg_val); /* * tau = (1 + (x / 4)) * power(2,y), x = bits(23:22), y = bits(21:17) From 7926ba2143d8fef40bb940232818c7363e33598c Mon Sep 17 00:00:00 2001 From: Nitin Gote Date: Thu, 11 Sep 2025 10:58:23 +0530 Subject: [PATCH 1767/3206] drm/xe: defer free of NVM auxiliary container to device release callback Do not kfree the intel_dg_nvm_dev in xe_nvm_fini() right after auxiliary_device_delete/uninit. The auxiliary_device embeds the device/kobject (and its name); freeing it too early can race with asynchronous device_del/udev processing and cause a use-after-free. Signed-off-by: Nitin Gote Fixes: c28bfb107dac ("drm/xe/nvm: add on-die non-volatile memory device") Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250911052823.226696-1-nitin.r.gote@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit d4c3ed963e41d488695cf91068eabb8eb9f538ec) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_nvm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_nvm.c b/drivers/gpu/drm/xe/xe_nvm.c index 61b0a1531a539b..2cfe9eb673913f 100644 --- a/drivers/gpu/drm/xe/xe_nvm.c +++ b/drivers/gpu/drm/xe/xe_nvm.c @@ -35,6 +35,10 @@ static const struct intel_dg_nvm_region regions[INTEL_DG_NVM_REGIONS] = { static void xe_nvm_release_dev(struct device *dev) { + struct auxiliary_device *aux = container_of(dev, struct auxiliary_device, dev); + struct intel_dg_nvm_dev *nvm = container_of(aux, struct intel_dg_nvm_dev, aux_dev); + + kfree(nvm); } static bool xe_nvm_non_posted_erase(struct xe_device *xe) @@ -162,6 +166,5 @@ void xe_nvm_fini(struct xe_device *xe) auxiliary_device_delete(&nvm->aux_dev); auxiliary_device_uninit(&nvm->aux_dev); - kfree(nvm); xe->nvm = NULL; } From f7528e4412138699d69946d2a811feb319268f6b Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Mon, 15 Sep 2025 20:16:57 +0800 Subject: [PATCH 1768/3206] selftests/bpf: Skip timer_interrupt case when bpf_timer is not supported Like commit fbdd61c94bcb ("selftests/bpf: Skip timer cases when bpf_timer is not supported"), the 'timer_interrupt' test case should be skipped if the verifier rejects bpf_timer by returning -EOPNOTSUPP.
cd tools/testing/selftests/bpf ./test_progs -t timer 461 timer_interrupt:SKIP Summary: 6/0 PASSED, 7 SKIPPED, 0 FAILED Signed-off-by: Leon Hwang Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250915121657.28084-1-leon.hwang@linux.dev --- tools/testing/selftests/bpf/prog_tests/timer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index 86425939527c8e..34f9ccce260293 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -108,6 +108,10 @@ void test_timer_interrupt(void) LIBBPF_OPTS(bpf_test_run_opts, opts); skel = timer_interrupt__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "timer_interrupt__open_and_load")) return; From a9d4e9f0e871352a48a82da11a50df7196fe567a Mon Sep 17 00:00:00 2001 From: Saket Kumar Bhaskar Date: Sat, 13 Sep 2025 14:43:37 +0530 Subject: [PATCH 1769/3206] selftests/bpf: Fix arena_spin_lock selftest failure For systems with CONFIG_NR_CPUS set to > 1024 in the kernel config, the selftest fails as arena_spin_lock_irqsave() returns EOPNOTSUPP (e.g., on powerpc the default value for CONFIG_NR_CPUS is 8192). The selftest is skipped in case the bpf program returns EOPNOTSUPP, with a descriptive message logged. Tested-by: Venkat Rao Bagalkote Signed-off-by: Saket Kumar Bhaskar Link: https://lore.kernel.org/r/20250913091337.1841916-1-skb99@linux.ibm.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/arena_spin_lock.c | 13 +++++++++++++ tools/testing/selftests/bpf/progs/arena_spin_lock.c | 5 ++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c index 0223fce4db2bc2..693fd86fbde622 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c @@ -40,8 +40,13 @@ static void *spin_lock_thread(void *arg) err = bpf_prog_test_run_opts(prog_fd, &topts); ASSERT_OK(err, "test_run err"); + + if (topts.retval == -EOPNOTSUPP) + goto end; + ASSERT_EQ((int)topts.retval, 0, "test_run retval"); +end: pthread_exit(arg); } @@ -63,6 +68,7 @@ static void test_arena_spin_lock_size(int size) skel = arena_spin_lock__open_and_load(); if (!ASSERT_OK_PTR(skel, "arena_spin_lock__open_and_load")) return; + if (skel->data->test_skip == 2) { test__skip(); goto end; @@ -86,6 +92,13 @@ static void test_arena_spin_lock_size(int size) goto end_barrier; } + if (skel->data->test_skip == 3) { + printf("%s:SKIP: CONFIG_NR_CPUS exceeds the maximum supported by arena spinlock\n", + __func__); + test__skip(); + goto end_barrier; + } + ASSERT_EQ(skel->bss->counter, repeat * nthreads, "check counter value"); end_barrier: diff --git a/tools/testing/selftests/bpf/progs/arena_spin_lock.c b/tools/testing/selftests/bpf/progs/arena_spin_lock.c index c4500c37f85e06..086b57a426cf5a 100644 --- a/tools/testing/selftests/bpf/progs/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/arena_spin_lock.c @@ -37,8 +37,11 @@ int prog(void *ctx) #if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) unsigned long flags; - if ((ret = arena_spin_lock_irqsave(&lock, flags))) + if ((ret = arena_spin_lock_irqsave(&lock, flags))) { + if (ret == -EOPNOTSUPP) + test_skip = 3; return ret; + } if (counter != limit) counter++; bpf_repeat(cs_count); From
2c895133950646f45e5cf3900b168c952c8dbee8 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 15 Sep 2025 03:26:17 +0000 Subject: [PATCH 1770/3206] bpf: Do not limit bpf_cgroup_from_id to current's namespace The bpf_cgroup_from_id kfunc relies on cgroup_get_from_id to obtain the cgroup corresponding to a given cgroup ID. This helper can be called in a lot of contexts where the current thread can be random. A recent example was its use in sched_ext's ops.tick(), to obtain the root cgroup pointer. Since the current task can be whatever random user space task preempted by the timer tick, this makes the behavior of the helper unreliable. Refactor out __cgroup_get_from_id as the non-namespace aware version of cgroup_get_from_id, and change bpf_cgroup_from_id to make use of it. There is no compatibility breakage here, since changing the namespace against which the lookup is being done to the root cgroup namespace only permits a wider set of lookups to succeed now. The cgroup IDs across namespaces are globally unique, and thus don't need to be retranslated. Reported-by: Dan Schatzberg Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250915032618.1551762-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/cgroup.h | 1 + kernel/bpf/helpers.c | 2 +- kernel/cgroup/cgroup.c | 24 ++++++++++++++++++++---- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e23..b08c8e62881cd9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -650,6 +650,7 @@ static inline void cgroup_kthread_ready(void) } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); +struct cgroup *__cgroup_get_from_id(u64 id); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c0c0764a20251c..51229aba531818 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2546,7 +2546,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; - cgrp = cgroup_get_from_id(cgid); + cgrp = __cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb73..6c2d20ac697c3c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6343,15 +6343,15 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) } /* - * cgroup_get_from_id : get the cgroup associated with cgroup id + * __cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure - * Only cgroups within current task's cgroup NS are valid. + * There are no cgroup NS restrictions. */ -struct cgroup *cgroup_get_from_id(u64 id) +struct cgroup *__cgroup_get_from_id(u64 id) { struct kernfs_node *kn; - struct cgroup *cgrp, *root_cgrp; + struct cgroup *cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) @@ -6373,6 +6373,22 @@ struct cgroup *cgroup_get_from_id(u64 id) if (!cgrp) return ERR_PTR(-ENOENT); + return cgrp; +} + +/* + * cgroup_get_from_id : get the cgroup associated with cgroup id + * @id: cgroup id + * On success return the cgrp or ERR_PTR on failure + * Only cgroups within current task's cgroup NS are valid. 
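From the BPF side, usage is unchanged. A minimal sketch of a caller (illustrative only; the program and section names are hypothetical, and the selftest added in the next patch exercises the same pattern from a non-root cgroup namespace):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* kfuncs provided by the kernel, resolved via BTF */
struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
void bpf_cgroup_release(struct cgroup *cgrp) __ksym;

char _license[] SEC("license") = "GPL";

SEC("syscall")
int lookup_root_cgroup(void *ctx)
{
	struct cgroup *cgrp;

	/* cgroup ID 1 is the root of the default hierarchy */
	cgrp = bpf_cgroup_from_id(1);
	if (!cgrp)
		return 1;	/* lookup failed */

	/* ... inspect cgrp ... */

	bpf_cgroup_release(cgrp);	/* drop the acquired reference */
	return 0;
}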
+ */ +struct cgroup *cgroup_get_from_id(u64 id) +{ + struct cgroup *cgrp, *root_cgrp; + + cgrp = __cgroup_get_from_id(id); + if (IS_ERR(cgrp)) + return cgrp; root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { From a8250d167c0cf6b98ccb5168fb2daf2859679d72 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 15 Sep 2025 03:26:18 +0000 Subject: [PATCH 1771/3206] selftests/bpf: Add a test for bpf_cgroup_from_id lookup in non-root cgns Make sure that we only switch the cgroup namespace and enter a new cgroup in a child process separate from test_progs, to not mess up the environment for subsequent tests. To remove this cgroup, we need to wait for the child to exit, and then rmdir its cgroup. If the read call fails, or waitpid succeeds, we know the child exited (read call would fail when the last pipe end is closed, otherwise waitpid waits until exit(2) is called). We then invoke a newly introduced remove_cgroup_pid() helper, that identifies cgroup path using the passed in pid of the now dead child, instead of using the current process pid (getpid()). Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250915032618.1551762-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/cgroup_helpers.c | 20 ++++++ tools/testing/selftests/bpf/cgroup_helpers.h | 1 + .../selftests/bpf/prog_tests/cgrp_kfunc.c | 71 +++++++++++++++++++ .../selftests/bpf/progs/cgrp_kfunc_success.c | 12 ++++ 4 files changed, 104 insertions(+) diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 15f6260148728d..20cede4db3cee8 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -412,6 +412,26 @@ void remove_cgroup(const char *relative_path) log_err("rmdiring cgroup %s .. %s", relative_path, cgroup_path); } +/* + * remove_cgroup_pid() - Remove a cgroup setup by process identified by PID + * @relative_path: The cgroup path, relative to the workdir, to remove + * @pid: PID to be used to find cgroup_path + * + * This function expects a cgroup to already be created, relative to the cgroup + * work dir. It also expects the cgroup doesn't have any children or live + * processes and it removes the cgroup. + * + * On failure, it will print an error to stderr. + */ +void remove_cgroup_pid(const char *relative_path, int pid) +{ + char cgroup_path[PATH_MAX + 1]; + + format_cgroup_path_pid(cgroup_path, relative_path, pid); + if (rmdir(cgroup_path)) + log_err("rmdiring cgroup %s .. 
%s", relative_path, cgroup_path); +} + /** * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD * @relative_path: The cgroup path, relative to the workdir, to join diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h index 182e1ac36c95dd..3857304be87410 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.h +++ b/tools/testing/selftests/bpf/cgroup_helpers.h @@ -19,6 +19,7 @@ int cgroup_setup_and_join(const char *relative_path); int get_root_cgroup(void); int create_and_get_cgroup(const char *relative_path); void remove_cgroup(const char *relative_path); +void remove_cgroup_pid(const char *relative_path, int pid); unsigned long long get_cgroup_id(const char *relative_path); int get_cgroup1_hierarchy_id(const char *subsys_name); diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c b/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c index adda85f970589e..4b42fbc96efc9a 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c +++ b/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c @@ -4,6 +4,8 @@ #define _GNU_SOURCE #include #include +#include +#include #include "cgrp_kfunc_failure.skel.h" #include "cgrp_kfunc_success.skel.h" @@ -87,6 +89,72 @@ static const char * const success_tests[] = { "test_cgrp_from_id", }; +static void test_cgrp_from_id_ns(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct cgrp_kfunc_success *skel; + struct bpf_program *prog; + int pid, pipe_fd[2]; + + skel = open_load_cgrp_kfunc_skel(); + if (!ASSERT_OK_PTR(skel, "open_load_skel")) + return; + + if (!ASSERT_OK(skel->bss->err, "pre_mkdir_err")) + goto cleanup; + + prog = skel->progs.test_cgrp_from_id_ns; + + if (!ASSERT_OK(pipe(pipe_fd), "pipe")) + goto cleanup; + + pid = fork(); + if (!ASSERT_GE(pid, 0, "fork result")) { + close(pipe_fd[0]); + close(pipe_fd[1]); + goto cleanup; + } + + if (pid == 0) { + int ret = 0; + + close(pipe_fd[0]); + + if (!ASSERT_GE(cgroup_setup_and_join("cgrp_from_id_ns"), 0, "join cgroup")) + exit(1); + + if (!ASSERT_OK(unshare(CLONE_NEWCGROUP), "unshare cgns")) + exit(1); + + ret = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + if (!ASSERT_OK(ret, "test run ret")) + exit(1); + + if (!ASSERT_OK(opts.retval, "test run retval")) + exit(1); + + if (!ASSERT_EQ(write(pipe_fd[1], &ret, sizeof(ret)), sizeof(ret), "write pipe")) + exit(1); + + exit(0); + } else { + int res; + + close(pipe_fd[1]); + + ASSERT_EQ(read(pipe_fd[0], &res, sizeof(res)), sizeof(res), "read res"); + ASSERT_EQ(waitpid(pid, NULL, 0), pid, "wait on child"); + + remove_cgroup_pid("cgrp_from_id_ns", pid); + + ASSERT_OK(res, "result from run"); + } + + close(pipe_fd[0]); +cleanup: + cgrp_kfunc_success__destroy(skel); +} + void test_cgrp_kfunc(void) { int i, err; @@ -102,6 +170,9 @@ void test_cgrp_kfunc(void) run_success_test(success_tests[i]); } + if (test__start_subtest("test_cgrp_from_id_ns")) + test_cgrp_from_id_ns(); + RUN_TESTS(cgrp_kfunc_failure); cleanup: diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c index 5354455a01be8a..02d8f160ca0e95 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c @@ -221,3 +221,15 @@ int BPF_PROG(test_cgrp_from_id, struct cgroup *cgrp, const char *path) return 0; } + +SEC("syscall") +int test_cgrp_from_id_ns(void *ctx) +{ + struct cgroup *cg; + + cg = bpf_cgroup_from_id(1); + if (!cg) + return 42; + bpf_cgroup_release(cg); + 
return 0; +} From 0f101028407605c7b7cea53946ee4d25f82cfb6c Mon Sep 17 00:00:00 2001 From: Ivaylo Ivanov Date: Sun, 14 Sep 2025 16:02:29 +0300 Subject: [PATCH 1772/3206] dt-bindings: regulator: document max77838 pmic The max77838 is a pmic, containing a BUCK regulator and 4 LDOs. It's primarily used in the Samsung Galaxy S7 lineup and is accessed over I2C. Document it. Signed-off-by: Ivaylo Ivanov Acked-by: Conor Dooley Reviewed-by: Igor Belwon Link: https://patch.msgid.link/20250914130230.2622030-2-ivo.ivanov.ivanov1@gmail.com Signed-off-by: Mark Brown --- .../bindings/regulator/maxim,max77838.yaml | 68 +++++++++++++++++++ MAINTAINERS | 6 ++ 2 files changed, 74 insertions(+) create mode 100644 Documentation/devicetree/bindings/regulator/maxim,max77838.yaml diff --git a/Documentation/devicetree/bindings/regulator/maxim,max77838.yaml b/Documentation/devicetree/bindings/regulator/maxim,max77838.yaml new file mode 100644 index 00000000000000..bed36af5493df2 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/maxim,max77838.yaml @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/maxim,max77838.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Maxim Integrated MAX77838 PMIC + +maintainers: + - Ivaylo Ivanov + +properties: + $nodename: + pattern: "pmic@[0-9a-f]{1,2}" + compatible: + enum: + - maxim,max77838 + + reg: + maxItems: 1 + + regulators: + type: object + $ref: regulator.yaml# + description: | + list of regulators provided by this controller, must be named + after their hardware counterparts ldo[1-4] and buck + + properties: + buck: + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + + patternProperties: + "^ldo([1-4])$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + + additionalProperties: false + +required: + - compatible + - reg + - regulators + +additionalProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + pmic@60 { + compatible = "maxim,max77838"; + reg = <0x60>; + + regulators { + ldo2 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + }; + }; + }; +... diff --git a/MAINTAINERS b/MAINTAINERS index f6206963efbf0b..7554c7f2570a91 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15069,6 +15069,12 @@ F: Documentation/devicetree/bindings/*/*max77802.yaml F: drivers/regulator/max77802-regulator.c F: include/dt-bindings/*/*max77802.h +MAXIM MAX77838 PMIC REGULATOR DEVICE DRIVER +M: Ivaylo Ivanov +L: linux-kernel@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/regulator/maxim,max77838.yaml + MAXIM MAX77976 BATTERY CHARGER M: Luca Ceresoli S: Supported From 6a1f303cba45fa3b612d5a2898b1b1b045eb74e3 Mon Sep 17 00:00:00 2001 From: Ivaylo Ivanov Date: Sun, 14 Sep 2025 16:02:30 +0300 Subject: [PATCH 1773/3206] regulator: max77838: add max77838 regulator driver The max77838 PMIC contains a BUCK regulator and 4 LDOs. It's primarily used in the Samsung Galaxy S7 lineup and is accessed over I2C. 
Signed-off-by: Ivaylo Ivanov Link: https://patch.msgid.link/20250914130230.2622030-3-ivo.ivanov.ivanov1@gmail.com Signed-off-by: Mark Brown --- MAINTAINERS | 1 + drivers/regulator/Kconfig | 8 + drivers/regulator/Makefile | 1 + drivers/regulator/max77838-regulator.c | 221 +++++++++++++++++++++++++ 4 files changed, 231 insertions(+) create mode 100644 drivers/regulator/max77838-regulator.c diff --git a/MAINTAINERS b/MAINTAINERS index 7554c7f2570a91..fcb5b2c115bd1d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15074,6 +15074,7 @@ M: Ivaylo Ivanov L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/regulator/maxim,max77838.yaml +F: drivers/regulator/max77838-regulator.c MAXIM MAX77976 BATTERY CHARGER M: Luca Ceresoli diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index eaa6df1c9f8066..2cf119e59e4bd1 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -777,6 +777,14 @@ config REGULATOR_MAX77826 It includes support for control of output voltage. This regulator is found on the Samsung Galaxy S5 (klte) smartphone. +config REGULATOR_MAX77838 + tristate "Maxim 77838 regulator" + depends on REGMAP_I2C + help + This driver controls a Maxim 77838 regulator via I2C bus. + The regulator include 4 LDOs and a BUCK regulator. It's + present on the Samsung Galaxy S7 lineup of smartphones. + config REGULATOR_MC13XXX_CORE tristate diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index be98b29d6675d8..0622e9de628383 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_REGULATOR_MAX77686) += max77686-regulator.o obj-$(CONFIG_REGULATOR_MAX77693) += max77693-regulator.o obj-$(CONFIG_REGULATOR_MAX77802) += max77802-regulator.o obj-$(CONFIG_REGULATOR_MAX77826) += max77826-regulator.o +obj-$(CONFIG_REGULATOR_MAX77838) += max77838-regulator.o obj-$(CONFIG_REGULATOR_MAX77857) += max77857-regulator.o obj-$(CONFIG_REGULATOR_MC13783) += mc13783-regulator.o obj-$(CONFIG_REGULATOR_MC13892) += mc13892-regulator.o diff --git a/drivers/regulator/max77838-regulator.c b/drivers/regulator/max77838-regulator.c new file mode 100644 index 00000000000000..9faddbfd25fd80 --- /dev/null +++ b/drivers/regulator/max77838-regulator.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// +// regulator driver for Maxim MAX77838 +// +// based on max77826-regulator.c +// +// Copyright (c) 2025, Ivaylo Ivanov + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum max77838_registers { + MAX77838_REG_DEVICE_ID = 0x00, + MAX77838_REG_TOPSYS_STAT, + MAX77838_REG_STAT, + MAX77838_REG_EN, + MAX77838_REG_GPIO_PD_CTRL, + MAX77838_REG_UVLO_CFG1, + /* 0x06 - 0x0B: reserved */ + MAX77838_REG_I2C_CFG = 0x0C, + /* 0x0D - 0x0F: reserved */ + MAX77838_REG_LDO1_CFG = 0x10, + MAX77838_REG_LDO2_CFG, + MAX77838_REG_LDO3_CFG, + MAX77838_REG_LDO4_CFG, + /* 0x14 - 0x1F: reserved */ + MAX77838_REG_BUCK_CFG1 = 0x20, + MAX77838_REG_BUCK_VOUT, +}; + +enum max77838_regulators { + MAX77838_LDO1 = 0, + MAX77838_LDO2, + MAX77838_LDO3, + MAX77838_LDO4, + MAX77838_BUCK, + MAX77838_MAX_REGULATORS, +}; + +#define MAX77838_MASK_LDO 0x7f +#define MAX77838_MASK_BUCK 0xff + +#define MAX77838_LDO1_EN BIT(0) +#define MAX77838_LDO2_EN BIT(1) +#define MAX77838_LDO3_EN BIT(2) +#define MAX77838_LDO4_EN BIT(3) +#define MAX77838_BUCK_EN BIT(4) + +#define MAX77838_BUCK_AD BIT(3) +#define MAX77838_LDO_AD BIT(7) + +#define MAX77838_LDO_VOLT_MIN 600000 +#define MAX77838_LDO_VOLT_MAX 
3775000 +#define MAX77838_LDO_VOLT_STEP 25000 + +#define MAX77838_BUCK_VOLT_MIN 500000 +#define MAX77838_BUCK_VOLT_MAX 2093750 +#define MAX77838_BUCK_VOLT_STEP 6250 + +#define MAX77838_VOLT_RANGE(_type) \ + ((MAX77838_ ## _type ## _VOLT_MAX - \ + MAX77838_ ## _type ## _VOLT_MIN) / \ + MAX77838_ ## _type ## _VOLT_STEP + 1) + +#define MAX77838_LDO(_id) \ + [MAX77838_LDO ## _id] = { \ + .id = MAX77838_LDO ## _id, \ + .name = "ldo"#_id, \ + .of_match = of_match_ptr("ldo"#_id), \ + .regulators_node = "regulators", \ + .ops = &max77838_regulator_ops, \ + .min_uV = MAX77838_LDO_VOLT_MIN, \ + .uV_step = MAX77838_LDO_VOLT_STEP, \ + .n_voltages = MAX77838_VOLT_RANGE(LDO), \ + .enable_reg = MAX77838_REG_EN, \ + .enable_mask = MAX77838_LDO ## _id ## _EN, \ + .vsel_reg = MAX77838_REG_LDO ## _id ## _CFG, \ + .vsel_mask = MAX77838_MASK_LDO, \ + .active_discharge_off = 0, \ + .active_discharge_on = MAX77838_LDO_AD, \ + .active_discharge_mask = MAX77838_LDO_AD, \ + .active_discharge_reg = MAX77838_REG_LDO ## _id ## _CFG, \ + .owner = THIS_MODULE, \ + } + +#define MAX77838_BUCK_DESC \ + [MAX77838_BUCK] = { \ + .id = MAX77838_BUCK, \ + .name = "buck", \ + .of_match = of_match_ptr("buck"), \ + .regulators_node = "regulators", \ + .ops = &max77838_regulator_ops, \ + .min_uV = MAX77838_BUCK_VOLT_MIN, \ + .uV_step = MAX77838_BUCK_VOLT_STEP, \ + .n_voltages = MAX77838_VOLT_RANGE(BUCK), \ + .enable_reg = MAX77838_REG_EN, \ + .enable_mask = MAX77838_BUCK_EN, \ + .vsel_reg = MAX77838_REG_BUCK_VOUT, \ + .vsel_mask = MAX77838_MASK_BUCK, \ + .active_discharge_off = 0, \ + .active_discharge_on = MAX77838_BUCK_AD, \ + .active_discharge_mask = MAX77838_BUCK_AD, \ + .active_discharge_reg = MAX77838_REG_BUCK_CFG1, \ + .owner = THIS_MODULE, \ + } + +struct max77838_regulator_info { + struct regmap *regmap; +}; + +static const struct regmap_config max77838_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = MAX77838_REG_BUCK_VOUT, +}; + +static const struct regulator_ops max77838_regulator_ops = { + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .list_voltage = regulator_list_voltage_linear, + .map_voltage = regulator_map_voltage_linear, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .set_active_discharge = regulator_set_active_discharge_regmap, +}; + +static const struct regulator_desc max77838_regulators_desc[] = { + MAX77838_LDO(1), + MAX77838_LDO(2), + MAX77838_LDO(3), + MAX77838_LDO(4), + MAX77838_BUCK_DESC, +}; + +static int max77838_read_device_id(struct regmap *regmap, struct device *dev) +{ + unsigned int device_id; + int ret; + + ret = regmap_read(regmap, MAX77838_REG_DEVICE_ID, &device_id); + if (!ret) + dev_dbg(dev, "DEVICE_ID: 0x%x\n", device_id); + + return ret; +} + +static int max77838_i2c_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct max77838_regulator_info *info; + struct regulator_config config = {}; + struct regulator_dev *rdev; + struct regmap *regmap; + int i; + + info = devm_kzalloc(dev, sizeof(struct max77838_regulator_info), + GFP_KERNEL); + if (!info) + return -ENOMEM; + + regmap = devm_regmap_init_i2c(client, &max77838_regmap_config); + if (IS_ERR(regmap)) { + dev_err(dev, "Failed to allocate regmap!\n"); + return PTR_ERR(regmap); + } + + info->regmap = regmap; + i2c_set_clientdata(client, info); + + config.dev = dev; + config.regmap = regmap; + config.driver_data = info; + + for (i = 0; i < 
MAX77838_MAX_REGULATORS; i++) { + rdev = devm_regulator_register(dev, + &max77838_regulators_desc[i], + &config); + if (IS_ERR(rdev)) { + dev_err(dev, "Failed to register regulator!\n"); + return PTR_ERR(rdev); + } + } + + return max77838_read_device_id(regmap, dev); +} + +static const struct of_device_id __maybe_unused max77838_of_match[] = { + { .compatible = "maxim,max77838" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, max77838_of_match); + +static const struct i2c_device_id max77838_id[] = { + { "max77838-regulator" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(i2c, max77838_id); + +static struct i2c_driver max77838_regulator_driver = { + .driver = { + .name = "max77838", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .of_match_table = of_match_ptr(max77838_of_match), + }, + .probe = max77838_i2c_probe, + .id_table = max77838_id, +}; +module_i2c_driver(max77838_regulator_driver); + +MODULE_AUTHOR("Ivaylo Ivanov "); +MODULE_DESCRIPTION("MAX77838 PMIC regulator driver"); +MODULE_LICENSE("GPL"); From 3ae4c527080ce81b889ffc2780e077770b95ae88 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 11 Sep 2025 17:30:56 +0100 Subject: [PATCH 1774/3206] selftests/bpf: More open-coded gettid syscall cleanup Commit 0e2fb011a0ba ("selftests/bpf: Clean up open-coded gettid syscall invocations") addressed the issue that older libc may not have a gettid() function call wrapper for the associated syscall. A few more instances have crept into tests, use sys_gettid() instead, and poison raw gettid() usage to avoid future issues. Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250911163056.543071-1-alan.maguire@oracle.com --- tools/testing/selftests/bpf/bpf_util.h | 3 +++ tools/testing/selftests/bpf/network_helpers.c | 2 +- tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c | 2 +- tools/testing/selftests/bpf/prog_tests/kernel_flag.c | 2 +- tools/testing/selftests/bpf/prog_tests/task_local_data.h | 2 +- tools/testing/selftests/bpf/prog_tests/test_task_local_data.c | 2 +- 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index 5f6963a320d732..4bc2d25f33e180 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -67,6 +67,9 @@ static inline void bpf_strlcpy(char *dst, const char *src, size_t sz) #define sys_gettid() syscall(SYS_gettid) #endif +/* and poison usage to ensure it does not creep back in. 
*/ +#pragma GCC poison gettid + #ifndef ENOTSUPP #define ENOTSUPP 524 #endif diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 72b5c174ab3bb8..cdf7b664144420 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -457,7 +457,7 @@ int append_tid(char *str, size_t sz) if (end + 8 > sz) return -1; - sprintf(&str[end], "%07d", gettid()); + sprintf(&str[end], "%07ld", sys_gettid()); str[end + 7] = '\0'; return 0; diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c b/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c index e0dd966e4a3ef3..5ad904e9d15df9 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c @@ -44,7 +44,7 @@ static void test_read_cgroup_xattr(void) if (!ASSERT_OK_PTR(skel, "read_cgroupfs_xattr__open_and_load")) goto out; - skel->bss->target_pid = gettid(); + skel->bss->target_pid = sys_gettid(); if (!ASSERT_OK(read_cgroupfs_xattr__attach(skel), "read_cgroupfs_xattr__attach")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/kernel_flag.c b/tools/testing/selftests/bpf/prog_tests/kernel_flag.c index a133354ac9bc29..97b00c7efe9431 100644 --- a/tools/testing/selftests/bpf/prog_tests/kernel_flag.c +++ b/tools/testing/selftests/bpf/prog_tests/kernel_flag.c @@ -16,7 +16,7 @@ void test_kernel_flag(void) if (!ASSERT_OK_PTR(lsm_skel, "lsm_skel")) return; - lsm_skel->bss->monitored_tid = gettid(); + lsm_skel->bss->monitored_tid = sys_gettid(); ret = test_kernel_flag__attach(lsm_skel); if (!ASSERT_OK(ret, "test_kernel_flag__attach")) diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h index a408d10c368821..2de38776a2d499 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_data.h +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -158,7 +158,7 @@ static int __tld_init_data_p(int map_fd) void *data_alloc = NULL; int err, tid_fd = -1; - tid_fd = syscall(SYS_pidfd_open, gettid(), O_EXCL); + tid_fd = syscall(SYS_pidfd_open, sys_gettid(), O_EXCL); if (tid_fd < 0) { err = -errno; goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c index 3b5cd2cd89c7c2..9fd6306b455c3c 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c @@ -63,7 +63,7 @@ void *test_task_local_data_basic_thread(void *arg) if (!ASSERT_OK_PTR(value2, "tld_get_data")) goto out; - tid = gettid(); + tid = sys_gettid(); *value0 = tid + 0; *value1 = tid + 1; From a10f910c77f280327b481e77eab909934ec508f0 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Wed, 9 Jul 2025 10:54:38 +0200 Subject: [PATCH 1775/3206] drm: bridge: anx7625: Fix NULL pointer dereference with early IRQ If the interrupt occurs before resource initialization is complete, the interrupt handler/worker may access uninitialized data such as the I2C tcpc_client device, potentially leading to NULL pointer dereference. 
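The shape of the fix, sketched minimally below (the names irq, isr_handler and priv are illustrative stand-ins, not the driver's actual identifiers), is to request the IRQ in a disabled state and only enable it once everything the handler touches has been initialized:

	ret = devm_request_threaded_irq(dev, irq, NULL, isr_handler,
					IRQF_TRIGGER_FALLING | IRQF_ONESHOT |
					IRQF_NO_AUTOEN, /* stay disabled for now */
					"example-intp", priv);
	if (ret)
		return ret;

	/* ... finish setting up I2C clients, workqueue, etc. ... */

	enable_irq(irq); /* the handler may run safely from here on */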
Signed-off-by: Loic Poulain Fixes: 8bdfc5dae4e3 ("drm/bridge: anx7625: Add anx7625 MIPI DSI/DPI to DP") Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250709085438.56188-1-loic.poulain@oss.qualcomm.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/analogix/anx7625.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/analogix/anx7625.c b/drivers/gpu/drm/bridge/analogix/anx7625.c index c0ad8f59e48398..8b3304dedcd998 100644 --- a/drivers/gpu/drm/bridge/analogix/anx7625.c +++ b/drivers/gpu/drm/bridge/analogix/anx7625.c @@ -2677,7 +2677,7 @@ static int anx7625_i2c_probe(struct i2c_client *client) ret = devm_request_threaded_irq(dev, platform->pdata.intp_irq, NULL, anx7625_intr_hpd_isr, IRQF_TRIGGER_FALLING | - IRQF_ONESHOT, + IRQF_ONESHOT | IRQF_NO_AUTOEN, "anx7625-intp", platform); if (ret) { DRM_DEV_ERROR(dev, "fail to request irq\n"); @@ -2746,8 +2746,10 @@ static int anx7625_i2c_probe(struct i2c_client *client) } /* Add work function */ - if (platform->pdata.intp_irq) + if (platform->pdata.intp_irq) { + enable_irq(platform->pdata.intp_irq); queue_work(platform->workqueue, &platform->work); + } if (platform->pdata.audio_en) anx7625_register_audio(dev, platform); From 440d20154add24082eb43305f85288a756a5cc56 Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Mon, 15 Sep 2025 08:47:03 -0500 Subject: [PATCH 1776/3206] x86/bugs: Remove uses of cpu_mitigations_off() cpu_mitigations_off() is no longer needed because all bugs use attack vector controls to select a mitigation, and cpu_mitigations_off() is equivalent to no attack vectors being selected. Remove the few remaining unnecessary uses of this function in this file. Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Pawan Gupta --- arch/x86/kernel/cpu/bugs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 570de55485df53..f28a7383b88bdb 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -687,8 +687,7 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } @@ -3125,14 +3124,15 @@ static void __init srso_select_mitigation(void) static void __init srso_update_mitigation(void) { + if (!boot_cpu_has_bug(X86_BUG_SRSO)) + return; + /* If retbleed is using IBPB, that works for SRSO as well */ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) srso_mitigation = SRSO_MITIGATION_IBPB; - if (boot_cpu_has_bug(X86_BUG_SRSO) && - !cpu_mitigations_off()) - pr_info("%s\n", srso_strings[srso_mitigation]); + pr_info("%s\n", srso_strings[srso_mitigation]); } static void __init srso_apply_mitigation(void) From e06722a9df14eff39f42059f84063daa0a95b92c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 4 Sep 2025 00:49:11 +0200 Subject: [PATCH 1777/3206] ACPI: APEI: Remove redundant assignments in erst_dbg_{ioctl|write}() Use the result of copy_from_user() directly instead of assigning it to the local variable 'rc' and then overwriting it in erst_dbg_write() or immediately returning from erst_dbg_ioctl(). No intentional functional changes. 
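In generalized form (a sketch of the pattern, not the exact erst-dbg code), the transformation is:

	/* before: the result is parked in a local only to be tested once */
	rc = copy_from_user(&val, ubuf, sizeof(val));
	if (rc)
		return -EFAULT;

	/* after: test the result directly, no local needed */
	if (copy_from_user(&val, ubuf, sizeof(val)))
		return -EFAULT;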
Signed-off-by: Thorsten Blum Reviewed-by: Shuai Xue Link: https://patch.msgid.link/20250903224913.242928-2-thorsten.blum@linux.dev [ rjw: Changelog edit ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/erst-dbg.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c index 246076341e8cc0..ff0e8bf8e97ac8 100644 --- a/drivers/acpi/apei/erst-dbg.c +++ b/drivers/acpi/apei/erst-dbg.c @@ -60,9 +60,8 @@ static long erst_dbg_ioctl(struct file *f, unsigned int cmd, unsigned long arg) switch (cmd) { case APEI_ERST_CLEAR_RECORD: - rc = copy_from_user(&record_id, (void __user *)arg, - sizeof(record_id)); - if (rc) + if (copy_from_user(&record_id, (void __user *)arg, + sizeof(record_id))) return -EFAULT; return erst_clear(record_id); case APEI_ERST_GET_RECORD_COUNT: @@ -175,8 +174,7 @@ static ssize_t erst_dbg_write(struct file *filp, const char __user *ubuf, erst_dbg_buf = p; erst_dbg_buf_len = usize; } - rc = copy_from_user(erst_dbg_buf, ubuf, usize); - if (rc) { + if (copy_from_user(erst_dbg_buf, ubuf, usize)) { rc = -EFAULT; goto out; } From 7d444f50999709d52a18ec7e88649ac936ab62dc Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Wed, 10 Sep 2025 04:45:31 +0000 Subject: [PATCH 1778/3206] ACPI: APEI: EINJ: Allow more types of addresses except MMIO The EINJ driver today only allows injection requests to go through for two kinds of IORESOURCE_MEM: IORES_DESC_PERSISTENT_MEMORY and IORES_DESC_SOFT_RESERVED. This check prevents users of EINJ from testing memory corruption in many interesting areas: - Legacy persistent memory - Memory claimed to be used by ACPI tables or NV storage - Kernel crash memory and others There is a need to test how the kernel behaves when something consumes memory errors in these memory regions. For example, if a certain ACPI table is corrupted, does the kernel crash gracefully to prevent "silent data corruption"? For another example, legacy persistent memory, when managed by Device DAX, does support recovering from a Machine Check Exception raised by memory failure, hence is worth testing. However, an attempt to inject a memory error via EINJ into legacy persistent memory or ACPI-owned memory fails with -EINVAL. Allow EINJ to inject at any address except MMIO. Leave it to the BIOS or firmware to decide what is a legitimate injection target. In addition to the test done in [1], on a machine having the following iomem resources: ... 01000000-08ffffff : Crash kernel 768f0098-768f00a7 : APEI EINJ ... 768f4000-77323fff : ACPI Non-volatile Storage 77324000-777fefff : ACPI Tables 777ff000-777fffff : System RAM 77800000-7fffffff : Reserved 80000000-8fffffff : PCI MMCONFIG 0000 [bus 00-ff] 90040000-957fffff : PCI Bus 0000:00 ... 300000000-3ffffffff : Persistent Memory (legacy) ... I commented out __einj_error_inject() during the test and just tested when injecting a memory error at each start address shown above: - 0x80000000 and 0x90040000 both failed with EINVAL - the request passed through for all other addresses Link: https://lore.kernel.org/linux-acpi/20250825223348.3780279-1-jiaqiyan@google.com [1] Signed-off-by: Jiaqi Yan Reviewed-by: Hanjun Guo Link: https://patch.msgid.link/20250910044531.264043-1-jiaqiyan@google.com [ rjw: Adding a link tag for the [1] reference ] Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/apei/einj-core.c | 51 ++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/apei/einj-core.c b/drivers/acpi/apei/einj-core.c index 2561b045acc7bc..3c87953dbd197a 100644 --- a/drivers/acpi/apei/einj-core.c +++ b/drivers/acpi/apei/einj-core.c @@ -656,6 +656,43 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, return rc; } +/* Allow almost all types of address except MMIO. */ +static bool is_allowed_range(u64 base_addr, u64 size) +{ + int i; + /* + * MMIO region is usually claimed with IORESOURCE_MEM + IORES_DESC_NONE. + * However, IORES_DESC_NONE is treated like a wildcard when we check if + * region intersects with known resource. So do an allow list check for + * IORES_DESCs that definitely or most likely not MMIO. + */ + int non_mmio_desc[] = { + IORES_DESC_CRASH_KERNEL, + IORES_DESC_ACPI_TABLES, + IORES_DESC_ACPI_NV_STORAGE, + IORES_DESC_PERSISTENT_MEMORY, + IORES_DESC_PERSISTENT_MEMORY_LEGACY, + /* Treat IORES_DESC_DEVICE_PRIVATE_MEMORY as MMIO. */ + IORES_DESC_RESERVED, + IORES_DESC_SOFT_RESERVED, + }; + + if (region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) + == REGION_INTERSECTS) + return true; + + for (i = 0; i < ARRAY_SIZE(non_mmio_desc); ++i) { + if (region_intersects(base_addr, size, IORESOURCE_MEM, non_mmio_desc[i]) + == REGION_INTERSECTS) + return true; + } + + if (arch_is_platform_page(base_addr)) + return true; + + return false; +} + /* Inject the specified hardware error */ int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3, u64 param4) @@ -702,19 +739,15 @@ int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3, * Disallow crazy address masks that give BIOS leeway to pick * injection address almost anywhere. Insist on page or * better granularity and that target address is normal RAM or - * NVDIMM. + * as long as is not MMIO. */ base_addr = param1 & param2; size = ~param2 + 1; - if (((param2 & PAGE_MASK) != PAGE_MASK) || - ((region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) - != REGION_INTERSECTS) && - (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY) - != REGION_INTERSECTS) && - (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_SOFT_RESERVED) - != REGION_INTERSECTS) && - !arch_is_platform_page(base_addr))) + if ((param2 & PAGE_MASK) != PAGE_MASK) + return -EINVAL; + + if (!is_allowed_range(base_addr, size)) return -EINVAL; if (is_zero_pfn(base_addr >> PAGE_SHIFT)) From bf4206d7ac278bbd1eb2cf02d25486c2cc61b298 Mon Sep 17 00:00:00 2001 From: Chen Pei Date: Wed, 10 Sep 2025 17:41:36 +0800 Subject: [PATCH 1779/3206] ACPI: SPCR: Add support for DBG2 RISC-V SBI port subtype Commit 4aca2bef90bd1296 ("ACPICA: Headers: Add RISC-V SBI Subtype to DBG2") added the definition of ACPI_DBG2_RISCV_SBI_CON. Continue to implement its function so that the parameters of UART can be configured correctly. Subsequent calls to setup_earlycon() will reuse the earlycon based on SBI. Signed-off-by: Chen Pei Reviewed-by: Guo Ren (Alibaba Damo Academy) Link: https://patch.msgid.link/20250910094136.4423-1-cp0613@linux.alibaba.com [ rjw: Changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/spcr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/acpi/spcr.c b/drivers/acpi/spcr.c index cd36a97b0ea2c7..208d6bbc65e037 100644 --- a/drivers/acpi/spcr.c +++ b/drivers/acpi/spcr.c @@ -141,6 +141,9 @@ int __init acpi_parse_spcr(bool enable_earlycon, bool enable_console) case ACPI_DBG2_16550_NVIDIA: uart = "uart"; break; + case ACPI_DBG2_RISCV_SBI_CON: + uart = "sbi"; + break; default: err = -ENOENT; goto done; From 995694ef91da7076ce235f028680fb31ad0b5c04 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 11 Sep 2025 14:04:53 +0200 Subject: [PATCH 1780/3206] cpufreq: ACPI: Use on_each_cpu_mask() in drv_write() Make drv_write() call on_each_cpu_mask() instead of using an open-coded equivalent of the latter. Also remove a comment mentioning the smp_call_function_many() usage which is not particularly useful anyway. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) --- drivers/cpufreq/acpi-cpufreq.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 4f7f9201598dc4..083d8369a59121 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -318,7 +318,6 @@ static u32 drv_read(struct acpi_cpufreq_data *data, const struct cpumask *mask) return cmd.val; } -/* Called via smp_call_function_many(), on the target CPUs */ static void do_drv_write(void *_cmd) { struct drv_cmd *cmd = _cmd; @@ -335,14 +334,8 @@ static void drv_write(struct acpi_cpufreq_data *data, .val = val, .func.write = data->cpu_freq_write, }; - int this_cpu; - this_cpu = get_cpu(); - if (cpumask_test_cpu(this_cpu, mask)) - do_drv_write(&cmd); - - smp_call_function_many(mask, do_drv_write, &cmd, 1); - put_cpu(); + on_each_cpu_mask(mask, do_drv_write, &cmd, true); } static u32 get_cur_val(const struct cpumask *mask, struct acpi_cpufreq_data *data) From b13448dd64e27752fad252cec7da1a50ab9f0b6f Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 12 Sep 2025 12:18:16 -0700 Subject: [PATCH 1781/3206] bpf: potential double-free of env->insn_aux_data Function bpf_patch_insn_data() has the following structure: static struct bpf_prog *bpf_patch_insn_data(... env ...) { struct bpf_prog *new_prog; struct bpf_insn_aux_data *new_data = NULL; if (len > 1) { new_data = vrealloc(...); // <--------- (1) if (!new_data) return NULL; env->insn_aux_data = new_data; // <---- (2) } new_prog = bpf_patch_insn_single(env->prog, off, patch, len); if (IS_ERR(new_prog)) { ... vfree(new_data); // <----------------- (3) return NULL; } ... happy path ... } If bpf_patch_insn_single() returns an error, the `new_data` allocated at (1) will be freed at (3). However, at (2) this pointer is stored in `env->insn_aux_data`, which is freed unconditionally by verifier.c:bpf_check() on both happy and error paths, leading to a double-free. Fix this by removing the vfree() call at (3); ownership of `new_data` has already been passed to `env->insn_aux_data` at this point. 
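After the fix, the error path (sketched against the structure shown above) leaves the freeing entirely to bpf_check():

	new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
	if (IS_ERR(new_prog)) {
		...
		/* no vfree(new_data): env->insn_aux_data took ownership
		 * at (2), and bpf_check() frees it on every exit path */
		return NULL;
	}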
Fixes: 77620d126739 ("bpf: use realloc in bpf_patch_insn_data") Reported-by: Chris Mason Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250912-patch-insn-data-double-free-v1-1-af05bd85a21a@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17fe623400a5bc..1029380f84db3e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20800,7 +20800,6 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of verbose(env, "insn %d cannot be patched due to 16-bit range\n", env->insn_aux_data[off].orig_idx); - vfree(new_data); return NULL; } adjust_insn_aux_data(env, new_prog, off, len); From c1b6b8c7706354b73196649c46b5e6d4d61c2f5c Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Wed, 10 Sep 2025 12:27:05 +0530 Subject: [PATCH 1782/3206] drm/amdgpu/gfx11: Add Cleaner Shader Support for GFX11.0.1/11.0.4 GPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable the cleaner shader for additional GFX11.0.1/11.0.4 series GPUs to ensure data isolation among GPU tasks. The cleaner shader is tasked with clearing the Local Data Store (LDS), Vector General Purpose Registers (VGPRs), and Scalar General Purpose Registers (SGPRs), which helps avoid data leakage and guarantees the accuracy of computational results. This update extends cleaner shader support to GFX11.0.1/11.0.4 GPUs, previously available for GFX11.0.3. It enhances security by clearing GPU memory between processes and maintains a consistent GPU state across KGD and KFD workloads. Cc: Wasee Alam Cc: Mario Sopena-Novales Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 0a71ceb27f88a944c2de2808b67b2f46ac75076b) --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index c85de8c8f6f50b..c37527704d4332 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1654,6 +1654,21 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) } } break; + case IP_VERSION(11, 0, 1): + case IP_VERSION(11, 0, 4): + adev->gfx.cleaner_shader_ptr = gfx_11_0_3_cleaner_shader_hex; + adev->gfx.cleaner_shader_size = sizeof(gfx_11_0_3_cleaner_shader_hex); + if (adev->gfx.pfp_fw_version >= 102 && + adev->gfx.mec_fw_version >= 66 && + adev->mes.fw_version[0] >= 128) { + adev->gfx.enable_cleaner_shader = true; + r = amdgpu_gfx_cleaner_shader_sw_init(adev, adev->gfx.cleaner_shader_size); + if (r) { + adev->gfx.enable_cleaner_shader = false; + dev_err(adev->dev, "Failed to initialize cleaner shader\n"); + } + } + break; case IP_VERSION(11, 5, 0): case IP_VERSION(11, 5, 1): adev->gfx.cleaner_shader_ptr = gfx_11_0_3_cleaner_shader_hex; From 29a2f430475357f760679b249f33e7282688e292 Mon Sep 17 00:00:00 2001 From: Ivan Lipski Date: Tue, 2 Sep 2025 16:20:09 -0400 Subject: [PATCH 1783/3206] drm/amd/display: Allow RX6xxx & RX7700 to invoke amdgpu_irq_get/put [Why&How] As reported on https://gitlab.freedesktop.org/drm/amd/-/issues/3936, SMU hang can occur if the interrupts are not enabled appropriately, causing a vblank timeout. 
This patch reverts commit 5009628d8509 ("drm/amd/display: Remove unnecessary amdgpu_irq_get/put"), but only for RX6xxx & RX7700 GPUs, on which the issue was observed. This will re-enable interrupts regardless of whether the user space needed it or not. Fixes: 5009628d8509 ("drm/amd/display: Remove unnecessary amdgpu_irq_get/put") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3936 Suggested-by: Sun peng Li Reviewed-by: Sun peng Li Signed-off-by: Ivan Lipski Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 95d168b367aa28a59f94fc690ff76ebf69312c6d) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 4e86370ae70599..97d9eba1796357 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8717,7 +8717,16 @@ static int amdgpu_dm_encoder_init(struct drm_device *dev, static void manage_dm_interrupts(struct amdgpu_device *adev, struct amdgpu_crtc *acrtc, struct dm_crtc_state *acrtc_state) -{ +{ /* + * We cannot be sure that the frontend index maps to the same + * backend index - some even map to more than one. + * So we have to go through the CRTC to find the right IRQ. + */ + int irq_type = amdgpu_display_crtc_idx_to_irq_type( + adev, + acrtc->crtc_id); + struct drm_device *dev = adev_to_drm(adev); + struct drm_vblank_crtc_config config = {0}; struct dc_crtc_timing *timing; int offdelay; @@ -8770,7 +8779,35 @@ static void manage_dm_interrupts(struct amdgpu_device *adev, drm_crtc_vblank_on_config(&acrtc->base, &config); + /* Allow RX6xxx, RX7700, RX7800 GPUs to call amdgpu_irq_get.*/ + switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) { + case IP_VERSION(3, 0, 0): + case IP_VERSION(3, 0, 2): + case IP_VERSION(3, 0, 3): + case IP_VERSION(3, 2, 0): + if (amdgpu_irq_get(adev, &adev->pageflip_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot get pageflip irq!\n"); +#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) + if (amdgpu_irq_get(adev, &adev->vline0_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot get vline0 irq!\n"); +#endif + } + } else { + /* Allow RX6xxx, RX7700, RX7800 GPUs to call amdgpu_irq_put.*/ + switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) { + case IP_VERSION(3, 0, 0): + case IP_VERSION(3, 0, 2): + case IP_VERSION(3, 0, 3): + case IP_VERSION(3, 2, 0): +#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) + if (amdgpu_irq_put(adev, &adev->vline0_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot put vline0 irq!\n"); +#endif + if (amdgpu_irq_put(adev, &adev->pageflip_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot put pageflip irq!\n"); + } + drm_crtc_vblank_off(&acrtc->base); } } From a318eb807825d71900e212f5aab3469e86feff8e Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Sat, 13 Sep 2025 08:40:53 -0700 Subject: [PATCH 1784/3206] vhost-scsi: fix argument order in tport allocation error message The error log in vhost_scsi_make_tport() prints the arguments in the wrong order, producing confusing output. For example, when creating a target with a name in WWNN format such as "fc.port1234", the log looks like: Emulated fc.port1234 Address: FCP, exceeds max: 64 Instead, the message should report the emulated protocol type first, followed by the configfs name as: Emulated FCP Address: fc.port1234, exceeds max: 64 Fix the argument order so the error log is consistent and clear. 
Signed-off-by: Alok Tiwari Message-Id: <20250913154106.3995856-1-alok.a.tiwari@oracle.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi --- drivers/vhost/scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index abf51332a5c559..98e4f68f4e3cb6 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -2884,7 +2884,7 @@ vhost_scsi_make_tport(struct target_fabric_configfs *tf, check_len: if (strlen(name) >= VHOST_SCSI_NAMELEN) { pr_err("Emulated %s Address: %s, exceeds" - " max: %d\n", name, vhost_scsi_dump_proto_id(tport), + " max: %d\n", vhost_scsi_dump_proto_id(tport), name, VHOST_SCSI_NAMELEN); kfree(tport); return ERR_PTR(-EINVAL); From 4351ca3fcb3ffecf12631b4996bf085a2dad0db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Thu, 11 Sep 2025 15:33:34 +0200 Subject: [PATCH 1785/3206] rds: ib: Increment i_fastreg_wrs before bailing out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to increment i_fastreg_wrs before we bail out from rds_ib_post_reg_frmr(). We have a fixed budget of how many FRWR operations that can be outstanding using the dedicated QP used for memory registrations and de-registrations. This budget is enforced by the atomic_t i_fastreg_wrs. If we bail out early in rds_ib_post_reg_frmr(), we will "leak" the possibility of posting an FRWR operation, and if that accumulates, no FRWR operation can be carried out. Fixes: 1659185fb4d0 ("RDS: IB: Support Fastreg MR (FRMR) memory registration mode") Fixes: 3a2886cca703 ("net/rds: Keep track of and wait for FRWR segments in use upon shutdown") Cc: stable@vger.kernel.org Signed-off-by: Håkon Bugge Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20250911133336.451212-1-haakon.bugge@oracle.com Signed-off-by: Jakub Kicinski --- net/rds/ib_frmr.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 28c1b00221780f..bd861191157b54 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -133,12 +133,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len, &off, PAGE_SIZE); - if (unlikely(ret != ibmr->sg_dma_len)) - return ret < 0 ? ret : -EINVAL; + if (unlikely(ret != ibmr->sg_dma_len)) { + ret = ret < 0 ? 
ret : -EINVAL; + goto out_inc; + } - if (cmpxchg(&frmr->fr_state, - FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) - return -EBUSY; + if (cmpxchg(&frmr->fr_state, FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) { + ret = -EBUSY; + goto out_inc; + } atomic_inc(&ibmr->ic->i_fastreg_inuse_count); @@ -166,11 +169,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) /* Failure here can be because of -ENOMEM as well */ rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE); - atomic_inc(&ibmr->ic->i_fastreg_wrs); if (printk_ratelimit()) pr_warn("RDS/IB: %s returned error(%d)\n", __func__, ret); - goto out; + goto out_inc; } /* Wait for the registration to complete in order to prevent an invalid @@ -179,8 +181,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) */ wait_event(frmr->fr_reg_done, !frmr->fr_reg); -out: + return ret; +out_inc: + atomic_inc(&ibmr->ic->i_fastreg_wrs); return ret; } From 80c5b023a7d6ae41bd79aadece4cb1fc62e95a08 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:42 +0900 Subject: [PATCH 1786/3206] firewire: core: use scoped_guard() to manage critical section to update topology At present, the guard() macro is used for the critical section that updates the topology. This makes it inconvenient to add other critical sections to the function. This commit uses the scoped_guard() macro instead. Link: https://lore.kernel.org/r/20250915234747.915922-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-topology.c | 74 +++++++++++++++----------------- 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 5f8fb1201d8016..17aaf14cab0b9e 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -458,46 +458,40 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, trace_bus_reset_handle(card->index, generation, node_id, bm_abdicate, self_ids, self_id_count); - guard(spinlock_irqsave)(&card->lock); - - /* - * If the selfID buffer is not the immediate successor of the - * previously processed one, we cannot reliably compare the - * old and new topologies. - */ - if (!is_next_generation(generation, card->generation) && - card->local_node != NULL) { - fw_destroy_nodes(card); - card->bm_retries = 0; - } - - card->broadcast_channel_allocated = card->broadcast_channel_auto_allocated; - card->node_id = node_id; - /* - * Update node_id before generation to prevent anybody from using - * a stale node_id together with a current generation. - */ - smp_wmb(); - card->generation = generation; - card->reset_jiffies = get_jiffies_64(); - card->bm_node_id = 0xffff; - card->bm_abdicate = bm_abdicate; - fw_schedule_bm_work(card, 0); - - local_node = build_tree(card, self_ids, self_id_count, generation); - - update_topology_map(card, self_ids, self_id_count); - - card->color++; - - if (local_node == NULL) { - fw_err(card, "topology build failed\n"); - /* FIXME: We need to issue a bus reset in this case. */ - } else if (card->local_node == NULL) { - card->local_node = local_node; - for_each_fw_node(card, local_node, report_found_node); - } else { - update_tree(card, local_node); + scoped_guard(spinlock, &card->lock) { + // If the selfID buffer is not the immediate successor of the + // previously processed one, we cannot reliably compare the + // old and new topologies. 
+ if (!is_next_generation(generation, card->generation) && card->local_node != NULL) { + fw_destroy_nodes(card); + card->bm_retries = 0; + } + card->broadcast_channel_allocated = card->broadcast_channel_auto_allocated; + card->node_id = node_id; + // Update node_id before generation to prevent anybody from using + // a stale node_id together with a current generation. + smp_wmb(); + card->generation = generation; + card->reset_jiffies = get_jiffies_64(); + card->bm_node_id = 0xffff; + card->bm_abdicate = bm_abdicate; + fw_schedule_bm_work(card, 0); + + local_node = build_tree(card, self_ids, self_id_count, generation); + + update_topology_map(card, self_ids, self_id_count); + + card->color++; + + if (local_node == NULL) { + fw_err(card, "topology build failed\n"); + // FIXME: We need to issue a bus reset in this case. + } else if (card->local_node == NULL) { + card->local_node = local_node; + for_each_fw_node(card, local_node, report_found_node); + } else { + update_tree(card, local_node); + } } } EXPORT_SYMBOL(fw_core_handle_bus_reset); From 07c446e35b89bc8774792f8036e595cffdf5b162 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:43 +0900 Subject: [PATCH 1787/3206] firewire: core: maintain phy packet receivers locally in cdev layer The list of receivers for phy packets is used only by the cdev layer, while it is maintained as a member of the fw_card structure. This commit maintains the list locally in the cdev layer. Link: https://lore.kernel.org/r/20250915234747.915922-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 1 - drivers/firewire/core-cdev.c | 27 ++++++++++++++++++++------- include/linux/firewire.h | 2 -- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index b5e01a71114571..616abb836ef3dd 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -556,7 +556,6 @@ void fw_card_initialize(struct fw_card *card, kref_init(&card->kref); init_completion(&card->done); INIT_LIST_HEAD(&card->transaction_list); - INIT_LIST_HEAD(&card->phy_receiver_list); spin_lock_init(&card->lock); card->local_node = NULL; diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 1be8f5eb3e17dc..112b33099610cf 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -47,6 +47,9 @@ #define FW_CDEV_VERSION_AUTO_FLUSH_ISO_OVERFLOW 5 #define FW_CDEV_VERSION_EVENT_ASYNC_TSTAMP 6 +static DEFINE_SPINLOCK(phy_receiver_list_lock); +static LIST_HEAD(phy_receiver_list); + struct client { u32 version; struct fw_device *device; @@ -1669,15 +1672,16 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg) static int ioctl_receive_phy_packets(struct client *client, union ioctl_arg *arg) { struct fw_cdev_receive_phy_packets *a = &arg->receive_phy_packets; - struct fw_card *card = client->device->card; /* Access policy: Allow this ioctl only on local nodes' device files. */ if (!client->device->is_local) return -ENOSYS; - guard(spinlock_irq)(&card->lock); + // NOTE: This can be without irq when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. 
+ scoped_guard(spinlock_irq, &phy_receiver_list_lock) + list_move_tail(&client->phy_receiver_link, &phy_receiver_list); - list_move_tail(&client->phy_receiver_link, &card->phy_receiver_list); client->phy_receiver_closure = a->closure; return 0; @@ -1687,10 +1691,17 @@ void fw_cdev_handle_phy_packet(struct fw_card *card, struct fw_packet *p) { struct client *client; - guard(spinlock_irqsave)(&card->lock); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. + guard(spinlock_irqsave)(&phy_receiver_list_lock); + + list_for_each_entry(client, &phy_receiver_list, phy_receiver_link) { + struct inbound_phy_packet_event *e; + + if (client->device->card != card) + continue; - list_for_each_entry(client, &card->phy_receiver_list, phy_receiver_link) { - struct inbound_phy_packet_event *e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC); + e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC); if (e == NULL) break; @@ -1857,7 +1868,9 @@ static int fw_device_op_release(struct inode *inode, struct file *file) struct client_resource *resource; unsigned long index; - scoped_guard(spinlock_irq, &client->device->card->lock) + // NOTE: This can be without irq when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. + scoped_guard(spinlock_irq, &phy_receiver_list_lock) list_del(&client->phy_receiver_link); scoped_guard(mutex, &client->device->client_list_mutex) diff --git a/include/linux/firewire.h b/include/linux/firewire.h index d38c6e538e5c15..f3260aacf730ea 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -115,8 +115,6 @@ struct fw_card { int index; struct list_head link; - struct list_head phy_receiver_list; - struct delayed_work br_work; /* bus reset job */ bool br_short; From 7d138cb269dbd2fa9b0da89a9c10503d1cf269d5 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:44 +0900 Subject: [PATCH 1788/3206] firewire: core: use spin lock specific to topology map At present, the handling of read transactions to the topology map register is not protected by any kind of lock primitive. This is a potential problem: a reader may observe mixed contents of the topology map. This commit adds and uses a spin lock specific to the topology map. 
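The resulting pattern, condensed below (BUF_QUADLETS stands in for the real array size), pairs the buffer with a lock of its own so that the bus-reset writer and the transaction-layer reader serialize only against each other:

	struct {
		__be32 buffer[BUF_QUADLETS];
		spinlock_t lock;
	} topology_map;

	/* writer, in bus-reset handling */
	scoped_guard(spinlock, &card->topology_map.lock)
		update_topology_map(card->topology_map.buffer, ...);

	/* reader, when serving a read request */
	scoped_guard(spinlock_irqsave, &card->topology_map.lock)
		memcpy(payload, &card->topology_map.buffer[start], length);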
Link: https://lore.kernel.org/r/20250915234747.915922-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-topology.c | 22 ++++++++++++++-------- drivers/firewire/core-transaction.c | 6 +++++- include/linux/firewire.h | 6 +++++- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 17aaf14cab0b9e..c62cf93f3f65e1 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -435,20 +435,22 @@ static void update_tree(struct fw_card *card, struct fw_node *root) } } -static void update_topology_map(struct fw_card *card, - u32 *self_ids, int self_id_count) +static void update_topology_map(__be32 *buffer, size_t buffer_size, int root_node_id, + const u32 *self_ids, int self_id_count) { - int node_count = (card->root_node->node_id & 0x3f) + 1; - __be32 *map = card->topology_map; + __be32 *map = buffer; + int node_count = (root_node_id & 0x3f) + 1; + + memset(map, 0, buffer_size); *map++ = cpu_to_be32((self_id_count + 2) << 16); - *map++ = cpu_to_be32(be32_to_cpu(card->topology_map[1]) + 1); + *map++ = cpu_to_be32(be32_to_cpu(buffer[1]) + 1); *map++ = cpu_to_be32((node_count << 16) | self_id_count); while (self_id_count--) *map++ = cpu_to_be32p(self_ids++); - fw_compute_block_crc(card->topology_map); + fw_compute_block_crc(buffer); } void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, @@ -479,8 +481,6 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, local_node = build_tree(card, self_ids, self_id_count, generation); - update_topology_map(card, self_ids, self_id_count); - card->color++; if (local_node == NULL) { @@ -493,5 +493,11 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, update_tree(card, local_node); } } + + // Just used by transaction layer. + scoped_guard(spinlock, &card->topology_map.lock) { + update_topology_map(card->topology_map.buffer, sizeof(card->topology_map.buffer), + card->root_node->node_id, self_ids, self_id_count); + } } EXPORT_SYMBOL(fw_core_handle_bus_reset); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 623e1d9bd107f9..8edffafd21c1fc 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1196,7 +1196,11 @@ static void handle_topology_map(struct fw_card *card, struct fw_request *request } start = (offset - topology_map_region.start) / 4; - memcpy(payload, &card->topology_map[start], length); + + // NOTE: This can be without irqsave when we can guarantee that fw_send_request() for local + // destination never runs in any type of IRQ context. 
+ scoped_guard(spinlock_irqsave, &card->topology_map.lock) + memcpy(payload, &card->topology_map.buffer[start], length); fw_send_response(card, request, RCODE_COMPLETE); } diff --git a/include/linux/firewire.h b/include/linux/firewire.h index f3260aacf730ea..aeb71c39e57ee2 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -129,7 +129,11 @@ struct fw_card { bool broadcast_channel_allocated; u32 broadcast_channel; - __be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; + + struct { + __be32 buffer[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; + spinlock_t lock; + } topology_map; __be32 maint_utility_register; From 420bd7068cbfaea0a857472dd631dc48311e2a8f Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:45 +0900 Subject: [PATCH 1789/3206] firewire: core: use spin lock specific to transaction The list of asynchronous transactions waiting for response subactions is maintained as a member of the fw_card structure. The card-wide spinlock is currently used for every operation on the list; however, it is not necessarily suited for the purpose. This commit adds and uses a spin lock specific to maintaining the list. Link: https://lore.kernel.org/r/20250915234747.915922-5-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 12 ++++-- drivers/firewire/core-transaction.c | 59 ++++++++++++++++++----------- include/linux/firewire.h | 10 +++-- 3 files changed, 51 insertions(+), 30 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 616abb836ef3dd..39f95007305f23 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -544,8 +544,12 @@ void fw_card_initialize(struct fw_card *card, card->index = atomic_inc_return(&index); card->driver = driver; card->device = device; - card->current_tlabel = 0; - card->tlabel_mask = 0; + + card->transactions.current_tlabel = 0; + card->transactions.tlabel_mask = 0; + INIT_LIST_HEAD(&card->transactions.list); + spin_lock_init(&card->transactions.lock); + card->split_timeout_hi = DEFAULT_SPLIT_TIMEOUT / 8000; card->split_timeout_lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; card->split_timeout_cycles = DEFAULT_SPLIT_TIMEOUT; @@ -555,7 +559,7 @@ void fw_card_initialize(struct fw_card *card, kref_init(&card->kref); init_completion(&card->done); - INIT_LIST_HEAD(&card->transaction_list); + spin_lock_init(&card->lock); card->local_node = NULL; @@ -772,7 +776,7 @@ void fw_core_remove_card(struct fw_card *card) destroy_workqueue(card->isoc_wq); destroy_workqueue(card->async_wq); - WARN_ON(!list_empty(&card->transaction_list)); + WARN_ON(!list_empty(&card->transactions.list)); } EXPORT_SYMBOL(fw_core_remove_card); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 8edffafd21c1fc..5366d8a781ac79 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -49,12 +49,14 @@ static int close_transaction(struct fw_transaction *transaction, struct fw_card { struct fw_transaction *t = NULL, *iter; - scoped_guard(spinlock_irqsave, &card->lock) { - list_for_each_entry(iter, &card->transaction_list, link) { + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. 
+ scoped_guard(spinlock_irqsave, &card->transactions.lock) { + list_for_each_entry(iter, &card->transactions.list, link) { if (iter == transaction) { if (try_cancel_split_timeout(iter)) { list_del_init(&iter->link); - card->tlabel_mask &= ~(1ULL << iter->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << iter->tlabel); t = iter; } break; @@ -117,11 +119,11 @@ static void split_transaction_timeout_callback(struct timer_list *timer) struct fw_transaction *t = timer_container_of(t, timer, split_timeout_timer); struct fw_card *card = t->card; - scoped_guard(spinlock_irqsave, &card->lock) { + scoped_guard(spinlock_irqsave, &card->transactions.lock) { if (list_empty(&t->link)) return; list_del(&t->link); - card->tlabel_mask &= ~(1ULL << t->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << t->tlabel); } if (!t->with_tstamp) { @@ -259,18 +261,21 @@ static void fw_fill_request(struct fw_packet *packet, int tcode, int tlabel, } static int allocate_tlabel(struct fw_card *card) +__must_hold(&card->transactions_lock) { int tlabel; - tlabel = card->current_tlabel; - while (card->tlabel_mask & (1ULL << tlabel)) { + lockdep_assert_held(&card->transactions.lock); + + tlabel = card->transactions.current_tlabel; + while (card->transactions.tlabel_mask & (1ULL << tlabel)) { tlabel = (tlabel + 1) & 0x3f; - if (tlabel == card->current_tlabel) + if (tlabel == card->transactions.current_tlabel) return -EBUSY; } - card->current_tlabel = (tlabel + 1) & 0x3f; - card->tlabel_mask |= 1ULL << tlabel; + card->transactions.current_tlabel = (tlabel + 1) & 0x3f; + card->transactions.tlabel_mask |= 1ULL << tlabel; return tlabel; } @@ -331,7 +336,6 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode void *payload, size_t length, union fw_transaction_callback callback, bool with_tstamp, void *callback_data) { - unsigned long flags; int tlabel; /* @@ -339,11 +343,11 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode * the list while holding the card spinlock. */ - spin_lock_irqsave(&card->lock, flags); - - tlabel = allocate_tlabel(card); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) + tlabel = allocate_tlabel(card); if (tlabel < 0) { - spin_unlock_irqrestore(&card->lock, flags); if (!with_tstamp) { callback.without_tstamp(card, RCODE_SEND_ERROR, NULL, 0, callback_data); } else { @@ -368,15 +372,22 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode t->callback = callback; t->with_tstamp = with_tstamp; t->callback_data = callback_data; - - fw_fill_request(&t->packet, tcode, t->tlabel, destination_id, card->node_id, generation, - speed, offset, payload, length); t->packet.callback = transmit_complete_callback; - list_add_tail(&t->link, &card->transaction_list); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->lock) { + // The node_id field of fw_card can be updated when handling SelfIDComplete. + fw_fill_request(&t->packet, tcode, t->tlabel, destination_id, card->node_id, + generation, speed, offset, payload, length); + } - spin_unlock_irqrestore(&card->lock, flags); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. 
+ scoped_guard(spinlock_irqsave, &card->transactions.lock) + list_add_tail(&t->link, &card->transactions.list); + // Safe with no lock, since the index field of fw_card is immutable once assigned. trace_async_request_outbound_initiate((uintptr_t)t, card->index, generation, speed, t->packet.header, payload, tcode_is_read_request(tcode) ? 0 : length / 4); @@ -1111,12 +1122,14 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *p) break; } - scoped_guard(spinlock_irqsave, &card->lock) { - list_for_each_entry(iter, &card->transaction_list, link) { + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) { + list_for_each_entry(iter, &card->transactions.list, link) { if (iter->node_id == source && iter->tlabel == tlabel) { if (try_cancel_split_timeout(iter)) { list_del_init(&iter->link); - card->tlabel_mask &= ~(1ULL << iter->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << iter->tlabel); t = iter; } break; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index aeb71c39e57ee2..8d6801cf2fcabf 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -88,11 +88,15 @@ struct fw_card { int node_id; int generation; - int current_tlabel; - u64 tlabel_mask; - struct list_head transaction_list; u64 reset_jiffies; + struct { + int current_tlabel; + u64 tlabel_mask; + struct list_head list; + spinlock_t lock; + } transactions; + u32 split_timeout_hi; u32 split_timeout_lo; unsigned int split_timeout_cycles; From b5725cfa4120a4d234ab112aad151d731531d093 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:46 +0900 Subject: [PATCH 1790/3206] firewire: core: use spin lock specific to timer for split transaction At present the parameters to compute timeout time for split transaction is protected by card-wide spin lock, while it is not necessarily convenient in a point to narrower critical section. This commit adds and uses another spin lock specific for the purpose. 
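For illustration only, here is a minimal userspace sketch of the pattern
(pthread-based, with illustrative names rather than the kernel API): the
fields and the lock that protects them are grouped in one structure, so
the critical section covers exactly those fields instead of the whole
card.

	/* Userspace sketch; not the kernel code. Build with -lpthread. */
	#include <pthread.h>
	#include <stdio.h>

	struct split_timeout {
		unsigned int hi;
		unsigned int lo;
		unsigned int cycles;
		pthread_mutex_t lock;	/* protects hi, lo and cycles only */
	};

	static void split_timeout_update(struct split_timeout *st,
					 unsigned int hi, unsigned int lo)
	{
		pthread_mutex_lock(&st->lock);
		st->hi = hi;
		st->lo = lo;
		/* derived value recomputed under the same, narrow lock */
		st->cycles = hi * 8000 + (lo >> 19);
		pthread_mutex_unlock(&st->lock);
	}

	int main(void)
	{
		struct split_timeout st = { .lock = PTHREAD_MUTEX_INITIALIZER };

		split_timeout_update(&st, 1, 0);
		printf("cycles = %u\n", st.cycles);
		return 0;
	}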
Link: https://lore.kernel.org/r/20250915234747.915922-6-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 10 +++-- drivers/firewire/core-transaction.c | 63 +++++++++++++++++++---------- include/linux/firewire.h | 15 ++++--- 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 39f95007305f23..96495392a1f6d3 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -550,10 +550,12 @@ void fw_card_initialize(struct fw_card *card, INIT_LIST_HEAD(&card->transactions.list); spin_lock_init(&card->transactions.lock); - card->split_timeout_hi = DEFAULT_SPLIT_TIMEOUT / 8000; - card->split_timeout_lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; - card->split_timeout_cycles = DEFAULT_SPLIT_TIMEOUT; - card->split_timeout_jiffies = isoc_cycles_to_jiffies(DEFAULT_SPLIT_TIMEOUT); + card->split_timeout.hi = DEFAULT_SPLIT_TIMEOUT / 8000; + card->split_timeout.lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; + card->split_timeout.cycles = DEFAULT_SPLIT_TIMEOUT; + card->split_timeout.jiffies = isoc_cycles_to_jiffies(DEFAULT_SPLIT_TIMEOUT); + spin_lock_init(&card->split_timeout.lock); + card->color = 0; card->broadcast_channel = BROADCAST_CHANNEL_INITIAL; diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 5366d8a781ac79..dd3656a0c1ff0d 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -137,14 +137,18 @@ static void split_transaction_timeout_callback(struct timer_list *timer) static void start_split_transaction_timeout(struct fw_transaction *t, struct fw_card *card) { - guard(spinlock_irqsave)(&card->lock); + unsigned long delta; if (list_empty(&t->link) || WARN_ON(t->is_split_transaction)) return; t->is_split_transaction = true; - mod_timer(&t->split_timeout_timer, - jiffies + card->split_timeout_jiffies); + + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) + delta = card->split_timeout.jiffies; + mod_timer(&t->split_timeout_timer, jiffies + delta); } static u32 compute_split_timeout_timestamp(struct fw_card *card, u32 request_timestamp); @@ -164,8 +168,12 @@ static void transmit_complete_callback(struct fw_packet *packet, break; case ACK_PENDING: { - t->split_timeout_cycle = - compute_split_timeout_timestamp(card, packet->timestamp) & 0xffff; + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. 
+ scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + t->split_timeout_cycle = + compute_split_timeout_timestamp(card, packet->timestamp) & 0xffff; + } start_split_transaction_timeout(t, card); break; } @@ -790,11 +798,14 @@ EXPORT_SYMBOL(fw_fill_response); static u32 compute_split_timeout_timestamp(struct fw_card *card, u32 request_timestamp) +__must_hold(&card->split_timeout.lock) { unsigned int cycles; u32 timestamp; - cycles = card->split_timeout_cycles; + lockdep_assert_held(&card->split_timeout.lock); + + cycles = card->split_timeout.cycles; cycles += request_timestamp & 0x1fff; timestamp = request_timestamp & ~0x1fff; @@ -845,9 +856,12 @@ static struct fw_request *allocate_request(struct fw_card *card, return NULL; kref_init(&request->kref); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) + request->response.timestamp = compute_split_timeout_timestamp(card, p->timestamp); + request->response.speed = p->speed; - request->response.timestamp = - compute_split_timeout_timestamp(card, p->timestamp); request->response.generation = p->generation; request->response.ack = 0; request->response.callback = free_response_callback; @@ -1228,16 +1242,17 @@ static const struct fw_address_region registers_region = .end = CSR_REGISTER_BASE | CSR_CONFIG_ROM, }; static void update_split_timeout(struct fw_card *card) +__must_hold(&card->split_timeout.lock) { unsigned int cycles; - cycles = card->split_timeout_hi * 8000 + (card->split_timeout_lo >> 19); + cycles = card->split_timeout.hi * 8000 + (card->split_timeout.lo >> 19); /* minimum per IEEE 1394, maximum which doesn't overflow OHCI */ cycles = clamp(cycles, 800u, 3u * 8000u); - card->split_timeout_cycles = cycles; - card->split_timeout_jiffies = isoc_cycles_to_jiffies(cycles); + card->split_timeout.cycles = cycles; + card->split_timeout.jiffies = isoc_cycles_to_jiffies(cycles); } static void handle_registers(struct fw_card *card, struct fw_request *request, @@ -1287,12 +1302,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, case CSR_SPLIT_TIMEOUT_HI: if (tcode == TCODE_READ_QUADLET_REQUEST) { - *data = cpu_to_be32(card->split_timeout_hi); + *data = cpu_to_be32(card->split_timeout.hi); } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { - guard(spinlock_irqsave)(&card->lock); - - card->split_timeout_hi = be32_to_cpu(*data) & 7; - update_split_timeout(card); + // NOTE: This can be without irqsave when we can guarantee that + // __fw_send_request() for local destination never runs in any type of IRQ + // context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + card->split_timeout.hi = be32_to_cpu(*data) & 7; + update_split_timeout(card); + } } else { rcode = RCODE_TYPE_ERROR; } @@ -1300,12 +1318,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, case CSR_SPLIT_TIMEOUT_LO: if (tcode == TCODE_READ_QUADLET_REQUEST) { - *data = cpu_to_be32(card->split_timeout_lo); + *data = cpu_to_be32(card->split_timeout.lo); } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { - guard(spinlock_irqsave)(&card->lock); - - card->split_timeout_lo = be32_to_cpu(*data) & 0xfff80000; - update_split_timeout(card); + // NOTE: This can be without irqsave when we can guarantee that + // __fw_send_request() for local destination never runs in any type of IRQ + // context. 
+ scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + card->split_timeout.lo = be32_to_cpu(*data) & 0xfff80000; + update_split_timeout(card); + } } else { rcode = RCODE_TYPE_ERROR; } diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 8d6801cf2fcabf..6d208769d45627 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -97,18 +97,21 @@ struct fw_card { spinlock_t lock; } transactions; - u32 split_timeout_hi; - u32 split_timeout_lo; - unsigned int split_timeout_cycles; - unsigned int split_timeout_jiffies; + struct { + u32 hi; + u32 lo; + unsigned int cycles; + unsigned int jiffies; + spinlock_t lock; + } split_timeout; unsigned long long guid; unsigned max_receive; int link_speed; int config_rom_generation; - spinlock_t lock; /* Take this lock when handling the lists in - * this struct. */ + spinlock_t lock; + struct fw_node *local_node; struct fw_node *root_node; struct fw_node *irm_node; From e0cda0dd12e08ecb8d26b8d78dc63e67e7069510 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:47 +0900 Subject: [PATCH 1791/3206] firewire: core: annotate fw_destroy_nodes with must-hold-lock The function, fw_destroy_nodes(), is used widely within firewire-core module. It has a prerequisite condition that struct fw_card.lock must be hold in advance. This commit adds annotation for it. Link: https://lore.kernel.org/r/20250915234747.915922-7-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-topology.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index c62cf93f3f65e1..8fa0772ee723f4 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -325,9 +325,11 @@ static void report_found_node(struct fw_card *card, card->bm_retries = 0; } -/* Must be called with card->lock held */ void fw_destroy_nodes(struct fw_card *card) +__must_hold(&card->lock) { + lockdep_assert_held(&card->lock); + card->color++; if (card->local_node != NULL) for_each_fw_node(card, card->local_node, report_lost_node); From 35ae4e86292ef7dfe4edbb9942955c884e984352 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 10 Sep 2025 02:43:34 +0000 Subject: [PATCH 1792/3206] bonding: set random address only when slaves already exist After commit 5c3bf6cba791 ("bonding: assign random address if device address is same as bond"), bonding will erroneously randomize the MAC address of the first interface added to the bond if fail_over_mac = follow. Correct this by additionally testing for the bond being empty before randomizing the MAC. 
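For illustration, a userspace sketch of the corrected decision (all
names are illustrative, not the driver's): the MAC of a newly enslaved
device is only randomized when the bond already has slaves, so the
first slave keeps its permanent address even when it matches the
bond's.

	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	#define ETH_ALEN 6

	static bool should_randomize(const unsigned char *slave_mac,
				     const unsigned char *bond_mac,
				     int slave_count)
	{
		/* an empty bond means this is the first slave: keep its MAC */
		return slave_count > 0 &&
		       memcmp(slave_mac, bond_mac, ETH_ALEN) == 0;
	}

	int main(void)
	{
		unsigned char mac[ETH_ALEN] = { 0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x01 };

		printf("first slave, same mac: %d\n", should_randomize(mac, mac, 0));
		printf("second slave, same mac: %d\n", should_randomize(mac, mac, 1));
		return 0;
	}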
Fixes: 5c3bf6cba791 ("bonding: assign random address if device address is same as bond") Reported-by: Qiuling Ren Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250910024336.400253-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 257333c8871092..8832bc9f107bc0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2132,6 +2132,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len); } else if (bond->params.fail_over_mac == BOND_FOM_FOLLOW && BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && + bond_has_slaves(bond) && memcmp(slave_dev->dev_addr, bond_dev->dev_addr, bond_dev->addr_len) == 0) { /* Set slave to random address to avoid duplicate mac * address in later fail over. From 71379e1c95af2c57567fcac24184c94cb7de4cd6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 10 Sep 2025 02:43:35 +0000 Subject: [PATCH 1793/3206] selftests: bonding: add fail_over_mac testing Add a test to check each value of bond fail_over_mac option. Also fix a minor garp_test print issue. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250910024336.400253-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- .../drivers/net/bonding/bond_options.sh | 139 +++++++++++++++++- .../drivers/net/bonding/bond_topo_2d1c.sh | 3 + .../drivers/net/bonding/bond_topo_3d1c.sh | 2 + 3 files changed, 142 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index 7bc148889ca729..e3f3cc803b562c 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -7,6 +7,7 @@ ALL_TESTS=" prio arp_validate num_grat_arp + fail_over_mac " lib_dir=$(dirname "$0") @@ -352,8 +353,8 @@ garp_test() exp_num=$(echo "${param}" | cut -f6 -d ' ') active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") - slowwait_for_counter $((exp_num + 5)) $exp_num \ - tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" + slowwait_for_counter $((exp_num + 5)) $exp_num tc_rule_handle_stats_get \ + "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" &> /dev/null # check result real_num=$(tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}") @@ -376,6 +377,140 @@ num_grat_arp() done } +check_all_mac_same() +{ + RET=0 + # all slaves should have same mac address (with the first port's mac) + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + local eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + local eth1_mac=$(ip -n "$s_ns" -j link show eth1 | jq -r '.[]["address"]') + local eth2_mac=$(ip -n "$s_ns" -j link show eth2 | jq -r '.[]["address"]') + if [ "$bond_mac" != "${mac[0]}" ] || [ "$eth0_mac" != "$bond_mac" ] || \ + [ "$eth1_mac" != "$bond_mac" ] || [ "$eth2_mac" != "$bond_mac" ]; then + RET=1 + fi +} + +check_bond_mac_same_with_first() +{ + RET=0 + # bond mac address should be same with the first added slave + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + if [ "$bond_mac" != "${mac[0]}" ]; then + RET=1 + fi +} + +check_bond_mac_same_with_active() +{ + RET=0 + # bond mac address 
should be same with active slave + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + local active_slave_mac=$(ip -n "$s_ns" -j link show "$active_slave" | jq -r '.[]["address"]') + if [ "$bond_mac" != "$active_slave_mac" ]; then + RET=1 + fi +} + +check_backup_slave_mac_not_change() +{ + RET=0 + # backup slave's mac address is not changed + if ip -n "$s_ns" -d -j link show type bond_slave | jq -e '.[] + | select(.linkinfo.info_slave_data.state=="BACKUP") + | select(.address != .linkinfo.info_slave_data.perm_hwaddr)' &> /dev/null; then + RET=1 + fi +} + +check_backup_slave_mac_inherit() +{ + local backup_mac + RET=0 + + # backup slaves should use mac[1] or mac[2] + local backup_macs=$(ip -n "$s_ns" -d -j link show type bond_slave | \ + jq -r '.[] | select(.linkinfo.info_slave_data.state=="BACKUP") | .address') + for backup_mac in $backup_macs; do + if [ "$backup_mac" != "${mac[1]}" ] && [ "$backup_mac" != "${mac[2]}" ]; then + RET=1 + fi + done +} + +check_first_slave_random_mac() +{ + RET=0 + # remove the first added slave and added it back + ip -n "$s_ns" link set eth0 nomaster + ip -n "$s_ns" link set eth0 master bond0 + + # the first slave should use random mac address + eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + [ "$eth0_mac" = "${mac[0]}" ] && RET=1 + log_test "bond fail_over_mac follow" "random first slave mac" + + # remove the first slave, the permanent MAC address should be restored back + ip -n "$s_ns" link set eth0 nomaster + eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + [ "$eth0_mac" != "${mac[0]}" ] && RET=1 +} + +do_active_backup_failover() +{ + local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + ip -n ${s_ns} link set ${active_slave} down + slowwait 2 active_slave_changed $active_slave + ip -n ${s_ns} link set ${active_slave} up +} + +fail_over_mac() +{ + # Bring down the first interface on the switch to force the bond to + # select another active interface instead of the first one that joined. 
+ ip -n "$g_ns" link set s0 down + + # fail_over_mac none + bond_reset "mode active-backup miimon 100 fail_over_mac 0" + check_all_mac_same + log_test "fail_over_mac 0" "all slaves have same mac" + do_active_backup_failover + check_all_mac_same + log_test "fail_over_mac 0" "failover: all slaves have same mac" + + # fail_over_mac active + bond_reset "mode active-backup miimon 100 fail_over_mac 1" + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "backup slave mac is not changed" + do_active_backup_failover + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "failover: backup slave mac is not changed" + + # fail_over_mac follow + bond_reset "mode active-backup miimon 100 fail_over_mac 2" + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "backup slave mac inherit" + do_active_backup_failover + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "failover: bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "failover: backup slave mac inherit" + check_first_slave_random_mac + log_test "fail_over_mac 2" "first slave mac random" + +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh index 195ef83cfbf167..167aa4a4a12a42 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh @@ -39,6 +39,8 @@ g_ip4="192.0.2.254" s_ip6="2001:db8::1" c_ip6="2001:db8::10" g_ip6="2001:db8::254" +mac[0]="00:0a:0b:0c:0d:01" +mac[1]="00:0a:0b:0c:0d:02" gateway_create() { @@ -62,6 +64,7 @@ server_create() for i in $(seq 0 1); do ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh index 3a1333d9a85b36..23a2932301cc0d 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh @@ -26,6 +26,7 @@ # +-------------------------------------+ source bond_topo_2d1c.sh +mac[2]="00:0a:0b:0c:0d:03" setup_prepare() { @@ -36,6 +37,7 @@ setup_prepare() # Add the extra device as we use 3 down links for bond0 local i=2 ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 ip -n ${s_ns} link set eth${i} master bond0 From f755be0b1ff429a2ecf709beeb1bcd7abc111c2b Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:50 +0200 Subject: [PATCH 1794/3206] mptcp: propagate shutdown to subflows when possible When the MPTCP DATA FIN have been ACKed, there is no more MPTCP related 
metadata to exchange, and all subflows can be safely shutdown. Before this patch, the subflows were actually terminated at 'close()' time. That's certainly fine most of the time, but not when the userspace 'shutdown()' a connection, without close()ing it. When doing so, the subflows were staying in LAST_ACK state on one side -- and consequently in FIN_WAIT2 on the other side -- until the 'close()' of the MPTCP socket. Now, when the DATA FIN have been ACKed, all subflows are shutdown. A consequence of this is that the TCP 'FIN' flag can be set earlier now, but the end result is the same. This affects the packetdrill tests looking at the end of the MPTCP connections, but for a good reason. Note that tcp_shutdown() will check the subflow state, so no need to do that again before calling it. Fixes: 3721b9b64676 ("mptcp: Track received DATA_FIN sequence number and add related helpers") Cc: stable@vger.kernel.org Fixes: 16a9a9da1723 ("mptcp: Add helper to process acks of DATA_FIN") Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-1-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e6fd97b21e9eea..5e497a83e9675b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -371,6 +371,20 @@ static void mptcp_close_wake_up(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } +static void mptcp_shutdown_subflows(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + slow = lock_sock_fast(ssk); + tcp_shutdown(ssk, SEND_SHUTDOWN); + unlock_sock_fast(ssk, slow); + } +} + /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { @@ -395,6 +409,7 @@ static void mptcp_check_data_fin_ack(struct sock *sk) break; case TCP_CLOSING: case TCP_LAST_ACK: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; } @@ -563,6 +578,7 @@ static bool mptcp_check_data_fin(struct sock *sk) mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; default: From 14e22b43df25dbd4301351b882486ea38892ae4f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:51 +0200 Subject: [PATCH 1795/3206] selftests: mptcp: connect: catch IO errors on listen side IO errors were correctly printed to stderr, and propagated up to the main loop for the server side, but the returned value was ignored. As a consequence, the program for the listener side was no longer exiting with an error code in case of IO issues. Because of that, some issues might not have been seen. But very likely, most issues either had an effect on the client side, or the file transfer was not the expected one, e.g. the connection got reset before the end. Still, it is better to fix this. The main consequence of this issue is the error that was reported by the selftests: the received and sent files were different, and the MIB counters were not printed. Also, when such errors happened during the 'disconnect' tests, the program tried to continue until the timeout. Now when an IO error is detected, the program exits directly with an error. 
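As a rough sketch of the control flow (plain C, not the selftest
itself): the helper's return value is kept, and the accept loop only
repeats on success, so an IO error terminates the listener with a
non-zero exit code instead of being silently dropped.

	#include <stdio.h>

	static int copy_data(int conn)
	{
		/* stub: pretend the transfer fails for even connection ids */
		return (conn % 2 == 0) ? -1 : 0;
	}

	static int server_loop(int repeat)
	{
		int conn = 1;
		int err = 0;

		while (repeat-- > 0) {
			err = copy_data(conn++);
			if (err)	/* propagate instead of ignoring */
				break;
		}
		return err ? 1 : 0;
	}

	int main(void)
	{
		printf("exit code: %d\n", server_loop(3));
		return 0;
	}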
Fixes: 05be5e273c84 ("selftests: mptcp: add disconnect tests") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-2-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 4f07ac9fa207cb..1408698df09997 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1093,6 +1093,7 @@ int main_loop_s(int listensock) struct pollfd polls; socklen_t salen; int remotesock; + int err = 0; int fd = 0; again: @@ -1125,7 +1126,7 @@ int main_loop_s(int listensock) SOCK_TEST_TCPULP(remotesock, 0); memset(&winfo, 0, sizeof(winfo)); - copyfd_io(fd, remotesock, 1, true, &winfo); + err = copyfd_io(fd, remotesock, 1, true, &winfo); } else { perror("accept"); return 1; @@ -1134,10 +1135,10 @@ int main_loop_s(int listensock) if (cfg_input) close(fd); - if (--cfg_repeat > 0) + if (!err && --cfg_repeat > 0) goto again; - return 0; + return err; } static void init_rng(void) From 8708c5d8b3fb3f6d5d3b9e6bfe01a505819f519a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:52 +0200 Subject: [PATCH 1796/3206] selftests: mptcp: avoid spurious errors on TCP disconnect The disconnect test-case, with 'plain' TCP sockets generates spurious errors, e.g. 07 ns1 TCP -> ns1 (dead:beef:1::1:10006) MPTCP read: Connection reset by peer read: Connection reset by peer (duration 155ms) [FAIL] client exit code 3, server 3 netns ns1-FloSdv (listener) socket stat for 10006: TcpActiveOpens 2 0.0 TcpPassiveOpens 2 0.0 TcpEstabResets 2 0.0 TcpInSegs 274 0.0 TcpOutSegs 276 0.0 TcpOutRsts 3 0.0 TcpExtPruneCalled 2 0.0 TcpExtRcvPruned 1 0.0 TcpExtTCPPureAcks 104 0.0 TcpExtTCPRcvCollapsed 2 0.0 TcpExtTCPBacklogCoalesce 42 0.0 TcpExtTCPRcvCoalesce 43 0.0 TcpExtTCPChallengeACK 1 0.0 TcpExtTCPFromZeroWindowAdv 42 0.0 TcpExtTCPToZeroWindowAdv 41 0.0 TcpExtTCPWantZeroWindowAdv 13 0.0 TcpExtTCPOrigDataSent 164 0.0 TcpExtTCPDelivered 165 0.0 TcpExtTCPRcvQDrop 1 0.0 In the failing scenarios (TCP -> MPTCP), the involved sockets are actually plain TCP ones, as fallbacks for passive sockets at 2WHS time cause the MPTCP listeners to actually create 'plain' TCP sockets. Similar to commit 218cc166321f ("selftests: mptcp: avoid spurious errors on disconnect"), the root cause is in the user-space bits: the test program tries to disconnect as soon as all the pending data has been spooled, generating an RST. If such option reaches the peer before the connection has reached the closed status, the TCP socket will report an error to the user-space, as per protocol specification, causing the above failure. Note that it looks like this issue got more visible since the "tcp: receiver changes" series from commit 06baf9bfa6ca ("Merge branch 'tcp-receiver-changes'"). Address the issue by explicitly waiting for the TCP sockets (-t) to reach a closed status before performing the disconnect. More precisely, the test program now waits for plain TCP sockets or TCP subflows in addition to the MPTCP sockets that were already monitored. While at it, use 'ss' with '-n' to avoid resolving service names, which is not needed here. 
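A minimal sketch of the bounded wait applied before disconnecting,
with a stub condition standing in for the real 'ss'-based check (the
stub and all names are illustrative):

	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	static bool sockets_still_open(int iteration)
	{
		return iteration < 3;	/* stub: pretend they close after ~3 polls */
	}

	static int wait_until_closed(int timeout_ms, int step_ms)
	{
		int waited;

		for (waited = 0; waited < timeout_ms; waited += step_ms) {
			if (!sockets_still_open(waited / step_ms))
				return 0;	/* safe to disconnect now */
			usleep(step_ms * 1000);
		}
		return -1;			/* timed out, caller reports an error */
	}

	int main(void)
	{
		printf("wait result: %d\n", wait_until_closed(1000, 100));
		return 0;
	}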
Fixes: 218cc166321f ("selftests: mptcp: avoid spurious errors on disconnect") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-3-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 1408698df09997..b148cadb96d0b7 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1248,7 +1248,7 @@ void xdisconnect(int fd) else xerror("bad family"); - strcpy(cmd, "ss -M | grep -q "); + strcpy(cmd, "ss -Mnt | grep -q "); cmdlen = strlen(cmd); if (!inet_ntop(addr.ss_family, raw_addr, &cmd[cmdlen], sizeof(cmd) - cmdlen)) @@ -1258,7 +1258,7 @@ void xdisconnect(int fd) /* * wait until the pending data is completely flushed and all - * the MPTCP sockets reached the closed status. + * the sockets reached the closed status. * disconnect will bypass/ignore/drop any pending data. */ for (i = 0; ; i += msec_sleep) { From a17c5aa3a32373f80b4714b411bd8d4ffee6fc6a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:53 +0200 Subject: [PATCH 1797/3206] selftests: mptcp: print trailing bytes with od This is better than printing random bytes in the terminal. Note that Jakub suggested 'hexdump', but Mat found out this tool is not often installed by default. 'od' can do a similar job, and it is in the POSIX specs and available in coreutils, so it should be on more systems. While at it, display a few more bytes, just to fill in the two lines. And no need to display the 3rd only line showing the next number of bytes: 0000040. Suggested-by: Jakub Kicinski Suggested-by: Mat Martineau Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-4-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 09cd24b2ae4662..d62e653d48b0f2 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -384,7 +384,7 @@ mptcp_lib_make_file() { mptcp_lib_print_file_err() { ls -l "${1}" 1>&2 echo "Trailing bytes are: " - tail -c 27 "${1}" + tail -c 32 "${1}" | od -x | head -n2 } # $1: input file ; $2: output file ; $3: what kind of file From cf74e0aa0eb024741c8b5cb7e839d2824ca77336 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:54 +0200 Subject: [PATCH 1798/3206] selftests: mptcp: connect: print pcap prefix To be able to find which capture files have been produced after several runs. This prefix was not printed anywhere before. While at it, always use the same prefix by taking info from ns1, instead of "$connector_ns", which is sometimes ns1, sometimes ns2 in the subtests. 
Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-5-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index c2ab9f7f0d2133..47ecb5b3836eb5 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -211,6 +211,11 @@ if $checksum; then done fi +if $capture; then + rndh="${ns1:4}" + mptcp_lib_pr_info "Packet capture files will have this prefix: ${rndh}-" +fi + set_ethtool_flags() { local ns="$1" local dev="$2" @@ -361,7 +366,6 @@ do_transfer() if $capture; then local capuser - local rndh="${connector_ns:4}" if [ -z $SUDO_USER ] ; then capuser="" else From 96939cec994070aa5df852c10fad5fc303a97ea3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:20 +0200 Subject: [PATCH 1799/3206] mptcp: set remote_deny_join_id0 on SYN recv When a SYN containing the 'C' flag (deny join id0) was received, this piece of information was not propagated to the path-manager. Even if this flag is mainly set on the server side, a client can also tell the server it cannot try to establish new subflows to the client's initial IP address and port. The server's PM should then record such info when received, and before sending events about the new connection. Fixes: df377be38725 ("mptcp: add deny_join_id0 in mptcp_options_received") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-1-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3f1b62a9fe889a..f31a3a79531a2e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -883,6 +883,10 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); + + if (mp_opt.deny_join_id0) + WRITE_ONCE(owner->pm.remote_deny_join_id0, true); + mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress From 2293c57484ae64c9a3c847c8807db8c26a3a4d41 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:21 +0200 Subject: [PATCH 1800/3206] mptcp: pm: nl: announce deny-join-id0 flag During the connection establishment, a peer can tell the other one that it cannot establish new subflows to the initial IP address and port by setting the 'C' flag [1]. Doing so makes sense when the sender is behind a strict NAT, operating behind a legacy Layer 4 load balancer, or using anycast IP address for example. When this 'C' flag is set, the path-managers must then not try to establish new subflows to the other peer's initial IP address and port. The in-kernel PM has access to this info, but the userspace PM didn't. The RFC8684 [1] is strict about that: (...) therefore the receiver MUST NOT try to open any additional subflows toward this address and port. So it is important to tell the userspace about that as it is responsible for the respect of this flag. When a new connection is created and established, the Netlink events now contain the existing but not currently used 'flags' attribute. 
When MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 is set, it means no other subflows to the initial IP address and port -- info that are also part of the event -- can be established. Link: https://datatracker.ietf.org/doc/html/rfc8684#section-3.1-20.6 [1] Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Reported-by: Marek Majkowski Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/532 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-2-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 4 ++-- include/uapi/linux/mptcp.h | 2 ++ include/uapi/linux/mptcp_pm.h | 4 ++-- net/mptcp/pm_netlink.c | 7 +++++++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index d15335684ec3d6..d1b4829b580ad0 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 67d015df8893cc..5fd5b4cf75ca1e 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -31,6 +31,8 @@ #define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) +#define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) + #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 6ac84b2f636ca2..7359d34da446b9 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. 
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 50aaf259959aea..ce7d42d3bd007b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -408,6 +408,7 @@ static int mptcp_event_created(struct sk_buff *skb, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); + u16 flags = 0; if (err) return err; @@ -415,6 +416,12 @@ static int mptcp_event_created(struct sk_buff *skb, if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; + if (READ_ONCE(msk->pm.remote_deny_join_id0)) + flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; + + if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } From 24733e193a0d68f20d220e86da0362460c9aa812 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:22 +0200 Subject: [PATCH 1801/3206] selftests: mptcp: userspace pm: validate deny-join-id0 flag The previous commit adds the MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 flag. Make sure it is correctly announced by the other peer when it has been received. pm_nl_ctl will now display 'deny_join_id0:1' when monitoring the events, and when this flag was set by the other peer. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-3-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 7 +++++++ tools/testing/selftests/net/mptcp/userspace_pm.sh | 14 +++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 994a556f46c151..93fea3442216c8 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -188,6 +188,13 @@ static int capture_events(int fd, int event_group) fprintf(stderr, ",error:%u", *(__u8 *)RTA_DATA(attrs)); else if (attrs->rta_type == MPTCP_ATTR_SERVER_SIDE) fprintf(stderr, ",server_side:%u", *(__u8 *)RTA_DATA(attrs)); + else if (attrs->rta_type == MPTCP_ATTR_FLAGS) { + __u16 flags = *(__u16 *)RTA_DATA(attrs); + + /* only print when present, easier */ + if (flags & MPTCP_PM_EV_FLAG_DENY_JOIN_ID0) + fprintf(stderr, ",deny_join_id0:1"); + } attrs = RTA_NEXT(attrs, msg_len); } diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 970c329735ff14..3d45991f24ede9 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -201,6 +201,9 @@ make_connection() is_v6="v4" fi + # set this on the client side only: will not affect the rest + ip netns exec "$ns2" sysctl -q net.mptcp.allow_join_initial_addr_port=0 + :>"$client_evts" :>"$server_evts" @@ -223,23 +226,28 @@ make_connection() local client_token local client_port local client_serverside + local client_nojoin local server_token local server_serverside + local server_nojoin client_token=$(mptcp_lib_evts_get_info token "$client_evts") client_port=$(mptcp_lib_evts_get_info sport "$client_evts") client_serverside=$(mptcp_lib_evts_get_info server_side 
"$client_evts") + client_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$client_evts") server_token=$(mptcp_lib_evts_get_info token "$server_evts") server_serverside=$(mptcp_lib_evts_get_info server_side "$server_evts") + server_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$server_evts") print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1" - if [ "$client_token" != "" ] && [ "$server_token" != "" ] && [ "$client_serverside" = 0 ] && - [ "$server_serverside" = 1 ] + if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] && + [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] && + [ "${client_nojoin:-0}" = 0 ] && [ "${server_nojoin:-0}" = 1 ] then test_pass print_title "Connection info: ${client_addr}:${client_port} -> ${connect_addr}:${app_port}" else - test_fail "Expected tokens (c:${client_token} - s:${server_token}) and server (c:${client_serverside} - s:${server_serverside})" + test_fail "Expected tokens (c:${client_token} - s:${server_token}), server (c:${client_serverside} - s:${server_serverside}), nojoin (c:${client_nojoin} - s:${server_nojoin})" mptcp_lib_result_print_all_tap exit ${KSFT_FAIL} fi From 92da495cb65719583aa06bc946aeb18a10e1e6e2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:23 +0200 Subject: [PATCH 1802/3206] mptcp: tfo: record 'deny join id0' info When TFO is used, the check to see if the 'C' flag (deny join id0) was set was bypassed. This flag can be set when TFO is used, so the check should also be done when TFO is used. Note that the set_fully_established label is also used when a 4th ACK is received. In this case, deny_join_id0 will not be set. Fixes: dfc8d0603033 ("mptcp: implement delayed seq generation for passive fastopen") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-4-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 2a8ea28442b271..1103b3341a7056 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -985,13 +985,13 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return false; } - if (mp_opt->deny_join_id0) - WRITE_ONCE(msk->pm.remote_deny_join_id0, true); - if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: + if (mp_opt->deny_join_id0) + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); + mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); From b86418beade11d45540a2d20c4ec1128849b6c27 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 12 Sep 2025 14:52:24 +0200 Subject: [PATCH 1803/3206] selftests: mptcp: sockopt: fix error messages This patch fixes several issues in the error reporting of the MPTCP sockopt selftest: 1. Fix diff not printed: The error messages for counter mismatches had the actual difference ('diff') as argument, but it was missing in the format string. Displaying it makes the debugging easier. 2. Fix variable usage: The error check for 'mptcpi_bytes_acked' incorrectly used 'ret2' (sent bytes) for both the expected value and the difference calculation. It now correctly uses 'ret' (received bytes), which is the expected value for bytes_acked. 3. 
Fix off-by-one in diff: The calculation for the 'mptcpi_rcv_delta' diff was 's.mptcpi_rcv_delta - ret', which is off-by-one. It has been corrected to 's.mptcpi_rcv_delta - (ret + 1)' to match the expected value in the condition above it. Fixes: 5dcff89e1455 ("selftests: mptcp: explicitly tests aggregate counters") Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-5-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_sockopt.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index e934dd26a59d9b..112c07c4c37a3c 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -667,22 +667,26 @@ static void process_one_client(int fd, int pipefd) do_getsockopts(&s, fd, ret, ret2); if (s.mptcpi_rcv_delta != (uint64_t)ret + 1) - xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64, s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - ret); + xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64 ", diff %" PRId64, + s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - (ret + 1)); /* be nice when running on top of older kernel */ if (s.pkt_stats_avail) { if (s.last_sample.mptcpi_bytes_sent != ret2) - xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_sent, ret2, s.last_sample.mptcpi_bytes_sent - ret2); if (s.last_sample.mptcpi_bytes_received != ret) - xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_received, ret, s.last_sample.mptcpi_bytes_received - ret); if (s.last_sample.mptcpi_bytes_acked != ret) - xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64, - s.last_sample.mptcpi_bytes_acked, ret2, - s.last_sample.mptcpi_bytes_acked - ret2); + xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, + s.last_sample.mptcpi_bytes_acked, ret, + s.last_sample.mptcpi_bytes_acked - ret); } close(fd); From 93ab4881a4e2b9657bdce4b8940073bfb4ed5eab Mon Sep 17 00:00:00 2001 From: Yeounsu Moon Date: Sat, 13 Sep 2025 15:01:36 +0900 Subject: [PATCH 1804/3206] net: natsemi: fix `rx_dropped` double accounting on `netif_rx()` failure `netif_rx()` already increments `rx_dropped` core stat when it fails. The driver was also updating `ndev->stats.rx_dropped` in the same path. Since both are reported together via `ip -s -s` command, this resulted in drops being counted twice in user-visible stats. Keep the driver update on `if (unlikely(!skb))`, but skip it after `netif_rx()` errors. 
Fixes: caf586e5f23c ("net: add a core netdev->rx_dropped counter") Signed-off-by: Yeounsu Moon Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250913060135.35282-3-yyyynoom@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/natsemi/ns83820.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 56d5464222d97a..cdbf82affa7bea 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -820,7 +820,7 @@ static void rx_irq(struct net_device *ndev) struct ns83820 *dev = PRIV(ndev); struct rx_info *info = &dev->rx_info; unsigned next_rx; - int rx_rc, len; + int len; u32 cmdsts; __le32 *desc; unsigned long flags; @@ -881,8 +881,10 @@ static void rx_irq(struct net_device *ndev) if (likely(CMDSTS_OK & cmdsts)) { #endif skb_put(skb, len); - if (unlikely(!skb)) + if (unlikely(!skb)) { + ndev->stats.rx_dropped++; goto netdev_mangle_me_harder_failed; + } if (cmdsts & CMDSTS_DEST_MULTI) ndev->stats.multicast++; ndev->stats.rx_packets++; @@ -901,15 +903,12 @@ static void rx_irq(struct net_device *ndev) __vlan_hwaccel_put_tag(skb, htons(ETH_P_IPV6), tag); } #endif - rx_rc = netif_rx(skb); - if (NET_RX_DROP == rx_rc) { -netdev_mangle_me_harder_failed: - ndev->stats.rx_dropped++; - } + netif_rx(skb); } else { dev_kfree_skb_irq(skb); } +netdev_mangle_me_harder_failed: nr++; next_rx = info->next_rx; desc = info->descs + (DESC_SIZE * next_rx); From ce4be9e4307c5a60701ff6e0cafa74caffdc54ce Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 9 Sep 2025 13:48:35 +0900 Subject: [PATCH 1805/3206] zram: fix slot write race condition Parallel concurrent writes to the same zram index result in leaked zsmalloc handles. Schematically we can have something like this: CPU0 CPU1 zram_slot_lock() zs_free(handle) zram_slot_lock() zram_slot_lock() zs_free(handle) zram_slot_lock() compress compress handle = zs_malloc() handle = zs_malloc() zram_slot_lock zram_set_handle(handle) zram_slot_lock zram_slot_lock zram_set_handle(handle) zram_slot_lock Either CPU0 or CPU1 zsmalloc handle will leak because zs_free() is done too early. In fact, we need to reset zram entry right before we set its new handle, all under the same slot lock scope. 
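For illustration, a minimal userspace sketch of the fixed ordering
(pthread-based, with illustrative names): the old handle is released
and the new one installed under one slot lock, so two concurrent
writers cannot both free the same handle and then both install theirs.

	/* Build with -lpthread. */
	#include <pthread.h>
	#include <stdlib.h>

	struct slot {
		pthread_mutex_t lock;
		void *handle;
	};

	static void slot_write(struct slot *s, void *new_handle)
	{
		pthread_mutex_lock(&s->lock);
		free(s->handle);	/* free the old handle ... */
		s->handle = new_handle;	/* ... and set the new one in the same
					 * critical section */
		pthread_mutex_unlock(&s->lock);
	}

	int main(void)
	{
		struct slot s = { .lock = PTHREAD_MUTEX_INITIALIZER, .handle = NULL };

		slot_write(&s, malloc(8));	/* first write: free(NULL) is a no-op */
		slot_write(&s, malloc(8));	/* overwrite: old handle freed under the lock */
		free(s.handle);
		return 0;
	}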
Link: https://lkml.kernel.org/r/20250909045150.635345-1-senozhatsky@chromium.org Fixes: 71268035f5d7 ("zram: free slot memory early during write") Signed-off-by: Sergey Senozhatsky Reported-by: Changhui Zhong Closes: https://lore.kernel.org/all/CAGVVp+UtpGoW5WEdEU7uVTtsSCjPN=ksN6EcvyypAtFDOUf30A@mail.gmail.com/ Tested-by: Changhui Zhong Cc: Jens Axboe Cc: Minchan Kim Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8acad3cc6e6ea1..f31652085adcb6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1795,6 +1795,7 @@ static int write_same_filled_page(struct zram *zram, unsigned long fill, u32 index) { zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_SAME); zram_set_handle(zram, index, fill); zram_slot_unlock(zram, index); @@ -1832,6 +1833,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page, kunmap_local(src); zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_HUGE); zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, PAGE_SIZE); @@ -1855,11 +1857,6 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) unsigned long element; bool same_filled; - /* First, free memory allocated to this slot (if any) */ - zram_slot_lock(zram, index); - zram_free_page(zram, index); - zram_slot_unlock(zram, index); - mem = kmap_local_page(page); same_filled = page_same_filled(mem, &element); kunmap_local(mem); @@ -1901,6 +1898,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) zcomp_stream_put(zstrm); zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); From 38f4885088fc5ad41b8b0a2a2cfc73d01e709e5c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Aug 2025 16:35:55 -0400 Subject: [PATCH 1806/3206] mnt_ns_tree_remove(): DTRT if mnt_ns had never been added to mnt_ns_list Actual removal is done under the lock, but for checking if need to bother the lockless RB_EMPTY_NODE() is safe - either that namespace had never been added to mnt_ns_tree, in which case the the node will stay empty, or whoever had allocated it has called mnt_ns_tree_add() and it has already run to completion. After that point RB_EMPTY_NODE() will become false and will remain false, no matter what we do with other nodes in the tree. 
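A sketch of the idiom with a toy node instead of the kernel rbtree
(self-parenting stands in for RB_EMPTY_NODE(); names are illustrative):
the node starts "empty" and only ever becomes non-empty once, under the
writer lock, so a reader may test emptiness locklessly before taking
the lock for the actual erase.

	/* Build with -lpthread. */
	#include <pthread.h>
	#include <stdio.h>

	struct node {
		struct node *parent;	/* points to itself while not inserted */
	};

	static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

	static void node_init(struct node *n)
	{
		n->parent = n;		/* analogue of RB_EMPTY_NODE() being true */
	}

	static void tree_add(struct node *n, struct node *root)
	{
		pthread_mutex_lock(&tree_lock);
		n->parent = root;	/* emptiness becomes false, and stays false */
		pthread_mutex_unlock(&tree_lock);
	}

	static void tree_remove(struct node *n)
	{
		/* lockless check: either the node was never added and the test
		 * stays true forever, or the add already ran to completion and
		 * the test stays false forever */
		if (n->parent == n)
			return;
		pthread_mutex_lock(&tree_lock);
		/* actual unlink would happen here, under the lock */
		pthread_mutex_unlock(&tree_lock);
	}

	int main(void)
	{
		struct node root, n;

		node_init(&root);
		node_init(&n);
		tree_remove(&n);	/* never added: harmless no-op */
		tree_add(&n, &root);
		tree_remove(&n);	/* added: removed under the lock */
		printf("ok\n");
		return 0;
	}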
Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index ae6d1312b1849c..39afeb521a80c0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -187,7 +187,7 @@ static void mnt_ns_release_rcu(struct rcu_head *rcu) static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ - if (!is_anon_ns(ns)) { + if (!RB_EMPTY_NODE(&ns->mnt_ns_tree_node)) { mnt_ns_tree_write_lock(); rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); list_bidir_del_rcu(&ns->mnt_ns_list); From 4b4d06a7630e14c4a8567d0dae6847260eba8a3c Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:39:46 -0400 Subject: [PATCH 1807/3206] gpu: nova-core: use `kernel::{fmt,prelude::fmt!}` Reduce coupling to implementation details of the formatting machinery by avoiding direct use for `core`'s formatting traits and macros. Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Acked-by: Danilo Krummrich Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- drivers/gpu/nova-core/gpu.rs | 3 +-- drivers/gpu/nova-core/regs/macros.rs | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs index b5c9786619a9d4..600cc90b5fabea 100644 --- a/drivers/gpu/nova-core/gpu.rs +++ b/drivers/gpu/nova-core/gpu.rs @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -use kernel::{device, devres::Devres, error::code::*, pci, prelude::*, sync::Arc}; +use kernel::{device, devres::Devres, error::code::*, fmt, pci, prelude::*, sync::Arc}; use crate::driver::Bar0; use crate::falcon::{gsp::Gsp, sec2::Sec2, Falcon}; @@ -12,7 +12,6 @@ use crate::gfw; use crate::regs; use crate::util; use crate::vbios::Vbios; -use core::fmt; macro_rules! define_chipset { ({ $($variant:ident = $value:expr),* $(,)* }) => diff --git a/drivers/gpu/nova-core/regs/macros.rs b/drivers/gpu/nova-core/regs/macros.rs index a3e6de1779d413..6b9df4205f4698 100644 --- a/drivers/gpu/nova-core/regs/macros.rs +++ b/drivers/gpu/nova-core/regs/macros.rs @@ -149,10 +149,10 @@ macro_rules! register { // TODO[REGA]: display the raw hex value, then the value of all the fields. This requires // matching the fields, which will complexify the syntax considerably... - impl ::core::fmt::Debug for $name { - fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result { + impl ::kernel::fmt::Debug for $name { + fn fmt(&self, f: &mut ::kernel::fmt::Formatter<'_>) -> ::kernel::fmt::Result { f.debug_tuple(stringify!($name)) - .field(&format_args!("0x{0:x}", &self.0)) + .field(&::kernel::prelude::fmt!("0x{0:x}", &self.0)) .finish() } } From 1f96115f502abae7d08ba35dd02e6b0381f265a4 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:39:47 -0400 Subject: [PATCH 1808/3206] rust: alloc: use `kernel::{fmt,prelude::fmt!}` Reduce coupling to implementation details of the formatting machinery by avoiding direct use for `core`'s formatting traits and macros. 
Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Acked-by: Danilo Krummrich Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/alloc/kbox.rs | 2 +- rust/kernel/alloc/kvec.rs | 2 +- rust/kernel/alloc/kvec/errors.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/kernel/alloc/kbox.rs b/rust/kernel/alloc/kbox.rs index 1aa83d751f10d3..27c4b5a9b61dc5 100644 --- a/rust/kernel/alloc/kbox.rs +++ b/rust/kernel/alloc/kbox.rs @@ -7,7 +7,6 @@ use super::allocator::{KVmalloc, Kmalloc, Vmalloc}; use super::{AllocError, Allocator, Flags}; use core::alloc::Layout; use core::borrow::{Borrow, BorrowMut}; -use core::fmt; use core::marker::PhantomData; use core::mem::ManuallyDrop; use core::mem::MaybeUninit; @@ -17,6 +16,7 @@ use core::ptr::NonNull; use core::result::Result; use crate::ffi::c_void; +use crate::fmt; use crate::init::InPlaceInit; use crate::types::ForeignOwnable; use pin_init::{InPlaceWrite, Init, PinInit, ZeroableOption}; diff --git a/rust/kernel/alloc/kvec.rs b/rust/kernel/alloc/kvec.rs index d42dbdc44f0fd5..dfc101e03f358a 100644 --- a/rust/kernel/alloc/kvec.rs +++ b/rust/kernel/alloc/kvec.rs @@ -7,9 +7,9 @@ use super::{ layout::ArrayLayout, AllocError, Allocator, Box, Flags, }; +use crate::fmt; use core::{ borrow::{Borrow, BorrowMut}, - fmt, marker::PhantomData, mem::{ManuallyDrop, MaybeUninit}, ops::Deref, diff --git a/rust/kernel/alloc/kvec/errors.rs b/rust/kernel/alloc/kvec/errors.rs index 348b8d27e102ca..21a920a4b09bc1 100644 --- a/rust/kernel/alloc/kvec/errors.rs +++ b/rust/kernel/alloc/kvec/errors.rs @@ -2,7 +2,7 @@ //! Errors for the [`Vec`] type. -use core::fmt::{self, Debug, Formatter}; +use kernel::fmt::{self, Debug, Formatter}; use kernel::prelude::*; /// Error type for [`Vec::push_within_capacity`]. From e0be3d34f1089fd9681a66f8295de91a77f55d7f Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:39:48 -0400 Subject: [PATCH 1809/3206] rust: block: use `kernel::{fmt,prelude::fmt!}` Reduce coupling to implementation details of the formatting machinery by avoiding direct use of `core`'s formatting traits and macros. Suggested-by: Alice Ryhl Link: https://rust-for-linux.zulipchat.com/#narrow/channel/288089-General/topic/Custom.20formatting/with/516476467 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Acked-by: Andreas Hindborg Signed-off-by: Miguel Ojeda --- drivers/block/rnull.rs | 2 +- rust/kernel/block/mq.rs | 2 +- rust/kernel/block/mq/gen_disk.rs | 2 +- rust/kernel/block/mq/raw_writer.rs | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs index d07e76ae2c13f4..6366da12c5a5fd 100644 --- a/drivers/block/rnull.rs +++ b/drivers/block/rnull.rs @@ -51,7 +51,7 @@ impl kernel::InPlaceModule for NullBlkModule { .logical_block_size(4096)? .physical_block_size(4096)? .rotational(false) - .build(format_args!("rnullb{}", 0), tagset) + .build(fmt!("rnullb{}", 0), tagset) })(); try_pin_init!(Self { diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs index 831445d37181a7..61ea35bba7d501 100644 --- a/rust/kernel/block/mq.rs +++ b/rust/kernel/block/mq.rs @@ -82,7 +82,7 @@ //! Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; //! let mut disk = gen_disk::GenDiskBuilder::new() //! .capacity_sectors(4096) -//! .build(format_args!("myblk"), tagset)?; +//! .build(fmt!("myblk"), tagset)?; //! //! # Ok::<(), kernel::error::Error>(()) //! 
``` diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index e1af0fa302a372..be92d0e5f03126 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -6,9 +6,9 @@ //! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h) use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet}; +use crate::fmt::{self, Write}; use crate::{bindings, error::from_err_ptr, error::Result, sync::Arc}; use crate::{error, static_lock_class}; -use core::fmt::{self, Write}; /// A builder for [`GenDisk`]. /// diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs index 7e2159e4f6a6f7..d311e24e2595ae 100644 --- a/rust/kernel/block/mq/raw_writer.rs +++ b/rust/kernel/block/mq/raw_writer.rs @@ -1,8 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -use core::fmt::{self, Write}; - use crate::error::Result; +use crate::fmt::{self, Write}; use crate::prelude::EINVAL; /// A mutable reference to a byte buffer where a string can be written into. From 97bcbe585476e67980a0873e89965254e1fd6a67 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:39:49 -0400 Subject: [PATCH 1810/3206] rust: device: use `kernel::{fmt,prelude::fmt!}` Reduce coupling to implementation details of the formatting machinery by avoiding direct use of `core`'s formatting traits and macros. Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Acked-by: Danilo Krummrich Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/device.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/kernel/device.rs b/rust/kernel/device.rs index 5902b3714a1662..303af0ef9bf73d 100644 --- a/rust/kernel/device.rs +++ b/rust/kernel/device.rs @@ -5,10 +5,10 @@ //! C header: [`include/linux/device.h`](srctree/include/linux/device.h) use crate::{ - bindings, + bindings, fmt, types::{ARef, ForeignOwnable, Opaque}, }; -use core::{fmt, marker::PhantomData, ptr}; +use core::{marker::PhantomData, ptr}; #[cfg(CONFIG_PRINTK)] use crate::c_str; @@ -595,7 +595,7 @@ macro_rules! impl_device_context_into_aref { macro_rules! dev_printk { ($method:ident, $dev:expr, $($f:tt)*) => { { - ($dev).$method(::core::format_args!($($f)*)); + ($dev).$method($crate::prelude::fmt!($($f)*)); } } } From e6aedde22dc42d83280d63753309884de3c21da4 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:39:50 -0400 Subject: [PATCH 1811/3206] rust: file: use `kernel::{fmt,prelude::fmt!}` Reduce coupling to implementation details of the formatting machinery by avoiding direct use of `core`'s formatting traits and macros. 
Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/fs/file.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rust/kernel/fs/file.rs b/rust/kernel/fs/file.rs index 35fd5db35c4652..67a3654f0fd370 100644 --- a/rust/kernel/fs/file.rs +++ b/rust/kernel/fs/file.rs @@ -11,6 +11,7 @@ use crate::{ bindings, cred::Credential, error::{code::*, Error, Result}, + fmt, types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque}, }; use core::ptr; @@ -460,8 +461,8 @@ impl From for Error { } } -impl core::fmt::Debug for BadFdError { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl fmt::Debug for BadFdError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.pad("EBADF") } } From aa2417c1a5842a1eb9b6b5dcf1a9ccb617583eb9 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:39:51 -0400 Subject: [PATCH 1812/3206] rust: kunit: use `kernel::{fmt,prelude::fmt!}` Reduce coupling to implementation details of the formatting machinery by avoiding direct use of `core`'s formatting traits and macros. Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/kunit.rs | 8 ++++---- scripts/rustdoc_test_gen.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/rust/kernel/kunit.rs b/rust/kernel/kunit.rs index 41efd87595d6ea..cf58a204b2224e 100644 --- a/rust/kernel/kunit.rs +++ b/rust/kernel/kunit.rs @@ -6,8 +6,8 @@ //! //! Reference: +use crate::fmt; use crate::prelude::*; -use core::fmt; #[cfg(CONFIG_PRINTK)] use crate::c_str; @@ -74,14 +74,14 @@ macro_rules! kunit_assert { // mistake (it is hidden to prevent that). // // This mimics KUnit's failed assertion format. - $crate::kunit::err(format_args!( + $crate::kunit::err($crate::prelude::fmt!( " # {}: ASSERTION FAILED at {FILE}:{LINE}\n", $name )); - $crate::kunit::err(format_args!( + $crate::kunit::err($crate::prelude::fmt!( " Expected {CONDITION} to be true, but is false\n" )); - $crate::kunit::err(format_args!( + $crate::kunit::err($crate::prelude::fmt!( " Failure not reported to KUnit since this is a non-KUnit task\n" )); break 'out; diff --git a/scripts/rustdoc_test_gen.rs b/scripts/rustdoc_test_gen.rs index abb34ada25082c..c8f9dc2ab976c2 100644 --- a/scripts/rustdoc_test_gen.rs +++ b/scripts/rustdoc_test_gen.rs @@ -202,7 +202,7 @@ pub extern "C" fn {kunit_name}(__kunit_test: *mut ::kernel::bindings::kunit) {{ // This follows the syntax for declaring test metadata in the proposed KTAP v2 spec, which may // be used for the proposed KUnit test attributes API. Thus hopefully this will make migration // easier later on. - ::kernel::kunit::info(format_args!(" # {kunit_name}.location: {real_path}:{line}\n")); + ::kernel::kunit::info(fmt!(" # {kunit_name}.location: {real_path}:{line}\n")); /// The anchor where the test code body starts. #[allow(unused)] From 5990533a83bb801b4a28ba29df3d033ab17fdad4 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:39:52 -0400 Subject: [PATCH 1813/3206] rust: seq_file: use `kernel::{fmt,prelude::fmt!}` Reduce coupling to implementation details of the formatting machinery by avoiding direct use of `core`'s formatting traits and macros. 
Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/seq_file.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/kernel/seq_file.rs b/rust/kernel/seq_file.rs index 8f199b1a3bb14d..59fbfc2473f81c 100644 --- a/rust/kernel/seq_file.rs +++ b/rust/kernel/seq_file.rs @@ -4,7 +4,7 @@ //! //! C header: [`include/linux/seq_file.h`](srctree/include/linux/seq_file.h) -use crate::{bindings, c_str, types::NotThreadSafe, types::Opaque}; +use crate::{bindings, c_str, fmt, types::NotThreadSafe, types::Opaque}; /// A utility for generating the contents of a seq file. #[repr(transparent)] @@ -31,7 +31,7 @@ impl SeqFile { /// Used by the [`seq_print`] macro. #[inline] - pub fn call_printf(&self, args: core::fmt::Arguments<'_>) { + pub fn call_printf(&self, args: fmt::Arguments<'_>) { // SAFETY: Passing a void pointer to `Arguments` is valid for `%pA`. unsafe { bindings::seq_printf( @@ -47,7 +47,7 @@ impl SeqFile { #[macro_export] macro_rules! seq_print { ($m:expr, $($arg:tt)+) => ( - $m.call_printf(format_args!($($arg)+)) + $m.call_printf($crate::prelude::fmt!($($arg)+)) ); } pub use seq_print; From 0fe1ca3c8bc5e0b278d4793247eb787131e84112 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:39:53 -0400 Subject: [PATCH 1814/3206] rust: sync: use `kernel::{fmt,prelude::fmt!}` Reduce coupling to implementation details of the formatting machinery by avoiding direct use of `core`'s formatting traits and macros. Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/sync/arc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs index 74121cf935f364..bb4eb285319022 100644 --- a/rust/kernel/sync/arc.rs +++ b/rust/kernel/sync/arc.rs @@ -20,6 +20,7 @@ use crate::{ alloc::{AllocError, Flags, KBox}, bindings, ffi::c_void, + fmt, init::InPlaceInit, try_init, types::{ForeignOwnable, Opaque}, }; use core::{ alloc::Layout, borrow::{Borrow, BorrowMut}, - fmt, marker::PhantomData, mem::{ManuallyDrop, MaybeUninit}, ops::{Deref, DerefMut}, From eb98599528ebee9b660d98ae6613c2f2966e0dbb Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:39:54 -0400 Subject: [PATCH 1815/3206] rust: device: use `kernel::{fmt,prelude::fmt!}` Reduce coupling to implementation details of the formatting machinery by avoiding direct use of `core`'s formatting traits and macros. Signed-off-by: Tamir Duberstein Reviewed-by: Benno Lossin Signed-off-by: Miguel Ojeda --- rust/kernel/device/property.rs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/rust/kernel/device/property.rs b/rust/kernel/device/property.rs index 49ee12a906dbad..3a332a8c53a9eb 100644 --- a/rust/kernel/device/property.rs +++ b/rust/kernel/device/property.rs @@ -11,6 +11,7 @@ use crate::{ alloc::KVec, bindings, error::{to_result, Result}, + fmt, prelude::*, str::{CStr, CString}, types::{ARef, Opaque}, @@ -68,16 +69,16 @@ impl FwNode { unsafe { bindings::is_of_node(self.as_raw()) } } - /// Returns an object that implements [`Display`](core::fmt::Display) for + /// Returns an object that implements [`Display`](fmt::Display) for /// printing the name of a node. /// /// This is an alternative to the default `Display` implementation, which /// prints the full path. 
- pub fn display_name(&self) -> impl core::fmt::Display + '_ { + pub fn display_name(&self) -> impl fmt::Display + '_ { struct FwNodeDisplayName<'a>(&'a FwNode); - impl core::fmt::Display for FwNodeDisplayName<'_> { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + impl fmt::Display for FwNodeDisplayName<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // SAFETY: `self` is valid by its type invariant. let name = unsafe { bindings::fwnode_get_name(self.0.as_raw()) }; if name.is_null() { @@ -87,7 +88,7 @@ impl FwNode { // - `fwnode_get_name` returns null or a valid C string. // - `name` was checked to be non-null. let name = unsafe { CStr::from_char_ptr(name) }; - write!(f, "{name}") + fmt::Display::fmt(name, f) } } @@ -351,8 +352,8 @@ impl FwNodeReferenceArgs { } } -impl core::fmt::Debug for FwNodeReferenceArgs { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl fmt::Debug for FwNodeReferenceArgs { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{:?}", self.as_slice()) } } @@ -377,8 +378,8 @@ enum Node<'a> { Owned(ARef), } -impl core::fmt::Display for FwNode { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl fmt::Display for FwNode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // The logic here is the same as the one in lib/vsprintf.c // (fwnode_full_name_string). @@ -413,9 +414,9 @@ impl core::fmt::Display for FwNode { // SAFETY: `fwnode_get_name_prefix` returns null or a // valid C string. let prefix = unsafe { CStr::from_char_ptr(prefix) }; - write!(f, "{prefix}")?; + fmt::Display::fmt(prefix, f)?; } - write!(f, "{}", fwnode.display_name())?; + fmt::Display::fmt(&fwnode.display_name(), f)?; } Ok(()) From 7dfabaa0c489ce65883c26ba5cdb15169f168d7a Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:41:52 -0400 Subject: [PATCH 1816/3206] drm/panic: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. Link: https://github.com/Rust-for-Linux/linux/issues/1075 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Acked-by: Danilo Krummrich Signed-off-by: Tamir Duberstein Acked-by: Jocelyn Falempe Signed-off-by: Miguel Ojeda --- drivers/gpu/drm/drm_panic_qr.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_panic_qr.rs b/drivers/gpu/drm/drm_panic_qr.rs index 50c286c5cee8be..ac27e86c601c8c 100644 --- a/drivers/gpu/drm/drm_panic_qr.rs +++ b/drivers/gpu/drm/drm_panic_qr.rs @@ -968,7 +968,7 @@ pub unsafe extern "C" fn drm_panic_qr_generate( // nul-terminated string. let url_cstr: &CStr = unsafe { CStr::from_char_ptr(url) }; let segments = &[ - &Segment::Binary(url_cstr.as_bytes()), + &Segment::Binary(url_cstr.to_bytes()), &Segment::Numeric(&data_slice[0..data_len]), ]; match EncodedMsg::new(segments, tmp_slice) { From 7ad635c936f8b11496a9a83a539304a39e66cf45 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:41:53 -0400 Subject: [PATCH 1817/3206] rust: auxiliary: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. 
Link: https://github.com/Rust-for-Linux/linux/issues/1075 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Acked-by: Danilo Krummrich Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/auxiliary.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/kernel/auxiliary.rs b/rust/kernel/auxiliary.rs index 4749fb6bffef34..58be0987139756 100644 --- a/rust/kernel/auxiliary.rs +++ b/rust/kernel/auxiliary.rs @@ -105,8 +105,8 @@ pub struct DeviceId(bindings::auxiliary_device_id); impl DeviceId { /// Create a new [`DeviceId`] from name. pub const fn new(modname: &'static CStr, name: &'static CStr) -> Self { - let name = name.as_bytes_with_nul(); - let modname = modname.as_bytes_with_nul(); + let name = name.to_bytes_with_nul(); + let modname = modname.to_bytes_with_nul(); // TODO: Replace with `bindings::auxiliary_device_id::default()` once stabilized for // `const`. From a3a7d09ab8bf881ea64aa8f33c76554ab9b5f6d6 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:41:54 -0400 Subject: [PATCH 1818/3206] rust: configfs: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. Also avoid `Deref for CStr` as that impl doesn't exist on `core::ffi::CStr`. Link: https://github.com/Rust-for-Linux/linux/issues/1075 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Acked-by: Andreas Hindborg Signed-off-by: Miguel Ojeda --- rust/kernel/configfs.rs | 4 ++-- samples/rust/rust_configfs.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/kernel/configfs.rs b/rust/kernel/configfs.rs index 2736b798cdc6c4..9fb5ef825e4129 100644 --- a/rust/kernel/configfs.rs +++ b/rust/kernel/configfs.rs @@ -263,7 +263,7 @@ impl Group { try_pin_init!(Self { group <- pin_init::init_zeroed().chain(|v: &mut Opaque| { let place = v.get(); - let name = name.as_bytes_with_nul().as_ptr(); + let name = name.to_bytes_with_nul().as_ptr(); // SAFETY: It is safe to initialize a group once it has been zeroed. unsafe { bindings::config_group_init_type_name(place, name.cast(), item_type.as_ptr()) @@ -613,7 +613,7 @@ where pub const fn new(name: &'static CStr) -> Self { Self { attribute: Opaque::new(bindings::configfs_attribute { - ca_name: name.as_char_ptr(), + ca_name: crate::str::as_char_ptr_in_const_context(name), ca_owner: core::ptr::null_mut(), ca_mode: 0o660, show: Some(Self::show), diff --git a/samples/rust/rust_configfs.rs b/samples/rust/rust_configfs.rs index af04bfa35cb28e..5005453f874da0 100644 --- a/samples/rust/rust_configfs.rs +++ b/samples/rust/rust_configfs.rs @@ -94,7 +94,7 @@ impl configfs::AttributeOperations<0> for Configuration { fn show(container: &Configuration, page: &mut [u8; PAGE_SIZE]) -> Result { pr_info!("Show message\n"); - let data = container.message; + let data = container.message.to_bytes(); page[0..data.len()].copy_from_slice(data); Ok(data.len()) } From 23cd58b1d80364a0dbacf1f8ee5bf1b0aa825be1 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:41:55 -0400 Subject: [PATCH 1819/3206] rust: cpufreq: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. 
Link: https://github.com/Rust-for-Linux/linux/issues/1075 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Acked-by: Viresh Kumar Signed-off-by: Miguel Ojeda --- rust/kernel/cpufreq.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index be2dffbdb54628..86c02e81729e27 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -1016,7 +1016,7 @@ impl Registration { }; const fn copy_name(name: &'static CStr) -> [c_char; CPUFREQ_NAME_LEN] { - let src = name.as_bytes_with_nul(); + let src = name.to_bytes_with_nul(); let mut dst = [0; CPUFREQ_NAME_LEN]; build_assert!(src.len() <= CPUFREQ_NAME_LEN); From 23218425cb2f18785539856f9aabfa4a4f47b3c5 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:41:56 -0400 Subject: [PATCH 1820/3206] rust: drm: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. Link: https://github.com/Rust-for-Linux/linux/issues/1075 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Acked-by: Danilo Krummrich Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/drm/device.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/kernel/drm/device.rs b/rust/kernel/drm/device.rs index f8f1db5eeb0f6f..0956ba0f64dea3 100644 --- a/rust/kernel/drm/device.rs +++ b/rust/kernel/drm/device.rs @@ -82,8 +82,8 @@ impl Device { major: T::INFO.major, minor: T::INFO.minor, patchlevel: T::INFO.patchlevel, - name: T::INFO.name.as_char_ptr().cast_mut(), - desc: T::INFO.desc.as_char_ptr().cast_mut(), + name: crate::str::as_char_ptr_in_const_context(T::INFO.name).cast_mut(), + desc: crate::str::as_char_ptr_in_const_context(T::INFO.desc).cast_mut(), driver_features: drm::driver::FEAT_GEM, ioctls: T::IOCTLS.as_ptr(), From 141ba59cc967d8c156b9cc5c95aff4b1fe50eec8 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:41:57 -0400 Subject: [PATCH 1821/3206] rust: firmware: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. Link: https://github.com/Rust-for-Linux/linux/issues/1075 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Acked-by: Danilo Krummrich Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/firmware.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/firmware.rs b/rust/kernel/firmware.rs index 1abab5b2f0522e..94e6bb88b90306 100644 --- a/rust/kernel/firmware.rs +++ b/rust/kernel/firmware.rs @@ -291,7 +291,7 @@ impl ModInfoBuilder { let module_name = this.module_name; if !this.module_name.is_empty() { - this = this.push_internal(module_name.as_bytes_with_nul()); + this = this.push_internal(module_name.to_bytes_with_nul()); if N != 0 { // Re-use the space taken by the NULL terminator and swap it with the '.' separator. From f16a23743ef57e42985bb934a7d511cddb82b868 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:41:58 -0400 Subject: [PATCH 1822/3206] rust: kunit: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. 
Link: https://github.com/Rust-for-Linux/linux/issues/1075 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/kunit.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/kernel/kunit.rs b/rust/kernel/kunit.rs index cf58a204b2224e..3a43886cc14e45 100644 --- a/rust/kernel/kunit.rs +++ b/rust/kernel/kunit.rs @@ -102,12 +102,12 @@ macro_rules! kunit_assert { unsafe impl Sync for UnaryAssert {} static LOCATION: Location = Location($crate::bindings::kunit_loc { - file: FILE.as_char_ptr(), + file: $crate::str::as_char_ptr_in_const_context(FILE), line: LINE, }); static ASSERTION: UnaryAssert = UnaryAssert($crate::bindings::kunit_unary_assert { assert: $crate::bindings::kunit_assert {}, - condition: CONDITION.as_char_ptr(), + condition: $crate::str::as_char_ptr_in_const_context(CONDITION), expected_true: true, }); @@ -202,7 +202,7 @@ pub const fn kunit_case( ) -> kernel::bindings::kunit_case { kernel::bindings::kunit_case { run_case: Some(run_case), - name: name.as_char_ptr(), + name: kernel::str::as_char_ptr_in_const_context(name), attr: kernel::bindings::kunit_attributes { speed: kernel::bindings::kunit_speed_KUNIT_SPEED_NORMAL, }, From e49c43ef82f57a155a6c1e6c79795abff1227f29 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:41:59 -0400 Subject: [PATCH 1823/3206] rust: miscdevice: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. Link: https://github.com/Rust-for-Linux/linux/issues/1075 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/miscdevice.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/miscdevice.rs b/rust/kernel/miscdevice.rs index 6373fe183b2748..d3aa7d25afad81 100644 --- a/rust/kernel/miscdevice.rs +++ b/rust/kernel/miscdevice.rs @@ -34,7 +34,7 @@ impl MiscDeviceOptions { // SAFETY: All zeros is valid for this C type. let mut result: bindings::miscdevice = unsafe { MaybeUninit::zeroed().assume_init() }; result.minor = bindings::MISC_DYNAMIC_MINOR as ffi::c_int; - result.name = self.name.as_char_ptr(); + result.name = crate::str::as_char_ptr_in_const_context(self.name); result.fops = MiscdeviceVTable::::build(); result } From 182d95571ffa278f7c68a80d76de88a5333fb69f Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:42:00 -0400 Subject: [PATCH 1824/3206] rust: net: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. Link: https://github.com/Rust-for-Linux/linux/issues/1075 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/net/phy.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index 7de5cc7a0eeee0..be1027b7961b70 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -497,7 +497,7 @@ unsafe impl Sync for DriverVTable {} pub const fn create_phy_driver() -> DriverVTable { // INVARIANT: All the fields of `struct phy_driver` are initialized properly. 
DriverVTable(Opaque::new(bindings::phy_driver { - name: T::NAME.as_char_ptr().cast_mut(), + name: crate::str::as_char_ptr_in_const_context(T::NAME).cast_mut(), flags: T::FLAGS, phy_id: T::PHY_DEVICE_ID.id(), phy_id_mask: T::PHY_DEVICE_ID.mask_as_int(), From 5749cd1ed8bd11e99ba87146d117e711a4a38f30 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:42:01 -0400 Subject: [PATCH 1825/3206] rust: of: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. Link: https://github.com/Rust-for-Linux/linux/issues/1075 Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/kernel/of.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/of.rs b/rust/kernel/of.rs index b76b35265df2ea..58b20c367f993f 100644 --- a/rust/kernel/of.rs +++ b/rust/kernel/of.rs @@ -34,7 +34,7 @@ unsafe impl RawDeviceIdIndex for DeviceId { impl DeviceId { /// Create a new device id from an OF 'compatible' string. pub const fn new(compatible: &'static CStr) -> Self { - let src = compatible.as_bytes_with_nul(); + let src = compatible.to_bytes_with_nul(); // Replace with `bindings::of_device_id::default()` once stabilized for `const`. // SAFETY: FFI type is valid to be zero-initialized. let mut of: bindings::of_device_id = unsafe { core::mem::zeroed() }; From 657403637f7d343352efb29b53d9f92dcf86aebb Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:42:02 -0400 Subject: [PATCH 1826/3206] rust: acpi: use `core::ffi::CStr` method names Prepare for `core::ffi::CStr` taking the place of `kernel::str::CStr` by avoiding methods that only exist on the latter. Signed-off-by: Tamir Duberstein Reviewed-by: Benno Lossin Signed-off-by: Miguel Ojeda --- rust/kernel/acpi.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/rust/kernel/acpi.rs b/rust/kernel/acpi.rs index 7ae317368b0000..37e1161c12985f 100644 --- a/rust/kernel/acpi.rs +++ b/rust/kernel/acpi.rs @@ -37,11 +37,8 @@ impl DeviceId { /// Create a new device id from an ACPI 'id' string. #[inline(always)] pub const fn new(id: &'static CStr) -> Self { - build_assert!( - id.len_with_nul() <= Self::ACPI_ID_LEN, - "ID exceeds 16 bytes" - ); - let src = id.as_bytes_with_nul(); + let src = id.to_bytes_with_nul(); + build_assert!(src.len() <= Self::ACPI_ID_LEN, "ID exceeds 16 bytes"); // Replace with `bindings::acpi_device_id::default()` once stabilized for `const`. // SAFETY: FFI type is valid to be zero-initialized. let mut acpi: bindings::acpi_device_id = unsafe { core::mem::zeroed() }; From 35e526398bd0faddef3396d71760e4c5ea75868f Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Sat, 23 Aug 2025 20:16:47 +0800 Subject: [PATCH 1827/3206] drm/i915/backlight: Honor VESA eDP backlight luminance control capability The VESA AUX backlight fails to enable luminance-based backlight manipulation because luminance_set is false by default. Fix it by using the luminance control support capability. 
Fixes: e13af5166a359 ("drm/i915/backlight: Use drm helper to initialize edp backlight") Signed-off-by: Aaron Ma Reviewed-by: Suraj Kandpal Signed-off-by: Suraj Kandpal Link: https://lore.kernel.org/r/20250823121647.275834-1-aaron.ma@canonical.com (cherry picked from commit 72136efb875d8438c20b9c8ab72945d474933471) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c index 41228478b21c78..0a3a3f6a5f9d89 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c +++ b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c @@ -546,7 +546,7 @@ static int intel_dp_aux_vesa_setup_backlight(struct intel_connector *connector, luminance_range->max_luminance, panel->vbt.backlight.pwm_freq_hz, intel_dp->edp_dpcd, &current_level, &current_mode, - false); + panel->backlight.edp.vesa.luminance_control_support); if (ret < 0) return ret; From 1b09d08866277677d11726116f5e786d5ba00173 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Mon, 15 Sep 2025 14:35:46 +0530 Subject: [PATCH 1828/3206] platform/x86/amd/pmf: Support new ACPI ID AMDI0108 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include the ACPI ID AMDI0108, which is used on upcoming AMD platforms, in the PMF driver's list of supported devices. Signed-off-by: Shyam Sundar S K Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20250915090546.2759130-1-Shyam-sundar.S-k@amd.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index ef988605c4da63..bc544a4a5266ee 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -403,6 +403,7 @@ static const struct acpi_device_id amd_pmf_acpi_ids[] = { {"AMDI0103", 0}, {"AMDI0105", 0}, {"AMDI0107", 0}, + {"AMDI0108", 0}, { } }; MODULE_DEVICE_TABLE(acpi, amd_pmf_acpi_ids); From 225d1ee0f5ba3218d1814d36564fdb5f37b50474 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Tue, 16 Sep 2025 09:28:18 +0200 Subject: [PATCH 1829/3206] platform/x86: asus-wmi: Re-add extra keys to ignore_key_wlan quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that the dual screen models use 0x5E for attaching and detaching the keyboard instead of 0x5F. So, re-add the codes by reverting commit cf3940ac737d ("platform/x86: asus-wmi: Remove extra keys from ignore_key_wlan quirk"). For our future reference, add a comment next to 0x5E indicating that it is used for that purpose. 
Fixes: cf3940ac737d ("platform/x86: asus-wmi: Remove extra keys from ignore_key_wlan quirk") Reported-by: Rahul Chandra Closes: https://lore.kernel.org/all/10020-68c90c80-d-4ac6c580@106290038/ Cc: stable@kernel.org Signed-off-by: Antheas Kapenekakis Link: https://patch.msgid.link/20250916072818.196462-1-lkml@antheas.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-nb-wmi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index 3a488cf9ca06c9..6a62bc5b02fda7 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -673,6 +673,8 @@ static void asus_nb_wmi_key_filter(struct asus_wmi_driver *asus_wmi, int *code, if (atkbd_reports_vol_keys) *code = ASUS_WMI_KEY_IGNORE; break; + case 0x5D: /* Wireless console Toggle */ + case 0x5E: /* Wireless console Enable / Keyboard Attach, Detach */ case 0x5F: /* Wireless console Disable / Special Key */ if (quirks->key_wlan_event) *code = quirks->key_wlan_event; From 9fc4a3da9a0259a0500848b5d8657918efde176b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 15 Sep 2025 17:28:51 +0200 Subject: [PATCH 1830/3206] ALSA: pcm: Disable bottom softirqs as part of spin_lock_irq() on PREEMPT_RT snd_pcm_group_lock_irq() acquires a spinlock_t and disables interrupts via spin_lock_irq(). This also implicitly disables the handling of softirqs such as TIMER_SOFTIRQ. On PREEMPT_RT softirqs are preemptible and spin_lock_irq() does not disable them. That means a timer can be invoked during spin_lock_irq() on the same CPU. For synchronisation reasons, local_bh_disable() has a per-CPU lock named softirq_ctrl.lock which synchronizes individual softirqs against each other. syzbot managed to trigger a lockdep report where softirq_ctrl.lock is acquired in hrtimer_cancel() in addition to hrtimer_run_softirq(). This is a possible deadlock. The softirq_ctrl.lock can not be made part of spin_lock_irq() as this would lead to too much synchronisation against individual threads on the system. To avoid the possible deadlock, softirqs must be manually disabled before the lock is acquired. Disable softirqs before the lock is acquired on PREEMPT_RT. 
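Open-coded for the irq variant, the ordering that the reworked DEFINE_PCM_GROUP_LOCK macro in the diff below generates looks roughly like this (a simplified sketch; the demo_* helpers are illustrative, not the patch's code):

    #include <linux/spinlock.h>
    #include <linux/bottom_half.h>

    static void demo_group_lock_irq(spinlock_t *lock)
    {
            /*
             * On PREEMPT_RT spin_lock_irq() leaves softirqs runnable, so a
             * softirq (e.g. a timer callback) could still be entered on
             * this CPU. Disabling bottom halves first keeps it out and
             * avoids the softirq_ctrl.lock deadlock described above.
             */
            if (IS_ENABLED(CONFIG_PREEMPT_RT))
                    local_bh_disable();
            spin_lock_irq(lock);
    }

    static void demo_group_unlock_irq(spinlock_t *lock)
    {
            spin_unlock_irq(lock);
            if (IS_ENABLED(CONFIG_PREEMPT_RT))
                    local_bh_enable();
    }

The plain lock/unlock helpers need no bottom-half handling, which is why the macro grows separate bh_lock/bh_unlock parameters in the diff that follows.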
Reported-by: syzbot+10b4363fb0f46527f3f3@syzkaller.appspotmail.com Fixes: d2d6422f8bd1 ("x86: Allow to enable PREEMPT_RT.") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Takashi Iwai --- sound/core/pcm_native.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 1eab940fa2e5ac..68bee40c9adafd 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -84,19 +84,24 @@ void snd_pcm_group_init(struct snd_pcm_group *group) } /* define group lock helpers */ -#define DEFINE_PCM_GROUP_LOCK(action, mutex_action) \ +#define DEFINE_PCM_GROUP_LOCK(action, bh_lock, bh_unlock, mutex_action) \ static void snd_pcm_group_ ## action(struct snd_pcm_group *group, bool nonatomic) \ { \ - if (nonatomic) \ + if (nonatomic) { \ mutex_ ## mutex_action(&group->mutex); \ - else \ - spin_ ## action(&group->lock); \ -} - -DEFINE_PCM_GROUP_LOCK(lock, lock); -DEFINE_PCM_GROUP_LOCK(unlock, unlock); -DEFINE_PCM_GROUP_LOCK(lock_irq, lock); -DEFINE_PCM_GROUP_LOCK(unlock_irq, unlock); + } else { \ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && bh_lock) \ + local_bh_disable(); \ + spin_ ## action(&group->lock); \ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && bh_unlock) \ + local_bh_enable(); \ + } \ +} + +DEFINE_PCM_GROUP_LOCK(lock, false, false, lock); +DEFINE_PCM_GROUP_LOCK(unlock, false, false, unlock); +DEFINE_PCM_GROUP_LOCK(lock_irq, true, false, lock); +DEFINE_PCM_GROUP_LOCK(unlock_irq, false, true, unlock); /** * snd_pcm_stream_lock - Lock the PCM stream From 17628f1abbf4bd4162c655f3260d68bc1934ec73 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Fri, 12 Sep 2025 19:59:16 +0300 Subject: [PATCH 1831/3206] dt-bindings: gpio: fix trivial-gpio's schema id In case the trivial-gpio schema is referenced through a $ref like /schemas/trivial-gpio.yaml to match its current schema ID, the following error message is displayed: Error in referenced schema matching $id: http://devicetree.org/schemas/trivial-gpio.yaml Tried these paths (check schema $id if path is wrong): /path/to/linux/Documentation/devicetree/bindings/trivial-gpio.yaml /path/to/dtchema/schemas/trivial-gpio.yaml Fix this by adding the 'gpio' folder to the schema's ID to match its file path. Fixes: f03a7f20b23c ("dt-bindings: gpio: Create a trivial GPIO schema") Signed-off-by: Ioana Ciornei Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250912165916.3098215-1-ioana.ciornei@nxp.com Signed-off-by: Bartosz Golaszewski --- Documentation/devicetree/bindings/gpio/trivial-gpio.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/gpio/trivial-gpio.yaml b/Documentation/devicetree/bindings/gpio/trivial-gpio.yaml index 0299d4a25086af..c994177de940af 100644 --- a/Documentation/devicetree/bindings/gpio/trivial-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/trivial-gpio.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/trivial-gpio.yaml# +$id: http://devicetree.org/schemas/gpio/trivial-gpio.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Trivial 2-cell GPIO controllers From 8d7de4a014f589c1776959f7fdadbf7b12045aac Mon Sep 17 00:00:00 2001 From: Jonas Rebmann Date: Wed, 10 Sep 2025 14:35:22 +0200 Subject: [PATCH 1832/3206] ASoC: dt-bindings: asahi-kasei,ak4458: Reference common DAI properties Reference the dai-common.yaml schema to allow '#sound-dai-cells' and 'sound-name-prefix' to be used. 
Signed-off-by: Jonas Rebmann Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250910-imx8mp-prt8ml-v1-2-fd04aed15670@pengutronix.de Signed-off-by: Mark Brown --- .../devicetree/bindings/sound/asahi-kasei,ak4458.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/sound/asahi-kasei,ak4458.yaml b/Documentation/devicetree/bindings/sound/asahi-kasei,ak4458.yaml index 4477f84b7acc0e..1fdbeecc5eff9d 100644 --- a/Documentation/devicetree/bindings/sound/asahi-kasei,ak4458.yaml +++ b/Documentation/devicetree/bindings/sound/asahi-kasei,ak4458.yaml @@ -15,6 +15,9 @@ properties: - asahi-kasei,ak4458 - asahi-kasei,ak4497 + "#sound-dai-cells": + const: 0 + reg: maxItems: 1 @@ -46,6 +49,7 @@ required: - reg allOf: + - $ref: dai-common.yaml# - if: properties: compatible: From e3df98d30369d7859a855bd3e500ea46205640ca Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Tue, 9 Sep 2025 07:04:31 +0700 Subject: [PATCH 1833/3206] xfs: extend removed sysctls table Commit 21d59d00221e4e ("xfs: remove deprecated sysctl knobs") moves recently-removed sysctls to the removed sysctls table but fails to extend the table, hence triggering Sphinx warning: Documentation/admin-guide/xfs.rst:365: ERROR: Malformed table. Text in column margin in table line 8. ============================= ======= Name Removed ============================= ======= fs.xfs.xfsbufd_centisec v4.0 fs.xfs.age_buffer_centisecs v4.0 fs.xfs.irix_symlink_mode v6.18 fs.xfs.irix_sgid_inherit v6.18 fs.xfs.speculative_cow_prealloc_lifetime v6.18 ============================= ======= [docutils] Extend "Name" column of the table to fit the now-longest sysctl, which is fs.xfs.speculative_cow_prealloc_lifetime. Fixes: 21d59d00221e ("xfs: remove deprecated sysctl knobs") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250908180406.32124fb7@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Reviewed-by: Darrick J. Wong Reviewed-by: Randy Dunlap Signed-off-by: Carlos Maiolino --- Documentation/admin-guide/xfs.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index d6f531f2c0e694..c85cd327af284d 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -355,15 +355,15 @@ None currently. Removed Sysctls =============== -============================= ======= - Name Removed -============================= ======= - fs.xfs.xfsbufd_centisec v4.0 - fs.xfs.age_buffer_centisecs v4.0 - fs.xfs.irix_symlink_mode v6.18 - fs.xfs.irix_sgid_inherit v6.18 - fs.xfs.speculative_cow_prealloc_lifetime v6.18 -============================= ======= +========================================== ======= + Name Removed +========================================== ======= + fs.xfs.xfsbufd_centisec v4.0 + fs.xfs.age_buffer_centisecs v4.0 + fs.xfs.irix_symlink_mode v6.18 + fs.xfs.irix_sgid_inherit v6.18 + fs.xfs.speculative_cow_prealloc_lifetime v6.18 +========================================== ======= Error handling ============== From eff8668607888988cad7b31528ff08d8883c5d7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:51 -0700 Subject: [PATCH 1834/3206] xfs: remove the xlog_op_header_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 5 ++--- fs/xfs/xfs_log.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 0d637c276db053..367dfdece9be18 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -141,14 +141,13 @@ struct xfs_unmount_log_format { #define XLOG_END_TRANS 0x10 /* End a continued transaction */ #define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ - -typedef struct xlog_op_header { +struct xlog_op_header { __be32 oh_tid; /* transaction id of operation : 4 b */ __be32 oh_len; /* bytes in data region : 4 b */ __u8 oh_clientid; /* who sent me this : 1 b */ __u8 oh_flags; /* : 1 b */ __u16 oh_res2; /* 32 bit align : 2 b */ -} xlog_op_header_t; +}; /* valid values for h_fmt */ #define XLOG_FMT_UNKNOWN 0 diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c8a57e21a1d3e0..94e06c22be6730 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2656,10 +2656,11 @@ xlog_state_get_iclog_space( * until you know exactly how many bytes get copied. Therefore, wait * until later to update ic_offset. * - * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * xlog_write() algorithm assumes that at least 2 xlog_op_header's * can fit into remaining data section. */ - if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + if (iclog->ic_size - iclog->ic_offset < + 2 * sizeof(struct xlog_op_header)) { int error = 0; xlog_state_switch_iclogs(log, iclog, iclog->ic_size); @@ -3153,11 +3154,11 @@ xlog_calc_unit_res( */ /* for trans header */ - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); unit_bytes += sizeof(xfs_trans_header_t); /* for start-rec */ - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); /* * for LR headers - the space for data in an iclog is the size minus @@ -3180,12 +3181,12 @@ xlog_calc_unit_res( num_headers = howmany(unit_bytes, iclog_space); /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; + unit_bytes += sizeof(struct xlog_op_header) * num_headers; /* add extra header reservations if we overrun */ while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) { - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); num_headers++; } unit_bytes += log->l_iclog_hsize * num_headers; @@ -3322,7 +3323,7 @@ xlog_verify_iclog( struct xlog_in_core *iclog, int count) { - xlog_op_header_t *ophead; + struct xlog_op_header *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; void *base_ptr, *ptr, *p; @@ -3400,7 +3401,7 @@ xlog_verify_iclog( op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); } } - ptr += sizeof(xlog_op_header_t) + op_len; + ptr += sizeof(struct xlog_op_header) + op_len; } } #endif From 05f17dcbfd5dbe309af310508d8830ac4e0c5d4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:52 -0700 Subject: [PATCH 1835/3206] xfs: remove the xfs_trans_header_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/libxfs/xfs_log_recover.h | 2 +- fs/xfs/xfs_log.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 367dfdece9be18..2c3c5e67f78a6d 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -212,12 +212,12 @@ typedef struct xfs_log_iovec { * Do not change the below structure without redoing the code in * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). */ -typedef struct xfs_trans_header { +struct xfs_trans_header { uint th_magic; /* magic number */ uint th_type; /* transaction type */ int32_t th_tid; /* transaction id (unused) */ uint th_num_items; /* num items logged by trans */ -} xfs_trans_header_t; +}; #define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 95de2309503069..9e712e62369c47 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -111,7 +111,7 @@ struct xlog_recover_item { struct xlog_recover { struct hlist_node r_list; xlog_tid_t r_log_tid; /* log's transaction id */ - xfs_trans_header_t r_theader; /* trans header for partial */ + struct xfs_trans_header r_theader; /* trans header for partial */ int r_state; /* not needed */ xfs_lsn_t r_lsn; /* xact lsn */ struct list_head r_itemq; /* q for items */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 94e06c22be6730..7c590ecdf86549 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3155,7 +3155,7 @@ xlog_calc_unit_res( /* for trans header */ unit_bytes += sizeof(struct xlog_op_header); - unit_bytes += sizeof(xfs_trans_header_t); + unit_bytes += sizeof(struct xfs_trans_header); /* for start-rec */ unit_bytes += sizeof(struct xlog_op_header); From 476688c8ac60da9bfcb3ce7f5a2d30a145ef7f76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:53 -0700 Subject: [PATCH 1836/3206] xfs: remove the xfs_extent_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Also fix up the comment about the struct xfs_extent definition to be correct and read more easily. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 2c3c5e67f78a6d..6d0cad455a8f37 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -605,16 +605,17 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf) /* * EFI/EFD log format definitions */ -typedef struct xfs_extent { +struct xfs_extent { xfs_fsblock_t ext_start; xfs_extlen_t ext_len; -} xfs_extent_t; +}; /* - * Since an xfs_extent_t has types (start:64, len: 32) - * there are different alignments on 32 bit and 64 bit kernels. - * So we provide the different variants for use by a - * conversion routine. + * Since the fields in struct xfs_extent add up to 96 bits, it has + * different alignments on i386 vs all other architectures, because i386 + * does not pad structures to their natural alignment. + * + * Provide the different variants for use by a conversion routine. 
*/ typedef struct xfs_extent_32 { uint64_t ext_start; @@ -637,7 +638,7 @@ typedef struct xfs_efi_log_format { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[]; /* array of extents to free */ + struct xfs_extent efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_t; static inline size_t @@ -690,7 +691,7 @@ typedef struct xfs_efd_log_format { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[]; /* array of extents freed */ + struct xfs_extent efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_t; static inline size_t From 7eaf684bc48923b5584fc119e8c477be2cdb3eb2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:54 -0700 Subject: [PATCH 1837/3206] xfs: remove the xfs_extent32_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 6d0cad455a8f37..f11ba20a164efc 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -617,10 +617,10 @@ struct xfs_extent { * * Provide the different variants for use by a conversion routine. */ -typedef struct xfs_extent_32 { +struct xfs_extent_32 { uint64_t ext_start; uint32_t ext_len; -} __attribute__((packed)) xfs_extent_32_t; +} __attribute__((packed)); typedef struct xfs_extent_64 { uint64_t ext_start; @@ -654,7 +654,7 @@ typedef struct xfs_efi_log_format_32 { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[]; /* array of extents to free */ + struct xfs_extent_32 efi_extents[]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; static inline size_t @@ -707,7 +707,7 @@ typedef struct xfs_efd_log_format_32 { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[]; /* array of extents freed */ + struct xfs_extent_32 efd_extents[]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; static inline size_t From 72628b6f459ea4fed3003db8161b52ee746442d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:55 -0700 Subject: [PATCH 1838/3206] xfs: remove the xfs_extent64_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index f11ba20a164efc..2b270912e5384c 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -622,11 +622,11 @@ struct xfs_extent_32 { uint32_t ext_len; } __attribute__((packed)); -typedef struct xfs_extent_64 { +struct xfs_extent_64 { uint64_t ext_start; uint32_t ext_len; uint32_t ext_pad; -} xfs_extent_64_t; +}; /* * This is the structure used to lay out an efi log item in the @@ -670,7 +670,7 @@ typedef struct xfs_efi_log_format_64 { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[]; /* array of extents to free */ + struct xfs_extent_64 efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_64_t; static inline size_t @@ -723,7 +723,7 @@ typedef struct xfs_efd_log_format_64 { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[]; /* array of extents freed */ + struct xfs_extent_64 efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_64_t; static inline size_t From 655d9ec7bd9e38735ae36dbc635a9161a046f7b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:56 -0700 Subject: [PATCH 1839/3206] xfs: remove the xfs_efi_log_format_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/xfs_extfree_item.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 2b270912e5384c..81c84c8a66fd7a 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -633,13 +633,13 @@ struct xfs_extent_64 { * log. The efi_extents field is a variable size array whose * size is given by efi_nextents. */ -typedef struct xfs_efi_log_format { +struct xfs_efi_log_format { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ struct xfs_extent efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_t; +}; static inline size_t xfs_efi_log_format_sizeof( diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index c8402040410b54..e56376853f2d25 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -49,7 +49,7 @@ struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; - xfs_efi_log_format_t efi_format; + struct xfs_efi_log_format efi_format; }; static inline size_t From 68c9f8444ae930343a2c900cb909825bc8f7304a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:57 -0700 Subject: [PATCH 1840/3206] xfs: remove the xfs_efi_log_format_32_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/xfs_extfree_item.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 81c84c8a66fd7a..75cc8b9be598b5 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -649,13 +649,13 @@ xfs_efi_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efi_log_format_32 { +struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ struct xfs_extent_32 efi_extents[]; /* array of extents to free */ -} __attribute__((packed)) xfs_efi_log_format_32_t; +} __attribute__((packed)); static inline size_t xfs_efi_log_format32_sizeof( diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 47ee598a982703..0190cc55d75b06 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -202,7 +202,7 @@ xfs_efi_copy_format( sizeof(struct xfs_extent)); return 0; } else if (buf->iov_len == len32) { - xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base; + struct xfs_efi_log_format_32 *src_efi_fmt_32 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; From 3fe5abc2bf4db88c7c9c99e8a1f5b3d1336d528f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:58 -0700 Subject: [PATCH 1841/3206] xfs: remove the xfs_efi_log_format_64_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/xfs_extfree_item.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 75cc8b9be598b5..358f7cb43be26d 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -665,13 +665,13 @@ xfs_efi_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efi_log_format_64 { +struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ struct xfs_extent_64 efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_64_t; +}; static inline size_t xfs_efi_log_format64_sizeof( diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 0190cc55d75b06..418ddab590e06b 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -216,7 +216,7 @@ xfs_efi_copy_format( } return 0; } else if (buf->iov_len == len64) { - xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base; + struct xfs_efi_log_format_64 *src_efi_fmt_64 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; From 0a33d5ad8a46d1f63174d2684b1d743bd6090554 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:59 -0700 Subject: [PATCH 1842/3206] xfs: remove the xfs_efd_log_format_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/xfs_extfree_item.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 358f7cb43be26d..cb63bb156db14c 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -686,13 +686,13 @@ xfs_efi_log_format64_sizeof( * log. The efd_extents array is a variable size array whose * size is given by efd_nextents; */ -typedef struct xfs_efd_log_format { +struct xfs_efd_log_format { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ struct xfs_extent efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_t; +}; static inline size_t xfs_efd_log_format_sizeof( diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index e56376853f2d25..af1b0331f7afa4 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -69,7 +69,7 @@ struct xfs_efd_log_item { struct xfs_log_item efd_item; struct xfs_efi_log_item *efd_efip; uint efd_next_extent; - xfs_efd_log_format_t efd_format; + struct xfs_efd_log_format efd_format; }; static inline size_t From a0cb349672f9ac2dcd80afa3dd25e2df2842db7a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:00 -0700 Subject: [PATCH 1843/3206] xfs: remove the unused xfs_efd_log_format_32_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index cb63bb156db14c..2155cc6b2aaeb0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -702,13 +702,13 @@ xfs_efd_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efd_log_format_32 { +struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ struct xfs_extent_32 efd_extents[]; /* array of extents freed */ -} __attribute__((packed)) xfs_efd_log_format_32_t; +} __attribute__((packed)); static inline size_t xfs_efd_log_format32_sizeof( From 3dde08b64c98cf76b2e2378ecf36351464e2972a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:01 -0700 Subject: [PATCH 1844/3206] xfs: remove the unused xfs_efd_log_format_64_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 2155cc6b2aaeb0..aa8e3b557733f3 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -718,13 +718,13 @@ xfs_efd_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efd_log_format_64 { +struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ struct xfs_extent_64 efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_64_t; +}; static inline size_t xfs_efd_log_format64_sizeof( From 1b5c7cc8f8c54858f69311290d5ade12627ff233 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:02 -0700 Subject: [PATCH 1845/3206] xfs: remove the unused xfs_buf_log_format_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index aa8e3b557733f3..631af2e28c561b 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -541,7 +541,7 @@ struct xfs_log_dinode { #define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) #define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1) -typedef struct xfs_buf_log_format { +struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ unsigned short blf_flags; /* misc state */ @@ -549,7 +549,7 @@ typedef struct xfs_buf_log_format { int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ -} xfs_buf_log_format_t; +}; /* * All buffers now need to tell recovery where the magic number From ae1ef3272b31e6bccd9f2014e8e8c41887a5137b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:03 -0700 Subject: [PATCH 1846/3206] xfs: remove the unused xfs_dq_logformat_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 631af2e28c561b..fff3a2aaeefdd8 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -957,14 +957,14 @@ struct xfs_xmd_log_format { * The first two fields must be the type and size fitting into * 32 bits : log_recovery code assumes that. */ -typedef struct xfs_dq_logformat { +struct xfs_dq_logformat { uint16_t qlf_type; /* dquot log item type */ uint16_t qlf_size; /* size of this item */ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ int64_t qlf_blkno; /* blkno of dquot buffer */ int32_t qlf_len; /* len of dquot buffer */ uint32_t qlf_boffset; /* off of dquot in buffer */ -} xfs_dq_logformat_t; +}; /* * log format struct for QUOTAOFF records. 
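The typedef removals above are purely mechanical and leave the on-disk layout alone; the variable-size log items keep being sized through their flexible-array members. Below is a small user-space sketch (not kernel code) of that sizing convention, using the struct xfs_extent_64 / struct xfs_efi_log_format_64 layouts from the diffs above. Note the explicit ext_pad field, which keeps each extent record at 16 bytes even on i386, where uint64_t is only 4-byte aligned:

#include <stdint.h>
#include <stdio.h>

/* layouts copied from the diffs above */
struct xfs_extent_64 {
	uint64_t ext_start;
	uint32_t ext_len;
	uint32_t ext_pad;	/* explicit padding: 16 bytes on every arch */
};

struct xfs_efi_log_format_64 {
	uint16_t efi_type;	/* efi log item type */
	uint16_t efi_size;	/* size of this item */
	uint32_t efi_nextents;	/* # extents to free */
	uint64_t efi_id;	/* efi identifier */
	struct xfs_extent_64 efi_extents[];	/* adds no bytes to sizeof */
};

/* mirrors xfs_efi_log_format64_sizeof() from the diffs above */
static size_t efi_log_format64_sizeof(unsigned int nr)
{
	return sizeof(struct xfs_efi_log_format_64) +
	       nr * sizeof(struct xfs_extent_64);
}

int main(void)
{
	printf("%zu\n", sizeof(struct xfs_efi_log_format_64));	/* 16 */
	printf("%zu\n", efi_log_format64_sizeof(3));		/* 16 + 3*16 = 64 */
	return 0;
}

Dropping the typedefs changes none of these sizes or offsets, which is why the conversions need no corresponding change to log recovery.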
From bf0013f59ccdb283083f0451f6edc50ff98e68c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:04 -0700 Subject: [PATCH 1847/3206] xfs: remove the unused xfs_qoff_logformat_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index fff3a2aaeefdd8..49c4a33166a6a3 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -974,12 +974,12 @@ struct xfs_dq_logformat { * to the first and ensures that the first logitem is taken out of the AIL * only when the last one is securely committed. */ -typedef struct xfs_qoff_logformat { +struct xfs_qoff_logformat { unsigned short qf_type; /* quotaoff log item type */ unsigned short qf_size; /* size of this item */ unsigned int qf_flags; /* USR and/or GRP */ char qf_pad[12]; /* padding for future */ -} xfs_qoff_logformat_t; +}; /* * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. From 3e5bdfe48e1f159de7ca3b23a6afa6c10f2a9ad2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:05 -0700 Subject: [PATCH 1848/3206] xfs: remove the unused xfs_log_iovec_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 49c4a33166a6a3..a42a8321172459 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -194,12 +194,11 @@ typedef union xlog_in_core2 { } xlog_in_core_2_t; /* not an on-disk structure, but needed by log recovery in userspace */ -typedef struct xfs_log_iovec { +struct xfs_log_iovec { void *i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ uint i_type; /* type of region */ -} xfs_log_iovec_t; - +}; /* * Transaction Header definitions. From 0b737f4ac1d3ec093347241df74bbf5f54a7e16c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:20:29 -0700 Subject: [PATCH 1849/3206] xfs: rename the old_crc variable in xlog_recover_process old_crc is a very misleading name. Rename it to expected_crc as that describes the usage much better. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_log_recover.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e6ed9e09c02710..0a4db8efd9033f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2894,20 +2894,19 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - __le32 old_crc = rhead->h_crc; - __le32 crc; + __le32 expected_crc = rhead->h_crc, crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets old_crc to 0 so we must consider this valid even on v5 supers. - * Otherwise, return EFSBADCRC on failure so the callers up the stack - * know precisely what failed. + * sets expected_crc to 0 so we must consider this valid even on v5 + * supers. Otherwise, return EFSBADCRC on failure so the callers up the + * stack know precisely what failed. 
*/ if (pass == XLOG_RECOVER_CRCPASS) { - if (old_crc && crc != old_crc) + if (expected_crc && crc != expected_crc) return -EFSBADCRC; return 0; } @@ -2918,11 +2917,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != old_crc) { - if (old_crc || xfs_has_crc(log->l_mp)) { + if (crc != expected_crc) { + if (expected_crc || xfs_has_crc(log->l_mp)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(old_crc), + le32_to_cpu(expected_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } From e747883c7d7306acb4d683038d881528fbfbe749 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:20:30 -0700 Subject: [PATCH 1850/3206] xfs: fix log CRC mismatches between i386 and other architectures When mounting file systems with a log that was dirtied on i386 on other architectures or vice versa, log recovery is unhappy: [ 11.068052] XFS (vdb): Torn write (CRC failure) detected at log block 0x2. Truncating head block from 0xc. This is because the CRCs generated by i386 and other architectures always differ. The reason for that is that sizeof(struct xlog_rec_header) returns different values for i386 vs the rest (324 vs 328), because the struct is not sizeof(uint64_t) aligned, and i386 has odd struct size alignment rules. This issue goes back to commit 13cdc853c519 ("Add log versioning, and new super block field for the log stripe") in the xfs-import tree, which adds log v2 support and the h_size field that causes the unaligned size. At that time it only mattered for the crude debug-only log header checksum, but with commit 0e446be44806 ("xfs: add CRC checks to the log") it became a real issue for v5 file systems, because now there is a proper CRC, and regular builds actually expect it to match. Fix this by allowing checksums with and without the padding. Fixes: 0e446be44806 ("xfs: add CRC checks to the log") Cc: # v3.8 Signed-off-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 30 +++++++++++++++++++++++++++++- fs/xfs/libxfs/xfs_ondisk.h | 2 ++ fs/xfs/xfs_log.c | 8 ++++---- fs/xfs/xfs_log_priv.h | 4 ++-- fs/xfs/xfs_log_recover.c | 19 +++++++++++++++++-- 5 files changed, 54 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a42a8321172459..fd00b77af32b9d 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -173,12 +173,40 @@ typedef struct xlog_rec_header { __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; - /* new fields */ + + /* fields added by the Linux port: */ __be32 h_fmt; /* format of log record : 4 */ uuid_t h_fs_uuid; /* uuid of FS : 16 */ + + /* fields added for log v2: */ __be32 h_size; /* iclog size : 4 */ + + /* + * When h_size was added for log v2 support, it caused the structure to have + * a different size on i386 vs all other architectures because the + * sum of the size of the members is not aligned to that of the largest + * __be64-sized member, and i386 has really odd struct alignment rules. + * + * Due to the way the log headers are laid out on-disk that alone is + * not a problem because the xlog_rec_header always sits alone in a + * BBSIZE-sized area, and the rest of that area is padded with zeroes. 
+ * But xlog_cksum used to calculate the checksum based on the structure + * size, and thus gives different checksums for i386 vs the rest. + * We now do two checksum validation passes for both sizes to allow + * moving v5 file systems with unclean logs between i386 and other + * (little-endian) architectures. + */ + __u32 h_pad0; } xlog_rec_header_t; +#ifdef __i386__ +#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size) +#define XLOG_REC_SIZE_OTHER sizeof(struct xlog_rec_header) +#else +#define XLOG_REC_SIZE sizeof(struct xlog_rec_header) +#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size) +#endif /* __i386__ */ + typedef struct xlog_rec_ext_header { __be32 xh_cycle; /* write cycle of log : 4 */ __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 5ed44fdf749105..7bfa3242e2c536 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -174,6 +174,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 328); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 260); XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7c590ecdf86549..2978de9da38edb 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1568,13 +1568,13 @@ xlog_cksum( struct xlog *log, struct xlog_rec_header *rhead, char *dp, - int size) + unsigned int hdrsize, + unsigned int size) { uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum_update((char *)rhead, - sizeof(struct xlog_rec_header), + crc = xfs_start_cksum_update((char *)rhead, hdrsize, offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... */ @@ -1818,7 +1818,7 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, - iclog->ic_datap, size); + iclog->ic_datap, XLOG_REC_SIZE, size); /* * Intentionally corrupt the log record CRC based on the error injection * frequency, if defined. 
This facilitates testing log recovery in the diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index a9a7a271c15bb7..0cfc654d8e872b 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -499,8 +499,8 @@ xlog_recover_finish( extern void xlog_recover_cancel(struct xlog *); -extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, - char *dp, int size); +__le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, unsigned int hdrsize, unsigned int size); extern struct kmem_cache *xfs_log_ticket_cache; struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0a4db8efd9033f..549d60959aee5b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2894,9 +2894,24 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - __le32 expected_crc = rhead->h_crc, crc; + __le32 expected_crc = rhead->h_crc, crc, other_crc; - crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE, + be32_to_cpu(rhead->h_len)); + + /* + * Look at the end of the struct xlog_rec_header definition in + * xfs_log_format.h for the glory details. + */ + if (expected_crc && crc != expected_crc) { + other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER, + be32_to_cpu(rhead->h_len)); + if (other_crc == expected_crc) { + xfs_notice_once(log->l_mp, + "Fixing up incorrect CRC due to padding."); + crc = other_crc; + } + } /* * Nothing else to do if this is a CRC verification pass. Just return From 94deac977fbd0246c971b4f1d17a6385f5e0b1a4 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 1 Sep 2025 10:52:04 +0000 Subject: [PATCH 1851/3206] fs: add an enum for number of life time hints Add WRITE_LIFE_HINT_NR into the rw_hint enum to define the number of values write life time hints can be set to. This is useful for e.g. file systems which may want to map these values to allocation groups. Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- include/linux/rw_hint.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h index 309ca72f2dfb63..adcc43042c9021 100644 --- a/include/linux/rw_hint.h +++ b/include/linux/rw_hint.h @@ -14,6 +14,7 @@ enum rw_hint { WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, + WRITE_LIFE_HINT_NR, } __packed; /* Sparse ignores __packed annotations on enums, hence the #ifndef below. */ From 0301dae732a5402a68fdb8d8461b97da6b9bccc6 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 1 Sep 2025 10:52:05 +0000 Subject: [PATCH 1852/3206] xfs: refactor hint based zone allocation Replace the co-location code with a matrix that makes it more clear on how the decisions are made. The matrix contains scores for zone/file hint combinations. A "GOOD" score for an open zone will result in immediate co-location while "OK" combinations will only be picked if we cannot open a new zone. 
Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 122 ++++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 60 deletions(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index f28214c28ab545..ff24769b88702e 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -493,64 +493,64 @@ xfs_try_open_zone( return oz; } +enum xfs_zone_alloc_score { + /* Any open zone will do it, we're desperate */ + XFS_ZONE_ALLOC_ANY = 0, + + /* It better fit somehow */ + XFS_ZONE_ALLOC_OK = 1, + + /* Only reuse a zone if it fits really well. */ + XFS_ZONE_ALLOC_GOOD = 2, +}; + /* - * For data with short or medium lifetime, try to colocated it into an - * already open zone with a matching temperature. + * Life time hint co-location matrix. Fields not set default to 0 + * aka XFS_ZONE_ALLOC_ANY. */ -static bool -xfs_colocate_eagerly( - enum rw_hint file_hint) -{ - switch (file_hint) { - case WRITE_LIFE_MEDIUM: - case WRITE_LIFE_SHORT: - case WRITE_LIFE_NONE: - return true; - default: - return false; - } -} - -static bool -xfs_good_hint_match( - struct xfs_open_zone *oz, - enum rw_hint file_hint) -{ - switch (oz->oz_write_hint) { - case WRITE_LIFE_LONG: - case WRITE_LIFE_EXTREME: - /* colocate long and extreme */ - if (file_hint == WRITE_LIFE_LONG || - file_hint == WRITE_LIFE_EXTREME) - return true; - break; - case WRITE_LIFE_MEDIUM: - /* colocate medium with medium */ - if (file_hint == WRITE_LIFE_MEDIUM) - return true; - break; - case WRITE_LIFE_SHORT: - case WRITE_LIFE_NONE: - case WRITE_LIFE_NOT_SET: - /* colocate short and none */ - if (file_hint <= WRITE_LIFE_SHORT) - return true; - break; - } - return false; -} +static const unsigned int +xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = { + [WRITE_LIFE_NOT_SET] = { + [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_NONE] = { + [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_GOOD, + [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_SHORT] = { + [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_GOOD, + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_GOOD, + [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_MEDIUM] = { + [WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_LONG] = { + [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_EXTREME] = { + [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK, + }, +}; static bool xfs_try_use_zone( struct xfs_zone_info *zi, enum rw_hint file_hint, struct xfs_open_zone *oz, - bool lowspace) + unsigned int goodness) { if (oz->oz_allocated == rtg_blocks(oz->oz_rtg)) return false; - if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + + if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness) return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) return false; @@ -581,14 +581,14 @@ static struct xfs_open_zone * xfs_select_open_zone_lru( struct xfs_zone_info *zi, enum rw_hint file_hint, - bool lowspace) + unsigned int goodness) { struct xfs_open_zone *oz; lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) + if (xfs_try_use_zone(zi, file_hint, oz, goodness)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); @@ -651,9 +651,11 @@ xfs_select_zone_nowait( * data. 
*/ spin_lock(&zi->zi_open_zones_lock); - if (xfs_colocate_eagerly(write_hint)) - oz = xfs_select_open_zone_lru(zi, write_hint, false); - else if (pack_tight) + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD); + if (oz) + goto out_unlock; + + if (pack_tight) oz = xfs_select_open_zone_mru(zi, write_hint); if (oz) goto out_unlock; @@ -667,16 +669,16 @@ xfs_select_zone_nowait( goto out_unlock; /* - * Try to colocate cold data with other cold data if we failed to open a - * new zone for it. + * Try to find a zone that is an ok match to colocate data with. + */ + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK); + if (oz) + goto out_unlock; + + /* + * Pick the least recently used zone, regardless of hint match */ - if (write_hint != WRITE_LIFE_NOT_SET && - !xfs_colocate_eagerly(write_hint)) - oz = xfs_select_open_zone_lru(zi, write_hint, false); - if (!oz) - oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); - if (!oz) - oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY); out_unlock: spin_unlock(&zi->zi_open_zones_lock); return oz; From 8e2cdd8e18ff5073ad76ab2220910001eae39398 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 1 Sep 2025 10:52:05 +0000 Subject: [PATCH 1853/3206] xfs: adjust the hint based zone allocation policy As we really can't make any general assumptions about files that don't have any life time hint set or are set to "NONE", adjust the allocation policy to avoid co-locating data from those files with files that have a set life time. Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index ff24769b88702e..23a027387933d5 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -512,17 +512,11 @@ static const unsigned int xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = { [WRITE_LIFE_NOT_SET] = { [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, - [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK, - [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_OK, }, [WRITE_LIFE_NONE] = { - [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, - [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_GOOD, - [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD, + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK, }, [WRITE_LIFE_SHORT] = { - [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_GOOD, - [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_GOOD, [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD, }, [WRITE_LIFE_MEDIUM] = { From d733f18a6da6fb719450d5122162556d785ed580 Mon Sep 17 00:00:00 2001 From: Aditya Bodkhe Date: Tue, 16 Sep 2025 10:10:34 +0530 Subject: [PATCH 1854/3206] powerpc/ftrace: support CONFIG_FUNCTION_GRAPH_RETVAL commit a1be9ccc57f0 ("function_graph: Support recording and printing the return value of function") introduced support for function graph return value tracing. Additionally, commit a3ed4157b7d8 ("fgraph: Replace fgraph_ret_regs with ftrace_regs") further refactored and optimized the implementation, making `struct fgraph_ret_regs` unnecessary. This patch enables the above modifications for powerpc as well, ensuring that function graph return value tracing is available on this architecture. In this patch we have redefined two functions: - 'ftrace_regs_get_return_value()' - the existing implementation on ppc returns the negative of the return value based on some conditions not relevant to our patch. 
- 'ftrace_regs_get_frame_pointer()' - always returns 0 in the current code. We also allocate stack space equivalent to 'SWITCH_FRAME_SIZE', allowing us to directly use predefined offsets like 'GPR3' and 'GPR4'; this keeps the code clean and consistent with already defined offsets. After this patch, v6.14+ kernels can also be built with FPROBE on powerpc, but there are a few other build and runtime dependencies for FPROBE to work properly. The next patch addresses them. Tested-by: Venkat Rao Bagalkote Reviewed-by: Christophe Leroy Signed-off-by: Aditya Bodkhe Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250916044035.29033-1-adityab1@linux.ibm.com --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/ftrace.h | 15 +++++++++ arch/powerpc/kernel/trace/ftrace_entry.S | 42 ++++++++++++++---------- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index ba9750c8e0cd8b..449404d3932bfe 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -246,6 +246,7 @@ config PPC select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1 select HAVE_FUNCTION_ERROR_INJECTION + select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER if !COMPILE_TEST && (PPC64 || (PPC32 && CC_IS_GCC)) select HAVE_GCC_PLUGINS diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index bd61a230b19d08..5984eaa75ce8dd 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -50,6 +50,21 @@ static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs * asm volatile("mfmsr %0" : "=r" ((_regs)->msr)); \ } while (0) +#undef ftrace_regs_get_return_value +static __always_inline unsigned long +ftrace_regs_get_return_value(const struct ftrace_regs *fregs) +{ + return arch_ftrace_regs(fregs)->regs.gpr[3]; +} +#define ftrace_regs_get_return_value ftrace_regs_get_return_value + +#undef ftrace_regs_get_frame_pointer +static __always_inline unsigned long +ftrace_regs_get_frame_pointer(const struct ftrace_regs *fregs) +{ + return arch_ftrace_regs(fregs)->regs.gpr[1]; +} + static __always_inline void ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip) diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S b/arch/powerpc/kernel/trace/ftrace_entry.S index 3565c67fc63859..6599fe3c62347f 100644 --- a/arch/powerpc/kernel/trace/ftrace_entry.S +++ b/arch/powerpc/kernel/trace/ftrace_entry.S @@ -409,23 +409,31 @@ EXPORT_SYMBOL(_mcount) _GLOBAL(return_to_handler) /* need to save return values */ #ifdef CONFIG_PPC64 - std r4, -32(r1) - std r3, -24(r1) + stdu r1, -SWITCH_FRAME_SIZE(r1) + std r4, GPR4(r1) + std r3, GPR3(r1) + /* Save previous stack pointer (r1) */ + addi r3, r1, SWITCH_FRAME_SIZE + std r3, GPR1(r1) /* save TOC */ - std r2, -16(r1) - std r31, -8(r1) + std r2, 24(r1) + std r31, 32(r1) mr r31, r1 - stdu r1, -112(r1) - + /* pass ftrace_regs/pt_regs to ftrace_return_to_handler */ + addi r3, r1, STACK_INT_FRAME_REGS /* * We might be called from a module. * Switch to our TOC to run inside the core kernel. 
*/ LOAD_PACA_TOC() #else - stwu r1, -16(r1) - stw r3, 8(r1) - stw r4, 12(r1) + stwu r1, -SWITCH_FRAME_SIZE(r1) + stw r4, GPR4(r1) + stw r3, GPR3(r1) + addi r3, r1, SWITCH_FRAME_SIZE + stw r3, GPR1(r1) + /* pass ftrace_regs/pt_regs to ftrace_return_to_handler */ + addi r3, r1, STACK_INT_FRAME_REGS #endif bl ftrace_return_to_handler @@ -435,15 +443,15 @@ _GLOBAL(return_to_handler) mtlr r3 #ifdef CONFIG_PPC64 - ld r1, 0(r1) - ld r4, -32(r1) - ld r3, -24(r1) - ld r2, -16(r1) - ld r31, -8(r1) + ld r4, GPR4(r1) + ld r3, GPR3(r1) + ld r2, 24(r1) + ld r31, 32(r1) + ld r1, 0(r1) #else - lwz r3, 8(r1) - lwz r4, 12(r1) - addi r1, r1, 16 + lwz r3, GPR3(r1) + lwz r4, GPR4(r1) + addi r1, r1, SWITCH_FRAME_SIZE #endif /* Jump back to real return address */ From 7cec88bfdd33434d62d4d5ba664127fa175b50e7 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Tue, 16 Sep 2025 10:10:35 +0530 Subject: [PATCH 1855/3206] powerpc/fprobe: fix updated fprobe for function-graph tracer Since commit 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer"), FPROBE depends on HAVE_FUNCTION_GRAPH_FREGS. With the previous patch adding HAVE_FUNCTION_GRAPH_FREGS for powerpc, FPROBE can be enabled on powerpc. But with the commit b5fa903b7f7c ("fprobe: Add fprobe_header encoding feature"), the asm/fprobe.h header is needed to define arch dependent encode/decode macros. The fprobe header MSB pattern on powerpc is not 0xf. So, define FPROBE_HEADER_MSB_PATTERN expected on powerpc. Also, commit 762abbc0d09f ("fprobe: Use ftrace_regs in fprobe exit handler") introduced HAVE_FTRACE_REGS_HAVING_PT_REGS for archs that have pt_regs in ftrace_regs. Advertise that on powerpc to reuse common definitions like ftrace_partial_regs(). Acked-by: Masami Hiramatsu (Google) Tested-by: Venkat Rao Bagalkote Signed-off-by: Hari Bathini Signed-off-by: Aditya Bodkhe Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250916044035.29033-2-adityab1@linux.ibm.com --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/fprobe.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 arch/powerpc/include/asm/fprobe.h diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 449404d3932bfe..325c1171894db9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -243,6 +243,7 @@ config PPC select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_GUP_FAST select HAVE_FTRACE_GRAPH_FUNC + select HAVE_FTRACE_REGS_HAVING_PT_REGS select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1 select HAVE_FUNCTION_ERROR_INJECTION diff --git a/arch/powerpc/include/asm/fprobe.h b/arch/powerpc/include/asm/fprobe.h new file mode 100644 index 00000000000000..d64bc28fb3d3c3 --- /dev/null +++ b/arch/powerpc/include/asm/fprobe.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_PPC_FPROBE_H +#define _ASM_PPC_FPROBE_H + +#include + +#ifdef CONFIG_64BIT +#undef FPROBE_HEADER_MSB_PATTERN +#define FPROBE_HEADER_MSB_PATTERN (PAGE_OFFSET & ~FPROBE_HEADER_MSB_MASK) +#endif + +#endif /* _ASM_PPC_FPROBE_H */ From 9316512b717f6f25c4649b3fdb0a905b6a318e9f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Sep 2025 12:03:49 +0200 Subject: [PATCH 1856/3206] powerpc/32: Remove PAGE_KERNEL_TEXT to fix startup failure PAGE_KERNEL_TEXT is an old macro that is used to tell the kernel whether kernel text has to be mapped read-only or read-write based on build time options. But nowadays, with functionalities like jump_labels, static links, etc ... 
more or less all kernels need to be read-write at some point, and some combinations of configs failed to work due to inaccurate setting of PAGE_KERNEL_TEXT. On the other hand, today we have CONFIG_STRICT_KERNEL_RWX which implements more controlled access to kernel modifications. Instead of trying to keep PAGE_KERNEL_TEXT accurate with all possible options that may imply kernel text modification, always set kernel text read-write at startup and rely on CONFIG_STRICT_KERNEL_RWX to provide accurate protection. Do this by passing PAGE_KERNEL_X to map_kernel_page() in __mapin_ram_chunk() instead of passing PAGE_KERNEL_TEXT. Once this is done, the only remaining user of PAGE_KERNEL_TEXT is mmu_mark_initmem_nx() which uses it in a call to setibat(). As setibat() ignores the RW/RO, we can seamlessly replace PAGE_KERNEL_TEXT by PAGE_KERNEL_X here as well and get rid of PAGE_KERNEL_TEXT completely. Reported-by: Erhard Furtner Closes: https://lore.kernel.org/all/342b4120-911c-4723-82ec-d8c9b03a8aef@mailbox.org/ Signed-off-by: Christophe Leroy Tested-by: Andrew Donnellan Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/8e2d793abf87ae3efb8f6dce10f974ac0eda61b8.1757412205.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/pgtable.h | 12 ------------ arch/powerpc/mm/book3s32/mmu.c | 4 ++-- arch/powerpc/mm/pgtable_32.c | 2 +- 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index fd29fb289d1eb2..17fd7ff6e535b6 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -20,18 +20,6 @@ struct mm_struct; #include #endif /* !CONFIG_PPC_BOOK3S */ -/* - * Protection used for kernel text. We want the debuggers to be able to - * set breakpoints anywhere, so don't write protect the kernel text - * on platforms where such control is possible. - */ -#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \ - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) -#define PAGE_KERNEL_TEXT PAGE_KERNEL_X -#else -#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX -#endif - /* Make modules code happy. We don't set RO yet */ #define PAGE_KERNEL_EXEC PAGE_KERNEL_X diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index be9c4106e22f04..c42ecdf94e48cd 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -204,7 +204,7 @@ int mmu_mark_initmem_nx(void) for (i = 0; i < nb - 1 && base < top;) { size = bat_block_size(base, top); - setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); base += size; } if (base < top) { @@ -215,7 +215,7 @@ int mmu_mark_initmem_nx(void) pr_warn("Some RW data is getting mapped X. " "Adjust CONFIG_DATA_SHIFT to avoid that.\n"); } - setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); base += size; } for (; i < nb; i++) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 15276068f657df..0c9ef705803e93 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -104,7 +104,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) p = memstart_addr + s; for (; s < top; s += PAGE_SIZE) { ktext = core_kernel_text(v); - map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL); + map_kernel_page(v, p, ktext ? 
PAGE_KERNEL_X : PAGE_KERNEL); v += PAGE_SIZE; p += PAGE_SIZE; } From 30ef245c6f5a6842d60308590cf26d0ae836fbf0 Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Mon, 15 Sep 2025 08:47:04 -0500 Subject: [PATCH 1857/3206] x86/bugs: Fix spectre_v2 forcing There were two oddities with spectre_v2 command line options. First, any option other than 'off' or 'auto' would force spectre_v2 mitigations even if the CPU (hypothetically) wasn't vulnerable to spectre_v2. That was inconsistent with all the other bugs where mitigations are ignored unless an explicit 'force' option is specified. Second, even though spectre_v2 mitigations would be enabled in these cases, the X86_BUG_SPECTRE_V2 bit wasn't set. This is again inconsistent with the forcing behavior of other bugs and arguably incorrect as it doesn't make sense to enable a mitigation if the X86_BUG bit isn't set. Fix both issues by only forcing spectre_v2 mitigations when the 'spectre_v2=on' option is specified (which was already called SPECTRE_V2_CMD_FORCE) and setting the relevant X86_BUG_* bits in that case. This also allows for simplifying bhi_update_mitigation() because spectre_v2_cmd will now always be SPECTRE_V2_CMD_NONE if the CPU is immune to spectre_v2. Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com --- arch/x86/kernel/cpu/bugs.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index f28a7383b88bdb..145f8777aa32d4 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2057,29 +2057,32 @@ static int __init spectre_v2_parse_cmdline(char *str) if (nospectre_v2) return 0; - if (!strcmp(str, "off")) + if (!strcmp(str, "off")) { spectre_v2_cmd = SPECTRE_V2_CMD_NONE; - else if (!strcmp(str, "on")) + } else if (!strcmp(str, "on")) { spectre_v2_cmd = SPECTRE_V2_CMD_FORCE; - else if (!strcmp(str, "retpoline")) + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER); + } else if (!strcmp(str, "retpoline")) { spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE; - else if (!strcmp(str, "retpoline,amd") || - !strcmp(str, "retpoline,lfence")) + } else if (!strcmp(str, "retpoline,amd") || + !strcmp(str, "retpoline,lfence")) { spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE; - else if (!strcmp(str, "retpoline,generic")) + } else if (!strcmp(str, "retpoline,generic")) { spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC; - else if (!strcmp(str, "eibrs")) + } else if (!strcmp(str, "eibrs")) { spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS; - else if (!strcmp(str, "eibrs,lfence")) + } else if (!strcmp(str, "eibrs,lfence")) { spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE; - else if (!strcmp(str, "eibrs,retpoline")) + } else if (!strcmp(str, "eibrs,retpoline")) { spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE; - else if (!strcmp(str, "auto")) + } else if (!strcmp(str, "auto")) { spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; - else if (!strcmp(str, "ibrs")) + } else if (!strcmp(str, "ibrs")) { spectre_v2_cmd = SPECTRE_V2_CMD_IBRS; - else + } else { pr_err("Ignoring unknown spectre_v2 option (%s).", str); + } return 0; } @@ -2232,10 +2235,6 @@ static void __init bhi_update_mitigation(void) { if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) bhi_mitigation = BHI_MITIGATION_OFF; - - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) - bhi_mitigation = BHI_MITIGATION_OFF; } static void __init 
bhi_apply_mitigation(void) @@ -2316,9 +2315,10 @@ static void __init spectre_v2_select_mitigation(void) spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; } - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; return; + } switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: From d1cc1baef67ac6c09b74629ca053bf3fb812f7dc Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Mon, 15 Sep 2025 08:47:05 -0500 Subject: [PATCH 1858/3206] x86/bugs: Fix reporting of LFENCE retpoline The LFENCE retpoline mitigation is not secure but the kernel prints inconsistent messages about this fact. The dmesg log says 'Mitigation: LFENCE', implying the system is mitigated. But sysfs reports 'Vulnerable: LFENCE' implying the system (correctly) is not mitigated. Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere when this mitigation is selected. Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com --- arch/x86/kernel/cpu/bugs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 145f8777aa32d4..66dbb3bd279123 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2032,7 +2032,7 @@ static void __init spectre_v2_user_apply_mitigation(void) static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", - [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", @@ -3559,9 +3559,6 @@ static const char *spectre_bhi_state(void) static ssize_t spectre_v2_show_state(char *buf) { - if (spectre_v2_enabled == SPECTRE_V2_LFENCE) - return sysfs_emit(buf, "Vulnerable: LFENCE\n"); - if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); From 930f2361fe542a00de9ce6070b1b6edb976f1165 Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Mon, 15 Sep 2025 08:47:06 -0500 Subject: [PATCH 1859/3206] x86/bugs: Report correct retbleed mitigation status On Intel CPUs, the default retbleed mitigation is IBRS/eIBRS but this requires that a similar spectre_v2 mitigation is applied. If the user selects a different spectre_v2 mitigation (like spectre_v2=retpoline) a warning is printed but sysfs will still report 'Mitigation: IBRS' or 'Mitigation: Enhanced IBRS'. This is incorrect because retbleed is not mitigated, and IBRS is not actually set. Fix this by choosing RETBLEED_MITIGATION_NONE in this scenario so the kernel correctly reports the system as vulnerable to retbleed. 
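All three of these reporting fixes lean on the same pattern in bugs.c: the user-visible state string is looked up from a table indexed by the mitigation enum, so dmesg and sysfs can only agree if the enum itself is kept honest. A stripped-down user-space sketch of that pattern (not the kernel code; enum values trimmed to the ones relevant here):

#include <stdio.h>

enum retbleed_mitigation { MITIG_NONE, MITIG_IBRS, MITIG_EIBRS };

static const char * const retbleed_strings[] = {
	[MITIG_NONE]  = "Vulnerable",
	[MITIG_IBRS]  = "Mitigation: IBRS",
	[MITIG_EIBRS] = "Mitigation: Enhanced IBRS",
};

/* stand-ins for the boot-time print and the sysfs show routine: both
 * read the same table entry, so resetting the enum fixes every report */
static void report(enum retbleed_mitigation m)
{
	printf("dmesg: retbleed: %s\n", retbleed_strings[m]);
	printf("sysfs: %s\n", retbleed_strings[m]);
}

int main(void)
{
	/* spectre_v2 ended up as plain retpoline on an Intel CPU, so IBRS
	 * was never actually enabled: fall back to NONE as the fix does */
	report(MITIG_NONE);
	return 0;
}

The LFENCE change in patch 1858 is the same move in the other direction: correcting the single table entry to "Vulnerable: LFENCE" makes both outputs consistent without special-casing the sysfs path.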
Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com --- arch/x86/kernel/cpu/bugs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 66dbb3bd279123..6a526ae1fe9933 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1462,8 +1462,10 @@ static void __init retbleed_update_mitigation(void) retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { pr_err(RETBLEED_INTEL_MSG); + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } } } From 42c21838708c20dd8ba605e4099bf6a7156c3362 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:24:13 -0700 Subject: [PATCH 1860/3206] xfs: move the XLOG_REG_ constants out of xfs_log_format.h These are purely in-memory values and not used at all in xfsprogs. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 37 ---------------------------------- fs/xfs/xfs_log.h | 37 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index fd00b77af32b9d..6c50cb2ece1972 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -86,43 +86,6 @@ struct xfs_unmount_log_format { uint32_t pad2; /* may as well make it 64 bits */ }; -/* Region types for iovec's i_type */ -#define XLOG_REG_TYPE_BFORMAT 1 -#define XLOG_REG_TYPE_BCHUNK 2 -#define XLOG_REG_TYPE_EFI_FORMAT 3 -#define XLOG_REG_TYPE_EFD_FORMAT 4 -#define XLOG_REG_TYPE_IFORMAT 5 -#define XLOG_REG_TYPE_ICORE 6 -#define XLOG_REG_TYPE_IEXT 7 -#define XLOG_REG_TYPE_IBROOT 8 -#define XLOG_REG_TYPE_ILOCAL 9 -#define XLOG_REG_TYPE_IATTR_EXT 10 -#define XLOG_REG_TYPE_IATTR_BROOT 11 -#define XLOG_REG_TYPE_IATTR_LOCAL 12 -#define XLOG_REG_TYPE_QFORMAT 13 -#define XLOG_REG_TYPE_DQUOT 14 -#define XLOG_REG_TYPE_QUOTAOFF 15 -#define XLOG_REG_TYPE_LRHEADER 16 -#define XLOG_REG_TYPE_UNMOUNT 17 -#define XLOG_REG_TYPE_COMMIT 18 -#define XLOG_REG_TYPE_TRANSHDR 19 -#define XLOG_REG_TYPE_ICREATE 20 -#define XLOG_REG_TYPE_RUI_FORMAT 21 -#define XLOG_REG_TYPE_RUD_FORMAT 22 -#define XLOG_REG_TYPE_CUI_FORMAT 23 -#define XLOG_REG_TYPE_CUD_FORMAT 24 -#define XLOG_REG_TYPE_BUI_FORMAT 25 -#define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_ATTRI_FORMAT 27 -#define XLOG_REG_TYPE_ATTRD_FORMAT 28 -#define XLOG_REG_TYPE_ATTR_NAME 29 -#define XLOG_REG_TYPE_ATTR_VALUE 30 -#define XLOG_REG_TYPE_XMI_FORMAT 31 -#define XLOG_REG_TYPE_XMD_FORMAT 32 -#define XLOG_REG_TYPE_ATTR_NEWNAME 33 -#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 -#define XLOG_REG_TYPE_MAX 34 - /* * Flags to log operation header * diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index af6daf4f67924b..dcc1f44ed68f90 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -20,6 +20,43 @@ struct xfs_log_vec { int lv_alloc_size; /* size of allocated lv */ }; +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 
+#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_RUI_FORMAT 21 +#define XLOG_REG_TYPE_RUD_FORMAT 22 +#define XLOG_REG_TYPE_CUI_FORMAT 23 +#define XLOG_REG_TYPE_CUD_FORMAT 24 +#define XLOG_REG_TYPE_BUI_FORMAT 25 +#define XLOG_REG_TYPE_BUD_FORMAT 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_XMI_FORMAT 31 +#define XLOG_REG_TYPE_XMD_FORMAT 32 +#define XLOG_REG_TYPE_ATTR_NEWNAME 33 +#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 +#define XLOG_REG_TYPE_MAX 34 + #define XFS_LOG_VEC_ORDERED (-1) /* From 8ad5294849379543782e290e8e670b69e4580a24 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Mon, 15 Sep 2025 17:27:50 +0100 Subject: [PATCH 1861/3206] ASoC: codecs: add new pm4125 audio codec driver The audio codec is found in Qualcomm PM2250/PM4125 PMICs and is used on platforms like Qualcomm QCM2290. It has a soundwire interface and corresponding RX and TX slave devices. It has only two output channels: HPH left and right. The line output (LO) is linked to HPHL so the hardware has some limitations regarding concurrent playback via HPH and LO for instance. The codec driver also uses the WCD MBHC framework. The MBHC functionality is implemented in a minimalistic way to enable IRQs and avoid various IRQ issues. Co-developed-by: Srinivas Kandagatla Signed-off-by: Srinivas Kandagatla Signed-off-by: Alexey Klimov Reviewed-by: Srinivas Kandagatla Tested-by: Srinivas Kandagatla Link: https://patch.msgid.link/20250915-pm4125_audio_codec_v1-v4-3-b247b64eec52@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/Kconfig | 18 + sound/soc/codecs/Makefile | 8 + sound/soc/codecs/pm4125-sdw.c | 545 ++++++++++ sound/soc/codecs/pm4125.c | 1780 +++++++++++++++++++++++++++++++++ sound/soc/codecs/pm4125.h | 307 ++++++ 5 files changed, 2658 insertions(+) create mode 100644 sound/soc/codecs/pm4125-sdw.c create mode 100644 sound/soc/codecs/pm4125.c create mode 100644 sound/soc/codecs/pm4125.h diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 6d7e4725d89cd3..eca0134a41f8b3 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -192,6 +192,7 @@ config SND_SOC_ALL_CODECS imply SND_SOC_PCM512x_SPI imply SND_SOC_PCM6240 imply SND_SOC_PEB2466 + imply SND_SOC_PM4125_SDW imply SND_SOC_RK3308 imply SND_SOC_RK3328 imply SND_SOC_RK817 @@ -1542,6 +1543,23 @@ config SND_SOC_PEB2466 To compile this driver as a module, choose M here: the module will be called snd-soc-peb2466. +config SND_SOC_PM4125 + depends on SND_SOC_PM4125_SDW + tristate + depends on SOUNDWIRE || !SOUNDWIRE + +config SND_SOC_PM4125_SDW + tristate "PM4125 audio codec - SDW" + select SND_SOC_PM4125 + select SND_SOC_WCD_MBHC + select REGMAP_IRQ + depends on SOUNDWIRE + select REGMAP_SOUNDWIRE + help + The PMIC PM4125 has an in-built audio codec IC used with SoCs + like QCM2290, and it is connected via soundwire and SPMI. + To compile this codec driver say Y or m. 
+ config SND_SOC_RK3308 tristate "Rockchip RK3308 audio CODEC" depends on ARM64 || COMPILE_TEST diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index a68c3d192a1b6c..9eed819eaf948e 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -222,6 +222,8 @@ snd-soc-pcm512x-i2c-y := pcm512x-i2c.o snd-soc-pcm512x-spi-y := pcm512x-spi.o snd-soc-pcm6240-y := pcm6240.o snd-soc-peb2466-y := peb2466.o +snd-soc-pm4125-y := pm4125.o +snd-soc-pm4125-sdw-y := pm4125-sdw.o snd-soc-rk3308-y := rk3308_codec.o snd-soc-rk3328-y := rk3328_codec.o snd-soc-rk817-y := rk817_codec.o @@ -642,6 +644,12 @@ obj-$(CONFIG_SND_SOC_PCM512x_I2C) += snd-soc-pcm512x-i2c.o obj-$(CONFIG_SND_SOC_PCM512x_SPI) += snd-soc-pcm512x-spi.o obj-$(CONFIG_SND_SOC_PCM6240) += snd-soc-pcm6240.o obj-$(CONFIG_SND_SOC_PEB2466) += snd-soc-peb2466.o +obj-$(CONFIG_SND_SOC_PM4125_SDW) += snd-soc-pm4125-sdw.o +obj-$(CONFIG_SND_SOC_PM4125) += snd-soc-pm4125.o +ifdef CONFIG_SND_SOC_PM4125_SDW +# avoid link failure by forcing sdw code built-in when needed +obj-$(CONFIG_SND_SOC_PM4125) += snd-soc-pm4125-sdw.o +endif obj-$(CONFIG_SND_SOC_RK3308) += snd-soc-rk3308.o obj-$(CONFIG_SND_SOC_RK3328) += snd-soc-rk3328.o obj-$(CONFIG_SND_SOC_RK817) += snd-soc-rk817.o diff --git a/sound/soc/codecs/pm4125-sdw.c b/sound/soc/codecs/pm4125-sdw.c new file mode 100644 index 00000000000000..4ed09fbe3f545a --- /dev/null +++ b/sound/soc/codecs/pm4125-sdw.c @@ -0,0 +1,545 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +// Copyright, 2025 Linaro Ltd + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pm4125.h" + +static struct pm4125_sdw_ch_info pm4125_sdw_rx_ch_info[] = { + WCD_SDW_CH(PM4125_HPH_L, PM4125_HPH_PORT, BIT(0)), + WCD_SDW_CH(PM4125_HPH_R, PM4125_HPH_PORT, BIT(1)), +}; + +static struct pm4125_sdw_ch_info pm4125_sdw_tx_ch_info[] = { + WCD_SDW_CH(PM4125_ADC1, PM4125_ADC_1_2_DMIC1L_BCS_PORT, BIT(0)), + WCD_SDW_CH(PM4125_ADC2, PM4125_ADC_1_2_DMIC1L_BCS_PORT, BIT(1)), +}; + +static struct sdw_dpn_prop pm4125_dpn_prop[PM4125_MAX_SWR_PORTS] = { + { + .num = 1, + .type = SDW_DPN_SIMPLE, + .min_ch = 1, + .max_ch = 8, + .simple_ch_prep_sm = true, + }, { + .num = 2, + .type = SDW_DPN_SIMPLE, + .min_ch = 1, + .max_ch = 4, + .simple_ch_prep_sm = true, + } +}; + +struct device *pm4125_sdw_device_get(struct device_node *np) +{ + return bus_find_device_by_of_node(&sdw_bus_type, np); +} +EXPORT_SYMBOL_GPL(pm4125_sdw_device_get); + +int pm4125_sdw_hw_params(struct pm4125_sdw_priv *priv, struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) +{ + struct sdw_port_config port_config[PM4125_MAX_SWR_PORTS]; + unsigned long ch_mask; + int i, j; + + priv->sconfig.ch_count = 1; + priv->active_ports = 0; + for (i = 0; i < PM4125_MAX_SWR_PORTS; i++) { + ch_mask = priv->port_config[i].ch_mask; + if (!ch_mask) + continue; + + for_each_set_bit(j, &ch_mask, 4) + priv->sconfig.ch_count++; + + port_config[priv->active_ports] = priv->port_config[i]; + priv->active_ports++; + } + + priv->sconfig.bps = 1; + priv->sconfig.frame_rate = params_rate(params); + priv->sconfig.direction = priv->is_tx ? 
SDW_DATA_DIR_TX : SDW_DATA_DIR_RX; + priv->sconfig.type = SDW_STREAM_PCM; + + return sdw_stream_add_slave(priv->sdev, &priv->sconfig, &port_config[0], priv->active_ports, + priv->sruntime); +} +EXPORT_SYMBOL_GPL(pm4125_sdw_hw_params); + +static int pm4125_update_status(struct sdw_slave *slave, enum sdw_slave_status status) +{ + struct pm4125_sdw_priv *priv = dev_get_drvdata(&slave->dev); + + if (priv->regmap && status == SDW_SLAVE_ATTACHED) { + /* Write out any cached changes that happened between probe and attach */ + regcache_cache_only(priv->regmap, false); + return regcache_sync(priv->regmap); + } + + return 0; +} + +/* + * Handle Soundwire out-of-band interrupt event by triggering the first irq of the slave_irq + * irq domain, which then will be handled by the regmap_irq threaded irq. + * Looping is to ensure no interrupts were missed in the process. + */ +static int pm4125_interrupt_callback(struct sdw_slave *slave, struct sdw_slave_intr_status *status) +{ + struct pm4125_sdw_priv *priv = dev_get_drvdata(&slave->dev); + struct irq_domain *slave_irq = priv->slave_irq; + u32 sts1, sts2, sts3; + + do { + handle_nested_irq(irq_find_mapping(slave_irq, 0)); + regmap_read(priv->regmap, PM4125_DIG_SWR_INTR_STATUS_0, &sts1); + regmap_read(priv->regmap, PM4125_DIG_SWR_INTR_STATUS_1, &sts2); + regmap_read(priv->regmap, PM4125_DIG_SWR_INTR_STATUS_2, &sts3); + + } while (sts1 || sts2 || sts3); + + return IRQ_HANDLED; +} + +static const struct reg_default pm4125_defaults[] = { + { PM4125_ANA_MICBIAS_MICB_1_2_EN, 0x01 }, + { PM4125_ANA_MICBIAS_MICB_3_EN, 0x00 }, + { PM4125_ANA_MICBIAS_LDO_1_SETTING, 0x21 }, + { PM4125_ANA_MICBIAS_LDO_1_CTRL, 0x01 }, + { PM4125_ANA_TX_AMIC1, 0x00 }, + { PM4125_ANA_TX_AMIC2, 0x00 }, + { PM4125_ANA_MBHC_MECH, 0x39 }, + { PM4125_ANA_MBHC_ELECT, 0x08 }, + { PM4125_ANA_MBHC_ZDET, 0x10 }, + { PM4125_ANA_MBHC_RESULT_1, 0x00 }, + { PM4125_ANA_MBHC_RESULT_2, 0x00 }, + { PM4125_ANA_MBHC_RESULT_3, 0x00 }, + { PM4125_ANA_MBHC_BTN0_ZDET_VREF1, 0x00 }, + { PM4125_ANA_MBHC_BTN1_ZDET_VREF2, 0x10 }, + { PM4125_ANA_MBHC_BTN2_ZDET_VREF3, 0x20 }, + { PM4125_ANA_MBHC_BTN3_ZDET_DBG_400, 0x30 }, + { PM4125_ANA_MBHC_BTN4_ZDET_DBG_1400, 0x40 }, + { PM4125_ANA_MBHC_MICB2_RAMP, 0x00 }, + { PM4125_ANA_MBHC_CTL_1, 0x02 }, + { PM4125_ANA_MBHC_CTL_2, 0x05 }, + { PM4125_ANA_MBHC_PLUG_DETECT_CTL, 0xE9 }, + { PM4125_ANA_MBHC_ZDET_ANA_CTL, 0x0F }, + { PM4125_ANA_MBHC_ZDET_RAMP_CTL, 0x00 }, + { PM4125_ANA_MBHC_FSM_STATUS, 0x00 }, + { PM4125_ANA_MBHC_ADC_RESULT, 0x00 }, + { PM4125_ANA_MBHC_CTL_CLK, 0x30 }, + { PM4125_ANA_MBHC_ZDET_CALIB_RESULT, 0x00 }, + { PM4125_ANA_NCP_EN, 0x00 }, + { PM4125_ANA_NCP_VCTRL, 0xA7 }, + { PM4125_ANA_HPHPA_CNP_CTL_1, 0x54 }, + { PM4125_ANA_HPHPA_CNP_CTL_2, 0x2B }, + { PM4125_ANA_HPHPA_PA_STATUS, 0x00 }, + { PM4125_ANA_HPHPA_FSM_CLK, 0x12 }, + { PM4125_ANA_HPHPA_L_GAIN, 0x00 }, + { PM4125_ANA_HPHPA_R_GAIN, 0x00 }, + { PM4125_SWR_HPHPA_HD2, 0x1B }, + { PM4125_ANA_HPHPA_SPARE_CTL, 0x02 }, + { PM4125_ANA_SURGE_EN, 0x38 }, + { PM4125_ANA_COMBOPA_CTL, 0x35 }, + { PM4125_ANA_COMBOPA_CTL_4, 0x84 }, + { PM4125_ANA_COMBOPA_CTL_5, 0x05 }, + { PM4125_ANA_RXLDO_CTL, 0x86 }, + { PM4125_ANA_MBIAS_EN, 0x00 }, + { PM4125_DIG_SWR_CHIP_ID0, 0x00 }, + { PM4125_DIG_SWR_CHIP_ID1, 0x00 }, + { PM4125_DIG_SWR_CHIP_ID2, 0x0C }, + { PM4125_DIG_SWR_CHIP_ID3, 0x01 }, + { PM4125_DIG_SWR_SWR_TX_CLK_RATE, 0x00 }, + { PM4125_DIG_SWR_CDC_RST_CTL, 0x03 }, + { PM4125_DIG_SWR_TOP_CLK_CFG, 0x00 }, + { PM4125_DIG_SWR_CDC_RX_CLK_CTL, 0x00 }, + { PM4125_DIG_SWR_CDC_TX_CLK_CTL, 0x33 }, + { 
PM4125_DIG_SWR_SWR_RST_EN, 0x00 }, + { PM4125_DIG_SWR_CDC_RX_RST, 0x00 }, + { PM4125_DIG_SWR_CDC_RX0_CTL, 0xFC }, + { PM4125_DIG_SWR_CDC_RX1_CTL, 0xFC }, + { PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1, 0x00 }, + { PM4125_DIG_SWR_CDC_COMP_CTL_0, 0x00 }, + { PM4125_DIG_SWR_CDC_RX_DELAY_CTL, 0x66 }, + { PM4125_DIG_SWR_CDC_RX_GAIN_0, 0x55 }, + { PM4125_DIG_SWR_CDC_RX_GAIN_1, 0xA9 }, + { PM4125_DIG_SWR_CDC_RX_GAIN_CTL, 0x00 }, + { PM4125_DIG_SWR_CDC_TX0_CTL, 0x68 }, + { PM4125_DIG_SWR_CDC_TX1_CTL, 0x68 }, + { PM4125_DIG_SWR_CDC_TX_RST, 0x00 }, + { PM4125_DIG_SWR_CDC_REQ0_CTL, 0x01 }, + { PM4125_DIG_SWR_CDC_REQ1_CTL, 0x01 }, + { PM4125_DIG_SWR_CDC_RST, 0x00 }, + { PM4125_DIG_SWR_CDC_AMIC_CTL, 0x02 }, + { PM4125_DIG_SWR_CDC_DMIC_CTL, 0x00 }, + { PM4125_DIG_SWR_CDC_DMIC1_CTL, 0x00 }, + { PM4125_DIG_SWR_CDC_DMIC1_RATE, 0x01 }, + { PM4125_DIG_SWR_PDM_WD_CTL0, 0x00 }, + { PM4125_DIG_SWR_PDM_WD_CTL1, 0x00 }, + { PM4125_DIG_SWR_INTR_MODE, 0x00 }, + { PM4125_DIG_SWR_INTR_MASK_0, 0xFF }, + { PM4125_DIG_SWR_INTR_MASK_1, 0x7F }, + { PM4125_DIG_SWR_INTR_MASK_2, 0x0C }, + { PM4125_DIG_SWR_INTR_STATUS_0, 0x00 }, + { PM4125_DIG_SWR_INTR_STATUS_1, 0x00 }, + { PM4125_DIG_SWR_INTR_STATUS_2, 0x00 }, + { PM4125_DIG_SWR_INTR_CLEAR_0, 0x00 }, + { PM4125_DIG_SWR_INTR_CLEAR_1, 0x00 }, + { PM4125_DIG_SWR_INTR_CLEAR_2, 0x00 }, + { PM4125_DIG_SWR_INTR_LEVEL_0, 0x00 }, + { PM4125_DIG_SWR_INTR_LEVEL_1, 0x2A }, + { PM4125_DIG_SWR_INTR_LEVEL_2, 0x00 }, + { PM4125_DIG_SWR_CDC_CONN_RX0_CTL, 0x00 }, + { PM4125_DIG_SWR_CDC_CONN_RX1_CTL, 0x00 }, + { PM4125_DIG_SWR_LOOP_BACK_MODE, 0x00 }, + { PM4125_DIG_SWR_DRIVE_STRENGTH_0, 0x00 }, + { PM4125_DIG_SWR_DIG_DEBUG_CTL, 0x00 }, + { PM4125_DIG_SWR_DIG_DEBUG_EN, 0x00 }, + { PM4125_DIG_SWR_DEM_BYPASS_DATA0, 0x55 }, + { PM4125_DIG_SWR_DEM_BYPASS_DATA1, 0x55 }, + { PM4125_DIG_SWR_DEM_BYPASS_DATA2, 0x55 }, + { PM4125_DIG_SWR_DEM_BYPASS_DATA3, 0x01 }, +}; + +static bool pm4125_rdwr_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case PM4125_ANA_MICBIAS_MICB_1_2_EN: + case PM4125_ANA_MICBIAS_MICB_3_EN: + case PM4125_ANA_MICBIAS_LDO_1_SETTING: + case PM4125_ANA_MICBIAS_LDO_1_CTRL: + case PM4125_ANA_TX_AMIC1: + case PM4125_ANA_TX_AMIC2: + case PM4125_ANA_MBHC_MECH: + case PM4125_ANA_MBHC_ELECT: + case PM4125_ANA_MBHC_ZDET: + case PM4125_ANA_MBHC_BTN0_ZDET_VREF1: + case PM4125_ANA_MBHC_BTN1_ZDET_VREF2: + case PM4125_ANA_MBHC_BTN2_ZDET_VREF3: + case PM4125_ANA_MBHC_BTN3_ZDET_DBG_400: + case PM4125_ANA_MBHC_BTN4_ZDET_DBG_1400: + case PM4125_ANA_MBHC_MICB2_RAMP: + case PM4125_ANA_MBHC_CTL_1: + case PM4125_ANA_MBHC_CTL_2: + case PM4125_ANA_MBHC_PLUG_DETECT_CTL: + case PM4125_ANA_MBHC_ZDET_ANA_CTL: + case PM4125_ANA_MBHC_ZDET_RAMP_CTL: + case PM4125_ANA_MBHC_CTL_CLK: + case PM4125_ANA_NCP_EN: + case PM4125_ANA_NCP_VCTRL: + case PM4125_ANA_HPHPA_CNP_CTL_1: + case PM4125_ANA_HPHPA_CNP_CTL_2: + case PM4125_ANA_HPHPA_FSM_CLK: + case PM4125_ANA_HPHPA_L_GAIN: + case PM4125_ANA_HPHPA_R_GAIN: + case PM4125_ANA_HPHPA_SPARE_CTL: + case PM4125_SWR_HPHPA_HD2: + case PM4125_ANA_SURGE_EN: + case PM4125_ANA_COMBOPA_CTL: + case PM4125_ANA_COMBOPA_CTL_4: + case PM4125_ANA_COMBOPA_CTL_5: + case PM4125_ANA_RXLDO_CTL: + case PM4125_ANA_MBIAS_EN: + case PM4125_DIG_SWR_SWR_TX_CLK_RATE: + case PM4125_DIG_SWR_CDC_RST_CTL: + case PM4125_DIG_SWR_TOP_CLK_CFG: + case PM4125_DIG_SWR_CDC_RX_CLK_CTL: + case PM4125_DIG_SWR_CDC_TX_CLK_CTL: + case PM4125_DIG_SWR_SWR_RST_EN: + case PM4125_DIG_SWR_CDC_RX_RST: + case PM4125_DIG_SWR_CDC_RX0_CTL: + case PM4125_DIG_SWR_CDC_RX1_CTL: + case PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1: + 
case PM4125_DIG_SWR_CDC_COMP_CTL_0: + case PM4125_DIG_SWR_CDC_RX_DELAY_CTL: + case PM4125_DIG_SWR_CDC_RX_GAIN_0: + case PM4125_DIG_SWR_CDC_RX_GAIN_1: + case PM4125_DIG_SWR_CDC_RX_GAIN_CTL: + case PM4125_DIG_SWR_CDC_TX0_CTL: + case PM4125_DIG_SWR_CDC_TX1_CTL: + case PM4125_DIG_SWR_CDC_TX_RST: + case PM4125_DIG_SWR_CDC_REQ0_CTL: + case PM4125_DIG_SWR_CDC_REQ1_CTL: + case PM4125_DIG_SWR_CDC_RST: + case PM4125_DIG_SWR_CDC_AMIC_CTL: + case PM4125_DIG_SWR_CDC_DMIC_CTL: + case PM4125_DIG_SWR_CDC_DMIC1_CTL: + case PM4125_DIG_SWR_CDC_DMIC1_RATE: + case PM4125_DIG_SWR_PDM_WD_CTL0: + case PM4125_DIG_SWR_PDM_WD_CTL1: + case PM4125_DIG_SWR_INTR_MODE: + case PM4125_DIG_SWR_INTR_MASK_0: + case PM4125_DIG_SWR_INTR_MASK_1: + case PM4125_DIG_SWR_INTR_MASK_2: + case PM4125_DIG_SWR_INTR_CLEAR_0: + case PM4125_DIG_SWR_INTR_CLEAR_1: + case PM4125_DIG_SWR_INTR_CLEAR_2: + case PM4125_DIG_SWR_INTR_LEVEL_0: + case PM4125_DIG_SWR_INTR_LEVEL_1: + case PM4125_DIG_SWR_INTR_LEVEL_2: + case PM4125_DIG_SWR_CDC_CONN_RX0_CTL: + case PM4125_DIG_SWR_CDC_CONN_RX1_CTL: + case PM4125_DIG_SWR_LOOP_BACK_MODE: + case PM4125_DIG_SWR_DRIVE_STRENGTH_0: + case PM4125_DIG_SWR_DIG_DEBUG_CTL: + case PM4125_DIG_SWR_DIG_DEBUG_EN: + case PM4125_DIG_SWR_DEM_BYPASS_DATA0: + case PM4125_DIG_SWR_DEM_BYPASS_DATA1: + case PM4125_DIG_SWR_DEM_BYPASS_DATA2: + case PM4125_DIG_SWR_DEM_BYPASS_DATA3: + return true; + } + + return false; +} + +static bool pm4125_readable_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case PM4125_ANA_MBHC_RESULT_1: + case PM4125_ANA_MBHC_RESULT_2: + case PM4125_ANA_MBHC_RESULT_3: + case PM4125_ANA_MBHC_FSM_STATUS: + case PM4125_ANA_MBHC_ADC_RESULT: + case PM4125_ANA_MBHC_ZDET_CALIB_RESULT: + case PM4125_ANA_HPHPA_PA_STATUS: + case PM4125_DIG_SWR_CHIP_ID0: + case PM4125_DIG_SWR_CHIP_ID1: + case PM4125_DIG_SWR_CHIP_ID2: + case PM4125_DIG_SWR_CHIP_ID3: + case PM4125_DIG_SWR_INTR_STATUS_0: + case PM4125_DIG_SWR_INTR_STATUS_1: + case PM4125_DIG_SWR_INTR_STATUS_2: + return true; + } + return pm4125_rdwr_register(dev, reg); +} + +static bool pm4125_volatile_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case PM4125_ANA_MBHC_RESULT_1: + case PM4125_ANA_MBHC_RESULT_2: + case PM4125_ANA_MBHC_RESULT_3: + case PM4125_ANA_MBHC_FSM_STATUS: + case PM4125_ANA_MBHC_ADC_RESULT: + case PM4125_ANA_MBHC_ZDET_CALIB_RESULT: + case PM4125_ANA_HPHPA_PA_STATUS: + case PM4125_DIG_SWR_CHIP_ID0: + case PM4125_DIG_SWR_CHIP_ID1: + case PM4125_DIG_SWR_CHIP_ID2: + case PM4125_DIG_SWR_CHIP_ID3: + case PM4125_DIG_SWR_INTR_STATUS_0: + case PM4125_DIG_SWR_INTR_STATUS_1: + case PM4125_DIG_SWR_INTR_STATUS_2: + return true; + } + + return false; +} + +static const struct regmap_config pm4125_regmap_config = { + .name = "pm4125_csr", + .reg_bits = 32, + .val_bits = 8, + .cache_type = REGCACHE_MAPLE, + .reg_defaults = pm4125_defaults, + .num_reg_defaults = ARRAY_SIZE(pm4125_defaults), + .max_register = PM4125_MAX_REGISTER, + .readable_reg = pm4125_readable_register, + .writeable_reg = pm4125_rdwr_register, + .volatile_reg = pm4125_volatile_register, +}; + +static const struct sdw_slave_ops pm4125_slave_ops = { + .update_status = pm4125_update_status, + .interrupt_callback = pm4125_interrupt_callback, +}; + +static int pm4125_sdw_component_bind(struct device *dev, struct device *master, void *data) +{ + pm_runtime_set_autosuspend_delay(dev, 3000); + pm_runtime_use_autosuspend(dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + + return 0; +} + +static void pm4125_sdw_component_unbind(struct device *dev, struct 
device *master, void *data) +{ + pm_runtime_disable(dev); + pm_runtime_set_suspended(dev); + pm_runtime_dont_use_autosuspend(dev); +} + +static const struct component_ops pm4125_sdw_component_ops = { + .bind = pm4125_sdw_component_bind, + .unbind = pm4125_sdw_component_unbind, +}; + +static int pm4125_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) +{ + struct device *dev = &pdev->dev; + struct pm4125_sdw_priv *priv; + u8 master_ch_mask[PM4125_MAX_SWR_CH_IDS]; + int master_ch_mask_size = 0; + int ret, i; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + /* Port map index starts at 0, however the data port for this codec starts at index 1 */ + if (of_property_present(dev->of_node, "qcom,tx-port-mapping")) { + priv->is_tx = true; + ret = of_property_read_u32_array(dev->of_node, "qcom,tx-port-mapping", + &pdev->m_port_map[1], PM4125_MAX_TX_SWR_PORTS); + } else { + ret = of_property_read_u32_array(dev->of_node, "qcom,rx-port-mapping", + &pdev->m_port_map[1], PM4125_MAX_SWR_PORTS); + } + + if (ret < 0) + dev_info(dev, "Error getting static port mapping for %s (%d)\n", + priv->is_tx ? "TX" : "RX", ret); + + priv->sdev = pdev; + dev_set_drvdata(dev, priv); + + pdev->prop.scp_int1_mask = SDW_SCP_INT1_IMPL_DEF | + SDW_SCP_INT1_BUS_CLASH | + SDW_SCP_INT1_PARITY; + pdev->prop.lane_control_support = true; + pdev->prop.simple_clk_stop_capable = true; + + memset(master_ch_mask, 0, PM4125_MAX_SWR_CH_IDS); + + if (priv->is_tx) { + master_ch_mask_size = of_property_count_u8_elems(dev->of_node, + "qcom,tx-channel-mapping"); + + if (master_ch_mask_size) + ret = of_property_read_u8_array(dev->of_node, "qcom,tx-channel-mapping", + master_ch_mask, master_ch_mask_size); + } else { + master_ch_mask_size = of_property_count_u8_elems(dev->of_node, + "qcom,rx-channel-mapping"); + + if (master_ch_mask_size) + ret = of_property_read_u8_array(dev->of_node, "qcom,rx-channel-mapping", + master_ch_mask, master_ch_mask_size); + } + + if (ret < 0) + dev_info(dev, "Static channel mapping not specified, using device channel maps\n"); + + if (priv->is_tx) { + pdev->prop.source_ports = GENMASK(PM4125_MAX_TX_SWR_PORTS, 0); + pdev->prop.src_dpn_prop = pm4125_dpn_prop; + priv->ch_info = &pm4125_sdw_tx_ch_info[0]; + + for (i = 0; i < master_ch_mask_size; i++) + priv->ch_info[i].master_ch_mask = PM4125_SWRM_CH_MASK(master_ch_mask[i]); + + pdev->prop.wake_capable = true; + + priv->regmap = devm_regmap_init_sdw(pdev, &pm4125_regmap_config); + if (IS_ERR(priv->regmap)) + return dev_err_probe(dev, PTR_ERR(priv->regmap), "regmap init failed\n"); + + /* Start in cache-only until device is enumerated */ + regcache_cache_only(priv->regmap, true); + } else { + pdev->prop.sink_ports = GENMASK(PM4125_MAX_SWR_PORTS - 1, 0); + pdev->prop.sink_dpn_prop = pm4125_dpn_prop; + priv->ch_info = &pm4125_sdw_rx_ch_info[0]; + + for (i = 0; i < master_ch_mask_size; i++) + priv->ch_info[i].master_ch_mask = PM4125_SWRM_CH_MASK(master_ch_mask[i]); + } + + ret = component_add(dev, &pm4125_sdw_component_ops); + if (ret) + return ret; + + /* Set suspended until the aggregate device is bound */ + pm_runtime_set_suspended(dev); + + return 0; +} + +static int pm4125_remove(struct sdw_slave *pdev) +{ + struct device *dev = &pdev->dev; + + component_del(dev, &pm4125_sdw_component_ops); + + return 0; +} + +static const struct sdw_device_id pm4125_slave_id[] = { + SDW_SLAVE_ENTRY(0x0217, 0x10c, 0), /* Soundwire pm4125 RX/TX Device ID */ + { } +}; +MODULE_DEVICE_TABLE(sdw, pm4125_slave_id); + +static int __maybe_unused 
pm4125_sdw_runtime_suspend(struct device *dev) +{ + struct pm4125_sdw_priv *priv = dev_get_drvdata(dev); + + if (priv->regmap) { + regcache_cache_only(priv->regmap, true); + regcache_mark_dirty(priv->regmap); + } + + return 0; +} + +static int __maybe_unused pm4125_sdw_runtime_resume(struct device *dev) +{ + struct pm4125_sdw_priv *priv = dev_get_drvdata(dev); + + if (priv->regmap) { + regcache_cache_only(priv->regmap, false); + regcache_sync(priv->regmap); + } + + return 0; +} + +static const struct dev_pm_ops pm4125_sdw_pm_ops = { + SET_RUNTIME_PM_OPS(pm4125_sdw_runtime_suspend, pm4125_sdw_runtime_resume, NULL) +}; + +static struct sdw_driver pm4125_codec_driver = { + .probe = pm4125_probe, + .remove = pm4125_remove, + .ops = &pm4125_slave_ops, + .id_table = pm4125_slave_id, + .driver = { + .name = "pm4125-codec", + .pm = &pm4125_sdw_pm_ops, + } +}; +module_sdw_driver(pm4125_codec_driver); + +MODULE_DESCRIPTION("PM4125 SDW codec driver"); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/pm4125.c b/sound/soc/codecs/pm4125.c new file mode 100644 index 00000000000000..706fc668ffe2a3 --- /dev/null +++ b/sound/soc/codecs/pm4125.c @@ -0,0 +1,1780 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +// Copyright (c) 2025, Linaro Ltd + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pm4125.h" +#include "wcd-mbhc-v2.h" + +#define WCD_MBHC_HS_V_MAX 1600 +#define PM4125_MBHC_MAX_BUTTONS 8 + +#define PM4125_RATES (SNDRV_PCM_RATE_8000 | SNDRV_PCM_RATE_16000 |\ + SNDRV_PCM_RATE_32000 | SNDRV_PCM_RATE_48000 |\ + SNDRV_PCM_RATE_96000 | SNDRV_PCM_RATE_192000 |\ + SNDRV_PCM_RATE_384000) + +/* Fractional Rates */ +#define PM4125_FRAC_RATES (SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_88200 |\ + SNDRV_PCM_RATE_176400 | SNDRV_PCM_RATE_352800) + +#define PM4125_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE |\ + SNDRV_PCM_FMTBIT_S24_3LE | SNDRV_PCM_FMTBIT_S32_LE) + +/* Registers in SPMI addr space */ +#define PM4125_CODEC_RESET_REG 0xF3DB +#define PM4125_CODEC_OFF 0x1 +#define PM4125_CODEC_ON 0x0 +#define PM4125_CODEC_FOUNDRY_ID_REG 0x7 + +enum { + HPH_COMP_DELAY, + HPH_PA_DELAY, + AMIC2_BCS_ENABLE, +}; + +enum { + AIF1_PB = 0, + AIF1_CAP, + NUM_CODEC_DAIS, +}; + +struct pm4125_priv { + struct sdw_slave *tx_sdw_dev; + struct pm4125_sdw_priv *sdw_priv[NUM_CODEC_DAIS]; + struct device *txdev; + struct device *rxdev; + struct device_node *rxnode; + struct device_node *txnode; + struct regmap *regmap; + struct regmap *spmi_regmap; + /* mbhc module */ + struct wcd_mbhc *wcd_mbhc; + struct wcd_mbhc_config mbhc_cfg; + struct wcd_mbhc_intr intr_ids; + struct irq_domain *virq; + const struct regmap_irq_chip *pm4125_regmap_irq_chip; + struct regmap_irq_chip_data *irq_chip; + struct snd_soc_jack *jack; + unsigned long status_mask; + s32 micb_ref[PM4125_MAX_MICBIAS]; + s32 pullup_ref[PM4125_MAX_MICBIAS]; + u32 micb1_mv; + u32 micb2_mv; + u32 micb3_mv; + + int hphr_pdm_wd_int; + int hphl_pdm_wd_int; + bool comp1_enable; + bool comp2_enable; + + atomic_t gloal_mbias_cnt; +}; + +static const char * const pm4125_power_supplies[] = { + "vdd-io", "vdd-cp", "vdd-mic-bias", "vdd-pa-vpos", +}; + +static const DECLARE_TLV_DB_SCALE(line_gain, 0, 7, 1); +static const DECLARE_TLV_DB_SCALE(analog_gain, 0, 25, 1); + +static const struct wcd_mbhc_field pm4125_mbhc_fields[WCD_MBHC_REG_FUNC_MAX] = { + 
WCD_MBHC_FIELD(WCD_MBHC_L_DET_EN, PM4125_ANA_MBHC_MECH, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_GND_DET_EN, PM4125_ANA_MBHC_MECH, 0x40), + WCD_MBHC_FIELD(WCD_MBHC_MECH_DETECTION_TYPE, PM4125_ANA_MBHC_MECH, 0x20), + WCD_MBHC_FIELD(WCD_MBHC_MIC_CLAMP_CTL, PM4125_ANA_MBHC_PLUG_DETECT_CTL, 0x30), + WCD_MBHC_FIELD(WCD_MBHC_ELECT_DETECTION_TYPE, PM4125_ANA_MBHC_ELECT, 0x08), + WCD_MBHC_FIELD(WCD_MBHC_HS_L_DET_PULL_UP_CTRL, PM4125_ANA_MBHC_PLUG_DETECT_CTL, 0x1F), + WCD_MBHC_FIELD(WCD_MBHC_HS_L_DET_PULL_UP_COMP_CTRL, PM4125_ANA_MBHC_MECH, 0x04), + WCD_MBHC_FIELD(WCD_MBHC_HPHL_PLUG_TYPE, PM4125_ANA_MBHC_MECH, 0x10), + WCD_MBHC_FIELD(WCD_MBHC_GND_PLUG_TYPE, PM4125_ANA_MBHC_MECH, 0x08), + WCD_MBHC_FIELD(WCD_MBHC_SW_HPH_LP_100K_TO_GND, PM4125_ANA_MBHC_MECH, 0x01), + WCD_MBHC_FIELD(WCD_MBHC_ELECT_SCHMT_ISRC, PM4125_ANA_MBHC_ELECT, 0x06), + WCD_MBHC_FIELD(WCD_MBHC_FSM_EN, PM4125_ANA_MBHC_ELECT, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_INSREM_DBNC, PM4125_ANA_MBHC_PLUG_DETECT_CTL, 0x0F), + WCD_MBHC_FIELD(WCD_MBHC_BTN_DBNC, PM4125_ANA_MBHC_CTL_1, 0x03), + WCD_MBHC_FIELD(WCD_MBHC_HS_VREF, PM4125_ANA_MBHC_CTL_2, 0x03), + WCD_MBHC_FIELD(WCD_MBHC_HS_COMP_RESULT, PM4125_ANA_MBHC_RESULT_3, 0x08), + WCD_MBHC_FIELD(WCD_MBHC_IN2P_CLAMP_STATE, PM4125_ANA_MBHC_RESULT_3, 0x10), + WCD_MBHC_FIELD(WCD_MBHC_MIC_SCHMT_RESULT, PM4125_ANA_MBHC_RESULT_3, 0x20), + WCD_MBHC_FIELD(WCD_MBHC_HPHL_SCHMT_RESULT, PM4125_ANA_MBHC_RESULT_3, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_HPHR_SCHMT_RESULT, PM4125_ANA_MBHC_RESULT_3, 0x40), + WCD_MBHC_FIELD(WCD_MBHC_BTN_RESULT, PM4125_ANA_MBHC_RESULT_3, 0x07), + WCD_MBHC_FIELD(WCD_MBHC_BTN_ISRC_CTL, PM4125_ANA_MBHC_ELECT, 0x70), + WCD_MBHC_FIELD(WCD_MBHC_ELECT_RESULT, PM4125_ANA_MBHC_RESULT_3, 0xFF), + WCD_MBHC_FIELD(WCD_MBHC_MICB_CTRL, PM4125_ANA_MICBIAS_MICB_1_2_EN, 0xC0), + WCD_MBHC_FIELD(WCD_MBHC_HPHR_PA_EN, PM4125_ANA_HPHPA_CNP_CTL_2, 0x40), + WCD_MBHC_FIELD(WCD_MBHC_HPHL_PA_EN, PM4125_ANA_HPHPA_CNP_CTL_2, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_HPH_PA_EN, PM4125_ANA_HPHPA_CNP_CTL_2, 0xC0), + WCD_MBHC_FIELD(WCD_MBHC_SWCH_LEVEL_REMOVE, PM4125_ANA_MBHC_RESULT_3, 0x10), + WCD_MBHC_FIELD(WCD_MBHC_FSM_STATUS, PM4125_ANA_MBHC_FSM_STATUS, 0x01), + WCD_MBHC_FIELD(WCD_MBHC_MUX_CTL, PM4125_ANA_MBHC_CTL_2, 0x70), + WCD_MBHC_FIELD(WCD_MBHC_MOISTURE_STATUS, PM4125_ANA_MBHC_FSM_STATUS, 0x20), + WCD_MBHC_FIELD(WCD_MBHC_HPHL_OCP_DET_EN, PM4125_ANA_HPHPA_CNP_CTL_2, 0x01), + WCD_MBHC_FIELD(WCD_MBHC_HPHR_OCP_DET_EN, PM4125_ANA_HPHPA_CNP_CTL_2, 0x01), + WCD_MBHC_FIELD(WCD_MBHC_HPHL_OCP_STATUS, PM4125_DIG_SWR_INTR_STATUS_0, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_HPHR_OCP_STATUS, PM4125_DIG_SWR_INTR_STATUS_0, 0x20), + WCD_MBHC_FIELD(WCD_MBHC_ADC_EN, PM4125_ANA_MBHC_CTL_1, 0x08), + WCD_MBHC_FIELD(WCD_MBHC_ADC_COMPLETE, PM4125_ANA_MBHC_FSM_STATUS, 0x40), + WCD_MBHC_FIELD(WCD_MBHC_ADC_TIMEOUT, PM4125_ANA_MBHC_FSM_STATUS, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_ADC_RESULT, PM4125_ANA_MBHC_ADC_RESULT, 0xFF), + WCD_MBHC_FIELD(WCD_MBHC_MICB2_VOUT, PM4125_ANA_MICBIAS_LDO_1_SETTING, 0x3F), + WCD_MBHC_FIELD(WCD_MBHC_ADC_MODE, PM4125_ANA_MBHC_CTL_1, 0x10), + WCD_MBHC_FIELD(WCD_MBHC_DETECTION_DONE, PM4125_ANA_MBHC_CTL_1, 0x04), + WCD_MBHC_FIELD(WCD_MBHC_ELECT_ISRC_EN, PM4125_ANA_MBHC_ZDET, 0x02), +}; + +static const struct regmap_irq pm4125_irqs[PM4125_NUM_IRQS] = { + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_BUTTON_PRESS_DET, 0, BIT(0)), + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_BUTTON_RELEASE_DET, 0, BIT(1)), + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_ELECT_INS_REM_DET, 0, BIT(2)), + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_ELECT_INS_REM_LEG_DET, 0, BIT(3)), + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_SW_DET, 0, 
BIT(4)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHR_OCP_INT, 0, BIT(5)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHR_CNP_INT, 0, BIT(6)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHL_OCP_INT, 0, BIT(7)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHL_CNP_INT, 1, BIT(0)), + REGMAP_IRQ_REG(PM4125_IRQ_EAR_CNP_INT, 1, BIT(1)), + REGMAP_IRQ_REG(PM4125_IRQ_EAR_SCD_INT, 1, BIT(2)), + REGMAP_IRQ_REG(PM4125_IRQ_AUX_CNP_INT, 1, BIT(3)), + REGMAP_IRQ_REG(PM4125_IRQ_AUX_SCD_INT, 1, BIT(4)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHL_PDM_WD_INT, 1, BIT(5)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHR_PDM_WD_INT, 1, BIT(6)), + REGMAP_IRQ_REG(PM4125_IRQ_AUX_PDM_WD_INT, 1, BIT(7)), + REGMAP_IRQ_REG(PM4125_IRQ_LDORT_SCD_INT, 2, BIT(0)), + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_MOISTURE_INT, 2, BIT(1)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHL_SURGE_DET_INT, 2, BIT(2)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHR_SURGE_DET_INT, 2, BIT(3)), +}; + +static int pm4125_handle_post_irq(void *data) +{ + struct pm4125_priv *pm4125 = (struct pm4125_priv *)data; + + regmap_write(pm4125->regmap, PM4125_DIG_SWR_INTR_CLEAR_0, 0); + regmap_write(pm4125->regmap, PM4125_DIG_SWR_INTR_CLEAR_1, 0); + regmap_write(pm4125->regmap, PM4125_DIG_SWR_INTR_CLEAR_2, 0); + + return IRQ_HANDLED; +} + +static const u32 pm4125_config_regs[] = { + PM4125_DIG_SWR_INTR_LEVEL_0, +}; + +static struct regmap_irq_chip pm4125_regmap_irq_chip = { + .name = "pm4125", + .irqs = pm4125_irqs, + .num_irqs = ARRAY_SIZE(pm4125_irqs), + .num_regs = 3, + .status_base = PM4125_DIG_SWR_INTR_STATUS_0, + .mask_base = PM4125_DIG_SWR_INTR_MASK_0, + .ack_base = PM4125_DIG_SWR_INTR_CLEAR_0, + .use_ack = 1, + .clear_ack = 1, + .config_base = pm4125_config_regs, + .num_config_bases = ARRAY_SIZE(pm4125_config_regs), + .num_config_regs = 1, + .runtime_pm = true, + .handle_post_irq = pm4125_handle_post_irq, +}; + +static void pm4125_reset(struct pm4125_priv *pm4125) +{ + regmap_write(pm4125->spmi_regmap, PM4125_CODEC_RESET_REG, PM4125_CODEC_OFF); + usleep_range(20, 30); + regmap_write(pm4125->spmi_regmap, PM4125_CODEC_RESET_REG, PM4125_CODEC_ON); + usleep_range(5000, 5010); +} + +static void pm4125_io_init(struct regmap *regmap) +{ + /* Disable HPH OCP */ + regmap_update_bits(regmap, PM4125_ANA_HPHPA_CNP_CTL_2, + PM4125_ANA_HPHPA_CNP_OCP_EN_L_MASK | PM4125_ANA_HPHPA_CNP_OCP_EN_R_MASK, + PM4125_ANA_HPHPA_CNP_OCP_DISABLE); + + /* Enable surge protection */ + regmap_update_bits(regmap, PM4125_ANA_SURGE_EN, PM4125_ANA_SURGE_PROTECTION_HPHL_MASK, + FIELD_PREP(PM4125_ANA_SURGE_PROTECTION_HPHL_MASK, + PM4125_ANA_SURGE_PROTECTION_ENABLE)); + regmap_update_bits(regmap, PM4125_ANA_SURGE_EN, PM4125_ANA_SURGE_PROTECTION_HPHR_MASK, + FIELD_PREP(PM4125_ANA_SURGE_PROTECTION_HPHR_MASK, + PM4125_ANA_SURGE_PROTECTION_ENABLE)); + + /* Disable mic bias 2 pull down */ + regmap_update_bits(regmap, PM4125_ANA_MICBIAS_MICB_1_2_EN, + PM4125_ANA_MICBIAS_MICB2_PULL_DN_MASK, + FIELD_PREP(PM4125_ANA_MICBIAS_MICB2_PULL_DN_MASK, + PM4125_ANA_MICBIAS_MICB_PULL_DISABLE)); +} + +static int pm4125_global_mbias_disable(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + if (atomic_dec_and_test(&pm4125->gloal_mbias_cnt)) { + + snd_soc_component_write_field(component, PM4125_ANA_MBIAS_EN, + PM4125_ANA_MBIAS_EN_V2I_MASK, + PM4125_ANA_MBIAS_EN_DISABLE); + snd_soc_component_write_field(component, PM4125_ANA_MBIAS_EN, + PM4125_ANA_MBIAS_EN_GLOBAL_MASK, + PM4125_ANA_MBIAS_EN_DISABLE); + } + + return 0; +} + +static int pm4125_global_mbias_enable(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = 
snd_soc_component_get_drvdata(component); + + if (atomic_inc_return(&pm4125->gloal_mbias_cnt) == 1) { + snd_soc_component_write_field(component, PM4125_ANA_MBIAS_EN, + PM4125_ANA_MBIAS_EN_GLOBAL_MASK, + PM4125_ANA_MBIAS_EN_ENABLE); + snd_soc_component_write_field(component, PM4125_ANA_MBIAS_EN, + PM4125_ANA_MBIAS_EN_V2I_MASK, + PM4125_ANA_MBIAS_EN_ENABLE); + usleep_range(1000, 1100); + } + + return 0; +} + +static int pm4125_rx_clk_enable(struct snd_soc_component *component) +{ + pm4125_global_mbias_enable(component); + + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_ANA_RX_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_ENABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_ANA_RX_DIV2_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_ENABLE); + usleep_range(5000, 5100); + + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_FSM_CLK, + PM4125_ANA_HPHPA_FSM_DIV_RATIO_MASK, + PM4125_ANA_HPHPA_FSM_DIV_RATIO_68); + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_FSM_CLK, + PM4125_ANA_HPHPA_FSM_CLK_DIV_EN_MASK, + PM4125_ANA_HPHPA_FSM_CLK_DIV_ENABLE); + snd_soc_component_update_bits(component, PM4125_ANA_NCP_VCTRL, 0x07, 0x06); + snd_soc_component_write_field(component, PM4125_ANA_NCP_EN, + PM4125_ANA_NCP_ENABLE_MASK, + PM4125_ANA_NCP_ENABLE); + usleep_range(500, 510); + + return 0; +} + +static int pm4125_rx_clk_disable(struct snd_soc_component *component) +{ + + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_FSM_CLK, + PM4125_ANA_HPHPA_FSM_CLK_DIV_EN_MASK, + PM4125_ANA_HPHPA_FSM_CLK_DIV_DISABLE); + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_FSM_CLK, + PM4125_ANA_HPHPA_FSM_DIV_RATIO_MASK, + 0x00); + snd_soc_component_write_field(component, PM4125_ANA_NCP_EN, + PM4125_ANA_NCP_ENABLE_MASK, + PM4125_ANA_NCP_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_ANA_RX_DIV2_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_ANA_RX_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_DISABLE); + + pm4125_global_mbias_disable(component); + + return 0; +} + + +static int pm4125_codec_enable_rxclk(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, + int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + pm4125_rx_clk_enable(component); + break; + case SND_SOC_DAPM_POST_PMD: + pm4125_rx_clk_disable(component); + break; + } + + return 0; +} + +static int pm4125_codec_hphl_dac_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_CNP_CTL_1, + PM4125_ANA_HPHPA_CNP_CTL_1_EN_MASK, + PM4125_ANA_HPHPA_CNP_CTL_1_EN); + snd_soc_component_write_field(component, PM4125_SWR_HPHPA_HD2, + PM4125_SWR_HPHPA_HD2_LEFT_MASK, + PM4125_SWR_HPHPA_HD2_ENABLE); + break; + case SND_SOC_DAPM_POST_PMU: + if (pm4125->comp1_enable) { + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHL_EN_MASK, + PM4125_DIG_SWR_COMP_ENABLE); + + if (pm4125->comp2_enable) + snd_soc_component_write_field(component, + PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHR_EN_MASK, + 
PM4125_DIG_SWR_COMP_ENABLE); + /* + * 5ms sleep is required after COMP is enabled as per + * HW requirement + */ + usleep_range(5000, 5100); + } else { + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHL_EN_MASK, + PM4125_DIG_SWR_COMP_DISABLE); + } + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX0_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX0_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_ENABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX0_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_ENABLE); + break; + case SND_SOC_DAPM_POST_PMD: + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX0_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX0_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX0_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_ENABLE); + if (pm4125->comp1_enable) + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHL_EN_MASK, + PM4125_DIG_SWR_COMP_DISABLE); + break; + } + + return 0; +} + +static int pm4125_codec_hphr_dac_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_CNP_CTL_1, + PM4125_ANA_HPHPA_CNP_CTL_1_EN_MASK, + PM4125_ANA_HPHPA_CNP_CTL_1_EN); + snd_soc_component_write_field(component, PM4125_SWR_HPHPA_HD2, + PM4125_SWR_HPHPA_HD2_RIGHT_MASK, + PM4125_SWR_HPHPA_HD2_ENABLE); + break; + case SND_SOC_DAPM_POST_PMU: + if (pm4125->comp2_enable) { + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHR_EN_MASK, + PM4125_DIG_SWR_COMP_ENABLE); + if (pm4125->comp1_enable) + snd_soc_component_write_field(component, + PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHL_EN_MASK, + PM4125_DIG_SWR_COMP_ENABLE); + /* + * 5ms sleep is required after COMP is enabled + * as per HW requirement + */ + usleep_range(5000, 5100); + } else { + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHR_EN_MASK, + PM4125_DIG_SWR_COMP_DISABLE); + } + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX1_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX1_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_ENABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX1_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_ENABLE); + break; + case SND_SOC_DAPM_POST_PMD: + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX1_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX1_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX1_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_ENABLE); + if 
(pm4125->comp2_enable) + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHR_EN_MASK, + PM4125_DIG_SWR_COMP_DISABLE); + break; + } + + return 0; +} + +static int pm4125_codec_ear_lo_dac_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX0_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX0_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_ENABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX0_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_ENABLE); + break; + case SND_SOC_DAPM_POST_PMD: + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX0_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX0_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX0_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_ENABLE); + break; + } + + return 0; +} + + +static int pm4125_codec_enable_hphl_wdt_irq(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + switch (event) { + case SND_SOC_DAPM_POST_PMU: + usleep_range(5000, 5100); + enable_irq(pm4125->hphl_pdm_wd_int); + break; + case SND_SOC_DAPM_PRE_PMD: + disable_irq_nosync(pm4125->hphl_pdm_wd_int); + break; + } + + return 0; +} + +static int pm4125_codec_enable_hphr_wdt_irq(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + switch (event) { + case SND_SOC_DAPM_POST_PMU: + usleep_range(5000, 5100); + enable_irq(pm4125->hphr_pdm_wd_int); + break; + case SND_SOC_DAPM_PRE_PMD: + disable_irq_nosync(pm4125->hphr_pdm_wd_int); + break; + } + + return 0; +} + +static int pm4125_codec_enable_hphr_pa(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + usleep_range(200, 210); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL1, + PM4125_WDT_ENABLE_MASK, + (PM4125_WDT_ENABLE_RX1_M | PM4125_WDT_ENABLE_RX1_L)); + break; + case SND_SOC_DAPM_POST_PMD: + usleep_range(5000, 5100); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL1, + PM4125_WDT_ENABLE_MASK, 0x00); + break; + } + + return 0; +} + +static int pm4125_codec_enable_hphl_pa(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + usleep_range(200, 210); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, + (PM4125_WDT_ENABLE_RX0_M | PM4125_WDT_ENABLE_RX0_L)); + break; + case SND_SOC_DAPM_POST_PMD: + usleep_range(5000, 5100); + snd_soc_component_write_field(component, 
PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, 0x00); + break; + } + + return 0; +} + +static int pm4125_codec_enable_lo_pa(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_5, 0x04, 0x00); + usleep_range(1000, 1010); + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_4, 0x0F, 0x0F); + usleep_range(1000, 1010); + snd_soc_component_write_field(component, PM4125_ANA_COMBOPA_CTL, + PM4125_ANA_COMBO_PA_SELECT_MASK, + PM4125_ANA_COMBO_PA_SELECT_LO); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, + (PM4125_WDT_ENABLE_RX0_M | PM4125_WDT_ENABLE_RX0_L)); + break; + case SND_SOC_DAPM_POST_PMU: + usleep_range(5000, 5010); + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_4, 0x0F, 0x04); + break; + case SND_SOC_DAPM_POST_PMD: + usleep_range(2000, 2010); + snd_soc_component_write_field(component, PM4125_ANA_COMBOPA_CTL, + PM4125_ANA_COMBO_PA_SELECT_MASK, + PM4125_ANA_COMBO_PA_SELECT_EAR); + usleep_range(5000, 5100); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, 0x00); + break; + } + + return 0; +} + +static int pm4125_codec_enable_ear_pa(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_5, 0x04, 0x00); + usleep_range(1000, 1010); + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_4, 0x0F, 0x0F); + usleep_range(1000, 1010); + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL, + PM4125_ANA_COMBO_PA_SELECT_MASK, + PM4125_ANA_COMBO_PA_SELECT_EAR); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, + (PM4125_WDT_ENABLE_RX0_M | PM4125_WDT_ENABLE_RX0_L)); + break; + case SND_SOC_DAPM_POST_PMU: + usleep_range(5000, 5010); + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_4, 0x0F, 0x04); + break; + case SND_SOC_DAPM_POST_PMD: + usleep_range(5000, 5010); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, 0x00); + break; + } + + return 0; +} + +static int pm4125_get_micb_vout_ctl_val(struct device *dev, u32 micb_mv) +{ + if (micb_mv < 1600 || micb_mv > 2850) { + dev_err(dev, "%s: unsupported micbias voltage (%u mV)\n", __func__, micb_mv); + return -EINVAL; + } + + return (micb_mv - 1600) / 50; +} + +static int pm4125_codec_enable_adc(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + /* Enable BCS for Headset mic */ + if (w->shift == 1 && + !(snd_soc_component_read(component, PM4125_ANA_TX_AMIC2) & 0x10)) { + set_bit(AMIC2_BCS_ENABLE, &pm4125->status_mask); + } + pm4125_global_mbias_enable(component); + if (w->shift) + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1, + PM4125_DIG_SWR_TX_ANA_TXD1_MODE_MASK, + PM4125_DIG_SWR_TXD_MODE_NORMAL); + else + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1, + 
PM4125_DIG_SWR_TX_ANA_TXD0_MODE_MASK, + PM4125_DIG_SWR_TXD_MODE_NORMAL); + break; + case SND_SOC_DAPM_POST_PMD: + if (w->shift == 1 && test_bit(AMIC2_BCS_ENABLE, &pm4125->status_mask)) + clear_bit(AMIC2_BCS_ENABLE, &pm4125->status_mask); + + if (w->shift) + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1, + PM4125_DIG_SWR_TX_ANA_TXD1_MODE_MASK, + 0x00); + else + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1, + PM4125_DIG_SWR_TX_ANA_TXD0_MODE_MASK, + 0x00); + pm4125_global_mbias_disable(component); + break; + }; + + return 0; +} + +static int pm4125_codec_enable_dmic(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + u16 dmic_clk_reg = w->reg; + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_AMIC_CTL, + PM4125_DIG_SWR_AMIC_SELECT_MASK, + PM4125_DIG_SWR_AMIC_SELECT_DMIC1); + snd_soc_component_update_bits(component, dmic_clk_reg, + PM4125_DIG_SWR_DMIC1_CLK_EN_MASK, + PM4125_DIG_SWR_DMIC1_CLK_ENABLE); + break; + case SND_SOC_DAPM_POST_PMD: + snd_soc_component_update_bits(component, dmic_clk_reg, + PM4125_DIG_SWR_DMIC1_CLK_EN_MASK, + PM4125_DIG_SWR_DMIC1_CLK_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_AMIC_CTL, + PM4125_DIG_SWR_AMIC_SELECT_MASK, + PM4125_DIG_SWR_AMIC_SELECT_AMIC3); + break; + } + + return 0; +} + +static int pm4125_micbias_control(struct snd_soc_component *component, int micb_num, int req, + bool is_dapm) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + int micb_index = micb_num - 1; + u16 micb_reg; + u8 pullup_mask = 0, enable_mask = 0; + + if ((micb_index < 0) || (micb_index > PM4125_MAX_MICBIAS - 1)) { + dev_err(component->dev, "%s: Invalid micbias index, micb_ind:%d\n", + __func__, micb_index); + return -EINVAL; + } + switch (micb_num) { + case MIC_BIAS_1: + micb_reg = PM4125_ANA_MICBIAS_MICB_1_2_EN; + pullup_mask = PM4125_ANA_MICBIAS_MICB1_PULL_UP_MASK; + enable_mask = 0x40; + break; + case MIC_BIAS_2: + micb_reg = PM4125_ANA_MICBIAS_MICB_1_2_EN; + pullup_mask = PM4125_ANA_MICBIAS_MICB2_PULL_UP_MASK; + enable_mask = 0x04; + break; + case MIC_BIAS_3: + micb_reg = PM4125_ANA_MICBIAS_MICB_3_EN; + pullup_mask = 0x02; + break; + default: + dev_err(component->dev, "%s: Invalid micbias number: %d\n", + __func__, micb_num); + return -EINVAL; + }; + + switch (req) { + case MICB_PULLUP_ENABLE: + pm4125->pullup_ref[micb_index]++; + if ((pm4125->pullup_ref[micb_index] == 1) && + (pm4125->micb_ref[micb_index] == 0)) + snd_soc_component_update_bits(component, micb_reg, + pullup_mask, pullup_mask); + break; + case MICB_PULLUP_DISABLE: + if (pm4125->pullup_ref[micb_index] > 0) + pm4125->pullup_ref[micb_index]--; + if ((pm4125->pullup_ref[micb_index] == 0) && + (pm4125->micb_ref[micb_index] == 0)) + snd_soc_component_update_bits(component, micb_reg, + pullup_mask, 0x00); + break; + case MICB_ENABLE: + pm4125->micb_ref[micb_index]++; + if (pm4125->micb_ref[micb_index] == 1) { + pm4125_global_mbias_enable(component); + snd_soc_component_update_bits(component, micb_reg, + enable_mask, enable_mask); + } + break; + case MICB_DISABLE: + if (pm4125->micb_ref[micb_index] > 0) + pm4125->micb_ref[micb_index]--; + if ((pm4125->micb_ref[micb_index] == 0) && + (pm4125->pullup_ref[micb_index] > 0)) { + snd_soc_component_update_bits(component, micb_reg, + pullup_mask, pullup_mask); + snd_soc_component_update_bits(component, 
micb_reg, + enable_mask, 0x00); + pm4125_global_mbias_disable(component); + } else if ((pm4125->micb_ref[micb_index] == 0) && + (pm4125->pullup_ref[micb_index] == 0)) { + snd_soc_component_update_bits(component, micb_reg, + enable_mask, 0x00); + pm4125_global_mbias_disable(component); + } + break; + }; + + return 0; +} + +static int pm4125_codec_enable_micbias(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, + int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + int micb_num = w->shift; + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + if (micb_num == MIC_BIAS_3) + pm4125_micbias_control(component, micb_num, MICB_PULLUP_ENABLE, true); + else + pm4125_micbias_control(component, micb_num, MICB_ENABLE, true); + break; + case SND_SOC_DAPM_POST_PMU: + usleep_range(1000, 1100); + break; + case SND_SOC_DAPM_POST_PMD: + if (micb_num == MIC_BIAS_3) + pm4125_micbias_control(component, micb_num, MICB_PULLUP_DISABLE, true); + else + pm4125_micbias_control(component, micb_num, MICB_DISABLE, true); + break; + } + + return 0; +} + +static int pm4125_codec_enable_micbias_pullup(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + int micb_num = w->shift; + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + pm4125_micbias_control(component, micb_num, MICB_PULLUP_ENABLE, true); + break; + case SND_SOC_DAPM_POST_PMU: + usleep_range(1000, 1100); + break; + case SND_SOC_DAPM_POST_PMD: + pm4125_micbias_control(component, micb_num, MICB_PULLUP_DISABLE, true); + break; + } + + return 0; +} + +static int pm4125_connect_port(struct pm4125_sdw_priv *sdw_priv, u8 port_idx, u8 ch_id, bool enable) +{ + struct sdw_port_config *port_config = &sdw_priv->port_config[port_idx - 1]; + const struct pm4125_sdw_ch_info *ch_info = &sdw_priv->ch_info[ch_id]; + struct sdw_slave *sdev = sdw_priv->sdev; + u8 port_num = ch_info->port_num; + u8 ch_mask = ch_info->ch_mask; + u8 mstr_port_num, mstr_ch_mask; + + port_config->num = port_num; + + mstr_port_num = sdev->m_port_map[port_num]; + mstr_ch_mask = ch_info->master_ch_mask; + + if (enable) { + port_config->ch_mask |= ch_mask; + sdw_priv->master_channel_map[mstr_port_num] |= mstr_ch_mask; + } else { + port_config->ch_mask &= ~ch_mask; + sdw_priv->master_channel_map[mstr_port_num] &= ~mstr_ch_mask; + } + + return 0; +} + +static int pm4125_get_compander(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *component = snd_soc_kcontrol_component(kcontrol); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + struct soc_mixer_control *mc; + bool hphr; + + mc = (struct soc_mixer_control *)(kcontrol->private_value); + hphr = mc->shift; + + ucontrol->value.integer.value[0] = hphr ? 
pm4125->comp2_enable : pm4125->comp1_enable; + return 0; +} + +static int pm4125_set_compander(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *component = snd_soc_kcontrol_component(kcontrol); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + struct pm4125_sdw_priv *sdw_priv = pm4125->sdw_priv[AIF1_PB]; + int value = ucontrol->value.integer.value[0]; + struct soc_mixer_control *mc; + int portidx; + bool hphr; + + mc = (struct soc_mixer_control *)(kcontrol->private_value); + hphr = mc->shift; + + if (hphr) { + if (value == pm4125->comp2_enable) + return 0; + + pm4125->comp2_enable = value; + } else { + if (value == pm4125->comp1_enable) + return 0; + + pm4125->comp1_enable = value; + } + + portidx = sdw_priv->ch_info[mc->reg].port_num; + + pm4125_connect_port(sdw_priv, portidx, mc->reg, value ? true : false); + + return 1; +} + +static int pm4125_get_swr_port(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) +{ + struct soc_mixer_control *mixer = (struct soc_mixer_control *)kcontrol->private_value; + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(comp); + struct pm4125_sdw_priv *sdw_priv; + int dai_id = mixer->shift; + int ch_idx = mixer->reg; + int portidx; + + sdw_priv = pm4125->sdw_priv[dai_id]; + portidx = sdw_priv->ch_info[ch_idx].port_num; + + ucontrol->value.integer.value[0] = sdw_priv->port_enable[portidx]; + + return 0; +} + +static int pm4125_set_swr_port(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) +{ + struct soc_mixer_control *mixer = (struct soc_mixer_control *)kcontrol->private_value; + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(comp); + struct pm4125_sdw_priv *sdw_priv; + int dai_id = mixer->shift; + int ch_idx = mixer->reg; + int portidx; + bool enable; + + sdw_priv = pm4125->sdw_priv[dai_id]; + + portidx = sdw_priv->ch_info[ch_idx].port_num; + + enable = ucontrol->value.integer.value[0]; + + if (enable == sdw_priv->port_enable[portidx]) { + pm4125_connect_port(sdw_priv, portidx, ch_idx, enable); + return 0; + } + + sdw_priv->port_enable[portidx] = enable; + pm4125_connect_port(sdw_priv, portidx, ch_idx, enable); + + return 1; +} + +static void pm4125_mbhc_bias_control(struct snd_soc_component *component, bool enable) +{ + snd_soc_component_write_field(component, PM4125_ANA_MBHC_ELECT, + PM4125_ANA_MBHC_ELECT_BIAS_EN_MASK, + enable ? 
PM4125_ANA_MBHC_ELECT_BIAS_ENABLE : + PM4125_ANA_MBHC_ELECT_BIAS_DISABLE); +} + +static void pm4125_mbhc_program_btn_thr(struct snd_soc_component *component, + int *btn_low, int *btn_high, + int num_btn, bool is_micbias) +{ + int i, vth; + + if (num_btn > WCD_MBHC_DEF_BUTTONS) { + dev_err(component->dev, "%s: invalid number of buttons: %d\n", + __func__, num_btn); + return; + } + + for (i = 0; i < num_btn; i++) { + vth = ((btn_high[i] * 2) / 25) & 0x3F; + snd_soc_component_write_field(component, PM4125_ANA_MBHC_BTN0_ZDET_VREF1 + i, + PM4125_ANA_MBHC_BTN0_THRESHOLD_MASK, vth << 2); + } +} + +static const struct wcd_mbhc_cb mbhc_cb = { + .mbhc_bias = pm4125_mbhc_bias_control, + .set_btn_thr = pm4125_mbhc_program_btn_thr, +}; + +static int pm4125_mbhc_init(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + struct wcd_mbhc_intr *intr_ids = &pm4125->intr_ids; + + intr_ids->mbhc_sw_intr = regmap_irq_get_virq(pm4125->irq_chip, PM4125_IRQ_MBHC_SW_DET); + + intr_ids->mbhc_btn_press_intr = regmap_irq_get_virq(pm4125->irq_chip, + PM4125_IRQ_MBHC_BUTTON_PRESS_DET); + + intr_ids->mbhc_btn_release_intr = regmap_irq_get_virq(pm4125->irq_chip, + PM4125_IRQ_MBHC_BUTTON_RELEASE_DET); + + intr_ids->mbhc_hs_ins_intr = regmap_irq_get_virq(pm4125->irq_chip, + PM4125_IRQ_MBHC_ELECT_INS_REM_LEG_DET); + + intr_ids->mbhc_hs_rem_intr = regmap_irq_get_virq(pm4125->irq_chip, + PM4125_IRQ_MBHC_ELECT_INS_REM_DET); + + intr_ids->hph_left_ocp = regmap_irq_get_virq(pm4125->irq_chip, PM4125_IRQ_HPHL_OCP_INT); + + intr_ids->hph_right_ocp = regmap_irq_get_virq(pm4125->irq_chip, PM4125_IRQ_HPHR_OCP_INT); + + pm4125->wcd_mbhc = wcd_mbhc_init(component, &mbhc_cb, intr_ids, pm4125_mbhc_fields, false); + if (IS_ERR(pm4125->wcd_mbhc)) + return PTR_ERR(pm4125->wcd_mbhc); + + return 0; +} + +static void pm4125_mbhc_deinit(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + wcd_mbhc_deinit(pm4125->wcd_mbhc); +} + +static const struct snd_kcontrol_new pm4125_snd_controls[] = { + SOC_SINGLE_EXT("HPHL_COMP Switch", PM4125_COMP_L, 0, 1, 0, + pm4125_get_compander, pm4125_set_compander), + SOC_SINGLE_EXT("HPHR_COMP Switch", PM4125_COMP_R, 1, 1, 0, + pm4125_get_compander, pm4125_set_compander), + + SOC_SINGLE_TLV("HPHL Volume", PM4125_ANA_HPHPA_L_GAIN, 0, 20, 1, + line_gain), + SOC_SINGLE_TLV("HPHR Volume", PM4125_ANA_HPHPA_R_GAIN, 0, 20, 1, + line_gain), + SOC_SINGLE_TLV("ADC1 Volume", PM4125_ANA_TX_AMIC1, 0, 8, 0, + analog_gain), + SOC_SINGLE_TLV("ADC2 Volume", PM4125_ANA_TX_AMIC2, 0, 8, 0, + analog_gain), + + SOC_SINGLE_EXT("HPHL Switch", PM4125_HPH_L, 0, 1, 0, + pm4125_get_swr_port, pm4125_set_swr_port), + SOC_SINGLE_EXT("HPHR Switch", PM4125_HPH_R, 0, 1, 0, + pm4125_get_swr_port, pm4125_set_swr_port), + + SOC_SINGLE_EXT("ADC1 Switch", PM4125_ADC1, 1, 1, 0, + pm4125_get_swr_port, pm4125_set_swr_port), + SOC_SINGLE_EXT("ADC2 Switch", PM4125_ADC2, 1, 1, 0, + pm4125_get_swr_port, pm4125_set_swr_port), +}; + +static const struct snd_kcontrol_new adc1_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new adc2_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new dmic1_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new dmic2_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new ear_rdac_switch[] = { + 
SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new lo_rdac_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new hphl_rdac_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new hphr_rdac_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const char * const adc2_mux_text[] = { + "INP2", "INP3" +}; + +static const struct soc_enum adc2_enum = SOC_ENUM_SINGLE(PM4125_ANA_TX_AMIC2, 4, + ARRAY_SIZE(adc2_mux_text), adc2_mux_text); + +static const struct snd_kcontrol_new tx_adc2_mux = SOC_DAPM_ENUM("ADC2 MUX Mux", adc2_enum); + +static const struct snd_soc_dapm_widget pm4125_dapm_widgets[] = { + /* Input widgets */ + SND_SOC_DAPM_INPUT("AMIC1"), + SND_SOC_DAPM_INPUT("AMIC2"), + SND_SOC_DAPM_INPUT("AMIC3"), + SND_SOC_DAPM_INPUT("IN1_HPHL"), + SND_SOC_DAPM_INPUT("IN2_HPHR"), + + /* TX widgets */ + SND_SOC_DAPM_ADC_E("ADC1", NULL, SND_SOC_NOPM, 0, 0, pm4125_codec_enable_adc, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_ADC_E("ADC2", NULL, SND_SOC_NOPM, 1, 0, pm4125_codec_enable_adc, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + + SND_SOC_DAPM_MUX("ADC2 MUX", SND_SOC_NOPM, 0, 0, &tx_adc2_mux), + + /* TX mixers */ + SND_SOC_DAPM_MIXER("ADC1_MIXER", SND_SOC_NOPM, 0, 0, adc1_switch, ARRAY_SIZE(adc1_switch)), + SND_SOC_DAPM_MIXER("ADC2_MIXER", SND_SOC_NOPM, 1, 0, adc2_switch, ARRAY_SIZE(adc2_switch)), + + /* MIC_BIAS widgets */ + SND_SOC_DAPM_SUPPLY("MIC BIAS1", SND_SOC_NOPM, MIC_BIAS_1, 0, pm4125_codec_enable_micbias, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_SUPPLY("MIC BIAS2", SND_SOC_NOPM, MIC_BIAS_2, 0, pm4125_codec_enable_micbias, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_SUPPLY("MIC BIAS3", SND_SOC_NOPM, MIC_BIAS_3, 0, pm4125_codec_enable_micbias, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + + SND_SOC_DAPM_SUPPLY("PA_VPOS", SND_SOC_NOPM, 0, 0, NULL, 0), + + /* RX widgets */ + SND_SOC_DAPM_PGA_E("EAR PGA", PM4125_ANA_COMBOPA_CTL, 7, 0, NULL, 0, + pm4125_codec_enable_ear_pa, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | + SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_PGA_E("LO PGA", PM4125_ANA_COMBOPA_CTL, 7, 0, NULL, 0, + pm4125_codec_enable_lo_pa, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | + SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_PGA_E("HPHL PGA", PM4125_ANA_HPHPA_CNP_CTL_2, 7, 0, NULL, 0, + pm4125_codec_enable_hphl_pa, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_PGA_E("HPHR PGA", PM4125_ANA_HPHPA_CNP_CTL_2, 6, 0, NULL, 0, + pm4125_codec_enable_hphr_pa, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + + SND_SOC_DAPM_DAC_E("RDAC1", NULL, SND_SOC_NOPM, 0, 0, pm4125_codec_hphl_dac_event, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | + SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_DAC_E("RDAC2", NULL, SND_SOC_NOPM, 0, 0, pm4125_codec_hphr_dac_event, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | + SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_DAC_E("RDAC3", NULL, SND_SOC_NOPM, 0, 0, pm4125_codec_ear_lo_dac_event, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | + SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMD), + + + SND_SOC_DAPM_SUPPLY("HPHL_WDT_IRQ", SND_SOC_NOPM, 0, 0, pm4125_codec_enable_hphl_wdt_irq, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), + SND_SOC_DAPM_SUPPLY("HPHR_WDT_IRQ", SND_SOC_NOPM, 0, 0, 
pm4125_codec_enable_hphr_wdt_irq, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), + SND_SOC_DAPM_SUPPLY("RXCLK", SND_SOC_NOPM, 0, 0, pm4125_codec_enable_rxclk, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_MIXER_E("RX1", SND_SOC_NOPM, 0, 0, NULL, 0, NULL, 0), + SND_SOC_DAPM_MIXER_E("RX2", SND_SOC_NOPM, 0, 0, NULL, 0, NULL, 0), + + /* RX mixer widgets */ + SND_SOC_DAPM_MIXER("EAR_RDAC", SND_SOC_NOPM, 0, 0, ear_rdac_switch, + ARRAY_SIZE(ear_rdac_switch)), + SND_SOC_DAPM_MIXER("LO_RDAC", SND_SOC_NOPM, 0, 0, lo_rdac_switch, + ARRAY_SIZE(lo_rdac_switch)), + SND_SOC_DAPM_MIXER("HPHL_RDAC", SND_SOC_NOPM, 0, 0, hphl_rdac_switch, + ARRAY_SIZE(hphl_rdac_switch)), + SND_SOC_DAPM_MIXER("HPHR_RDAC", SND_SOC_NOPM, 0, 0, hphr_rdac_switch, + ARRAY_SIZE(hphr_rdac_switch)), + + /* TX output widgets */ + SND_SOC_DAPM_OUTPUT("ADC1_OUTPUT"), + SND_SOC_DAPM_OUTPUT("ADC2_OUTPUT"), + + /* RX output widgets */ + SND_SOC_DAPM_OUTPUT("EAR"), + SND_SOC_DAPM_OUTPUT("LO"), + SND_SOC_DAPM_OUTPUT("HPHL"), + SND_SOC_DAPM_OUTPUT("HPHR"), + + /* MIC_BIAS pull up widgets */ + SND_SOC_DAPM_SUPPLY("VA MIC BIAS1", SND_SOC_NOPM, MIC_BIAS_1, 0, + pm4125_codec_enable_micbias_pullup, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_SUPPLY("VA MIC BIAS2", SND_SOC_NOPM, MIC_BIAS_2, 0, + pm4125_codec_enable_micbias_pullup, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_SUPPLY("VA MIC BIAS3", SND_SOC_NOPM, MIC_BIAS_3, 0, + pm4125_codec_enable_micbias_pullup, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + + /* TX widgets */ + SND_SOC_DAPM_ADC_E("DMIC1", NULL, PM4125_DIG_SWR_CDC_DMIC1_CTL, 0, 0, + pm4125_codec_enable_dmic, SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_ADC_E("DMIC2", NULL, PM4125_DIG_SWR_CDC_DMIC1_CTL, 1, 0, + pm4125_codec_enable_dmic, SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + + /* TX mixer widgets */ + SND_SOC_DAPM_MIXER("DMIC1_MIXER", SND_SOC_NOPM, 0, 0, dmic1_switch, + ARRAY_SIZE(dmic1_switch)), + SND_SOC_DAPM_MIXER("DMIC2_MIXER", SND_SOC_NOPM, 1, 0, dmic2_switch, + ARRAY_SIZE(dmic2_switch)), + + /* Output widgets */ + SND_SOC_DAPM_OUTPUT("DMIC1_OUTPUT"), + SND_SOC_DAPM_OUTPUT("DMIC2_OUTPUT"), +}; + +static const struct snd_soc_dapm_route pm4125_audio_map[] = { + { "ADC1_OUTPUT", NULL, "ADC1_MIXER" }, + { "ADC1_MIXER", "Switch", "ADC1" }, + { "ADC1", NULL, "AMIC1" }, + + { "ADC2_OUTPUT", NULL, "ADC2_MIXER" }, + { "ADC2_MIXER", "Switch", "ADC2" }, + { "ADC2", NULL, "ADC2 MUX" }, + { "ADC2 MUX", "INP3", "AMIC3" }, + { "ADC2 MUX", "INP2", "AMIC2" }, + + { "IN1_HPHL", NULL, "PA_VPOS" }, + { "RX1", NULL, "IN1_HPHL" }, + { "RX1", NULL, "RXCLK" }, + { "RX1", NULL, "HPHL_WDT_IRQ" }, + { "RDAC1", NULL, "RX1" }, + { "HPHL_RDAC", "Switch", "RDAC1" }, + { "HPHL PGA", NULL, "HPHL_RDAC" }, + { "HPHL", NULL, "HPHL PGA" }, + + { "IN2_HPHR", NULL, "PA_VPOS" }, + { "RX2", NULL, "IN2_HPHR" }, + { "RX2", NULL, "RXCLK" }, + { "RX2", NULL, "HPHR_WDT_IRQ" }, + { "RDAC2", NULL, "RX2" }, + { "HPHR_RDAC", "Switch", "RDAC2" }, + { "HPHR PGA", NULL, "HPHR_RDAC" }, + { "HPHR", NULL, "HPHR PGA" }, + + { "RDAC3", NULL, "RX1" }, + { "EAR_RDAC", "Switch", "RDAC3" }, + { "EAR PGA", NULL, "EAR_RDAC" }, + { "EAR", NULL, "EAR PGA" }, + + { "LO_RDAC", "Switch", "RDAC3" }, + { "LO PGA", NULL, "LO_RDAC" }, + { "LO", NULL, "LO PGA" }, + + { "DMIC1_OUTPUT", NULL, "DMIC1_MIXER" }, + { "DMIC1_MIXER", "Switch", "DMIC1" }, + + { "DMIC2_OUTPUT", NULL, "DMIC2_MIXER" }, + { "DMIC2_MIXER", "Switch", "DMIC2" }, +}; + 
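A quick illustration of the micbias programming that follows: pm4125_get_micb_vout_ctl_val() (defined earlier in this file) maps a requested voltage in the 1600-2850 mV range onto a 50 mV-per-step control value, which pm4125_set_micbias_data() below then shifts into the MICB_OUT_VAL field. A minimal standalone sketch of that scaling, in plain C with a hypothetical helper name, not part of the driver itself:

#include <stdio.h>

/* Same arithmetic as pm4125_get_micb_vout_ctl_val(): 1600 mV -> 0, one step per 50 mV */
static int micb_vout_ctl(unsigned int micb_mv)
{
	if (micb_mv < 1600 || micb_mv > 2850)
		return -1; /* the driver returns -EINVAL for out-of-range requests */
	return (micb_mv - 1600) / 50;
}

int main(void)
{
	unsigned int mv[] = { 1600, 2250, 2850 };
	int i;

	for (i = 0; i < 3; i++) /* fields 0x00, 0x0d, 0x19 */
		printf("%u mV -> field 0x%02x, register value 0x%02x\n",
		       mv[i], micb_vout_ctl(mv[i]), micb_vout_ctl(mv[i]) << 3);

	return 0;
}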
+static int pm4125_set_micbias_data(struct device *dev, struct pm4125_priv *pm4125) +{ + int vout_ctl; + + /* Set micbias voltage */ + vout_ctl = pm4125_get_micb_vout_ctl_val(dev, pm4125->micb1_mv); + if (vout_ctl < 0) + return -EINVAL; + + regmap_update_bits(pm4125->regmap, PM4125_ANA_MICBIAS_LDO_1_SETTING, + PM4125_ANA_MICBIAS_MICB_OUT_VAL_MASK, vout_ctl << 3); + return 0; +} + +static irqreturn_t pm4125_wd_handle_irq(int irq, void *data) +{ + /* + * HPHR/HPHL Watchdog interrupt threaded handler + * Watchdog interrupts are expected to be enabled when switching on the HPHL/R + * in order to make sure the interrupts are acked by the regmap_irq handler + * to allow PDM sync. We could leave those interrupts masked but we would + * not have any valid way to enable/disable them without violating irq layers. + * + * The HPHR/HPHL Watchdog interrupts are handled by regmap_irq, so requesting + * a threaded handler is the safest way to be able to ack those interrupts + * without colliding with the regmap_irq setup. + */ + return IRQ_HANDLED; +} + +static const struct irq_chip pm4125_codec_irq_chip = { + .name = "pm4125_codec", +}; + +static int pm4125_codec_irq_chip_map(struct irq_domain *irqd, unsigned int virq, + irq_hw_number_t hw) +{ + irq_set_chip_and_handler(virq, &pm4125_codec_irq_chip, handle_simple_irq); + irq_set_nested_thread(virq, 1); + irq_set_noprobe(virq); + + return 0; +} + +static const struct irq_domain_ops pm4125_domain_ops = { + .map = pm4125_codec_irq_chip_map, +}; + +static int pm4125_irq_init(struct pm4125_priv *pm4125, struct device *dev) +{ + pm4125->virq = irq_domain_add_linear(NULL, 1, &pm4125_domain_ops, NULL); + if (!(pm4125->virq)) { + dev_err(dev, "%s: Failed to add IRQ domain\n", __func__); + return -EINVAL; + } + + pm4125_regmap_irq_chip.irq_drv_data = pm4125; + + return devm_regmap_add_irq_chip(dev, pm4125->regmap, irq_create_mapping(pm4125->virq, 0), + IRQF_ONESHOT, 0, &pm4125_regmap_irq_chip, + &pm4125->irq_chip); +} + +static int pm4125_soc_codec_probe(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + struct sdw_slave *tx_sdw_dev = pm4125->tx_sdw_dev; + struct device *dev = component->dev; + unsigned long time_left; + int i, ret; + + time_left = wait_for_completion_timeout(&tx_sdw_dev->initialization_complete, + msecs_to_jiffies(5000)); + if (!time_left) { + dev_err(dev, "soundwire device init timeout\n"); + return -ETIMEDOUT; + } + + snd_soc_component_init_regmap(component, pm4125->regmap); + ret = pm_runtime_resume_and_get(dev); + if (ret < 0) + return ret; + + pm4125_io_init(pm4125->regmap); + + /* Set all interrupts as edge triggered */ + for (i = 0; i < pm4125_regmap_irq_chip.num_regs; i++) + regmap_write(pm4125->regmap, (PM4125_DIG_SWR_INTR_LEVEL_0 + i), 0); + + pm_runtime_put(dev); + + pm4125->hphr_pdm_wd_int = regmap_irq_get_virq(pm4125->irq_chip, PM4125_IRQ_HPHR_PDM_WD_INT); + pm4125->hphl_pdm_wd_int = regmap_irq_get_virq(pm4125->irq_chip, PM4125_IRQ_HPHL_PDM_WD_INT); + + /* Request the watchdog interrupts */ + ret = devm_request_threaded_irq(dev, pm4125->hphr_pdm_wd_int, NULL, pm4125_wd_handle_irq, + IRQF_ONESHOT | IRQF_TRIGGER_RISING, + "HPHR PDM WDOG INT", pm4125); + if (ret) + dev_err(dev, "Failed to request HPHR wdt interrupt: %d\n", ret); + + ret = devm_request_threaded_irq(dev, pm4125->hphl_pdm_wd_int, NULL, pm4125_wd_handle_irq, + IRQF_ONESHOT | IRQF_TRIGGER_RISING, + "HPHL PDM WDOG INT", pm4125); + if (ret) + dev_err(dev, "Failed to request HPHL wdt interrupt: %d\n", ret); + + 
disable_irq_nosync(pm4125->hphr_pdm_wd_int); + disable_irq_nosync(pm4125->hphl_pdm_wd_int); + + ret = pm4125_mbhc_init(component); + if (ret) + dev_err(component->dev, "mbhc initialization failed\n"); + + return ret; +} + +static void pm4125_soc_codec_remove(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + pm4125_mbhc_deinit(component); + free_irq(pm4125->hphl_pdm_wd_int, pm4125); + free_irq(pm4125->hphr_pdm_wd_int, pm4125); +} + +static int pm4125_codec_set_jack(struct snd_soc_component *comp, struct snd_soc_jack *jack, + void *data) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(comp->dev); + int ret = 0; + + if (jack) + ret = wcd_mbhc_start(pm4125->wcd_mbhc, &pm4125->mbhc_cfg, jack); + else + wcd_mbhc_stop(pm4125->wcd_mbhc); + + return ret; +} + +static const struct snd_soc_component_driver soc_codec_dev_pm4125 = { + .name = "pm4125_codec", + .probe = pm4125_soc_codec_probe, + .remove = pm4125_soc_codec_remove, + .controls = pm4125_snd_controls, + .num_controls = ARRAY_SIZE(pm4125_snd_controls), + .dapm_widgets = pm4125_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(pm4125_dapm_widgets), + .dapm_routes = pm4125_audio_map, + .num_dapm_routes = ARRAY_SIZE(pm4125_audio_map), + .set_jack = pm4125_codec_set_jack, + .endianness = 1, +}; + +static void pm4125_dt_parse_micbias_info(struct device *dev, struct pm4125_priv *priv) +{ + struct device_node *np = dev->of_node; + u32 prop_val = 0; + int ret; + + ret = of_property_read_u32(np, "qcom,micbias1-microvolt", &prop_val); + if (!ret) + priv->micb1_mv = prop_val / 1000; + else + dev_warn(dev, "Micbias1 DT property not found\n"); + + ret = of_property_read_u32(np, "qcom,micbias2-microvolt", &prop_val); + if (!ret) + priv->micb2_mv = prop_val / 1000; + else + dev_warn(dev, "Micbias2 DT property not found\n"); + + ret = of_property_read_u32(np, "qcom,micbias3-microvolt", &prop_val); + if (!ret) + priv->micb3_mv = prop_val / 1000; + else + dev_warn(dev, "Micbias3 DT property not found\n"); +} + +static int pm4125_codec_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *dai) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(dai->dev); + struct pm4125_sdw_priv *sdw_priv = pm4125->sdw_priv[dai->id]; + + return pm4125_sdw_hw_params(sdw_priv, substream, params, dai); +} + +static int pm4125_codec_free(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(dai->dev); + struct pm4125_sdw_priv *sdw_priv = pm4125->sdw_priv[dai->id]; + + return sdw_stream_remove_slave(sdw_priv->sdev, sdw_priv->sruntime); +} + +static int pm4125_codec_set_sdw_stream(struct snd_soc_dai *dai, void *stream, int direction) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(dai->dev); + struct pm4125_sdw_priv *sdw_priv = pm4125->sdw_priv[dai->id]; + + sdw_priv->sruntime = stream; + + return 0; +} + +static int pm4125_get_channel_map(const struct snd_soc_dai *dai, + unsigned int *tx_num, unsigned int *tx_slot, + unsigned int *rx_num, unsigned int *rx_slot) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(dai->dev); + struct pm4125_sdw_priv *sdw_priv = pm4125->sdw_priv[dai->id]; + int i; + + switch (dai->id) { + case AIF1_PB: + if (!rx_slot || !rx_num) { + dev_err(dai->dev, "Invalid rx_slot %p or rx_num %p\n", rx_slot, rx_num); + return -EINVAL; + } + + for (i = 0; i < SDW_MAX_PORTS; i++) + rx_slot[i] = sdw_priv->master_channel_map[i]; + + *rx_num = i; + break; + case AIF1_CAP: + if (!tx_slot || !tx_num) 
{ + dev_err(dai->dev, "Invalid tx_slot %p or tx_num %p\n", tx_slot, tx_num); + return -EINVAL; + } + + for (i = 0; i < SDW_MAX_PORTS; i++) + tx_slot[i] = sdw_priv->master_channel_map[i]; + + *tx_num = i; + break; + default: + break; + } + + return 0; +} + +static const struct snd_soc_dai_ops pm4125_sdw_dai_ops = { + .hw_params = pm4125_codec_hw_params, + .hw_free = pm4125_codec_free, + .set_stream = pm4125_codec_set_sdw_stream, + .get_channel_map = pm4125_get_channel_map, +}; + +static struct snd_soc_dai_driver pm4125_dais[] = { + [0] = { + .name = "pm4125-sdw-rx", + .playback = { + .stream_name = "PM4125 AIF Playback", + .rates = PM4125_RATES | PM4125_FRAC_RATES, + .formats = PM4125_FORMATS, + .rate_min = 8000, + .rate_max = 384000, + .channels_min = 1, + .channels_max = 4, + }, + .ops = &pm4125_sdw_dai_ops, + }, + [1] = { + .name = "pm4125-sdw-tx", + .capture = { + .stream_name = "PM4125 AIF Capture", + .rates = PM4125_RATES, + .formats = PM4125_FORMATS, + .rate_min = 8000, + .rate_max = 192000, + .channels_min = 1, + .channels_max = 4, + }, + .ops = &pm4125_sdw_dai_ops, + }, +}; + +static int pm4125_bind(struct device *dev) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(dev); + struct device_link *devlink; + int ret; + + /* Give the soundwire subdevices some more time to settle */ + usleep_range(15000, 15010); + + ret = component_bind_all(dev, pm4125); + if (ret) { + dev_err(dev, "Slave bind failed, ret = %d\n", ret); + return ret; + } + + pm4125->rxdev = pm4125_sdw_device_get(pm4125->rxnode); + if (!pm4125->rxdev) { + dev_err(dev, "could not find rxslave with matching of node\n"); + ret = -EINVAL; + goto error_unbind_all; + } + + pm4125->sdw_priv[AIF1_PB] = dev_get_drvdata(pm4125->rxdev); + pm4125->sdw_priv[AIF1_PB]->pm4125 = pm4125; + + pm4125->txdev = pm4125_sdw_device_get(pm4125->txnode); + if (!pm4125->txdev) { + dev_err(dev, "could not find txslave with matching of node\n"); + ret = -EINVAL; + goto error_unbind_all; + } + + pm4125->sdw_priv[AIF1_CAP] = dev_get_drvdata(pm4125->txdev); + pm4125->sdw_priv[AIF1_CAP]->pm4125 = pm4125; + + pm4125->tx_sdw_dev = dev_to_sdw_dev(pm4125->txdev); + if (!pm4125->tx_sdw_dev) { + dev_err(dev, "could not get txslave with matching of dev\n"); + ret = -EINVAL; + goto error_unbind_all; + } + + /* + * As TX is the main CSR reg interface, which should not be suspended first. 
+ * explicitly add the dependency link. + */ + devlink = device_link_add(pm4125->rxdev, pm4125->txdev, + DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME); + if (!devlink) { + dev_err(dev, "Could not devlink TX and RX\n"); + ret = -EINVAL; + goto error_unbind_all; + } + + devlink = device_link_add(dev, pm4125->txdev, + DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME); + if (!devlink) { + dev_err(dev, "Could not devlink PM4125 and TX\n"); + ret = -EINVAL; + goto link_remove_rx_tx; + } + + devlink = device_link_add(dev, pm4125->rxdev, + DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME); + if (!devlink) { + dev_err(dev, "Could not devlink PM4125 and RX\n"); + ret = -EINVAL; + goto link_remove_dev_tx; + } + + pm4125->regmap = dev_get_regmap(&pm4125->tx_sdw_dev->dev, NULL); + if (!pm4125->regmap) { + dev_err(dev, "could not get TX device regmap\n"); + ret = -EINVAL; + goto link_remove_dev_rx; + } + + ret = pm4125_irq_init(pm4125, dev); + if (ret) { + dev_err(dev, "IRQ init failed: %d\n", ret); + goto link_remove_dev_rx; + } + + pm4125->sdw_priv[AIF1_PB]->slave_irq = pm4125->virq; + pm4125->sdw_priv[AIF1_CAP]->slave_irq = pm4125->virq; + + ret = pm4125_set_micbias_data(dev, pm4125); + if (ret < 0) { + dev_err(dev, "Bad micbias pdata\n"); + goto link_remove_dev_rx; + } + + ret = snd_soc_register_component(dev, &soc_codec_dev_pm4125, + pm4125_dais, ARRAY_SIZE(pm4125_dais)); + if (!ret) + return ret; + + dev_err(dev, "Codec registration failed\n"); + +link_remove_dev_rx: + device_link_remove(dev, pm4125->rxdev); +link_remove_dev_tx: + device_link_remove(dev, pm4125->txdev); +link_remove_rx_tx: + device_link_remove(pm4125->rxdev, pm4125->txdev); +error_unbind_all: + component_unbind_all(dev, pm4125); + return ret; +} + +static void pm4125_unbind(struct device *dev) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(dev); + + snd_soc_unregister_component(dev); + device_link_remove(dev, pm4125->txdev); + device_link_remove(dev, pm4125->rxdev); + device_link_remove(pm4125->rxdev, pm4125->txdev); + component_unbind_all(dev, pm4125); +} + +static const struct component_master_ops pm4125_comp_ops = { + .bind = pm4125_bind, + .unbind = pm4125_unbind, +}; + +static int pm4125_add_slave_components(struct pm4125_priv *pm4125, struct device *dev, + struct component_match **matchptr) +{ + struct device_node *np = dev->of_node; + + pm4125->rxnode = of_parse_phandle(np, "qcom,rx-device", 0); + if (!pm4125->rxnode) + return dev_err_probe(dev, -ENODEV, "Couldn't parse phandle to qcom,rx-device\n"); + component_match_add_release(dev, matchptr, component_release_of, component_compare_of, + pm4125->rxnode); + + pm4125->txnode = of_parse_phandle(np, "qcom,tx-device", 0); + if (!pm4125->txnode) + return dev_err_probe(dev, -ENODEV, "Couldn't parse phandle to qcom,tx-device\n"); + component_match_add_release(dev, matchptr, component_release_of, component_compare_of, + pm4125->txnode); + + return 0; +} + +static int pm4125_probe(struct platform_device *pdev) +{ + struct component_match *match = NULL; + struct device *dev = &pdev->dev; + struct pm4125_priv *pm4125; + struct wcd_mbhc_config *cfg; + int ret; + + pm4125 = devm_kzalloc(dev, sizeof(*pm4125), GFP_KERNEL); + if (!pm4125) + return -ENOMEM; + + dev_set_drvdata(dev, pm4125); + + ret = devm_regulator_bulk_get_enable(dev, ARRAY_SIZE(pm4125_power_supplies), + pm4125_power_supplies); + if (ret) + return dev_err_probe(dev, ret, "Failed to get and enable supplies\n"); + + pm4125->spmi_regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!pm4125->spmi_regmap) + return -ENXIO; + + pm4125_reset(pm4125); +
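+ /* Micbias voltages are parsed from DT first; micb2_mv seeds the MBHC config defaults set up below. */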
+ pm4125_dt_parse_micbias_info(dev, pm4125); + atomic_set(&pm4125->gloal_mbias_cnt, 0); + + cfg = &pm4125->mbhc_cfg; + cfg->mbhc_micbias = MIC_BIAS_2; + cfg->anc_micbias = MIC_BIAS_2; + cfg->v_hs_max = WCD_MBHC_HS_V_MAX; + cfg->num_btn = PM4125_MBHC_MAX_BUTTONS; + cfg->micb_mv = pm4125->micb2_mv; + cfg->linein_th = 5000; + cfg->hs_thr = 1700; + cfg->hph_thr = 50; + + wcd_dt_parse_mbhc_data(dev, &pm4125->mbhc_cfg); + + ret = pm4125_add_slave_components(pm4125, dev, &match); + if (ret) + return ret; + + ret = component_master_add_with_match(dev, &pm4125_comp_ops, match); + if (ret) + return ret; + + pm_runtime_set_autosuspend_delay(dev, 1000); + pm_runtime_use_autosuspend(dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + pm_runtime_idle(dev); + + return 0; +} + +static void pm4125_remove(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + + component_master_del(&pdev->dev, &pm4125_comp_ops); + + pm_runtime_disable(dev); + pm_runtime_set_suspended(dev); + pm_runtime_dont_use_autosuspend(dev); +} + +static const struct of_device_id pm4125_of_match[] = { + { .compatible = "qcom,pm4125-codec" }, + { } +}; +MODULE_DEVICE_TABLE(of, pm4125_of_match); + +static struct platform_driver pm4125_codec_driver = { + .probe = pm4125_probe, + .remove = pm4125_remove, + .driver = { + .name = "pm4125_codec", + .of_match_table = pm4125_of_match, + .suppress_bind_attrs = true, + }, +}; + +module_platform_driver(pm4125_codec_driver); +MODULE_DESCRIPTION("PM4125 audio codec driver"); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/pm4125.h b/sound/soc/codecs/pm4125.h new file mode 100644 index 00000000000000..3520c711b744a9 --- /dev/null +++ b/sound/soc/codecs/pm4125.h @@ -0,0 +1,307 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ */ + +#ifndef _PM4125_REGISTERS_H +#define _PM4125_REGISTERS_H + +#include +#include + +#define PM4125_ANA_BASE_ADDR 0x3000 +#define PM4125_DIG_BASE_ADDR 0x3400 + +#define PM4125_ANA_MICBIAS_MICB_1_2_EN (PM4125_ANA_BASE_ADDR+0x040) +#define PM4125_ANA_MICBIAS_MICB1_PULL_UP_MASK BIT(5) +#define PM4125_ANA_MICBIAS_MICB2_PULL_UP_MASK BIT(1) +#define PM4125_ANA_MICBIAS_MICB2_PULL_DN_MASK BIT(0) +#define PM4125_ANA_MICBIAS_MICB_PULL_ENABLE 1 +#define PM4125_ANA_MICBIAS_MICB_PULL_DISABLE 0 +#define PM4125_ANA_MICBIAS_MICB_3_EN (PM4125_ANA_BASE_ADDR+0x041) +#define PM4125_ANA_MICBIAS_LDO_1_SETTING (PM4125_ANA_BASE_ADDR+0x042) +#define PM4125_ANA_MICBIAS_MICB_OUT_VAL_MASK GENMASK(7, 3) +#define PM4125_ANA_MICBIAS_LDO_1_CTRL (PM4125_ANA_BASE_ADDR+0x043) +#define PM4125_ANA_TX_AMIC1 (PM4125_ANA_BASE_ADDR+0x047) +#define PM4125_ANA_TX_AMIC2 (PM4125_ANA_BASE_ADDR+0x048) +#define PM4125_ANA_MBHC_MECH (PM4125_ANA_BASE_ADDR+0x05A) +#define PM4125_ANA_MBHC_ELECT (PM4125_ANA_BASE_ADDR+0x05B) +#define PM4125_ANA_MBHC_ELECT_BIAS_EN_MASK BIT(0) +#define PM4125_ANA_MBHC_ELECT_BIAS_ENABLE 1 +#define PM4125_ANA_MBHC_ELECT_BIAS_DISABLE 0 +#define PM4125_ANA_MBHC_ZDET (PM4125_ANA_BASE_ADDR+0x05C) +#define PM4125_ANA_MBHC_RESULT_1 (PM4125_ANA_BASE_ADDR+0x05D) +#define PM4125_ANA_MBHC_RESULT_2 (PM4125_ANA_BASE_ADDR+0x05E) +#define PM4125_ANA_MBHC_RESULT_3 (PM4125_ANA_BASE_ADDR+0x05F) +#define PM4125_ANA_MBHC_BTN0_ZDET_VREF1 (PM4125_ANA_BASE_ADDR+0x060) +#define PM4125_ANA_MBHC_BTN0_THRESHOLD_MASK GENMASK(7, 2) +#define PM4125_ANA_MBHC_BTN1_ZDET_VREF2 (PM4125_ANA_BASE_ADDR+0x061) +#define PM4125_ANA_MBHC_BTN2_ZDET_VREF3 (PM4125_ANA_BASE_ADDR+0x062) +#define PM4125_ANA_MBHC_BTN3_ZDET_DBG_400 (PM4125_ANA_BASE_ADDR+0x063) +#define PM4125_ANA_MBHC_BTN4_ZDET_DBG_1400 (PM4125_ANA_BASE_ADDR+0x064) +#define PM4125_ANA_MBHC_MICB2_RAMP (PM4125_ANA_BASE_ADDR+0x065) +#define PM4125_ANA_MBHC_CTL_1 (PM4125_ANA_BASE_ADDR+0x066) +#define PM4125_ANA_MBHC_CTL_2 (PM4125_ANA_BASE_ADDR+0x067) +#define PM4125_ANA_MBHC_PLUG_DETECT_CTL (PM4125_ANA_BASE_ADDR+0x068) +#define PM4125_ANA_MBHC_ZDET_ANA_CTL (PM4125_ANA_BASE_ADDR+0x069) +#define PM4125_ANA_MBHC_ZDET_RAMP_CTL (PM4125_ANA_BASE_ADDR+0x06A) +#define PM4125_ANA_MBHC_FSM_STATUS (PM4125_ANA_BASE_ADDR+0x06B) +#define PM4125_ANA_MBHC_ADC_RESULT (PM4125_ANA_BASE_ADDR+0x06C) +#define PM4125_ANA_MBHC_CTL_CLK (PM4125_ANA_BASE_ADDR+0x06D) +#define PM4125_ANA_MBHC_ZDET_CALIB_RESULT (PM4125_ANA_BASE_ADDR+0x072) +#define PM4125_ANA_NCP_EN (PM4125_ANA_BASE_ADDR+0x077) +#define PM4125_ANA_NCP_ENABLE_MASK BIT(0) +#define PM4125_ANA_NCP_ENABLE 1 +#define PM4125_ANA_NCP_DISABLE 0 +#define PM4125_ANA_NCP_VCTRL (PM4125_ANA_BASE_ADDR+0x07C) +#define PM4125_ANA_HPHPA_CNP_CTL_1 (PM4125_ANA_BASE_ADDR+0x083) +#define PM4125_ANA_HPHPA_CNP_CTL_1_EN_MASK BIT(1) +#define PM4125_ANA_HPHPA_CNP_CTL_1_EN 1 +#define PM4125_ANA_HPHPA_CNP_CTL_2 (PM4125_ANA_BASE_ADDR+0x084) +#define PM4125_ANA_HPHPA_CNP_OCP_EN_L_MASK BIT(1) +#define PM4125_ANA_HPHPA_CNP_OCP_EN_R_MASK BIT(0) +#define PM4125_ANA_HPHPA_CNP_OCP_ENABLE 1 +#define PM4125_ANA_HPHPA_CNP_OCP_DISABLE 0 +#define PM4125_ANA_HPHPA_PA_STATUS (PM4125_ANA_BASE_ADDR+0x087) +#define PM4125_ANA_HPHPA_FSM_CLK (PM4125_ANA_BASE_ADDR+0x088) +#define PM4125_ANA_HPHPA_FSM_CLK_DIV_EN_MASK BIT(7) +#define PM4125_ANA_HPHPA_FSM_CLK_DIV_ENABLE 1 +#define PM4125_ANA_HPHPA_FSM_CLK_DIV_DISABLE 0 +#define PM4125_ANA_HPHPA_FSM_DIV_RATIO_MASK GENMASK(6, 0) +#define PM4125_ANA_HPHPA_FSM_DIV_RATIO_68 (0x11) +#define PM4125_ANA_HPHPA_L_GAIN (PM4125_ANA_BASE_ADDR+0x08B) +#define 
PM4125_ANA_HPHPA_R_GAIN (PM4125_ANA_BASE_ADDR+0x08C) +#define PM4125_ANA_HPHPA_SPARE_CTL (PM4125_ANA_BASE_ADDR+0x08E) +#define PM4125_SWR_HPHPA_HD2 (PM4125_ANA_BASE_ADDR+0x090) +#define PM4125_SWR_HPHPA_HD2_LEFT_MASK GENMASK(5, 3) +#define PM4125_SWR_HPHPA_HD2_RIGHT_MASK GENMASK(2, 0) +#define PM4125_SWR_HPHPA_HD2_ENABLE (BIT(2) | BIT(1) | BIT(0)) +#define PM4125_ANA_SURGE_EN (PM4125_ANA_BASE_ADDR+0x097) +#define PM4125_ANA_SURGE_PROTECTION_HPHL_MASK BIT(7) +#define PM4125_ANA_SURGE_PROTECTION_HPHR_MASK BIT(6) +#define PM4125_ANA_SURGE_PROTECTION_ENABLE 1 +#define PM4125_ANA_SURGE_PROTECTION_DISABLE 0 +#define PM4125_ANA_COMBOPA_CTL (PM4125_ANA_BASE_ADDR+0x09B) +#define PM4125_ANA_COMBO_PA_SELECT_MASK BIT(6) +#define PM4125_ANA_COMBO_PA_SELECT_EAR 0 +#define PM4125_ANA_COMBO_PA_SELECT_LO 1 +#define PM4125_ANA_COMBOPA_CTL_4 (PM4125_ANA_BASE_ADDR+0x09F) +#define PM4125_ANA_COMBOPA_CTL_5 (PM4125_ANA_BASE_ADDR+0x0A0) +#define PM4125_ANA_RXLDO_CTL (PM4125_ANA_BASE_ADDR+0x0B2) +#define PM4125_ANA_MBIAS_EN (PM4125_ANA_BASE_ADDR+0x0B4) +#define PM4125_ANA_MBIAS_EN_GLOBAL_MASK BIT(5) +#define PM4125_ANA_MBIAS_EN_V2I_MASK BIT(4) +#define PM4125_ANA_MBIAS_EN_ENABLE 1 +#define PM4125_ANA_MBIAS_EN_DISABLE 0 + +#define PM4125_DIG_SWR_CHIP_ID0 (PM4125_DIG_BASE_ADDR+0x001) +#define PM4125_DIG_SWR_CHIP_ID1 (PM4125_DIG_BASE_ADDR+0x002) +#define PM4125_DIG_SWR_CHIP_ID2 (PM4125_DIG_BASE_ADDR+0x003) +#define PM4125_DIG_SWR_CHIP_ID3 (PM4125_DIG_BASE_ADDR+0x004) +#define PM4125_DIG_SWR_SWR_TX_CLK_RATE (PM4125_DIG_BASE_ADDR+0x040) +#define PM4125_DIG_SWR_CDC_RST_CTL (PM4125_DIG_BASE_ADDR+0x041) +#define PM4125_DIG_SWR_TOP_CLK_CFG (PM4125_DIG_BASE_ADDR+0x042) +#define PM4125_DIG_SWR_CDC_RX_CLK_CTL (PM4125_DIG_BASE_ADDR+0x043) +#define PM4125_DIG_SWR_ANA_RX_DIV2_CLK_EN_MASK BIT(5) +#define PM4125_DIG_SWR_ANA_RX_CLK_EN_MASK BIT(4) +#define PM4125_DIG_SWR_RX1_CLK_EN_MASK BIT(1) +#define PM4125_DIG_SWR_RX0_CLK_EN_MASK BIT(0) +#define PM4125_DIG_SWR_RX_CLK_ENABLE 1 +#define PM4125_DIG_SWR_RX_CLK_DISABLE 0 +#define PM4125_DIG_SWR_CDC_TX_CLK_CTL (PM4125_DIG_BASE_ADDR+0x044) +#define PM4125_DIG_SWR_SWR_RST_EN (PM4125_DIG_BASE_ADDR+0x045) +#define PM4125_DIG_SWR_CDC_RX_RST (PM4125_DIG_BASE_ADDR+0x047) +#define PM4125_DIG_SWR_CDC_RX0_CTL (PM4125_DIG_BASE_ADDR+0x048) +#define PM4125_DIG_SWR_DSM_DITHER_EN_MASK BIT(7) +#define PM4125_DIG_SWR_DSM_DITHER_DISABLE 0 +#define PM4125_DIG_SWR_DSM_DITHER_ENABLE 1 +#define PM4125_DIG_SWR_CDC_RX1_CTL (PM4125_DIG_BASE_ADDR+0x049) +#define PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1 (PM4125_DIG_BASE_ADDR+0x04B) +#define PM4125_DIG_SWR_TX_ANA_TXD1_MODE_MASK GENMASK(7, 4) +#define PM4125_DIG_SWR_TX_ANA_TXD0_MODE_MASK GENMASK(3, 0) +#define PM4125_DIG_SWR_TXD_MODE_ULPI (0x9) +#define PM4125_DIG_SWR_TXD_MODE_NORMAL (0x3) +#define PM4125_DIG_SWR_CDC_COMP_CTL_0 (PM4125_DIG_BASE_ADDR+0x04F) +#define PM4125_DIG_SWR_COMP_HPHL_EN_MASK BIT(1) +#define PM4125_DIG_SWR_COMP_HPHR_EN_MASK BIT(0) +#define PM4125_DIG_SWR_COMP_ENABLE 1 +#define PM4125_DIG_SWR_COMP_DISABLE 0 +#define PM4125_DIG_SWR_CDC_RX_DELAY_CTL (PM4125_DIG_BASE_ADDR+0x052) +#define PM4125_DIG_SWR_CDC_RX_GAIN_0 (PM4125_DIG_BASE_ADDR+0x053) +#define PM4125_DIG_SWR_CDC_RX_GAIN_1 (PM4125_DIG_BASE_ADDR+0x054) +#define PM4125_DIG_SWR_CDC_RX_GAIN_CTL (PM4125_DIG_BASE_ADDR+0x057) +#define PM4125_DIG_SWR_RX1_EN_MASK BIT(3) +#define PM4125_DIG_SWR_RX0_EN_MASK BIT(2) +#define PM4125_DIG_SWR_RX_INPUT_DISABLE 0 +#define PM4125_DIG_SWR_RX_INPUT_ENABLE 1 +#define PM4125_DIG_SWR_CDC_TX0_CTL (PM4125_DIG_BASE_ADDR+0x060) +#define PM4125_DIG_SWR_CDC_TX1_CTL 
(PM4125_DIG_BASE_ADDR+0x061) +#define PM4125_DIG_SWR_CDC_TX_RST (PM4125_DIG_BASE_ADDR+0x063) +#define PM4125_DIG_SWR_CDC_REQ0_CTL (PM4125_DIG_BASE_ADDR+0x064) +#define PM4125_DIG_SWR_CDC_REQ1_CTL (PM4125_DIG_BASE_ADDR+0x065) +#define PM4125_DIG_SWR_CDC_RST (PM4125_DIG_BASE_ADDR+0x067) +#define PM4125_DIG_SWR_CDC_AMIC_CTL (PM4125_DIG_BASE_ADDR+0x06A) +#define PM4125_DIG_SWR_AMIC_SELECT_MASK BIT(1) +#define PM4125_DIG_SWR_AMIC_SELECT_DMIC1 0 +#define PM4125_DIG_SWR_AMIC_SELECT_AMIC3 1 +#define PM4125_DIG_SWR_CDC_DMIC_CTL (PM4125_DIG_BASE_ADDR+0x06B) +#define PM4125_DIG_SWR_CDC_DMIC1_CTL (PM4125_DIG_BASE_ADDR+0x06C) +#define PM4125_DIG_SWR_DMIC1_CLK_EN_MASK BIT(3) +#define PM4125_DIG_SWR_DMIC1_CLK_ENABLE 1 +#define PM4125_DIG_SWR_DMIC1_CLK_DISABLE 0 +#define PM4125_DIG_SWR_CDC_DMIC1_RATE (PM4125_DIG_BASE_ADDR+0x06D) +#define PM4125_DIG_SWR_PDM_WD_CTL0 (PM4125_DIG_BASE_ADDR+0x070) +#define PM4125_WDT_ENABLE_MASK GENMASK(1, 0) +#define PM4125_WDT_ENABLE_RX0_L BIT(0) +#define PM4125_WDT_ENABLE_RX0_M BIT(1) +#define PM4125_DIG_SWR_PDM_WD_CTL1 (PM4125_DIG_BASE_ADDR+0x071) +#define PM4125_WDT_ENABLE_RX1_L BIT(0) +#define PM4125_WDT_ENABLE_RX1_M BIT(1) +#define PM4125_DIG_SWR_INTR_MODE (PM4125_DIG_BASE_ADDR+0x080) +#define PM4125_DIG_SWR_INTR_MASK_0 (PM4125_DIG_BASE_ADDR+0x081) +#define PM4125_DIG_SWR_INTR_MASK_1 (PM4125_DIG_BASE_ADDR+0x082) +#define PM4125_DIG_SWR_INTR_MASK_2 (PM4125_DIG_BASE_ADDR+0x083) +#define PM4125_DIG_SWR_INTR_STATUS_0 (PM4125_DIG_BASE_ADDR+0x084) +#define PM4125_DIG_SWR_INTR_STATUS_1 (PM4125_DIG_BASE_ADDR+0x085) +#define PM4125_DIG_SWR_INTR_STATUS_2 (PM4125_DIG_BASE_ADDR+0x086) +#define PM4125_DIG_SWR_INTR_CLEAR_0 (PM4125_DIG_BASE_ADDR+0x087) +#define PM4125_DIG_SWR_INTR_CLEAR_1 (PM4125_DIG_BASE_ADDR+0x088) +#define PM4125_DIG_SWR_INTR_CLEAR_2 (PM4125_DIG_BASE_ADDR+0x089) +#define PM4125_DIG_SWR_INTR_LEVEL_0 (PM4125_DIG_BASE_ADDR+0x08A) +#define PM4125_DIG_SWR_INTR_LEVEL_1 (PM4125_DIG_BASE_ADDR+0x08B) +#define PM4125_DIG_SWR_INTR_LEVEL_2 (PM4125_DIG_BASE_ADDR+0x08C) +#define PM4125_DIG_SWR_CDC_CONN_RX0_CTL (PM4125_DIG_BASE_ADDR+0x093) +#define PM4125_DIG_SWR_CDC_CONN_RX1_CTL (PM4125_DIG_BASE_ADDR+0x094) +#define PM4125_DIG_SWR_LOOP_BACK_MODE (PM4125_DIG_BASE_ADDR+0x097) +#define PM4125_DIG_SWR_DRIVE_STRENGTH_0 (PM4125_DIG_BASE_ADDR+0x0A0) +#define PM4125_DIG_SWR_DIG_DEBUG_CTL (PM4125_DIG_BASE_ADDR+0x0AB) +#define PM4125_DIG_SWR_DIG_DEBUG_EN (PM4125_DIG_BASE_ADDR+0x0AC) +#define PM4125_DIG_SWR_DEM_BYPASS_DATA0 (PM4125_DIG_BASE_ADDR+0x0B0) +#define PM4125_DIG_SWR_DEM_BYPASS_DATA1 (PM4125_DIG_BASE_ADDR+0x0B1) +#define PM4125_DIG_SWR_DEM_BYPASS_DATA2 (PM4125_DIG_BASE_ADDR+0x0B2) +#define PM4125_DIG_SWR_DEM_BYPASS_DATA3 (PM4125_DIG_BASE_ADDR+0x0B3) + +#define PM4125_ANALOG_REGISTERS_MAX_SIZE (PM4125_ANA_BASE_ADDR+0x0B5) +#define PM4125_DIGITAL_REGISTERS_MAX_SIZE (PM4125_DIG_BASE_ADDR+0x0B4) +#define PM4125_ANALOG_MAX_REGISTER (PM4125_ANALOG_REGISTERS_MAX_SIZE - 1) +#define PM4125_DIGITAL_MAX_REGISTER (PM4125_DIGITAL_REGISTERS_MAX_SIZE - 1) +#define PM4125_MAX_REGISTER PM4125_DIGITAL_MAX_REGISTER + +#define PM4125_MAX_MICBIAS 3 +#define PM4125_MAX_SWR_CH_IDS 15 +#define PM4125_SWRM_CH_MASK(ch_idx) BIT(ch_idx - 1) + +enum pm4125_tx_sdw_ports { + PM4125_ADC_1_2_DMIC1L_BCS_PORT = 1, + PM4125_DMIC_1L_1R_ADC1_BCS_PORT, + PM4125_MAX_TX_SWR_PORTS = PM4125_DMIC_1L_1R_ADC1_BCS_PORT, +}; + +enum pm4125_rx_sdw_ports { + PM4125_HPH_PORT = 1, + PM4125_COMP_PORT, + PM4125_MAX_SWR_PORTS = PM4125_COMP_PORT, +}; + +struct pm4125_sdw_ch_info { + int port_num; + unsigned int ch_mask; + unsigned int 
master_ch_mask; +}; + +#define WCD_SDW_CH(id, pn, cmask) \ + [id] = { \ + .port_num = pn, \ + .ch_mask = cmask, \ + .master_ch_mask = cmask, \ + } + +struct pm4125_priv; +struct pm4125_sdw_priv { + struct sdw_slave *sdev; + struct sdw_stream_config sconfig; + struct sdw_stream_runtime *sruntime; + struct sdw_port_config port_config[PM4125_MAX_SWR_PORTS]; + struct pm4125_sdw_ch_info *ch_info; + bool port_enable[PM4125_MAX_SWR_CH_IDS]; + unsigned int master_channel_map[SDW_MAX_PORTS]; + int active_ports; + int num_ports; + bool is_tx; + struct pm4125_priv *pm4125; + struct irq_domain *slave_irq; + struct regmap *regmap; +}; + +#if IS_ENABLED(CONFIG_SND_SOC_PM4125_SDW) +int pm4125_sdw_free(struct pm4125_sdw_priv *pm4125, struct snd_pcm_substream *substream, + struct snd_soc_dai *dai); +int pm4125_sdw_set_sdw_stream(struct pm4125_sdw_priv *pm4125, struct snd_soc_dai *dai, void *stream, + int direction); +int pm4125_sdw_hw_params(struct pm4125_sdw_priv *pm4125, struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); + +struct device *pm4125_sdw_device_get(struct device_node *np); + +#else +static inline int pm4125_sdw_free(struct pm4125_sdw_priv *pm4125, + struct snd_pcm_substream *substream, struct snd_soc_dai *dai) +{ + return -EOPNOTSUPP; +} + +static inline int pm4125_sdw_set_sdw_stream(struct pm4125_sdw_priv *pm4125, + struct snd_soc_dai *dai, void *stream, int direction) +{ + return -EOPNOTSUPP; +} + +static inline int pm4125_sdw_hw_params(struct pm4125_sdw_priv *pm4125, + struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) +{ + return -EOPNOTSUPP; +} +#endif + +enum { + /* INTR_CTRL_INT_MASK_0 */ + PM4125_IRQ_MBHC_BUTTON_PRESS_DET = 0, + PM4125_IRQ_MBHC_BUTTON_RELEASE_DET, + PM4125_IRQ_MBHC_ELECT_INS_REM_DET, + PM4125_IRQ_MBHC_ELECT_INS_REM_LEG_DET, + PM4125_IRQ_MBHC_SW_DET, + PM4125_IRQ_HPHR_OCP_INT, + PM4125_IRQ_HPHR_CNP_INT, + PM4125_IRQ_HPHL_OCP_INT, + + /* INTR_CTRL_INT_MASK_1 */ + PM4125_IRQ_HPHL_CNP_INT, + PM4125_IRQ_EAR_CNP_INT, + PM4125_IRQ_EAR_SCD_INT, + PM4125_IRQ_AUX_CNP_INT, + PM4125_IRQ_AUX_SCD_INT, + PM4125_IRQ_HPHL_PDM_WD_INT, + PM4125_IRQ_HPHR_PDM_WD_INT, + PM4125_IRQ_AUX_PDM_WD_INT, + + /* INTR_CTRL_INT_MASK_2 */ + PM4125_IRQ_LDORT_SCD_INT, + PM4125_IRQ_MBHC_MOISTURE_INT, + PM4125_IRQ_HPHL_SURGE_DET_INT, + PM4125_IRQ_HPHR_SURGE_DET_INT, + PM4125_NUM_IRQS, +}; + +enum pm4125_tx_sdw_channels { + PM4125_ADC1, + PM4125_ADC2, +}; + +enum pm4125_rx_sdw_channels { + PM4125_HPH_L, + PM4125_HPH_R, + PM4125_COMP_L, + PM4125_COMP_R, +}; + +#endif /* _PM4125_REGISTERS_H */ From b9cb410d48b00a759f0947f2491785f4d0486c68 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Mon, 15 Sep 2025 17:27:51 +0100 Subject: [PATCH 1862/3206] MAINTAINERS: add Qualcomm PM4125 audio codec to drivers list Since new audio codec driver is added the get_maintainers script should catch the new files. 
Signed-off-by: Alexey Klimov Reviewed-by: Srinivas Kandagatla Tested-by: Srinivas Kandagatla Link: https://patch.msgid.link/20250915-pm4125_audio_codec_v1-v4-4-b247b64eec52@linaro.org Signed-off-by: Mark Brown --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f6206963efbf0b..e96159e6617f2d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20470,6 +20470,8 @@ F: include/dt-bindings/sound/qcom,wcd93* F: sound/soc/codecs/lpass-*.* F: sound/soc/codecs/msm8916-wcd-analog.c F: sound/soc/codecs/msm8916-wcd-digital.c +F: sound/soc/codecs/pm4125-sdw.c +F: sound/soc/codecs/pm4125.* F: sound/soc/codecs/wcd-clsh-v2.* F: sound/soc/codecs/wcd-mbhc-v2.* F: sound/soc/codecs/wcd93*.* From 88d0d17192c5a850dc07bb38035b69c4cefde270 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Mon, 15 Sep 2025 17:27:48 +0100 Subject: [PATCH 1863/3206] ASoC: dt-bindings: add bindings for pm4125 audio codec The audio codec IC is found on the Qualcomm PM4125/PM2250 PMIC. It has TX and RX SoundWire slave devices, hence two binding files are added. Signed-off-by: Alexey Klimov Reviewed-by: Srinivas Kandagatla Tested-by: Srinivas Kandagatla Link: https://patch.msgid.link/20250915-pm4125_audio_codec_v1-v4-1-b247b64eec52@linaro.org Signed-off-by: Mark Brown --- .../bindings/sound/qcom,pm4125-codec.yaml | 134 ++++++++++++++++++ .../bindings/sound/qcom,pm4125-sdw.yaml | 79 +++++++++++ 2 files changed, 213 insertions(+) create mode 100644 Documentation/devicetree/bindings/sound/qcom,pm4125-codec.yaml create mode 100644 Documentation/devicetree/bindings/sound/qcom,pm4125-sdw.yaml diff --git a/Documentation/devicetree/bindings/sound/qcom,pm4125-codec.yaml b/Documentation/devicetree/bindings/sound/qcom,pm4125-codec.yaml new file mode 100644 index 00000000000000..6e2f103be1d324 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/qcom,pm4125-codec.yaml @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/qcom,pm4125-codec.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm PM4125 Audio Codec + +maintainers: + - Alexey Klimov + +description: + The audio codec IC found on Qualcomm PM4125/PM2250 PMICs. + It has RX and TX SoundWire slave devices. + +allOf: + - $ref: dai-common.yaml# + +properties: + compatible: + const: qcom,pm4125-codec + + reg: + description: + Specifies the SPMI base address for the audio codec peripherals. The + address space contains the reset register needed to power on the codec.
+ maxItems: 1 + + reg-names: + maxItems: 1 + + vdd-io-supply: + description: A reference to the 1.8V I/O supply + + vdd-cp-supply: + description: A reference to the charge pump I/O supply + + vdd-mic-bias-supply: + description: A reference to the 3.3V mic bias supply + + vdd-pa-vpos-supply: + description: A reference to the PA VPOS supply + + qcom,tx-device: + $ref: /schemas/types.yaml#/definitions/phandle-array + description: A reference to the SoundWire TX device phandle + + qcom,rx-device: + $ref: /schemas/types.yaml#/definitions/phandle-array + description: A reference to the SoundWire RX device phandle + + qcom,micbias1-microvolt: + description: micbias1 voltage + minimum: 1800000 + maximum: 2850000 + + qcom,micbias2-microvolt: + description: micbias2 voltage + minimum: 1800000 + maximum: 2850000 + + qcom,micbias3-microvolt: + description: micbias3 voltage + minimum: 1800000 + maximum: 2850000 + + qcom,mbhc-buttons-vthreshold-microvolt: + description: + Array of 8 voltage threshold values corresponding to headset + button0 - button7 + minItems: 8 + maxItems: 8 + + '#sound-dai-cells': + const: 1 + +required: + - compatible + - reg + - vdd-io-supply + - vdd-cp-supply + - vdd-mic-bias-supply + - vdd-pa-vpos-supply + - qcom,tx-device + - qcom,rx-device + - qcom,micbias1-microvolt + - qcom,micbias2-microvolt + - qcom,micbias3-microvolt + - '#sound-dai-cells' + +unevaluatedProperties: false + +examples: + - | + #include + + spmi { + #address-cells = <2>; + #size-cells = <0>; + + pmic { + #address-cells = <1>; + #size-cells = <0>; + + audio-codec@f000 { + compatible = "qcom,pm4125-codec"; + reg = <0xf000>; + vdd-io-supply = <&pm4125_l15>; + vdd-cp-supply = <&pm4125_s4>; + vdd-pa-vpos-supply = <&pm4125_s4>; + vdd-mic-bias-supply = <&pm4125_l22>; + qcom,micbias1-microvolt = <1800000>; + qcom,micbias2-microvolt = <1800000>; + qcom,micbias3-microvolt = <1800000>; + qcom,rx-device = <&pm4125_rx>; + qcom,tx-device = <&pm4125_tx>; + #sound-dai-cells = <1>; + }; + }; + }; + + /* ... */ + + soundwire@a610000 { + reg = <0x0a610000 0x2000>; + #address-cells = <2>; + #size-cells = <0>; + pm4125_rx: audio-codec@0,4 { + compatible = "sdw20217010c00"; + reg = <0 4>; + qcom,rx-port-mapping = <1 3>; + }; + }; +... diff --git a/Documentation/devicetree/bindings/sound/qcom,pm4125-sdw.yaml b/Documentation/devicetree/bindings/sound/qcom,pm4125-sdw.yaml new file mode 100644 index 00000000000000..23624f32ac3058 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/qcom,pm4125-sdw.yaml @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/qcom,pm4125-sdw.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm SoundWire Slave devices on PM4125/PM2250 PMIC audio codec + +maintainers: + - Alexey Klimov + +description: + The audio codec IC found on Qualcomm PM4125/PM2250 PMICs. + It has RX and TX SoundWire slave devices. + +properties: + compatible: + const: sdw20217010c00 + + reg: + maxItems: 1 + + qcom,tx-port-mapping: + description: | + Specifies the static port mapping between device and host TX ports, + in the order of the device port indexes, which are adc1_port, + adc23_port, dmic03_mbhc_port, dmic46_port. + Supports a maximum of 2 TX SoundWire ports.
+ + PM4125 TX Port 1 (ADC1,2 & DMIC0 & MBHC) <=> SWR0 Port 1 + PM4125 TX Port 2 (ADC1 & DMIC0,1,2 & MBHC) <=> SWR0 Port 2 + + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 2 + maxItems: 2 + items: + enum: [1, 2, 3, 4] + + qcom,rx-port-mapping: + description: | + Specifies the static port mapping between device and host RX ports, + in the order of the device port indexes, which are hph_port, clsh_port, + comp_port, lo_port, dsd port. + Supports a maximum of 2 RX SoundWire ports. + + PM4125 RX Port 1 (HPH_L/R) <==> SWR1 Port 1 (HPH_L/R) + PM4125 RX Port 2 (COMP_L/R) <==> SWR1 Port 3 (COMP_L/R) + + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 2 + maxItems: 2 + items: + enum: [1, 2, 3, 4, 5] + +required: + - compatible + - reg + +oneOf: + - required: + - qcom,tx-port-mapping + - required: + - qcom,rx-port-mapping + +additionalProperties: false + +examples: + - | + soundwire@a610000 { + reg = <0x0a610000 0x2000>; + #address-cells = <2>; + #size-cells = <0>; + pm4125_rx: codec@0,1 { + compatible = "sdw20217010c00"; + reg = <0 1>; + qcom,rx-port-mapping = <1 3>; + }; + }; +... From 8b84d712ad849172f6bbcad57534b284d942b0b5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Aug 2025 12:20:52 -0500 Subject: [PATCH 1864/3206] regulator: spacemit: support SpacemiT P1 regulators Add support for the regulators found in the SpacemiT P1 PMIC. This PMIC provides 6 buck converters and 12 LDO regulators. The PMIC is implemented as a multi-function device. These regulators are probed based on this driver being named in an MFD cell in the simple MFD I2C driver. Signed-off-by: Alex Elder Link: https://patch.msgid.link/20250825172057.163883-4-elder@riscstar.com Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 13 +++ drivers/regulator/Makefile | 1 + drivers/regulator/spacemit-p1.c | 157 ++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+) create mode 100644 drivers/regulator/spacemit-p1.c diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 929ea98e18176c..5e451be51392ce 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1443,6 +1443,19 @@ config REGULATOR_SLG51000 The SLG51000 is seven compact and customizable low dropout regulators. +config REGULATOR_SPACEMIT_P1 + tristate "SpacemiT P1 regulators" + depends on ARCH_SPACEMIT || COMPILE_TEST + depends on I2C + select MFD_SPACEMIT_P1 + default ARCH_SPACEMIT + help + Enable support for regulators implemented by the SpacemiT P1 + power controller. The P1 implements 6 high-efficiency buck + converters and 12 programmable LDO regulators. To compile this + driver as a module, choose M here. The module will be called + "spacemit-p1".
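To picture the probing path the commit message above describes, a parent MFD driver would name this regulator driver in its cell table. A minimal sketch follows (the parent driver itself is assumed and is not part of this patch; only the "spacemit-p1-regulator" name is taken from MOD_NAME in spacemit-p1.c):

	#include <linux/mfd/core.h>

	/* Hypothetical parent-driver fragment: listing a cell with this name is
	 * what makes the platform bus bind p1_regulator_probe(). */
	static const struct mfd_cell p1_cells[] = {
		{ .name = "spacemit-p1-regulator" },	/* must match MOD_NAME */
	};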
+ config REGULATOR_STM32_BOOSTER tristate "STMicroelectronics STM32 BOOSTER" depends on ARCH_STM32 || COMPILE_TEST diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 7bac19847781cf..76b02d12b758c6 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -167,6 +167,7 @@ obj-$(CONFIG_REGULATOR_S5M8767) += s5m8767.o obj-$(CONFIG_REGULATOR_SC2731) += sc2731-regulator.o obj-$(CONFIG_REGULATOR_SKY81452) += sky81452-regulator.o obj-$(CONFIG_REGULATOR_SLG51000) += slg51000-regulator.o +obj-$(CONFIG_REGULATOR_SPACEMIT_P1) += spacemit-p1.o obj-$(CONFIG_REGULATOR_STM32_BOOSTER) += stm32-booster.o obj-$(CONFIG_REGULATOR_STM32_VREFBUF) += stm32-vrefbuf.o obj-$(CONFIG_REGULATOR_STM32_PWR) += stm32-pwr.o diff --git a/drivers/regulator/spacemit-p1.c b/drivers/regulator/spacemit-p1.c new file mode 100644 index 00000000000000..d437e6738ea1e3 --- /dev/null +++ b/drivers/regulator/spacemit-p1.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for regulators found in the SpacemiT P1 PMIC + * + * Copyright (C) 2025 by RISCstar Solutions Corporation. All rights reserved. + * Derived from code from SpacemiT. + * Copyright (c) 2023, SPACEMIT Co., Ltd + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MOD_NAME "spacemit-p1-regulator" + +enum p1_regulator_id { + P1_BUCK1, + P1_BUCK2, + P1_BUCK3, + P1_BUCK4, + P1_BUCK5, + P1_BUCK6, + + P1_ALDO1, + P1_ALDO2, + P1_ALDO3, + P1_ALDO4, + + P1_DLDO1, + P1_DLDO2, + P1_DLDO3, + P1_DLDO4, + P1_DLDO5, + P1_DLDO6, + P1_DLDO7, +}; + +static const struct regulator_ops p1_regulator_ops = { + .list_voltage = regulator_list_voltage_linear_range, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, +}; + +/* Selector value 255 can be used to disable the buck converter on sleep */ +static const struct linear_range p1_buck_ranges[] = { + REGULATOR_LINEAR_RANGE(500000, 0, 170, 5000), + REGULATOR_LINEAR_RANGE(1375000, 171, 254, 25000), +}; + +/* Selector value 0 can be used for suspend */ +static const struct linear_range p1_ldo_ranges[] = { + REGULATOR_LINEAR_RANGE(500000, 11, 127, 25000), +}; + +/* These define the voltage selector field for buck and LDO regulators */ +#define BUCK_MASK GENMASK(7, 0) +#define LDO_MASK GENMASK(6, 0) + +#define P1_ID(_TYPE, _n) P1_ ## _TYPE ## _n +#define P1_ENABLE_REG(_off, _n) ((_off) + 3 * ((_n) - 1)) + +#define P1_REG_DESC(_TYPE, _type, _n, _s, _off, _mask, _nv, _ranges) \ + { \ + .name = #_type #_n, \ + .supply_name = _s, \ + .of_match = of_match_ptr(#_type #_n), \ + .regulators_node = of_match_ptr("regulators"), \ + .id = P1_ID(_TYPE, _n), \ + .n_voltages = _nv, \ + .ops = &p1_regulator_ops, \ + .owner = THIS_MODULE, \ + .linear_ranges = _ranges, \ + .n_linear_ranges = ARRAY_SIZE(_ranges), \ + .vsel_reg = P1_ENABLE_REG(_off, _n) + 1, \ + .vsel_mask = _mask, \ + .enable_reg = P1_ENABLE_REG(_off, _n), \ + .enable_mask = BIT(0), \ + } + +#define P1_BUCK_DESC(_n) \ + P1_REG_DESC(BUCK, buck, _n, "vcc", 0x47, BUCK_MASK, 254, p1_buck_ranges) + +#define P1_ALDO_DESC(_n) \ + P1_REG_DESC(ALDO, aldo, _n, "vcc", 0x5b, LDO_MASK, 117, p1_ldo_ranges) + +#define P1_DLDO_DESC(_n) \ + P1_REG_DESC(DLDO, dldo, _n, "buck5", 0x67, LDO_MASK, 117, p1_ldo_ranges) + +static const struct regulator_desc p1_regulator_desc[] = { + 
P1_BUCK_DESC(1), + P1_BUCK_DESC(2), + P1_BUCK_DESC(3), + P1_BUCK_DESC(4), + P1_BUCK_DESC(5), + P1_BUCK_DESC(6), + + P1_ALDO_DESC(1), + P1_ALDO_DESC(2), + P1_ALDO_DESC(3), + P1_ALDO_DESC(4), + + P1_DLDO_DESC(1), + P1_DLDO_DESC(2), + P1_DLDO_DESC(3), + P1_DLDO_DESC(4), + P1_DLDO_DESC(5), + P1_DLDO_DESC(6), + P1_DLDO_DESC(7), +}; + +static int p1_regulator_probe(struct platform_device *pdev) +{ + struct regulator_config config = { }; + struct device *dev = &pdev->dev; + u32 i; + + /* + * The parent device (PMIC) owns the regmap. Since we don't + * provide one in the config structure, that one will be used. + */ + config.dev = dev->parent; + + for (i = 0; i < ARRAY_SIZE(p1_regulator_desc); i++) { + const struct regulator_desc *desc = &p1_regulator_desc[i]; + struct regulator_dev *rdev; + + rdev = devm_regulator_register(dev, desc, &config); + if (IS_ERR(rdev)) + return dev_err_probe(dev, PTR_ERR(rdev), + "error registering regulator %s\n", + desc->name); + } + + return 0; +} + +static struct platform_driver p1_regulator_driver = { + .probe = p1_regulator_probe, + .driver = { + .name = MOD_NAME, + }, +}; + +module_platform_driver(p1_regulator_driver); + +MODULE_DESCRIPTION("SpacemiT P1 regulator driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:" MOD_NAME); From 51dad33ede63618a6b425c650f3042d85e646dac Mon Sep 17 00:00:00 2001 From: Ming Yu Date: Fri, 12 Sep 2025 17:19:46 +0800 Subject: [PATCH 1865/3206] mfd: Add core driver for Nuvoton NCT6694 The Nuvoton NCT6694 provides a USB interface to the host to access its features. Sub-devices can use the USB functions nct6694_read_msg() and nct6694_write_msg() to issue a command. They can also request an interrupt whose handler is called when the core driver receives an event on the USB interrupt pipe. Signed-off-by: Ming Yu Link: https://lore.kernel.org/r/20250912091952.1169369-2-a0282524688@gmail.com Signed-off-by: Lee Jones --- MAINTAINERS | 6 + drivers/mfd/Kconfig | 15 ++ drivers/mfd/Makefile | 2 + drivers/mfd/nct6694.c | 388 ++++++++++++++++++++++++++++++++++++ include/linux/mfd/nct6694.h | 102 ++++++++++ 5 files changed, 513 insertions(+) create mode 100644 drivers/mfd/nct6694.c create mode 100644 include/linux/mfd/nct6694.h diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..a8a05872d07757 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18082,6 +18082,12 @@ F: drivers/nubus/ F: include/linux/nubus.h F: include/uapi/linux/nubus.h +NUVOTON NCT6694 MFD DRIVER +M: Ming Yu +S: Supported +F: drivers/mfd/nct6694.c +F: include/linux/mfd/nct6694.h + NUVOTON NCT7201 IIO DRIVER M: Eason Yang L: linux-iio@vger.kernel.org diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1e7..f3d157776e93a5 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1134,6 +1134,21 @@ config MFD_MENF21BMC This driver can also be built as a module. If so the module will be called menf21bmc. +config MFD_NCT6694 + tristate "Nuvoton NCT6694 support" + select MFD_CORE + depends on USB + help + This enables support for the Nuvoton NCT6694, a USB device whose + peripherals are shared among several sub-drivers. + The Nuvoton NCT6694 is a peripheral expander with 16 GPIO chips, + 6 I2C controllers, 2 CAN FD controllers, 2 Watchdog timers, ADC, + PWM, and RTC. + This driver provides core APIs to access the NCT6694 hardware + monitoring and control features. + Additional drivers must be enabled to utilize the specific + functionalities of the device.
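As a concrete illustration of this command flow, a sub-device could issue a read roughly as follows (a minimal sketch: the module ID and register offset are hypothetical, while the header layout, the cpu_to_le16() conversions on the little-endian fields, and nct6694_read_msg() itself come from this patch; the nct6694 pointer is the parent device data, obtained the same way the sub-drivers below do via dev_get_drvdata(dev->parent)):

	u8 val;
	const struct nct6694_cmd_header cmd_hd = {
		.mod	= 0xff,			/* hypothetical module ID */
		.offset	= cpu_to_le16(0x110),	/* hypothetical register offset */
		.len	= cpu_to_le16(sizeof(val)),
	};
	int ret = nct6694_read_msg(nct6694, &cmd_hd, &val);
	/* .hctrl is filled in internally: NCT6694_HCTRL_GET for reads */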
+ config MFD_OCELOT tristate "Microsemi Ocelot External Control Support" depends on SPI_MASTER diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d16..1e7738c02b2c97 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -121,6 +121,8 @@ obj-$(CONFIG_MFD_MC13XXX) += mc13xxx-core.o obj-$(CONFIG_MFD_MC13XXX_SPI) += mc13xxx-spi.o obj-$(CONFIG_MFD_MC13XXX_I2C) += mc13xxx-i2c.o +obj-$(CONFIG_MFD_NCT6694) += nct6694.o + obj-$(CONFIG_MFD_CORE) += mfd-core.o ocelot-soc-objs := ocelot-core.o ocelot-spi.o diff --git a/drivers/mfd/nct6694.c b/drivers/mfd/nct6694.c new file mode 100644 index 00000000000000..308b2fda3055c5 --- /dev/null +++ b/drivers/mfd/nct6694.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Nuvoton Technology Corp. + * + * Nuvoton NCT6694 core driver using USB interface to provide + * access to the NCT6694 hardware monitoring and control features. + * + * The NCT6694 is an integrated controller that provides GPIO, I2C, + * CAN, WDT, HWMON and RTC management. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell nct6694_devs[] = { + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + + MFD_CELL_NAME("nct6694-canfd"), + MFD_CELL_NAME("nct6694-canfd"), + + MFD_CELL_NAME("nct6694-wdt"), + MFD_CELL_NAME("nct6694-wdt"), + + MFD_CELL_NAME("nct6694-hwmon"), + + MFD_CELL_NAME("nct6694-rtc"), +}; + +static int nct6694_response_err_handling(struct nct6694 *nct6694, unsigned char err_status) +{ + switch (err_status) { + case NCT6694_NO_ERROR: + return 0; + case NCT6694_NOT_SUPPORT_ERROR: + dev_err(nct6694->dev, "Command is not supported!\n"); + break; + case NCT6694_NO_RESPONSE_ERROR: + dev_warn(nct6694->dev, "Command received no response!\n"); + break; + case NCT6694_TIMEOUT_ERROR: + dev_warn(nct6694->dev, "Command timed out!\n"); + break; + case NCT6694_PENDING: + dev_err(nct6694->dev, "Command is pending!\n"); + break; + default: + return -EINVAL; + } + + return -EIO; +} + +/** + * nct6694_read_msg() - Read message from NCT6694 device + * @nct6694: NCT6694 device pointer + * @cmd_hd: command header structure + * @buf: buffer to store the response data + * + * Sends a command to the NCT6694 device and reads the response. + * The command header is specified in @cmd_hd, and the response + * data is stored in @buf. + * + * Return: Negative value on error or 0 on success. 
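+ *
+ * Note: this function may sleep. It serializes transfers with a mutex and
+ * issues blocking USB bulk messages, so it must not be called from atomic
+ * context.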
+ */ +int nct6694_read_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf) +{ + union nct6694_usb_msg *msg = nct6694->usb_msg; + struct usb_device *udev = nct6694->udev; + int tx_len, rx_len, ret; + + guard(mutex)(&nct6694->access_lock); + + memcpy(&msg->cmd_header, cmd_hd, sizeof(*cmd_hd)); + msg->cmd_header.hctrl = NCT6694_HCTRL_GET; + + /* Send command packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), &msg->cmd_header, + sizeof(*msg), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive response packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), &msg->response_header, + sizeof(*msg), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive data packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), buf, + le16_to_cpu(cmd_hd->len), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + if (rx_len != le16_to_cpu(cmd_hd->len)) { + dev_err(nct6694->dev, "Expected received length %d, but got %d\n", + le16_to_cpu(cmd_hd->len), rx_len); + return -EIO; + } + + return nct6694_response_err_handling(nct6694, msg->response_header.sts); +} +EXPORT_SYMBOL_GPL(nct6694_read_msg); + +/** + * nct6694_write_msg() - Write message to NCT6694 device + * @nct6694: NCT6694 device pointer + * @cmd_hd: command header structure + * @buf: buffer containing the data to be sent + * + * Sends a command to the NCT6694 device and writes the data + * from @buf. The command header is specified in @cmd_hd. + * + * Return: Negative value on error or 0 on success. + */ +int nct6694_write_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf) +{ + union nct6694_usb_msg *msg = nct6694->usb_msg; + struct usb_device *udev = nct6694->udev; + int tx_len, rx_len, ret; + + guard(mutex)(&nct6694->access_lock); + + memcpy(&msg->cmd_header, cmd_hd, sizeof(*cmd_hd)); + msg->cmd_header.hctrl = NCT6694_HCTRL_SET; + + /* Send command packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), &msg->cmd_header, + sizeof(*msg), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Send data packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), buf, + le16_to_cpu(cmd_hd->len), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive response packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), &msg->response_header, + sizeof(*msg), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive data packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), buf, + le16_to_cpu(cmd_hd->len), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + if (rx_len != le16_to_cpu(cmd_hd->len)) { + dev_err(nct6694->dev, "Expected transmitted length %d, but got %d\n", + le16_to_cpu(cmd_hd->len), rx_len); + return -EIO; + } + + return nct6694_response_err_handling(nct6694, msg->response_header.sts); +} +EXPORT_SYMBOL_GPL(nct6694_write_msg); + +static void usb_int_callback(struct urb *urb) +{ + struct nct6694 *nct6694 = urb->context; + __le32 *status_le = urb->transfer_buffer; + u32 int_status; + int ret; + + switch (urb->status) { + case 0: + break; + case -ECONNRESET: + case -ENOENT: + case -ESHUTDOWN: + return; + default: + goto resubmit; + } + + int_status = le32_to_cpu(*status_le); + + while (int_status) { + int irq = __ffs(int_status); + + 
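+ /* Hand off to whichever sub-driver mapped this hwirq in the shared domain (GPIO bank, CAN FD or RTC; see enum nct6694_irq_id). */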
generic_handle_irq_safe(irq_find_mapping(nct6694->domain, irq)); + int_status &= ~BIT(irq); + } + +resubmit: + ret = usb_submit_urb(urb, GFP_ATOMIC); + if (ret) + dev_warn(nct6694->dev, "Failed to resubmit urb, status %pe", ERR_PTR(ret)); +} + +static void nct6694_irq_enable(struct irq_data *data) +{ + struct nct6694 *nct6694 = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + guard(spinlock_irqsave)(&nct6694->irq_lock); + + nct6694->irq_enable |= BIT(hwirq); +} + +static void nct6694_irq_disable(struct irq_data *data) +{ + struct nct6694 *nct6694 = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + guard(spinlock_irqsave)(&nct6694->irq_lock); + + nct6694->irq_enable &= ~BIT(hwirq); +} + +static const struct irq_chip nct6694_irq_chip = { + .name = "nct6694-irq", + .flags = IRQCHIP_SKIP_SET_WAKE, + .irq_enable = nct6694_irq_enable, + .irq_disable = nct6694_irq_disable, +}; + +static int nct6694_irq_domain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) +{ + struct nct6694 *nct6694 = d->host_data; + + irq_set_chip_data(irq, nct6694); + irq_set_chip_and_handler(irq, &nct6694_irq_chip, handle_simple_irq); + + return 0; +} + +static void nct6694_irq_domain_unmap(struct irq_domain *d, unsigned int irq) +{ + irq_set_chip_and_handler(irq, NULL, NULL); + irq_set_chip_data(irq, NULL); +} + +static const struct irq_domain_ops nct6694_irq_domain_ops = { + .map = nct6694_irq_domain_map, + .unmap = nct6694_irq_domain_unmap, +}; + +static int nct6694_usb_probe(struct usb_interface *iface, + const struct usb_device_id *id) +{ + struct usb_device *udev = interface_to_usbdev(iface); + struct usb_endpoint_descriptor *int_endpoint; + struct usb_host_interface *interface; + struct device *dev = &iface->dev; + struct nct6694 *nct6694; + int ret; + + nct6694 = devm_kzalloc(dev, sizeof(*nct6694), GFP_KERNEL); + if (!nct6694) + return -ENOMEM; + + nct6694->usb_msg = devm_kzalloc(dev, sizeof(union nct6694_usb_msg), GFP_KERNEL); + if (!nct6694->usb_msg) + return -ENOMEM; + + nct6694->int_buffer = devm_kzalloc(dev, sizeof(*nct6694->int_buffer), GFP_KERNEL); + if (!nct6694->int_buffer) + return -ENOMEM; + + nct6694->int_in_urb = usb_alloc_urb(0, GFP_KERNEL); + if (!nct6694->int_in_urb) + return -ENOMEM; + + nct6694->domain = irq_domain_create_simple(NULL, NCT6694_NR_IRQS, 0, + &nct6694_irq_domain_ops, + nct6694); + if (!nct6694->domain) { + ret = -ENODEV; + goto err_urb; + } + + nct6694->dev = dev; + nct6694->udev = udev; + + ida_init(&nct6694->gpio_ida); + ida_init(&nct6694->i2c_ida); + ida_init(&nct6694->canfd_ida); + ida_init(&nct6694->wdt_ida); + + spin_lock_init(&nct6694->irq_lock); + + ret = devm_mutex_init(dev, &nct6694->access_lock); + if (ret) + goto err_ida; + + interface = iface->cur_altsetting; + + int_endpoint = &interface->endpoint[0].desc; + if (!usb_endpoint_is_int_in(int_endpoint)) { + ret = -ENODEV; + goto err_ida; + } + + usb_fill_int_urb(nct6694->int_in_urb, udev, usb_rcvintpipe(udev, NCT6694_INT_IN_EP), + nct6694->int_buffer, sizeof(*nct6694->int_buffer), usb_int_callback, + nct6694, int_endpoint->bInterval); + + ret = usb_submit_urb(nct6694->int_in_urb, GFP_KERNEL); + if (ret) + goto err_ida; + + usb_set_intfdata(iface, nct6694); + + ret = mfd_add_hotplug_devices(dev, nct6694_devs, ARRAY_SIZE(nct6694_devs)); + if (ret) + goto err_mfd; + + return 0; + +err_mfd: + usb_kill_urb(nct6694->int_in_urb); +err_ida: + ida_destroy(&nct6694->wdt_ida); + ida_destroy(&nct6694->canfd_ida); + ida_destroy(&nct6694->i2c_ida); + 
ida_destroy(&nct6694->gpio_ida); + irq_domain_remove(nct6694->domain); +err_urb: + usb_free_urb(nct6694->int_in_urb); + return ret; +} + +static void nct6694_usb_disconnect(struct usb_interface *iface) +{ + struct nct6694 *nct6694 = usb_get_intfdata(iface); + + mfd_remove_devices(nct6694->dev); + usb_kill_urb(nct6694->int_in_urb); + ida_destroy(&nct6694->wdt_ida); + ida_destroy(&nct6694->canfd_ida); + ida_destroy(&nct6694->i2c_ida); + ida_destroy(&nct6694->gpio_ida); + irq_domain_remove(nct6694->domain); + usb_free_urb(nct6694->int_in_urb); +} + +static const struct usb_device_id nct6694_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(NCT6694_VENDOR_ID, NCT6694_PRODUCT_ID, 0xFF, 0x00, 0x00) }, + { } +}; +MODULE_DEVICE_TABLE(usb, nct6694_ids); + +static struct usb_driver nct6694_usb_driver = { + .name = "nct6694", + .id_table = nct6694_ids, + .probe = nct6694_usb_probe, + .disconnect = nct6694_usb_disconnect, +}; +module_usb_driver(nct6694_usb_driver); + +MODULE_DESCRIPTION("Nuvoton NCT6694 core driver"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/nct6694.h b/include/linux/mfd/nct6694.h new file mode 100644 index 00000000000000..6eb9be2cd4a011 --- /dev/null +++ b/include/linux/mfd/nct6694.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Nuvoton Technology Corp. + * + * Nuvoton NCT6694 USB transaction and data structure. + */ + +#ifndef __MFD_NCT6694_H +#define __MFD_NCT6694_H + +#define NCT6694_VENDOR_ID 0x0416 +#define NCT6694_PRODUCT_ID 0x200B +#define NCT6694_INT_IN_EP 0x81 +#define NCT6694_BULK_IN_EP 0x02 +#define NCT6694_BULK_OUT_EP 0x03 + +#define NCT6694_HCTRL_SET 0x40 +#define NCT6694_HCTRL_GET 0x80 + +#define NCT6694_URB_TIMEOUT 1000 + +enum nct6694_irq_id { + NCT6694_IRQ_GPIO0 = 0, + NCT6694_IRQ_GPIO1, + NCT6694_IRQ_GPIO2, + NCT6694_IRQ_GPIO3, + NCT6694_IRQ_GPIO4, + NCT6694_IRQ_GPIO5, + NCT6694_IRQ_GPIO6, + NCT6694_IRQ_GPIO7, + NCT6694_IRQ_GPIO8, + NCT6694_IRQ_GPIO9, + NCT6694_IRQ_GPIOA, + NCT6694_IRQ_GPIOB, + NCT6694_IRQ_GPIOC, + NCT6694_IRQ_GPIOD, + NCT6694_IRQ_GPIOE, + NCT6694_IRQ_GPIOF, + NCT6694_IRQ_CAN0, + NCT6694_IRQ_CAN1, + NCT6694_IRQ_RTC, + NCT6694_NR_IRQS, +}; + +enum nct6694_response_err_status { + NCT6694_NO_ERROR = 0, + NCT6694_FORMAT_ERROR, + NCT6694_RESERVED1, + NCT6694_RESERVED2, + NCT6694_NOT_SUPPORT_ERROR, + NCT6694_NO_RESPONSE_ERROR, + NCT6694_TIMEOUT_ERROR, + NCT6694_PENDING, +}; + +struct __packed nct6694_cmd_header { + u8 rsv1; + u8 mod; + union __packed { + __le16 offset; + struct __packed { + u8 cmd; + u8 sel; + }; + }; + u8 hctrl; + u8 rsv2; + __le16 len; +}; + +struct __packed nct6694_response_header { + u8 sequence_id; + u8 sts; + u8 reserved[4]; + __le16 len; +}; + +union __packed nct6694_usb_msg { + struct nct6694_cmd_header cmd_header; + struct nct6694_response_header response_header; +}; + +struct nct6694 { + struct device *dev; + struct ida gpio_ida; + struct ida i2c_ida; + struct ida canfd_ida; + struct ida wdt_ida; + struct irq_domain *domain; + struct mutex access_lock; + spinlock_t irq_lock; + struct urb *int_in_urb; + struct usb_device *udev; + union nct6694_usb_msg *usb_msg; + __le32 *int_buffer; + unsigned int irq_enable; +}; + +int nct6694_read_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf); +int nct6694_write_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf); + +#endif From 611a995e8ae1a52e34abb80ae02800ea100bdf84 Mon Sep 17 00:00:00 2001 From: Ming Yu Date: Fri, 12 Sep 2025 17:19:47 +0800 Subject: 
[PATCH 1866/3206] gpio: Add Nuvoton NCT6694 GPIO support This driver supports GPIO and IRQ functionality for the NCT6694 MFD device over its USB interface. Reviewed-by: Linus Walleij Acked-by: Bartosz Golaszewski Signed-off-by: Ming Yu Link: https://lore.kernel.org/r/20250912091952.1169369-3-a0282524688@gmail.com Signed-off-by: Lee Jones --- MAINTAINERS | 1 + drivers/gpio/Kconfig | 12 + drivers/gpio/Makefile | 1 + drivers/gpio/gpio-nct6694.c | 499 ++++++++++++++++++++++++++++++++++++ 4 files changed, 513 insertions(+) create mode 100644 drivers/gpio/gpio-nct6694.c diff --git a/MAINTAINERS b/MAINTAINERS index a8a05872d07757..e340d1934394f9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18085,6 +18085,7 @@ F: include/uapi/linux/nubus.h NUVOTON NCT6694 MFD DRIVER M: Ming Yu S: Supported +F: drivers/gpio/gpio-nct6694.c F: drivers/mfd/nct6694.c F: include/linux/mfd/nct6694.h diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index e43abb322fa6e1..1e0b1f5190a1a2 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1522,6 +1522,18 @@ config GPIO_MAX77759 This driver can also be built as a module. If so, the module will be called gpio-max77759. +config GPIO_NCT6694 + tristate "Nuvoton NCT6694 GPIO controller support" + depends on MFD_NCT6694 + select GENERIC_IRQ_CHIP + select GPIOLIB_IRQCHIP + help + This driver supports 8 GPIO pins per bank that can all be interrupt + sources. + + This driver can also be built as a module. If so, the module will be + called gpio-nct6694. + config GPIO_PALMAS tristate "TI PALMAS series PMICs GPIO" depends on MFD_PALMAS diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 379f55e9ed1e69..f3e837fccdd2c5 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -128,6 +128,7 @@ obj-$(CONFIG_GPIO_MT7621) += gpio-mt7621.o obj-$(CONFIG_GPIO_MVEBU) += gpio-mvebu.o obj-$(CONFIG_GPIO_MXC) += gpio-mxc.o obj-$(CONFIG_GPIO_MXS) += gpio-mxs.o +obj-$(CONFIG_GPIO_NCT6694) += gpio-nct6694.o obj-$(CONFIG_GPIO_NOMADIK) += gpio-nomadik.o obj-$(CONFIG_GPIO_NPCM_SGPIO) += gpio-npcm-sgpio.o obj-$(CONFIG_GPIO_OCTEON) += gpio-octeon.o diff --git a/drivers/gpio/gpio-nct6694.c b/drivers/gpio/gpio-nct6694.c new file mode 100644 index 00000000000000..a8607f0d99153a --- /dev/null +++ b/drivers/gpio/gpio-nct6694.c @@ -0,0 +1,499 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Nuvoton NCT6694 GPIO controller driver based on USB interface. + * + * Copyright (C) 2025 Nuvoton Technology Corp. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * USB command module type for NCT6694 GPIO controller. + * This defines the module type used for communication with the NCT6694 + * GPIO controller over the USB interface.
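+ *
+ * The register offsets defined below are per-bank: the driver addresses a
+ * bank's 8-bit register as (offset base + group index), one byte per
+ * 8-pin bank.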
+ */ +#define NCT6694_GPIO_MOD 0xFF + +#define NCT6694_GPIO_VER 0x90 +#define NCT6694_GPIO_VALID 0x110 +#define NCT6694_GPI_DATA 0x120 +#define NCT6694_GPO_DIR 0x170 +#define NCT6694_GPO_TYPE 0x180 +#define NCT6694_GPO_DATA 0x190 + +#define NCT6694_GPI_STS 0x130 +#define NCT6694_GPI_CLR 0x140 +#define NCT6694_GPI_FALLING 0x150 +#define NCT6694_GPI_RISING 0x160 + +#define NCT6694_NR_GPIO 8 + +struct nct6694_gpio_data { + struct nct6694 *nct6694; + struct gpio_chip gpio; + struct mutex lock; + /* Protect irq operation */ + struct mutex irq_lock; + + unsigned char reg_val; + unsigned char irq_trig_falling; + unsigned char irq_trig_rising; + + /* Current gpio group */ + unsigned char group; + int irq; +}; + +static int nct6694_get_direction(struct gpio_chip *gpio, unsigned int offset) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_DIR + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + return !(BIT(offset) & data->reg_val); +} + +static int nct6694_direction_input(struct gpio_chip *gpio, unsigned int offset) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_DIR + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + data->reg_val &= ~BIT(offset); + + return nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); +} + +static int nct6694_direction_output(struct gpio_chip *gpio, + unsigned int offset, int val) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_DIR + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + /* Set direction to output */ + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + data->reg_val |= BIT(offset); + ret = nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + /* Then set output level */ + cmd_hd.offset = cpu_to_le16(NCT6694_GPO_DATA + data->group); + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + if (val) + data->reg_val |= BIT(offset); + else + data->reg_val &= ~BIT(offset); + + return nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); +} + +static int nct6694_get_value(struct gpio_chip *gpio, unsigned int offset) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_DIR + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + if (BIT(offset) & data->reg_val) { + cmd_hd.offset = cpu_to_le16(NCT6694_GPO_DATA + data->group); + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + return !!(BIT(offset) & data->reg_val); + } + + cmd_hd.offset = cpu_to_le16(NCT6694_GPI_DATA + data->group); + ret = 
nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + return !!(BIT(offset) & data->reg_val); +} + +static int nct6694_set_value(struct gpio_chip *gpio, unsigned int offset, + int val) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_DATA + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + if (val) + data->reg_val |= BIT(offset); + else + data->reg_val &= ~BIT(offset); + + return nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); +} + +static int nct6694_set_config(struct gpio_chip *gpio, unsigned int offset, + unsigned long config) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_TYPE + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + switch (pinconf_to_config_param(config)) { + case PIN_CONFIG_DRIVE_OPEN_DRAIN: + data->reg_val |= BIT(offset); + break; + case PIN_CONFIG_DRIVE_PUSH_PULL: + data->reg_val &= ~BIT(offset); + break; + default: + return -ENOTSUPP; + } + + return nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); +} + +static int nct6694_init_valid_mask(struct gpio_chip *gpio, + unsigned long *valid_mask, + unsigned int ngpios) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPIO_VALID + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + *valid_mask = data->reg_val; + + return ret; +} + +static irqreturn_t nct6694_irq_handler(int irq, void *priv) +{ + struct nct6694_gpio_data *data = priv; + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPI_STS + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + unsigned char status; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret) + return IRQ_NONE; + + status = data->reg_val; + + while (status) { + int bit = __ffs(status); + + data->reg_val = BIT(bit); + handle_nested_irq(irq_find_mapping(data->gpio.irq.domain, bit)); + status &= ~BIT(bit); + cmd_hd.offset = cpu_to_le16(NCT6694_GPI_CLR + data->group); + nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); + } + + return IRQ_HANDLED; +} + +static int nct6694_get_irq_trig(struct nct6694_gpio_data *data) +{ + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPI_FALLING + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->irq_trig_falling); + if (ret) + return ret; + + cmd_hd.offset = cpu_to_le16(NCT6694_GPI_RISING + data->group); + return nct6694_read_msg(data->nct6694, &cmd_hd, &data->irq_trig_rising); +} + +static void nct6694_irq_mask(struct irq_data *d) +{ + struct gpio_chip *gpio = irq_data_get_irq_chip_data(d); + 
irq_hw_number_t hwirq = irqd_to_hwirq(d); + + gpiochip_disable_irq(gpio, hwirq); +} + +static void nct6694_irq_unmask(struct irq_data *d) +{ + struct gpio_chip *gpio = irq_data_get_irq_chip_data(d); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + + gpiochip_enable_irq(gpio, hwirq); +} + +static int nct6694_irq_set_type(struct irq_data *d, unsigned int type) +{ + struct gpio_chip *gpio = irq_data_get_irq_chip_data(d); + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + + guard(mutex)(&data->lock); + + switch (type) { + case IRQ_TYPE_EDGE_RISING: + data->irq_trig_rising |= BIT(hwirq); + break; + + case IRQ_TYPE_EDGE_FALLING: + data->irq_trig_falling |= BIT(hwirq); + break; + + case IRQ_TYPE_EDGE_BOTH: + data->irq_trig_rising |= BIT(hwirq); + data->irq_trig_falling |= BIT(hwirq); + break; + + default: + return -ENOTSUPP; + } + + return 0; +} + +static void nct6694_irq_bus_lock(struct irq_data *d) +{ + struct gpio_chip *gpio = irq_data_get_irq_chip_data(d); + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + + mutex_lock(&data->irq_lock); +} + +static void nct6694_irq_bus_sync_unlock(struct irq_data *d) +{ + struct gpio_chip *gpio = irq_data_get_irq_chip_data(d); + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPI_FALLING + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + + scoped_guard(mutex, &data->lock) { + nct6694_write_msg(data->nct6694, &cmd_hd, &data->irq_trig_falling); + + cmd_hd.offset = cpu_to_le16(NCT6694_GPI_RISING + data->group); + nct6694_write_msg(data->nct6694, &cmd_hd, &data->irq_trig_rising); + } + + mutex_unlock(&data->irq_lock); +} + +static const struct irq_chip nct6694_irq_chip = { + .name = "gpio-nct6694", + .irq_mask = nct6694_irq_mask, + .irq_unmask = nct6694_irq_unmask, + .irq_set_type = nct6694_irq_set_type, + .irq_bus_lock = nct6694_irq_bus_lock, + .irq_bus_sync_unlock = nct6694_irq_bus_sync_unlock, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + +static void nct6694_irq_dispose_mapping(void *d) +{ + struct nct6694_gpio_data *data = d; + + irq_dispose_mapping(data->irq); +} + +static void nct6694_gpio_ida_free(void *d) +{ + struct nct6694_gpio_data *data = d; + struct nct6694 *nct6694 = data->nct6694; + + ida_free(&nct6694->gpio_ida, data->group); +} + +static int nct6694_gpio_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct nct6694 *nct6694 = dev_get_drvdata(dev->parent); + struct nct6694_gpio_data *data; + struct gpio_irq_chip *girq; + int ret, i; + char **names; + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->nct6694 = nct6694; + + ret = ida_alloc(&nct6694->gpio_ida, GFP_KERNEL); + if (ret < 0) + return ret; + data->group = ret; + + ret = devm_add_action_or_reset(dev, nct6694_gpio_ida_free, data); + if (ret) + return ret; + + names = devm_kcalloc(dev, NCT6694_NR_GPIO, sizeof(char *), + GFP_KERNEL); + if (!names) + return -ENOMEM; + + for (i = 0; i < NCT6694_NR_GPIO; i++) { + names[i] = devm_kasprintf(dev, GFP_KERNEL, "GPIO%X%d", + data->group, i); + if (!names[i]) + return -ENOMEM; + } + + data->irq = irq_create_mapping(nct6694->domain, + NCT6694_IRQ_GPIO0 + data->group); + if (!data->irq) + return -EINVAL; + + ret = devm_add_action_or_reset(dev, nct6694_irq_dispose_mapping, data); + if (ret) + return ret; + + data->gpio.names = (const char * const*)names; + data->gpio.label 
= pdev->name; + data->gpio.direction_input = nct6694_direction_input; + data->gpio.get = nct6694_get_value; + data->gpio.direction_output = nct6694_direction_output; + data->gpio.set = nct6694_set_value; + data->gpio.get_direction = nct6694_get_direction; + data->gpio.set_config = nct6694_set_config; + data->gpio.init_valid_mask = nct6694_init_valid_mask; + data->gpio.base = -1; + data->gpio.can_sleep = false; + data->gpio.owner = THIS_MODULE; + data->gpio.ngpio = NCT6694_NR_GPIO; + + platform_set_drvdata(pdev, data); + + ret = devm_mutex_init(dev, &data->lock); + if (ret) + return ret; + + ret = devm_mutex_init(dev, &data->irq_lock); + if (ret) + return ret; + + ret = nct6694_get_irq_trig(data); + if (ret) { + dev_err_probe(dev, ret, "Failed to get irq trigger type\n"); + return ret; + } + + girq = &data->gpio.irq; + gpio_irq_chip_set_chip(girq, &nct6694_irq_chip); + girq->parent_handler = NULL; + girq->num_parents = 0; + girq->parents = NULL; + girq->default_type = IRQ_TYPE_NONE; + girq->handler = handle_level_irq; + girq->threaded = true; + + ret = devm_request_threaded_irq(dev, data->irq, NULL, nct6694_irq_handler, + IRQF_ONESHOT | IRQF_SHARED, + "gpio-nct6694", data); + if (ret) { + dev_err_probe(dev, ret, "Failed to request irq\n"); + return ret; + } + + return devm_gpiochip_add_data(dev, &data->gpio, data); +} + +static struct platform_driver nct6694_gpio_driver = { + .driver = { + .name = "nct6694-gpio", + }, + .probe = nct6694_gpio_probe, +}; + +module_platform_driver(nct6694_gpio_driver); + +MODULE_DESCRIPTION("USB-GPIO controller driver for NCT6694"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:nct6694-gpio"); From c5cf27dbaeb6e12ea1703ee896dd4b42e92343aa Mon Sep 17 00:00:00 2001 From: Ming Yu Date: Fri, 12 Sep 2025 17:19:48 +0800 Subject: [PATCH 1867/3206] i2c: Add Nuvoton NCT6694 I2C support This driver supports I2C adapter functionality for NCT6694 MFD device based on USB interface. Each I2C controller uses the default baudrate of 100kHz, which can be overridden via module parameters. Acked-by: Andi Shyti Reviewed-by: Wolfram Sang Signed-off-by: Ming Yu Link: https://lore.kernel.org/r/20250912091952.1169369-4-a0282524688@gmail.com Signed-off-by: Lee Jones --- MAINTAINERS | 1 + drivers/i2c/busses/Kconfig | 10 ++ drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-nct6694.c | 196 +++++++++++++++++++++++++++++++ 4 files changed, 208 insertions(+) create mode 100644 drivers/i2c/busses/i2c-nct6694.c diff --git a/MAINTAINERS b/MAINTAINERS index e340d1934394f9..c8f912cb0b9547 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18086,6 +18086,7 @@ NUVOTON NCT6694 MFD DRIVER M: Ming Yu S: Supported F: drivers/gpio/gpio-nct6694.c +F: drivers/i2c/busses/i2c-nct6694.c F: drivers/mfd/nct6694.c F: include/linux/mfd/nct6694.h diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 070d014fdc5d5d..63a2b5a9abc39f 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -1357,6 +1357,16 @@ config I2C_LJCA This driver can also be built as a module. If so, the module will be called i2c-ljca. +config I2C_NCT6694 + tristate "Nuvoton NCT6694 I2C adapter support" + depends on MFD_NCT6694 + help + If you say yes to this option, support will be included for Nuvoton + NCT6694, a USB to I2C interface. + + This driver can also be built as a module. If so, the module will + be called i2c-nct6694. 
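As a quick reference for the br_reg module parameter mentioned in the commit
message, the codes map to bus frequencies as sketched below; the lookup table
is illustrative only (derived from the MODULE_PARM_DESC in the driver) and is
not part of the patch:

/* Illustrative mapping of br_reg codes to I2C bus frequencies in Hz,
 * per the parameter description (0=25K ... 6=1M, default 2=100K).
 * Shown only to document the encoding; not part of the driver.
 */
static const unsigned int nct6694_i2c_br_hz[] = {
	[0] = 25000,
	[1] = 50000,
	[2] = 100000,	/* default */
	[3] = 200000,
	[4] = 400000,
	[5] = 800000,
	[6] = 1000000,
};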
+ config I2C_CP2615 tristate "Silicon Labs CP2615 USB sound card and I2C adapter" depends on USB diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 04db855fdfd66f..fe8cf6325fc989 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -135,6 +135,7 @@ obj-$(CONFIG_I2C_GXP) += i2c-gxp.o obj-$(CONFIG_I2C_DIOLAN_U2C) += i2c-diolan-u2c.o obj-$(CONFIG_I2C_DLN2) += i2c-dln2.o obj-$(CONFIG_I2C_LJCA) += i2c-ljca.o +obj-$(CONFIG_I2C_NCT6694) += i2c-nct6694.o obj-$(CONFIG_I2C_CP2615) += i2c-cp2615.o obj-$(CONFIG_I2C_PARPORT) += i2c-parport.o obj-$(CONFIG_I2C_PCI1XXXX) += i2c-mchp-pci1xxxx.o diff --git a/drivers/i2c/busses/i2c-nct6694.c b/drivers/i2c/busses/i2c-nct6694.c new file mode 100644 index 00000000000000..1413ab6f94628b --- /dev/null +++ b/drivers/i2c/busses/i2c-nct6694.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Nuvoton NCT6694 I2C adapter driver based on USB interface. + * + * Copyright (C) 2025 Nuvoton Technology Corp. + */ + +#include +#include +#include +#include +#include +#include + +/* + * USB command module type for NCT6694 I2C controller. + * This defines the module type used for communication with the NCT6694 + * I2C controller over the USB interface. + */ +#define NCT6694_I2C_MOD 0x03 + +/* Command 00h - I2C Deliver */ +#define NCT6694_I2C_DELIVER 0x00 +#define NCT6694_I2C_DELIVER_SEL 0x00 + +#define NCT6694_I2C_MAX_XFER_SIZE 64 +#define NCT6694_I2C_MAX_DEVS 6 + +static unsigned char br_reg[NCT6694_I2C_MAX_DEVS] = {[0 ... (NCT6694_I2C_MAX_DEVS - 1)] = 0xFF}; + +module_param_array(br_reg, byte, NULL, 0644); +MODULE_PARM_DESC(br_reg, + "I2C Baudrate register per adapter: (0=25K, 1=50K, 2=100K, 3=200K, 4=400K, 5=800K, 6=1M), default=2"); + +enum nct6694_i2c_baudrate { + NCT6694_I2C_BR_25K = 0, + NCT6694_I2C_BR_50K, + NCT6694_I2C_BR_100K, + NCT6694_I2C_BR_200K, + NCT6694_I2C_BR_400K, + NCT6694_I2C_BR_800K, + NCT6694_I2C_BR_1M +}; + +struct __packed nct6694_i2c_deliver { + u8 port; + u8 br; + u8 addr; + u8 w_cnt; + u8 r_cnt; + u8 rsv[11]; + u8 write_data[NCT6694_I2C_MAX_XFER_SIZE]; + u8 read_data[NCT6694_I2C_MAX_XFER_SIZE]; +}; + +struct nct6694_i2c_data { + struct device *dev; + struct nct6694 *nct6694; + struct i2c_adapter adapter; + struct nct6694_i2c_deliver deliver; + unsigned char port; + unsigned char br; +}; + +static int nct6694_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) +{ + struct nct6694_i2c_data *data = adap->algo_data; + struct nct6694_i2c_deliver *deliver = &data->deliver; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_I2C_MOD, + .cmd = NCT6694_I2C_DELIVER, + .sel = NCT6694_I2C_DELIVER_SEL, + .len = cpu_to_le16(sizeof(*deliver)) + }; + int ret, i; + + for (i = 0; i < num; i++) { + struct i2c_msg *msg_temp = &msgs[i]; + + memset(deliver, 0, sizeof(*deliver)); + + deliver->port = data->port; + deliver->br = data->br; + deliver->addr = i2c_8bit_addr_from_msg(msg_temp); + if (msg_temp->flags & I2C_M_RD) { + deliver->r_cnt = msg_temp->len; + ret = nct6694_write_msg(data->nct6694, &cmd_hd, deliver); + if (ret < 0) + return ret; + + memcpy(msg_temp->buf, deliver->read_data, msg_temp->len); + } else { + deliver->w_cnt = msg_temp->len; + memcpy(deliver->write_data, msg_temp->buf, msg_temp->len); + ret = nct6694_write_msg(data->nct6694, &cmd_hd, deliver); + if (ret < 0) + return ret; + } + } + + return num; +} + +static u32 nct6694_i2c_func(struct i2c_adapter *adapter) +{ + return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; +} + +static const struct i2c_adapter_quirks 
nct6694_i2c_quirks = { + .max_read_len = NCT6694_I2C_MAX_XFER_SIZE, + .max_write_len = NCT6694_I2C_MAX_XFER_SIZE, +}; + +static const struct i2c_algorithm nct6694_i2c_algo = { + .xfer = nct6694_i2c_xfer, + .functionality = nct6694_i2c_func, +}; + +static int nct6694_i2c_set_baudrate(struct nct6694_i2c_data *data) +{ + if (data->port >= NCT6694_I2C_MAX_DEVS) { + dev_err(data->dev, "Invalid I2C port index %d\n", data->port); + return -EINVAL; + } + + if (br_reg[data->port] > NCT6694_I2C_BR_1M) { + dev_warn(data->dev, "Invalid baudrate %d for I2C%d, using 100K\n", + br_reg[data->port], data->port); + br_reg[data->port] = NCT6694_I2C_BR_100K; + } + + data->br = br_reg[data->port]; + + return 0; +} + +static void nct6694_i2c_ida_free(void *d) +{ + struct nct6694_i2c_data *data = d; + struct nct6694 *nct6694 = data->nct6694; + + ida_free(&nct6694->i2c_ida, data->port); +} + +static int nct6694_i2c_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct nct6694 *nct6694 = dev_get_drvdata(dev->parent); + struct nct6694_i2c_data *data; + int ret; + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->dev = dev; + data->nct6694 = nct6694; + + ret = ida_alloc(&nct6694->i2c_ida, GFP_KERNEL); + if (ret < 0) + return ret; + data->port = ret; + + ret = devm_add_action_or_reset(dev, nct6694_i2c_ida_free, data); + if (ret) + return ret; + + ret = nct6694_i2c_set_baudrate(data); + if (ret) + return ret; + + sprintf(data->adapter.name, "NCT6694 I2C Adapter %d", data->port); + data->adapter.owner = THIS_MODULE; + data->adapter.algo = &nct6694_i2c_algo; + data->adapter.quirks = &nct6694_i2c_quirks; + data->adapter.dev.parent = dev; + data->adapter.algo_data = data; + + platform_set_drvdata(pdev, data); + + return devm_i2c_add_adapter(dev, &data->adapter); +} + +static struct platform_driver nct6694_i2c_driver = { + .driver = { + .name = "nct6694-i2c", + }, + .probe = nct6694_i2c_probe, +}; + +module_platform_driver(nct6694_i2c_driver); + +MODULE_DESCRIPTION("USB-I2C adapter driver for NCT6694"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:nct6694-i2c"); From 8a204684d0ffdf8d39c16d70fc6f1000e831ef27 Mon Sep 17 00:00:00 2001 From: Ming Yu Date: Fri, 12 Sep 2025 17:19:49 +0800 Subject: [PATCH 1868/3206] can: Add Nuvoton NCT6694 CANFD support This driver supports Socket CANFD functionality for NCT6694 MFD device based on USB interface. 
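Once probed, each CAN controller registers as a regular SocketCAN network
interface, so the standard raw-socket API applies unchanged. A minimal
userspace sketch (the interface name "can0" is an assumption, and the link
must already be up and configured for CAN FD):

/* Hypothetical userspace sketch: send one CAN FD frame over the
 * SocketCAN interface this driver registers.  "can0" is an assumed
 * name and error handling is abbreviated.
 */
#include <linux/can.h>
#include <linux/can/raw.h>
#include <net/if.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int send_one_fd_frame(void)
{
	struct sockaddr_can addr = { .can_family = AF_CAN };
	struct canfd_frame frame = { .can_id = 0x123, .len = 8 };
	struct ifreq ifr = { 0 };
	int on = 1, s;

	s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
	if (s < 0)
		return -1;

	/* Opt in to CAN FD frames on this raw socket. */
	setsockopt(s, SOL_CAN_RAW, CAN_RAW_FD_FRAMES, &on, sizeof(on));

	strncpy(ifr.ifr_name, "can0", IFNAMSIZ - 1);
	if (ioctl(s, SIOCGIFINDEX, &ifr) < 0)
		goto err;
	addr.can_ifindex = ifr.ifr_ifindex;
	if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		goto err;

	memcpy(frame.data, "\x11\x22\x33\x44\x55\x66\x77\x88", frame.len);
	/* A CAN FD frame is written with the full canfd_frame size. */
	if (write(s, &frame, sizeof(frame)) != sizeof(frame))
		goto err;

	close(s);
	return 0;
err:
	close(s);
	return -1;
}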
Reviewed-by: Marc Kleine-Budde Reviewed-by: Vincent Mailhol Signed-off-by: Ming Yu Link: https://lore.kernel.org/r/20250912091952.1169369-5-a0282524688@gmail.com Signed-off-by: Lee Jones --- MAINTAINERS | 1 + drivers/net/can/usb/Kconfig | 11 + drivers/net/can/usb/Makefile | 1 + drivers/net/can/usb/nct6694_canfd.c | 832 ++++++++++++++++++++++++++++ 4 files changed, 845 insertions(+) create mode 100644 drivers/net/can/usb/nct6694_canfd.c diff --git a/MAINTAINERS b/MAINTAINERS index c8f912cb0b9547..758c9a67184e72 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18088,6 +18088,7 @@ S: Supported F: drivers/gpio/gpio-nct6694.c F: drivers/i2c/busses/i2c-nct6694.c F: drivers/mfd/nct6694.c +F: drivers/net/can/usb/nct6694_canfd.c F: include/linux/mfd/nct6694.h NUVOTON NCT7201 IIO DRIVER diff --git a/drivers/net/can/usb/Kconfig b/drivers/net/can/usb/Kconfig index a7547a83120e84..cf65a90816b9e4 100644 --- a/drivers/net/can/usb/Kconfig +++ b/drivers/net/can/usb/Kconfig @@ -134,6 +134,17 @@ config CAN_MCBA_USB This driver supports the CAN BUS Analyzer interface from Microchip (http://www.microchip.com/development-tools/). +config CAN_NCT6694 + tristate "Nuvoton NCT6694 Socket CANfd support" + depends on MFD_NCT6694 + select CAN_RX_OFFLOAD + help + If you say yes to this option, support will be included for Nuvoton + NCT6694, a USB device to socket CANfd controller. + + This driver can also be built as a module. If so, the module will + be called nct6694_canfd. + config CAN_PEAK_USB tristate "PEAK PCAN-USB/USB Pro interfaces for CAN 2.0b/CAN-FD" help diff --git a/drivers/net/can/usb/Makefile b/drivers/net/can/usb/Makefile index 8b11088e9a5956..fcafb1ac262eaf 100644 --- a/drivers/net/can/usb/Makefile +++ b/drivers/net/can/usb/Makefile @@ -11,5 +11,6 @@ obj-$(CONFIG_CAN_F81604) += f81604.o obj-$(CONFIG_CAN_GS_USB) += gs_usb.o obj-$(CONFIG_CAN_KVASER_USB) += kvaser_usb/ obj-$(CONFIG_CAN_MCBA_USB) += mcba_usb.o +obj-$(CONFIG_CAN_NCT6694) += nct6694_canfd.o obj-$(CONFIG_CAN_PEAK_USB) += peak_usb/ obj-$(CONFIG_CAN_UCAN) += ucan.o diff --git a/drivers/net/can/usb/nct6694_canfd.c b/drivers/net/can/usb/nct6694_canfd.c new file mode 100644 index 00000000000000..8deff16491a1a3 --- /dev/null +++ b/drivers/net/can/usb/nct6694_canfd.c @@ -0,0 +1,832 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Nuvoton NCT6694 Socket CANfd driver based on USB interface. + * + * Copyright (C) 2025 Nuvoton Technology Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEVICE_NAME "nct6694-canfd" + +/* USB command module type for NCT6694 CANfd controller. + * This defines the module type used for communication with the NCT6694 + * CANfd controller over the USB interface. 
+ */ +#define NCT6694_CANFD_MOD 0x05 + +/* Command 00h - CAN Setting and Initialization */ +#define NCT6694_CANFD_SETTING 0x00 +#define NCT6694_CANFD_SETTING_ACTIVE_CTRL1 BIT(0) +#define NCT6694_CANFD_SETTING_ACTIVE_CTRL2 BIT(1) +#define NCT6694_CANFD_SETTING_ACTIVE_NBTP_DBTP BIT(2) +#define NCT6694_CANFD_SETTING_CTRL1_MON BIT(0) +#define NCT6694_CANFD_SETTING_CTRL1_NISO BIT(1) +#define NCT6694_CANFD_SETTING_CTRL1_LBCK BIT(2) +#define NCT6694_CANFD_SETTING_NBTP_NTSEG2 GENMASK(6, 0) +#define NCT6694_CANFD_SETTING_NBTP_NTSEG1 GENMASK(15, 8) +#define NCT6694_CANFD_SETTING_NBTP_NBRP GENMASK(24, 16) +#define NCT6694_CANFD_SETTING_NBTP_NSJW GENMASK(31, 25) +#define NCT6694_CANFD_SETTING_DBTP_DSJW GENMASK(3, 0) +#define NCT6694_CANFD_SETTING_DBTP_DTSEG2 GENMASK(7, 4) +#define NCT6694_CANFD_SETTING_DBTP_DTSEG1 GENMASK(12, 8) +#define NCT6694_CANFD_SETTING_DBTP_DBRP GENMASK(20, 16) +#define NCT6694_CANFD_SETTING_DBTP_TDC BIT(23) + +/* Command 01h - CAN Information */ +#define NCT6694_CANFD_INFORMATION 0x01 +#define NCT6694_CANFD_INFORMATION_SEL 0x00 + +/* Command 02h - CAN Event */ +#define NCT6694_CANFD_EVENT 0x02 +#define NCT6694_CANFD_EVENT_SEL(idx, mask) \ + ((idx ? 0x80 : 0x00) | ((mask) & 0x7F)) + +#define NCT6694_CANFD_EVENT_MASK GENMASK(5, 0) +#define NCT6694_CANFD_EVT_TX_FIFO_EMPTY BIT(7) /* Read-clear */ +#define NCT6694_CANFD_EVT_RX_DATA_LOST BIT(5) /* Read-clear */ +#define NCT6694_CANFD_EVT_RX_DATA_IN BIT(7) /* Read-clear */ + +/* Command 10h - CAN Deliver */ +#define NCT6694_CANFD_DELIVER 0x10 +#define NCT6694_CANFD_DELIVER_SEL(buf_cnt) \ + ((buf_cnt) & 0xFF) + +/* Command 11h - CAN Receive */ +#define NCT6694_CANFD_RECEIVE 0x11 +#define NCT6694_CANFD_RECEIVE_SEL(idx, buf_cnt) \ + ((idx ? 0x80 : 0x00) | ((buf_cnt) & 0x7F)) + +#define NCT6694_CANFD_FRAME_TAG(idx) (0xC0 | (idx)) +#define NCT6694_CANFD_FRAME_FLAG_EFF BIT(0) +#define NCT6694_CANFD_FRAME_FLAG_RTR BIT(1) +#define NCT6694_CANFD_FRAME_FLAG_FD BIT(2) +#define NCT6694_CANFD_FRAME_FLAG_BRS BIT(3) +#define NCT6694_CANFD_FRAME_FLAG_ERR BIT(4) + +#define NCT6694_NAPI_WEIGHT 32 + +enum nct6694_event_err { + NCT6694_CANFD_EVT_ERR_NO_ERROR = 0, + NCT6694_CANFD_EVT_ERR_CRC_ERROR, + NCT6694_CANFD_EVT_ERR_STUFF_ERROR, + NCT6694_CANFD_EVT_ERR_ACK_ERROR, + NCT6694_CANFD_EVT_ERR_FORM_ERROR, + NCT6694_CANFD_EVT_ERR_BIT_ERROR, + NCT6694_CANFD_EVT_ERR_TIMEOUT_ERROR, + NCT6694_CANFD_EVT_ERR_UNKNOWN_ERROR, +}; + +enum nct6694_event_status { + NCT6694_CANFD_EVT_STS_ERROR_ACTIVE = 0, + NCT6694_CANFD_EVT_STS_ERROR_PASSIVE, + NCT6694_CANFD_EVT_STS_BUS_OFF, + NCT6694_CANFD_EVT_STS_WARNING, +}; + +struct __packed nct6694_canfd_setting { + __le32 nbr; + __le32 dbr; + u8 active; + u8 reserved[3]; + __le16 ctrl1; + __le16 ctrl2; + __le32 nbtp; + __le32 dbtp; +}; + +struct __packed nct6694_canfd_information { + u8 tx_fifo_cnt; + u8 rx_fifo_cnt; + u8 reserved[2]; + __le32 can_clk; +}; + +struct __packed nct6694_canfd_event { + u8 err; + u8 status; + u8 tx_evt; + u8 rx_evt; + u8 rec; + u8 tec; + u8 reserved[2]; +}; + +struct __packed nct6694_canfd_frame { + u8 tag; + u8 flag; + u8 reserved; + u8 length; + __le32 id; + u8 data[CANFD_MAX_DLEN]; +}; + +struct nct6694_canfd_priv { + struct can_priv can; /* must be the first member */ + struct can_rx_offload offload; + struct net_device *ndev; + struct nct6694 *nct6694; + struct workqueue_struct *wq; + struct work_struct tx_work; + struct nct6694_canfd_frame tx; + struct nct6694_canfd_frame rx; + struct nct6694_canfd_event event[2]; + struct can_berr_counter bec; +}; + +static inline struct nct6694_canfd_priv 
*rx_offload_to_priv(struct can_rx_offload *offload) +{ + return container_of(offload, struct nct6694_canfd_priv, offload); +} + +static const struct can_bittiming_const nct6694_canfd_bittiming_nominal_const = { + .name = DEVICE_NAME, + .tseg1_min = 1, + .tseg1_max = 256, + .tseg2_min = 1, + .tseg2_max = 128, + .sjw_max = 128, + .brp_min = 1, + .brp_max = 512, + .brp_inc = 1, +}; + +static const struct can_bittiming_const nct6694_canfd_bittiming_data_const = { + .name = DEVICE_NAME, + .tseg1_min = 1, + .tseg1_max = 32, + .tseg2_min = 1, + .tseg2_max = 16, + .sjw_max = 16, + .brp_min = 1, + .brp_max = 32, + .brp_inc = 1, +}; + +static void nct6694_canfd_rx_offload(struct can_rx_offload *offload, + struct sk_buff *skb) +{ + struct nct6694_canfd_priv *priv = rx_offload_to_priv(offload); + int ret; + + ret = can_rx_offload_queue_tail(offload, skb); + if (ret) + priv->ndev->stats.rx_fifo_errors++; +} + +static void nct6694_canfd_handle_lost_msg(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct net_device_stats *stats = &ndev->stats; + struct can_frame *cf; + struct sk_buff *skb; + + netdev_dbg(ndev, "RX FIFO overflow, message(s) lost.\n"); + + stats->rx_errors++; + stats->rx_over_errors++; + + skb = alloc_can_err_skb(ndev, &cf); + if (!skb) + return; + + cf->can_id |= CAN_ERR_CRTL; + cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; + + nct6694_canfd_rx_offload(&priv->offload, skb); +} + +static void nct6694_canfd_handle_rx(struct net_device *ndev, u8 rx_evt) +{ + struct net_device_stats *stats = &ndev->stats; + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct nct6694_canfd_frame *frame = &priv->rx; + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_RECEIVE, + .sel = NCT6694_CANFD_RECEIVE_SEL(ndev->dev_port, 1), + .len = cpu_to_le16(sizeof(*frame)) + }; + struct sk_buff *skb; + int ret; + + ret = nct6694_read_msg(priv->nct6694, &cmd_hd, frame); + if (ret) + return; + + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_FD) { + struct canfd_frame *cfd; + + skb = alloc_canfd_skb(priv->ndev, &cfd); + if (!skb) { + stats->rx_dropped++; + return; + } + + cfd->can_id = le32_to_cpu(frame->id); + cfd->len = canfd_sanitize_len(frame->length); + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_EFF) + cfd->can_id |= CAN_EFF_FLAG; + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_BRS) + cfd->flags |= CANFD_BRS; + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_ERR) + cfd->flags |= CANFD_ESI; + + memcpy(cfd->data, frame->data, cfd->len); + } else { + struct can_frame *cf; + + skb = alloc_can_skb(priv->ndev, &cf); + if (!skb) { + stats->rx_dropped++; + return; + } + + cf->can_id = le32_to_cpu(frame->id); + cf->len = can_cc_dlc2len(frame->length); + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_EFF) + cf->can_id |= CAN_EFF_FLAG; + + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_RTR) + cf->can_id |= CAN_RTR_FLAG; + else + memcpy(cf->data, frame->data, cf->len); + } + + nct6694_canfd_rx_offload(&priv->offload, skb); +} + +static int nct6694_canfd_get_berr_counter(const struct net_device *ndev, + struct can_berr_counter *bec) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + + *bec = priv->bec; + + return 0; +} + +static void nct6694_canfd_handle_state_change(struct net_device *ndev, u8 status) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + enum can_state new_state, rx_state, tx_state; + struct can_berr_counter bec; + struct can_frame *cf; + struct sk_buff *skb; + + nct6694_canfd_get_berr_counter(ndev, &bec); + 
can_state_get_by_berr_counter(ndev, &bec, &tx_state, &rx_state); + + new_state = max(tx_state, rx_state); + + /* state hasn't changed */ + if (new_state == priv->can.state) + return; + + skb = alloc_can_err_skb(ndev, &cf); + + can_change_state(ndev, cf, tx_state, rx_state); + + if (new_state == CAN_STATE_BUS_OFF) { + can_bus_off(ndev); + } else if (cf) { + cf->can_id |= CAN_ERR_CNT; + cf->data[6] = bec.txerr; + cf->data[7] = bec.rxerr; + } + + if (skb) + nct6694_canfd_rx_offload(&priv->offload, skb); +} + +static void nct6694_canfd_handle_bus_err(struct net_device *ndev, u8 bus_err) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct can_frame *cf; + struct sk_buff *skb; + + priv->can.can_stats.bus_error++; + + skb = alloc_can_err_skb(ndev, &cf); + if (cf) + cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR; + + switch (bus_err) { + case NCT6694_CANFD_EVT_ERR_CRC_ERROR: + netdev_dbg(ndev, "CRC error\n"); + ndev->stats.rx_errors++; + if (cf) + cf->data[3] |= CAN_ERR_PROT_LOC_CRC_SEQ; + break; + + case NCT6694_CANFD_EVT_ERR_STUFF_ERROR: + netdev_dbg(ndev, "Stuff error\n"); + ndev->stats.rx_errors++; + if (cf) + cf->data[2] |= CAN_ERR_PROT_STUFF; + break; + + case NCT6694_CANFD_EVT_ERR_ACK_ERROR: + netdev_dbg(ndev, "Ack error\n"); + ndev->stats.tx_errors++; + if (cf) { + cf->can_id |= CAN_ERR_ACK; + cf->data[2] |= CAN_ERR_PROT_TX; + } + break; + + case NCT6694_CANFD_EVT_ERR_FORM_ERROR: + netdev_dbg(ndev, "Form error\n"); + ndev->stats.rx_errors++; + if (cf) + cf->data[2] |= CAN_ERR_PROT_FORM; + break; + + case NCT6694_CANFD_EVT_ERR_BIT_ERROR: + netdev_dbg(ndev, "Bit error\n"); + ndev->stats.tx_errors++; + if (cf) + cf->data[2] |= CAN_ERR_PROT_TX | CAN_ERR_PROT_BIT; + break; + + default: + break; + } + + if (skb) + nct6694_canfd_rx_offload(&priv->offload, skb); +} + +static void nct6694_canfd_handle_tx(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct net_device_stats *stats = &ndev->stats; + + stats->tx_bytes += can_rx_offload_get_echo_skb_queue_tail(&priv->offload, + 0, NULL); + stats->tx_packets++; + netif_wake_queue(ndev); +} + +static irqreturn_t nct6694_canfd_irq(int irq, void *data) +{ + struct net_device *ndev = data; + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct nct6694_canfd_event *event = &priv->event[ndev->dev_port]; + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_EVENT, + .sel = NCT6694_CANFD_EVENT_SEL(ndev->dev_port, NCT6694_CANFD_EVENT_MASK), + .len = cpu_to_le16(sizeof(priv->event)) + }; + irqreturn_t handled = IRQ_NONE; + int ret; + + ret = nct6694_read_msg(priv->nct6694, &cmd_hd, priv->event); + if (ret < 0) + return handled; + + if (event->rx_evt & NCT6694_CANFD_EVT_RX_DATA_IN) { + nct6694_canfd_handle_rx(ndev, event->rx_evt); + handled = IRQ_HANDLED; + } + + if (event->rx_evt & NCT6694_CANFD_EVT_RX_DATA_LOST) { + nct6694_canfd_handle_lost_msg(ndev); + handled = IRQ_HANDLED; + } + + if (event->status) { + nct6694_canfd_handle_state_change(ndev, event->status); + handled = IRQ_HANDLED; + } + + if (event->err != NCT6694_CANFD_EVT_ERR_NO_ERROR) { + if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) + nct6694_canfd_handle_bus_err(ndev, event->err); + handled = IRQ_HANDLED; + } + + if (event->tx_evt & NCT6694_CANFD_EVT_TX_FIFO_EMPTY) { + nct6694_canfd_handle_tx(ndev); + handled = IRQ_HANDLED; + } + + if (handled) + can_rx_offload_threaded_irq_finish(&priv->offload); + + priv->bec.rxerr = event->rec; + priv->bec.txerr = event->tec; + + return handled; +} + 
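/*
 * Note on the handler above: a single USB transaction fetches the event
 * block for the port, and the results are fanned out in a fixed order --
 * received frames, RX overflow, controller state changes, bus errors
 * (forwarded as error frames only when CAN_CTRLMODE_BERR_REPORTING is
 * enabled) and TX FIFO completion.  The rec/tec counters are cached in
 * priv->bec at the end so nct6694_canfd_get_berr_counter() can answer
 * later without issuing another USB read.
 */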
+static void nct6694_canfd_tx_work(struct work_struct *work) +{ + struct nct6694_canfd_priv *priv = container_of(work, + struct nct6694_canfd_priv, + tx_work); + struct nct6694_canfd_frame *frame = &priv->tx; + struct net_device *ndev = priv->ndev; + struct net_device_stats *stats = &ndev->stats; + struct sk_buff *skb = priv->can.echo_skb[0]; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_DELIVER, + .sel = NCT6694_CANFD_DELIVER_SEL(1), + .len = cpu_to_le16(sizeof(*frame)) + }; + u32 txid; + int err; + + memset(frame, 0, sizeof(*frame)); + + frame->tag = NCT6694_CANFD_FRAME_TAG(ndev->dev_port); + + if (can_is_canfd_skb(skb)) { + struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + + if (cfd->flags & CANFD_BRS) + frame->flag |= NCT6694_CANFD_FRAME_FLAG_BRS; + + if (cfd->can_id & CAN_EFF_FLAG) { + txid = cfd->can_id & CAN_EFF_MASK; + frame->flag |= NCT6694_CANFD_FRAME_FLAG_EFF; + } else { + txid = cfd->can_id & CAN_SFF_MASK; + } + frame->flag |= NCT6694_CANFD_FRAME_FLAG_FD; + frame->id = cpu_to_le32(txid); + frame->length = canfd_sanitize_len(cfd->len); + + memcpy(frame->data, cfd->data, frame->length); + } else { + struct can_frame *cf = (struct can_frame *)skb->data; + + if (cf->can_id & CAN_EFF_FLAG) { + txid = cf->can_id & CAN_EFF_MASK; + frame->flag |= NCT6694_CANFD_FRAME_FLAG_EFF; + } else { + txid = cf->can_id & CAN_SFF_MASK; + } + + if (cf->can_id & CAN_RTR_FLAG) + frame->flag |= NCT6694_CANFD_FRAME_FLAG_RTR; + else + memcpy(frame->data, cf->data, cf->len); + + frame->id = cpu_to_le32(txid); + frame->length = cf->len; + } + + err = nct6694_write_msg(priv->nct6694, &cmd_hd, frame); + if (err) { + can_free_echo_skb(ndev, 0, NULL); + stats->tx_dropped++; + stats->tx_errors++; + netif_wake_queue(ndev); + } +} + +static netdev_tx_t nct6694_canfd_start_xmit(struct sk_buff *skb, + struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + + if (can_dev_dropped_skb(ndev, skb)) + return NETDEV_TX_OK; + + netif_stop_queue(ndev); + can_put_echo_skb(skb, ndev, 0, 0); + queue_work(priv->wq, &priv->tx_work); + + return NETDEV_TX_OK; +} + +static int nct6694_canfd_start(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + const struct can_bittiming *n_bt = &priv->can.bittiming; + const struct can_bittiming *d_bt = &priv->can.fd.data_bittiming; + struct nct6694_canfd_setting *setting __free(kfree) = NULL; + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_SETTING, + .sel = ndev->dev_port, + .len = cpu_to_le16(sizeof(*setting)) + }; + u32 en_tdc; + int ret; + + setting = kzalloc(sizeof(*setting), GFP_KERNEL); + if (!setting) + return -ENOMEM; + + if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) + setting->ctrl1 |= cpu_to_le16(NCT6694_CANFD_SETTING_CTRL1_MON); + + if (priv->can.ctrlmode & CAN_CTRLMODE_FD_NON_ISO) + setting->ctrl1 |= cpu_to_le16(NCT6694_CANFD_SETTING_CTRL1_NISO); + + if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) + setting->ctrl1 |= cpu_to_le16(NCT6694_CANFD_SETTING_CTRL1_LBCK); + + /* Disable clock divider */ + setting->ctrl2 = 0; + + setting->nbtp = cpu_to_le32(FIELD_PREP(NCT6694_CANFD_SETTING_NBTP_NSJW, + n_bt->sjw - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_NBTP_NBRP, + n_bt->brp - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_NBTP_NTSEG2, + n_bt->phase_seg2 - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_NBTP_NTSEG1, + n_bt->prop_seg + n_bt->phase_seg1 - 1)); + + if (d_bt->brp <= 2) + en_tdc = NCT6694_CANFD_SETTING_DBTP_TDC; + 
else + en_tdc = 0; + + setting->dbtp = cpu_to_le32(FIELD_PREP(NCT6694_CANFD_SETTING_DBTP_DSJW, + d_bt->sjw - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_DBTP_DBRP, + d_bt->brp - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_DBTP_DTSEG2, + d_bt->phase_seg2 - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_DBTP_DTSEG1, + d_bt->prop_seg + d_bt->phase_seg1 - 1) | + en_tdc); + + setting->active = NCT6694_CANFD_SETTING_ACTIVE_CTRL1 | + NCT6694_CANFD_SETTING_ACTIVE_CTRL2 | + NCT6694_CANFD_SETTING_ACTIVE_NBTP_DBTP; + + ret = nct6694_write_msg(priv->nct6694, &cmd_hd, setting); + if (ret) + return ret; + + priv->can.state = CAN_STATE_ERROR_ACTIVE; + + return 0; +} + +static void nct6694_canfd_stop(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct nct6694_canfd_setting *setting __free(kfree) = NULL; + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_SETTING, + .sel = ndev->dev_port, + .len = cpu_to_le16(sizeof(*setting)) + }; + + /* The NCT6694 cannot be stopped. To ensure safe operation and avoid + * interference, the control mode is set to Listen-Only mode. This + * mode allows the device to monitor bus activity without actively + * participating in communication. + */ + setting = kzalloc(sizeof(*setting), GFP_KERNEL); + if (!setting) + return; + + nct6694_read_msg(priv->nct6694, &cmd_hd, setting); + setting->ctrl1 = cpu_to_le16(NCT6694_CANFD_SETTING_CTRL1_MON); + setting->active = NCT6694_CANFD_SETTING_ACTIVE_CTRL1; + nct6694_write_msg(priv->nct6694, &cmd_hd, setting); + + priv->can.state = CAN_STATE_STOPPED; +} + +static int nct6694_canfd_close(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + + netif_stop_queue(ndev); + nct6694_canfd_stop(ndev); + destroy_workqueue(priv->wq); + free_irq(ndev->irq, ndev); + can_rx_offload_disable(&priv->offload); + close_candev(ndev); + return 0; +} + +static int nct6694_canfd_set_mode(struct net_device *ndev, enum can_mode mode) +{ + int ret; + + switch (mode) { + case CAN_MODE_START: + ret = nct6694_canfd_start(ndev); + if (ret) + return ret; + + netif_wake_queue(ndev); + break; + + default: + return -EOPNOTSUPP; + } + + return ret; +} + +static int nct6694_canfd_open(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + int ret; + + ret = open_candev(ndev); + if (ret) + return ret; + + can_rx_offload_enable(&priv->offload); + + ret = request_threaded_irq(ndev->irq, NULL, + nct6694_canfd_irq, IRQF_ONESHOT, + "nct6694_canfd", ndev); + if (ret) { + netdev_err(ndev, "Failed to request IRQ\n"); + goto can_rx_offload_disable; + } + + priv->wq = alloc_ordered_workqueue("%s-nct6694_wq", + WQ_FREEZABLE | WQ_MEM_RECLAIM, + ndev->name); + if (!priv->wq) { + ret = -ENOMEM; + goto free_irq; + } + + ret = nct6694_canfd_start(ndev); + if (ret) + goto destroy_wq; + + netif_start_queue(ndev); + + return 0; + +destroy_wq: + destroy_workqueue(priv->wq); +free_irq: + free_irq(ndev->irq, ndev); +can_rx_offload_disable: + can_rx_offload_disable(&priv->offload); + close_candev(ndev); + return ret; +} + +static const struct net_device_ops nct6694_canfd_netdev_ops = { + .ndo_open = nct6694_canfd_open, + .ndo_stop = nct6694_canfd_close, + .ndo_start_xmit = nct6694_canfd_start_xmit, + .ndo_change_mtu = can_change_mtu, +}; + +static const struct ethtool_ops nct6694_canfd_ethtool_ops = { + .get_ts_info = ethtool_op_get_ts_info, +}; + +static int nct6694_canfd_get_clock(struct nct6694_canfd_priv *priv) +{ + struct nct6694_canfd_information *info 
__free(kfree) = NULL; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_INFORMATION, + .sel = NCT6694_CANFD_INFORMATION_SEL, + .len = cpu_to_le16(sizeof(*info)) + }; + int ret; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + ret = nct6694_read_msg(priv->nct6694, &cmd_hd, info); + if (ret) + return ret; + + return le32_to_cpu(info->can_clk); +} + +static int nct6694_canfd_probe(struct platform_device *pdev) +{ + struct nct6694 *nct6694 = dev_get_drvdata(pdev->dev.parent); + struct nct6694_canfd_priv *priv; + struct net_device *ndev; + int port, irq, ret, can_clk; + + port = ida_alloc(&nct6694->canfd_ida, GFP_KERNEL); + if (port < 0) + return port; + + irq = irq_create_mapping(nct6694->domain, + NCT6694_IRQ_CAN0 + port); + if (!irq) { + ret = -EINVAL; + goto free_ida; + } + + ndev = alloc_candev(sizeof(struct nct6694_canfd_priv), 1); + if (!ndev) { + ret = -ENOMEM; + goto dispose_irq; + } + + ndev->irq = irq; + ndev->flags |= IFF_ECHO; + ndev->dev_port = port; + ndev->netdev_ops = &nct6694_canfd_netdev_ops; + ndev->ethtool_ops = &nct6694_canfd_ethtool_ops; + + priv = netdev_priv(ndev); + priv->nct6694 = nct6694; + priv->ndev = ndev; + + can_clk = nct6694_canfd_get_clock(priv); + if (can_clk < 0) { + ret = dev_err_probe(&pdev->dev, can_clk, + "Failed to get clock\n"); + goto free_candev; + } + + INIT_WORK(&priv->tx_work, nct6694_canfd_tx_work); + + priv->can.clock.freq = can_clk; + priv->can.bittiming_const = &nct6694_canfd_bittiming_nominal_const; + priv->can.fd.data_bittiming_const = &nct6694_canfd_bittiming_data_const; + priv->can.do_set_mode = nct6694_canfd_set_mode; + priv->can.do_get_berr_counter = nct6694_canfd_get_berr_counter; + priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK | + CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_BERR_REPORTING | + CAN_CTRLMODE_FD_NON_ISO; + + ret = can_set_static_ctrlmode(ndev, CAN_CTRLMODE_FD); + if (ret) + goto free_candev; + + ret = can_rx_offload_add_manual(ndev, &priv->offload, + NCT6694_NAPI_WEIGHT); + if (ret) { + dev_err_probe(&pdev->dev, ret, "Failed to add rx_offload\n"); + goto free_candev; + } + + platform_set_drvdata(pdev, priv); + SET_NETDEV_DEV(priv->ndev, &pdev->dev); + + ret = register_candev(priv->ndev); + if (ret) + goto rx_offload_del; + + return 0; + +rx_offload_del: + can_rx_offload_del(&priv->offload); +free_candev: + free_candev(ndev); +dispose_irq: + irq_dispose_mapping(irq); +free_ida: + ida_free(&nct6694->canfd_ida, port); + return ret; +} + +static void nct6694_canfd_remove(struct platform_device *pdev) +{ + struct nct6694_canfd_priv *priv = platform_get_drvdata(pdev); + struct nct6694 *nct6694 = priv->nct6694; + struct net_device *ndev = priv->ndev; + int port = ndev->dev_port; + int irq = ndev->irq; + + unregister_candev(ndev); + can_rx_offload_del(&priv->offload); + free_candev(ndev); + irq_dispose_mapping(irq); + ida_free(&nct6694->canfd_ida, port); +} + +static struct platform_driver nct6694_canfd_driver = { + .driver = { + .name = DEVICE_NAME, + }, + .probe = nct6694_canfd_probe, + .remove = nct6694_canfd_remove, +}; + +module_platform_driver(nct6694_canfd_driver); + +MODULE_DESCRIPTION("USB-CAN FD driver for NCT6694"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); From f9d737a7d84ff4c1df4244361e66ddda400678dc Mon Sep 17 00:00:00 2001 From: Ming Yu Date: Fri, 12 Sep 2025 17:19:50 +0800 Subject: [PATCH 1869/3206] watchdog: Add Nuvoton NCT6694 WDT support This driver supports Watchdog timer functionality for NCT6694 MFD device based 
on USB interface. Acked-by: Guenter Roeck Signed-off-by: Ming Yu Link: https://lore.kernel.org/r/20250912091952.1169369-6-a0282524688@gmail.com Signed-off-by: Lee Jones --- MAINTAINERS | 1 + drivers/watchdog/Kconfig | 11 ++ drivers/watchdog/Makefile | 1 + drivers/watchdog/nct6694_wdt.c | 307 +++++++++++++++++++++++++++++++++ 4 files changed, 320 insertions(+) create mode 100644 drivers/watchdog/nct6694_wdt.c diff --git a/MAINTAINERS b/MAINTAINERS index 758c9a67184e72..4639d5933c5ec7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18089,6 +18089,7 @@ F: drivers/gpio/gpio-nct6694.c F: drivers/i2c/busses/i2c-nct6694.c F: drivers/mfd/nct6694.c F: drivers/net/can/usb/nct6694_canfd.c +F: drivers/watchdog/nct6694_wdt.c F: include/linux/mfd/nct6694.h NUVOTON NCT7201 IIO DRIVER diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 0c25b2ed44eb4f..05008d937e405b 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -760,6 +760,17 @@ config MAX77620_WATCHDOG MAX77620 chips. To compile this driver as a module, choose M here: the module will be called max77620_wdt. +config NCT6694_WATCHDOG + tristate "Nuvoton NCT6694 watchdog support" + depends on MFD_NCT6694 + select WATCHDOG_CORE + help + Say Y here to support Nuvoton NCT6694 watchdog timer + functionality. + + This driver can also be built as a module. If so, the module + will be called nct6694_wdt. + config IMX2_WDT tristate "IMX2+ Watchdog" depends on ARCH_MXC || ARCH_LAYERSCAPE || COMPILE_TEST diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index bbd4d62d2cc3bf..b680e4d3c1bc20 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -235,6 +235,7 @@ obj-$(CONFIG_WM831X_WATCHDOG) += wm831x_wdt.o obj-$(CONFIG_WM8350_WATCHDOG) += wm8350_wdt.o obj-$(CONFIG_MAX63XX_WATCHDOG) += max63xx_wdt.o obj-$(CONFIG_MAX77620_WATCHDOG) += max77620_wdt.o +obj-$(CONFIG_NCT6694_WATCHDOG) += nct6694_wdt.o obj-$(CONFIG_ZIIRAVE_WATCHDOG) += ziirave_wdt.o obj-$(CONFIG_SOFT_WATCHDOG) += softdog.o obj-$(CONFIG_MENF21BMC_WATCHDOG) += menf21bmc_wdt.o diff --git a/drivers/watchdog/nct6694_wdt.c b/drivers/watchdog/nct6694_wdt.c new file mode 100644 index 00000000000000..bc3689bd4b6bb0 --- /dev/null +++ b/drivers/watchdog/nct6694_wdt.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Nuvoton NCT6694 WDT driver based on USB interface. + * + * Copyright (C) 2025 Nuvoton Technology Corp. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DEVICE_NAME "nct6694-wdt" + +#define NCT6694_DEFAULT_TIMEOUT 10 +#define NCT6694_DEFAULT_PRETIMEOUT 0 + +#define NCT6694_WDT_MAX_DEVS 2 + +/* + * USB command module type for NCT6694 WDT controller. + * This defines the module type used for communication with the NCT6694 + * WDT controller over the USB interface. + */ +#define NCT6694_WDT_MOD 0x07 + +/* Command 00h - WDT Setup */ +#define NCT6694_WDT_SETUP 0x00 +#define NCT6694_WDT_SETUP_SEL(idx) (idx ? 0x01 : 0x00) + +/* Command 01h - WDT Command */ +#define NCT6694_WDT_COMMAND 0x01 +#define NCT6694_WDT_COMMAND_SEL(idx) (idx ? 0x01 : 0x00) + +static unsigned int timeout[NCT6694_WDT_MAX_DEVS] = { + [0 ... (NCT6694_WDT_MAX_DEVS - 1)] = NCT6694_DEFAULT_TIMEOUT +}; +module_param_array(timeout, int, NULL, 0644); +MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds"); + +static unsigned int pretimeout[NCT6694_WDT_MAX_DEVS] = { + [0 ... 
(NCT6694_WDT_MAX_DEVS - 1)] = NCT6694_DEFAULT_PRETIMEOUT +}; +module_param_array(pretimeout, int, NULL, 0644); +MODULE_PARM_DESC(pretimeout, "Watchdog pre-timeout in seconds"); + +static bool nowayout = WATCHDOG_NOWAYOUT; +module_param(nowayout, bool, 0); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +enum { + NCT6694_ACTION_NONE = 0, + NCT6694_ACTION_SIRQ, + NCT6694_ACTION_GPO, +}; + +struct __packed nct6694_wdt_setup { + __le32 pretimeout; + __le32 timeout; + u8 owner; + u8 scratch; + u8 control; + u8 status; + __le32 countdown; +}; + +struct __packed nct6694_wdt_cmd { + __le32 wdt_cmd; + __le32 reserved; +}; + +union __packed nct6694_wdt_msg { + struct nct6694_wdt_setup setup; + struct nct6694_wdt_cmd cmd; +}; + +struct nct6694_wdt_data { + struct watchdog_device wdev; + struct device *dev; + struct nct6694 *nct6694; + union nct6694_wdt_msg *msg; + unsigned char wdev_idx; +}; + +static int nct6694_wdt_setting(struct watchdog_device *wdev, + u32 timeout_val, u8 timeout_act, + u32 pretimeout_val, u8 pretimeout_act) +{ + struct nct6694_wdt_data *data = watchdog_get_drvdata(wdev); + struct nct6694_wdt_setup *setup = &data->msg->setup; + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_WDT_MOD, + .cmd = NCT6694_WDT_SETUP, + .sel = NCT6694_WDT_SETUP_SEL(data->wdev_idx), + .len = cpu_to_le16(sizeof(*setup)) + }; + unsigned int timeout_fmt, pretimeout_fmt; + + if (pretimeout_val == 0) + pretimeout_act = NCT6694_ACTION_NONE; + + timeout_fmt = (timeout_val * 1000) | (timeout_act << 24); + pretimeout_fmt = (pretimeout_val * 1000) | (pretimeout_act << 24); + + memset(setup, 0, sizeof(*setup)); + setup->timeout = cpu_to_le32(timeout_fmt); + setup->pretimeout = cpu_to_le32(pretimeout_fmt); + + return nct6694_write_msg(data->nct6694, &cmd_hd, setup); +} + +static int nct6694_wdt_start(struct watchdog_device *wdev) +{ + struct nct6694_wdt_data *data = watchdog_get_drvdata(wdev); + int ret; + + ret = nct6694_wdt_setting(wdev, wdev->timeout, NCT6694_ACTION_GPO, + wdev->pretimeout, NCT6694_ACTION_GPO); + if (ret) + return ret; + + dev_dbg(data->dev, "Setting WDT(%d): timeout = %d, pretimeout = %d\n", + data->wdev_idx, wdev->timeout, wdev->pretimeout); + + return ret; +} + +static int nct6694_wdt_stop(struct watchdog_device *wdev) +{ + struct nct6694_wdt_data *data = watchdog_get_drvdata(wdev); + struct nct6694_wdt_cmd *cmd = &data->msg->cmd; + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_WDT_MOD, + .cmd = NCT6694_WDT_COMMAND, + .sel = NCT6694_WDT_COMMAND_SEL(data->wdev_idx), + .len = cpu_to_le16(sizeof(*cmd)) + }; + + memcpy(&cmd->wdt_cmd, "WDTC", 4); + cmd->reserved = 0; + + return nct6694_write_msg(data->nct6694, &cmd_hd, cmd); +} + +static int nct6694_wdt_ping(struct watchdog_device *wdev) +{ + struct nct6694_wdt_data *data = watchdog_get_drvdata(wdev); + struct nct6694_wdt_cmd *cmd = &data->msg->cmd; + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_WDT_MOD, + .cmd = NCT6694_WDT_COMMAND, + .sel = NCT6694_WDT_COMMAND_SEL(data->wdev_idx), + .len = cpu_to_le16(sizeof(*cmd)) + }; + + memcpy(&cmd->wdt_cmd, "WDTS", 4); + cmd->reserved = 0; + + return nct6694_write_msg(data->nct6694, &cmd_hd, cmd); +} + +static int nct6694_wdt_set_timeout(struct watchdog_device *wdev, + unsigned int new_timeout) +{ + int ret; + + ret = nct6694_wdt_setting(wdev, new_timeout, NCT6694_ACTION_GPO, + wdev->pretimeout, NCT6694_ACTION_GPO); + if (ret) + return ret; + + wdev->timeout = new_timeout; + + return 0; +} + 
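/*
 * Note on the message layout used by nct6694_wdt_setting() above: each
 * (pre)timeout is packed into a 32-bit little-endian word with the time
 * in milliseconds in bits 23..0 and the expiry action (NCT6694_ACTION_*)
 * in bits 31..24, hence the "* 1000" and the "<< 24" before the
 * cpu_to_le32() conversion.
 */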
+static int nct6694_wdt_set_pretimeout(struct watchdog_device *wdev,
+				      unsigned int new_pretimeout)
+{
+	int ret;
+
+	ret = nct6694_wdt_setting(wdev, wdev->timeout, NCT6694_ACTION_GPO,
+				  new_pretimeout, NCT6694_ACTION_GPO);
+	if (ret)
+		return ret;
+
+	wdev->pretimeout = new_pretimeout;
+
+	return 0;
+}
+
+static unsigned int nct6694_wdt_get_time(struct watchdog_device *wdev)
+{
+	struct nct6694_wdt_data *data = watchdog_get_drvdata(wdev);
+	struct nct6694_wdt_setup *setup = &data->msg->setup;
+	const struct nct6694_cmd_header cmd_hd = {
+		.mod = NCT6694_WDT_MOD,
+		.cmd = NCT6694_WDT_SETUP,
+		.sel = NCT6694_WDT_SETUP_SEL(data->wdev_idx),
+		.len = cpu_to_le16(sizeof(*setup))
+	};
+	unsigned int timeleft_ms;
+	int ret;
+
+	ret = nct6694_read_msg(data->nct6694, &cmd_hd, setup);
+	if (ret)
+		return 0;
+
+	timeleft_ms = le32_to_cpu(setup->countdown);
+
+	return timeleft_ms / 1000;
+}
+
+static const struct watchdog_info nct6694_wdt_info = {
+	.options = WDIOF_SETTIMEOUT |
+		   WDIOF_KEEPALIVEPING |
+		   WDIOF_MAGICCLOSE |
+		   WDIOF_PRETIMEOUT,
+	.identity = DEVICE_NAME,
+};
+
+static const struct watchdog_ops nct6694_wdt_ops = {
+	.owner = THIS_MODULE,
+	.start = nct6694_wdt_start,
+	.stop = nct6694_wdt_stop,
+	.set_timeout = nct6694_wdt_set_timeout,
+	.set_pretimeout = nct6694_wdt_set_pretimeout,
+	.get_timeleft = nct6694_wdt_get_time,
+	.ping = nct6694_wdt_ping,
+};
+
+static void nct6694_wdt_ida_free(void *d)
+{
+	struct nct6694_wdt_data *data = d;
+	struct nct6694 *nct6694 = data->nct6694;
+
+	ida_free(&nct6694->wdt_ida, data->wdev_idx);
+}
+
+static int nct6694_wdt_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct nct6694 *nct6694 = dev_get_drvdata(dev->parent);
+	struct nct6694_wdt_data *data;
+	struct watchdog_device *wdev;
+	int ret;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->msg = devm_kzalloc(dev, sizeof(union nct6694_wdt_msg),
+				 GFP_KERNEL);
+	if (!data->msg)
+		return -ENOMEM;
+
+	data->dev = dev;
+	data->nct6694 = nct6694;
+
+	ret = ida_alloc(&nct6694->wdt_ida, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	data->wdev_idx = ret;
+
+	ret = devm_add_action_or_reset(dev, nct6694_wdt_ida_free, data);
+	if (ret)
+		return ret;
+
+	wdev = &data->wdev;
+	wdev->info = &nct6694_wdt_info;
+	wdev->ops = &nct6694_wdt_ops;
+	wdev->timeout = timeout[data->wdev_idx];
+	wdev->pretimeout = pretimeout[data->wdev_idx];
+	if (timeout[data->wdev_idx] < pretimeout[data->wdev_idx]) {
+		dev_warn(data->dev, "pretimeout > timeout. Setting pretimeout to zero\n");
+		wdev->pretimeout = 0;
+	}
+
+	wdev->min_timeout = 1;
+	wdev->max_timeout = 255;
+
+	platform_set_drvdata(pdev, data);
+
+	watchdog_set_drvdata(&data->wdev, data);
+	watchdog_set_nowayout(&data->wdev, nowayout);
+	watchdog_stop_on_reboot(&data->wdev);
+
+	return devm_watchdog_register_device(dev, &data->wdev);
+}
+
+static struct platform_driver nct6694_wdt_driver = {
+	.driver = {
+		.name = DEVICE_NAME,
+	},
+	.probe = nct6694_wdt_probe,
+};
+
+module_platform_driver(nct6694_wdt_driver);
+
+MODULE_DESCRIPTION("USB-WDT driver for NCT6694");
+MODULE_AUTHOR("Ming Yu ");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:nct6694-wdt");

From 197e779d29d87961be12eb6429dda472a843830f Mon Sep 17 00:00:00 2001
From: Ming Yu
Date: Fri, 12 Sep 2025 17:19:51 +0800
Subject: [PATCH 1870/3206] hwmon: Add Nuvoton NCT6694 HWMON support

This driver supports Hardware monitor functionality for NCT6694 MFD
device based on USB interface.
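Readings surface through the standard hwmon sysfs ABI, so generic tools
such as "sensors" work as-is. A minimal reader might look like this (the
hwmon index and attribute name are illustrative; they depend on the
running system):

/* Hypothetical userspace sketch: read one voltage via the hwmon sysfs
 * ABI.  "hwmon0" and "in0_input" are assumed names; resolve the real
 * ones at runtime in production code.  Voltage attributes are reported
 * in millivolts.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/class/hwmon/hwmon0/in0_input", "r");
	long mv;

	if (!f)
		return 1;
	if (fscanf(f, "%ld", &mv) == 1)
		printf("VIN0 = %ld mV\n", mv);
	fclose(f);
	return 0;
}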
Reviewed-by: Guenter Roeck Signed-off-by: Ming Yu Link: https://lore.kernel.org/r/20250912091952.1169369-7-a0282524688@gmail.com Signed-off-by: Lee Jones --- MAINTAINERS | 1 + drivers/hwmon/Kconfig | 10 + drivers/hwmon/Makefile | 1 + drivers/hwmon/nct6694-hwmon.c | 949 ++++++++++++++++++++++++++++++++++ 4 files changed, 961 insertions(+) create mode 100644 drivers/hwmon/nct6694-hwmon.c diff --git a/MAINTAINERS b/MAINTAINERS index 4639d5933c5ec7..bbacc9d48a8331 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18086,6 +18086,7 @@ NUVOTON NCT6694 MFD DRIVER M: Ming Yu S: Supported F: drivers/gpio/gpio-nct6694.c +F: drivers/hwmon/nct6694-hwmon.c F: drivers/i2c/busses/i2c-nct6694.c F: drivers/mfd/nct6694.c F: drivers/net/can/usb/nct6694_canfd.c diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 9d28fcf7cd2a6f..19f660d9a0c540 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -1698,6 +1698,16 @@ config SENSORS_NCT6683 This driver can also be built as a module. If so, the module will be called nct6683. +config SENSORS_NCT6694 + tristate "Nuvoton NCT6694 Hardware Monitor support" + depends on MFD_NCT6694 + help + Say Y here to support Nuvoton NCT6694 hardware monitoring + functionality. + + This driver can also be built as a module. If so, the module + will be called nct6694-hwmon. + config SENSORS_NCT6775_CORE tristate select REGMAP diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index cd8bc4752b4dbf..9bce91611dc379 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -174,6 +174,7 @@ obj-$(CONFIG_SENSORS_MLXREG_FAN) += mlxreg-fan.o obj-$(CONFIG_SENSORS_MENF21BMC_HWMON) += menf21bmc_hwmon.o obj-$(CONFIG_SENSORS_MR75203) += mr75203.o obj-$(CONFIG_SENSORS_NCT6683) += nct6683.o +obj-$(CONFIG_SENSORS_NCT6694) += nct6694-hwmon.o obj-$(CONFIG_SENSORS_NCT6775_CORE) += nct6775-core.o nct6775-objs := nct6775-platform.o obj-$(CONFIG_SENSORS_NCT6775) += nct6775.o diff --git a/drivers/hwmon/nct6694-hwmon.c b/drivers/hwmon/nct6694-hwmon.c new file mode 100644 index 00000000000000..6dcf22ca5018a5 --- /dev/null +++ b/drivers/hwmon/nct6694-hwmon.c @@ -0,0 +1,949 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Nuvoton NCT6694 HWMON driver based on USB interface. + * + * Copyright (C) 2025 Nuvoton Technology Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * USB command module type for NCT6694 report channel + * This defines the module type used for communication with the NCT6694 + * report channel over the USB interface. + */ +#define NCT6694_RPT_MOD 0xFF + +/* Report channel */ +/* + * The report channel is used to report the status of the hardware monitor + * devices, such as voltage, temperature, fan speed, and PWM. + */ +#define NCT6694_VIN_IDX(x) (0x00 + (x)) +#define NCT6694_TIN_IDX(x) \ + ({ typeof(x) (_x) = (x); \ + ((_x) < 10) ? (0x10 + ((_x) * 2)) : \ + (0x30 + (((_x) - 10) * 2)); }) +#define NCT6694_FIN_IDX(x) (0x50 + ((x) * 2)) +#define NCT6694_PWM_IDX(x) (0x70 + (x)) +#define NCT6694_VIN_STS(x) (0x68 + (x)) +#define NCT6694_TIN_STS(x) (0x6A + (x)) +#define NCT6694_FIN_STS(x) (0x6E + (x)) + +/* + * USB command module type for NCT6694 HWMON controller. + * This defines the module type used for communication with the NCT6694 + * HWMON controller over the USB interface. 
+ */ +#define NCT6694_HWMON_MOD 0x00 + +/* Command 00h - Hardware Monitor Control */ +#define NCT6694_HWMON_CONTROL 0x00 +#define NCT6694_HWMON_CONTROL_SEL 0x00 + +/* Command 02h - Alarm Control */ +#define NCT6694_HWMON_ALARM 0x02 +#define NCT6694_HWMON_ALARM_SEL 0x00 + +/* + * USB command module type for NCT6694 PWM controller. + * This defines the module type used for communication with the NCT6694 + * PWM controller over the USB interface. + */ +#define NCT6694_PWM_MOD 0x01 + +/* PWM Command - Manual Control */ +#define NCT6694_PWM_CONTROL 0x01 +#define NCT6694_PWM_CONTROL_SEL 0x00 + +#define NCT6694_FREQ_FROM_REG(reg) ((reg) * 25000 / 255) +#define NCT6694_FREQ_TO_REG(val) \ + (DIV_ROUND_CLOSEST(clamp_val((val), 100, 25000) * 255, 25000)) + +#define NCT6694_LSB_REG_MASK GENMASK(7, 5) +#define NCT6694_TIN_HYST_MASK GENMASK(7, 5) + +enum nct6694_hwmon_temp_mode { + NCT6694_HWMON_TWOTIME_IRQ = 0, + NCT6694_HWMON_ONETIME_IRQ, + NCT6694_HWMON_REALTIME_IRQ, + NCT6694_HWMON_COMPARE_IRQ, +}; + +struct __packed nct6694_hwmon_control { + u8 vin_en[2]; + u8 tin_en[2]; + u8 fin_en[2]; + u8 pwm_en[2]; + u8 reserved1[40]; + u8 pwm_freq[10]; + u8 reserved2[6]; +}; + +struct __packed nct6694_hwmon_alarm { + u8 smi_ctrl; + u8 reserved1[15]; + struct { + u8 hl; + u8 ll; + } vin_limit[16]; + struct { + u8 hyst; + s8 hl; + } tin_cfg[32]; + __be16 fin_ll[10]; + u8 reserved2[4]; +}; + +struct __packed nct6694_pwm_control { + u8 mal_en[2]; + u8 mal_val[10]; + u8 reserved[12]; +}; + +union __packed nct6694_hwmon_rpt { + u8 vin; + struct { + u8 msb; + u8 lsb; + } tin; + __be16 fin; + u8 pwm; + u8 status; +}; + +union __packed nct6694_hwmon_msg { + struct nct6694_hwmon_alarm hwmon_alarm; + struct nct6694_pwm_control pwm_ctrl; +}; + +struct nct6694_hwmon_data { + struct nct6694 *nct6694; + struct mutex lock; + struct nct6694_hwmon_control hwmon_en; + union nct6694_hwmon_rpt *rpt; + union nct6694_hwmon_msg *msg; +}; + +static inline long in_from_reg(u8 reg) +{ + return reg * 16; +} + +static inline u8 in_to_reg(long val) +{ + return DIV_ROUND_CLOSEST(val, 16); +} + +static inline long temp_from_reg(s8 reg) +{ + return reg * 1000; +} + +static inline s8 temp_to_reg(long val) +{ + return DIV_ROUND_CLOSEST(val, 1000); +} + +#define NCT6694_HWMON_IN_CONFIG (HWMON_I_INPUT | HWMON_I_ENABLE | \ + HWMON_I_MAX | HWMON_I_MIN | \ + HWMON_I_ALARM) +#define NCT6694_HWMON_TEMP_CONFIG (HWMON_T_INPUT | HWMON_T_ENABLE | \ + HWMON_T_MAX | HWMON_T_MAX_HYST | \ + HWMON_T_MAX_ALARM) +#define NCT6694_HWMON_FAN_CONFIG (HWMON_F_INPUT | HWMON_F_ENABLE | \ + HWMON_F_MIN | HWMON_F_MIN_ALARM) +#define NCT6694_HWMON_PWM_CONFIG (HWMON_PWM_INPUT | HWMON_PWM_ENABLE | \ + HWMON_PWM_FREQ) +static const struct hwmon_channel_info *nct6694_info[] = { + HWMON_CHANNEL_INFO(in, + NCT6694_HWMON_IN_CONFIG, /* VIN0 */ + NCT6694_HWMON_IN_CONFIG, /* VIN1 */ + NCT6694_HWMON_IN_CONFIG, /* VIN2 */ + NCT6694_HWMON_IN_CONFIG, /* VIN3 */ + NCT6694_HWMON_IN_CONFIG, /* VIN5 */ + NCT6694_HWMON_IN_CONFIG, /* VIN6 */ + NCT6694_HWMON_IN_CONFIG, /* VIN7 */ + NCT6694_HWMON_IN_CONFIG, /* VIN14 */ + NCT6694_HWMON_IN_CONFIG, /* VIN15 */ + NCT6694_HWMON_IN_CONFIG, /* VIN16 */ + NCT6694_HWMON_IN_CONFIG, /* VBAT */ + NCT6694_HWMON_IN_CONFIG, /* VSB */ + NCT6694_HWMON_IN_CONFIG, /* AVSB */ + NCT6694_HWMON_IN_CONFIG, /* VCC */ + NCT6694_HWMON_IN_CONFIG, /* VHIF */ + NCT6694_HWMON_IN_CONFIG), /* VTT */ + + HWMON_CHANNEL_INFO(temp, + NCT6694_HWMON_TEMP_CONFIG, /* THR1 */ + NCT6694_HWMON_TEMP_CONFIG, /* THR2 */ + NCT6694_HWMON_TEMP_CONFIG, /* THR14 */ + NCT6694_HWMON_TEMP_CONFIG, 
/* THR15 */ + NCT6694_HWMON_TEMP_CONFIG, /* THR16 */ + NCT6694_HWMON_TEMP_CONFIG, /* TDP0 */ + NCT6694_HWMON_TEMP_CONFIG, /* TDP1 */ + NCT6694_HWMON_TEMP_CONFIG, /* TDP2 */ + NCT6694_HWMON_TEMP_CONFIG, /* TDP3 */ + NCT6694_HWMON_TEMP_CONFIG, /* TDP4 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN0 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN1 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN2 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN3 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN4 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN5 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN6 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN7 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN8 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN9 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN10 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN11 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN12 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN13 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN14 */ + NCT6694_HWMON_TEMP_CONFIG), /* DTIN15 */ + + HWMON_CHANNEL_INFO(fan, + NCT6694_HWMON_FAN_CONFIG, /* FIN0 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN1 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN2 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN3 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN4 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN5 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN6 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN7 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN8 */ + NCT6694_HWMON_FAN_CONFIG), /* FIN9 */ + + HWMON_CHANNEL_INFO(pwm, + NCT6694_HWMON_PWM_CONFIG, /* PWM0 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM1 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM2 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM3 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM4 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM5 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM6 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM7 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM8 */ + NCT6694_HWMON_PWM_CONFIG), /* PWM9 */ + NULL +}; + +static int nct6694_in_read(struct device *dev, u32 attr, int channel, + long *val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + unsigned char vin_en; + int ret; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_in_enable: + vin_en = data->hwmon_en.vin_en[(channel / 8)]; + *val = !!(vin_en & BIT(channel % 8)); + + return 0; + case hwmon_in_input: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_VIN_IDX(channel)), + .len = cpu_to_le16(sizeof(data->rpt->vin)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->vin); + if (ret) + return ret; + + *val = in_from_reg(data->rpt->vin); + + return 0; + case hwmon_in_max: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + *val = in_from_reg(data->msg->hwmon_alarm.vin_limit[channel].hl); + + return 0; + case hwmon_in_min: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + *val = in_from_reg(data->msg->hwmon_alarm.vin_limit[channel].ll); + + return 0; + case hwmon_in_alarm: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_VIN_STS(channel / 8)), + .len = cpu_to_le16(sizeof(data->rpt->status)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + 
&data->rpt->status); + if (ret) + return ret; + + *val = !!(data->rpt->status & BIT(channel % 8)); + + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_temp_read(struct device *dev, u32 attr, int channel, + long *val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + unsigned char temp_en, temp_hyst; + signed char temp_max; + int ret, temp_raw; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_temp_enable: + temp_en = data->hwmon_en.tin_en[channel / 8]; + *val = !!(temp_en & BIT(channel % 8)); + + return 0; + case hwmon_temp_input: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_TIN_IDX(channel)), + .len = cpu_to_le16(sizeof(data->rpt->tin)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->tin); + if (ret) + return ret; + + temp_raw = data->rpt->tin.msb << 3; + temp_raw |= FIELD_GET(NCT6694_LSB_REG_MASK, data->rpt->tin.lsb); + + /* Real temperature(milli degrees Celsius) = temp_raw * 1000 * 0.125 */ + *val = sign_extend32(temp_raw, 10) * 125; + + return 0; + case hwmon_temp_max: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + *val = temp_from_reg(data->msg->hwmon_alarm.tin_cfg[channel].hl); + + return 0; + case hwmon_temp_max_hyst: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + temp_max = data->msg->hwmon_alarm.tin_cfg[channel].hl; + temp_hyst = FIELD_GET(NCT6694_TIN_HYST_MASK, + data->msg->hwmon_alarm.tin_cfg[channel].hyst); + *val = temp_from_reg(temp_max - temp_hyst); + + return 0; + case hwmon_temp_max_alarm: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_TIN_STS(channel / 8)), + .len = cpu_to_le16(sizeof(data->rpt->status)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->status); + if (ret) + return ret; + + *val = !!(data->rpt->status & BIT(channel % 8)); + + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_fan_read(struct device *dev, u32 attr, int channel, + long *val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + unsigned char fanin_en; + int ret; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_fan_enable: + fanin_en = data->hwmon_en.fin_en[channel / 8]; + *val = !!(fanin_en & BIT(channel % 8)); + + return 0; + case hwmon_fan_input: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_FIN_IDX(channel)), + .len = cpu_to_le16(sizeof(data->rpt->fin)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->fin); + if (ret) + return ret; + + *val = be16_to_cpu(data->rpt->fin); + + return 0; + case hwmon_fan_min: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + *val = 
be16_to_cpu(data->msg->hwmon_alarm.fin_ll[channel]); + + return 0; + case hwmon_fan_min_alarm: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_FIN_STS(channel / 8)), + .len = cpu_to_le16(sizeof(data->rpt->status)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->status); + if (ret) + return ret; + + *val = !!(data->rpt->status & BIT(channel % 8)); + + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_pwm_read(struct device *dev, u32 attr, int channel, + long *val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + unsigned char pwm_en; + int ret; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_pwm_enable: + pwm_en = data->hwmon_en.pwm_en[channel / 8]; + *val = !!(pwm_en & BIT(channel % 8)); + + return 0; + case hwmon_pwm_input: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_PWM_IDX(channel)), + .len = cpu_to_le16(sizeof(data->rpt->pwm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->pwm); + if (ret) + return ret; + + *val = data->rpt->pwm; + + return 0; + case hwmon_pwm_freq: + *val = NCT6694_FREQ_FROM_REG(data->hwmon_en.pwm_freq[channel]); + + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_in_write(struct device *dev, u32 attr, int channel, + long val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + int ret; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_in_enable: + if (val == 0) + data->hwmon_en.vin_en[channel / 8] &= ~BIT(channel % 8); + else if (val == 1) + data->hwmon_en.vin_en[channel / 8] |= BIT(channel % 8); + else + return -EINVAL; + + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_CONTROL, + .sel = NCT6694_HWMON_CONTROL_SEL, + .len = cpu_to_le16(sizeof(data->hwmon_en)) + }; + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->hwmon_en); + case hwmon_in_max: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + val = clamp_val(val, 0, 2032); + data->msg->hwmon_alarm.vin_limit[channel].hl = in_to_reg(val); + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + case hwmon_in_min: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + val = clamp_val(val, 0, 2032); + data->msg->hwmon_alarm.vin_limit[channel].ll = in_to_reg(val); + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_temp_write(struct device *dev, u32 attr, int channel, + long val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + unsigned char temp_hyst; + signed char temp_max; + int ret; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_temp_enable: + if (val == 0) + data->hwmon_en.tin_en[channel / 8] &= ~BIT(channel % 8); + else if (val == 1) + data->hwmon_en.tin_en[channel / 8] |= BIT(channel % 
8);
+		else
+			return -EINVAL;
+
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_HWMON_MOD,
+			.cmd = NCT6694_HWMON_CONTROL,
+			.sel = NCT6694_HWMON_CONTROL_SEL,
+			.len = cpu_to_le16(sizeof(data->hwmon_en))
+		};
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->hwmon_en);
+	case hwmon_temp_max:
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_HWMON_MOD,
+			.cmd = NCT6694_HWMON_ALARM,
+			.sel = NCT6694_HWMON_ALARM_SEL,
+			.len = cpu_to_le16(sizeof(data->msg->hwmon_alarm))
+		};
+		ret = nct6694_read_msg(data->nct6694, &cmd_hd,
+				       &data->msg->hwmon_alarm);
+		if (ret)
+			return ret;
+
+		val = clamp_val(val, -127000, 127000);
+		data->msg->hwmon_alarm.tin_cfg[channel].hl = temp_to_reg(val);
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->msg->hwmon_alarm);
+	case hwmon_temp_max_hyst:
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_HWMON_MOD,
+			.cmd = NCT6694_HWMON_ALARM,
+			.sel = NCT6694_HWMON_ALARM_SEL,
+			.len = cpu_to_le16(sizeof(data->msg->hwmon_alarm))
+		};
+		ret = nct6694_read_msg(data->nct6694, &cmd_hd,
+				       &data->msg->hwmon_alarm);
+		if (ret)
+			return ret;
+
+		val = clamp_val(val, -127000, 127000);
+		temp_max = data->msg->hwmon_alarm.tin_cfg[channel].hl;
+		temp_hyst = temp_max - temp_to_reg(val);
+		temp_hyst = clamp_val(temp_hyst, 0, 7);
+		data->msg->hwmon_alarm.tin_cfg[channel].hyst =
+			(data->msg->hwmon_alarm.tin_cfg[channel].hyst & ~NCT6694_TIN_HYST_MASK) |
+			FIELD_PREP(NCT6694_TIN_HYST_MASK, temp_hyst);
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->msg->hwmon_alarm);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int nct6694_fan_write(struct device *dev, u32 attr, int channel,
+			     long val)
+{
+	struct nct6694_hwmon_data *data = dev_get_drvdata(dev);
+	struct nct6694_cmd_header cmd_hd;
+	int ret;
+
+	guard(mutex)(&data->lock);
+
+	switch (attr) {
+	case hwmon_fan_enable:
+		if (val == 0)
+			data->hwmon_en.fin_en[channel / 8] &= ~BIT(channel % 8);
+		else if (val == 1)
+			data->hwmon_en.fin_en[channel / 8] |= BIT(channel % 8);
+		else
+			return -EINVAL;
+
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_HWMON_MOD,
+			.cmd = NCT6694_HWMON_CONTROL,
+			.sel = NCT6694_HWMON_CONTROL_SEL,
+			.len = cpu_to_le16(sizeof(data->hwmon_en))
+		};
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->hwmon_en);
+	case hwmon_fan_min:
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_HWMON_MOD,
+			.cmd = NCT6694_HWMON_ALARM,
+			.sel = NCT6694_HWMON_ALARM_SEL,
+			.len = cpu_to_le16(sizeof(data->msg->hwmon_alarm))
+		};
+		ret = nct6694_read_msg(data->nct6694, &cmd_hd,
+				       &data->msg->hwmon_alarm);
+		if (ret)
+			return ret;
+
+		val = clamp_val(val, 1, 65535);
+		data->msg->hwmon_alarm.fin_ll[channel] = cpu_to_be16(val);
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->msg->hwmon_alarm);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int nct6694_pwm_write(struct device *dev, u32 attr, int channel,
+			     long val)
+{
+	struct nct6694_hwmon_data *data = dev_get_drvdata(dev);
+	struct nct6694_cmd_header cmd_hd;
+	int ret;
+
+	guard(mutex)(&data->lock);
+
+	switch (attr) {
+	case hwmon_pwm_enable:
+		if (val == 0)
+			data->hwmon_en.pwm_en[channel / 8] &= ~BIT(channel % 8);
+		else if (val == 1)
+			data->hwmon_en.pwm_en[channel / 8] |= BIT(channel % 8);
+		else
+			return -EINVAL;
+
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_HWMON_MOD,
+			.cmd = NCT6694_HWMON_CONTROL,
+			.sel = NCT6694_HWMON_CONTROL_SEL,
+			.len = cpu_to_le16(sizeof(data->hwmon_en))
+		};
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->hwmon_en);
+	case hwmon_pwm_input:
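+		/*
+		 * Manual PWM duty path: read the whole manual-control
+		 * block first, patch only this channel's value, then
+		 * write the block back so the settings of the other
+		 * channels are preserved.
+		 */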
if (val < 0 || val > 255) + return -EINVAL; + + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_PWM_MOD, + .cmd = NCT6694_PWM_CONTROL, + .sel = NCT6694_PWM_CONTROL_SEL, + .len = cpu_to_le16(sizeof(data->msg->pwm_ctrl)) + }; + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->pwm_ctrl); + if (ret) + return ret; + + data->msg->pwm_ctrl.mal_val[channel] = val; + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->msg->pwm_ctrl); + case hwmon_pwm_freq: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_CONTROL, + .sel = NCT6694_HWMON_CONTROL_SEL, + .len = cpu_to_le16(sizeof(data->hwmon_en)) + }; + + data->hwmon_en.pwm_freq[channel] = NCT6694_FREQ_TO_REG(val); + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->hwmon_en); + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + switch (type) { + case hwmon_in: + /* in mV */ + return nct6694_in_read(dev, attr, channel, val); + case hwmon_temp: + /* in mC */ + return nct6694_temp_read(dev, attr, channel, val); + case hwmon_fan: + /* in RPM */ + return nct6694_fan_read(dev, attr, channel, val); + case hwmon_pwm: + /* in value 0~255 */ + return nct6694_pwm_read(dev, attr, channel, val); + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_write(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long val) +{ + switch (type) { + case hwmon_in: + return nct6694_in_write(dev, attr, channel, val); + case hwmon_temp: + return nct6694_temp_write(dev, attr, channel, val); + case hwmon_fan: + return nct6694_fan_write(dev, attr, channel, val); + case hwmon_pwm: + return nct6694_pwm_write(dev, attr, channel, val); + default: + return -EOPNOTSUPP; + } +} + +static umode_t nct6694_is_visible(const void *data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + switch (type) { + case hwmon_in: + switch (attr) { + case hwmon_in_enable: + case hwmon_in_max: + case hwmon_in_min: + return 0644; + case hwmon_in_alarm: + case hwmon_in_input: + return 0444; + default: + return 0; + } + case hwmon_temp: + switch (attr) { + case hwmon_temp_enable: + case hwmon_temp_max: + case hwmon_temp_max_hyst: + return 0644; + case hwmon_temp_input: + case hwmon_temp_max_alarm: + return 0444; + default: + return 0; + } + case hwmon_fan: + switch (attr) { + case hwmon_fan_enable: + case hwmon_fan_min: + return 0644; + case hwmon_fan_input: + case hwmon_fan_min_alarm: + return 0444; + default: + return 0; + } + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_enable: + case hwmon_pwm_freq: + case hwmon_pwm_input: + return 0644; + default: + return 0; + } + default: + return 0; + } +} + +static const struct hwmon_ops nct6694_hwmon_ops = { + .is_visible = nct6694_is_visible, + .read = nct6694_read, + .write = nct6694_write, +}; + +static const struct hwmon_chip_info nct6694_chip_info = { + .ops = &nct6694_hwmon_ops, + .info = nct6694_info, +}; + +static int nct6694_hwmon_init(struct nct6694_hwmon_data *data) +{ + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_CONTROL, + .sel = NCT6694_HWMON_CONTROL_SEL, + .len = cpu_to_le16(sizeof(data->hwmon_en)) + }; + int ret; + + /* + * Record each Hardware Monitor Channel enable status + * and PWM frequency register + */ + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->hwmon_en); + if (ret) + return ret; + + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + 
.cmd = NCT6694_HWMON_ALARM,
+		.sel = NCT6694_HWMON_ALARM_SEL,
+		.len = cpu_to_le16(sizeof(data->msg->hwmon_alarm))
+	};
+
+	/* Select hwmon device alarm mode */
+	ret = nct6694_read_msg(data->nct6694, &cmd_hd,
+			       &data->msg->hwmon_alarm);
+	if (ret)
+		return ret;
+
+	data->msg->hwmon_alarm.smi_ctrl = NCT6694_HWMON_REALTIME_IRQ;
+
+	return nct6694_write_msg(data->nct6694, &cmd_hd,
+				 &data->msg->hwmon_alarm);
+}
+
+static int nct6694_hwmon_probe(struct platform_device *pdev)
+{
+	struct nct6694_hwmon_data *data;
+	struct nct6694 *nct6694 = dev_get_drvdata(pdev->dev.parent);
+	struct device *hwmon_dev;
+	int ret;
+
+	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->rpt = devm_kzalloc(&pdev->dev, sizeof(union nct6694_hwmon_rpt),
+				 GFP_KERNEL);
+	if (!data->rpt)
+		return -ENOMEM;
+
+	data->msg = devm_kzalloc(&pdev->dev, sizeof(union nct6694_hwmon_msg),
+				 GFP_KERNEL);
+	if (!data->msg)
+		return -ENOMEM;
+
+	data->nct6694 = nct6694;
+	ret = devm_mutex_init(&pdev->dev, &data->lock);
+	if (ret)
+		return ret;
+
+	ret = nct6694_hwmon_init(data);
+	if (ret)
+		return ret;
+
+	/* Register hwmon device with the HWMON framework */
+	hwmon_dev = devm_hwmon_device_register_with_info(&pdev->dev,
+							 "nct6694", data,
+							 &nct6694_chip_info,
+							 NULL);
+	return PTR_ERR_OR_ZERO(hwmon_dev);
+}
+
+static struct platform_driver nct6694_hwmon_driver = {
+	.driver = {
+		.name = "nct6694-hwmon",
+	},
+	.probe = nct6694_hwmon_probe,
+};
+
+module_platform_driver(nct6694_hwmon_driver);
+
+MODULE_DESCRIPTION("USB-HWMON driver for NCT6694");
+MODULE_AUTHOR("Ming Yu ");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:nct6694-hwmon");

From d463bb140583609f78f61d48c3dfb6f46c5cb062 Mon Sep 17 00:00:00 2001
From: Ming Yu
Date: Fri, 12 Sep 2025 17:19:52 +0800
Subject: [PATCH 1871/3206] rtc: Add Nuvoton NCT6694 RTC support

This driver supports the RTC functionality of the NCT6694 MFD device,
which is accessed over its USB interface.

Acked-by: Alexandre Belloni
Signed-off-by: Ming Yu
Link: https://lore.kernel.org/r/20250912091952.1169369-8-a0282524688@gmail.com
Signed-off-by: Lee Jones
---
 MAINTAINERS               |   1 +
 drivers/rtc/Kconfig       |  10 ++
 drivers/rtc/Makefile      |   1 +
 drivers/rtc/rtc-nct6694.c | 297 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 309 insertions(+)
 create mode 100644 drivers/rtc/rtc-nct6694.c

diff --git a/MAINTAINERS b/MAINTAINERS
index bbacc9d48a8331..442f24a408a651 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18090,6 +18090,7 @@ F: drivers/hwmon/nct6694-hwmon.c
 F: drivers/i2c/busses/i2c-nct6694.c
 F: drivers/mfd/nct6694.c
 F: drivers/net/can/usb/nct6694_canfd.c
+F: drivers/rtc/rtc-nct6694.c
 F: drivers/watchdog/nct6694_wdt.c
 F: include/linux/mfd/nct6694.h

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 64f6e9756aff4a..4a8dc8d0a4b7c0 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -416,6 +416,16 @@ config RTC_DRV_NCT3018Y
 	  This driver can also be built as a module, if so, the module
 	  will be called "rtc-nct3018y".

+config RTC_DRV_NCT6694
+	tristate "Nuvoton NCT6694 RTC support"
+	depends on MFD_NCT6694
+	help
+	  If you say yes to this option, support will be included for the
+	  Nuvoton NCT6694, a USB device that provides an RTC.
+
+	  This driver can also be built as a module. If so, the module will
+	  be called rtc-nct6694.
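Once bound, the driver exposes the clock through the standard Linux RTC
character-device ABI, so no NCT6694-specific userspace is needed. A minimal
read-back sketch, assuming the device enumerates as /dev/rtc0 (the node name
depends on probe order):

	#include <fcntl.h>
	#include <linux/rtc.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(void)
	{
		struct rtc_time tm;
		int fd = open("/dev/rtc0", O_RDONLY);	/* device path is an assumption */

		if (fd < 0)
			return 1;
		if (ioctl(fd, RTC_RD_TIME, &tm) < 0) {	/* standard RTC ioctl */
			close(fd);
			return 1;
		}
		/* tm_year counts from 1900 and tm_mon from 0, as in struct tm. */
		printf("%04d-%02d-%02d %02d:%02d:%02d\n",
		       tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
		       tm.tm_hour, tm.tm_min, tm.tm_sec);
		close(fd);
		return 0;
	}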
+ config RTC_DRV_RK808 tristate "Rockchip RK805/RK808/RK809/RK817/RK818 RTC" depends on MFD_RK8XX diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 789bddfea99d8f..610a9ee5fd33c3 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -119,6 +119,7 @@ obj-$(CONFIG_RTC_DRV_MXC) += rtc-mxc.o obj-$(CONFIG_RTC_DRV_MXC_V2) += rtc-mxc_v2.o obj-$(CONFIG_RTC_DRV_GAMECUBE) += rtc-gamecube.o obj-$(CONFIG_RTC_DRV_NCT3018Y) += rtc-nct3018y.o +obj-$(CONFIG_RTC_DRV_NCT6694) += rtc-nct6694.o obj-$(CONFIG_RTC_DRV_NTXEC) += rtc-ntxec.o obj-$(CONFIG_RTC_DRV_OMAP) += rtc-omap.o obj-$(CONFIG_RTC_DRV_OPAL) += rtc-opal.o diff --git a/drivers/rtc/rtc-nct6694.c b/drivers/rtc/rtc-nct6694.c new file mode 100644 index 00000000000000..35401a0d9cf53b --- /dev/null +++ b/drivers/rtc/rtc-nct6694.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Nuvoton NCT6694 RTC driver based on USB interface. + * + * Copyright (C) 2025 Nuvoton Technology Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * USB command module type for NCT6694 RTC controller. + * This defines the module type used for communication with the NCT6694 + * RTC controller over the USB interface. + */ +#define NCT6694_RTC_MOD 0x08 + +/* Command 00h - RTC Time */ +#define NCT6694_RTC_TIME 0x0000 +#define NCT6694_RTC_TIME_SEL 0x00 + +/* Command 01h - RTC Alarm */ +#define NCT6694_RTC_ALARM 0x01 +#define NCT6694_RTC_ALARM_SEL 0x00 + +/* Command 02h - RTC Status */ +#define NCT6694_RTC_STATUS 0x02 +#define NCT6694_RTC_STATUS_SEL 0x00 + +#define NCT6694_RTC_IRQ_INT_EN BIT(0) /* Transmit a USB INT-in when RTC alarm */ +#define NCT6694_RTC_IRQ_GPO_EN BIT(5) /* Trigger a GPO Low Pulse when RTC alarm */ + +#define NCT6694_RTC_IRQ_EN (NCT6694_RTC_IRQ_INT_EN | NCT6694_RTC_IRQ_GPO_EN) +#define NCT6694_RTC_IRQ_STS BIT(0) /* Write 1 clear IRQ status */ + +struct __packed nct6694_rtc_time { + u8 sec; + u8 min; + u8 hour; + u8 week; + u8 day; + u8 month; + u8 year; +}; + +struct __packed nct6694_rtc_alarm { + u8 sec; + u8 min; + u8 hour; + u8 alarm_en; + u8 alarm_pend; +}; + +struct __packed nct6694_rtc_status { + u8 irq_en; + u8 irq_pend; +}; + +union __packed nct6694_rtc_msg { + struct nct6694_rtc_time time; + struct nct6694_rtc_alarm alarm; + struct nct6694_rtc_status sts; +}; + +struct nct6694_rtc_data { + struct nct6694 *nct6694; + struct rtc_device *rtc; + union nct6694_rtc_msg *msg; + int irq; +}; + +static int nct6694_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct nct6694_rtc_data *data = dev_get_drvdata(dev); + struct nct6694_rtc_time *time = &data->msg->time; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_TIME, + .sel = NCT6694_RTC_TIME_SEL, + .len = cpu_to_le16(sizeof(*time)) + }; + int ret; + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, time); + if (ret) + return ret; + + tm->tm_sec = bcd2bin(time->sec); /* tm_sec expect 0 ~ 59 */ + tm->tm_min = bcd2bin(time->min); /* tm_min expect 0 ~ 59 */ + tm->tm_hour = bcd2bin(time->hour); /* tm_hour expect 0 ~ 23 */ + tm->tm_wday = bcd2bin(time->week) - 1; /* tm_wday expect 0 ~ 6 */ + tm->tm_mday = bcd2bin(time->day); /* tm_mday expect 1 ~ 31 */ + tm->tm_mon = bcd2bin(time->month) - 1; /* tm_month expect 0 ~ 11 */ + tm->tm_year = bcd2bin(time->year) + 100; /* tm_year expect since 1900 */ + + return ret; +} + +static int nct6694_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct nct6694_rtc_data *data = dev_get_drvdata(dev); + struct nct6694_rtc_time 
*time = &data->msg->time; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_TIME, + .sel = NCT6694_RTC_TIME_SEL, + .len = cpu_to_le16(sizeof(*time)) + }; + + time->sec = bin2bcd(tm->tm_sec); + time->min = bin2bcd(tm->tm_min); + time->hour = bin2bcd(tm->tm_hour); + time->week = bin2bcd(tm->tm_wday + 1); + time->day = bin2bcd(tm->tm_mday); + time->month = bin2bcd(tm->tm_mon + 1); + time->year = bin2bcd(tm->tm_year - 100); + + return nct6694_write_msg(data->nct6694, &cmd_hd, time); +} + +static int nct6694_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct nct6694_rtc_data *data = dev_get_drvdata(dev); + struct nct6694_rtc_alarm *alarm = &data->msg->alarm; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_ALARM, + .sel = NCT6694_RTC_ALARM_SEL, + .len = cpu_to_le16(sizeof(*alarm)) + }; + int ret; + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, alarm); + if (ret) + return ret; + + alrm->time.tm_sec = bcd2bin(alarm->sec); + alrm->time.tm_min = bcd2bin(alarm->min); + alrm->time.tm_hour = bcd2bin(alarm->hour); + alrm->enabled = alarm->alarm_en; + alrm->pending = alarm->alarm_pend; + + return ret; +} + +static int nct6694_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct nct6694_rtc_data *data = dev_get_drvdata(dev); + struct nct6694_rtc_alarm *alarm = &data->msg->alarm; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_ALARM, + .sel = NCT6694_RTC_ALARM_SEL, + .len = cpu_to_le16(sizeof(*alarm)) + }; + + alarm->sec = bin2bcd(alrm->time.tm_sec); + alarm->min = bin2bcd(alrm->time.tm_min); + alarm->hour = bin2bcd(alrm->time.tm_hour); + alarm->alarm_en = alrm->enabled ? NCT6694_RTC_IRQ_EN : 0; + alarm->alarm_pend = 0; + + return nct6694_write_msg(data->nct6694, &cmd_hd, alarm); +} + +static int nct6694_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) +{ + struct nct6694_rtc_data *data = dev_get_drvdata(dev); + struct nct6694_rtc_status *sts = &data->msg->sts; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_STATUS, + .sel = NCT6694_RTC_STATUS_SEL, + .len = cpu_to_le16(sizeof(*sts)) + }; + + if (enabled) + sts->irq_en |= NCT6694_RTC_IRQ_EN; + else + sts->irq_en &= ~NCT6694_RTC_IRQ_EN; + + sts->irq_pend = 0; + + return nct6694_write_msg(data->nct6694, &cmd_hd, sts); +} + +static const struct rtc_class_ops nct6694_rtc_ops = { + .read_time = nct6694_rtc_read_time, + .set_time = nct6694_rtc_set_time, + .read_alarm = nct6694_rtc_read_alarm, + .set_alarm = nct6694_rtc_set_alarm, + .alarm_irq_enable = nct6694_rtc_alarm_irq_enable, +}; + +static irqreturn_t nct6694_irq(int irq, void *dev_id) +{ + struct nct6694_rtc_data *data = dev_id; + struct nct6694_rtc_status *sts = &data->msg->sts; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_STATUS, + .sel = NCT6694_RTC_STATUS_SEL, + .len = cpu_to_le16(sizeof(*sts)) + }; + int ret; + + rtc_lock(data->rtc); + + sts->irq_en = NCT6694_RTC_IRQ_EN; + sts->irq_pend = NCT6694_RTC_IRQ_STS; + ret = nct6694_write_msg(data->nct6694, &cmd_hd, sts); + if (ret) { + rtc_unlock(data->rtc); + return IRQ_NONE; + } + + rtc_update_irq(data->rtc, 1, RTC_IRQF | RTC_AF); + + rtc_unlock(data->rtc); + + return IRQ_HANDLED; +} + +static void nct6694_irq_dispose_mapping(void *d) +{ + struct nct6694_rtc_data *data = d; + + irq_dispose_mapping(data->irq); +} + +static int nct6694_rtc_probe(struct 
platform_device *pdev) +{ + struct nct6694_rtc_data *data; + struct nct6694 *nct6694 = dev_get_drvdata(pdev->dev.parent); + int ret; + + data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->msg = devm_kzalloc(&pdev->dev, sizeof(union nct6694_rtc_msg), + GFP_KERNEL); + if (!data->msg) + return -ENOMEM; + + data->irq = irq_create_mapping(nct6694->domain, NCT6694_IRQ_RTC); + if (!data->irq) + return -EINVAL; + + ret = devm_add_action_or_reset(&pdev->dev, nct6694_irq_dispose_mapping, + data); + if (ret) + return ret; + + ret = devm_device_init_wakeup(&pdev->dev); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to init wakeup\n"); + + data->rtc = devm_rtc_allocate_device(&pdev->dev); + if (IS_ERR(data->rtc)) + return PTR_ERR(data->rtc); + + data->nct6694 = nct6694; + data->rtc->ops = &nct6694_rtc_ops; + data->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; + data->rtc->range_max = RTC_TIMESTAMP_END_2099; + + platform_set_drvdata(pdev, data); + + ret = devm_request_threaded_irq(&pdev->dev, data->irq, NULL, + nct6694_irq, IRQF_ONESHOT, + "rtc-nct6694", data); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, "Failed to request irq\n"); + + return devm_rtc_register_device(data->rtc); +} + +static struct platform_driver nct6694_rtc_driver = { + .driver = { + .name = "nct6694-rtc", + }, + .probe = nct6694_rtc_probe, +}; + +module_platform_driver(nct6694_rtc_driver); + +MODULE_DESCRIPTION("USB-RTC driver for NCT6694"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:nct6694-rtc"); From 288dac9fb6084330d968459c750c838fd06e10e6 Mon Sep 17 00:00:00 2001 From: Qi Xi Date: Thu, 4 Sep 2025 11:44:47 +0800 Subject: [PATCH 1872/3206] drm: bridge: cdns-mhdp8546: Fix missing mutex unlock on error path Add missing mutex unlock before returning from the error path in cdns_mhdp_atomic_enable(). Fixes: 935a92a1c400 ("drm: bridge: cdns-mhdp8546: Fix possible null pointer dereference") Reported-by: Hulk Robot Signed-off-by: Qi Xi Reviewed-by: Luca Ceresoli Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250904034447.665427-1-xiqi2@huawei.com Signed-off-by: Luca Ceresoli --- drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c b/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c index a614d1384f7152..38726ae1bf1504 100644 --- a/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c +++ b/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c @@ -1984,8 +1984,10 @@ static void cdns_mhdp_atomic_enable(struct drm_bridge *bridge, mhdp_state = to_cdns_mhdp_bridge_state(new_state); mhdp_state->current_mode = drm_mode_duplicate(bridge->dev, mode); - if (!mhdp_state->current_mode) - return; + if (!mhdp_state->current_mode) { + ret = -EINVAL; + goto out; + } drm_mode_set_name(mhdp_state->current_mode); From aee814458fb98819876442f0261fad0bb9842224 Mon Sep 17 00:00:00 2001 From: Mathieu Dubois-Briand Date: Sun, 24 Aug 2025 13:57:20 +0200 Subject: [PATCH 1873/3206] dt-bindings: mfd: gpio: Add MAX7360 Add device tree bindings for Maxim Integrated MAX7360 device with support for keypad, rotary, gpios and pwm functionalities. 
Reviewed-by: "Rob Herring (Arm)" Reviewed-by: Krzysztof Kozlowski Signed-off-by: Mathieu Dubois-Briand Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-1-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- .../bindings/gpio/maxim,max7360-gpio.yaml | 83 ++++++++ .../bindings/mfd/maxim,max7360.yaml | 191 ++++++++++++++++++ 2 files changed, 274 insertions(+) create mode 100644 Documentation/devicetree/bindings/gpio/maxim,max7360-gpio.yaml create mode 100644 Documentation/devicetree/bindings/mfd/maxim,max7360.yaml diff --git a/Documentation/devicetree/bindings/gpio/maxim,max7360-gpio.yaml b/Documentation/devicetree/bindings/gpio/maxim,max7360-gpio.yaml new file mode 100644 index 00000000000000..c5c3fc4c816f7f --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/maxim,max7360-gpio.yaml @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/gpio/maxim,max7360-gpio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Maxim MAX7360 GPIO controller + +maintainers: + - Kamel Bouhara + - Mathieu Dubois-Briand + +description: | + Maxim MAX7360 GPIO controller, in MAX7360 chipset + https://www.analog.com/en/products/max7360.html + + The device provides two series of GPIOs, referred here as GPIOs and GPOs. + + PORT0 to PORT7 pins can be used as GPIOs, with support for interrupts and + constant-current mode. These pins will also be used by the rotary encoder and + PWM functionalities. + + COL2 to COL7 pins can be used as GPOs, there is no input capability. COL pins + will be partitioned, with the first pins being affected to the keypad + functionality and the last ones as GPOs. + +properties: + compatible: + enum: + - maxim,max7360-gpio + - maxim,max7360-gpo + + gpio-controller: true + + "#gpio-cells": + const: 2 + + interrupt-controller: true + + "#interrupt-cells": + const: 2 + + maxim,constant-current-disable: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Bit field, each bit disables constant-current output of the associated + GPIO, starting from the least significant bit for the first GPIO. 
+ maximum: 0xff + +required: + - compatible + - gpio-controller + +allOf: + - if: + properties: + compatible: + contains: + enum: + - maxim,max7360-gpio + ngpios: false + then: + required: + - interrupt-controller + else: + properties: + interrupt-controller: false + maxim,constant-current-disable: false + +additionalProperties: false + +examples: + - | + gpio { + compatible = "maxim,max7360-gpio"; + + gpio-controller; + #gpio-cells = <2>; + maxim,constant-current-disable = <0x06>; + + interrupt-controller; + #interrupt-cells = <2>; + }; diff --git a/Documentation/devicetree/bindings/mfd/maxim,max7360.yaml b/Documentation/devicetree/bindings/mfd/maxim,max7360.yaml new file mode 100644 index 00000000000000..3fc920c8639d0f --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/maxim,max7360.yaml @@ -0,0 +1,191 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mfd/maxim,max7360.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Maxim MAX7360 Keypad, Rotary encoder, PWM and GPIO controller + +maintainers: + - Kamel Bouhara + - Mathieu Dubois-Briand + +description: | + Maxim MAX7360 device, with following functions: + - keypad controller + - rotary controller + - GPIO and GPO controller + - PWM controller + + https://www.analog.com/en/products/max7360.html + +allOf: + - $ref: /schemas/input/matrix-keymap.yaml# + - $ref: /schemas/input/input.yaml# + +properties: + compatible: + enum: + - maxim,max7360 + + reg: + maxItems: 1 + + interrupts: + maxItems: 2 + + interrupt-names: + items: + - const: inti + - const: intk + + keypad-debounce-delay-ms: + description: Keypad debounce delay in ms + minimum: 9 + maximum: 40 + default: 9 + + rotary-debounce-delay-ms: + description: Rotary encoder debounce delay in ms + minimum: 0 + maximum: 15 + default: 0 + + linux,axis: + $ref: /schemas/input/rotary-encoder.yaml#/properties/linux,axis + + rotary-encoder,relative-axis: + $ref: /schemas/types.yaml#/definitions/flag + description: + Register a relative axis rather than an absolute one. + + rotary-encoder,steps: + $ref: /schemas/types.yaml#/definitions/uint32 + default: 24 + description: + Number of steps in a full turnaround of the + encoder. Only relevant for absolute axis. Defaults to 24 which is a + typical value for such devices. + + rotary-encoder,rollover: + $ref: /schemas/types.yaml#/definitions/flag + description: + Automatic rollover when the rotary value becomes + greater than the specified steps or smaller than 0. For absolute axis only. + + "#pwm-cells": + const: 3 + + gpio: + $ref: /schemas/gpio/maxim,max7360-gpio.yaml# + description: + PORT0 to PORT7 general purpose input/output pins configuration. + + gpo: + $ref: /schemas/gpio/maxim,max7360-gpio.yaml# + description: > + COL2 to COL7 general purpose output pins configuration. Allows to use + unused keypad columns as outputs. + + The MAX7360 has 8 column lines and 6 of them can be used as GPOs. GPIOs + numbers used for this gpio-controller node do correspond to the column + numbers: values 0 and 1 are never valid, values from 2 to 7 might be valid + depending on the value of the keypad,num-column property. + +patternProperties: + '-pins$': + type: object + description: + Pinctrl node's client devices use subnodes for desired pin configuration. + Client device subnodes use below standard properties. + $ref: /schemas/pinctrl/pincfg-node.yaml + + properties: + pins: + description: + List of gpio pins affected by the properties specified in this + subnode. 
+ items: + pattern: '^(PORT[0-7]|ROTARY)$' + minItems: 1 + maxItems: 8 + + function: + description: + Specify the alternative function to be configured for the specified + pins. + enum: [gpio, pwm, rotary] + + additionalProperties: false + +required: + - compatible + - reg + - interrupts + - interrupt-names + - linux,keymap + - linux,axis + - "#pwm-cells" + - gpio + - gpo + +unevaluatedProperties: false + +examples: + - | + #include + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + io-expander@38 { + compatible = "maxim,max7360"; + reg = <0x38>; + + interrupt-parent = <&gpio1>; + interrupts = <23 IRQ_TYPE_LEVEL_LOW>, + <24 IRQ_TYPE_LEVEL_LOW>; + interrupt-names = "inti", "intk"; + + keypad,num-rows = <8>; + keypad,num-columns = <4>; + linux,keymap = < + MATRIX_KEY(0x00, 0x00, KEY_F5) + MATRIX_KEY(0x01, 0x00, KEY_F4) + MATRIX_KEY(0x02, 0x01, KEY_F6) + >; + keypad-debounce-delay-ms = <10>; + autorepeat; + + rotary-debounce-delay-ms = <2>; + linux,axis = <0>; /* REL_X */ + rotary-encoder,relative-axis; + + #pwm-cells = <3>; + + max7360_gpio: gpio { + compatible = "maxim,max7360-gpio"; + + gpio-controller; + #gpio-cells = <2>; + maxim,constant-current-disable = <0x06>; + + interrupt-controller; + #interrupt-cells = <0x2>; + }; + + max7360_gpo: gpo { + compatible = "maxim,max7360-gpo"; + + gpio-controller; + #gpio-cells = <2>; + }; + + backlight_pins: backlight-pins { + pins = "PORT2"; + function = "pwm"; + }; + }; + }; From a22ddeef55c4df847d9ac862b6192da774948fe1 Mon Sep 17 00:00:00 2001 From: Kamel Bouhara Date: Sun, 24 Aug 2025 13:57:21 +0200 Subject: [PATCH 1874/3206] mfd: Add max7360 support Add core driver to support MAX7360 i2c chip, multi function device with keypad, GPIO, PWM, GPO and rotary encoder submodules. Signed-off-by: Kamel Bouhara Co-developed-by: Mathieu Dubois-Briand Signed-off-by: Mathieu Dubois-Briand Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-2-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 14 +++ drivers/mfd/Makefile | 1 + drivers/mfd/max7360.c | 171 ++++++++++++++++++++++++++++++++++++ include/linux/mfd/max7360.h | 109 +++++++++++++++++++++++ 4 files changed, 295 insertions(+) create mode 100644 drivers/mfd/max7360.c create mode 100644 include/linux/mfd/max7360.h diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1e7..58b1c2900d59e1 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -2481,5 +2481,19 @@ config MFD_UPBOARD_FPGA To compile this driver as a module, choose M here: the module will be called upboard-fpga. +config MFD_MAX7360 + tristate "Maxim MAX7360 I2C IO Expander" + depends on I2C + select MFD_CORE + select REGMAP_I2C + select REGMAP_IRQ + help + Say yes here to add support for Maxim MAX7360 device, embedding + keypad, rotary encoder, PWM and GPIO features. + + This driver provides common support for accessing the device; + additional drivers must be enabled in order to use the functionality + of the device. 
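As with other MFD parts, the function drivers named in this help text do not
map their own registers: they reach the chip through the regmap that the core
registers on the parent I2C device. A minimal sketch of that lookup in a
hypothetical cell driver (the probe function name is illustrative only):

	#include <linux/mfd/max7360.h>
	#include <linux/platform_device.h>
	#include <linux/regmap.h>

	static int max7360_child_probe(struct platform_device *pdev)
	{
		struct regmap *regmap;

		/* The core registered this regmap on the parent I2C device. */
		regmap = dev_get_regmap(pdev->dev.parent, NULL);
		if (!regmap)
			return -ENODEV;

		/* ... access the MAX7360_REG_* registers through the shared regmap ... */
		return 0;
	}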
+ endmenu endif diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d16..c81c6a8473e1e2 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -163,6 +163,7 @@ obj-$(CONFIG_MFD_DA9063) += da9063.o obj-$(CONFIG_MFD_DA9150) += da9150-core.o obj-$(CONFIG_MFD_MAX14577) += max14577.o +obj-$(CONFIG_MFD_MAX7360) += max7360.o obj-$(CONFIG_MFD_MAX77541) += max77541.o obj-$(CONFIG_MFD_MAX77620) += max77620.o obj-$(CONFIG_MFD_MAX77650) += max77650.o diff --git a/drivers/mfd/max7360.c b/drivers/mfd/max7360.c new file mode 100644 index 00000000000000..5ee459c490ecd5 --- /dev/null +++ b/drivers/mfd/max7360.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Maxim MAX7360 Core Driver + * + * Copyright 2025 Bootlin + * + * Authors: + * Kamel Bouhara + * Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell max7360_cells[] = { + { .name = "max7360-pinctrl" }, + { .name = "max7360-pwm" }, + { .name = "max7360-keypad" }, + { .name = "max7360-rotary" }, + { + .name = "max7360-gpo", + .of_compatible = "maxim,max7360-gpo", + }, + { + .name = "max7360-gpio", + .of_compatible = "maxim,max7360-gpio", + }, +}; + +static const struct regmap_range max7360_volatile_ranges[] = { + regmap_reg_range(MAX7360_REG_KEYFIFO, MAX7360_REG_KEYFIFO), + regmap_reg_range(MAX7360_REG_I2C_TIMEOUT, MAX7360_REG_RTR_CNT), +}; + +static const struct regmap_access_table max7360_volatile_table = { + .yes_ranges = max7360_volatile_ranges, + .n_yes_ranges = ARRAY_SIZE(max7360_volatile_ranges), +}; + +static const struct regmap_config max7360_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = MAX7360_REG_PWMCFG(MAX7360_PORT_PWM_COUNT - 1), + .volatile_table = &max7360_volatile_table, + .cache_type = REGCACHE_MAPLE, +}; + +static int max7360_mask_irqs(struct regmap *regmap) +{ + struct device *dev = regmap_get_device(regmap); + unsigned int val; + int ret; + + /* + * GPIO/PWM interrupts are not masked on reset: as the MAX7360 "INTI" + * interrupt line is shared between GPIOs and rotary encoder, this could + * result in repeated spurious interrupts on the rotary encoder driver + * if the GPIO driver is not loaded. Mask them now to avoid this + * situation. + */ + for (unsigned int i = 0; i < MAX7360_PORT_PWM_COUNT; i++) { + ret = regmap_write_bits(regmap, MAX7360_REG_PWMCFG(i), + MAX7360_PORT_CFG_INTERRUPT_MASK, + MAX7360_PORT_CFG_INTERRUPT_MASK); + if (ret) + return dev_err_probe(dev, ret, + "Failed to write MAX7360 port configuration\n"); + } + + /* Read GPIO in register, to ACK any pending IRQ. 
*/ + ret = regmap_read(regmap, MAX7360_REG_GPIOIN, &val); + if (ret) + return dev_err_probe(dev, ret, "Failed to read GPIO values\n"); + + return 0; +} + +static int max7360_reset(struct regmap *regmap) +{ + struct device *dev = regmap_get_device(regmap); + int ret; + + ret = regmap_write(regmap, MAX7360_REG_GPIOCFG, MAX7360_GPIO_CFG_GPIO_RST); + if (ret) { + dev_err(dev, "Failed to reset GPIO configuration: %x\n", ret); + return ret; + } + + ret = regcache_drop_region(regmap, MAX7360_REG_GPIOCFG, MAX7360_REG_GPIO_LAST); + if (ret) { + dev_err(dev, "Failed to drop regmap cache: %x\n", ret); + return ret; + } + + ret = regmap_write(regmap, MAX7360_REG_SLEEP, 0); + if (ret) { + dev_err(dev, "Failed to reset autosleep configuration: %x\n", ret); + return ret; + } + + ret = regmap_write(regmap, MAX7360_REG_DEBOUNCE, 0); + if (ret) + dev_err(dev, "Failed to reset GPO port count: %x\n", ret); + + return ret; +} + +static int max7360_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct regmap *regmap; + int ret; + + regmap = devm_regmap_init_i2c(client, &max7360_regmap_config); + if (IS_ERR(regmap)) + return dev_err_probe(dev, PTR_ERR(regmap), "Failed to initialise regmap\n"); + + ret = max7360_reset(regmap); + if (ret) + return dev_err_probe(dev, ret, "Failed to reset device\n"); + + /* Get the device out of shutdown mode. */ + ret = regmap_write_bits(regmap, MAX7360_REG_GPIOCFG, + MAX7360_GPIO_CFG_GPIO_EN, + MAX7360_GPIO_CFG_GPIO_EN); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable GPIO and PWM module\n"); + + ret = max7360_mask_irqs(regmap); + if (ret) + return dev_err_probe(dev, ret, "Could not mask interrupts\n"); + + ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, + max7360_cells, ARRAY_SIZE(max7360_cells), + NULL, 0, NULL); + if (ret) + return dev_err_probe(dev, ret, "Failed to register child devices\n"); + + return 0; +} + +static const struct of_device_id max7360_dt_match[] = { + { .compatible = "maxim,max7360" }, + {} +}; +MODULE_DEVICE_TABLE(of, max7360_dt_match); + +static struct i2c_driver max7360_driver = { + .driver = { + .name = "max7360", + .of_match_table = max7360_dt_match, + }, + .probe = max7360_probe, +}; +module_i2c_driver(max7360_driver); + +MODULE_DESCRIPTION("Maxim MAX7360 I2C IO Expander core driver"); +MODULE_AUTHOR("Kamel Bouhara "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/max7360.h b/include/linux/mfd/max7360.h new file mode 100644 index 00000000000000..44cf2bf651a252 --- /dev/null +++ b/include/linux/mfd/max7360.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __LINUX_MFD_MAX7360_H +#define __LINUX_MFD_MAX7360_H + +#include + +#define MAX7360_MAX_KEY_ROWS 8 +#define MAX7360_MAX_KEY_COLS 8 +#define MAX7360_MAX_KEY_NUM (MAX7360_MAX_KEY_ROWS * MAX7360_MAX_KEY_COLS) +#define MAX7360_ROW_SHIFT 3 + +#define MAX7360_MAX_GPIO 8 +#define MAX7360_MAX_GPO 6 +#define MAX7360_PORT_PWM_COUNT 8 +#define MAX7360_PORT_RTR_PIN (MAX7360_PORT_PWM_COUNT - 1) + +/* + * MAX7360 registers + */ +#define MAX7360_REG_KEYFIFO 0x00 +#define MAX7360_REG_CONFIG 0x01 +#define MAX7360_REG_DEBOUNCE 0x02 +#define MAX7360_REG_INTERRUPT 0x03 +#define MAX7360_REG_PORTS 0x04 +#define MAX7360_REG_KEYREP 0x05 +#define MAX7360_REG_SLEEP 0x06 + +/* + * MAX7360 GPIO registers + * + * All these registers are reset together when writing bit 3 of + * MAX7360_REG_GPIOCFG. 
+ */ +#define MAX7360_REG_GPIOCFG 0x40 +#define MAX7360_REG_GPIOCTRL 0x41 +#define MAX7360_REG_GPIODEB 0x42 +#define MAX7360_REG_GPIOCURR 0x43 +#define MAX7360_REG_GPIOOUTM 0x44 +#define MAX7360_REG_PWMCOM 0x45 +#define MAX7360_REG_RTRCFG 0x46 +#define MAX7360_REG_I2C_TIMEOUT 0x48 +#define MAX7360_REG_GPIOIN 0x49 +#define MAX7360_REG_RTR_CNT 0x4A +#define MAX7360_REG_PWMBASE 0x50 +#define MAX7360_REG_PWMCFGBASE 0x58 + +#define MAX7360_REG_GPIO_LAST 0x5F + +#define MAX7360_REG_PWM(x) (MAX7360_REG_PWMBASE + (x)) +#define MAX7360_REG_PWMCFG(x) (MAX7360_REG_PWMCFGBASE + (x)) + +/* + * Configuration register bits + */ +#define MAX7360_FIFO_EMPTY 0x3F +#define MAX7360_FIFO_OVERFLOW 0x7F +#define MAX7360_FIFO_RELEASE BIT(6) +#define MAX7360_FIFO_COL GENMASK(5, 3) +#define MAX7360_FIFO_ROW GENMASK(2, 0) + +#define MAX7360_CFG_SLEEP BIT(7) +#define MAX7360_CFG_INTERRUPT BIT(5) +#define MAX7360_CFG_KEY_RELEASE BIT(3) +#define MAX7360_CFG_WAKEUP BIT(1) +#define MAX7360_CFG_TIMEOUT BIT(0) + +#define MAX7360_DEBOUNCE GENMASK(4, 0) +#define MAX7360_DEBOUNCE_MIN 9 +#define MAX7360_DEBOUNCE_MAX 40 +#define MAX7360_PORTS GENMASK(8, 5) + +#define MAX7360_INTERRUPT_TIME_MASK GENMASK(4, 0) +#define MAX7360_INTERRUPT_FIFO_MASK GENMASK(7, 5) + +#define MAX7360_PORT_CFG_INTERRUPT_MASK BIT(7) +#define MAX7360_PORT_CFG_INTERRUPT_EDGES BIT(6) +#define MAX7360_PORT_CFG_COMMON_PWM BIT(5) + +/* + * Autosleep register values + */ +#define MAX7360_AUTOSLEEP_8192MS 0x01 +#define MAX7360_AUTOSLEEP_4096MS 0x02 +#define MAX7360_AUTOSLEEP_2048MS 0x03 +#define MAX7360_AUTOSLEEP_1024MS 0x04 +#define MAX7360_AUTOSLEEP_512MS 0x05 +#define MAX7360_AUTOSLEEP_256MS 0x06 + +#define MAX7360_GPIO_CFG_RTR_EN BIT(7) +#define MAX7360_GPIO_CFG_GPIO_EN BIT(4) +#define MAX7360_GPIO_CFG_GPIO_RST BIT(3) + +#define MAX7360_ROT_DEBOUNCE GENMASK(3, 0) +#define MAX7360_ROT_DEBOUNCE_MIN 0 +#define MAX7360_ROT_DEBOUNCE_MAX 15 +#define MAX7360_ROT_INTCNT GENMASK(6, 4) +#define MAX7360_ROT_INTCNT_DLY BIT(7) + +#define MAX7360_INT_INTI 0 +#define MAX7360_INT_INTK 1 + +#define MAX7360_INT_GPIO 0 +#define MAX7360_INT_KEYPAD 1 +#define MAX7360_INT_ROTARY 2 + +#define MAX7360_NR_INTERNAL_IRQS 3 + +#endif From b4b993c0e39436ffb3a9b21cabf62b5df085b2e1 Mon Sep 17 00:00:00 2001 From: Mathieu Dubois-Briand Date: Sun, 24 Aug 2025 13:57:22 +0200 Subject: [PATCH 1875/3206] pinctrl: Add MAX7360 pinctrl driver Add driver for Maxim Integrated MAX7360 pinctrl on the PORT pins. Pins can be used either for GPIO, PWM or rotary encoder functionalities. Signed-off-by: Mathieu Dubois-Briand Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-3-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/pinctrl/Kconfig | 11 ++ drivers/pinctrl/Makefile | 1 + drivers/pinctrl/pinctrl-max7360.c | 215 ++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+) create mode 100644 drivers/pinctrl/pinctrl-max7360.c diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index ddd11668457ced..57e4bdc8011d94 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -358,6 +358,17 @@ config PINCTRL_LPC18XX help Pinctrl driver for NXP LPC18xx/43xx System Control Unit (SCU). +config PINCTRL_MAX7360 + tristate "MAX7360 Pincontrol support" + depends on MFD_MAX7360 + select PINMUX + select GENERIC_PINCONF + help + Say Y here to enable pin control support for Maxim MAX7360 keypad + controller. 
+ This keypad controller has 8 GPIO pins that may work as GPIO, or PWM, + or rotary encoder alternate modes. + config PINCTRL_MAX77620 tristate "MAX77620/MAX20024 Pincontrol support" depends on MFD_MAX77620 && OF diff --git a/drivers/pinctrl/Makefile b/drivers/pinctrl/Makefile index 909ab89a56d293..65aa432fc97ed9 100644 --- a/drivers/pinctrl/Makefile +++ b/drivers/pinctrl/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_PINCTRL_FALCON) += pinctrl-falcon.o obj-$(CONFIG_PINCTRL_LOONGSON2) += pinctrl-loongson2.o obj-$(CONFIG_PINCTRL_XWAY) += pinctrl-xway.o obj-$(CONFIG_PINCTRL_LPC18XX) += pinctrl-lpc18xx.o +obj-$(CONFIG_PINCTRL_MAX7360) += pinctrl-max7360.o obj-$(CONFIG_PINCTRL_MAX77620) += pinctrl-max77620.o obj-$(CONFIG_PINCTRL_MCP23S08_I2C) += pinctrl-mcp23s08_i2c.o obj-$(CONFIG_PINCTRL_MCP23S08_SPI) += pinctrl-mcp23s08_spi.o diff --git a/drivers/pinctrl/pinctrl-max7360.c b/drivers/pinctrl/pinctrl-max7360.c new file mode 100644 index 00000000000000..abfaff468bad1f --- /dev/null +++ b/drivers/pinctrl/pinctrl-max7360.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Bootlin + * + * Author: Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "core.h" +#include "pinmux.h" + +struct max7360_pinctrl { + struct pinctrl_dev *pctldev; + struct pinctrl_desc pinctrl_desc; +}; + +static const struct pinctrl_pin_desc max7360_pins[] = { + PINCTRL_PIN(0, "PORT0"), + PINCTRL_PIN(1, "PORT1"), + PINCTRL_PIN(2, "PORT2"), + PINCTRL_PIN(3, "PORT3"), + PINCTRL_PIN(4, "PORT4"), + PINCTRL_PIN(5, "PORT5"), + PINCTRL_PIN(6, "PORT6"), + PINCTRL_PIN(7, "PORT7"), +}; + +static const unsigned int port0_pins[] = {0}; +static const unsigned int port1_pins[] = {1}; +static const unsigned int port2_pins[] = {2}; +static const unsigned int port3_pins[] = {3}; +static const unsigned int port4_pins[] = {4}; +static const unsigned int port5_pins[] = {5}; +static const unsigned int port6_pins[] = {6}; +static const unsigned int port7_pins[] = {7}; +static const unsigned int rotary_pins[] = {6, 7}; + +static const struct pingroup max7360_groups[] = { + PINCTRL_PINGROUP("PORT0", port0_pins, ARRAY_SIZE(port0_pins)), + PINCTRL_PINGROUP("PORT1", port1_pins, ARRAY_SIZE(port1_pins)), + PINCTRL_PINGROUP("PORT2", port2_pins, ARRAY_SIZE(port2_pins)), + PINCTRL_PINGROUP("PORT3", port3_pins, ARRAY_SIZE(port3_pins)), + PINCTRL_PINGROUP("PORT4", port4_pins, ARRAY_SIZE(port4_pins)), + PINCTRL_PINGROUP("PORT5", port5_pins, ARRAY_SIZE(port5_pins)), + PINCTRL_PINGROUP("PORT6", port6_pins, ARRAY_SIZE(port6_pins)), + PINCTRL_PINGROUP("PORT7", port7_pins, ARRAY_SIZE(port7_pins)), + PINCTRL_PINGROUP("ROTARY", rotary_pins, ARRAY_SIZE(rotary_pins)), +}; + +static int max7360_pinctrl_get_groups_count(struct pinctrl_dev *pctldev) +{ + return ARRAY_SIZE(max7360_groups); +} + +static const char *max7360_pinctrl_get_group_name(struct pinctrl_dev *pctldev, + unsigned int group) +{ + return max7360_groups[group].name; +} + +static int max7360_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, + unsigned int group, + const unsigned int **pins, + unsigned int *num_pins) +{ + *pins = max7360_groups[group].pins; + *num_pins = max7360_groups[group].npins; + return 0; +} + +static const struct pinctrl_ops max7360_pinctrl_ops = { + .get_groups_count = max7360_pinctrl_get_groups_count, + .get_group_name = max7360_pinctrl_get_group_name, + .get_group_pins = max7360_pinctrl_get_group_pins, +#ifdef CONFIG_OF + 
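+	/*
+	 * DT parsing is delegated to the generic pinconf helpers: groups
+	 * and functions are selected with the standard "pins"/"function"
+	 * properties, so no driver-specific map code is needed.
+	 */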
.dt_node_to_map = pinconf_generic_dt_node_to_map_pin, + .dt_free_map = pinconf_generic_dt_free_map, +#endif +}; + +static const char * const simple_groups[] = { + "PORT0", "PORT1", "PORT2", "PORT3", + "PORT4", "PORT5", "PORT6", "PORT7", +}; + +static const char * const rotary_groups[] = { "ROTARY" }; + +#define MAX7360_PINCTRL_FN_GPIO 0 +#define MAX7360_PINCTRL_FN_PWM 1 +#define MAX7360_PINCTRL_FN_ROTARY 2 +static const struct pinfunction max7360_functions[] = { + [MAX7360_PINCTRL_FN_GPIO] = PINCTRL_PINFUNCTION("gpio", simple_groups, + ARRAY_SIZE(simple_groups)), + [MAX7360_PINCTRL_FN_PWM] = PINCTRL_PINFUNCTION("pwm", simple_groups, + ARRAY_SIZE(simple_groups)), + [MAX7360_PINCTRL_FN_ROTARY] = PINCTRL_PINFUNCTION("rotary", rotary_groups, + ARRAY_SIZE(rotary_groups)), +}; + +static int max7360_get_functions_count(struct pinctrl_dev *pctldev) +{ + return ARRAY_SIZE(max7360_functions); +} + +static const char *max7360_get_function_name(struct pinctrl_dev *pctldev, unsigned int selector) +{ + return max7360_functions[selector].name; +} + +static int max7360_get_function_groups(struct pinctrl_dev *pctldev, unsigned int selector, + const char * const **groups, + unsigned int * const num_groups) +{ + *groups = max7360_functions[selector].groups; + *num_groups = max7360_functions[selector].ngroups; + + return 0; +} + +static int max7360_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, + unsigned int group) +{ + struct regmap *regmap = dev_get_regmap(pctldev->dev->parent, NULL); + int val; + + /* + * GPIO and PWM functions are the same: we only need to handle the + * rotary encoder function, on pins 6 and 7. + */ + if (max7360_groups[group].pins[0] >= 6) { + if (selector == MAX7360_PINCTRL_FN_ROTARY) + val = MAX7360_GPIO_CFG_RTR_EN; + else + val = 0; + + return regmap_write_bits(regmap, MAX7360_REG_GPIOCFG, MAX7360_GPIO_CFG_RTR_EN, val); + } + + return 0; +} + +static const struct pinmux_ops max7360_pmxops = { + .get_functions_count = max7360_get_functions_count, + .get_function_name = max7360_get_function_name, + .get_function_groups = max7360_get_function_groups, + .set_mux = max7360_set_mux, + .strict = true, +}; + +static int max7360_pinctrl_probe(struct platform_device *pdev) +{ + struct regmap *regmap; + struct pinctrl_desc *pd; + struct max7360_pinctrl *chip; + struct device *dev = &pdev->dev; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return dev_err_probe(dev, -ENODEV, "Could not get parent regmap\n"); + + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + pd = &chip->pinctrl_desc; + + pd->pctlops = &max7360_pinctrl_ops; + pd->pmxops = &max7360_pmxops; + pd->name = dev_name(dev); + pd->pins = max7360_pins; + pd->npins = MAX7360_MAX_GPIO; + pd->owner = THIS_MODULE; + + /* + * This MFD sub-device does not have any associated device tree node: + * properties are stored in the device node of the parent (MFD) device + * and this same node is used in phandles of client devices. + * Reuse this device tree node here, as otherwise the pinctrl subsystem + * would be confused by this topology. 
+ */ + device_set_of_node_from_dev(dev, dev->parent); + + chip->pctldev = devm_pinctrl_register(dev, pd, chip); + if (IS_ERR(chip->pctldev)) + return dev_err_probe(dev, PTR_ERR(chip->pctldev), "can't register controller\n"); + + return 0; +} + +static struct platform_driver max7360_pinctrl_driver = { + .driver = { + .name = "max7360-pinctrl", + }, + .probe = max7360_pinctrl_probe, +}; +module_platform_driver(max7360_pinctrl_driver); + +MODULE_DESCRIPTION("MAX7360 pinctrl driver"); +MODULE_AUTHOR("Mathieu Dubois-Briand "); +MODULE_LICENSE("GPL"); From d93a75d94b79ba3e664f7236ee05790e8b1d0e4b Mon Sep 17 00:00:00 2001 From: Kamel Bouhara Date: Sun, 24 Aug 2025 13:57:23 +0200 Subject: [PATCH 1876/3206] pwm: max7360: Add MAX7360 PWM support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add driver for Maxim Integrated MAX7360 PWM controller, supporting up to 8 independent PWM outputs. Signed-off-by: Kamel Bouhara Co-developed-by: Mathieu Dubois-Briand Signed-off-by: Mathieu Dubois-Briand Reviewed-by: Andy Shevchenko Acked-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-4-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/pwm/Kconfig | 10 ++ drivers/pwm/Makefile | 1 + drivers/pwm/pwm-max7360.c | 209 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 220 insertions(+) create mode 100644 drivers/pwm/pwm-max7360.c diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index f00ce973dddf65..f2b1ce47de7f79 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -432,6 +432,16 @@ config PWM_LPSS_PLATFORM To compile this driver as a module, choose M here: the module will be called pwm-lpss-platform. +config PWM_MAX7360 + tristate "MAX7360 PWMs" + depends on MFD_MAX7360 + help + PWM driver for Maxim Integrated MAX7360 multifunction device, with + support for up to 8 PWM outputs. + + To compile this driver as a module, choose M here: the module + will be called pwm-max7360. + config PWM_MC33XS2410 tristate "MC33XS2410 PWM support" depends on OF diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index ff4f47e5fb7a0d..dfa8b4966ee19a 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_PWM_LPC32XX) += pwm-lpc32xx.o obj-$(CONFIG_PWM_LPSS) += pwm-lpss.o obj-$(CONFIG_PWM_LPSS_PCI) += pwm-lpss-pci.o obj-$(CONFIG_PWM_LPSS_PLATFORM) += pwm-lpss-platform.o +obj-$(CONFIG_PWM_MAX7360) += pwm-max7360.o obj-$(CONFIG_PWM_MC33XS2410) += pwm-mc33xs2410.o obj-$(CONFIG_PWM_MEDIATEK) += pwm-mediatek.o obj-$(CONFIG_PWM_MESON) += pwm-meson.o diff --git a/drivers/pwm/pwm-max7360.c b/drivers/pwm/pwm-max7360.c new file mode 100644 index 00000000000000..ebf93a7aee5be4 --- /dev/null +++ b/drivers/pwm/pwm-max7360.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Bootlin + * + * Author: Kamel BOUHARA + * Author: Mathieu Dubois-Briand + * + * PWM functionality of the MAX7360 multi-function device. + * https://www.analog.com/media/en/technical-documentation/data-sheets/MAX7360.pdf + * + * Limitations: + * - Only supports normal polarity. + * - The period is fixed to 2 ms. + * - Only the duty cycle can be changed, new values are applied at the beginning + * of the next cycle. + * - When disabled, the output is put in Hi-Z immediately. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX7360_NUM_PWMS 8 +#define MAX7360_PWM_MAX 255 +#define MAX7360_PWM_STEPS 256 +#define MAX7360_PWM_PERIOD_NS (2 * NSEC_PER_MSEC) + +struct max7360_pwm_waveform { + u8 duty_steps; + bool enabled; +}; + +static int max7360_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) +{ + struct regmap *regmap = pwmchip_get_drvdata(chip); + + /* + * Make sure we use the individual PWM configuration register and not + * the global one. + * We never need to use the global one, so there is no need to revert + * that in the .free() callback. + */ + return regmap_write_bits(regmap, MAX7360_REG_PWMCFG(pwm->hwpwm), + MAX7360_PORT_CFG_COMMON_PWM, 0); +} + +static int max7360_pwm_round_waveform_tohw(struct pwm_chip *chip, + struct pwm_device *pwm, + const struct pwm_waveform *wf, + void *_wfhw) +{ + struct max7360_pwm_waveform *wfhw = _wfhw; + u64 duty_steps; + + /* + * Ignore user provided values for period_length_ns and duty_offset_ns: + * we only support fixed period of MAX7360_PWM_PERIOD_NS and offset of 0. + * Values from 0 to 254 as duty_steps will provide duty cycles of 0/256 + * to 254/256, while value 255 will provide a duty cycle of 100%. + */ + if (wf->duty_length_ns >= MAX7360_PWM_PERIOD_NS) { + duty_steps = MAX7360_PWM_MAX; + } else { + duty_steps = (u32)wf->duty_length_ns * MAX7360_PWM_STEPS / MAX7360_PWM_PERIOD_NS; + if (duty_steps == MAX7360_PWM_MAX) + duty_steps = MAX7360_PWM_MAX - 1; + } + + wfhw->duty_steps = min(MAX7360_PWM_MAX, duty_steps); + wfhw->enabled = !!wf->period_length_ns; + + if (wf->period_length_ns && wf->period_length_ns < MAX7360_PWM_PERIOD_NS) + return 1; + else + return 0; +} + +static int max7360_pwm_round_waveform_fromhw(struct pwm_chip *chip, struct pwm_device *pwm, + const void *_wfhw, struct pwm_waveform *wf) +{ + const struct max7360_pwm_waveform *wfhw = _wfhw; + + wf->period_length_ns = wfhw->enabled ? MAX7360_PWM_PERIOD_NS : 0; + wf->duty_offset_ns = 0; + + if (wfhw->enabled) { + if (wfhw->duty_steps == MAX7360_PWM_MAX) + wf->duty_length_ns = MAX7360_PWM_PERIOD_NS; + else + wf->duty_length_ns = DIV_ROUND_UP(wfhw->duty_steps * MAX7360_PWM_PERIOD_NS, + MAX7360_PWM_STEPS); + } else { + wf->duty_length_ns = 0; + } + + return 0; +} + +static int max7360_pwm_write_waveform(struct pwm_chip *chip, + struct pwm_device *pwm, + const void *_wfhw) +{ + struct regmap *regmap = pwmchip_get_drvdata(chip); + const struct max7360_pwm_waveform *wfhw = _wfhw; + unsigned int val; + int ret; + + if (wfhw->enabled) { + ret = regmap_write(regmap, MAX7360_REG_PWM(pwm->hwpwm), wfhw->duty_steps); + if (ret) + return ret; + } + + val = wfhw->enabled ? 
BIT(pwm->hwpwm) : 0;
+	return regmap_write_bits(regmap, MAX7360_REG_GPIOCTRL, BIT(pwm->hwpwm), val);
+}
+
+static int max7360_pwm_read_waveform(struct pwm_chip *chip,
+				     struct pwm_device *pwm,
+				     void *_wfhw)
+{
+	struct regmap *regmap = pwmchip_get_drvdata(chip);
+	struct max7360_pwm_waveform *wfhw = _wfhw;
+	unsigned int val;
+	int ret;
+
+	ret = regmap_read(regmap, MAX7360_REG_GPIOCTRL, &val);
+	if (ret)
+		return ret;
+
+	if (val & BIT(pwm->hwpwm)) {
+		wfhw->enabled = true;
+		ret = regmap_read(regmap, MAX7360_REG_PWM(pwm->hwpwm), &val);
+		if (ret)
+			return ret;
+
+		wfhw->duty_steps = val;
+	} else {
+		wfhw->enabled = false;
+		wfhw->duty_steps = 0;
+	}
+
+	return 0;
+}
+
+static const struct pwm_ops max7360_pwm_ops = {
+	.request = max7360_pwm_request,
+	.round_waveform_tohw = max7360_pwm_round_waveform_tohw,
+	.round_waveform_fromhw = max7360_pwm_round_waveform_fromhw,
+	.read_waveform = max7360_pwm_read_waveform,
+	.write_waveform = max7360_pwm_write_waveform,
+};
+
+static int max7360_pwm_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct pwm_chip *chip;
+	struct regmap *regmap;
+	int ret;
+
+	regmap = dev_get_regmap(dev->parent, NULL);
+	if (!regmap)
+		return dev_err_probe(dev, -ENODEV, "Could not get parent regmap\n");
+
+	/*
+	 * This MFD sub-device does not have any associated device tree node:
+	 * properties are stored in the device node of the parent (MFD) device
+	 * and this same node is used in phandles of client devices.
+	 * Reuse this device tree node here, as otherwise the PWM subsystem
+	 * would be confused by this topology.
+	 */
+	device_set_of_node_from_dev(dev, dev->parent);
+
+	chip = devm_pwmchip_alloc(dev, MAX7360_NUM_PWMS, 0);
+	if (IS_ERR(chip))
+		return PTR_ERR(chip);
+	chip->ops = &max7360_pwm_ops;
+
+	pwmchip_set_drvdata(chip, regmap);
+
+	ret = devm_pwmchip_add(dev, chip);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to add PWM chip\n");
+
+	return 0;
+}
+
+static struct platform_driver max7360_pwm_driver = {
+	.driver = {
+		.name = "max7360-pwm",
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+	},
+	.probe = max7360_pwm_probe,
+};
+module_platform_driver(max7360_pwm_driver);
+
+MODULE_DESCRIPTION("MAX7360 PWM driver");
+MODULE_AUTHOR("Kamel BOUHARA ");
+MODULE_AUTHOR("Mathieu Dubois-Briand ");
+MODULE_LICENSE("GPL");

From 553b75d4bfe9264f631d459fe9996744e0672b0e Mon Sep 17 00:00:00 2001
From: Mathieu Dubois-Briand
Date: Sun, 24 Aug 2025 13:57:24 +0200
Subject: [PATCH 1877/3206] gpio: regmap: Allow allocating a regmap-irq device

GPIO controllers often have IRQ support: allow easily allocating both
the gpio-regmap and the regmap-irq device in one operation.
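For illustration, a client driver can then hand both descriptions to
gpio-regmap in one go. This is only a sketch for a hypothetical "foo"
expander (FOO_REG_IRQ_STATUS, the IRQ lookup and the remaining config
fields are assumptions, not part of this patch); the new fields are
available when CONFIG_REGMAP_IRQ is enabled:

  /* Sketch: gpio-regmap creates the regmap-irq chip and attaches
   * its IRQ domain to the GPIO chip internally. */
  static const struct regmap_irq foo_irqs[] = {
  	REGMAP_IRQ_REG(0, 0, BIT(0)),
  	REGMAP_IRQ_REG(1, 0, BIT(1)),
  };

  static struct regmap_irq_chip foo_irq_chip = {
  	.name		= "foo",
  	.status_base	= FOO_REG_IRQ_STATUS,	/* assumed register */
  	.num_regs	= 1,
  	.irqs		= foo_irqs,
  	.num_irqs	= ARRAY_SIZE(foo_irqs),
  };

  static int foo_gpio_probe(struct platform_device *pdev)
  {
  	struct gpio_regmap_config cfg = { };

  	cfg.parent = &pdev->dev;
  	cfg.regmap = dev_get_regmap(pdev->dev.parent, NULL);
  	/* ... reg_dat_base, ngpio, etc. as usual ... */

  	cfg.regmap_irq_chip = &foo_irq_chip;
  	cfg.regmap_irq_line = platform_get_irq(pdev, 0);
  	cfg.regmap_irq_flags = IRQF_ONESHOT;

  	return PTR_ERR_OR_ZERO(devm_gpio_regmap_register(&pdev->dev, &cfg));
  }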
Reviewed-by: Andy Shevchenko
Acked-by: Bartosz Golaszewski
Signed-off-by: Mathieu Dubois-Briand
Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-5-435cfda2b1ea@bootlin.com
Signed-off-by: Lee Jones
---
 drivers/gpio/gpio-regmap.c  | 29 +++++++++++++++++++++++++++--
 include/linux/gpio/regmap.h | 11 +++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c
index e8a32dfebdcb31..e1944931ee7ccf 100644
--- a/drivers/gpio/gpio-regmap.c
+++ b/drivers/gpio/gpio-regmap.c
@@ -32,6 +32,11 @@ struct gpio_regmap {
 	unsigned int reg_dir_in_base;
 	unsigned int reg_dir_out_base;
 
+#ifdef CONFIG_REGMAP_IRQ
+	int regmap_irq_line;
+	struct regmap_irq_chip_data *irq_chip_data;
+#endif
+
 	int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base,
 			      unsigned int offset, unsigned int *reg,
 			      unsigned int *mask);
@@ -215,6 +220,7 @@ EXPORT_SYMBOL_GPL(gpio_regmap_get_drvdata);
  */
 struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config)
 {
+	struct irq_domain *irq_domain;
 	struct gpio_regmap *gpio;
 	struct gpio_chip *chip;
 	int ret;
@@ -295,8 +301,22 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config
 	if (ret < 0)
 		goto err_free_gpio;
 
-	if (config->irq_domain) {
-		ret = gpiochip_irqchip_add_domain(chip, config->irq_domain);
+#ifdef CONFIG_REGMAP_IRQ
+	if (config->regmap_irq_chip) {
+		gpio->regmap_irq_line = config->regmap_irq_line;
+		ret = regmap_add_irq_chip_fwnode(dev_fwnode(config->parent), config->regmap,
+						 config->regmap_irq_line, config->regmap_irq_flags,
+						 0, config->regmap_irq_chip, &gpio->irq_chip_data);
+		if (ret)
+			goto err_free_gpio;
+
+		irq_domain = regmap_irq_get_domain(gpio->irq_chip_data);
+	} else
+#endif
+		irq_domain = config->irq_domain;
+
+	if (irq_domain) {
+		ret = gpiochip_irqchip_add_domain(chip, irq_domain);
 		if (ret)
 			goto err_remove_gpiochip;
 	}
@@ -317,6 +337,11 @@ EXPORT_SYMBOL_GPL(gpio_regmap_register);
  */
 void gpio_regmap_unregister(struct gpio_regmap *gpio)
 {
+#ifdef CONFIG_REGMAP_IRQ
+	if (gpio->irq_chip_data)
+		regmap_del_irq_chip(gpio->regmap_irq_line, gpio->irq_chip_data);
+#endif
+
 	gpiochip_remove(&gpio->gpio_chip);
 	kfree(gpio);
 }
diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h
index c722c67668c6e5..19b52ac03a5dec 100644
--- a/include/linux/gpio/regmap.h
+++ b/include/linux/gpio/regmap.h
@@ -40,6 +40,11 @@ struct regmap;
 * @drvdata: (Optional) Pointer to driver specific data which is
 * not used by gpio-remap but is provided "as is" to the
 * driver callback(s).
+ * @regmap_irq_chip: (Optional) Pointer to a regmap_irq_chip structure. If
+ * set, a regmap-irq device will be created and the IRQ
+ * domain will be set accordingly.
+ * @regmap_irq_line: (Optional) The IRQ the device uses to signal interrupts.
+ * @regmap_irq_flags: (Optional) The IRQF_ flags to use for the interrupt.
 *
 * The ->reg_mask_xlate translates a given base address and GPIO offset to
 * register and mask pair.
The base address is one of the given register
@@ -78,6 +83,12 @@ struct gpio_regmap_config {
 	int ngpio_per_reg;
 	struct irq_domain *irq_domain;
 
+#ifdef CONFIG_REGMAP_IRQ
+	struct regmap_irq_chip *regmap_irq_chip;
+	int regmap_irq_line;
+	unsigned long regmap_irq_flags;
+#endif
+
 	int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base,
 			      unsigned int offset, unsigned int *reg,
 			      unsigned int *mask);

From 0627b71fa5508ab605b6e9fd74baed40805cfdda Mon Sep 17 00:00:00 2001
From: Mathieu Dubois-Briand
Date: Sun, 24 Aug 2025 13:57:25 +0200
Subject: [PATCH 1878/3206] gpio: regmap: Allow providing an init_valid_mask callback

Allow populating the gpio_regmap_config structure with an
init_valid_mask() callback, to be set on the final gpio_chip structure.

Reviewed-by: Michael Walle
Reviewed-by: Andy Shevchenko
Reviewed-by: Linus Walleij
Reviewed-by: Bartosz Golaszewski
Signed-off-by: Mathieu Dubois-Briand
Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-6-435cfda2b1ea@bootlin.com
Signed-off-by: Lee Jones
---
 drivers/gpio/gpio-regmap.c  | 1 +
 include/linux/gpio/regmap.h | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c
index e1944931ee7ccf..d9d23853e0326b 100644
--- a/drivers/gpio/gpio-regmap.c
+++ b/drivers/gpio/gpio-regmap.c
@@ -261,6 +261,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config
 	chip->names = config->names;
 	chip->label = config->label ?: dev_name(config->parent);
 	chip->can_sleep = regmap_might_sleep(config->regmap);
+	chip->init_valid_mask = config->init_valid_mask;
 
 	chip->request = gpiochip_generic_request;
 	chip->free = gpiochip_generic_free;
diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h
index 19b52ac03a5dec..622a2939ebe0fd 100644
--- a/include/linux/gpio/regmap.h
+++ b/include/linux/gpio/regmap.h
@@ -6,6 +6,7 @@
 struct device;
 struct fwnode_handle;
 struct gpio_regmap;
+struct gpio_chip;
 struct irq_domain;
 struct regmap;
 
@@ -40,6 +41,8 @@ struct regmap;
 * @drvdata: (Optional) Pointer to driver specific data which is
 * not used by gpio-remap but is provided "as is" to the
 * driver callback(s).
+ * @init_valid_mask: (Optional) Routine to initialize @valid_mask, to be used
+ * if not all GPIOs are valid.
 * @regmap_irq_chip: (Optional) Pointer to a regmap_irq_chip structure. If
 * set, a regmap-irq device will be created and the IRQ
 * domain will be set accordingly.
@@ -93,6 +96,10 @@ struct gpio_regmap_config {
 			      unsigned int offset, unsigned int *reg,
 			      unsigned int *mask);
 
+	int (*init_valid_mask)(struct gpio_chip *gc,
+			       unsigned long *valid_mask,
+			       unsigned int ngpios);
+
 	void *drvdata;
 };

From b1a7433d857edb14b993161af9ed1ee98d4c9cee Mon Sep 17 00:00:00 2001
From: Mathieu Dubois-Briand
Date: Sun, 24 Aug 2025 13:57:26 +0200
Subject: [PATCH 1879/3206] gpio: max7360: Add MAX7360 gpio support

Add driver for Maxim Integrated MAX7360 GPIO/GPO controller.

Two sets of GPIOs are provided by the device:
- Up to 8 GPIOs, shared with the PWM and rotary encoder
  functionalities. These GPIOs also provide interrupts on input
  changes.
- Up to 6 GPOs, on unused keypad column pins.
Co-developed-by: Kamel Bouhara
Signed-off-by: Kamel Bouhara
Acked-by: Bartosz Golaszewski
Reviewed-by: Andy Shevchenko
Signed-off-by: Mathieu Dubois-Briand
Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-7-435cfda2b1ea@bootlin.com
Signed-off-by: Lee Jones
---
 drivers/gpio/Kconfig        |  12 ++
 drivers/gpio/Makefile       |   1 +
 drivers/gpio/gpio-max7360.c | 257 ++++++++++++++++++++++++++++++++++++
 3 files changed, 270 insertions(+)
 create mode 100644 drivers/gpio/gpio-max7360.c

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index e43abb322fa6e1..6cf57a18dbe7c7 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -1492,6 +1492,18 @@ config GPIO_MADERA
 	help
 	  Support for GPIOs on Cirrus Logic Madera class codecs.
 
+config GPIO_MAX7360
+	tristate "MAX7360 GPIO support"
+	depends on MFD_MAX7360
+	select GPIO_REGMAP
+	select REGMAP_IRQ
+	help
+	  Allows using MAX7360 I/O Expander PWM lines as GPIO and keypad COL
+	  lines as GPO.
+
+	  This driver can also be built as a module. If so, the module will be
+	  called gpio-max7360.
+
 config GPIO_MAX77620
 	tristate "GPIO support for PMIC MAX77620 and MAX20024"
 	depends on MFD_MAX77620
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 379f55e9ed1e69..52abba3ef81c49 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_GPIO_MAX7300)		+= gpio-max7300.o
 obj-$(CONFIG_GPIO_MAX7301)		+= gpio-max7301.o
 obj-$(CONFIG_GPIO_MAX730X)		+= gpio-max730x.o
 obj-$(CONFIG_GPIO_MAX732X)		+= gpio-max732x.o
+obj-$(CONFIG_GPIO_MAX7360)		+= gpio-max7360.o
 obj-$(CONFIG_GPIO_MAX77620)		+= gpio-max77620.o
 obj-$(CONFIG_GPIO_MAX77650)		+= gpio-max77650.o
 obj-$(CONFIG_GPIO_MAX77759)		+= gpio-max77759.o
diff --git a/drivers/gpio/gpio-max7360.c b/drivers/gpio/gpio-max7360.c
new file mode 100644
index 00000000000000..db92a43776a920
--- /dev/null
+++ b/drivers/gpio/gpio-max7360.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2025 Bootlin
+ *
+ * Author: Kamel BOUHARA
+ * Author: Mathieu Dubois-Briand
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define MAX7360_GPIO_PORT	1
+#define MAX7360_GPIO_COL	2
+
+struct max7360_gpio_plat_data {
+	unsigned int function;
+};
+
+static struct max7360_gpio_plat_data max7360_gpio_port_plat = { .function = MAX7360_GPIO_PORT };
+static struct max7360_gpio_plat_data max7360_gpio_col_plat = { .function = MAX7360_GPIO_COL };
+
+static int max7360_get_available_gpos(struct device *dev, unsigned int *available_gpios)
+{
+	u32 columns;
+	int ret;
+
+	ret = device_property_read_u32(dev->parent, "keypad,num-columns", &columns);
+	if (ret) {
+		dev_err(dev, "Failed to read columns count\n");
+		return ret;
+	}
+
+	*available_gpios = min(MAX7360_MAX_GPO, MAX7360_MAX_KEY_COLS - columns);
+
+	return 0;
+}
+
+static int max7360_gpo_init_valid_mask(struct gpio_chip *gc,
+				       unsigned long *valid_mask,
+				       unsigned int ngpios)
+{
+	unsigned int available_gpios;
+	int ret;
+
+	ret = max7360_get_available_gpos(gc->parent, &available_gpios);
+	if (ret)
+		return ret;
+
+	bitmap_clear(valid_mask, 0, MAX7360_MAX_KEY_COLS - available_gpios);
+
+	return 0;
+}
+
+static int max7360_set_gpos_count(struct device *dev, struct regmap *regmap)
+{
+	/*
+	 * MAX7360 COL0 to COL7 pins can be used either as keypad columns,
+	 * general purpose output or a mix of both.
+	 * By default, all pins are used as keypad; here we update this
+	 * configuration to allow using some of them as GPIOs.
+	 */
+	unsigned int available_gpios;
+	unsigned int val;
+	int ret;
+
+	ret = max7360_get_available_gpos(dev, &available_gpios);
+	if (ret)
+		return ret;
+
+	/*
+	 * Configure which GPIOs will be used for keypad.
+	 * MAX7360_REG_DEBOUNCE contains configuration both for keypad debounce
+	 * timings and the GPO/keypad column partitioning. Only the latter is
+	 * modified here.
+	 */
+	val = FIELD_PREP(MAX7360_PORTS, available_gpios);
+	ret = regmap_write_bits(regmap, MAX7360_REG_DEBOUNCE, MAX7360_PORTS, val);
+	if (ret)
+		dev_err(dev, "Failed to write max7360 columns/gpos configuration");
+
+	return ret;
+}
+
+static int max7360_gpio_reg_mask_xlate(struct gpio_regmap *gpio,
+				       unsigned int base, unsigned int offset,
+				       unsigned int *reg, unsigned int *mask)
+{
+	if (base == MAX7360_REG_PWMBASE) {
+		/*
+		 * GPIO output is using PWM duty cycle registers: one register
+		 * per line, with value being either 0 or 255.
+		 */
+		*reg = base + offset;
+		*mask = GENMASK(7, 0);
+	} else {
+		*reg = base;
+		*mask = BIT(offset);
+	}
+
+	return 0;
+}
+
+static const struct regmap_irq max7360_regmap_irqs[MAX7360_MAX_GPIO] = {
+	REGMAP_IRQ_REG(0, 0, BIT(0)),
+	REGMAP_IRQ_REG(1, 0, BIT(1)),
+	REGMAP_IRQ_REG(2, 0, BIT(2)),
+	REGMAP_IRQ_REG(3, 0, BIT(3)),
+	REGMAP_IRQ_REG(4, 0, BIT(4)),
+	REGMAP_IRQ_REG(5, 0, BIT(5)),
+	REGMAP_IRQ_REG(6, 0, BIT(6)),
+	REGMAP_IRQ_REG(7, 0, BIT(7)),
+};
+
+static int max7360_handle_mask_sync(const int index,
+				    const unsigned int mask_buf_def,
+				    const unsigned int mask_buf,
+				    void *const irq_drv_data)
+{
+	struct regmap *regmap = irq_drv_data;
+	int ret;
+
+	for (unsigned int i = 0; i < MAX7360_MAX_GPIO; i++) {
+		ret = regmap_assign_bits(regmap, MAX7360_REG_PWMCFG(i),
+					 MAX7360_PORT_CFG_INTERRUPT_MASK, mask_buf & BIT(i));
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int max7360_gpio_probe(struct platform_device *pdev)
+{
+	const struct max7360_gpio_plat_data *plat_data;
+	struct gpio_regmap_config gpio_config = { };
+	struct regmap_irq_chip *irq_chip;
+	struct device *dev = &pdev->dev;
+	struct regmap *regmap;
+	unsigned int outconf;
+	int ret;
+
+	regmap = dev_get_regmap(dev->parent, NULL);
+	if (!regmap)
+		return dev_err_probe(dev, -ENODEV, "could not get parent regmap\n");
+
+	plat_data = device_get_match_data(dev);
+	if (plat_data->function == MAX7360_GPIO_PORT) {
+		if (device_property_read_bool(dev, "interrupt-controller")) {
+			/*
+			 * Port GPIOs with interrupt-controller property: add IRQ
+			 * controller.
+			 */
+			gpio_config.regmap_irq_flags = IRQF_ONESHOT | IRQF_SHARED;
+			gpio_config.regmap_irq_line =
+				fwnode_irq_get_byname(dev_fwnode(dev->parent), "inti");
+			if (gpio_config.regmap_irq_line < 0)
+				return dev_err_probe(dev, gpio_config.regmap_irq_line,
+						     "Failed to get IRQ\n");
+
+			/* Create custom IRQ configuration.
*/ + irq_chip = devm_kzalloc(dev, sizeof(*irq_chip), GFP_KERNEL); + gpio_config.regmap_irq_chip = irq_chip; + if (!irq_chip) + return -ENOMEM; + + irq_chip->name = dev_name(dev); + irq_chip->status_base = MAX7360_REG_GPIOIN; + irq_chip->status_is_level = true; + irq_chip->num_regs = 1; + irq_chip->num_irqs = MAX7360_MAX_GPIO; + irq_chip->irqs = max7360_regmap_irqs; + irq_chip->handle_mask_sync = max7360_handle_mask_sync; + irq_chip->irq_drv_data = regmap; + + for (unsigned int i = 0; i < MAX7360_MAX_GPIO; i++) { + ret = regmap_write_bits(regmap, MAX7360_REG_PWMCFG(i), + MAX7360_PORT_CFG_INTERRUPT_EDGES, + MAX7360_PORT_CFG_INTERRUPT_EDGES); + if (ret) + return dev_err_probe(dev, ret, + "Failed to enable interrupts\n"); + } + } + + /* + * Port GPIOs: set output mode configuration (constant-current or not). + * This property is optional. + */ + ret = device_property_read_u32(dev, "maxim,constant-current-disable", &outconf); + if (!ret) { + ret = regmap_write(regmap, MAX7360_REG_GPIOOUTM, outconf); + if (ret) + return dev_err_probe(dev, ret, + "Failed to set constant-current configuration\n"); + } + } + + /* Add gpio device. */ + gpio_config.parent = dev; + gpio_config.regmap = regmap; + if (plat_data->function == MAX7360_GPIO_PORT) { + gpio_config.ngpio = MAX7360_MAX_GPIO; + gpio_config.reg_dat_base = GPIO_REGMAP_ADDR(MAX7360_REG_GPIOIN); + gpio_config.reg_set_base = GPIO_REGMAP_ADDR(MAX7360_REG_PWMBASE); + gpio_config.reg_dir_out_base = GPIO_REGMAP_ADDR(MAX7360_REG_GPIOCTRL); + gpio_config.ngpio_per_reg = MAX7360_MAX_GPIO; + gpio_config.reg_mask_xlate = max7360_gpio_reg_mask_xlate; + } else { + ret = max7360_set_gpos_count(dev, regmap); + if (ret) + return dev_err_probe(dev, ret, "Failed to set GPOS pin count\n"); + + gpio_config.reg_set_base = GPIO_REGMAP_ADDR(MAX7360_REG_PORTS); + gpio_config.ngpio = MAX7360_MAX_KEY_COLS; + gpio_config.init_valid_mask = max7360_gpo_init_valid_mask; + } + + return PTR_ERR_OR_ZERO(devm_gpio_regmap_register(dev, &gpio_config)); +} + +static const struct of_device_id max7360_gpio_of_match[] = { + { + .compatible = "maxim,max7360-gpo", + .data = &max7360_gpio_col_plat + }, { + .compatible = "maxim,max7360-gpio", + .data = &max7360_gpio_port_plat + }, { + } +}; +MODULE_DEVICE_TABLE(of, max7360_gpio_of_match); + +static struct platform_driver max7360_gpio_driver = { + .driver = { + .name = "max7360-gpio", + .of_match_table = max7360_gpio_of_match, + }, + .probe = max7360_gpio_probe, +}; +module_platform_driver(max7360_gpio_driver); + +MODULE_DESCRIPTION("MAX7360 GPIO driver"); +MODULE_AUTHOR("Kamel BOUHARA "); +MODULE_AUTHOR("Mathieu Dubois-Briand "); +MODULE_LICENSE("GPL"); From fa6a23f1c59c67de9160b4acc5a8651ad2106fa8 Mon Sep 17 00:00:00 2001 From: Mathieu Dubois-Briand Date: Sun, 24 Aug 2025 13:57:27 +0200 Subject: [PATCH 1880/3206] input: keyboard: Add support for MAX7360 keypad Add driver for Maxim Integrated MAX7360 keypad controller, providing support for up to 64 keys, with a matrix of 8 columns and 8 rows. 
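The scan-code bookkeeping in the interrupt handler below relies on the
standard matrix keymap helpers: the keymap slot is derived from the FIFO
row/column with MATRIX_SCAN_CODE() and a power-of-two column order. A
minimal standalone sketch of that arithmetic (values picked purely for
illustration):

  #include <linux/input/matrix_keypad.h>
  #include <linux/log2.h>

  /* Sketch: with 8 columns, get_count_order(8) == 3, so the event
   * (row 2, col 5) indexes slot (2 << 3) + 5 == 21 of keycodes[]. */
  static unsigned short foo_keycode(const unsigned short *keycodes,
  				    unsigned int row, unsigned int col,
  				    unsigned int ncols)
  {
  	return keycodes[MATRIX_SCAN_CODE(row, col, get_count_order(ncols))];
  }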
Acked-by: Dmitry Torokhov Signed-off-by: Mathieu Dubois-Briand Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-8-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/input/keyboard/Kconfig | 12 + drivers/input/keyboard/Makefile | 1 + drivers/input/keyboard/max7360-keypad.c | 308 ++++++++++++++++++++++++ 3 files changed, 321 insertions(+) create mode 100644 drivers/input/keyboard/max7360-keypad.c diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 7c4f309a4cb63a..1b10528b7ca3eb 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -422,6 +422,18 @@ config KEYBOARD_MAX7359 To compile this driver as a module, choose M here: the module will be called max7359_keypad. +config KEYBOARD_MAX7360 + tristate "Maxim MAX7360 Key Switch Controller" + select INPUT_MATRIXKMAP + depends on I2C + depends on MFD_MAX7360 + help + If you say yes here you get support for the keypad controller on the + Maxim MAX7360 I/O Expander. + + To compile this driver as a module, choose M here: the module will be + called max7360_keypad. + config KEYBOARD_MPR121 tristate "Freescale MPR121 Touchkey" depends on I2C diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile index 8bc20ab2b103b0..636367cd1042c4 100644 --- a/drivers/input/keyboard/Makefile +++ b/drivers/input/keyboard/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_KEYBOARD_LPC32XX) += lpc32xx-keys.o obj-$(CONFIG_KEYBOARD_MAPLE) += maple_keyb.o obj-$(CONFIG_KEYBOARD_MATRIX) += matrix_keypad.o obj-$(CONFIG_KEYBOARD_MAX7359) += max7359_keypad.o +obj-$(CONFIG_KEYBOARD_MAX7360) += max7360-keypad.o obj-$(CONFIG_KEYBOARD_MPR121) += mpr121_touchkey.o obj-$(CONFIG_KEYBOARD_MT6779) += mt6779-keypad.o obj-$(CONFIG_KEYBOARD_MTK_PMIC) += mtk-pmic-keys.o diff --git a/drivers/input/keyboard/max7360-keypad.c b/drivers/input/keyboard/max7360-keypad.c new file mode 100644 index 00000000000000..503be952b0a668 --- /dev/null +++ b/drivers/input/keyboard/max7360-keypad.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Bootlin + * + * Author: Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct max7360_keypad { + struct input_dev *input; + unsigned int rows; + unsigned int cols; + unsigned int debounce_ms; + int irq; + struct regmap *regmap; + unsigned short keycodes[MAX7360_MAX_KEY_ROWS * MAX7360_MAX_KEY_COLS]; +}; + +static irqreturn_t max7360_keypad_irq(int irq, void *data) +{ + struct max7360_keypad *max7360_keypad = data; + struct device *dev = max7360_keypad->input->dev.parent; + unsigned int val; + unsigned int row, col; + unsigned int release; + unsigned int code; + int error; + + error = regmap_read(max7360_keypad->regmap, MAX7360_REG_KEYFIFO, &val); + if (error) { + dev_err(dev, "Failed to read MAX7360 FIFO"); + return IRQ_NONE; + } + + /* FIFO overflow: ignore it and get next event. 
*/ + if (val == MAX7360_FIFO_OVERFLOW) { + dev_warn(dev, "max7360 FIFO overflow"); + error = regmap_read_poll_timeout(max7360_keypad->regmap, MAX7360_REG_KEYFIFO, + val, val != MAX7360_FIFO_OVERFLOW, 0, 1000); + if (error) { + dev_err(dev, "Failed to empty MAX7360 FIFO"); + return IRQ_NONE; + } + } + + if (val == MAX7360_FIFO_EMPTY) { + dev_dbg(dev, "Got a spurious interrupt"); + + return IRQ_NONE; + } + + row = FIELD_GET(MAX7360_FIFO_ROW, val); + col = FIELD_GET(MAX7360_FIFO_COL, val); + release = val & MAX7360_FIFO_RELEASE; + + code = MATRIX_SCAN_CODE(row, col, get_count_order(max7360_keypad->cols)); + + dev_dbg(dev, "key[%d:%d] %s\n", row, col, release ? "release" : "press"); + + input_event(max7360_keypad->input, EV_MSC, MSC_SCAN, code); + input_report_key(max7360_keypad->input, max7360_keypad->keycodes[code], !release); + input_sync(max7360_keypad->input); + + return IRQ_HANDLED; +} + +static int max7360_keypad_open(struct input_dev *pdev) +{ + struct max7360_keypad *max7360_keypad = input_get_drvdata(pdev); + struct device *dev = max7360_keypad->input->dev.parent; + int error; + + /* Somebody is using the device: get out of sleep. */ + error = regmap_write_bits(max7360_keypad->regmap, MAX7360_REG_CONFIG, + MAX7360_CFG_SLEEP, MAX7360_CFG_SLEEP); + if (error) + dev_err(dev, "Failed to write max7360 configuration: %d\n", error); + + return error; +} + +static void max7360_keypad_close(struct input_dev *pdev) +{ + struct max7360_keypad *max7360_keypad = input_get_drvdata(pdev); + struct device *dev = max7360_keypad->input->dev.parent; + int error; + + /* Nobody is using the device anymore: go to sleep. */ + error = regmap_write_bits(max7360_keypad->regmap, MAX7360_REG_CONFIG, MAX7360_CFG_SLEEP, 0); + if (error) + dev_err(dev, "Failed to write max7360 configuration: %d\n", error); +} + +static int max7360_keypad_hw_init(struct max7360_keypad *max7360_keypad) +{ + struct device *dev = max7360_keypad->input->dev.parent; + unsigned int val; + int error; + + val = max7360_keypad->debounce_ms - MAX7360_DEBOUNCE_MIN; + error = regmap_write_bits(max7360_keypad->regmap, MAX7360_REG_DEBOUNCE, + MAX7360_DEBOUNCE, + FIELD_PREP(MAX7360_DEBOUNCE, val)); + if (error) + return dev_err_probe(dev, error, + "Failed to write max7360 debounce configuration\n"); + + error = regmap_write_bits(max7360_keypad->regmap, MAX7360_REG_INTERRUPT, + MAX7360_INTERRUPT_TIME_MASK, + FIELD_PREP(MAX7360_INTERRUPT_TIME_MASK, 1)); + if (error) + return dev_err_probe(dev, error, + "Failed to write max7360 keypad interrupt configuration\n"); + + return 0; +} + +static int max7360_keypad_build_keymap(struct max7360_keypad *max7360_keypad) +{ + struct input_dev *input_dev = max7360_keypad->input; + struct device *dev = input_dev->dev.parent->parent; + struct matrix_keymap_data keymap_data; + const char *propname = "linux,keymap"; + unsigned int max_keys; + int error; + int size; + + size = device_property_count_u32(dev, propname); + if (size <= 0) { + dev_err(dev, "missing or malformed property %s: %d\n", propname, size); + return size < 0 ? 
size : -EINVAL; + } + + max_keys = max7360_keypad->cols * max7360_keypad->rows; + if (size > max_keys) { + dev_err(dev, "%s size overflow (%d vs max %u)\n", propname, size, max_keys); + return -EINVAL; + } + + u32 *keys __free(kfree) = kmalloc_array(size, sizeof(*keys), GFP_KERNEL); + if (!keys) + return -ENOMEM; + + error = device_property_read_u32_array(dev, propname, keys, size); + if (error) { + dev_err(dev, "failed to read %s property: %d\n", propname, error); + return error; + } + + keymap_data.keymap = keys; + keymap_data.keymap_size = size; + error = matrix_keypad_build_keymap(&keymap_data, NULL, + max7360_keypad->rows, max7360_keypad->cols, + max7360_keypad->keycodes, max7360_keypad->input); + if (error) + return error; + + return 0; +} + +static int max7360_keypad_parse_fw(struct device *dev, + struct max7360_keypad *max7360_keypad, + bool *autorepeat) +{ + int error; + + error = matrix_keypad_parse_properties(dev->parent, &max7360_keypad->rows, + &max7360_keypad->cols); + if (error) + return error; + + if (!max7360_keypad->rows || !max7360_keypad->cols || + max7360_keypad->rows > MAX7360_MAX_KEY_ROWS || + max7360_keypad->cols > MAX7360_MAX_KEY_COLS) { + dev_err(dev, "Invalid number of columns or rows (%ux%u)\n", + max7360_keypad->cols, max7360_keypad->rows); + return -EINVAL; + } + + *autorepeat = device_property_read_bool(dev->parent, "autorepeat"); + + max7360_keypad->debounce_ms = MAX7360_DEBOUNCE_MIN; + error = device_property_read_u32(dev->parent, "keypad-debounce-delay-ms", + &max7360_keypad->debounce_ms); + if (error == -EINVAL) { + dev_info(dev, "Using default keypad-debounce-delay-ms: %u\n", + max7360_keypad->debounce_ms); + } else if (error < 0) { + dev_err(dev, "Failed to read keypad-debounce-delay-ms property\n"); + return error; + } + + if (!in_range(max7360_keypad->debounce_ms, MAX7360_DEBOUNCE_MIN, + MAX7360_DEBOUNCE_MAX - MAX7360_DEBOUNCE_MIN + 1)) { + dev_err(dev, "Invalid keypad-debounce-delay-ms: %u, should be between %u and %u.\n", + max7360_keypad->debounce_ms, MAX7360_DEBOUNCE_MIN, MAX7360_DEBOUNCE_MAX); + return -EINVAL; + } + + return 0; +} + +static int max7360_keypad_probe(struct platform_device *pdev) +{ + struct max7360_keypad *max7360_keypad; + struct device *dev = &pdev->dev; + struct input_dev *input; + struct regmap *regmap; + bool autorepeat; + int error; + int irq; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return dev_err_probe(dev, -ENODEV, "Could not get parent regmap\n"); + + irq = fwnode_irq_get_byname(dev_fwnode(dev->parent), "intk"); + if (irq < 0) + return dev_err_probe(dev, irq, "Failed to get IRQ\n"); + + max7360_keypad = devm_kzalloc(dev, sizeof(*max7360_keypad), GFP_KERNEL); + if (!max7360_keypad) + return -ENOMEM; + + max7360_keypad->regmap = regmap; + + error = max7360_keypad_parse_fw(dev, max7360_keypad, &autorepeat); + if (error) + return error; + + input = devm_input_allocate_device(dev); + if (!input) + return -ENOMEM; + + max7360_keypad->input = input; + + input->id.bustype = BUS_I2C; + input->name = pdev->name; + input->open = max7360_keypad_open; + input->close = max7360_keypad_close; + + error = max7360_keypad_build_keymap(max7360_keypad); + if (error) + return dev_err_probe(dev, error, "Failed to build keymap\n"); + + input_set_capability(input, EV_MSC, MSC_SCAN); + if (autorepeat) + __set_bit(EV_REP, input->evbit); + + input_set_drvdata(input, max7360_keypad); + + error = devm_request_threaded_irq(dev, irq, NULL, max7360_keypad_irq, + IRQF_ONESHOT, + "max7360-keypad", max7360_keypad); + if (error) + 
return dev_err_probe(dev, error, "Failed to register interrupt\n"); + + error = input_register_device(input); + if (error) + return dev_err_probe(dev, error, "Could not register input device\n"); + + error = max7360_keypad_hw_init(max7360_keypad); + if (error) + return dev_err_probe(dev, error, "Failed to initialize max7360 keypad\n"); + + device_init_wakeup(dev, true); + error = dev_pm_set_wake_irq(dev, irq); + if (error) + dev_warn(dev, "Failed to set up wakeup irq: %d\n", error); + + return 0; +} + +static void max7360_keypad_remove(struct platform_device *pdev) +{ + dev_pm_clear_wake_irq(&pdev->dev); + device_init_wakeup(&pdev->dev, false); +} + +static struct platform_driver max7360_keypad_driver = { + .driver = { + .name = "max7360-keypad", + }, + .probe = max7360_keypad_probe, + .remove = max7360_keypad_remove, +}; +module_platform_driver(max7360_keypad_driver); + +MODULE_DESCRIPTION("MAX7360 Keypad driver"); +MODULE_AUTHOR("Mathieu Dubois-Briand "); +MODULE_LICENSE("GPL"); From 229c15e9a69cb3d6a303a9e20b10fb991b66895d Mon Sep 17 00:00:00 2001 From: Mathieu Dubois-Briand Date: Sun, 24 Aug 2025 13:57:28 +0200 Subject: [PATCH 1881/3206] input: misc: Add support for MAX7360 rotary Add driver for Maxim Integrated MAX7360 rotary encoder controller, supporting a single rotary switch. Acked-by: Dmitry Torokhov Signed-off-by: Mathieu Dubois-Briand Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-9-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/input/misc/Kconfig | 10 ++ drivers/input/misc/Makefile | 1 + drivers/input/misc/max7360-rotary.c | 192 ++++++++++++++++++++++++++++ 3 files changed, 203 insertions(+) create mode 100644 drivers/input/misc/max7360-rotary.c diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 0fb21c99a5e3d1..d604aad90a8956 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -230,6 +230,16 @@ config INPUT_M68K_BEEP tristate "M68k Beeper support" depends on M68K +config INPUT_MAX7360_ROTARY + tristate "Maxim MAX7360 Rotary Encoder" + depends on MFD_MAX7360 + help + If you say yes here you get support for the rotary encoder on the + Maxim MAX7360 I/O Expander. + + To compile this driver as a module, choose M here: the module will be + called max7360_rotary. 
+ config INPUT_MAX77650_ONKEY tristate "Maxim MAX77650 ONKEY support" depends on MFD_MAX77650 diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index d468c8140b93da..ac45cb9b5c998b 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_INPUT_IQS7222) += iqs7222.o obj-$(CONFIG_INPUT_KEYSPAN_REMOTE) += keyspan_remote.o obj-$(CONFIG_INPUT_KXTJ9) += kxtj9.o obj-$(CONFIG_INPUT_M68K_BEEP) += m68kspkr.o +obj-$(CONFIG_INPUT_MAX7360_ROTARY) += max7360-rotary.o obj-$(CONFIG_INPUT_MAX77650_ONKEY) += max77650-onkey.o obj-$(CONFIG_INPUT_MAX77693_HAPTIC) += max77693-haptic.o obj-$(CONFIG_INPUT_MAX8925_ONKEY) += max8925_onkey.o diff --git a/drivers/input/misc/max7360-rotary.c b/drivers/input/misc/max7360-rotary.c new file mode 100644 index 00000000000000..385831ef34b633 --- /dev/null +++ b/drivers/input/misc/max7360-rotary.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Bootlin + * + * Author: Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX7360_ROTARY_DEFAULT_STEPS 24 + +struct max7360_rotary { + struct input_dev *input; + struct regmap *regmap; + unsigned int debounce_ms; + + unsigned int pos; + + u32 steps; + u32 axis; + bool relative_axis; + bool rollover; +}; + +static void max7360_rotary_report_event(struct max7360_rotary *max7360_rotary, int steps) +{ + if (max7360_rotary->relative_axis) { + input_report_rel(max7360_rotary->input, max7360_rotary->axis, steps); + } else { + int pos = max7360_rotary->pos; + int maxval = max7360_rotary->steps; + + /* + * Add steps to the position. + * Make sure added steps are always in ]-maxval; maxval[ + * interval, so (pos + maxval) is always >= 0. + * Then set back pos to the [0; maxval[ interval. 
+ */ + pos += steps % maxval; + if (max7360_rotary->rollover) + pos = (pos + maxval) % maxval; + else + pos = clamp(pos, 0, maxval - 1); + + max7360_rotary->pos = pos; + input_report_abs(max7360_rotary->input, max7360_rotary->axis, max7360_rotary->pos); + } + + input_sync(max7360_rotary->input); +} + +static irqreturn_t max7360_rotary_irq(int irq, void *data) +{ + struct max7360_rotary *max7360_rotary = data; + struct device *dev = max7360_rotary->input->dev.parent; + unsigned int val; + int error; + + error = regmap_read(max7360_rotary->regmap, MAX7360_REG_RTR_CNT, &val); + if (error < 0) { + dev_err(dev, "Failed to read rotary counter\n"); + return IRQ_NONE; + } + + if (val == 0) + return IRQ_NONE; + + max7360_rotary_report_event(max7360_rotary, sign_extend32(val, 7)); + + return IRQ_HANDLED; +} + +static int max7360_rotary_hw_init(struct max7360_rotary *max7360_rotary) +{ + struct device *dev = max7360_rotary->input->dev.parent; + int val; + int error; + + val = FIELD_PREP(MAX7360_ROT_DEBOUNCE, max7360_rotary->debounce_ms) | + FIELD_PREP(MAX7360_ROT_INTCNT, 1) | MAX7360_ROT_INTCNT_DLY; + error = regmap_write(max7360_rotary->regmap, MAX7360_REG_RTRCFG, val); + if (error) + dev_err(dev, "Failed to set max7360 rotary encoder configuration\n"); + + return error; +} + +static int max7360_rotary_probe(struct platform_device *pdev) +{ + struct max7360_rotary *max7360_rotary; + struct device *dev = &pdev->dev; + struct input_dev *input; + struct regmap *regmap; + int irq; + int error; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return dev_err_probe(dev, -ENODEV, "Could not get parent regmap\n"); + + irq = fwnode_irq_get_byname(dev_fwnode(dev->parent), "inti"); + if (irq < 0) + return dev_err_probe(dev, irq, "Failed to get IRQ\n"); + + max7360_rotary = devm_kzalloc(dev, sizeof(*max7360_rotary), GFP_KERNEL); + if (!max7360_rotary) + return -ENOMEM; + + max7360_rotary->regmap = regmap; + + device_property_read_u32(dev->parent, "linux,axis", &max7360_rotary->axis); + max7360_rotary->rollover = device_property_read_bool(dev->parent, + "rotary-encoder,rollover"); + max7360_rotary->relative_axis = + device_property_read_bool(dev->parent, "rotary-encoder,relative-axis"); + + error = device_property_read_u32(dev->parent, "rotary-encoder,steps", + &max7360_rotary->steps); + if (error) + max7360_rotary->steps = MAX7360_ROTARY_DEFAULT_STEPS; + + device_property_read_u32(dev->parent, "rotary-debounce-delay-ms", + &max7360_rotary->debounce_ms); + if (max7360_rotary->debounce_ms > MAX7360_ROT_DEBOUNCE_MAX) + return dev_err_probe(dev, -EINVAL, "Invalid debounce timing: %u\n", + max7360_rotary->debounce_ms); + + input = devm_input_allocate_device(dev); + if (!input) + return -ENOMEM; + + max7360_rotary->input = input; + + input->id.bustype = BUS_I2C; + input->name = pdev->name; + + if (max7360_rotary->relative_axis) + input_set_capability(input, EV_REL, max7360_rotary->axis); + else + input_set_abs_params(input, max7360_rotary->axis, 0, max7360_rotary->steps, 0, 1); + + error = devm_request_threaded_irq(dev, irq, NULL, max7360_rotary_irq, + IRQF_ONESHOT | IRQF_SHARED, + "max7360-rotary", max7360_rotary); + if (error) + return dev_err_probe(dev, error, "Failed to register interrupt\n"); + + error = input_register_device(input); + if (error) + return dev_err_probe(dev, error, "Could not register input device\n"); + + error = max7360_rotary_hw_init(max7360_rotary); + if (error) + return dev_err_probe(dev, error, "Failed to initialize max7360 rotary\n"); + + device_init_wakeup(dev, true); + 
error = dev_pm_set_wake_irq(dev, irq);
+	if (error)
+		dev_warn(dev, "Failed to set up wakeup irq: %d\n", error);
+
+	return 0;
+}
+
+static void max7360_rotary_remove(struct platform_device *pdev)
+{
+	dev_pm_clear_wake_irq(&pdev->dev);
+	device_init_wakeup(&pdev->dev, false);
+}
+
+static struct platform_driver max7360_rotary_driver = {
+	.driver = {
+		.name = "max7360-rotary",
+	},
+	.probe = max7360_rotary_probe,
+	.remove = max7360_rotary_remove,
+};
+module_platform_driver(max7360_rotary_driver);
+
+MODULE_DESCRIPTION("MAX7360 Rotary driver");
+MODULE_AUTHOR("Mathieu Dubois-Briand ");
+MODULE_LICENSE("GPL");

From 32d4cedd24ed346edbe063323ed495d685e033df Mon Sep 17 00:00:00 2001
From: Mathieu Dubois-Briand
Date: Sun, 24 Aug 2025 13:57:29 +0200
Subject: [PATCH 1882/3206] MAINTAINERS: Add entry for MAX7360 driver

Add myself as maintainer of the Maxim MAX7360 drivers and device-tree
bindings.

Signed-off-by: Mathieu Dubois-Briand
Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-10-435cfda2b1ea@bootlin.com
Signed-off-by: Lee Jones
---
 MAINTAINERS | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index fe168477caa457..2026f2e44dd0aa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15013,6 +15013,19 @@ L:	linux-iio@vger.kernel.org
 S:	Maintained
 F:	drivers/iio/temperature/max30208.c
 
+MAXIM MAX7360 KEYPAD LED MFD DRIVER
+M:	Mathieu Dubois-Briand
+S:	Maintained
+F:	Documentation/devicetree/bindings/gpio/maxim,max7360-gpio.yaml
+F:	Documentation/devicetree/bindings/mfd/maxim,max7360.yaml
+F:	drivers/gpio/gpio-max7360.c
+F:	drivers/input/keyboard/max7360-keypad.c
+F:	drivers/input/misc/max7360-rotary.c
+F:	drivers/mfd/max7360.c
+F:	drivers/pinctrl/pinctrl-max7360.c
+F:	drivers/pwm/pwm-max7360.c
+F:	include/linux/mfd/max7360.h
+
 MAXIM MAX77650 PMIC MFD DRIVER
 M:	Bartosz Golaszewski
 L:	linux-kernel@vger.kernel.org

From 3f5df63955756fbe253a2a26043accf7318fa53a Mon Sep 17 00:00:00 2001
From: Aleksandrs Vinarskis
Date: Wed, 10 Sep 2025 14:01:08 +0200
Subject: [PATCH 1883/3206] dt-bindings: leds: Add generic LED consumer documentation

Introduce a common generic LED consumer binding, where the consumer
references LED(s) by phandle, as opposed to the trigger-source binding
where the trigger source is defined in the LED itself.

Add the 'leds' property, already used in some schemas, which expects a
phandle-array. Additionally, introduce 'led-names', which can be used by
consumers to map LED devices to their respective functions.

Signed-off-by: Aleksandrs Vinarskis
Reviewed-by: "Rob Herring (Arm)"
Link: https://lore.kernel.org/r/20250910-leds-v5-1-bb90a0f897d5@vinarskis.com
Signed-off-by: Lee Jones
---
 .../bindings/leds/leds-consumer.yaml          | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/leds/leds-consumer.yaml

diff --git a/Documentation/devicetree/bindings/leds/leds-consumer.yaml b/Documentation/devicetree/bindings/leds/leds-consumer.yaml
new file mode 100644
index 00000000000000..fe6a0faa1d3b8f
--- /dev/null
+++ b/Documentation/devicetree/bindings/leds/leds-consumer.yaml
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/leds/leds-consumer.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Common leds consumer
+
+maintainers:
+  - Aleksandrs Vinarskis
+
+description:
+  Some LEDs defined in DT are required by other DT consumers; for example,
+  a v4l2 subnode may require a privacy or flash LED.
Unlike the trigger-source
+  approach, which is typically used as a 'soft' binding, referencing LED
+  devices by phandle makes things simpler when a 'hard' binding is desired.
+
+  Document the LED properties that consumers may define.
+
+select: true
+
+properties:
+  leds:
+    oneOf:
+      - type: object
+      - $ref: /schemas/types.yaml#/definitions/phandle-array
+        description:
+          A list of LED device(s) required by a particular consumer.
+        items:
+          maxItems: 1
+
+  led-names:
+    description:
+      A list of device name(s). Used to map LED devices to their respective
+      functions, when the consumer requires more than one LED.
+
+additionalProperties: true
+
+examples:
+  - |
+    #include
+    #include
+
+    leds {
+        compatible = "gpio-leds";
+
+        privacy_led: privacy-led {
+            color = ;
+            default-state = "off";
+            function = LED_FUNCTION_INDICATOR;
+            gpios = <&tlmm 110 GPIO_ACTIVE_HIGH>;
+        };
+    };
+
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        v4l2_node: camera@36 {
+            reg = <0x36>;
+
+            leds = <&privacy_led>;
+            led-names = "privacy";
+        };
+    };
+
+...

From 22420da3662a69d8894ee624494213a5888a1e87 Mon Sep 17 00:00:00 2001
From: Aleksandrs Vinarskis
Date: Wed, 10 Sep 2025 14:01:09 +0200
Subject: [PATCH 1884/3206] dt-bindings: leds: Unify 'leds' property

A number of existing schemas use the 'leds' property to provide a
phandle-array of LED(s) to the consumer. Additionally, with the upcoming
privacy-led support in device-tree, a v4l2 subnode could be an LED
consumer, meaning that all camera sensors should support the 'leds' and
'led-names' properties via the common 'video-interface-devices.yaml'.

To avoid duplication, unify the 'leds' property from existing schemas
into the newly introduced 'leds-consumer.yaml'.

Signed-off-by: Aleksandrs Vinarskis
Reviewed-by: "Rob Herring (Arm)"
Link: https://lore.kernel.org/r/20250910-leds-v5-2-bb90a0f897d5@vinarskis.com
Signed-off-by: Lee Jones
---
 .../devicetree/bindings/leds/backlight/led-backlight.yaml | 6 +-----
 .../devicetree/bindings/leds/leds-group-multicolor.yaml   | 5 +----
 .../bindings/media/video-interface-devices.yaml           | 8 ++++++++
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/Documentation/devicetree/bindings/leds/backlight/led-backlight.yaml b/Documentation/devicetree/bindings/leds/backlight/led-backlight.yaml
index f5554da6bc6c73..8fc5af8f27f9eb 100644
--- a/Documentation/devicetree/bindings/leds/backlight/led-backlight.yaml
+++ b/Documentation/devicetree/bindings/leds/backlight/led-backlight.yaml
@@ -23,11 +23,7 @@ properties:
   compatible:
     const: led-backlight
 
-  leds:
-    description: A list of LED nodes
-    $ref: /schemas/types.yaml#/definitions/phandle-array
-    items:
-      maxItems: 1
+  leds: true
 
 required:
   - compatible
diff --git a/Documentation/devicetree/bindings/leds/leds-group-multicolor.yaml b/Documentation/devicetree/bindings/leds/leds-group-multicolor.yaml
index 8ed059a5a724f6..5c9cfa39396b0b 100644
--- a/Documentation/devicetree/bindings/leds/leds-group-multicolor.yaml
+++ b/Documentation/devicetree/bindings/leds/leds-group-multicolor.yaml
@@ -17,10 +17,7 @@ properties:
   compatible:
     const: leds-group-multicolor
 
-  leds:
-    description:
-      An aray of monochromatic leds
-    $ref: /schemas/types.yaml#/definitions/phandle-array
+  leds: true
 
 required:
   - leds
diff --git a/Documentation/devicetree/bindings/media/video-interface-devices.yaml b/Documentation/devicetree/bindings/media/video-interface-devices.yaml
index cf7712ad297c01..3ad1590b04966f 100644
--- a/Documentation/devicetree/bindings/media/video-interface-devices.yaml
+++ b/Documentation/devicetree/bindings/media/video-interface-devices.yaml
@@ -17,6 +17,14 @@ properties: An array of phandles, each referring to a flash LED, a sub-node of the LED driver device node. + leds: + minItems: 1 + maxItems: 1 + + led-names: + enum: + - privacy + lens-focus: $ref: /schemas/types.yaml#/definitions/phandle description: From 5b27dfb1d7b59db9e72766c990a3ee80e39e4f69 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 9 Sep 2025 15:46:26 +0200 Subject: [PATCH 1885/3206] s390/dcssblk: Add DAX support With ZONE_DEVICE now available for s390, struct pages can be allocated for proper DAX support in dcssblk driver via devm_memremap_pages(). Adding struct pages for a range requires that the range is aligned to SUBSECTION_SIZE, which is defined as 2 MB in common code. Therefore, only enable DAX support and allocate struct pages for DCSS ranges that are aligned to 2 MB. Signed-off-by: Gerald Schaefer Acked-by: Heiko Carstens Reviewed-by: Alexander Gordeev Signed-off-by: Alexander Gordeev --- drivers/s390/block/Kconfig | 12 ++---------- drivers/s390/block/dcssblk.c | 35 +++++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig index 8c1c908d2c6e72..877a9bc7f04b93 100644 --- a/drivers/s390/block/Kconfig +++ b/drivers/s390/block/Kconfig @@ -5,19 +5,11 @@ comment "S/390 block device drivers" config DCSSBLK def_tristate m prompt "DCSSBLK support" - depends on S390 && BLOCK && (DAX || DAX=n) + depends on S390 && BLOCK && ZONE_DEVICE + select FS_DAX help Support for dcss block device -config DCSSBLK_DAX - def_bool y - depends on DCSSBLK - # requires S390 ZONE_DEVICE support - depends on BROKEN - prompt "DCSSBLK DAX support" - help - Enable DAX operation for the dcss block device - config DASD def_tristate y prompt "Support for DASD devices" diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 94fa5edecaddf8..86fef4b15015d1 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -79,6 +79,8 @@ struct dcssblk_dev_info { int num_of_segments; struct list_head seg_list; struct dax_device *dax_dev; + struct dev_pagemap pgmap; + void *pgmap_addr; }; struct segment_info { @@ -415,6 +417,8 @@ dcssblk_shared_store(struct device *dev, struct device_attribute *attr, const ch dax_remove_host(dev_info->gd); kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); + if (dev_info->pgmap_addr) + devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap); del_gendisk(dev_info->gd); put_disk(dev_info->gd); @@ -537,9 +541,6 @@ static int dcssblk_setup_dax(struct dcssblk_dev_info *dev_info) { struct dax_device *dax_dev; - if (!IS_ENABLED(CONFIG_DCSSBLK_DAX)) - return 0; - dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops); if (IS_ERR(dax_dev)) return PTR_ERR(dax_dev); @@ -562,6 +563,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char struct dcssblk_dev_info *dev_info; struct segment_info *seg_info, *temp; char *local_buf; + void *addr; unsigned long seg_byte_size; dev_info = NULL; @@ -687,9 +689,26 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char if (rc) goto put_dev; - rc = dcssblk_setup_dax(dev_info); - if (rc) - goto out_dax; + if (!IS_ALIGNED(dev_info->start, SUBSECTION_SIZE) || + !IS_ALIGNED(dev_info->end + 1, SUBSECTION_SIZE)) { + pr_info("DCSS %s is not aligned to %lu bytes, DAX support disabled\n", + local_buf, SUBSECTION_SIZE); + } else { + dev_info->pgmap.type = MEMORY_DEVICE_FS_DAX; + dev_info->pgmap.range.start = dev_info->start; + 
dev_info->pgmap.range.end = dev_info->end;
+		dev_info->pgmap.nr_range = 1;
+		addr = devm_memremap_pages(&dev_info->dev, &dev_info->pgmap);
+		if (IS_ERR(addr)) {
+			rc = PTR_ERR(addr);
+			goto put_dev;
+		}
+		dev_info->pgmap_addr = addr;
+		rc = dcssblk_setup_dax(dev_info);
+		if (rc)
+			goto out_dax;
+		pr_info("DAX support enabled for DCSS %s\n", local_buf);
+	}
 
 	get_device(&dev_info->dev);
 	rc = device_add_disk(&dev_info->dev, dev_info->gd, NULL);
@@ -716,6 +735,8 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 out_dax:
 	kill_dax(dev_info->dax_dev);
 	put_dax(dev_info->dax_dev);
+	if (dev_info->pgmap_addr)
+		devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap);
 put_dev:
 	list_del(&dev_info->lh);
 	put_disk(dev_info->gd);
@@ -801,6 +822,8 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch
 	dax_remove_host(dev_info->gd);
 	kill_dax(dev_info->dax_dev);
 	put_dax(dev_info->dax_dev);
+	if (dev_info->pgmap_addr)
+		devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap);
 	del_gendisk(dev_info->gd);
 	put_disk(dev_info->gd);

From d515503f3c8a8475b2f78782534aad09722904e1 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula
Date: Fri, 8 Aug 2025 16:17:32 +0300
Subject: [PATCH 1886/3206] i3c: mipi-i3c-hci-pci: Add support for Intel Wildcat Lake-U I3C

Add I3C controller PCI IDs on Intel Wildcat Lake-U.

Signed-off-by: Jarkko Nikula
Reviewed-by: Frank Li
Link: https://lore.kernel.org/r/20250808131732.1213227-1-jarkko.nikula@linux.intel.com
Signed-off-by: Alexandre Belloni
---
 drivers/i3c/master/mipi-i3c-hci/mipi-i3c-hci-pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/i3c/master/mipi-i3c-hci/mipi-i3c-hci-pci.c b/drivers/i3c/master/mipi-i3c-hci/mipi-i3c-hci-pci.c
index c6c3a3ec11eae3..08e6cbdf89cead 100644
--- a/drivers/i3c/master/mipi-i3c-hci/mipi-i3c-hci-pci.c
+++ b/drivers/i3c/master/mipi-i3c-hci/mipi-i3c-hci-pci.c
@@ -124,6 +124,9 @@ static void mipi_i3c_hci_pci_remove(struct pci_dev *pci)
 }
 
 static const struct pci_device_id mipi_i3c_hci_pci_devices[] = {
+	/* Wildcat Lake-U */
+	{ PCI_VDEVICE(INTEL, 0x4d7c), (kernel_ulong_t)&intel_info},
+	{ PCI_VDEVICE(INTEL, 0x4d6f), (kernel_ulong_t)&intel_info},
 	/* Panther Lake-H */
 	{ PCI_VDEVICE(INTEL, 0xe37c), (kernel_ulong_t)&intel_info},
 	{ PCI_VDEVICE(INTEL, 0xe36f), (kernel_ulong_t)&intel_info},

From f8d9e56aeb87ce82ce8636cd176cc59b69aa0e41 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula
Date: Fri, 22 Aug 2025 13:56:27 +0300
Subject: [PATCH 1887/3206] i3c: master: Add helpers for DMA mapping and bounce buffer handling

Some I3C controllers such as MIPI I3C HCI may pad the last DWORD (32-bit)
with stale data from the RX FIFO in DMA transfers if the receive length is
not DWORD aligned and when the device DMA is IOMMU mapped. In such a case,
a properly sized bounce buffer is required in order to avoid possible data
corruption.

In a review discussion, the proposal was to have common helpers in the I3C
core for DMA mapping and bounce buffer handling.

Drivers may use the helper i3c_master_dma_map_single() to map a buffer for
a DMA transfer. It internally allocates a bounce buffer if the buffer is
not DMA'able or when the driver requires one for a transfer. The helper
i3c_master_dma_unmap_single() does the needed cleanup and data copying
from the bounce buffer.
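A host controller driver is then expected to use the pair roughly as
follows. This is a sketch only: the completion wait is elided and
program_hw() is an assumed placeholder for the controller-specific
setup, not part of the API:

  /* Sketch: map an RX buffer, point the hardware at the DMA address,
   * then unmap; unmap copies bounce-buffer contents back for reads. */
  struct i3c_dma *dma;

  dma = i3c_master_dma_map_single(dev, buf, len, false, DMA_FROM_DEVICE);
  if (!dma)
  	return -ENOMEM;

  program_hw(dma->addr, dma->map_len);	/* assumed controller hook */
  /* ... wait for the transfer to complete ... */

  i3c_master_dma_unmap_single(dma);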
Signed-off-by: Jarkko Nikula
Reviewed-by: Frank Li
Link: https://lore.kernel.org/r/20250822105630.2820009-2-jarkko.nikula@linux.intel.com
Signed-off-by: Alexandre Belloni
---
 drivers/i3c/master.c       | 74 ++++++++++++++++++++++++++++++++++++++
 include/linux/i3c/master.h | 26 ++++++++++++++
 2 files changed, 100 insertions(+)

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 2ef898a8fd8065..033d06cabca2cf 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1727,6 +1728,79 @@ int i3c_master_do_daa(struct i3c_master_controller *master)
 }
 EXPORT_SYMBOL_GPL(i3c_master_do_daa);
 
+/**
+ * i3c_master_dma_map_single() - Map buffer for single DMA transfer
+ * @dev: device object of a device doing DMA
+ * @buf: destination/source buffer for DMA
+ * @len: length of transfer
+ * @force_bounce: if true, force the use of a bounce buffer;
+ *		if false, the function will check whether a bounce buffer is required
+ * @dir: DMA direction
+ *
+ * Map buffer for a DMA transfer and allocate a bounce buffer if required.
+ *
+ * Return: I3C DMA transfer descriptor or NULL in case of error.
+ */
+struct i3c_dma *i3c_master_dma_map_single(struct device *dev, void *buf,
+		size_t len, bool force_bounce, enum dma_data_direction dir)
+{
+	struct i3c_dma *dma_xfer __free(kfree) = NULL;
+	void *bounce __free(kfree) = NULL;
+	void *dma_buf = buf;
+
+	dma_xfer = kzalloc(sizeof(*dma_xfer), GFP_KERNEL);
+	if (!dma_xfer)
+		return NULL;
+
+	dma_xfer->dev = dev;
+	dma_xfer->buf = buf;
+	dma_xfer->dir = dir;
+	dma_xfer->len = len;
+	dma_xfer->map_len = len;
+
+	if (is_vmalloc_addr(buf))
+		force_bounce = true;
+
+	if (force_bounce) {
+		dma_xfer->map_len = ALIGN(len, cache_line_size());
+		if (dir == DMA_FROM_DEVICE)
+			bounce = kzalloc(dma_xfer->map_len, GFP_KERNEL);
+		else
+			bounce = kmemdup(buf, dma_xfer->map_len, GFP_KERNEL);
+		if (!bounce)
+			return NULL;
+		dma_buf = bounce;
+	}
+
+	dma_xfer->addr = dma_map_single(dev, dma_buf, dma_xfer->map_len, dir);
+	if (dma_mapping_error(dev, dma_xfer->addr))
+		return NULL;
+
+	dma_xfer->bounce_buf = no_free_ptr(bounce);
+	return no_free_ptr(dma_xfer);
+}
+EXPORT_SYMBOL_GPL(i3c_master_dma_map_single);
+
+/**
+ * i3c_master_dma_unmap_single() - Unmap buffer after DMA
+ * @dma_xfer: DMA transfer and mapping descriptor
+ *
+ * Unmap the buffer and clean up the DMA transfer descriptor.
+ */ +void i3c_master_dma_unmap_single(struct i3c_dma *dma_xfer) +{ + dma_unmap_single(dma_xfer->dev, dma_xfer->addr, + dma_xfer->map_len, dma_xfer->dir); + if (dma_xfer->bounce_buf) { + if (dma_xfer->dir == DMA_FROM_DEVICE) + memcpy(dma_xfer->buf, dma_xfer->bounce_buf, + dma_xfer->len); + kfree(dma_xfer->bounce_buf); + } + kfree(dma_xfer); +} +EXPORT_SYMBOL_GPL(i3c_master_dma_unmap_single); + /** * i3c_master_set_info() - set master device information * @master: master used to send frames on the bus diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 043f5c7ff398ff..c52a82dd79a634 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -558,6 +558,26 @@ struct i3c_master_controller { #define i3c_bus_for_each_i3cdev(bus, dev) \ list_for_each_entry(dev, &(bus)->devs.i3c, common.node) +/** + * struct i3c_dma - DMA transfer and mapping descriptor + * @dev: device object of a device doing DMA + * @buf: destination/source buffer for DMA + * @len: length of transfer + * @map_len: length of DMA mapping + * @addr: mapped DMA address for a Host Controller Driver + * @dir: DMA direction + * @bounce_buf: an allocated bounce buffer if transfer needs it or NULL + */ +struct i3c_dma { + struct device *dev; + void *buf; + size_t len; + size_t map_len; + dma_addr_t addr; + enum dma_data_direction dir; + void *bounce_buf; +}; + int i3c_master_do_i2c_xfers(struct i3c_master_controller *master, const struct i2c_msg *xfers, int nxfers); @@ -575,6 +595,12 @@ int i3c_master_get_free_addr(struct i3c_master_controller *master, int i3c_master_add_i3c_dev_locked(struct i3c_master_controller *master, u8 addr); int i3c_master_do_daa(struct i3c_master_controller *master); +struct i3c_dma *i3c_master_dma_map_single(struct device *dev, void *ptr, + size_t len, bool force_bounce, + enum dma_data_direction dir); +void i3c_master_dma_unmap_single(struct i3c_dma *dma_xfer); +DEFINE_FREE(i3c_master_dma_unmap_single, void *, + if (_T) i3c_master_dma_unmap_single(_T)) int i3c_master_set_info(struct i3c_master_controller *master, const struct i3c_device_info *info); From 1c46bfc4f75e7d43261af65b64e7d3dc4a30daa9 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 22 Aug 2025 13:56:28 +0300 Subject: [PATCH 1888/3206] i3c: mipi-i3c-hci: Use core helpers for DMA mapping and bounce buffering So far only I3C private and I2C transfers have required a bounce buffer for DMA transfers when buffer is not DMA'able. It was observed that when the device DMA is IOMMU mapped and the receive length is not a multiple of DWORDs (32-bit), the last DWORD is padded with stale data from the RX FIFO, corrupting 1-3 bytes beyond the expected data. A similar issue, though less severe, occurs when an I3C target returns less data than requested. In this case, the padding does not exceed the requested number of bytes, assuming the device DMA is not IOMMU mapped. Therefore, all I3C private transfer, CCC command payload and I2C transfer receive buffers must be properly sized for the DMA being IOMMU mapped. Even if those buffers are already DMA safe, their size may not be DWORD aligned. To prepare for the device DMA being IOMMU mapped and to address the above issue, use helpers from I3C core for DMA mapping and bounce buffering for all DMA transfers. For now, require bounce buffer only when the buffer is in the vmalloc() area to avoid unnecessary copying with CCC commands and DMA-safe I2C transfers. 
Signed-off-by: Jarkko Nikula Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250822105630.2820009-3-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/mipi-i3c-hci/core.c | 34 -------------------------- drivers/i3c/master/mipi-i3c-hci/dma.c | 27 +++++++++----------- drivers/i3c/master/mipi-i3c-hci/hci.h | 3 +-- 3 files changed, 12 insertions(+), 52 deletions(-) diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c index 60f1175f1f37cc..b2977b6ac9f7a1 100644 --- a/drivers/i3c/master/mipi-i3c-hci/core.c +++ b/drivers/i3c/master/mipi-i3c-hci/core.c @@ -272,34 +272,6 @@ static int i3c_hci_daa(struct i3c_master_controller *m) return hci->cmd->perform_daa(hci); } -static int i3c_hci_alloc_safe_xfer_buf(struct i3c_hci *hci, - struct hci_xfer *xfer) -{ - if (hci->io != &mipi_i3c_hci_dma || - xfer->data == NULL || !is_vmalloc_addr(xfer->data)) - return 0; - - if (xfer->rnw) - xfer->bounce_buf = kzalloc(xfer->data_len, GFP_KERNEL); - else - xfer->bounce_buf = kmemdup(xfer->data, - xfer->data_len, GFP_KERNEL); - - return xfer->bounce_buf == NULL ? -ENOMEM : 0; -} - -static void i3c_hci_free_safe_xfer_buf(struct i3c_hci *hci, - struct hci_xfer *xfer) -{ - if (hci->io != &mipi_i3c_hci_dma || xfer->bounce_buf == NULL) - return; - - if (xfer->rnw) - memcpy(xfer->data, xfer->bounce_buf, xfer->data_len); - - kfree(xfer->bounce_buf); -} - static int i3c_hci_priv_xfers(struct i3c_dev_desc *dev, struct i3c_priv_xfer *i3c_xfers, int nxfers) @@ -333,9 +305,6 @@ static int i3c_hci_priv_xfers(struct i3c_dev_desc *dev, } hci->cmd->prep_i3c_xfer(hci, dev, &xfer[i]); xfer[i].cmd_desc[0] |= CMD_0_ROC; - ret = i3c_hci_alloc_safe_xfer_buf(hci, &xfer[i]); - if (ret) - goto out; } last = i - 1; xfer[last].cmd_desc[0] |= CMD_0_TOC; @@ -359,9 +328,6 @@ static int i3c_hci_priv_xfers(struct i3c_dev_desc *dev, } out: - for (i = 0; i < nxfers; i++) - i3c_hci_free_safe_xfer_buf(hci, &xfer[i]); - hci_free_xfer(xfer, nxfers); return ret; } diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c index 491dfe70b66002..351851859f02fe 100644 --- a/drivers/i3c/master/mipi-i3c-hci/dma.c +++ b/drivers/i3c/master/mipi-i3c-hci/dma.c @@ -349,9 +349,7 @@ static void hci_dma_unmap_xfer(struct i3c_hci *hci, xfer = xfer_list + i; if (!xfer->data) continue; - dma_unmap_single(&hci->master.dev, - xfer->data_dma, xfer->data_len, - xfer->rnw ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + i3c_master_dma_unmap_single(xfer->dma); } } @@ -362,7 +360,6 @@ static int hci_dma_queue_xfer(struct i3c_hci *hci, struct hci_rh_data *rh; unsigned int i, ring, enqueue_ptr; u32 op1_val, op2_val; - void *buf; /* For now we only use ring 0 */ ring = 0; @@ -373,6 +370,8 @@ static int hci_dma_queue_xfer(struct i3c_hci *hci, for (i = 0; i < n; i++) { struct hci_xfer *xfer = xfer_list + i; u32 *ring_data = rh->xfer + rh->xfer_struct_sz * enqueue_ptr; + enum dma_data_direction dir = xfer->rnw ? DMA_FROM_DEVICE : + DMA_TO_DEVICE; /* store cmd descriptor */ *ring_data++ = xfer->cmd_desc[0]; @@ -391,21 +390,17 @@ static int hci_dma_queue_xfer(struct i3c_hci *hci, /* 2nd and 3rd words of Data Buffer Descriptor Structure */ if (xfer->data) { - buf = xfer->bounce_buf ? xfer->bounce_buf : xfer->data; - xfer->data_dma = - dma_map_single(&hci->master.dev, - buf, - xfer->data_len, - xfer->rnw ? 
-						DMA_FROM_DEVICE :
-						DMA_TO_DEVICE);
-			if (dma_mapping_error(&hci->master.dev,
-					      xfer->data_dma)) {
+			xfer->dma = i3c_master_dma_map_single(&hci->master.dev,
+							      xfer->data,
+							      xfer->data_len,
+							      false,
+							      dir);
+			if (!xfer->dma) {
 				hci_dma_unmap_xfer(hci, xfer_list, i);
 				return -ENOMEM;
 			}
-			*ring_data++ = lower_32_bits(xfer->data_dma);
-			*ring_data++ = upper_32_bits(xfer->data_dma);
+			*ring_data++ = lower_32_bits(xfer->dma->addr);
+			*ring_data++ = upper_32_bits(xfer->dma->addr);
 		} else {
 			*ring_data++ = 0;
 			*ring_data++ = 0;
diff --git a/drivers/i3c/master/mipi-i3c-hci/hci.h b/drivers/i3c/master/mipi-i3c-hci/hci.h
index 69ea1d10414b8c..33bc4906df1ff7 100644
--- a/drivers/i3c/master/mipi-i3c-hci/hci.h
+++ b/drivers/i3c/master/mipi-i3c-hci/hci.h
@@ -94,8 +94,7 @@ struct hci_xfer {
 		};
 		struct {
 			/* DMA specific */
-			dma_addr_t data_dma;
-			void *bounce_buf;
+			struct i3c_dma *dma;
 			int ring_number;
 			int ring_entry;
 		};

From 9e23897bca622eae20d2c038cc09b45bfaf5aed2 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula
Date: Fri, 22 Aug 2025 13:56:29 +0300
Subject: [PATCH 1889/3206] i3c: mipi-i3c-hci: Use physical device pointer with
 DMA API

DMA transfers fault on Intel hardware when the IOMMU is enabled, and
driver initialization fails when attempting the first transfer:

DMAR: DRHD: handling fault status reg 2
DMAR: [DMA Read NO_PASID] Request device [00:11.0] fault addr 0x676e3000 [fault reason 0x71] SM: Present bit in first-level paging entry is clear
i3c mipi-i3c-hci.0: ring 0: Transfer Aborted
mipi-i3c-hci mipi-i3c-hci.0: probe with driver mipi-i3c-hci failed with error -62

The reason is that the IOMMU setup is done only for the physical
devices, not for the virtual I3C Controller device object. Therefore,
use the pointer to a physical device object with the DMA API.

Due to data corruption observed when the device DMA is IOMMU mapped, a
properly sized receive bounce buffer is required if the transfer length
is not a multiple of DWORDs.
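To make the bounce condition concrete: a 6-byte receive is not DWORD-sized, so on an IOMMU-mapped device its last DWORD would be padded with stale RX FIFO data. A minimal sketch of the check, mirroring the need_bounce logic added in the dma.c hunk below (the helper name is invented):

	static bool rx_needs_bounce(struct device *dma_dev, size_t len)
	{
		/* IOMMU-mapped device and a length that is not a multiple of 4 */
		return device_iommu_mapped(dma_dev) && len != ALIGN(len, 4);
	}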
Reported-by: Signed-off-by: Jarkko Nikula Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250822105630.2820009-4-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/mipi-i3c-hci/dma.c | 46 +++++++++++++++++++-------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c index 351851859f02fe..09688ada4912bf 100644 --- a/drivers/i3c/master/mipi-i3c-hci/dma.c +++ b/drivers/i3c/master/mipi-i3c-hci/dma.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "hci.h" #include "cmd.h" @@ -138,6 +139,7 @@ struct hci_rh_data { }; struct hci_rings_data { + struct device *sysdev; unsigned int total; struct hci_rh_data headers[] __counted_by(total); }; @@ -165,20 +167,20 @@ static void hci_dma_cleanup(struct i3c_hci *hci) rh_reg_write(IBI_SETUP, 0); if (rh->xfer) - dma_free_coherent(&hci->master.dev, + dma_free_coherent(rings->sysdev, rh->xfer_struct_sz * rh->xfer_entries, rh->xfer, rh->xfer_dma); if (rh->resp) - dma_free_coherent(&hci->master.dev, + dma_free_coherent(rings->sysdev, rh->resp_struct_sz * rh->xfer_entries, rh->resp, rh->resp_dma); kfree(rh->src_xfers); if (rh->ibi_status) - dma_free_coherent(&hci->master.dev, + dma_free_coherent(rings->sysdev, rh->ibi_status_sz * rh->ibi_status_entries, rh->ibi_status, rh->ibi_status_dma); if (rh->ibi_data_dma) - dma_unmap_single(&hci->master.dev, rh->ibi_data_dma, + dma_unmap_single(rings->sysdev, rh->ibi_data_dma, rh->ibi_chunk_sz * rh->ibi_chunks_total, DMA_FROM_DEVICE); kfree(rh->ibi_data); @@ -194,11 +196,23 @@ static int hci_dma_init(struct i3c_hci *hci) { struct hci_rings_data *rings; struct hci_rh_data *rh; + struct device *sysdev; u32 regval; unsigned int i, nr_rings, xfers_sz, resps_sz; unsigned int ibi_status_ring_sz, ibi_data_ring_sz; int ret; + /* + * Set pointer to a physical device that does DMA and has IOMMU setup + * done for it in case of enabled IOMMU and use it with the DMA API. + * Here such device is either + * "mipi-i3c-hci" platform device (OF/ACPI enumeration) parent or + * grandparent (PCI enumeration). 
+ */ + sysdev = hci->master.dev.parent; + if (sysdev->parent && dev_is_pci(sysdev->parent)) + sysdev = sysdev->parent; + regval = rhs_reg_read(CONTROL); nr_rings = FIELD_GET(MAX_HEADER_COUNT_CAP, regval); dev_info(&hci->master.dev, "%d DMA rings available\n", nr_rings); @@ -213,6 +227,7 @@ static int hci_dma_init(struct i3c_hci *hci) return -ENOMEM; hci->io_data = rings; rings->total = nr_rings; + rings->sysdev = sysdev; regval = FIELD_PREP(MAX_HEADER_COUNT, rings->total); rhs_reg_write(CONTROL, regval); @@ -239,9 +254,9 @@ static int hci_dma_init(struct i3c_hci *hci) xfers_sz = rh->xfer_struct_sz * rh->xfer_entries; resps_sz = rh->resp_struct_sz * rh->xfer_entries; - rh->xfer = dma_alloc_coherent(&hci->master.dev, xfers_sz, + rh->xfer = dma_alloc_coherent(rings->sysdev, xfers_sz, &rh->xfer_dma, GFP_KERNEL); - rh->resp = dma_alloc_coherent(&hci->master.dev, resps_sz, + rh->resp = dma_alloc_coherent(rings->sysdev, resps_sz, &rh->resp_dma, GFP_KERNEL); rh->src_xfers = kmalloc_array(rh->xfer_entries, sizeof(*rh->src_xfers), @@ -295,16 +310,16 @@ static int hci_dma_init(struct i3c_hci *hci) ibi_data_ring_sz = rh->ibi_chunk_sz * rh->ibi_chunks_total; rh->ibi_status = - dma_alloc_coherent(&hci->master.dev, ibi_status_ring_sz, + dma_alloc_coherent(rings->sysdev, ibi_status_ring_sz, &rh->ibi_status_dma, GFP_KERNEL); rh->ibi_data = kmalloc(ibi_data_ring_sz, GFP_KERNEL); ret = -ENOMEM; if (!rh->ibi_status || !rh->ibi_data) goto err_out; rh->ibi_data_dma = - dma_map_single(&hci->master.dev, rh->ibi_data, + dma_map_single(rings->sysdev, rh->ibi_data, ibi_data_ring_sz, DMA_FROM_DEVICE); - if (dma_mapping_error(&hci->master.dev, rh->ibi_data_dma)) { + if (dma_mapping_error(rings->sysdev, rh->ibi_data_dma)) { rh->ibi_data_dma = 0; ret = -ENOMEM; goto err_out; @@ -372,6 +387,7 @@ static int hci_dma_queue_xfer(struct i3c_hci *hci, u32 *ring_data = rh->xfer + rh->xfer_struct_sz * enqueue_ptr; enum dma_data_direction dir = xfer->rnw ? 
DMA_FROM_DEVICE :
 							  DMA_TO_DEVICE;
+		bool need_bounce;
 
 		/* store cmd descriptor */
 		*ring_data++ = xfer->cmd_desc[0];
@@ -390,10 +406,13 @@ static int hci_dma_queue_xfer(struct i3c_hci *hci,
 
 		/* 2nd and 3rd words of Data Buffer Descriptor Structure */
 		if (xfer->data) {
-			xfer->dma = i3c_master_dma_map_single(&hci->master.dev,
+			need_bounce = device_iommu_mapped(rings->sysdev) &&
+				      xfer->rnw &&
+				      xfer->data_len != ALIGN(xfer->data_len, 4);
+			xfer->dma = i3c_master_dma_map_single(rings->sysdev,
 							      xfer->data,
 							      xfer->data_len,
-							      false,
+							      need_bounce,
 							      dir);
 			if (!xfer->dma) {
 				hci_dma_unmap_xfer(hci, xfer_list, i);
@@ -581,6 +600,7 @@ static void hci_dma_recycle_ibi_slot(struct i3c_hci *hci,
 
 static void hci_dma_process_ibi(struct i3c_hci *hci, struct hci_rh_data *rh)
 {
+	struct hci_rings_data *rings = hci->io_data;
 	struct i3c_dev_desc *dev;
 	struct i3c_hci_dev_data *dev_data;
 	struct hci_dma_dev_ibi_data *dev_ibi;
@@ -691,7 +711,7 @@ static void hci_dma_process_ibi(struct i3c_hci *hci, struct hci_rh_data *rh)
 			* rh->ibi_chunk_sz;
 		if (first_part > ibi_size)
 			first_part = ibi_size;
-		dma_sync_single_for_cpu(&hci->master.dev, ring_ibi_data_dma,
+		dma_sync_single_for_cpu(rings->sysdev, ring_ibi_data_dma,
 					first_part, DMA_FROM_DEVICE);
 		memcpy(slot->data, ring_ibi_data, first_part);
 
@@ -700,7 +720,7 @@ static void hci_dma_process_ibi(struct i3c_hci *hci, struct hci_rh_data *rh)
 		/* we wrap back to the start and copy remaining data */
 		ring_ibi_data = rh->ibi_data;
 		ring_ibi_data_dma = rh->ibi_data_dma;
-		dma_sync_single_for_cpu(&hci->master.dev, ring_ibi_data_dma,
+		dma_sync_single_for_cpu(rings->sysdev, ring_ibi_data_dma,
 					ibi_size - first_part, DMA_FROM_DEVICE);
 		memcpy(slot->data + first_part, ring_ibi_data,
 		       ibi_size - first_part);

From ec01115194142e39a06142447de88117365c90b3 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula
Date: Fri, 22 Aug 2025 13:56:30 +0300
Subject: [PATCH 1890/3206] i3c: mipi-i3c-hci: Use own DMA bounce buffer
 management for I2C transfers

Stop using the I2C DMA-safe API for two reasons:
- It is not needed if the driver is using PIO mode.
- DMA transfers need a DWORD-aligned receive bounce buffer when the
  device DMA is IOMMU mapped, which causes needless double bounce
  buffering in that case.

Cc: Billy Tsai
Signed-off-by: Jarkko Nikula
Reviewed-by: Frank Li
Link: https://lore.kernel.org/r/20250822105630.2820009-5-jarkko.nikula@linux.intel.com
Signed-off-by: Alexandre Belloni
---
 drivers/i3c/master/mipi-i3c-hci/core.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c
index b2977b6ac9f7a1..7a467ef657870a 100644
--- a/drivers/i3c/master/mipi-i3c-hci/core.c
+++ b/drivers/i3c/master/mipi-i3c-hci/core.c
@@ -348,7 +348,7 @@ static int i3c_hci_i2c_xfers(struct i2c_dev_desc *dev,
 		return -ENOMEM;
 
 	for (i = 0; i < nxfers; i++) {
-		xfer[i].data = i2c_get_dma_safe_msg_buf(&i2c_xfers[i], 1);
+		xfer[i].data = i2c_xfers[i].buf;
 		xfer[i].data_len = i2c_xfers[i].len;
 		xfer[i].rnw = i2c_xfers[i].flags & I2C_M_RD;
 		hci->cmd->prep_i2c_xfer(hci, dev, &xfer[i]);
@@ -374,10 +374,6 @@ static int i3c_hci_i2c_xfers(struct i2c_dev_desc *dev,
 	}
 
 out:
-	for (i = 0; i < nxfers; i++)
-		i2c_put_dma_safe_msg_buf(xfer[i].data, &i2c_xfers[i],
-					 ret ?
false : true); - hci_free_xfer(xfer, nxfers); return ret; } From fc09ffd3a658901b9262f5aedffb4663a2b1dcf5 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Wed, 27 Aug 2025 13:30:05 +0300 Subject: [PATCH 1891/3206] i3c: mipi-i3c-hci: Change interrupt status prints to dev_dbg() Change interrupt status prints from local DBG() macro to dev_dbg() in order to make it easier to enable them without needing to recompile code with DEBUG defined. While doing so, spell out the status register names as they are in the specification to make it easier to differentiate between different interrupt status registers. Since dynamic debug prints can include the line number remove the "(in)" and "(out)" markers from the PIO interrupt status prints. Prefix the ring interrupt status print using "Ring %d" instead of "rh%d" to make it uniform across all other prints showing the ring number. Signed-off-by: Jarkko Nikula Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250827103009.243771-2-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/mipi-i3c-hci/core.c | 2 +- drivers/i3c/master/mipi-i3c-hci/dma.c | 3 ++- drivers/i3c/master/mipi-i3c-hci/pio.c | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c index 7a467ef657870a..d532933ac7ab01 100644 --- a/drivers/i3c/master/mipi-i3c-hci/core.c +++ b/drivers/i3c/master/mipi-i3c-hci/core.c @@ -553,7 +553,7 @@ static irqreturn_t i3c_hci_irq_handler(int irq, void *dev_id) val = reg_read(INTR_STATUS); reg_write(INTR_STATUS, val); - DBG("INTR_STATUS = %#x", val); + dev_dbg(&hci->master.dev, "INTR_STATUS %#x", val); if (val) result = IRQ_HANDLED; diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c index 09688ada4912bf..f5f5ab4db172e8 100644 --- a/drivers/i3c/master/mipi-i3c-hci/dma.c +++ b/drivers/i3c/master/mipi-i3c-hci/dma.c @@ -760,7 +760,8 @@ static bool hci_dma_irq_handler(struct i3c_hci *hci) rh = &rings->headers[i]; status = rh_reg_read(INTR_STATUS); - DBG("rh%d status: %#x", i, status); + dev_dbg(&hci->master.dev, "Ring %d: RH_INTR_STATUS %#x", + i, status); if (!status) continue; rh_reg_write(INTR_STATUS, status); diff --git a/drivers/i3c/master/mipi-i3c-hci/pio.c b/drivers/i3c/master/mipi-i3c-hci/pio.c index 2fc71e6969111a..cde883137bc756 100644 --- a/drivers/i3c/master/mipi-i3c-hci/pio.c +++ b/drivers/i3c/master/mipi-i3c-hci/pio.c @@ -986,7 +986,8 @@ static bool hci_pio_irq_handler(struct i3c_hci *hci) spin_lock(&pio->lock); status = pio_reg_read(INTR_STATUS); - DBG("(in) status: %#x/%#x", status, pio->enabled_irqs); + dev_dbg(&hci->master.dev, "PIO_INTR_STATUS %#x/%#x", + status, pio->enabled_irqs); status &= pio->enabled_irqs | STAT_LATENCY_WARNINGS; if (!status) { spin_unlock(&pio->lock); @@ -1023,8 +1024,8 @@ static bool hci_pio_irq_handler(struct i3c_hci *hci) pio->enabled_irqs &= ~STAT_CMD_QUEUE_READY; pio_reg_write(INTR_SIGNAL_ENABLE, pio->enabled_irqs); - DBG("(out) status: %#x/%#x", - pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); + dev_dbg(&hci->master.dev, "PIO_INTR_STATUS %#x/%#x", + pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); spin_unlock(&pio->lock); return true; } From 422d0e401e13f6effdff0ae669fe1c81059cb91f Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Wed, 27 Aug 2025 13:30:06 +0300 Subject: [PATCH 1892/3206] i3c: mipi-i3c-hci: Remove nonexistent ring interrupt Ring interrupt bit 7, INTR_WARN_INS_STOP_MODE was probably drafted at 
some point, but it is marked as reserved in the MIPI I3C HCI
specification versions 1.1 and 1.2, which came out after the initial
code, as well as in the earlier specification versions, so remove it.

Signed-off-by: Jarkko Nikula
Reviewed-by: Frank Li
Link: https://lore.kernel.org/r/20250827103009.243771-3-jarkko.nikula@linux.intel.com
Signed-off-by: Alexandre Belloni
---
 drivers/i3c/master/mipi-i3c-hci/dma.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c
index f5f5ab4db172e8..8bc9de1895435c 100644
--- a/drivers/i3c/master/mipi-i3c-hci/dma.c
+++ b/drivers/i3c/master/mipi-i3c-hci/dma.c
@@ -77,7 +77,6 @@
 #define INTR_TRANSFER_COMPLETION	BIT(11)
 #define INTR_RING_OP			BIT(10)
 #define INTR_TRANSFER_ERR		BIT(9)
-#define INTR_WARN_INS_STOP_MODE		BIT(7)
 #define INTR_IBI_RING_FULL		BIT(6)
 #define INTR_TRANSFER_ABORT		BIT(5)
 
@@ -278,7 +277,6 @@ static int hci_dma_init(struct i3c_hci *hci)
 			     INTR_TRANSFER_COMPLETION |
 			     INTR_RING_OP |
 			     INTR_TRANSFER_ERR |
-			     INTR_WARN_INS_STOP_MODE |
 			     INTR_IBI_RING_FULL |
 			     INTR_TRANSFER_ABORT);
 
@@ -795,9 +793,6 @@ static bool hci_dma_irq_handler(struct i3c_hci *hci)
 						     RING_CTRL_RUN_STOP);
 			}
 		}
-		if (status & INTR_WARN_INS_STOP_MODE)
-			dev_warn_ratelimited(&hci->master.dev,
-					     "ring %d: Inserted Stop on Mode Change\n", i);
 		if (status & INTR_IBI_RING_FULL)
 			dev_err_ratelimited(&hci->master.dev,
 					    "ring %d: IBI Ring Full Condition\n", i);

From 4470c85ed54d30b1e25f2096c808459204725045 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula
Date: Wed, 27 Aug 2025 13:30:07 +0300
Subject: [PATCH 1893/3206] i3c: mipi-i3c-hci: Uniform ring number printouts

Use the same "Ring" prefix in all prints that print out the ring
number.

Signed-off-by: Jarkko Nikula
Reviewed-by: Frank Li
Link: https://lore.kernel.org/r/20250827103009.243771-4-jarkko.nikula@linux.intel.com
Signed-off-by: Alexandre Belloni
---
 drivers/i3c/master/mipi-i3c-hci/dma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c
index 8bc9de1895435c..3fadacbda5829e 100644
--- a/drivers/i3c/master/mipi-i3c-hci/dma.c
+++ b/drivers/i3c/master/mipi-i3c-hci/dma.c
@@ -775,7 +775,7 @@ static bool hci_dma_irq_handler(struct i3c_hci *hci)
 			u32 ring_status;
 
 			dev_notice_ratelimited(&hci->master.dev,
-					       "ring %d: Transfer Aborted\n", i);
+					       "Ring %d: Transfer Aborted\n", i);
 			mipi_i3c_hci_resume(hci);
 			ring_status = rh_reg_read(RING_STATUS);
 			if (!(ring_status & RING_STATUS_RUNNING) &&
@@ -795,7 +795,7 @@ static bool hci_dma_irq_handler(struct i3c_hci *hci)
 		}
 		if (status & INTR_IBI_RING_FULL)
 			dev_err_ratelimited(&hci->master.dev,
-					    "ring %d: IBI Ring Full Condition\n", i);
+					    "Ring %d: IBI Ring Full Condition\n", i);
 
 		handled = true;
 	}

From a00e15f34e5e08481c858af6ed694d98a0fbf744 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula
Date: Wed, 27 Aug 2025 13:30:08 +0300
Subject: [PATCH 1894/3206] i3c: mipi-i3c-hci: Remove function enter DBG()
 printouts

These function-entry DBG("") printouts are not very useful from an
error-reporting point of view because they require recompiling the
code. If needed, they can be replaced with more informative debug
prints later, so remove them for now.
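Purely as a hypothetical illustration of what a "more informative" replacement could look like if such a trace were ever wanted again (the message and context are made up):

	dev_dbg(&hci->master.dev, "attaching I3C dev, dyn_addr=%#x\n",
		dev->info.dyn_addr);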
Signed-off-by: Jarkko Nikula Link: https://lore.kernel.org/r/20250827103009.243771-5-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/mipi-i3c-hci/core.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c index d532933ac7ab01..9932945ecf06d7 100644 --- a/drivers/i3c/master/mipi-i3c-hci/core.c +++ b/drivers/i3c/master/mipi-i3c-hci/core.c @@ -121,8 +121,6 @@ static int i3c_hci_bus_init(struct i3c_master_controller *m) struct i3c_device_info info; int ret; - DBG(""); - if (hci->cmd == &mipi_i3c_hci_cmd_v1) { ret = mipi_i3c_hci_dat_v1.init(hci); if (ret) @@ -159,8 +157,6 @@ static void i3c_hci_bus_cleanup(struct i3c_master_controller *m) struct i3c_hci *hci = to_i3c_hci(m); struct platform_device *pdev = to_platform_device(m->dev.parent); - DBG(""); - reg_clear(HC_CONTROL, HC_CONTROL_BUS_ENABLE); synchronize_irq(platform_get_irq(pdev, 0)); hci->io->cleanup(hci); @@ -267,8 +263,6 @@ static int i3c_hci_daa(struct i3c_master_controller *m) { struct i3c_hci *hci = to_i3c_hci(m); - DBG(""); - return hci->cmd->perform_daa(hci); } @@ -385,8 +379,6 @@ static int i3c_hci_attach_i3c_dev(struct i3c_dev_desc *dev) struct i3c_hci_dev_data *dev_data; int ret; - DBG(""); - dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); if (!dev_data) return -ENOMEM; @@ -410,8 +402,6 @@ static int i3c_hci_reattach_i3c_dev(struct i3c_dev_desc *dev, u8 old_dyn_addr) struct i3c_hci *hci = to_i3c_hci(m); struct i3c_hci_dev_data *dev_data = i3c_dev_get_master_data(dev); - DBG(""); - if (hci->cmd == &mipi_i3c_hci_cmd_v1) mipi_i3c_hci_dat_v1.set_dynamic_addr(hci, dev_data->dat_idx, dev->info.dyn_addr); @@ -424,8 +414,6 @@ static void i3c_hci_detach_i3c_dev(struct i3c_dev_desc *dev) struct i3c_hci *hci = to_i3c_hci(m); struct i3c_hci_dev_data *dev_data = i3c_dev_get_master_data(dev); - DBG(""); - i3c_dev_set_master_data(dev, NULL); if (hci->cmd == &mipi_i3c_hci_cmd_v1) mipi_i3c_hci_dat_v1.free_entry(hci, dev_data->dat_idx); @@ -439,8 +427,6 @@ static int i3c_hci_attach_i2c_dev(struct i2c_dev_desc *dev) struct i3c_hci_dev_data *dev_data; int ret; - DBG(""); - if (hci->cmd != &mipi_i3c_hci_cmd_v1) return 0; dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); @@ -464,8 +450,6 @@ static void i3c_hci_detach_i2c_dev(struct i2c_dev_desc *dev) struct i3c_hci *hci = to_i3c_hci(m); struct i3c_hci_dev_data *dev_data = i2c_dev_get_master_data(dev); - DBG(""); - if (dev_data) { i2c_dev_set_master_data(dev, NULL); if (hci->cmd == &mipi_i3c_hci_cmd_v1) From a4ea64abb480a74c625424e617f6be80cfc26898 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Wed, 27 Aug 2025 13:30:09 +0300 Subject: [PATCH 1895/3206] i3c: mipi-i3c-hci: Convert remaining DBG() prints to dev_dbg() Get rid of local DBG() macro and convert remaining debug prints to dev_dbg() which can be controlled without code recompile when kernel is built with dynamic debug support. 
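For reference, once these are dev_dbg() calls they can be switched on at run time through dynamic debug (assuming CONFIG_DYNAMIC_DEBUG and a mounted debugfs), for example:

	echo 'file drivers/i3c/master/mipi-i3c-hci/* +p' > /sys/kernel/debug/dynamic_debug/control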
Signed-off-by: Jarkko Nikula Link: https://lore.kernel.org/r/20250827103009.243771-6-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/mipi-i3c-hci/cmd_v1.c | 9 ++- drivers/i3c/master/mipi-i3c-hci/cmd_v2.c | 7 ++- drivers/i3c/master/mipi-i3c-hci/core.c | 16 ++--- drivers/i3c/master/mipi-i3c-hci/dma.c | 15 +++-- drivers/i3c/master/mipi-i3c-hci/ext_caps.c | 11 ++-- drivers/i3c/master/mipi-i3c-hci/hci.h | 3 - drivers/i3c/master/mipi-i3c-hci/pio.c | 68 +++++++++++++--------- 7 files changed, 73 insertions(+), 56 deletions(-) diff --git a/drivers/i3c/master/mipi-i3c-hci/cmd_v1.c b/drivers/i3c/master/mipi-i3c-hci/cmd_v1.c index dd636094b07f59..eb8a3ae2990d77 100644 --- a/drivers/i3c/master/mipi-i3c-hci/cmd_v1.c +++ b/drivers/i3c/master/mipi-i3c-hci/cmd_v1.c @@ -317,7 +317,9 @@ static int hci_cmd_v1_daa(struct i3c_hci *hci) break; next_addr = ret; - DBG("next_addr = 0x%02x, DAA using DAT %d", next_addr, dat_idx); + dev_dbg(&hci->master.dev, + "next_addr = 0x%02x, DAA using DAT %d", + next_addr, dat_idx); mipi_i3c_hci_dat_v1.set_dynamic_addr(hci, dat_idx, next_addr); mipi_i3c_hci_dct_index_reset(hci); @@ -349,8 +351,9 @@ static int hci_cmd_v1_daa(struct i3c_hci *hci) } i3c_hci_dct_get_val(hci, 0, &pid, &dcr, &bcr); - DBG("assigned address %#x to device PID=0x%llx DCR=%#x BCR=%#x", - next_addr, pid, dcr, bcr); + dev_dbg(&hci->master.dev, + "assigned address %#x to device PID=0x%llx DCR=%#x BCR=%#x", + next_addr, pid, dcr, bcr); mipi_i3c_hci_dat_v1.free_entry(hci, dat_idx); dat_idx = -1; diff --git a/drivers/i3c/master/mipi-i3c-hci/cmd_v2.c b/drivers/i3c/master/mipi-i3c-hci/cmd_v2.c index 4493b2b067cbce..efb4326a25b73e 100644 --- a/drivers/i3c/master/mipi-i3c-hci/cmd_v2.c +++ b/drivers/i3c/master/mipi-i3c-hci/cmd_v2.c @@ -261,7 +261,7 @@ static int hci_cmd_v2_daa(struct i3c_hci *hci) if (ret < 0) break; next_addr = ret; - DBG("next_addr = 0x%02x", next_addr); + dev_dbg(&hci->master.dev, "next_addr = 0x%02x", next_addr); xfer[0].cmd_tid = hci_get_tid(); xfer[0].cmd_desc[0] = CMD_0_ATTR_A | @@ -293,8 +293,9 @@ static int hci_cmd_v2_daa(struct i3c_hci *hci) pid = (pid << 32) | device_id[0]; bcr = FIELD_GET(W1_MASK(55, 48), device_id[1]); dcr = FIELD_GET(W1_MASK(63, 56), device_id[1]); - DBG("assigned address %#x to device PID=0x%llx DCR=%#x BCR=%#x", - next_addr, pid, dcr, bcr); + dev_dbg(&hci->master.dev, + "assigned address %#x to device PID=0x%llx DCR=%#x BCR=%#x", + next_addr, pid, dcr, bcr); /* * TODO: Extend the subsystem layer to allow for registering * new device and provide BCR/DCR/PID at the same time. 
diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c index 9932945ecf06d7..47e42cb4dbe71e 100644 --- a/drivers/i3c/master/mipi-i3c-hci/core.c +++ b/drivers/i3c/master/mipi-i3c-hci/core.c @@ -147,7 +147,7 @@ static int i3c_hci_bus_init(struct i3c_master_controller *m) amd_set_resp_buf_thld(hci); reg_set(HC_CONTROL, HC_CONTROL_BUS_ENABLE); - DBG("HC_CONTROL = %#x", reg_read(HC_CONTROL)); + dev_dbg(&hci->master.dev, "HC_CONTROL = %#x", reg_read(HC_CONTROL)); return 0; } @@ -192,8 +192,8 @@ static int i3c_hci_send_ccc_cmd(struct i3c_master_controller *m, DECLARE_COMPLETION_ONSTACK(done); int i, last, ret = 0; - DBG("cmd=%#x rnw=%d ndests=%d data[0].len=%d", - ccc->id, ccc->rnw, ccc->ndests, ccc->dests[0].payload.len); + dev_dbg(&hci->master.dev, "cmd=%#x rnw=%d ndests=%d data[0].len=%d", + ccc->id, ccc->rnw, ccc->ndests, ccc->dests[0].payload.len); xfer = hci_alloc_xfer(nxfers); if (!xfer) @@ -251,8 +251,8 @@ static int i3c_hci_send_ccc_cmd(struct i3c_master_controller *m, } if (ccc->rnw) - DBG("got: %*ph", - ccc->dests[0].payload.len, ccc->dests[0].payload.data); + dev_dbg(&hci->master.dev, "got: %*ph", + ccc->dests[0].payload.len, ccc->dests[0].payload.data); out: hci_free_xfer(xfer, nxfers); @@ -277,7 +277,7 @@ static int i3c_hci_priv_xfers(struct i3c_dev_desc *dev, unsigned int size_limit; int i, last, ret = 0; - DBG("nxfers = %d", nxfers); + dev_dbg(&hci->master.dev, "nxfers = %d", nxfers); xfer = hci_alloc_xfer(nxfers); if (!xfer) @@ -335,7 +335,7 @@ static int i3c_hci_i2c_xfers(struct i2c_dev_desc *dev, DECLARE_COMPLETION_ONSTACK(done); int i, last, ret = 0; - DBG("nxfers = %d", nxfers); + dev_dbg(&hci->master.dev, "nxfers = %d", nxfers); xfer = hci_alloc_xfer(nxfers); if (!xfer) @@ -587,7 +587,7 @@ static int i3c_hci_init(struct i3c_hci *hci) } hci->caps = reg_read(HC_CAPABILITIES); - DBG("caps = %#x", hci->caps); + dev_dbg(&hci->master.dev, "caps = %#x", hci->caps); size_in_dwords = hci->version_major < 1 || (hci->version_major == 1 && hci->version_minor < 1); diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c index 3fadacbda5829e..c401a9425cdc59 100644 --- a/drivers/i3c/master/mipi-i3c-hci/dma.c +++ b/drivers/i3c/master/mipi-i3c-hci/dma.c @@ -248,8 +248,9 @@ static int hci_dma_init(struct i3c_hci *hci) regval = rh_reg_read(CR_SETUP); rh->xfer_struct_sz = FIELD_GET(CR_XFER_STRUCT_SIZE, regval); rh->resp_struct_sz = FIELD_GET(CR_RESP_STRUCT_SIZE, regval); - DBG("xfer_struct_sz = %d, resp_struct_sz = %d", - rh->xfer_struct_sz, rh->resp_struct_sz); + dev_dbg(&hci->master.dev, + "xfer_struct_sz = %d, resp_struct_sz = %d", + rh->xfer_struct_sz, rh->resp_struct_sz); xfers_sz = rh->xfer_struct_sz * rh->xfer_entries; resps_sz = rh->resp_struct_sz * rh->xfer_entries; @@ -523,11 +524,11 @@ static void hci_dma_xfer_done(struct i3c_hci *hci, struct hci_rh_data *rh) ring_resp = rh->resp + rh->resp_struct_sz * done_ptr; resp = *ring_resp; tid = RESP_TID(resp); - DBG("resp = 0x%08x", resp); + dev_dbg(&hci->master.dev, "resp = 0x%08x", resp); xfer = rh->src_xfers[done_ptr]; if (!xfer) { - DBG("orphaned ring entry"); + dev_dbg(&hci->master.dev, "orphaned ring entry"); } else { hci_dma_unmap_xfer(hci, xfer, 1); xfer->ring_entry = -1; @@ -630,7 +631,7 @@ static void hci_dma_process_ibi(struct i3c_hci *hci, struct hci_rh_data *rh) ring_ibi_status = rh->ibi_status + rh->ibi_status_sz * ptr; ibi_status = *ring_ibi_status; - DBG("status = %#x", ibi_status); + dev_dbg(&hci->master.dev, "status = %#x", ibi_status); if 
(ibi_status_error) { /* we no longer care */ @@ -658,7 +659,9 @@ static void hci_dma_process_ibi(struct i3c_hci *hci, struct hci_rh_data *rh) if (last_ptr == -1) { /* this IBI sequence is not yet complete */ - DBG("no LAST_STATUS available (e=%d d=%d)", enq_ptr, deq_ptr); + dev_dbg(&hci->master.dev, + "no LAST_STATUS available (e=%d d=%d)", + enq_ptr, deq_ptr); return; } deq_ptr = last_ptr + 1; diff --git a/drivers/i3c/master/mipi-i3c-hci/ext_caps.c b/drivers/i3c/master/mipi-i3c-hci/ext_caps.c index 2e9b23efdc45da..7714f00ea9cc09 100644 --- a/drivers/i3c/master/mipi-i3c-hci/ext_caps.c +++ b/drivers/i3c/master/mipi-i3c-hci/ext_caps.c @@ -35,7 +35,7 @@ static int hci_extcap_hardware_id(struct i3c_hci *hci, void __iomem *base) switch (hci->vendor_mipi_id) { case MIPI_VENDOR_NXP: hci->quirks |= HCI_QUIRK_RAW_CCC; - DBG("raw CCC quirks set"); + dev_dbg(&hci->master.dev, "raw CCC quirks set"); break; } @@ -77,7 +77,8 @@ static int hci_extcap_xfer_modes(struct i3c_hci *hci, void __iomem *base) for (index = 0; index < entries; index++) { u32 mode_entry = readl(base); - DBG("mode %d: 0x%08x", index, mode_entry); + dev_dbg(&hci->master.dev, "mode %d: 0x%08x", + index, mode_entry); /* TODO: will be needed when I3C core does more than SDR */ base += 4; } @@ -97,7 +98,8 @@ static int hci_extcap_xfer_rates(struct i3c_hci *hci, void __iomem *base) dev_info(&hci->master.dev, "available data rates:\n"); for (index = 0; index < entries; index++) { rate_entry = readl(base); - DBG("entry %d: 0x%08x", index, rate_entry); + dev_dbg(&hci->master.dev, "entry %d: 0x%08x", + index, rate_entry); rate = FIELD_GET(XFERRATE_ACTUAL_RATE_KHZ, rate_entry); rate_id = FIELD_GET(XFERRATE_RATE_ID, rate_entry); mode_id = FIELD_GET(XFERRATE_MODE_ID, rate_entry); @@ -268,7 +270,8 @@ int i3c_hci_parse_ext_caps(struct i3c_hci *hci) cap_header = readl(curr_cap); cap_id = FIELD_GET(CAP_HEADER_ID, cap_header); cap_length = FIELD_GET(CAP_HEADER_LENGTH, cap_header); - DBG("id=0x%02x length=%d", cap_id, cap_length); + dev_dbg(&hci->master.dev, "id=0x%02x length=%d", + cap_id, cap_length); if (!cap_length) break; if (curr_cap + cap_length * 4 >= end) { diff --git a/drivers/i3c/master/mipi-i3c-hci/hci.h b/drivers/i3c/master/mipi-i3c-hci/hci.h index 33bc4906df1ff7..249ccb13c90928 100644 --- a/drivers/i3c/master/mipi-i3c-hci/hci.h +++ b/drivers/i3c/master/mipi-i3c-hci/hci.h @@ -12,9 +12,6 @@ #include -/* Handy logging macro to save on line length */ -#define DBG(x, ...) 
pr_devel("%s: " x "\n", __func__, ##__VA_ARGS__) - /* 32-bit word aware bit and mask macros */ #define W0_MASK(h, l) GENMASK((h) - 0, (l) - 0) #define W1_MASK(h, l) GENMASK((h) - 32, (l) - 32) diff --git a/drivers/i3c/master/mipi-i3c-hci/pio.c b/drivers/i3c/master/mipi-i3c-hci/pio.c index cde883137bc756..710faa46a00faa 100644 --- a/drivers/i3c/master/mipi-i3c-hci/pio.c +++ b/drivers/i3c/master/mipi-i3c-hci/pio.c @@ -213,8 +213,8 @@ static void hci_pio_cleanup(struct i3c_hci *hci) pio_reg_write(INTR_SIGNAL_ENABLE, 0x0); if (pio) { - DBG("status = %#x/%#x", - pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); + dev_dbg(&hci->master.dev, "status = %#x/%#x", + pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); BUG_ON(pio->curr_xfer); BUG_ON(pio->curr_rx); BUG_ON(pio->curr_tx); @@ -226,13 +226,17 @@ static void hci_pio_cleanup(struct i3c_hci *hci) static void hci_pio_write_cmd(struct i3c_hci *hci, struct hci_xfer *xfer) { - DBG("cmd_desc[%d] = 0x%08x", 0, xfer->cmd_desc[0]); - DBG("cmd_desc[%d] = 0x%08x", 1, xfer->cmd_desc[1]); + dev_dbg(&hci->master.dev, "cmd_desc[%d] = 0x%08x", + 0, xfer->cmd_desc[0]); + dev_dbg(&hci->master.dev, "cmd_desc[%d] = 0x%08x", + 1, xfer->cmd_desc[1]); pio_reg_write(COMMAND_QUEUE_PORT, xfer->cmd_desc[0]); pio_reg_write(COMMAND_QUEUE_PORT, xfer->cmd_desc[1]); if (hci->cmd == &mipi_i3c_hci_cmd_v2) { - DBG("cmd_desc[%d] = 0x%08x", 2, xfer->cmd_desc[2]); - DBG("cmd_desc[%d] = 0x%08x", 3, xfer->cmd_desc[3]); + dev_dbg(&hci->master.dev, "cmd_desc[%d] = 0x%08x", + 2, xfer->cmd_desc[2]); + dev_dbg(&hci->master.dev, "cmd_desc[%d] = 0x%08x", + 3, xfer->cmd_desc[3]); pio_reg_write(COMMAND_QUEUE_PORT, xfer->cmd_desc[2]); pio_reg_write(COMMAND_QUEUE_PORT, xfer->cmd_desc[3]); } @@ -254,7 +258,8 @@ static bool hci_pio_do_rx(struct i3c_hci *hci, struct hci_pio_data *pio) nr_words = min(xfer->data_left / 4, pio->rx_thresh_size); /* extract data from FIFO */ xfer->data_left -= nr_words * 4; - DBG("now %d left %d", nr_words * 4, xfer->data_left); + dev_dbg(&hci->master.dev, "now %d left %d", + nr_words * 4, xfer->data_left); while (nr_words--) *p++ = pio_reg_read(XFER_DATA_PORT); } @@ -269,7 +274,7 @@ static void hci_pio_do_trailing_rx(struct i3c_hci *hci, struct hci_xfer *xfer = pio->curr_rx; u32 *p; - DBG("%d remaining", count); + dev_dbg(&hci->master.dev, "%d remaining", count); p = xfer->data; p += (xfer->data_len - xfer->data_left) / 4; @@ -278,7 +283,8 @@ static void hci_pio_do_trailing_rx(struct i3c_hci *hci, unsigned int nr_words = count / 4; /* extract data from FIFO */ xfer->data_left -= nr_words * 4; - DBG("now %d left %d", nr_words * 4, xfer->data_left); + dev_dbg(&hci->master.dev, "now %d left %d", + nr_words * 4, xfer->data_left); while (nr_words--) *p++ = pio_reg_read(XFER_DATA_PORT); } @@ -321,7 +327,8 @@ static bool hci_pio_do_tx(struct i3c_hci *hci, struct hci_pio_data *pio) nr_words = min(xfer->data_left / 4, pio->tx_thresh_size); /* push data into the FIFO */ xfer->data_left -= nr_words * 4; - DBG("now %d left %d", nr_words * 4, xfer->data_left); + dev_dbg(&hci->master.dev, "now %d left %d", + nr_words * 4, xfer->data_left); while (nr_words--) pio_reg_write(XFER_DATA_PORT, *p++); } @@ -336,7 +343,7 @@ static bool hci_pio_do_tx(struct i3c_hci *hci, struct hci_pio_data *pio) */ if (!(pio_reg_read(INTR_STATUS) & STAT_TX_THLD)) return false; - DBG("trailing %d", xfer->data_left); + dev_dbg(&hci->master.dev, "trailing %d", xfer->data_left); pio_reg_write(XFER_DATA_PORT, *p); xfer->data_left = 0; } @@ -481,7 +488,7 @@ static bool 
hci_pio_process_resp(struct i3c_hci *hci, struct hci_pio_data *pio) u32 resp = pio_reg_read(RESPONSE_QUEUE_PORT); unsigned int tid = RESP_TID(resp); - DBG("resp = 0x%08x", resp); + dev_dbg(&hci->master.dev, "resp = 0x%08x", resp); if (tid != xfer->cmd_tid) { dev_err(&hci->master.dev, "response tid=%d when expecting %d\n", @@ -522,14 +529,15 @@ static bool hci_pio_process_resp(struct i3c_hci *hci, struct hci_pio_data *pio) * still exists. */ if (pio->curr_rx == xfer) { - DBG("short RX ?"); + dev_dbg(&hci->master.dev, "short RX ?"); pio->curr_rx = pio->curr_rx->next_data; } else if (pio->curr_tx == xfer) { - DBG("short TX ?"); + dev_dbg(&hci->master.dev, "short TX ?"); pio->curr_tx = pio->curr_tx->next_data; } else if (xfer->data_left) { - DBG("PIO xfer count = %d after response", - xfer->data_left); + dev_dbg(&hci->master.dev, + "PIO xfer count = %d after response", + xfer->data_left); } pio->curr_resp = xfer->next_resp; @@ -591,7 +599,7 @@ static int hci_pio_queue_xfer(struct i3c_hci *hci, struct hci_xfer *xfer, int n) struct hci_xfer *prev_queue_tail; int i; - DBG("n = %d", n); + dev_dbg(&hci->master.dev, "n = %d", n); /* link xfer instances together and initialize data count */ for (i = 0; i < n; i++) { @@ -611,8 +619,9 @@ static int hci_pio_queue_xfer(struct i3c_hci *hci, struct hci_xfer *xfer, int n) if (!hci_pio_process_cmd(hci, pio)) pio->enabled_irqs |= STAT_CMD_QUEUE_READY; pio_reg_write(INTR_SIGNAL_ENABLE, pio->enabled_irqs); - DBG("status = %#x/%#x", - pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); + dev_dbg(&hci->master.dev, "status = %#x/%#x", + pio_reg_read(INTR_STATUS), + pio_reg_read(INTR_SIGNAL_ENABLE)); } spin_unlock_irq(&pio->lock); return 0; @@ -686,10 +695,10 @@ static bool hci_pio_dequeue_xfer(struct i3c_hci *hci, struct hci_xfer *xfer, int int ret; spin_lock_irq(&pio->lock); - DBG("n=%d status=%#x/%#x", n, - pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); - DBG("main_status = %#x/%#x", - readl(hci->base_regs + 0x20), readl(hci->base_regs + 0x28)); + dev_dbg(&hci->master.dev, "n=%d status=%#x/%#x", n, + pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); + dev_dbg(&hci->master.dev, "main_status = %#x/%#x", + readl(hci->base_regs + 0x20), readl(hci->base_regs + 0x28)); ret = hci_pio_dequeue_xfer_common(hci, pio, xfer, n); spin_unlock_irq(&pio->lock); @@ -733,8 +742,8 @@ static void hci_pio_err(struct i3c_hci *hci, struct hci_pio_data *pio, mipi_i3c_hci_pio_reset(hci); mipi_i3c_hci_resume(hci); - DBG("status=%#x/%#x", - pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); + dev_dbg(&hci->master.dev, "status=%#x/%#x", + pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); } static void hci_pio_set_ibi_thresh(struct i3c_hci *hci, @@ -749,7 +758,7 @@ static void hci_pio_set_ibi_thresh(struct i3c_hci *hci, if (regval != pio->reg_queue_thresh) { pio_reg_write(QUEUE_THLD_CTRL, regval); pio->reg_queue_thresh = regval; - DBG("%d", thresh_val); + dev_dbg(&hci->master.dev, "%d", thresh_val); } } @@ -773,7 +782,8 @@ static bool hci_pio_get_ibi_segment(struct i3c_hci *hci, /* extract the data from the IBI port */ nr_words = thresh_val; ibi->seg_cnt -= nr_words * 4; - DBG("now %d left %d", nr_words * 4, ibi->seg_cnt); + dev_dbg(&hci->master.dev, "now %d left %d", + nr_words * 4, ibi->seg_cnt); while (nr_words--) *p++ = pio_reg_read(IBI_PORT); } @@ -791,7 +801,7 @@ static bool hci_pio_get_ibi_segment(struct i3c_hci *hci, hci_pio_set_ibi_thresh(hci, pio, 1); if (!(pio_reg_read(INTR_STATUS) & STAT_IBI_STATUS_THLD)) return false; - 
DBG("trailing %d", ibi->seg_cnt); + dev_dbg(&hci->master.dev, "trailing %d", ibi->seg_cnt); data = pio_reg_read(IBI_PORT); data = (__force u32) cpu_to_le32(data); while (ibi->seg_cnt--) { @@ -820,7 +830,7 @@ static bool hci_pio_prep_new_ibi(struct i3c_hci *hci, struct hci_pio_data *pio) */ ibi_status = pio_reg_read(IBI_PORT); - DBG("status = %#x", ibi_status); + dev_dbg(&hci->master.dev, "status = %#x", ibi_status); ibi->addr = FIELD_GET(IBI_TARGET_ADDR, ibi_status); if (ibi_status & IBI_ERROR) { dev_err(&hci->master.dev, "IBI error from %#x\n", ibi->addr); From 9395b3c412933401a34845d5326afe4011bbd40f Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 5 Sep 2025 13:03:20 +0300 Subject: [PATCH 1896/3206] i3c: Fix default I2C adapter timeout value Commit 3a379bbcea0a ("i3c: Add core I3C infrastructure") set the default adapter timeout for I2C transfers as 1000 (ms). However that parameter is defined in jiffies not in milliseconds. With mipi-i3c-hci driver this wasn't visible until commit c0a90eb55a69 ("i3c: mipi-i3c-hci: use adapter timeout value for I2C transfers"). Fix this by setting the default timeout as HZ (CONFIG_HZ) not 1000. Fixes: 1b84691e7870 ("i3c: dw: use adapter timeout value for I2C transfers") Fixes: be27ed672878 ("i3c: master: cdns: use adapter timeout value for I2C transfers") Fixes: c0a90eb55a69 ("i3c: mipi-i3c-hci: use adapter timeout value for I2C transfers") Fixes: a747e01adad2 ("i3c: master: svc: use adapter timeout value for I2C transfers") Fixes: d028219a9f14 ("i3c: master: Add basic driver for the Renesas I3C controller") Fixes: 3a379bbcea0a ("i3c: Add core I3C infrastructure") Cc: stable@vger.kernel.org # 6.17 Signed-off-by: Jarkko Nikula Reviewed-by: Frank Li Reviewed-by: Wolfram Sang Link: https://lore.kernel.org/r/20250905100320.954536-1-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 033d06cabca2cf..acaba4d5369712 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -2566,7 +2566,7 @@ static int i3c_master_i2c_adapter_init(struct i3c_master_controller *master) strscpy(adap->name, dev_name(master->dev.parent), sizeof(adap->name)); /* FIXME: Should we allow i3c masters to override these values? */ - adap->timeout = 1000; + adap->timeout = HZ; adap->retries = 3; id = of_alias_get_id(master->dev.of_node, "i2c"); From f3317e8c36a26d3374f29b8475982a96c6d44ccf Mon Sep 17 00:00:00 2001 From: Jorge Marques Date: Wed, 27 Aug 2025 15:51:58 +0200 Subject: [PATCH 1897/3206] dt-bindings: i3c: Add adi-i3c-master Add bindings doc for ADI I3C Controller IP core, a FPGA synthesizable IP core that implements the MIPI I3C Basic controller specification. The IP Core is versioned following Semantic Versioning 2.0.0 and ADI's open-source HDL guidelines for devicetree bindings and drivers. 
Reviewed-by: Conor Dooley Signed-off-by: Jorge Marques Link: https://lore.kernel.org/r/20250827-adi-i3c-master-v9-1-04413925abe1@analog.com Signed-off-by: Alexandre Belloni --- .../bindings/i3c/adi,i3c-master.yaml | 72 +++++++++++++++++++ MAINTAINERS | 5 ++ 2 files changed, 77 insertions(+) create mode 100644 Documentation/devicetree/bindings/i3c/adi,i3c-master.yaml diff --git a/Documentation/devicetree/bindings/i3c/adi,i3c-master.yaml b/Documentation/devicetree/bindings/i3c/adi,i3c-master.yaml new file mode 100644 index 00000000000000..2498672d265488 --- /dev/null +++ b/Documentation/devicetree/bindings/i3c/adi,i3c-master.yaml @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/i3c/adi,i3c-master.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices I3C Controller + +description: + FPGA-based I3C controller designed to interface with I3C and I2C peripherals, + implementing a subset of the I3C-basic specification. The IP core is tested + on arm, microblaze, and arm64 architectures. + + https://analogdevicesinc.github.io/hdl/library/i3c_controller + +maintainers: + - Jorge Marques + +properties: + compatible: + const: adi,i3c-master-v1 + + reg: + maxItems: 1 + + clocks: + minItems: 1 + items: + - description: The AXI interconnect clock, drives the register map. + - description: + The secondary clock, drives the internal logic asynchronously to the + register map. The presence of this entry states that the IP Core was + synthesized with a second clock input, and the absence of this entry + indicates a topology where a single clock input drives all the + internal logic. + + clock-names: + minItems: 1 + items: + - const: axi + - const: i3c + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - clocks + - clock-names + - interrupts + +allOf: + - $ref: i3c.yaml# + +unevaluatedProperties: false + +examples: + - | + #include + + i3c@44a00000 { + compatible = "adi,i3c-master-v1"; + reg = <0x44a00000 0x1000>; + interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clkc 15>, <&clkc 15>; + clock-names = "axi", "i3c"; + #address-cells = <3>; + #size-cells = <0>; + + /* I3C and I2C devices */ + }; diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..e6c913e7b0ca94 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11606,6 +11606,11 @@ S: Maintained F: Documentation/devicetree/bindings/i3c/aspeed,ast2600-i3c.yaml F: drivers/i3c/master/ast2600-i3c-master.c +I3C DRIVER FOR ANALOG DEVICES I3C CONTROLLER IP +M: Jorge Marques +S: Maintained +F: Documentation/devicetree/bindings/i3c/adi,i3c-master.yaml + I3C DRIVER FOR CADENCE I3C MASTER IP M: Przemysław Gaj S: Maintained From a79ac2cdc91d6be3010f2e9a3b2a2ccfc26e2086 Mon Sep 17 00:00:00 2001 From: Jorge Marques Date: Wed, 27 Aug 2025 15:51:59 +0200 Subject: [PATCH 1898/3206] i3c: master: Add driver for Analog Devices I3C Controller IP Add support for Analog Devices I3C Controller IP, an AXI-interfaced IP core that supports I3C and I2C devices, multiple speed-grades and I3C IBIs. 
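To make the command FIFO encoding in the driver below concrete: the first private read in a series, say 4 bytes from dynamic address 0x09, would be queued roughly like this (illustrative values only, using the REG_CMD_FIFO_0_* field macros the driver defines):

	u32 cmd0;

	cmd0 = REG_CMD_FIFO_0_BCAST |		/* first transfer: starts with the broadcast header */
	       REG_CMD_FIFO_0_DEV_ADDR(0x09) |	/* target dynamic address */
	       REG_CMD_FIFO_0_LEN(4) |		/* payload length in bytes */
	       REG_CMD_FIFO_0_RNW;		/* read transfer */
	writel(cmd0, master->regs + REG_CMD_FIFO);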
Reviewed-by: Frank Li Signed-off-by: Jorge Marques Link: https://lore.kernel.org/r/20250827-adi-i3c-master-v9-2-04413925abe1@analog.com Signed-off-by: Alexandre Belloni --- MAINTAINERS | 1 + drivers/i3c/master/Kconfig | 11 + drivers/i3c/master/Makefile | 1 + drivers/i3c/master/adi-i3c-master.c | 1019 +++++++++++++++++++++++++++ 4 files changed, 1032 insertions(+) create mode 100644 drivers/i3c/master/adi-i3c-master.c diff --git a/MAINTAINERS b/MAINTAINERS index e6c913e7b0ca94..d63a58b028189d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11610,6 +11610,7 @@ I3C DRIVER FOR ANALOG DEVICES I3C CONTROLLER IP M: Jorge Marques S: Maintained F: Documentation/devicetree/bindings/i3c/adi,i3c-master.yaml +F: drivers/i3c/master/adi-i3c-master.c I3C DRIVER FOR CADENCE I3C MASTER IP M: Przemysław Gaj diff --git a/drivers/i3c/master/Kconfig b/drivers/i3c/master/Kconfig index 13df2944f2ec9a..82cf330778d5a7 100644 --- a/drivers/i3c/master/Kconfig +++ b/drivers/i3c/master/Kconfig @@ -1,4 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-only +config ADI_I3C_MASTER + tristate "Analog Devices I3C master driver" + depends on HAS_IOMEM + help + Support for Analog Devices I3C Controller IP, an AXI-interfaced IP + core that supports I3C and I2C devices, multiple speed-grades and I3C + IBIs. + + This driver can also be built as a module. If so, the module will be + called adi-i3c-master. + config CDNS_I3C_MASTER tristate "Cadence I3C master driver" depends on HAS_IOMEM diff --git a/drivers/i3c/master/Makefile b/drivers/i3c/master/Makefile index aac74f3e385144..816a227b6f7acf 100644 --- a/drivers/i3c/master/Makefile +++ b/drivers/i3c/master/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ADI_I3C_MASTER) += adi-i3c-master.o obj-$(CONFIG_CDNS_I3C_MASTER) += i3c-master-cdns.o obj-$(CONFIG_DW_I3C_MASTER) += dw-i3c-master.o obj-$(CONFIG_AST2600_I3C_MASTER) += ast2600-i3c-master.o diff --git a/drivers/i3c/master/adi-i3c-master.c b/drivers/i3c/master/adi-i3c-master.c new file mode 100644 index 00000000000000..162f9eed39aa5e --- /dev/null +++ b/drivers/i3c/master/adi-i3c-master.c @@ -0,0 +1,1019 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * I3C Controller driver + * Copyright 2025 Analog Devices Inc. 
+ * Author: Jorge Marques + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internals.h" + +#define ADI_MAX_DEVS 16 +#define ADI_HAS_MDB_FROM_BCR(x) (FIELD_GET(BIT(2), (x))) + +#define REG_ENABLE 0x040 + +#define REG_PID_L 0x054 +#define REG_PID_H 0x058 +#define REG_DCR_BCR_DA 0x05c +#define REG_DCR_BCR_DA_GET_DA(x) FIELD_GET(GENMASK(22, 16), (x)) +#define REG_DCR_BCR_DA_GET_BCR(x) FIELD_GET(GENMASK(15, 8), (x)) +#define REG_DCR_BCR_DA_GET_DCR(x) FIELD_GET(GENMASK(7, 0), (x)) + +#define REG_IRQ_MASK 0x080 +#define REG_IRQ_PENDING 0x084 +#define REG_IRQ_PENDING_DAA BIT(7) +#define REG_IRQ_PENDING_IBI BIT(6) +#define REG_IRQ_PENDING_CMDR BIT(5) + +#define REG_CMD_FIFO 0x0d4 +#define REG_CMD_FIFO_0_IS_CCC BIT(22) +#define REG_CMD_FIFO_0_BCAST BIT(21) +#define REG_CMD_FIFO_0_SR BIT(20) +#define REG_CMD_FIFO_0_LEN(l) FIELD_PREP(GENMASK(19, 8), (l)) +#define REG_CMD_FIFO_0_DEV_ADDR(a) FIELD_PREP(GENMASK(7, 1), (a)) +#define REG_CMD_FIFO_0_RNW BIT(0) +#define REG_CMD_FIFO_1_CCC(id) FIELD_PREP(GENMASK(7, 0), (id)) + +#define REG_CMD_FIFO_ROOM 0x0c0 +#define REG_CMDR_FIFO 0x0d8 +#define REG_CMDR_FIFO_UDA_ERROR 8 +#define REG_CMDR_FIFO_NACK_RESP 6 +#define REG_CMDR_FIFO_CE2_ERROR 4 +#define REG_CMDR_FIFO_CE0_ERROR 1 +#define REG_CMDR_FIFO_NO_ERROR 0 +#define REG_CMDR_FIFO_ERROR(x) FIELD_GET(GENMASK(23, 20), (x)) +#define REG_CMDR_FIFO_XFER_BYTES(x) FIELD_GET(GENMASK(19, 8), (x)) + +#define REG_SDO_FIFO 0x0dc +#define REG_SDO_FIFO_ROOM 0x0c8 +#define REG_SDI_FIFO 0x0e0 +#define REG_IBI_FIFO 0x0e4 +#define REG_FIFO_STATUS 0x0e8 +#define REG_FIFO_STATUS_CMDR_EMPTY BIT(0) +#define REG_FIFO_STATUS_IBI_EMPTY BIT(1) + +#define REG_OPS 0x100 +#define REG_OPS_PP_SG_MASK GENMASK(6, 5) +#define REG_OPS_SET_SG(x) FIELD_PREP(REG_OPS_PP_SG_MASK, (x)) + +#define REG_IBI_CONFIG 0x140 +#define REG_IBI_CONFIG_ENABLE BIT(0) +#define REG_IBI_CONFIG_LISTEN BIT(1) + +#define REG_DEV_CHAR 0x180 +#define REG_DEV_CHAR_IS_I2C BIT(0) +#define REG_DEV_CHAR_IS_ATTACHED BIT(1) +#define REG_DEV_CHAR_BCR_IBI(x) FIELD_PREP(GENMASK(3, 2), (x)) +#define REG_DEV_CHAR_WEN BIT(8) +#define REG_DEV_CHAR_ADDR(x) FIELD_PREP(GENMASK(15, 9), (x)) + +enum speed_grade {PP_SG_UNSET, PP_SG_1MHZ, PP_SG_3MHZ, PP_SG_6MHZ, PP_SG_12MHZ}; + +struct adi_i3c_cmd { + u32 cmd0; + u32 cmd1; + u32 tx_len; + const void *tx_buf; + u32 rx_len; + void *rx_buf; + u32 error; +}; + +struct adi_i3c_xfer { + struct list_head node; + struct completion comp; + int ret; + unsigned int ncmds; + unsigned int ncmds_comp; + struct adi_i3c_cmd cmds[] __counted_by(ncmds); +}; + +struct adi_i3c_master { + struct i3c_master_controller base; + u32 free_rr_slots; + struct { + unsigned int num_slots; + struct i3c_dev_desc **slots; + spinlock_t lock; /* Protect IBI slot access */ + } ibi; + struct { + struct list_head list; + struct adi_i3c_xfer *cur; + spinlock_t lock; /* Protect transfer */ + } xferqueue; + void __iomem *regs; + struct clk *clk; + unsigned long i3c_scl_lim; + struct { + u8 addrs[ADI_MAX_DEVS]; + u8 index; + } daa; +}; + +static inline struct adi_i3c_master *to_adi_i3c_master(struct i3c_master_controller *master) +{ + return container_of(master, struct adi_i3c_master, base); +} + +static void adi_i3c_master_wr_to_tx_fifo(struct adi_i3c_master *master, + const u8 *buf, unsigned int nbytes) +{ + unsigned int n, m; + + n = readl(master->regs + REG_SDO_FIFO_ROOM); + m = min(n, nbytes); + i3c_writel_fifo(master->regs + REG_SDO_FIFO, buf, nbytes); +} + +static void 
adi_i3c_master_rd_from_rx_fifo(struct adi_i3c_master *master, + u8 *buf, unsigned int nbytes) +{ + i3c_readl_fifo(master->regs + REG_SDI_FIFO, buf, nbytes); +} + +static bool adi_i3c_master_supports_ccc_cmd(struct i3c_master_controller *m, + const struct i3c_ccc_cmd *cmd) +{ + if (cmd->ndests > 1) + return false; + + switch (cmd->id) { + case I3C_CCC_ENEC(true): + case I3C_CCC_ENEC(false): + case I3C_CCC_DISEC(true): + case I3C_CCC_DISEC(false): + case I3C_CCC_RSTDAA(true): + case I3C_CCC_RSTDAA(false): + case I3C_CCC_ENTDAA: + case I3C_CCC_SETDASA: + case I3C_CCC_SETNEWDA: + case I3C_CCC_GETMWL: + case I3C_CCC_GETMRL: + case I3C_CCC_GETPID: + case I3C_CCC_GETBCR: + case I3C_CCC_GETDCR: + case I3C_CCC_GETSTATUS: + case I3C_CCC_GETHDRCAP: + return true; + default: + break; + } + + return false; +} + +static int adi_i3c_master_disable(struct adi_i3c_master *master) +{ + writel(0, master->regs + REG_IBI_CONFIG); + + return 0; +} + +static struct adi_i3c_xfer *adi_i3c_master_alloc_xfer(struct adi_i3c_master *master, + unsigned int ncmds) +{ + struct adi_i3c_xfer *xfer; + + xfer = kzalloc(struct_size(xfer, cmds, ncmds), GFP_KERNEL); + if (!xfer) + return NULL; + + INIT_LIST_HEAD(&xfer->node); + xfer->ncmds = ncmds; + xfer->ret = -ETIMEDOUT; + + return xfer; +} + +static void adi_i3c_master_start_xfer_locked(struct adi_i3c_master *master) +{ + struct adi_i3c_xfer *xfer = master->xferqueue.cur; + unsigned int i, n, m; + + if (!xfer) + return; + + for (i = 0; i < xfer->ncmds; i++) { + struct adi_i3c_cmd *cmd = &xfer->cmds[i]; + + if (!(cmd->cmd0 & REG_CMD_FIFO_0_RNW)) + adi_i3c_master_wr_to_tx_fifo(master, cmd->tx_buf, cmd->tx_len); + } + + n = readl(master->regs + REG_CMD_FIFO_ROOM); + for (i = 0; i < xfer->ncmds; i++) { + struct adi_i3c_cmd *cmd = &xfer->cmds[i]; + + m = cmd->cmd0 & REG_CMD_FIFO_0_IS_CCC ? 
2 : 1; + if (m > n) + break; + writel(cmd->cmd0, master->regs + REG_CMD_FIFO); + if (cmd->cmd0 & REG_CMD_FIFO_0_IS_CCC) + writel(cmd->cmd1, master->regs + REG_CMD_FIFO); + n -= m; + } +} + +static void adi_i3c_master_end_xfer_locked(struct adi_i3c_master *master, + u32 pending) +{ + struct adi_i3c_xfer *xfer = master->xferqueue.cur; + int i, ret = 0; + + if (!xfer) + return; + + while (!(readl(master->regs + REG_FIFO_STATUS) & REG_FIFO_STATUS_CMDR_EMPTY)) { + struct adi_i3c_cmd *cmd; + u32 cmdr, rx_len; + + cmdr = readl(master->regs + REG_CMDR_FIFO); + + cmd = &xfer->cmds[xfer->ncmds_comp++]; + if (cmd->cmd0 & REG_CMD_FIFO_0_RNW) { + rx_len = min_t(u32, REG_CMDR_FIFO_XFER_BYTES(cmdr), cmd->rx_len); + adi_i3c_master_rd_from_rx_fifo(master, cmd->rx_buf, rx_len); + } + cmd->error = REG_CMDR_FIFO_ERROR(cmdr); + } + + for (i = 0; i < xfer->ncmds_comp; i++) { + switch (xfer->cmds[i].error) { + case REG_CMDR_FIFO_NO_ERROR: + break; + + case REG_CMDR_FIFO_CE0_ERROR: + case REG_CMDR_FIFO_CE2_ERROR: + case REG_CMDR_FIFO_NACK_RESP: + case REG_CMDR_FIFO_UDA_ERROR: + ret = -EIO; + break; + + default: + ret = -EINVAL; + break; + } + } + + xfer->ret = ret; + + if (xfer->ncmds_comp != xfer->ncmds) + return; + + complete(&xfer->comp); + + xfer = list_first_entry_or_null(&master->xferqueue.list, + struct adi_i3c_xfer, node); + if (xfer) + list_del_init(&xfer->node); + + master->xferqueue.cur = xfer; + adi_i3c_master_start_xfer_locked(master); +} + +static void adi_i3c_master_queue_xfer(struct adi_i3c_master *master, + struct adi_i3c_xfer *xfer) +{ + init_completion(&xfer->comp); + guard(spinlock_irqsave)(&master->xferqueue.lock); + if (master->xferqueue.cur) { + list_add_tail(&xfer->node, &master->xferqueue.list); + } else { + master->xferqueue.cur = xfer; + adi_i3c_master_start_xfer_locked(master); + } +} + +static void adi_i3c_master_unqueue_xfer(struct adi_i3c_master *master, + struct adi_i3c_xfer *xfer) +{ + guard(spinlock_irqsave)(&master->xferqueue.lock); + if (master->xferqueue.cur == xfer) + master->xferqueue.cur = NULL; + else + list_del_init(&xfer->node); + + writel(0x01, master->regs + REG_ENABLE); + writel(0x00, master->regs + REG_ENABLE); + writel(REG_IRQ_PENDING_CMDR, master->regs + REG_IRQ_MASK); +} + +static enum i3c_error_code adi_i3c_cmd_get_err(struct adi_i3c_cmd *cmd) +{ + switch (cmd->error) { + case REG_CMDR_FIFO_CE0_ERROR: + return I3C_ERROR_M0; + + case REG_CMDR_FIFO_CE2_ERROR: + case REG_CMDR_FIFO_NACK_RESP: + return I3C_ERROR_M2; + + default: + break; + } + + return I3C_ERROR_UNKNOWN; +} + +static int adi_i3c_master_send_ccc_cmd(struct i3c_master_controller *m, + struct i3c_ccc_cmd *cmd) +{ + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_xfer *xfer __free(kfree) = NULL; + struct adi_i3c_cmd *ccmd; + + xfer = adi_i3c_master_alloc_xfer(master, 1); + if (!xfer) + return -ENOMEM; + + ccmd = xfer->cmds; + ccmd->cmd1 = REG_CMD_FIFO_1_CCC(cmd->id); + ccmd->cmd0 = REG_CMD_FIFO_0_IS_CCC | + REG_CMD_FIFO_0_LEN(cmd->dests[0].payload.len); + + if (cmd->id & I3C_CCC_DIRECT) + ccmd->cmd0 |= REG_CMD_FIFO_0_DEV_ADDR(cmd->dests[0].addr); + + if (cmd->rnw) { + ccmd->cmd0 |= REG_CMD_FIFO_0_RNW; + ccmd->rx_buf = cmd->dests[0].payload.data; + ccmd->rx_len = cmd->dests[0].payload.len; + } else { + ccmd->tx_buf = cmd->dests[0].payload.data; + ccmd->tx_len = cmd->dests[0].payload.len; + } + + adi_i3c_master_queue_xfer(master, xfer); + if (!wait_for_completion_timeout(&xfer->comp, msecs_to_jiffies(1000))) + adi_i3c_master_unqueue_xfer(master, xfer); + + cmd->err = 
adi_i3c_cmd_get_err(&xfer->cmds[0]); + + return 0; +} + +static int adi_i3c_master_priv_xfers(struct i3c_dev_desc *dev, + struct i3c_priv_xfer *xfers, + int nxfers) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_xfer *xfer __free(kfree) = NULL; + int i, ret; + + if (!nxfers) + return 0; + + xfer = adi_i3c_master_alloc_xfer(master, nxfers); + if (!xfer) + return -ENOMEM; + + for (i = 0; i < nxfers; i++) { + struct adi_i3c_cmd *ccmd = &xfer->cmds[i]; + + ccmd->cmd0 = REG_CMD_FIFO_0_DEV_ADDR(dev->info.dyn_addr); + + if (xfers[i].rnw) { + ccmd->cmd0 |= REG_CMD_FIFO_0_RNW; + ccmd->rx_buf = xfers[i].data.in; + ccmd->rx_len = xfers[i].len; + } else { + ccmd->tx_buf = xfers[i].data.out; + ccmd->tx_len = xfers[i].len; + } + + ccmd->cmd0 |= REG_CMD_FIFO_0_LEN(xfers[i].len); + + if (i < nxfers - 1) + ccmd->cmd0 |= REG_CMD_FIFO_0_SR; + + if (!i) + ccmd->cmd0 |= REG_CMD_FIFO_0_BCAST; + } + + adi_i3c_master_queue_xfer(master, xfer); + if (!wait_for_completion_timeout(&xfer->comp, + msecs_to_jiffies(1000))) + adi_i3c_master_unqueue_xfer(master, xfer); + + ret = xfer->ret; + + for (i = 0; i < nxfers; i++) + xfers[i].err = adi_i3c_cmd_get_err(&xfer->cmds[i]); + + return ret; +} + +struct adi_i3c_i2c_dev_data { + struct i3c_generic_ibi_pool *ibi_pool; + u16 id; + s16 ibi; +}; + +static int adi_i3c_master_get_rr_slot(struct adi_i3c_master *master, + u8 dyn_addr) +{ + if (!master->free_rr_slots) + return -ENOSPC; + + return ffs(master->free_rr_slots) - 1; +} + +static int adi_i3c_master_reattach_i3c_dev(struct i3c_dev_desc *dev, u8 dyn_addr) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + u8 addr; + + addr = dev->info.dyn_addr ? dev->info.dyn_addr : dev->info.static_addr; + + writel(REG_DEV_CHAR_ADDR(dyn_addr), master->regs + REG_DEV_CHAR); + writel((readl(master->regs + REG_DEV_CHAR) & + ~REG_DEV_CHAR_IS_ATTACHED) | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + writel(REG_DEV_CHAR_ADDR(addr), master->regs + REG_DEV_CHAR); + writel(readl(master->regs + REG_DEV_CHAR) | + REG_DEV_CHAR_IS_ATTACHED | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + return 0; +} + +static int adi_i3c_master_attach_i3c_dev(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data; + int slot; + u8 addr; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + slot = adi_i3c_master_get_rr_slot(master, dev->info.dyn_addr); + if (slot < 0) { + kfree(data); + return slot; + } + + data->id = slot; + i3c_dev_set_master_data(dev, data); + master->free_rr_slots &= ~BIT(slot); + + addr = dev->info.dyn_addr ? dev->info.dyn_addr : dev->info.static_addr; + + writel(REG_DEV_CHAR_ADDR(addr), master->regs + REG_DEV_CHAR); + writel(readl(master->regs + REG_DEV_CHAR) | + REG_DEV_CHAR_IS_ATTACHED | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + return 0; +} + +static void adi_i3c_master_sync_dev_char(struct i3c_master_controller *m) +{ + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct i3c_dev_desc *i3cdev; + u32 bcr_ibi; + u8 addr; + + i3c_bus_for_each_i3cdev(&m->bus, i3cdev) { + addr = i3cdev->info.dyn_addr ? 
+ i3cdev->info.dyn_addr : i3cdev->info.static_addr; + writel(REG_DEV_CHAR_ADDR(addr), master->regs + REG_DEV_CHAR); + bcr_ibi = FIELD_GET(I3C_BCR_IBI_PAYLOAD | I3C_BCR_IBI_REQ_CAP, (i3cdev->info.bcr)); + writel(readl(master->regs + REG_DEV_CHAR) | + REG_DEV_CHAR_BCR_IBI(bcr_ibi) | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + } +} + +static void adi_i3c_master_detach_i3c_dev(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data = i3c_dev_get_master_data(dev); + u8 addr; + + addr = dev->info.dyn_addr ? dev->info.dyn_addr : dev->info.static_addr; + + writel(REG_DEV_CHAR_ADDR(addr), master->regs + REG_DEV_CHAR); + writel((readl(master->regs + REG_DEV_CHAR) & + ~REG_DEV_CHAR_IS_ATTACHED) | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + i3c_dev_set_master_data(dev, NULL); + master->free_rr_slots |= BIT(data->id); + kfree(data); +} + +static int adi_i3c_master_attach_i2c_dev(struct i2c_dev_desc *dev) +{ + struct i3c_master_controller *m = i2c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data; + int slot; + + slot = adi_i3c_master_get_rr_slot(master, 0); + if (slot < 0) + return slot; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->id = slot; + master->free_rr_slots &= ~BIT(slot); + i2c_dev_set_master_data(dev, data); + + writel(REG_DEV_CHAR_ADDR(dev->addr) | + REG_DEV_CHAR_IS_I2C | REG_DEV_CHAR_IS_ATTACHED | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + return 0; +} + +static void adi_i3c_master_detach_i2c_dev(struct i2c_dev_desc *dev) +{ + struct i3c_master_controller *m = i2c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data = i2c_dev_get_master_data(dev); + + writel(REG_DEV_CHAR_ADDR(dev->addr) | + REG_DEV_CHAR_IS_I2C | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + i2c_dev_set_master_data(dev, NULL); + master->free_rr_slots |= BIT(data->id); + kfree(data); +} + +static void adi_i3c_master_bus_cleanup(struct i3c_master_controller *m) +{ + struct adi_i3c_master *master = to_adi_i3c_master(m); + + adi_i3c_master_disable(master); +} + +static void adi_i3c_master_upd_i3c_scl_lim(struct adi_i3c_master *master) +{ + struct i3c_master_controller *m = &master->base; + struct i3c_bus *bus = i3c_master_get_bus(m); + u8 i3c_scl_lim = 0; + struct i3c_dev_desc *dev; + u8 pp_sg; + + i3c_bus_for_each_i3cdev(bus, dev) { + u8 max_fscl; + + max_fscl = max(I3C_CCC_MAX_SDR_FSCL(dev->info.max_read_ds), + I3C_CCC_MAX_SDR_FSCL(dev->info.max_write_ds)); + + switch (max_fscl) { + case I3C_SDR1_FSCL_8MHZ: + max_fscl = PP_SG_6MHZ; + break; + case I3C_SDR2_FSCL_6MHZ: + max_fscl = PP_SG_3MHZ; + break; + case I3C_SDR3_FSCL_4MHZ: + max_fscl = PP_SG_3MHZ; + break; + case I3C_SDR4_FSCL_2MHZ: + max_fscl = PP_SG_1MHZ; + break; + case I3C_SDR0_FSCL_MAX: + default: + max_fscl = PP_SG_12MHZ; + break; + } + + if (max_fscl && + (i3c_scl_lim > max_fscl || !i3c_scl_lim)) + i3c_scl_lim = max_fscl; + } + + if (!i3c_scl_lim) + return; + + master->i3c_scl_lim = i3c_scl_lim - 1; + + pp_sg = readl(master->regs + REG_OPS) & ~REG_OPS_PP_SG_MASK; + pp_sg |= REG_OPS_SET_SG(master->i3c_scl_lim); + + writel(pp_sg, master->regs + REG_OPS); +} + +static void adi_i3c_master_get_features(struct adi_i3c_master *master, + unsigned int slot, + struct i3c_device_info *info) +{ + u32 buf; + + /* Dynamic address and PID are for 
identification only */ + memset(info, 0, sizeof(*info)); + buf = readl(master->regs + REG_DCR_BCR_DA); + info->dyn_addr = REG_DCR_BCR_DA_GET_DA(buf); + info->dcr = REG_DCR_BCR_DA_GET_DCR(buf); + info->bcr = REG_DCR_BCR_DA_GET_BCR(buf); + info->pid = readl(master->regs + REG_PID_L); + info->pid |= (u64)readl(master->regs + REG_PID_H) << 32; +} + +static int adi_i3c_master_do_daa(struct i3c_master_controller *m) +{ + struct adi_i3c_master *master = to_adi_i3c_master(m); + int ret, addr = 0; + u32 irq_mask; + + for (u8 i = 0; i < ADI_MAX_DEVS; i++) { + addr = i3c_master_get_free_addr(m, addr); + if (addr < 0) + return addr; + master->daa.addrs[i] = addr; + } + + irq_mask = readl(master->regs + REG_IRQ_MASK); + writel(irq_mask | REG_IRQ_PENDING_DAA, + master->regs + REG_IRQ_MASK); + + master->daa.index = 0; + ret = i3c_master_entdaa_locked(&master->base); + + writel(irq_mask, master->regs + REG_IRQ_MASK); + + /* DAA always finishes with CE2_ERROR or NACK_RESP */ + if (ret && ret != I3C_ERROR_M2) + return ret; + + /* Add I3C devices discovered */ + for (u8 i = 0; i < master->daa.index; i++) + i3c_master_add_i3c_dev_locked(m, master->daa.addrs[i]); + /* Sync retrieved devs info with the IP */ + adi_i3c_master_sync_dev_char(m); + + i3c_master_defslvs_locked(&master->base); + + adi_i3c_master_upd_i3c_scl_lim(master); + + return 0; +} + +static int adi_i3c_master_bus_init(struct i3c_master_controller *m) +{ + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct i3c_device_info info = { }; + int ret; + + ret = i3c_master_get_free_addr(m, 0); + if (ret < 0) + return ret; + + adi_i3c_master_get_features(master, 0, &info); + ret = i3c_master_set_info(&master->base, &info); + if (ret) + return ret; + + writel(REG_IBI_CONFIG_LISTEN, + master->regs + REG_IBI_CONFIG); + + return 0; +} + +static void adi_i3c_master_handle_ibi(struct adi_i3c_master *master, + u32 raw) +{ + struct adi_i3c_i2c_dev_data *data; + struct i3c_ibi_slot *slot; + struct i3c_dev_desc *dev; + u8 da, id, mdb, len; + u8 *buf; + + da = FIELD_GET(GENMASK(23, 17), raw); + mdb = FIELD_GET(GENMASK(15, 8), raw); + for (id = 0; id < master->ibi.num_slots; id++) { + if (master->ibi.slots[id] && + master->ibi.slots[id]->info.dyn_addr == da) + break; + } + + if (id == master->ibi.num_slots) + return; + + dev = master->ibi.slots[id]; + len = ADI_HAS_MDB_FROM_BCR(dev->info.bcr); + data = i3c_dev_get_master_data(dev); + + guard(spinlock)(&master->ibi.lock); + slot = i3c_generic_ibi_get_free_slot(data->ibi_pool); + if (!slot) + return; + + slot->len = len; + buf = slot->data; + buf[0] = mdb; + i3c_master_queue_ibi(dev, slot); +} + +static void adi_i3c_master_demux_ibis(struct adi_i3c_master *master) +{ + while (!(readl(master->regs + REG_FIFO_STATUS) & REG_FIFO_STATUS_IBI_EMPTY)) { + u32 raw = readl(master->regs + REG_IBI_FIFO); + + adi_i3c_master_handle_ibi(master, raw); + } +} + +static void adi_i3c_master_handle_da_req(struct adi_i3c_master *master) +{ + u8 payload0[8]; + u32 addr; + + adi_i3c_master_rd_from_rx_fifo(master, payload0, 6); + addr = master->daa.addrs[master->daa.index++]; + addr = (addr << 1) | (parity8(addr) ? 
0 : 1); + + writel(addr, master->regs + REG_SDO_FIFO); +} + +static irqreturn_t adi_i3c_master_irq(int irq, void *data) +{ + struct adi_i3c_master *master = data; + u32 pending; + + pending = readl(master->regs + REG_IRQ_PENDING); + writel(pending, master->regs + REG_IRQ_PENDING); + if (pending & REG_IRQ_PENDING_CMDR) { + scoped_guard(spinlock_irqsave, &master->xferqueue.lock) { + adi_i3c_master_end_xfer_locked(master, pending); + } + } + if (pending & REG_IRQ_PENDING_IBI) + adi_i3c_master_demux_ibis(master); + if (pending & REG_IRQ_PENDING_DAA) + adi_i3c_master_handle_da_req(master); + + return IRQ_HANDLED; +} + +static int adi_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, + struct i2c_msg *xfers, + int nxfers) +{ + struct i3c_master_controller *m = i2c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_xfer *xfer __free(kfree) = NULL; + int i; + + if (!nxfers) + return 0; + for (i = 0; i < nxfers; i++) { + if (xfers[i].flags & I2C_M_TEN) + return -EOPNOTSUPP; + } + xfer = adi_i3c_master_alloc_xfer(master, nxfers); + if (!xfer) + return -ENOMEM; + + for (i = 0; i < nxfers; i++) { + struct adi_i3c_cmd *ccmd = &xfer->cmds[i]; + + ccmd->cmd0 = REG_CMD_FIFO_0_DEV_ADDR(xfers[i].addr); + + if (xfers[i].flags & I2C_M_RD) { + ccmd->cmd0 |= REG_CMD_FIFO_0_RNW; + ccmd->rx_buf = xfers[i].buf; + ccmd->rx_len = xfers[i].len; + } else { + ccmd->tx_buf = xfers[i].buf; + ccmd->tx_len = xfers[i].len; + } + + ccmd->cmd0 |= REG_CMD_FIFO_0_LEN(xfers[i].len); + } + + adi_i3c_master_queue_xfer(master, xfer); + if (!wait_for_completion_timeout(&xfer->comp, + m->i2c.timeout)) + adi_i3c_master_unqueue_xfer(master, xfer); + + return xfer->ret; +} + +static int adi_i3c_master_disable_ibi(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct i3c_dev_desc *i3cdev; + u32 enabled = 0; + int ret; + + ret = i3c_master_disec_locked(m, dev->info.dyn_addr, + I3C_CCC_EVENT_SIR); + + i3c_bus_for_each_i3cdev(&m->bus, i3cdev) { + if (dev != i3cdev && i3cdev->ibi) + enabled |= i3cdev->ibi->enabled; + } + if (!enabled) { + writel(REG_IBI_CONFIG_LISTEN, + master->regs + REG_IBI_CONFIG); + writel(readl(master->regs + REG_IRQ_MASK) & ~REG_IRQ_PENDING_IBI, + master->regs + REG_IRQ_MASK); + } + + return ret; +} + +static int adi_i3c_master_enable_ibi(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + + writel(REG_IBI_CONFIG_LISTEN | REG_IBI_CONFIG_ENABLE, + master->regs + REG_IBI_CONFIG); + + writel(readl(master->regs + REG_IRQ_MASK) | REG_IRQ_PENDING_IBI, + master->regs + REG_IRQ_MASK); + + return i3c_master_enec_locked(m, dev->info.dyn_addr, + I3C_CCC_EVENT_SIR); +} + +static int adi_i3c_master_request_ibi(struct i3c_dev_desc *dev, + const struct i3c_ibi_setup *req) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data; + unsigned int i; + + data = i3c_dev_get_master_data(dev); + data->ibi_pool = i3c_generic_ibi_alloc_pool(dev, req); + if (IS_ERR(data->ibi_pool)) + return PTR_ERR(data->ibi_pool); + + scoped_guard(spinlock_irqsave, &master->ibi.lock) { + for (i = 0; i < master->ibi.num_slots; i++) { + if (!master->ibi.slots[i]) { + data->ibi = i; + master->ibi.slots[i] = dev; + break; + } + } + } + + if (i < master->ibi.num_slots) + return 0; + + i3c_generic_ibi_free_pool(data->ibi_pool); + 
data->ibi_pool = NULL; + + return -ENOSPC; +} + +static void adi_i3c_master_free_ibi(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data = i3c_dev_get_master_data(dev); + + scoped_guard(spinlock_irqsave, &master->ibi.lock) { + master->ibi.slots[data->ibi] = NULL; + } + + i3c_generic_ibi_free_pool(data->ibi_pool); +} + +static void adi_i3c_master_recycle_ibi_slot(struct i3c_dev_desc *dev, + struct i3c_ibi_slot *slot) +{ + struct adi_i3c_i2c_dev_data *data = i3c_dev_get_master_data(dev); + + i3c_generic_ibi_recycle_slot(data->ibi_pool, slot); +} + +static const struct i3c_master_controller_ops adi_i3c_master_ops = { + .bus_init = adi_i3c_master_bus_init, + .bus_cleanup = adi_i3c_master_bus_cleanup, + .attach_i3c_dev = adi_i3c_master_attach_i3c_dev, + .reattach_i3c_dev = adi_i3c_master_reattach_i3c_dev, + .detach_i3c_dev = adi_i3c_master_detach_i3c_dev, + .attach_i2c_dev = adi_i3c_master_attach_i2c_dev, + .detach_i2c_dev = adi_i3c_master_detach_i2c_dev, + .do_daa = adi_i3c_master_do_daa, + .supports_ccc_cmd = adi_i3c_master_supports_ccc_cmd, + .send_ccc_cmd = adi_i3c_master_send_ccc_cmd, + .priv_xfers = adi_i3c_master_priv_xfers, + .i2c_xfers = adi_i3c_master_i2c_xfers, + .request_ibi = adi_i3c_master_request_ibi, + .enable_ibi = adi_i3c_master_enable_ibi, + .disable_ibi = adi_i3c_master_disable_ibi, + .free_ibi = adi_i3c_master_free_ibi, + .recycle_ibi_slot = adi_i3c_master_recycle_ibi_slot, +}; + +static const struct of_device_id adi_i3c_master_of_match[] = { + { .compatible = "adi,i3c-master-v1" }, + {} +}; + +static int adi_i3c_master_probe(struct platform_device *pdev) +{ + struct adi_i3c_master *master; + struct clk_bulk_data *clk; + unsigned int version; + int ret, irq; + + master = devm_kzalloc(&pdev->dev, sizeof(*master), GFP_KERNEL); + if (!master) + return -ENOMEM; + + master->regs = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(master->regs)) + return PTR_ERR(master->regs); + + ret = devm_clk_bulk_get_all_enabled(&pdev->dev, &clk); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "Failed to get clocks\n"); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + version = readl(master->regs + ADI_AXI_REG_VERSION); + if (ADI_AXI_PCORE_VER_MAJOR(version) != 1) + dev_err_probe(&pdev->dev, -ENODEV, "Unsupported peripheral version %u.%u.%u\n", + ADI_AXI_PCORE_VER_MAJOR(version), + ADI_AXI_PCORE_VER_MINOR(version), + ADI_AXI_PCORE_VER_PATCH(version)); + + writel(0x00, master->regs + REG_ENABLE); + writel(0x00, master->regs + REG_IRQ_MASK); + + ret = devm_request_irq(&pdev->dev, irq, adi_i3c_master_irq, 0, + dev_name(&pdev->dev), master); + if (ret) + return ret; + + platform_set_drvdata(pdev, master); + + master->free_rr_slots = GENMASK(ADI_MAX_DEVS, 1); + + writel(REG_IRQ_PENDING_CMDR, master->regs + REG_IRQ_MASK); + + spin_lock_init(&master->ibi.lock); + master->ibi.num_slots = 15; + master->ibi.slots = devm_kcalloc(&pdev->dev, master->ibi.num_slots, + sizeof(*master->ibi.slots), + GFP_KERNEL); + if (!master->ibi.slots) + return -ENOMEM; + + spin_lock_init(&master->xferqueue.lock); + INIT_LIST_HEAD(&master->xferqueue.list); + + return i3c_master_register(&master->base, &pdev->dev, + &adi_i3c_master_ops, false); +} + +static void adi_i3c_master_remove(struct platform_device *pdev) +{ + struct adi_i3c_master *master = platform_get_drvdata(pdev); + + writel(0xff, master->regs + REG_IRQ_PENDING); + writel(0x00, master->regs + 
REG_IRQ_MASK); + writel(0x01, master->regs + REG_ENABLE); + + i3c_master_unregister(&master->base); +} + +static struct platform_driver adi_i3c_master = { + .probe = adi_i3c_master_probe, + .remove = adi_i3c_master_remove, + .driver = { + .name = "adi-i3c-master", + .of_match_table = adi_i3c_master_of_match, + }, +}; +module_platform_driver(adi_i3c_master); + +MODULE_AUTHOR("Jorge Marques "); +MODULE_DESCRIPTION("Analog Devices I3C master driver"); +MODULE_LICENSE("GPL"); From a7869b0a2540fd122eccec00ae7d4243166b0a60 Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Fri, 29 Aug 2025 09:23:08 +0800 Subject: [PATCH 1899/3206] i3c: master: svc: Use manual response for IBI events The driver wants to NACK the IBI request when the target is not in the known address list. In the code below, svc_i3c_master_nack_ibi() will cause undefined behavior when using AUTOIBI with the auto response rule, because the hardware always auto-acks the IBI request. switch (ibitype) { case SVC_I3C_MSTATUS_IBITYPE_IBI: dev = svc_i3c_master_dev_from_addr(master, ibiaddr); if (!dev || !is_events_enabled(master, SVC_I3C_EVENT_IBI)) svc_i3c_master_nack_ibi(master); ... break; AutoIBI has another issue: the controller doesn't quit the AutoIBI state after an IBIWON polling timeout when there is an SDA glitch (high->low->high). 1. SDA high->low: raising an interrupt to execute IBI ISR 2. SDA low->high 3. Driver writes an AutoIBI request 4. AutoIBI process does not start because SDA is not low 5. IBIWON polling times out 6. Controller remains in AutoIBI state and doesn't accept EmitStop request Emitting the broadcast address with IBIRESP_MANUAL avoids both issues. Fixes: dd3c52846d59 ("i3c: master: svc: Add Silvaco I3C master driver") Signed-off-by: Stanley Chu Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250829012309.3562585-2-yschu@nuvoton.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/svc-i3c-master.c | 30 ++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index 701ae165b25b79..8e7b4ab919e3da 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -517,9 +517,24 @@ static void svc_i3c_master_ibi_isr(struct svc_i3c_master *master) */ writel(SVC_I3C_MINT_IBIWON, master->regs + SVC_I3C_MSTATUS); - /* Acknowledge the incoming interrupt with the AUTOIBI mechanism */ - writel(SVC_I3C_MCTRL_REQUEST_AUTO_IBI | - SVC_I3C_MCTRL_IBIRESP_AUTO, + /* + * Write REQUEST_START_ADDR request to emit broadcast address for arbitration, + * instead of using AUTO_IBI. + * + * Using AutoIBI request may cause controller to remain in AutoIBI state when + * there is a glitch on SDA line (high->low->high). + * 1. SDA high->low, raising an interrupt to execute IBI isr. + * 2. SDA low->high. + * 3. IBI isr writes an AutoIBI request. + * 4. The controller will not start AutoIBI process because SDA is not low. + * 5. IBIWON polling times out. + * 6. Controller remains in AutoIBI state and doesn't accept EmitStop request.
+ */ + writel(SVC_I3C_MCTRL_REQUEST_START_ADDR | + SVC_I3C_MCTRL_TYPE_I3C | + SVC_I3C_MCTRL_IBIRESP_MANUAL | + SVC_I3C_MCTRL_DIR(SVC_I3C_MCTRL_DIR_WRITE) | + SVC_I3C_MCTRL_ADDR(I3C_BROADCAST_ADDR), master->regs + SVC_I3C_MCTRL); /* Wait for IBIWON, should take approximately 100us */ @@ -539,10 +554,15 @@ static void svc_i3c_master_ibi_isr(struct svc_i3c_master *master) switch (ibitype) { case SVC_I3C_MSTATUS_IBITYPE_IBI: dev = svc_i3c_master_dev_from_addr(master, ibiaddr); - if (!dev || !is_events_enabled(master, SVC_I3C_EVENT_IBI)) + if (!dev || !is_events_enabled(master, SVC_I3C_EVENT_IBI)) { svc_i3c_master_nack_ibi(master); - else + } else { + if (dev->info.bcr & I3C_BCR_IBI_PAYLOAD) + svc_i3c_master_ack_ibi(master, true); + else + svc_i3c_master_ack_ibi(master, false); svc_i3c_master_handle_ibi(master, dev); + } break; case SVC_I3C_MSTATUS_IBITYPE_HOT_JOIN: if (is_events_enabled(master, SVC_I3C_EVENT_HOTJOIN)) From 3448a934ba6f803911ac084d05a2ffce507ea6c6 Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Fri, 29 Aug 2025 09:23:09 +0800 Subject: [PATCH 1900/3206] i3c: master: svc: Recycle unused IBI slot In svc_i3c_master_handle_ibi(), an IBI slot is fetched from the pool to store the IBI payload. However, when an error condition is encountered, the function returns without recycling the IBI slot, resulting in an IBI slot leak. Fixes: c85e209b799f ("i3c: master: svc: fix ibi may not return mandatory data byte") Signed-off-by: Stanley Chu Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250829012309.3562585-3-yschu@nuvoton.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/svc-i3c-master.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index 8e7b4ab919e3da..9641e66a4e5f2d 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -417,6 +417,7 @@ static int svc_i3c_master_handle_ibi(struct svc_i3c_master *master, SVC_I3C_MSTATUS_COMPLETE(val), 0, 1000); if (ret) { dev_err(master->dev, "Timeout when polling for COMPLETE\n"); + i3c_generic_ibi_recycle_slot(data->ibi_pool, slot); return ret; } From c5d0df4945085098e4778b27267a225efacf4d2a Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Mon, 8 Sep 2025 10:39:30 +0100 Subject: [PATCH 1901/3206] dt-bindings: i3c: renesas,i3c: Add RZ/V2H(P) and RZ/V2N support Add device tree binding support for the I3C Bus Interface on Renesas RZ/V2H(P) and RZ/V2N SoCs. The I3C IP on these SoCs is identical to that found on the RZ/G3E SoC. Add new compatible strings "renesas,r9a09g056-i3c" for RZ/V2N and "renesas,r9a09g057-i3c" for RZ/V2H(P). Both variants use "renesas,r9a09g047-i3c" as a fallback compatible to indicate hardware compatibility with the RZ/G3E implementation. Update the title to be more generic as it now covers multiple SoC families beyond just RZ/G3S and RZ/G3E. 
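For illustration, a minimal sketch of what the fallback compatible buys the OS side: because the new SoCs list "renesas,r9a09g047-i3c" as their second compatible string, a driver match table like the one below needs no new entries. The table name mirrors the renesas-i3c driver but is an assumption of this sketch, not a quote from it.

#include <linux/mod_devicetable.h>

static const struct of_device_id renesas_i3c_of_match[] = {
        { .compatible = "renesas,r9a08g045-i3c" },      /* RZ/G3S */
        { .compatible = "renesas,r9a09g047-i3c" },      /* RZ/G3E, also the fallback */
        { /* sentinel */ }
};

An RZ/V2H(P) node written as compatible = "renesas,r9a09g057-i3c", "renesas,r9a09g047-i3c"; matches on the second string, while the SoC-specific first string remains available for future quirks.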
Signed-off-by: Lad Prabhakar Reviewed-by: Tommaso Merciai Acked-by: Rob Herring (Arm) Reviewed-by: Wolfram Sang Link: https://lore.kernel.org/r/20250908093930.12591-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Alexandre Belloni --- .../devicetree/bindings/i3c/renesas,i3c.yaml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml b/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml index fe2e9633c46f8b..a20d875086d463 100644 --- a/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml +++ b/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml @@ -4,7 +4,7 @@ $id: http://devicetree.org/schemas/i3c/renesas,i3c.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Renesas RZ/G3S and RZ/G3E I3C Bus Interface +title: Renesas I3C Bus Interface maintainers: - Wolfram Sang @@ -12,10 +12,16 @@ maintainers: properties: compatible: - items: - - enum: - - renesas,r9a08g045-i3c # RZ/G3S - - renesas,r9a09g047-i3c # RZ/G3E + oneOf: + - items: + - enum: + - renesas,r9a08g045-i3c # RZ/G3S + - renesas,r9a09g047-i3c # RZ/G3E + - items: + - enum: + - renesas,r9a09g056-i3c # RZ/V2N + - renesas,r9a09g057-i3c # RZ/V2H(P) + - const: renesas,r9a09g047-i3c reg: maxItems: 1 From bc7dd24c114e37c67061f4651bbbbbc57106c617 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 3 Aug 2025 23:12:57 +0200 Subject: [PATCH 1902/3206] i3c: renesas: Simplify return statement in 'renesas_i3c_daa' There was already a bail out for 'ret < 0', so we can always return success at the end of the function. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/aIyzJ7HOENL1qp1l@stanley.mountain Signed-off-by: Wolfram Sang Tested-by: Tommaso Merciai Link: https://lore.kernel.org/r/20250803211256.18513-2-wsa+renesas@sang-engineering.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/renesas-i3c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i3c/master/renesas-i3c.c b/drivers/i3c/master/renesas-i3c.c index 174d3dc5d2764e..275f7b9242886e 100644 --- a/drivers/i3c/master/renesas-i3c.c +++ b/drivers/i3c/master/renesas-i3c.c @@ -679,7 +679,7 @@ static int renesas_i3c_daa(struct i3c_master_controller *m) i3c_master_add_i3c_dev_locked(m, i3c->addrs[pos]); } - return ret < 0 ? ret : 0; + return 0; } static bool renesas_i3c_supports_ccc_cmd(struct i3c_master_controller *m, From 17e163f3d7a5449fe9065030048e28c4087b24ce Mon Sep 17 00:00:00 2001 From: Manikanta Guntupalli Date: Wed, 30 Jul 2025 20:42:07 +0530 Subject: [PATCH 1903/3206] i3c: dw: Add shutdown support to dw_i3c_master driver Add shutdown handler to the Synopsys DesignWare I3C master driver, ensuring the device is gracefully disabled during system shutdown. The shutdown handler cancels any pending hot-join work and disables interrupts. 
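For readers unfamiliar with the hook, the general shape of a platform-driver .shutdown callback is sketched below; all "foo" names and the register offset are invented for the example, and the real handler in this patch additionally takes a runtime-PM reference before touching registers.

#include <linux/io.h>
#include <linux/platform_device.h>

struct foo_master {
        void __iomem *regs;
};

#define FOO_IRQ_ENABLE 0x10     /* hypothetical register offset */

static void foo_shutdown(struct platform_device *pdev)
{
        struct foo_master *master = platform_get_drvdata(pdev);

        /* Quiesce the controller so no interrupt fires while the
         * system is going down. */
        writel(0, master->regs + FOO_IRQ_ENABLE);
}

static struct platform_driver foo_driver = {
        .shutdown = foo_shutdown,
        .driver = { .name = "foo" },
};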
Signed-off-by: Manikanta Guntupalli Link: https://lore.kernel.org/r/20250730151207.4113708-1-manikanta.guntupalli@amd.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/dw-i3c-master.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 974122b2d20ee5..9ceedf09c3b6a6 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -1737,6 +1737,28 @@ static const struct dev_pm_ops dw_i3c_pm_ops = { SET_RUNTIME_PM_OPS(dw_i3c_master_runtime_suspend, dw_i3c_master_runtime_resume, NULL) }; +static void dw_i3c_shutdown(struct platform_device *pdev) +{ + struct dw_i3c_master *master = platform_get_drvdata(pdev); + int ret; + + ret = pm_runtime_resume_and_get(master->dev); + if (ret < 0) { + dev_err(master->dev, + "<%s> cannot resume i3c bus master, err: %d\n", + __func__, ret); + return; + } + + cancel_work_sync(&master->hj_work); + + /* Disable interrupts */ + writel((u32)~INTR_ALL, master->regs + INTR_STATUS_EN); + writel((u32)~INTR_ALL, master->regs + INTR_SIGNAL_EN); + + pm_runtime_put_autosuspend(master->dev); +} + static const struct of_device_id dw_i3c_master_of_match[] = { { .compatible = "snps,dw-i3c-master-1.00a", }, {}, @@ -1752,6 +1774,7 @@ MODULE_DEVICE_TABLE(acpi, amd_i3c_device_match); static struct platform_driver dw_i3c_driver = { .probe = dw_i3c_probe, .remove = dw_i3c_remove, + .shutdown = dw_i3c_shutdown, .driver = { .name = "dw-i3c-master", .of_match_table = dw_i3c_master_of_match, From d46651d4e3c0caab554c4c591c0b6c3b026b1e93 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Thu, 4 Sep 2025 20:35:07 +0800 Subject: [PATCH 1904/3206] ipmi: Add Loongson-2K BMC support This patch adds Loongson-2K BMC IPMI support. According to the existing design, we use software simulation to implement the KCS interface registers: Status/Command/Data_Out/Data_In. Also, since both the host side and the BMC side read and write the KCS status, a FIFO flag is used to ensure data consistency. The single KCS message block is as follows: +-------------------------------------------------------------------------+ |FIFO flags| KCS register data | CMD data | KCS version | WR REQ | WR ACK | +-------------------------------------------------------------------------+ Co-developed-by: Chong Qiao Signed-off-by: Chong Qiao Reviewed-by: Huacai Chen Acked-by: Corey Minyard Signed-off-by: Binbin Zhou Message-ID: <8f9ffb6f0405345af8f04193ce1510aacd075e72.1756987761.git.zhoubinbin@loongson.cn> Signed-off-by: Corey Minyard --- drivers/char/ipmi/Kconfig | 7 ++ drivers/char/ipmi/Makefile | 1 + drivers/char/ipmi/ipmi_si.h | 7 ++ drivers/char/ipmi/ipmi_si_intf.c | 4 + drivers/char/ipmi/ipmi_si_ls2k.c | 189 +++++++++++++++++++++++++++++++ 5 files changed, 208 insertions(+) create mode 100644 drivers/char/ipmi/ipmi_si_ls2k.c diff --git a/drivers/char/ipmi/Kconfig b/drivers/char/ipmi/Kconfig index f4adc6feb3b229..92bed266d07cdd 100644 --- a/drivers/char/ipmi/Kconfig +++ b/drivers/char/ipmi/Kconfig @@ -84,6 +84,13 @@ config IPMI_IPMB bus, and it also supports direct messaging on the bus using IPMB direct messages. This module requires I2C support. +config IPMI_LS2K + bool 'Loongson-2K IPMI interface' + depends on LOONGARCH + select MFD_LS2K_BMC_CORE + help + Provides a driver for Loongson-2K IPMI interfaces.
+ config IPMI_POWERNV depends on PPC_POWERNV tristate 'POWERNV (OPAL firmware) IPMI interface' diff --git a/drivers/char/ipmi/Makefile b/drivers/char/ipmi/Makefile index e0944547c9d0ee..4ea450a82242fc 100644 --- a/drivers/char/ipmi/Makefile +++ b/drivers/char/ipmi/Makefile @@ -8,6 +8,7 @@ ipmi_si-y := ipmi_si_intf.o ipmi_kcs_sm.o ipmi_smic_sm.o ipmi_bt_sm.o \ ipmi_si_mem_io.o ipmi_si-$(CONFIG_HAS_IOPORT) += ipmi_si_port_io.o ipmi_si-$(CONFIG_PCI) += ipmi_si_pci.o +ipmi_si-$(CONFIG_IPMI_LS2K) += ipmi_si_ls2k.o ipmi_si-$(CONFIG_PARISC) += ipmi_si_parisc.o obj-$(CONFIG_IPMI_HANDLER) += ipmi_msghandler.o diff --git a/drivers/char/ipmi/ipmi_si.h b/drivers/char/ipmi/ipmi_si.h index 508c3fd4587766..687835b53da588 100644 --- a/drivers/char/ipmi/ipmi_si.h +++ b/drivers/char/ipmi/ipmi_si.h @@ -101,6 +101,13 @@ void ipmi_si_pci_shutdown(void); static inline void ipmi_si_pci_init(void) { } static inline void ipmi_si_pci_shutdown(void) { } #endif +#ifdef CONFIG_IPMI_LS2K +void ipmi_si_ls2k_init(void); +void ipmi_si_ls2k_shutdown(void); +#else +static inline void ipmi_si_ls2k_init(void) { } +static inline void ipmi_si_ls2k_shutdown(void) { } +#endif #ifdef CONFIG_PARISC void ipmi_si_parisc_init(void); void ipmi_si_parisc_shutdown(void); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 1c65275906b444..70e55f5ff85e78 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -2138,6 +2138,8 @@ static int __init init_ipmi_si(void) ipmi_si_pci_init(); + ipmi_si_ls2k_init(); + ipmi_si_parisc_init(); mutex_lock(&smi_infos_lock); @@ -2349,6 +2351,8 @@ static void cleanup_ipmi_si(void) ipmi_si_pci_shutdown(); + ipmi_si_ls2k_shutdown(); + ipmi_si_parisc_shutdown(); ipmi_si_platform_shutdown(); diff --git a/drivers/char/ipmi/ipmi_si_ls2k.c b/drivers/char/ipmi/ipmi_si_ls2k.c new file mode 100644 index 00000000000000..45442c257efdbe --- /dev/null +++ b/drivers/char/ipmi/ipmi_si_ls2k.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Driver for Loongson-2K BMC IPMI interface + * + * Copyright (C) 2024-2025 Loongson Technology Corporation Limited. 
+ * + * Authors: + * Chong Qiao + * Binbin Zhou + */ + +#include +#include +#include +#include + +#include "ipmi_si.h" + +#define LS2K_KCS_FIFO_IBFH 0x0 +#define LS2K_KCS_FIFO_IBFT 0x1 +#define LS2K_KCS_FIFO_OBFH 0x2 +#define LS2K_KCS_FIFO_OBFT 0x3 + +/* KCS registers */ +#define LS2K_KCS_REG_STS 0x4 +#define LS2K_KCS_REG_DATA_OUT 0x5 +#define LS2K_KCS_REG_DATA_IN 0x6 +#define LS2K_KCS_REG_CMD 0x8 + +#define LS2K_KCS_CMD_DATA 0xa +#define LS2K_KCS_VERSION 0xb +#define LS2K_KCS_WR_REQ 0xc +#define LS2K_KCS_WR_ACK 0x10 + +#define LS2K_KCS_STS_OBF BIT(0) +#define LS2K_KCS_STS_IBF BIT(1) +#define LS2K_KCS_STS_SMS_ATN BIT(2) +#define LS2K_KCS_STS_CMD BIT(3) + +#define LS2K_KCS_DATA_MASK (LS2K_KCS_STS_OBF | LS2K_KCS_STS_IBF | LS2K_KCS_STS_CMD) + +static bool ls2k_registered; + +static unsigned char ls2k_mem_inb_v0(const struct si_sm_io *io, unsigned int offset) +{ + void __iomem *addr = io->addr; + int reg_offset; + + if (offset & BIT(0)) { + reg_offset = LS2K_KCS_REG_STS; + } else { + writeb(readb(addr + LS2K_KCS_REG_STS) & ~LS2K_KCS_STS_OBF, addr + LS2K_KCS_REG_STS); + reg_offset = LS2K_KCS_REG_DATA_OUT; + } + + return readb(addr + reg_offset); +} + +static unsigned char ls2k_mem_inb_v1(const struct si_sm_io *io, unsigned int offset) +{ + void __iomem *addr = io->addr; + unsigned char inb = 0, cmd; + bool obf, ibf; + + obf = readb(addr + LS2K_KCS_FIFO_OBFH) ^ readb(addr + LS2K_KCS_FIFO_OBFT); + ibf = readb(addr + LS2K_KCS_FIFO_IBFH) ^ readb(addr + LS2K_KCS_FIFO_IBFT); + cmd = readb(addr + LS2K_KCS_CMD_DATA); + + if (offset & BIT(0)) { + inb = readb(addr + LS2K_KCS_REG_STS) & ~LS2K_KCS_DATA_MASK; + inb |= FIELD_PREP(LS2K_KCS_STS_OBF, obf) + | FIELD_PREP(LS2K_KCS_STS_IBF, ibf) + | FIELD_PREP(LS2K_KCS_STS_CMD, cmd); + } else { + inb = readb(addr + LS2K_KCS_REG_DATA_OUT); + writeb(readb(addr + LS2K_KCS_FIFO_OBFH), addr + LS2K_KCS_FIFO_OBFT); + } + + return inb; +} + +static void ls2k_mem_outb_v0(const struct si_sm_io *io, unsigned int offset, + unsigned char val) +{ + void __iomem *addr = io->addr; + unsigned char sts = readb(addr + LS2K_KCS_REG_STS); + int reg_offset; + + if (sts & LS2K_KCS_STS_IBF) + return; + + if (offset & BIT(0)) { + reg_offset = LS2K_KCS_REG_CMD; + sts |= LS2K_KCS_STS_CMD; + } else { + reg_offset = LS2K_KCS_REG_DATA_IN; + sts &= ~LS2K_KCS_STS_CMD; + } + + writew(val, addr + reg_offset); + writeb(sts | LS2K_KCS_STS_IBF, addr + LS2K_KCS_REG_STS); + writel(readl(addr + LS2K_KCS_WR_REQ) + 1, addr + LS2K_KCS_WR_REQ); +} + +static void ls2k_mem_outb_v1(const struct si_sm_io *io, unsigned int offset, + unsigned char val) +{ + void __iomem *addr = io->addr; + unsigned char ibfh, ibft; + int reg_offset; + + ibfh = readb(addr + LS2K_KCS_FIFO_IBFH); + ibft = readb(addr + LS2K_KCS_FIFO_IBFT); + + if (ibfh ^ ibft) + return; + + reg_offset = (offset & BIT(0)) ? LS2K_KCS_REG_CMD : LS2K_KCS_REG_DATA_IN; + writew(val, addr + reg_offset); + + writeb(offset & BIT(0), addr + LS2K_KCS_CMD_DATA); + writeb(!ibft, addr + LS2K_KCS_FIFO_IBFH); + writel(readl(addr + LS2K_KCS_WR_REQ) + 1, addr + LS2K_KCS_WR_REQ); +} + +static void ls2k_mem_cleanup(struct si_sm_io *io) +{ + if (io->addr) + iounmap(io->addr); +} + +static int ipmi_ls2k_mem_setup(struct si_sm_io *io) +{ + unsigned char version; + + io->addr = ioremap(io->addr_data, io->regspacing); + if (!io->addr) + return -EIO; + + version = readb(io->addr + LS2K_KCS_VERSION); + + io->inputb = version ? ls2k_mem_inb_v1 : ls2k_mem_inb_v0; + io->outputb = version ? 
ls2k_mem_outb_v1 : ls2k_mem_outb_v0; + io->io_cleanup = ls2k_mem_cleanup; + + return 0; +} + +static int ipmi_ls2k_probe(struct platform_device *pdev) +{ + struct si_sm_io io; + + memset(&io, 0, sizeof(io)); + + io.si_info = &ipmi_kcs_si_info; + io.io_setup = ipmi_ls2k_mem_setup; + io.addr_data = pdev->resource[0].start; + io.regspacing = resource_size(&pdev->resource[0]); + io.dev = &pdev->dev; + + dev_dbg(&pdev->dev, "addr 0x%lx, spacing %d.\n", io.addr_data, io.regspacing); + + return ipmi_si_add_smi(&io); +} + +static void ipmi_ls2k_remove(struct platform_device *pdev) +{ + ipmi_si_remove_by_dev(&pdev->dev); +} + +struct platform_driver ipmi_ls2k_platform_driver = { + .driver = { + .name = "ls2k-ipmi-si", + }, + .probe = ipmi_ls2k_probe, + .remove = ipmi_ls2k_remove, +}; + +void ipmi_si_ls2k_init(void) +{ + platform_driver_register(&ipmi_ls2k_platform_driver); + ls2k_registered = true; +} + +void ipmi_si_ls2k_shutdown(void) +{ + if (ls2k_registered) + platform_driver_unregister(&ipmi_ls2k_platform_driver); +} From 3ab1da2614e616ea62f32e4d695f1cc449089f07 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 16 Sep 2025 11:12:51 +0200 Subject: [PATCH 1905/3206] i3c: master: adi: fix header location MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The adi-axi-common header has been moved to the upper directory. Acked-by: Jorge Marques Link: https://lore.kernel.org/r/20250519-dev-axi-clkgen-limits-v6-3-bc4b3b61d1d4@analog.com Acked-by: Nuno Sá Link: https://lore.kernel.org/r/20250916091252.39265-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/adi-i3c-master.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i3c/master/adi-i3c-master.c b/drivers/i3c/master/adi-i3c-master.c index 162f9eed39aa5e..18597ba1f1c3e7 100644 --- a/drivers/i3c/master/adi-i3c-master.c +++ b/drivers/i3c/master/adi-i3c-master.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include From d0fa8751525d3aa4359de00bcbed578eab6f1d79 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 19 Aug 2025 11:58:03 +0800 Subject: [PATCH 1906/3206] backlight: led_bl: Use devm_kcalloc() for array space allocation Replace calls of devm_kzalloc() with devm_kcalloc() in led_bl_get_leds() and led_bl_parse_levels() for safer memory allocation with built-in overflow protection. 
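The overflow protection mentioned above is the n * size check built into the kcalloc() family; a brief sketch of the difference, with an invented helper name:

#include <linux/device.h>
#include <linux/slab.h>

static u32 *example_alloc_levels(struct device *dev, size_t num_levels)
{
        /*
         * devm_kcalloc() verifies that num_levels * sizeof(u32) cannot
         * wrap around and returns NULL if it would, whereas the old
         * devm_kzalloc(dev, sizeof(u32) * num_levels, ...) form performs
         * the multiplication unchecked and could silently return an
         * undersized buffer for a huge num_levels.
         */
        return devm_kcalloc(dev, num_levels, sizeof(u32), GFP_KERNEL);
}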
Signed-off-by: Qianfeng Rong Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250819035804.433615-1-rongqianfeng@vivo.com Signed-off-by: Lee Jones --- drivers/video/backlight/led_bl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/backlight/led_bl.c b/drivers/video/backlight/led_bl.c index d2db157b2c290a..dd03d91a6e5071 100644 --- a/drivers/video/backlight/led_bl.c +++ b/drivers/video/backlight/led_bl.c @@ -89,7 +89,7 @@ static int led_bl_get_leds(struct device *dev, return -EINVAL; } - leds = devm_kzalloc(dev, sizeof(struct led_classdev *) * nb_leds, + leds = devm_kcalloc(dev, nb_leds, sizeof(struct led_classdev *), GFP_KERNEL); if (!leds) return -ENOMEM; @@ -137,7 +137,7 @@ static int led_bl_parse_levels(struct device *dev, unsigned int db; u32 *levels = NULL; - levels = devm_kzalloc(dev, sizeof(u32) * num_levels, + levels = devm_kcalloc(dev, num_levels, sizeof(u32), GFP_KERNEL); if (!levels) return -ENOMEM; From cbc7f3b4f6ca19320e2eacf8fc1403d6f331ce14 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 7 Aug 2025 18:53:41 +0300 Subject: [PATCH 1907/3206] drm/xe: Fix a NULL vs IS_ERR() in xe_vm_add_compute_exec_queue() The xe_preempt_fence_create() function returns error pointers. It never returns NULL. Update the error checking to match. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Dan Carpenter Reviewed-by: Matthew Brost Link: https://lore.kernel.org/r/aJTMBdX97cof_009@stanley.mountain Signed-off-by: Rodrigo Vivi (cherry picked from commit 75cc23ffe5b422bc3cbd5cf0956b8b86e4b0e162) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_vm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index dc4f61e56579cb..5146999d27fa2d 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -240,8 +240,8 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q) pfence = xe_preempt_fence_create(q, q->lr.context, ++q->lr.seqno); - if (!pfence) { - err = -ENOMEM; + if (IS_ERR(pfence)) { + err = PTR_ERR(pfence); goto out_fini; } From ef381e17930e9c5268a53f57187c87a85676d0f7 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 10 Sep 2025 14:01:10 +0200 Subject: [PATCH 1908/3206] leds: led-class: Add Device Tree support to led_get() Add a 'name' argument to of_led_get() such that it can look up LEDs in devicetree by either name or index. And use this modified function to add devicetree support to the generic (non-devicetree-specific) [devm_]led_get() function. This uses the standard devicetree pattern of adding a -names string array to map names to the indexes for an array of resources.
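A hypothetical consumer, to make the lookup flow concrete; the "leds"/"led-names" properties are the ones handled by this patch, while the device node layout and the "status" function name are invented for the example:

/*
 * Device tree (assumed):
 *      mydev {
 *              leds = <&led_status>, <&led_power>;
 *              led-names = "status", "power";
 *      };
 */
#include <linux/err.h>
#include <linux/leds.h>

static int mydrv_claim_status_led(struct device *dev)
{
        struct led_classdev *led;

        /* Resolved via "led-names" on devicetree systems, or via the
         * static lookup table on platforms without devicetree. */
        led = devm_led_get(dev, "status");
        if (IS_ERR(led))
                return PTR_ERR(led);

        led_set_brightness(led, LED_FULL);
        return 0;
}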
Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Hans de Goede Signed-off-by: Aleksandrs Vinarskis Link: https://lore.kernel.org/r/20250910-leds-v5-3-bb90a0f897d5@vinarskis.com Signed-off-by: Lee Jones --- drivers/leds/led-class.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 15633fbf3c166a..f3faf37f9a08ac 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -252,15 +252,23 @@ static const struct class leds_class = { * of_led_get() - request a LED device via the LED framework * @np: device node to get the LED device from * @index: the index of the LED + * @name: the name of the LED used to map it to its function, if present * * Returns the LED device parsed from the phandle specified in the "leds" * property of a device tree node or a negative error-code on failure. */ -static struct led_classdev *of_led_get(struct device_node *np, int index) +static struct led_classdev *of_led_get(struct device_node *np, int index, + const char *name) { struct device *led_dev; struct device_node *led_node; + /* + * For named LEDs, first look up the name in the "led-names" property. + * If it cannot be found, then of_parse_phandle() will propagate the error. + */ + if (name) + index = of_property_match_string(np, "led-names", name); led_node = of_parse_phandle(np, "leds", index); if (!led_node) return ERR_PTR(-ENOENT); @@ -324,7 +332,7 @@ struct led_classdev *__must_check devm_of_led_get(struct device *dev, if (!dev) return ERR_PTR(-EINVAL); - led = of_led_get(dev->of_node, index); + led = of_led_get(dev->of_node, index, NULL); if (IS_ERR(led)) return led; @@ -342,9 +350,14 @@ EXPORT_SYMBOL_GPL(devm_of_led_get); struct led_classdev *led_get(struct device *dev, char *con_id) { struct led_lookup_data *lookup; + struct led_classdev *led_cdev; const char *provider = NULL; struct device *led_dev; + led_cdev = of_led_get(dev->of_node, -1, con_id); + if (!IS_ERR(led_cdev) || PTR_ERR(led_cdev) != -ENOENT) + return led_cdev; + mutex_lock(&leds_lookup_lock); list_for_each_entry(lookup, &leds_lookup_list, list) { if (!strcmp(lookup->dev_id, dev_name(dev)) && From 07c7efda24453e05951fb2879f5452b720b91169 Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Tue, 9 Sep 2025 10:43:04 +0300 Subject: [PATCH 1909/3206] video: backlight: lp855x_bl: Set correct EPROM start for LP8556 According to LP8556 datasheet EPROM region starts at 0x98 so adjust value in the driver accordingly. Signed-off-by: Svyatoslav Ryhel Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250909074304.92135-2-clamor95@gmail.com Signed-off-by: Lee Jones --- drivers/video/backlight/lp855x_bl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/backlight/lp855x_bl.c b/drivers/video/backlight/lp855x_bl.c index 7075bfab59c4dc..d191560ce285f9 100644 --- a/drivers/video/backlight/lp855x_bl.c +++ b/drivers/video/backlight/lp855x_bl.c @@ -22,7 +22,7 @@ #define LP855X_DEVICE_CTRL 0x01 #define LP855X_EEPROM_START 0xA0 #define LP855X_EEPROM_END 0xA7 -#define LP8556_EPROM_START 0xA0 +#define LP8556_EPROM_START 0x98 #define LP8556_EPROM_END 0xAF /* LP8555/7 Registers */ From b12224c28d84d054dfb680c05cda61d1e2584bf5 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 14:24:43 +0200 Subject: [PATCH 1910/3206] backlight: Include Include to avoid dependency on backlight header to include it. 
Signed-off-by: Thomas Zimmermann Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250715122643.137027-7-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/backlight.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 9dc93c5e480b40..1e9b7e85d99a2c 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_PMAC_BACKLIGHT #include From 945e411acde3800234d506f4304203a9b11890f8 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 14:24:44 +0200 Subject: [PATCH 1911/3206] backlight: apple_dwi_bl: Include Include to declare struct of_device_id. Avoids dependency on backlight header to include it. Signed-off-by: Thomas Zimmermann Reviewed-by: Nick Chan Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250715122643.137027-8-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/apple_dwi_bl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/backlight/apple_dwi_bl.c b/drivers/video/backlight/apple_dwi_bl.c index 93bd744972d60a..ed8bf13d3f512b 100644 --- a/drivers/video/backlight/apple_dwi_bl.c +++ b/drivers/video/backlight/apple_dwi_bl.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include From 6789cd935a57464deaacdd14c84bc026aa228e72 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 14:24:45 +0200 Subject: [PATCH 1912/3206] backlight: as3711_bl: Include Include to declare various OF helpers. Avoids dependency on backlight header to include it. Signed-off-by: Thomas Zimmermann Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250715122643.137027-9-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/as3711_bl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/backlight/as3711_bl.c b/drivers/video/backlight/as3711_bl.c index 9f89eb19894e39..753160bbc3e722 100644 --- a/drivers/video/backlight/as3711_bl.c +++ b/drivers/video/backlight/as3711_bl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include From e2e76f67bdbbc7b8df608e3dd1028059d838871e Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 14:24:46 +0200 Subject: [PATCH 1913/3206] backlight: da9052_bl: Include Include to declare struct platform_device_id. Avoids dependency on backlight header to include it. Signed-off-by: Thomas Zimmermann Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250715122643.137027-10-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/da9052_bl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/backlight/da9052_bl.c b/drivers/video/backlight/da9052_bl.c index f41523d78121b7..2493138febfa1d 100644 --- a/drivers/video/backlight/da9052_bl.c +++ b/drivers/video/backlight/da9052_bl.c @@ -9,6 +9,7 @@ #include #include +#include #include #include From ce4bb1a2f1cbcd5f6471f74ee5c7e1443a4cfd84 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 14:24:47 +0200 Subject: [PATCH 1914/3206] backlight: jornada720: Include Include to declare IOMEM(). Avoids dependency on backlight header to include it. 
Signed-off-by: Thomas Zimmermann Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250715122643.137027-11-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/jornada720_bl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/backlight/jornada720_bl.c b/drivers/video/backlight/jornada720_bl.c index e28d2c07179894..bbb65fdaddc79e 100644 --- a/drivers/video/backlight/jornada720_bl.c +++ b/drivers/video/backlight/jornada720_bl.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include From 5f60004f152b432c6ae5dbacc172adc1fa215825 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 14:24:48 +0200 Subject: [PATCH 1915/3206] backlight: ktd2801: Include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include to declare struct of_device_id. Avoids dependency on backlight header to include it. Signed-off-by: Thomas Zimmermann Reviewed-by: Nick Chan Reviewed-by: "Daniel Thompson (RISCstar)" Acked-by: Duje Mihanović Link: https://lore.kernel.org/r/20250715122643.137027-12-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/ktd2801-backlight.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/backlight/ktd2801-backlight.c b/drivers/video/backlight/ktd2801-backlight.c index 0489b0615cebc9..17eac1b3bce4ad 100644 --- a/drivers/video/backlight/ktd2801-backlight.c +++ b/drivers/video/backlight/ktd2801-backlight.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include From b38ed7c05a35f3a029c2fc5e43a94aa81e2ac843 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 14:24:49 +0200 Subject: [PATCH 1916/3206] backlight: led_bl: Include Include to declare struct of_count_phandle_with_args(). Avoids dependency on backlight header to include it. Signed-off-by: Thomas Zimmermann Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250715122643.137027-13-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/led_bl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/backlight/led_bl.c b/drivers/video/backlight/led_bl.c index dd03d91a6e5071..efc5e380669aea 100644 --- a/drivers/video/backlight/led_bl.c +++ b/drivers/video/backlight/led_bl.c @@ -9,6 +9,7 @@ #include #include #include +#include #include struct led_bl_data { From 246da2b48e2ce973db255fc4b6faf42f73c03114 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 14:24:50 +0200 Subject: [PATCH 1917/3206] backlight: rave-sp: Include and Include to declare struct device_node and include to declare struct of_device_id. Avoids dependency on backlight header to include it. 
Signed-off-by: Thomas Zimmermann Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250715122643.137027-14-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/rave-sp-backlight.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/backlight/rave-sp-backlight.c b/drivers/video/backlight/rave-sp-backlight.c index e708a060a6e46f..bfe01b9b9174c2 100644 --- a/drivers/video/backlight/rave-sp-backlight.c +++ b/drivers/video/backlight/rave-sp-backlight.c @@ -9,8 +9,10 @@ #include #include +#include #include #include +#include #include #define RAVE_SP_BACKLIGHT_LCD_EN BIT(7) From ba3b29a639fe5173033914db6ee58d8d9bb86aba Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 14:24:51 +0200 Subject: [PATCH 1918/3206] backlight: rt4831: Include Include to declare struct of_device_id. Avoids dependency on backlight header to include it. Signed-off-by: Thomas Zimmermann Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250715122643.137027-15-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/rt4831-backlight.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/backlight/rt4831-backlight.c b/drivers/video/backlight/rt4831-backlight.c index 7ead75929a437a..26214519bfcee1 100644 --- a/drivers/video/backlight/rt4831-backlight.c +++ b/drivers/video/backlight/rt4831-backlight.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include From f0bd03832f5c84f90919bd018156b1b6eb911692 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 10 Sep 2025 19:11:06 +0800 Subject: [PATCH 1919/3206] md: init queue_limits->max_hw_wzeroes_unmap_sectors parameter The parameter max_hw_wzeroes_unmap_sectors in queue_limits should be equal to max_write_zeroes_sectors if it is set to a non-zero value. However, the stacked md drivers call md_init_stacking_limits() to initialize this parameter to UINT_MAX but only adjust max_write_zeroes_sectors when setting limits. Therefore, this discrepancy triggers a value check failure in blk_validate_limits(). $ modprobe scsi_debug num_parts=2 dev_size_mb=8 lbprz=1 lbpws=1 $ mdadm --create /dev/md0 --level=0 --raid-device=2 /dev/sda1 /dev/sda2 mdadm: Defaulting to version 1.2 metadata mdadm: RUN_ARRAY failed: Invalid argument Fix this failure by explicitly setting max_hw_wzeroes_unmap_sectors to max_write_zeroes_sectors. Since the linear and raid0 drivers support write zeroes, they can support the unmap write zeroes operation if all of the backend devices support it. However, the raid1/10/5 drivers don't support write zeroes, so we have to set it to zero. Fixes: 0c40d7cb5ef3 ("block: introduce max_{hw|user}_wzeroes_unmap_sectors to queue limits") Reported-by: John Garry Closes: https://lore.kernel.org/linux-block/803a2183-a0bb-4b7a-92f1-afc5097630d2@oracle.com/ Signed-off-by: Zhang Yi Tested-by: John Garry Reviewed-by: Li Nan Reviewed-by: Martin K.
Petersen Reviewed-by: Yu Kuai Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/linux-raid/20250910111107.3247530-2-yi.zhang@huaweicloud.com Signed-off-by: Yu Kuai --- drivers/md/md-linear.c | 1 + drivers/md/raid0.c | 1 + drivers/md/raid1.c | 1 + drivers/md/raid10.c | 1 + drivers/md/raid5.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 5d9b081153757e..3e1f165c2d20f6 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -73,6 +73,7 @@ static int linear_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f1d8811a542ae2..419139ad7663cc 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -382,6 +382,7 @@ static int raid0_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; lim.chunk_sectors = mddev->chunk_sectors; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bf44878ec640e5..d30b82beeb92fb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3211,6 +3211,7 @@ static int raid1_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b60c30bfb6c794..9832eefb2f157b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4008,6 +4008,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.chunk_sectors = mddev->chunk_sectors; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 023649fe2476f3..e385ef1355e8b3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7732,6 +7732,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; mddev_stack_rdev_limits(mddev, &lim, 0); rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, From b4f7a727c29cd1fa6149a81f16408ee7c1f7de0c Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Tue, 9 Sep 2025 17:34:32 +0300 Subject: [PATCH 1920/3206] dt-bindings: power: supply: bq27xxx: document optional interrupt Document an optional interrupt found in some controllers of BQ27xxx series. The pin to which the interrupt is connected is called SOC_INT or GPOUT. 
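On the driver side, an optional interrupt like this is typically consumed as sketched below; the handler name and its action are invented, and this is a generic pattern rather than the actual bq27xxx code:

#include <linux/i2c.h>
#include <linux/interrupt.h>

static irqreturn_t fg_soc_int_thread(int irq, void *data)
{
        /* SOC_INT/GPOUT fired: re-read the gauge state and notify
         * the power-supply core here. */
        return IRQ_HANDLED;
}

static int fg_probe(struct i2c_client *client)
{
        /* The binding marks the interrupt optional, so request it only
         * when the firmware described one; otherwise fall back to
         * polling. */
        if (!client->irq)
                return 0;

        return devm_request_threaded_irq(&client->dev, client->irq,
                                         NULL, fg_soc_int_thread,
                                         IRQF_ONESHOT, "fuel-gauge",
                                         client);
}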
Signed-off-by: Svyatoslav Ryhel Reviewed-by: Rob Herring (Arm) Signed-off-by: Sebastian Reichel --- .../bindings/power/supply/bq27xxx.yaml | 37 +++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/power/supply/bq27xxx.yaml b/Documentation/devicetree/bindings/power/supply/bq27xxx.yaml index 309ea33b5b259d..bc05400186cf1c 100644 --- a/Documentation/devicetree/bindings/power/supply/bq27xxx.yaml +++ b/Documentation/devicetree/bindings/power/supply/bq27xxx.yaml @@ -16,9 +16,6 @@ description: | Support various Texas Instruments fuel gauge devices that share similar register maps and power supply properties -allOf: - - $ref: power-supply.yaml# - properties: compatible: enum: @@ -58,6 +55,10 @@ properties: maxItems: 1 description: integer, I2C address of the fuel gauge. + interrupts: + maxItems: 1 + description: the SOC_INT or GPOUT pin + monitored-battery: description: | The fuel gauge uses the following battery properties: @@ -68,6 +69,36 @@ properties: power-supplies: true +allOf: + - $ref: power-supply.yaml# + - if: + properties: + compatible: + contains: + enum: + - ti,bq27200 + - ti,bq27210 + - ti,bq27500 # deprecated, use revision specific property below + - ti,bq27510 # deprecated, use revision specific property below + - ti,bq27520 # deprecated, use revision specific property below + - ti,bq27500-1 + - ti,bq27510g1 + - ti,bq27510g2 + - ti,bq27521 + - ti,bq27541 + - ti,bq27542 + - ti,bq27546 + - ti,bq27742 + - ti,bq27545 + - ti,bq27411 + - ti,bq27z561 + - ti,bq28z610 + - ti,bq34z100 + - ti,bq78z100 + then: + properties: + interrupts: false + required: - compatible - reg From d0759b10989c5c5aae3d455458c9fc4e8cc694f7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Sep 2025 20:21:33 +0200 Subject: [PATCH 1921/3206] ACPI: property: Fix buffer properties extraction for subnodes The ACPI handle passed to acpi_extract_properties() as the first argument represents the ACPI namespace scope in which to look for objects returning buffers associated with buffer properties. For _DSD objects located immediately under ACPI devices, this handle is the same as the handle of the device object holding the _DSD, but for data-only subnodes it is not so. First of all, data-only subnodes are represented by objects that cannot hold other objects in their scopes (like control methods). Therefore a data-only subnode handle cannot be used for completing relative pathname segments, so the current code in acpi_nondev_subnode_extract() passing a data-only subnode handle to acpi_extract_properties() is invalid. Moreover, a data-only subnode of device A may be represented by an object located in the scope of device B (which kind of makes sense, for instance, if A is B's child). In that case, the scope in question would be the one of device B. In other words, the scope mentioned above is the same as the scope used for subnode object lookup in acpi_nondev_subnode_extract(). Accordingly, rearrange that function to use the same scope for the extraction of properties and subnode object lookup. Fixes: 103e10c69c61 ("ACPI: property: Add support for parsing buffer property UUID") Cc: 6.0+ # 6.0+ Signed-off-by: Rafael J.
Wysocki Reviewed-by: Sakari Ailus Tested-by: Sakari Ailus --- drivers/acpi/property.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 436019d96027bd..83b03dce576c50 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -83,6 +83,7 @@ static bool acpi_nondev_subnode_extract(union acpi_object *desc, struct fwnode_handle *parent) { struct acpi_data_node *dn; + acpi_handle scope = NULL; bool result; if (acpi_graph_ignore_port(handle)) @@ -98,27 +99,18 @@ static bool acpi_nondev_subnode_extract(union acpi_object *desc, INIT_LIST_HEAD(&dn->data.properties); INIT_LIST_HEAD(&dn->data.subnodes); - result = acpi_extract_properties(handle, desc, &dn->data); - - if (handle) { - acpi_handle scope; - acpi_status status; + /* + * The scope for the completion of relative pathname segments and + * subnode object lookup is the one of the namespace node (device) + * containing the object that has returned the package. That is, it's + * the scope of that object's parent device. + */ + if (handle) + acpi_get_parent(handle, &scope); - /* - * The scope for the subnode object lookup is the one of the - * namespace node (device) containing the object that has - * returned the package. That is, it's the scope of that - * object's parent. - */ - status = acpi_get_parent(handle, &scope); - if (ACPI_SUCCESS(status) - && acpi_enumerate_nondev_subnodes(scope, desc, &dn->data, - &dn->fwnode)) - result = true; - } else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data, - &dn->fwnode)) { + result = acpi_extract_properties(scope, desc, &dn->data); + if (acpi_enumerate_nondev_subnodes(scope, desc, &dn->data, &dn->fwnode)) result = true; - } if (result) { dn->handle = handle; From d06118fe9b03426484980ed4c189a8c7b99fa631 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Sep 2025 20:23:28 +0200 Subject: [PATCH 1922/3206] ACPI: property: Disregard references in data-only subnode lists Data-only subnode links following the ACPI data subnode GUID in a _DSD package are expected to point to named objects returning _DSD-equivalent packages. If a reference to such an object is used in the target field of any of those links, that object will be evaluated in place (as a named object) and its return data will be embedded in the outer _DSD package. For this reason, it is not expected to see a subnode link with the target field containing a local reference (that would mean pointing to a device or another object that cannot be evaluated in place and therefore cannot return a _DSD-equivalent package). Accordingly, simplify the code parsing data-only subnode links to simply print a message when it encounters a local reference in the target field of one of those links. Moreover, since acpi_nondev_subnode_data_ok() would only have one caller after the change above, fold it into that caller. Link: https://lore.kernel.org/linux-acpi/CAJZ5v0jVeSrDO6hrZhKgRZrH=FpGD4vNUjFD8hV9WwN9TLHjzQ@mail.gmail.com/ Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Sakari Ailus Tested-by: Sakari Ailus --- drivers/acpi/property.c | 51 ++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 83b03dce576c50..5b7c10a2824d68 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -124,32 +124,12 @@ static bool acpi_nondev_subnode_extract(union acpi_object *desc, return false; } -static bool acpi_nondev_subnode_data_ok(acpi_handle handle, - const union acpi_object *link, - struct list_head *list, - struct fwnode_handle *parent) -{ - struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; - acpi_status status; - - status = acpi_evaluate_object_typed(handle, NULL, NULL, &buf, - ACPI_TYPE_PACKAGE); - if (ACPI_FAILURE(status)) - return false; - - if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list, - parent)) - return true; - - ACPI_FREE(buf.pointer); - return false; -} - static bool acpi_nondev_subnode_ok(acpi_handle scope, const union acpi_object *link, struct list_head *list, struct fwnode_handle *parent) { + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; acpi_handle handle; acpi_status status; @@ -161,7 +141,17 @@ static bool acpi_nondev_subnode_ok(acpi_handle scope, if (ACPI_FAILURE(status)) return false; - return acpi_nondev_subnode_data_ok(handle, link, list, parent); + status = acpi_evaluate_object_typed(handle, NULL, NULL, &buf, + ACPI_TYPE_PACKAGE); + if (ACPI_FAILURE(status)) + return false; + + if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list, + parent)) + return true; + + ACPI_FREE(buf.pointer); + return false; } static bool acpi_add_nondev_subnodes(acpi_handle scope, @@ -174,7 +164,6 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope, for (i = 0; i < links->package.count; i++) { union acpi_object *link, *desc; - acpi_handle handle; bool result; link = &links->package.elements[i]; @@ -186,22 +175,26 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope, if (link->package.elements[0].type != ACPI_TYPE_STRING) continue; - /* The second one may be a string, a reference or a package. */ + /* The second one may be a string or a package. */ switch (link->package.elements[1].type) { case ACPI_TYPE_STRING: result = acpi_nondev_subnode_ok(scope, link, list, parent); break; - case ACPI_TYPE_LOCAL_REFERENCE: - handle = link->package.elements[1].reference.handle; - result = acpi_nondev_subnode_data_ok(handle, link, list, - parent); - break; case ACPI_TYPE_PACKAGE: desc = &link->package.elements[1]; result = acpi_nondev_subnode_extract(desc, NULL, link, list, parent); break; + case ACPI_TYPE_LOCAL_REFERENCE: + /* + * It is not expected to see any local references in + * the links package because referencing a named object + * should cause it to be evaluated in place. + */ + acpi_handle_info(scope, "subnode %s: Unexpected reference\n", + link->package.elements[0].string.pointer); + fallthrough; default: result = false; break; From 737c3a09dcf69ba2814f3674947ccaec1861c985 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Sep 2025 20:25:13 +0200 Subject: [PATCH 1923/3206] ACPI: property: Add code comments explaining what is going on In some places in the ACPI device properties handling code, it is unclear why the code is what it is. Some assumptions are not documented and some pieces of code are based on knowledge that is not mentioned anywhere. Add code comments explaining these things. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Sakari Ailus Tested-by: Sakari Ailus --- drivers/acpi/property.c | 46 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 5b7c10a2824d68..f4776a4085e064 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -108,7 +108,18 @@ static bool acpi_nondev_subnode_extract(union acpi_object *desc, if (handle) acpi_get_parent(handle, &scope); + /* + * Extract properties from the _DSD-equivalent package pointed to by + * desc and use scope (if not NULL) for the completion of relative + * pathname segments. + * + * The extracted properties will be held in the new data node dn. + */ result = acpi_extract_properties(scope, desc, &dn->data); + /* + * Look for subnodes in the _DSD-equivalent package pointed to by desc + * and create child nodes of dn if there are any. + */ if (acpi_enumerate_nondev_subnodes(scope, desc, &dn->data, &dn->fwnode)) result = true; @@ -133,6 +144,12 @@ static bool acpi_nondev_subnode_ok(acpi_handle scope, acpi_handle handle; acpi_status status; + /* + * If the scope is unknown, the _DSD-equivalent package being parsed + * was embedded in an outer _DSD-equivalent package as a result of + * direct evaluation of an object pointed to by a reference. In that + * case, using a pathname as the target object pointer is invalid. + */ if (!scope) return false; @@ -162,6 +179,10 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope, bool ret = false; int i; + /* + * Every element in the links package is expected to represent a link + * to a non-device node in a tree containing device-specific data. + */ for (i = 0; i < links->package.count; i++) { union acpi_object *link, *desc; bool result; @@ -171,17 +192,38 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope, if (link->package.count != 2) continue; - /* The first one must be a string. */ + /* The first one (the key) must be a string. */ if (link->package.elements[0].type != ACPI_TYPE_STRING) continue; - /* The second one may be a string or a package. */ + /* The second one (the target) may be a string or a package. */ switch (link->package.elements[1].type) { case ACPI_TYPE_STRING: + /* + * The string is expected to be a full pathname or a + * pathname segment relative to the given scope. That + * pathname is expected to point to an object returning + * a package that contains _DSD-equivalent information. + */ result = acpi_nondev_subnode_ok(scope, link, list, parent); break; case ACPI_TYPE_PACKAGE: + /* + * This happens when a reference is used in AML to + * point to the target. Since the target is expected + * to be a named object, a reference to it will cause it + * to be evaluated in place and its return package will + * be embedded in the links package at the location of + * the reference. + * + * The target package is expected to contain _DSD- + * equivalent information, but the scope in which it + * is located in the original AML is unknown. Thus + * it cannot contain pathname segments represented as + * strings because there is no way to build full + * pathnames out of them. + */ desc = &link->package.elements[1]; result = acpi_nondev_subnode_extract(desc, NULL, link, list, parent); From baf60d5cb8bc6b85511c5df5f0ad7620bb66d23c Mon Sep 17 00:00:00 2001 From: "Rafael J.
Wysocki" Date: Mon, 15 Sep 2025 20:28:52 +0200 Subject: [PATCH 1924/3206] ACPI: property: Do not pass NULL handles to acpi_attach_data() In certain circumstances, the ACPI handle of a data-only node may be NULL, in which case it does not make sense to attempt to attach that node to an ACPI namespace object, so update the code to avoid attempts to do so. This prevents confusing and unuseful error messages from being printed. Also document the fact that the ACPI handle of a data-only node may be NULL and when that happens in a code comment. In addition, make acpi_add_nondev_subnodes() print a diagnostic message for each data-only node with an unknown ACPI namespace scope. Fixes: 1d52f10917a7 ("ACPI: property: Tie data nodes to acpi handles") Cc: 6.0+ # 6.0+ Signed-off-by: Rafael J. Wysocki Reviewed-by: Sakari Ailus Tested-by: Sakari Ailus --- drivers/acpi/property.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index f4776a4085e064..c086786fe84cb4 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -124,6 +124,10 @@ static bool acpi_nondev_subnode_extract(union acpi_object *desc, result = true; if (result) { + /* + * This will be NULL if the desc package is embedded in an outer + * _DSD-equivalent package and its scope cannot be determined. + */ dn->handle = handle; dn->data.pointer = desc; list_add_tail(&dn->sibling, list); @@ -224,6 +228,8 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope, * strings because there is no way to build full * pathnames out of them. */ + acpi_handle_debug(scope, "subnode %s: Unknown scope\n", + link->package.elements[0].string.pointer); desc = &link->package.elements[1]; result = acpi_nondev_subnode_extract(desc, NULL, link, list, parent); @@ -396,6 +402,9 @@ static void acpi_untie_nondev_subnodes(struct acpi_device_data *data) struct acpi_data_node *dn; list_for_each_entry(dn, &data->subnodes, sibling) { + if (!dn->handle) + continue; + acpi_detach_data(dn->handle, acpi_nondev_subnode_tag); acpi_untie_nondev_subnodes(&dn->data); @@ -410,6 +419,9 @@ static bool acpi_tie_nondev_subnodes(struct acpi_device_data *data) acpi_status status; bool ret; + if (!dn->handle) + continue; + status = acpi_attach_data(dn->handle, acpi_nondev_subnode_tag, dn); if (ACPI_FAILURE(status) && status != AE_ALREADY_EXISTS) { acpi_handle_err(dn->handle, "Can't tag data node\n"); From 0f83b1d436c98083dcbf7f3e6204015ed4a46743 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Sep 2025 20:29:50 +0200 Subject: [PATCH 1925/3206] ACPI: property: Adjust failure handling in acpi_nondev_subnode_extract() Make acpi_nondev_subnode_extract() follow the usual code flow pattern in which failure is handled at the point where it is detected. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Sakari Ailus Tested-by: Sakari Ailus --- drivers/acpi/property.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index c086786fe84cb4..54baa23a9e5ae3 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -123,20 +123,21 @@ static bool acpi_nondev_subnode_extract(union acpi_object *desc, if (acpi_enumerate_nondev_subnodes(scope, desc, &dn->data, &dn->fwnode)) result = true; - if (result) { - /* - * This will be NULL if the desc package is embedded in an outer - * _DSD-equivalent package and its scope cannot be determined. 
- */ - dn->handle = handle; - dn->data.pointer = desc; - list_add_tail(&dn->sibling, list); - return true; + if (!result) { + kfree(dn); + acpi_handle_debug(handle, "Invalid properties/subnodes data, skipping\n"); + return false; } - kfree(dn); - acpi_handle_debug(handle, "Invalid properties/subnodes data, skipping\n"); - return false; + /* + * This will be NULL if the desc package is embedded in an outer + * _DSD-equivalent package and its scope cannot be determined. + */ + dn->handle = handle; + dn->data.pointer = desc; + list_add_tail(&dn->sibling, list); + + return true; } static bool acpi_nondev_subnode_ok(acpi_handle scope, From 3ee4211ef8693377f67624a46b4f8577f6a495d9 Mon Sep 17 00:00:00 2001 From: pengdonglin Date: Tue, 16 Sep 2025 12:47:29 +0800 Subject: [PATCH 1926/3206] cgroup: Remove redundant rcu_read_lock/unlock() in spin_lock Since commit a8bb74acd8efe ("rcu: Consolidate RCU-sched update-side function definitions") there is no difference between rcu_read_lock(), rcu_read_lock_bh() and rcu_read_lock_sched() in terms of the RCU read section and the relevant grace period. That means that spin_lock(), which implies rcu_read_lock_sched(), also implies rcu_read_lock(). There is no need to explicitly start an RCU read section if one has already been started implicitly by spin_lock(). Simplify the code and remove the inner rcu_read_lock() invocation. Cc: Tejun Heo Cc: Johannes Weiner Cc: Waiman Long Signed-off-by: pengdonglin Signed-off-by: pengdonglin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 -- kernel/cgroup/debug.c | 4 ---- 2 files changed, 6 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index fed701df116707..dcf8dc9f6343cb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3025,14 +3025,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, /* look up all src csets */ spin_lock_irq(&css_set_lock); - rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 80aa3f027ac3b1..81ea38dd6f9d27 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) return -ENODEV; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); @@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, css, css->id); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); cgroup_kn_unlock(of->kn); return 0; @@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) return -ENOMEM; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name_buf); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; From 58ab6d25a1bfca42510979cb2b6921f1c807bd02 Mon Sep 17 00:00:00 2001 From: pengdonglin Date: Tue, 16 Sep 2025 12:47:30 +0800 Subject: [PATCH 1927/3206] cgroup/cpuset: Remove redundant
rcu_read_lock/unlock() in spin_lock Since commit a8bb74acd8efe ("rcu: Consolidate RCU-sched update-side function definitions") there is no difference between rcu_read_lock(), rcu_read_lock_bh() and rcu_read_lock_sched() in terms of the RCU read section and the relevant grace period. That means that spin_lock(), which implies rcu_read_lock_sched(), also implies rcu_read_lock(). There is no need to explicitly start an RCU read section if one has already been started implicitly by spin_lock(). Simplify the code and remove the inner rcu_read_lock() invocation. Cc: Waiman Long Cc: Johannes Weiner Acked-by: Waiman Long Signed-off-by: pengdonglin Signed-off-by: pengdonglin Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0d41b4993f8cf4..caa885823eebc7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4107,7 +4107,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) struct cpuset *cs; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = task_cs(tsk); if (cs != &top_cpuset) @@ -4129,7 +4128,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) cpumask_copy(pmask, possible_mask); } - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } @@ -4202,9 +4200,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return mask; @@ -4299,10 +4295,8 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return allowed; From 220928e52cb03d223b3acad3888baf0687486d21 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 18 Aug 2025 20:21:18 +0100 Subject: [PATCH 1928/3206] arm64/hwcap: Add hwcap for FEAT_LSFE FEAT_LSFE (Large System Float Extension), providing atomic floating point memory operations, is optional from v9.5. This feature adds no new architectural state and we have no immediate use for it in the kernel, so simply provide a hwcap for it to support discovery by userspace. Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- Documentation/arch/arm64/elf_hwcaps.rst | 4 ++++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpufeature.c | 2 ++ arch/arm64/kernel/cpuinfo.c | 1 + 5 files changed, 9 insertions(+) diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index f58ada4d6cb2fd..a15df49568498f 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -441,6 +441,10 @@ HWCAP3_MTE_FAR HWCAP3_MTE_STORE_ONLY Functionality implied by ID_AA64PFR2_EL1.MTESTOREONLY == 0b0001. +HWCAP3_LSFE + Functionality implied by ID_AA64ISAR3_EL1.LSFE == 0b0001 + + 4.
Unused AT_HWCAP bits ----------------------- diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 13f94c8ddfc03b..6d567265467ccc 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -178,6 +178,7 @@ #define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128) #define KERNEL_HWCAP_MTE_FAR __khwcap3_feature(MTE_FAR) #define KERNEL_HWCAP_MTE_STORE_ONLY __khwcap3_feature(MTE_STORE_ONLY) +#define KERNEL_HWCAP_LSFE __khwcap3_feature(LSFE) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 72c78468b806b5..575564ecdb0b78 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -145,5 +145,6 @@ */ #define HWCAP3_MTE_FAR (1UL << 0) #define HWCAP3_MTE_STORE_ONLY (1UL << 1) +#define HWCAP3_LSFE (1UL << 2) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9ad065f15f1d61..b1219f14459f49 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -278,6 +278,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -3252,6 +3253,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM), HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT), HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX), + HWCAP_CAP(ID_AA64ISAR3_EL1, LSFE, IMP, CAP_HWCAP, KERNEL_HWCAP_LSFE), HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index ba834909a28bd0..c44e6d94f5deb1 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -162,6 +162,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_SMOP4] = "smesmop4", [KERNEL_HWCAP_MTE_FAR] = "mtefar", [KERNEL_HWCAP_MTE_STORE_ONLY] = "mtestoreonly", + [KERNEL_HWCAP_LSFE] = "lsfe", }; #ifdef CONFIG_COMPAT From 47687aa4d9c91a9d9b1dbae40c242cd291030bd9 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:15 -0500 Subject: [PATCH 1929/3206] arm64: probes: Break ret out from bl/blr Prepare for GCS by breaking RET out into its own function, where it makes more sense to encapsulate the new behavior independent from the branch instructions. 
Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/decode-insn.c | 7 ++++--- arch/arm64/kernel/probes/simulate-insn.c | 10 +++++++++- arch/arm64/kernel/probes/simulate-insn.h | 3 ++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 6438bf62e753f3..4137cc5ef031f6 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -108,9 +108,10 @@ arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) aarch64_insn_is_bl(insn)) { api->handler = simulate_b_bl; } else if (aarch64_insn_is_br(insn) || - aarch64_insn_is_blr(insn) || - aarch64_insn_is_ret(insn)) { - api->handler = simulate_br_blr_ret; + aarch64_insn_is_blr(insn)) { + api->handler = simulate_br_blr; + } else if (aarch64_insn_is_ret(insn)) { + api->handler = simulate_ret; } else { /* * Instruction cannot be stepped out-of-line and we don't diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 4c6d2d712fbd3c..09a0b36122d010 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -126,7 +126,7 @@ simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs) } void __kprobes -simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs) +simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) { int xn = (opcode >> 5) & 0x1f; @@ -138,6 +138,14 @@ simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) set_x_reg(regs, 30, addr + 4); } +void __kprobes +simulate_ret(u32 opcode, long addr, struct pt_regs *regs) +{ + int xn = (opcode >> 5) & 0x1f; + + instruction_pointer_set(regs, get_x_reg(regs, xn)); +} + void __kprobes simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs) { diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index efb2803ec943d6..9e772a292d565d 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -11,7 +11,8 @@ void simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs); -void simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs); +void simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs); +void simulate_ret(u32 opcode, long addr, struct pt_regs *regs); void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); From ea920b50ac9ff13ef0282428bd80395ea134a26c Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:16 -0500 Subject: [PATCH 1930/3206] arm64: uaccess: Move existing GCS accessor definitions to gcs.h We are going to add some additional GCS access helpers to gcs.h in order to avoid some forward reference problems with uaccess. In preparation for that, let's move the existing gcssttr() and put_user_gcs() routines into gcs.h, where it makes sense to keep all the accessors together. Further, the code which uses them already includes gcs.h and there is an existing CONFIG_ARM64_GCS check we can reuse. The GCSSTTR instruction description comment is corrected during the move.
Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- arch/arm64/include/asm/gcs.h | 37 ++++++++++++++++++++++++++++- arch/arm64/include/asm/uaccess.h | 40 -------------------------------- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 5bc432234d3aba..10c68d3e6e3073 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -21,7 +21,7 @@ static inline void gcsstr(u64 *addr, u64 val) register u64 *_addr __asm__ ("x0") = addr; register long _val __asm__ ("x1") = val; - /* GCSSTTR x1, x0 */ + /* GCSSTTR x1, [x0] */ asm volatile( ".inst 0xd91f1c01\n" : @@ -81,6 +81,41 @@ static inline int gcs_check_locked(struct task_struct *task, return 0; } +static inline int gcssttr(unsigned long __user *addr, unsigned long val) +{ + register unsigned long __user *_addr __asm__ ("x0") = addr; + register unsigned long _val __asm__ ("x1") = val; + int err = 0; + + /* GCSSTTR x1, [x0] */ + asm volatile( + "1: .inst 0xd91f1c01\n" + "2: \n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) + : "+r" (err) + : "rZ" (_val), "r" (_addr) + : "memory"); + + return err; +} + +static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, + int *err) +{ + int ret; + + if (!access_ok((char __user *)addr, sizeof(u64))) { + *err = -EFAULT; + return; + } + + uaccess_ttbr0_enable(); + ret = gcssttr(addr, val); + if (ret != 0) + *err = ret; + uaccess_ttbr0_disable(); +} + #else static inline bool task_gcs_el0_enabled(struct task_struct *task) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 5b91803201ef88..1aa4ecb73429fe 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -502,44 +502,4 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr, #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ -#ifdef CONFIG_ARM64_GCS - -static inline int gcssttr(unsigned long __user *addr, unsigned long val) -{ - register unsigned long __user *_addr __asm__ ("x0") = addr; - register unsigned long _val __asm__ ("x1") = val; - int err = 0; - - /* GCSSTTR x1, x0 */ - asm volatile( - "1: .inst 0xd91f1c01\n" - "2: \n" - _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) - : "+r" (err) - : "rZ" (_val), "r" (_addr) - : "memory"); - - return err; -} - -static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, - int *err) -{ - int ret; - - if (!access_ok((char __user *)addr, sizeof(u64))) { - *err = -EFAULT; - return; - } - - uaccess_ttbr0_enable(); - ret = gcssttr(addr, val); - if (ret != 0) - *err = ret; - uaccess_ttbr0_disable(); -} - - -#endif /* CONFIG_ARM64_GCS */ - #endif /* __ASM_UACCESS_H */ From 030b3ffbdac75005ef73af752a42cd48c7bba155 Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Wed, 3 Sep 2025 17:52:07 -0700 Subject: [PATCH 1931/3206] arm64: mm: Cast start/end markers to char *, not u64 There are a few memset() calls in map_kernel.c that cast marker-symbol addresses to u64 in order to perform pointer subtraction (range size computation). Cast them with (char *) instead, aligning with idiomatic C pointer arithmetic. This patch provably has no effect at runtime: I have verified that .text of vmlinux is identical after this change. 
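As a standalone sketch of the idiom (not taken from this patch; the marker names below are made up), subtracting two char pointers yields the distance in bytes directly, which is exactly the size argument that memset() expects:

	/* hypothetical linker-provided markers delimiting a region */
	extern char region_start[], region_end[];

	static void wipe_region(void)
	{
		/* a (char *) difference is already a byte count; no u64 round-trip */
		memset(region_start, 0, (char *)region_end - (char *)region_start);
	}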
Signed-off-by: Sam Edwards Signed-off-by: Will Deacon --- arch/arm64/kernel/pi/map_kernel.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 0f4bd77718590c..2b304786023073 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -179,7 +179,7 @@ static void __init remap_idmap_for_lpa2(void) * Don't bother with the FDT, we no longer need it after this. */ memset(init_idmap_pg_dir, 0, - (u64)init_idmap_pg_end - (u64)init_idmap_pg_dir); + (char *)init_idmap_pg_end - (char *)init_idmap_pg_dir); create_init_idmap(init_idmap_pg_dir, mask); dsb(ishst); @@ -188,7 +188,7 @@ static void __init remap_idmap_for_lpa2(void) set_ttbr0_for_lpa2((u64)init_idmap_pg_dir); /* wipe the temporary ID map from memory */ - memset(init_pg_dir, 0, (u64)init_pg_end - (u64)init_pg_dir); + memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); } static void __init map_fdt(u64 fdt) @@ -242,7 +242,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) map_fdt((u64)fdt); /* Clear BSS and the initial page tables */ - memset(__bss_start, 0, (u64)init_pg_end - (u64)__bss_start); + memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); /* Parse the command line for CPU feature overrides */ chosen = fdt_path_offset(fdt, chosen_str); From c56aa9a67a0853ffcf64ebe7f1dbe5a5a7c315cc Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Wed, 3 Sep 2025 17:52:08 -0700 Subject: [PATCH 1932/3206] arm64: mm: Make map_fdt() return mapped pointer Currently map_fdt() accepts a physical address and relies on the caller to keep using the same value after mapping, since the implementation happens to install an identity mapping. This obscures the fact that the usable pointer is defined by the mapping, not by the input value. Since the mapping determines pointer validity, it is more natural to produce the pointer at mapping time. Change map_fdt() to return a void * pointing to the mapped FDT. This clarifies the data flow, removes the implicit identity assumption, and prepares for making map_fdt() accept a phys_addr_t in a follow-up change. 
Signed-off-by: Sam Edwards Signed-off-by: Will Deacon --- arch/arm64/kernel/pi/map_kernel.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 2b304786023073..5dc4107b5a7f0f 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -191,7 +191,7 @@ static void __init remap_idmap_for_lpa2(void) memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); } -static void __init map_fdt(u64 fdt) +static void *__init map_fdt(u64 fdt) { static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); u64 efdt = fdt + MAX_FDT_SIZE; @@ -205,6 +205,8 @@ static void __init map_fdt(u64 fdt) fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)init_idmap_pg_dir, false, 0); dsb(ishst); + + return (void *)fdt; } /* @@ -238,15 +240,14 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) int root_level = 4 - CONFIG_PGTABLE_LEVELS; int va_bits = VA_BITS; int chosen; - - map_fdt((u64)fdt); + void *fdt_mapped = map_fdt((u64)fdt); /* Clear BSS and the initial page tables */ memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); /* Parse the command line for CPU feature overrides */ - chosen = fdt_path_offset(fdt, chosen_str); - init_feature_override(boot_status, fdt, chosen); + chosen = fdt_path_offset(fdt_mapped, chosen_str); + init_feature_override(boot_status, fdt_mapped, chosen); if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) { va_bits = VA_BITS_MIN; @@ -266,7 +267,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) * fill in the high bits from the seed. */ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - u64 kaslr_seed = kaslr_early_init(fdt, chosen); + u64 kaslr_seed = kaslr_early_init(fdt_mapped, chosen); if (kaslr_seed && kaslr_requires_kpti()) arm64_use_ng_mappings = ng_mappings_allowed(); From b868fff5b10b6d09506e93e489ee19166bf6c5d2 Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Wed, 3 Sep 2025 17:52:09 -0700 Subject: [PATCH 1933/3206] arm64: mm: Represent physical memory with phys_addr_t and resource_size_t This is a type-correctness cleanup to MMU/boot code that replaces several instances of void * and u64 with phys_addr_t (to represent addresses) and resource_size_t (to represent sizes) to emphasize that the code in question concerns physical memory specifically. The rationale for this change is to improve clarity and readability in a few modules that handle both types (physical and virtual) of address and differentiation is essential. I have left u64 in cases where the address may be either physical or virtual, where the address is exclusively virtual but used in heavy pointer arithmetic, and in cases I may have overlooked. I do not necessarily consider u64 the ideal type in those situations, but it avoids breaking existing semantics in this cleanup. This patch provably has no effect at runtime: I have verified that .text of vmlinux is identical after this change. 
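As a rough illustration of the convention this cleanup follows (a sketch, not code from this patch):

	phys_addr_t pa = __pa_symbol(_text);                    /* a physical address */
	resource_size_t size = (resource_size_t)(_end - _text); /* the size of a physical range */
	u64 addr = (u64)_text;                                  /* either kind, or pointer-heavy arithmetic */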
Signed-off-by: Sam Edwards Signed-off-by: Will Deacon --- arch/arm64/kernel/pi/map_kernel.c | 26 +++++++++++++------------- arch/arm64/kernel/pi/map_range.c | 20 ++++++++++++-------- arch/arm64/kernel/pi/pi.h | 9 +++++---- arch/arm64/mm/init.c | 6 +++--- arch/arm64/mm/mmu.c | 17 +++++++++-------- 5 files changed, 42 insertions(+), 36 deletions(-) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 5dc4107b5a7f0f..e6d35eff1486b0 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -18,9 +18,9 @@ extern const u8 __eh_frame_start[], __eh_frame_end[]; -extern void idmap_cpu_replace_ttbr1(void *pgdir); +extern void idmap_cpu_replace_ttbr1(phys_addr_t pgdir); -static void __init map_segment(pgd_t *pg_dir, u64 *pgd, u64 va_offset, +static void __init map_segment(pgd_t *pg_dir, phys_addr_t *pgd, u64 va_offset, void *start, void *end, pgprot_t prot, bool may_use_cont, int root_level) { @@ -40,7 +40,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) { bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS); bool twopass = IS_ENABLED(CONFIG_RELOCATABLE); - u64 pgdp = (u64)init_pg_dir + PAGE_SIZE; + phys_addr_t pgdp = (phys_addr_t)init_pg_dir + PAGE_SIZE; pgprot_t text_prot = PAGE_KERNEL_ROX; pgprot_t data_prot = PAGE_KERNEL; pgprot_t prot; @@ -90,7 +90,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) true, root_level); dsb(ishst); - idmap_cpu_replace_ttbr1(init_pg_dir); + idmap_cpu_replace_ttbr1((phys_addr_t)init_pg_dir); if (twopass) { if (IS_ENABLED(CONFIG_RELOCATABLE)) @@ -129,10 +129,10 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) /* Copy the root page table to its final location */ memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE); dsb(ishst); - idmap_cpu_replace_ttbr1(swapper_pg_dir); + idmap_cpu_replace_ttbr1((phys_addr_t)swapper_pg_dir); } -static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) +static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(phys_addr_t ttbr) { u64 sctlr = read_sysreg(sctlr_el1); u64 tcr = read_sysreg(tcr_el1) | TCR_DS; @@ -172,7 +172,7 @@ static void __init remap_idmap_for_lpa2(void) */ create_init_idmap(init_pg_dir, mask); dsb(ishst); - set_ttbr0_for_lpa2((u64)init_pg_dir); + set_ttbr0_for_lpa2((phys_addr_t)init_pg_dir); /* * Recreate the initial ID map with the same granularity as before. 
@@ -185,17 +185,17 @@ static void __init remap_idmap_for_lpa2(void) dsb(ishst); /* switch back to the updated initial ID map */ - set_ttbr0_for_lpa2((u64)init_idmap_pg_dir); + set_ttbr0_for_lpa2((phys_addr_t)init_idmap_pg_dir); /* wipe the temporary ID map from memory */ memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); } -static void *__init map_fdt(u64 fdt) +static void *__init map_fdt(phys_addr_t fdt) { static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); - u64 efdt = fdt + MAX_FDT_SIZE; - u64 ptep = (u64)ptes; + phys_addr_t efdt = fdt + MAX_FDT_SIZE; + phys_addr_t ptep = (phys_addr_t)ptes; /* We're idmapped when called */ /* * Map up to MAX_FDT_SIZE bytes, but avoid overlap with @@ -232,7 +232,7 @@ static bool __init ng_mappings_allowed(void) return true; } -asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) +asmlinkage void __init early_map_kernel(u64 boot_status, phys_addr_t fdt) { static char const chosen_str[] __initconst = "/chosen"; u64 va_base, pa_base = (u64)&_text; @@ -240,7 +240,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) int root_level = 4 - CONFIG_PGTABLE_LEVELS; int va_bits = VA_BITS; int chosen; - void *fdt_mapped = map_fdt((u64)fdt); + void *fdt_mapped = map_fdt(fdt); /* Clear BSS and the initial page tables */ memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 7982788e7b9aaa..de52cd85c69193 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -26,8 +26,9 @@ * @va_offset: Offset between a physical page and its current mapping * in the VA space */ -void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset) +void __init map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset) { u64 cmask = (level == 3) ? 
CONT_PTE_SIZE - 1 : U64_MAX; ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; @@ -87,19 +88,22 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, } } -asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) +asmlinkage phys_addr_t __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) { - u64 ptep = (u64)pg_dir + PAGE_SIZE; + phys_addr_t ptep = (phys_addr_t)pg_dir + PAGE_SIZE; /* MMU is off */ pgprot_t text_prot = PAGE_KERNEL_ROX; pgprot_t data_prot = PAGE_KERNEL; pgprot_val(text_prot) &= ~clrmask; pgprot_val(data_prot) &= ~clrmask; - map_range(&ptep, (u64)_stext, (u64)__initdata_begin, (u64)_stext, - text_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); - map_range(&ptep, (u64)__initdata_begin, (u64)_end, (u64)__initdata_begin, - data_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + /* MMU is off; pointer casts to phys_addr_t are safe */ + map_range(&ptep, (u64)_stext, (u64)__initdata_begin, + (phys_addr_t)_stext, text_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); + map_range(&ptep, (u64)__initdata_begin, (u64)_end, + (phys_addr_t)__initdata_begin, data_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); return ptep; } diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index 46cafee7829f48..08ef9f80456bcc 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -29,9 +29,10 @@ u64 kaslr_early_init(void *fdt, int chosen); void relocate_kernel(u64 offset); int scs_patch(const u8 eh_frame[], int size); -void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset); +void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset); -asmlinkage void early_map_kernel(u64 boot_status, void *fdt); +asmlinkage void early_map_kernel(u64 boot_status, phys_addr_t fdt); -asmlinkage u64 create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); +asmlinkage phys_addr_t create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index ea84a61ed50848..70c2ca813c1856 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -243,7 +243,7 @@ void __init arm64_memblock_init(void) */ if (memory_limit != PHYS_ADDR_MAX) { memblock_mem_limit_remove_map(memory_limit); - memblock_add(__pa_symbol(_text), (u64)(_end - _text)); + memblock_add(__pa_symbol(_text), (resource_size_t)(_end - _text)); } if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { @@ -252,8 +252,8 @@ void __init arm64_memblock_init(void) * initrd to become inaccessible via the linear mapping. 
* Otherwise, this is a no-op */ - u64 base = phys_initrd_start & PAGE_MASK; - u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; + phys_addr_t base = phys_initrd_start & PAGE_MASK; + resource_size_t size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; /* * We can only add back the initrd memory if we don't end up diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 34e5d78af076d0..de463040582c49 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -794,17 +794,18 @@ static void __init declare_kernel_vmas(void) declare_vma(&vmlinux_seg[4], _data, _end, 0); } -void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset); +void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset); static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; static void __init create_idmap(void) { - u64 start = __pa_symbol(__idmap_text_start); - u64 end = __pa_symbol(__idmap_text_end); - u64 ptep = __pa_symbol(idmap_ptes); + phys_addr_t start = __pa_symbol(__idmap_text_start); + phys_addr_t end = __pa_symbol(__idmap_text_end); + phys_addr_t ptep = __pa_symbol(idmap_ptes); __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX, IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, @@ -812,7 +813,7 @@ static void __init create_idmap(void) if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) { extern u32 __idmap_kpti_flag; - u64 pa = __pa_symbol(&__idmap_kpti_flag); + phys_addr_t pa = __pa_symbol(&__idmap_kpti_flag); /* * The KPTI G-to-nG conversion code needs a read-write mapping @@ -1331,8 +1332,8 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) struct range arch_get_mappable_range(void) { struct range mhp_range; - u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); - u64 end_linear_pa = __pa(PAGE_END - 1); + phys_addr_t start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); + phys_addr_t end_linear_pa = __pa(PAGE_END - 1); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { /* From c0f303d7d4723b01c686e949e6f26a93e5cda910 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 9 Sep 2025 11:32:35 +0800 Subject: [PATCH 1934/3206] arm64: mm: Rework the 'rodata=' options As per the admin guide documentation, "rodata=on" should be the default on all platforms. Documentation/admin-guide/kernel-parameters.txt describes these options as rodata= [KNL,EARLY] on Mark read-only kernel memory as read-only (default). off Leave read-only kernel memory writable for debugging. full Mark read-only kernel memory and aliases as read-only [arm64] But on the arm64 platform, RODATA_FULL_DEFAULT_ENABLED is enabled by default, so "rodata=full" is the default instead. For parity with other architectures, namely x86, rework 'rodata=on' to match the current "full" behaviour and replace 'rodata=full' with a new 'rodata=noalias' option which retains writable aliases in the direct map for memory regions outside of the kernel image.
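With this change, the three behaviours map onto the kernel command line as follows (an illustrative summary of the options described above):

	rodata=on        # kernel memory and its direct-map aliases read-only (default)
	rodata=noalias   # kernel memory read-only, writable direct-map aliases retained [arm64]
	rodata=off       # read-only kernel memory left writable, for debugging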
Signed-off-by: Huang Shijie Reviewed-by: Anshuman Khandual Signed-off-by: Will Deacon --- Documentation/admin-guide/kernel-parameters.txt | 5 +++-- arch/arm64/include/asm/setup.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 747a55abf4946b..fe99652c584ec8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6405,8 +6405,9 @@ rodata= [KNL,EARLY] on Mark read-only kernel memory as read-only (default). off Leave read-only kernel memory writable for debugging. - full Mark read-only kernel memory and aliases as read-only - [arm64] + noalias Mark read-only kernel memory as read-only but retain + writable aliases in the direct map for regions outside + of the kernel image. [arm64] rockchip.usb_uart [EARLY] diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h index ba269a7a320134..3d96dde4d214cb 100644 --- a/arch/arm64/include/asm/setup.h +++ b/arch/arm64/include/asm/setup.h @@ -21,7 +21,7 @@ static inline bool arch_parse_debug_rodata(char *arg) if (!arg) return false; - if (!strcmp(arg, "full")) { + if (!strcmp(arg, "on")) { rodata_enabled = rodata_full = true; return true; } @@ -31,7 +31,7 @@ static inline bool arch_parse_debug_rodata(char *arg) return true; } - if (!strcmp(arg, "on")) { + if (!strcmp(arg, "noalias")) { rodata_enabled = true; rodata_full = false; return true; From bfbbb0d3215f0f6ef622cc8066e5f6afda6960a2 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 9 Sep 2025 11:32:36 +0800 Subject: [PATCH 1935/3206] arm64/Kconfig: Remove CONFIG_RODATA_FULL_DEFAULT_ENABLED Now that 'rodata=full' has been removed in favour of parity with x86, CONFIG_RODATA_FULL_DEFAULT_ENABLED no longer serves a useful purpose. Remove it. Reviewed-by: Anshuman Khandual Signed-off-by: Huang Shijie Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 14 -------------- arch/arm64/mm/pageattr.c | 2 +- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64d..8b3c23ee7a6463 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1698,20 +1698,6 @@ config MITIGATE_SPECTRE_BRANCH_HISTORY When taking an exception from user-space, a sequence of branches or a firmware call overwrites the branch history. -config RODATA_FULL_DEFAULT_ENABLED - bool "Apply r/o permissions of VM areas also to their linear aliases" - default y - help - Apply read-only attributes of VM areas to the linear alias of - the backing pages as well. This prevents code or read-only data - from being modified (inadvertently or intentionally) via another - mapping of the same memory page. This additional enhancement can - be turned off at runtime by passing rodata=[off|on] (and turned on - with rodata=full if this option is set to 'n') - - This requires the linear region to be mapped down to pages, - which may adversely affect performance in some cases. 
- config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" depends on !KCSAN diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 04d4a8f676db42..667aff1efe4928 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -20,7 +20,7 @@ struct page_change_data { pgprot_t clear_mask; }; -bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED); +bool rodata_full __ro_after_init = true; bool can_set_direct_map(void) { From 19dd484cd19c308e2ea763f8d31e43a7f1ab6141 Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Fri, 12 Sep 2025 20:09:05 -0400 Subject: [PATCH 1936/3206] arm64/fpsimd: simplify sme_setup() The function checks info->vq_map for emptiness right before calling find_last_bit(). We can use the find_last_bit() output and save on bitmap_empty() call, which is O(N). Signed-off-by: Yury Norov (NVIDIA) Signed-off-by: Will Deacon --- arch/arm64/kernel/fpsimd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index c37f02d7194e0b..e3f8f51748bc94 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1265,6 +1265,8 @@ void __init sme_setup(void) if (!system_supports_sme()) return; + min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); + /* * SME doesn't require any particular vector length be * supported but it does require at least one. We should have @@ -1272,9 +1274,8 @@ void __init sme_setup(void) * let's double check here. The bitmap is SVE_VQ_MAP sized for * sharing with SVE. */ - WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX)); + WARN_ON(min_bit >= SVE_VQ_MAX); - min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit)); max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX); From 0b47b6c3543efd65f2e620e359b05f4938314fbd Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 12 Sep 2025 18:14:38 +0200 Subject: [PATCH 1937/3206] Revert "sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scx_bpf_reenqueue_local() can be called from ops.cpu_release() when a CPU is taken by a higher scheduling class to give tasks queued to the CPU's local DSQ a chance to be migrated somewhere else, instead of waiting indefinitely for that CPU to become available again. In doing so, we decided to skip migration-disabled tasks, under the assumption that they cannot be migrated anyway. However, when a higher scheduling class preempts a CPU, the running task is always inserted at the head of the local DSQ as a migration-disabled task. This means it is always skipped by scx_bpf_reenqueue_local(), and ends up being confined to the same CPU even if that CPU is heavily contended by other higher scheduling class tasks. As an example, let's consider the following scenario: $ schedtool -a 0,1, -e yes > /dev/null $ sudo schedtool -F -p 99 -a 0, -e \ stress-ng -c 1 --cpu-load 99 --cpu-load-slice 1000 The first task (SCHED_EXT) can run on CPU0 or CPU1. The second task (SCHED_FIFO) is pinned to CPU0 and consumes ~99% of it. 
If the SCHED_EXT task initially runs on CPU0, it will remain there because it always sees CPU0 as "idle" in the short gaps left by the RT task, resulting in ~1% utilization while CPU1 stays idle: 0[||||||||||||||||||||||100.0%] 8[ 0.0%] 1[ 0.0%] 9[ 0.0%] 2[ 0.0%] 10[ 0.0%] 3[ 0.0%] 11[ 0.0%] 4[ 0.0%] 12[ 0.0%] 5[ 0.0%] 13[ 0.0%] 6[ 0.0%] 14[ 0.0%] 7[ 0.0%] 15[ 0.0%] PID USER PRI NI S CPU CPU%▽MEM% TIME+ Command 1067 root RT 0 R 0 99.0 0.2 0:31.16 stress-ng-cpu [run] 975 arighi 20 0 R 0 1.0 0.0 0:26.32 yes By allowing scx_bpf_reenqueue_local() to re-enqueue migration-disabled tasks, the scheduler can choose to migrate them to other CPUs (CPU1 in this case) via ops.enqueue(), leading to better CPU utilization: 0[||||||||||||||||||||||100.0%] 8[ 0.0%] 1[||||||||||||||||||||||100.0%] 9[ 0.0%] 2[ 0.0%] 10[ 0.0%] 3[ 0.0%] 11[ 0.0%] 4[ 0.0%] 12[ 0.0%] 5[ 0.0%] 13[ 0.0%] 6[ 0.0%] 14[ 0.0%] 7[ 0.0%] 15[ 0.0%] PID USER PRI NI S CPU CPU%▽MEM% TIME+ Command 577 root RT 0 R 0 100.0 0.2 0:23.17 stress-ng-cpu [run] 555 arighi 20 0 R 1 100.0 0.0 0:28.67 yes It's debatable whether per-CPU tasks should be re-enqueued as well, but doing so is probably safer: the scheduler can recognize re-enqueued tasks through the %SCX_ENQ_REENQ flag, reassess their placement, and either put them back at the head of the local DSQ or let another task attempt to take the CPU. This also prevents giving per-CPU tasks an implicit priority boost, which would otherwise make them more likely to reclaim CPUs preempted by higher scheduling classes. Fixes: 97e13ecb02668 ("sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()") Cc: stable@vger.kernel.org # v6.15+ Signed-off-by: Andrea Righi Acked-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4ae32ef179dd0f..088ceff38c8a47 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6788,12 +6788,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. - * - * Also skip re-enqueueing tasks that can only run on this - * CPU, as they would just be re-added to the same local - * DSQ without any benefit. */ - if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) + if (p->migration_pending) continue; dispatch_dequeue(rq, p); From 5020d05b3476f3561377a4ab076d42fda00e3607 Mon Sep 17 00:00:00 2001 From: Huisong Li Date: Thu, 11 Sep 2025 19:24:06 +0800 Subject: [PATCH 1938/3206] ACPI: processor: Remove unused empty stubs of some functions Empty stubs are defined in processor.h for some functions provided by the ACPI processor idle driver, but those functions are only used in the main ACPI processor driver which requires the ACPI processor idle driver to be present (selecting CONFIG_ACPI_PROCESSOR causes CONFIG_ACPI_PROCESSOR_IDLE to be selected too automatically). This means that the empty stubs in question are not really necessary and if both CONFIG_ACPI_PROCESSOR and CONFIG_ACPI_PROCESSOR_IDLE are unset, the compiler complains that they are defined, but not used. Drop them to get rid of the compiler warning. Signed-off-by: Huisong Li Link: https://patch.msgid.link/20250911112408.1668431-2-lihuisong@huawei.com [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. 
Wysocki --- include/acpi/processor.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/include/acpi/processor.h b/include/acpi/processor.h index ff864c1cee3a4c..2976a6d0c54f79 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -425,26 +425,6 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); void acpi_processor_register_idle_driver(void); void acpi_processor_unregister_idle_driver(void); -#else -static inline int acpi_processor_power_init(struct acpi_processor *pr) -{ - return -ENODEV; -} - -static inline int acpi_processor_power_exit(struct acpi_processor *pr) -{ - return -ENODEV; -} - -static inline int acpi_processor_power_state_has_changed(struct acpi_processor *pr) -{ - return -ENODEV; -} - -static inline int acpi_processor_hotplug(struct acpi_processor *pr) -{ - return -ENODEV; -} #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ /* in processor_thermal.c */ From 9cd2a7f1180f9b6fe5214abc90eaf5c053f545ee Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:17 -0500 Subject: [PATCH 1939/3206] arm64: uaccess: Add additional userspace GCS accessors Uprobes need more advanced read, push, and pop userspace GCS functionality. Implement those features using the existing gcsstr() and copy_from_user(). It's important to note that GCS pages can be read by normal instructions, but the hardware validates that pages used by GCS-specific operations have the GCS privilege set. We aren't validating this in load_user_gcs because it requires stabilizing the VMA over the read, which may fault. Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas Reviewed-by: Mark Brown [will: Add '__force' to gcspr cast in pop_user_gcs()] Signed-off-by: Will Deacon --- arch/arm64/include/asm/gcs.h | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 10c68d3e6e3073..8fa0707069e8b9 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -116,6 +116,47 @@ static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, uaccess_ttbr0_disable(); } +static inline void push_user_gcs(unsigned long val, int *err) +{ + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + gcspr -= sizeof(u64); + put_user_gcs(val, (unsigned long __user *)gcspr, err); + if (!*err) + write_sysreg_s(gcspr, SYS_GCSPR_EL0); +} + +/* + * Unlike put/push_user_gcs() above, get/pop_user_gcs() doesn't + * validate the GCS permission is set on the page being read. This + * differs from how the hardware works when it consumes data stored at + * GCSPR. Callers should ensure this is acceptable.
+ */ +static inline u64 get_user_gcs(unsigned long __user *addr, int *err) +{ + unsigned long ret; + u64 load = 0; + + /* Ensure previous GCS operations are visible before we read the page */ + gcsb_dsync(); + ret = copy_from_user(&load, addr, sizeof(load)); + if (ret != 0) + *err = ret; + return load; +} + +static inline u64 pop_user_gcs(int *err) +{ + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + u64 read_val; + + read_val = get_user_gcs((__force unsigned long __user *)gcspr, err); + if (!*err) + write_sysreg_s(gcspr + sizeof(u64), SYS_GCSPR_EL0); + + return read_val; +} + #else static inline bool task_gcs_el0_enabled(struct task_struct *task) @@ -126,6 +167,10 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) static inline void gcs_set_el0_mode(struct task_struct *task) { } static inline void gcs_free(struct task_struct *task) { } static inline void gcs_preserve_current_state(void) { } +static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, + int *err) { } +static inline void push_user_gcs(unsigned long val, int *err) { } + static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, const struct kernel_clone_args *args) { @@ -136,6 +181,15 @@ static inline int gcs_check_locked(struct task_struct *task, { return 0; } +static inline u64 get_user_gcs(unsigned long __user *addr, int *err) +{ + *err = -EFAULT; + return 0; +} +static inline u64 pop_user_gcs(int *err) +{ + return 0; +} #endif From efb07ac534e24e22a7eb32815fb50f69931cdeae Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:18 -0500 Subject: [PATCH 1940/3206] arm64: probes: Add GCS support to bl/blr/ret The arm64 probe simulation doesn't currently have logic in place to deal with GCS, and this results in core dumps if probes are inserted at control flow locations. Fix up bl, blr and ret to manipulate the shadow stack as needed. While we manipulate and validate the shadow stack correctly, the hardware provides additional security by only allowing GCS operations against pages which are marked to support GCS. For writing there is gcssttr(), which enforces this, but there isn't an equivalent for reading. This means that uprobe users should be aware that probing on control flow instructions which require reading the shadow stack (e.g. ret) offers lower security guarantees than what is achieved without the uprobe active.
Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/simulate-insn.c | 44 +++++++++++++++++++----- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 09a0b36122d010..97ed4db7541796 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -13,6 +13,7 @@ #include #include "simulate-insn.h" +#include "asm/gcs.h" #define bbl_displacement(insn) \ sign_extend32(((insn) & 0x3ffffff) << 2, 27) @@ -49,6 +50,21 @@ static inline u32 get_w_reg(struct pt_regs *regs, int reg) return lower_32_bits(pt_regs_read_reg(regs, reg)); } +static inline int update_lr(struct pt_regs *regs, long addr) +{ + int err = 0; + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + push_user_gcs(addr, &err); + if (err) { + force_sig(SIGSEGV); + return err; + } + } + procedure_link_pointer_set(regs, addr); + return err; +} + static bool __kprobes check_cbz(u32 opcode, struct pt_regs *regs) { int xn = opcode & 0x1f; @@ -107,9 +123,9 @@ simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs) { int disp = bbl_displacement(opcode); - /* Link register is x30 */ if (opcode & (1 << 31)) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; instruction_pointer_set(regs, addr + disp); } @@ -129,21 +145,31 @@ void __kprobes simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) { int xn = (opcode >> 5) & 0x1f; + int b_target = get_x_reg(regs, xn); - /* update pc first in case we're doing a "blr lr" */ - instruction_pointer_set(regs, get_x_reg(regs, xn)); - - /* Link register is x30 */ if (((opcode >> 21) & 0x3) == 1) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; + + instruction_pointer_set(regs, b_target); } void __kprobes simulate_ret(u32 opcode, long addr, struct pt_regs *regs) { + u64 ret_addr; + int err = 0; int xn = (opcode >> 5) & 0x1f; - - instruction_pointer_set(regs, get_x_reg(regs, xn)); + unsigned long r_target = get_x_reg(regs, xn); + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + ret_addr = pop_user_gcs(&err); + if (err || ret_addr != r_target) { + force_sig(SIGSEGV); + return; + } + } + instruction_pointer_set(regs, r_target); } void __kprobes From dadb3ebcf395ebee3626d88ac7e5e234f15bae2c Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Sun, 14 Sep 2025 15:44:26 +0200 Subject: [PATCH 1941/3206] workqueue: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (a per-CPU wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This patch adds a new WQ_PERCPU flag to explicitly request the use of the per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls.
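For example, a caller that relies on per-CPU execution now requests it explicitly, while unbound users are unchanged (an illustrative sketch; "my_wq" is a made-up name):

	/* explicitly per-CPU, matching the old implicit default */
	struct workqueue_struct *wq = alloc_workqueue("my_wq", WQ_PERCPU, 0);

	/* explicitly unbound, as before */
	struct workqueue_struct *uwq = alloc_workqueue("my_wq", WQ_UNBOUND, 0);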
Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 ++-- kernel/workqueue.c | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b6834b7aee4bce..71a9900c03c706 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -410,7 +410,7 @@ enum wq_flags { __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ /* BH wq only allows the following flags */ - __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, + __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI | WQ_PERCPU, }; enum wq_consts { @@ -570,7 +570,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ - alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 90db8cf015c2fb..45320e27a16c4a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7828,22 +7828,22 @@ void __init workqueue_init_early(void) ordered_wq_attrs[i] = attrs; } - system_wq = alloc_workqueue("events", 0, 0); - system_percpu_wq = alloc_workqueue("events", 0, 0); - system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); - system_long_wq = alloc_workqueue("events_long", 0, 0); + system_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_highpri_wq = alloc_workqueue("events_highpri", + WQ_HIGHPRI | WQ_PERCPU, 0); + system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", - WQ_FREEZABLE, 0); + WQ_FREEZABLE | WQ_PERCPU, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", - WQ_POWER_EFFICIENT, 0); + WQ_POWER_EFFICIENT | WQ_PERCPU, 0); system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", - WQ_FREEZABLE | WQ_POWER_EFFICIENT, - 0); - system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0); + WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0); + system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", - WQ_BH | WQ_HIGHPRI, 0); + WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || From 4a601714bb24926507b2051c4a95f07e0e142004 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:19 -0500 Subject: [PATCH 1942/3206] arm64: uprobes: Add GCS support to uretprobes Ret probes work by changing the value in the link register at the probe location to return to the probe rather than the calling routine. 
Thus the GCS needs to be updated with this address as well. Since it's possible to insert probes at locations where the current value of the LR doesn't match the GCS state, this needs to be detected and handled in order to maintain the existing no-fault behavior. Co-developed-by: Steve Capper Signed-off-by: Steve Capper Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas [will: Add '__force' to gcspr casts in arch_uretprobe_hijack_return_addr()] Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/uprobes.c | 33 ++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 1f91fd2a818798..2799bdb2fb820b 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "decode-insn.h" @@ -159,11 +160,43 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { unsigned long orig_ret_vaddr; + unsigned long gcs_ret_vaddr; + int err = 0; + u64 gcspr; orig_ret_vaddr = procedure_link_pointer(regs); + + if (task_gcs_el0_enabled(current)) { + gcspr = read_sysreg_s(SYS_GCSPR_EL0); + gcs_ret_vaddr = get_user_gcs((__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + + /* + * If the LR and GCS return addr don't match, then some kind of PAC + * signing or control flow occurred since entering the probed function. + * Likely because the user is attempting to retprobe on an instruction + * that isn't a function boundary or inside a leaf function. Explicitly + * abort this retprobe because it will generate a GCS exception. + */ + if (gcs_ret_vaddr != orig_ret_vaddr) { + orig_ret_vaddr = -1; + goto out; + } + + put_user_gcs(trampoline_vaddr, (__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + } + /* Replace the return addr with trampoline addr */ procedure_link_pointer_set(regs, trampoline_vaddr); +out: return orig_ret_vaddr; } From cc66c711e58f5dd29da79c24ca699a0312e012e3 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:20 -0500 Subject: [PATCH 1943/3206] arm64: Kconfig: Remove GCS restrictions on UPROBES Now that the uprobe paths have been made GCS compatible, drop the Kconfig restriction. Signed-off-by: Jeremy Linton Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64d..c61572bbe59b7e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2225,7 +2225,6 @@ config ARM64_GCS default y select ARCH_HAS_USER_SHADOW_STACK select ARCH_USES_HIGH_VMA_FLAGS - depends on !UPROBES help Guarded Control Stack (GCS) provides support for a separate stack with restricted access which contains only return From ba1afc94deb849eab843a372b969444581add2c9 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:21 -0500 Subject: [PATCH 1944/3206] uprobes: uprobe_warn should use passed task uprobe_warn() is passed a task structure, yet it's using current. For the most part this shouldn't matter, but since a task structure is provided, let's use it.
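For context, the main path where the passed task differs from current is fork: uprobe_copy_process() runs in the parent (current) while setting up the child (t), so the warning should name the child. A condensed, illustrative sketch of that path (not the verbatim kernel source; the exact message string may differ):

void uprobe_copy_process(struct task_struct *t, unsigned long flags)
{
	/* current is the forking parent; t is the new child task */
	if (dup_utask(t, current->utask))
		return uprobe_warn(t, "dup return instances");
}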
Fixes: 248d3a7b2f10 ("uprobes: Change uprobe_copy_process() to dup return_instances") Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Signed-off-by: Will Deacon --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd83..4b97d16f731c1a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -121,7 +121,7 @@ struct xol_area { static void uprobe_warn(struct task_struct *t, const char *msg) { - pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); + pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg); } /* From 84bf1ac85af84d354c7a2fdbdc0d4efc8aaec34b Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 25 Aug 2025 16:00:14 -0700 Subject: [PATCH 1945/3206] ice: fix Rx page leak on multi-buffer frames The ice_put_rx_mbuf() function handles calling ice_put_rx_buf() for each buffer in the current frame. This function was introduced as part of handling multi-buffer XDP support in the ice driver. It works by iterating over the buffers from first_desc up to 1 plus the total number of fragments in the frame, cached from before the XDP program was executed. If the hardware posts a descriptor with a size of 0, the logic used in ice_put_rx_mbuf() breaks. Such descriptors get skipped and don't get added as fragments in ice_add_xdp_frag(). Since the buffer isn't counted as a fragment, we do not iterate over it in ice_put_rx_mbuf(), and thus we don't call ice_put_rx_buf(). Because we don't call ice_put_rx_buf(), we don't attempt to re-use the page or free it. This leaves a stale page in the ring, as we don't increment next_to_alloc. The ice_reuse_rx_page() function assumes that the next_to_alloc has been incremented properly, and that it always points to a buffer with a NULL page. Since this function doesn't check, it will happily recycle a page over the top of the next_to_alloc buffer, losing track of the old page. Note that this leak only occurs for multi-buffer frames. The ice_put_rx_mbuf() function always handles at least one buffer, so a single-buffer frame will always get handled correctly. It is not clear precisely why the hardware hands us descriptors with a size of 0 sometimes, but it happens somewhat regularly with "jumbo frames" used by 9K MTU. To fix ice_put_rx_mbuf(), we need to make sure to call ice_put_rx_buf() on all buffers between first_desc and next_to_clean. Borrow the logic of a similar function in i40e used for this same purpose. Use the same logic also in ice_get_pgcnts(). Instead of iterating over just the number of fragments, use a loop which iterates until the current index reaches the next_to_clean element just past the current frame. Unlike i40e, the ice_put_rx_mbuf() function does call ice_put_rx_buf() on the last buffer of the frame, indicating the end of packet. For non-linear (multi-buffer) frames, we need to take care when adjusting the pagecnt_bias. An XDP program might release fragments from the tail of the frame, in which case that fragment page is already released. Only update the pagecnt_bias for the first descriptor and fragments still remaining post-XDP program. Take care to only access the shared info for fragmented buffers, as this avoids a significant cache miss. The xdp_xmit value only needs to be updated if an XDP program is run, and only once per packet. Drop the xdp_xmit pointer argument from ice_put_rx_mbuf().
Instead, set xdp_xmit in the ice_clean_rx_irq() function directly. This avoids needing to pass the argument and avoids an extra bit-wise OR for each buffer in the frame. Move the increment of the ntc local variable to ensure it's updated *before* all calls to ice_get_pgcnts() or ice_put_rx_mbuf(), as the loop logic requires the index of the element just after the current frame. Now that we use an index pointer in the ring to identify the packet, we no longer need to track or cache the number of fragments in the rx_ring. Cc: Christoph Petrausch Cc: Jesper Dangaard Brouer Reported-by: Jaroslav Pulchart Closes: https://lore.kernel.org/netdev/CAK8fFZ4hY6GUJNENz3wY9jaYLZXGfpr7dnZxzGMYoE44caRbgw@mail.gmail.com/ Fixes: 743bbd93cf29 ("ice: put Rx buffers after being done with current frame") Tested-by: Michal Kubiak Signed-off-by: Jacob Keller Acked-by: Jesper Dangaard Brouer Tested-by: Priya Singh Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 80 ++++++++++------------- drivers/net/ethernet/intel/ice/ice_txrx.h | 1 - 2 files changed, 34 insertions(+), 47 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index d2871757ec9401..41e7e29879a309 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -894,10 +894,6 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page, rx_buf->page_offset, size); sinfo->xdp_frags_size += size; - /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail() * can pop off frags but driver has to handle it on its own - */ - rx_ring->nr_frags = sinfo->nr_frags; if (page_is_pfmemalloc(rx_buf->page)) xdp_buff_set_frag_pfmemalloc(xdp); @@ -968,20 +964,20 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size, /** * ice_get_pgcnts - grab page_count() for gathered fragments * @rx_ring: Rx descriptor ring to store the page counts on + * @ntc: the next to clean element (not included in this frame!) * * This function is intended to be called right before running XDP * program so that the page recycling mechanism will be able to take * a correct decision regarding underlying pages; this is done in such * way as XDP program can change the refcount of page */ -static void ice_get_pgcnts(struct ice_rx_ring *rx_ring) +static void ice_get_pgcnts(struct ice_rx_ring *rx_ring, unsigned int ntc) { - u32 nr_frags = rx_ring->nr_frags + 1; u32 idx = rx_ring->first_desc; struct ice_rx_buf *rx_buf; u32 cnt = rx_ring->count; - for (int i = 0; i < nr_frags; i++) { + while (idx != ntc) { rx_buf = &rx_ring->rx_buf[idx]; rx_buf->pgcnt = page_count(rx_buf->page); @@ -1154,62 +1150,51 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf) } /** - * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all frame frags + * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all buffers in frame * @rx_ring: Rx ring with all the auxiliary data * @xdp: XDP buffer carrying linear + frags part - * @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage - * @ntc: a current next_to_clean value to be stored at rx_ring + * @ntc: the next to clean element (not included in this frame!)
* @verdict: return code from XDP program execution * - * Walk through gathered fragments and satisfy internal page - * recycle mechanism; we take here an action related to verdict - * returned by XDP program; + * Called after XDP program is completed, or on error with verdict set to + * ICE_XDP_CONSUMED. + * + * Walk through buffers from first_desc to the end of the frame, releasing + * buffers and satisfying internal page recycle mechanism. The action depends + * on verdict from XDP program. */ static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, - u32 *xdp_xmit, u32 ntc, u32 verdict) + u32 ntc, u32 verdict) { - u32 nr_frags = rx_ring->nr_frags + 1; u32 idx = rx_ring->first_desc; u32 cnt = rx_ring->count; - u32 post_xdp_frags = 1; struct ice_rx_buf *buf; - int i; + u32 xdp_frags = 0; + int i = 0; if (unlikely(xdp_buff_has_frags(xdp))) - post_xdp_frags += xdp_get_shared_info_from_buff(xdp)->nr_frags; + xdp_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; - for (i = 0; i < post_xdp_frags; i++) { + while (idx != ntc) { buf = &rx_ring->rx_buf[idx]; + if (++idx == cnt) + idx = 0; - if (verdict & (ICE_XDP_TX | ICE_XDP_REDIR)) { + /* An XDP program could release fragments from the end of the + * buffer. For these, we need to keep the pagecnt_bias as-is. + * To do this, only adjust pagecnt_bias for fragments up to + * the total remaining after the XDP program has run. + */ + if (verdict != ICE_XDP_CONSUMED) ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - *xdp_xmit |= verdict; - } else if (verdict & ICE_XDP_CONSUMED) { + else if (i++ <= xdp_frags) buf->pagecnt_bias++; - } else if (verdict == ICE_XDP_PASS) { - ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - } ice_put_rx_buf(rx_ring, buf); - - if (++idx == cnt) - idx = 0; - } - /* handle buffers that represented frags released by XDP prog; - * for these we keep pagecnt_bias as-is; refcount from struct page - * has been decremented within XDP prog and we do not have to increase - * the biased refcnt - */ - for (; i < nr_frags; i++) { - buf = &rx_ring->rx_buf[idx]; - ice_put_rx_buf(rx_ring, buf); - if (++idx == cnt) - idx = 0; } xdp->data = NULL; rx_ring->first_desc = ntc; - rx_ring->nr_frags = 0; } /** @@ -1317,6 +1302,10 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) /* retrieve a buffer from the ring */ rx_buf = ice_get_rx_buf(rx_ring, size, ntc); + /* Increment ntc before calls to ice_put_rx_mbuf() */ + if (++ntc == cnt) + ntc = 0; + if (!xdp->data) { void *hard_start; @@ -1325,24 +1314,23 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) xdp_prepare_buff(xdp, hard_start, offset, size, !!offset); xdp_buff_clear_frags_flag(xdp); } else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) { - ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc, ICE_XDP_CONSUMED); + ice_put_rx_mbuf(rx_ring, xdp, ntc, ICE_XDP_CONSUMED); break; } - if (++ntc == cnt) - ntc = 0; /* skip if it is NOP desc */ if (ice_is_non_eop(rx_ring, rx_desc)) continue; - ice_get_pgcnts(rx_ring); + ice_get_pgcnts(rx_ring, ntc); xdp_verdict = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc); if (xdp_verdict == ICE_XDP_PASS) goto construct_skb; total_rx_bytes += xdp_get_buff_len(xdp); total_rx_pkts++; - ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); + ice_put_rx_mbuf(rx_ring, xdp, ntc, xdp_verdict); + xdp_xmit |= xdp_verdict & (ICE_XDP_TX | ICE_XDP_REDIR); continue; construct_skb: @@ -1355,7 +1343,7 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) 
rx_ring->ring_stats->rx_stats.alloc_buf_failed++; xdp_verdict = ICE_XDP_CONSUMED; } - ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); + ice_put_rx_mbuf(rx_ring, xdp, ntc, xdp_verdict); if (!skb) break; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index fef750c5f288f3..2fd8e78178a271 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -358,7 +358,6 @@ struct ice_rx_ring { struct ice_tx_ring *xdp_ring; struct ice_rx_ring *next; /* pointer to next ring in q_vector */ struct xsk_buff_pool *xsk_pool; - u32 nr_frags; u16 max_frame; u16 rx_buf_len; dma_addr_t dma; /* physical address of ring */ From e37084a26070c546ae7961ee135bbfb15fbe13fd Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 22 Aug 2025 17:16:17 +0200 Subject: [PATCH 1946/3206] i40e: remove redundant memory barrier when cleaning Tx descs i40e has a feature (head write-back) which writes the position of the last successfully sent descriptor to a memory location. The memory barrier in i40e_clean_tx_irq() was used to avoid forward-reading descriptor fields in case the DD bit was not set. Having the mentioned feature in place implies that such a situation will not happen, as we know in advance how many descriptors HW has dealt with. Besides, this barrier placement was wrong. The idea is to have this protection *after* reading the DD bit from the HW descriptor, not before. Digging through git history showed that the barrier was indeed before the DD bit check; in any case, the commit introducing i40e_get_head() should have wiped it out altogether. Also, there was one commit doing s/read_barrier_depends/smp_rmb when the get-head feature was already in place, but it was only theoretical, based on ixgbe experience; ixgbe is different in this respect, as that driver has to read the DD bit from the HW descriptor. Fixes: 1943d8ba9507 ("i40e/i40evf: enable hardware feature head write back") Signed-off-by: Maciej Fijalkowski Reviewed-by: Aleksandr Loktionov Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 048c3303913094..b194eae032084d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -948,9 +948,6 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi, if (!eop_desc) break; - /* prevent any other reads prior to eop_desc */ - smp_rmb(); - i40e_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf); /* we have caught up to head, no work left to do */ if (tx_head == tx_desc) From b85936e95a4bd2a07e134af71e2c0750a69d2b8b Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 8 Sep 2025 13:26:28 +0200 Subject: [PATCH 1947/3206] ixgbe: initialize aci.lock before it's used Currently aci.lock is initialized too late. A bunch of ACI callbacks using the lock are called before it is initialized. Commit 337369f8ce9e ("locking/mutex: Add MUTEX_WARN_ON() into fast path") highlights that issue, which results in a call trace. [ 4.092899] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [ 4.092910] WARNING: CPU: 0 PID: 578 at kernel/locking/mutex.c:154 mutex_lock+0x6d/0x80 [ 4.098757] Call Trace: [ 4.098847] [ 4.098922] ixgbe_aci_send_cmd+0x8c/0x1e0 [ixgbe] [ 4.099108] ? hrtimer_try_to_cancel+0x18/0x110 [ 4.099277] ixgbe_aci_get_fw_ver+0x52/0xa0 [ixgbe] [ 4.099460] ixgbe_check_fw_error+0x1fc/0x2f0 [ixgbe] [ 4.099650] ? usleep_range_state+0x69/0xd0 [ 4.099811] ?
usleep_range_state+0x8c/0xd0 [ 4.099964] ixgbe_probe+0x3b0/0x12d0 [ixgbe] [ 4.100132] local_pci_probe+0x43/0xa0 [ 4.100267] work_for_cpu_fn+0x13/0x20 [ 4.101647] Move aci.lock mutex initialization to ixgbe_sw_init() before any ACI command is sent. Along with that, also move the related SWFW semaphore initialization in order to reduce the size of ixgbe_probe(); that way all locks are initialized in ixgbe_sw_init(). Reviewed-by: Michal Swiatkowski Reviewed-by: Przemek Kitszel Reviewed-by: Simon Horman Reviewed-by: Aleksandr Loktionov Fixes: 4600cdf9f5ac ("ixgbe: Enable link management in E610 device") Signed-off-by: Jedrzej Jagielski Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 80e6a2ef1350e3..9a6a67a6d64464 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6973,6 +6973,13 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter, break; } + /* Make sure the SWFW semaphore is in a valid state */ + if (hw->mac.ops.init_swfw_sync) + hw->mac.ops.init_swfw_sync(hw); + + if (hw->mac.type == ixgbe_mac_e610) + mutex_init(&hw->aci.lock); + #ifdef IXGBE_FCOE /* FCoE support exists, always init the FCoE lock */ spin_lock_init(&adapter->fcoe.lock); @@ -11643,10 +11650,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_sw_init; - /* Make sure the SWFW semaphore is in a valid state */ - if (hw->mac.ops.init_swfw_sync) - hw->mac.ops.init_swfw_sync(hw); - if (ixgbe_check_fw_error(adapter)) return ixgbe_recovery_probe(adapter); @@ -11850,8 +11853,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ether_addr_copy(hw->mac.addr, hw->mac.perm_addr); ixgbe_mac_set_default_filter(adapter); - if (hw->mac.type == ixgbe_mac_e610) - mutex_init(&hw->aci.lock); timer_setup(&adapter->service_timer, ixgbe_service_timer, 0); if (ixgbe_removed(hw->hw_addr)) { @@ -12007,9 +12008,9 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) devl_unlock(adapter->devlink); ixgbe_release_hw_control(adapter); ixgbe_clear_interrupt_scheme(adapter); +err_sw_init: if (hw->mac.type == ixgbe_mac_e610) mutex_destroy(&adapter->hw.aci.lock); -err_sw_init: ixgbe_disable_sriov(adapter); adapter->flags2 &= ~IXGBE_FLAG2_SEARCH_FOR_SFP; iounmap(adapter->io_addr); From 316ba68175b04a9f6f75295764789ea94e31d48c Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 8 Sep 2025 13:26:29 +0200 Subject: [PATCH 1948/3206] ixgbe: destroy aci.lock later within ixgbe_remove path There's another issue with aci.lock, and the previous patch uncovers it. aci.lock is destroyed during ixgbe removal while some of the ixgbe closing routines are still ongoing. These routines use the Admin Command Interface, which requires taking aci.lock; since the lock has already been destroyed, this leads to a call trace. [ +0.000004] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [ +0.000007] WARNING: CPU: 12 PID: 10277 at kernel/locking/mutex.c:155 mutex_lock+0x5f/0x70 [ +0.000002] Call Trace: [ +0.000003] [ +0.000006] ixgbe_aci_send_cmd+0xc8/0x220 [ixgbe] [ +0.000049] ?
try_to_wake_up+0x29d/0x5d0 [ +0.000009] ixgbe_disable_rx_e610+0xc4/0x110 [ixgbe] [ +0.000032] ixgbe_disable_rx+0x3d/0x200 [ixgbe] [ +0.000027] ixgbe_down+0x102/0x3b0 [ixgbe] [ +0.000031] ixgbe_close_suspend+0x28/0x90 [ixgbe] [ +0.000028] ixgbe_close+0xfb/0x100 [ixgbe] [ +0.000025] __dev_close_many+0xae/0x220 [ +0.000005] dev_close_many+0xc2/0x1a0 [ +0.000004] ? kernfs_should_drain_open_files+0x2a/0x40 [ +0.000005] unregister_netdevice_many_notify+0x204/0xb00 [ +0.000006] ? __kernfs_remove.part.0+0x109/0x210 [ +0.000006] ? kobj_kset_leave+0x4b/0x70 [ +0.000008] unregister_netdevice_queue+0xf6/0x130 [ +0.000006] unregister_netdev+0x1c/0x40 [ +0.000005] ixgbe_remove+0x216/0x290 [ixgbe] [ +0.000021] pci_device_remove+0x42/0xb0 [ +0.000007] device_release_driver_internal+0x19c/0x200 [ +0.000008] driver_detach+0x48/0x90 [ +0.000003] bus_remove_driver+0x6d/0xf0 [ +0.000006] pci_unregister_driver+0x2e/0xb0 [ +0.000005] ixgbe_exit_module+0x1c/0xc80 [ixgbe] Same as for the previous commit, the issue has been highlighted by the commit 337369f8ce9e ("locking/mutex: Add MUTEX_WARN_ON() into fast path"). Move destroying aci.lock to the end of ixgbe_remove(), as this simply fixes the issue. Fixes: 4600cdf9f5ac ("ixgbe: Enable link management in E610 device") Signed-off-by: Jedrzej Jagielski Reviewed-by: Przemek Kitszel Reviewed-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9a6a67a6d64464..6218bdb7f941f6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -12061,10 +12061,8 @@ static void ixgbe_remove(struct pci_dev *pdev) set_bit(__IXGBE_REMOVING, &adapter->state); cancel_work_sync(&adapter->service_task); - if (adapter->hw.mac.type == ixgbe_mac_e610) { + if (adapter->hw.mac.type == ixgbe_mac_e610) ixgbe_disable_link_status_events(adapter); - mutex_destroy(&adapter->hw.aci.lock); - } if (adapter->mii_bus) mdiobus_unregister(adapter->mii_bus); @@ -12124,6 +12122,9 @@ static void ixgbe_remove(struct pci_dev *pdev) disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state); free_netdev(netdev); + if (adapter->hw.mac.type == ixgbe_mac_e610) + mutex_destroy(&adapter->hw.aci.lock); + if (disable_dev) pci_disable_device(pdev); } From 528eb4e19ec0df30d0c9ae4074ce945667dde919 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Wed, 10 Sep 2025 22:47:21 +0900 Subject: [PATCH 1949/3206] igc: don't fail igc_probe() on LED setup error When igc_led_setup() fails, igc_probe() fails and triggers a kernel panic in free_netdev() since unregister_netdev() is not called. [1] This behavior can be tested using the fault-injection framework, especially the failslab feature. [2] Since LED support is not mandatory, treat LED setup failures as non-fatal and continue the probe with a warning message, consequently avoiding the kernel panic. [1] kernel BUG at net/core/dev.c:12047! Oops: invalid opcode: 0000 [#1] SMP NOPTI CPU: 0 UID: 0 PID: 937 Comm: repro-igc-led-e Not tainted 6.17.0-rc4-enjuk-tnguy-00865-gc4940196ab02 #64 PREEMPT(voluntary) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:free_netdev+0x278/0x2b0 [...] Call Trace: igc_probe+0x370/0x910 local_pci_probe+0x3a/0x80 pci_device_probe+0xd1/0x200 [...]
[2] #!/bin/bash -ex FAILSLAB_PATH=/sys/kernel/debug/failslab/ DEVICE=0000:00:05.0 START_ADDR=$(grep " igc_led_setup" /proc/kallsyms \ | awk '{printf("0x%s", $1)}') END_ADDR=$(printf "0x%x" $((START_ADDR + 0x100))) echo $START_ADDR > $FAILSLAB_PATH/require-start echo $END_ADDR > $FAILSLAB_PATH/require-end echo 1 > $FAILSLAB_PATH/times echo 100 > $FAILSLAB_PATH/probability echo N > $FAILSLAB_PATH/ignore-gfp-wait echo $DEVICE > /sys/bus/pci/drivers/igc/bind Fixes: ea578703b03d ("igc: Add support for LEDs on i225/i226") Signed-off-by: Kohei Enju Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Reviewed-by: Vitaly Lifshits Reviewed-by: Kurt Kanzenbach Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 1 + drivers/net/ethernet/intel/igc/igc_main.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 266bfcf2a28f02..a427f05814c1ae 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -345,6 +345,7 @@ struct igc_adapter { /* LEDs */ struct mutex led_mutex; struct igc_led_classdev *leds; + bool leds_available; }; void igc_up(struct igc_adapter *adapter); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index e79b14d50b2405..728d7ca5338bf2 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7335,8 +7335,14 @@ static int igc_probe(struct pci_dev *pdev, if (IS_ENABLED(CONFIG_IGC_LEDS)) { err = igc_led_setup(adapter); - if (err) - goto err_register; + if (err) { + netdev_warn_once(netdev, + "LED init failed (%d); continuing without LED support\n", + err); + adapter->leds_available = false; + } else { + adapter->leds_available = true; + } } return 0; @@ -7392,7 +7398,7 @@ static void igc_remove(struct pci_dev *pdev) cancel_work_sync(&adapter->watchdog_task); hrtimer_cancel(&adapter->hrtimer); - if (IS_ENABLED(CONFIG_IGC_LEDS)) + if (IS_ENABLED(CONFIG_IGC_LEDS) && adapter->leds_available) igc_led_free(adapter); /* Release control of h/w to f/w. If f/w is AMT enabled, this From f9b80514a7227c589291792cb6743b0ddf41c2bc Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 15 Sep 2025 20:59:02 -0500 Subject: [PATCH 1950/3206] drm/amd: Only restore cached manual clock settings in restore if OD enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If OD is not enabled then restoring cached clock settings doesn't make sense and actually leads to errors in resume. Check if enabled before restoring settings. 
Fixes: 4e9526924d09 ("drm/amd: Restore cached manual clock settings during resume") Reported-by: Jérôme Lécuyer Closes: https://lore.kernel.org/amd-gfx/0ffe2692-7bfa-4821-856e-dd0f18e2c32b@amd.com/T/#me6db8ddb192626360c462b7570ed7eba0c6c9733 Suggested-by: Jérôme Lécuyer Acked-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 1a4dd33cc6e1baaa81efdbe68227a19f51c50f20) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index b47cb4a5f4887d..408f05dfab9015 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -2236,7 +2236,7 @@ static int smu_resume(struct amdgpu_ip_block *ip_block) return ret; } - if (smu_dpm_ctx->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL) { + if (smu_dpm_ctx->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL && smu->od_enabled) { ret = smu_od_edit_dpm_table(smu, PP_OD_COMMIT_DPM_TABLE, NULL, 0); if (ret) return ret; From 57b100d4cf14276e0340eecb561005c07c129eb8 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Thu, 28 Aug 2025 12:00:00 +0530 Subject: [PATCH 1951/3206] tools/cpupower: fix error return value in cpupower_write_sysfs() The cpupower_write_sysfs() function currently returns -1 on write failure, but the function signature indicates it should return an unsigned int. Returning -1 from an unsigned function results in a large positive value rather than indicating an error condition. Fix this by returning 0 on failure, which is more appropriate for an unsigned return type and maintains consistency with typical success/failure semantics where 0 indicates failure and non-zero indicates success (bytes written). Link: https://lore.kernel.org/r/20250828063000.803229-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpupower.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/lib/cpupower.c b/tools/power/cpupower/lib/cpupower.c index ce8dfb8e46abda..d7f7ec6f151c27 100644 --- a/tools/power/cpupower/lib/cpupower.c +++ b/tools/power/cpupower/lib/cpupower.c @@ -56,7 +56,7 @@ unsigned int cpupower_write_sysfs(const char *path, char *buf, size_t buflen) if (numwritten < 1) { perror(path); close(fd); - return -1; + return 0; } close(fd); From 3a8ee3a9f4f6caca192fd2fdc88c1ce56c521b38 Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Tue, 22 Jul 2025 17:15:04 +0800 Subject: [PATCH 1952/3206] riscv: introduce ioremap_wc() Compared with IO attributes, NC attributes can improve performance, specifically in these aspects: relaxed ordering, gathering, support for read speculation, and support for unaligned access.
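As a usage sketch (the device resource here is hypothetical), a driver mapping a prefetchable region such as a frame buffer would pick the new write-combining mapping instead of a plain ioremap():

/* Illustrative only: res is assumed to describe a prefetchable BAR. */
void __iomem *fb = ioremap_wc(res->start, resource_size(res));

if (!fb)
	return -ENOMEM;
memset_io(fb, 0, resource_size(res));	/* stores can be gathered/reordered */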
Signed-off-by: Yunhui Cui Signed-off-by: Qingfang Deng Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250722091504.45974-2-cuiyunhui@bytedance.com Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/io.h | 4 ++++ arch/riscv/include/asm/pgtable.h | 1 + 2 files changed, 5 insertions(+) diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index a0e51840b9db43..09bb5f57a9d346 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -28,6 +28,10 @@ #ifdef CONFIG_MMU #define IO_SPACE_LIMIT (PCI_IO_SIZE - 1) #define PCI_IOBASE ((void __iomem *)PCI_IO_START) + +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), __pgprot(_PAGE_KERNEL_NC)) + #endif /* CONFIG_MMU */ /* diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 91697fbf1f9013..3d17399a06c772 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -203,6 +203,7 @@ extern struct pt_alloc_ops pt_ops __meminitdata; #define PAGE_TABLE __pgprot(_PAGE_TABLE) +#define _PAGE_KERNEL_NC ((_PAGE_KERNEL & ~_PAGE_MTMASK) | _PAGE_NOCACHE) #define _PAGE_IOREMAP ((_PAGE_KERNEL & ~_PAGE_MTMASK) | _PAGE_IO) #define PAGE_KERNEL_IO __pgprot(_PAGE_IOREMAP) From 35ebe00307f3f9b1a7103d0697632c6ef7310d1c Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 6 Jun 2025 09:09:51 +0200 Subject: [PATCH 1953/3206] riscv: Replace __ASSEMBLY__ with __ASSEMBLER__ in uapi headers __ASSEMBLY__ is only defined by the Makefile of the kernel, so this is not really useful for uapi headers (unless the userspace Makefile defines it, too). Let's switch to __ASSEMBLER__ which gets set automatically by the compiler when compiling assembly code. This is a completely mechanical patch (done with a simple "sed -i" statement). Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Alexandre Ghiti Cc: linux-riscv@lists.infradead.org Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20250606070952.498274-2-thuth@redhat.com Signed-off-by: Paul Walmsley --- arch/riscv/include/uapi/asm/kvm.h | 2 +- arch/riscv/include/uapi/asm/ptrace.h | 4 ++-- arch/riscv/include/uapi/asm/sigcontext.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index ef27d4289da118..251099d860aa46 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -9,7 +9,7 @@ #ifndef __LINUX_KVM_RISCV_H #define __LINUX_KVM_RISCV_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h index a38268b19c3d3d..beff8df80ac9c3 100644 --- a/arch/riscv/include/uapi/asm/ptrace.h +++ b/arch/riscv/include/uapi/asm/ptrace.h @@ -6,7 +6,7 @@ #ifndef _UAPI_ASM_RISCV_PTRACE_H #define _UAPI_ASM_RISCV_PTRACE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -127,6 +127,6 @@ struct __riscv_v_regset_state { */ #define RISCV_MAX_VLENB (8192) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_RISCV_PTRACE_H */ diff --git a/arch/riscv/include/uapi/asm/sigcontext.h b/arch/riscv/include/uapi/asm/sigcontext.h index cd4f175dc83763..748dffc9ae194c 100644 --- a/arch/riscv/include/uapi/asm/sigcontext.h +++ b/arch/riscv/include/uapi/asm/sigcontext.h @@ -15,7 +15,7 @@ /* The size of END signal context header. 
*/ #define END_HDR_SIZE 0x0 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct __sc_riscv_v_state { struct __riscv_v_ext_state v_state; @@ -35,6 +35,6 @@ struct sigcontext { }; }; -#endif /*!__ASSEMBLY__*/ +#endif /*!__ASSEMBLER__*/ #endif /* _UAPI_ASM_RISCV_SIGCONTEXT_H */ From f811f58597acba9100dd61cdef052d1d1f931968 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 6 Jun 2025 09:09:52 +0200 Subject: [PATCH 1954/3206] riscv: Replace __ASSEMBLY__ with __ASSEMBLER__ in non-uapi headers While the GCC and Clang compilers already define __ASSEMBLER__ automatically when compiling assembly code, __ASSEMBLY__ is a macro that only gets defined by the Makefiles in the kernel. This can be very confusing when switching between userspace and kernelspace coding, or when dealing with uapi headers that rather should use __ASSEMBLER__ instead. So let's standardize on the __ASSEMBLER__ macro that is provided by the compilers now. This originally was a completely mechanical patch (done with a simple "sed -i" statement), with some manual fixups during rebasing of the patch later. Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Alexandre Ghiti Cc: linux-riscv@lists.infradead.org Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20250606070952.498274-3-thuth@redhat.com Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/alternative-macros.h | 12 ++++++------ arch/riscv/include/asm/alternative.h | 2 +- arch/riscv/include/asm/asm-extable.h | 6 +++--- arch/riscv/include/asm/asm.h | 10 +++++----- arch/riscv/include/asm/assembler.h | 2 +- arch/riscv/include/asm/barrier.h | 4 ++-- arch/riscv/include/asm/cache.h | 4 ++-- arch/riscv/include/asm/cpu_ops_sbi.h | 2 +- arch/riscv/include/asm/csr.h | 4 ++-- arch/riscv/include/asm/current.h | 4 ++-- arch/riscv/include/asm/errata_list.h | 6 +++--- arch/riscv/include/asm/ftrace.h | 6 +++--- arch/riscv/include/asm/gpr-num.h | 6 +++--- arch/riscv/include/asm/image.h | 4 ++-- arch/riscv/include/asm/insn-def.h | 8 ++++---- arch/riscv/include/asm/jump_label.h | 4 ++-- arch/riscv/include/asm/kasan.h | 2 +- arch/riscv/include/asm/kgdb.h | 4 ++-- arch/riscv/include/asm/mmu.h | 4 ++-- arch/riscv/include/asm/page.h | 4 ++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/riscv/include/asm/processor.h | 4 ++-- arch/riscv/include/asm/ptrace.h | 4 ++-- arch/riscv/include/asm/scs.h | 4 ++-- arch/riscv/include/asm/set_memory.h | 4 ++-- arch/riscv/include/asm/thread_info.h | 4 ++-- arch/riscv/include/asm/vdso.h | 4 ++-- arch/riscv/include/asm/vdso/getrandom.h | 4 ++-- arch/riscv/include/asm/vdso/gettimeofday.h | 4 ++-- arch/riscv/include/asm/vdso/processor.h | 4 ++-- arch/riscv/include/asm/vdso/vsyscall.h | 4 ++-- tools/arch/riscv/include/asm/csr.h | 6 +++--- tools/arch/riscv/include/asm/vdso/processor.h | 4 ++-- 33 files changed, 76 insertions(+), 76 deletions(-) diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index 231d777d936c2d..9619bd5c8ebaa3 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -4,7 +4,7 @@ #ifdef CONFIG_RISCV_ALTERNATIVE -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro ALT_ENTRY oldptr newptr vendor_id patch_id new_len .4byte \oldptr - . @@ -53,7 +53,7 @@ #define __ALTERNATIVE_CFG(...) ALTERNATIVE_CFG __VA_ARGS__ #define __ALTERNATIVE_CFG_2(...) 
ALTERNATIVE_CFG_2 __VA_ARGS__ -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #include #include @@ -98,7 +98,7 @@ __ALTERNATIVE_CFG(old_c, new_c_1, vendor_id_1, patch_id_1, enable_1) \ ALT_NEW_CONTENT(vendor_id_2, patch_id_2, enable_2, new_c_2) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, patch_id, CONFIG_k) \ __ALTERNATIVE_CFG(old_c, new_c, vendor_id, patch_id, IS_ENABLED(CONFIG_k)) @@ -109,7 +109,7 @@ new_c_2, vendor_id_2, patch_id_2, IS_ENABLED(CONFIG_k_2)) #else /* CONFIG_RISCV_ALTERNATIVE */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro ALTERNATIVE_CFG old_c \old_c @@ -118,12 +118,12 @@ #define __ALTERNATIVE_CFG(old_c, ...) ALTERNATIVE_CFG old_c #define __ALTERNATIVE_CFG_2(old_c, ...) ALTERNATIVE_CFG old_c -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #define __ALTERNATIVE_CFG(old_c, ...) old_c "\n" #define __ALTERNATIVE_CFG_2(old_c, ...) old_c "\n" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define _ALTERNATIVE_CFG(old_c, ...) __ALTERNATIVE_CFG(old_c) #define _ALTERNATIVE_CFG_2(old_c, ...) __ALTERNATIVE_CFG_2(old_c) diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index 3c2b59b2501792..0e95539ba451ba 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -8,7 +8,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_RISCV_ALTERNATIVE diff --git a/arch/riscv/include/asm/asm-extable.h b/arch/riscv/include/asm/asm-extable.h index 0c8bfd54fc4e05..37d425d7a76296 100644 --- a/arch/riscv/include/asm/asm-extable.h +++ b/arch/riscv/include/asm/asm-extable.h @@ -10,7 +10,7 @@ #ifdef CONFIG_MMU -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __ASM_EXTABLE_RAW(insn, fixup, type, data) \ .pushsection __ex_table, "a"; \ @@ -25,7 +25,7 @@ __ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_FIXUP, 0) .endm -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #include #include @@ -77,7 +77,7 @@ EX_DATA_REG(ADDR, addr) \ ")") -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #else /* CONFIG_MMU */ #define _ASM_EXTABLE_UACCESS_ERR(insn, fixup, err) diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h index 2a16e88e13deda..8bd2a11382a390 100644 --- a/arch/riscv/include/asm/asm.h +++ b/arch/riscv/include/asm/asm.h @@ -6,7 +6,7 @@ #ifndef _ASM_RISCV_ASM_H #define _ASM_RISCV_ASM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __ASM_STR(x) x #else #define __ASM_STR(x) #x @@ -30,7 +30,7 @@ #define SRLI __REG_SEL(srliw, srli) #if __SIZEOF_POINTER__ == 8 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define RISCV_PTR .dword #define RISCV_SZPTR 8 #define RISCV_LGPTR 3 @@ -40,7 +40,7 @@ #define RISCV_LGPTR "3" #endif #elif __SIZEOF_POINTER__ == 4 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define RISCV_PTR .word #define RISCV_SZPTR 4 #define RISCV_LGPTR 2 @@ -69,7 +69,7 @@ #error "Unexpected __SIZEOF_SHORT__" #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include /* Common assembly source macros */ @@ -194,6 +194,6 @@ #define ASM_NOKPROBE(name) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_ASM_H */ diff --git a/arch/riscv/include/asm/assembler.h b/arch/riscv/include/asm/assembler.h index 44b1457d3e9567..16931712beab64 100644 --- a/arch/riscv/include/asm/assembler.h +++ b/arch/riscv/include/asm/assembler.h @@ -5,7 +5,7 @@ * Author: Jee Heng Sia */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #error "Only include this from assembly code" #endif diff --git 
a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h index b8c5726d86acb1..700ba3f922cb51 100644 --- a/arch/riscv/include/asm/barrier.h +++ b/arch/riscv/include/asm/barrier.h @@ -10,7 +10,7 @@ #ifndef _ASM_RISCV_BARRIER_H #define _ASM_RISCV_BARRIER_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -82,6 +82,6 @@ do { \ #include -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_BARRIER_H */ diff --git a/arch/riscv/include/asm/cache.h b/arch/riscv/include/asm/cache.h index 570e9d8acad1e5..eb42b739d1328c 100644 --- a/arch/riscv/include/asm/cache.h +++ b/arch/riscv/include/asm/cache.h @@ -24,7 +24,7 @@ #define ARCH_SLAB_MINALIGN 16 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern int dma_cache_alignment; #ifdef CONFIG_RISCV_DMA_NONCOHERENT @@ -35,6 +35,6 @@ static inline int dma_get_cache_alignment(void) } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_CACHE_H */ diff --git a/arch/riscv/include/asm/cpu_ops_sbi.h b/arch/riscv/include/asm/cpu_ops_sbi.h index d6e4665b31954c..776fa55fbaa456 100644 --- a/arch/riscv/include/asm/cpu_ops_sbi.h +++ b/arch/riscv/include/asm/cpu_ops_sbi.h @@ -5,7 +5,7 @@ #ifndef __ASM_CPU_OPS_SBI_H #define __ASM_CPU_OPS_SBI_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 6fed42e377059c..4a37a98398ad3b 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -513,7 +513,7 @@ #define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER) #define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define csr_swap(csr, val) \ ({ \ @@ -575,6 +575,6 @@ : "memory"); \ }) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_CSR_H */ diff --git a/arch/riscv/include/asm/current.h b/arch/riscv/include/asm/current.h index 21774d868c65bd..ba5aa72aff631a 100644 --- a/arch/riscv/include/asm/current.h +++ b/arch/riscv/include/asm/current.h @@ -13,7 +13,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct task_struct; @@ -35,6 +35,6 @@ static __always_inline struct task_struct *get_current(void) register unsigned long current_stack_pointer __asm__("sp"); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_CURRENT_H */ diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index 6e426ed7919a4a..e17d6c98b3bfdf 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -29,7 +29,7 @@ #define ERRATA_THEAD_NUMBER 3 #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define ALT_INSN_FAULT(x) \ ALTERNATIVE(__stringify(RISCV_PTR do_trap_insn_fault), \ @@ -42,7 +42,7 @@ ALTERNATIVE(__stringify(RISCV_PTR do_page_fault), \ __stringify(RISCV_PTR sifive_cip_453_page_fault_trp), \ SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453, \ CONFIG_ERRATA_SIFIVE_CIP_453) -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #define ALT_SFENCE_VMA_ASID(asid) \ asm(ALTERNATIVE("sfence.vma x0, %0", "sfence.vma", SIFIVE_VENDOR_ID, \ @@ -123,6 +123,6 @@ asm volatile(ALTERNATIVE( \ #define THEAD_C9XX_RV_IRQ_PMU 17 #define THEAD_C9XX_CSR_SCOUNTEROF 0x5c5 -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 22ebea3c2b26c1..e5026cd8f022f5 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -13,7 +13,7 @@ #endif #define 
ARCH_SUPPORTS_FTRACE_OPS 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void *return_address(unsigned int level); @@ -112,7 +112,7 @@ do { \ #define MCOUNT_JALR_SIZE 4 #define MCOUNT_NOP4_SIZE 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct dyn_ftrace; int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); #define ftrace_init_nop ftrace_init_nop @@ -235,7 +235,7 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsi #endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/riscv/include/asm/gpr-num.h b/arch/riscv/include/asm/gpr-num.h index efeb5edf8a3af1..b499cf83273415 100644 --- a/arch/riscv/include/asm/gpr-num.h +++ b/arch/riscv/include/asm/gpr-num.h @@ -2,7 +2,7 @@ #ifndef __ASM_GPR_NUM_H #define __ASM_GPR_NUM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 .equ .L__gpr_num_x\num, \num @@ -41,7 +41,7 @@ .equ .L__gpr_num_t5, 30 .equ .L__gpr_num_t6, 31 -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __DEFINE_ASM_GPR_NUMS \ " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \ @@ -80,6 +80,6 @@ " .equ .L__gpr_num_t5, 30\n" \ " .equ .L__gpr_num_t6, 31\n" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_GPR_NUM_H */ diff --git a/arch/riscv/include/asm/image.h b/arch/riscv/include/asm/image.h index 8927a6ea1127e2..899254966e8585 100644 --- a/arch/riscv/include/asm/image.h +++ b/arch/riscv/include/asm/image.h @@ -29,7 +29,7 @@ #define RISCV_HEADER_VERSION (RISCV_HEADER_VERSION_MAJOR << 16 | \ RISCV_HEADER_VERSION_MINOR) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define riscv_image_flag_field(flags, field)\ (((flags) >> field##_SHIFT) & field##_MASK) /** @@ -63,5 +63,5 @@ struct riscv_image_header { u32 magic2; u32 res3; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_IMAGE_H */ diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index d5adbaec1d010d..c9cfcea52cbbcf 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -25,7 +25,7 @@ #define INSN_S_SIMM5_SHIFT 7 #define INSN_S_OPCODE_SHIFT 0 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef CONFIG_AS_HAS_INSN @@ -77,7 +77,7 @@ #define __INSN_I(...) insn_i __VA_ARGS__ #define __INSN_S(...) insn_s __VA_ARGS__ -#else /* ! __ASSEMBLY__ */ +#else /* ! __ASSEMBLER__ */ #ifdef CONFIG_AS_HAS_INSN @@ -153,7 +153,7 @@ #endif -#endif /* ! __ASSEMBLY__ */ +#endif /* ! 
__ASSEMBLER__ */ #define INSN_R(opcode, func3, func7, rd, rs1, rs2) \ __INSN_R(RV_##opcode, RV_##func3, RV_##func7, \ @@ -263,7 +263,7 @@ #define RISCV_INSN_NOP4 _AC(0x00000013, U) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define nop() __asm__ __volatile__ ("nop") #define __nops(n) ".rept " #n "\nnop\n.endr\n" #define nops(n) __asm__ __volatile__ (__nops(n)) diff --git a/arch/riscv/include/asm/jump_label.h b/arch/riscv/include/asm/jump_label.h index 87a71cc6d146ce..3ab5f2e3212bec 100644 --- a/arch/riscv/include/asm/jump_label.h +++ b/arch/riscv/include/asm/jump_label.h @@ -7,7 +7,7 @@ #ifndef __ASM_JUMP_LABEL_H #define __ASM_JUMP_LABEL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -66,5 +66,5 @@ static __always_inline bool arch_static_branch_jump(struct static_key * const ke return true; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_JUMP_LABEL_H */ diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index e6a0071bdb56c4..60af6691f90321 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -4,7 +4,7 @@ #ifndef __ASM_KASAN_H #define __ASM_KASAN_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * The following comment was copied from arm64: diff --git a/arch/riscv/include/asm/kgdb.h b/arch/riscv/include/asm/kgdb.h index cc11c4544cffd1..7559d728c5ff9b 100644 --- a/arch/riscv/include/asm/kgdb.h +++ b/arch/riscv/include/asm/kgdb.h @@ -17,12 +17,12 @@ #define BREAK_INSTR_SIZE 4 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ void arch_kgdb_breakpoint(void); extern unsigned long kgdb_compiled_break; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #define DBG_REG_ZERO "zero" #define DBG_REG_RA "ra" diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 1cc90465d75b18..cf8e6eac77d520 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -7,7 +7,7 @@ #ifndef _ASM_RISCV_MMU_H #define _ASM_RISCV_MMU_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef struct { #ifndef CONFIG_MMU @@ -40,6 +40,6 @@ typedef struct { void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, pgprot_t prot); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_MMU_H */ diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 572a141ddecdb1..ffe213ad65a4ee 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -41,7 +41,7 @@ #define PAGE_OFFSET ((unsigned long)phys_ram_base) #endif /* CONFIG_MMU */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_RISCV_ISA_ZICBOZ void clear_page(void *page); @@ -199,7 +199,7 @@ static __always_inline void *pfn_to_kaddr(unsigned long pfn) return __va(pfn << PAGE_SHIFT); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define virt_addr_valid(vaddr) ({ \ unsigned long _addr = (unsigned long)vaddr; \ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 3d17399a06c772..224eb3376d04ff 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -111,7 +111,7 @@ #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -1119,6 +1119,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ set_pgd(pgdp, pgd); \ }) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_RISCV_PGTABLE_H */ diff --git a/arch/riscv/include/asm/processor.h 
b/arch/riscv/include/asm/processor.h index 24d3af4d3807e3..da5426122d280b 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -54,7 +54,7 @@ #define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE / 3) #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct task_struct; struct pt_regs; @@ -215,6 +215,6 @@ long get_tagged_addr_ctrl(struct task_struct *task); #define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl(current) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_PROCESSOR_H */ diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index a7dc0e33075796..addc8188152f7f 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -10,7 +10,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct pt_regs { unsigned long epc; @@ -180,6 +180,6 @@ static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) return !(regs->status & SR_PIE); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_PTRACE_H */ diff --git a/arch/riscv/include/asm/scs.h b/arch/riscv/include/asm/scs.h index 0e45db78b24bf2..ab7714aa93bdc4 100644 --- a/arch/riscv/include/asm/scs.h +++ b/arch/riscv/include/asm/scs.h @@ -2,7 +2,7 @@ #ifndef _ASM_SCS_H #define _ASM_SCS_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include #ifdef CONFIG_SHADOW_CALL_STACK @@ -49,6 +49,6 @@ .endm #endif /* CONFIG_SHADOW_CALL_STACK */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_SCS_H */ diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index ea263d3683ef6f..87389e93325a3b 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -6,7 +6,7 @@ #ifndef _ASM_RISCV_SET_MEMORY_H #define _ASM_RISCV_SET_MEMORY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Functions to change memory attributes. 
*/ @@ -45,7 +45,7 @@ int set_direct_map_default_noflush(struct page *page); int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); bool kernel_page_present(struct page *page); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_XIP_KERNEL) #ifdef CONFIG_64BIT diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index f5916a70879a87..c33d8b7dd4880d 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -37,7 +37,7 @@ #define IRQ_STACK_SIZE THREAD_SIZE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -98,7 +98,7 @@ struct thread_info { void arch_release_task_struct(struct task_struct *tsk); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * thread information flags diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index c130d8100232cb..f80357fe24d111 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -16,7 +16,7 @@ #define __VDSO_PAGES 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #define VDSO_SYMBOL(base, name) \ @@ -34,7 +34,7 @@ extern char compat_vdso_start[], compat_vdso_end[]; extern char vdso_start[], vdso_end[]; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* CONFIG_MMU */ diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h index c6d66895c1f585..ab4aef9550998e 100644 --- a/arch/riscv/include/asm/vdso/getrandom.h +++ b/arch/riscv/include/asm/vdso/getrandom.h @@ -5,7 +5,7 @@ #ifndef __ASM_VDSO_GETRANDOM_H #define __ASM_VDSO_GETRANDOM_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -25,6 +25,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns return ret; } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h index 29164f84f93cec..9ec08fa04d35a3 100644 --- a/arch/riscv/include/asm/vdso/gettimeofday.h +++ b/arch/riscv/include/asm/vdso/gettimeofday.h @@ -2,7 +2,7 @@ #ifndef __ASM_VDSO_GETTIMEOFDAY_H #define __ASM_VDSO_GETTIMEOFDAY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -79,6 +79,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return csr_read(CSR_TIME); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h index 8f383f05a290f1..98fb44336c055f 100644 --- a/arch/riscv/include/asm/vdso/processor.h +++ b/arch/riscv/include/asm/vdso/processor.h @@ -2,7 +2,7 @@ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -23,6 +23,6 @@ static inline void cpu_relax(void) barrier(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/riscv/include/asm/vdso/vsyscall.h b/arch/riscv/include/asm/vdso/vsyscall.h index 1140b54b4bc827..558eb9dfda5203 100644 --- a/arch/riscv/include/asm/vdso/vsyscall.h +++ b/arch/riscv/include/asm/vdso/vsyscall.h @@ -2,13 +2,13 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /* The asm-generic header needs to be included after 
the definitions above */ #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h index 0dfc09254f99af..56d7367ee344c1 100644 --- a/tools/arch/riscv/include/asm/csr.h +++ b/tools/arch/riscv/include/asm/csr.h @@ -468,13 +468,13 @@ #define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER) #define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT) -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __ASM_STR(x) x #else #define __ASM_STR(x) #x #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define csr_swap(csr, val) \ ({ \ @@ -536,6 +536,6 @@ : "memory"); \ }) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_CSR_H */ diff --git a/tools/arch/riscv/include/asm/vdso/processor.h b/tools/arch/riscv/include/asm/vdso/processor.h index 662aca03984817..0665b117f30f27 100644 --- a/tools/arch/riscv/include/asm/vdso/processor.h +++ b/tools/arch/riscv/include/asm/vdso/processor.h @@ -2,7 +2,7 @@ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -27,6 +27,6 @@ static inline void cpu_relax(void) barrier(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ From f2fab612824ffc8314d3a752724dd37a3ce27a31 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Tue, 13 May 2025 17:16:31 +0200 Subject: [PATCH 1955/3206] riscv: Add kprobes KUnit test Add a KUnit test for riscv kprobes, mostly for simulated instructions. The test installs kprobes into multiple sample functions and checks that these functions still return the expected magic value. This test can detect some kprobe bugs reported in the past (see the Link: tags below). Link: https://lore.kernel.org/linux-riscv/20241119111056.2554419-1-namcao@linutronix.de/ Link: https://lore.kernel.org/stable/c7e463c0-8cad-4f4e-addd-195c06b7b6de@iscas.ac.cn/ Link: https://lore.kernel.org/linux-riscv/20230829182500.61875-1-namcaov@gmail.com/ Signed-off-by: Nam Cao Tested-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250513151631.3520793-1-namcao@linutronix.de Signed-off-by: Paul Walmsley --- arch/riscv/kernel/tests/Kconfig.debug | 12 + arch/riscv/kernel/tests/Makefile | 1 + arch/riscv/kernel/tests/kprobes/Makefile | 1 + .../kernel/tests/kprobes/test-kprobes-asm.S | 229 ++++++++++++++++++ .../riscv/kernel/tests/kprobes/test-kprobes.c | 56 +++++ .../riscv/kernel/tests/kprobes/test-kprobes.h | 24 ++ 6 files changed, 323 insertions(+) create mode 100644 arch/riscv/kernel/tests/kprobes/Makefile create mode 100644 arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S create mode 100644 arch/riscv/kernel/tests/kprobes/test-kprobes.c create mode 100644 arch/riscv/kernel/tests/kprobes/test-kprobes.h diff --git a/arch/riscv/kernel/tests/Kconfig.debug b/arch/riscv/kernel/tests/Kconfig.debug index 78cea5d2c27022..5db4df44279e9d 100644 --- a/arch/riscv/kernel/tests/Kconfig.debug +++ b/arch/riscv/kernel/tests/Kconfig.debug @@ -30,6 +30,18 @@ config RISCV_MODULE_LINKING_KUNIT If unsure, say N. +config RISCV_KPROBES_KUNIT + bool "KUnit test for riscv kprobes" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on KPROBES + default KUNIT_ALL_TESTS + help + Enable testing for riscv kprobes. Useful for riscv and/or kprobes + development. The test verifies that kprobes do not change the behaviour + of some sample functions. + + If unsure, say N.
+ endif # RUNTIME_TESTING_MENU endmenu # "arch/riscv/kernel runtime Testing" diff --git a/arch/riscv/kernel/tests/Makefile b/arch/riscv/kernel/tests/Makefile index 7d6c76cffe2067..407e7e6c28dcbc 100644 --- a/arch/riscv/kernel/tests/Makefile +++ b/arch/riscv/kernel/tests/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_RISCV_MODULE_LINKING_KUNIT) += module_test/ +obj-$(CONFIG_RISCV_KPROBES_KUNIT) += kprobes/ diff --git a/arch/riscv/kernel/tests/kprobes/Makefile b/arch/riscv/kernel/tests/kprobes/Makefile new file mode 100644 index 00000000000000..4cb6c66a98e8ea --- /dev/null +++ b/arch/riscv/kernel/tests/kprobes/Makefile @@ -0,0 +1 @@ +obj-y += test-kprobes.o test-kprobes-asm.o diff --git a/arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S b/arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S new file mode 100644 index 00000000000000..b951d0f1248231 --- /dev/null +++ b/arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#include +#include +#include "test-kprobes.h" + +SYM_FUNC_START(test_kprobes_add) + li a1, KPROBE_TEST_MAGIC_UPPER + li a2, KPROBE_TEST_MAGIC_LOWER +test_kprobes_add_addr1: + add a1, a1, a2 +test_kprobes_add_addr2: + add a0, a1, x0 + ret +SYM_FUNC_END(test_kprobes_add) + +SYM_FUNC_START(test_kprobes_jal) + li a0, 0 + mv a1, ra + .option push + .option norvc +test_kprobes_jal_addr1: + jal x0, 2f + ret + .option pop +1: li a0, KPROBE_TEST_MAGIC_UPPER + ret + .option push + .option norvc +test_kprobes_jal_addr2: +2: jal 1b + .option pop + li a2, KPROBE_TEST_MAGIC_LOWER + add a0, a0, a2 + jr a1 +SYM_FUNC_END(test_kprobes_jal) + +SYM_FUNC_START(test_kprobes_jalr) + la a0, 1f + mv a1, ra + .option push + .option norvc +test_kprobes_jalr_addr: + jalr a0 + .option pop + li t0, KPROBE_TEST_MAGIC_UPPER + add a0, a0, t0 + jr a1 +1: li a0, KPROBE_TEST_MAGIC_LOWER + ret +SYM_FUNC_END(test_kprobes_jalr) + +SYM_FUNC_START(test_kprobes_auipc) +test_kprobes_auipc_addr: + auipc a0, KPROBE_TEST_MAGIC_LOWER + la a1, test_kprobes_auipc_addr + sub a0, a0, a1 + srli a0, a0, 12 + li a1, KPROBE_TEST_MAGIC_UPPER + add a0, a0, a1 + ret +SYM_FUNC_END(test_kprobes_auipc) + +SYM_FUNC_START(test_kprobes_branch) + .option push + .option norvc + li a0, 0 + li a1, 1 + li a2, 2 +test_kprobes_branch_addr1: + beqz a0, 1f + ret +1: +test_kprobes_branch_addr2: + beqz a1, 3f +test_kprobes_branch_addr3: + bnez a0, 3f +test_kprobes_branch_addr4: + bnez a2, 1f + ret +1: +test_kprobes_branch_addr5: + bge a1, a2, 3f +test_kprobes_branch_addr6: + bge a2, a1, 2f + ret +1: + li t0, KPROBE_TEST_MAGIC_UPPER + add a0, a0, t0 + ret +2: +test_kprobes_branch_addr7: + blt a2, a1, 3f + li a0, KPROBE_TEST_MAGIC_LOWER +test_kprobes_branch_addr8: + blt a1, a2, 1b +3: + li a0, 0 + ret + .option pop +SYM_FUNC_END(test_kprobes_branch) + +#ifdef CONFIG_RISCV_ISA_C + +SYM_FUNC_START(test_kprobes_c_j) + li a0, 0 +test_kprobes_branch_c_j_addr1: + c.j 2f +1: + li a1, KPROBE_TEST_MAGIC_UPPER + add a0, a0, a1 + ret +2: li a0, KPROBE_TEST_MAGIC_LOWER +test_kprobes_branch_c_j_addr2: + c.j 1b +SYM_FUNC_END(test_kprobes_c_j) + +SYM_FUNC_START(test_kprobes_c_jr) + la a0, 2f +test_kprobes_c_jr_addr1: + c.jr a0 + ret +1: li a1, KPROBE_TEST_MAGIC_LOWER + add a0, a0, a1 + ret +2: + li a0, KPROBE_TEST_MAGIC_UPPER + la a1, 1b +test_kprobes_c_jr_addr2: + c.jr a1 +SYM_FUNC_END(test_kprobes_c_jr) + +SYM_FUNC_START(test_kprobes_c_jalr) + mv a1, ra + la a0, 1f +test_kprobes_c_jalr_addr: + c.jalr a0 + li a2, KPROBE_TEST_MAGIC_UPPER + add a0, a0, a2 + jr a1 +1: li a0, KPROBE_TEST_MAGIC_LOWER + ret 
+SYM_FUNC_END(test_kprobes_c_jalr) + +SYM_FUNC_START(test_kprobes_c_beqz) + li a0, 0 + li a1, 1 +test_kprobes_c_beqz_addr1: + c.beqz a0, 2f + ret +1: li a1, KPROBE_TEST_MAGIC_UPPER + add a0, a0, a1 + ret +test_kprobes_c_beqz_addr2: +2: c.beqz a1, 3f + li a0, KPROBE_TEST_MAGIC_LOWER + mv a1, x0 +test_kprobes_c_beqz_addr3: + c.beqz a1, 1b +3: li a0, 0 + ret +SYM_FUNC_END(test_kprobes_c_beqz) + +SYM_FUNC_START(test_kprobes_c_bnez) + li a0, 0 + li a1, 1 +test_kprobes_c_bnez_addr1: + c.bnez a1, 2f + ret +1: li a1, KPROBE_TEST_MAGIC_UPPER + add a0, a0, a1 + ret +test_kprobes_c_bnez_addr2: +2: c.bnez a0, 3f + li a0, KPROBE_TEST_MAGIC_LOWER +test_kprobes_c_bnez_addr3: + c.bnez a0, 1b +3: li a0, 0 + ret +SYM_FUNC_END(test_kprobes_c_bnez) + +#endif /* CONFIG_RISCV_ISA_C */ + +SYM_DATA_START(test_kprobes_addresses) + RISCV_PTR test_kprobes_add_addr1 + RISCV_PTR test_kprobes_add_addr2 + RISCV_PTR test_kprobes_jal_addr1 + RISCV_PTR test_kprobes_jal_addr2 + RISCV_PTR test_kprobes_jalr_addr + RISCV_PTR test_kprobes_auipc_addr + RISCV_PTR test_kprobes_branch_addr1 + RISCV_PTR test_kprobes_branch_addr2 + RISCV_PTR test_kprobes_branch_addr3 + RISCV_PTR test_kprobes_branch_addr4 + RISCV_PTR test_kprobes_branch_addr5 + RISCV_PTR test_kprobes_branch_addr6 + RISCV_PTR test_kprobes_branch_addr7 + RISCV_PTR test_kprobes_branch_addr8 +#ifdef CONFIG_RISCV_ISA_C + RISCV_PTR test_kprobes_branch_c_j_addr1 + RISCV_PTR test_kprobes_branch_c_j_addr2 + RISCV_PTR test_kprobes_c_jr_addr1 + RISCV_PTR test_kprobes_c_jr_addr2 + RISCV_PTR test_kprobes_c_jalr_addr + RISCV_PTR test_kprobes_c_beqz_addr1 + RISCV_PTR test_kprobes_c_beqz_addr2 + RISCV_PTR test_kprobes_c_beqz_addr3 + RISCV_PTR test_kprobes_c_bnez_addr1 + RISCV_PTR test_kprobes_c_bnez_addr2 + RISCV_PTR test_kprobes_c_bnez_addr3 +#endif /* CONFIG_RISCV_ISA_C */ + RISCV_PTR 0 +SYM_DATA_END(test_kprobes_addresses) + +SYM_DATA_START(test_kprobes_functions) + RISCV_PTR test_kprobes_add + RISCV_PTR test_kprobes_jal + RISCV_PTR test_kprobes_jalr + RISCV_PTR test_kprobes_auipc + RISCV_PTR test_kprobes_branch +#ifdef CONFIG_RISCV_ISA_C + RISCV_PTR test_kprobes_c_j + RISCV_PTR test_kprobes_c_jr + RISCV_PTR test_kprobes_c_jalr + RISCV_PTR test_kprobes_c_beqz + RISCV_PTR test_kprobes_c_bnez +#endif /* CONFIG_RISCV_ISA_C */ + RISCV_PTR 0 +SYM_DATA_END(test_kprobes_functions) diff --git a/arch/riscv/kernel/tests/kprobes/test-kprobes.c b/arch/riscv/kernel/tests/kprobes/test-kprobes.c new file mode 100644 index 00000000000000..6f6cdfbf5a9588 --- /dev/null +++ b/arch/riscv/kernel/tests/kprobes/test-kprobes.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include "test-kprobes.h" + +static int kprobe_dummy_handler(struct kprobe *kp, struct pt_regs *regs) +{ + return 0; +} + +static void test_kprobe_riscv(struct kunit *test) +{ + unsigned int num_kprobe = 0; + long (*func)(void); + struct kprobe *kp; + int i; + + while (test_kprobes_addresses[num_kprobe]) + num_kprobe++; + + kp = kcalloc(num_kprobe, sizeof(*kp), GFP_KERNEL); + KUNIT_EXPECT_TRUE(test, kp); + if (!kp) + return; + + for (i = 0; i < num_kprobe; ++i) { + kp[i].addr = test_kprobes_addresses[i]; + kp[i].pre_handler = kprobe_dummy_handler; + KUNIT_EXPECT_EQ(test, 0, register_kprobe(&kp[i])); + } + + for (i = 0;; ++i) { + func = test_kprobes_functions[i]; + if (!func) + break; + KUNIT_EXPECT_EQ_MSG(test, KPROBE_TEST_MAGIC, func(), "function %d broken", i); + } + + for (i = 0; i < num_kprobe; ++i) + unregister_kprobe(&kp[i]); + kfree(kp); +} + +static struct kunit_case kprobes_testcases[] 
= { + KUNIT_CASE(test_kprobe_riscv), + {} +}; + +static struct kunit_suite kprobes_test_suite = { + .name = "kprobes_test_riscv", + .test_cases = kprobes_testcases, +}; + +kunit_test_suites(&kprobes_test_suite); diff --git a/arch/riscv/kernel/tests/kprobes/test-kprobes.h b/arch/riscv/kernel/tests/kprobes/test-kprobes.h new file mode 100644 index 00000000000000..3886ab491ecba3 --- /dev/null +++ b/arch/riscv/kernel/tests/kprobes/test-kprobes.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_KPROBES_H +#define TEST_KPROBES_H + +/* + * The magic value that all the functions in the test_kprobes_functions array return. The test + * installs kprobes into these functions, and verify that the functions still correctly return this + * value. + */ +#define KPROBE_TEST_MAGIC 0xcafebabe +#define KPROBE_TEST_MAGIC_LOWER 0x0000babe +#define KPROBE_TEST_MAGIC_UPPER 0xcafe0000 + +#ifndef __ASSEMBLY__ + +/* array of addresses to install kprobes */ +extern void *test_kprobes_addresses[]; + +/* array of functions that return KPROBE_TEST_MAGIC */ +extern long (*test_kprobes_functions[])(void); + +#endif /* __ASSEMBLY__ */ + +#endif /* TEST_KPROBES_H */ From 932131fd3ed21538a9b16f14e46b2794f244a196 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 20 Jun 2025 20:21:57 +0000 Subject: [PATCH 1956/3206] riscv: Fix typo EXRACT -> EXTRACT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simply fix a typo. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Andrew Jones Signed-off-by: Alexandre Ghiti Reviewed-by: Clément Léger Link: https://lore.kernel.org/r/20250620-dev-alex-insn_duplicate_v5_manual-v5-1-d865dc9ad180@rivosinc.com Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/insn.h | 2 +- arch/riscv/kernel/vector.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h index 09fde95a5e8f75..2a589a58b2917d 100644 --- a/arch/riscv/include/asm/insn.h +++ b/arch/riscv/include/asm/insn.h @@ -352,7 +352,7 @@ static __always_inline bool riscv_insn_is_c_jalr(u32 code) ({typeof(x) x_ = (x); RV_X(x_, RVFDQ_FL_FS_WIDTH_OFF, \ RVFDQ_FL_FS_WIDTH_MASK); }) -#define RVV_EXRACT_VL_VS_WIDTH(x) RVFDQ_EXTRACT_FL_FS_WIDTH(x) +#define RVV_EXTRACT_VL_VS_WIDTH(x) RVFDQ_EXTRACT_FL_FS_WIDTH(x) /* * Get the immediate from a J-type instruction. diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index 184f780c932d44..901e67adf57608 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -93,7 +93,7 @@ bool insn_is_vector(u32 insn_buf) return true; case RVV_OPCODE_VL: case RVV_OPCODE_VS: - width = RVV_EXRACT_VL_VS_WIDTH(insn_buf); + width = RVV_EXTRACT_VL_VS_WIDTH(insn_buf); if (width == RVV_VL_VS_WIDTH_8 || width == RVV_VL_VS_WIDTH_16 || width == RVV_VL_VS_WIDTH_32 || width == RVV_VL_VS_WIDTH_64) return true; From a601732236834a84e110508e884dc8d368d99d07 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 20 Jun 2025 20:21:58 +0000 Subject: [PATCH 1957/3206] riscv: Strengthen duplicate and inconsistent definition of RV_X() The RV_X() macro is defined in two different ways, which is error-prone. So harmonize its first definition and add another macro, RV_X_MASK(), for the second one.
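As a quick, hedged illustration of how the two spellings relate after this change (the extract_rd() helper below is hypothetical and not part of the diff):

  /* RV_X_MASK() takes a precomputed bit mask; RV_X() takes a field width in bits */
  static inline u32 extract_rd(u32 insn)
  {
          /* rd lives in bits 11:7, i.e. a 5-bit field at offset 7 */
          return RV_X(insn, 7, 5);        /* same as RV_X_MASK(insn, 7, 0x1f) */
  }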
Reviewed-by: Andrew Jones Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250620-dev-alex-insn_duplicate_v5_manual-v5-2-d865dc9ad180@rivosinc.com [pjw@kernel.org: upcase the macro name to conform with previous practice] Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/insn.h | 39 +++++++++++++------------- arch/riscv/kernel/machine_kexec_file.c | 2 +- arch/riscv/kernel/traps_misaligned.c | 2 +- arch/riscv/kvm/vcpu_insn.c | 2 +- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h index 2a589a58b2917d..090658571aa56f 100644 --- a/arch/riscv/include/asm/insn.h +++ b/arch/riscv/include/asm/insn.h @@ -288,43 +288,44 @@ static __always_inline bool riscv_insn_is_c_jalr(u32 code) #define RV_IMM_SIGN(x) (-(((x) >> 31) & 1)) #define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1)) -#define RV_X(X, s, mask) (((X) >> (s)) & (mask)) -#define RVC_X(X, s, mask) RV_X(X, s, mask) +#define RV_X_MASK(X, s, mask) (((X) >> (s)) & (mask)) +#define RV_X(X, s, n) RV_X_MASK(X, s, ((1 << (n)) - 1)) +#define RVC_X(X, s, mask) RV_X_MASK(X, s, mask) #define RV_EXTRACT_RS1_REG(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RVG_RS1_OPOFF, RVG_RS1_MASK)); }) + (RV_X_MASK(x_, RVG_RS1_OPOFF, RVG_RS1_MASK)); }) #define RV_EXTRACT_RD_REG(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RVG_RD_OPOFF, RVG_RD_MASK)); }) + (RV_X_MASK(x_, RVG_RD_OPOFF, RVG_RD_MASK)); }) #define RV_EXTRACT_UTYPE_IMM(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RV_U_IMM_31_12_OPOFF, RV_U_IMM_31_12_MASK)); }) + (RV_X_MASK(x_, RV_U_IMM_31_12_OPOFF, RV_U_IMM_31_12_MASK)); }) #define RV_EXTRACT_JTYPE_IMM(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RV_J_IMM_10_1_OPOFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OFF) | \ - (RV_X(x_, RV_J_IMM_11_OPOFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OFF) | \ - (RV_X(x_, RV_J_IMM_19_12_OPOFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OFF) | \ + (RV_X_MASK(x_, RV_J_IMM_10_1_OPOFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OFF) | \ + (RV_X_MASK(x_, RV_J_IMM_11_OPOFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OFF) | \ + (RV_X_MASK(x_, RV_J_IMM_19_12_OPOFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OFF) | \ (RV_IMM_SIGN(x_) << RV_J_IMM_SIGN_OFF); }) #define RV_EXTRACT_ITYPE_IMM(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RV_I_IMM_11_0_OPOFF, RV_I_IMM_11_0_MASK)) | \ + (RV_X_MASK(x_, RV_I_IMM_11_0_OPOFF, RV_I_IMM_11_0_MASK)) | \ (RV_IMM_SIGN(x_) << RV_I_IMM_SIGN_OFF); }) #define RV_EXTRACT_BTYPE_IMM(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RV_B_IMM_4_1_OPOFF, RV_B_IMM_4_1_MASK) << RV_B_IMM_4_1_OFF) | \ - (RV_X(x_, RV_B_IMM_10_5_OPOFF, RV_B_IMM_10_5_MASK) << RV_B_IMM_10_5_OFF) | \ - (RV_X(x_, RV_B_IMM_11_OPOFF, RV_B_IMM_11_MASK) << RV_B_IMM_11_OFF) | \ + (RV_X_MASK(x_, RV_B_IMM_4_1_OPOFF, RV_B_IMM_4_1_MASK) << RV_B_IMM_4_1_OFF) | \ + (RV_X_MASK(x_, RV_B_IMM_10_5_OPOFF, RV_B_IMM_10_5_MASK) << RV_B_IMM_10_5_OFF) | \ + (RV_X_MASK(x_, RV_B_IMM_11_OPOFF, RV_B_IMM_11_MASK) << RV_B_IMM_11_OFF) | \ (RV_IMM_SIGN(x_) << RV_B_IMM_SIGN_OFF); }) #define RVC_EXTRACT_C2_RS1_REG(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RVC_C2_RS1_OPOFF, RVC_C2_RS1_MASK)); }) + (RV_X_MASK(x_, RVC_C2_RS1_OPOFF, RVC_C2_RS1_MASK)); }) #define RVC_EXTRACT_JTYPE_IMM(x) \ ({typeof(x) x_ = (x); \ @@ -346,10 +347,10 @@ static __always_inline bool riscv_insn_is_c_jalr(u32 code) (RVC_IMM_SIGN(x_) << RVC_B_IMM_SIGN_OFF); }) #define RVG_EXTRACT_SYSTEM_CSR(x) \ - ({typeof(x) x_ = (x); RV_X(x_, RVG_SYSTEM_CSR_OFF, RVG_SYSTEM_CSR_MASK); }) + ({typeof(x) x_ = (x); RV_X_MASK(x_, RVG_SYSTEM_CSR_OFF, RVG_SYSTEM_CSR_MASK); }) #define 
RVFDQ_EXTRACT_FL_FS_WIDTH(x) \ - ({typeof(x) x_ = (x); RV_X(x_, RVFDQ_FL_FS_WIDTH_OFF, \ + ({typeof(x) x_ = (x); RV_X_MASK(x_, RVFDQ_FL_FS_WIDTH_OFF, \ RVFDQ_FL_FS_WIDTH_MASK); }) #define RVV_EXTRACT_VL_VS_WIDTH(x) RVFDQ_EXTRACT_FL_FS_WIDTH(x) @@ -375,10 +376,10 @@ static inline void riscv_insn_insert_jtype_imm(u32 *insn, s32 imm) { /* drop the old IMMs, all jal IMM bits sit at 31:12 */ *insn &= ~GENMASK(31, 12); - *insn |= (RV_X(imm, RV_J_IMM_10_1_OFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OPOFF) | - (RV_X(imm, RV_J_IMM_11_OFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OPOFF) | - (RV_X(imm, RV_J_IMM_19_12_OFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OPOFF) | - (RV_X(imm, RV_J_IMM_SIGN_OFF, 1) << RV_J_IMM_SIGN_OPOFF); + *insn |= (RV_X_MASK(imm, RV_J_IMM_10_1_OFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OPOFF) | + (RV_X_MASK(imm, RV_J_IMM_11_OFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OPOFF) | + (RV_X_MASK(imm, RV_J_IMM_19_12_OFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OPOFF) | + (RV_X_MASK(imm, RV_J_IMM_SIGN_OFF, 1) << RV_J_IMM_SIGN_OPOFF); } /* diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c index b9eb41b0a97519..dd9d92a9651746 100644 --- a/arch/riscv/kernel/machine_kexec_file.c +++ b/arch/riscv/kernel/machine_kexec_file.c @@ -15,6 +15,7 @@ #include #include #include +#include const struct kexec_file_ops * const kexec_file_loaders[] = { &elf_kexec_ops, @@ -109,7 +110,6 @@ static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, } #endif -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) #define RISCV_IMM_BITS 12 #define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS) #define RISCV_CONST_HIGH_PART(x) \ diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index f760e4fcc052d2..18a1fb240e25e1 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -18,6 +18,7 @@ #include #include #include +#include #define INSN_MATCH_LB 0x3 #define INSN_MASK_LB 0x707f @@ -113,7 +114,6 @@ #define SH_RS2 20 #define SH_RS2C 2 -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) #define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ (RV_X(x, 10, 3) << 3) | \ (RV_X(x, 5, 1) << 6)) diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c index 97dec18e69892a..62cb2ab4b63680 100644 --- a/arch/riscv/kvm/vcpu_insn.c +++ b/arch/riscv/kvm/vcpu_insn.c @@ -8,6 +8,7 @@ #include #include +#include #define INSN_OPCODE_MASK 0x007c #define INSN_OPCODE_SHIFT 2 @@ -91,7 +92,6 @@ #define SH_RS2C 2 #define MASK_RX 0x1f -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) #define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ (RV_X(x, 10, 3) << 3) | \ (RV_X(x, 5, 1) << 6)) From a601732236834a84e110508e884dc8d368d99d07 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 20 Jun 2025 20:21:59 +0000 Subject: [PATCH 1958/3206] riscv: Move all duplicate insn parsing macros into asm/insn.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/traps_misaligned.c and kvm/vcpu_insn.c define the same macros to extract information from the instructions. Let's move the definitions into asm/insn.h to avoid this duplication. 
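For orientation, a minimal sketch of the decode idiom these MATCH/MASK pairs support; insn_is_lw() is a hypothetical caller, the macros are the ones being consolidated into asm/insn.h:

  #include <asm/insn.h>

  /* an opcode matches when its fixed bits equal the MATCH pattern */
  static bool insn_is_lw(u32 insn)
  {
          return (insn & INSN_MASK_LW) == INSN_MATCH_LW;
  }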
Reviewed-by: Andrew Jones Signed-off-by: Alexandre Ghiti Reviewed-by: Clément Léger Link: https://lore.kernel.org/r/20250620-dev-alex-insn_duplicate_v5_manual-v5-3-d865dc9ad180@rivosinc.com [pjw@kernel.org: updated to apply] Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/insn.h | 171 ++++++++++++++++++++++++++- arch/riscv/kernel/traps_misaligned.c | 142 ---------------------- arch/riscv/kvm/vcpu_insn.c | 126 -------------------- 3 files changed, 166 insertions(+), 273 deletions(-) diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h index 090658571aa56f..89ae44fb408838 100644 --- a/arch/riscv/include/asm/insn.h +++ b/arch/riscv/include/asm/insn.h @@ -286,11 +286,172 @@ static __always_inline bool riscv_insn_is_c_jalr(u32 code) (code & RVC_INSN_J_RS1_MASK) != 0; } -#define RV_IMM_SIGN(x) (-(((x) >> 31) & 1)) -#define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1)) -#define RV_X_MASK(X, s, mask) (((X) >> (s)) & (mask)) -#define RV_X(X, s, n) RV_X_MASK(X, s, ((1 << (n)) - 1)) -#define RVC_X(X, s, mask) RV_X_MASK(X, s, mask) +#define INSN_MATCH_LB 0x3 +#define INSN_MASK_LB 0x707f +#define INSN_MATCH_LH 0x1003 +#define INSN_MASK_LH 0x707f +#define INSN_MATCH_LW 0x2003 +#define INSN_MASK_LW 0x707f +#define INSN_MATCH_LD 0x3003 +#define INSN_MASK_LD 0x707f +#define INSN_MATCH_LBU 0x4003 +#define INSN_MASK_LBU 0x707f +#define INSN_MATCH_LHU 0x5003 +#define INSN_MASK_LHU 0x707f +#define INSN_MATCH_LWU 0x6003 +#define INSN_MASK_LWU 0x707f +#define INSN_MATCH_SB 0x23 +#define INSN_MASK_SB 0x707f +#define INSN_MATCH_SH 0x1023 +#define INSN_MASK_SH 0x707f +#define INSN_MATCH_SW 0x2023 +#define INSN_MASK_SW 0x707f +#define INSN_MATCH_SD 0x3023 +#define INSN_MASK_SD 0x707f + +#define INSN_MATCH_C_LD 0x6000 +#define INSN_MASK_C_LD 0xe003 +#define INSN_MATCH_C_SD 0xe000 +#define INSN_MASK_C_SD 0xe003 +#define INSN_MATCH_C_LW 0x4000 +#define INSN_MASK_C_LW 0xe003 +#define INSN_MATCH_C_SW 0xc000 +#define INSN_MASK_C_SW 0xe003 +#define INSN_MATCH_C_LDSP 0x6002 +#define INSN_MASK_C_LDSP 0xe003 +#define INSN_MATCH_C_SDSP 0xe002 +#define INSN_MASK_C_SDSP 0xe003 +#define INSN_MATCH_C_LWSP 0x4002 +#define INSN_MASK_C_LWSP 0xe003 +#define INSN_MATCH_C_SWSP 0xc002 +#define INSN_MASK_C_SWSP 0xe003 + +#define INSN_OPCODE_MASK 0x007c +#define INSN_OPCODE_SHIFT 2 +#define INSN_OPCODE_SYSTEM 28 + +#define INSN_MASK_WFI 0xffffffff +#define INSN_MATCH_WFI 0x10500073 + +#define INSN_MASK_WRS 0xffffffff +#define INSN_MATCH_WRS 0x00d00073 + +#define INSN_MATCH_CSRRW 0x1073 +#define INSN_MASK_CSRRW 0x707f +#define INSN_MATCH_CSRRS 0x2073 +#define INSN_MASK_CSRRS 0x707f +#define INSN_MATCH_CSRRC 0x3073 +#define INSN_MASK_CSRRC 0x707f +#define INSN_MATCH_CSRRWI 0x5073 +#define INSN_MASK_CSRRWI 0x707f +#define INSN_MATCH_CSRRSI 0x6073 +#define INSN_MASK_CSRRSI 0x707f +#define INSN_MATCH_CSRRCI 0x7073 +#define INSN_MASK_CSRRCI 0x707f + +#define INSN_MATCH_FLW 0x2007 +#define INSN_MASK_FLW 0x707f +#define INSN_MATCH_FLD 0x3007 +#define INSN_MASK_FLD 0x707f +#define INSN_MATCH_FLQ 0x4007 +#define INSN_MASK_FLQ 0x707f +#define INSN_MATCH_FSW 0x2027 +#define INSN_MASK_FSW 0x707f +#define INSN_MATCH_FSD 0x3027 +#define INSN_MASK_FSD 0x707f +#define INSN_MATCH_FSQ 0x4027 +#define INSN_MASK_FSQ 0x707f + +#define INSN_MATCH_C_FLD 0x2000 +#define INSN_MASK_C_FLD 0xe003 +#define INSN_MATCH_C_FLW 0x6000 +#define INSN_MASK_C_FLW 0xe003 +#define INSN_MATCH_C_FSD 0xa000 +#define INSN_MASK_C_FSD 0xe003 +#define INSN_MATCH_C_FSW 0xe000 +#define INSN_MASK_C_FSW 0xe003 +#define INSN_MATCH_C_FLDSP 0x2002 +#define 
INSN_MASK_C_FLDSP 0xe003 +#define INSN_MATCH_C_FSDSP 0xa002 +#define INSN_MASK_C_FSDSP 0xe003 +#define INSN_MATCH_C_FLWSP 0x6002 +#define INSN_MASK_C_FLWSP 0xe003 +#define INSN_MATCH_C_FSWSP 0xe002 +#define INSN_MASK_C_FSWSP 0xe003 + +#define INSN_MATCH_C_LHU 0x8400 +#define INSN_MASK_C_LHU 0xfc43 +#define INSN_MATCH_C_LH 0x8440 +#define INSN_MASK_C_LH 0xfc43 +#define INSN_MATCH_C_SH 0x8c00 +#define INSN_MASK_C_SH 0xfc43 + +#define INSN_16BIT_MASK 0x3 +#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK) +#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 2 : 4) + +#define SHIFT_RIGHT(x, y) \ + ((y) < 0 ? ((x) << -(y)) : ((x) >> (y))) + +#define REG_MASK \ + ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) + +#define REG_OFFSET(insn, pos) \ + (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) + +#define REG_PTR(insn, pos, regs) \ + ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))) + +#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) +#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) +#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) +#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) +#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) +#define GET_SP(regs) (*REG_PTR(2, 0, regs)) +#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) +#define IMM_I(insn) ((s32)(insn) >> 20) +#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ + (s32)(((insn) >> 7) & 0x1f)) + +#define SH_RD 7 +#define SH_RS1 15 +#define SH_RS2 20 +#define SH_RS2C 2 +#define MASK_RX 0x1f + +#if defined(CONFIG_64BIT) +#define LOG_REGBYTES 3 +#else +#define LOG_REGBYTES 2 +#endif + +#define MASK_FUNCT3 0x7000 + +#define GET_FUNCT3(insn) (((insn) >> 12) & 7) + +#define RV_IMM_SIGN(x) (-(((x) >> 31) & 1)) +#define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1)) +#define RV_X_MASK(X, s, mask) (((X) >> (s)) & (mask)) +#define RV_X(X, s, n) RV_X_MASK(X, s, ((1 << (n)) - 1)) +#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ + (RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 5, 1) << 6)) +#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 5, 2) << 6)) +#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ + (RV_X(x, 12, 1) << 5) | \ + (RV_X(x, 2, 2) << 6)) +#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ + (RV_X(x, 12, 1) << 5) | \ + (RV_X(x, 2, 3) << 6)) +#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ + (RV_X(x, 7, 2) << 6)) +#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 7, 3) << 6)) +#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) +#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) +#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) +#define RVC_X(X, s, mask) RV_X_MASK(X, s, mask) #define RV_EXTRACT_RS1_REG(x) \ ({typeof(x) x_ = (x); \ diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 18a1fb240e25e1..2a27d3ff4ac66d 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -20,148 +20,6 @@ #include #include -#define INSN_MATCH_LB 0x3 -#define INSN_MASK_LB 0x707f -#define INSN_MATCH_LH 0x1003 -#define INSN_MASK_LH 0x707f -#define INSN_MATCH_LW 0x2003 -#define INSN_MASK_LW 0x707f -#define INSN_MATCH_LD 0x3003 -#define INSN_MASK_LD 0x707f -#define INSN_MATCH_LBU 0x4003 -#define INSN_MASK_LBU 0x707f -#define INSN_MATCH_LHU 0x5003 -#define INSN_MASK_LHU 0x707f -#define INSN_MATCH_LWU 0x6003 -#define INSN_MASK_LWU 0x707f -#define INSN_MATCH_SB 0x23 -#define INSN_MASK_SB 0x707f -#define INSN_MATCH_SH 0x1023 -#define INSN_MASK_SH 0x707f -#define INSN_MATCH_SW 0x2023 
-#define INSN_MASK_SW 0x707f -#define INSN_MATCH_SD 0x3023 -#define INSN_MASK_SD 0x707f - -#define INSN_MATCH_FLW 0x2007 -#define INSN_MASK_FLW 0x707f -#define INSN_MATCH_FLD 0x3007 -#define INSN_MASK_FLD 0x707f -#define INSN_MATCH_FLQ 0x4007 -#define INSN_MASK_FLQ 0x707f -#define INSN_MATCH_FSW 0x2027 -#define INSN_MASK_FSW 0x707f -#define INSN_MATCH_FSD 0x3027 -#define INSN_MASK_FSD 0x707f -#define INSN_MATCH_FSQ 0x4027 -#define INSN_MASK_FSQ 0x707f - -#define INSN_MATCH_C_LD 0x6000 -#define INSN_MASK_C_LD 0xe003 -#define INSN_MATCH_C_SD 0xe000 -#define INSN_MASK_C_SD 0xe003 -#define INSN_MATCH_C_LW 0x4000 -#define INSN_MASK_C_LW 0xe003 -#define INSN_MATCH_C_SW 0xc000 -#define INSN_MASK_C_SW 0xe003 -#define INSN_MATCH_C_LDSP 0x6002 -#define INSN_MASK_C_LDSP 0xe003 -#define INSN_MATCH_C_SDSP 0xe002 -#define INSN_MASK_C_SDSP 0xe003 -#define INSN_MATCH_C_LWSP 0x4002 -#define INSN_MASK_C_LWSP 0xe003 -#define INSN_MATCH_C_SWSP 0xc002 -#define INSN_MASK_C_SWSP 0xe003 - -#define INSN_MATCH_C_FLD 0x2000 -#define INSN_MASK_C_FLD 0xe003 -#define INSN_MATCH_C_FLW 0x6000 -#define INSN_MASK_C_FLW 0xe003 -#define INSN_MATCH_C_FSD 0xa000 -#define INSN_MASK_C_FSD 0xe003 -#define INSN_MATCH_C_FSW 0xe000 -#define INSN_MASK_C_FSW 0xe003 -#define INSN_MATCH_C_FLDSP 0x2002 -#define INSN_MASK_C_FLDSP 0xe003 -#define INSN_MATCH_C_FSDSP 0xa002 -#define INSN_MASK_C_FSDSP 0xe003 -#define INSN_MATCH_C_FLWSP 0x6002 -#define INSN_MASK_C_FLWSP 0xe003 -#define INSN_MATCH_C_FSWSP 0xe002 -#define INSN_MASK_C_FSWSP 0xe003 - -#define INSN_MATCH_C_LHU 0x8400 -#define INSN_MASK_C_LHU 0xfc43 -#define INSN_MATCH_C_LH 0x8440 -#define INSN_MASK_C_LH 0xfc43 -#define INSN_MATCH_C_SH 0x8c00 -#define INSN_MASK_C_SH 0xfc43 - -#define INSN_LEN(insn) ((((insn) & 0x3) < 0x3) ? 2 : 4) - -#if defined(CONFIG_64BIT) -#define LOG_REGBYTES 3 -#define XLEN 64 -#else -#define LOG_REGBYTES 2 -#define XLEN 32 -#endif -#define REGBYTES (1 << LOG_REGBYTES) -#define XLEN_MINUS_16 ((XLEN) - 16) - -#define SH_RD 7 -#define SH_RS1 15 -#define SH_RS2 20 -#define SH_RS2C 2 - -#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ - (RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 1) << 6)) -#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 2) << 6)) -#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 2) << 6)) -#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 3) << 6)) -#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ - (RV_X(x, 7, 2) << 6)) -#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 7, 3) << 6)) -#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) -#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) -#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) - -#define SHIFT_RIGHT(x, y) \ - ((y) < 0 ? 
((x) << -(y)) : ((x) >> (y))) - -#define REG_MASK \ - ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) - -#define REG_OFFSET(insn, pos) \ - (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) - -#define REG_PTR(insn, pos, regs) \ - (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos)) - -#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) -#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) -#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) -#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) -#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) -#define GET_SP(regs) (*REG_PTR(2, 0, regs)) -#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) -#define IMM_I(insn) ((s32)(insn) >> 20) -#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ - (s32)(((insn) >> 7) & 0x1f)) -#define MASK_FUNCT3 0x7000 - -#define GET_PRECISION(insn) (((insn) >> 25) & 3) -#define GET_RM(insn) (((insn) >> 12) & 7) -#define PRECISION_S 0 -#define PRECISION_D 1 - #ifdef CONFIG_FPU #define FP_GET_RD(insn) (insn >> 7 & 0x1F) diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c index 62cb2ab4b63680..de1f96ea62251f 100644 --- a/arch/riscv/kvm/vcpu_insn.c +++ b/arch/riscv/kvm/vcpu_insn.c @@ -10,132 +10,6 @@ #include #include -#define INSN_OPCODE_MASK 0x007c -#define INSN_OPCODE_SHIFT 2 -#define INSN_OPCODE_SYSTEM 28 - -#define INSN_MASK_WFI 0xffffffff -#define INSN_MATCH_WFI 0x10500073 - -#define INSN_MASK_WRS 0xffffffff -#define INSN_MATCH_WRS 0x00d00073 - -#define INSN_MATCH_CSRRW 0x1073 -#define INSN_MASK_CSRRW 0x707f -#define INSN_MATCH_CSRRS 0x2073 -#define INSN_MASK_CSRRS 0x707f -#define INSN_MATCH_CSRRC 0x3073 -#define INSN_MASK_CSRRC 0x707f -#define INSN_MATCH_CSRRWI 0x5073 -#define INSN_MASK_CSRRWI 0x707f -#define INSN_MATCH_CSRRSI 0x6073 -#define INSN_MASK_CSRRSI 0x707f -#define INSN_MATCH_CSRRCI 0x7073 -#define INSN_MASK_CSRRCI 0x707f - -#define INSN_MATCH_LB 0x3 -#define INSN_MASK_LB 0x707f -#define INSN_MATCH_LH 0x1003 -#define INSN_MASK_LH 0x707f -#define INSN_MATCH_LW 0x2003 -#define INSN_MASK_LW 0x707f -#define INSN_MATCH_LD 0x3003 -#define INSN_MASK_LD 0x707f -#define INSN_MATCH_LBU 0x4003 -#define INSN_MASK_LBU 0x707f -#define INSN_MATCH_LHU 0x5003 -#define INSN_MASK_LHU 0x707f -#define INSN_MATCH_LWU 0x6003 -#define INSN_MASK_LWU 0x707f -#define INSN_MATCH_SB 0x23 -#define INSN_MASK_SB 0x707f -#define INSN_MATCH_SH 0x1023 -#define INSN_MASK_SH 0x707f -#define INSN_MATCH_SW 0x2023 -#define INSN_MASK_SW 0x707f -#define INSN_MATCH_SD 0x3023 -#define INSN_MASK_SD 0x707f - -#define INSN_MATCH_C_LD 0x6000 -#define INSN_MASK_C_LD 0xe003 -#define INSN_MATCH_C_SD 0xe000 -#define INSN_MASK_C_SD 0xe003 -#define INSN_MATCH_C_LW 0x4000 -#define INSN_MASK_C_LW 0xe003 -#define INSN_MATCH_C_SW 0xc000 -#define INSN_MASK_C_SW 0xe003 -#define INSN_MATCH_C_LDSP 0x6002 -#define INSN_MASK_C_LDSP 0xe003 -#define INSN_MATCH_C_SDSP 0xe002 -#define INSN_MASK_C_SDSP 0xe003 -#define INSN_MATCH_C_LWSP 0x4002 -#define INSN_MASK_C_LWSP 0xe003 -#define INSN_MATCH_C_SWSP 0xc002 -#define INSN_MASK_C_SWSP 0xe003 - -#define INSN_16BIT_MASK 0x3 - -#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK) - -#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 
2 : 4) - -#ifdef CONFIG_64BIT -#define LOG_REGBYTES 3 -#else -#define LOG_REGBYTES 2 -#endif -#define REGBYTES (1 << LOG_REGBYTES) - -#define SH_RD 7 -#define SH_RS1 15 -#define SH_RS2 20 -#define SH_RS2C 2 -#define MASK_RX 0x1f - -#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ - (RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 1) << 6)) -#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 2) << 6)) -#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 2) << 6)) -#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 3) << 6)) -#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ - (RV_X(x, 7, 2) << 6)) -#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 7, 3) << 6)) -#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) -#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) -#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) - -#define SHIFT_RIGHT(x, y) \ - ((y) < 0 ? ((x) << -(y)) : ((x) >> (y))) - -#define REG_MASK \ - ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) - -#define REG_OFFSET(insn, pos) \ - (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) - -#define REG_PTR(insn, pos, regs) \ - ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))) - -#define GET_FUNCT3(insn) (((insn) >> 12) & 7) - -#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) -#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) -#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) -#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) -#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) -#define GET_SP(regs) (*REG_PTR(2, 0, regs)) -#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) -#define IMM_I(insn) ((s32)(insn) >> 20) -#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ - (s32)(((insn) >> 7) & 0x1f)) - struct insn_func { unsigned long mask; unsigned long match; From f8a03516a530cc36bc9015c84ba7540ee3e8d7bd Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Fri, 18 Jul 2025 15:27:07 +0800 Subject: [PATCH 1959/3206] raid6: riscv: Clean up unused header file inclusion These two C files don't reference things defined in simd.h or types.h, so remove these redundant #inclusions. Fixes: 6093faaf9593 ("raid6: Add RISC-V SIMD syndrome and recovery calculations") Reviewed-by: Alexandre Ghiti Signed-off-by: Chunyan Zhang Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250718072711.3865118-2-zhangchunyan@iscas.ac.cn Signed-off-by: Paul Walmsley --- lib/raid6/recov_rvv.c | 2 -- lib/raid6/rvv.c | 3 --- 2 files changed, 5 deletions(-) diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c index 5d54c4b437df78..5f779719c3d34c 100644 --- a/lib/raid6/recov_rvv.c +++ b/lib/raid6/recov_rvv.c @@ -4,9 +4,7 @@ * Author: Chunyan Zhang */ -#include #include -#include #include static int rvv_has_vector(void) diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c index 7d82efa5b14f9e..b193ea176d5d33 100644 --- a/lib/raid6/rvv.c +++ b/lib/raid6/rvv.c @@ -9,11 +9,8 @@ * Copyright 2002-2004 H. Peter Anvin */ -#include #include -#include #include -#include #include "rvv.h" #define NSIZE (riscv_v_vsize / 32) /* NSIZE = vlenb */ From 2dfb75cd5695fa9db2ad90d1339330eda7a0239d Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Fri, 18 Jul 2025 15:27:08 +0800 Subject: [PATCH 1960/3206] raid6: riscv: replace one load with a move to speed up the calculation Since wp$$ == wq$$, there is no need to load the same data twice; use a move instruction to replace one of the loads and let the program run faster.
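In outline, the substitution looks like the sketch below (a reduced form of the real code; the wp pointer and the vector configuration are assumed to be set up already):

  /* before: the same memory is loaded twice */
  asm volatile (".option push\n"
                ".option arch,+v\n"
                "vle8.v v0, (%[wp0])\n"
                "vle8.v v1, (%[wp0])\n"
                ".option pop\n" : : [wp0]"r"(wp));

  /* after: load once, then duplicate the value in-register */
  asm volatile (".option push\n"
                ".option arch,+v\n"
                "vle8.v v0, (%[wp0])\n"
                "vmv.v.v v1, v0\n"
                ".option pop\n" : : [wp0]"r"(wp));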
Reviewed-by: Alexandre Ghiti Signed-off-by: Chunyan Zhang Link: https://lore.kernel.org/r/20250718072711.3865118-3-zhangchunyan@iscas.ac.cn Signed-off-by: Paul Walmsley --- lib/raid6/rvv.c | 60 ++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c index b193ea176d5d33..89da5fc247aa94 100644 --- a/lib/raid6/rvv.c +++ b/lib/raid6/rvv.c @@ -44,7 +44,7 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]) @@ -117,7 +117,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]) @@ -218,9 +218,9 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -310,9 +310,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -440,13 +440,13 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -566,13 +566,13 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -754,21 +754,21 @@ static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" "vle8.v v16, (%[wp4])\n" - "vle8.v v17, (%[wp4])\n" + "vmv.v.v v17, v16\n" "vle8.v v20, (%[wp5])\n" - "vle8.v v21, (%[wp5])\n" + "vmv.v.v v21, v20\n" "vle8.v v24, (%[wp6])\n" - "vle8.v v25, (%[wp6])\n" + "vmv.v.v v25, v24\n" "vle8.v v28, (%[wp7])\n" - "vle8.v v29, (%[wp7])\n" + "vmv.v.v v29, v28\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -948,21 +948,21 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop, asm 
volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" "vle8.v v16, (%[wp4])\n" - "vle8.v v17, (%[wp4])\n" + "vmv.v.v v17, v16\n" "vle8.v v20, (%[wp5])\n" - "vle8.v v21, (%[wp5])\n" + "vmv.v.v v21, v20\n" "vle8.v v24, (%[wp6])\n" - "vle8.v v25, (%[wp6])\n" + "vmv.v.v v25, v24\n" "vle8.v v28, (%[wp7])\n" - "vle8.v v29, (%[wp7])\n" + "vmv.v.v v29, v28\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), From 109f8b51543d106aee50dfe911f439e43fb30c7a Mon Sep 17 00:00:00 2001 From: "Remy D. Farley" Date: Sat, 13 Sep 2025 14:05:28 +0000 Subject: [PATCH 1961/3206] doc/netlink: Fix typos in operation attributes I'm trying to generate Rust bindings for netlink using the yaml spec. It looks like there's a typo in conntrack spec: attribute set conntrack-attrs defines attributes "counters-{orig,reply}" (plural), while get operation references "counter-{orig,reply}" (singular). The latter should be fixed, as it denotes multiple counters (packet and byte). The corresonding C define is CTA_COUNTERS_ORIG. Also, dump request references "nfgen-family" attribute, which neither exists in conntrack-attrs attrset nor ctattr_type enum. There's member of nfgenmsg struct with the same name, which is where family value is actually taken from. > static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, > struct sk_buff *skb, > const struct nlmsghdr *nlh, > const struct nlattr * const cda[], > struct netlink_ext_ack *extack) > { > int err; > struct nfgenmsg *nfmsg = nlmsg_data(nlh); > u_int8_t u3 = nfmsg->nfgen_family; ^^^^^^^^^^^^ Signed-off-by: Remy D. Farley Fixes: 23fc9311a526 ("netlink: specs: add conntrack dump and stats dump support") Link: https://patch.msgid.link/20250913140515.1132886-1-one-d-wide@protonmail.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/conntrack.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/netlink/specs/conntrack.yaml b/Documentation/netlink/specs/conntrack.yaml index c6832633ab7bf9..591e22a2ee4382 100644 --- a/Documentation/netlink/specs/conntrack.yaml +++ b/Documentation/netlink/specs/conntrack.yaml @@ -575,8 +575,8 @@ operations: - nat-dst - timeout - mark - - counter-orig - - counter-reply + - counters-orig + - counters-reply - use - id - nat-dst @@ -591,7 +591,6 @@ operations: request: value: 0x101 attributes: - - nfgen-family - mark - filter - status @@ -608,8 +607,8 @@ operations: - nat-dst - timeout - mark - - counter-orig - - counter-reply + - counters-orig + - counters-reply - use - id - nat-dst From 94ff1ed3030e88cfe4e34c1d47c5832995c953c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Sep 2025 16:42:55 -0700 Subject: [PATCH 1962/3206] MAINTAINERS: make the DPLL entry cover drivers DPLL maintainers should probably be CCed on driver patches, too. Remove the *, which makes the pattern only match files directly under drivers/dpll but not its sub-directories. 
Acked-by: Jiri Pirko Acked-by: Vadim Fedorenko Acked-by: Arkadiusz Kubalewski Link: https://patch.msgid.link/20250915234255.1306612-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea95802d87e14b..086b4a7bd04f42 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7430,7 +7430,7 @@ S: Supported F: Documentation/devicetree/bindings/dpll/dpll-device.yaml F: Documentation/devicetree/bindings/dpll/dpll-pin.yaml F: Documentation/driver-api/dpll.rst -F: drivers/dpll/* +F: drivers/dpll/ F: include/linux/dpll.h F: include/uapi/linux/dpll.h From b460b317b21d5106f3ebd34bd51b851ad355a70a Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Wed, 17 Sep 2025 09:03:45 +0900 Subject: [PATCH 1963/3206] firewire: core: schedule bm_work item outside of spin lock Before (re)building the topology tree, fw_core_handle_bus_reset() schedules a work item while holding the fw_card spin lock. The work item invokes bm_work(), which acquires the spin lock first and can then be stalled until the tree building finishes. This is inconvenient. This commit moves the scheduling of the work item to after the spin lock is released. Link: https://lore.kernel.org/r/20250917000347.52369-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-topology.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 8fa0772ee723f4..2f73bcd5696f2b 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -479,7 +479,6 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, card->reset_jiffies = get_jiffies_64(); card->bm_node_id = 0xffff; card->bm_abdicate = bm_abdicate; - fw_schedule_bm_work(card, 0); local_node = build_tree(card, self_ids, self_id_count, generation); @@ -496,6 +495,8 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, } } + fw_schedule_bm_work(card, 0); + // Just used by transaction layer. scoped_guard(spinlock, &card->topology_map.lock) { update_topology_map(card->topology_map.buffer, sizeof(card->topology_map.buffer), From abe7159125702c734e851bc0c52b51cd446298a5 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Wed, 17 Sep 2025 09:03:46 +0900 Subject: [PATCH 1964/3206] firewire: core: disable bus management work temporarily while updating topology When processing the selfID sequence, the bus topology tree is (re)built and some members of fw_card are determined. Once determined, the members are valid for the rest of the bus generation. The above operations are in the critical section of the fw_card spin lock. Before building the bus topology, a work item is scheduled for bus manager work. The work item invokes the bm_work() function, which tries to acquire the spin lock and can then be stalled until the bus topology building finishes. The bus manager should run only once the members of fw_card are determined. This commit suppresses the above situation by disabling the work item while processing the selfID sequence.
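In outline, and assuming the delayed-work helpers behave as their names suggest, the change brackets the topology update like this:

  disable_delayed_work_sync(&card->bm_work); /* wait out any running bm_work() */
  /* ... rebuild the topology tree, settle fw_card members under card->lock ... */
  enable_delayed_work(&card->bm_work);
  fw_schedule_bm_work(card, 0); /* bus manager now runs with settled state */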
Link: https://lore.kernel.org/r/20250917000347.52369-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-topology.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 2f73bcd5696f2b..90b988035a2afa 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -460,8 +460,14 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, { struct fw_node *local_node; + might_sleep(); + trace_bus_reset_handle(card->index, generation, node_id, bm_abdicate, self_ids, self_id_count); + // Disable bus management work during updating the cache of bus topology, since the work + // accesses to some members of fw_card. + disable_delayed_work_sync(&card->bm_work); + scoped_guard(spinlock, &card->lock) { // If the selfID buffer is not the immediate successor of the // previously processed one, we cannot reliably compare the @@ -495,6 +501,8 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, } } + enable_delayed_work(&card->bm_work); + fw_schedule_bm_work(card, 0); // Just used by transaction layer. From 582310376d6e9a8d261b682178713cdc4b251af6 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Wed, 17 Sep 2025 09:03:47 +0900 Subject: [PATCH 1965/3206] firewire: core: shrink critical section of fw_card spinlock in bm_work Now fw_core_handle_bus_reset() and bm_work() are serialized, so some members of fw_card are free to access in bm_work(). This commit shrinks the critical section of the fw_card spinlock in bm_work(). Link: https://lore.kernel.org/r/20250917000347.52369-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 96495392a1f6d3..4fcd5ce4b2cef2 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -293,12 +293,8 @@ static void bm_work(struct work_struct *work) int expected_gap_count, generation, grace; bool do_reset = false; - spin_lock_irq(&card->lock); - - if (card->local_node == NULL) { - spin_unlock_irq(&card->lock); + if (card->local_node == NULL) return; - } generation = card->generation; @@ -354,8 +350,6 @@ static void bm_work(struct work_struct *work) goto pick_me; } - spin_unlock_irq(&card->lock); - rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, irm_id, generation, SCODE_100, CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, data, 8); if (rcode == RCODE_GENERATION) return; - spin_lock_irq(&card->lock); - if (rcode == RCODE_COMPLETE) { int bm_id = be32_to_cpu(data[0]); if (generation == card->generation) { - if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) - card->bm_node_id = 0xffc0 & bm_id; - else - card->bm_node_id = local_id; + // Used by cdev layer for "struct fw_cdev_event_bus_reset". + scoped_guard(spinlock, &card->lock) { + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) + card->bm_node_id = 0xffc0 & bm_id; + else + card->bm_node_id = local_id; + } } if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) { - spin_unlock_irq(&card->lock); - // Somebody else is BM. Only act as IRM. if (local_id == irm_id) allocate_broadcast_channel(card, generation); @@ -393,7 +386,6 @@ static void bm_work(struct work_struct *work) * some local problem. Let's try again later and hope * that the problem has gone away by then.
*/ - spin_unlock_irq(&card->lock); fw_schedule_bm_work(card, msecs_to_jiffies(125)); return; } @@ -415,7 +407,6 @@ static void bm_work(struct work_struct *work) * We weren't BM in the last generation, and the last * bus reset is less than 125ms ago. Reschedule this job. */ - spin_unlock_irq(&card->lock); fw_schedule_bm_work(card, msecs_to_jiffies(125)); return; } @@ -458,7 +449,6 @@ static void bm_work(struct work_struct *work) if (!root_device_is_running) { // If we haven't probed this device yet, bail out now // and let's try again once that's done. - spin_unlock_irq(&card->lock); return; } else if (root_device->cmc) { // We will send out a force root packet for this @@ -495,8 +485,6 @@ static void bm_work(struct work_struct *work) if (do_reset) { int card_gap_count = card->gap_count; - spin_unlock_irq(&card->lock); - fw_notice(card, "phy config: new root=%x, gap_count=%d\n", new_root_id, expected_gap_count); fw_send_phy_config(card, new_root_id, generation, expected_gap_count); @@ -517,8 +505,6 @@ static void bm_work(struct work_struct *work) } else { struct fw_device *root_device = fw_node_get_device(root_node); - spin_unlock_irq(&card->lock); - if (root_device && root_device->cmc) { // Make sure that the cycle master sends cycle start packets. __be32 data = cpu_to_be32(CSR_STATE_BIT_CMSTR); From 6b4be64fd9fec16418f365c2d8e47a7566e9eba5 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 15 Sep 2025 15:24:32 +0300 Subject: [PATCH 1966/3206] net/mlx5e: Harden uplink netdev access against device unbind The function mlx5_uplink_netdev_get() gets the uplink netdevice pointer from mdev->mlx5e_res.uplink_netdev. However, the netdevice can be removed and its pointer cleared when unbound from the mlx5_core.eth driver. This results in a NULL pointer, causing a kernel panic. BUG: unable to handle page fault for address: 0000000000001300 at RIP: 0010:mlx5e_vport_rep_load+0x22a/0x270 [mlx5_core] Call Trace: mlx5_esw_offloads_rep_load+0x68/0xe0 [mlx5_core] esw_offloads_enable+0x593/0x910 [mlx5_core] mlx5_eswitch_enable_locked+0x341/0x420 [mlx5_core] mlx5_devlink_eswitch_mode_set+0x17e/0x3a0 [mlx5_core] devlink_nl_eswitch_set_doit+0x60/0xd0 genl_family_rcv_msg_doit+0xe0/0x130 genl_rcv_msg+0x183/0x290 netlink_rcv_skb+0x4b/0xf0 genl_rcv+0x24/0x40 netlink_unicast+0x255/0x380 netlink_sendmsg+0x1f3/0x420 __sock_sendmsg+0x38/0x60 __sys_sendto+0x119/0x180 do_syscall_64+0x53/0x1d0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Ensure the pointer is valid before use by checking it for NULL. If it is valid, immediately call netdev_hold() to take a reference, preventing the netdevice from being freed while it is in use.
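A sketch of the resulting caller pattern (error handling trimmed; the surrounding function is hypothetical):

  struct net_device *netdev = mlx5_uplink_netdev_get(mdev);

  if (!netdev)
          return 0;       /* uplink already unbound, nothing to do */
  /* ... netdev is safe to use here; the held reference pins it ... */
  mlx5_uplink_netdev_put(mdev, netdev);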
Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Reviewed-by: Jiri Pirko Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757939074-617281-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_rep.c | 27 +++++++++++++++---- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 1 + .../ethernet/mellanox/mlx5/core/lib/mlx5.h | 15 ++++++++++- include/linux/mlx5/driver.h | 1 + 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 63a7a788fb0db5..cd0242eb008c29 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1506,12 +1506,21 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = { static int mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { - struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct net_device *netdev; + struct mlx5e_priv *priv; + int err; + + netdev = mlx5_uplink_netdev_get(dev); + if (!netdev) + return 0; + priv = netdev_priv(netdev); rpriv->netdev = priv->netdev; - return mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, - rpriv); + err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, + rpriv); + mlx5_uplink_netdev_put(dev, netdev); + return err; } static void @@ -1638,8 +1647,16 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) { struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); struct net_device *netdev = rpriv->netdev; - struct mlx5e_priv *priv = netdev_priv(netdev); - void *ppriv = priv->ppriv; + struct mlx5e_priv *priv; + void *ppriv; + + if (!netdev) { + ppriv = rpriv; + goto free_ppriv; + } + + priv = netdev_priv(netdev); + ppriv = priv->ppriv; if (rep->vport == MLX5_VPORT_UPLINK) { mlx5e_vport_uplink_rep_unload(rpriv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 8b497765018381..5f2d6c35f1ad59 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1515,6 +1515,7 @@ static u32 mlx5_esw_qos_lag_link_speed_get_locked(struct mlx5_core_dev *mdev) speed = lksettings.base.speed; out: + mlx5_uplink_netdev_put(mdev, slave); return speed; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index b111ccd03b0267..74ea5da58b7ea4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -47,7 +47,20 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data); static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) { - return mdev->mlx5e_res.uplink_netdev; + struct mlx5e_resources *mlx5e_res = &mdev->mlx5e_res; + struct net_device *netdev; + + mutex_lock(&mlx5e_res->uplink_netdev_lock); + netdev = mlx5e_res->uplink_netdev; + netdev_hold(netdev, &mlx5e_res->tracker, GFP_KERNEL); + mutex_unlock(&mlx5e_res->uplink_netdev_lock); + return netdev; +} + +static inline void mlx5_uplink_netdev_put(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + netdev_put(netdev, &mdev->mlx5e_res.tracker); } struct mlx5_sd; diff --git a/include/linux/mlx5/driver.h 
b/include/linux/mlx5/driver.h index 8c5fbfb8574931..10fe492e1fedcd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -663,6 +663,7 @@ struct mlx5e_resources { bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; + netdevice_tracker tracker; struct mutex uplink_netdev_lock; struct mlx5_crypto_dek_priv *dek_priv; }; From 7601a0a46216f4ba05adff2de75923b4e8e585c2 Mon Sep 17 00:00:00 2001 From: Lama Kayal Date: Mon, 15 Sep 2025 15:24:34 +0300 Subject: [PATCH 1967/3206] net/mlx5e: Add a miss level for ipsec crypto offload The cited commit adds a miss table for switchdev mode. But it uses the same level as the policy table, so the following error is hit when running this command: # ip xfrm state add src 192.168.1.22 dst 192.168.1.21 proto \ esp spi 1001 reqid 10001 aead 'rfc4106(gcm(aes))' \ 0x3a189a7f9374955d3817886c8587f1da3df387ff 128 \ mode tunnel offload dev enp8s0f0 dir in Error: mlx5_core: Device failed to offload this state. The dmesg error is: mlx5_core 0000:03:00.0: ipsec_miss_create:578:(pid 311797): fail to create IPsec miss_rule err=-22 Fix it by adding a new miss level to avoid the error. Fixes: 7d9e292ecd67 ("net/mlx5e: Move IPSec policy check after decryption") Signed-off-by: Jianbo Liu Signed-off-by: Chris Mi Signed-off-by: Lama Kayal Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757939074-617281-4-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/fs.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 4 ++-- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index 9560fcba643f50..ac65e319148029 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -92,6 +92,7 @@ enum { MLX5E_ACCEL_FS_ESP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL, MLX5E_ACCEL_FS_POL_FT_LEVEL, + MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL, MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL, #endif }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h index ffcd0cdeb77544..23703f28386ad9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -185,6 +185,7 @@ struct mlx5e_ipsec_rx_create_attr { u32 family; int prio; int pol_level; + int pol_miss_level; int sa_level; int status_level; enum mlx5_flow_namespace_type chains_ns; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 98b6a3a623f995..65dc3529283b69 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -747,6 +747,7 @@ static void ipsec_rx_create_attr_set(struct mlx5e_ipsec *ipsec, attr->family = family; attr->prio = MLX5E_NIC_PRIO; attr->pol_level = MLX5E_ACCEL_FS_POL_FT_LEVEL; + attr->pol_miss_level = MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL; attr->sa_level = MLX5E_ACCEL_FS_ESP_FT_LEVEL; attr->status_level = MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL; attr->chains_ns = MLX5_FLOW_NAMESPACE_KERNEL; @@ -833,7 +834,7 @@ static int ipsec_rx_chains_create_miss(struct mlx5e_ipsec *ipsec, ft_attr.max_fte = 1; ft_attr.autogroup.max_num_groups = 1; -
ft_attr.level = attr->pol_level; + ft_attr.level = attr->pol_miss_level; ft_attr.prio = attr->prio; ft = mlx5_create_auto_grouped_flow_table(attr->ns, &ft_attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index cb165085a4c10c..db552c012b4f4f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -114,9 +114,9 @@ #define ETHTOOL_NUM_PRIOS 11 #define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS) /* Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy, - * {IPsec RoCE MPV,Alias table},IPsec RoCE policy + * IPsec policy miss, {IPsec RoCE MPV,Alias table},IPsec RoCE policy */ -#define KERNEL_NIC_PRIO_NUM_LEVELS 10 +#define KERNEL_NIC_PRIO_NUM_LEVELS 11 #define KERNEL_NIC_NUM_PRIOS 1 /* One more level for tc, and one more for promisc */ #define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 2) From 5fe59140276d94f1390d062f5643f852270f8d95 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:17:53 +0200 Subject: [PATCH 1968/3206] riscv: kprobes: Move branch_rs2_idx to insn.h Similar to other instruction-processing macros/functions, branch_rs2_idx should be in insn.h. Move it into insn.h as RV_EXTRACT_RS2_REG. This new name matches the style in insn.h. Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/107d4a6c1818bf169be2407b273a0483e6d55bbb.1747215274.git.namcao@linutronix.de/ [pjw@kernel.org: updated to use RV_X_MASK and to apply] Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/insn.h | 5 +++++ arch/riscv/kernel/probes/simulate-insn.c | 5 +---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h index 89ae44fb408838..a7ab9bc24a3ae2 100644 --- a/arch/riscv/include/asm/insn.h +++ b/arch/riscv/include/asm/insn.h @@ -64,6 +64,7 @@ #define RVG_RS2_OPOFF 20 #define RVG_RD_OPOFF 7 #define RVG_RS1_MASK GENMASK(4, 0) +#define RVG_RS2_MASK GENMASK(4, 0) #define RVG_RD_MASK GENMASK(4, 0) /* The bit field of immediate value in RVC J instruction */ @@ -457,6 +458,10 @@ static __always_inline bool riscv_insn_is_c_jalr(u32 code) ({typeof(x) x_ = (x); \ (RV_X_MASK(x_, RVG_RS1_OPOFF, RVG_RS1_MASK)); }) +#define RV_EXTRACT_RS2_REG(x) \ + ({typeof(x) x_ = (x); \ + (RV_X_MASK(x_, RVG_RS2_OPOFF, RVG_RS2_MASK)); }) + #define RV_EXTRACT_RD_REG(x) \ ({typeof(x) x_ = (x); \ (RV_X_MASK(x_, RVG_RD_OPOFF, RVG_RD_MASK)); }) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 6c166029079c42..77be381bb8b472 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -121,9 +121,6 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re #define branch_rs1_idx(opcode) \ (((opcode) >> 15) & 0x1f) -#define branch_rs2_idx(opcode) \ - (((opcode) >> 20) & 0x1f) - #define branch_funct3(opcode) \ (((opcode) >> 12) & 0x7) @@ -157,7 +154,7 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r unsigned long rs2_val; if (!rv_insn_reg_get_val(regs, branch_rs1_idx(opcode), &rs1_val) || - !rv_insn_reg_get_val(regs, branch_rs2_idx(opcode), &rs2_val)) + !rv_insn_reg_get_val(regs, RV_EXTRACT_RS2_REG(opcode), &rs2_val)) return false; offset_tmp = branch_offset(opcode); From 518c550eebbc5502177c3b8c4a28286120c518db Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:17:54 +0200 
Subject: [PATCH 1969/3206] riscv: kprobes: Move branch_funct3 to insn.h Similar to other instruction-processing macros/functions, branch_funct3 should be in insn.h. Move it into insn.h as RV_EXTRACT_FUNCT3. This new name matches the style in insn.h. Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/200c29a26338f19d09963fa02562787e8cfa06f2.1747215274.git.namcao@linutronix.de/ [pjw@kernel.org: updated to use RV_X_MASK and to apply] Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/insn.h | 5 +++++ arch/riscv/kernel/probes/simulate-insn.c | 5 +---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h index a7ab9bc24a3ae2..c3005573e8c999 100644 --- a/arch/riscv/include/asm/insn.h +++ b/arch/riscv/include/asm/insn.h @@ -454,6 +454,11 @@ static __always_inline bool riscv_insn_is_c_jalr(u32 code) #define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) #define RVC_X(X, s, mask) RV_X_MASK(X, s, mask) +#define RV_EXTRACT_FUNCT3(x) \ + ({typeof(x) x_ = (x); \ + (RV_X_MASK(x_, RV_INSN_FUNCT3_OPOFF, \ + RV_INSN_FUNCT3_MASK >> RV_INSN_FUNCT3_OPOFF)); }) + #define RV_EXTRACT_RS1_REG(x) \ ({typeof(x) x_ = (x); \ (RV_X_MASK(x_, RVG_RS1_OPOFF, RVG_RS1_MASK)); }) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 77be381bb8b472..d5f74fadbc3a04 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -121,9 +121,6 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re #define branch_rs1_idx(opcode) \ (((opcode) >> 15) & 0x1f) -#define branch_funct3(opcode) \ - (((opcode) >> 12) & 0x7) - #define branch_imm(opcode) \ (((((opcode) >> 8) & 0xf ) << 1) | \ ((((opcode) >> 25) & 0x3f) << 5) | \ @@ -158,7 +155,7 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r return false; offset_tmp = branch_offset(opcode); - switch (branch_funct3(opcode)) { + switch (RV_EXTRACT_FUNCT3(opcode)) { case RVG_FUNCT3_BEQ: offset = (rs1_val == rs2_val) ? offset_tmp : 4; break; From 8f1ea7f04edd918b0e0fd8dc1318b22049a6c716 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:17:55 +0200 Subject: [PATCH 1970/3206] riscv: kprobes: Remove duplication of RV_EXTRACT_JTYPE_IMM Use RV_EXTRACT_JTYPE_IMM, instead of reimplementing it in simulate_jal(). 
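
[ ed: For readers without the RISC-V spec at hand, a minimal stand-alone sketch of what a J-type immediate extraction computes. The bit layout is taken from the open-coded version removed in the diff below; the function name is illustrative, not the kernel macro. ]

    #include <stdint.h>

    /*
     * J-type (JAL) immediate: imm[20|10:1|11|19:12] is scattered across
     * opcode[31:12]; bit 0 of the offset is implicitly zero.
     */
    static int32_t jtype_imm(uint32_t opcode)
    {
            uint32_t imm = 0;

            imm |= ((opcode >> 21) & 0x3ff) << 1;   /* imm[10:1]  */
            imm |= ((opcode >> 20) & 0x1) << 11;    /* imm[11]    */
            imm |= ((opcode >> 12) & 0xff) << 12;   /* imm[19:12] */
            imm |= ((opcode >> 31) & 0x1) << 20;    /* imm[20]    */

            /* sign-extend from bit 20, equivalent to sign_extend32(imm, 20) */
            return (int32_t)(imm << 11) >> 11;
    }

Feeding it 0x008000ef (jal ra, +8) yields 8.
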
Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/af502036738d381c6bdb96a236d21bab8c343f74.1747215274.git.namcao@linutronix.de/ Signed-off-by: Paul Walmsley --- arch/riscv/kernel/probes/simulate-insn.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index d5f74fadbc3a04..b76a691d0d9aaf 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -41,19 +41,16 @@ bool __kprobes simulate_jal(u32 opcode, unsigned long addr, struct pt_regs *regs * 1 10 1 8 5 JAL/J */ bool ret; - u32 imm; + s32 imm; u32 index = (opcode >> 7) & 0x1f; ret = rv_insn_reg_set_val(regs, index, addr + 4); if (!ret) return ret; - imm = ((opcode >> 21) & 0x3ff) << 1; - imm |= ((opcode >> 20) & 0x1) << 11; - imm |= ((opcode >> 12) & 0xff) << 12; - imm |= ((opcode >> 31) & 0x1) << 20; + imm = RV_EXTRACT_JTYPE_IMM(opcode); - instruction_pointer_set(regs, addr + sign_extend32((imm), 20)); + instruction_pointer_set(regs, addr + imm); return ret; } From d57676c21ef6f3dd530bcd7f5035a5e0b7699cc6 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:17:56 +0200 Subject: [PATCH 1971/3206] riscv: kprobes: Remove duplication of RV_EXTRACT_RS1_REG Use RV_EXTRACT_RS1_REG instead of reimplementing its code. Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/b441038c991da11a7a48ea7140ab00e3bb119387.1747215274.git.namcao@linutronix.de/ Signed-off-by: Paul Walmsley --- arch/riscv/kernel/probes/simulate-insn.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index b76a691d0d9aaf..625d514c4ada6c 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -66,7 +66,7 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg unsigned long base_addr; u32 imm = (opcode >> 20) & 0xfff; u32 rd_index = (opcode >> 7) & 0x1f; - u32 rs1_index = (opcode >> 15) & 0x1f; + u32 rs1_index = RV_EXTRACT_RS1_REG(opcode); ret = rv_insn_reg_get_val(regs, rs1_index, &base_addr); if (!ret) @@ -115,9 +115,6 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re return true; } -#define branch_rs1_idx(opcode) \ - (((opcode) >> 15) & 0x1f) - #define branch_imm(opcode) \ (((((opcode) >> 8) & 0xf ) << 1) | \ ((((opcode) >> 25) & 0x3f) << 5) | \ @@ -147,7 +144,7 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r unsigned long rs1_val; unsigned long rs2_val; - if (!rv_insn_reg_get_val(regs, branch_rs1_idx(opcode), &rs1_val) || + if (!rv_insn_reg_get_val(regs, RV_EXTRACT_RS1_REG(opcode), &rs1_val) || !rv_insn_reg_get_val(regs, RV_EXTRACT_RS2_REG(opcode), &rs2_val)) return false; From 76494817df791a2a6453dd353a4eec3faf57c578 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:17:57 +0200 Subject: [PATCH 1972/3206] riscv: kprobes: Remove duplication of RV_EXTRACT_BTYPE_IMM Use RV_EXTRACT_BTYPE_IMM, instead of reimplementing it in simulate_branch(). 
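
[ ed: Companion sketch for the B-type immediate used by the conditional branches; it mirrors the branch_imm()/branch_offset() macros removed in the diff below. Again illustrative, not the kernel macro. ]

    #include <stdint.h>

    /*
     * B-type (branch) immediate: imm[12|10:5] live in opcode[31:25],
     * imm[4:1|11] in opcode[11:7]; bit 0 is again implicitly zero.
     */
    static int32_t btype_imm(uint32_t opcode)
    {
            uint32_t imm = 0;

            imm |= ((opcode >> 8) & 0xf) << 1;      /* imm[4:1]  */
            imm |= ((opcode >> 25) & 0x3f) << 5;    /* imm[10:5] */
            imm |= ((opcode >> 7) & 0x1) << 11;     /* imm[11]   */
            imm |= ((opcode >> 31) & 0x1) << 12;    /* imm[12]   */

            /* 13-bit immediate, sign bit is imm[12] */
            return (int32_t)(imm << 19) >> 19;
    }
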
Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/b441038c991da11a7a48ea7140ab00e3bb119387.1747215274.git.namcao@linutronix.de/ Signed-off-by: Paul Walmsley --- arch/riscv/kernel/probes/simulate-insn.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 625d514c4ada6c..3ba97e79a2a3be 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -115,15 +115,6 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re return true; } -#define branch_imm(opcode) \ - (((((opcode) >> 8) & 0xf ) << 1) | \ - ((((opcode) >> 25) & 0x3f) << 5) | \ - ((((opcode) >> 7) & 0x1 ) << 11) | \ - ((((opcode) >> 31) & 0x1 ) << 12)) - -#define branch_offset(opcode) \ - sign_extend32((branch_imm(opcode)), 12) - bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs) { /* @@ -148,7 +139,7 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r !rv_insn_reg_get_val(regs, RV_EXTRACT_RS2_REG(opcode), &rs2_val)) return false; - offset_tmp = branch_offset(opcode); + offset_tmp = RV_EXTRACT_BTYPE_IMM(opcode); switch (RV_EXTRACT_FUNCT3(opcode)) { case RVG_FUNCT3_BEQ: offset = (rs1_val == rs2_val) ? offset_tmp : 4; From 05ede658d435f3969b13220c629cdc626356353f Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:17:58 +0200 Subject: [PATCH 1973/3206] riscv: kprobes: Remove duplication of RVC_EXTRACT_JTYPE_IMM Use RVC_EXTRACT_JTYPE_IMM, instead of reimplementing it in simulate_c_j(). Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/24497deaab06d6b12cb84923606ec26f67e25424.1747215274.git.namcao@linutronix.de/ [pjw@kernel.org: fixed subject line typo] Signed-off-by: Paul Walmsley --- arch/riscv/kernel/probes/simulate-insn.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 3ba97e79a2a3be..5defbde4dd50da 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -170,24 +170,9 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r bool __kprobes simulate_c_j(u32 opcode, unsigned long addr, struct pt_regs *regs) { - /* - * 15 13 12 2 1 0 - * | funct3 | offset[11|4|9:8|10|6|7|3:1|5] | opcode | - * 3 11 2 - */ - - s32 offset; - - offset = ((opcode >> 3) & 0x7) << 1; - offset |= ((opcode >> 11) & 0x1) << 4; - offset |= ((opcode >> 2) & 0x1) << 5; - offset |= ((opcode >> 7) & 0x1) << 6; - offset |= ((opcode >> 6) & 0x1) << 7; - offset |= ((opcode >> 9) & 0x3) << 8; - offset |= ((opcode >> 8) & 0x1) << 10; - offset |= ((opcode >> 12) & 0x1) << 11; + s32 offset = RVC_EXTRACT_JTYPE_IMM(opcode); - instruction_pointer_set(regs, addr + sign_extend32(offset, 11)); + instruction_pointer_set(regs, addr + offset); return true; } From 580c11cd0b364c098df86789e230ee54ca3ece46 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:17:59 +0200 Subject: [PATCH 1974/3206] riscv: kprobes: Remove duplication of RVC_EXTRACT_C2_RS1_REG Use RVC_EXTRACT_C2_RS1_REG, instead of reimplementing it in simulate_c_jr_jalr(). 
Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/d56955cd683411c6d2f63d13c78e0572462a3269.1747215274.git.namcao@linutronix.de/ Signed-off-by: Paul Walmsley --- arch/riscv/kernel/probes/simulate-insn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 5defbde4dd50da..f5d64613dab5a8 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -188,7 +188,7 @@ static bool __kprobes simulate_c_jr_jalr(u32 opcode, unsigned long addr, struct unsigned long jump_addr; - u32 rs1 = (opcode >> 7) & 0x1f; + u32 rs1 = RVC_EXTRACT_C2_RS1_REG(opcode); if (rs1 == 0) /* C.JR is only valid when rs1 != x0 */ return false; From 05df05bb04188c1c898f940cb2ef5440f94d5b56 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:18:00 +0200 Subject: [PATCH 1975/3206] riscv: kprobes: Remove duplication of RVC_EXTRACT_BTYPE_IMM Use RVC_EXTRACT_BTYPE_IMM, instead of reimplementing it in simulate_c_bnez_beqz(). Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/8a8ed970f279fa5f24c90d840c2130e37bc6d16e.1747215274.git.namcao@linutronix.de/ Signed-off-by: Paul Walmsley --- arch/riscv/kernel/probes/simulate-insn.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index f5d64613dab5a8..e670e55954d27f 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -232,16 +232,10 @@ static bool __kprobes simulate_c_bnez_beqz(u32 opcode, unsigned long addr, struc if (!rv_insn_reg_get_val(regs, rs1, &rs1_val)) return false; - if ((rs1_val != 0 && is_bnez) || (rs1_val == 0 && !is_bnez)) { - offset = ((opcode >> 3) & 0x3) << 1; - offset |= ((opcode >> 10) & 0x3) << 3; - offset |= ((opcode >> 2) & 0x1) << 5; - offset |= ((opcode >> 5) & 0x3) << 6; - offset |= ((opcode >> 12) & 0x1) << 8; - offset = sign_extend32(offset, 8); - } else { + if ((rs1_val != 0 && is_bnez) || (rs1_val == 0 && !is_bnez)) + offset = RVC_EXTRACT_BTYPE_IMM(opcode); + else offset = 2; - } instruction_pointer_set(regs, addr + offset); From e33349630220e19264bd133dea8eee5d4e8684c6 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:18:01 +0200 Subject: [PATCH 1976/3206] riscv: kprobes: Remove duplication of RV_EXTRACT_RD_REG Use RV_EXTRACT_RD_REG, instead of reimplementing its code. 
Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/b31e5b41df5839a76103348e54dc034c8a43447a.1747215274.git.namcao@linutronix.de/ Signed-off-by: Paul Walmsley --- arch/riscv/kernel/probes/simulate-insn.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index e670e55954d27f..1717df780409b0 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -42,7 +42,7 @@ bool __kprobes simulate_jal(u32 opcode, unsigned long addr, struct pt_regs *regs */ bool ret; s32 imm; - u32 index = (opcode >> 7) & 0x1f; + u32 index = RV_EXTRACT_RD_REG(opcode); ret = rv_insn_reg_set_val(regs, index, addr + 4); if (!ret) @@ -65,7 +65,7 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg bool ret; unsigned long base_addr; u32 imm = (opcode >> 20) & 0xfff; - u32 rd_index = (opcode >> 7) & 0x1f; + u32 rd_index = RV_EXTRACT_RD_REG(opcode); u32 rs1_index = RV_EXTRACT_RS1_REG(opcode); ret = rv_insn_reg_get_val(regs, rs1_index, &base_addr); @@ -81,9 +81,6 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg return ret; } -#define auipc_rd_idx(opcode) \ - ((opcode >> 7) & 0x1f) - #define auipc_imm(opcode) \ ((((opcode) >> 12) & 0xfffff) << 12) @@ -104,7 +101,7 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re * 20 5 7 */ - u32 rd_idx = auipc_rd_idx(opcode); + u32 rd_idx = RV_EXTRACT_RD_REG(opcode); unsigned long rd_val = addr + auipc_offset(opcode); if (!rv_insn_reg_set_val(regs, rd_idx, rd_val)) From 7843b48dbf47d7fed4feaeb960244a757bb5d355 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:18:02 +0200 Subject: [PATCH 1977/3206] riscv: kprobes: Remove duplication of RV_EXTRACT_UTYPE_IMM Use RV_EXTRACT_UTYPE_IMM, instead of reimplementing it in simulate_auipc(). 
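
[ ed: The U-type case is the simple one: the 20 immediate bits already sit at opcode[31:12], so extraction is a mask plus a sign-extending cast, which is why the diff below can drop the __riscv_xlen special-casing. Sketch only; the name is illustrative. ]

    #include <stdint.h>

    /* U-type (AUIPC/LUI): imm[31:12] is used verbatim, low 12 bits are zero */
    static int64_t utype_offset(uint32_t opcode)
    {
            /* the 32-bit signed cast sign-extends the offset on a 64-bit target,
             * matching the old sign_extend64(imm, 31) on __riscv_xlen == 64 */
            return (int32_t)(opcode & 0xfffff000);
    }

    /* AUIPC then computes rd = pc + utype_offset(opcode) */
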
Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/8f0defce9f1f23f1b44bb9750ed083cfc124213c.1747215274.git.namcao@linutronix.de/ Signed-off-by: Paul Walmsley --- arch/riscv/kernel/probes/simulate-insn.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 1717df780409b0..2b3cd69d6f8e15 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -81,17 +81,6 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re return ret; } -#define auipc_imm(opcode) \ ((((opcode) >> 12) & 0xfffff) << 12) - -#if __riscv_xlen == 64 -#define auipc_offset(opcode) sign_extend64(auipc_imm(opcode), 31) -#elif __riscv_xlen == 32 -#define auipc_offset(opcode) auipc_imm(opcode) -#else -#error "Unexpected __riscv_xlen" -#endif - bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *regs) { /* @@ -102,7 +91,7 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re */ u32 rd_idx = RV_EXTRACT_RD_REG(opcode); - unsigned long rd_val = addr + auipc_offset(opcode); + unsigned long rd_val = addr + (s32)RV_EXTRACT_UTYPE_IMM(opcode); if (!rv_insn_reg_set_val(regs, rd_idx, rd_val)) return false; From 4d4a3cc7f280b2751a6967b25c6d8c1e2740cafd Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 11 May 2025 23:18:03 +0200 Subject: [PATCH 1978/3206] riscv: kprobes: Remove duplication of RV_EXTRACT_ITYPE_IMM Use RV_EXTRACT_ITYPE_IMM, instead of re-implementing it in simulate_jalr(). Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/linux-riscv/8ae34e966c312ae5cf6c09a35ddc290cce942208.1747215274.git.namcao@linutronix.de/ Signed-off-by: Paul Walmsley --- arch/riscv/kernel/probes/simulate-insn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 2b3cd69d6f8e15..fa581590c1f8b2 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -64,7 +64,7 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg */ bool ret; unsigned long base_addr; - u32 imm = (opcode >> 20) & 0xfff; + u32 imm = RV_EXTRACT_ITYPE_IMM(opcode); u32 rd_index = RV_EXTRACT_RD_REG(opcode); u32 rs1_index = RV_EXTRACT_RS1_REG(opcode); From 29589343488e116ac31f6f3cfa83e43949a2207a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Sep 2025 23:32:30 +0200 Subject: [PATCH 1979/3206] asm-generic: Provide generic TIF infrastructure Common TIF bits do not have to be defined by every architecture. They can be defined in a generic header. That allows adding generic TIF bits without chasing a gazillion of architecture headers, which is again an unjustified burden on anyone who works on generic infrastructure as it always needs a boat load of work to keep existing architecture code working when adding new stuff. While it is not as horrible as the ignorance of the generic entry infrastructure, it is a welcome mechanism to make architecture people rethink their approach of just leaching generic improvements into architecture code and thereby making it accumulatingly harder to maintain and improve generic code. It's about time that this changes. Provide the infrastructure and split the TIF space in half, 16 generic and 16 architecture specific bits.
This could probably be extended by TIF_SINGLESTEP and BLOCKSTEP, but those are only used in architecture specific code. So leave them alone for now. Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Acked-by: Arnd Bergmann --- arch/Kconfig | 4 +++ include/asm-generic/thread_info_tif.h | 48 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 include/asm-generic/thread_info_tif.h diff --git a/arch/Kconfig b/arch/Kconfig index d1b4ffd6e08564..c20df40a722048 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1730,6 +1730,10 @@ config ARCH_VMLINUX_NEEDS_RELOCS relocations preserved. This is used by some architectures to construct bespoke relocation tables for KASLR. +# Select if architecture uses the common generic TIF bits +config HAVE_GENERIC_TIF_BITS + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h new file mode 100644 index 00000000000000..ee3793e9b1a4dc --- /dev/null +++ b/include/asm-generic/thread_info_tif.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_THREAD_INFO_TIF_H_ +#define _ASM_GENERIC_THREAD_INFO_TIF_H_ + +#include + +/* Bits 16-31 are reserved for architecture specific purposes */ + +#define TIF_NOTIFY_RESUME 0 // callback before returning to user +#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) + +#define TIF_SIGPENDING 1 // signal pending +#define _TIF_SIGPENDING BIT(TIF_SIGPENDING) + +#define TIF_NOTIFY_SIGNAL 2 // signal notifications exist +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) + +#define TIF_MEMDIE 3 // is terminating due to OOM killer +#define _TIF_MEMDIE BIT(TIF_MEMDIE) + +#define TIF_NEED_RESCHED 4 // rescheduling necessary +#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) + +#ifdef HAVE_TIF_NEED_RESCHED_LAZY +# define TIF_NEED_RESCHED_LAZY 5 // Lazy rescheduling needed +# define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) +#endif + +#ifdef HAVE_TIF_POLLING_NRFLAG +# define TIF_POLLING_NRFLAG 6 // idle is polling for TIF_NEED_RESCHED +# define _TIF_POLLING_NRFLAG BIT(TIF_POLLING_NRFLAG) +#endif + +#define TIF_USER_RETURN_NOTIFY 7 // notify kernel of userspace return +#define _TIF_USER_RETURN_NOTIFY BIT(TIF_USER_RETURN_NOTIFY) + +#define TIF_UPROBE 8 // breakpointed or singlestepping +#define _TIF_UPROBE BIT(TIF_UPROBE) + +#define TIF_PATCH_PENDING 9 // pending live patching update +#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) + +#ifdef HAVE_TIF_RESTORE_SIGMASK +# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() */ +# define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) +#endif + +#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ From da3f033a9fbfdb88826c1b0486fe1522fe4f94aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Sep 2025 23:32:32 +0200 Subject: [PATCH 1980/3206] x86: Use generic TIF bits No point in defining generic items and the upcoming RSEQ optimizations are only available with this _and_ the generic entry infrastructure, which is already used by x86. So no further action required here. 
Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers --- arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 76 ++++++++++++------------------ 2 files changed, 32 insertions(+), 45 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52c8910ba2efdd..70b94e025f415b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -239,6 +239,7 @@ config X86 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA if X86_32 select HAVE_EXIT_THREAD + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 9282465eea21d3..e71e0e8362ed8e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -80,56 +80,42 @@ struct thread_info { #endif /* - * thread information flags - * - these are process state flags that various assembly files - * may need to access + * Tell the generic TIF infrastructure which bits x86 supports */ -#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */ -#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ -#define TIF_SSBD 6 /* Speculative store bypass disable */ -#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ -#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ -#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ -#define TIF_UPROBE 12 /* breakpointed or singlestepping */ -#define TIF_PATCH_PENDING 13 /* pending live patching update */ -#define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ -#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ -#define TIF_NOTSC 16 /* TSC is not accessible in userland */ -#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ -#define TIF_MEMDIE 20 /* is terminating due to OOM killer */ -#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_POLLING_NRFLAG +#define HAVE_TIF_SINGLESTEP + +#include + +/* Architecture specific TIF space starts at 16 */ +#define TIF_SSBD 16 /* Speculative store bypass disable */ +#define TIF_SPEC_IB 17 /* Indirect branch speculation mitigation */ +#define TIF_SPEC_L1D_FLUSH 18 /* Flush L1D on mm switches (processes) */ +#define TIF_NEED_FPU_LOAD 19 /* load FPU on return to userspace */ +#define TIF_NOCPUID 20 /* CPUID is not accessible in userland */ +#define TIF_NOTSC 21 /* TSC is not accessible in userland */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ -#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ +#define TIF_SINGLESTEP 25 /* reenable singlestep on user return*/ +#define TIF_BLOCKSTEP 26 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ - -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) -#define _TIF_SSBD (1 << 
TIF_SSBD) -#define _TIF_SPEC_IB (1 << TIF_SPEC_IB) -#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH) -#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) -#define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) -#define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) -#define _TIF_NOCPUID (1 << TIF_NOCPUID) -#define _TIF_NOTSC (1 << TIF_NOTSC) -#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) -#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) -#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) -#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_ADDR32 (1 << TIF_ADDR32) +#define TIF_ADDR32 28 /* 32-bit address space on 64 bits */ + +#define _TIF_SSBD BIT(TIF_SSBD) +#define _TIF_SPEC_IB BIT(TIF_SPEC_IB) +#define _TIF_SPEC_L1D_FLUSH BIT(TIF_SPEC_L1D_FLUSH) +#define _TIF_NEED_FPU_LOAD BIT(TIF_NEED_FPU_LOAD) +#define _TIF_NOCPUID BIT(TIF_NOCPUID) +#define _TIF_NOTSC BIT(TIF_NOTSC) +#define _TIF_IO_BITMAP BIT(TIF_IO_BITMAP) +#define _TIF_SPEC_FORCE_UPDATE BIT(TIF_SPEC_FORCE_UPDATE) +#define _TIF_FORCED_TF BIT(TIF_FORCED_TF) +#define _TIF_BLOCKSTEP BIT(TIF_BLOCKSTEP) +#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP) +#define _TIF_LAZY_MMU_UPDATES BIT(TIF_LAZY_MMU_UPDATES) +#define _TIF_ADDR32 BIT(TIF_ADDR32) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ From 06e5b72858b168fcedb0402b8dfdb896276fc08e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Sep 2025 23:32:34 +0200 Subject: [PATCH 1981/3206] s390: Use generic TIF bits No point in defining generic items and the upcoming RSEQ optimizations are only available with this _and_ the generic entry infrastructure, which is already used by s390. So no further action required here. This leaves a comment about the AUDIT/TRACE/SECCOMP bits which are handled by SYSCALL_WORK in the generic code, so they seem redundant, but that's a problem for the s390 wizards to think about. 
Signed-off-by: Thomas Gleixner Acked-by: Heiko Carstens --- arch/s390/Kconfig | 1 + arch/s390/include/asm/thread_info.h | 44 ++++++++++++----------------- 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf680c26a33cf7..f991ab92e3912d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -199,6 +199,7 @@ config S390 select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FENTRY select HAVE_FTRACE_GRAPH_FUNC diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index f6ed2c8192c87c..fe6da066b123a1 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -56,43 +56,35 @@ void arch_setup_new_exec(void); /* * thread information flags bit numbers + * + * Tell the generic TIF infrastructure which special bits s390 supports */ -#define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ -#define TIF_SIGPENDING 1 /* signal pending */ -#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */ -#define TIF_UPROBE 4 /* breakpointed or single-stepping */ -#define TIF_PATCH_PENDING 5 /* pending live patching update */ -#define TIF_ASCE_PRIMARY 6 /* primary asce is kernel asce */ -#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ -#define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */ -#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ -#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ -#define TIF_31BIT 16 /* 32bit process */ -#define TIF_MEMDIE 17 /* is terminating due to OOM killer */ -#define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */ -#define TIF_SINGLE_STEP 19 /* This task is single stepped */ -#define TIF_BLOCK_STEP 20 /* This task is block stepped */ -#define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include + +/* Architecture specific bits */ +#define TIF_ASCE_PRIMARY 16 /* primary asce is kernel asce */ +#define TIF_GUARDED_STORAGE 17 /* load guarded storage control block */ +#define TIF_ISOLATE_BP_GUEST 18 /* Run KVM guests with isolated BP */ +#define TIF_PER_TRAP 19 /* Need to handle PER trap on exit to usermode */ +#define TIF_31BIT 20 /* 32bit process */ +#define TIF_SINGLE_STEP 21 /* This task is single stepped */ +#define TIF_BLOCK_STEP 22 /* This task is block stepped */ +#define TIF_UPROBE_SINGLESTEP 23 /* This task is uprobe single stepped */ + +/* These could move over to SYSCALL_WORK bits, no? 
*/ #define TIF_SYSCALL_TRACE 24 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */ #define TIF_SECCOMP 26 /* secure computing */ #define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ -#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING BIT(TIF_SIGPENDING) -#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) -#define _TIF_UPROBE BIT(TIF_UPROBE) -#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) #define _TIF_ASCE_PRIMARY BIT(TIF_ASCE_PRIMARY) -#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) #define _TIF_31BIT BIT(TIF_31BIT) -#define _TIF_MEMDIE BIT(TIF_MEMDIE) -#define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) #define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP) #define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP) From c7ac5a089d495fd57f6851126e83e9c19205afae Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 11 Sep 2025 11:28:06 +0200 Subject: [PATCH 1982/3206] s390/entry: Remove unused TIF flags The conversion of s390 to generic entry missed to remove the TIF_SYSCALL*/TIF_SECCOMP flags. Remove them as they are unused now. Fixes: 56e62a737028 ("s390: convert to generic entry") Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Acked-by: Heiko Carstens --- arch/s390/include/asm/thread_info.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index fe6da066b123a1..7878e9bfbf072b 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -74,12 +74,6 @@ void arch_setup_new_exec(void); #define TIF_BLOCK_STEP 22 /* This task is block stepped */ #define TIF_UPROBE_SINGLESTEP 23 /* This task is uprobe single stepped */ -/* These could move over to SYSCALL_WORK bits, no? */ -#define TIF_SYSCALL_TRACE 24 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */ -#define TIF_SECCOMP 26 /* secure computing */ -#define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ - #define _TIF_ASCE_PRIMARY BIT(TIF_ASCE_PRIMARY) #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) @@ -88,9 +82,5 @@ void arch_setup_new_exec(void); #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) #define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP) #define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP) -#define _TIF_SYSCALL_TRACE BIT(TIF_SYSCALL_TRACE) -#define _TIF_SYSCALL_AUDIT BIT(TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP BIT(TIF_SECCOMP) -#define _TIF_SYSCALL_TRACEPOINT BIT(TIF_SYSCALL_TRACEPOINT) #endif /* _ASM_THREAD_INFO_H */ From f9629891d407e455dc1334e1594108c73074fe8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Sep 2025 23:32:36 +0200 Subject: [PATCH 1983/3206] loongarch: Use generic TIF bits No point in defining generic items and the upcoming RSEQ optimizations are only available with this _and_ the generic entry infrastructure, which is already used by loongarch. So no further action required here. 
Signed-off-by: Thomas Gleixner --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/thread_info.h | 76 +++++++++++------------- 2 files changed, 35 insertions(+), 42 deletions(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f0abc38c40ac9e..2e90d862ebb3e5 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -140,6 +140,7 @@ config LOONGARCH select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN select HAVE_EXIT_THREAD + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FTRACE_GRAPH_FUNC select HAVE_FUNCTION_ARG_ACCESS_API diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h index 9dfa2ef0081670..def7cb14467ec6 100644 --- a/arch/loongarch/include/asm/thread_info.h +++ b/arch/loongarch/include/asm/thread_info.h @@ -65,50 +65,42 @@ register unsigned long current_stack_pointer __asm__("$sp"); * access * - pending work-to-be-done flags are in LSW * - other flags in MSW + * + * Tell the generic TIF infrastructure which special bits loongarch supports */ -#define TIF_NEED_RESCHED 0 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 1 /* lazy rescheduling necessary */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NOTIFY_RESUME 3 /* callback before returning to user */ -#define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */ -#define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ -#define TIF_NOHZ 6 /* in adaptive nohz mode */ -#define TIF_UPROBE 7 /* breakpointed or singlestepping */ -#define TIF_USEDFPU 8 /* FPU was used by this task this quantum (SMP) */ -#define TIF_USEDSIMD 9 /* SIMD has been used this quantum */ -#define TIF_MEMDIE 10 /* is terminating due to OOM killer */ -#define TIF_FIXADE 11 /* Fix address errors in software */ -#define TIF_LOGADE 12 /* Log address errors to syslog */ -#define TIF_32BIT_REGS 13 /* 32-bit general purpose registers */ -#define TIF_32BIT_ADDR 14 /* 32-bit address space */ -#define TIF_LOAD_WATCH 15 /* If set, load watch registers */ -#define TIF_SINGLESTEP 16 /* Single Step */ -#define TIF_LSX_CTX_LIVE 17 /* LSX context must be preserved */ -#define TIF_LASX_CTX_LIVE 18 /* LASX context must be preserved */ -#define TIF_USEDLBT 19 /* LBT was used by this task this quantum (SMP) */ -#define TIF_LBT_CTX_LIVE 20 /* LBT context must be preserved */ -#define TIF_PATCH_PENDING 21 /* pending live patching update */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include <asm-generic/thread_info_tif.h> + +/* Architecture specific bits */ +#define TIF_NOHZ 16 /* in adaptive nohz mode */ +#define TIF_USEDFPU 17 /* FPU was used by this task this quantum (SMP) */ +#define TIF_USEDSIMD 18 /* SIMD has been used this quantum */ +#define TIF_FIXADE 19 /* Fix address errors in software */ +#define TIF_LOGADE 20 /* Log address errors to syslog */ +#define TIF_32BIT_REGS 21 /* 32-bit general purpose registers */ +#define TIF_32BIT_ADDR 22 /* 32-bit address space */ +#define TIF_LOAD_WATCH 23 /* If set, load watch registers */ +#define TIF_SINGLESTEP 24 /* Single Step */ +#define TIF_LSX_CTX_LIVE 25 /* LSX context must be preserved */ +#define TIF_LASX_CTX_LIVE 26 /* LASX context must be preserved */ +#define TIF_USEDLBT 27 /* LBT was used by this task this quantum (SMP) */ +#define TIF_LBT_CTX_LIVE 28 /* LBT context must be preserved */ -#define _TIF_NEED_RESCHED (1< Date: Mon, 8 Sep 2025 23:32:38 +0200 Subject: [PATCH 1984/3206] riscv: Use generic TIF bits No point in defining generic items and the upcoming RSEQ
optimizations are only available with this _and_ the generic entry infrastructure, which is already used by RISCV. So no further action required here. Signed-off-by: Thomas Gleixner --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/thread_info.h | 31 ++++++++++++---------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 51dcd8eaa24356..0c280614a28478 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -161,6 +161,7 @@ config RISCV select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_TRACER if !XIP_KERNEL && HAVE_DYNAMIC_FTRACE select HAVE_EBPF_JIT if MMU + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST if MMU select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index f5916a70879a87..a315b0261b9d7b 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -107,23 +107,18 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); * - pending work-to-be-done flags are in lowest half-word * - other flags in upper half-word(s) */ -#define TIF_NEED_RESCHED 0 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 1 /* Lazy rescheduling needed */ -#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ -#define TIF_SIGPENDING 3 /* signal pending */ -#define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ -#define TIF_MEMDIE 5 /* is terminating due to OOM killer */ -#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ -#define TIF_UPROBE 10 /* uprobe breakpoint or singlestep */ -#define TIF_32BIT 11 /* compat-mode 32bit process */ -#define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ - -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_RISCV_V_DEFER_RESTORE (1 << TIF_RISCV_V_DEFER_RESTORE) + +/* + * Tell the generic TIF infrastructure which bits riscv supports + */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include + +#define TIF_32BIT 16 /* compat-mode 32bit process */ +#define TIF_RISCV_V_DEFER_RESTORE 17 /* restore Vector before returing to user */ + +#define _TIF_RISCV_V_DEFER_RESTORE BIT(TIF_RISCV_V_DEFER_RESTORE) #endif /* _ASM_RISCV_THREAD_INFO_H */ From 0b1619c38600fc06c73b1f59c64af0b7df08fc2c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 15 Sep 2025 11:10:07 +0200 Subject: [PATCH 1985/3206] gpio: nomadik: fix the debugfs helper stub Commit ddeb66d2cb10 ("gpio: nomadik: don't print out global GPIO numbers in debugfs callbacks") failed to also update the stub of the debugfs helper for !CONFIG_DEBUG_FS. Fix the resulting build failure. 
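
[ ed: The failure mode here is the classic one with config-gated helpers: the real prototype changed but the !CONFIG_DEBUG_FS stub kept the old argument list, so only configurations without DEBUG_FS broke. A generic sketch of the pattern, with hypothetical names for illustration: ]

    #include <linux/seq_file.h>

    #ifdef CONFIG_FOO
    void foo_dbg_show(struct seq_file *s, unsigned int offset);
    #else
    /* the stub must mirror the real prototype exactly, otherwise
     * !CONFIG_FOO builds fail while CONFIG_FOO=y builds stay green */
    static inline void foo_dbg_show(struct seq_file *s, unsigned int offset)
    {
    }
    #endif
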
Fixes: ddeb66d2cb10 ("gpio: nomadik: don't print out global GPIO numbers in debugfs callbacks") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509132232.12viPUPB-lkp@intel.com/ Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250915091007.28438-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/gpio-nomadik.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index 7ba53b499e164e..592a774a53cdf5 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -268,8 +268,7 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, static inline void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, struct gpio_chip *chip, - unsigned int offset, - unsigned int gpio) + unsigned int offset) { } From a1eab4d813f7b6e606ed21381b8cfda5c59a87e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Sep 2025 11:06:42 -1000 Subject: [PATCH 1986/3206] sched_ext, sched/core: Fix build failure when !FAIR_GROUP_SCHED && EXT_GROUP_SCHED While collecting SCX related fields in struct task_group into struct scx_task_group, 6e6558a6bc41 ("sched_ext, sched/core: Factor out struct scx_task_group") forgot update tg->scx_weight usage in tg_weight(), which leads to build failure when CONFIG_FAIR_GROUP_SCHED is disabled but CONFIG_EXT_GROUP_SCHED is enabled. Fix it. Fixes: 6e6558a6bc41 ("sched_ext, sched/core: Factor out struct scx_task_group") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509170230.MwZsJSWa-lkp@intel.com/ Tested-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4cc..ccba6fc3c3fed2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9551,7 +9551,7 @@ static unsigned long tg_weight(struct task_group *tg) #ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); #else - return sched_weight_from_cgroup(tg->scx_weight); + return sched_weight_from_cgroup(tg->scx.weight); #endif } From 70d1d98934e723b9e463283a542b88f4f009ae82 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 11 Aug 2025 14:33:45 -0700 Subject: [PATCH 1987/3206] x86/cpu: Rename and move CPU model entry for Diamond Rapids This model was added as INTEL_PANTHERCOVE_X (based on the name of the core) with a comment that the platform name is Diamond Rapids. It was also placed at the end of the file in a new section for family 19 processors. This is different from previous naming as Andrew Cooper noted. PeterZ agreed and posted a patch[1] to fix the name and move it in sequence with other Xeon servers. But without a commit description or sign-off the patch wasn't ever applied. Patch updated to cover one additional use of the #define by turbostat and to change the "Family 6" comment to also list 18 and 19 since new models in these families are mixed in with family 6. 
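
[ ed: Context for why the rename/move is purely cosmetic: IFM() only packs vendor, family and model into one integer match key, so the value is identical wherever the #define sits. A simplified sketch, assuming the bit layout of arch/x86/include/asm/cpu_device_id.h (model in bits 7:0, family in 15:8, vendor in 23:16) and X86_VENDOR_INTEL == 0; these are assumptions here, not part of the patch. ]

    #include <stdint.h>

    #define X86_VENDOR_INTEL 0   /* numeric value assumed for illustration */

    /* pack a vendor/family/model triple into a single match key */
    #define VFM_MAKE(vendor, family, model) \
            (((uint32_t)(vendor) << 16) | ((uint32_t)(family) << 8) | (model))

    #define IFM(family, model) VFM_MAKE(X86_VENDOR_INTEL, family, model)

    /* INTEL_DIAMONDRAPIDS_X and the old INTEL_PANTHERCOVE_X both expand
     * to IFM(19, 0x01), i.e. the same key, under this layout */
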
Originally-by: Peter Zijlstra Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/all/20250214130205.GK14028@noisy.programming.kicks-ass.net/ # [1] --- arch/x86/include/asm/intel-family.h | 7 +++---- .../platform/x86/intel/speed_select_if/isst_if_common.c | 2 +- drivers/platform/x86/intel/tpmi_power_domains.c | 2 +- tools/power/x86/turbostat/turbostat.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index e345dbdf933eb4..f32a0eca2ae56a 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -51,7 +51,7 @@ #define INTEL_PENTIUM_MMX IFM(5, 0x04) /* P55C */ #define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ -/* Family 6 */ +/* Family 6, 18, 19 */ #define INTEL_PENTIUM_PRO IFM(6, 0x01) #define INTEL_PENTIUM_II_KLAMATH IFM(6, 0x03) #define INTEL_PENTIUM_III_DESCHUTES IFM(6, 0x05) @@ -126,6 +126,8 @@ #define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) +#define INTEL_DIAMONDRAPIDS_X IFM(19, 0x01) /* Panther Cove */ + #define INTEL_BARTLETTLAKE IFM(6, 0xD7) /* Raptor Cove */ /* "Hybrid" Processors (P-Core/E-Core) */ @@ -203,9 +205,6 @@ #define INTEL_P4_PRESCOTT_2M IFM(15, 0x04) #define INTEL_P4_CEDARMILL IFM(15, 0x06) /* Also Xeon Dempsey */ -/* Family 19 */ -#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ - /* * Intel CPU core types * diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c index 71e104a068e9e2..7449873c3d4068 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c @@ -790,7 +790,7 @@ static const struct x86_cpu_id isst_cpu_ids[] = { X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_ICELAKE_D, 0), X86_MATCH_VFM(INTEL_ICELAKE_X, 0), - X86_MATCH_VFM(INTEL_PANTHERCOVE_X, SST_HPM_SUPPORTED), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), X86_MATCH_VFM(INTEL_SKYLAKE_X, SST_MBOX_SUPPORTED), {} diff --git a/drivers/platform/x86/intel/tpmi_power_domains.c b/drivers/platform/x86/intel/tpmi_power_domains.c index 8641353b2e0617..7d93119a4c30cc 100644 --- a/drivers/platform/x86/intel/tpmi_power_domains.c +++ b/drivers/platform/x86/intel/tpmi_power_domains.c @@ -85,7 +85,7 @@ static const struct x86_cpu_id tpmi_cpu_ids[] = { X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, NULL), X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, NULL), X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, NULL), - X86_MATCH_VFM(INTEL_PANTHERCOVE_X, NULL), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, tpmi_cpu_ids); diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 72a280e7a9d594..47eb2d4d13a554 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1195,7 +1195,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_EMERALDRAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_D, &spr_features }, - { INTEL_PANTHERCOVE_X, &dmr_features }, + { INTEL_DIAMONDRAPIDS_X, &dmr_features }, { INTEL_LAKEFIELD, &cnl_features }, { INTEL_ALDERLAKE, &adl_features }, { INTEL_ALDERLAKE_L, &adl_features }, From 
af507c6951180ae5e5edb71f56a227ffdcc54f78 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 21 Aug 2025 05:19:09 +0000 Subject: [PATCH 1988/3206] x86/cpu/cacheinfo: Simplify cacheinfo_amd_init_llc_id() using _cpuid4_info struct _cpuid4_info has the same layout as the CPUID leaf 0x8000001d. Use the encoded definition and amd_fill_cpuid4_info(), get_cache_id() helpers instead of open coding masks and shifts to calculate the llc_id. cacheinfo_amd_init_llc_id() is only called on AMD systems that support X86_FEATURE_TOPOEXT and amd_fill_cpuid4_info() uses the information from CPUID leaf 0x8000001d on all these systems which is consistent with the current open coded implementation. While at it, avoid reading cpu_data() every time get_cache_id() is called and instead pass the APIC ID necessary to return the _cpuid4_info.id from get_cache_id(). No functional changes intended. [ bp: do what Ahmed suggests: merge into one patch, make id4 ptr const. ] Signed-off-by: K Prateek Nayak Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Acked-by: Ahmed S. Darwish Link: https://lore.kernel.org/20250821051910.7351-2-kprateek.nayak@amd.com --- arch/x86/kernel/cpu/cacheinfo.c | 48 +++++++++++++++------------------ 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index adfa7e8bb86557..51a95b07831fa1 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -289,6 +289,22 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } +/* + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4) +{ + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + + return apicid >> index_msb; +} + /* * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. */ @@ -312,18 +328,11 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) * Newer families: LLC ID is calculated from the number * of threads sharing the L3 cache. */ - u32 eax, ebx, ecx, edx, num_sharing_cache = 0; u32 llc_index = find_num_cache_leaves(c) - 1; + struct _cpuid4_info id4 = {}; - cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); - if (eax) - num_sharing_cache = ((eax >> 14) & 0xfff) + 1; - - if (num_sharing_cache) { - int index_msb = get_count_order(num_sharing_cache); - - c->topo.llc_id = c->topo.apicid >> index_msb; - } + if (!amd_fill_cpuid4_info(llc_index, &id4)) + c->topo.llc_id = get_cache_id(c->topo.apicid, &id4); } } @@ -598,27 +607,12 @@ int init_cache_level(unsigned int cpu) return 0; } -/* - * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input - * ECX as cache index. Then right shift apicid by the number's order to get - * cache id for this cache node. 
- */ -static void get_cache_id(int cpu, struct _cpuid4_info *id4) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - unsigned long num_threads_sharing; - int index_msb; - - num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - id4->id = c->topo.apicid >> index_msb; -} - int populate_cache_leaves(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *ci = this_cpu_ci->info_list; u8 cpu_vendor = boot_cpu_data.x86_vendor; + u32 apicid = cpu_data(cpu).topo.apicid; struct amd_northbridge *nb = NULL; struct _cpuid4_info id4 = {}; int idx, ret; @@ -628,7 +622,7 @@ int populate_cache_leaves(unsigned int cpu) if (ret) return ret; - get_cache_id(cpu, &id4); + id4.id = get_cache_id(apicid, &id4); if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) nb = amd_init_l3_cache(idx); From d691c5f87f344a448b1a522284fa314e2bb403e2 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 1 Sep 2025 17:04:16 +0000 Subject: [PATCH 1989/3206] x86/cpu/topology: Check for X86_FEATURE_XTOPOLOGY instead of passing has_xtopology cpu_parse_topology_ext() sets X86_FEATURE_XTOPOLOGY before returning true if any of the XTOPOLOGY leaf (0x80000026 / 0xb) could be parsed successfully. Instead of storing and passing around this return value using "has_xtopology" in parse_topology_amd(), check for X86_FEATURE_XTOPOLOGY directly in parse_8000_001e() to simplify the flow. No functional changes intended. Signed-off-by: K Prateek Nayak Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/20250901170418.4314-1-kprateek.nayak@amd.com --- arch/x86/kernel/cpu/topology_amd.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index c79ebbb639cbff..7ebd4a15c56184 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -59,7 +59,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) tscan->amd_node_id = node_id; } -static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) +static bool parse_8000_001e(struct topo_scan *tscan) { struct { // eax @@ -85,7 +85,7 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) * If leaf 0xb/0x26 is available, then the APIC ID and the domain * shifts are set already. */ - if (!has_topoext) { + if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) { tscan->c->topo.initial_apicid = leaf.ext_apic_id; /* @@ -175,30 +175,27 @@ static void topoext_fixup(struct topo_scan *tscan) static void parse_topology_amd(struct topo_scan *tscan) { + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); + /* * Try to get SMT, CORE, TILE, and DIE shifts from extended * CPUID leaf 0x8000_0026 on supported processors first. If * extended CPUID leaf 0x8000_0026 is not supported, try to * get SMT and CORE shift from leaf 0xb. If either leaf is * available, cpu_parse_topology_ext() will return true. - */ - bool has_xtopology = cpu_parse_topology_ext(tscan); - - if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) - tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); - - /* + * * If XTOPOLOGY leaves (0x26/0xb) are not available, try to * get the CORE shift from leaf 0x8000_0008 first. 
*/ - if (!has_xtopology && !parse_8000_0008(tscan)) + if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan)) return; /* * Prefer leaf 0x8000001e if available to get the SMT shift and * the initial APIC ID if XTOPOLOGY leaves are not available. */ - if (parse_8000_001e(tscan, has_xtopology)) + if (parse_8000_001e(tscan)) return; /* Try the NODEID MSR */ From bc6397cf0bc4f2b7a47cc6ac44086daf67c3c71c Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 1 Sep 2025 17:04:17 +0000 Subject: [PATCH 1990/3206] x86/cpu/topology: Define AMD64_CPUID_EXT_FEAT MSR Add defines for the 0xc001_1005 MSR (Core::X86::Msr::CPUID_ExtFeatures) used to toggle the extended CPUID features, instead of using literal numbers. Also define and use the bits necessary for an old TOPOEXT fixup on AMD Family 0x15 processors. No functional changes intended. [ bp: Massage, rename MSR to adhere to the documentation name. ] Signed-off-by: K Prateek Nayak Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/20250901170418.4314-1-kprateek.nayak@amd.com --- arch/x86/include/asm/msr-index.h | 5 +++++ arch/x86/kernel/cpu/topology_amd.c | 7 ++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa1410..a734b560214967 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -631,6 +631,11 @@ #define MSR_AMD_PPIN 0xc00102f1 #define MSR_AMD64_CPUID_FN_7 0xc0011002 #define MSR_AMD64_CPUID_FN_1 0xc0011004 + +#define MSR_AMD64_CPUID_EXT_FEAT 0xc0011005 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT 54 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT BIT_ULL(MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) + #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_TW_CFG 0xc0011023 diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 7ebd4a15c56184..6ac097e1310659 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -163,11 +163,12 @@ static void topoext_fixup(struct topo_scan *tscan) c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) return; - if (msr_set_bit(0xc0011005, 54) <= 0) + if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT, + MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0) return; - rdmsrq(0xc0011005, msrval); - if (msrval & BIT_64(54)) { + rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval); + if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } From d98b40c07552a3bdc72bfc5879748707ba7598ae Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 1 Sep 2025 17:04:18 +0000 Subject: [PATCH 1991/3206] Documentation/x86/topology: Detail CPUID leaves used for topology enumeration Add a new section describing the different CPUID leaves and fields used to parse topology on x86 systems. [ bp: Cleanups and simplifications ontop. 
]
Suggested-by: Borislav Petkov
Signed-off-by: K Prateek Nayak
Signed-off-by: Borislav Petkov (AMD)
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/20250901170418.4314-1-kprateek.nayak@amd.com
---
 Documentation/arch/x86/topology.rst | 191 ++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)

diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst
index c12837e61bda53..86bec8ac2c4de7 100644
--- a/Documentation/arch/x86/topology.rst
+++ b/Documentation/arch/x86/topology.rst
@@ -141,6 +141,197 @@ Thread-related topology information in the kernel:
 
 
+System topology enumeration
+===========================
+
+The topology on x86 systems can be discovered using a combination of vendor
+specific CPUID leaves which enumerate the processor topology and the cache
+hierarchy.
+
+The CPUID leaves in their preferred order of parsing for each x86 vendor are
+as follows:
+
+1) AMD
+
+   1) CPUID leaf 0x80000026 [Extended CPU Topology] (Core::X86::Cpuid::ExCpuTopology)
+
+      The extended CPUID leaf 0x80000026 is the extension of the CPUID leaf 0xB
+      and provides the topology information of Core, Complex, CCD (Die), and
+      Socket in each level.
+
+      Support for the leaf is discovered by checking if the maximum extended
+      CPUID level is >= 0x80000026 and then checking if `LogProcAtThisLevel`
+      in `EBX[15:0]` at a particular level (starting from 0) is non-zero.
+
+      The `LevelType` in `ECX[15:8]` at the level provides the topology domain
+      the level describes - Core, Complex, CCD(Die), or the Socket.
+
+      The kernel uses the `CoreMaskWidth` from `EAX[4:0]` to discover the
+      number of bits that need to be right-shifted from `ExtendedLocalApicId`
+      in `EDX[31:0]` in order to get a unique Topology ID for the topology
+      level. CPUs with the same Topology ID share the resources at that level.
+
+      CPUID leaf 0x80000026 also provides more information regarding the power
+      and efficiency rankings, and about the core type on AMD processors with
+      heterogeneous characteristics.
+
+      If CPUID leaf 0x80000026 is supported, further parsing is not required.
+
+   2) CPUID leaf 0x0000000B [Extended Topology Enumeration] (Core::X86::Cpuid::ExtTopEnum)
+
+      The extended CPUID leaf 0x0000000B is the predecessor of the extended
+      CPUID leaf 0x80000026 and only describes the core and the socket domains
+      of the processor topology.
+
+      The support for the leaf is discovered by checking if the maximum supported
+      CPUID level is >= 0xB and then if `EBX[31:0]` at a particular level
+      (starting from 0) is non-zero.
+
+      The `LevelType` in `ECX[15:8]` at the level provides the topology domain
+      that the level describes - Thread, or Processor (Socket).
+
+      The kernel uses the `CoreMaskWidth` from `EAX[4:0]` to discover the
+      number of bits that need to be right-shifted from the `ExtendedLocalApicId`
+      in `EDX[31:0]` to get a unique Topology ID for that topology level. CPUs
+      sharing the Topology ID share the resources at that level.
+
+      If CPUID leaf 0xB is supported, further parsing is not required.
+
+
+   3) CPUID leaf 0x80000008 ECX [Size Identifiers] (Core::X86::Cpuid::SizeId)
+
+      If neither the CPUID leaf 0x80000026 nor 0xB is supported, the number of
+      CPUs on the package is detected using the Size Identifier leaf
+      0x80000008 ECX.
+
+      The support for the leaf is discovered by checking if the supported
+      extended CPUID level is >= 0x80000008.
+
+      The shift from the APIC ID for the Socket ID is calculated from the
+      `ApicIdSize` field in `ECX[15:12]` if it is non-zero.
+
+      If `ApicIdSize` is reported to be zero, the shift is calculated as the
+      order of the `number of threads` calculated from the `NC` field in
+      `ECX[7:0]` which describes the `number of threads - 1` on the package.
+
+      Unless Extended APIC ID is supported, the APIC ID used to find the
+      Socket ID is from the `LocalApicId` field of CPUID leaf 0x00000001
+      `EBX[31:24]`.
+
+      The topology parsing continues to detect if Extended APIC ID is
+      supported or not.
+
+
+   4) CPUID leaf 0x8000001E [Extended APIC ID, Core Identifiers, Node Identifiers]
+      (Core::X86::Cpuid::{ExtApicId,CoreId,NodeId})
+
+      The support for Extended APIC ID can be detected by checking for the
+      presence of `TopologyExtensions` in `ECX[22]` of CPUID leaf 0x80000001
+      [Feature Identifiers] (Core::X86::Cpuid::FeatureExtIdEcx).
+
+      If Topology Extensions are supported, the APIC ID from `ExtendedApicId`
+      from CPUID leaf 0x8000001E `EAX[31:0]` should be preferred over that from
+      the `LocalApicId` field of CPUID leaf 0x00000001 `EBX[31:24]` for topology
+      enumeration.
+
+      On processors of Family 0x17 and above that do not support CPUID leaf
+      0x80000026 or CPUID leaf 0xB, the shift from the APIC ID for the Core
+      ID is calculated using the order of `number of threads per core`
+      calculated using the `ThreadsPerCore` field in `EBX[15:8]` which
+      describes `number of threads per core - 1`.
+
+      On processors of Family 0x15, the Core ID from `EBX[7:0]` is used as the
+      `cu_id` (Compute Unit ID) to detect CPUs that share the compute units.
+
+
+   All AMD processors that support the `TopologyExtensions` feature store the
+   `NodeId` from the `ECX[7:0]` of CPUID leaf 0x8000001E
+   (Core::X86::Cpuid::NodeId) as the per-CPU `node_id`. On older processors,
+   the `node_id` was discovered using the MSR_FAM10H_NODE_ID MSR (MSR
+   0xc001_100c). The presence of the NODE_ID MSR was detected by checking
+   `ECX[19]` of CPUID leaf 0x80000001 [Feature Identifiers]
+   (Core::X86::Cpuid::FeatureExtIdEcx).
+
+
+2) Intel
+
+   On Intel platforms, the CPUID leaves that enumerate the processor
+   topology are as follows:
+
+   1) CPUID leaf 0x1F (V2 Extended Topology Enumeration Leaf)
+
+      The CPUID leaf 0x1F is the extension of the CPUID leaf 0xB and provides
+      the topology information of Core, Module, Tile, Die, DieGrp, and Socket
+      in each level.
+
+      The support for the leaf is discovered by checking if the supported
+      CPUID level is >= 0x1F and then `EBX[31:0]` at a particular level
+      (starting from 0) is non-zero.
+
+      The `Domain Type` in `ECX[15:8]` of the sub-leaf provides the topology
+      domain that the level describes - Core, Module, Tile, Die, DieGrp, and
+      Socket.
+
+      The kernel uses the value from `EAX[4:0]` to discover the number of
+      bits that need to be right-shifted from the `x2APIC ID` in `EDX[31:0]`
+      to get a unique Topology ID for the topology level. CPUs with the same
+      Topology ID share the resources at that level.
+
+      If CPUID leaf 0x1F is supported, further parsing is not required.
+
+
+   2) CPUID leaf 0x0000000B (Extended Topology Enumeration Leaf)
+
+      The extended CPUID leaf 0x0000000B is the predecessor of the V2 Extended
+      Topology Enumeration Leaf 0x1F and only describes the core and the
+      socket domains of the processor topology.
+
+      The support for the leaf is discovered by checking if the supported CPUID
+      level is >= 0xB and then checking if `EBX[31:0]` at a particular level
+      (starting from 0) is non-zero.
+
+      CPUID leaf 0x0000000B shares the same layout as CPUID leaf 0x1F and
+      should be enumerated in a similar manner.
+
+      If CPUID leaf 0xB is supported, further parsing is not required.
+
+
+   3) CPUID leaf 0x00000004 (Deterministic Cache Parameters Leaf)
+
+      On Intel processors that support neither CPUID leaf 0x1F nor CPUID leaf
+      0xB, the shifts for the SMT domains are calculated using the number of
+      CPUs sharing the L1 cache.
+
+      Processors that feature Hyper-Threading are detected using `EDX[28]` of
+      CPUID leaf 0x1 (Basic CPUID Information).
+
+      The order of `Maximum number of addressable IDs for logical processors
+      sharing this cache` from `EAX[25:14]` of level-0 of CPUID 0x4 provides
+      the shifts from the APIC ID required to compute the Core ID.
+
+      The APIC ID and Package information are computed using the data from
+      CPUID leaf 0x1.
+
+
+   4) CPUID leaf 0x00000001 (Basic CPUID Information)
+
+      The mask and shifts to derive the Physical Package (socket) ID are
+      computed using the `Maximum number of addressable IDs for logical
+      processors in this physical package` from `EBX[23:16]` of CPUID leaf
+      0x1.
+
+      The APIC ID on the legacy platforms is derived from the `Initial APIC
+      ID` field from `EBX[31:24]` of CPUID leaf 0x1.
+
+
+3) Centaur and Zhaoxin
+
+   Similar to Intel, Centaur and Zhaoxin use a combination of CPUID leaf
+   0x00000004 (Deterministic Cache Parameters Leaf) and CPUID leaf 0x00000001
+   (Basic CPUID Information) to derive the topology information.
+
+
 System topology examples
 ========================
 

From dd86b69d20fb9fa7e941ed01ff05f1e662fcc3ff Mon Sep 17 00:00:00 2001
From: Babu Moger
Date: Tue, 16 Sep 2025 12:25:49 -0500
Subject: [PATCH 1992/3206] fs/resctrl: Fix counter auto-assignment on mkdir
 with mbm_event enabled

rdt_resource::resctrl_mon::mbm_assign_on_mkdir determines if a counter
will automatically be assigned to an RMID, MBM event pair when its
associated monitor group is created via mkdir.

Testing shows that counters are always automatically assigned to new
monitor groups, whether mbm_assign_on_mkdir is set or not.

To support automatic counter assignment, the check for
mbm_assign_on_mkdir should be in rdtgroup_assign_cntrs(), which assigns
counters during monitor group creation. Instead, the check for
mbm_assign_on_mkdir is in rdtgroup_unassign_cntrs(), which is called on
monitor group deletion, from where counters should always be
unassigned, whether mbm_assign_on_mkdir is set or not.

Fix automatic counter assignment by moving the mbm_assign_on_mkdir
check from rdtgroup_unassign_cntrs() to rdtgroup_assign_cntrs().

[ bp: Replace commit message with Reinette's version.
]
Fixes: ef712fe97ec57 ("fs/resctrl: Auto assign counters on mkdir and clean up on group removal")
Signed-off-by: Babu Moger
Signed-off-by: Borislav Petkov (AMD)
Acked-by: Reinette Chatre
---
 fs/resctrl/monitor.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index 50c24460d992ab..4076336fbba6db 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -1200,7 +1200,8 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp)
 {
 	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
 
-	if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r))
+	if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) ||
+	    !r->mon.mbm_assign_on_mkdir)
 		return;
 
 	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
@@ -1258,8 +1259,7 @@ void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp)
 {
 	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
 
-	if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) ||
-	    !r->mon.mbm_assign_on_mkdir)
+	if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r))
 		return;
 
 	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))

From 6fffa38c4c4427470f832e59b7380b992e677e36 Mon Sep 17 00:00:00 2001
From: Avadhut Naik
Date: Tue, 16 Sep 2025 20:30:16 +0000
Subject: [PATCH 1993/3206] EDAC/amd64: Add support for AMD family 1Ah-based
 newer models

Add support for family 1Ah-based models 50h-57h, 90h-9Fh, A0h-AFh, and
C0h-C7h. Also, raise the maximum number of memory controllers, as those
machines support that many.

Signed-off-by: Avadhut Naik
Signed-off-by: Borislav Petkov (AMD)
Link: https://lore.kernel.org/20250916203242.1281036-1-avadhut.naik@amd.com
---
 drivers/edac/amd64_edac.c | 20 ++++++++++++++++++++
 drivers/edac/amd64_edac.h |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 07f1e9dc1ca71b..2f6ab783bf2097 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -3923,6 +3923,26 @@ static int per_family_init(struct amd64_pvt *pvt)
 			pvt->ctl_name	= "F1Ah_M40h";
 			pvt->flags.zn_regs_v2 = 1;
 			break;
+		case 0x50 ... 0x57:
+			pvt->ctl_name	= "F1Ah_M50h";
+			pvt->max_mcs	= 16;
+			pvt->flags.zn_regs_v2 = 1;
+			break;
+		case 0x90 ... 0x9f:
+			pvt->ctl_name	= "F1Ah_M90h";
+			pvt->max_mcs	= 8;
+			pvt->flags.zn_regs_v2 = 1;
+			break;
+		case 0xa0 ... 0xaf:
+			pvt->ctl_name	= "F1Ah_MA0h";
+			pvt->max_mcs	= 8;
+			pvt->flags.zn_regs_v2 = 1;
+			break;
+		case 0xc0 ... 0xc7:
+			pvt->ctl_name	= "F1Ah_MC0h";
+			pvt->max_mcs	= 16;
+			pvt->flags.zn_regs_v2 = 1;
+			break;
 		}
 		break;
 
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 17228d07de4c8c..d70b8a8d0b092a 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -96,7 +96,7 @@
 /* Hardware limit on ChipSelect rows per MC and processors per system */
 #define NUM_CHIPSELECTS			8
 #define DRAM_RANGES			8
-#define NUM_CONTROLLERS			12
+#define NUM_CONTROLLERS			16
 
 #define ON true
 #define OFF false

From 6e1c2c6c2c40ce99e0d2633b212f43c702c1a002 Mon Sep 17 00:00:00 2001
From: Avadhut Naik
Date: Tue, 16 Sep 2025 20:30:17 +0000
Subject: [PATCH 1994/3206] EDAC/mc_sysfs: Increase legacy channel support to 16

Newer AMD systems can support up to 16 channels per EDAC "mc" device.
These are detected by the EDAC module running on the device, and the
current EDAC interface is appropriately enumerated.

The legacy EDAC sysfs interface, however, provides device attributes
for channels 0 through 11 only.
Consequently, the last four channels, 12 through 15, will not be
enumerated and will not be visible through the legacy sysfs interface.

Add additional device attributes to ensure that all 16 channels, if
present, are enumerated by and visible through the legacy EDAC sysfs
interface.

Signed-off-by: Avadhut Naik
Signed-off-by: Borislav Petkov (AMD)
Link: https://lore.kernel.org/20250916203242.1281036-1-avadhut.naik@amd.com
---
 drivers/edac/edac_mc_sysfs.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 0f338adf7d9376..8689631f190536 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -305,6 +305,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR,
 	channel_dimm_label_show, channel_dimm_label_store, 10);
 DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR,
 	channel_dimm_label_show, channel_dimm_label_store, 11);
+DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR,
+	channel_dimm_label_show, channel_dimm_label_store, 12);
+DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR,
+	channel_dimm_label_show, channel_dimm_label_store, 13);
+DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR,
+	channel_dimm_label_show, channel_dimm_label_store, 14);
+DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR,
+	channel_dimm_label_show, channel_dimm_label_store, 15);
 
 /* Total possible dynamic DIMM Label attribute file table */
 static struct attribute *dynamic_csrow_dimm_attr[] = {
@@ -320,6 +328,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = {
 	&dev_attr_legacy_ch9_dimm_label.attr.attr,
 	&dev_attr_legacy_ch10_dimm_label.attr.attr,
 	&dev_attr_legacy_ch11_dimm_label.attr.attr,
+	&dev_attr_legacy_ch12_dimm_label.attr.attr,
+	&dev_attr_legacy_ch13_dimm_label.attr.attr,
+	&dev_attr_legacy_ch14_dimm_label.attr.attr,
+	&dev_attr_legacy_ch15_dimm_label.attr.attr,
 	NULL
 };
 
@@ -348,6 +360,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO,
 	channel_ce_count_show, NULL, 10);
 DEVICE_CHANNEL(ch11_ce_count, S_IRUGO,
 	channel_ce_count_show, NULL, 11);
+DEVICE_CHANNEL(ch12_ce_count, S_IRUGO,
+	channel_ce_count_show, NULL, 12);
+DEVICE_CHANNEL(ch13_ce_count, S_IRUGO,
+	channel_ce_count_show, NULL, 13);
+DEVICE_CHANNEL(ch14_ce_count, S_IRUGO,
+	channel_ce_count_show, NULL, 14);
+DEVICE_CHANNEL(ch15_ce_count, S_IRUGO,
+	channel_ce_count_show, NULL, 15);
 
 /* Total possible dynamic ce_count attribute file table */
 static struct attribute *dynamic_csrow_ce_count_attr[] = {
@@ -363,6 +383,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = {
 	&dev_attr_legacy_ch9_ce_count.attr.attr,
 	&dev_attr_legacy_ch10_ce_count.attr.attr,
 	&dev_attr_legacy_ch11_ce_count.attr.attr,
+	&dev_attr_legacy_ch12_ce_count.attr.attr,
+	&dev_attr_legacy_ch13_ce_count.attr.attr,
+	&dev_attr_legacy_ch14_ce_count.attr.attr,
+	&dev_attr_legacy_ch15_ce_count.attr.attr,
 	NULL
 };

From e4c00c4ce2aafe61dc7436e763a78d6d112d9e2f Mon Sep 17 00:00:00 2001
From: Ashish Kalra
Date: Tue, 16 Sep 2025 21:29:04 +0000
Subject: [PATCH 1995/3206] x86/sev: Add new dump_rmp parameter to
 snp_leak_pages() API

When leaking certain page types, such as Hypervisor Fixed (HV_FIXED)
pages, it does not make sense to dump RMP contents for the 2MB range of
the page(s) being leaked. In the case of HV_FIXED pages, this is not an
error situation where the surrounding 2MB page RMP entries can provide
debug information.

Add a new __snp_leak_pages() API with a dump_rmp bool parameter to
support continuing to add pages to the snp_leaked_pages_list without
issuing dump_rmpentry().
Make snp_leak_pages() a wrapper for the common case, which also allows
existing users to continue to dump RMP entries.

Suggested-by: Thomas Lendacky
Suggested-by: Sean Christopherson
Signed-off-by: Ashish Kalra
Signed-off-by: Borislav Petkov (AMD)
Reviewed-by: Tom Lendacky
Acked-by: Borislav Petkov (AMD)
Link: https://lore.kernel.org/cover.1758057691.git.ashish.kalra@amd.com
---
 arch/x86/include/asm/sev.h | 7 ++++++-
 arch/x86/virt/svm/sev.c    | 7 ++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 44dae74722467c..f9046c4b9a2b06 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -655,9 +655,13 @@ void snp_dump_hva_rmpentry(unsigned long address);
 int psmash(u64 pfn);
 int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable);
 int rmp_make_shared(u64 pfn, enum pg_level level);
-void snp_leak_pages(u64 pfn, unsigned int npages);
+void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp);
 void kdump_sev_callback(void);
 void snp_fixup_e820_tables(void);
+static inline void snp_leak_pages(u64 pfn, unsigned int pages)
+{
+	__snp_leak_pages(pfn, pages, true);
+}
 #else
 static inline bool snp_probe_rmptable_info(void) { return false; }
 static inline int snp_rmptable_init(void) { return -ENOSYS; }
@@ -670,6 +674,7 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as
 	return -ENODEV;
 }
 static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; }
+static inline void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) {}
 static inline void snp_leak_pages(u64 pfn, unsigned int npages) {}
 static inline void kdump_sev_callback(void) { }
 static inline void snp_fixup_e820_tables(void) {}
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 942372e69b4dd3..ee643a6cd69165 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -1029,7 +1029,7 @@ int rmp_make_shared(u64 pfn, enum pg_level level)
 }
 EXPORT_SYMBOL_GPL(rmp_make_shared);
 
-void snp_leak_pages(u64 pfn, unsigned int npages)
+void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp)
 {
 	struct page *page = pfn_to_page(pfn);
 
@@ -1052,14 +1052,15 @@ void snp_leak_pages(u64 pfn, unsigned int npages)
 		    (PageHead(page) && compound_nr(page) <= npages))
 			list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
 
-		dump_rmpentry(pfn);
+		if (dump_rmp)
+			dump_rmpentry(pfn);
 		snp_nr_leaked_pages++;
 		pfn++;
 		page++;
 	}
 	spin_unlock(&snp_leaked_pages_list_lock);
 }
-EXPORT_SYMBOL_GPL(snp_leak_pages);
+EXPORT_SYMBOL_GPL(__snp_leak_pages);
 
 void kdump_sev_callback(void)
 {

From e09701dcdd9ca06be249091eeb786d57e67b613e Mon Sep 17 00:00:00 2001
From: Ashish Kalra
Date: Tue, 16 Sep 2025 21:29:33 +0000
Subject: [PATCH 1996/3206] crypto: ccp - Add new HV-Fixed page
 allocation/free API

When SEV-SNP is active, the TEE extended command header page and all
output buffers for TEE extended commands (such as used by Seamless
Firmware servicing support) must be in hypervisor-fixed state, assigned
to the hypervisor and marked immutable in the RMP entries.

Add a new generic SEV API interface to allocate/free hypervisor-fixed
pages, which abstracts hypervisor-fixed page allocation/free for PSP
sub-devices. The API internally uses SNP_INIT_EX to transition pages to
HV-Fixed page state.

If SNP is not enabled, the allocator is simply a wrapper over
alloc_pages() and __free_pages().
When the sub-device frees the pages, they are put on a free list and
future allocation requests will try to re-use the freed pages from this
list. But this list is not preserved across PSP driver load/unload,
hence free/re-use is only supported while the PSP driver is loaded.

As the HV_FIXED page state can only be changed at reboot, these pages
are leaked: they cannot be returned to the page allocator, since they
could then be allocated to guests, which would cause SEV-SNP guests to
fail to start or to terminate when accessing the HV_FIXED page.

Suggested-by: Thomas Lendacky
Signed-off-by: Ashish Kalra
Signed-off-by: Borislav Petkov (AMD)
Reviewed-by: Tom Lendacky
Acked-by: Herbert Xu
Link: https://lore.kernel.org/cover.1758057691.git.ashish.kalra@amd.com
---
 drivers/crypto/ccp/sev-dev.c | 182 +++++++++++++++++++++++++++++++++++
 drivers/crypto/ccp/sev-dev.h |   3 +
 2 files changed, 185 insertions(+)

diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index e058ba02779296..f7b9c6547e18f4 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -82,6 +82,21 @@ MODULE_FIRMWARE("amd/amd_sev_fam19h_model1xh.sbin"); /* 4th gen EPYC */
 static bool psp_dead;
 static int psp_timeout;
 
+enum snp_hv_fixed_pages_state {
+	ALLOCATED,
+	HV_FIXED,
+};
+
+struct snp_hv_fixed_pages_entry {
+	struct list_head list;
+	struct page *page;
+	unsigned int order;
+	bool free;
+	enum snp_hv_fixed_pages_state page_state;
+};
+
+static LIST_HEAD(snp_hv_fixed_pages);
+
 /* Trusted Memory Region (TMR):
  *   The TMR is a 1MB area that must be 1MB aligned. Use the page allocator
  *   to allocate the memory, which will return aligned memory for the specified
@@ -1073,6 +1088,165 @@ static void snp_set_hsave_pa(void *arg)
 	wrmsrq(MSR_VM_HSAVE_PA, 0);
 }
 
+/* Hypervisor Fixed pages API interface */
+static void snp_hv_fixed_pages_state_update(struct sev_device *sev,
+					    enum snp_hv_fixed_pages_state page_state)
+{
+	struct snp_hv_fixed_pages_entry *entry;
+
+	/* List is protected by sev_cmd_mutex */
+	lockdep_assert_held(&sev_cmd_mutex);
+
+	if (list_empty(&snp_hv_fixed_pages))
+		return;
+
+	list_for_each_entry(entry, &snp_hv_fixed_pages, list)
+		entry->page_state = page_state;
+}
+
+/*
+ * Allocate HV_FIXED pages in 2MB aligned sizes to ensure the whole
+ * 2MB pages are marked as HV_FIXED.
+ */
+struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages)
+{
+	struct psp_device *psp_master = psp_get_master_device();
+	struct snp_hv_fixed_pages_entry *entry;
+	struct sev_device *sev;
+	unsigned int order;
+	struct page *page;
+
+	if (!psp_master || !psp_master->sev_data)
+		return NULL;
+
+	sev = psp_master->sev_data;
+
+	order = get_order(PMD_SIZE * num_2mb_pages);
+
+	/*
+	 * SNP_INIT_EX is protected by sev_cmd_mutex, therefore this list
+	 * also needs to be protected using the same mutex.
+	 */
+	guard(mutex)(&sev_cmd_mutex);
+
+	/*
+	 * This API uses SNP_INIT_EX to transition allocated pages to HV_Fixed
+	 * page state, fail if SNP is already initialized.
+ */ + if (sev->snp_initialized) + return NULL; + + /* Re-use freed pages that match the request */ + list_for_each_entry(entry, &snp_hv_fixed_pages, list) { + /* Hypervisor fixed page allocator implements exact fit policy */ + if (entry->order == order && entry->free) { + entry->free = false; + memset(page_address(entry->page), 0, + (1 << entry->order) * PAGE_SIZE); + return entry->page; + } + } + + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!page) + return NULL; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + __free_pages(page, order); + return NULL; + } + + entry->page = page; + entry->order = order; + list_add_tail(&entry->list, &snp_hv_fixed_pages); + + return page; +} + +void snp_free_hv_fixed_pages(struct page *page) +{ + struct psp_device *psp_master = psp_get_master_device(); + struct snp_hv_fixed_pages_entry *entry, *nentry; + + if (!psp_master || !psp_master->sev_data) + return; + + /* + * SNP_INIT_EX is protected by sev_cmd_mutex, therefore this list + * also needs to be protected using the same mutex. + */ + guard(mutex)(&sev_cmd_mutex); + + list_for_each_entry_safe(entry, nentry, &snp_hv_fixed_pages, list) { + if (entry->page != page) + continue; + + /* + * HV_FIXED page state cannot be changed until reboot + * and they cannot be used by an SNP guest, so they cannot + * be returned back to the page allocator. + * Mark the pages as free internally to allow possible re-use. + */ + if (entry->page_state == HV_FIXED) { + entry->free = true; + } else { + __free_pages(page, entry->order); + list_del(&entry->list); + kfree(entry); + } + return; + } +} + +static void snp_add_hv_fixed_pages(struct sev_device *sev, struct sev_data_range_list *range_list) +{ + struct snp_hv_fixed_pages_entry *entry; + struct sev_data_range *range; + int num_elements; + + lockdep_assert_held(&sev_cmd_mutex); + + if (list_empty(&snp_hv_fixed_pages)) + return; + + num_elements = list_count_nodes(&snp_hv_fixed_pages) + + range_list->num_elements; + + /* + * Ensure the list of HV_FIXED pages that will be passed to firmware + * do not exceed the page-sized argument buffer. + */ + if (num_elements * sizeof(*range) + sizeof(*range_list) > PAGE_SIZE) { + dev_warn(sev->dev, "Additional HV_Fixed pages cannot be accommodated, omitting\n"); + return; + } + + range = &range_list->ranges[range_list->num_elements]; + list_for_each_entry(entry, &snp_hv_fixed_pages, list) { + range->base = page_to_pfn(entry->page) << PAGE_SHIFT; + range->page_count = 1 << entry->order; + range++; + } + range_list->num_elements = num_elements; +} + +static void snp_leak_hv_fixed_pages(void) +{ + struct snp_hv_fixed_pages_entry *entry; + + /* List is protected by sev_cmd_mutex */ + lockdep_assert_held(&sev_cmd_mutex); + + if (list_empty(&snp_hv_fixed_pages)) + return; + + list_for_each_entry(entry, &snp_hv_fixed_pages, list) + if (entry->page_state == HV_FIXED) + __snp_leak_pages(page_to_pfn(entry->page), + 1 << entry->order, false); +} + static int snp_filter_reserved_mem_regions(struct resource *rs, void *arg) { struct sev_data_range_list *range_list = arg; @@ -1163,6 +1337,12 @@ static int __sev_snp_init_locked(int *error) return rc; } + /* + * Add HV_Fixed pages from other PSP sub-devices, such as SFS to the + * HV_Fixed page list. 
+ */ + snp_add_hv_fixed_pages(sev, snp_range_list); + memset(&data, 0, sizeof(data)); data.init_rmp = 1; data.list_paddr_en = 1; @@ -1202,6 +1382,7 @@ static int __sev_snp_init_locked(int *error) return rc; } + snp_hv_fixed_pages_state_update(sev, HV_FIXED); sev->snp_initialized = true; dev_dbg(sev->dev, "SEV-SNP firmware initialized\n"); @@ -1784,6 +1965,7 @@ static int __sev_snp_shutdown_locked(int *error, bool panic) return ret; } + snp_leak_hv_fixed_pages(); sev->snp_initialized = false; dev_dbg(sev->dev, "SEV-SNP firmware shutdown\n"); diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h index 3e4e5574e88a30..28021abc85ad26 100644 --- a/drivers/crypto/ccp/sev-dev.h +++ b/drivers/crypto/ccp/sev-dev.h @@ -65,4 +65,7 @@ void sev_dev_destroy(struct psp_device *psp); void sev_pci_init(void); void sev_pci_exit(void); +struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages); +void snp_free_hv_fixed_pages(struct page *page); + #endif /* __SEV_DEV_H */ From 648dbccc03a000cd64c2a9d86012d98053545e64 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 16 Sep 2025 21:29:49 +0000 Subject: [PATCH 1997/3206] crypto: ccp - Add AMD Seamless Firmware Servicing (SFS) driver AMD Seamless Firmware Servicing (SFS) is a secure method to allow non-persistent updates to running firmware and settings without requiring BIOS reflash and/or system reset. SFS does not address anything that runs on the x86 processors and it can be used to update ASP firmware, modules, register settings and update firmware for other microprocessors like TMPM, etc. SFS driver support adds ioctl support to communicate the SFS commands to the ASP/PSP by using the TEE mailbox interface. The Seamless Firmware Servicing (SFS) driver is added as a PSP sub-device. For detailed information, please look at the SFS specifications: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/58604.pdf Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Acked-by: Herbert Xu Link: https://lore.kernel.org/cover.1758057691.git.ashish.kalra@amd.com --- drivers/crypto/ccp/Makefile | 3 +- drivers/crypto/ccp/psp-dev.c | 20 ++ drivers/crypto/ccp/psp-dev.h | 8 +- drivers/crypto/ccp/sfs.c | 311 ++++++++++++++++++++++++++++ drivers/crypto/ccp/sfs.h | 47 +++++ include/linux/psp-platform-access.h | 2 + include/uapi/linux/psp-sfs.h | 87 ++++++++ 7 files changed, 476 insertions(+), 2 deletions(-) create mode 100644 drivers/crypto/ccp/sfs.c create mode 100644 drivers/crypto/ccp/sfs.h create mode 100644 include/uapi/linux/psp-sfs.h diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 394484929dae3b..a9626b30044aed 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -13,7 +13,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ tee-dev.o \ platform-access.o \ dbc.o \ - hsti.o + hsti.o \ + sfs.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 1c5a7189631eca..9e21da0e298ad7 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -17,6 +17,7 @@ #include "psp-dev.h" #include "sev-dev.h" #include "tee-dev.h" +#include "sfs.h" #include "platform-access.h" #include "dbc.h" #include "hsti.h" @@ -182,6 +183,17 @@ static int psp_check_tee_support(struct psp_device *psp) return 0; } +static int psp_check_sfs_support(struct psp_device *psp) +{ + /* Check if device supports SFS feature */ + 
if (!psp->capability.sfs) { + dev_dbg(psp->dev, "psp does not support SFS\n"); + return -ENODEV; + } + + return 0; +} + static int psp_init(struct psp_device *psp) { int ret; @@ -198,6 +210,12 @@ static int psp_init(struct psp_device *psp) return ret; } + if (!psp_check_sfs_support(psp)) { + ret = sfs_dev_init(psp); + if (ret) + return ret; + } + if (psp->vdata->platform_access) { ret = platform_access_dev_init(psp); if (ret) @@ -302,6 +320,8 @@ void psp_dev_destroy(struct sp_device *sp) tee_dev_destroy(psp); + sfs_dev_destroy(psp); + dbc_dev_destroy(psp); platform_access_dev_destroy(psp); diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index e43ce87ede7690..268c83f298cb0d 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -32,7 +32,8 @@ union psp_cap_register { unsigned int sev :1, tee :1, dbc_thru_ext :1, - rsvd1 :4, + sfs :1, + rsvd1 :3, security_reporting :1, fused_part :1, rsvd2 :1, @@ -68,6 +69,7 @@ struct psp_device { void *tee_data; void *platform_access_data; void *dbc_data; + void *sfs_data; union psp_cap_register capability; }; @@ -118,12 +120,16 @@ struct psp_ext_request { * @PSP_SUB_CMD_DBC_SET_UID: Set UID for DBC * @PSP_SUB_CMD_DBC_GET_PARAMETER: Get parameter from DBC * @PSP_SUB_CMD_DBC_SET_PARAMETER: Set parameter for DBC + * @PSP_SUB_CMD_SFS_GET_FW_VERS: Get firmware versions for ASP and other MP + * @PSP_SUB_CMD_SFS_UPDATE: Command to load, verify and execute SFS package */ enum psp_sub_cmd { PSP_SUB_CMD_DBC_GET_NONCE = PSP_DYNAMIC_BOOST_GET_NONCE, PSP_SUB_CMD_DBC_SET_UID = PSP_DYNAMIC_BOOST_SET_UID, PSP_SUB_CMD_DBC_GET_PARAMETER = PSP_DYNAMIC_BOOST_GET_PARAMETER, PSP_SUB_CMD_DBC_SET_PARAMETER = PSP_DYNAMIC_BOOST_SET_PARAMETER, + PSP_SUB_CMD_SFS_GET_FW_VERS = PSP_SFS_GET_FW_VERSIONS, + PSP_SUB_CMD_SFS_UPDATE = PSP_SFS_UPDATE, }; int psp_extended_mailbox_cmd(struct psp_device *psp, unsigned int timeout_msecs, diff --git a/drivers/crypto/ccp/sfs.c b/drivers/crypto/ccp/sfs.c new file mode 100644 index 00000000000000..2f4beaafe7ec67 --- /dev/null +++ b/drivers/crypto/ccp/sfs.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure Processor Seamless Firmware Servicing support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#include + +#include "sfs.h" +#include "sev-dev.h" + +#define SFS_DEFAULT_TIMEOUT (10 * MSEC_PER_SEC) +#define SFS_MAX_PAYLOAD_SIZE (2 * 1024 * 1024) +#define SFS_NUM_2MB_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PMD_SIZE) +#define SFS_NUM_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PAGE_SIZE) + +static DEFINE_MUTEX(sfs_ioctl_mutex); + +static struct sfs_misc_dev *misc_dev; + +static int send_sfs_cmd(struct sfs_device *sfs_dev, int msg) +{ + int ret; + + sfs_dev->command_buf->hdr.status = 0; + sfs_dev->command_buf->hdr.sub_cmd_id = msg; + + ret = psp_extended_mailbox_cmd(sfs_dev->psp, + SFS_DEFAULT_TIMEOUT, + (struct psp_ext_request *)sfs_dev->command_buf); + if (ret == -EIO) { + dev_dbg(sfs_dev->dev, + "msg 0x%x failed with PSP error: 0x%x, extended status: 0x%x\n", + msg, sfs_dev->command_buf->hdr.status, + *(u32 *)sfs_dev->command_buf->buf); + } + + return ret; +} + +static int send_sfs_get_fw_versions(struct sfs_device *sfs_dev) +{ + /* + * SFS_GET_FW_VERSIONS command needs the output buffer to be + * initialized to 0xC7 in every byte. 
+ */ + memset(sfs_dev->command_buf->sfs_buffer, 0xc7, PAGE_SIZE); + sfs_dev->command_buf->hdr.payload_size = 2 * PAGE_SIZE; + + return send_sfs_cmd(sfs_dev, PSP_SFS_GET_FW_VERSIONS); +} + +static int send_sfs_update_package(struct sfs_device *sfs_dev, const char *payload_name) +{ + char payload_path[PAYLOAD_NAME_SIZE + sizeof("amd/")]; + const struct firmware *firmware; + unsigned long package_size; + int ret; + + /* Sanitize userspace provided payload name */ + if (!strnchr(payload_name, PAYLOAD_NAME_SIZE, '\0')) + return -EINVAL; + + snprintf(payload_path, sizeof(payload_path), "amd/%s", payload_name); + + ret = firmware_request_nowarn(&firmware, payload_path, sfs_dev->dev); + if (ret < 0) { + dev_warn_ratelimited(sfs_dev->dev, "firmware request failed for %s (%d)\n", + payload_path, ret); + return -ENOENT; + } + + /* + * SFS Update Package command's input buffer contains TEE_EXT_CMD_BUFFER + * followed by the Update Package and it should be 64KB aligned. + */ + package_size = ALIGN(firmware->size + PAGE_SIZE, 0x10000U); + + /* + * SFS command buffer is a pre-allocated 2MB buffer, fail update package + * if SFS payload is larger than the pre-allocated command buffer. + */ + if (package_size > SFS_MAX_PAYLOAD_SIZE) { + dev_warn_ratelimited(sfs_dev->dev, + "SFS payload size %ld larger than maximum supported payload size of %u\n", + package_size, SFS_MAX_PAYLOAD_SIZE); + release_firmware(firmware); + return -E2BIG; + } + + /* + * Copy firmware data to a HV_Fixed memory region. + */ + memcpy(sfs_dev->command_buf->sfs_buffer, firmware->data, firmware->size); + sfs_dev->command_buf->hdr.payload_size = package_size; + + release_firmware(firmware); + + return send_sfs_cmd(sfs_dev, PSP_SFS_UPDATE); +} + +static long sfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct sfs_user_get_fw_versions __user *sfs_get_fw_versions; + struct sfs_user_update_package __user *sfs_update_package; + struct psp_device *psp_master = psp_get_master_device(); + char payload_name[PAYLOAD_NAME_SIZE]; + struct sfs_device *sfs_dev; + int ret = 0; + + if (!psp_master || !psp_master->sfs_data) + return -ENODEV; + + sfs_dev = psp_master->sfs_data; + + guard(mutex)(&sfs_ioctl_mutex); + + switch (cmd) { + case SFSIOCFWVERS: + dev_dbg(sfs_dev->dev, "in SFSIOCFWVERS\n"); + + sfs_get_fw_versions = (struct sfs_user_get_fw_versions __user *)arg; + + ret = send_sfs_get_fw_versions(sfs_dev); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_get_fw_versions->blob, sfs_dev->command_buf->sfs_buffer, + PAGE_SIZE)) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_get_fw_versions->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_get_fw_versions->sfs_extended_status))) + return -EFAULT; + break; + case SFSIOCUPDATEPKG: + dev_dbg(sfs_dev->dev, "in SFSIOCUPDATEPKG\n"); + + sfs_update_package = (struct sfs_user_update_package __user *)arg; + + if (copy_from_user(payload_name, sfs_update_package->payload_name, + PAYLOAD_NAME_SIZE)) + return -EFAULT; + + ret = send_sfs_update_package(sfs_dev, payload_name); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. 
+ */ + if (copy_to_user(&sfs_update_package->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_update_package->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_update_package->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_update_package->sfs_extended_status))) + return -EFAULT; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static const struct file_operations sfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = sfs_ioctl, +}; + +static void sfs_exit(struct kref *ref) +{ + misc_deregister(&misc_dev->misc); + kfree(misc_dev); + misc_dev = NULL; +} + +void sfs_dev_destroy(struct psp_device *psp) +{ + struct sfs_device *sfs_dev = psp->sfs_data; + + if (!sfs_dev) + return; + + /* + * Change SFS command buffer back to the default "Write-Back" type. + */ + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + + snp_free_hv_fixed_pages(sfs_dev->page); + + if (sfs_dev->misc) + kref_put(&misc_dev->refcount, sfs_exit); + + psp->sfs_data = NULL; +} + +/* Based on sev_misc_init() */ +static int sfs_misc_init(struct sfs_device *sfs) +{ + struct device *dev = sfs->dev; + int ret; + + /* + * SFS feature support can be detected on multiple devices but the SFS + * FW commands must be issued on the master. During probe, we do not + * know the master hence we create /dev/sfs on the first device probe. + */ + if (!misc_dev) { + struct miscdevice *misc; + + misc_dev = kzalloc(sizeof(*misc_dev), GFP_KERNEL); + if (!misc_dev) + return -ENOMEM; + + misc = &misc_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = "sfs"; + misc->fops = &sfs_fops; + misc->mode = 0600; + + ret = misc_register(misc); + if (ret) + return ret; + + kref_init(&misc_dev->refcount); + } else { + kref_get(&misc_dev->refcount); + } + + sfs->misc = misc_dev; + dev_dbg(dev, "registered SFS device\n"); + + return 0; +} + +int sfs_dev_init(struct psp_device *psp) +{ + struct device *dev = psp->dev; + struct sfs_device *sfs_dev; + struct page *page; + int ret = -ENOMEM; + + sfs_dev = devm_kzalloc(dev, sizeof(*sfs_dev), GFP_KERNEL); + if (!sfs_dev) + return -ENOMEM; + + /* + * Pre-allocate 2MB command buffer for all SFS commands using + * SNP HV_Fixed page allocator which also transitions the + * SFS command buffer to HV_Fixed page state if SNP is enabled. + */ + page = snp_alloc_hv_fixed_pages(SFS_NUM_2MB_PAGES_CMDBUF); + if (!page) { + dev_dbg(dev, "Command Buffer HV-Fixed page allocation failed\n"); + goto cleanup_dev; + } + sfs_dev->page = page; + sfs_dev->command_buf = page_address(page); + + dev_dbg(dev, "Command buffer 0x%px to be marked as HV_Fixed\n", sfs_dev->command_buf); + + /* + * SFS command buffer must be mapped as non-cacheable. 
+ */ + ret = set_memory_uc((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + if (ret) { + dev_dbg(dev, "Set memory uc failed\n"); + goto cleanup_cmd_buf; + } + + dev_dbg(dev, "Command buffer 0x%px marked uncacheable\n", sfs_dev->command_buf); + + psp->sfs_data = sfs_dev; + sfs_dev->dev = dev; + sfs_dev->psp = psp; + + ret = sfs_misc_init(sfs_dev); + if (ret) + goto cleanup_mem_attr; + + dev_notice(sfs_dev->dev, "SFS support is available\n"); + + return 0; + +cleanup_mem_attr: + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + +cleanup_cmd_buf: + snp_free_hv_fixed_pages(page); + +cleanup_dev: + psp->sfs_data = NULL; + devm_kfree(dev, sfs_dev); + + return ret; +} diff --git a/drivers/crypto/ccp/sfs.h b/drivers/crypto/ccp/sfs.h new file mode 100644 index 00000000000000..97704c210efdd2 --- /dev/null +++ b/drivers/crypto/ccp/sfs.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Platform Security Processor (PSP) Seamless Firmware (SFS) Support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#ifndef __SFS_H__ +#define __SFS_H__ + +#include + +#include +#include +#include +#include +#include + +#include "psp-dev.h" + +struct sfs_misc_dev { + struct kref refcount; + struct miscdevice misc; +}; + +struct sfs_command { + struct psp_ext_req_buffer_hdr hdr; + u8 buf[PAGE_SIZE - sizeof(struct psp_ext_req_buffer_hdr)]; + u8 sfs_buffer[]; +} __packed; + +struct sfs_device { + struct device *dev; + struct psp_device *psp; + + struct page *page; + struct sfs_command *command_buf; + + struct sfs_misc_dev *misc; +}; + +void sfs_dev_destroy(struct psp_device *psp); +int sfs_dev_init(struct psp_device *psp); + +#endif /* __SFS_H__ */ diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index 1504fb012c05b8..540abf7de04886 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -7,6 +7,8 @@ enum psp_platform_access_msg { PSP_CMD_NONE = 0x0, + PSP_SFS_GET_FW_VERSIONS, + PSP_SFS_UPDATE, PSP_CMD_HSTI_QUERY = 0x14, PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, diff --git a/include/uapi/linux/psp-sfs.h b/include/uapi/linux/psp-sfs.h new file mode 100644 index 00000000000000..94e51670383c81 --- /dev/null +++ b/include/uapi/linux/psp-sfs.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Userspace interface for AMD Seamless Firmware Servicing (SFS) + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#ifndef __PSP_SFS_USER_H__ +#define __PSP_SFS_USER_H__ + +#include + +/** + * SFS: AMD Seamless Firmware Support (SFS) interface + */ + +#define PAYLOAD_NAME_SIZE 64 +#define TEE_EXT_CMD_BUFFER_SIZE 4096 + +/** + * struct sfs_user_get_fw_versions - get current level of base firmware (output). + * @blob: current level of base firmware for ASP and patch levels (input/output). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_get_fw_versions { + __u8 blob[TEE_EXT_CMD_BUFFER_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * struct sfs_user_update_package - update SFS package (input). + * @payload_name: name of SFS package to load, verify and execute (input). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). 
+ */
+struct sfs_user_update_package {
+	char payload_name[PAYLOAD_NAME_SIZE];
+	__u32 sfs_status;
+	__u32 sfs_extended_status;
+} __packed;
+
+/**
+ * Seamless Firmware Servicing (SFS) IOC
+ *
+ * possible return codes for all SFS IOCTLs:
+ *  0:          success
+ *  -EINVAL:    invalid input
+ *  -E2BIG:     excess data passed
+ *  -EFAULT:    failed to copy to/from userspace
+ *  -EBUSY:     mailbox in recovery or in use
+ *  -ENODEV:    driver not bound with PSP device
+ *  -EACCES:    request isn't authorized
+ *  -EINVAL:    invalid parameter
+ *  -ETIMEDOUT: request timed out
+ *  -EAGAIN:    invalid request for state machine
+ *  -ENOENT:    not implemented
+ *  -ENFILE:    overflow
+ *  -EPERM:     invalid signature
+ *  -EIO:       PSP I/O error
+ */
+#define SFS_IOC_TYPE	'S'
+
+/**
+ * SFSIOCFWVERS - returns blob containing FW versions
+ *                ASP provides the current level of Base Firmware for the ASP
+ *                and the other microprocessors as well as current patch
+ *                level(s).
+ */
+#define SFSIOCFWVERS	_IOWR(SFS_IOC_TYPE, 0x1, struct sfs_user_get_fw_versions)
+
+/**
+ * SFSIOCUPDATEPKG - updates package/payload
+ *                   ASP loads, verifies and executes the SFS package.
+ *                   By default, the SFS package/payload is loaded from
+ *                   /lib/firmware/amd, but alternative firmware loading
+ *                   path can be specified using kernel parameter
+ *                   firmware_class.path or the firmware loading path
+ *                   can be customized using sysfs file:
+ *                   /sys/module/firmware_class/parameters/path.
+ */
+#define SFSIOCUPDATEPKG	_IOWR(SFS_IOC_TYPE, 0x2, struct sfs_user_update_package)
+
+#endif /* __PSP_SFS_USER_H__ */

From b6f56a44e4c1014b08859dcf04ed246500e310e5 Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Sat, 13 Sep 2025 13:35:15 +0200
Subject: [PATCH 1998/3206] net: rfkill: gpio: Fix crash due to dereferencing
 uninitialized pointer

Since commit 7d5e9737efda ("net: rfkill: gpio: get the name and type
from device property") rfkill_find_type() gets called with the possibly
uninitialized "const char *type_name;" local variable.

On x86 systems when rfkill-gpio binds to a "BCM4752" or "LNV4752"
acpi_device, the rfkill->type is set based on the ACPI acpi_device_id:

  rfkill->type = (unsigned)id->driver_data;

and there is no "type" property so device_property_read_string() will
fail and leave type_name uninitialized, leading to a potential crash.

rfkill_find_type() does accept a NULL pointer; fix the potential crash
by initializing type_name to NULL.

Note: likely this has not been caught so far because:

1. Not many x86 machines actually have a "BCM4752"/"LNV4752" acpi_device
2.
The stack happened to contain NULL where type_name is stored Fixes: 7d5e9737efda ("net: rfkill: gpio: get the name and type from device property") Cc: stable@vger.kernel.org Cc: Heikki Krogerus Signed-off-by: Hans de Goede Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20250913113515.21698-1-hansg@kernel.org Signed-off-by: Johannes Berg --- net/rfkill/rfkill-gpio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 41e657e977618a..cf2dcec6ce5afc 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -94,10 +94,10 @@ static const struct dmi_system_id rfkill_gpio_deny_table[] = { static int rfkill_gpio_probe(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill; - struct gpio_desc *gpio; + const char *type_name = NULL; const char *name_property; const char *type_property; - const char *type_name; + struct gpio_desc *gpio; int ret; if (dmi_check_system(rfkill_gpio_deny_table)) From a404d099554d17206d1f283c9a91f0616324f691 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Sun, 14 Sep 2025 03:19:19 +0000 Subject: [PATCH 1999/3206] rust: pci: fix incorrect platform reference in PCI driver unbind doc comment Substitute 'platform' with 'pci'. Fixes: 18ebb25dfa18 ("rust: pci: implement Driver::unbind()") Cc: stable@kernel.org Signed-off-by: Rahul Rameshbabu Signed-off-by: Danilo Krummrich --- rust/kernel/pci.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 78271bf88ceafc..b80e1321696011 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -274,7 +274,7 @@ pub trait Driver: Send { /// Implementers should attempt to initialize the device here. fn probe(dev: &Device, id_info: &Self::IdInfo) -> Result>>; - /// Platform driver unbind. + /// PCI driver unbind. /// /// Called when a [`Device`] is unbound from its bound [`Driver`]. Implementing this callback /// is optional. From 855318e7c0c4a3e3014c0469dd5bc93a1c0df30c Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Sun, 14 Sep 2025 03:18:34 +0000 Subject: [PATCH 2000/3206] rust: pci: fix incorrect platform reference in PCI driver probe doc comment Substitute 'platform' with 'pci'. Fixes: 1bd8b6b2c5d3 ("rust: pci: add basic PCI device / driver abstractions") Cc: stable@kernel.org Signed-off-by: Rahul Rameshbabu Signed-off-by: Danilo Krummrich --- rust/kernel/pci.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index b80e1321696011..7fcc5f6022c19f 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -270,8 +270,8 @@ pub trait Driver: Send { /// PCI driver probe. /// - /// Called when a new platform device is added or discovered. - /// Implementers should attempt to initialize the device here. + /// Called when a new pci device is added or discovered. Implementers should + /// attempt to initialize the device here. fn probe(dev: &Device, id_info: &Self::IdInfo) -> Result>>; /// PCI driver unbind. From 1dd28fd86c3fa4e395031dd6f2ba920242107010 Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Wed, 17 Sep 2025 08:11:43 +0000 Subject: [PATCH 2001/3206] ASoC: rt5682s: Adjust SAR ADC button mode to fix noise issue Adjust register settings for SAR adc button detection mode to fix noise issue in headset. 
Signed-off-by: Jack Yu Link: https://patch.msgid.link/766cd1d2dd7a403ba65bb4cc44845f71@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682s.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sound/soc/codecs/rt5682s.c b/sound/soc/codecs/rt5682s.c index 80b921695e7d1c..1d80a4b862e276 100644 --- a/sound/soc/codecs/rt5682s.c +++ b/sound/soc/codecs/rt5682s.c @@ -653,14 +653,15 @@ static void rt5682s_sar_power_mode(struct snd_soc_component *component, int mode switch (mode) { case SAR_PWR_SAVING: snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_3, - RT5682S_CBJ_IN_BUF_MASK, RT5682S_CBJ_IN_BUF_DIS); + RT5682S_CBJ_IN_BUF_MASK, RT5682S_CBJ_IN_BUF_EN); snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_1, - RT5682S_MB1_PATH_MASK | RT5682S_MB2_PATH_MASK, - RT5682S_CTRL_MB1_REG | RT5682S_CTRL_MB2_REG); + RT5682S_MB1_PATH_MASK | RT5682S_MB2_PATH_MASK | + RT5682S_VREF_POW_MASK, RT5682S_CTRL_MB1_FSM | + RT5682S_CTRL_MB2_FSM | RT5682S_VREF_POW_FSM); snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK | RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS | - RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU); + RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU); usleep_range(5000, 5500); snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK, RT5682S_SAR_BUTDET_EN); @@ -688,7 +689,7 @@ static void rt5682s_sar_power_mode(struct snd_soc_component *component, int mode snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK | RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS | - RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU); + RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU); break; default: dev_err(component->dev, "Invalid SAR Power mode: %d\n", mode); @@ -725,7 +726,7 @@ static void rt5682s_disable_push_button_irq(struct snd_soc_component *component) snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK | RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS | - RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU); + RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU); } /** @@ -786,7 +787,7 @@ static int rt5682s_headset_detect(struct snd_soc_component *component, int jack_ jack_type = SND_JACK_HEADSET; snd_soc_component_write(component, RT5682S_SAR_IL_CMD_3, 0x024c); snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_1, - RT5682S_FAST_OFF_MASK, RT5682S_FAST_OFF_EN); + RT5682S_FAST_OFF_MASK, RT5682S_FAST_OFF_DIS); snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_SEL_MB1_2_MASK, val << RT5682S_SAR_SEL_MB1_2_SFT); rt5682s_enable_push_button_irq(component); @@ -966,7 +967,7 @@ static int rt5682s_set_jack_detect(struct snd_soc_component *component, RT5682S_EMB_JD_MASK | RT5682S_DET_TYPE | RT5682S_POL_FAST_OFF_MASK | RT5682S_MIC_CAP_MASK, RT5682S_EMB_JD_EN | RT5682S_DET_TYPE | - RT5682S_POL_FAST_OFF_HIGH | RT5682S_MIC_CAP_HS); + RT5682S_POL_FAST_OFF_LOW | RT5682S_MIC_CAP_HS); regmap_update_bits(rt5682s->regmap, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_POW_MASK, RT5682S_SAR_POW_EN); regmap_update_bits(rt5682s->regmap, RT5682S_GPIO_CTRL_1, From ab63e9910d2d3ea4b8e6c08812258a676defcb9c Mon Sep 17 00:00:00 2001 From: Tim Kuo Date: Wed, 17 Sep 2025 13:58:39 +0800 Subject: [PATCH 2002/3206] spi: mt65xx: add dual and quad mode for standard spi device Mediatek 
SPI hardware natively supports dual and quad modes, and these modes are
already enabled for SPI flash devices under the spi-mem framework in the
MTK SPI controller driver spi-mt65xx. However, other SPI devices, such
as touch panels, are limited to single mode because spi-mt65xx lacks SPI
mode argument parsing from the SPI framework for SPI devices outside the
spi-mem framework.

This patch adds dual and quad mode support for these SPI devices by
introducing a new API, mtk_spi_set_nbit, for SPI mode argument parsing.

Signed-off-by: Tim Kuo
Reviewed-by: AngeloGioacchino Del Regno
Link: https://patch.msgid.link/20250917055839.500615-1-Tim.Kuo@mediatek.com
Signed-off-by: Mark Brown
---
 drivers/spi/spi-mt65xx.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c
index 8a3c00c3af42a7..4b40985af1eaf0 100644
--- a/drivers/spi/spi-mt65xx.c
+++ b/drivers/spi/spi-mt65xx.c
@@ -563,6 +563,22 @@ static void mtk_spi_setup_packet(struct spi_controller *host)
 	writel(reg_val, mdata->base + SPI_CFG1_REG);
 }
 
+inline u32 mtk_spi_set_nbit(u32 nbit)
+{
+	switch (nbit) {
+	default:
+		pr_warn_once("unknown nbit mode %u. Falling back to single mode\n",
+			     nbit);
+		fallthrough;
+	case SPI_NBITS_SINGLE:
+		return 0x0;
+	case SPI_NBITS_DUAL:
+		return 0x1;
+	case SPI_NBITS_QUAD:
+		return 0x2;
+	}
+}
+
 static void mtk_spi_enable_transfer(struct spi_controller *host)
 {
 	u32 cmd;
@@ -729,10 +745,16 @@ static int mtk_spi_transfer_one(struct spi_controller *host,
 
 	/* prepare xfer direction and duplex mode */
 	if (mdata->dev_comp->ipm_design) {
-		if (!xfer->tx_buf || !xfer->rx_buf) {
+		if (xfer->tx_buf && xfer->rx_buf) {
+			reg_val &= ~SPI_CFG3_IPM_HALF_DUPLEX_EN;
+		} else if (xfer->tx_buf) {
+			reg_val |= SPI_CFG3_IPM_HALF_DUPLEX_EN;
+			reg_val &= ~SPI_CFG3_IPM_HALF_DUPLEX_DIR;
+			reg_val |= mtk_spi_set_nbit(xfer->tx_nbits);
+		} else {
 			reg_val |= SPI_CFG3_IPM_HALF_DUPLEX_EN;
-			if (xfer->rx_buf)
-				reg_val |= SPI_CFG3_IPM_HALF_DUPLEX_DIR;
+			reg_val |= SPI_CFG3_IPM_HALF_DUPLEX_DIR;
+			reg_val |= mtk_spi_set_nbit(xfer->rx_nbits);
 		}
 		writel(reg_val, mdata->base + SPI_CFG3_IPM_REG);
 	}

From 7e18682bdbda4df24837dae4cf103b2a6de3d699 Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Wed, 17 Sep 2025 13:12:49 +0200
Subject: [PATCH 2003/3206] ALSA: spi/at73c213: Use guard() for spin locks

Clean up the code using guard() for spin locks.

Merely code refactoring, and no behavior change.
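As a minimal illustration of the conversion pattern (not part of the
patch itself; "chip" and its fields are hypothetical stand-ins), the
scope-based guard from <linux/cleanup.h> drops the lock on every return
path automatically:

	/* before: every exit path must unlock explicitly */
	spin_lock(&chip->lock);
	if (!chip->running) {
		spin_unlock(&chip->lock);
		return -EINVAL;
	}
	chip->state = 1;
	spin_unlock(&chip->lock);
	return 0;

	/* after: the lock is released automatically at scope exit */
	guard(spinlock)(&chip->lock);
	if (!chip->running)
		return -EINVAL;
	chip->state = 1;
	return 0;

scoped_guard(spinlock, &chip->lock) { ... } bounds the critical section
to an explicit block instead, which is what the interrupt handler
conversion below uses.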
Signed-off-by: Takashi Iwai
---
 sound/spi/at73c213.c | 52 ++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/sound/spi/at73c213.c b/sound/spi/at73c213.c
index 0ece7ccbd55fae..0c2394733dc4bb 100644
--- a/sound/spi/at73c213.c
+++ b/sound/spi/at73c213.c
@@ -273,9 +273,8 @@ static int snd_at73c213_pcm_trigger(struct snd_pcm_substream *substream,
 				    int cmd)
 {
 	struct snd_at73c213 *chip = snd_pcm_substream_chip(substream);
-	int retval = 0;
 
-	spin_lock(&chip->lock);
+	guard(spinlock)(&chip->lock);
 
 	switch (cmd) {
 	case SNDRV_PCM_TRIGGER_START:
@@ -288,13 +287,11 @@ static int snd_at73c213_pcm_trigger(struct snd_pcm_substream *substream,
 		break;
 	default:
 		dev_dbg(&chip->spi->dev, "spurious command %x\n", cmd);
-		retval = -EINVAL;
+		return -EINVAL;
 		break;
 	}
 
-	spin_unlock(&chip->lock);
-
-	return retval;
+	return 0;
 }
 
 static snd_pcm_uframes_t
@@ -358,30 +355,29 @@ static irqreturn_t snd_at73c213_interrupt(int irq, void *dev_id)
 	int next_period;
 	int retval = IRQ_NONE;
 
-	spin_lock(&chip->lock);
+	scoped_guard(spinlock, &chip->lock) {
+		block_size = frames_to_bytes(runtime, runtime->period_size);
+		status = ssc_readl(chip->ssc->regs, IMR);
 
-	block_size = frames_to_bytes(runtime, runtime->period_size);
-	status = ssc_readl(chip->ssc->regs, IMR);
-
-	if (status & SSC_BIT(IMR_ENDTX)) {
-		chip->period++;
-		if (chip->period == runtime->periods)
-			chip->period = 0;
-		next_period = chip->period + 1;
-		if (next_period == runtime->periods)
-			next_period = 0;
-
-		offset = block_size * next_period;
-
-		ssc_writel(chip->ssc->regs, PDC_TNPR,
-				(long)runtime->dma_addr + offset);
-		ssc_writel(chip->ssc->regs, PDC_TNCR,
-				runtime->period_size * runtime->channels);
-		retval = IRQ_HANDLED;
-	}
+		if (status & SSC_BIT(IMR_ENDTX)) {
+			chip->period++;
+			if (chip->period == runtime->periods)
+				chip->period = 0;
+			next_period = chip->period + 1;
+			if (next_period == runtime->periods)
+				next_period = 0;
+
+			offset = block_size * next_period;
 
-	ssc_readl(chip->ssc->regs, IMR);
-	spin_unlock(&chip->lock);
+			ssc_writel(chip->ssc->regs, PDC_TNPR,
+				   (long)runtime->dma_addr + offset);
+			ssc_writel(chip->ssc->regs, PDC_TNCR,
+				   runtime->period_size * runtime->channels);
+			retval = IRQ_HANDLED;
+		}
+
+		ssc_readl(chip->ssc->regs, IMR);
+	}
 
 	if (status & SSC_BIT(IMR_ENDTX))
 		snd_pcm_period_elapsed(chip->substream);

From 1f9fc89cbbe8a7a8648ea2f827f7d8590e62e52c Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Wed, 17 Sep 2025 14:13:27 +0200
Subject: [PATCH 2004/3206] ALSA: seq: Fix KCSAN data-race warning at
 snd_seq_fifo_poll_wait()

snd_seq_fifo_poll_wait() evaluates f->cells without locking after
poll_wait(), and KCSAN doesn't like it as it appears to be a data-race.
Although this doesn't matter much in practice as the value is volatile,
it's still better to address it for peace of mind.

Wrap it with the f->lock spinlock to avoid the potential data race.
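For comparison, a plain data-race annotation would also have silenced
KCSAN without serializing anything -- a sketch of that alternative, not
what this patch does:

	/* marks the lockless read as intentional */
	return READ_ONCE(f->cells) > 0;

Taking f->lock via guard(spinlock_irq) instead also serializes the
check against concurrent updates of f->cells, which is the safer choice
here.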
Reported-by: syzbot+c3dbc239259940ededba@syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?extid=c3dbc239259940ededba
Signed-off-by: Takashi Iwai
---
 sound/core/seq/seq_fifo.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c
index f23c6b7ae2403c..91cce18901114b 100644
--- a/sound/core/seq/seq_fifo.c
+++ b/sound/core/seq/seq_fifo.c
@@ -210,6 +210,7 @@ int snd_seq_fifo_poll_wait(struct snd_seq_fifo *f, struct file *file,
 			   poll_table *wait)
 {
 	poll_wait(file, &f->input_sleep, wait);
+	guard(spinlock_irq)(&f->lock);
 	return (f->cells > 0);
 }

From 44499ecb4f2817743c37d861bdb3e95f37d3d9cd Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Wed, 17 Sep 2025 15:09:01 +0200
Subject: [PATCH 2005/3206] ALSA: usb: qcom: Fix false-positive address space check

The sanity check previously added to uaudio_transfer_buffer_setup() assumed that the allocated buffer is linear-mapped. But the buffer allocated via usb_alloc_coherent() isn't always so; rather, it is meant to be used with the (SG-)DMA API. This led to a false-positive warning, and the driver failed to work.

Actually, uaudio_transfer_buffer_setup() deals only with DMA-API addresses for the MEM_XFER_BUF type, while the other callers of uaudio_iommu_map() pass pages with physical addresses for the MEM_EVENT_RING and MEM_XFER_RING types. So this patch splits the mapping helper into two different functions, uaudio_iommu_map_xfer_buf() for the DMA pages and uaudio_iommu_map_pa() for the physical addresses, in order to handle the mapping differently for each type. Along with it, the unnecessary address check that caused the probe error is dropped, too.

Fixes: 3335a1bbd624 ("ALSA: qc_audio_offload: try to reduce address space confusion")
Suggested-by: Arnd Bergmann
Acked-by: Arnd Bergmann
Reported-and-tested-by: Luca Weiss
Closes: https://lore.kernel.org/DBR2363A95M1.L9XBNC003490@fairphone.com
Signed-off-by: Takashi Iwai
---
 sound/usb/qcom/qc_audio_offload.c | 92 ++++++++++++++++---------------
 1 file changed, 48 insertions(+), 44 deletions(-)

diff --git a/sound/usb/qcom/qc_audio_offload.c b/sound/usb/qcom/qc_audio_offload.c
index a25c5a5316901c..9ad76fff741b8d 100644
--- a/sound/usb/qcom/qc_audio_offload.c
+++ b/sound/usb/qcom/qc_audio_offload.c
@@ -538,38 +538,33 @@ static void uaudio_iommu_unmap(enum mem_type mtype, unsigned long iova,
 		umap_size, iova, mapped_iova_size);
 }

+static int uaudio_iommu_map_prot(bool dma_coherent)
+{
+	int prot = IOMMU_READ | IOMMU_WRITE;
+
+	if (dma_coherent)
+		prot |= IOMMU_CACHE;
+	return prot;
+}
+
 /**
- * uaudio_iommu_map() - maps iommu memory for adsp
+ * uaudio_iommu_map_pa() - maps iommu memory for adsp
 * @mtype: ring type
 * @dma_coherent: dma coherent
 * @pa: physical address for ring/buffer
 * @size: size of memory region
- * @sgt: sg table for memory region
 *
 * Maps the XHCI related resources to a memory region that is assigned to be
 * used by the adsp. This will be mapped to the domain, which is created by
 * the ASoC USB backend driver.
* */ -static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent, - phys_addr_t pa, size_t size, - struct sg_table *sgt) +static unsigned long uaudio_iommu_map_pa(enum mem_type mtype, bool dma_coherent, + phys_addr_t pa, size_t size) { - struct scatterlist *sg; unsigned long iova = 0; - size_t total_len = 0; - unsigned long iova_sg; - phys_addr_t pa_sg; bool map = true; - size_t sg_len; - int prot; - int ret; - int i; - - prot = IOMMU_READ | IOMMU_WRITE; - - if (dma_coherent) - prot |= IOMMU_CACHE; + int prot = uaudio_iommu_map_prot(dma_coherent); switch (mtype) { case MEM_EVENT_RING: @@ -583,20 +578,41 @@ static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent, &uaudio_qdev->xfer_ring_iova_size, &uaudio_qdev->xfer_ring_list, size); break; - case MEM_XFER_BUF: - iova = uaudio_get_iova(&uaudio_qdev->curr_xfer_buf_iova, - &uaudio_qdev->xfer_buf_iova_size, - &uaudio_qdev->xfer_buf_list, size); - break; default: dev_err(uaudio_qdev->data->dev, "unknown mem type %d\n", mtype); } if (!iova || !map) - goto done; + return 0; + + iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, GFP_KERNEL); - if (!sgt) - goto skip_sgt_map; + return iova; +} + +static unsigned long uaudio_iommu_map_xfer_buf(bool dma_coherent, size_t size, + struct sg_table *sgt) +{ + struct scatterlist *sg; + unsigned long iova = 0; + size_t total_len = 0; + unsigned long iova_sg; + phys_addr_t pa_sg; + size_t sg_len; + int prot = uaudio_iommu_map_prot(dma_coherent); + int ret; + int i; + + prot = IOMMU_READ | IOMMU_WRITE; + + if (dma_coherent) + prot |= IOMMU_CACHE; + + iova = uaudio_get_iova(&uaudio_qdev->curr_xfer_buf_iova, + &uaudio_qdev->xfer_buf_iova_size, + &uaudio_qdev->xfer_buf_list, size); + if (!iova) + goto done; iova_sg = iova; for_each_sg(sgt->sgl, sg, sgt->nents, i) { @@ -618,11 +634,6 @@ static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent, uaudio_iommu_unmap(MEM_XFER_BUF, iova, size, total_len); iova = 0; } - return iova; - -skip_sgt_map: - iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, GFP_KERNEL); - done: return iova; } @@ -1020,7 +1031,6 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs, struct sg_table xfer_buf_sgt; dma_addr_t xfer_buf_dma; void *xfer_buf; - phys_addr_t xfer_buf_pa; u32 len = xfer_buf_len; bool dma_coherent; dma_addr_t xfer_buf_dma_sysdev; @@ -1051,18 +1061,12 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs, if (!xfer_buf) return -ENOMEM; - /* Remapping is not possible if xfer_buf is outside of linear map */ - xfer_buf_pa = virt_to_phys(xfer_buf); - if (WARN_ON(!page_is_ram(PFN_DOWN(xfer_buf_pa)))) { - ret = -ENXIO; - goto unmap_sync; - } dma_get_sgtable(subs->dev->bus->sysdev, &xfer_buf_sgt, xfer_buf, xfer_buf_dma, len); /* map the physical buffer into sysdev as well */ - xfer_buf_dma_sysdev = uaudio_iommu_map(MEM_XFER_BUF, dma_coherent, - xfer_buf_pa, len, &xfer_buf_sgt); + xfer_buf_dma_sysdev = uaudio_iommu_map_xfer_buf(dma_coherent, + len, &xfer_buf_sgt); if (!xfer_buf_dma_sysdev) { ret = -ENOMEM; goto unmap_sync; @@ -1143,8 +1147,8 @@ uaudio_endpoint_setup(struct snd_usb_substream *subs, sg_free_table(sgt); /* data transfer ring */ - iova = uaudio_iommu_map(MEM_XFER_RING, dma_coherent, tr_pa, - PAGE_SIZE, NULL); + iova = uaudio_iommu_map_pa(MEM_XFER_RING, dma_coherent, tr_pa, + PAGE_SIZE); if (!iova) { ret = -ENOMEM; goto clear_pa; @@ -1207,8 +1211,8 @@ static int uaudio_event_ring_setup(struct snd_usb_substream *subs, mem_info->dma = 
sg_dma_address(sgt->sgl); sg_free_table(sgt); - iova = uaudio_iommu_map(MEM_EVENT_RING, dma_coherent, er_pa, - PAGE_SIZE, NULL); + iova = uaudio_iommu_map_pa(MEM_EVENT_RING, dma_coherent, er_pa, + PAGE_SIZE); if (!iova) { ret = -ENOMEM; goto clear_pa; From a86556264696b797d94238d99d8284d0d34ed960 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 15 Sep 2025 16:12:40 +0200 Subject: [PATCH 2006/3206] dm-raid: don't set io_min and io_opt for raid1 These commands modprobe brd rd_size=1048576 vgcreate vg /dev/ram* lvcreate -m4 -L10 -n lv vg trigger the following warnings: device-mapper: table: 252:10: adding target device (start sect 0 len 24576) caused an alignment inconsistency device-mapper: table: 252:10: adding target device (start sect 0 len 24576) caused an alignment inconsistency The warnings are caused by the fact that io_min is 512 and physical block size is 4096. If there's chunk-less raid, such as raid1, io_min shouldn't be set to zero because it would be raised to 512 and it would trigger the warning. Signed-off-by: Mikulas Patocka Reviewed-by: Martin K. Petersen Cc: stable@vger.kernel.org --- drivers/md/dm-raid.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 79ea85d18e24e5..f4b904e2432853 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3813,8 +3813,10 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) struct raid_set *rs = ti->private; unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); - limits->io_min = chunk_size_bytes; - limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + if (chunk_size_bytes) { + limits->io_min = chunk_size_bytes; + limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + } } static void raid_presuspend(struct dm_target *ti) From 5bcf9e1d0a5da750ff180e7385b0bd4041686516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Sat, 13 Sep 2025 23:12:48 +0200 Subject: [PATCH 2007/3206] dt-bindings: clock: marvell,pxa1908: Add syscon compatible to apmu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add required syscon compatible and #power-domain-cells to the APMU controller. This is required for the SoC's power domain controller as the registers are shared. Device tree bindings for said power domains are also added. 
Reviewed-by: Rob Herring (Arm) Signed-off-by: Duje Mihanović Signed-off-by: Ulf Hansson --- .../bindings/clock/marvell,pxa1908.yaml | 30 +++++++++++++++---- MAINTAINERS | 1 + .../dt-bindings/power/marvell,pxa1908-power.h | 17 +++++++++++ 3 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 include/dt-bindings/power/marvell,pxa1908-power.h diff --git a/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml b/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml index 4e78933232b6b9..6f3a8578fe2a68 100644 --- a/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml +++ b/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml @@ -19,11 +19,14 @@ description: | properties: compatible: - enum: - - marvell,pxa1908-apbc - - marvell,pxa1908-apbcp - - marvell,pxa1908-mpmu - - marvell,pxa1908-apmu + oneOf: + - enum: + - marvell,pxa1908-apbc + - marvell,pxa1908-apbcp + - marvell,pxa1908-mpmu + - items: + - const: marvell,pxa1908-apmu + - const: syscon reg: maxItems: 1 @@ -31,6 +34,9 @@ properties: '#clock-cells': const: 1 + '#power-domain-cells': + const: 1 + required: - compatible - reg @@ -38,11 +44,23 @@ required: additionalProperties: false +if: + not: + properties: + compatible: + contains: + const: marvell,pxa1908-apmu + +then: + properties: + '#power-domain-cells': false + examples: # APMU block: - | clock-controller@d4282800 { - compatible = "marvell,pxa1908-apmu"; + compatible = "marvell,pxa1908-apmu", "syscon"; reg = <0xd4282800 0x400>; #clock-cells = <1>; + #power-domain-cells = <1>; }; diff --git a/MAINTAINERS b/MAINTAINERS index daf520a13bdf6a..22aa7ce417b8d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2872,6 +2872,7 @@ S: Maintained F: arch/arm64/boot/dts/marvell/mmp/ F: drivers/clk/mmp/clk-pxa1908*.c F: include/dt-bindings/clock/marvell,pxa1908.h +F: include/dt-bindings/power/marvell,pxa1908-power.h ARM/Mediatek RTC DRIVER M: Eddie Huang diff --git a/include/dt-bindings/power/marvell,pxa1908-power.h b/include/dt-bindings/power/marvell,pxa1908-power.h new file mode 100644 index 00000000000000..19b088351af138 --- /dev/null +++ b/include/dt-bindings/power/marvell,pxa1908-power.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +/* + * Marvell PXA1908 power domains + * + * Copyright 2025, Duje Mihanović + */ + +#ifndef __DTS_MARVELL_PXA1908_POWER_H +#define __DTS_MARVELL_PXA1908_POWER_H + +#define PXA1908_POWER_DOMAIN_VPU 0 +#define PXA1908_POWER_DOMAIN_GPU 1 +#define PXA1908_POWER_DOMAIN_GPU2D 2 +#define PXA1908_POWER_DOMAIN_DSI 3 +#define PXA1908_POWER_DOMAIN_ISP 4 + +#endif From 6f51a04551d162f5acbcf6c1998f0527093c93ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Sat, 13 Sep 2025 23:12:49 +0200 Subject: [PATCH 2008/3206] pmdomain: marvell: Add PXA1908 power domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marvell's PXA1908 SoC has a few power domains for its VPU, GPU, image processor and DSI PHY. Add a driver to control these. 
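For context, a rough sketch of how a consumer device ends up using these
domains (hypothetical demo_* names; the genpd core, not the consumer,
invokes the power_on/power_off hooks around runtime-PM transitions):

    #include <linux/module.h>
    #include <linux/platform_device.h>
    #include <linux/pm_runtime.h>

    static int demo_gpu_probe(struct platform_device *pdev)
    {
            int ret;

            /* The device's DT node references the domain, e.g.
             * power-domains = <&apmu PXA1908_POWER_DOMAIN_GPU>;
             * so the domain is attached before probe runs.
             */
            pm_runtime_enable(&pdev->dev);

            ret = pm_runtime_resume_and_get(&pdev->dev); /* domain powers on */
            if (ret < 0) {
                    pm_runtime_disable(&pdev->dev);
                    return ret;
            }

            /* ... touch the hardware while the domain is up ... */

            pm_runtime_put(&pdev->dev); /* domain may power off again */
            return 0;
    }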
Signed-off-by: Duje Mihanović
Signed-off-by: Ulf Hansson
---
 MAINTAINERS | 1 +
 drivers/pmdomain/Kconfig | 1 +
 drivers/pmdomain/Makefile | 1 +
 drivers/pmdomain/marvell/Kconfig | 18 ++
 drivers/pmdomain/marvell/Makefile | 3 +
 .../marvell/pxa1908-power-controller.c | 274 ++++++++++++++++++
 6 files changed, 298 insertions(+)
 create mode 100644 drivers/pmdomain/marvell/Kconfig
 create mode 100644 drivers/pmdomain/marvell/Makefile
 create mode 100644 drivers/pmdomain/marvell/pxa1908-power-controller.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 22aa7ce417b8d3..918df6d74c2a50 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2871,6 +2871,7 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S: Maintained
 F: arch/arm64/boot/dts/marvell/mmp/
 F: drivers/clk/mmp/clk-pxa1908*.c
+F: drivers/pmdomain/marvell/
 F: include/dt-bindings/clock/marvell,pxa1908.h
 F: include/dt-bindings/power/marvell,pxa1908-power.h

diff --git a/drivers/pmdomain/Kconfig b/drivers/pmdomain/Kconfig
index 91f04ace35d4b0..23076ae90e6641 100644
--- a/drivers/pmdomain/Kconfig
+++ b/drivers/pmdomain/Kconfig
@@ -7,6 +7,7 @@ source "drivers/pmdomain/apple/Kconfig"
 source "drivers/pmdomain/arm/Kconfig"
 source "drivers/pmdomain/bcm/Kconfig"
 source "drivers/pmdomain/imx/Kconfig"
+source "drivers/pmdomain/marvell/Kconfig"
 source "drivers/pmdomain/mediatek/Kconfig"
 source "drivers/pmdomain/qcom/Kconfig"
 source "drivers/pmdomain/renesas/Kconfig"

diff --git a/drivers/pmdomain/Makefile b/drivers/pmdomain/Makefile
index 7030f44a49df9e..ebc802f13eb953 100644
--- a/drivers/pmdomain/Makefile
+++ b/drivers/pmdomain/Makefile
@@ -5,6 +5,7 @@ obj-y += apple/
 obj-y += arm/
 obj-y += bcm/
 obj-y += imx/
+obj-y += marvell/
 obj-y += mediatek/
 obj-y += qcom/
 obj-y += renesas/

diff --git a/drivers/pmdomain/marvell/Kconfig b/drivers/pmdomain/marvell/Kconfig
new file mode 100644
index 00000000000000..6c4084c8266702
--- /dev/null
+++ b/drivers/pmdomain/marvell/Kconfig
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Marvell PM Domains"
+	depends on ARCH_MMP || COMPILE_TEST
+
+config PXA1908_PM_DOMAINS
+	tristate "Marvell PXA1908 power domains"
+	depends on OF
+	depends on PM
+	default y if ARCH_MMP && ARM64
+	select AUXILIARY_BUS
+	select MFD_SYSCON
+	select PM_GENERIC_DOMAINS
+	select PM_GENERIC_DOMAINS_OF
+	help
+	  Say Y here to enable support for Marvell PXA1908's power domains.
+ +endmenu diff --git a/drivers/pmdomain/marvell/Makefile b/drivers/pmdomain/marvell/Makefile new file mode 100644 index 00000000000000..22c25013f6c856 --- /dev/null +++ b/drivers/pmdomain/marvell/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_PXA1908_PM_DOMAINS) += pxa1908-power-controller.o diff --git a/drivers/pmdomain/marvell/pxa1908-power-controller.c b/drivers/pmdomain/marvell/pxa1908-power-controller.c new file mode 100644 index 00000000000000..ff5e6e82d3f8df --- /dev/null +++ b/drivers/pmdomain/marvell/pxa1908-power-controller.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Duje Mihanović + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* VPU, GPU, ISP */ +#define APMU_PWR_CTRL_REG 0xd8 +#define APMU_PWR_BLK_TMR_REG 0xdc +#define APMU_PWR_STATUS_REG 0xf0 + +/* DSI */ +#define APMU_DEBUG 0x88 +#define DSI_PHY_DVM_MASK BIT(31) + +#define POWER_ON_LATENCY_US 300 +#define POWER_OFF_LATENCY_US 20 +#define POWER_POLL_TIMEOUT_US (25 * USEC_PER_MSEC) +#define POWER_POLL_SLEEP_US 6 + +#define NR_DOMAINS 5 + +#define to_pxa1908_pd(_genpd) container_of(_genpd, struct pxa1908_pd, genpd) + +struct pxa1908_pd_ctrl { + struct generic_pm_domain *domains[NR_DOMAINS]; + struct genpd_onecell_data onecell_data; + struct regmap *base; + struct device *dev; +}; + +struct pxa1908_pd_data { + u32 reg_clk_res_ctrl; + u32 pwr_state; + u32 hw_mode; + bool keep_on; + int id; +}; + +struct pxa1908_pd { + const struct pxa1908_pd_data data; + struct pxa1908_pd_ctrl *ctrl; + struct generic_pm_domain genpd; + bool initialized; +}; + +static inline bool pxa1908_pd_is_on(struct pxa1908_pd *pd) +{ + struct pxa1908_pd_ctrl *ctrl = pd->ctrl; + + return pd->data.id != PXA1908_POWER_DOMAIN_DSI + ? 
regmap_test_bits(ctrl->base, APMU_PWR_STATUS_REG, pd->data.pwr_state) + : regmap_test_bits(ctrl->base, APMU_DEBUG, DSI_PHY_DVM_MASK); +} + +static int pxa1908_pd_power_on(struct generic_pm_domain *genpd) +{ + struct pxa1908_pd *pd = to_pxa1908_pd(genpd); + const struct pxa1908_pd_data *data = &pd->data; + struct pxa1908_pd_ctrl *ctrl = pd->ctrl; + unsigned int status; + int ret = 0; + + regmap_set_bits(ctrl->base, data->reg_clk_res_ctrl, data->hw_mode); + if (data->id != PXA1908_POWER_DOMAIN_ISP) + regmap_write(ctrl->base, APMU_PWR_BLK_TMR_REG, 0x20001fff); + regmap_set_bits(ctrl->base, APMU_PWR_CTRL_REG, data->pwr_state); + + ret = regmap_read_poll_timeout(ctrl->base, APMU_PWR_STATUS_REG, status, + status & data->pwr_state, POWER_POLL_SLEEP_US, + POWER_ON_LATENCY_US + POWER_POLL_TIMEOUT_US); + if (ret == -ETIMEDOUT) + dev_err(ctrl->dev, "timed out powering on domain '%s'\n", pd->genpd.name); + + return ret; +} + +static int pxa1908_pd_power_off(struct generic_pm_domain *genpd) +{ + struct pxa1908_pd *pd = to_pxa1908_pd(genpd); + const struct pxa1908_pd_data *data = &pd->data; + struct pxa1908_pd_ctrl *ctrl = pd->ctrl; + unsigned int status; + int ret; + + regmap_clear_bits(ctrl->base, APMU_PWR_CTRL_REG, data->pwr_state); + + ret = regmap_read_poll_timeout(ctrl->base, APMU_PWR_STATUS_REG, status, + !(status & data->pwr_state), POWER_POLL_SLEEP_US, + POWER_OFF_LATENCY_US + POWER_POLL_TIMEOUT_US); + if (ret == -ETIMEDOUT) { + dev_err(ctrl->dev, "timed out powering off domain '%s'\n", pd->genpd.name); + return ret; + } + + return regmap_clear_bits(ctrl->base, data->reg_clk_res_ctrl, data->hw_mode); +} + +static inline int pxa1908_dsi_power_on(struct generic_pm_domain *genpd) +{ + struct pxa1908_pd *pd = to_pxa1908_pd(genpd); + struct pxa1908_pd_ctrl *ctrl = pd->ctrl; + + return regmap_set_bits(ctrl->base, APMU_DEBUG, DSI_PHY_DVM_MASK); +} + +static inline int pxa1908_dsi_power_off(struct generic_pm_domain *genpd) +{ + struct pxa1908_pd *pd = to_pxa1908_pd(genpd); + struct pxa1908_pd_ctrl *ctrl = pd->ctrl; + + return regmap_clear_bits(ctrl->base, APMU_DEBUG, DSI_PHY_DVM_MASK); +} + +#define DOMAIN(_id, _name, ctrl, mode, state) \ + [_id] = { \ + .data = { \ + .reg_clk_res_ctrl = ctrl, \ + .hw_mode = BIT(mode), \ + .pwr_state = BIT(state), \ + .id = _id, \ + }, \ + .genpd = { \ + .name = _name, \ + .power_on = pxa1908_pd_power_on, \ + .power_off = pxa1908_pd_power_off, \ + }, \ + } + +static struct pxa1908_pd domains[NR_DOMAINS] = { + DOMAIN(PXA1908_POWER_DOMAIN_VPU, "vpu", 0xa4, 19, 2), + DOMAIN(PXA1908_POWER_DOMAIN_GPU, "gpu", 0xcc, 11, 0), + DOMAIN(PXA1908_POWER_DOMAIN_GPU2D, "gpu2d", 0xf4, 11, 6), + DOMAIN(PXA1908_POWER_DOMAIN_ISP, "isp", 0x38, 15, 4), + [PXA1908_POWER_DOMAIN_DSI] = { + .genpd = { + .name = "dsi", + .power_on = pxa1908_dsi_power_on, + .power_off = pxa1908_dsi_power_off, + /* + * TODO: There is no DSI driver written yet and until then we probably + * don't want to power off the DSI PHY ever. + */ + .flags = GENPD_FLAG_ALWAYS_ON, + }, + .data = { + /* See above. 
*/
+		.keep_on = true,
+	},
+	},
+};
+
+static void pxa1908_pd_remove(struct auxiliary_device *auxdev)
+{
+	struct pxa1908_pd *pd;
+	int ret;
+
+	for (int i = NR_DOMAINS - 1; i >= 0; i--) {
+		pd = &domains[i];
+
+		if (!pd->initialized)
+			continue;
+
+		if (pxa1908_pd_is_on(pd) && !pd->data.keep_on)
+			pxa1908_pd_power_off(&pd->genpd);
+
+		ret = pm_genpd_remove(&pd->genpd);
+		if (ret)
+			dev_err(&auxdev->dev, "failed to remove domain '%s': %d\n",
+				pd->genpd.name, ret);
+	}
+}
+
+static int
+pxa1908_pd_init(struct pxa1908_pd_ctrl *ctrl, int id, struct device *dev)
+{
+	struct pxa1908_pd *pd = &domains[id];
+	int ret;
+
+	ctrl->domains[id] = &pd->genpd;
+
+	pd->ctrl = ctrl;
+
+	/* Make sure the state of the hardware is synced with the domain table above. */
+	if (pd->data.keep_on) {
+		ret = pd->genpd.power_on(&pd->genpd);
+		if (ret)
+			return dev_err_probe(dev, ret, "failed to power on domain '%s'\n",
+					     pd->genpd.name);
+	} else {
+		if (pxa1908_pd_is_on(pd)) {
+			dev_warn(dev,
+				 "domain '%s' is on despite being default off; powering off\n",
+				 pd->genpd.name);
+
+			ret = pd->genpd.power_off(&pd->genpd);
+			if (ret)
+				return dev_err_probe(dev, ret,
+						     "failed to power off domain '%s'\n",
+						     pd->genpd.name);
+		}
+	}
+
+	ret = pm_genpd_init(&pd->genpd, NULL, !pd->data.keep_on);
+	if (ret)
+		return dev_err_probe(dev, ret, "domain '%s' failed to initialize\n",
+				     pd->genpd.name);
+
+	pd->initialized = true;
+
+	return 0;
+}
+
+static int
+pxa1908_pd_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *aux_id)
+{
+	struct pxa1908_pd_ctrl *ctrl;
+	struct device *dev = &auxdev->dev;
+	int ret;
+
+	ctrl = devm_kzalloc(dev, sizeof(*ctrl), GFP_KERNEL);
+	if (!ctrl)
+		return -ENOMEM;
+
+	auxiliary_set_drvdata(auxdev, ctrl);
+
+	ctrl->base = syscon_node_to_regmap(dev->parent->of_node);
+	if (IS_ERR(ctrl->base))
+		return dev_err_probe(dev, PTR_ERR(ctrl->base), "no regmap available\n");
+
+	ctrl->dev = dev;
+	ctrl->onecell_data.domains = ctrl->domains;
+	ctrl->onecell_data.num_domains = NR_DOMAINS;
+
+	for (int i = 0; i < NR_DOMAINS; i++) {
+		ret = pxa1908_pd_init(ctrl, i, dev);
+		if (ret)
+			goto err;
+	}
+
+	return of_genpd_add_provider_onecell(dev->parent->of_node, &ctrl->onecell_data);
+
+err:
+	pxa1908_pd_remove(auxdev);
+	return ret;
+}
+
+static const struct auxiliary_device_id pxa1908_pd_id[] = {
+	{ .name = "clk_pxa1908_apmu.power" },
+	{ }
+};
+MODULE_DEVICE_TABLE(auxiliary, pxa1908_pd_id);
+
+static struct auxiliary_driver pxa1908_pd_driver = {
+	.probe = pxa1908_pd_probe,
+	.remove = pxa1908_pd_remove,
+	.id_table = pxa1908_pd_id,
+};
+module_auxiliary_driver(pxa1908_pd_driver);
+
+MODULE_AUTHOR("Duje Mihanović ");
+MODULE_DESCRIPTION("Marvell PXA1908 power domain driver");
+MODULE_LICENSE("GPL");

From 6fabca2fc94d33cdf7ec102058983b086293395f Mon Sep 17 00:00:00 2001
From: Paul Chaignon
Date: Wed, 17 Sep 2025 10:08:00 +0200
Subject: [PATCH 2009/3206] bpf: Explicitly check accesses to bpf_sock_addr

Syzkaller found a kernel warning on the following sock_addr program:

  0: r0 = 0
  1: r2 = *(u32 *)(r1 +60)
  2: exit

which triggers:

  verifier bug: error during ctx access conversion (0)

This is happening because offset 60 in bpf_sock_addr corresponds to an implicit padding of 4 bytes, right after msg_src_ip6. Access to this padding isn't rejected in sock_addr_is_valid_access and it thus later fails to convert the access.

This patch fixes it by explicitly checking the various fields of bpf_sock_addr in sock_addr_is_valid_access.
I checked the other ctx structures and is_valid_access functions and didn't find any other similar cases. Other cases of (properly handled) padding are covered in new tests in a subsequent patch. Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Reported-by: syzbot+136ca59d411f92e821b7@syzkaller.appspotmail.com Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Daniel Borkmann Closes: https://syzkaller.appspot.com/bug?extid=136ca59d411f92e821b7 Link: https://lore.kernel.org/bpf/b58609d9490649e76e584b0361da0abd3c2c1779.1758094761.git.paul.chaignon@gmail.com --- net/core/filter.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 63f3baee2dafa7..8342f810ad8562 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9284,13 +9284,17 @@ static bool sock_addr_is_valid_access(int off, int size, return false; info->reg_type = PTR_TO_SOCKET; break; - default: - if (type == BPF_READ) { - if (size != size_default) - return false; - } else { + case bpf_ctx_range(struct bpf_sock_addr, user_family): + case bpf_ctx_range(struct bpf_sock_addr, family): + case bpf_ctx_range(struct bpf_sock_addr, type): + case bpf_ctx_range(struct bpf_sock_addr, protocol): + if (type != BPF_READ) return false; - } + if (size != size_default) + return false; + break; + default: + return false; } return true; From 7c60f6e488b7aba224ebe396b04ad54b8e31e168 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 17 Sep 2025 10:09:26 +0200 Subject: [PATCH 2010/3206] selftests/bpf: Move macros to bpf_misc.h Move the sizeof_field and offsetofend macros from individual test files to the common bpf_misc.h to avoid duplication. Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/97a3f3788bd3aec309100bc073a5c77130e371fd.1758094761.git.paul.chaignon@gmail.com --- tools/testing/selftests/bpf/progs/bpf_misc.h | 4 ++++ tools/testing/selftests/bpf/progs/test_cls_redirect.c | 4 +--- tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c | 5 +---- tools/testing/selftests/bpf/progs/verifier_ctx.c | 2 -- tools/testing/selftests/bpf/progs/verifier_sock.c | 4 ---- 5 files changed, 6 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 7905396c9cc4e2..1004c4a64aafb9 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -167,6 +167,10 @@ #define __imm_ptr(name) [name]"r"(&name) #define __imm_insn(name, expr) [name]"i"(*(long *)&(expr)) +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) + /* Magic constants used with __retval() */ #define POINTER_VALUE 0xbadcafe #define TEST_DATA_LEN 64 diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index 823169fb6e4c7f..26a53e54b8fa21 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -22,6 +22,7 @@ #include "bpf_compiler.h" #include "test_cls_redirect.h" +#include "bpf_misc.h" #pragma GCC diagnostic ignored "-Waddress-of-packed-member" @@ -31,9 +32,6 @@ #define INLINING __always_inline #endif -#define offsetofend(TYPE, MEMBER) \ - (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) - 
#define IP_OFFSET_MASK (0x1FFF) #define IP_MF (0x2000) diff --git a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c index 5f4e87ee949ad1..1ecdf4c54de41f 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c @@ -14,10 +14,7 @@ #include #define BPF_PROG_TEST_TCP_HDR_OPTIONS #include "test_tcp_hdr_options.h" - -#ifndef sizeof_field -#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) -#endif +#include "bpf_misc.h" __u8 test_kind = TCPOPT_EXP; __u16 test_magic = 0xeB9F; diff --git a/tools/testing/selftests/bpf/progs/verifier_ctx.c b/tools/testing/selftests/bpf/progs/verifier_ctx.c index 424463094760ac..b927906aa305a9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ctx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ctx.c @@ -5,8 +5,6 @@ #include #include "bpf_misc.h" -#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) - SEC("tc") __description("context stores via BPF_ATOMIC") __failure __msg("BPF_ATOMIC stores into R1 ctx is not allowed") diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index 0d5e56dffabb8c..bf88c644eb3038 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -5,10 +5,6 @@ #include #include "bpf_misc.h" -#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) -#define offsetofend(TYPE, MEMBER) \ - (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) - struct { __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); __uint(max_entries, 1); From 180a46bc1a1c585f5187df7bccdb522556f4ecd8 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 17 Sep 2025 10:10:53 +0200 Subject: [PATCH 2011/3206] selftests/bpf: Test accesses to ctx padding This patch adds tests covering the various paddings in ctx structures. In case of sk_lookup BPF programs, the behavior is a bit different because accesses to the padding are explicitly allowed. Other cases result in a clear reject from the verifier. 
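To make the hole concrete, a sketch of the layout involved (offsets follow
include/uapi/linux/bpf.h on a 64-bit build; the macros mirror the ones
moved to bpf_misc.h above):

    #include <linux/bpf.h>
    #include <stddef.h>

    #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
    #define offsetofend(TYPE, MEMBER) \
            (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))

    /* msg_src_ip6 ends at offset 60 ... */
    _Static_assert(offsetofend(struct bpf_sock_addr, msg_src_ip6[3]) == 60,
                   "padding starts at offset 60");

    /* ... while the 8-byte-aligned sk pointer starts at 64, leaving
     * the 4-byte hole that these tests probe.
     */
    _Static_assert(offsetof(struct bpf_sock_addr, sk) == 64,
                   "4-byte hole before sk");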
Signed-off-by: Paul Chaignon
Signed-off-by: Daniel Borkmann
Acked-by: Eduard Zingerman
Acked-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/3dc5f025e350aeb2bb1c257b87c577518e574aeb.1758094761.git.paul.chaignon@gmail.com
---
 .../selftests/bpf/progs/verifier_ctx.c | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/verifier_ctx.c b/tools/testing/selftests/bpf/progs/verifier_ctx.c
index b927906aa305a9..5ebf7d9bcc55e1 100644
--- a/tools/testing/selftests/bpf/progs/verifier_ctx.c
+++ b/tools/testing/selftests/bpf/progs/verifier_ctx.c
@@ -262,4 +262,34 @@ narrow_load("sockops", bpf_sock_ops, skb_hwtstamp);
 unaligned_access("flow_dissector", __sk_buff, data);
 unaligned_access("netfilter", bpf_nf_ctx, skb);

+#define padding_access(type, ctx, prev_field, sz)			\
+	SEC(type)							\
+	__description("access on " #ctx " padding after " #prev_field)	\
+	__naked void padding_ctx_access_##ctx(void)			\
+	{								\
+		asm volatile ("						\
+		r1 = *(u%[size] *)(r1 + %[off]);			\
+		r0 = 0;							\
+		exit;"							\
+		:							\
+		: __imm_const(size, sz * 8),				\
+		  __imm_const(off, offsetofend(struct ctx, prev_field)) \
+		: __clobber_all);					\
+	}
+
+__failure __msg("invalid bpf_context access")
+padding_access("cgroup/bind4", bpf_sock_addr, msg_src_ip6[3], 4);
+
+__success
+padding_access("sk_lookup", bpf_sk_lookup, remote_port, 2);
+
+__failure __msg("invalid bpf_context access")
+padding_access("tc", __sk_buff, tstamp_type, 2);
+
+__failure __msg("invalid bpf_context access")
+padding_access("cgroup/post_bind4", bpf_sock, dst_port, 2);
+
+__failure __msg("invalid bpf_context access")
+padding_access("sk_reuseport", sk_reuseport_md, hash, 4);
+
 char _license[] SEC("license") = "GPL";

From 5b3a897c3f90e16beff5124934bf5b10c6c68f6b Mon Sep 17 00:00:00 2001
From: Mikko Rapeli
Date: Mon, 15 Sep 2025 11:33:14 +0300
Subject: [PATCH 2012/3206] pmdomain: rockchip: enable ROCKCHIP_PM_DOMAINS with ARCH_ROCKCHIP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On the rk3399-based rockpi4b, mounting rootfs from mmc fails unless ROCKCHIP_PM_DOMAINS is enabled. According to Heiko Stübner, all Rockchip SoCs since 2012 have power domains, so the support should be enabled by default on both arm and arm64.

Failing boot without CONFIG_ROCKCHIP_PM_DOMAINS=y:
https://ledge.validation.linaro.org/scheduler/job/119268

/dev/disk/by-partuuid/1d48ffd8-f2a7-4a33-b52f-186089b3c85e: Can't lookup blockdev
/dev/disk/by-partuuid/1d48ffd8-f2a7-4a33-b52f-186089b3c85e: Can't lookup blockdev
/dev/disk/by-partuuid/1d48ffd8-f2a7-4a33-b52f-186089b3c85e: Can't lookup blockdev
/dev/disk/by-partuuid/1d48ffd8-f2a7-4a33-b52f-186089b3c85e: Can't lookup blockdev
/dev/disk/by-partuuid/1d48ffd8-f2a7-4a33-b52f-186089b3c85e: Can't lookup blockdev
/dev/disk/by-partuuid/1d48ffd8-f2a7-4a33-b52f-186089b3c85e: Can't lookup blockdev
dw-apb-uart ff1a0000.serial: forbid DMA for kernel console
root '/dev/disk/by-partuuid/1d48ffd8-f2a7-4a33-b52f-186089b3c85e' doesn't exist or does not contain a /dev.
rk_gmac-dwmac fe300000.ethernet: deferred probe timeout, ignoring dependency rk_gmac-dwmac fe300000.ethernet: probe with driver rk_gmac-dwmac failed with error -110 rk_iommu ff650800.iommu: deferred probe timeout, ignoring dependency rk_iommu ff650800.iommu: probe with driver rk_iommu failed with error -110 dwmmc_rockchip fe320000.mmc: deferred probe timeout, ignoring dependency rockchip-typec-phy ff7c0000.phy: deferred probe timeout, ignoring dependency dwmmc_rockchip fe320000.mmc: probe with driver dwmmc_rockchip failed with error -110 rockchip-typec-phy ff7c0000.phy: probe with driver rockchip-typec-phy failed with error -110 rockchip-typec-phy ff800000.phy: deferred probe timeout, ignoring dependency rockchip-typec-phy ff800000.phy: probe with driver rockchip-typec-phy failed with error -110 rk_iommu ff660480.iommu: deferred probe timeout, ignoring dependency rk_iommu ff660480.iommu: probe with driver rk_iommu failed with error -110 rk_iommu ff8f3f00.iommu: deferred probe timeout, ignoring dependency rk_iommu ff8f3f00.iommu: probe with driver rk_iommu failed with error -110 rk_iommu ff903f00.iommu: deferred probe timeout, ignoring dependency rk_iommu ff903f00.iommu: probe with driver rk_iommu failed with error -110 rk_iommu ff914000.iommu: deferred probe timeout, ignoring dependency rk_iommu ff914000.iommu: probe with driver rk_iommu failed with error -110 rk_iommu ff924000.iommu: deferred probe timeout, ignoring dependency rk_iommu ff924000.iommu: probe with driver rk_iommu failed with error -110 platform fe800000.usb: deferred probe pending: platform: wait for supplier /phy@ff7c0000/usb3-port sdhci-arasan fe330000.mmc: deferred probe timeout, ignoring dependency platform fe900000.usb: deferred probe pending: platform: wait for supplier /phy@ff800000/usb3-port sdhci-arasan fe330000.mmc: probe with driver sdhci-arasan failed with error -110 platform ff1d0000.spi: deferred probe pending: (reason unknown) platform hdmi-sound: deferred probe pending: asoc-simple-card: parse error Working boot with CONFIG_ROCKCHIP_PM_DOMAINS=y: https://ledge.validation.linaro.org/scheduler/job/119272 dwmmc_rockchip fe320000.mmc: IDMAC supports 32-bit address mode. dwmmc_rockchip fe320000.mmc: Using internal DMA controller. dwmmc_rockchip fe320000.mmc: Version ID is 270a dwmmc_rockchip fe320000.mmc: DW MMC controller at irq 45,32 bit host data width,256 deep fifo dwmmc_rockchip fe320000.mmc: Got CD GPIO ff1a0000.serial: ttyS2 at MMIO 0xff1a0000 (irq = 44, base_baud = 1500000) is a 16550A printk: legacy console [ttyS2] enabled mmc_host mmc1: Bus speed (slot 0) = 400000Hz (slot req 400000Hz, actual 400000HZ div = 0) dw_wdt ff848000.watchdog: No valid TOPs array specified mmc_host mmc1: Bus speed (slot 0) = 50000000Hz (slot req 50000000Hz, actual 50000000HZ div = 0) mmc0: CQHCI version 5.10 rk_gmac-dwmac fe300000.ethernet: IRQ eth_wake_irq not found mmc1: new high speed SDHC card at address aaaa fan53555-regulator 0-0040: FAN53555 Option[8] Rev[1] Detected! fan53555-regulator 0-0041: FAN53555 Option[8] Rev[1] Detected! rk_gmac-dwmac fe300000.ethernet: IRQ eth_lpi not found mmcblk1: mmc1:aaaa SC16G 14.8 GiB rk_gmac-dwmac fe300000.ethernet: IRQ sfty not found GPT:Primary header thinks Alt. header is not at the end of the disk. rk_gmac-dwmac fe300000.ethernet: Deprecated MDIO bus assumption used GPT:1978417 != 31116287 rk_gmac-dwmac fe300000.ethernet: PTP uses main clock GPT:Alternate GPT header not at the end of the disk. rk_gmac-dwmac fe300000.ethernet: clock input or output? (input). 
GPT:1978417 != 31116287
rk_gmac-dwmac fe300000.ethernet: TX delay(0x28).
GPT: Use GNU Parted to correct GPT errors.
rk_gmac-dwmac fe300000.ethernet: RX delay(0x11).
mmcblk1: p1 p2 p3 p4 p5 p6 p7 p8

Acked-by: Heiko Stuebner
Acked-by: Arnd Bergmann
Suggested-by: Ulf Hansson
Suggested-by: Heiko Stübner
Suggested-by: Arnd Bergmann
Signed-off-by: Mikko Rapeli
Signed-off-by: Ulf Hansson
---
 drivers/pmdomain/rockchip/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/pmdomain/rockchip/Kconfig b/drivers/pmdomain/rockchip/Kconfig
index 218d43186e5b9a..17f2e6fe86b6f7 100644
--- a/drivers/pmdomain/rockchip/Kconfig
+++ b/drivers/pmdomain/rockchip/Kconfig
@@ -3,6 +3,7 @@ if ARCH_ROCKCHIP || COMPILE_TEST

 config ROCKCHIP_PM_DOMAINS
 	bool "Rockchip generic power domain"
+	default ARCH_ROCKCHIP
 	depends on PM
 	depends on HAVE_ARM_SMCCC_DISCOVERY
 	depends on REGULATOR

From 027a7a9c07d0d759ab496a7509990aa33a4b689c Mon Sep 17 00:00:00 2001
From: Zhang Yi
Date: Wed, 10 Sep 2025 19:11:07 +0800
Subject: [PATCH 2013/3206] drbd: init queue_limits->max_hw_wzeroes_unmap_sectors parameter

The parameter max_hw_wzeroes_unmap_sectors in queue_limits should be equal to max_write_zeroes_sectors if it is set to a non-zero value. However, when the backend bdev is specified, this parameter is initialized to UINT_MAX during the call to blk_set_stacking_limits(), while only max_write_zeroes_sectors is adjusted. Therefore, this discrepancy triggers a value check failure in blk_validate_limits(). Since the drbd driver doesn't yet support unmap write zeroes, fix this failure by explicitly setting max_hw_wzeroes_unmap_sectors to zero.

Fixes: 0c40d7cb5ef3 ("block: introduce max_{hw|user}_wzeroes_unmap_sectors to queue limits")
Signed-off-by: Zhang Yi
Reviewed-by: Martin K. Petersen
Reviewed-by: Yu Kuai
Reviewed-by: Hannes Reinecke
Signed-off-by: Jens Axboe
---
 drivers/block/drbd/drbd_nl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e09930c2b22627..91f3b8afb63ce6 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1330,6 +1330,7 @@ void drbd_reconsider_queue_parameters(struct drbd_device *device,
 		lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS;
 	else
 		lim.max_write_zeroes_sectors = 0;
+	lim.max_hw_wzeroes_unmap_sectors = 0;

 	if ((lim.discard_granularity >> SECTOR_SHIFT) >
 	    lim.max_hw_discard_sectors) {

From fd4e876f59b7e70283b4025c717cad8948397be1 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Thu, 4 Sep 2025 16:25:24 +0200
Subject: [PATCH 2014/3206] softirq: Provide a handshake for canceling tasklets via polling

The tasklet_unlock_spin_wait() via tasklet_disable_in_atomic() is provided for a few legacy tasklet users. The interface is used from atomic context (which is either softirq or disabled preemption) on non-PREEMPT_RT and relies on spinning until the tasklet callback completes.

On PREEMPT_RT the context is never atomic but the busy polling logic remains. It is possible that the thread invoking tasklet_unlock_spin_wait() has higher priority than the tasklet. If both run on the same CPU, the tasklet makes no progress and the thread trying to cancel the tasklet will live-lock the system.

To avoid the lockup, tasklet_unlock_spin_wait() uses local_bh_disable()/enable(), which utilizes the local_lock_t for synchronisation. This lock is a central per-CPU BKL and is about to be removed.

Solve this by acquiring a lock in tasklet_action_common() which is held while the tasklet's callback is invoked.
This lock will be acquired from tasklet_unlock_spin_wait() via tasklet_callback_cancel_wait_running(). After the tasklet has completed, tasklet_callback_sync_wait_running() drops the lock and acquires it again. To avoid unlocking the lock when there is no cancel request, there is a cb_waiters counter which is incremented during a cancel request. Blocking on the lock will PI-boost the tasklet if needed, ensuring progress is made.

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Thomas Gleixner
---
 kernel/softirq.c | 62 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 5 deletions(-)

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 513b1945987cc6..4e2c980e7712ea 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -805,6 +805,58 @@ static bool tasklet_clear_sched(struct tasklet_struct *t)
 	return false;
 }

+#ifdef CONFIG_PREEMPT_RT
+struct tasklet_sync_callback {
+	spinlock_t cb_lock;
+	atomic_t cb_waiters;
+};
+
+static DEFINE_PER_CPU(struct tasklet_sync_callback, tasklet_sync_callback) = {
+	.cb_lock = __SPIN_LOCK_UNLOCKED(tasklet_sync_callback.cb_lock),
+	.cb_waiters = ATOMIC_INIT(0),
+};
+
+static void tasklet_lock_callback(void)
+{
+	spin_lock(this_cpu_ptr(&tasklet_sync_callback.cb_lock));
+}
+
+static void tasklet_unlock_callback(void)
+{
+	spin_unlock(this_cpu_ptr(&tasklet_sync_callback.cb_lock));
+}
+
+static void tasklet_callback_cancel_wait_running(void)
+{
+	struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback);
+
+	atomic_inc(&sync_cb->cb_waiters);
+	spin_lock(&sync_cb->cb_lock);
+	atomic_dec(&sync_cb->cb_waiters);
+	spin_unlock(&sync_cb->cb_lock);
+}
+
+static void tasklet_callback_sync_wait_running(void)
+{
+	struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback);
+
+	if (atomic_read(&sync_cb->cb_waiters)) {
+		spin_unlock(&sync_cb->cb_lock);
+		spin_lock(&sync_cb->cb_lock);
+	}
+}
+
+#else /* !CONFIG_PREEMPT_RT: */
+
+static void tasklet_lock_callback(void) { }
+static void tasklet_unlock_callback(void) { }
+static void tasklet_callback_sync_wait_running(void) { }
+
+#ifdef CONFIG_SMP
+static void tasklet_callback_cancel_wait_running(void) { }
+#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
 static void tasklet_action_common(struct tasklet_head *tl_head,
 				  unsigned int softirq_nr)
 {
@@ -816,6 +868,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
 	tl_head->tail = &tl_head->head;
 	local_irq_enable();

+	tasklet_lock_callback();
 	while (list) {
 		struct tasklet_struct *t = list;

@@ -835,6 +888,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
 				}
 			}
 			tasklet_unlock(t);
+			tasklet_callback_sync_wait_running();
 			continue;
 		}
 		tasklet_unlock(t);
@@ -847,6 +901,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
 		__raise_softirq_irqoff(softirq_nr);
 		local_irq_enable();
 	}
+	tasklet_unlock_callback();
 }

 static __latent_entropy void tasklet_action(void)
@@ -897,12 +952,9 @@ void tasklet_unlock_spin_wait(struct tasklet_struct *t)
 			/*
 			 * Prevent a live lock when current preempted soft
 			 * interrupt processing or prevents ksoftirqd from
-			 * running. If the tasklet runs on a different CPU
-			 * then this has no effect other than doing the BH
-			 * disable/enable dance for nothing.
+			 * running.
*/
-			local_bh_disable();
-			local_bh_enable();
+			tasklet_callback_cancel_wait_running();
 		} else {
 			cpu_relax();
 		}

From 3253cb49cbad4772389d6ef55be75db1f97da910 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Thu, 4 Sep 2025 16:25:25 +0200
Subject: [PATCH 2015/3206] softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT

Softirqs are preemptible on PREEMPT_RT. There is synchronisation between individual sections which disable bottom halves. This in turn means that a forced threaded interrupt cannot preempt another forced threaded interrupt. Instead it will PI-boost the other handler and wait for its completion.

This is required because code within a softirq section is assumed to be non-preemptible and may expect exclusive access to per-CPU resources such as variables or pinned timers. Code with such expectations has been identified and updated to use local_lock_nested_bh() for locking of the per-CPU resource. This means the softirq lock can be removed.

Disable the softirq synchronization, but add a new config switch CONFIG_PREEMPT_RT_NEEDS_BH_LOCK which allows re-enabling the synchronized behavior in case there are issues that haven't been detected yet. The softirq_ctrl.cnt accounting remains to let the NOHZ code know if softirqs are currently handled.

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Thomas Gleixner
---
 kernel/Kconfig.preempt | 13 +++++++
 kernel/softirq.c | 83 ++++++++++++++++++++++++++++++++----------
 2 files changed, 76 insertions(+), 20 deletions(-)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 54ea59ff8fbeb6..da326800c1c9be 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -103,6 +103,19 @@ config PREEMPT_RT
 	  Select this if you are building a kernel for systems which
 	  require real-time guarantees.

+config PREEMPT_RT_NEEDS_BH_LOCK
+	bool "Enforce softirq synchronisation on PREEMPT_RT"
+	depends on PREEMPT_RT
+	help
+	  Enforce synchronisation across the softirqs context. On PREEMPT_RT
+	  the softirq is preemptible. This enforces the same per-CPU BKL
+	  semantic non-PREEMPT_RT builds have. This should not be needed
+	  because per-CPU locks were added to avoid the per-CPU BKL.
+
+	  This switch provides the old behaviour for testing reasons. Select
+	  this if you suspect an error with preemptible softirq and want to
+	  test the old synchronized behaviour.
+
 config PREEMPT_COUNT
 	bool

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4e2c980e7712ea..77198911b8dd4b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -165,7 +165,11 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
 	/* First entry of a task into a BH disabled section? */
 	if (!current->softirq_disable_cnt) {
 		if (preemptible()) {
-			local_lock(&softirq_ctrl.lock);
+			if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK))
+				local_lock(&softirq_ctrl.lock);
+			else
+				migrate_disable();
+
 			/* Required to meet the RCU bottomhalf requirements. */
 			rcu_read_lock();
 		} else {
@@ -177,17 +181,34 @@
 	 * Track the per CPU softirq disabled state. On RT this is per CPU
 	 * state to allow preemption of bottom half disabled sections.
 	 */
-	newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt);
-	/*
-	 * Reflect the result in the task state to prevent recursion on the
-	 * local lock and to make softirq_count() & al work.
- */ - current->softirq_disable_cnt = newcnt; + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + newcnt = this_cpu_add_return(softirq_ctrl.cnt, cnt); + /* + * Reflect the result in the task state to prevent recursion on the + * local lock and to make softirq_count() & al work. + */ + current->softirq_disable_cnt = newcnt; - if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { - raw_local_irq_save(flags); - lockdep_softirqs_off(ip); - raw_local_irq_restore(flags); + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { + raw_local_irq_save(flags); + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } + } else { + bool sirq_dis = false; + + if (!current->softirq_disable_cnt) + sirq_dis = true; + + this_cpu_add(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt += cnt; + WARN_ON_ONCE(current->softirq_disable_cnt < 0); + + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_dis) { + raw_local_irq_save(flags); + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } } } EXPORT_SYMBOL(__local_bh_disable_ip); @@ -195,23 +216,42 @@ EXPORT_SYMBOL(__local_bh_disable_ip); static void __local_bh_enable(unsigned int cnt, bool unlock) { unsigned long flags; + bool sirq_en = false; int newcnt; - DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != - this_cpu_read(softirq_ctrl.cnt)); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != + this_cpu_read(softirq_ctrl.cnt)); + if (softirq_count() == cnt) + sirq_en = true; + } else { + if (current->softirq_disable_cnt == cnt) + sirq_en = true; + } - if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) { + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_en) { raw_local_irq_save(flags); lockdep_softirqs_on(_RET_IP_); raw_local_irq_restore(flags); } - newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt); - current->softirq_disable_cnt = newcnt; + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + newcnt = this_cpu_sub_return(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt = newcnt; - if (!newcnt && unlock) { - rcu_read_unlock(); - local_unlock(&softirq_ctrl.lock); + if (!newcnt && unlock) { + rcu_read_unlock(); + local_unlock(&softirq_ctrl.lock); + } + } else { + current->softirq_disable_cnt -= cnt; + this_cpu_sub(softirq_ctrl.cnt, cnt); + if (unlock && !current->softirq_disable_cnt) { + migrate_enable(); + rcu_read_unlock(); + } else { + WARN_ON_ONCE(current->softirq_disable_cnt < 0); + } } } @@ -228,7 +268,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) lock_map_release(&bh_lock_map); local_irq_save(flags); - curcnt = __this_cpu_read(softirq_ctrl.cnt); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) + curcnt = this_cpu_read(softirq_ctrl.cnt); + else + curcnt = current->softirq_disable_cnt; /* * If this is not reenabling soft interrupts, no point in trying to From c60934d5ca088908794510ae9314dbbc5fcc9935 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Sun, 10 Aug 2025 18:16:19 -0400 Subject: [PATCH 2016/3206] pmdomain: mediatek: airoha: convert from round_rate() to determine_rate() The round_rate() clk ops is deprecated, so migrate this driver from round_rate() to determine_rate() using the Coccinelle semantic patch appended to the "under-the-cut" portion of the patch. Note that prior to running the Coccinelle, airoha_cpu_pmdomain_clk_round() was renamed to airoha_cpu_pmdomain_clk_round_rate(). 
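For reference, the determine_rate() contract in a nutshell: instead of
returning a rounded rate, the op fills in req->rate. A sketch of a
permissive implementation, assuming the clock accepts any rate within the
request bounds (hypothetical demo_* name):

    #include <linux/clk-provider.h>
    #include <linux/minmax.h>

    static int demo_clk_determine_rate(struct clk_hw *hw,
                                       struct clk_rate_request *req)
    {
            /* Accept whatever was asked for, within the boundaries. */
            req->rate = clamp(req->rate, req->min_rate, req->max_rate);
            return 0;
    }

The converted airoha op above simply leaves req untouched, which is
equivalent to the old round_rate() returning its input unchanged.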
Signed-off-by: Brian Masney Signed-off-by: Ulf Hansson --- drivers/pmdomain/mediatek/airoha-cpu-pmdomain.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pmdomain/mediatek/airoha-cpu-pmdomain.c b/drivers/pmdomain/mediatek/airoha-cpu-pmdomain.c index 0fd88d2f9ac29d..3b1d202f89dc51 100644 --- a/drivers/pmdomain/mediatek/airoha-cpu-pmdomain.c +++ b/drivers/pmdomain/mediatek/airoha-cpu-pmdomain.c @@ -21,10 +21,10 @@ struct airoha_cpu_pmdomain_priv { struct generic_pm_domain pd; }; -static long airoha_cpu_pmdomain_clk_round(struct clk_hw *hw, unsigned long rate, - unsigned long *parent_rate) +static int airoha_cpu_pmdomain_clk_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { - return rate; + return 0; } static unsigned long airoha_cpu_pmdomain_clk_get(struct clk_hw *hw, @@ -48,7 +48,7 @@ static int airoha_cpu_pmdomain_clk_is_enabled(struct clk_hw *hw) static const struct clk_ops airoha_cpu_pmdomain_clk_ops = { .recalc_rate = airoha_cpu_pmdomain_clk_get, .is_enabled = airoha_cpu_pmdomain_clk_is_enabled, - .round_rate = airoha_cpu_pmdomain_clk_round, + .determine_rate = airoha_cpu_pmdomain_clk_determine_rate, }; static int airoha_cpu_pmdomain_set_performance_state(struct generic_pm_domain *domain, From 7cd8db0fb0b2bf309163d56fec585c0f9e0964d1 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Mon, 15 Sep 2025 11:33:15 +0300 Subject: [PATCH 2017/3206] mmc: add COMPILE_TEST to multiple drivers These compile on x86_64 with =y and =m. Acked-by: Arnd Bergmann Signed-off-by: Mikko Rapeli Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 4afa0130779d97..0939bba1ac206c 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -56,7 +56,7 @@ config MMC_STM32_SDMMC config MMC_PXA tristate "Intel PXA25x/26x/27x Multimedia Card Interface support" - depends on ARCH_PXA + depends on ARCH_PXA || COMPILE_TEST help This selects the Intel(R) PXA(R) Multimedia card Interface. If you have a PXA(R) platform with a Multimedia Card slot, @@ -608,7 +608,7 @@ config MMC_SDHCI_MSM config MMC_MXC tristate "Freescale i.MX21/27/31 or MPC512x Multimedia Card support" - depends on ARCH_MXC || PPC_MPC512x + depends on ARCH_MXC || PPC_MPC512x || COMPILE_TEST help This selects the Freescale i.MX21, i.MX27, i.MX31 or MPC512x Multimedia Card Interface. If you have an i.MX or MPC512x platform @@ -866,7 +866,8 @@ config MMC_DW_PCI config MMC_DW_ROCKCHIP tristate "Rockchip specific extensions for Synopsys DW Memory Card Interface" - depends on MMC_DW && ARCH_ROCKCHIP + depends on MMC_DW + depends on ARCH_ROCKCHIP || COMPILE_TEST select MMC_DW_PLTFM help This selects support for Rockchip SoC specific extensions to the @@ -948,7 +949,7 @@ config MMC_USHC config MMC_WMT tristate "Wondermedia SD/MMC Host Controller support" - depends on ARCH_VT8500 + depends on ARCH_VT8500 || COMPILE_TEST default y help This selects support for the SD/MMC Host Controller on From 67da3f16e5f97a864a0beb4f9758d09e1890a76e Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Mon, 15 Sep 2025 11:33:16 +0300 Subject: [PATCH 2018/3206] mmc: select REGMAP_MMIO with MMC_LOONGSON2 COMPILE_TEST with MMC_LOONGSON2 failed to link due to undeclared dependency: ERROR: modpost: "__devm_regmap_init_mmio_clk" [drivers/mmc/host/loongson2-mmc.ko] undefined! 
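The undefined symbol comes from the MMIO regmap helpers; a sketch of the
kind of call that creates the dependency (hypothetical demo_* names, not
the loongson2 code):

    #include <linux/err.h>
    #include <linux/regmap.h>

    static const struct regmap_config demo_regmap_cfg = {
            .reg_bits   = 32,
            .val_bits   = 32,
            .reg_stride = 4,
    };

    static int demo_init_regmap(struct device *dev, void __iomem *base)
    {
            struct regmap *map;

            /* Resolves to __devm_regmap_init_mmio_clk(), which only
             * exists when CONFIG_REGMAP_MMIO is selected.
             */
            map = devm_regmap_init_mmio_clk(dev, "bus", base,
                                            &demo_regmap_cfg);
            return PTR_ERR_OR_ZERO(map);
    }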
Fixes: 2115772014bd ("mmc: loongson2: Add Loongson-2K SD/SDIO controller driver")
Suggested-by: Arnd Bergmann
Suggested-by: Binbin Zhou
Signed-off-by: Mikko Rapeli
Signed-off-by: Ulf Hansson
---
 drivers/mmc/host/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index 0939bba1ac206c..2c963cb6724b9e 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -1116,6 +1116,7 @@ config MMC_LOONGSON2
 	tristate "Loongson-2K SD/SDIO/eMMC Host Interface support"
 	depends on LOONGARCH || COMPILE_TEST
 	depends on HAS_DMA
+	select REGMAP_MMIO
 	help
 	  This selects support for the SD/SDIO/eMMC Host Controller on
 	  Loongson-2K series CPUs.

From e6d2338b6f3e522872f3a14fcc5e5de2f58bf23b Mon Sep 17 00:00:00 2001
From: Thorsten Blum
Date: Tue, 16 Sep 2025 14:21:45 +0200
Subject: [PATCH 2019/3206] firewire: core: use struct_size and flex_array_size in ioctl_add_descriptor

Use struct_size() to determine the memory needed for a new 'struct descriptor_resource' and flex_array_size() to calculate the number of bytes to copy from userspace. This removes the hardcoded size (4 bytes) for the 'u32 data[]' entries.

No functional changes intended.

Signed-off-by: Thorsten Blum
Link: https://lore.kernel.org/r/20250916122143.2459993-3-thorsten.blum@linux.dev
Signed-off-by: Takashi Sakamoto
---
 drivers/firewire/core-cdev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 112b33099610cf..9913162006181e 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -940,11 +940,12 @@ static int ioctl_add_descriptor(struct client *client, union ioctl_arg *arg)
 	if (a->length > 256)
 		return -EINVAL;

-	r = kmalloc(sizeof(*r) + a->length * 4, GFP_KERNEL);
+	r = kmalloc(struct_size(r, data, a->length), GFP_KERNEL);
 	if (r == NULL)
 		return -ENOMEM;

-	if (copy_from_user(r->data, u64_to_uptr(a->data), a->length * 4)) {
+	if (copy_from_user(r->data, u64_to_uptr(a->data),
+			   flex_array_size(r, data, a->length))) {
 		ret = -EFAULT;
 		goto failed;
 	}

From 5b7bdc4402b12bdad747cce305ecbc9737aed7ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?=
Date: Tue, 16 Sep 2025 18:51:35 +0200
Subject: [PATCH 2020/3206] kselftest/arm64/gcs/basic-gcs: Respect parent directory CFLAGS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

basic-gcs has its own make rule to handle the special compiler invocation to build against nolibc. This rule does not respect the $(CFLAGS) passed by the Makefile from the parent directory. However, these $(CFLAGS) set up the include path to include the UAPI headers from the current kernel. Due to this, the asm/hwcap.h header is used from the toolchain instead of the UAPI one, and the definition of HWCAP_GCS is not found.

Restructure the rule for basic-gcs to respect the $(CFLAGS). Also drop those options which are already provided by $(CFLAGS).
Reported-by: Naresh Kamboju
Closes: https://lore.kernel.org/lkml/CA+G9fYv77X+kKz2YT6xw7=9UrrotTbQ6fgNac7oohOg8BgGvtw@mail.gmail.com/
Fixes: a985fe638344 ("kselftest/arm64/gcs: Use nolibc's getauxval()")
Tested-by: Linux Kernel Functional Testing
Signed-off-by: Thomas Weißschuh
Reviewed-by: Mark Brown
Signed-off-by: Will Deacon
---
 tools/testing/selftests/arm64/gcs/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile
index d2f3497a9103fc..1fbbf0ca1f0291 100644
--- a/tools/testing/selftests/arm64/gcs/Makefile
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -14,11 +14,11 @@ LDLIBS+=-lpthread
 include ../../lib.mk

 $(OUTPUT)/basic-gcs: basic-gcs.c
-	$(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \
-		-static -include ../../../../include/nolibc/nolibc.h \
+	$(CC) $(CFLAGS) -fno-asynchronous-unwind-tables -fno-ident -s -nostdlib -nostdinc \
+		-static -I../../../../include/nolibc -include ../../../../include/nolibc/nolibc.h \
 		-I../../../../../usr/include \
 		-std=gnu99 -I../.. -g \
-		-ffreestanding -Wall $^ -o $@ -lgcc
+		-ffreestanding $^ -o $@ -lgcc

 $(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S
 	$(CC) -nostdlib $^ -o $@

From 7dd670db9afdca6aed4a9f99776cb72e6ed5a1f5 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald
Date: Wed, 17 Sep 2025 17:06:09 +0100
Subject: [PATCH 2021/3206] ALSA: hda: intel-dsp-config: Prevent SEGFAULT if ACPI_HANDLE() is NULL

Check in snd_intel_dsp_check_soundwire() that the pointer returned by ACPI_HANDLE() is not NULL, before passing it on to other functions.

The original code assumed a non-NULL return, but if it was unexpectedly NULL it would end up passed to acpi_walk_namespace() as the start point, and would result in

[ 3.219028] BUG: kernel NULL pointer dereference, address: 0000000000000018
[ 3.219029] #PF: supervisor read access in kernel mode
[ 3.219030] #PF: error_code(0x0000) - not-present page
[ 3.219031] PGD 0 P4D 0
[ 3.219032] Oops: Oops: 0000 [#1] SMP NOPTI
[ 3.219035] CPU: 2 UID: 0 PID: 476 Comm: (udev-worker) Tainted: G S AW E 6.17.0-rc5-test #1 PREEMPT(voluntary)
[ 3.219038] Tainted: [S]=CPU_OUT_OF_SPEC, [A]=OVERRIDDEN_ACPI_TABLE, [W]=WARN, [E]=UNSIGNED_MODULE
[ 3.219040] RIP: 0010:acpi_ns_walk_namespace+0xb5/0x480

This problem was triggered by a bugged DSDT that the kernel couldn't parse. But it shouldn't be possible to SEGFAULT the kernel just because of some bugs in ACPI.

Fixes: 0650857570d1 ("ALSA: hda: add autodetection for SoundWire")
Signed-off-by: Richard Fitzgerald
Signed-off-by: Takashi Iwai
---
 sound/hda/core/intel-dsp-config.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sound/hda/core/intel-dsp-config.c b/sound/hda/core/intel-dsp-config.c
index c15284742899ff..2a9e35cddcf7c6 100644
--- a/sound/hda/core/intel-dsp-config.c
+++ b/sound/hda/core/intel-dsp-config.c
@@ -650,6 +650,8 @@ static int snd_intel_dsp_check_soundwire(struct pci_dev *pci)
 	int ret;

 	handle = ACPI_HANDLE(&pci->dev);
+	if (!handle)
+		return -ENODEV;

 	ret = sdw_intel_acpi_scan(handle, &info);
 	if (ret < 0)

From ff89a4d285c82813faa0e2e386d07120ae1f9c85 Mon Sep 17 00:00:00 2001
From: Zongyao Bai
Date: Tue, 16 Sep 2025 05:47:15 +0800
Subject: [PATCH 2022/3206] drm/xe/sysfs: Add cleanup action in xe_device_sysfs_init

On partial failure, some sysfs files created before the failure might not be removed. Add a common cleanup step to remove them all immediately, as it should be harmless to attempt to remove non-existing files.
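The pattern, reduced to its essentials (hypothetical demo_* names; it
relies on sysfs file removal being a no-op for files that were never
created):

    #include <linux/device.h>
    #include <linux/sysfs.h>

    static const struct attribute *demo_attrs_a[] = { /* ... */ NULL };
    static const struct attribute *demo_attrs_b[] = { /* ... */ NULL };

    static int demo_sysfs_init(struct device *dev)
    {
            int ret;

            ret = sysfs_create_files(&dev->kobj, demo_attrs_a);
            if (ret)
                    goto cleanup;

            ret = sysfs_create_files(&dev->kobj, demo_attrs_b);
            if (ret)
                    goto cleanup;

            return 0;

    cleanup:
            /* Removing never-created files is harmless, so one common
             * unwind path covers every partial failure.
             */
            sysfs_remove_files(&dev->kobj, demo_attrs_a);
            sysfs_remove_files(&dev->kobj, demo_attrs_b);
            return ret;
    }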
Fixes: 0e414bf7ad01 ("drm/xe: Expose PCIe link downgrade attributes") Cc: Lucas De Marchi Cc: Stuart Summers Cc: Shuicheng Lin Cc: Michal Wajdeczko Signed-off-by: Zongyao Bai Reviewed-by: Shuicheng Lin Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250915214716.1327379-2-zongyao.bai@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 1a869168d91f1a1a2b0db22cea0295c67908e5d8) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device_sysfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c index bd9015761aa0ed..3e3b2d9033a7f8 100644 --- a/drivers/gpu/drm/xe/xe_device_sysfs.c +++ b/drivers/gpu/drm/xe/xe_device_sysfs.c @@ -311,12 +311,16 @@ int xe_device_sysfs_init(struct xe_device *xe) if (xe->info.platform == XE_BATTLEMAGE) { ret = sysfs_create_files(&dev->kobj, auto_link_downgrade_attrs); if (ret) - return ret; + goto cleanup; ret = late_bind_create_files(dev); if (ret) - return ret; + goto cleanup; } return devm_add_action_or_reset(dev, xe_device_sysfs_fini, xe); + +cleanup: + xe_device_sysfs_fini(xe); + return ret; } From ae5fbbda341f92e605a9508a0fb18456155517f0 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Tue, 9 Sep 2025 15:12:40 -0700 Subject: [PATCH 2023/3206] drm/xe: Fix error handling if PXP fails to start Since the PXP start comes after __xe_exec_queue_init() has completed, we need to clean up what was done in that function in case of a PXP start error. __xe_exec_queue_init() calls the submission backend init() function, so we need to introduce an opposite for that. Unfortunately, while we already have a fini() function pointer, it performs other operations in addition to cleaning up what was done by the init(). Therefore, for clarity, the existing fini() has been renamed to destroy(), while a new fini() has been added that only cleans up what was done by the init(), with the latter being called by the former (via xe_exec_queue_fini).
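[ editor's note: a sketch of the resulting ops contract described above; the init/fini/destroy member names follow the patch below, while "struct queue" and the comments are illustrative assumptions. ]

struct example_queue_ops {
	/* Sets up backend state; paired one-to-one with fini(). */
	int (*init)(struct queue *q);
	/* Strictly undoes what init() did, nothing more. */
	void (*fini)(struct queue *q);
	/*
	 * Full teardown of the queue; must end up calling
	 * xe_exec_queue_fini(), which in turn invokes fini() above.
	 */
	void (*destroy)(struct queue *q);
};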
Fixes: 72d479601d67 ("drm/xe/pxp/uapi: Add userspace and LRC support for PXP-using queues") Signed-off-by: Daniele Ceraolo Spurio Cc: John Harrison Cc: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://lore.kernel.org/r/20250909221240.3711023-3-daniele.ceraolospurio@intel.com (cherry picked from commit 626667321deb4c7a294725406faa3dd71c3d445d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_exec_queue.c | 22 ++++++--- drivers/gpu/drm/xe/xe_exec_queue_types.h | 8 ++- drivers/gpu/drm/xe/xe_execlist.c | 25 ++++++---- drivers/gpu/drm/xe/xe_execlist_types.h | 2 +- drivers/gpu/drm/xe/xe_guc_exec_queue_types.h | 4 +- drivers/gpu/drm/xe/xe_guc_submit.c | 52 ++++++++++++-------- 6 files changed, 72 insertions(+), 41 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 8991b4aed44071..c07edcda99c5ca 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -151,6 +151,16 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q) return err; } +static void __xe_exec_queue_fini(struct xe_exec_queue *q) +{ + int i; + + q->ops->fini(q); + + for (i = 0; i < q->width; ++i) + xe_lrc_put(q->lrc[i]); +} + struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm, u32 logical_mask, u16 width, struct xe_hw_engine *hwe, u32 flags, @@ -181,11 +191,13 @@ struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *v if (xe_exec_queue_uses_pxp(q)) { err = xe_pxp_exec_queue_add(xe->pxp, q); if (err) - goto err_post_alloc; + goto err_post_init; } return q; +err_post_init: + __xe_exec_queue_fini(q); err_post_alloc: __xe_exec_queue_free(q); return ERR_PTR(err); @@ -283,13 +295,11 @@ void xe_exec_queue_destroy(struct kref *ref) xe_exec_queue_put(eq); } - q->ops->fini(q); + q->ops->destroy(q); } void xe_exec_queue_fini(struct xe_exec_queue *q) { - int i; - /* * Before releasing our ref to lrc and xef, accumulate our run ticks * and wakeup any waiters. @@ -298,9 +308,7 @@ void xe_exec_queue_fini(struct xe_exec_queue *q) if (q->xef && atomic_dec_and_test(&q->xef->exec_queue.pending_removal)) wake_up_var(&q->xef->exec_queue.pending_removal); - for (i = 0; i < q->width; ++i) - xe_lrc_put(q->lrc[i]); - + __xe_exec_queue_fini(q); __xe_exec_queue_free(q); } diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index cc1cffb5c87f1d..1c9d03f2a3e5da 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -166,8 +166,14 @@ struct xe_exec_queue_ops { int (*init)(struct xe_exec_queue *q); /** @kill: Kill inflight submissions for backend */ void (*kill)(struct xe_exec_queue *q); - /** @fini: Fini exec queue for submission backend */ + /** @fini: Undoes the init() for submission backend */ void (*fini)(struct xe_exec_queue *q); + /** + * @destroy: Destroy exec queue for submission backend. The backend + * function must call xe_exec_queue_fini() (which will in turn call the + * fini() backend function) to ensure the queue is properly cleaned up. 
+ */ + void (*destroy)(struct xe_exec_queue *q); /** @set_priority: Set priority for exec queue */ int (*set_priority)(struct xe_exec_queue *q, enum xe_exec_queue_priority priority); diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c index 788f56b066b6ad..f83d421ac9d3d2 100644 --- a/drivers/gpu/drm/xe/xe_execlist.c +++ b/drivers/gpu/drm/xe/xe_execlist.c @@ -385,10 +385,20 @@ static int execlist_exec_queue_init(struct xe_exec_queue *q) return err; } -static void execlist_exec_queue_fini_async(struct work_struct *w) +static void execlist_exec_queue_fini(struct xe_exec_queue *q) +{ + struct xe_execlist_exec_queue *exl = q->execlist; + + drm_sched_entity_fini(&exl->entity); + drm_sched_fini(&exl->sched); + + kfree(exl); +} + +static void execlist_exec_queue_destroy_async(struct work_struct *w) { struct xe_execlist_exec_queue *ee = - container_of(w, struct xe_execlist_exec_queue, fini_async); + container_of(w, struct xe_execlist_exec_queue, destroy_async); struct xe_exec_queue *q = ee->q; struct xe_execlist_exec_queue *exl = q->execlist; struct xe_device *xe = gt_to_xe(q->gt); @@ -401,10 +411,6 @@ static void execlist_exec_queue_fini_async(struct work_struct *w) list_del(&exl->active_link); spin_unlock_irqrestore(&exl->port->lock, flags); - drm_sched_entity_fini(&exl->entity); - drm_sched_fini(&exl->sched); - kfree(exl); - xe_exec_queue_fini(q); } @@ -413,10 +419,10 @@ static void execlist_exec_queue_kill(struct xe_exec_queue *q) /* NIY */ } -static void execlist_exec_queue_fini(struct xe_exec_queue *q) +static void execlist_exec_queue_destroy(struct xe_exec_queue *q) { - INIT_WORK(&q->execlist->fini_async, execlist_exec_queue_fini_async); - queue_work(system_unbound_wq, &q->execlist->fini_async); + INIT_WORK(&q->execlist->destroy_async, execlist_exec_queue_destroy_async); + queue_work(system_unbound_wq, &q->execlist->destroy_async); } static int execlist_exec_queue_set_priority(struct xe_exec_queue *q, @@ -467,6 +473,7 @@ static const struct xe_exec_queue_ops execlist_exec_queue_ops = { .init = execlist_exec_queue_init, .kill = execlist_exec_queue_kill, .fini = execlist_exec_queue_fini, + .destroy = execlist_exec_queue_destroy, .set_priority = execlist_exec_queue_set_priority, .set_timeslice = execlist_exec_queue_set_timeslice, .set_preempt_timeout = execlist_exec_queue_set_preempt_timeout, diff --git a/drivers/gpu/drm/xe/xe_execlist_types.h b/drivers/gpu/drm/xe/xe_execlist_types.h index 415140936f11da..92c4ba52db0cb1 100644 --- a/drivers/gpu/drm/xe/xe_execlist_types.h +++ b/drivers/gpu/drm/xe/xe_execlist_types.h @@ -42,7 +42,7 @@ struct xe_execlist_exec_queue { bool has_run; - struct work_struct fini_async; + struct work_struct destroy_async; enum xe_exec_queue_priority active_priority; struct list_head active_link; diff --git a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h index a3f421e2adc03b..c30c0e3ccbbb93 100644 --- a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h @@ -35,8 +35,8 @@ struct xe_guc_exec_queue { struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE]; /** @lr_tdr: long running TDR worker */ struct work_struct lr_tdr; - /** @fini_async: do final fini async from this worker */ - struct work_struct fini_async; + /** @destroy_async: do final destroy async from this worker */ + struct work_struct destroy_async; /** @resume_time: time of last resume */ u64 resume_time; /** @state: GuC specific state for this xe_exec_queue */ diff --git 
a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index cafb47711e9b3f..93fc7b290b655f 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1277,48 +1277,57 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) return DRM_GPU_SCHED_STAT_NO_HANG; } -static void __guc_exec_queue_fini_async(struct work_struct *w) +static void guc_exec_queue_fini(struct xe_exec_queue *q) +{ + struct xe_guc_exec_queue *ge = q->guc; + struct xe_guc *guc = exec_queue_to_guc(q); + + release_guc_id(guc, q); + xe_sched_entity_fini(&ge->entity); + xe_sched_fini(&ge->sched); + + /* + * RCU free due sched being exported via DRM scheduler fences + * (timeline name). + */ + kfree_rcu(ge, rcu); +} + +static void __guc_exec_queue_destroy_async(struct work_struct *w) { struct xe_guc_exec_queue *ge = - container_of(w, struct xe_guc_exec_queue, fini_async); + container_of(w, struct xe_guc_exec_queue, destroy_async); struct xe_exec_queue *q = ge->q; struct xe_guc *guc = exec_queue_to_guc(q); xe_pm_runtime_get(guc_to_xe(guc)); trace_xe_exec_queue_destroy(q); - release_guc_id(guc, q); if (xe_exec_queue_is_lr(q)) cancel_work_sync(&ge->lr_tdr); /* Confirm no work left behind accessing device structures */ cancel_delayed_work_sync(&ge->sched.base.work_tdr); - xe_sched_entity_fini(&ge->entity); - xe_sched_fini(&ge->sched); - /* - * RCU free due sched being exported via DRM scheduler fences - * (timeline name). - */ - kfree_rcu(ge, rcu); xe_exec_queue_fini(q); + xe_pm_runtime_put(guc_to_xe(guc)); } -static void guc_exec_queue_fini_async(struct xe_exec_queue *q) +static void guc_exec_queue_destroy_async(struct xe_exec_queue *q) { struct xe_guc *guc = exec_queue_to_guc(q); struct xe_device *xe = guc_to_xe(guc); - INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async); + INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async); /* We must block on kernel engines so slabs are empty on driver unload */ if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q)) - __guc_exec_queue_fini_async(&q->guc->fini_async); + __guc_exec_queue_destroy_async(&q->guc->destroy_async); else - queue_work(xe->destroy_wq, &q->guc->fini_async); + queue_work(xe->destroy_wq, &q->guc->destroy_async); } -static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q) +static void __guc_exec_queue_destroy(struct xe_guc *guc, struct xe_exec_queue *q) { /* * Might be done from within the GPU scheduler, need to do async as we @@ -1327,7 +1336,7 @@ static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q) * this we and don't really care when everything is fini'd, just that it * is. 
*/ - guc_exec_queue_fini_async(q); + guc_exec_queue_destroy_async(q); } static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg) @@ -1341,7 +1350,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg) if (exec_queue_registered(q)) disable_scheduling_deregister(guc, q); else - __guc_exec_queue_fini(guc, q); + __guc_exec_queue_destroy(guc, q); } static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q) @@ -1574,14 +1583,14 @@ static bool guc_exec_queue_try_add_msg(struct xe_exec_queue *q, #define STATIC_MSG_CLEANUP 0 #define STATIC_MSG_SUSPEND 1 #define STATIC_MSG_RESUME 2 -static void guc_exec_queue_fini(struct xe_exec_queue *q) +static void guc_exec_queue_destroy(struct xe_exec_queue *q) { struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP; if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q)) guc_exec_queue_add_msg(q, msg, CLEANUP); else - __guc_exec_queue_fini(exec_queue_to_guc(q), q); + __guc_exec_queue_destroy(exec_queue_to_guc(q), q); } static int guc_exec_queue_set_priority(struct xe_exec_queue *q, @@ -1711,6 +1720,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = { .init = guc_exec_queue_init, .kill = guc_exec_queue_kill, .fini = guc_exec_queue_fini, + .destroy = guc_exec_queue_destroy, .set_priority = guc_exec_queue_set_priority, .set_timeslice = guc_exec_queue_set_timeslice, .set_preempt_timeout = guc_exec_queue_set_preempt_timeout, @@ -1732,7 +1742,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q) if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q)) xe_exec_queue_put(q); else if (exec_queue_destroyed(q)) - __guc_exec_queue_fini(guc, q); + __guc_exec_queue_destroy(guc, q); } if (q->guc->suspend_pending) { set_exec_queue_suspended(q); @@ -1989,7 +1999,7 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q) if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q)) xe_exec_queue_put(q); else - __guc_exec_queue_fini(guc, q); + __guc_exec_queue_destroy(guc, q); } int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len) From 413187f79062634575098653c40f95d439d3b157 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 17 Sep 2025 15:26:49 +0200 Subject: [PATCH 2024/3206] stddef: Remove token-pasting in TRAILING_OVERLAP() Currently, TRAILING_OVERLAP() token-pastes the FAM parameter into the name of the internal padding member `__offset_to_##FAM`. This forces FAM to be a single identifier, which prevents callers from using a FAM when it's a nested member. For instance, see the following scenario: | struct flex { | size_t count; | int data[]; | }; | struct foo { | int hdr_foo; | struct flex f; | }; | struct composite { | struct foo hdr; | int data[100]; | }; In this case, it'd be useful if TRAILING_OVERLAP() could be used in the following way: | struct composite { | TRAILING_OVERLAP(struct foo, hdr, f.data, | int data[100]; | ); | }; However, this is not currently possible due to the token concatenation in `__offset_to_##FAM`, which fails when FAM contains a dot. So, remove the token-pasting and use the fixed internal name `__offset_to_FAM` and, with this, expand the capabilities of TRAILING_OVERLAP(). :) Signed-off-by: Gustavo A. R.
Silva Link: https://lore.kernel.org/r/13b3e0a69aad837b4e32ca8269b9d91bf1fbe9ef.1758115257.git.gustavoars@kernel.org Signed-off-by: Kees Cook --- include/linux/stddef.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/stddef.h b/include/linux/stddef.h index dab49e2ec8c0af..701099c67c240a 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -108,7 +108,7 @@ enum { union { \ TYPE NAME; \ struct { \ - unsigned char __offset_to_##FAM[offsetof(TYPE, FAM)]; \ + unsigned char __offset_to_FAM[offsetof(TYPE, FAM)]; \ MEMBERS \ }; \ } From 2bbdcf02c3f3df6c85abd0a2fee0d7f0f4113e96 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 17 Sep 2025 15:28:07 +0200 Subject: [PATCH 2025/3206] stddef: Introduce __TRAILING_OVERLAP() Introduce the underlying __TRAILING_OVERLAP() macro to let callers apply attributes to trailing overlapping members. For instance, the code below: | struct flex { | size_t count; | int data[]; | }; | struct { | struct flex f; | struct foo a; | struct boo b; | } __packed instance; can now be changed to the following, and preserve the __packed attribute: | __TRAILING_OVERLAP(struct flex, f, data, __packed, | struct foo a; | struct boo b; | ) instance; Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/f80c529b239ce11f0a51f714fe00ddf839e05f5e.1758115257.git.gustavoars@kernel.org Signed-off-by: Kees Cook --- include/linux/stddef.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/include/linux/stddef.h b/include/linux/stddef.h index 701099c67c240a..80b6bfb944f0d2 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -94,7 +94,8 @@ enum { __DECLARE_FLEX_ARRAY(TYPE, NAME) /** - * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members. + * __TRAILING_OVERLAP() - Overlap a flexible-array member with trailing + * members. * * Creates a union between a flexible-array member (FAM) in a struct and a set * of additional members that would otherwise follow it. @@ -102,15 +103,30 @@ enum { * @TYPE: Flexible structure type name, including "struct" keyword. * @NAME: Name for a variable to define. * @FAM: The flexible-array member within @TYPE + * @ATTRS: Any struct attributes (usually empty) * @MEMBERS: Trailing overlapping members. */ -#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \ +#define __TRAILING_OVERLAP(TYPE, NAME, FAM, ATTRS, MEMBERS) \ union { \ TYPE NAME; \ struct { \ unsigned char __offset_to_FAM[offsetof(TYPE, FAM)]; \ MEMBERS \ - }; \ + } ATTRS; \ } +/** + * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members. + * + * Creates a union between a flexible-array member (FAM) in a struct and a set + * of additional members that would otherwise follow it. + * + * @TYPE: Flexible structure type name, including "struct" keyword. + * @NAME: Name for a variable to define. + * @FAM: The flexible-array member within @TYPE + * @MEMBERS: Trailing overlapping members. + */ +#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \ + __TRAILING_OVERLAP(TYPE, NAME, FAM, /* no attrs */, MEMBERS) + #endif From abe962346ef420998d47ba1c2fe591582f69e92e Mon Sep 17 00:00:00 2001 From: Igor Belwon Date: Wed, 17 Sep 2025 16:58:28 +0200 Subject: [PATCH 2026/3206] regulator: Fix MAX77838 selection The current entry for the MAX77838 regulator is unselectable (as it depended on a non-user-selectable config - REGMAP_I2C). Fix this by making it select the config instead of depending on it.
Signed-off-by: Igor Belwon Link: https://patch.msgid.link/20250917-maxreg-kconfig-fix-v1-1-1369f88d6272@mentallysanemainliners.org Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 5e451be51392ce..e252bb11ae6615 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -779,7 +779,8 @@ config REGULATOR_MAX77826 config REGULATOR_MAX77838 tristate "Maxim 77838 regulator" - depends on REGMAP_I2C + depends on I2C + select REGMAP_I2C help This driver controls a Maxim 77838 regulator via I2C bus. The regulator include 4 LDOs and a BUCK regulator. It's From c1c863457780adfb2e29fa9a85897179ad3903e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 15 Sep 2025 23:26:28 +0200 Subject: [PATCH 2027/3206] selftest/futex: Make the error check more precise for futex_numa_mpol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of just checking whether the syscall failed as expected, also check that the returned error code matches the expected one. [ bigeasy: reword the commit message ] Signed-off-by: André Almeida Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long --- .../futex/functional/futex_numa_mpol.c | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index 802c15c8219063..dd7b05e8cda45a 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -77,7 +77,7 @@ static void join_max_threads(void) } } -static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flags) +static void __test_futex(void *futex_ptr, int err_value, unsigned int futex_flags) { int to_wake, ret, i, need_exit = 0; @@ -88,11 +88,17 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag do { ret = futex2_wake(futex_ptr, to_wake, futex_flags); - if (must_fail) { - if (ret < 0) - break; - ksft_exit_fail_msg("futex2_wake(%d, 0x%x) should fail, but didn't\n", - to_wake, futex_flags); + + if (err_value) { + if (ret >= 0) + ksft_exit_fail_msg("futex2_wake(%d, 0x%x) should fail, but didn't\n", + to_wake, futex_flags); + + if (errno != err_value) + ksft_exit_fail_msg("futex2_wake(%d, 0x%x) expected error was %d, but returned %d (%s)\n", + to_wake, futex_flags, err_value, errno, strerror(errno)); + + break; } if (ret < 0) { ksft_exit_fail_msg("Failed futex2_wake(%d, 0x%x): %m\n", @@ -106,12 +112,12 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag join_max_threads(); for (i = 0; i < MAX_THREADS; i++) { - if (must_fail && thread_args[i].result != -1) { + if (err_value && thread_args[i].result != -1) { ksft_print_msg("Thread %d should fail but succeeded (%d)\n", i, thread_args[i].result); need_exit = 1; } - if (!must_fail && thread_args[i].result != 0) { + if (!err_value && thread_args[i].result != 0) { ksft_print_msg("Thread %d failed (%d)\n", i, thread_args[i].result); need_exit = 1; } @@ -120,14 +126,14 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag ksft_exit_fail_msg("Aborting due to earlier errors.\n"); } -static void test_futex(void *futex_ptr, int must_fail) +static void test_futex(void *futex_ptr, int err_value) {
- __test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); + __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); } -static void test_futex_mpol(void *futex_ptr, int must_fail) +static void test_futex_mpol(void *futex_ptr, int err_value) { - __test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); + __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); } static void usage(char *prog) @@ -184,16 +190,16 @@ int main(int argc, char *argv[]) /* FUTEX2_NUMA futex must be 8-byte aligned */ ksft_print_msg("Mis-aligned futex\n"); - test_futex(futex_ptr + mem_size - 4, 1); + test_futex(futex_ptr + mem_size - 4, EINVAL); futex_numa->numa = FUTEX_NO_NODE; mprotect(futex_ptr, mem_size, PROT_READ); ksft_print_msg("Memory, RO\n"); - test_futex(futex_ptr, 1); + test_futex(futex_ptr, EFAULT); mprotect(futex_ptr, mem_size, PROT_NONE); ksft_print_msg("Memory, no access\n"); - test_futex(futex_ptr, 1); + test_futex(futex_ptr, EFAULT); mprotect(futex_ptr, mem_size, PROT_READ | PROT_WRITE); ksft_print_msg("Memory back to RW\n"); From 2951dddef0a87c20e83c77070bd5077290d9121f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 15 Sep 2025 23:26:29 +0200 Subject: [PATCH 2028/3206] selftest/futex: Reintroduce "Memory out of range" numa_mpol's subtest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit d8e2f919997 ("selftests/futex: Fix some futex_numa_mpol subtests") removed the "Memory out of range" subtest because it depended on the memory layout of the test process happening to contain an invalid memory address just after the `*futex_ptr` allocation. Reintroduce this test and make it deterministic by allocating two memory pages and marking the second one with PROT_NONE.
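[ editor's note: a standalone userspace sketch of the deterministic guard-page layout described above; this is illustrative, not the selftest's exact code. ]

#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGE_SIZE);

	/* Two pages: the first holds the futex word... */
	char *p = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* ...the second becomes a guaranteed-faulting guard page. */
	if (mprotect(p + page, page, PROT_NONE))
		return 1;

	/*
	 * Any futex operation on (p + page) now reliably faults
	 * (EFAULT), independent of the surrounding memory layout.
	 */
	return 0;
}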
Signed-off-by: André Almeida Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long --- .../testing/selftests/futex/functional/futex_numa_mpol.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index dd7b05e8cda45a..5f4e3111269c3a 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -174,10 +174,13 @@ int main(int argc, char *argv[]) ksft_set_plan(1); mem_size = sysconf(_SC_PAGE_SIZE); - futex_ptr = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + futex_ptr = mmap(NULL, mem_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (futex_ptr == MAP_FAILED) ksft_exit_fail_msg("mmap() for %d bytes failed\n", mem_size); + /* Create an invalid memory region for the "Memory out of range" test */ + mprotect(futex_ptr + mem_size, mem_size, PROT_NONE); + futex_numa = futex_ptr; ksft_print_msg("Regular test\n"); @@ -192,6 +195,9 @@ ksft_print_msg("Mis-aligned futex\n"); test_futex(futex_ptr + mem_size - 4, EINVAL); + ksft_print_msg("Memory out of range\n"); + test_futex(futex_ptr + mem_size, EFAULT); + futex_numa->numa = FUTEX_NO_NODE; mprotect(futex_ptr, mem_size, PROT_READ); ksft_print_msg("Memory, RO\n"); From ed323aeda5e09fa1ab95946673939c8c425c329c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 15 Sep 2025 23:26:30 +0200 Subject: [PATCH 2029/3206] selftest/futex: Compile also with libnuma < 2.0.16 Since the test started using numa_set_mempolicy_home_node(), it fails to compile on systems where the libnuma library version is lower than 2.0.16. In order to allow lower library versions, add a pkg-config based version check and exclude that part of the code. Without the proper MPOL setup, it can't be tested. Make the total number of tests two: the first covers the first batch, and the second is the MPOL-related one. The goal is to let the user know if the latter has been skipped due to the library limitation. Remove test_futex_mpol(); it was unused, and the compiler now complains about it when that part of the code is not compiled.
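[ editor's note: a hedged sketch of the build-gating pattern described above; LIBNUMA_VER_SUFFICIENT is the macro the patch derives from the pkg-config probe, the ksft_*() calls are the kselftest helpers this test already uses, and run_mpol_test() is a hypothetical placeholder. ]

static void maybe_run_mpol_test(void)
{
#ifdef LIBNUMA_VER_SUFFICIENT
	/* Only compiled in when pkg-config reports libnuma >= 2.0.16. */
	run_mpol_test();	/* hypothetical; needs numa_set_mempolicy_home_node() */
	ksft_test_result_pass("futex2 MPOL hints test passed\n");
#else
	/* The skip result tells the user why the second test did not run. */
	ksft_test_result_skip("futex2 MPOL hints test requires libnuma 2.0.16+\n");
#endif
}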
Fixes: 0ecb4232fc65e ("selftests/futex: Set the home_node in futex_numa_mpol") Closes: https://lore.kernel.org/oe-lkp/202507150858.bedaf012-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) --- .../selftests/futex/functional/Makefile | 5 ++++- .../futex/functional/futex_numa_mpol.c | 21 +++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index ddfa61d857b9bf..bd50aecfca8a31 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,6 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 +PKG_CONFIG ?= pkg-config +LIBNUMA_TEST = $(shell sh -c "$(PKG_CONFIG) numa --atleast-version 2.0.16 > /dev/null 2>&1 && echo SUFFICIENT || echo NO") + INCLUDES := -I../include -I../../ $(KHDR_INCLUDES) -CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread -D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64 $(INCLUDES) $(KHDR_INCLUDES) +CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread -D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64 $(INCLUDES) $(KHDR_INCLUDES) -DLIBNUMA_VER_$(LIBNUMA_TEST)=1 LDLIBS := -lpthread -lrt -lnuma LOCAL_HDRS := \ diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index 5f4e3111269c3a..722427fe90bf0f 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -131,11 +131,6 @@ static void test_futex(void *futex_ptr, int err_value) __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); } -static void test_futex_mpol(void *futex_ptr, int err_value) -{ - __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); -} - static void usage(char *prog) { printf("Usage: %s\n", prog); @@ -148,7 +143,7 @@ static void usage(char *prog) int main(int argc, char *argv[]) { struct futex32_numa *futex_numa; - int mem_size, i; + int mem_size; void *futex_ptr; int c; @@ -171,7 +166,7 @@ int main(int argc, char *argv[]) } ksft_print_header(); - ksft_set_plan(1); + ksft_set_plan(2); mem_size = sysconf(_SC_PAGE_SIZE); futex_ptr = mmap(NULL, mem_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); @@ -211,8 +206,11 @@ int main(int argc, char *argv[]) ksft_print_msg("Memory back to RW\n"); test_futex(futex_ptr, 0); + ksft_test_result_pass("futex2 memory boundarie tests passed\n"); + /* MPOL test. 
Does not work as expected */ - for (i = 0; i < 4; i++) { +#ifdef LIBNUMA_VER_SUFFICIENT + for (int i = 0; i < 4; i++) { unsigned long nodemask; int ret; @@ -231,15 +229,16 @@ ret = futex2_wake(futex_ptr, 0, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); if (ret < 0) ksft_test_result_fail("Failed to wake 0 with MPOL: %m\n"); - if (0) - test_futex_mpol(futex_numa, 0); if (futex_numa->numa != i) { ksft_exit_fail_msg("Returned NUMA node is %d expected %d\n", futex_numa->numa, i); } } } - ksft_test_result_pass("NUMA MPOL tests passed\n"); + ksft_test_result_pass("futex2 MPOL hints test passed\n"); +#else + ksft_test_result_skip("futex2 MPOL hints test requires libnuma 2.0.16+\n"); +#endif ksft_finished(); return 0; } From 4ca24d6abbca5df76c4b189dd94fb055613de297 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Sep 2025 11:08:14 -0500 Subject: [PATCH 2030/3206] lib/crypto: sha256: Add support for 2-way interleaved hashing Many arm64 and x86_64 CPUs can compute two SHA-256 hashes at nearly the same speed as one, if the instructions are interleaved. This is because SHA-256 is serialized block-by-block, and two interleaved hashes take much better advantage of the CPU's instruction-level parallelism. Meanwhile, a very common use case for SHA-256 hashing in the Linux kernel is dm-verity and fs-verity. Both use a Merkle tree that has a fixed block size, usually 4096 bytes with an empty or 32-byte salt prepended. Usually, many blocks need to be hashed at a time. This is an ideal scenario for 2-way interleaved hashing. To enable this optimization, add a new function sha256_finup_2x() to the SHA-256 library API. It computes the hash of two equal-length messages, starting from a common initial context. For now, it always falls back to sequential processing. Later patches will wire up arm64 and x86_64 optimized implementations. Note that the interleaving factor could in principle be higher than 2x. However, that runs into many practical difficulties and CPU throughput limitations. Thus, both the implementations I'm adding are 2x. In the interest of using the simplest solution, the API matches that. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250915160819.140019-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha2.h | 28 +++++++++++++++++ lib/crypto/sha256.c | 71 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 94 insertions(+), 5 deletions(-) diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index 15e461e568cca6..e5dafb935cc88b 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -375,6 +375,34 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); */ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); +/** + * sha256_finup_2x() - Compute two SHA-256 digests from a common initial + * context. On some CPUs, this is faster than sequentially + * computing each digest. + * @ctx: an optional initial context, which may have already processed data. If + * NULL, a default initial context is used (equivalent to sha256_init()). + * @data1: data for the first message + * @data2: data for the second message + * @len: the length of each of @data1 and @data2, in bytes + * @out1: (output) the first SHA-256 message digest + * @out2: (output) the second SHA-256 message digest + * + * Context: Any context.
+ */ +void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, + const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +/** + * sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real + * interleaved implementation, as opposed to a + * sequential fallback + * @return: true if optimized + * + * Context: Any context. + */ +bool sha256_finup_2x_is_optimized(void); + /** * struct hmac_sha256_key - Prepared key for HMAC-SHA256 * @key: private diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 8fa15165d23e89..881b935418cead 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = { }, }; -static const struct sha256_block_state sha256_iv = { - .h = { - SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, - SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, +static const struct sha256_ctx initial_sha256_ctx = { + .ctx = { + .state = { + .h = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, + }, + }, + .bytecount = 0, }, }; +#define sha256_iv (initial_sha256_ctx.ctx.state) + static const u32 sha256_K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) } EXPORT_SYMBOL(sha256); -/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */ +/* + * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined) + * doesn't need either HMAC support or interleaved hashing support + */ #ifndef __DISABLE_EXPORTS + +#ifndef sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + return false; +} +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return false; +} +#endif + +/* Sequential fallback implementation of sha256_finup_2x() */ +static noinline_for_stack void sha256_finup_2x_sequential( + const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, + size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) +{ + struct __sha256_ctx mut_ctx; + + mut_ctx = *ctx; + __sha256_update(&mut_ctx, data1, len); + __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE); + + mut_ctx = *ctx; + __sha256_update(&mut_ctx, data2, len); + __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE); +} + +void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, + const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + if (ctx == NULL) + ctx = &initial_sha256_ctx; + + if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1, + out2))) + return; + sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2); +} +EXPORT_SYMBOL_GPL(sha256_finup_2x); + +bool sha256_finup_2x_is_optimized(void) +{ + return sha256_finup_2x_is_optimized_arch(); +} +EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized); + static void __hmac_sha256_preparekey(struct sha256_block_state *istate, struct sha256_block_state *ostate, const u8 *raw_key, size_t raw_key_len, From 34c3f1e346e7b2b8a420b1ea341b341525baf92b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Sep 2025 11:08:15 -0500 Subject: [PATCH 2031/3206] lib/crypto: arm64/sha256: Add support for 2-way interleaved hashing Add an implementation of sha256_finup_2x_arch() for arm64. 
It interleaves the computation of two SHA-256 hashes using the ARMv8 SHA-256 instructions. dm-verity and fs-verity will take advantage of this for greatly improved performance on capable CPUs. This increases the throughput of SHA-256 hashing 4096-byte messages by the following amounts on the following CPUs: ARM Cortex-X1: 70% ARM Cortex-X3: 68% ARM Cortex-A76: 65% ARM Cortex-A715: 43% ARM Cortex-A510: 25% ARM Cortex-A55: 8% Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250915160819.140019-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm64/sha256-ce.S | 284 ++++++++++++++++++++++++++++++++++- lib/crypto/arm64/sha256.h | 37 +++++ 2 files changed, 315 insertions(+), 6 deletions(-) diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S index b99d9589c42175..410174ba52373b 100644 --- a/lib/crypto/arm64/sha256-ce.S +++ b/lib/crypto/arm64/sha256-ce.S @@ -70,18 +70,22 @@ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + .macro load_round_constants tmp + adr_l \tmp, .Lsha2_rcon + ld1 { v0.4s- v3.4s}, [\tmp], #64 + ld1 { v4.4s- v7.4s}, [\tmp], #64 + ld1 { v8.4s-v11.4s}, [\tmp], #64 + ld1 {v12.4s-v15.4s}, [\tmp] + .endm + /* * size_t __sha256_ce_transform(struct sha256_block_state *state, * const u8 *data, size_t nblocks); */ .text SYM_FUNC_START(__sha256_ce_transform) - /* load round constants */ - adr_l x8, .Lsha2_rcon - ld1 { v0.4s- v3.4s}, [x8], #64 - ld1 { v4.4s- v7.4s}, [x8], #64 - ld1 { v8.4s-v11.4s}, [x8], #64 - ld1 {v12.4s-v15.4s}, [x8] + + load_round_constants x8 /* load state */ ld1 {dgav.4s, dgbv.4s}, [x0] @@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b ) mov x0, x2 ret SYM_FUNC_END(__sha256_ce_transform) + + .unreq dga + .unreq dgav + .unreq dgb + .unreq dgbv + .unreq t0 + .unreq t1 + .unreq dg0q + .unreq dg0v + .unreq dg1q + .unreq dg1v + .unreq dg2q + .unreq dg2v + + // parameters for sha256_ce_finup2x() + ctx .req x0 + data1 .req x1 + data2 .req x2 + len .req w3 + out1 .req x4 + out2 .req x5 + + // other scalar variables + count .req x6 + final_step .req w7 + + // x8-x9 are used as temporaries. + + // v0-v15 are used to cache the SHA-256 round constants. + // v16-v19 are used for the message schedule for the first message. + // v20-v23 are used for the message schedule for the second message. + // v24-v31 are used for the state and temporaries as given below. + // *_a are for the first message and *_b for the second. + state0_a_q .req q24 + state0_a .req v24 + state1_a_q .req q25 + state1_a .req v25 + state0_b_q .req q26 + state0_b .req v26 + state1_b_q .req q27 + state1_b .req v27 + t0_a .req v28 + t0_b .req v29 + t1_a_q .req q30 + t1_a .req v30 + t1_b_q .req q31 + t1_b .req v31 + +#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount) +#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf) +// offsetof(struct __sha256_ctx, state) is assumed to be 0. + + // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a + // and m0_b contain the current 4 message schedule words for the first + // and second message respectively. + // + // If not all the message schedule words have been computed yet, then + // this also computes 4 more message schedule words for each message. + // m1_a-m3_a contain the next 3 groups of 4 message schedule words for + // the first message, and likewise m1_b-m3_b for the second. 
After + // consuming the current value of m0_a, this macro computes the group + // after m3_a and writes it to m0_a, and likewise for *_b. This means + // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a, + // m3_a, m0_a), and likewise for *_b, so the caller must cycle through + // the registers accordingly. + .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \ + m0_b, m1_b, m2_b, m3_b + add t0_a\().4s, \m0_a\().4s, \k\().4s + add t0_b\().4s, \m0_b\().4s, \k\().4s + .if \i < 48 + sha256su0 \m0_a\().4s, \m1_a\().4s + sha256su0 \m0_b\().4s, \m1_b\().4s + sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s + sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s + .endif + mov t1_a.16b, state0_a.16b + mov t1_b.16b, state0_b.16b + sha256h state0_a_q, state1_a_q, t0_a\().4s + sha256h state0_b_q, state1_b_q, t0_b\().4s + sha256h2 state1_a_q, t1_a_q, t0_a\().4s + sha256h2 state1_b_q, t1_b_q, t0_b\().4s + .endm + + .macro do_16rounds_2x i, k0, k1, k2, k3 + do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23 + do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20 + do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21 + do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22 + .endm + +// +// void sha256_ce_finup2x(const struct __sha256_ctx *ctx, +// const u8 *data1, const u8 *data2, int len, +// u8 out1[SHA256_DIGEST_SIZE], +// u8 out2[SHA256_DIGEST_SIZE]); +// +// This function computes the SHA-256 digests of two messages |data1| and +// |data2| that are both |len| bytes long, starting from the initial context +// |ctx|. |len| must be at least SHA256_BLOCK_SIZE. +// +// The instructions for the two SHA-256 operations are interleaved. On many +// CPUs, this is almost twice as fast as hashing each message individually due +// to taking better advantage of the CPU's SHA-256 and SIMD throughput. +// +SYM_FUNC_START(sha256_ce_finup2x) + sub sp, sp, #128 + mov final_step, #0 + load_round_constants x8 + + // Load the initial state from ctx->state. + ld1 {state0_a.4s-state1_a.4s}, [ctx] + + // Load ctx->bytecount. Take the mod 64 of it to get the number of + // bytes that are buffered in ctx->buf. Also save it in a register with + // len added to it. + ldr x8, [ctx, #OFFSETOF_BYTECOUNT] + add count, x8, len, sxtw + and x8, x8, #63 + cbz x8, .Lfinup2x_enter_loop // No bytes buffered? + + // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them + // followed by the first 64 - x8 bytes of data. Since len >= 64, we + // just load 64 bytes from each of ctx->buf, data1, and data2 + // unconditionally and rearrange the data as needed. + add x9, ctx, #OFFSETOF_BUF + ld1 {v16.16b-v19.16b}, [x9] + st1 {v16.16b-v19.16b}, [sp] + + ld1 {v16.16b-v19.16b}, [data1], #64 + add x9, sp, x8 + st1 {v16.16b-v19.16b}, [x9] + ld1 {v16.4s-v19.4s}, [sp] + + ld1 {v20.16b-v23.16b}, [data2], #64 + st1 {v20.16b-v23.16b}, [x9] + ld1 {v20.4s-v23.4s}, [sp] + + sub len, len, #64 + sub data1, data1, x8 + sub data2, data2, x8 + add len, len, w8 + mov state0_b.16b, state0_a.16b + mov state1_b.16b, state1_a.16b + b .Lfinup2x_loop_have_data + +.Lfinup2x_enter_loop: + sub len, len, #64 + mov state0_b.16b, state0_a.16b + mov state1_b.16b, state1_a.16b +.Lfinup2x_loop: + // Load the next two data blocks. + ld1 {v16.4s-v19.4s}, [data1], #64 + ld1 {v20.4s-v23.4s}, [data2], #64 +.Lfinup2x_loop_have_data: + // Convert the words of the data blocks from big endian. 
+CPU_LE( rev32 v16.16b, v16.16b ) +CPU_LE( rev32 v17.16b, v17.16b ) +CPU_LE( rev32 v18.16b, v18.16b ) +CPU_LE( rev32 v19.16b, v19.16b ) +CPU_LE( rev32 v20.16b, v20.16b ) +CPU_LE( rev32 v21.16b, v21.16b ) +CPU_LE( rev32 v22.16b, v22.16b ) +CPU_LE( rev32 v23.16b, v23.16b ) +.Lfinup2x_loop_have_bswapped_data: + + // Save the original state for each block. + st1 {state0_a.4s-state1_b.4s}, [sp] + + // Do the SHA-256 rounds on each block. + do_16rounds_2x 0, v0, v1, v2, v3 + do_16rounds_2x 16, v4, v5, v6, v7 + do_16rounds_2x 32, v8, v9, v10, v11 + do_16rounds_2x 48, v12, v13, v14, v15 + + // Add the original state for each block. + ld1 {v16.4s-v19.4s}, [sp] + add state0_a.4s, state0_a.4s, v16.4s + add state1_a.4s, state1_a.4s, v17.4s + add state0_b.4s, state0_b.4s, v18.4s + add state1_b.4s, state1_b.4s, v19.4s + + // Update len and loop back if more blocks remain. + sub len, len, #64 + tbz len, #31, .Lfinup2x_loop // len >= 0? + + // Check if any final blocks need to be handled. + // final_step = 2: all done + // final_step = 1: need to do count-only padding block + // final_step = 0: need to do the block with 0x80 padding byte + tbnz final_step, #1, .Lfinup2x_done + tbnz final_step, #0, .Lfinup2x_finalize_countonly + add len, len, #64 + cbz len, .Lfinup2x_finalize_blockaligned + + // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block. + // To do this, write the padding starting with the 0x80 byte to + // &sp[64]. Then for each message, copy the last 64 data bytes to sp + // and load from &sp[64 - len] to get the needed padding block. This + // code relies on the data buffers being >= 64 bytes in length. + sub w8, len, #64 // w8 = len - 64 + add data1, data1, w8, sxtw // data1 += len - 64 + add data2, data2, w8, sxtw // data2 += len - 64 +CPU_LE( mov x9, #0x80 ) +CPU_LE( fmov d16, x9 ) +CPU_BE( movi v16.16b, #0 ) +CPU_BE( mov x9, #0x8000000000000000 ) +CPU_BE( mov v16.d[1], x9 ) + movi v17.16b, #0 + stp q16, q17, [sp, #64] + stp q17, q17, [sp, #96] + sub x9, sp, w8, sxtw // x9 = &sp[64 - len] + cmp len, #56 + b.ge 1f // will count spill into its own block? + lsl count, count, #3 +CPU_LE( rev count, count ) + str count, [x9, #56] + mov final_step, #2 // won't need count-only block + b 2f +1: + mov final_step, #1 // will need count-only block +2: + ld1 {v16.16b-v19.16b}, [data1] + st1 {v16.16b-v19.16b}, [sp] + ld1 {v16.4s-v19.4s}, [x9] + ld1 {v20.16b-v23.16b}, [data2] + st1 {v20.16b-v23.16b}, [sp] + ld1 {v20.4s-v23.4s}, [x9] + b .Lfinup2x_loop_have_data + + // Prepare a padding block, either: + // + // {0x80, 0, 0, 0, ..., count (as __be64)} + // This is for a block aligned message. + // + // { 0, 0, 0, 0, ..., count (as __be64)} + // This is for a message whose length mod 64 is >= 56. + // + // Pre-swap the endianness of the words. +.Lfinup2x_finalize_countonly: + movi v16.2d, #0 + b 1f +.Lfinup2x_finalize_blockaligned: + mov x8, #0x80000000 + fmov d16, x8 +1: + movi v17.2d, #0 + movi v18.2d, #0 + ror count, count, #29 // ror(lsl(count, 3), 32) + mov v19.d[0], xzr + mov v19.d[1], count + mov v20.16b, v16.16b + movi v21.2d, #0 + movi v22.2d, #0 + mov v23.16b, v19.16b + mov final_step, #2 + b .Lfinup2x_loop_have_bswapped_data + +.Lfinup2x_done: + // Write the two digests with all bytes in the correct order. 
+CPU_LE( rev32 state0_a.16b, state0_a.16b ) +CPU_LE( rev32 state1_a.16b, state1_a.16b ) +CPU_LE( rev32 state0_b.16b, state0_b.16b ) +CPU_LE( rev32 state1_b.16b, state1_b.16b ) + st1 {state0_a.4s-state1_a.4s}, [out1] + st1 {state0_b.4s-state1_b.4s}, [out2] + add sp, sp, #128 + ret +SYM_FUNC_END(sha256_ce_finup2x) diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h index a211966c124a96..a7bc3a90ada6b2 100644 --- a/lib/crypto/arm64/sha256.h +++ b/lib/crypto/arm64/sha256.h @@ -44,6 +44,43 @@ static void sha256_blocks(struct sha256_block_state *state, } } +static_assert(offsetof(struct __sha256_ctx, state) == 0); +static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); +static_assert(offsetof(struct __sha256_ctx, buf) == 40); +asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, int len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +#define sha256_finup_2x_arch sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + /* + * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. + * Further limit len to 65536 to avoid spending too long with preemption + * disabled. (Of course, in practice len is nearly always 4096 anyway.) + */ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE && + len <= 65536 && likely(may_use_simd())) { + kernel_neon_begin(); + sha256_ce_finup2x(ctx, data1, data2, len, out1, out2); + kernel_neon_end(); + kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); + kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); + return true; + } + return false; +} + +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return static_key_enabled(&have_ce); +} + #ifdef CONFIG_KERNEL_MODE_NEON #define sha256_mod_init_arch sha256_mod_init_arch static inline void sha256_mod_init_arch(void) From bc6d6a4172a7fa599fe66f16e9a2c1cdc6983207 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Sep 2025 11:08:16 -0500 Subject: [PATCH 2032/3206] lib/crypto: x86/sha256: Add support for 2-way interleaved hashing Add an implementation of sha256_finup_2x_arch() for x86_64. It interleaves the computation of two SHA-256 hashes using the x86 SHA-NI instructions. dm-verity and fs-verity will take advantage of this for greatly improved performance on capable CPUs. This increases the throughput of SHA-256 hashing 4096-byte messages by the following amounts on the following CPUs: Intel Ice Lake (server): 4% Intel Sapphire Rapids: 38% Intel Emerald Rapids: 38% AMD Zen 1 (Threadripper 1950X): 84% AMD Zen 4 (EPYC 9B14): 98% AMD Zen 5 (Ryzen 9 9950X): 64% For now, this seems to benefit AMD more than Intel. This seems to be because current AMD CPUs support concurrent execution of the SHA-NI instructions, but unfortunately current Intel CPUs don't, except for the sha256msg2 instruction. Hopefully future Intel CPUs will support SHA-NI on more execution ports. Zen 1 supports 2 concurrent sha256rnds2, and Zen 4 supports 4 concurrent sha256rnds2, which suggests that even better performance may be achievable on Zen 4 by interleaving more than two hashes. However, doing so poses a number of trade-offs, and furthermore Zen 5 goes back to supporting "only" 2 concurrent sha256rnds2. 
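[ editor's note: a hedged sketch of how a Merkle-tree style consumer might call the new API; sha256_init(), sha256_update() and sha256_finup_2x() match the sha2.h declarations added earlier in this series, but the salted-prefix helper itself is illustrative, not dm-verity's actual code. ]

#include <crypto/sha2.h>

static void hash_block_pair(const u8 *salt, size_t salt_len,
			    const u8 *blk1, const u8 *blk2, size_t blk_len,
			    u8 out1[SHA256_DIGEST_SIZE],
			    u8 out2[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx ctx;

	/* Absorb the common salted prefix once... */
	sha256_init(&ctx);
	sha256_update(&ctx, salt, salt_len);

	/*
	 * ...then finish both equal-length blocks from the shared
	 * context. On CPUs with an interleaved implementation this is
	 * nearly twice as fast as two sequential finishes.
	 */
	sha256_finup_2x(&ctx, blk1, blk2, blk_len, out1, out2);
}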
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250915160819.140019-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/sha256-ni-asm.S | 368 +++++++++++++++++++++++++++++++++ lib/crypto/x86/sha256.h | 39 ++++ 2 files changed, 407 insertions(+) diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S index 4bd9490ffc6621..de5f707e7ef71b 100644 --- a/lib/crypto/x86/sha256-ni-asm.S +++ b/lib/crypto/x86/sha256-ni-asm.S @@ -165,6 +165,374 @@ SYM_FUNC_START(sha256_ni_transform) RET SYM_FUNC_END(sha256_ni_transform) +#undef DIGEST_PTR +#undef DATA_PTR +#undef NUM_BLKS +#undef SHA256CONSTANTS +#undef MSG +#undef STATE0 +#undef STATE1 +#undef MSG0 +#undef MSG1 +#undef MSG2 +#undef MSG3 +#undef TMP +#undef SHUF_MASK +#undef ABEF_SAVE +#undef CDGH_SAVE + +// parameters for sha256_ni_finup2x() +#define CTX %rdi +#define DATA1 %rsi +#define DATA2 %rdx +#define LEN %ecx +#define LEN8 %cl +#define LEN64 %rcx +#define OUT1 %r8 +#define OUT2 %r9 + +// other scalar variables +#define SHA256CONSTANTS %rax +#define COUNT %r10 +#define COUNT32 %r10d +#define FINAL_STEP %r11d + +// rbx is used as a temporary. + +#define MSG %xmm0 // sha256rnds2 implicit operand +#define STATE0_A %xmm1 +#define STATE1_A %xmm2 +#define STATE0_B %xmm3 +#define STATE1_B %xmm4 +#define TMP_A %xmm5 +#define TMP_B %xmm6 +#define MSG0_A %xmm7 +#define MSG1_A %xmm8 +#define MSG2_A %xmm9 +#define MSG3_A %xmm10 +#define MSG0_B %xmm11 +#define MSG1_B %xmm12 +#define MSG2_B %xmm13 +#define MSG3_B %xmm14 +#define SHUF_MASK %xmm15 + +#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state) +#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount) +#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf) + +// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b +// contain the current 4 message schedule words for the first and second message +// respectively. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words for each message. m1_a-m3_a contain +// the next 3 groups of 4 message schedule words for the first message, and +// likewise m1_b-m3_b for the second. After consuming the current value of +// m0_a, this macro computes the group after m3_a and writes it to m0_a, and +// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the +// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must +// cycle through the registers accordingly. 
+.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b + movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A + movdqa TMP_A, TMP_B + paddd \m0_a, TMP_A + paddd \m0_b, TMP_B +.if \i < 48 + sha256msg1 \m1_a, \m0_a + sha256msg1 \m1_b, \m0_b +.endif + movdqa TMP_A, MSG + sha256rnds2 STATE0_A, STATE1_A + movdqa TMP_B, MSG + sha256rnds2 STATE0_B, STATE1_B + pshufd $0x0E, TMP_A, MSG + sha256rnds2 STATE1_A, STATE0_A + pshufd $0x0E, TMP_B, MSG + sha256rnds2 STATE1_B, STATE0_B +.if \i < 48 + movdqa \m3_a, TMP_A + movdqa \m3_b, TMP_B + palignr $4, \m2_a, TMP_A + palignr $4, \m2_b, TMP_B + paddd TMP_A, \m0_a + paddd TMP_B, \m0_b + sha256msg2 \m3_a, \m0_a + sha256msg2 \m3_b, \m0_b +.endif +.endm + +// +// void sha256_ni_finup2x(const struct __sha256_ctx *ctx, +// const u8 *data1, const u8 *data2, int len, +// u8 out1[SHA256_DIGEST_SIZE], +// u8 out2[SHA256_DIGEST_SIZE]); +// +// This function computes the SHA-256 digests of two messages |data1| and +// |data2| that are both |len| bytes long, starting from the initial context +// |ctx|. |len| must be at least SHA256_BLOCK_SIZE. +// +// The instructions for the two SHA-256 operations are interleaved. On many +// CPUs, this is almost twice as fast as hashing each message individually due +// to taking better advantage of the CPU's SHA-256 and SIMD throughput. +// +SYM_FUNC_START(sha256_ni_finup2x) + // Allocate 128 bytes of stack space, 16-byte aligned. + push %rbx + push %rbp + mov %rsp, %rbp + sub $128, %rsp + and $~15, %rsp + + // Load the shuffle mask for swapping the endianness of 32-bit words. + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + + // Set up pointer to the round constants. + lea K256+32*4(%rip), SHA256CONSTANTS + + // Initially we're not processing the final blocks. + xor FINAL_STEP, FINAL_STEP + + // Load the initial state from ctx->state. + movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA + movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE + movdqa STATE0_A, TMP_A + punpcklqdq STATE1_A, STATE0_A // FEBA + punpckhqdq TMP_A, STATE1_A // DCHG + pshufd $0x1B, STATE0_A, STATE0_A // ABEF + pshufd $0xB1, STATE1_A, STATE1_A // CDGH + + // Load ctx->bytecount. Take the mod 64 of it to get the number of + // bytes that are buffered in ctx->buf. Also save it in a register with + // LEN added to it. + mov LEN, LEN + mov OFFSETOF_BYTECOUNT(CTX), %rbx + lea (%rbx, LEN64, 1), COUNT + and $63, %ebx + jz .Lfinup2x_enter_loop // No bytes buffered? + + // %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them + // followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we + // just load 64 bytes from each of ctx->buf, DATA1, and DATA2 + // unconditionally and rearrange the data as needed. 
+ + movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A + movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A + movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A + movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A + movdqa MSG0_A, 0*16(%rsp) + movdqa MSG1_A, 1*16(%rsp) + movdqa MSG2_A, 2*16(%rsp) + movdqa MSG3_A, 3*16(%rsp) + + movdqu 0*16(DATA1), MSG0_A + movdqu 1*16(DATA1), MSG1_A + movdqu 2*16(DATA1), MSG2_A + movdqu 3*16(DATA1), MSG3_A + movdqu MSG0_A, 0*16(%rsp,%rbx) + movdqu MSG1_A, 1*16(%rsp,%rbx) + movdqu MSG2_A, 2*16(%rsp,%rbx) + movdqu MSG3_A, 3*16(%rsp,%rbx) + movdqa 0*16(%rsp), MSG0_A + movdqa 1*16(%rsp), MSG1_A + movdqa 2*16(%rsp), MSG2_A + movdqa 3*16(%rsp), MSG3_A + + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA2), MSG3_B + movdqu MSG0_B, 0*16(%rsp,%rbx) + movdqu MSG1_B, 1*16(%rsp,%rbx) + movdqu MSG2_B, 2*16(%rsp,%rbx) + movdqu MSG3_B, 3*16(%rsp,%rbx) + movdqa 0*16(%rsp), MSG0_B + movdqa 1*16(%rsp), MSG1_B + movdqa 2*16(%rsp), MSG2_B + movdqa 3*16(%rsp), MSG3_B + + sub $64, %rbx // rbx = buffered - 64 + sub %rbx, DATA1 // DATA1 += 64 - buffered + sub %rbx, DATA2 // DATA2 += 64 - buffered + add %ebx, LEN // LEN += buffered - 64 + movdqa STATE0_A, STATE0_B + movdqa STATE1_A, STATE1_B + jmp .Lfinup2x_loop_have_data + +.Lfinup2x_enter_loop: + sub $64, LEN + movdqa STATE0_A, STATE0_B + movdqa STATE1_A, STATE1_B +.Lfinup2x_loop: + // Load the next two data blocks. + movdqu 0*16(DATA1), MSG0_A + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA1), MSG1_A + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA1), MSG2_A + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA1), MSG3_A + movdqu 3*16(DATA2), MSG3_B + add $64, DATA1 + add $64, DATA2 +.Lfinup2x_loop_have_data: + // Convert the words of the data blocks from big endian. + pshufb SHUF_MASK, MSG0_A + pshufb SHUF_MASK, MSG0_B + pshufb SHUF_MASK, MSG1_A + pshufb SHUF_MASK, MSG1_B + pshufb SHUF_MASK, MSG2_A + pshufb SHUF_MASK, MSG2_B + pshufb SHUF_MASK, MSG3_A + pshufb SHUF_MASK, MSG3_B +.Lfinup2x_loop_have_bswapped_data: + + // Save the original state for each block. + movdqa STATE0_A, 0*16(%rsp) + movdqa STATE0_B, 1*16(%rsp) + movdqa STATE1_A, 2*16(%rsp) + movdqa STATE1_B, 3*16(%rsp) + + // Do the SHA-256 rounds on each block. +.irp i, 0, 16, 32, 48 + do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \ + MSG0_B, MSG1_B, MSG2_B, MSG3_B + do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \ + MSG1_B, MSG2_B, MSG3_B, MSG0_B + do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \ + MSG2_B, MSG3_B, MSG0_B, MSG1_B + do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \ + MSG3_B, MSG0_B, MSG1_B, MSG2_B +.endr + + // Add the original state for each block. + paddd 0*16(%rsp), STATE0_A + paddd 1*16(%rsp), STATE0_B + paddd 2*16(%rsp), STATE1_A + paddd 3*16(%rsp), STATE1_B + + // Update LEN and loop back if more blocks remain. + sub $64, LEN + jge .Lfinup2x_loop + + // Check if any final blocks need to be handled. + // FINAL_STEP = 2: all done + // FINAL_STEP = 1: need to do count-only padding block + // FINAL_STEP = 0: need to do the block with 0x80 padding byte + cmp $1, FINAL_STEP + jg .Lfinup2x_done + je .Lfinup2x_finalize_countonly + add $64, LEN + jz .Lfinup2x_finalize_blockaligned + + // Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block. + // To do this, write the padding starting with the 0x80 byte to + // &sp[64]. Then for each message, copy the last 64 data bytes to sp + // and load from &sp[64 - LEN] to get the needed padding block. This + // code relies on the data buffers being >= 64 bytes in length. 
+ mov $64, %ebx + sub LEN, %ebx // ebx = 64 - LEN + sub %rbx, DATA1 // DATA1 -= 64 - LEN + sub %rbx, DATA2 // DATA2 -= 64 - LEN + mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary + movd FINAL_STEP, MSG0_A + pxor MSG1_A, MSG1_A + movdqa MSG0_A, 4*16(%rsp) + movdqa MSG1_A, 5*16(%rsp) + movdqa MSG1_A, 6*16(%rsp) + movdqa MSG1_A, 7*16(%rsp) + cmp $56, LEN + jge 1f // will COUNT spill into its own block? + shl $3, COUNT + bswap COUNT + mov COUNT, 56(%rsp,%rbx) + mov $2, FINAL_STEP // won't need count-only block + jmp 2f +1: + mov $1, FINAL_STEP // will need count-only block +2: + movdqu 0*16(DATA1), MSG0_A + movdqu 1*16(DATA1), MSG1_A + movdqu 2*16(DATA1), MSG2_A + movdqu 3*16(DATA1), MSG3_A + movdqa MSG0_A, 0*16(%rsp) + movdqa MSG1_A, 1*16(%rsp) + movdqa MSG2_A, 2*16(%rsp) + movdqa MSG3_A, 3*16(%rsp) + movdqu 0*16(%rsp,%rbx), MSG0_A + movdqu 1*16(%rsp,%rbx), MSG1_A + movdqu 2*16(%rsp,%rbx), MSG2_A + movdqu 3*16(%rsp,%rbx), MSG3_A + + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA2), MSG3_B + movdqa MSG0_B, 0*16(%rsp) + movdqa MSG1_B, 1*16(%rsp) + movdqa MSG2_B, 2*16(%rsp) + movdqa MSG3_B, 3*16(%rsp) + movdqu 0*16(%rsp,%rbx), MSG0_B + movdqu 1*16(%rsp,%rbx), MSG1_B + movdqu 2*16(%rsp,%rbx), MSG2_B + movdqu 3*16(%rsp,%rbx), MSG3_B + jmp .Lfinup2x_loop_have_data + + // Prepare a padding block, either: + // + // {0x80, 0, 0, 0, ..., count (as __be64)} + // This is for a block aligned message. + // + // { 0, 0, 0, 0, ..., count (as __be64)} + // This is for a message whose length mod 64 is >= 56. + // + // Pre-swap the endianness of the words. +.Lfinup2x_finalize_countonly: + pxor MSG0_A, MSG0_A + jmp 1f + +.Lfinup2x_finalize_blockaligned: + mov $0x80000000, %ebx + movd %ebx, MSG0_A +1: + pxor MSG1_A, MSG1_A + pxor MSG2_A, MSG2_A + ror $29, COUNT + movq COUNT, MSG3_A + pslldq $8, MSG3_A + movdqa MSG0_A, MSG0_B + pxor MSG1_B, MSG1_B + pxor MSG2_B, MSG2_B + movdqa MSG3_A, MSG3_B + mov $2, FINAL_STEP + jmp .Lfinup2x_loop_have_bswapped_data + +.Lfinup2x_done: + // Write the two digests with all bytes in the correct order. 
+ movdqa STATE0_A, TMP_A + movdqa STATE0_B, TMP_B + punpcklqdq STATE1_A, STATE0_A // GHEF + punpcklqdq STATE1_B, STATE0_B + punpckhqdq TMP_A, STATE1_A // ABCD + punpckhqdq TMP_B, STATE1_B + pshufd $0xB1, STATE0_A, STATE0_A // HGFE + pshufd $0xB1, STATE0_B, STATE0_B + pshufd $0x1B, STATE1_A, STATE1_A // DCBA + pshufd $0x1B, STATE1_B, STATE1_B + pshufb SHUF_MASK, STATE0_A + pshufb SHUF_MASK, STATE0_B + pshufb SHUF_MASK, STATE1_A + pshufb SHUF_MASK, STATE1_B + movdqu STATE0_A, 1*16(OUT1) + movdqu STATE0_B, 1*16(OUT2) + movdqu STATE1_A, 0*16(OUT1) + movdqu STATE1_B, 0*16(OUT2) + + mov %rbp, %rsp + pop %rbp + pop %rbx + RET +SYM_FUNC_END(sha256_ni_finup2x) + .section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 K256: diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h index 669bc06538b67e..ecea65b608db33 100644 --- a/lib/crypto/x86/sha256.h +++ b/lib/crypto/x86/sha256.h @@ -8,6 +8,8 @@ #include #include +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni); + DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); #define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ @@ -36,11 +38,48 @@ static void sha256_blocks(struct sha256_block_state *state, static_call(sha256_blocks_x86)(state, data, nblocks); } +static_assert(offsetof(struct __sha256_ctx, state) == 0); +static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); +static_assert(offsetof(struct __sha256_ctx, buf) == 40); +asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, int len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +#define sha256_finup_2x_arch sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + /* + * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. + * Further limit len to 65536 to avoid spending too long with preemption + * disabled. (Of course, in practice len is nearly always 4096 anyway.) + */ + if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE && + len <= 65536 && likely(irq_fpu_usable())) { + kernel_fpu_begin(); + sha256_ni_finup2x(ctx, data1, data2, len, out1, out2); + kernel_fpu_end(); + kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); + kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); + return true; + } + return false; +} + +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return static_key_enabled(&have_sha_ni); +} + #define sha256_mod_init_arch sha256_mod_init_arch static inline void sha256_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha256_blocks_x86, sha256_blocks_ni); + static_branch_enable(&have_sha_ni); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { From 6733968be7cb0ce559c755f639c41629fa11761e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Sep 2025 11:08:17 -0500 Subject: [PATCH 2033/3206] lib/crypto: tests: Add tests and benchmark for sha256_finup_2x() Update sha256_kunit to include test cases and a benchmark for the new sha256_finup_2x() function. 
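For reference, a minimal usage sketch (illustrative only; the helper name
hash_pair is made up here, but the calls are the ones this series provides:
struct sha256_ctx, sha256_init(), sha256_update(), sha256_finup_2x()):

	/*
	 * Compute SHA-256 digests of two equal-length messages that share
	 * a salted prefix.  The results match hashing each message
	 * separately with sha256_update() + sha256_final(); the two
	 * computations may be interleaved internally when an optimized
	 * implementation is available, and fall back to sequential
	 * hashing otherwise.
	 */
	static void hash_pair(const u8 *salt, size_t salt_len,
			      const u8 *msg1, const u8 *msg2, size_t len,
			      u8 out1[SHA256_DIGEST_SIZE],
			      u8 out2[SHA256_DIGEST_SIZE])
	{
		struct sha256_ctx ctx;

		sha256_init(&ctx);
		sha256_update(&ctx, salt, salt_len);	/* common prefix */
		/* Finishes both messages; does not modify ctx itself. */
		sha256_finup_2x(&ctx, msg1, msg2, len, out1, out2);
	}

Passing a NULL ctx behaves like a freshly initialized context, and
sha256_finup_2x_is_optimized() reports whether the interleaved fast path
is available; both behaviors are exercised by the tests below.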
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250915160819.140019-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/tests/sha256_kunit.c | 184 ++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c index 1cd4caee6010d5..dcedfca06df658 100644 --- a/lib/crypto/tests/sha256_kunit.c +++ b/lib/crypto/tests/sha256_kunit.c @@ -5,6 +5,7 @@ #include #include "sha256-testvecs.h" +/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */ #define HASH sha256 #define HASH_CTX sha256_ctx #define HASH_SIZE SHA256_DIGEST_SIZE @@ -21,9 +22,192 @@ #define HMAC_USINGRAWKEY hmac_sha256_usingrawkey #include "hash-test-template.h" +static void free_guarded_buf(void *buf) +{ + vfree(buf); +} + +/* + * Allocate a KUnit-managed buffer that has length @len bytes immediately + * followed by an unmapped page, and assert that the allocation succeeds. + */ +static void *alloc_guarded_buf(struct kunit *test, size_t len) +{ + size_t full_len = round_up(len, PAGE_SIZE); + void *buf = vmalloc(full_len); + + KUNIT_ASSERT_NOT_NULL(test, buf); + KUNIT_ASSERT_EQ(test, 0, + kunit_add_action_or_reset(test, free_guarded_buf, buf)); + return buf + full_len - len; +} + +/* + * Test for sha256_finup_2x(). Specifically, choose various data lengths and + * salt lengths, and for each one, verify that sha256_finup_2x() produces the + * same results as sha256_update() and sha256_final(). + * + * Use guarded buffers for all inputs and outputs to reliably detect any + * out-of-bounds reads or writes, even if they occur in assembly code. + */ +static void test_sha256_finup_2x(struct kunit *test) +{ + const size_t max_data_len = 16384; + u8 *data1_buf, *data2_buf, *hash1, *hash2; + u8 expected_hash1[SHA256_DIGEST_SIZE]; + u8 expected_hash2[SHA256_DIGEST_SIZE]; + u8 salt[SHA256_BLOCK_SIZE]; + struct sha256_ctx *ctx; + + data1_buf = alloc_guarded_buf(test, max_data_len); + data2_buf = alloc_guarded_buf(test, max_data_len); + hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE); + hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE); + ctx = alloc_guarded_buf(test, sizeof(*ctx)); + + rand_bytes(data1_buf, max_data_len); + rand_bytes(data2_buf, max_data_len); + rand_bytes(salt, sizeof(salt)); + + for (size_t i = 0; i < 500; i++) { + size_t salt_len = rand_length(sizeof(salt)); + size_t data_len = rand_length(max_data_len); + const u8 *data1 = data1_buf + max_data_len - data_len; + const u8 *data2 = data2_buf + max_data_len - data_len; + struct sha256_ctx orig_ctx; + + sha256_init(ctx); + sha256_update(ctx, salt, salt_len); + orig_ctx = *ctx; + + sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2); + KUNIT_ASSERT_MEMEQ_MSG( + test, ctx, &orig_ctx, sizeof(*ctx), + "sha256_finup_2x() modified its ctx argument"); + + sha256_update(ctx, data1, data_len); + sha256_final(ctx, expected_hash1); + sha256_update(&orig_ctx, data2, data_len); + sha256_final(&orig_ctx, expected_hash2); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash1, expected_hash1, SHA256_DIGEST_SIZE, + "Wrong hash1 with salt_len=%zu data_len=%zu", salt_len, + data_len); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash2, expected_hash2, SHA256_DIGEST_SIZE, + "Wrong hash2 with salt_len=%zu data_len=%zu", salt_len, + data_len); + } +} + +/* Test sha256_finup_2x() with ctx == NULL */ +static void test_sha256_finup_2x_defaultctx(struct kunit *test) +{ + const size_t data_len = 128; + struct sha256_ctx ctx; + u8 hash1_a[SHA256_DIGEST_SIZE]; + u8 hash2_a[SHA256_DIGEST_SIZE]; + u8 
hash1_b[SHA256_DIGEST_SIZE]; + u8 hash2_b[SHA256_DIGEST_SIZE]; + + rand_bytes(test_buf, 2 * data_len); + + sha256_init(&ctx); + sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a, + hash2_a); + + sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b, + hash2_b); + + KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE); + KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE); +} + +/* + * Test that sha256_finup_2x() and sha256_update/final() produce consistent + * results with total message lengths that require more than 32 bits. + */ +static void test_sha256_finup_2x_hugelen(struct kunit *test) +{ + const size_t data_len = 4 * SHA256_BLOCK_SIZE; + struct sha256_ctx ctx = {}; + u8 expected_hash[SHA256_DIGEST_SIZE]; + u8 hash[SHA256_DIGEST_SIZE]; + + rand_bytes(test_buf, data_len); + for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) { + sha256_init(&ctx); + ctx.ctx.bytecount = 0x123456789abcd00 + align; + + sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash); + + sha256_update(&ctx, test_buf, data_len); + sha256_final(&ctx, expected_hash); + + KUNIT_ASSERT_MEMEQ(test, hash, expected_hash, + SHA256_DIGEST_SIZE); + } +} + +/* Benchmark for sha256_finup_2x() */ +static void benchmark_sha256_finup_2x(struct kunit *test) +{ + /* + * Try a few different salt lengths, since sha256_finup_2x() performance + * may vary slightly for the same data_len depending on how many bytes + * were already processed in the initial context. + */ + static const size_t salt_lens_to_test[] = { 0, 32, 64 }; + const size_t data_len = 4096; + const size_t num_iters = 4096; + struct sha256_ctx ctx; + u8 hash1[SHA256_DIGEST_SIZE]; + u8 hash2[SHA256_DIGEST_SIZE]; + + if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK)) + kunit_skip(test, "not enabled"); + if (!sha256_finup_2x_is_optimized()) + kunit_skip(test, "not relevant"); + + rand_bytes(test_buf, data_len * 2); + + /* Warm-up */ + for (size_t i = 0; i < num_iters; i++) + sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len], + data_len, hash1, hash2); + + for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) { + size_t salt_len = salt_lens_to_test[i]; + u64 t0, t1; + + /* + * Prepare the initial context. The time to process the salt is + * not measured; we're just interested in sha256_finup_2x(). + */ + sha256_init(&ctx); + sha256_update(&ctx, test_buf, salt_len); + + preempt_disable(); + t0 = ktime_get_ns(); + for (size_t j = 0; j < num_iters; j++) + sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len], + data_len, hash1, hash2); + t1 = ktime_get_ns(); + preempt_enable(); + kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s", + data_len, salt_len, + div64_u64((u64)data_len * 2 * num_iters * 1000, + t1 - t0 ?: 1)); + } +} + static struct kunit_case hash_test_cases[] = { HASH_KUNIT_CASES, + KUNIT_CASE(test_sha256_finup_2x), + KUNIT_CASE(test_sha256_finup_2x_defaultctx), + KUNIT_CASE(test_sha256_finup_2x_hugelen), KUNIT_CASE(benchmark_hash), + KUNIT_CASE(benchmark_sha256_finup_2x), {}, }; From 4bd70b53bd7d554d402b2d8f5972a54e39a2e5ab Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Sep 2025 11:08:18 -0500 Subject: [PATCH 2034/3206] fsverity: Remove inode parameter from fsverity_hash_block() Due to the conversion from crypto_shash to the library API, fsverity_hash_block() can no longer fail. Therefore, the inode parameter, which was used only to print an error message in the case of a failure, is no longer necessary. Remove it. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250915160819.140019-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/verity/enable.c | 12 +++++------- fs/verity/fsverity_private.h | 2 +- fs/verity/hash_algs.c | 3 +-- fs/verity/verify.c | 4 ++-- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 89eccc4becf902..95ec42b847972c 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -19,8 +19,7 @@ struct block_buffer { }; /* Hash a block, writing the result to the next level's pending block buffer. */ -static int hash_one_block(struct inode *inode, - const struct merkle_tree_params *params, +static int hash_one_block(const struct merkle_tree_params *params, struct block_buffer *cur) { struct block_buffer *next = cur + 1; @@ -36,8 +35,7 @@ static int hash_one_block(struct inode *inode, /* Zero-pad the block if it's shorter than the block size. */ memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); - fsverity_hash_block(params, inode, cur->data, - &next->data[next->filled]); + fsverity_hash_block(params, cur->data, &next->data[next->filled]); next->filled += params->digest_size; cur->filled = 0; return 0; @@ -123,7 +121,7 @@ static int build_merkle_tree(struct file *filp, fsverity_err(inode, "Short read of file data"); goto out; } - err = hash_one_block(inode, params, &buffers[-1]); + err = hash_one_block(params, &buffers[-1]); if (err) goto out; for (level = 0; level < num_levels; level++) { @@ -134,7 +132,7 @@ static int build_merkle_tree(struct file *filp, } /* Next block at @level is full */ - err = hash_one_block(inode, params, &buffers[level]); + err = hash_one_block(params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -154,7 +152,7 @@ static int build_merkle_tree(struct file *filp, /* Finish all nonempty pending tree blocks. */ for (level = 0; level < num_levels; level++) { if (buffers[level].filled != 0) { - err = hash_one_block(inode, params, &buffers[level]); + err = hash_one_block(params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index bc1d887c532e74..dd20b138d452fa 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -90,7 +90,7 @@ union fsverity_hash_ctx * fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size); void fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, const void *data, u8 *out); + const void *data, u8 *out); void fsverity_hash_buffer(const struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 9bb3c6344907e9..de53e14c8aa78b 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -94,7 +94,6 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, /** * fsverity_hash_block() - hash a single data or hash block * @params: the Merkle tree's parameters - * @inode: inode for which the hashing is being done * @data: virtual address of a buffer containing the block to hash * @out: output digest, size 'params->digest_size' bytes * @@ -102,7 +101,7 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, * in the Merkle tree parameters. 
*/ void fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, const void *data, u8 *out) + const void *data, u8 *out) { union fsverity_hash_ctx ctx; diff --git a/fs/verity/verify.c b/fs/verity/verify.c index affc307eb6a6b1..cbb75f7bc9bf73 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -202,7 +202,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, unsigned long hblock_idx = hblocks[level - 1].index; unsigned int hoffset = hblocks[level - 1].hoffset; - fsverity_hash_block(params, inode, haddr, real_hash); + fsverity_hash_block(params, haddr, real_hash); if (memcmp(want_hash, real_hash, hsize) != 0) goto corrupted; /* @@ -221,7 +221,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, } /* Finally, verify the data block. */ - fsverity_hash_block(params, inode, data, real_hash); + fsverity_hash_block(params, data, real_hash); if (memcmp(want_hash, real_hash, hsize) != 0) goto corrupted; return true; From a1f692fd69ccdbe1e492d366788b63227d429753 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Sep 2025 11:08:19 -0500 Subject: [PATCH 2035/3206] fsverity: Use 2-way interleaved SHA-256 hashing when supported When the crypto library provides an optimized implementation of sha256_finup_2x(), use it to interleave the hashing of pairs of data blocks. On some CPUs this nearly doubles hashing performance. The increase in overall throughput of cold-cache fsverity reads that I'm seeing on arm64 and x86_64 is roughly 35% (though this metric is hard to measure as it jumps around a lot). For now this is only done on the verification path, and only for data blocks, not Merkle tree blocks. We could use sha256_finup_2x() on Merkle tree blocks too, but that is less important as there aren't as many Merkle tree blocks as data blocks, and that would require some additional code restructuring. We could also use sha256_finup_2x() to accelerate building the Merkle tree, but verification performance is more important. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250915160819.140019-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/verity/verify.c | 173 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 139 insertions(+), 34 deletions(-) diff --git a/fs/verity/verify.c b/fs/verity/verify.c index cbb75f7bc9bf73..58f9cc610826bc 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -10,6 +10,31 @@ #include #include +#define FS_VERITY_MAX_PENDING_BLOCKS 2 + +struct fsverity_pending_block { + const void *data; + u64 pos; + u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; +}; + +struct fsverity_verification_context { + struct inode *inode; + struct fsverity_info *vi; + unsigned long max_ra_pages; + + /* + * This is the queue of data blocks that are pending verification. When + * the crypto layer supports interleaved hashing, we allow multiple + * blocks to be queued up in order to utilize it. This can improve + * performance significantly vs. sequential hashing of each block. + */ + int num_pending; + int max_pending; + struct fsverity_pending_block + pending_blocks[FS_VERITY_MAX_PENDING_BLOCKS]; +}; + static struct workqueue_struct *fsverity_read_workqueue; /* @@ -79,7 +104,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, } /* - * Verify a single data block against the file's Merkle tree. + * Verify the hash of a single data block against the file's Merkle tree. * * In principle, we need to verify the entire path to the root node. 
However, * for efficiency the filesystem may cache the hash blocks. Therefore we need @@ -88,10 +113,11 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, * * Return: %true if the data block is valid, else %false. */ -static bool -verify_data_block(struct inode *inode, struct fsverity_info *vi, - const void *data, u64 data_pos, unsigned long max_ra_pages) +static bool verify_data_block(struct inode *inode, struct fsverity_info *vi, + const struct fsverity_pending_block *dblock, + unsigned long max_ra_pages) { + const u64 data_pos = dblock->pos; const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; int level; @@ -115,8 +141,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, */ u64 hidx = data_pos >> params->log_blocksize; - /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */ - BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX); + /* + * Up to FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS pages may + * be mapped at once. + */ + static_assert(FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS <= + KM_MAX_IDX); if (unlikely(data_pos >= inode->i_size)) { /* @@ -127,7 +157,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, * any part past EOF should be all zeroes. Therefore, we need * to verify that any data blocks fully past EOF are all zeroes. */ - if (memchr_inv(data, 0, params->block_size)) { + if (memchr_inv(dblock->data, 0, params->block_size)) { fsverity_err(inode, "FILE CORRUPTED! Data past EOF is not zeroed"); return false; @@ -220,18 +250,18 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, put_page(hpage); } - /* Finally, verify the data block. */ - fsverity_hash_block(params, data, real_hash); - if (memcmp(want_hash, real_hash, hsize) != 0) + /* Finally, verify the hash of the data block. */ + if (memcmp(want_hash, dblock->real_hash, hsize) != 0) goto corrupted; return true; corrupted: - fsverity_err(inode, - "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", - data_pos, level - 1, - params->hash_alg->name, hsize, want_hash, - params->hash_alg->name, hsize, real_hash); + fsverity_err( + inode, + "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + data_pos, level - 1, params->hash_alg->name, hsize, want_hash, + params->hash_alg->name, hsize, + level == 0 ? 
dblock->real_hash : real_hash); error: for (; level > 0; level--) { kunmap_local(hblocks[level - 1].addr); @@ -240,13 +270,73 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, return false; } -static bool -verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, - unsigned long max_ra_pages) +static void +fsverity_init_verification_context(struct fsverity_verification_context *ctx, + struct inode *inode, + unsigned long max_ra_pages) { - struct inode *inode = data_folio->mapping->host; struct fsverity_info *vi = *fsverity_info_addr(inode); - const unsigned int block_size = vi->tree_params.block_size; + + ctx->inode = inode; + ctx->vi = vi; + ctx->max_ra_pages = max_ra_pages; + ctx->num_pending = 0; + if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 && + sha256_finup_2x_is_optimized()) + ctx->max_pending = 2; + else + ctx->max_pending = 1; +} + +static void +fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx) +{ + int i; + + for (i = ctx->num_pending - 1; i >= 0; i--) { + kunmap_local(ctx->pending_blocks[i].data); + ctx->pending_blocks[i].data = NULL; + } + ctx->num_pending = 0; +} + +static bool +fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx) +{ + struct fsverity_info *vi = ctx->vi; + const struct merkle_tree_params *params = &vi->tree_params; + int i; + + if (ctx->num_pending == 2) { + /* num_pending == 2 implies that the algorithm is SHA-256 */ + sha256_finup_2x(params->hashstate ? ¶ms->hashstate->sha256 : + NULL, + ctx->pending_blocks[0].data, + ctx->pending_blocks[1].data, params->block_size, + ctx->pending_blocks[0].real_hash, + ctx->pending_blocks[1].real_hash); + } else { + for (i = 0; i < ctx->num_pending; i++) + fsverity_hash_block(params, ctx->pending_blocks[i].data, + ctx->pending_blocks[i].real_hash); + } + + for (i = 0; i < ctx->num_pending; i++) { + if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i], + ctx->max_ra_pages)) + return false; + } + fsverity_clear_pending_blocks(ctx); + return true; +} + +static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx, + struct folio *data_folio, size_t len, + size_t offset) +{ + struct fsverity_info *vi = ctx->vi; + const struct merkle_tree_params *params = &vi->tree_params; + const unsigned int block_size = params->block_size; u64 pos = (u64)data_folio->index << PAGE_SHIFT; if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size))) @@ -255,14 +345,11 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, folio_test_uptodate(data_folio))) return false; do { - void *data; - bool valid; - - data = kmap_local_folio(data_folio, offset); - valid = verify_data_block(inode, vi, data, pos + offset, - max_ra_pages); - kunmap_local(data); - if (!valid) + ctx->pending_blocks[ctx->num_pending].data = + kmap_local_folio(data_folio, offset); + ctx->pending_blocks[ctx->num_pending].pos = pos + offset; + if (++ctx->num_pending == ctx->max_pending && + !fsverity_verify_pending_blocks(ctx)) return false; offset += block_size; len -= block_size; @@ -284,7 +371,15 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { - return verify_data_blocks(folio, len, offset, 0); + struct fsverity_verification_context ctx; + + fsverity_init_verification_context(&ctx, folio->mapping->host, 0); + + if (fsverity_add_data_blocks(&ctx, folio, len, offset) && + fsverity_verify_pending_blocks(&ctx)) + return true; + 
fsverity_clear_pending_blocks(&ctx);
+	return false;
 }
 EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
 
@@ -305,6 +400,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
  */
 void fsverity_verify_bio(struct bio *bio)
 {
+	struct inode *inode = bio_first_folio_all(bio)->mapping->host;
+	struct fsverity_verification_context ctx;
 	struct folio_iter fi;
 	unsigned long max_ra_pages = 0;
 
@@ -321,13 +418,21 @@ void fsverity_verify_bio(struct bio *bio)
 		max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
 	}
 
+	fsverity_init_verification_context(&ctx, inode, max_ra_pages);
+
 	bio_for_each_folio_all(fi, bio) {
-		if (!verify_data_blocks(fi.folio, fi.length, fi.offset,
-					max_ra_pages)) {
-			bio->bi_status = BLK_STS_IOERR;
-			break;
-		}
+		if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
+					      fi.offset))
+			goto ioerr;
 	}
+
+	if (!fsverity_verify_pending_blocks(&ctx))
+		goto ioerr;
+	return;
+
+ioerr:
+	fsverity_clear_pending_blocks(&ctx);
+	bio->bi_status = BLK_STS_IOERR;
 }
 EXPORT_SYMBOL_GPL(fsverity_verify_bio);
 #endif /* CONFIG_BLOCK */

From a3c73d629ea1373af3c0c954d41fd1af555492e3 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Tue, 16 Sep 2025 14:22:50 -0700
Subject: [PATCH 2036/3206] bpf: don't report verifier bug for missing bpf_scc_visit on speculative path

Syzbot generated a program that triggers a verifier_bug() call in
maybe_exit_scc().

maybe_exit_scc() assumes that, when called for a state with insn_idx in
some SCC, there should be an instance of struct bpf_scc_visit allocated
for that SCC. It turns out that the assumption does not hold for
speculative execution paths. See the example in the next patch.

maybe_exit_scc() is called from update_branch_counts() for states that
reach a branch count of zero, meaning that path exploration for a
particular path is finished.

Path exploration can finish in one of three ways:
a. A verification error is found. In this case, update_branch_counts()
   is called only for non-speculative paths.
b. Top level BPF_EXIT is reached. Such instructions are never a part of
   an SCC, so compute_scc_callchain() in maybe_exit_scc() will return
   false, and maybe_exit_scc() will return early.
c. A checkpoint is reached and matched. Checkpoints are created by
   is_state_visited(), which calls maybe_enter_scc(), which allocates
   bpf_scc_visit instances for checkpoints within SCCs.

Hence, for non-speculative symbolic execution paths, the assumption
still holds: if maybe_exit_scc() is called for a state within an SCC, a
bpf_scc_visit instance must exist. This patch removes the
verifier_bug() call for speculative paths.
Fixes: c9e31900b54c ("bpf: propagate read/precision marks over state graph backedges") Reported-by: syzbot+3afc814e8df1af64b653@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/68c85acd.050a0220.2ff435.03a4.GAE@google.com/ Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250916212251.3490455-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1029380f84db3e..beaa391e02fb09 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1950,9 +1950,24 @@ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_stat return 0; visit = scc_visit_lookup(env, callchain); if (!visit) { - verifier_bug(env, "scc exit: no visit info for call chain %s", - format_callchain(env, callchain)); - return -EFAULT; + /* + * If path traversal stops inside an SCC, corresponding bpf_scc_visit + * must exist for non-speculative paths. For non-speculative paths + * traversal stops when: + * a. Verification error is found, maybe_exit_scc() is not called. + * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member + * of any SCC. + * c. A checkpoint is reached and matched. Checkpoints are created by + * is_state_visited(), which calls maybe_enter_scc(), which allocates + * bpf_scc_visit instances for checkpoints within SCCs. + * (c) is the only case that can reach this point. + */ + if (!st->speculative) { + verifier_bug(env, "scc exit: no visit info for call chain %s", + format_callchain(env, callchain)); + return -EFAULT; + } + return 0; } if (visit->entry_state != st) return 0; From a24a2dda70fb40629d44b899452d8e4c0be075e2 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 16 Sep 2025 14:22:51 -0700 Subject: [PATCH 2037/3206] selftests/bpf: trigger verifier.c:maybe_exit_scc() for a speculative state This is a test case minimized from a syzbot reproducer from [1]. The test case triggers verifier.c:maybe_exit_scc() w/o preceding call to verifier.c:maybe_enter_scc() on a speculative symbolic execution path. Here is verifier log for the test case: Live regs before insn: 0: .......... (b7) r0 = 100 1 1: 0......... (7b) *(u64 *)(r10 -512) = r0 1 2: 0......... (b5) if r0 <= 0x0 goto pc-2 3: 0......... (95) exit 0: R1=ctx() R10=fp0 0: (b7) r0 = 100 ; R0_w=100 1: (7b) *(u64 *)(r10 -512) = r0 ; R0_w=100 R10=fp0 fp-512_w=100 2: (b5) if r0 <= 0x0 goto pc-2 mark_precise: ... 2: R0_w=100 3: (95) exit from 2 to 1 (speculative execution): R0_w=scalar() R1=ctx() R10=fp0 fp-512_w=100 1: R0_w=scalar() R1=ctx() R10=fp0 fp-512_w=100 1: (7b) *(u64 *)(r10 -512) = r0 processed 5 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 - Non-speculative execution path 0-3 does not allocate any checkpoints (and hence does not call maybe_enter_scc()), and schedules a speculative jump from 2 to 1. - Speculative execution path stops immediately because of an infinite loop detection and triggers verifier.c:update_branch_counts() -> maybe_exit_scc() calls. 
[1] https://lore.kernel.org/bpf/68c85acd.050a0220.2ff435.03a4.GAE@google.com/

Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250916212251.3490455-2-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 .../selftests/bpf/progs/verifier_loops1.c     | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/verifier_loops1.c b/tools/testing/selftests/bpf/progs/verifier_loops1.c
index e07b43b78fd210..fbdde80e7b9055 100644
--- a/tools/testing/selftests/bpf/progs/verifier_loops1.c
+++ b/tools/testing/selftests/bpf/progs/verifier_loops1.c
@@ -283,4 +283,25 @@ exit_%=:					\
 	: __clobber_all);
 }
 
+/*
+ * This test case triggered a bug in verifier.c:maybe_exit_scc().
+ * Speculative execution path reaches stack access instruction,
+ * stops and triggers maybe_exit_scc() w/o accompanying maybe_enter_scc() call.
+ */
+SEC("socket")
+__arch_x86_64
+__caps_unpriv(CAP_BPF)
+__naked void maybe_exit_scc_bug1(void)
+{
+	asm volatile (
+	"r0 = 100;"
+"1:"
+	/* Speculative execution path reaches and stops here. */
+	"*(u64 *)(r10 - 512) = r0;"
+	/* Condition is always false, but verifier speculatively executes the true branch. */
+	"if r0 <= 0x0 goto 1b;"
+	"exit;"
+	::: __clobber_all);
+}
+
 char _license[] SEC("license") = "GPL";

From 6ff4a0fa3e1b2b9756254b477fb2f0fbe04ff378 Mon Sep 17 00:00:00 2001
From: Hengqi Chen
Date: Tue, 16 Sep 2025 23:26:53 +0000
Subject: [PATCH 2038/3206] bpf, arm64: Call bpf_jit_binary_pack_finalize() in bpf_jit_free()

The current implementation seems incorrect and does NOT match the
comment above; use bpf_jit_binary_pack_finalize() instead.

Fixes: 1dad391daef1 ("bpf, arm64: use bpf_prog_pack for memory management")
Acked-by: Puranjay Mohan
Signed-off-by: Hengqi Chen
Acked-by: Song Liu
Acked-by: Puranjay Mohan
Link: https://lore.kernel.org/r/20250916232653.101004-1-hengqi.chen@gmail.com
Signed-off-by: Alexei Starovoitov
---
 arch/arm64/net/bpf_jit_comp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 008273a53e0469..e36261c6395296 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -3115,8 +3115,7 @@ void bpf_jit_free(struct bpf_prog *prog)
 	 * before freeing it.
 	 */
 	if (jit_data) {
-		bpf_arch_text_copy(&jit_data->ro_header->size, &jit_data->header->size,
-				   sizeof(jit_data->header->size));
+		bpf_jit_binary_pack_finalize(jit_data->ro_header, jit_data->header);
 		kfree(jit_data);
 	}
 	prog->bpf_func -= cfi_get_offset();

From b783a6265589783e297f8dc4647a31d870d8396e Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Wed, 17 Sep 2025 06:04:44 +0000
Subject: [PATCH 2039/3206] cpuset: move the root cpuset write check earlier

The 'cpus' or 'mems' lists of the top_cpuset cannot be modified. This
check can be moved before acquiring any locks as a common code block to
improve efficiency and maintainability.
Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index caa885823eebc7..12012ce6277726 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2338,10 +2338,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, bool force = false; int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; - /* * An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case @@ -2802,15 +2798,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, { int retval; - /* - * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) { - retval = -EACCES; - goto done; - } - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. * Since nodelist_parse() fails on an empty mask, we special case @@ -3280,6 +3267,10 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, struct cpuset *trialcs; int retval = -ENODEV; + /* root is read-only */ + if (cs == &top_cpuset) + return -EACCES; + buf = strstrip(buf); cpuset_full_lock(); if (!is_cpuset_online(cs)) From bba0ccf829b904e6d1f119d94f01d7209d34ba28 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 17 Sep 2025 06:04:45 +0000 Subject: [PATCH 2040/3206] cpuset: remove unused assignment to trialcs->partition_root_state The trialcs->partition_root_state field is not used during the configuration of 'cpuset.cpus' or 'cpuset.cpus.exclusive'. Therefore, the assignment of values to this field can be safely removed. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 12012ce6277726..bea7e6ef5d5d13 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2363,7 +2363,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * trialcs->effective_xcpus is used as a temporary cpumask * for checking validity of the partition root. */ - trialcs->partition_root_state = PRS_MEMBER; if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) compute_effective_exclusive_cpumask(trialcs, NULL, cs); } @@ -2497,7 +2496,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, return 0; if (*buf) { - trialcs->partition_root_state = PRS_MEMBER; /* * Reject the change if there is exclusive CPUs conflict with * the siblings. From 6a59fc4a3a5b19ea375b54ea70d16713c45ea5a0 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 17 Sep 2025 06:04:46 +0000 Subject: [PATCH 2041/3206] cpuset: change return type of is_partition_[in]valid to bool The functions is_partition_valid() and is_partition_invalid() logically return boolean values, but were previously declared with return type 'int'. This patch changes their return type to 'bool' to better reflect their semantic meaning and improve type safety. 
Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index bea7e6ef5d5d13..1eb76a78b19065 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -160,12 +160,12 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -static inline int is_partition_valid(const struct cpuset *cs) +static inline bool is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } -static inline int is_partition_invalid(const struct cpuset *cs) +static inline bool is_partition_invalid(const struct cpuset *cs) { return cs->partition_root_state < 0; } From 86bbbd1f33ab31a20f6cacf88660333d25ef5fa4 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 17 Sep 2025 06:04:47 +0000 Subject: [PATCH 2042/3206] cpuset: Refactor exclusive CPU mask computation logic The current compute_effective_exclusive_cpumask function handles multiple scenarios with different input parameters, making the code difficult to follow. This patch refactors it into two separate functions: compute_excpus and compute_trialcs_excpus. The compute_excpus function calculates the exclusive CPU mask for a given input and excludes exclusive CPUs from sibling cpusets when cs's exclusive_cpus is not explicitly set. The compute_trialcs_excpus function specifically handles exclusive CPU computation for trial cpusets used during CPU mask configuration updates, and always excludes exclusive CPUs from sibling cpusets. This refactoring significantly improves code readability and clarity, making it explicit which function to call for each use case and what parameters should be provided. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 103 ++++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 38 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1eb76a78b19065..2b87a4b09fc953 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1401,38 +1401,25 @@ bool cpuset_cpu_is_isolated(int cpu) } EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); -/* - * compute_effective_exclusive_cpumask - compute effective exclusive CPUs - * @cs: cpuset - * @xcpus: effective exclusive CPUs value to be set - * @real_cs: the real cpuset (can be NULL) - * Return: 0 if there is no sibling conflict, > 0 otherwise +/** + * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets + * @parent: Parent cpuset containing all siblings + * @cs: Current cpuset (will be skipped) + * @excpus: exclusive effective CPU mask to modify * - * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to - * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus - * as well. The provision of real_cs means that a cpumask is being changed and - * the given cs is a trial one. + * This function ensures the given @excpus mask doesn't include any CPUs that + * are exclusively allocated to sibling cpusets. It walks through all siblings + * of @cs under @parent and removes their exclusive CPUs from @excpus. 
 */
-static int compute_effective_exclusive_cpumask(struct cpuset *cs,
-					       struct cpumask *xcpus,
-					       struct cpuset *real_cs)
+static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
+				 struct cpumask *excpus)
 {
 	struct cgroup_subsys_state *css;
-	struct cpuset *parent = parent_cs(cs);
 	struct cpuset *sibling;
 	int retval = 0;
 
-	if (!xcpus)
-		xcpus = cs->effective_xcpus;
-
-	cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
-
-	if (!real_cs) {
-		if (!cpumask_empty(cs->exclusive_cpus))
-			return 0;
-	} else {
-		cs = real_cs;
-	}
+	if (cpumask_empty(excpus))
+		return retval;
 
 	/*
 	 * Exclude exclusive CPUs from siblings
@@ -1442,20 +1429,60 @@
 		if (sibling == cs)
 			continue;
 
-		if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
-			cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
+		if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
+			cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
 			retval++;
 			continue;
 		}
-		if (cpumask_intersects(xcpus, sibling->effective_xcpus)) {
-			cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
+		if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
+			cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
 			retval++;
 		}
 	}
 	rcu_read_unlock();
+
 	return retval;
 }
 
+/*
+ * compute_excpus - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: 0 if there is no sibling conflict, > 0 otherwise
+ *
+ * If exclusive_cpus isn't explicitly set, we have to scan the sibling cpusets
+ * and exclude their exclusive_cpus or effective_xcpus as well.
+ */
+static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)
+{
+	struct cpuset *parent = parent_cs(cs);
+
+	cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);
+
+	if (!cpumask_empty(cs->exclusive_cpus))
+		return 0;
+
+	return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
+/*
+ * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset
+ * @trialcs: The trial cpuset containing the proposed new configuration
+ * @cs: The original cpuset that the trial configuration is based on
+ * Return: 0 if successful with no sibling conflict, >0 if a conflict is found
+ *
+ * Computes the effective_xcpus for a trial configuration. @cs is provided to
+ * represent the real cpuset.
+ */
+static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)
+{
+	struct cpuset *parent = parent_cs(trialcs);
+	struct cpumask *excpus = trialcs->effective_xcpus;
+
+	cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);
+	return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
 static inline bool is_remote_partition(struct cpuset *cs)
 {
 	return !list_empty(&cs->remote_sibling);
@@ -1497,7 +1524,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
 	 * Note that creating a remote partition with any local partition root
 	 * above it or remote partition root underneath it is not allowed.
*/ - compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); + compute_excpus(cs, tmp->new_cpus); WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) @@ -1546,7 +1573,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) cs->partition_root_state = PRS_MEMBER; /* effective_xcpus may need to be changed */ - compute_effective_exclusive_cpumask(cs, NULL, NULL); + compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); @@ -1747,12 +1774,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* - * Need to call compute_effective_exclusive_cpumask() in case + * Need to call compute_excpus() in case * exclusive_cpus not set. Sibling conflict should only happen * if exclusive_cpus isn't set. */ xcpus = tmp->delmask; - if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) + if (compute_excpus(cs, xcpus)) WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); /* @@ -2034,7 +2061,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); + compute_excpus(cs, new_ecpus); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); @@ -2113,7 +2140,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * its value is being processed. */ if (remote && (cp != cs)) { - compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); + compute_excpus(cp, tmp->new_cpus); if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -2215,7 +2242,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_effective_exclusive_cpumask(cp, NULL, NULL); + compute_excpus(cp, cp->effective_xcpus); /* * Make sure effective_xcpus is properly set for a valid @@ -2364,7 +2391,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * for checking validity of the partition root. */ if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_effective_exclusive_cpumask(trialcs, NULL, cs); + compute_trialcs_excpus(trialcs, cs); } /* Nothing to do if the cpus didn't change */ @@ -2500,7 +2527,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Reject the change if there is exclusive CPUs conflict with * the siblings. */ - if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) + if (compute_trialcs_excpus(trialcs, cs)) return -EINVAL; } From c5866c9a007deb92717fc0b94ac47b47291748be Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 17 Sep 2025 06:04:48 +0000 Subject: [PATCH 2043/3206] cpuset: refactor CPU mask buffer parsing logic The current implementation contains redundant handling for empty mask inputs, as cpulist_parse() already properly handles these cases. This refactoring introduces a new helper function parse_cpuset_cpulist() to consolidate CPU list parsing logic and eliminate special-case checks for empty inputs. Additionally, the effective_xcpus computation for trial cpusets has been simplified. 
Rather than computing effective_xcpus only when exclusive_cpus is set or when the cpuset forms a valid partition, we now recalculate it on every cpuset.cpus update. This approach ensures consistency and allows removal of redundant effective_xcpus logic in subsequent patches. The trial cpuset's effective_xcpus calculation follows two distinct cases: 1. For member cpusets: effective_xcpus is determined by the intersection of cpuset->exclusive_cpus and the parent's effective_xcpus. 2. For non-member cpusets: effective_xcpus is derived from the intersection of user_xcpus and the parent's effective_xcpus. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 59 +++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2b87a4b09fc953..45784c62d1c698 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -170,6 +170,11 @@ static inline bool is_partition_invalid(const struct cpuset *cs) return cs->partition_root_state < 0; } +static inline bool cs_is_member(const struct cpuset *cs) +{ + return cs->partition_root_state == PRS_MEMBER; +} + /* * Callers should hold callback_lock to modify partition_root_state. */ @@ -1479,7 +1484,13 @@ static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) struct cpuset *parent = parent_cs(trialcs); struct cpumask *excpus = trialcs->effective_xcpus; - cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); + /* trialcs is member, cpuset.cpus has no impact to excpus */ + if (cs_is_member(cs)) + cpumask_and(excpus, trialcs->exclusive_cpus, + parent->effective_xcpus); + else + cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); + return rm_siblings_excl_cpus(parent, cs, excpus); } @@ -2349,6 +2360,19 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, rcu_read_unlock(); } +static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) +{ + int retval; + + retval = cpulist_parse(buf, out_mask); + if (retval < 0) + return retval; + if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) + return -EINVAL; + + return 0; +} + /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider @@ -2365,34 +2389,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, bool force = false; int old_prs = cs->partition_root_state; - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus. - */ - if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); - if (cpumask_empty(trialcs->exclusive_cpus)) - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); - if (retval < 0) - return retval; - - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; - - /* - * When exclusive_cpus isn't explicitly set, it is constrained - * by cpus_allowed and parent's effective_xcpus. Otherwise, - * trialcs->effective_xcpus is used as a temporary cpumask - * for checking validity of the partition root. 
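To make the subset rule concrete, here is a small illustration (plain
userspace C with 8-bit masks standing in for cpumasks; the struct and
function names are stand-ins for this sketch only, covering rules 2 and
3 for two non-exclusive siblings):

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* Bit N represents CPU N. */
	struct cs { uint8_t cpus_allowed, exclusive_cpus; };

	static bool conflict(struct cs a, struct cs b)
	{
		/* Rule 2: exclusive_cpus must not intersect. */
		if (a.exclusive_cpus & b.exclusive_cpus)
			return true;
		/*
		 * Rule 3: cpus_allowed must not be swallowed whole by a
		 * sibling's exclusive_cpus, or no CPU would be left.
		 */
		if (a.cpus_allowed && !(a.cpus_allowed & ~b.exclusive_cpus))
			return true;
		if (b.cpus_allowed && !(b.cpus_allowed & ~a.exclusive_cpus))
			return true;
		return false;
	}

	int main(void)
	{
		struct cs a = { .cpus_allowed = 0x0c };   /* CPUs 2-3 */
		struct cs b = { .exclusive_cpus = 0x0f }; /* CPUs 0-3 */

		assert(conflict(a, b));  /* {2,3} is a subset of {0-3} */
		a.cpus_allowed = 0x3c;   /* CPUs 2-5 */
		assert(!conflict(a, b)); /* CPUs 4-5 remain usable */
		return 0;
	}

Plain overlap between one cpuset's 'cpuset.cpus' and a sibling's
'cpuset.cpus.exclusive' is tolerated (the overlapping CPUs are simply
excluded from the effective mask); only a full subset, which would leave
no usable CPUs, is rejected.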
- */ - if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_trialcs_excpus(trialcs, cs); - } + retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); + if (retval < 0) + return retval; /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) @@ -2401,6 +2400,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (alloc_tmpmasks(&tmp)) return -ENOMEM; + compute_trialcs_excpus(trialcs, cs); + if (old_prs) { if (is_partition_valid(cs) && cpumask_empty(trialcs->effective_xcpus)) { From 8daab66eb329ed2fe7e2922c3739dfa53dcf4694 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 17 Sep 2025 06:04:49 +0000 Subject: [PATCH 2044/3206] cpuset: introduce cpus_excl_conflict and mems_excl_conflict helpers This patch adds cpus_excl_conflict() and mems_excl_conflict() helper functions to improve code readability and maintainability. The exclusive conflict checking follows these rules: 1. If either cpuset has the 'exclusive' flag set, their user_xcpus must not have any overlap. 2. If neither cpuset has the 'exclusive' flag set, their 'cpuset.cpus.exclusive' (only for v2) values must not intersect. 3. The 'cpuset.cpus' of one cpuset must not form a subset of another cpuset's 'cpuset.cpus.exclusive'. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 74 +++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 45784c62d1c698..8cb4d8fcf10cf2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -583,6 +583,47 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) return true; } +/** + * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * Conflict detection rules: + * 1. If either cpuset is CPU exclusive, they must be mutually exclusive + * 2. exclusive_cpus masks cannot intersect between cpusets + * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + */ +static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + /* If either cpuset is exclusive, check if they are mutually exclusive */ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return !cpusets_are_exclusive(cs1, cs2); + + /* Exclusive_cpus cannot intersect */ + if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) + return true; + + /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ + if (!cpumask_empty(cs1->cpus_allowed) && + cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) + return true; + + if (!cpumask_empty(cs2->cpus_allowed) && + cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + return true; + + return false; +} + +static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) + return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); + return false; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -664,38 +705,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - bool txset, cxset; /* Are exclusive_cpus set? 
*/ - if (c == cur) continue; - - txset = !cpumask_empty(trial->exclusive_cpus); - cxset = !cpumask_empty(c->exclusive_cpus); - if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || - (txset && cxset)) { - if (!cpusets_are_exclusive(trial, c)) - goto out; - } else if (txset || cxset) { - struct cpumask *xcpus, *acpus; - - /* - * When just one of the exclusive_cpus's is set, - * cpus_allowed of the other cpuset, if set, cannot be - * a subset of it or none of those CPUs will be - * available if these exclusive CPUs are activated. - */ - if (txset) { - xcpus = trial->exclusive_cpus; - acpus = c->cpus_allowed; - } else { - xcpus = c->exclusive_cpus; - acpus = trial->cpus_allowed; - } - if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) - goto out; - } - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) + if (cpus_excl_conflict(trial, c)) + goto out; + if (mems_excl_conflict(trial, c)) goto out; } From 7e05981ba34a214fd43a2e4d776bc6b0c235e2fb Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 17 Sep 2025 06:04:50 +0000 Subject: [PATCH 2045/3206] cpuset: refactor out validate_partition Refactor the validate_partition function to handle cpuset partition validation when modifying cpuset.cpus. This refactoring also makes the function reusable for handling cpuset.cpus.exclusive updates in subsequent patches. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 48 +++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8cb4d8fcf10cf2..2e963e2ef8eae8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2387,6 +2387,37 @@ static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) return 0; } +/** + * validate_partition - Validate a cpuset partition configuration + * @cs: The cpuset to validate + * @trialcs: The trial cpuset containing proposed configuration changes + * + * If any validation check fails, the appropriate error code is set in the + * cpuset's prs_err field. 
+ * + * Return: PRS error code (0 if valid, non-zero error code if invalid) + */ +static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) +{ + struct cpuset *parent = parent_cs(cs); + + if (cs_is_member(trialcs)) + return PERR_NONE; + + if (cpumask_empty(trialcs->effective_xcpus)) + return PERR_INVCPUS; + + if (prstate_housekeeping_conflict(trialcs->partition_root_state, + trialcs->effective_xcpus)) + return PERR_HKEEPING; + + if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) + return PERR_NOCPUS; + + return PERR_NONE; +} + + /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider @@ -2402,6 +2433,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, bool invalidate = false; bool force = false; int old_prs = cs->partition_root_state; + enum prs_errcode prs_err; retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); if (retval < 0) @@ -2416,18 +2448,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, compute_trialcs_excpus(trialcs, cs); - if (old_prs) { - if (is_partition_valid(cs) && - cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } + prs_err = validate_partition(cs, trialcs); + if (prs_err) { + invalidate = true; + cs->prs_err = prs_err; } /* From c6366739804f836ac88474527430d3fd174580eb Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 17 Sep 2025 06:04:51 +0000 Subject: [PATCH 2046/3206] cpuset: refactor cpus_allowed_validate_change Refactor cpus_allowed_validate_change to handle the special case where cpuset.cpus can be set even when violating partition sibling CPU exclusivity rules. This differs from the general validation logic in validate_change. Add a wrapper function to properly handle this exceptional case. The trialcs->prs_err field is cleared before performing validation checks for both CPU changes and partition errors. If cpus_allowed_validate_change fails its validation, trialcs->prs_err is set to PERR_NOTEXCL. If partition validation fails, the specific error code returned by validate_partition is assigned to trialcs->prs_err. With the partition validation status now directly available through trialcs->prs_err, the local boolean variable 'invalidate' becomes redundant and can be safely removed. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 84 ++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2e963e2ef8eae8..cc3837899d4d19 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2417,6 +2417,42 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri return PERR_NONE; } +static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + int retval; + struct cpuset *parent = parent_cs(cs); + + retval = validate_change(cs, trialcs); + + if ((retval == -EINVAL) && cpuset_v2()) { + struct cgroup_subsys_state *css; + struct cpuset *cp; + + /* + * The -EINVAL error code indicates that partition sibling + * CPU exclusivity rule has been violated. 
We still allow + * the cpumask change to proceed while invalidating the + * partition. However, any conflicting sibling partitions + * have to be marked as invalid too. + */ + trialcs->prs_err = PERR_NOTEXCL; + rcu_read_lock(); + cpuset_for_each_child(cp, css, parent) { + struct cpumask *xcpus = user_xcpus(trialcs); + + if (is_partition_valid(cp) && + cpumask_intersects(xcpus, cp->effective_xcpus)) { + rcu_read_unlock(); + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); + rcu_read_lock(); + } + } + rcu_read_unlock(); + retval = 0; + } + return retval; +} /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it @@ -2429,8 +2465,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; bool force = false; int old_prs = cs->partition_root_state; enum prs_errcode prs_err; @@ -2447,12 +2481,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return -ENOMEM; compute_trialcs_excpus(trialcs, cs); + trialcs->prs_err = PERR_NONE; - prs_err = validate_partition(cs, trialcs); - if (prs_err) { - invalidate = true; - cs->prs_err = prs_err; - } + if (cpus_allowed_validate_change(cs, trialcs, &tmp) < 0) + goto out_free; /* * Check all the descendants in update_cpumasks_hier() if @@ -2460,40 +2492,14 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); - retval = validate_change(cs, trialcs); - - if ((retval == -EINVAL) && cpuset_v2()) { - struct cgroup_subsys_state *css; - struct cpuset *cp; - - /* - * The -EINVAL error code indicates that partition sibling - * CPU exclusivity rule has been violated. We still allow - * the cpumask change to proceed while invalidating the - * partition. However, any conflicting sibling partitions - * have to be marked as invalid too. - */ - invalidate = true; - rcu_read_lock(); - cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = user_xcpus(trialcs); - - if (is_partition_valid(cp) && - cpumask_intersects(xcpus, cp->effective_xcpus)) { - rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); - rcu_read_lock(); - } - } - rcu_read_unlock(); - retval = 0; + prs_err = validate_partition(cs, trialcs); + if (prs_err) { + trialcs->prs_err = prs_err; + cs->prs_err = prs_err; } - if (retval < 0) - goto out_free; - if (is_partition_valid(cs) || - (is_partition_invalid(cs) && !invalidate)) { + (is_partition_invalid(cs) && !trialcs->prs_err)) { struct cpumask *xcpus = trialcs->effective_xcpus; if (cpumask_empty(xcpus) && is_partition_invalid(cs)) @@ -2504,7 +2510,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ if (is_remote_partition(cs)) remote_cpus_update(cs, NULL, xcpus, &tmp); - else if (invalidate) + else if (trialcs->prs_err) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); else From 27db8246004ad467ab36dedce847e24f9ca34b94 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 17 Sep 2025 06:04:52 +0000 Subject: [PATCH 2047/3206] cpuset: introduce partition_cpus_change Introduce the partition_cpus_change function to handle both regular CPU set updates and exclusive CPU modifications, either of which may trigger partition state changes. This generalized function will also be utilized for exclusive CPU updates in subsequent patches. 
With the introduction of compute_trialcs_excpus in a previous patch,
the trialcs->effective_xcpus field is now consistently computed and
maintained. Consequently, the legacy logic that assigned
trialcs->cpus_allowed to a local 'xcpus' variable when
trialcs->effective_xcpus was empty has been removed.

This removal is safe because when trialcs is not a partition member,
trialcs->effective_xcpus is now correctly populated with the
intersection of user_xcpus and the parent's effective_xcpus. This
calculation inherently covers the scenario previously handled by the
removed code.

Signed-off-by: Chen Ridong
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 64 +++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 26 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index cc3837899d4d19..440f570c666ce5 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2454,6 +2454,43 @@ static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
 	return retval;
 }
 
+/**
+ * partition_cpus_change - Handle partition state changes due to CPU mask updates
+ * @cs: The target cpuset being modified
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ * @tmp: Temporary masks for intermediate calculations
+ *
+ * This function handles partition state transitions triggered by CPU mask changes.
+ * CPU modifications may cause a partition to be disabled or require state updates.
+ */
+static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
+				  struct tmpmasks *tmp)
+{
+	enum prs_errcode prs_err;
+
+	if (cs_is_member(cs))
+		return;
+
+	prs_err = validate_partition(cs, trialcs);
+	if (prs_err)
+		trialcs->prs_err = cs->prs_err = prs_err;
+
+	if (is_remote_partition(cs)) {
+		if (trialcs->prs_err)
+			remote_partition_disable(cs, tmp);
+		else
+			remote_cpus_update(cs, trialcs->exclusive_cpus,
+					   trialcs->effective_xcpus, tmp);
+	} else {
+		if (trialcs->prs_err)
+			update_parent_effective_cpumask(cs, partcmd_invalidate,
+							NULL, tmp);
+		else
+			update_parent_effective_cpumask(cs, partcmd_update,
+							trialcs->effective_xcpus, tmp);
+	}
+}
+
 /**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
@@ -2467,7 +2504,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	struct tmpmasks tmp;
 	bool force = false;
 	int old_prs = cs->partition_root_state;
-	enum prs_errcode prs_err;
 
 	retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);
 	if (retval < 0)
@@ -2492,31 +2528,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	 */
 	force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
 
-	prs_err = validate_partition(cs, trialcs);
-	if (prs_err) {
-		trialcs->prs_err = prs_err;
-		cs->prs_err = prs_err;
-	}
-
-	if (is_partition_valid(cs) ||
-	    (is_partition_invalid(cs) && !trialcs->prs_err)) {
-		struct cpumask *xcpus = trialcs->effective_xcpus;
-
-		if (cpumask_empty(xcpus) && is_partition_invalid(cs))
-			xcpus = trialcs->cpus_allowed;
-
-		/*
-		 * Call remote_cpus_update() to handle valid remote partition
-		 */
-		if (is_remote_partition(cs))
-			remote_cpus_update(cs, NULL, xcpus, &tmp);
-		else if (trialcs->prs_err)
-			update_parent_effective_cpumask(cs, partcmd_invalidate,
-							NULL, &tmp);
-		else
-			update_parent_effective_cpumask(cs, partcmd_update,
-							xcpus, &tmp);
-	}
+	partition_cpus_change(cs, trialcs, &tmp);
 
 	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
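The helper boils the update path down to a single decision tree. A
standalone model in plain C, with the kernel predicates reduced to
booleans and the actions stubbed out as strings (illustrative only, not
kernel code; names mirror the patch):

#include <stdbool.h>
#include <stdio.h>

/* invalid == validate_partition() reported an error (trialcs->prs_err set) */
static const char *partition_cpus_change(bool is_member, bool invalid,
					 bool is_remote)
{
	if (is_member)
		return "return (not a partition root, nothing to do)";
	if (is_remote)
		return invalid ? "remote_partition_disable()"
			       : "remote_cpus_update()";
	return invalid ? "update_parent_effective_cpumask(partcmd_invalidate)"
		       : "update_parent_effective_cpumask(partcmd_update)";
}

int main(void)
{
	/* enumerate all eight input combinations */
	for (int m = 0; m < 2; m++)
		for (int v = 0; v < 2; v++)
			for (int r = 0; r < 2; r++)
				printf("member=%d invalid=%d remote=%d -> %s\n",
				       m, v, r, partition_cpus_change(m, v, r));
	return 0;
}

Enumerating the combinations this way makes it easy to confirm that the
helper preserves each branch of the open-coded logic it replaces.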
From de9f15e21c55a0a7d2c907b7f0eec95385c5a9de Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Wed, 17 Sep 2025 06:04:53 +0000
Subject: [PATCH 2048/3206] cpuset: use parse_cpulist for setting
 cpus.exclusive

Previous patches made parse_cpuset_cpulist() handle empty CPU mask
input. Now use this helper when setting the exclusive CPUs. Also,
compute_trialcs_excpus() can be called with an empty CPU mask and
handles it correctly.

Signed-off-by: Chen Ridong
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 440f570c666ce5..5c818b509cb8ee 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2566,27 +2566,20 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	bool force = false;
 	int old_prs = cs->partition_root_state;
 
-	if (!*buf) {
-		cpumask_clear(trialcs->exclusive_cpus);
-		cpumask_clear(trialcs->effective_xcpus);
-	} else {
-		retval = cpulist_parse(buf, trialcs->exclusive_cpus);
-		if (retval < 0)
-			return retval;
-	}
+	retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);
+	if (retval < 0)
+		return retval;
 
 	/* Nothing to do if the CPUs didn't change */
 	if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
 		return 0;
 
-	if (*buf) {
-		/*
-		 * Reject the change if there is exclusive CPUs conflict with
-		 * the siblings.
-		 */
-		if (compute_trialcs_excpus(trialcs, cs))
-			return -EINVAL;
-	}
+	/*
+	 * Reject the change if there is exclusive CPUs conflict with
+	 * the siblings.
+	 */
+	if (compute_trialcs_excpus(trialcs, cs))
+		return -EINVAL;
 
 	/*
 	 * Check all the descendants in update_cpumasks_hier() if

From c49b5e89c45f317f23d11b640f77e91d0d8e5b56 Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Wed, 17 Sep 2025 06:04:54 +0000
Subject: [PATCH 2049/3206] cpuset: use partition_cpus_change for setting
 exclusive cpus

A previous patch introduced a new helper function,
partition_cpus_change(). Now replace the exclusive CPUs setting logic
with this helper.
Signed-off-by: Chen Ridong
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 29 ++---------------------------
 1 file changed, 2 insertions(+), 27 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 5c818b509cb8ee..44231cb1d83fe8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2561,8 +2561,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 {
 	int retval;
 	struct tmpmasks tmp;
-	struct cpuset *parent = parent_cs(cs);
-	bool invalidate = false;
 	bool force = false;
 	int old_prs = cs->partition_root_state;
 
@@ -2594,32 +2592,9 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (alloc_tmpmasks(&tmp))
 		return -ENOMEM;
 
-	if (old_prs) {
-		if (cpumask_empty(trialcs->effective_xcpus)) {
-			invalidate = true;
-			cs->prs_err = PERR_INVCPUS;
-		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
-			invalidate = true;
-			cs->prs_err = PERR_HKEEPING;
-		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
-			invalidate = true;
-			cs->prs_err = PERR_NOCPUS;
-		}
+	trialcs->prs_err = PERR_NONE;
+	partition_cpus_change(cs, trialcs, &tmp);
 
-		if (is_remote_partition(cs)) {
-			if (invalidate)
-				remote_partition_disable(cs, &tmp);
-			else
-				remote_cpus_update(cs, trialcs->exclusive_cpus,
-						   trialcs->effective_xcpus, &tmp);
-		} else if (invalidate) {
-			update_parent_effective_cpumask(cs, partcmd_invalidate,
-							NULL, &tmp);
-		} else {
-			update_parent_effective_cpumask(cs, partcmd_update,
-							trialcs->effective_xcpus, &tmp);
-		}
-	}
 	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
 	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);

From bdf780fbcef5df4d365404a178e7f845a317b4e9 Mon Sep 17 00:00:00 2001
From: Huisong Li
Date: Thu, 11 Sep 2025 19:24:07 +0800
Subject: [PATCH 2050/3206] ACPI: processor: idle: Rearrange declarations in
 header file

Group all of the declarations of functions that belong to the ACPI
processor idle driver together in one place in processor.h.

While at it, drop the unnecessary extern modifier from the declarations
of two functions.

Signed-off-by: Huisong Li
Link: https://patch.msgid.link/20250911112408.1668431-3-lihuisong@huawei.com
[ rjw: Subject and changelog rewrite ]
Signed-off-by: Rafael J. Wysocki
---
 include/acpi/processor.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 2976a6d0c54f79..6ee4a69412de67 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -425,6 +425,8 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr);
 int acpi_processor_hotplug(struct acpi_processor *pr);
 void acpi_processor_register_idle_driver(void);
 void acpi_processor_unregister_idle_driver(void);
+int acpi_processor_ffh_lpi_probe(unsigned int cpu);
+int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
 #endif /* CONFIG_ACPI_PROCESSOR_IDLE */
 
 /* in processor_thermal.c */
@@ -447,11 +449,6 @@ static inline void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy)
 }
 #endif /* CONFIG_CPU_FREQ */
 
-#ifdef CONFIG_ACPI_PROCESSOR_IDLE
-extern int acpi_processor_ffh_lpi_probe(unsigned int cpu);
-extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
-#endif
-
 void acpi_processor_init_invariance_cppc(void);
 
 #endif

From a8ba87f04ca9cdec06776ce92dce1395026dc3bb Mon Sep 17 00:00:00 2001
From: Hangbin Liu
Date: Tue, 16 Sep 2025 08:01:26 +0000
Subject: [PATCH 2051/3206] bonding: don't set oif to bond dev when getting NS
 target destination

Unlike IPv4, IPv6 routing strictly requires the source address to be
valid on the outgoing interface. If the NS target is set to a remote
VLAN interface, and the source address is also configured on a VLAN
over a bond interface, setting the oif to the bond device will fail to
retrieve the correct destination route.

Fix this by not setting the oif to the bond device when retrieving the
NS target destination. This allows the correct destination device (the
VLAN interface) to be determined, so that bond_verify_device_path can
return the proper VLAN tags for sending NS messages.

Reported-by: David Wilder
Closes: https://lore.kernel.org/netdev/aGOKggdfjv0cApTO@fedora/
Suggested-by: Jay Vosburgh
Tested-by: David Wilder
Acked-by: Jay Vosburgh
Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets")
Signed-off-by: Hangbin Liu
Link: https://patch.msgid.link/20250916080127.430626-1-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/bonding/bond_main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8832bc9f107bc0..57be04f6cb11a8 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3356,7 +3356,6 @@ static void bond_ns_send_all(struct bonding *bond, struct slave *slave)
 		/* Find out through which dev should the packet go */
 		memset(&fl6, 0, sizeof(struct flowi6));
 		fl6.daddr = targets[i];
-		fl6.flowi6_oif = bond->dev->ifindex;
 
 		dst = ip6_route_output(dev_net(bond->dev), NULL, &fl6);
 		if (dst->error) {

From dc5f94b1ec8f9f65d1ad7a5f45fcac740544e35f Mon Sep 17 00:00:00 2001
From: Hangbin Liu
Date: Tue, 16 Sep 2025 08:01:27 +0000
Subject: [PATCH 2052/3206] selftests: bonding: add vlan over bond testing

Add a VLAN over bond test to make sure the arp/ns targets work. Also
change all the configs to modules.
Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250916080127.430626-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- .../drivers/net/bonding/bond_options.sh | 58 +++++++++++++++++++ .../selftests/drivers/net/bonding/config | 1 + 2 files changed, 59 insertions(+) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index e3f3cc803b562c..187b478d0ddf29 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -8,6 +8,7 @@ ALL_TESTS=" arp_validate num_grat_arp fail_over_mac + vlan_over_bond " lib_dir=$(dirname "$0") @@ -508,7 +509,64 @@ fail_over_mac() log_test "fail_over_mac 2" "failover: backup slave mac inherit" check_first_slave_random_mac log_test "fail_over_mac 2" "first slave mac random" +} + +vlan_over_bond_arp() +{ + local mode="$1" + RET=0 + + bond_reset "mode $mode arp_interval 100 arp_ip_target 192.0.3.10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 101 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond arp" "$mode" +} + +vlan_over_bond_ns() +{ + local mode="$1" + RET=0 + + if skip_ns; then + log_test_skip "vlan_over_bond ns" "$mode" + return 0 + fi + bond_reset "mode $mode arp_interval 100 ns_ip6_target 2001:db8::3:10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 102 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond ns" "$mode" +} + +vlan_over_bond() +{ + # add vlan 3 for client + ip -n "${c_ns}" link add eth0.3 link eth0 type vlan id 3 + ip -n "${c_ns}" link set eth0.3 up + ip -n "${c_ns}" addr add 192.0.3.10/24 dev eth0.3 + ip -n "${c_ns}" addr add 2001:db8::3:10/64 dev eth0.3 + + # Add tc rule to check the vlan pkts + tc -n "${c_ns}" qdisc add dev eth0.3 clsact + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol arp \ + handle 101 flower skip_hw arp_op request \ + arp_sip 192.0.3.1 arp_tip 192.0.3.10 action pass + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol ipv6 \ + handle 102 flower skip_hw ip_proto icmpv6 \ + type 135 src_ip 2001:db8::3:1 action pass + + vlan_over_bond_arp "active-backup" + vlan_over_bond_ns "active-backup" } trap cleanup EXIT diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 4d16a69ffc6507..832fa1caeb6627 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -10,3 +10,4 @@ CONFIG_NET_CLS_MATCHALL=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y CONFIG_VETH=y +CONFIG_VLAN_8021Q=m From dc3382fffdec2c1d6df5836c88fa37b39cd8651e Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Tue, 16 Sep 2025 15:58:16 +0800 Subject: [PATCH 2053/3206] tracing: kprobe-event: Fix null-ptr-deref in trace_kprobe_create_internal() A crash was observed with the following output: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range 
[0x0000000000000000-0x0000000000000007]
CPU: 1 UID: 0 PID: 2899 Comm: syz.2.399 Not tainted 6.17.0-rc5+ #5 PREEMPT(none)
RIP: 0010:trace_kprobe_create_internal+0x3fc/0x1440 kernel/trace/trace_kprobe.c:911
Call Trace:
 trace_kprobe_create_cb+0xa2/0xf0 kernel/trace/trace_kprobe.c:1089
 trace_probe_create+0xf1/0x110 kernel/trace/trace_probe.c:2246
 dyn_event_create+0x45/0x70 kernel/trace/trace_dynevent.c:128
 create_or_delete_trace_kprobe+0x5e/0xc0 kernel/trace/trace_kprobe.c:1107
 trace_parse_run_command+0x1a5/0x330 kernel/trace/trace.c:10785
 vfs_write+0x2b6/0xd00 fs/read_write.c:684
 ksys_write+0x129/0x240 fs/read_write.c:738
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x5d/0x2d0 arch/x86/entry/syscall_64.c:94

kmemdup() may return NULL in trace_kprobe_create_internal(), so add a
check for its return value.

Link: https://lore.kernel.org/all/20250916075816.3181175-1-wangliang74@huawei.com/
Fixes: 33b4e38baa03 ("tracing: kprobe-event: Allocate string buffers from heap")
Signed-off-by: Wang Liang
Signed-off-by: Masami Hiramatsu (Google)
---
 kernel/trace/trace_kprobe.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ccae62d4fb9177..fa60362a3f31bd 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -908,6 +908,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
 			return -EINVAL;
 		}
 		buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
 		buf[len] = '\0';
 		ret = kstrtouint(buf, 0, &maxactive);
 		if (ret || !maxactive) {

From a72175c985132885573593222a7b088cf49b07ae Mon Sep 17 00:00:00 2001
From: Sathesh B Edara
Date: Tue, 16 Sep 2025 06:32:07 -0700
Subject: [PATCH 2054/3206] octeon_ep: fix VF MAC address lifecycle handling

Currently, VF MAC address info is not updated when the MAC address is
configured from the VF, and it is not cleared when the VF is removed.
This leads to stale or missing MAC information in the PF, which may
cause incorrect state tracking or inconsistencies when VFs are
hot-plugged or reassigned.

Fix this by:
- storing the VF MAC address in the PF when it is set from the VF
- clearing the stored VF MAC address when the VF is removed

This ensures that the PF always has correct VF MAC state.
Fixes: cde29af9e68e ("octeon_ep: add PF-VF mailbox communication")
Signed-off-by: Sathesh B Edara
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250916133207.21737-1-sedara@marvell.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
index ebecdd29f3bd05..0867fab61b1905 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
@@ -196,6 +196,7 @@ static void octep_pfvf_get_mac_addr(struct octep_device *oct, u32 vf_id,
 			   vf_id);
 		return;
 	}
+	ether_addr_copy(oct->vf_info[vf_id].mac_addr, rsp->s_set_mac.mac_addr);
 	rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK;
 }
 
@@ -205,6 +206,8 @@
 {
 	int err;
 
+	/* Reset VF-specific information maintained by the PF */
+	memset(&oct->vf_info[vf_id], 0, sizeof(struct octep_pfvf_info));
 	err = octep_ctrl_net_dev_remove(oct, vf_id);
 	if (err) {
 		rsp->s.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK;

From f3243bed39c26ce0f13e6392a634f91d409b2d02 Mon Sep 17 00:00:00 2001
From: Junhui Liu
Date: Tue, 22 Jul 2025 00:53:10 +0800
Subject: [PATCH 2055/3206] riscv: mm: Return intended SATP mode for noXlvl
 options

Make match_noXlvl() return the SATP mode that will be used, rather than
the mode being disabled. This unifies the return-value handling with
that of the function that obtains the mmu-type from the FDT, avoiding
an extra conversion.

This is only a change in naming, with no functional impact.

Signed-off-by: Junhui Liu
Reviewed-by: Alexandre Ghiti
Reviewed-by: Nutty Liu
Link: https://lore.kernel.org/r/20250722-satp-from-fdt-v1-1-5ba22218fa5f@pigmoral.tech
Signed-off-by: Paul Walmsley
---
 arch/riscv/kernel/pi/cmdline_early.c | 4 ++--
 arch/riscv/mm/init.c                 | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/riscv/kernel/pi/cmdline_early.c b/arch/riscv/kernel/pi/cmdline_early.c
index fbcdc9e4e14322..389d086a071876 100644
--- a/arch/riscv/kernel/pi/cmdline_early.c
+++ b/arch/riscv/kernel/pi/cmdline_early.c
@@ -41,9 +41,9 @@ static char *get_early_cmdline(uintptr_t dtb_pa)
 static u64 match_noXlvl(char *cmdline)
 {
 	if (strstr(cmdline, "no4lvl"))
-		return SATP_MODE_48;
+		return SATP_MODE_39;
 	else if (strstr(cmdline, "no5lvl"))
-		return SATP_MODE_57;
+		return SATP_MODE_48;
 
 	return 0;
 }
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 15683ae13fa5d1..054265b3f26803 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -864,9 +864,9 @@ static __init void set_satp_mode(uintptr_t dtb_pa)
 
 	kernel_map.page_offset = PAGE_OFFSET_L5;
 
-	if (satp_mode_cmdline == SATP_MODE_57) {
+	if (satp_mode_cmdline == SATP_MODE_48) {
 		disable_pgtable_l5();
-	} else if (satp_mode_cmdline == SATP_MODE_48) {
+	} else if (satp_mode_cmdline == SATP_MODE_39) {
 		disable_pgtable_l5();
 		disable_pgtable_l4();
 		return;

From c3a45c5fde95d319125812f629c579cc63c69941 Mon Sep 17 00:00:00 2001
From: Svyatoslav Ryhel
Date: Fri, 12 Sep 2025 09:51:46 +0300
Subject: [PATCH 2056/3206] dt-bindings: power: supply: bq24190: document
 charge enable pin

Document the active-low Charge Enable (CE) pin. Battery charging is
enabled when REG01[5:4] = 01 and the CE pin is low. The CE pin must be
pulled either high or low.
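On the driver side, a consumer of this binding would typically request
the pin through the gpiod API. A hedged probe-fragment sketch (the
actual bq24190 driver code may differ; only the binding text above is
authoritative):

#include <linux/gpio/consumer.h>

	/* in probe(), 'dev' being the charger's struct device */
	struct gpio_desc *ce;

	ce = devm_gpiod_get_optional(dev, "ce", GPIOD_OUT_LOW);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	/*
	 * The binding marks CE as active low, so with GPIO_ACTIVE_LOW in
	 * the DT the logical value 1 ("asserted") drives the line
	 * physically low, which together with REG01[5:4] = 01 enables
	 * charging.
	 */
	if (ce)
		gpiod_set_value_cansleep(ce, 1);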
Signed-off-by: Svyatoslav Ryhel Reviewed-by: Rob Herring (Arm) Signed-off-by: Sebastian Reichel --- Documentation/devicetree/bindings/power/supply/bq24190.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/power/supply/bq24190.yaml b/Documentation/devicetree/bindings/power/supply/bq24190.yaml index ac9a76fc5876be..938554a9fb02c2 100644 --- a/Documentation/devicetree/bindings/power/supply/bq24190.yaml +++ b/Documentation/devicetree/bindings/power/supply/bq24190.yaml @@ -30,6 +30,12 @@ properties: interrupts: maxItems: 1 + ce-gpios: + description: + Active low Charge Enable pin. Battery charging is enabled when + REG01[5:4] = 01 and CE pin is Low. CE pin must be pulled high or low. + maxItems: 1 + usb-otg-vbus: $ref: /schemas/regulator/regulator.yaml# description: | From 1bafaa156ed3881cd4f187ab1c43e408742e1f11 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 9 Sep 2025 10:09:25 +0800 Subject: [PATCH 2057/3206] power: supply: rx51: remove redundant condition checks Remove redundant condition checks and replace else if with else. Signed-off-by: Xichao Zhao Signed-off-by: Sebastian Reichel --- drivers/power/supply/rx51_battery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/power/supply/rx51_battery.c b/drivers/power/supply/rx51_battery.c index 7cdcd415e8684d..b0220ec2d92661 100644 --- a/drivers/power/supply/rx51_battery.c +++ b/drivers/power/supply/rx51_battery.c @@ -116,7 +116,7 @@ static int rx51_battery_read_temperature(struct rx51_device_info *di) int mid = (max + min) / 2; if (rx51_temp_table2[mid] <= raw) min = mid; - else if (rx51_temp_table2[mid] > raw) + else max = mid; if (rx51_temp_table2[mid] == raw) break; From d69ae81efbc95c94a2760fc82d27cdab4c26fe76 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 17 Sep 2025 18:15:14 +0800 Subject: [PATCH 2058/3206] power: supply: core: Add resistance power supply property Some battery drivers provide the ability to export internal resistance as a parameter. Add internal_resistance power supply property for that purpose. Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- Documentation/ABI/testing/sysfs-class-power | 16 ++++++++++++++++ drivers/power/supply/power_supply_sysfs.c | 1 + include/linux/power_supply.h | 1 + 3 files changed, 18 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index 87a058e14e7edd..e6cce423c632f0 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -553,6 +553,22 @@ Description: Integer > 0: representing full cycles Integer = 0: cycle_count info is not available +What: /sys/class/power_supply//internal_resistance +Date: August 2025 +Contact: linux-arm-msm@vger.kernel.org +Description: + Represent the battery's internal resistance, often referred + to as Equivalent Series Resistance (ESR). It is a dynamic + parameter that reflects the opposition to current flow within + the cell. It is not a fixed value but varies significantly + based on several operational conditions, including battery + state of charge (SoC), temperature, and whether the battery + is in a charging or discharging state. 
+ + Access: Read + + Valid values: Represented in microohms + **USB Properties** What: /sys/class/power_supply//input_current_limit diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index 18e5e84a81c634..8ba08d456352b9 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -223,6 +223,7 @@ static struct power_supply_attr power_supply_attrs[] __ro_after_init = { POWER_SUPPLY_ATTR(MANUFACTURE_YEAR), POWER_SUPPLY_ATTR(MANUFACTURE_MONTH), POWER_SUPPLY_ATTR(MANUFACTURE_DAY), + POWER_SUPPLY_ATTR(INTERNAL_RESISTANCE), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(MODEL_NAME), POWER_SUPPLY_ATTR(MANUFACTURER), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index f21f806bfb3831..f38da7c039d205 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -176,6 +176,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_MANUFACTURE_YEAR, POWER_SUPPLY_PROP_MANUFACTURE_MONTH, POWER_SUPPLY_PROP_MANUFACTURE_DAY, + POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, /* Properties of type `const char *' */ POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, From cd93fbdce5981c947f22015ded3ac6bd1939b0ad Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 17 Sep 2025 18:15:15 +0800 Subject: [PATCH 2059/3206] power: supply: core: Add state_of_health power supply property Add state_of_health power supply property to represent battery health percentage. Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- Documentation/ABI/testing/sysfs-class-power | 21 +++++++++++++++++++++ drivers/power/supply/power_supply_sysfs.c | 1 + include/linux/power_supply.h | 1 + 3 files changed, 23 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index e6cce423c632f0..4b21d5d2325136 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -569,6 +569,27 @@ Description: Valid values: Represented in microohms +What: /sys/class/power_supply//state_of_health +Date: August 2025 +Contact: linux-arm-msm@vger.kernel.org +Description: + The state_of_health parameter quantifies the overall condition + of a battery as a percentage, reflecting its ability to deliver + rated performance relative to its original specifications. It is + dynamically computed using a combination of learned capacity + and impedance-based degradation indicators, both of which evolve + over the battery's lifecycle. + Note that the exact algorithms are kept secret by most battery + vendors and the value from different battery vendors cannot be + compared with each other as there is no vendor-agnostic definition + of "performance". Also this usually cannot be used for any + calculations (i.e. this is not the factor between charge_full and + charge_full_design). 
+ + Access: Read + + Valid values: 0 - 100 (percent) + **USB Properties** What: /sys/class/power_supply//input_current_limit diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index 8ba08d456352b9..198405f7126f96 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -224,6 +224,7 @@ static struct power_supply_attr power_supply_attrs[] __ro_after_init = { POWER_SUPPLY_ATTR(MANUFACTURE_MONTH), POWER_SUPPLY_ATTR(MANUFACTURE_DAY), POWER_SUPPLY_ATTR(INTERNAL_RESISTANCE), + POWER_SUPPLY_ATTR(STATE_OF_HEALTH), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(MODEL_NAME), POWER_SUPPLY_ATTR(MANUFACTURER), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index f38da7c039d205..360ffdf272dab8 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -177,6 +177,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_MANUFACTURE_MONTH, POWER_SUPPLY_PROP_MANUFACTURE_DAY, POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, + POWER_SUPPLY_PROP_STATE_OF_HEALTH, /* Properties of type `const char *' */ POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, From 45e57e6a213448f0b372f9cbd3f90f301f675c9b Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 17 Sep 2025 18:15:16 +0800 Subject: [PATCH 2060/3206] power: supply: qcom_battmgr: Add resistance power supply property Add power supply property to get battery internal resistance from the battery management firmware. Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- drivers/power/supply/qcom_battmgr.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index fdb2d1b883fc58..fc3058d038b14e 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2019-2020, The Linux Foundation. All rights reserved. * Copyright (c) 2022, Linaro Ltd + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
*/ #include #include @@ -254,6 +255,7 @@ struct qcom_battmgr_status { unsigned int voltage_now; unsigned int voltage_ocv; unsigned int temperature; + unsigned int resistance; unsigned int discharge_time; unsigned int charge_time; @@ -418,6 +420,7 @@ static const u8 sm8350_bat_prop_map[] = { [POWER_SUPPLY_PROP_MODEL_NAME] = BATT_MODEL_NAME, [POWER_SUPPLY_PROP_TIME_TO_FULL_AVG] = BATT_TTF_AVG, [POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG] = BATT_TTE_AVG, + [POWER_SUPPLY_PROP_INTERNAL_RESISTANCE] = BATT_RESISTANCE, [POWER_SUPPLY_PROP_POWER_NOW] = BATT_POWER_NOW, }; @@ -584,6 +587,9 @@ static int qcom_battmgr_bat_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_TEMP: val->intval = battmgr->status.temperature; break; + case POWER_SUPPLY_PROP_INTERNAL_RESISTANCE: + val->intval = battmgr->status.resistance; + break; case POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG: val->intval = battmgr->status.discharge_time; break; @@ -668,6 +674,7 @@ static const enum power_supply_property sm8350_bat_props[] = { POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_TIME_TO_FULL_AVG, POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG, + POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, POWER_SUPPLY_PROP_POWER_NOW, }; @@ -1200,6 +1207,9 @@ static void qcom_battmgr_sm8350_callback(struct qcom_battmgr *battmgr, case BATT_TTE_AVG: battmgr->status.discharge_time = le32_to_cpu(resp->intval.value); break; + case BATT_RESISTANCE: + battmgr->status.resistance = le32_to_cpu(resp->intval.value); + break; case BATT_POWER_NOW: battmgr->status.power_now = le32_to_cpu(resp->intval.value); break; From b8e5030e09c11a47b7dadd28b492ec00b40a1b8c Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 17 Sep 2025 18:15:17 +0800 Subject: [PATCH 2061/3206] power: supply: qcom_battmgr: Add state_of_health property Add state_of_health property to read battery health percentage from battery management firmware. 
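Both new attributes can then be read from userspace through sysfs; a
minimal sketch (the supply name qcom-battmgr-bat is the one this driver
registers, and the files only appear when the firmware reports the
corresponding properties):

#include <stdio.h>

int main(void)
{
	static const char * const props[] = {
		/* percent, 0 - 100 */
		"/sys/class/power_supply/qcom-battmgr-bat/state_of_health",
		/* microohms */
		"/sys/class/power_supply/qcom-battmgr-bat/internal_resistance",
	};
	char buf[32];

	for (unsigned int i = 0; i < 2; i++) {
		FILE *f = fopen(props[i], "r");

		if (!f) {
			perror(props[i]);
			continue;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("%s: %s", props[i], buf);
		fclose(f);
	}
	return 0;
}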
Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- drivers/power/supply/qcom_battmgr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index fc3058d038b14e..784e3b0110bdd9 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -256,6 +256,7 @@ struct qcom_battmgr_status { unsigned int voltage_ocv; unsigned int temperature; unsigned int resistance; + unsigned int soh_percent; unsigned int discharge_time; unsigned int charge_time; @@ -421,6 +422,7 @@ static const u8 sm8350_bat_prop_map[] = { [POWER_SUPPLY_PROP_TIME_TO_FULL_AVG] = BATT_TTF_AVG, [POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG] = BATT_TTE_AVG, [POWER_SUPPLY_PROP_INTERNAL_RESISTANCE] = BATT_RESISTANCE, + [POWER_SUPPLY_PROP_STATE_OF_HEALTH] = BATT_SOH, [POWER_SUPPLY_PROP_POWER_NOW] = BATT_POWER_NOW, }; @@ -590,6 +592,9 @@ static int qcom_battmgr_bat_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_INTERNAL_RESISTANCE: val->intval = battmgr->status.resistance; break; + case POWER_SUPPLY_PROP_STATE_OF_HEALTH: + val->intval = battmgr->status.soh_percent; + break; case POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG: val->intval = battmgr->status.discharge_time; break; @@ -675,6 +680,7 @@ static const enum power_supply_property sm8350_bat_props[] = { POWER_SUPPLY_PROP_TIME_TO_FULL_AVG, POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG, POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, + POWER_SUPPLY_PROP_STATE_OF_HEALTH, POWER_SUPPLY_PROP_POWER_NOW, }; @@ -1167,6 +1173,9 @@ static void qcom_battmgr_sm8350_callback(struct qcom_battmgr *battmgr, case BATT_CAPACITY: battmgr->status.percent = le32_to_cpu(resp->intval.value) / 100; break; + case BATT_SOH: + battmgr->status.soh_percent = le32_to_cpu(resp->intval.value); + break; case BATT_VOLT_OCV: battmgr->status.voltage_ocv = le32_to_cpu(resp->intval.value); break; From b3c0f651b3cf4dfaf2e8210d7bb9b79471f6403b Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 17 Sep 2025 18:15:18 +0800 Subject: [PATCH 2062/3206] power: supply: qcom_battmgr: update compats for SM8550 and X1E80100 The SM8550 and X1E80100 platforms now include charge control functionality in battery management firmware, allowing charging to stop when the battery reaches a set level and resume when it drops below another level. To support this in the qcom_battmgr driver, CHARGE_CONTROL_START/END_THRESHOLD power supply properties can be added to manage these levels. This results in the battery power supply properties for SM8550 and X1E80100 differing from those for SM8350 and SC8280XP. Therefore, separate compatible entries for SM8550 and X1E80100 are introduced, each with their own variant definitions as match data. 
Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- drivers/power/supply/qcom_battmgr.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index 784e3b0110bdd9..99d1de374b345f 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -19,8 +19,10 @@ #define BATTMGR_STRING_LEN 128 enum qcom_battmgr_variant { - QCOM_BATTMGR_SM8350, QCOM_BATTMGR_SC8280XP, + QCOM_BATTMGR_SM8350, + QCOM_BATTMGR_SM8550, + QCOM_BATTMGR_X1E80100, }; #define BATTMGR_BAT_STATUS 0x1 @@ -494,7 +496,8 @@ static int qcom_battmgr_bat_get_property(struct power_supply *psy, if (!battmgr->service_up) return -EAGAIN; - if (battmgr->variant == QCOM_BATTMGR_SC8280XP) + if (battmgr->variant == QCOM_BATTMGR_SC8280XP || + battmgr->variant == QCOM_BATTMGR_X1E80100) ret = qcom_battmgr_bat_sc8280xp_update(battmgr, psp); else ret = qcom_battmgr_bat_sm8350_update(battmgr, psp); @@ -767,7 +770,8 @@ static int qcom_battmgr_usb_get_property(struct power_supply *psy, if (!battmgr->service_up) return -EAGAIN; - if (battmgr->variant == QCOM_BATTMGR_SC8280XP) + if (battmgr->variant == QCOM_BATTMGR_SC8280XP || + battmgr->variant == QCOM_BATTMGR_X1E80100) ret = qcom_battmgr_bat_sc8280xp_update(battmgr, psp); else ret = qcom_battmgr_usb_sm8350_update(battmgr, psp); @@ -889,7 +893,8 @@ static int qcom_battmgr_wls_get_property(struct power_supply *psy, if (!battmgr->service_up) return -EAGAIN; - if (battmgr->variant == QCOM_BATTMGR_SC8280XP) + if (battmgr->variant == QCOM_BATTMGR_SC8280XP || + battmgr->variant == QCOM_BATTMGR_X1E80100) ret = qcom_battmgr_bat_sc8280xp_update(battmgr, psp); else ret = qcom_battmgr_wls_sm8350_update(battmgr, psp); @@ -1323,7 +1328,8 @@ static void qcom_battmgr_callback(const void *data, size_t len, void *priv) if (opcode == BATTMGR_NOTIFICATION) qcom_battmgr_notification(battmgr, data, len); - else if (battmgr->variant == QCOM_BATTMGR_SC8280XP) + else if (battmgr->variant == QCOM_BATTMGR_SC8280XP || + battmgr->variant == QCOM_BATTMGR_X1E80100) qcom_battmgr_sc8280xp_callback(battmgr, data, len); else qcom_battmgr_sm8350_callback(battmgr, data, len); @@ -1359,7 +1365,8 @@ static void qcom_battmgr_pdr_notify(void *priv, int state) static const struct of_device_id qcom_battmgr_of_variants[] = { { .compatible = "qcom,sc8180x-pmic-glink", .data = (void *)QCOM_BATTMGR_SC8280XP }, { .compatible = "qcom,sc8280xp-pmic-glink", .data = (void *)QCOM_BATTMGR_SC8280XP }, - { .compatible = "qcom,x1e80100-pmic-glink", .data = (void *)QCOM_BATTMGR_SC8280XP }, + { .compatible = "qcom,sm8550-pmic-glink", .data = (void *)QCOM_BATTMGR_SM8550 }, + { .compatible = "qcom,x1e80100-pmic-glink", .data = (void *)QCOM_BATTMGR_X1E80100 }, /* Unmatched devices falls back to QCOM_BATTMGR_SM8350 */ {} }; @@ -1399,7 +1406,8 @@ static int qcom_battmgr_probe(struct auxiliary_device *adev, else battmgr->variant = QCOM_BATTMGR_SM8350; - if (battmgr->variant == QCOM_BATTMGR_SC8280XP) { + if (battmgr->variant == QCOM_BATTMGR_SC8280XP || + battmgr->variant == QCOM_BATTMGR_X1E80100) { battmgr->bat_psy = devm_power_supply_register(dev, &sc8280xp_bat_psy_desc, &psy_cfg); if (IS_ERR(battmgr->bat_psy)) return dev_err_probe(dev, PTR_ERR(battmgr->bat_psy), From 7f8624af8e8c2c1a0169b46a23729a4cc614635c Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 17 Sep 2025 18:15:19 +0800 Subject: [PATCH 2063/3206] dt-bindings: soc: qcom,pmic-glink: Add charge limit nvmem properties Add nvmem properties to retrieve 
charge control configurations from the PMIC SDAM registers.

Acked-by: Rob Herring (Arm)
Signed-off-by: Fenglin Wu
Signed-off-by: Sebastian Reichel
---
 .../bindings/soc/qcom/qcom,pmic-glink.yaml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,pmic-glink.yaml b/Documentation/devicetree/bindings/soc/qcom/qcom,pmic-glink.yaml
index 48114bb0c9276c..7085bf88afabaa 100644
--- a/Documentation/devicetree/bindings/soc/qcom/qcom,pmic-glink.yaml
+++ b/Documentation/devicetree/bindings/soc/qcom/qcom,pmic-glink.yaml
@@ -56,6 +56,20 @@ properties:
       The array should contain a gpio entry for each PMIC Glink connector, in reg order.
       It is defined that GPIO active level means "CC2" or Reversed/Flipped orientation.
 
+  nvmem-cells:
+    minItems: 3
+    maxItems: 3
+    description:
+      The nvmem cells contain the charge control settings, including the charge control
+      enable status, the battery state of charge (SoC) threshold for stopping charging,
+      and the battery SoC delta required to restart charging.
+
+  nvmem-cell-names:
+    items:
+      - const: charge_limit_en
+      - const: charge_limit_end
+      - const: charge_limit_delta
+
 patternProperties:
   '^connector@\d$':
     $ref: /schemas/connector/usb-connector.yaml#

From cc3e883a06251ba835f15672dbe8724f2687971b Mon Sep 17 00:00:00 2001
From: Fenglin Wu
Date: Wed, 17 Sep 2025 18:15:20 +0800
Subject: [PATCH 2064/3206] power: supply: qcom_battmgr: Add charge control
 support

Add charge control support for SM8550 and X1E80100. It is exposed
through the two power supply properties below:

charge_control_end_threshold: The battery SoC (State of Charge)
threshold at which the charging should be terminated.

charge_control_start_threshold: The battery SoC threshold at which the
charging should be resumed.
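From userspace the charge window is then set through the standard sysfs
attributes; a minimal sketch (the supply name qcom-battmgr-bat is the
one this driver registers, and writing requires root):

#include <stdio.h>

static int write_prop(const char *path, int val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

int main(void)
{
	const char *base = "/sys/class/power_supply/qcom-battmgr-bat";
	char path[128];

	/* Write the end threshold first; per the diff below, the driver
	 * keeps the start threshold consistent by adjusting an internal
	 * delta, and clamps start to 50..95 and end to 55..100. */
	snprintf(path, sizeof(path), "%s/charge_control_end_threshold", base);
	if (write_prop(path, 80) < 0)
		perror(path);

	snprintf(path, sizeof(path), "%s/charge_control_start_threshold", base);
	if (write_prop(path, 60) < 0)
		perror(path);

	return 0;
}

With these values, charging stops once the battery reaches 80% and
resumes when it drops below 60%.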
Tested-by: Neil Armstrong # on Thinkpad T14S OLED Reviewed-by: Neil Armstrong Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- drivers/power/supply/qcom_battmgr.c | 275 +++++++++++++++++++++++++++- 1 file changed, 273 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index 99d1de374b345f..0fe14a109b70fc 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,9 @@ enum qcom_battmgr_variant { #define BATT_RESISTANCE 21 #define BATT_POWER_NOW 22 #define BATT_POWER_AVG 23 +#define BATT_CHG_CTRL_EN 24 +#define BATT_CHG_CTRL_START_THR 25 +#define BATT_CHG_CTRL_END_THR 26 #define BATTMGR_USB_PROPERTY_GET 0x32 #define BATTMGR_USB_PROPERTY_SET 0x33 @@ -92,6 +96,13 @@ enum qcom_battmgr_variant { #define WLS_TYPE 5 #define WLS_BOOST_EN 6 +#define BATTMGR_CHG_CTRL_LIMIT_EN 0x48 +#define CHARGE_CTRL_START_THR_MIN 50 +#define CHARGE_CTRL_START_THR_MAX 95 +#define CHARGE_CTRL_END_THR_MIN 55 +#define CHARGE_CTRL_END_THR_MAX 100 +#define CHARGE_CTRL_DELTA_SOC 5 + struct qcom_battmgr_enable_request { struct pmic_glink_hdr hdr; __le32 battery_id; @@ -126,6 +137,13 @@ struct qcom_battmgr_discharge_time_request { __le32 reserved; }; +struct qcom_battmgr_charge_ctrl_request { + struct pmic_glink_hdr hdr; + __le32 enable; + __le32 target_soc; + __le32 delta_soc; +}; + struct qcom_battmgr_message { struct pmic_glink_hdr hdr; union { @@ -238,6 +256,8 @@ struct qcom_battmgr_info { unsigned int capacity_warning; unsigned int cycle_count; unsigned int charge_count; + unsigned int charge_ctrl_start; + unsigned int charge_ctrl_end; char model_number[BATTMGR_STRING_LEN]; char serial_number[BATTMGR_STRING_LEN]; char oem_info[BATTMGR_STRING_LEN]; @@ -426,6 +446,8 @@ static const u8 sm8350_bat_prop_map[] = { [POWER_SUPPLY_PROP_INTERNAL_RESISTANCE] = BATT_RESISTANCE, [POWER_SUPPLY_PROP_STATE_OF_HEALTH] = BATT_SOH, [POWER_SUPPLY_PROP_POWER_NOW] = BATT_POWER_NOW, + [POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD] = BATT_CHG_CTRL_START_THR, + [POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD] = BATT_CHG_CTRL_END_THR, }; static int qcom_battmgr_bat_sm8350_update(struct qcom_battmgr *battmgr, @@ -604,6 +626,12 @@ static int qcom_battmgr_bat_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_TIME_TO_FULL_AVG: val->intval = battmgr->status.charge_time; break; + case POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD: + val->intval = battmgr->info.charge_ctrl_start; + break; + case POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD: + val->intval = battmgr->info.charge_ctrl_end; + break; case POWER_SUPPLY_PROP_MANUFACTURE_YEAR: val->intval = battmgr->info.year; break; @@ -629,6 +657,149 @@ static int qcom_battmgr_bat_get_property(struct power_supply *psy, return 0; } +static int qcom_battmgr_set_charge_control(struct qcom_battmgr *battmgr, + u32 target_soc, u32 delta_soc) +{ + struct qcom_battmgr_charge_ctrl_request request = { + .hdr.owner = cpu_to_le32(PMIC_GLINK_OWNER_BATTMGR), + .hdr.type = cpu_to_le32(PMIC_GLINK_REQ_RESP), + .hdr.opcode = cpu_to_le32(BATTMGR_CHG_CTRL_LIMIT_EN), + .enable = cpu_to_le32(1), + .target_soc = cpu_to_le32(target_soc), + .delta_soc = cpu_to_le32(delta_soc), + }; + + return qcom_battmgr_request(battmgr, &request, sizeof(request)); +} + +static int qcom_battmgr_set_charge_start_threshold(struct qcom_battmgr *battmgr, int start_soc) +{ + u32 target_soc, delta_soc; + int ret; + + if (start_soc 
< CHARGE_CTRL_START_THR_MIN || + start_soc > CHARGE_CTRL_START_THR_MAX) { + dev_err(battmgr->dev, "charge control start threshold exceed range: [%u - %u]\n", + CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX); + return -EINVAL; + } + + /* + * If the new start threshold is larger than the old end threshold, + * move the end threshold one step (DELTA_SOC) after the new start + * threshold. + */ + if (start_soc > battmgr->info.charge_ctrl_end) { + target_soc = start_soc + CHARGE_CTRL_DELTA_SOC; + target_soc = min_t(u32, target_soc, CHARGE_CTRL_END_THR_MAX); + delta_soc = target_soc - start_soc; + delta_soc = min_t(u32, delta_soc, CHARGE_CTRL_DELTA_SOC); + } else { + target_soc = battmgr->info.charge_ctrl_end; + delta_soc = battmgr->info.charge_ctrl_end - start_soc; + } + + mutex_lock(&battmgr->lock); + ret = qcom_battmgr_set_charge_control(battmgr, target_soc, delta_soc); + mutex_unlock(&battmgr->lock); + if (!ret) { + battmgr->info.charge_ctrl_start = start_soc; + battmgr->info.charge_ctrl_end = target_soc; + } + + return 0; +} + +static int qcom_battmgr_set_charge_end_threshold(struct qcom_battmgr *battmgr, int end_soc) +{ + u32 delta_soc = CHARGE_CTRL_DELTA_SOC; + int ret; + + if (end_soc < CHARGE_CTRL_END_THR_MIN || + end_soc > CHARGE_CTRL_END_THR_MAX) { + dev_err(battmgr->dev, "charge control end threshold exceed range: [%u - %u]\n", + CHARGE_CTRL_END_THR_MIN, CHARGE_CTRL_END_THR_MAX); + return -EINVAL; + } + + if (battmgr->info.charge_ctrl_start && end_soc > battmgr->info.charge_ctrl_start) + delta_soc = end_soc - battmgr->info.charge_ctrl_start; + + mutex_lock(&battmgr->lock); + ret = qcom_battmgr_set_charge_control(battmgr, end_soc, delta_soc); + mutex_unlock(&battmgr->lock); + if (!ret) { + battmgr->info.charge_ctrl_start = end_soc - delta_soc; + battmgr->info.charge_ctrl_end = end_soc; + } + + return 0; +} + +static int qcom_battmgr_charge_control_thresholds_init(struct qcom_battmgr *battmgr) +{ + int ret; + u8 en, end_soc, start_soc, delta_soc; + + ret = nvmem_cell_read_u8(battmgr->dev->parent, "charge_limit_en", &en); + if (!ret && en != 0) { + ret = nvmem_cell_read_u8(battmgr->dev->parent, "charge_limit_end", &end_soc); + if (ret < 0) + return ret; + + ret = nvmem_cell_read_u8(battmgr->dev->parent, "charge_limit_delta", &delta_soc); + if (ret < 0) + return ret; + + if (delta_soc >= end_soc) + return -EINVAL; + + start_soc = end_soc - delta_soc; + end_soc = clamp(end_soc, CHARGE_CTRL_END_THR_MIN, CHARGE_CTRL_END_THR_MAX); + start_soc = clamp(start_soc, CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX); + + battmgr->info.charge_ctrl_start = start_soc; + battmgr->info.charge_ctrl_end = end_soc; + } + + return 0; +} + +static int qcom_battmgr_bat_is_writeable(struct power_supply *psy, + enum power_supply_property psp) +{ + switch (psp) { + case POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD: + case POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD: + return 1; + default: + return 0; + } + + return 0; +} + +static int qcom_battmgr_bat_set_property(struct power_supply *psy, + enum power_supply_property psp, + const union power_supply_propval *pval) +{ + struct qcom_battmgr *battmgr = power_supply_get_drvdata(psy); + + if (!battmgr->service_up) + return -EAGAIN; + + switch (psp) { + case POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD: + return qcom_battmgr_set_charge_start_threshold(battmgr, pval->intval); + case POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD: + return qcom_battmgr_set_charge_end_threshold(battmgr, pval->intval); + default: + return -EINVAL; + } + + return 0; 
+} + static const enum power_supply_property sc8280xp_bat_props[] = { POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_PRESENT, @@ -663,6 +834,43 @@ static const struct power_supply_desc sc8280xp_bat_psy_desc = { .get_property = qcom_battmgr_bat_get_property, }; +static const enum power_supply_property x1e80100_bat_props[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_PRESENT, + POWER_SUPPLY_PROP_TECHNOLOGY, + POWER_SUPPLY_PROP_CYCLE_COUNT, + POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_POWER_NOW, + POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN, + POWER_SUPPLY_PROP_CHARGE_FULL, + POWER_SUPPLY_PROP_CHARGE_EMPTY, + POWER_SUPPLY_PROP_CHARGE_NOW, + POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN, + POWER_SUPPLY_PROP_ENERGY_FULL, + POWER_SUPPLY_PROP_ENERGY_EMPTY, + POWER_SUPPLY_PROP_ENERGY_NOW, + POWER_SUPPLY_PROP_TEMP, + POWER_SUPPLY_PROP_MANUFACTURE_YEAR, + POWER_SUPPLY_PROP_MANUFACTURE_MONTH, + POWER_SUPPLY_PROP_MANUFACTURE_DAY, + POWER_SUPPLY_PROP_MODEL_NAME, + POWER_SUPPLY_PROP_MANUFACTURER, + POWER_SUPPLY_PROP_SERIAL_NUMBER, + POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD, + POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD, +}; + +static const struct power_supply_desc x1e80100_bat_psy_desc = { + .name = "qcom-battmgr-bat", + .type = POWER_SUPPLY_TYPE_BATTERY, + .properties = x1e80100_bat_props, + .num_properties = ARRAY_SIZE(x1e80100_bat_props), + .get_property = qcom_battmgr_bat_get_property, + .set_property = qcom_battmgr_bat_set_property, + .property_is_writeable = qcom_battmgr_bat_is_writeable, +}; + static const enum power_supply_property sm8350_bat_props[] = { POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_HEALTH, @@ -695,6 +903,42 @@ static const struct power_supply_desc sm8350_bat_psy_desc = { .get_property = qcom_battmgr_bat_get_property, }; +static const enum power_supply_property sm8550_bat_props[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_HEALTH, + POWER_SUPPLY_PROP_PRESENT, + POWER_SUPPLY_PROP_CHARGE_TYPE, + POWER_SUPPLY_PROP_CAPACITY, + POWER_SUPPLY_PROP_VOLTAGE_OCV, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_VOLTAGE_MAX, + POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_TEMP, + POWER_SUPPLY_PROP_TECHNOLOGY, + POWER_SUPPLY_PROP_CHARGE_COUNTER, + POWER_SUPPLY_PROP_CYCLE_COUNT, + POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN, + POWER_SUPPLY_PROP_CHARGE_FULL, + POWER_SUPPLY_PROP_MODEL_NAME, + POWER_SUPPLY_PROP_TIME_TO_FULL_AVG, + POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG, + POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, + POWER_SUPPLY_PROP_STATE_OF_HEALTH, + POWER_SUPPLY_PROP_POWER_NOW, + POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD, + POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD, +}; + +static const struct power_supply_desc sm8550_bat_psy_desc = { + .name = "qcom-battmgr-bat", + .type = POWER_SUPPLY_TYPE_BATTERY, + .properties = sm8550_bat_props, + .num_properties = ARRAY_SIZE(sm8550_bat_props), + .get_property = qcom_battmgr_bat_get_property, + .set_property = qcom_battmgr_bat_set_property, + .property_is_writeable = qcom_battmgr_bat_is_writeable, +}; + static int qcom_battmgr_ac_get_property(struct power_supply *psy, enum power_supply_property psp, union power_supply_propval *val) @@ -1114,6 +1358,9 @@ static void qcom_battmgr_sc8280xp_callback(struct qcom_battmgr *battmgr, case BATTMGR_BAT_CHARGE_TIME: battmgr->status.charge_time = le32_to_cpu(resp->time); break; + case BATTMGR_CHG_CTRL_LIMIT_EN: + battmgr->error = 0; + break; default: dev_warn(battmgr->dev, "unknown message %#x\n", opcode); break; @@ -1227,6 +1474,12 @@ static void 
qcom_battmgr_sm8350_callback(struct qcom_battmgr *battmgr, case BATT_POWER_NOW: battmgr->status.power_now = le32_to_cpu(resp->intval.value); break; + case BATT_CHG_CTRL_START_THR: + battmgr->info.charge_ctrl_start = le32_to_cpu(resp->intval.value); + break; + case BATT_CHG_CTRL_END_THR: + battmgr->info.charge_ctrl_end = le32_to_cpu(resp->intval.value); + break; default: dev_warn(battmgr->dev, "unknown property %#x\n", property); break; @@ -1309,6 +1562,7 @@ static void qcom_battmgr_sm8350_callback(struct qcom_battmgr *battmgr, } break; case BATTMGR_REQUEST_NOTIFICATION: + case BATTMGR_CHG_CTRL_LIMIT_EN: battmgr->error = 0; break; default: @@ -1376,11 +1630,13 @@ static char *qcom_battmgr_battery[] = { "battery" }; static int qcom_battmgr_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { + const struct power_supply_desc *psy_desc; struct power_supply_config psy_cfg_supply = {}; struct power_supply_config psy_cfg = {}; const struct of_device_id *match; struct qcom_battmgr *battmgr; struct device *dev = &adev->dev; + int ret; battmgr = devm_kzalloc(dev, sizeof(*battmgr), GFP_KERNEL); if (!battmgr) @@ -1406,9 +1662,19 @@ static int qcom_battmgr_probe(struct auxiliary_device *adev, else battmgr->variant = QCOM_BATTMGR_SM8350; + ret = qcom_battmgr_charge_control_thresholds_init(battmgr); + if (ret < 0) + return dev_err_probe(dev, ret, + "failed to init battery charge control thresholds\n"); + if (battmgr->variant == QCOM_BATTMGR_SC8280XP || battmgr->variant == QCOM_BATTMGR_X1E80100) { - battmgr->bat_psy = devm_power_supply_register(dev, &sc8280xp_bat_psy_desc, &psy_cfg); + if (battmgr->variant == QCOM_BATTMGR_X1E80100) + psy_desc = &x1e80100_bat_psy_desc; + else + psy_desc = &sc8280xp_bat_psy_desc; + + battmgr->bat_psy = devm_power_supply_register(dev, psy_desc, &psy_cfg); if (IS_ERR(battmgr->bat_psy)) return dev_err_probe(dev, PTR_ERR(battmgr->bat_psy), "failed to register battery power supply\n"); @@ -1428,7 +1694,12 @@ static int qcom_battmgr_probe(struct auxiliary_device *adev, return dev_err_probe(dev, PTR_ERR(battmgr->wls_psy), "failed to register wireless charing power supply\n"); } else { - battmgr->bat_psy = devm_power_supply_register(dev, &sm8350_bat_psy_desc, &psy_cfg); + if (battmgr->variant == QCOM_BATTMGR_SM8550) + psy_desc = &sm8550_bat_psy_desc; + else + psy_desc = &sm8350_bat_psy_desc; + + battmgr->bat_psy = devm_power_supply_register(dev, psy_desc, &psy_cfg); if (IS_ERR(battmgr->bat_psy)) return dev_err_probe(dev, PTR_ERR(battmgr->bat_psy), "failed to register battery power supply\n"); From 45c8a6cc2bcd780e634a6ba8e46bffbdf1fc5c01 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 15 Sep 2025 17:56:46 +0000 Subject: [PATCH 2065/3206] tcp: Clear tcp_sk(sk)->fastopen_rsk in tcp_disconnect(). syzbot reported the splat below where a socket had tcp_sk(sk)->fastopen_rsk in the TCP_ESTABLISHED state. [0] syzbot reused the server-side TCP Fast Open socket as a new client before the TFO socket completes 3WHS: 1. accept() 2. connect(AF_UNSPEC) 3. connect() to another destination As of accept(), sk->sk_state is TCP_SYN_RECV, and tcp_disconnect() changes it to TCP_CLOSE and makes connect() possible, which restarts timers. Since tcp_disconnect() forgot to clear tcp_sk(sk)->fastopen_rsk, the retransmit timer triggered the warning and the intended packet was not retransmitted. Let's call reqsk_fastopen_remove() in tcp_disconnect(). 
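The reuse pattern itself is plain socket API. A runnable userspace
sketch of the three steps (error handling omitted for brevity; this
demonstrates only the API sequence, not the fastopen state and timing
the race needs, which the packetdrill test in the next patch covers;
the sketch also reconnects to the same listener rather than "another
destination" to stay self-contained):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),	/* port 0: ephemeral */
	};
	socklen_t alen = sizeof(addr);
	struct sockaddr unspec = { .sa_family = AF_UNSPEC };
	int ls, cs, as;

	ls = socket(AF_INET, SOCK_STREAM, 0);
	bind(ls, (struct sockaddr *)&addr, sizeof(addr));
	listen(ls, 2);
	getsockname(ls, (struct sockaddr *)&addr, &alen);

	cs = socket(AF_INET, SOCK_STREAM, 0);
	connect(cs, (struct sockaddr *)&addr, sizeof(addr));

	as = accept(ls, NULL, NULL);			/* 1. accept() */

	connect(as, &unspec, sizeof(unspec));		/* 2. connect(AF_UNSPEC):
							 *    tcp_disconnect() moves
							 *    the socket to TCP_CLOSE */

	connect(as, (struct sockaddr *)&addr,
		sizeof(addr));				/* 3. reuse as a client */

	close(as);
	close(cs);
	close(ls);
	return 0;
}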
[0]: WARNING: CPU: 2 PID: 0 at net/ipv4/tcp_timer.c:542 tcp_retransmit_timer (net/ipv4/tcp_timer.c:542 (discriminator 7)) Modules linked in: CPU: 2 UID: 0 PID: 0 Comm: swapper/2 Not tainted 6.17.0-rc5-g201825fb4278 #62 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:tcp_retransmit_timer (net/ipv4/tcp_timer.c:542 (discriminator 7)) Code: 41 55 41 54 55 53 48 8b af b8 08 00 00 48 89 fb 48 85 ed 0f 84 55 01 00 00 0f b6 47 12 3c 03 74 0c 0f b6 47 12 3c 04 74 04 90 <0f> 0b 90 48 8b 85 c0 00 00 00 48 89 ef 48 8b 40 30 e8 6a 4f 06 3e RSP: 0018:ffffc900002f8d40 EFLAGS: 00010293 RAX: 0000000000000002 RBX: ffff888106911400 RCX: 0000000000000017 RDX: 0000000002517619 RSI: ffffffff83764080 RDI: ffff888106911400 RBP: ffff888106d5c000 R08: 0000000000000001 R09: ffffc900002f8de8 R10: 00000000000000c2 R11: ffffc900002f8ff8 R12: ffff888106911540 R13: ffff888106911480 R14: ffff888106911840 R15: ffffc900002f8de0 FS: 0000000000000000(0000) GS:ffff88907b768000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8044d69d90 CR3: 0000000002c30003 CR4: 0000000000370ef0 Call Trace: tcp_write_timer (net/ipv4/tcp_timer.c:738) call_timer_fn (kernel/time/timer.c:1747) __run_timers (kernel/time/timer.c:1799 kernel/time/timer.c:2372) timer_expire_remote (kernel/time/timer.c:2385 kernel/time/timer.c:2376 kernel/time/timer.c:2135) tmigr_handle_remote_up (kernel/time/timer_migration.c:944 kernel/time/timer_migration.c:1035) __walk_groups.isra.0 (kernel/time/timer_migration.c:533 (discriminator 1)) tmigr_handle_remote (kernel/time/timer_migration.c:1096) handle_softirqs (./arch/x86/include/asm/jump_label.h:36 ./include/trace/events/irq.h:142 kernel/softirq.c:580) irq_exit_rcu (kernel/softirq.c:614 kernel/softirq.c:453 kernel/softirq.c:680 kernel/softirq.c:696) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1050 (discriminator 35) arch/x86/kernel/apic/apic.c:1050 (discriminator 35)) Fixes: 8336886f786f ("tcp: TCP Fast Open Server - support TFO listeners") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250915175800.118793-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71a956fbfc5533..ad76556800f2b2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3327,6 +3327,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; + struct request_sock *req; u32 seq; if (old_state != TCP_CLOSE) @@ -3442,6 +3443,10 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) + reqsk_fastopen_remove(sk, req, false); tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; From 1fd0362262baff9381615a5b346298abbc165ba3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 15 Sep 2025 17:56:47 +0000 Subject: [PATCH 2066/3206] selftest: packetdrill: Add tcp_fastopen_server_reset-after-disconnect.pkt. The test reproduces the scenario explained in the previous patch. Without the patch, the test triggers the warning and cannot see the last retransmitted packet. 
# ./ksft_runner.sh tcp_fastopen_server_reset-after-disconnect.pkt TAP version 13 1..2 [ 29.229250] ------------[ cut here ]------------ [ 29.231414] WARNING: CPU: 26 PID: 0 at net/ipv4/tcp_timer.c:542 tcp_retransmit_timer+0x32/0x9f0 ... tcp_fastopen_server_reset-after-disconnect.pkt:26: error handling packet: Timed out waiting for packet not ok 1 ipv4 tcp_fastopen_server_reset-after-disconnect.pkt:26: error handling packet: Timed out waiting for packet not ok 2 ipv6 # Totals: pass:0 fail:2 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250915175800.118793-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- ...fastopen_server_reset-after-disconnect.pkt | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt diff --git a/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt new file mode 100644 index 00000000000000..26794e7ddfd5fb --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh + ./set_sysctls.py /proc/sys/net/ipv4/tcp_fastopen=0x602 /proc/sys/net/ipv4/tcp_timestamps=0` + + 0 socket(..., SOCK_STREAM|SOCK_NONBLOCK, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:10(10) win 32792 + +0 > S. 0:0(0) ack 11 win 65535 + +// sk->sk_state is TCP_SYN_RECV + +.1 accept(3, ..., ...) = 4 + +// tcp_disconnect() sets sk->sk_state to TCP_CLOSE + +0 connect(4, AF_UNSPEC, ...) = 0 + +0 > R. 1:1(0) ack 11 win 65535 + +// connect() sets sk->sk_state to TCP_SYN_SENT + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 connect(4, ..., ...) = -1 EINPROGRESS (Operation is now in progress) + +0 > S 0:0(0) win 65535 + +// tp->fastopen_rsk must be NULL + +1 > S 0:0(0) win 65535 From 26caeae9fb482ec443753b4e3307e5122b60b850 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Fri, 5 Sep 2025 16:56:33 -0700 Subject: [PATCH 2067/3206] drm/xe/guc: Set RCS/CCS yield policy All recent platforms (including all the ones officially supported by the Xe driver) do not allow concurrent execution of RCS and CCS workloads from different address spaces, with the HW blocking the context switch when it detects such a scenario. The DUAL_QUEUE flag helps with this, by causing the GuC to not submit a context it knows will not be able to execute. This, however, causes a new problem: if RCS and CCS queues have pending workloads from different address spaces, the GuC needs to choose from which of the 2 queues to pick the next workload to execute. By default, the GuC prioritizes RCS submissions over CCS ones, which can lead to CCS workloads being significantly (or completely) starved of execution time. The driver can tune this by setting a dedicated scheduling policy KLV; this KLV allows the driver to specify a quantum (in ms) and a ratio (percentage value between 0 and 100), and the GuC will prioritize the CCS for that percentage of each quantum. Given that we want to guarantee enough RCS throughput to avoid missing frames, we set the yield policy to 20% of each 80ms interval. 
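For reference, a sketch of what the policy amounts to, using the values from the message above (the actual emission helpers appear in the diff below):

/* Sketch only: the H2G payload the yield policy boils down to.
 * 20% of each 80ms quantum means the GuC prioritizes the CCS for
 * 16ms out of every 80ms when both classes are fully busy.
 */
u32 yield_policy[] = {
	XE_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV,
	PREP_GUC_KLV_TAG(SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD),
	80,	/* quantum, in ms */
	20,	/* CCS priority ratio, in percent */
};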
v2: updated quantum and ratio, improved comment, use xe_guc_submit_disable in gt_sanitize Fixes: d9a1ae0d17bd ("drm/xe/guc: Enable WA_DUAL_QUEUE for newer platforms") Signed-off-by: Daniele Ceraolo Spurio Cc: Matthew Brost Cc: John Harrison Cc: Vinay Belgaumkar Reviewed-by: John Harrison Tested-by: Vinay Belgaumkar Link: https://lore.kernel.org/r/20250905235632.3333247-2-daniele.ceraolospurio@intel.com (cherry picked from commit 88434448438e4302e272b2a2b810b42e05ea024b) Signed-off-by: Rodrigo Vivi [Rodrigo added #include "xe_guc_submit.h" while backporting] --- drivers/gpu/drm/xe/abi/guc_actions_abi.h | 1 + drivers/gpu/drm/xe/abi/guc_klvs_abi.h | 25 +++++++++ drivers/gpu/drm/xe/xe_gt.c | 3 +- drivers/gpu/drm/xe/xe_guc.c | 6 +-- drivers/gpu/drm/xe/xe_guc_submit.c | 66 ++++++++++++++++++++++++ drivers/gpu/drm/xe/xe_guc_submit.h | 2 + 6 files changed, 98 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h index 81eb046aeebfef..b9f67d7a00d879 100644 --- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h @@ -117,6 +117,7 @@ enum xe_guc_action { XE_GUC_ACTION_ENTER_S_STATE = 0x501, XE_GUC_ACTION_EXIT_S_STATE = 0x502, XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE = 0x506, + XE_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV = 0x509, XE_GUC_ACTION_SCHED_CONTEXT = 0x1000, XE_GUC_ACTION_SCHED_CONTEXT_MODE_SET = 0x1001, XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE = 0x1002, diff --git a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h index 0366a9da597751..d7719d0e36ca78 100644 --- a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h +++ b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h @@ -17,6 +17,7 @@ * | 0 | 31:16 | **KEY** - KLV key identifier | * | | | - `GuC Self Config KLVs`_ | * | | | - `GuC Opt In Feature KLVs`_ | + * | | | - `GuC Scheduling Policies KLVs`_ | * | | | - `GuC VGT Policy KLVs`_ | * | | | - `GuC VF Configuration KLVs`_ | * | | | | @@ -152,6 +153,30 @@ enum { #define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_KEY 0x4003 #define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_LEN 0u +/** + * DOC: GuC Scheduling Policies KLVs + * + * `GuC KLV`_ keys available for use with UPDATE_SCHEDULING_POLICIES_KLV. + * + * _`GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD` : 0x1001 + * Some platforms do not allow concurrent execution of RCS and CCS + * workloads from different address spaces. By default, the GuC prioritizes + * RCS submissions over CCS ones, which can lead to CCS workloads being + * significantly (or completely) starved of execution time. This KLV allows + * the driver to specify a quantum (in ms) and a ratio (percentage value + * between 0 and 100), and the GuC will prioritize the CCS for that + * percentage of each quantum. For example, specifying 100ms and 30% will + * make the GuC prioritize the CCS for 30ms of every 100ms. 
+ * Note that this does not necessarily mean that RCS and CCS engines will
+ * only be active for their percentage of the quantum, as the restriction
+ * only kicks in if both classes are fully busy with non-compatible address
+ * spaces; i.e., if one engine is idle or running the same address space,
+ * a pending job on the other engine will still be submitted to the HW no
+ * matter what the ratio is.
+ */
+#define GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD_KEY	0x1001
+#define GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD_LEN	2u
+
 /**
  * DOC: GuC VGT Policy KLVs
  *
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index c8eda36546d343..17634195cdc26a 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -41,6 +41,7 @@
 #include "xe_gt_topology.h"
 #include "xe_guc_exec_queue_types.h"
 #include "xe_guc_pc.h"
+#include "xe_guc_submit.h"
 #include "xe_hw_fence.h"
 #include "xe_hw_engine_class_sysfs.h"
 #include "xe_irq.h"
@@ -97,7 +98,7 @@ void xe_gt_sanitize(struct xe_gt *gt)
	 * FIXME: if xe_uc_sanitize is called here, on TGL driver will not
	 * reload
	 */
-	gt->uc.guc.submission_state.enabled = false;
+	xe_guc_submit_disable(&gt->uc.guc);
 }
 
 static void xe_gt_enable_host_l2_vram(struct xe_gt *gt)
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index b1d1d6da37581e..270fc379249366 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -880,9 +880,7 @@ int xe_guc_post_load_init(struct xe_guc *guc)
		return ret;
	}
 
-	guc->submission_state.enabled = true;
-
-	return 0;
+	return xe_guc_submit_enable(guc);
 }
 
 int xe_guc_reset(struct xe_guc *guc)
@@ -1579,7 +1577,7 @@ void xe_guc_sanitize(struct xe_guc *guc)
 {
	xe_uc_fw_sanitize(&guc->fw);
	xe_guc_ct_disable(&guc->ct);
-	guc->submission_state.enabled = false;
+	xe_guc_submit_disable(guc);
 }
 
 int xe_guc_reset_prepare(struct xe_guc *guc)
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 93fc7b290b655f..0104afbc941c84 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -32,6 +32,7 @@
 #include "xe_guc_ct.h"
 #include "xe_guc_exec_queue_types.h"
 #include "xe_guc_id_mgr.h"
+#include "xe_guc_klv_helpers.h"
 #include "xe_guc_submit_types.h"
 #include "xe_hw_engine.h"
 #include "xe_hw_fence.h"
@@ -316,6 +317,71 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids)
	return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc);
 }
 
+/*
+ * Given that we want to guarantee enough RCS throughput to avoid missing
+ * frames, we set the yield policy to 20% of each 80ms interval.
+ */
+#define RC_YIELD_DURATION	80	/* in ms */
+#define RC_YIELD_RATIO		20	/* in percent */
+static u32 *emit_render_compute_yield_klv(u32 *emit)
+{
+	*emit++ = PREP_GUC_KLV_TAG(SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD);
+	*emit++ = RC_YIELD_DURATION;
+	*emit++ = RC_YIELD_RATIO;
+
+	return emit;
+}
+
+#define SCHEDULING_POLICY_MAX_DWORDS 16
+static int guc_init_global_schedule_policy(struct xe_guc *guc)
+{
+	u32 data[SCHEDULING_POLICY_MAX_DWORDS];
+	u32 *emit = data;
+	u32 count = 0;
+	int ret;
+
+	if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 1, 0))
+		return 0;
+
+	*emit++ = XE_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV;
+
+	if (CCS_MASK(guc_to_gt(guc)))
+		emit = emit_render_compute_yield_klv(emit);
+
+	count = emit - data;
+	if (count > 1) {
+		xe_assert(guc_to_xe(guc), count <= SCHEDULING_POLICY_MAX_DWORDS);
+
+		ret = xe_guc_ct_send_block(&guc->ct, data, count);
+		if (ret < 0) {
+			xe_gt_err(guc_to_gt(guc),
+				  "failed to enable GuC scheduling policies: %pe\n",
+				  ERR_PTR(ret));
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+int xe_guc_submit_enable(struct xe_guc *guc)
+{
+	int ret;
+
+	ret = guc_init_global_schedule_policy(guc);
+	if (ret)
+		return ret;
+
+	guc->submission_state.enabled = true;
+
+	return 0;
+}
+
+void xe_guc_submit_disable(struct xe_guc *guc)
+{
+	guc->submission_state.enabled = false;
+}
+
 static void __release_guc_id(struct xe_guc *guc, struct xe_exec_queue *q, u32 xa_count)
 {
	int i;
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
index 9b71a986c6ca69..0d126b807c1041 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -13,6 +13,8 @@ struct xe_exec_queue;
 struct xe_guc;
 
 int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids);
+int xe_guc_submit_enable(struct xe_guc *guc);
+void xe_guc_submit_disable(struct xe_guc *guc);
 
 int xe_guc_submit_reset_prepare(struct xe_guc *guc);
 void xe_guc_submit_reset_wait(struct xe_guc *guc);

From f57e53ea252363234f86674db475839e5b87102e Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Wed, 10 Sep 2025 11:49:05 +0200
Subject: [PATCH 2068/3206] smb: client: let recv_done verify data_offset, data_length and remaining_data_length

This is inspired by the related server fixes.
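The added checks follow the usual pattern of widening to u64 before adding untrusted 32-bit offsets and lengths; a generic sketch of the idea (not the patch itself):

/* Validate that an untrusted (offset, length) pair lies within a
 * received byte count without risking 32-bit overflow in
 * "offset + length".
 */
static bool payload_in_bounds(u32 byte_len, u32 offset, u32 length)
{
	return offset <= byte_len &&
	       (u64)offset + (u64)length <= (u64)byte_len;
}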
Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Reviewed-by: Namjae Jeon Fixes: f198186aa9bb ("CIFS: SMBD: Establish SMB Direct connection") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 02d6db431fd4ec..dafa3ed4a630f9 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -453,9 +453,12 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_recv_io *response = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); struct smbdirect_socket *sc = response->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbd_connection *info = container_of(sc, struct smbd_connection, socket); - int data_length = 0; + u32 data_offset = 0; + u32 data_length = 0; + u32 remaining_data_length = 0; log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", response, sc->recv_io.expected, wc->status, wc->opcode, @@ -487,7 +490,22 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) /* SMBD data transfer packet */ case SMBDIRECT_EXPECT_DATA_TRANSFER: data_transfer = smbdirect_recv_io_payload(response); + + if (wc->byte_len < + offsetof(struct smbdirect_data_transfer, padding)) + goto error; + + remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); + data_offset = le32_to_cpu(data_transfer->data_offset); data_length = le32_to_cpu(data_transfer->data_length); + if (wc->byte_len < data_offset || + (u64)wc->byte_len < (u64)data_offset + data_length) + goto error; + + if (remaining_data_length > sp->max_fragmented_recv_size || + data_length > sp->max_fragmented_recv_size || + (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size) + goto error; if (data_length) { if (sc->recv_io.reassembly.full_packet_received) From 93ed9a2951308db374cba4562533dde97bac70d3 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 17 Sep 2025 16:03:22 -0300 Subject: [PATCH 2069/3206] smb: client: fix filename matching of deferred files Fix the following case where the client would end up closing both deferred files (foo.tmp & foo) after unlink(foo) due to strstr() call in cifs_close_deferred_file_under_dentry(): fd1 = openat(AT_FDCWD, "foo", O_WRONLY|O_CREAT|O_TRUNC, 0666); fd2 = openat(AT_FDCWD, "foo.tmp", O_WRONLY|O_CREAT|O_TRUNC, 0666); close(fd1); close(fd2); unlink("foo"); Fixes: e3fc065682eb ("cifs: Deferred close performance improvements") Signed-off-by: Paulo Alcantara (Red Hat) Reviewed-by: Enzo Matsumiya Cc: Frank Sorenson Cc: David Howells Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsproto.h | 4 ++-- fs/smb/client/inode.c | 6 +++--- fs/smb/client/misc.c | 38 ++++++++++++++++---------------------- 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index c34c533b2efade..e8fba98690ce38 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -312,8 +312,8 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode); extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon); -extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, - const char *path); +void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, + struct dentry *dentry); extern void 
cifs_mark_open_handles_for_deleted_file(struct inode *inode, const char *path); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 11d442e8b3d622..1703f1285d36d2 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1984,7 +1984,7 @@ static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyren } netfs_wait_for_outstanding_io(inode); - cifs_close_deferred_file_under_dentry(tcon, full_path); + cifs_close_deferred_file_under_dentry(tcon, dentry); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { @@ -2538,10 +2538,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, goto cifs_rename_exit; } - cifs_close_deferred_file_under_dentry(tcon, from_name); + cifs_close_deferred_file_under_dentry(tcon, source_dentry); if (d_inode(target_dentry) != NULL) { netfs_wait_for_outstanding_io(d_inode(target_dentry)); - cifs_close_deferred_file_under_dentry(tcon, to_name); + cifs_close_deferred_file_under_dentry(tcon, target_dentry); } rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index da23cc12a52caa..dda6dece802ad2 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -832,33 +832,28 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) kfree(tmp_list); } } -void -cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) + +void cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, + struct dentry *dentry) { - struct cifsFileInfo *cfile; struct file_list *tmp_list, *tmp_next_list; - void *page; - const char *full_path; + struct cifsFileInfo *cfile; LIST_HEAD(file_head); - page = alloc_dentry_path(); spin_lock(&tcon->open_file_lock); list_for_each_entry(cfile, &tcon->openFileList, tlist) { - full_path = build_path_from_dentry(cfile->dentry, page); - if (strstr(full_path, path)) { - if (delayed_work_pending(&cfile->deferred)) { - if (cancel_delayed_work(&cfile->deferred)) { - spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - cifs_del_deferred_close(cfile); - spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - - tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); - if (tmp_list == NULL) - break; - tmp_list->cfile = cfile; - list_add_tail(&tmp_list->list, &file_head); - } - } + if ((cfile->dentry == dentry) && + delayed_work_pending(&cfile->deferred) && + cancel_delayed_work(&cfile->deferred)) { + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); + if (tmp_list == NULL) + break; + tmp_list->cfile = cfile; + list_add_tail(&tmp_list->list, &file_head); } } spin_unlock(&tcon->open_file_lock); @@ -868,7 +863,6 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) list_del(&tmp_list->list); kfree(tmp_list); } - free_dentry_path(page); } /* From bac28f604c7699727b2fecf14c3a54668bbe458e Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 12 Aug 2025 12:58:21 +0200 Subject: [PATCH 2070/3206] smb: client: use disable[_delayed]_work_sync in smbdirect.c This makes it safer during the disconnect and avoids requeueing. It's ok to call disable[delayed_]work[_sync]() more than once. 
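A short sketch (assumed names, not from the patch) of why disable_work_sync() is preferable to cancel_work_sync() during teardown:

#include <linux/workqueue.h>

static void teardown_sketch(struct workqueue_struct *wq,
			    struct work_struct *work)
{
	/* cancel_work_sync(work) only cancels the currently pending
	 * execution; a racing queue_work() can still requeue it.
	 *
	 * disable_work_sync() also leaves the item disabled, so any
	 * later queue_work() is a no-op until enable_work() is called.
	 */
	disable_work_sync(work);

	queue_work(wq, work);	/* ignored: the work is disabled */
}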
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Fixes: 050b8c374019 ("smbd: Make upper layer decide when to destroy the transport") Fixes: f198186aa9bb ("CIFS: SMBD: Establish SMB Direct connection") Fixes: c7398583340a ("CIFS: SMBD: Implement RDMA memory registration") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index dafa3ed4a630f9..1f8de8866c0605 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1353,7 +1353,7 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->ib.qp = NULL; log_rdma_event(INFO, "cancelling idle timer\n"); - cancel_delayed_work_sync(&info->idle_timer_work); + disable_delayed_work_sync(&info->idle_timer_work); /* It's not possible for upper layer to get to reassembly */ log_rdma_event(INFO, "drain the reassembly queue\n"); @@ -1726,7 +1726,7 @@ static struct smbd_connection *_smbd_get_connection( return NULL; negotiation_failed: - cancel_delayed_work_sync(&info->idle_timer_work); + disable_delayed_work_sync(&info->idle_timer_work); destroy_caches_and_workqueue(info); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; rdma_disconnect(sc->rdma.cm_id); @@ -2085,7 +2085,7 @@ static void destroy_mr_list(struct smbd_connection *info) struct smbdirect_socket *sc = &info->socket; struct smbd_mr *mr, *tmp; - cancel_work_sync(&info->mr_recovery_work); + disable_work_sync(&info->mr_recovery_work); list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { if (mr->state == MR_INVALIDATED) ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, From d9dcbbcf9145b68aa85c40947311a6907277e097 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 12 Aug 2025 13:03:19 +0200 Subject: [PATCH 2071/3206] smb: client: let smbd_destroy() call disable_work_sync(&info->post_send_credits_work) In smbd_destroy() we may destroy the memory so we better wait until post_send_credits_work is no longer pending and will never be started again. I actually just hit the case using rxe: WARNING: CPU: 0 PID: 138 at drivers/infiniband/sw/rxe/rxe_verbs.c:1032 rxe_post_recv+0x1ee/0x480 [rdma_rxe] ... [ 5305.686979] [ T138] smbd_post_recv+0x445/0xc10 [cifs] [ 5305.687135] [ T138] ? srso_alias_return_thunk+0x5/0xfbef5 [ 5305.687149] [ T138] ? __kasan_check_write+0x14/0x30 [ 5305.687185] [ T138] ? __pfx_smbd_post_recv+0x10/0x10 [cifs] [ 5305.687329] [ T138] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 5305.687356] [ T138] ? srso_alias_return_thunk+0x5/0xfbef5 [ 5305.687368] [ T138] ? srso_alias_return_thunk+0x5/0xfbef5 [ 5305.687378] [ T138] ? _raw_spin_unlock_irqrestore+0x11/0x60 [ 5305.687389] [ T138] ? srso_alias_return_thunk+0x5/0xfbef5 [ 5305.687399] [ T138] ? get_receive_buffer+0x168/0x210 [cifs] [ 5305.687555] [ T138] smbd_post_send_credits+0x382/0x4b0 [cifs] [ 5305.687701] [ T138] ? __pfx_smbd_post_send_credits+0x10/0x10 [cifs] [ 5305.687855] [ T138] ? __pfx___schedule+0x10/0x10 [ 5305.687865] [ T138] ? __pfx__raw_spin_lock_irq+0x10/0x10 [ 5305.687875] [ T138] ? queue_delayed_work_on+0x8e/0xa0 [ 5305.687889] [ T138] process_one_work+0x629/0xf80 [ 5305.687908] [ T138] ? srso_alias_return_thunk+0x5/0xfbef5 [ 5305.687917] [ T138] ? __kasan_check_write+0x14/0x30 [ 5305.687933] [ T138] worker_thread+0x87f/0x1570 ... It means rxe_post_recv was called after rdma_destroy_qp(). 
This happened because put_receive_buffer() was triggered by ib_drain_qp() and called: queue_work(info->workqueue, &info->post_send_credits_work); Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Fixes: f198186aa9bb ("CIFS: SMBD: Establish SMB Direct connection") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 1f8de8866c0605..1f3d6414586361 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1347,6 +1347,9 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } + log_rdma_event(INFO, "cancelling post_send_credits_work\n"); + disable_work_sync(&info->post_send_credits_work); + log_rdma_event(INFO, "destroying qp\n"); ib_drain_qp(sc->ib.qp); rdma_destroy_qp(sc->rdma.cm_id); From 96fa515e70f3e4b98685ef8cac9d737fc62f10e1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 16 Sep 2025 07:54:06 +0930 Subject: [PATCH 2072/3206] btrfs: tree-checker: fix the incorrect inode ref size check [BUG] Inside check_inode_ref(), we need to make sure every structure, including the btrfs_inode_extref header, is covered by the item. But our code is incorrectly using "sizeof(iref)", where @iref is just a pointer. This means "sizeof(iref)" will always be "sizeof(void *)", which is much smaller than "sizeof(struct btrfs_inode_extref)". This will allow some bad inode extrefs to sneak in, defeating tree-checker. [FIX] Fix the typo by calling "sizeof(*iref)", which is the same as "sizeof(struct btrfs_inode_extref)", and will be the correct behavior we want. Fixes: 71bf92a9b877 ("btrfs: tree-checker: Add check for INODE_REF") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Johannes Thumshirn Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 0f556f4de3f924..a997c7cc35a26f 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1756,10 +1756,10 @@ static int check_inode_ref(struct extent_buffer *leaf, while (ptr < end) { u16 namelen; - if (unlikely(ptr + sizeof(iref) > end)) { + if (unlikely(ptr + sizeof(*iref) > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", - ptr, end, sizeof(iref)); + ptr, end, sizeof(*iref)); return -EUCLEAN; } From ed4e6b5d644c4dd2bc2872ffec036b7da0ec2e27 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 15 Sep 2025 08:37:47 +0200 Subject: [PATCH 2073/3206] btrfs: ref-verify: handle damaged extent root tree Syzbot hits a problem with enabled ref-verify, ignorebadroots and a fuzzed/damaged extent tree. There's no fallback option like in other places that can deal with it so disable the whole ref-verify as it is just a debugging feature. 
Reported-by: syzbot+9c3e0cdfbfe351b0bc0e@syzkaller.appspotmail.com
Link: https://lore.kernel.org/all/0000000000001b6052062139be1c@google.com/
Reviewed-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/ref-verify.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 3871c3a6c743b5..9f1858b42c0e21 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -980,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
	if (!btrfs_test_opt(fs_info, REF_VERIFY))
		return 0;
 
+	extent_root = btrfs_extent_root(fs_info, 0);
+	/* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */
+	if (IS_ERR(extent_root)) {
+		btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling");
+		btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+		return 0;
+	}
+
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
 
-	extent_root = btrfs_extent_root(fs_info, 0);
	eb = btrfs_read_lock_root_node(extent_root);
	level = btrfs_header_level(eb);
	path->nodes[level] = eb;

From 9574b2330dbd2b5459b74d3b5e9619d39299fc6f Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Tue, 16 Sep 2025 15:42:41 +0800
Subject: [PATCH 2074/3206] crypto: af_alg - Set merge to zero early in af_alg_sendmsg

If an error causes af_alg_sendmsg to abort, ctx->merge may contain a garbage value from the previous loop iteration. This may then trigger a crash on the next entry into af_alg_sendmsg when it attempts to do a merge that can't be done.

Fix this by setting ctx->merge to zero near the start of the loop.

Fixes: 8ff590903d5 ("crypto: algif_skcipher - User-space interface for skcipher operations")
Reported-by: Muhammad Alifa Ramdhan
Reported-by: Bing-Jhong Billy Jheng
Signed-off-by: Herbert Xu
---
 crypto/af_alg.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 0da7c1ac778a0e..407f2c238f2c67 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -1019,6 +1019,8 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
			continue;
		}
 
+		ctx->merge = 0;
+
		if (!af_alg_writable(sk)) {
			err = af_alg_wait_for_wmem(sk, msg->msg_flags);
			if (err)
@@ -1058,7 +1060,6 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
			ctx->used += plen;
			copied += plen;
			size -= plen;
-			ctx->merge = 0;
		} else {
			do {
				struct page *pg;

From 1b34cbbf4f011a121ef7b2d7d6e6920a036d5285 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Tue, 16 Sep 2025 17:20:59 +0800
Subject: [PATCH 2075/3206] crypto: af_alg - Disallow concurrent writes in af_alg_sendmsg

Issuing two writes to the same af_alg socket is bogus as the data will be interleaved in an unpredictable fashion. Furthermore, concurrent writes may create inconsistencies in the internal socket state.

Disallow this by adding a new ctx->write field that indicates exclusive ownership for writing.
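From user space this looks roughly as follows (hypothetical sketch; alg_fd is an accept()ed af_alg operation socket):

#include <sys/socket.h>

/* Two threads writing to the same af_alg request socket. */
static ssize_t writer_a(int alg_fd, const void *buf, size_t len)
{
	/* the first writer takes ctx->write for the duration of sendmsg() */
	return send(alg_fd, buf, len, MSG_MORE);
}

static ssize_t writer_b(int alg_fd, const void *buf, size_t len)
{
	/* if this races with writer_a(), it now fails with EBUSY
	 * instead of interleaving its data into the same request
	 */
	return send(alg_fd, buf, len, 0);
}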
Fixes: 8ff590903d5 ("crypto: algif_skcipher - User-space interface for skcipher operations")
Reported-by: Muhammad Alifa Ramdhan
Reported-by: Bing-Jhong Billy Jheng
Signed-off-by: Herbert Xu
---
 crypto/af_alg.c | 7 +++++++
 include/crypto/if_alg.h | 10 ++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 407f2c238f2c67..ca6fdcc6c54aca 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -970,6 +970,12 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
	}
 
	lock_sock(sk);
+	if (ctx->write) {
+		release_sock(sk);
+		return -EBUSY;
+	}
+	ctx->write = true;
+
	if (ctx->init && !ctx->more) {
		if (ctx->used) {
			err = -EINVAL;
@@ -1105,6 +1111,7 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
 
 unlock:
	af_alg_data_wakeup(sk);
+	ctx->write = false;
	release_sock(sk);
 
	return copied ?: err;
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index f7b3b93f3a49a7..0c70f3a5557505 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -135,6 +135,7 @@ struct af_alg_async_req {
  *		SG?
  * @enc:	Cryptographic operation to be performed when
  *		recvmsg is invoked.
+ * @write:	True if we are in the middle of a write.
  * @init:	True if metadata has been sent.
  * @len:	Length of memory allocated for this data structure.
  * @inflight:	Non-zero when AIO requests are in flight.
@@ -151,10 +152,11 @@ struct af_alg_ctx {
	size_t used;
	atomic_t rcvused;
 
-	bool more;
-	bool merge;
-	bool enc;
-	bool init;
+	u32 more:1,
+	    merge:1,
+	    enc:1,
+	    write:1,
+	    init:1;
 
	unsigned int len;

From f38c2c3e572ce0ce5c01de0358ed70328e0cb5af Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto
Date: Thu, 18 Sep 2025 06:26:55 +0000
Subject: [PATCH 2076/3206] arm64: cputype: Add Cortex-A720AE definitions

Add cputype definitions for Cortex-A720AE. These will be used for errata detection in subsequent patches.

These values can be found in the Cortex-A720AE TRM:

https://developer.arm.com/documentation/102828/0001/

... in Table A-187

Signed-off-by: Kuninori Morimoto
Signed-off-by: Will Deacon
---
 arch/arm64/include/asm/cputype.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 661735616787e2..b10eba7f52476b 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -96,6 +96,7 @@
 #define ARM_CPU_PART_NEOVERSE_V3	0xD84
 #define ARM_CPU_PART_CORTEX_X925	0xD85
 #define ARM_CPU_PART_CORTEX_A725	0xD87
+#define ARM_CPU_PART_CORTEX_A720AE	0xD89
 #define ARM_CPU_PART_NEOVERSE_N3	0xD8E
 
 #define APM_CPU_PART_XGENE		0x000
@@ -185,6 +186,7 @@
 #define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3)
 #define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925)
 #define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725)
+#define MIDR_CORTEX_A720AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720AE)
 #define MIDR_NEOVERSE_N3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N3)
 #define MIDR_THUNDERX	MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
 #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)

From 3ba8d4aa42bd5dc6e2493ce4a73bb41c9cfd77ca Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto
Date: Thu, 18 Sep 2025 06:27:12 +0000
Subject: [PATCH 2077/3206] arm64: errata: Expand speculative SSBS workaround for Cortex-A720AE

It is the same as Cortex-A720.
Link: https://lore.kernel.org/all/aMlFwbDjJ6yKuxTv@J2N7QTR9R3.cambridge.arm.com/ Signed-off-by: Kuninori Morimoto Signed-off-by: Will Deacon --- arch/arm64/kernel/cpu_errata.c | 1 + arch/arm64/kernel/proton-pack.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 59d723c9ab8f5a..7ff6b49beaaffc 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -531,6 +531,7 @@ static const struct midr_range erratum_spec_ssbs_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index edf1783ffc8174..f9a32dfde00671 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -884,6 +884,7 @@ static u8 spectre_bhb_loop_affected(void) static const struct midr_range spectre_bhb_k38_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), {}, }; static const struct midr_range spectre_bhb_k32_list[] = { From 0aeb54ac4cd5cf8f60131b4d9ec0b6dc9c27b20d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:28:13 -0700 Subject: [PATCH 2078/3206] tls: make sure to abort the stream if headers are bogus Normally we wait for the socket to buffer up the whole record before we service it. If the socket has a tiny buffer, however, we read out the data sooner, to prevent connection stalls. Make sure that we abort the connection when we find out late that the record is actually invalid. Retrying the parsing is fine in itself but since we copy some more data each time before we parse we can overflow the allocated skb space. Constructing a scenario in which we're under pressure without enough data in the socket to parse the length upfront is quite hard. syzbot figured out a way to do this by serving us the header in small OOB sends, and then filling in the recvbuf with a large normal send. Make sure that tls_rx_msg_size() aborts strp, if we reach an invalid record there's really no way to recover. 
Reported-by: Lee Jones Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Reviewed-by: Sabrina Dubroca Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250917002814.1743558-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/tls/tls.h | 1 + net/tls/tls_strp.c | 14 +++++++++----- net/tls/tls_sw.c | 3 +-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/net/tls/tls.h b/net/tls/tls.h index 4e077068e6d98a..e4c42731ce39ae 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -141,6 +141,7 @@ void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); void tls_err_abort(struct sock *sk, int err); +void tls_strp_abort_strp(struct tls_strparser *strp, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index d71643b494a1ae..98e12f0ff57e51 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -13,7 +13,7 @@ static struct workqueue_struct *tls_strp_wq; -static void tls_strp_abort_strp(struct tls_strparser *strp, int err) +void tls_strp_abort_strp(struct tls_strparser *strp, int err) { if (strp->stopped) return; @@ -211,11 +211,17 @@ static int tls_strp_copyin_frag(struct tls_strparser *strp, struct sk_buff *skb, struct sk_buff *in_skb, unsigned int offset, size_t in_len) { + unsigned int nfrag = skb->len / PAGE_SIZE; size_t len, chunk; skb_frag_t *frag; int sz; - frag = &skb_shinfo(skb)->frags[skb->len / PAGE_SIZE]; + if (unlikely(nfrag >= skb_shinfo(skb)->nr_frags)) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EMSGSIZE; + } + + frag = &skb_shinfo(skb)->frags[nfrag]; len = in_len; /* First make sure we got the header */ @@ -520,10 +526,8 @@ static int tls_strp_read_sock(struct tls_strparser *strp) tls_strp_load_anchor_with_queue(strp, inq); if (!strp->stm.full_len) { sz = tls_rx_msg_size(strp, strp->anchor); - if (sz < 0) { - tls_strp_abort_strp(strp, sz); + if (sz < 0) return sz; - } strp->stm.full_len = sz; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bac65d0d4e3e1e..daac9fd4be7eb5 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2474,8 +2474,7 @@ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb) return data_len + TLS_HEADER_SIZE; read_failure: - tls_err_abort(strp->sk, ret); - + tls_strp_abort_strp(strp, ret); return ret; } From 4c05c7ed880fb58790731fb53571af67b7632d87 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:28:14 -0700 Subject: [PATCH 2079/3206] selftests: tls: test skb copy under mem pressure and OOB Add a test which triggers mem pressure via OOB writes. Reviewed-by: Sabrina Dubroca Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250917002814.1743558-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tls.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 0f5640d8dc7fbf..dd093f9df6f147 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -2770,6 +2770,22 @@ TEST_F(tls_err, poll_partial_rec_async) } } +/* Use OOB+large send to trigger copy mode due to memory pressure. + * OOB causes a short read. 
+ */ +TEST_F(tls_err, oob_pressure) +{ + char buf[1<<16]; + int i; + + memrnd(buf, sizeof(buf)); + + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); + EXPECT_EQ(send(self->fd2, buf, sizeof(buf), 0), sizeof(buf)); + for (i = 0; i < 64; i++) + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); +} + TEST(non_established) { struct tls12_crypto_info_aes_gcm_256 tls12; struct sockaddr_in addr; From b98b208300573f4ab29507f81194a6030b208444 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 25 Aug 2025 19:56:26 +0930 Subject: [PATCH 2080/3206] btrfs: reject invalid compression level Inspired by recent changes to compression level parsing in 6db1df415d73fc ("btrfs: accept and ignore compression level for lzo") it turns out that we do not do any extra validation for compression level input string, thus allowing things like "compress=lzo:invalid" to be accepted without warnings. Although we accept levels that are beyond the supported algorithm ranges, accepting completely invalid level specification is not correct. Fix the too loose checks for compression level, by doing proper error handling of kstrtoint(), so that we will reject not only too large values (beyond int range) but also completely wrong levels like "lzo:invalid". Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 22 +++++++++++++--------- fs/btrfs/compression.h | 2 +- fs/btrfs/super.c | 27 +++++++++++++++++++-------- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d09d622016ef54..35e3071cec0636 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1616,25 +1616,29 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) } /* - * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level. Negative level - * numbers are allowed. + * Convert the compression suffix (eg. after "zlib" starting with ":") to level. + * + * If the resulting level exceeds the algo's supported levels, it will be clamped. + * + * Return <0 if no valid string can be found. + * Return 0 if everything is fine. 
*/ -int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret) { int level = 0; int ret; - if (!type) + if (!type) { + *level_ret = btrfs_compress_set_level(type, level); return 0; + } if (str[0] == ':') { ret = kstrtoint(str + 1, 10, &level); if (ret) - level = 0; + return ret; } - level = btrfs_compress_set_level(type, level); - - return level; + *level_ret = btrfs_compress_set_level(type, level); + return 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1b38e707bbd985..7b41b2b5ff4488 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -102,7 +102,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); struct folio *btrfs_alloc_compr_folio(void); void btrfs_free_compr_folio(struct folio *folio); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e708faf1892f47..a3f9f9e1c02ab5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -276,6 +276,7 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, const struct fs_parameter *param, int opt) { const char *string = param->string; + int ret; /* * Provide the same semantics as older kernels that don't use fs @@ -294,15 +295,19 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zlib", true)) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, - string + 4); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "lzo", true)) { ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, - string + 3); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3, + &ctx->compress_level); + if (ret < 0) + goto error; if (string[3] == ':' && string[4]) btrfs_warn(NULL, "Compression level ignored for LZO"); btrfs_set_opt(ctx->mount_opt, COMPRESS); @@ -310,8 +315,10 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zstd", true)) { ctx->compress_type = BTRFS_COMPRESS_ZSTD; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, - string + 4); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); @@ -322,10 +329,14 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); } else { - btrfs_err(NULL, "unrecognized compression value %s", string); - return -EINVAL; + ret = -EINVAL; + goto error; } return 0; +error: + btrfs_err(NULL, "failed to parse compression option '%s'", string); + return ret; + } static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) From 
baad7830ee9a56756b3857348452fe756cb0a702 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 18 Sep 2025 19:43:36 +0800 Subject: [PATCH 2081/3206] objtool/LoongArch: Mark types based on break immediate code If the break immediate code is 0, it should mark the type as INSN_TRAP. If the break immediate code is 1, it should mark the type as INSN_BUG. While at it, format the code style and add the code comment for nop. Cc: stable@vger.kernel.org Suggested-by: WANG Rui Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/objtool/arch/loongarch/decode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index b6fdc68053cc4e..428ba6de382191 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -310,10 +310,16 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec if (decode_insn_reg2i16_fomat(inst, insn)) return 0; - if (inst.word == 0) + if (inst.word == 0) { + /* andi $zero, $zero, 0x0 */ insn->type = INSN_NOP; - else if (inst.reg0i15_format.opcode == break_op) { - /* break */ + } else if (inst.reg0i15_format.opcode == break_op && + inst.reg0i15_format.immediate == 0x0) { + /* break 0x0 */ + insn->type = INSN_TRAP; + } else if (inst.reg0i15_format.opcode == break_op && + inst.reg0i15_format.immediate == 0x1) { + /* break 0x1 */ insn->type = INSN_BUG; } else if (inst.reg2_format.opcode == ertn_op) { /* ertn */ From 539d7344d4feaea37e05863e9aa86bd31f28e46f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 18 Sep 2025 19:43:36 +0800 Subject: [PATCH 2082/3206] objtool/LoongArch: Mark special atomic instruction as INSN_BUG type When compiling with LLVM and CONFIG_RUST is set, there exists the following objtool warning: rust/compiler_builtins.o: warning: objtool: __rust__unordsf2(): unexpected end of section .text.unlikely. objdump shows that the end of section .text.unlikely is an atomic instruction: amswap.w $zero, $ra, $zero According to the LoongArch Reference Manual, if the amswap.w atomic memory access instruction has the same register number as rd and rj, the execution will trigger an Instruction Non-defined Exception, so mark the above instruction as INSN_BUG type to fix the warning. 
Cc: stable@vger.kernel.org
Signed-off-by: Tiezhu Yang
Signed-off-by: Huacai Chen
---
 tools/arch/loongarch/include/asm/inst.h | 12 ++++++++++++
 tools/objtool/arch/loongarch/decode.c | 21 +++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/tools/arch/loongarch/include/asm/inst.h b/tools/arch/loongarch/include/asm/inst.h
index c25b5853181dba..d68fad63c8b732 100644
--- a/tools/arch/loongarch/include/asm/inst.h
+++ b/tools/arch/loongarch/include/asm/inst.h
@@ -51,6 +51,10 @@ enum reg2i16_op {
	bgeu_op = 0x1b,
 };
 
+enum reg3_op {
+	amswapw_op = 0x70c0,
+};
+
 struct reg0i15_format {
	unsigned int immediate : 15;
	unsigned int opcode : 17;
@@ -96,6 +100,13 @@ struct reg2i16_format {
	unsigned int opcode : 6;
 };
 
+struct reg3_format {
+	unsigned int rd : 5;
+	unsigned int rj : 5;
+	unsigned int rk : 5;
+	unsigned int opcode : 17;
+};
+
 union loongarch_instruction {
	unsigned int word;
	struct reg0i15_format reg0i15_format;
@@ -105,6 +116,7 @@ union loongarch_instruction {
	struct reg2i12_format reg2i12_format;
	struct reg2i14_format reg2i14_format;
	struct reg2i16_format reg2i16_format;
+	struct reg3_format reg3_format;
 };
 
 #define LOONGARCH_INSN_SIZE sizeof(union loongarch_instruction)
diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c
index 428ba6de382191..2e555c4060c5e4 100644
--- a/tools/objtool/arch/loongarch/decode.c
+++ b/tools/objtool/arch/loongarch/decode.c
@@ -278,6 +278,25 @@ static bool decode_insn_reg2i16_fomat(union loongarch_instruction inst,
	return true;
 }
 
+static bool decode_insn_reg3_fomat(union loongarch_instruction inst,
+				   struct instruction *insn)
+{
+	switch (inst.reg3_format.opcode) {
+	case amswapw_op:
+		if (inst.reg3_format.rd == LOONGARCH_GPR_ZERO &&
+		    inst.reg3_format.rk == LOONGARCH_GPR_RA &&
+		    inst.reg3_format.rj == LOONGARCH_GPR_ZERO) {
+			/* amswap.w $zero, $ra, $zero */
+			insn->type = INSN_BUG;
+		}
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
 int arch_decode_instruction(struct objtool_file *file, const struct section *sec,
			    unsigned long offset, unsigned int maxlen,
			    struct instruction *insn)
@@ -309,6 +328,8 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
		return 0;
	if (decode_insn_reg2i16_fomat(inst, insn))
		return 0;
+	if (decode_insn_reg3_fomat(inst, insn))
+		return 0;
 
	if (inst.word == 0) {
		/* andi $zero, $zero, 0x0 */

From b15212824a01cb0b62f7b522f4ee334622cf982a Mon Sep 17 00:00:00 2001
From: Tiezhu Yang
Date: Thu, 18 Sep 2025 19:43:42 +0800
Subject: [PATCH 2083/3206] LoongArch: Make LTO case independent in Makefile

LTO is not only used with Clang, but may also be used with Rust, so move the LTO case out of CONFIG_CC_HAS_ANNOTATE_TABLEJUMP in the Makefile. This is preparation for a later patch; no functional changes.

Cc: stable@vger.kernel.org
Suggested-by: WANG Rui
Signed-off-by: Tiezhu Yang
Signed-off-by: Huacai Chen
---
 arch/loongarch/Makefile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index a3a9759414f40f..9d80af7f75c833 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -102,16 +102,16 @@ KBUILD_CFLAGS += $(call cc-option,-mthin-add-sub) $(call cc-option,-Wa$(comma)
 
 ifdef CONFIG_OBJTOOL
 ifdef CONFIG_CC_HAS_ANNOTATE_TABLEJUMP
+KBUILD_CFLAGS += -mannotate-tablejump
+else
+KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers
+endif
+ifdef CONFIG_LTO_CLANG
 # The annotate-tablejump option can not be passed to LLVM backend when LTO is enabled.
# Ensure it is aware of linker with LTO, '--loongarch-annotate-tablejump' also needs to # be passed via '-mllvm' to ld.lld. -KBUILD_CFLAGS += -mannotate-tablejump -ifdef CONFIG_LTO_CLANG KBUILD_LDFLAGS += -mllvm --loongarch-annotate-tablejump endif -else -KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers -endif endif KBUILD_RUSTFLAGS += --target=loongarch64-unknown-none-softfloat -Ccode-model=small From 74f8295c6fb8436bec9995baf6ba463151b6fb68 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 18 Sep 2025 19:43:42 +0800 Subject: [PATCH 2084/3206] LoongArch: Handle jump tables options for RUST When compiling with LLVM and CONFIG_RUST is set, there exist objtool warnings in rust/core.o and rust/kernel.o, like this: rust/core.o: warning: objtool: _RNvXs1_NtNtCs5QSdWC790r4_4core5ascii10ascii_charNtB5_9AsciiCharNtNtB9_3fmt5Debug3fmt+0x54: sibling call from callable instruction with modified stack frame For this special case, the related object file shows that there is no generated relocation section '.rela.discard.tablejump_annotate' for the table jump instruction jirl, thus objtool can not know that what is the actual destination address. If rustc has the option "-Cllvm-args=--loongarch-annotate-tablejump", pass the option to enable jump tables for objtool, otherwise it should pass "-Zno-jump-tables" to keep compatibility with older rustc. How to test: $ rustup component add rust-src $ make LLVM=1 rustavailable $ make ARCH=loongarch LLVM=1 clean defconfig $ scripts/config -d MODVERSIONS \ -e RUST -e SAMPLES -e SAMPLES_RUST \ -e SAMPLE_RUST_CONFIGFS -e SAMPLE_RUST_MINIMAL \ -e SAMPLE_RUST_MISC_DEVICE -e SAMPLE_RUST_PRINT \ -e SAMPLE_RUST_DMA -e SAMPLE_RUST_DRIVER_PCI \ -e SAMPLE_RUST_DRIVER_PLATFORM -e SAMPLE_RUST_DRIVER_FAUX \ -e SAMPLE_RUST_DRIVER_AUXILIARY -e SAMPLE_RUST_HOSTPROGS $ make ARCH=loongarch LLVM=1 olddefconfig all Cc: stable@vger.kernel.org Acked-by: Miguel Ojeda Reported-by: Miguel Ojeda Closes: https://lore.kernel.org/rust-for-linux/CANiq72mNeCuPkCDrG2db3w=AX+O-zYrfprisDPmRac_qh65Dmg@mail.gmail.com/ Suggested-by: WANG Rui Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 4 ++++ arch/loongarch/Makefile | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f0abc38c40ac9e..57933a717e92ab 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -298,6 +298,10 @@ config AS_HAS_LVZ_EXTENSION config CC_HAS_ANNOTATE_TABLEJUMP def_bool $(cc-option,-mannotate-tablejump) +config RUSTC_HAS_ANNOTATE_TABLEJUMP + depends on RUST + def_bool $(rustc-option,-Cllvm-args=--loongarch-annotate-tablejump) + menu "Kernel type and options" source "kernel/Kconfig.hz" diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 9d80af7f75c833..ae419e32f22e2f 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -106,6 +106,11 @@ KBUILD_CFLAGS += -mannotate-tablejump else KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers endif +ifdef CONFIG_RUSTC_HAS_ANNOTATE_TABLEJUMP +KBUILD_RUSTFLAGS += -Cllvm-args=--loongarch-annotate-tablejump +else +KBUILD_RUSTFLAGS += -Zno-jump-tables # keep compatibility with older compilers +endif ifdef CONFIG_LTO_CLANG # The annotate-tablejump option can not be passed to LLVM backend when LTO is enabled. 
# Ensure it is aware of linker with LTO, '--loongarch-annotate-tablejump' also needs to

From f5003098e2f337d8e8a87dc636250e3fa978d9ad Mon Sep 17 00:00:00 2001
From: Tiezhu Yang
Date: Thu, 18 Sep 2025 19:43:42 +0800
Subject: [PATCH 2085/3206] LoongArch: Update help info of ARCH_STRICT_ALIGN

Loongson-3A6000 and 3C6000 CPUs also support unaligned memory access, so the current description is out of date to some extent.

Actually, all Loongson-3 series processors based on LoongArch support unaligned memory access; this hardware capability is indicated by bit 20 (UAL) of the CPUCFG1 register. Update the help info to reflect reality.

Cc: stable@vger.kernel.org
Signed-off-by: Tiezhu Yang
Signed-off-by: Huacai Chen
---
 arch/loongarch/Kconfig | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 57933a717e92ab..0631a6b11281b9 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -567,10 +567,14 @@ config ARCH_STRICT_ALIGN
	  -mstrict-align build parameter to prevent unaligned accesses.
 
	  CPUs with h/w unaligned access support:
-	  Loongson-2K2000/2K3000/3A5000/3C5000/3D5000.
+	  Loongson-2K2000/2K3000 and all of Loongson-3 series processors
+	  based on LoongArch.
 
	  CPUs without h/w unaligned access support:
-	  Loongson-2K500/2K1000.
+	  Loongson-2K0300/2K0500/2K1000.
+
+	  If you want to know whether your hardware supports unaligned memory
+	  access, please check bit 20 (UAL) of the CPUCFG1 register.
 
	  This option is enabled by default to make the kernel be able to run
	  on all LoongArch systems. But you can disable it manually if you want

From a9d13433fe17be0e867e51e71a1acd2731fbef8d Mon Sep 17 00:00:00 2001
From: Huacai Chen
Date: Thu, 18 Sep 2025 19:44:01 +0800
Subject: [PATCH 2086/3206] LoongArch: Align ACPI structures if ARCH_STRICT_ALIGN enabled

ARCH_STRICT_ALIGN is used for hardware without UAL; currently it only controls the -mstrict-align flag. However, ACPI structures are packed by default and so will cause unaligned accesses.

To avoid this, define ACPI_MISALIGNMENT_NOT_SUPPORTED in asm/acenv.h to align ACPI structures if ARCH_STRICT_ALIGN is enabled.
Cc: stable@vger.kernel.org
Signed-off-by: Tao Cui
Signed-off-by: Huacai Chen
---
 arch/loongarch/kernel/env.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/loongarch/kernel/env.c b/arch/loongarch/kernel/env.c
index c0a5dc9aeae287..be309a71f20491 100644
--- a/arch/loongarch/kernel/env.c
+++ b/arch/loongarch/kernel/env.c
@@ -109,6 +109,8 @@ static int __init boardinfo_init(void)
	struct kobject *loongson_kobj;
 
	loongson_kobj = kobject_create_and_add("loongson", firmware_kobj);
+	if (!loongson_kobj)
+		return -ENOMEM;
 
	return sysfs_create_file(loongson_kobj, &boardinfo_attr.attr);
 }

From d6d69f0edde63b553345d4efaceb7daed89fe04c Mon Sep 17 00:00:00 2001
From: Tao Cui
Date: Thu, 18 Sep 2025 19:44:04 +0800
Subject: [PATCH 2088/3206] LoongArch: Replace sprintf() with sysfs_emit()

As Documentation/filesystems/sysfs.rst suggested, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. No functional change intended.

Cc: stable@vger.kernel.org
Signed-off-by: Tao Cui
Signed-off-by: Huacai Chen
---
 arch/loongarch/kernel/env.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/loongarch/kernel/env.c b/arch/loongarch/kernel/env.c
index be309a71f20491..23bd5ae2212c26 100644
--- a/arch/loongarch/kernel/env.c
+++ b/arch/loongarch/kernel/env.c
@@ -86,7 +86,7 @@ late_initcall(fdt_cpu_clk_init);
 static ssize_t boardinfo_show(struct kobject *kobj,
			      struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(buf,
+	return sysfs_emit(buf,
		"BIOS Information\n"
		"Vendor\t\t\t: %s\n"
		"Version\t\t\t: %s\n"

From 677d4a52d4dc4a147d5e84af9ff207832578be70 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang
Date: Thu, 18 Sep 2025 19:44:08 +0800
Subject: [PATCH 2089/3206] LoongArch: Fix unreliable stack for live patching

When testing kernel live patching with "modprobe livepatch-sample", there is a timeout of over 15 seconds from "starting patching transition" to "patching complete". The dmesg command shows "unreliable stack" for user tasks in debug mode; here is one of the messages:

livepatch: klp_try_switch_task: bash:1193 has an unreliable stack

The stack is "unreliable" because the unwinder cannot step from do_syscall() back to its previous frame, handle_syscall(). Since do_syscall() switches to a secondary stack, the unwinder needs fp to find the original stack top; but fp is not used by some other functions, so it cannot be restored from the next frame of do_syscall(). It is therefore necessary to save fp when the task is not current, in order to get the stack top of do_syscall().

Here are the call chains:

klp_enable_patch()
  klp_try_complete_transition()
    klp_try_switch_task()
      klp_check_and_switch_task()
        klp_check_stack()
          stack_trace_save_tsk_reliable()
            arch_stack_walk_reliable()

A similar issue exists when executing "rmmod livepatch-sample". With this patch, both patching and unpatching complete quickly.
Before:

# modprobe livepatch-sample
# dmesg -T | tail -3
[Sat Sep 6 11:00:20 2025] livepatch: 'livepatch_sample': starting patching transition
[Sat Sep 6 11:00:35 2025] livepatch: signaling remaining tasks
[Sat Sep 6 11:00:36 2025] livepatch: 'livepatch_sample': patching complete
# echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled
# rmmod livepatch_sample
rmmod: ERROR: Module livepatch_sample is in use
# rmmod livepatch_sample
# dmesg -T | tail -3
[Sat Sep 6 11:06:05 2025] livepatch: 'livepatch_sample': starting unpatching transition
[Sat Sep 6 11:06:20 2025] livepatch: signaling remaining tasks
[Sat Sep 6 11:06:21 2025] livepatch: 'livepatch_sample': unpatching complete

After:

# modprobe livepatch-sample
# dmesg -T | tail -2
[Tue Sep 16 16:19:30 2025] livepatch: 'livepatch_sample': starting patching transition
[Tue Sep 16 16:19:31 2025] livepatch: 'livepatch_sample': patching complete
# echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled
# rmmod livepatch_sample
# dmesg -T | tail -2
[Tue Sep 16 16:19:36 2025] livepatch: 'livepatch_sample': starting unpatching transition
[Tue Sep 16 16:19:37 2025] livepatch: 'livepatch_sample': unpatching complete

Cc: stable@vger.kernel.org # v6.9+
Fixes: 199cc14cb4f1 ("LoongArch: Add kernel livepatching support")
Reported-by: Xi Zhang
Signed-off-by: Tiezhu Yang
Signed-off-by: Huacai Chen
---
 arch/loongarch/kernel/stacktrace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/kernel/stacktrace.c b/arch/loongarch/kernel/stacktrace.c
index 9a038d1070d73b..387dc4d3c4868f 100644
--- a/arch/loongarch/kernel/stacktrace.c
+++ b/arch/loongarch/kernel/stacktrace.c
@@ -51,12 +51,13 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
 	if (task == current) {
 		regs->regs[3] = (unsigned long)__builtin_frame_address(0);
 		regs->csr_era = (unsigned long)__builtin_return_address(0);
+		regs->regs[22] = 0;
 	} else {
 		regs->regs[3] = thread_saved_fp(task);
 		regs->csr_era = thread_saved_ra(task);
+		regs->regs[22] = task->thread.reg22;
 	}
 	regs->regs[1] = 0;
-	regs->regs[22] = 0;

 	for (unwind_start(&state, task, regs); !unwind_done(&state) &&
 	     !unwind_error(&state); unwind_next_frame(&state)) {

From ac398f570724c41e5e039d54e4075519f6af7408 Mon Sep 17 00:00:00 2001
From: Guangshuo Li <202321181@mail.sdu.edu.cn>
Date: Thu, 18 Sep 2025 19:44:10 +0800
Subject: [PATCH 2090/3206] LoongArch: vDSO: Check kcalloc() result in init_vdso()

Add a NULL-pointer check after the kcalloc() call in init_vdso(). If
allocation fails, return -ENOMEM to prevent a possible dereference of
vdso_info.code_mapping.pages when it is NULL.
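Shown in isolation, the guarded allocation reduces to the following
sketch (the hunk itself follows below):

    vdso_info.code_mapping.pages = kcalloc(vdso_info.size / PAGE_SIZE,
                                           sizeof(struct page *), GFP_KERNEL);
    if (!vdso_info.code_mapping.pages)
            return -ENOMEM; /* otherwise the loop below writes through a NULL array */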
Cc: stable@vger.kernel.org Fixes: 2ed119aef60d ("LoongArch: Set correct size for vDSO code mapping") Signed-off-by: Guangshuo Li <202321181@mail.sdu.edu.cn> Signed-off-by: Huacai Chen --- arch/loongarch/kernel/vdso.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c index 7b888d9085a014..dee1a15d7f4c77 100644 --- a/arch/loongarch/kernel/vdso.c +++ b/arch/loongarch/kernel/vdso.c @@ -54,6 +54,9 @@ static int __init init_vdso(void) vdso_info.code_mapping.pages = kcalloc(vdso_info.size / PAGE_SIZE, sizeof(struct page *), GFP_KERNEL); + if (!vdso_info.code_mapping.pages) + return -ENOMEM; + pfn = __phys_to_pfn(__pa_symbol(vdso_info.vdso)); for (i = 0; i < vdso_info.size / PAGE_SIZE; i++) vdso_info.code_mapping.pages[i] = pfn_to_page(pfn + i); From 091b29d53fe645781c5c1f405bc9fcd50ce5792b Mon Sep 17 00:00:00 2001 From: Tao Cui Date: Thu, 18 Sep 2025 19:44:22 +0800 Subject: [PATCH 2091/3206] LoongArch: KVM: Remove unused returns and semicolons The default branch has already handled all undefined cases, so the final return statement is redundant. Redundant semicolons are removed, too. Cc: stable@vger.kernel.org Reviewed-by: Bibo Mao Signed-off-by: Tao Cui Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 2ce41f93b2a444..6c9c7de7226b63 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -778,10 +778,8 @@ static long kvm_save_notify(struct kvm_vcpu *vcpu) return 0; default: return KVM_HCALL_INVALID_CODE; - }; - - return KVM_HCALL_INVALID_CODE; -}; + } +} /* * kvm_handle_lsx_disabled() - Guest used LSX while disabled in root. From f58c9aa1065f73d243904b267c71f6a9d1e9f90e Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 18 Sep 2025 19:44:22 +0800 Subject: [PATCH 2092/3206] LoongArch: KVM: Fix VM migration failure with PTW enabled With PTW disabled system, bit _PAGE_DIRTY is a HW bit for page writing. However with PTW enabled system, bit _PAGE_WRITE is also a "HW bit" for page writing, because hardware synchronizes _PAGE_WRITE to _PAGE_DIRTY automatically. Previously, _PAGE_WRITE is treated as a SW bit to record the page writeable attribute for the fast page fault handling in the secondary MMU, however with PTW enabled machine, this bit is used by HW already (so setting it will silence the TLB modify exception). Here define KVM_PAGE_WRITEABLE with the SW bit _PAGE_MODIFIED, so that it can work on both PTW disabled and enabled machines. And for HW write bits, both _PAGE_DIRTY and _PAGE_WRITE are set or clear together. Cc: stable@vger.kernel.org Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_mmu.h | 20 ++++++++++++++++---- arch/loongarch/kvm/mmu.c | 8 ++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_mmu.h b/arch/loongarch/include/asm/kvm_mmu.h index 099bafc6f797c9..e36cc7e8ed200a 100644 --- a/arch/loongarch/include/asm/kvm_mmu.h +++ b/arch/loongarch/include/asm/kvm_mmu.h @@ -16,6 +16,13 @@ */ #define KVM_MMU_CACHE_MIN_PAGES (CONFIG_PGTABLE_LEVELS - 1) +/* + * _PAGE_MODIFIED is a SW pte bit, it records page ever written on host + * kernel, on secondary MMU it records the page writeable attribute, in + * order for fast path handling. 
+ */ +#define KVM_PAGE_WRITEABLE _PAGE_MODIFIED + #define _KVM_FLUSH_PGTABLE 0x1 #define _KVM_HAS_PGMASK 0x2 #define kvm_pfn_pte(pfn, prot) (((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) @@ -52,10 +59,10 @@ static inline void kvm_set_pte(kvm_pte_t *ptep, kvm_pte_t val) WRITE_ONCE(*ptep, val); } -static inline int kvm_pte_write(kvm_pte_t pte) { return pte & _PAGE_WRITE; } -static inline int kvm_pte_dirty(kvm_pte_t pte) { return pte & _PAGE_DIRTY; } static inline int kvm_pte_young(kvm_pte_t pte) { return pte & _PAGE_ACCESSED; } static inline int kvm_pte_huge(kvm_pte_t pte) { return pte & _PAGE_HUGE; } +static inline int kvm_pte_dirty(kvm_pte_t pte) { return pte & __WRITEABLE; } +static inline int kvm_pte_writeable(kvm_pte_t pte) { return pte & KVM_PAGE_WRITEABLE; } static inline kvm_pte_t kvm_pte_mkyoung(kvm_pte_t pte) { @@ -69,12 +76,12 @@ static inline kvm_pte_t kvm_pte_mkold(kvm_pte_t pte) static inline kvm_pte_t kvm_pte_mkdirty(kvm_pte_t pte) { - return pte | _PAGE_DIRTY; + return pte | __WRITEABLE; } static inline kvm_pte_t kvm_pte_mkclean(kvm_pte_t pte) { - return pte & ~_PAGE_DIRTY; + return pte & ~__WRITEABLE; } static inline kvm_pte_t kvm_pte_mkhuge(kvm_pte_t pte) @@ -87,6 +94,11 @@ static inline kvm_pte_t kvm_pte_mksmall(kvm_pte_t pte) return pte & ~_PAGE_HUGE; } +static inline kvm_pte_t kvm_pte_mkwriteable(kvm_pte_t pte) +{ + return pte | KVM_PAGE_WRITEABLE; +} + static inline int kvm_need_flush(kvm_ptw_ctx *ctx) { return ctx->flag & _KVM_FLUSH_PGTABLE; diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index ed956c5cf2cc04..7c8143e79c1279 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -569,7 +569,7 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ /* Track access to pages marked old */ new = kvm_pte_mkyoung(*ptep); if (write && !kvm_pte_dirty(new)) { - if (!kvm_pte_write(new)) { + if (!kvm_pte_writeable(new)) { ret = -EFAULT; goto out; } @@ -856,9 +856,9 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) prot_bits |= _CACHE_SUC; if (writeable) { - prot_bits |= _PAGE_WRITE; + prot_bits = kvm_pte_mkwriteable(prot_bits); if (write) - prot_bits |= __WRITEABLE; + prot_bits = kvm_pte_mkdirty(prot_bits); } /* Disable dirty logging on HugePages */ @@ -904,7 +904,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) kvm_release_faultin_page(kvm, page, false, writeable); spin_unlock(&kvm->mmu_lock); - if (prot_bits & _PAGE_DIRTY) + if (kvm_pte_dirty(prot_bits)) mark_page_dirty_in_slot(kvm, memslot, gfn); out: From 47256c4c8b1bfbc63223a0da2d4fa90b6ede5cbb Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 18 Sep 2025 19:44:22 +0800 Subject: [PATCH 2093/3206] LoongArch: KVM: Avoid copy_*_user() with lock hold in kvm_eiointc_ctrl_access() Function copy_from_user() and copy_to_user() may sleep because of page fault, and they cannot be called in spin_lock hold context. Here move function calling of copy_from_user() and copy_to_user() before spinlock context in function kvm_eiointc_ctrl_access(). Otherwise there will be possible warning such as: BUG: sleeping function called from invalid context at include/linux/uaccess.h:192 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 6292, name: qemu-system-loo preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 INFO: lockdep is turned off. 
irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last enabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 41 UID: 0 PID: 6292 Comm: qemu-system-loo Tainted: G W 6.17.0-rc3+ #31 PREEMPT(full) Tainted: [W]=WARN Stack : 0000000000000076 0000000000000000 9000000004c28264 9000100092ff4000 9000100092ff7b80 9000100092ff7b88 0000000000000000 9000100092ff7cc8 9000100092ff7cc0 9000100092ff7cc0 9000100092ff7a00 0000000000000001 0000000000000001 9000100092ff7b88 947d2f9216a5e8b9 900010008773d880 00000000ffff8b9f fffffffffffffffe 0000000000000ba1 fffffffffffffffe 000000000000003e 900000000825a15b 000010007ad38000 9000100092ff7ec0 0000000000000000 0000000000000000 9000000006f3ac60 9000000007252000 0000000000000000 00007ff746ff2230 0000000000000053 9000200088a021b0 0000555556c9d190 0000000000000000 9000000004c2827c 000055556cfb5f40 00000000000000b0 0000000000000007 0000000000000007 0000000000071c1d Call Trace: [<9000000004c2827c>] show_stack+0x5c/0x180 [<9000000004c20fac>] dump_stack_lvl+0x94/0xe4 [<9000000004c99c7c>] __might_resched+0x26c/0x290 [<9000000004f68968>] __might_fault+0x20/0x88 [] kvm_eiointc_ctrl_access.isra.0+0x88/0x380 [kvm] [] kvm_device_ioctl+0x194/0x290 [kvm] [<900000000506b0d8>] sys_ioctl+0x388/0x1010 [<90000000063ed210>] do_syscall+0xb0/0x2d8 [<9000000004c25ef8>] handle_syscall+0xb8/0x158 Cc: stable@vger.kernel.org Fixes: 1ad7efa552fd5 ("LoongArch: KVM: Add EIOINTC user mode read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 026b139dcff2de..2b3c5203738a4f 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -426,21 +426,26 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, struct loongarch_eiointc *s = dev->kvm->arch.eiointc; data = (void __user *)attr->addr; - spin_lock_irqsave(&s->lock, flags); switch (type) { case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: + case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE: if (copy_from_user(&val, data, 4)) - ret = -EFAULT; - else { - if (val >= EIOINTC_ROUTE_MAX_VCPUS) - ret = -EINVAL; - else - s->num_cpu = val; - } + return -EFAULT; + break; + default: + break; + } + + spin_lock_irqsave(&s->lock, flags); + switch (type) { + case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: + if (val >= EIOINTC_ROUTE_MAX_VCPUS) + ret = -EINVAL; + else + s->num_cpu = val; break; case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE: - if (copy_from_user(&s->features, data, 4)) - ret = -EFAULT; + s->features = val; if (!(s->features & BIT(EIOINTC_HAS_VIRT_EXTENSION))) s->status |= BIT(EIOINTC_ENABLE); break; From 62f11796a0dfa1a2ef5f50a2d1bc81c81628fb8e Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 18 Sep 2025 19:44:22 +0800 Subject: [PATCH 2094/3206] LoongArch: KVM: Avoid copy_*_user() with lock hold in kvm_eiointc_regs_access() Function copy_from_user() and copy_to_user() may sleep because of page fault, and they cannot be called in spin_lock hold context. Here move function calling of copy_from_user() and copy_to_user() before spinlock context in function kvm_eiointc_ctrl_access(). 
Otherwise there will be possible warning such as: BUG: sleeping function called from invalid context at include/linux/uaccess.h:192 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 6292, name: qemu-system-loo preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 INFO: lockdep is turned off. irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last enabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 41 UID: 0 PID: 6292 Comm: qemu-system-loo Tainted: G W 6.17.0-rc3+ #31 PREEMPT(full) Tainted: [W]=WARN Stack : 0000000000000076 0000000000000000 9000000004c28264 9000100092ff4000 9000100092ff7b80 9000100092ff7b88 0000000000000000 9000100092ff7cc8 9000100092ff7cc0 9000100092ff7cc0 9000100092ff7a00 0000000000000001 0000000000000001 9000100092ff7b88 947d2f9216a5e8b9 900010008773d880 00000000ffff8b9f fffffffffffffffe 0000000000000ba1 fffffffffffffffe 000000000000003e 900000000825a15b 000010007ad38000 9000100092ff7ec0 0000000000000000 0000000000000000 9000000006f3ac60 9000000007252000 0000000000000000 00007ff746ff2230 0000000000000053 9000200088a021b0 0000555556c9d190 0000000000000000 9000000004c2827c 000055556cfb5f40 00000000000000b0 0000000000000007 0000000000000007 0000000000071c1d Call Trace: [<9000000004c2827c>] show_stack+0x5c/0x180 [<9000000004c20fac>] dump_stack_lvl+0x94/0xe4 [<9000000004c99c7c>] __might_resched+0x26c/0x290 [<9000000004f68968>] __might_fault+0x20/0x88 [] kvm_eiointc_regs_access.isra.0+0x88/0x380 [kvm] [] kvm_device_ioctl+0x194/0x290 [kvm] [<900000000506b0d8>] sys_ioctl+0x388/0x1010 [<90000000063ed210>] do_syscall+0xb0/0x2d8 [<9000000004c25ef8>] handle_syscall+0xb8/0x158 Cc: stable@vger.kernel.org Fixes: 1ad7efa552fd5 ("LoongArch: KVM: Add EIOINTC user mode read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 33 ++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 2b3c5203738a4f..6455420c77372e 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -467,19 +467,17 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, static int kvm_eiointc_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, - bool is_write) + bool is_write, int *data) { int addr, cpu, offset, ret = 0; unsigned long flags; void *p = NULL; - void __user *data; struct loongarch_eiointc *s; s = dev->kvm->arch.eiointc; addr = attr->attr; cpu = addr >> 16; addr &= 0xffff; - data = (void __user *)attr->addr; switch (addr) { case EIOINTC_NODETYPE_START ... 
EIOINTC_NODETYPE_END: offset = (addr - EIOINTC_NODETYPE_START) / 4; @@ -518,13 +516,10 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, } spin_lock_irqsave(&s->lock, flags); - if (is_write) { - if (copy_from_user(p, data, 4)) - ret = -EFAULT; - } else { - if (copy_to_user(data, p, 4)) - ret = -EFAULT; - } + if (is_write) + memcpy(p, data, 4); + else + memcpy(data, p, 4); spin_unlock_irqrestore(&s->lock, flags); return ret; @@ -581,9 +576,18 @@ static int kvm_eiointc_sw_status_access(struct kvm_device *dev, static int kvm_eiointc_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { + int ret, data; + switch (attr->group) { case KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS: - return kvm_eiointc_regs_access(dev, attr, false); + ret = kvm_eiointc_regs_access(dev, attr, false, &data); + if (ret) + return ret; + + if (copy_to_user((void __user *)attr->addr, &data, 4)) + ret = -EFAULT; + + return ret; case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: return kvm_eiointc_sw_status_access(dev, attr, false); default: @@ -594,11 +598,16 @@ static int kvm_eiointc_get_attr(struct kvm_device *dev, static int kvm_eiointc_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { + int data; + switch (attr->group) { case KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL: return kvm_eiointc_ctrl_access(dev, attr); case KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS: - return kvm_eiointc_regs_access(dev, attr, true); + if (copy_from_user(&data, (void __user *)attr->addr, 4)) + return -EFAULT; + + return kvm_eiointc_regs_access(dev, attr, true, &data); case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: return kvm_eiointc_sw_status_access(dev, attr, true); default: From 01a8e68396a6d51f5ba92021ad1a4b8eaabdd0e7 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 18 Sep 2025 19:44:22 +0800 Subject: [PATCH 2095/3206] LoongArch: KVM: Avoid copy_*_user() with lock hold in kvm_eiointc_sw_status_access() Function copy_from_user() and copy_to_user() may sleep because of page fault, and they cannot be called in spin_lock hold context. Here move funtcion calling of copy_from_user() and copy_to_user() out of function kvm_eiointc_sw_status_access(). Otherwise there will be possible warning such as: BUG: sleeping function called from invalid context at include/linux/uaccess.h:192 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 6292, name: qemu-system-loo preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 INFO: lockdep is turned off. 
irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last enabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 41 UID: 0 PID: 6292 Comm: qemu-system-loo Tainted: G W 6.17.0-rc3+ #31 PREEMPT(full) Tainted: [W]=WARN Stack : 0000000000000076 0000000000000000 9000000004c28264 9000100092ff4000 9000100092ff7b80 9000100092ff7b88 0000000000000000 9000100092ff7cc8 9000100092ff7cc0 9000100092ff7cc0 9000100092ff7a00 0000000000000001 0000000000000001 9000100092ff7b88 947d2f9216a5e8b9 900010008773d880 00000000ffff8b9f fffffffffffffffe 0000000000000ba1 fffffffffffffffe 000000000000003e 900000000825a15b 000010007ad38000 9000100092ff7ec0 0000000000000000 0000000000000000 9000000006f3ac60 9000000007252000 0000000000000000 00007ff746ff2230 0000000000000053 9000200088a021b0 0000555556c9d190 0000000000000000 9000000004c2827c 000055556cfb5f40 00000000000000b0 0000000000000007 0000000000000007 0000000000071c1d Call Trace: [<9000000004c2827c>] show_stack+0x5c/0x180 [<9000000004c20fac>] dump_stack_lvl+0x94/0xe4 [<9000000004c99c7c>] __might_resched+0x26c/0x290 [<9000000004f68968>] __might_fault+0x20/0x88 [] kvm_eiointc_sw_status_access.isra.0+0x88/0x380 [kvm] [] kvm_device_ioctl+0x194/0x290 [kvm] [<900000000506b0d8>] sys_ioctl+0x388/0x1010 [<90000000063ed210>] do_syscall+0xb0/0x2d8 [<9000000004c25ef8>] handle_syscall+0xb8/0x158 Cc: stable@vger.kernel.org Fixes: 1ad7efa552fd5 ("LoongArch: KVM: Add EIOINTC user mode read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 6455420c77372e..c3233369538184 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -527,19 +527,17 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, static int kvm_eiointc_sw_status_access(struct kvm_device *dev, struct kvm_device_attr *attr, - bool is_write) + bool is_write, int *data) { int addr, ret = 0; unsigned long flags; void *p = NULL; - void __user *data; struct loongarch_eiointc *s; s = dev->kvm->arch.eiointc; addr = attr->attr; addr &= 0xffff; - data = (void __user *)attr->addr; switch (addr) { case KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU: if (is_write) @@ -561,13 +559,10 @@ static int kvm_eiointc_sw_status_access(struct kvm_device *dev, return -EINVAL; } spin_lock_irqsave(&s->lock, flags); - if (is_write) { - if (copy_from_user(p, data, 4)) - ret = -EFAULT; - } else { - if (copy_to_user(data, p, 4)) - ret = -EFAULT; - } + if (is_write) + memcpy(p, data, 4); + else + memcpy(data, p, 4); spin_unlock_irqrestore(&s->lock, flags); return ret; @@ -589,7 +584,14 @@ static int kvm_eiointc_get_attr(struct kvm_device *dev, return ret; case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: - return kvm_eiointc_sw_status_access(dev, attr, false); + ret = kvm_eiointc_sw_status_access(dev, attr, false, &data); + if (ret) + return ret; + + if (copy_to_user((void __user *)attr->addr, &data, 4)) + ret = -EFAULT; + + return ret; default: return -EINVAL; } @@ -609,7 +611,10 @@ static int kvm_eiointc_set_attr(struct kvm_device *dev, return kvm_eiointc_regs_access(dev, attr, true, &data); case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: - return kvm_eiointc_sw_status_access(dev, attr, true); + if 
(copy_from_user(&data, (void __user *)attr->addr, 4)) + return -EFAULT; + + return kvm_eiointc_sw_status_access(dev, attr, true, &data); default: return -EINVAL; } From 8dc5245673cf7f33743e5c0d2a4207c0b8df3067 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 18 Sep 2025 19:44:25 +0800 Subject: [PATCH 2096/3206] LoongArch: KVM: Avoid copy_*_user() with lock hold in kvm_pch_pic_regs_access() Function copy_from_user() and copy_to_user() may sleep because of page fault, and they cannot be called in spin_lock hold context. Here move function calling of copy_from_user() and copy_to_user() out of spinlock context in function kvm_pch_pic_regs_access(). Otherwise there will be possible warning such as: BUG: sleeping function called from invalid context at include/linux/uaccess.h:192 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 6292, name: qemu-system-loo preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 INFO: lockdep is turned off. irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last enabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 41 UID: 0 PID: 6292 Comm: qemu-system-loo Tainted: G W 6.17.0-rc3+ #31 PREEMPT(full) Tainted: [W]=WARN Stack : 0000000000000076 0000000000000000 9000000004c28264 9000100092ff4000 9000100092ff7b80 9000100092ff7b88 0000000000000000 9000100092ff7cc8 9000100092ff7cc0 9000100092ff7cc0 9000100092ff7a00 0000000000000001 0000000000000001 9000100092ff7b88 947d2f9216a5e8b9 900010008773d880 00000000ffff8b9f fffffffffffffffe 0000000000000ba1 fffffffffffffffe 000000000000003e 900000000825a15b 000010007ad38000 9000100092ff7ec0 0000000000000000 0000000000000000 9000000006f3ac60 9000000007252000 0000000000000000 00007ff746ff2230 0000000000000053 9000200088a021b0 0000555556c9d190 0000000000000000 9000000004c2827c 000055556cfb5f40 00000000000000b0 0000000000000007 0000000000000007 0000000000071c1d Call Trace: [<9000000004c2827c>] show_stack+0x5c/0x180 [<9000000004c20fac>] dump_stack_lvl+0x94/0xe4 [<9000000004c99c7c>] __might_resched+0x26c/0x290 [<9000000004f68968>] __might_fault+0x20/0x88 [] kvm_pch_pic_regs_access.isra.0+0x88/0x380 [kvm] [] kvm_device_ioctl+0x194/0x290 [kvm] [<900000000506b0d8>] sys_ioctl+0x388/0x1010 [<90000000063ed210>] do_syscall+0xb0/0x2d8 [<9000000004c25ef8>] handle_syscall+0xb8/0x158 Cc: stable@vger.kernel.org Fixes: d206d95148732 ("LoongArch: KVM: Add PCHPIC user mode read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/pch_pic.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c index 119290bcea79ab..baf3b4faf7ead2 100644 --- a/arch/loongarch/kvm/intc/pch_pic.c +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -348,6 +348,7 @@ static int kvm_pch_pic_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, bool is_write) { + char buf[8]; int addr, offset, len = 8, ret = 0; void __user *data; void *p = NULL; @@ -397,17 +398,23 @@ static int kvm_pch_pic_regs_access(struct kvm_device *dev, return -EINVAL; } - spin_lock(&s->lock); - /* write or read value according to is_write */ if (is_write) { - if (copy_from_user(p, data, len)) - ret = -EFAULT; - } else { - if (copy_to_user(data, p, len)) - ret = -EFAULT; + if (copy_from_user(buf, data, len)) + return -EFAULT; } + + spin_lock(&s->lock); + if 
(is_write) + memcpy(p, buf, len); + else + memcpy(buf, p, len); spin_unlock(&s->lock); + if (!is_write) { + if (copy_to_user(data, buf, len)) + return -EFAULT; + } + return ret; } From dd68f51febbd6eb8a40872724f4c9bcc84442114 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 20 Aug 2025 19:29:03 +0100 Subject: [PATCH 2097/3206] kselftest/arm64: Verify that we reject out of bounds VLs in sve-ptrace We do not currently have a test that asserts that we reject attempts to set a vector length smaller than SVE_VL_MIN or larger than SVE_VL_MAX, add one since that is our current behaviour. Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index 79bcc2369cdb73..0ce841a7bb6ee6 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -66,7 +66,7 @@ static const struct vec_type vec_types[] = { }; #define VL_TESTS (((TEST_VQ_MAX - SVE_VQ_MIN) + 1) * 4) -#define FLAG_TESTS 2 +#define FLAG_TESTS 4 #define FPSIMD_TESTS 2 #define EXPECTED_TESTS ((VL_TESTS + FLAG_TESTS + FPSIMD_TESTS) * ARRAY_SIZE(vec_types)) @@ -286,6 +286,25 @@ static void check_u32(unsigned int vl, const char *reg, } } +/* Set out of range VLs */ +static void ptrace_set_vl_ranges(pid_t child, const struct vec_type *type) +{ + struct user_sve_header sve; + int ret; + + memset(&sve, 0, sizeof(sve)); + sve.flags = SVE_PT_REGS_SVE; + sve.size = sizeof(sve); + + ret = set_sve(child, type, &sve); + ksft_test_result(ret != 0, "%s Set invalid VL 0\n", type->name); + + sve.vl = SVE_VL_MAX + SVE_VQ_BYTES; + ret = set_sve(child, type, &sve); + ksft_test_result(ret != 0, "%s Set invalid VL %d\n", type->name, + SVE_VL_MAX + SVE_VQ_BYTES); +} + /* Access the FPSIMD registers via the SVE regset */ static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type) { @@ -719,6 +738,17 @@ static int do_parent(pid_t child) vec_types[i].name); } + /* Setting out of bounds VLs should fail */ + if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) { + ptrace_set_vl_ranges(child, &vec_types[i]); + } else { + ksft_test_result_skip("%s Set invalid VL 0\n", + vec_types[i].name); + ksft_test_result_skip("%s Set invalid VL %d\n", + vec_types[i].name, + SVE_VL_MAX + SVE_VQ_BYTES); + } + /* Step through every possible VQ */ for (vq = SVE_VQ_MIN; vq <= TEST_VQ_MAX; vq++) { vl = sve_vl_from_vq(vq); From 09b5febf84262a303ecedf0821e03b8d8492a38b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 20 Aug 2025 19:29:04 +0100 Subject: [PATCH 2098/3206] kselftest/arm64: Check that unsupported regsets fail in sve-ptrace Add a test which verifies that NT_ARM_SVE and NT_ARM_SSVE reads and writes are rejected as expected when the relevant architecture feature is not supported. 
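As a rough userspace illustration of what such a test asserts, consider
the hypothetical arm64-only snippet below; it is not the selftest code,
and the exact errno is deliberately left open since only failure matters:

    #include <errno.h>
    #include <elf.h>          /* NT_ARM_SVE */
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/uio.h>
    #include <asm/ptrace.h>   /* struct user_sve_header */

    /* Returns nonzero when the regset read fails, as it should on a
     * CPU without the corresponding hwcap. */
    static int unsupported_read_fails(pid_t child, int regset)
    {
            struct user_sve_header hdr;
            struct iovec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };

            errno = 0;
            return ptrace(PTRACE_GETREGSET, child, regset, &iov) != 0;
    }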
Signed-off-by: Mark Brown
Signed-off-by: Will Deacon
---
 tools/testing/selftests/arm64/fp/sve-ptrace.c | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index 0ce841a7bb6ee6..e0fc3a001e2830 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -174,6 +174,38 @@ static int set_sve(pid_t pid, const struct vec_type *type,
 	return ret;
 }

+/* A read operation fails */
+static void read_fails(pid_t child, const struct vec_type *type)
+{
+	struct user_sve_header *new_sve = NULL;
+	size_t new_sve_size = 0;
+	void *ret;
+
+	ret = get_sve(child, type, (void **)&new_sve, &new_sve_size);
+
+	ksft_test_result(ret == NULL, "%s unsupported read fails\n",
+			 type->name);
+
+	free(new_sve);
+}
+
+/* A write operation fails */
+static void write_fails(pid_t child, const struct vec_type *type)
+{
+	struct user_sve_header sve;
+	int ret;
+
+	/* Just the header, no data */
+	memset(&sve, 0, sizeof(sve));
+	sve.size = sizeof(sve);
+	sve.flags = SVE_PT_REGS_SVE;
+	sve.vl = SVE_VL_MIN;
+	ret = set_sve(child, type, &sve);
+
+	ksft_test_result(ret != 0, "%s unsupported write fails\n",
+			 type->name);
+}
+
 /* Validate setting and getting the inherit flag */
 static void ptrace_set_get_inherit(pid_t child, const struct vec_type *type)
 {
@@ -718,6 +750,20 @@ static int do_parent(pid_t child)
 	}

 	for (i = 0; i < ARRAY_SIZE(vec_types); i++) {
+		/*
+		 * If the vector type isn't supported reads and writes
+		 * should fail.
+		 */
+		if (!(getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap)) {
+			read_fails(child, &vec_types[i]);
+			write_fails(child, &vec_types[i]);
+		} else {
+			ksft_test_result_skip("%s unsupported read fails\n",
+					      vec_types[i].name);
+			ksft_test_result_skip("%s unsupported write fails\n",
+					      vec_types[i].name);
+		}
+
 		/* FPSIMD via SVE regset */
 		if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) {
 			ptrace_sve_fpsimd(child, &vec_types[i]);

From a9f859b516ac98c06b0d24e691fceab32a9665d5 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Tue, 16 Sep 2025 15:48:00 +0200
Subject: [PATCH 2099/3206] s390/bitops: Limit return value range of __flogr()

With the recent ffs() and fls64() optimization a logical AND operation
was removed; that AND had allowed the compiler to determine the return
value range of both functions. Its removal may lead to compile warnings,
as reported by the kernel test robot:

drivers/infiniband/hw/mlx5/mr.c: In function 'mlx5r_cache_create_ent_locked':
>> drivers/infiniband/hw/mlx5/mr.c:840:31: warning: 'sprintf' may write a terminating nul past the end of the destination [-Wformat-overflow=]
   840 |         sprintf(ent->name, "%d", order);
       |                               ^
In function 'mlx5_mkey_cache_debugfs_add_ent',
    inlined from 'mlx5r_cache_create_ent_locked' at drivers/infiniband/hw/mlx5/mr.c:930:3:
drivers/infiniband/hw/mlx5/mr.c:840:9: note: 'sprintf' output between 2 and 5 bytes into a destination of size 4
   840 |         sprintf(ent->name, "%d", order);
       |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Add the AND operation again to address the warning. From a correctness
point of view the AND operation is not necessary; however, there is no
other way to tell the compiler that the returned value of the flogr
inline assembly is in the range of 0..64.

This increases the kernel image size by 566 bytes (defconfig, gcc 15.2.0).
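The effect is easy to reproduce outside the kernel; a generic userspace
sketch (not the mlx5 code — the char[4] member mirrors the destination
in the warning above):

    #include <stdio.h>

    struct ent { char name[4]; };

    void name_entry(struct ent *ent, int order)
    {
            /*
             * With an unconstrained 'order', "%d" may produce up to 11
             * characters, and gcc's -Wformat-overflow warns about the
             * 4-byte destination. Masking bounds the range to 0..127:
             * at most three digits plus the terminating NUL, which fits.
             */
            sprintf(ent->name, "%d", order & 127);
    }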
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508211859.UoYsJbLN-lkp@intel.com/ Fixes: de88e74889a3 ("s390/bitops: Slightly optimize ffs() and fls64()") Suggested-by: Juergen Christ Reviewed-by: Juergen Christ Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 9dfb687ba62009..9bc70acbac9ec4 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -169,7 +169,7 @@ static __always_inline unsigned char __flogr(unsigned long word) asm volatile( " flogr %[rp],%[rp]\n" : [rp] "+d" (rp.pair) : : "cc"); - return rp.even; + return rp.even & 127; } } From f72e2cff13aefe305fc8fc6afe4f43626e4ad88c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Sep 2025 15:48:01 +0200 Subject: [PATCH 2100/3206] compiler_types: Add __assume macro Make the statement attribute "assume" with a new __assume macro available. The assume attribute is used to indicate that a certain condition is assumed to be true. Compilers may or may not use this indication to generate optimized code. If this condition is violated at runtime, the behavior is undefined. Note that the clang documentation states that optimizers may react differently to this attribute, and this may even have a negative performance impact. Therefore this attribute should be used with care. Signed-off-by: Heiko Carstens Reviewed-by: Nathan Chancellor Signed-off-by: Alexander Gordeev --- include/linux/compiler_types.h | 23 +++++++++++++++++++++++ init/Kconfig | 10 ++++++++++ 2 files changed, 33 insertions(+) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 16755431fc11ee..2f3e80bf9f3574 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -329,6 +329,29 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* + * The assume attribute is used to indicate that a certain condition is + * assumed to be true. If this condition is violated at runtime, the behavior + * is undefined. Compilers may or may not use this indication to generate + * optimized code. + * + * Note that the clang documentation states that optimizers may react + * differently to this attribute, and this may even have a negative + * performance impact. Therefore this attribute should be used with care. 
+ * + * Optional: only supported since gcc >= 13 + * Optional: only supported since clang >= 19 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html#index-assume-statement-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#id13 + * + */ +#ifdef CONFIG_CC_HAS_ASSUME +# define __assume(expr) __attribute__((__assume__(expr))) +#else +# define __assume(expr) +#endif + /* * Optional: only supported since gcc >= 15 * Optional: only supported since clang >= 18 diff --git a/init/Kconfig b/init/Kconfig index 83632025121937..dabec90f18f646 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -112,6 +112,16 @@ config TOOLS_SUPPORT_RELR config CC_HAS_ASM_INLINE def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null) +config CC_HAS_ASSUME + bool + # clang needs to be at least 19.1.0 since the meaning of the assume + # attribute changed: + # https://github.com/llvm/llvm-project/commit/c44fa3e8a9a44c2e9a575768a3c185354b9f6c17 + default y if CC_IS_CLANG && CLANG_VERSION >= 190100 + # supported since gcc 13.1.0 + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106654 + default y if CC_IS_GCC && GCC_VERSION >= 130100 + config CC_HAS_NO_PROFILE_FN_ATTR def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror) From 79161603952c842eb22313f2060051b359b0a592 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Sep 2025 15:48:02 +0200 Subject: [PATCH 2101/3206] s390/bitops: Use __assume() for __flogr() inline assembly return value Use __assume() to tell compilers that the output operand of the __flogr() inline assembly contains a value in the range of 0..64. This allows to optimize the logical AND operation away. This reduces the kernel image size by 2804 bytes (defconfig, gcc 15.2.0). Suggested-by: Juergen Christ Reviewed-by: Juergen Christ Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 9bc70acbac9ec4..ac94672db81701 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -132,9 +132,10 @@ static inline bool test_bit_inv(unsigned long nr, */ static __always_inline unsigned char __flogr(unsigned long word) { - if (__builtin_constant_p(word)) { - unsigned long bit = 0; + unsigned long bit; + if (__builtin_constant_p(word)) { + bit = 0; if (!word) return 64; if (!(word & 0xffffffff00000000UL)) { @@ -169,7 +170,14 @@ static __always_inline unsigned char __flogr(unsigned long word) asm volatile( " flogr %[rp],%[rp]\n" : [rp] "+d" (rp.pair) : : "cc"); - return rp.even & 127; + bit = rp.even; + /* + * The result of the flogr instruction is a value in the range + * of 0..64. Let the compiler know that the AND operation can + * be optimized away. + */ + __assume(bit <= 64); + return bit & 127; } } From f46ccdb87a2573a23ee2d2c21a6b087af9ae76c0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Sep 2025 15:48:03 +0200 Subject: [PATCH 2102/3206] s390/bitops: Cleanup __flogr() The flogr() inline assembly has no side effects and generates the same output if the input does not change. Therefore remove the volatile qualifier to allow the compiler to optimize the inline assembly away, if possible. Also remove the superfluous '\n' which makes the inline assembly appear larger than it is according to compiler heuristics (number of lines). 
Furthermore change the return type of flogr() to unsigned long and add the const attribute to the function. This reduces the kernel image size by 994 bytes (defconfig, gcc 15.2.0). Suggested-by: Juergen Christ Reviewed-by: Juergen Christ Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index ac94672db81701..e643f24ebc05e5 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -130,7 +130,7 @@ static inline bool test_bit_inv(unsigned long nr, * where the most significant bit has bit number 0. * If no bit is set this function returns 64. */ -static __always_inline unsigned char __flogr(unsigned long word) +static __always_inline __attribute_const__ unsigned long __flogr(unsigned long word) { unsigned long bit; @@ -167,9 +167,8 @@ static __always_inline unsigned char __flogr(unsigned long word) union register_pair rp __uninitialized; rp.even = word; - asm volatile( - " flogr %[rp],%[rp]\n" - : [rp] "+d" (rp.pair) : : "cc"); + asm("flogr %[rp],%[rp]" + : [rp] "+d" (rp.pair) : : "cc"); bit = rp.even; /* * The result of the flogr instruction is a value in the range From c2fcb2e79d6f4113e93aabc50158bc97a359d1f5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 18 Sep 2025 09:57:47 +0300 Subject: [PATCH 2103/3206] EDAC/versalnet: Return the correct error in mc_probe() Return -ENOMEM if memory allocation in mc_probe() fails. [ bp: Massage commit message. ] Fixes: d5fe2fec6c40 ("EDAC: Add a driver for the AMD Versal NET DDR controller") Signed-off-by: Dan Carpenter Signed-off-by: Borislav Petkov (AMD) --- drivers/edac/versalnet_edac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c index 66714fffa59182..7c5db8bf0595b6 100644 --- a/drivers/edac/versalnet_edac.c +++ b/drivers/edac/versalnet_edac.c @@ -888,8 +888,10 @@ static int mc_probe(struct platform_device *pdev) } priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); - if (!priv) + if (!priv) { + rc = -ENOMEM; goto err_alloc; + } amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv; From f8cc02321bfca71967db1620d684aa3abab59612 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 21 Aug 2025 19:01:49 +0800 Subject: [PATCH 2104/3206] dt-bindings: perf: fsl-imx-ddr: Add a compatible string fsl,imx94-ddr-pmu for i.MX94 i.MX94 has a DDR Performance Monitor Unit which is compatible with i.MX93. This will add a compatible for i.MX94. Reviewed-by: Peng Fan Reviewed-by: Frank Li Acked-by: Conor Dooley Signed-off-by: Xu Yang Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml index 8597ea625edba5..d2e578d6b83b88 100644 --- a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml +++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml @@ -33,6 +33,7 @@ properties: - items: - enum: - fsl,imx91-ddr-pmu + - fsl,imx94-ddr-pmu - fsl,imx95-ddr-pmu - const: fsl,imx93-ddr-pmu From e4d9e8fb406bef3936aceac85d3f400b1acdbe73 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 21 Aug 2025 19:01:50 +0800 Subject: [PATCH 2105/3206] perf: imx_perf: add support for i.MX94 platform Add compatible string and related devtype for i.MX94 platform. 
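For context, the per-SoC data is typically fetched at probe time through
the OF match table; a generic sketch of the pattern (an assumed probe
shape, not necessarily the driver's exact code):

    #include <linux/of_device.h>
    #include <linux/platform_device.h>

    static int ddr_perf_probe(struct platform_device *pdev)
    {
            const struct imx_ddr_devtype_data *data;

            /* Yields &imx94_devtype_data when the DT node matched
             * "fsl,imx94-ddr-pmu" in imx_ddr_pmu_dt_ids[]. */
            data = of_device_get_match_data(&pdev->dev);
            if (!data)
                    return -ENODEV;

            /* ... use data->identifier and data->filter_ver ... */
            return 0;
    }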
Reviewed-by: Peng Fan
Reviewed-by: Frank Li
Signed-off-by: Xu Yang
Signed-off-by: Will Deacon
---
 drivers/perf/fsl_imx9_ddr_perf.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c
index 267754fdf58146..7050b48c046717 100644
--- a/drivers/perf/fsl_imx9_ddr_perf.c
+++ b/drivers/perf/fsl_imx9_ddr_perf.c
@@ -104,6 +104,11 @@ static const struct imx_ddr_devtype_data imx93_devtype_data = {
 	.filter_ver = DDR_PERF_AXI_FILTER_V1
 };

+static const struct imx_ddr_devtype_data imx94_devtype_data = {
+	.identifier = "imx94",
+	.filter_ver = DDR_PERF_AXI_FILTER_V2
+};
+
 static const struct imx_ddr_devtype_data imx95_devtype_data = {
 	.identifier = "imx95",
 	.filter_ver = DDR_PERF_AXI_FILTER_V2
@@ -122,6 +127,7 @@ static inline bool axi_filter_v2(struct ddr_pmu *pmu)
 static const struct of_device_id imx_ddr_pmu_dt_ids[] = {
 	{ .compatible = "fsl,imx91-ddr-pmu", .data = &imx91_devtype_data },
 	{ .compatible = "fsl,imx93-ddr-pmu", .data = &imx93_devtype_data },
+	{ .compatible = "fsl,imx94-ddr-pmu", .data = &imx94_devtype_data },
 	{ .compatible = "fsl,imx95-ddr-pmu", .data = &imx95_devtype_data },
 	{ /* sentinel */ }
 };

From 2c599c68c43e65fa333b222effbeab61cbd35df5 Mon Sep 17 00:00:00 2001
From: Xu Yang
Date: Thu, 21 Aug 2025 19:01:52 +0800
Subject: [PATCH 2106/3206] MAINTAINERS: include fsl_imx9_ddr_perf.c and some perf metric files

The fsl_imx9_ddr_perf.c driver and some perf metric files under
tools/perf/pmu-events/arch/arm64/freescale/ are missing from
MAINTAINERS. Add them and add me as another maintainer.

Reviewed-by: Frank Li
Signed-off-by: Xu Yang
Signed-off-by: Will Deacon
---
 MAINTAINERS | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index fe168477caa457..3815a2c4b3a84c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9744,11 +9744,14 @@ F: drivers/video/fbdev/imxfb.c

 FREESCALE IMX DDR PMU DRIVER
 M: Frank Li
+M: Xu Yang
 L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S: Maintained
 F: Documentation/admin-guide/perf/imx-ddr.rst
 F: Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml
 F: drivers/perf/fsl_imx8_ddr_perf.c
+F: drivers/perf/fsl_imx9_ddr_perf.c
+F: tools/perf/pmu-events/arch/arm64/freescale/

 FREESCALE IMX I2C DRIVER
 M: Oleksij Rempel

From 1e558fb31bec3076500219cc417f477fe10a8463 Mon Sep 17 00:00:00 2001
From: Xichao Zhao
Date: Wed, 13 Aug 2025 16:32:57 +0800
Subject: [PATCH 2107/3206] drivers: perf: use us_to_ktime() where appropriate

Converting arm_ccn_pmu_poll_period_us to a ktime_t is a natural fit for
us_to_ktime(). This makes the code more concise and enhances
readability.
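For reference, the helper in include/linux/ktime.h is defined along
these lines (paraphrased, so treat it as a sketch):

    static inline ktime_t us_to_ktime(u64 us)
    {
            return us * NSEC_PER_USEC;
    }

so the converted call computes exactly the same value as the open-coded
multiplication by 1000.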
Signed-off-by: Xichao Zhao Acked-by: Mark Rutland Signed-off-by: Will Deacon --- drivers/perf/arm-ccn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index 1a0d0e1a226334..8af3563fdf60a3 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -565,7 +565,7 @@ module_param_named(pmu_poll_period_us, arm_ccn_pmu_poll_period_us, uint, static ktime_t arm_ccn_pmu_timer_period(void) { - return ns_to_ktime((u64)arm_ccn_pmu_poll_period_us * 1000); + return us_to_ktime((u64)arm_ccn_pmu_poll_period_us); } From 71396cfac97d0249fa7d8dcc8e649b6ba4c090e4 Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Thu, 28 Aug 2025 15:35:19 -0700 Subject: [PATCH 2108/3206] perf/dwc_pcie: Support counting multiple lane events in parallel While Designware PCIe PMU allows to count only one time based event at a time, it allows to count all the lane events simultaneously. After the patch one is able to count a group of lane events: $ perf stat -e '{dwc_rootport/tx_memory_write,lane=1/,dwc_rootport/rx_memory_read,lane=0/}' dd if=/dev/nvme0n1 of=/dev/null bs=1M count=1 Earlier the events wouldn't have been counted successfully. Signed-off-by: Ilkka Koskinen Signed-off-by: Will Deacon --- .../admin-guide/perf/dwc_pcie_pmu.rst | 4 +- drivers/perf/dwc_pcie_pmu.c | 161 ++++++++++++++---- 2 files changed, 132 insertions(+), 33 deletions(-) diff --git a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst index cb376f335f402c..167f9281fbf57e 100644 --- a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst +++ b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst @@ -16,8 +16,8 @@ provides the following two features: - one 64-bit counter for Time Based Analysis (RX/TX data throughput and time spent in each low-power LTSSM state) and -- one 32-bit counter for Event Counting (error and non-error events for - a specified lane) +- one 32-bit counter per event for Event Counting (error and non-error + events for a specified lane) Note: There is no interrupt for counter overflow. 
diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 146ff57813fb16..d77f767cde8934 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -39,6 +39,10 @@ #define DWC_PCIE_EVENT_CLEAR GENMASK(1, 0) #define DWC_PCIE_EVENT_PER_CLEAR 0x1 +/* Event Selection Field has two subfields */ +#define DWC_PCIE_CNT_EVENT_SEL_GROUP GENMASK(11, 8) +#define DWC_PCIE_CNT_EVENT_SEL_EVID GENMASK(7, 0) + #define DWC_PCIE_EVENT_CNT_DATA 0xC #define DWC_PCIE_TIME_BASED_ANAL_CTL 0x10 @@ -73,6 +77,10 @@ enum dwc_pcie_event_type { DWC_PCIE_EVENT_TYPE_MAX, }; +#define DWC_PCIE_LANE_GROUP_6 6 +#define DWC_PCIE_LANE_GROUP_7 7 +#define DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP 256 + #define DWC_PCIE_LANE_EVENT_MAX_PERIOD GENMASK_ULL(31, 0) #define DWC_PCIE_MAX_PERIOD GENMASK_ULL(63, 0) @@ -82,8 +90,11 @@ struct dwc_pcie_pmu { u16 ras_des_offset; u32 nr_lanes; + /* Groups #6 and #7 */ + DECLARE_BITMAP(lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP); + struct perf_event *time_based_event; + struct hlist_node cpuhp_node; - struct perf_event *event[DWC_PCIE_EVENT_TYPE_MAX]; int on_cpu; }; @@ -246,19 +257,26 @@ static const struct attribute_group *dwc_pcie_attr_groups[] = { }; static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu, + struct perf_event *event, bool enable) { struct pci_dev *pdev = pcie_pmu->pdev; u16 ras_des_offset = pcie_pmu->ras_des_offset; + int event_id = DWC_PCIE_EVENT_ID(event); + int lane = DWC_PCIE_EVENT_LANE(event); + u32 ctrl; + + ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) | + FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) | + FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR); if (enable) - pci_clear_and_set_config_dword(pdev, - ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, - DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON); + ctrl |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON); else - pci_clear_and_set_config_dword(pdev, - ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, - DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF); + ctrl |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF); + + pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + ctrl); } static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu, @@ -276,11 +294,22 @@ static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event) { struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); struct pci_dev *pdev = pcie_pmu->pdev; + int event_id = DWC_PCIE_EVENT_ID(event); + int lane = DWC_PCIE_EVENT_LANE(event); u16 ras_des_offset = pcie_pmu->ras_des_offset; - u32 val; + u32 val, ctrl; + ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) | + FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) | + FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON); + pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + ctrl); pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val); + ctrl |= FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR); + pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + ctrl); + return val; } @@ -329,26 +358,77 @@ static void dwc_pcie_pmu_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event); - u64 delta, prev, now = 0; + u64 delta, prev, now; + + if (type == DWC_PCIE_LANE_EVENT) { + now = dwc_pcie_pmu_read_lane_event_counter(event) & + DWC_PCIE_LANE_EVENT_MAX_PERIOD; + local64_add(now, &event->count); + return; + } do { prev = local64_read(&hwc->prev_count); - - if 
(type == DWC_PCIE_LANE_EVENT) - now = dwc_pcie_pmu_read_lane_event_counter(event); - else if (type == DWC_PCIE_TIME_BASE_EVENT) - now = dwc_pcie_pmu_read_time_based_counter(event); + now = dwc_pcie_pmu_read_time_based_counter(event); } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); delta = (now - prev) & DWC_PCIE_MAX_PERIOD; - /* 32-bit counter for Lane Event Counting */ - if (type == DWC_PCIE_LANE_EVENT) - delta &= DWC_PCIE_LANE_EVENT_MAX_PERIOD; - local64_add(delta, &event->count); } +static int dwc_pcie_pmu_validate_add_lane_event(struct perf_event *event, + unsigned long val_lane_events[]) +{ + int event_id, event_nr, group; + + event_id = DWC_PCIE_EVENT_ID(event); + event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id); + group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id); + + if (group != DWC_PCIE_LANE_GROUP_6 && group != DWC_PCIE_LANE_GROUP_7) + return -EINVAL; + + group -= DWC_PCIE_LANE_GROUP_6; + + if (test_and_set_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr, + val_lane_events)) + return -EINVAL; + + return 0; +} + +static int dwc_pcie_pmu_validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + DECLARE_BITMAP(val_lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP); + bool time_event; + int type; + + type = DWC_PCIE_EVENT_TYPE(leader); + if (type == DWC_PCIE_TIME_BASE_EVENT) + time_event = true; + else + if (dwc_pcie_pmu_validate_add_lane_event(leader, val_lane_events)) + return -ENOSPC; + + for_each_sibling_event(sibling, leader) { + type = DWC_PCIE_EVENT_TYPE(sibling); + if (type == DWC_PCIE_TIME_BASE_EVENT) { + if (time_event) + return -ENOSPC; + + time_event = true; + continue; + } + + if (dwc_pcie_pmu_validate_add_lane_event(sibling, val_lane_events)) + return -ENOSPC; + } + + return 0; +} + static int dwc_pcie_pmu_event_init(struct perf_event *event) { struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); @@ -367,10 +447,6 @@ static int dwc_pcie_pmu_event_init(struct perf_event *event) if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) return -EINVAL; - if (event->group_leader != event && - !is_software_event(event->group_leader)) - return -EINVAL; - for_each_sibling_event(sibling, event->group_leader) { if (sibling->pmu != event->pmu && !is_software_event(sibling)) return -EINVAL; @@ -385,6 +461,9 @@ static int dwc_pcie_pmu_event_init(struct perf_event *event) return -EINVAL; } + if (dwc_pcie_pmu_validate_group(event)) + return -ENOSPC; + event->cpu = pcie_pmu->on_cpu; return 0; @@ -400,7 +479,7 @@ static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags) local64_set(&hwc->prev_count, 0); if (type == DWC_PCIE_LANE_EVENT) - dwc_pcie_pmu_lane_event_enable(pcie_pmu, true); + dwc_pcie_pmu_lane_event_enable(pcie_pmu, event, true); else if (type == DWC_PCIE_TIME_BASE_EVENT) dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true); } @@ -414,12 +493,13 @@ static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags) if (event->hw.state & PERF_HES_STOPPED) return; + dwc_pcie_pmu_event_update(event); + if (type == DWC_PCIE_LANE_EVENT) - dwc_pcie_pmu_lane_event_enable(pcie_pmu, false); + dwc_pcie_pmu_lane_event_enable(pcie_pmu, event, false); else if (type == DWC_PCIE_TIME_BASE_EVENT) dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false); - dwc_pcie_pmu_event_update(event); hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; } @@ -434,14 +514,17 @@ static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags) u16 ras_des_offset = 
pcie_pmu->ras_des_offset; u32 ctrl; - /* one counter for each type and it is in use */ - if (pcie_pmu->event[type]) - return -ENOSPC; - - pcie_pmu->event[type] = event; hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; if (type == DWC_PCIE_LANE_EVENT) { + int event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id); + int group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id) - + DWC_PCIE_LANE_GROUP_6; + + if (test_and_set_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr, + pcie_pmu->lane_events)) + return -ENOSPC; + /* EVENT_COUNTER_DATA_REG needs clear manually */ ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) | FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) | @@ -450,6 +533,11 @@ static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags) pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, ctrl); } else if (type == DWC_PCIE_TIME_BASE_EVENT) { + if (pcie_pmu->time_based_event) + return -ENOSPC; + + pcie_pmu->time_based_event = event; + /* * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely * use it with any manually controlled duration. And it is @@ -478,7 +566,18 @@ static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags) dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE); perf_event_update_userpage(event); - pcie_pmu->event[type] = NULL; + + if (type == DWC_PCIE_TIME_BASE_EVENT) { + pcie_pmu->time_based_event = NULL; + } else { + int event_id = DWC_PCIE_EVENT_ID(event); + int event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id); + int group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id) - + DWC_PCIE_LANE_GROUP_6; + + clear_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr, + pcie_pmu->lane_events); + } } static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node) From a7005ff2d0a5dfb15ba7152f4fb325ad9e00a472 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 1 Sep 2025 13:40:30 +0100 Subject: [PATCH 2109/3206] arm64: sysreg: Add new PMSFCR_EL1 fields and PMSDSFR_EL1 register Add new fields and register that are introduced for the features FEAT_SPE_EFT (extended filtering) and FEAT_SPE_FDS (data source filtering). Tested-by: Leo Yan Reviewed-by: Anshuman Khandual Acked-by: Will Deacon Signed-off-by: James Clark Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 696ab1f32a6749..b743fc8ffe5de4 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2994,11 +2994,20 @@ Field 0 RND EndSysreg Sysreg PMSFCR_EL1 3 0 9 9 4 -Res0 63:19 +Res0 63:53 +Field 52 SIMDm +Field 51 FPm +Field 50 STm +Field 49 LDm +Field 48 Bm +Res0 47:21 +Field 20 SIMD +Field 19 FP Field 18 ST Field 17 LD Field 16 B -Res0 15:4 +Res0 15:5 +Field 4 FDS Field 3 FnE Field 2 FL Field 1 FT From b4401403afb992844fa47513ac9c94520722c43d Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 1 Sep 2025 13:40:31 +0100 Subject: [PATCH 2110/3206] perf: arm_spe: Support FEAT_SPEv1p4 filters FEAT_SPEv1p4 (optional from Armv8.8) adds some new filter bits and also makes some previously available bits unavailable again e.g: E[30], bit [30] When FEAT_SPEv1p4 is _not_ implemented ... Continuing to hard code the valid filter bits for each version isn't scalable, and it also doesn't work for filter bits that aren't related to SPE version. For example most bits have a further condition: E[15], bit [15] When ... 
and filtering on event 15 is supported: Whether "filtering on event 15" is implemented or not is only discoverable from the TRM of that specific CPU or by probing PMSEVFR_EL1. Instead of hard coding them, write all 1s to the PMSEVFR_EL1 register and read it back to discover the RES0 bits. Unsupported bits are RAZ/WI so should read as 0s. For any hardware that doesn't strictly follow RAZ/WI for unsupported filters: Any bits that should have been supported in a specific SPE version but now incorrectly appear to be RES0 wouldn't have worked anyway, so it's better to fail to open events that request them rather than behaving unexpectedly. Bits that aren't implemented but also aren't RAZ/WI will be incorrectly reported as supported, but allowing them to be used is harmless. Testing on N1SDP shows the probed RES0 bits to be the same as the hard coded ones. The FVP with SPEv1p4 shows only additional new RES0 bits, i.e. no previously hard coded RES0 bits are missing. Tested-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Will Deacon --- arch/arm64/include/asm/sysreg.h | 9 --------- drivers/perf/arm_spe_pmu.c | 23 +++++++---------------- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d5b5f2ae1afaaa..20cbd9860c8ff7 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -344,15 +344,6 @@ #define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56) #define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52)) -/*** Statistical Profiling Extension ***/ -#define PMSEVFR_EL1_RES0_IMP \ - (GENMASK_ULL(47, 32) | GENMASK_ULL(23, 16) | GENMASK_ULL(11, 8) |\ - BIT_ULL(6) | BIT_ULL(4) | BIT_ULL(2) | BIT_ULL(0)) -#define PMSEVFR_EL1_RES0_V1P1 \ - (PMSEVFR_EL1_RES0_IMP & ~(BIT_ULL(18) | BIT_ULL(17) | BIT_ULL(11))) -#define PMSEVFR_EL1_RES0_V1P2 \ - (PMSEVFR_EL1_RES0_V1P1 & ~BIT_ULL(6)) - /* Buffer error reporting */ #define PMBSR_EL1_FAULT_FSC_SHIFT PMBSR_EL1_MSS_SHIFT #define PMBSR_EL1_FAULT_FSC_MASK PMBSR_EL1_MSS_MASK diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 369e77ad5f13ff..86c9948ab5a064 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -89,6 +89,7 @@ struct arm_spe_pmu { #define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) u64 features; + u64 pmsevfr_res0; u16 max_record_sz; u16 align; struct perf_output_handle __percpu *handle; @@ -697,20 +698,6 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) return IRQ_HANDLED; } -static u64 arm_spe_pmsevfr_res0(u16 pmsver) -{ - switch (pmsver) { - case ID_AA64DFR0_EL1_PMSVer_IMP: - return PMSEVFR_EL1_RES0_IMP; - case ID_AA64DFR0_EL1_PMSVer_V1P1: - return PMSEVFR_EL1_RES0_V1P1; - case ID_AA64DFR0_EL1_PMSVer_V1P2: - /* Return the highest version we support in default */ - default: - return PMSEVFR_EL1_RES0_V1P2; - } -} - /* Perf callbacks */ static int arm_spe_pmu_event_init(struct perf_event *event) { @@ -726,10 +713,10 @@ static int arm_spe_pmu_event_init(struct perf_event *event) !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus)) return -ENOENT; - if (arm_spe_event_to_pmsevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver)) + if (arm_spe_event_to_pmsevfr(event) & spe_pmu->pmsevfr_res0) return -EOPNOTSUPP; - if (arm_spe_event_to_pmsnevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver)) + if (arm_spe_event_to_pmsnevfr(event) & spe_pmu->pmsevfr_res0) return -EOPNOTSUPP; if (attr->exclude_idle) @@ -1107,6 +1094,10 @@ static void __arm_spe_pmu_dev_probe(void *info) spe_pmu->counter_sz = 
16; } + /* Write all 1s and then read back. Unsupported filter bits are RAZ/WI. */ + write_sysreg_s(U64_MAX, SYS_PMSEVFR_EL1); + spe_pmu->pmsevfr_res0 = ~read_sysreg_s(SYS_PMSEVFR_EL1); + dev_info(dev, "probed SPEv1.%d for CPUs %*pbl [max_record_sz %u, align %u, features 0x%llx]\n", spe_pmu->pmsver - 1, cpumask_pr_args(&spe_pmu->supported_cpus), From 51b9f16697cda2229aacc10bbb0216b6900cbca1 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 1 Sep 2025 13:40:32 +0100 Subject: [PATCH 2111/3206] perf: arm_spe: Expose event filter Expose an "event_filter" entry in the caps folder to inform user space about which events can be filtered. Change the return type of arm_spe_pmu_cap_get() from u32 to u64 to accommodate the added event filter entry. Signed-off-by: Leo Yan Tested-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 86c9948ab5a064..ba55bc3db70842 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -116,6 +116,7 @@ enum arm_spe_pmu_capabilities { SPE_PMU_CAP_FEAT_MAX, SPE_PMU_CAP_CNT_SZ = SPE_PMU_CAP_FEAT_MAX, SPE_PMU_CAP_MIN_IVAL, + SPE_PMU_CAP_EVENT_FILTER, }; static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = { @@ -123,7 +124,7 @@ static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = { [SPE_PMU_CAP_ERND] = SPE_PMU_FEAT_ERND, }; -static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap) +static u64 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap) { if (cap < SPE_PMU_CAP_FEAT_MAX) return !!(spe_pmu->features & arm_spe_pmu_feat_caps[cap]); @@ -133,6 +134,8 @@ static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap) return spe_pmu->counter_sz; case SPE_PMU_CAP_MIN_IVAL: return spe_pmu->min_period; + case SPE_PMU_CAP_EVENT_FILTER: + return ~spe_pmu->pmsevfr_res0; default: WARN(1, "unknown cap %d\n", cap); } @@ -149,7 +152,19 @@ static ssize_t arm_spe_pmu_cap_show(struct device *dev, container_of(attr, struct dev_ext_attribute, attr); int cap = (long)ea->var; - return sysfs_emit(buf, "%u\n", arm_spe_pmu_cap_get(spe_pmu, cap)); + return sysfs_emit(buf, "%llu\n", arm_spe_pmu_cap_get(spe_pmu, cap)); +} + +static ssize_t arm_spe_pmu_cap_show_hex(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct arm_spe_pmu *spe_pmu = dev_get_drvdata(dev); + struct dev_ext_attribute *ea = + container_of(attr, struct dev_ext_attribute, attr); + int cap = (long)ea->var; + + return sysfs_emit(buf, "0x%llx\n", arm_spe_pmu_cap_get(spe_pmu, cap)); } #define SPE_EXT_ATTR_ENTRY(_name, _func, _var) \ @@ -159,12 +174,15 @@ static ssize_t arm_spe_pmu_cap_show(struct device *dev, #define SPE_CAP_EXT_ATTR_ENTRY(_name, _var) \ SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show, _var) +#define SPE_CAP_EXT_ATTR_ENTRY_HEX(_name, _var) \ + SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show_hex, _var) static struct attribute *arm_spe_pmu_cap_attr[] = { SPE_CAP_EXT_ATTR_ENTRY(arch_inst, SPE_PMU_CAP_ARCH_INST), SPE_CAP_EXT_ATTR_ENTRY(ernd, SPE_PMU_CAP_ERND), SPE_CAP_EXT_ATTR_ENTRY(count_size, SPE_PMU_CAP_CNT_SZ), SPE_CAP_EXT_ATTR_ENTRY(min_interval, SPE_PMU_CAP_MIN_IVAL), + SPE_CAP_EXT_ATTR_ENTRY_HEX(event_filter, SPE_PMU_CAP_EVENT_FILTER), NULL, }; From dad9603c5ea308a7b26af66ff4824dccd438af5d Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 1 Sep 2025 13:40:33 +0100 Subject: [PATCH 2112/3206] perf: arm_spe: Add support for FEAT_SPE_EFT extended 
filtering FEAT_SPE_EFT (optional from Armv9.4) adds mask bits for the existing load, store and branch filters. It also adds two new filter bits for SIMD and floating point with their own associated mask bits. The current filters only allow OR filtering on samples that are load OR store etc., and the new mask bits allow setting part of the filter to an AND, for example filtering samples that are store AND SIMD. With mask bits set to 0, the OR behavior is preserved, so unless any masks are explicitly set, the old filters behave the same. Add them all and make them behave the same way as the existing format bits: hidden, and returning EOPNOTSUPP if set, when the feature doesn't exist. Reviewed-by: Leo Yan Tested-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 66 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index ba55bc3db70842..591f72fa032760 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -86,6 +86,7 @@ struct arm_spe_pmu { #define SPE_PMU_FEAT_ERND (1UL << 5) #define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) #define SPE_PMU_FEAT_DISCARD (1UL << 7) +#define SPE_PMU_FEAT_EFT (1UL << 8) #define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) u64 features; @@ -216,6 +217,27 @@ static const struct attribute_group arm_spe_pmu_cap_group = { #define ATTR_CFG_FLD_discard_CFG config /* PMBLIMITR_EL1.FM = DISCARD */ #define ATTR_CFG_FLD_discard_LO 35 #define ATTR_CFG_FLD_discard_HI 35 +#define ATTR_CFG_FLD_branch_filter_mask_CFG config /* PMSFCR_EL1.Bm */ +#define ATTR_CFG_FLD_branch_filter_mask_LO 36 +#define ATTR_CFG_FLD_branch_filter_mask_HI 36 +#define ATTR_CFG_FLD_load_filter_mask_CFG config /* PMSFCR_EL1.LDm */ +#define ATTR_CFG_FLD_load_filter_mask_LO 37 +#define ATTR_CFG_FLD_load_filter_mask_HI 37 +#define ATTR_CFG_FLD_store_filter_mask_CFG config /* PMSFCR_EL1.STm */ +#define ATTR_CFG_FLD_store_filter_mask_LO 38 +#define ATTR_CFG_FLD_store_filter_mask_HI 38 +#define ATTR_CFG_FLD_simd_filter_CFG config /* PMSFCR_EL1.SIMD */ +#define ATTR_CFG_FLD_simd_filter_LO 39 +#define ATTR_CFG_FLD_simd_filter_HI 39 +#define ATTR_CFG_FLD_simd_filter_mask_CFG config /* PMSFCR_EL1.SIMDm */ +#define ATTR_CFG_FLD_simd_filter_mask_LO 40 +#define ATTR_CFG_FLD_simd_filter_mask_HI 40 +#define ATTR_CFG_FLD_float_filter_CFG config /* PMSFCR_EL1.FP */ +#define ATTR_CFG_FLD_float_filter_LO 41 +#define ATTR_CFG_FLD_float_filter_HI 41 +#define ATTR_CFG_FLD_float_filter_mask_CFG config /* PMSFCR_EL1.FPm */ +#define ATTR_CFG_FLD_float_filter_mask_LO 42 +#define ATTR_CFG_FLD_float_filter_mask_HI 42 #define ATTR_CFG_FLD_event_filter_CFG config1 /* PMSEVFR_EL1 */ #define ATTR_CFG_FLD_event_filter_LO 0 @@ -234,8 +256,15 @@ GEN_PMU_FORMAT_ATTR(pa_enable); GEN_PMU_FORMAT_ATTR(pct_enable); GEN_PMU_FORMAT_ATTR(jitter); GEN_PMU_FORMAT_ATTR(branch_filter); +GEN_PMU_FORMAT_ATTR(branch_filter_mask); GEN_PMU_FORMAT_ATTR(load_filter); +GEN_PMU_FORMAT_ATTR(load_filter_mask); GEN_PMU_FORMAT_ATTR(store_filter); +GEN_PMU_FORMAT_ATTR(store_filter_mask); +GEN_PMU_FORMAT_ATTR(simd_filter); +GEN_PMU_FORMAT_ATTR(simd_filter_mask); +GEN_PMU_FORMAT_ATTR(float_filter); +GEN_PMU_FORMAT_ATTR(float_filter_mask); GEN_PMU_FORMAT_ATTR(event_filter); GEN_PMU_FORMAT_ATTR(inv_event_filter); GEN_PMU_FORMAT_ATTR(min_latency); @@ -247,8 +276,15 @@ static struct attribute *arm_spe_pmu_formats_attr[] = { &format_attr_pct_enable.attr, &format_attr_jitter.attr, &format_attr_branch_filter.attr, + 
&format_attr_branch_filter_mask.attr, &format_attr_load_filter.attr, + &format_attr_load_filter_mask.attr, &format_attr_store_filter.attr, + &format_attr_store_filter_mask.attr, + &format_attr_simd_filter.attr, + &format_attr_simd_filter_mask.attr, + &format_attr_float_filter.attr, + &format_attr_float_filter_mask.attr, &format_attr_event_filter.attr, &format_attr_inv_event_filter.attr, &format_attr_min_latency.attr, @@ -269,6 +305,16 @@ static umode_t arm_spe_pmu_format_attr_is_visible(struct kobject *kobj, if (attr == &format_attr_inv_event_filter.attr && !(spe_pmu->features & SPE_PMU_FEAT_INV_FILT_EVT)) return 0; + if ((attr == &format_attr_branch_filter_mask.attr || + attr == &format_attr_load_filter_mask.attr || + attr == &format_attr_store_filter_mask.attr || + attr == &format_attr_simd_filter.attr || + attr == &format_attr_simd_filter_mask.attr || + attr == &format_attr_float_filter.attr || + attr == &format_attr_float_filter_mask.attr) && + !(spe_pmu->features & SPE_PMU_FEAT_EFT)) + return 0; + return attr->mode; } @@ -364,8 +410,15 @@ static u64 arm_spe_event_to_pmsfcr(struct perf_event *event) u64 reg = 0; reg |= FIELD_PREP(PMSFCR_EL1_LD, ATTR_CFG_GET_FLD(attr, load_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_LDm, ATTR_CFG_GET_FLD(attr, load_filter_mask)); reg |= FIELD_PREP(PMSFCR_EL1_ST, ATTR_CFG_GET_FLD(attr, store_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_STm, ATTR_CFG_GET_FLD(attr, store_filter_mask)); reg |= FIELD_PREP(PMSFCR_EL1_B, ATTR_CFG_GET_FLD(attr, branch_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_Bm, ATTR_CFG_GET_FLD(attr, branch_filter_mask)); + reg |= FIELD_PREP(PMSFCR_EL1_SIMD, ATTR_CFG_GET_FLD(attr, simd_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_SIMDm, ATTR_CFG_GET_FLD(attr, simd_filter_mask)); + reg |= FIELD_PREP(PMSFCR_EL1_FP, ATTR_CFG_GET_FLD(attr, float_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_FPm, ATTR_CFG_GET_FLD(attr, float_filter_mask)); if (reg) reg |= PMSFCR_EL1_FT; @@ -767,6 +820,16 @@ static int arm_spe_pmu_event_init(struct perf_event *event) !(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT)) return -EOPNOTSUPP; + if ((FIELD_GET(PMSFCR_EL1_LDm, reg) || + FIELD_GET(PMSFCR_EL1_STm, reg) || + FIELD_GET(PMSFCR_EL1_Bm, reg) || + FIELD_GET(PMSFCR_EL1_SIMD, reg) || + FIELD_GET(PMSFCR_EL1_SIMDm, reg) || + FIELD_GET(PMSFCR_EL1_FP, reg) || + FIELD_GET(PMSFCR_EL1_FPm, reg)) && + !(spe_pmu->features & SPE_PMU_FEAT_EFT)) + return -EOPNOTSUPP; + if (ATTR_CFG_GET_FLD(&event->attr, discard) && !(spe_pmu->features & SPE_PMU_FEAT_DISCARD)) return -EOPNOTSUPP; @@ -1058,6 +1121,9 @@ static void __arm_spe_pmu_dev_probe(void *info) if (spe_pmu->pmsver >= ID_AA64DFR0_EL1_PMSVer_V1P2) spe_pmu->features |= SPE_PMU_FEAT_DISCARD; + if (FIELD_GET(PMSIDR_EL1_EFT, reg)) + spe_pmu->features |= SPE_PMU_FEAT_EFT; + /* This field has a spaced out encoding, so just use a look-up */ fld = FIELD_GET(PMSIDR_EL1_INTERVAL, reg); switch (fld) { From 510a8fa49dc1d18b120e2d3992fa2aff7fc5c46b Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 1 Sep 2025 13:40:34 +0100 Subject: [PATCH 2113/3206] arm64/boot: Factor out a macro to check SPE version We check the version of SPE twice, and we'll add one more check in the next commit so factor out a macro to do this. Change the #3 magic number to the actual SPE version define (V1p2) to make it more readable. No functional changes intended. 
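As an aside for readers less fluent in assembly, here is a minimal C sketch of the check this macro encodes (the helper name is hypothetical and not part of the patch; the register read and field names follow the kernel's sysreg definitions):

	/* Hypothetical C equivalent of the __spe_vers_imp assembly macro. */
	static bool spe_vers_at_least(unsigned int version)
	{
		u64 dfr0 = read_sysreg(id_aa64dfr0_el1);
		unsigned int pmsver = (dfr0 >> ID_AA64DFR0_EL1_PMSVer_SHIFT) & 0xf;

		/* The asm instead branches to skip_label when pmsver < version (b.lt). */
		return pmsver >= version;
	}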
Tested-by: Leo Yan Reviewed-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Will Deacon --- arch/arm64/include/asm/el2_setup.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 46033027510cca..a305386eb2e379 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -91,6 +91,14 @@ msr cntvoff_el2, xzr // Clear virtual offset .endm +/* Branch to skip_label if SPE version is less than given version */ +.macro __spe_vers_imp skip_label, version, tmp + mrs \tmp, id_aa64dfr0_el1 + ubfx \tmp, \tmp, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 + cmp \tmp, \version + b.lt \skip_label +.endm + .macro __init_el2_debug mrs x1, id_aa64dfr0_el1 ubfx x0, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 @@ -103,8 +111,7 @@ csel x2, xzr, x0, eq // all PMU counters from EL1 /* Statistical profiling */ - ubfx x0, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 - cbz x0, .Lskip_spe_\@ // Skip if SPE not present + __spe_vers_imp .Lskip_spe_\@, ID_AA64DFR0_EL1_PMSVer_IMP, x0 // Skip if SPE not present mrs_s x0, SYS_PMBIDR_EL1 // If SPE available at EL2, and x0, x0, #(1 << PMBIDR_EL1_P_SHIFT) @@ -263,10 +270,8 @@ mov x0, xzr mov x2, xzr - mrs x1, id_aa64dfr0_el1 - ubfx x1, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 - cmp x1, #3 - b.lt .Lskip_spe_fgt_\@ + /* If SPEv1p2 is implemented, */ + __spe_vers_imp .Lskip_spe_fgt_\@, #ID_AA64DFR0_EL1_PMSVer_V1P2, x1 /* Disable PMSNEVFR_EL1 read and write traps */ orr x0, x0, #HDFGRTR_EL2_nPMSNEVFR_EL1_MASK orr x2, x2, #HDFGWTR_EL2_nPMSNEVFR_EL1_MASK From 00d7a1af5ab58d89c2f0af27485b2d710c862dfc Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 1 Sep 2025 13:40:35 +0100 Subject: [PATCH 2114/3206] arm64/boot: Enable EL2 requirements for SPE_FEAT_FDS SPE data source filtering (optional from Armv8.8) requires that traps to the filter register PMSDSFR be disabled. Document the requirements and disable the traps if the feature is present. Tested-by: Leo Yan Reviewed-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Will Deacon --- Documentation/arch/arm64/booting.rst | 11 +++++++++++ arch/arm64/include/asm/el2_setup.h | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index 2f666a7c303cdf..e4f953839f7181 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -466,6 +466,17 @@ Before jumping into the kernel, the following conditions must be met: - HDFGWTR2_EL2.nPMICFILTR_EL0 (bit 3) must be initialised to 0b1. - HDFGWTR2_EL2.nPMUACR_EL1 (bit 4) must be initialised to 0b1. + For CPUs with SPE data source filtering (FEAT_SPE_FDS): + + - If EL3 is present: + + - MDCR_EL3.EnPMS3 (bit 42) must be initialised to 0b1. + + - If the kernel is entered at EL1 and EL2 is present: + + - HDFGRTR2_EL2.nPMSDSFR_EL1 (bit 19) must be initialised to 0b1. + - HDFGWTR2_EL2.nPMSDSFR_EL1 (bit 19) must be initialised to 0b1. 
+ For CPUs with Memory Copy and Memory Set instructions (FEAT_MOPS): - If the kernel is entered at EL1 and EL2 is present: diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index a305386eb2e379..b37da3ee852963 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -392,6 +392,17 @@ orr x0, x0, #HDFGRTR2_EL2_nPMICFILTR_EL0 orr x0, x0, #HDFGRTR2_EL2_nPMUACR_EL1 .Lskip_pmuv3p9_\@: + /* If SPE is implemented, */ + __spe_vers_imp .Lskip_spefds_\@, ID_AA64DFR0_EL1_PMSVer_IMP, x1 + /* we can read PMSIDR and */ + mrs_s x1, SYS_PMSIDR_EL1 + and x1, x1, #PMSIDR_EL1_FDS + /* if FEAT_SPE_FDS is implemented, */ + cbz x1, .Lskip_spefds_\@ + /* disable traps of PMSDSFR to EL2. */ + orr x0, x0, #HDFGRTR2_EL2_nPMSDSFR_EL1 + +.Lskip_spefds_\@: msr_s SYS_HDFGRTR2_EL2, x0 msr_s SYS_HDFGWTR2_EL2, x0 msr_s SYS_HFGRTR2_EL2, xzr From f8f89e8cf3d668a40106444276d8c448c114e963 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Wed, 20 Aug 2025 16:45:33 +0800 Subject: [PATCH 2115/3206] perf: arm_pmuv3: Factor out PMCCNTR_EL0 use conditions PMCCNTR_EL0 is preferred for counting CPU_CYCLES under certain conditions. Factor out the condition check to a separate function for further extension. Add documentation for better understanding. No functional changes intended. Reviewed-by: James Clark Acked-by: Mark Rutland Signed-off-by: Yicong Yang Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index f6d7bab5d555c0..69c5cc8f56067c 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -978,6 +978,32 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, return -EAGAIN; } +static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; + + if (evtype != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) + return false; + + /* + * A CPU_CYCLES event with threshold counting cannot use PMCCNTR_EL0 + * since it lacks threshold support. + */ + if (armv8pmu_event_get_threshold(&event->attr)) + return false; + + /* + * PMCCNTR_EL0 is not affected by BRBE controls like BRBCR_ELx.FZP. + * So don't use it for branch events. + */ + if (has_branch_stack(event)) + return false; + + return true; +} + static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, struct perf_event *event) { @@ -986,8 +1012,7 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; /* Always prefer to place a cycle counter into the cycle counter. */ - if ((evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) && - !armv8pmu_event_get_threshold(&event->attr) && !has_branch_stack(event)) { + if (armv8pmu_can_use_pmccntr(cpuc, event)) { if (!test_and_set_bit(ARMV8_PMU_CYCLE_IDX, cpuc->used_mask)) return ARMV8_PMU_CYCLE_IDX; else if (armv8pmu_event_is_64bit(event) && From e31c0eb10388e2af0502b77e61453efbc8a4f974 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 14 Aug 2025 17:16:20 +0800 Subject: [PATCH 2116/3206] drivers/perf: hisi: Add support for HiSilicon NoC PMU Add support for the HiSilicon NoC (Network on Chip) PMU, which is used to monitor events on the system bus. The PMU device will be named after the SCL ID (either Super CPU cluster or Super IO cluster) and the index ID, similar to other HiSilicon Uncore PMUs. 
The following PMU formats are provided in addition to the event: - ch: the transaction channel (data, request, response, etc.), which can be used to filter the counting. - tt_en: tracetag filtering enable. As with other HiSilicon Uncore PMUs, the NoC PMU supports counting only transactions with the tracetag set. The NoC PMU doesn't have an interrupt to indicate overflow. However, it has a 64-bit counter, which is large enough that overflow is practically impossible. Reviewed-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/hisi-pmu.rst | 11 + drivers/perf/hisilicon/Makefile | 3 +- drivers/perf/hisilicon/hisi_uncore_noc_pmu.c | 443 +++++++++++++++++++ 3 files changed, 456 insertions(+), 1 deletion(-) create mode 100644 drivers/perf/hisilicon/hisi_uncore_noc_pmu.c diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst index 48992a0b8e94f7..6f0ea4f641cc23 100644 --- a/Documentation/admin-guide/perf/hisi-pmu.rst +++ b/Documentation/admin-guide/perf/hisi-pmu.rst @@ -112,6 +112,17 @@ uring channel. It is 2 bits. Some important codes are as follows: - 2'b00: default value, count the events which sent to the both uring and uring_ext channel; +6. ch: NoC PMU supports filtering the event counts of certain transaction +channel with this option. The current supported channels are as follows: + +- 3'b010: Request channel +- 3'b100: Snoop channel +- 3'b110: Response channel +- 3'b111: Data channel + +7. tt_en: NoC PMU supports counting only transactions that have tracetag set +if this option is set. See the 2nd list for more information about tracetag. + Users could configure IDs to count data come from specific CCL/ICL, by setting srcid_cmd & srcid_msk, and data desitined for specific CCL/ICL by setting tgtid_cmd & tgtid_msk. A set bit in srcid_msk/tgtid_msk means the PMU will not diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index 48dcc8381ea75d..dcec8f39719d00 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \ hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \ - hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o hisi_uncore_uc_pmu.o + hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o hisi_uncore_uc_pmu.o \ + hisi_uncore_noc_pmu.o obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o obj-$(CONFIG_HNS3_PMU) += hns3_pmu.o diff --git a/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c new file mode 100644 index 00000000000000..de3b9cc7aadad0 --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c @@ -0,0 +1,443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for HiSilicon Uncore NoC (Network on Chip) PMU device + * + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. + * Author: Yicong Yang + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hisi_uncore_pmu.h" + +#define NOC_PMU_VERSION 0x1e00 +#define NOC_PMU_GLOBAL_CTRL 0x1e04 +#define NOC_PMU_GLOBAL_CTRL_PMU_EN BIT(0) +#define NOC_PMU_GLOBAL_CTRL_TT_EN BIT(1) +#define NOC_PMU_CNT_INFO 0x1e08 +#define NOC_PMU_CNT_INFO_OVERFLOW(n) BIT(n) +#define NOC_PMU_EVENT_CTRL0 0x1e20 +#define NOC_PMU_EVENT_CTRL_TYPE GENMASK(4, 0) +/* + * Note channel of 0x0 will reset the counter value, so don't do it before + * we read out the counter. 
+ */ +#define NOC_PMU_EVENT_CTRL_CHANNEL GENMASK(10, 8) +#define NOC_PMU_EVENT_CTRL_EN BIT(11) +#define NOC_PMU_EVENT_COUNTER0 0x1e80 + +#define NOC_PMU_NR_COUNTERS 4 +#define NOC_PMU_CH_DEFAULT 0x7 + +#define NOC_PMU_EVENT_CTRLn(ctrl0, n) ((ctrl0) + 4 * (n)) +#define NOC_PMU_EVENT_CNTRn(cntr0, n) ((cntr0) + 8 * (n)) + +HISI_PMU_EVENT_ATTR_EXTRACTOR(ch, config1, 2, 0); +HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_en, config1, 3, 3); + +/* Dynamic CPU hotplug state used by this PMU driver */ +static enum cpuhp_state hisi_noc_pmu_cpuhp_state; + +struct hisi_noc_pmu_regs { + u32 version; + u32 pmu_ctrl; + u32 event_ctrl0; + u32 event_cntr0; + u32 overflow_status; +}; + +/* + * Tracetag filtering is not per-event, and all events must be kept + * consistent. Return true if the newcomer doesn't match the tracetag + * filtering configuration of the currently scheduled events. + */ +static bool hisi_noc_pmu_check_global_filter(struct perf_event *curr, + struct perf_event *new) +{ + return hisi_get_tt_en(curr) == hisi_get_tt_en(new); +} + +static void hisi_noc_pmu_write_evtype(struct hisi_pmu *noc_pmu, int idx, u32 type) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, idx)); + reg &= ~NOC_PMU_EVENT_CTRL_TYPE; + reg |= FIELD_PREP(NOC_PMU_EVENT_CTRL_TYPE, type); + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, idx)); +} + +static int hisi_noc_pmu_get_event_idx(struct perf_event *event) +{ + struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu); + struct hisi_pmu_hwevents *pmu_events = &noc_pmu->pmu_events; + int cur_idx; + + cur_idx = find_first_bit(pmu_events->used_mask, noc_pmu->num_counters); + if (cur_idx != noc_pmu->num_counters && + !hisi_noc_pmu_check_global_filter(pmu_events->hw_events[cur_idx], event)) + return -EAGAIN; + + return hisi_uncore_pmu_get_event_idx(event); +} + +static u64 hisi_noc_pmu_read_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + + return readq(noc_pmu->base + NOC_PMU_EVENT_CNTRn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_noc_pmu_write_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc, u64 val) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + + writeq(val, noc_pmu->base + NOC_PMU_EVENT_CNTRn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_noc_pmu_enable_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + reg |= NOC_PMU_EVENT_CTRL_EN; + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); +} + +static void hisi_noc_pmu_disable_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + reg &= ~NOC_PMU_EVENT_CTRL_EN; + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); +} + +static void hisi_noc_pmu_enable_counter_int(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + /* We don't support interrupts, so this is a stub. 
*/ +} + +static void hisi_noc_pmu_disable_counter_int(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ +} + +static void hisi_noc_pmu_start_counters(struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg |= NOC_PMU_GLOBAL_CTRL_PMU_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); +} + +static void hisi_noc_pmu_stop_counters(struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg &= ~NOC_PMU_GLOBAL_CTRL_PMU_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); +} + +static u32 hisi_noc_pmu_get_int_status(struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + + return readl(noc_pmu->base + reg_info->overflow_status); +} + +static void hisi_noc_pmu_clear_int_status(struct hisi_pmu *noc_pmu, int idx) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + reg_info->overflow_status); + reg &= ~NOC_PMU_CNT_INFO_OVERFLOW(idx); + writel(reg, noc_pmu->base + reg_info->overflow_status); +} + +static void hisi_noc_pmu_enable_filter(struct perf_event *event) +{ + struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu); + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + struct hw_perf_event *hwc = &event->hw; + u32 tt_en = hisi_get_tt_en(event); + u32 ch = hisi_get_ch(event); + u32 reg; + + if (!ch) + ch = NOC_PMU_CH_DEFAULT; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + reg &= ~NOC_PMU_EVENT_CTRL_CHANNEL; + reg |= FIELD_PREP(NOC_PMU_EVENT_CTRL_CHANNEL, ch); + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + + /* + * Since tracetag filter applies to all the counters, don't touch it + * if user doesn't specify it explicitly. + */ + if (tt_en) { + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg |= NOC_PMU_GLOBAL_CTRL_TT_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); + } +} + +static void hisi_noc_pmu_disable_filter(struct perf_event *event) +{ + struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu); + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 tt_en = hisi_get_tt_en(event); + u32 reg; + + /* + * If we're not the last counter, don't touch the global tracetag + * configuration. 
+ */ + if (bitmap_weight(noc_pmu->pmu_events.used_mask, noc_pmu->num_counters) > 1) + return; + + if (tt_en) { + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg &= ~NOC_PMU_GLOBAL_CTRL_TT_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); + } +} + +static const struct hisi_uncore_ops hisi_uncore_noc_ops = { + .write_evtype = hisi_noc_pmu_write_evtype, + .get_event_idx = hisi_noc_pmu_get_event_idx, + .read_counter = hisi_noc_pmu_read_counter, + .write_counter = hisi_noc_pmu_write_counter, + .enable_counter = hisi_noc_pmu_enable_counter, + .disable_counter = hisi_noc_pmu_disable_counter, + .enable_counter_int = hisi_noc_pmu_enable_counter_int, + .disable_counter_int = hisi_noc_pmu_disable_counter_int, + .start_counters = hisi_noc_pmu_start_counters, + .stop_counters = hisi_noc_pmu_stop_counters, + .get_int_status = hisi_noc_pmu_get_int_status, + .clear_int_status = hisi_noc_pmu_clear_int_status, + .enable_filter = hisi_noc_pmu_enable_filter, + .disable_filter = hisi_noc_pmu_disable_filter, +}; + +static struct attribute *hisi_noc_pmu_format_attrs[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-7"), + HISI_PMU_FORMAT_ATTR(ch, "config1:0-2"), + HISI_PMU_FORMAT_ATTR(tt_en, "config1:3"), + NULL +}; + +static const struct attribute_group hisi_noc_pmu_format_group = { + .name = "format", + .attrs = hisi_noc_pmu_format_attrs, +}; + +static struct attribute *hisi_noc_pmu_events_attrs[] = { + HISI_PMU_EVENT_ATTR(cycles, 0x0e), + /* Flux on/off the ring */ + HISI_PMU_EVENT_ATTR(ingress_flow_sum, 0x1a), + HISI_PMU_EVENT_ATTR(egress_flow_sum, 0x17), + /* Buffer full duration on/off the ring */ + HISI_PMU_EVENT_ATTR(ingress_buf_full, 0x19), + HISI_PMU_EVENT_ATTR(egress_buf_full, 0x12), + /* Failure packets count on/off the ring */ + HISI_PMU_EVENT_ATTR(cw_ingress_fail, 0x01), + HISI_PMU_EVENT_ATTR(cc_ingress_fail, 0x09), + HISI_PMU_EVENT_ATTR(cw_egress_fail, 0x03), + HISI_PMU_EVENT_ATTR(cc_egress_fail, 0x0b), + /* Flux of the ring */ + HISI_PMU_EVENT_ATTR(cw_main_flow_sum, 0x05), + HISI_PMU_EVENT_ATTR(cc_main_flow_sum, 0x0d), + NULL +}; + +static const struct attribute_group hisi_noc_pmu_events_group = { + .name = "events", + .attrs = hisi_noc_pmu_events_attrs, +}; + +static const struct attribute_group *hisi_noc_pmu_attr_groups[] = { + &hisi_noc_pmu_format_group, + &hisi_noc_pmu_events_group, + &hisi_pmu_cpumask_attr_group, + &hisi_pmu_identifier_group, + NULL +}; + +static int hisi_noc_pmu_dev_init(struct platform_device *pdev, struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info; + + hisi_uncore_pmu_init_topology(noc_pmu, &pdev->dev); + + if (noc_pmu->topo.scl_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, "failed to get scl-id\n"); + + if (noc_pmu->topo.index_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, "failed to get idx-id\n"); + + if (noc_pmu->topo.sub_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, "failed to get sub-id\n"); + + noc_pmu->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(noc_pmu->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(noc_pmu->base), + "fail to remap io memory\n"); + + noc_pmu->dev_info = device_get_match_data(&pdev->dev); + if (!noc_pmu->dev_info) + return -ENODEV; + + noc_pmu->pmu_events.attr_groups = noc_pmu->dev_info->attr_groups; + noc_pmu->counter_bits = noc_pmu->dev_info->counter_bits; + noc_pmu->check_event = noc_pmu->dev_info->check_event; + noc_pmu->num_counters = NOC_PMU_NR_COUNTERS; + noc_pmu->ops = &hisi_uncore_noc_ops; + noc_pmu->dev = &pdev->dev; + noc_pmu->on_cpu = -1; + + reg_info = 
noc_pmu->dev_info->private; + noc_pmu->identifier = readl(noc_pmu->base + reg_info->version); + + return 0; +} + +static void hisi_noc_pmu_remove_cpuhp_instance(void *hotplug_node) +{ + cpuhp_state_remove_instance_nocalls(hisi_noc_pmu_cpuhp_state, hotplug_node); +} + +static void hisi_noc_pmu_unregister_pmu(void *pmu) +{ + perf_pmu_unregister(pmu); +} + +static int hisi_noc_pmu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct hisi_pmu *noc_pmu; + char *name; + int ret; + + noc_pmu = devm_kzalloc(dev, sizeof(*noc_pmu), GFP_KERNEL); + if (!noc_pmu) + return -ENOMEM; + + /* + * HiSilicon Uncore PMU framework needs to get common hisi_pmu device + * from device's drvdata. + */ + platform_set_drvdata(pdev, noc_pmu); + + ret = hisi_noc_pmu_dev_init(pdev, noc_pmu); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(hisi_noc_pmu_cpuhp_state, &noc_pmu->node); + if (ret) + return dev_err_probe(dev, ret, "Fail to register cpuhp instance\n"); + + ret = devm_add_action_or_reset(dev, hisi_noc_pmu_remove_cpuhp_instance, + &noc_pmu->node); + if (ret) + return ret; + + hisi_pmu_init(noc_pmu, THIS_MODULE); + + name = devm_kasprintf(dev, GFP_KERNEL, "hisi_scl%d_noc%d_%d", + noc_pmu->topo.scl_id, noc_pmu->topo.index_id, + noc_pmu->topo.sub_id); + if (!name) + return -ENOMEM; + + ret = perf_pmu_register(&noc_pmu->pmu, name, -1); + if (ret) + return dev_err_probe(dev, ret, "Fail to register PMU\n"); + + return devm_add_action_or_reset(dev, hisi_noc_pmu_unregister_pmu, + &noc_pmu->pmu); +} + +static struct hisi_noc_pmu_regs hisi_noc_v1_pmu_regs = { + .version = NOC_PMU_VERSION, + .pmu_ctrl = NOC_PMU_GLOBAL_CTRL, + .event_ctrl0 = NOC_PMU_EVENT_CTRL0, + .event_cntr0 = NOC_PMU_EVENT_COUNTER0, + .overflow_status = NOC_PMU_CNT_INFO, +}; + +static const struct hisi_pmu_dev_info hisi_noc_v1 = { + .attr_groups = hisi_noc_pmu_attr_groups, + .counter_bits = 64, + .check_event = NOC_PMU_EVENT_CTRL_TYPE, + .private = &hisi_noc_v1_pmu_regs, +}; + +static const struct acpi_device_id hisi_noc_pmu_ids[] = { + { "HISI04E0", (kernel_ulong_t) &hisi_noc_v1 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, hisi_noc_pmu_ids); + +static struct platform_driver hisi_noc_pmu_driver = { + .driver = { + .name = "hisi_noc_pmu", + .acpi_match_table = hisi_noc_pmu_ids, + .suppress_bind_attrs = true, + }, + .probe = hisi_noc_pmu_probe, +}; + +static int __init hisi_noc_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/hisi/noc:online", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret < 0) { + pr_err("hisi_noc_pmu: Fail to setup cpuhp callbacks, ret = %d\n", ret); + return ret; + } + hisi_noc_pmu_cpuhp_state = ret; + + ret = platform_driver_register(&hisi_noc_pmu_driver); + if (ret) + cpuhp_remove_multi_state(hisi_noc_pmu_cpuhp_state); + + return ret; +} +module_init(hisi_noc_pmu_module_init); + +static void __exit hisi_noc_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_noc_pmu_driver); + cpuhp_remove_multi_state(hisi_noc_pmu_cpuhp_state); +} +module_exit(hisi_noc_pmu_module_exit); + +MODULE_IMPORT_NS("HISI_PMU"); +MODULE_DESCRIPTION("HiSilicon SoC Uncore NoC PMU driver"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yicong Yang "); From 2257798498b3b069e5ff46ad957c32a9a06b5fc9 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Thu, 14 Aug 2025 17:16:21 +0800 Subject: [PATCH 2117/3206] drivers/perf: hisi: Add support for HiSilicon MN PMU driver MN (Miscellaneous Node) is a hybrid node in ARM CHI. 
It broadcasts the following two types of requests: DVM operations and PCIe configuration. MN PMU devices exist on both SCCL and SICL, so the MN PMU driver is named after the SCL (Super cluster) ID. The MN PMU driver uses the HiSilicon uncore PMU framework, and only the event parameter is supported. Signed-off-by: Junhao He Reviewed-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Will Deacon --- drivers/perf/hisilicon/Makefile | 2 +- drivers/perf/hisilicon/hisi_uncore_mn_pmu.c | 411 ++++++++++++++++++++ 2 files changed, 412 insertions(+), 1 deletion(-) create mode 100644 drivers/perf/hisilicon/hisi_uncore_mn_pmu.c diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index dcec8f39719d00..186be3d02238b1 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \ hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \ hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o hisi_uncore_uc_pmu.o \ - hisi_uncore_noc_pmu.o + hisi_uncore_noc_pmu.o hisi_uncore_mn_pmu.o obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o obj-$(CONFIG_HNS3_PMU) += hns3_pmu.o diff --git a/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c b/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c new file mode 100644 index 00000000000000..4df4eebe243e66 --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HiSilicon SoC MN uncore Hardware event counters support + * + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "hisi_uncore_pmu.h" + +/* Dynamic CPU hotplug state used by MN PMU */ +static enum cpuhp_state hisi_mn_pmu_online; + +/* MN register definition */ +#define HISI_MN_DYNAMIC_CTRL_REG 0x400 +#define HISI_MN_DYNAMIC_CTRL_EN BIT(0) +#define HISI_MN_PERF_CTRL_REG 0x408 +#define HISI_MN_PERF_CTRL_EN BIT(6) +#define HISI_MN_INT_MASK_REG 0x800 +#define HISI_MN_INT_STATUS_REG 0x808 +#define HISI_MN_INT_CLEAR_REG 0x80C +#define HISI_MN_EVENT_CTRL_REG 0x1C00 +#define HISI_MN_VERSION_REG 0x1C04 +#define HISI_MN_EVTYPE0_REG 0x1d00 +#define HISI_MN_EVTYPE_MASK GENMASK(7, 0) +#define HISI_MN_CNTR0_REG 0x1e00 +#define HISI_MN_EVTYPE_REGn(evtype0, n) ((evtype0) + (n) * 4) +#define HISI_MN_CNTR_REGn(cntr0, n) ((cntr0) + (n) * 8) + +#define HISI_MN_NR_COUNTERS 4 +#define HISI_MN_TIMEOUT_US 500U + +struct hisi_mn_pmu_regs { + u32 version; + u32 dyn_ctrl; + u32 perf_ctrl; + u32 int_mask; + u32 int_clear; + u32 int_status; + u32 event_ctrl; + u32 event_type0; + u32 event_cntr0; +}; + +/* + * Each event request takes a certain amount of time to complete. If + * we are counting a latency-related event, we need to wait for all + * requests to complete. Otherwise, the counter value is slightly larger. 
+ */ +static void hisi_mn_pmu_counter_flush(struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + int ret; + u32 val; + + val = readl(mn_pmu->base + reg_info->dyn_ctrl); + val |= HISI_MN_DYNAMIC_CTRL_EN; + writel(val, mn_pmu->base + reg_info->dyn_ctrl); + + ret = readl_poll_timeout_atomic(mn_pmu->base + reg_info->dyn_ctrl, + val, !(val & HISI_MN_DYNAMIC_CTRL_EN), + 1, HISI_MN_TIMEOUT_US); + if (ret) + dev_warn(mn_pmu->dev, "Counter flush timeout\n"); +} + +static u64 hisi_mn_pmu_read_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + return readq(mn_pmu->base + HISI_MN_CNTR_REGn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_mn_pmu_write_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc, u64 val) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + writeq(val, mn_pmu->base + HISI_MN_CNTR_REGn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_mn_pmu_write_evtype(struct hisi_pmu *mn_pmu, int idx, u32 type) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + /* + * Select the appropriate event select register. + * There are 2 32-bit event select registers for the + * 8 hardware counters, each event code is 8-bit wide. + */ + val = readl(mn_pmu->base + HISI_MN_EVTYPE_REGn(reg_info->event_type0, idx / 4)); + val &= ~(HISI_MN_EVTYPE_MASK << HISI_PMU_EVTYPE_SHIFT(idx)); + val |= (type << HISI_PMU_EVTYPE_SHIFT(idx)); + writel(val, mn_pmu->base + HISI_MN_EVTYPE_REGn(reg_info->event_type0, idx / 4)); +} + +static void hisi_mn_pmu_start_counters(struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->perf_ctrl); + val |= HISI_MN_PERF_CTRL_EN; + writel(val, mn_pmu->base + reg_info->perf_ctrl); +} + +static void hisi_mn_pmu_stop_counters(struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->perf_ctrl); + val &= ~HISI_MN_PERF_CTRL_EN; + writel(val, mn_pmu->base + reg_info->perf_ctrl); + + hisi_mn_pmu_counter_flush(mn_pmu); +} + +static void hisi_mn_pmu_enable_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->event_ctrl); + val |= BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->event_ctrl); +} + +static void hisi_mn_pmu_disable_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->event_ctrl); + val &= ~BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->event_ctrl); +} + +static void hisi_mn_pmu_enable_counter_int(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->int_mask); + val &= ~BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->int_mask); +} + +static void hisi_mn_pmu_disable_counter_int(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->int_mask); + val |= BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->int_mask); +} + +static u32 hisi_mn_pmu_get_int_status(struct hisi_pmu *mn_pmu) +{ + struct 
hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + return readl(mn_pmu->base + reg_info->int_status); +} + +static void hisi_mn_pmu_clear_int_status(struct hisi_pmu *mn_pmu, int idx) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + writel(BIT(idx), mn_pmu->base + reg_info->int_clear); +} + +static struct attribute *hisi_mn_pmu_format_attr[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-7"), + NULL +}; + +static const struct attribute_group hisi_mn_pmu_format_group = { + .name = "format", + .attrs = hisi_mn_pmu_format_attr, +}; + +static struct attribute *hisi_mn_pmu_events_attr[] = { + HISI_PMU_EVENT_ATTR(req_eobarrier_num, 0x00), + HISI_PMU_EVENT_ATTR(req_ecbarrier_num, 0x01), + HISI_PMU_EVENT_ATTR(req_dvmop_num, 0x02), + HISI_PMU_EVENT_ATTR(req_dvmsync_num, 0x03), + HISI_PMU_EVENT_ATTR(req_retry_num, 0x04), + HISI_PMU_EVENT_ATTR(req_writenosnp_num, 0x05), + HISI_PMU_EVENT_ATTR(req_readnosnp_num, 0x06), + HISI_PMU_EVENT_ATTR(snp_dvm_num, 0x07), + HISI_PMU_EVENT_ATTR(snp_dvmsync_num, 0x08), + HISI_PMU_EVENT_ATTR(l3t_req_dvm_num, 0x09), + HISI_PMU_EVENT_ATTR(l3t_req_dvmsync_num, 0x0A), + HISI_PMU_EVENT_ATTR(mn_req_dvm_num, 0x0B), + HISI_PMU_EVENT_ATTR(mn_req_dvmsync_num, 0x0C), + HISI_PMU_EVENT_ATTR(pa_req_dvm_num, 0x0D), + HISI_PMU_EVENT_ATTR(pa_req_dvmsync_num, 0x0E), + HISI_PMU_EVENT_ATTR(snp_dvm_latency, 0x80), + HISI_PMU_EVENT_ATTR(snp_dvmsync_latency, 0x81), + HISI_PMU_EVENT_ATTR(l3t_req_dvm_latency, 0x82), + HISI_PMU_EVENT_ATTR(l3t_req_dvmsync_latency, 0x83), + HISI_PMU_EVENT_ATTR(mn_req_dvm_latency, 0x84), + HISI_PMU_EVENT_ATTR(mn_req_dvmsync_latency, 0x85), + HISI_PMU_EVENT_ATTR(pa_req_dvm_latency, 0x86), + HISI_PMU_EVENT_ATTR(pa_req_dvmsync_latency, 0x87), + NULL +}; + +static const struct attribute_group hisi_mn_pmu_events_group = { + .name = "events", + .attrs = hisi_mn_pmu_events_attr, +}; + +static const struct attribute_group *hisi_mn_pmu_attr_groups[] = { + &hisi_mn_pmu_format_group, + &hisi_mn_pmu_events_group, + &hisi_pmu_cpumask_attr_group, + &hisi_pmu_identifier_group, + NULL +}; + +static const struct hisi_uncore_ops hisi_uncore_mn_ops = { + .write_evtype = hisi_mn_pmu_write_evtype, + .get_event_idx = hisi_uncore_pmu_get_event_idx, + .start_counters = hisi_mn_pmu_start_counters, + .stop_counters = hisi_mn_pmu_stop_counters, + .enable_counter = hisi_mn_pmu_enable_counter, + .disable_counter = hisi_mn_pmu_disable_counter, + .enable_counter_int = hisi_mn_pmu_enable_counter_int, + .disable_counter_int = hisi_mn_pmu_disable_counter_int, + .write_counter = hisi_mn_pmu_write_counter, + .read_counter = hisi_mn_pmu_read_counter, + .get_int_status = hisi_mn_pmu_get_int_status, + .clear_int_status = hisi_mn_pmu_clear_int_status, +}; + +static int hisi_mn_pmu_dev_init(struct platform_device *pdev, + struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info; + int ret; + + hisi_uncore_pmu_init_topology(mn_pmu, &pdev->dev); + + if (mn_pmu->topo.scl_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, + "Failed to read MN scl id\n"); + + if (mn_pmu->topo.index_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, + "Failed to read MN index id\n"); + + mn_pmu->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(mn_pmu->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(mn_pmu->base), + "Failed to ioremap resource\n"); + + ret = hisi_uncore_pmu_init_irq(mn_pmu, pdev); + if (ret) + return ret; + + mn_pmu->dev_info = device_get_match_data(&pdev->dev); + if (!mn_pmu->dev_info) + return -ENODEV; + + mn_pmu->pmu_events.attr_groups = 
mn_pmu->dev_info->attr_groups; + mn_pmu->counter_bits = mn_pmu->dev_info->counter_bits; + mn_pmu->check_event = mn_pmu->dev_info->check_event; + mn_pmu->num_counters = HISI_MN_NR_COUNTERS; + mn_pmu->ops = &hisi_uncore_mn_ops; + mn_pmu->dev = &pdev->dev; + mn_pmu->on_cpu = -1; + + reg_info = mn_pmu->dev_info->private; + mn_pmu->identifier = readl(mn_pmu->base + reg_info->version); + + return 0; +} + +static void hisi_mn_pmu_remove_cpuhp(void *hotplug_node) +{ + cpuhp_state_remove_instance_nocalls(hisi_mn_pmu_online, hotplug_node); +} + +static void hisi_mn_pmu_unregister(void *pmu) +{ + perf_pmu_unregister(pmu); +} + +static int hisi_mn_pmu_probe(struct platform_device *pdev) +{ + struct hisi_pmu *mn_pmu; + char *name; + int ret; + + mn_pmu = devm_kzalloc(&pdev->dev, sizeof(*mn_pmu), GFP_KERNEL); + if (!mn_pmu) + return -ENOMEM; + + platform_set_drvdata(pdev, mn_pmu); + + ret = hisi_mn_pmu_dev_init(pdev, mn_pmu); + if (ret) + return ret; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_scl%d_mn%d", + mn_pmu->topo.scl_id, mn_pmu->topo.index_id); + if (!name) + return -ENOMEM; + + ret = cpuhp_state_add_instance(hisi_mn_pmu_online, &mn_pmu->node); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to register cpu hotplug\n"); + + ret = devm_add_action_or_reset(&pdev->dev, hisi_mn_pmu_remove_cpuhp, &mn_pmu->node); + if (ret) + return ret; + + hisi_pmu_init(mn_pmu, THIS_MODULE); + + ret = perf_pmu_register(&mn_pmu->pmu, name, -1); + if (ret) + return dev_err_probe(mn_pmu->dev, ret, "Failed to register MN PMU\n"); + + return devm_add_action_or_reset(&pdev->dev, hisi_mn_pmu_unregister, &mn_pmu->pmu); +} + +static struct hisi_mn_pmu_regs hisi_mn_v1_pmu_regs = { + .version = HISI_MN_VERSION_REG, + .dyn_ctrl = HISI_MN_DYNAMIC_CTRL_REG, + .perf_ctrl = HISI_MN_PERF_CTRL_REG, + .int_mask = HISI_MN_INT_MASK_REG, + .int_clear = HISI_MN_INT_CLEAR_REG, + .int_status = HISI_MN_INT_STATUS_REG, + .event_ctrl = HISI_MN_EVENT_CTRL_REG, + .event_type0 = HISI_MN_EVTYPE0_REG, + .event_cntr0 = HISI_MN_CNTR0_REG, +}; + +static const struct hisi_pmu_dev_info hisi_mn_v1 = { + .attr_groups = hisi_mn_pmu_attr_groups, + .counter_bits = 48, + .check_event = HISI_MN_EVTYPE_MASK, + .private = &hisi_mn_v1_pmu_regs, +}; + +static const struct acpi_device_id hisi_mn_pmu_acpi_match[] = { + { "HISI0222", (kernel_ulong_t) &hisi_mn_v1 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, hisi_mn_pmu_acpi_match); + +static struct platform_driver hisi_mn_pmu_driver = { + .driver = { + .name = "hisi_mn_pmu", + .acpi_match_table = hisi_mn_pmu_acpi_match, + /* + * We have not worked out a safe bind/unbind process, + * Forcefully unbinding during sampling will lead to a + * kernel panic, so this is not supported yet. 
+ */ + .suppress_bind_attrs = true, + }, + .probe = hisi_mn_pmu_probe, +}; + +static int __init hisi_mn_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/hisi/mn:online", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret < 0) { + pr_err("hisi_mn_pmu: Failed to setup MN PMU hotplug: %d\n", ret); + return ret; + } + hisi_mn_pmu_online = ret; + + ret = platform_driver_register(&hisi_mn_pmu_driver); + if (ret) + cpuhp_remove_multi_state(hisi_mn_pmu_online); + + return ret; +} +module_init(hisi_mn_pmu_module_init); + +static void __exit hisi_mn_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_mn_pmu_driver); + cpuhp_remove_multi_state(hisi_mn_pmu_online); +} +module_exit(hisi_mn_pmu_module_exit); + +MODULE_IMPORT_NS("HISI_PMU"); +MODULE_DESCRIPTION("HiSilicon SoC MN uncore PMU driver"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Junhao He "); From 542342d27122e4a0dc90025748f7821f8e15920a Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 14 Aug 2025 17:16:22 +0800 Subject: [PATCH 2118/3206] MAINTAINERS: Remove myself from HiSilicon PMU maintainers Remove myself as I'm leaving HiSilicon and not suitable for maintaining this. Thanks for the journey. Signed-off-by: Yicong Yang Acked-by: Jonathan Cameron Signed-off-by: Will Deacon --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3815a2c4b3a84c..c482d641fbe176 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11062,7 +11062,6 @@ F: Documentation/devicetree/bindings/net/hisilicon*.txt F: drivers/net/ethernet/hisilicon/ HISILICON PMU DRIVER -M: Yicong Yang M: Jonathan Cameron S: Supported W: http://www.hisilicon.com From 17e9521044c9b3ee839f861d1ac35c5b5c20d16b Mon Sep 17 00:00:00 2001 From: Junhui Liu Date: Tue, 22 Jul 2025 00:53:11 +0800 Subject: [PATCH 2119/3206] riscv: mm: Use mmu-type from FDT to limit SATP mode Some RISC-V implementations may hang when attempting to write an unsupported SATP mode, even though the latest RISC-V specification states such writes should have no effect. To avoid this issue, the logic for selecting SATP mode has been refined: The kernel now determines the SATP mode limit by taking the minimum of the value specified by the kernel command line (noXlvl) and the "mmu-type" property in the device tree (FDT). If only one is specified, use that. - If the resulting limit is sv48 or higher, the kernel will probe SATP modes from this limit downward until a supported mode is found. - If the limit is sv39, the kernel will directly use sv39 without probing. This ensures SATP mode selection is safe and compatible with both hardware and user configuration, minimizing the risk of hangs. 
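A minimal sketch of the resulting selection logic, mirroring the set_satp_mode() hunk in the diff below (min_not_zero() treats a zero return value as "not specified"):

	u64 cmdline_limit = __pi_set_satp_mode_from_cmdline(dtb_pa); /* 0 if no noXlvl option */
	u64 fdt_limit = __pi_set_satp_mode_from_fdt(dtb_pa);	     /* 0 if no usable mmu-type */
	u64 satp_mode_limit = min_not_zero(cmdline_limit, fdt_limit);

	if (satp_mode_limit == SATP_MODE_48) {
		disable_pgtable_l5();		/* probe downward from sv48 */
	} else if (satp_mode_limit == SATP_MODE_39) {
		disable_pgtable_l5();
		disable_pgtable_l4();		/* use sv39 directly, no probing */
	}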
Signed-off-by: Junhui Liu Reviewed-by: Alexandre Ghiti Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250722-satp-from-fdt-v1-2-5ba22218fa5f@pigmoral.tech Signed-off-by: Paul Walmsley --- arch/riscv/kernel/pi/fdt_early.c | 40 ++++++++++++++++++++++++++++++++ arch/riscv/kernel/pi/pi.h | 1 + arch/riscv/mm/init.c | 11 ++++++--- 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/pi/fdt_early.c b/arch/riscv/kernel/pi/fdt_early.c index 9bdee2fafe47e4..a12ff8090f1903 100644 --- a/arch/riscv/kernel/pi/fdt_early.c +++ b/arch/riscv/kernel/pi/fdt_early.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "pi.h" @@ -183,3 +184,42 @@ bool fdt_early_match_extension_isa(const void *fdt, const char *ext_name) return ret; } + +/** + * set_satp_mode_from_fdt - determine SATP mode based on the MMU type in fdt + * + * @dtb_pa: physical address of the device tree blob + * + * Returns the SATP mode corresponding to the MMU type of the first enabled CPU, + * 0 otherwise + */ +u64 set_satp_mode_from_fdt(uintptr_t dtb_pa) +{ + const void *fdt = (const void *)dtb_pa; + const char *mmu_type; + int node, parent; + + parent = fdt_path_offset(fdt, "/cpus"); + if (parent < 0) + return 0; + + fdt_for_each_subnode(node, fdt, parent) { + if (!fdt_node_name_eq(fdt, node, "cpu")) + continue; + + if (!fdt_device_is_available(fdt, node)) + continue; + + mmu_type = fdt_getprop(fdt, node, "mmu-type", NULL); + if (!mmu_type) + break; + + if (!strcmp(mmu_type, "riscv,sv39")) + return SATP_MODE_39; + else if (!strcmp(mmu_type, "riscv,sv48")) + return SATP_MODE_48; + break; + } + + return 0; +} diff --git a/arch/riscv/kernel/pi/pi.h b/arch/riscv/kernel/pi/pi.h index 21141d84fea603..3fee2cfddf7cfb 100644 --- a/arch/riscv/kernel/pi/pi.h +++ b/arch/riscv/kernel/pi/pi.h @@ -14,6 +14,7 @@ u64 get_kaslr_seed(uintptr_t dtb_pa); u64 get_kaslr_seed_zkr(const uintptr_t dtb_pa); bool set_nokaslr_from_cmdline(uintptr_t dtb_pa); u64 set_satp_mode_from_cmdline(uintptr_t dtb_pa); +u64 set_satp_mode_from_fdt(uintptr_t dtb_pa); bool fdt_early_match_extension_isa(const void *fdt, const char *ext_name); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 054265b3f26803..85cb70b10c0715 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -816,6 +816,7 @@ static __meminit pgprot_t pgprot_from_va(uintptr_t va) #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) u64 __pi_set_satp_mode_from_cmdline(uintptr_t dtb_pa); +u64 __pi_set_satp_mode_from_fdt(uintptr_t dtb_pa); static void __init disable_pgtable_l5(void) { @@ -855,18 +856,22 @@ static void __init set_mmap_rnd_bits_max(void) * underlying hardware: establish 1:1 mapping in 4-level page table mode * then read SATP to see if the configuration was taken into account * meaning sv48 is supported. + * The maximum SATP mode is limited by both the command line and the "mmu-type" + * property in the device tree, since some platforms may hang if an unsupported + * SATP mode is attempted. 
 */ static __init void set_satp_mode(uintptr_t dtb_pa) { u64 identity_satp, hw_satp; uintptr_t set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; - u64 satp_mode_cmdline = __pi_set_satp_mode_from_cmdline(dtb_pa); + u64 satp_mode_limit = min_not_zero(__pi_set_satp_mode_from_cmdline(dtb_pa), + __pi_set_satp_mode_from_fdt(dtb_pa)); kernel_map.page_offset = PAGE_OFFSET_L5; - if (satp_mode_cmdline == SATP_MODE_48) { + if (satp_mode_limit == SATP_MODE_48) { disable_pgtable_l5(); - } else if (satp_mode_cmdline == SATP_MODE_39) { + } else if (satp_mode_limit == SATP_MODE_39) { disable_pgtable_l5(); disable_pgtable_l4(); return; From 01dc937ac18dd0bc0fc77c24030639553f977ffe Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Thu, 28 Aug 2025 20:25:09 +0800 Subject: [PATCH 2120/3206] drivers/perf: riscv: Remove redundant ternary operators For ternary operators in the form of "a ? true : false", if 'a' itself returns a boolean result, the ternary operator can be omitted. Remove redundant ternary operators to clean up the code. Signed-off-by: Liao Yuanhong Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20250828122510.30843-1-liaoyuanhong@vivo.com Signed-off-by: Paul Walmsley --- drivers/perf/riscv_pmu_sbi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 698de8ddf895ba..c18dbffa983450 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -339,7 +339,7 @@ static bool pmu_sbi_ctr_is_fw(int cidx) if (!info) return false; - return (info->type == SBI_PMU_CTR_TYPE_FW) ? true : false; + return info->type == SBI_PMU_CTR_TYPE_FW; } /* From 316b60b984d5be9b86047cdf3bf16d51c7c70cc5 Mon Sep 17 00:00:00 2001 From: Jessica Liu Date: Fri, 1 Aug 2025 10:49:48 +0800 Subject: [PATCH 2121/3206] riscv: mmap(): use unsigned offset type in riscv_sys_mmap The variable type of offset should be consistent with the relevant interfaces of mmap, as described in commit 295f10061af0 ("syscalls: mmap(): use unsigned offset type consistently"). Otherwise, a user input with the top bit set would result in a negative page offset rather than a large one. Signed-off-by: Jessica Liu Tested-by: Han Gao Reviewed-by: Alexandre Ghiti Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250801104948133AaMr5S6E382PbNNhoJgHA@zte.com.cn [pjw@kernel.org: hand-applied mangled patch; fixed checkpatch error] Signed-off-by: Paul Walmsley --- arch/riscv/kernel/sys_riscv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index d77afe05578f23..795b2e815ac923 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -10,7 +10,7 @@ static long riscv_sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset, + unsigned long fd, unsigned long offset, unsigned long page_shift_offset) { if (unlikely(offset & (~PAGE_MASK >> page_shift_offset))) From cc2294d3f9c99c216ef563b83b08d2c0604f9b92 Mon Sep 17 00:00:00 2001 From: Ignacio Encinas Date: Wed, 23 Jul 2025 20:47:29 +0100 Subject: [PATCH 2122/3206] riscv: introduce asm/swab.h Implement endianness swap macros for RISC-V. Use the rev8 instruction when Zbb is available. Otherwise, rely on the default mask-and-shift implementation. 
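As a quick illustration of what these macros compute, here is a stand-alone C version of the 32-bit mask-and-shift fallback (independent of the kernel sources; with Zbb, rev8 produces the same result in one instruction by reversing all XLEN bytes, which is why the kernel shifts the result down for sub-XLEN widths):

	#include <stdint.h>
	#include <stdio.h>

	/* The same mask-and-shift byte swap the header falls back to without Zbb. */
	static uint32_t swab32(uint32_t x)
	{
		return ((x & 0x000000ffu) << 24) | ((x & 0x0000ff00u) << 8) |
		       ((x & 0x00ff0000u) >> 8) | ((x & 0xff000000u) >> 24);
	}

	int main(void)
	{
		printf("%08x\n", swab32(0x11223344));	/* prints 44332211 */
		return 0;
	}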
Acked-by: Palmer Dabbelt Reviewed-by: Alexandre Ghiti Tested-by: Alexandre Ghiti Signed-off-by: Ignacio Encinas Link: https://lore.kernel.org/r/20250723-riscv-swab-v6-1-fc11e9a2efc9@iencinas.com Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/swab.h | 87 +++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 arch/riscv/include/asm/swab.h diff --git a/arch/riscv/include/asm/swab.h b/arch/riscv/include/asm/swab.h new file mode 100644 index 00000000000000..c1da22aa13268f --- /dev/null +++ b/arch/riscv/include/asm/swab.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_RISCV_SWAB_H +#define _ASM_RISCV_SWAB_H + +#include +#include +#include +#include +#include + +#if defined(CONFIG_TOOLCHAIN_HAS_ZBB) && defined(CONFIG_RISCV_ISA_ZBB) && !defined(NO_ALTERNATIVE) + +// Duplicated from include/uapi/linux/swab.h +#define ___constant_swab16(x) ((__u16)( \ + (((__u16)(x) & (__u16)0x00ffU) << 8) | \ + (((__u16)(x) & (__u16)0xff00U) >> 8))) + +#define ___constant_swab32(x) ((__u32)( \ + (((__u32)(x) & (__u32)0x000000ffUL) << 24) | \ + (((__u32)(x) & (__u32)0x0000ff00UL) << 8) | \ + (((__u32)(x) & (__u32)0x00ff0000UL) >> 8) | \ + (((__u32)(x) & (__u32)0xff000000UL) >> 24))) + +#define ___constant_swab64(x) ((__u64)( \ + (((__u64)(x) & (__u64)0x00000000000000ffULL) << 56) | \ + (((__u64)(x) & (__u64)0x000000000000ff00ULL) << 40) | \ + (((__u64)(x) & (__u64)0x0000000000ff0000ULL) << 24) | \ + (((__u64)(x) & (__u64)0x00000000ff000000ULL) << 8) | \ + (((__u64)(x) & (__u64)0x000000ff00000000ULL) >> 8) | \ + (((__u64)(x) & (__u64)0x0000ff0000000000ULL) >> 24) | \ + (((__u64)(x) & (__u64)0x00ff000000000000ULL) >> 40) | \ + (((__u64)(x) & (__u64)0xff00000000000000ULL) >> 56))) + +#define ARCH_SWAB(size, value) \ +({ \ + unsigned long x = value; \ + \ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBB)) { \ + asm volatile (".option push\n" \ + ".option arch,+zbb\n" \ + "rev8 %0, %1\n" \ + ".option pop\n" \ + : "=r" (x) : "r" (x)); \ + x = x >> (BITS_PER_LONG - size); \ + } else { \ + x = ___constant_swab##size(value); \ + } \ + x; \ +}) + +static __always_inline __u16 __arch_swab16(__u16 value) +{ + return ARCH_SWAB(16, value); +} + +static __always_inline __u32 __arch_swab32(__u32 value) +{ + return ARCH_SWAB(32, value); +} + +#ifdef CONFIG_64BIT +static __always_inline __u64 __arch_swab64(__u64 value) +{ + return ARCH_SWAB(64, value); +} +#else +static __always_inline __u64 __arch_swab64(__u64 value) +{ + __u32 h = value >> 32; + __u32 l = value & ((1ULL << 32) - 1); + + return ((__u64)(__arch_swab32(l)) << 32) | ((__u64)(__arch_swab32(h))); +} +#endif + +#define __arch_swab64 __arch_swab64 +#define __arch_swab32 __arch_swab32 +#define __arch_swab16 __arch_swab16 + +#undef ___constant_swab16 +#undef ___constant_swab32 +#undef ___constant_swab64 + +#undef ARCH_SWAB + +#endif /* defined(CONFIG_TOOLCHAIN_HAS_ZBB) && defined(CONFIG_RISCV_ISA_ZBB) && !defined(NO_ALTERNATIVE) */ +#endif /* _ASM_RISCV_SWAB_H */ From 6dab7e15c0b312be79ccadf85c9ec7332427ba7b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 3 Jun 2025 03:10:18 +0900 Subject: [PATCH 2123/3206] riscv: pi: use 'targets' instead of extra-y in Makefile %.pi.o files are built as prerequisites of other objects. There is no need to use extra-y, which is planned for deprecation. 
Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20250602181023.528550-1-masahiroy@kernel.org Signed-off-by: Paul Walmsley --- arch/riscv/kernel/pi/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/pi/Makefile b/arch/riscv/kernel/pi/Makefile index 7dd15be69c9007..bc098edac89813 100644 --- a/arch/riscv/kernel/pi/Makefile +++ b/arch/riscv/kernel/pi/Makefile @@ -39,4 +39,4 @@ $(obj)/ctype.o: $(srctree)/lib/ctype.c FORCE $(call if_changed_rule,cc_o_c) obj-y := cmdline_early.pi.o fdt_early.pi.o string.pi.o ctype.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o archrandom_early.pi.o -extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) +targets := $(patsubst %.pi.o,%.o,$(obj-y)) From 205cbc714842478df4239b5be205b9b459fd9fbc Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Fri, 11 Jul 2025 09:04:43 +0000 Subject: [PATCH 2124/3206] riscv: Enable ARCH_HAVE_NMI_SAFE_CMPXCHG The implementation of cmpxchg() in riscv is based on atomic primitives and is NMI-safe, so it can be used safely in in_nmi() context. ftrace's ringbuffer relies on NMI-safe cmpxchg() in the NMI context. Currently, in_nmi() is true when a riscv kprobe runs in trap-based mode, so this config needs to be selected; otherwise kprobetrace will not be available. Signed-off-by: Pu Lehui Reviewed-by: Alexandre Ghiti Tested-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250711090443.1688404-1-pulehui@huaweicloud.com [pjw@kernel.org: moved to preserve alphabetical order] Signed-off-by: Paul Walmsley --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 51dcd8eaa24356..715e59f1e287dc 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -53,6 +53,7 @@ config RISCV select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN select ARCH_HAS_VDSO_ARCH_DATA if GENERIC_VDSO_DATA_STORE + select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK if ACPI select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX From 92c4995b4d494f197858a79c6c6af7b6b06d38bf Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Tue, 29 Jul 2025 15:15:34 +0200 Subject: [PATCH 2125/3206] RISC-V: ACPI: enable parsing the BGRT table The BGRT table is used to display a vendor logo during the boot process. Add the code for parsing it. Signed-off-by: Heinrich Schuchardt Reviewed-by: Sunil V L Link: https://lore.kernel.org/r/20250729131535.522205-2-heinrich.schuchardt@canonical.com Signed-off-by: Paul Walmsley --- arch/riscv/kernel/acpi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c index 3f6d5a6789e878..71698ee11621ac 100644 --- a/arch/riscv/kernel/acpi.c +++ b/arch/riscv/kernel/acpi.c @@ -14,6 +14,7 @@ */ #include +#include #include #include #include @@ -160,6 +161,8 @@ void __init acpi_boot_table_init(void) early_init_dt_scan_chosen_stdout(); } else { acpi_parse_spcr(earlycon_acpi_spcr_enable, true); + if (IS_ENABLED(CONFIG_ACPI_BGRT)) + acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); } } From 16d18e3eaf29be1d987f5238ec03226f15dad5f5 Mon Sep 17 00:00:00 2001 From: "Guo Ren (Alibaba DAMO Academy)" Date: Sun, 13 Jul 2025 11:53:20 -0400 Subject: [PATCH 2126/3206] riscv: Move vendor errata definitions to new header Move vendor errata definitions into errata_list_vendors.h.
Signed-off-by: Guo Ren (Alibaba DAMO Academy) Reviewed-by: Alexandre Ghiti Tested-by: Han Gao Link: https://lore.kernel.org/r/20250713155321.2064856-2-guoren@kernel.org [pjw@kernel.org: updated to apply and to make the whitespace consistent] Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/errata_list.h | 19 +--------------- arch/riscv/include/asm/errata_list_vendors.h | 24 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 18 deletions(-) create mode 100644 arch/riscv/include/asm/errata_list_vendors.h diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index e17d6c98b3bfdf..a2481f14b68d56 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -10,24 +10,7 @@ #include #include #include - -#ifdef CONFIG_ERRATA_ANDES -#define ERRATA_ANDES_NO_IOCP 0 -#define ERRATA_ANDES_NUMBER 1 -#endif - -#ifdef CONFIG_ERRATA_SIFIVE -#define ERRATA_SIFIVE_CIP_453 0 -#define ERRATA_SIFIVE_CIP_1200 1 -#define ERRATA_SIFIVE_NUMBER 2 -#endif - -#ifdef CONFIG_ERRATA_THEAD -#define ERRATA_THEAD_MAE 0 -#define ERRATA_THEAD_PMU 1 -#define ERRATA_THEAD_GHOSTWRITE 2 -#define ERRATA_THEAD_NUMBER 3 -#endif +#include #ifdef __ASSEMBLER__ diff --git a/arch/riscv/include/asm/errata_list_vendors.h b/arch/riscv/include/asm/errata_list_vendors.h new file mode 100644 index 00000000000000..d448b9ce7c7c58 --- /dev/null +++ b/arch/riscv/include/asm/errata_list_vendors.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef ASM_ERRATA_LIST_VENDORS_H +#define ASM_ERRATA_LIST_VENDORS_H + +#ifdef CONFIG_ERRATA_ANDES +#define ERRATA_ANDES_NO_IOCP 0 +#define ERRATA_ANDES_NUMBER 1 +#endif + +#ifdef CONFIG_ERRATA_SIFIVE +#define ERRATA_SIFIVE_CIP_453 0 +#define ERRATA_SIFIVE_CIP_1200 1 +#define ERRATA_SIFIVE_NUMBER 2 +#endif + +#ifdef CONFIG_ERRATA_THEAD +#define ERRATA_THEAD_MAE 0 +#define ERRATA_THEAD_PMU 1 +#define ERRATA_THEAD_GHOSTWRITE 2 +#define ERRATA_THEAD_NUMBER 3 +#endif + +#endif /* ASM_ERRATA_LIST_VENDORS_H */ From 105f56877f2d5f82d71e20b45eb7be7c24c3d908 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Sep 2025 18:41:38 +0100 Subject: [PATCH 2127/3206] coresight: trbe: Prevent overflow in PERF_IDX2OFF() Cast nr_pages to unsigned long to avoid overflow when handling large AUX buffer sizes (>= 2 GiB). Fixes: 3fbf7f011f24 ("coresight: sink: Add TRBE driver") Signed-off-by: Leo Yan Signed-off-by: Will Deacon --- drivers/hwtracing/coresight/coresight-trbe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index 8267dd1a2130d3..8f426f94e32a15 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -23,7 +23,8 @@ #include "coresight-self-hosted-trace.h" #include "coresight-trbe.h" -#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT)) +#define PERF_IDX2OFF(idx, buf) \ + ((idx) % ((unsigned long)(buf)->nr_pages << PAGE_SHIFT)) /* * A padding packet that will help the user space tools From a29fea30dd93da16652930162b177941abd8c75e Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Sep 2025 18:41:39 +0100 Subject: [PATCH 2128/3206] perf: arm_spe: Prevent overflow in PERF_IDX2OFF() Cast nr_pages to unsigned long to avoid overflow when handling large AUX buffer sizes (>= 2 GiB). 
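To make the overflow concrete, here is a stand-alone sketch (assuming nr_pages is a 32-bit int as in the AUX buffer structs, and a typical LP64 target; the signed overflow is undefined behaviour, so "bad" shows what common compilers happen to produce):

  #include <stdio.h>

  #define PAGE_SHIFT 12

  int main(void)
  {
          int nr_pages = 1 << 19;  /* 512Ki pages * 4 KiB = 2 GiB */

          /* Shift performed in 32-bit int arithmetic: overflows. */
          unsigned long bad = nr_pages << PAGE_SHIFT;
          /* Widening the operand first keeps the shift in 64 bits. */
          unsigned long good = (unsigned long)nr_pages << PAGE_SHIFT;

          /* Typically prints bad=0xffffffff80000000 good=0x80000000,
           * so any "idx % size" computed from "bad" is nonsense. */
          printf("bad=%#lx good=%#lx\n", bad, good);
          return 0;
  }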
Fixes: d5d9696b0380 ("drivers/perf: Add support for ARMv8.2 Statistical Profiling Extension") Signed-off-by: Leo Yan Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 591f72fa032760..fa50645feddadb 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -99,7 +99,8 @@ struct arm_spe_pmu { #define to_spe_pmu(p) (container_of(p, struct arm_spe_pmu, pmu)) /* Convert a free-running index from perf into an SPE buffer offset */ -#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT)) +#define PERF_IDX2OFF(idx, buf) \ ((idx) % ((unsigned long)(buf)->nr_pages << PAGE_SHIFT)) /* Keep track of our dynamic hotplug state */ static enum cpuhp_state arm_spe_pmu_online; From 3fbfe251cc9f6d391944282cdb9bcf0bd02e01f8 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 17 Sep 2025 16:48:54 +0300 Subject: [PATCH 2129/3206] Revert "net/mlx5e: Update and set Xon/Xoff upon port speed set" This reverts commit d24341740fe48add8a227a753e68b6eedf4b385a. It causes errors when trying to configure QoS, as well as loss of L2 connectivity (on multi-host devices). Reported-by: Jakub Kicinski Link: https://lore.kernel.org/20250910170011.70528106@kernel.org Fixes: d24341740fe4 ("net/mlx5e: Update and set Xon/Xoff upon port speed set") Signed-off-by: Tariq Toukan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index e680673ffb725c..15eded36b872a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -139,8 +139,6 @@ void mlx5e_update_carrier(struct mlx5e_priv *priv) if (up) { netdev_info(priv->netdev, "Link up\n"); netif_carrier_on(priv->netdev); - mlx5e_port_manual_buffer_config(priv, 0, priv->netdev->mtu, - NULL, NULL, NULL); } else { netdev_info(priv->netdev, "Link down\n"); netif_carrier_off(priv->netdev); From 87ebb628a5acb892eba41ef1d8989beb8f036034 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2025 13:53:37 +0000 Subject: [PATCH 2130/3206] net: clear sk->sk_ino in sk_set_socket(sk, NULL) Andrei Vagin reported that the blamed commit broke CRIU. Indeed, while we want to keep sk_uid unchanged when a socket is cloned, we want to clear sk->sk_ino. Otherwise, sock_diag might report multiple sockets sharing the same inode number. Move the clearing part from sock_orphan() to sk_set_socket(sk, NULL), called both from sock_orphan() and sk_clone_lock(). Fixes: 5d6b58c932ec ("net: lockless sock_i_ino()") Closes: https://lore.kernel.org/netdev/aMhX-VnXkYDpKd9V@google.com/ Closes: https://github.com/checkpoint-restore/criu/issues/2744 Reported-by: Andrei Vagin Signed-off-by: Eric Dumazet Acked-by: Andrei Vagin Link: https://patch.msgid.link/20250917135337.1736101-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index fb13322a11fcf7..2e14283c5be1ad 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2061,6 +2061,9 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) if (sock) { WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } else { + /* Note: sk_uid is unchanged.
*/ + WRITE_ONCE(sk->sk_ino, 0); } } @@ -2082,8 +2085,6 @@ static inline void sock_orphan(struct sock *sk) sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; - /* Note: sk_uid is unchanged. */ - WRITE_ONCE(sk->sk_ino, 0); write_unlock_bh(&sk->sk_callback_lock); } From cca7b1cfd7b8a0eff2a3510c5e0f10efe8fa3758 Mon Sep 17 00:00:00 2001 From: Alexey Nepomnyashih Date: Wed, 17 Sep 2025 15:30:58 +0000 Subject: [PATCH 2131/3206] net: liquidio: fix overflow in octeon_init_instr_queue() The expression `(conf->instr_type == 64) << iq_no` can overflow because `iq_no` may be as high as 64 (`CN23XX_MAX_RINGS_PER_PF`). Casting the operand to `u64` ensures correct 64-bit arithmetic. Fixes: f21fb3ed364b ("Add support of Cavium Liquidio ethernet adapters") Signed-off-by: Alexey Nepomnyashih Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cavium/liquidio/request_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c index de8a6ce86ad7e2..12105ffb5dac6d 100644 --- a/drivers/net/ethernet/cavium/liquidio/request_manager.c +++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c @@ -126,7 +126,7 @@ int octeon_init_instr_queue(struct octeon_device *oct, oct->io_qmask.iq |= BIT_ULL(iq_no); /* Set the 32B/64B mode for each input queue */ - oct->io_qmask.iq64B |= ((conf->instr_type == 64) << iq_no); + oct->io_qmask.iq64B |= ((u64)(conf->instr_type == 64) << iq_no); iq->iqcmd_64B = (conf->instr_type == 64); oct->fn_list.setup_iq_regs(oct, iq_no); From 7736aff4704188d4483b7b36ff06d201c8be4844 Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Thu, 18 Sep 2025 12:15:56 +0300 Subject: [PATCH 2132/3206] MAINTAINERS: update sundance entry Signed-off-by: Denis Kirjanov Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 086b4a7bd04f42..70c9d368c8d2dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24255,7 +24255,7 @@ F: Documentation/devicetree/bindings/input/allwinner,sun4i-a10-lradc-keys.yaml F: drivers/input/keyboard/sun4i-lradc-keys.c SUNDANCE NETWORK DRIVER -M: Denis Kirjanov +M: Denis Kirjanov L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/dlink/sundance.c From 3191df0a4882c827cac29925e80ecb1775b904bd Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 18 Sep 2025 13:15:06 +0300 Subject: [PATCH 2133/3206] devlink rate: Remove unnecessary 'static' from a couple places devlink_rate_node_get_by_name() and devlink_rate_nodes_destroy() have a couple of unnecessary static variables for iterating over devlink rates. This could lead to races/corruption/unhappiness if two concurrent operations execute the same function. Remove 'static' from both. It's amazing this was missed for 4+ years. While at it, I confirmed there are no more examples of this mistake in net/ with 1, 2 or 3 levels of indentation. 
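The bug class is easy to demonstrate outside the kernel; a minimal sketch (invented names, not the devlink code) shows why a function-local 'static' cursor breaks under concurrent callers:

  #include <stdio.h>

  /* With 'static', one cursor is shared by every caller, so two threads
   * iterating at the same time corrupt each other's position. Dropping
   * 'static' gives each invocation its own stack variable. */
  static int find_index(const int *list, int len, int key)
  {
          static int i;  /* BUG: shared across all concurrent callers */

          for (i = 0; i < len; i++)
                  if (list[i] == key)
                          return i;
          return -1;
  }

  int main(void)
  {
          int vals[] = { 3, 1, 4, 1, 5 };

          printf("%d\n", find_index(vals, 5, 4)); /* prints 2 */
          return 0;
  }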
Fixes: a8ecb93ef03d ("devlink: Introduce rate nodes") Signed-off-by: Cosmin Ratiu Signed-off-by: Jakub Kicinski --- net/devlink/rate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 110b3fa8a0b1b1..264fb82cba196e 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -34,7 +34,7 @@ devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { - static struct devlink_rate *devlink_rate; + struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && @@ -819,8 +819,8 @@ EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); */ void devl_rate_nodes_destroy(struct devlink *devlink) { - static struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; + struct devlink_rate *devlink_rate, *tmp; devl_assert_locked(devlink); From cfa7d9b1e3a8604afc84e9e51d789c29574fb216 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Wed, 17 Sep 2025 13:46:02 +0800 Subject: [PATCH 2134/3206] cnic: Fix use-after-free bugs in cnic_delete_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original code uses cancel_delayed_work() in cnic_cm_stop_bnx2x_hw(), which does not guarantee that the delayed work item 'delete_task' has fully completed if it was already running. Additionally, the delayed work item is cyclic, and the flush_workqueue() in cnic_cm_stop_bnx2x_hw() only blocks and waits for work items that were already queued to the workqueue prior to its invocation. Any work items submitted after flush_workqueue() is called are not included in the set of tasks that the flush operation awaits. This means that after the cyclic work items have finished executing, a delayed work item may still exist in the workqueue. This leads to use-after-free scenarios where the cnic_dev is deallocated by cnic_free_dev(), while delete_task remains active and attempts to dereference cnic_dev in cnic_delete_task(). A typical race condition is illustrated below:

CPU 0 (cleanup)             | CPU 1 (delayed work callback)
cnic_netdev_event()         |
cnic_stop_hw()              |   cnic_delete_task()
cnic_cm_stop_bnx2x_hw()     |   ...
cancel_delayed_work()       |   /* the queue_delayed_work()
flush_workqueue()           |      executes after flush_workqueue()*/
                            |   queue_delayed_work()
cnic_free_dev(dev)//free    |   cnic_delete_task() //new instance
                            |     dev = cp->dev; //use

Replace cancel_delayed_work() with cancel_delayed_work_sync() to ensure that the cyclic delayed work item is properly canceled and that any ongoing execution of the work item completes before the cnic_dev is deallocated. Furthermore, since cancel_delayed_work_sync() uses __flush_work(work, true) to synchronously wait for any currently executing instance of the work item to finish, the flush_workqueue() becomes redundant and should be removed. This bug was identified through static analysis. To reproduce the issue and validate the fix, I simulated the cnic PCI device in QEMU and introduced intentional delays, such as inserting calls to ssleep() within the cnic_delete_task() function, to increase the likelihood of triggering the bug.
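The teardown rule the fix follows can be sketched generically (illustrative only: 'struct obj' and obj_destroy() are invented, while the workqueue API names are the real kernel ones):

  #include <linux/workqueue.h>
  #include <linux/slab.h>

  struct obj {
          struct delayed_work task;  /* self-rearming work item */
  };

  static void obj_destroy(struct obj *o)
  {
          /*
           * cancel_delayed_work() returns without waiting for a callback
           * that is already running, and a self-rearming callback can
           * re-queue itself after flush_workqueue() returns. The _sync
           * variant cancels any pending timer AND waits for a running
           * instance, so no callback can touch *o once it returns.
           */
          cancel_delayed_work_sync(&o->task);
          kfree(o);
  }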
Fixes: fdf24086f475 ("cnic: Defer iscsi connection cleanup") Signed-off-by: Duoming Zhou Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/cnic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index a9040c42d2ff97..6e97a5a7daaf9c 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -4230,8 +4230,7 @@ static void cnic_cm_stop_bnx2x_hw(struct cnic_dev *dev) cnic_bnx2x_delete_wait(dev, 0); - cancel_delayed_work(&cp->delete_task); - flush_workqueue(cnic_wq); + cancel_delayed_work_sync(&cp->delete_task); if (atomic_read(&cp->iscsi_conn) != 0) netdev_warn(dev->netdev, "%d iSCSI connections not destroyed\n", From f8b4687151021db61841af983f1cb7be6915d4ef Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Wed, 17 Sep 2025 14:38:53 +0800 Subject: [PATCH 2135/3206] octeontx2-pf: Fix use-after-free bugs in otx2_sync_tstamp() The original code relies on cancel_delayed_work() in otx2_ptp_destroy(), which does not ensure that the delayed work item synctstamp_work has fully completed if it was already running. This leads to use-after-free scenarios where otx2_ptp is deallocated by otx2_ptp_destroy(), while synctstamp_work remains active and attempts to dereference otx2_ptp in otx2_sync_tstamp(). Furthermore, since synctstamp_work is cyclic, the likelihood of triggering the bug is non-negligible. A typical race condition is illustrated below:

CPU 0 (cleanup)         | CPU 1 (delayed work callback)
otx2_remove()           |
otx2_ptp_destroy()      |   otx2_sync_tstamp()
cancel_delayed_work()   |
kfree(ptp)              |
                        |   ptp = container_of(...); //UAF
                        |   ptp-> //UAF

This is confirmed by a KASAN report:

BUG: KASAN: slab-use-after-free in __run_timer_base.part.0+0x7d7/0x8c0
Write of size 8 at addr ffff88800aa09a18 by task bash/136
...
Call Trace:
 dump_stack_lvl+0x55/0x70
 print_report+0xcf/0x610
 ? __run_timer_base.part.0+0x7d7/0x8c0
 kasan_report+0xb8/0xf0
 ? __run_timer_base.part.0+0x7d7/0x8c0
 __run_timer_base.part.0+0x7d7/0x8c0
 ? __pfx___run_timer_base.part.0+0x10/0x10
 ? __pfx_read_tsc+0x10/0x10
 ? ktime_get+0x60/0x140
 ? lapic_next_event+0x11/0x20
 ? clockevents_program_event+0x1d4/0x2a0
 run_timer_softirq+0xd1/0x190
 handle_softirqs+0x16a/0x550
 irq_exit_rcu+0xaf/0xe0
 sysvec_apic_timer_interrupt+0x70/0x80
...

Allocated by task 1:
 kasan_save_stack+0x24/0x50
 kasan_save_track+0x14/0x30
 __kasan_kmalloc+0x7f/0x90
 otx2_ptp_init+0xb1/0x860
 otx2_probe+0x4eb/0xc30
 local_pci_probe+0xdc/0x190
 pci_device_probe+0x2fe/0x470
 really_probe+0x1ca/0x5c0
 __driver_probe_device+0x248/0x310
 driver_probe_device+0x44/0x120
 __driver_attach+0xd2/0x310
 bus_for_each_dev+0xed/0x170
 bus_add_driver+0x208/0x500
 driver_register+0x132/0x460
 do_one_initcall+0x89/0x300
 kernel_init_freeable+0x40d/0x720
 kernel_init+0x1a/0x150
 ret_from_fork+0x10c/0x1a0
 ret_from_fork_asm+0x1a/0x30

Freed by task 136:
 kasan_save_stack+0x24/0x50
 kasan_save_track+0x14/0x30
 kasan_save_free_info+0x3a/0x60
 __kasan_slab_free+0x3f/0x50
 kfree+0x137/0x370
 otx2_ptp_destroy+0x38/0x80
 otx2_remove+0x10d/0x4c0
 pci_device_remove+0xa6/0x1d0
 device_release_driver_internal+0xf8/0x210
 pci_stop_bus_device+0x105/0x150
 pci_stop_and_remove_bus_device_locked+0x15/0x30
 remove_store+0xcc/0xe0
 kernfs_fop_write_iter+0x2c3/0x440
 vfs_write+0x871/0xd70
 ksys_write+0xee/0x1c0
 do_syscall_64+0xac/0x280
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
...
Replace cancel_delayed_work() with cancel_delayed_work_sync() to ensure that the delayed work item is properly canceled before the otx2_ptp is deallocated. This bug was initially identified through static analysis. To reproduce and test it, I simulated the OcteonTX2 PCI device in QEMU and introduced artificial delays within the otx2_sync_tstamp() function to increase the likelihood of triggering the bug. Fixes: 2958d17a8984 ("octeontx2-pf: Add support for ptp 1-step mode on CN10K silicon") Signed-off-by: Duoming Zhou Reviewed-by: Vadim Fedorenko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c index e52cc6b1a26cc8..dedd586ed3108e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c @@ -491,7 +491,7 @@ void otx2_ptp_destroy(struct otx2_nic *pfvf) if (!ptp) return; - cancel_delayed_work(&pfvf->ptp->synctstamp_work); + cancel_delayed_work_sync(&pfvf->ptp->synctstamp_work); ptp_clock_unregister(ptp->ptp_clock); kfree(ptp); From d5409ebf46bb0735ba26c64d7a9c80dde3f94eb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Sep 2025 07:34:33 -0700 Subject: [PATCH 2136/3206] xfs: remove xfs_errortag_get xfs_errortag_get is only called by xfs_errortag_attr_show, which does not need to validate the error tag, because it can only be called on valid error tags that had a sysfs attribute registered. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_error.c | 16 ++-------------- fs/xfs/xfs_error.h | 1 - 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index dbd87e1376943a..45a43e47ce926a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -118,10 +118,9 @@ xfs_errortag_attr_show( char *buf) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; - return snprintf(buf, PAGE_SIZE, "%u\n", - xfs_errortag_get(mp, xfs_attr->tag)); + return snprintf(buf, PAGE_SIZE, "%u\n", mp->m_errortag[error_tag]); } static const struct sysfs_ops xfs_errortag_sysfs_ops = { @@ -326,17 +325,6 @@ xfs_errortag_test( return true; } -int -xfs_errortag_get( - struct xfs_mount *mp, - unsigned int error_tag) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - return mp->m_errortag[error_tag]; -} - int xfs_errortag_set( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0b9c5ba8a5981a..3aeb03001acfcc 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -58,7 +58,6 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); mdelay((mp)->m_errortag[(tag)]); \ } while (0) -extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, unsigned int tag_value); extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); From 991dcadaddcced38c5d2da656862d94a1fc9e6e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Sep 2025 07:34:34 -0700 Subject: [PATCH 2137/3206] xfs: remove xfs_errortag_set xfs_errortag_set is only called by xfs_errortag_attr_store, which does not need to validate the error tag, because it can only be called on valid error tags that had a sysfs attribute
registered. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_error.c | 29 ++++++----------------------- fs/xfs/xfs_error.h | 3 --- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 45a43e47ce926a..fac35ff3da65ca 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -93,21 +93,18 @@ xfs_errortag_attr_store( size_t count) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; int ret; - unsigned int val; if (strcmp(buf, "default") == 0) { - val = xfs_errortag_random_default[xfs_attr->tag]; + mp->m_errortag[error_tag] = + xfs_errortag_random_default[error_tag]; } else { - ret = kstrtouint(buf, 0, &val); + ret = kstrtouint(buf, 0, &mp->m_errortag[error_tag]); if (ret) return ret; } - ret = xfs_errortag_set(mp, xfs_attr->tag, val); - if (ret) - return ret; return count; } @@ -325,19 +322,6 @@ xfs_errortag_test( return true; } -int -xfs_errortag_set( - struct xfs_mount *mp, - unsigned int error_tag, - unsigned int tag_value) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - mp->m_errortag[error_tag] = tag_value; - return 0; -} - int xfs_errortag_add( struct xfs_mount *mp, @@ -347,9 +331,8 @@ xfs_errortag_add( if (!xfs_errortag_valid(error_tag)) return -EINVAL; - - return xfs_errortag_set(mp, error_tag, - xfs_errortag_random_default[error_tag]); + mp->m_errortag[error_tag] = xfs_errortag_random_default[error_tag]; + return 0; } int diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 3aeb03001acfcc..fd60a008f9d22d 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -58,8 +58,6 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); mdelay((mp)->m_errortag[(tag)]); \ } while (0) -extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, - unsigned int tag_value); extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); extern int xfs_errortag_clearall(struct xfs_mount *mp); #else @@ -67,7 +65,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define xfs_errortag_del(mp) #define XFS_TEST_ERROR(expr, mp, tag) (expr) #define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) -#define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) #endif /* DEBUG */ From 807df3227d7674d7957c576551d552acf15bb96f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Sep 2025 07:34:35 -0700 Subject: [PATCH 2138/3206] xfs: remove the expr argument to XFS_TEST_ERROR Don't pass expr to XFS_TEST_ERROR. Most calls pass a constant false, and the places that do pass an expression become cleaner by moving it out. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_ag_resv.c | 7 +++---- fs/xfs/libxfs/xfs_alloc.c | 5 ++--- fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- fs/xfs/libxfs/xfs_bmap.c | 17 ++++++++--------- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/libxfs/xfs_da_btree.c | 2 +- fs/xfs/libxfs/xfs_dir2.c | 2 +- fs/xfs/libxfs/xfs_exchmaps.c | 4 ++-- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/libxfs/xfs_inode_buf.c | 4 ++-- fs/xfs/libxfs/xfs_inode_fork.c | 3 +-- fs/xfs/libxfs/xfs_metafile.c | 2 +- fs/xfs/libxfs/xfs_refcount.c | 7 +++---- fs/xfs/libxfs/xfs_rmap.c | 2 +- fs/xfs/libxfs/xfs_rtbitmap.c | 2 +- fs/xfs/scrub/cow_repair.c | 4 ++-- fs/xfs/scrub/repair.c | 2 +- fs/xfs/xfs_attr_item.c | 2 +- fs/xfs/xfs_buf.c | 4 ++-- fs/xfs/xfs_error.c | 5 ++--- fs/xfs/xfs_error.h | 10 +++++----- fs/xfs/xfs_inode.c | 28 +++++++++++++--------------- fs/xfs/xfs_iomap.c | 4 ++-- fs/xfs/xfs_log.c | 8 ++++---- fs/xfs/xfs_trans_ail.c | 2 +- 25 files changed, 62 insertions(+), 70 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fb79215a509d21..8ac8230c3d3cc2 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -92,9 +92,8 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. */ - return XFS_TEST_ERROR(avail < orig / 10 || - avail < mp->m_agbtree_maxlevels, - mp, XFS_ERRTAG_AG_RESV_CRITICAL); + return avail < orig / 10 || avail < mp->m_agbtree_maxlevels || + XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_CRITICAL); } /* @@ -203,7 +202,7 @@ __xfs_ag_resv_init( return -EINVAL; } - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_FAIL)) error = -ENOSPC; else error = xfs_dec_fdblocks(mp, hidden_space, true); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 000cc7f4a3ce50..ad381c73abc4de 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3321,7 +3321,7 @@ xfs_agf_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agf_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } @@ -4019,8 +4019,7 @@ __xfs_free_extent( ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_free_extent_fix_freelist(tp, pag, &agbp); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 47213e6023dd4b..91c1b30ebaab31 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1212,7 +1212,7 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { error = -EIO; goto out; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 80bdb537fcf783..53ef4b7e504d62 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3654,8 +3654,7 @@ xfs_bmap_btalloc( /* Trim the allocation back to the maximum an AG can fit. 
*/ args.maxlen = min(ap->length, mp->m_ag_max_usable); - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + if (unlikely(XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) error = xfs_bmap_exact_minlen_extent_alloc(ap, &args); else if ((ap->datatype & XFS_ALLOC_USERDATA) && xfs_inode_is_filestream(ap->ip)) @@ -3841,7 +3840,7 @@ xfs_bmapi_read( } if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4192,7 +4191,7 @@ xfs_bmapi_write( (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4537,7 +4536,7 @@ xfs_bmapi_remap( (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5671,7 +5670,7 @@ xfs_bmap_collapse_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5787,7 +5786,7 @@ xfs_bmap_insert_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5892,7 +5891,7 @@ xfs_bmap_split_extent( int i = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -6057,7 +6056,7 @@ xfs_bmap_finish_one( trace_xfs_bmap_deferred(bi); - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (bi->bi_type) { diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a61211d253f1ca..dbe9df8c33004e 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -306,7 +306,7 @@ xfs_btree_check_block( fa = __xfs_btree_check_block(cur, block, level, bp); if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { + XFS_TEST_ERROR(mp, xfs_btree_block_errtag(cur))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); xfs_btree_mark_sick(cur); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 723a0643b8386c..90f7fc219fccc8 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -565,7 +565,7 @@ xfs_da3_split( trace_xfs_da_split(state->args); - if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + if (XFS_TEST_ERROR(state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 1775abcfa04d61..82a338458a5179 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -223,7 +223,7 @@ xfs_dir_ino_validate( bool ino_ok = xfs_verify_dir_ino(mp, ino); if (XFS_IS_CORRUPT(mp, !ino_ok) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { + XFS_TEST_ERROR(mp, 
XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); return -EFSCORRUPTED; diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c index 3f1d6a98c11819..932ee4619e9ea6 100644 --- a/fs/xfs/libxfs/xfs_exchmaps.c +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -616,7 +616,7 @@ xfs_exchmaps_finish_one( return error; } - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) return -EIO; /* If we still have work to do, ask for a new transaction. */ @@ -882,7 +882,7 @@ xmi_ensure_delta_nextents( &new_nextents)) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && new_nextents > 10) return -EFBIG; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 5fefdd4fe75dbd..d97295eaebe631 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2706,7 +2706,7 @@ xfs_agi_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agi_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index aa13fc00afd707..b1812b2c3ccece 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -61,8 +61,8 @@ xfs_inode_buf_verify( di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP))) { + if (unlikely(!di_ok || + XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; xfs_buf_ioerror(bp, -EIO); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 4f99b90add5526..1772d82f2d68b6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -756,8 +756,7 @@ xfs_iext_count_extend( if (nr_exts < ifp->if_nextents) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && - nr_exts > 10) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && nr_exts > 10) return -EFBIG; if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) { diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 225923e463c41e..b02e3d6c0868c6 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -121,7 +121,7 @@ xfs_metafile_resv_critical( div_u64(mp->m_metafile_resv_target, 10))) return true; - return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + return XFS_TEST_ERROR(mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); } /* Allocate a block from the metadata file's reservation. */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 8977840374836e..2484dc9f6d7ecc 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1113,8 +1113,7 @@ xfs_refcount_still_have_space( * refcount continue update "error" has been injected. 
*/ if (cur->bc_refc.nr_ops > 2 && - XFS_TEST_ERROR(false, cur->bc_mp, - XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) + XFS_TEST_ERROR(cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; if (cur->bc_refc.nr_ops == 0) @@ -1398,7 +1397,7 @@ xfs_refcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* @@ -1511,7 +1510,7 @@ xfs_rtrefcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 3cdf50563fecb9..83e0488ff77364 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2690,7 +2690,7 @@ xfs_rmap_finish_one( trace_xfs_rmap_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5057536e586ca4..618061d898d4a8 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1067,7 +1067,7 @@ xfs_rtfree_extent( ASSERT(rbmip->i_itemp != NULL); xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_rtcheck_alloc_range(&args, start, len); diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 38a246b8bf11c9..b2a83801412e91 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -300,7 +300,7 @@ xrep_cow_find_bad( * on the debugging knob, replace everything in the CoW fork. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); if (error) @@ -385,7 +385,7 @@ xrep_cow_find_bad_rt( * CoW fork and then scan for staging extents in the refcountbt. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); if (error) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index d00c18954a26b8..efd5a7ccdf624a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1110,7 +1110,7 @@ xrep_will_attempt( return true; /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + if (XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) return true; /* Metadata is corrupt or failed cross-referencing. 
*/ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5eef3bc30bda1a..c3a593319bee71 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -491,7 +491,7 @@ xfs_attr_finish_item( /* Reset trans after EAGAIN cycle since the transaction is new */ args->trans = tp; - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + if (XFS_TEST_ERROR(args->dp->i_mount, XFS_ERRTAG_LARP)) { error = -EIO; goto out; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f9ef3b2a332a6f..8360e77b3215dd 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1299,7 +1299,7 @@ xfs_buf_bio_end_io( if (bio->bi_status) xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); if (bp->b_flags & XBF_ASYNC) { @@ -2084,7 +2084,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) * This allows userspace to disrupt buffer caching for debug/testing * purposes. */ - if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) + if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) lru_ref = 0; atomic_set(&bp->b_lru_ref, lru_ref); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index fac35ff3da65ca..44dd8aba00979c 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -291,7 +291,6 @@ xfs_errortag_enabled( bool xfs_errortag_test( struct xfs_mount *mp, - const char *expression, const char *file, int line, unsigned int error_tag) @@ -317,8 +316,8 @@ xfs_errortag_test( return false; xfs_warn_ratelimited(mp, -"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", - expression, file, line, mp->m_super->s_id); +"Injecting error at file %s, line %d, on filesystem \"%s\"", + file, line, mp->m_super->s_id); return true; } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index fd60a008f9d22d..8429c1ee86e793 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -41,10 +41,10 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, #ifdef DEBUG extern int xfs_errortag_init(struct xfs_mount *mp); extern void xfs_errortag_del(struct xfs_mount *mp); -extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, - const char *file, int line, unsigned int error_tag); -#define XFS_TEST_ERROR(expr, mp, tag) \ - ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line, + unsigned int error_tag); +#define XFS_TEST_ERROR(mp, tag) \ + xfs_errortag_test((mp), __FILE__, __LINE__, (tag)) bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); #define XFS_ERRORTAG_DELAY(mp, tag) \ do { \ @@ -63,7 +63,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #else #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) -#define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define XFS_TEST_ERROR(mp, tag) (false) #define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0ddb9ce0f5e36f..2bfe87b09c2ada 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2381,8 +2381,8 @@ xfs_iflush( * error handling as the caller will shutdown and fail the buffer. 
*/ error = -EFSCORRUPTED; - if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), - mp, XFS_ERRTAG_IFLUSH_1)) { + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); @@ -2398,29 +2398,27 @@ xfs_iflush( goto flush_out; } } else if (S_ISREG(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE, - mp, XFS_ERRTAG_IFLUSH_3)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE && - ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, - mp, XFS_ERRTAG_IFLUSH_4)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > - ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { + if (ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > + ip->i_nblocks || XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %llu, " "total extents = %llu nblocks = %lld, ptr "PTR_FMT, @@ -2429,8 +2427,8 @@ xfs_iflush( ip->i_nblocks, ip); goto flush_out; } - if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, - mp, XFS_ERRTAG_IFLUSH_6)) { + if (ip->i_forkoff > mp->m_sb.sb_inodesize || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2a74f295734103..2570d0a6604738 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1554,7 +1554,7 @@ xfs_zoned_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; @@ -1728,7 +1728,7 @@ xfs_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2978de9da38edb..6397a45b293de7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -969,8 +969,8 @@ xfs_log_unmount_write( * counters will be recalculated. Refer to xlog_check_unmount_rec for * more details. 
*/ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { xfs_alert(mp, "%s: will fix summary counters at next mount", __func__); return; @@ -1240,7 +1240,7 @@ xlog_ioend_work( /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { + if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } @@ -1827,7 +1827,7 @@ xlog_sync( * detects the bad CRC and attempts to recover. */ #ifdef DEBUG - if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { + if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_fail_crc = true; xfs_warn(log->l_mp, diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 67c328d23e4ae9..38983c6777df31 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -374,7 +374,7 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. */ - if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; /* From b55dd72798115015908f4a17a1f8d70e8e974ab4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Sep 2025 07:34:36 -0700 Subject: [PATCH 2139/3206] xfs: remove pointless externs in xfs_error.h Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_error.h | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 8429c1ee86e793..fe6a71bbe9cde9 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -8,22 +8,17 @@ struct xfs_mount; -extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, - xfs_failaddr_t failaddr); -extern void xfs_corruption_error(const char *tag, int level, - struct xfs_mount *mp, const void *buf, size_t bufsize, - const char *filename, int linenum, - xfs_failaddr_t failaddr); +void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, xfs_failaddr_t failaddr); +void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, + const void *buf, size_t bufsize, const char *filename, + int linenum, xfs_failaddr_t failaddr); void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); -extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); -extern void xfs_verifier_error(struct xfs_buf *bp, int error, - xfs_failaddr_t failaddr); -extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); +void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); +void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); +void xfs_inode_verifier_error(struct xfs_inode *ip, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ 
xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) @@ -39,8 +34,8 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, #define XFS_CORRUPTION_DUMP_LEN (128) #ifdef DEBUG -extern int xfs_errortag_init(struct xfs_mount *mp); -extern void xfs_errortag_del(struct xfs_mount *mp); +int xfs_errortag_init(struct xfs_mount *mp); +void xfs_errortag_del(struct xfs_mount *mp); bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line, unsigned int error_tag); #define XFS_TEST_ERROR(mp, tag) \ @@ -58,8 +53,8 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); mdelay((mp)->m_errortag[(tag)]); \ } while (0) -extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); -extern int xfs_errortag_clearall(struct xfs_mount *mp); +int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); +int xfs_errortag_clearall(struct xfs_mount *mp); #else #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) From 71fa062196ae3abab790c91f1bdf09dcdc6fb1fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Sep 2025 07:34:37 -0700 Subject: [PATCH 2140/3206] xfs: centralize error tag definitions Right now 5 places in the kernel and one in xfsprogs need to be updated for each new error tag. Add a bit of macro magic so that only the error tag definition and a single table, which reside next to each other, need to be updated. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_errortag.h | 114 ++++++++++++++---------- fs/xfs/xfs_error.c | 166 +++++------------------------------ 2 files changed, 87 insertions(+), 193 deletions(-) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a53c5d40e084dc..de840abc0bcd44 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -4,14 +4,22 @@ * Copyright (C) 2017 Oracle. * All Rights Reserved. */ -#ifndef __XFS_ERRORTAG_H_ +#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG) #define __XFS_ERRORTAG_H_ /* - * error injection tags - the labels can be anything you want - but each tag should have its own unique number + * There are two ways to use this header file. The first way is to #include it + * bare, which will define all the XFS_ERRTAG_* error injection knobs for use + * with the XFS_TEST_ERROR macro. The second way is to enclose the #include + * with a #define for an XFS_ERRTAG macro, in which case the header will define + * an XFS_ERRTAGS macro that expands to invoke that XFS_ERRTAG macro for each + * defined error injection knob. */ /* + * These are the actual error injection tags. The numbers should be consecutive + * because arrays are sized based on the maximum. */ #define XFS_ERRTAG_NOERROR 0 #define XFS_ERRTAG_IFLUSH_1 1 #define XFS_ERRTAG_IFLUSH_2 2 @@ -71,49 +79,61 @@ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
*/ #define XFS_RANDOM_DEFAULT 100 -#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) -#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#define XFS_RANDOM_FREE_EXTENT 1 -#define XFS_RANDOM_RMAP_FINISH_ONE 1 -#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 -#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 -#define XFS_RANDOM_BMAP_FINISH_ONE 1 -#define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_LOG_BAD_CRC 1 -#define XFS_RANDOM_LOG_ITEM_PIN 1 -#define XFS_RANDOM_BUF_LRU_REF 2 -#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 -#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 -#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT -#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 -#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 -#define XFS_RANDOM_AG_RESV_FAIL 1 -#define XFS_RANDOM_LARP 1 -#define XFS_RANDOM_DA_LEAF_SPLIT 1 -#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 -#define XFS_RANDOM_WB_DELAY_MS 3000 -#define XFS_RANDOM_WRITE_DELAY_MS 3000 -#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 -#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 + +/* + * Table of error injection knobs. The parameters to the XFS_ERRTAG macro are: + * 1. The XFS_ERRTAG_ flag but without the prefix; + * 2. The name of the sysfs knob; and + * 3. The default value for the knob.
+ */ +#ifdef XFS_ERRTAG +# undef XFS_ERRTAGS +# define XFS_ERRTAGS \ +XFS_ERRTAG(NOERROR, noerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_2, iflush2, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_3, iflush3, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_4, iflush4, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_5, iflush5, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_6, iflush6, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DA_READ_BUF, dareadbuf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BTREE_CHECK_LBLOCK, btree_chk_lblk, XFS_RANDOM_DEFAULT/4) \ +XFS_ERRTAG(BTREE_CHECK_SBLOCK, btree_chk_sblk, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ALLOC_READ_AGF, readagf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IALLOC_READ_AGI, readagi, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ITOBP_INOTOBP, itobp, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK, iunlink, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK_REMOVE, iunlinkrm, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DIR_INO_VALIDATE, dirinovalid, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BULKSTAT_READ_CHUNK, bulkstat, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IODONE_IOERR, logiodone, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATREAD_IOERR, stratread, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATCMPL_IOERR, stratcmpl, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(DIOWRITE_IOERR, diowrite, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BMAPIFORMAT, bmapifmt, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(FREE_EXTENT, free_extent, 1) \ +XFS_ERRTAG(RMAP_FINISH_ONE, rmap_finish_one, 1) \ +XFS_ERRTAG(REFCOUNT_CONTINUE_UPDATE, refcount_continue_update, 1) \ +XFS_ERRTAG(REFCOUNT_FINISH_ONE, refcount_finish_one, 1) \ +XFS_ERRTAG(BMAP_FINISH_ONE, bmap_finish_one, 1) \ +XFS_ERRTAG(AG_RESV_CRITICAL, ag_resv_critical, 4) \ +XFS_ERRTAG(LOG_BAD_CRC, log_bad_crc, 1) \ +XFS_ERRTAG(LOG_ITEM_PIN, log_item_pin, 1) \ +XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2) \ +XFS_ERRTAG(FORCE_SCRUB_REPAIR, force_repair, 1) \ +XFS_ERRTAG(FORCE_SUMMARY_RECALC, bad_summary, 1) \ +XFS_ERRTAG(IUNLINK_FALLBACK, iunlink_fallback, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BUF_IOERROR, buf_ioerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(REDUCE_MAX_IEXTENTS, reduce_max_iextents, 1) \ +XFS_ERRTAG(BMAP_ALLOC_MINLEN_EXTENT, bmap_alloc_minlen_extent, 1) \ +XFS_ERRTAG(AG_RESV_FAIL, ag_resv_fail, 1) \ +XFS_ERRTAG(LARP, larp, 1) \ +XFS_ERRTAG(DA_LEAF_SPLIT, da_leaf_split, 1) \ +XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \ +XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ +XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ +XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ +XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) +#endif /* XFS_ERRTAG */ #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 44dd8aba00979c..ac895cd2bc0a01 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -10,61 +10,17 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" #include "xfs_inode.h" #ifdef DEBUG -static unsigned int xfs_errortag_random_default[] = { - XFS_RANDOM_DEFAULT, - XFS_RANDOM_IFLUSH_1, - XFS_RANDOM_IFLUSH_2, - XFS_RANDOM_IFLUSH_3, - XFS_RANDOM_IFLUSH_4, - XFS_RANDOM_IFLUSH_5, - XFS_RANDOM_IFLUSH_6, - XFS_RANDOM_DA_READ_BUF, - XFS_RANDOM_BTREE_CHECK_LBLOCK, - XFS_RANDOM_BTREE_CHECK_SBLOCK, - XFS_RANDOM_ALLOC_READ_AGF, - XFS_RANDOM_IALLOC_READ_AGI, - XFS_RANDOM_ITOBP_INOTOBP, - XFS_RANDOM_IUNLINK, - XFS_RANDOM_IUNLINK_REMOVE, - XFS_RANDOM_DIR_INO_VALIDATE, - XFS_RANDOM_BULKSTAT_READ_CHUNK, - XFS_RANDOM_IODONE_IOERR, - 
XFS_RANDOM_STRATREAD_IOERR, - XFS_RANDOM_STRATCMPL_IOERR, - XFS_RANDOM_DIOWRITE_IOERR, - XFS_RANDOM_BMAPIFORMAT, - XFS_RANDOM_FREE_EXTENT, - XFS_RANDOM_RMAP_FINISH_ONE, - XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE, - XFS_RANDOM_REFCOUNT_FINISH_ONE, - XFS_RANDOM_BMAP_FINISH_ONE, - XFS_RANDOM_AG_RESV_CRITICAL, - 0, /* XFS_RANDOM_DROP_WRITES has been removed */ - XFS_RANDOM_LOG_BAD_CRC, - XFS_RANDOM_LOG_ITEM_PIN, - XFS_RANDOM_BUF_LRU_REF, - XFS_RANDOM_FORCE_SCRUB_REPAIR, - XFS_RANDOM_FORCE_SUMMARY_RECALC, - XFS_RANDOM_IUNLINK_FALLBACK, - XFS_RANDOM_BUF_IOERROR, - XFS_RANDOM_REDUCE_MAX_IEXTENTS, - XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, - XFS_RANDOM_AG_RESV_FAIL, - XFS_RANDOM_LARP, - XFS_RANDOM_DA_LEAF_SPLIT, - XFS_RANDOM_ATTR_LEAF_TO_NODE, - XFS_RANDOM_WB_DELAY_MS, - XFS_RANDOM_WRITE_DELAY_MS, - XFS_RANDOM_EXCHMAPS_FINISH_ONE, - XFS_RANDOM_METAFILE_RESV_CRITICAL, -}; +#define XFS_ERRTAG(_tag, _name, _default) \ + [XFS_ERRTAG_##_tag] = (_default), +#include "xfs_errortag.h" +static unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS }; +#undef XFS_ERRTAG struct xfs_errortag_attr { struct attribute attr; @@ -125,110 +81,28 @@ static const struct sysfs_ops xfs_errortag_sysfs_ops = { .store = xfs_errortag_attr_store, }; -#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \ +#define XFS_ERRTAG(_tag, _name, _default) \ static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \ - .tag = (_tag), \ -} - -#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr - -XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR); -XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1); -XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2); -XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3); -XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4); -XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5); -XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6); -XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF); -XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK); -XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK); -XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF); -XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI); -XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP); -XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK); -XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE); -XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE); -XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK); -XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR); -XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR); -XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR); -XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR); -XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT); -XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT); -XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE); -XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); -XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); -XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); -XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); 
-XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); -XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); -XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); -XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); -XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); -XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); -XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); -XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); -XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); -XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + .tag = XFS_ERRTAG_##_tag, \ +}; +#include "xfs_errortag.h" +XFS_ERRTAGS +#undef XFS_ERRTAG +#define XFS_ERRTAG(_tag, _name, _default) \ + &xfs_errortag_attr_##_name.attr, +#include "xfs_errortag.h" static struct attribute *xfs_errortag_attrs[] = { - XFS_ERRORTAG_ATTR_LIST(noerror), - XFS_ERRORTAG_ATTR_LIST(iflush1), - XFS_ERRORTAG_ATTR_LIST(iflush2), - XFS_ERRORTAG_ATTR_LIST(iflush3), - XFS_ERRORTAG_ATTR_LIST(iflush4), - XFS_ERRORTAG_ATTR_LIST(iflush5), - XFS_ERRORTAG_ATTR_LIST(iflush6), - XFS_ERRORTAG_ATTR_LIST(dareadbuf), - XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk), - XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk), - XFS_ERRORTAG_ATTR_LIST(readagf), - XFS_ERRORTAG_ATTR_LIST(readagi), - XFS_ERRORTAG_ATTR_LIST(itobp), - XFS_ERRORTAG_ATTR_LIST(iunlink), - XFS_ERRORTAG_ATTR_LIST(iunlinkrm), - XFS_ERRORTAG_ATTR_LIST(dirinovalid), - XFS_ERRORTAG_ATTR_LIST(bulkstat), - XFS_ERRORTAG_ATTR_LIST(logiodone), - XFS_ERRORTAG_ATTR_LIST(stratread), - XFS_ERRORTAG_ATTR_LIST(stratcmpl), - XFS_ERRORTAG_ATTR_LIST(diowrite), - XFS_ERRORTAG_ATTR_LIST(bmapifmt), - XFS_ERRORTAG_ATTR_LIST(free_extent), - XFS_ERRORTAG_ATTR_LIST(rmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(refcount_continue_update), - XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), - XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(log_bad_crc), - XFS_ERRORTAG_ATTR_LIST(log_item_pin), - XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), - XFS_ERRORTAG_ATTR_LIST(force_repair), - XFS_ERRORTAG_ATTR_LIST(bad_summary), - XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), - XFS_ERRORTAG_ATTR_LIST(buf_ioerror), - XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), - XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), - XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), - XFS_ERRORTAG_ATTR_LIST(larp), - XFS_ERRORTAG_ATTR_LIST(da_leaf_split), - XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), - XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), - XFS_ERRORTAG_ATTR_LIST(write_delay_ms), - XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), - XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit), - NULL, + XFS_ERRTAGS + NULL }; ATTRIBUTE_GROUPS(xfs_errortag); +#undef XFS_ERRTAG + +/* -1 because XFS_ERRTAG_DROP_WRITES got removed, + 1 for NULL termination */ +static_assert(ARRAY_SIZE(xfs_errortag_attrs) == XFS_ERRTAG_MAX); static const struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, From 8e1cfa51320da0cf599d286c89db043f329ca6b0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 18 Sep 2025 22:01:10 +0900 Subject: [PATCH 2141/3206] xfs: improve zone statistics message Reword the information message displayed in xfs_mount_zones() indicating the total zone count and maximum number of open zones. 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 23a027387933d5..f152b218200488 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1244,7 +1244,7 @@ xfs_mount_zones( if (!mp->m_zone_info) return -ENOMEM; - xfs_info(mp, "%u zones of %u blocks size (%u max open)", + xfs_info(mp, "%u zones of %u blocks (%u max open zones)", mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); From ff3d90903f8f525eedb26efe6fea03c39476cb69 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 18 Sep 2025 22:01:11 +0900 Subject: [PATCH 2142/3206] xfs: improve default maximum number of open zones For regular block devices using the zoned allocator, the default maximum number of open zones is set to 1/4 of the number of realtime groups. For a large capacity device, this leads to a very large limit. E.g. with a 26 TB HDD: mount /dev/sdb /mnt ... XFS (sdb): 95836 zones of 65536 blocks size (23959 max open) In turn, such a large limit on the number of open zones can lead, depending on the workload, to a very large number of concurrent write streams, which devices generally do not handle well, leading to poor performance. Introduce the default limit XFS_DEFAULT_MAX_OPEN_ZONES, defined as 128 to match the hardware limit of most SMR HDDs available today, and use this limit to set mp->m_max_open_zones in xfs_calc_open_zones() instead of calling xfs_max_open_zones(), when the user did not specify a limit with the max_open_zones mount option. For the 26 TB HDD example, we now get: mount /dev/sdb /mnt ... XFS (sdb): 95836 zones of 65536 blocks (128 max open zones) This change does not prevent the user from specifying a larger number for the open zones limit. E.g. mount -o max_open_zones=4096 /dev/sdb /mnt ... XFS (sdb): 95836 zones of 65536 blocks (4096 max open zones) Finally, since xfs_calc_open_zones() checks and caps the mp->m_max_open_zones limit against the value calculated by xfs_max_open_zones() for any type of device, this new default limit does not increase m_max_open_zones for small capacity devices. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_zones.h | 7 +++++++ fs/xfs/xfs_zone_alloc.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h index c4f1367b2cca4d..5fefd132e002ee 100644 --- a/fs/xfs/libxfs/xfs_zones.h +++ b/fs/xfs/libxfs/xfs_zones.h @@ -29,6 +29,13 @@ struct xfs_rtgroup; #define XFS_OPEN_GC_ZONES 1U #define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) +/* + * For zoned devices that do not have a limit on the number of open zones, and + * for regular devices using the zoned allocator, use the most common SMR disks + * limit (128) as the default limit on the number of open zones.
+ */ +#define XFS_DEFAULT_MAX_OPEN_ZONES 128 + bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, xfs_rgblock_t *write_pointer); diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index f152b218200488..1147bacb2da8e6 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1131,7 +1131,7 @@ xfs_calc_open_zones( if (bdev_open_zones) mp->m_max_open_zones = bdev_open_zones; else - mp->m_max_open_zones = xfs_max_open_zones(mp); + mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES; } if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { From 3539b1467e94336d5854ebf976d9627bfb65d6c3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 18 Sep 2025 10:21:14 -0600 Subject: [PATCH 2143/3206] io_uring: include dying ring in task_work "should cancel" state When running task_work for an exiting task, rather than perform the issue retry attempt, the task_work is canceled. However, this isn't done for a ring that has been closed. This can lead to requests being successfully completed post the ring being closed, which is somewhat confusing and surprising to an application. Rather than just check the task exit state, also include the ring ref state in deciding whether or not to terminate a given request when run from task_work. Cc: stable@vger.kernel.org # 6.1+ Link: https://github.com/axboe/liburing/discussions/1459 Reported-by: Benedek Thaler Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ++++-- io_uring/io_uring.h | 4 ++-- io_uring/poll.c | 2 +- io_uring/timeout.c | 2 +- io_uring/uring_cmd.c | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 93633613a1657c..bcec12256f3407 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1406,8 +1406,10 @@ static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, tw); - if (unlikely(io_should_terminate_tw())) + struct io_ring_ctx *ctx = req->ctx; + + io_tw_lock(ctx, tw); + if (unlikely(io_should_terminate_tw(ctx))) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index abc6de227f74d2..1880902be6fd72 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -476,9 +476,9 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) * 2) PF_KTHREAD is set, in which case the invoker of the task_work is * our fallback task_work. 
*/ -static inline bool io_should_terminate_tw(void) +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) { - return current->flags & (PF_KTHREAD | PF_EXITING); + return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs); } static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) diff --git a/io_uring/poll.c b/io_uring/poll.c index c786e587563b05..6090a26975d400 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -224,7 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; - if (unlikely(io_should_terminate_tw())) + if (unlikely(io_should_terminate_tw(req->ctx))) return -ECANCELED; do { diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7f13bfa9f2b617..17e3aab0af3676 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -324,7 +324,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) int ret; if (prev) { - if (!io_should_terminate_tw()) { + if (!io_should_terminate_tw(req->ctx)) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 053bac89b6c0fc..213716e10d704a 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -118,7 +118,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; - if (io_should_terminate_tw()) + if (io_should_terminate_tw(req->ctx)) flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deferred list completion */ From 52b49bd6de29a89a040fa33e980270904c734d69 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 18 Sep 2025 15:11:31 +0100 Subject: [PATCH 2144/3206] arm64: cputype: Remove duplicate Cortex-X1C definitions We currently have duplicate definitions for ARM_CPU_PART_CORTEX_X1C and MIDR_CORTEX_X1C as a result of commits: 58d245e03c324d08 ("arm64: cputype: Add Cortex-X1C definitions") efe676a1a7554219 ("arm64: proton-pack: Add new CPUs 'k' values for branch mitigation") Due to inconsistent sorting when adding entries, there was no textual conflict between the two patches. Delete the duplicate definitions added by the latter commit. The definitions in general are largely (but not entirely) in order of the MIDR_EL1.PartNum value rather than by CPU name, and the remaining Cortex-X1C definitions appear later in the list. For now I haven't sorted the remaining MIDR definitions to minimize churn. I intend to perform some larger cleanup of these in the near future which should supersede that anyhow.
Signed-off-by: Mark Rutland Cc: James Morse Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index b10eba7f52476b..98bedc3706eabb 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -81,7 +81,6 @@ #define ARM_CPU_PART_CORTEX_A78AE 0xD42 #define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 -#define ARM_CPU_PART_CORTEX_X1C 0xD4C #define ARM_CPU_PART_CORTEX_A520 0xD80 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_A715 0xD4D @@ -171,7 +170,6 @@ #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) #define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) -#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C) #define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715) From ac6772e8bcdaaaf3605e306859b54d821efef7fd Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 18 Sep 2025 19:06:02 +0200 Subject: [PATCH 2145/3206] sched_ext: Add migration-disabled counter to error state dump Include the task's migration-disabled counter when dumping task state during an error exit. This can help diagnose cases where tasks can get stuck, because they're unable to migrate elsewhere. tj: s/nomig/no_mig/ for readability and consistency with other keys. Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 477eccf0233888..f5873f8ed669a3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4167,7 +4167,8 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", p->scx.dsq_vtime, p->scx.slice, p->scx.weight); - dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); + dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), + p->migration_disabled); if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); From 2ade36eaa9ac05e4913e9785df19c2cde8f912fb Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 17 Sep 2025 12:42:09 -0400 Subject: [PATCH 2146/3206] drm/amdkfd: add proper handling for S0ix When in S0i3, the GFX state is retained, so all we need to do is stop the runlist so GFX can enter gfxoff. 
Reviewed-by: Mario Limonciello (AMD) Tested-by: David Perry Signed-off-by: Alex Deucher (cherry picked from commit 4bfa8609934dbf39bbe6e75b4f971469384b50b1) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 16 +++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 12 ++++++++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 36 ++++++++++++++++++++++ 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index fbe7616555c83f..a2879d2b7c8ec1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -250,16 +250,24 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev, void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc) { - if (adev->kfd.dev) - kgd2kfd_suspend(adev->kfd.dev, suspend_proc); + if (adev->kfd.dev) { + if (adev->in_s0ix) + kgd2kfd_stop_sched_all_nodes(adev->kfd.dev); + else + kgd2kfd_suspend(adev->kfd.dev, suspend_proc); + } } int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool resume_proc) { int r = 0; - if (adev->kfd.dev) - r = kgd2kfd_resume(adev->kfd.dev, resume_proc); + if (adev->kfd.dev) { + if (adev->in_s0ix) + r = kgd2kfd_start_sched_all_nodes(adev->kfd.dev); + else + r = kgd2kfd_resume(adev->kfd.dev, resume_proc); + } return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 33eb4826b58b1a..aa88bad7416bf4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -426,7 +426,9 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask); int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd); void kgd2kfd_unlock_kfd(struct kfd_dev *kfd); int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id); +int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd); int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id); +int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd); bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id); bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry, bool retry_fault); @@ -516,11 +518,21 @@ static inline int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) return 0; } +static inline int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd) +{ + return 0; +} + static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) { return 0; } +static inline int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd) +{ + return 0; +} + static inline bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) { return false; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 7e749f9b6d69da..349c351e242b59 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1550,6 +1550,25 @@ int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) return ret; } +int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd) +{ + struct kfd_node *node; + int i, r; + + if (!kfd->init_complete) + return 0; + + for (i = 0; i < kfd->num_nodes; i++) { + node = kfd->nodes[i]; + r = node->dqm->ops.unhalt(node->dqm); + if (r) { + dev_err(kfd_device, "Error in starting scheduler\n"); + return r; + } + } + return 0; +} + int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) { struct kfd_node *node; @@ -1567,6 +1586,23 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) 
return node->dqm->ops.halt(node->dqm); } +int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd) +{ + struct kfd_node *node; + int i, r; + + if (!kfd->init_complete) + return 0; + + for (i = 0; i < kfd->num_nodes; i++) { + node = kfd->nodes[i]; + r = node->dqm->ops.halt(node->dqm); + if (r) + return r; + } + return 0; +} + bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) { struct kfd_node *node; From 9272bb34b066993f5f468b219b4a26ba3f2b25a1 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 17 Sep 2025 12:42:11 -0400 Subject: [PATCH 2147/3206] drm/amdgpu: suspend KFD and KGD user queues for S0ix We need to make sure the user queues are preempted so GFX can enter gfxoff. Reviewed-by: Mario Limonciello (AMD) Tested-by: David Perry Signed-off-by: Alex Deucher (cherry picked from commit f8b367e6fa1716cab7cc232b9e3dff29187fc99d) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 +++++++++------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 01d234cf815647..c8459337fcb898 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5136,7 +5136,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) adev->in_suspend = true; if (amdgpu_sriov_vf(adev)) { - if (!adev->in_s0ix && !adev->in_runpm) + if (!adev->in_runpm) amdgpu_amdkfd_suspend_process(adev); amdgpu_virt_fini_data_exchange(adev); r = amdgpu_virt_request_full_gpu(adev, false); @@ -5156,10 +5156,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) amdgpu_device_ip_suspend_phase1(adev); - if (!adev->in_s0ix) { - amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); - amdgpu_userq_suspend(adev); - } + amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); + amdgpu_userq_suspend(adev); r = amdgpu_device_evict_resources(adev); if (r) @@ -5254,15 +5252,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) goto exit; } - if (!adev->in_s0ix) { - r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); - if (r) - goto exit; + r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); + if (r) + goto exit; - r = amdgpu_userq_resume(adev); - if (r) - goto exit; - } + r = amdgpu_userq_resume(adev); + if (r) + goto exit; r = amdgpu_device_ip_late_init(adev); if (r) @@ -5275,7 +5271,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) amdgpu_virt_init_data_exchange(adev); amdgpu_virt_release_full_gpu(adev, true); - if (!adev->in_s0ix && !r && !adev->in_runpm) + if (!r && !adev->in_runpm) r = amdgpu_amdkfd_resume_process(adev); } From d33c3471047fc54966621d19329e6a23ebc8ec50 Mon Sep 17 00:00:00 2001 From: Praful Adiga Date: Thu, 18 Sep 2025 12:40:18 -0400 Subject: [PATCH 2148/3206] ALSA: hda/realtek: Fix mute led for HP Laptop 15-dw4xx This laptop uses the ALC236 codec with COEF 0x7 and idx 1 to control the mute LED. Enable the existing quirk for this device. 
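As background (an illustrative sketch only, not part of this change): SND_PCI_QUIRK() keys a fixup off the PCI subsystem vendor/device IDs, so the new table row only fires for HP's 0x103c:0x89a0 variant. Conceptually:

static bool demo_quirk_matches(unsigned short ssvid, unsigned short ssdid)
{
	/* HP Laptop 15-dw4xxx: subsystem vendor 0x103c (HP), device 0x89a0 */
	return ssvid == 0x103c && ssdid == 0x89a0;
}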
Signed-off-by: Praful Adiga Cc: Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 618ec00d3f9897..f267437c96981e 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -6476,6 +6476,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8992, "HP EliteBook 845 G9", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8994, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8995, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x89a0, "HP Laptop 15-dw4xxx", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x89a4, "HP ProBook 440 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89a6, "HP ProBook 450 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89aa, "HP EliteBook 630 G9", ALC236_FIXUP_HP_GPIO_LED), From 07db1def8f0aad25612b35377b63aa20b032c86d Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Mon, 8 Sep 2025 09:04:19 -0400 Subject: [PATCH 2149/3206] MAINTAINERS: remove Alyssa Rosenzweig I'm moving on to other projects [1] and no longer wish to be copied on kernel patches. Remove my MAINTAINERS entries: both related to Apple driver support. So long and thanks for all the fish. [1] https://rosenzweig.io/blog/asahi-gpu-part-n.html Reviewed-by: Neal Gompa Signed-off-by: Alyssa Rosenzweig Signed-off-by: Sven Peter --- .get_maintainer.ignore | 1 + MAINTAINERS | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.get_maintainer.ignore b/.get_maintainer.ignore index b458815f1d1bf4..e8d2269bad9d00 100644 --- a/.get_maintainer.ignore +++ b/.get_maintainer.ignore @@ -1,5 +1,6 @@ Alan Cox Alan Cox +Alyssa Rosenzweig Christoph Hellwig Jeff Kirsher Marc Gonzalez diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..4e0b7ea9f4e709 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1845,7 +1845,6 @@ S: Odd fixes F: drivers/input/mouse/bcm5974.c APPLE PCIE CONTROLLER DRIVER -M: Alyssa Rosenzweig M: Marc Zyngier L: linux-pci@vger.kernel.org S: Maintained @@ -2364,7 +2363,6 @@ F: sound/soc/codecs/ssm3515.c ARM/APPLE MACHINE SUPPORT M: Sven Peter M: Janne Grunau -R: Alyssa Rosenzweig R: Neal Gompa L: asahi@lists.linux.dev L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) From b3fe1c83a56f3cb7c475747ee1c6ec5a9dd5f60e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 18 Sep 2025 17:25:31 +0100 Subject: [PATCH 2150/3206] perf/arm-cmn: Fix CMN S3 DTM offset CMN S3's DTM offset is different between r0px and r1p0, and it turns out this was not an error in the earlier documentation, but does actually exist in the design. Lovely.
Cc: stable@vger.kernel.org Fixes: 0dc2f4963f7e ("perf/arm-cmn: Support CMN S3") Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 11fb2234b10fcf..23245352a3fc0a 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -65,7 +65,7 @@ /* PMU registers occupy the 3rd 4KB page of each node's region */ #define CMN_PMU_OFFSET 0x2000 /* ...except when they don't :( */ -#define CMN_S3_DTM_OFFSET 0xa000 +#define CMN_S3_R1_DTM_OFFSET 0xa000 #define CMN_S3_PMU_OFFSET 0xd900 /* For most nodes, this is all there is */ @@ -233,6 +233,9 @@ enum cmn_revision { REV_CMN700_R1P0, REV_CMN700_R2P0, REV_CMN700_R3P0, + REV_CMNS3_R0P0 = 0, + REV_CMNS3_R0P1, + REV_CMNS3_R1P0, REV_CI700_R0P0 = 0, REV_CI700_R1P0, REV_CI700_R2P0, @@ -425,8 +428,8 @@ static enum cmn_model arm_cmn_model(const struct arm_cmn *cmn) static int arm_cmn_pmu_offset(const struct arm_cmn *cmn, const struct arm_cmn_node *dn) { if (cmn->part == PART_CMN_S3) { - if (dn->type == CMN_TYPE_XP) - return CMN_S3_DTM_OFFSET; + if (cmn->rev >= REV_CMNS3_R1P0 && dn->type == CMN_TYPE_XP) + return CMN_S3_R1_DTM_OFFSET; return CMN_S3_PMU_OFFSET; } return CMN_PMU_OFFSET; From e185c8a0d84236d14af61faff8147c953a878a77 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Thu, 18 Sep 2025 08:25:47 -0500 Subject: [PATCH 2151/3206] arm64: cputype: Add NVIDIA Olympus definitions Add cpu part and model macro definitions for NVIDIA Olympus core. Signed-off-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 98bedc3706eabb..67ac757bc9c043 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -129,6 +129,7 @@ #define NVIDIA_CPU_PART_DENVER 0x003 #define NVIDIA_CPU_PART_CARMEL 0x004 +#define NVIDIA_CPU_PART_OLYMPUS 0x010 #define FUJITSU_CPU_PART_A64FX 0x001 @@ -220,6 +221,7 @@ #define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) +#define MIDR_NVIDIA_OLYMPUS MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_OLYMPUS) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) #define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09) From cc80537caaa789e097f35b2032ddc693a4e136f9 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Thu, 18 Sep 2025 08:25:48 -0500 Subject: [PATCH 2152/3206] arm64: cpufeature: Add Olympus MIDR to BBML2 allow list The NVIDIA Olympus core supports BBML2 without conflict abort. Add its MIDR to the allow list to enable FEAT_BBM. 
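For reference, the MIDR match value these definitions produce can be computed by hand. A sketch assuming the standard MIDR_EL1 field layout used by arch/arm64/include/asm/cputype.h (implementer 'N' = 0x4e for NVIDIA, part number 0x010 from the previous patch; the DEMO_ macro is illustrative, not the kernel's):

/* Sketch: MIDR_CPU_MODEL() packs implementer and part number into a
 * match value with architecture 0xf and variant/revision zero. */
#define DEMO_MIDR_CPU_MODEL(imp, partnum) \
	(((unsigned int)(imp) << 24) |		/* MIDR_EL1.Implementer */ \
	 (0xfU << 16) |				/* MIDR_EL1.Architecture */ \
	 ((unsigned int)(partnum) << 4))	/* MIDR_EL1.PartNum */

_Static_assert(DEMO_MIDR_CPU_MODEL(0x4e, 0x010) == 0x4e0f0100,
	       "expected MIDR_NVIDIA_OLYMPUS match value");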
Signed-off-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b1219f14459f49..c6cbeee48c5718 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2235,6 +2235,7 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco static const struct midr_range supports_bbml2_noabort_list[] = { MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf), MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf), + MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), {} }; From df8922afc37aa2111ca79a216653a629146763ad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 18 Sep 2025 13:59:15 -0600 Subject: [PATCH 2153/3206] io_uring/msg_ring: kill alloc_cache for io_kiocb allocations A recent commit: fc582cd26e88 ("io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU") fixed an issue with not deferring freeing of io_kiocb structs that msg_ring allocates to after the current RCU grace period. But this only covers requests that don't end up in the allocation cache. If a request goes into the alloc cache, it can get reused before it is sane to do so. A recent syzbot report would seem to indicate that there's something there, however it may very well just be because of the KASAN poisoning that the alloc_cache handles manually. Rather than attempt to make the alloc_cache sane for that use case, just drop the usage of the alloc_cache for msg_ring request payload data. Fixes: 50cf5f3842af ("io_uring/msg_ring: add an alloc cache for io_kiocb entries") Link: https://lore.kernel.org/io-uring/68cc2687.050a0220.139b6.0005.GAE@google.com/ Reported-by: syzbot+baa2e0f4e02df602583e@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- io_uring/io_uring.c | 4 ---- io_uring/msg_ring.c | 24 ++---------------------- 3 files changed, 2 insertions(+), 29 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 80a178f3d89688..12f5ee43850ea5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -420,9 +420,6 @@ struct io_ring_ctx { struct list_head defer_list; unsigned nr_drained; - struct io_alloc_cache msg_cache; - spinlock_t msg_lock; - #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bcec12256f3407..93665cebe9bdd7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -290,7 +290,6 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); - io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_rsrc_cache_free(ctx); } @@ -337,9 +336,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_cmd), sizeof(struct io_async_cmd)); - spin_lock_init(&ctx->msg_lock); - ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); ret |= io_rsrc_cache_init(ctx); if (ret) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 4c2578f2efcb0e..5e5b94236d7204 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,7 +11,6 
@@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" -#include "alloc_cache.h" #include "msg_ring.h" /* All valid masks for MSG_RING */ @@ -76,13 +75,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); - if (spin_trylock(&ctx->msg_lock)) { - if (io_alloc_cache_put(&ctx->msg_cache, req)) - req = NULL; - spin_unlock(&ctx->msg_lock); - } - if (req) - kfree_rcu(req, rcu_head); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } @@ -104,26 +97,13 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } -static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) -{ - struct io_kiocb *req = NULL; - - if (spin_trylock(&ctx->msg_lock)) { - req = io_alloc_cache_get(&ctx->msg_cache); - spin_unlock(&ctx->msg_lock); - if (req) - return req; - } - return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); -} - static int io_msg_data_remote(struct io_ring_ctx *target_ctx, struct io_msg *msg) { struct io_kiocb *target; u32 flags = 0; - target = io_msg_get_kiocb(target_ctx); + target = kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO) ; if (unlikely(!target)) return -ENOMEM; From ea87c5536aa8c2b5bcd2fb482df6f11e5517df06 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Thu, 18 Sep 2025 12:54:24 -0500 Subject: [PATCH 2154/3206] arm64: probes: Fix incorrect bl/blr address and register usage The pt_regs registers are 64-bit on arm64, and should be u64 when manipulated. Correct this so that we aren't truncating the address during br/blr sequences. Fixes: efb07ac534e2 ("arm64: probes: Add GCS support to bl/blr/ret") Signed-off-by: Jeremy Linton Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/simulate-insn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 97ed4db7541796..89fbeb32107e31 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -145,7 +145,7 @@ void __kprobes simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) { int xn = (opcode >> 5) & 0x1f; - int b_target = get_x_reg(regs, xn); + u64 b_target = get_x_reg(regs, xn); if (((opcode >> 21) & 0x3) == 1) if (update_lr(regs, addr + 4)) @@ -160,7 +160,7 @@ simulate_ret(u32 opcode, long addr, struct pt_regs *regs) u64 ret_addr; int err = 0; int xn = (opcode >> 5) & 0x1f; - unsigned long r_target = get_x_reg(regs, xn); + u64 r_target = get_x_reg(regs, xn); if (user_mode(regs) && task_gcs_el0_enabled(current)) { ret_addr = pop_user_gcs(&err); From ef442fc5c1a9a2a232de85a0e6967f388b6c0c8e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Sep 2025 11:57:44 -0400 Subject: [PATCH 2155/3206] rv: Add Gabriele Monaco as maintainer for Runtime Verification Gabriele will start taking over managing the changes to the Runtime Verification. Make him officially one of the maintainers. 
Cc: Gabriele Monaco Cc: Linus Torvalds Link: https://lore.kernel.org/20250911115744.66ccade3@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index cd7ff55b5d3217..17073c075bf763 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22048,6 +22048,7 @@ F: drivers/infiniband/ulp/rtrs/ RUNTIME VERIFICATION (RV) M: Steven Rostedt +M: Gabriele Monaco L: linux-trace-kernel@vger.kernel.org S: Maintained F: Documentation/trace/rv/ From ccf09357ffef2ab472369ab9cdf470c9bc9b821a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 9 Sep 2025 13:44:14 +0200 Subject: [PATCH 2156/3206] smp: Fix up and expand the smp_call_function_many() kerneldoc The smp_call_function_many() kerneldoc comment got out of sync with the function definition (bool parameter "wait" is incorrectly described as a bitmask in it), so fix it up by copying the "wait" description from the smp_call_function() kerneldoc and add information regarding the handling of the local CPU to it. Fixes: 49b3bd213a9f ("smp: Fix all kernel-doc warnings") Signed-off-by: Rafael J. Wysocki Signed-off-by: Thomas Gleixner --- kernel/smp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 56f83aa58ec82f..02f52291fae425 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -884,16 +884,15 @@ static void smp_call_function_many_cond(const struct cpumask *mask, * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait - * (atomically) until function has completed on other CPUs. If - * %SCF_RUN_LOCAL is set, the function will also be run locally - * if the local CPU is set in the @cpumask. - * - * If @wait is true, then returns once @func has returned. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. + * + * @func is not called on the local CPU even if @mask contains it. Consider + * using on_each_cpu_cond_mask() instead if this is not desirable. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) From 13efe932d2fc3f4f753e7662d550e903b7ce8e88 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 17 Sep 2025 12:02:08 -0700 Subject: [PATCH 2157/3206] arm64: cpufeature: add AmpereOne to BBML2 allow list AmpereOne supports BBML2 without conflict abort; add it to the allow list.
Reviewed-by: Christoph Lameter (Ampere) Reviewed-by: Ryan Roberts Acked-by: Catalin Marinas Signed-off-by: Yang Shi Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c6cbeee48c5718..cf2dd5ea173fe4 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2236,6 +2236,8 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf), MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf), MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), {} }; From a660194dd101e937c319171ad99c3fbe466fd825 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Wed, 17 Sep 2025 12:02:07 -0700 Subject: [PATCH 2158/3206] arm64: Enable permission change on arm64 kernel block mappings This patch paves the path to enable huge mappings in vmalloc space and linear map space by default on arm64. For this we must ensure that we can handle any permission games on the kernel (init_mm) pagetable. Previously, __change_memory_common() used apply_to_page_range() which does not support changing permissions for block mappings. We move away from this by using the pagewalk API, similar to what riscv does right now. It is the responsibility of the caller to ensure that the range over which permissions are being changed falls on leaf mapping boundaries. For systems with BBML2, this will be handled in future patches by dynamically splitting the mappings when required. Unlike apply_to_page_range(), the pagewalk API currently requires the init_mm.mmap_lock to be held. To avoid the unnecessary bottleneck of the mmap_lock for our usecase, this patch extends this generic API to be used locklessly, so as to retain the existing behaviour for changing permissions. Apart from this reason, it is noted at [1] that KFENCE can manipulate kernel pgtable entries during softirqs. It does this by calling set_memory_valid() -> __change_memory_common(). This being a non-sleepable context, we cannot take the init_mm mmap lock. Add comments to highlight the conditions under which we can use the lockless variant - no underlying VMA, and the user having exclusive control over the range, thus guaranteeing no concurrent access. We require that the start and end of a given range do not partially overlap block mappings, or cont mappings. Return -EINVAL in case a partial block mapping is detected in any of the PGD/P4D/PUD/PMD levels; add a corresponding comment in update_range_prot() to warn that eliminating such a condition is the responsibility of the caller. Note that the pte level callback may change permissions for a whole contpte block, and that will be done one pte at a time, as opposed to an atomic operation for the block mappings. This is fine as any access will decode either the old or the new permission until the TLBI. apply_to_page_range() currently performs all pte level callbacks while in lazy mmu mode. Since arm64 can optimize performance by batching barriers when modifying kernel pgtables in lazy mmu mode, we would like to continue to benefit from this optimisation. Unfortunately walk_kernel_page_table_range() does not use lazy mmu mode. However, since the pagewalk framework is not allocating any memory, we can safely bracket the whole operation inside lazy mmu mode ourselves. Therefore, wrap the call to walk_kernel_page_table_range() with the lazy MMU helpers.
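As a caller-side sketch (hedged: demo_make_range_ro() is hypothetical, but it mirrors how the existing arm64 set_memory_ro() path builds its masks; each leaf callback then applies (val & ~clear_mask) | set_mask):

/* Hypothetical caller inside pageattr.c: express "make read-only"
 * as the set/clear mask pair consumed by the pagewalk callbacks. */
static int demo_make_range_ro(unsigned long addr, unsigned long size)
{
	pgprot_t set_mask = __pgprot(PTE_RDONLY);
	pgprot_t clear_mask = __pgprot(PTE_WRITE);

	return __change_memory_common(addr, size, set_mask, clear_mask);
}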
Link: https://lore.kernel.org/linux-arm-kernel/89d0ad18-4772-4d8f-ae8a-7c48d26a927e@arm.com/ [1] Signed-off-by: Dev Jain Signed-off-by: Yang Shi Reviewed-by: Ryan Roberts Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/mm/pageattr.c | 119 +++++++++++++++++++++++++++++---------- include/linux/pagewalk.h | 3 + mm/pagewalk.c | 36 ++++++++---- 3 files changed, 115 insertions(+), 43 deletions(-) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 667aff1efe4928..c0648764c403d7 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,65 @@ struct page_change_data { pgprot_t clear_mask; }; +static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk) +{ + struct page_change_data *masks = walk->private; + + val &= ~(pgprot_val(masks->clear_mask)); + val |= (pgprot_val(masks->set_mask)); + + return val; +} + +static int pageattr_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t val = pudp_get(pud); + + if (pud_sect(val)) { + if (WARN_ON_ONCE((next - addr) != PUD_SIZE)) + return -EINVAL; + val = __pud(set_pageattr_masks(pud_val(val), walk)); + set_pud(pud, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t val = pmdp_get(pmd); + + if (pmd_sect(val)) { + if (WARN_ON_ONCE((next - addr) != PMD_SIZE)) + return -EINVAL; + val = __pmd(set_pageattr_masks(pmd_val(val), walk)); + set_pmd(pmd, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t val = __ptep_get(pte); + + val = __pte(set_pageattr_masks(pte_val(val), walk)); + __set_pte(pte, val); + + return 0; +} + +static const struct mm_walk_ops pageattr_ops = { + .pud_entry = pageattr_pud_entry, + .pmd_entry = pageattr_pmd_entry, + .pte_entry = pageattr_pte_entry, +}; + bool rodata_full __ro_after_init = true; bool can_set_direct_map(void) @@ -37,32 +97,35 @@ bool can_set_direct_map(void) arm64_kfence_can_set_direct_map() || is_realm_world(); } -static int change_page_range(pte_t *ptep, unsigned long addr, void *data) +static int update_range_prot(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) { - struct page_change_data *cdata = data; - pte_t pte = __ptep_get(ptep); + struct page_change_data data; + int ret; - pte = clear_pte_bit(pte, cdata->clear_mask); - pte = set_pte_bit(pte, cdata->set_mask); + data.set_mask = set_mask; + data.clear_mask = clear_mask; - __set_pte(ptep, pte); - return 0; + arch_enter_lazy_mmu_mode(); + + /* + * The caller must ensure that the range we are operating on does not + * partially overlap a block mapping, or a cont mapping. Any such case + * must be eliminated by splitting the mapping. + */ + ret = walk_kernel_page_table_range_lockless(start, start + size, + &pageattr_ops, NULL, &data); + arch_leave_lazy_mmu_mode(); + + return ret; } -/* - * This function assumes that the range is mapped with PAGE_SIZE pages. 
- */ static int __change_memory_common(unsigned long start, unsigned long size, - pgprot_t set_mask, pgprot_t clear_mask) + pgprot_t set_mask, pgprot_t clear_mask) { - struct page_change_data data; int ret; - data.set_mask = set_mask; - data.clear_mask = clear_mask; - - ret = apply_to_page_range(&init_mm, start, size, change_page_range, - &data); + ret = update_range_prot(start, size, set_mask, clear_mask); /* * If the memory is being made valid without changing any other bits @@ -174,32 +237,26 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) int set_direct_map_invalid_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(0), - .clear_mask = __pgprot(PTE_VALID), - }; + pgprot_t clear_mask = __pgprot(PTE_VALID); + pgprot_t set_mask = __pgprot(0); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } int set_direct_map_default_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(PTE_VALID | PTE_WRITE), - .clear_mask = __pgprot(PTE_RDONLY), - }; + pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE); + pgprot_t clear_mask = __pgprot(PTE_RDONLY); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } static int __set_memory_enc_dec(unsigned long addr, diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 682472c1549526..88e18615dd726b 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -134,6 +134,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_kernel_page_table_range_lockless(unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 648038247a8d28..936689d8bcac5e 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -606,10 +606,32 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { - struct mm_struct *mm = &init_mm; + /* + * Kernel intermediate page tables are usually not freed, so the mmap + * read lock is sufficient. But there are some exceptions. + * E.g. memory hot-remove. In which case, the mmap lock is insufficient + * to prevent the intermediate kernel pages tables belonging to the + * specified address range from being freed. The caller should take + * other actions to prevent this race. + */ + mmap_assert_locked(&init_mm); + + return walk_kernel_page_table_range_lockless(start, end, ops, pgd, + private); +} + +/* + * Use this function to walk the kernel page tables locklessly. It should be + * guaranteed that the caller has exclusive access over the range they are + * operating on - that there should be no concurrent access, for example, + * changing permissions for vmalloc objects. 
+ */ +int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end, + const struct mm_walk_ops *ops, pgd_t *pgd, void *private) +{ struct mm_walk walk = { .ops = ops, - .mm = mm, + .mm = &init_mm, .pgd = pgd, .private = private, .no_vma = true @@ -620,16 +642,6 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end, if (!check_ops_valid(ops)) return -EINVAL; - /* - * Kernel intermediate page tables are usually not freed, so the mmap - * read lock is sufficient. But there are some exceptions. - * E.g. memory hot-remove. In which case, the mmap lock is insufficient - * to prevent the intermediate kernel pages tables belonging to the - * specified address range from being freed. The caller should take - * other actions to prevent this race. - */ - mmap_assert_locked(mm); - return walk_pgd_range(start, end, &walk); } From a166563e7ec375b38a0fd3a58f7b77e50a6bc6a8 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 17 Sep 2025 12:02:09 -0700 Subject: [PATCH 2159/3206] arm64: mm: support large block mapping when rodata=full When rodata=full is specified, the kernel linear mapping has to be mapped at PTE level since large block mappings can't be split due to the break-before-make rule on ARM64. This resulted in a couple of problems: - performance degradation - more TLB pressure - memory waste for kernel page table With FEAT_BBM level 2 support, splitting a large block mapping into smaller ones no longer requires making the page table entry invalid. This allows the kernel to split large block mappings on the fly. Add kernel page table split support and use large block mapping by default when FEAT_BBM level 2 is supported for rodata=full. When changing permissions for the kernel linear mapping, the page table will be split to a smaller size. Machines without FEAT_BBM level 2 will fall back to a PTE-mapped kernel linear mapping when rodata=full. With this we saw a significant performance boost with some benchmarks and much less memory consumption on my AmpereOne machine (192 cores, 1P) with 256GB memory. * Memory use after boot Before: MemTotal: 258988984 kB MemFree: 254821700 kB After: MemTotal: 259505132 kB MemFree: 255410264 kB Around 500MB more memory is free to use. The larger the machine, the more memory saved. * Memcached We saw performance degradation when running the Memcached benchmark with rodata=full vs rodata=on. Our profiling pointed to kernel TLB pressure. With this patchset we saw ops/sec increase by around 3.5% and P99 latency drop by around 9.6%. The gain mainly came from reduced kernel TLB misses. The kernel TLB MPKI is reduced by 28.5%. The benchmark data is now on par with rodata=on too. * Disk encryption (dm-crypt) benchmark Ran the fio benchmark with the below command on a 128G ramdisk (ext4) with disk encryption (by dm-crypt). fio --directory=/data --random_generator=lfsr --norandommap \ --randrepeat 1 --status-interval=999 --rw=write --bs=4k --loops=1 \ --ioengine=sync --iodepth=1 --numjobs=1 --fsync_on_close=1 \ --group_reporting --thread --name=iops-test-job --eta-newline=1 \ --size 100G The IOPS is increased by 90% - 150% (the variance is high, but the worst number of the good case is around 90% more than the best number of the bad case). The bandwidth is increased and the avg clat is reduced proportionally. * Sequential file read Read a 100G file sequentially on XFS (xfs_io read with page cache populated). The bandwidth is increased by 150%.
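To make the intended ordering concrete, a sketch (hedged: demo_change_prot() is illustrative; in the patch itself the split is invoked from the pageattr.c permission-change path using these same primitives):

/* Sketch: split only the two boundary addresses down to leaf
 * granularity, then change permissions; interior entries may remain
 * large blocks. On !BBML2_NOABORT systems the split is a no-op since
 * the linear map stays pte-mapped. */
static int demo_change_prot(unsigned long start, unsigned long size,
			    pgprot_t set_mask, pgprot_t clear_mask)
{
	int ret = split_kernel_leaf_mapping(start, start + size);

	if (ret)
		return ret;
	return update_range_prot(start, size, set_mask, clear_mask);
}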
Co-developed-by: Ryan Roberts Signed-off-by: Ryan Roberts Reviewed-by: Catalin Marinas Signed-off-by: Yang Shi Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 2 + arch/arm64/include/asm/mmu.h | 1 + arch/arm64/include/asm/pgtable.h | 5 + arch/arm64/kernel/cpufeature.c | 7 +- arch/arm64/mm/mmu.c | 264 +++++++++++++++++++++++++++- arch/arm64/mm/pageattr.c | 4 + 6 files changed, 277 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index bf13d676aae2cc..e223cbf350e49c 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -871,6 +871,8 @@ static inline bool system_supports_pmuv3(void) return cpus_have_final_cap(ARM64_HAS_PMUV3); } +bool cpu_supports_bbml2_noabort(void); + static inline bool system_supports_bbml2_noabort(void) { return alternative_has_cap_unlikely(ARM64_HAS_BBML2_NOABORT); diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 6e8aa8e726015e..56fca81f60ade2 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -71,6 +71,7 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgprot_t prot, bool page_mappings_only); extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot); extern void mark_linear_text_alias_ro(void); +extern int split_kernel_leaf_mapping(unsigned long start, unsigned long end); /* * This check is triggered during the early boot before the cpufeature diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index abd2dee416b3b3..aa89c2e67ebc84 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -371,6 +371,11 @@ static inline pmd_t pmd_mkcont(pmd_t pmd) return __pmd(pmd_val(pmd) | PMD_SECT_CONT); } +static inline pmd_t pmd_mknoncont(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) & ~PMD_SECT_CONT); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9ad065f15f1d61..e15472beff3f84 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2217,7 +2217,7 @@ static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE); } -static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) +bool cpu_supports_bbml2_noabort(void) { /* * We want to allow usage of BBML2 in as wide a range of kernel contexts @@ -2249,6 +2249,11 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco return true; } +static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) +{ + return cpu_supports_bbml2_noabort(); +} + #ifdef CONFIG_ARM64_PAN static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index de463040582c49..a7b29daf1a38f1 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -481,6 +481,8 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, int flags); #endif +#define INVALID_PHYS_ADDR (-1ULL) + static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, enum pgtable_type pgtable_type) { @@ -488,7 +490,9 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0); phys_addr_t pa; - BUG_ON(!ptdesc); + if (!ptdesc) + return 
INVALID_PHYS_ADDR; + pa = page_to_phys(ptdesc_page(ptdesc)); switch (pgtable_type) { @@ -509,16 +513,256 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, return pa; } +static phys_addr_t +try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) +{ + return __pgd_pgtable_alloc(&init_mm, pgtable_type); +} + static phys_addr_t __maybe_unused pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) { - return __pgd_pgtable_alloc(&init_mm, pgtable_type); + phys_addr_t pa; + + pa = __pgd_pgtable_alloc(&init_mm, pgtable_type); + BUG_ON(pa == INVALID_PHYS_ADDR); + return pa; } static phys_addr_t pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) { - return __pgd_pgtable_alloc(NULL, pgtable_type); + phys_addr_t pa; + + pa = __pgd_pgtable_alloc(NULL, pgtable_type); + BUG_ON(pa == INVALID_PHYS_ADDR); + return pa; +} + +static void split_contpte(pte_t *ptep) +{ + int i; + + ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); + for (i = 0; i < CONT_PTES; i++, ptep++) + __set_pte(ptep, pte_mknoncont(__ptep_get(ptep))); +} + +static int split_pmd(pmd_t *pmdp, pmd_t pmd) +{ + pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; + unsigned long pfn = pmd_pfn(pmd); + pgprot_t prot = pmd_pgprot(pmd); + phys_addr_t pte_phys; + pte_t *ptep; + int i; + + pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE); + if (pte_phys == INVALID_PHYS_ADDR) + return -ENOMEM; + ptep = (pte_t *)phys_to_virt(pte_phys); + + if (pgprot_val(prot) & PMD_SECT_PXN) + tableprot |= PMD_TABLE_PXN; + + prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE); + prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++) + __set_pte(ptep, pfn_pte(pfn, prot)); + + /* + * Ensure the pte entries are visible to the table walker by the time + * the pmd entry that points to the ptes is visible. + */ + dsb(ishst); + __pmd_populate(pmdp, pte_phys, tableprot); + + return 0; +} + +static void split_contpmd(pmd_t *pmdp) +{ + int i; + + pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS); + for (i = 0; i < CONT_PMDS; i++, pmdp++) + set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp))); +} + +static int split_pud(pud_t *pudp, pud_t pud) +{ + pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; + unsigned int step = PMD_SIZE >> PAGE_SHIFT; + unsigned long pfn = pud_pfn(pud); + pgprot_t prot = pud_pgprot(pud); + phys_addr_t pmd_phys; + pmd_t *pmdp; + int i; + + pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD); + if (pmd_phys == INVALID_PHYS_ADDR) + return -ENOMEM; + pmdp = (pmd_t *)phys_to_virt(pmd_phys); + + if (pgprot_val(prot) & PMD_SECT_PXN) + tableprot |= PUD_TABLE_PXN; + + prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); + prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step) + set_pmd(pmdp, pfn_pmd(pfn, prot)); + + /* + * Ensure the pmd entries are visible to the table walker by the time + * the pud entry that points to the pmds is visible. + */ + dsb(ishst); + __pud_populate(pudp, pmd_phys, tableprot); + + return 0; +} + +static int split_kernel_leaf_mapping_locked(unsigned long addr) +{ + pgd_t *pgdp, pgd; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + int ret = 0; + + /* + * PGD: If addr is PGD aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. 
+ */ + if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr) + goto out; + pgdp = pgd_offset_k(addr); + pgd = pgdp_get(pgdp); + if (!pgd_present(pgd)) + goto out; + + /* + * P4D: If addr is P4D aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. + */ + if (ALIGN_DOWN(addr, P4D_SIZE) == addr) + goto out; + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + if (!p4d_present(p4d)) + goto out; + + /* + * PUD: If addr is PUD aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. Otherwise, + * if we have a pud leaf, split to contpmd. + */ + if (ALIGN_DOWN(addr, PUD_SIZE) == addr) + goto out; + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + if (!pud_present(pud)) + goto out; + if (pud_leaf(pud)) { + ret = split_pud(pudp, pud); + if (ret) + goto out; + } + + /* + * CONTPMD: If addr is CONTPMD aligned then addr already describes a + * leaf boundary. If not present then there is nothing to split. + * Otherwise, if we have a contpmd leaf, split to pmd. + */ + if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr) + goto out; + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + if (!pmd_present(pmd)) + goto out; + if (pmd_leaf(pmd)) { + if (pmd_cont(pmd)) + split_contpmd(pmdp); + /* + * PMD: If addr is PMD aligned then addr already describes a + * leaf boundary. Otherwise, split to contpte. + */ + if (ALIGN_DOWN(addr, PMD_SIZE) == addr) + goto out; + ret = split_pmd(pmdp, pmd); + if (ret) + goto out; + } + + /* + * CONTPTE: If addr is CONTPTE aligned then addr already describes a + * leaf boundary. If not present then there is nothing to split. + * Otherwise, if we have a contpte leaf, split to pte. + */ + if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr) + goto out; + ptep = pte_offset_kernel(pmdp, addr); + pte = __ptep_get(ptep); + if (!pte_present(pte)) + goto out; + if (pte_cont(pte)) + split_contpte(ptep); + +out: + return ret; +} + +static DEFINE_MUTEX(pgtable_split_lock); + +int split_kernel_leaf_mapping(unsigned long start, unsigned long end) +{ + int ret; + + /* + * !BBML2_NOABORT systems should not be trying to change permissions on + * anything that is not pte-mapped in the first place. Just return early + * and let the permission change code raise a warning if not already + * pte-mapped. + */ + if (!system_supports_bbml2_noabort()) + return 0; + + /* + * Ensure start and end are at least page-aligned since this is the + * finest granularity we can split to. + */ + if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end)) + return -EINVAL; + + mutex_lock(&pgtable_split_lock); + arch_enter_lazy_mmu_mode(); + + /* + * The split_kernel_leaf_mapping_locked() may sleep, it is not a + * problem for ARM64 since ARM64's lazy MMU implementation allows + * sleeping. + * + * Optimize for the common case of splitting out a single page from a + * larger mapping. Here we can just split on the "least aligned" of + * start and end and this will guarantee that there must also be a split + * on the more aligned address since the both addresses must be in the + * same contpte block and it must have been split to ptes. + */ + if (end - start == PAGE_SIZE) { + start = __ffs(start) < __ffs(end) ? 
start : end; + ret = split_kernel_leaf_mapping_locked(start); + } else { + ret = split_kernel_leaf_mapping_locked(start); + if (!ret) + ret = split_kernel_leaf_mapping_locked(end); + } + + arch_leave_lazy_mmu_mode(); + mutex_unlock(&pgtable_split_lock); + return ret; } /* @@ -640,6 +884,16 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { #endif /* CONFIG_KFENCE */ +static inline bool force_pte_mapping(void) +{ + bool bbml2 = system_capabilities_finalized() ? + system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); + + return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() || + is_realm_world())) || + debug_pagealloc_enabled(); +} + static void __init map_mem(pgd_t *pgdp) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); @@ -665,7 +919,7 @@ static void __init map_mem(pgd_t *pgdp) early_kfence_pool = arm64_kfence_alloc_pool(); - if (can_set_direct_map()) + if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* @@ -1368,7 +1622,7 @@ int arch_add_memory(int nid, u64 start, u64 size, VM_BUG_ON(!mhp_range_allowed(start, size, true)); - if (can_set_direct_map()) + if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index c0648764c403d7..5135f2d66958d9 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -106,6 +106,10 @@ static int update_range_prot(unsigned long start, unsigned long size, data.set_mask = set_mask; data.clear_mask = clear_mask; + ret = split_kernel_leaf_mapping(start, start + size); + if (WARN_ON_ONCE(ret)) + return ret; + arch_enter_lazy_mmu_mode(); /* From 62a7b3bbb6b873fdcc85a37efbd0102d66c8a73e Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 19 Aug 2025 17:05:25 +0100 Subject: [PATCH 2160/3206] ASoC: SOF: ipc4-pcm: Fix incorrect comparison with number of tdm_slots In ipc4_ssp_dai_config_pcm_params_match(), when comparing params_channels() against hw_config->tdm_slots, the comparison should be <= rather than ==. The number of TDM slots must be enough for the number of required channels, but it can be greater: there are various reasons why an I2S/TDM link has more TDM slots than a particular audio stream needs. The original comparison would fail on systems that had more TDM slots.
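To illustrate the fixed check (the numbers are made up, not from a real topology): a stereo stream on an 8-slot TDM link is a perfectly valid configuration that the old comparison rejected.

    unsigned int channels  = 2;	/* params_channels(params) */
    unsigned int tdm_slots = 8;	/* le32_to_cpu(hw_config->tdm_slots) */

    /* old: (channels == tdm_slots) is false, so this hw_config never matched */
    /* new: (channels <= tdm_slots) is true, the stream fits in the slots    */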
Signed-off-by: Richard Fitzgerald Fixes: 8a07944a77e9 ("ASoC: SOF: ipc4-pcm: Look for best matching hw_config for SSP") Link: https://patch.msgid.link/20250819160525.423416-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 374dc10d10fd52..86f7377fb92fac 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -639,14 +639,14 @@ static int ipc4_ssp_dai_config_pcm_params_match(struct snd_sof_dev *sdev, if (params_rate(params) == le32_to_cpu(hw_config->fsync_rate) && params_width(params) == le32_to_cpu(hw_config->tdm_slot_width) && - params_channels(params) == le32_to_cpu(hw_config->tdm_slots)) { + params_channels(params) <= le32_to_cpu(hw_config->tdm_slots)) { current_config = le32_to_cpu(hw_config->id); partial_match = false; /* best match found */ break; } else if (current_config < 0 && params_rate(params) == le32_to_cpu(hw_config->fsync_rate) && - params_channels(params) == le32_to_cpu(hw_config->tdm_slots)) { + params_channels(params) <= le32_to_cpu(hw_config->tdm_slots)) { current_config = le32_to_cpu(hw_config->id); partial_match = true; /* keep looking for better match */ From 30dbc1c8d50f13c1581b49abe46fe89f393eacbf Mon Sep 17 00:00:00 2001 From: Khairul Anuar Romli Date: Wed, 10 Sep 2025 16:06:32 +0800 Subject: [PATCH 2161/3206] spi: cadence-qspi: defer runtime support on socfpga if reset bit is enabled Enabling runtime PM allows the kernel to gate clocks and power to idle devices. On SoCFPGA, a warm reset does not fully reinitialize these domains. This leaves devices suspended and powered down, preventing U-Boot or the kernel from reusing them after a warm reset, which breaks the boot process.
Fixes: 4892b374c9b7 ("mtd: spi-nor: cadence-quadspi: Add runtime PM support") CC: stable@vger.kernel.org # 6.12+ Signed-off-by: Khairul Anuar Romli Signed-off-by: Adrian Ng Ho Yin Reviewed-by: Niravkumar L Rabara Reviewed-by: Matthew Gerlach Link: https://patch.msgid.link/910aad68ba5d948919a7b90fa85a2fadb687229b.1757491372.git.khairul.anuar.romli@altera.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 53 +++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 9bf823348cd30d..d288e9d9c18739 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -46,6 +46,7 @@ static_assert(CQSPI_MAX_CHIPSELECT <= SPI_CS_CNT_MAX); #define CQSPI_DMA_SET_MASK BIT(7) #define CQSPI_SUPPORT_DEVICE_RESET BIT(8) #define CQSPI_DISABLE_STIG_MODE BIT(9) +#define CQSPI_DISABLE_RUNTIME_PM BIT(10) /* Capabilities */ #define CQSPI_SUPPORTS_OCTAL BIT(0) @@ -1468,14 +1469,17 @@ static int cqspi_exec_mem_op(struct spi_mem *mem, const struct spi_mem_op *op) int ret; struct cqspi_st *cqspi = spi_controller_get_devdata(mem->spi->controller); struct device *dev = &cqspi->pdev->dev; + const struct cqspi_driver_platdata *ddata = of_device_get_match_data(dev); if (refcount_read(&cqspi->inflight_ops) == 0) return -ENODEV; - ret = pm_runtime_resume_and_get(dev); - if (ret) { - dev_err(&mem->spi->dev, "resume failed with %d\n", ret); - return ret; + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { + ret = pm_runtime_resume_and_get(dev); + if (ret) { + dev_err(&mem->spi->dev, "resume failed with %d\n", ret); + return ret; + } } if (!refcount_read(&cqspi->refcount)) @@ -1491,7 +1495,8 @@ static int cqspi_exec_mem_op(struct spi_mem *mem, const struct spi_mem_op *op) ret = cqspi_mem_process(mem, op); - pm_runtime_put_autosuspend(dev); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) + pm_runtime_put_autosuspend(dev); if (ret) dev_err(&mem->spi->dev, "operation failed with %d\n", ret); @@ -1985,11 +1990,12 @@ static int cqspi_probe(struct platform_device *pdev) goto probe_setup_failed; } - pm_runtime_enable(dev); - - pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT); - pm_runtime_use_autosuspend(dev); - pm_runtime_get_noresume(dev); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { + pm_runtime_enable(dev); + pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT); + pm_runtime_use_autosuspend(dev); + pm_runtime_get_noresume(dev); + } ret = spi_register_controller(host); if (ret) { @@ -1997,12 +2003,17 @@ static int cqspi_probe(struct platform_device *pdev) goto probe_setup_failed; } - pm_runtime_put_autosuspend(dev); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { + pm_runtime_put_autosuspend(dev); + pm_runtime_mark_last_busy(dev); + pm_runtime_put_autosuspend(dev); + } return 0; probe_setup_failed: cqspi_controller_enable(cqspi, 0); - pm_runtime_disable(dev); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) + pm_runtime_disable(dev); probe_reset_failed: if (cqspi->is_jh7110) cqspi_jh7110_disable_clk(pdev, cqspi); @@ -2013,7 +2024,11 @@ static int cqspi_probe(struct platform_device *pdev) static void cqspi_remove(struct platform_device *pdev) { + const struct cqspi_driver_platdata *ddata; struct cqspi_st *cqspi = platform_get_drvdata(pdev); + struct device *dev = &pdev->dev; + + ddata = of_device_get_match_data(dev); refcount_set(&cqspi->refcount, 0); @@ -2026,14 +2041,17 @@ 
static void cqspi_remove(struct platform_device *pdev) if (cqspi->rx_chan) dma_release_channel(cqspi->rx_chan); - if (pm_runtime_get_sync(&pdev->dev) >= 0) - clk_disable(cqspi->clk); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) + if (pm_runtime_get_sync(&pdev->dev) >= 0) + clk_disable(cqspi->clk); if (cqspi->is_jh7110) cqspi_jh7110_disable_clk(pdev, cqspi); - pm_runtime_put_sync(&pdev->dev); - pm_runtime_disable(&pdev->dev); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { + pm_runtime_put_sync(&pdev->dev); + pm_runtime_disable(&pdev->dev); + } } static int cqspi_runtime_suspend(struct device *dev) @@ -2112,7 +2130,8 @@ static const struct cqspi_driver_platdata socfpga_qspi = { .quirks = CQSPI_DISABLE_DAC_MODE | CQSPI_NO_SUPPORT_WR_COMPLETION | CQSPI_SLOW_SRAM - | CQSPI_DISABLE_STIG_MODE, + | CQSPI_DISABLE_STIG_MODE + | CQSPI_DISABLE_RUNTIME_PM, }; static const struct cqspi_driver_platdata versal_ospi = { From 6b8ba0db92cd01450acaf375caf4c126aa913d72 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:08 +0000 Subject: [PATCH 2162/3206] ASoC: soc-dapm: add snd_soc_dapm_to_dev() Because struct snd_soc_dapm_context is soc-dapm framework specific, user drivers don't need to access its members directly, and we would like to hide them. struct snd_soc_dapm_context will be removed from the header in the future. Some drivers need to get the dev from a dapm context (which will be removed from the header). We need such a function; add it. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87cy86x06z.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 3 ++- sound/soc/soc-dapm.c | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index ed39458b94bf9b..ccd36a198a1346 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -587,7 +587,7 @@ struct snd_soc_dapm_context { unsigned int idle_bias_off:1; /* Use BIAS_OFF instead of STANDBY */ unsigned int suspend_bias_off:1; /* Use BIAS_OFF in suspend if the DAPM is idle */ - struct device *dev; /* from parent - for debug */ + struct device *dev; /* from parent - for debug */ /* REMOVE ME */ struct snd_soc_component *component; /* parent component */ struct snd_soc_card *card; /* parent card */ @@ -660,6 +660,7 @@ void snd_soc_dapm_connect_dai_link_widgets(struct snd_soc_card *card); int snd_soc_dapm_update_dai(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); int snd_soc_dapm_widget_name_cmp(struct snd_soc_dapm_widget *widget, const char *s); +struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm); /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index d74c096cc20840..83cdf97edb9da0 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -165,6 +165,15 @@ static void pop_dbg(struct device *dev, u32 pop_time, const char *fmt, ...) 
kfree(buf); } +struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm) +{ + if (dapm->component) + return dapm->component->dev; + + return dapm->card->dev; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_to_dev); + static bool dapm_dirty_widget(struct snd_soc_dapm_widget *w) { return !list_empty(&w->dirty); From c8df096bca84c9eb04b656015c8430d0b87ebbcf Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:13 +0000 Subject: [PATCH 2163/3206] ASoC: soc-dapm: add snd_soc_dapm_to_card() Because struct snd_soc_dapm_context is soc-dapm framework specific, user drivers don't need to access its members directly, and we would like to hide them. struct snd_soc_dapm_context will be removed from the header in the future. Some drivers need to get the card from a dapm context (which will be removed from the header). We need such a function; add it. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87bjnqx06v.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 1 + sound/soc/soc-dapm.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index ccd36a198a1346..dbb71e396febe5 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -661,6 +661,7 @@ int snd_soc_dapm_update_dai(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); int snd_soc_dapm_widget_name_cmp(struct snd_soc_dapm_widget *widget, const char *s); struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm); +struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm); /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 83cdf97edb9da0..cd8d3857988622 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -174,6 +174,12 @@ struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm) } EXPORT_SYMBOL_GPL(snd_soc_dapm_to_dev); +struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm) +{ + return dapm->card; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_to_card); + static bool dapm_dirty_widget(struct snd_soc_dapm_widget *w) { return !list_empty(&w->dirty); From 96e311b561a2d393a786a2aeb50cd5e02d06afb3 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:17 +0000 Subject: [PATCH 2164/3206] ASoC: soc-dapm: use dapm->component instead of container_of() Because struct snd_soc_dapm_context is soc-dapm framework specific, user drivers don't need to access its members directly, and we would like to hide them. struct snd_soc_dapm_context will be removed from the header in the future. Now, snd_soc_dapm_to_component() (A) converts dapm to component via container_of() (a). (A) static inline struct snd_soc_component *snd_soc_dapm_to_component( struct snd_soc_dapm_context *dapm) { (a) return container_of(dapm, struct snd_soc_component, dapm); } A component's dapm works, but a card's dapm yields an "unknown" pointer (not NULL), because (a) uses container_of(). OTOH, ASoC calls snd_soc_dapm_init() (X) to initialize dapm; it is called from snd_soc_bind_card() (p) (for the card) or soc_probe_component() (q) (for a component) with the component pointer. (p) static int snd_soc_bind_card(...) { ... (X) snd_soc_dapm_init(..., NULL); ... ^^^^ } (q) static int soc_probe_component(...) { ... (X) snd_soc_dapm_init(..., component); ... ^^^^^^^^^ } And snd_soc_dapm_init() (X) fills in dapm->component (x): (X) void snd_soc_dapm_init(..., component, ...) { ... 
(x) dapm->component = component; ... } We can simply use dapm->component in snd_soc_dapm_to_component() (A). In this case, the dapm of a card (p) will simply be NULL. Use dapm->component instead of container_of(). The picky note in the kernel-doc can be removed by this patch. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87a53ax06q.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 15 --------------- include/sound/soc-dapm.h | 1 + sound/soc/soc-dapm.c | 6 ++++++ 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 48e45cbe82e53b..7322d5d4c0bdcf 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -260,21 +260,6 @@ struct snd_soc_component { #define for_each_component_dais_safe(component, dai, _dai)\ list_for_each_entry_safe(dai, _dai, &(component)->dai_list, list) -/** - * snd_soc_dapm_to_component() - Casts a DAPM context to the component it is - * embedded in - * @dapm: The DAPM context to cast to the component - * - * This function must only be used on DAPM contexts that are known to be part of - * a component (e.g. in a component driver). Otherwise the behavior is - * undefined. - */ -static inline struct snd_soc_component *snd_soc_dapm_to_component( - struct snd_soc_dapm_context *dapm) -{ - return container_of(dapm, struct snd_soc_component, dapm); -} - /** * snd_soc_component_get_dapm() - Returns the DAPM context associated with a * component diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index dbb71e396febe5..c6470d391eeffa 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -662,6 +662,7 @@ int snd_soc_dapm_update_dai(struct snd_pcm_substream *substream, int snd_soc_dapm_widget_name_cmp(struct snd_soc_dapm_widget *widget, const char *s); struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm); struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm); +struct snd_soc_component *snd_soc_dapm_to_component(struct snd_soc_dapm_context *dapm); /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index cd8d3857988622..4550bf33add272 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -180,6 +180,12 @@ struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm) } EXPORT_SYMBOL_GPL(snd_soc_dapm_to_card); +struct snd_soc_component *snd_soc_dapm_to_component(struct snd_soc_dapm_context *dapm) +{ + return dapm->component; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_to_component); + static bool dapm_dirty_widget(struct snd_soc_dapm_widget *w) { return !list_empty(&w->dirty); From a1c99b6097afe64ed493c05b522ee4d6f9b0094d Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:21 +0000 Subject: [PATCH 2165/3206] ASoC: soc-component: add snd_soc_component_to_dapm() Because struct snd_soc_dapm_context is soc-dapm framework specific, user drivers don't need to access its members directly, and we would like to hide them. struct snd_soc_dapm_context will be removed from the header in the future. Currently, the dapm of card/component is an embedded "instance", but it will become a "pointer" once snd_soc_dapm_context is removed from the header. 
snd_soc_component_to_dapm() is needed to switch to the new style while maintaining compatibility. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/878qiux06m.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 7322d5d4c0bdcf..b954f34d6025e7 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -261,16 +261,19 @@ struct snd_soc_component { list_for_each_entry_safe(dai, _dai, &(component)->dai_list, list) /** - * snd_soc_component_get_dapm() - Returns the DAPM context associated with a + * snd_soc_component_to_dapm() - Returns the DAPM context associated with a * component * @component: The component for which to get the DAPM context */ -static inline struct snd_soc_dapm_context *snd_soc_component_get_dapm( +static inline struct snd_soc_dapm_context *snd_soc_component_to_dapm( struct snd_soc_component *component) { return &component->dapm; } +// FIXME +#define snd_soc_component_get_dapm snd_soc_component_to_dapm + /** * snd_soc_component_cache_sync() - Sync the register cache with the hardware * @component: COMPONENT to sync From e38a80c5c24f3058bd5da6f2910e2b672493f4f2 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:25 +0000 Subject: [PATCH 2166/3206] ASoC: soc-card: add snd_soc_card_to_dapm() Because struct snd_soc_dapm_context is soc-dapm framework specific, user drivers don't need to access its members directly, and we would like to hide them. struct snd_soc_dapm_context will be removed from the header in the future. Currently, the dapm of card/component is an embedded "instance", but it will become a "pointer" once snd_soc_dapm_context is removed from the header. snd_soc_card_to_dapm() is needed to switch to the new style while maintaining compatibility. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/877byex06i.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/sound/soc.h b/include/sound/soc.h index 1fffef311c413f..ddc508ff7b9be1 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1120,6 +1120,11 @@ static inline int snd_soc_card_is_instantiated(struct snd_soc_card *card) return card && card->instantiated; } +static inline struct snd_soc_dapm_context *snd_soc_card_to_dapm(struct snd_soc_card *card) +{ + return &card->dapm; +} + /* SoC machine DAI configuration, glues a codec and cpu DAI together */ struct snd_soc_pcm_runtime { struct device *dev; From 3bc0a92cb2062fce54ddd97ad68ad6fe358c3ff0 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:29 +0000 Subject: [PATCH 2167/3206] ASoC: soc-dapm: remove suspend_bias_off from snd_soc_dapm_context We can get suspend_bias_off directly via snd_soc_component; there is no need to keep it on the dapm context. Remove it. 
Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/875xdyx06e.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 1 - sound/soc/soc-dapm.c | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index c6470d391eeffa..498f8af79cfa10 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -585,7 +585,6 @@ struct snd_soc_dapm_context { /* bit field */ unsigned int idle_bias_off:1; /* Use BIAS_OFF instead of STANDBY */ - unsigned int suspend_bias_off:1; /* Use BIAS_OFF in suspend if the DAPM is idle */ struct device *dev; /* from parent - for debug */ /* REMOVE ME */ struct snd_soc_component *component; /* parent component */ diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 4550bf33add272..b90d0adb77132f 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2182,13 +2182,16 @@ static void dapm_power_one_widget(struct snd_soc_dapm_widget *w, static bool dapm_idle_bias_off(struct snd_soc_dapm_context *dapm) { + struct snd_soc_component *component = snd_soc_dapm_to_component(dapm); if (dapm->idle_bias_off) return true; switch (snd_power_get_state(dapm->card->snd_card)) { case SNDRV_CTL_POWER_D3hot: case SNDRV_CTL_POWER_D3cold: - return dapm->suspend_bias_off; + if (component) + return component->driver->suspend_bias_off; + fallthrough; default: break; } @@ -4823,7 +4826,6 @@ void snd_soc_dapm_init(struct snd_soc_dapm_context *dapm, if (component) { dapm->dev = component->dev; dapm->idle_bias_off = !component->driver->idle_bias_on; - dapm->suspend_bias_off = component->driver->suspend_bias_off; } else { dapm->dev = card->dev; } From 889dd56f8c03586e5489050e7457a405fae6a420 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:33 +0000 Subject: [PATCH 2168/3206] ASoC: soc-dapm: tidyup idle_bias handling - step1 Current soc-dapm uses "idle_bias_off", and its default setting comes from the snd_soc_component "idle_bias_on" flag. This is complicated and confusing. Let's handle it as "idle_bias". 
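Because the new flag has the opposite polarity of the old one, the following mapping may help when reading the conversion below (an editorial summary, not part of the patch):

    /* old style                          new style                   meaning        */
    dapm->idle_bias_off = 1;     /* => */ dapm->idle_bias = false; /* idle at BIAS_OFF     */
    dapm->idle_bias_off = 0;     /* => */ dapm->idle_bias = true;  /* idle at BIAS_STANDBY */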
Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/874itix06a.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 3 +-- sound/soc/codecs/wm8993.c | 2 +- sound/soc/codecs/wm8994.c | 6 +++--- sound/soc/intel/avs/boards/es8336.c | 2 +- sound/soc/intel/avs/boards/rt274.c | 2 +- sound/soc/intel/avs/boards/rt5640.c | 2 +- sound/soc/intel/boards/bytcht_cx2072x.c | 2 +- sound/soc/intel/boards/bytcht_es8316.c | 2 +- sound/soc/intel/boards/bytcr_rt5640.c | 2 +- sound/soc/intel/boards/bytcr_rt5651.c | 2 +- sound/soc/intel/boards/bytcr_wm5102.c | 2 +- sound/soc/intel/boards/sof_es8336.c | 2 +- sound/soc/soc-core.c | 4 ++-- sound/soc/soc-dapm.c | 20 ++++++++++---------- sound/soc/sof/sof-client-probes.c | 2 +- 15 files changed, 27 insertions(+), 28 deletions(-) diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 498f8af79cfa10..9618a54a53485a 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -583,8 +583,7 @@ struct snd_soc_dapm_update { struct snd_soc_dapm_context { enum snd_soc_bias_level bias_level; - /* bit field */ - unsigned int idle_bias_off:1; /* Use BIAS_OFF instead of STANDBY */ + bool idle_bias; /* Use BIAS_OFF instead of STANDBY when false */ struct device *dev; /* from parent - for debug */ /* REMOVE ME */ struct snd_soc_component *component; /* parent component */ diff --git a/sound/soc/codecs/wm8993.c b/sound/soc/codecs/wm8993.c index 9be4f6cadba39e..75d923c2c9cac8 100644 --- a/sound/soc/codecs/wm8993.c +++ b/sound/soc/codecs/wm8993.c @@ -1536,7 +1536,7 @@ static int wm8993_probe(struct snd_soc_component *component) * VMID as an output and can disable it. */ if (wm8993->pdata.lineout1_diff && wm8993->pdata.lineout2_diff) - dapm->idle_bias_off = 1; + dapm->idle_bias = false; return 0; diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index 240ec1bed23470..128c3a59beac3d 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -4182,8 +4182,8 @@ static int wm8994_component_probe(struct snd_soc_component *component) wm8994->micdet_irq = control->pdata.micdet_irq; - /* By default use idle_bias_off, will override for WM8994 */ - dapm->idle_bias_off = 1; + /* By default use idle_bias false, will override for WM8994 */ + dapm->idle_bias = false; /* Set revision-specific configuration */ switch (control->type) { @@ -4191,7 +4191,7 @@ static int wm8994_component_probe(struct snd_soc_component *component) /* Single ended line outputs should have VMID on. 
*/ if (!control->pdata.lineout1_diff || !control->pdata.lineout2_diff) - dapm->idle_bias_off = 0; + dapm->idle_bias = true; switch (control->revision) { case 2: diff --git a/sound/soc/intel/avs/boards/es8336.c b/sound/soc/intel/avs/boards/es8336.c index 12e4e0aba5fa24..eb2b40894e3f20 100644 --- a/sound/soc/intel/avs/boards/es8336.c +++ b/sound/soc/intel/avs/boards/es8336.c @@ -132,7 +132,7 @@ static int avs_es8336_codec_init(struct snd_soc_pcm_runtime *runtime) snd_jack_set_key(data->jack.jack, SND_JACK_BTN_0, KEY_PLAYPAUSE); snd_soc_component_set_jack(component, &data->jack, NULL); - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; return 0; } diff --git a/sound/soc/intel/avs/boards/rt274.c b/sound/soc/intel/avs/boards/rt274.c index 67d2c4585cddc4..4055ecc60838f3 100644 --- a/sound/soc/intel/avs/boards/rt274.c +++ b/sound/soc/intel/avs/boards/rt274.c @@ -117,7 +117,7 @@ static int avs_rt274_codec_init(struct snd_soc_pcm_runtime *runtime) return ret; } - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; return 0; } diff --git a/sound/soc/intel/avs/boards/rt5640.c b/sound/soc/intel/avs/boards/rt5640.c index 706b84ffe1ef02..97d1fa944188fa 100644 --- a/sound/soc/intel/avs/boards/rt5640.c +++ b/sound/soc/intel/avs/boards/rt5640.c @@ -67,7 +67,7 @@ static int avs_rt5640_codec_init(struct snd_soc_pcm_runtime *runtime) return ret; snd_soc_component_set_jack(codec_dai->component, jack, NULL); - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; return 0; } diff --git a/sound/soc/intel/boards/bytcht_cx2072x.c b/sound/soc/intel/boards/bytcht_cx2072x.c index 68a3d345dc25df..27b63a853a48e4 100644 --- a/sound/soc/intel/boards/bytcht_cx2072x.c +++ b/sound/soc/intel/boards/bytcht_cx2072x.c @@ -77,7 +77,7 @@ static int byt_cht_cx2072x_init(struct snd_soc_pcm_runtime *rtd) byt_cht_cx2072x_acpi_gpios)) dev_warn(rtd->dev, "Unable to add GPIO mapping table\n"); - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; /* set the default PLL rate, the clock is handled by the codec driver */ ret = snd_soc_dai_set_sysclk(snd_soc_rtd_to_codec(rtd, 0), CX2072X_MCLK_EXTERNAL_PLL, diff --git a/sound/soc/intel/boards/bytcht_es8316.c b/sound/soc/intel/boards/bytcht_es8316.c index b384d38654e658..3b5f63112237ea 100644 --- a/sound/soc/intel/boards/bytcht_es8316.c +++ b/sound/soc/intel/boards/bytcht_es8316.c @@ -179,7 +179,7 @@ static int byt_cht_es8316_init(struct snd_soc_pcm_runtime *runtime) int num_routes; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; switch (BYT_CHT_ES8316_MAP(quirk)) { case BYT_CHT_ES8316_INTMIC_IN1_MAP: diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index bc846558480e41..1e9b1903fae810 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -1324,7 +1324,7 @@ static int byt_rt5640_init(struct snd_soc_pcm_runtime *runtime) int num_routes = 0; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; jack_data->use_platform_clock = true; /* Start with RC clk for jack-detect (we disable MCLK below) */ diff --git a/sound/soc/intel/boards/bytcr_rt5651.c b/sound/soc/intel/boards/bytcr_rt5651.c index 604a35d380e9ab..ca540a66f22ce0 100644 --- a/sound/soc/intel/boards/bytcr_rt5651.c +++ b/sound/soc/intel/boards/bytcr_rt5651.c @@ -586,7 +586,7 @@ static int byt_rt5651_init(struct snd_soc_pcm_runtime *runtime) int report; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; /* Start with RC clk for 
jack-detect (we disable MCLK below) */ if (byt_rt5651_quirk & BYT_RT5651_MCLK_EN) diff --git a/sound/soc/intel/boards/bytcr_wm5102.c b/sound/soc/intel/boards/bytcr_wm5102.c index a6dfbcfdf74e31..72c0e5941ae840 100644 --- a/sound/soc/intel/boards/bytcr_wm5102.c +++ b/sound/soc/intel/boards/bytcr_wm5102.c @@ -288,7 +288,7 @@ static int byt_wm5102_init(struct snd_soc_pcm_runtime *runtime) const struct snd_soc_dapm_route *custom_map = NULL; int ret, jack_type, num_routes = 0; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; ret = snd_soc_add_card_controls(card, byt_wm5102_controls, ARRAY_SIZE(byt_wm5102_controls)); diff --git a/sound/soc/intel/boards/sof_es8336.c b/sound/soc/intel/boards/sof_es8336.c index 1211a2b8a2a2c7..10b189ea88dba8 100644 --- a/sound/soc/intel/boards/sof_es8336.c +++ b/sound/soc/intel/boards/sof_es8336.c @@ -276,7 +276,7 @@ static int sof_es8316_init(struct snd_soc_pcm_runtime *runtime) int num_routes; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; if (quirk & SOC_ES8336_HEADSET_MIC1) { custom_map = sof_es8316_headset_mic1_map; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index cc9125ffe92ac0..9dd84d73046be7 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -717,7 +717,7 @@ int snd_soc_suspend(struct device *dev) * means it's doing something, * otherwise fall through. */ - if (dapm->idle_bias_off) { + if (!dapm->idle_bias) { dev_dbg(component->dev, "ASoC: idle_bias_off CODEC on over suspend\n"); break; @@ -1652,7 +1652,7 @@ static int soc_probe_component(struct snd_soc_card *card, if (ret < 0) goto err_probe; - WARN(dapm->idle_bias_off && + WARN(!dapm->idle_bias && dapm->bias_level != SND_SOC_BIAS_OFF, "codec %s can not start from non-off bias with idle_bias_off==1\n", component->name); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index b90d0adb77132f..cbe945f1dc9cf1 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2180,23 +2180,23 @@ static void dapm_power_one_widget(struct snd_soc_dapm_widget *w, dapm_seq_insert(w, down_list, false); } -static bool dapm_idle_bias_off(struct snd_soc_dapm_context *dapm) +static bool dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) { struct snd_soc_component *component = snd_soc_dapm_to_component(dapm); - if (dapm->idle_bias_off) - return true; + if (!dapm->idle_bias) + return false; switch (snd_power_get_state(dapm->card->snd_card)) { case SNDRV_CTL_POWER_D3hot: case SNDRV_CTL_POWER_D3cold: if (component) - return component->driver->suspend_bias_off; + return !component->driver->suspend_bias_off; fallthrough; default: break; } - return false; + return true; } /* @@ -2224,10 +2224,10 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event, trace_snd_soc_dapm_start(card, event); for_each_card_dapms(card, d) { - if (dapm_idle_bias_off(d)) - d->target_bias_level = SND_SOC_BIAS_OFF; - else + if (dapm_get_idle_bias(d)) d->target_bias_level = SND_SOC_BIAS_STANDBY; + else + d->target_bias_level = SND_SOC_BIAS_OFF; } dapm_reset(card); @@ -2291,7 +2291,7 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event, if (d->target_bias_level > bias) bias = d->target_bias_level; for_each_card_dapms(card, d) - if (!dapm_idle_bias_off(d)) + if (dapm_get_idle_bias(d)) d->target_bias_level = bias; trace_snd_soc_dapm_walk_done(card); @@ -4825,7 +4825,7 @@ void snd_soc_dapm_init(struct snd_soc_dapm_context *dapm, if (component) { dapm->dev = component->dev; - dapm->idle_bias_off = !component->driver->idle_bias_on; + 
dapm->idle_bias = component->driver->idle_bias_on; } else { dapm->dev = card->dev; } diff --git a/sound/soc/sof/sof-client-probes.c b/sound/soc/sof/sof-client-probes.c index 3ca8460774bb4d..aaf0ae4bf01f8f 100644 --- a/sound/soc/sof/sof-client-probes.c +++ b/sound/soc/sof/sof-client-probes.c @@ -525,7 +525,7 @@ static int sof_probes_client_probe(struct auxiliary_device *auxdev, card->dai_link = links; /* set idle_bias_off to prevent the core from resuming the card->dev */ - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; snd_soc_card_set_drvdata(card, cdev); From 4b4fdc8b75a902eeb7441dff1876c2bb31c7715f Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:37 +0000 Subject: [PATCH 2169/3206] ASoC: soc-dapm: tidyup idle_bias handling - step2 Current dapm_get_idle_bias() is unnecessarily complicated and confusing. Tidy it up. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/873492x066.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/soc-dapm.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index cbe945f1dc9cf1..a263696ef8acc6 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2182,21 +2182,16 @@ static void dapm_power_one_widget(struct snd_soc_dapm_widget *w, static bool dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) { - struct snd_soc_component *component = snd_soc_dapm_to_component(dapm); - if (!dapm->idle_bias) - return false; + if (dapm->idle_bias) { + struct snd_soc_component *component = snd_soc_dapm_to_component(dapm); + unsigned int state = snd_power_get_state(dapm->card->snd_card); - switch (snd_power_get_state(dapm->card->snd_card)) { - case SNDRV_CTL_POWER_D3hot: - case SNDRV_CTL_POWER_D3cold: - if (component) + if ((state == SNDRV_CTL_POWER_D3hot || (state == SNDRV_CTL_POWER_D3cold)) && + component) return !component->driver->suspend_bias_off; - fallthrough; - default: - break; } - return true; + return dapm->idle_bias; } /* From 2e7f0a86123d54a94fa3d309efdfbac02f2999b8 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:41 +0000 Subject: [PATCH 2170/3206] ASoC: soc-dapm: add snd_soc_dapm_get_idle_bias() Because struct snd_soc_dapm_context is soc-dapm framework specific, user drivers don't need to access its members directly, and we would like to hide them. struct snd_soc_dapm_context will be removed from the header in the future. Many drivers directly use dapm->idle_bias, but they should get it via a get_idle_bias() function. Make it a global function. 
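A minimal sketch of the intended call style once the member is hidden (the call site is hypothetical):

    /* query the idle bias policy through the accessor, not dapm->idle_bias */
    if (snd_soc_dapm_get_idle_bias(dapm))
            target = SND_SOC_BIAS_STANDBY;	/* context idles at standby */
    else
            target = SND_SOC_BIAS_OFF;	/* context idles fully off */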
Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/871pomx062.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 2 ++ sound/soc/soc-dapm.c | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 9618a54a53485a..e978be4010b841 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -662,6 +662,8 @@ struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm); struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm); struct snd_soc_component *snd_soc_dapm_to_component(struct snd_soc_dapm_context *dapm); +bool snd_soc_dapm_get_idle_bias(struct snd_soc_dapm_context *dapm); + /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); void snd_soc_dapm_free(struct snd_soc_dapm_context *dapm); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index a263696ef8acc6..209da43bc023b9 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2180,7 +2180,7 @@ static void dapm_power_one_widget(struct snd_soc_dapm_widget *w, dapm_seq_insert(w, down_list, false); } -static bool dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) +bool snd_soc_dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) { if (dapm->idle_bias) { struct snd_soc_component *component = snd_soc_dapm_to_component(dapm); @@ -2193,6 +2193,7 @@ static bool dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) return dapm->idle_bias; } +EXPORT_SYMBOL_GPL(snd_soc_dapm_get_idle_bias); /* * Scan each dapm widget for complete audio path. @@ -2219,7 +2220,7 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event, trace_snd_soc_dapm_start(card, event); for_each_card_dapms(card, d) { - if (dapm_get_idle_bias(d)) + if (snd_soc_dapm_get_idle_bias(d)) d->target_bias_level = SND_SOC_BIAS_STANDBY; else d->target_bias_level = SND_SOC_BIAS_OFF; @@ -2286,7 +2287,7 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event, if (d->target_bias_level > bias) bias = d->target_bias_level; for_each_card_dapms(card, d) - if (dapm_get_idle_bias(d)) + if (snd_soc_dapm_get_idle_bias(d)) d->target_bias_level = bias; trace_snd_soc_dapm_walk_done(card); From cb3c715d89607f8896c0f20fe528a08e7ebffea9 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:45 +0000 Subject: [PATCH 2171/3206] ASoC: soc-dapm: add snd_soc_dapm_set_idle_bias() Because struct snd_soc_dapm_context is soc-dapm framework specific, user drivers don't need to access its members directly, and we would like to hide them. struct snd_soc_dapm_context will be removed from the header in the future. Many drivers directly set dapm->idle_bias, but that will soon become impossible. Add snd_soc_dapm_set_idle_bias() for them. 
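A migration sketch for machine drivers (the call site is hypothetical; it combines this setter with snd_soc_card_to_dapm() added earlier in this series):

    /* before: direct member access */
    card->dapm.idle_bias_off = true;

    /* after: accessors only, no knowledge of the struct layout */
    snd_soc_dapm_set_idle_bias(snd_soc_card_to_dapm(card), false);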
Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87zfbavllj.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 1 + sound/soc/soc-dapm.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index e978be4010b841..75941324886be3 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -663,6 +663,7 @@ struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm); struct snd_soc_component *snd_soc_dapm_to_component(struct snd_soc_dapm_context *dapm); bool snd_soc_dapm_get_idle_bias(struct snd_soc_dapm_context *dapm); +void snd_soc_dapm_set_idle_bias(struct snd_soc_dapm_context *dapm, bool on); /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 209da43bc023b9..51fb09d56c5a17 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2195,6 +2195,12 @@ bool snd_soc_dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) } EXPORT_SYMBOL_GPL(snd_soc_dapm_get_idle_bias); +void snd_soc_dapm_set_idle_bias(struct snd_soc_dapm_context *dapm, bool on) +{ + dapm->idle_bias = on; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_set_idle_bias); + /* * Scan each dapm widget for complete audio path. * A complete path is a route that has valid endpoints i.e.:- From 66a940b1bf48a7095162688332d725ba160154eb Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:42 +0100 Subject: [PATCH 2172/3206] ASoC: codecs: wcd937x: set the comp soundwire port correctly For some reason we ended up setting the SoundWire port for HPHL_COMP and HPHR_COMP to zero; this can potentially result in memory corruption due to accessing and setting the -1th element of the port_map array. 
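A sketch of the failure mode; the handler shape is illustrative, assuming the compander put callback derives the SoundWire port index from the control's reg field:

    struct soc_mixer_control *mc =
            (struct soc_mixer_control *)kcontrol->private_value;
    int portidx = mc->reg;	/* SND_SOC_NOPM == -1 before this fix */

    /* indexing the port map with -1 is an out-of-bounds access;
     * using WCD937X_COMP_L/R as reg yields a valid port index */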
Fixes: 82be8c62a38c ("ASoC: codecs: wcd937x: add basic controls") Cc: Stable@vger.kernel.org Signed-off-by: Srinivas Kandagatla Reviewed-by: Alexey Klimov Link: https://patch.msgid.link/20250909121954.225833-2-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd937x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wcd937x.c b/sound/soc/codecs/wcd937x.c index 3b0a8cc314e059..de2dff3c56d328 100644 --- a/sound/soc/codecs/wcd937x.c +++ b/sound/soc/codecs/wcd937x.c @@ -2046,9 +2046,9 @@ static const struct snd_kcontrol_new wcd937x_snd_controls[] = { SOC_ENUM_EXT("RX HPH Mode", rx_hph_mode_mux_enum, wcd937x_rx_hph_mode_get, wcd937x_rx_hph_mode_put), - SOC_SINGLE_EXT("HPHL_COMP Switch", SND_SOC_NOPM, 0, 1, 0, + SOC_SINGLE_EXT("HPHL_COMP Switch", WCD937X_COMP_L, 0, 1, 0, wcd937x_get_compander, wcd937x_set_compander), - SOC_SINGLE_EXT("HPHR_COMP Switch", SND_SOC_NOPM, 1, 1, 0, + SOC_SINGLE_EXT("HPHR_COMP Switch", WCD937X_COMP_R, 1, 1, 0, wcd937x_get_compander, wcd937x_set_compander), SOC_SINGLE_TLV("HPHL Volume", WCD937X_HPH_L_EN, 0, 20, 1, line_gain), From c4bb62eb594418a6bd05ff03bb9072ee1fef29c2 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:43 +0100 Subject: [PATCH 2173/3206] ASoC: codecs: wcd937x: make stub functions inline For some reason we ended up with stub functions that are not inline; this can result in a build error if the header is included in multiple places, as we will be redefining the same function. Fixes: c99a515ff153 ("ASoC: codecs: wcd937x-sdw: add SoundWire driver") Cc: Stable@vger.kernel.org Signed-off-by: Srinivas Kandagatla Link: https://patch.msgid.link/20250909121954.225833-3-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd937x.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/wcd937x.h b/sound/soc/codecs/wcd937x.h index 3ab21bb5846e2c..d20886a2803a4c 100644 --- a/sound/soc/codecs/wcd937x.h +++ b/sound/soc/codecs/wcd937x.h @@ -552,21 +552,21 @@ int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, struct device *wcd937x_sdw_device_get(struct device_node *np); #else -int wcd937x_sdw_free(struct wcd937x_sdw_priv *wcd, +static inline int wcd937x_sdw_free(struct wcd937x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { return -EOPNOTSUPP; } -int wcd937x_sdw_set_sdw_stream(struct wcd937x_sdw_priv *wcd, +static inline int wcd937x_sdw_set_sdw_stream(struct wcd937x_sdw_priv *wcd, struct snd_soc_dai *dai, void *stream, int direction) { return -EOPNOTSUPP; } -int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, +static inline int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) From 76cffc3eb1bdee0a7e8cca090adfd46a740f1cb0 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:44 +0100 Subject: [PATCH 2174/3206] soundwire: bus: add of_sdw_find_device_by_node helper There have been more than three instances of this helper in multiple codec drivers; it does not make sense to keep duplicating this piece of code. Let's add an of_sdw_find_device_by_node() helper for codec drivers to use. 
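Typical codec usage of the new helper (a sketch; the wcdxxxx conversion later in this series does exactly this):

    wcd->rxdev = of_sdw_find_device_by_node(wcd->rxnode);
    if (!wcd->rxdev)
            return -EINVAL;	/* no SoundWire slave matches this OF node */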
Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Acked-by: Vinod Koul Link: https://patch.msgid.link/20250909121954.225833-4-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- drivers/soundwire/slave.c | 6 ++++++ include/linux/soundwire/sdw.h | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c index d2d99555ec5a50..3d4d00188c26cc 100644 --- a/drivers/soundwire/slave.c +++ b/drivers/soundwire/slave.c @@ -273,4 +273,10 @@ int sdw_of_find_slaves(struct sdw_bus *bus) return 0; } +struct device *of_sdw_find_device_by_node(struct device_node *np) +{ + return bus_find_device_by_of_node(&sdw_bus_type, np); +} +EXPORT_SYMBOL_GPL(of_sdw_find_device_by_node); + MODULE_IMPORT_NS("SND_SOC_SDCA"); diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 0832776262ac3b..096213956d311c 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -19,6 +19,7 @@ struct dentry; struct fwnode_handle; +struct device_node; struct sdw_bus; struct sdw_slave; @@ -1086,6 +1087,8 @@ int sdw_stream_add_slave(struct sdw_slave *slave, int sdw_stream_remove_slave(struct sdw_slave *slave, struct sdw_stream_runtime *stream); +struct device *of_sdw_find_device_by_node(struct device_node *np); + int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base); /* messaging and data APIs */ @@ -1119,6 +1122,12 @@ static inline int sdw_stream_remove_slave(struct sdw_slave *slave, return -EINVAL; } +static inline struct device *of_sdw_find_device_by_node(struct device_node *np) +{ + WARN_ONCE(1, "SoundWire API is disabled"); + return NULL; +} + /* messaging and data APIs */ static inline int sdw_read(struct sdw_slave *slave, u32 addr) { From 2e07017b28e8bbace4a4973d11d0646575d36f94 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:45 +0100 Subject: [PATCH 2175/3206] soundwire: bus: add sdw_slave_get_current_bank helper There have been two instances of this helper in codec drivers; it does not make sense to keep duplicating this piece of code. Let's add an sdw_slave_get_current_bank() helper for codec drivers to use. 
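A usage sketch (hypothetical codec code; note the helper returns a negative errno if the SDW_SCP_CTRL read fails):

    int bank = sdw_slave_get_current_bank(wcd->sdev);
    if (bank < 0)
            return bank;	/* propagate the bus error */
    /* program the register copy selected by 'bank' (0 or 1) */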
Signed-off-by: Srinivas Kandagatla Acked-by: Vinod Koul Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-5-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- drivers/soundwire/bus.c | 12 ++++++++++++ include/linux/soundwire/sdw.h | 8 ++++++++ 2 files changed, 20 insertions(+) diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 4fd5cac799c547..55c1db81653400 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -1360,6 +1360,18 @@ int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base) } EXPORT_SYMBOL(sdw_slave_get_scale_index); +int sdw_slave_get_current_bank(struct sdw_slave *slave) +{ + int tmp; + + tmp = sdw_read(slave, SDW_SCP_CTRL); + if (tmp < 0) + return tmp; + + return FIELD_GET(SDW_SCP_STAT_CURR_BANK, tmp); +} +EXPORT_SYMBOL_GPL(sdw_slave_get_current_bank); + static int sdw_slave_set_frequency(struct sdw_slave *slave) { int scale_index; diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 096213956d311c..e6a3476bcef1ae 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1089,6 +1089,8 @@ int sdw_stream_remove_slave(struct sdw_slave *slave, struct device *of_sdw_find_device_by_node(struct device_node *np); +int sdw_slave_get_current_bank(struct sdw_slave *sdev); + int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base); /* messaging and data APIs */ @@ -1128,6 +1130,12 @@ static inline struct device *of_sdw_find_device_by_node(struct device_node *np) return NULL; } +static inline int sdw_slave_get_current_bank(struct sdw_slave *sdev) +{ + WARN_ONCE(1, "SoundWire API is disabled"); + return -EINVAL; +} + /* messaging and data APIs */ static inline int sdw_read(struct sdw_slave *slave, u32 addr) { From 772ed12bd04e6e6ad6d3fbc34016a2f88e63af7d Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:46 +0100 Subject: [PATCH 2176/3206] ASoC: codecs: wcdxxxx: use of_sdw_find_device_by_node helper Use the of_sdw_find_device_by_node() helper function rather than duplicating it in every codec driver. 
Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-6-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd937x-sdw.c | 6 ------ sound/soc/codecs/wcd937x.c | 4 ++-- sound/soc/codecs/wcd937x.h | 2 -- sound/soc/codecs/wcd938x-sdw.c | 7 ------- sound/soc/codecs/wcd938x.c | 4 ++-- sound/soc/codecs/wcd938x.h | 6 ------ sound/soc/codecs/wcd939x-sdw.c | 6 ------ sound/soc/codecs/wcd939x.c | 4 ++-- sound/soc/codecs/wcd939x.h | 6 ------ 9 files changed, 6 insertions(+), 39 deletions(-) diff --git a/sound/soc/codecs/wcd937x-sdw.c b/sound/soc/codecs/wcd937x-sdw.c index 1bfe7383b31117..e7cc699bd8bcf1 100644 --- a/sound/soc/codecs/wcd937x-sdw.c +++ b/sound/soc/codecs/wcd937x-sdw.c @@ -78,12 +78,6 @@ static struct sdw_dpn_prop wcd937x_dpn_prop[WCD937X_MAX_SWR_PORTS] = { } }; -struct device *wcd937x_sdw_device_get(struct device_node *np) -{ - return bus_find_device_by_of_node(&sdw_bus_type, np); -} -EXPORT_SYMBOL_GPL(wcd937x_sdw_device_get); - int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, diff --git a/sound/soc/codecs/wcd937x.c b/sound/soc/codecs/wcd937x.c index de2dff3c56d328..b78f37c582cade 100644 --- a/sound/soc/codecs/wcd937x.c +++ b/sound/soc/codecs/wcd937x.c @@ -2788,7 +2788,7 @@ static int wcd937x_bind(struct device *dev) return ret; } - wcd937x->rxdev = wcd937x_sdw_device_get(wcd937x->rxnode); + wcd937x->rxdev = of_sdw_find_device_by_node(wcd937x->rxnode); if (!wcd937x->rxdev) { dev_err(dev, "could not find slave with matching of node\n"); return -EINVAL; @@ -2797,7 +2797,7 @@ static int wcd937x_bind(struct device *dev) wcd937x->sdw_priv[AIF1_PB] = dev_get_drvdata(wcd937x->rxdev); wcd937x->sdw_priv[AIF1_PB]->wcd937x = wcd937x; - wcd937x->txdev = wcd937x_sdw_device_get(wcd937x->txnode); + wcd937x->txdev = of_sdw_find_device_by_node(wcd937x->txnode); if (!wcd937x->txdev) { dev_err(dev, "could not find txslave with matching of node\n"); return -EINVAL; diff --git a/sound/soc/codecs/wcd937x.h b/sound/soc/codecs/wcd937x.h index d20886a2803a4c..0f96b7108a7ebe 100644 --- a/sound/soc/codecs/wcd937x.h +++ b/sound/soc/codecs/wcd937x.h @@ -549,8 +549,6 @@ int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); -struct device *wcd937x_sdw_device_get(struct device_node *np); - #else static inline int wcd937x_sdw_free(struct wcd937x_sdw_priv *wcd, struct snd_pcm_substream *substream, diff --git a/sound/soc/codecs/wcd938x-sdw.c b/sound/soc/codecs/wcd938x-sdw.c index e822cc14525066..a7514d716086ca 100644 --- a/sound/soc/codecs/wcd938x-sdw.c +++ b/sound/soc/codecs/wcd938x-sdw.c @@ -82,13 +82,6 @@ static struct sdw_dpn_prop wcd938x_dpn_prop[WCD938X_MAX_SWR_PORTS] = { } }; -struct device *wcd938x_sdw_device_get(struct device_node *np) -{ - return bus_find_device_by_of_node(&sdw_bus_type, np); - -} -EXPORT_SYMBOL_GPL(wcd938x_sdw_device_get); - int wcd938x_swr_get_current_bank(struct sdw_slave *sdev) { int bank; diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index 711f373ece24cf..e2cb0758bca790 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -3400,7 +3400,7 @@ static int wcd938x_bind(struct device *dev) return ret; } - wcd938x->rxdev = wcd938x_sdw_device_get(wcd938x->rxnode); + wcd938x->rxdev = of_sdw_find_device_by_node(wcd938x->rxnode); if (!wcd938x->rxdev) { dev_err(dev, "could not find slave with matching of node\n"); ret = 
-EINVAL; @@ -3409,7 +3409,7 @@ static int wcd938x_bind(struct device *dev) wcd938x->sdw_priv[AIF1_PB] = dev_get_drvdata(wcd938x->rxdev); wcd938x->sdw_priv[AIF1_PB]->wcd938x = wcd938x; - wcd938x->txdev = wcd938x_sdw_device_get(wcd938x->txnode); + wcd938x->txdev = of_sdw_find_device_by_node(wcd938x->txnode); if (!wcd938x->txdev) { dev_err(dev, "could not find txslave with matching of node\n"); ret = -EINVAL; diff --git a/sound/soc/codecs/wcd938x.h b/sound/soc/codecs/wcd938x.h index fb6a0e4ef33774..dbafcae247f4fb 100644 --- a/sound/soc/codecs/wcd938x.h +++ b/sound/soc/codecs/wcd938x.h @@ -670,7 +670,6 @@ int wcd938x_sdw_hw_params(struct wcd938x_sdw_priv *wcd, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); -struct device *wcd938x_sdw_device_get(struct device_node *np); int wcd938x_swr_get_current_bank(struct sdw_slave *sdev); #else @@ -697,11 +696,6 @@ static inline int wcd938x_sdw_hw_params(struct wcd938x_sdw_priv *wcd, return -EOPNOTSUPP; } -static inline struct device *wcd938x_sdw_device_get(struct device_node *np) -{ - return NULL; -} - static inline int wcd938x_swr_get_current_bank(struct sdw_slave *sdev) { return 0; diff --git a/sound/soc/codecs/wcd939x-sdw.c b/sound/soc/codecs/wcd939x-sdw.c index f7a9323a9feadb..e487a1bb0194e1 100644 --- a/sound/soc/codecs/wcd939x-sdw.c +++ b/sound/soc/codecs/wcd939x-sdw.c @@ -128,12 +128,6 @@ static struct sdw_dpn_prop wcd939x_tx_dpn_prop[WCD939X_MAX_TX_SWR_PORTS] = { } }; -struct device *wcd939x_sdw_device_get(struct device_node *np) -{ - return bus_find_device_by_of_node(&sdw_bus_type, np); -} -EXPORT_SYMBOL_GPL(wcd939x_sdw_device_get); - unsigned int wcd939x_swr_get_current_bank(struct sdw_slave *sdev) { return FIELD_GET(SDW_SCP_STAT_CURR_BANK, diff --git a/sound/soc/codecs/wcd939x.c b/sound/soc/codecs/wcd939x.c index 64f082e474c1d4..5a56c79a892277 100644 --- a/sound/soc/codecs/wcd939x.c +++ b/sound/soc/codecs/wcd939x.c @@ -3383,7 +3383,7 @@ static int wcd939x_bind(struct device *dev) goto err_put_typec_switch; } - wcd939x->rxdev = wcd939x_sdw_device_get(wcd939x->rxnode); + wcd939x->rxdev = of_sdw_find_device_by_node(wcd939x->rxnode); if (!wcd939x->rxdev) { dev_err(dev, "could not find slave with matching of node\n"); ret = -EINVAL; @@ -3392,7 +3392,7 @@ static int wcd939x_bind(struct device *dev) wcd939x->sdw_priv[AIF1_PB] = dev_get_drvdata(wcd939x->rxdev); wcd939x->sdw_priv[AIF1_PB]->wcd939x = wcd939x; - wcd939x->txdev = wcd939x_sdw_device_get(wcd939x->txnode); + wcd939x->txdev = of_sdw_find_device_by_node(wcd939x->txnode); if (!wcd939x->txdev) { dev_err(dev, "could not find txslave with matching of node\n"); ret = -EINVAL; diff --git a/sound/soc/codecs/wcd939x.h b/sound/soc/codecs/wcd939x.h index 3204fb10b58d7d..3f189e5cafd517 100644 --- a/sound/soc/codecs/wcd939x.h +++ b/sound/soc/codecs/wcd939x.h @@ -930,7 +930,6 @@ int wcd939x_sdw_hw_params(struct wcd939x_sdw_priv *wcd, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); -struct device *wcd939x_sdw_device_get(struct device_node *np); unsigned int wcd939x_swr_get_current_bank(struct sdw_slave *sdev); struct regmap *wcd939x_swr_get_regmap(struct wcd939x_sdw_priv *wcd); @@ -958,11 +957,6 @@ static inline int wcd939x_sdw_hw_params(struct wcd939x_sdw_priv *wcd, return -EOPNOTSUPP; } -static inline struct device *wcd939x_sdw_device_get(struct device_node *np) -{ - return NULL; -} - static inline unsigned int wcd939x_swr_get_current_bank(struct sdw_slave *sdev) { return 0; From 45a3295a3005f7782054a153312ba81d28eb7664 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 
9 Sep 2025 13:19:47 +0100 Subject: [PATCH 2177/3206] ASoC: codecs: wcdxxxx: use sdw_slave_get_current_bank helper use sdw_slave_get_current_bank() helper function, rather than duplicating this function in every codec driver. Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Acked-by: Vinod Koul Link: https://patch.msgid.link/20250909121954.225833-7-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd938x-sdw.c | 10 ---------- sound/soc/codecs/wcd938x.c | 3 +-- sound/soc/codecs/wcd938x.h | 7 ------- sound/soc/codecs/wcd939x-sdw.c | 7 ------- sound/soc/codecs/wcd939x.c | 2 +- sound/soc/codecs/wcd939x.h | 7 ------- 6 files changed, 2 insertions(+), 34 deletions(-) diff --git a/sound/soc/codecs/wcd938x-sdw.c b/sound/soc/codecs/wcd938x-sdw.c index a7514d716086ca..8bcd8396f3750a 100644 --- a/sound/soc/codecs/wcd938x-sdw.c +++ b/sound/soc/codecs/wcd938x-sdw.c @@ -82,16 +82,6 @@ static struct sdw_dpn_prop wcd938x_dpn_prop[WCD938X_MAX_SWR_PORTS] = { } }; -int wcd938x_swr_get_current_bank(struct sdw_slave *sdev) -{ - int bank; - - bank = sdw_read(sdev, SDW_SCP_CTRL); - - return ((bank & 0x40) ? 1 : 0); -} -EXPORT_SYMBOL_GPL(wcd938x_swr_get_current_bank); - int wcd938x_sdw_hw_params(struct wcd938x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index e2cb0758bca790..f8d7bf27a6edbc 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -1094,8 +1094,7 @@ static int wcd938x_tx_swr_ctrl(struct snd_soc_dapm_widget *w, int bank; int rate; - bank = (wcd938x_swr_get_current_bank(wcd938x->sdw_priv[AIF1_CAP]->sdev)) ? 0 : 1; - bank = bank ? 0 : 1; + bank = sdw_slave_get_current_bank(wcd938x->sdw_priv[AIF1_CAP]->sdev); switch (event) { case SND_SOC_DAPM_PRE_PMU: diff --git a/sound/soc/codecs/wcd938x.h b/sound/soc/codecs/wcd938x.h index dbafcae247f4fb..54ee56b7fbd6ab 100644 --- a/sound/soc/codecs/wcd938x.h +++ b/sound/soc/codecs/wcd938x.h @@ -669,9 +669,6 @@ int wcd938x_sdw_hw_params(struct wcd938x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); - -int wcd938x_swr_get_current_bank(struct sdw_slave *sdev); - #else static inline int wcd938x_sdw_free(struct wcd938x_sdw_priv *wcd, @@ -696,9 +693,5 @@ static inline int wcd938x_sdw_hw_params(struct wcd938x_sdw_priv *wcd, return -EOPNOTSUPP; } -static inline int wcd938x_swr_get_current_bank(struct sdw_slave *sdev) -{ - return 0; -} #endif /* CONFIG_SND_SOC_WCD938X_SDW */ #endif /* __WCD938X_H__ */ diff --git a/sound/soc/codecs/wcd939x-sdw.c b/sound/soc/codecs/wcd939x-sdw.c index e487a1bb0194e1..477d6cf27d32c3 100644 --- a/sound/soc/codecs/wcd939x-sdw.c +++ b/sound/soc/codecs/wcd939x-sdw.c @@ -128,13 +128,6 @@ static struct sdw_dpn_prop wcd939x_tx_dpn_prop[WCD939X_MAX_TX_SWR_PORTS] = { } }; -unsigned int wcd939x_swr_get_current_bank(struct sdw_slave *sdev) -{ - return FIELD_GET(SDW_SCP_STAT_CURR_BANK, - sdw_read(sdev, SDW_SCP_CTRL)); -} -EXPORT_SYMBOL_GPL(wcd939x_swr_get_current_bank); - int wcd939x_sdw_hw_params(struct wcd939x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, diff --git a/sound/soc/codecs/wcd939x.c b/sound/soc/codecs/wcd939x.c index 5a56c79a892277..85730ae40c2c24 100644 --- a/sound/soc/codecs/wcd939x.c +++ b/sound/soc/codecs/wcd939x.c @@ -1017,7 +1017,7 @@ static int wcd939x_tx_swr_ctrl(struct snd_soc_dapm_widget *w, int bank; int rate; - bank = 
wcd939x_swr_get_current_bank(wcd939x->sdw_priv[AIF1_CAP]->sdev); + bank = sdw_slave_get_current_bank(wcd939x->sdw_priv[AIF1_CAP]->sdev); switch (event) { case SND_SOC_DAPM_PRE_PMU: diff --git a/sound/soc/codecs/wcd939x.h b/sound/soc/codecs/wcd939x.h index 3f189e5cafd517..e70445b1a4bc8a 100644 --- a/sound/soc/codecs/wcd939x.h +++ b/sound/soc/codecs/wcd939x.h @@ -930,8 +930,6 @@ int wcd939x_sdw_hw_params(struct wcd939x_sdw_priv *wcd, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); -unsigned int wcd939x_swr_get_current_bank(struct sdw_slave *sdev); - struct regmap *wcd939x_swr_get_regmap(struct wcd939x_sdw_priv *wcd); #else @@ -957,11 +955,6 @@ static inline int wcd939x_sdw_hw_params(struct wcd939x_sdw_priv *wcd, return -EOPNOTSUPP; } -static inline unsigned int wcd939x_swr_get_current_bank(struct sdw_slave *sdev) -{ - return 0; -} - struct regmap *wcd939x_swr_get_regmap(struct wcd939x_sdw_priv *wcd) { return PTR_ERR(-EINVAL); From 4f16b6351bbff629e1a2a9d902b96210a50d65f0 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:48 +0100 Subject: [PATCH 2178/3206] ASoC: codecs: wcd: add common helper for wcd codecs All the Qualcomm WCD codecs have most of its code duplicated across all these 3/4 drivers. This is an attempt to remove those duplicate parts by adding a common helper library for these codecs. To start with move all the micbias parsing and voltage settings these are identical in WCD934x, WCD937x, WCD938x and WCD939x codec driver. Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-8-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/Kconfig | 7 +++ sound/soc/codecs/Makefile | 2 + sound/soc/codecs/wcd-common.c | 70 +++++++++++++++++++++++++++ sound/soc/codecs/wcd-common.h | 27 +++++++++++ sound/soc/codecs/wcd934x.c | 82 +++++++++---------------------- sound/soc/codecs/wcd937x.c | 81 ++++++++----------------------- sound/soc/codecs/wcd938x.c | 88 +++++++--------------------------- sound/soc/codecs/wcd939x.c | 90 +++++++---------------------------- 8 files changed, 182 insertions(+), 265 deletions(-) create mode 100644 sound/soc/codecs/wcd-common.c create mode 100644 sound/soc/codecs/wcd-common.h diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 6d7e4725d89cd3..3f11b2c9cce880 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -2233,6 +2233,9 @@ config SND_SOC_UDA1380 config SND_SOC_WCD_CLASSH tristate +config SND_SOC_WCD_COMMON + tristate + config SND_SOC_WCD9335 tristate "WCD9335 Codec" depends on SLIMBUS @@ -2254,6 +2257,7 @@ config SND_SOC_WCD934X select REGMAP_IRQ select REGMAP_SLIMBUS select SND_SOC_WCD_CLASSH + select SND_SOC_WCD_COMMON select SND_SOC_WCD_MBHC depends on MFD_WCD934X || COMPILE_TEST help @@ -2265,6 +2269,7 @@ config SND_SOC_WCD937X tristate depends on SOUNDWIRE || !SOUNDWIRE select SND_SOC_WCD_CLASSH + select SND_SOC_WCD_COMMON config SND_SOC_WCD937X_SDW tristate "WCD9370/WCD9375 Codec - SDW" @@ -2284,6 +2289,7 @@ config SND_SOC_WCD938X tristate depends on SOUNDWIRE || !SOUNDWIRE select SND_SOC_WCD_CLASSH + select SND_SOC_WCD_COMMON select MULTIPLEXER config SND_SOC_WCD938X_SDW @@ -2303,6 +2309,7 @@ config SND_SOC_WCD939X depends on SOUNDWIRE || !SOUNDWIRE depends on TYPEC || !TYPEC select SND_SOC_WCD_CLASSH + select SND_SOC_WCD_COMMON config SND_SOC_WCD939X_SDW tristate "WCD9390/WCD9395 Codec - SDW" diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index 
a68c3d192a1b6c..79e32f319cabca 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -339,6 +339,7 @@ snd-soc-uda1334-y := uda1334.o snd-soc-uda1342-y := uda1342.o snd-soc-uda1380-y := uda1380.o snd-soc-wcd-classh-y := wcd-clsh-v2.o +snd-soc-wcd-common-y := wcd-common.o snd-soc-wcd-mbhc-y := wcd-mbhc-v2.o snd-soc-wcd9335-y := wcd9335.o snd-soc-wcd934x-y := wcd934x.o @@ -761,6 +762,7 @@ obj-$(CONFIG_SND_SOC_UDA1334) += snd-soc-uda1334.o obj-$(CONFIG_SND_SOC_UDA1342) += snd-soc-uda1342.o obj-$(CONFIG_SND_SOC_UDA1380) += snd-soc-uda1380.o obj-$(CONFIG_SND_SOC_WCD_CLASSH) += snd-soc-wcd-classh.o +obj-$(CONFIG_SND_SOC_WCD_COMMON) += snd-soc-wcd-common.o obj-$(CONFIG_SND_SOC_WCD_MBHC) += snd-soc-wcd-mbhc.o obj-$(CONFIG_SND_SOC_WCD9335) += snd-soc-wcd9335.o obj-$(CONFIG_SND_SOC_WCD934X) += snd-soc-wcd934x.o diff --git a/sound/soc/codecs/wcd-common.c b/sound/soc/codecs/wcd-common.c new file mode 100644 index 00000000000000..8f3c0806cdc92c --- /dev/null +++ b/sound/soc/codecs/wcd-common.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2025, Qualcomm Technologies, Inc. and/or its subsidiaries. + +#include +#include +#include +#include +#include +#include + +#include "wcd-common.h" + +#define WCD_MIN_MICBIAS_MV 1000 +#define WCD_DEF_MICBIAS_MV 1800 +#define WCD_MAX_MICBIAS_MV 2850 + +int wcd_get_micb_vout_ctl_val(struct device *dev, u32 micb_mv) +{ + /* min micbias voltage is 1V and maximum is 2.85V */ + if (micb_mv < WCD_MIN_MICBIAS_MV || micb_mv > WCD_MAX_MICBIAS_MV) { + dev_err(dev, "Unsupported micbias voltage (%u mV)\n", micb_mv); + return -EINVAL; + } + + return (micb_mv - WCD_MIN_MICBIAS_MV) / 50; +} +EXPORT_SYMBOL_GPL(wcd_get_micb_vout_ctl_val); + +static int wcd_get_micbias_val(struct device *dev, int micb_num, u32 *micb_mv) +{ + char micbias[64]; + int mv; + + sprintf(micbias, "qcom,micbias%d-microvolt", micb_num); + + if (of_property_read_u32(dev->of_node, micbias, &mv)) { + dev_err(dev, "%s value not found, using default\n", micbias); + mv = WCD_DEF_MICBIAS_MV; + } else { + /* convert it to milli volts */ + mv = mv/1000; + } + if (micb_mv) + *micb_mv = mv; + + mv = wcd_get_micb_vout_ctl_val(dev, mv); + if (mv < 0) { + dev_err(dev, "Unsupported %s voltage (%d mV), falling back to default (%d mV)\n", + micbias, mv, WCD_DEF_MICBIAS_MV); + return wcd_get_micb_vout_ctl_val(dev, WCD_DEF_MICBIAS_MV); + } + + return mv; +} + +int wcd_dt_parse_micbias_info(struct wcd_common *common) +{ + int i; + + for (i = 0; i < common->max_bias; i++) { + common->micb_vout[i] = wcd_get_micbias_val(common->dev, i + 1, &common->micb_mv[i]); + if (common->micb_vout[i] < 0) + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(wcd_dt_parse_micbias_info); +MODULE_DESCRIPTION("Common Qualcomm WCD Codec helpers driver"); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/wcd-common.h b/sound/soc/codecs/wcd-common.h new file mode 100644 index 00000000000000..611f06cdec550d --- /dev/null +++ b/sound/soc/codecs/wcd-common.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025, Qualcomm Technologies, Inc. and/or its subsidiaries. 
+ */ + +#ifndef __WCD_COMMON_H__ +#define __WCD_COMMON_H__ + +struct device; +struct sdw_slave; +struct sdw_bus_params; +struct irq_domain; +enum sdw_slave_status; + +#define WCD_MAX_MICBIAS 4 + +struct wcd_common { + struct device *dev; + int max_bias; + u32 micb_mv[WCD_MAX_MICBIAS]; + u32 micb_vout[WCD_MAX_MICBIAS]; +}; + +int wcd_get_micb_vout_ctl_val(struct device *dev, u32 micb_mv); +int wcd_dt_parse_micbias_info(struct wcd_common *common); + +#endif /* __WCD_COMMON_H__ */ diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c index 1bb7e1dc7e6b0a..d10b457e6c9ac4 100644 --- a/sound/soc/codecs/wcd934x.c +++ b/sound/soc/codecs/wcd934x.c @@ -21,6 +21,7 @@ #include #include #include "wcd-clsh-v2.h" +#include "wcd-common.h" #include "wcd-mbhc-v2.h" #include @@ -116,9 +117,6 @@ #define WCD934X_DEC_PWR_LVL_DF 0x00 #define WCD934X_DEC_PWR_LVL_HYBRID WCD934X_DEC_PWR_LVL_DF -#define WCD934X_DEF_MICBIAS_MV 1800 -#define WCD934X_MAX_MICBIAS_MV 2850 - #define WCD_IIR_FILTER_SIZE (sizeof(u32) * BAND_MAX) #define WCD_IIR_FILTER_CTL(xname, iidx, bidx) \ @@ -530,6 +528,7 @@ struct wcd934x_codec { struct slim_device *sdev; struct slim_device *sidev; struct wcd_clsh_ctrl *clsh_ctrl; + struct wcd_common common; struct snd_soc_component *component; struct wcd934x_slim_ch rx_chs[WCD934X_RX_MAX]; struct wcd934x_slim_ch tx_chs[WCD934X_TX_MAX]; @@ -555,7 +554,6 @@ struct wcd934x_codec { struct mutex micb_lock; u32 micb_ref[WCD934X_MAX_MICBIAS]; u32 pullup_ref[WCD934X_MAX_MICBIAS]; - u32 micb2_mv; }; #define to_wcd934x_codec(_hw) container_of(_hw, struct wcd934x_codec, hw) @@ -2168,55 +2166,24 @@ static struct clk *wcd934x_register_mclk_output(struct wcd934x_codec *wcd) return NULL; } -static int wcd934x_get_micbias_val(struct device *dev, const char *micbias, - u32 *micb_mv) -{ - int mv; - - if (of_property_read_u32(dev->parent->of_node, micbias, &mv)) { - dev_err(dev, "%s value not found, using default\n", micbias); - mv = WCD934X_DEF_MICBIAS_MV; - } else { - /* convert it to milli volts */ - mv = mv/1000; - } - - if (mv < 1000 || mv > 2850) { - dev_err(dev, "%s value not in valid range, using default\n", - micbias); - mv = WCD934X_DEF_MICBIAS_MV; - } - - if (micb_mv) - *micb_mv = mv; - - return (mv - 1000) / 50; -} - static int wcd934x_init_dmic(struct snd_soc_component *comp) { - int vout_ctl_1, vout_ctl_2, vout_ctl_3, vout_ctl_4; struct wcd934x_codec *wcd = dev_get_drvdata(comp->dev); u32 def_dmic_rate, dmic_clk_drv; + int ret; - vout_ctl_1 = wcd934x_get_micbias_val(comp->dev, - "qcom,micbias1-microvolt", NULL); - vout_ctl_2 = wcd934x_get_micbias_val(comp->dev, - "qcom,micbias2-microvolt", - &wcd->micb2_mv); - vout_ctl_3 = wcd934x_get_micbias_val(comp->dev, - "qcom,micbias3-microvolt", NULL); - vout_ctl_4 = wcd934x_get_micbias_val(comp->dev, - "qcom,micbias4-microvolt", NULL); + ret = wcd_dt_parse_mbhc_data(comp->dev, &wcd->mbhc_cfg); + if (ret) + return ret; snd_soc_component_update_bits(comp, WCD934X_ANA_MICB1, - WCD934X_MICB_VAL_MASK, vout_ctl_1); + WCD934X_MICB_VAL_MASK, wcd->common.micb_vout[0]); snd_soc_component_update_bits(comp, WCD934X_ANA_MICB2, - WCD934X_MICB_VAL_MASK, vout_ctl_2); + WCD934X_MICB_VAL_MASK, wcd->common.micb_vout[1]); snd_soc_component_update_bits(comp, WCD934X_ANA_MICB3, - WCD934X_MICB_VAL_MASK, vout_ctl_3); + WCD934X_MICB_VAL_MASK, wcd->common.micb_vout[2]); snd_soc_component_update_bits(comp, WCD934X_ANA_MICB4, - WCD934X_MICB_VAL_MASK, vout_ctl_4); + WCD934X_MICB_VAL_MASK, wcd->common.micb_vout[3]); if (wcd->rate == WCD934X_MCLK_CLK_9P6MHZ) def_dmic_rate = 
WCD9XXX_DMIC_SAMPLE_RATE_4P8MHZ; @@ -2517,15 +2484,6 @@ static void wcd934x_mbhc_micb_ramp_control(struct snd_soc_component *component, } } -static int wcd934x_get_micb_vout_ctl_val(u32 micb_mv) -{ - /* min micbias voltage is 1V and maximum is 2.85V */ - if (micb_mv < 1000 || micb_mv > 2850) - return -EINVAL; - - return (micb_mv - 1000) / 50; -} - static int wcd934x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, int req_volt, int micb_num) { @@ -2562,7 +2520,7 @@ static int wcd934x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, cur_vout_ctl = snd_soc_component_read_field(component, micb_reg, WCD934X_MICB_VAL_MASK); - req_vout_ctl = wcd934x_get_micb_vout_ctl_val(req_volt); + req_vout_ctl = wcd_get_micb_vout_ctl_val(component->dev, req_volt); if (req_vout_ctl < 0) { ret = -EINVAL; goto exit; @@ -2610,10 +2568,10 @@ static int wcd934x_mbhc_micb_ctrl_threshold_mic(struct snd_soc_component *compon * voltage needed to detect threshold microphone, then do * not change the micbias, just return. */ - if (wcd934x->micb2_mv >= WCD_MBHC_THR_HS_MICB_MV) + if (wcd934x->common.micb_mv[1] >= WCD_MBHC_THR_HS_MICB_MV) return 0; - micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd934x->micb2_mv; + micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd934x->common.micb_mv[1]; rc = wcd934x_mbhc_micb_adjust_voltage(component, micb_mv, MIC_BIAS_2); @@ -3036,7 +2994,7 @@ static void wcd934x_mbhc_deinit(struct snd_soc_component *component) static int wcd934x_comp_probe(struct snd_soc_component *component) { struct wcd934x_codec *wcd = dev_get_drvdata(component->dev); - int i; + int i, ret; snd_soc_component_init_regmap(component, wcd->regmap); wcd->component = component; @@ -3054,7 +3012,12 @@ static int wcd934x_comp_probe(struct snd_soc_component *component) for (i = 0; i < NUM_CODEC_DAIS; i++) INIT_LIST_HEAD(&wcd->dai[i].slim_ch_list); - wcd934x_init_dmic(component); + + ret = wcd934x_init_dmic(component); + if (ret) { + dev_err(component->dev, "Failed to Initialize micbias\n"); + return ret; + } if (wcd934x_mbhc_init(component)) dev_err(component->dev, "Failed to Initialize MBHC\n"); @@ -5860,14 +5823,13 @@ static int wcd934x_codec_parse_data(struct wcd934x_codec *wcd) cfg->anc_micbias = MIC_BIAS_2; cfg->v_hs_max = WCD_MBHC_HS_V_MAX; cfg->num_btn = WCD934X_MBHC_MAX_BUTTONS; - cfg->micb_mv = wcd->micb2_mv; + cfg->micb_mv = wcd->common.micb_mv[1]; cfg->linein_th = 5000; cfg->hs_thr = 1700; cfg->hph_thr = 50; wcd_dt_parse_mbhc_data(dev, cfg); - return 0; } @@ -5888,6 +5850,8 @@ static int wcd934x_codec_probe(struct platform_device *pdev) wcd->sdev = to_slim_device(data->dev); mutex_init(&wcd->sysclk_mutex); mutex_init(&wcd->micb_lock); + wcd->common.dev = dev->parent; + wcd->common.max_bias = 4; ret = wcd934x_codec_parse_data(wcd); if (ret) diff --git a/sound/soc/codecs/wcd937x.c b/sound/soc/codecs/wcd937x.c index b78f37c582cade..ffe6508fd84c45 100644 --- a/sound/soc/codecs/wcd937x.c +++ b/sound/soc/codecs/wcd937x.c @@ -21,6 +21,7 @@ #include #include "wcd-clsh-v2.h" +#include "wcd-common.h" #include "wcd-mbhc-v2.h" #include "wcd937x.h" @@ -85,6 +86,7 @@ struct wcd937x_priv { struct wcd_mbhc_config mbhc_cfg; struct wcd_mbhc_intr intr_ids; struct wcd_clsh_ctrl *clsh_info; + struct wcd_common common; struct irq_domain *virq; struct regmap_irq_chip_data *irq_chip; struct snd_soc_jack *jack; @@ -93,9 +95,6 @@ struct wcd937x_priv { s32 pullup_ref[WCD937X_MAX_MICBIAS]; u32 hph_mode; int ear_rx_path; - u32 micb1_mv; - u32 micb2_mv; - u32 micb3_mv; int hphr_pdm_wd_int; int hphl_pdm_wd_int; int 
aux_pdm_wd_int; @@ -872,15 +871,6 @@ static int wcd937x_enable_rx3(struct snd_soc_dapm_widget *w, return 0; } -static int wcd937x_get_micb_vout_ctl_val(u32 micb_mv) -{ - if (micb_mv < 1000 || micb_mv > 2850) { - pr_err("Unsupported micbias voltage (%u mV)\n", micb_mv); - return -EINVAL; - } - - return (micb_mv - 1000) / 50; -} static int wcd937x_tx_swr_ctrl(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event) @@ -1481,7 +1471,7 @@ static int wcd937x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, cur_vout_ctl = snd_soc_component_read_field(component, micb_reg, WCD937X_MICB_VOUT_MASK); - req_vout_ctl = wcd937x_get_micb_vout_ctl_val(req_volt); + req_vout_ctl = wcd_get_micb_vout_ctl_val(component->dev, req_volt); if (req_vout_ctl < 0) { ret = -EINVAL; goto exit; @@ -1529,10 +1519,10 @@ static int wcd937x_mbhc_micb_ctrl_threshold_mic(struct snd_soc_component *compon * voltage needed to detect threshold microphone, then do * not change the micbias, just return. */ - if (wcd937x->micb2_mv >= WCD_MBHC_THR_HS_MICB_MV) + if (wcd937x->common.micb_mv[2] >= WCD_MBHC_THR_HS_MICB_MV) return 0; - micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd937x->micb2_mv; + micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd937x->common.micb_mv[2]; return wcd937x_mbhc_micb_adjust_voltage(component, micb_mv, MIC_BIAS_2); } @@ -2436,22 +2426,14 @@ static const struct snd_soc_dapm_route wcd9375_audio_map[] = { { "DMIC6_MIXER", "Switch", "DMIC6" }, }; -static int wcd937x_set_micbias_data(struct wcd937x_priv *wcd937x) +static void wcd937x_set_micbias_data(struct device *dev, struct wcd937x_priv *wcd937x) { - int vout_ctl[3]; - - /* Set micbias voltage */ - vout_ctl[0] = wcd937x_get_micb_vout_ctl_val(wcd937x->micb1_mv); - vout_ctl[1] = wcd937x_get_micb_vout_ctl_val(wcd937x->micb2_mv); - vout_ctl[2] = wcd937x_get_micb_vout_ctl_val(wcd937x->micb3_mv); - if ((vout_ctl[0] | vout_ctl[1] | vout_ctl[2]) < 0) - return -EINVAL; - - regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB1, WCD937X_ANA_MICB_VOUT, vout_ctl[0]); - regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB2, WCD937X_ANA_MICB_VOUT, vout_ctl[1]); - regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB3, WCD937X_ANA_MICB_VOUT, vout_ctl[2]); - - return 0; + regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB1, WCD937X_ANA_MICB_VOUT, + wcd937x->common.micb_vout[0]); + regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB2, WCD937X_ANA_MICB_VOUT, + wcd937x->common.micb_vout[1]); + regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB3, WCD937X_ANA_MICB_VOUT, + wcd937x->common.micb_vout[2]); } static irqreturn_t wcd937x_wd_handle_irq(int irq, void *data) @@ -2630,31 +2612,6 @@ static const struct snd_soc_component_driver soc_codec_dev_wcd937x = { .endianness = 1, }; -static void wcd937x_dt_parse_micbias_info(struct device *dev, struct wcd937x_priv *wcd) -{ - struct device_node *np = dev->of_node; - u32 prop_val = 0; - int ret = 0; - - ret = of_property_read_u32(np, "qcom,micbias1-microvolt", &prop_val); - if (!ret) - wcd->micb1_mv = prop_val / 1000; - else - dev_warn(dev, "Micbias1 DT property not found\n"); - - ret = of_property_read_u32(np, "qcom,micbias2-microvolt", &prop_val); - if (!ret) - wcd->micb2_mv = prop_val / 1000; - else - dev_warn(dev, "Micbias2 DT property not found\n"); - - ret = of_property_read_u32(np, "qcom,micbias3-microvolt", &prop_val); - if (!ret) - wcd->micb3_mv = prop_val / 1000; - else - dev_warn(dev, "Micbias3 DT property not found\n"); -} - static bool wcd937x_swap_gnd_mic(struct snd_soc_component *component) { 
int value; @@ -2848,11 +2805,7 @@ static int wcd937x_bind(struct device *dev) wcd937x->sdw_priv[AIF1_PB]->slave_irq = wcd937x->virq; wcd937x->sdw_priv[AIF1_CAP]->slave_irq = wcd937x->virq; - ret = wcd937x_set_micbias_data(wcd937x); - if (ret < 0) { - dev_err(dev, "Bad micbias pdata\n"); - return ret; - } + wcd937x_set_micbias_data(dev, wcd937x); ret = snd_soc_register_component(dev, &soc_codec_dev_wcd937x, wcd937x_dais, ARRAY_SIZE(wcd937x_dais)); @@ -2920,6 +2873,8 @@ static int wcd937x_probe(struct platform_device *pdev) dev_set_drvdata(dev, wcd937x); mutex_init(&wcd937x->micb_lock); + wcd937x->common.dev = dev; + wcd937x->common.max_bias = 3; wcd937x->reset_gpio = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW); if (IS_ERR(wcd937x->reset_gpio)) @@ -2939,13 +2894,15 @@ static int wcd937x_probe(struct platform_device *pdev) if (ret) return dev_err_probe(dev, ret, "Failed to get and enable supplies\n"); - wcd937x_dt_parse_micbias_info(dev, wcd937x); + ret = wcd_dt_parse_micbias_info(&wcd937x->common); + if (ret) + return dev_err_probe(dev, ret, "Failed to get micbias\n"); cfg->mbhc_micbias = MIC_BIAS_2; cfg->anc_micbias = MIC_BIAS_2; cfg->v_hs_max = WCD_MBHC_HS_V_MAX; cfg->num_btn = WCD937X_MBHC_MAX_BUTTONS; - cfg->micb_mv = wcd937x->micb2_mv; + cfg->micb_mv = wcd937x->common.micb_mv[2]; cfg->linein_th = 5000; cfg->hs_thr = 1700; cfg->hph_thr = 50; diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index f8d7bf27a6edbc..c8b6e543986d33 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -22,6 +22,7 @@ #include #include "wcd-clsh-v2.h" +#include "wcd-common.h" #include "wcd-mbhc-v2.h" #include "wcd938x.h" @@ -155,6 +156,7 @@ struct wcd938x_priv { struct wcd_mbhc_config mbhc_cfg; struct wcd_mbhc_intr intr_ids; struct wcd_clsh_ctrl *clsh_info; + struct wcd_common common; struct irq_domain *virq; struct regmap_irq_chip_data *irq_chip; struct snd_soc_jack *jack; @@ -169,10 +171,6 @@ struct wcd938x_priv { struct gpio_desc *us_euro_gpio; struct mux_control *us_euro_mux; unsigned int mux_state; - u32 micb1_mv; - u32 micb2_mv; - u32 micb3_mv; - u32 micb4_mv; int hphr_pdm_wd_int; int hphl_pdm_wd_int; int aux_pdm_wd_int; @@ -1974,15 +1972,6 @@ static void wcd938x_mbhc_micb_ramp_control(struct snd_soc_component *component, } } -static int wcd938x_get_micb_vout_ctl_val(u32 micb_mv) -{ - /* min micbias voltage is 1V and maximum is 2.85V */ - if (micb_mv < 1000 || micb_mv > 2850) - return -EINVAL; - - return (micb_mv - 1000) / 50; -} - static int wcd938x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, int req_volt, int micb_num) { @@ -2019,7 +2008,7 @@ static int wcd938x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, cur_vout_ctl = snd_soc_component_read_field(component, micb_reg, WCD938X_MICB_VOUT_MASK); - req_vout_ctl = wcd938x_get_micb_vout_ctl_val(req_volt); + req_vout_ctl = wcd_get_micb_vout_ctl_val(component->dev, req_volt); if (req_vout_ctl < 0) { ret = -EINVAL; goto exit; @@ -2067,10 +2056,10 @@ static int wcd938x_mbhc_micb_ctrl_threshold_mic(struct snd_soc_component *compon * voltage needed to detect threshold microphone, then do * not change the micbias, just return. */ - if (wcd938x->micb2_mv >= WCD_MBHC_THR_HS_MICB_MV) + if (wcd938x->common.micb_mv[2] >= WCD_MBHC_THR_HS_MICB_MV) return 0; - micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd938x->micb2_mv; + micb_mv = req_en ? 
WCD_MBHC_THR_HS_MICB_MV : wcd938x->common.micb_mv[2]; return wcd938x_mbhc_micb_adjust_voltage(component, micb_mv, MIC_BIAS_2); } @@ -2975,28 +2964,16 @@ static const struct snd_soc_dapm_route wcd938x_audio_map[] = { {"EAR", NULL, "EAR PGA"}, }; -static int wcd938x_set_micbias_data(struct wcd938x_priv *wcd938x) +static void wcd938x_set_micbias_data(struct device *dev, struct wcd938x_priv *wcd938x) { - int vout_ctl_1, vout_ctl_2, vout_ctl_3, vout_ctl_4; - - /* set micbias voltage */ - vout_ctl_1 = wcd938x_get_micb_vout_ctl_val(wcd938x->micb1_mv); - vout_ctl_2 = wcd938x_get_micb_vout_ctl_val(wcd938x->micb2_mv); - vout_ctl_3 = wcd938x_get_micb_vout_ctl_val(wcd938x->micb3_mv); - vout_ctl_4 = wcd938x_get_micb_vout_ctl_val(wcd938x->micb4_mv); - if (vout_ctl_1 < 0 || vout_ctl_2 < 0 || vout_ctl_3 < 0 || vout_ctl_4 < 0) - return -EINVAL; - regmap_update_bits(wcd938x->regmap, WCD938X_ANA_MICB1, - WCD938X_MICB_VOUT_MASK, vout_ctl_1); + WCD938X_MICB_VOUT_MASK, wcd938x->common.micb_vout[0]); regmap_update_bits(wcd938x->regmap, WCD938X_ANA_MICB2, - WCD938X_MICB_VOUT_MASK, vout_ctl_2); + WCD938X_MICB_VOUT_MASK, wcd938x->common.micb_vout[1]); regmap_update_bits(wcd938x->regmap, WCD938X_ANA_MICB3, - WCD938X_MICB_VOUT_MASK, vout_ctl_3); + WCD938X_MICB_VOUT_MASK, wcd938x->common.micb_vout[2]); regmap_update_bits(wcd938x->regmap, WCD938X_ANA_MICB4, - WCD938X_MICB_VOUT_MASK, vout_ctl_4); - - return 0; + WCD938X_MICB_VOUT_MASK, wcd938x->common.micb_vout[3]); } static irqreturn_t wcd938x_wd_handle_irq(int irq, void *data) @@ -3200,37 +3177,6 @@ static const struct snd_soc_component_driver soc_codec_dev_wcd938x = { .endianness = 1, }; -static void wcd938x_dt_parse_micbias_info(struct device *dev, struct wcd938x_priv *wcd) -{ - struct device_node *np = dev->of_node; - u32 prop_val = 0; - int rc = 0; - - rc = of_property_read_u32(np, "qcom,micbias1-microvolt", &prop_val); - if (!rc) - wcd->micb1_mv = prop_val/1000; - else - dev_info(dev, "%s: Micbias1 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias2-microvolt", &prop_val); - if (!rc) - wcd->micb2_mv = prop_val/1000; - else - dev_info(dev, "%s: Micbias2 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias3-microvolt", &prop_val); - if (!rc) - wcd->micb3_mv = prop_val/1000; - else - dev_info(dev, "%s: Micbias3 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias4-microvolt", &prop_val); - if (!rc) - wcd->micb4_mv = prop_val/1000; - else - dev_info(dev, "%s: Micbias4 DT property not found\n", __func__); -} - static bool wcd938x_swap_gnd_mic(struct snd_soc_component *component) { struct wcd938x_priv *wcd938x = snd_soc_component_get_drvdata(component); @@ -3295,13 +3241,15 @@ static int wcd938x_populate_dt_data(struct wcd938x_priv *wcd938x, struct device if (ret) return dev_err_probe(dev, ret, "Failed to get and enable supplies\n"); - wcd938x_dt_parse_micbias_info(dev, wcd938x); + ret = wcd_dt_parse_micbias_info(&wcd938x->common); + if (ret) + return dev_err_probe(dev, ret, "Failed to get and enable supplies\n"); cfg->mbhc_micbias = MIC_BIAS_2; cfg->anc_micbias = MIC_BIAS_2; cfg->v_hs_max = WCD_MBHC_HS_V_MAX; cfg->num_btn = WCD938X_MBHC_MAX_BUTTONS; - cfg->micb_mv = wcd938x->micb2_mv; + cfg->micb_mv = wcd938x->common.micb_mv[2]; cfg->linein_th = 5000; cfg->hs_thr = 1700; cfg->hph_thr = 50; @@ -3457,11 +3405,7 @@ static int wcd938x_bind(struct device *dev) wcd938x->sdw_priv[AIF1_PB]->slave_irq = wcd938x->virq; wcd938x->sdw_priv[AIF1_CAP]->slave_irq = wcd938x->virq; - ret = 
wcd938x_set_micbias_data(wcd938x); - if (ret < 0) { - dev_err(dev, "%s: bad micbias pdata\n", __func__); - goto err_remove_rx_link; - } + wcd938x_set_micbias_data(dev, wcd938x); ret = snd_soc_register_component(dev, &soc_codec_dev_wcd938x, wcd938x_dais, ARRAY_SIZE(wcd938x_dais)); @@ -3550,6 +3494,8 @@ static int wcd938x_probe(struct platform_device *pdev) dev_set_drvdata(dev, wcd938x); mutex_init(&wcd938x->micb_lock); + wcd938x->common.dev = dev; + wcd938x->common.max_bias = 4; ret = wcd938x_populate_dt_data(wcd938x, dev); if (ret) diff --git a/sound/soc/codecs/wcd939x.c b/sound/soc/codecs/wcd939x.c index 85730ae40c2c24..59c920c50c00f8 100644 --- a/sound/soc/codecs/wcd939x.c +++ b/sound/soc/codecs/wcd939x.c @@ -28,6 +28,7 @@ #include #include "wcd-clsh-v2.h" +#include "wcd-common.h" #include "wcd-mbhc-v2.h" #include "wcd939x.h" @@ -191,6 +192,7 @@ struct wcd939x_priv { struct wcd_mbhc_config mbhc_cfg; struct wcd_mbhc_intr intr_ids; struct wcd_clsh_ctrl *clsh_info; + struct wcd_common common; struct irq_domain *virq; struct regmap_irq_chip_data *irq_chip; struct snd_soc_jack *jack; @@ -201,10 +203,6 @@ struct wcd939x_priv { u32 tx_mode[TX_ADC_MAX]; int variant; struct gpio_desc *reset_gpio; - u32 micb1_mv; - u32 micb2_mv; - u32 micb3_mv; - u32 micb4_mv; int hphr_pdm_wd_int; int hphl_pdm_wd_int; int ear_pdm_wd_int; @@ -1919,17 +1917,6 @@ static void wcd939x_mbhc_micb_ramp_control(struct snd_soc_component *component, } } -static int wcd939x_get_micb_vout_ctl_val(u32 micb_mv) -{ - /* min micbias voltage is 1V and maximum is 2.85V */ - if (micb_mv < 1000 || micb_mv > 2850) { - pr_err("%s: unsupported micbias voltage\n", __func__); - return -EINVAL; - } - - return (micb_mv - 1000) / 50; -} - static int wcd939x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, int req_volt, int micb_num) { @@ -1969,7 +1956,7 @@ static int wcd939x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, cur_vout_ctl = snd_soc_component_read_field(component, micb_reg, WCD939X_MICB_VOUT_CTL); - req_vout_ctl = wcd939x_get_micb_vout_ctl_val(req_volt); + req_vout_ctl = wcd_get_micb_vout_ctl_val(component->dev, req_volt); if (req_vout_ctl < 0) { ret = req_vout_ctl; goto exit; @@ -2021,10 +2008,10 @@ static int wcd939x_mbhc_micb_ctrl_threshold_mic(struct snd_soc_component *compon * voltage needed to detect threshold microphone, then do * not change the micbias, just return. */ - if (wcd939x->micb2_mv >= WCD_MBHC_THR_HS_MICB_MV) + if (wcd939x->common.micb_mv[1] >= WCD_MBHC_THR_HS_MICB_MV) return 0; - micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd939x->micb2_mv; + micb_mv = req_en ? 
WCD_MBHC_THR_HS_MICB_MV : wcd939x->common.micb_mv[1]; return wcd939x_mbhc_micb_adjust_voltage(component, micb_mv, MIC_BIAS_2); } @@ -2895,28 +2882,16 @@ static const struct snd_soc_dapm_route wcd939x_audio_map[] = { {"EAR", NULL, "EAR PGA"}, }; -static int wcd939x_set_micbias_data(struct wcd939x_priv *wcd939x) +static void wcd939x_set_micbias_data(struct device *dev, struct wcd939x_priv *wcd939x) { - int vout_ctl_1, vout_ctl_2, vout_ctl_3, vout_ctl_4; - - /* set micbias voltage */ - vout_ctl_1 = wcd939x_get_micb_vout_ctl_val(wcd939x->micb1_mv); - vout_ctl_2 = wcd939x_get_micb_vout_ctl_val(wcd939x->micb2_mv); - vout_ctl_3 = wcd939x_get_micb_vout_ctl_val(wcd939x->micb3_mv); - vout_ctl_4 = wcd939x_get_micb_vout_ctl_val(wcd939x->micb4_mv); - if (vout_ctl_1 < 0 || vout_ctl_2 < 0 || vout_ctl_3 < 0 || vout_ctl_4 < 0) - return -EINVAL; - regmap_update_bits(wcd939x->regmap, WCD939X_ANA_MICB1, - WCD939X_MICB_VOUT_CTL, vout_ctl_1); + WCD939X_MICB_VOUT_CTL, wcd939x->common.micb_vout[0]); regmap_update_bits(wcd939x->regmap, WCD939X_ANA_MICB2, - WCD939X_MICB_VOUT_CTL, vout_ctl_2); + WCD939X_MICB_VOUT_CTL, wcd939x->common.micb_vout[1]); regmap_update_bits(wcd939x->regmap, WCD939X_ANA_MICB3, - WCD939X_MICB_VOUT_CTL, vout_ctl_3); + WCD939X_MICB_VOUT_CTL, wcd939x->common.micb_vout[2]); regmap_update_bits(wcd939x->regmap, WCD939X_ANA_MICB4, - WCD939X_MICB_VOUT_CTL, vout_ctl_4); - - return 0; + WCD939X_MICB_VOUT_CTL, wcd939x->common.micb_vout[3]); } static irqreturn_t wcd939x_wd_handle_irq(int irq, void *data) @@ -3186,37 +3161,6 @@ static int wcd939x_typec_mux_set(struct typec_mux_dev *mux, } #endif /* CONFIG_TYPEC */ -static void wcd939x_dt_parse_micbias_info(struct device *dev, struct wcd939x_priv *wcd) -{ - struct device_node *np = dev->of_node; - u32 prop_val = 0; - int rc = 0; - - rc = of_property_read_u32(np, "qcom,micbias1-microvolt", &prop_val); - if (!rc) - wcd->micb1_mv = prop_val / 1000; - else - dev_info(dev, "%s: Micbias1 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias2-microvolt", &prop_val); - if (!rc) - wcd->micb2_mv = prop_val / 1000; - else - dev_info(dev, "%s: Micbias2 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias3-microvolt", &prop_val); - if (!rc) - wcd->micb3_mv = prop_val / 1000; - else - dev_info(dev, "%s: Micbias3 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias4-microvolt", &prop_val); - if (!rc) - wcd->micb4_mv = prop_val / 1000; - else - dev_info(dev, "%s: Micbias4 DT property not found\n", __func__); -} - #if IS_ENABLED(CONFIG_TYPEC) static bool wcd939x_swap_gnd_mic(struct snd_soc_component *component) { @@ -3252,13 +3196,15 @@ static int wcd939x_populate_dt_data(struct wcd939x_priv *wcd939x, struct device if (ret) return dev_err_probe(dev, ret, "Failed to get and enable supplies\n"); - wcd939x_dt_parse_micbias_info(dev, wcd939x); + ret = wcd_dt_parse_micbias_info(&wcd939x->common); + if (ret) + return dev_err_probe(dev, ret, "Failed to get micbias\n"); cfg->mbhc_micbias = MIC_BIAS_2; cfg->anc_micbias = MIC_BIAS_2; cfg->v_hs_max = WCD_MBHC_HS_V_MAX; cfg->num_btn = WCD939X_MBHC_MAX_BUTTONS; - cfg->micb_mv = wcd939x->micb2_mv; + cfg->micb_mv = wcd939x->common.micb_mv[1]; cfg->linein_th = 5000; cfg->hs_thr = 1700; cfg->hph_thr = 50; @@ -3444,11 +3390,7 @@ static int wcd939x_bind(struct device *dev) wcd939x->sdw_priv[AIF1_PB]->slave_irq = wcd939x->virq; wcd939x->sdw_priv[AIF1_CAP]->slave_irq = wcd939x->virq; - ret = wcd939x_set_micbias_data(wcd939x); - if (ret < 0) { - 
dev_err(dev, "%s: bad micbias pdata\n", __func__); - goto err_remove_rx_link; - } + wcd939x_set_micbias_data(dev, wcd939x); /* Check WCD9395 version */ regmap_read(wcd939x->regmap, WCD939X_DIGITAL_CHIP_ID1, &id1); @@ -3613,6 +3555,8 @@ static int wcd939x_probe(struct platform_device *pdev) dev_set_drvdata(dev, wcd939x); mutex_init(&wcd939x->micb_lock); + wcd939x->common.dev = dev; + wcd939x->common.max_bias = 4; ret = wcd939x_populate_dt_data(wcd939x, dev); if (ret) { From 4652f02cf6150ae496eec582e76b7cc7bb3089a1 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:49 +0100 Subject: [PATCH 2179/3206] ASoC: codecs: wcd-common: move WCD_SDW_CH to common sdw_ch_info and WCD_SDW_CH macro is duplicated across wcd937x, wcd938x, wcd939x soundwire codec drivers. Move this to wcd common driver to remove this code duplication. Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-9-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd-common.h | 13 +++++++++++++ sound/soc/codecs/wcd937x-sdw.c | 4 ++-- sound/soc/codecs/wcd937x.c | 2 +- sound/soc/codecs/wcd937x.h | 16 ++-------------- sound/soc/codecs/wcd938x-sdw.c | 5 +++-- sound/soc/codecs/wcd938x.c | 2 +- sound/soc/codecs/wcd938x.h | 13 +------------ sound/soc/codecs/wcd939x-sdw.c | 5 +++-- sound/soc/codecs/wcd939x.c | 2 +- sound/soc/codecs/wcd939x.h | 13 +------------ 10 files changed, 28 insertions(+), 47 deletions(-) diff --git a/sound/soc/codecs/wcd-common.h b/sound/soc/codecs/wcd-common.h index 611f06cdec550d..d94e8879a1d77f 100644 --- a/sound/soc/codecs/wcd-common.h +++ b/sound/soc/codecs/wcd-common.h @@ -14,6 +14,19 @@ enum sdw_slave_status; #define WCD_MAX_MICBIAS 4 +struct wcd_sdw_ch_info { + int port_num; + unsigned int ch_mask; + unsigned int master_ch_mask; +}; + +#define WCD_SDW_CH(id, pn, cmask) \ + [id] = { \ + .port_num = pn, \ + .ch_mask = cmask, \ + .master_ch_mask = cmask, \ + } + struct wcd_common { struct device *dev; int max_bias; diff --git a/sound/soc/codecs/wcd937x-sdw.c b/sound/soc/codecs/wcd937x-sdw.c index e7cc699bd8bcf1..8f28191635277e 100644 --- a/sound/soc/codecs/wcd937x-sdw.c +++ b/sound/soc/codecs/wcd937x-sdw.c @@ -19,7 +19,7 @@ #include #include "wcd937x.h" -static struct wcd937x_sdw_ch_info wcd937x_sdw_rx_ch_info[] = { +static struct wcd_sdw_ch_info wcd937x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD937X_HPH_L, WCD937X_HPH_PORT, BIT(0)), WCD_SDW_CH(WCD937X_HPH_R, WCD937X_HPH_PORT, BIT(1)), WCD_SDW_CH(WCD937X_CLSH, WCD937X_CLSH_PORT, BIT(0)), @@ -30,7 +30,7 @@ static struct wcd937x_sdw_ch_info wcd937x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD937X_DSD_R, WCD937X_DSD_PORT, BIT(1)), }; -static struct wcd937x_sdw_ch_info wcd937x_sdw_tx_ch_info[] = { +static struct wcd_sdw_ch_info wcd937x_sdw_tx_ch_info[] = { WCD_SDW_CH(WCD937X_ADC1, WCD937X_ADC_1_PORT, BIT(0)), WCD_SDW_CH(WCD937X_ADC2, WCD937X_ADC_2_3_PORT, BIT(0)), WCD_SDW_CH(WCD937X_ADC3, WCD937X_ADC_2_3_PORT, BIT(0)), diff --git a/sound/soc/codecs/wcd937x.c b/sound/soc/codecs/wcd937x.c index ffe6508fd84c45..4c040e3862f481 100644 --- a/sound/soc/codecs/wcd937x.c +++ b/sound/soc/codecs/wcd937x.c @@ -1183,7 +1183,7 @@ static int wcd937x_codec_enable_micbias_pullup(struct snd_soc_dapm_widget *w, static int wcd937x_connect_port(struct wcd937x_sdw_priv *wcd, u8 port_idx, u8 ch_id, bool enable) { struct sdw_port_config *port_config = &wcd->port_config[port_idx - 1]; - const struct wcd937x_sdw_ch_info *ch_info = &wcd->ch_info[ch_id]; + const struct wcd_sdw_ch_info 
*ch_info = &wcd->ch_info[ch_id]; u8 port_num = ch_info->port_num; u8 ch_mask = ch_info->ch_mask; u8 mstr_port_num, mstr_ch_mask; diff --git a/sound/soc/codecs/wcd937x.h b/sound/soc/codecs/wcd937x.h index 0f96b7108a7ebe..3d0ba3cc0ee614 100644 --- a/sound/soc/codecs/wcd937x.h +++ b/sound/soc/codecs/wcd937x.h @@ -7,6 +7,7 @@ #include #include +#include "wcd-common.h" #define WCD937X_BASE_ADDRESS 0x3000 #define WCD937X_ANA_BIAS 0x3001 @@ -507,26 +508,13 @@ enum wcd937x_rx_sdw_ports { WCD937X_MAX_SWR_PORTS = WCD937X_DSD_PORT, }; -struct wcd937x_sdw_ch_info { - int port_num; - unsigned int ch_mask; - unsigned int master_ch_mask; -}; - -#define WCD_SDW_CH(id, pn, cmask) \ - [id] = { \ - .port_num = pn, \ - .ch_mask = cmask, \ - .master_ch_mask = cmask, \ - } - struct wcd937x_priv; struct wcd937x_sdw_priv { struct sdw_slave *sdev; struct sdw_stream_config sconfig; struct sdw_stream_runtime *sruntime; struct sdw_port_config port_config[WCD937X_MAX_SWR_PORTS]; - struct wcd937x_sdw_ch_info *ch_info; + struct wcd_sdw_ch_info *ch_info; bool port_enable[WCD937X_MAX_SWR_CH_IDS]; unsigned int master_channel_map[SDW_MAX_PORTS]; int active_ports; diff --git a/sound/soc/codecs/wcd938x-sdw.c b/sound/soc/codecs/wcd938x-sdw.c index 8bcd8396f3750a..1dc13b6fabfa6e 100644 --- a/sound/soc/codecs/wcd938x-sdw.c +++ b/sound/soc/codecs/wcd938x-sdw.c @@ -18,10 +18,11 @@ #include #include #include "wcd938x.h" +#include "wcd-common.h" #define SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(m) (0xE0 + 0x10 * (m)) -static const struct wcd938x_sdw_ch_info wcd938x_sdw_rx_ch_info[] = { +static const struct wcd_sdw_ch_info wcd938x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD938X_HPH_L, WCD938X_HPH_PORT, BIT(0)), WCD_SDW_CH(WCD938X_HPH_R, WCD938X_HPH_PORT, BIT(1)), WCD_SDW_CH(WCD938X_CLSH, WCD938X_CLSH_PORT, BIT(0)), @@ -32,7 +33,7 @@ static const struct wcd938x_sdw_ch_info wcd938x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD938X_DSD_R, WCD938X_DSD_PORT, BIT(1)), }; -static const struct wcd938x_sdw_ch_info wcd938x_sdw_tx_ch_info[] = { +static const struct wcd_sdw_ch_info wcd938x_sdw_tx_ch_info[] = { WCD_SDW_CH(WCD938X_ADC1, WCD938X_ADC_1_2_PORT, BIT(0)), WCD_SDW_CH(WCD938X_ADC2, WCD938X_ADC_1_2_PORT, BIT(1)), WCD_SDW_CH(WCD938X_ADC3, WCD938X_ADC_3_4_PORT, BIT(0)), diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index c8b6e543986d33..e495f98972f1a2 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -394,7 +394,7 @@ static int wcd938x_io_init(struct wcd938x_priv *wcd938x) } -static int wcd938x_sdw_connect_port(const struct wcd938x_sdw_ch_info *ch_info, +static int wcd938x_sdw_connect_port(const struct wcd_sdw_ch_info *ch_info, struct sdw_port_config *port_config, u8 enable) { diff --git a/sound/soc/codecs/wcd938x.h b/sound/soc/codecs/wcd938x.h index 54ee56b7fbd6ab..c18610466d7d83 100644 --- a/sound/soc/codecs/wcd938x.h +++ b/sound/soc/codecs/wcd938x.h @@ -587,17 +587,6 @@ #define WCD938X_MAX_SWR_CH_IDS 15 -struct wcd938x_sdw_ch_info { - int port_num; - unsigned int ch_mask; -}; - -#define WCD_SDW_CH(id, pn, cmask) \ - [id] = { \ - .port_num = pn, \ - .ch_mask = cmask, \ - } - enum wcd938x_tx_sdw_ports { WCD938X_ADC_1_2_PORT = 1, WCD938X_ADC_3_4_PORT, @@ -649,7 +638,7 @@ struct wcd938x_sdw_priv { struct sdw_stream_config sconfig; struct sdw_stream_runtime *sruntime; struct sdw_port_config port_config[WCD938X_MAX_SWR_PORTS]; - const struct wcd938x_sdw_ch_info *ch_info; + const struct wcd_sdw_ch_info *ch_info; bool port_enable[WCD938X_MAX_SWR_CH_IDS]; int active_ports; bool is_tx; diff --git 
a/sound/soc/codecs/wcd939x-sdw.c b/sound/soc/codecs/wcd939x-sdw.c index 477d6cf27d32c3..97a829e3ce4f51 100644 --- a/sound/soc/codecs/wcd939x-sdw.c +++ b/sound/soc/codecs/wcd939x-sdw.c @@ -20,10 +20,11 @@ #include #include #include "wcd939x.h" +#include "wcd-common.h" #define SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(m) (0xE0 + 0x10 * (m)) -static const struct wcd939x_sdw_ch_info wcd939x_sdw_rx_ch_info[] = { +static const struct wcd_sdw_ch_info wcd939x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD939X_HPH_L, WCD939X_HPH_PORT, BIT(0)), WCD_SDW_CH(WCD939X_HPH_R, WCD939X_HPH_PORT, BIT(1)), WCD_SDW_CH(WCD939X_CLSH, WCD939X_CLSH_PORT, BIT(0)), @@ -36,7 +37,7 @@ static const struct wcd939x_sdw_ch_info wcd939x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD939X_HIFI_PCM_R, WCD939X_HIFI_PCM_PORT, BIT(1)), }; -static const struct wcd939x_sdw_ch_info wcd939x_sdw_tx_ch_info[] = { +static const struct wcd_sdw_ch_info wcd939x_sdw_tx_ch_info[] = { WCD_SDW_CH(WCD939X_ADC1, WCD939X_ADC_1_4_PORT, BIT(0)), WCD_SDW_CH(WCD939X_ADC2, WCD939X_ADC_1_4_PORT, BIT(1)), WCD_SDW_CH(WCD939X_ADC3, WCD939X_ADC_1_4_PORT, BIT(2)), diff --git a/sound/soc/codecs/wcd939x.c b/sound/soc/codecs/wcd939x.c index 59c920c50c00f8..a414cd99b9466a 100644 --- a/sound/soc/codecs/wcd939x.c +++ b/sound/soc/codecs/wcd939x.c @@ -413,7 +413,7 @@ static int wcd939x_io_init(struct snd_soc_component *component) return 0; } -static int wcd939x_sdw_connect_port(const struct wcd939x_sdw_ch_info *ch_info, +static int wcd939x_sdw_connect_port(const struct wcd_sdw_ch_info *ch_info, struct sdw_port_config *port_config, u8 enable) { diff --git a/sound/soc/codecs/wcd939x.h b/sound/soc/codecs/wcd939x.h index e70445b1a4bc8a..ca6353222ea01e 100644 --- a/sound/soc/codecs/wcd939x.h +++ b/sound/soc/codecs/wcd939x.h @@ -844,17 +844,6 @@ #define WCD939X_MAX_SWR_CH_IDS (15) -struct wcd939x_sdw_ch_info { - int port_num; - unsigned int ch_mask; -}; - -#define WCD_SDW_CH(id, pn, cmask) \ - [id] = { \ - .port_num = pn, \ - .ch_mask = cmask, \ - } - enum wcd939x_tx_sdw_ports { WCD939X_ADC_1_4_PORT = 1, WCD939X_ADC_DMIC_1_2_PORT, @@ -909,7 +898,7 @@ struct wcd939x_sdw_priv { struct sdw_stream_config sconfig; struct sdw_stream_runtime *sruntime; struct sdw_port_config port_config[WCD939X_MAX_SWR_PORTS]; - const struct wcd939x_sdw_ch_info *ch_info; + const struct wcd_sdw_ch_info *ch_info; bool port_enable[WCD939X_MAX_SWR_CH_IDS]; int active_ports; bool is_tx; From ebaf88c0546ddfd5efe5d7867a2e8e9f0e5969ed Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:50 +0100 Subject: [PATCH 2180/3206] ASoC: codecs: wcd-common: move component ops to common The component_ops for the wcd937x, wcd938x and wcd939x soundwire codecs are exactly identical; move them to the common driver to remove this duplicated code.
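As a reading aid, a sketch of the shape each codec's SoundWire driver takes after this conversion; the wcd93xx names are placeholders for the wcd9370/wcd9380/wcd9390 equivalents in the diff below, where the runtime-PM setup (3-second autosuspend) moves into the shared bind callback:

#include <linux/component.h>
#include <linux/soundwire/sdw.h>
#include <linux/soundwire/sdw_type.h>

/* Sketch only: probe/remove reduce to registering the shared component ops. */
static int wcd93xx_sdw_probe(struct sdw_slave *pdev,
			     const struct sdw_device_id *id)
{
	/* ...regmap, port properties and interrupt setup elided... */
	return component_add(&pdev->dev, &wcd_sdw_component_ops);
}

static int wcd93xx_sdw_remove(struct sdw_slave *pdev)
{
	component_del(&pdev->dev, &wcd_sdw_component_ops);
	return 0;
}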
Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-10-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd-common.c | 27 +++++++++++++++++++++++++++ sound/soc/codecs/wcd-common.h | 1 + sound/soc/codecs/wcd937x-sdw.c | 29 ++--------------------------- sound/soc/codecs/wcd938x-sdw.c | 20 ++------------------ sound/soc/codecs/wcd939x-sdw.c | 29 ++--------------------------- 5 files changed, 34 insertions(+), 72 deletions(-) diff --git a/sound/soc/codecs/wcd-common.c b/sound/soc/codecs/wcd-common.c index 8f3c0806cdc92c..7a48cc5064ccf4 100644 --- a/sound/soc/codecs/wcd-common.c +++ b/sound/soc/codecs/wcd-common.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include "wcd-common.h" @@ -66,5 +68,30 @@ int wcd_dt_parse_micbias_info(struct wcd_common *common) return 0; } EXPORT_SYMBOL_GPL(wcd_dt_parse_micbias_info); + +static int wcd_sdw_component_bind(struct device *dev, struct device *master, void *data) +{ + pm_runtime_set_autosuspend_delay(dev, 3000); + pm_runtime_use_autosuspend(dev); + pm_runtime_mark_last_busy(dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + + return 0; +} + +static void wcd_sdw_component_unbind(struct device *dev, struct device *master, void *data) +{ + pm_runtime_disable(dev); + pm_runtime_set_suspended(dev); + pm_runtime_dont_use_autosuspend(dev); +} + +const struct component_ops wcd_sdw_component_ops = { + .bind = wcd_sdw_component_bind, + .unbind = wcd_sdw_component_unbind, +}; +EXPORT_SYMBOL_GPL(wcd_sdw_component_ops); + MODULE_DESCRIPTION("Common Qualcomm WCD Codec helpers driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/wcd-common.h b/sound/soc/codecs/wcd-common.h index d94e8879a1d77f..0d4e9f8e39f215 100644 --- a/sound/soc/codecs/wcd-common.h +++ b/sound/soc/codecs/wcd-common.h @@ -34,6 +34,7 @@ struct wcd_common { u32 micb_vout[WCD_MAX_MICBIAS]; }; +extern const struct component_ops wcd_sdw_component_ops; int wcd_get_micb_vout_ctl_val(struct device *dev, u32 micb_mv); int wcd_dt_parse_micbias_info(struct wcd_common *common); diff --git a/sound/soc/codecs/wcd937x-sdw.c b/sound/soc/codecs/wcd937x-sdw.c index 8f28191635277e..59c353cafd31c6 100644 --- a/sound/soc/codecs/wcd937x-sdw.c +++ b/sound/soc/codecs/wcd937x-sdw.c @@ -983,31 +983,6 @@ static const struct sdw_slave_ops wcd9370_slave_ops = { .interrupt_callback = wcd9370_interrupt_callback, }; -static int wcd937x_sdw_component_bind(struct device *dev, - struct device *master, void *data) -{ - pm_runtime_set_autosuspend_delay(dev, 3000); - pm_runtime_use_autosuspend(dev); - pm_runtime_mark_last_busy(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - - return 0; -} - -static void wcd937x_sdw_component_unbind(struct device *dev, - struct device *master, void *data) -{ - pm_runtime_disable(dev); - pm_runtime_set_suspended(dev); - pm_runtime_dont_use_autosuspend(dev); -} - -static const struct component_ops wcd937x_sdw_component_ops = { - .bind = wcd937x_sdw_component_bind, - .unbind = wcd937x_sdw_component_unbind, -}; - static int wcd9370_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) { @@ -1093,7 +1068,7 @@ static int wcd9370_probe(struct sdw_slave *pdev, } - ret = component_add(dev, &wcd937x_sdw_component_ops); + ret = component_add(dev, &wcd_sdw_component_ops); if (ret) return ret; @@ -1107,7 +1082,7 @@ static int wcd9370_remove(struct sdw_slave *pdev) { struct device *dev = &pdev->dev; - component_del(dev, &wcd937x_sdw_component_ops); + 
component_del(dev, &wcd_sdw_component_ops); return 0; } diff --git a/sound/soc/codecs/wcd938x-sdw.c b/sound/soc/codecs/wcd938x-sdw.c index 1dc13b6fabfa6e..92714aef09d50e 100644 --- a/sound/soc/codecs/wcd938x-sdw.c +++ b/sound/soc/codecs/wcd938x-sdw.c @@ -1182,22 +1182,6 @@ static const struct sdw_slave_ops wcd9380_slave_ops = { .bus_config = wcd9380_bus_config, }; -static int wcd938x_sdw_component_bind(struct device *dev, - struct device *master, void *data) -{ - return 0; -} - -static void wcd938x_sdw_component_unbind(struct device *dev, - struct device *master, void *data) -{ -} - -static const struct component_ops wcd938x_sdw_component_ops = { - .bind = wcd938x_sdw_component_bind, - .unbind = wcd938x_sdw_component_unbind, -}; - static int wcd9380_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) { @@ -1262,7 +1246,7 @@ static int wcd9380_probe(struct sdw_slave *pdev, pm_runtime_set_active(dev); pm_runtime_enable(dev); - ret = component_add(dev, &wcd938x_sdw_component_ops); + ret = component_add(dev, &wcd_sdw_component_ops); if (ret) goto err_disable_rpm; @@ -1280,7 +1264,7 @@ static int wcd9380_remove(struct sdw_slave *pdev) { struct device *dev = &pdev->dev; - component_del(dev, &wcd938x_sdw_component_ops); + component_del(dev, &wcd_sdw_component_ops); pm_runtime_disable(dev); pm_runtime_set_suspended(dev); diff --git a/sound/soc/codecs/wcd939x-sdw.c b/sound/soc/codecs/wcd939x-sdw.c index 97a829e3ce4f51..6aecad2a28aaae 100644 --- a/sound/soc/codecs/wcd939x-sdw.c +++ b/sound/soc/codecs/wcd939x-sdw.c @@ -1378,31 +1378,6 @@ static const struct sdw_slave_ops wcd9390_slave_ops = { .bus_config = wcd9390_bus_config, }; -static int wcd939x_sdw_component_bind(struct device *dev, struct device *master, - void *data) -{ - pm_runtime_set_autosuspend_delay(dev, 3000); - pm_runtime_use_autosuspend(dev); - pm_runtime_mark_last_busy(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - - return 0; -} - -static void wcd939x_sdw_component_unbind(struct device *dev, - struct device *master, void *data) -{ - pm_runtime_disable(dev); - pm_runtime_set_suspended(dev); - pm_runtime_dont_use_autosuspend(dev); -} - -static const struct component_ops wcd939x_sdw_component_ops = { - .bind = wcd939x_sdw_component_bind, - .unbind = wcd939x_sdw_component_unbind, -}; - static int wcd9390_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) { struct device *dev = &pdev->dev; @@ -1466,7 +1441,7 @@ static int wcd9390_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) regcache_cache_only(wcd->regmap, true); } - ret = component_add(dev, &wcd939x_sdw_component_ops); + ret = component_add(dev, &wcd_sdw_component_ops); if (ret) return ret; @@ -1481,7 +1456,7 @@ static int wcd9390_remove(struct sdw_slave *pdev) struct device *dev = &pdev->dev; struct wcd939x_sdw_priv *wcd = dev_get_drvdata(dev); - component_del(dev, &wcd939x_sdw_component_ops); + component_del(dev, &wcd_sdw_component_ops); if (wcd->regmap) regmap_exit(wcd->regmap); From 45f2c5e1d1fa413e862379f0ec765c3fdd07ec8e Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:51 +0100 Subject: [PATCH 2181/3206] ASoC: codecs: wcd939x: get regmap directly For some reason we ended up with boilerplate around dev_get_regmap in the wcd939x codec and started exporting a symbol for it. Remove this redundant wrapper and get the regmap directly.
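Beyond the duplication, the !SOUNDWIRE stub removed in the diff below returned PTR_ERR(-EINVAL) where a struct regmap pointer was expected (it appears to have meant ERR_PTR(-EINVAL)), so dropping the wrapper also removes a latent type bug. The change swaps error-handling conventions, roughly as in this sketch, where map and priv are placeholder names:

	struct regmap *map;

	/* Before: the getter followed the ERR_PTR() convention. */
	map = wcd939x_swr_get_regmap(priv);
	if (IS_ERR(map))
		return PTR_ERR(map);

	/* After: the plain pointer member is simply NULL-checked. */
	map = priv->regmap;
	if (!map)
		return -ENODEV;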
Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-11-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd939x-sdw.c | 9 --------- sound/soc/codecs/wcd939x.c | 6 +++--- sound/soc/codecs/wcd939x.h | 6 ------ 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/sound/soc/codecs/wcd939x-sdw.c b/sound/soc/codecs/wcd939x-sdw.c index 6aecad2a28aaae..38da706d80be15 100644 --- a/sound/soc/codecs/wcd939x-sdw.c +++ b/sound/soc/codecs/wcd939x-sdw.c @@ -187,15 +187,6 @@ int wcd939x_sdw_set_sdw_stream(struct wcd939x_sdw_priv *wcd, } EXPORT_SYMBOL_GPL(wcd939x_sdw_set_sdw_stream); -struct regmap *wcd939x_swr_get_regmap(struct wcd939x_sdw_priv *wcd) -{ - if (wcd->regmap) - return wcd->regmap; - - return ERR_PTR(-EINVAL); -} -EXPORT_SYMBOL_GPL(wcd939x_swr_get_regmap); - static int wcd9390_update_status(struct sdw_slave *slave, enum sdw_slave_status status) { diff --git a/sound/soc/codecs/wcd939x.c b/sound/soc/codecs/wcd939x.c index a414cd99b9466a..e74e6f0131318c 100644 --- a/sound/soc/codecs/wcd939x.c +++ b/sound/soc/codecs/wcd939x.c @@ -3374,10 +3374,10 @@ static int wcd939x_bind(struct device *dev) } /* Get regmap from TX SoundWire device */ - wcd939x->regmap = wcd939x_swr_get_regmap(wcd939x->sdw_priv[AIF1_CAP]); - if (IS_ERR(wcd939x->regmap)) { + wcd939x->regmap = wcd939x->sdw_priv[AIF1_CAP]->regmap; + if (!wcd939x->regmap) { dev_err(dev, "could not get TX device regmap\n"); - ret = PTR_ERR(wcd939x->regmap); + ret = -ENODEV; goto err_remove_rx_link; } diff --git a/sound/soc/codecs/wcd939x.h b/sound/soc/codecs/wcd939x.h index ca6353222ea01e..6bd2366587a8f1 100644 --- a/sound/soc/codecs/wcd939x.h +++ b/sound/soc/codecs/wcd939x.h @@ -918,8 +918,6 @@ int wcd939x_sdw_hw_params(struct wcd939x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); - -struct regmap *wcd939x_swr_get_regmap(struct wcd939x_sdw_priv *wcd); #else static inline int wcd939x_sdw_free(struct wcd939x_sdw_priv *wcd, @@ -944,10 +942,6 @@ static inline int wcd939x_sdw_hw_params(struct wcd939x_sdw_priv *wcd, return -EOPNOTSUPP; } -struct regmap *wcd939x_swr_get_regmap(struct wcd939x_sdw_priv *wcd) -{ - return PTR_ERR(-EINVAL); -} #endif /* CONFIG_SND_SOC_WCD939X_SDW */ #endif /* __WCD939X_H__ */ From 59aebbbb0b47ee97c15cb6992c0fd665289544de Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:52 +0100 Subject: [PATCH 2182/3206] ASoC: codecs: wcd-common: move update_status callback to common The SoundWire update_status, bus_config and interrupt callbacks for the wcd937x, wcd938x and wcd939x codecs are exactly identical; move them to the common driver to remove this duplicate code.
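A sketch of what each codec's sdw_slave_ops looks like after this conversion: update_status and bus_config point straight at the common helpers, while the interrupt callback stays a thin per-chip wrapper because only the chip knows its interrupt-status register addresses. The WCD93XX_* names and the priv type below are placeholders for the per-chip values visible in the diff that follows:

static int wcd93xx_interrupt_callback(struct sdw_slave *slave,
				      struct sdw_slave_intr_status *status)
{
	struct wcd93xx_sdw_priv *wcd = dev_get_drvdata(&slave->dev);

	/* Delegate the drain loop to the common helper, passing the
	 * chip-specific status register addresses. */
	return wcd_interrupt_callback(slave, wcd->slave_irq,
				      WCD93XX_DIGITAL_INTR_STATUS_0,
				      WCD93XX_DIGITAL_INTR_STATUS_1,
				      WCD93XX_DIGITAL_INTR_STATUS_2);
}

static const struct sdw_slave_ops wcd93xx_slave_ops = {
	.update_status	    = wcd_update_status,
	/* bus_config only on chips that need the bank-switch write */
	.bus_config	    = wcd_bus_config,
	.interrupt_callback = wcd93xx_interrupt_callback,
};

Note that wcd_update_status() locates the regmap with dev_get_regmap(&slave->dev, NULL), so the common helper only works because each codec registers its regmap against the SoundWire slave device.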
Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-12-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd-common.c | 46 ++++++++++++++++++++++++++++++++++ sound/soc/codecs/wcd-common.h | 5 ++++ sound/soc/codecs/wcd937x-sdw.c | 28 +++------------------ sound/soc/codecs/wcd938x-sdw.c | 41 +++--------------------------- sound/soc/codecs/wcd939x-sdw.c | 42 +++---------------------------- 5 files changed, 62 insertions(+), 100 deletions(-) diff --git a/sound/soc/codecs/wcd-common.c b/sound/soc/codecs/wcd-common.c index 7a48cc5064ccf4..9bbfda82837794 100644 --- a/sound/soc/codecs/wcd-common.c +++ b/sound/soc/codecs/wcd-common.c @@ -9,6 +9,9 @@ #include #include #include +#include +#include +#include #include "wcd-common.h" @@ -16,6 +19,8 @@ #define WCD_DEF_MICBIAS_MV 1800 #define WCD_MAX_MICBIAS_MV 2850 +#define SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(m) (0xE0 + 0x10 * (m)) + int wcd_get_micb_vout_ctl_val(struct device *dev, u32 micb_mv) { /* min micbias voltage is 1V and maximum is 2.85V */ @@ -93,5 +98,46 @@ const struct component_ops wcd_sdw_component_ops = { }; EXPORT_SYMBOL_GPL(wcd_sdw_component_ops); +int wcd_update_status(struct sdw_slave *slave, enum sdw_slave_status status) +{ + struct regmap *regmap = dev_get_regmap(&slave->dev, NULL); + + if (regmap && status == SDW_SLAVE_ATTACHED) { + /* Write out any cached changes that happened between probe and attach */ + regcache_cache_only(regmap, false); + return regcache_sync(regmap); + } + + return 0; +} +EXPORT_SYMBOL_GPL(wcd_update_status); + +int wcd_bus_config(struct sdw_slave *slave, struct sdw_bus_params *params) +{ + sdw_write(slave, SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(params->next_bank), 0x01); + + return 0; +} +EXPORT_SYMBOL_GPL(wcd_bus_config); + +int wcd_interrupt_callback(struct sdw_slave *slave, struct irq_domain *slave_irq, + unsigned int wcd_intr_status0, unsigned int wcd_intr_status1, + unsigned int wcd_intr_status2) +{ + struct regmap *regmap = dev_get_regmap(&slave->dev, NULL); + u32 sts1, sts2, sts3; + + do { + handle_nested_irq(irq_find_mapping(slave_irq, 0)); + regmap_read(regmap, wcd_intr_status0, &sts1); + regmap_read(regmap, wcd_intr_status1, &sts2); + regmap_read(regmap, wcd_intr_status2, &sts3); + + } while (sts1 || sts2 || sts3); + + return IRQ_HANDLED; +} +EXPORT_SYMBOL_GPL(wcd_interrupt_callback); + MODULE_DESCRIPTION("Common Qualcomm WCD Codec helpers driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/wcd-common.h b/sound/soc/codecs/wcd-common.h index 0d4e9f8e39f215..d5c156e641fc33 100644 --- a/sound/soc/codecs/wcd-common.h +++ b/sound/soc/codecs/wcd-common.h @@ -37,5 +37,10 @@ struct wcd_common { extern const struct component_ops wcd_sdw_component_ops; int wcd_get_micb_vout_ctl_val(struct device *dev, u32 micb_mv); int wcd_dt_parse_micbias_info(struct wcd_common *common); +int wcd_update_status(struct sdw_slave *slave, enum sdw_slave_status status); +int wcd_bus_config(struct sdw_slave *slave, struct sdw_bus_params *params); +int wcd_interrupt_callback(struct sdw_slave *slave, struct irq_domain *slave_irq, + unsigned int wcd_intr_status0, unsigned int wcd_intr_status1, + unsigned int wcd_intr_status2); #endif /* __WCD_COMMON_H__ */ diff --git a/sound/soc/codecs/wcd937x-sdw.c b/sound/soc/codecs/wcd937x-sdw.c index 59c353cafd31c6..1878d67e3fa109 100644 --- a/sound/soc/codecs/wcd937x-sdw.c +++ b/sound/soc/codecs/wcd937x-sdw.c @@ -112,19 +112,6 @@ int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, } 
EXPORT_SYMBOL_GPL(wcd937x_sdw_hw_params); -static int wcd9370_update_status(struct sdw_slave *slave, enum sdw_slave_status status) -{ - struct wcd937x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - - if (wcd->regmap && status == SDW_SLAVE_ATTACHED) { - /* Write out any cached changes that happened between probe and attach */ - regcache_cache_only(wcd->regmap, false); - return regcache_sync(wcd->regmap); - } - - return 0; -} - /* * Handle Soundwire out-of-band interrupt event by triggering * the first irq of the slave_irq irq domain, which then will @@ -135,18 +122,9 @@ static int wcd9370_interrupt_callback(struct sdw_slave *slave, struct sdw_slave_intr_status *status) { struct wcd937x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - struct irq_domain *slave_irq = wcd->slave_irq; - u32 sts1, sts2, sts3; - - do { - handle_nested_irq(irq_find_mapping(slave_irq, 0)); - regmap_read(wcd->regmap, WCD937X_DIGITAL_INTR_STATUS_0, &sts1); - regmap_read(wcd->regmap, WCD937X_DIGITAL_INTR_STATUS_1, &sts2); - regmap_read(wcd->regmap, WCD937X_DIGITAL_INTR_STATUS_2, &sts3); - - } while (sts1 || sts2 || sts3); - return IRQ_HANDLED; + return wcd_interrupt_callback(slave, wcd->slave_irq, WCD937X_DIGITAL_INTR_STATUS_0, + WCD937X_DIGITAL_INTR_STATUS_1, WCD937X_DIGITAL_INTR_STATUS_2); } static const struct reg_default wcd937x_defaults[] = { @@ -979,7 +957,7 @@ static const struct regmap_config wcd937x_regmap_config = { }; static const struct sdw_slave_ops wcd9370_slave_ops = { - .update_status = wcd9370_update_status, + .update_status = wcd_update_status, .interrupt_callback = wcd9370_interrupt_callback, }; diff --git a/sound/soc/codecs/wcd938x-sdw.c b/sound/soc/codecs/wcd938x-sdw.c index 92714aef09d50e..add907cb270629 100644 --- a/sound/soc/codecs/wcd938x-sdw.c +++ b/sound/soc/codecs/wcd938x-sdw.c @@ -20,8 +20,6 @@ #include "wcd938x.h" #include "wcd-common.h" -#define SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(m) (0xE0 + 0x10 * (m)) - static const struct wcd_sdw_ch_info wcd938x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD938X_HPH_L, WCD938X_HPH_PORT, BIT(0)), WCD_SDW_CH(WCD938X_HPH_R, WCD938X_HPH_PORT, BIT(1)), @@ -142,44 +140,13 @@ int wcd938x_sdw_set_sdw_stream(struct wcd938x_sdw_priv *wcd, } EXPORT_SYMBOL_GPL(wcd938x_sdw_set_sdw_stream); -static int wcd9380_update_status(struct sdw_slave *slave, - enum sdw_slave_status status) -{ - struct wcd938x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - - if (wcd->regmap && (status == SDW_SLAVE_ATTACHED)) { - /* Write out any cached changes that happened between probe and attach */ - regcache_cache_only(wcd->regmap, false); - return regcache_sync(wcd->regmap); - } - - return 0; -} - -static int wcd9380_bus_config(struct sdw_slave *slave, - struct sdw_bus_params *params) -{ - sdw_write(slave, SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(params->next_bank), 0x01); - - return 0; -} - static int wcd9380_interrupt_callback(struct sdw_slave *slave, struct sdw_slave_intr_status *status) { struct wcd938x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - struct irq_domain *slave_irq = wcd->slave_irq; - u32 sts1, sts2, sts3; - - do { - handle_nested_irq(irq_find_mapping(slave_irq, 0)); - regmap_read(wcd->regmap, WCD938X_DIGITAL_INTR_STATUS_0, &sts1); - regmap_read(wcd->regmap, WCD938X_DIGITAL_INTR_STATUS_1, &sts2); - regmap_read(wcd->regmap, WCD938X_DIGITAL_INTR_STATUS_2, &sts3); - - } while (sts1 || sts2 || sts3); - return IRQ_HANDLED; + return wcd_interrupt_callback(slave, wcd->slave_irq, WCD938X_DIGITAL_INTR_STATUS_0, + WCD938X_DIGITAL_INTR_STATUS_1, WCD938X_DIGITAL_INTR_STATUS_2); } static const struct 
reg_default wcd938x_defaults[] = { @@ -1177,9 +1144,9 @@ static const struct regmap_config wcd938x_regmap_config = { }; static const struct sdw_slave_ops wcd9380_slave_ops = { - .update_status = wcd9380_update_status, + .update_status = wcd_update_status, .interrupt_callback = wcd9380_interrupt_callback, - .bus_config = wcd9380_bus_config, + .bus_config = wcd_bus_config, }; static int wcd9380_probe(struct sdw_slave *pdev, diff --git a/sound/soc/codecs/wcd939x-sdw.c b/sound/soc/codecs/wcd939x-sdw.c index 38da706d80be15..d369100a245742 100644 --- a/sound/soc/codecs/wcd939x-sdw.c +++ b/sound/soc/codecs/wcd939x-sdw.c @@ -22,8 +22,6 @@ #include "wcd939x.h" #include "wcd-common.h" -#define SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(m) (0xE0 + 0x10 * (m)) - static const struct wcd_sdw_ch_info wcd939x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD939X_HPH_L, WCD939X_HPH_PORT, BIT(0)), WCD_SDW_CH(WCD939X_HPH_R, WCD939X_HPH_PORT, BIT(1)), @@ -187,29 +185,6 @@ int wcd939x_sdw_set_sdw_stream(struct wcd939x_sdw_priv *wcd, } EXPORT_SYMBOL_GPL(wcd939x_sdw_set_sdw_stream); -static int wcd9390_update_status(struct sdw_slave *slave, - enum sdw_slave_status status) -{ - struct wcd939x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - - if (wcd->regmap && status == SDW_SLAVE_ATTACHED) { - /* Write out any cached changes that happened between probe and attach */ - regcache_cache_only(wcd->regmap, false); - return regcache_sync(wcd->regmap); - } - - return 0; -} - -static int wcd9390_bus_config(struct sdw_slave *slave, - struct sdw_bus_params *params) -{ - sdw_write(slave, SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(params->next_bank), - 0x01); - - return 0; -} - /* * Handle Soundwire out-of-band interrupt event by triggering * the first irq of the slave_irq irq domain, which then will @@ -220,18 +195,9 @@ static int wcd9390_interrupt_callback(struct sdw_slave *slave, struct sdw_slave_intr_status *status) { struct wcd939x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - struct irq_domain *slave_irq = wcd->slave_irq; - u32 sts1, sts2, sts3; - - do { - handle_nested_irq(irq_find_mapping(slave_irq, 0)); - regmap_read(wcd->regmap, WCD939X_DIGITAL_INTR_STATUS_0, &sts1); - regmap_read(wcd->regmap, WCD939X_DIGITAL_INTR_STATUS_1, &sts2); - regmap_read(wcd->regmap, WCD939X_DIGITAL_INTR_STATUS_2, &sts3); - - } while (sts1 || sts2 || sts3); - return IRQ_HANDLED; + return wcd_interrupt_callback(slave, wcd->slave_irq, WCD939X_DIGITAL_INTR_STATUS_0, + WCD939X_DIGITAL_INTR_STATUS_1, WCD939X_DIGITAL_INTR_STATUS_2); } static const struct reg_default wcd939x_defaults[] = { @@ -1364,9 +1330,9 @@ static const struct regmap_config wcd939x_regmap_config = { }; static const struct sdw_slave_ops wcd9390_slave_ops = { - .update_status = wcd9390_update_status, + .update_status = wcd_update_status, .interrupt_callback = wcd9390_interrupt_callback, - .bus_config = wcd9390_bus_config, + .bus_config = wcd_bus_config, }; static int wcd9390_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) From edf8918028e226515c3869c3b8b16f12fe6e62fe Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:53 +0100 Subject: [PATCH 2183/3206] ASoC: codecs: wcd938x: get regmap directly Remove the usage of dev_get_regmap(), as it is more efficient to reference the pointer directly. 
Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-13-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd938x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index e495f98972f1a2..e1a4783b984c17 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -3389,7 +3389,7 @@ static int wcd938x_bind(struct device *dev) goto err_remove_tx_link; } - wcd938x->regmap = dev_get_regmap(&wcd938x->tx_sdw_dev->dev, NULL); + wcd938x->regmap = wcd938x->sdw_priv[AIF1_CAP]->regmap; if (!wcd938x->regmap) { dev_err(dev, "could not get TX device regmap\n"); ret = -EINVAL; From 0266f9541038b9b98ddd387132b5bdfe32a304e3 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:54 +0100 Subject: [PATCH 2184/3206] ASoC: codecs: wcd937x: get regmap directly Remove the usage of dev_get_regmap(), as it is more efficient to reference the pointer directly. Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-14-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd937x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/wcd937x.c b/sound/soc/codecs/wcd937x.c index 4c040e3862f481..421ec7a2d6bdc8 100644 --- a/sound/soc/codecs/wcd937x.c +++ b/sound/soc/codecs/wcd937x.c @@ -2790,7 +2790,7 @@ static int wcd937x_bind(struct device *dev) return -EINVAL; } - wcd937x->regmap = dev_get_regmap(&wcd937x->tx_sdw_dev->dev, NULL); + wcd937x->regmap = wcd937x->sdw_priv[AIF1_CAP]->regmap; if (!wcd937x->regmap) { dev_err(dev, "could not get TX device regmap\n"); return -EINVAL; From ee6cd8f3e28ee5a929c3b67c01a350f550f9b73a Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:45 +0300 Subject: [PATCH 2185/3206] power: supply: max77976_charger: fix constant current reporting CHARGE_CONTROL_LIMIT is the wrong property for reporting the charge current limit, because `CHARGE_*` attributes represent capacity, not current. The correct attribute for reporting and setting the charge current limit is CONSTANT_CHARGE_CURRENT. Rename CHARGE_CONTROL_LIMIT to CONSTANT_CHARGE_CURRENT. 
Cc: stable@vger.kernel.org Fixes: 715ecbc10d6a ("power: supply: max77976: add Maxim MAX77976 charger driver") Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77976_charger.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/power/supply/max77976_charger.c b/drivers/power/supply/max77976_charger.c index e6fe68cebc32b6..3d6ff400553305 100644 --- a/drivers/power/supply/max77976_charger.c +++ b/drivers/power/supply/max77976_charger.c @@ -292,10 +292,10 @@ static int max77976_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_ONLINE: err = max77976_get_online(chg, &val->intval); break; - case POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: val->intval = MAX77976_CHG_CC_MAX; break; - case POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: err = max77976_get_integer(chg, CHG_CC, MAX77976_CHG_CC_MIN, MAX77976_CHG_CC_MAX, @@ -330,7 +330,7 @@ static int max77976_set_property(struct power_supply *psy, int err = 0; switch (psp) { - case POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: err = max77976_set_integer(chg, CHG_CC, MAX77976_CHG_CC_MIN, MAX77976_CHG_CC_MAX, @@ -355,7 +355,7 @@ static int max77976_property_is_writeable(struct power_supply *psy, enum power_supply_property psp) { switch (psp) { - case POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: return true; default: @@ -368,8 +368,8 @@ static enum power_supply_property max77976_psy_props[] = { POWER_SUPPLY_PROP_CHARGE_TYPE, POWER_SUPPLY_PROP_HEALTH, POWER_SUPPLY_PROP_ONLINE, - POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT, - POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX, + POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT, + POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX, POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, From ce2335cd14b6585d57dc05a4375a506079adba81 Mon Sep 17 00:00:00 2001 From: Samasth Norway Ananda Date: Tue, 9 Sep 2025 15:51:11 -0700 Subject: [PATCH 2186/3206] ASoC: SOF: ipc3-dtrace: fix potential integer overflow in allocation Fix a potential integer overflow vulnerability in trace_filter_parse() where the allocation size calculation could overflow. The issue occurs when: 1. capacity is calculated by adding TRACE_FILTER_ELEMENTS_PER_ENTRY in a loop for each entry found in the input string. 2. capacity * sizeof(**out) multiplication could overflow if many entries are present in the input. 3. This results in a smaller allocation than expected, leading to potential buffer overflow. Replace kmalloc() with kmalloc_array() which provides built-in overflow checking and will safely fail the allocation if overflow would occur, preventing memory corruption. 
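For illustration only (not part of the fix): kmalloc_array() closes the hole by rejecting the request when the multiplication would wrap, along the lines of the following sketch. The helper name here is made up; the real implementation lives in include/linux/slab.h and follows the same pattern.

	/* Sketch of the overflow check that kmalloc_array() performs */
	static inline void *alloc_array_checked(size_t n, size_t size, gfp_t flags)
	{
		size_t bytes;

		/* n * size computed with an explicit overflow check */
		if (check_mul_overflow(n, size, &bytes))
			return NULL;	/* caller then takes the -ENOMEM path */

		return kmalloc(bytes, flags);
	}
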
Signed-off-by: Samasth Norway Ananda Link: https://patch.msgid.link/20250909225111.3740029-1-samasth.norway.ananda@oracle.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc3-dtrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sof/ipc3-dtrace.c b/sound/soc/sof/ipc3-dtrace.c index e5c8fec173c4f2..6ec391fd39a902 100644 --- a/sound/soc/sof/ipc3-dtrace.c +++ b/sound/soc/sof/ipc3-dtrace.c @@ -126,7 +126,7 @@ static int trace_filter_parse(struct snd_sof_dev *sdev, char *string, capacity += TRACE_FILTER_ELEMENTS_PER_ENTRY; entry = strchr(entry + 1, entry_delimiter[0]); } - *out = kmalloc(capacity * sizeof(**out), GFP_KERNEL); + *out = kmalloc_array(capacity, sizeof(**out), GFP_KERNEL); if (!*out) return -ENOMEM; From 9565c9d53c5b440f0dde6fa731a99c1b14d879d2 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Thu, 11 Sep 2025 16:43:40 +0100 Subject: [PATCH 2187/3206] ASoC: qcom: sc8280xp: explicitly set S16LE format in sc8280xp_be_hw_params_fixup() Setting format to s16le is required for compressed playback on compatible soundcards. Signed-off-by: Alexey Klimov Link: https://patch.msgid.link/20250911154340.2798304-1-alexey.klimov@linaro.org Signed-off-by: Mark Brown --- sound/soc/qcom/sc8280xp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/qcom/sc8280xp.c b/sound/soc/qcom/sc8280xp.c index 73f9f82c4e2581..5d10b1c5909e83 100644 --- a/sound/soc/qcom/sc8280xp.c +++ b/sound/soc/qcom/sc8280xp.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -82,8 +83,10 @@ static int sc8280xp_be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, SNDRV_PCM_HW_PARAM_RATE); struct snd_interval *channels = hw_param_interval(params, SNDRV_PCM_HW_PARAM_CHANNELS); + struct snd_mask *fmt = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT); rate->min = rate->max = 48000; + snd_mask_set_format(fmt, SNDRV_PCM_FORMAT_S16_LE); channels->min = 2; channels->max = 2; switch (cpu_dai->id) { From 2aa28b748fc967a2f2566c06bdad155fba8af7d8 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 4 Sep 2025 17:31:42 +0200 Subject: [PATCH 2188/3206] ASoC: da7213: Convert to DEFINE_RUNTIME_DEV_PM_OPS() Convert the Dialog DA7213 CODEC driver from an open-coded dev_pm_ops structure to DEFINE_RUNTIME_DEV_PM_OPS(), to simplify the code. 
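For reference, the macro generates essentially the same dev_pm_ops contents that the driver open-coded, so the conversion is mechanical. Roughly (a sketch based on the removed code; see include/linux/pm_runtime.h for the exact expansion):

	static const struct dev_pm_ops da7213_pm = {
		SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume)
		RUNTIME_PM_OPS(da7213_runtime_suspend, da7213_runtime_resume, NULL)
	};
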
Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/0c001e0f7658c2d5f33faea963d6ca64f60ccea8.1756999876.git.geert+renesas@glider.be Signed-off-by: Mark Brown --- sound/soc/codecs/da7213.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/da7213.c b/sound/soc/codecs/da7213.c index a4496cc26902b8..ae89260ca215ff 100644 --- a/sound/soc/codecs/da7213.c +++ b/sound/soc/codecs/da7213.c @@ -2247,10 +2247,8 @@ static int da7213_runtime_resume(struct device *dev) return regcache_sync(da7213->regmap); } -static const struct dev_pm_ops da7213_pm = { - RUNTIME_PM_OPS(da7213_runtime_suspend, da7213_runtime_resume, NULL) - SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) -}; +static DEFINE_RUNTIME_DEV_PM_OPS(da7213_pm, da7213_runtime_suspend, + da7213_runtime_resume, NULL); static const struct i2c_device_id da7213_i2c_id[] = { { "da7213" }, From 68f6b403ee90293b6fa4eb094dcee79138b692b2 Mon Sep 17 00:00:00 2001 From: Dharma Balasubiramani Date: Mon, 8 Sep 2025 09:44:16 +0530 Subject: [PATCH 2189/3206] dt-bindings: spi: Document sam9x7 QSPI Document the sam9x7 QSPI, which supports interfacing with serial memories operating in:
 - Single-bit SPI, Dual SPI, Quad SPI and Octal SPI
 - Single Data Rate or Double Data Rate modes
Signed-off-by: Dharma Balasubiramani Acked-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250908-microchip-qspi-v2-1-8f3d69fdd5c9@microchip.com Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/atmel,quadspi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml b/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml index b0d99bc105352c..c17114123034dd 100644 --- a/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml +++ b/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml @@ -17,6 +17,7 @@ properties: enum: - atmel,sama5d2-qspi - microchip,sam9x60-qspi + - microchip,sam9x7-ospi - microchip,sama7g5-qspi - microchip,sama7g5-ospi From f3837edc05c66de0104041d3f300524773b46526 Mon Sep 17 00:00:00 2001 From: Dharma Balasubiramani Date: Mon, 8 Sep 2025 09:44:17 +0530 Subject: [PATCH 2190/3206] dt-bindings: spi: Define sama7d65 QSPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sama7d65 has 2 instances of the QSPI controller: • One Octal Serial Peripheral Interface (QSPI0) supporting DDR. Octal, Twin-Quad, HyperFlashTM and OctaFlashTM protocols supported. • One Quad Serial Peripheral Interface (QSPI1) supporting DDR/SDR. 
Signed-off-by: Dharma Balasubiramani Acked-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250908-microchip-qspi-v2-2-8f3d69fdd5c9@microchip.com Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/atmel,quadspi.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml b/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml index c17114123034dd..30ab42c95c0894 100644 --- a/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml +++ b/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml @@ -18,6 +18,8 @@ properties: - atmel,sama5d2-qspi - microchip,sam9x60-qspi - microchip,sam9x7-ospi + - microchip,sama7d65-qspi + - microchip,sama7d65-ospi - microchip,sama7g5-qspi - microchip,sama7g5-ospi From 86d074921e34db6daaa7ea2976b1dfe7d507309b Mon Sep 17 00:00:00 2001 From: Varshini Rajendran Date: Mon, 8 Sep 2025 09:44:18 +0530 Subject: [PATCH 2191/3206] spi: atmel-quadspi: add padcalib, 2xgclk, and dllon capabilities Introduce capability flags for SoC-specific variations of the QuadSPI controller: - has_padcalib: controller supports pad calibration - has_2xgclk: requires GCLK at half the data rate (2x clocking) - has_dllon: controller supports DLL clock Set `has_padcalib` for Octal controllers that provide pad calibration support. Use `has_2xgclk` for controllers that require the GCLK to run at twice the data rate. Differentiate SoC integration variants with the `has_dllon` flag and set it as needed. Signed-off-by: Varshini Rajendran Signed-off-by: Dharma Balasubiramani Link: https://patch.msgid.link/20250908-microchip-qspi-v2-3-8f3d69fdd5c9@microchip.com Signed-off-by: Mark Brown --- drivers/spi/atmel-quadspi.c | 92 ++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 32 deletions(-) diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index 4e9bfd26aa80b7..83cea5faff783b 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -262,6 +262,9 @@ struct atmel_qspi_caps { bool has_ricr; bool octal; bool has_dma; + bool has_2xgclk; + bool has_padcalib; + bool has_dllon; }; struct atmel_qspi_ops; @@ -1027,13 +1030,25 @@ static int atmel_qspi_set_pad_calibration(struct atmel_qspi *aq) aq, QSPI_PCALCFG); /* DLL On + start calibration. */ - atmel_qspi_write(QSPI_CR_DLLON | QSPI_CR_STPCAL, aq, QSPI_CR); + if (aq->caps->has_dllon) + atmel_qspi_write(QSPI_CR_DLLON | QSPI_CR_STPCAL, aq, QSPI_CR); + /* If there is no DLL support only start calibration. */ + else + atmel_qspi_write(QSPI_CR_STPCAL, aq, QSPI_CR); - /* Check synchronization status before updating configuration. */ - ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, - (val & QSPI_SR2_DLOCK) && - !(val & QSPI_SR2_CALBSY), 40, - ATMEL_QSPI_TIMEOUT); + /* + * Check DLL clock lock and synchronization status before updating + * configuration. 
+ */ + if (aq->caps->has_dllon) + ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, + (val & QSPI_SR2_DLOCK) && + !(val & QSPI_SR2_CALBSY), 40, + ATMEL_QSPI_TIMEOUT); + else + ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, + !(val & QSPI_SR2_CALBSY), 40, + ATMEL_QSPI_TIMEOUT); /* Refresh analogic blocks every 1 ms.*/ atmel_qspi_write(FIELD_PREP(QSPI_REFRESH_DELAY_COUNTER, @@ -1049,23 +1064,28 @@ static int atmel_qspi_set_gclk(struct atmel_qspi *aq) int ret; /* Disable DLL before setting GCLK */ - status = atmel_qspi_read(aq, QSPI_SR2); - if (status & QSPI_SR2_DLOCK) { - atmel_qspi_write(QSPI_CR_DLLOFF, aq, QSPI_CR); + if (aq->caps->has_dllon) { + status = atmel_qspi_read(aq, QSPI_SR2); + if (status & QSPI_SR2_DLOCK) { + atmel_qspi_write(QSPI_CR_DLLOFF, aq, QSPI_CR); + ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, + !(val & QSPI_SR2_DLOCK), 40, + ATMEL_QSPI_TIMEOUT); + if (ret) + return ret; + } - ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, - !(val & QSPI_SR2_DLOCK), 40, - ATMEL_QSPI_TIMEOUT); - if (ret) - return ret; + if (aq->target_max_speed_hz > QSPI_DLLCFG_THRESHOLD_FREQ) + atmel_qspi_write(QSPI_DLLCFG_RANGE, aq, QSPI_DLLCFG); + else + atmel_qspi_write(0, aq, QSPI_DLLCFG); } - if (aq->target_max_speed_hz > QSPI_DLLCFG_THRESHOLD_FREQ) - atmel_qspi_write(QSPI_DLLCFG_RANGE, aq, QSPI_DLLCFG); + if (aq->caps->has_2xgclk) + ret = clk_set_rate(aq->gclk, 2 * aq->target_max_speed_hz); else - atmel_qspi_write(0, aq, QSPI_DLLCFG); + ret = clk_set_rate(aq->gclk, aq->target_max_speed_hz); - ret = clk_set_rate(aq->gclk, aq->target_max_speed_hz); if (ret) { dev_err(&aq->pdev->dev, "Failed to set generic clock rate.\n"); return ret; @@ -1088,11 +1108,16 @@ static int atmel_qspi_sama7g5_init(struct atmel_qspi *aq) if (ret) return ret; - if (aq->caps->octal) { + /* + * Check if the SoC supports pad calibration in Octal SPI mode. + * Proceed only if both the capabilities are true. 
+ */ + if (aq->caps->octal && aq->caps->has_padcalib) { ret = atmel_qspi_set_pad_calibration(aq); if (ret) return ret; - } else { + /* Start DLL on only if the SoC supports the same */ + } else if (aq->caps->has_dllon) { atmel_qspi_write(QSPI_CR_DLLON, aq, QSPI_CR); ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, (val & QSPI_SR2_DLOCK), 40, @@ -1458,19 +1483,19 @@ static int atmel_qspi_sama7g5_suspend(struct atmel_qspi *aq) clk_disable_unprepare(aq->gclk); - atmel_qspi_write(QSPI_CR_DLLOFF, aq, QSPI_CR); - ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, - !(val & QSPI_SR2_DLOCK), 40, - ATMEL_QSPI_TIMEOUT); - if (ret) - return ret; - - ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, - !(val & QSPI_SR2_CALBSY), 40, - ATMEL_QSPI_TIMEOUT); - if (ret) - return ret; + if (aq->caps->has_dllon) { + atmel_qspi_write(QSPI_CR_DLLOFF, aq, QSPI_CR); + ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, + !(val & QSPI_SR2_DLOCK), 40, + ATMEL_QSPI_TIMEOUT); + if (ret) + return ret; + } + if (aq->caps->has_padcalib) + return readl_poll_timeout(aq->regs + QSPI_SR2, val, + !(val & QSPI_SR2_CALBSY), 40, + ATMEL_QSPI_TIMEOUT); return 0; } @@ -1607,12 +1632,15 @@ static const struct atmel_qspi_caps atmel_sama7g5_ospi_caps = { .has_gclk = true, .octal = true, .has_dma = true, + .has_padcalib = true, + .has_dllon = true, }; static const struct atmel_qspi_caps atmel_sama7g5_qspi_caps = { .max_speed_hz = SAMA7G5_QSPI1_SDR_MAX_SPEED_HZ, .has_gclk = true, .has_dma = true, + .has_dllon = true, }; static const struct of_device_id atmel_qspi_dt_ids[] = { From 65a977d752d72a53d16ce32b85ef9db9b0747df6 Mon Sep 17 00:00:00 2001 From: Varshini Rajendran Date: Mon, 8 Sep 2025 09:44:19 +0530 Subject: [PATCH 2192/3206] spi: atmel-quadspi: add support for SAM9X7 QSPI controller Add support for the QuadSPI controller found on the SAM9X7 SoC. This controller does not implement pad calibration. It supports operation up to 100 MHz, and requires the GCK to run at twice the data rate. 
Signed-off-by: Varshini Rajendran Signed-off-by: Dharma Balasubiramani Link: https://patch.msgid.link/20250908-microchip-qspi-v2-4-8f3d69fdd5c9@microchip.com Signed-off-by: Mark Brown --- drivers/spi/atmel-quadspi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index 83cea5faff783b..342cdd6e8d64d0 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -63,6 +63,7 @@ #define SAMA7G5_QSPI0_MAX_SPEED_HZ 200000000 #define SAMA7G5_QSPI1_SDR_MAX_SPEED_HZ 133000000 +#define SAM9X7_QSPI_MAX_SPEED_HZ 100000000 /* Bitfields in QSPI_CR (Control Register) */ #define QSPI_CR_QSPIEN BIT(0) @@ -1627,6 +1628,16 @@ static const struct atmel_qspi_caps atmel_sam9x60_qspi_caps = { .has_ricr = true, }; +static const struct atmel_qspi_caps atmel_sam9x7_ospi_caps = { + .max_speed_hz = SAM9X7_QSPI_MAX_SPEED_HZ, + .has_gclk = true, + .octal = true, + .has_dma = true, + .has_2xgclk = true, + .has_padcalib = false, + .has_dllon = false, +}; + static const struct atmel_qspi_caps atmel_sama7g5_ospi_caps = { .max_speed_hz = SAMA7G5_QSPI0_MAX_SPEED_HZ, .has_gclk = true, @@ -1660,6 +1671,10 @@ static const struct of_device_id atmel_qspi_dt_ids[] = { .compatible = "microchip,sama7g5-qspi", .data = &atmel_sama7g5_qspi_caps, }, + { + .compatible = "microchip,sam9x7-ospi", + .data = &atmel_sam9x7_ospi_caps, + }, { /* sentinel */ } }; From 20253f806818e9a1657a832ebcf4141d0a08c02a Mon Sep 17 00:00:00 2001 From: Varshini Rajendran Date: Mon, 8 Sep 2025 09:44:20 +0530 Subject: [PATCH 2193/3206] spi: atmel-quadspi: Add support for sama7d65 QSPI Add support for sama7d65 QSPI controller and define its caps. Signed-off-by: Varshini Rajendran Link: https://patch.msgid.link/20250908-microchip-qspi-v2-5-8f3d69fdd5c9@microchip.com Signed-off-by: Mark Brown --- drivers/spi/atmel-quadspi.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index 342cdd6e8d64d0..d7a3d85d00c2f3 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -1638,6 +1638,24 @@ static const struct atmel_qspi_caps atmel_sam9x7_ospi_caps = { .has_dllon = false, }; +static const struct atmel_qspi_caps atmel_sama7d65_ospi_caps = { + .max_speed_hz = SAMA7G5_QSPI0_MAX_SPEED_HZ, + .has_gclk = true, + .octal = true, + .has_dma = true, + .has_2xgclk = true, + .has_padcalib = true, + .has_dllon = false, +}; + +static const struct atmel_qspi_caps atmel_sama7d65_qspi_caps = { + .max_speed_hz = SAMA7G5_QSPI1_SDR_MAX_SPEED_HZ, + .has_gclk = true, + .has_dma = true, + .has_2xgclk = true, + .has_dllon = false, +}; + static const struct atmel_qspi_caps atmel_sama7g5_ospi_caps = { .max_speed_hz = SAMA7G5_QSPI0_MAX_SPEED_HZ, .has_gclk = true, @@ -1675,6 +1693,15 @@ static const struct of_device_id atmel_qspi_dt_ids[] = { .compatible = "microchip,sam9x7-ospi", .data = &atmel_sam9x7_ospi_caps, }, + { + .compatible = "microchip,sama7d65-ospi", + .data = &atmel_sama7d65_ospi_caps, + }, + { + .compatible = "microchip,sama7d65-qspi", + .data = &atmel_sama7d65_qspi_caps, + }, + { /* sentinel */ } }; From 614180a54d5f21ccb4f60042d19744694d31d3f8 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Wed, 17 Sep 2025 15:27:06 +0800 Subject: [PATCH 2194/3206] spi: spi-nxp-fspi: extract function nxp_fspi_dll_override() Extract the function nxp_fspi_dll_override(); this is the suggested setting when the clock rate is below 100MHz. This is just preparation for supporting DTR mode. 
Signed-off-by: Haibo Chen Reviewed-by: Frank Li Link: https://patch.msgid.link/20250917-flexspi-ddr-v2-1-bb9fe2a01889@nxp.com Signed-off-by: Mark Brown --- drivers/spi/spi-nxp-fspi.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index 848fa9319e36d6..db4b92490de957 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -674,6 +674,17 @@ static void nxp_fspi_dll_calibration(struct nxp_fspi *f) dev_warn(f->dev, "DLL lock failed, please fix it!\n"); } +/* + * Config the DLL register to default value, enable the target clock delay + * line delay cell override mode, and use 1 fixed delay cell in DLL delay + * chain, this is the suggested setting when clock rate < 100MHz. + */ +static void nxp_fspi_dll_override(struct nxp_fspi *f) +{ + fspi_writel(f, FSPI_DLLACR_OVRDEN, f->iobase + FSPI_DLLACR); + fspi_writel(f, FSPI_DLLBCR_OVRDEN, f->iobase + FSPI_DLLBCR); +} + /* * In FlexSPI controller, flash access is based on value of FSPI_FLSHXXCR0 * register and start base address of the target device. @@ -1071,13 +1082,7 @@ static int nxp_fspi_default_setup(struct nxp_fspi *f) /* Disable the module */ fspi_writel(f, FSPI_MCR0_MDIS, base + FSPI_MCR0); - /* - * Config the DLL register to default value, enable the target clock delay - * line delay cell override mode, and use 1 fixed delay cell in DLL delay - * chain, this is the suggested setting when clock rate < 100MHz. - */ - fspi_writel(f, FSPI_DLLACR_OVRDEN, base + FSPI_DLLACR); - fspi_writel(f, FSPI_DLLBCR_OVRDEN, base + FSPI_DLLBCR); + nxp_fspi_dll_override(f); /* enable module */ fspi_writel(f, FSPI_MCR0_AHB_TIMEOUT(0xFF) | From a9888b3222ec73d055447a39cf9a0118f67497f4 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Wed, 17 Sep 2025 15:27:07 +0800 Subject: [PATCH 2195/3206] spi: spi-nxp-fspi: set back to dll override mode when clock rate < 100MHz Preparation for supporting DTR mode. In NOR suspend, the driver will disable DTR mode, and enable DTR mode back in NOR resume. This requires the flexspi driver to have the ability to set back to DLL override mode in STR mode when the clock rate is below 100MHz. Signed-off-by: Haibo Chen Reviewed-by: Frank Li Link: https://patch.msgid.link/20250917-flexspi-ddr-v2-2-bb9fe2a01889@nxp.com Signed-off-by: Mark Brown --- drivers/spi/spi-nxp-fspi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index db4b92490de957..a1c9ad03379682 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -767,6 +767,8 @@ static void nxp_fspi_select_mem(struct nxp_fspi *f, struct spi_device *spi, */ if (rate > 100000000) nxp_fspi_dll_calibration(f); + else + nxp_fspi_dll_override(f); f->selected = spi_get_chipselect(spi, 0); } From 3c1000e15fd0eb387fcca420c9fb36ae07887782 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Wed, 17 Sep 2025 15:27:08 +0800 Subject: [PATCH 2196/3206] spi: spi-nxp-fspi: Add the DDR LUT command support For DTR mode, flexspi needs to use DDR LUT commands; flexspi will switch to DDR mode when it detects a DDR LUT command. 
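To illustrate the LUT layout with a made-up 2-byte opcode 0xEE11: in DTR mode the opcode is split across two LUT_CMD_DDR instructions, high byte first, matching the opcode >> 8 / opcode & 0xFF handling in the patch below (sketch only):

	lutval[0] |= LUT_DEF(0, LUT_CMD_DDR, LUT_PAD(8), 0xEE);	/* opcode >> 8 */
	lutval[0] |= LUT_DEF(1, LUT_CMD_DDR, LUT_PAD(8), 0x11);	/* opcode & 0xFF */

whereas an STR opcode still occupies a single LUT_CMD slot.
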
Reviewed-by: Frank Li Signed-off-by: Haibo Chen Link: https://patch.msgid.link/20250917-flexspi-ddr-v2-3-bb9fe2a01889@nxp.com Signed-off-by: Mark Brown --- drivers/spi/spi-nxp-fspi.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index a1c9ad03379682..bd61f951d6befd 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -559,12 +559,21 @@ static void nxp_fspi_prepare_lut(struct nxp_fspi *f, u32 target_lut_reg; /* cmd */ - lutval[0] |= LUT_DEF(0, LUT_CMD, LUT_PAD(op->cmd.buswidth), - op->cmd.opcode); + if (op->cmd.dtr) { + lutval[0] |= LUT_DEF(0, LUT_CMD_DDR, LUT_PAD(op->cmd.buswidth), + op->cmd.opcode >> 8); + lutval[lutidx / 2] |= LUT_DEF(lutidx, LUT_CMD_DDR, + LUT_PAD(op->cmd.buswidth), + op->cmd.opcode & 0xFF); + lutidx++; + } else { + lutval[0] |= LUT_DEF(0, LUT_CMD, LUT_PAD(op->cmd.buswidth), + op->cmd.opcode); + } /* addr bytes */ if (op->addr.nbytes) { - lutval[lutidx / 2] |= LUT_DEF(lutidx, LUT_ADDR, + lutval[lutidx / 2] |= LUT_DEF(lutidx, op->addr.dtr ? LUT_ADDR_DDR : LUT_ADDR, LUT_PAD(op->addr.buswidth), op->addr.nbytes * 8); lutidx++; @@ -572,7 +581,7 @@ static void nxp_fspi_prepare_lut(struct nxp_fspi *f, /* dummy bytes, if needed */ if (op->dummy.nbytes) { - lutval[lutidx / 2] |= LUT_DEF(lutidx, LUT_DUMMY, + lutval[lutidx / 2] |= LUT_DEF(lutidx, op->dummy.dtr ? LUT_DUMMY_DDR : LUT_DUMMY, /* * Due to FlexSPI controller limitation number of PAD for dummy * buswidth needs to be programmed as equal to data buswidth. @@ -587,7 +596,8 @@ static void nxp_fspi_prepare_lut(struct nxp_fspi *f, if (op->data.nbytes) { lutval[lutidx / 2] |= LUT_DEF(lutidx, op->data.dir == SPI_MEM_DATA_IN ? - LUT_NXP_READ : LUT_NXP_WRITE, + (op->data.dtr ? LUT_READ_DDR : LUT_NXP_READ) : + (op->data.dtr ? LUT_WRITE_DDR : LUT_NXP_WRITE), LUT_PAD(op->data.buswidth), 0); lutidx++; From c07f270323175b83779c2c2b80b360ed476baec5 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Wed, 17 Sep 2025 15:27:09 +0800 Subject: [PATCH 2197/3206] spi: spi-nxp-fspi: add the support for sample data from DQS pad flexspi defines four modes for sample clock source selection. Here is the list of modes:
 mode 0: Dummy Read strobe generated by FlexSPI Controller and loopback internally
 mode 1: Dummy Read strobe generated by FlexSPI Controller and loopback from DQS pad
 mode 2: Reserved
 mode 3: Flash provided Read strobe and input from DQS pad
By default, flexspi uses mode 0 after reset. For DTR mode, flexspi only supports the 8D-8D-8D mode. For 8D-8D-8D mode, the IC team suggests using mode 3, otherwise reads always return incorrect data. For DTR mode, flexspi will automatically divide the root clock by 2 and output it to the device. The formula is:
 device_clock = root_clock / (is_dtr ? 2 : 1)
So correct the clock rate setting for DTR mode to get the maximum performance. 
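Worked example of the formula (numbers are illustrative): for an 8D-8D-8D op with max_freq = 100MHz, the driver must request a 200MHz root clock so that

	device_clock = root_clock / 2 = 200MHz / 2 = 100MHz

while a plain STR op at 100MHz keeps root_clock = 100MHz (divider of 1).
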
Signed-off-by: Haibo Chen Reviewed-by: Frank Li Link: https://patch.msgid.link/20250917-flexspi-ddr-v2-4-bb9fe2a01889@nxp.com Signed-off-by: Mark Brown --- drivers/spi/spi-nxp-fspi.c | 56 ++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index bd61f951d6befd..d25679fafad7a9 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -399,7 +399,8 @@ struct nxp_fspi { struct mutex lock; struct pm_qos_request pm_qos_req; int selected; -#define FSPI_NEED_INIT (1 << 0) +#define FSPI_NEED_INIT BIT(0) +#define FSPI_DTR_MODE BIT(1) int flags; }; @@ -655,6 +656,40 @@ static void nxp_fspi_clk_disable_unprep(struct nxp_fspi *f) return; } +/* + * Sample Clock source selection for Flash Reading + * Four modes defined by fspi: + * mode 0: Dummy Read strobe generated by FlexSPI Controller + * and loopback internally + * mode 1: Dummy Read strobe generated by FlexSPI Controller + * and loopback from DQS pad + * mode 2: Reserved + * mode 3: Flash provided Read strobe and input from DQS pad + * + * fspi default use mode 0 after reset + */ +static void nxp_fspi_select_rx_sample_clk_source(struct nxp_fspi *f, + bool op_is_dtr) +{ + u32 reg; + + /* + * For 8D-8D-8D mode, need to use mode 3 (Flash provided Read + * strobe and input from DQS pad), otherwise read operaton may + * meet issue. + * This mode require flash device connect the DQS pad on board. + * For other modes, still use mode 0, keep align with before. + * spi_nor_suspend will disable 8D-8D-8D mode, also need to + * change the mode back to mode 0. + */ + reg = fspi_readl(f, f->iobase + FSPI_MCR0); + if (op_is_dtr) + reg |= FSPI_MCR0_RXCLKSRC(3); + else /*select mode 0 */ + reg &= ~FSPI_MCR0_RXCLKSRC(3); + fspi_writel(f, reg, f->iobase + FSPI_MCR0); +} + static void nxp_fspi_dll_calibration(struct nxp_fspi *f) { int ret; @@ -736,15 +771,18 @@ static void nxp_fspi_dll_override(struct nxp_fspi *f) static void nxp_fspi_select_mem(struct nxp_fspi *f, struct spi_device *spi, const struct spi_mem_op *op) { + /* flexspi only support one DTR mode: 8D-8D-8D */ + bool op_is_dtr = op->cmd.dtr && op->addr.dtr && op->dummy.dtr && op->data.dtr; unsigned long rate = op->max_freq; int ret; uint64_t size_kb; /* * Return, if previously selected target device is same as current - * requested target device. + * requested target device. Also the DTR or STR mode do not change. */ - if (f->selected == spi_get_chipselect(spi, 0)) + if ((f->selected == spi_get_chipselect(spi, 0)) && + (!!(f->flags & FSPI_DTR_MODE) == op_is_dtr)) return; /* Reset FLSHxxCR0 registers */ @@ -761,6 +799,18 @@ static void nxp_fspi_select_mem(struct nxp_fspi *f, struct spi_device *spi, dev_dbg(f->dev, "Target device [CS:%x] selected\n", spi_get_chipselect(spi, 0)); + nxp_fspi_select_rx_sample_clk_source(f, op_is_dtr); + + if (op_is_dtr) { + f->flags |= FSPI_DTR_MODE; + /* For DTR mode, flexspi will default div 2 and output to device. + * so here to config the root clock to 2 * device rate. + */ + rate = rate * 2; + } else { + f->flags &= ~FSPI_DTR_MODE; + } + nxp_fspi_clk_disable_unprep(f); ret = clk_set_rate(f->clk, rate); From 0f67557763accbdd56681f17ed5350735198c57b Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Wed, 17 Sep 2025 15:27:10 +0800 Subject: [PATCH 2198/3206] spi: spi-nxp-fspi: Add OCT-DTR mode support Add OCT-DTR mode support by default. Since flexspi does not support swapping bytes on a 16 bit boundary in OCT-DTR mode, mark swap16 as false. 
lx2160a does not support DQS, so add a quirk to disable DTR mode for this platform. Signed-off-by: Haibo Chen Reviewed-by: Frank Li Link: https://patch.msgid.link/20250917-flexspi-ddr-v2-5-bb9fe2a01889@nxp.com Signed-off-by: Mark Brown --- drivers/spi/spi-nxp-fspi.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index d25679fafad7a9..f9371f98a65bdc 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -330,6 +330,8 @@ /* Access flash memory using IP bus only */ #define FSPI_QUIRK_USE_IP_ONLY BIT(0) +/* Disable DTR */ +#define FSPI_QUIRK_DISABLE_DTR BIT(1) struct nxp_fspi_devtype_data { unsigned int rxfifo; @@ -344,7 +346,7 @@ static struct nxp_fspi_devtype_data lx2160a_data = { .rxfifo = SZ_512, /* (64 * 64 bits) */ .txfifo = SZ_1K, /* (128 * 64 bits) */ .ahb_buf_size = SZ_2K, /* (256 * 64 bits) */ - .quirks = 0, + .quirks = FSPI_QUIRK_DISABLE_DTR, .lut_num = 32, .little_endian = true, /* little-endian */ }; @@ -1231,6 +1233,13 @@ static const struct spi_controller_mem_ops nxp_fspi_mem_ops = { }; static const struct spi_controller_mem_caps nxp_fspi_mem_caps = { + .dtr = true, + .swap16 = false, + .per_op_freq = true, +}; + +static const struct spi_controller_mem_caps nxp_fspi_mem_caps_disable_dtr = { + .dtr = false, .per_op_freq = true, }; @@ -1346,7 +1355,12 @@ static int nxp_fspi_probe(struct platform_device *pdev) ctlr->bus_num = -1; ctlr->num_chipselect = NXP_FSPI_MAX_CHIPSELECT; ctlr->mem_ops = &nxp_fspi_mem_ops; - ctlr->mem_caps = &nxp_fspi_mem_caps; + + if (f->devtype_data->quirks & FSPI_QUIRK_DISABLE_DTR) + ctlr->mem_caps = &nxp_fspi_mem_caps_disable_dtr; + else + ctlr->mem_caps = &nxp_fspi_mem_caps; + ctlr->dev.of_node = np; ret = devm_add_action_or_reset(dev, nxp_fspi_cleanup, f); From a24802b0a2a238eaa610b0b0e87a4500a35de64a Mon Sep 17 00:00:00 2001 From: Gabor Juhos Date: Tue, 16 Sep 2025 19:43:46 +0200 Subject: [PATCH 2199/3206] spi: spi-qpic-snand: simplify clock handling by using devm_clk_get_enabled() The devm_clk_get_enabled() function prepares and enables the particular clock, which then automatically gets disabled and unprepared on probe failure and on device removal. Use that function instead of devm_clk_get() and remove the clk_prepare_enable()/clk_disable_unprepare() calls in order to simplify the code. This also ensures that the clocks are handled in the correct order during device removal. 
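The resulting call pattern, sketched with an illustrative clock name:

	clk = devm_clk_get_enabled(dev, "core");
	if (IS_ERR(clk))
		return PTR_ERR(clk);
	/*
	 * clk is prepared and enabled at this point; devres disables,
	 * unprepares and puts it automatically, in reverse order of
	 * acquisition, on probe failure or device removal.
	 */

which is what guarantees the correct tear-down order on removal.
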
Signed-off-by: Gabor Juhos Link: https://patch.msgid.link/20250916-qpic-snand-devm_clk_get_enabled-v1-1-09953493b7f1@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-qpic-snand.c | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/drivers/spi/spi-qpic-snand.c b/drivers/spi/spi-qpic-snand.c index 28755dbce399d8..58ceea1ea8fb4e 100644 --- a/drivers/spi/spi-qpic-snand.c +++ b/drivers/spi/spi-qpic-snand.c @@ -1542,15 +1542,15 @@ static int qcom_spi_probe(struct platform_device *pdev) snandc->props = dev_data; - snandc->core_clk = devm_clk_get(dev, "core"); + snandc->core_clk = devm_clk_get_enabled(dev, "core"); if (IS_ERR(snandc->core_clk)) return PTR_ERR(snandc->core_clk); - snandc->aon_clk = devm_clk_get(dev, "aon"); + snandc->aon_clk = devm_clk_get_enabled(dev, "aon"); if (IS_ERR(snandc->aon_clk)) return PTR_ERR(snandc->aon_clk); - snandc->qspi->iomacro_clk = devm_clk_get(dev, "iom"); + snandc->qspi->iomacro_clk = devm_clk_get_enabled(dev, "iom"); if (IS_ERR(snandc->qspi->iomacro_clk)) return PTR_ERR(snandc->qspi->iomacro_clk); @@ -1564,18 +1564,6 @@ static int qcom_spi_probe(struct platform_device *pdev) if (dma_mapping_error(dev, snandc->base_dma)) return -ENXIO; - ret = clk_prepare_enable(snandc->core_clk); - if (ret) - goto err_dis_core_clk; - - ret = clk_prepare_enable(snandc->aon_clk); - if (ret) - goto err_dis_aon_clk; - - ret = clk_prepare_enable(snandc->qspi->iomacro_clk); - if (ret) - goto err_dis_iom_clk; - ret = qcom_nandc_alloc(snandc); if (ret) goto err_snand_alloc; @@ -1616,12 +1604,6 @@ static int qcom_spi_probe(struct platform_device *pdev) err_spi_init: qcom_nandc_unalloc(snandc); err_snand_alloc: - clk_disable_unprepare(snandc->qspi->iomacro_clk); -err_dis_iom_clk: - clk_disable_unprepare(snandc->aon_clk); -err_dis_aon_clk: - clk_disable_unprepare(snandc->core_clk); -err_dis_core_clk: dma_unmap_resource(dev, res->start, resource_size(res), DMA_BIDIRECTIONAL, 0); return ret; @@ -1636,11 +1618,6 @@ static void qcom_spi_remove(struct platform_device *pdev) spi_unregister_controller(ctlr); nand_ecc_unregister_on_host_hw_engine(&snandc->qspi->ecc_eng); qcom_nandc_unalloc(snandc); - - clk_disable_unprepare(snandc->aon_clk); - clk_disable_unprepare(snandc->core_clk); - clk_disable_unprepare(snandc->qspi->iomacro_clk); - dma_unmap_resource(&pdev->dev, snandc->base_dma, resource_size(res), DMA_BIDIRECTIONAL, 0); } From c24928ac69be2390cdf456d126b464af079c57ef Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:46 +0300 Subject: [PATCH 2200/3206] mfd: max77705: max77705_charger: move active discharge setting to mfd parent Active discharge setting is a part of MFD top level i2c device, hence cannot be controlled by charger. Writing to MAX77705_PMIC_REG_MAINCTRL1 register from charger driver is a mistake. Move active discharge setting to MFD parent driver. 
Fixes: a6a494c8e3ce ("power: supply: max77705: Add charger driver for Maxim 77705") Signed-off-by: Dzmitry Sankouski Acked-by: Lee Jones Signed-off-by: Sebastian Reichel --- drivers/mfd/max77705.c | 3 +++ drivers/power/supply/max77705_charger.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/max77705.c b/drivers/mfd/max77705.c index 6b263bacb8c28d..ff07d0e0d5f8ee 100644 --- a/drivers/mfd/max77705.c +++ b/drivers/mfd/max77705.c @@ -108,6 +108,9 @@ static int max77705_i2c_probe(struct i2c_client *i2c) if (pmic_rev != MAX77705_PASS3) return dev_err_probe(dev, -ENODEV, "Rev.0x%x is not tested\n", pmic_rev); + /* Active Discharge Enable */ + regmap_update_bits(max77705->regmap, MAX77705_PMIC_REG_MAINCTRL1, 1, 1); + ret = devm_regmap_add_irq_chip(dev, max77705->regmap, i2c->irq, IRQF_ONESHOT | IRQF_SHARED, 0, diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index 329b430d0e5065..3b75c82b9b9ead 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -487,9 +487,6 @@ static void max77705_charger_initialize(struct max77705_charger_data *chg) regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_00, MAX77705_WDTEN_MASK, 0); - /* Active Discharge Enable */ - regmap_update_bits(regmap, MAX77705_PMIC_REG_MAINCTRL1, 1, 1); - /* VBYPSET=5.0V */ regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_11, MAX77705_VBYPSET_MASK, 0); From 251090e2c2c1be60607d1c521af2c993f04d4f61 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 18 Sep 2025 12:30:32 -0300 Subject: [PATCH 2201/3206] smb: client: fix file open check in __cifs_unlink() Fix the file open check to decide whether or not silly-rename the file in SMB2+. Fixes: c5ea3065586d ("smb: client: fix data loss due to broken rename(2)") Signed-off-by: Paulo Alcantara (Red Hat) Cc: Frank Sorenson Reviewed-by: David Howells Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/inode.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 1703f1285d36d2..0f0d2dae6283ad 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2003,8 +2003,21 @@ static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyren goto psx_del_no_retry; } - if (sillyrename || (server->vals->protocol_id > SMB10_PROT_ID && - d_is_positive(dentry) && d_count(dentry) > 2)) + /* For SMB2+, if the file is open, we always perform a silly rename. + * + * We check for d_count() right after calling + * cifs_close_deferred_file_under_dentry() to make sure that the + * dentry's refcount gets dropped in case the file had any deferred + * close. 
+ */ + if (!sillyrename && server->vals->protocol_id > SMB10_PROT_ID) { + spin_lock(&dentry->d_lock); + if (d_count(dentry) > 1) + sillyrename = true; + spin_unlock(&dentry->d_lock); + } + + if (sillyrename) rc = -EBUSY; else rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); From daac51c7032036a0ca5f1aa419ad1b0471d1c6e0 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 18 Sep 2025 03:06:46 +0200 Subject: [PATCH 2202/3206] smb: client: fix smbdirect_recv_io leak in smbd_negotiate() error path During tests of another unrelated patch I was able to trigger this error: Objects remaining on __kmem_cache_shutdown() Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Fixes: f198186aa9bb ("CIFS: SMBD: Establish SMB Direct connection") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 1f3d6414586361..e0fce5033004c7 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1108,8 +1108,10 @@ static int smbd_negotiate(struct smbd_connection *info) log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", rc, response->sge.addr, response->sge.length, response->sge.lkey); - if (rc) + if (rc) { + put_receive_buffer(info, response); return rc; + } init_completion(&info->negotiate_completion); info->negotiate_done = false; From c2a60426e94a56e5329f6c2681c251281f63ab24 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Sep 2025 17:34:08 +0200 Subject: [PATCH 2203/3206] soc: fsl: qmc: Only set completion interrupt when needed When no post-completion processing is expected, don't waste time handling useless interrupts. Only set QMC_BD_[R/T]X_I when a completion function is passed in, and perform seamless completion on submit for interruptless buffers. Acked-by: Herve Codina Signed-off-by: Christophe Leroy Link: https://patch.msgid.link/40b41b53a26e77a50b3a5f68fcecc6f9a40a84b4.1758209158.git.christophe.leroy@csgroup.eu Signed-off-by: Mark Brown --- drivers/soc/fsl/qe/qmc.c | 44 ++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/drivers/soc/fsl/qe/qmc.c b/drivers/soc/fsl/qe/qmc.c index 36c0ccc06151f3..da5ea6d3561840 100644 --- a/drivers/soc/fsl/qe/qmc.c +++ b/drivers/soc/fsl/qe/qmc.c @@ -461,9 +461,16 @@ int qmc_chan_write_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length, ctrl = qmc_read16(&bd->cbd_sc); if (ctrl & (QMC_BD_TX_R | QMC_BD_TX_UB)) { - /* We are full ... */ - ret = -EBUSY; - goto end; + if (!(ctrl & (QMC_BD_TX_R | QMC_BD_TX_I)) && bd == chan->txbd_done) { + if (ctrl & QMC_BD_TX_W) + chan->txbd_done = chan->txbds; + else + chan->txbd_done++; + } else { + /* We are full ... */ + ret = -EBUSY; + goto end; + } } qmc_write16(&bd->cbd_datlen, length); @@ -475,6 +482,10 @@ int qmc_chan_write_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length, /* Activate the descriptor */ ctrl |= (QMC_BD_TX_R | QMC_BD_TX_UB); + if (complete) + ctrl |= QMC_BD_TX_I; + else + ctrl &= ~QMC_BD_TX_I; wmb(); /* Be sure to flush the descriptor before control update */ qmc_write16(&bd->cbd_sc, ctrl); @@ -569,9 +580,16 @@ int qmc_chan_read_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length, ctrl = qmc_read16(&bd->cbd_sc); if (ctrl & (QMC_BD_RX_E | QMC_BD_RX_UB)) { - /* We are full ... 
*/ - ret = -EBUSY; - goto end; + if (!(ctrl & (QMC_BD_RX_E | QMC_BD_RX_I)) && bd == chan->rxbd_done) { + if (ctrl & QMC_BD_RX_W) + chan->rxbd_done = chan->rxbds; + else + chan->rxbd_done++; + } else { + /* We are full ... */ + ret = -EBUSY; + goto end; + } } qmc_write16(&bd->cbd_datlen, 0); /* data length is updated by the QMC */ @@ -587,6 +605,10 @@ int qmc_chan_read_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length, /* Activate the descriptor */ ctrl |= (QMC_BD_RX_E | QMC_BD_RX_UB); + if (complete) + ctrl |= QMC_BD_RX_I; + else + ctrl &= ~QMC_BD_RX_I; wmb(); /* Be sure to flush data before descriptor activation */ qmc_write16(&bd->cbd_sc, ctrl); @@ -1482,19 +1504,19 @@ static int qmc_setup_chan(struct qmc *qmc, struct qmc_chan *chan) /* Init Rx BDs and set Wrap bit on last descriptor */ BUILD_BUG_ON(QMC_NB_RXBDS == 0); - val = QMC_BD_RX_I; for (i = 0; i < QMC_NB_RXBDS; i++) { bd = chan->rxbds + i; - qmc_write16(&bd->cbd_sc, val); + qmc_write16(&bd->cbd_sc, 0); } bd = chan->rxbds + QMC_NB_RXBDS - 1; - qmc_write16(&bd->cbd_sc, val | QMC_BD_RX_W); + qmc_write16(&bd->cbd_sc, QMC_BD_RX_W); /* Init Tx BDs and set Wrap bit on last descriptor */ BUILD_BUG_ON(QMC_NB_TXBDS == 0); - val = QMC_BD_TX_I; if (chan->mode == QMC_HDLC) - val |= QMC_BD_TX_L | QMC_BD_TX_TC; + val = QMC_BD_TX_L | QMC_BD_TX_TC; + else + val = 0; for (i = 0; i < QMC_NB_TXBDS; i++) { bd = chan->txbds + i; qmc_write16(&bd->cbd_sc, val); From fb418fe26d28378700bddc16f5fa3362dda86d1b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Sep 2025 17:34:09 +0200 Subject: [PATCH 2204/3206] ASoC: fsl: fsl_qmc_audio: Ensure audio channels are ordered in TDM bus To reduce complexity of interrupt handling in following patch, ensure audio channels are configured in the same order as timeslots on the TDM bus. If we need a given ordering of audio sources in the audio frame, it is possible to re-order codecs on the TDM bus, no need to mix up timeslots in channels. 
Acked-by: Herve Codina Signed-off-by: Christophe Leroy Link: https://patch.msgid.link/4ff40afdf3d032b05dd4af6c0f777d4d4b445a76.1758209158.git.christophe.leroy@csgroup.eu Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_qmc_audio.c | 29 +++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/sound/soc/fsl/fsl_qmc_audio.c b/sound/soc/fsl/fsl_qmc_audio.c index 5614a8b909edf8..c0c7ef0a151190 100644 --- a/sound/soc/fsl/fsl_qmc_audio.c +++ b/sound/soc/fsl/fsl_qmc_audio.c @@ -791,12 +791,17 @@ static int qmc_audio_dai_parse(struct qmc_audio *qmc_audio, struct device_node * struct qmc_dai *qmc_dai, struct snd_soc_dai_driver *qmc_soc_dai_driver) { + struct qmc_chan_ts_info ts_info; struct qmc_chan_info info; unsigned long rx_fs_rate; unsigned long tx_fs_rate; + int prev_last_rx_ts = 0; + int prev_last_tx_ts = 0; unsigned int nb_tx_ts; unsigned int nb_rx_ts; unsigned int i; + int last_rx_ts; + int last_tx_ts; int count; u32 val; int ret; @@ -879,6 +884,30 @@ static int qmc_audio_dai_parse(struct qmc_audio *qmc_audio, struct device_node * return -EINVAL; } } + + ret = qmc_chan_get_ts_info(qmc_dai->chans[i].qmc_chan, &ts_info); + if (ret) { + dev_err(qmc_audio->dev, "dai %d get QMC %d channel TS info failed %d\n", + qmc_dai->id, i, ret); + return ret; + } + + last_rx_ts = fls64(ts_info.rx_ts_mask); + last_tx_ts = fls64(ts_info.tx_ts_mask); + + if (prev_last_rx_ts > last_rx_ts) { + dev_err(qmc_audio->dev, "dai %d QMC chan %d unordered channels (RX timeslot %d before %d)\n", + qmc_dai->id, i, prev_last_rx_ts, last_rx_ts); + return -EINVAL; + } + if (prev_last_tx_ts > last_tx_ts) { + dev_err(qmc_audio->dev, "dai %d QMC chan %d unordered channels (TX timeslot %d before %d)\n", + qmc_dai->id, i, prev_last_tx_ts, last_tx_ts); + return -EINVAL; + } + + prev_last_rx_ts = last_rx_ts; + prev_last_tx_ts = last_tx_ts; } qmc_dai->nb_chans_avail = count; From 4c5f8c25561f36407cb137d4c350651820068148 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Sep 2025 17:34:10 +0200 Subject: [PATCH 2205/3206] ASoC: fsl: fsl_qmc_audio: Only request completion on last channel In non-interleaved mode, several QMC channels are used in sync. More details can be found in commit 188d9cae5438 ("ASoC: fsl: fsl_qmc_audio: Add support for non-interleaved mode.") Currently, an interrupt is requested on each channel to perform capture/playback completion, although the completion is really performed only once all channels have completed their work. This leads to a lot more interrupts than really needed. Looking at /proc/interrupts shows ~3800 interrupts per second when using 4 capture and 4 playback devices with 5ms periods while only 1600 (200 x 4 + 200 x 4) periods are processed during one second. The QMC channels work in sync, the one started first is the one finishing first and the one started last is the one finishing last, so when the last one finishes it is guaranteed that the other ones are finished as well. Therefore only request completion processing on the last QMC channel. On my board with the above example, on a kernel started with 'threadirqs' option, the QMC irq thread uses 16% CPU time with this patch while it uses 26% CPU time without this patch. 
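Back-of-the-envelope check using the numbers above: 4 + 4 streams x 200 periods/s = 1600 completions/s, and ~3800 / 1600 is roughly 2.4 interrupts per completion, consistent with streams spanning more than one QMC channel on average (the exact per-stream channel count is board-specific and not stated here). With the interrupt raised only on the last channel of each stream, the rate drops back to one interrupt per period.
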
Acked-by: Herve Codina Signed-off-by: Christophe Leroy Link: https://patch.msgid.link/bbd5167d190bbb45c3a4cd6ef2dece8817e0cc1e.1758209158.git.christophe.leroy@csgroup.eu Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_qmc_audio.c | 46 +++++------------------------------ 1 file changed, 6 insertions(+), 40 deletions(-) diff --git a/sound/soc/fsl/fsl_qmc_audio.c b/sound/soc/fsl/fsl_qmc_audio.c index c0c7ef0a151190..2790953543c54f 100644 --- a/sound/soc/fsl/fsl_qmc_audio.c +++ b/sound/soc/fsl/fsl_qmc_audio.c @@ -57,7 +57,6 @@ struct qmc_dai_prtd { size_t ch_dma_offset; unsigned int channels; - DECLARE_BITMAP(chans_pending, 64); struct snd_pcm_substream *substream; }; @@ -126,17 +125,14 @@ static int qmc_audio_pcm_write_submit(struct qmc_dai_prtd *prtd) int ret; for (i = 0; i < prtd->channels; i++) { - bitmap_set(prtd->chans_pending, i, 1); - ret = qmc_chan_write_submit(prtd->qmc_dai->chans[i].qmc_chan, prtd->ch_dma_addr_current + i * prtd->ch_dma_offset, prtd->ch_dma_size, - qmc_audio_pcm_write_complete, - &prtd->qmc_dai->chans[i]); + i == prtd->channels - 1 ? qmc_audio_pcm_write_complete : + NULL, prtd); if (ret) { dev_err(prtd->qmc_dai->dev, "write_submit %u failed %d\n", i, ret); - bitmap_clear(prtd->chans_pending, i, 1); return ret; } } @@ -146,20 +142,7 @@ static int qmc_audio_pcm_write_submit(struct qmc_dai_prtd *prtd) static void qmc_audio_pcm_write_complete(void *context) { - struct qmc_dai_chan *chan = context; - struct qmc_dai_prtd *prtd; - - prtd = chan->prtd_tx; - - /* Mark the current channel as completed */ - bitmap_clear(prtd->chans_pending, chan - prtd->qmc_dai->chans, 1); - - /* - * All QMC channels involved must have completed their transfer before - * submitting a new one. - */ - if (!bitmap_empty(prtd->chans_pending, 64)) - return; + struct qmc_dai_prtd *prtd = context; prtd->buffer_ended += prtd->period_size; if (prtd->buffer_ended >= prtd->buffer_size) @@ -182,17 +165,14 @@ static int qmc_audio_pcm_read_submit(struct qmc_dai_prtd *prtd) int ret; for (i = 0; i < prtd->channels; i++) { - bitmap_set(prtd->chans_pending, i, 1); - ret = qmc_chan_read_submit(prtd->qmc_dai->chans[i].qmc_chan, prtd->ch_dma_addr_current + i * prtd->ch_dma_offset, prtd->ch_dma_size, - qmc_audio_pcm_read_complete, - &prtd->qmc_dai->chans[i]); + i == prtd->channels - 1 ? qmc_audio_pcm_read_complete : + NULL, prtd); if (ret) { dev_err(prtd->qmc_dai->dev, "read_submit %u failed %d\n", i, ret); - bitmap_clear(prtd->chans_pending, i, 1); return ret; } } @@ -202,26 +182,13 @@ static int qmc_audio_pcm_read_submit(struct qmc_dai_prtd *prtd) static void qmc_audio_pcm_read_complete(void *context, size_t length, unsigned int flags) { - struct qmc_dai_chan *chan = context; - struct qmc_dai_prtd *prtd; - - prtd = chan->prtd_rx; - - /* Mark the current channel as completed */ - bitmap_clear(prtd->chans_pending, chan - prtd->qmc_dai->chans, 1); + struct qmc_dai_prtd *prtd = context; if (length != prtd->ch_dma_size) { dev_err(prtd->qmc_dai->dev, "read complete length = %zu, exp %zu\n", length, prtd->ch_dma_size); } - /* - * All QMC channels involved must have completed their transfer before - * submitting a new one. 
- */ - if (!bitmap_empty(prtd->chans_pending, 64)) - return; - prtd->buffer_ended += prtd->period_size; if (prtd->buffer_ended >= prtd->buffer_size) prtd->buffer_ended = 0; @@ -249,7 +216,6 @@ static int qmc_audio_pcm_trigger(struct snd_soc_component *component, switch (cmd) { case SNDRV_PCM_TRIGGER_START: - bitmap_zero(prtd->chans_pending, 64); prtd->buffer_ended = 0; prtd->ch_dma_addr_current = prtd->ch_dma_addr_start; From 2c618f361ae6b9da7fafafc289051728ef4c6ea3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Sep 2025 17:34:11 +0200 Subject: [PATCH 2206/3206] ASoC: fsl: fsl_qmc_audio: Drop struct qmc_dai_chan prtd_tx and prtd_rx members are not used anymore and only qmc_chan member remains so struct qmc_dai_chan has become pointless. Use qmc_chan directly and drop struct qmc_dai_chan. Acked-by: Herve Codina Signed-off-by: Christophe Leroy Link: https://patch.msgid.link/9c729bbd9f1b61120a09a87fb76176ef344c5153.1758209158.git.christophe.leroy@csgroup.eu Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_qmc_audio.c | 52 ++++++++++++++--------------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/sound/soc/fsl/fsl_qmc_audio.c b/sound/soc/fsl/fsl_qmc_audio.c index 2790953543c54f..3de448ef724cb3 100644 --- a/sound/soc/fsl/fsl_qmc_audio.c +++ b/sound/soc/fsl/fsl_qmc_audio.c @@ -17,12 +17,6 @@ #include #include -struct qmc_dai_chan { - struct qmc_dai_prtd *prtd_tx; - struct qmc_dai_prtd *prtd_rx; - struct qmc_chan *qmc_chan; -}; - struct qmc_dai { char *name; int id; @@ -33,7 +27,7 @@ struct qmc_dai { unsigned int nb_chans_avail; unsigned int nb_chans_used_tx; unsigned int nb_chans_used_rx; - struct qmc_dai_chan *chans; + struct qmc_chan **qmc_chans; }; struct qmc_audio { @@ -125,7 +119,7 @@ static int qmc_audio_pcm_write_submit(struct qmc_dai_prtd *prtd) int ret; for (i = 0; i < prtd->channels; i++) { - ret = qmc_chan_write_submit(prtd->qmc_dai->chans[i].qmc_chan, + ret = qmc_chan_write_submit(prtd->qmc_dai->qmc_chans[i], prtd->ch_dma_addr_current + i * prtd->ch_dma_offset, prtd->ch_dma_size, i == prtd->channels - 1 ? qmc_audio_pcm_write_complete : @@ -165,7 +159,7 @@ static int qmc_audio_pcm_read_submit(struct qmc_dai_prtd *prtd) int ret; for (i = 0; i < prtd->channels; i++) { - ret = qmc_chan_read_submit(prtd->qmc_dai->chans[i].qmc_chan, + ret = qmc_chan_read_submit(prtd->qmc_dai->qmc_chans[i], prtd->ch_dma_addr_current + i * prtd->ch_dma_offset, prtd->ch_dma_size, i == prtd->channels - 1 ? qmc_audio_pcm_read_complete : @@ -206,7 +200,6 @@ static int qmc_audio_pcm_trigger(struct snd_soc_component *component, struct snd_pcm_substream *substream, int cmd) { struct qmc_dai_prtd *prtd = substream->runtime->private_data; - unsigned int i; int ret; if (!prtd->qmc_dai) { @@ -220,9 +213,6 @@ static int qmc_audio_pcm_trigger(struct snd_soc_component *component, prtd->ch_dma_addr_current = prtd->ch_dma_addr_start; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { - for (i = 0; i < prtd->channels; i++) - prtd->qmc_dai->chans[i].prtd_tx = prtd; - /* Submit first chunk ... */ ret = qmc_audio_pcm_write_submit(prtd); if (ret) @@ -238,9 +228,6 @@ static int qmc_audio_pcm_trigger(struct snd_soc_component *component, if (ret) return ret; } else { - for (i = 0; i < prtd->channels; i++) - prtd->qmc_dai->chans[i].prtd_rx = prtd; - /* Submit first chunk ... 
*/ ret = qmc_audio_pcm_read_submit(prtd); if (ret) @@ -610,9 +597,9 @@ static int qmc_dai_hw_params(struct snd_pcm_substream *substream, chan_param.mode = QMC_TRANSPARENT; chan_param.transp.max_rx_buf_size = params_period_bytes(params) / nb_chans_used; for (i = 0; i < nb_chans_used; i++) { - ret = qmc_chan_set_param(qmc_dai->chans[i].qmc_chan, &chan_param); + ret = qmc_chan_set_param(qmc_dai->qmc_chans[i], &chan_param); if (ret) { - dev_err(dai->dev, "chans[%u], set param failed %d\n", + dev_err(dai->dev, "qmc_chans[%u], set param failed %d\n", i, ret); return ret; } @@ -654,7 +641,7 @@ static int qmc_dai_trigger(struct snd_pcm_substream *substream, int cmd, case SNDRV_PCM_TRIGGER_RESUME: case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: for (i = 0; i < nb_chans_used; i++) { - ret = qmc_chan_start(qmc_dai->chans[i].qmc_chan, direction); + ret = qmc_chan_start(qmc_dai->qmc_chans[i], direction); if (ret) goto err_stop; } @@ -663,13 +650,13 @@ static int qmc_dai_trigger(struct snd_pcm_substream *substream, int cmd, case SNDRV_PCM_TRIGGER_STOP: /* Stop and reset all QMC channels and return the first error encountered */ for (i = 0; i < nb_chans_used; i++) { - ret_tmp = qmc_chan_stop(qmc_dai->chans[i].qmc_chan, direction); + ret_tmp = qmc_chan_stop(qmc_dai->qmc_chans[i], direction); if (!ret) ret = ret_tmp; if (ret_tmp) continue; - ret_tmp = qmc_chan_reset(qmc_dai->chans[i].qmc_chan, direction); + ret_tmp = qmc_chan_reset(qmc_dai->qmc_chans[i], direction); if (!ret) ret = ret_tmp; } @@ -681,7 +668,7 @@ static int qmc_dai_trigger(struct snd_pcm_substream *substream, int cmd, case SNDRV_PCM_TRIGGER_PAUSE_PUSH: /* Stop all QMC channels and return the first error encountered */ for (i = 0; i < nb_chans_used; i++) { - ret_tmp = qmc_chan_stop(qmc_dai->chans[i].qmc_chan, direction); + ret_tmp = qmc_chan_stop(qmc_dai->qmc_chans[i], direction); if (!ret) ret = ret_tmp; } @@ -697,8 +684,8 @@ static int qmc_dai_trigger(struct snd_pcm_substream *substream, int cmd, err_stop: while (i--) { - qmc_chan_stop(qmc_dai->chans[i].qmc_chan, direction); - qmc_chan_reset(qmc_dai->chans[i].qmc_chan, direction); + qmc_chan_stop(qmc_dai->qmc_chans[i], direction); + qmc_chan_reset(qmc_dai->qmc_chans[i], direction); } return ret; } @@ -794,19 +781,20 @@ static int qmc_audio_dai_parse(struct qmc_audio *qmc_audio, struct device_node * return dev_err_probe(qmc_audio->dev, -EINVAL, "dai %d no QMC channel defined\n", qmc_dai->id); - qmc_dai->chans = devm_kcalloc(qmc_audio->dev, count, sizeof(*qmc_dai->chans), GFP_KERNEL); - if (!qmc_dai->chans) + qmc_dai->qmc_chans = devm_kcalloc(qmc_audio->dev, count, sizeof(*qmc_dai->qmc_chans), + GFP_KERNEL); + if (!qmc_dai->qmc_chans) return -ENOMEM; for (i = 0; i < count; i++) { - qmc_dai->chans[i].qmc_chan = devm_qmc_chan_get_byphandles_index(qmc_audio->dev, np, - "fsl,qmc-chan", i); - if (IS_ERR(qmc_dai->chans[i].qmc_chan)) { - return dev_err_probe(qmc_audio->dev, PTR_ERR(qmc_dai->chans[i].qmc_chan), + qmc_dai->qmc_chans[i] = devm_qmc_chan_get_byphandles_index(qmc_audio->dev, np, + "fsl,qmc-chan", i); + if (IS_ERR(qmc_dai->qmc_chans[i])) { + return dev_err_probe(qmc_audio->dev, PTR_ERR(qmc_dai->qmc_chans[i]), "dai %d get QMC channel %d failed\n", qmc_dai->id, i); } - ret = qmc_chan_get_info(qmc_dai->chans[i].qmc_chan, &info); + ret = qmc_chan_get_info(qmc_dai->qmc_chans[i], &info); if (ret) { dev_err(qmc_audio->dev, "dai %d get QMC %d channel info failed %d\n", qmc_dai->id, i, ret); @@ -851,7 +839,7 @@ static int qmc_audio_dai_parse(struct qmc_audio *qmc_audio, struct device_node * } } - ret = 
qmc_chan_get_ts_info(qmc_dai->chans[i].qmc_chan, &ts_info); + ret = qmc_chan_get_ts_info(qmc_dai->qmc_chans[i], &ts_info); if (ret) { dev_err(qmc_audio->dev, "dai %d get QMC %d channel TS info failed %d\n", qmc_dai->id, i, ret); From 1512231b6cc860ffbfbd85b295449dfb6977d357 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 17 Sep 2025 03:27:54 +0000 Subject: [PATCH 2207/3206] bpf: Enforce RCU protection for KF_RCU_PROTECTED Currently, KF_RCU_PROTECTED only applies to iterator APIs and that too in a convoluted fashion: the presence of this flag on the kfunc is used to set MEM_RCU in iterator type, and the lack of RCU protection results in an error only later, once next() or destroy() methods are invoked on the iterator. While there is no bug, this is certainly a bit unintuitive, and makes the enforcement of the flag iterator specific. In the interest of making this flag useful for other upcoming kfuncs, e.g. scx_bpf_cpu_curr() [0][1], add enforcement for invoking the kfunc in an RCU critical section in general. This would also mean that iterator APIs using KF_RCU_PROTECTED will error out earlier, instead of throwing an error for lack of RCU CS protection when next() or destroy() methods are invoked. In addition to this, if the kfuncs tagged KF_RCU_PROTECTED return a pointer value, ensure that this pointer value is only usable in an RCU critical section. There might be edge cases where the return value is special and doesn't need to imply MEM_RCU semantics, but in general, the assumption should hold for the majority of kfuncs, and we can revisit things if necessary later. [0]: https://lore.kernel.org/all/20250903212311.369697-3-christian.loehle@arm.com [1]: https://lore.kernel.org/all/20250909195709.92669-1-arighi@nvidia.com Tested-by: Andrea Righi Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250917032755.4068726-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 19 ++++++++++++++++++- kernel/bpf/verifier.c | 10 ++++++++++ .../selftests/bpf/progs/cgroup_read_xattr.c | 2 +- .../selftests/bpf/progs/iters_task_failure.c | 4 ++-- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index ae468b781d3118..e38941370b90c9 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -335,9 +335,26 @@ consider doing refcnt != 0 check, especially when returning a KF_ACQUIRE pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should very likely also be KF_RET_NULL. +2.4.8 KF_RCU_PROTECTED flag +--------------------------- + +The KF_RCU_PROTECTED flag is used to indicate that the kfunc must be invoked in +an RCU critical section. This is assumed by default in non-sleepable programs, +and must be explicitly ensured by calling ``bpf_rcu_read_lock`` for sleepable +ones. + +If the kfunc returns a pointer value, this flag also enforces that the returned +pointer is RCU protected, and can only be used while the RCU critical section is +active. + +The flag is distinct from the ``KF_RCU`` flag, which only ensures that its +arguments are at least RCU protected pointers. This may transitively imply that +RCU protection is ensured, but it does not work in cases of kfuncs which require +RCU protection but do not take RCU protected arguments. + .. 
_KF_deprecated_flag: -2.4.8 KF_DEPRECATED flag ------------------------ The KF_DEPRECATED flag is used for kfuncs which are scheduled to be diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index beaa391e02fb09..6625570ac23de2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13931,6 +13931,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + if (is_kfunc_rcu_protected(&meta) && !in_rcu_cs(env)) { + verbose(env, "kernel func %s requires RCU critical section protection\n", func_name); + return -EACCES; + } + /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -14044,6 +14049,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Ensures we don't access the memory after a release_reference() */ if (meta.ref_obj_id) regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + + if (is_kfunc_rcu_protected(&meta)) + regs[BPF_REG_0].type |= MEM_RCU; } else { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].btf = desc_btf; @@ -14052,6 +14060,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) regs[BPF_REG_0].type |= PTR_UNTRUSTED; + else if (is_kfunc_rcu_protected(&meta)) + regs[BPF_REG_0].type |= MEM_RCU; if (is_iter_next_kfunc(&meta)) { struct bpf_reg_state *cur_iter; diff --git a/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c b/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c index 092db1d0435e66..88e13e17ec9e97 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c +++ b/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c @@ -73,7 +73,7 @@ int BPF_PROG(use_css_iter_non_sleepable) } SEC("lsm.s/socket_connect") -__failure __msg("expected an RCU CS") +__failure __msg("kernel func bpf_iter_css_new requires RCU critical section protection") int BPF_PROG(use_css_iter_sleepable_missing_rcu_lock) { u64 cgrp_id = bpf_get_current_cgroup_id(); diff --git a/tools/testing/selftests/bpf/progs/iters_task_failure.c b/tools/testing/selftests/bpf/progs/iters_task_failure.c index 6b1588d7065273..fe3663dedbe14d 100644 --- a/tools/testing/selftests/bpf/progs/iters_task_failure.c +++ b/tools/testing/selftests/bpf/progs/iters_task_failure.c @@ -15,7 +15,7 @@ void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") -__failure __msg("expected an RCU CS when using bpf_iter_task_next") +__failure __msg("kernel func bpf_iter_task_new requires RCU critical section protection") int BPF_PROG(iter_tasks_without_lock) { struct task_struct *pos; @@ -27,7 +27,7 @@ int BPF_PROG(iter_tasks_without_lock) } SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") -__failure __msg("expected an RCU CS when using bpf_iter_css_next") +__failure __msg("kernel func bpf_iter_css_new requires RCU critical section protection") int BPF_PROG(iter_css_without_lock) { u64 cg_id = bpf_get_current_cgroup_id(); From 8b788d663861271c10905b23195bc6ae862caad2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 17 Sep 2025 03:27:55 +0000 Subject: [PATCH 2208/3206] selftests/bpf: Add tests for KF_RCU_PROTECTED Add a couple of test cases to ensure RCU protection kicks in automatically, and the return type is as expected.
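(For contrast with the failure tests in the diff below, a hypothetical passing program — the name iter_ret_rcu_test_ok is made up — would wrap the call in bpf_rcu_read_lock()/bpf_rcu_read_unlock() and only dereference the returned MEM_RCU pointer inside the critical section:)

SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
int iter_ret_rcu_test_ok(const void *ctx)
{
	struct task_struct *p;
	int pid = 0;

	bpf_rcu_read_lock();
	p = bpf_kfunc_ret_rcu_test();	/* KF_RET_NULL | KF_RCU_PROTECTED */
	if (p)
		pid = p->pid;
	bpf_rcu_read_unlock();
	return pid;
}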
Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250917032755.4068726-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/iters_testmod.c | 46 +++++++++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod.c | 12 +++++ .../bpf/test_kmods/bpf_testmod_kfunc.h | 2 + 3 files changed, 60 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c index 9e4b45201e6927..5379e9960ffd2d 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod.c @@ -123,3 +123,49 @@ int iter_next_ptr_mem_not_trusted(const void *ctx) bpf_iter_num_destroy(&num_it); return 0; } + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("kernel func bpf_kfunc_ret_rcu_test requires RCU critical section protection") +int iter_ret_rcu_test_protected(const void *ctx) +{ + struct task_struct *p; + + p = bpf_kfunc_ret_rcu_test(); + return p->pid; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("R1 type=rcu_ptr_or_null_ expected=") +int iter_ret_rcu_test_type(const void *ctx) +{ + struct task_struct *p; + + bpf_rcu_read_lock(); + p = bpf_kfunc_ret_rcu_test(); + bpf_this_cpu_ptr(p); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("kernel func bpf_kfunc_ret_rcu_test_nostruct requires RCU critical section protection") +int iter_ret_rcu_test_protected_nostruct(const void *ctx) +{ + void *p; + + p = bpf_kfunc_ret_rcu_test_nostruct(4); + return *(int *)p; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("R1 type=rdonly_rcu_mem_or_null expected=") +int iter_ret_rcu_test_type_nostruct(const void *ctx) +{ + void *p; + + bpf_rcu_read_lock(); + p = bpf_kfunc_ret_rcu_test_nostruct(4); + bpf_this_cpu_ptr(p); + bpf_rcu_read_unlock(); + return 0; +} diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 2beb9b2fcbd876..d6ce51df9ed4a2 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -218,6 +218,16 @@ __bpf_kfunc void bpf_kfunc_rcu_task_test(struct task_struct *ptr) { } +__bpf_kfunc struct task_struct *bpf_kfunc_ret_rcu_test(void) +{ + return NULL; +} + +__bpf_kfunc int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) +{ + return NULL; +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -623,6 +633,8 @@ BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) +BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test, KF_RET_NULL | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test_nostruct, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 286e7faa47544e..4df6fa6a92cba7 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -158,6 +158,8 @@ void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) __ksym; void 
bpf_kfunc_trusted_task_test(struct task_struct *ptr) __ksym; void bpf_kfunc_trusted_num_test(int *ptr) __ksym; void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; +struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym; +int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym; int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; From d84510db8c1414b67167cdc452103c1f429588cc Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:47 +0300 Subject: [PATCH 2209/3206] power: supply: max77705_charger: refactoring: rename charger to chg Rename the struct max77705_charger_data variable to chg for consistency. Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 80 ++++++++++++------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index b8f648dd4d63fa..883affd18c8dd5 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -42,9 +42,9 @@ static enum power_supply_property max77705_charger_props[] = { static int max77705_chgin_irq(void *irq_drv_data) { - struct max77705_charger_data *charger = irq_drv_data; + struct max77705_charger_data *chg = irq_drv_data; - queue_work(charger->wqueue, &charger->chgin_work); + queue_work(chg->wqueue, &chg->chgin_work); return 0; } @@ -109,19 +109,19 @@ static int max77705_get_online(struct regmap *regmap, int *val) return 0; } -static int max77705_check_battery(struct max77705_charger_data *charger, int *val) +static int max77705_check_battery(struct max77705_charger_data *chg, int *val) { unsigned int reg_data; unsigned int reg_data2; - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; regmap_read(regmap, MAX77705_CHG_REG_INT_OK, &reg_data); - dev_dbg(charger->dev, "CHG_INT_OK(0x%x)\n", reg_data); + dev_dbg(chg->dev, "CHG_INT_OK(0x%x)\n", reg_data); regmap_read(regmap, MAX77705_CHG_REG_DETAILS_00, &reg_data2); - dev_dbg(charger->dev, "CHG_DETAILS00(0x%x)\n", reg_data2); + dev_dbg(chg->dev, "CHG_DETAILS00(0x%x)\n", reg_data2); if ((reg_data & MAX77705_BATP_OK) || !(reg_data2 & MAX77705_BATP_DTLS)) *val = true; @@ -131,9 +131,9 @@ static int max77705_check_battery(struct max77705_charger_data *charger, int *va return 0; } -static int max77705_get_charge_type(struct max77705_charger_data *charger, int *val) +static int max77705_get_charge_type(struct max77705_charger_data *chg, int *val) { - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; unsigned int reg_data; regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, &reg_data); @@ -159,9 +159,9 @@ static int max77705_get_charge_type(struct max77705_charger_data *charger, int * return 0; } -static int max77705_get_status(struct max77705_charger_data *charger, int *val) +static int max77705_get_status(struct max77705_charger_data *chg, int *val) { - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; unsigned int reg_data; regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, &reg_data); @@ -234,10 +234,10 @@ static int max77705_get_vbus_state(struct regmap *regmap, int *value) return 0; } -static int max77705_get_battery_health(struct max77705_charger_data *charger, +static int max77705_get_battery_health(struct max77705_charger_data *chg, int *value) { - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; unsigned int bat_dtls; regmap_read(regmap, MAX77705_CHG_REG_DETAILS_01, &bat_dtls); @@ -245,16 +245,16 @@ static int max77705_get_battery_health(struct max77705_charger_data *charger, switch (bat_dtls) { case MAX77705_BATTERY_NOBAT: - dev_dbg(charger->dev, "%s: No battery and the charger is suspended\n", + dev_dbg(chg->dev, "%s: No battery and the chg is suspended\n", __func__); *value = POWER_SUPPLY_HEALTH_NO_BATTERY; break; case MAX77705_BATTERY_PREQUALIFICATION: - dev_dbg(charger->dev, "%s: battery is okay but its voltage is low(~VPQLB)\n", + dev_dbg(chg->dev, "%s: battery is okay but its voltage is low(~VPQLB)\n", __func__); break; case MAX77705_BATTERY_DEAD: - dev_dbg(charger->dev, "%s: battery dead\n", __func__); + dev_dbg(chg->dev, "%s: battery dead\n", __func__); *value = POWER_SUPPLY_HEALTH_DEAD; break; case MAX77705_BATTERY_GOOD: @@ -262,11 +262,11 @@ static int max77705_get_battery_health(struct max77705_charger_data *charger, *value = POWER_SUPPLY_HEALTH_GOOD; break; case MAX77705_BATTERY_OVERVOLTAGE: - dev_dbg(charger->dev, "%s: battery ovp\n", __func__); + dev_dbg(chg->dev, "%s: battery ovp\n", __func__); *value = POWER_SUPPLY_HEALTH_OVERVOLTAGE; break; default: - dev_dbg(charger->dev, "%s: battery unknown\n", __func__); + dev_dbg(chg->dev, "%s: battery unknown\n", __func__); *value = POWER_SUPPLY_HEALTH_UNSPEC_FAILURE; break; } @@ -274,9 +274,9 @@ static int max77705_get_battery_health(struct max77705_charger_data *charger, return 0; } -static int max77705_get_health(struct max77705_charger_data *charger, int *val) +static int max77705_get_health(struct max77705_charger_data *chg, int *val) { - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; int ret, is_online = 0; ret = max77705_get_online(regmap, &is_online); @@ -287,15 +287,15 @@ static int max77705_get_health(struct max77705_charger_data *charger, int *val) if (ret || (*val != POWER_SUPPLY_HEALTH_GOOD)) return ret; } - return max77705_get_battery_health(charger, val); + return max77705_get_battery_health(chg, val); } -static int max77705_get_input_current(struct max77705_charger_data *charger, +static int max77705_get_input_current(struct max77705_charger_data *chg, int *val) { unsigned int reg_data; int get_current = 0; - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, &reg_data); @@ -313,11 +313,11 @@ static int max77705_get_input_current(struct max77705_charger_data *charger, return 0; } -static int max77705_get_charge_current(struct max77705_charger_data *charger, +static int max77705_get_charge_current(struct max77705_charger_data *chg, int *val) { unsigned int reg_data; - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; regmap_read(regmap, MAX77705_CHG_REG_CNFG_02, &reg_data); reg_data &= MAX77705_CHG_CC; @@ -327,12 +327,12 @@ static int max77705_get_charge_current(struct max77705_charger_data *charger, return 0; } -static int max77705_set_float_voltage(struct max77705_charger_data *charger, +static int max77705_set_float_voltage(struct max77705_charger_data *chg, int float_voltage) { int float_voltage_mv; unsigned int reg_data = 0; - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; float_voltage_mv = float_voltage / 1000; reg_data = float_voltage_mv <= 4000 ? 0x0 : @@ -345,12 +345,12 @@ static int max77705_set_float_voltage(struct max77705_charger_data *charger, (reg_data << MAX77705_CHG_CV_PRM_SHIFT)); } -static int max77705_get_float_voltage(struct max77705_charger_data *charger, +static int max77705_get_float_voltage(struct max77705_charger_data *chg, int *val) { unsigned int reg_data = 0; int voltage_mv; - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; regmap_read(regmap, MAX77705_CHG_REG_CNFG_04, &reg_data); reg_data &= MAX77705_CHG_PRM_MASK; @@ -365,28 +365,28 @@ static int max77705_chg_get_property(struct power_supply *psy, enum power_supply_property psp, union power_supply_propval *val) { - struct max77705_charger_data *charger = power_supply_get_drvdata(psy); - struct regmap *regmap = charger->regmap; + struct max77705_charger_data *chg = power_supply_get_drvdata(psy); + struct regmap *regmap = chg->regmap; switch (psp) { case POWER_SUPPLY_PROP_ONLINE: return max77705_get_online(regmap, &val->intval); case POWER_SUPPLY_PROP_PRESENT: - return max77705_check_battery(charger, &val->intval); + return max77705_check_battery(chg, &val->intval); case POWER_SUPPLY_PROP_STATUS: - return max77705_get_status(charger, &val->intval); + return max77705_get_status(chg, &val->intval); case POWER_SUPPLY_PROP_CHARGE_TYPE: - return max77705_get_charge_type(charger, &val->intval); + return max77705_get_charge_type(chg, &val->intval); case POWER_SUPPLY_PROP_HEALTH: - return max77705_get_health(charger, &val->intval); + return max77705_get_health(chg, &val->intval); case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: - return max77705_get_input_current(charger, &val->intval); + return max77705_get_input_current(chg, &val->intval); case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: - return max77705_get_charge_current(charger, &val->intval); + return max77705_get_charge_current(chg, &val->intval); case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE: - return max77705_get_float_voltage(charger, &val->intval); + return max77705_get_float_voltage(chg, &val->intval); case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN: - val->intval = charger->bat_info->voltage_max_design_uv; + val->intval = chg->bat_info->voltage_max_design_uv; break; case POWER_SUPPLY_PROP_MODEL_NAME: val->strval = max77705_charger_model; @@ -410,10 +410,10 @@ static const struct power_supply_desc max77705_charger_psy_desc = { static void max77705_chgin_isr_work(struct work_struct *work) { - struct max77705_charger_data *charger = + struct max77705_charger_data *chg = container_of(work, struct max77705_charger_data, chgin_work); - power_supply_changed(charger->psy_chg); + power_supply_changed(chg->psy_chg); } static void max77705_charger_initialize(struct max77705_charger_data *chg) From ef1e734dbe257ce8bc42383b9977b5558f061288 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:48 +0300 Subject: [PATCH 2210/3206] power: supply: max77705_charger: use regfields for config registers Using regfields allows cleaning up mask and register offset definitions, making it possible to access register info by its functional name.
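(The conversion pattern the diff below repeats, shown side by side; REG, FIELD_MASK, FIELD_SHIFT and FIELD_IDX are placeholders for the per-field constants.)

	/* before: manual mask-and-shift on the whole register */
	regmap_update_bits(regmap, REG, FIELD_MASK, val << FIELD_SHIFT);

	/* after: the field's position lives once in a REG_FIELD() table
	 * entry and the call site only names the field */
	regmap_field_write(chg->rfield[FIELD_IDX], val);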
Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 105 +++++++++--------------- include/linux/power/max77705_charger.h | 102 ++++++++++++----------- 2 files changed, 93 insertions(+), 114 deletions(-) diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index 883affd18c8dd5..837e03bafcc6db 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -74,8 +74,7 @@ static int max77705_charger_enable(struct max77705_charger_data *chg) { int rv; - rv = regmap_update_bits(chg->regmap, MAX77705_CHG_REG_CNFG_09, - MAX77705_CHG_EN_MASK, MAX77705_CHG_EN_MASK); + rv = regmap_field_write(chg->rfield[MAX77705_CHG_EN], 1); if (rv) dev_err(chg->dev, "unable to enable the charger: %d\n", rv); @@ -87,10 +86,7 @@ static void max77705_charger_disable(void *data) { struct max77705_charger_data *chg = data; int rv; - rv = regmap_update_bits(chg->regmap, - MAX77705_CHG_REG_CNFG_09, - MAX77705_CHG_EN_MASK, - MAX77705_CHG_DISABLE); + rv = regmap_field_write(chg->rfield[MAX77705_CHG_EN], MAX77705_CHG_DISABLE); if (rv) dev_err(chg->dev, "unable to disable the charger: %d\n", rv); } @@ -134,10 +130,10 @@ static int max77705_check_battery(struct max77705_charger_data *chg, int *val) static int max77705_get_charge_type(struct max77705_charger_data *chg, int *val) { struct regmap *regmap = chg->regmap; - unsigned int reg_data; + unsigned int reg_data, chg_en; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, &reg_data); - if (!MAX77705_CHARGER_CHG_CHARGING(reg_data)) { + regmap_field_read(chg->rfield[MAX77705_CHG_EN], &chg_en); + if (!chg_en) { *val = POWER_SUPPLY_CHARGE_TYPE_NONE; return 0; } @@ -162,10 +158,10 @@ static int max77705_get_charge_type(struct max77705_charger_data *chg, int *val) static int max77705_get_status(struct max77705_charger_data *chg, int *val) { struct regmap *regmap = chg->regmap; - unsigned int reg_data; + unsigned int reg_data, chg_en; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, &reg_data); - if (!MAX77705_CHARGER_CHG_CHARGING(reg_data)) { + regmap_field_read(chg->rfield[MAX77705_CHG_EN], &chg_en); + if (!chg_en) { *val = POWER_SUPPLY_CHARGE_TYPE_NONE; return 0; } @@ -295,16 +291,11 @@ static int max77705_get_input_current(struct max77705_charger_data *chg, { unsigned int reg_data; int get_current = 0; - struct regmap *regmap = chg->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, &reg_data); - - reg_data &= MAX77705_CHG_CHGIN_LIM_MASK; + regmap_field_read(chg->rfield[MAX77705_CHG_CHGIN_LIM], &reg_data); if (reg_data <= 3) get_current = MAX77705_CURRENT_CHGIN_MIN; - else if (reg_data >= MAX77705_CHG_CHGIN_LIM_MASK) - get_current = MAX77705_CURRENT_CHGIN_MAX; else get_current = (reg_data + 1) * MAX77705_CURRENT_CHGIN_STEP; @@ -317,10 +308,8 @@ static int max77705_get_charge_current(struct max77705_charger_data *chg, int *val) { unsigned int reg_data; - struct regmap *regmap = chg->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_02, &reg_data); - reg_data &= MAX77705_CHG_CC; + regmap_field_read(chg->rfield[MAX77705_CHG_CC_LIM], &reg_data); *val = reg_data <= 0x2 ? MAX77705_CURRENT_CHGIN_MIN : reg_data * MAX77705_CURRENT_CHG_STEP; @@ -332,7 +321,6 @@ static int max77705_set_float_voltage(struct max77705_charger_data *chg, { int float_voltage_mv; unsigned int reg_data = 0; - struct regmap *regmap = chg->regmap; float_voltage_mv = float_voltage / 1000; reg_data = float_voltage_mv <= 4000 ? 0x0 : @@ -340,9 +328,7 @@ static int max77705_set_float_voltage(struct max77705_charger_data *chg, (float_voltage_mv <= 4200) ? (float_voltage_mv - 4000) / 50 : (((float_voltage_mv - 4200) / 10) + 0x04); - return regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_04, - MAX77705_CHG_CV_PRM_MASK, - (reg_data << MAX77705_CHG_CV_PRM_SHIFT)); + return regmap_field_write(chg->rfield[MAX77705_CHG_CV_PRM], reg_data); } static int max77705_get_float_voltage(struct max77705_charger_data *chg, @@ -350,10 +336,8 @@ static int max77705_get_float_voltage(struct max77705_charger_data *chg, { unsigned int reg_data = 0; int voltage_mv; - struct regmap *regmap = chg->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_04, &reg_data); - reg_data &= MAX77705_CHG_PRM_MASK; + regmap_field_read(chg->rfield[MAX77705_CHG_CV_PRM], &reg_data); voltage_mv = reg_data <= 0x04 ? reg_data * 50 + 4000 : (reg_data - 4) * 10 + 4200; *val = voltage_mv * 1000; @@ -418,7 +402,6 @@ static void max77705_chgin_isr_work(struct work_struct *work) static void max77705_charger_initialize(struct max77705_charger_data *chg) { - u8 reg_data; struct power_supply_battery_info *info; struct regmap *regmap = chg->regmap; @@ -429,45 +412,31 @@ static void max77705_charger_initialize(struct max77705_charger_data *chg) /* unlock charger setting protect */ /* slowest LX slope */ - reg_data = MAX77705_CHGPROT_MASK | MAX77705_SLOWEST_LX_SLOPE; - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_06, reg_data, - reg_data); + regmap_field_write(chg->rfield[MAX77705_CHGPROT], MAX77705_CHGPROT_UNLOCKED); + regmap_field_write(chg->rfield[MAX77705_LX_SLOPE], MAX77705_SLOWEST_LX_SLOPE); /* fast charge timer disable */ /* restart threshold disable */ /* pre-qual charge disable */ - reg_data = (MAX77705_FCHGTIME_DISABLE << MAX77705_FCHGTIME_SHIFT) | - (MAX77705_CHG_RSTRT_DISABLE << MAX77705_CHG_RSTRT_SHIFT) | - (MAX77705_CHG_PQEN_DISABLE << MAX77705_PQEN_SHIFT); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_01, - (MAX77705_FCHGTIME_MASK | - MAX77705_CHG_RSTRT_MASK | - MAX77705_PQEN_MASK), - reg_data); - - /* OTG off(UNO on), boost off */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_00, - MAX77705_OTG_CTRL, 0); + regmap_field_write(chg->rfield[MAX77705_FCHGTIME], MAX77705_FCHGTIME_DISABLE); + regmap_field_write(chg->rfield[MAX77705_CHG_RSTRT], MAX77705_CHG_RSTRT_DISABLE); + regmap_field_write(chg->rfield[MAX77705_CHG_PQEN], MAX77705_CHG_PQEN_DISABLE); + + regmap_field_write(chg->rfield[MAX77705_MODE], + MAX77705_CHG_MASK | MAX77705_BUCK_MASK); /* charge current 450mA(default) */ /* otg current limit 900mA */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_02, - MAX77705_OTG_ILIM_MASK, - MAX77705_OTG_ILIM_900 << MAX77705_OTG_ILIM_SHIFT); + regmap_field_write(chg->rfield[MAX77705_OTG_ILIM], MAX77705_OTG_ILIM_900); /* BAT to SYS OCP 4.80A */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_05, - MAX77705_REG_B2SOVRC_MASK, - MAX77705_B2SOVRC_4_8A << MAX77705_REG_B2SOVRC_SHIFT); + regmap_field_write(chg->rfield[MAX77705_REG_B2SOVRC], MAX77705_B2SOVRC_4_8A); + /* top off current 150mA */ /* top off timer 30min */ - reg_data = (MAX77705_TO_ITH_150MA << MAX77705_TO_ITH_SHIFT) | - (MAX77705_TO_TIME_30M << MAX77705_TO_TIME_SHIFT) | - (MAX77705_SYS_TRACK_DISABLE << MAX77705_SYS_TRACK_DIS_SHIFT); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_03, - (MAX77705_TO_ITH_MASK | - MAX77705_TO_TIME_MASK | - MAX77705_SYS_TRACK_DIS_MASK), reg_data); + regmap_field_write(chg->rfield[MAX77705_TO], MAX77705_TO_ITH_150MA); +
regmap_field_write(chg->rfield[MAX77705_TO_TIME], MAX77705_TO_TIME_30M); + regmap_field_write(chg->rfield[MAX77705_SYS_TRACK], MAX77705_SYS_TRACK_DISABLE); /* cv voltage 4.2V or 4.35V */ /* MINVSYS 3.6V(default) */ @@ -478,25 +447,21 @@ static void max77705_charger_initialize(struct max77705_charger_data *chg) max77705_set_float_voltage(chg, info->voltage_max_design_uv); } - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, - MAX77705_VCHGIN_REG_MASK, MAX77705_VCHGIN_4_5); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, - MAX77705_WCIN_REG_MASK, MAX77705_WCIN_4_5); + regmap_field_write(chg->rfield[MAX77705_VCHGIN], MAX77705_VCHGIN_4_5); + regmap_field_write(chg->rfield[MAX77705_WCIN], MAX77705_WCIN_4_5); /* Watchdog timer */ regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_00, MAX77705_WDTEN_MASK, 0); /* VBYPSET=5.0V */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_11, MAX77705_VBYPSET_MASK, 0); + regmap_field_write(chg->rfield[MAX77705_VBYPSET], 0); /* Switching Frequency : 1.5MHz */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_08, MAX77705_REG_FSW_MASK, - (MAX77705_CHG_FSW_1_5MHz << MAX77705_REG_FSW_SHIFT)); + regmap_field_write(chg->rfield[MAX77705_REG_FSW], MAX77705_CHG_FSW_1_5MHz); /* Auto skip mode */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, MAX77705_REG_DISKIP_MASK, - (MAX77705_AUTO_SKIP << MAX77705_REG_DISKIP_SHIFT)); + regmap_field_write(chg->rfield[MAX77705_REG_DISKIP], MAX77705_AUTO_SKIP); } static int max77705_charger_probe(struct i2c_client *i2c) @@ -520,6 +485,14 @@ static int max77705_charger_probe(struct i2c_client *i2c) if (IS_ERR(chg->regmap)) return PTR_ERR(chg->regmap); + for (int i = 0; i < MAX77705_N_REGMAP_FIELDS; i++) { + chg->rfield[i] = devm_regmap_field_alloc(dev, chg->regmap, + max77705_reg_field[i]); + if (IS_ERR(chg->rfield[i])) + return dev_err_probe(dev, PTR_ERR(chg->rfield[i]), + "cannot allocate regmap field\n"); + } + ret = regmap_update_bits(chg->regmap, MAX77705_CHG_REG_INT_MASK, MAX77705_CHGIN_IM, 0); diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h index fdec9af9c54183..a612795577b621 100644 --- a/include/linux/power/max77705_charger.h +++ b/include/linux/power/max77705_charger.h @@ -9,6 +9,8 @@ #ifndef __MAX77705_CHARGER_H #define __MAX77705_CHARGER_H __FILE__ +#include + /* MAX77705_CHG_REG_CHG_INT */ #define MAX77705_BYP_I BIT(0) #define MAX77705_INP_LIMIT_I BIT(1) @@ -63,7 +65,6 @@ #define MAX77705_BUCK_SHIFT 2 #define MAX77705_BOOST_SHIFT 3 #define MAX77705_WDTEN_SHIFT 4 -#define MAX77705_MODE_MASK GENMASK(3, 0) #define MAX77705_CHG_MASK BIT(MAX77705_CHG_SHIFT) #define MAX77705_UNO_MASK BIT(MAX77705_UNO_SHIFT) #define MAX77705_OTG_MASK BIT(MAX77705_OTG_SHIFT) @@ -74,34 +75,19 @@ #define MAX77705_OTG_CTRL (MAX77705_OTG_MASK | MAX77705_BOOST_MASK) /* MAX77705_CHG_REG_CNFG_01 */ -#define MAX77705_FCHGTIME_SHIFT 0 -#define MAX77705_FCHGTIME_MASK GENMASK(2, 0) -#define MAX77705_CHG_RSTRT_SHIFT 4 -#define MAX77705_CHG_RSTRT_MASK GENMASK(5, 4) #define MAX77705_FCHGTIME_DISABLE 0 #define MAX77705_CHG_RSTRT_DISABLE 0x3 -#define MAX77705_PQEN_SHIFT 7 -#define MAX77705_PQEN_MASK BIT(7) #define MAX77705_CHG_PQEN_DISABLE 0 #define MAX77705_CHG_PQEN_ENABLE 1 /* MAX77705_CHG_REG_CNFG_02 */ -#define MAX77705_OTG_ILIM_SHIFT 6 -#define MAX77705_OTG_ILIM_MASK GENMASK(7, 6) #define MAX77705_OTG_ILIM_500 0 #define MAX77705_OTG_ILIM_900 1 #define MAX77705_OTG_ILIM_1200 2 #define MAX77705_OTG_ILIM_1500 3 -#define MAX77705_CHG_CC GENMASK(5, 0) /* MAX77705_CHG_REG_CNFG_03 */ -#define 
MAX77705_TO_ITH_SHIFT 0 -#define MAX77705_TO_ITH_MASK GENMASK(2, 0) -#define MAX77705_TO_TIME_SHIFT 3 -#define MAX77705_TO_TIME_MASK GENMASK(5, 3) -#define MAX77705_SYS_TRACK_DIS_SHIFT 7 -#define MAX77705_SYS_TRACK_DIS_MASK BIT(7) #define MAX77705_TO_ITH_150MA 0 #define MAX77705_TO_TIME_30M 3 #define MAX77705_SYS_TRACK_ENABLE 0 @@ -110,15 +96,8 @@ /* MAX77705_CHG_REG_CNFG_04 */ #define MAX77705_CHG_MINVSYS_SHIFT 6 #define MAX77705_CHG_MINVSYS_MASK GENMASK(7, 6) -#define MAX77705_CHG_PRM_SHIFT 0 -#define MAX77705_CHG_PRM_MASK GENMASK(5, 0) - -#define MAX77705_CHG_CV_PRM_SHIFT 0 -#define MAX77705_CHG_CV_PRM_MASK GENMASK(5, 0) /* MAX77705_CHG_REG_CNFG_05 */ -#define MAX77705_REG_B2SOVRC_SHIFT 0 -#define MAX77705_REG_B2SOVRC_MASK GENMASK(3, 0) #define MAX77705_B2SOVRC_DISABLE 0 #define MAX77705_B2SOVRC_4_5A 6 #define MAX77705_B2SOVRC_4_8A 8 @@ -128,9 +107,8 @@ #define MAX77705_WDTCLR_SHIFT 0 #define MAX77705_WDTCLR_MASK GENMASK(1, 0) #define MAX77705_WDTCLR 1 -#define MAX77705_CHGPROT_MASK GENMASK(3, 2) -#define MAX77705_CHGPROT_UNLOCKED GENMASK(3, 2) -#define MAX77705_SLOWEST_LX_SLOPE GENMASK(6, 5) +#define MAX77705_CHGPROT_UNLOCKED 3 +#define MAX77705_SLOWEST_LX_SLOPE 3 /* MAX77705_CHG_REG_CNFG_07 */ #define MAX77705_CHG_FMBST 4 @@ -140,36 +118,14 @@ #define MAX77705_REG_FGSRC_MASK BIT(MAX77705_REG_FGSRC_SHIFT) /* MAX77705_CHG_REG_CNFG_08 */ -#define MAX77705_REG_FSW_SHIFT 0 -#define MAX77705_REG_FSW_MASK GENMASK(1, 0) #define MAX77705_CHG_FSW_3MHz 0 #define MAX77705_CHG_FSW_2MHz 1 #define MAX77705_CHG_FSW_1_5MHz 2 /* MAX77705_CHG_REG_CNFG_09 */ -#define MAX77705_CHG_CHGIN_LIM_MASK GENMASK(6, 0) -#define MAX77705_CHG_EN_MASK BIT(7) #define MAX77705_CHG_DISABLE 0 -#define MAX77705_CHARGER_CHG_CHARGING(_reg) \ - (((_reg) & MAX77705_CHG_EN_MASK) > 1) - - -/* MAX77705_CHG_REG_CNFG_10 */ -#define MAX77705_CHG_WCIN_LIM GENMASK(5, 0) - -/* MAX77705_CHG_REG_CNFG_11 */ -#define MAX77705_VBYPSET_SHIFT 0 -#define MAX77705_VBYPSET_MASK GENMASK(6, 0) /* MAX77705_CHG_REG_CNFG_12 */ -#define MAX77705_CHGINSEL_SHIFT 5 -#define MAX77705_CHGINSEL_MASK BIT(MAX77705_CHGINSEL_SHIFT) -#define MAX77705_WCINSEL_SHIFT 6 -#define MAX77705_WCINSEL_MASK BIT(MAX77705_WCINSEL_SHIFT) -#define MAX77705_VCHGIN_REG_MASK GENMASK(4, 3) -#define MAX77705_WCIN_REG_MASK GENMASK(2, 1) -#define MAX77705_REG_DISKIP_SHIFT 0 -#define MAX77705_REG_DISKIP_MASK BIT(MAX77705_REG_DISKIP_SHIFT) /* REG=4.5V, UVLO=4.7V */ #define MAX77705_VCHGIN_4_5 0 /* REG=4.5V, UVLO=4.7V */ @@ -183,9 +139,59 @@ #define MAX77705_CURRENT_CHGIN_MIN 100000 #define MAX77705_CURRENT_CHGIN_MAX 3200000 +enum max77705_field_idx { + MAX77705_CHGPROT, + MAX77705_CHG_EN, + MAX77705_CHG_CC_LIM, + MAX77705_CHG_CHGIN_LIM, + MAX77705_CHG_CV_PRM, + MAX77705_CHG_PQEN, + MAX77705_CHG_RSTRT, + MAX77705_CHG_WCIN, + MAX77705_FCHGTIME, + MAX77705_LX_SLOPE, + MAX77705_MODE, + MAX77705_OTG_ILIM, + MAX77705_REG_B2SOVRC, + MAX77705_REG_DISKIP, + MAX77705_REG_FSW, + MAX77705_SYS_TRACK, + MAX77705_TO, + MAX77705_TO_TIME, + MAX77705_VBYPSET, + MAX77705_VCHGIN, + MAX77705_WCIN, + MAX77705_N_REGMAP_FIELDS, +}; + +static const struct reg_field max77705_reg_field[MAX77705_N_REGMAP_FIELDS] = { + [MAX77705_MODE] = REG_FIELD(MAX77705_CHG_REG_CNFG_00, 0, 3), + [MAX77705_FCHGTIME] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 0, 2), + [MAX77705_CHG_RSTRT] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 4, 5), + [MAX77705_CHG_PQEN] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 7, 7), + [MAX77705_CHG_CC_LIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_02, 0, 5), + [MAX77705_OTG_ILIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_02, 6, 7), + 
[MAX77705_TO] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 0, 2), + [MAX77705_TO_TIME] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 3, 5), + [MAX77705_SYS_TRACK] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 7, 7), + [MAX77705_CHG_CV_PRM] = REG_FIELD(MAX77705_CHG_REG_CNFG_04, 0, 5), + [MAX77705_REG_B2SOVRC] = REG_FIELD(MAX77705_CHG_REG_CNFG_05, 0, 3), + [MAX77705_CHGPROT] = REG_FIELD(MAX77705_CHG_REG_CNFG_06, 2, 3), + [MAX77705_LX_SLOPE] = REG_FIELD(MAX77705_CHG_REG_CNFG_06, 5, 6), + [MAX77705_REG_FSW] = REG_FIELD(MAX77705_CHG_REG_CNFG_08, 0, 1), + [MAX77705_CHG_CHGIN_LIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_09, 0, 6), + [MAX77705_CHG_EN] = REG_FIELD(MAX77705_CHG_REG_CNFG_09, 7, 7), + [MAX77705_CHG_WCIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_10, 0, 5), + [MAX77705_VBYPSET] = REG_FIELD(MAX77705_CHG_REG_CNFG_11, 0, 6), + [MAX77705_REG_DISKIP] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 0, 0), + [MAX77705_WCIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 1, 2), + [MAX77705_VCHGIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 3, 4), +}; + struct max77705_charger_data { struct device *dev; struct regmap *regmap; + struct regmap_field *rfield[MAX77705_N_REGMAP_FIELDS]; struct power_supply_battery_info *bat_info; struct workqueue_struct *wqueue; struct work_struct chgin_work; From 55af7b9bb66c1cf796142f75a76914e2c3df5d06 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:49 +0300 Subject: [PATCH 2211/3206] power: supply: max77705_charger: return error when config fails Handle errors returned from register writes in the init function. Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 90 +++++++++++++++++++------ 1 file changed, 70 insertions(+), 20 deletions(-) diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index 837e03bafcc6db..23c643a307bd01 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -400,43 +400,72 @@ static void max77705_chgin_isr_work(struct work_struct *work) power_supply_changed(chg->psy_chg); } -static void max77705_charger_initialize(struct max77705_charger_data *chg) +static int max77705_charger_initialize(struct max77705_charger_data *chg) { struct power_supply_battery_info *info; struct regmap *regmap = chg->regmap; + int err; - if (power_supply_get_battery_info(chg->psy_chg, &info) < 0) - return; + err = power_supply_get_battery_info(chg->psy_chg, &info); + if (err) + return dev_err_probe(chg->dev, err, "error on getting battery info"); chg->bat_info = info; /* unlock charger setting protect */ /* slowest LX slope */ - regmap_field_write(chg->rfield[MAX77705_CHGPROT], MAX77705_CHGPROT_UNLOCKED); - regmap_field_write(chg->rfield[MAX77705_LX_SLOPE], MAX77705_SLOWEST_LX_SLOPE); + err = regmap_field_write(chg->rfield[MAX77705_CHGPROT], MAX77705_CHGPROT_UNLOCKED); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_LX_SLOPE], MAX77705_SLOWEST_LX_SLOPE); + if (err) + goto err; /* fast charge timer disable */ /* restart threshold disable */ /* pre-qual charge disable */ - regmap_field_write(chg->rfield[MAX77705_FCHGTIME], MAX77705_FCHGTIME_DISABLE); - regmap_field_write(chg->rfield[MAX77705_CHG_RSTRT], MAX77705_CHG_RSTRT_DISABLE); - regmap_field_write(chg->rfield[MAX77705_CHG_PQEN], MAX77705_CHG_PQEN_DISABLE); + err = regmap_field_write(chg->rfield[MAX77705_FCHGTIME], MAX77705_FCHGTIME_DISABLE); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_CHG_RSTRT], MAX77705_CHG_RSTRT_DISABLE); + if (err) + goto
err; - regmap_field_write(chg->rfield[MAX77705_MODE], + err = regmap_field_write(chg->rfield[MAX77705_CHG_PQEN], MAX77705_CHG_PQEN_DISABLE); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_MODE], MAX77705_CHG_MASK | MAX77705_BUCK_MASK); + if (err) + goto err; /* charge current 450mA(default) */ /* otg current limit 900mA */ - regmap_field_write(chg->rfield[MAX77705_OTG_ILIM], MAX77705_OTG_ILIM_900); + err = regmap_field_write(chg->rfield[MAX77705_OTG_ILIM], MAX77705_OTG_ILIM_900); + if (err) + goto err; /* BAT to SYS OCP 4.80A */ - regmap_field_write(chg->rfield[MAX77705_REG_B2SOVRC], MAX77705_B2SOVRC_4_8A); + err = regmap_field_write(chg->rfield[MAX77705_REG_B2SOVRC], MAX77705_B2SOVRC_4_8A); + if (err) + goto err; /* top off current 150mA */ /* top off timer 30min */ - regmap_field_write(chg->rfield[MAX77705_TO], MAX77705_TO_ITH_150MA); - regmap_field_write(chg->rfield[MAX77705_TO_TIME], MAX77705_TO_TIME_30M); - regmap_field_write(chg->rfield[MAX77705_SYS_TRACK], MAX77705_SYS_TRACK_DISABLE); + err = regmap_field_write(chg->rfield[MAX77705_TO], MAX77705_TO_ITH_150MA); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_TO_TIME], MAX77705_TO_TIME_30M); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_SYS_TRACK], MAX77705_SYS_TRACK_DISABLE); + if (err) + goto err; /* cv voltage 4.2V or 4.35V */ /* MINVSYS 3.6V(default) */ @@ -447,21 +476,38 @@ static void max77705_charger_initialize(struct max77705_charger_data *chg) max77705_set_float_voltage(chg, info->voltage_max_design_uv); } - regmap_field_write(chg->rfield[MAX77705_VCHGIN], MAX77705_VCHGIN_4_5); - regmap_field_write(chg->rfield[MAX77705_WCIN], MAX77705_WCIN_4_5); + err = regmap_field_write(chg->rfield[MAX77705_VCHGIN], MAX77705_VCHGIN_4_5); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_WCIN], MAX77705_WCIN_4_5); + if (err) + goto err; /* Watchdog timer */ regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_00, MAX77705_WDTEN_MASK, 0); /* VBYPSET=5.0V */ - regmap_field_write(chg->rfield[MAX77705_VBYPSET], 0); + err = regmap_field_write(chg->rfield[MAX77705_VBYPSET], 0); + if (err) + goto err; /* Switching Frequency : 1.5MHz */ - regmap_field_write(chg->rfield[MAX77705_REG_FSW], MAX77705_CHG_FSW_1_5MHz); + err = regmap_field_write(chg->rfield[MAX77705_REG_FSW], MAX77705_CHG_FSW_1_5MHz); + if (err) + goto err; /* Auto skip mode */ - regmap_field_write(chg->rfield[MAX77705_REG_DISKIP], MAX77705_AUTO_SKIP); + err = regmap_field_write(chg->rfield[MAX77705_REG_DISKIP], MAX77705_AUTO_SKIP); + if (err) + goto err; + + return 0; + +err: + return dev_err_probe(chg->dev, err, "error while configuring"); + } static int max77705_charger_probe(struct i2c_client *i2c) @@ -524,7 +570,11 @@ static int max77705_charger_probe(struct i2c_client *i2c) goto destroy_wq; } - max77705_charger_initialize(chg); + ret = max77705_charger_initialize(chg); + if (ret) { + dev_err_probe(dev, ret, "failed to initialize charger IC\n"); + goto destroy_wq; + } ret = max77705_charger_enable(chg); if (ret) { From baedd8be7036233025527a78f209e34d03057872 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:50 +0300 Subject: [PATCH 2212/3206] power: supply: max77705_charger: add writable properties Add INPUT_CURRENT_LIMIT, CONSTANT_CHARGE_CURRENT properties as writeable to be able to control input power consumption and charging speed. 
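(Worked example of the conversion performed by the max77705_set_integer() helper added in the diff below, using the CHGIN bounds from the header; the concrete step value is left symbolic. A userspace write of 5000000 uA to the standard input_current_limit sysfs attribute is first clamped, then scaled to a register code.)

	/* clamp_val(5000000, 100000, 3200000) == 3200000 */
	regval = clamp_val(val, MAX77705_CURRENT_CHGIN_MIN,
			   MAX77705_CURRENT_CHGIN_MAX) / MAX77705_CURRENT_CHGIN_STEP;
	regmap_field_write(chg->rfield[MAX77705_CHG_CHGIN_LIM], regval);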
Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 56 ++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index 23c643a307bd01..a613235289b540 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -105,6 +105,17 @@ static int max77705_get_online(struct regmap *regmap, int *val) return 0; } +static int max77705_set_integer(struct max77705_charger_data *chg, enum max77705_field_idx fidx, + unsigned int clamp_min, unsigned int clamp_max, + unsigned int div, int val) +{ + unsigned int regval; + + regval = clamp_val(val, clamp_min, clamp_max) / div; + + return regmap_field_write(chg->rfield[fidx], regval); +} + static int max77705_check_battery(struct max77705_charger_data *chg, int *val) { unsigned int reg_data; @@ -384,12 +395,55 @@ static int max77705_chg_get_property(struct power_supply *psy, return 0; } +static int max77705_set_property(struct power_supply *psy, + enum power_supply_property psp, + const union power_supply_propval *val) +{ + struct max77705_charger_data *chg = power_supply_get_drvdata(psy); + int err = 0; + + switch (psp) { + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: + err = max77705_set_integer(chg, MAX77705_CHG_CC_LIM, + MAX77705_CURRENT_CHGIN_MIN, + MAX77705_CURRENT_CHGIN_MAX, + MAX77705_CURRENT_CHG_STEP, + val->intval); + break; + case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: + err = max77705_set_integer(chg, MAX77705_CHG_CHGIN_LIM, + MAX77705_CURRENT_CHGIN_MIN, + MAX77705_CURRENT_CHGIN_MAX, + MAX77705_CURRENT_CHGIN_STEP, + val->intval); + break; + default: + err = -EINVAL; + } + + return err; +}; + +static int max77705_property_is_writeable(struct power_supply *psy, + enum power_supply_property psp) +{ + switch (psp) { + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: + case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: + return true; + default: + return false; + } +} + static const struct power_supply_desc max77705_charger_psy_desc = { .name = "max77705-charger", - .type = POWER_SUPPLY_TYPE_USB, + .type = POWER_SUPPLY_TYPE_USB, .properties = max77705_charger_props, + .property_is_writeable = max77705_property_is_writeable, .num_properties = ARRAY_SIZE(max77705_charger_props), .get_property = max77705_chg_get_property, + .set_property = max77705_set_property, }; static void max77705_chgin_isr_work(struct work_struct *work) From 12a1185a06e3377af777e792ba7436862f8e528a Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:51 +0300 Subject: [PATCH 2213/3206] power: supply: max77705_charger: rework interrupts Current implementation uses handle_post_irq to actually handle chgin irq. This is not how things are meant to work in regmap-irq. Remove handle_post_irq, and request a threaded interrupt for chgin. 
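(The replacement pattern in isolation, as it appears in the diff below: the regmap-irq sub-interrupt is mapped to a Linux virq and a threaded handler is attached to it, rather than hooking handle_post_irq on the whole chip.)

	ret = devm_request_threaded_irq(dev,
					regmap_irq_get_virq(irq_data, MAX77705_CHGIN_I),
					NULL, max77705_chgin_irq,
					IRQF_TRIGGER_NONE, "chgin-irq", chg);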
Fixes: a6a494c8e3ce ("power: supply: max77705: Add charger driver for Maxim 77705") Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index a613235289b540..bf065693af45d5 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -40,13 +40,13 @@ static enum power_supply_property max77705_charger_props[] = { POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, }; -static int max77705_chgin_irq(void *irq_drv_data) +static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data) { struct max77705_charger_data *chg = irq_drv_data; queue_work(chg->wqueue, &chg->chgin_work); - return 0; + return IRQ_HANDLED; } static const struct regmap_irq max77705_charger_irqs[] = { @@ -64,7 +64,6 @@ static struct regmap_irq_chip max77705_charger_irq_chip = { .name = "max77705-charger", .status_base = MAX77705_CHG_REG_INT, .mask_base = MAX77705_CHG_REG_INT_MASK, - .handle_post_irq = max77705_chgin_irq, .num_regs = 1, .irqs = max77705_charger_irqs, .num_irqs = ARRAY_SIZE(max77705_charger_irqs), @@ -593,12 +592,6 @@ static int max77705_charger_probe(struct i2c_client *i2c) "cannot allocate regmap field\n"); } - ret = regmap_update_bits(chg->regmap, - MAX77705_CHG_REG_INT_MASK, - MAX77705_CHGIN_IM, 0); - if (ret) - return ret; - pscfg.fwnode = dev_fwnode(dev); pscfg.drv_data = chg; @@ -608,7 +601,7 @@ static int max77705_charger_probe(struct i2c_client *i2c) max77705_charger_irq_chip.irq_drv_data = chg; ret = devm_regmap_add_irq_chip(chg->dev, chg->regmap, i2c->irq, - IRQF_ONESHOT | IRQF_SHARED, 0, + IRQF_ONESHOT, 0, &max77705_charger_irq_chip, &irq_data); if (ret) @@ -630,6 +623,15 @@ static int max77705_charger_probe(struct i2c_client *i2c) goto destroy_wq; } + ret = devm_request_threaded_irq(dev, regmap_irq_get_virq(irq_data, MAX77705_CHGIN_I), + NULL, max77705_chgin_irq, + IRQF_TRIGGER_NONE, + "chgin-irq", chg); + if (ret) { + dev_err_probe(dev, ret, "Failed to Request chgin IRQ\n"); + goto destroy_wq; + } + ret = max77705_charger_enable(chg); if (ret) { dev_err_probe(dev, ret, "failed to enable charge\n"); From bc7d3a0f92dad811110f5602f58fe756cefce2b8 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:52 +0300 Subject: [PATCH 2214/3206] power: supply: max77705_charger: use REGMAP_IRQ_REG_LINE macro Refactoring regmap_irq declarations with REGMAP_IRQ_REG_LINE saves a few lines of definitions.
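(For reference, REGMAP_IRQ_REG_LINE() in include/linux/regmap.h expands along these lines, so REGMAP_IRQ_REG_LINE(MAX77705_CHGIN_I, BITS_PER_BYTE) with MAX77705_CHGIN_I == 6 produces the same { .mask = BIT(6) } entry as the open-coded initializer it replaces:)

#define REGMAP_IRQ_REG_LINE(_id, _reg_bits) \
	[_id] = { \
		.mask = BIT((_id) % (_reg_bits)), \
		.reg_offset = (_id) / (_reg_bits), \
	}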
Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 16 +++++----- include/linux/power/max77705_charger.h | 42 ++++++++++--------------- 2 files changed, 24 insertions(+), 34 deletions(-) diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index bf065693af45d5..b1a227bf72e26f 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -50,14 +50,14 @@ static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data) } static const struct regmap_irq max77705_charger_irqs[] = { - { .mask = MAX77705_BYP_IM, }, - { .mask = MAX77705_INP_LIMIT_IM, }, - { .mask = MAX77705_BATP_IM, }, - { .mask = MAX77705_BAT_IM, }, - { .mask = MAX77705_CHG_IM, }, - { .mask = MAX77705_WCIN_IM, }, - { .mask = MAX77705_CHGIN_IM, }, - { .mask = MAX77705_AICL_IM, }, + REGMAP_IRQ_REG_LINE(MAX77705_BYP_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_INP_LIMIT_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_BATP_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_BAT_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_CHG_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_WCIN_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_CHGIN_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_AICL_I, BITS_PER_BYTE), }; static struct regmap_irq_chip max77705_charger_irq_chip = { diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h index a612795577b621..6653abfdf7470d 100644 --- a/include/linux/power/max77705_charger.h +++ b/include/linux/power/max77705_charger.h @@ -12,34 +12,24 @@ #include /* MAX77705_CHG_REG_CHG_INT */ -#define MAX77705_BYP_I BIT(0) -#define MAX77705_INP_LIMIT_I BIT(1) -#define MAX77705_BATP_I BIT(2) -#define MAX77705_BAT_I BIT(3) -#define MAX77705_CHG_I BIT(4) -#define MAX77705_WCIN_I BIT(5) -#define MAX77705_CHGIN_I BIT(6) -#define MAX77705_AICL_I BIT(7) - -/* MAX77705_CHG_REG_CHG_INT_MASK */ -#define MAX77705_BYP_IM BIT(0) -#define MAX77705_INP_LIMIT_IM BIT(1) -#define MAX77705_BATP_IM BIT(2) -#define MAX77705_BAT_IM BIT(3) -#define MAX77705_CHG_IM BIT(4) -#define MAX77705_WCIN_IM BIT(5) -#define MAX77705_CHGIN_IM BIT(6) -#define MAX77705_AICL_IM BIT(7) +#define MAX77705_BYP_I (0) +#define MAX77705_INP_LIMIT_I (1) +#define MAX77705_BATP_I (2) +#define MAX77705_BAT_I (3) +#define MAX77705_CHG_I (4) +#define MAX77705_WCIN_I (5) +#define MAX77705_CHGIN_I (6) +#define MAX77705_AICL_I (7) /* MAX77705_CHG_REG_CHG_INT_OK */ -#define MAX77705_BYP_OK BIT(0) -#define MAX77705_DISQBAT_OK BIT(1) -#define MAX77705_BATP_OK BIT(2) -#define MAX77705_BAT_OK BIT(3) -#define MAX77705_CHG_OK BIT(4) -#define MAX77705_WCIN_OK BIT(5) -#define MAX77705_CHGIN_OK BIT(6) -#define MAX77705_AICL_OK BIT(7) +#define MAX77705_BYP_OK BIT(MAX77705_BYP_I) +#define MAX77705_DISQBAT_OK BIT(MAX77705_INP_LIMIT_I) +#define MAX77705_BATP_OK BIT(MAX77705_BATP_I) +#define MAX77705_BAT_OK BIT(MAX77705_BAT_I) +#define MAX77705_CHG_OK BIT(MAX77705_CHG_I) +#define MAX77705_WCIN_OK BIT(MAX77705_WCIN_I) +#define MAX77705_CHGIN_OK BIT(MAX77705_CHGIN_I) +#define MAX77705_AICL_OK BIT(MAX77705_AICL_I) /* MAX77705_CHG_REG_DETAILS_00 */ #define MAX77705_BATP_DTLS BIT(0) From 55d225670def06b01af2e7a5e0446fbe946289e8 Mon Sep 17 00:00:00 2001 From: Lukasz Czapnik Date: Wed, 13 Aug 2025 12:45:11 +0200 Subject: [PATCH 2215/3206] i40e: add validation for ring_len param The `ring_len` parameter provided by the virtual function (VF) is assigned directly to the hardware memory context (HMC) 
without any validation. To address this, introduce an upper boundary check for both Tx and Rx queue lengths. The maximum number of descriptors supported by the hardware is 8k-32. Additionally, enforce alignment constraints: Tx rings must be a multiple of 8, and Rx rings must be a multiple of 32. Fixes: 5c3c48ac6bf5 ("i40e: implement virtual device interface") Cc: stable@vger.kernel.org Signed-off-by: Lukasz Czapnik Reviewed-by: Aleksandr Loktionov Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 9b8efdeafbcf13..cb37b2ac56f132 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -653,6 +653,13 @@ static int i40e_config_vsi_tx_queue(struct i40e_vf *vf, u16 vsi_id, /* only set the required fields */ tx_ctx.base = info->dma_ring_addr / 128; + + /* ring_len has to be multiple of 8 */ + if (!IS_ALIGNED(info->ring_len, 8) || + info->ring_len > I40E_MAX_NUM_DESCRIPTORS_XL710) { + ret = -EINVAL; + goto error_context; + } tx_ctx.qlen = info->ring_len; tx_ctx.rdylist = le16_to_cpu(vsi->info.qs_handle[0]); tx_ctx.rdylist_act = 0; @@ -716,6 +723,13 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, /* only set the required fields */ rx_ctx.base = info->dma_ring_addr / 128; + + /* ring_len has to be multiple of 32 */ + if (!IS_ALIGNED(info->ring_len, 32) || + info->ring_len > I40E_MAX_NUM_DESCRIPTORS_XL710) { + ret = -EINVAL; + goto error_param; + } rx_ctx.qlen = info->ring_len; if (info->splithdr_enabled) { From aa68d3c3ac8d1dcec40d52ae27e39f6d32207009 Mon Sep 17 00:00:00 2001 From: Lukasz Czapnik Date: Wed, 13 Aug 2025 12:45:12 +0200 Subject: [PATCH 2216/3206] i40e: fix idx validation in i40e_validate_queue_map Ensure idx is within range of active/initialized TCs when iterating over vf->ch[idx] in i40e_validate_queue_map(). 
Fixes: c27eac48160d ("i40e: Enable ADq and create queue channel/s on VF") Cc: stable@vger.kernel.org Signed-off-by: Lukasz Czapnik Reviewed-by: Aleksandr Loktionov Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Kamakshi Nellore (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index cb37b2ac56f132..1c4f86221255ec 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2466,8 +2466,10 @@ static int i40e_validate_queue_map(struct i40e_vf *vf, u16 vsi_id, u16 vsi_queue_id, queue_id; for_each_set_bit(vsi_queue_id, &queuemap, I40E_MAX_VSI_QP) { - if (vf->adq_enabled) { - vsi_id = vf->ch[vsi_queue_id / I40E_MAX_VF_VSI].vsi_id; + u16 idx = vsi_queue_id / I40E_MAX_VF_VSI; + + if (vf->adq_enabled && idx < vf->num_tc) { + vsi_id = vf->ch[idx].vsi_id; queue_id = (vsi_queue_id % I40E_DEFAULT_QUEUES_PER_VF); } else { queue_id = vsi_queue_id; From f1ad24c5abe1eaef69158bac1405a74b3c365115 Mon Sep 17 00:00:00 2001 From: Lukasz Czapnik Date: Wed, 13 Aug 2025 12:45:13 +0200 Subject: [PATCH 2217/3206] i40e: fix idx validation in config queues msg Ensure idx is within range of active/initialized TCs when iterating over vf->ch[idx] in i40e_vc_config_queues_msg(). Fixes: c27eac48160d ("i40e: Enable ADq and create queue channel/s on VF") Cc: stable@vger.kernel.org Signed-off-by: Lukasz Czapnik Reviewed-by: Aleksandr Loktionov Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Kamakshi Nellore (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 1c4f86221255ec..b6db4d78c02d4f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2395,7 +2395,7 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg) } if (vf->adq_enabled) { - if (idx >= ARRAY_SIZE(vf->ch)) { + if (idx >= vf->num_tc) { aq_ret = -ENODEV; goto error_param; } @@ -2416,7 +2416,7 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg) * to its appropriate VSIs based on TC mapping */ if (vf->adq_enabled) { - if (idx >= ARRAY_SIZE(vf->ch)) { + if (idx >= vf->num_tc) { aq_ret = -ENODEV; goto error_param; } From 9739d5830497812b0bdeaee356ddefbe60830b88 Mon Sep 17 00:00:00 2001 From: Lukasz Czapnik Date: Wed, 13 Aug 2025 12:45:14 +0200 Subject: [PATCH 2218/3206] i40e: fix input validation logic for action_meta Fix condition to check 'greater or equal' to prevent OOB dereference. 
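The off-by-one is easier to see in a reduced form (illustrative sketch with made-up names, not the driver code):

/* num_tc entries are valid, indexed 0 .. num_tc - 1. */
static int pick_tc_vsi(const struct tc_channel *ch, u8 num_tc, u8 tc)
{
	if (tc >= num_tc)	/* the old "tc > num_tc" let tc == num_tc through */
		return -EINVAL;

	return ch[tc].vsi_id;	/* ch[num_tc] would be one element past the end */
}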
Fixes: e284fc280473 ("i40e: Add and delete cloud filter") Cc: stable@vger.kernel.org Signed-off-by: Lukasz Czapnik Reviewed-by: Aleksandr Loktionov Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index b6db4d78c02d4f..c85715f75435f3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3603,7 +3603,7 @@ static int i40e_validate_cloud_filter(struct i40e_vf *vf, /* action_meta is TC number here to which the filter is applied */ if (!tc_filter->action_meta || - tc_filter->action_meta > vf->num_tc) { + tc_filter->action_meta >= vf->num_tc) { dev_info(&pf->pdev->dev, "VF %d: Invalid TC number %u\n", vf->vf_id, tc_filter->action_meta); goto err; From 877b7e6ffc23766448236e8732254534c518ba42 Mon Sep 17 00:00:00 2001 From: Lukasz Czapnik Date: Wed, 13 Aug 2025 12:45:15 +0200 Subject: [PATCH 2219/3206] i40e: fix validation of VF state in get resources VF state I40E_VF_STATE_ACTIVE is not the only state in which VF is actually active so it should not be used to determine if a VF is allowed to obtain resources. Use I40E_VF_STATE_RESOURCES_LOADED that is set only in i40e_vc_get_vf_resources_msg() and cleared during reset. Fixes: 61125b8be85d ("i40e: Fix failed opcode appearing if handling messages from VF") Cc: stable@vger.kernel.org Signed-off-by: Lukasz Czapnik Reviewed-by: Aleksandr Loktionov Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 7 ++++++- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index c85715f75435f3..5ef3dc43a8a07a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1464,6 +1464,7 @@ static void i40e_trigger_vf_reset(struct i40e_vf *vf, bool flr) * functions that may still be running at this point. */ clear_bit(I40E_VF_STATE_INIT, &vf->vf_states); + clear_bit(I40E_VF_STATE_RESOURCES_LOADED, &vf->vf_states); /* In the case of a VFLR, the HW has already reset the VF and we * just need to clean up, so don't hit the VFRTRIG register. 
@@ -2130,7 +2131,10 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) size_t len = 0; int ret; - if (!i40e_sync_vf_state(vf, I40E_VF_STATE_INIT)) { + i40e_sync_vf_state(vf, I40E_VF_STATE_INIT); + + if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states) || + test_bit(I40E_VF_STATE_RESOURCES_LOADED, &vf->vf_states)) { aq_ret = -EINVAL; goto err; } @@ -2233,6 +2237,7 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) vf->default_lan_addr.addr); } set_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states); + set_bit(I40E_VF_STATE_RESOURCES_LOADED, &vf->vf_states); err: /* send the response back to the VF */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 5cf74f16f433f3..f558b45725c816 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -41,7 +41,8 @@ enum i40e_vf_states { I40E_VF_STATE_MC_PROMISC, I40E_VF_STATE_UC_PROMISC, I40E_VF_STATE_PRE_ENABLE, - I40E_VF_STATE_RESETTING + I40E_VF_STATE_RESETTING, + I40E_VF_STATE_RESOURCES_LOADED, }; /* VF capabilities */ From cb79fa7118c150c3c76a327894bb2eb878c02619 Mon Sep 17 00:00:00 2001 From: Lukasz Czapnik Date: Wed, 13 Aug 2025 12:45:16 +0200 Subject: [PATCH 2220/3206] i40e: add max boundary check for VF filters There is no check for max filters that VF can request. Add it. Fixes: e284fc280473 ("i40e: Add and delete cloud filter") Cc: stable@vger.kernel.org Signed-off-by: Lukasz Czapnik Reviewed-by: Aleksandr Loktionov Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 5ef3dc43a8a07a..f29941c0034299 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3905,6 +3905,8 @@ static int i40e_vc_del_cloud_filter(struct i40e_vf *vf, u8 *msg) aq_ret); } +#define I40E_MAX_VF_CLOUD_FILTER 0xFF00 + /** * i40e_vc_add_cloud_filter * @vf: pointer to the VF info @@ -3944,6 +3946,14 @@ static int i40e_vc_add_cloud_filter(struct i40e_vf *vf, u8 *msg) goto err_out; } + if (vf->num_cloud_filters >= I40E_MAX_VF_CLOUD_FILTER) { + dev_warn(&pf->pdev->dev, + "VF %d: Max number of filters reached, can't apply cloud filter\n", + vf->vf_id); + aq_ret = -ENOSPC; + goto err_out; + } + cfilter = kzalloc(sizeof(*cfilter), GFP_KERNEL); if (!cfilter) { aq_ret = -ENOMEM; From eac04428abe9f9cb203ffae4600791ea1d24eb18 Mon Sep 17 00:00:00 2001 From: Lukasz Czapnik Date: Wed, 13 Aug 2025 12:45:17 +0200 Subject: [PATCH 2221/3206] i40e: add mask to apply valid bits for itr_idx The ITR index (itr_idx) is only 2 bits wide. When constructing the register value for QINT_RQCTL, all fields are ORed together. Without masking, higher bits from itr_idx may overwrite adjacent fields in the register. Apply I40E_QINT_RQCTL_ITR_INDX_MASK to ensure only the intended bits are set. 
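The effect of the masking is easy to demonstrate in isolation (sketch only; the field is assumed here to sit at bits 12:11, i.e. a shift of 11, which should be checked against the actual i40e register definitions):

#include <linux/bitfield.h>

#define ITR_INDX_MASK	GENMASK(12, 11)	/* assumed 2-bit field at bits 12:11 */

u32 bad  = 0x7 << 11;				/* 0x3800: bit 13 leaks into the adjacent field */
u32 good = FIELD_PREP(ITR_INDX_MASK, 0x7);	/* 0x1800: value confined to bits 12:11 */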
Fixes: 5c3c48ac6bf5 ("i40e: implement virtual device interface") Cc: stable@vger.kernel.org Signed-off-by: Lukasz Czapnik Reviewed-by: Aleksandr Loktionov Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index f29941c0034299..f9b2197f09425f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -448,7 +448,7 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id, (qtype << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | (pf_queue_id << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | BIT(I40E_QINT_RQCTL_CAUSE_ENA_SHIFT) | - (itr_idx << I40E_QINT_RQCTL_ITR_INDX_SHIFT); + FIELD_PREP(I40E_QINT_RQCTL_ITR_INDX_MASK, itr_idx); wr32(hw, reg_idx, reg); } From b99dd77076bd3fddac6f7f1cbfa081c38fde17f5 Mon Sep 17 00:00:00 2001 From: Lukasz Czapnik Date: Wed, 13 Aug 2025 12:45:18 +0200 Subject: [PATCH 2222/3206] i40e: improve VF MAC filters accounting When adding a new VM MAC, the driver checks only *active* filters in vsi->mac_filter_hash. Each MAC, even in a non-active state, is using resources. To determine the number of MACs the VM uses, count VSI filters in *any* state. Add i40e_count_all_filters() to simply count all filters, and rename i40e_count_filters() to i40e_count_active_filters() to avoid ambiguity. Fixes: cfb1d572c986 ("i40e: Add ensurance of MacVlan resources for every trusted VF") Cc: stable@vger.kernel.org Signed-off-by: Lukasz Czapnik Reviewed-by: Aleksandr Loktionov Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 3 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 26 ++++++-- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 65 ++++++++----------- 3 files changed, 50 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 49aa4497efce7d..801a57a925dadc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1278,7 +1278,8 @@ struct i40e_mac_filter *i40e_add_mac_filter(struct i40e_vsi *vsi, const u8 *macaddr); int i40e_del_mac_filter(struct i40e_vsi *vsi, const u8 *macaddr); bool i40e_is_vsi_in_vlan(struct i40e_vsi *vsi); -int i40e_count_filters(struct i40e_vsi *vsi); +int i40e_count_all_filters(struct i40e_vsi *vsi); +int i40e_count_active_filters(struct i40e_vsi *vsi); struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, const u8 *macaddr); void i40e_vlan_stripping_enable(struct i40e_vsi *vsi); static inline bool i40e_is_sw_dcb(struct i40e_pf *pf) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index b14019d44b5811..529d5501baacad 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1243,12 +1243,30 @@ void i40e_update_stats(struct i40e_vsi *vsi) } /** - * i40e_count_filters - counts VSI mac filters + * i40e_count_all_filters - counts VSI MAC filters * @vsi: the VSI to be searched * - * Returns count of mac filters - **/ -int i40e_count_filters(struct i40e_vsi *vsi) + * Return: count of MAC filters in any state.
+ */ +int i40e_count_all_filters(struct i40e_vsi *vsi) +{ + struct i40e_mac_filter *f; + struct hlist_node *h; + int bkt, cnt = 0; + + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) + cnt++; + + return cnt; +} + +/** + * i40e_count_active_filters - counts VSI MAC filters + * @vsi: the VSI to be searched + * + * Return: count of active MAC filters. + */ +int i40e_count_active_filters(struct i40e_vsi *vsi) { struct i40e_mac_filter *f; struct hlist_node *h; diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index f9b2197f09425f..081a4526a2f000 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2862,24 +2862,6 @@ static int i40e_vc_get_stats_msg(struct i40e_vf *vf, u8 *msg) (u8 *)&stats, sizeof(stats)); } -/** - * i40e_can_vf_change_mac - * @vf: pointer to the VF info - * - * Return true if the VF is allowed to change its MAC filters, false otherwise - */ -static bool i40e_can_vf_change_mac(struct i40e_vf *vf) -{ - /* If the VF MAC address has been set administratively (via the - * ndo_set_vf_mac command), then deny permission to the VF to - * add/delete unicast MAC addresses, unless the VF is trusted - */ - if (vf->pf_set_mac && !vf->trusted) - return false; - - return true; -} - #define I40E_MAX_MACVLAN_PER_HW 3072 #define I40E_MAX_MACVLAN_PER_PF(num_ports) (I40E_MAX_MACVLAN_PER_HW / \ (num_ports)) @@ -2918,8 +2900,10 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, struct i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = pf->vsi[vf->lan_vsi_idx]; struct i40e_hw *hw = &pf->hw; - int mac2add_cnt = 0; - int i; + int i, mac_add_max, mac_add_cnt = 0; + bool vf_trusted; + + vf_trusted = test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); for (i = 0; i < al->num_elements; i++) { struct i40e_mac_filter *f; @@ -2939,9 +2923,8 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, * The VF may request to set the MAC address filter already * assigned to it so do not return an error in that case. */ - if (!i40e_can_vf_change_mac(vf) && - !is_multicast_ether_addr(addr) && - !ether_addr_equal(addr, vf->default_lan_addr.addr)) { + if (!vf_trusted && !is_multicast_ether_addr(addr) && + vf->pf_set_mac && !ether_addr_equal(addr, vf->default_lan_addr.addr)) { dev_err(&pf->pdev->dev, "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); return -EPERM; @@ -2950,29 +2933,33 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, /*count filters that really will be added*/ f = i40e_find_mac(vsi, addr); if (!f) - ++mac2add_cnt; + ++mac_add_cnt; } /* If this VF is not privileged, then we can't add more than a limited - * number of addresses. Check to make sure that the additions do not - * push us over the limit. - */ - if (!test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) { - if ((i40e_count_filters(vsi) + mac2add_cnt) > - I40E_VC_MAX_MAC_ADDR_PER_VF) { - dev_err(&pf->pdev->dev, - "Cannot add more MAC addresses, VF is not trusted, switch the VF to trusted to add more functionality\n"); - return -EPERM; - } - /* If this VF is trusted, it can use more resources than untrusted. + * number of addresses. + * + * If this VF is trusted, it can use more resources than untrusted. * However to ensure that every trusted VF has appropriate number of * resources, divide whole pool of resources per port and then across * all VFs. 
*/ - } else { - if ((i40e_count_filters(vsi) + mac2add_cnt) > - I40E_VC_MAX_MACVLAN_PER_TRUSTED_VF(pf->num_alloc_vfs, - hw->num_ports)) { + if (!vf_trusted) + mac_add_max = I40E_VC_MAX_MAC_ADDR_PER_VF; + else + mac_add_max = I40E_VC_MAX_MACVLAN_PER_TRUSTED_VF(pf->num_alloc_vfs, hw->num_ports); + + /* VF can replace all its filters in one step, in this case mac_add_max + * will be added as active and another mac_add_max will be in + * a to-be-removed state. Account for that. + */ + if ((i40e_count_active_filters(vsi) + mac_add_cnt) > mac_add_max || + (i40e_count_all_filters(vsi) + mac_add_cnt) > 2 * mac_add_max) { + if (!vf_trusted) { + dev_err(&pf->pdev->dev, + "Cannot add more MAC addresses, VF is not trusted, switch the VF to trusted to add more functionality\n"); + return -EPERM; + } else { dev_err(&pf->pdev->dev, "Cannot add more MAC addresses, trusted VF exhausted it's resources\n"); return -EPERM; From 70ddf86d76c1a560095c397c01b0862fe302b500 Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Wed, 13 Aug 2025 10:18:55 -0500 Subject: [PATCH 2223/3206] riscv: sbi: Switch to new sys-off handler API The kernel now supports chained power-off handlers. Use register_platform_power_off(), which registers a platform-level power-off handler. The legacy pm_power_off() will be removed once all drivers and archs are converted to the new sys-off API. Signed-off-by: Andrew Davis Tested-by: Alexandre Ghiti Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250813151855.105237-1-afd@ti.com Signed-off-by: Paul Walmsley --- arch/riscv/kernel/sbi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 53836a9235e320..5e8cde05526435 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -148,7 +148,7 @@ static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, static void sbi_set_power_off(void) { - pm_power_off = sbi_shutdown; + register_platform_power_off(sbi_shutdown); } #else static void __sbi_set_timer_v01(uint64_t stime_value) @@ -682,7 +682,7 @@ void __init sbi_init(void) if (sbi_spec_version >= sbi_mk_version(0, 3) && sbi_probe_extension(SBI_EXT_SRST)) { pr_info("SBI SRST extension detected\n"); - pm_power_off = sbi_srst_power_off; + register_platform_power_off(sbi_srst_power_off); sbi_srst_reboot_nb.notifier_call = sbi_srst_reboot; sbi_srst_reboot_nb.priority = 192; register_restart_handler(&sbi_srst_reboot_nb); From 182edc05b087f58245d5481e2d4adb231a45a903 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 19 Sep 2025 08:54:43 +0900 Subject: [PATCH 2224/3206] firewire: core: remove useless generation check Two functions, fw_core_handle_bus_reset() and bm_work(), are serialized by commit 3d91fd440cc7 ("firewire: core: disable bus management work temporarily during updating topology"). Therefore the generation member of fw_card is immutable in bm_work(). This commit removes the useless generation check in bm_work().
Link: https://lore.kernel.org/r/20250918235448.129705-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 4fcd5ce4b2cef2..ef00125fb01ac8 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -362,14 +362,12 @@ static void bm_work(struct work_struct *work) if (rcode == RCODE_COMPLETE) { int bm_id = be32_to_cpu(data[0]); - if (generation == card->generation) { - // Used by cdev layer for "struct fw_cdev_event_bus_reset". - scoped_guard(spinlock, &card->lock) { - if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) - card->bm_node_id = 0xffc0 & bm_id; - else - card->bm_node_id = local_id; - } + // Used by cdev layer for "struct fw_cdev_event_bus_reset". + scoped_guard(spinlock, &card->lock) { + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) + card->bm_node_id = 0xffc0 & bm_id; + else + card->bm_node_id = local_id; } if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) { From 52561ebfae9dc9871d6ca2c9e72a4a4e246c4476 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 19 Sep 2025 08:54:44 +0900 Subject: [PATCH 2225/3206] firewire: core: use switch statement to evaluate transaction result to CSR_BUS_MANAGER_ID The result of the lock transaction to swap the bus manager on the isochronous resource manager is evaluated in an ad-hoc style, which is hard to read. This commit uses a switch statement to evaluate the result. Link: https://lore.kernel.org/r/20250918235448.129705-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 50 +++++++++++++++++------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index ef00125fb01ac8..e9bf8d93f5b781 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -355,11 +355,18 @@ static void bm_work(struct work_struct *work) CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, data, sizeof(data)); - // Another bus reset, BM work has been rescheduled. - if (rcode == RCODE_GENERATION) + switch (rcode) { + case RCODE_GENERATION: + // Another bus reset, BM work has been rescheduled. return; - - if (rcode == RCODE_COMPLETE) { + case RCODE_SEND_ERROR: + // We have been unable to send the lock request due to + // some local problem. Let's try again later and hope + // that the problem has gone away by then. + fw_schedule_bm_work(card, msecs_to_jiffies(125)); + return; + case RCODE_COMPLETE: + { int bm_id = be32_to_cpu(data[0]); // Used by cdev layer for "struct fw_cdev_event_bus_reset". @@ -376,29 +383,20 @@ static void bm_work(struct work_struct *work) allocate_broadcast_channel(card, generation); return; } + break; } - - if (rcode == RCODE_SEND_ERROR) { - /* - * We have been unable to send the lock request due to - * some local problem. Let's try again later and hope - * that the problem has gone away by then. - */ - fw_schedule_bm_work(card, msecs_to_jiffies(125)); - return; - } - - if (rcode != RCODE_COMPLETE && !keep_this_irm) { - /* - * The lock request failed, maybe the IRM - * isn't really IRM capable after all. Let's - * do a bus reset and pick the local node as - * root, and thus, IRM.
- */ - new_root_id = local_id; - fw_notice(card, "BM lock failed (%s), making local node (%02x) root\n", - fw_rcode_string(rcode), new_root_id); - goto pick_me; + default: + if (!keep_this_irm) { + // The lock request failed, maybe the IRM + // isn't really IRM capable after all. Let's + // do a bus reset and pick the local node as + // root, and thus, IRM. + new_root_id = local_id; + fw_notice(card, "BM lock failed (%s), making local node (%02x) root\n", + fw_rcode_string(rcode), new_root_id); + goto pick_me; + } + break; } } else if (card->bm_generation != generation) { /* From e31b990cafd49a8c56eac55094c1a783f5826b47 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 19 Sep 2025 08:54:45 +0900 Subject: [PATCH 2226/3206] firewire: core: code refactoring for the case of generation mismatch The current implementation stores the bus generation at which the bus manager contending procedure finishes. The condition for the procedure is a mismatch of the stored generation against the current bus generation. This commit refactors the code for the contending procedure. Two existing branches are put into a new branch that detects the generation mismatch, thus most of the change is indentation. Link: https://lore.kernel.org/r/20250918235448.129705-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 188 +++++++++++++++++------------------ 1 file changed, 93 insertions(+), 95 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index e9bf8d93f5b781..02058af705cc3f 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -290,7 +290,7 @@ static void bm_work(struct work_struct *work) struct fw_card *card __free(card_unref) = from_work(card, work, bm_work.work); struct fw_node *root_node __free(node_unref) = NULL; int root_id, new_root_id, irm_id, local_id; - int expected_gap_count, generation, grace; + int expected_gap_count, generation; bool do_reset = false; if (card->local_node == NULL) @@ -304,107 +304,107 @@ static void bm_work(struct work_struct *work) irm_id = card->irm_node->node_id; local_id = card->local_node->node_id; - grace = time_is_before_jiffies64(card->reset_jiffies + msecs_to_jiffies(125)); - - if ((is_next_generation(generation, card->bm_generation) && - !card->bm_abdicate) || - (card->bm_generation != generation && grace)) { - /* - * This first step is to figure out who is IRM and - * then try to become bus manager. If the IRM is not - * well defined (e.g. does not have an active link - * layer or does not responds to our lock request, we - * will have to do a little vigilante bus management. - * In that case, we do a goto into the gap count logic - * so that when we do the reset, we still optimize the - * gap count. That could well save a reset in the - * next generation. - */ - __be32 data[2] = { - cpu_to_be32(BUS_MANAGER_ID_NOT_REGISTERED), - cpu_to_be32(local_id), - }; - struct fw_device *irm_device = fw_node_get_device(card->irm_node); - bool irm_is_1394_1995_only = false; - bool keep_this_irm = false; - int rcode; - - if (!card->irm_node->link_on) { - new_root_id = local_id; - fw_notice(card, "%s, making local node (%02x) root\n", - "IRM has link off", new_root_id); - goto pick_me; - } - - if (irm_device && irm_device->config_rom) { - irm_is_1394_1995_only = (irm_device->config_rom[2] & 0x000000f0) == 0; - - // Canon MV5i works unreliably if it is not root node.
- keep_this_irm = irm_device->config_rom[3] >> 8 == CANON_OUI; - } + if (card->bm_generation != generation) { + bool grace = time_is_before_jiffies64(card->reset_jiffies + msecs_to_jiffies(125)); + + if (grace || + (is_next_generation(generation, card->bm_generation) && !card->bm_abdicate)) { + // This first step is to figure out who is IRM and + // then try to become bus manager. If the IRM is not + // well defined (e.g. does not have an active link + // layer or does not responds to our lock request, we + // will have to do a little vigilante bus management. + // In that case, we do a goto into the gap count logic + // so that when we do the reset, we still optimize the + // gap count. That could well save a reset in the + // next generation. + __be32 data[2] = { + cpu_to_be32(BUS_MANAGER_ID_NOT_REGISTERED), + cpu_to_be32(local_id), + }; + struct fw_device *irm_device = fw_node_get_device(card->irm_node); + bool irm_is_1394_1995_only = false; + bool keep_this_irm = false; + int rcode; + + if (!card->irm_node->link_on) { + new_root_id = local_id; + fw_notice(card, "%s, making local node (%02x) root\n", + "IRM has link off", new_root_id); + goto pick_me; + } - if (irm_is_1394_1995_only && !keep_this_irm) { - new_root_id = local_id; - fw_notice(card, "%s, making local node (%02x) root\n", - "IRM is not 1394a compliant", new_root_id); - goto pick_me; - } + if (irm_device && irm_device->config_rom) { + irm_is_1394_1995_only = (irm_device->config_rom[2] & 0x000000f0) == 0; - rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, - irm_id, generation, SCODE_100, - CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, - data, sizeof(data)); + // Canon MV5i works unreliably if it is not root node. + keep_this_irm = irm_device->config_rom[3] >> 8 == CANON_OUI; + } - switch (rcode) { - case RCODE_GENERATION: - // Another bus reset, BM work has been rescheduled. - return; - case RCODE_SEND_ERROR: - // We have been unable to send the lock request due to - // some local problem. Let's try again later and hope - // that the problem has gone away by then. - fw_schedule_bm_work(card, msecs_to_jiffies(125)); - return; - case RCODE_COMPLETE: - { - int bm_id = be32_to_cpu(data[0]); - - // Used by cdev layer for "struct fw_cdev_event_bus_reset". - scoped_guard(spinlock, &card->lock) { - if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) - card->bm_node_id = 0xffc0 & bm_id; - else - card->bm_node_id = local_id; + if (irm_is_1394_1995_only && !keep_this_irm) { + new_root_id = local_id; + fw_notice(card, "%s, making local node (%02x) root\n", + "IRM is not 1394a compliant", new_root_id); + goto pick_me; } - if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) { - // Somebody else is BM. Only act as IRM. - if (local_id == irm_id) - allocate_broadcast_channel(card, generation); + rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, + irm_id, generation, SCODE_100, + CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, + data, sizeof(data)); + + switch (rcode) { + case RCODE_GENERATION: + // Another bus reset, BM work has been rescheduled. return; + case RCODE_SEND_ERROR: + // We have been unable to send the lock request due to + // some local problem. Let's try again later and hope + // that the problem has gone away by then. + fw_schedule_bm_work(card, msecs_to_jiffies(125)); + return; + case RCODE_COMPLETE: + { + int bm_id = be32_to_cpu(data[0]); + + // Used by cdev layer for "struct fw_cdev_event_bus_reset". 
+ scoped_guard(spinlock, &card->lock) { + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) + card->bm_node_id = 0xffc0 & bm_id; + else + card->bm_node_id = local_id; + } + + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) { + // Somebody else is BM. Only act as IRM. + if (local_id == irm_id) + allocate_broadcast_channel(card, generation); + return; + } + break; } - default: - if (!keep_this_irm) { - // The lock request failed, maybe the IRM - // isn't really IRM capable after all. Let's - // do a bus reset and pick the local node as - // root, and thus, IRM. - new_root_id = local_id; - fw_notice(card, "BM lock failed (%s), making local node (%02x) root\n", - fw_rcode_string(rcode), new_root_id); - goto pick_me; + default: + if (!keep_this_irm) { + // The lock request failed, maybe the IRM + // isn't really IRM capable after all. Let's + // do a bus reset and pick the local node as + // root, and thus, IRM. + new_root_id = local_id; + fw_notice(card, "BM lock failed (%s), making local node (%02x) root\n", + fw_rcode_string(rcode), new_root_id); + goto pick_me; + } + break; } - break; + + // A node contends for bus manager in this generation. + card->bm_generation = generation; + } else { + // We weren't BM in the last generation, and the last + // bus reset is less than 125ms ago. Reschedule this job. + fw_schedule_bm_work(card, msecs_to_jiffies(125)); + return; } - } else if (card->bm_generation != generation) { - /* - * We weren't BM in the last generation, and the last - * bus reset is less than 125ms ago. Reschedule this job. - */ - fw_schedule_bm_work(card, msecs_to_jiffies(125)); - return; } /* @@ -412,8 +412,6 @@ static void bm_work(struct work_struct *work) * make sure we have an active cycle master and do gap count * optimization. */ - card->bm_generation = generation; - if (card->gap_count == GAP_COUNT_MISMATCHED) { /* * If self IDs have inconsistent gap counts, do a From 4ff62194d3739c8c7c2d2f365cbc069753387d79 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 19 Sep 2025 08:54:46 +0900 Subject: [PATCH 2227/3206] firewire: core: code refactoring to split contention procedure for bus manager The procedure to contend for bus manager involves a lot of code. It is better to split it into a helper function. This commit refactors it accordingly. Link: https://lore.kernel.org/r/20250918235448.129705-5-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 223 ++++++++++++++++++++--------------- 1 file changed, 127 insertions(+), 96 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 02058af705cc3f..6268b595d4fa0d 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -279,6 +279,102 @@ void fw_schedule_bm_work(struct fw_card *card, unsigned long delay) fw_card_put(card); } +enum bm_contention_outcome { + // The bus management contention window is not expired. + BM_CONTENTION_OUTCOME_WITHIN_WINDOW = 0, + // The IRM node has link off. + BM_CONTENTION_OUTCOME_IRM_HAS_LINK_OFF, + // The IRM node complies IEEE 1394:1994 only. + BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY, + // Another bus reset, BM work has been rescheduled. + BM_CONTENTION_OUTCOME_AT_NEW_GENERATION, + // We have been unable to send the lock request to IRM node due to some local problem. + BM_CONTENTION_OUTCOME_LOCAL_PROBLEM_AT_TRANSACTION, + // The lock request failed, maybe the IRM isn't really IRM capable after all. + BM_CONTENTION_OUTCOME_IRM_IS_NOT_CAPABLE_FOR_IRM, + // Somebody else is BM.
+ BM_CONTENTION_OUTCOME_IRM_HOLDS_ANOTHER_NODE_AS_BM, + // The local node succeeds after contending for bus manager. + BM_CONTENTION_OUTCOME_IRM_HOLDS_LOCAL_NODE_AS_BM, +}; + +static enum bm_contention_outcome contend_for_bm(struct fw_card *card) +{ + int generation = card->generation; + int local_id = card->local_node->node_id; + __be32 data[2] = { + cpu_to_be32(BUS_MANAGER_ID_NOT_REGISTERED), + cpu_to_be32(local_id), + }; + bool grace = time_is_before_jiffies64(card->reset_jiffies + msecs_to_jiffies(125)); + bool irm_is_1394_1995_only = false; + bool keep_this_irm = false; + struct fw_node *irm_node; + struct fw_device *irm_device; + int rcode; + + if (!grace) { + if (!is_next_generation(generation, card->bm_generation) || card->bm_abdicate) + return BM_CONTENTION_OUTCOME_WITHIN_WINDOW; + } + + irm_node = card->irm_node; + if (!irm_node->link_on) { + fw_notice(card, "IRM has link off, making local node (%02x) root\n", local_id); + return BM_CONTENTION_OUTCOME_IRM_HAS_LINK_OFF; + } + + irm_device = fw_node_get_device(irm_node); + if (irm_device && irm_device->config_rom) { + irm_is_1394_1995_only = (irm_device->config_rom[2] & 0x000000f0) == 0; + + // Canon MV5i works unreliably if it is not root node. + keep_this_irm = irm_device->config_rom[3] >> 8 == CANON_OUI; + } + + if (irm_is_1394_1995_only && !keep_this_irm) { + fw_notice(card, "IRM is not 1394a compliant, making local node (%02x) root\n", + local_id); + return BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY; + } + + rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, irm_node->node_id, generation, + SCODE_100, CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, data, + sizeof(data)); + + switch (rcode) { + case RCODE_GENERATION: + return BM_CONTENTION_OUTCOME_AT_NEW_GENERATION; + case RCODE_SEND_ERROR: + return BM_CONTENTION_OUTCOME_LOCAL_PROBLEM_AT_TRANSACTION; + case RCODE_COMPLETE: + { + int bm_id = be32_to_cpu(data[0]); + + // Used by cdev layer for "struct fw_cdev_event_bus_reset". + scoped_guard(spinlock, &card->lock) { + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) + card->bm_node_id = 0xffc0 & bm_id; + else + card->bm_node_id = local_id; + } + + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) + return BM_CONTENTION_OUTCOME_IRM_HOLDS_ANOTHER_NODE_AS_BM; + else + return BM_CONTENTION_OUTCOME_IRM_HOLDS_LOCAL_NODE_AS_BM; + } + default: + if (!keep_this_irm) { + fw_notice(card, "BM lock failed (%s), making local node (%02x) root\n", + fw_rcode_string(rcode), local_id); + return BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY; + } else { + return BM_CONTENTION_OUTCOME_IRM_IS_NOT_CAPABLE_FOR_IRM; + } + } +} + DEFINE_FREE(node_unref, struct fw_node *, if (_T) fw_node_put(_T)) DEFINE_FREE(card_unref, struct fw_card *, if (_T) fw_card_put(_T)) @@ -305,105 +401,40 @@ static void bm_work(struct work_struct *work) local_id = card->local_node->node_id; if (card->bm_generation != generation) { - bool grace = time_is_before_jiffies64(card->reset_jiffies + msecs_to_jiffies(125)); - - if (grace || - (is_next_generation(generation, card->bm_generation) && !card->bm_abdicate)) { - // This first step is to figure out who is IRM and - // then try to become bus manager. If the IRM is not - // well defined (e.g. does not have an active link - // layer or does not responds to our lock request, we - // will have to do a little vigilante bus management. - // In that case, we do a goto into the gap count logic - // so that when we do the reset, we still optimize the - // gap count. That could well save a reset in the - // next generation. 
- __be32 data[2] = { - cpu_to_be32(BUS_MANAGER_ID_NOT_REGISTERED), - cpu_to_be32(local_id), - }; - struct fw_device *irm_device = fw_node_get_device(card->irm_node); - bool irm_is_1394_1995_only = false; - bool keep_this_irm = false; - int rcode; - - if (!card->irm_node->link_on) { - new_root_id = local_id; - fw_notice(card, "%s, making local node (%02x) root\n", - "IRM has link off", new_root_id); - goto pick_me; - } + enum bm_contention_outcome result = contend_for_bm(card); - if (irm_device && irm_device->config_rom) { - irm_is_1394_1995_only = (irm_device->config_rom[2] & 0x000000f0) == 0; - - // Canon MV5i works unreliably if it is not root node. - keep_this_irm = irm_device->config_rom[3] >> 8 == CANON_OUI; - } - - if (irm_is_1394_1995_only && !keep_this_irm) { - new_root_id = local_id; - fw_notice(card, "%s, making local node (%02x) root\n", - "IRM is not 1394a compliant", new_root_id); - goto pick_me; - } - - rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, - irm_id, generation, SCODE_100, - CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, - data, sizeof(data)); - - switch (rcode) { - case RCODE_GENERATION: - // Another bus reset, BM work has been rescheduled. - return; - case RCODE_SEND_ERROR: - // We have been unable to send the lock request due to - // some local problem. Let's try again later and hope - // that the problem has gone away by then. - fw_schedule_bm_work(card, msecs_to_jiffies(125)); - return; - case RCODE_COMPLETE: - { - int bm_id = be32_to_cpu(data[0]); - - // Used by cdev layer for "struct fw_cdev_event_bus_reset". - scoped_guard(spinlock, &card->lock) { - if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) - card->bm_node_id = 0xffc0 & bm_id; - else - card->bm_node_id = local_id; - } - - if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) { - // Somebody else is BM. Only act as IRM. - if (local_id == irm_id) - allocate_broadcast_channel(card, generation); - return; - } - break; - } - default: - if (!keep_this_irm) { - // The lock request failed, maybe the IRM - // isn't really IRM capable after all. Let's - // do a bus reset and pick the local node as - // root, and thus, IRM. - new_root_id = local_id; - fw_notice(card, "BM lock failed (%s), making local node (%02x) root\n", - fw_rcode_string(rcode), new_root_id); - goto pick_me; - } - break; - } - - // A node contends for bus manager in this generation. - card->bm_generation = generation; - } else { - // We weren't BM in the last generation, and the last - // bus reset is less than 125ms ago. Reschedule this job. + switch (result) { + case BM_CONTENTION_OUTCOME_WITHIN_WINDOW: fw_schedule_bm_work(card, msecs_to_jiffies(125)); return; + case BM_CONTENTION_OUTCOME_IRM_HAS_LINK_OFF: + new_root_id = local_id; + goto pick_me; + case BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY: + new_root_id = local_id; + goto pick_me; + case BM_CONTENTION_OUTCOME_AT_NEW_GENERATION: + // BM work has been rescheduled. + return; + case BM_CONTENTION_OUTCOME_LOCAL_PROBLEM_AT_TRANSACTION: + // Let's try again later and hope that the local problem has gone away by + // then. + fw_schedule_bm_work(card, msecs_to_jiffies(125)); + return; + case BM_CONTENTION_OUTCOME_IRM_IS_NOT_CAPABLE_FOR_IRM: + // Let's do a bus reset and pick the local node as root, and thus, IRM. + new_root_id = local_id; + goto pick_me; + case BM_CONTENTION_OUTCOME_IRM_HOLDS_ANOTHER_NODE_AS_BM: + if (local_id == irm_id) { + // Only acts as IRM. 
+ allocate_broadcast_channel(card, generation); + } + fallthrough; + case BM_CONTENTION_OUTCOME_IRM_HOLDS_LOCAL_NODE_AS_BM: + default: + card->bm_generation = generation; + break; } } From 9192c37929ff26eae7ab4e82ddb4af54ed73c6e2 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 19 Sep 2025 08:54:47 +0900 Subject: [PATCH 2228/3206] firewire: core: eliminate pick_me goto label This commit uses conditional statements instead of the pick_me goto label. Link: https://lore.kernel.org/r/20250918235448.129705-6-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 101 ++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 50 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 6268b595d4fa0d..58d1f58a4a0f1a 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -388,6 +388,7 @@ static void bm_work(struct work_struct *work) int root_id, new_root_id, irm_id, local_id; int expected_gap_count, generation; bool do_reset = false; + bool stand_for_root = false; if (card->local_node == NULL) return; @@ -408,11 +409,11 @@ static void bm_work(struct work_struct *work) fw_schedule_bm_work(card, msecs_to_jiffies(125)); return; case BM_CONTENTION_OUTCOME_IRM_HAS_LINK_OFF: - new_root_id = local_id; - goto pick_me; + stand_for_root = true; + break; case BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY: - new_root_id = local_id; - goto pick_me; + stand_for_root = true; + break; case BM_CONTENTION_OUTCOME_AT_NEW_GENERATION: // BM work has been rescheduled. return; @@ -423,8 +424,8 @@ static void bm_work(struct work_struct *work) return; case BM_CONTENTION_OUTCOME_IRM_IS_NOT_CAPABLE_FOR_IRM: // Let's do a bus reset and pick the local node as root, and thus, IRM. - new_root_id = local_id; - goto pick_me; + stand_for_root = true; + break; case BM_CONTENTION_OUTCOME_IRM_HOLDS_ANOTHER_NODE_AS_BM: if (local_id == irm_id) { // Only acts as IRM. @@ -438,56 +439,56 @@ static void bm_work(struct work_struct *work) } } - /* - * We're bus manager for this generation, so next step is to - * make sure we have an active cycle master and do gap count - * optimization. - */ - if (card->gap_count == GAP_COUNT_MISMATCHED) { - /* - * If self IDs have inconsistent gap counts, do a - * bus reset ASAP. The config rom read might never - * complete, so don't wait for it. However, still - * send a PHY configuration packet prior to the - * bus reset. The PHY configuration packet might - * fail, but 1394-2008 8.4.5.2 explicitly permits - * it in this case, so it should be safe to try. - */ - new_root_id = local_id; - /* - * We must always send a bus reset if the gap count - * is inconsistent, so bypass the 5-reset limit. - */ - card->bm_retries = 0; - } else { - // Now investigate root node. - struct fw_device *root_device = fw_node_get_device(root_node); - - if (root_device == NULL) { - // Either link_on is false, or we failed to read the - // config rom. In either case, pick another root. - new_root_id = local_id; + // We're bus manager for this generation, so next step is to make sure we have an active + // cycle master and do gap count optimization. + if (!stand_for_root) { + if (card->gap_count == GAP_COUNT_MISMATCHED) { + // If self IDs have inconsistent gap counts, do a + // bus reset ASAP. The config rom read might never + // complete, so don't wait for it. However, still + // send a PHY configuration packet prior to the + // bus reset.
The PHY configuration packet might + // fail, but 1394-2008 8.4.5.2 explicitly permits + // it in this case, so it should be safe to try. + stand_for_root = true; + + // We must always send a bus reset if the gap count + // is inconsistent, so bypass the 5-reset limit. + card->bm_retries = 0; } else { - bool root_device_is_running = - atomic_read(&root_device->state) == FW_DEVICE_RUNNING; + // Now investigate root node. + struct fw_device *root_device = fw_node_get_device(root_node); - if (!root_device_is_running) { - // If we haven't probed this device yet, bail out now - // and let's try again once that's done. - return; - } else if (root_device->cmc) { - // We will send out a force root packet for this - // node as part of the gap count optimization. - new_root_id = root_id; + if (root_device == NULL) { + // Either link_on is false, or we failed to read the + // config rom. In either case, pick another root. + stand_for_root = true; } else { - // Current root has an active link layer and we - // successfully read the config rom, but it's not - // cycle master capable. - new_root_id = local_id; + bool root_device_is_running = + atomic_read(&root_device->state) == FW_DEVICE_RUNNING; + + if (!root_device_is_running) { + // If we haven't probed this device yet, bail out now + // and let's try again once that's done. + return; + } else if (!root_device->cmc) { + // Current root has an active link layer and we + // successfully read the config rom, but it's not + // cycle master capable. + stand_for_root = true; + } } } } - pick_me: + + if (stand_for_root) { + new_root_id = local_id; + } else { + // We will send out a force root packet for this node as part of the gap count + // optimization on behalf of the node. + new_root_id = root_id; + } + /* * Pick a gap count from 1394a table E-1. The table doesn't cover * the typically much larger 1394b beta repeater delays though. From 19e73f65940d3d3357c637f3d7e19a59305a748f Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 19 Sep 2025 08:54:48 +0900 Subject: [PATCH 2229/3206] firewire: core: minor code refactoring to delete useless local variable The do_reset local variable has less merit. Let's remove it. Link: https://lore.kernel.org/r/20250918235448.129705-7-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 58d1f58a4a0f1a..4a5459696093a1 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -387,7 +387,6 @@ static void bm_work(struct work_struct *work) struct fw_node *root_node __free(node_unref) = NULL; int root_id, new_root_id, irm_id, local_id; int expected_gap_count, generation; - bool do_reset = false; bool stand_for_root = false; if (card->local_node == NULL) @@ -499,16 +498,10 @@ static void bm_work(struct work_struct *work) else expected_gap_count = 63; - /* - * Finally, figure out if we should do a reset or not. If we have - * done less than 5 resets with the same physical topology and we - * have either a new root or a new gap count setting, let's do it. - */ - - if (card->bm_retries++ < 5 && (card->gap_count != expected_gap_count || new_root_id != root_id)) - do_reset = true; - - if (do_reset) { + // Finally, figure out if we should do a reset or not. If we have done less than 5 resets + // with the same physical topology and we have either a new root or a new gap count + // setting, let's do it. 
+ if (card->bm_retries++ < 5 && (card->gap_count != expected_gap_count || new_root_id != root_id)) { int card_gap_count = card->gap_count; fw_notice(card, "phy config: new root=%x, gap_count=%d\n", From f68cd7ddd014b60451922207bff2ea3beea0d8cb Mon Sep 17 00:00:00 2001 From: Bala-Vignesh-Reddy Date: Fri, 15 Aug 2025 23:37:24 +0530 Subject: [PATCH 2230/3206] selftests: riscv: Add README for RISC-V KSelfTest Add a README file for RISC-V specific kernel selftests under tools/testing/selftests/riscv/. This mirrors the existing README for arm64, providing clear guidance on how the tests are architecture specific and skipped on non-riscv systems. It also includes standard make commands for building, running and installing the tests, along with a reference to general kselftest documentation. Signed-off-by: Bala-Vignesh-Reddy Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20250815180724.14459-1-reddybalavignesh9979@gmail.com Signed-off-by: Paul Walmsley --- tools/testing/selftests/riscv/README | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tools/testing/selftests/riscv/README diff --git a/tools/testing/selftests/riscv/README b/tools/testing/selftests/riscv/README new file mode 100644 index 00000000000000..443da395da6897 --- /dev/null +++ b/tools/testing/selftests/riscv/README @@ -0,0 +1,24 @@ +KSelfTest RISC-V +================ + +- These tests are riscv specific and so not built or run but just skipped + completely when env-variable ARCH is found to be different than 'riscv'. + +- Holding true the above, RISC-V KSFT tests can be run within the + KSelfTest framework using standard Linux top-level-makefile targets: + + $ make TARGETS=riscv kselftest-clean + $ make TARGETS=riscv kselftest + + or + + $ make -C tools/testing/selftests TARGETS=riscv \ + INSTALL_PATH= install + + or, alternatively, only specific riscv/ subtargets can be picked: + + $ make -C tools/testing/selftests TARGETS=riscv RISCV_SUBTARGETS="mm vector" \ + INSTALL_PATH= install + + Further details on building and running KSFT can be found in: + Documentation/dev-tools/kselftest.rst From 568a2fa10dd06bbd8160e3f8cce9483fabcb7121 Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Mon, 4 Aug 2025 10:51:10 +0800 Subject: [PATCH 2231/3206] perf: riscv: skip empty batches in counter start Avoid unnecessary SBI calls when starting non-overflowed counters in pmu_sbi_start_ovf_ctrs_sbi() by checking ctr_start_mask. 
Signed-off-by: Yunhui Cui Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250804025110.11088-1-cuiyunhui@bytedance.com Signed-off-by: Paul Walmsley --- drivers/perf/riscv_pmu_sbi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index c18dbffa983450..3fc16bbab0250b 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -877,8 +877,10 @@ static inline void pmu_sbi_start_ovf_ctrs_sbi(struct cpu_hw_events *cpu_hw_evt, for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) { ctr_start_mask = cpu_hw_evt->used_hw_ctrs[i] & ~ctr_ovf_mask; /* Start all the counters that did not overflow in a single shot */ - sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, i * BITS_PER_LONG, ctr_start_mask, - 0, 0, 0, 0); + if (ctr_start_mask) { + sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, i * BITS_PER_LONG, + ctr_start_mask, 0, 0, 0, 0); + } } /* Reinitialize and start all the counter that overflowed */ From 2e2cf5581fccc562f7faf174ffb9866fed5cafbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Tue, 27 May 2025 12:00:00 +0200 Subject: [PATCH 2232/3206] riscv: cpufeature: add validation for zfa, zfh and zfhmin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These extensions depend on the F extension. Add a validation callback checking that the F extension is present. Now that extensions are correctly reported using the F/D presence, we can remove the has_fpu() check in hwprobe_isa_ext0(). Signed-off-by: Clément Léger Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20250527100001.33284-1-cleger@rivosinc.com Signed-off-by: Paul Walmsley --- arch/riscv/kernel/cpufeature.c | 6 +++--- arch/riscv/kernel/sys_hwprobe.c | 14 ++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 743d53415572e0..67b59699357da8 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -474,10 +474,10 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(zacas, RISCV_ISA_EXT_ZACAS), __RISCV_ISA_EXT_DATA(zalrsc, RISCV_ISA_EXT_ZALRSC), __RISCV_ISA_EXT_DATA(zawrs, RISCV_ISA_EXT_ZAWRS), - __RISCV_ISA_EXT_DATA(zfa, RISCV_ISA_EXT_ZFA), + __RISCV_ISA_EXT_DATA_VALIDATE(zfa, RISCV_ISA_EXT_ZFA, riscv_ext_f_depends), __RISCV_ISA_EXT_DATA_VALIDATE(zfbfmin, RISCV_ISA_EXT_ZFBFMIN, riscv_ext_f_depends), - __RISCV_ISA_EXT_DATA(zfh, RISCV_ISA_EXT_ZFH), - __RISCV_ISA_EXT_DATA(zfhmin, RISCV_ISA_EXT_ZFHMIN), + __RISCV_ISA_EXT_DATA_VALIDATE(zfh, RISCV_ISA_EXT_ZFH, riscv_ext_f_depends), + __RISCV_ISA_EXT_DATA_VALIDATE(zfhmin, RISCV_ISA_EXT_ZFHMIN, riscv_ext_f_depends), __RISCV_ISA_EXT_DATA(zca, RISCV_ISA_EXT_ZCA), __RISCV_ISA_EXT_DATA_VALIDATE(zcb, RISCV_ISA_EXT_ZCB, riscv_ext_zca_depends), __RISCV_ISA_EXT_DATA_VALIDATE(zcd, RISCV_ISA_EXT_ZCD, riscv_ext_zcd_validate), diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 0b170e18a2beba..3e9259790816e4 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -153,14 +153,12 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, EXT_KEY(ZVKT); } - if (has_fpu()) { - EXT_KEY(ZCD); - EXT_KEY(ZCF); - EXT_KEY(ZFA); - EXT_KEY(ZFBFMIN); - EXT_KEY(ZFH); - EXT_KEY(ZFHMIN); - } + EXT_KEY(ZCD); + EXT_KEY(ZCF); + EXT_KEY(ZFA); + EXT_KEY(ZFBFMIN); + EXT_KEY(ZFH); + EXT_KEY(ZFHMIN); if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM))
 		EXT_KEY(SUPM);

From 603b4416232524dafde8e2cf859788dae786dea1 Mon Sep 17 00:00:00 2001
From: KP Singh
Date: Sun, 14 Sep 2025 23:51:30 +0200
Subject: [PATCH 2233/3206] bpf: Update the bpf_prog_calc_tag to use SHA256

Exclusive maps restrict map access to specific programs using a hash.
The current hash used for this is SHA1, which is prone to collisions.
This patch uses SHA256, which is more resilient against collisions.
This new hash is stored in bpf_prog and used by the verifier to
determine if a program can access a given exclusive map.

The original 64-bit tags are kept, as they are used by users as a
short, possibly colliding program identifier for non-security purposes.

Signed-off-by: KP Singh
Link: https://lore.kernel.org/r/20250914215141.15144-2-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 6 +++++-
 kernel/bpf/Kconfig  | 2 +-
 kernel/bpf/core.c   | 5 ++---
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 41f776071ff517..d75902074bd1f3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include

 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1717,7 +1718,10 @@ struct bpf_prog {
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
 	u32			jited_len;	/* Size of jited insns in bytes */
-	u8			tag[BPF_TAG_SIZE];
+	union {
+		u8		digest[SHA256_DIGEST_SIZE];
+		u8		tag[BPF_TAG_SIZE];
+	};
 	struct bpf_prog_stats __percpu *stats;
 	int __percpu		*active;
 	unsigned int		(*bpf_func)(const void *ctx,
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 17067dcb438610..eb3de35734f092 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -3,7 +3,7 @@
 # BPF interpreter that, for example, classic socket filters depend on.
 config BPF
 	bool
-	select CRYPTO_LIB_SHA1
+	select CRYPTO_LIB_SHA256

 # Used by archs to tell that they support BPF JIT compiler plus which
 # flavour. Only one of the two can be selected for a specific arch since
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1cda2589d4b35c..9b64674df16b88 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -39,6 +39,7 @@
 #include
 #include
 #include
+#include

 #include
 #include
@@ -296,7 +297,6 @@ void __bpf_prog_free(struct bpf_prog *fp)
 int bpf_prog_calc_tag(struct bpf_prog *fp)
 {
 	size_t size = bpf_prog_insn_size(fp);
-	u8 digest[SHA1_DIGEST_SIZE];
 	struct bpf_insn *dst;
 	bool was_ld_map;
 	u32 i;
@@ -327,8 +327,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
 			was_ld_map = false;
 		}
 	}
-	sha1((const u8 *)dst, size, digest);
-	memcpy(fp->tag, digest, sizeof(fp->tag));
+	sha256((u8 *)dst, size, fp->digest);
 	vfree(dst);
 	return 0;
 }

From baefdbdf6812e120c9fba9cfb101d3656f478026 Mon Sep 17 00:00:00 2001
From: KP Singh
Date: Sun, 14 Sep 2025 23:51:31 +0200
Subject: [PATCH 2234/3206] bpf: Implement exclusive map creation

Exclusive maps allow a map to be accessed only by a program with a
matching hash, which is specified in the excl_prog_hash attr.
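As a rough sketch (not part of this patch; it assumes a uapi header that
already carries the new fields), a userspace loader would pass the
program's SHA256 digest at map creation time:

	#include <linux/bpf.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int create_exclusive_map(const unsigned char sha256[32])
	{
		union bpf_attr attr = {};

		attr.map_type = BPF_MAP_TYPE_ARRAY;
		attr.key_size = sizeof(__u32);
		attr.value_size = sizeof(__u32);
		attr.max_entries = 1;
		/* new fields added by this patch */
		attr.excl_prog_hash = (__u64)(unsigned long)sha256;
		attr.excl_prog_hash_size = 32;	/* must be SHA256_DIGEST_SIZE */

		return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	}

A program whose SHA256 digest does not match is then rejected by the
verifier when it tries to use the map (see the verifier hunk below).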
For the signing use-case, this allows the trusted loader program to
load the map and verify its integrity.

Signed-off-by: KP Singh
Link: https://lore.kernel.org/r/20250914215141.15144-3-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       |  6 ++++++
 kernel/bpf/syscall.c           | 31 +++++++++++++++++++++++++++----
 kernel/bpf/verifier.c          |  6 ++++++
 tools/include/uapi/linux/bpf.h |  6 ++++++
 5 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d75902074bd1f3..c6a6ee1b29388d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -329,6 +329,7 @@ struct bpf_map {
 	atomic64_t sleepable_refcnt;
 	s64 __percpu *elem_count;
 	u64 cookie; /* write-once */
+	char *excl_prog_sha;
 };

 static inline const char *btf_field_type_name(enum btf_field_type type)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 233de8677382ec..57687b2e1c4770 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1522,6 +1522,12 @@ union bpf_attr {
 		 * If provided, map_flags should have BPF_F_TOKEN_FD flag set.
 		 */
 		__s32	map_token_fd;
+
+		/* Hash of the program that has exclusive access to the map.
+		 */
+		__aligned_u64 excl_prog_hash;
+		/* Size of the passed excl_prog_hash. */
+		__u32 excl_prog_hash_size;
 	};

 	struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3f178a0f8eb12c..c8ef91acfe98aa 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -860,6 +860,7 @@ static void bpf_map_free(struct bpf_map *map)
 	 * the free of values or special fields allocated from bpf memory
 	 * allocator.
 	 */
+	kfree(map->excl_prog_sha);
 	migrate_disable();
 	map->ops->map_free(map);
 	migrate_enable();
@@ -1338,9 +1339,9 @@ static bool bpf_net_capable(void)
 	return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
 }

-#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
+#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
 /* called via syscall */
-static int map_create(union bpf_attr *attr, bool kernel)
+static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 {
 	const struct bpf_map_ops *ops;
 	struct bpf_token *token = NULL;
@@ -1534,7 +1535,29 @@ static int map_create(union bpf_attr *attr, bool kernel)
 			attr->btf_vmlinux_value_type_id;
 	}

-	err = security_bpf_map_create(map, attr, token, kernel);
+	if (attr->excl_prog_hash) {
+		bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);
+
+		if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
+			err = -EINVAL;
+			goto free_map;
+		}
+
+		map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+		if (!map->excl_prog_sha) {
+			err = -ENOMEM;
+			goto free_map;
+		}
+
+		if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) {
+			err = -EFAULT;
+			goto free_map;
+		}
+	} else if (attr->excl_prog_hash_size) {
+		return -EINVAL;
+	}
+
+	err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
 	if (err)
 		goto free_map_sec;
@@ -6008,7 +6031,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)

 	switch (cmd) {
 	case BPF_MAP_CREATE:
-		err = map_create(&attr, uattr.is_kernel);
+		err = map_create(&attr, uattr);
 		break;
 	case BPF_MAP_LOOKUP_ELEM:
 		err = map_lookup_elem(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6625570ac23de2..aef6b266f08d1f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20407,6 +20407,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 {
enum bpf_prog_type prog_type = resolve_prog_type(prog); + if (map->excl_prog_sha && + memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) { + verbose(env, "program's hash doesn't match map's excl_prog_hash\n"); + return -EACCES; + } + if (btf_record_has_field(map->record, BPF_LIST_HEAD) || btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 233de8677382ec..57687b2e1c4770 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. */ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. */ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ From c297fe3e9f9917e8bfd569442290736a551bfc60 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:32 +0200 Subject: [PATCH 2235/3206] libbpf: Implement SHA256 internal helper Use AF_ALG sockets to not have libbpf depend on OpenSSL. The helper is used for the loader generation code to embed the metadata hash in the loader program and also by the bpf_map__make_exclusive API to calculate the hash of the program the map is exclusive to. Acked-by: Andrii Nakryiko Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-4-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 59 +++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf_internal.h | 4 +++ 2 files changed, 63 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index fe4fc5438678c5..a39640bd544812 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -43,6 +43,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -14217,3 +14220,59 @@ void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) free(s->progs); free(s); } + +int libbpf_sha256(const void *data, size_t data_sz, void *sha_out, size_t sha_out_sz) +{ + struct sockaddr_alg sa = { + .salg_family = AF_ALG, + .salg_type = "hash", + .salg_name = "sha256" + }; + int sock_fd = -1; + int op_fd = -1; + int err = 0; + + if (sha_out_sz != SHA256_DIGEST_LENGTH) { + pr_warn("sha_out_sz should be exactly 32 bytes for a SHA256 digest"); + return -EINVAL; + } + + sock_fd = socket(AF_ALG, SOCK_SEQPACKET, 0); + if (sock_fd < 0) { + err = -errno; + pr_warn("failed to create AF_ALG socket for SHA256: %s\n", errstr(err)); + return err; + } + + if (bind(sock_fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + err = -errno; + pr_warn("failed to bind to AF_ALG socket for SHA256: %s\n", errstr(err)); + goto out; + } + + op_fd = accept(sock_fd, NULL, 0); + if (op_fd < 0) { + err = -errno; + pr_warn("failed to accept from AF_ALG socket for SHA256: %s\n", errstr(err)); + goto out; + } + + if (write(op_fd, data, data_sz) != data_sz) { + err = -errno; + pr_warn("failed to write data to AF_ALG socket for SHA256: %s\n", errstr(err)); + goto out; + } + + if (read(op_fd, sha_out, SHA256_DIGEST_LENGTH) != SHA256_DIGEST_LENGTH) { + err = -errno; + pr_warn("failed to read SHA256 from AF_ALG socket: %s\n", errstr(err)); + goto out; + } + +out: + if (op_fd >= 0) + close(op_fd); + if (sock_fd >= 0) + close(sock_fd); + return err; +} diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 
477a3b3389a091..8a055de0d32483 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -736,4 +736,8 @@ int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern, int probe_fd(int fd); +#define SHA256_DIGEST_LENGTH 32 +#define SHA256_DWORD_SIZE SHA256_DIGEST_LENGTH / sizeof(__u64) + +int libbpf_sha256(const void *data, size_t data_sz, void *sha_out, size_t sha_out_sz); #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ From 567010a5478ff4cbf1f14a01fa03cb50f68ac354 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:33 +0200 Subject: [PATCH 2236/3206] libbpf: Support exclusive map creation Implement setters and getters that allow map to be registered as exclusive to the specified program. The registration should be done before the exclusive program is loaded. Signed-off-by: KP Singh Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250914215141.15144-5-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 4 ++- tools/lib/bpf/bpf.h | 5 ++- tools/lib/bpf/libbpf.c | 69 ++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 22 +++++++++++++ tools/lib/bpf/libbpf.map | 3 ++ 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index ab40dbf9f020fb..19ad7bcf0c2ff3 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -172,7 +172,7 @@ int bpf_map_create(enum bpf_map_type map_type, __u32 max_entries, const struct bpf_map_create_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, map_token_fd); + const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size); union bpf_attr attr; int fd; @@ -203,6 +203,8 @@ int bpf_map_create(enum bpf_map_type map_type, attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0); attr.map_token_fd = OPTS_GET(opts, token_fd, 0); + attr.excl_prog_hash = ptr_to_u64(OPTS_GET(opts, excl_prog_hash, NULL)); + attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0); fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); return libbpf_err_errno(fd); diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 7252150e7ad357..e983a3e40d6120 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -54,9 +54,12 @@ struct bpf_map_create_opts { __s32 value_type_btf_obj_fd; __u32 token_fd; + + const void *excl_prog_hash; + __u32 excl_prog_hash_size; size_t :0; }; -#define bpf_map_create_opts__last_field token_fd +#define bpf_map_create_opts__last_field excl_prog_hash_size LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, const char *map_name, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a39640bd544812..5161c2b398753d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -499,6 +499,7 @@ struct bpf_program { __u32 line_info_rec_size; __u32 line_info_cnt; __u32 prog_flags; + __u8 hash[SHA256_DIGEST_LENGTH]; }; struct bpf_struct_ops { @@ -578,6 +579,7 @@ struct bpf_map { bool autocreate; bool autoattach; __u64 map_extra; + struct bpf_program *excl_prog; }; enum extern_type { @@ -4488,6 +4490,44 @@ bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) } } +static int bpf_prog_compute_hash(struct bpf_program *prog) +{ + struct bpf_insn *purged; + int i, err; + + purged = calloc(prog->insns_cnt, BPF_INSN_SZ); + if (!purged) + return -ENOMEM; + + /* If relocations have been done, the map_fd needs to be + * discarded for the digest calculation. 
+ */ + for (i = 0; i < prog->insns_cnt; i++) { + purged[i] = prog->insns[i]; + if (purged[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + (purged[i].src_reg == BPF_PSEUDO_MAP_FD || + purged[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { + purged[i].imm = 0; + i++; + if (i >= prog->insns_cnt || + prog->insns[i].code != 0 || + prog->insns[i].dst_reg != 0 || + prog->insns[i].src_reg != 0 || + prog->insns[i].off != 0) { + err = -EINVAL; + goto out; + } + purged[i] = prog->insns[i]; + purged[i].imm = 0; + } + } + err = libbpf_sha256(purged, prog->insns_cnt * sizeof(struct bpf_insn), + prog->hash, SHA256_DIGEST_LENGTH); +out: + free(purged); + return err; +} + static int bpf_program__record_reloc(struct bpf_program *prog, struct reloc_desc *reloc_desc, __u32 insn_idx, const char *sym_name, @@ -5237,6 +5277,14 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b create_attr.token_fd = obj->token_fd; if (obj->token_fd) create_attr.map_flags |= BPF_F_TOKEN_FD; + if (map->excl_prog) { + err = bpf_prog_compute_hash(map->excl_prog); + if (err) + return err; + + create_attr.excl_prog_hash = map->excl_prog->hash; + create_attr.excl_prog_hash_size = SHA256_DIGEST_LENGTH; + } if (bpf_map__is_struct_ops(map)) { create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; @@ -10527,6 +10575,27 @@ int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd) return 0; } +int bpf_map__set_exclusive_program(struct bpf_map *map, struct bpf_program *prog) +{ + if (map_is_created(map)) { + pr_warn("exclusive programs must be set before map creation\n"); + return libbpf_err(-EINVAL); + } + + if (map->obj != prog->obj) { + pr_warn("excl_prog and map must be from the same bpf object\n"); + return libbpf_err(-EINVAL); + } + + map->excl_prog = prog; + return 0; +} + +struct bpf_program *bpf_map__exclusive_program(struct bpf_map *map) +{ + return map->excl_prog; +} + static struct bpf_map * __bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 2e91148d9b44d1..e978bc093c3914 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1291,6 +1291,28 @@ LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, */ LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, const void *cur_key, void *next_key, size_t key_sz); +/** + * @brief **bpf_map__set_exclusive_program()** sets a map to be exclusive to the + * specified program. This must be called *before* the map is created. + * + * @param map BPF map to make exclusive. + * @param prog BPF program to be the exclusive user of the map. Must belong + * to the same bpf_object as the map. + * @return 0 on success; a negative error code otherwise. + * + * This function must be called after the BPF object is opened but before + * it is loaded. Once the object is loaded, only the specified program + * will be able to access the map's contents. + */ +LIBBPF_API int bpf_map__set_exclusive_program(struct bpf_map *map, struct bpf_program *prog); + +/** + * @brief **bpf_map__exclusive_program()** returns the exclusive program + * that is registered with the map (if any). + * @param map BPF map to which the exclusive program is registered. + * @return the registered exclusive program. 
+ */ +LIBBPF_API struct bpf_program *bpf_map__exclusive_program(struct bpf_map *map); struct bpf_xdp_set_link_opts { size_t sz; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index d7bd463e7017e7..8ed8749907d472 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -448,4 +448,7 @@ LIBBPF_1.6.0 { } LIBBPF_1.5.0; LIBBPF_1.7.0 { + global: + bpf_map__set_exclusive_program; + bpf_map__exclusive_program; } LIBBPF_1.6.0; From 6c850cbca82c2d20bc1b1c5e0e1c25c515292abd Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:34 +0200 Subject: [PATCH 2237/3206] selftests/bpf: Add tests for exclusive maps Check if access is denied to another program for an exclusive map Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-6-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/map_excl.c | 54 +++++++++++++++++++ tools/testing/selftests/bpf/progs/map_excl.c | 34 ++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/map_excl.c create mode 100644 tools/testing/selftests/bpf/progs/map_excl.c diff --git a/tools/testing/selftests/bpf/prog_tests/map_excl.c b/tools/testing/selftests/bpf/prog_tests/map_excl.c new file mode 100644 index 00000000000000..6bdc6d6de0daf2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_excl.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Google LLC. */ +#define _GNU_SOURCE +#include +#include +#include +#include + +#include "map_excl.skel.h" + +static void test_map_excl_allowed(void) +{ + struct map_excl *skel = map_excl__open(); + int err; + + err = bpf_map__set_exclusive_program(skel->maps.excl_map, skel->progs.should_have_access); + if (!ASSERT_OK(err, "bpf_map__set_exclusive_program")) + goto out; + + bpf_program__set_autoload(skel->progs.should_have_access, true); + bpf_program__set_autoload(skel->progs.should_not_have_access, false); + + err = map_excl__load(skel); + ASSERT_OK(err, "map_excl__load"); +out: + map_excl__destroy(skel); +} + +static void test_map_excl_denied(void) +{ + struct map_excl *skel = map_excl__open(); + int err; + + err = bpf_map__set_exclusive_program(skel->maps.excl_map, skel->progs.should_have_access); + if (!ASSERT_OK(err, "bpf_map__make_exclusive")) + goto out; + + bpf_program__set_autoload(skel->progs.should_have_access, false); + bpf_program__set_autoload(skel->progs.should_not_have_access, true); + + err = map_excl__load(skel); + ASSERT_EQ(err, -EACCES, "exclusive map access not denied\n"); +out: + map_excl__destroy(skel); + +} + +void test_map_excl(void) +{ + if (test__start_subtest("map_excl_allowed")) + test_map_excl_allowed(); + if (test__start_subtest("map_excl_denied")) + test_map_excl_denied(); +} diff --git a/tools/testing/selftests/bpf/progs/map_excl.c b/tools/testing/selftests/bpf/progs/map_excl.c new file mode 100644 index 00000000000000..d461684728e4e7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/map_excl.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Google LLC. 
*/ +#include +#include +#include + +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 1); +} excl_map SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int should_have_access(void *ctx) +{ + int key = 0, value = 0xdeadbeef; + + bpf_map_update_elem(&excl_map, &key, &value, 0); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int should_not_have_access(void *ctx) +{ + int key = 0, value = 0xdeadbeef; + + bpf_map_update_elem(&excl_map, &key, &value, 0); + return 0; +} From ea2e6467ac36bf3d785defc89e58269b15d182f7 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:35 +0200 Subject: [PATCH 2238/3206] bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD Currently only array maps are supported, but the implementation can be extended for other maps and objects. The hash is memoized only for exclusive and frozen maps as their content is stable until the exclusive program modifies the map. This is required for BPF signing, enabling a trusted loader program to verify a map's integrity. The loader retrieves the map's runtime hash from the kernel and compares it against an expected hash computed at build time. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-7-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 13 +++++++++++ kernel/bpf/syscall.c | 23 +++++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ .../selftests/bpf/progs/verifier_map_ptr.c | 7 ++++-- 6 files changed, 48 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c6a6ee1b29388d..e0c2c78a5faae8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -110,6 +111,7 @@ struct bpf_map_ops { long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); + int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -289,6 +291,7 @@ struct bpf_map_owner { }; struct bpf_map { + u8 sha[SHA256_DIGEST_SIZE]; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 57687b2e1c4770..0987b52d564842 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6672,6 +6672,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d080916faf976..26d5dda989bc1a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "map_in_map.h" @@ -174,6 +175,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } +static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, + void *hash_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + sha256(array->value, (u64)array->elem_size * 
array->map.max_entries, + hash_buf); + memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + return 0; +} + static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { @@ -800,6 +812,7 @@ const struct bpf_map_ops array_map_ops = { .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, + .map_get_hash = &array_map_get_hash, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c8ef91acfe98aa..cf7173b1bb83dc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ +#include #include #include #include @@ -5184,6 +5185,9 @@ static int bpf_map_get_info_by_fd(struct file *file, info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -5208,6 +5212,25 @@ static int bpf_map_get_info_by_fd(struct file *file, return err; } + if (info.hash) { + char __user *uhash = u64_to_user_ptr(info.hash); + + if (!map->ops->map_get_hash) + return -EINVAL; + + if (info.hash_size != SHA256_DIGEST_SIZE) + return -EINVAL; + + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + if (err != 0) + return err; + + if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + return -EFAULT; + } else if (info.hash_size) { + return -EINVAL; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 57687b2e1c4770..0987b52d564842 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6672,6 +6672,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c index 11a0791459669e..e2767d27d8aaf8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c @@ -70,10 +70,13 @@ __naked void bpf_map_ptr_write_rejected(void) : __clobber_all); } +/* The first element of struct bpf_map is a SHA256 hash of 32 bytes, accessing + * into this array is valid. The opts field is now at offset 33. + */ SEC("socket") __description("bpf_map_ptr: read non-existent field rejected") __failure -__msg("cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4") +__msg("cannot access ptr member ops with moff 32 in struct bpf_map with off 33 size 4") __failure_unpriv __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") __flag(BPF_F_ANY_ALIGNMENT) @@ -82,7 +85,7 @@ __naked void read_non_existent_field_rejected(void) asm volatile (" \ r6 = 0; \ r1 = %[map_array_48b] ll; \ - r6 = *(u32*)(r1 + 1); \ + r6 = *(u32*)(r1 + 33); \ r0 = 1; \ exit; \ " : From 8cd189e414bb705312fbfff7f7b5605f6de2459a Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:36 +0200 Subject: [PATCH 2239/3206] bpf: Move the signature kfuncs to helpers.c No functional changes, except for the addition of the headers for the kfuncs so that they can be used for signature verification. 
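For reference, a sleepable BPF program can use these kfuncs roughly as
follows (an illustrative sketch, not part of this patch; the __ksym
declarations are assumed to be provided via vmlinux.h or written by
hand):

	extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym;
	extern void bpf_key_put(struct bpf_key *bkey) __ksym;
	extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
					      struct bpf_dynptr *sig_p,
					      struct bpf_key *trusted_keyring) __ksym;

	static int verify_blob(struct bpf_dynptr *data, struct bpf_dynptr *sig)
	{
		struct bpf_key *key;
		int ret;

		key = bpf_lookup_system_key(0);	/* 0: primary system keyring */
		if (!key)
			return -1;

		ret = bpf_verify_pkcs7_signature(data, sig, key);
		bpf_key_put(key);
		return ret;
	}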
Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-8-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 32 +++++++ kernel/bpf/helpers.c | 166 +++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 183 --------------------------------------- 3 files changed, 198 insertions(+), 183 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e0c2c78a5faae8..dfc1a27b56d550 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3424,6 +3424,38 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ +#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) + +struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); +struct bpf_key *bpf_lookup_system_key(u64 id); +void bpf_key_put(struct bpf_key *bkey); +int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring); + +#else +static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + return NULL; +} + +static inline struct bpf_key *bpf_lookup_system_key(u64 id) +{ + return NULL; +} + +static inline void bpf_key_put(struct bpf_key *bkey) +{ +} + +static inline int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ + return -EOPNOTSUPP; +} +#endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 51229aba531818..ef4ede8bb74fd8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -3747,6 +3748,163 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); } +#ifdef CONFIG_KEYS +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. + * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. 
+ */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kerned image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_p: data to verify + * @sig_p: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION + struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; + const void *data, *sig; + u32 data_len, sig_len; + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). 
+ */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + data_len = __bpf_dynptr_size(data_ptr); + data = __bpf_dynptr_data(data_ptr, data_len); + sig_len = __bpf_dynptr_size(sig_ptr); + sig = __bpf_dynptr_data(sig_ptr, sig_len); + + return verify_pkcs7_signature(data, data_len, sig, sig_len, + trusted_keyring->key, + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + NULL); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ +} +#endif /* CONFIG_KEYS */ __bpf_kfunc_end_defs(); @@ -3788,6 +3946,14 @@ BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) #endif +#ifdef CONFIG_KEYS +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif +#endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 606007c387c52f..f2360579658ead 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -1241,188 +1240,6 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_KEYS -__bpf_kfunc_start_defs(); - -/** - * bpf_lookup_user_key - lookup a key by its serial - * @serial: key handle serial number - * @flags: lookup-specific flags - * - * Search a key with a given *serial* and the provided *flags*. - * If found, increment the reference count of the key by one, and - * return it in the bpf_key structure. - * - * The bpf_key structure must be passed to bpf_key_put() when done - * with it, so that the key reference count is decremented and the - * bpf_key structure is freed. - * - * Permission checks are deferred to the time the key is used by - * one of the available key-specific kfuncs. - * - * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested - * special keyring (e.g. session keyring), if it doesn't yet exist. - * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting - * for the key construction, and to retrieve uninstantiated keys (keys - * without data attached to them). - * - * Return: a bpf_key pointer with a valid key pointer if the key is found, a - * NULL pointer otherwise. - */ -__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) -{ - key_ref_t key_ref; - struct bpf_key *bkey; - - if (flags & ~KEY_LOOKUP_ALL) - return NULL; - - /* - * Permission check is deferred until the key is used, as the - * intent of the caller is unknown here. - */ - key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); - if (IS_ERR(key_ref)) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); - if (!bkey) { - key_put(key_ref_to_ptr(key_ref)); - return NULL; - } - - bkey->key = key_ref_to_ptr(key_ref); - bkey->has_ref = true; - - return bkey; -} - -/** - * bpf_lookup_system_key - lookup a key by a system-defined ID - * @id: key ID - * - * Obtain a bpf_key structure with a key pointer set to the passed key ID. - * The key pointer is marked as invalid, to prevent bpf_key_put() from - * attempting to decrement the key reference count on that pointer. The key - * pointer set in such way is currently understood only by - * verify_pkcs7_signature(). 
- * - * Set *id* to one of the values defined in include/linux/verification.h: - * 0 for the primary keyring (immutable keyring of system keys); - * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring - * (where keys can be added only if they are vouched for by existing keys - * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform - * keyring (primarily used by the integrity subsystem to verify a kexec'ed - * kerned image and, possibly, the initramfs signature). - * - * Return: a bpf_key pointer with an invalid key pointer set from the - * pre-determined ID on success, a NULL pointer otherwise - */ -__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) -{ - struct bpf_key *bkey; - - if (system_keyring_id_check(id) < 0) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); - if (!bkey) - return NULL; - - bkey->key = (struct key *)(unsigned long)id; - bkey->has_ref = false; - - return bkey; -} - -/** - * bpf_key_put - decrement key reference count if key is valid and free bpf_key - * @bkey: bpf_key structure - * - * Decrement the reference count of the key inside *bkey*, if the pointer - * is valid, and free *bkey*. - */ -__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) -{ - if (bkey->has_ref) - key_put(bkey->key); - - kfree(bkey); -} - -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -/** - * bpf_verify_pkcs7_signature - verify a PKCS#7 signature - * @data_p: data to verify - * @sig_p: signature of the data - * @trusted_keyring: keyring with keys trusted for signature verification - * - * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* - * with keys in a keyring referenced by *trusted_keyring*. - * - * Return: 0 on success, a negative value on error. - */ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, - struct bpf_key *trusted_keyring) -{ - struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; - struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; - const void *data, *sig; - u32 data_len, sig_len; - int ret; - - if (trusted_keyring->has_ref) { - /* - * Do the permission check deferred in bpf_lookup_user_key(). - * See bpf_lookup_user_key() for more details. - * - * A call to key_task_permission() here would be redundant, as - * it is already done by keyring_search() called by - * find_asymmetric_key(). 
- */ - ret = key_validate(trusted_keyring->key); - if (ret < 0) - return ret; - } - - data_len = __bpf_dynptr_size(data_ptr); - data = __bpf_dynptr_data(data_ptr, data_len); - sig_len = __bpf_dynptr_size(sig_ptr); - sig = __bpf_dynptr_data(sig_ptr, sig_len); - - return verify_pkcs7_signature(data, data_len, sig, sig_len, - trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, - NULL); -} -#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(key_sig_kfunc_set) -BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) -#endif -BTF_KFUNCS_END(key_sig_kfunc_set) - -static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { - .owner = THIS_MODULE, - .set = &key_sig_kfunc_set, -}; - -static int __init bpf_key_sig_kfuncs_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, - &bpf_key_sig_kfunc_set); -} - -late_initcall(bpf_key_sig_kfuncs_init); -#endif /* CONFIG_KEYS */ - static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { From f79671dc87b6cea78dbe429969eb5549fca1bcc1 Mon Sep 17 00:00:00 2001 From: Aleksa Paunovic Date: Thu, 24 Jul 2025 17:23:25 +0200 Subject: [PATCH 2240/3206] dt-bindings: riscv: Add xmipsexectl ISA extension description MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xmipsexectl extension is described in the MIPS RV64 P8700/P8700-F Multiprocessing System Programmer’s Guide linked at [1]. Link: https://mips.com/wp-content/uploads/2025/06/P8700_Programmers_Reference_Manual_Rev1.84_5-31-2025.pdf Signed-off-by: Aleksa Paunovic Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250724-p8700-pause-v5-1-a6cbbe1c3412@htecgroup.com Signed-off-by: Paul Walmsley --- Documentation/devicetree/bindings/riscv/extensions.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml b/Documentation/devicetree/bindings/riscv/extensions.yaml index ede6a58ccf5347..de41a6f074d3af 100644 --- a/Documentation/devicetree/bindings/riscv/extensions.yaml +++ b/Documentation/devicetree/bindings/riscv/extensions.yaml @@ -662,6 +662,12 @@ properties: Registers in the AX45MP datasheet. https://www.andestech.com/wp-content/uploads/AX45MP-1C-Rev.-5.0.0-Datasheet.pdf + # MIPS + - const: xmipsexectl + description: + The MIPS extension for execution control as documented in + https://mips.com/wp-content/uploads/2025/06/P8700_Programmers_Reference_Manual_Rev1.84_5-31-2025.pdf + # SiFive - const: xsfvqmaccdod description: From a8fed1bc03ace27902338e4f0d318335883ac847 Mon Sep 17 00:00:00 2001 From: Aleksa Paunovic Date: Thu, 24 Jul 2025 17:23:26 +0200 Subject: [PATCH 2241/3206] riscv: Add xmipsexectl as a vendor extension Add support for MIPS vendor extensions. Add support for the xmipsexectl vendor extension. 
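With the extension registered, kernel code can then test for it (an
illustrative sketch, not part of this patch; it assumes the generic
riscv_isa_vendor_extension_available() wrapper used for the other
vendor extensions):

	#include <asm/vendor_extensions.h>
	#include <asm/vendor_extensions/mips.h>
	#include <asm/vendorid_list.h>

	static bool has_xmipsexectl(void)
	{
		/* hypothetical caller; the IDs come from this series */
		return riscv_isa_vendor_extension_available(MIPS_VENDOR_ID,
							    XMIPSEXECTL);
	}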
Signed-off-by: Aleksa Paunovic Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250724-p8700-pause-v5-2-a6cbbe1c3412@htecgroup.com [pjw@kernel.org: added the MIPS vendor ID from another patch to fix the build] Signed-off-by: Paul Walmsley --- arch/riscv/Kconfig.vendor | 13 +++++++++++ .../include/asm/vendor_extensions/mips.h | 18 +++++++++++++++ arch/riscv/include/asm/vendorid_list.h | 1 + arch/riscv/kernel/vendor_extensions.c | 10 +++++++++ arch/riscv/kernel/vendor_extensions/Makefile | 1 + arch/riscv/kernel/vendor_extensions/mips.c | 22 +++++++++++++++++++ 6 files changed, 65 insertions(+) create mode 100644 arch/riscv/include/asm/vendor_extensions/mips.h create mode 100644 arch/riscv/kernel/vendor_extensions/mips.c diff --git a/arch/riscv/Kconfig.vendor b/arch/riscv/Kconfig.vendor index e14f26368963c1..3c1f92e406c3f2 100644 --- a/arch/riscv/Kconfig.vendor +++ b/arch/riscv/Kconfig.vendor @@ -16,6 +16,19 @@ config RISCV_ISA_VENDOR_EXT_ANDES If you don't know what to do here, say Y. endmenu +menu "MIPS" +config RISCV_ISA_VENDOR_EXT_MIPS + bool "MIPS vendor extension support" + select RISCV_ISA_VENDOR_EXT + default y + help + Say N here to disable detection of and support for all MIPS vendor + extensions. Without this option enabled, MIPS vendor extensions will + not be detected at boot and their presence not reported to userspace. + + If you don't know what to do here, say Y. +endmenu + menu "SiFive" config RISCV_ISA_VENDOR_EXT_SIFIVE bool "SiFive vendor extension support" diff --git a/arch/riscv/include/asm/vendor_extensions/mips.h b/arch/riscv/include/asm/vendor_extensions/mips.h new file mode 100644 index 00000000000000..133e55985d827c --- /dev/null +++ b/arch/riscv/include/asm/vendor_extensions/mips.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 MIPS. 
+ */ + +#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H +#define _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H + +#include + +#define RISCV_ISA_VENDOR_EXT_XMIPSEXECTL 0 + +#ifndef __ASSEMBLER__ +struct riscv_isa_vendor_ext_data_list; +extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_mips; +#endif + +#endif // _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h index a5150cdf34d87f..3b09874d7a6dfb 100644 --- a/arch/riscv/include/asm/vendorid_list.h +++ b/arch/riscv/include/asm/vendorid_list.h @@ -9,5 +9,6 @@ #define MICROCHIP_VENDOR_ID 0x029 #define SIFIVE_VENDOR_ID 0x489 #define THEAD_VENDOR_ID 0x5b7 +#define MIPS_VENDOR_ID 0x722 #endif diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c index 92d8ff81f42c9c..bb4a7592368560 100644 --- a/arch/riscv/kernel/vendor_extensions.c +++ b/arch/riscv/kernel/vendor_extensions.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -16,6 +17,9 @@ struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[] = { #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES &riscv_isa_vendor_ext_list_andes, #endif +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_MIPS + &riscv_isa_vendor_ext_list_mips, +#endif #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE &riscv_isa_vendor_ext_list_sifive, #endif @@ -49,6 +53,12 @@ bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsig cpu_bmap = riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap; break; #endif + #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_MIPS + case MIPS_VENDOR_ID: + bmap = &riscv_isa_vendor_ext_list_mips.all_harts_isa_bitmap; + cpu_bmap = riscv_isa_vendor_ext_list_mips.per_hart_isa_bitmap; + break; + #endif #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE case SIFIVE_VENDOR_ID: bmap = &riscv_isa_vendor_ext_list_sifive.all_harts_isa_bitmap; diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile index a4eca96d1c8a2f..ccad4ebafb4341 100644 --- a/arch/riscv/kernel/vendor_extensions/Makefile +++ b/arch/riscv/kernel/vendor_extensions/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES) += andes.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_MIPS) += mips.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive_hwprobe.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead.o diff --git a/arch/riscv/kernel/vendor_extensions/mips.c b/arch/riscv/kernel/vendor_extensions/mips.c new file mode 100644 index 00000000000000..f691129f96c21f --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/mips.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 MIPS. + */ + +#include +#include +#include + +#include +#include +#include + +/* All MIPS vendor extensions supported in Linux */ +static const struct riscv_isa_ext_data riscv_isa_vendor_ext_mips[] = { + __RISCV_ISA_EXT_DATA(xmipsexectl, RISCV_ISA_VENDOR_EXT_XMIPSEXECTL), +}; + +struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_mips = { + .ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_mips), + .ext_data = riscv_isa_vendor_ext_mips, +}; From c7c31f8dc54aa3c9b2c994b5f1ff7e740a654e97 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Wed, 17 Sep 2025 12:43:46 -0700 Subject: [PATCH 2242/3206] drm/ast: Use msleep instead of mdelay for edid read The busy-waiting in `mdelay()` can cause CPU stalls and kernel timeouts during boot. 
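The distinction, sketched for clarity (illustrative only, not driver
code; the EDID polling in ast_astdp_read_edid_block() runs in process
context, so sleeping is safe):

	mdelay(j + 1);	/* spins on the CPU for the entire delay */
	msleep(j + 1);	/* sleeps, letting the scheduler run other work */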
Signed-off-by: Nirmoy Das
Reviewed-by: Thomas Zimmermann
Tested-by: Carol L Soto csoto@nvidia.com
Fixes: 594e9c04b586 ("drm/ast: Create the driver for ASPEED proprietory Display-Port")
Cc: KuoHsiang Chou
Cc: Thomas Zimmermann
Cc: Dave Airlie
Cc: Jocelyn Falempe
Cc: dri-devel@lists.freedesktop.org
Cc: stable@vger.kernel.org # v5.19+
Signed-off-by: Thomas Zimmermann
Link: https://lore.kernel.org/r/20250917194346.2905522-1-nirmoyd@nvidia.com
---
 drivers/gpu/drm/ast/ast_dp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/ast/ast_dp.c b/drivers/gpu/drm/ast/ast_dp.c
index 19c04687b0fe1f..8e650a02c5287b 100644
--- a/drivers/gpu/drm/ast/ast_dp.c
+++ b/drivers/gpu/drm/ast/ast_dp.c
@@ -134,7 +134,7 @@ static int ast_astdp_read_edid_block(void *data, u8 *buf, unsigned int block, si
 	 * 3. The Delays are often longer a lot when system resume from S3/S4.
 	 */
 	if (j)
-		mdelay(j + 1);
+		msleep(j + 1);

 	/* Wait for EDID offset to show up in mirror register */
 	vgacrd7 = ast_get_index_reg(ast, AST_IO_VGACRI, 0xd7);

From 1e56310b40fd2e7e0b9493da9ff488af145bdd0c Mon Sep 17 00:00:00 2001
From: Vasant Hegde
Date: Sat, 13 Sep 2025 06:26:57 +0000
Subject: [PATCH 2243/3206] iommu/amd/pgtbl: Fix possible race while increase
 page table level

The AMD IOMMU host page table implementation supports dynamic page
table levels (up to 6 levels), starting with a 3-level configuration
that expands based on the IOVA address. The kernel maintains a root
pointer and the current page table level to enable proper page table
walks in the alloc_pte()/fetch_pte() operations.

The IOMMU IOVA allocator initially starts with 32-bit addresses and,
once those are exhausted, switches to 64-bit addresses (the maximum
address is determined by the IOMMU and device DMA capabilities). To
support larger IOVAs, the AMD IOMMU driver increases the page table
level.

But in the unmap path (iommu_v1_unmap_pages()), fetch_pte() reads
pgtable->[root/mode] without a lock. So it is possible that, in an
extreme corner case, when increase_address_space() is updating
pgtable->[root/mode], fetch_pte() reads the wrong page table level
(pgtable->mode). It compares that value with the level encoded in the
page table and returns NULL. This causes the iommu_unmap ops to fail,
and the upper layer may retry and/or log a WARN_ON.

  CPU 0                                     CPU 1
  ------                                    ------
  map pages                                 unmap pages
  alloc_pte()
    -> increase_address_space()             iommu_v1_unmap_pages()
                                              -> fetch_pte()
  pgtable->root = pte (new root value)
                                            READ pgtable->[mode/root]
                                              Reads new root, old mode
  Updates mode (pgtable->mode += 1)

Since page table level updates are infrequent and already synchronized
with a spinlock, implement a seqcount to enable lock-free operations on
the read path.
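In seqcount terms the fix follows the standard write/retry pattern (a
condensed sketch of the scheme only; the actual hunks are in the diff
below): the writer, already serialized by the domain spinlock, brackets
the root/mode update, and the lock-free readers retry until they
observe a consistent pair:

	/* writer, runs under the existing spinlock */
	write_seqcount_begin(&pgtable->seqcount);
	pgtable->root = pte;
	pgtable->mode += 1;
	write_seqcount_end(&pgtable->seqcount);

	/* lock-free reader */
	do {
		seq = read_seqcount_begin(&pgtable->seqcount);
		level = pgtable->mode - 1;
		pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	} while (read_seqcount_retry(&pgtable->seqcount, seq));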
Fixes: 754265bcab7 ("iommu/amd: Fix race in increase_address_space()")
Reported-by: Alejandro Jimenez
Cc: stable@vger.kernel.org
Cc: Joao Martins
Cc: Suravee Suthikulpanit
Signed-off-by: Vasant Hegde
Signed-off-by: Joerg Roedel
---
 drivers/iommu/amd/amd_iommu_types.h |  1 +
 drivers/iommu/amd/io_pgtable.c      | 25 +++++++++++++++++++++----
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 5219d7ddfdaa8b..95f63c5f6159f7 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -555,6 +555,7 @@ struct gcr3_tbl_info {
 };

 struct amd_io_pgtable {
+	seqcount_t		seqcount; /* Protects root/mode update */
 	struct io_pgtable	pgtbl;
 	int			mode;
 	u64			*root;
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index a91e71f981efb9..70c2f5b1631b05 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include

 #include

@@ -130,8 +131,11 @@ static bool increase_address_space(struct amd_io_pgtable *pgtable,

 	*pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));

+	write_seqcount_begin(&pgtable->seqcount);
 	pgtable->root = pte;
 	pgtable->mode += 1;
+	write_seqcount_end(&pgtable->seqcount);
+
 	amd_iommu_update_and_flush_device_table(domain);

 	pte = NULL;
@@ -153,6 +157,7 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
 {
 	unsigned long last_addr = address + (page_size - 1);
 	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
+	unsigned int seqcount;
 	int level, end_lvl;
 	u64 *pte, *page;

@@ -170,8 +175,14 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
 	}

-	level = pgtable->mode - 1;
-	pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
+	do {
+		seqcount = read_seqcount_begin(&pgtable->seqcount);
+
+		level = pgtable->mode - 1;
+		pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
+	} while (read_seqcount_retry(&pgtable->seqcount, seqcount));
+
 	address = PAGE_SIZE_ALIGN(address, page_size);

 	end_lvl = PAGE_SIZE_LEVEL(page_size);
@@ -249,6 +260,7 @@ static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
 		      unsigned long *page_size)
 {
 	int level;
+	unsigned int seqcount;
 	u64 *pte;

 	*page_size = 0;

 	if (address > PM_LEVEL_SIZE(pgtable->mode))
 		return NULL;

-	level = pgtable->mode - 1;
-	pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
+	do {
+		seqcount = read_seqcount_begin(&pgtable->seqcount);
+		level = pgtable->mode - 1;
+		pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
+	} while (read_seqcount_retry(&pgtable->seqcount, seqcount));
+
 	*page_size = PTE_LEVEL_PAGE_SIZE(level);

 	while (level > 0) {
@@ -541,6 +557,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
 	if (!pgtable->root)
 		return NULL;
 	pgtable->mode = PAGE_MODE_3_LEVEL;
+	seqcount_init(&pgtable->seqcount);

 	cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
 	cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;

From 90beccb3e1287b8d596c4816530ef54df01aa11f Mon Sep 17 00:00:00 2001
From: Jason Wang
Date: Wed, 17 Sep 2025 14:30:43 +0800
Subject: [PATCH 2244/3206] vhost-net: unbreak busy polling

Commit 67a873df0c41 ("vhost: basic in order support") passes the number
of used elems to vhost_net_rx_peek_head_len() to make sure it can
signal the used correctly before trying to do busy polling. But it
forgets to clear the count; this causes the count to run out of sync
with handle_rx() and breaks the busy polling.
Fix this by passing a pointer to the count and clearing it after
signaling the used.

Acked-by: Michael S. Tsirkin
Cc: stable@vger.kernel.org
Fixes: 67a873df0c41 ("vhost: basic in order support")
Signed-off-by: Jason Wang
Message-Id: <20250917063045.2042-1-jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin
---
 drivers/vhost/net.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c6508fe0d5c8e5..16e39f3ab9566f 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1014,7 +1014,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 }

 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
-				      bool *busyloop_intr, unsigned int count)
+				      bool *busyloop_intr, unsigned int *count)
 {
 	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
 	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -1024,7 +1024,8 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,

 	if (!len && rvq->busyloop_timeout) {
 		/* Flush batched heads first */
-		vhost_net_signal_used(rnvq, count);
+		vhost_net_signal_used(rnvq, *count);
+		*count = 0;
 		/* Both tx vq and rx socket were polled here */
 		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);

@@ -1180,7 +1181,7 @@ static void handle_rx(struct vhost_net *net)

 	do {
 		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
-						      &busyloop_intr, count);
+						      &busyloop_intr, &count);
 		if (!sock_len)
 			break;
 		sock_len += sock_hlen;

From 4174152771bf0d014d58f7d7e148bb0c8830fe53 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin"
Date: Wed, 17 Sep 2025 14:30:44 +0800
Subject: [PATCH 2245/3206] Revert "vhost/net: Defer TX queue re-enable until
 after sendmsg"

This reverts commit 8c2e6b26ffe243be1e78f5a4bfb1a857d6e6f6d6.

It tries to defer the notification enabling by moving the logic out of
the loop after the vhost_tx_batch() when nothing new is spotted. This
brings side effects, as the new logic is reused for several other error
conditions.

One example is the IOTLB: when there's an IOTLB miss, get_tx_bufs()
might return -EAGAIN, exit the loop, and see that there are still
available buffers, so it will queue the tx work again until userspace
feeds the IOTLB entry correctly. This will slow down the tx processing
and trigger the TX watchdog in the guest, as reported in
https://lkml.org/lkml/2025/9/10/1596.

To fix, revert the change. A follow-up patch will bring the performance
back in a safe way.

Reported-by: Jon Kohler
Cc: stable@vger.kernel.org
Fixes: 8c2e6b26ffe2 ("vhost/net: Defer TX queue re-enable until after sendmsg")
Signed-off-by: Jason Wang
Signed-off-by: Michael S. Tsirkin
Message-Id: <20250917063045.2042-2-jasowang@redhat.com>
---
 drivers/vhost/net.c | 30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 16e39f3ab9566f..57efd5c55f89e9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -765,11 +765,11 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 	int err;
 	int sent_pkts = 0;
 	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
-	bool busyloop_intr;
 	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);

 	do {
-		busyloop_intr = false;
+		bool busyloop_intr = false;
+
 		if (nvq->done_idx == VHOST_NET_BATCH)
 			vhost_tx_batch(net, nvq, sock, &msg);

@@ -780,10 +780,13 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 			break;
 		/* Nothing new?
Wait for eventfd to tell us they refilled. */ if (head == vq->num) { - /* Kicks are disabled at this point, break loop and - * process any remaining batched packets. Queue will - * be re-enabled afterwards. - */ + if (unlikely(busyloop_intr)) { + vhost_poll_queue(&vq->poll); + } else if (unlikely(vhost_enable_notify(&net->dev, + vq))) { + vhost_disable_notify(&net->dev, vq); + continue; + } break; } @@ -839,22 +842,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) ++nvq->done_idx; } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); - /* Kicks are still disabled, dispatch any remaining batched msgs. */ vhost_tx_batch(net, nvq, sock, &msg); - - if (unlikely(busyloop_intr)) - /* If interrupted while doing busy polling, requeue the - * handler to be fair handle_rx as well as other tasks - * waiting on cpu. - */ - vhost_poll_queue(&vq->poll); - else - /* All of our work has been completed; however, before - * leaving the TX handler, do one last check for work, - * and requeue handler if necessary. If there is no work, - * queue will be reenabled. - */ - vhost_net_busy_poll_try_queue(net, vq); } static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) From e430451613c7a27beeadd00d707bcf7ceec6328e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 17 Sep 2025 14:30:45 +0800 Subject: [PATCH 2246/3206] vhost-net: flush batched before enabling notifications Commit 8c2e6b26ffe2 ("vhost/net: Defer TX queue re-enable until after sendmsg") tried to defer the notification enabling by moving the logic out of the loop to after vhost_tx_batch() when nothing new is spotted. This caused unexpected side effects, as the new logic is reused for several other error conditions. A previous patch reverted 8c2e6b26ffe2. Now, bring the performance back up by flushing batched buffers before enabling notifications. Reported-by: Jon Kohler Cc: stable@vger.kernel.org Fixes: 8c2e6b26ffe2 ("vhost/net: Defer TX queue re-enable until after sendmsg") Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin Message-Id: <20250917063045.2042-3-jasowang@redhat.com> --- drivers/vhost/net.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 57efd5c55f89e9..35ded43304319c 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -780,6 +780,11 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { + /* Flush batched packets to handle pending RX + * work (if busyloop_intr is set) and to avoid + * unnecessary virtqueue kicks. + */ + vhost_tx_batch(net, nvq, sock, &msg); if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, From fa84e534c3ec2904d8718a83180294f7b5afecc7 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 18 Sep 2025 13:56:16 +0100 Subject: [PATCH 2247/3206] arm64: realm: ioremap: Allow mapping memory as encrypted For ioremap(), so far we only checked if it was a device (RIPAS_DEV) to choose an encrypted vs decrypted mapping. However, we may have firmware-reserved memory regions exposed to the OS (e.g., EFI Coco Secret Securityfs, ACPI CCEL). We need to make sure that anything that is RIPAS_RAM (i.e., Guest protected memory with RMM guarantees) is also mapped as encrypted. Rephrasing the above, anything that is not RIPAS_EMPTY is guaranteed to be protected by the RMM.
Thus we choose an encrypted mapping for anything that is not RIPAS_EMPTY. While at it, rename the helper function __arm64_is_protected_mmio => arm64_rsi_is_protected to clearly indicate that this is not an arm64-generic helper, but something to do with Realms. Cc: Sami Mujawar Cc: Will Deacon Cc: Catalin Marinas Cc: Aneesh Kumar K.V Cc: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Steven Price Tested-by: Sami Mujawar Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- arch/arm64/include/asm/io.h | 2 +- arch/arm64/include/asm/rsi.h | 2 +- arch/arm64/kernel/rsi.c | 26 ++++++++++++++++++++++---- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 9b96840fb979bf..82276282a3c725 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -311,7 +311,7 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size) { if (unlikely(is_realm_world())) - return __arm64_is_protected_mmio(phys_addr, size); + return arm64_rsi_is_protected(phys_addr, size); return false; } diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h index b42aeac05340ed..88b50d660e85a0 100644 --- a/arch/arm64/include/asm/rsi.h +++ b/arch/arm64/include/asm/rsi.h @@ -16,7 +16,7 @@ DECLARE_STATIC_KEY_FALSE(rsi_present); void __init arm64_rsi_init(void); -bool __arm64_is_protected_mmio(phys_addr_t base, size_t size); +bool arm64_rsi_is_protected(phys_addr_t base, size_t size); static inline bool is_realm_world(void) { diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index ce4778141ec7b8..c64a06f58c0bc0 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -84,7 +84,25 @@ static void __init arm64_rsi_setup_memory(void) } } -bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +/* + * Check if a given PA range is Trusted (e.g., Protected memory, a Trusted Device + * mapping, or an MMIO emulated in the Realm world). + * + * We can rely on the RIPAS value of the region to detect if a given region is + * protected. + * + * RIPAS_DEV - A trusted device memory or a trusted emulated MMIO (in the Realm + * world) + * RIPAS_RAM - Memory (RAM), protected by the RMM guarantees. (e.g., Firmware + * reserved regions for data sharing). + * + * RIPAS_DESTROYED is a special case of one of the above, where the host did + * something without our permission and as such we can't do anything about it. + * + * The only case where something is emulated by the untrusted hypervisor or is + * backed by shared memory is indicated by RSI_RIPAS_EMPTY.
+ */ +bool arm64_rsi_is_protected(phys_addr_t base, size_t size) { enum ripas ripas; phys_addr_t end, top; @@ -101,18 +119,18 @@ bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) break; if (WARN_ON(top <= base)) break; - if (ripas != RSI_RIPAS_DEV) + if (ripas == RSI_RIPAS_EMPTY) break; base = top; } return base >= end; } -EXPORT_SYMBOL(__arm64_is_protected_mmio); +EXPORT_SYMBOL(arm64_rsi_is_protected); static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) { - if (__arm64_is_protected_mmio(phys, size)) + if (arm64_rsi_is_protected(phys, size)) *prot = pgprot_encrypted(*prot); else *prot = pgprot_decrypted(*prot); From 9e8a3df3e7f762966762a6fbf3282b9da2074127 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 18 Sep 2025 13:56:17 +0100 Subject: [PATCH 2248/3206] arm64: Enable EFI secret area Securityfs support Enable EFI COCO secrets support. Provide the ioremap_encrypted() support required by the driver. Cc: Sami Mujawar Cc: Will Deacon Cc: Catalin Marinas Cc: Aneesh Kumar K.V Cc: Steven Price Reviewed-by: Gavin Shan Tested-by: Sami Mujawar Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- arch/arm64/include/asm/io.h | 4 ++++ drivers/virt/coco/efi_secret/Kconfig | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 82276282a3c725..83e03abbb2ca92 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -274,6 +274,10 @@ int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); #define ioremap_np(addr, size) \ ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE)) + +#define ioremap_encrypted(addr, size) \ + ioremap_prot((addr), (size), PAGE_KERNEL) + /* * io{read,write}{16,32,64}be() macros */ diff --git a/drivers/virt/coco/efi_secret/Kconfig b/drivers/virt/coco/efi_secret/Kconfig index 4404d198f3b200..94d88e5da70721 100644 --- a/drivers/virt/coco/efi_secret/Kconfig +++ b/drivers/virt/coco/efi_secret/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EFI_SECRET tristate "EFI secret area securityfs support" - depends on EFI && X86_64 + depends on EFI && (X86_64 || ARM64) select EFI_COCO_SECRET select SECURITYFS help From d02c2e45b1e7767b177f6854026e4ad0d70b4a4d Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 18 Sep 2025 13:56:18 +0100 Subject: [PATCH 2249/3206] arm64: acpi: Enable ACPI CCEL support Add support for ACPI CCEL by handling the EfiACPIMemoryNVS type memory. As per UEFI specifications NVS memory is reserved for Firmware use even after exiting boot services. Thus map the region as read-only. Cc: Sami Mujawar Cc: Will Deacon Cc: Catalin Marinas Cc: Aneesh Kumar K.V Cc: Steven Price Cc: Sudeep Holla Cc: Gavin Shan Reviewed-by: Gavin Shan Tested-by: Sami Mujawar Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- arch/arm64/kernel/acpi.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 4d529ff7ba513a..b3195b3b895fae 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -357,6 +357,16 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) * as long as we take care not to create a writable * mapping for executable code. */ + fallthrough; + + case EFI_ACPI_MEMORY_NVS: + /* + * ACPI NVS marks an area reserved for use by the + * firmware, even after exiting the boot service. 
+ * This may be used by the firmware for sharing dynamic + * tables/data (e.g., ACPI CCEL) with the OS. Map it + * as read-only. + */ prot = PAGE_KERNEL_RO; break; From 352e66900cde63f3dadb142364d3c35170bbaaff Mon Sep 17 00:00:00 2001 From: Zabelin Nikita Date: Thu, 18 Sep 2025 18:06:59 +0300 Subject: [PATCH 2250/3206] drm/gma500: Fix null dereference in hdmi teardown pci_set_drvdata() sets the value of pdev->driver_data to NULL, after which the driver_data obtained from the same dev is dereferenced in oaktrail_hdmi_i2c_exit(), and the i2c_dev is extracted from it. To prevent this, swap these calls. Found by Linux Verification Center (linuxtesting.org) with Svacer. Fixes: 1b082ccf5901 ("gma500: Add Oaktrail support") Signed-off-by: Zabelin Nikita Signed-off-by: Patrik Jakobsson Link: https://lore.kernel.org/r/20250918150703.2562604-1-n.zabelin@mt-integration.ru --- drivers/gpu/drm/gma500/oaktrail_hdmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/gma500/oaktrail_hdmi.c b/drivers/gpu/drm/gma500/oaktrail_hdmi.c index 1cf39436912776..c0feca58511df3 100644 --- a/drivers/gpu/drm/gma500/oaktrail_hdmi.c +++ b/drivers/gpu/drm/gma500/oaktrail_hdmi.c @@ -726,8 +726,8 @@ void oaktrail_hdmi_teardown(struct drm_device *dev) if (hdmi_dev) { pdev = hdmi_dev->dev; - pci_set_drvdata(pdev, NULL); oaktrail_hdmi_i2c_exit(pdev); + pci_set_drvdata(pdev, NULL); iounmap(hdmi_dev->regs); kfree(hdmi_dev); pci_dev_put(pdev); From 41307ec7df057239aae3d0f089cc35a0d735cdf8 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Fri, 19 Sep 2025 11:18:51 +0800 Subject: [PATCH 2251/3206] power: supply: qcom_battmgr: handle charging state change notifications The X1E80100 battery management firmware sends a notification with code 0x83 when the battery charging state changes, such as switching between fast charge, taper charge, end of charge, or other charging error states. The same notification code is used with bit[8] set when charging stops because the charge control end threshold is reached. Additionally, a 2-bit value is included in bits[10:9] with the same code to indicate the charging source capability, which is determined from the power calculated from the voltage and current readings of the PDOs: 2 means a strong charger over 60W, 1 indicates a weak charger, and 0 means there is no charging source. These three MSBs [10:8] of the notification code are not of much use for now, so just ignore them and trigger a power supply change event whenever notification code 0x83 is received. This helps to eliminate the "unknown notification" error messages.
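For reference, here is a minimal sketch of how the full notification layout described above could be decoded with the kernel's bitfield helpers, should the extra bits become useful later; the mask names are illustrative and not part of the driver:

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

#define NOTIF_CODE_MASK		GENMASK(7, 0)	/* notification code, e.g. 0x83 */
#define NOTIF_END_THRESHOLD	BIT(8)		/* stopped at charge control end threshold */
#define NOTIF_SRC_CAP_MASK	GENMASK(10, 9)	/* 0: no source, 1: weak, 2: strong (>60W) */

static void decode_notification(u32 notification)
{
	u32 code = FIELD_GET(NOTIF_CODE_MASK, notification);
	bool end_threshold = FIELD_GET(NOTIF_END_THRESHOLD, notification);
	u32 src_cap = FIELD_GET(NOTIF_SRC_CAP_MASK, notification);

	/* ... act on code / end_threshold / src_cap as needed ... */
}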
Reported-by: Sebastian Reichel Closes: https://lore.kernel.org/all/r65idyc4of5obo6untebw4iqfj2zteiggnnzabrqtlcinvtddx@xc4aig5abesu/ Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- drivers/power/supply/qcom_battmgr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index 0fe14a109b70fc..3c2837ef346173 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -34,8 +34,9 @@ enum qcom_battmgr_variant { #define NOTIF_BAT_PROPERTY 0x30 #define NOTIF_USB_PROPERTY 0x32 #define NOTIF_WLS_PROPERTY 0x34 -#define NOTIF_BAT_INFO 0x81 #define NOTIF_BAT_STATUS 0x80 +#define NOTIF_BAT_INFO 0x81 +#define NOTIF_BAT_CHARGING_STATE 0x83 #define BATTMGR_BAT_INFO 0x9 @@ -1209,12 +1210,14 @@ static void qcom_battmgr_notification(struct qcom_battmgr *battmgr, } notification = le32_to_cpu(msg->notification); + notification &= 0xff; switch (notification) { case NOTIF_BAT_INFO: battmgr->info.valid = false; fallthrough; case NOTIF_BAT_STATUS: case NOTIF_BAT_PROPERTY: + case NOTIF_BAT_CHARGING_STATE: power_supply_changed(battmgr->bat_psy); break; case NOTIF_USB_PROPERTY: From 2d81a24a74e577b0d34266059ff95f56150b40f9 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Fri, 19 Sep 2025 01:33:22 +0800 Subject: [PATCH 2252/3206] driver: reset: th1520-aon: add driver for poweroff/reboot via AON FW This driver implements poweroff/reboot support for T-Head TH1520 SoCs running the AON firmware by sending a message to the AON firmware's WDG part. This is an auxiliary device driver, and it expects the AON channel to be passed via the platform_data of the auxiliary device. Signed-off-by: Icenowy Zheng Acked-by: Sebastian Reichel Signed-off-by: Ulf Hansson --- MAINTAINERS | 1 + drivers/power/reset/Kconfig | 7 ++ drivers/power/reset/Makefile | 1 + drivers/power/reset/th1520-aon-reboot.c | 98 +++++++++++++++++++++++++ 4 files changed, 107 insertions(+) create mode 100644 drivers/power/reset/th1520-aon-reboot.c diff --git a/MAINTAINERS b/MAINTAINERS index 918df6d74c2a50..a84bfa671dfdd9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21732,6 +21732,7 @@ F: drivers/mailbox/mailbox-th1520.c F: drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c F: drivers/pinctrl/pinctrl-th1520.c F: drivers/pmdomain/thead/ +F: drivers/power/reset/th1520-aon-reboot.c F: drivers/power/sequencing/pwrseq-thead-gpu.c F: drivers/reset/reset-th1520.c F: include/dt-bindings/clock/thead,th1520-clk-ap.h diff --git a/drivers/power/reset/Kconfig b/drivers/power/reset/Kconfig index 77ea3129c70806..8248895ca90389 100644 --- a/drivers/power/reset/Kconfig +++ b/drivers/power/reset/Kconfig @@ -225,6 +225,13 @@ config POWER_RESET_ST help Reset support for STMicroelectronics boards. +config POWER_RESET_TH1520_AON + tristate "T-Head TH1520 AON firmware poweroff and reset driver" + depends on TH1520_PM_DOMAINS + help + This driver supports power-off and reset operations for T-Head + TH1520 SoCs running the AON firmware.
+ config POWER_RESET_TORADEX_EC tristate "Toradex Embedded Controller power-off and reset driver" depends on ARCH_MXC || COMPILE_TEST diff --git a/drivers/power/reset/Makefile b/drivers/power/reset/Makefile index b7c2b5940be997..51da87e05ce76b 100644 --- a/drivers/power/reset/Makefile +++ b/drivers/power/reset/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_POWER_RESET_QNAP) += qnap-poweroff.o obj-$(CONFIG_POWER_RESET_REGULATOR) += regulator-poweroff.o obj-$(CONFIG_POWER_RESET_RESTART) += restart-poweroff.o obj-$(CONFIG_POWER_RESET_ST) += st-poweroff.o +obj-$(CONFIG_POWER_RESET_TH1520_AON) += th1520-aon-reboot.o obj-$(CONFIG_POWER_RESET_TORADEX_EC) += tdx-ec-poweroff.o obj-$(CONFIG_POWER_RESET_TPS65086) += tps65086-restart.o obj-$(CONFIG_POWER_RESET_VERSATILE) += arm-versatile-reboot.o diff --git a/drivers/power/reset/th1520-aon-reboot.c b/drivers/power/reset/th1520-aon-reboot.c new file mode 100644 index 00000000000000..ec249667a0ffd7 --- /dev/null +++ b/drivers/power/reset/th1520-aon-reboot.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * T-HEAD TH1520 AON Firmware Reboot Driver + * + * Copyright (c) 2025 Icenowy Zheng + */ + +#include +#include +#include +#include +#include +#include +#include + +#define TH1520_AON_REBOOT_PRIORITY 200 + +struct th1520_aon_msg_empty_body { + struct th1520_aon_rpc_msg_hdr hdr; + u16 reserved[12]; +} __packed __aligned(1); + +static int th1520_aon_pwroff_handler(struct sys_off_data *data) +{ + struct th1520_aon_chan *aon_chan = data->cb_data; + struct th1520_aon_msg_empty_body msg = {}; + + msg.hdr.svc = TH1520_AON_RPC_SVC_WDG; + msg.hdr.func = TH1520_AON_WDG_FUNC_POWER_OFF; + msg.hdr.size = TH1520_AON_RPC_MSG_NUM; + + th1520_aon_call_rpc(aon_chan, &msg); + + return NOTIFY_DONE; +} + +static int th1520_aon_restart_handler(struct sys_off_data *data) +{ + struct th1520_aon_chan *aon_chan = data->cb_data; + struct th1520_aon_msg_empty_body msg = {}; + + msg.hdr.svc = TH1520_AON_RPC_SVC_WDG; + msg.hdr.func = TH1520_AON_WDG_FUNC_RESTART; + msg.hdr.size = TH1520_AON_RPC_MSG_NUM; + + th1520_aon_call_rpc(aon_chan, &msg); + + return NOTIFY_DONE; +} + +static int th1520_aon_reboot_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct device *dev = &adev->dev; + int ret; + + /* Expect struct th1520_aon_chan to be passed via platform_data */ + ret = devm_register_sys_off_handler(dev, SYS_OFF_MODE_POWER_OFF, + TH1520_AON_REBOOT_PRIORITY, + th1520_aon_pwroff_handler, + adev->dev.platform_data); + + if (ret) { + dev_err(dev, "Failed to register power off handler\n"); + return ret; + } + + ret = devm_register_sys_off_handler(dev, SYS_OFF_MODE_RESTART, + TH1520_AON_REBOOT_PRIORITY, + th1520_aon_restart_handler, + adev->dev.platform_data); + + if (ret) { + dev_err(dev, "Failed to register restart handler\n"); + return ret; + } + + return 0; +} + +static const struct auxiliary_device_id th1520_aon_reboot_id_table[] = { + { .name = "th1520_pm_domains.reboot" }, + {}, +}; +MODULE_DEVICE_TABLE(auxiliary, th1520_aon_reboot_id_table); + +static struct auxiliary_driver th1520_aon_reboot_driver = { + .driver = { + .name = "th1520-aon-reboot", + }, + .probe = th1520_aon_reboot_probe, + .id_table = th1520_aon_reboot_id_table, +}; +module_auxiliary_driver(th1520_aon_reboot_driver); + +MODULE_AUTHOR("Icenowy Zheng "); +MODULE_DESCRIPTION("T-HEAD TH1520 AON-firmware-based reboot driver"); +MODULE_LICENSE("GPL"); From 64581f41f4c4aa1845edeee6bb0c8f2a7103d9aa Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Fri, 19 Sep 2025 01:33:23 +0800 
Subject: [PATCH 2253/3206] pmdomain: thead: create auxiliary device for rebooting The reboot / power off operations require communication with the AON firmware too. As the driver is already present, create an auxiliary device with name "reboot" to match that driver, and pass the AON channel by using platform_data. Signed-off-by: Icenowy Zheng Signed-off-by: Ulf Hansson --- drivers/pmdomain/thead/th1520-pm-domains.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/pmdomain/thead/th1520-pm-domains.c b/drivers/pmdomain/thead/th1520-pm-domains.c index 9040b698e7f7f2..5213994101a591 100644 --- a/drivers/pmdomain/thead/th1520-pm-domains.c +++ b/drivers/pmdomain/thead/th1520-pm-domains.c @@ -173,6 +173,16 @@ static int th1520_pd_pwrseq_gpu_init(struct device *dev) adev); } +static int th1520_pd_reboot_init(struct device *dev, + struct th1520_aon_chan *aon_chan) +{ + struct auxiliary_device *adev; + + adev = devm_auxiliary_device_create(dev, "reboot", aon_chan); + + return PTR_ERR_OR_ZERO(adev); +} + static int th1520_pd_probe(struct platform_device *pdev) { struct generic_pm_domain **domains; @@ -235,6 +245,10 @@ static int th1520_pd_probe(struct platform_device *pdev) if (ret) goto err_clean_provider; + ret = th1520_pd_reboot_init(dev, aon_chan); + if (ret) + goto err_clean_provider; + return 0; err_clean_provider: From b9cb7e59ac4ae68940347ebfc41e0436d32d3c6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Wed, 10 Sep 2025 21:26:05 +0200 Subject: [PATCH 2254/3206] pid: use ns_capable_noaudit() when determining net sysctl permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The capability check should not be audited since it is only being used to determine the inode permissions. A failed check does not indicate a violation of security policy but, when an LSM is enabled, a denial audit message was being generated. The denial audit message can either lead to the capability being unnecessarily allowed in a security policy, or being silenced potentially masking a legitimate capability check at a later point in time. Similar to commit d6169b0206db ("net: Use ns_capable_noaudit() when determining net sysctl permissions") Fixes: 7863dcc72d0f ("pid: allow pid_max to be set per pid namespace") CC: Christian Brauner CC: linux-security-module@vger.kernel.org CC: selinux@vger.kernel.org Signed-off-by: Christian Göttsche Acked-by: Serge Hallyn Reviewed-by: Paul Moore Signed-off-by: Christian Brauner --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/pid.c b/kernel/pid.c index c45a28c16cd256..d94ce025050127 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -680,7 +680,7 @@ static int pid_table_root_permissions(struct ctl_table_header *head, container_of(head->set, struct pid_namespace, set); int mode = table->mode; - if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) || + if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) || uid_eq(current_euid(), make_kuid(pidns->user_ns, 0))) mode = (mode & S_IRWXU) >> 6; else if (in_egroup_p(make_kgid(pidns->user_ns, 0))) From e1b849cfa6b61f1c866a908c9e8dd9b5aaab820b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Apr 2025 17:12:59 +0200 Subject: [PATCH 2255/3206] writeback: Avoid contention on wb->list_lock when switching inodes There can be multiple inode switch works that are trying to switch inodes to / from the same wb. 
This can happen in particular if a cgroup that owns many (thousands of) inodes exits and we need to switch them all. In this case several inode_switch_wbs_work_fn() instances will just spin on the same wb->list_lock while only one of them makes forward progress. This wastes CPU cycles and quickly leads to softlockup reports and an unusable system. Instead of running several inode_switch_wbs_work_fn() instances in parallel switching to the same wb and contending on wb->list_lock, run just one work item per wb and manage a queue of isw items switching to this wb. Acked-by: Tejun Heo Signed-off-by: Jan Kara --- fs/fs-writeback.c | 99 ++++++++++++++++++++------------ include/linux/backing-dev-defs.h | 4 ++ include/linux/writeback.h | 2 + mm/backing-dev.c | 5 ++ 4 files changed, 74 insertions(+), 36 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641d7..b0e9092ccf0401 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct rcu_work work; + /* List of queued switching contexts for the wb */ + struct llist_node list; /* * Multiple inodes can be switched at once. The switching procedure @@ -378,7 +379,6 @@ struct inode_switch_wbs_context { * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ - struct bdi_writeback *new_wb; struct inode *inodes[]; }; @@ -486,13 +486,11 @@ static bool inode_do_switch_wbs(struct inode *inode, return switched; } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static void process_inode_switch_wbs(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw) { - struct inode_switch_wbs_context *isw = - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; struct inode **inodep; @@ -543,6 +541,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) atomic_dec(&isw_nr_in_flight); } +void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, + switch_work); + struct inode_switch_wbs_context *isw, *next_isw; + struct llist_node *list; + + /* + * Grab our reference to wb so that it cannot get freed under us + * after we process all the isw items. + */ + wb_get(new_wb); + while (1) { + list = llist_del_all(&new_wb->switch_wbs_ctxs); + /* Nothing to do? */ + if (!list) + break; + /* + * In addition to synchronizing among switchers, I_WB_SWITCH + * tells the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be
+ */ + synchronize_rcu(); + + llist_for_each_entry_safe(isw, next_isw, list, list) + process_inode_switch_wbs(new_wb, isw); + } + wb_put(new_wb); +} + static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { @@ -572,6 +602,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode, return true; } +static void wb_queue_isw(struct bdi_writeback *wb, + struct inode_switch_wbs_context *isw) +{ + if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) + queue_work(isw_wq, &wb->switch_work); +} + /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -585,6 +622,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) @@ -609,40 +647,34 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!memcg_css) goto out_free; - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); - if (!isw->new_wb) + if (!new_wb) goto out_free; - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) goto out_free; isw->inodes[0] = inode; - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + wb_queue_isw(new_wb, isw); return; out_free: atomic_dec(&isw_nr_in_flight); - if (isw->new_wb) - wb_put(isw->new_wb); + if (new_wb) + wb_put(new_wb); kfree(isw); } -static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, +static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) continue; isw->inodes[*nr] = inode; @@ -666,6 +698,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb; int nr; bool restart = false; @@ -678,12 +711,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) for (memcg_css = wb->memcg_css->parent; memcg_css; memcg_css = memcg_css->parent) { - isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); - if (isw->new_wb) + new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (new_wb) break; } - if (unlikely(!isw->new_wb)) - isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + if (unlikely(!new_wb)) + new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; spin_lock(&wb->list_lock); @@ -695,27 +728,21 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * bandwidth restrictions, as writeback of inode metadata is not * accounted for. 
*/ - restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr); if (!restart) - restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, + &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ if (nr == 0) { atomic_dec(&isw_nr_in_flight); - wb_put(isw->new_wb); + wb_put(new_wb); kfree(isw); return restart; } - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + wb_queue_isw(new_wb, isw); return restart; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 2ad261082bba5f..c5c9d89c73edcc 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -152,6 +152,10 @@ struct bdi_writeback { struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ struct list_head offline_node; /* anchored at offline_cgwbs */ + struct work_struct switch_work; /* work used to perform inode switching + * to this wb */ + struct llist_head switch_wbs_ctxs; /* queued contexts for + * writeback switching */ union { struct work_struct release_work; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a2848d731a4663..15a4bc4ab81953 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } +void inode_switch_wbs_work_fn(struct work_struct *work); + #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef86..0beaca6bacf778 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work) wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); + WARN_ON_ONCE(work_pending(&wb->switch_work)); call_rcu(&wb->rcu, cgwb_free_rcu); } @@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi, wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); + INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn); + init_llist_head(&wb->switch_wbs_ctxs); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); @@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; + INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn); + init_llist_head(&bdi->wb.switch_wbs_ctxs); } return ret; } From 66c14dccd810d42ec5c73bb8a9177489dfd62278 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Sep 2025 12:38:36 +0200 Subject: [PATCH 2256/3206] writeback: Avoid softlockup when switching many inodes process_inode_switch_wbs_work() can be switching over 100 inodes to a different cgroup. Since switching an inode requires counting all dirty & under-writeback pages in the address space of each inode, this can take a significant amount of time. 
Add a possibility to reschedule after processing each inode to avoid softlockups. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b0e9092ccf0401..36ef1a796d4b98 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -500,6 +500,7 @@ static void process_inode_switch_wbs(struct bdi_writeback *new_wb, */ down_read(&bdi->wb_switch_rwsem); + inodep = isw->inodes; /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions @@ -510,6 +511,7 @@ static void process_inode_switch_wbs(struct bdi_writeback *new_wb, * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ +relock: if (old_wb < new_wb) { spin_lock(&old_wb->list_lock); spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); @@ -518,10 +520,17 @@ static void process_inode_switch_wbs(struct bdi_writeback *new_wb, spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } - for (inodep = isw->inodes; *inodep; inodep++) { + while (*inodep) { WARN_ON_ONCE((*inodep)->i_wb != old_wb); if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) nr_switched++; + inodep++; + if (*inodep && need_resched()) { + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + cond_resched(); + goto relock; + } } spin_unlock(&new_wb->list_lock); From 9a6ebbdbd41235ea3bc0c4f39e2076599b8113cc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Sep 2025 12:38:37 +0200 Subject: [PATCH 2257/3206] writeback: Avoid excessively long inode switching times With lazytime mount option enabled we can be switching many dirty inodes on cgroup exit to the parent cgroup. The numbers observed in practice when systemd slice of a large cron job exits can easily reach hundreds of thousands or millions. The logic in inode_do_switch_wbs() which sorts the inode into appropriate place in b_dirty list of the target wb however has linear complexity in the number of dirty inodes thus overall time complexity of switching all the inodes is quadratic leading to workers being pegged for hours consuming 100% of the CPU and switching inodes to the parent wb. Simple reproducer of the issue: FILES=10000 # Filesystem mounted with lazytime mount option MNT=/mnt/ echo "Creating files and switching timestamps" for (( j = 0; j < 50; j ++ )); do mkdir $MNT/dir$j for (( i = 0; i < $FILES; i++ )); do echo "foo" >$MNT/dir$j/file$i done touch -a -t 202501010000 $MNT/dir$j/file* done wait echo "Syncing and flushing" sync echo 3 >/proc/sys/vm/drop_caches echo "Reading all files from a cgroup" mkdir /sys/fs/cgroup/unified/mycg1 || exit echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit for (( j = 0; j < 50; j ++ )); do cat /mnt/dir$j/file* >/dev/null & done wait echo "Switching wbs" # Now rmdir the cgroup after the script exits We need to maintain b_dirty list ordering to keep writeback happy so instead of sorting inode into appropriate place just append it at the end of the list and clobber dirtied_time_when. This may result in inode writeback starting later after cgroup switch however cgroup switches are rare so it shouldn't matter much. Since the cgroup had write access to the inode, there are no practical concerns of the possible DoS issues. 
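In short, the fix replaces a per-inode ordered insertion with a constant-time tail append. A condensed sketch of the two strategies, simplified from the code in the diff below (the kernel uses inode_io_list_move_locked() rather than raw list ops):

struct inode *pos;

/* Before: O(n) scan of new_wb->b_dirty for every switched inode,
 * so switching n inodes costs O(n^2) overall. */
list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
	if (time_after_eq(inode->dirtied_when, pos->dirtied_when))
		break;

/* After: O(1) per inode. Appending at the tail and setting the
 * timestamp to "now" keeps the list ordering invariant intact. */
inode->dirtied_time_when = jiffies;
list_move_tail(&inode->i_io_list, &new_wb->b_dirty);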
Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 36ef1a796d4b98..af5f396449f16e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode, * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, * the specific list @inode was on is ignored and the @inode is put on * ->b_dirty which is always correct including from ->b_dirty_time. - * The transfer preserves @inode->dirtied_when ordering. If the @inode - * was clean, it means it was on the b_attached list, so move it onto - * the b_attached list of @new_wb. + * If the @inode was clean, it means it was on the b_attached list, so + * move it onto the b_attached list of @new_wb. */ if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; if (inode->i_state & I_DIRTY_ALL) { - struct inode *pos; - - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) - if (time_after_eq(inode->dirtied_when, - pos->dirtied_when)) - break; + /* + * We need to keep b_dirty list sorted by + * dirtied_time_when. However properly sorting the + * inode in the list gets too expensive when switching + * many inodes. So just attach inode at the end of the + * dirty list and clobber the dirtied_time_when. + */ + inode->dirtied_time_when = jiffies; inode_io_list_move_locked(inode, new_wb, - pos->i_io_list.prev); + &new_wb->b_dirty); } else { inode_cgwb_move_to_attached(inode, new_wb); } From 0cee64c547e3c9cda646af3e075a64f445ee8148 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Sep 2025 12:38:38 +0200 Subject: [PATCH 2258/3206] writeback: Add tracepoint to track pending inode switches Add trace_inode_switch_wbs_queue tracepoint to allow insight into how many inodes are queued to switch their bdi_writeback structure. 
Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 2 ++ include/trace/events/writeback.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index af5f396449f16e..52129267e3bd8a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -667,6 +667,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) isw->inodes[0] = inode; + trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1); wb_queue_isw(new_wb, isw); return; @@ -752,6 +753,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) return restart; } + trace_inode_switch_wbs_queue(wb, new_wb, nr); wb_queue_isw(new_wb, isw); return restart; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 1e23919c0da981..c08aff044e8073 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history, ) ); +TRACE_EVENT(inode_switch_wbs_queue, + + TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb, + unsigned int count), + + TP_ARGS(old_wb, new_wb, count), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(ino_t, old_cgroup_ino) + __field(ino_t, new_cgroup_ino) + __field(unsigned int, count) + ), + + TP_fast_assign( + strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); + __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); + __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); + __entry->count = count; + ), + + TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u", + __entry->name, + (unsigned long)__entry->old_cgroup_ino, + (unsigned long)__entry->new_cgroup_ino, + __entry->count + ) +); + TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, From 777fb19ed8d6f193c2811b76371ffb41acbca1da Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 18 Sep 2025 20:42:07 +0100 Subject: [PATCH 2259/3206] kselftest/arm64: Add lsfe to the hwcaps test This feature has no traps associated with it so the SIGILL is not reliable. 
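For context, userspace detects this capability the same way the selftest does, through the third hwcap word. A minimal detection sketch, assuming a libc that already exposes AT_HWCAP3:

#include <stdio.h>
#include <sys/auxv.h>
#include <asm/hwcap.h>

int main(void)
{
	/* HWCAP3_LSFE advertises FEAT_LSFE, the atomic floating-point
	 * memory operations probed via STFADD in the diff below. */
	if (getauxval(AT_HWCAP3) & HWCAP3_LSFE)
		puts("lsfe supported");
	else
		puts("lsfe not supported");
	return 0;
}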
Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/hwcap.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 27d4790c2f0c22..3b96d090c5ebe7 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -17,6 +17,8 @@ #include #include +#include + #include "../../kselftest.h" #define TESTS_PER_HWCAP 3 @@ -168,6 +170,18 @@ static void lse128_sigill(void) : "cc", "memory"); } +static void lsfe_sigill(void) +{ + float __attribute__ ((aligned (16))) mem; + register float *memp asm ("x0") = &mem; + + /* STFADD H0, [X0] */ + asm volatile(".inst 0x7c20801f" + : "+r" (memp) + : + : "memory"); +} + static void lut_sigill(void) { /* LUTI2 V0.16B, { V0.16B }, V[0] */ @@ -761,6 +775,13 @@ static const struct hwcap_data { .cpuinfo = "lse128", .sigill_fn = lse128_sigill, }, + { + .name = "LSFE", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_LSFE, + .cpuinfo = "lsfe", + .sigill_fn = lsfe_sigill, + }, { .name = "LUT", .at_hwcap = AT_HWCAP2, From 2c139a47eff8de24e3350dadb4c9d5e3426db826 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Fri, 19 Sep 2025 17:03:52 +0800 Subject: [PATCH 2260/3206] io_uring: fix incorrect io_kiocb reference in io_link_skb In io_link_skb function, there is a bug where prev_notif is incorrectly assigned using 'nd' instead of 'prev_nd'. This causes the context validation check to compare the current notification with itself instead of comparing it with the previous notification. Fix by using the correct prev_nd parameter when obtaining prev_notif. Signed-off-by: Yang Xiuwei Reviewed-by: Pavel Begunkov Fixes: 6fe4220912d19 ("io_uring/notif: implement notification stacking") Signed-off-by: Jens Axboe --- io_uring/notif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/notif.c b/io_uring/notif.c index 9a6f6e92d74242..ea9c0116cec2df 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -85,7 +85,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) return -EEXIST; prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); - prev_notif = cmd_to_io_kiocb(nd); + prev_notif = cmd_to_io_kiocb(prev_nd); /* make sure all noifications can be finished in the same task_work */ if (unlikely(notif->ctx != prev_notif->ctx || From 2ef435a872abc347dc0a92f1c213bb0af3cbf195 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 17 Sep 2025 17:36:31 +0200 Subject: [PATCH 2261/3206] fs: add might_sleep() annotation to iput() and more When iput() drops the reference counter to zero, it may sleep via inode_wait_for_writeback(). This happens rarely because it's usually the dcache which evicts inodes, but really iput() should only ever be called in contexts where sleeping is allowed. This annotation allows finding buggy callers. Additionally, this patch annotates a few low-level functions that can call iput() conditionally. 
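To illustrate the class of bug this catches, consider a hypothetical caller (not from this patch) that drops an inode reference under a spinlock. With CONFIG_DEBUG_ATOMIC_SLEEP enabled, the new might_sleep() now warns on every such call, not only on the rare path where the last reference is actually dropped:

spin_lock(&foo->lock);
iput(inode);	/* "BUG: sleeping function called from invalid context" */
spin_unlock(&foo->lock);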
Cc: Mateusz Guzik Signed-off-by: Max Kellermann Link: https://lore.kernel.org/20250917153632.2228828-1-max.kellermann@ionos.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index bf750376020665..1a9d4fa5e0cb1c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1279,6 +1279,8 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; + might_sleep(); + again: spin_lock(&inode_hash_lock); old = find_inode(inode->i_sb, head, test, data, true); @@ -1382,6 +1384,8 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; + might_sleep(); + again: inode = find_inode(sb, head, test, data, false); if (inode) { @@ -1422,6 +1426,9 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + + might_sleep(); + again: inode = find_inode_fast(sb, head, ino, false); if (inode) { @@ -1605,6 +1612,9 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; + + might_sleep(); + again: inode = ilookup5_nowait(sb, hashval, test, data); if (inode) { @@ -1630,6 +1640,9 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + + might_sleep(); + again: inode = find_inode_fast(sb, head, ino, false); @@ -1780,6 +1793,8 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); + might_sleep(); + while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); @@ -1826,6 +1841,8 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, { struct inode *old; + might_sleep(); + inode->i_state |= I_CREATING; old = inode_insert5(inode, hashval, test, NULL, data); @@ -1908,6 +1925,7 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { + might_sleep(); if (unlikely(!inode)) return; From 231af8c14f0f1574ce5d5e66b48f0087a6a764e3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 16 Sep 2025 08:00:29 -0700 Subject: [PATCH 2262/3206] iomap: trace iomap_zero_iter zeroing activities Trace which bytes actually get zeroed. Signed-off-by: Darrick J. 
Wong Link: https://lore.kernel.org/175803480303.966383.2380024013746734540.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 3 +++ fs/iomap/trace.h | 1 + 2 files changed, 4 insertions(+) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index fd827398afd2ff..ed7dd6426b0eec 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1396,6 +1396,9 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, /* warn about zeroing folios beyond eof that won't write back */ WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); + trace_iomap_zero_iter(iter->inode, folio_pos(folio) + offset, + bytes); + folio_zero_range(folio, offset, bytes); folio_mark_accessed(folio); diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 6ad66e6ba653e8..a61c1dae474270 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -84,6 +84,7 @@ DEFINE_RANGE_EVENT(iomap_release_folio); DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); DEFINE_RANGE_EVENT(iomap_dio_rw_queued); +DEFINE_RANGE_EVENT(iomap_zero_iter); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ From 6a96fb653b6481ec73e9627ade216b299e4de9ea Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 16 Sep 2025 08:00:45 -0700 Subject: [PATCH 2263/3206] iomap: error out on file IO when there is no inline_data buffer Return IO errors if an ->iomap_begin implementation returns an IOMAP_INLINE buffer but forgets to set the inline_data pointer. Filesystems should never do this, but we could help fs developers (me) fix their bugs by handling this more gracefully than crashing the kernel. Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/175803480324.966383.7414345025943296442.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 15 ++++++++++----- fs/iomap/direct-io.c | 3 +++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index ed7dd6426b0eec..8b847a1e27f13e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -304,6 +304,9 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, size_t size = i_size_read(iter->inode) - iomap->offset; size_t offset = offset_in_folio(folio, iomap->offset); + if (WARN_ON_ONCE(!iomap->inline_data)) + return -EIO; + if (folio_test_uptodate(folio)) return 0; @@ -894,7 +897,7 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, return true; } -static void iomap_write_end_inline(const struct iomap_iter *iter, +static bool iomap_write_end_inline(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t copied) { const struct iomap *iomap = &iter->iomap; @@ -903,12 +906,16 @@ static void iomap_write_end_inline(const struct iomap_iter *iter, WARN_ON_ONCE(!folio_test_uptodate(folio)); BUG_ON(!iomap_inline_data_valid(iomap)); + if (WARN_ON_ONCE(!iomap->inline_data)) + return false; + flush_dcache_folio(folio); addr = kmap_local_folio(folio, pos); memcpy(iomap_inline_data(iomap, pos), addr, copied); kunmap_local(addr); mark_inode_dirty(iter->inode); + return true; } /* @@ -921,10 +928,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; - if (srcmap->type == IOMAP_INLINE) { - iomap_write_end_inline(iter, folio, pos, copied); - return true; - } + if (srcmap->type == IOMAP_INLINE) + return 
iomap_write_end_inline(iter, folio, pos, copied); if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { size_t bh_written; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 6f25d4cfea9f7e..836f082651e52f 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -519,6 +519,9 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) loff_t pos = iomi->pos; u64 copied; + if (WARN_ON_ONCE(!inline_data)) + return -EIO; + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; From fa8ee8627b7411f2157aad59d3d2a68ab00adeeb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:27 +0200 Subject: [PATCH 2264/3206] block: use extensible_ioctl_valid() Use the new extensible_ioctl_valid() helper which is equivalent to what is done here. Reviewed-by: Jens Axboe Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/blk-integrity.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 056b8948369d55..ce08ad4565e283 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -58,16 +58,14 @@ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp) { - struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk); + struct blk_integrity *bi; struct logical_block_metadata_cap meta_cap = {}; size_t usize = _IOC_SIZE(cmd); - if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) || - _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) || - _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) || - _IOC_SIZE(cmd) < LBMD_SIZE_VER0) + if (!extensible_ioctl_valid(cmd, FS_IOC_GETLBMD_CAP, LBMD_SIZE_VER0)) return -ENOIOCTLCMD; + bi = blk_get_integrity(bdev->bd_disk); if (!bi) goto out; From e3e1812f8e25ac277f5cc9249802365300c582e3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:28 +0200 Subject: [PATCH 2265/3206] ns: move to_ns_common() to ns_common.h Move the helper to ns_common.h where it belongs. 
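The macro is a C11 _Generic selection, so one spelling works for every namespace type and misuse fails at compile time. A usage sketch, assuming the pointers already reference live namespaces:

struct mnt_namespace *mnt_ns;	/* assume these point at live namespaces */
struct net *net;

/* Both expand to &argument->ns; the branch is chosen at compile time,
 * and passing an unlisted pointer type is a compile-time error. */
struct ns_common *a = to_ns_common(mnt_ns);
struct ns_common *b = to_ns_common(net);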
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 20 ++++++++++++++++++++ include/linux/nsproxy.h | 11 ----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7d22ea50b09841..bc2e0758e1c939 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -6,6 +6,15 @@ struct proc_ns_operations; +struct cgroup_namespace; +struct ipc_namespace; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct time_namespace; +struct user_namespace; +struct uts_namespace; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -13,4 +22,15 @@ struct ns_common { refcount_t count; }; +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns) + #endif diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a2265..e6bec522b1391f 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -42,17 +42,6 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns->ns), \ - struct ipc_namespace *: &(__ns->ns), \ - struct net *: &(__ns->ns), \ - struct pid_namespace *: &(__ns->ns), \ - struct mnt_namespace *: &(__ns->ns), \ - struct time_namespace *: &(__ns->ns), \ - struct user_namespace *: &(__ns->ns), \ - struct uts_namespace *: &(__ns->ns)) - /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. From 9296f46a9645cf753d2522093485cebe77635aa6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:29 +0200 Subject: [PATCH 2266/3206] nsfs: add nsfs.h header And move the stuff out from proc_ns.h where it really doesn't belong. 
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/nsfs.h | 26 ++++++++++++++++++++++++++ include/linux/proc_ns.h | 13 +------------ 2 files changed, 27 insertions(+), 12 deletions(-) create mode 100644 include/linux/nsfs.h diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h new file mode 100644 index 00000000000000..fb84aa538091ff --- /dev/null +++ b/include/linux/nsfs.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner */ + +#ifndef _LINUX_NSFS_H +#define _LINUX_NSFS_H + +#include + +struct path; +struct task_struct; +struct proc_ns_operations; + +int ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +typedef struct ns_common *ns_get_path_helper_t(void *); +int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, + void *private_data); + +bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); + +int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +void nsfs_init(void); + +#endif /* _LINUX_NSFS_H */ + diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 4b20375f3783e7..5e1a4b378b790b 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -5,7 +5,7 @@ #ifndef _LINUX_PROC_NS_H #define _LINUX_PROC_NS_H -#include +#include #include struct pid_namespace; @@ -75,16 +75,5 @@ static inline int ns_alloc_inum(struct ns_common *ns) #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) -extern int ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -typedef struct ns_common *ns_get_path_helper_t(void *); -extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, - void *private_data); - -extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); - -extern int ns_get_name(char *buf, size_t size, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -extern void nsfs_init(void); #endif /* _LINUX_PROC_NS_H */ From 660def10b01b248fd97255afacb7b0e305ac833a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:30 +0200 Subject: [PATCH 2267/3206] ns: uniformly initialize ns_common No point in cargo-culting the same code across all the different types. Use one common initializer. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 5e1a4b378b790b..dbb119bda097f3 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -72,6 +72,22 @@ static inline int ns_alloc_inum(struct ns_common *ns) return proc_alloc_inum(&ns->inum); } +static inline int ns_common_init(struct ns_common *ns, + const struct proc_ns_operations *ops, + bool alloc_inum) +{ + if (alloc_inum) { + int ret; + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + } + refcount_set(&ns->count, 1); + ns->stashed = NULL; + ns->ops = ops; + return 0; +} + #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) From 0b40774ef06cee7c3334dcee068f06d331ff2749 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:31 +0200 Subject: [PATCH 2268/3206] cgroup: use ns_common_init() Don't cargo-cult the same thing over and over. 
Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/cgroup/namespace.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c664..0391b6ab0bf1be 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -21,20 +21,16 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts) static struct cgroup_namespace *alloc_cgroup_ns(void) { - struct cgroup_namespace *new_ns; + struct cgroup_namespace *new_ns __free(kfree) = NULL; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); + ret = ns_common_init(&new_ns->ns, &cgroupns_operations, true); + if (ret) return ERR_PTR(ret); - } - refcount_set(&new_ns->ns.count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; + return no_free_ptr(new_ns); } void free_cgroup_ns(struct cgroup_namespace *ns) From 90d4d9f4d235673da83574f0f84be3e37f955a39 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:32 +0200 Subject: [PATCH 2269/3206] ipc: use ns_common_init() Don't cargo-cult the same thing over and over. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- ipc/namespace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ipc/namespace.c b/ipc/namespace.c index 4df91ceeeafe9f..d4188a88ee57b3 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -61,12 +61,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(&ns->ns, &ipcns_operations, true); if (err) goto fail_free; - ns->ns.ops = &ipcns_operations; - refcount_set(&ns->ns.count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; From 96ece8eb676453d09036c9a1eed7b680267f7778 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:33 +0200 Subject: [PATCH 2270/3206] mnt: use ns_common_init() Don't cargo-cult the same thing over and over. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d33837..14c5cdbdd6e16f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4177,18 +4177,15 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } - if (!anon) { - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - dec_mnt_namespaces(ucounts); - return ERR_PTR(ret); - } + + ret = ns_common_init(&new_ns->ns, &mntns_operations, !anon); + if (ret) { + kfree(new_ns); + dec_mnt_namespaces(ucounts); + return ERR_PTR(ret); } - new_ns->ns.ops = &mntns_operations; if (!anon) new_ns->seq = atomic64_inc_return(&mnt_ns_seq); - refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; INIT_LIST_HEAD(&new_ns->mnt_ns_list); From 08027f6b790be1e444e4182fb4dc53faa6539d16 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:34 +0200 Subject: [PATCH 2271/3206] net: use ns_common_init() Don't cargo-cult the same thing over and over. 
Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 46 ++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1b6f3826dd0e1f..5fb7bd8ac45afc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -397,10 +397,22 @@ static __net_init void preinit_net_sysctl(struct net *net) } /* init code that must occur even if setup_net() is not called. */ -static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns) +static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) { + const struct proc_ns_operations *ns_ops; + int ret; + +#ifdef CONFIG_NET_NS + ns_ops = &netns_operations; +#else + ns_ops = NULL; +#endif + + ret = ns_common_init(&net->ns, ns_ops, false); + if (ret) + return ret; + refcount_set(&net->passive, 1); - refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); @@ -420,6 +432,7 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ INIT_LIST_HEAD(&net->ptype_all); INIT_LIST_HEAD(&net->ptype_specific); preinit_net_sysctl(net); + return 0; } /* @@ -559,7 +572,9 @@ struct net *copy_net_ns(unsigned long flags, goto dec_ucounts; } - preinit_net(net, user_ns); + rv = preinit_net(net, user_ns); + if (rv < 0) + goto dec_ucounts; net->ucounts = ucounts; get_user_ns(user_ns); @@ -812,15 +827,15 @@ static void net_ns_net_debugfs(struct net *net) static __net_init int net_ns_net_init(struct net *net) { -#ifdef CONFIG_NET_NS - net->ns.ops = &netns_operations; -#endif - net->ns.inum = PROC_NET_INIT_INO; - if (net != &init_net) { - int ret = ns_alloc_inum(&net->ns); - if (ret) - return ret; - } + int ret = 0; + + if (net == &init_net) + net->ns.inum = PROC_NET_INIT_INO; + else + ret = proc_alloc_inum(&to_ns_common(net)->inum); + if (ret) + return ret; + net_ns_net_debugfs(net); return 0; } @@ -1282,7 +1297,12 @@ void __init net_ns_init(void) #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif - preinit_net(&init_net, &init_user_ns); + /* + * This currently cannot fail as the initial network namespace + * has a static inode number. + */ + if (preinit_net(&init_net, &init_user_ns)) + panic("Could not preinitialize the initial network namespace"); down_write(&pernet_ops_rwsem); if (setup_net(&init_net)) From 8e199cd6e3303d8f9352030ab9e4a221f277bd51 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:35 +0200 Subject: [PATCH 2272/3206] pid: use ns_common_init() Don't cargo-cult the same thing over and over. 
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/pid_namespace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717d3..20ce4052d1c513 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -102,17 +102,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(&ns->ns, &pidns_operations, true); if (err) goto out_free_idr; - ns->ns.ops = &pidns_operations; ns->pid_max = PID_MAX_LIMIT; err = register_pidns_sysctls(ns); if (err) goto out_free_inum; - refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); From 7b0e2c83624b02a8a11f04f0581721d95ea6e4a6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:36 +0200 Subject: [PATCH 2273/3206] time: use ns_common_init() Don't cargo-cult the same thing over and over. Reviewed-by: Jan Kara Reviewed-by: Thomas Gleixner Signed-off-by: Christian Brauner --- kernel/time/namespace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 667452768ed3b5..0be93d8f2896d6 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -88,22 +88,19 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; - refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(&ns->ns, &timens_operations, true); if (err) goto fail_free_page; ns->ucounts = ucounts; - ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; From 00ed42285c46e2c84223e3ec5e1d07638948b4d1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:37 +0200 Subject: [PATCH 2274/3206] user: use ns_common_init() Don't cargo-cult the same thing over and over. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/user_namespace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 682f40d5632d44..98f4fe84d039a4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -124,12 +124,11 @@ int create_user_ns(struct cred *new) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_alloc_inum(&ns->ns); + + ret = ns_common_init(&ns->ns, &userns_operations, true); if (ret) goto fail_free; - ns->ns.ops = &userns_operations; - refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; From 09337e064cbb692d5e0e34d7e1e2f2c53166f91a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:38 +0200 Subject: [PATCH 2275/3206] uts: use ns_common_init() Don't cargo-cult the same thing over and over. 
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/utsname.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f24e..02037010b3780f 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -27,16 +27,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts) dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); } -static struct uts_namespace *create_uts_ns(void) -{ - struct uts_namespace *uts_ns; - - uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); - if (uts_ns) - refcount_set(&uts_ns->ns.count, 1); - return uts_ns; -} - /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone @@ -55,17 +45,15 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = create_uts_ns(); + ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL); if (!ns) goto fail_dec; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(&ns->ns, &utsns_operations, true); if (err) goto fail_free; ns->ucounts = ucounts; - ns->ns.ops = &utsns_operations; - down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); From 86c5aba210b145d7de011a5abaf9b785aa70a183 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:39 +0200 Subject: [PATCH 2276/3206] ns: remove ns_alloc_inum() It's now unused. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index dbb119bda097f3..e50d312f9fee24 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,12 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -static inline int ns_alloc_inum(struct ns_common *ns) -{ - WRITE_ONCE(ns->stashed, NULL); - return proc_alloc_inum(&ns->inum); -} - static inline int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, bool alloc_inum) From 885fc8ac0a4dc70f5d87b80b0977292870e35c60 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:40 +0200 Subject: [PATCH 2277/3206] nstree: make iterator generic Move the namespace iteration infrastructure originally introduced for mount namespaces into a generic library usable by all namespace types. 
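From a consumer's point of view the library boils down to a handful of calls; a rough sketch of the lifecycle, with the uts type chosen arbitrarily (three fragments, not a single function):

    /* Creation: assign an id and insert into the per-type tree and list. */
    ns_tree_add(new_uts_ns);

    /* Lookup: resolve an id under RCU, taking a reference before use. */
    guard(rcu)();
    struct ns_common *ns = ns_tree_lookup_rcu(ns_id, CLONE_NEWUTS);
    if (ns && !refcount_inc_not_zero(&ns->count))
            ns = NULL;

    /* Teardown: unlink, then free only after an RCU grace period. */
    ns_tree_remove(old_uts_ns);
    kfree_rcu(old_uts_ns, ns.ns_rcu);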
Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 9 ++ include/linux/nstree.h | 91 +++++++++++++++ include/linux/proc_ns.h | 3 + kernel/Makefile | 2 +- kernel/nstree.c | 233 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 include/linux/nstree.h create mode 100644 kernel/nstree.c diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index bc2e0758e1c939..7224072cccc5e6 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -3,6 +3,7 @@ #define _LINUX_NS_COMMON_H #include +#include struct proc_ns_operations; @@ -20,6 +21,14 @@ struct ns_common { const struct proc_ns_operations *ops; unsigned int inum; refcount_t count; + union { + struct { + u64 ns_id; + struct rb_node ns_tree_node; + struct list_head ns_list_node; + }; + struct rcu_head ns_rcu; + }; }; #define to_ns_common(__ns) \ diff --git a/include/linux/nstree.h b/include/linux/nstree.h new file mode 100644 index 00000000000000..29ad6402260cc2 --- /dev/null +++ b/include/linux/nstree.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NSTREE_H +#define _LINUX_NSTREE_H + +#include +#include +#include +#include +#include +#include + +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + +extern struct ns_tree cgroup_ns_tree; +extern struct ns_tree ipc_ns_tree; +extern struct ns_tree mnt_ns_tree; +extern struct ns_tree net_ns_tree; +extern struct ns_tree pid_ns_tree; +extern struct ns_tree time_ns_tree; +extern struct ns_tree user_ns_tree; +extern struct ns_tree uts_ns_tree; + +#define to_ns_tree(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(cgroup_ns_tree), \ + struct ipc_namespace *: &(ipc_ns_tree), \ + struct net *: &(net_ns_tree), \ + struct pid_namespace *: &(pid_ns_tree), \ + struct mnt_namespace *: &(mnt_ns_tree), \ + struct time_namespace *: &(time_ns_tree), \ + struct user_namespace *: &(user_ns_tree), \ + struct uts_namespace *: &(uts_ns_tree)) + +u64 ns_tree_gen_id(struct ns_common *ns); +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, + bool previous); + +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) +{ + ns_tree_gen_id(ns); + __ns_tree_add_raw(ns, ns_tree); +} + +/** + * ns_tree_add_raw - Add a namespace to a namespace tree + * @ns: Namespace to add + * + * This function adds a namespace to the appropriate namespace tree + * without assigning an id. + */ +#define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_add - Add a namespace to a namespace tree + * @ns: Namespace to add + * + * This function assigns a new id to the namespace and adds it to the + * appropriate namespace tree and list. + */ +#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_remove - Remove a namespace from a namespace tree + * @ns: Namespace to remove + * + * This function removes a namespace from the appropriate namespace + * tree and list. 
+ */ +#define ns_tree_remove(__ns) __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns)) + +#define ns_tree_adjoined_rcu(__ns, __previous) \ + __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) + +#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) + +#endif /* _LINUX_NSTREE_H */ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index e50d312f9fee24..7f89f0829e60ba 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -79,6 +79,9 @@ static inline int ns_common_init(struct ns_common *ns, refcount_set(&ns->count, 1); ns->stashed = NULL; ns->ops = ops; + ns->ns_id = 0; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); return 0; } diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235f2..b807516a1b43aa 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ + kthread.o sys_ni.o nsproxy.o nstree.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o diff --git a/kernel/nstree.c b/kernel/nstree.c new file mode 100644 index 00000000000000..bbe8bedc924ca9 --- /dev/null +++ b/kernel/nstree.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +struct ns_tree mnt_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), + .type = CLONE_NEWNS, +}; + +struct ns_tree net_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), + .type = CLONE_NEWNET, +}; +EXPORT_SYMBOL_GPL(net_ns_tree); + +struct ns_tree uts_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), + .type = CLONE_NEWUTS, +}; + +struct ns_tree user_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), + .type = CLONE_NEWUSER, +}; + +struct ns_tree ipc_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), + .type = CLONE_NEWIPC, +}; + +struct ns_tree pid_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), + .type = CLONE_NEWPID, +}; + +struct ns_tree cgroup_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), + .type = CLONE_NEWCGROUP, +}; + +struct ns_tree time_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), + .type = CLONE_NEWTIME, +}; + +DEFINE_COOKIE(namespace_cookie); + +static inline struct ns_common *node_to_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_tree_node); +} + +static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct ns_common *ns_a = node_to_ns(a); + struct ns_common *ns_b = node_to_ns(b); + u64 ns_id_a = ns_a->ns_id; + u64 ns_id_b = ns_b->ns_id; + + if (ns_id_a < ns_id_b) + return -1; + if 
(ns_id_a > ns_id_b) + return 1; + return 0; +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +{ + struct rb_node *node, *prev; + + VFS_WARN_ON_ONCE(!ns->ns_id); + + write_seqlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + + node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); + /* + * If there's no previous entry simply add it after the + * head and if there is add it after the previous entry. + */ + prev = rb_prev(&ns->ns_tree_node); + if (!prev) + list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); + else + list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + + write_sequnlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(node); +} + +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +{ + VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); + VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + + write_seqlock(&ns_tree->ns_tree_lock); + rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); + list_bidir_del_rcu(&ns->ns_list_node); + RB_CLEAR_NODE(&ns->ns_tree_node); + write_sequnlock(&ns_tree->ns_tree_lock); +} +EXPORT_SYMBOL_GPL(__ns_tree_remove); + +static int ns_find(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns(node); + + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + + +static struct ns_tree *ns_tree_from_type(int ns_type) +{ + switch (ns_type) { + case CLONE_NEWCGROUP: + return &cgroup_ns_tree; + case CLONE_NEWIPC: + return &ipc_ns_tree; + case CLONE_NEWNS: + return &mnt_ns_tree; + case CLONE_NEWNET: + return &net_ns_tree; + case CLONE_NEWPID: + return &pid_ns_tree; + case CLONE_NEWUSER: + return &user_ns_tree; + case CLONE_NEWUTS: + return &uts_ns_tree; + case CLONE_NEWTIME: + return &time_ns_tree; + } + + return NULL; +} + +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree *ns_tree; + struct rb_node *node; + unsigned int seq; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + + do { + seq = read_seqbegin(&ns_tree->ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + if (node) + break; + } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + + if (!node) + return NULL; + + VFS_WARN_ON_ONCE(node_to_ns(node)->ops->type != ns_type); + + return node_to_ns(node); +} + +/** + * ns_tree_adjoined_rcu - find the next/previous namespace in the same + * tree + * @ns: namespace to start from + * @previous: if true find the previous namespace, otherwise the next + * + * Find the next or previous namespace in the same tree as @ns. If + * there is no next/previous namespace, -ENOENT is returned. 
+ */ +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, bool previous) +{ + struct list_head *list; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); + + if (previous) + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + else + list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); + if (list_is_head(list, &ns_tree->ns_list)) + return ERR_PTR(-ENOENT); + + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ops->type != ns_tree->type); + + return list_entry_rcu(list, struct ns_common, ns_list_node); +} + +/** + * ns_tree_gen_id - generate a new namespace id + * @ns: namespace to generate id for + * + * Generates a new namespace id and assigns it to the namespace. All + * namespace types share the same id space and thus can be compared + * directly. IOW, when the ids of two namespaces are equal, they are + * identical. + */ +u64 ns_tree_gen_id(struct ns_common *ns) +{ + guard(preempt)(); + ns->ns_id = gen_cookie_next(&namespace_cookie); + return ns->ns_id; +} From 7d7d164989586c0ad61934975c40ca795dc134c7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:41 +0200 Subject: [PATCH 2278/3206] mnt: support ns lookup Move the mount namespace to the generic ns lookup infrastructure. This allows us to drop a bunch of members from struct mnt_namespace. Signed-off-by: Christian Brauner --- fs/mount.h | 10 +--- fs/namespace.c | 139 +++++++++++-------------------------------------- fs/nsfs.c | 4 +- 3 files changed, 33 insertions(+), 120 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 97737051a8b9df..76bf863c9ae281 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -17,11 +17,7 @@ struct mnt_namespace { }; struct user_namespace *user_ns; struct ucounts *ucounts; - u64 seq; /* Sequence number to prevent loops */ - union { - wait_queue_head_t poll; - struct rcu_head mnt_ns_rcu; - }; + wait_queue_head_t poll; u64 seq_origin; /* Sequence number of origin mount namespace */ u64 event; #ifdef CONFIG_FSNOTIFY @@ -30,8 +26,6 @@ struct mnt_namespace { #endif unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; - struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ - struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; @@ -173,7 +167,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry) static inline bool is_anon_ns(struct mnt_namespace *ns) { - return ns->seq == 0; + return ns->ns.ns_id == 0; } static inline bool anon_ns_root(const struct mount *m) diff --git a/fs/namespace.c b/fs/namespace.c index 5b2c15f7170fae..a6899844969897 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "pnode.h" #include "internal.h" @@ -80,13 +81,10 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ -static DEFINE_SEQLOCK(mnt_ns_tree_lock); #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif -static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ -static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ enum mount_kattr_flags_t { MOUNT_KATTR_RECURSE = (1 << 0), @@ -119,53 +117,12 @@ 
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { + struct ns_common *ns; + if (!node) return NULL; - return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); -} - -static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) -{ - struct mnt_namespace *ns_a = node_to_mnt_ns(a); - struct mnt_namespace *ns_b = node_to_mnt_ns(b); - u64 seq_a = ns_a->seq; - u64 seq_b = ns_b->seq; - - if (seq_a < seq_b) - return -1; - if (seq_a > seq_b) - return 1; - return 0; -} - -static inline void mnt_ns_tree_write_lock(void) -{ - write_seqlock(&mnt_ns_tree_lock); -} - -static inline void mnt_ns_tree_write_unlock(void) -{ - write_sequnlock(&mnt_ns_tree_lock); -} - -static void mnt_ns_tree_add(struct mnt_namespace *ns) -{ - struct rb_node *node, *prev; - - mnt_ns_tree_write_lock(); - node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); - /* - * If there's no previous entry simply add it after the - * head and if there is add it after the previous entry. - */ - prev = rb_prev(&ns->mnt_ns_tree_node); - if (!prev) - list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); - else - list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); - mnt_ns_tree_write_unlock(); - - WARN_ON_ONCE(node); + ns = rb_entry(node, struct ns_common, ns_tree_node); + return container_of(ns, struct mnt_namespace, ns); } static void mnt_ns_release(struct mnt_namespace *ns) @@ -181,32 +138,16 @@ DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { - mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); + mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu)); } static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ - if (!RB_EMPTY_NODE(&ns->mnt_ns_tree_node)) { - mnt_ns_tree_write_lock(); - rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); - list_bidir_del_rcu(&ns->mnt_ns_list); - mnt_ns_tree_write_unlock(); - } - - call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); -} - -static int mnt_ns_find(const void *key, const struct rb_node *node) -{ - const u64 mnt_ns_id = *(u64 *)key; - const struct mnt_namespace *ns = node_to_mnt_ns(node); + if (ns_tree_active(ns)) + ns_tree_remove(ns); - if (mnt_ns_id < ns->seq) - return -1; - if (mnt_ns_id > ns->seq) - return 1; - return 0; + call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu); } /* @@ -225,28 +166,21 @@ static int mnt_ns_find(const void *key, const struct rb_node *node) */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { - struct mnt_namespace *ns; - struct rb_node *node; - unsigned int seq; + struct mnt_namespace *mnt_ns; + struct ns_common *ns; guard(rcu)(); - do { - seq = read_seqbegin(&mnt_ns_tree_lock); - node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); - if (node) - break; - } while (read_seqretry(&mnt_ns_tree_lock, seq)); - - if (!node) + ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS); + if (!ns) return NULL; /* * The last reference count is put with RCU delay so we can * unconditionally acquire a reference here. 
*/ - ns = node_to_mnt_ns(node); - refcount_inc(&ns->passive); - return ns; + mnt_ns = container_of(ns, struct mnt_namespace, ns); + refcount_inc(&mnt_ns->passive); + return mnt_ns; } static inline void lock_mount_hash(void) @@ -1017,7 +951,7 @@ static inline bool check_anonymous_mnt(struct mount *mnt) return false; seq = mnt->mnt_ns->seq_origin; - return !seq || (seq == current->nsproxy->mnt_ns->seq); + return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id); } /* @@ -2152,19 +2086,16 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { + struct ns_common *ns; + guard(rcu)(); for (;;) { - struct list_head *list; - - if (previous) - list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); - else - list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); - if (list_is_head(list, &mnt_ns_list)) - return ERR_PTR(-ENOENT); + ns = ns_tree_adjoined_rcu(mntns, previous); + if (IS_ERR(ns)) + return ERR_CAST(ns); - mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); + mntns = to_mnt_ns(ns); /* * The last passive reference count is put with RCU @@ -2204,7 +2135,7 @@ static bool mnt_ns_loop(struct dentry *dentry) if (!mnt_ns) return false; - return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; + return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id; } struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, @@ -3080,7 +3011,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive) if (is_anon_ns(src_mnt_ns)) ns->seq_origin = src_mnt_ns->seq_origin; else - ns->seq_origin = src_mnt_ns->seq; + ns->seq_origin = src_mnt_ns->ns.ns_id; } mnt = __do_loopback(path, recursive); @@ -4156,15 +4087,6 @@ static void free_mnt_ns(struct mnt_namespace *ns) mnt_ns_tree_remove(ns); } -/* - * Assign a sequence number so we can detect when we attempt to bind - * mount a reference to an older mount namespace into the current - * mount namespace, preventing reference counting loops. A 64bit - * number incrementing at 10Ghz will take 12,427 years to wrap which - * is effectively never, so we can ignore the possibility. 
- */ -static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); - static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; @@ -4188,11 +4110,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a return ERR_PTR(ret); } if (!anon) - new_ns->seq = atomic64_inc_return(&mnt_ns_seq); + ns_tree_gen_id(&new_ns->ns); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; - INIT_LIST_HEAD(&new_ns->mnt_ns_list); - RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; @@ -4278,7 +4198,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (pwdmnt) mntput(pwdmnt); - mnt_ns_tree_add(new_ns); + ns_tree_add_raw(new_ns); return new_ns; } @@ -5397,7 +5317,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; - s->sm.mnt_ns_id = ns->seq; + s->sm.mnt_ns_id = ns->ns.ns_id; } static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) @@ -6102,7 +6022,6 @@ static void __init init_mount_tree(void) ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); - ns->seq = atomic64_inc_return(&mnt_ns_seq); ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); ns->root = m; @@ -6117,7 +6036,7 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); - mnt_ns_tree_add(ns); + ns_tree_add(ns); } void __init mnt_init(void) diff --git a/fs/nsfs.c b/fs/nsfs.c index d016d36272d49e..80e631aeb3ce13 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -139,7 +139,7 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, * the size value will be set to the size the kernel knows about. */ kinfo->size = min(usize, sizeof(*kinfo)); - kinfo->mnt_ns_id = mnt_ns->seq; + kinfo->mnt_ns_id = mnt_ns->ns.ns_id; kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts); /* Subtract the root mount of the mount namespace. */ if (kinfo->nr_mounts) @@ -221,7 +221,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, mnt_ns = container_of(ns, struct mnt_namespace, ns); idp = (__u64 __user *)arg; - id = mnt_ns->seq; + id = mnt_ns->ns.ns_id; return put_user(id, idp); } case NS_GET_PID_FROM_PIDNS: From 7c60593985331e7839ec3fea6328a3253a325e82 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:42 +0200 Subject: [PATCH 2279/3206] cgroup: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. 
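The one subtlety in these conversions is teardown ordering: unlink the namespace before releasing its resources, and only free the memory after an RCU grace period since lockless walkers of the tree and list may still hold a pointer. Sketched with a placeholder foo type:

    static void free_foo_ns(struct foo_namespace *ns)
    {
            /* Unlink first so new id lookups can no longer find it. */
            ns_tree_remove(ns);
            ns_free_inum(&ns->ns);
            /* Defer the free: concurrent RCU walkers may still see the node. */
            kfree_rcu(ns, ns.ns_rcu);
    }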
Acked-by: Tejun Heo Signed-off-by: Christian Brauner --- kernel/cgroup/cgroup.c | 2 ++ kernel/cgroup/namespace.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb73..092e6bf081ed1f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -6312,6 +6313,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cpuset_fs_type)); #endif + ns_tree_add(&init_cgroup_ns); return 0; } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 0391b6ab0bf1be..fc12c416dfeb34 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -5,7 +5,7 @@ #include #include #include - +#include /* cgroup namespaces */ @@ -30,16 +30,19 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) ret = ns_common_init(&new_ns->ns, &cgroupns_operations, true); if (ret) return ERR_PTR(ret); + ns_tree_add(new_ns); return no_free_ptr(new_ns); } void free_cgroup_ns(struct cgroup_namespace *ns) { + ns_tree_remove(ns); put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); - kfree(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } EXPORT_SYMBOL(free_cgroup_ns); From 74b24a582e1ff3960f0454f57afc2bcdbc52562e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:43 +0200 Subject: [PATCH 2280/3206] ipc: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Signed-off-by: Christian Brauner --- ipc/msgutil.c | 1 + ipc/namespace.c | 3 +++ ipc/shm.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/ipc/msgutil.c b/ipc/msgutil.c index c7be0c79264767..bbf61275df41e1 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "util.h" diff --git a/ipc/namespace.c b/ipc/namespace.c index d4188a88ee57b3..9f923c1a1eb30c 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "util.h" @@ -85,6 +86,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, sem_init_ns(ns); shm_init_ns(ns); + ns_tree_add(ns); return ns; @@ -201,6 +203,7 @@ void put_ipc_ns(struct ipc_namespace *ns) mq_clear_sbinfo(ns); spin_unlock(&mq_lock); + ns_tree_remove(ns); if (llist_add(&ns->mnt_llist, &free_ipc_list)) schedule_work(&free_ipc_work); } diff --git a/ipc/shm.c b/ipc/shm.c index a9310b6dbbc369..3db36773dd1023 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -148,6 +149,7 @@ void shm_exit_ns(struct ipc_namespace *ns) static int __init ipc_ns_init(void) { shm_init_ns(&init_ipc_ns); + ns_tree_add(&init_ipc_ns); return 0; } From b36c823b9a4be5b0c8e38c3fd60cade7d41c216c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:44 +0200 Subject: [PATCH 2281/3206] net: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. The network namespace has a separate list with different lifetime rules which we can just leave intact. We have a similar concept for mount namespaces as well where it is on two different lists for different purposes. 
Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 5fb7bd8ac45afc..169ec22c4758f4 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -445,7 +446,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); int error = 0; - net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie); + net->net_cookie = ns_tree_gen_id(&net->ns); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -455,6 +456,7 @@ static __net_init int setup_net(struct net *net) down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); up_write(&net_rwsem); + ns_tree_add_raw(net); out: return error; @@ -674,8 +676,10 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. */ down_write(&net_rwsem); - llist_for_each_entry(net, net_kill_list, cleanup_list) + llist_for_each_entry(net, net_kill_list, cleanup_list) { + ns_tree_remove(net); list_del_rcu(&net->list); + } /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer * to a net from net_kill_list (see peernet2id_alloc()). From 488acdcec8e24377506934a95e0ba21619073e8f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:45 +0200 Subject: [PATCH 2282/3206] pid: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Signed-off-by: Christian Brauner --- kernel/pid_namespace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 20ce4052d1c513..228ae20299f9aa 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "pid_sysctl.h" @@ -122,6 +123,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif + ns_tree_add(ns); return ns; out_free_inum: @@ -147,6 +149,7 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { + ns_tree_remove(ns); unregister_pidns_sysctls(ns); ns_free_inum(&ns->ns); @@ -473,6 +476,7 @@ static __init int pid_namespaces_init(void) #endif register_pid_ns_sysctl_table_vm(); + ns_tree_add(&init_pid_ns); return 0; } From b36c823b9a4be5b0c8e38c3fd60cade7d41c216c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:46 +0200 Subject: [PATCH 2283/3206] time: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. 
Reviewed-by: Thomas Gleixner Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 5 +++++ init/main.c | 2 ++ kernel/time/namespace.c | 11 ++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc9497..7f6af7a9771e9a 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,7 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); @@ -108,6 +109,10 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #else +static inline void __init time_ns_init(void) +{ +} + static inline int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { diff --git a/init/main.c b/init/main.c index 0ee0ee7b7c2c0a..e7d2c57c65a7b1 100644 --- a/init/main.c +++ b/init/main.c @@ -103,6 +103,7 @@ #include #include #include +#include #include #include @@ -1072,6 +1073,7 @@ void start_kernel(void) fork_init(); proc_caches_init(); uts_ns_init(); + time_ns_init(); key_init(); security_init(); dbg_late_init(); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0be93d8f2896d6..408f60d0a3b6ca 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: @@ -250,11 +252,13 @@ static void timens_set_vvar_page(struct task_struct *task, void free_time_ns(struct time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); __free_page(ns->vvar_page); - kfree(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static struct time_namespace *to_time_ns(struct ns_common *ns) @@ -487,3 +491,8 @@ struct time_namespace init_time_ns = { .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} From 2f5243cbba6cff66cc8b43bdc14853282f5b1e67 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:47 +0200 Subject: [PATCH 2284/3206] user: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. 
Signed-off-by: Christian Brauner --- kernel/user_namespace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 98f4fe84d039a4..ade5b6806c5c31 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); @@ -158,6 +159,7 @@ int create_user_ns(struct cred *new) goto fail_keyring; set_cred_user_ns(new, ns); + ns_tree_add(ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS @@ -200,6 +202,7 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + ns_tree_remove(ns); if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); @@ -218,7 +221,8 @@ static void free_user_ns(struct work_struct *work) retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); - kmem_cache_free(user_ns_cachep, ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); @@ -1412,6 +1416,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); + ns_tree_add(&init_user_ns); return 0; } subsys_initcall(user_namespaces_init); From 58f976d41fd915acd2403a2b8e0eef8e9c478357 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:48 +0200 Subject: [PATCH 2285/3206] uts: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Signed-off-by: Christian Brauner --- kernel/utsname.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/utsname.c b/kernel/utsname.c index 02037010b3780f..64155417ae0c86 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -13,6 +13,7 @@ #include #include #include +#include #include static struct kmem_cache *uts_ns_cache __ro_after_init; @@ -58,6 +59,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); + ns_tree_add(ns); return ns; fail_free: @@ -93,10 +95,12 @@ struct uts_namespace *copy_utsname(unsigned long flags, void free_uts_ns(struct uts_namespace *ns) { + ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); - kmem_cache_free(uts_ns_cache, ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) @@ -162,4 +166,5 @@ void __init uts_ns_init(void) offsetof(struct uts_namespace, name), sizeof_field(struct uts_namespace, name), NULL); + ns_tree_add(&init_uts_ns); } From d7afdf889561058068ab46fd8f306c70ef29216a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:49 +0200 Subject: [PATCH 2286/3206] ns: add to_<type>_ns() to respective headers Every namespace type has a container_of(ns, <type>, ns) static inline function that is currently not exposed in the header. So we have a bunch of places that open-code it via container_of(). Move it to the headers so we can use it directly. 
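Each helper is the same one-line container_of() wrapper; sketched for a hypothetical foo namespace together with the kind of call site it cleans up (put_foo_ns() is likewise made up):

    static inline struct foo_namespace *to_foo_ns(struct ns_common *ns)
    {
            return container_of(ns, struct foo_namespace, ns);
    }

    static void foons_put(struct ns_common *ns)
    {
            /* Previously: container_of(ns, struct foo_namespace, ns) inline. */
            put_foo_ns(to_foo_ns(ns));
    }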
Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 5 +++++ include/linux/ipc_namespace.h | 5 +++++ include/linux/pid_namespace.h | 5 +++++ include/linux/time_namespace.h | 4 ++++ include/linux/user_namespace.h | 5 +++++ include/linux/utsname.h | 5 +++++ include/net/net_namespace.h | 5 +++++ ipc/namespace.c | 5 ----- kernel/cgroup/namespace.c | 5 ----- kernel/pid_namespace.c | 5 ----- kernel/time/namespace.c | 5 ----- kernel/user_namespace.c | 5 ----- kernel/utsname.c | 5 ----- net/core/net_namespace.c | 5 ----- 14 files changed, 34 insertions(+), 35 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e23..9ca25346f7cb5c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -794,6 +794,11 @@ extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611ad6..924e4754374fba 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,6 +129,11 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) +static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) +{ + return container_of(ns, struct ipc_namespace, ns); +} + extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a58111998d..ba0efc8c8596b3 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -54,6 +54,11 @@ extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 7f6af7a9771e9a..a47a4ce4183e1d 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,10 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +static inline struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a0bb6d01213780..a09056ad090e1a 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -168,6 +168,11 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412bfe..5d34c4f0f94549 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -30,6 +30,11 @@ struct uts_namespace { extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS +static 
inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + static inline void get_uts_ns(struct uts_namespace *ns) { refcount_inc(&ns->ns.count); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275f3..fd090ceb80bf48 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -262,6 +262,11 @@ void ipx_unregister_sysctl(void); #ifdef CONFIG_NET_NS void __put_net(struct net *net); +static inline struct net *to_net_ns(struct ns_common *ns) +{ + return container_of(ns, struct net, ns); +} + /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { diff --git a/ipc/namespace.c b/ipc/namespace.c index 9f923c1a1eb30c..89588819956b20 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -209,11 +209,6 @@ void put_ipc_ns(struct ipc_namespace *ns) } } -static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) -{ - return container_of(ns, struct ipc_namespace, ns); -} - static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index fc12c416dfeb34..5a327914b5654e 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -89,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 228ae20299f9aa..9b327420309e8b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -345,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 408f60d0a3b6ca..20b65f90549e20 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -261,11 +261,6 @@ void free_time_ns(struct time_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); -} - static struct ns_common *timens_get(struct task_struct *task) { struct time_namespace *ns = NULL; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ade5b6806c5c31..cfb0e28f2779c9 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1325,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns) } EXPORT_SYMBOL(current_in_userns); -static inline struct user_namespace *to_user_ns(struct ns_common *ns) -{ - return container_of(ns, struct user_namespace, ns); -} - static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; diff --git a/kernel/utsname.c b/kernel/utsname.c index 64155417ae0c86..a682830742d344 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -103,11 +103,6 @@ void free_uts_ns(struct uts_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); -} - static struct ns_common *utsns_get(struct task_struct *task) { 
struct uts_namespace *ns = NULL; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 169ec22c4758f4..a57b3cda8dbc31 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1541,11 +1541,6 @@ static struct ns_common *netns_get(struct task_struct *task) return net ? &net->ns : NULL; } -static inline struct net *to_net_ns(struct ns_common *ns) -{ - return container_of(ns, struct net, ns); -} - static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); From d2afdb73f8ad77b49eca9d110d0c54bf30d1df0f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:50 +0200 Subject: [PATCH 2287/3206] nsfs: add current_in_namespace() Add a helper to easily check whether a given namespace is the caller's current namespace. This is currently open-coded in a lot of places. Simply switch on the type and compare the results. Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/nsfs.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h index fb84aa538091ff..e5a5fa83d36bde 100644 --- a/include/linux/nsfs.h +++ b/include/linux/nsfs.h @@ -5,6 +5,8 @@ #define _LINUX_NSFS_H #include +#include +#include struct path; struct task_struct; @@ -22,5 +24,17 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops); void nsfs_init(void); -#endif /* _LINUX_NSFS_H */ +#define __current_namespace_from_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: current->nsproxy->cgroup_ns, \ + struct ipc_namespace *: current->nsproxy->ipc_ns, \ + struct net *: current->nsproxy->net_ns, \ + struct pid_namespace *: task_active_pid_ns(current), \ + struct mnt_namespace *: current->nsproxy->mnt_ns, \ + struct time_namespace *: current->nsproxy->time_ns, \ + struct user_namespace *: current_user_ns(), \ + struct uts_namespace *: current->nsproxy->uts_ns) + +#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) +#endif /* _LINUX_NSFS_H */ From 5222470b2fbb3740f931f189db33dd1367b1ae75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:51 +0200 Subject: [PATCH 2288/3206] nsfs: support file handles A while ago we added support for file handles to pidfs so pidfds can be encoded and decoded as file handles. Userspace has adopted this quickly and it's proven very useful. Implement file handles for namespaces as well. A process is not always able to open /proc/self/ns/<ns_type>. That requires procfs to be mounted and for /proc/self/ or /proc/self/ns/ to not be overmounted. However, userspace can always derive a namespace fd from a pidfd. And that always works for a task's own namespace. There's no need to introduce unnecessary behavioral differences between /proc/self/ns/<ns_type> fds, pidfd-derived namespace fds, and file-handle-derived namespace fds. So namespace file handles are always decodable if the caller is located in the namespace the file handle refers to. This also allows a task to e.g., store a set of file handles to its namespaces in a file on-disk so it can verify when it gets re-executed that they're still valid and so on. This is akin to the pidfd use-case. Or just plainly for namespace comparison reasons where a file handle to the task's own namespace can be easily compared against others. 
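On the userspace side the encode step is plain name_to_handle_at() on a namespace fd; a minimal sketch (error paths trimmed, with the AT_EMPTY_PATH usage assumed to match the pidfs precedent):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>

    /* Encode a namespace fd, e.g. from open("/proc/self/ns/uts", O_RDONLY). */
    static struct file_handle *encode_ns_handle(int nsfd)
    {
            struct file_handle *fh;
            int mount_id;

            fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
            if (!fh)
                    return NULL;
            fh->handle_bytes = MAX_HANDLE_SZ;
            if (name_to_handle_at(nsfd, "", fh, &mount_id, AT_EMPTY_PATH) < 0) {
                    free(fh);
                    return NULL;
            }
            return fh;
    }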
Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/nsfs.c | 158 ++++++++++++++++++++++++++++++++++++++ include/linux/exportfs.h | 6 ++ include/uapi/linux/nsfs.h | 9 +++ 3 files changed, 173 insertions(+) diff --git a/fs/nsfs.c b/fs/nsfs.c index 80e631aeb3ce13..926e2680414ed4 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,6 +13,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "mount.h" #include "internal.h" @@ -417,12 +423,164 @@ static const struct stashed_operations nsfs_stashed_ops = { .put_data = nsfs_put_data, }; +#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) +#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) + +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct ns_common *ns = inode->i_private; + int len = *max_len; + + if (parent) + return FILEID_INVALID; + + if (len < NSFS_FID_SIZE_U32_VER0) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + return FILEID_INVALID; + } else if (len > NSFS_FID_SIZE_U32_LATEST) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + } + + fid->ns_id = ns->ns_id; + fid->ns_type = ns->ops->type; + fid->ns_inum = inode->i_ino; + return FILEID_NSFS; +} + +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct path path __free(path_put) = {}; + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct user_namespace *owning_ns = NULL; + struct ns_common *ns; + int ret; + + if (fh_len < NSFS_FID_SIZE_U32_VER0) + return NULL; + + /* Check that any trailing bytes are zero. */ + if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && + memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, + fh_len - NSFS_FID_SIZE_U32_LATEST)) + return NULL; + + switch (fh_type) { + case FILEID_NSFS: + break; + default: + return NULL; + } + + scoped_guard(rcu) { + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); + if (!ns) + return NULL; + + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); + VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (!refcount_inc_not_zero(&ns->count)) + return NULL; + } + + switch (ns->ops->type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + if (!current_in_namespace(to_cg_ns(ns))) + owning_ns = to_cg_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + if (!current_in_namespace(to_ipc_ns(ns))) + owning_ns = to_ipc_ns(ns)->user_ns; + break; +#endif + case CLONE_NEWNS: + if (!current_in_namespace(to_mnt_ns(ns))) + owning_ns = to_mnt_ns(ns)->user_ns; + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + if (!current_in_namespace(to_net_ns(ns))) + owning_ns = to_net_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + if (!current_in_namespace(to_pid_ns(ns))) { + owning_ns = to_pid_ns(ns)->user_ns; + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + if (!current_in_namespace(to_time_ns(ns))) + owning_ns = to_time_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + if (!current_in_namespace(to_user_ns(ns))) + owning_ns = to_user_ns(ns); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + if (!current_in_namespace(to_uts_ns(ns))) + owning_ns = to_uts_ns(ns)->user_ns; + break; +#endif + default: + return 
ERR_PTR(-EOPNOTSUPP); + } + + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + + /* path_from_stashed() unconditionally consumes the reference. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ERR_PTR(ret); + + return no_free_ptr(path.dentry); +} + +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + /* nsfs_fh_to_dentry() performs all permission checks. */ + return 0; +} + +static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +{ + return file_open_root(path, "", oflags, 0); +} + +static const struct export_operations nsfs_export_operations = { + .encode_fh = nsfs_encode_fh, + .fh_to_dentry = nsfs_fh_to_dentry, + .open = nsfs_export_open, + .permission = nsfs_export_permission, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &nsfs_ops; + ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cfb0dd1ea49c70..3aac58a520c70a 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -122,6 +122,12 @@ enum fid_type { FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, FILEID_BCACHEFS_WITH_PARENT = 0xb2, + /* + * + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. + */ + FILEID_NSFS = 0xf1, + /* * 64 bit unique kernfs id */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 97d8d80d139fc4..fa86fe3c8bd3b4 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -53,4 +53,13 @@ enum init_ns_ino { MNT_NS_INIT_INO = 0xEFFFFFF8U, }; +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; +}; + +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + #endif /* __LINUX_NSFS_H */ From e83f0b5d10dcf62833008327cb661c7d118bca85 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:52 +0200 Subject: [PATCH 2289/3206] nsfs: support exhaustive file handles Pidfd file handles are exhaustive meaning they don't require a handle on another pidfd to pass to open_by_handle_at() so it can derive the filesystem to decode in. Instead it can be derived from the file handle itself. The same is possible for namespace file handles. 
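Continuing the userspace sketch from the previous patch, the decode side can then pass the new sentinel instead of a real mount fd (a sketch; the permission checks described above still apply):

    /* Reopen a namespace purely from a stored handle. */
    static int open_ns_from_handle(struct file_handle *fh)
    {
            return open_by_handle_at(FD_NSFS_ROOT, fh, O_RDONLY);
    }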
Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/fhandle.c | 6 ++++++ fs/internal.h | 1 + fs/nsfs.c | 10 ++++++++++ include/uapi/linux/fcntl.h | 1 + 4 files changed, 18 insertions(+) diff --git a/fs/fhandle.c b/fs/fhandle.c index 68a7d2861c58fe..dd734d8828d04a 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root) return 0; } + if (fd == FD_NSFS_ROOT) { + nsfs_get_root(root); + return 0; + } + return -EBADF; } diff --git a/fs/internal.h b/fs/internal.h index 38e8aab27bbda3..a33d18ee5b74d2 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); void pidfs_get_root(struct path *path); +void nsfs_get_root(struct path *path); diff --git a/fs/nsfs.c b/fs/nsfs.c index 926e2680414ed4..22765fcab18e6d 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -25,6 +25,14 @@ static struct vfsmount *nsfs_mnt; +static struct path nsfs_root_path = {}; + +void nsfs_get_root(struct path *path) +{ + *path = nsfs_root_path; + path_get(path); +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); static const struct file_operations ns_file_operations = { @@ -598,4 +606,6 @@ void __init nsfs_init(void) if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; + nsfs_root_path.mnt = nsfs_mnt; + nsfs_root_path.dentry = nsfs_mnt->mnt_root; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index f291ab4f94ebcc..3741ea1b73d850 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -111,6 +111,7 @@ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ #define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ From f861225b9ee9cb2da1c7b2f5f921856cb8ca86bb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:53 +0200 Subject: [PATCH 2290/3206] nsfs: add missing id retrieval support The mount namespace has supported id retrieval for a while already. Add support for the other types as well. 
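As a rough illustration (not part of the patch), retrieving the identifier then looks the same for every namespace type; a minimal sketch assuming the NS_GET_ID ioctl added below is visible via <linux/nsfs.h>:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/nsfs.h>

	int main(void)
	{
		__u64 ns_id;
		int fd = open("/proc/self/ns/pid", O_RDONLY);

		/* NS_GET_ID returns the 64-bit id of any namespace type. */
		if (fd < 0 || ioctl(fd, NS_GET_ID, &ns_id) < 0)
			return 1;

		printf("pid namespace id: %llu\n", (unsigned long long)ns_id);
		return 0;
	}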
Signed-off-by: Christian Brauner --- fs/nsfs.c | 25 +++++++++++++------------ include/uapi/linux/nsfs.h | 6 ++++-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 22765fcab18e6d..8484bc4dd3debc 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -177,6 +177,7 @@ static bool nsfs_ioctl_valid(unsigned int cmd) case NS_GET_TGID_FROM_PIDNS: case NS_GET_PID_IN_PIDNS: case NS_GET_TGID_IN_PIDNS: + case NS_GET_ID: return true; } @@ -226,18 +227,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, argp = (uid_t __user *) arg; uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); - case NS_GET_MNTNS_ID: { - __u64 __user *idp; - __u64 id; - - if (ns->ops->type != CLONE_NEWNS) - return -EINVAL; - - mnt_ns = container_of(ns, struct mnt_namespace, ns); - idp = (__u64 __user *)arg; - id = mnt_ns->ns.ns_id; - return put_user(id, idp); - } case NS_GET_PID_FROM_PIDNS: fallthrough; case NS_GET_TGID_FROM_PIDNS: @@ -283,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, ret = -ESRCH; return ret; } + case NS_GET_MNTNS_ID: + if (ns->ops->type != CLONE_NEWNS) + return -EINVAL; + fallthrough; + case NS_GET_ID: { + __u64 __user *idp; + __u64 id; + + idp = (__u64 __user *)arg; + id = ns->ns_id; + return put_user(id, idp); + } } /* extensible ioctls */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index fa86fe3c8bd3b4..5d5bf22464c92e 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -16,8 +16,6 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) -/* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ @@ -42,6 +40,10 @@ struct mnt_ns_info { /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +/* Retrieve namespace identifiers. */ +#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) +#define NS_GET_ID _IOR(NSIO, 13, __u64) + enum init_ns_ino { IPC_NS_INIT_INO = 0xEFFFFFFFU, UTS_NS_INIT_INO = 0xEFFFFFFEU, From 87a1716c7d6551e59ca4bd9cdcdf04c67deaa337 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:54 +0200 Subject: [PATCH 2291/3206] tools: update nsfs.h uapi header Update the nsfs.h tools header to the uapi/nsfs.h header so we can rely on it in the selftests. Signed-off-by: Christian Brauner --- tools/include/uapi/linux/nsfs.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h index 34127653fd0070..33c9b578b3b243 100644 --- a/tools/include/uapi/linux/nsfs.h +++ b/tools/include/uapi/linux/nsfs.h @@ -16,8 +16,6 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) -/* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ @@ -42,4 +40,19 @@ struct mnt_ns_info { /* Get previous namespace. 
*/ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +/* Retrieve namespace identifiers. */ +#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) +#define NS_GET_ID _IOR(NSIO, 13, __u64) + +enum init_ns_ino { + IPC_NS_INIT_INO = 0xEFFFFFFFU, + UTS_NS_INIT_INO = 0xEFFFFFFEU, + USER_NS_INIT_INO = 0xEFFFFFFDU, + PID_NS_INIT_INO = 0xEFFFFFFCU, + CGROUP_NS_INIT_INO = 0xEFFFFFFBU, + TIME_NS_INIT_INO = 0xEFFFFFFAU, + NET_NS_INIT_INO = 0xEFFFFFF9U, + MNT_NS_INIT_INO = 0xEFFFFFF8U, +}; + #endif /* __LINUX_NSFS_H */ From 14f98438f0ed7b121a847260023de64ae128887e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:55 +0200 Subject: [PATCH 2292/3206] selftests/namespaces: add identifier selftests Add a bunch of selftests for the identifier retrieval ioctls. Signed-off-by: Christian Brauner --- tools/testing/selftests/namespaces/.gitignore | 1 + tools/testing/selftests/namespaces/Makefile | 7 + tools/testing/selftests/namespaces/config | 7 + .../testing/selftests/namespaces/nsid_test.c | 986 ++++++++++++++++++ 4 files changed, 1001 insertions(+) create mode 100644 tools/testing/selftests/namespaces/.gitignore create mode 100644 tools/testing/selftests/namespaces/Makefile create mode 100644 tools/testing/selftests/namespaces/config create mode 100644 tools/testing/selftests/namespaces/nsid_test.c diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore new file mode 100644 index 00000000000000..c1e8d634dd21b1 --- /dev/null +++ b/tools/testing/selftests/namespaces/.gitignore @@ -0,0 +1 @@ +nsid_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile new file mode 100644 index 00000000000000..9280c703533e30 --- /dev/null +++ b/tools/testing/selftests/namespaces/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) + +TEST_GEN_PROGS := nsid_test + +include ../lib.mk + diff --git a/tools/testing/selftests/namespaces/config b/tools/testing/selftests/namespaces/config new file mode 100644 index 00000000000000..d09836260262f4 --- /dev/null +++ b/tools/testing/selftests/namespaces/config @@ -0,0 +1,7 @@ +CONFIG_UTS_NS=y +CONFIG_TIME_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CGROUPS=y diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c new file mode 100644 index 00000000000000..e28accd74a57e0 --- /dev/null +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -0,0 +1,986 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest_harness.h" + +TEST(nsid_mntns_basic) +{ + __u64 mnt_ns_id = 0; + int fd_mntns; + int ret; + + /* Open the current mount namespace */ + fd_mntns = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(fd_mntns, 0); + + /* Get the mount namespace ID */ + ret = ioctl(fd_mntns, NS_GET_MNTNS_ID, &mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(mnt_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 mnt_ns_id2 = 0; + ret = ioctl(fd_mntns, NS_GET_ID, &mnt_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mnt_ns_id, mnt_ns_id2); + + close(fd_mntns); +} + +TEST(nsid_mntns_separate) +{ + __u64 parent_mnt_ns_id = 0; + __u64 child_mnt_ns_id = 0; + int fd_parent_mntns, fd_child_mntns; + int ret; + pid_t pid; + int pipefd[2]; + 
+ /* Get parent's mount namespace ID */ + fd_parent_mntns = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(fd_parent_mntns, 0); + ret = ioctl(fd_parent_mntns, NS_GET_ID, &parent_mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_mnt_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new mount namespace */ + ret = unshare(CLONE_NEWNS); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_mntns); + SKIP(return, "No permission to create mount namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's mount namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid); + fd_child_mntns = open(path, O_RDONLY); + ASSERT_GE(fd_child_mntns, 0); + + /* Get child's mount namespace ID */ + ret = ioctl(fd_child_mntns, NS_GET_ID, &child_mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_mnt_ns_id, 0); + + /* Parent and child should have different mount namespace IDs */ + ASSERT_NE(parent_mnt_ns_id, child_mnt_ns_id); + + close(fd_parent_mntns); + close(fd_child_mntns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_cgroupns_basic) +{ + __u64 cgroup_ns_id = 0; + int fd_cgroupns; + int ret; + + /* Open the current cgroup namespace */ + fd_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); + ASSERT_GE(fd_cgroupns, 0); + + /* Get the cgroup namespace ID */ + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(cgroup_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 cgroup_ns_id2 = 0; + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(cgroup_ns_id, cgroup_ns_id2); + + close(fd_cgroupns); +} + +TEST(nsid_cgroupns_separate) +{ + __u64 parent_cgroup_ns_id = 0; + __u64 child_cgroup_ns_id = 0; + int fd_parent_cgroupns, fd_child_cgroupns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's cgroup namespace ID */ + fd_parent_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); + ASSERT_GE(fd_parent_cgroupns, 0); + ret = ioctl(fd_parent_cgroupns, NS_GET_ID, &parent_cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_cgroup_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new cgroup namespace */ + ret = unshare(CLONE_NEWCGROUP); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, 
SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_cgroupns); + SKIP(return, "No permission to create cgroup namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's cgroup namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/cgroup", pid); + fd_child_cgroupns = open(path, O_RDONLY); + ASSERT_GE(fd_child_cgroupns, 0); + + /* Get child's cgroup namespace ID */ + ret = ioctl(fd_child_cgroupns, NS_GET_ID, &child_cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_cgroup_ns_id, 0); + + /* Parent and child should have different cgroup namespace IDs */ + ASSERT_NE(parent_cgroup_ns_id, child_cgroup_ns_id); + + close(fd_parent_cgroupns); + close(fd_child_cgroupns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_ipcns_basic) +{ + __u64 ipc_ns_id = 0; + int fd_ipcns; + int ret; + + /* Open the current IPC namespace */ + fd_ipcns = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(fd_ipcns, 0); + + /* Get the IPC namespace ID */ + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(ipc_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 ipc_ns_id2 = 0; + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(ipc_ns_id, ipc_ns_id2); + + close(fd_ipcns); +} + +TEST(nsid_ipcns_separate) +{ + __u64 parent_ipc_ns_id = 0; + __u64 child_ipc_ns_id = 0; + int fd_parent_ipcns, fd_child_ipcns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's IPC namespace ID */ + fd_parent_ipcns = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(fd_parent_ipcns, 0); + ret = ioctl(fd_parent_ipcns, NS_GET_ID, &parent_ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_ipc_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new IPC namespace */ + ret = unshare(CLONE_NEWIPC); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_ipcns); + SKIP(return, "No permission to create IPC namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's IPC namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/ipc", pid); + fd_child_ipcns = open(path, O_RDONLY); + ASSERT_GE(fd_child_ipcns, 0); + + /* Get child's IPC namespace ID */ + ret = ioctl(fd_child_ipcns, NS_GET_ID, &child_ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_ipc_ns_id, 0); + + /* Parent and child should have different IPC namespace IDs */ + ASSERT_NE(parent_ipc_ns_id, child_ipc_ns_id); + + close(fd_parent_ipcns); + close(fd_child_ipcns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_utsns_basic) +{ + __u64 uts_ns_id = 0; + int fd_utsns; + int ret; + + /* Open the current UTS namespace */ + fd_utsns = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(fd_utsns, 0); + + /* Get the UTS namespace ID */ + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(uts_ns_id, 0); + + 
/* Verify we can get the same ID again */ + __u64 uts_ns_id2 = 0; + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(uts_ns_id, uts_ns_id2); + + close(fd_utsns); +} + +TEST(nsid_utsns_separate) +{ + __u64 parent_uts_ns_id = 0; + __u64 child_uts_ns_id = 0; + int fd_parent_utsns, fd_child_utsns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's UTS namespace ID */ + fd_parent_utsns = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(fd_parent_utsns, 0); + ret = ioctl(fd_parent_utsns, NS_GET_ID, &parent_uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_uts_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new UTS namespace */ + ret = unshare(CLONE_NEWUTS); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_utsns); + SKIP(return, "No permission to create UTS namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's UTS namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid); + fd_child_utsns = open(path, O_RDONLY); + ASSERT_GE(fd_child_utsns, 0); + + /* Get child's UTS namespace ID */ + ret = ioctl(fd_child_utsns, NS_GET_ID, &child_uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_uts_ns_id, 0); + + /* Parent and child should have different UTS namespace IDs */ + ASSERT_NE(parent_uts_ns_id, child_uts_ns_id); + + close(fd_parent_utsns); + close(fd_child_utsns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_userns_basic) +{ + __u64 user_ns_id = 0; + int fd_userns; + int ret; + + /* Open the current user namespace */ + fd_userns = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(fd_userns, 0); + + /* Get the user namespace ID */ + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(user_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 user_ns_id2 = 0; + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(user_ns_id, user_ns_id2); + + close(fd_userns); +} + +TEST(nsid_userns_separate) +{ + __u64 parent_user_ns_id = 0; + __u64 child_user_ns_id = 0; + int fd_parent_userns, fd_child_userns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's user namespace ID */ + fd_parent_userns = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(fd_parent_userns, 0); + ret = ioctl(fd_parent_userns, NS_GET_ID, &parent_user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_user_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new user namespace */ + ret = unshare(CLONE_NEWUSER); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + 
write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_userns); + SKIP(return, "No permission to create user namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's user namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/user", pid); + fd_child_userns = open(path, O_RDONLY); + ASSERT_GE(fd_child_userns, 0); + + /* Get child's user namespace ID */ + ret = ioctl(fd_child_userns, NS_GET_ID, &child_user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_user_ns_id, 0); + + /* Parent and child should have different user namespace IDs */ + ASSERT_NE(parent_user_ns_id, child_user_ns_id); + + close(fd_parent_userns); + close(fd_child_userns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_timens_basic) +{ + __u64 time_ns_id = 0; + int fd_timens; + int ret; + + /* Open the current time namespace */ + fd_timens = open("/proc/self/ns/time", O_RDONLY); + if (fd_timens < 0) { + SKIP(return, "Time namespaces not supported"); + } + + /* Get the time namespace ID */ + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(time_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 time_ns_id2 = 0; + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(time_ns_id, time_ns_id2); + + close(fd_timens); +} + +TEST(nsid_timens_separate) +{ + __u64 parent_time_ns_id = 0; + __u64 child_time_ns_id = 0; + int fd_parent_timens, fd_child_timens; + int ret; + pid_t pid; + int pipefd[2]; + + /* Open the current time namespace */ + fd_parent_timens = open("/proc/self/ns/time", O_RDONLY); + if (fd_parent_timens < 0) { + SKIP(return, "Time namespaces not supported"); + } + + /* Get parent's time namespace ID */ + ret = ioctl(fd_parent_timens, NS_GET_ID, &parent_time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_time_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new time namespace */ + ret = unshare(CLONE_NEWTIME); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES || errno == EINVAL) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Fork a grandchild to actually enter the new namespace */ + pid_t grandchild = fork(); + if (grandchild == 0) { + /* Grandchild is in the new namespace */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + pause(); + _exit(0); + } else if (grandchild > 0) { + /* Child writes grandchild PID and waits */ + write(pipefd[1], "Y", 1); + write(pipefd[1], &grandchild, sizeof(grandchild)); + close(pipefd[1]); + pause(); /* Keep the parent alive to maintain the grandchild */ + _exit(0); + } else { + _exit(1); + } + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_timens); + close(pipefd[0]); + SKIP(return, "Cannot create time namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + pid_t grandchild_pid; + ASSERT_EQ(read(pipefd[0], 
&grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + close(pipefd[0]); + + /* Open grandchild's time namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/time", grandchild_pid); + fd_child_timens = open(path, O_RDONLY); + ASSERT_GE(fd_child_timens, 0); + + /* Get child's time namespace ID */ + ret = ioctl(fd_child_timens, NS_GET_ID, &child_time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_time_ns_id, 0); + + /* Parent and child should have different time namespace IDs */ + ASSERT_NE(parent_time_ns_id, child_time_ns_id); + + close(fd_parent_timens); + close(fd_child_timens); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_pidns_basic) +{ + __u64 pid_ns_id = 0; + int fd_pidns; + int ret; + + /* Open the current PID namespace */ + fd_pidns = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(fd_pidns, 0); + + /* Get the PID namespace ID */ + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(pid_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 pid_ns_id2 = 0; + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(pid_ns_id, pid_ns_id2); + + close(fd_pidns); +} + +TEST(nsid_pidns_separate) +{ + __u64 parent_pid_ns_id = 0; + __u64 child_pid_ns_id = 0; + int fd_parent_pidns, fd_child_pidns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's PID namespace ID */ + fd_parent_pidns = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(fd_parent_pidns, 0); + ret = ioctl(fd_parent_pidns, NS_GET_ID, &parent_pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_pid_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new PID namespace */ + ret = unshare(CLONE_NEWPID); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Fork a grandchild to actually enter the new namespace */ + pid_t grandchild = fork(); + if (grandchild == 0) { + /* Grandchild is in the new namespace */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + pause(); + _exit(0); + } else if (grandchild > 0) { + /* Child writes grandchild PID and waits */ + write(pipefd[1], "Y", 1); + write(pipefd[1], &grandchild, sizeof(grandchild)); + close(pipefd[1]); + pause(); /* Keep the parent alive to maintain the grandchild */ + _exit(0); + } else { + _exit(1); + } + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_pidns); + close(pipefd[0]); + SKIP(return, "No permission to create PID namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + pid_t grandchild_pid; + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + close(pipefd[0]); + + /* Open grandchild's PID namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/pid", grandchild_pid); + fd_child_pidns = open(path, O_RDONLY); + ASSERT_GE(fd_child_pidns, 0); + + /* Get child's PID namespace ID */ + ret = ioctl(fd_child_pidns, NS_GET_ID, &child_pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_pid_ns_id, 0); + + /* Parent and child should have different PID namespace IDs */ + ASSERT_NE(parent_pid_ns_id, 
child_pid_ns_id); + + close(fd_parent_pidns); + close(fd_child_pidns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_netns_basic) +{ + __u64 net_ns_id = 0; + __u64 netns_cookie = 0; + int fd_netns; + int sock; + socklen_t optlen; + int ret; + + /* Open the current network namespace */ + fd_netns = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(fd_netns, 0); + + /* Get the network namespace ID via ioctl */ + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(net_ns_id, 0); + + /* Create a socket to get the SO_NETNS_COOKIE */ + sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(sock, 0); + + /* Get the network namespace cookie via socket option */ + optlen = sizeof(netns_cookie); + ret = getsockopt(sock, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + ASSERT_EQ(optlen, sizeof(netns_cookie)); + + /* The namespace ID and cookie should be identical */ + ASSERT_EQ(net_ns_id, netns_cookie); + + /* Verify we can get the same ID again */ + __u64 net_ns_id2 = 0; + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(net_ns_id, net_ns_id2); + + close(sock); + close(fd_netns); +} + +TEST(nsid_netns_separate) +{ + __u64 parent_net_ns_id = 0; + __u64 parent_netns_cookie = 0; + __u64 child_net_ns_id = 0; + __u64 child_netns_cookie = 0; + int fd_parent_netns, fd_child_netns; + int parent_sock, child_sock; + socklen_t optlen; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's network namespace ID */ + fd_parent_netns = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(fd_parent_netns, 0); + ret = ioctl(fd_parent_netns, NS_GET_ID, &parent_net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_net_ns_id, 0); + + /* Get parent's network namespace cookie */ + parent_sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(parent_sock, 0); + optlen = sizeof(parent_netns_cookie); + ret = getsockopt(parent_sock, SOL_SOCKET, SO_NETNS_COOKIE, &parent_netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + + /* Verify parent's ID and cookie match */ + ASSERT_EQ(parent_net_ns_id, parent_netns_cookie); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_netns); + close(parent_sock); + SKIP(return, "No permission to create network namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's network namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/net", pid); + fd_child_netns = open(path, O_RDONLY); + ASSERT_GE(fd_child_netns, 0); + + /* Get child's network namespace ID */ + ret = ioctl(fd_child_netns, NS_GET_ID, &child_net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_net_ns_id, 0); + + /* Create socket in child's namespace to get cookie */ + ret = setns(fd_child_netns, CLONE_NEWNET); + if (ret == 0) { + 
child_sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(child_sock, 0); + + optlen = sizeof(child_netns_cookie); + ret = getsockopt(child_sock, SOL_SOCKET, SO_NETNS_COOKIE, &child_netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + + /* Verify child's ID and cookie match */ + ASSERT_EQ(child_net_ns_id, child_netns_cookie); + + close(child_sock); + + /* Return to parent namespace */ + setns(fd_parent_netns, CLONE_NEWNET); + } + + /* Parent and child should have different network namespace IDs */ + ASSERT_NE(parent_net_ns_id, child_net_ns_id); + if (child_netns_cookie != 0) { + ASSERT_NE(parent_netns_cookie, child_netns_cookie); + } + + close(fd_parent_netns); + close(fd_child_netns); + close(parent_sock); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST_HARNESS_MAIN From 28ef38a9a2c7aa4dd8dfbb1d2b12f9ea84044327 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:56 +0200 Subject: [PATCH 2293/3206] selftests/namespaces: add file handle selftests Add a bunch of selftests for namespace file handles. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- tools/testing/selftests/namespaces/.gitignore | 1 + tools/testing/selftests/namespaces/Makefile | 2 +- .../selftests/namespaces/file_handle_test.c | 1429 +++++++++++++++++ 3 files changed, 1431 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/namespaces/file_handle_test.c diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index c1e8d634dd21b1..7639dbf58bbf87 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -1 +1,2 @@ nsid_test +file_handle_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index 9280c703533e30..f6c117ce2c2b80 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -TEST_GEN_PROGS := nsid_test +TEST_GEN_PROGS := nsid_test file_handle_test include ../lib.mk diff --git a/tools/testing/selftests/namespaces/file_handle_test.c b/tools/testing/selftests/namespaces/file_handle_test.c new file mode 100644 index 00000000000000..f1bc5773f55216 --- /dev/null +++ b/tools/testing/selftests/namespaces/file_handle_test.c @@ -0,0 +1,1429 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest_harness.h" + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ +#endif + +TEST(nsfs_net_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open a namespace file descriptor */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + 
} + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT as unprivileged user */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + if (fd < 0 && errno == EPERM) { + SKIP(free(handle); close(ns_fd); + return, + "Permission denied for unprivileged user (expected)"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_uts_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open UTS namespace file descriptor */ + ns_fd = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_ipc_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open IPC namespace file descriptor */ + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_pid_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int 
ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open PID namespace file descriptor */ + ns_fd = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_mnt_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open mount namespace file descriptor */ + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_user_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open user namespace file descriptor */ + ns_fd = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && 
(errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_cgroup_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open cgroup namespace file descriptor */ + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); return, "cgroup namespace not available"); + } + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_time_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open time namespace file descriptor */ + ns_fd = open("/proc/self/ns/time", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); return, "time namespace not available"); + } + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_user_net_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for 
communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current network namespace */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create network namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's network namespace handle from new user+net namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new network namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_uts_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current UTS namespace */ + ns_fd = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id,
AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new UTS namespace */ + ret = unshare(CLONE_NEWUTS); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create UTS namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's UTS namespace handle from new user+uts namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new UTS namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_ipc_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current IPC namespace */ + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process 
*/ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new IPC namespace */ + ret = unshare(CLONE_NEWIPC); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create IPC namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's IPC namespace handle from new user+ipc namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new IPC namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_mnt_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current mount namespace */ + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some 
capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new mount namespace */ + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create mount namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's mount namespace handle from new user+mnt namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new mount namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_cgroup_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current cgroup namespace */ + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); + return, "cgroup namespace not available"); + } + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = 
open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new cgroup namespace */ + ret = unshare(CLONE_NEWCGROUP); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create cgroup namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's cgroup namespace handle from new user+cgroup namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new cgroup namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_pid_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current PID namespace */ + ns_fd = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); 
+ close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new PID namespace - requires fork to take effect */ + ret = unshare(CLONE_NEWPID); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create PID namespace */ + close(pipefd[1]); + exit(0); + } + + /* Fork again for PID namespace to take effect */ + pid_t child_pid = fork(); + if (child_pid < 0) { + write(pipefd[1], "N", + 1); /* Unable to fork in PID namespace */ + close(pipefd[1]); + exit(0); + } + + if (child_pid == 0) { + /* Grandchild in new PID namespace */ + /* Try to open parent's PID namespace handle from new user+pid namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", + 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child_pid, NULL, 0); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new PID namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_time_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current time namespace */ + ns_fd = open("/proc/self/ns/time", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); + return, "time namespace not available"); + } + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || 
setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new time namespace - requires fork to take effect */ + ret = unshare(CLONE_NEWTIME); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create time namespace */ + close(pipefd[1]); + exit(0); + } + + /* Fork again for time namespace to take effect */ + pid_t child_pid = fork(); + if (child_pid < 0) { + write(pipefd[1], "N", + 1); /* Unable to fork in time namespace */ + close(pipefd[1]); + exit(0); + } + + if (child_pid == 0) { + /* Grandchild in new time namespace */ + /* Try to open parent's time namespace handle from new user+time namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", + 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child_pid, NULL, 0); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new time namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_open_flags) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open a namespace file descriptor */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Test invalid flags that should fail */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_WRONLY); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDWR); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TRUNC); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECT); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EINVAL); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TMPFILE); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EINVAL); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, 
O_DIRECTORY); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, ENOTDIR); + + close(ns_fd); + free(handle); +} + +TEST_HARNESS_MAIN From 93f67a7ddadf6ed8997c000df9790e5d64617196 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:00 +0200 Subject: [PATCH 2294/3206] uts: split namespace into separate header We have dedicated headers for all namespace types. Add one for the uts namespace as well. Now it's consistent for all namespace types. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/uts_namespace.h | 65 +++++++++++++++++++++++++++++++++++ include/linux/utsname.h | 58 +------------------------------ 2 files changed, 66 insertions(+), 57 deletions(-) create mode 100644 include/linux/uts_namespace.h diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h new file mode 100644 index 00000000000000..c2b619bb4e573a --- /dev/null +++ b/include/linux/uts_namespace.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_UTS_NAMESPACE_H +#define _LINUX_UTS_NAMESPACE_H + +#include +#include + +struct user_namespace; +extern struct user_namespace init_user_ns; + +struct uts_namespace { + struct new_utsname name; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct ns_common ns; +} __randomize_layout; + +extern struct uts_namespace init_uts_ns; + +#ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + +static inline void get_uts_ns(struct uts_namespace *ns) +{ + refcount_inc(&ns->ns.count); +} + +extern struct uts_namespace *copy_utsname(unsigned long flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns); +extern void free_uts_ns(struct uts_namespace *ns); + +static inline void put_uts_ns(struct uts_namespace *ns) +{ + if (refcount_dec_and_test(&ns->ns.count)) + free_uts_ns(ns); +} + +void uts_ns_init(void); +#else +static inline void get_uts_ns(struct uts_namespace *ns) +{ +} + +static inline void put_uts_ns(struct uts_namespace *ns) +{ +} + +static inline struct uts_namespace *copy_utsname(unsigned long flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns) +{ + if (flags & CLONE_NEWUTS) + return ERR_PTR(-EINVAL); + + return old_ns; +} + +static inline void uts_ns_init(void) +{ +} +#endif + +#endif /* _LINUX_UTS_NAMESPACE_H */ diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 5d34c4f0f94549..547bd4439706e6 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include enum uts_proc { UTS_PROC_ARCH, @@ -18,62 +18,6 @@ enum uts_proc { UTS_PROC_DOMAINNAME, }; -struct user_namespace; -extern struct user_namespace init_user_ns; - -struct uts_namespace { - struct new_utsname name; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct ns_common ns; -} __randomize_layout; -extern struct uts_namespace init_uts_ns; - -#ifdef CONFIG_UTS_NS -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); -} - -static inline void get_uts_ns(struct uts_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -extern struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns); -extern void free_uts_ns(struct uts_namespace *ns); - -static inline void put_uts_ns(struct uts_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_uts_ns(ns); -} - -void 
uts_ns_init(void); -#else -static inline void get_uts_ns(struct uts_namespace *ns) -{ -} - -static inline void put_uts_ns(struct uts_namespace *ns) -{ -} - -static inline struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns) -{ - if (flags & CLONE_NEWUTS) - return ERR_PTR(-EINVAL); - - return old_ns; -} - -static inline void uts_ns_init(void) -{ -} -#endif - #ifdef CONFIG_PROC_SYSCTL extern void uts_proc_notify(enum uts_proc proc); #else From b2a0b192084acd0a86d66cbbc61e17ba1f5bd583 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:01 +0200 Subject: [PATCH 2295/3206] mnt: expose pointer to init_mnt_ns There are various scenarios where we need to know whether we are in the initial set of namespaces or not, e.g., to shortcut permission checking. All namespaces expose that information. Let's do that too. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 27 ++++++++++++++++----------- include/linux/mnt_namespace.h | 2 ++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a6899844969897..f0bddc9cf2a6d5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6008,27 +6008,32 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, return ret; } +struct mnt_namespace init_mnt_ns = { + .ns.inum = PROC_MNT_INIT_INO, + .ns.ops = &mntns_operations, + .user_ns = &init_user_ns, + .ns.count = REFCOUNT_INIT(1), + .passive = REFCOUNT_INIT(1), + .mounts = RB_ROOT, + .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), +}; + static void __init init_mount_tree(void) { struct vfsmount *mnt; struct mount *m; - struct mnt_namespace *ns; struct path root; mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, true); - if (IS_ERR(ns)) - panic("Can't allocate initial namespace"); - ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); - ns->root = m; - ns->nr_mounts = 1; - mnt_add_to_ns(ns, m); - init_task.nsproxy->mnt_ns = ns; - get_mnt_ns(ns); + init_mnt_ns.root = m; + init_mnt_ns.nr_mounts = 1; + mnt_add_to_ns(&init_mnt_ns, m); + init_task.nsproxy->mnt_ns = &init_mnt_ns; + get_mnt_ns(&init_mnt_ns); root.mnt = mnt; root.dentry = mnt->mnt_root; @@ -6036,7 +6041,7 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); - ns_tree_add(ns); + ns_tree_add(&init_mnt_ns); } void __init mnt_init(void) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 70b366b6481605..6d1c4c218c14f8 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,6 +11,8 @@ struct fs_struct; struct user_namespace; struct ns_common; +extern struct mnt_namespace init_mnt_ns; + extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); From f74ca6da113d5d4b21c00bb4da3f3c137162b4fe Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:02 +0200 Subject: [PATCH 2296/3206] nscommon: move to separate file It's really awkward spilling the ns common infrastructure into multiple headers. Move it to a separate file.
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 3 +++ include/linux/proc_ns.h | 19 ------------------- kernel/Makefile | 2 +- kernel/nscommon.c | 21 +++++++++++++++++++++ 4 files changed, 25 insertions(+), 20 deletions(-) create mode 100644 kernel/nscommon.c diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7224072cccc5e6..78b17fe80b62d4 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -31,6 +31,9 @@ struct ns_common { }; }; +int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, + bool alloc_inum); + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 7f89f0829e60ba..9f21670b5824bb 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,25 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -static inline int ns_common_init(struct ns_common *ns, - const struct proc_ns_operations *ops, - bool alloc_inum) -{ - if (alloc_inum) { - int ret; - ret = proc_alloc_inum(&ns->inum); - if (ret) - return ret; - } - refcount_set(&ns->count, 1); - ns->stashed = NULL; - ns->ops = ops; - ns->ns_id = 0; - RB_CLEAR_NODE(&ns->ns_tree_node); - INIT_LIST_HEAD(&ns->ns_list_node); - return 0; -} - #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) diff --git a/kernel/Makefile b/kernel/Makefile index b807516a1b43aa..1f48f7cd2d7bb7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o nstree.o \ + kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o diff --git a/kernel/nscommon.c b/kernel/nscommon.c new file mode 100644 index 00000000000000..ebf4783d050540 --- /dev/null +++ b/kernel/nscommon.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include + +int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, + bool alloc_inum) +{ + if (alloc_inum) { + int ret; + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + } + refcount_set(&ns->count, 1); + ns->stashed = NULL; + ns->ops = ops; + ns->ns_id = 0; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); + return 0; +} From 5fc6bef178f1b644f1439e520c8f83bfc83a1252 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:03 +0200 Subject: [PATCH 2297/3206] cgroup: split namespace into separate header We have dedicated headers for all namespace types. Add one for the cgroup namespace as well. Now it's consistent for all namespace types and easy to figure out what to include. 
Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 51 +---------------------------- include/linux/cgroup_namespace.h | 56 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 50 deletions(-) create mode 100644 include/linux/cgroup_namespace.h diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9ca25346f7cb5c..5156fed8cbc3f6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,7 @@ #include #include +#include struct kernel_clone_args; @@ -783,56 +784,6 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ -struct cgroup_namespace { - struct ns_common ns; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct css_set *root_cset; -}; - -extern struct cgroup_namespace init_cgroup_ns; - -#ifdef CONFIG_CGROUPS - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -void free_cgroup_ns(struct cgroup_namespace *ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns); - -int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns); - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -static inline void put_cgroup_ns(struct cgroup_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_cgroup_ns(ns); -} - -#else /* !CONFIG_CGROUPS */ - -static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } -static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - return old_ns; -} - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } -static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } - -#endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h new file mode 100644 index 00000000000000..c02bb76c5e328b --- /dev/null +++ b/include/linux/cgroup_namespace.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CGROUP_NAMESPACE_H +#define _LINUX_CGROUP_NAMESPACE_H + +struct cgroup_namespace { + struct ns_common ns; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct css_set *root_cset; +}; + +extern struct cgroup_namespace init_cgroup_ns; + +#ifdef CONFIG_CGROUPS + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +void free_cgroup_ns(struct cgroup_namespace *ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns); + +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) +{ + refcount_inc(&ns->ns.count); +} + +static inline void put_cgroup_ns(struct cgroup_namespace *ns) +{ + if (refcount_dec_and_test(&ns->ns.count)) + free_cgroup_ns(ns); +} + +#else /* !CONFIG_CGROUPS */ + +static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } +static inline struct cgroup_namespace * +copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + return old_ns; +} + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } +static 
inline void put_cgroup_ns(struct cgroup_namespace *ns) { } + +#endif /* !CONFIG_CGROUPS */ + +#endif /* _LINUX_CGROUP_NAMESPACE_H */ From cc47f434271ba90c18c16e0bba360df38a8bc954 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:04 +0200 Subject: [PATCH 2298/3206] nsfs: add inode number for anon namespace Add an inode number for anonymous namespaces. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/uapi/linux/nsfs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 5d5bf22464c92e..e098759ec917ac 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -53,6 +53,9 @@ enum init_ns_ino { TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, MNT_NS_INIT_INO = 0xEFFFFFF8U, +#ifdef __KERNEL__ + MNT_NS_ANON_INO = 0xEFFFFFF7U, +#endif }; struct nsfs_file_handle { From 86cdbae5c61c6b8c0a2adc78dbbb0314b3254a9c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:05 +0200 Subject: [PATCH 2299/3206] mnt: simplify ns_common_init() handling Assign the reserved MNT_NS_ANON_INO sentinel to anonymous mount namespaces and clean up the initial mount ns allocation. This is just a preparatory patch and the ns->inum check in ns_common_init() will be dropped in the next patch. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 2 ++ kernel/nscommon.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index f0bddc9cf2a6d5..b2fcb901ad8c1e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4103,6 +4103,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a return ERR_PTR(-ENOMEM); } + if (anon) + new_ns->ns.inum = MNT_NS_ANON_INO; ret = ns_common_init(&new_ns->ns, &mntns_operations, !anon); if (ret) { kfree(new_ns); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index ebf4783d050540..e10fad8afe61b7 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -5,7 +5,7 @@ int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, bool alloc_inum) { - if (alloc_inum) { + if (alloc_inum && !ns->inum) { int ret; ret = proc_alloc_inum(&ns->inum); if (ret) return ret; From d5b27cb8c5f30c972e041b30bc38fa5875b1a469 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:06 +0200 Subject: [PATCH 2300/3206] net: centralize ns_common initialization Centralize ns_common initialization.
Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a57b3cda8dbc31..9df23681145470 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -409,7 +409,7 @@ static __net_init int preinit_net(struct net *net, struct user_namespace *user_n ns_ops = NULL; #endif - ret = ns_common_init(&net->ns, ns_ops, false); + ret = ns_common_init(&net->ns, ns_ops, true); if (ret) return ret; @@ -590,6 +590,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: + ns_free_inum(&net->ns); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif @@ -712,6 +713,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); + ns_free_inum(&net->ns); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); @@ -831,31 +833,12 @@ static void net_ns_net_debugfs(struct net *net) static __net_init int net_ns_net_init(struct net *net) { - int ret = 0; - - if (net == &init_net) - net->ns.inum = PROC_NET_INIT_INO; - else - ret = proc_alloc_inum(&to_ns_common(net)->inum); - if (ret) - return ret; - net_ns_net_debugfs(net); return 0; } -static __net_exit void net_ns_net_exit(struct net *net) -{ - /* - * Initial network namespace doesn't exit so we don't need any - * special checks here. - */ - ns_free_inum(&net->ns); -} - static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, - .exit = net_ns_net_exit, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { From 5612ff3ec588be09f11a9424db6d1186bcdeb3fa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:07 +0200 Subject: [PATCH 2301/3206] nscommon: simplify initialization There's a lot of information that namespace implementers don't need to know about at all. Encapsulate this all in the initialization helper. 
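The helper relies on C11 _Generic() selection (see the ns_common.h hunk below) to pick the right init-namespace instance and reserved inum per type. As an illustration only, the dispatch can be modeled in plain userspace C; every fake_* name below is invented for this sketch and is not kernel API:

#include <stdio.h>

struct fake_uts_ns { unsigned int inum; };
struct fake_ipc_ns { unsigned int inum; };

#define FAKE_UTS_INIT_INO 101u		/* stand-ins for the *_NS_INIT_INO values */
#define FAKE_IPC_INIT_INO 102u

static struct fake_uts_ns init_fake_uts;
static struct fake_ipc_ns init_fake_ipc;

/* Like ns_init_inum()/ns_init_ns(): the pointer type picks the constant. */
#define fake_init_inum(ns) _Generic((ns),		\
	struct fake_uts_ns *: FAKE_UTS_INIT_INO,	\
	struct fake_ipc_ns *: FAKE_IPC_INIT_INO)
#define fake_init_ns(ns) _Generic((ns),			\
	struct fake_uts_ns *: (void *)&init_fake_uts,	\
	struct fake_ipc_ns *: (void *)&init_fake_ipc)

static unsigned int next_inum = 1000;	/* stands in for proc_alloc_inum() */

static int __fake_common_init(unsigned int *inum, unsigned int fixed)
{
	*inum = fixed ? fixed : next_inum++;
	return 0;
}

/* Initial namespaces get their reserved inum; everything else allocates. */
#define fake_common_init(ns) \
	__fake_common_init(&(ns)->inum, \
		((void *)(ns) == fake_init_ns(ns)) ? fake_init_inum(ns) : 0)

int main(void)
{
	struct fake_uts_ns clone = { 0 };

	fake_common_init(&init_fake_uts);	/* gets 101, the reserved inum */
	fake_common_init(&clone);		/* gets 1000, "allocated" */
	printf("%u %u\n", init_fake_uts.inum, clone.inum);
	return 0;
}

The kernel macro additionally folds to_ns_common() into the same call, so implementers only ever pass their own namespace pointer.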
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 5 +++-- include/linux/ns_common.h | 39 +++++++++++++++++++++++++++++++++++++-- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/nscommon.c | 17 ++++++++--------- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- 10 files changed, 55 insertions(+), 20 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index b2fcb901ad8c1e..699b8c770c47da 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4104,8 +4104,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } if (anon) - new_ns->ns.inum = MNT_NS_ANON_INO; - ret = ns_common_init(&new_ns->ns, &mntns_operations, !anon); + ret = ns_common_init_inum(new_ns, &mntns_operations, MNT_NS_ANON_INO); + else + ret = ns_common_init(new_ns, &mntns_operations); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 78b17fe80b62d4..05c7a7dd211bd3 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -16,6 +16,15 @@ struct time_namespace; struct user_namespace; struct uts_namespace; +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -31,8 +40,7 @@ struct ns_common { }; }; -int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, - bool alloc_inum); +int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); #define to_ns_common(__ns) \ _Generic((__ns), \ @@ -45,4 +53,31 @@ int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, struct user_namespace *: &(__ns)->ns, \ struct uts_namespace *: &(__ns)->ns) +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: &init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define ns_common_init(__ns, __ops) \ + __ns_common_init(to_ns_common(__ns), __ops, (((__ns) == ns_init_ns(__ns)) ? 
ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) + #endif diff --git a/ipc/namespace.c b/ipc/namespace.c index 89588819956b20..0f8bbd18a47580 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -62,7 +62,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_common_init(&ns->ns, &ipcns_operations, true); + err = ns_common_init(ns, &ipcns_operations); if (err) goto fail_free; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 5a327914b5654e..d928c557e28b1a 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -27,7 +27,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_common_init(&new_ns->ns, &cgroupns_operations, true); + ret = ns_common_init(new_ns, &cgroupns_operations); if (ret) return ERR_PTR(ret); ns_tree_add(new_ns); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index e10fad8afe61b7..c3a90bb665ad67 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,21 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include -int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, - bool alloc_inum) +int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { - if (alloc_inum && !ns->inum) { - int ret; - ret = proc_alloc_inum(&ns->inum); - if (ret) - return ret; - } refcount_set(&ns->count, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); - return 0; + + if (inum) { + ns->inum = inum; + return 0; + } + return proc_alloc_inum(&ns->inum); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 9b327420309e8b..170757c265c299 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -103,7 +103,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_common_init(&ns->ns, &pidns_operations, true); + err = ns_common_init(ns, &pidns_operations); if (err) goto out_free_idr; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 20b65f90549e20..ce8e952104a7e7 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -97,7 +97,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns->vvar_page) goto fail_free; - err = ns_common_init(&ns->ns, &timens_operations, true); + err = ns_common_init(ns, &timens_operations); if (err) goto fail_free_page; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index cfb0e28f2779c9..db9f0463219c16 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -126,7 +126,7 @@ int create_user_ns(struct cred *new) ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_common_init(&ns->ns, &userns_operations, true); + ret = ns_common_init(ns, &userns_operations); if (ret) goto fail_free; diff --git a/kernel/utsname.c b/kernel/utsname.c index a682830742d344..399888be66bd1f 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -50,7 +50,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - err = ns_common_init(&ns->ns, &utsns_operations, true); + err = ns_common_init(ns, &utsns_operations); if (err) goto fail_free; diff --git a/net/core/net_namespace.c 
b/net/core/net_namespace.c index 9df23681145470..e50897fba8cde1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -409,7 +409,7 @@ static __net_init int preinit_net(struct net *net, struct user_namespace *user_n ns_ops = NULL; #endif - ret = ns_common_init(&net->ns, ns_ops, true); + ret = ns_common_init(net, ns_ops); if (ret) return ret; From 5d36370f34312776d202e5c35d1a786d8b07a9c3 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Wed, 17 Sep 2025 08:32:50 +0100 Subject: [PATCH 2302/3206] ALSA: compress: add raw opus codec define and opus decoder structs Adds a raw opus codec define and raw opus decoder structs. This is for raw OPUS packets not packed in any type of container (for instance an OGG container). The decoder struct fields are taken from the corresponding RFC document: RFC 7845 Section 5. Cc: Srinivas Kandagatla Cc: Vinod Koul Co-developed-by: Annemarie Porter Signed-off-by: Annemarie Porter Signed-off-by: Alexey Klimov Signed-off-by: Takashi Iwai --- include/uapi/sound/compress_params.h | 43 +++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/include/uapi/sound/compress_params.h b/include/uapi/sound/compress_params.h index bc7648a30746f4..faf4fa911f7fc2 100644 --- a/include/uapi/sound/compress_params.h +++ b/include/uapi/sound/compress_params.h @@ -43,7 +43,8 @@ #define SND_AUDIOCODEC_BESPOKE ((__u32) 0x0000000E) #define SND_AUDIOCODEC_ALAC ((__u32) 0x0000000F) #define SND_AUDIOCODEC_APE ((__u32) 0x00000010) -#define SND_AUDIOCODEC_MAX SND_AUDIOCODEC_APE +#define SND_AUDIOCODEC_OPUS_RAW ((__u32) 0x00000011) +#define SND_AUDIOCODEC_MAX SND_AUDIOCODEC_OPUS_RAW /* * Profile and modes are listed with bit masks. This allows for a @@ -324,6 +325,45 @@ struct snd_dec_ape { __u32 seek_table_present; } __attribute__((packed, aligned(4))); +/** + * struct snd_dec_opus - Opus decoder parameters (raw opus packets) + * @version: Usually should be '1' but can be split into major (4 upper bits) + * and minor (4 lower bits) sub-fields. + * @num_channels: Number of output channels. + * @pre_skip: Number of samples to discard at 48 kHz. + * @sample_rate: Sample rate of original input. + * @output_gain: Gain to apply when decoding (in Q7.8 format). + * @mapping_family: Order and meaning of output channels. Only values 0 and 1 + * are expected; values 2..255 are not recommended for playback. + * + * Optional channel mapping table. Describes mapping of opus streams to decoded + * channels. + * @struct snd_dec_opus_ch_map + * @stream_count: Number of streams encoded in each Ogg packet. + * @coupled_count: Number of streams whose decoders are used for two + * channels. + * @channel_map: Describes which decoded channel is to be used for each one. + * See RFC doc for details. + * This supports only mapping families 0 and 1, therefore the max + * number of channels is 8. + * + * These options were extracted from RFC7845 Section 5.
+ */ + +struct snd_dec_opus { + __u8 version; + __u8 num_channels; + __u16 pre_skip; + __u32 sample_rate; + __u16 output_gain; + __u8 mapping_family; + struct snd_dec_opus_ch_map { + __u8 stream_count; + __u8 coupled_count; + __u8 channel_map[8]; + } chan_map; +} __attribute__((packed, aligned(4))); + union snd_codec_options { struct snd_enc_wma wma; struct snd_enc_vorbis vorbis; @@ -334,6 +374,7 @@ union snd_codec_options { struct snd_dec_wma wma_d; struct snd_dec_alac alac_d; struct snd_dec_ape ape_d; + struct snd_dec_opus opus_d; struct { __u32 out_sample_rate; } src_d; From b07d2514b91c30ab16fdf8f9cc3523bef969becf Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Wed, 17 Sep 2025 08:32:51 +0100 Subject: [PATCH 2303/3206] ALSA: compress_offload: increase SNDRV_COMPRESS_VERSION minor version by 1 Since the addition of raw opus codec support we need to update the compress API minor version by one. Bump the SNDRV_COMPRESS_VERSION to 0.4.1. Signed-off-by: Alexey Klimov Acked-by: Vinod Koul Signed-off-by: Takashi Iwai --- include/uapi/sound/compress_offload.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/sound/compress_offload.h b/include/uapi/sound/compress_offload.h index 26f756cc2e6215..b610683fd8dbe4 100644 --- a/include/uapi/sound/compress_offload.h +++ b/include/uapi/sound/compress_offload.h @@ -13,7 +13,7 @@ #include #include -#define SNDRV_COMPRESS_VERSION SNDRV_PROTOCOL_VERSION(0, 4, 0) +#define SNDRV_COMPRESS_VERSION SNDRV_PROTOCOL_VERSION(0, 4, 1) /** * struct snd_compressed_buffer - compressed buffer * @fragment_size: size of buffer fragment in bytes From fc87f70bd133afd5b41fa8c128beb58c1ccc6e99 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Wed, 17 Sep 2025 08:32:52 +0100 Subject: [PATCH 2304/3206] ASoC: qcom: qdsp6/audioreach: add support for offloading raw opus playback Add support for OPUS module, OPUS format ID, media format payload struct and make it all recognizable by the audioreach compress playback path. At this moment this only supports raw or plain OPUS packets not encapsulated in a container (for instance an OGG container). For this use case each OPUS packet needs to be prepended with a 4-byte length field, which is expected to be done by userspace applications. This is a Qualcomm DSP-specific requirement.
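To make the userspace side of that requirement concrete, here is a minimal sketch of the framing step; the host/little-endian byte order of the prefix and the packet size bound are assumptions of this sketch, not something this commit message specifies:

#include <stdint.h>
#include <string.h>
#include <unistd.h>

#define MAX_OPUS_PKT 1500	/* assumed bound, for this sketch only */

static ssize_t write_framed_opus(int compr_fd, const uint8_t *pkt, uint32_t len)
{
	uint8_t buf[4 + MAX_OPUS_PKT];

	if (len > MAX_OPUS_PKT)
		return -1;
	memcpy(buf, &len, 4);		/* 4-byte length prefix, host order */
	memcpy(buf + 4, pkt, len);	/* the raw OPUS packet itself */
	return write(compr_fd, buf, 4 + len);
}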
Cc: Annemarie Porter Cc: Vinod Koul Co-developed-by: Srinivas Kandagatla Signed-off-by: Srinivas Kandagatla Signed-off-by: Alexey Klimov Acked-by: Mark Brown Signed-off-by: Takashi Iwai --- sound/soc/qcom/qdsp6/audioreach.c | 27 +++++++++++++++++++++++++++ sound/soc/qcom/qdsp6/audioreach.h | 17 +++++++++++++++++ sound/soc/qcom/qdsp6/q6apm-dai.c | 3 ++- sound/soc/qcom/qdsp6/q6apm.c | 3 +++ 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/sound/soc/qcom/qdsp6/audioreach.c b/sound/soc/qcom/qdsp6/audioreach.c index 4ebaaf736fb98a..2d27e33c545357 100644 --- a/sound/soc/qcom/qdsp6/audioreach.c +++ b/sound/soc/qcom/qdsp6/audioreach.c @@ -859,6 +859,7 @@ static int audioreach_set_compr_media_format(struct media_format *media_fmt_hdr, struct payload_media_fmt_aac_t *aac_cfg; struct payload_media_fmt_pcm *mp3_cfg; struct payload_media_fmt_flac_t *flac_cfg; + struct payload_media_fmt_opus_t *opus_cfg; switch (mcfg->fmt) { case SND_AUDIOCODEC_MP3: @@ -901,6 +902,32 @@ static int audioreach_set_compr_media_format(struct media_format *media_fmt_hdr, flac_cfg->min_frame_size = mcfg->codec.options.flac_d.min_frame_size; flac_cfg->max_frame_size = mcfg->codec.options.flac_d.max_frame_size; break; + case SND_AUDIOCODEC_OPUS_RAW: + media_fmt_hdr->data_format = DATA_FORMAT_RAW_COMPRESSED; + media_fmt_hdr->fmt_id = MEDIA_FMT_ID_OPUS; + media_fmt_hdr->payload_size = sizeof(*opus_cfg); + p = p + sizeof(*media_fmt_hdr); + opus_cfg = p; + /* raw opus packets prepended with 4 bytes of length */ + opus_cfg->bitstream_format = 1; + /* + * payload_type: + * 0 -- read metadata from opus stream; + * 1 -- metadata is provided by filling in the struct here. + */ + opus_cfg->payload_type = 1; + opus_cfg->version = mcfg->codec.options.opus_d.version; + opus_cfg->num_channels = mcfg->codec.options.opus_d.num_channels; + opus_cfg->pre_skip = mcfg->codec.options.opus_d.pre_skip; + opus_cfg->sample_rate = mcfg->codec.options.opus_d.sample_rate; + opus_cfg->output_gain = mcfg->codec.options.opus_d.output_gain; + opus_cfg->mapping_family = mcfg->codec.options.opus_d.mapping_family; + opus_cfg->stream_count = mcfg->codec.options.opus_d.chan_map.stream_count; + opus_cfg->coupled_count = mcfg->codec.options.opus_d.chan_map.coupled_count; + memcpy(opus_cfg->channel_mapping, mcfg->codec.options.opus_d.chan_map.channel_map, + sizeof(opus_cfg->channel_mapping)); + opus_cfg->reserved[0] = opus_cfg->reserved[1] = opus_cfg->reserved[2] = 0; + break; default: return -EINVAL; } diff --git a/sound/soc/qcom/qdsp6/audioreach.h b/sound/soc/qcom/qdsp6/audioreach.h index 61a69df4f50f6c..512ea24fd402c9 100644 --- a/sound/soc/qcom/qdsp6/audioreach.h +++ b/sound/soc/qcom/qdsp6/audioreach.h @@ -29,6 +29,7 @@ struct q6apm_graph; #define MODULE_ID_MP3_DECODE 0x0700103B #define MODULE_ID_GAPLESS 0x0700104D #define MODULE_ID_DISPLAY_PORT_SINK 0x07001069 +#define MODULE_ID_OPUS_DEC 0x07001174 #define APM_CMD_GET_SPF_STATE 0x01001021 #define APM_CMD_RSP_GET_SPF_STATE 0x02001007 @@ -255,6 +256,22 @@ struct payload_media_fmt_aac_t { uint32_t sample_rate; } __packed; +#define MEDIA_FMT_ID_OPUS 0x09001039 +struct payload_media_fmt_opus_t { + uint16_t bitstream_format; + uint16_t payload_type; + uint8_t version; + uint8_t num_channels; + uint16_t pre_skip; + uint32_t sample_rate; + uint16_t output_gain; + uint8_t mapping_family; + uint8_t stream_count; + uint8_t coupled_count; + uint8_t channel_mapping[8]; + uint8_t reserved[3]; +} __packed; + #define DATA_CMD_WR_SH_MEM_EP_EOS 0x04001002 #define WR_SH_MEM_EP_EOS_POLICY_LAST 1 #define 
WR_SH_MEM_EP_EOS_POLICY_EACH 2 diff --git a/sound/soc/qcom/qdsp6/q6apm-dai.c b/sound/soc/qcom/qdsp6/q6apm-dai.c index 09da26f712a6ad..4ecaff45c51860 100644 --- a/sound/soc/qcom/qdsp6/q6apm-dai.c +++ b/sound/soc/qcom/qdsp6/q6apm-dai.c @@ -551,10 +551,11 @@ static int q6apm_dai_compr_get_caps(struct snd_soc_component *component, caps->max_fragment_size = COMPR_PLAYBACK_MAX_FRAGMENT_SIZE; caps->min_fragments = COMPR_PLAYBACK_MIN_NUM_FRAGMENTS; caps->max_fragments = COMPR_PLAYBACK_MAX_NUM_FRAGMENTS; - caps->num_codecs = 3; + caps->num_codecs = 4; caps->codecs[0] = SND_AUDIOCODEC_MP3; caps->codecs[1] = SND_AUDIOCODEC_AAC; caps->codecs[2] = SND_AUDIOCODEC_FLAC; + caps->codecs[3] = SND_AUDIOCODEC_OPUS_RAW; return 0; } diff --git a/sound/soc/qcom/qdsp6/q6apm.c b/sound/soc/qcom/qdsp6/q6apm.c index b4ffa0f0b188e2..0e667a7eb5467b 100644 --- a/sound/soc/qcom/qdsp6/q6apm.c +++ b/sound/soc/qcom/qdsp6/q6apm.c @@ -354,6 +354,9 @@ int q6apm_set_real_module_id(struct device *dev, struct q6apm_graph *graph, case SND_AUDIOCODEC_FLAC: module_id = MODULE_ID_FLAC_DEC; break; + case SND_AUDIOCODEC_OPUS_RAW: + module_id = MODULE_ID_OPUS_DEC; + break; default: return -EINVAL; } From 7a425ec75d2bb30a1c959a8676ef8c5ef285095d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Sep 2025 12:42:56 -0300 Subject: [PATCH 2305/3206] iommufd: Fix refcounting race during mmap The owner object of the imap can be destroyed while the imap remains in the mtree. So access to the imap pointer without holding locks is racy with destruction. The imap is safe to access outside the lock once a users refcount is obtained, the owner object cannot start destruction until users is 0. Thus the users refcount should not be obtained at the end of iommufd_fops_mmap() but instead inside the mtree lock held around the mtree_load(). Move the refcount there and use refcount_inc_not_zero() as we can have a 0 refcount inside the mtree during destruction races. 
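Generically, the fixed pattern is "look up and take the reference under the same lock, skipping objects whose count already hit zero". A small userspace model of it, with illustrative names in place of the iommufd ones:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct obj {
	atomic_int users;		/* stands in for the kernel's refcount_t */
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *table_slot;		/* stands in for the mtree */

/* Equivalent of refcount_inc_not_zero(): fail once the count hit zero. */
static int inc_not_zero(atomic_int *c)
{
	int v = atomic_load(c);

	while (v != 0)
		if (atomic_compare_exchange_weak(c, &v, v + 1))
			return 1;
	return 0;
}

static struct obj *lookup_and_get(void)
{
	struct obj *o;

	pthread_mutex_lock(&table_lock);
	o = table_slot;			/* the mtree_load() analogue */
	if (o && !inc_not_zero(&o->users))
		o = NULL;		/* found, but already being destroyed */
	pthread_mutex_unlock(&table_lock);
	return o;			/* caller now owns a reference, or NULL */
}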
Link: https://patch.msgid.link/r/0-v1-e6faace50971+3cc-iommufd_mmap_fix_jgg@nvidia.com Cc: stable@vger.kernel.org Fixes: 56e9a0d8e53f ("iommufd: Add mmap interface") Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/main.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 15af7ced0501d6..a9d4decc8ba165 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -550,16 +550,23 @@ static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_EXEC) return -EPERM; + mtree_lock(&ictx->mt_mmap); /* vma->vm_pgoff carries a page-shifted start position to an immap */ immap = mtree_load(&ictx->mt_mmap, vma->vm_pgoff << PAGE_SHIFT); - if (!immap) + if (!immap || !refcount_inc_not_zero(&immap->owner->users)) { + mtree_unlock(&ictx->mt_mmap); return -ENXIO; + } + mtree_unlock(&ictx->mt_mmap); + /* * mtree_load() returns the immap for any contained mmio_addr, so only * allow the exact immap thing to be mapped */ - if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length) - return -ENXIO; + if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length) { + rc = -ENXIO; + goto err_refcount; + } vma->vm_pgoff = 0; vma->vm_private_data = immap; @@ -570,10 +577,11 @@ static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma) immap->mmio_addr >> PAGE_SHIFT, length, vma->vm_page_prot); if (rc) - return rc; + goto err_refcount; + return 0; - /* vm_ops.open won't be called for mmap itself. */ - refcount_inc(&immap->owner->users); +err_refcount: + refcount_dec(&immap->owner->users); return rc; } From 4e034bf045b12852a24d5d33f2451850818ba0c1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Sep 2025 20:47:10 -0300 Subject: [PATCH 2306/3206] iommufd: Fix race during abort for file descriptors fput() doesn't actually call file_operations release() synchronously, it puts the file on a work queue and it will be released eventually. This is normally fine, except for iommufd the file and the iommufd_object are tied together. The file has the object as its private_data and holds a users refcount, while the object is expected to remain alive as long as the file is. When the allocation of a new object aborts before installing the file it will fput() the file and then go on to immediately kfree() the obj. This causes a UAF once the workqueue completes the fput() and tries to decrement the users refcount. Fix this by putting the core code in charge of the file lifetime, and call __fput_sync() during abort to ensure that release() is called before kfree(). __fput_sync() is a bit too tricky to open code in all the object implementations. Instead the objects tell the core code where the file pointer is and the core will take care of the life cycle. If the object is successfully allocated then the file will hold a users refcount and the iommufd_object cannot be destroyed. It is worth noting that close(); ioctl(IOMMU_DESTROY); doesn't have an issue because close() is already using a synchronous version of fput().
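As a toy userspace model of the ordering bug (deferred_fput() and run_workqueue() stand in for fput() and its workqueue; none of these names are the real API):

#include <stdlib.h>

struct object { int users; };
struct file_model { struct object *obj; };

static struct file_model *pending;	/* stands in for the fput() workqueue */

static void release(struct file_model *f)
{
	f->obj->users--;		/* release() dereferences the object */
}

static void deferred_fput(struct file_model *f) { pending = f; }
static void run_workqueue(void) { if (pending) release(pending); }

static void abort_buggy(struct object *obj, struct file_model *f)
{
	deferred_fput(f);		/* release() has not run yet... */
	free(obj);			/* ...so run_workqueue() later UAFs obj */
}

static void abort_fixed(struct object *obj, struct file_model *f)
{
	release(f);			/* the __fput_sync() analogue: runs now */
	free(obj);			/* safe: release() already finished */
}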
The UAF looks like this: BUG: KASAN: slab-use-after-free in iommufd_eventq_fops_release+0x45/0xc0 drivers/iommu/iommufd/eventq.c:376 Write of size 4 at addr ffff888059c97804 by task syz.0.46/6164 CPU: 0 UID: 0 PID: 6164 Comm: syz.0.46 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/18/2025 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xcd/0x630 mm/kasan/report.c:482 kasan_report+0xe0/0x110 mm/kasan/report.c:595 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0x100/0x1b0 mm/kasan/generic.c:189 instrument_atomic_read_write include/linux/instrumented.h:96 [inline] atomic_fetch_sub_release include/linux/atomic/atomic-instrumented.h:400 [inline] __refcount_dec include/linux/refcount.h:455 [inline] refcount_dec include/linux/refcount.h:476 [inline] iommufd_eventq_fops_release+0x45/0xc0 drivers/iommu/iommufd/eventq.c:376 __fput+0x402/0xb70 fs/file_table.c:468 task_work_run+0x14d/0x240 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop+0xeb/0x110 kernel/entry/common.c:43 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline] syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline] syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline] do_syscall_64+0x41c/0x4c0 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f Link: https://patch.msgid.link/r/1-v1-02cd136829df+31-iommufd_syz_fput_jgg@nvidia.com Cc: stable@vger.kernel.org Fixes: 07838f7fd529 ("iommufd: Add iommufd fault object") Reviewed-by: Nicolin Chen Reviewed-by: Nirmoy Das Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Reported-by: syzbot+80620e2d0d0a33b09f93@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/68c8583d.050a0220.2ff435.03a2.GAE@google.com Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/eventq.c | 9 ++------- drivers/iommu/iommufd/main.c | 35 +++++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c index fc4de63b0bce64..e23d9ee4fe3806 100644 --- a/drivers/iommu/iommufd/eventq.c +++ b/drivers/iommu/iommufd/eventq.c @@ -393,12 +393,12 @@ static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name, const struct file_operations *fops) { struct file *filep; - int fdno; spin_lock_init(&eventq->lock); INIT_LIST_HEAD(&eventq->deliver); init_waitqueue_head(&eventq->wait_queue); + /* The filep is fput() by the core code during failure */ filep = anon_inode_getfile(name, fops, eventq, O_RDWR); if (IS_ERR(filep)) return PTR_ERR(filep); @@ -408,10 +408,7 @@ static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name, eventq->filep = filep; refcount_inc(&eventq->obj.users); - fdno = get_unused_fd_flags(O_CLOEXEC); - if (fdno < 0) - fput(filep); - return fdno; + return get_unused_fd_flags(O_CLOEXEC); } static const struct file_operations iommufd_fault_fops = @@ -452,7 +449,6 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) return 0; out_put_fdno: put_unused_fd(fdno); - fput(fault->common.filep); return rc; } @@ -536,7 +532,6 @@ int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd) out_put_fdno: put_unused_fd(fdno); - fput(veventq->common.filep); out_abort: iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj); out_unlock_veventqs: 
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index a9d4decc8ba165..88be2e15724501 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -23,6 +23,7 @@ #include "iommufd_test.h" struct iommufd_object_ops { + size_t file_offset; void (*pre_destroy)(struct iommufd_object *obj); void (*destroy)(struct iommufd_object *obj); void (*abort)(struct iommufd_object *obj); @@ -131,10 +132,30 @@ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj) { - if (iommufd_object_ops[obj->type].abort) - iommufd_object_ops[obj->type].abort(obj); + const struct iommufd_object_ops *ops = &iommufd_object_ops[obj->type]; + + if (ops->file_offset) { + struct file **filep = ((void *)obj) + ops->file_offset; + + /* + * A file should hold a users refcount while the file is open + * and put it back in its release. The file should hold a + * pointer to obj in their private data. Normal fput() is + * deferred to a workqueue and can get out of order with the + * following kfree(obj). Using the sync version ensures the + * release happens immediately. During abort we require the file + * refcount is one at this point - meaning the object alloc + * function cannot do anything to allow another thread to take a + * refcount prior to a guaranteed success. + */ + if (*filep) + __fput_sync(*filep); + } + + if (ops->abort) + ops->abort(obj); else - iommufd_object_ops[obj->type].destroy(obj); + ops->destroy(obj); iommufd_object_abort(ictx, obj); } @@ -659,6 +680,12 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx) } EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, "IOMMUFD"); +#define IOMMUFD_FILE_OFFSET(_struct, _filep, _obj) \ + .file_offset = (offsetof(_struct, _filep) + \ + BUILD_BUG_ON_ZERO(!__same_type( \ + struct file *, ((_struct *)NULL)->_filep)) + \ + BUILD_BUG_ON_ZERO(offsetof(_struct, _obj))) + static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_ACCESS] = { .destroy = iommufd_access_destroy_object, @@ -669,6 +696,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { }, [IOMMUFD_OBJ_FAULT] = { .destroy = iommufd_fault_destroy, + IOMMUFD_FILE_OFFSET(struct iommufd_fault, common.filep, common.obj), }, [IOMMUFD_OBJ_HW_QUEUE] = { .destroy = iommufd_hw_queue_destroy, @@ -691,6 +719,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_VEVENTQ] = { .destroy = iommufd_veventq_destroy, .abort = iommufd_veventq_abort, + IOMMUFD_FILE_OFFSET(struct iommufd_veventq, common.filep, common.obj), }, [IOMMUFD_OBJ_VIOMMU] = { .destroy = iommufd_viommu_destroy, From 53d0584eeb2c85a46c83656246d61a89558d74b3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 17 Sep 2025 15:59:59 -0300 Subject: [PATCH 2307/3206] iommufd: WARN if an object is aborted with an elevated refcount If something holds a refcount then it is at risk of UAFing. For abort paths we expect the caller to never share the object with a parallel thread and to clean up any refcounts it obtained on its own. Add the missing dec inside iommufd_hwpt_paging_alloc() during error unwind by making iommufd_hw_pagetable_attach/detach() proper pairs. 
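Condensed, the invariant the abort path now enforces looks like the sketch below; remove_from_lookup() is an illustrative stand-in for the xarray removal, not a real helper:

#include <linux/bug.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct some_obj {
	refcount_t users;
};

void remove_from_lookup(struct some_obj *obj);	/* illustrative stand-in */

static void some_obj_abort(struct some_obj *obj)
{
	remove_from_lookup(obj);	/* no new references are possible now */

	/* The aborting thread must hold the only remaining reference. */
	if (WARN_ON(!refcount_dec_and_test(&obj->users)))
		return;			/* leak rather than risk a UAF */

	kfree(obj);
}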
Link: https://patch.msgid.link/r/2-v1-02cd136829df+31-iommufd_syz_fput_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 3 ++- drivers/iommu/iommufd/iommufd_private.h | 3 +-- drivers/iommu/iommufd/main.c | 4 ++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 65fbd098f9e98f..4c842368289f08 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -711,6 +711,8 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); mutex_unlock(&igroup->lock); + iommufd_hw_pagetable_put(idev->ictx, hwpt); + /* Caller must destroy hwpt */ return hwpt; } @@ -1057,7 +1059,6 @@ void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid) hwpt = iommufd_hw_pagetable_detach(idev, pasid); if (!hwpt) return; - iommufd_hw_pagetable_put(idev->ictx, hwpt); refcount_dec(&idev->obj.users); } EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, "IOMMUFD"); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 0da2a81eedfa8b..627f9b78483a0e 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -454,9 +454,8 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); - lockdep_assert_not_held(&hwpt_paging->ioas->mutex); - if (hwpt_paging->auto_domain) { + lockdep_assert_not_held(&hwpt_paging->ioas->mutex); iommufd_object_put_and_try_destroy(ictx, &hwpt->obj); return; } diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 88be2e15724501..ce775fbbae94e7 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -122,6 +122,10 @@ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) old = xas_store(&xas, NULL); xa_unlock(&ictx->objects); WARN_ON(old != XA_ZERO_ENTRY); + + if (WARN_ON(!refcount_dec_and_test(&obj->users))) + return; + kfree(obj); } From 43f6bee02196e56720dd68eea847d213c6e69328 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 17 Sep 2025 16:55:51 -0300 Subject: [PATCH 2308/3206] iommufd/selftest: Update the fail_nth limit There are more failure conditions now so 400 iterations is not enough to pass them all, up it to 1000. The limit exists so it doesn't loop forever. Link: https://patch.msgid.link/r/3-v1-02cd136829df+31-iommufd_syz_fput_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd_fail_nth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 651fc9f13c0810..45c14323a6183c 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -113,7 +113,7 @@ static bool fail_nth_next(struct __test_metadata *_metadata, * necessarily mean a test failure, just that the limit has to be made * bigger.
*/ - ASSERT_GT(400, nth_state->iteration); + ASSERT_GT(1000, nth_state->iteration); if (nth_state->iteration != 0) { ssize_t res; ssize_t res2;
From 92d051a1c1e30d6f8655a3fab91e19fdad72040b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 19 Sep 2025 14:56:43 +0100 Subject: [PATCH 2309/3206] arm64: Kconfig: Spell out "ARMv9.4" in menuconfig text The menuconfig entries to configure various architectural features are all formatted as "ARMvx.y architecture features" with the unusual exception of 9.4, which omits the "ARM" prefix. Add the "ARM" prefix to the menuconfig entry for the ARMv9.4 architectural features. Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64d..514038b18ebaea 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2218,7 +2218,7 @@ config ARM64_HAFT endmenu # "ARMv8.9 architectural features" -menu "v9.4 architectural features" +menu "ARMv9.4 architectural features" config ARM64_GCS bool "Enable support for Guarded Control Stack (GCS)" @@ -2237,7 +2237,7 @@ config ARM64_GCS The feature is detected at runtime, and will remain disabled if the system does not implement the feature. -endmenu # "v9.4 architectural features" +endmenu # "ARMv9.4 architectural features" config ARM64_SVE bool "ARM Scalable Vector Extension support"
From 87cab86925b7fa4c1c977bc191ac549a3b23f0ea Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 19 Sep 2025 15:02:35 +0100 Subject: [PATCH 2310/3206] ASoC: Intel: sof_sdw: Prevent jump to NULL add_sidecar callback In create_sdw_dailink() check that sof_end->codec_info->add_sidecar is not NULL before calling it. The original code assumed that if include_sidecar is true, the codec on that link has an add_sidecar callback. But there could be other codecs on the same link that do not have an add_sidecar callback. Fixes: da5244180281 ("ASoC: Intel: sof_sdw: Add callbacks to register sidecar devices") Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20250919140235.1071941-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 28f03a5f29f741..c013e31d098e71 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -841,7 +841,7 @@ static int create_sdw_dailink(struct snd_soc_card *card, (*codec_conf)++; } - if (sof_end->include_sidecar) { + if (sof_end->include_sidecar && sof_end->codec_info->add_sidecar) { ret = sof_end->codec_info->add_sidecar(card, dai_links, codec_conf); if (ret) return ret;
From 7a4f92d39f66f890cbb157dd4d7daf6a9298324a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Tue, 16 Sep 2025 10:29:04 +0200 Subject: [PATCH 2311/3206] fs: replace use of system_unbound_wq with system_dfl_wq Currently, if a user enqueues a work item using schedule_delayed_work(), the workqueue used is "system_wq" (a per-CPU wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same applies to schedule_work(), which uses system_wq, while queue_work() again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_unbound_wq should be the default workqueue, so as not to enforce locality constraints for random work whenever it's not required. Add system_dfl_wq to encourage its use when unbound work should be used.
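For callers the conversion is mechanical; a sketch with a hypothetical work item (demo_fn and demo_work are illustrative names, not part of the patch):

#include <linux/workqueue.h>

static void demo_fn(struct work_struct *work)
{
	/* runs on an unbound worker, wherever the scheduler places it */
}
static DECLARE_WORK(demo_work, demo_fn);

static void demo_enqueue(void)
{
	/* was: queue_work(system_unbound_wq, &demo_work); */
	queue_work(system_dfl_wq, &demo_work);
}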
The old system_unbound_wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/20250916082906.77439-2-marco.crivellari@suse.com Signed-off-by: Christian Brauner --- fs/afs/callback.c | 4 ++-- fs/afs/write.c | 2 +- fs/bcachefs/btree_write_buffer.c | 2 +- fs/bcachefs/io_read.c | 8 ++++---- fs/bcachefs/journal_io.c | 2 +- fs/btrfs/block-group.c | 2 +- fs/btrfs/extent_map.c | 2 +- fs/btrfs/space-info.c | 4 ++-- fs/btrfs/zoned.c | 2 +- fs/coredump.c | 2 +- fs/ext4/mballoc.c | 2 +- fs/netfs/misc.c | 2 +- fs/netfs/objects.c | 2 +- fs/nfsd/filecache.c | 2 +- fs/notify/mark.c | 4 ++-- fs/quota/dquot.c | 2 +- 16 files changed, 22 insertions(+), 22 deletions(-) diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 69e1dd55b16010..894d2bad6b6cec 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -42,7 +42,7 @@ static void afs_volume_init_callback(struct afs_volume *volume) list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) { if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) { afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb); - queue_work(system_unbound_wq, &vnode->cb_work); + queue_work(system_dfl_wq, &vnode->cb_work); } } @@ -90,7 +90,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas if (reason != afs_cb_break_for_deleted && vnode->status.type == AFS_FTYPE_FILE && atomic_read(&vnode->cb_nr_mmap)) - queue_work(system_unbound_wq, &vnode->cb_work); + queue_work(system_dfl_wq, &vnode->cb_work); trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true); } else { diff --git a/fs/afs/write.c b/fs/afs/write.c index 2e7526ea883ae2..93ad86ff33453f 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -172,7 +172,7 @@ static void afs_issue_write_worker(struct work_struct *work) void afs_issue_write(struct netfs_io_subrequest *subreq) { subreq->work.func = afs_issue_write_worker; - if (!queue_work(system_unbound_wq, &subreq->work)) + if (!queue_work(system_dfl_wq, &subreq->work)) WARN_ON_ONCE(1); } diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 4b095235a0d221..0afb44ce1a8562 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -827,7 +827,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ if (bch2_btree_write_buffer_should_flush(c) && __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && - !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) + !queue_work(system_dfl_wq, &c->btree_write_buffer.flush_work)) enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); if (dst->wb == &wb->flushing) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index e0874ad9a6cf24..460e2e6341f1ce 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -684,7 +684,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); + RBIO_CONTEXT_UNBOUND, system_dfl_wq); } else { rbio = bch2_rbio_free(rbio); @@ -921,10 +921,10 @@ static void __bch2_read_endio(struct work_struct *work) bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); goto out; decompression_err: - bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_dfl_wq); goto out; decrypt_err: - 
bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_dfl_wq); goto out; } @@ -963,7 +963,7 @@ static void bch2_read_endio(struct bio *bio) rbio->promote || crc_is_compressed(rbio->pick.crc) || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; + context = RBIO_CONTEXT_UNBOUND, wq = system_dfl_wq; else if (rbio->pick.crc.csum_type) context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9e028dbcc3d02d..29bea8e0e49541 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1362,7 +1362,7 @@ int bch2_journal_read(struct bch_fs *c, BCH_DEV_READ_REF_journal_read)) closure_call(&ca->journal.read, bch2_journal_read_device, - system_unbound_wq, + system_dfl_wq, &jlist.cl); else degraded = true; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9bf282d2453c02..9a0af7e4a935c5 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2031,7 +2031,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) btrfs_reclaim_sweep(fs_info); spin_lock(&fs_info->unused_bgs_lock); if (!list_empty(&fs_info->reclaim_bgs)) - queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); + queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work); spin_unlock(&fs_info->unused_bgs_lock); } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 57f52585a6dde9..9a5a497edc97ae 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1372,7 +1372,7 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) return; - queue_work(system_unbound_wq, &fs_info->em_shrinker_work); + queue_work(system_dfl_wq, &fs_info->em_shrinker_work); } void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 0481c693ac2eaf..c573d80550adf2 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1830,7 +1830,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, flush, "enospc"); - queue_work(system_unbound_wq, async_work); + queue_work(system_dfl_wq, async_work); } } else { list_add_tail(&ticket.list, @@ -1847,7 +1847,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, need_preemptive_reclaim(fs_info, space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); - queue_work(system_unbound_wq, + queue_work(system_dfl_wq, &fs_info->preempt_reclaim_work); } } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 245e813ecd785a..0f493db7ae44c2 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2488,7 +2488,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, refcount_inc(&eb->refs); bg->last_eb = eb; INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); - queue_work(system_unbound_wq, &bg->zone_finish_work); + queue_work(system_dfl_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) diff --git a/fs/coredump.c b/fs/coredump.c index fedbead956ed16..b22f99cb6818ce 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -635,7 +635,7 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) /* * Usermode helpers are childen of either - * system_unbound_wq or of kthreadd. 
So we know that + * system_dfl_wq or of kthreadd. So we know that * we're starting off with a clean file descriptor * table. So we should always be able to use * COREDUMP_PIDFD_NUMBER as our file descriptor value. diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5898d92ba19f14..8b18802e83ebd2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3995,7 +3995,7 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) list_splice_tail(&freed_data_list, &sbi->s_discard_list); spin_unlock(&sbi->s_md_lock); if (wake) - queue_work(system_unbound_wq, &sbi->s_discard_work); + queue_work(system_dfl_wq, &sbi->s_discard_work); } else { list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) kmem_cache_free(ext4_free_data_cachep, entry); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 20748bcfbf5902..486166460e177d 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -321,7 +321,7 @@ void netfs_wake_collector(struct netfs_io_request *rreq) { if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { - queue_work(system_unbound_wq, &rreq->work); + queue_work(system_dfl_wq, &rreq->work); } else { trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); wake_up(&rreq->waitq); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e8c99738b5bbf2..2ebe56b24ddd10 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -163,7 +163,7 @@ void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace dead = __refcount_dec_and_test(&rreq->ref, &r); trace_netfs_rreq_ref(debug_id, r - 1, what); if (dead) - WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work)); + WARN_ON(!queue_work(system_dfl_wq, &rreq->cleanup_work)); } } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 732abf6b92a569..85ca663c052c1f 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -113,7 +113,7 @@ static void nfsd_file_schedule_laundrette(void) { if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags)) - queue_delayed_work(system_unbound_wq, &nfsd_filecache_laundrette, + queue_delayed_work(system_dfl_wq, &nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY); } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 798340db69d761..55a03bb05aa118 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -428,7 +428,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); - queue_work(system_unbound_wq, &connector_reaper_work); + queue_work(system_dfl_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about @@ -439,7 +439,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, + queue_delayed_work(system_dfl_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index df4a9b34876965..afa15a21453822 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -881,7 +881,7 @@ void dqput(struct dquot *dquot) put_releasing_dquots(dquot); atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); - queue_delayed_work(system_unbound_wq, "a_release_work, 1); + queue_delayed_work(system_dfl_wq, "a_release_work, 1); } EXPORT_SYMBOL(dqput); From 4ef64db060619be040351e3960e151e5fef3f895 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Tue, 16 Sep 2025 
10:29:05 +0200 Subject: [PATCH 2312/3206] fs: replace use of system_wq with system_percpu_wq Currently, if a user enqueues a work item using schedule_delayed_work(), the workqueue used is "system_wq" (a per-CPU wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same applies to schedule_work(), which uses system_wq, while queue_work() again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_wq is a per-CPU workqueue, yet nothing in its name indicates that CPU-affinity constraint, which is very often not required by users. Make this clear by adding a system_percpu_wq across the fs subsystem. The old wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/20250916082906.77439-3-marco.crivellari@suse.com Signed-off-by: Christian Brauner --- fs/aio.c | 2 +- fs/fs-writeback.c | 2 +- fs/fuse/dev.c | 2 +- fs/fuse/inode.c | 2 +- fs/nfs/namespace.c | 2 +- fs/nfs/nfs4renewd.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index 7fc7b6221312c3..6002617f078c6f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -636,7 +636,7 @@ static void free_ioctx_reqs(struct percpu_ref *ref) /* Synchronize against RCU protected table->table[] dereferences */ INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); - queue_rcu_work(system_wq, &ctx->free_rwork); + queue_rcu_work(system_percpu_wq, &ctx->free_rwork); } /*
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641d7..cf51a265bf2712 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2442,7 +2442,7 @@ static int dirtytime_interval_handler(const struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) - mod_delayed_work(system_wq, &dirtytime_work, 0); + mod_delayed_work(system_percpu_wq, &dirtytime_work, 0); return ret; }
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e80cd8f2c049f9..8520eb94c527a3 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -119,7 +119,7 @@ void fuse_check_timeout(struct work_struct *work) goto abort_conn; out: - queue_delayed_work(system_wq, &fc->timeout.work, + queue_delayed_work(system_percpu_wq, &fc->timeout.work, fuse_timeout_timer_freq); return;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ecb869e895ab1d..b12cd19a9bcae8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1273,7 +1273,7 @@ static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout) { fc->timeout.req_timeout = secs_to_jiffies(timeout); INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout); - queue_delayed_work(system_wq, &fc->timeout.work, + queue_delayed_work(system_percpu_wq, &fc->timeout.work, fuse_timeout_timer_freq); }
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 7f1ec9c67ff21d..f9a3a1fbf44ce8 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -335,7 +335,7 @@ static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp) num *= HZ; *((int *)kp->arg) = num; if (!list_empty(&nfs_automount_list)) - mod_delayed_work(system_wq, &nfs_automount_task, num); + mod_delayed_work(system_percpu_wq, &nfs_automount_task, num); } else { *((int *)kp->arg) = -1*HZ; cancel_delayed_work(&nfs_automount_task);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index db3811af079691..18ae614e5a6c39 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -122,7 +122,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) timeout = 5 *
HZ; dprintk("%s: requeueing work. Lease period = %ld\n", __func__, (timeout + HZ - 1) / HZ); - mod_delayed_work(system_wq, &clp->cl_renewd, timeout); + mod_delayed_work(system_percpu_wq, &clp->cl_renewd, timeout); set_bit(NFS_CS_RENEWD, &clp->cl_res_state); spin_unlock(&clp->cl_lock); }
From 69635d7f4b344e6f5344bba3c3de92e4fb8b0d2a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Tue, 16 Sep 2025 10:29:06 +0200 Subject: [PATCH 2313/3206] fs: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if a user enqueues a work item using schedule_delayed_work(), the workqueue used is "system_wq" (a per-CPU wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same applies to schedule_work(), which uses system_wq, while queue_work() again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This patch adds a new WQ_PERCPU flag to all the fs subsystem users to explicitly request the use of the per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly.
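A sketch of the opt-in for a per-CPU user (demo_wq is a hypothetical caller, not one of the converted filesystems):

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static int demo_init(void)
{
	/* A flags argument of "0" used to imply per-CPU; now say so. */
	demo_wq = alloc_workqueue("demo_wq", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
	if (!demo_wq)
		return -ENOMEM;
	return 0;
}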
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/20250916082906.77439-4-marco.crivellari@suse.com Signed-off-by: Christian Brauner --- fs/afs/main.c | 4 ++-- fs/bcachefs/super.c | 10 +++++----- fs/btrfs/disk-io.c | 2 +- fs/ceph/super.c | 2 +- fs/dlm/lowcomms.c | 2 +- fs/dlm/main.c | 2 +- fs/fs-writeback.c | 2 +- fs/gfs2/main.c | 5 +++-- fs/gfs2/ops_fstype.c | 6 ++++-- fs/ocfs2/dlm/dlmdomain.c | 3 ++- fs/ocfs2/dlmfs/dlmfs.c | 3 ++- fs/smb/client/cifsfs.c | 16 +++++++++++----- fs/smb/server/ksmbd_work.c | 2 +- fs/smb/server/transport_rdma.c | 3 ++- fs/super.c | 3 ++- fs/verity/verify.c | 2 +- fs/xfs/xfs_log.c | 3 +-- fs/xfs/xfs_mru_cache.c | 3 ++- fs/xfs/xfs_super.c | 15 ++++++++------- 19 files changed, 51 insertions(+), 37 deletions(-) diff --git a/fs/afs/main.c b/fs/afs/main.c index 02475d415d885e..e6bb8237db989a 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -169,13 +169,13 @@ static int __init afs_init(void) printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); - afs_wq = alloc_workqueue("afs", 0, 0); + afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0); if (!afs_wq) goto error_afs_wq; afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!afs_async_calls) goto error_async; - afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); + afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!afs_lock_manager) goto error_lockmgr; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c46b1053a02c90..f2417d298b844f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -801,13 +801,13 @@ int bch2_fs_init_rw(struct bch_fs *c) if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_PERCPU, 1)) || !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", - WQ_FREEZABLE, 0))) + WQ_FREEZABLE|WQ_PERCPU, 0))) return bch_err_throw(c, ENOMEM_fs_other_alloc); int ret = bch2_fs_btree_interior_update_init(c) ?: @@ -975,7 +975,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, sizeof(struct sort_iter_set); if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 512)) || enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, bch2_writes_disabled) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 70fc4e7cc5a0e6..aa4393eba99779 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1958,7 +1958,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; - unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE; + unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU; fs_info->workers = btrfs_alloc_workqueue(fs_info, 
"worker", flags, max_active, 16); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c3eb651862c555..2dc64515f259d9 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -862,7 +862,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0); if (!fsc->inode_wq) goto fail_client; - fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); + fsc->cap_wq = alloc_workqueue("ceph-cap", WQ_PERCPU, 1); if (!fsc->cap_wq) goto fail_inode_wq; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index e4373bce1bc239..9a0b6c2b6b01e4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1703,7 +1703,7 @@ static int work_start(void) return -ENOMEM; } - process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH, 0); + process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH | WQ_PERCPU, 0); if (!process_workqueue) { log_print("can't start dlm_process"); destroy_workqueue(io_workqueue); diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 4887c8a05318dd..a44d16da7187c8 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -52,7 +52,7 @@ static int __init init_dlm(void) if (error) goto out_user; - dlm_wq = alloc_workqueue("dlm_wq", 0, 0); + dlm_wq = alloc_workqueue("dlm_wq", WQ_PERCPU, 0); if (!dlm_wq) { error = -ENOMEM; goto out_plock; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cf51a265bf2712..4b1a53a3266bb3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1180,7 +1180,7 @@ void cgroup_writeback_umount(struct super_block *sb) static int __init cgroup_writeback_init(void) { - isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); + isw_wq = alloc_workqueue("inode_switch_wbs", WQ_PERCPU, 0); if (!isw_wq) return -ENOMEM; return 0; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 0727f60ad02883..9d65719353faad 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -151,7 +151,8 @@ static int __init init_gfs2_fs(void) error = -ENOMEM; gfs2_recovery_wq = alloc_workqueue("gfs2_recovery", - WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, + 0); if (!gfs2_recovery_wq) goto fail_wq1; @@ -160,7 +161,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_control_wq) goto fail_wq2; - gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0); + gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", WQ_PERCPU, 0); if (!gfs2_freeze_wq) goto fail_wq3; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index efe99b73255137..05f936cc5db78f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1193,13 +1193,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) error = -ENOMEM; sdp->sd_glock_wq = alloc_workqueue("gfs2-glock/%s", - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0, + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE | WQ_PERCPU, + 0, sdp->sd_fsname); if (!sdp->sd_glock_wq) goto fail_iput; sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname); + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, 0, + sdp->sd_fsname); if (!sdp->sd_delete_wq) goto fail_glock_wq; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2018501b224937..2347a50f079b7b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1876,7 +1876,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) dlm_debug_init(dlm); snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); - dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0); + dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM 
| WQ_PERCPU, + 0); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 5130ec44e5e158..0b730535b2c880 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -595,7 +595,8 @@ static int __init init_dlmfs_fs(void) } cleanup_inode = 1; - user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0); + user_dlm_worker = alloc_workqueue("user_dlm", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!user_dlm_worker) { status = -ENOMEM; goto bail; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 3bd85ab2deb192..7882b034686332 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1881,7 +1881,9 @@ init_cifs(void) cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n"); } - cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + cifsiod_wq = alloc_workqueue("cifsiod", + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cifsiod_wq) { rc = -ENOMEM; goto out_clean_proc; @@ -1909,28 +1911,32 @@ init_cifs(void) } cifsoplockd_wq = alloc_workqueue("cifsoplockd", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cifsoplockd_wq) { rc = -ENOMEM; goto out_destroy_fileinfo_put_wq; } deferredclose_wq = alloc_workqueue("deferredclose", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!deferredclose_wq) { rc = -ENOMEM; goto out_destroy_cifsoplockd_wq; } serverclose_wq = alloc_workqueue("serverclose", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!serverclose_wq) { rc = -ENOMEM; goto out_destroy_deferredclose_wq; } cfid_put_wq = alloc_workqueue("cfid_put_wq", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cfid_put_wq) { rc = -ENOMEM; goto out_destroy_serverclose_wq; diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c index 72b00ca6e45517..4a71f46d7020be 100644 --- a/fs/smb/server/ksmbd_work.c +++ b/fs/smb/server/ksmbd_work.c @@ -78,7 +78,7 @@ int ksmbd_work_pool_init(void) int ksmbd_workqueue_init(void) { - ksmbd_wq = alloc_workqueue("ksmbd-io", 0, 0); + ksmbd_wq = alloc_workqueue("ksmbd-io", WQ_PERCPU, 0); if (!ksmbd_wq) return -ENOMEM; return 0; diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 8d366db5f60547..9394b358e79555 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2177,7 +2177,8 @@ int ksmbd_rdma_init(void) * for lack of credits */ smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq", - WQ_HIGHPRI | WQ_MEM_RECLAIM, 0); + WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!smb_direct_wq) return -ENOMEM; diff --git a/fs/super.c b/fs/super.c index 7f876f32343ad4..5cb3f41e42cad0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -2314,7 +2314,8 @@ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", - WQ_MEM_RECLAIM, 0, + WQ_MEM_RECLAIM | WQ_PERCPU, + 0, sb->s_id); if (!wq) return -ENOMEM; diff --git a/fs/verity/verify.c b/fs/verity/verify.c index a1f00c3fd3b276..628c23710f9fb9 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -355,7 +355,7 @@ void __init fsverity_init_workqueue(void) * latency on ARM64. 
*/ fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue", - WQ_HIGHPRI, + WQ_HIGHPRI | WQ_PERCPU, num_online_cpus()); if (!fsverity_read_workqueue) panic("failed to allocate fsverity_read_queue"); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c8a57e21a1d3e0..0da19472d60309 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1489,8 +1489,7 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | - WQ_HIGHPRI), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU), 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 866c71d9fbaed4..73b7e72944e47f 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -293,7 +293,8 @@ int xfs_mru_cache_init(void) { xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a770d..43e9aab7161031 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -578,19 +578,19 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_unwritten; @@ -602,13 +602,14 @@ xfs_init_mount_workqueues( goto out_destroy_reclaim; mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_inodegc_wq) goto out_destroy_blockgc; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", - XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0, + mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_inodegc; @@ -2596,8 +2597,8 @@ xfs_init_workqueues(void) * AGs in all the filesystems mounted. Hence use the default large * max_active value for this workqueue. 
*/ - xfs_alloc_wq = alloc_workqueue("xfsalloc", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0); + xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 0); if (!xfs_alloc_wq) return -ENOMEM; From 224ef741ce87aa6474b82e0eb76e0e8e1bafe544 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:46 +0200 Subject: [PATCH 2314/3206] ns: add reference count helpers Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 45 ++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 19833ac547f9af..65e258e1fdc6dd 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -43,16 +43,24 @@ struct ns_common { int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns)->ns, \ - struct ipc_namespace *: &(__ns)->ns, \ - struct mnt_namespace *: &(__ns)->ns, \ - struct net *: &(__ns)->ns, \ - struct pid_namespace *: &(__ns)->ns, \ - struct time_namespace *: &(__ns)->ns, \ - struct user_namespace *: &(__ns)->ns, \ - struct uts_namespace *: &(__ns)->ns) +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + const struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + const struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + const struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + const struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + const struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + const struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + const struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns, \ + const struct uts_namespace *: &(__ns)->ns) #define ns_init_inum(__ns) \ _Generic((__ns), \ @@ -83,4 +91,21 @@ void __ns_common_free(struct ns_common *ns); #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) +{ + return refcount_dec_and_test(&ns->count); +} + +static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) +{ + return refcount_inc_not_zero(&ns->count); +} + +#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->count) +#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->count) +#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) +#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) +#define ns_ref_put_and_lock(__ns, __lock) \ + refcount_dec_and_lock(&to_ns_common((__ns))->count, (__lock)) + #endif From 2e9e6972279fec7dd352ff05abe596e55988ec41 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:47 +0200 Subject: [PATCH 2315/3206] mnt: port to ns_ref_*() helpers Stop accessing ns.count directly. 
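The helpers from the previous patch make such ports one-liners; a sketch of the resulting idiom (uts is chosen here only as an example type; free_uts_ns() is its existing destructor):

static inline void demo_get_uts(struct uts_namespace *ns)
{
	ns_ref_inc(ns);		/* to_ns_common() selects the embedded ns_common */
}

static inline void demo_put_uts(struct uts_namespace *ns)
{
	if (ns_ref_put(ns))	/* true once the count drops to zero */
		free_uts_ns(ns);
}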
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/mount.h | 2 +- fs/namespace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/mount.h b/fs/mount.h index 76bf863c9ae281..79c85639a7ba0e 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -143,7 +143,7 @@ static inline void detach_mounts(struct dentry *dentry) static inline void get_mnt_ns(struct mnt_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } extern seqlock_t mount_lock;
diff --git a/fs/namespace.c b/fs/namespace.c index b9f94769ec11e1..9109069d85cd8e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2110,7 +2110,7 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr * the mount namespace and it might already be on its * deathbed. */ - if (!refcount_inc_not_zero(&mntns->ns.count)) + if (!ns_ref_get(mntns)) continue; return mntns; @@ -6084,7 +6084,7 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - if (!refcount_dec_and_test(&ns->ns.count)) + if (!ns_ref_put(ns)) return; namespace_lock(); emptied_ns = ns;
From be5f21d3985f00827e09b798f7a07ebd6dd7f54a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:08 +0200 Subject: [PATCH 2316/3206] ns: add ns_common_free() And drop ns_free_inum(). Anything common that can be freed centrally should be freed in the new common helper. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- include/linux/ns_common.h | 3 +++ include/linux/proc_ns.h | 2 -- ipc/namespace.c | 4 ++-- kernel/cgroup/namespace.c | 2 +- kernel/nscommon.c | 5 +++++ kernel/pid_namespace.c | 4 ++-- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 4 ++-- kernel/utsname.c | 2 +- net/core/net_namespace.c | 4 ++-- 11 files changed, 21 insertions(+), 15 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c index 699b8c770c47da..b9f94769ec11e1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4082,7 +4082,7 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) - ns_free_inum(&ns->ns); + ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } @@ -4154,7 +4154,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); - ns_free_inum(&new_ns->ns); + ns_common_free(new_ns); dec_mnt_namespaces(new_ns->ucounts); mnt_ns_release(new_ns); return ERR_CAST(new);
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 05c7a7dd211bd3..19833ac547f9af 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -41,6 +41,7 @@ struct ns_common { }; int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); +void __ns_common_free(struct ns_common *ns); #define to_ns_common(__ns) \ _Generic((__ns), \ @@ -80,4 +81,6 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, #define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) +#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) + #endif
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 9f21670b5824bb..08016f6e0e6fb1 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,8 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -#define ns_free_inum(ns) proc_free_inum((ns)->inum) #define
get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) #endif /* _LINUX_PROC_NS_H */ diff --git a/ipc/namespace.c b/ipc/namespace.c index 0f8bbd18a47580..09d261a1a2aa4f 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -97,7 +97,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, fail_put: put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kfree(ns); fail_dec: @@ -161,7 +161,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); kfree(ns); } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index d928c557e28b1a..16ead750837109 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -40,7 +40,7 @@ void free_cgroup_ns(struct cgroup_namespace *ns) put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c3a90bb665ad67..7c1b07e2a6c9ed 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -18,3 +18,8 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, } return proc_alloc_inum(&ns->inum); } + +void __ns_common_free(struct ns_common *ns) +{ + proc_free_inum(ns->inum); +} diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 170757c265c299..27e2dd9ee0515c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -127,7 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns return ns; out_free_inum: - ns_free_inum(&ns->ns); + ns_common_free(ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); @@ -152,7 +152,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_tree_remove(ns); unregister_pidns_sysctls(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index ce8e952104a7e7..d49c73015d6ec3 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -255,7 +255,7 @@ void free_time_ns(struct time_namespace *ns) ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index db9f0463219c16..32406bcab526fa 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -165,7 +165,7 @@ int create_user_ns(struct cred *new) #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: @@ -220,7 +220,7 @@ static void free_user_ns(struct work_struct *work) #endif retire_userns_sysctls(ns); key_free_user_ns(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. 
*/ kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); diff --git a/kernel/utsname.c b/kernel/utsname.c index 399888be66bd1f..95d733eb2c985b 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -98,7 +98,7 @@ void free_uts_ns(struct uts_namespace *ns) ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e50897fba8cde1..a6a3de56a81caa 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -590,7 +590,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: - ns_free_inum(&net->ns); + ns_common_free(net); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif @@ -713,7 +713,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); - ns_free_inum(&net->ns); + ns_common_free(net); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); From 06099e374f3ab818f0501671b21493ba2e1b94b9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:48 +0200 Subject: [PATCH 2317/3206] cgroup: port to ns_ref_*() helpers Stop accessing ns.count directly. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h index c02bb76c5e328b..b7dbf4d623d26b 100644 --- a/include/linux/cgroup_namespace.h +++ b/include/linux/cgroup_namespace.h @@ -29,12 +29,12 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, static inline void get_cgroup_ns(struct cgroup_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_cgroup_ns(ns); } From d4825c99d6a738c565d5142ce37369368a4352da Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:49 +0200 Subject: [PATCH 2318/3206] ipc: port to ns_ref_*() helpers Stop accessing ns.count directly. 
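ipc is the one user that needs the dec-and-lock variant: the final put must drop the count and take mq_lock in a single atomic step so no lookup can revive the namespace in between. A sketch of the pattern (demo_lock and demo_teardown are placeholders, not the ipc code):

#include <linux/ipc_namespace.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

static void demo_teardown(struct ipc_namespace *ns)
{
	/* work that must run while demo_lock is still held */
}

static void demo_put(struct ipc_namespace *ns)
{
	/* The lock is taken only when the count actually reaches zero. */
	if (ns_ref_put_and_lock(ns, &demo_lock)) {
		demo_teardown(ns);
		spin_unlock(&demo_lock);
	}
}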
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ipc_namespace.h | 4 ++-- ipc/namespace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 924e4754374fba..21eff63f47dae1 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -140,14 +140,14 @@ extern struct ipc_namespace *copy_ipcs(unsigned long flags, static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) { if (ns) { - if (refcount_inc_not_zero(&ns->ns.count)) + if (ns_ref_get(ns)) return ns; } diff --git a/ipc/namespace.c b/ipc/namespace.c index 09d261a1a2aa4f..bd85d1c9d2c213 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -199,7 +199,7 @@ static void free_ipc(struct work_struct *unused) */ void put_ipc_ns(struct ipc_namespace *ns) { - if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { + if (ns_ref_put_and_lock(ns, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); From 07897b38eadf5a370a6001790239f23036d5b970 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:50 +0200 Subject: [PATCH 2319/3206] pid: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 2 +- kernel/pid_namespace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index ba0efc8c8596b3..5b2f29d369c476 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -62,7 +62,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 27e2dd9ee0515c..162f5fb63d75a8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -169,7 +169,7 @@ static void destroy_pid_namespace_work(struct work_struct *work) parent = ns->parent; destroy_pid_namespace(ns); ns = parent; - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); + } while (ns != &init_pid_ns && ns_ref_put(ns)); } struct pid_namespace *copy_pid_ns(unsigned long flags, @@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns != &init_pid_ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); From e0c173f1fa02c0b08720aa8aa0cc91c3063146ae Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:51 +0200 Subject: [PATCH 2320/3206] time: port to ns_ref_*() helpers Stop accessing ns.count directly. 
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index a47a4ce4183e1d..f3b9567cf1f4fc 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -44,7 +44,7 @@ extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -57,7 +57,7 @@ struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_time_ns(ns); }
From 96d997ea5ad1911cc393ffdb5c928b532f2f921a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:52 +0200 Subject: [PATCH 2321/3206] user: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/user_namespace.h | 4 ++-- kernel/user_namespace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a09056ad090e1a..9a9aebbf96b9f2 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -176,7 +176,7 @@ static inline struct user_namespace *to_user_ns(struct ns_common *ns) static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -186,7 +186,7 @@ extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { - if (ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns_ref_put(ns)) __put_user_ns(ns); }
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 32406bcab526fa..f9df45c4623525 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -225,7 +225,7 @@ static void free_user_ns(struct work_struct *work) kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; - } while (refcount_dec_and_test(&parent->ns.count)); + } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns)
From 83914de1c1d39dca4a3196a03bcd64d0a861d551 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:53 +0200 Subject: [PATCH 2322/3206] net-sysfs: use check_net() Don't directly access the namespace count. There's even a dedicated helper for this.
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- net/core/net-sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c28cd66654447d..3c2dc4c5e683ea 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1328,7 +1328,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) struct netdev_rx_queue *queue = &dev->_rx[i]; struct kobject *kobj = &queue->kobj; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -2061,7 +2061,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) queue->kobj.uevent_suppress = 1; if (netdev_uses_bql(dev)) @@ -2315,7 +2315,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!refcount_read(&dev_net(ndev)->ns.count)) + if (!check_net(dev_net(ndev))) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj);
From dc41b844da530e94f5b8384deb2af602cbeb312a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:54 +0200 Subject: [PATCH 2323/3206] net: use check_net() Don't directly access the namespace count. There's even a dedicated helper for this. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a6a3de56a81caa..d5e3fd81916316 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -315,7 +315,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; - if (refcount_read(&net->ns.count) == 0) + if (!check_net(net)) return NETNSA_NSID_NOT_ASSIGNED; spin_lock(&net->nsid_lock);
From f12021e68a13f4f867b8d55212254f1f83f75e00 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:55 +0200 Subject: [PATCH 2324/3206] ipv4: use check_net() Don't directly access the namespace count. There's even a dedicated helper for this. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- net/ipv4/inet_timewait_sock.c | 4 ++-- net/ipv4/tcp_metrics.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 875ff923a8ed05..56a117560c0cbb 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -329,13 +329,13 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo) TCPF_NEW_SYN_RECV)) continue; - if (refcount_read(&sock_net(sk)->ns.count)) + if (check_net(sock_net(sk))) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (refcount_read(&sock_net(sk)->ns.count)) { + if (check_net(sock_net(sk))) { sock_gen_put(sk); goto restart; }
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03c068ea27b6ad..b67f94c60f9ffc 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -912,7 +912,7 @@ static void tcp_metrics_flush_all(struct net *net) spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ?
net_eq(tm_net(tm), net) : - !refcount_read(&tm_net(tm)->ns.count); + !check_net(tm_net(tm)); if (match) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); From 2438b7d63ad866d6b2bb7b8d3455a6365d9b0fbe Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:56 +0200 Subject: [PATCH 2325/3206] uts: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/uts_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h index c2b619bb4e573a..23b4f0e1b33848 100644 --- a/include/linux/uts_namespace.h +++ b/include/linux/uts_namespace.h @@ -25,7 +25,7 @@ static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) static inline void get_uts_ns(struct uts_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } extern struct uts_namespace *copy_utsname(unsigned long flags, @@ -34,7 +34,7 @@ extern void free_uts_ns(struct uts_namespace *ns); static inline void put_uts_ns(struct uts_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_uts_ns(ns); } From 99d33ce100cbc982647c9299cadb1277cfad503e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:57 +0200 Subject: [PATCH 2326/3206] net: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/net/net_namespace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fd090ceb80bf48..3e7c825e581046 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -270,7 +270,7 @@ static inline struct net *to_net_ns(struct ns_common *ns) /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { - refcount_inc(&net->ns.count); + ns_ref_inc(net); return net; } @@ -281,7 +281,7 @@ static inline struct net *maybe_get_net(struct net *net) * exists. If the reference count is zero this * function fails and returns NULL. */ - if (!refcount_inc_not_zero(&net->ns.count)) + if (!ns_ref_get(net)) net = NULL; return net; } @@ -289,7 +289,7 @@ static inline struct net *maybe_get_net(struct net *net) /* Try using put_net_track() instead */ static inline void put_net(struct net *net) { - if (refcount_dec_and_test(&net->ns.count)) + if (ns_ref_put(net)) __put_net(net); } @@ -301,7 +301,7 @@ int net_eq(const struct net *net1, const struct net *net2) static inline int check_net(const struct net *net) { - return refcount_read(&net->ns.count) != 0; + return ns_ref_read(net) != 0; } void net_drop_ns(void *); From b3d8ff0679507a21c1917f6ae0b56ed9b9c625f6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:58 +0200 Subject: [PATCH 2327/3206] nsfs: port to ns_ref_*() helpers Stop accessing ns.count directly. 
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/nsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 8484bc4dd3debc..dc0a4404b9718e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -492,7 +492,7 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); - if (!refcount_inc_not_zero(&ns->count)) + if (!__ns_ref_get(ns)) return NULL; } From 024596a4e2802e457a9f92af79f246fa9631f8de Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:59 +0200 Subject: [PATCH 2328/3206] ns: rename to __ns_ref Make the count easier to grep for by renaming ns.count to __ns_ref. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- include/linux/ns_common.h | 12 ++++++------ init/version-timestamp.c | 2 +- ipc/msgutil.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/nscommon.c | 2 +- kernel/pid.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user.c | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 9109069d85cd8e..740a6ba524d0ce 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6015,7 +6015,7 @@ struct mnt_namespace init_mnt_ns = { .ns.inum = PROC_MNT_INIT_INO, .ns.ops = &mntns_operations, .user_ns = &init_user_ns, - .ns.count = REFCOUNT_INIT(1), + .ns.__ns_ref = REFCOUNT_INIT(1), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 65e258e1fdc6dd..aea8528d799a60 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -29,7 +29,7 @@ struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; - refcount_t count; + refcount_t __ns_ref; /* do not use directly */ union { struct { u64 ns_id; @@ -93,19 +93,19 @@ void __ns_common_free(struct ns_common *ns); static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - return refcount_dec_and_test(&ns->count); + return refcount_dec_and_test(&ns->__ns_ref); } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - return refcount_inc_not_zero(&ns->count); + return refcount_inc_not_zero(&ns->__ns_ref); } -#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->count) -#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->count) +#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) +#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) #define ns_ref_put_and_lock(__ns, __lock) \ - refcount_dec_and_lock(&to_ns_common((__ns))->count, (__lock)) + refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) #endif diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 043cbf80a766de..547e522e6016c1 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,7 +8,7 @@ #include struct uts_namespace init_uts_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, diff --git a/ipc/msgutil.c b/ipc/msgutil.c index bbf61275df41e1..d0f7dcf4c20840 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -27,7 +27,7 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS.
*/ struct ipc_namespace init_ipc_ns = { - .ns.count = REFCOUNT_INIT(1), + .ns.__ns_ref = REFCOUNT_INIT(1), .user_ns = &init_user_ns, .ns.inum = PROC_IPC_INIT_INO, #ifdef CONFIG_IPC_NS diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 092e6bf081ed1f..a0e24adceef005 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -219,7 +219,7 @@ static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_D /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 7c1b07e2a6c9ed..7aa2be6a0c32a7 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -5,7 +5,7 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { - refcount_set(&ns->count, 1); + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; diff --git a/kernel/pid.c b/kernel/pid.c index c45a28c16cd256..e222426f745dd2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,7 +71,7 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index d49c73015d6ec3..d70bdfb7b0014a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -480,7 +480,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = PROC_TIME_INIT_INO, .ns.ops = &timens_operations, diff --git a/kernel/user.c b/kernel/user.c index f46b1d41163b20..17a742fb4e10ef 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,7 +65,7 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.count = REFCOUNT_INIT(3), + .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .ns.inum = PROC_USER_INIT_INO, From d093090ea79979ca7198b03bb2c31cd518b80b59 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Sep 2025 12:01:38 +0200 Subject: [PATCH 2329/3206] selftests/namespaces: verify initial namespace inode numbers Make sure that all works correctly. 
Signed-off-by: Christian Brauner --- tools/testing/selftests/namespaces/.gitignore | 1 + tools/testing/selftests/namespaces/Makefile | 2 +- .../selftests/namespaces/init_ino_test.c | 61 +++++++++++++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/namespaces/init_ino_test.c diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index 7639dbf58bbf87..ccfb40837a7333 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -1,2 +1,3 @@ nsid_test file_handle_test +init_ino_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index f6c117ce2c2b80..5fe4b3dc07d3ed 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -TEST_GEN_PROGS := nsid_test file_handle_test +TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test include ../lib.mk diff --git a/tools/testing/selftests/namespaces/init_ino_test.c b/tools/testing/selftests/namespaces/init_ino_test.c new file mode 100644 index 00000000000000..5b6993c3740b7d --- /dev/null +++ b/tools/testing/selftests/namespaces/init_ino_test.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2025 Christian Brauner + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +struct ns_info { + const char *name; + const char *proc_path; + unsigned int expected_ino; +}; + +static struct ns_info namespaces[] = { + { "ipc", "/proc/1/ns/ipc", IPC_NS_INIT_INO }, + { "uts", "/proc/1/ns/uts", UTS_NS_INIT_INO }, + { "user", "/proc/1/ns/user", USER_NS_INIT_INO }, + { "pid", "/proc/1/ns/pid", PID_NS_INIT_INO }, + { "cgroup", "/proc/1/ns/cgroup", CGROUP_NS_INIT_INO }, + { "time", "/proc/1/ns/time", TIME_NS_INIT_INO }, + { "net", "/proc/1/ns/net", NET_NS_INIT_INO }, + { "mnt", "/proc/1/ns/mnt", MNT_NS_INIT_INO }, +}; + +TEST(init_namespace_inodes) +{ + struct stat st; + + for (int i = 0; i < sizeof(namespaces) / sizeof(namespaces[0]); i++) { + int ret = stat(namespaces[i].proc_path, &st); + + /* Some namespaces might not be available (e.g., time namespace on older kernels) */ + if (ret < 0) { + if (errno == ENOENT) { + ksft_test_result_skip("%s namespace not available\n", + namespaces[i].name); + continue; + } + ASSERT_GE(ret, 0) + TH_LOG("Failed to stat %s: %s", + namespaces[i].proc_path, strerror(errno)); + } + + ASSERT_EQ(st.st_ino, namespaces[i].expected_ino) + TH_LOG("Namespace %s has inode 0x%lx, expected 0x%x", + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); + + ksft_print_msg("Namespace %s: inode 0x%lx matches expected 0x%x\n", + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); + } +} + +TEST_HARNESS_MAIN From 7cf730321132e726ff949c6f3c0d5c598788f7a2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Sep 2025 11:29:49 +0200 Subject: [PATCH 2330/3206] ns: use inode initializer for initial namespaces Just use the common helper we have. 
Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- init/version-timestamp.c | 2 +- ipc/msgutil.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/pid.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 740a6ba524d0ce..271cd6294c8a73 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6012,7 +6012,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, } struct mnt_namespace init_mnt_ns = { - .ns.inum = PROC_MNT_INIT_INO, + .ns.inum = ns_init_inum(&init_mnt_ns), .ns.ops = &mntns_operations, .user_ns = &init_user_ns, .ns.__ns_ref = REFCOUNT_INIT(1), diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 547e522e6016c1..376b7c856d4d03 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -18,7 +18,7 @@ struct uts_namespace init_uts_ns = { .domainname = UTS_DOMAINNAME, }, .user_ns = &init_user_ns, - .ns.inum = PROC_UTS_INIT_INO, + .ns.inum = ns_init_inum(&init_uts_ns), #ifdef CONFIG_UTS_NS .ns.ops = &utsns_operations, #endif diff --git a/ipc/msgutil.c b/ipc/msgutil.c index d0f7dcf4c20840..dca6c8ec8f5ff9 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -29,7 +29,7 @@ DEFINE_SPINLOCK(mq_lock); struct ipc_namespace init_ipc_ns = { .ns.__ns_ref = REFCOUNT_INIT(1), .user_ns = &init_user_ns, - .ns.inum = PROC_IPC_INIT_INO, + .ns.inum = ns_init_inum(&init_ipc_ns), #ifdef CONFIG_IPC_NS .ns.ops = &ipcns_operations, #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a0e24adceef005..245b43ff2fa481 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -222,7 +222,7 @@ struct cgroup_namespace init_cgroup_ns = { .ns.__ns_ref = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, - .ns.inum = PROC_CGROUP_INIT_INO, + .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, }; diff --git a/kernel/pid.c b/kernel/pid.c index e222426f745dd2..7e8c66e0bf676f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -77,7 +77,7 @@ struct pid_namespace init_pid_ns = { .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .ns.inum = PROC_PID_INIT_INO, + .ns.inum = ns_init_inum(&init_pid_ns), #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index d70bdfb7b0014a..7aa4d6fedd4961 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -482,7 +482,7 @@ const struct proc_ns_operations timens_for_children_operations = { struct time_namespace init_time_ns = { .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, - .ns.inum = PROC_TIME_INIT_INO, + .ns.inum = ns_init_inum(&init_time_ns), .ns.ops = &timens_operations, .frozen_offsets = true, }; diff --git a/kernel/user.c b/kernel/user.c index 17a742fb4e10ef..b2a53674d506f8 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -68,7 +68,7 @@ struct user_namespace init_user_ns = { .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .ns.inum = PROC_USER_INIT_INO, + .ns.inum = ns_init_inum(&init_user_ns), #ifdef CONFIG_USER_NS .ns.ops = &userns_operations, #endif From bba920e6f803138587248079de47ad3464a396f6 Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Thu, 18 Sep 2025 18:02:02 +0530 Subject: [PATCH 2331/3206] HID: amd_sfh: Add sync across amd sfh work functions Processing of a report is delegated across different work functions. Hence, add a sync mechanism to protect SFH work data across these functions.
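A minimal sketch of the locking pattern the change applies (illustrative only; sfh_example_work() is a hypothetical stand-in for the real work functions, guard() comes from <linux/cleanup.h>):

	static void sfh_example_work(struct amd_mp2_dev *mp2)
	{
		/* acquires mp2->lock; released automatically on every return path */
		guard(mutex)(&mp2->lock);

		/* ... operate on the shared request list and report data ... */
	}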
Fixes: 4b2c53d93a4b ("SFH:Transport Driver to add support of AMD Sensor Fusion Hub (SFH)") Reported-by: Matthew Schwartz Closes: https://lore.kernel.org/all/a21abca5-4268-449d-95f1-bdd7a25894a5@linux.dev/ Tested-by: Prakruthi SP Co-developed-by: Akshata MukundShetty Signed-off-by: Akshata MukundShetty Signed-off-by: Basavaraj Natikar Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Jiri Kosina --- drivers/hid/amd-sfh-hid/amd_sfh_client.c | 12 ++++++++++-- drivers/hid/amd-sfh-hid/amd_sfh_common.h | 3 +++ drivers/hid/amd-sfh-hid/amd_sfh_pcie.c | 4 ++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_client.c b/drivers/hid/amd-sfh-hid/amd_sfh_client.c index 0f2cbae39b2bb4..7017bfa590931b 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_client.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_client.c @@ -39,8 +39,12 @@ int amd_sfh_get_report(struct hid_device *hid, int report_id, int report_type) struct amdtp_hid_data *hid_data = hid->driver_data; struct amdtp_cl_data *cli_data = hid_data->cli_data; struct request_list *req_list = &cli_data->req_list; + struct amd_input_data *in_data = cli_data->in_data; + struct amd_mp2_dev *mp2; int i; + mp2 = container_of(in_data, struct amd_mp2_dev, in_data); + guard(mutex)(&mp2->lock); for (i = 0; i < cli_data->num_hid_devices; i++) { if (cli_data->hid_sensor_hubs[i] == hid) { struct request_list *new = kzalloc(sizeof(*new), GFP_KERNEL); @@ -75,6 +79,8 @@ void amd_sfh_work(struct work_struct *work) u8 report_id, node_type; u8 report_size = 0; + mp2 = container_of(in_data, struct amd_mp2_dev, in_data); + guard(mutex)(&mp2->lock); req_node = list_last_entry(&req_list->list, struct request_list, list); list_del(&req_node->list); current_index = req_node->current_index; @@ -83,7 +89,6 @@ void amd_sfh_work(struct work_struct *work) node_type = req_node->report_type; kfree(req_node); - mp2 = container_of(in_data, struct amd_mp2_dev, in_data); mp2_ops = mp2->mp2_ops; if (node_type == HID_FEATURE_REPORT) { report_size = mp2_ops->get_feat_rep(sensor_index, report_id, @@ -107,6 +112,8 @@ void amd_sfh_work(struct work_struct *work) cli_data->cur_hid_dev = current_index; cli_data->sensor_requested_cnt[current_index] = 0; amdtp_hid_wakeup(cli_data->hid_sensor_hubs[current_index]); + if (!list_empty(&req_list->list)) + schedule_delayed_work(&cli_data->work, 0); } void amd_sfh_work_buffer(struct work_struct *work) @@ -117,9 +124,10 @@ void amd_sfh_work_buffer(struct work_struct *work) u8 report_size; int i; + mp2 = container_of(in_data, struct amd_mp2_dev, in_data); + guard(mutex)(&mp2->lock); for (i = 0; i < cli_data->num_hid_devices; i++) { if (cli_data->sensor_sts[i] == SENSOR_ENABLED) { - mp2 = container_of(in_data, struct amd_mp2_dev, in_data); report_size = mp2->mp2_ops->get_in_rep(i, cli_data->sensor_idx[i], cli_data->report_id[i], in_data); hid_input_report(cli_data->hid_sensor_hubs[i], HID_INPUT_REPORT, diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_common.h b/drivers/hid/amd-sfh-hid/amd_sfh_common.h index f44a3bb2fbd4fe..78f830c133e5cd 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_common.h +++ b/drivers/hid/amd-sfh-hid/amd_sfh_common.h @@ -10,6 +10,7 @@ #ifndef AMD_SFH_COMMON_H #define AMD_SFH_COMMON_H +#include #include #include "amd_sfh_hid.h" @@ -59,6 +60,8 @@ struct amd_mp2_dev { u32 mp2_acs; struct sfh_dev_status dev_en; struct work_struct work; + /* mp2 to protect data */ + struct mutex lock; u8 init_done; u8 rver; }; diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c b/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c index 
2983af969579ea..1d9f955573aa43 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c @@ -466,6 +466,10 @@ static int amd_mp2_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i if (!privdata->cl_data) return -ENOMEM; + rc = devm_mutex_init(&pdev->dev, &privdata->lock); + if (rc) + return rc; + privdata->sfh1_1_ops = (const struct amd_sfh1_1_ops *)id->driver_data; if (privdata->sfh1_1_ops) { if (boot_cpu_data.x86 >= 0x1A) From 3c54e6027f14c4f54be5508af748f6cc2fd72f89 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Sep 2025 06:14:11 -0700 Subject: [PATCH 2332/3206] xfs: constify xfs_errortag_random_default This table is never modified, so mark it const. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ac895cd2bc0a01..39830b252ac88d 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -19,7 +19,7 @@ #define XFS_ERRTAG(_tag, _name, _default) \ [XFS_ERRTAG_##_tag] = (_default), #include "xfs_errortag.h" -static unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS }; +static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS }; #undef XFS_ERRTAG struct xfs_errortag_attr { From daf4c2929fb792d24af0cd7bb6ca1f2949190fa4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:34 -0700 Subject: [PATCH 2333/3206] bpf: bpf_verifier_state->cleaned flag instead of REG_LIVE_DONE Prepare for bpf_reg_state->live field removal by introducing a separate flag to track if clean_verifier_state() had been applied to the state. No functional changes. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-1-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/log.c | 6 ++---- kernel/bpf/verifier.c | 13 ++++--------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 020de62bd09cdd..ac16da8b49dc1c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -45,7 +45,6 @@ enum bpf_reg_liveness { REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ - REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; #define ITER_PREFIX "bpf_iter_" @@ -445,6 +444,7 @@ struct bpf_verifier_state { bool speculative; bool in_sleepable; + bool cleaned; /* first and last insn idx of this verifier state */ u32 first_insn_idx; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index e4983c1303e760..0d6d7bfb2fd05e 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -545,14 +545,12 @@ static char slot_type_char[] = { static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) - verbose(env, "_"); + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + verbose(env, "_"); if (live & REG_LIVE_READ) verbose(env, "r"); if (live & REG_LIVE_WRITTEN) verbose(env, "w"); - if (live & REG_LIVE_DONE) - verbose(env, "D"); } #define UNUM_MAX_DECIMAL U16_MAX diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aef6b266f08d1f..47cec5c8abff79 100644 --- a/kernel/bpf/verifier.c +++ 
b/kernel/bpf/verifier.c @@ -1758,6 +1758,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return err; dst_state->speculative = src->speculative; dst_state->in_sleepable = src->in_sleepable; + dst_state->cleaned = src->cleaned; dst_state->curframe = src->curframe; dst_state->branches = src->branches; dst_state->parent = src->parent; @@ -3589,11 +3590,6 @@ static int mark_reg_read(struct bpf_verifier_env *env, /* if read wasn't screened by an earlier write ... */ if (writes && state->live & REG_LIVE_WRITTEN) break; - if (verifier_bug_if(parent->live & REG_LIVE_DONE, env, - "type %s var_off %lld off %d", - reg_type_str(env, parent->type), - parent->var_off.value, parent->off)) - return -EFAULT; /* The first condition is more likely to be true than the * second, checked it first. */ @@ -18501,7 +18497,6 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < BPF_REG_FP; i++) { live = st->regs[i].live; /* liveness must not touch this register anymore */ - st->regs[i].live |= REG_LIVE_DONE; if (!(live & REG_LIVE_READ)) /* since the register is unused, clear its state * to make further comparison simpler */ @@ -18512,7 +18507,6 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { live = st->stack[i].spilled_ptr.live; /* liveness must not touch this stack slot anymore */ - st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; if (!(live & REG_LIVE_READ)) { __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) @@ -18526,6 +18520,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env, { int i; + st->cleaned = true; for (i = 0; i <= st->curframe; i++) clean_func_state(env, st->frame[i]); } @@ -18553,7 +18548,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * their final liveness marks are already propagated. * Hence when the verifier completes the search of state list in is_state_visited() * we can call this clean_live_states() function to mark all liveness states - * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' + * as st->cleaned to indicate that 'parent' pointers of 'struct bpf_reg_state' * will not be used. This function also clears the registers and stack for states that !READ * to simplify state merging. @@ -18576,7 +18571,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) continue; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) + if (sl->state.cleaned) /* all regs in this state in all frames were already marked */ continue; if (incomplete_read_marks(env, &sl->state)) From 6cd21eb9adc924237a6f398a7f6c9f3da251df71 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:35 -0700 Subject: [PATCH 2334/3206] bpf: use compute_live_registers() info in clean_func_state Prepare for bpf_reg_state->live field removal by leveraging insn_aux_data->live_regs_before, computed by compute_live_registers(), instead of bpf_reg_state->live in clean_func_state(). This is similar to logic in func_states_equal(). No changes in verification performance for selftests or sched_ext.
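A rough sketch of the query this enables (hypothetical helper, not part of the patch): register liveness becomes a plain bit test on the precomputed per-instruction mask rather than a walk of bpf_reg_state->live marks:

	static bool reg_live_before(struct bpf_verifier_env *env, u32 insn_idx, u32 regno)
	{
		u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before;

		return live_regs & BIT(regno);	/* one bit per BPF register */
	}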
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-2-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 47cec5c8abff79..64186ea0839bed 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18489,15 +18489,16 @@ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) } static void clean_func_state(struct bpf_verifier_env *env, - struct bpf_func_state *st) + struct bpf_func_state *st, + u32 ip) { + u16 live_regs = env->insn_aux_data[ip].live_regs_before; enum bpf_reg_liveness live; int i, j; for (i = 0; i < BPF_REG_FP; i++) { - live = st->regs[i].live; /* liveness must not touch this register anymore */ - if (!(live & REG_LIVE_READ)) + if (!(live_regs & BIT(i))) /* since the register is unused, clear its state * to make further comparison simpler */ @@ -18518,11 +18519,13 @@ static void clean_func_state(struct bpf_verifier_env *env, static void clean_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { - int i; + int i, ip; st->cleaned = true; - for (i = 0; i <= st->curframe; i++) - clean_func_state(env, st->frame[i]); + for (i = 0; i <= st->curframe; i++) { + ip = frame_insn_idx(st, i); + clean_func_state(env, st->frame[i], ip); + } } /* the parentage chains form a tree. From 12a23f93a50dad7f820ca4326c7e289e7e13fb9f Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:36 -0700 Subject: [PATCH 2335/3206] bpf: remove redundant REG_LIVE_READ check in stacksafe() stacksafe() is called in exact == NOT_EXACT mode only for states that had been processed by clean_verifier_state(). The latter replaces dead stack spills with a series of STACK_INVALID masks. Such masks are already handled by stacksafe().
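For reference, the handling that makes the removed branch redundant is the existing skip of invalid slot bytes, visible as a context line in the hunk below:

	if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
		continue;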
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-3-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 64186ea0839bed..74a96a0d6c8a12 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18785,13 +18785,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, cur->stack[spi].slot_type[i % BPF_REG_SIZE])) return false; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) - && exact == NOT_EXACT) { - i += BPF_REG_SIZE - 1; - /* explored state didn't use this */ - continue; - } - if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; From 3b20d3c120bae1e18ee11aa04531b161743db682 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:37 -0700 Subject: [PATCH 2336/3206] bpf: declare a few utility functions as internal api Namely, rename the following functions and add prototypes to bpf_verifier.h: - find_containing_subprog -> bpf_find_containing_subprog - insn_successors -> bpf_insn_successors - calls_callback -> bpf_calls_callback - fmt_stack_mask -> bpf_fmt_stack_mask Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-4-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ kernel/bpf/verifier.c | 34 ++++++++++++++++------------------ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ac16da8b49dc1c..93563564bde594 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1065,4 +1065,9 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno); +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); +int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); +bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 74a96a0d6c8a12..921a5fa06df7a3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2979,7 +2979,7 @@ static int cmp_subprogs(const void *a, const void *b) } /* Find subprogram that contains instruction at 'off' */ -static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off) +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *vals = env->subprog_info; int l, r, m; @@ -3004,7 +3004,7 @@ static int find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; - p = find_containing_subprog(env, off); + p = bpf_find_containing_subprog(env, off); if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; @@ -4211,7 +4211,7 @@ static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) } } /* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ -static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) { DECLARE_BITMAP(mask, 64); bool first = true; @@ -4266,8 +4266,6 @@ static void bt_sync_linked_regs(struct 
backtrack_state *bt, struct bpf_jmp_histo } } -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); - /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -4294,7 +4292,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); verbose(env, "mark_precise: frame%d: regs=%s ", bt->frame, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); verbose_insn(env, insn); @@ -4495,7 +4493,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * backtracking, as these registers are set by the function * invoking callback. */ - if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) + if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx)) for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { @@ -4934,7 +4932,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, bt_frame_reg_mask(bt, fr)); verbose(env, "mark_precise: frame%d: parent state regs=%s ", fr, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_stack_mask(bt, fr)); verbose(env, "stack=%s: ", env->tmp_str_buf); print_verifier_state(env, st, fr, true); @@ -11023,7 +11021,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) "At callback return", "R0"); return -EINVAL; } - if (!calls_callback(env, callee->callsite)) { + if (!bpf_calls_callback(env, callee->callsite)) { verifier_bug(env, "in callback at %d, callsite %d !calls_callback", *insn_idx, callee->callsite); return -EFAULT; @@ -17298,7 +17296,7 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; - subprog = find_containing_subprog(env, off); + subprog = bpf_find_containing_subprog(env, off); subprog->changes_pkt_data = true; } @@ -17306,7 +17304,7 @@ static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; - subprog = find_containing_subprog(env, off); + subprog = bpf_find_containing_subprog(env, off); subprog->might_sleep = true; } @@ -17320,8 +17318,8 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) { struct bpf_subprog_info *caller, *callee; - caller = find_containing_subprog(env, t); - callee = find_containing_subprog(env, w); + caller = bpf_find_containing_subprog(env, t); + callee = bpf_find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; caller->might_sleep |= callee->might_sleep; } @@ -17391,7 +17389,7 @@ static void mark_calls_callback(struct bpf_verifier_env *env, int idx) env->insn_aux_data[idx].calls_callback = true; } -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) +bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx) { return env->insn_aux_data[insn_idx].calls_callback; } @@ -19439,7 +19437,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) goto hit; } } - if (calls_callback(env, insn_idx)) { + if (bpf_calls_callback(env, insn_idx)) { if (states_equal(env, 
&sl->state, cur, RANGE_WITHIN)) goto hit; goto skip_inf_loop_check; @@ -24171,7 +24169,7 @@ static bool can_jump(struct bpf_insn *insn) return false; } -static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) { struct bpf_insn *insn = &prog->insnsi[idx]; int i = 0, insn_sz; @@ -24387,7 +24385,7 @@ static int compute_live_registers(struct bpf_verifier_env *env) u16 new_out = 0; u16 new_in = 0; - succ_num = insn_successors(env->prog, insn_idx, succ); + succ_num = bpf_insn_successors(env->prog, insn_idx, succ); for (int s = 0; s < succ_num; ++s) new_out |= state[succ[s]].in; new_in = (new_out & ~live->def) | live->use; @@ -24556,7 +24554,7 @@ static int compute_scc(struct bpf_verifier_env *env) stack[stack_sz++] = w; } /* Visit 'w' successors */ - succ_cnt = insn_successors(env->prog, w, succ); + succ_cnt = bpf_insn_successors(env->prog, w, succ); for (j = 0; j < succ_cnt; ++j) { if (pre[succ[j]]) { low[w] = min(low[w], low[succ[j]]); From efcda22aa541bbda827e54302baf9ae4fd44cdf2 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:38 -0700 Subject: [PATCH 2337/3206] bpf: compute instructions postorder per subprogram The next patch would require doing postorder traversal of individual subprograms. Facilitate this by moving env->cfg.insn_postorder computation from check_cfg() to a separate pass, as check_cfg() descends into called subprograms (and it needs to, because of merge_callee_effects() logic). env->cfg.insn_postorder is used only by compute_live_registers(); this function does not track cross-subprogram dependencies, thus the change does not affect its operation. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-5-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +++- kernel/bpf/verifier.c | 68 +++++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 93563564bde594..bd87e80f942316 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -665,6 +665,7 @@ struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ + u32 postorder_start; /* The idx to the env->cfg.insn_postorder */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. fastcall_stack_off) @@ -794,7 +795,10 @@ struct bpf_verifier_env { struct { int *insn_state; int *insn_stack; - /* vector of instruction indexes sorted in post-order */ + /* + * vector of instruction indexes sorted in post-order, grouped by subprogram, + * see bpf_subprog_info->postorder_start.
+ */ int *insn_postorder; int cur_stack; /* current position in the insn_postorder vector */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 921a5fa06df7a3..dc8d26dc9bf192 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17863,7 +17863,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state, *insn_postorder; + int *insn_stack, *insn_state; int ex_insn_beg, i, ret = 0; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); @@ -17876,14 +17876,6 @@ static int check_cfg(struct bpf_verifier_env *env) return -ENOMEM; } - insn_postorder = env->cfg.insn_postorder = - kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); - if (!insn_postorder) { - kvfree(insn_state); - kvfree(insn_stack); - return -ENOMEM; - } - ex_insn_beg = env->exception_callback_subprog ? env->subprog_info[env->exception_callback_subprog].start : 0; @@ -17901,7 +17893,6 @@ static int check_cfg(struct bpf_verifier_env *env) case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; - insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; @@ -17955,6 +17946,56 @@ static int check_cfg(struct bpf_verifier_env *env) return ret; } +/* + * For each subprogram 'i' fill array env->cfg.insn_postorder sub-range + * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start) + * with indices of 'i' instructions in postorder. + */ +static int compute_postorder(struct bpf_verifier_env *env) +{ + u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2]; + int *stack = NULL, *postorder = NULL, *state = NULL; + + postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + stack = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + if (!postorder || !state || !stack) { + kvfree(postorder); + kvfree(state); + kvfree(stack); + return -ENOMEM; + } + cur_postorder = 0; + for (i = 0; i < env->subprog_cnt; i++) { + env->subprog_info[i].postorder_start = cur_postorder; + stack[0] = env->subprog_info[i].start; + stack_sz = 1; + do { + top = stack[stack_sz - 1]; + state[top] |= DISCOVERED; + if (state[top] & EXPLORED) { + postorder[cur_postorder++] = top; + stack_sz--; + continue; + } + succ_cnt = bpf_insn_successors(env->prog, top, succ); + for (s = 0; s < succ_cnt; ++s) { + if (!state[succ[s]]) { + stack[stack_sz++] = succ[s]; + state[succ[s]] |= DISCOVERED; + } + } + state[top] |= EXPLORED; + } while (stack_sz); + } + env->subprog_info[i].postorder_start = cur_postorder; + env->cfg.insn_postorder = postorder; + env->cfg.cur_postorder = cur_postorder; + kvfree(stack); + kvfree(state); + return 0; +} + static int check_abnormal_return(struct bpf_verifier_env *env) { int i; @@ -24422,9 +24463,6 @@ static int compute_live_registers(struct bpf_verifier_env *env) out: kvfree(state); - kvfree(env->cfg.insn_postorder); - env->cfg.insn_postorder = NULL; - env->cfg.cur_postorder = 0; return err; } @@ -24727,6 +24765,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = compute_postorder(env); + if (ret < 0) + goto skip_full_check; + ret = check_attach_btf_id(env); if (ret) goto skip_full_check; From b3698c356ad92bcdb9920655bc9df02a2a8946f9 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:39 -0700 Subject: [PATCH 2338/3206]
bpf: callchain sensitive stack liveness tracking using CFG This commit adds a flow-sensitive, context-sensitive, path-insensitive data flow analysis for live stack slots: - flow-sensitive: uses program control flow graph to compute data flow values; - context-sensitive: collects data flow values for each possible call chain in a program; - path-insensitive: does not distinguish between separate control flow graph paths reaching the same instruction. Compared to the current path-sensitive analysis, this approach trades some precision for not having to enumerate every path in the program. This gives a theoretical capability to run the analysis before main verification pass. See cover letter for motivation. The basic idea is as follows: - Data flow values indicate stack slots that might be read and stack slots that are definitely written. - Data flow values are collected for each (call chain, instruction number) combination in the program. - Within a subprogram, data flow values are propagated using control flow graph. - Data flow values are transferred from entry instructions of callee subprograms to call sites in caller subprograms. In other words, a tree of all possible call chains is constructed. Each node of this tree represents a subprogram. Read and write marks are collected for each instruction of each node. Live stack slots are first computed for lower level nodes. Then, information about outer stack slots that might be read or are definitely written by a subprogram is propagated one level up, to the corresponding call instructions of the upper nodes. Procedure repeats until root node is processed. In the absence of value range analysis, stack read/write marks are collected during main verification pass, and data flow computation is triggered each time verifier.c:states_equal() needs to query the information. Implementation details are documented in kernel/bpf/liveness.c. Quantitative data about verification performance changes and memory consumption is in the cover letter. 
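To make the data flow equations concrete, here is a small self-contained sketch (plain userspace C with toy data, not kernel code) that runs the live_before recurrence backwards over a straight-line, single-frame program of three instructions: insn 0 writes slot 0, insn 1 reads slot 1, insn 2 reads slot 0. Slot 0 drops out of live_before at insn 0 because the instruction itself overwrites it:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t may_read[3]    = { 0, 1ull << 1, 1ull << 0 };
		uint64_t must_write[3]  = { 1ull << 0, 0, 0 };
		uint64_t live_before[3] = { 0, 0, 0 };

		/* one backward pass reaches the fixed point for straight-line code */
		for (int i = 2; i >= 0; i--) {
			uint64_t live_after = (i < 2) ? live_before[i + 1] : 0;

			live_before[i] = (live_after & ~must_write[i]) | may_read[i];
		}
		for (int i = 0; i < 3; i++)	/* prints 0x2, 0x3, 0x1 */
			printf("insn %d live_before=%#llx\n", i,
			       (unsigned long long)live_before[i]);
		return 0;
	}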
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-6-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 14 + kernel/bpf/Makefile | 2 +- kernel/bpf/liveness.c | 677 +++++++++++++++++++++++++++++++++++ 3 files changed, 692 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/liveness.c diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index bd87e80f942316..2e3bdd50e2ba46 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -745,6 +745,8 @@ struct bpf_scc_info { struct bpf_scc_visit visits[]; }; +struct bpf_liveness; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -846,6 +848,7 @@ struct bpf_verifier_env { struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; struct bpf_scc_callchain callchain_buf; + struct bpf_liveness *liveness; /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; @@ -1074,4 +1077,15 @@ int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); +int bpf_stack_liveness_init(struct bpf_verifier_env *env); +void bpf_stack_liveness_free(struct bpf_verifier_env *env); +int bpf_update_live_stack(struct bpf_verifier_env *env); +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frameno, u32 insn_idx, u64 mask); +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frameno, u64 mask); +int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx); +int bpf_commit_stack_write_marks(struct bpf_verifier_env *env); +int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st); +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi); +void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f6cf8c2af5f7b3..7fd0badfacb12f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c new file mode 100644 index 00000000000000..6f9dfaaf6e64fa --- /dev/null +++ b/kernel/bpf/liveness.c @@ -0,0 +1,677 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include + +/* + * This file implements live stack slots analysis. After accumulating + * stack usage data, the analysis answers queries about whether a + * particular stack slot may be read by an instruction or any of its + * successors.
This data is consumed by the verifier states caching + * mechanism to decide which stack slots are important when looking for a + * visited state corresponding to the current state. + * + * The analysis is call chain sensitive, meaning that data is collected + * and queried for tuples (call chain, subprogram instruction index). + * Such sensitivity allows identifying if some subprogram call always + * leads to writes in the caller's stack. + * + * The basic idea is as follows: + * - As the verifier accumulates a set of visited states, the analysis instance + * accumulates a conservative estimate of stack slots that can be read + * or must be written for each visited tuple (call chain, instruction index). + * - If several states happen to visit the same instruction with the same + * call chain, stack usage information for the corresponding tuple is joined: + * - "may_read" set represents a union of all possibly read slots + * (any slot in "may_read" set might be read at or after the instruction); + * - "must_write" set represents an intersection of all possibly written slots + * (any slot in "must_write" set is guaranteed to be written by the instruction). + * - The analysis is split into two phases: + * - read and write marks accumulation; + * - read and write marks propagation. + * - The propagation phase is a textbook live variable data flow analysis: + * + * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)] + * state[cc, i].live_before = + * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read + * + * Where: + * - `U` stands for set union + * - `/` stands for set difference; + * - `cc` stands for a call chain; + * - `i` and `s` are instruction indexes; + * + * The above equations are computed for each call chain and instruction + * index until state stops changing. + * - Additionally, in order to transfer "must_write" information from a + * subprogram to call instructions invoking this subprogram, + * the "must_write_acc" set is tracked for each (cc, i) tuple. + * A set of stack slots that are guaranteed to be written by this + * instruction or any of its successors (within the subprogram). + * The equation for "must_write_acc" propagation looks as follows: + * + * state[cc, i].must_write_acc = + * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)] + * U state[cc, i].must_write + * + * (An intersection of all "must_write_acc" for instruction successors + * plus all "must_write" slots for the instruction itself). + * - After the propagation phase completes for a subprogram, information from + * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain: + * - "must_write_acc" set is intersected with the call site's "must_write" set; + * - "may_read" set is added to the call site's "may_read" set. + * - Any live stack queries must be taken after the propagation phase. + * - Accumulation and propagation phases can be entered multiple times, + * at any point in time: + * - "may_read" set only grows; + * - "must_write" set only shrinks; + * - for each visited verifier state with zero branches, all relevant + * read and write marks are already recorded by the analysis instance. + * + * Technically, the analysis is facilitated by the following data structures: + * - Call chain: for given verifier state, the call chain is a tuple of call + * instruction indexes leading to the current subprogram plus the subprogram + * entry point index. 
+ * - Function instance: for a given call chain, for each instruction in + * the current subprogram, a mapping between instruction index and a + * set of "may_read", "must_write" and other marks accumulated for this + * instruction. + * - A hash table mapping call chains to function instances. + */ + +struct callchain { + u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */ + /* cached subprog_info[*].start for functions owning the frames: + * - sp_starts[curframe] used to get insn relative index within current function; + * - sp_starts[0..current-1] used for fast callchain_frame_up(). + */ + u32 sp_starts[MAX_CALL_FRAMES]; + u32 curframe; /* depth of callsites and sp_starts arrays */ +}; + +struct per_frame_masks { + u64 may_read; /* stack slots that may be read by this instruction */ + u64 must_write; /* stack slots written by this instruction */ + u64 must_write_acc; /* stack slots written by this instruction and its successors */ + u64 live_before; /* stack slots that may be read by this insn and its successors */ +}; + +/* + * A function instance created for a specific callchain. + * Encapsulates read and write marks for each instruction in the function. + * Marks are tracked for each frame in the callchain. + */ +struct func_instance { + struct hlist_node hl_node; + struct callchain callchain; + u32 insn_cnt; /* cached number of insns in the function */ + bool updated; + bool must_write_dropped; + /* Per frame, per instruction masks, frames allocated lazily. */ + struct per_frame_masks *frames[MAX_CALL_FRAMES]; + /* For each instruction a flag telling if "must_write" had been initialized for it. */ + bool *must_write_set; +}; + +struct live_stack_query { + struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */ + u32 curframe; + u32 insn_idx; +}; + +struct bpf_liveness { + DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */ + struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */ + /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */ + struct func_instance *cur_instance; + /* + * Below fields are used to accumulate stack write marks for instruction at + * @write_insn_idx before submitting the marks to @cur_instance. + */ + u64 write_masks_acc[MAX_CALL_FRAMES]; + u32 write_insn_idx; +}; + +/* Compute callchain corresponding to state @st at depth @frameno */ +static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st, + struct callchain *callchain, u32 frameno) +{ + struct bpf_subprog_info *subprog_info = env->subprog_info; + u32 i; + + memset(callchain, 0, sizeof(*callchain)); + for (i = 0; i <= frameno; i++) { + callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start; + if (i < st->curframe) + callchain->callsites[i] = st->frame[i + 1]->callsite; + } + callchain->curframe = frameno; + callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe]; +} + +static u32 hash_callchain(struct callchain *callchain) +{ + return jhash2(callchain->callsites, callchain->curframe, 0); +} + +static bool same_callsites(struct callchain *a, struct callchain *b) +{ + int i; + + if (a->curframe != b->curframe) + return false; + for (i = a->curframe; i >= 0; i--) + if (a->callsites[i] != b->callsites[i]) + return false; + return true; +} + +/* + * Find existing or allocate new function instance corresponding to @callchain. 
+ * Instances are accumulated in env->liveness->func_instances and persist + * until the end of the verification process. + */ +static struct func_instance *__lookup_instance(struct bpf_verifier_env *env, + struct callchain *callchain) +{ + struct bpf_liveness *liveness = env->liveness; + struct bpf_subprog_info *subprog; + struct func_instance *result; + u32 subprog_sz, size, key; + + key = hash_callchain(callchain); + hash_for_each_possible(liveness->func_instances, result, hl_node, key) + if (same_callsites(&result->callchain, callchain)) + return result; + + subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]); + subprog_sz = (subprog + 1)->start - subprog->start; + size = sizeof(struct func_instance); + result = kvzalloc(size, GFP_KERNEL_ACCOUNT); + if (!result) + return ERR_PTR(-ENOMEM); + result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set), + GFP_KERNEL_ACCOUNT); + if (!result->must_write_set) + return ERR_PTR(-ENOMEM); + memcpy(&result->callchain, callchain, sizeof(*callchain)); + result->insn_cnt = subprog_sz; + hash_add(liveness->func_instances, &result->hl_node, key); + return result; +} + +static struct func_instance *lookup_instance(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + u32 frameno) +{ + struct callchain callchain; + + compute_callchain(env, st, &callchain, frameno); + return __lookup_instance(env, &callchain); +} + +int bpf_stack_liveness_init(struct bpf_verifier_env *env) +{ + env->liveness = kvzalloc(sizeof(*env->liveness), GFP_KERNEL_ACCOUNT); + if (!env->liveness) + return -ENOMEM; + hash_init(env->liveness->func_instances); + return 0; +} + +void bpf_stack_liveness_free(struct bpf_verifier_env *env) +{ + struct func_instance *instance; + struct hlist_node *tmp; + int bkt, i; + + if (!env->liveness) + return; + hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) { + for (i = 0; i <= instance->callchain.curframe; i++) + kvfree(instance->frames[i]); + kvfree(instance->must_write_set); + kvfree(instance); + } + kvfree(env->liveness); +} + +/* + * Convert absolute instruction index @insn_idx to an index relative + * to start of the function corresponding to @instance. + */ +static int relative_idx(struct func_instance *instance, u32 insn_idx) +{ + return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe]; +} + +static struct per_frame_masks *get_frame_masks(struct func_instance *instance, + u32 frame, u32 insn_idx) +{ + if (!instance->frames[frame]) + return NULL; + + return &instance->frames[frame][relative_idx(instance, insn_idx)]; +} + +static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env, + struct func_instance *instance, + u32 frame, u32 insn_idx) +{ + struct per_frame_masks *arr; + + if (!instance->frames[frame]) { + arr = kvcalloc(instance->insn_cnt, sizeof(*arr), GFP_KERNEL_ACCOUNT); + instance->frames[frame] = arr; + if (!arr) + return ERR_PTR(-ENOMEM); + } + return get_frame_masks(instance, frame, insn_idx); +} + +void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env) +{ + env->liveness->cur_instance = NULL; +} + +/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. 
*/ +static int ensure_cur_instance(struct bpf_verifier_env *env) +{ + struct bpf_liveness *liveness = env->liveness; + struct func_instance *instance; + + if (liveness->cur_instance) + return 0; + + instance = lookup_instance(env, env->cur_state, env->cur_state->curframe); + if (IS_ERR(instance)) + return PTR_ERR(instance); + + liveness->cur_instance = instance; + return 0; +} + +/* Accumulate may_read masks for @frame at @insn_idx */ +static int mark_stack_read(struct bpf_verifier_env *env, + struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask) +{ + struct per_frame_masks *masks; + u64 new_may_read; + + masks = alloc_frame_masks(env, instance, frame, insn_idx); + if (IS_ERR(masks)) + return PTR_ERR(masks); + new_may_read = masks->may_read | mask; + if (new_may_read != masks->may_read && + ((new_may_read | masks->live_before) != masks->live_before)) + instance->updated = true; + masks->may_read |= mask; + return 0; +} + +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask) +{ + int err; + + err = ensure_cur_instance(env); + err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask); + return err; +} + +static void reset_stack_write_marks(struct bpf_verifier_env *env, + struct func_instance *instance, u32 insn_idx) +{ + struct bpf_liveness *liveness = env->liveness; + int i; + + liveness->write_insn_idx = insn_idx; + for (i = 0; i <= instance->callchain.curframe; i++) + liveness->write_masks_acc[i] = 0; +} + +int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx) +{ + struct bpf_liveness *liveness = env->liveness; + int err; + + err = ensure_cur_instance(env); + if (err) + return err; + + reset_stack_write_marks(env, liveness->cur_instance, insn_idx); + return 0; +} + +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask) +{ + env->liveness->write_masks_acc[frame] |= mask; +} + +static int commit_stack_write_marks(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct bpf_liveness *liveness = env->liveness; + u32 idx, frame, curframe; + struct per_frame_masks *masks; + u64 mask, old_must_write; + + if (!instance) + return 0; + + curframe = instance->callchain.curframe; + idx = relative_idx(instance, liveness->write_insn_idx); + for (frame = 0; frame <= curframe; frame++) { + mask = liveness->write_masks_acc[frame]; + /* avoid allocating frames for zero masks */ + if (mask == 0 && !instance->must_write_set[idx]) + continue; + masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx); + if (IS_ERR(masks)) + return PTR_ERR(masks); + old_must_write = masks->must_write; + /* + * If instruction at this callchain is seen for the first time, set must_write equal + * to @mask. Otherwise take intersection with the previous value. + */ + if (instance->must_write_set[idx]) + mask &= old_must_write; + if (old_must_write != mask) { + masks->must_write = mask; + instance->updated = true; + } + if (old_must_write & ~mask) + instance->must_write_dropped = true; + } + instance->must_write_set[idx] = true; + liveness->write_insn_idx = 0; + return 0; +} + +/* + * Merge stack write marks in @env->liveness->write_masks_acc + * with information already in @env->liveness->cur_instance.
+ */ +int bpf_commit_stack_write_marks(struct bpf_verifier_env *env) +{ + return commit_stack_write_marks(env, env->liveness->cur_instance); +} + +static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain) +{ + char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf); + char *buf = env->tmp_str_buf; + int i; + + buf += snprintf(buf, buf_end - buf, "("); + for (i = 0; i <= callchain->curframe; i++) + buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]); + snprintf(buf, buf_end - buf, ")"); + return env->tmp_str_buf; +} + +static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain, + char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new) +{ + u64 changed_bits = old ^ new; + u64 new_ones = new & changed_bits; + u64 new_zeros = ~new & changed_bits; + + if (!changed_bits) + return; + bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx); + if (new_ones) { + bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones); + bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf); + } + if (new_zeros) { + bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros); + bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf); + } + bpf_log(&env->log, "\n"); +} + +static struct func_instance *get_outer_instance(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct callchain callchain = instance->callchain; + + /* Adjust @callchain to represent the callchain one frame up */ + callchain.callsites[callchain.curframe] = 0; + callchain.sp_starts[callchain.curframe] = 0; + callchain.curframe--; + callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe]; + return __lookup_instance(env, &callchain); +} + +static u32 callchain_subprog_start(struct callchain *callchain) +{ + return callchain->sp_starts[callchain->curframe]; +} + +/* + * Transfer @may_read and @must_write_acc marks from the first instruction of @instance + * to the call instruction in the function instance calling @instance. 
+ */ +static int propagate_to_outer_instance(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct callchain *callchain = &instance->callchain; + u32 this_subprog_start, callsite, frame; + struct func_instance *outer_instance; + struct per_frame_masks *insn; + int err; + + this_subprog_start = callchain_subprog_start(callchain); + outer_instance = get_outer_instance(env, instance); + callsite = callchain->callsites[callchain->curframe - 1]; + + reset_stack_write_marks(env, outer_instance, callsite); + for (frame = 0; frame < callchain->curframe; frame++) { + insn = get_frame_masks(instance, frame, this_subprog_start); + if (!insn) + continue; + bpf_mark_stack_write(env, frame, insn->must_write_acc); + err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before); + if (err) + return err; + } + commit_stack_write_marks(env, outer_instance); + return 0; +} + +static inline bool update_insn(struct bpf_verifier_env *env, + struct func_instance *instance, u32 frame, u32 insn_idx) +{ + struct bpf_insn_aux_data *aux = env->insn_aux_data; + u64 new_before, new_after, must_write_acc; + struct per_frame_masks *insn, *succ_insn; + u32 succ_num, s, succ[2]; + bool changed; + + succ_num = bpf_insn_successors(env->prog, insn_idx, succ); + if (unlikely(succ_num == 0)) + return false; + + changed = false; + insn = get_frame_masks(instance, frame, insn_idx); + new_before = 0; + new_after = 0; + /* + * New "must_write_acc" is an intersection of all "must_write_acc" + * of successors plus all "must_write" slots of the instruction itself. + */ + must_write_acc = U64_MAX; + for (s = 0; s < succ_num; ++s) { + succ_insn = get_frame_masks(instance, frame, succ[s]); + new_after |= succ_insn->live_before; + must_write_acc &= succ_insn->must_write_acc; + } + must_write_acc |= insn->must_write; + /* + * New "live_before" is a union of all "live_before" of successors + * minus slots written by the instruction plus slots read by the instruction. + */ + new_before = (new_after & ~insn->must_write) | insn->may_read; + changed |= new_before != insn->live_before; + changed |= must_write_acc != insn->must_write_acc; + if (unlikely(env->log.level & BPF_LOG_LEVEL2) && + (insn->may_read || insn->must_write || + insn_idx == callchain_subprog_start(&instance->callchain) || + aux[insn_idx].prune_point)) { + log_mask_change(env, &instance->callchain, "live", + frame, insn_idx, insn->live_before, new_before); + log_mask_change(env, &instance->callchain, "written", + frame, insn_idx, insn->must_write_acc, must_write_acc); + } + insn->live_before = new_before; + insn->must_write_acc = must_write_acc; + return changed; +} + +/* Fixed-point computation of @live_before and @must_write_acc marks */ +static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance) +{ + u32 i, frame, po_start, po_end, cnt, this_subprog_start; + struct callchain *callchain = &instance->callchain; + int *insn_postorder = env->cfg.insn_postorder; + struct bpf_subprog_info *subprog; + struct per_frame_masks *insn; + bool changed; + int err; + + this_subprog_start = callchain_subprog_start(callchain); + /* + * If must_write marks were updated, must_write_acc needs to be reset + * (to account for the case when new must_write sets became smaller). 
 + */ + if (instance->must_write_dropped) { + for (frame = 0; frame <= callchain->curframe; frame++) { + if (!instance->frames[frame]) + continue; + + for (i = 0; i < instance->insn_cnt; i++) { + insn = get_frame_masks(instance, frame, this_subprog_start + i); + insn->must_write_acc = 0; + } + } + } + + subprog = bpf_find_containing_subprog(env, this_subprog_start); + po_start = subprog->postorder_start; + po_end = (subprog + 1)->postorder_start; + cnt = 0; + /* repeat until fixed point is reached */ + do { + cnt++; + changed = false; + for (frame = 0; frame <= instance->callchain.curframe; frame++) { + if (!instance->frames[frame]) + continue; + + for (i = po_start; i < po_end; i++) + changed |= update_insn(env, instance, frame, insn_postorder[i]); + } + } while (changed); + + if (env->log.level & BPF_LOG_LEVEL2) + bpf_log(&env->log, "%s live stack update done in %d iterations\n", + fmt_callchain(env, callchain), cnt); + + /* transfer marks accumulated for outer frames to outer func instance (caller) */ + if (callchain->curframe > 0) { + err = propagate_to_outer_instance(env, instance); + if (err) + return err; + } + + return 0; +} + +/* + * Prepare all callchains within @env->cur_state for querying. + * This function should be called after each verifier.c:pop_stack() + * and whenever verifier.c:do_check_insn() processes subprogram exit. + * This would guarantee that visited verifier states with zero branches + * have their bpf_mark_stack_{read,write}() effects propagated in + * @env->liveness. + */ +int bpf_update_live_stack(struct bpf_verifier_env *env) +{ + struct func_instance *instance; + int err, frame; + + bpf_reset_live_stack_callchain(env); + for (frame = env->cur_state->curframe; frame >= 0; --frame) { + instance = lookup_instance(env, env->cur_state, frame); + if (IS_ERR(instance)) + return PTR_ERR(instance); + + if (instance->updated) { + err = update_instance(env, instance); + if (err) + return err; + instance->updated = false; + instance->must_write_dropped = false; + } + } + return 0; +} + +static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi) +{ + struct per_frame_masks *masks; + + masks = get_frame_masks(instance, frameno, insn_idx); + return masks && (masks->live_before & BIT(spi)); +} + +int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct live_stack_query *q = &env->liveness->live_stack_query; + struct func_instance *instance; + u32 frame; + + memset(q, 0, sizeof(*q)); + for (frame = 0; frame <= st->curframe; frame++) { + instance = lookup_instance(env, st, frame); + if (IS_ERR(instance)) + return PTR_ERR(instance); + q->instances[frame] = instance; + } + q->curframe = st->curframe; + q->insn_idx = st->insn_idx; + return 0; +} + +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi) +{ + /* + * A slot is alive if it is read before q->insn_idx in the current func instance, + * or if for some outer func instance it is: + * - alive before the callsite, if the callsite calls a callback, otherwise + * - alive after the callsite + */ + struct live_stack_query *q = &env->liveness->live_stack_query; + struct func_instance *instance, *curframe_instance; + u32 i, callsite; + bool alive; + + curframe_instance = q->instances[q->curframe]; + if (is_live_before(curframe_instance, q->insn_idx, frameno, spi)) + return true; + + for (i = frameno; i < q->curframe; i++) { + callsite = curframe_instance->callchain.callsites[i]; + instance = q->instances[i]; + alive = bpf_calls_callback(env, 
callsite) + ? is_live_before(instance, callsite, frameno, spi) + : is_live_before(instance, callsite + 1, frameno, spi); + if (alive) + return true; + } + + return false; +} From e41c237953b36cdd025b82996a74bfe39c509d20 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:40 -0700 Subject: [PATCH 2339/3206] bpf: enable callchain sensitive stack liveness tracking Allocate analysis instance: - Add bpf_stack_liveness_{init,free}() calls to bpf_check(). Notify the instance about any stack reads and writes: - Add bpf_mark_stack_write() call at every location where REG_LIVE_WRITTEN is recorded for a stack slot. - Add bpf_mark_stack_read() call at every location mark_reg_read() is called. - Both bpf_mark_stack_{read,write}() rely on env->liveness->cur_instance callchain being in sync with env->cur_state. It is possible to update env->liveness->cur_instance every time bpf_mark_stack_{read,write}() is called, but that costs a hash table lookup and is noticeable in the performance profile. Hence, manually reset env->liveness->cur_instance whenever the verifier changes env->cur_state call stack: - call bpf_reset_live_stack_callchain() when the verifier enters a subprogram; - call bpf_update_live_stack() when the verifier exits a subprogram (it implies the reset). Make sure bpf_update_live_stack() is called for a callchain before issuing liveness queries. And make sure that bpf_update_live_stack() is called for any callee callchain first: - Add bpf_update_live_stack() call at every location that processes BPF_EXIT: - exit from a subprogram; - before pop_stack() call. This makes sure that bpf_update_live_stack() is called for callee callchains before caller callchains. Make sure must_write marks are set to zero for instructions that do not always access the stack: - Wrap do_check_insn() with bpf_reset_stack_write_marks() / bpf_commit_stack_write_marks() calls. Any calls to bpf_mark_stack_write() are accumulated between this pair of calls. If no bpf_mark_stack_write() calls were made, it means that the instruction does not access the stack (at least on the current verification path), and it is important to record this fact. Finally, use bpf_live_stack_query_init() / bpf_stack_slot_alive() to query stack liveness info. The manual tracking of the correct order for callee/caller bpf_update_live_stack() calls is a bit convoluted and may warrant some automation in future revisions. 
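As a rough model of the fixed point computed by update_instance()/update_insn() above, consider the following minimal user-space sketch. It is not kernel code: the toy_insn encoding, the two-successor limit, and all toy_* names are assumptions made for illustration, and the real pass additionally tracks must_write_acc per frame and walks instructions in postorder. The per-instruction rule is live_before = (live_after & ~must_write) | may_read, iterated until nothing changes:

/* Hypothetical user-space sketch of the backward liveness fixed point. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_insn {
	uint64_t may_read;	/* stack slots read by this insn */
	uint64_t must_write;	/* slots this insn always writes */
	int succ[2];		/* successor indices, -1 if unused */
	uint64_t live_before;	/* computed fixed-point result */
};

static void toy_liveness(struct toy_insn *prog, int cnt)
{
	uint64_t live_after, new_before;
	bool changed;
	int i, s;

	do {
		changed = false;
		for (i = cnt - 1; i >= 0; i--) {
			live_after = 0;
			for (s = 0; s < 2; s++)
				if (prog[i].succ[s] >= 0)
					live_after |= prog[prog[i].succ[s]].live_before;
			/* a slot overwritten here does not need its old value */
			new_before = (live_after & ~prog[i].must_write) | prog[i].may_read;
			if (new_before != prog[i].live_before) {
				prog[i].live_before = new_before;
				changed = true;
			}
		}
	} while (changed);
}

int main(void)
{
	/* 0: fp[-8] = 1;  1: if r1 goto 3;  2: fp[-16] = 2;  3: r0 = fp[-8] */
	struct toy_insn prog[] = {
		{ .must_write = 1ull << 0, .succ = { 1, -1 } },
		{ .succ = { 2, 3 } },
		{ .must_write = 1ull << 1, .succ = { 3, -1 } },
		{ .may_read = 1ull << 0, .succ = { -1, -1 } },
	};
	int i;

	toy_liveness(prog, 4);
	for (i = 0; i < 4; i++)
		printf("insn %d: live_before=%#llx\n", i,
		       (unsigned long long)prog[i].live_before);
	return 0;
}

For this four-instruction program, slot fp-8 (bit 0) comes out live before insns 1-3 but dead before insn 0, which overwrites it; that is exactly the distinction bpf_stack_slot_alive() exposes to state pruning.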
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-7-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 61 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dc8d26dc9bf192..bb931a144b9532 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -789,6 +789,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; } @@ -828,6 +829,7 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat */ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); } static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -939,6 +941,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, /* Same reason as unmark_stack_slots_dynptr above */ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; } @@ -1066,6 +1069,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_ITER; + bpf_mark_stack_write(env, state->frameno, BIT(spi - i)); mark_stack_slot_scratched(env, spi - i); } @@ -1097,6 +1101,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_INVALID; + bpf_mark_stack_write(env, state->frameno, BIT(spi - i)); mark_stack_slot_scratched(env, spi - i); } @@ -1186,6 +1191,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, slot = &state->stack[spi]; st = &slot->spilled_ptr; + bpf_mark_stack_write(env, reg->frameno, BIT(spi)); __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ st->live |= REG_LIVE_WRITTEN; @@ -1244,6 +1250,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ st->live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, reg->frameno, BIT(spi)); for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_INVALID; @@ -3634,6 +3641,9 @@ static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg if (err) return err; + err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i)); + if (err) + return err; mark_stack_slot_scratched(env, spi - i); } return 0; @@ -5166,6 +5176,18 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, if (err) return err; + if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) { + /* only mark the slot as written if all 8 bytes were written + * otherwise read propagation may incorrectly stop too soon + * when stack slots are partially written. 
+ * This heuristic means that read propagation will be + * conservative, since it will add reg_live_read marks + * to stack slots all the way to first state when programs + * writes+reads less than 8 bytes + */ + bpf_mark_stack_write(env, state->frameno, BIT(spi)); + } + check_fastcall_stack_contract(env, state, insn_idx, off); mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { @@ -5435,12 +5457,16 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg; u8 *stype, type; int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); + int err; stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; mark_stack_slot_scratched(env, spi); check_fastcall_stack_contract(env, state, env->insn_idx, off); + err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi)); + if (err) + return err; if (is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; @@ -8174,6 +8200,9 @@ static int check_stack_range_initialized( mark_reg_read(env, &state->stack[spi].spilled_ptr, state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); + err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi)); + if (err) + return err; /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not * be sure that whether stack slot is written to or not. Hence, * we must still conservatively propagate reads upwards even if @@ -10735,6 +10764,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = env->subprog_info[subprog].start - 1; + bpf_reset_live_stack_callchain(env); + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, state, caller->frameno, true); @@ -18532,7 +18563,6 @@ static void clean_func_state(struct bpf_verifier_env *env, u32 ip) { u16 live_regs = env->insn_aux_data[ip].live_regs_before; - enum bpf_reg_liveness live; int i, j; for (i = 0; i < BPF_REG_FP; i++) { @@ -18545,9 +18575,7 @@ static void clean_func_state(struct bpf_verifier_env *env, } for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { - live = st->stack[i].spilled_ptr.live; - /* liveness must not touch this stack slot anymore */ - if (!(live & REG_LIVE_READ)) { + if (!bpf_stack_slot_alive(env, st->frameno, i)) { __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; @@ -18560,6 +18588,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env, { int i, ip; + bpf_live_stack_query_init(env, st); st->cleaned = true; for (i = 0; i <= st->curframe; i++) { ip = frame_insn_idx(st, i); @@ -18645,9 +18674,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (exact == EXACT) return regs_exact(rold, rcur, idmap); - if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT) - /* explored state didn't use this */ - return true; if (rold->type == NOT_INIT) { if (exact == NOT_EXACT || rcur->type == NOT_INIT) /* explored state can't have used this */ @@ -19886,6 +19912,9 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, return PROCESS_BPF_EXIT; if (env->cur_state->curframe) { + err = bpf_update_live_stack(env); + if (err) + return err; /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); if (err) @@ -20071,7 +20100,7 @@ static int do_check(struct bpf_verifier_env *env) for (;;) { struct bpf_insn *insn; struct 
bpf_insn_aux_data *insn_aux; - int err; + int err, marks_err; /* reset current history entry on each new instruction */ env->cur_hist_ent = NULL; @@ -20164,7 +20193,15 @@ static int do_check(struct bpf_verifier_env *env) if (state->speculative && insn_aux->nospec) goto process_bpf_exit; + err = bpf_reset_stack_write_marks(env, env->insn_idx); + if (err) + return err; err = do_check_insn(env, &do_print_state); + if (err >= 0 || error_recoverable_with_nospec(err)) { + marks_err = bpf_commit_stack_write_marks(env); + if (marks_err) + return marks_err; + } if (error_recoverable_with_nospec(err) && state->speculative) { /* Prevent this speculative path from ever reaching the * insn that would have been unsafe to execute. @@ -20203,6 +20240,9 @@ static int do_check(struct bpf_verifier_env *env) process_bpf_exit: mark_verifier_state_scratched(env); err = update_branch_counts(env, env->cur_state); + if (err) + return err; + err = bpf_update_live_stack(env); if (err) return err; err = pop_stack(env, &prev_insn_idx, &env->insn_idx, @@ -24769,6 +24809,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = bpf_stack_liveness_init(env); + if (ret) + goto skip_full_check; + ret = check_attach_btf_id(env); if (ret) goto skip_full_check; @@ -24918,6 +24962,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: + bpf_stack_liveness_free(env); kvfree(env->cfg.insn_postorder); kvfree(env->scc_info); kvfree(env); From ccf25a67c7e29cfa6815d193054789b45ef825ad Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:41 -0700 Subject: [PATCH 2340/3206] bpf: signal error if old liveness is more conservative than new Unlike the new algorithm, register chain based liveness tracking is fully path sensitive, and thus should be strictly more accurate. Validate the new algorithm by signaling an error whenever it considers a stack slot dead while the old algorithm considers it alive. 
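Stated compactly, the patch enforces a one-way implication between the two analyses: a slot the old algorithm proved live (read) must also be live under the new one, while the converse may fail because the new analysis is per-callchain rather than per-path. A minimal sketch of that invariant, with a hypothetical helper name not present in the patch:

#include <stdbool.h>

/* Hypothetical restatement of the invariant checked below:
 * a slot alive under the old algorithm must be alive under the
 * new one (old_alive => new_alive); the reverse need not hold.
 */
static bool live_marks_consistent(bool old_alive, bool new_alive)
{
	return !old_alive || new_alive;
}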
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-8-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e3bdd50e2ba46..dec5da3a2e59dc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -852,6 +852,7 @@ struct bpf_verifier_env { /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; + bool internal_error; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb931a144b9532..f70e34a38c1314 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18576,6 +18576,11 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { if (!bpf_stack_slot_alive(env, st->frameno, i)) { + if (st->stack[i].spilled_ptr.live & REG_LIVE_READ) { + verifier_bug(env, "incorrect live marks #1 for insn %d frameno %d spi %d\n", + env->insn_idx, st->frameno, i); + env->internal_error = true; + } __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; @@ -19546,6 +19551,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) loop = incomplete_read_marks(env, &sl->state); if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: + if (env->internal_error) + return -EFAULT; sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. @@ -19660,6 +19667,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 1; } miss: + if (env->internal_error) + return -EFAULT; /* when new state is not going to be added do not increase miss count. * Otherwise several loop iterations will remove the state * recorded earlier. The goal of these heuristics is to have From 107e169799057bc6a379ddb625cbe1e51cfc7d72 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:42 -0700 Subject: [PATCH 2341/3206] bpf: disable and remove register chain based liveness Remove register chain based liveness tracking: - struct bpf_reg_state->{parent,live} fields are no longer needed; - REG_LIVE_WRITTEN marks are superseded by bpf_mark_stack_write() calls; - mark_reg_read() calls are superseded by bpf_mark_stack_read(); - log.c:print_liveness() is superseded by logging in liveness.c; - propagate_liveness() is superseded by bpf_update_live_stack(); - no need to establish register chains in is_state_visited() anymore; - fix a bunch of tests expecting "_w" suffixes in verifier log messages. 
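Most of the diff below is mechanical fallout in selftests: with print_liveness() removed, verifier state dumps no longer carry the "_r"/"_w" liveness suffixes on registers and stack slots. A representative state line, reconstructed for illustration (the exact registers and slot contents are hypothetical):

  before: 1: R3_w=4 R10=fp0 fp-8_w=mmmmmmmm
  after:  1: R3=4 R10=fp0 fp-8=mmmmmmmm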
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-9-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/verifier.rst | 264 --------------- include/linux/bpf_verifier.h | 25 -- kernel/bpf/log.c | 26 +- kernel/bpf/verifier.c | 315 ++---------------- .../testing/selftests/bpf/prog_tests/align.c | 178 +++++----- .../selftests/bpf/prog_tests/spin_lock.c | 12 +- .../selftests/bpf/prog_tests/test_veristat.c | 44 +-- .../selftests/bpf/progs/exceptions_assert.c | 34 +- .../selftests/bpf/progs/iters_state_safety.c | 4 +- .../selftests/bpf/progs/iters_testmod_seq.c | 6 +- .../bpf/progs/mem_rdonly_untrusted.c | 4 +- .../selftests/bpf/progs/verifier_bounds.c | 38 +-- .../bpf/progs/verifier_global_ptr_args.c | 4 +- .../selftests/bpf/progs/verifier_ldsx.c | 2 +- .../selftests/bpf/progs/verifier_precision.c | 16 +- .../selftests/bpf/progs/verifier_scalar_ids.c | 10 +- .../selftests/bpf/progs/verifier_spill_fill.c | 40 +-- .../bpf/progs/verifier_subprog_precision.c | 6 +- .../selftests/bpf/verifier/bpf_st_mem.c | 4 +- 19 files changed, 226 insertions(+), 806 deletions(-) diff --git a/Documentation/bpf/verifier.rst b/Documentation/bpf/verifier.rst index 95e6f80a407e52..510d15bc697b86 100644 --- a/Documentation/bpf/verifier.rst +++ b/Documentation/bpf/verifier.rst @@ -347,270 +347,6 @@ However, only the value of register ``r1`` is important to successfully finish verification. The goal of the liveness tracking algorithm is to spot this fact and figure out that both states are actually equivalent. -Data structures -~~~~~~~~~~~~~~~ - -Liveness is tracked using the following data structures:: - - enum bpf_reg_liveness { - REG_LIVE_NONE = 0, - REG_LIVE_READ32 = 0x1, - REG_LIVE_READ64 = 0x2, - REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, - REG_LIVE_WRITTEN = 0x4, - REG_LIVE_DONE = 0x8, - }; - - struct bpf_reg_state { - ... - struct bpf_reg_state *parent; - ... - enum bpf_reg_liveness live; - ... - }; - - struct bpf_stack_state { - struct bpf_reg_state spilled_ptr; - ... - }; - - struct bpf_func_state { - struct bpf_reg_state regs[MAX_BPF_REG]; - ... - struct bpf_stack_state *stack; - } - - struct bpf_verifier_state { - struct bpf_func_state *frame[MAX_CALL_FRAMES]; - struct bpf_verifier_state *parent; - ... - } - -* ``REG_LIVE_NONE`` is an initial value assigned to ``->live`` fields upon new - verifier state creation; - -* ``REG_LIVE_WRITTEN`` means that the value of the register (or stack slot) is - defined by some instruction verified between this verifier state's parent and - verifier state itself; - -* ``REG_LIVE_READ{32,64}`` means that the value of the register (or stack slot) - is read by a some child state of this verifier state; - -* ``REG_LIVE_DONE`` is a marker used by ``clean_verifier_state()`` to avoid - processing same verifier state multiple times and for some sanity checks; - -* ``->live`` field values are formed by combining ``enum bpf_reg_liveness`` - values using bitwise or. - -Register parentage chains -~~~~~~~~~~~~~~~~~~~~~~~~~ - -In order to propagate information between parent and child states, a *register -parentage chain* is established. Each register or stack slot is linked to a -corresponding register or stack slot in its parent state via a ``->parent`` -pointer. This link is established upon state creation in ``is_state_visited()`` -and might be modified by ``set_callee_state()`` called from -``__check_func_call()``. 
- -The rules for correspondence between registers / stack slots are as follows: - -* For the current stack frame, registers and stack slots of the new state are - linked to the registers and stack slots of the parent state with the same - indices. - -* For the outer stack frames, only callee saved registers (r6-r9) and stack - slots are linked to the registers and stack slots of the parent state with the - same indices. - -* When function call is processed a new ``struct bpf_func_state`` instance is - allocated, it encapsulates a new set of registers and stack slots. For this - new frame, parent links for r6-r9 and stack slots are set to nil, parent links - for r1-r5 are set to match caller r1-r5 parent links. - -This could be illustrated by the following diagram (arrows stand for -``->parent`` pointers):: - - ... ; Frame #0, some instructions - --- checkpoint #0 --- - 1 : r6 = 42 ; Frame #0 - --- checkpoint #1 --- - 2 : call foo() ; Frame #0 - ... ; Frame #1, instructions from foo() - --- checkpoint #2 --- - ... ; Frame #1, instructions from foo() - --- checkpoint #3 --- - exit ; Frame #1, return from foo() - 3 : r1 = r6 ; Frame #0 <- current state - - +-------------------------------+-------------------------------+ - | Frame #0 | Frame #1 | - Checkpoint +-------------------------------+-------------------------------+ - #0 | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - ^ ^ ^ ^ - | | | | - Checkpoint +-------------------------------+ - #1 | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - ^ ^ ^ - |_______|_______|_______________ - | | | - nil nil | | | nil nil - | | | | | | | - Checkpoint +-------------------------------+-------------------------------+ - #2 | r0 | r1-r5 | r6-r9 | fp-8 ... | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+-------------------------------+ - ^ ^ ^ ^ ^ - nil nil | | | | | - | | | | | | | - Checkpoint +-------------------------------+-------------------------------+ - #3 | r0 | r1-r5 | r6-r9 | fp-8 ... | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+-------------------------------+ - ^ ^ - nil nil | | - | | | | - Current +-------------------------------+ - state | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - \ - r6 read mark is propagated via these links - all the way up to checkpoint #1. - The checkpoint #1 contains a write mark for r6 - because of instruction (1), thus read propagation - does not reach checkpoint #0 (see section below). - -Liveness marks tracking -~~~~~~~~~~~~~~~~~~~~~~~ - -For each processed instruction, the verifier tracks read and written registers -and stack slots. The main idea of the algorithm is that read marks propagate -back along the state parentage chain until they hit a write mark, which 'screens -off' earlier states from the read. The information about reads is propagated by -function ``mark_reg_read()`` which could be summarized as follows:: - - mark_reg_read(struct bpf_reg_state *state, ...): - parent = state->parent - while parent: - if state->live & REG_LIVE_WRITTEN: - break - if parent->live & REG_LIVE_READ64: - break - parent->live |= REG_LIVE_READ64 - state = parent - parent = state->parent - -Notes: - -* The read marks are applied to the **parent** state while write marks are - applied to the **current** state. The write mark on a register or stack slot - means that it is updated by some instruction in the straight-line code leading - from the parent state to the current state. 
- -* Details about REG_LIVE_READ32 are omitted. - -* Function ``propagate_liveness()`` (see section :ref:`read_marks_for_cache_hits`) - might override the first parent link. Please refer to the comments in the - ``propagate_liveness()`` and ``mark_reg_read()`` source code for further - details. - -Because stack writes could have different sizes ``REG_LIVE_WRITTEN`` marks are -applied conservatively: stack slots are marked as written only if write size -corresponds to the size of the register, e.g. see function ``save_register_state()``. - -Consider the following example:: - - 0: (*u64)(r10 - 8) = 0 ; define 8 bytes of fp-8 - --- checkpoint #0 --- - 1: (*u32)(r10 - 8) = 1 ; redefine lower 4 bytes - 2: r1 = (*u32)(r10 - 8) ; read lower 4 bytes defined at (1) - 3: r2 = (*u32)(r10 - 4) ; read upper 4 bytes defined at (0) - -As stated above, the write at (1) does not count as ``REG_LIVE_WRITTEN``. Should -it be otherwise, the algorithm above wouldn't be able to propagate the read mark -from (3) to checkpoint #0. - -Once the ``BPF_EXIT`` instruction is reached ``update_branch_counts()`` is -called to update the ``->branches`` counter for each verifier state in a chain -of parent verifier states. When the ``->branches`` counter reaches zero the -verifier state becomes a valid entry in a set of cached verifier states. - -Each entry of the verifier states cache is post-processed by a function -``clean_live_states()``. This function marks all registers and stack slots -without ``REG_LIVE_READ{32,64}`` marks as ``NOT_INIT`` or ``STACK_INVALID``. -Registers/stack slots marked in this way are ignored in function ``stacksafe()`` -called from ``states_equal()`` when a state cache entry is considered for -equivalence with a current state. - -Now it is possible to explain how the example from the beginning of the section -works:: - - 0: call bpf_get_prandom_u32() - 1: r1 = 0 - 2: if r0 == 0 goto +1 - 3: r0 = 1 - --- checkpoint[0] --- - 4: r0 = r1 - 5: exit - -* At instruction #2 branching point is reached and state ``{ r0 == 0, r1 == 0, pc == 4 }`` - is pushed to states processing queue (pc stands for program counter). - -* At instruction #4: - - * ``checkpoint[0]`` states cache entry is created: ``{ r0 == 1, r1 == 0, pc == 4 }``; - * ``checkpoint[0].r0`` is marked as written; - * ``checkpoint[0].r1`` is marked as read; - -* At instruction #5 exit is reached and ``checkpoint[0]`` can now be processed - by ``clean_live_states()``. After this processing ``checkpoint[0].r1`` has a - read mark and all other registers and stack slots are marked as ``NOT_INIT`` - or ``STACK_INVALID`` - -* The state ``{ r0 == 0, r1 == 0, pc == 4 }`` is popped from the states queue - and is compared against a cached state ``{ r1 == 0, pc == 4 }``, the states - are considered equivalent. - -.. _read_marks_for_cache_hits: - -Read marks propagation for cache hits -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Another point is the handling of read marks when a previously verified state is -found in the states cache. Upon cache hit verifier must behave in the same way -as if the current state was verified to the program exit. This means that all -read marks, present on registers and stack slots of the cached state, must be -propagated over the parentage chain of the current state. Example below shows -why this is important. Function ``propagate_liveness()`` handles this case. 
- -Consider the following state parentage chain (S is a starting state, A-E are -derived states, -> arrows show which state is derived from which):: - - r1 read - <------------- A[r1] == 0 - C[r1] == 0 - S ---> A ---> B ---> exit E[r1] == 1 - | - ` ---> C ---> D - | - ` ---> E ^ - |___ suppose all these - ^ states are at insn #Y - | - suppose all these - states are at insn #X - -* Chain of states ``S -> A -> B -> exit`` is verified first. - -* While ``B -> exit`` is verified, register ``r1`` is read and this read mark is - propagated up to state ``A``. - -* When chain of states ``C -> D`` is verified the state ``D`` turns out to be - equivalent to state ``B``. - -* The read mark for ``r1`` has to be propagated to state ``C``, otherwise state - ``C`` might get mistakenly marked as equivalent to state ``E`` even though - values for register ``r1`` differ between ``C`` and ``E``. - Understanding eBPF verifier messages ==================================== diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index dec5da3a2e59dc..c7515da8500c56 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -26,27 +26,6 @@ /* Patch buffer size */ #define INSN_BUF_SIZE 32 -/* Liveness marks, used for registers and spilled-regs (in stack slots). - * Read marks propagate upwards until they find a write mark; they record that - * "one of this state's descendants read this reg" (and therefore the reg is - * relevant for states_equal() checks). - * Write marks collect downwards and do not propagate; they record that "the - * straight-line code that reached this state (from its parent) wrote this reg" - * (and therefore that reads propagated from this state or its descendants - * should not propagate to its parent). - * A state with a write mark can receive read marks; it just won't propagate - * them to its parent, since the write mark is a property, not of the state, - * but of the link between it and its parent. See mark_reg_read() and - * mark_stack_slot_read() in kernel/bpf/verifier.c. - */ -enum bpf_reg_liveness { - REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ - REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ - REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ - REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, - REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ -}; - #define ITER_PREFIX "bpf_iter_" enum bpf_iter_state { @@ -211,8 +190,6 @@ struct bpf_reg_state { * allowed and has the same effect as bpf_sk_release(sk). */ u32 ref_obj_id; - /* parentage chain for liveness checking */ - struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' @@ -225,7 +202,6 @@ struct bpf_reg_state { * patching which only happens after main verification finished. 
*/ s32 subreg_def; - enum bpf_reg_liveness live; /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ bool precise; }; @@ -852,7 +828,6 @@ struct bpf_verifier_env { /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; - bool internal_error; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 0d6d7bfb2fd05e..f50533169cc34e 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -542,17 +542,6 @@ static char slot_type_char[] = { [STACK_IRQ_FLAG] = 'f' }; -static void print_liveness(struct bpf_verifier_env *env, - enum bpf_reg_liveness live) -{ - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) - verbose(env, "_"); - if (live & REG_LIVE_READ) - verbose(env, "r"); - if (live & REG_LIVE_WRITTEN) - verbose(env, "w"); -} - #define UNUM_MAX_DECIMAL U16_MAX #define SNUM_MAX_DECIMAL S16_MAX #define SNUM_MIN_DECIMAL S16_MIN @@ -770,7 +759,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie if (!print_all && !reg_scratched(env, i)) continue; verbose(env, " R%d", i); - print_liveness(env, reg->live); verbose(env, "="); print_reg_state(env, state, reg); } @@ -803,9 +791,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie break; types_buf[j] = '\0'; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); print_reg_state(env, state, reg); break; case STACK_DYNPTR: @@ -814,7 +800,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie reg = &state->stack[i].spilled_ptr; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) verbose_a("id=%d", reg->id); @@ -829,9 +814,8 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie if (!reg->ref_obj_id) continue; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)", + (-i - 1) * BPF_REG_SIZE, iter_type_str(reg->iter.btf, reg->iter.btf_id), reg->ref_obj_id, iter_state_str(reg->iter.state), reg->iter.depth); @@ -839,9 +823,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie case STACK_MISC: case STACK_ZERO: default: - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); break; } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f70e34a38c1314..e1da2471442b28 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -787,8 +787,6 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; @@ -806,29 +804,6 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - /* Why do we need to set REG_LIVE_WRITTEN for 
STACK_INVALID slot? - * - * While we don't allow reading STACK_INVALID, it is still possible to - * do <8 byte writes marking some but not all slots as STACK_MISC. Then, - * helpers or insns can do partial read of that part without failing, - * but check_stack_range_initialized, check_stack_read_var_off, and - * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of - * the slot conservatively. Hence we need to prevent those liveness - * marking walks. - * - * This was not a problem before because STACK_INVALID is only set by - * default (where the default reg state has its reg->parent as NULL), or - * in clean_live_states after REG_LIVE_DONE (at which point - * mark_reg_read won't walk reg->parent chain), but not randomly during - * verifier state exploration (like we did above). Hence, for our case - * parentage chain will still be live (i.e. reg->parent may be - * non-NULL), while earlier reg->parent was NULL, so we need - * REG_LIVE_WRITTEN to screen off read marker propagation when it is - * done later on reads or by mark_dynptr_read as well to unnecessary - * mark registers in verifier state. - */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); } @@ -938,9 +913,6 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - /* Same reason as unmark_stack_slots_dynptr above */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; @@ -1059,7 +1031,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, else st->type |= PTR_UNTRUSTED; } - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = i == 0 ? id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; @@ -1095,9 +1066,6 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, __mark_reg_not_init(env, st); - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; - for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_INVALID; @@ -1194,7 +1162,6 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, bpf_mark_stack_write(env, reg->frameno, BIT(spi)); __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = id; st->irq.kfunc_class = kfunc_class; @@ -1248,8 +1215,6 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r __mark_reg_not_init(env, st); - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, reg->frameno, BIT(spi)); for (i = 0; i < BPF_REG_SIZE; i++) @@ -2901,8 +2866,6 @@ static void init_reg_state(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(env, regs, i); - regs[i].live = REG_LIVE_NONE; - regs[i].parent = NULL; regs[i].subreg_def = DEF_NOT_SUBREG; } @@ -3583,64 +3546,12 @@ static int check_subprogs(struct bpf_verifier_env *env) return 0; } -/* Parentage chain of this register (or stack slot) should take care of all - * issues like callee-saved registers, stack slot allocation time, etc. 
- */ -static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_reg_state *state, - struct bpf_reg_state *parent, u8 flag) -{ - bool writes = parent == state->parent; /* Observe write marks */ - int cnt = 0; - - while (parent) { - /* if read wasn't screened by an earlier write ... */ - if (writes && state->live & REG_LIVE_WRITTEN) - break; - /* The first condition is more likely to be true than the - * second, checked it first. - */ - if ((parent->live & REG_LIVE_READ) == flag || - parent->live & REG_LIVE_READ64) - /* The parentage chain never changes and - * this parent was already marked as LIVE_READ. - * There is no need to keep walking the chain again and - * keep re-marking all parents as LIVE_READ. - * This case happens when the same register is read - * multiple times without writes into it in-between. - * Also, if parent has the stronger REG_LIVE_READ64 set, - * then no need to set the weak REG_LIVE_READ32. - */ - break; - /* ... then we depend on parent's value */ - parent->live |= flag; - /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ - if (flag == REG_LIVE_READ64) - parent->live &= ~REG_LIVE_READ32; - state = parent; - parent = state->parent; - writes = true; - cnt++; - } - - if (env->longest_mark_read_walk < cnt) - env->longest_mark_read_walk = cnt; - return 0; -} - static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots) { - struct bpf_func_state *state = func(env, reg); int err, i; for (i = 0; i < nr_slots; i++) { - struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; - - err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); - if (err) - return err; - err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i)); if (err) return err; @@ -3852,15 +3763,13 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r if (rw64) mark_insn_zext(env, reg); - return mark_reg_read(env, reg, reg->parent, - rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); + return 0; } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); @@ -5065,12 +4974,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, /* Copy src state preserving dst->parent and dst->live fields */ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) { - struct bpf_reg_state *parent = dst->parent; - enum bpf_reg_liveness live = dst->live; - *dst = *src; - dst->parent = parent; - dst->live = live; } static void save_register_state(struct bpf_verifier_env *env, @@ -5081,8 +4985,6 @@ static void save_register_state(struct bpf_verifier_env *env, int i; copy_register_state(&state->stack[spi].spilled_ptr, reg); - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; @@ -5231,17 +5133,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, for (i = 0; i < BPF_REG_SIZE; i++) scrub_spilled_slot(&state->stack[spi].slot_type[i]); - /* only mark the slot as written if all 8 bytes were written - * otherwise read propagation may incorrectly stop too soon - * when stack slots are partially written. 
- * This heuristic means that read propagation will be - * conservative, since it will add reg_live_read marks - * to stack slots all the way to first state when programs - * writes+reads less than 8 bytes - */ - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - /* when we zero initialize stack slots mark them as such */ if ((reg && register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { @@ -5434,7 +5325,6 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, dst_regno); } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } /* Read the stack at 'off' and put the results into the register indicated by @@ -5481,7 +5371,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno < 0) return 0; @@ -5535,7 +5424,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* not restoring original register state */ } } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (dst_regno >= 0) { /* restore register state from stack */ copy_register_state(&state->regs[dst_regno], reg); @@ -5543,7 +5431,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE @@ -5555,7 +5442,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; @@ -5569,7 +5455,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno >= 0) mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); insn_flags = 0; /* we are not restoring spilled register */ @@ -8197,13 +8082,10 @@ static int check_stack_range_initialized( /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ - mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi)); if (err) return err; - /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not + /* We do not call bpf_mark_stack_write(), as we can not * be sure that whether stack slot is written to or not. Hence, * we must still conservatively propagate reads upwards even if * helper may write to the entire memory range. @@ -11041,8 +10923,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) } /* we are going to rely on register's precise value */ - err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64); - err = err ?: mark_chain_precision(env, BPF_REG_0); + err = mark_chain_precision(env, BPF_REG_0); if (err) return err; @@ -11946,17 +11827,11 @@ static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_re if (regno == BPF_REG_0) { /* Function return value */ - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = reg_size == sizeof(u64) ? 
DEF_NOT_SUBREG : env->insn_idx + 1; - } else { + } else if (reg_size == sizeof(u64)) { /* Function argument */ - if (reg_size == sizeof(u64)) { - mark_insn_zext(env, reg); - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - } else { - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); - } + mark_insn_zext(env, reg); } } @@ -15710,7 +15585,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ @@ -15729,7 +15603,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (!no_sext) dst_reg->id = 0; coerce_reg_to_size_sx(dst_reg, insn->off >> 3); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -15755,7 +15628,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ if (!is_src_reg_u32) dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ @@ -15766,7 +15638,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) copy_register_state(dst_reg, src_reg); if (!no_sext) dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; coerce_subreg_to_size_sx(dst_reg, insn->off >> 3); } @@ -18576,11 +18447,6 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { if (!bpf_stack_slot_alive(env, st->frameno, i)) { - if (st->stack[i].spilled_ptr.live & REG_LIVE_READ) { - verifier_bug(env, "incorrect live marks #1 for insn %d frameno %d spi %d\n", - env->insn_idx, st->frameno, i); - env->internal_error = true; - } __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; @@ -18609,25 +18475,23 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * but a lot of states will get revised from liveness point of view when * the verifier explores other branches. * Example: - * 1: r0 = 1 + * 1: *(u64)(r10 - 8) = 1 * 2: if r1 == 100 goto pc+1 - * 3: r0 = 2 - * 4: exit - * when the verifier reaches exit insn the register r0 in the state list of - * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch - * of insn 2 and goes exploring further. At the insn 4 it will walk the - * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. + * 3: *(u64)(r10 - 8) = 2 + * 4: r0 = *(u64)(r10 - 8) + * 5: exit + * when the verifier reaches exit insn the stack slot -8 in the state list of + * insn 2 is not yet marked alive. Then the verifier pops the other_branch + * of insn 2 and goes exploring further. After the insn 4 read, liveness + * analysis would propagate read mark for -8 at insn 2. * * Since the verifier pushes the branch states as it sees them while exploring * the program the condition of walking the branch instruction for the second * time means that all states below this branch were already explored and * their final liveness marks are already propagated. * Hence when the verifier completes the search of state list in is_state_visited() - * we can call this clean_live_states() function to mark all liveness states - * as st->cleaned to indicate that 'parent' pointers of 'struct bpf_reg_state' - * will not be used. 
- This function also clears the registers and stack for states that !READ - * to simplify state merging. + * we can call this clean_live_states() function to clear the dead registers and stack + * slots to simplify state merging. * * Important note here that walking the same branch instruction in the callee * doesn't meant that the states are DONE. The verifier has to compare @@ -18802,7 +18666,6 @@ static struct bpf_reg_state unbound_reg; static __init int unbound_reg_init(void) { __mark_reg_unknown_imprecise(&unbound_reg); - unbound_reg.live |= REG_LIVE_READ; return 0; } late_initcall(unbound_reg_init); @@ -19097,91 +18960,6 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } -/* Return 0 if no propagation happened. Return negative error code if error - * happened. Otherwise, return the propagated bit. - */ -static int propagate_liveness_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - struct bpf_reg_state *parent_reg) -{ - u8 parent_flag = parent_reg->live & REG_LIVE_READ; - u8 flag = reg->live & REG_LIVE_READ; - int err; - - /* When comes here, read flags of PARENT_REG or REG could be any of - * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need - * of propagation if PARENT_REG has strongest REG_LIVE_READ64. - */ - if (parent_flag == REG_LIVE_READ64 || - /* Or if there is no read flag from REG. */ - !flag || - /* Or if the read flag from REG is the same as PARENT_REG. */ - parent_flag == flag) - return 0; - - err = mark_reg_read(env, reg, parent_reg, flag); - if (err) - return err; - - return flag; -} - -/* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at an - * equivalent state (jump target or such) we didn't arrive by the straight-line - * code, so read marks in the state must propagate to the parent regardless - * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() is for. - */ -static int propagate_liveness(struct bpf_verifier_env *env, - const struct bpf_verifier_state *vstate, - struct bpf_verifier_state *vparent, - bool *changed) -{ - struct bpf_reg_state *state_reg, *parent_reg; - struct bpf_func_state *state, *parent; - int i, frame, err = 0; - bool tmp = false; - - changed = changed ?: &tmp; - if (vparent->curframe != vstate->curframe) { - WARN(1, "propagate_live: parent frame %d current frame %d\n", - vparent->curframe, vstate->curframe); - return -EFAULT; - } - /* Propagate read liveness of registers... */ - BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - for (frame = 0; frame <= vstate->curframe; frame++) { - parent = vparent->frame[frame]; - state = vstate->frame[frame]; - parent_reg = parent->regs; - state_reg = state->regs; - /* We don't need to worry about FP liveness, it's read-only */ - for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - err = propagate_liveness_reg(env, &state_reg[i], - &parent_reg[i]); - if (err < 0) - return err; - *changed |= err > 0; - if (err == REG_LIVE_READ64) - mark_insn_zext(env, &parent_reg[i]); - } - - /* Propagate stack slots. 
*/ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - parent_reg = &parent->stack[i].spilled_ptr; - state_reg = &state->stack[i].spilled_ptr; - err = propagate_liveness_reg(env, state_reg, - parent_reg); - *changed |= err > 0; - if (err < 0) - return err; - } - } - return 0; -} - /* find precise scalars in the previous equivalent state and * propagate them into the current state */ @@ -19201,8 +18979,7 @@ static int propagate_precision(struct bpf_verifier_env *env, first = true; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) + !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) @@ -19219,8 +18996,7 @@ static int propagate_precision(struct bpf_verifier_env *env, continue; state_reg = &state->stack[i].spilled_ptr; if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) + !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) @@ -19270,9 +19046,6 @@ static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visi changed = false; for (backedge = visit->backedges; backedge; backedge = backedge->next) { st = &backedge->state; - err = propagate_liveness(env, st->equal_state, st, &changed); - if (err) - return err; err = propagate_precision(env, st->equal_state, st, &changed); if (err) return err; @@ -19296,7 +19069,7 @@ static bool states_maybe_looping(struct bpf_verifier_state *old, fcur = cur->frame[fr]; for (i = 0; i < MAX_BPF_REG; i++) if (memcmp(&fold->regs[i], &fcur->regs[i], - offsetof(struct bpf_reg_state, parent))) + offsetof(struct bpf_reg_state, frameno))) return false; return true; } @@ -19394,7 +19167,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new; bool force_new_state, add_new_state, loop; - int i, j, n, err, states_cnt = 0; + int n, err, states_cnt = 0; struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || @@ -19551,28 +19324,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) loop = incomplete_read_marks(env, &sl->state); if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: - if (env->internal_error) - return -EFAULT; sl->hit_cnt++; - /* reached equivalent register/stack state, - * prune the search. - * Registers read by the continuation are read by us. - * If we have any write marks in env->cur_state, they - * will prevent corresponding reads in the continuation - * from reaching our parent (an explored_state). Our - * own state will get the read marks recorded, but - * they'll be immediately forgotten as we're pruning - * this state and will pop a new one. - */ - err = propagate_liveness(env, &sl->state, cur, NULL); /* if previous state reached the exit with precision and * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ + err = 0; if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0, 0); + err = push_jmp_history(env, cur, 0, 0); err = err ? 
: propagate_precision(env, &sl->state, cur, NULL); if (err) return err; @@ -19667,8 +19428,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 1; } miss: - if (env->internal_error) - return -EFAULT; /* when new state is not going to be added do not increase miss count. * Otherwise several loop iterations will remove the state * recorded earlier. The goal of these heuristics is to have @@ -19754,38 +19513,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) cur->dfs_depth = new->dfs_depth + 1; clear_jmp_history(cur); list_add(&new_sl->node, head); - - /* connect new state to parentage chain. Current frame needs all - * registers connected. Only r6 - r9 of the callers are alive (pushed - * to the stack implicitly by JITs) so in callers' frames connect just - * r6 - r9 as an optimization. Callers will have r1 - r5 connected to - * the state of the call instruction (with WRITTEN set), and r0 comes - * from callee with its full parentage chain, anyway. - */ - /* clear write marks in current state: the writes we did are not writes - * our child did, so they don't screen off its reads from us. - * (There are no read marks in current state, because reads always mark - * their parent and current state never has children yet. Only - * explored_states can get read marks.) - */ - for (j = 0; j <= cur->curframe; j++) { - for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; - for (i = 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].live = REG_LIVE_NONE; - } - - /* all stack frames are accessible from callee, clear them all */ - for (j = 0; j <= cur->curframe; j++) { - struct bpf_func_state *frame = cur->frame[j]; - struct bpf_func_state *newframe = new->frame[j]; - - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { - frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; - frame->stack[i].spilled_ptr.parent = - &newframe->stack[i].spilled_ptr; - } - } return 0; } diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 1d53a8561ee2fc..24c509ce4e5b2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -42,11 +42,11 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "2"}, - {1, "R3_w", "4"}, - {2, "R3_w", "8"}, - {3, "R3_w", "16"}, - {4, "R3_w", "32"}, + {0, "R3", "2"}, + {1, "R3", "4"}, + {2, "R3", "8"}, + {3, "R3", "16"}, + {4, "R3", "32"}, }, }, { @@ -70,17 +70,17 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "1"}, - {1, "R3_w", "2"}, - {2, "R3_w", "4"}, - {3, "R3_w", "8"}, - {4, "R3_w", "16"}, - {5, "R3_w", "1"}, - {6, "R4_w", "32"}, - {7, "R4_w", "16"}, - {8, "R4_w", "8"}, - {9, "R4_w", "4"}, - {10, "R4_w", "2"}, + {0, "R3", "1"}, + {1, "R3", "2"}, + {2, "R3", "4"}, + {3, "R3", "8"}, + {4, "R3", "16"}, + {5, "R3", "1"}, + {6, "R4", "32"}, + {7, "R4", "16"}, + {8, "R4", "8"}, + {9, "R4", "4"}, + {10, "R4", "2"}, }, }, { @@ -99,12 +99,12 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "4"}, - {1, "R3_w", "8"}, - {2, "R3_w", "10"}, - {3, "R4_w", "8"}, - {4, "R4_w", "12"}, - {5, "R4_w", "14"}, + {0, "R3", "4"}, + {1, "R3", "8"}, + {2, "R3", "10"}, + {3, "R4", "8"}, + {4, "R4", "12"}, + {5, "R4", "14"}, }, }, { @@ -121,10 +121,10 @@ static struct bpf_align_test tests[] = 
{ .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "7"}, - {1, "R3_w", "7"}, - {2, "R3_w", "14"}, - {3, "R3_w", "56"}, + {0, "R3", "7"}, + {1, "R3", "7"}, + {2, "R3", "14"}, + {3, "R3", "56"}, }, }, @@ -162,19 +162,19 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R0_w", "pkt(off=8,r=8)"}, - {6, "R3_w", "var_off=(0x0; 0xff)"}, - {7, "R3_w", "var_off=(0x0; 0x1fe)"}, - {8, "R3_w", "var_off=(0x0; 0x3fc)"}, - {9, "R3_w", "var_off=(0x0; 0x7f8)"}, - {10, "R3_w", "var_off=(0x0; 0xff0)"}, - {12, "R3_w", "pkt_end()"}, - {17, "R4_w", "var_off=(0x0; 0xff)"}, - {18, "R4_w", "var_off=(0x0; 0x1fe0)"}, - {19, "R4_w", "var_off=(0x0; 0xff0)"}, - {20, "R4_w", "var_off=(0x0; 0x7f8)"}, - {21, "R4_w", "var_off=(0x0; 0x3fc)"}, - {22, "R4_w", "var_off=(0x0; 0x1fe)"}, + {6, "R0", "pkt(off=8,r=8)"}, + {6, "R3", "var_off=(0x0; 0xff)"}, + {7, "R3", "var_off=(0x0; 0x1fe)"}, + {8, "R3", "var_off=(0x0; 0x3fc)"}, + {9, "R3", "var_off=(0x0; 0x7f8)"}, + {10, "R3", "var_off=(0x0; 0xff0)"}, + {12, "R3", "pkt_end()"}, + {17, "R4", "var_off=(0x0; 0xff)"}, + {18, "R4", "var_off=(0x0; 0x1fe0)"}, + {19, "R4", "var_off=(0x0; 0xff0)"}, + {20, "R4", "var_off=(0x0; 0x7f8)"}, + {21, "R4", "var_off=(0x0; 0x3fc)"}, + {22, "R4", "var_off=(0x0; 0x1fe)"}, }, }, { @@ -195,16 +195,16 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R3_w", "var_off=(0x0; 0xff)"}, - {7, "R4_w", "var_off=(0x0; 0xff)"}, - {8, "R4_w", "var_off=(0x0; 0xff)"}, - {9, "R4_w", "var_off=(0x0; 0xff)"}, - {10, "R4_w", "var_off=(0x0; 0x1fe)"}, - {11, "R4_w", "var_off=(0x0; 0xff)"}, - {12, "R4_w", "var_off=(0x0; 0x3fc)"}, - {13, "R4_w", "var_off=(0x0; 0xff)"}, - {14, "R4_w", "var_off=(0x0; 0x7f8)"}, - {15, "R4_w", "var_off=(0x0; 0xff0)"}, + {6, "R3", "var_off=(0x0; 0xff)"}, + {7, "R4", "var_off=(0x0; 0xff)"}, + {8, "R4", "var_off=(0x0; 0xff)"}, + {9, "R4", "var_off=(0x0; 0xff)"}, + {10, "R4", "var_off=(0x0; 0x1fe)"}, + {11, "R4", "var_off=(0x0; 0xff)"}, + {12, "R4", "var_off=(0x0; 0x3fc)"}, + {13, "R4", "var_off=(0x0; 0xff)"}, + {14, "R4", "var_off=(0x0; 0x7f8)"}, + {15, "R4", "var_off=(0x0; 0xff0)"}, }, }, { @@ -235,14 +235,14 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {2, "R5_w", "pkt(r=0)"}, - {4, "R5_w", "pkt(off=14,r=0)"}, - {5, "R4_w", "pkt(off=14,r=0)"}, + {2, "R5", "pkt(r=0)"}, + {4, "R5", "pkt(off=14,r=0)"}, + {5, "R4", "pkt(off=14,r=0)"}, {9, "R2", "pkt(r=18)"}, {10, "R5", "pkt(off=14,r=18)"}, - {10, "R4_w", "var_off=(0x0; 0xff)"}, - {13, "R4_w", "var_off=(0x0; 0xffff)"}, - {14, "R4_w", "var_off=(0x0; 0xffff)"}, + {10, "R4", "var_off=(0x0; 0xff)"}, + {13, "R4", "var_off=(0x0; 0xffff)"}, + {14, "R4", "var_off=(0x0; 0xffff)"}, }, }, { @@ -299,12 +299,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {7, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {7, "R6", "var_off=(0x0; 0x3fc)"}, /* Offset is added to packet pointer R5, resulting in * known fixed offset, and variable offset from R6. */ - {11, "R5_w", "pkt(id=1,off=14,"}, + {11, "R5", "pkt(id=1,off=14,"}, /* At the time the word size load is performed from R5, * it's total offset is NET_IP_ALIGN + reg->off (0) + * reg->aux_off (14) which is 16. Then the variable @@ -320,12 +320,12 @@ static struct bpf_align_test tests[] = { * instruction to validate R5 state. 
We also check * that R4 is what it should be in such case. */ - {18, "R4_w", "var_off=(0x0; 0x3fc)"}, - {18, "R5_w", "var_off=(0x0; 0x3fc)"}, + {18, "R4", "var_off=(0x0; 0x3fc)"}, + {18, "R5", "var_off=(0x0; 0x3fc)"}, /* Constant offset is added to R5, resulting in * reg->off of 14. */ - {19, "R5_w", "pkt(id=2,off=14,"}, + {19, "R5", "pkt(id=2,off=14,"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off * (14) which is 16. Then the variable offset is 4-byte @@ -337,21 +337,21 @@ static struct bpf_align_test tests[] = { /* Constant offset is added to R5 packet pointer, * resulting in reg->off value of 14. */ - {26, "R5_w", "pkt(off=14,r=8)"}, + {26, "R5", "pkt(off=14,r=8)"}, /* Variable offset is added to R5, resulting in a * variable offset of (4n). See comment for insn #18 * for R4 = R5 trick. */ - {28, "R4_w", "var_off=(0x0; 0x3fc)"}, - {28, "R5_w", "var_off=(0x0; 0x3fc)"}, + {28, "R4", "var_off=(0x0; 0x3fc)"}, + {28, "R5", "var_off=(0x0; 0x3fc)"}, /* Constant is added to R5 again, setting reg->off to 18. */ - {29, "R5_w", "pkt(id=3,off=18,"}, + {29, "R5", "pkt(id=3,off=18,"}, /* And once more we add a variable; resulting var_off * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {31, "R4_w", "var_off=(0x0; 0x7fc)"}, - {31, "R5_w", "var_off=(0x0; 0x7fc)"}, + {31, "R4", "var_off=(0x0; 0x7fc)"}, + {31, "R5", "var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. Then the variable offset is (4n), so @@ -397,12 +397,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {7, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {7, "R6", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ - {8, "R6_w", "var_off=(0x2; 0x7fc)"}, + {8, "R6", "var_off=(0x2; 0x7fc)"}, /* Packet pointer has (4n+2) offset */ - {11, "R5_w", "var_off=(0x2; 0x7fc)"}, + {11, "R5", "var_off=(0x2; 0x7fc)"}, {12, "R4", "var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) @@ -414,11 +414,11 @@ static struct bpf_align_test tests[] = { /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ - {17, "R6_w", "var_off=(0x0; 0x3fc)"}, + {17, "R6", "var_off=(0x0; 0x3fc)"}, /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). */ - {19, "R5_w", "var_off=(0x2; 0xffc)"}, + {19, "R5", "var_off=(0x2; 0xffc)"}, {20, "R4", "var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) @@ -459,18 +459,18 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .matches = { - {3, "R5_w", "pkt_end()"}, + {3, "R5", "pkt_end()"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {5, "R5_w", "var_off=(0x0; 0xfffffffffffffffc)"}, + {5, "R5", "var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. 
*/ - {6, "R5_w", "var_off=(0x2; 0xfffffffffffffffc)"}, + {6, "R5", "var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ {9, "R5", "var_off=(0x2; 0x7ffffffffffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, - {12, "R4_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {11, "R6", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {12, "R4", "var_off=(0x2; 0x7ffffffffffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -478,7 +478,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {15, "R6", "var_off=(0x2; 0x7ffffffffffffffc)"}, } }, { @@ -513,12 +513,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {8, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {8, "R6", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ - {9, "R6_w", "var_off=(0x2; 0x7fc)"}, + {9, "R6", "var_off=(0x2; 0x7fc)"}, /* New unknown value in R7 is (4n) */ - {10, "R7_w", "var_off=(0x0; 0x3fc)"}, + {10, "R7", "var_off=(0x0; 0x3fc)"}, /* Subtracting it from R6 blows our unsigned bounds */ {11, "R6", "var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ @@ -566,16 +566,16 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {9, "R6_w", "var_off=(0x0; 0x3c)"}, + {6, "R2", "pkt(r=8)"}, + {9, "R6", "var_off=(0x0; 0x3c)"}, /* Adding 14 makes R6 be (4n+2) */ - {10, "R6_w", "var_off=(0x2; 0x7c)"}, + {10, "R6", "var_off=(0x2; 0x7c)"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w", "var_off=(0xffffffffffffff82; 0x7c)"}, + {13, "R5", "var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ - {14, "R7_w", "var_off=(0x0; 0x7fc)"}, + {14, "R7", "var_off=(0x0; 0x7fc)"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w", "var_off=(0x2; 0x7fc)"}, + {16, "R5", "var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. 
Then the variable offset is (4n+2), so diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index e3ea5dc2f697c4..254fbfeab06a22 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -13,22 +13,22 @@ static struct { const char *err_msg; } spin_lock_fail_tests[] = { { "lock_id_kptr_preserve", - "5: (bf) r1 = r0 ; R0_w=ptr_foo(id=2,ref_obj_id=2) " - "R1_w=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" + "5: (bf) r1 = r0 ; R0=ptr_foo(id=2,ref_obj_id=2) " + "R1=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" "R1 type=ptr_ expected=percpu_ptr_" }, { "lock_id_global_zero", - "; R1_w=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" + "; R1=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" - " R0_w=map_value(id=1,map=array_map,ks=4,vs=8)" - " R1_w=map_value(id=1,map=array_map,ks=4,vs=8)\n" + " R0=map_value(id=1,map=array_map,ks=4,vs=8)" + " R1=map_value(id=1,map=array_map,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_innermapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" " R0=map_value(id=2,ks=4,vs=8)" - " R1_w=map_value(id=2,ks=4,vs=8)\n" + " R1=map_value(id=2,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mismatch_kptr_kptr", "bpf_spin_unlock of different lock" }, diff --git a/tools/testing/selftests/bpf/prog_tests/test_veristat.c b/tools/testing/selftests/bpf/prog_tests/test_veristat.c index 367f47e4a936d4..b38c16b4247f77 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_veristat.c +++ b/tools/testing/selftests/bpf/prog_tests/test_veristat.c @@ -75,26 +75,26 @@ static void test_set_global_vars_succeeds(void) " -vl2 > %s", fix->veristat, fix->tmpfile); read(fix->fd, fix->output, fix->sz); - __CHECK_STR("_w=0xf000000000000001 ", "var_s64 = 0xf000000000000001"); - __CHECK_STR("_w=0xfedcba9876543210 ", "var_u64 = 0xfedcba9876543210"); - __CHECK_STR("_w=0x80000000 ", "var_s32 = -0x80000000"); - __CHECK_STR("_w=0x76543210 ", "var_u32 = 0x76543210"); - __CHECK_STR("_w=0x8000 ", "var_s16 = -32768"); - __CHECK_STR("_w=0xecec ", "var_u16 = 60652"); - __CHECK_STR("_w=128 ", "var_s8 = -128"); - __CHECK_STR("_w=255 ", "var_u8 = 255"); - __CHECK_STR("_w=11 ", "var_ea = EA2"); - __CHECK_STR("_w=12 ", "var_eb = EB2"); - __CHECK_STR("_w=13 ", "var_ec = EC2"); - __CHECK_STR("_w=1 ", "var_b = 1"); - __CHECK_STR("_w=170 ", "struct1[2].struct2[1][2].u.var_u8[2]=170"); - __CHECK_STR("_w=0xaaaa ", "union1.var_u16 = 0xaaaa"); - __CHECK_STR("_w=171 ", "arr[3]= 171"); - __CHECK_STR("_w=172 ", "arr[EA2] =172"); - __CHECK_STR("_w=10 ", "enum_arr[EC2]=EA3"); - __CHECK_STR("_w=173 ", "matrix[31][7][11]=173"); - __CHECK_STR("_w=174 ", "struct1[2].struct2[1][2].u.mat[5][3]=174"); - __CHECK_STR("_w=175 ", "struct11[7][5].struct2[0][1].u.mat[3][0]=175"); + __CHECK_STR("=0xf000000000000001 ", "var_s64 = 0xf000000000000001"); + __CHECK_STR("=0xfedcba9876543210 ", "var_u64 = 0xfedcba9876543210"); + __CHECK_STR("=0x80000000 ", "var_s32 = -0x80000000"); + __CHECK_STR("=0x76543210 ", "var_u32 = 0x76543210"); + __CHECK_STR("=0x8000 ", "var_s16 = -32768"); + __CHECK_STR("=0xecec ", "var_u16 = 60652"); + __CHECK_STR("=128 ", "var_s8 = -128"); + __CHECK_STR("=255 ", "var_u8 = 255"); 
+ __CHECK_STR("=11 ", "var_ea = EA2"); + __CHECK_STR("=12 ", "var_eb = EB2"); + __CHECK_STR("=13 ", "var_ec = EC2"); + __CHECK_STR("=1 ", "var_b = 1"); + __CHECK_STR("=170 ", "struct1[2].struct2[1][2].u.var_u8[2]=170"); + __CHECK_STR("=0xaaaa ", "union1.var_u16 = 0xaaaa"); + __CHECK_STR("=171 ", "arr[3]= 171"); + __CHECK_STR("=172 ", "arr[EA2] =172"); + __CHECK_STR("=10 ", "enum_arr[EC2]=EA3"); + __CHECK_STR("=173 ", "matrix[31][7][11]=173"); + __CHECK_STR("=174 ", "struct1[2].struct2[1][2].u.mat[5][3]=174"); + __CHECK_STR("=175 ", "struct11[7][5].struct2[0][1].u.mat[3][0]=175"); out: teardown_fixture(fix); @@ -117,8 +117,8 @@ static void test_set_global_vars_from_file_succeeds(void) SYS(out, "%s set_global_vars.bpf.o -G \"@%s\" -vl2 > %s", fix->veristat, input_file, fix->tmpfile); read(fix->fd, fix->output, fix->sz); - __CHECK_STR("_w=0x8000 ", "var_s16 = -32768"); - __CHECK_STR("_w=0xecec ", "var_u16 = 60652"); + __CHECK_STR("=0x8000 ", "var_s16 = -32768"); + __CHECK_STR("=0xecec ", "var_u16 = 60652"); out: close(fd); diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index 5e0a1ca96d4e27..a01c2736890f94 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -18,43 +18,43 @@ return *(u64 *)num; \ } -__msg(": R0_w=0xffffffff80000000") +__msg(": R0=0xffffffff80000000") check_assert(s64, ==, eq_int_min, INT_MIN); -__msg(": R0_w=0x7fffffff") +__msg(": R0=0x7fffffff") check_assert(s64, ==, eq_int_max, INT_MAX); -__msg(": R0_w=0") +__msg(": R0=0") check_assert(s64, ==, eq_zero, 0); -__msg(": R0_w=0x8000000000000000 R1_w=0x8000000000000000") +__msg(": R0=0x8000000000000000 R1=0x8000000000000000") check_assert(s64, ==, eq_llong_min, LLONG_MIN); -__msg(": R0_w=0x7fffffffffffffff R1_w=0x7fffffffffffffff") +__msg(": R0=0x7fffffffffffffff R1=0x7fffffffffffffff") check_assert(s64, ==, eq_llong_max, LLONG_MAX); -__msg(": R0_w=scalar(id=1,smax=0x7ffffffe)") +__msg(": R0=scalar(id=1,smax=0x7ffffffe)") check_assert(s64, <, lt_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") check_assert(s64, <, lt_zero, 0); -__msg(": R0_w=scalar(id=1,smax=0xffffffff7fffffff") +__msg(": R0=scalar(id=1,smax=0xffffffff7fffffff") check_assert(s64, <, lt_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smax=0x7fffffff)") +__msg(": R0=scalar(id=1,smax=0x7fffffff)") check_assert(s64, <=, le_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smax=0)") +__msg(": R0=scalar(id=1,smax=0)") check_assert(s64, <=, le_zero, 0); -__msg(": R0_w=scalar(id=1,smax=0xffffffff80000000") +__msg(": R0=scalar(id=1,smax=0xffffffff80000000") check_assert(s64, <=, le_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >, gt_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >, gt_zero, 0); -__msg(": R0_w=scalar(id=1,smin=0xffffffff80000001") +__msg(": R0=scalar(id=1,smin=0xffffffff80000001") check_assert(s64, >, gt_neg, INT_MIN); -__msg(": 
R0_w=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >=, ge_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >=, ge_zero, 0); -__msg(": R0_w=scalar(id=1,smin=0xffffffff80000000") +__msg(": R0=scalar(id=1,smin=0xffffffff80000000") check_assert(s64, >=, ge_neg, INT_MIN); SEC("?tc") diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index b381ac0c736cf0..d273b46dfc7c19 100644 --- a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -30,7 +30,7 @@ int force_clang_to_emit_btf_for_externs(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") int create_and_destroy(void *ctx) { struct bpf_iter_num iter; @@ -196,7 +196,7 @@ int leak_iter_from_subprog_fail(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") int valid_stack_reuse(void *ctx) { struct bpf_iter_num iter; diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 6543d5b6e0a976..83791348bed526 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -20,7 +20,7 @@ __s64 res_empty; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_empty(const void *ctx) @@ -38,7 +38,7 @@ __s64 res_full; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_full(const void *ctx) @@ -58,7 +58,7 @@ static volatile int zero = 0; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_truncated(const void *ctx) diff --git a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c index 4f94c971ae8626..3b984b6ae7c0b9 100644 --- a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c +++ b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c @@ -8,8 +8,8 @@ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r8 = *(u64 *)(r7 +0) ; R7_w=ptr_nameidata(off={{[0-9]+}}) R8_w=rdonly_untrusted_mem(sz=0)") -__msg("r9 = *(u8 *)(r8 +0) ; R8_w=rdonly_untrusted_mem(sz=0) R9_w=scalar") +__msg("r8 = *(u64 *)(r7 +0) ; R7=ptr_nameidata(off={{[0-9]+}}) R8=rdonly_untrusted_mem(sz=0)") +__msg("r9 = *(u8 *)(r8 +0) ; R8=rdonly_untrusted_mem(sz=0) 
R9=scalar") int btf_id_to_ptr_mem(void *ctx) { struct task_struct *task; diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index fbccc20555f48d..0a72e0228ea9a2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -926,7 +926,7 @@ l1_%=: r0 = 0; \ SEC("socket") __description("bounds check for non const xor src dst") __success __log_level(2) -__msg("5: (af) r0 ^= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__msg("5: (af) r0 ^= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") __naked void non_const_xor_src_dst(void) { asm volatile (" \ @@ -947,7 +947,7 @@ __naked void non_const_xor_src_dst(void) SEC("socket") __description("bounds check for non const or src dst") __success __log_level(2) -__msg("5: (4f) r0 |= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__msg("5: (4f) r0 |= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") __naked void non_const_or_src_dst(void) { asm volatile (" \ @@ -968,7 +968,7 @@ __naked void non_const_or_src_dst(void) SEC("socket") __description("bounds check for non const mul regs") __success __log_level(2) -__msg("5: (2f) r0 *= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") +__msg("5: (2f) r0 *= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") __naked void non_const_mul_regs(void) { asm volatile (" \ @@ -1241,7 +1241,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("multiply mixed sign bounds. test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") __naked void mult_mixed0_sign(void) { asm volatile ( @@ -1264,7 +1264,7 @@ __naked void mult_mixed0_sign(void) SEC("tc") __description("multiply mixed sign bounds. 
test 2") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=smin32=-100,smax=smax32=200)") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=smin32=-100,smax=smax32=200)") __naked void mult_mixed1_sign(void) { asm volatile ( @@ -1287,7 +1287,7 @@ __naked void mult_mixed1_sign(void) SEC("tc") __description("multiply negative bounds") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") __naked void mult_sign_bounds(void) { asm volatile ( @@ -1311,7 +1311,7 @@ __naked void mult_sign_bounds(void) SEC("tc") __description("multiply bounds that don't cross signed boundary") __success __log_level(2) -__msg("r8 *= r6 {{.*}}; R6_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) R8_w=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") +__msg("r8 *= r6 {{.*}}; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) R8=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") __naked void mult_no_sign_crossing(void) { asm volatile ( @@ -1331,7 +1331,7 @@ __naked void mult_no_sign_crossing(void) SEC("tc") __description("multiplication overflow, result in unbounded reg. test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__msg("r6 *= r7 {{.*}}; R6=scalar()") __naked void mult_unsign_ovf(void) { asm volatile ( @@ -1353,7 +1353,7 @@ __naked void mult_unsign_ovf(void) SEC("tc") __description("multiplication overflow, result in unbounded reg. test 2") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__msg("r6 *= r7 {{.*}}; R6=scalar()") __naked void mult_sign_ovf(void) { asm volatile ( @@ -1376,7 +1376,7 @@ __naked void mult_sign_ovf(void) SEC("socket") __description("64-bit addition, all outcomes overflow") __success __log_level(2) -__msg("5: (0f) r3 += r3 {{.*}} R3_w=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") +__msg("5: (0f) r3 += r3 {{.*}} R3=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") __retval(0) __naked void add64_full_overflow(void) { @@ -1396,7 +1396,7 @@ __naked void add64_full_overflow(void) SEC("socket") __description("64-bit addition, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("4: (0f) r3 += r3 {{.*}} R3_w=scalar()") +__msg("4: (0f) r3 += r3 {{.*}} R3=scalar()") __retval(0) __naked void add64_partial_overflow(void) { @@ -1416,7 +1416,7 @@ __naked void add64_partial_overflow(void) SEC("socket") __description("32-bit addition overflow, all outcomes overflow") __success __log_level(2) -__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") +__msg("4: (0c) w3 += w3 {{.*}} R3=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") __retval(0) __naked void add32_full_overflow(void) { @@ -1436,7 +1436,7 @@ __naked void add32_full_overflow(void) SEC("socket") __description("32-bit addition, partial overflow, result in unbounded u32 bounds") __success __log_level(2) -__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__msg("4: (0c) w3 += w3 {{.*}} R3=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") __retval(0) __naked void add32_partial_overflow(void) { @@ -1456,7 +1456,7 @@ __naked void 
add32_partial_overflow(void) SEC("socket") __description("64-bit subtraction, all outcomes underflow") __success __log_level(2) -__msg("6: (1f) r3 -= r1 {{.*}} R3_w=scalar(umin=1,umax=0x8000000000000000)") +__msg("6: (1f) r3 -= r1 {{.*}} R3=scalar(umin=1,umax=0x8000000000000000)") __retval(0) __naked void sub64_full_overflow(void) { @@ -1477,7 +1477,7 @@ __naked void sub64_full_overflow(void) SEC("socket") __description("64-bit subtraction, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("3: (1f) r3 -= r2 {{.*}} R3_w=scalar()") +__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar()") __retval(0) __naked void sub64_partial_overflow(void) { @@ -1496,7 +1496,7 @@ __naked void sub64_partial_overflow(void) SEC("socket") __description("32-bit subtraction overflow, all outcomes underflow") __success __log_level(2) -__msg("5: (1c) w3 -= w1 {{.*}} R3_w=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") +__msg("5: (1c) w3 -= w1 {{.*}} R3=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") __retval(0) __naked void sub32_full_overflow(void) { @@ -1517,7 +1517,7 @@ __naked void sub32_full_overflow(void) SEC("socket") __description("32-bit subtraction, partial overflow, result in unbounded u32 bounds") __success __log_level(2) -__msg("3: (1c) w3 -= w2 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__msg("3: (1c) w3 -= w2 {{.*}} R3=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") __retval(0) __naked void sub32_partial_overflow(void) { @@ -1617,7 +1617,7 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, positive overlap") __success __log_level(2) __flag(BPF_F_TEST_REG_INVARIANTS) -__msg("3: (2d) if r0 > r1 {{.*}} R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f))") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f))") __retval(0) __naked void bounds_deduct_positive_overlap(void) { @@ -1650,7 +1650,7 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, two overlaps") __failure __flag(BPF_F_TEST_REG_INVARIANTS) -__msg("3: (2d) if r0 > r1 {{.*}} R0_w=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") __msg("frame pointer is read only") __naked void bounds_deduct_two_overlaps(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index 181da86ba5f040..6630a92b1b47e7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -215,7 +215,7 @@ __weak int subprog_untrusted(const volatile struct task_struct *restrict task __ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r1 = {{.*}}; {{.*}}R1_w=trusted_ptr_task_struct()") +__msg("r1 = {{.*}}; {{.*}}R1=trusted_ptr_task_struct()") __msg("Func#1 ('subprog_untrusted') is global and assumed valid.") __msg("Validating subprog_untrusted() func#1...") __msg(": R1=untrusted_ptr_task_struct") @@ -278,7 +278,7 @@ __weak int subprog_enum_untrusted(enum bpf_attach_type *p __arg_untrusted) SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r1 = {{.*}}; {{.*}}R1_w=trusted_ptr_task_struct()") +__msg("r1 = {{.*}}; {{.*}}R1=trusted_ptr_task_struct()") __msg("Func#1 ('subprog_void_untrusted') is global and 
assumed valid.") __msg("Validating subprog_void_untrusted() func#1...") __msg(": R1=rdonly_untrusted_mem(sz=0)") diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index 52edee41caf674..f087ffb79f203c 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -65,7 +65,7 @@ __naked void ldsx_s32(void) SEC("socket") __description("LDSX, S8 range checking, privileged") __log_level(2) __success __retval(1) -__msg("R1_w=scalar(smin=smin32=-128,smax=smax32=127)") +__msg("R1=scalar(smin=smin32=-128,smax=smax32=127)") __naked void ldsx_s8_range_priv(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 73fee2aec6983c..1fe090cd674495 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -144,21 +144,21 @@ SEC("?raw_tp") __success __log_level(2) /* * Without the bug fix there will be no history between "last_idx 3 first_idx 3" - * and "parent state regs=" lines. "R0_w=6" parts are here to help anchor + * and "parent state regs=" lines. "R0=6" parts are here to help anchor * expected log messages to the one specific mark_chain_precision operation. * * This is quite fragile: if verifier checkpointing heuristic changes, this * might need adjusting. */ -__msg("2: (07) r0 += 1 ; R0_w=6") +__msg("2: (07) r0 += 1 ; R0=6") __msg("3: (35) if r0 >= 0xa goto pc+1") __msg("mark_precise: frame0: last_idx 3 first_idx 3 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 2: (07) r0 += 1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (07) r0 += 1") __msg("mark_precise: frame0: regs=r0 stack= before 4: (05) goto pc-4") __msg("mark_precise: frame0: regs=r0 stack= before 3: (35) if r0 >= 0xa goto pc+1") -__msg("mark_precise: frame0: parent state regs= stack=: R0_rw=P4") -__msg("3: R0_w=6") +__msg("mark_precise: frame0: parent state regs= stack=: R0=P4") +__msg("3: R0=6") __naked int state_loop_first_last_equal(void) { asm volatile ( @@ -233,8 +233,8 @@ __naked void bpf_cond_op_not_r10(void) SEC("lsm.s/socket_connect") __success __log_level(2) -__msg("0: (b7) r0 = 1 ; R0_w=1") -__msg("1: (84) w0 = -w0 ; R0_w=0xffffffff") +__msg("0: (b7) r0 = 1 ; R0=1") +__msg("1: (84) w0 = -w0 ; R0=0xffffffff") __msg("mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (84) w0 = -w0") __msg("mark_precise: frame0: regs=r0 stack= before 0: (b7) r0 = 1") @@ -268,8 +268,8 @@ __naked int bpf_neg_3(void) SEC("lsm.s/socket_connect") __success __log_level(2) -__msg("0: (b7) r0 = 1 ; R0_w=1") -__msg("1: (87) r0 = -r0 ; R0_w=-1") +__msg("0: (b7) r0 = 1 ; R0=1") +__msg("1: (87) r0 = -r0 ; R0=-1") __msg("mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (87) r0 = -r0") __msg("mark_precise: frame0: regs=r0 stack= before 0: (b7) r0 = 1") diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index dba3ca728f6e6f..c0ce690ddb68a7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -353,7 +353,7 @@ __flag(BPF_F_TEST_STATE_FREQ) * collect_linked_regs() can't tie more than 6 registers for a single insn. 
*/ __msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") -__msg("9: (bf) r6 = r6 ; R6_w=scalar(id=2") +__msg("9: (bf) r6 = r6 ; R6=scalar(id=2") /* check that r{0-5} are marked precise after 'if' */ __msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") __msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") @@ -779,12 +779,12 @@ __success __retval(0) /* Check that verifier believes r1/r0 are zero at exit */ __log_level(2) -__msg("4: (77) r1 >>= 32 ; R1_w=0") -__msg("5: (bf) r0 = r1 ; R0_w=0 R1_w=0") +__msg("4: (77) r1 >>= 32 ; R1=0") +__msg("5: (bf) r0 = r1 ; R0=0 R1=0") __msg("6: (95) exit") __msg("from 3 to 4") -__msg("4: (77) r1 >>= 32 ; R1_w=0") -__msg("5: (bf) r0 = r1 ; R0_w=0 R1_w=0") +__msg("4: (77) r1 >>= 32 ; R1=0") +__msg("5: (bf) r0 = r1 ; R0=0 R1=0") __msg("6: (95) exit") /* Verify that statements to randomize upper half of r1 had not been * generated. diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 1e5a511e8494af..7a13dbd794b2fb 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -506,17 +506,17 @@ SEC("raw_tp") __log_level(2) __success /* fp-8 is spilled IMPRECISE value zero (represented by a zero value fake reg) */ -__msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8_w=0") +__msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8=0") /* but fp-16 is spilled IMPRECISE zero const reg */ -__msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=0 R10=fp0 fp-16_w=0") +__msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0=0 R10=fp0 fp-16=0") /* validate that assigning R2 from STACK_SPILL with zero value doesn't mark register * precise immediately; if necessary, it will be marked precise later */ -__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8_w=0") +__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2=0 R10=fp0 fp-8=0") /* similarly, when R2 is assigned from spilled register, it is initially * imprecise, but will be marked precise later once it is used in precise context */ -__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2_w=0 R10=fp0 fp-16_w=0") +__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2=0 R10=fp0 fp-16=0") __msg("11: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 11 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 10: (71) r2 = *(u8 *)(r10 -9)") @@ -598,7 +598,7 @@ __log_level(2) __success /* fp-4 is STACK_ZERO */ __msg("2: (62) *(u32 *)(r10 -4) = 0 ; R10=fp0 fp-8=0000????") -__msg("4: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8=0000????") +__msg("4: (71) r2 = *(u8 *)(r10 -1) ; R2=0 R10=fp0 fp-8=0000????") __msg("5: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 4: (71) r2 = *(u8 *)(r10 -1)") @@ -640,25 +640,25 @@ SEC("raw_tp") __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) __success /* make sure fp-8 is IMPRECISE fake register spill */ -__msg("3: (7a) *(u64 *)(r10 -8) = 1 ; R10=fp0 fp-8_w=1") +__msg("3: (7a) *(u64 *)(r10 -8) = 1 ; R10=fp0 fp-8=1") /* and fp-16 is spilled IMPRECISE const reg */ -__msg("5: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=1 R10=fp0 fp-16_w=1") +__msg("5: (7b) *(u64 *)(r10 -16) = r0 ; R0=1 R10=fp0 fp-16=1") /* validate load from fp-8, which was initialized using BPF_ST_MEM */ -__msg("8: (79) r2 = *(u64 *)(r10 -8) ; R2_w=1 R10=fp0 fp-8=1") +__msg("8: (79) r2 = *(u64 *)(r10 -8) ; R2=1 R10=fp0 fp-8=1") __msg("9: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx 
-1") __msg("mark_precise: frame0: regs=r2 stack= before 8: (79) r2 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6") /* note, fp-8 is precise, fp-16 is not yet precise, we'll get there */ -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_rw=P1 fp-16_w=1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (7a) *(u64 *)(r10 -8) = 1") -__msg("10: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ -__msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2_w=1 R10=fp0 fp-16=1") +__msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2=1 R10=fp0 fp-16=1") __msg("13: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 13 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 12: (79) r2 = *(u64 *)(r10 -16)") @@ -668,12 +668,12 @@ __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2") __msg("mark_precise: frame0: regs= stack=-16 before 8: (79) r2 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6") /* now both fp-8 and fp-16 are precise, very good */ -__msg("mark_precise: frame0: parent state regs= stack=-16: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_rw=P1 fp-16_rw=P1") +__msg("mark_precise: frame0: parent state regs= stack=-16: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=P1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") __naked void stack_load_preserves_const_precision(void) { asm volatile ( @@ -719,22 +719,22 @@ __success /* make sure fp-8 is 32-bit FAKE subregister spill */ __msg("3: (62) *(u32 *)(r10 -8) = 1 ; R10=fp0 fp-8=????1") /* but fp-16 is spilled IMPRECISE zero const reg */ -__msg("5: (63) *(u32 *)(r10 -16) = r0 ; R0_w=1 R10=fp0 fp-16=????1") +__msg("5: (63) *(u32 *)(r10 -16) = r0 ; R0=1 R10=fp0 fp-16=????1") /* validate load from fp-8, which was initialized using BPF_ST_MEM */ -__msg("8: (61) r2 = *(u32 *)(r10 -8) ; R2_w=1 R10=fp0 fp-8=????1") +__msg("8: (61) r2 = *(u32 *)(r10 -8) ; R2=1 R10=fp0 fp-8=????1") __msg("9: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 8: (61) r2 = *(u32 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6") -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_r=????P1 fp-16=????1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=1 R1=ctx() 
R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (62) *(u32 *)(r10 -8) = 1") -__msg("10: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ -__msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2_w=1 R10=fp0 fp-16=????1") +__msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2=1 R10=fp0 fp-16=????1") __msg("13: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 13 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 12: (61) r2 = *(u32 *)(r10 -16)") @@ -743,12 +743,12 @@ __msg("mark_precise: frame0: regs= stack=-16 before 10: (73) *(u8 *)(r1 +0) = r2 __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2") __msg("mark_precise: frame0: regs= stack=-16 before 8: (61) r2 = *(u32 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6") -__msg("mark_precise: frame0: parent state regs= stack=-16: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_r=????P1 fp-16_r=????P1") +__msg("mark_precise: frame0: parent state regs= stack=-16: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????P1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") __naked void stack_load_preserves_const_precision_subreg(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 9d415f7ce599b0..ac3e418c2a9616 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -105,7 +105,7 @@ __msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") __msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") __msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit") __msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10") -__msg("7: R0_w=scalar") +__msg("7: R0=scalar") __naked int fp_precise_subprog_result(void) { asm volatile ( @@ -141,7 +141,7 @@ __msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1") * anyways, at which point we'll break precision chain */ __msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10") -__msg("7: R0_w=scalar") +__msg("7: R0=scalar") __naked int sneaky_fp_precise_subprog_result(void) { asm volatile ( @@ -681,7 +681,7 @@ __msg("mark_precise: frame0: last_idx 10 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r7 stack= before 9: (bf) r1 = r8") __msg("mark_precise: frame0: regs=r7 stack= before 8: (27) r7 *= 4") __msg("mark_precise: frame0: regs=r7 stack= before 7: (79) r7 = *(u64 *)(r10 -8)") -__msg("mark_precise: frame0: 
parent state regs= stack=-8: R0_w=2 R6_w=1 R8_rw=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8_rw=P1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=2 R6=1 R8=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8=P1") __msg("mark_precise: frame0: last_idx 18 first_idx 0 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit") __msg("mark_precise: frame1: regs= stack= before 17: (0f) r0 += r2") diff --git a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c index b616575c3b00a5..ce13002c7a199b 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c +++ b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c @@ -93,7 +93,7 @@ .expected_attach_type = BPF_SK_LOOKUP, .result = VERBOSE_ACCEPT, .runs = -1, - .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8_w=-44\ + .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8=-44\ 2: (c5) if r0 s< 0x0 goto pc+2\ - R0_w=-44", + R0=-44", }, From 79f047c7d968b21ff4b72bd70c4533140553c56c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:43 -0700 Subject: [PATCH 2342/3206] bpf: table based bpf_insn_successors() Converting bpf_insn_successors() to use a lookup table makes it ~1.5 times faster. Also remove unnecessary conditionals: - `idx + 1 < prog->len` is unnecessary because after check_cfg() all jump targets are guaranteed to be within the program; - `i == 0 || succ[0] != dst` is unnecessary because any client of bpf_insn_successors() can handle duplicate edges: - compute_live_registers() - compute_scc() Moving bpf_insn_successors() to liveness.c allows its inlining in liveness.c:__update_stack_liveness(). Such inlining speeds up __update_stack_liveness() by ~40%. bpf_insn_successors() is used in both verifier.c and liveness.c. perf shows such a move does not negatively impact its users in verifier.c, as those are executed only once, before the main verification pass, unlike __update_stack_liveness(), which can be triggered multiple times.
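To illustrate (editor's sketch, not part of the patch): a client of bpf_insn_successors() typically drives a worklist and keeps a visited set, so a duplicate edge (for example a conditional jump with offset 0, where fall-through and jump target coincide) is simply skipped on the second visit. The walk_cfg() helper, the fixed-size worklist and the visited bitmap below are hypothetical:

	/* DFS over a program's CFG via bpf_insn_successors().
	 * 'visited' must provide at least prog->len bits; the fixed-size
	 * worklist only keeps the sketch short, real code would size it
	 * to prog->len.
	 */
	static void walk_cfg(struct bpf_prog *prog, unsigned long *visited)
	{
		u32 stack[512], succ[2];
		int top = 0, n, i;

		stack[top++] = 0;
		while (top > 0) {
			u32 insn_idx = stack[--top];

			if (test_and_set_bit(insn_idx, visited))
				continue; /* duplicate edges are dropped here */
			n = bpf_insn_successors(prog, insn_idx, succ);
			for (i = 0; i < n && top < ARRAY_SIZE(stack); i++)
				stack[top++] = succ[i];
		}
	}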
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-10-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/liveness.c | 56 ++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 72 +----------------------------------- 3 files changed, 58 insertions(+), 71 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c7515da8500c56..4c497e839526a4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1049,6 +1049,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st u32 frameno); struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); +int bpf_jmp_offset(struct bpf_insn *insn); int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 6f9dfaaf6e64fa..3c611aba7f52c5 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -433,6 +433,62 @@ static void log_mask_change(struct bpf_verifier_env *env, struct callchain *call bpf_log(&env->log, "\n"); } +int bpf_jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + +__diag_push(); +__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl"); + +inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +{ + static const struct opcode_info { + bool can_jump; + bool can_fallthrough; + } opcode_info_tbl[256] = { + [0 ... 255] = {.can_jump = false, .can_fallthrough = true}, + #define _J(code, ...) \ + [BPF_JMP | code] = __VA_ARGS__, \ + [BPF_JMP32 | code] = __VA_ARGS__ + + _J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}), + _J(BPF_JA, {.can_jump = true, .can_fallthrough = false}), + _J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}), + #undef _J + }; + struct bpf_insn *insn = &prog->insnsi[idx]; + const struct opcode_info *opcode_info; + int i = 0, insn_sz; + + opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)]; + insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; + if (opcode_info->can_fallthrough) + succ[i++] = idx + insn_sz; + + if (opcode_info->can_jump) + succ[i++] = idx + bpf_jmp_offset(insn) + 1; + + return i; +} + +__diag_pop(); + static struct func_instance *get_outer_instance(struct bpf_verifier_env *env, struct func_instance *instance) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e1da2471442b28..1d4183bc3cd1e9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3485,15 +3485,6 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } -static int jmp_offset(struct bpf_insn *insn) -{ - u8 code = insn->code; - - if (code == (BPF_JMP32 | BPF_JA)) - return insn->imm; - return insn->off; -} - static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3520,7 +3511,7 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - off = i + jmp_offset(&insn[i]) + 1; + off = i + bpf_jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -23944,67 +23935,6 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } -static bool can_fallthrough(struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - - if (class != BPF_JMP && class != BPF_JMP32) - return true; - - if (opcode == BPF_EXIT || opcode == BPF_JA) - return false; - - return true; -} - -static bool can_jump(struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - - if (class != BPF_JMP && class != BPF_JMP32) - return false; - - switch (opcode) { - case BPF_JA: - case BPF_JEQ: - case BPF_JNE: - case BPF_JLT: - case BPF_JLE: - case BPF_JGT: - case BPF_JGE: - case BPF_JSGT: - case BPF_JSGE: - case BPF_JSLT: - case BPF_JSLE: - case BPF_JCOND: - case BPF_JSET: - return true; - } - - return false; -} - -int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) -{ - struct bpf_insn *insn = &prog->insnsi[idx]; - int i = 0, insn_sz; - u32 dst; - - insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; - if (can_fallthrough(insn) && idx + 1 < prog->len) - succ[i++] = idx + insn_sz; - - if (can_jump(insn)) { - dst = idx + jmp_offset(insn) + 1; - if (i == 0 || succ[0] != dst) - succ[i++] = dst; - } - - return i; -} - /* Each field is a register bitmask */ struct insn_live_regs { u16 use; /* registers read by instruction */ From 34c513be3dada9fb6314ea8c9ec35d71d09a2e58 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:44 -0700 Subject: [PATCH 2343/3206] selftests/bpf: __not_msg() tag for test_loader framework This patch adds tags __not_msg() and __not_msg_unpriv(). A test fails if the pattern given to __not_msg() is found in the verifier log. If __not_msg() is situated between __msg() tags, the framework matches the __msg() tags first, and then checks that the __not_msg() pattern is not present in the portion of the log between the bracketing __msg() tags. __not_msg() tags bracketed by the same __msg() group are effectively unordered. The idea is borrowed from LLVM's FileCheck with its CHECK-NOT syntax.
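As a usage illustration (editor's sketch, not taken from the patch; the program, the instruction indices and the log strings are invented), a test could pin down that no precision backtracking is logged between two anchor messages:

	SEC("socket")
	__success __log_level(2)
	__msg("0: (b7) r1 = 0")			/* anchor */
	__not_msg("mark_precise: frame0")	/* must not appear between the anchors */
	__msg("2: (95) exit")			/* anchor */
	__naked void no_mark_precise_between_anchors(void)
	{
		asm volatile (
		"r1 = 0;"
		"r0 = 0;"
		"exit;"
		::: __clobber_all);
	}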
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-11-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/prog_tests_framework.c | 125 +++++++++++ tools/testing/selftests/bpf/progs/bpf_misc.h | 9 + tools/testing/selftests/bpf/test_loader.c | 201 +++++++++++++----- tools/testing/selftests/bpf/test_progs.h | 17 ++ 4 files changed, 302 insertions(+), 50 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c b/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c index 14f2796076e0c2..7607cfc2408c2d 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c @@ -54,3 +54,128 @@ void test_prog_tests_framework(void) return; clear_test_state(state); } + +static void dummy_emit(const char *buf, bool force) {} + +void test_prog_tests_framework_expected_msgs(void) +{ + struct expected_msgs msgs; + int i, j, error_cnt; + const struct { + const char *name; + const char *log; + const char *expected; + struct expect_msg *pats; + } cases[] = { + { + .name = "simple-ok", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "simple-fail", + .log = "aaabbbddd", + .expected = "MATCHED SUBSTR: 'aaa'\n" + "EXPECTED SUBSTR: 'ccc'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-ok-mid", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "foo", .negative = true }, + { .substr = "bar", .negative = true }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-ok-tail", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "foo", .negative = true }, + {} + } + }, + { + .name = "negative-ok-head", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "foo", .negative = true }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-fail-head", + .log = "aaabbbccc", + .expected = "UNEXPECTED SUBSTR: 'aaa'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa", .negative = true }, + { .substr = "bbb" }, + {} + } + }, + { + .name = "negative-fail-tail", + .log = "aaabbbccc", + .expected = "UNEXPECTED SUBSTR: 'ccc'\n", + .pats = (struct expect_msg[]) { + { .substr = "bbb" }, + { .substr = "ccc", .negative = true }, + {} + } + }, + { + .name = "negative-fail-mid-1", + .log = "aaabbbccc", + .expected = "UNEXPECTED SUBSTR: 'bbb'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "bbb", .negative = true }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-fail-mid-2", + .log = "aaabbb222ccc", + .expected = "UNEXPECTED SUBSTR: '222'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "222", .negative = true }, + { .substr = "bbb", .negative = true }, + { .substr = "ccc" }, + {} + } + } + }; + + for (i = 0; i < ARRAY_SIZE(cases); i++) { + if (test__start_subtest(cases[i].name)) { + error_cnt = env.subtest_state->error_cnt; + msgs.patterns = cases[i].pats; + msgs.cnt = 0; + for (j = 0; cases[i].pats[j].substr; j++) + msgs.cnt++; + validate_msgs(cases[i].log, &msgs, dummy_emit); + fflush(stderr); + env.subtest_state->error_cnt = error_cnt; + if (cases[i].expected) + ASSERT_HAS_SUBSTR(env.subtest_state->log_buf, cases[i].expected, "expected output"); + else + ASSERT_STREQ(env.subtest_state->log_buf, "", "expected 
no output"); + test__end_subtest(); + } + } +} diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 1004c4a64aafb9..a7a1a684eed116 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -33,7 +33,14 @@ * e.g. "foo{{[0-9]+}}" matches strings like "foo007". * Extended POSIX regular expression syntax is allowed * inside the brackets. + * __not_msg Message not expected to be found in verifier log. + * If __msg_not is situated between __msg tags + * framework matches __msg tags first, and then + * checks that __msg_not is not present in a portion of + * a log between bracketing __msg tags. + * Same regex syntax as for __msg is supported. * __msg_unpriv Same as __msg but for unprivileged mode. + * __not_msg_unpriv Same as __not_msg but for unprivileged mode. * * __stderr Message expected to be found in bpf stderr stream. The * same regex rules apply like __msg. @@ -121,12 +128,14 @@ * __caps_unpriv Specify the capabilities that should be set when running the test. */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) +#define __not_msg(msg) __attribute__((btf_decl_tag("comment:test_expect_not_msg=" XSTR(__COUNTER__) "=" msg))) #define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) #define __jited(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __not_msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_not_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __jited_unpriv(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index e065b467d5090e..74ecc281bb8c12 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -2,7 +2,6 @@ /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ #include #include -#include #include #include @@ -20,10 +19,12 @@ #define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure" #define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" #define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" +#define TEST_TAG_EXPECT_NOT_MSG_PFX "comment:test_expect_not_msg=" #define TEST_TAG_EXPECT_XLATED_PFX "comment:test_expect_xlated=" #define TEST_TAG_EXPECT_FAILURE_UNPRIV "comment:test_expect_failure_unpriv" #define TEST_TAG_EXPECT_SUCCESS_UNPRIV "comment:test_expect_success_unpriv" #define TEST_TAG_EXPECT_MSG_PFX_UNPRIV "comment:test_expect_msg_unpriv=" +#define TEST_TAG_EXPECT_NOT_MSG_PFX_UNPRIV "comment:test_expect_not_msg_unpriv=" #define TEST_TAG_EXPECT_XLATED_PFX_UNPRIV "comment:test_expect_xlated_unpriv=" #define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" #define TEST_TAG_PROG_FLAGS_PFX "comment:test_prog_flags=" @@ -65,18 +66,6 @@ enum load_mode { NO_JITED = 1 << 1, }; -struct expect_msg { - const char *substr; /* substring match */ - regex_t regex; - bool is_regex; - bool on_next_line; -}; - -struct expected_msgs { - struct expect_msg *patterns; - size_t cnt; -}; - struct test_subspec { char *name; bool expect_failure; @@ -216,7 +205,8 @@ static int compile_regex(const char *pattern, regex_t *regex) return 0; } -static int __push_msg(const char *pattern, bool on_next_line, struct expected_msgs *msgs) +static int __push_msg(const char *pattern, bool on_next_line, bool negative, + struct expected_msgs *msgs) { struct expect_msg *msg; void *tmp; @@ -232,6 +222,7 @@ static int __push_msg(const char *pattern, bool on_next_line, struct expected_ms msg = &msgs->patterns[msgs->cnt]; msg->on_next_line = on_next_line; msg->substr = pattern; + msg->negative = negative; msg->is_regex = false; if (strstr(pattern, "{{")) { err = compile_regex(pattern, &msg->regex); @@ -250,16 +241,16 @@ static int clone_msgs(struct expected_msgs *from, struct expected_msgs *to) for (i = 0; i < from->cnt; i++) { msg = &from->patterns[i]; - err = __push_msg(msg->substr, msg->on_next_line, to); + err = __push_msg(msg->substr, msg->on_next_line, msg->negative, to); if (err) return err; } return 0; } -static int push_msg(const char *substr, struct expected_msgs *msgs) +static int push_msg(const char *substr, bool negative, struct expected_msgs *msgs) { - return __push_msg(substr, false, msgs); + return __push_msg(substr, false, negative, msgs); } static int push_disasm_msg(const char *regex_str, bool *on_next_line, struct expected_msgs *msgs) @@ -270,7 +261,7 @@ static int push_disasm_msg(const char *regex_str, bool *on_next_line, struct exp *on_next_line = false; return 0; } - err = __push_msg(regex_str, *on_next_line, msgs); + err = __push_msg(regex_str, *on_next_line, false, msgs); if (err) return err; *on_next_line = true; @@ -482,12 +473,22 @@ static int parse_test_spec(struct test_loader *tester, spec->auxiliary = true; spec->mode_mask |= UNPRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX))) { - err = push_msg(msg, &spec->priv.expect_msgs); + err = push_msg(msg, false, &spec->priv.expect_msgs); + if (err) + goto cleanup; + spec->mode_mask |= PRIV; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_NOT_MSG_PFX))) { + err = push_msg(msg, true, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV))) { - err = push_msg(msg, &spec->unpriv.expect_msgs); + err = push_msg(msg, false, &spec->unpriv.expect_msgs); + if (err) + goto cleanup; + 
spec->mode_mask |= UNPRIV; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_NOT_MSG_PFX_UNPRIV))) { + err = push_msg(msg, true, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; @@ -764,44 +765,141 @@ static void emit_stdout(const char *bpf_stdout, bool force) fprintf(stdout, "STDOUT:\n=============\n%s=============\n", bpf_stdout); } -static void validate_msgs(char *log_buf, struct expected_msgs *msgs, - void (*emit_fn)(const char *buf, bool force)) +static const char *match_msg(struct expect_msg *msg, const char **log) { - const char *log = log_buf, *prev_match; + const char *match = NULL; regmatch_t reg_match[1]; - int prev_match_line; - int match_line; - int i, j, err; + int err; + + if (!msg->is_regex) { + match = strstr(*log, msg->substr); + if (match) + *log = match + strlen(msg->substr); + } else { + err = regexec(&msg->regex, *log, 1, reg_match, 0); + if (err == 0) { + match = *log + reg_match[0].rm_so; + *log += reg_match[0].rm_eo; + } + } + return match; +} + +static int count_lines(const char *start, const char *end) +{ + const char *tmp; + int n = 0; + + for (tmp = start; tmp < end; ++tmp) + if (*tmp == '\n') + n++; + return n; +} + +struct match { + const char *start; + const char *end; + int line; +}; + +/* + * Positive messages are matched sequentially, each next message + * is looked for starting from the end of a previous matched one. + */ +static void match_positive_msgs(const char *log, struct expected_msgs *msgs, struct match *matches) +{ + const char *prev_match; + int i, line; - prev_match_line = -1; - match_line = 0; prev_match = log; + line = 0; + for (i = 0; i < msgs->cnt; i++) { + struct expect_msg *msg = &msgs->patterns[i]; + const char *match = NULL; + + if (msg->negative) + continue; + + match = match_msg(msg, &log); + if (match) { + line += count_lines(prev_match, match); + matches[i].start = match; + matches[i].end = log; + matches[i].line = line; + prev_match = match; + } + } +} + +/* + * Each negative messages N located between positive messages P1 and P2 + * is matched in the span P1.end .. P2.start. Consequently, negative messages + * are unordered within the span. 
+ */ +static void match_negative_msgs(const char *log, struct expected_msgs *msgs, struct match *matches) +{ + const char *start = log, *end, *next, *match; + const char *log_end = log + strlen(log); + int i, j, next_positive; + for (i = 0; i < msgs->cnt; i++) { struct expect_msg *msg = &msgs->patterns[i]; - const char *match = NULL, *pat_status; - bool wrong_line = false; - - if (!msg->is_regex) { - match = strstr(log, msg->substr); - if (match) - log = match + strlen(msg->substr); - } else { - err = regexec(&msg->regex, log, 1, reg_match, 0); - if (err == 0) { - match = log + reg_match[0].rm_so; - log += reg_match[0].rm_eo; + + /* positive message bumps span start */ + if (!msg->negative) { + start = matches[i].end ?: start; + continue; + } + + /* count stride of negative patterns and adjust span end */ + end = log_end; + for (next_positive = i + 1; next_positive < msgs->cnt; next_positive++) { + if (!msgs->patterns[next_positive].negative) { + end = matches[next_positive].start; + break; } } - if (match) { - for (; prev_match < match; ++prev_match) - if (*prev_match == '\n') - ++match_line; - wrong_line = msg->on_next_line && prev_match_line >= 0 && - prev_match_line + 1 != match_line; + /* try matching negative messages within identified span */ + for (j = i; j < next_positive; j++) { + next = start; + match = match_msg(msg, &next); + if (match && next <= end) { + matches[j].start = match; + matches[j].end = next; + } } - if (!match || wrong_line) { + /* -1 to account for i++ */ + i = next_positive - 1; + } +} + +void validate_msgs(const char *log_buf, struct expected_msgs *msgs, + void (*emit_fn)(const char *buf, bool force)) +{ + struct match matches[msgs->cnt]; + struct match *prev_match = NULL; + int i, j; + + memset(matches, 0, sizeof(*matches) * msgs->cnt); + match_positive_msgs(log_buf, msgs, matches); + match_negative_msgs(log_buf, msgs, matches); + + for (i = 0; i < msgs->cnt; i++) { + struct expect_msg *msg = &msgs->patterns[i]; + struct match *match = &matches[i]; + const char *pat_status; + bool unexpected; + bool wrong_line; + bool no_match; + + no_match = !msg->negative && !match->start; + wrong_line = !msg->negative && + msg->on_next_line && + prev_match && prev_match->line + 1 != match->line; + unexpected = msg->negative && match->start; + if (no_match || wrong_line || unexpected) { PRINT_FAIL("expect_msg\n"); if (env.verbosity == VERBOSE_NONE) emit_fn(log_buf, true /*force*/); @@ -811,8 +909,10 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, pat_status = "MATCHED "; else if (wrong_line) pat_status = "WRONG LINE"; - else + else if (no_match) pat_status = "EXPECTED "; + else + pat_status = "UNEXPECTED"; msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", pat_status, @@ -822,12 +922,13 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, if (wrong_line) { fprintf(stderr, "expecting match at line %d, actual match is at line %d\n", - prev_match_line + 1, match_line); + prev_match->line + 1, match->line); } break; } - prev_match_line = match_line; + if (!msg->negative) + prev_match = match; } } diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index df2222a1806fd1..eebfc18cdcd21d 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -546,4 +547,20 @@ extern void test_loader_fini(struct test_loader *tester); test_loader_fini(&tester); \ }) +struct 
expect_msg {
+	const char *substr; /* substring match */
+	regex_t regex;
+	bool is_regex;
+	bool on_next_line;
+	bool negative;
+};
+
+struct expected_msgs {
+	struct expect_msg *patterns;
+	size_t cnt;
+};
+
+void validate_msgs(const char *log_buf, struct expected_msgs *msgs,
+		   void (*emit_fn)(const char *buf, bool force));
+
 #endif /* __TEST_PROGS_H */

From fdcecdff905cf712279d3ba72ec8ac7bc02be7ff Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Thu, 18 Sep 2025 19:18:45 -0700
Subject: [PATCH 2344/3206] selftests/bpf: test cases for callchain sensitive
 live stack tracking

- simple propagation of read/write marks;
- joining read/write marks from conditional branches;
- avoid must_write marks when the same instruction accesses different
  stack offsets on different execution paths;
- avoid must_write marks when the same instruction accesses stack and
  non-stack pointers on different execution paths;
- read/write marks propagation to outer stack frame;
- independent read marks for different callchains ending with the same
  function;
- bpf_calls_callback() dependent logic in
  liveness.c:bpf_stack_slot_alive().

Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-12-c3cd27bacc60@gmail.com
Signed-off-by: Alexei Starovoitov
---
 .../selftests/bpf/prog_tests/verifier.c       |   2 +
 .../selftests/bpf/progs/verifier_live_stack.c | 294 ++++++++++++++++++
 2 files changed, 296 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/verifier_live_stack.c

diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index e35c216dbaf21c..28e81161e6fca9 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -46,6 +46,7 @@
 #include "verifier_ldsx.skel.h"
 #include "verifier_leak_ptr.skel.h"
 #include "verifier_linked_scalars.skel.h"
+#include "verifier_live_stack.skel.h"
 #include "verifier_load_acquire.skel.h"
 #include "verifier_loops1.skel.h"
 #include "verifier_lwt.skel.h"
@@ -184,6 +185,7 @@ void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); }
 void test_verifier_ldsx(void) { RUN(verifier_ldsx); }
 void test_verifier_leak_ptr(void) { RUN(verifier_leak_ptr); }
 void test_verifier_linked_scalars(void) { RUN(verifier_linked_scalars); }
+void test_verifier_live_stack(void) { RUN(verifier_live_stack); }
 void test_verifier_loops1(void) { RUN(verifier_loops1); }
 void test_verifier_lwt(void) { RUN(verifier_lwt); }
 void test_verifier_map_in_map(void) { RUN(verifier_map_in_map); }
diff --git a/tools/testing/selftests/bpf/progs/verifier_live_stack.c b/tools/testing/selftests/bpf/progs/verifier_live_stack.c
new file mode 100644
index 00000000000000..c0e80850926827
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_live_stack.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
*/ + +#include +#include +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, long long); +} map SEC(".maps"); + +SEC("socket") +__log_level(2) +__msg("(0) frame 0 insn 2 +written -8") +__msg("(0) frame 0 insn 1 +live -24") +__msg("(0) frame 0 insn 1 +written -8") +__msg("(0) frame 0 insn 0 +live -8,-24") +__msg("(0) frame 0 insn 0 +written -8") +__msg("(0) live stack update done in 2 iterations") +__naked void simple_read_simple_write(void) +{ + asm volatile ( + "r1 = *(u64 *)(r10 - 8);" + "r2 = *(u64 *)(r10 - 24);" + "*(u64 *)(r10 - 8) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("(0) frame 0 insn 1 +live -8") +__not_msg("(0) frame 0 insn 1 +written") +__msg("(0) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 1 +live -16") +__msg("(0) frame 0 insn 1 +written -32") +__msg("(0) live stack update done in 2 iterations") +__naked void read_write_join(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "if r0 > 42 goto 1f;" + "r0 = *(u64 *)(r10 - 8);" + "*(u64 *)(r10 - 32) = r0;" + "*(u64 *)(r10 - 40) = r0;" + "exit;" +"1:" + "r0 = *(u64 *)(r10 - 16);" + "*(u64 *)(r10 - 32) = r0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("2: (25) if r0 > 0x2a goto pc+1") +__msg("7: (95) exit") +__msg("(0) frame 0 insn 2 +written -16") +__msg("(0) live stack update done in 2 iterations") +__msg("7: (95) exit") +__not_msg("(0) frame 0 insn 2") +__msg("(0) live stack update done in 1 iterations") +__naked void must_write_not_same_slot(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r1 = -8;" + "if r0 > 42 goto 1f;" + "r1 = -16;" +"1:" + "r2 = r10;" + "r2 += r1;" + "*(u64 *)(r2 + 0) = r0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("(0) frame 0 insn 0 +written -8,-16") +__msg("(0) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 0 +written -8") +__msg("(0) live stack update done in 2 iterations") +__naked void must_write_not_same_type(void) +{ + asm volatile ( + "*(u64*)(r10 - 8) = 0;" + "r2 = r10;" + "r2 += -8;" + "r1 = %[map] ll;" + "call %[bpf_map_lookup_elem];" + "if r0 != 0 goto 1f;" + "r0 = r10;" + "r0 += -16;" +"1:" + "*(u64 *)(r0 + 0) = 42;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_map_lookup_elem), + __imm_addr(map) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("(2,4) frame 0 insn 4 +written -8") +__msg("(2,4) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 2 +written -8") +__msg("(0) live stack update done in 2 iterations") +__naked void caller_stack_write(void) +{ + asm volatile ( + "r1 = r10;" + "r1 += -8;" + "call write_first_param;" + "exit;" + ::: __clobber_all); +} + +static __used __naked void write_first_param(void) +{ + asm volatile ( + "*(u64 *)(r1 + 0) = 7;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +/* caller_stack_read() function */ +__msg("2: .12345.... (85) call pc+4") +__msg("5: .12345.... (85) call pc+1") +__msg("6: 0......... (95) exit") +/* read_first_param() function */ +__msg("7: .1........ (79) r0 = *(u64 *)(r1 +0)") +__msg("8: 0......... 
(95) exit") +/* update for callsite at (2) */ +__msg("(2,7) frame 0 insn 7 +live -8") +__msg("(2,7) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 2 +live -8") +__msg("(0) live stack update done in 2 iterations") +/* update for callsite at (5) */ +__msg("(5,7) frame 0 insn 7 +live -16") +__msg("(5,7) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 5 +live -16") +__msg("(0) live stack update done in 2 iterations") +__naked void caller_stack_read(void) +{ + asm volatile ( + "r1 = r10;" + "r1 += -8;" + "call read_first_param;" + "r1 = r10;" + "r1 += -16;" + "call read_first_param;" + "exit;" + ::: __clobber_all); +} + +static __used __naked void read_first_param(void) +{ + asm volatile ( + "r0 = *(u64 *)(r1 + 0);" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__flag(BPF_F_TEST_STATE_FREQ) +__log_level(2) +/* read_first_param2() function */ +__msg(" 9: .1........ (79) r0 = *(u64 *)(r1 +0)") +__msg("10: .......... (b7) r0 = 0") +__msg("11: 0......... (05) goto pc+0") +__msg("12: 0......... (95) exit") +/* + * The purpose of the test is to check that checkpoint in + * read_first_param2() stops path traversal. This will only happen if + * verifier understands that fp[0]-8 at insn (12) is not alive. + */ +__msg("12: safe") +__msg("processed 20 insns") +__naked void caller_stack_pruning(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "if r0 == 42 goto 1f;" + "r0 = %[map] ll;" +"1:" + "*(u64 *)(r10 - 8) = r0;" + "r1 = r10;" + "r1 += -8;" + /* + * fp[0]-8 is either pointer to map or a scalar, + * preventing state pruning at checkpoint created for call. + */ + "call read_first_param2;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm_addr(map) + : __clobber_all); +} + +static __used __naked void read_first_param2(void) +{ + asm volatile ( + "r0 = *(u64 *)(r1 + 0);" + "r0 = 0;" + /* + * Checkpoint at goto +0 should fire, + * as caller stack fp[0]-8 is not alive at this point. + */ + "goto +0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg("R1 type=scalar expected=map_ptr") +__naked void caller_stack_pruning_callback(void) +{ + asm volatile ( + "r0 = %[map] ll;" + "*(u64 *)(r10 - 8) = r0;" + "r1 = 2;" + "r2 = loop_cb ll;" + "r3 = r10;" + "r3 += -8;" + "r4 = 0;" + /* + * fp[0]-8 is either pointer to map or a scalar, + * preventing state pruning at checkpoint created for call. + */ + "call %[bpf_loop];" + "r0 = 42;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_loop), + __imm_addr(map) + : __clobber_all); +} + +static __used __naked void loop_cb(void) +{ + asm volatile ( + /* + * Checkpoint at function entry should not fire, as caller + * stack fp[0]-8 is alive at this point. + */ + "r6 = r2;" + "r1 = *(u64 *)(r6 + 0);" + "*(u64*)(r10 - 8) = 7;" + "r2 = r10;" + "r2 += -8;" + "call %[bpf_map_lookup_elem];" + /* + * This should stop verifier on a second loop iteration, + * but only if verifier correctly maintains that fp[0]-8 + * is still alive. + */ + "*(u64 *)(r6 + 0) = 0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_map_lookup_elem), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* + * Because of a bug in verifier.c:compute_postorder() + * the program below overflowed traversal queue in that function. 
+ */ +SEC("socket") +__naked void syzbot_postorder_bug1(void) +{ + asm volatile ( + "r0 = 0;" + "if r0 != 0 goto -1;" + "exit;" + ::: __clobber_all); +} From 1d4ce63e338fc62f47f22d61cf4b1624caa8cf1c Mon Sep 17 00:00:00 2001 From: Aleksa Paunovic Date: Thu, 24 Jul 2025 17:23:27 +0200 Subject: [PATCH 2345/3206] riscv: Add xmipsexectl instructions Add xmipsexectl instruction opcodes. This includes the MIPS.PAUSE, MIPS.EHB, and MIPS.IHB instructions. Signed-off-by: Aleksa Paunovic Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250724-p8700-pause-v5-3-a6cbbe1c3412@htecgroup.com Signed-off-by: Paul Walmsley --- .../include/asm/vendor_extensions/mips.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/riscv/include/asm/vendor_extensions/mips.h b/arch/riscv/include/asm/vendor_extensions/mips.h index 133e55985d827c..ea8ca747d691df 100644 --- a/arch/riscv/include/asm/vendor_extensions/mips.h +++ b/arch/riscv/include/asm/vendor_extensions/mips.h @@ -15,4 +15,23 @@ struct riscv_isa_vendor_ext_data_list; extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_mips; #endif +/* Extension specific instructions */ + +/* + * All of the xmipsexectl extension instructions are + * ‘hint’ encodings of the SLLI instruction, + * with rd = 0, rs1 = 0 and imm = 1 for IHB, imm = 3 for EHB, + * and imm = 5 for PAUSE. + * MIPS.PAUSE is an alternative opcode which is implemented to have the + * same behavior as PAUSE on some MIPS RISCV cores. + * MIPS.EHB clears all execution hazards before allowing + * any subsequent instructions to execute. + * MIPS.IHB clears all instruction hazards before + * allowing any subsequent instructions to fetch. + */ + +#define MIPS_PAUSE ".4byte 0x00501013\n\t" +#define MIPS_EHB ".4byte 0x00301013\n\t" +#define MIPS_IHB ".4byte 0x00101013\n\t" + #endif // _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H From bb4b0f8a1bcbf8f4e3a0841aaefb3fd580d12fc9 Mon Sep 17 00:00:00 2001 From: Aleksa Paunovic Date: Thu, 24 Jul 2025 17:23:28 +0200 Subject: [PATCH 2346/3206] riscv: hwprobe: Add MIPS vendor extension probing Add a new hwprobe key "RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0" which allows userspace to probe for the new xmipsexectl vendor extension. 
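From userspace, the new key is queried like any other hwprobe key. A
minimal sketch (the helper is hypothetical; a libc exposing
__NR_riscv_hwprobe and the uapi headers introduced by this series are
assumed):

  #include <asm/hwprobe.h>
  #include <asm/vendor/mips.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static int have_xmipsexectl(void)
  {
  	struct riscv_hwprobe pair = {
  		.key = RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0,
  	};

  	/* cpusetsize == 0 and cpus == NULL query all online harts */
  	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
  		return 0;

  	return !!(pair.value & RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL);
  }

As with the other vendor keys, the bit is only reported when every
hart in the queried set implements the extension.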
Signed-off-by: Aleksa Paunovic Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250724-p8700-pause-v5-4-a6cbbe1c3412@htecgroup.com [pjw@kernel.org: fixed some checkpatch issues] Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/hwprobe.h | 3 ++- .../asm/vendor_extensions/mips_hwprobe.h | 22 ++++++++++++++++++ arch/riscv/include/uapi/asm/hwprobe.h | 1 + arch/riscv/include/uapi/asm/vendor/mips.h | 3 +++ arch/riscv/kernel/sys_hwprobe.c | 4 ++++ arch/riscv/kernel/vendor_extensions/Makefile | 1 + .../kernel/vendor_extensions/mips_hwprobe.c | 23 +++++++++++++++++++ 7 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h create mode 100644 arch/riscv/include/uapi/asm/vendor/mips.h create mode 100644 arch/riscv/kernel/vendor_extensions/mips_hwprobe.c diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index 7fe0a379474ae2..948d2b34e94e84 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include -#define RISCV_HWPROBE_MAX_KEY 13 +#define RISCV_HWPROBE_MAX_KEY 14 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { @@ -22,6 +22,7 @@ static inline bool hwprobe_key_is_bitmask(__s64 key) case RISCV_HWPROBE_KEY_IMA_EXT_0: case RISCV_HWPROBE_KEY_CPUPERF_0: case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: + case RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0: case RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0: return true; } diff --git a/arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h b/arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h new file mode 100644 index 00000000000000..e63f664b6b1766 --- /dev/null +++ b/arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 MIPS. + */ + +#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_HWPROBE_H_ +#define _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_HWPROBE_H_ + +#include +#include + +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_MIPS +void hwprobe_isa_vendor_ext_mips_0(struct riscv_hwprobe *pair, const struct cpumask *cpus); +#else +static inline void hwprobe_isa_vendor_ext_mips_0(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + pair->value = 0; +} +#endif + +#endif // _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_HWPROBE_H_ diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index aaf6ad97049931..5d30a4fae37a82 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -106,6 +106,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0 11 #define RISCV_HWPROBE_KEY_ZICBOM_BLOCK_SIZE 12 #define RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0 13 +#define RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0 14 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. 
*/ /* Flags */ diff --git a/arch/riscv/include/uapi/asm/vendor/mips.h b/arch/riscv/include/uapi/asm/vendor/mips.h new file mode 100644 index 00000000000000..e65ab268b26551 --- /dev/null +++ b/arch/riscv/include/uapi/asm/vendor/mips.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#define RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL BIT(0) diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 3e9259790816e4..000f4451a9d873 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -307,6 +308,9 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: hwprobe_isa_vendor_ext_thead_0(pair, cpus); break; + case RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0: + hwprobe_isa_vendor_ext_mips_0(pair, cpus); + break; /* * For forward compatibility, unknown keys don't fail the whole diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile index ccad4ebafb4341..bf116c82b6bdb3 100644 --- a/arch/riscv/kernel/vendor_extensions/Makefile +++ b/arch/riscv/kernel/vendor_extensions/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES) += andes.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_MIPS) += mips.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_MIPS) += mips_hwprobe.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive_hwprobe.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead.o diff --git a/arch/riscv/kernel/vendor_extensions/mips_hwprobe.c b/arch/riscv/kernel/vendor_extensions/mips_hwprobe.c new file mode 100644 index 00000000000000..dc213a2ca70d95 --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/mips_hwprobe.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 MIPS. + */ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +void hwprobe_isa_vendor_ext_mips_0(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + VENDOR_EXTENSION_SUPPORTED(pair, cpus, + riscv_isa_vendor_ext_list_mips.per_hart_isa_bitmap, + { VENDOR_EXT_KEY(XMIPSEXECTL); }); +} From c9a9fc23228f447beefe473224207944521b14a1 Mon Sep 17 00:00:00 2001 From: Aleksa Paunovic Date: Thu, 24 Jul 2025 17:23:29 +0200 Subject: [PATCH 2347/3206] riscv: hwprobe: Document MIPS xmipsexectl vendor extension Document support for MIPS vendor extensions using the key "RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0" and xmipsexectl vendor extension using the key "RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL". Signed-off-by: Aleksa Paunovic Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250724-p8700-pause-v5-5-a6cbbe1c3412@htecgroup.com Signed-off-by: Paul Walmsley --- Documentation/arch/riscv/hwprobe.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst index 2aa9be272d5de1..2f449c9b15bdd6 100644 --- a/Documentation/arch/riscv/hwprobe.rst +++ b/Documentation/arch/riscv/hwprobe.rst @@ -327,6 +327,15 @@ The following keys are defined: * :c:macro:`RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED`: Misaligned vector accesses are not supported at all and will generate a misaligned address fault. +* :c:macro:`RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0`: A bitmask containing the + mips vendor extensions that are compatible with the + :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior. 
+
+  * MIPS
+
+    * :c:macro:`RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL`: The xmipsexectl vendor
+      extension is supported in the MIPS ISA extensions spec.
+
 * :c:macro:`RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0`: A bitmask containing the
   thead vendor extensions that are compatible with the
   :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior.

From 0b0ca959d20689fece038954bbf1d7b14c0b11c3 Mon Sep 17 00:00:00 2001
From: Djordje Todorovic
Date: Thu, 24 Jul 2025 17:23:31 +0200
Subject: [PATCH 2348/3206] riscv: errata: Fix the PAUSE Opcode for MIPS P8700

Add ERRATA_MIPS and ERRATA_MIPS_P8700_PAUSE_OPCODE configs.
Handle errata for the MIPS PAUSE instruction.

Signed-off-by: Djordje Todorovic
Signed-off-by: Aleksandar Rikalo
Signed-off-by: Raj Vishwanathan
Signed-off-by: Aleksa Paunovic
Reviewed-by: Alexandre Ghiti
Link: https://lore.kernel.org/r/20250724-p8700-pause-v5-7-a6cbbe1c3412@htecgroup.com
[pjw@kernel.org: updated to apply and compile; fixed a checkpatch issue]
Signed-off-by: Paul Walmsley
---
 arch/riscv/Kconfig.errata                    | 23 +++++++
 arch/riscv/errata/Makefile                   |  1 +
 arch/riscv/errata/mips/Makefile              |  5 ++
 arch/riscv/errata/mips/errata.c              | 67 ++++++++++++++++++++
 arch/riscv/include/asm/alternative.h         |  3 +
 arch/riscv/include/asm/cmpxchg.h             |  3 +-
 arch/riscv/include/asm/errata_list.h         | 13 +++-
 arch/riscv/include/asm/errata_list_vendors.h |  5 ++
 arch/riscv/include/asm/vdso/processor.h      |  3 +-
 arch/riscv/kernel/alternative.c              |  5 ++
 arch/riscv/kernel/entry.S                    |  1 +
 arch/riscv/mm/init.c                         |  1 +
 12 files changed, 127 insertions(+), 3 deletions(-)
 create mode 100644 arch/riscv/errata/mips/Makefile
 create mode 100644 arch/riscv/errata/mips/errata.c

diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata
index e318119d570de0..aca9b0cfcfecf9 100644
--- a/arch/riscv/Kconfig.errata
+++ b/arch/riscv/Kconfig.errata
@@ -21,6 +21,29 @@ config ERRATA_ANDES_CMO
 
 	  If you don't know what to do here, say "Y".
 
+config ERRATA_MIPS
+	bool "MIPS errata"
+	depends on RISCV_ALTERNATIVE
+	help
+	  All MIPS errata Kconfig depend on this Kconfig. Disabling
+	  this Kconfig will disable all MIPS errata. Please say "Y"
+	  here if your platform uses MIPS CPU cores.
+
+	  Otherwise, please say "N" here to avoid unnecessary overhead.
+
+config ERRATA_MIPS_P8700_PAUSE_OPCODE
+	bool "Fix the PAUSE Opcode for MIPS P8700"
+	depends on ERRATA_MIPS && 64BIT
+	default n
+	help
+	  The RISCV MIPS P8700 uses a different opcode for PAUSE.
+	  It is a 'hint' encoding of the SLLI instruction,
+	  with rd=0, rs1=0 and imm=5. It will behave as a NOP
+	  instruction if no additional behavior beyond that of
+	  SLLI is implemented.
+
+	  If you are not using the P8700 processor, say n.
+
 config ERRATA_SIFIVE
 	bool "SiFive errata"
 	depends on RISCV_ALTERNATIVE
diff --git a/arch/riscv/errata/Makefile b/arch/riscv/errata/Makefile
index bc6c77ba837d2d..02a7a3335b1d55 100644
--- a/arch/riscv/errata/Makefile
+++ b/arch/riscv/errata/Makefile
@@ -13,5 +13,6 @@ endif
 endif
 
 obj-$(CONFIG_ERRATA_ANDES) += andes/
+obj-$(CONFIG_ERRATA_MIPS) += mips/
 obj-$(CONFIG_ERRATA_SIFIVE) += sifive/
 obj-$(CONFIG_ERRATA_THEAD) += thead/
diff --git a/arch/riscv/errata/mips/Makefile b/arch/riscv/errata/mips/Makefile
new file mode 100644
index 00000000000000..6278c389b801ee
--- /dev/null
+++ b/arch/riscv/errata/mips/Makefile
@@ -0,0 +1,5 @@
+ifdef CONFIG_RISCV_ALTERNATIVE_EARLY
+CFLAGS_errata.o := -mcmodel=medany
+endif
+
+obj-y += errata.o
diff --git a/arch/riscv/errata/mips/errata.c b/arch/riscv/errata/mips/errata.c
new file mode 100644
index 00000000000000..e984a8152208c3
--- /dev/null
+++ b/arch/riscv/errata/mips/errata.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 MIPS.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static inline bool errata_probe_pause(void)
+{
+	if (!IS_ENABLED(CONFIG_ERRATA_MIPS_P8700_PAUSE_OPCODE))
+		return false;
+
+	if (!riscv_isa_vendor_extension_available(MIPS_VENDOR_ID, XMIPSEXECTL))
+		return false;
+
+	return true;
+}
+
+static u32 mips_errata_probe(void)
+{
+	u32 cpu_req_errata = 0;
+
+	if (errata_probe_pause())
+		cpu_req_errata |= BIT(ERRATA_MIPS_P8700_PAUSE_OPCODE);
+
+	return cpu_req_errata;
+}
+
+void mips_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
+			    unsigned long archid, unsigned long impid,
+			    unsigned int stage)
+{
+	struct alt_entry *alt;
+	u32 cpu_req_errata = mips_errata_probe();
+	u32 tmp;
+
+	BUILD_BUG_ON(ERRATA_MIPS_NUMBER >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE);
+
+	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
+		return;
+
+	for (alt = begin; alt < end; alt++) {
+		if (alt->vendor_id != MIPS_VENDOR_ID)
+			continue;
+
+		if (alt->patch_id >= ERRATA_MIPS_NUMBER) {
+			WARN(1, "MIPS errata id:%d not in kernel errata list\n",
+			     alt->patch_id);
+			continue;
+		}
+
+		tmp = (1U << alt->patch_id);
+		if (cpu_req_errata & tmp) {
+			mutex_lock(&text_mutex);
+			patch_text_nosync(ALT_OLD_PTR(alt), ALT_ALT_PTR(alt),
+					  alt->alt_len);
+			mutex_unlock(&text_mutex);
+		}
+	}
+}
diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h
index 0e95539ba451ba..8407d1d535b852 100644
--- a/arch/riscv/include/asm/alternative.h
+++ b/arch/riscv/include/asm/alternative.h
@@ -48,6 +48,9 @@ struct alt_entry {
 void andes_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
 			     unsigned long archid, unsigned long impid,
 			     unsigned int stage);
+void mips_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
+			    unsigned long archid, unsigned long impid,
+			    unsigned int stage);
 void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
 			      unsigned long archid, unsigned long impid,
 			      unsigned int stage);
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 0b749e71021624..80bd52363c6869 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include
 
 #define __arch_xchg_masked(sc_sfx, swap_sfx, prepend, sc_append,	\
 			   swap_append, r, p, n)			\
@@ -438,7 +439,7 @@ static __always_inline void __cmpwait(volatile void *ptr,
 		return;
 
 no_zawrs:
-	asm volatile(RISCV_PAUSE : : : "memory");
+	ALT_RISCV_PAUSE();
 }
 
 #define __cmpwait_relaxed(ptr, 
val)								\
diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
index a2481f14b68d56..6694b5ccdcf85c 100644
--- a/arch/riscv/include/asm/errata_list.h
+++ b/arch/riscv/include/asm/errata_list.h
@@ -5,12 +5,12 @@
 #ifndef ASM_ERRATA_LIST_H
 #define ASM_ERRATA_LIST_H
 
-#include
 #include
 #include
 #include
 #include
 #include
+#include
 
 #ifdef __ASSEMBLER__
@@ -42,6 +42,17 @@ asm(ALTERNATIVE("sfence.vma %0, %1", "sfence.vma", SIFIVE_VENDOR_ID,	\
 		ERRATA_SIFIVE_CIP_1200, CONFIG_ERRATA_SIFIVE_CIP_1200)	\
 		: : "r" (addr), "r" (asid) : "memory")
 
+#define ALT_RISCV_PAUSE()						\
+asm(ALTERNATIVE(							\
+	RISCV_PAUSE,			/* Original RISC-V pause insn */ \
+	MIPS_PAUSE,			/* Replacement for MIPS P8700 */ \
+	MIPS_VENDOR_ID,			/* Vendor ID to match */	\
+	ERRATA_MIPS_P8700_PAUSE_OPCODE,	/* patch_id */			\
+	CONFIG_ERRATA_MIPS_P8700_PAUSE_OPCODE)				\
+	: /* no outputs */						\
+	: /* no inputs */						\
+	: "memory")
+
 /*
  * _val is marked as "will be overwritten", so need to set it to 0
  * in the default case.
diff --git a/arch/riscv/include/asm/errata_list_vendors.h b/arch/riscv/include/asm/errata_list_vendors.h
index d448b9ce7c7c58..ec7eba3734371a 100644
--- a/arch/riscv/include/asm/errata_list_vendors.h
+++ b/arch/riscv/include/asm/errata_list_vendors.h
@@ -21,4 +21,9 @@
 #define ERRATA_THEAD_NUMBER 3
 #endif
 
+#ifdef CONFIG_ERRATA_MIPS
+#define ERRATA_MIPS_P8700_PAUSE_OPCODE 0
+#define ERRATA_MIPS_NUMBER 1
+#endif
+
 #endif /* ASM_ERRATA_LIST_VENDORS_H */
diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
index 98fb44336c055f..c42f95dc8811d8 100644
--- a/arch/riscv/include/asm/vdso/processor.h
+++ b/arch/riscv/include/asm/vdso/processor.h
@@ -5,6 +5,7 @@
 #ifndef __ASSEMBLER__
 
 #include
+#include
 #include
 
 static inline void cpu_relax(void)
@@ -19,7 +20,7 @@ static inline void cpu_relax(void)
 	 * Reduce instruction retirement.
 	 * This assumes the PC changes.
 	 */
-	__asm__ __volatile__ (RISCV_PAUSE);
+	ALT_RISCV_PAUSE();
 	barrier();
 }
 
diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c
index 7eb3cb1215c621..7642704c7f1841 100644
--- a/arch/riscv/kernel/alternative.c
+++ b/arch/riscv/kernel/alternative.c
@@ -47,6 +47,11 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info
 		cpu_mfr_info->patch_func = andes_errata_patch_func;
 		break;
 #endif
+#ifdef CONFIG_ERRATA_MIPS
+	case MIPS_VENDOR_ID:
+		cpu_mfr_info->patch_func = mips_errata_patch_func;
+		break;
+#endif
 #ifdef CONFIG_ERRATA_SIFIVE
 	case SIFIVE_VENDOR_ID:
 		cpu_mfr_info->patch_func = sifive_errata_patch_func;
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index d0ded2438533c4..d3d92a4becc726 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -7,6 +7,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 85cb70b10c0715..6091f3f06fa35d 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -23,6 +23,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include

From 1f1cb5d60c76e4ebbdfc5aa673233b305589876d Mon Sep 17 00:00:00 2001
From: Bagas Sanjaya
Date: Fri, 19 Sep 2025 14:43:48 +0700
Subject: [PATCH 2349/3206] Documentation: cgroup-v2: Sync manual toctree
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The manually-arranged toctree comment in the cgroup v2 documentation is
rather out of sync with the actual contents: a few sections are missing
and/or named differently. Sync the toctree.
Signed-off-by: Bagas Sanjaya
Acked-by: Michal Koutný
Signed-off-by: Tejun Heo
---
 Documentation/admin-guide/cgroup-v2.rst | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index a1e3d431974c20..0e6c67ac585a08 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -15,6 +15,9 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgroup-v1>`.

From: Chen Ridong
Date: Fri, 19 Sep 2025 01:12:26 +0000
Subject: [PATCH 2350/3206] cpuset: fix failure to enable isolated partition
 when containing isolcpus

The 'isolcpus' parameter specified at boot time can be assigned to an
isolated partition. While it is valid to put the 'isolcpus' CPUs in an
isolated partition, attempting to change a member cpuset to an isolated
partition will fail if the cpuset contains any 'isolcpus'.

For example, the system boots with 'isolcpus=9', and the following
configuration works correctly:

 # cd /sys/fs/cgroup/
 # mkdir test
 # echo 1 > test/cpuset.cpus
 # echo isolated > test/cpuset.cpus.partition
 # cat test/cpuset.cpus.partition
 isolated
 # echo 9 > test/cpuset.cpus
 # cat test/cpuset.cpus.partition
 isolated
 # cat test/cpuset.cpus
 9

However, the following steps to convert a member cpuset to an isolated
partition will fail:

 # cd /sys/fs/cgroup/
 # mkdir test
 # echo 9 > test/cpuset.cpus
 # echo isolated > test/cpuset.cpus.partition
 # cat test/cpuset.cpus.partition
 isolated invalid (partition config conflicts with housekeeping setup)

The issue occurs because the new partition state (new_prs) is used for
validation against housekeeping constraints before it has been properly
updated. To resolve this, move the assignment of new_prs before the
housekeeping validation check when enabling a root partition.

Fixes: 4a74e418881f ("cgroup/cpuset: Check partition conflict with housekeeping setup")
Signed-off-by: Chen Ridong
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 44231cb1d83fe8..2b7e2f17577ead 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1806,6 +1806,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 		xcpus = tmp->delmask;
 		if (compute_excpus(cs, xcpus))
 			WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
+		new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
 
 		/*
 		 * Enabling partition root is not allowed if its
@@ -1838,7 +1839,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 
 		deleting = true;
 		subparts_delta++;
-		new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
 	} else if (cmd == partcmd_disable) {
 		/*
 		 * May need to add cpus back to parent's effective_cpus

From 59d5de3655698679ad8fd2cc82228de4679c4263 Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Fri, 19 Sep 2025 01:12:27 +0000
Subject: [PATCH 2351/3206] cpuset: Use new excpus for nocpu error check when
 enabling root partition

A previous patch fixed a bug where new_prs should be assigned before
checking housekeeping conflicts. This patch addresses another potential
issue: the nocpu error check currently uses xcpus, which has not been
updated. Although no issue has been observed so far, the check should
be performed using the new effective exclusive cpus.

The comment has been removed because the function returns an error if
nocpu checking fails, which is unrelated to the parent.
Signed-off-by: Chen Ridong
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2b7e2f17577ead..44d65890326a66 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1818,11 +1818,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 		if (prstate_housekeeping_conflict(new_prs, xcpus))
 			return PERR_HKEEPING;
 
-		/*
-		 * A parent can be left with no CPU as long as there is no
-		 * task directly associated with the parent partition.
-		 */
-		if (nocpu)
+		if (tasks_nocpu_error(parent, cs, xcpus))
 			return PERR_NOCPUS;
 

From 51840f7ba393dce7624a759cc4cee8c2bedf9068 Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Fri, 19 Sep 2025 09:49:03 +0000
Subject: [PATCH 2352/3206] cpuset: fix missing error return in update_cpumask

The commit c6366739804f ("cpuset: refactor cpus_allowed_validate_change")
inadvertently removed the error return when cpus_allowed_validate_change()
fails. This patch restores the proper error handling by returning retval
when the validation check fails.

Fixes: c6366739804f ("cpuset: refactor cpus_allowed_validate_change")
Signed-off-by: Chen Ridong
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 44d65890326a66..535174ed712682 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2515,7 +2515,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	compute_trialcs_excpus(trialcs, cs);
 	trialcs->prs_err = PERR_NONE;
 
-	if (cpus_allowed_validate_change(cs, trialcs, &tmp) < 0)
+	retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+	if (retval < 0)
 		goto out_free;
 
 	/*

From 6b696808472197b77b888f50bc789a3bae077743 Mon Sep 17 00:00:00 2001
From: Chen Yufeng
Date: Thu, 11 Sep 2025 23:08:20 +0800
Subject: [PATCH 2353/3206] can: hi311x: fix null pointer dereference when
 resuming from sleep before interface was enabled

This issue is similar to the vulnerability in the `mcp251x` driver,
which was fixed in commit 03c427147b2d ("can: mcp251x: fix resume from
sleep before interface was brought up").

In the `hi311x` driver, when the device resumes from sleep, the driver
schedules `priv->restart_work`. However, if the network interface was
not previously enabled, the `priv->wq` (workqueue) is not allocated and
initialized, leading to a null pointer dereference.

To fix this, we move the allocation and initialization of the workqueue
from the `hi3110_open` function to the `hi3110_can_probe` function. This
ensures that the workqueue is properly initialized before it is used
during device resume. Logic is also added to destroy the workqueue in
the error handling paths of `hi3110_can_probe` and in the
`hi3110_can_remove` function to prevent resource leaks.
Signed-off-by: Chen Yufeng
Link: https://patch.msgid.link/20250911150820.250-1-chenyufeng@iie.ac.cn
Signed-off-by: Marc Kleine-Budde
---
 drivers/net/can/spi/hi311x.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c
index 09ae218315d73d..96bef8f384c4a0 100644
--- a/drivers/net/can/spi/hi311x.c
+++ b/drivers/net/can/spi/hi311x.c
@@ -545,8 +545,6 @@ static int hi3110_stop(struct net_device *net)
 	priv->force_quit = 1;
 	free_irq(spi->irq, priv);
-	destroy_workqueue(priv->wq);
-	priv->wq = NULL;
 
 	mutex_lock(&priv->hi3110_lock);
@@ -770,34 +768,23 @@ static int hi3110_open(struct net_device *net)
 		goto out_close;
 	}
 
-	priv->wq = alloc_workqueue("hi3110_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM,
-				   0);
-	if (!priv->wq) {
-		ret = -ENOMEM;
-		goto out_free_irq;
-	}
-	INIT_WORK(&priv->tx_work, hi3110_tx_work_handler);
-	INIT_WORK(&priv->restart_work, hi3110_restart_work_handler);
-
 	ret = hi3110_hw_reset(spi);
 	if (ret)
-		goto out_free_wq;
+		goto out_free_irq;
 
 	ret = hi3110_setup(net);
 	if (ret)
-		goto out_free_wq;
+		goto out_free_irq;
 
 	ret = hi3110_set_normal_mode(spi);
 	if (ret)
-		goto out_free_wq;
+		goto out_free_irq;
 
 	netif_wake_queue(net);
 	mutex_unlock(&priv->hi3110_lock);
 
 	return 0;
 
- out_free_wq:
-	destroy_workqueue(priv->wq);
 out_free_irq:
 	free_irq(spi->irq, priv);
 	hi3110_hw_sleep(spi);
@@ -908,6 +895,15 @@ static int hi3110_can_probe(struct spi_device *spi)
 	if (ret)
 		goto out_clk;
 
+	priv->wq = alloc_workqueue("hi3110_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM,
+				   0);
+	if (!priv->wq) {
+		ret = -ENOMEM;
+		goto out_clk;
+	}
+	INIT_WORK(&priv->tx_work, hi3110_tx_work_handler);
+	INIT_WORK(&priv->restart_work, hi3110_restart_work_handler);
+
 	priv->spi = spi;
 	mutex_init(&priv->hi3110_lock);
 
@@ -943,6 +939,8 @@ static int hi3110_can_probe(struct spi_device *spi)
 	return 0;
 
 error_probe:
+	destroy_workqueue(priv->wq);
+	priv->wq = NULL;
 	hi3110_power_enable(priv->power, 0);
 
 out_clk:
@@ -963,6 +961,9 @@ static void hi3110_can_remove(struct spi_device *spi)
 
 	hi3110_power_enable(priv->power, 0);
 
+	destroy_workqueue(priv->wq);
+	priv->wq = NULL;
+
 	clk_disable_unprepare(priv->clk);
 
 	free_candev(net);

From 5cff263606a10102a0ea19ff579eaa18fd5577ad Mon Sep 17 00:00:00 2001
From: Duy Nguyen
Date: Thu, 18 Sep 2025 07:03:45 +0000
Subject: [PATCH 2354/3206] can: rcar_canfd: Fix controller mode setting

The driver configures the register that selects the controller mode
before transitioning all channels to reset mode, which leads to
failure. Correct the order of operations so that the mode is set only
after all channels have entered reset mode.
Signed-off-by: Duy Nguyen
Signed-off-by: Tranh Ha
Link: https://patch.msgid.link/TYWPR01MB87434739F83E27EDCD23DF44B416A@TYWPR01MB8743.jpnprd01.prod.outlook.com
Signed-off-by: Marc Kleine-Budde
---
 drivers/net/can/rcar/rcar_canfd.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c
index b3c8c592fb0e04..7e8b1d2f1af651 100644
--- a/drivers/net/can/rcar/rcar_canfd.c
+++ b/drivers/net/can/rcar/rcar_canfd.c
@@ -823,9 +823,6 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv)
 	/* Reset Global error flags */
 	rcar_canfd_write(gpriv->base, RCANFD_GERFL, 0x0);
 
-	/* Set the controller into appropriate mode */
-	rcar_canfd_set_mode(gpriv);
-
 	/* Transition all Channels to reset mode */
 	for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels) {
 		rcar_canfd_clear_bit(gpriv->base,
@@ -844,6 +841,10 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv)
 			return err;
 		}
 	}
+
+	/* Set the controller into appropriate mode */
+	rcar_canfd_set_mode(gpriv);
+
 	return 0;
 }

From 38c0abad45b190a30d8284a37264d2127a6ec303 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol
Date: Thu, 18 Sep 2025 18:00:24 +0900
Subject: [PATCH 2355/3206] can: etas_es58x: populate ndo_change_mtu() to
 prevent buffer overflow

Sending a PF_PACKET allows bypassing the CAN framework logic and
directly reaching the xmit() function of a CAN driver. The only check
which is performed by the PF_PACKET framework is to make sure that
skb->len fits the interface's MTU.

Unfortunately, because the etas_es58x driver does not populate its
net_device_ops->ndo_change_mtu(), it is possible for an attacker to
configure an invalid MTU by doing, for example:

  $ ip link set can0 mtu 9999

After doing so, the attacker could open a PF_PACKET socket using the
ETH_P_CANXL protocol:

  socket(PF_PACKET, SOCK_RAW, htons(ETH_P_CANXL));

to inject malicious CAN XL frames. For example:

  struct canxl_frame frame = {
          .flags = 0xff,
          .len = 2048,
  };

The CAN drivers' xmit() functions call can_dev_dropped_skb() to check
that the skb is valid; unfortunately, under the above conditions, the
malicious packet is able to go through the can_dev_dropped_skb()
checks:

  1. the skb->protocol is set to ETH_P_CANXL which is valid (the
     function does not check the actual device capabilities).

  2. the length is a valid CAN XL length.

And so, es58x_start_xmit() receives a CAN XL frame which it is not
able to handle correctly and will thus misinterpret as a CAN(FD)
frame. This can result in a buffer overflow. For example, using the
es581.4 variant, the frame will be dispatched to es581_4_tx_can_msg(),
go through the last check at the beginning of this function:

  if (can_is_canfd_skb(skb))
          return -EMSGSIZE;

and reach this line:

  memcpy(tx_can_msg->data, cf->data, cf->len);

Here, cf->len corresponds to the flags field of the CAN XL frame. In
our previous example, we set canxl_frame->flags to 0xff. Because the
maximum expected length is 8, a buffer overflow of 247 bytes occurs!

Populate net_device_ops->ndo_change_mtu() to ensure that the
interface's MTU cannot be set to anything bigger than CAN_MTU or
CANFD_MTU (depending on the device capabilities). By fixing the root
cause, this prevents the buffer overflow.
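Put together, the scenario described above boils down to a few
syscalls (an illustrative fragment, not a verbatim reproducer from the
report; the interface name can0 and CAP_NET_ADMIN for the MTU change
are assumptions):

  /* after: ip link set can0 mtu 9999 */
  int s = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_CANXL));
  struct sockaddr_ll sll = {
  	.sll_family   = AF_PACKET,
  	.sll_protocol = htons(ETH_P_CANXL),
  	.sll_ifindex  = if_nametoindex("can0"),
  };
  struct canxl_frame frame = {
  	.flags = 0xff,	/* read back as cf->len by the CAN(FD) xmit path */
  	.len   = 2048,
  };

  bind(s, (struct sockaddr *)&sll, sizeof(sll));
  /* passes the PF_PACKET skb->len <= MTU check once the MTU was raised */
  send(s, &frame, CANXL_HDR_SIZE + frame.len, 0);

With can_change_mtu() wired up, the 'ip link set' step already fails,
so such an oversized write can never reach the driver's xmit()
function.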
Fixes: 8537257874e9 ("can: etas_es58x: add core support for ETAS ES58X CAN USB interfaces")
Signed-off-by: Vincent Mailhol
Link: https://patch.msgid.link/20250918-can-fix-mtu-v1-1-0d1cada9393b@kernel.org
Signed-off-by: Marc Kleine-Budde
---
 drivers/net/can/usb/etas_es58x/es58x_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c
index db1acf6d504cf3..adc91873c083f9 100644
--- a/drivers/net/can/usb/etas_es58x/es58x_core.c
+++ b/drivers/net/can/usb/etas_es58x/es58x_core.c
@@ -7,7 +7,7 @@
  *
  * Copyright (c) 2019 Robert Bosch Engineering and Business Solutions. All rights reserved.
  * Copyright (c) 2020 ETAS K.K.. All rights reserved.
- * Copyright (c) 2020-2022 Vincent Mailhol
+ * Copyright (c) 2020-2025 Vincent Mailhol
 */

 #include
@@ -1977,6 +1977,7 @@ static const struct net_device_ops es58x_netdev_ops = {
 	.ndo_stop = es58x_stop,
 	.ndo_start_xmit = es58x_start_xmit,
 	.ndo_eth_ioctl = can_eth_ioctl_hwts,
+	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops es58x_ethtool_ops = {

From ac1c7656fa717f29fac3ea073af63f0b9919ec9a Mon Sep 17 00:00:00 2001
From: Vincent Mailhol
Date: Thu, 18 Sep 2025 18:00:25 +0900
Subject: [PATCH 2356/3206] can: hi311x: populate ndo_change_mtu() to prevent
 buffer overflow

Sending a PF_PACKET allows bypassing the CAN framework logic and
directly reaching the xmit() function of a CAN driver. The only check
which is performed by the PF_PACKET framework is to make sure that
skb->len fits the interface's MTU.

Unfortunately, because the hi311x driver does not populate its
net_device_ops->ndo_change_mtu(), it is possible for an attacker to
configure an invalid MTU by doing, for example:

  $ ip link set can0 mtu 9999

After doing so, the attacker could open a PF_PACKET socket using the
ETH_P_CANXL protocol:

  socket(PF_PACKET, SOCK_RAW, htons(ETH_P_CANXL))

to inject malicious CAN XL frames. For example:

  struct canxl_frame frame = {
          .flags = 0xff,
          .len = 2048,
  };

The CAN drivers' xmit() functions call can_dev_dropped_skb() to check
that the skb is valid; unfortunately, under the above conditions, the
malicious packet is able to go through the can_dev_dropped_skb()
checks:

  1. the skb->protocol is set to ETH_P_CANXL which is valid (the
     function does not check the actual device capabilities).

  2. the length is a valid CAN XL length.

And so, hi3110_hard_start_xmit() receives a CAN XL frame which it is
not able to handle correctly and will thus misinterpret as a CAN
frame. The driver will consume frame->len as-is with no further
checks.

This can result in a buffer overflow later on in hi3110_hw_tx() on
this line:

  memcpy(buf + HI3110_FIFO_EXT_DATA_OFF, frame->data, frame->len);

Here, frame->len corresponds to the flags field of the CAN XL frame.
In our previous example, we set canxl_frame->flags to 0xff. Because
the maximum expected length is 8, a buffer overflow of 247 bytes
occurs!

Populate net_device_ops->ndo_change_mtu() to ensure that the
interface's MTU cannot be set to anything bigger than CAN_MTU. By
fixing the root cause, this prevents the buffer overflow.
Fixes: 57e83fb9b746 ("can: hi311x: Add Holt HI-311x CAN driver") Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250918-can-fix-mtu-v1-2-0d1cada9393b@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/hi311x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c index 96bef8f384c4a0..963ea8510dd9bf 100644 --- a/drivers/net/can/spi/hi311x.c +++ b/drivers/net/can/spi/hi311x.c @@ -799,6 +799,7 @@ static const struct net_device_ops hi3110_netdev_ops = { .ndo_open = hi3110_open, .ndo_stop = hi3110_stop, .ndo_start_xmit = hi3110_hard_start_xmit, + .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops hi3110_ethtool_ops = { From 61da0bd4102c459823fbe6b8b43b01fb6ace4a22 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 18 Sep 2025 18:00:26 +0900 Subject: [PATCH 2357/3206] can: sun4i_can: populate ndo_change_mtu() to prevent buffer overflow Sending a PF_PACKET allows bypassing the CAN framework logic and directly reaching the xmit() function of a CAN driver. The only check performed by the PF_PACKET framework is to make sure that skb->len fits the interface's MTU. Unfortunately, because the sun4i_can driver does not populate its net_device_ops->ndo_change_mtu(), it is possible for an attacker to configure an invalid MTU by doing, for example: $ ip link set can0 mtu 9999 After doing so, the attacker could open a PF_PACKET socket using the ETH_P_CANXL protocol: socket(PF_PACKET, SOCK_RAW, htons(ETH_P_CANXL)) to inject malicious CAN XL frames. For example: struct canxl_frame frame = { .flags = 0xff, .len = 2048, }; The CAN drivers' xmit() functions call can_dev_dropped_skb() to check that the skb is valid. Unfortunately, under the above conditions, the malicious packet is able to pass the can_dev_dropped_skb() checks: 1. the skb->protocol is set to ETH_P_CANXL which is valid (the function does not check the actual device capabilities). 2. the length is a valid CAN XL length. And so, sun4ican_start_xmit() receives a CAN XL frame which it is not able to correctly handle and thus misinterprets as a CAN frame. This can result in a buffer overflow. The driver will consume cf->len as-is with no further checks on this line: dlc = cf->len; Here, cf->len corresponds to the flags field of the CAN XL frame. In our previous example, we set canxl_frame->flags to 0xff. Because the maximum expected length is 8, a buffer overflow of 247 bytes occurs a couple of lines below when doing: for (i = 0; i < dlc; i++) writel(cf->data[i], priv->base + (dreg + i * 4)); Populate net_device_ops->ndo_change_mtu() to ensure that the interface's MTU cannot be set to anything bigger than CAN_MTU. By fixing the root cause, this prevents the buffer overflow.
Fixes: 0738eff14d81 ("can: Allwinner A10/A20 CAN Controller support - Kernel module") Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250918-can-fix-mtu-v1-3-0d1cada9393b@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/sun4i_can.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index 6fcb301ef611d0..53bfd873de9bde 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -768,6 +768,7 @@ static const struct net_device_ops sun4ican_netdev_ops = { .ndo_open = sun4ican_open, .ndo_stop = sun4ican_close, .ndo_start_xmit = sun4ican_start_xmit, + .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops sun4ican_ethtool_ops = { From 17c8d794527f01def0d1c8b7dc2d7b8d34fed0e6 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 18 Sep 2025 18:00:27 +0900 Subject: [PATCH 2358/3206] can: mcba_usb: populate ndo_change_mtu() to prevent buffer overflow Sending a PF_PACKET allows bypassing the CAN framework logic and directly reaching the xmit() function of a CAN driver. The only check performed by the PF_PACKET framework is to make sure that skb->len fits the interface's MTU. Unfortunately, because the mcba_usb driver does not populate its net_device_ops->ndo_change_mtu(), it is possible for an attacker to configure an invalid MTU by doing, for example: $ ip link set can0 mtu 9999 After doing so, the attacker could open a PF_PACKET socket using the ETH_P_CANXL protocol: socket(PF_PACKET, SOCK_RAW, htons(ETH_P_CANXL)) to inject malicious CAN XL frames. For example: struct canxl_frame frame = { .flags = 0xff, .len = 2048, }; The CAN drivers' xmit() functions call can_dev_dropped_skb() to check that the skb is valid. Unfortunately, under the above conditions, the malicious packet is able to pass the can_dev_dropped_skb() checks: 1. the skb->protocol is set to ETH_P_CANXL which is valid (the function does not check the actual device capabilities). 2. the length is a valid CAN XL length. And so, mcba_usb_start_xmit() receives a CAN XL frame which it is not able to correctly handle and thus misinterprets as a CAN frame. This can result in a buffer overflow. The driver will consume cf->len as-is with no further checks on these lines: usb_msg.dlc = cf->len; memcpy(usb_msg.data, cf->data, usb_msg.dlc); Here, cf->len corresponds to the flags field of the CAN XL frame. In our previous example, we set canxl_frame->flags to 0xff. Because the maximum expected length is 8, a buffer overflow of 247 bytes occurs! Populate net_device_ops->ndo_change_mtu() to ensure that the interface's MTU cannot be set to anything bigger than CAN_MTU. By fixing the root cause, this prevents the buffer overflow.
Fixes: 51f3baad7de9 ("can: mcba_usb: Add support for Microchip CAN BUS Analyzer") Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250918-can-fix-mtu-v1-4-0d1cada9393b@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/mcba_usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index 41c0a1c399bf36..1f9b915094e64d 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -761,6 +761,7 @@ static const struct net_device_ops mcba_netdev_ops = { .ndo_open = mcba_usb_open, .ndo_stop = mcba_usb_close, .ndo_start_xmit = mcba_usb_start_xmit, + .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops mcba_ethtool_ops = { From c443be70aaee42c2d1d251e0329e0a69dd96ae54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Grosjean?= Date: Thu, 18 Sep 2025 15:23:57 +0200 Subject: [PATCH 2359/3206] can: peak_usb: fix shift-out-of-bounds issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explicitly use a 64-bit constant when the number of bits to shift by is 32 (which is the case for the PC CAN FD interfaces supported by this driver); shifting a 32-bit value by 32 or more bits is undefined behavior. Signed-off-by: Stéphane Grosjean Link: https://patch.msgid.link/20250918132413.30071-1-stephane.grosjean@free.fr Reported-by: Marc Kleine-Budde Closes: https://lore.kernel.org/20250917-aboriginal-refined-honeybee-82b1aa-mkl@pengutronix.de Fixes: bb4785551f64 ("can: usb: PEAK-System Technik USB adapters driver core") [mkl: update subject, apply manually] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/peak_usb/pcan_usb_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c index 117637b9b995b9..dd5caa1c302b99 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c @@ -111,7 +111,7 @@ void peak_usb_update_ts_now(struct peak_time_ref *time_ref, u32 ts_now) u32 delta_ts = time_ref->ts_dev_2 - time_ref->ts_dev_1; if (time_ref->ts_dev_2 < time_ref->ts_dev_1) - delta_ts &= (1 << time_ref->adapter->ts_used_bits) - 1; + delta_ts &= (1ULL << time_ref->adapter->ts_used_bits) - 1; time_ref->ts_total += delta_ts; } From 3df6979d222b8638a60aa6921b73cb5f3e4f5dcb Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 17 Sep 2025 12:02:10 -0700 Subject: [PATCH 2360/3206] arm64: mm: split linear mapping if BBML2 unsupported on secondary CPUs The kernel linear mapping is painted at a very early stage of system boot, before the cpufeature framework has been finalized. So the linear mapping is determined by the capability of the boot CPU only. If the boot CPU supports BBML2, large block mappings will be used for the linear mapping. But the secondary CPUs may not support BBML2, so once cpufeature is finalized on all CPUs, repaint the linear mapping if large block mappings are used and the secondary CPUs don't support BBML2. If the boot CPU doesn't support BBML2, or the secondary CPUs have the same BBML2 capability as the boot CPU, repainting the linear mapping is not needed. Repainting is implemented by the boot CPU, which we know supports BBML2, so it is safe for the live mapping size to change for this CPU. The linear map region is walked using the pagewalk API and any discovered large leaf mappings are split to pte mappings using the existing helper functions.
Since the repainting is performed inside of a stop_machine(), we must use GFP_ATOMIC to allocate the extra intermediate pgtables. But since we are still early in boot, it is expected that there is plenty of memory available so we will never need to sleep for reclaim, and so GFP_ATOMIC is acceptable here. The secondary CPUs are all put into a waiting area with the idmap in TTBR0 and reserved map in TTBR1 while this is performed since they cannot be allowed to observe any size changes on the live mappings. Some of this infrastructure is reused from the kpti case. Specifically we share the same flag (was __idmap_kpti_flag, now idmap_kpti_bbml2_flag) since it means we don't have to reserve any extra pgtable memory to idmap the extra flag. Co-developed-by: Yang Shi Signed-off-by: Yang Shi Signed-off-by: Ryan Roberts Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu.h | 2 + arch/arm64/kernel/cpufeature.c | 3 + arch/arm64/mm/mmu.c | 182 +++++++++++++++++++++++++++++---- arch/arm64/mm/proc.S | 27 +++-- 4 files changed, 187 insertions(+), 27 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 56fca81f60ade2..2acfa7801d02d5 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -72,6 +72,8 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot); extern void mark_linear_text_alias_ro(void); extern int split_kernel_leaf_mapping(unsigned long start, unsigned long end); +extern void init_idmap_kpti_bbml2_flag(void); +extern void linear_map_maybe_split_to_ptes(void); /* * This check is triggered during the early boot before the cpufeature diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e15472beff3f84..3780f343fd2c64 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include #include @@ -2027,6 +2028,7 @@ static void __init kpti_install_ng_mappings(void) if (arm64_use_ng_mappings) return; + init_idmap_kpti_bbml2_flag(); stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask); } @@ -3928,6 +3930,7 @@ void __init setup_system_features(void) { setup_system_capabilities(); + linear_map_maybe_split_to_ptes(); kpti_install_ng_mappings(); sve_setup(); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a7b29daf1a38f1..431ed90914bb48 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include @@ -483,11 +485,11 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, #define INVALID_PHYS_ADDR (-1ULL) -static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, +static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, enum pgtable_type pgtable_type) { /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. 
*/ - struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0); + struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0); phys_addr_t pa; if (!ptdesc) @@ -514,9 +516,9 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, } static phys_addr_t -try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) +try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type, gfp_t gfp) { - return __pgd_pgtable_alloc(&init_mm, pgtable_type); + return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type); } static phys_addr_t __maybe_unused @@ -524,7 +526,7 @@ pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) { phys_addr_t pa; - pa = __pgd_pgtable_alloc(&init_mm, pgtable_type); + pa = __pgd_pgtable_alloc(&init_mm, GFP_PGTABLE_KERNEL, pgtable_type); BUG_ON(pa == INVALID_PHYS_ADDR); return pa; } @@ -534,7 +536,7 @@ pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) { phys_addr_t pa; - pa = __pgd_pgtable_alloc(NULL, pgtable_type); + pa = __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type); BUG_ON(pa == INVALID_PHYS_ADDR); return pa; } @@ -548,7 +550,7 @@ static void split_contpte(pte_t *ptep) __set_pte(ptep, pte_mknoncont(__ptep_get(ptep))); } -static int split_pmd(pmd_t *pmdp, pmd_t pmd) +static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont) { pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; unsigned long pfn = pmd_pfn(pmd); @@ -557,7 +559,7 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd) pte_t *ptep; int i; - pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE); + pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE, gfp); if (pte_phys == INVALID_PHYS_ADDR) return -ENOMEM; ptep = (pte_t *)phys_to_virt(pte_phys); @@ -566,7 +568,9 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd) tableprot |= PMD_TABLE_PXN; prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE); - prot = __pgprot(pgprot_val(prot) | PTE_CONT); + prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); + if (to_cont) + prot = __pgprot(pgprot_val(prot) | PTE_CONT); for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++) __set_pte(ptep, pfn_pte(pfn, prot)); @@ -590,7 +594,7 @@ static void split_contpmd(pmd_t *pmdp) set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp))); } -static int split_pud(pud_t *pudp, pud_t pud) +static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) { pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; unsigned int step = PMD_SIZE >> PAGE_SHIFT; @@ -600,7 +604,7 @@ static int split_pud(pud_t *pudp, pud_t pud) pmd_t *pmdp; int i; - pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD); + pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD, gfp); if (pmd_phys == INVALID_PHYS_ADDR) return -ENOMEM; pmdp = (pmd_t *)phys_to_virt(pmd_phys); @@ -609,7 +613,9 @@ static int split_pud(pud_t *pudp, pud_t pud) tableprot |= PUD_TABLE_PXN; prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); - prot = __pgprot(pgprot_val(prot) | PTE_CONT); + prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); + if (to_cont) + prot = __pgprot(pgprot_val(prot) | PTE_CONT); for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step) set_pmd(pmdp, pfn_pmd(pfn, prot)); @@ -667,7 +673,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr) if (!pud_present(pud)) goto out; if (pud_leaf(pud)) { - ret = split_pud(pudp, pud); + ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true); if (ret) goto out; } @@ -692,7 +698,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr) */ if (ALIGN_DOWN(addr, PMD_SIZE) == addr) goto out; - 
ret = split_pmd(pmdp, pmd); + ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true); if (ret) goto out; } @@ -765,6 +771,138 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) return ret; } +static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr, + unsigned long next, + struct mm_walk *walk) +{ + pud_t pud = pudp_get(pudp); + int ret = 0; + + if (pud_leaf(pud)) + ret = split_pud(pudp, pud, GFP_ATOMIC, false); + + return ret; +} + +static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, + unsigned long next, + struct mm_walk *walk) +{ + pmd_t pmd = pmdp_get(pmdp); + int ret = 0; + + if (pmd_leaf(pmd)) { + if (pmd_cont(pmd)) + split_contpmd(pmdp); + ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false); + + /* + * We have split the pmd directly to ptes so there is no need to + * visit each pte to check if they are contpte. + */ + walk->action = ACTION_CONTINUE; + } + + return ret; +} + +static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, + unsigned long next, + struct mm_walk *walk) +{ + pte_t pte = __ptep_get(ptep); + + if (pte_cont(pte)) + split_contpte(ptep); + + return 0; +} + +static const struct mm_walk_ops split_to_ptes_ops __initconst = { + .pud_entry = split_to_ptes_pud_entry, + .pmd_entry = split_to_ptes_pmd_entry, + .pte_entry = split_to_ptes_pte_entry, +}; + +static bool linear_map_requires_bbml2 __initdata; + +u32 idmap_kpti_bbml2_flag; + +void __init init_idmap_kpti_bbml2_flag(void) +{ + WRITE_ONCE(idmap_kpti_bbml2_flag, 1); + /* Must be visible to other CPUs before stop_machine() is called. */ + smp_mb(); +} + +static int __init linear_map_split_to_ptes(void *__unused) +{ + /* + * Repainting the linear map must be done by CPU0 (the boot CPU) because + * that's the only CPU that we know supports BBML2. The other CPUs will + * be held in a waiting area with the idmap active. + */ + if (!smp_processor_id()) { + unsigned long lstart = _PAGE_OFFSET(vabits_actual); + unsigned long lend = PAGE_END; + unsigned long kstart = (unsigned long)lm_alias(_stext); + unsigned long kend = (unsigned long)lm_alias(__init_begin); + int ret; + + /* + * Wait for all secondary CPUs to be put into the waiting area. + */ + smp_cond_load_acquire(&idmap_kpti_bbml2_flag, VAL == num_online_cpus()); + + /* + * Walk all of the linear map [lstart, lend), except the kernel + * linear map alias [kstart, kend), and split all mappings to + * PTE. The kernel alias remains static throughout runtime so + * can continue to be safely mapped with large mappings. + */ + ret = walk_kernel_page_table_range_lockless(lstart, kstart, + &split_to_ptes_ops, NULL, NULL); + if (!ret) + ret = walk_kernel_page_table_range_lockless(kend, lend, + &split_to_ptes_ops, NULL, NULL); + if (ret) + panic("Failed to split linear map\n"); + flush_tlb_kernel_range(lstart, lend); + + /* + * Relies on dsb in flush_tlb_kernel_range() to avoid reordering + * before any page table split operations. + */ + WRITE_ONCE(idmap_kpti_bbml2_flag, 0); + } else { + typedef void (wait_split_fn)(void); + extern wait_split_fn wait_linear_map_split_to_ptes; + wait_split_fn *wait_fn; + + wait_fn = (void *)__pa_symbol(wait_linear_map_split_to_ptes); + + /* + * At least one secondary CPU doesn't support BBML2 so cannot + * tolerate the size of the live mappings changing. So have the + * secondary CPUs wait for the boot CPU to make the changes + * with the idmap active and init_mm inactive. 
+ */ + cpu_install_idmap(); + wait_fn(); + cpu_uninstall_idmap(); + } + + return 0; +} + +void __init linear_map_maybe_split_to_ptes(void) +{ + if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) { + init_idmap_kpti_bbml2_flag(); + stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask); + } +} + /* * This function can only be used to modify existing table entries, * without allocating new levels of table. Note that this permits the @@ -919,6 +1057,8 @@ static void __init map_mem(pgd_t *pgdp) early_kfence_pool = arm64_kfence_alloc_pool(); + linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map(); + if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; @@ -1053,7 +1193,7 @@ void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, u64 va_offset); static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, - kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; + kpti_bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; static void __init create_idmap(void) { @@ -1065,15 +1205,17 @@ static void __init create_idmap(void) IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); - if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) { - extern u32 __idmap_kpti_flag; - phys_addr_t pa = __pa_symbol(&__idmap_kpti_flag); + if (linear_map_requires_bbml2 || + (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings)) { + phys_addr_t pa = __pa_symbol(&idmap_kpti_bbml2_flag); /* * The KPTI G-to-nG conversion code needs a read-write mapping - * of its synchronization flag in the ID map. + * of its synchronization flag in the ID map. This is also used + * when splitting the linear map to ptes if a secondary CPU + * doesn't support bbml2. */ - ptep = __pa_symbol(kpti_ptes); + ptep = __pa_symbol(kpti_bbml2_ptes); __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 8c75965afc9e59..86818511962b61 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -245,10 +245,6 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1) * * Called exactly once from stop_machine context by each CPU found during boot. 
*/ - .pushsection ".data", "aw", %progbits -SYM_DATA(__idmap_kpti_flag, .long 1) - .popsection - SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) cpu .req w0 temp_pte .req x0 @@ -273,7 +269,7 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) mov x5, x3 // preserve temp_pte arg mrs swapper_ttb, ttbr1_el1 - adr_l flag_ptr, __idmap_kpti_flag + adr_l flag_ptr, idmap_kpti_bbml2_flag cbnz cpu, __idmap_kpti_secondary @@ -416,7 +412,25 @@ alternative_else_nop_endif __idmap_kpti_secondary: /* Uninstall swapper before surgery begins */ __idmap_cpu_set_reserved_ttbr1 x16, x17 + b scondary_cpu_wait + + .unreq swapper_ttb + .unreq flag_ptr +SYM_FUNC_END(idmap_kpti_install_ng_mappings) + .popsection +#endif + + .pushsection ".idmap.text", "a" +SYM_TYPED_FUNC_START(wait_linear_map_split_to_ptes) + /* Must be same registers as in idmap_kpti_install_ng_mappings */ + swapper_ttb .req x3 + flag_ptr .req x4 + + mrs swapper_ttb, ttbr1_el1 + adr_l flag_ptr, idmap_kpti_bbml2_flag + __idmap_cpu_set_reserved_ttbr1 x16, x17 +scondary_cpu_wait: /* Increment the flag to let the boot CPU we're ready */ 1: ldxr w16, [flag_ptr] add w16, w16, #1 @@ -436,9 +450,8 @@ __idmap_kpti_secondary: .unreq swapper_ttb .unreq flag_ptr -SYM_FUNC_END(idmap_kpti_install_ng_mappings) +SYM_FUNC_END(wait_linear_map_split_to_ptes) .popsection -#endif /* * __cpu_setup From 32278c677947ae2f042c9535674a7fff9a245dd3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Aug 2025 10:23:56 -0700 Subject: [PATCH 2361/3206] x86/umip: Check that the instruction opcode is at least two bytes When checking for a potential UMIP violation on #GP, verify the decoder found at least two opcode bytes to avoid false positives when the kernel encounters an unknown instruction that starts with 0f. Because the array of opcode.bytes is zero-initialized by insn_init(), peeking at bytes[1] will misinterpret garbage as a potential SLDT or STR instruction, and can incorrectly trigger emulation. E.g. if a VPALIGNR instruction 62 83 c5 05 0f 08 ff vpalignr xmm17{k5},xmm23,XMMWORD PTR [r8],0xff hits a #GP, the kernel emulates it as STR and squashes the #GP (and corrupts the userspace code stream). Arguably the check should look for exactly two bytes, but no three byte opcodes use '0f 00 xx' or '0f 01 xx' as an escape, i.e. it should be impossible to get a false positive if the first two opcode bytes match '0f 00' or '0f 01'. Go with a more conservative check with respect to the existing code to minimize the chances of breaking userspace, e.g. due to decoder weirdness. Analyzed by Nick Bray . Fixes: 1e5db223696a ("x86/umip: Add emulation code for UMIP instructions") Reported-by: Dan Snyder Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov (AMD) Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- arch/x86/kernel/umip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 5a4b21389b1d98..406ac01ce16dc0 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -156,8 +156,8 @@ static int identify_insn(struct insn *insn) if (!insn->modrm.nbytes) return -EINVAL; - /* All the instructions of interest start with 0x0f. */ - if (insn->opcode.bytes[0] != 0xf) + /* The instructions of interest have 2-byte opcodes: 0F 00 or 0F 01. 
*/ + if (insn->opcode.nbytes < 2 || insn->opcode.bytes[0] != 0xf) return -EINVAL; if (insn->opcode.bytes[1] == 0x1) { From 27b1fd62012dfe9d3eb8ecde344d7aa673695ecf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Aug 2025 10:23:57 -0700 Subject: [PATCH 2362/3206] x86/umip: Fix decoding of register forms of 0F 01 (SGDT and SIDT aliases) Filter out the register forms of 0F 01 when determining whether or not to emulate in response to a potential UMIP violation #GP, as SGDT and SIDT only accept memory operands. The register variants of 0F 01 are used to encode instructions for things like VMX and SGX, i.e. not checking the Mod field would cause the kernel to incorrectly emulate on #GP, e.g. due to a CPL violation on VMLAUNCH. Fixes: 1e5db223696a ("x86/umip: Add emulation code for UMIP instructions") Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov (AMD) Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- arch/x86/kernel/umip.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 406ac01ce16dc0..d432f3824f0c29 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -163,8 +163,19 @@ static int identify_insn(struct insn *insn) if (insn->opcode.bytes[1] == 0x1) { switch (X86_MODRM_REG(insn->modrm.value)) { case 0: + /* The reg form of 0F 01 /0 encodes VMX instructions. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SGDT; case 1: + /* + * The reg form of 0F 01 /1 encodes MONITOR/MWAIT, + * STAC/CLAC, and ENCLS. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SIDT; case 4: return UMIP_INST_SMSW; From 8a1b5d412cb405df402cdc59135655788fc59d0f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Sep 2025 23:09:17 +0200 Subject: [PATCH 2363/3206] ACPI: processor: Update cpuidle driver check in __acpi_processor_start() Commit 7a8c994cbb2d ("ACPI: processor: idle: Optimize ACPI idle driver registration") moved the ACPI idle driver registration to acpi_processor_driver_init() and acpi_processor_power_init() does not register an idle driver any more. Accordingly, the cpuidle driver check in __acpi_processor_start() needs to be updated to avoid calling acpi_processor_power_init() without a cpuidle driver, in which case the registration of the cpuidle device in that function would lead to a NULL pointer dereference in __cpuidle_register_device(). Fixes: 7a8c994cbb2d ("ACPI: processor: idle: Optimize ACPI idle driver registration") Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/5044465.31r3eYUQgx@rafael.j.wysocki [ rjw: Changelog update ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/processor_driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index bc9f58a02c1db5..de17c141267809 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -166,7 +166,7 @@ static int __acpi_processor_start(struct acpi_device *device) if (result && !IS_ENABLED(CONFIG_ACPI_CPU_FREQ_PSS)) dev_dbg(&device->dev, "CPPC data invalid or not present\n"); - if (!cpuidle_get_driver() || cpuidle_get_driver() == &acpi_idle_driver) + if (cpuidle_get_driver() == &acpi_idle_driver) acpi_processor_power_init(pr); acpi_pss_perf_init(pr); From fbd401e95e569ad0307e4301012f2d8e1ec1ee98 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 18 Sep 2025 23:10:31 +0200 Subject: [PATCH 2364/3206] ACPI: processor: idle: Redefine two functions as void Notice that acpi_processor_power_init() and acpi_processor_power_exit() don't need to return any values because their callers don't check them anyway, so redefine those functions as void. While at it, rearrange the code in acpi_processor_power_init() to reduce the indentation level, get rid of a redundant local variable in that function, and rephrase a code comment in it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) --- drivers/acpi/processor_idle.c | 41 ++++++++++++++++------------------- include/acpi/processor.h | 4 ++-- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5dacf41d7cc0a0..698d14c1958737 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -1400,47 +1400,45 @@ void acpi_processor_unregister_idle_driver(void) cpuidle_unregister_driver(&acpi_idle_driver); } -int acpi_processor_power_init(struct acpi_processor *pr) +void acpi_processor_power_init(struct acpi_processor *pr) { - int retval; struct cpuidle_device *dev; if (disabled_by_idle_boot_param()) - return 0; + return; acpi_processor_cstate_first_run_checks(); if (!acpi_processor_get_power_info(pr)) pr->flags.power_setup_done = 1; - if (pr->flags.power) { - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - per_cpu(acpi_cpuidle_device, pr->id) = dev; + if (!pr->flags.power) + return; - acpi_processor_setup_cpuidle_dev(pr, dev); + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return; - /* Register per-cpu cpuidle_device. Cpuidle driver - * must already be registered before registering device - */ - retval = cpuidle_register_device(dev); - if (retval) { + per_cpu(acpi_cpuidle_device, pr->id) = dev; - per_cpu(acpi_cpuidle_device, pr->id) = NULL; - kfree(dev); - return retval; - } + acpi_processor_setup_cpuidle_dev(pr, dev); + + /* + * Register a cpuidle device for this CPU. The cpuidle driver using + * this device is expected to be registered. 
+ */ + if (cpuidle_register_device(dev)) { + per_cpu(acpi_cpuidle_device, pr->id) = NULL; + kfree(dev); } - return 0; } -int acpi_processor_power_exit(struct acpi_processor *pr) +void acpi_processor_power_exit(struct acpi_processor *pr) { struct cpuidle_device *dev = per_cpu(acpi_cpuidle_device, pr->id); if (disabled_by_idle_boot_param()) - return 0; + return; if (pr->flags.power) { cpuidle_unregister_device(dev); @@ -1448,7 +1446,6 @@ int acpi_processor_power_exit(struct acpi_processor *pr) } pr->flags.power_setup_done = 0; - return 0; } MODULE_IMPORT_NS("ACPI_PROCESSOR_IDLE"); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 6ee4a69412de67..24fdaa3c28992e 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -419,8 +419,8 @@ static inline void acpi_processor_throttling_init(void) {} /* in processor_idle.c */ extern struct cpuidle_driver acpi_idle_driver; #ifdef CONFIG_ACPI_PROCESSOR_IDLE -int acpi_processor_power_init(struct acpi_processor *pr); -int acpi_processor_power_exit(struct acpi_processor *pr); +void acpi_processor_power_init(struct acpi_processor *pr); +void acpi_processor_power_exit(struct acpi_processor *pr); int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); void acpi_processor_register_idle_driver(void); From 46c435cbaf5591a349c339e080ac2a228aa99649 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 11 Sep 2025 14:02:33 +0200 Subject: [PATCH 2365/3206] cpufreq: intel_pstate: Enable HWP without EPP if DEC is enabled So far, HWP has never been enabled without EPP (Energy-Performance Preference) interface support, since the lack of the latter indicates an incomplete implementation of HWP, which was the case on early development vehicle platforms. However, HWP can be expected to work if DEC (Dynamic Efficiency Control) is enabled as indicated by setting bit 27 in MSR_IA32_POWER_CTL (DEC enable bit). Accordingly, allow HWP to be enabled if the EPP interface is not supported so long as DEC is enabled in the processor. Still, the EPP control sysfs interface is useless when EPP is not supported, so do not expose it in that case. Link: https://lore.kernel.org/linux-pm/20250904000608.260817-2-srinivas.pandruvada@linux.intel.com/ Signed-off-by: Srinivas Pandruvada Co-developed-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 72 ++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 20746c35d3ba27..285051e4a5dd86 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -897,11 +897,19 @@ static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf) cpufreq_freq_attr_ro(base_frequency); +enum hwp_cpufreq_attr_index { + HWP_BASE_FREQUENCY_INDEX = 0, + HWP_PERFORMANCE_PREFERENCE_INDEX, + HWP_PERFORMANCE_AVAILABLE_PREFERENCES_INDEX, + HWP_CPUFREQ_ATTR_COUNT, +}; + static struct freq_attr *hwp_cpufreq_attrs[] = { - &energy_performance_preference, - &energy_performance_available_preferences, - &base_frequency, - NULL, + [HWP_BASE_FREQUENCY_INDEX] = &base_frequency, + [HWP_PERFORMANCE_PREFERENCE_INDEX] = &energy_performance_preference, + [HWP_PERFORMANCE_AVAILABLE_PREFERENCES_INDEX] = + &energy_performance_available_preferences, + [HWP_CPUFREQ_ATTR_COUNT] = NULL, }; static bool no_cas __ro_after_init; @@ -1370,6 +1378,9 @@ static void intel_pstate_hwp_offline(struct cpudata *cpu) #define POWER_CTL_EE_ENABLE 1 #define POWER_CTL_EE_DISABLE 2 +/* Enable bit for Dynamic Efficiency Control (DEC) */ +#define POWER_CTL_DEC_ENABLE 27 + static int power_ctl_ee_state; static void set_power_ctl_ee_state(bool input) @@ -3758,6 +3769,26 @@ static const struct x86_cpu_id intel_hybrid_scaling_factor[] = { {} }; +static bool hwp_check_epp(void) +{ + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) + return true; + + /* Without EPP support, don't expose EPP-related sysfs attributes. */ + hwp_cpufreq_attrs[HWP_PERFORMANCE_PREFERENCE_INDEX] = NULL; + hwp_cpufreq_attrs[HWP_PERFORMANCE_AVAILABLE_PREFERENCES_INDEX] = NULL; + + return false; +} + +static bool hwp_check_dec(void) +{ + u64 power_ctl; + + rdmsrq(MSR_IA32_POWER_CTL, power_ctl); + return !!(power_ctl & BIT(POWER_CTL_DEC_ENABLE)); +} + static int __init intel_pstate_init(void) { static struct cpudata **_all_cpu_data; @@ -3778,23 +3809,32 @@ static int __init intel_pstate_init(void) id = x86_match_cpu(hwp_support_ids); if (id) { - hwp_forced = intel_pstate_hwp_is_enabled(); + bool epp_present = hwp_check_epp(); - if (hwp_forced) + /* + * If HWP is enabled already, there is no choice but to deal + * with it. + */ + hwp_forced = intel_pstate_hwp_is_enabled(); + if (hwp_forced) { pr_info("HWP enabled by BIOS\n"); - else if (no_load) + no_hwp = 0; + } else if (no_load) { return -ENODEV; + } else if (!epp_present && !hwp_check_dec()) { + /* + * Avoid enabling HWP for processors without EPP support + * unless the Dynamic Efficiency Control (DEC) enable + * bit (MSR_IA32_POWER_CTL, bit 27) is set because that + * means incomplete HWP implementation which is a corner + * case and supporting it is generally problematic. + */ + no_hwp = 1; + } copy_cpu_funcs(&core_funcs); - /* - * Avoid enabling HWP for processors without EPP support, - * because that means incomplete HWP implementation which is a - * corner case and supporting it is generally problematic. - * - * If HWP is enabled already, though, there is no choice but to - * deal with it. 
- */ - if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || hwp_forced) { + + if (!no_hwp) { hwp_active = true; hwp_mode_bdw = id->driver_data; intel_pstate.attr = hwp_cpufreq_attrs; From 7c0dde86c17665cb27e1c8dd23d263e2ed2d5b50 Mon Sep 17 00:00:00 2001 From: Zihuan Zhang Date: Mon, 8 Sep 2025 16:57:38 +0800 Subject: [PATCH 2366/3206] cpufreq: Add defensive check during driver registration Currently, cpufreq allows drivers to implement both ->target() and ->target_index() callbacks, but that can lead to ambiguous or incorrect behavior. For this reason, prevent cpufreq drivers implementing both ->target() and ->target_index() at the same time from registering. This check can help to catch driver implementation mistakes early and improve overall robustness, without affecting existing valid drivers. Signed-off-by: Zihuan Zhang Link: https://patch.msgid.link/20250908085738.31602-1-zhangzihuan@kylinos.cn [ rjw: Subject adjustment and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 30e8e9b3c12fc0..599a7af76b3db1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2911,6 +2911,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) return -EPROBE_DEFER; if (!driver_data || !driver_data->verify || !driver_data->init || + (driver_data->target_index && driver_data->target) || (!!driver_data->setpolicy == (driver_data->target_index || driver_data->target)) || (!driver_data->get_intermediate != !driver_data->target_intermediate) || (!driver_data->online != !driver_data->offline) || From 02d09026a88f227aa1f4687bd31d6ffc4970e8be Mon Sep 17 00:00:00 2001 From: Yaxiong Tian Date: Fri, 12 Sep 2025 15:35:02 +0800 Subject: [PATCH 2367/3206] cpufreq: intel_pstate: Use likely() optimization in intel_pstate_sample() The comment above the condition `if (cpu->last_sample_time)` clearly indicates that the branch is taken for the vast majority of invocations after the first sample in a cycle. The first sample is a one-time initialization case. Add likely() hint to the condition to improve branch prediction for this performance-critical path in intel_pstate_sample(). Signed-off-by: Yaxiong Tian Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 285051e4a5dd86..05e440a6aa4a25 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2542,7 +2542,7 @@ static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) * that sample.time will always be reset before setting the utilization * update hook and make the caller skip the sample then. 
*/ - if (cpu->last_sample_time) { + if (likely(cpu->last_sample_time)) { intel_pstate_calc_avg_perf(cpu); return true; } From bce5749b02019f9b7e60fdff584098e57337c8b6 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Fri, 19 Sep 2025 11:48:15 +0800 Subject: [PATCH 2368/3206] bpftool: Add HELP_SPEC_OPTIONS in token.c $ ./bpftool token help Usage: bpftool token { show | list } bpftool token help OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} } Fixes: 2d812311c2b2 ("bpftool: Add bpf_token show") Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20250919034816.1287280-1-chen.dylane@linux.dev --- tools/bpf/bpftool/token.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/token.c b/tools/bpf/bpftool/token.c index 6312e662a1299a..82b829e44c82aa 100644 --- a/tools/bpf/bpftool/token.c +++ b/tools/bpf/bpftool/token.c @@ -206,6 +206,7 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s { show | list }\n" " %1$s %2$s help\n" + " " HELP_SPEC_OPTIONS " }\n" "\n" "", bin_name, argv[-2]); From 57cb26950112f0dfa9077b2710b1c280efa97e81 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Fri, 19 Sep 2025 11:48:16 +0800 Subject: [PATCH 2369/3206] bpftool: Fix UAF in get_delegate_value The return value ret pointer is pointing opts_copy, but opts_copy gets freed in get_delegate_value before return, fix this by free the mntent->mnt_opts strdup memory after show delegate value. Fixes: 2d812311c2b2 ("bpftool: Add bpf_token show") Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20250919034816.1287280-2-chen.dylane@linux.dev --- tools/bpf/bpftool/token.c | 90 ++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 53 deletions(-) diff --git a/tools/bpf/bpftool/token.c b/tools/bpf/bpftool/token.c index 82b829e44c82aa..c08f34b9d51b58 100644 --- a/tools/bpf/bpftool/token.c +++ b/tools/bpf/bpftool/token.c @@ -20,6 +20,16 @@ #define MOUNTS_FILE "/proc/mounts" +static struct { + const char *header; + const char *key; +} sets[] = { + {"allowed_cmds", "delegate_cmds"}, + {"allowed_maps", "delegate_maps"}, + {"allowed_progs", "delegate_progs"}, + {"allowed_attachs", "delegate_attachs"}, +}; + static bool has_delegate_options(const char *mnt_ops) { return strstr(mnt_ops, "delegate_cmds") || @@ -28,15 +38,14 @@ static bool has_delegate_options(const char *mnt_ops) strstr(mnt_ops, "delegate_attachs"); } -static char *get_delegate_value(const char *opts, const char *key) +static char *get_delegate_value(char *opts, const char *key) { char *token, *rest, *ret = NULL; - char *opts_copy = strdup(opts); - if (!opts_copy) + if (!opts) return NULL; - for (token = strtok_r(opts_copy, ",", &rest); token; + for (token = strtok_r(opts, ",", &rest); token; token = strtok_r(NULL, ",", &rest)) { if (strncmp(token, key, strlen(key)) == 0 && token[strlen(key)] == '=') { @@ -44,24 +53,19 @@ static char *get_delegate_value(const char *opts, const char *key) break; } } - free(opts_copy); return ret; } -static void print_items_per_line(const char *input, int items_per_line) +static void print_items_per_line(char *input, int items_per_line) { - char *str, *rest, *strs; + char *str, *rest; int cnt = 0; if (!input) return; - strs = strdup(input); - if (!strs) - return; - - for (str = strtok_r(strs, ":", &rest); str; + for (str = strtok_r(input, ":", &rest); str; str = strtok_r(NULL, ":", &rest)) { if (cnt % items_per_line == 0) printf("\n\t "); @@ -69,38 
+73,31 @@ static void print_items_per_line(const char *input, int items_per_line) printf("%-20s", str); cnt++; } - - free(strs); } #define ITEMS_PER_LINE 4 static void show_token_info_plain(struct mntent *mntent) { - char *value; + size_t i; printf("token_info %s", mntent->mnt_dir); - printf("\n\tallowed_cmds:"); - value = get_delegate_value(mntent->mnt_opts, "delegate_cmds"); - print_items_per_line(value, ITEMS_PER_LINE); - - printf("\n\tallowed_maps:"); - value = get_delegate_value(mntent->mnt_opts, "delegate_maps"); - print_items_per_line(value, ITEMS_PER_LINE); + for (i = 0; i < ARRAY_SIZE(sets); i++) { + char *opts, *value; - printf("\n\tallowed_progs:"); - value = get_delegate_value(mntent->mnt_opts, "delegate_progs"); - print_items_per_line(value, ITEMS_PER_LINE); + printf("\n\t%s:", sets[i].header); + opts = strdup(mntent->mnt_opts); + value = get_delegate_value(opts, sets[i].key); + print_items_per_line(value, ITEMS_PER_LINE); + free(opts); + } - printf("\n\tallowed_attachs:"); - value = get_delegate_value(mntent->mnt_opts, "delegate_attachs"); - print_items_per_line(value, ITEMS_PER_LINE); printf("\n"); } -static void split_json_array_str(const char *input) +static void split_json_array_str(char *input) { - char *str, *rest, *strs; + char *str, *rest; if (!input) { jsonw_start_array(json_wtr); @@ -108,43 +105,30 @@ static void split_json_array_str(const char *input) return; } - strs = strdup(input); - if (!strs) - return; - jsonw_start_array(json_wtr); - for (str = strtok_r(strs, ":", &rest); str; + for (str = strtok_r(input, ":", &rest); str; str = strtok_r(NULL, ":", &rest)) { jsonw_string(json_wtr, str); } jsonw_end_array(json_wtr); - - free(strs); } static void show_token_info_json(struct mntent *mntent) { - char *value; + size_t i; jsonw_start_object(json_wtr); - jsonw_string_field(json_wtr, "token_info", mntent->mnt_dir); - jsonw_name(json_wtr, "allowed_cmds"); - value = get_delegate_value(mntent->mnt_opts, "delegate_cmds"); - split_json_array_str(value); + for (i = 0; i < ARRAY_SIZE(sets); i++) { + char *opts, *value; - jsonw_name(json_wtr, "allowed_maps"); - value = get_delegate_value(mntent->mnt_opts, "delegate_maps"); - split_json_array_str(value); - - jsonw_name(json_wtr, "allowed_progs"); - value = get_delegate_value(mntent->mnt_opts, "delegate_progs"); - split_json_array_str(value); - - jsonw_name(json_wtr, "allowed_attachs"); - value = get_delegate_value(mntent->mnt_opts, "delegate_attachs"); - split_json_array_str(value); + jsonw_name(json_wtr, sets[i].header); + opts = strdup(mntent->mnt_opts); + value = get_delegate_value(opts, sets[i].key); + split_json_array_str(value); + free(opts); + } jsonw_end_object(json_wtr); } From 5612ea8b554375d45c14cbb0f8ea93ec5d172891 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 17 Sep 2025 11:38:47 -0700 Subject: [PATCH 2370/3206] bpftool: Fix -Wuninitialized-const-pointer warnings with clang >= 21 This fixes the build with -Werror -Wall. btf_dumper.c:71:31: error: variable 'finfo' is uninitialized when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer] 71 | info.func_info = ptr_to_u64(&finfo); | ^~~~~ prog.c:2294:31: error: variable 'func_info' is uninitialized when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer] 2294 | info.func_info = ptr_to_u64(&func_info); | v2: - Initialize instead of using memset. 
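As a minimal standalone illustration of this warning class (hypothetical names, not the bpftool sources), clang >= 21 flags an uninitialized object passed through a const pointer because the callee is allowed to read it:

	#include <stdint.h>

	struct func_info { uint32_t type_id; uint32_t insn_off; };

	/* A const pointer parameter tells the compiler the callee may read *p. */
	static uint64_t ptr_to_u64(const void *p)
	{
		return (uint64_t)(uintptr_t)p;
	}

	int main(void)
	{
		struct func_info bad;		/* -Wuninitialized-const-pointer */
		struct func_info good = {};	/* zero-initialized: no warning */

		return ptr_to_u64(&bad) == ptr_to_u64(&good);
	}

Zero-initializing at the declaration, as done below, is the idiomatic fix when the struct is later filled in by the kernel.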
Signed-off-by: Tom Stellard Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20250917183847.318163-1-tstellar@redhat.com --- tools/bpf/bpftool/btf_dumper.c | 2 +- tools/bpf/bpftool/prog.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 4e896d8a2416e9..ff12628593aecd 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -38,7 +38,7 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d, __u32 info_len = sizeof(info); const char *prog_name = NULL; struct btf *prog_btf = NULL; - struct bpf_func_info finfo; + struct bpf_func_info finfo = {}; __u32 finfo_rec_size; char prog_str[1024]; int err; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index cf18c387968014..4dccc75b0bab01 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2262,7 +2262,7 @@ static void profile_print_readings(void) static char *profile_target_name(int tgt_fd) { - struct bpf_func_info func_info; + struct bpf_func_info func_info = {}; struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); const struct btf_type *t; From 1091860a16a86ccdd77c09f2b21a5f634f5ab9ec Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Wed, 17 Sep 2025 19:39:19 +0800 Subject: [PATCH 2371/3206] net: tun: Update napi->skb after XDP process The syzbot report a UAF issue: BUG: KASAN: slab-use-after-free in skb_reset_mac_header include/linux/skbuff.h:3150 [inline] BUG: KASAN: slab-use-after-free in napi_frags_skb net/core/gro.c:723 [inline] BUG: KASAN: slab-use-after-free in napi_gro_frags+0x6e/0x1030 net/core/gro.c:758 Read of size 8 at addr ffff88802ef22c18 by task syz.0.17/6079 CPU: 0 UID: 0 PID: 6079 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x240 mm/kasan/report.c:482 kasan_report+0x118/0x150 mm/kasan/report.c:595 skb_reset_mac_header include/linux/skbuff.h:3150 [inline] napi_frags_skb net/core/gro.c:723 [inline] napi_gro_frags+0x6e/0x1030 net/core/gro.c:758 tun_get_user+0x28cb/0x3e20 drivers/net/tun.c:1920 tun_chr_write_iter+0x113/0x200 drivers/net/tun.c:1996 new_sync_write fs/read_write.c:593 [inline] vfs_write+0x5c9/0xb30 fs/read_write.c:686 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Allocated by task 6079: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 unpoison_slab_object mm/kasan/common.c:330 [inline] __kasan_mempool_unpoison_object+0xa0/0x170 mm/kasan/common.c:558 kasan_mempool_unpoison_object include/linux/kasan.h:388 [inline] napi_skb_cache_get+0x37b/0x6d0 net/core/skbuff.c:295 __alloc_skb+0x11e/0x2d0 net/core/skbuff.c:657 napi_alloc_skb+0x84/0x7d0 net/core/skbuff.c:811 napi_get_frags+0x69/0x140 net/core/gro.c:673 tun_napi_alloc_frags drivers/net/tun.c:1404 [inline] tun_get_user+0x77c/0x3e20 drivers/net/tun.c:1784 tun_chr_write_iter+0x113/0x200 drivers/net/tun.c:1996 new_sync_write fs/read_write.c:593 [inline] vfs_write+0x5c9/0xb30 fs/read_write.c:686 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 6079: kasan_save_stack 
mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:243 [inline] __kasan_slab_free+0x5b/0x80 mm/kasan/common.c:275 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2422 [inline] slab_free mm/slub.c:4695 [inline] kmem_cache_free+0x18f/0x400 mm/slub.c:4797 skb_pp_cow_data+0xdd8/0x13e0 net/core/skbuff.c:969 netif_skb_check_for_xdp net/core/dev.c:5390 [inline] netif_receive_generic_xdp net/core/dev.c:5431 [inline] do_xdp_generic+0x699/0x11a0 net/core/dev.c:5499 tun_get_user+0x2523/0x3e20 drivers/net/tun.c:1872 tun_chr_write_iter+0x113/0x200 drivers/net/tun.c:1996 new_sync_write fs/read_write.c:593 [inline] vfs_write+0x5c9/0xb30 fs/read_write.c:686 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f After commit e6d5dbdd20aa ("xdp: add multi-buff support for xdp running in generic mode"), the original skb, which was allocated in tun_napi_alloc_frags(), may be freed in skb_pp_cow_data() when an XDP program is attached. However, napi->skb still points to the original skb, so update it after the XDP processing. Reported-by: syzbot+64e24275ad95a915a313@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=64e24275ad95a915a313 Fixes: e6d5dbdd20aa ("xdp: add multi-buff support for xdp running in generic mode") Signed-off-by: Wang Liang Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250917113919.3991267-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index cc6c5018066370..47ddcb4b9a788b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1875,6 +1875,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, local_bh_enable(); goto unlock_frags; } + + if (frags && skb != tfile->napi.skb) + tfile->napi.skb = skb; } rcu_read_unlock(); local_bh_enable(); From a35c04de2565db191726b5741e6b66a35002c652 Mon Sep 17 00:00:00 2001 From: Sidraya Jayagond Date: Wed, 17 Sep 2025 20:42:20 +0200 Subject: [PATCH 2372/3206] net/smc: fix warning in smc_rx_splice() when calling get_page() smc_lo_register_dmb() allocates DMB buffers with kzalloc(), which are later passed to get_page() in smc_rx_splice(). Since kmalloc memory is not page-backed, this triggers WARN_ON_ONCE() in get_page() and prevents holding a refcount on the buffer. This can lead to use-after-free if the memory is released before splice_to_pipe() completes. Use folio_alloc() instead, ensuring DMBs are page-backed and safe for get_page().
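In essence (a hedged restatement of the fix, not the verbatim driver code), the allocation switches from slab memory, whose pages must not have their refcount taken, to whole pages that can be pinned safely:

	/* Sketch: allocate page-backed memory so get_page() is legal. */
	folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC |
			    __GFP_NORETRY | __GFP_ZERO, get_order(len));
	if (!folio)
		return -ENOMEM;			/* not critical: fall back to TCP */
	cpu_addr = folio_address(folio);	/* page-backed buffer */
	/* ... and on teardown: */
	folio_put(virt_to_folio(cpu_addr));

The warning that motivated the change: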
WARNING: CPU: 18 PID: 12152 at ./include/linux/mm.h:1330 smc_rx_splice+0xaf8/0xe20 [smc] CPU: 18 UID: 0 PID: 12152 Comm: smcapp Kdump: loaded Not tainted 6.17.0-rc3-11705-g9cf4672ecfee #10 NONE Hardware name: IBM 3931 A01 704 (z/VM 7.4.0) Krnl PSW : 0704e00180000000 000793161032696c (smc_rx_splice+0xafc/0xe20 [smc]) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3 Krnl GPRS: 0000000000000000 001cee80007d3001 00077400000000f8 0000000000000005 0000000000000001 001cee80007d3006 0007740000001000 001c000000000000 000000009b0c99e0 0000000000001000 001c0000000000f8 001c000000000000 000003ffcc6f7c88 0007740003e98000 0007931600000005 000792969b2ff7b8 Krnl Code: 0007931610326960: af000000 mc 0,0 0007931610326964: a7f4ff43 brc 15,00079316103267ea #0007931610326968: af000000 mc 0,0 >000793161032696c: a7f4ff3f brc 15,00079316103267ea 0007931610326970: e320f1000004 lg %r2,256(%r15) 0007931610326976: c0e53fd1b5f5 brasl %r14,000793168fd5d560 000793161032697c: a7f4fbb5 brc 15,00079316103260e6 0007931610326980: b904002b lgr %r2,%r11 Call Trace: smc_rx_splice+0xafc/0xe20 [smc] smc_rx_splice+0x756/0xe20 [smc]) smc_rx_recvmsg+0xa74/0xe00 [smc] smc_splice_read+0x1ce/0x3b0 [smc] sock_splice_read+0xa2/0xf0 do_splice_read+0x198/0x240 splice_file_to_pipe+0x7e/0x110 do_splice+0x59e/0xde0 __do_splice+0x11a/0x2d0 __s390x_sys_splice+0x140/0x1f0 __do_syscall+0x122/0x280 system_call+0x6e/0x90 Last Breaking-Event-Address: smc_rx_splice+0x960/0xe20 [smc] ---[ end trace 0000000000000000 ]--- Fixes: f7a22071dbf3 ("net/smc: implement DMB-related operations of loopback-ism") Reviewed-by: Mahanta Jambigi Signed-off-by: Sidraya Jayagond Link: https://patch.msgid.link/20250917184220.801066-1-sidraya@linux.ibm.com Signed-off-by: Jakub Kicinski --- net/smc/smc_loopback.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 0eb00bbefd1748..77cc1c6dc3e977 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -56,6 +56,7 @@ static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, { struct smc_lo_dmb_node *dmb_node, *tmp_node; struct smc_lo_dev *ldev = smcd->priv; + struct folio *folio; int sba_idx, rc; /* check space for new dmb */ @@ -74,13 +75,16 @@ static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, dmb_node->sba_idx = sba_idx; dmb_node->len = dmb->dmb_len; - dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | - __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC); - if (!dmb_node->cpu_addr) { + + /* not critical; fail under memory pressure and fallback to TCP */ + folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | + __GFP_NORETRY | __GFP_ZERO, + get_order(dmb_node->len)); + if (!folio) { rc = -ENOMEM; goto err_node; } + dmb_node->cpu_addr = folio_address(folio); dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; refcount_set(&dmb_node->refcnt, 1); @@ -122,7 +126,7 @@ static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, write_unlock_bh(&ldev->dmb_ht_lock); clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); - kvfree(dmb_node->cpu_addr); + folio_put(virt_to_folio(dmb_node->cpu_addr)); kfree(dmb_node); if (atomic_dec_and_test(&ldev->dmb_cnt)) From b65678cacc030efd53c38c089fb9b741a2ee34c8 Mon Sep 17 00:00:00 2001 From: Petr Malat Date: Thu, 18 Sep 2025 17:21:07 +0200 Subject: [PATCH 2373/3206] ethernet: rvu-af: Remove slash from the driver name Having a slash in the driver name leads to EIO being returned while reading /sys/module/rvu_af/drivers content. 
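The constraint being violated here (an illustrative sketch with a hypothetical driver struct, not the cgx.c code) is that a driver's .name string is used verbatim as a sysfs directory component, for example under /sys/bus/pci/drivers/<name>/ and in /sys/module/<module>/drivers/, so a '/' inside the name produces paths that cannot be resolved:

	static struct pci_driver example_driver = {
		.name = "Marvell-CGX-RPM",	/* OK: no '/' in the name */
		/* .name = "Marvell-CGX/RPM" would embed a bogus path separator */
	};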
Remove DRV_STRING as it's not used anywhere. Fixes: 91c6945ea1f9 ("octeontx2-af: cn10k: Add RPM MAC support") Signed-off-by: Petr Malat Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250918152106.1798299-1-oss@malat.biz Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 0c46ba8a5adc8f..69324ae093973e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -21,8 +21,7 @@ #include "rvu.h" #include "lmac_common.h" -#define DRV_NAME "Marvell-CGX/RPM" -#define DRV_STRING "Marvell CGX/RPM Driver" +#define DRV_NAME "Marvell-CGX-RPM" #define CGX_RX_STAT_GLOBAL_INDEX 9 From 853a57ba263adfecf4430b936d6862bc475b4bb5 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sat, 20 Sep 2025 11:51:48 +0900 Subject: [PATCH 2374/3206] firewire: core: fix overlooked update of subsystem ABI version In kernel v6.5, several functions were added to the cdev layer. This required bumping the default subsystem ABI version to 6, but the update was overlooked. This commit updates the version accordingly. Fixes: 6add87e9764d ("firewire: cdev: add new version of ABI to notify time stamp at request/response subaction of transaction#") Link: https://lore.kernel.org/r/20250920025148.163402-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-cdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 78b10c6ef7fec7..2e93189d7142ae 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -41,7 +41,7 @@ /* * ABI version history is documented in linux/firewire-cdev.h. */ -#define FW_CDEV_KERNEL_VERSION 5 +#define FW_CDEV_KERNEL_VERSION 6 #define FW_CDEV_VERSION_EVENT_REQUEST2 4 #define FW_CDEV_VERSION_ALLOCATE_REGION_END 4 #define FW_CDEV_VERSION_AUTO_FLUSH_ISO_OVERFLOW 5 From 3ec09344b01a15901ba824e877a0562ed8103e27 Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Fri, 19 Sep 2025 12:58:29 +0000 Subject: [PATCH 2375/3206] LoongArch: Fix bitflag conflict for TIF_FIXADE After LoongArch was converted to use the generic TIF bits in commit f9629891d407 ("loongarch: Use generic TIF bits"), its TIF_FIXADE flag shares the same bit as TIF_RESTORE_SIGMASK in thread_info.flags. This conflict causes TIF_FIXADE to be considered cleared whenever TIF_RESTORE_SIGMASK is cleared during delivery of a signal. And since TIF_FIXADE determines whether unaligned access emulation works for a task, userspace making use of unaligned accesses will receive an unexpected SIGBUS (and likely terminate) after receiving its first signal. This conflict looks like a simple typo, so switch TIF_FIXADE to the free bit 19.
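To make the failure mode concrete, a sketch using the bit values from this patch (TIF_RESTORE_SIGMASK occupying bit 10 in the generic layout, as implied by the collision described above):

	#define TIF_RESTORE_SIGMASK	10	/* generic TIF layout */
	#define TIF_FIXADE_OLD		10	/* the conflicting value */

	static unsigned long deliver_signal(unsigned long flags)
	{
		/* Signal delivery clears TIF_RESTORE_SIGMASK ... */
		return flags & ~(1UL << TIF_RESTORE_SIGMASK);
	}

	/* ... so deliver_signal(1UL << TIF_FIXADE_OLD) also drops TIF_FIXADE:
	 * the next unaligned access is no longer emulated and raises SIGBUS.
	 * At bit 19, TIF_FIXADE no longer aliases any other flag. */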
Fixes: f9629891d407 ("loongarch: Use generic TIF bits") Signed-off-by: Yao Zi Signed-off-by: Thomas Gleixner Reviewed-by: Wentao Guan --- arch/loongarch/include/asm/thread_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h index def7cb14467ec6..4d7117fcdc78c0 100644 --- a/arch/loongarch/include/asm/thread_info.h +++ b/arch/loongarch/include/asm/thread_info.h @@ -77,7 +77,7 @@ register unsigned long current_stack_pointer __asm__("$sp"); #define TIF_NOHZ 16 /* in adaptive nohz mode */ #define TIF_USEDFPU 17 /* FPU was used by this task this quantum (SMP) */ #define TIF_USEDSIMD 18 /* SIMD has been used this quantum */ -#define TIF_FIXADE 10 /* Fix address errors in software */ +#define TIF_FIXADE 19 /* Fix address errors in software */ #define TIF_LOGADE 20 /* Log address errors to syslog */ #define TIF_32BIT_REGS 21 /* 32-bit general purpose registers */ #define TIF_32BIT_ADDR 22 /* 32-bit address space */ From 0ff52df6b32a6b04a7c9dfe3d7a387aff215b482 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Fri, 19 Sep 2025 01:46:43 +0000 Subject: [PATCH 2376/3206] tools/nolibc: make time_t robust if __kernel_old_time_t is missing in host headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit d5094bcb5bfd ("tools/nolibc: define time_t in terms of __kernel_old_time_t") made nolibc use the kernel's time type so that `time_t` matches `timespec::tv_sec` on all ABIs (notably x32). But since __kernel_old_time_t is fairly new, notably from 2020 in commit 94c467ddb273 ("y2038: add __kernel_old_timespec and __kernel_old_time_t"), nolibc builds that rely on host headers may fail. Switch to __kernel_time_t, which is the same as __kernel_old_time_t and has existed for longer. Tested in a PPC VM at the Open Source Lab of Oregon State University (./tools/testing/selftests/rcutorture/bin/mkinitrd.sh) Fixes: d5094bcb5bfd ("tools/nolibc: define time_t in terms of __kernel_old_time_t") Signed-off-by: Zhouyi Zhou [Thomas: Reformat commit and its message a bit] Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/std.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/nolibc/std.h b/tools/include/nolibc/std.h index ba950f0e733843..2c1ad23b9b5c17 100644 --- a/tools/include/nolibc/std.h +++ b/tools/include/nolibc/std.h @@ -29,6 +29,6 @@ typedef unsigned long nlink_t; typedef signed long off_t; typedef signed long blksize_t; typedef signed long blkcnt_t; -typedef __kernel_old_time_t time_t; +typedef __kernel_time_t time_t; #endif /* _NOLIBC_STD_H */ From 8ffe28b4e8d8b18cb2f2933410322c24f039d5d6 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Thu, 18 Sep 2025 11:15:52 +0100 Subject: [PATCH 2377/3206] cpufreq: Initialize cpufreq-based invariance before subsys commit 2a6c72738706 ("cpufreq: Initialize cpufreq-based frequency-invariance later") postponed the frequency invariance initialization to avoid disabling it in the error case. This isn't safe from a locking perspective; instead, move the initialization up before the subsys interface is registered (which will rebuild the sched_domains) and add the corresponding disable on the error path.
Observed lockdep without this patch: [ 0.989686] ====================================================== [ 0.989688] WARNING: possible circular locking dependency detected [ 0.989690] 6.17.0-rc4-cix-build+ #31 Tainted: G S [ 0.989691] ------------------------------------------------------ [ 0.989692] swapper/0/1 is trying to acquire lock: [ 0.989693] ffff800082ada7f8 (sched_energy_mutex){+.+.}-{4:4}, at: rebuild_sched_domains_energy+0x30/0x58 [ 0.989705] but task is already holding lock: [ 0.989706] ffff000088c89bc8 (&policy->rwsem){+.+.}-{4:4}, at: cpufreq_online+0x7f8/0xbe0 [ 0.989713] which lock already depends on the new lock. Fixes: 2a6c72738706 ("cpufreq: Initialize cpufreq-based frequency-invariance later") Signed-off-by: Christian Loehle Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index fc7eace8b65bed..58e3839a214008 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2953,6 +2953,15 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) goto err_null_driver; } + /* + * Mark support for the scheduler's frequency invariance engine for + * drivers that implement target(), target_index() or fast_switch(). + */ + if (!cpufreq_driver->setpolicy) { + static_branch_enable_cpuslocked(&cpufreq_freq_invariance); + pr_debug("cpufreq: supports frequency invariance\n"); + } + ret = subsys_interface_register(&cpufreq_interface); if (ret) goto err_boost_unreg; @@ -2974,21 +2983,14 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) hp_online = ret; ret = 0; - /* - * Mark support for the scheduler's frequency invariance engine for - * drivers that implement target(), target_index() or fast_switch(). - */ - if (!cpufreq_driver->setpolicy) { - static_branch_enable_cpuslocked(&cpufreq_freq_invariance); - pr_debug("supports frequency invariance"); - } - pr_debug("driver %s up and running\n", driver_data->name); goto out; err_if_unreg: subsys_interface_unregister(&cpufreq_interface); err_boost_unreg: + if (!cpufreq_driver->setpolicy) + static_branch_disable_cpuslocked(&cpufreq_freq_invariance); remove_boost_sysfs_file(); err_null_driver: write_lock_irqsave(&cpufreq_driver_lock, flags); From 8dba0fd9cee34b80d6e26a188115e5427773285a Mon Sep 17 00:00:00 2001 From: Vivek Yadav Date: Fri, 19 Sep 2025 22:26:57 +0530 Subject: [PATCH 2378/3206] cpuidle: sysfs: Use sysfs_emit()/sysfs_emit_at() instead of sprintf()/scnprintf() The ->show() callbacks in sysfs should use sysfs_emit() or sysfs_emit_at() when formatting values for user space. These helpers are the recommended way to ensure correct buffer handling and consistency across the kernel. See Documentation/filesystems/sysfs.rst for details. No functional change intended. Suggested-by: Joe Perches Signed-off-by: Vivek Yadav Link: https://patch.msgid.link/20250919165657.233349-1-vivekyadav1207731111@gmail.com [ rjw: Minor subject edits ] Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/sysfs.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index d6f5da61cb7d86..61de6481760471 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -27,14 +27,14 @@ static ssize_t show_available_governors(struct device *dev, mutex_lock(&cpuidle_lock); list_for_each_entry(tmp, &cpuidle_governors, governor_list) { - if (i >= (ssize_t) (PAGE_SIZE - (CPUIDLE_NAME_LEN + 2))) + if (i >= (ssize_t)(PAGE_SIZE - (CPUIDLE_NAME_LEN + 2))) goto out; - i += scnprintf(&buf[i], CPUIDLE_NAME_LEN + 1, "%s ", tmp->name); + i += sysfs_emit_at(buf, i, "%.*s ", CPUIDLE_NAME_LEN, tmp->name); } out: - i+= sprintf(&buf[i], "\n"); + i += sysfs_emit_at(buf, i, "\n"); mutex_unlock(&cpuidle_lock); return i; } @@ -49,9 +49,9 @@ static ssize_t show_current_driver(struct device *dev, spin_lock(&cpuidle_driver_lock); drv = cpuidle_get_driver(); if (drv) - ret = sprintf(buf, "%s\n", drv->name); + ret = sysfs_emit(buf, "%s\n", drv->name); else - ret = sprintf(buf, "none\n"); + ret = sysfs_emit(buf, "none\n"); spin_unlock(&cpuidle_driver_lock); return ret; @@ -65,9 +65,9 @@ static ssize_t show_current_governor(struct device *dev, mutex_lock(&cpuidle_lock); if (cpuidle_curr_governor) - ret = sprintf(buf, "%s\n", cpuidle_curr_governor->name); + ret = sysfs_emit(buf, "%s\n", cpuidle_curr_governor->name); else - ret = sprintf(buf, "none\n"); + ret = sysfs_emit(buf, "none\n"); mutex_unlock(&cpuidle_lock); return ret; @@ -230,7 +230,7 @@ static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0644, show, store) static ssize_t show_state_##_name(struct cpuidle_state *state, \ struct cpuidle_state_usage *state_usage, char *buf) \ { \ - return sprintf(buf, "%u\n", state->_name);\ + return sysfs_emit(buf, "%u\n", state->_name);\ } #define define_show_state_ull_function(_name) \ @@ -238,7 +238,7 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ struct cpuidle_state_usage *state_usage, \ char *buf) \ { \ - return sprintf(buf, "%llu\n", state_usage->_name);\ + return sysfs_emit(buf, "%llu\n", state_usage->_name);\ } #define define_show_state_str_function(_name) \ @@ -247,8 +247,8 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ char *buf) \ { \ if (state->_name[0] == '\0')\ - return sprintf(buf, "\n");\ - return sprintf(buf, "%s\n", state->_name);\ + return sysfs_emit(buf, "\n");\ + return sysfs_emit(buf, "%s\n", state->_name);\ } #define define_show_state_time_function(_name) \ @@ -256,7 +256,7 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ struct cpuidle_state_usage *state_usage, \ char *buf) \ { \ - return sprintf(buf, "%llu\n", ktime_to_us(state->_name##_ns)); \ + return sysfs_emit(buf, "%llu\n", ktime_to_us(state->_name##_ns)); \ } define_show_state_time_function(exit_latency) @@ -273,14 +273,14 @@ static ssize_t show_state_time(struct cpuidle_state *state, struct cpuidle_state_usage *state_usage, char *buf) { - return sprintf(buf, "%llu\n", ktime_to_us(state_usage->time_ns)); + return sysfs_emit(buf, "%llu\n", ktime_to_us(state_usage->time_ns)); } static ssize_t show_state_disable(struct cpuidle_state *state, struct cpuidle_state_usage *state_usage, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", state_usage->disable & CPUIDLE_STATE_DISABLED_BY_USER); } @@ -310,7 +310,7 @@ static ssize_t show_state_default_status(struct cpuidle_state *state, struct cpuidle_state_usage *state_usage, char *buf) 
{ - return sprintf(buf, "%s\n", + return sysfs_emit(buf, "%s\n", state->flags & CPUIDLE_FLAG_OFF ? "disabled" : "enabled"); } @@ -358,7 +358,7 @@ static ssize_t show_state_s2idle_##_name(struct cpuidle_state *state, \ struct cpuidle_state_usage *state_usage, \ char *buf) \ { \ - return sprintf(buf, "%llu\n", state_usage->s2idle_##_name);\ + return sysfs_emit(buf, "%llu\n", state_usage->s2idle_##_name);\ } define_show_state_s2idle_ull_function(usage); @@ -550,7 +550,7 @@ static ssize_t show_driver_name(struct cpuidle_driver *drv, char *buf) ssize_t ret; spin_lock(&cpuidle_driver_lock); - ret = sprintf(buf, "%s\n", drv ? drv->name : "none"); + ret = sysfs_emit(buf, "%s\n", drv ? drv->name : "none"); spin_unlock(&cpuidle_driver_lock); return ret; From 7b1b7961170e4fcad488755e5ffaaaf9bd527e8f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 19 Sep 2025 13:22:20 +0200 Subject: [PATCH 2379/3206] cpuidle: Fail cpuidle device registration if there is one already Refuse to register a cpuidle device if the given CPU has a cpuidle device already and print a message regarding it. Without this, an attempt to register a new cpuidle device without unregistering the existing one leads to the removal of the existing cpuidle device without removing its sysfs interface. Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 0835da449db8b4..56132e843c9919 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -635,8 +635,14 @@ static void __cpuidle_device_init(struct cpuidle_device *dev) static int __cpuidle_register_device(struct cpuidle_device *dev) { struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + unsigned int cpu = dev->cpu; int i, ret; + if (per_cpu(cpuidle_devices, cpu)) { + pr_info("CPU%d: cpuidle device already registered\n", cpu); + return -EEXIST; + } + if (!try_module_get(drv->owner)) return -EINVAL; @@ -648,7 +654,7 @@ static int __cpuidle_register_device(struct cpuidle_device *dev) dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_USER; } - per_cpu(cpuidle_devices, dev->cpu) = dev; + per_cpu(cpuidle_devices, cpu) = dev; list_add(&dev->device_list, &cpuidle_detected_devices); ret = cpuidle_coupled_register_device(dev); From 1c5091a9b4c990a6fcf8de7d0e97dba4cf878f17 Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Tue, 2 Sep 2025 08:43:44 -0700 Subject: [PATCH 2380/3206] Bluetooth: Fix build after header cleanup Some Kconfig dependencies are needed after my recent cleanup, since the core code has its own option. Since btmtksdio does not actually call h4_recv_buf(), move the definitions it uses outside the BT_HCIUART_H4 gate in hci_uart.h to avoid adding a dependency for btmtksdio. The rest I touched (bpa10x, btmtkuart, and btnxpuart) do really call h4_recv_buf(), so the dependency is required, add it for them. 
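For reference, the pattern that creates the dependency looks roughly like this in a UART driver's receive path (a sketch only; demo_pkts and demo_recv are hypothetical, while the h4_recv_buf() prototype matches the hunk below):

	static const struct h4_recv_pkt demo_pkts[] = {
		{ H4_RECV_ACL,   .recv = hci_recv_frame },
		{ H4_RECV_SCO,   .recv = hci_recv_frame },
		{ H4_RECV_EVENT, .recv = hci_recv_frame },
	};

	static int demo_recv(struct hci_uart *hu, const void *data, int count)
	{
		/* h4_recv_buf() reassembles H4 frames from raw UART bytes
		 * and dispatches complete packets via the table above; it
		 * is only built when BT_HCIUART_H4 is selected, hence the
		 * new Kconfig dependencies. */
		hu->rx_skb = h4_recv_buf(hu->hdev, hu->rx_skb, data, count,
					 demo_pkts, ARRAY_SIZE(demo_pkts));
		if (IS_ERR(hu->rx_skb)) {
			int err = PTR_ERR(hu->rx_skb);

			hu->rx_skb = NULL;
			return err;
		}
		return count;
	}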
Fixes: 0e272fc7e17d ("Bluetooth: remove duplicate h4_recv_buf() in header") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508300413.OnIedvRh-lkp@intel.com/ Signed-off-by: Calvin Owens Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/Kconfig | 6 ++++++ drivers/bluetooth/hci_uart.h | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig index 4ab32abf0f4864..7df69ccb660053 100644 --- a/drivers/bluetooth/Kconfig +++ b/drivers/bluetooth/Kconfig @@ -312,7 +312,9 @@ config BT_HCIBCM4377 config BT_HCIBPA10X tristate "HCI BPA10x USB driver" + depends on BT_HCIUART depends on USB + select BT_HCIUART_H4 help Bluetooth HCI BPA10x USB driver. This driver provides support for the Digianswer BPA 100/105 Bluetooth @@ -437,8 +439,10 @@ config BT_MTKSDIO config BT_MTKUART tristate "MediaTek HCI UART driver" + depends on BT_HCIUART depends on SERIAL_DEV_BUS depends on USB || !BT_HCIBTUSB_MTK + select BT_HCIUART_H4 select BT_MTK help MediaTek Bluetooth HCI UART driver. @@ -483,7 +487,9 @@ config BT_VIRTIO config BT_NXPUART tristate "NXP protocol support" + depends on BT_HCIUART depends on SERIAL_DEV_BUS + select BT_HCIUART_H4 select CRC32 select CRC8 help diff --git a/drivers/bluetooth/hci_uart.h b/drivers/bluetooth/hci_uart.h index 5ea5dd80e297c7..cbbe79b241ce97 100644 --- a/drivers/bluetooth/hci_uart.h +++ b/drivers/bluetooth/hci_uart.h @@ -121,10 +121,6 @@ void hci_uart_set_flow_control(struct hci_uart *hu, bool enable); void hci_uart_set_speeds(struct hci_uart *hu, unsigned int init_speed, unsigned int oper_speed); -#ifdef CONFIG_BT_HCIUART_H4 -int h4_init(void); -int h4_deinit(void); - struct h4_recv_pkt { u8 type; /* Packet type */ u8 hlen; /* Header length */ @@ -162,6 +158,10 @@ struct h4_recv_pkt { .lsize = 2, \ .maxlen = HCI_MAX_FRAME_SIZE \ +#ifdef CONFIG_BT_HCIUART_H4 +int h4_init(void); +int h4_deinit(void); + struct sk_buff *h4_recv_buf(struct hci_dev *hdev, struct sk_buff *skb, const unsigned char *buffer, int count, const struct h4_recv_pkt *pkts, int pkts_count); From 1488af7b8b5f9896ea88ee35aa3301713f72737c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Sep 2025 10:29:18 -0400 Subject: [PATCH 2381/3206] Bluetooth: hci_sync: Fix hci_resume_advertising_sync hci_resume_advertising_sync() is supposed to resume all instances paused by hci_pause_advertising_sync(); this logic is used for procedures that are only allowed when not advertising, but instance 0x00 was not being re-enabled. Fixes: ad383c2c65a5 ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index b6f888d8354e3f..7a7d4989085848 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2594,6 +2594,13 @@ static int hci_resume_advertising_sync(struct hci_dev *hdev) hci_remove_ext_adv_instance_sync(hdev, adv->instance, NULL); } + + /* If current advertising instance is set to instance 0x00 + * then we need to re-enable it.
+ */ + if (!hdev->cur_adv_instance) + err = hci_enable_ext_advertising_sync(hdev, + hdev->cur_adv_instance); } else { /* Schedule for most recent instance to be restarted and begin * the software rotation loop From 2e128683176a56459cef8705fc7c35f438f88abd Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Aug 2025 10:27:29 -0400 Subject: [PATCH 2382/3206] Bluetooth: hci_event: Fix UAF in hci_conn_tx_dequeue This fixes the following UAF caused by not properly locking hdev when processing HCI_EV_NUM_COMP_PKTS: BUG: KASAN: slab-use-after-free in hci_conn_tx_dequeue+0x1be/0x220 net/bluetooth/hci_conn.c:3036 Read of size 4 at addr ffff8880740f0940 by task kworker/u11:0/54 CPU: 1 UID: 0 PID: 54 Comm: kworker/u11:0 Not tainted 6.16.0-rc7 #3 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Workqueue: hci1 hci_rx_work Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x230 mm/kasan/report.c:480 kasan_report+0x118/0x150 mm/kasan/report.c:593 hci_conn_tx_dequeue+0x1be/0x220 net/bluetooth/hci_conn.c:3036 hci_num_comp_pkts_evt+0x1c8/0xa50 net/bluetooth/hci_event.c:4404 hci_event_func net/bluetooth/hci_event.c:7477 [inline] hci_event_packet+0x7e0/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Allocated by task 54: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] __hci_conn_add+0x233/0x1b30 net/bluetooth/hci_conn.c:939 le_conn_complete_evt+0x3d6/0x1220 net/bluetooth/hci_event.c:5628 hci_le_enh_conn_complete_evt+0x189/0x470 net/bluetooth/hci_event.c:5794 hci_event_func net/bluetooth/hci_event.c:7474 [inline] hci_event_packet+0x78c/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Freed by task 9572: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x18e/0x440 mm/slub.c:4842 device_release+0x9c/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x22b/0x480 lib/kobject.c:737 
hci_conn_cleanup net/bluetooth/hci_conn.c:175 [inline] hci_conn_del+0x8ff/0xcb0 net/bluetooth/hci_conn.c:1173 hci_abort_conn_sync+0x5d1/0xdf0 net/bluetooth/hci_sync.c:5689 hci_cmd_sync_work+0x210/0x3a0 net/bluetooth/hci_sync.c:332 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Fixes: 134f4b39df7b ("Bluetooth: add support for skb TX SND/COMPLETION timestamping") Reported-by: Junvyyang, Tencent Zhuque Lab Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 7a217485185741..97f543824bb054 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4391,6 +4391,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "num %d", ev->num); + hci_dev_lock(hdev); + for (i = 0; i < ev->num; i++) { struct hci_comp_pkts_info *info = &ev->handles[i]; struct hci_conn *conn; @@ -4472,6 +4474,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, } queue_work(hdev->workqueue, &hdev->tx_work); + + hci_dev_unlock(hdev); } static void hci_mode_change_evt(struct hci_dev *hdev, void *data, From 9e622804d57e2d08f0271200606bd1270f75126f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Aug 2025 11:10:20 -0400 Subject: [PATCH 2383/3206] Bluetooth: hci_event: Fix UAF in hci_acl_create_conn_sync This fixes the following UAF in hci_acl_create_conn_sync where a connection still pending command submission (conn->state == BT_OPEN) may be freed; since this can also happen with the likes of hci_le_create_conn_sync, fix it as well: BUG: KASAN: slab-use-after-free in hci_acl_create_conn_sync+0x5ef/0x790 net/bluetooth/hci_sync.c:6861 Write of size 2 at addr ffff88805ffcc038 by task kworker/u11:2/9541 CPU: 1 UID: 0 PID: 9541 Comm: kworker/u11:2 Not tainted 6.16.0-rc7 #3 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Workqueue: hci3 hci_cmd_sync_work Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x230 mm/kasan/report.c:480 kasan_report+0x118/0x150 mm/kasan/report.c:593 hci_acl_create_conn_sync+0x5ef/0x790 net/bluetooth/hci_sync.c:6861 hci_cmd_sync_work+0x210/0x3a0 net/bluetooth/hci_sync.c:332 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Allocated by task 123736: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] __hci_conn_add+0x233/0x1b30 net/bluetooth/hci_conn.c:939 hci_conn_add_unset
net/bluetooth/hci_conn.c:1051 [inline] hci_connect_acl+0x16c/0x4e0 net/bluetooth/hci_conn.c:1634 pair_device+0x418/0xa70 net/bluetooth/mgmt.c:3556 hci_mgmt_cmd+0x9c9/0xef0 net/bluetooth/hci_sock.c:1719 hci_sock_sendmsg+0x6ca/0xef0 net/bluetooth/hci_sock.c:1839 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg+0x219/0x270 net/socket.c:727 sock_write_iter+0x258/0x330 net/socket.c:1131 new_sync_write fs/read_write.c:593 [inline] vfs_write+0x54b/0xa90 fs/read_write.c:686 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 103680: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x18e/0x440 mm/slub.c:4842 device_release+0x9c/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x22b/0x480 lib/kobject.c:737 hci_conn_cleanup net/bluetooth/hci_conn.c:175 [inline] hci_conn_del+0x8ff/0xcb0 net/bluetooth/hci_conn.c:1173 hci_conn_complete_evt+0x3c7/0x1040 net/bluetooth/hci_event.c:3199 hci_event_func net/bluetooth/hci_event.c:7477 [inline] hci_event_packet+0x7e0/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Last potentially related work creation: kasan_save_stack+0x3e/0x60 mm/kasan/common.c:47 kasan_record_aux_stack+0xbd/0xd0 mm/kasan/generic.c:548 insert_work+0x3d/0x330 kernel/workqueue.c:2183 __queue_work+0xbd9/0xfe0 kernel/workqueue.c:2345 queue_delayed_work_on+0x18b/0x280 kernel/workqueue.c:2561 pairing_complete+0x1e7/0x2b0 net/bluetooth/mgmt.c:3451 pairing_complete_cb+0x1ac/0x230 net/bluetooth/mgmt.c:3487 hci_connect_cfm include/net/bluetooth/hci_core.h:2064 [inline] hci_conn_failed+0x24d/0x310 net/bluetooth/hci_conn.c:1275 hci_conn_complete_evt+0x3c7/0x1040 net/bluetooth/hci_event.c:3199 hci_event_func net/bluetooth/hci_event.c:7477 [inline] hci_event_packet+0x7e0/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Fixes: aef2aa4fa98e ("Bluetooth: hci_event: Fix creating hci_conn object on error status") Reported-by: Junvyyang, Tencent Zhuque Lab Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 21 +++++++++++++++++++++ net/bluetooth/hci_event.c | 26 +++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/include/net/bluetooth/hci_core.h 
b/include/net/bluetooth/hci_core.h index 6906af7a8f2412..6560b32f312557 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1245,6 +1245,27 @@ static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_role(struct hci_dev *hdev, + __u8 type, __u8 role, + bdaddr_t *ba) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == type && c->role == role && !bacmp(&c->dst, ba)) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev, bdaddr_t *ba, __u8 ba_type) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 97f543824bb054..fe49e8a7969ffc 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3087,8 +3087,18 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + /* Check for existing connection: + * + * 1. If it doesn't exist then it must be receiver/slave role. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like a fuzzing + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); - if (!conn) { + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. */ @@ -5638,8 +5648,18 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr); - if (!conn) { + /* Check for existing connection: + * + * 1. If it doesn't exist then use the role to create a new object. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like a fuzzing + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ + conn = hci_conn_hash_lookup_role(hdev, LE_LINK, role, bdaddr); + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. */ From b549113738e8c751b613118032a724b772aa83f2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 10 Sep 2025 12:42:43 +0200 Subject: [PATCH 2384/3206] futex: Prevent use-after-free during requeue-PI syzbot managed to trigger the following race:

T1                                      T2
futex_wait_requeue_pi()
  futex_do_wait()
    schedule()
                                        futex_requeue()
                                          futex_proxy_trylock_atomic()
                                            futex_requeue_pi_prepare()
                                            requeue_pi_wake_futex()
                                              futex_requeue_pi_complete()
                                              /* preempt */

* timeout/ signal wakes T1 *

  futex_requeue_pi_wakeup_sync() // Q_REQUEUE_PI_LOCKED
  futex_hash_put()
// back to userland, on stack futex_q is garbage

                                              /* back */
                                              wake_up_state(q->task, TASK_NORMAL);

In this scenario futex_wait_requeue_pi() is able to leave without using futex_q::lock_ptr for synchronization.
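Reduced to the two statements that matter in requeue_pi_wake_futex(), the pre-fix ordering was (a simplified sketch of the code changed below):

	/* Publishing the locked state lets T1 proceed; its on-stack
	 * futex_q (and therefore q) can be gone before the next line
	 * runs. */
	futex_requeue_pi_complete(q, 1);
	wake_up_state(q->task, TASK_NORMAL);	/* potential use-after-free */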
This can be prevented by reading futex_q::task before updating the futex_q::requeue_state. A reference on the task_struct is not needed because requeue_pi_wake_futex() is invoked with a spinlock_t held, which implies an RCU read section. Even if T1 terminates immediately after, the task_struct will remain valid during T2's wake_up_state(). A READ_ONCE on futex_q::task before futex_requeue_pi_complete() is enough because it ensures that the variable is read before the state is updated. Read futex_q::task before updating the requeue state and use it for the subsequent wakeup. Fixes: 07d91ef510fb1 ("futex: Prevent requeue_pi() lock nesting issue on RT") Reported-by: syzbot+034246a838a10d181e78@syzkaller.appspotmail.com Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Closes: https://lore.kernel.org/all/68b75989.050a0220.3db4df.01dd.GAE@google.com/ --- kernel/futex/requeue.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index c716a66f86929c..d818b4d47f1bad 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -230,8 +230,9 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - q->key = *key; + struct task_struct *task; + q->key = *key; __futex_unqueue(q); WARN_ON(!q->rt_waiter); @@ -243,10 +244,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, futex_hash_get(hb); q->drop_hb_ref = true; q->lock_ptr = &hb->lock; + task = READ_ONCE(q->task); /* Signal locked state to the waiter */ futex_requeue_pi_complete(q, 1); - wake_up_state(q->task, TASK_NORMAL); + wake_up_state(task, TASK_NORMAL); } /** From 6b54082c3ed4dc9821cdf0edb17302355cc5bb45 Mon Sep 17 00:00:00 2001 From: Pranav Tyagi Date: Mon, 15 Sep 2025 23:51:54 +0530 Subject: [PATCH 2385/3206] futex: Don't leak robust_list pointer on exec race sys_get_robust_list() and compat_get_robust_list() use ptrace_may_access() to check if the calling task is allowed to access another task's robust_list pointer. This check is racy against a concurrent exec() in the target process. During exec(), a task may transition from a non-privileged binary to a privileged one (e.g., setuid binary) and its credentials/memory mappings may change. If get_robust_list() performs ptrace_may_access() before this transition, it may erroneously allow access to sensitive information after the target becomes privileged. A racy access allows an attacker to exploit a window during which ptrace_may_access() passes before a target process transitions to a privileged state via exec(). For example, consider a non-privileged task T that is about to execute a setuid-root binary. An attacker task A calls get_robust_list(T) while T is still unprivileged. Since ptrace_may_access() checks permissions based on current credentials, it succeeds. However, if T begins exec immediately afterwards, it becomes privileged and may change its memory mappings. Because get_robust_list() proceeds to access T->robust_list without synchronizing with exec(), it may read user-space pointers from a now-privileged process. This violates the intended post-exec access restrictions and could expose sensitive memory addresses or be used as a primitive in a larger exploit chain. Consequently, the race can lead to unauthorized disclosure of information across privilege boundaries and poses a potential security risk.
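For context, the attacker-side primitive is nothing more than the syscall itself; a minimal user-space sketch (the program is hypothetical, the syscall and types are real):

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>

	int main(int argc, char **argv)
	{
		struct robust_list_head *head;
		size_t len;
		pid_t pid = argc > 1 ? atoi(argv[1]) : 0;	/* 0 = calling task */

		/* Succeeds only if ptrace_may_access() allows it; the fix
		 * below makes that check stable against a concurrent
		 * exec() in the target. */
		if (syscall(SYS_get_robust_list, pid, &head, &len) != 0) {
			perror("get_robust_list");
			return EXIT_FAILURE;
		}
		printf("robust list head: %p (len %zu)\n", (void *)head, len);
		return EXIT_SUCCESS;
	}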
Take a read lock on signal->exec_update_lock prior to invoking ptrace_may_access() and accessing the robust_list/compat_robust_list. This ensures that the target task's exec state remains stable during the check, allowing for consistent and synchronized validation of credentials. Suggested-by: Jann Horn Signed-off-by: Pranav Tyagi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/linux-fsdevel/1477863998-3298-5-git-send-email-jann@thejh.net/ Link: https://github.com/KSPP/linux/issues/119 --- kernel/futex/syscalls.c | 106 +++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 4b6da9116aa6c3..880c9bf2f31504 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -39,6 +39,56 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, return 0; } +static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat) +{ +#ifdef CONFIG_COMPAT + if (compat) + return p->compat_robust_list; +#endif + return p->robust_list; +} + +static void __user *futex_get_robust_list_common(int pid, bool compat) +{ + struct task_struct *p = current; + void __user *head; + int ret; + + scoped_guard(rcu) { + if (pid) { + p = find_task_by_vpid(pid); + if (!p) + return (void __user *)ERR_PTR(-ESRCH); + } + get_task_struct(p); + } + + /* + * Hold exec_update_lock to serialize with concurrent exec() + * so ptrace_may_access() is checked against stable credentials + */ + ret = down_read_killable(&p->signal->exec_update_lock); + if (ret) + goto err_put; + + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) + goto err_unlock; + + head = futex_task_robust_list(p, compat); + + up_read(&p->signal->exec_update_lock); + put_task_struct(p); + + return head; + +err_unlock: + up_read(&p->signal->exec_update_lock); +err_put: + put_task_struct(p); + return (void __user *)ERR_PTR(ret); +} + /** * sys_get_robust_list() - Get the robust-futex list head of a task * @pid: pid of the process [zero for current task] @@ -49,36 +99,14 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, size_t __user *, len_ptr) { - struct robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; + struct robust_list_head __user *head = futex_get_robust_list_common(pid, false); - head = p->robust_list; - rcu_read_unlock(); + if (IS_ERR(head)) + return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, @@ -455,36 +483,14 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, compat_uptr_t __user *, head_ptr, compat_size_t __user *, len_ptr) { - struct compat_robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; + struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true); - head = p->compat_robust_list; - rcu_read_unlock(); + if 
(IS_ERR(head)) + return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; } #endif /* CONFIG_COMPAT */ From f2662ec26b26adb71783fa5e5ee75aff6f18a940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:40 -0300 Subject: [PATCH 2386/3206] selftests: kselftest: Create ksft_print_dbg_msg() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create ksft_print_dbg_msg() so testers can enable extra debug messages when running a test with the flag -d. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- tools/testing/selftests/kselftest.h | 14 ++++++++++++++ tools/testing/selftests/kselftest_harness.h | 13 +++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index c3b6d2604b1e48..8deeb4b72e7338 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,7 @@ struct ksft_count { static struct ksft_count ksft_cnt; static unsigned int ksft_plan; +static bool ksft_debug_enabled; static inline unsigned int ksft_test_num(void) { @@ -175,6 +177,18 @@ static inline __printf(1, 2) void ksft_print_msg(const char *msg, ...) va_end(args); } +static inline void ksft_print_dbg_msg(const char *msg, ...) +{ + va_list args; + + if (!ksft_debug_enabled) + return; + + va_start(args, msg); + ksft_print_msg(msg, args); + va_end(args); +} + static inline void ksft_perror(const char *msg) { ksft_print_msg("%s: %s (%d)\n", msg, strerror(errno), errno); diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 2925e47db995d7..ffefd2704ca8f6 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -1091,7 +1091,7 @@ static int test_harness_argv_check(int argc, char **argv) { int opt; - while ((opt = getopt(argc, argv, "hlF:f:V:v:t:T:r:")) != -1) { + while ((opt = getopt(argc, argv, "dhlF:f:V:v:t:T:r:")) != -1) { switch (opt) { case 'f': case 'F': @@ -1104,12 +1104,16 @@ static int test_harness_argv_check(int argc, char **argv) case 'l': test_harness_list_tests(); return KSFT_SKIP; + case 'd': + ksft_debug_enabled = true; + break; case 'h': default: fprintf(stderr, - "Usage: %s [-h|-l] [-t|-T|-v|-V|-f|-F|-r name]\n" + "Usage: %s [-h|-l|-d] [-t|-T|-v|-V|-f|-F|-r name]\n" "\t-h print help\n" "\t-l list all tests\n" + "\t-d enable debug prints\n" "\n" "\t-t name include test\n" "\t-T name exclude test\n" @@ -1142,8 +1146,9 @@ static bool test_enabled(int argc, char **argv, int opt; optind = 1; - while ((opt = getopt(argc, argv, "F:f:V:v:t:T:r:")) != -1) { - has_positive |= islower(opt); + while ((opt = getopt(argc, argv, "dF:f:V:v:t:T:r:")) != -1) { + if (opt != 'd') + has_positive |= islower(opt); switch (tolower(opt)) { case 't': From d060495a37cb437459b3d202572d48a73bee8a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:41 -0300 Subject: [PATCH 2387/3206] selftests/futex: Refactor futex_requeue_pi with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_requeue_pi test to use kselftest_harness header instead of futex's logging header. 
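For readers unfamiliar with the harness, the fixture/variant machinery adopted below reduces to this minimal stand-alone sketch (the names are illustrative, the macros are the real kselftest_harness.h API):

	#include "../../kselftest_harness.h"

	FIXTURE(demo) {};
	FIXTURE_SETUP(demo) {};
	FIXTURE_TEARDOWN(demo) {};

	/* Per-variant data; every TEST_F() is re-run once per variant. */
	FIXTURE_VARIANT(demo) {
		long timeout_ns;
	};

	FIXTURE_VARIANT_ADD(demo, no_timeout)   { .timeout_ns = 0, };
	FIXTURE_VARIANT_ADD(demo, with_timeout) { .timeout_ns = 500000, };

	TEST_F(demo, variant_is_visible)
	{
		ASSERT_GE(variant->timeout_ns, 0);
	}

	TEST_HARNESS_MAIN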
Use kselftest fixture feature to make it easy to repeat the same test with different parameters. With that, drop all repetitive test calls from run.sh. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../futex/functional/futex_requeue_pi.c | 266 ++++++++---------- .../testing/selftests/futex/functional/run.sh | 26 +- 2 files changed, 124 insertions(+), 168 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi.c b/tools/testing/selftests/futex/functional/futex_requeue_pi.c index 215c6cb539b4ab..f299d75848cd43 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi.c @@ -26,11 +26,11 @@ #include #include #include + #include "atomic.h" #include "futextest.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue-pi" #define MAX_WAKE_ITERS 1000 #define THREAD_MAX 10 #define SIGNAL_PERIOD_US 100 @@ -42,12 +42,6 @@ futex_t f1 = FUTEX_INITIALIZER; futex_t f2 = FUTEX_INITIALIZER; futex_t wake_complete = FUTEX_INITIALIZER; -/* Test option defaults */ -static long timeout_ns; -static int broadcast; -static int owner; -static int locked; - struct thread_arg { long id; struct timespec *timeout; @@ -56,18 +50,73 @@ struct thread_arg { }; #define THREAD_ARG_INITIALIZER { 0, NULL, 0, 0 } -void usage(char *prog) +FIXTURE(args) { - printf("Usage: %s\n", prog); - printf(" -b Broadcast wakeup (all waiters)\n"); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -l Lock the pi futex across requeue\n"); - printf(" -o Use a third party pi futex owner during requeue (cancels -l)\n"); - printf(" -t N Timeout in nanoseconds (default: 0)\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} +}; + +FIXTURE_SETUP(args) +{ +}; + +FIXTURE_TEARDOWN(args) +{ +}; + +FIXTURE_VARIANT(args) +{ + long timeout_ns; + bool broadcast; + bool owner; + bool locked; +}; + +/* + * For a given timeout value, this macro creates a test input with all the + * possible combinations of valid arguments + */ +#define FIXTURE_VARIANT_ADD_TIMEOUT(timeout) \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout) \ +{ \ + .timeout_ns = timeout, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast_locked) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ + .locked = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast_owner) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ + .owner = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_locked) \ +{ \ + .timeout_ns = timeout, \ + .locked = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_owner) \ +{ \ + .timeout_ns = timeout, \ + .owner = true, \ +}; \ + +FIXTURE_VARIANT_ADD_TIMEOUT(0); +FIXTURE_VARIANT_ADD_TIMEOUT(5000); +FIXTURE_VARIANT_ADD_TIMEOUT(500000); +FIXTURE_VARIANT_ADD_TIMEOUT(2000000000); int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, int policy, int prio) @@ -81,26 +130,26 @@ int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); if (ret) { - error("pthread_attr_setinheritsched\n", ret); + ksft_exit_fail_msg("pthread_attr_setinheritsched\n"); return -1; } ret = pthread_attr_setschedpolicy(&attr, policy); if (ret) { - error("pthread_attr_setschedpolicy\n", ret); + 
ksft_exit_fail_msg("pthread_attr_setschedpolicy\n"); return -1; } schedp.sched_priority = prio; ret = pthread_attr_setschedparam(&attr, &schedp); if (ret) { - error("pthread_attr_setschedparam\n", ret); + ksft_exit_fail_msg("pthread_attr_setschedparam\n"); return -1; } ret = pthread_create(pth, &attr, func, arg); if (ret) { - error("pthread_create\n", ret); + ksft_exit_fail_msg("pthread_create\n"); return -1; } return 0; @@ -112,7 +161,7 @@ void *waiterfn(void *arg) struct thread_arg *args = (struct thread_arg *)arg; futex_t old_val; - info("Waiter %ld: running\n", args->id); + ksft_print_dbg_msg("Waiter %ld: running\n", args->id); /* Each thread sleeps for a different amount of time * This is to avoid races, because we don't lock the * external mutex here */ @@ -120,26 +169,25 @@ void *waiterfn(void *arg) old_val = f1; atomic_inc(&waiters_blocked); - info("Calling futex_wait_requeue_pi: %p (%u) -> %p\n", + ksft_print_dbg_msg("Calling futex_wait_requeue_pi: %p (%u) -> %p\n", &f1, f1, &f2); args->ret = futex_wait_requeue_pi(&f1, old_val, &f2, args->timeout, FUTEX_PRIVATE_FLAG); - info("waiter %ld woke with %d %s\n", args->id, args->ret, + ksft_print_dbg_msg("waiter %ld woke with %d %s\n", args->id, args->ret, args->ret < 0 ? strerror(errno) : ""); atomic_inc(&waiters_woken); if (args->ret < 0) { if (args->timeout && errno == ETIMEDOUT) args->ret = 0; else { - args->ret = RET_ERROR; - error("futex_wait_requeue_pi\n", errno); + ksft_exit_fail_msg("futex_wait_requeue_pi\n"); } futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - info("Waiter %ld: exiting with %d\n", args->id, args->ret); + ksft_print_dbg_msg("Waiter %ld: exiting with %d\n", args->id, args->ret); pthread_exit((void *)&args->ret); } @@ -152,14 +200,14 @@ void *broadcast_wakerfn(void *arg) int nr_wake = 1; int i = 0; - info("Waker: waiting for waiters to block\n"); + ksft_print_dbg_msg("Waker: waiting for waiters to block\n"); while (waiters_blocked.val < THREAD_MAX) usleep(1000); usleep(1000); - info("Waker: Calling broadcast\n"); + ksft_print_dbg_msg("Waker: Calling broadcast\n"); if (args->lock) { - info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2); futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } continue_requeue: @@ -167,16 +215,14 @@ void *broadcast_wakerfn(void *arg) args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, nr_wake, nr_requeue, FUTEX_PRIVATE_FLAG); if (args->ret < 0) { - args->ret = RET_ERROR; - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); } else if (++i < MAX_WAKE_ITERS) { task_count += args->ret; if (task_count < THREAD_MAX - waiters_woken.val) goto continue_requeue; } else { - error("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n", - 0, MAX_WAKE_ITERS, task_count, THREAD_MAX); - args->ret = RET_ERROR; + ksft_exit_fail_msg("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n", + MAX_WAKE_ITERS, task_count, THREAD_MAX); } futex_wake(&wake_complete, 1, FUTEX_PRIVATE_FLAG); @@ -187,7 +233,7 @@ void *broadcast_wakerfn(void *arg) if (args->ret > 0) args->ret = task_count; - info("Waker: exiting with %d\n", args->ret); + ksft_print_dbg_msg("Waker: exiting with %d\n", args->ret); pthread_exit((void *)&args->ret); } @@ -200,20 +246,20 @@ void *signal_wakerfn(void *arg) int nr_wake = 1; int i = 0; - info("Waker: waiting for waiters to block\n"); + ksft_print_dbg_msg("Waker: waiting for 
waiters to block\n"); while (waiters_blocked.val < THREAD_MAX) usleep(1000); usleep(1000); while (task_count < THREAD_MAX && waiters_woken.val < THREAD_MAX) { - info("task_count: %d, waiters_woken: %d\n", + ksft_print_dbg_msg("task_count: %d, waiters_woken: %d\n", task_count, waiters_woken.val); if (args->lock) { - info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", - f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", + f2, &f2); futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } - info("Waker: Calling signal\n"); + ksft_print_dbg_msg("Waker: Calling signal\n"); /* cond_signal */ old_val = f1; args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, @@ -221,28 +267,23 @@ void *signal_wakerfn(void *arg) FUTEX_PRIVATE_FLAG); if (args->ret < 0) args->ret = -errno; - info("futex: %x\n", f2); + ksft_print_dbg_msg("futex: %x\n", f2); if (args->lock) { - info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", - f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", + f2, &f2); futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); } - info("futex: %x\n", f2); - if (args->ret < 0) { - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); - args->ret = RET_ERROR; - break; - } + ksft_print_dbg_msg("futex: %x\n", f2); + if (args->ret < 0) + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); task_count += args->ret; usleep(SIGNAL_PERIOD_US); i++; /* we have to loop at least THREAD_MAX times */ if (i > MAX_WAKE_ITERS + THREAD_MAX) { - error("max signaling iterations (%d) reached, giving up on pending waiters.\n", - 0, MAX_WAKE_ITERS + THREAD_MAX); - args->ret = RET_ERROR; - break; + ksft_exit_fail_msg("max signaling iterations (%d) reached, giving up on pending waiters.\n", + MAX_WAKE_ITERS + THREAD_MAX); } } @@ -251,8 +292,8 @@ void *signal_wakerfn(void *arg) if (args->ret >= 0) args->ret = task_count; - info("Waker: exiting with %d\n", args->ret); - info("Waker: waiters_woken: %d\n", waiters_woken.val); + ksft_print_dbg_msg("Waker: exiting with %d\n", args->ret); + ksft_print_dbg_msg("Waker: waiters_woken: %d\n", waiters_woken.val); pthread_exit((void *)&args->ret); } @@ -269,35 +310,40 @@ void *third_party_blocker(void *arg) ret2 = futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); out: - if (args->ret || ret2) { - error("third_party_blocker() futex error", 0); - args->ret = RET_ERROR; - } + if (args->ret || ret2) + ksft_exit_fail_msg("third_party_blocker() futex error"); pthread_exit((void *)&args->ret); } -int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) +TEST_F(args, futex_requeue_pi) { - void *(*wakerfn)(void *) = signal_wakerfn; struct thread_arg blocker_arg = THREAD_ARG_INITIALIZER; struct thread_arg waker_arg = THREAD_ARG_INITIALIZER; pthread_t waiter[THREAD_MAX], waker, blocker; - struct timespec ts, *tsp = NULL; + void *(*wakerfn)(void *) = signal_wakerfn; + bool third_party_owner = variant->owner; + long timeout_ns = variant->timeout_ns; + bool broadcast = variant->broadcast; struct thread_arg args[THREAD_MAX]; - int *waiter_ret; - int i, ret = RET_PASS; + struct timespec ts, *tsp = NULL; + bool lock = variant->locked; + int *waiter_ret, i, ret = 0; + + ksft_print_msg( + "\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n", + broadcast, lock, third_party_owner, timeout_ns); if (timeout_ns) { time_t secs; - info("timeout_ns = %ld\n", timeout_ns); + ksft_print_dbg_msg("timeout_ns = %ld\n", timeout_ns); ret = clock_gettime(CLOCK_MONOTONIC, &ts); secs = (ts.tv_nsec + timeout_ns) / 1000000000; ts.tv_nsec = ((int64_t)ts.tv_nsec + timeout_ns) % 
1000000000; ts.tv_sec += secs; - info("ts.tv_sec = %ld\n", ts.tv_sec); - info("ts.tv_nsec = %ld\n", ts.tv_nsec); + ksft_print_dbg_msg("ts.tv_sec = %ld\n", ts.tv_sec); + ksft_print_dbg_msg("ts.tv_nsec = %ld\n", ts.tv_nsec); tsp = &ts; } @@ -307,10 +353,7 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) if (third_party_owner) { if (create_rt_thread(&blocker, third_party_blocker, (void *)&blocker_arg, SCHED_FIFO, 1)) { - error("Creating third party blocker thread failed\n", - errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating third party blocker thread failed\n"); } } @@ -318,20 +361,16 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) for (i = 0; i < THREAD_MAX; i++) { args[i].id = i; args[i].timeout = tsp; - info("Starting thread %d\n", i); + ksft_print_dbg_msg("Starting thread %d\n", i); if (create_rt_thread(&waiter[i], waiterfn, (void *)&args[i], SCHED_FIFO, 1)) { - error("Creating waiting thread failed\n", errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating waiting thread failed\n"); } } waker_arg.lock = lock; if (create_rt_thread(&waker, wakerfn, (void *)&waker_arg, SCHED_FIFO, 1)) { - error("Creating waker thread failed\n", errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating waker thread failed\n"); } /* Wait for threads to finish */ @@ -345,7 +384,6 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) pthread_join(blocker, NULL); pthread_join(waker, NULL); -out: if (!ret) { if (*waiter_ret) ret = *waiter_ret; @@ -355,66 +393,8 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) ret = blocker_arg.ret; } - return ret; + if (ret) + ksft_test_result_fail("fail"); } -int main(int argc, char *argv[]) -{ - char *test_name; - int c, ret; - - while ((c = getopt(argc, argv, "bchlot:v:")) != -1) { - switch (c) { - case 'b': - broadcast = 1; - break; - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'l': - locked = 1; - break; - case 'o': - owner = 1; - locked = 0; - break; - case 't': - timeout_ns = atoi(optarg); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test requeue functionality\n", basename(argv[0])); - ksft_print_msg( - "\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n", - broadcast, locked, owner, timeout_ns); - - ret = asprintf(&test_name, - "%s broadcast=%d locked=%d owner=%d timeout=%ldns", - TEST_NAME, broadcast, locked, owner, timeout_ns); - if (ret < 0) { - ksft_print_msg("Failed to generate test name\n"); - test_name = TEST_NAME; - } - - /* - * FIXME: unit_test is obsolete now that we parse options and the - * various style of runs are done by run.sh - simplify the code and move - * unit_test into main() - */ - ret = unit_test(broadcast, locked, owner, timeout_ns); - - print_result(test_name, ret); - return ret; -} +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 5470088dc4dfb6..d34e2235ac10d2 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -32,31 +32,7 @@ fi echo -# requeue pi testing -# without timeouts -./futex_requeue_pi $COLOR -./futex_requeue_pi $COLOR -b -./futex_requeue_pi $COLOR -b -l -./futex_requeue_pi $COLOR -b -o -./futex_requeue_pi $COLOR 
-l -./futex_requeue_pi $COLOR -o -# with timeouts -./futex_requeue_pi $COLOR -b -l -t 5000 -./futex_requeue_pi $COLOR -l -t 5000 -./futex_requeue_pi $COLOR -b -l -t 500000 -./futex_requeue_pi $COLOR -l -t 500000 -./futex_requeue_pi $COLOR -b -t 5000 -./futex_requeue_pi $COLOR -t 5000 -./futex_requeue_pi $COLOR -b -t 500000 -./futex_requeue_pi $COLOR -t 500000 -./futex_requeue_pi $COLOR -b -o -t 5000 -./futex_requeue_pi $COLOR -l -t 5000 -./futex_requeue_pi $COLOR -b -o -t 500000 -./futex_requeue_pi $COLOR -l -t 500000 -# with long timeout -./futex_requeue_pi $COLOR -b -l -t 2000000000 -./futex_requeue_pi $COLOR -l -t 2000000000 - +./futex_requeue_pi echo ./futex_requeue_pi_mismatched_ops $COLOR From 65a12ce20fb27067c0d946e7dcf83b9c9ff365fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:42 -0300 Subject: [PATCH 2388/3206] selftests/futex: Refactor futex_requeue_pi_mismatched_ops with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_requeue_pi_mismatched_ops test to use kselftest_harness header instead of futex's logging header. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../futex_requeue_pi_mismatched_ops.c | 86 +++++-------------- .../testing/selftests/futex/functional/run.sh | 2 +- 2 files changed, 23 insertions(+), 65 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c index d0a4d332ea4413..77135a22a583e1 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c @@ -23,67 +23,32 @@ #include #include #include -#include "futextest.h" -#include "logging.h" -#define TEST_NAME "futex-requeue-pi-mismatched-ops" +#include "futextest.h" +#include "../../kselftest_harness.h" futex_t f1 = FUTEX_INITIALIZER; futex_t f2 = FUTEX_INITIALIZER; int child_ret = 0; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *blocking_child(void *arg) { child_ret = futex_wait(&f1, f1, NULL, FUTEX_PRIVATE_FLAG); if (child_ret < 0) { child_ret = -errno; - error("futex_wait\n", errno); + ksft_exit_fail_msg("futex_wait\n"); } return (void *)&child_ret; } -int main(int argc, char *argv[]) +TEST(requeue_pi_mismatched_ops) { - int ret = RET_PASS; pthread_t child; - int c; + int ret; - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Detect mismatched requeue_pi operations\n", - basename(argv[0])); + if (pthread_create(&child, NULL, blocking_child, NULL)) + ksft_exit_fail_msg("pthread_create\n"); - if (pthread_create(&child, NULL, blocking_child, NULL)) { - error("pthread_create\n", errno); - ret = RET_ERROR; - goto out; - } /* Allow the child to block in the kernel. */ sleep(1); @@ -102,34 +67,27 @@ int main(int argc, char *argv[]) * FUTEX_WAKE. 
*/ ret = futex_wake(&f1, 1, FUTEX_PRIVATE_FLAG); - if (ret == 1) { - ret = RET_PASS; - } else if (ret < 0) { - error("futex_wake\n", errno); - ret = RET_ERROR; - } else { - error("futex_wake did not wake the child\n", 0); - ret = RET_ERROR; - } + if (ret == 1) + ret = 0; + else if (ret < 0) + ksft_exit_fail_msg("futex_wake\n"); + else + ksft_exit_fail_msg("futex_wake did not wake the child\n"); } else { - error("futex_cmp_requeue_pi\n", errno); - ret = RET_ERROR; + ksft_exit_fail_msg("futex_cmp_requeue_pi\n"); } } else if (ret > 0) { - fail("futex_cmp_requeue_pi failed to detect the mismatch\n"); - ret = RET_FAIL; + ksft_test_result_fail("futex_cmp_requeue_pi failed to detect the mismatch\n"); } else { - error("futex_cmp_requeue_pi found no waiters\n", 0); - ret = RET_ERROR; + ksft_exit_fail_msg("futex_cmp_requeue_pi found no waiters\n"); } pthread_join(child, NULL); - if (!ret) - ret = child_ret; - - out: - /* If the kernel crashes, we shouldn't return at all. */ - print_result(TEST_NAME, ret); - return ret; + if (!ret && !child_ret) + ksft_test_result_pass("futex_requeue_pi_mismatched_ops passed\n"); + else + ksft_test_result_fail("futex_requeue_pi_mismatched_ops failed\n"); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index d34e2235ac10d2..cc1b743668a208 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -35,7 +35,7 @@ echo ./futex_requeue_pi echo -./futex_requeue_pi_mismatched_ops $COLOR +./futex_requeue_pi_mismatched_ops echo ./futex_requeue_pi_signal_restart $COLOR From 2ef0615685094f6c7ad7e18498759f90015b7de8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:43 -0300 Subject: [PATCH 2389/3206] selftests/futex: Refactor futex_requeue_pi_signal_restart with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_requeue_pi_signal_restart test to use kselftest_harness header instead of futex's logging header.
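The shape of the conversion is the same across this series: the getopt()/usage() option parsing, TEST_NAME and the RET_PASS/RET_ERROR/RET_FAIL bookkeeping go away, and the harness supplies main(). A minimal sketch of the pattern the refactored tests converge on (the test name and messages are illustrative, not taken from any one patch):

  #include "../../kselftest_harness.h"

  TEST(example)
  {
          int res = 0;    /* stand-in for a futex operation's result */

          if (res < 0)
                  ksft_exit_fail_msg("operation failed\n");

          ksft_test_result_pass("example passed\n");
  }

  TEST_HARNESS_MAIN

The logging.h helpers map onto harness calls throughout: error() becomes ksft_exit_fail_msg(), info() becomes ksft_print_dbg_msg(), and fail() becomes ksft_test_result_fail().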
Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../futex_requeue_pi_signal_restart.c | 129 +++++------------- .../testing/selftests/futex/functional/run.sh | 2 +- 2 files changed, 38 insertions(+), 93 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c index c6b8f32990c875..e34ee0f9ebccdb 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c @@ -24,11 +24,11 @@ #include #include #include + #include "atomic.h" #include "futextest.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue-pi-signal-restart" #define DELAY_US 100 futex_t f1 = FUTEX_INITIALIZER; @@ -37,15 +37,6 @@ atomic_t requeued = ATOMIC_INITIALIZER; int waiter_ret = 0; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, int policy, int prio) { @@ -57,35 +48,28 @@ int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, memset(&schedp, 0, sizeof(schedp)); ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); - if (ret) { - error("pthread_attr_setinheritsched\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_attr_setinheritsched\n"); ret = pthread_attr_setschedpolicy(&attr, policy); - if (ret) { - error("pthread_attr_setschedpolicy\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_attr_setschedpolicy\n"); schedp.sched_priority = prio; ret = pthread_attr_setschedparam(&attr, &schedp); - if (ret) { - error("pthread_attr_setschedparam\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_attr_setschedparam\n"); ret = pthread_create(pth, &attr, func, arg); - if (ret) { - error("pthread_create\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_create\n"); + return 0; } void handle_signal(int signo) { - info("signal received %s requeue\n", + ksft_print_dbg_msg("signal received %s requeue\n", requeued.val ? 
"after" : "prior to"); } @@ -94,78 +78,46 @@ void *waiterfn(void *arg) unsigned int old_val; int res; - waiter_ret = RET_PASS; - - info("Waiter running\n"); - info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Waiter running\n"); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); old_val = f1; res = futex_wait_requeue_pi(&f1, old_val, &(f2), NULL, FUTEX_PRIVATE_FLAG); if (!requeued.val || errno != EWOULDBLOCK) { - fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n", + ksft_test_result_fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n", res, strerror(errno)); - info("w2:futex: %x\n", f2); + ksft_print_dbg_msg("w2:futex: %x\n", f2); if (!res) futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - waiter_ret = RET_FAIL; } - info("Waiter exiting with %d\n", waiter_ret); pthread_exit(NULL); } -int main(int argc, char *argv[]) +TEST(futex_requeue_pi_signal_restart) { unsigned int old_val; struct sigaction sa; pthread_t waiter; - int c, res, ret = RET_PASS; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test signal handling during requeue_pi\n", - basename(argv[0])); - ksft_print_msg("\tArguments: \n"); + int res; sa.sa_handler = handle_signal; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; - if (sigaction(SIGUSR1, &sa, NULL)) { - error("sigaction\n", errno); - exit(1); - } + if (sigaction(SIGUSR1, &sa, NULL)) + ksft_exit_fail_msg("sigaction\n"); - info("m1:f2: %x\n", f2); - info("Creating waiter\n"); + ksft_print_dbg_msg("m1:f2: %x\n", f2); + ksft_print_dbg_msg("Creating waiter\n"); res = create_rt_thread(&waiter, waiterfn, NULL, SCHED_FIFO, 1); - if (res) { - error("Creating waiting thread failed", res); - ret = RET_ERROR; - goto out; - } + if (res) + ksft_exit_fail_msg("Creating waiting thread failed"); - info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); - info("m2:f2: %x\n", f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("m2:f2: %x\n", f2); futex_lock_pi(&f2, 0, 0, FUTEX_PRIVATE_FLAG); - info("m3:f2: %x\n", f2); + ksft_print_dbg_msg("m3:f2: %x\n", f2); while (1) { /* @@ -173,11 +125,11 @@ int main(int argc, char *argv[]) * restart futex_wait_requeue_pi() in the kernel. Wait for the * waiter to block on f1 again. */ - info("Issuing SIGUSR1 to waiter\n"); + ksft_print_dbg_msg("Issuing SIGUSR1 to waiter\n"); pthread_kill(waiter, SIGUSR1); usleep(DELAY_US); - info("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n"); + ksft_print_dbg_msg("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n"); old_val = f1; res = futex_cmp_requeue_pi(&f1, old_val, &(f2), 1, 0, FUTEX_PRIVATE_FLAG); @@ -191,12 +143,10 @@ int main(int argc, char *argv[]) atomic_set(&requeued, 1); break; } else if (res < 0) { - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); - ret = RET_ERROR; - break; + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); } } - info("m4:f2: %x\n", f2); + ksft_print_dbg_msg("m4:f2: %x\n", f2); /* * Signal the waiter after requeue, waiter should return from @@ -204,19 +154,14 @@ int main(int argc, char *argv[]) * futex_unlock_pi() can't happen before the signal wakeup is detected * in the kernel. 
*/ - info("Issuing SIGUSR1 to waiter\n"); + ksft_print_dbg_msg("Issuing SIGUSR1 to waiter\n"); pthread_kill(waiter, SIGUSR1); - info("Waiting for waiter to return\n"); + ksft_print_dbg_msg("Waiting for waiter to return\n"); pthread_join(waiter, NULL); - info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2); futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - info("m5:f2: %x\n", f2); - - out: - if (ret == RET_PASS && waiter_ret) - ret = waiter_ret; - - print_result(TEST_NAME, ret); - return ret; + ksft_print_dbg_msg("m5:f2: %x\n", f2); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index cc1b743668a208..6d48a7ea95cf99 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -38,7 +38,7 @@ echo ./futex_requeue_pi_mismatched_ops echo -./futex_requeue_pi_signal_restart $COLOR +./futex_requeue_pi_signal_restart echo ./futex_wait_timeout $COLOR From 0c02abf6389e01a3b78091394bc9624d5f1d84fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:44 -0300 Subject: [PATCH 2390/3206] selftests/futex: Refactor futex_wait_timeout with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_wait_timeout test to use kselftest_harness header instead of futex's logging header. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../futex/functional/futex_wait_timeout.c | 139 ++++++++---------- .../testing/selftests/futex/functional/run.sh | 2 +- 2 files changed, 61 insertions(+), 80 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c index d183f878360bcd..0c8766aced2e4d 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c +++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c @@ -16,26 +16,15 @@ *****************************************************************************/ #include + #include "futextest.h" #include "futex2test.h" -#include "logging.h" - -#define TEST_NAME "futex-wait-timeout" +#include "../../kselftest_harness.h" static long timeout_ns = 100000; /* 100us default timeout */ static futex_t futex_pi; static pthread_barrier_t barrier; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -t N Timeout in nanoseconds (default: 100,000)\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - /* * Get a PI lock and hold it forever, so the main thread lock_pi will block * and we can test the timeout @@ -47,13 +36,13 @@ void *get_pi_lock(void *arg) ret = futex_lock_pi(&futex_pi, NULL, 0, 0); if (ret != 0) - error("futex_lock_pi failed\n", ret); + ksft_exit_fail_msg("futex_lock_pi failed\n"); pthread_barrier_wait(&barrier); /* Blocks forever */ ret = futex_wait(&lock, 0, NULL, 0); - error("futex_wait failed\n", ret); + ksft_exit_fail_msg("futex_wait failed\n"); return NULL; } @@ -61,12 +50,11 @@ void *get_pi_lock(void *arg) /* * Check if the function returned the expected error */ -static void test_timeout(int res, int *ret, char *test_name, int err) +static void test_timeout(int res, char *test_name, int err) { if (!res || errno != err) { ksft_test_result_fail("%s 
returned %d\n", test_name, res < 0 ? errno : res); - *ret = RET_FAIL; } else { ksft_test_result_pass("%s succeeds\n", test_name); } @@ -78,10 +66,8 @@ static void test_timeout(int res, int *ret, char *test_name, int err) static int futex_get_abs_timeout(clockid_t clockid, struct timespec *to, long timeout_ns) { - if (clock_gettime(clockid, to)) { - error("clock_gettime failed\n", errno); - return errno; - } + if (clock_gettime(clockid, to)) + ksft_exit_fail_msg("clock_gettime failed\n"); to->tv_nsec += timeout_ns; @@ -93,83 +79,66 @@ static int futex_get_abs_timeout(clockid_t clockid, struct timespec *to, return 0; } -int main(int argc, char *argv[]) +TEST(wait_bitset) { futex_t f1 = FUTEX_INITIALIZER; - int res, ret = RET_PASS; struct timespec to; - pthread_t thread; - int c; - struct futex_waitv waitv = { - .uaddr = (uintptr_t)&f1, - .val = f1, - .flags = FUTEX_32, - .__reserved = 0 - }; - - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 't': - timeout_ns = atoi(optarg); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(9); - ksft_print_msg("%s: Block on a futex and wait for timeout\n", - basename(argv[0])); - ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); - - pthread_barrier_init(&barrier, NULL, 2); - pthread_create(&thread, NULL, get_pi_lock, NULL); + int res; /* initialize relative timeout */ to.tv_sec = 0; to.tv_nsec = timeout_ns; res = futex_wait(&f1, f1, &to, 0); - test_timeout(res, &ret, "futex_wait relative", ETIMEDOUT); + test_timeout(res, "futex_wait relative", ETIMEDOUT); /* FUTEX_WAIT_BITSET with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_bitset(&f1, f1, &to, 1, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_wait_bitset realtime", ETIMEDOUT); + test_timeout(res, "futex_wait_bitset realtime", ETIMEDOUT); /* FUTEX_WAIT_BITSET with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_bitset(&f1, f1, &to, 1, 0); - test_timeout(res, &ret, "futex_wait_bitset monotonic", ETIMEDOUT); + test_timeout(res, "futex_wait_bitset monotonic", ETIMEDOUT); +} + +TEST(requeue_pi) +{ + futex_t f1 = FUTEX_INITIALIZER; + struct timespec to; + int res; /* FUTEX_WAIT_REQUEUE_PI with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_wait_requeue_pi realtime", ETIMEDOUT); + test_timeout(res, "futex_wait_requeue_pi realtime", ETIMEDOUT); /* FUTEX_WAIT_REQUEUE_PI with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, 0); - test_timeout(res, &ret, "futex_wait_requeue_pi monotonic", ETIMEDOUT); + test_timeout(res, "futex_wait_requeue_pi monotonic", ETIMEDOUT); + +} + +TEST(lock_pi) +{ + struct timespec to; + pthread_t thread; + int res; + + /* Create a thread that will lock forever so any waiter will timeout */ + pthread_barrier_init(&barrier, NULL, 2); + pthread_create(&thread, NULL, get_pi_lock, NULL); /* 
Wait until the other thread calls futex_lock_pi() */ pthread_barrier_wait(&barrier); pthread_barrier_destroy(&barrier); + /* * FUTEX_LOCK_PI with CLOCK_REALTIME * Due to historical reasons, FUTEX_LOCK_PI supports only realtime @@ -181,26 +150,38 @@ int main(int argc, char *argv[]) * smaller than realtime and the syscall will timeout immediately. */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_lock_pi(&futex_pi, &to, 0, 0); - test_timeout(res, &ret, "futex_lock_pi realtime", ETIMEDOUT); + test_timeout(res, "futex_lock_pi realtime", ETIMEDOUT); /* Test operations that don't support FUTEX_CLOCK_REALTIME */ res = futex_lock_pi(&futex_pi, NULL, 0, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_lock_pi invalid timeout flag", ENOSYS); + test_timeout(res, "futex_lock_pi invalid timeout flag", ENOSYS); +} + +TEST(waitv) +{ + futex_t f1 = FUTEX_INITIALIZER; + struct futex_waitv waitv = { + .uaddr = (uintptr_t)&f1, + .val = f1, + .flags = FUTEX_32, + .__reserved = 0, + }; + struct timespec to; + int res; /* futex_waitv with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); - test_timeout(res, &ret, "futex_waitv monotonic", ETIMEDOUT); + test_timeout(res, "futex_waitv monotonic", ETIMEDOUT); /* futex_waitv with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_REALTIME); - test_timeout(res, &ret, "futex_waitv realtime", ETIMEDOUT); - - ksft_print_cnts(); - return ret; + test_timeout(res, "futex_waitv realtime", ETIMEDOUT); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 6d48a7ea95cf99..0af695038003b1 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -41,7 +41,7 @@ echo ./futex_requeue_pi_signal_restart echo -./futex_wait_timeout $COLOR +./futex_wait_timeout echo ./futex_wait_wouldblock $COLOR From f5a16834410a89f929657d9b8a5a4011c05293c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:45 -0300 Subject: [PATCH 2391/3206] selftests/futex: Refactor futex_wait_wouldblock with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_wait_wouldblock test to use kselftest_harness header instead of futex's logging header. 
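One detail to keep in mind when reading the two tests below: futex_wait() takes a relative timeout, while futex_waitv() expects an absolute CLOCK_MONOTONIC timespec, which is why the second test reads the clock and normalizes tv_nsec by hand. The same normalization, written as a standalone helper (the name is illustrative; it mirrors futex_get_abs_timeout() from the futex_wait_timeout refactor):

  #include <time.h>

  /* Build "now + timeout_ns" as a normalized absolute timespec. */
  static int abs_timeout(clockid_t clockid, struct timespec *to, long timeout_ns)
  {
          if (clock_gettime(clockid, to))
                  return -1;

          to->tv_nsec += timeout_ns;
          if (to->tv_nsec >= 1000000000) {
                  to->tv_sec++;
                  to->tv_nsec -= 1000000000;
          }
          return 0;
  }

This assumes timeout_ns stays below one second, which holds for every timeout used in these tests.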
Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../futex/functional/futex_wait_wouldblock.c | 76 ++++++------------- .../testing/selftests/futex/functional/run.sh | 2 +- 2 files changed, 24 insertions(+), 54 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c index 2d8230da906429..36b7a54a40851a 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -21,72 +21,44 @@ #include #include #include + #include "futextest.h" #include "futex2test.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-wouldblock" #define timeout_ns 100000 -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - -int main(int argc, char *argv[]) +TEST(futex_wait_wouldblock) { struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; futex_t f1 = FUTEX_INITIALIZER; - int res, ret = RET_PASS; - int c; - struct futex_waitv waitv = { - .uaddr = (uintptr_t)&f1, - .val = f1+1, - .flags = FUTEX_32, - .__reserved = 0 - }; + int res; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(2); - ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", - basename(argv[0])); - - info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + ksft_print_dbg_msg("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); if (!res || errno != EWOULDBLOCK) { ksft_test_result_fail("futex_wait returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wait\n"); } +} - if (clock_gettime(CLOCK_MONOTONIC, &to)) { - error("clock_gettime failed\n", errno); - return errno; - } +TEST(futex_waitv_wouldblock) +{ + struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; + futex_t f1 = FUTEX_INITIALIZER; + struct futex_waitv waitv = { + .uaddr = (uintptr_t)&f1, + .val = f1 + 1, + .flags = FUTEX_32, + .__reserved = 0, + }; + int res; + + if (clock_gettime(CLOCK_MONOTONIC, &to)) + ksft_exit_fail_msg("clock_gettime failed %d\n", errno); to.tv_nsec += timeout_ns; @@ -95,17 +67,15 @@ int main(int argc, char *argv[]) to.tv_nsec -= 1000000000; } - info("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + ksft_print_dbg_msg("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); if (!res || errno != EWOULDBLOCK) { ksft_test_result_fail("futex_waitv returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 0af695038003b1..a5246d59849853 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -44,7 +44,7 @@ echo ./futex_wait_timeout echo -./futex_wait_wouldblock $COLOR +./futex_wait_wouldblock echo ./futex_wait_uninitialized_heap $COLOR From af3c79f8575d751914d1175c02dc024ecc173c58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:46 -0300 Subject: [PATCH 2392/3206] selftests/futex: Refactor futex_wait_uninitialized_heap with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_wait_uninitialized_heap test to use kselftest_harness header instead of futex's logging header. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../futex_wait_uninitialized_heap.c | 76 +++++-------------- .../testing/selftests/futex/functional/run.sh | 2 +- 2 files changed, 19 insertions(+), 59 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c index ed9cd07e31c1a9..ce2301500d839c 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c +++ b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c @@ -29,95 +29,55 @@ #include #include -#include "logging.h" #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-uninitialized-heap" #define WAIT_US 5000000 static int child_blocked = 1; -static int child_ret; +static bool child_ret; void *buf; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *wait_thread(void *arg) { int res; - child_ret = RET_PASS; + child_ret = true; res = futex_wait(buf, 1, NULL, 0); child_blocked = 0; if (res != 0 && errno != EWOULDBLOCK) { - error("futex failure\n", errno); - child_ret = RET_ERROR; + ksft_exit_fail_msg("futex failure\n"); + child_ret = false; } pthread_exit(NULL); } -int main(int argc, char **argv) +TEST(futex_wait_uninitialized_heap) { - int c, ret = RET_PASS; long page_size; pthread_t thr; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } + int ret; page_size = sysconf(_SC_PAGESIZE); buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); - if (buf == (void *)-1) { - error("mmap\n", errno); - exit(1); - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test the uninitialized futex value in FUTEX_WAIT\n", - basename(argv[0])); - + if (buf == (void *)-1) + ksft_exit_fail_msg("mmap\n"); ret = pthread_create(&thr, NULL, wait_thread, NULL); - if (ret) { - error("pthread_create\n", errno); - ret = RET_ERROR; - goto out; - } + if (ret) + ksft_exit_fail_msg("pthread_create\n"); - info("waiting %dus for child to return\n", WAIT_US); + ksft_print_dbg_msg("waiting %dus
for child to return\n", WAIT_US); usleep(WAIT_US); - ret = child_ret; - if (child_blocked) { - fail("child blocked in kernel\n"); - ret = RET_FAIL; - } + if (child_blocked) + ksft_test_result_fail("child blocked in kernel\n"); - out: - print_result(TEST_NAME, ret); - return ret; + if (!child_ret) + ksft_test_result_fail("child error\n"); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index a5246d59849853..6cb07a4e760d2a 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -47,7 +47,7 @@ echo ./futex_wait_wouldblock echo -./futex_wait_uninitialized_heap $COLOR +./futex_wait_uninitialized_heap ./futex_wait_private_mapped_file $COLOR echo From 14d016bd72822c0464e9aed4b6a620474923818d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:47 -0300 Subject: [PATCH 2393/3206] selftests/futex: Refactor futex_wait_private_mapped_file with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_wait_private_mapped_file test to use kselftest_harness header instead of futex's logging header. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../futex_wait_private_mapped_file.c | 83 +++++-------------- .../testing/selftests/futex/functional/run.sh | 2 +- 2 files changed, 21 insertions(+), 64 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c index fb4148f23fa372..8952ebda14ab8a 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c +++ b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c @@ -27,10 +27,9 @@ #include #include -#include "logging.h" #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-private-mapped-file" #define PAGE_SZ 4096 char pad[PAGE_SZ] = {1}; @@ -40,86 +39,44 @@ char pad2[PAGE_SZ] = {1}; #define WAKE_WAIT_US 3000000 struct timespec wait_timeout = { .tv_sec = 5, .tv_nsec = 0}; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *thr_futex_wait(void *arg) { int ret; - info("futex wait\n"); + ksft_print_dbg_msg("futex wait\n"); ret = futex_wait(&val, 1, &wait_timeout, 0); - if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) { - error("futex error.\n", errno); - print_result(TEST_NAME, RET_ERROR); - exit(RET_ERROR); - } + if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) + ksft_exit_fail_msg("futex error.\n"); if (ret && errno == ETIMEDOUT) - fail("waiter timedout\n"); + ksft_exit_fail_msg("waiter timedout\n"); - info("futex_wait: ret = %d, errno = %d\n", ret, errno); + ksft_print_dbg_msg("futex_wait: ret = %d, errno = %d\n", ret, errno); return NULL; } -int main(int argc, char **argv) +TEST(wait_private_mapped_file) { pthread_t thr; - int ret = RET_PASS; int res; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - 
ksft_print_msg( - "%s: Test the futex value of private file mappings in FUTEX_WAIT\n", - basename(argv[0])); - - ret = pthread_create(&thr, NULL, thr_futex_wait, NULL); - if (ret < 0) { - fprintf(stderr, "pthread_create error\n"); - ret = RET_ERROR; - goto out; - } - - info("wait a while\n"); + + res = pthread_create(&thr, NULL, thr_futex_wait, NULL); + if (res < 0) + ksft_exit_fail_msg("pthread_create error\n"); + + ksft_print_dbg_msg("wait a while\n"); usleep(WAKE_WAIT_US); val = 2; res = futex_wake(&val, 1, 0); - info("futex_wake %d\n", res); - if (res != 1) { - fail("FUTEX_WAKE didn't find the waiting thread.\n"); - ret = RET_FAIL; - } + ksft_print_dbg_msg("futex_wake %d\n", res); + if (res != 1) + ksft_exit_fail_msg("FUTEX_WAKE didn't find the waiting thread.\n"); - info("join\n"); + ksft_print_dbg_msg("join\n"); pthread_join(thr, NULL); - out: - print_result(TEST_NAME, ret); - return ret; + ksft_test_result_pass("wait_private_mapped_file"); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 6cb07a4e760d2a..87666f21fa3bf9 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -48,7 +48,7 @@ echo echo ./futex_wait_uninitialized_heap -./futex_wait_private_mapped_file $COLOR +./futex_wait_private_mapped_file echo ./futex_wait $COLOR From e5c04d0f3ea0dfa4858a449ff3153bc3cef6a140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:48 -0300 Subject: [PATCH 2394/3206] selftests/futex: Refactor futex_wait with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_wait test to use kselftest_harness header instead of futex's logging header. 
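The test splits into three cases below, one per memory type a futex word can live in: a process-private variable, a SysV shared-memory segment (shmget/shmat) and a file-backed MAP_SHARED mapping. The file-backed setup is the least obvious of the three; reduced to its essentials it looks roughly like this (helper name and error handling are illustrative):

  #include <fcntl.h>
  #include <sys/mman.h>
  #include <sys/stat.h>
  #include <unistd.h>

  /* Map a futex-sized word backed by a regular file. */
  static void *map_file_futex(const char *path)
  {
          void *shm;
          int fd;

          fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
          if (fd < 0)
                  return NULL;

          if (ftruncate(fd, sizeof(unsigned int))) {
                  close(fd);
                  return NULL;
          }

          shm = mmap(NULL, sizeof(unsigned int), PROT_READ | PROT_WRITE,
                     MAP_SHARED, fd, 0);
          close(fd);      /* the established mapping outlives the descriptor */
          return shm == MAP_FAILED ? NULL : shm;
  }

Shared futexes are keyed by the backing inode and offset rather than by virtual address, which is exactly what the two shared cases exercise.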
Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../selftests/futex/functional/futex_wait.c | 103 +++++++----------- .../testing/selftests/futex/functional/run.sh | 2 +- 2 files changed, 39 insertions(+), 66 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_wait.c b/tools/testing/selftests/futex/functional/futex_wait.c index 685140d9b93d2f..152ca46128866f 100644 --- a/tools/testing/selftests/futex/functional/futex_wait.c +++ b/tools/testing/selftests/futex/functional/futex_wait.c @@ -9,25 +9,16 @@ #include #include #include -#include "logging.h" + #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait" #define timeout_ns 30000000 #define WAKE_WAIT_US 10000 #define SHM_PATH "futex_shm_file" void *futex; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - static void *waiterfn(void *arg) { struct timespec to; @@ -45,53 +36,37 @@ static void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(private_futex) { - int res, ret = RET_PASS, fd, c, shm_id; - u_int32_t f_private = 0, *shared_data; unsigned int flags = FUTEX_PRIVATE_FLAG; + u_int32_t f_private = 0; pthread_t waiter; - void *shm; + int res; futex = &f_private; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(3); - ksft_print_msg("%s: Test futex_wait\n", basename(argv[0])); - /* Testing a private futex */ - info("Calling private futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling private futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, (void *) &flags)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling private futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling private futex_wake on futex: %p\n", futex); res = futex_wake(futex, 1, FUTEX_PRIVATE_FLAG); if (res != 1) { ksft_test_result_fail("futex_wake private returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake private succeeds\n"); } +} + +TEST(anon_page) +{ + u_int32_t *shared_data; + pthread_t waiter; + int res, shm_id; /* Testing an anon page shared memory */ shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); @@ -105,67 +80,65 @@ int main(int argc, char *argv[]) *shared_data = 0; futex = shared_data; - info("Calling shared (page anon) futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (page anon) futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling shared (page anon) futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (page anon) futex_wake on futex: %p\n", futex); res = futex_wake(futex, 1, 0); if (res != 1) { ksft_test_result_fail("futex_wake shared (page anon) returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake shared (page anon) succeeds\n"); } + shmdt(shared_data); +} + +TEST(file_backed) +{ + u_int32_t 
f_private = 0; + pthread_t waiter; + int res, fd; + void *shm; /* Testing a file backed shared memory */ fd = open(SHM_PATH, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); - if (fd < 0) { - perror("open"); - exit(1); - } + if (fd < 0) + ksft_exit_fail_msg("open"); - if (ftruncate(fd, sizeof(f_private))) { - perror("ftruncate"); - exit(1); - } + if (ftruncate(fd, sizeof(f_private))) + ksft_exit_fail_msg("ftruncate"); shm = mmap(NULL, sizeof(f_private), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (shm == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (shm == MAP_FAILED) + ksft_exit_fail_msg("mmap"); memcpy(shm, &f_private, sizeof(f_private)); futex = shm; - info("Calling shared (file backed) futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (file backed) futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling shared (file backed) futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (file backed) futex_wake on futex: %p\n", futex); res = futex_wake(shm, 1, 0); if (res != 1) { ksft_test_result_fail("futex_wake shared (file backed) returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake shared (file backed) succeeds\n"); } - /* Freeing resources */ - shmdt(shared_data); munmap(shm, sizeof(f_private)); remove(SHM_PATH); close(fd); - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 87666f21fa3bf9..1ce0f20cadfcc3 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -51,7 +51,7 @@ echo ./futex_wait_private_mapped_file echo -./futex_wait $COLOR +./futex_wait echo ./futex_requeue $COLOR From f341a20f6d7e25669b2fdf2b071bdd92962e5897 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:49 -0300 Subject: [PATCH 2395/3206] selftests/futex: Refactor futex_requeue with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_requeue test to use kselftest_harness header instead of futex's logging header. 
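For the second case below it helps to recall the FUTEX_CMP_REQUEUE accounting: the syscall returns the number of woken plus requeued waiters. With 10 waiters queued on f1, waking 3 and requeuing up to 7 must return 10, and a later FUTEX_WAKE on f2 should find exactly the 7 requeued tasks. In sketch form, assuming the futextest.h wrappers (the helper itself is illustrative):

  #include <limits.h>
  #include "futextest.h"

  /* Wake 3 of 10 waiters queued on f1, requeue the other 7 onto f2. */
  static int wake3_requeue7(futex_t *f1, futex_t *f2)
  {
          int res;

          res = futex_cmp_requeue(f1, 0, f2, 3, 7, 0);
          if (res != 10)  /* 3 woken + 7 requeued */
                  return -1;

          res = futex_wake(f2, INT_MAX, 0);
          if (res != 7)   /* only the requeued waiters sit on f2 */
                  return -1;

          return 0;
  }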
Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../futex/functional/futex_requeue.c | 76 ++++++------------- .../testing/selftests/futex/functional/run.sh | 2 +- 2 files changed, 24 insertions(+), 54 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_requeue.c b/tools/testing/selftests/futex/functional/futex_requeue.c index 51485be6eb2f1b..69e2555b603991 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue.c +++ b/tools/testing/selftests/futex/functional/futex_requeue.c @@ -7,24 +7,15 @@ #include #include -#include "logging.h" + #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue" #define timeout_ns 30000000 #define WAKE_WAIT_US 10000 volatile futex_t *f1; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *waiterfn(void *arg) { struct timespec to; @@ -38,67 +29,49 @@ void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(requeue_single) { - pthread_t waiter[10]; - int res, ret = RET_PASS; - int c, i; volatile futex_t _f1 = 0; volatile futex_t f2 = 0; + pthread_t waiter[10]; + int res; f1 = &_f1; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(2); - ksft_print_msg("%s: Test futex_requeue\n", - basename(argv[0])); - /* * Requeue a waiter from f1 to f2, and wake f2. */ if (pthread_create(&waiter[0], NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Requeuing 1 futex from f1 to f2\n"); + ksft_print_dbg_msg("Requeuing 1 futex from f1 to f2\n"); res = futex_cmp_requeue(f1, 0, &f2, 0, 1, 0); - if (res != 1) { + if (res != 1) ksft_test_result_fail("futex_requeue simple returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; - } - - info("Waking 1 futex at f2\n"); + ksft_print_dbg_msg("Waking 1 futex at f2\n"); res = futex_wake(&f2, 1, 0); if (res != 1) { ksft_test_result_fail("futex_requeue simple returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_requeue simple succeeds\n"); } +} + +TEST(requeue_multiple) +{ + volatile futex_t _f1 = 0; + volatile futex_t f2 = 0; + pthread_t waiter[10]; + int res, i; + f1 = &_f1; /* * Create 10 waiters at f1. At futex_requeue, wake 3 and requeue 7. @@ -106,31 +79,28 @@ int main(int argc, char *argv[]) */ for (i = 0; i < 10; i++) { if (pthread_create(&waiter[i], NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); } usleep(WAKE_WAIT_US); - info("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n"); + ksft_print_dbg_msg("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n"); res = futex_cmp_requeue(f1, 0, &f2, 3, 7, 0); if (res != 10) { ksft_test_result_fail("futex_requeue many returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } - info("Waking INT_MAX futexes at f2\n"); + ksft_print_dbg_msg("Waking INT_MAX futexes at f2\n"); res = futex_wake(&f2, INT_MAX, 0); if (res != 7) { ksft_test_result_fail("futex_requeue many returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_requeue many succeeds\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 1ce0f20cadfcc3..b711351a10ec8a 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -54,7 +54,7 @@ echo ./futex_wait echo -./futex_requeue $COLOR +./futex_requeue echo ./futex_waitv $COLOR From a91e8e372e4f1ecce2a87905d4fa437865158b1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:50 -0300 Subject: [PATCH 2396/3206] selftests/futex: Refactor futex_waitv with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_waitv test to use kselftest_harness header instead of futex's logging header. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../selftests/futex/functional/futex_waitv.c | 99 +++++++++---------- .../testing/selftests/futex/functional/run.sh | 2 +- 2 files changed, 45 insertions(+), 56 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c index a94337f677e181..c684b10eb76e29 100644 --- a/tools/testing/selftests/futex/functional/futex_waitv.c +++ b/tools/testing/selftests/futex/functional/futex_waitv.c @@ -15,25 +15,16 @@ #include #include #include + #include "futextest.h" #include "futex2test.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait" #define WAKE_WAIT_US 10000 #define NR_FUTEXES 30 static struct futex_waitv waitv[NR_FUTEXES]; u_int32_t futexes[NR_FUTEXES] = {0}; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *waiterfn(void *arg) { struct timespec to; @@ -41,7 +32,7 @@ void *waiterfn(void *arg) /* setting absolute timeout for futex2 */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -57,34 +48,10 @@ void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(private_waitv) { pthread_t waiter; - int res, ret = RET_PASS; - struct timespec to; - int c, i; - - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(7); - ksft_print_msg("%s: Test FUTEX_WAITV\n", - basename(argv[0])); + int res, i; for (i = 0; i < NR_FUTEXES; i++) { waitv[i].uaddr = (uintptr_t)&futexes[i]; @@ -95,7 +62,7 @@ int main(int argc, char *argv[]) /* Private waitv */ if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); @@ -104,10 +71,15 @@ int 
main(int argc, char *argv[]) ksft_test_result_fail("futex_wake private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv private\n"); } +} + +TEST(shared_waitv) +{ + pthread_t waiter; + int res, i; /* Shared waitv */ for (i = 0; i < NR_FUTEXES; i++) { @@ -128,7 +100,7 @@ int main(int argc, char *argv[]) } if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); @@ -137,19 +109,24 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake shared returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv shared\n"); } for (i = 0; i < NR_FUTEXES; i++) shmdt(u64_to_ptr(waitv[i].uaddr)); +} + +TEST(invalid_flag) +{ + struct timespec to; + int res; /* Testing a waiter without FUTEX_32 flag */ waitv[0].flags = FUTEX_PRIVATE_FLAG; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -158,17 +135,22 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv without FUTEX_32\n"); } +} + +TEST(unaligned_address) +{ + struct timespec to; + int res; /* Testing a waiter with an unaligned address */ waitv[0].flags = FUTEX_PRIVATE_FLAG | FUTEX_32; waitv[0].uaddr = 1; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -177,16 +159,21 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv with an unaligned address\n"); } +} + +TEST(null_address) +{ + struct timespec to; + int res; /* Testing a NULL address for waiters.uaddr */ waitv[0].uaddr = 0x00000000; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -195,14 +182,13 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv NULL address in waitv.uaddr\n"); } /* Testing a NULL address for *waiters */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -211,14 +197,19 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv NULL address in *waiters\n"); } +} + +TEST(invalid_clockid) +{ + struct timespec to; + int res; /* Testing an invalid clockid */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -227,11 +218,9 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv invalid clockid\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index b711351a10ec8a..4a7b0efbcce296 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -57,7 +57,7 @@ echo ./futex_requeue echo -./futex_waitv $COLOR +./futex_waitv echo ./futex_priv_hash $COLOR From 4ba629e6c6dc6eef1a6b96dc06c140e622a8f836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:51 -0300 Subject: [PATCH 2397/3206] selftests/futex: Refactor futex_priv_hash with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_priv_hash test to use kselftest_harness header instead of futex's logging header. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../futex/functional/futex_priv_hash.c | 48 +++---------------- .../testing/selftests/futex/functional/run.sh | 2 +- 2 files changed, 8 insertions(+), 42 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c index 95f01603a6813c..3b7b5851f290fc 100644 --- a/tools/testing/selftests/futex/functional/futex_priv_hash.c +++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c @@ -14,7 +14,7 @@ #include #include -#include "logging.h" +#include "../../kselftest_harness.h" #define MAX_THREADS 64 @@ -128,45 +128,14 @@ static void futex_dummy_op(void) ksft_exit_fail_msg("pthread_mutex_timedlock() did not timeout: %d.\n", ret); } -static void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - static const char *test_msg_auto_create = "Automatic hash bucket init on thread creation.\n"; static const char *test_msg_auto_inc = "Automatic increase with more than 16 CPUs\n"; -int main(int argc, char *argv[]) +TEST(priv_hash) { int futex_slots1, futex_slotsn, online_cpus; pthread_mutexattr_t mutex_attr_pi; int ret, retry = 20; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(21); ret = pthread_mutexattr_init(&mutex_attr_pi); ret |= pthread_mutexattr_setprotocol(&mutex_attr_pi, PTHREAD_PRIO_INHERIT); @@ -279,7 +248,7 @@ int main(int argc, char *argv[]) ret = futex_hash_slots_set(0); ksft_test_result(ret == 0, "Global hash request\n"); if (ret != 0) - goto out; + return; futex_hash_slots_set_must_fail(4); futex_hash_slots_set_must_fail(8); @@ -288,17 +257,14 @@ int main(int argc, char *argv[]) futex_hash_slots_set_must_fail(6); ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS); - if (ret != 0) { + if (ret != 0) ksft_exit_fail_msg("pthread_barrier_init failed: %m\n"); - return 1; - } + create_max_threads(thread_lock_fn); join_max_threads(); ret = futex_hash_slots_get(); ksft_test_result(ret == 0, "Continue to use global hash\n"); - -out: - ksft_finished(); - return 0; } + +TEST_HARNESS_MAIN diff --git 
a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 4a7b0efbcce296..f725531f06c4a8 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -60,7 +60,7 @@ echo ./futex_waitv echo -./futex_priv_hash $COLOR +./futex_priv_hash echo ./futex_numa_mpol $COLOR From d35ca2f642726a2c8a85c5b864fd05f59f3965af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:52 -0300 Subject: [PATCH 2398/3206] selftests/futex: Refactor futex_numa_mpol with kselftest_harness.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To reduce the boilerplate code, refactor futex_numa_mpol test to use kselftest_harness header instead of futex's logging header. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../futex/functional/futex_numa_mpol.c | 41 ++++--------------- .../testing/selftests/futex/functional/run.sh | 15 +------ 2 files changed, 9 insertions(+), 47 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index 722427fe90bf0f..afe5d95b644150 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -16,9 +16,9 @@ #include #include -#include "logging.h" #include "futextest.h" #include "futex2test.h" +#include "../../kselftest_harness.h" #define MAX_THREADS 64 @@ -131,42 +131,16 @@ static void test_futex(void *futex_ptr, int err_value) __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); } -static void usage(char *prog) +static void test_futex_mpol(void *futex_ptr, int err_value) { - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); + __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); } -int main(int argc, char *argv[]) +TEST(futex_numa_mpol) { struct futex32_numa *futex_numa; - int mem_size; void *futex_ptr; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(2); + int mem_size; mem_size = sysconf(_SC_PAGE_SIZE); futex_ptr = mmap(NULL, mem_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); @@ -239,6 +213,7 @@ int main(int argc, char *argv[]) #else ksft_test_result_skip("futex2 MPOL hints test requires libnuma 2.0.16+\n"); #endif - ksft_finished(); - return 0; + munmap(futex_ptr, mem_size * 2); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index f725531f06c4a8..e88545c06d57a7 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -18,19 +18,6 @@ # ############################################################################### -# Test for a color capable console -if [ -z "$USE_COLOR" ]; then - tput setf 7 || tput setaf 7 - if [ $? 
-eq 0 ]; then - USE_COLOR=1 - tput sgr0 - fi -fi -if [ "$USE_COLOR" -eq 1 ]; then - COLOR="-c" -fi - - echo ./futex_requeue_pi @@ -63,4 +50,4 @@ echo ./futex_priv_hash echo -./futex_numa_mpol $COLOR +./futex_numa_mpol From b257d91c4db5bafcc3f67eab120e75bda4f61137 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:53 -0300 Subject: [PATCH 2399/3206] selftests/futex: Drop logging.h include from futex_numa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit futex_numa doesn't really use the logging.h helpers; it only needs two includes from this file. So drop it and include the two missing includes. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- tools/testing/selftests/futex/functional/futex_numa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/functional/futex_numa.c b/tools/testing/selftests/futex/functional/futex_numa.c index f29e4d627e7942..e0a33510ccb60c 100644 --- a/tools/testing/selftests/futex/functional/futex_numa.c +++ b/tools/testing/selftests/futex/functional/futex_numa.c @@ -5,9 +5,10 @@ #include #include #include +#include +#include #include #include -#include "logging.h" #include "futextest.h" #include "futex2test.h" From 520db0559deff096c33a95dd3be2583e02771261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 17 Sep 2025 18:21:54 -0300 Subject: [PATCH 2400/3206] selftests/futex: Remove logging.h file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every futex selftest now uses the kselftest_harness.h helper and no longer needs the logging.h file. Delete it. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner --- .../selftests/futex/functional/Makefile | 3 +- .../testing/selftests/futex/include/logging.h | 148 ------------------ 2 files changed, 1 insertion(+), 150 deletions(-) delete mode 100644 tools/testing/selftests/futex/include/logging.h diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index bd50aecfca8a31..490ace1f017e86 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -8,8 +8,7 @@ LDLIBS := -lpthread -lrt -lnuma LOCAL_HDRS := \ ../include/futextest.h \ - ../include/atomic.h \ - ../include/logging.h + ../include/atomic.h TEST_GEN_PROGS := \ futex_wait_timeout \ futex_wait_wouldblock \ diff --git a/tools/testing/selftests/futex/include/logging.h b/tools/testing/selftests/futex/include/logging.h deleted file mode 100644 index 874c69ce5cce9e..00000000000000 --- a/tools/testing/selftests/futex/include/logging.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/****************************************************************************** - * - * Copyright © International Business Machines Corp., 2009 - * - * DESCRIPTION - * Glibc independent futex library for testing kernel functionality. - * - * AUTHOR - * Darren Hart - * - * HISTORY - * 2009-Nov-6: Initial version by Darren Hart - * - *****************************************************************************/ - -#ifndef _LOGGING_H -#define _LOGGING_H - -#include -#include -#include -#include -#include "kselftest.h" - -/* - * Define PASS, ERROR, and FAIL strings with and without color escape - * sequences, default to no color.
- */ -#define ESC 0x1B, '[' -#define BRIGHT '1' -#define GREEN '3', '2' -#define YELLOW '3', '3' -#define RED '3', '1' -#define ESCEND 'm' -#define BRIGHT_GREEN ESC, BRIGHT, ';', GREEN, ESCEND -#define BRIGHT_YELLOW ESC, BRIGHT, ';', YELLOW, ESCEND -#define BRIGHT_RED ESC, BRIGHT, ';', RED, ESCEND -#define RESET_COLOR ESC, '0', 'm' -static const char PASS_COLOR[] = {BRIGHT_GREEN, ' ', 'P', 'A', 'S', 'S', - RESET_COLOR, 0}; -static const char ERROR_COLOR[] = {BRIGHT_YELLOW, 'E', 'R', 'R', 'O', 'R', - RESET_COLOR, 0}; -static const char FAIL_COLOR[] = {BRIGHT_RED, ' ', 'F', 'A', 'I', 'L', - RESET_COLOR, 0}; -static const char INFO_NORMAL[] = " INFO"; -static const char PASS_NORMAL[] = " PASS"; -static const char ERROR_NORMAL[] = "ERROR"; -static const char FAIL_NORMAL[] = " FAIL"; -const char *INFO = INFO_NORMAL; -const char *PASS = PASS_NORMAL; -const char *ERROR = ERROR_NORMAL; -const char *FAIL = FAIL_NORMAL; - -/* Verbosity setting for INFO messages */ -#define VQUIET 0 -#define VCRITICAL 1 -#define VINFO 2 -#define VMAX VINFO -int _verbose = VCRITICAL; - -/* Functional test return codes */ -#define RET_PASS 0 -#define RET_ERROR -1 -#define RET_FAIL -2 - -/** - * log_color() - Use colored output for PASS, ERROR, and FAIL strings - * @use_color: use color (1) or not (0) - */ -void log_color(int use_color) -{ - if (use_color) { - PASS = PASS_COLOR; - ERROR = ERROR_COLOR; - FAIL = FAIL_COLOR; - } else { - PASS = PASS_NORMAL; - ERROR = ERROR_NORMAL; - FAIL = FAIL_NORMAL; - } -} - -/** - * log_verbosity() - Set verbosity of test output - * @verbose: Enable (1) verbose output or not (0) - * - * Currently setting verbose=1 will enable INFO messages and 0 will disable - * them. FAIL and ERROR messages are always displayed. - */ -void log_verbosity(int level) -{ - if (level > VMAX) - level = VMAX; - else if (level < 0) - level = 0; - _verbose = level; -} - -/** - * print_result() - Print standard PASS | ERROR | FAIL results - * @ret: the return value to be considered: 0 | RET_ERROR | RET_FAIL - * - * print_result() is primarily intended for functional tests. - */ -void print_result(const char *test_name, int ret) -{ - switch (ret) { - case RET_PASS: - ksft_test_result_pass("%s\n", test_name); - ksft_print_cnts(); - return; - case RET_ERROR: - ksft_test_result_error("%s\n", test_name); - ksft_print_cnts(); - return; - case RET_FAIL: - ksft_test_result_fail("%s\n", test_name); - ksft_print_cnts(); - return; - } -} - -/* log level macros */ -#define info(message, vargs...) \ -do { \ - if (_verbose >= VINFO) \ - fprintf(stderr, "\t%s: "message, INFO, ##vargs); \ -} while (0) - -#define error(message, err, args...) \ -do { \ - if (_verbose >= VCRITICAL) {\ - if (err) \ - fprintf(stderr, "\t%s: %s: "message, \ - ERROR, strerror(err), ##args); \ - else \ - fprintf(stderr, "\t%s: "message, ERROR, ##args); \ - } \ -} while (0) - -#define fail(message, args...) \ -do { \ - if (_verbose >= VCRITICAL) \ - fprintf(stderr, "\t%s: "message, FAIL, ##args); \ -} while (0) - -#endif From 4386f71623b77215c9502e60fc399e76ec337fec Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 19 Sep 2025 17:57:11 +0100 Subject: [PATCH 2401/3206] selftest/futex: Fix spelling mistake "boundarie" -> "boundary" There is a spelling mistake in a test message. Fix it. 
Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner --- tools/testing/selftests/futex/functional/futex_numa_mpol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index afe5d95b644150..d037a3f10ee85d 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -180,7 +180,7 @@ TEST(futex_numa_mpol) ksft_print_msg("Memory back to RW\n"); test_futex(futex_ptr, 0); - ksft_test_result_pass("futex2 memory boundarie tests passed\n"); + ksft_test_result_pass("futex2 memory boundary tests passed\n"); /* MPOL test. Does not work as expected */ #ifdef LIBNUMA_VER_SUFFICIENT From 5a427fddec5e76360725a0f03df3a2a003efbe2e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 19 Sep 2025 21:58:05 -0700 Subject: [PATCH 2402/3206] selftests/bpf: Fix selftest verifier_arena_large failure With latest llvm22, I got the following verification failure: ... ; int big_alloc2(void *ctx) @ verifier_arena_large.c:207 0: (b4) w6 = 1 ; R6_w=1 ... ; if (err) @ verifier_arena_large.c:233 53: (56) if w6 != 0x0 goto pc+62 ; R6=0 54: (b7) r7 = -4 ; R7_w=-4 55: (18) r8 = 0x7f4000000000 ; R8_w=scalar() 57: (bf) r9 = addr_space_cast(r8, 0, 1) ; R8_w=scalar() R9_w=arena 58: (b4) w6 = 5 ; R6_w=5 ; pg = page[i]; @ verifier_arena_large.c:238 59: (bf) r1 = r7 ; R1_w=-4 R7_w=-4 60: (07) r1 += 4 ; R1_w=0 61: (79) r2 = *(u64 *)(r9 +0) ; R2_w=scalar() R9_w=arena ; if (*pg != i) @ verifier_arena_large.c:239 62: (bf) r3 = addr_space_cast(r2, 0, 1) ; R2_w=scalar() R3_w=arena 63: (71) r3 = *(u8 *)(r3 +0) ; R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 64: (5d) if r1 != r3 goto pc+51 ; R1_w=0 R3_w=0 ; bpf_arena_free_pages(&arena, (void __arena *)pg, 2); @ verifier_arena_large.c:241 65: (18) r1 = 0xff11000114548000 ; R1_w=map_ptr(map=arena,ks=0,vs=0) 67: (b4) w3 = 2 ; R3_w=2 68: (85) call bpf_arena_free_pages#72675 ; 69: (b7) r1 = 0 ; R1_w=0 ; page[i + 1] = NULL; @ verifier_arena_large.c:243 70: (7b) *(u64 *)(r8 +8) = r1 R8 invalid mem access 'scalar' processed 61 insns (limit 1000000) max_states_per_insn 0 total_states 6 peak_states 6 mark_read 2 ============= #489/5 verifier_arena_large/big_alloc2:FAIL The main reason is that 'r8' in insn '70' is not an arena pointer. Further debugging at llvm side shows that llvm commit ([1]) caused the failure. For the original code: page[i] = NULL; page[i + 1] = NULL; the llvm transformed it to something like below at source level: __builtin_memset(&page[i], 0, 16) Such transformation prevents llvm BPFCheckAndAdjustIR pass from generating proper addr_space_cast insns ([2]). Adding support in llvm BPFCheckAndAdjustIR pass should work, but not sure that such a pattern exists or not in real applications. At the same time, simply adding a memory barrier between two 'page' assignment can fix the issue. 
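To see the compiler-merging issue outside of BPF, here is a minimal user-space C sketch (hypothetical code, not part of the selftest; barrier() is open-coded the way the kernel defines it for GCC/Clang):

#include <stddef.h>

#define barrier() asm volatile("" ::: "memory")

void clear_pair(void **page, unsigned long i)
{
	/* Without the barrier, the compiler may merge the two adjacent
	 * NULL stores into a single memset()-style write, which is what
	 * defeated the arena pointer tracking described above. */
	page[i] = NULL;
	barrier();
	page[i + 1] = NULL;
}
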
[1] https://github.com/llvm/llvm-project/pull/155415 [2] https://github.com/llvm/llvm-project/pull/84410 Cc: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20250920045805.3288551-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena_large.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 9dbdf123542d3b..f19e15400b3e10 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -240,6 +240,7 @@ int big_alloc2(void *ctx) return 5; bpf_arena_free_pages(&arena, (void __arena *)pg, 2); page[i] = NULL; + barrier(); page[i + 1] = NULL; cond_break; } From 391253b25f078d2fe5657a1dedd360396d186407 Mon Sep 17 00:00:00 2001 From: Haofeng Li Date: Wed, 10 Sep 2025 17:37:03 +0800 Subject: [PATCH 2403/3206] time: Fix spelling mistakes in comments Correct several typos found in comments across various files in the kernel/time directory. No functional changes are introduced by these corrections. Signed-off-by: Haofeng Li Signed-off-by: Thomas Gleixner --- kernel/time/alarmtimer.c | 2 +- kernel/time/clocksource.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/posix-timers.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 577f0e6842d4ce..069d93bfb0c75c 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -35,7 +35,7 @@ /** * struct alarm_base - Alarm timer bases - * @lock: Lock for syncrhonized access to the base + * @lock: Lock for synchronized access to the base * @timerqueue: Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base * @get_timespec: Function to read the namespace time correlating to the base diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 3edb01db3aa146..a1890a073196b1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -144,7 +144,7 @@ static u64 suspend_start; * Default for maximum permissible skew when cs->uncertainty_margin is * not specified, and the lower bound even when cs->uncertainty_margin * is specified. This is also the default that is used when registering - * clocks with unspecifed cs->uncertainty_margin, so this macro is used + * clocks with unspecified cs->uncertainty_margin, so this macro is used * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. */ #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f383df28c53259..7e7b2b471baec2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -201,7 +201,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_ /* * The offline local CPU can't be the default target if the * next remote target event is after this timer. Keep the - * elected new base. An IPI will we issued to reprogram + * elected new base. An IPI will be issued to reprogram * it as a last resort. 
*/ if (!hrtimer_base_is_online(this_cpu_base)) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 2741f3725de48e..aa3120104a5128 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -534,7 +534,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, goto out; } /* - * After succesful copy out, the timer ID is visible to user space + * After successful copy out, the timer ID is visible to user space * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create From afe16653e05db07d658b55245c7a2e0603f136c0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Sep 2025 20:11:44 +0200 Subject: [PATCH 2404/3206] vhost: Take a reference on the task in struct vhost_task. vhost_task_create() creates a task and keeps a reference to its task_struct. That task may exit early via a signal and its task_struct will be released. A pending vhost_task_wake() will then attempt to wake the task and access a task_struct which is no longer there. Acquire a reference on the task_struct while creating the thread and release the reference while the struct vhost_task itself is removed. If the task exits early due to a signal, then the vhost_task_wake() will still access a valid task_struct. The wake is safe and will be skipped in this case. Fixes: f9010dbdce911 ("fork, vhost: Use CLONE_THREAD to fix freezer/ps regression") Reported-by: Sean Christopherson Closes: https://lore.kernel.org/all/aKkLEtoDXKxAAWju@google.com/ Signed-off-by: Sebastian Andrzej Siewior Message-Id: <20250918181144.Ygo8BZ-R@linutronix.de> Signed-off-by: Michael S. Tsirkin Tested-by: Sean Christopherson --- kernel/vhost_task.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index bc738fa90c1d68..27107dcc1cbfeb 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -100,6 +100,7 @@ void vhost_task_stop(struct vhost_task *vtsk) * freeing it below. */ wait_for_completion(&vtsk->exited); + put_task_struct(vtsk->task); kfree(vtsk); } EXPORT_SYMBOL_GPL(vhost_task_stop); @@ -148,7 +149,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), return ERR_CAST(tsk); } - vtsk->task = tsk; + vtsk->task = get_task_struct(tsk); return vtsk; } EXPORT_SYMBOL_GPL(vhost_task_create); From d6d673dd1e92b2bed0096e7e7e9fe5d7e7d2156c Mon Sep 17 00:00:00 2001 From: Ashwini Sahu Date: Mon, 8 Sep 2025 15:26:45 +0530 Subject: [PATCH 2405/3206] uapi: vduse: fix typo in comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a spelling mistake in vduse.h: "regsion" → "region" in the documentation for struct vduse_iova_info. No functional change. Signed-off-by: Ashwini Sahu Message-Id: <20250908095645.610336-1-ashwini@wisig.com> Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/vduse.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 68a627d04afa12..10ad71aa00d67b 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -237,7 +237,7 @@ struct vduse_iova_umem { * struct vduse_iova_info - information of one IOVA region * @start: start of the IOVA region * @last: last of the IOVA region - * @capability: capability of the IOVA regsion + * @capability: capability of the IOVA region * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of From a05e4e935a6689542d86162b33a484cc704ce39a Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Fri, 29 Aug 2025 17:09:44 +0200 Subject: [PATCH 2406/3206] virtio_config: clarify output parameters This was ambiguous enough for a broken patch (206cc44588f7 ("virtio: reject shm region if length is zero")) to make it into the kernel, so make it clearer. Link: https://lore.kernel.org/r/20250816071600-mutt-send-email-mst@kernel.org/ Signed-off-by: Alyssa Ross Message-Id: <20250829150944.233505-1-hi@alyssa.is> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8bf156dde554a5..7427b79d6f3d54 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -193,14 +193,15 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, } static inline void virtio_get_features(struct virtio_device *vdev, - u64 *features) + u64 *features_out) { if (vdev->config->get_extended_features) { - vdev->config->get_extended_features(vdev, features); + vdev->config->get_extended_features(vdev, features_out); return; } - virtio_features_from_u64(features, vdev->config->get_features(vdev)); + virtio_features_from_u64(features_out, + vdev->config->get_features(vdev)); } /** @@ -326,11 +327,11 @@ int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) static inline bool virtio_get_shm_region(struct virtio_device *vdev, - struct virtio_shm_region *region, u8 id) + struct virtio_shm_region *region_out, u8 id) { if (!vdev->config->get_shm_region) return false; - return vdev->config->get_shm_region(vdev, region, id); + return vdev->config->get_shm_region(vdev, region_out, id); } static inline bool virtio_is_little_endian(struct virtio_device *vdev) From cde7e7c3f8745a61458cea61aa28f37c3f5ae2b4 Mon Sep 17 00:00:00 2001 From: Peter Hilber Date: Tue, 26 Aug 2025 15:00:15 +0200 Subject: [PATCH 2407/3206] MAINTAINERS, mailmap: Update address for Peter Hilber Going forward, I will use another Qualcomm address, peter.hilber@oss.qualcomm.com. Map past contributions on behalf of Qualcomm to the new address as well. Signed-off-by: Peter Hilber Message-Id: <20250826130015.6218-1-peter.hilber@oss.qualcomm.com> Signed-off-by: Michael S. 
Tsirkin --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index aa09e792017f77..45069c831f1458 100644 --- a/.mailmap +++ b/.mailmap @@ -623,6 +623,7 @@ Paulo Alcantara Paulo Alcantara Pavankumar Kondeti Peter A Jonsson +Peter Hilber Peter Oruba Peter Oruba Pierre-Louis Bossart diff --git a/MAINTAINERS b/MAINTAINERS index cd7ff55b5d3217..cee85eae911321 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26782,7 +26782,7 @@ F: drivers/nvdimm/nd_virtio.c F: drivers/nvdimm/virtio_pmem.c VIRTIO RTC DRIVER -M: Peter Hilber +M: Peter Hilber L: virtualization@lists.linux.dev S: Maintained F: drivers/virtio/virtio_rtc_* From 07e27ad16399afcd693be20211b0dfae63e0615f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 21 Sep 2025 15:08:52 -0700 Subject: [PATCH 2408/3206] Linux 6.17-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9771619ac596c8..10355ecf32cb47 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Baby Opossum Posse # *DOCUMENTATION* From 1cde0a74a7a8951b3097417847a458e557be0b5b Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 17:55:17 +0200 Subject: [PATCH 2409/3206] smb: server: don't use delayed_work for post_recv_credits_work If we are using a hardcoded delay of 0 there's no point in using delayed_work it only adds confusion. The client also uses a normal work_struct and now it is easier to move it to the common smbdirect_socket. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 6550bd9f002c27..10a6b4ed1a037b 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -148,7 +148,7 @@ struct smb_direct_transport { wait_queue_head_t wait_send_pending; atomic_t send_pending; - struct delayed_work post_recv_credits_work; + struct work_struct post_recv_credits_work; struct work_struct send_immediate_work; struct work_struct disconnect_work; @@ -367,8 +367,8 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) spin_lock_init(&t->lock_new_recv_credits); - INIT_DELAYED_WORK(&t->post_recv_credits_work, - smb_direct_post_recv_credits); + INIT_WORK(&t->post_recv_credits_work, + smb_direct_post_recv_credits); INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work); INIT_WORK(&t->disconnect_work, smb_direct_disconnect_rdma_work); @@ -400,7 +400,7 @@ static void free_transport(struct smb_direct_transport *t) atomic_read(&t->send_pending) == 0); cancel_work_sync(&t->disconnect_work); - cancel_delayed_work_sync(&t->post_recv_credits_work); + cancel_work_sync(&t->post_recv_credits_work); cancel_work_sync(&t->send_immediate_work); if (t->qp) { @@ -615,8 +615,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) wake_up_interruptible(&t->wait_send_credits); if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count)) - mod_delayed_work(smb_direct_wq, - &t->post_recv_credits_work, 0); + queue_work(smb_direct_wq, &t->post_recv_credits_work); if 
(data_length) { enqueue_reassembly(t, recvmsg, (int)data_length); @@ -773,8 +772,7 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, st->count_avail_recvmsg += queue_removed; if (is_receive_credit_post_required(st->recv_credits, st->count_avail_recvmsg)) { spin_unlock(&st->receive_credit_lock); - mod_delayed_work(smb_direct_wq, - &st->post_recv_credits_work, 0); + queue_work(smb_direct_wq, &st->post_recv_credits_work); } else { spin_unlock(&st->receive_credit_lock); } @@ -801,7 +799,7 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, static void smb_direct_post_recv_credits(struct work_struct *work) { struct smb_direct_transport *t = container_of(work, - struct smb_direct_transport, post_recv_credits_work.work); + struct smb_direct_transport, post_recv_credits_work); struct smb_direct_recvmsg *recvmsg; int receive_credits, credits = 0; int ret; @@ -1734,7 +1732,7 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) goto out_err; } - smb_direct_post_recv_credits(&t->post_recv_credits_work.work); + smb_direct_post_recv_credits(&t->post_recv_credits_work); return 0; out_err: put_recvmsg(t, recvmsg); From f7f89250175e0a82e99ed66da7012e869c36497d Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 13 Aug 2025 08:48:42 +0200 Subject: [PATCH 2410/3206] smb: server: use disable_work_sync in transport_rdma.c This makes it safer during the disconnect and avoids requeueing. It's ok to call disable_work[_sync]() more than once. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 10a6b4ed1a037b..74dfb6496095db 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -399,9 +399,9 @@ static void free_transport(struct smb_direct_transport *t) wait_event(t->wait_send_pending, atomic_read(&t->send_pending) == 0); - cancel_work_sync(&t->disconnect_work); - cancel_work_sync(&t->post_recv_credits_work); - cancel_work_sync(&t->send_immediate_work); + disable_work_sync(&t->disconnect_work); + disable_work_sync(&t->post_recv_credits_work); + disable_work_sync(&t->send_immediate_work); if (t->qp) { ib_drain_qp(t->qp); From 1cf9f2a6a544288516a7b9e883a48eba6246bcf2 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 19 Sep 2025 14:13:15 -0300 Subject: [PATCH 2411/3206] smb: client: handle unlink(2) of files open by different clients In order to identify whether a certain file is open by a different client, start the unlink process by sending a compound request of CREATE(DELETE_ON_CLOSE) + CLOSE with only FILE_SHARE_DELETE bit set in smb2_create_req::ShareAccess. If the file is currently open, then the server will fail the request with STATUS_SHARING_VIOLATION, in which case we'll map it to -EBUSY, so __cifs_unlink() will fall back to silly-rename the file. This fixes the following case where open(O_CREAT) fails with -ENOENT (STATUS_DELETE_PENDING) due to file still open by a different client. 
* Before patch $ mount.cifs //srv/share /mnt/1 -o ...,nosharesock $ mount.cifs //srv/share /mnt/2 -o ...,nosharesock $ cd /mnt/1 $ touch foo $ exec 3<>foo $ cd /mnt/2 $ rm foo $ touch foo touch: cannot touch 'foo': No such file or directory $ exec 3>&- * After patch $ mount.cifs //srv/share /mnt/1 -o ...,nosharesock $ mount.cifs //srv/share /mnt/2 -o ...,nosharesock $ cd /mnt/1 $ touch foo $ exec 3<>foo $ cd /mnt/2 $ rm foo $ touch foo $ exec 3>&- Signed-off-by: Paulo Alcantara (Red Hat) Reviewed-by: David Howells Cc: Frank Sorenson Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/smb2inode.c | 99 ++++++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 11 deletions(-) diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 7cadc8ca4f55b7..e32a3f33879339 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -1175,23 +1175,92 @@ int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb, struct dentry *dentry) { + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + __le16 *utf16_path __free(kfree) = NULL; + int retries = 0, cur_sleep = 1; + struct TCP_Server_Info *server; struct cifs_open_parms oparms; + struct smb2_create_req *creq; struct inode *inode = NULL; + struct smb_rqst rqst[2]; + struct kvec rsp_iov[2]; + struct kvec close_iov; + int resp_buftype[2]; + struct cifs_fid fid; + int flags = 0; + __u8 oplock; int rc; - if (dentry) + utf16_path = cifs_convert_path_to_utf16(name, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; +again: + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(tcon->ses); + + memset(rqst, 0, sizeof(rqst)); + memset(resp_buftype, 0, sizeof(resp_buftype)); + memset(rsp_iov, 0, sizeof(rsp_iov)); + + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = ARRAY_SIZE(open_iov); + + oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE | FILE_READ_ATTRIBUTES, + FILE_OPEN, CREATE_DELETE_ON_CLOSE | + OPEN_REPARSE_POINT, ACL_NO_MODE); + oparms.fid = &fid; + + if (dentry) { inode = d_inode(dentry); + if (CIFS_I(inode)->lease_granted && server->ops->get_lease_key) { + oplock = SMB2_OPLOCK_LEVEL_LEASE; + server->ops->get_lease_key(inode, &fid); + } + } - oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE, - FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE); - rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_UNLINK}, - 1, NULL, NULL, NULL, dentry); - if (rc == -EINVAL) { - cifs_dbg(FYI, "invalid lease key, resending request without lease"); - rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_UNLINK}, - 1, NULL, NULL, NULL, NULL); + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, utf16_path); + if (rc) + goto err_free; + smb2_set_next_command(tcon, &rqst[0]); + creq = rqst[0].rq_iov[0].iov_base; + creq->ShareAccess = FILE_SHARE_DELETE_LE; + + rqst[1].rq_iov = &close_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_close_init(tcon, server, &rqst[1], + COMPOUND_FID, COMPOUND_FID, false); + smb2_set_related(&rqst[1]); + if (rc) + goto err_free; + + if (retries) { + for (int i = 0; i < ARRAY_SIZE(rqst); i++) + smb2_set_replay(server, &rqst[i]); + } + + rc = compound_send_recv(xid, tcon->ses, server, flags, + ARRAY_SIZE(rqst), rqst, + resp_buftype, rsp_iov); + SMB2_open_free(&rqst[0]); + SMB2_close_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + + if 
(is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto again; + + /* Retry compound request without lease */ + if (rc == -EINVAL && dentry) { + dentry = NULL; + retries = 0; + cur_sleep = 1; + goto again; } /* * If dentry (hence, inode) is NULL, lease break is going to @@ -1199,6 +1268,14 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, */ if (!rc && inode) cifs_mark_open_handles_for_deleted_file(inode, name); + + return rc; + +err_free: + SMB2_open_free(&rqst[0]); + SMB2_close_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); return rc; } From 1a194e6c8e1ee745e914b0b7f50fa86c89ed13fe Mon Sep 17 00:00:00 2001 From: Samasth Norway Ananda Date: Fri, 12 Sep 2025 10:00:23 -0700 Subject: [PATCH 2412/3206] fbcon: fix integer overflow in fbcon_do_set_font MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix integer overflow vulnerabilities in fbcon_do_set_font() where font size calculations could overflow when handling user-controlled font parameters. The vulnerabilities occur when: 1. CALC_FONTSZ(h, pitch, charcount) performs the h * pitch * charcount multiplication with user-controlled values that can overflow. 2. The FONT_EXTRA_WORDS * sizeof(int) + size addition can also overflow. 3. This results in smaller allocations than expected, leading to buffer overflows during font data copying. Add explicit overflow checking using the check_mul_overflow() and check_add_overflow() kernel helpers to safely validate all size calculations before allocation. Signed-off-by: Samasth Norway Ananda Reviewed-by: Thomas Zimmermann Fixes: 39b3cffb8cf3 ("fbcon: prevent user font height or width change from causing potential out-of-bounds access") Cc: George Kennedy Cc: stable Cc: syzbot+38a3699c7eaf165b97a6@syzkaller.appspotmail.com Cc: Greg Kroah-Hartman Cc: Simona Vetter Cc: Helge Deller Cc: Thomas Zimmermann Cc: "Ville Syrjälä" Cc: Sam Ravnborg Cc: Qianqiang Liu Cc: Shixiong Ou Cc: Kees Cook Cc: # v5.9+ Signed-off-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20250912170023.3931881-1-samasth.norway.ananda@oracle.com --- drivers/video/fbdev/core/fbcon.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 55f5731e94c318..a507d05f8feaa8 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2531,9 +2531,16 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, if (fbcon_invalid_charcount(info, charcount)) return -EINVAL; - size = CALC_FONTSZ(h, pitch, charcount); + /* Check for integer overflow in font size calculation */ + if (check_mul_overflow(h, pitch, &size) || + check_mul_overflow(size, charcount, &size)) + return -EINVAL; + + /* Check for overflow in allocation size calculation */ + if (check_add_overflow(FONT_EXTRA_WORDS * sizeof(int), size, &size)) + return -EINVAL; - new_data = kmalloc(FONT_EXTRA_WORDS * sizeof(int) + size, GFP_USER); + new_data = kmalloc(size, GFP_USER); if (!new_data) return -ENOMEM; From 9d52b0b41be5b932a0a929c10038f1bb04af4ca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Sun, 21 Sep 2025 18:28:47 +0200 Subject: [PATCH 2413/3206] xen: take system_transition_mutex on suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Xen's do_suspend() calls dpm_suspend_start() without taking 
required system_transition_mutex. Since 12ffc3b1513eb moved the pm_restrict_gfp_mask() call, not taking that mutex results in a WARN. Take the mutex in do_suspend(), and use mutex_trylock() to follow how enter_state() does this. Suggested-by: Jürgen Groß Fixes: 12ffc3b1513eb "PM: Restrict swap use to later in the suspend sequence" Link: https://lore.kernel.org/xen-devel/aKiBJeqsYx_4Top5@mail-itl/ Signed-off-by: Marek Marczykowski-Górecki Cc: stable@vger.kernel.org # v6.16+ Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Message-ID: <20250921162853.223116-1-marmarek@invisiblethingslab.com> --- drivers/xen/manage.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 1f5a7a42fc3278..e20c40a62e64e2 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -95,10 +96,16 @@ static void do_suspend(void) shutting_down = SHUTDOWN_SUSPEND; + if (!mutex_trylock(&system_transition_mutex)) + { + pr_err("%s: failed to take system_transition_mutex\n", __func__); + goto out; + } + err = freeze_processes(); if (err) { pr_err("%s: freeze processes failed %d\n", __func__, err); - goto out; + goto out_unlock; } err = freeze_kernel_threads(); @@ -155,6 +162,8 @@ static void do_suspend(void) out_thaw: thaw_processes(); +out_unlock: + mutex_unlock(&system_transition_mutex); out: shutting_down = SHUTDOWN_INVALID; } From 81ef2022b311c7c4f29011f778442635acfaba90 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Sun, 21 Sep 2025 12:26:45 +0100 Subject: [PATCH 2414/3206] spi: rpc-if: Drop deprecated SIMPLE_DEV_PM_OPS Replace SIMPLE_DEV_PM_OPS->DEFINE_SIMPLE_DEV_PM_OPS macro and use pm_sleep_ptr(). This lets us drop the check for CONFIG_PM_SLEEP, and reduces kernel size in case CONFIG_PM or CONFIG_PM_SLEEP is disabled, while increasing build coverage. Also drop the __maybe_unused attribute from PM functions. 
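For context, the general shape of this conversion looks as follows (a sketch using a hypothetical my_* platform driver, not taken from this patch):

#include <linux/platform_device.h>
#include <linux/pm.h>

static int my_suspend(struct device *dev) { return 0; }
static int my_resume(struct device *dev) { return 0; }

/* No __maybe_unused and no #ifdef CONFIG_PM_SLEEP needed: the functions
 * are always referenced, and pm_sleep_ptr() evaluates to NULL when
 * CONFIG_PM_SLEEP is disabled, so dead-code elimination drops them. */
static DEFINE_SIMPLE_DEV_PM_OPS(my_pm_ops, my_suspend, my_resume);

static struct platform_driver my_driver = {
	.driver = {
		.name = "my-driver",
		.pm = pm_sleep_ptr(&my_pm_ops),
	},
};
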
Signed-off-by: Biju Das Link: https://patch.msgid.link/20250921112649.104516-2-biju.das.jz@bp.renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rpc-if.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-rpc-if.c b/drivers/spi/spi-rpc-if.c index 627cffea5d5c7e..a8e783cc66e78c 100644 --- a/drivers/spi/spi-rpc-if.c +++ b/drivers/spi/spi-rpc-if.c @@ -196,21 +196,21 @@ static void rpcif_spi_remove(struct platform_device *pdev) pm_runtime_disable(rpc->dev); } -static int __maybe_unused rpcif_spi_suspend(struct device *dev) +static int rpcif_spi_suspend(struct device *dev) { struct spi_controller *ctlr = dev_get_drvdata(dev); return spi_controller_suspend(ctlr); } -static int __maybe_unused rpcif_spi_resume(struct device *dev) +static int rpcif_spi_resume(struct device *dev) { struct spi_controller *ctlr = dev_get_drvdata(dev); return spi_controller_resume(ctlr); } -static SIMPLE_DEV_PM_OPS(rpcif_spi_pm_ops, rpcif_spi_suspend, rpcif_spi_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(rpcif_spi_pm_ops, rpcif_spi_suspend, rpcif_spi_resume); static const struct platform_device_id rpc_if_spi_id_table[] = { { .name = "rpc-if-spi" }, @@ -224,9 +224,7 @@ static struct platform_driver rpcif_spi_driver = { .id_table = rpc_if_spi_id_table, .driver = { .name = "rpc-if-spi", -#ifdef CONFIG_PM_SLEEP - .pm = &rpcif_spi_pm_ops, -#endif + .pm = pm_sleep_ptr(&rpcif_spi_pm_ops), }, }; module_platform_driver(rpcif_spi_driver); From ad4728740bd68d74365a43acc25a65339a9b2173 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Sun, 21 Sep 2025 12:26:46 +0100 Subject: [PATCH 2415/3206] spi: rpc-if: Add resume support for RZ/G3E On RZ/G3E using PSCI, s2ram powers down the SoC. After resume, reinitialize the hardware for SPI operations. Signed-off-by: Biju Das Link: https://patch.msgid.link/20250921112649.104516-3-biju.das.jz@bp.renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rpc-if.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-rpc-if.c b/drivers/spi/spi-rpc-if.c index a8e783cc66e78c..6edc0c4db854db 100644 --- a/drivers/spi/spi-rpc-if.c +++ b/drivers/spi/spi-rpc-if.c @@ -207,6 +207,8 @@ static int rpcif_spi_resume(struct device *dev) { struct spi_controller *ctlr = dev_get_drvdata(dev); + rpcif_hw_init(dev, false); + return spi_controller_resume(ctlr); } From 9cf5b8b69bfccf3d98d31f640244437a452daa80 Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Sat, 20 Sep 2025 14:20:44 +0800 Subject: [PATCH 2416/3206] ASoC: tas2781: Correct the wrong description and register address on tas2781 Correct the wrong description for TAS257X. Combined TAS5825 with TAS2563, as they use the same register address and number. Correct the register address and number for TAS5827. 
Fixes: 7095d688de38 ("ASoC: tas2781: Add tas2118, tas2x20, tas5825 support") Signed-off-by: Baojun Xu Acked-by: Mark Brown Signed-off-by: Takashi Iwai --- .../devicetree/bindings/sound/ti,tas2781.yaml | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/devicetree/bindings/sound/ti,tas2781.yaml b/Documentation/devicetree/bindings/sound/ti,tas2781.yaml index 011211112be4be..bd00afa47d62b3 100644 --- a/Documentation/devicetree/bindings/sound/ti,tas2781.yaml +++ b/Documentation/devicetree/bindings/sound/ti,tas2781.yaml @@ -11,11 +11,13 @@ maintainers: - Shenghao Ding description: | - The TAS2118/TAS2X20/TAS257x is mono, digital input Class-D audio + The TAS2118/TAS2X20 is mono, digital input Class-D audio amplifier optimized for efficiently driving high peak power into small loudspeakers. - Integrated speaker voltage and current sense provides for - real time monitoring of loudspeaker behavior. + The TAS257x is mono, digital input Class-D audio amplifier optimized + for efficiently driving high peak power into small loudspeakers. + Integrated speaker voltage and current sense provides for real time + monitoring of loudspeaker behavior. The TAS2563/TAS2781 is a mono, digital input Class-D audio amplifier optimized for efficiently driving high peak power into small loudspeakers. An integrated on-chip DSP supports Texas @@ -25,9 +27,7 @@ description: | The TAS5825/TAS5827 is a stereo, digital input Class-D audio amplifier optimized for efficiently driving high peak power into small loudspeakers. An integrated on-chip DSP supports Texas - Instruments Smart Amp speaker protection algorithm. The - integrated speaker voltage and current sense provides for real time - monitoring of loudspeaker behavior. + Instruments Smart Amp speaker protection algorithm. Specifications about the audio amplifier can be found at: https://www.ti.com/lit/gpn/tas2120 @@ -131,6 +131,7 @@ allOf: contains: enum: - ti,tas2563 + - ti,tas5825 then: properties: reg: @@ -181,15 +182,14 @@ allOf: compatible: contains: enum: - - ti,tas5825 - ti,tas5827 then: properties: reg: - maxItems: 4 + maxItems: 6 items: - minimum: 0x4c - maximum: 0x4f + minimum: 0x60 + maximum: 0x65 additionalProperties: false From 398a8a4e51dbd03e4103ea596ea4ea037fe67175 Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (Schneider Electric)" Date: Fri, 12 Sep 2025 09:08:58 +0200 Subject: [PATCH 2417/3206] spi: omap2-mcspi: drive SPI_CLK on transfer_setup() If the cached contents of the CHCONF register don't have the FORCE bit set, the setup() function fails to set the relevant idle state of the SPI_CLK pin. In such a case, the SPI_CLK's idle state is reached later with set_cs(), but it's too late for the first SPI transfer, which fails since the CS is asserted before the clock reaches its idle state. Add a first write in setup() that always sets the FORCE bit. Keep the current write afterwards to ensure the FORCE bit won't stay in the cached contents of the CHCONF register unless it's intended. 
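As a sketch, the resulting ordering in setup() (function and register names are from the driver; the timeline comments are illustrative):

/* Before the fix:
 *   setup()  - cached CHCONF value written with FORCE clear, so
 *              SPI_CLK is not yet driven to its idle level
 *   set_cs() - CS asserted; only now is SPI_CLK forced to idle,
 *              too late for the first transfer
 * After the fix: */
mcspi_write_chconf0(spi, l | OMAP2_MCSPI_CHCONF_FORCE);	/* drive SPI_CLK to idle now */
mcspi_write_chconf0(spi, l);	/* keep FORCE out of the cached CHCONF value */
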
Signed-off-by: Bastien Curutchet (Schneider Electric) Link: https://patch.msgid.link/20250912-omap-spi-fix-v1-1-f925b0d27ede@bootlin.com Signed-off-by: Mark Brown --- drivers/spi/spi-omap2-mcspi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c index 6dc58a30804a12..69c2e9d9be3c38 100644 --- a/drivers/spi/spi-omap2-mcspi.c +++ b/drivers/spi/spi-omap2-mcspi.c @@ -988,6 +988,7 @@ static int omap2_mcspi_setup_transfer(struct spi_device *spi, else l &= ~OMAP2_MCSPI_CHCONF_PHA; + mcspi_write_chconf0(spi, l | OMAP2_MCSPI_CHCONF_FORCE); mcspi_write_chconf0(spi, l); cs->mode = spi->mode; From f8673e4069b2032bf9f854bae818a7bdbdca7520 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Wed, 17 Sep 2025 16:37:11 +0100 Subject: [PATCH 2418/3206] ASoC: dt-bindings: cirrus,cs35l41: Document the cirrus,subsystem-id property Add new property: cirrus,subsystem-id This new property is used to uniquely identify the system if device tree is used, to allow the driver to select the correct firmware and tuning for the system. The DSP driver searches for a compatible firmware (and tuning) based on what it is able to read from the hardware. However, the SSID is based on the system, and cannot be read from the hardware, therefore it needs to be read from the Device Tree. On ACPI-based systems, it is able to read this from the ACPI _SUB property, and to maintain compatibility with the driver between ACPI and Device Tree systems we need an equivalent property. Signed-off-by: Stefan Binding Link: https://patch.msgid.link/20250917153722.94978-2-sbinding@opensource.cirrus.com Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/cirrus,cs35l41.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/sound/cirrus,cs35l41.yaml b/Documentation/devicetree/bindings/sound/cirrus,cs35l41.yaml index 14dea1feefc5ae..e6cf2ebcd77714 100644 --- a/Documentation/devicetree/bindings/sound/cirrus,cs35l41.yaml +++ b/Documentation/devicetree/bindings/sound/cirrus,cs35l41.yaml @@ -151,6 +151,12 @@ properties: minimum: 0 maximum: 5 + cirrus,subsystem-id: + $ref: /schemas/types.yaml#/definitions/string + description: + Subsystem ID. If this property is present, it sets the system name, + used to identify the firmware and tuning to load. + required: - compatible - reg From 46c8b4d2a693eca69a2191436cffa44f489e98c7 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Wed, 17 Sep 2025 16:37:12 +0100 Subject: [PATCH 2419/3206] ASoC: cs35l41: Fallback to reading Subsystem ID property if not ACPI If ACPI is not used, then there is currently no way of reading a Subsystem ID property used for a system name to uniquely identify the system in order to load the correct firmware and tuning. Add a new property which can be read from device tree to be able to set the system name. 
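A minimal sketch of how a driver can consume the new property on non-ACPI systems (the follow-up patch wires this into cs35l41; the helper name below is hypothetical):

#include <linux/property.h>

static const char *read_subsystem_id(struct device *dev)
{
	const char *sub;

	/* Property absent: return NULL so the caller can fall back to
	 * the ACPI _SUB/_HID path or the legacy firmware path. */
	if (device_property_read_string(dev, "cirrus,subsystem-id", &sub))
		return NULL;
	return sub;
}
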
Signed-off-by: Stefan Binding Link: https://patch.msgid.link/20250917153722.94978-3-sbinding@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l41.c | 77 ++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/sound/soc/codecs/cs35l41.c b/sound/soc/codecs/cs35l41.c index 224d65987a8df3..173d7c59b7254d 100644 --- a/sound/soc/codecs/cs35l41.c +++ b/sound/soc/codecs/cs35l41.c @@ -7,6 +7,7 @@ // Author: David Rhodes #include +#include #include #include #include @@ -1147,45 +1148,55 @@ static int cs35l41_dsp_init(struct cs35l41_private *cs35l41) return ret; } -#ifdef CONFIG_ACPI -static int cs35l41_acpi_get_name(struct cs35l41_private *cs35l41) +static int cs35l41_get_system_name(struct cs35l41_private *cs35l41) { struct acpi_device *adev = ACPI_COMPANION(cs35l41->dev); - acpi_handle handle = acpi_device_handle(adev); - const char *hid; - const char *sub; - - /* If there is no acpi_device, there is no ACPI for this system, return 0 */ - if (!adev) - return 0; + const char *sub = NULL; + const char *tmp; + int ret = 0; - sub = acpi_get_subsystem_id(handle); - if (IS_ERR(sub)) { - /* If no _SUB, fallback to _HID, otherwise fail */ - if (PTR_ERR(sub) == -ENODATA) { - hid = acpi_device_hid(adev); - /* If dummy hid, return 0 and fallback to legacy firmware path */ - if (!strcmp(hid, "device")) - return 0; - sub = kstrdup(hid, GFP_KERNEL); - if (!sub) - sub = ERR_PTR(-ENOMEM); - - } else - return PTR_ERR(sub); + /* If there is no acpi_device, there is no ACPI for this system, skip checking ACPI */ + if (adev) { + acpi_handle handle = acpi_device_handle(adev); + + sub = acpi_get_subsystem_id(handle); + ret = PTR_ERR_OR_ZERO(sub); + if (ret) { + sub = NULL; + /* If no _SUB, fallback to _HID, otherwise fail */ + if (ret == -ENODATA) { + tmp = acpi_device_hid(adev); + /* If dummy hid, return 0 and fallback to legacy firmware path */ + if (!strcmp(tmp, "device")) { + ret = 0; + goto err; + } + sub = kstrdup(tmp, GFP_KERNEL); + if (!sub) { + ret = -ENOMEM; + goto err; + } + } + } + } else { + if (!device_property_read_string(cs35l41->dev, "cirrus,subsystem-id", &tmp)) { + sub = kstrdup(tmp, GFP_KERNEL); + if (!sub) { + ret = -ENOMEM; + goto err; + } + } } - cs35l41->dsp.system_name = sub; - dev_dbg(cs35l41->dev, "Subsystem ID: %s\n", cs35l41->dsp.system_name); +err: + if (sub) { + cs35l41->dsp.system_name = sub; + dev_info(cs35l41->dev, "Subsystem ID: %s\n", cs35l41->dsp.system_name); + } else + dev_warn(cs35l41->dev, "Subsystem ID not found\n"); - return 0; -} -#else -static int cs35l41_acpi_get_name(struct cs35l41_private *cs35l41) -{ - return 0; + return ret; } -#endif /* CONFIG_ACPI */ int cs35l41_probe(struct cs35l41_private *cs35l41, const struct cs35l41_hw_cfg *hw_cfg) { @@ -1317,7 +1328,7 @@ int cs35l41_probe(struct cs35l41_private *cs35l41, const struct cs35l41_hw_cfg * goto err; } - ret = cs35l41_acpi_get_name(cs35l41); + ret = cs35l41_get_system_name(cs35l41); if (ret < 0) goto err; From d9a2211dd3aee3ef29fc675f70a1941bc3f4f51f Mon Sep 17 00:00:00 2001 From: Haixu Cui Date: Mon, 8 Sep 2025 17:23:46 +0800 Subject: [PATCH 2420/3206] virtio: Add ID for virtio SPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VIRTIO_ID_SPI definition for virtio SPI. 
Signed-off-by: Haixu Cui Reviewed-by: Viresh Kumar Reviewed-by: Alex Bennée Link: https://patch.msgid.link/20250908092348.1283552-2-quic_haixcui@quicinc.com Signed-off-by: Mark Brown --- include/uapi/linux/virtio_ids.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 7aa2eb76620508..6c12db16faa3ad 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -68,6 +68,7 @@ #define VIRTIO_ID_AUDIO_POLICY 39 /* virtio audio policy */ #define VIRTIO_ID_BT 40 /* virtio bluetooth */ #define VIRTIO_ID_GPIO 41 /* virtio gpio */ +#define VIRTIO_ID_SPI 45 /* virtio spi */ /* * Virtio Transitional IDs From 6a1f3390fafeafe130b8128b3047452b92911a98 Mon Sep 17 00:00:00 2001 From: Haixu Cui Date: Mon, 8 Sep 2025 17:23:47 +0800 Subject: [PATCH 2421/3206] virtio-spi: Add virtio-spi.h Add virtio-spi.h header for virtio SPI. Signed-off-by: Haixu Cui Link: https://patch.msgid.link/20250908092348.1283552-3-quic_haixcui@quicinc.com Signed-off-by: Mark Brown --- MAINTAINERS | 6 ++ include/uapi/linux/virtio_spi.h | 181 ++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 include/uapi/linux/virtio_spi.h diff --git a/MAINTAINERS b/MAINTAINERS index 931e68699ab9ac..18cd254aa85be9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26803,6 +26803,12 @@ S: Maintained F: include/uapi/linux/virtio_snd.h F: sound/virtio/* +VIRTIO SPI DRIVER +M: Haixu Cui +L: virtualization@lists.linux.dev +S: Maintained +F: include/uapi/linux/virtio_spi.h + VIRTUAL BOX GUEST DEVICE DRIVER M: Hans de Goede M: Arnd Bergmann diff --git a/include/uapi/linux/virtio_spi.h b/include/uapi/linux/virtio_spi.h new file mode 100644 index 00000000000000..8ab3c970cdd311 --- /dev/null +++ b/include/uapi/linux/virtio_spi.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* + * Copyright (C) 2023 OpenSynergy GmbH + * Copyright (C) 2025 Qualcomm Innovation Center, Inc. All rights reserved. + */ +#ifndef _LINUX_VIRTIO_VIRTIO_SPI_H +#define _LINUX_VIRTIO_VIRTIO_SPI_H + +#include +#include +#include +#include + +/* Sample data on trailing clock edge */ +#define VIRTIO_SPI_CPHA _BITUL(0) +/* Clock is high when IDLE */ +#define VIRTIO_SPI_CPOL _BITUL(1) +/* Chip Select is active high */ +#define VIRTIO_SPI_CS_HIGH _BITUL(2) +/* Transmit LSB first */ +#define VIRTIO_SPI_MODE_LSB_FIRST _BITUL(3) +/* Loopback mode */ +#define VIRTIO_SPI_MODE_LOOP _BITUL(4) + +/** + * struct virtio_spi_config - All config fields are read-only for the + * Virtio SPI driver + * @cs_max_number: maximum number of chipselect the host SPI controller + * supports. + * @cs_change_supported: indicates if the host SPI controller supports to toggle + * chipselect after each transfer in one message: + * 0: unsupported, chipselect will be kept in active state throughout the + * message transaction; + * 1: supported. + * Note: Message here contains a sequence of SPI transfers. + * @tx_nbits_supported: indicates the supported number of bit for writing: + * bit 0: DUAL (2-bit transfer), 1 for supported + * bit 1: QUAD (4-bit transfer), 1 for supported + * bit 2: OCTAL (8-bit transfer), 1 for supported + * other bits are reserved as 0, 1-bit transfer is always supported. 
+ * @rx_nbits_supported: indicates the supported number of bit for reading: + * bit 0: DUAL (2-bit transfer), 1 for supported + * bit 1: QUAD (4-bit transfer), 1 for supported + * bit 2: OCTAL (8-bit transfer), 1 for supported + * other bits are reserved as 0, 1-bit transfer is always supported. + * @bits_per_word_mask: mask indicating which values of bits_per_word are + * supported. If not set, no limitation for bits_per_word. + * @mode_func_supported: indicates the following features are supported or not: + * bit 0-1: CPHA feature + * 0b00: invalid, should support as least one CPHA setting + * 0b01: supports CPHA=0 only + * 0b10: supports CPHA=1 only + * 0b11: supports CPHA=0 and CPHA=1. + * bit 2-3: CPOL feature + * 0b00: invalid, should support as least one CPOL setting + * 0b01: supports CPOL=0 only + * 0b10: supports CPOL=1 only + * 0b11: supports CPOL=0 and CPOL=1. + * bit 4: chipselect active high feature, 0 for unsupported and 1 for + * supported, chipselect active low is supported by default. + * bit 5: LSB first feature, 0 for unsupported and 1 for supported, + * MSB first is supported by default. + * bit 6: loopback mode feature, 0 for unsupported and 1 for supported, + * normal mode is supported by default. + * @max_freq_hz: the maximum clock rate supported in Hz unit, 0 means no + * limitation for transfer speed. + * @max_word_delay_ns: the maximum word delay supported, in nanoseconds. + * A value of 0 indicates that word delay is unsupported. + * Each transfer may consist of a sequence of words. + * @max_cs_setup_ns: the maximum delay supported after chipselect is asserted, + * in ns unit, 0 means delay is not supported to introduce after chipselect is + * asserted. + * @max_cs_hold_ns: the maximum delay supported before chipselect is deasserted, + * in ns unit, 0 means delay is not supported to introduce before chipselect + * is deasserted. + * @max_cs_incative_ns: maximum delay supported after chipselect is deasserted, + * in ns unit, 0 means delay is not supported to introduce after chipselect is + * deasserted. + */ +struct virtio_spi_config { + __u8 cs_max_number; + __u8 cs_change_supported; +#define VIRTIO_SPI_RX_TX_SUPPORT_DUAL _BITUL(0) +#define VIRTIO_SPI_RX_TX_SUPPORT_QUAD _BITUL(1) +#define VIRTIO_SPI_RX_TX_SUPPORT_OCTAL _BITUL(2) + __u8 tx_nbits_supported; + __u8 rx_nbits_supported; + __le32 bits_per_word_mask; +#define VIRTIO_SPI_MF_SUPPORT_CPHA_0 _BITUL(0) +#define VIRTIO_SPI_MF_SUPPORT_CPHA_1 _BITUL(1) +#define VIRTIO_SPI_MF_SUPPORT_CPOL_0 _BITUL(2) +#define VIRTIO_SPI_MF_SUPPORT_CPOL_1 _BITUL(3) +#define VIRTIO_SPI_MF_SUPPORT_CS_HIGH _BITUL(4) +#define VIRTIO_SPI_MF_SUPPORT_LSB_FIRST _BITUL(5) +#define VIRTIO_SPI_MF_SUPPORT_LOOPBACK _BITUL(6) + __le32 mode_func_supported; + __le32 max_freq_hz; + __le32 max_word_delay_ns; + __le32 max_cs_setup_ns; + __le32 max_cs_hold_ns; + __le32 max_cs_inactive_ns; +}; + +/** + * struct spi_transfer_head - virtio SPI transfer descriptor + * @chip_select_id: chipselect index the SPI transfer used. + * @bits_per_word: the number of bits in each SPI transfer word. + * @cs_change: whether to deselect device after finishing this transfer + * before starting the next transfer, 0 means cs keep asserted and + * 1 means cs deasserted then asserted again. + * @tx_nbits: bus width for write transfer. + * 0,1: bus width is 1, also known as SINGLE + * 2 : bus width is 2, also known as DUAL + * 4 : bus width is 4, also known as QUAD + * 8 : bus width is 8, also known as OCTAL + * other values are invalid. 
+ * @rx_nbits: bus width for read transfer. + * 0,1: bus width is 1, also known as SINGLE + * 2 : bus width is 2, also known as DUAL + * 4 : bus width is 4, also known as QUAD + * 8 : bus width is 8, also known as OCTAL + * other values are invalid. + * @reserved: for future use. + * @mode: SPI transfer mode. + * bit 0: CPHA, determines the timing (i.e. phase) of the data + * bits relative to the clock pulses.For CPHA=0, the + * "out" side changes the data on the trailing edge of the + * preceding clock cycle, while the "in" side captures the data + * on (or shortly after) the leading edge of the clock cycle. + * For CPHA=1, the "out" side changes the data on the leading + * edge of the current clock cycle, while the "in" side + * captures the data on (or shortly after) the trailing edge of + * the clock cycle. + * bit 1: CPOL, determines the polarity of the clock. CPOL=0 is a + * clock which idles at 0, and each cycle consists of a pulse + * of 1. CPOL=1 is a clock which idles at 1, and each cycle + * consists of a pulse of 0. + * bit 2: CS_HIGH, if 1, chip select active high, else active low. + * bit 3: LSB_FIRST, determines per-word bits-on-wire, if 0, MSB + * first, else LSB first. + * bit 4: LOOP, loopback mode. + * @freq: the transfer speed in Hz. + * @word_delay_ns: delay to be inserted between consecutive words of a + * transfer, in ns unit. + * @cs_setup_ns: delay to be introduced after CS is asserted, in ns + * unit. + * @cs_delay_hold_ns: delay to be introduced before CS is deasserted + * for each transfer, in ns unit. + * @cs_change_delay_inactive_ns: delay to be introduced after CS is + * deasserted and before next asserted, in ns unit. + */ +struct spi_transfer_head { + __u8 chip_select_id; + __u8 bits_per_word; + __u8 cs_change; + __u8 tx_nbits; + __u8 rx_nbits; + __u8 reserved[3]; + __le32 mode; + __le32 freq; + __le32 word_delay_ns; + __le32 cs_setup_ns; + __le32 cs_delay_hold_ns; + __le32 cs_change_delay_inactive_ns; +}; + +/** + * struct spi_transfer_result - virtio SPI transfer result + * @result: Transfer result code. + * VIRTIO_SPI_TRANS_OK: Transfer successful. + * VIRTIO_SPI_PARAM_ERR: Parameter error. + * VIRTIO_SPI_TRANS_ERR: Transfer error. + */ +struct spi_transfer_result { +#define VIRTIO_SPI_TRANS_OK 0 +#define VIRTIO_SPI_PARAM_ERR 1 +#define VIRTIO_SPI_TRANS_ERR 2 + __u8 result; +}; + +#endif /* #ifndef _LINUX_VIRTIO_VIRTIO_SPI_H */ From f98cabe3f6cf6396b3ae0264800d9b53d7612433 Mon Sep 17 00:00:00 2001 From: Haixu Cui Date: Mon, 8 Sep 2025 17:23:48 +0800 Subject: [PATCH 2422/3206] SPI: Add virtio SPI driver This is the virtio SPI Linux kernel driver. Signed-off-by: Haixu Cui Link: https://patch.msgid.link/20250908092348.1283552-4-quic_haixcui@quicinc.com Signed-off-by: Mark Brown --- MAINTAINERS | 1 + drivers/spi/Kconfig | 11 + drivers/spi/Makefile | 1 + drivers/spi/spi-virtio.c | 431 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 444 insertions(+) create mode 100644 drivers/spi/spi-virtio.c diff --git a/MAINTAINERS b/MAINTAINERS index 18cd254aa85be9..21abd9cf86f9d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26807,6 +26807,7 @@ VIRTIO SPI DRIVER M: Haixu Cui L: virtualization@lists.linux.dev S: Maintained +F: drivers/spi/spi-virtio.c F: include/uapi/linux/virtio_spi.h VIRTUAL BOX GUEST DEVICE DRIVER diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 3b864df9b4d6a3..e8a39e304c7eb6 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -1235,6 +1235,17 @@ config SPI_UNIPHIER If your SoC supports SCSSI, say Y here. 
+config SPI_VIRTIO + tristate "Virtio SPI Controller" + depends on SPI_MASTER && VIRTIO + help + If you say yes to this option, support will be included for the virtio + SPI controller driver. The hardware can be emulated by any device model + software according to the virtio protocol. + + This driver can also be built as a module. If so, the module + will be called spi-virtio. + config SPI_XCOMM tristate "Analog Devices AD-FMCOMMS1-EBZ SPI-I2C-bridge driver" depends on I2C diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile index eefaeca0974566..8ff74a13faaa88 100644 --- a/drivers/spi/Makefile +++ b/drivers/spi/Makefile @@ -159,6 +159,7 @@ spi-thunderx-objs := spi-cavium.o spi-cavium-thunderx.o obj-$(CONFIG_SPI_THUNDERX) += spi-thunderx.o obj-$(CONFIG_SPI_TOPCLIFF_PCH) += spi-topcliff-pch.o obj-$(CONFIG_SPI_UNIPHIER) += spi-uniphier.o +obj-$(CONFIG_SPI_VIRTIO) += spi-virtio.o obj-$(CONFIG_SPI_XCOMM) += spi-xcomm.o obj-$(CONFIG_SPI_XILINX) += spi-xilinx.o obj-$(CONFIG_SPI_XLP) += spi-xlp.o diff --git a/drivers/spi/spi-virtio.c b/drivers/spi/spi-virtio.c new file mode 100644 index 00000000000000..2acb929b2c6907 --- /dev/null +++ b/drivers/spi/spi-virtio.c @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SPI bus driver for the Virtio SPI controller + * Copyright (C) 2023 OpenSynergy GmbH + * Copyright (C) 2025 Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VIRTIO_SPI_MODE_MASK \ + (SPI_MODE_X_MASK | SPI_CS_HIGH | SPI_LSB_FIRST) + +struct virtio_spi_req { + struct completion completion; + const u8 *tx_buf; + u8 *rx_buf; + struct spi_transfer_head transfer_head ____cacheline_aligned; + struct spi_transfer_result result; +}; + +struct virtio_spi_priv { + /* The virtio device we're associated with */ + struct virtio_device *vdev; + /* Pointer to the virtqueue */ + struct virtqueue *vq; + /* Copy of config space mode_func_supported */ + u32 mode_func_supported; + /* Copy of config space max_freq_hz */ + u32 max_freq_hz; +}; + +static void virtio_spi_msg_done(struct virtqueue *vq) +{ + struct virtio_spi_req *req; + unsigned int len; + + while ((req = virtqueue_get_buf(vq, &len))) + complete(&req->completion); +} + +/* + * virtio_spi_set_delays - Set delay parameters for SPI transfer + * + * This function sets various delay parameters for SPI transfer, + * including delay after CS asserted, timing intervals between + * adjacent words within a transfer, delay before and after CS + * deasserted. It converts these delay parameters to nanoseconds + * using spi_delay_to_ns and stores the results in spi_transfer_head + * structure. + * If the conversion fails, the function logs a warning message and + * returns an error code. + * . . . . . . . . . . + * Delay + A + + B + + C + D + E + F + A + + * . . . . . . . . . . + * ___. . . . . . .___.___. . + * CS# |___.______.____.____.___.___| . |___._____________ + * . . . . . . . . . . + * . . . . . . . . . . + * SCLK__.___.___NNN_____NNN__.___.___.___.___.___.___NNN_______ + * + * NOTE: 1st transfer has two words, the delay between these two words are + * 'B' in the diagram. 
+ * + * A => struct spi_device -> cs_setup + * B => max{struct spi_transfer -> word_delay, struct spi_device -> word_delay} + * Note: spi_device and spi_transfer both have word_delay, Linux + * choose the bigger one, refer to _spi_xfer_word_delay_update function + * C => struct spi_transfer -> delay + * D => struct spi_device -> cs_hold + * E => struct spi_device -> cs_inactive + * F => struct spi_transfer -> cs_change_delay + * + * So the corresponding relationship: + * A <===> cs_setup_ns (after CS asserted) + * B <===> word_delay_ns (delay between adjacent words within a transfer) + * C+D <===> cs_delay_hold_ns (before CS deasserted) + * E+F <===> cs_change_delay_inactive_ns (after CS deasserted, these two + * values are also recommended in the Linux driver to be added up) + */ +static int virtio_spi_set_delays(struct spi_transfer_head *th, + struct spi_device *spi, + struct spi_transfer *xfer) +{ + int cs_setup; + int cs_word_delay_xfer; + int cs_word_delay_spi; + int delay; + int cs_hold; + int cs_inactive; + int cs_change_delay; + + cs_setup = spi_delay_to_ns(&spi->cs_setup, xfer); + if (cs_setup < 0) { + dev_warn(&spi->dev, "Cannot convert cs_setup\n"); + return cs_setup; + } + th->cs_setup_ns = cpu_to_le32(cs_setup); + + cs_word_delay_xfer = spi_delay_to_ns(&xfer->word_delay, xfer); + if (cs_word_delay_xfer < 0) { + dev_warn(&spi->dev, "Cannot convert cs_word_delay_xfer\n"); + return cs_word_delay_xfer; + } + cs_word_delay_spi = spi_delay_to_ns(&spi->word_delay, xfer); + if (cs_word_delay_spi < 0) { + dev_warn(&spi->dev, "Cannot convert cs_word_delay_spi\n"); + return cs_word_delay_spi; + } + + th->word_delay_ns = cpu_to_le32(max(cs_word_delay_spi, cs_word_delay_xfer)); + + delay = spi_delay_to_ns(&xfer->delay, xfer); + if (delay < 0) { + dev_warn(&spi->dev, "Cannot convert delay\n"); + return delay; + } + cs_hold = spi_delay_to_ns(&spi->cs_hold, xfer); + if (cs_hold < 0) { + dev_warn(&spi->dev, "Cannot convert cs_hold\n"); + return cs_hold; + } + th->cs_delay_hold_ns = cpu_to_le32(delay + cs_hold); + + cs_inactive = spi_delay_to_ns(&spi->cs_inactive, xfer); + if (cs_inactive < 0) { + dev_warn(&spi->dev, "Cannot convert cs_inactive\n"); + return cs_inactive; + } + cs_change_delay = spi_delay_to_ns(&xfer->cs_change_delay, xfer); + if (cs_change_delay < 0) { + dev_warn(&spi->dev, "Cannot convert cs_change_delay\n"); + return cs_change_delay; + } + th->cs_change_delay_inactive_ns = + cpu_to_le32(cs_inactive + cs_change_delay); + + return 0; +} + +static int virtio_spi_transfer_one(struct spi_controller *ctrl, + struct spi_device *spi, + struct spi_transfer *xfer) +{ + struct virtio_spi_priv *priv = spi_controller_get_devdata(ctrl); + struct virtio_spi_req *spi_req __free(kfree) = NULL; + struct spi_transfer_head *th; + struct scatterlist sg_out_head, sg_out_payload; + struct scatterlist sg_in_result, sg_in_payload; + struct scatterlist *sgs[4]; + unsigned int outcnt = 0; + unsigned int incnt = 0; + int ret; + + spi_req = kzalloc(sizeof(*spi_req), GFP_KERNEL); + if (!spi_req) + return -ENOMEM; + + init_completion(&spi_req->completion); + + th = &spi_req->transfer_head; + + /* Fill struct spi_transfer_head */ + th->chip_select_id = spi_get_chipselect(spi, 0); + th->bits_per_word = spi->bits_per_word; + th->cs_change = xfer->cs_change; + th->tx_nbits = xfer->tx_nbits; + th->rx_nbits = xfer->rx_nbits; + th->reserved[0] = 0; + th->reserved[1] = 0; + th->reserved[2] = 0; + + static_assert(VIRTIO_SPI_CPHA == SPI_CPHA, + "VIRTIO_SPI_CPHA must match SPI_CPHA"); + static_assert(VIRTIO_SPI_CPOL 
== SPI_CPOL, + "VIRTIO_SPI_CPOL must match SPI_CPOL"); + static_assert(VIRTIO_SPI_CS_HIGH == SPI_CS_HIGH, + "VIRTIO_SPI_CS_HIGH must match SPI_CS_HIGH"); + static_assert(VIRTIO_SPI_MODE_LSB_FIRST == SPI_LSB_FIRST, + "VIRTIO_SPI_MODE_LSB_FIRST must match SPI_LSB_FIRST"); + + th->mode = cpu_to_le32(spi->mode & VIRTIO_SPI_MODE_MASK); + if (spi->mode & SPI_LOOP) + th->mode |= cpu_to_le32(VIRTIO_SPI_MODE_LOOP); + + th->freq = cpu_to_le32(xfer->speed_hz); + + ret = virtio_spi_set_delays(th, spi, xfer); + if (ret) + goto msg_done; + + /* Set buffers */ + spi_req->tx_buf = xfer->tx_buf; + spi_req->rx_buf = xfer->rx_buf; + + /* Prepare sending of virtio message */ + init_completion(&spi_req->completion); + + sg_init_one(&sg_out_head, th, sizeof(*th)); + sgs[outcnt] = &sg_out_head; + outcnt++; + + if (spi_req->tx_buf) { + sg_init_one(&sg_out_payload, spi_req->tx_buf, xfer->len); + sgs[outcnt] = &sg_out_payload; + outcnt++; + } + + if (spi_req->rx_buf) { + sg_init_one(&sg_in_payload, spi_req->rx_buf, xfer->len); + sgs[outcnt] = &sg_in_payload; + incnt++; + } + + sg_init_one(&sg_in_result, &spi_req->result, + sizeof(struct spi_transfer_result)); + sgs[outcnt + incnt] = &sg_in_result; + incnt++; + + ret = virtqueue_add_sgs(priv->vq, sgs, outcnt, incnt, spi_req, + GFP_KERNEL); + if (ret) + goto msg_done; + + /* Simple implementation: There can be only one transfer in flight */ + virtqueue_kick(priv->vq); + + wait_for_completion(&spi_req->completion); + + /* Read result from message and translate return code */ + switch (spi_req->result.result) { + case VIRTIO_SPI_TRANS_OK: + break; + case VIRTIO_SPI_PARAM_ERR: + ret = -EINVAL; + break; + case VIRTIO_SPI_TRANS_ERR: + ret = -EIO; + break; + default: + ret = -EIO; + break; + } + +msg_done: + if (ret) + ctrl->cur_msg->status = ret; + + return ret; +} + +static void virtio_spi_read_config(struct virtio_device *vdev) +{ + struct spi_controller *ctrl = dev_get_drvdata(&vdev->dev); + struct virtio_spi_priv *priv = vdev->priv; + u8 cs_max_number; + u8 tx_nbits_supported; + u8 rx_nbits_supported; + + cs_max_number = virtio_cread8(vdev, offsetof(struct virtio_spi_config, + cs_max_number)); + ctrl->num_chipselect = cs_max_number; + + /* Set the mode bits which are understood by this driver */ + priv->mode_func_supported = + virtio_cread32(vdev, offsetof(struct virtio_spi_config, + mode_func_supported)); + ctrl->mode_bits = priv->mode_func_supported & + (VIRTIO_SPI_CS_HIGH | VIRTIO_SPI_MODE_LSB_FIRST); + if (priv->mode_func_supported & VIRTIO_SPI_MF_SUPPORT_CPHA_1) + ctrl->mode_bits |= VIRTIO_SPI_CPHA; + if (priv->mode_func_supported & VIRTIO_SPI_MF_SUPPORT_CPOL_1) + ctrl->mode_bits |= VIRTIO_SPI_CPOL; + if (priv->mode_func_supported & VIRTIO_SPI_MF_SUPPORT_LSB_FIRST) + ctrl->mode_bits |= SPI_LSB_FIRST; + if (priv->mode_func_supported & VIRTIO_SPI_MF_SUPPORT_LOOPBACK) + ctrl->mode_bits |= SPI_LOOP; + tx_nbits_supported = + virtio_cread8(vdev, offsetof(struct virtio_spi_config, + tx_nbits_supported)); + if (tx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_DUAL) + ctrl->mode_bits |= SPI_TX_DUAL; + if (tx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_QUAD) + ctrl->mode_bits |= SPI_TX_QUAD; + if (tx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_OCTAL) + ctrl->mode_bits |= SPI_TX_OCTAL; + rx_nbits_supported = + virtio_cread8(vdev, offsetof(struct virtio_spi_config, + rx_nbits_supported)); + if (rx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_DUAL) + ctrl->mode_bits |= SPI_RX_DUAL; + if (rx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_QUAD) + ctrl->mode_bits |= SPI_RX_QUAD; + if 
(rx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_OCTAL) + ctrl->mode_bits |= SPI_RX_OCTAL; + + ctrl->bits_per_word_mask = + virtio_cread32(vdev, offsetof(struct virtio_spi_config, + bits_per_word_mask)); + + priv->max_freq_hz = + virtio_cread32(vdev, offsetof(struct virtio_spi_config, + max_freq_hz)); +} + +static int virtio_spi_find_vqs(struct virtio_spi_priv *priv) +{ + struct virtqueue *vq; + + vq = virtio_find_single_vq(priv->vdev, virtio_spi_msg_done, "spi-rq"); + if (IS_ERR(vq)) + return PTR_ERR(vq); + priv->vq = vq; + return 0; +} + +/* Function must not be called before virtio_spi_find_vqs() has been run */ +static void virtio_spi_del_vq(void *data) +{ + struct virtio_device *vdev = data; + + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); +} + +static int virtio_spi_probe(struct virtio_device *vdev) +{ + struct virtio_spi_priv *priv; + struct spi_controller *ctrl; + int ret; + + ctrl = devm_spi_alloc_host(&vdev->dev, sizeof(*priv)); + if (!ctrl) + return -ENOMEM; + + priv = spi_controller_get_devdata(ctrl); + priv->vdev = vdev; + vdev->priv = priv; + + device_set_node(&ctrl->dev, dev_fwnode(&vdev->dev)); + + dev_set_drvdata(&vdev->dev, ctrl); + + virtio_spi_read_config(vdev); + + ctrl->transfer_one = virtio_spi_transfer_one; + + ret = virtio_spi_find_vqs(priv); + if (ret) + return dev_err_probe(&vdev->dev, ret, "Cannot setup virtqueues\n"); + + /* Register cleanup for virtqueues using devm */ + ret = devm_add_action_or_reset(&vdev->dev, virtio_spi_del_vq, vdev); + if (ret) + return dev_err_probe(&vdev->dev, ret, "Cannot register virtqueue cleanup\n"); + + /* Use devm version to register controller */ + ret = devm_spi_register_controller(&vdev->dev, ctrl); + if (ret) + return dev_err_probe(&vdev->dev, ret, "Cannot register controller\n"); + + return 0; +} + +static int virtio_spi_freeze(struct device *dev) +{ + struct spi_controller *ctrl = dev_get_drvdata(dev); + struct virtio_device *vdev = dev_to_virtio(dev); + int ret; + + ret = spi_controller_suspend(ctrl); + if (ret) { + dev_warn(dev, "cannot suspend controller (%d)\n", ret); + return ret; + } + + virtio_spi_del_vq(vdev); + return 0; +} + +static int virtio_spi_restore(struct device *dev) +{ + struct spi_controller *ctrl = dev_get_drvdata(dev); + struct virtio_device *vdev = dev_to_virtio(dev); + int ret; + + ret = virtio_spi_find_vqs(vdev->priv); + if (ret) { + dev_err(dev, "problem starting vqueue (%d)\n", ret); + return ret; + } + + ret = spi_controller_resume(ctrl); + if (ret) + dev_err(dev, "problem resuming controller (%d)\n", ret); + + return ret; +} + +static struct virtio_device_id virtio_spi_id_table[] = { + { VIRTIO_ID_SPI, VIRTIO_DEV_ANY_ID }, + {} +}; +MODULE_DEVICE_TABLE(virtio, virtio_spi_id_table); + +static const struct dev_pm_ops virtio_spi_pm_ops = { + .freeze = pm_sleep_ptr(virtio_spi_freeze), + .restore = pm_sleep_ptr(virtio_spi_restore), +}; + +static struct virtio_driver virtio_spi_driver = { + .driver = { + .name = KBUILD_MODNAME, + .pm = &virtio_spi_pm_ops, + }, + .id_table = virtio_spi_id_table, + .probe = virtio_spi_probe, +}; +module_virtio_driver(virtio_spi_driver); + +MODULE_AUTHOR("OpenSynergy GmbH"); +MODULE_AUTHOR("Haixu Cui "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Virtio SPI bus driver"); From 188f63235bcdd207646773a8739387d85347ed76 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:19 +0200 Subject: [PATCH 2423/3206] spi: fix return code when spi device has too many chipselects Don't return a positive value when there are too many chipselects. 
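To illustrate the bug with a hedged sketch (not code from the patch): of_property_read_variable_u32_array() returns the number of elements it read on success, so the overflow path used to leak that positive count to callers that expect 0 or a negative errno:

	/* Sketch of the fixed check; the old code did "return rc;",
	 * where rc is a positive element count, not a -Exxx value. */
	rc = of_property_read_variable_u32_array(nc, "reg", &cs[0], 1,
						 SPI_CS_CNT_MAX);
	if (rc > ctlr->num_chipselect) {
		dev_err(&ctlr->dev,
			"%pOF has number of CS > ctlr->num_chipselect (%d)\n",
			nc, rc);
		return -EINVAL;	/* a real error code */
	}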
Fixes: 4d8ff6b0991d ("spi: Add multi-cs memories support in SPI core") Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-2-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index f95c4304df8e91..b07d6cdf587cee 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2445,7 +2445,7 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, if (rc > ctlr->num_chipselect) { dev_err(&ctlr->dev, "%pOF has number of CS > ctlr->num_chipselect (%d)\n", nc, rc); - return rc; + return -EINVAL; } if ((of_property_present(nc, "parallel-memories")) && (!(ctlr->flags & SPI_CONTROLLER_MULTI_CS))) { From 099f942182e3695554cba44e4bafb08a4111b50f Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:20 +0200 Subject: [PATCH 2424/3206] spi: keep track of number of chipselects in spi_device There are several places where we need to iterate over a device's chipselect. To be able to do it efficiently, store the number of chipselects in spi_device, like we do for controllers. Since we now use a device supplied value, add a check to make sure it isn't more than we can support. Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-3-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 29 +++++++++++++++++++++-------- include/linux/spi/spi.h | 4 +++- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index b07d6cdf587cee..6598fb862d80a7 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -586,6 +586,7 @@ struct spi_device *spi_alloc_device(struct spi_controller *ctlr) spi->dev.bus = &spi_bus_type; spi->dev.release = spidev_release; spi->mode = ctlr->buswidth_override_bits; + spi->num_chipselect = 1; device_initialize(&spi->dev); return spi; @@ -635,7 +636,7 @@ static inline int spi_dev_check_cs(struct device *dev, u8 idx_new; cs = spi_get_chipselect(spi, idx); - for (idx_new = new_idx; idx_new < SPI_CS_CNT_MAX; idx_new++) { + for (idx_new = new_idx; idx_new < new_spi->num_chipselect; idx_new++) { cs_new = spi_get_chipselect(new_spi, idx_new); if (is_valid_cs(cs) && is_valid_cs(cs_new) && cs == cs_new) { dev_err(dev, "chipselect %u already in use\n", cs_new); @@ -652,7 +653,7 @@ static int spi_dev_check(struct device *dev, void *data) int status, idx; if (spi->controller == new_spi->controller) { - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { status = spi_dev_check_cs(dev, spi, idx, new_spi, 0); if (status) return status; @@ -674,7 +675,13 @@ static int __spi_add_device(struct spi_device *spi) int status, idx; u8 cs; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (spi->num_chipselect > SPI_CS_CNT_MAX) { + dev_err(dev, "num_cs %d > max %d\n", spi->num_chipselect, + SPI_CS_CNT_MAX); + return -EOVERFLOW; + } + + for (idx = 0; idx < spi->num_chipselect; idx++) { /* Chipselects are numbered 0..max; validate. */ cs = spi_get_chipselect(spi, idx); if (is_valid_cs(cs) && cs >= ctlr->num_chipselect) { @@ -689,7 +696,7 @@ static int __spi_add_device(struct spi_device *spi) * For example, spi->chip_select[0] != spi->chip_select[1] and so on. 
*/ if (!spi_controller_is_target(ctlr)) { - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { status = spi_dev_check_cs(dev, spi, idx, spi, idx + 1); if (status) return status; @@ -717,7 +724,7 @@ static int __spi_add_device(struct spi_device *spi) if (ctlr->cs_gpiods) { u8 cs; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { cs = spi_get_chipselect(spi, idx); if (is_valid_cs(cs)) spi_set_csgpiod(spi, idx, ctlr->cs_gpiods[cs]); @@ -1024,7 +1031,7 @@ static void spi_res_release(struct spi_controller *ctlr, struct spi_message *mes /*-------------------------------------------------------------------------*/ #define spi_for_each_valid_cs(spi, idx) \ - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) \ + for (idx = 0; idx < spi->num_chipselect; idx++) \ if (!(spi->cs_index_mask & BIT(idx))) {} else static inline bool spi_is_last_cs(struct spi_device *spi) @@ -1080,8 +1087,12 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force) trace_spi_set_cs(spi, activate); spi->controller->last_cs_index_mask = spi->cs_index_mask; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) - spi->controller->last_cs[idx] = enable ? spi_get_chipselect(spi, 0) : SPI_INVALID_CS; + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (enable && idx < spi->num_chipselect) + spi->controller->last_cs[idx] = spi_get_chipselect(spi, 0); + else + spi->controller->last_cs[idx] = SPI_INVALID_CS; + } spi->controller->last_cs_mode_high = spi->mode & SPI_CS_HIGH; if (spi->controller->last_cs_mode_high) @@ -2452,6 +2463,8 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, dev_err(&ctlr->dev, "SPI controller doesn't support multi CS\n"); return -EINVAL; } + + spi->num_chipselect = rc; for (idx = 0; idx < rc; idx++) spi_set_chipselect(spi, idx, cs[idx]); diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e9ea43234d9a87..49c048277e9790 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -170,6 +170,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * two delays will be added up. * @chip_select: Array of physical chipselect, spi->chipselect[i] gives * the corresponding physical CS for logical CS i. + * @num_chipselect: Number of physical chipselects used. * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines * (optional, NULL when not using a GPIO line) @@ -229,6 +230,7 @@ struct spi_device { struct spi_delay cs_inactive; u8 chip_select[SPI_CS_CNT_MAX]; + u8 num_chipselect; /* * Bit mask of the chipselect(s) that the driver need to use from @@ -315,7 +317,7 @@ static inline bool spi_is_csgpiod(struct spi_device *spi) { u8 idx; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { if (spi_get_csgpiod(spi, idx)) return true; } From 1c923f624439b26b6740cdd2a9f7a12b1968f3f3 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:21 +0200 Subject: [PATCH 2425/3206] spi: move unused device CS initialization to __spi_add_device() Using spi_device::num_chipselect, initialize unused device CS as invalid at registration time in __spi_add_device(), and drop it from the different allocation paths. 
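A rough sketch of the resulting object lifecycle (illustrative, condensed from the surrounding code):

	struct spi_device *spi = spi_alloc_device(ctlr); /* num_chipselect = 1 */
	spi_set_chipselect(spi, 0, chip->chip_select);   /* callers set only the CS they use */
	spi_add_device(spi); /* __spi_add_device() now marks entries
			      * num_chipselect..SPI_CS_CNT_MAX-1 as SPI_INVALID_CS */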
Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-4-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 6598fb862d80a7..ad965eae9c9fbc 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -703,6 +703,10 @@ static int __spi_add_device(struct spi_device *spi) } } + /* Initialize unused logical CS as invalid */ + for (idx = spi->num_chipselect; idx < SPI_CS_CNT_MAX; idx++) + spi_set_chipselect(spi, idx, SPI_INVALID_CS); + /* Set the bus ID string */ spi_dev_set_name(spi); @@ -780,14 +784,6 @@ int spi_add_device(struct spi_device *spi) } EXPORT_SYMBOL_GPL(spi_add_device); -static void spi_set_all_cs_unused(struct spi_device *spi) -{ - u8 idx; - - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) - spi_set_chipselect(spi, idx, SPI_INVALID_CS); -} - /** * spi_new_device - instantiate one new SPI device * @ctlr: Controller to which device is connected @@ -823,7 +819,6 @@ struct spi_device *spi_new_device(struct spi_controller *ctlr, WARN_ON(strlen(chip->modalias) >= sizeof(proxy->modalias)); /* Use provided chip-select for proxy device */ - spi_set_all_cs_unused(proxy); spi_set_chipselect(proxy, 0, chip->chip_select); proxy->max_speed_hz = chip->max_speed_hz; @@ -2443,8 +2438,6 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, return -EINVAL; } - spi_set_all_cs_unused(spi); - /* Device address */ rc = of_property_read_variable_u32_array(nc, "reg", &cs[0], 1, SPI_CS_CNT_MAX); @@ -2589,7 +2582,6 @@ struct spi_device *spi_new_ancillary_device(struct spi_device *spi, strscpy(ancillary->modalias, "dummy", sizeof(ancillary->modalias)); /* Use provided chip-select for ancillary device */ - spi_set_all_cs_unused(ancillary); spi_set_chipselect(ancillary, 0, chip_select); /* Take over SPI mode/speed from SPI main device */ @@ -2837,7 +2829,6 @@ struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, return ERR_PTR(-ENOMEM); } - spi_set_all_cs_unused(spi); spi_set_chipselect(spi, 0, lookup.chip_select); ACPI_COMPANION_SET(&spi->dev, adev); From f3982daccf42cefcd80218c76a6b5dd134fe97e3 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:22 +0200 Subject: [PATCH 2426/3206] spi: drop check for validity of device chip selects Now that we know the number of chip selects of a device, we can assume these are valid, and do not need to check them first. 
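The invariant being relied on, written out as a hypothetical self-check (not part of the patch): every slot below spi->num_chipselect was filled by a registration path, so it can never contain SPI_INVALID_CS and the is_valid_cs() guards are dead code:

	for (idx = 0; idx < spi->num_chipselect; idx++)
		WARN_ON(spi_get_chipselect(spi, idx) == (u8)SPI_INVALID_CS);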
Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-5-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index ad965eae9c9fbc..91e2f4f504e88e 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -623,11 +623,6 @@ static void spi_dev_set_name(struct spi_device *spi) */ #define SPI_INVALID_CS ((s8)-1) -static inline bool is_valid_cs(s8 chip_select) -{ - return chip_select != SPI_INVALID_CS; -} - static inline int spi_dev_check_cs(struct device *dev, struct spi_device *spi, u8 idx, struct spi_device *new_spi, u8 new_idx) @@ -638,7 +633,7 @@ static inline int spi_dev_check_cs(struct device *dev, cs = spi_get_chipselect(spi, idx); for (idx_new = new_idx; idx_new < new_spi->num_chipselect; idx_new++) { cs_new = spi_get_chipselect(new_spi, idx_new); - if (is_valid_cs(cs) && is_valid_cs(cs_new) && cs == cs_new) { + if (cs == cs_new) { dev_err(dev, "chipselect %u already in use\n", cs_new); return -EBUSY; } @@ -684,7 +679,7 @@ static int __spi_add_device(struct spi_device *spi) for (idx = 0; idx < spi->num_chipselect; idx++) { /* Chipselects are numbered 0..max; validate. */ cs = spi_get_chipselect(spi, idx); - if (is_valid_cs(cs) && cs >= ctlr->num_chipselect) { + if (cs >= ctlr->num_chipselect) { dev_err(dev, "cs%d >= max %d\n", spi_get_chipselect(spi, idx), ctlr->num_chipselect); return -EINVAL; @@ -730,8 +725,7 @@ static int __spi_add_device(struct spi_device *spi) for (idx = 0; idx < spi->num_chipselect; idx++) { cs = spi_get_chipselect(spi, idx); - if (is_valid_cs(cs)) - spi_set_csgpiod(spi, idx, ctlr->cs_gpiods[cs]); + spi_set_csgpiod(spi, idx, ctlr->cs_gpiods[cs]); } } From 83c522fb642384aef43697aa5c7686363e9e92dd Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:23 +0200 Subject: [PATCH 2427/3206] spi: don't check spi_controller::num_chipselect when parsing a dt device Do not validate spi_controller::num_chipselect against SPI_CS_CNT_MAX when parsing an spi device firmware node. Firstly, this is the wrong place; it should be done while registering/validating the controller. Secondly, there is no reason for that check, as SPI_CS_CNT_MAX controls the number of chipselects a device may have, not how many a controller may have. So drop that check, as it needlessly limits controllers to SPI_CS_CNT_MAX chipselects. Likewise, drop the check for the number of device chipselects being larger than the controller's number of chipselects, as __spi_add_device() will already catch that: either one of the chip selects will be out of range, or there will be a duplicate one. Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-6-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 91e2f4f504e88e..2eb361e9e44d0f 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2427,11 +2427,6 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, return 0; } - if (ctlr->num_chipselect > SPI_CS_CNT_MAX) { - dev_err(&ctlr->dev, "No. of CS is more than max. no.
of supported CS\n"); - return -EINVAL; - } - /* Device address */ rc = of_property_read_variable_u32_array(nc, "reg", &cs[0], 1, SPI_CS_CNT_MAX); @@ -2440,11 +2435,7 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, nc, rc); return rc; } - if (rc > ctlr->num_chipselect) { - dev_err(&ctlr->dev, "%pOF has number of CS > ctlr->num_chipselect (%d)\n", - nc, rc); - return -EINVAL; - } + if ((of_property_present(nc, "parallel-memories")) && (!(ctlr->flags & SPI_CONTROLLER_MULTI_CS))) { dev_err(&ctlr->dev, "SPI controller doesn't support multi CS\n"); From 08fda410bae41cc8dde9697f9104da525be53153 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:24 +0200 Subject: [PATCH 2428/3206] spi: reduce device chip select limit again The spi chipselect limit SPI_CS_CNT_MAX was raised with commit 2f8c7c3715f2 ("spi: Raise limit on number of chip selects") from 4 to 16 to accommodate spi controllers with more than 4 chip selects, and then later to 24 with commit 96893cdd4760 ("spi: Raise limit on number of chip selects to 24"). Now that we removed SPI_CS_CNT_MAX limiting the chip selects of controllers, we can reduce the amount of chip selects per device again to 4, the original value. Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-7-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 49c048277e9790..df4842abbc6ff0 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -21,7 +21,7 @@ #include /* Max no. of CS supported per spi device */ -#define SPI_CS_CNT_MAX 24 +#define SPI_CS_CNT_MAX 4 struct dma_chan; struct software_node; From e336ab509b43ea601801dfa05b4270023c3ed007 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:25 +0200 Subject: [PATCH 2429/3206] spi: rename SPI_CS_CNT_MAX => SPI_DEVICE_CS_CNT_MAX Rename SPI_CS_CNT_MAX to SPI_DEVICE_CS_CNT_MAX to make it more obvious that this is the max number of CS per device supported, not per controller. 
Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-8-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 2 +- drivers/spi/spi.c | 14 +++++++------- include/linux/spi/spi.h | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index af253b86f1ab26..2ce4fc136a51ce 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -33,7 +33,7 @@ #define CQSPI_NAME "cadence-qspi" #define CQSPI_MAX_CHIPSELECT 4 -static_assert(CQSPI_MAX_CHIPSELECT <= SPI_CS_CNT_MAX); +static_assert(CQSPI_MAX_CHIPSELECT <= SPI_DEVICE_CS_CNT_MAX); /* Quirks */ #define CQSPI_NEEDS_WR_DELAY BIT(0) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 2eb361e9e44d0f..2e0647a0689029 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -670,9 +670,9 @@ static int __spi_add_device(struct spi_device *spi) int status, idx; u8 cs; - if (spi->num_chipselect > SPI_CS_CNT_MAX) { + if (spi->num_chipselect > SPI_DEVICE_CS_CNT_MAX) { dev_err(dev, "num_cs %d > max %d\n", spi->num_chipselect, - SPI_CS_CNT_MAX); + SPI_DEVICE_CS_CNT_MAX); return -EOVERFLOW; } @@ -699,7 +699,7 @@ static int __spi_add_device(struct spi_device *spi) } /* Initialize unused logical CS as invalid */ - for (idx = spi->num_chipselect; idx < SPI_CS_CNT_MAX; idx++) + for (idx = spi->num_chipselect; idx < SPI_DEVICE_CS_CNT_MAX; idx++) spi_set_chipselect(spi, idx, SPI_INVALID_CS); /* Set the bus ID string */ @@ -1076,7 +1076,7 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force) trace_spi_set_cs(spi, activate); spi->controller->last_cs_index_mask = spi->cs_index_mask; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < SPI_DEVICE_CS_CNT_MAX; idx++) { if (enable && idx < spi->num_chipselect) spi->controller->last_cs[idx] = spi_get_chipselect(spi, 0); else @@ -2354,7 +2354,7 @@ static void of_spi_parse_dt_cs_delay(struct device_node *nc, static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, struct device_node *nc) { - u32 value, cs[SPI_CS_CNT_MAX]; + u32 value, cs[SPI_DEVICE_CS_CNT_MAX]; int rc, idx; /* Mode (clock phase/polarity/etc.) */ @@ -2429,7 +2429,7 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, /* Device address */ rc = of_property_read_variable_u32_array(nc, "reg", &cs[0], 1, - SPI_CS_CNT_MAX); + SPI_DEVICE_CS_CNT_MAX); if (rc < 0) { dev_err(&ctlr->dev, "%pOF has no valid 'reg' property (%d)\n", nc, rc); @@ -3313,7 +3313,7 @@ int spi_register_controller(struct spi_controller *ctlr) } /* Setting last_cs to SPI_INVALID_CS means no chip selected */ - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) + for (idx = 0; idx < SPI_DEVICE_CS_CNT_MAX; idx++) ctlr->last_cs[idx] = SPI_INVALID_CS; status = device_add(&ctlr->dev); diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index df4842abbc6ff0..cb2c2df3108999 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -21,7 +21,7 @@ #include /* Max no. 
of CS supported per spi device */ -#define SPI_CS_CNT_MAX 4 +#define SPI_DEVICE_CS_CNT_MAX 4 struct dma_chan; struct software_node; @@ -229,7 +229,7 @@ struct spi_device { struct spi_delay cs_hold; struct spi_delay cs_inactive; - u8 chip_select[SPI_CS_CNT_MAX]; + u8 chip_select[SPI_DEVICE_CS_CNT_MAX]; u8 num_chipselect; /* @@ -238,9 +238,9 @@ struct spi_device { * multiple chip selects & memories are connected in parallel * then more than one bit need to be set in cs_index_mask. */ - u32 cs_index_mask : SPI_CS_CNT_MAX; + u32 cs_index_mask : SPI_DEVICE_CS_CNT_MAX; - struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ + struct gpio_desc *cs_gpiod[SPI_DEVICE_CS_CNT_MAX]; /* Chip select gpio desc */ /* * Likely need more hooks for more protocol options affecting how @@ -721,8 +721,8 @@ struct spi_controller { bool auto_runtime_pm; bool fallback; bool last_cs_mode_high; - s8 last_cs[SPI_CS_CNT_MAX]; - u32 last_cs_index_mask : SPI_CS_CNT_MAX; + s8 last_cs[SPI_DEVICE_CS_CNT_MAX]; + u32 last_cs_index_mask : SPI_DEVICE_CS_CNT_MAX; struct completion xfer_completion; size_t max_dma_len; From aa8fc9469d20d59dc36aae64226362303daeab4a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 19 Jul 2025 18:17:26 +0930 Subject: [PATCH 2430/3206] btrfs: replace double boolean parameters of cow_file_range() The function cow_file_range() has two boolean parameters. Replace them with a single @flags parameter, with two flags: - COW_FILE_RANGE_NO_INLINE - COW_FILE_RANGE_KEEP_LOCKED And since we're here, also update the comments of cow_file_range() to replace the old "page" usage with "folio". Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 18db1053cdf087..225e9221c4569d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -72,6 +72,9 @@ #include "raid-stripe-tree.h" #include "fiemap.h" +#define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) +#define COW_FILE_RANGE_NO_INLINE (1UL << 1) + struct btrfs_iget_args { u64 ino; struct btrfs_root *root; @@ -1245,18 +1248,18 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_folio. + * When this function fails, it unlocks all folios except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_folio and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_folio is - * the only page handled anyway). + * unlocks all folios including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single block, so locked_folio is + * the only folio handled anyway). * - * When this function succeed and creates a normal extent, the page locking + * When this function succeeds and creates a normal extent, the folio locking * status depends on the passed in flags: * - * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_folio are unlocked. + * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked. + * - Else all folios except for @locked_folio are unlocked.
* * When a failure happens in the second or later iteration of the while-loop, the ordered extents created in previous iterations are cleaned up. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, u64 *done_offset, - bool keep_locked, bool no_inline) + unsigned long flags) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1292,7 +1295,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - if (!no_inline) { + if (!(flags & COW_FILE_RANGE_NO_INLINE)) { /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); @@ -1320,7 +1323,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Do set the Ordered (Private2) bit so we know this page was properly * setup for writepage. */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; /* @@ -1687,7 +1690,7 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, while (start <= end) { ret = cow_file_range(inode, locked_folio, start, end, - &done_offset, true, false); + &done_offset, COW_FILE_RANGE_KEEP_LOCKED); if (ret) return ret; extent_write_locked_range(&inode->vfs_inode, locked_folio, @@ -1769,8 +1772,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ - ret = cow_file_range(inode, locked_folio, start, end, NULL, false, - true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, + COW_FILE_RANGE_NO_INLINE); ASSERT(ret != 1); return ret; } @@ -2349,8 +2352,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_folio, start, end, NULL, - false, false); + ret = cow_file_range(inode, locked_folio, start, end, NULL, 0); return ret; } From 6ebd726b104fa99d47c0d45979e6a6109844ac18 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 16 Jul 2025 14:56:11 +0100 Subject: [PATCH 2431/3206] btrfs: abort transaction on specific error places when walking log tree We do several things while walking a log tree (for replaying and for freeing a log tree) like reading extent buffers and cleaning them up, but we don't immediately abort the transaction, or turn the fs into an error state, when one of these things fails. Instead we do the transaction abort or turn the fs into error state in the caller of the entry point function that walks a log tree - walk_log_tree() - which means we don't get to know exactly where an error came from. Improve on this by doing a transaction abort / turn fs into error state after each such failure so that when it happens we have a better understanding of where the failure comes from. This deliberately leaves the transaction abort / turn fs into error state in the callers of walk_log_tree() to ensure we don't get into an inconsistent state in case we forget to do it deeper in the call chain. It also deliberately does not do it after errors from the calls to the callback defined in struct walk_control::process_func(), as we will do it later in another patch.
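The pattern applied at each failure site looks roughly like this (a sketch; the same helper runs both with a transaction during replay and without one when a log tree is freed):

	if (trans)
		btrfs_abort_transaction(trans, ret);       /* record error and backtrace here */
	else
		btrfs_handle_fs_error(fs_info, ret, NULL); /* no transaction: put the fs in error state */
	return ret;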
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7a63afedd01e6e..6d92326a1a0c74 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2630,15 +2630,24 @@ static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) static int clean_log_buffer(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { + int ret; + btrfs_tree_lock(eb); btrfs_clear_buffer_dirty(trans, eb); wait_on_extent_buffer_writeback(eb); btrfs_tree_unlock(eb); - if (trans) - return btrfs_pin_reserved_extent(trans, eb); + if (trans) { + ret = btrfs_pin_reserved_extent(trans, eb); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; + } - return unaccount_log_buffer(eb->fs_info, eb->start); + ret = unaccount_log_buffer(eb->fs_info, eb->start); + if (ret) + btrfs_handle_fs_error(eb->fs_info, ret, NULL); + return ret; } static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, @@ -2674,8 +2683,14 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_header_owner(cur), *level - 1); - if (IS_ERR(next)) - return PTR_ERR(next); + if (IS_ERR(next)) { + ret = PTR_ERR(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; + } if (*level == 1) { ret = wc->process_func(root, next, wc, ptr_gen, @@ -2690,6 +2705,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; } @@ -2705,6 +2724,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; } From e6dd405b6671b9753b98d8bdf76f8f0ed36c11cd Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 16 Jul 2025 15:49:31 +0100 Subject: [PATCH 2432/3206] btrfs: abort transaction in the process_one_buffer() log tree walk callback In the process_one_buffer() log tree walk callback we return errors to the log tree walk caller and then the caller aborts the transaction, if we have one, or turns the fs into error state if we don't have one. While this reduces code, it makes it harder to figure out where exactly an error came from. So add the transaction aborts after every failure inside the process_one_buffer() callback, so that it helps figure out why failures happen.
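Schematically, the diagnostic difference (a sketch, not code from the patch):

	/* Before: only the outermost caller aborted, so the "transaction
	 * aborted" backtrace pointed at walk_log_tree()'s caller:
	 *
	 *	ret = walk_log_tree(trans, log, &wc);
	 *	if (ret)
	 *		btrfs_abort_transaction(trans, ret);
	 *
	 * After: process_one_buffer() aborts at the failing call itself,
	 * so the backtrace identifies the real origin of the error.
	 */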
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6d92326a1a0c74..50ed84cb68a696 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -347,6 +347,7 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; @@ -361,18 +362,29 @@ static int process_one_buffer(struct btrfs_root *log, }; ret = btrfs_read_extent_buffer(eb, &check); - if (ret) + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; + } } if (wc->pin) { - ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); - if (ret) + ASSERT(trans != NULL); + ret = btrfs_pin_extent_for_log_replay(trans, eb); + if (ret) { + btrfs_abort_transaction(trans, ret); return ret; + } if (btrfs_buffer_uptodate(eb, gen, 0) && - btrfs_header_level(eb) == 0) + btrfs_header_level(eb) == 0) { ret = btrfs_exclude_logged_extents(eb); + if (ret) + btrfs_abort_transaction(trans, ret); + } } return ret; } From 425652cf102838676d128e639ec5c2e416264007 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 16 Jul 2025 16:22:32 +0100 Subject: [PATCH 2433/3206] btrfs: use local variable for the transaction handle in replay_one_buffer() Instead of keep dereferencing the walk_control structure to extract the transaction handle whenever is needed, do it once by storing it in a local variable and then use the variable everywhere. This reduces code verbosity and eliminates the need for some split lines. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 50ed84cb68a696..bb8d4d1a30ef09 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2457,6 +2457,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, }; struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_key key; int i; int ret; @@ -2514,18 +2515,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, wc->stage == LOG_WALK_REPLAY_INODES) { u32 mode; - ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); + ret = replay_xattr_deletes(trans, root, log, path, key.objectid); if (ret) break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { - ret = replay_dir_deletes(wc->trans, root, log, path, + ret = replay_dir_deletes(trans, root, log, path, key.objectid, false); if (ret) break; } - ret = overwrite_item(wc->trans, root, path, - eb, i, &key); + ret = overwrite_item(trans, root, path, eb, i, &key); if (ret) break; @@ -2552,21 +2552,19 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, inode, - &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (!ret) { inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. 
*/ - ret = btrfs_update_inode(wc->trans, inode); + ret = btrfs_update_inode(trans, inode); } iput(&inode->vfs_inode); if (ret) break; } - ret = link_to_fixup_dir(wc->trans, root, - path, key.objectid); + ret = link_to_fixup_dir(trans, root, path, key.objectid); if (ret) break; } @@ -2576,8 +2574,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { - ret = replay_one_dir_item(wc->trans, root, path, - eb, i, &key); + ret = replay_one_dir_item(trans, root, path, eb, i, &key); if (ret) break; } @@ -2587,19 +2584,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, /* these keys are simply copied */ if (key.type == BTRFS_XATTR_ITEM_KEY) { - ret = overwrite_item(wc->trans, root, path, - eb, i, &key); + ret = overwrite_item(trans, root, path, eb, i, &key); if (ret) break; } else if (key.type == BTRFS_INODE_REF_KEY || key.type == BTRFS_INODE_EXTREF_KEY) { - ret = add_inode_ref(wc->trans, root, log, path, - eb, i, &key); + ret = add_inode_ref(trans, root, log, path, eb, i, &key); if (ret) break; } else if (key.type == BTRFS_EXTENT_DATA_KEY) { - ret = replay_one_extent(wc->trans, root, path, - eb, i, &key); + ret = replay_one_extent(trans, root, path, eb, i, &key); if (ret) break; } From 874576d2a79a2e94cffc89b3bd0c285141b99208 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 16 Jul 2025 17:36:13 +0100 Subject: [PATCH 2434/3206] btrfs: return real error from read_alloc_one_name() in drop_one_dir_item() If read_alloc_one_name() fails we explicitly return -ENOMEM and currently that is fine since it's the only error read_alloc_one_name() can return for now. However, this is fragile and not future-proof, so return instead what read_alloc_one_name() returned. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bb8d4d1a30ef09..a4068be25996bb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -943,7 +943,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &location); ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); if (ret) - return -ENOMEM; + return ret; btrfs_release_path(path); From 912c257c88cd8c9a466a3cf702912a03a979fd06 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 18 Jul 2025 08:30:21 +0100 Subject: [PATCH 2435/3206] btrfs: abort transaction where errors happen during log tree replay In the replay_one_buffer() log tree walk callback we return errors to the log tree walk caller and then the caller aborts the transaction, if we have one, or turns the fs into error state if we don't have one. While this reduces code, it makes it harder to figure out where exactly an error came from. So add the transaction aborts after every failure inside the replay_one_buffer() callback and the functions it calls, making it as fine grained as possible, so that it helps figure out why failures happen.
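Keeping the aborts in the callers as well is harmless, since only the first abort per filesystem reports; roughly (paraphrasing the guard in the btrfs_abort_transaction() macro, details may differ between versions):

	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
		WARN(1, "BTRFS: Transaction aborted (error %d)\n", error);
	/* subsequent aborts see the bit already set and stay quiet */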
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 237 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 186 insertions(+), 51 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a4068be25996bb..776f3dfdfa860a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -433,8 +433,10 @@ static int overwrite_item(struct btrfs_trans_handle *trans, /* Look for the key in the destination tree. */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); return ret; + } dst_eb = path->nodes[0]; dst_slot = path->slots[0]; @@ -453,6 +455,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, src_copy = kmalloc(item_size, GFP_NOFS); if (!src_copy) { btrfs_release_path(path); + btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; } @@ -537,6 +540,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, else if (found_size < item_size) btrfs_extend_item(trans, path, item_size - found_size); } else if (ret) { + btrfs_abort_transaction(trans, ret); return ret; } dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); @@ -667,6 +671,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { + btrfs_abort_transaction(trans, -EUCLEAN); btrfs_err(fs_info, "unexpected extent type=%d root=%llu inode=%llu offset=%llu", found_type, btrfs_root_id(root), key->objectid, key->offset); @@ -674,8 +679,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } inode = btrfs_iget_logging(key->objectid, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + btrfs_abort_transaction(trans, ret); + return ret; + } /* * first check to see if we already have this extent in the @@ -710,8 +718,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, drop_args.end = extent_end; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -725,8 +735,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } dest_offset = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); copy_extent_buffer(path->nodes[0], eb, dest_offset, @@ -748,8 +760,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_qgroup_trace_extent(trans, btrfs_file_extent_disk_bytenr(eb, item), btrfs_file_extent_disk_num_bytes(eb, item)); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; + } if (ins.objectid > 0) { u64 csum_start; @@ -763,6 +777,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; } else if (ret == 0) { struct btrfs_ref ref = { @@ -775,8 +790,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, key->objectid, offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } } else { /* * 
insert the extent pointer in the extent @@ -785,8 +802,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root), key->objectid, offset, &ins); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } } btrfs_release_path(path); @@ -803,8 +822,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, &ordered_sums, false); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; + } ret = 0; /* * Now delete all existing cums in the csum root that @@ -864,14 +885,20 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, list); csum_root = btrfs_csum_root(fs_info, sums->logical); - if (!ret) + if (!ret) { ret = btrfs_del_csums(trans, csum_root, sums->logical, sums->len); - if (!ret) + if (ret) + btrfs_abort_transaction(trans, ret); + } + if (!ret) { ret = btrfs_csum_file_blocks(trans, csum_root, sums); + if (ret) + btrfs_abort_transaction(trans, ret); + } list_del(&sums->list); kfree(sums); } @@ -888,12 +915,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } update_inode: btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_transaction(trans, ret); out: iput(&inode->vfs_inode); return ret; @@ -907,15 +938,20 @@ static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, int ret; ret = btrfs_unlink_inode(trans, dir, inode, name); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); return ret; + } /* * Whenever we need to check if a name exists or not, we check the * fs/subvolume tree. So after an unlink we must run delayed items, so * that future checks for a name during log replay see that the name * does not exists anymore. 
*/ - return btrfs_run_delayed_items(trans); + ret = btrfs_run_delayed_items(trans); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; } /* @@ -942,14 +978,17 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &location); ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); return ret; + } btrfs_release_path(path); inode = btrfs_iget_logging(location.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_transaction(trans, ret); inode = NULL; goto out; } @@ -1080,14 +1119,18 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, ret = read_alloc_one_name(leaf, (victim_ref + 1), btrfs_inode_ref_name_len(leaf, victim_ref), &victim_name); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); return ret; + } ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); if (ret) { kfree(victim_name.name); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); return ret; + } ptr = (unsigned long)(victim_ref + 1) + victim_name.len; continue; } @@ -1133,8 +1176,10 @@ static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, ret = read_alloc_one_name(leaf, &extref->name, victim_name.len, &victim_name); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); return ret; + } search_key->objectid = inode_objectid; search_key->type = BTRFS_INODE_EXTREF_KEY; @@ -1144,8 +1189,10 @@ static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); if (ret) { kfree(victim_name.name); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); return ret; + } next: cur_offset += victim_name.len + sizeof(*extref); continue; @@ -1154,7 +1201,9 @@ static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, victim_parent = btrfs_iget_logging(parent_objectid, root); if (IS_ERR(victim_parent)) { kfree(victim_name.name); - return PTR_ERR(victim_parent); + ret = PTR_ERR(victim_parent); + btrfs_abort_transaction(trans, ret); + return ret; } inc_nlink(&inode->vfs_inode); @@ -1193,6 +1242,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, search_key.offset = parent_objectid; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) { + btrfs_abort_transaction(trans, ret); return ret; } else if (ret == 0) { /* @@ -1230,7 +1280,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), ref_index, name, 0); if (IS_ERR(di)) { - return PTR_ERR(di); + ret = PTR_ERR(di); + btrfs_abort_transaction(trans, ret); + return ret; } else if (di) { ret = drop_one_dir_item(trans, path, dir, di); if (ret) @@ -1320,8 +1372,10 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, ret = 0; goto out; } - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; + } eb = path->nodes[0]; ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); @@ -1333,12 +1387,18 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, if (key->type == BTRFS_INODE_EXTREF_KEY) { ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } } else { parent_id = key->offset; ret = ref_get_fields(eb, ref_ptr, &name, NULL); + if (ret) { + btrfs_abort_transaction(trans, ret); + 
goto out; + } } - if (ret) - goto out; if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, @@ -1354,6 +1414,7 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, if (IS_ERR(dir)) { ret = PTR_ERR(dir); kfree(name.name); + btrfs_abort_transaction(trans, ret); goto out; } ret = unlink_inode_for_log_replay(trans, dir, inode, &name); @@ -1428,6 +1489,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; + else + btrfs_abort_transaction(trans, ret); dir = NULL; goto out; } @@ -1435,6 +1498,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, inode = btrfs_iget_logging(inode_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_transaction(trans, ret); inode = NULL; goto out; } @@ -1443,8 +1507,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (is_extref_item) { ret = extref_get_fields(eb, ref_ptr, &name, &ref_index, &parent_objectid); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } /* * parent object can change from one array * item to another. @@ -1469,19 +1535,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ ret = 0; goto next; + } else { + btrfs_abort_transaction(trans, ret); } goto out; } } } else { ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } } ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), ref_index, &name); if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; } else if (ret == 0) { /* @@ -1502,12 +1573,16 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* insert our name */ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } ret = btrfs_update_inode(trans, inode); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } } /* Else, ret == 1, we already have a perfect match, we're done. 
*/ @@ -1779,8 +1854,11 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct inode *vfs_inode; inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + btrfs_abort_transaction(trans, ret); + return ret; + } vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; @@ -1798,6 +1876,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, inode); } else if (ret == -EEXIST) { ret = 0; + } else { + btrfs_abort_transaction(trans, ret); } iput(vfs_inode); @@ -1904,19 +1984,26 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool name_added = false; dir = btrfs_iget_logging(key->objectid, root); - if (IS_ERR(dir)) - return PTR_ERR(dir); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + btrfs_abort_transaction(trans, ret); + return ret; + } ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } log_flags = btrfs_dir_flags(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &log_key); ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; + } exists = (ret == 0); ret = 0; @@ -1924,12 +2011,15 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); + btrfs_abort_transaction(trans, ret); goto out; } else if (dir_dst_di) { ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, &log_key, log_flags, exists); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; + } dir_dst_matches = (ret == 1); } @@ -1940,12 +2030,15 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, &name, 1); if (IS_ERR(index_dst_di)) { ret = PTR_ERR(index_dst_di); + btrfs_abort_transaction(trans, ret); goto out; } else if (index_dst_di) { ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, &log_key, log_flags, exists); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; + } index_dst_matches = (ret == 1); } @@ -1966,6 +2059,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.offset = key->objectid; ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; } else if (ret) { /* The dentry will be added later. */ @@ -1979,6 +2073,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len); ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; } else if (ret) { /* The dentry will be added later. 
*/ @@ -1989,8 +2084,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = insert_one_name(trans, root, key->objectid, key->offset, &name, &log_key); - if (ret && ret != -ENOENT && ret != -EEXIST) + if (ret && ret != -ENOENT && ret != -EEXIST) { + btrfs_abort_transaction(trans, ret); goto out; + } if (!ret) name_added = true; update_size = false; @@ -2000,6 +2097,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!ret && update_size) { btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); ret = btrfs_update_inode(trans, dir); + if (ret) + btrfs_abort_transaction(trans, ret); } kfree(name.name); iput(&dir->vfs_inode); @@ -2057,8 +2156,10 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_key di_key; fixup_path = btrfs_alloc_path(); - if (!fixup_path) + if (!fixup_path) { + btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; + } btrfs_dir_item_key_to_cpu(eb, di, &di_key); ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); @@ -2183,8 +2284,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, slot = path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } if (log) { struct btrfs_dir_item *log_di; @@ -2194,6 +2297,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, dir_key->offset, &name, 0); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); + btrfs_abort_transaction(trans, ret); goto out; } else if (log_di) { /* The dentry exists in the log, we have nothing to do. */ @@ -2209,6 +2313,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; + btrfs_abort_transaction(trans, ret); goto out; } @@ -2245,16 +2350,20 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, int ret; log_path = btrfs_alloc_path(); - if (!log_path) + if (!log_path) { + btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; + } search_key.objectid = ino; search_key.type = BTRFS_XATTR_ITEM_KEY; search_key.offset = 0; again: ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; + } process_leaf: nritems = btrfs_header_nritems(path->nodes[0]); for (i = path->slots[0]; i < nritems; i++) { @@ -2282,6 +2391,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } read_extent_buffer(path->nodes[0], name, @@ -2298,13 +2408,16 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, kfree(name); if (IS_ERR(di)) { ret = PTR_ERR(di); + btrfs_abort_transaction(trans, ret); goto out; } ASSERT(di); ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); search_key = key; goto again; @@ -2312,6 +2425,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, kfree(name); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); + btrfs_abort_transaction(trans, ret); goto out; } cur += this_len; @@ -2323,6 +2437,8 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, ret = 0; else if (ret == 0) goto process_leaf; + else + btrfs_abort_transaction(trans, 
ret); out: btrfs_free_path(log_path); btrfs_release_path(path); @@ -2357,8 +2473,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; log_path = btrfs_alloc_path(); - if (!log_path) + if (!log_path) { + btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; + } dir = btrfs_iget_logging(dirid, root); /* @@ -2370,6 +2488,8 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; + else + btrfs_abort_transaction(trans, ret); return ret; } @@ -2381,10 +2501,12 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, else { ret = find_dir_range(log, path, dirid, &range_start, &range_end); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; - else if (ret > 0) + } else if (ret > 0) { break; + } } dir_key.offset = range_start; @@ -2392,16 +2514,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, int nritems; ret = btrfs_search_slot(NULL, root, &dir_key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; + } nritems = btrfs_header_nritems(path->nodes[0]); if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); - if (ret == 1) + if (ret == 1) { break; - else if (ret < 0) + } else if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out; + } } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -2463,8 +2589,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, int ret; ret = btrfs_read_extent_buffer(eb, &check); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); return ret; + } level = btrfs_header_level(eb); @@ -2472,8 +2600,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, return 0; path = btrfs_alloc_path(); - if (!path) + if (!path) { + btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; + } nritems = btrfs_header_nritems(eb); for (i = 0; i < nritems; i++) { @@ -2545,6 +2675,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, inode = btrfs_iget_logging(key.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_transaction(trans, ret); break; } from = ALIGN(i_size_read(&inode->vfs_inode), @@ -2553,11 +2684,15 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, drop_args.end = (u64)-1; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (!ret) { + if (ret) { + btrfs_abort_transaction(trans, ret); + } else { inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_transaction(trans, ret); } iput(&inode->vfs_inode); if (ret) From 88666b6df97ec3ac8e6522584bec4963cf7ce6f8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Jul 2025 12:54:48 +0100 Subject: [PATCH 2436/3206] btrfs: exit early when replaying hole file extent item from a log tree At replay_one_extent(), we can jump to the code that updates the file extent range and updates the inode when processing a file extent item that represents a hole and we don't have the NO_HOLES feature enabled. This helps reduce the high indentation level by one in replay_one_extent() and avoid splitting some lines to make the code easier to read. 
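For illustration only, and not part of the change itself, the hole handling added here can be condensed as follows; the names are the ones from the diff below, with the insertion of the copied file extent item and all error handling elided:

	/* Explicit hole (disk_bytenr == 0) and NO_HOLES not enabled. */
	if (btrfs_file_extent_disk_bytenr(eb, item) == 0) {
		/*
		 * The hole file extent item was already copied into the
		 * subvolume tree above, so there are no extent references
		 * or checksums to replay for it.
		 */
		btrfs_release_path(path);
		goto update_inode;
	}
	/* Only real (regular or prealloc) extents reach the code below. */
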
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 261 +++++++++++++++++++++----------------------- 1 file changed, 126 insertions(+), 135 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 776f3dfdfa860a..1e01ba43becce5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -725,6 +725,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); u64 offset; unsigned long dest_offset; struct btrfs_key ins; @@ -744,6 +747,17 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, copy_extent_buffer(path->nodes[0], eb, dest_offset, (unsigned long)item, sizeof(*item)); + /* + * We have an explicit hole and NO_HOLES is not enabled. We have + * added the hole file extent item to the subvolume tree, so we + * don't have anything else to do other than update the file + * extent item range and update the inode item. + */ + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) { + btrfs_release_path(path); + goto update_inode; + } + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); @@ -765,148 +779,125 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } - if (ins.objectid > 0) { - u64 csum_start; - u64 csum_end; - LIST_HEAD(ordered_sums); - - /* - * is this extent already allocated in the extent - * allocation tree? If so, just add a reference - */ - ret = btrfs_lookup_data_extent(fs_info, ins.objectid, - ins.offset); - if (ret < 0) { + /* + * Is this extent already allocated in the extent tree? + * If so, just add a reference. + */ + ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } else if (ret == 0) { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + + btrfs_init_data_ref(&ref, key->objectid, offset, 0, false); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) { btrfs_abort_transaction(trans, ret); goto out; - } else if (ret == 0) { - struct btrfs_ref ref = { - .action = BTRFS_ADD_DELAYED_REF, - .bytenr = ins.objectid, - .num_bytes = ins.offset, - .owning_root = btrfs_root_id(root), - .ref_root = btrfs_root_id(root), - }; - btrfs_init_data_ref(&ref, key->objectid, offset, - 0, false); - ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } else { - /* - * insert the extent pointer in the extent - * allocation tree - */ - ret = btrfs_alloc_logged_file_extent(trans, - btrfs_root_id(root), - key->objectid, offset, &ins); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } - btrfs_release_path(path); - - if (btrfs_file_extent_compression(eb, item)) { - csum_start = ins.objectid; - csum_end = csum_start + ins.offset; - } else { - csum_start = ins.objectid + - btrfs_file_extent_offset(eb, item); - csum_end = csum_start + - btrfs_file_extent_num_bytes(eb, item); } - - ret = btrfs_lookup_csums_list(root->log_root, - csum_start, csum_end - 1, - &ordered_sums, false); - if (ret < 0) { + } else { + /* Insert the extent pointer in the extent tree. 
*/ + ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root), + key->objectid, offset, &ins); + if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - ret = 0; - /* - * Now delete all existing cums in the csum root that - * cover our range. We do this because we can have an - * extent that is completely referenced by one file - * extent item and partially referenced by another - * file extent item (like after using the clone or - * extent_same ioctls). In this case if we end up doing - * the replay of the one that partially references the - * extent first, and we do not do the csum deletion - * below, we can get 2 csum items in the csum tree that - * overlap each other. For example, imagine our log has - * the two following file extent items: - * - * key (257 EXTENT_DATA 409600) - * extent data disk byte 12845056 nr 102400 - * extent data offset 20480 nr 20480 ram 102400 - * - * key (257 EXTENT_DATA 819200) - * extent data disk byte 12845056 nr 102400 - * extent data offset 0 nr 102400 ram 102400 - * - * Where the second one fully references the 100K extent - * that starts at disk byte 12845056, and the log tree - * has a single csum item that covers the entire range - * of the extent: - * - * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 - * - * After the first file extent item is replayed, the - * csum tree gets the following csum item: - * - * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 - * - * Which covers the 20K sub-range starting at offset 20K - * of our extent. Now when we replay the second file - * extent item, if we do not delete existing csum items - * that cover any of its blocks, we end up getting two - * csum items in our csum tree that overlap each other: - * - * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 - * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 - * - * Which is a problem, because after this anyone trying - * to lookup up for the checksum of any block of our - * extent starting at an offset of 40K or higher, will - * end up looking at the second csum item only, which - * does not contain the checksum for any block starting - * at offset 40K or higher of our extent. - */ - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums; - struct btrfs_root *csum_root; - - sums = list_first_entry(&ordered_sums, - struct btrfs_ordered_sum, - list); - csum_root = btrfs_csum_root(fs_info, - sums->logical); - if (!ret) { - ret = btrfs_del_csums(trans, csum_root, - sums->logical, - sums->len); - if (ret) - btrfs_abort_transaction(trans, ret); - } - if (!ret) { - ret = btrfs_csum_file_blocks(trans, - csum_root, - sums); - if (ret) - btrfs_abort_transaction(trans, ret); - } - list_del(&sums->list); - kfree(sums); - } - if (ret) - goto out; + } + + btrfs_release_path(path); + + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; } else { - btrfs_release_path(path); + csum_start = ins.objectid + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + btrfs_file_extent_num_bytes(eb, item); + } + + ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, + &ordered_sums, false); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; } + ret = 0; + /* + * Now delete all existing cums in the csum root that cover our + * range. We do this because we can have an extent that is + * completely referenced by one file extent item and partially + * referenced by another file extent item (like after using the + * clone or extent_same ioctls). 
In this case if we end up doing + * the replay of the one that partially references the extent + * first, and we do not do the csum deletion below, we can get 2 + * csum items in the csum tree that overlap each other. For + * example, imagine our log has the two following file extent items: + * + * key (257 EXTENT_DATA 409600) + * extent data disk byte 12845056 nr 102400 + * extent data offset 20480 nr 20480 ram 102400 + * + * key (257 EXTENT_DATA 819200) + * extent data disk byte 12845056 nr 102400 + * extent data offset 0 nr 102400 ram 102400 + * + * Where the second one fully references the 100K extent that + * starts at disk byte 12845056, and the log tree has a single + * csum item that covers the entire range of the extent: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * + * After the first file extent item is replayed, the csum tree + * gets the following csum item: + * + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which covers the 20K sub-range starting at offset 20K of our + * extent. Now when we replay the second file extent item, if we + * do not delete existing csum items that cover any of its + * blocks, we end up getting two csum items in our csum tree + * that overlap each other: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which is a problem, because after this anyone trying to + * lookup up for the checksum of any block of our extent + * starting at an offset of 40K or higher, will end up looking + * at the second csum item only, which does not contain the + * checksum for any block starting at offset 40K or higher of + * our extent. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + struct btrfs_root *csum_root; + + sums = list_first_entry(&ordered_sums, + struct btrfs_ordered_sum, list); + csum_root = btrfs_csum_root(fs_info, sums->logical); + if (!ret) { + ret = btrfs_del_csums(trans, csum_root, sums->logical, + sums->len); + if (ret) + btrfs_abort_transaction(trans, ret); + } + if (!ret) { + ret = btrfs_csum_file_blocks(trans, csum_root, sums); + if (ret) + btrfs_abort_transaction(trans, ret); + } + list_del(&sums->list); + kfree(sums); + } + if (ret) + goto out; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { /* inline extents are easy, we just overwrite them */ ret = overwrite_item(trans, root, path, eb, slot, key); @@ -914,13 +905,13 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } +update_inode: ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } -update_inode: btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); if (ret) From 575f52a77a0aa045943cca67fdf9f0388e86a5f8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Jul 2025 13:41:15 +0100 Subject: [PATCH 2437/3206] btrfs: process inline extent earlier in replay_one_extent() Instead of having an if statement to check for regular and prealloc extents first, process them in a block, and then following with an else statement to check for an inline extent, check for an inline extent first, process it and jump to the 'update_inode' label, allowing us to avoid having the code for processing regular and prealloc extents inside a block, reducing the high indentation level by one and making the code easier to read and avoid line splittings too. 
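For illustration only, the reordered flow at the top of replay_one_extent() after this patch can be sketched like this, with error handling shortened (the full version is in the diff below):

	if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* Inline extents are easy, we just overwrite them. */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
		goto update_inode;
	}

	/*
	 * Only regular and prealloc extents get here, so their processing
	 * no longer needs to be nested inside an if block.
	 */
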
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 327 ++++++++++++++++++++++---------------------- 1 file changed, 163 insertions(+), 164 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1e01ba43becce5..4fad7cbfcb2633 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -646,6 +646,12 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, u64 extent_end; u64 start = key->offset; u64 nbytes = 0; + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + u64 offset; + unsigned long dest_offset; + struct btrfs_key ins; struct btrfs_file_extent_item *item; struct btrfs_inode *inode = NULL; unsigned long size; @@ -723,187 +729,180 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 csum_start; - u64 csum_end; - LIST_HEAD(ordered_sums); - u64 offset; - unsigned long dest_offset; - struct btrfs_key ins; + if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + if (ret) + goto out; + goto update_inode; + } - if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && - btrfs_fs_incompat(fs_info, NO_HOLES)) - goto update_inode; + /* + * If not an inline extent, it can only be a regular or prealloc one. + * We have checked that above and returned -EUCLEAN if not. + */ - ret = btrfs_insert_empty_item(trans, root, path, key, - sizeof(*item)); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - dest_offset = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); - copy_extent_buffer(path->nodes[0], eb, dest_offset, - (unsigned long)item, sizeof(*item)); + /* A hole and NO_HOLES feature enabled, nothing else to do. */ + if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) + goto update_inode; - /* - * We have an explicit hole and NO_HOLES is not enabled. We have - * added the hole file extent item to the subvolume tree, so we - * don't have anything else to do other than update the file - * extent item range and update the inode item. - */ - if (btrfs_file_extent_disk_bytenr(eb, item) == 0) { - btrfs_release_path(path); - goto update_inode; - } + ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + dest_offset = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + copy_extent_buffer(path->nodes[0], eb, dest_offset, (unsigned long)item, + sizeof(*item)); - ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - ins.type = BTRFS_EXTENT_ITEM_KEY; - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); - offset = key->offset - btrfs_file_extent_offset(eb, item); + /* + * We have an explicit hole and NO_HOLES is not enabled. We have added + * the hole file extent item to the subvolume tree, so we don't have + * anything else to do other than update the file extent item range and + * update the inode item. + */ + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) { + btrfs_release_path(path); + goto update_inode; + } - /* - * Manually record dirty extent, as here we did a shallow - * file extent item copy and skip normal backref update, - * but modifying extent tree all by ourselves. 
- * So need to manually record dirty extent for qgroup, - * as the owner of the file extent changed from log tree - * (doesn't affect qgroup) to fs/file tree(affects qgroup) - */ - ret = btrfs_qgroup_trace_extent(trans, - btrfs_file_extent_disk_bytenr(eb, item), - btrfs_file_extent_disk_num_bytes(eb, item)); - if (ret < 0) { + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + offset = key->offset - btrfs_file_extent_offset(eb, item); + + /* + * Manually record dirty extent, as here we did a shallow file extent + * item copy and skip normal backref update, but modifying extent tree + * all by ourselves. So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree (doesn't affect + * qgroup) to fs/file tree (affects qgroup). + */ + ret = btrfs_qgroup_trace_extent(trans, + btrfs_file_extent_disk_bytenr(eb, item), + btrfs_file_extent_disk_num_bytes(eb, item)); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + /* + * Is this extent already allocated in the extent tree? + * If so, just add a reference. + */ + ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } else if (ret == 0) { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + + btrfs_init_data_ref(&ref, key->objectid, offset, 0, false); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - - /* - * Is this extent already allocated in the extent tree? - * If so, just add a reference. - */ - ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); - if (ret < 0) { + } else { + /* Insert the extent pointer in the extent tree. */ + ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root), + key->objectid, offset, &ins); + if (ret) { btrfs_abort_transaction(trans, ret); goto out; - } else if (ret == 0) { - struct btrfs_ref ref = { - .action = BTRFS_ADD_DELAYED_REF, - .bytenr = ins.objectid, - .num_bytes = ins.offset, - .owning_root = btrfs_root_id(root), - .ref_root = btrfs_root_id(root), - }; - - btrfs_init_data_ref(&ref, key->objectid, offset, 0, false); - ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } else { - /* Insert the extent pointer in the extent tree. 
*/ - ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root), - key->objectid, offset, &ins); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } } + } - btrfs_release_path(path); + btrfs_release_path(path); - if (btrfs_file_extent_compression(eb, item)) { - csum_start = ins.objectid; - csum_end = csum_start + ins.offset; - } else { - csum_start = ins.objectid + btrfs_file_extent_offset(eb, item); - csum_end = csum_start + btrfs_file_extent_num_bytes(eb, item); - } + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + btrfs_file_extent_num_bytes(eb, item); + } - ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, - &ordered_sums, false); - if (ret < 0) { - btrfs_abort_transaction(trans, ret); - goto out; - } - ret = 0; - /* - * Now delete all existing cums in the csum root that cover our - * range. We do this because we can have an extent that is - * completely referenced by one file extent item and partially - * referenced by another file extent item (like after using the - * clone or extent_same ioctls). In this case if we end up doing - * the replay of the one that partially references the extent - * first, and we do not do the csum deletion below, we can get 2 - * csum items in the csum tree that overlap each other. For - * example, imagine our log has the two following file extent items: - * - * key (257 EXTENT_DATA 409600) - * extent data disk byte 12845056 nr 102400 - * extent data offset 20480 nr 20480 ram 102400 - * - * key (257 EXTENT_DATA 819200) - * extent data disk byte 12845056 nr 102400 - * extent data offset 0 nr 102400 ram 102400 - * - * Where the second one fully references the 100K extent that - * starts at disk byte 12845056, and the log tree has a single - * csum item that covers the entire range of the extent: - * - * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 - * - * After the first file extent item is replayed, the csum tree - * gets the following csum item: - * - * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 - * - * Which covers the 20K sub-range starting at offset 20K of our - * extent. Now when we replay the second file extent item, if we - * do not delete existing csum items that cover any of its - * blocks, we end up getting two csum items in our csum tree - * that overlap each other: - * - * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 - * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 - * - * Which is a problem, because after this anyone trying to - * lookup up for the checksum of any block of our extent - * starting at an offset of 40K or higher, will end up looking - * at the second csum item only, which does not contain the - * checksum for any block starting at offset 40K or higher of - * our extent. - */ - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums; - struct btrfs_root *csum_root; + ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, + &ordered_sums, false); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + ret = 0; + /* + * Now delete all existing cums in the csum root that cover our range. + * We do this because we can have an extent that is completely + * referenced by one file extent item and partially referenced by + * another file extent item (like after using the clone or extent_same + * ioctls). 
In this case if we end up doing the replay of the one that + * partially references the extent first, and we do not do the csum + * deletion below, we can get 2 csum items in the csum tree that overlap + * each other. For example, imagine our log has the two following file + * extent items: + * + * key (257 EXTENT_DATA 409600) + * extent data disk byte 12845056 nr 102400 + * extent data offset 20480 nr 20480 ram 102400 + * + * key (257 EXTENT_DATA 819200) + * extent data disk byte 12845056 nr 102400 + * extent data offset 0 nr 102400 ram 102400 + * + * Where the second one fully references the 100K extent that starts at + * disk byte 12845056, and the log tree has a single csum item that + * covers the entire range of the extent: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * + * After the first file extent item is replayed, the csum tree gets the + * following csum item: + * + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which covers the 20K sub-range starting at offset 20K of our extent. + * Now when we replay the second file extent item, if we do not delete + * existing csum items that cover any of its blocks, we end up getting + * two csum items in our csum tree that overlap each other: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which is a problem, because after this anyone trying to lookup for + * the checksum of any block of our extent starting at an offset of 40K + * or higher, will end up looking at the second csum item only, which + * does not contain the checksum for any block starting at offset 40K or + * higher of our extent. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + struct btrfs_root *csum_root; - sums = list_first_entry(&ordered_sums, - struct btrfs_ordered_sum, list); - csum_root = btrfs_csum_root(fs_info, sums->logical); - if (!ret) { - ret = btrfs_del_csums(trans, csum_root, sums->logical, - sums->len); - if (ret) - btrfs_abort_transaction(trans, ret); - } - if (!ret) { - ret = btrfs_csum_file_blocks(trans, csum_root, sums); - if (ret) - btrfs_abort_transaction(trans, ret); - } - list_del(&sums->list); - kfree(sums); + sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list); + csum_root = btrfs_csum_root(fs_info, sums->logical); + if (!ret) { + ret = btrfs_del_csums(trans, csum_root, sums->logical, + sums->len); + if (ret) + btrfs_abort_transaction(trans, ret); } - if (ret) - goto out; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - /* inline extents are easy, we just overwrite them */ - ret = overwrite_item(trans, root, path, eb, slot, key); - if (ret) - goto out; + if (!ret) { + ret = btrfs_csum_file_blocks(trans, csum_root, sums); + if (ret) + btrfs_abort_transaction(trans, ret); + } + list_del(&sums->list); + kfree(sums); } + if (ret) + goto out; update_inode: ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); From cac2ab34d88e2e4aaa0991038854849a2eaff942 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Jul 2025 13:48:12 +0100 Subject: [PATCH 2438/3206] btrfs: use local key variable to pass arguments in replay_one_extent() Instead of extracting again the disk_bytenr and disk_num_bytes values from the file extent item to pass to btrfs_qgroup_trace_extent(), use the key local variable 'ins' which already has those values, reducing the size of the source code. 
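For illustration, the equivalence this relies on is that 'ins' was already initialized from the same accessors earlier in the function (see the previous patch), so the call can be shortened:

	ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
	...
	ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset);
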
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4fad7cbfcb2633..265ee7557ecb3d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -779,9 +779,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * as the owner of the file extent changed from log tree (doesn't affect * qgroup) to fs/file tree (affects qgroup). */ - ret = btrfs_qgroup_trace_extent(trans, - btrfs_file_extent_disk_bytenr(eb, item), - btrfs_file_extent_disk_num_bytes(eb, item)); + ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; From 669d38bf587f27df76bd485c88bac40694b19fbb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Jul 2025 17:43:41 +0100 Subject: [PATCH 2439/3206] btrfs: collapse unaccount_log_buffer() into clean_log_buffer() There's only one caller of unaccount_log_buffer(), and both this function and the caller are short, so move its code into the caller. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 51 +++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 265ee7557ecb3d..cbdc0c0b5e7c55 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2731,35 +2731,11 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, return ret; } -/* - * Correctly adjust the reserved bytes occupied by a log tree extent buffer - */ -static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) -{ - struct btrfs_block_group *cache; - - cache = btrfs_lookup_block_group(fs_info, start); - if (!cache) { - btrfs_err(fs_info, "unable to find block group for %llu", start); - return -ENOENT; - } - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->reserved -= fs_info->nodesize; - cache->space_info->bytes_reserved -= fs_info->nodesize; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - - return 0; -} - static int clean_log_buffer(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { - int ret; + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_block_group *bg; btrfs_tree_lock(eb); btrfs_clear_buffer_dirty(trans, eb); @@ -2767,16 +2743,31 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans, btrfs_tree_unlock(eb); if (trans) { + int ret; + ret = btrfs_pin_reserved_extent(trans, eb); if (ret) btrfs_abort_transaction(trans, ret); return ret; } - ret = unaccount_log_buffer(eb->fs_info, eb->start); - if (ret) - btrfs_handle_fs_error(eb->fs_info, ret, NULL); - return ret; + bg = btrfs_lookup_block_group(fs_info, eb->start); + if (!bg) { + btrfs_err(fs_info, "unable to find block group for %llu", eb->start); + btrfs_handle_fs_error(fs_info, -ENOENT, NULL); + return -ENOENT; + } + + spin_lock(&bg->space_info->lock); + spin_lock(&bg->lock); + bg->reserved -= fs_info->nodesize; + bg->space_info->bytes_reserved -= fs_info->nodesize; + spin_unlock(&bg->lock); + spin_unlock(&bg->space_info->lock); + + btrfs_put_block_group(bg); + + return 0; } static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, From 3d16abf6c88ae20cc4168756e77ad270a9b37182 Mon Sep 17
00:00:00 2001 From: Johannes Thumshirn Date: Tue, 22 Jul 2025 13:39:10 +0200 Subject: [PATCH 2440/3206] btrfs: zoned: directly call do_zone_finish() from btrfs_zone_finish_endio_workfn() When btrfs_zone_finish_endio_workfn() calls btrfs_zone_finish_endio(), it already has a pointer to the block group. Furthermore, btrfs_zone_finish_endio() performs additional checks on whether the block group can be finished. But in the context of btrfs_zone_finish_endio_workfn(), only the actual call to do_zone_finish() is of interest, as the skipping condition (there still being room to allocate from the block group) cannot be checked here. Directly call do_zone_finish() on the block group. Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index efc2a81f50e551..c5c4e512a58683 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2488,12 +2488,16 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len static void btrfs_zone_finish_endio_workfn(struct work_struct *work) { + int ret; struct btrfs_block_group *bg = container_of(work, struct btrfs_block_group, zone_finish_work); wait_on_extent_buffer_writeback(bg->last_eb); free_extent_buffer(bg->last_eb); - btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); + ret = do_zone_finish(bg, true); + if (ret) + btrfs_handle_fs_error(bg->fs_info, ret, + "Failed to finish block-group's zone"); btrfs_put_block_group(bg); } From 3c44cd3c79fcb38a86836dea6ff8fec322a9e68c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 22 Jul 2025 13:39:11 +0200 Subject: [PATCH 2441/3206] btrfs: zoned: return error from btrfs_zone_finish_endio() Now that btrfs_zone_finish_endio_workfn() directly calls do_zone_finish(), the only caller of btrfs_zone_finish_endio() is btrfs_finish_one_ordered(). btrfs_finish_one_ordered() already has error handling in place, so btrfs_zone_finish_endio() can return an error if the block group lookup fails. Also, as btrfs_zone_finish_endio() already checks for zoned filesystems and returns early, there is no need to do this in the caller.
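For illustration, the resulting caller pattern in btrfs_finish_one_ordered() is simply (taken from the diff below):

	ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
				      ordered_extent->disk_num_bytes);
	if (ret)
		goto out;

The zoned-filesystem check lives entirely inside the callee, which returns 0 immediately on non-zoned filesystems.
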
Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 ++++--- fs/btrfs/zoned.c | 8 +++++--- fs/btrfs/zoned.h | 9 ++++++--- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 225e9221c4569d..9eb0253507a08e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3109,9 +3109,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (btrfs_is_zoned(fs_info)) - btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); + ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + if (ret) + goto out; if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c5c4e512a58683..ba444e412613f2 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2458,16 +2458,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) return ret; } -void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) +int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) - return; + return 0; block_group = btrfs_lookup_block_group(fs_info, logical); - ASSERT(block_group); + if (WARN_ON_ONCE(!block_group)) + return -ENOENT; /* No MIXED_BG on zoned btrfs. */ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) @@ -2484,6 +2485,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len out: btrfs_put_block_group(block_group); + return 0; } static void btrfs_zone_finish_endio_workfn(struct work_struct *work) diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 6e11533b8e14c2..17c5656580dd97 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); -void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, +int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); @@ -234,8 +234,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, return true; } -static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) { } +static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + return 0; +} static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb) { } From 6d9cce2d1bb6b70916de351490d1ff197058b88f Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 25 Jul 2025 16:07:05 +0800 Subject: [PATCH 2442/3206] btrfs: remove duplicate inclusion of linux/types.h In messages.h there's linux/types.h included more than once. 
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=22939 Signed-off-by: Jiapeng Chong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/messages.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 022ebc89af8550..4416c165644fa4 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -3,7 +3,6 @@ #ifndef BTRFS_MESSAGES_H #define BTRFS_MESSAGES_H -#include #include #include #include From f07b855c56b1fd724c018c1d4ff8d319b9afd5a9 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 21 Oct 2024 12:01:53 -0700 Subject: [PATCH 2443/3206] btrfs: try to search for data csums in commit root If you run a workload with: - a cgroup that does tons of parallel data reading, with a working set much larger than its memory limit - a second cgroup that writes relatively fewer files, with overwrites, with no memory limit (see full code listing at the bottom for a reproducer) Then what quickly occurs is: - we have a large number of threads trying to read the csum tree - we have a decent number of threads deleting csums running delayed refs - we have a large number of threads in direct reclaim and thus high memory pressure The result of this is that we writeback the csum tree repeatedly mid transaction, to get back the extent_buffer folios for reclaim. As a result, we repeatedly COW the csum tree for the delayed refs that are deleting csums. This means repeatedly write locking the higher levels of the tree. As a result of this, we achieve an unpleasant priority inversion. We have: - a high degree of contention on the csum root node (and other upper nodes) eb rwsem - a memory starved cgroup doing tons of reclaim on CPU. - many reader threads in the memory starved cgroup "holding" the sem as readers, but not scheduling promptly. i.e., task __state == 0, but not running on a cpu. - btrfs_commit_transaction stuck trying to acquire the sem as a writer. (running delayed_refs, deleting csums for unreferenced data extents) This results in arbitrarily long transactions. This then results in seriously degraded performance for any cgroup using the filesystem (the victim cgroup in the script). It isn't an academic problem, as we see this exact problem in production at Meta with one cgroup over its memory limit ruining btrfs performance for the whole system, stalling critical system services that depend on btrfs syncs. The underlying scheduling "problem" with global rwsems is sort of thorny and apparently well known and was discussed at LPC 2024, for example. As a result, our main lever in the short term is just trying to reduce contention on our various rwsems with an eye to reducing the frequency of write locking, to avoid disabling the read lock fast acquisition path. Luckily, it seems likely that many reads are for old extents written many transactions ago, and that for those we *can* in fact search the commit root. The commit_root_sem only gets taken write once, near the end of transaction commit, no matter how much memory pressure there is, so we have much less contention between readers and writers. This change detects when we are trying to read an old extent (according to extent map generation) and then wires that through bio_ctrl to the btrfs_bio, which unfortunately isn't allocated yet when we have this information. When we go to lookup the csums in lookup_bio_sums we can check this condition on the btrfs_bio and do the commit root lookup accordingly. 
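As a rough sketch, the decision made at bio submission time boils down to the following; the full version, including the max-generation tracking described next, is in the extent_io.c part of the diff below:

	/*
	 * For a data read bio: if every extent in the bio was written in a
	 * past transaction, its csums can be looked up in the commit root,
	 * avoiding write-lock contention on the csum tree.
	 */
	bbio->csum_search_commit_root =
		(bio_ctrl->generation &&
		 bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info));
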
Note that a single bio_ctrl might collect a few extent_maps into a single bio, so it is important to track a maximum generation across all the extent_maps used for each bio to make an accurate decision on whether it is valid to look in the commit root. If any extent_map is updated in the current generation, we can't use the commit root. To test and reproduce this issue, I used the following script and accompanying C program (to avoid bottlenecks in constantly forking thousands of dd processes): ====== big-read.c ====== #include #include #include #include #include #include #include #define BUF_SZ (128 * (1 << 10UL)) int read_once(int fd, size_t sz) { char buf[BUF_SZ]; size_t rd = 0; int ret = 0; while (rd < sz) { ret = read(fd, buf, BUF_SZ); if (ret < 0) { if (errno == EINTR) continue; fprintf(stderr, "read failed: %d\n", errno); return -errno; } else if (ret == 0) { break; } else { rd += ret; } } return rd; } int read_loop(char *fname) { int fd; struct stat st; size_t sz = 0; int ret; while (1) { fd = open(fname, O_RDONLY); if (fd == -1) { perror("open"); return 1; } if (!sz) { if (!fstat(fd, &st)) { sz = st.st_size; } else { perror("stat"); return 1; } } ret = read_once(fd, sz); close(fd); } } int main(int argc, char *argv[]) { int fd; struct stat st; off_t sz; char *buf; int ret; if (argc != 2) { fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } return read_loop(argv[1]); } ====== repro.sh ====== #!/usr/bin/env bash SCRIPT=$(readlink -f "$0") DIR=$(dirname "$SCRIPT") dev=$1 mnt=$2 shift shift CG_ROOT=/sys/fs/cgroup BAD_CG=$CG_ROOT/bad-nbr GOOD_CG=$CG_ROOT/good-nbr NR_BIGGOS=1 NR_LITTLE=10 NR_VICTIMS=32 NR_VILLAINS=512 START_SEC=$(date +%s) _elapsed() { echo "elapsed: $(($(date +%s) - $START_SEC))" } _stats() { local sysfs=/sys/fs/btrfs/$(findmnt -no UUID $dev) echo "================" date _elapsed cat $sysfs/commit_stats cat $BAD_CG/memory.pressure } _setup_cgs() { echo "+memory +cpuset" > $CG_ROOT/cgroup.subtree_control mkdir -p $GOOD_CG mkdir -p $BAD_CG echo max > $BAD_CG/memory.max # memory.high much less than the working set will cause heavy reclaim echo $((1 << 30)) > $BAD_CG/memory.high # victims get a subset of villain CPUs echo 0 > $GOOD_CG/cpuset.cpus echo 0,1,2,3 > $BAD_CG/cpuset.cpus } _kill_cg() { local cg=$1 local attempts=0 echo "kill cgroup $cg" [ -f $cg/cgroup.procs ] || return while true; do attempts=$((attempts + 1)) echo 1 > $cg/cgroup.kill sleep 1 procs=$(wc -l $cg/cgroup.procs | cut -d' ' -f1) [ $procs -eq 0 ] && break done rmdir $cg echo "killed cgroup $cg in $attempts attempts" } _biggo_vol() { echo $mnt/biggo_vol.$1 } _biggo_file() { echo $(_biggo_vol $1)/biggo } _subvoled_biggos() { total_sz=$((10 << 30)) per_sz=$((total_sz / $NR_VILLAINS)) dd_count=$((per_sz >> 20)) echo "create $NR_VILLAINS subvols with a file of size $per_sz bytes for a total of $total_sz bytes." for i in $(seq $NR_VILLAINS) do btrfs subvol create $(_biggo_vol $i) &>/dev/null dd if=/dev/zero of=$(_biggo_file $i) bs=1M count=$dd_count &>/dev/null done echo "done creating subvols." } _setup() { [ -f .done ] && rm .done findmnt -n $dev && exit 1 if [ -f .re-mkfs ]; then mkfs.btrfs -f -m single -d single $dev >/dev/null || exit 2 else echo "touch .re-mkfs to populate the test fs" fi mount -o noatime $dev $mnt || exit 3 [ -f .re-mkfs ] && _subvoled_biggos _setup_cgs } _my_cleanup() { echo "CLEANUP!" _kill_cg $BAD_CG _kill_cg $GOOD_CG sleep 1 umount $mnt } _bad_exit() { _err "Unexpected Exit! $?" _stats exit $? 
} trap _my_cleanup EXIT trap _bad_exit INT TERM _setup # Use a lot of page cache reading the big file _villain() { local i=$1 echo $BASHPID > $BAD_CG/cgroup.procs $DIR/big-read $(_biggo_file $i) } # Hit del_csum a lot by overwriting lots of small new files _victim() { echo $BASHPID > $GOOD_CG/cgroup.procs i=0; while (true) do local tmp=$mnt/tmp.$i dd if=/dev/zero of=$tmp bs=4k count=2 >/dev/null 2>&1 i=$((i+1)) [ $i -eq $NR_LITTLE ] && i=0 done } _one_sync() { echo "sync..." before=$(date +%s) sync after=$(date +%s) echo "sync done in $((after - before))s" _stats } # sync in a loop _sync() { echo "start sync loop" syncs=0 echo $BASHPID > $GOOD_CG/cgroup.procs while true do [ -f .done ] && break _one_sync syncs=$((syncs + 1)) [ -f .done ] && break sleep 10 done if [ $syncs -eq 0 ]; then echo "do at least one sync!" _one_sync fi echo "sync loop done." } _sleep() { local time=${1-60} local now=$(date +%s) local end=$((now + time)) while [ $now -lt $end ]; do echo "SLEEP: $((end - now))s left. Sleep 10." sleep 10 now=$(date +%s) done } echo "start $NR_VILLAINS villains" for i in $(seq $NR_VILLAINS) do _villain $i & disown # get rid of annoying log on kill (done via cgroup anyway) done echo "start $NR_VICTIMS victims" for i in $(seq $NR_VICTIMS) do _victim & disown done _sync & SYNC_PID=$! _sleep $1 _elapsed touch .done wait $SYNC_PID echo "OK" exit 0 Without this patch, that reproducer: - Ran for 6+ minutes instead of 60s - Hung hundreds of threads in D state on the csum reader lock - Got a commit stuck for 3 minutes sync done in 388s ================ Wed Jul 9 09:52:31 PM UTC 2025 elapsed: 420 commits 2 cur_commit_ms 0 last_commit_ms 159446 max_commit_ms 159446 total_commit_ms 160058 some avg10=99.03 avg60=98.97 avg300=75.43 total=418033386 full avg10=82.79 avg60=80.52 avg300=59.45 total=324995274 419 hits state R, D comms big-read btrfs_tree_read_lock_nested btrfs_read_lock_root_node btrfs_search_slot btrfs_lookup_csum btrfs_lookup_bio_sums btrfs_submit_bbio 1 hits state D comms btrfs-transacti btrfs_tree_lock_nested btrfs_lock_root_node btrfs_search_slot btrfs_del_csums __btrfs_run_delayed_refs btrfs_run_delayed_refs With the patch, the reproducer exits naturally, in 65s, completing a pretty decent 4 commits, despite heavy memory pressure. Occasionally you can still trigger a rather long commit (couple seconds) but never one that is minutes long. 
sync done in 3s ================ elapsed: 65 commits 4 cur_commit_ms 0 last_commit_ms 485 max_commit_ms 689 total_commit_ms 2453 some avg10=98.28 avg60=64.54 avg300=19.39 total=64849893 full avg10=74.43 avg60=48.50 avg300=14.53 total=48665168 some random rwalker samples showed the most common stack in reclaim, rather than the csum tree: 145 hits state R comms bash, sleep, dd, shuf shrink_folio_list shrink_lruvec shrink_node do_try_to_free_pages try_to_free_mem_cgroup_pages reclaim_high Link: https://lpc.events/event/18/contributions/1883/ Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/bio.c | 1 + fs/btrfs/bio.h | 2 ++ fs/btrfs/compression.c | 1 + fs/btrfs/extent_io.c | 64 ++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/file-item.c | 32 +++++++++++++++++++++ 5 files changed, 97 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 50b5fc1c06d7cc..ea7f7a17a3d5bb 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -93,6 +93,7 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, refcount_inc(&orig_bbio->ordered->refs); bbio->ordered = orig_bbio->ordered; } + bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root; atomic_inc(&orig_bbio->pending_ios); return bbio; } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index dc2eb43b70970b..00883aea55d70f 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -82,6 +82,8 @@ struct btrfs_bio { /* Save the first error status of split bio. */ blk_status_t status; + /* Use the commit root to look up csums (data read bio only). */ + bool csum_search_commit_root; /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 35e3071cec0636..06e119ee264924 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -602,6 +602,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) cb->compressed_len = compressed_len; cb->compress_type = btrfs_extent_map_compression(em); cb->orig_bbio = bbio; + cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root; btrfs_free_extent_map(em); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b21cb72835ccf4..7ab45b2346216e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -101,6 +101,26 @@ struct btrfs_bio_ctrl { enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; + /* + * For data read bios, we attempt to optimize csum lookups if the extent + * generation is older than the current one. To make this possible, we + * need to track the maximum generation of an extent in a bio_ctrl to + * make the decision when submitting the bio. + * + * The pattern between do_readpage(), submit_one_bio() and + * submit_extent_folio() is quite subtle, so tracking this is tricky. + * + * As we process extent E, we might submit a bio with existing built up + * extents before adding E to a new bio, or we might just add E to the + * bio. As a result, E's generation could apply to the current bio or + * to the next one, so we need to be careful to update the bio_ctrl's + * generation with E's only when we are sure E is added to bio_ctrl->bbio + * in submit_extent_folio(). + * + * See the comment in btrfs_lookup_bio_sums() for more detail on the + * need for this optimization. 
+ */ + u64 generation; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; @@ -131,6 +151,26 @@ struct btrfs_bio_ctrl { u64 last_em_start; }; +/* + * Helper to set the csum search commit root option for a bio_ctrl's bbio + * before submitting the bio. + * + * Only for use by submit_one_bio(). + */ +static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl) +{ + struct btrfs_bio *bbio = bio_ctrl->bbio; + + ASSERT(bbio); + + if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode))) + return; + + bio_ctrl->bbio->csum_search_commit_root = + (bio_ctrl->generation && + bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info)); +} + static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_bio *bbio = bio_ctrl->bbio; @@ -141,6 +181,8 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* Caller should ensure the bio has at least some range added */ ASSERT(bbio->bio.bi_iter.bi_size); + bio_set_csum_search_commit_root(bio_ctrl); + if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); @@ -149,6 +191,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; + /* + * We used the generation to decide whether to lookup csums in the + * commit_root or not when we called bio_set_csum_search_commit_root() + * above. Now, reset the generation for the next bio. + */ + bio_ctrl->generation = 0; } /* @@ -719,6 +767,8 @@ static void alloc_new_bio(struct btrfs_inode *inode, * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one + * @read_em_generation: generation of the extent_map we are submitting + * (only used for read) * * The will either add the page into the existing @bio_ctrl->bbio, or allocate a * new one in @bio_ctrl->bbio. @@ -727,7 +777,8 @@ static void alloc_new_bio(struct btrfs_inode *inode, */ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, u64 disk_bytenr, struct folio *folio, - size_t size, unsigned long pg_offset) + size_t size, unsigned long pg_offset, + u64 read_em_generation) { struct btrfs_inode *inode = folio_to_inode(folio); loff_t file_offset = folio_pos(folio) + pg_offset; @@ -758,6 +809,11 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, submit_one_bio(bio_ctrl); continue; } + /* + * Now that the folio is definitely added to the bio, include its + * generation in the max generation calculation. 
+ */ + bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation); bio_ctrl->next_file_offset += len; if (bio_ctrl->wbc) @@ -960,6 +1016,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, bool force_bio_submit = false; u64 disk_bytenr; u64 block_start; + u64 em_gen; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { @@ -1043,6 +1100,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, bio_ctrl->last_em_start = em->start; + em_gen = em->generation; btrfs_free_extent_map(em); em = NULL; @@ -1066,7 +1124,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, - pg_offset); + pg_offset, em_gen); } return 0; } @@ -1600,7 +1658,7 @@ static int submit_one_sector(struct btrfs_inode *inode, ASSERT(folio_test_writeback(folio)); submit_extent_folio(bio_ctrl, disk_bytenr, folio, - sectorsize, filepos - folio_pos(folio)); + sectorsize, filepos - folio_pos(folio), 0); return 0; } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c09fbc257634ab..4dd3d8a02519ec 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -397,6 +397,36 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) path->skip_locking = 1; } + /* + * If we are searching for a csum of an extent from a past + * transaction, we can search in the commit root and reduce + * lock contention on the csum tree extent buffers. + * + * This is important because that lock is an rwsem which gets + * pretty heavy write load under memory pressure and sustained + * csum overwrites, unlike the commit_root_sem. (Memory pressure + * makes us writeback the nodes multiple times per transaction, + * which makes us cow them each time, taking the write lock.) + * + * Due to how rwsem is implemented, there is a possible + * priority inversion where the readers holding the lock don't + * get scheduled (say they're in a cgroup stuck in heavy reclaim) + * which then blocks writers, including transaction commit. By + * using a semaphore with fewer writers (only a commit switching + * the roots), we make this issue less likely. + * + * Note that we don't rely on btrfs_search_slot to lock the + * commit root csum. We call search_slot multiple times, which would + * create a potential race where a commit comes in between searches + * while we are not holding the commit_root_sem, and we get csums + * from across transactions. + */ + if (bbio->csum_search_commit_root) { + path->search_commit_root = 1; + path->skip_locking = 1; + down_read(&fs_info->commit_root_sem); + } + while (bio_offset < orig_len) { int count; u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; @@ -442,6 +472,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } + if (bbio->csum_search_commit_root) + up_read(&fs_info->commit_root_sem); return ret; } From 0d703963d297964451783e1a0688ebdf74cd6151 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 16 Jul 2025 11:13:15 +0900 Subject: [PATCH 2444/3206] btrfs: zoned: refine extent allocator hint selection The hint block group selection in the extent allocator is wrong in the first place, as it can select the dedicated data relocation block group for the normal data allocation. Since we separated the normal data space_info and the data relocation space_info, we can easily identify a block group is for data relocation or not. 
Do not choose it for the normal data allocation. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 97d517cdf2df75..682d21a73a67a4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4297,7 +4297,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, } static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl) + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info) { if (ffe_ctl->for_treelog) { spin_lock(&fs_info->treelog_bg_lock); @@ -4321,6 +4322,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, u64 avail = block_group->zone_capacity - block_group->alloc_offset; if (block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && avail >= ffe_ctl->num_bytes) { ffe_ctl->hint_byte = block_group->start; break; @@ -4342,7 +4344,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - return prepare_allocation_zoned(fs_info, ffe_ctl); + return prepare_allocation_zoned(fs_info, ffe_ctl, space_info); default: BUG(); } From d71b419f274c2eea83038c8623ddc45d51af70e9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 10 Aug 2025 20:14:57 +0930 Subject: [PATCH 2445/3206] btrfs: pass btrfs_inode pointer directly into btrfs_compress_folios() For the 3 supported compression algorithms, two of them (zstd and zlib) are already grabbing the btrfs inode for error messages. It's more common to pass btrfs_inode and grab the address space from it. 
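For illustration, the conversion pattern applied to each compressor is to take the btrfs_inode and derive the mapping locally; for example, the lzo variant in the diff below becomes:

	int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
				u64 start, struct folio **folios,
				unsigned long *out_folios, unsigned long *total_in,
				unsigned long *total_out)
	{
		struct workspace *workspace = list_entry(ws, struct workspace, list);
		const u32 sectorsize = inode->root->fs_info->sectorsize;
		struct address_space *mapping = inode->vfs_inode.i_mapping;
		...

This also lets zlib and zstd drop the local BTRFS_I(mapping->host) lookups they kept only for error messages.
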
Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 12 ++++++------ fs/btrfs/compression.h | 8 ++++---- fs/btrfs/inode.c | 2 +- fs/btrfs/lzo.c | 5 +++-- fs/btrfs/zlib.c | 7 ++----- fs/btrfs/zstd.c | 9 ++------- 6 files changed, 18 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 06e119ee264924..839c8c33aa5aa4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -90,19 +90,19 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len) } static int compression_compress_pages(int type, struct list_head *ws, - struct address_space *mapping, u64 start, + struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { switch (type) { case BTRFS_COMPRESS_ZLIB: - return zlib_compress_folios(ws, mapping, start, folios, + return zlib_compress_folios(ws, inode, start, folios, out_folios, total_in, total_out); case BTRFS_COMPRESS_LZO: - return lzo_compress_folios(ws, mapping, start, folios, + return lzo_compress_folios(ws, inode, start, folios, out_folios, total_in, total_out); case BTRFS_COMPRESS_ZSTD: - return zstd_compress_folios(ws, mapping, start, folios, + return zstd_compress_folios(ws, inode, start, folios, out_folios, total_in, total_out); case BTRFS_COMPRESS_NONE: default: @@ -1034,7 +1034,7 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { @@ -1044,7 +1044,7 @@ int btrfs_compress_folios(unsigned int type, int level, struct address_space *ma level = btrfs_compress_set_level(type, level); workspace = get_workspace(type, level); - ret = compression_compress_pages(type, workspace, mapping, start, folios, + ret = compression_compress_pages(type, workspace, inode, start, folios, out_folios, total_in, total_out); /* The total read-in bytes should be no larger than the input. 
*/ ASSERT(*total_in <= orig_len); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 7b41b2b5ff4488..e7282fb7c13b97 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -88,7 +88,7 @@ int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); bool btrfs_compress_level_valid(unsigned int type, int level); -int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, @@ -155,7 +155,7 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio **in_folio_ret); -int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, +int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); @@ -166,7 +166,7 @@ struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); struct list_head *zlib_get_workspace(unsigned int level); -int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, +int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); @@ -176,7 +176,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, struct list_head *lzo_alloc_workspace(void); void lzo_free_workspace(struct list_head *ws); -int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, +int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9eb0253507a08e..bdbef8e24f932a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -959,7 +959,7 @@ static void compress_file_range(struct btrfs_work *work) /* Compression level is applied here. 
*/ ret = btrfs_compress_folios(compress_type, compress_level, - mapping, start, folios, &nr_folios, &total_in, + inode, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) goto mark_incompressible; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d403641889caf3..7b46aef7dd93af 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -209,12 +209,13 @@ static int copy_compressed_data_to_page(char *compressed_data, return 0; } -int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, +int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize; + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct address_space *mapping = inode->vfs_inode.i_mapping; struct folio *folio_in = NULL; char *sizes_ptr; const unsigned long max_nr_folio = *out_folios; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 5292cd341f70f2..21af68f93a2d1f 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -133,11 +133,12 @@ static int copy_data_into_buffer(struct address_space *mapping, return 0; } -int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, +int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; char *data_in = NULL; char *cfolio_out; @@ -155,8 +156,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflateInit(&workspace->strm, workspace->level); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - btrfs_err(inode->root->fs_info, "zlib compression init failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); @@ -225,8 +224,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - btrfs_warn(inode->root->fs_info, "zlib compression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index ff0292615e1f37..00159e0e921ea5 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -384,11 +384,12 @@ struct list_head *zstd_alloc_workspace(int level) return ERR_PTR(-ENOMEM); } -int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, +int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct address_space *mapping = inode->vfs_inode.i_mapping; zstd_cstream *stream; int ret = 0; int nr_folios = 0; @@ -411,8 +412,6 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - btrfs_err(inode->root->fs_info, "zstd compression init 
level %d failed, root %llu inode %llu offset %llu", workspace->req_level, btrfs_root_id(inode->root), @@ -447,8 +446,6 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret2))) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - btrfs_warn(inode->root->fs_info, "zstd compression level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), @@ -522,8 +519,6 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret2 = zstd_end_stream(stream, &workspace->out_buf); if (unlikely(zstd_is_error(ret2))) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - btrfs_err(inode->root->fs_info, "zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), From 0a6dcd42353b96ab4a74796aed1541591de5890c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 11 Aug 2025 09:43:42 +0930 Subject: [PATCH 2446/3206] btrfs: use blocksize to check if compression is making things larger [BEHAVIOR DIFFERENCE BETWEEN COMPRESSION ALGOS] Currently the LZO compression algorithm checks whether we're making the compressed data larger after compressing more than 2 blocks. But zlib and zstd do the same check after compressing more than 8192 bytes. This is not a big deal, but since we already support larger block sizes (e.g. a 64K block size if the page size is also 64K), this check is not suitable for all block sizes. For example, if our page and block size are both 16KiB, and after the first block is compressed using zlib the resulting compressed data is slightly larger than 16KiB, we will immediately abort the compression. This makes the zstd and zlib compression algorithms behave slightly differently from LZO, which only aborts after compressing two blocks. [ENHANCEMENT] To unify the behavior, only abort the compression after compressing at least two blocks.
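For reference, the unified bail-out condition reduces to a single comparison; here is a minimal userspace restatement (the function name and integer types are illustrative, not btrfs API):

#include <stdbool.h>
#include <stdint.h>

/*
 * Give up once at least two blocks of input have been consumed and the
 * output is still not smaller than the input. The previous zlib/zstd code
 * hard-coded the threshold to 8192 bytes, which for a 16K block size could
 * fire before even one full block was consumed.
 */
static bool should_abort_compression(uint64_t total_in, uint64_t total_out,
				     uint32_t blocksize)
{
	return total_in > (uint64_t)blocksize * 2 && total_in < total_out;
}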
Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 3 ++- fs/btrfs/zstd.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 21af68f93a2d1f..33dc7e7b5c364c 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -148,6 +148,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u32 blocksize = inode->root->fs_info->sectorsize; const u64 orig_end = start + len; *out_folios = 0; @@ -234,7 +235,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, } /* we're making it bigger, give up */ - if (workspace->strm.total_in > 8192 && + if (workspace->strm.total_in > blocksize * 2 && workspace->strm.total_in < workspace->strm.total_out) { ret = -E2BIG; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 00159e0e921ea5..d521187336a59f 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -400,6 +400,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; + const u32 blocksize = inode->root->fs_info->sectorsize; unsigned long max_out = nr_dest_folios * PAGE_SIZE; unsigned int cur_len; @@ -456,7 +457,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, } /* Check to see if we are making it bigger */ - if (tot_in + workspace->in_buf.pos > 8192 && + if (tot_in + workspace->in_buf.pos > blocksize * 2 && tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { ret = -E2BIG; From d728f2e5f8a075756ce60410c1ecb3a0b61f2bf8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 9 Aug 2025 14:10:48 +0930 Subject: [PATCH 2447/3206] btrfs: simplify supported block size check Currently we manually check the block size against 3 different values: - 4K - PAGE_SIZE - MIN_BLOCKSIZE Those 3 values can match or differ from each other. This makes it pretty complex to output the supported block sizes. Considering we're going to add block size > page size support soon, this can make the supported block size sysfs attribute much harder to implement. To make it easier, factor out a helper, btrfs_supported_blocksize(), to do a simple check for the block size. Then utilize it in the two locations: - btrfs_validate_super() This is very straightforward. - supported_sectorsizes_show() Iterate through all valid block sizes, and only output supported ones. This is to make future full-range block size support much easier. Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 16 +--------------- fs/btrfs/fs.c | 27 +++++++++++++++++++++++++++ fs/btrfs/fs.h | 3 +++ fs/btrfs/sysfs.c | 16 ++++++++++------ 4 files changed, 41 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 70fc4e7cc5a0e6..123c397ca8f88f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2438,21 +2438,7 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } - /* - * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. - * - * For 4K page sized systems with non-debug builds, all 3 matches (4K). - * For 4K page sized systems with debug builds, there are two block sizes - * supported.
(4K and 2K) * We can support 16K sectorsize with 64K page size without problem, * but such sectorsize/pagesize combination doesn't make much sense. * 4K will be our future standard, PAGE_SIZE is supported from the very * beginning. */ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && - sectorsize != PAGE_SIZE && - sectorsize != BTRFS_MIN_BLOCKSIZE)) { + if (!btrfs_supported_blocksize(sectorsize)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index b2bb86f8d7cf0c..8b118c03cdb881 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -54,6 +54,33 @@ size_t __attribute_const__ btrfs_get_num_csums(void) return ARRAY_SIZE(btrfs_csums); } +/* + * We support the following block sizes for all systems: + * + * - 4K + * This is the most common block size. For PAGE_SIZE > 4K cases the subpage + * mode is used. + * + * - PAGE_SIZE + * The straightforward block size to support. + * + * And extra support for the following block sizes based on the kernel config: + * + * - MIN_BLOCKSIZE + * This is either 4K (regular builds) or 2K (debug builds). + * This allows testing subpage routines on x86_64. + */ +bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize) +{ + /* @blocksize should be validated first. */ + ASSERT(is_power_of_2(blocksize) && blocksize >= BTRFS_MIN_BLOCKSIZE && + blocksize <= BTRFS_MAX_BLOCKSIZE); + + if (blocksize == PAGE_SIZE || blocksize == SZ_4K || blocksize == BTRFS_MIN_BLOCKSIZE) + return true; + return false; +} + /* * Start exclusive operation @type, return true on success. */ diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 8cc07cc70b1283..f0b090d4ac0466 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -59,6 +59,8 @@ struct btrfs_space_info; #define BTRFS_MIN_BLOCKSIZE (SZ_4K) #endif +#define BTRFS_MAX_BLOCKSIZE (SZ_64K) + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -997,6 +999,7 @@ static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs return folio_size(folio) >> fs_info->sectorsize_bits; } +bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize); bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 9d398f7a36addb..81f52c1f55ce57 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -409,13 +409,17 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, char *buf) { ssize_t ret = 0; + bool has_output = false; - if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) - ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); - if (PAGE_SIZE > SZ_4K) - ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); - ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); - + for (u32 cur = BTRFS_MIN_BLOCKSIZE; cur <= BTRFS_MAX_BLOCKSIZE; cur *= 2) { + if (!btrfs_supported_blocksize(cur)) + continue; + if (has_output) + ret += sysfs_emit_at(buf, ret, " "); + ret += sysfs_emit_at(buf, ret, "%u", cur); + has_output = true; + } + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_sectorsizes, From 28a38e20acf58ddbeb65eb1c359cc43414094f5a Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 12 Aug 2025 16:25:54 +0800 Subject: [PATCH 2448/3206] btrfs: use PTR_ERR_OR_ZERO() to simplify code in btrfs_control_ioctl() Use the standard error pointer macro to simplify the code.
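The macro folds the open-coded branch into one expression: PTR_ERR_OR_ZERO(p) is PTR_ERR(p) when p is an error pointer and 0 otherwise (the NULL case admitted by IS_ERR_OR_NULL() therefore maps to 0). A simplified userspace re-implementation of the helpers, for illustration only:

#include <stdio.h>

#define MAX_ERRNO 4095

/* Error pointers live in the top MAX_ERRNO bytes of the address space. */
static inline int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static inline long ptr_err_or_zero(const void *ptr)
{
	/* Replaces: if (IS_ERR(p)) ret = PTR_ERR(p); else ret = 0; */
	return is_err(ptr) ? (long)ptr : 0;
}

int main(void)
{
	int target = 0;
	void *ok = &target;
	void *nomem = (void *)(long)-12;	/* like ERR_PTR(-ENOMEM) */

	printf("%ld %ld\n", ptr_err_or_zero(ok), ptr_err_or_zero(nomem));	/* 0 -12 */
	return 0;
}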
Reviewed-by: Daniel Vacek Signed-off-by: Xichao Zhao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b06b8f32553781..ee2418bb2efd0b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2274,10 +2274,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - if (IS_ERR(device)) - ret = PTR_ERR(device); - else - ret = 0; + ret = PTR_ERR_OR_ZERO(device); break; } ret = !(device->fs_devices->num_devices == From cba7c35fec267188a9708deae857e9116c57497b Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 12 Aug 2025 16:28:27 -0700 Subject: [PATCH 2449/3206] btrfs: move ref-verify under CONFIG_BTRFS_DEBUG Remove the CONFIG_BTRFS_FS_REF_VERIFY Kconfig option and make it part of CONFIG_BTRFS_DEBUG. This should have no meaningful impact on the performance of debug builds. The struct btrfs_ref takes an additional u64, btrfs_fs_info takes an additional spinlock_t and rb_root. All of the ref_verify logic is still protected by a mount option. Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 11 ----------- fs/btrfs/Makefile | 2 +- fs/btrfs/delayed-ref.c | 4 ++-- fs/btrfs/delayed-ref.h | 9 +++++---- fs/btrfs/fs.h | 4 +--- fs/btrfs/ref-verify.h | 4 ++-- fs/btrfs/super.c | 9 --------- 7 files changed, 11 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ea95c90c847489..86d791210c393f 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -117,14 +117,3 @@ config BTRFS_EXPERIMENTAL - large folio support If unsure, say N. - -config BTRFS_FS_REF_VERIFY - bool "Btrfs with the ref verify tool compiled in" - depends on BTRFS_FS - default n - help - Enable run-time extent reference verification instrumentation. This - is meant to be used by btrfs developers for tracking down extent - reference problems or verifying they didn't break something. - - If unsure, say N.
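Two gates remain after this change: the code is compiled in only under CONFIG_BTRFS_DEBUG, and even then runs only when the ref_verify mount option is set. A sketch of the combined check (the wrapper function is illustrative; btrfs_test_opt() and fs_info->block_tree are the existing pieces):

#ifdef CONFIG_BTRFS_DEBUG
static void maybe_track_ref(struct btrfs_fs_info *fs_info)
{
	/* Compiled in for debug builds, but still disabled by default. */
	if (!btrfs_test_opt(fs_info, REF_VERIFY))
		return;
	/* ... record the reference in fs_info->block_tree ... */
}
#endif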
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 2d5f0482678b82..743d7677b175df 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -36,7 +36,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o -btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o +btrfs-$(CONFIG_BTRFS_DEBUG) += ref-verify.o btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o btrfs-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ca382c5b186f47..5d8361bac497fc 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -952,7 +952,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, bool skip_qgroup) { -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif @@ -969,7 +969,7 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, u64 mod_root, bool skip_qgroup) { -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 552ec4fa645d4b..5ce94053214452 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -276,10 +276,6 @@ struct btrfs_ref { */ bool skip_qgroup; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* Through which root is this modification. */ - u64 real_root; -#endif u64 bytenr; u64 num_bytes; u64 owning_root; @@ -296,6 +292,11 @@ struct btrfs_ref { struct btrfs_data_ref data_ref; struct btrfs_tree_ref tree_ref; }; + +#ifdef CONFIG_BTRFS_DEBUG + /* Through which root is this modification. 
*/ + u64 real_root; +#endif }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index f0b090d4ac0466..66e08e20ff5acc 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -880,12 +880,10 @@ struct btrfs_fs_info { struct lockdep_map btrfs_trans_pending_ordered_map; struct lockdep_map btrfs_ordered_extent_map; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG spinlock_t ref_verify_lock; struct rb_root block_tree; -#endif -#ifdef CONFIG_BTRFS_DEBUG struct kobject *debug_kobj; struct list_head allocated_roots; diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 559bd25a2b7ab1..1ce544d53cc569 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -12,7 +12,7 @@ struct btrfs_fs_info; struct btrfs_ref; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG #include @@ -53,6 +53,6 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) { } -#endif /* CONFIG_BTRFS_FS_REF_VERIFY */ +#endif /* CONFIG_BTRFS_DEBUG */ #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ee2418bb2efd0b..3bc19989e7fbf3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -133,8 +133,6 @@ enum { Opt_enospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY Opt_ref_verify, #endif Opt_err, @@ -257,8 +255,6 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { fsparam_flag_no("enospc_debug", Opt_enospc_debug), #ifdef CONFIG_BTRFS_DEBUG fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY fsparam_flag("ref_verify", Opt_ref_verify), #endif {} @@ -646,8 +642,6 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } break; -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY case Opt_ref_verify: btrfs_set_opt(ctx->mount_opt, REF_VERIFY); break; @@ -2483,9 +2477,6 @@ static int __init btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_ASSERT ", assert=on" #endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - ", ref-verify=on" -#endif #ifdef CONFIG_BLK_DEV_ZONED ", zoned=yes" #else From 67e78f983e6a1208c2f52adad4b06a388f4a15a5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 13 Aug 2025 12:41:11 +0200 Subject: [PATCH 2450/3206] btrfs: convert several int parameters to bool We're almost done cleaning misused int/bool parameters. Convert a bunch of them, found by manual grepping. Note that btrfs_sync_fs() needs an int as it's mandated by the struct super_operations prototype. 
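The benefit shows at the call sites: a bare 0 or 1 argument says nothing about what it controls, while a bool parameter restricts the domain to two states and makes true/false read as intent. A contrived standalone example (names are illustrative, not btrfs code):

#include <stdbool.h>
#include <stdio.h>

/* Before: what does the trailing 1 mean at the call site? A count? A flag? */
static int push_items_old(int dst, int src, int empty)
{
	return empty ? 0 : dst + src;
}

/* After: only two states are possible and the call documents itself. */
static int push_items(int dst, int src, bool empty)
{
	return empty ? 0 : dst + src;
}

int main(void)
{
	printf("%d\n", push_items_old(1, 2, 1));
	printf("%d\n", push_items(1, 2, true));
	return 0;
}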
Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/ctree.c | 16 ++++++++-------- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/disk-io.h | 3 +-- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/extent-io-tree.h | 2 +- fs/btrfs/extent-tree.c | 16 ++++++++-------- fs/btrfs/extent-tree.h | 7 +++---- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_map.c | 20 ++++++++++---------- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 10 +++++----- fs/btrfs/qgroup.c | 2 +- fs/btrfs/reflink.c | 4 ++-- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/scrub.c | 6 +++--- fs/btrfs/scrub.h | 2 +- fs/btrfs/send.c | 28 +++++++++++++--------------- fs/btrfs/transaction.c | 6 +++--- fs/btrfs/tree-log.c | 6 +++--- fs/btrfs/volumes.c | 2 +- 22 files changed, 72 insertions(+), 76 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 499a9edf0ca315..b6328d50ffe78d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1358,7 +1358,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( * data in this block group. That check should be done by relocation routine, * not this function. */ -static int inc_block_group_ro(struct btrfs_block_group *cache, int force) +static int inc_block_group_ro(struct btrfs_block_group *cache, bool force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0387b9f43a5292..df3445448b7d64 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -558,7 +558,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, const struct fscrypt_str *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct fscrypt_str *name, int add_backref, u64 index); + const struct fscrypt_str *name, bool add_backref, u64 index); int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 74e6d7f3d2660e..6f9465d4ce54ce 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -30,10 +30,10 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *ins_key, struct btrfs_path *path, - int data_size, int extend); + int data_size, bool extend); static int push_node_left(struct btrfs_trans_handle *trans, struct extent_buffer *dst, - struct extent_buffer *src, int empty); + struct extent_buffer *src, bool empty); static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); @@ -1484,7 +1484,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple @@ -2686,7 +2686,7 @@ static bool check_sibling_keys(const struct extent_buffer *left, */ static int push_node_left(struct btrfs_trans_handle *trans, struct extent_buffer *dst, - struct extent_buffer *src, int empty) + struct extent_buffer *src, bool empty) { struct btrfs_fs_info 
*fs_info = trans->fs_info; int push_items = 0; @@ -3102,7 +3102,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf) */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_path *path, - int data_size, int empty, + int data_size, bool empty, struct extent_buffer *right, int free_space, u32 left_nritems, u32 min_slot) @@ -3239,7 +3239,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int min_data_size, int data_size, - int empty, u32 min_slot) + bool empty, u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; @@ -3316,7 +3316,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size, - int empty, struct extent_buffer *left, + bool empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) { @@ -3642,7 +3642,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *ins_key, struct btrfs_path *path, int data_size, - int extend) + bool extend) { struct btrfs_disk_key disk_key; struct extent_buffer *l; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 123c397ca8f88f..b8ea63e444c724 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -116,7 +116,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) * detect blocks that either didn't get written at all or got written * in the wrong place. */ -int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) +int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic) { if (!extent_buffer_uptodate(eb)) return 0; @@ -1047,7 +1047,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, root->node = NULL; goto fail; } - if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + if (!btrfs_buffer_uptodate(root->node, generation, false)) { ret = -EIO; goto fail; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 864a55a96226e7..57920f2c6fe4ef 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -106,8 +106,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int atomic); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, bool atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 66361325f6dcea..0c58342c6125e1 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1664,7 +1664,7 @@ void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, */ u64 btrfs_count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig, + u32 bits, bool contig, struct extent_state **cached_state) { struct extent_state *state = NULL; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 36facca379738b..6f07b965e8da52 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -163,7 +163,7 @@ void __cold 
btrfs_extent_state_free_cachep(void); u64 btrfs_count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig, + u64 max_bytes, u32 bits, bool contig, struct extent_state **cached_state); void btrfs_free_extent_state(struct extent_state *state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 682d21a73a67a4..e117b5cbefae73 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2457,7 +2457,7 @@ int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc) + bool full_backref, bool inc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 parent; @@ -2543,15 +2543,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, bool full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); + return __btrfs_mod_ref(trans, root, buf, full_backref, true); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, bool full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + return __btrfs_mod_ref(trans, root, buf, full_backref, false); } static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) @@ -5584,7 +5584,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); - if (btrfs_buffer_uptodate(next, generation, 0)) + if (btrfs_buffer_uptodate(next, generation, false)) return 0; check.level = level - 1; @@ -6051,9 +6051,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. 
* - * If called with for_reloc == 0, may exit early with -EAGAIN + * If called with for_reloc not set, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc) { const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 82d3a82dc712a4..e970ac42a871ad 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -140,9 +140,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, bool full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, bool full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); @@ -155,8 +155,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, - int for_reloc); +int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7ab45b2346216e..152c9d3e8cce8b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4543,7 +4543,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, if (IS_ERR(eb)) return; - if (btrfs_buffer_uptodate(eb, gen, 1)) { + if (btrfs_buffer_uptodate(eb, gen, true)) { free_extent_buffer(eb); return; } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 57f52585a6dde9..ac28eee7ae32c5 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -460,7 +460,7 @@ void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) static inline void setup_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, - int modified) + bool modified) { refcount_inc(&em->refs); @@ -486,7 +486,7 @@ static inline void setup_extent_mapping(struct btrfs_inode *inode, * taken, or a reference dropped if the merge attempt was successful.
*/ static int add_extent_mapping(struct btrfs_inode *inode, - struct extent_map *em, int modified) + struct extent_map *em, bool modified) { struct extent_map_tree *tree = &inode->extent_tree; struct btrfs_root *root = inode->root; @@ -509,7 +509,7 @@ static int add_extent_mapping(struct btrfs_inode *inode, } static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) + u64 start, u64 len, bool strict) { struct extent_map *em; struct rb_node *rb_node; @@ -548,7 +548,7 @@ static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - return lookup_extent_mapping(tree, start, len, 1); + return lookup_extent_mapping(tree, start, len, true); } /* @@ -566,7 +566,7 @@ struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - return lookup_extent_mapping(tree, start, len, 0); + return lookup_extent_mapping(tree, start, len, false); } /* @@ -594,7 +594,7 @@ void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *e static void replace_extent_mapping(struct btrfs_inode *inode, struct extent_map *cur, struct extent_map *new, - int modified) + bool modified) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *tree = &inode->extent_tree; @@ -670,7 +670,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, em->len = end - start; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) em->offset += start_diff; - return add_extent_mapping(inode, em, 0); + return add_extent_mapping(inode, em, false); } /* @@ -707,7 +707,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, if (em->disk_bytenr == EXTENT_MAP_INLINE) ASSERT(em->start == 0); - ret = add_extent_mapping(inode, em, 0); + ret = add_extent_mapping(inode, em, false); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. 
It is also possible that * an overlapping map exists in the tree @@ -1082,7 +1082,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr split_pre->flags = flags; split_pre->generation = em->generation; - replace_extent_mapping(inode, em, split_pre, 1); + replace_extent_mapping(inode, em, split_pre, true); /* * Now we only have an extent_map at: @@ -1098,7 +1098,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; split_mid->generation = em->generation; - add_extent_mapping(inode, split_mid, 1); + add_extent_mapping(inode, split_mid, true); /* Once for us */ btrfs_free_extent_map(em); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bdbef8e24f932a..0c9c51b3952c99 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6671,7 +6671,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct fscrypt_str *name, int add_backref, u64 index) + const struct fscrypt_str *name, bool add_backref, u64 index) { int ret = 0; struct btrfs_key key; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e13de2bdcbfab..e10daf6631afd6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1251,7 +1251,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, } static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol) + void __user *arg, bool subvol) { struct btrfs_ioctl_vol_args *vol_args; int ret; @@ -5223,13 +5223,13 @@ long btrfs_ioctl(struct file *file, unsigned int case FITRIM: return btrfs_ioctl_fitrim(fs_info, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0); + return btrfs_ioctl_snap_create(file, argp, false); case BTRFS_IOC_SNAP_CREATE_V2: - return btrfs_ioctl_snap_create_v2(file, argp, 0); + return btrfs_ioctl_snap_create_v2(file, argp, false); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1); + return btrfs_ioctl_snap_create(file, argp, true); case BTRFS_IOC_SUBVOL_CREATE_V2: - return btrfs_ioctl_snap_create_v2(file, argp, 1); + return btrfs_ioctl_snap_create_v2(file, argp, true); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp, false); case BTRFS_IOC_SNAP_DESTROY_V2: diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index da102da169fde2..2d8667cc609a2b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2729,7 +2729,7 @@ static void qgroup_iterator_nested_clean(struct list_head *head) */ static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, struct ulist *roots, struct list_head *qgroups, - u64 seq, int update_old) + u64 seq, bool update_old) { struct ulist_node *unode; struct ulist_iterator uiter; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index ce25ab7f0e9965..0a85e1da97777a 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -23,7 +23,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, u64 endoff, const u64 destoff, const u64 olen, - int no_time_update) + bool no_time_update) { int ret; @@ -337,7 +337,7 @@ static int clone_copy_inline_extent(struct btrfs_inode *inode, */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff, int no_time_update) + const u64 destoff, bool no_time_update) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 
struct btrfs_path *path = NULL; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7256f6748c8f92..8bd97b24f375fd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1490,7 +1490,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) * ->reloc_root. If it fails however we must * drop the ref ourselves. */ - ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); + ret2 = btrfs_drop_snapshot(reloc_root, false, true); if (ret2 < 0) { btrfs_put_root(reloc_root); if (!ret) @@ -1500,7 +1500,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ - ret2 = btrfs_drop_snapshot(root, 0, 1); + ret2 = btrfs_drop_snapshot(root, false, true); if (ret2 < 0) { btrfs_put_root(root); if (!ret) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6776e6ab8d1080..ce5f6732bfb585 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -206,7 +206,7 @@ struct scrub_ctx { ktime_t throttle_deadline; u64 throttle_sent; - int is_dev_replace; + bool is_dev_replace; u64 write_pointer; struct mutex wr_lock; @@ -446,7 +446,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx) } static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( - struct btrfs_fs_info *fs_info, int is_dev_replace) + struct btrfs_fs_info *fs_info, bool is_dev_replace) { struct scrub_ctx *sctx; int i; @@ -3013,7 +3013,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace) + bool readonly, bool is_dev_replace) { struct btrfs_dev_lookup_args args = { .devid = devid }; struct scrub_ctx *sctx; diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index f0df597b75c7c7..aa68b6ebaf555c 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -11,7 +11,7 @@ struct btrfs_scrub_progress; int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace); + bool readonly, bool is_dev_replace); void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); int btrfs_scrub_cancel(struct btrfs_fs_info *info); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7664025a5af431..faa3710fa074fe 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -973,7 +973,7 @@ typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); * path must point to the INODE_REF or INODE_EXTREF when called. 
*/ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *found_key, int resolve, + struct btrfs_key *found_key, bool resolve, iterate_inode_ref_t iterate, void *ctx) { struct extent_buffer *eb = path->nodes[0]; @@ -1251,8 +1251,7 @@ static int get_inode_path(struct btrfs_root *root, goto out; } - ret = iterate_inode_ref(root, p, &found_key, 1, - __copy_first_ref, path); + ret = iterate_inode_ref(root, p, &found_key, true, __copy_first_ref, path); if (ret < 0) goto out; ret = 0; @@ -4756,8 +4755,8 @@ static int record_new_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, record_new_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, + false, record_new_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4768,9 +4767,8 @@ static int record_deleted_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, record_deleted_ref_if_needed, - sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, + false, record_deleted_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4781,12 +4779,12 @@ static int record_changed_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, record_new_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, + false, record_new_ref_if_needed, sctx); if (ret < 0) return ret; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, + false, record_deleted_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4835,7 +4833,7 @@ static int process_all_refs(struct send_ctx *sctx, found_key.type != BTRFS_INODE_EXTREF_KEY)) break; - ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); + ret = iterate_inode_ref(root, path, &found_key, false, cb, sctx); if (ret < 0) goto out; } @@ -6578,7 +6576,7 @@ static int process_all_extents(struct send_ctx *sctx) return ret; } -static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, +static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end, int *pending_move, int *refs_processed) { @@ -6601,7 +6599,7 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, return ret; } -static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) +static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end) { int ret = 0; struct btrfs_inode_info info; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c5c0d9cf1a8088..25ee0183e17812 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -404,7 +404,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, */ static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int force) + bool force) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; @@ -2657,9 +2657,9 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, 0, 0); + ret = btrfs_drop_snapshot(root, false, false); else - ret = btrfs_drop_snapshot(root, 1, 0); + ret = btrfs_drop_snapshot(root, true, false); btrfs_put_root(root); return (ret < 0) ? 
0 : 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cbdc0c0b5e7c55..c3cfffc2100854 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -379,7 +379,7 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } - if (btrfs_buffer_uptodate(eb, gen, 0) && + if (btrfs_buffer_uptodate(eb, gen, false) && btrfs_header_level(eb) == 0) { ret = btrfs_exclude_logged_extents(eb); if (ret) @@ -4398,7 +4398,7 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans, static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, - struct inode *inode, int log_inode_only, + struct inode *inode, bool log_inode_only, u64 logged_isize) { u64 flags; @@ -4494,7 +4494,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, - 0, 0); + false, 0); btrfs_release_path(path); return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c6e3efd6f60218..cc75b4e49662a8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4269,7 +4269,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) * @flags: profile to validate * @extended: if true @flags is treated as an extended profile */ -static int alloc_profile_is_valid(u64 flags, int extended) +static int alloc_profile_is_valid(u64 flags, bool extended) { u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : BTRFS_BLOCK_GROUP_PROFILE_MASK); From e8513c012de75fd65e2df5499572bc6ef3f6e409 Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 12 Aug 2025 16:04:39 -0700 Subject: [PATCH 2451/3206] btrfs: implement ref_tracker for delayed_nodes Add ref_tracker infrastructure for struct btrfs_delayed_node. It is a response to the largest btrfs related crash in our fleet. We're seeing soft lockups in btrfs_kill_all_delayed_nodes() that seem to be a result of delayed_nodes not being released properly. A ref_tracker object is allocated on reference count increases and freed on reference count decreases. The ref_tracker object stores a stack trace of where it is allocated. The ref_tracker_dir object is embedded in btrfs_delayed_node and keeps track of all current and some old/freed ref_tracker objects. When a leak is detected we can print the stack traces for all ref_trackers that have not yet been freed. Here is a common example of taking a reference to a delayed_node and freeing it with ref_tracker. struct btrfs_ref_tracker tracker; struct btrfs_delayed_node *node; node = btrfs_get_delayed_node(inode, &tracker); // use delayed_node... btrfs_release_delayed_node(node, &tracker); There are two special cases where the delayed_node reference is "long lived", meaning that the thread that takes the reference and the thread that releases the reference are different. The 'inode_cache_tracker' tracks the delayed_node stored in btrfs_inode. The 'node_list_tracker' tracks the delayed_node stored in the btrfs_delayed_root node_list/prepare_list. These trackers are embedded in the btrfs_delayed_node. btrfs_ref_tracker and btrfs_ref_tracker_dir are wrappers that either compile to the corresponding ref_tracker structs or empty structs depending on CONFIG_BTRFS_DEBUG. There are also btrfs wrappers for the ref_tracker API. 
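The wrappers named above are thin shims over <linux/ref_tracker.h>. A rough sketch of the debug/non-debug pair (the wrapper names match the diff below, but the ref_dir member name and the exact bodies are assumptions here; the real definitions live in fs/btrfs/delayed-inode.h):

#include <linux/ref_tracker.h>

#ifdef CONFIG_BTRFS_DEBUG
struct btrfs_ref_tracker {
	struct ref_tracker *tracker;
};

static inline void btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
							 struct btrfs_ref_tracker *t,
							 gfp_t gfp)
{
	/* Records a stack trace for this reference. */
	ref_tracker_alloc(&node->ref_dir, &t->tracker, gfp);
}

static inline void btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
						       struct btrfs_ref_tracker *t)
{
	/* Pairs with the alloc above; warns if the reference was never taken. */
	ref_tracker_free(&node->ref_dir, &t->tracker);
}
#else
/* Empty struct and no-op helpers: the tracking compiles away entirely. */
struct btrfs_ref_tracker { };

static inline void btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
							 struct btrfs_ref_tracker *t,
							 gfp_t gfp) { }
static inline void btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
						       struct btrfs_ref_tracker *t) { }
#endif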
Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 1 + fs/btrfs/delayed-inode.c | 175 +++++++++++++++++++++++++++------------ fs/btrfs/delayed-inode.h | 70 ++++++++++++++++ 3 files changed, 191 insertions(+), 55 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 86d791210c393f..4438637c8900cd 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -62,6 +62,7 @@ config BTRFS_FS_RUN_SANITY_TESTS config BTRFS_DEBUG bool "Btrfs debugging support" depends on BTRFS_FS + select REF_TRACKER if STACKTRACE_SUPPORT help Enable run-time debugging support for the btrfs filesystem. diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c0c1ddd46b67a2..364814642a91d6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -57,6 +57,7 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; refcount_set(&delayed_node->refs, 0); + btrfs_delayed_node_ref_tracker_dir_init(delayed_node); delayed_node->ins_root = RB_ROOT_CACHED; delayed_node->del_root = RB_ROOT_CACHED; mutex_init(&delayed_node->mutex); @@ -65,7 +66,8 @@ static inline void btrfs_init_delayed_node( } static struct btrfs_delayed_node *btrfs_get_delayed_node( - struct btrfs_inode *btrfs_inode) + struct btrfs_inode *btrfs_inode, + struct btrfs_ref_tracker *tracker) { struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(btrfs_inode); @@ -74,6 +76,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = READ_ONCE(btrfs_inode->delayed_node); if (node) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS); return node; } @@ -83,6 +86,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); BUG_ON(btrfs_inode->delayed_node != node); xa_unlock(&root->delayed_nodes); return node; @@ -106,6 +110,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( */ if (refcount_inc_not_zero(&node->refs)) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, + GFP_ATOMIC); btrfs_inode->delayed_node = node; } else { node = NULL; @@ -126,7 +133,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * Return the delayed node, or error pointer on failure. */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( - struct btrfs_inode *btrfs_inode) + struct btrfs_inode *btrfs_inode, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; struct btrfs_root *root = btrfs_inode->root; @@ -135,7 +143,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( void *ptr; again: - node = btrfs_get_delayed_node(btrfs_inode); + node = btrfs_get_delayed_node(btrfs_inode, tracker); if (node) return node; @@ -144,12 +152,10 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); - /* Cached in the inode and can be accessed. */ - refcount_set(&node->refs, 2); - /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). 
*/ ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS); if (ret == -ENOMEM) { + btrfs_delayed_node_ref_tracker_dir_exit(node); kmem_cache_free(delayed_node_cache, node); return ERR_PTR(-ENOMEM); } @@ -158,6 +164,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( if (ptr) { /* Somebody inserted it, go back and read it. */ xa_unlock(&root->delayed_nodes); + btrfs_delayed_node_ref_tracker_dir_exit(node); kmem_cache_free(delayed_node_cache, node); node = NULL; goto again; @@ -166,6 +173,12 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( ASSERT(xa_err(ptr) != -EINVAL); ASSERT(xa_err(ptr) != -ENOMEM); ASSERT(ptr == NULL); + + /* Cached in the inode and can be accessed. */ + refcount_set(&node->refs, 2); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_ATOMIC); + btrfs_inode->delayed_node = node; xa_unlock(&root->delayed_nodes); @@ -191,6 +204,8 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, list_add_tail(&node->n_list, &root->node_list); list_add_tail(&node->p_list, &root->prepare_list); refcount_inc(&node->refs); /* inserted into list */ + btrfs_delayed_node_ref_tracker_alloc(node, &node->node_list_tracker, + GFP_ATOMIC); root->nodes++; set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } @@ -204,6 +219,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_lock(&root->lock); if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; + btrfs_delayed_node_ref_tracker_free(node, &node->node_list_tracker); refcount_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) @@ -214,22 +230,26 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, } static struct btrfs_delayed_node *btrfs_first_delayed_node( - struct btrfs_delayed_root *delayed_root) + struct btrfs_delayed_root *delayed_root, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); node = list_first_entry_or_null(&delayed_root->node_list, struct btrfs_delayed_node, n_list); - if (node) + if (node) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + } spin_unlock(&delayed_root->lock); return node; } static struct btrfs_delayed_node *btrfs_next_delayed_node( - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_root *delayed_root; struct list_head *p; @@ -249,6 +269,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( next = list_entry(p, struct btrfs_delayed_node, n_list); refcount_inc(&next->refs); + btrfs_delayed_node_ref_tracker_alloc(next, tracker, GFP_ATOMIC); out: spin_unlock(&delayed_root->lock); @@ -257,7 +278,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( static void __btrfs_release_delayed_node( struct btrfs_delayed_node *delayed_node, - int mod) + int mod, struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_root *delayed_root; @@ -273,6 +294,7 @@ static void __btrfs_release_delayed_node( btrfs_dequeue_delayed_node(delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); + btrfs_delayed_node_ref_tracker_free(delayed_node, tracker); if (refcount_dec_and_test(&delayed_node->refs)) { struct btrfs_root *root = delayed_node->root; @@ -282,17 +304,20 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. 
*/ ASSERT(refcount_read(&delayed_node->refs) == 0); + btrfs_delayed_node_ref_tracker_dir_exit(delayed_node); kmem_cache_free(delayed_node_cache, delayed_node); } } -static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) +static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { - __btrfs_release_delayed_node(node, 0); + __btrfs_release_delayed_node(node, 0, tracker); } static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( - struct btrfs_delayed_root *delayed_root) + struct btrfs_delayed_root *delayed_root, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; @@ -302,6 +327,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( if (node) { list_del_init(&node->p_list); refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); } spin_unlock(&delayed_root->lock); @@ -309,9 +335,10 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( } static inline void btrfs_release_prepared_delayed_node( - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { - __btrfs_release_delayed_node(node, 1); + __btrfs_release_delayed_node(node, 1, tracker); } static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, @@ -1126,6 +1153,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret = 0; @@ -1143,7 +1171,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) delayed_root = fs_info->delayed_root; - curr_node = btrfs_first_delayed_node(delayed_root); + curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker); while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); @@ -1153,7 +1181,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) } prev_node = curr_node; - curr_node = btrfs_next_delayed_node(curr_node); + prev_delayed_node_tracker = curr_delayed_node_tracker; + curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker); /* * See the comment below about releasing path before releasing * node. If the commit of delayed items was successful the path @@ -1161,7 +1190,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) * point to locked extent buffers (a leaf at the very least). 
*/ ASSERT(path->nodes[0] == NULL); - btrfs_release_delayed_node(prev_node); + btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker); } /* @@ -1174,7 +1203,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) btrfs_free_path(path); if (curr_node) - btrfs_release_delayed_node(curr_node); + btrfs_release_delayed_node(curr_node, &curr_delayed_node_tracker); trans->block_rsv = block_rsv; return ret; @@ -1193,7 +1222,9 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr) int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node = + btrfs_get_delayed_node(inode, &delayed_node_tracker); BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_rsv *block_rsv; int ret; @@ -1204,14 +1235,14 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); if (!delayed_node->count) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } mutex_unlock(&delayed_node->mutex); path = btrfs_alloc_path(); if (!path) { - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -ENOMEM; } @@ -1220,7 +1251,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); trans->block_rsv = block_rsv; return ret; @@ -1230,18 +1261,20 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_trans_handle *trans; - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret; + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return 0; mutex_lock(&delayed_node->mutex); if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } mutex_unlock(&delayed_node->mutex); @@ -1275,7 +1308,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1289,7 +1322,8 @@ void btrfs_remove_delayed_node(struct btrfs_inode *inode) return; inode->delayed_node = NULL; - btrfs_release_delayed_node(delayed_node); + + btrfs_release_delayed_node(delayed_node, &delayed_node->inode_cache_tracker); } struct btrfs_async_delayed_work { @@ -1305,6 +1339,7 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; int total_done = 0; @@ -1321,7 +1356,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) 
BTRFS_DELAYED_BACKGROUND / 2) break; - delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + delayed_node = btrfs_first_prepared_delayed_node(delayed_root, + &delayed_node_tracker); if (!delayed_node) break; @@ -1330,7 +1366,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_release_path(path); - btrfs_release_prepared_delayed_node(delayed_node); + btrfs_release_prepared_delayed_node(delayed_node, + &delayed_node_tracker); total_done++; continue; } @@ -1345,7 +1382,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) btrfs_btree_balance_dirty_nodelay(root->fs_info); btrfs_release_path(path); - btrfs_release_prepared_delayed_node(delayed_node); + btrfs_release_prepared_delayed_node(delayed_node, + &delayed_node_tracker); total_done++; } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) @@ -1377,10 +1415,15 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *node; - if (WARN_ON(node)) + node = btrfs_first_delayed_node( fs_info->delayed_root, &delayed_node_tracker); + if (WARN_ON(node)) { + btrfs_delayed_node_ref_tracker_free(node, + &delayed_node_tracker); refcount_dec(&node->refs); + } } static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) @@ -1454,13 +1497,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_item *delayed_item; struct btrfs_dir_item *dir_item; bool reserve_leaf_space; u32 data_len; int ret; - delayed_node = btrfs_get_or_create_delayed_node(dir); + delayed_node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1536,7 +1580,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_unlock(&delayed_node->mutex); release_node: - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1591,10 +1635,11 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, u64 index) { struct btrfs_delayed_node *node; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_item *item; int ret; - node = btrfs_get_or_create_delayed_node(dir); + node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker); if (IS_ERR(node)) return PTR_ERR(node); @@ -1635,14 +1680,16 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } mutex_unlock(&node->mutex); end: - btrfs_release_delayed_node(node); + btrfs_release_delayed_node(node, &delayed_node_tracker); return ret; } int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) { - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node; + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return -ENOENT; @@ -1652,12 +1699,12 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) * is updated 
now. So we needn't lock the delayed node. */ if (!delayed_node->index_cnt) { - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -EINVAL; } inode->index_cnt = delayed_node->index_cnt; - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -1668,8 +1715,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, { struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; + struct btrfs_ref_tracker delayed_node_tracker; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return false; @@ -1704,6 +1752,7 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, * insert/delete delayed items in this period. So we also needn't * requeue or dequeue this delayed node. */ + btrfs_delayed_node_ref_tracker_free(delayed_node, &delayed_node_tracker); refcount_dec(&delayed_node->refs); return true; @@ -1844,17 +1893,18 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_inode_item *inode_item; struct inode *vfs_inode = &inode->vfs_inode; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return -ENOENT; mutex_lock(&delayed_node->mutex); if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -ENOENT; } @@ -1892,7 +1942,7 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) inode->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -1901,9 +1951,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; int ret = 0; - delayed_node = btrfs_get_or_create_delayed_node(inode); + delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1923,7 +1974,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, atomic_inc(&root->fs_info->delayed_root->items); release_node: mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1931,6 +1982,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; /* * we don't do delayed inode updates during log recovery because it @@ -1940,7 +1992,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return -EAGAIN; - delayed_node = btrfs_get_or_create_delayed_node(inode); + delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1967,7 +2019,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) 
atomic_inc(&fs_info->delayed_root->items); release_node: mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -2011,19 +2063,21 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return; __btrfs_kill_delayed_node(delayed_node); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); } void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { unsigned long index = 0; struct btrfs_delayed_node *delayed_nodes[8]; + struct btrfs_ref_tracker delayed_node_trackers[8]; while (1) { struct btrfs_delayed_node *node; @@ -2042,6 +2096,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) * about to be removed from the tree in the loop below */ if (refcount_inc_not_zero(&node->refs)) { + btrfs_delayed_node_ref_tracker_alloc(node, + &delayed_node_trackers[count], + GFP_ATOMIC); delayed_nodes[count] = node; count++; } @@ -2053,7 +2110,8 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) for (int i = 0; i < count; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); - btrfs_release_delayed_node(delayed_nodes[i]); + btrfs_release_delayed_node(delayed_nodes[i], + &delayed_node_trackers[i]); } } } @@ -2061,14 +2119,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) { struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; - curr_node = btrfs_first_delayed_node(fs_info->delayed_root); + curr_node = btrfs_first_delayed_node(fs_info->delayed_root, + &curr_delayed_node_tracker); while (curr_node) { __btrfs_kill_delayed_node(curr_node); prev_node = curr_node; - curr_node = btrfs_next_delayed_node(curr_node); - btrfs_release_delayed_node(prev_node); + prev_delayed_node_tracker = curr_delayed_node_tracker; + curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker); + btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker); } } @@ -2078,8 +2139,9 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, { struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; + struct btrfs_ref_tracker delayed_node_tracker; - node = btrfs_get_delayed_node(inode); + node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!node) return; @@ -2137,6 +2199,7 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, * delete delayed items. */ ASSERT(refcount_read(&node->refs) > 1); + btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); refcount_dec(&node->refs); } @@ -2147,8 +2210,9 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode, struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; struct btrfs_delayed_item *next; + struct btrfs_ref_tracker delayed_node_tracker; - node = btrfs_get_delayed_node(inode); + node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!node) return; @@ -2180,5 +2244,6 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode, * delete delayed items. 
*/ ASSERT(refcount_read(&node->refs) > 1); + btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); refcount_dec(&node->refs); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index e6e763ad2d421f..93149c32d3fa41 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -16,6 +16,7 @@ #include #include #include +#include #include "ctree.h" struct btrfs_disk_key; @@ -44,6 +45,22 @@ struct btrfs_delayed_root { wait_queue_head_t wait; }; +struct btrfs_ref_tracker_dir { +#ifdef CONFIG_BTRFS_DEBUG + struct ref_tracker_dir dir; +#else + struct {} tracker; +#endif +}; + +struct btrfs_ref_tracker { +#ifdef CONFIG_BTRFS_DEBUG + struct ref_tracker *tracker; +#else + struct {} tracker; +#endif +}; + #define BTRFS_DELAYED_NODE_IN_LIST 0 #define BTRFS_DELAYED_NODE_INODE_DIRTY 1 #define BTRFS_DELAYED_NODE_DEL_IREF 2 @@ -78,6 +95,12 @@ struct btrfs_delayed_node { * actual number of leaves we end up using. Protected by @mutex. */ u32 index_item_leaves; + /* Track all references to this delayed node. */ + struct btrfs_ref_tracker_dir ref_dir; + /* Track delayed node reference stored in node list. */ + struct btrfs_ref_tracker node_list_tracker; + /* Track delayed node reference stored in inode cache. */ + struct btrfs_ref_tracker inode_cache_tracker; }; struct btrfs_delayed_item { @@ -169,4 +192,51 @@ void __cold btrfs_delayed_inode_exit(void); /* for debugging */ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info); +#define BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT 16 +#define BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT 16 + +#ifdef CONFIG_BTRFS_DEBUG +static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) +{ + ref_tracker_dir_init(&node->ref_dir.dir, + BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT, + "delayed_node"); +} + +static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) +{ + ref_tracker_dir_exit(&node->ref_dir.dir); +} + +static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker, + gfp_t gfp) +{ + return ref_tracker_alloc(&node->ref_dir.dir, &tracker->tracker, gfp); +} + +static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) +{ + return ref_tracker_free(&node->ref_dir.dir, &tracker->tracker); +} +#else +static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) { } + +static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) { } + +static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker, + gfp_t gfp) +{ + return 0; +} + +static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) +{ + return 0; +} +#endif + #endif From b767a28d6154986929a2231d48b637b18b2aabb3 Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 12 Aug 2025 16:04:40 -0700 Subject: [PATCH 2452/3206] btrfs: print leaked references in kill_all_delayed_nodes() We are seeing soft lockups in kill_all_delayed_nodes due to a delayed_node with a lingering reference count of 1. Printing at this point will reveal the guilty stack trace. If the delayed_node has no references there should be no output. 
Signed-off-by: Leo Martins
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/delayed-inode.c | 1 +
 fs/btrfs/delayed-inode.h | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 364814642a91d6..6adfe62cd0c4dd 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -2112,6 +2112,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 			__btrfs_kill_delayed_node(delayed_nodes[i]);
 			btrfs_release_delayed_node(delayed_nodes[i],
 						   &delayed_node_trackers[i]);
+			btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
 		}
 	}
 }
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 93149c32d3fa41..7f2db9905aeaee 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -208,6 +208,12 @@ static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_
 	ref_tracker_dir_exit(&node->ref_dir.dir);
 }
 
+static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node)
+{
+	ref_tracker_dir_print(&node->ref_dir.dir,
+			      BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT);
+}
+
 static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
 						       struct btrfs_ref_tracker *tracker,
 						       gfp_t gfp)
@@ -225,6 +231,8 @@ static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_
 
 static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) { }
 
+static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) { }
+
 static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
 						       struct btrfs_ref_tracker *tracker,
 						       gfp_t gfp)

From 46d33a0cc484ceb11aa47166e66d349ab3074eef Mon Sep 17 00:00:00 2001
From: Leo Martins
Date: Tue, 12 Aug 2025 16:04:41 -0700
Subject: [PATCH 2453/3206] btrfs: add mount option for ref_tracker

The ref_tracker infrastructure aids debugging but is not enabled by
default as it has a performance impact. Add the mount option
'ref_tracker' so it can be selectively enabled on a filesystem.
Currently it tracks references of 'delayed inodes'.
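For illustration, the option could be enabled like this (a sketch; the
device and mount point are examples only, and the option only exists on
kernels built with CONFIG_BTRFS_DEBUG):

    # enable delayed inode reference tracking for one filesystem
    mount -o ref_tracker /dev/vdb /mnt
    # when active, the option is visible in the mount options
    grep ref_tracker /proc/mounts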
Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.h | 15 +++++++++++++++ fs/btrfs/fs.h | 1 + fs/btrfs/super.c | 7 +++++++ 3 files changed, 23 insertions(+) diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 7f2db9905aeaee..0d949edc0caf16 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -198,6 +198,9 @@ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info); #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) { + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + ref_tracker_dir_init(&node->ref_dir.dir, BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT, "delayed_node"); @@ -205,11 +208,17 @@ static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_ static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) { + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + ref_tracker_dir_exit(&node->ref_dir.dir); } static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) { + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + ref_tracker_dir_print(&node->ref_dir.dir, BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT); } @@ -218,12 +227,18 @@ static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node struct btrfs_ref_tracker *tracker, gfp_t gfp) { + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return 0; + return ref_tracker_alloc(&node->ref_dir.dir, &tracker->tracker, gfp); } static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node, struct btrfs_ref_tracker *tracker) { + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return 0; + return ref_tracker_free(&node->ref_dir.dir, &tracker->tracker); } #else diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 66e08e20ff5acc..a84128e991a9d4 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -245,6 +245,7 @@ enum { BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30), BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31), BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32), + BTRFS_MOUNT_REF_TRACKER = (1ULL << 33), }; /* diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3bc19989e7fbf3..4951f50e9823e7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -134,6 +134,7 @@ enum { #ifdef CONFIG_BTRFS_DEBUG Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, Opt_ref_verify, + Opt_ref_tracker, #endif Opt_err, }; @@ -255,6 +256,7 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { fsparam_flag_no("enospc_debug", Opt_enospc_debug), #ifdef CONFIG_BTRFS_DEBUG fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), + fsparam_flag("ref_tracker", Opt_ref_tracker), fsparam_flag("ref_verify", Opt_ref_verify), #endif {} @@ -645,6 +647,9 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_ref_verify: btrfs_set_opt(ctx->mount_opt, REF_VERIFY); break; + case Opt_ref_tracker: + btrfs_set_opt(ctx->mount_opt, REF_TRACKER); + break; #endif default: btrfs_err(NULL, "unrecognized mount option '%s'", param->key); @@ -1150,6 +1155,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) #endif if (btrfs_test_opt(info, REF_VERIFY)) seq_puts(seq, ",ref_verify"); + if (btrfs_test_opt(info, REF_TRACKER)) + seq_puts(seq, ",ref_tracker"); seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); subvol_name = 
btrfs_get_subvol_name_from_objectid(info,
			btrfs_root_id(BTRFS_I(d_inode(dentry))->root));

From 3239c44df756bbc28182eb05aeeea84875c315fe Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 28 Jul 2025 17:57:54 +0930
Subject: [PATCH 2454/3206] btrfs: rework error handling of run_delalloc_nocow()

Currently the error handling of run_delalloc_nocow() modifies
@cur_offset to handle different parts of the delalloc range.

However the error handling can always be split into 3 parts:

1) The range with ordered extents allocated (OE cleanup)
   We have to clean up the ordered extents and unlock the folios.

2) The range that has been cleaned up (skip)
   For now it's only the range of fallback_to_cow().
   We should not touch it at all.

3) The range that is not yet touched (untouched)
   We have to unlock the folios and clear any reserved space.

This three-range split follows the same principle as cow_file_range();
however, the NOCOW/COW handling makes the split much more complex:

a) Failure immediately after a successful OE allocation
   Thus neither @cow_start nor @cow_end is set.

   start            cur_offset                    end
   |   OE cleanup   |          untouched          |

b) Failure after hitting a COW range but before calling
   fallback_to_cow()

   start        cow_start     cur_offset          end
   | OE cleanup |             untouched           |

c) Failure to call fallback_to_cow()

   start        cow_start     cow_end             end
   | OE cleanup |    skip     |     untouched     |

Instead of modifying @cur_offset, do a proper range calculation for the
OE-cleanup and untouched ranges using the above 3 cases, with proper
range charts.

This avoids updating @cur_offset, which will become an extra debug
output later, and makes the behavior easier to explain.

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 120 ++++++++++++++++++++++++-----------------------
 1 file changed, 61 insertions(+), 59 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0c9c51b3952c99..4583d6331bfd9d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2047,6 +2047,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
 	bool check_prev = true;
 	u64 ino = btrfs_ino(inode);
 	struct can_nocow_file_extent_args nocow_args = { 0 };
+	/* The range that has ordered extent(s). */
+	u64 oe_cleanup_start;
+	u64 oe_cleanup_len = 0;
+	/* The range that is untouched. */
+	u64 untouched_start;
+	u64 untouched_len = 0;
 
 	/*
 	 * Normally on a zoned device we're only doing COW writes, but in case
@@ -2232,76 +2238,72 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
 	return 0;
 
 error:
-	/*
-	 * There are several error cases:
-	 *
-	 * 1) Failed without falling back to COW
-	 *    start         cur_offset            end
-	 *    |/////////////|                     |
-	 *
-	 *    In this case, cow_start should be (u64)-1.
-	 *
-	 *    For range [start, cur_offset) the folios are already unlocked (except
-	 *    @locked_folio), EXTENT_DELALLOC already removed.
-	 *    Need to clear the dirty flags and finish the ordered extents.
-	 *
-	 * 2) Failed with error before calling fallback_to_cow()
-	 *
-	 *    start         cow_start             end
-	 *    |/////////////|                     |
-	 *
-	 *    In this case, only @cow_start is set, @cur_offset is between
-	 *    [cow_start, end)
-	 *
-	 *    It's mostly the same as case 1), just replace @cur_offset with
-	 *    @cow_start.
-	 *
-	 * 3) Failed with error from fallback_to_cow()
-	 *
-	 *    start         cow_start   cow_end   end
-	 *    |/////////////|-----------|         |
-	 *
-	 *    In this case, both @cow_start and @cow_end is set.
-	 *
-	 *    For range [start, cow_start) it's the same as case 1).
-	 *    But for range [cow_start, cow_end), all the cleanup is handled by
-	 *    cow_file_range(), we should not touch anything in that range.
- * - * So for all above cases, if @cow_start is set, cleanup ordered extents - * for range [start, @cow_start), other wise cleanup range [start, @cur_offset). - */ - if (cow_start != (u64)-1) - cur_offset = cow_start; - - if (cur_offset > start) { - btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); - cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + if (cow_start == (u64)-1) { + /* + * case a) + * start cur_offset end + * | OE cleanup | Untouched | + * + * We finished a fallback_to_cow() or nocow_one_range() call, but + * failed to check the next range. + */ + oe_cleanup_start = start; + oe_cleanup_len = cur_offset - start; + untouched_start = cur_offset; + untouched_len = end + 1 - untouched_start; + } else if (cow_start != (u64)-1 && cow_end == 0) { + /* + * case b) + * start cow_start cur_offset end + * | OE cleanup | Untouched | + * + * We got a range that needs COW, but before we hit the next NOCOW range, + * thus [cow_start, cur_offset) doesn't yet have any OE. + */ + oe_cleanup_start = start; + oe_cleanup_len = cow_start - start; + untouched_start = cow_start; + untouched_len = end + 1 - untouched_start; + } else { + /* + * case c) + * start cow_start cow_end end + * | OE cleanup | Skip | Untouched | + * + * fallback_to_cow() failed, and fallback_to_cow() will do the + * cleanup for its range, we shouldn't touch the range + * [cow_start, cow_end]. + */ + ASSERT(cow_start != (u64)-1 && cow_end != 0); + oe_cleanup_start = start; + oe_cleanup_len = cow_start - start; + untouched_start = cow_end + 1; + untouched_len = end + 1 - untouched_start; } - /* - * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to @cow_end + 1 to skip the COW range, as - * cow_file_range() will do the proper cleanup at error. - */ - if (cow_end) - cur_offset = cow_end + 1; + if (oe_cleanup_len) { + btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len); + cleanup_dirty_folios(inode, locked_folio, oe_cleanup_start, + oe_cleanup_start + oe_cleanup_len - 1, ret); + } - /* - * We need to lock the extent here because we're clearing DELALLOC and - * we're not locked at this point. - */ - if (cur_offset < end) { + if (untouched_len) { struct extent_state *cached = NULL; + const u64 untouched_end = untouched_start + untouched_len - 1; - btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached); - extent_clear_unlock_delalloc(inode, cur_offset, end, + /* + * We need to lock the extent here because we're clearing DELALLOC and + * we're not locked at this point. + */ + btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached); + extent_clear_unlock_delalloc(inode, untouched_start, untouched_end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); + btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL); } btrfs_free_path(path); btrfs_err_rl(fs_info, From 2dc019ca39347f76891d34a992b67258078aa45d Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Mon, 15 Sep 2025 15:59:41 +0530 Subject: [PATCH 2455/3206] powerpc/time: Expose boot_tb via accessor - Define accessor function get_boot_tb() to safely return boot_tb value, this is only needed when running in SPLPAR environments, so the accessor is built conditionally under CONFIG_PPC_SPLPAR. 
- Tag boot_tb as __ro_after_init since it is written once at
  initialization and never updated afterwards.

Signed-off-by: Aboorva Devarajan
Tested-by: Tejas Manhas
Tested-by: Venkat Rao Bagalkote
Signed-off-by: Madhavan Srinivasan
Link: https://patch.msgid.link/20250915102947.26681-2-atrajeev@linux.ibm.com
---
 arch/powerpc/include/asm/time.h | 4 ++++
 arch/powerpc/kernel/time.c      | 8 +++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index f8885586efafb3..7991ab1d4cb893 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -29,6 +29,10 @@ extern u64 decrementer_max;
 
 extern void generic_calibrate_decr(void);
 
+#ifdef CONFIG_PPC_SPLPAR
+extern u64 get_boot_tb(void);
+#endif
+
 /* Some sane defaults: 125 MHz timebase, 1GHz processor */
 extern unsigned long ppc_proc_freq;
 #define DEFAULT_PROC_FREQ	(DEFAULT_TB_FREQ * 8)
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 8224381c1dba34..4bbeb8644d3da4 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -137,7 +137,7 @@ EXPORT_SYMBOL_GPL(rtc_lock);
 
 static u64 tb_to_ns_scale __read_mostly;
 static unsigned tb_to_ns_shift __read_mostly;
-static u64 boot_tb __read_mostly;
+static u64 boot_tb __ro_after_init;
 
 extern struct timezone sys_tz;
 static long timezone_offset;
@@ -639,6 +639,12 @@ notrace unsigned long long sched_clock(void)
 	return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
 }
 
+#ifdef CONFIG_PPC_SPLPAR
+u64 get_boot_tb(void)
+{
+	return boot_tb;
+}
+#endif
 
 #ifdef CONFIG_PPC_PSERIES

From 4708fba19adee9ba14ef28af6face4ab043d9cd6 Mon Sep 17 00:00:00 2001
From: Kajol Jain
Date: Mon, 15 Sep 2025 15:59:42 +0530
Subject: [PATCH 2456/3206] powerpc/vpa_dtl: Add interface to expose vpa dtl
 counters via perf

The pseries Shared Processor Logical Partition (SPLPAR) machines can
retrieve a log of dispatch and preempt events from the hypervisor
using data from the Dispatch Trace Log (DTL) buffer. With this
information, the user can retrieve when and why each dispatch &
preempt has occurred.

Add an interface to expose the Virtual Processor Area (VPA) DTL
counters via perf. The following events are available and exposed in
sysfs:

  vpa_dtl/dtl_cede/    - Trace voluntary (OS initiated) virtual
                         processor waits
  vpa_dtl/dtl_preempt/ - Trace time slice preempts
  vpa_dtl/dtl_fault/   - Trace virtual partition memory page faults
  vpa_dtl/dtl_all/     - Trace all (dtl_cede/dtl_preempt/dtl_fault)

The interface defines the supported event list and the config fields
for the event attributes with their corresponding bit values, which
are exported via sysfs. Users can use the standard perf tool to access
the perf events exposed via the vpa-dtl PMU.

The VPA DTL PMU counters do not interrupt on overflow or generate any
PMI interrupts. Therefore, the kernel needs to poll the counters;
hrtimer code is added to do that. The timer interval can be provided
by the user via the sample_period field, in nanoseconds. One hrtimer
is added per vpa-dtl PMU thread.

To ensure there are no other conflicting dtl users (for example
debugfs dtl or /proc/powerpc/vcpudispatch_stats), the interface uses
"down_write_trylock" to take the dtl_access_lock, which is defined in
dtl.h. A global reference count variable "dtl_global_refc" is also
added to ensure dtl data can be captured per-cpu, together with a
global lock "dtl_global_lock" to avoid race conditions.
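For illustration, a possible invocation with the standard perf tool (a
sketch; the event names come from the sysfs list above, while the CPU
number and period are examples, with the sample_period interpreted in
nanoseconds as described):

    # poll the dispatch trace log on CPU 0 once per second for 10s
    perf record -e vpa_dtl/dtl_all/ -c 1000000000 -C 0 -- sleep 10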
Signed-off-by: Kajol Jain Tested-by: Tejas Manhas Tested-by: Venkat Rao Bagalkote Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250915102947.26681-3-atrajeev@linux.ibm.com --- arch/powerpc/perf/Makefile | 2 +- arch/powerpc/perf/vpa-dtl.c | 340 ++++++++++++++++++++++++++++++++++++ 2 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/perf/vpa-dtl.c diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index 7f53fcb7495a8a..78dd7e25219ee5 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o -obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o +obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o vpa-dtl.o obj-$(CONFIG_VPA_PMU) += vpa-pmu.o diff --git a/arch/powerpc/perf/vpa-dtl.c b/arch/powerpc/perf/vpa-dtl.c new file mode 100644 index 00000000000000..4a8ba5b3ae3f2e --- /dev/null +++ b/arch/powerpc/perf/vpa-dtl.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Perf interface to expose Dispatch Trace Log counters. + * + * Copyright (C) 2024 Kajol Jain, IBM Corporation + */ + +#ifdef CONFIG_PPC_SPLPAR +#define pr_fmt(fmt) "vpa_dtl: " fmt + +#include +#include +#include + +#define EVENT(_name, _code) enum{_name = _code} + +/* + * Based on Power Architecture Platform Reference(PAPR) documentation, + * Table 14.14. Per Virtual Processor Area, below Dispatch Trace Log(DTL) + * Enable Mask used to get corresponding virtual processor dispatch + * to preempt traces: + * DTL_CEDE(0x1): Trace voluntary (OS initiated) virtual + * processor waits + * DTL_PREEMPT(0x2): Trace time slice preempts + * DTL_FAULT(0x4): Trace virtual partition memory page + faults. + * DTL_ALL(0x7): Trace all (DTL_CEDE | DTL_PREEMPT | DTL_FAULT) + * + * Event codes based on Dispatch Trace Log Enable Mask. + */ +EVENT(DTL_CEDE, 0x1); +EVENT(DTL_PREEMPT, 0x2); +EVENT(DTL_FAULT, 0x4); +EVENT(DTL_ALL, 0x7); + +GENERIC_EVENT_ATTR(dtl_cede, DTL_CEDE); +GENERIC_EVENT_ATTR(dtl_preempt, DTL_PREEMPT); +GENERIC_EVENT_ATTR(dtl_fault, DTL_FAULT); +GENERIC_EVENT_ATTR(dtl_all, DTL_ALL); + +PMU_FORMAT_ATTR(event, "config:0-7"); + +static struct attribute *events_attr[] = { + GENERIC_EVENT_PTR(DTL_CEDE), + GENERIC_EVENT_PTR(DTL_PREEMPT), + GENERIC_EVENT_PTR(DTL_FAULT), + GENERIC_EVENT_PTR(DTL_ALL), + NULL +}; + +static struct attribute_group event_group = { + .name = "events", + .attrs = events_attr, +}; + +static struct attribute *format_attrs[] = { + &format_attr_event.attr, + NULL, +}; + +static const struct attribute_group format_group = { + .name = "format", + .attrs = format_attrs, +}; + +static const struct attribute_group *attr_groups[] = { + &format_group, + &event_group, + NULL, +}; + +struct vpa_dtl { + struct dtl_entry *buf; + u64 last_idx; +}; + +static DEFINE_PER_CPU(struct vpa_dtl, vpa_dtl_cpu); + +/* variable to capture reference count for the active dtl threads */ +static int dtl_global_refc; +static spinlock_t dtl_global_lock = __SPIN_LOCK_UNLOCKED(dtl_global_lock); + +/* + * Function to dump the dispatch trace log buffer data to the + * perf data. + */ +static void vpa_dtl_dump_sample_data(struct perf_event *event) +{ + return; +} + +/* + * The VPA Dispatch Trace log counters do not interrupt on overflow. + * Therefore, the kernel needs to poll the counters to avoid missing + * an overflow using hrtimer. 
The timer interval is based on sample_period + * count provided by user, and minimum interval is 1 millisecond. + */ +static enum hrtimer_restart vpa_dtl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return HRTIMER_NORESTART; + + vpa_dtl_dump_sample_data(event); + period = max_t(u64, NSEC_PER_MSEC, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return HRTIMER_RESTART; +} + +static void vpa_dtl_start_hrtimer(struct perf_event *event) +{ + u64 period; + struct hw_perf_event *hwc = &event->hw; + + period = max_t(u64, NSEC_PER_MSEC, hwc->sample_period); + hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED); +} + +static void vpa_dtl_stop_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + hrtimer_cancel(&hwc->hrtimer); +} + +static void vpa_dtl_reset_global_refc(struct perf_event *event) +{ + spin_lock(&dtl_global_lock); + dtl_global_refc--; + if (dtl_global_refc <= 0) { + dtl_global_refc = 0; + up_write(&dtl_access_lock); + } + spin_unlock(&dtl_global_lock); +} + +static int vpa_dtl_mem_alloc(int cpu) +{ + struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, cpu); + struct dtl_entry *buf = NULL; + + /* Check for dispatch trace log buffer cache */ + if (!dtl_cache) + return -ENOMEM; + + buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL | GFP_ATOMIC, cpu_to_node(cpu)); + if (!buf) { + pr_warn("buffer allocation failed for cpu %d\n", cpu); + return -ENOMEM; + } + dtl->buf = buf; + return 0; +} + +static int vpa_dtl_event_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* test the event attr type for PMU enumeration */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (!perfmon_capable()) + return -EACCES; + + /* Return if this is a counting event */ + if (!is_sampling_event(event)) + return -EOPNOTSUPP; + + /* no branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + /* Invalid eventcode */ + switch (event->attr.config) { + case DTL_LOG_CEDE: + case DTL_LOG_PREEMPT: + case DTL_LOG_FAULT: + case DTL_LOG_ALL: + break; + default: + return -EINVAL; + } + + spin_lock(&dtl_global_lock); + + /* + * To ensure there are no other conflicting dtl users + * (example: /proc/powerpc/vcpudispatch_stats or debugfs dtl), + * below code try to take the dtl_access_lock. + * The dtl_access_lock is a rwlock defined in dtl.h, which is used + * to unsure there is no conflicting dtl users. + * Based on below code, vpa_dtl pmu tries to take write access lock + * and also checks for dtl_global_refc, to make sure that the + * dtl_access_lock is taken by vpa_dtl pmu interface. + */ + if (dtl_global_refc == 0 && !down_write_trylock(&dtl_access_lock)) { + spin_unlock(&dtl_global_lock); + return -EBUSY; + } + + /* Allocate dtl buffer memory */ + if (vpa_dtl_mem_alloc(event->cpu)) { + spin_unlock(&dtl_global_lock); + return -ENOMEM; + } + + /* + * Increment the number of active vpa_dtl pmu threads. The + * dtl_global_refc is used to keep count of cpu threads that + * currently capturing dtl data using vpa_dtl pmu interface. + */ + dtl_global_refc++; + + spin_unlock(&dtl_global_lock); + + hrtimer_setup(&hwc->hrtimer, vpa_dtl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + + /* + * Since hrtimers have a fixed rate, we can do a static freq->period + * mapping and avoid the whole period adjust feedback stuff. 
+ */ + if (event->attr.freq) { + long freq = event->attr.sample_freq; + + event->attr.sample_period = NSEC_PER_SEC / freq; + hwc->sample_period = event->attr.sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + hwc->last_period = hwc->sample_period; + event->attr.freq = 0; + } + + event->destroy = vpa_dtl_reset_global_refc; + return 0; +} + +static int vpa_dtl_event_add(struct perf_event *event, int flags) +{ + int ret, hwcpu; + unsigned long addr; + struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu); + + /* + * Register our dtl buffer with the hypervisor. The + * HV expects the buffer size to be passed in the second + * word of the buffer. Refer section '14.11.3.2. H_REGISTER_VPA' + * from PAPR for more information. + */ + ((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES); + dtl->last_idx = 0; + + hwcpu = get_hard_smp_processor_id(event->cpu); + addr = __pa(dtl->buf); + + ret = register_dtl(hwcpu, addr); + if (ret) { + pr_warn("DTL registration for cpu %d (hw %d) failed with %d\n", + event->cpu, hwcpu, ret); + return ret; + } + + /* set our initial buffer indices */ + lppaca_of(event->cpu).dtl_idx = 0; + + /* + * Ensure that our updates to the lppaca fields have + * occurred before we actually enable the logging + */ + smp_wmb(); + + /* enable event logging */ + lppaca_of(event->cpu).dtl_enable_mask = event->attr.config; + + vpa_dtl_start_hrtimer(event); + + return 0; +} + +static void vpa_dtl_event_del(struct perf_event *event, int flags) +{ + int hwcpu = get_hard_smp_processor_id(event->cpu); + struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu); + + vpa_dtl_stop_hrtimer(event); + unregister_dtl(hwcpu); + kmem_cache_free(dtl_cache, dtl->buf); + dtl->buf = NULL; + lppaca_of(event->cpu).dtl_enable_mask = 0x0; +} + +/* + * This function definition is empty as vpa_dtl_dump_sample_data + * is used to parse and dump the dispatch trace log data, + * to perf data. + */ +static void vpa_dtl_event_read(struct perf_event *event) +{ +} + +static struct pmu vpa_dtl_pmu = { + .task_ctx_nr = perf_invalid_context, + + .name = "vpa_dtl", + .attr_groups = attr_groups, + .event_init = vpa_dtl_event_init, + .add = vpa_dtl_event_add, + .del = vpa_dtl_event_del, + .read = vpa_dtl_event_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_EXCLUSIVE, +}; + +static int vpa_dtl_init(void) +{ + int r; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) { + pr_debug("not a shared virtualized system, not enabling\n"); + return -ENODEV; + } + + /* This driver is intended only for L1 host. */ + if (is_kvm_guest()) { + pr_debug("Only supported for L1 host system\n"); + return -ENODEV; + } + + r = perf_pmu_register(&vpa_dtl_pmu, vpa_dtl_pmu.name, -1); + if (r) + return r; + + return 0; +} + +device_initcall(vpa_dtl_init); +#endif //CONFIG_PPC_SPLPAR From 6f2c65680c336a274b69b1fdcbfa2eeb8159bee8 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Mon, 15 Sep 2025 15:59:43 +0530 Subject: [PATCH 2457/3206] docs: ABI: sysfs-bus-event_source-devices-vpa-dtl: Document sysfs event format entries for vpa_dtl pmu Details are added for the vpa_dtl pmu event and format attributes in the ABI documentation. 
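As a quick illustration, the resulting sysfs layout can be inspected as
follows (a sketch based on the ABI entries added below; expected output
shown in the comments):

    # read the config field layout (expected: config:0-7)
    cat /sys/bus/event_source/devices/vpa_dtl/format/event
    # read an event encoding (expected: event=0x1)
    cat /sys/bus/event_source/devices/vpa_dtl/events/dtl_cede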
Signed-off-by: Kajol Jain Tested-by: Tejas Manhas Tested-by: Venkat Rao Bagalkote Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250915102947.26681-4-atrajeev@linux.ibm.com --- .../sysfs-bus-event_source-devices-vpa-dtl | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl b/Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl new file mode 100644 index 00000000000000..7b7c789a5cf59c --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl @@ -0,0 +1,25 @@ +What: /sys/bus/event_source/devices/vpa_dtl/format +Date: February 2025 +Contact: Linux on PowerPC Developer List +Description: Read-only. Attribute group to describe the magic bits + that go into perf_event_attr.config for a particular pmu. + (See ABI/testing/sysfs-bus-event_source-devices-format). + + Each attribute under this group defines a bit range of the + perf_event_attr.config. Supported attribute are listed + below:: + + event = "config:0-7" - event ID + + For example:: + + dtl_cede = "event=0x1" + +What: /sys/bus/event_source/devices/vpa_dtl/events +Date: February 2025 +Contact: Linux on PowerPC Developer List +Description: (RO) Attribute group to describe performance monitoring events + for the Virtual Processor Dispatch Trace Log. Each attribute in + this group describes a single performance monitoring event + supported by vpa_dtl pmu. The name of the file is the name of + the event (See ABI/testing/sysfs-bus-event_source-devices-events). From 5d75aed84d3b6d25c7c4bb4a212b14fae4d1020b Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Mon, 15 Sep 2025 15:59:44 +0530 Subject: [PATCH 2458/3206] powerpc/perf/vpa-dtl: Add support to setup and free aux buffer for capturing DTL data vpa dtl pmu has one hrtimer added per vpa-dtl pmu thread. When the hrtimer expires, in the timer handler, code is added to save the DTL data to perf event record. DTL (Dispatch Trace Log) contains information about dispatch/preempt, enqueue time etc. We directly copy the DTL buffer data as part of auxiliary buffer and it will be postprocessed later. To enable the support for aux buffer, add the PMU callbacks for setup_aux and free_aux. In setup_aux, set up pmu-private data structures for an AUX area. rb_alloc_aux uses "alloc_pages_node" and returns pointer to each page address. Map these pages to contiguous space using vmap and use that as base address. The aux private data structure ie, "struct vpa_pmu_buf" mainly saves: 1. buf->base: aux buffer base address 2. buf->head: offset from base address where data will be written to. 3. buf->size: Size of allocated memory free_aux will free pmu-private AUX data structures. 
Signed-off-by: Athira Rajeev Tested-by: Tejas Manhas Tested-by: Venkat Rao Bagalkote Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250915102947.26681-5-atrajeev@linux.ibm.com --- arch/powerpc/perf/vpa-dtl.c | 77 +++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/arch/powerpc/perf/vpa-dtl.c b/arch/powerpc/perf/vpa-dtl.c index 4a8ba5b3ae3f2e..28e7fc6e701509 100644 --- a/arch/powerpc/perf/vpa-dtl.c +++ b/arch/powerpc/perf/vpa-dtl.c @@ -11,6 +11,7 @@ #include #include #include +#include #define EVENT(_name, _code) enum{_name = _code} @@ -74,6 +75,19 @@ struct vpa_dtl { u64 last_idx; }; +struct vpa_pmu_ctx { + struct perf_output_handle handle; +}; + +struct vpa_pmu_buf { + int nr_pages; + bool snapshot; + u64 *base; + u64 size; + u64 head; +}; + +static DEFINE_PER_CPU(struct vpa_pmu_ctx, vpa_pmu_ctx); static DEFINE_PER_CPU(struct vpa_dtl, vpa_dtl_cpu); /* variable to capture reference count for the active dtl threads */ @@ -302,6 +316,67 @@ static void vpa_dtl_event_read(struct perf_event *event) { } +/* + * Set up pmu-private data structures for an AUX area + * **pages contains the aux buffer allocated for this event + * for the corresponding cpu. rb_alloc_aux uses "alloc_pages_node" + * and returns pointer to each page address. Map these pages to + * contiguous space using vmap and use that as base address. + * + * The aux private data structure ie, "struct vpa_pmu_buf" mainly + * saves + * - buf->base: aux buffer base address + * - buf->head: offset from base address where data will be written to. + * - buf->size: Size of allocated memory + */ +static void *vpa_dtl_setup_aux(struct perf_event *event, void **pages, + int nr_pages, bool snapshot) +{ + int i, cpu = event->cpu; + struct vpa_pmu_buf *buf __free(kfree) = NULL; + struct page **pglist __free(kfree) = NULL; + + /* We need at least one page for this to work. */ + if (!nr_pages) + return NULL; + + if (cpu == -1) + cpu = raw_smp_processor_id(); + + buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu)); + if (!buf) + return NULL; + + pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL); + if (!pglist) + return NULL; + + for (i = 0; i < nr_pages; ++i) + pglist[i] = virt_to_page(pages[i]); + + buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL); + if (!buf->base) + return NULL; + + buf->nr_pages = nr_pages; + buf->snapshot = false; + + buf->size = nr_pages << PAGE_SHIFT; + buf->head = 0; + return no_free_ptr(buf); +} + +/* + * free pmu-private AUX data structures + */ +static void vpa_dtl_free_aux(void *aux) +{ + struct vpa_pmu_buf *buf = aux; + + vunmap(buf->base); + kfree(buf); +} + static struct pmu vpa_dtl_pmu = { .task_ctx_nr = perf_invalid_context, @@ -311,6 +386,8 @@ static struct pmu vpa_dtl_pmu = { .add = vpa_dtl_event_add, .del = vpa_dtl_event_del, .read = vpa_dtl_event_read, + .setup_aux = vpa_dtl_setup_aux, + .free_aux = vpa_dtl_free_aux, .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_EXCLUSIVE, }; From 2de8b6dd5ae72eb6fb7c756a3f2c131171fe3b8b Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Mon, 15 Sep 2025 15:59:45 +0530 Subject: [PATCH 2459/3206] powerpc/perf/vpa-dtl: Add support to capture DTL data in aux buffer vpa dtl pmu has one hrtimer added per vpa-dtl pmu thread. When the hrtimer expires, in the timer handler, code is added to save the DTL data to perf event record via vpa_dtl_capture_aux() function. The DTL (Dispatch Trace Log) contains information about dispatch/preempt, enqueue time etc. 
We directly copy the DTL buffer data into the auxiliary buffer. Data
will be written to disk only when the allocated buffer is full. With
this approach, all the DTL data will be present as-is in perf.data.
The data will be post-processed on the perf tools side when doing perf
report/perf script, which avoids the time taken to create samples in
kernel space.

To correlate each DTL entry with other events across CPUs, we need to
map the timebase from "struct dtl_entry", which phyp provides, to the
boot timebase. This also needs the timebase frequency. Define
"struct boottb_freq" to save these details.

Add changes to capture the Dispatch Trace Log details to the AUX
buffer in vpa_dtl_dump_sample_data(). Boot timebase and frequency need
to be saved only once; a field is added to the "vpa_pmu_buf" structure
to indicate this.

perf_aux_output_begin: This function is called before writing to the
AUX area. It returns the pointer to the aux area private structure,
ie "struct vpa_pmu_buf" here. The function obtains the output handle
(used in perf_aux_output_end). When capture completes in
vpa_dtl_capture_aux(), call perf_aux_output_end() to commit the
recorded data.

perf_aux_output_end() is called to move the aux->head of
"struct perf_buffer" to indicate the size of the data in the aux
buffer. aux_tail will be moved on the perf tools side when writing the
data from the aux buffer to the perf.data file on disk. It is the
responsibility of the PMU driver to make sure data is copied between
perf_aux_output_begin and perf_aux_output_end.

Signed-off-by: Athira Rajeev
Tested-by: Tejas Manhas
Tested-by: Venkat Rao Bagalkote
Signed-off-by: Madhavan Srinivasan
Link: https://patch.msgid.link/20250915102947.26681-6-atrajeev@linux.ibm.com
---
 arch/powerpc/perf/vpa-dtl.c | 131 +++++++++++++++++++++++++++++++++++-
 1 file changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/vpa-dtl.c b/arch/powerpc/perf/vpa-dtl.c
index 28e7fc6e701509..ead96af37997f2 100644
--- a/arch/powerpc/perf/vpa-dtl.c
+++ b/arch/powerpc/perf/vpa-dtl.c
@@ -85,6 +85,24 @@ struct vpa_pmu_buf {
 	u64 *base;
 	u64 size;
 	u64 head;
+	/* boot timebase and frequency needs to be saved only at once */
+	int boottb_freq_saved;
+};
+
+/*
+ * To correlate each DTL entry with other events across CPU's,
+ * we need to map timebase from "struct dtl_entry" which phyp
+ * provides with boot timebase. This also needs timebase frequency.
+ * Formula is: ((timebase from DTL entry - boot time) / frequency)
+ *
+ * To match the size of "struct dtl_entry" and ease post processing,
+ * 24 bytes of padding are added to the structure.
+ */
+struct boottb_freq {
+	u64 boot_tb;
+	u64 tb_freq;
+	u64 timebase;
+	u64 padded[3];
 };
 
 static DEFINE_PER_CPU(struct vpa_pmu_ctx, vpa_pmu_ctx);
@@ -94,13 +112,123 @@ static DEFINE_PER_CPU(struct vpa_dtl, vpa_dtl_cpu);
 static int dtl_global_refc;
 static spinlock_t dtl_global_lock = __SPIN_LOCK_UNLOCKED(dtl_global_lock);
 
+/*
+ * Capture DTL data in AUX buffer
+ */
+static void vpa_dtl_capture_aux(long *n_entries, struct vpa_pmu_buf *buf,
+				struct vpa_dtl *dtl, int index)
+{
+	struct dtl_entry *aux_copy_buf = (struct dtl_entry *)buf->base;
+
+	/*
+	 * Copy to AUX buffer from per-thread address
+	 */
+	memcpy(aux_copy_buf + buf->head, &dtl->buf[index], *n_entries * sizeof(struct dtl_entry));
+
+	buf->head += *n_entries;
+
+	return;
+}
+
 /*
  * Function to dump the dispatch trace log buffer data to the
  * perf data.
+ *
+ * perf_aux_output_begin: This function is called before writing
This returns the pointer to aux area private structure, + * ie "struct vpa_pmu_buf" here which is set in setup_aux() function. + * The function obtains the output handle (used in perf_aux_output_end). + * when capture completes in vpa_dtl_capture_aux(), call perf_aux_output_end() + * to commit the recorded data. + * + * perf_aux_output_end: This function commits data by adjusting the + * aux_head of "struct perf_buffer". aux_tail will be moved in perf tools + * side when writing the data from aux buffer to perf.data file in disk. + * + * Here in the private aux structure, we maintain head to know where + * to copy data next time in the PMU driver. vpa_pmu_buf->head is moved to + * maintain the aux head for PMU driver. It is responsiblity of PMU + * driver to make sure data is copied between perf_aux_output_begin and + * perf_aux_output_end. + * + * After data is copied in vpa_dtl_capture_aux() function, perf_aux_output_end() + * is called to move the aux->head of "struct perf_buffer" to indicate size of + * data in aux buffer. This will post a PERF_RECORD_AUX into the perf buffer. + * Data will be written to disk only when the allocated buffer is full. + * + * By this approach, all the DTL data will be present as-is in the + * perf.data. The data will be pre-processed in perf tools side when doing + * perf report/perf script and this will avoid time taken to create samples + * in the kernel space. */ static void vpa_dtl_dump_sample_data(struct perf_event *event) { - return; + u64 cur_idx, last_idx, i; + u64 boot_tb; + struct boottb_freq boottb_freq; + + /* actual number of entries read */ + long n_read = 0, read_size = 0; + + /* number of entries added to dtl buffer */ + long n_req; + + struct vpa_pmu_ctx *vpa_ctx = this_cpu_ptr(&vpa_pmu_ctx); + + struct vpa_pmu_buf *aux_buf; + + struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu); + + cur_idx = be64_to_cpu(lppaca_of(event->cpu).dtl_idx); + last_idx = dtl->last_idx; + + if (last_idx + N_DISPATCH_LOG <= cur_idx) + last_idx = cur_idx - N_DISPATCH_LOG + 1; + + n_req = cur_idx - last_idx; + + /* no new entry added to the buffer, return */ + if (n_req <= 0) + return; + + dtl->last_idx = last_idx + n_req; + boot_tb = get_boot_tb(); + + i = last_idx % N_DISPATCH_LOG; + + aux_buf = perf_aux_output_begin(&vpa_ctx->handle, event); + if (!aux_buf) { + pr_debug("returning. no aux\n"); + return; + } + + if (!aux_buf->boottb_freq_saved) { + pr_debug("Copying boot tb to aux buffer: %lld\n", boot_tb); + /* Save boot_tb to convert raw timebase to it's relative system boot time */ + boottb_freq.boot_tb = boot_tb; + /* Save tb_ticks_per_sec to convert timebase to sec */ + boottb_freq.tb_freq = tb_ticks_per_sec; + boottb_freq.timebase = 0; + memcpy(aux_buf->base, &boottb_freq, sizeof(boottb_freq)); + aux_buf->head += 1; + aux_buf->boottb_freq_saved = 1; + n_read += 1; + } + + /* read the tail of the buffer if we've wrapped */ + if (i + n_req > N_DISPATCH_LOG) { + read_size = N_DISPATCH_LOG - i; + vpa_dtl_capture_aux(&read_size, aux_buf, dtl, i); + n_req -= read_size; + n_read += read_size; + i = 0; + } + + /* .. 
and now the head */
+	vpa_dtl_capture_aux(&n_req, aux_buf, dtl, i);
+
+	/* Move the aux->head to indicate size of data in aux buffer */
+	perf_aux_output_end(&vpa_ctx->handle, (n_req + n_read) * sizeof(struct dtl_entry));
 }
 
 /*
@@ -491,6 +491,7 @@ static void *vpa_dtl_setup_aux(struct perf_event *event, void **pages,
 
 	buf->size = nr_pages << PAGE_SHIFT;
 	buf->head = 0;
+	buf->boottb_freq_saved = 0;
 	return no_free_ptr(buf);
 }
 
From b5e71cafa02d4e673639a3bd4c03d84db5dd8b8a Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Mon, 15 Sep 2025 15:59:46 +0530
Subject: [PATCH 2460/3206] powerpc/perf/vpa-dtl: Handle the writing of perf
 record when aux wake up is needed

Handle the case when the aux buffer is going to be full and data needs
to be written to the data file. The perf_aux_output_begin() function
checks if there is enough space depending on the values of aux_wakeup
and aux_watermark, which are part of "struct perf_buffer".

In order to track where to write in the aux buffer, add two fields to
"struct vpa_pmu_buf": field "threshold" to indicate the total number
of DTL entries that the aux buffer can contain, and field "full" to
indicate whenever the buffer becomes full.

In perf_aux_output_end, there is a check to see if wake up is needed
based on the aux head value. In vpa_dtl_capture_aux(), check if there
is enough space to contain the DTL data. If not, save the data for the
available memory and set full to true.

Set the head of the private aux to zero when the buffer is full so
that the next data will be copied to the beginning of the buffer. The
address used for copying to aux is "aux_copy_buf + buf->head", so once
the buffer is full, set head to zero so that next time data is written
from the start of the buffer.

Signed-off-by: Athira Rajeev
Tested-by: Tejas Manhas
Tested-by: Venkat Rao Bagalkote
Signed-off-by: Madhavan Srinivasan
Link: https://patch.msgid.link/20250915102947.26681-7-atrajeev@linux.ibm.com
---
 arch/powerpc/perf/vpa-dtl.c | 54 +++++++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/vpa-dtl.c b/arch/powerpc/perf/vpa-dtl.c
index ead96af37997f2..3c1d1c28deb9a2 100644
--- a/arch/powerpc/perf/vpa-dtl.c
+++ b/arch/powerpc/perf/vpa-dtl.c
@@ -85,8 +85,11 @@ struct vpa_pmu_buf {
 	u64 *base;
 	u64 size;
 	u64 head;
+	u64 head_size;
 	/* boot timebase and frequency needs to be saved only at once */
 	int boottb_freq_saved;
+	u64 threshold;
+	bool full;
 };
 
 /*
@@ -120,11 +123,31 @@ static void vpa_dtl_capture_aux(long *n_entries, struct vpa_pmu_buf *buf,
 	struct dtl_entry *aux_copy_buf = (struct dtl_entry *)buf->base;
 
+	/*
+	 * check if there is enough space to contain the
+	 * DTL data. If not, save the data for available
+	 * memory and set full to true.
+	 */
+	if (buf->head + *n_entries >= buf->threshold) {
+		*n_entries = buf->threshold - buf->head;
+		buf->full = 1;
+	}
+
 	/*
 	 * Copy to AUX buffer from per-thread address
 	 */
 	memcpy(aux_copy_buf + buf->head, &dtl->buf[index],
 	       *n_entries * sizeof(struct dtl_entry));
 
+	if (buf->full) {
+		/*
+		 * Set head of private aux to zero when buffer is full
+		 * so that next data will be copied to beginning of the
+		 * buffer
+		 */
+		buf->head = 0;
+		return;
+	}
+
 	buf->head += *n_entries;
 
 	return;
@@ -178,6 +201,7 @@ static void vpa_dtl_dump_sample_data(struct perf_event *event)
 	struct vpa_pmu_buf *aux_buf;
 
 	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
+	u64 size;
 
 	cur_idx = be64_to_cpu(lppaca_of(event->cpu).dtl_idx);
 	last_idx = dtl->last_idx;
@@ -222,13 +246,37 @@ static void vpa_dtl_dump_sample_data(struct perf_event *event)
 		n_req -= read_size;
 		n_read += read_size;
 		i = 0;
+		if (aux_buf->full) {
+			size = (n_read * sizeof(struct dtl_entry));
+			if ((size + aux_buf->head_size) > aux_buf->size) {
+				size = aux_buf->size - aux_buf->head_size;
+				perf_aux_output_end(&vpa_ctx->handle, size);
+				aux_buf->head = 0;
+				aux_buf->head_size = 0;
+			} else {
+				aux_buf->head_size += (n_read * sizeof(struct dtl_entry));
+				perf_aux_output_end(&vpa_ctx->handle, n_read * sizeof(struct dtl_entry));
+			}
+			goto out;
+		}
 	}
 
 	/* .. and now the head */
 	vpa_dtl_capture_aux(&n_req, aux_buf, dtl, i);
 
-	/* Move the aux->head to indicate size of data in aux buffer */
-	perf_aux_output_end(&vpa_ctx->handle, (n_req + n_read) * sizeof(struct dtl_entry));
+	size = ((n_req + n_read) * sizeof(struct dtl_entry));
+	if ((size + aux_buf->head_size) > aux_buf->size) {
+		size = aux_buf->size - aux_buf->head_size;
+		perf_aux_output_end(&vpa_ctx->handle, size);
+		aux_buf->head = 0;
+		aux_buf->head_size = 0;
+	} else {
+		aux_buf->head_size += ((n_req + n_read) * sizeof(struct dtl_entry));
+		/* Move the aux->head to indicate size of data in aux buffer */
+		perf_aux_output_end(&vpa_ctx->handle, (n_req + n_read) * sizeof(struct dtl_entry));
+	}
+out:
+	aux_buf->full = 0;
 }
 
 /*
@@ -491,7 +539,9 @@ static void *vpa_dtl_setup_aux(struct perf_event *event, void **pages,
 
 	buf->size = nr_pages << PAGE_SHIFT;
 	buf->head = 0;
+	buf->head_size = 0;
 	buf->boottb_freq_saved = 0;
+	buf->threshold = ((buf->size - 32) / sizeof(struct dtl_entry));
 	return no_free_ptr(buf);
 }
 
From 4a774b39e68fac7d6c7c9cffeb6a4ea4b6dc8b41 Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Mon, 15 Sep 2025 15:59:47 +0530
Subject: [PATCH 2461/3206] powerpc/perf/vpa-dtl: Add documentation for VPA dispatch trace log PMU

Add documentation for the vpa-dtl (Virtual Processor Area - Dispatch
Trace Log) PMU interface, describing how it can be used to collect the
dispatch trace log entries in perf data and how to process/report them
as part of perf report/perf script. 
Signed-off-by: Athira Rajeev
Tested-by: Tejas Manhas
Tested-by: Venkat Rao Bagalkote
Signed-off-by: Madhavan Srinivasan
Link: https://patch.msgid.link/20250915102947.26681-8-atrajeev@linux.ibm.com
---
 Documentation/arch/powerpc/index.rst   |   1 +
 Documentation/arch/powerpc/vpa-dtl.rst | 156 +++++++++++++++++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 Documentation/arch/powerpc/vpa-dtl.rst

diff --git a/Documentation/arch/powerpc/index.rst b/Documentation/arch/powerpc/index.rst
index 53fc9f89f3e420..1be2ee3f0361f7 100644
--- a/Documentation/arch/powerpc/index.rst
+++ b/Documentation/arch/powerpc/index.rst
@@ -37,6 +37,7 @@ powerpc
     vas-api
     vcpudispatch_stats
     vmemmap_dedup
+    vpa-dtl
 
     features
 
diff --git a/Documentation/arch/powerpc/vpa-dtl.rst b/Documentation/arch/powerpc/vpa-dtl.rst
new file mode 100644
index 00000000000000..58d0022f993ab1
--- /dev/null
+++ b/Documentation/arch/powerpc/vpa-dtl.rst
@@ -0,0 +1,156 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. _vpa-dtl:
+
+===================================
+DTL (Dispatch Trace Log)
+===================================
+
+Athira Rajeev, 19 April 2025
+
+.. contents::
+    :depth: 3
+
+
+Basic overview
+==============
+
+The pseries Shared Processor Logical Partition (SPLPAR) machines can
+retrieve a log of dispatch and preempt events from the hypervisor
+using data from the Dispatch Trace Log (DTL) buffer. With this
+information, the user can determine when and why each dispatch &
+preempt has occurred. The vpa-dtl PMU exposes the Virtual Processor
+Area (VPA) DTL counters via perf.
+
+Infrastructure used
+===================
+
+The VPA DTL PMU counters do not interrupt on overflow or generate any
+PMI interrupts. Therefore, an hrtimer is used to poll the DTL data. The
+timer interval can be provided by the user via the sample_period field,
+in nanoseconds. The vpa-dtl PMU has one hrtimer added per vpa-dtl PMU
+thread. DTL (Dispatch Trace Log) contains information about
+dispatch/preempt, enqueue time etc. We directly copy the DTL buffer
+data as part of the auxiliary buffer and it will be processed later.
+This avoids the time taken to create samples in kernel space. The PMU
+driver collecting Dispatch Trace Log (DTL) entries makes use of the AUX
+support in the perf infrastructure. On the tools side, this data is
+made available as PERF_RECORD_AUXTRACE records.
+
+To correlate each DTL entry with other events across CPUs, an auxtrace_queue
+is created for each CPU. Each auxtrace queue has an array/list of auxtrace
+buffers. All auxtrace queues are maintained in an auxtrace heap. The queues
+are sorted based on timestamp. When the different PERF_RECORD_XX records are
+processed, compare the timestamp of the perf record with the timestamp of the
+top element in the auxtrace heap so that DTL events can be correlated with
+other events. Process the auxtrace queue if the timestamp of the element from
+the heap is lower than the timestamp from the entry in the perf record.
+Sometimes it could happen that one buffer is only partially processed. If the
+timestamp of occurrence of another event is more than that of the currently
+processed element in the queue, it will move on to the next perf record. So
+keep track of the position of the buffer to continue processing next time.
+Update the timestamp of the auxtrace heap with the timestamp of the last
+processed entry from the auxtrace buffer.
+
+This infrastructure ensures dispatch trace log entries can be correlated
+and presented along with other events like sched.
+
+vpa-dtl PMU example usage
+=========================
+
+.. 
code-block:: sh
+
+  # ls /sys/devices/vpa_dtl/
+  events  format  perf_event_mux_interval_ms  power  subsystem  type  uevent
+
+
+To capture the DTL data using perf record:
+
+.. code-block:: sh
+
+  # ./perf record -a -e sched:\*,vpa_dtl/dtl_all/ -c 1000000000 sleep 1
+
+The result can be interpreted using perf report. Snippet of perf report -D:
+
+.. code-block:: sh
+
+  # ./perf report -D
+
+There are different PERF_RECORD_XX records. Among them, the records
+corresponding to auxtrace buffers include:
+
+1. PERF_RECORD_AUX
+   Conveys that new data is available in AUX area
+
+2. PERF_RECORD_AUXTRACE_INFO
+   Describes offset and size of auxtrace data in the buffers
+
+3. PERF_RECORD_AUXTRACE
+   This is the record that defines the auxtrace data which here, in the
+   case of the vpa-dtl PMU, is dispatch trace log data.
+
+Snippet from perf report -D showing the PERF_RECORD_AUXTRACE dump:
+
+.. code-block:: sh
+
+0 0 0x39b10 [0x30]: PERF_RECORD_AUXTRACE size: 0x690  offset: 0  ref: 0  idx: 0  tid: -1  cpu: 0
+.
+. ... VPA DTL PMU data: size 1680 bytes, entries is 35
+. 00000000: boot_tb: 21349649546353231, tb_freq: 512000000
+. 00000030: dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:7064, ready_to_enqueue_time:187, waiting_to_ready_time:6611773
+. 00000060: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:146, ready_to_enqueue_time:0, waiting_to_ready_time:15359437
+. 00000090: dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:4868, ready_to_enqueue_time:232, waiting_to_ready_time:5100709
+. 000000c0: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:179, ready_to_enqueue_time:0, waiting_to_ready_time:30714243
+. 000000f0: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:197, ready_to_enqueue_time:0, waiting_to_ready_time:15350648
+. 00000120: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:213, ready_to_enqueue_time:0, waiting_to_ready_time:15353446
+. 00000150: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:212, ready_to_enqueue_time:0, waiting_to_ready_time:15355126
+. 00000180: dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:6368, ready_to_enqueue_time:164, waiting_to_ready_time:5104665
+
+Above is a representation of a DTL entry in the below format:
+
+struct dtl_entry {
+	u8	dispatch_reason;
+	u8	preempt_reason;
+	u16	processor_id;
+	u32	enqueue_to_dispatch_time;
+	u32	ready_to_enqueue_time;
+	u32	waiting_to_ready_time;
+	u64	timebase;
+	u64	fault_addr;
+	u64	srr0;
+	u64	srr1;
+
+};
+
+The first two fields represent the dispatch reason and preempt reason. The
+post-processing of PERF_RECORD_AUXTRACE records will translate these to
+meaningful data for the user to consume.
+
+Visualize the dispatch trace log entries with perf report
+=========================================================
+
+.. code-block:: sh
+
+   # ./perf record -a -e sched:*,vpa_dtl/dtl_all/ -c 1000000000 sleep 1
+   [ perf record: Woken up 1 times to write data ]
+   [ perf record: Captured and wrote 0.300 MB perf.data ]
+
+   # ./perf report
+   # Samples: 321  of event 'vpa-dtl'
+   # Event count (approx.): 321
+   #
+   # Children      Self  Command  Shared Object      Symbol
+   # ........  ........  .......  .................  ..............................
+ # + 100.00% 100.00% swapper [kernel.kallsyms] [k] plpar_hcall_norets_notrace + +Visualize the dispatch trace log entries with perf script +========================================================= + +.. code-block:: sh + + # ./perf script + migration/9 67 [009] 105373.359903: sched:sched_waking: comm=perf pid=13418 prio=120 target_cpu=009 + migration/9 67 [009] 105373.359904: sched:sched_migrate_task: comm=perf pid=13418 prio=120 orig_cpu=9 dest_cpu=10 + migration/9 67 [009] 105373.359907: sched:sched_stat_runtime: comm=migration/9 pid=67 runtime=4050 [ns] + migration/9 67 [009] 105373.359908: sched:sched_switch: prev_comm=migration/9 prev_pid=67 prev_prio=0 prev_state=S ==> next_comm=swapper/9 next_pid=0 next_prio=120 + :256 256 [016] 105373.359913: vpa-dtl: timebase: 21403600706628832 dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:4854, ready_to_enqueue_time:139, waiting_to_ready_time:511842115 c0000000000fcd28 plpar_hcall_norets_notrace+0x18 ([kernel.kallsyms]) + :256 256 [017] 105373.360012: vpa-dtl: timebase: 21403600706679454 dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:236, ready_to_enqueue_time:0, waiting_to_ready_time:133864583 c0000000000fcd28 plpar_hcall_norets_notrace+0x18 ([kernel.kallsyms]) + perf 13418 [010] 105373.360048: sched:sched_stat_runtime: comm=perf pid=13418 runtime=139748 [ns] + perf 13418 [010] 105373.360052: sched:sched_waking: comm=migration/10 pid=72 prio=0 target_cpu=010 From 3bbf004c4808e2c3241e5c1ad6cc102f38a03c39 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 19 Sep 2025 15:58:28 +0100 Subject: [PATCH 2462/3206] arm64: cputype: Add Neoverse-V3AE definitions Add cputype definitions for Neoverse-V3AE. These will be used for errata detection in subsequent patches. These values can be found in the Neoverse-V3AE TRM: https://developer.arm.com/documentation/SDEN-2615521/9-0/ ... in section A.6.1 ("MIDR_EL1, Main ID Register"). 
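As an illustrative aside (a minimal sketch, not part of this patch; the
table name below is invented and helper signatures vary across kernel
versions), such cputype definitions are typically consumed through a
NULL-terminated midr_range table that errata code matches against the
running CPU's MIDR_EL1, e.g. via is_midr_in_range_list():

	/* sketch only: how the new definition would appear in an errata list */
	static const struct midr_range example_affected_list[] = {
		MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3),
		MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3AE),
		{},
	};

The next patch in this series adds exactly such an entry to the
erratum 3312417 match list.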
Signed-off-by: Mark Rutland Cc: James Morse Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Ryan Roberts Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 67ac757bc9c043..9b00b75acbf296 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -92,6 +92,7 @@ #define ARM_CPU_PART_NEOVERSE_V2 0xD4F #define ARM_CPU_PART_CORTEX_A720 0xD81 #define ARM_CPU_PART_CORTEX_X4 0xD82 +#define ARM_CPU_PART_NEOVERSE_V3AE 0xD83 #define ARM_CPU_PART_NEOVERSE_V3 0xD84 #define ARM_CPU_PART_CORTEX_X925 0xD85 #define ARM_CPU_PART_CORTEX_A725 0xD87 @@ -182,6 +183,7 @@ #define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2) #define MIDR_CORTEX_A720 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720) #define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4) +#define MIDR_NEOVERSE_V3AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3AE) #define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3) #define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925) #define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725) From 0c33aa1804d101c11ba1992504f17a42233f0e11 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 19 Sep 2025 15:58:29 +0100 Subject: [PATCH 2463/3206] arm64: errata: Apply workarounds for Neoverse-V3AE Neoverse-V3AE is also affected by erratum #3312417, as described in its Software Developer Errata Notice (SDEN) document: Neoverse V3AE (MP172) SDEN v9.0, erratum 3312417 https://developer.arm.com/documentation/SDEN-2615521/9-0/ Enable the workaround for Neoverse-V3AE, and document this. Signed-off-by: Mark Rutland Cc: James Morse Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Ryan Roberts Signed-off-by: Will Deacon --- Documentation/arch/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 1 + arch/arm64/kernel/cpu_errata.c | 1 + 3 files changed, 4 insertions(+) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index b18ef4064bc046..a7ec57060f64f5 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -200,6 +200,8 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-V3 | #3312417 | ARM64_ERRATUM_3194386 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Neoverse-V3AE | #3312417 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | MMU-500 | #841119,826419 | ARM_SMMU_MMU_500_CPRE_ERRATA| | | | #562869,1047329 | | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64d..93f391e67af151 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1138,6 +1138,7 @@ config ARM64_ERRATUM_3194386 * ARM Neoverse-V1 erratum 3324341 * ARM Neoverse V2 erratum 3324336 * ARM Neoverse-V3 erratum 3312417 + * ARM Neoverse-V3AE erratum 3312417 On affected cores "MSR SSBS, #0" instructions may not affect subsequent speculative instructions, which may permit unexepected diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 7ff6b49beaaffc..8cb3b575a03165 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -546,6 +546,7 @@ static const struct midr_range erratum_spec_ssbs_list[] = { MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3AE), {} }; #endif From 8fca3852e33d762b8d8beed5458c99ffb7fd5975 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 19 Sep 2025 15:58:30 +0100 Subject: [PATCH 2464/3206] arm64: cpufeature: add Neoverse-V3AE to BBML2 allow list Neoverse-V3AE advertises support for BBML2 and is known to not raise conflict aborts. So add it to the BBML2_NOABORT allow list. However, just like Neoverse-V3, Neoverse-V3AE r0p0 and r0p1 suffer from erratum #3053180, for which the workaround is to always observe break-before-make requirements for affected revisions. Therefore only add to the allow list from r0p2 onwards. For more details see Software Developer Errata Notice (SDEN) document: Neoverse V3AE (MP172) SDEN v9.0, erratum 3053180 https://developer.arm.com/documentation/SDEN-2615521/9-0/ Signed-off-by: Ryan Roberts Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index cf2dd5ea173fe4..30244c1f833c1d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2235,6 +2235,7 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco static const struct midr_range supports_bbml2_noabort_list[] = { MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf), MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf), + MIDR_REV_RANGE(MIDR_NEOVERSE_V3AE, 0, 2, 0xf), MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), MIDR_ALL_VERSIONS(MIDR_AMPERE1), MIDR_ALL_VERSIONS(MIDR_AMPERE1A), From fa93b45fd397e25265ff618de26dd5c74ee403d3 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Mon, 22 Sep 2025 12:11:26 +0530 Subject: [PATCH 2465/3206] arm64: Enable vmalloc-huge with ptdump Our goal is to move towards enabling vmalloc-huge by default on arm64 so as to reduce TLB pressure. Therefore, we need a way to analyze the portion of block mappings in vmalloc space we can get on a production system; this can be done through ptdump, but currently we disable vmalloc-huge if CONFIG_PTDUMP_DEBUGFS is on. 
The reason is that lazy freeing of kernel pagetables via vmap_try_huge_pxd() may race with ptdump, so ptdump may dereference a bogus address. To solve this, we need to synchronize ptdump_walk() and ptdump_check_wx() with pud_free_pmd_page() and pmd_free_pte_page(). Since this race is very unlikely to happen in practice, we do not want to penalize the vmalloc pagetable tearing path by taking the init_mm mmap_lock. Therefore, we use static keys. ptdump_walk() and ptdump_check_wx() are the pagetable walkers; they will enable the static key - upon observing that, the vmalloc pagetable tearing path will get patched in with an mmap_read_lock/unlock sequence. A combination of the patched-in mmap_read_lock/unlock, the acquire semantics of static_branch_inc(), and the barriers in __flush_tlb_kernel_pgtable() ensures that ptdump will never get a hold on the address of a freed PMD or PTE table. We can verify the correctness of the algorithm via the following litmus test (thanks to James Houghton and Will Deacon): AArch64 ptdump Variant=Ifetch { uint64_t pud=0xa110c; uint64_t pmd; 0:X0=label:"P1:L0"; 0:X1=instr:"NOP"; 0:X2=lock; 0:X3=pud; 0:X4=pmd; 1:X1=0xdead; 1:X2=lock; 1:X3=pud; 1:X4=pmd; } P0 | P1 ; (* static_key_enable *) | (* pud_free_pmd_page *) ; STR W1, [X0] | LDR X9, [X3] ; DC CVAU,X0 | STR XZR, [X3] ; DSB ISH | DSB ISH ; IC IVAU,X0 | ISB ; DSB ISH | ; ISB | (* static key *) ; | L0: ; (* mmap_lock *) | B out1 ; Lwlock: | ; MOV W7, #1 | (* mmap_lock *) ; SWPA W7, W8, [X2] | Lrlock: ; | MOV W7, #1 ; | SWPA W7, W8, [X2] ; (* walk pgtable *) | ; LDR X9, [X3] | (* mmap_unlock *) ; CBZ X9, out0 | STLR WZR, [X2] ; EOR X10, X9, X9 | ; LDR X11, [X4, X10] | out1: ; | EOR X10, X9, X9 ; out0: | STR X1, [X4, X10] ; exists (0:X8=0 /\ 1:X8=0 /\ (* Lock acquisitions succeed *) 0:X9=0xa110c /\ (* P0 sees the valid PUD ...*) 0:X11=0xdead) (* ... but the freed PMD *) For an approximate written proof of why this algorithm works, please read the code comment in [1], which is now removed for the sake of simplicity. mm-selftests pass. No issues were observed while parallelly running test_vmalloc.sh (which stresses the vmalloc subsystem), and cat /sys/kernel/debug/{kernel_page_tables, check_wx_pages} in a loop. Link: https://lore.kernel.org/all/20250723161827.15802-1-dev.jain@arm.com/ [1] Reviewed-by: Ryan Roberts Signed-off-by: Dev Jain Signed-off-by: Will Deacon --- arch/arm64/include/asm/ptdump.h | 2 ++ arch/arm64/include/asm/vmalloc.h | 9 ++----- arch/arm64/mm/mmu.c | 43 +++++++++++++++++++++++++++++--- arch/arm64/mm/ptdump.c | 11 ++++++-- 4 files changed, 52 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index fded5358641f8f..baff24004459ea 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -7,6 +7,8 @@ #include +DECLARE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); + #ifdef CONFIG_PTDUMP #include diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 12f534e8f3edf8..4ec1acd3c1b348 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -9,18 +9,13 @@ #define arch_vmap_pud_supported arch_vmap_pud_supported static inline bool arch_vmap_pud_supported(pgprot_t prot) { - /* - * SW table walks can't handle removal of intermediate entries. 
- */ - return pud_sect_supported() && - !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); + return pud_sect_supported(); } #define arch_vmap_pmd_supported arch_vmap_pmd_supported static inline bool arch_vmap_pmd_supported(pgprot_t prot) { - /* See arch_vmap_pud_supported() */ - return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); + return true; } #define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 431ed90914bb48..0ba1a15e7e744d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -56,6 +56,8 @@ enum pgtable_type { TABLE_P4D, }; +DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); + u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); @@ -1665,7 +1667,8 @@ int pmd_clear_huge(pmd_t *pmdp) return 1; } -int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) +static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr, + bool acquire_mmap_lock) { pte_t *table; pmd_t pmd; @@ -1677,13 +1680,25 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) return 1; } + /* See comment in pud_free_pmd_page for static key logic */ table = pte_offset_kernel(pmdp, addr); pmd_clear(pmdp); __flush_tlb_kernel_pgtable(addr); + if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) { + mmap_read_lock(&init_mm); + mmap_read_unlock(&init_mm); + } + pte_free_kernel(NULL, table); return 1; } +int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) +{ + /* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */ + return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true); +} + int pud_free_pmd_page(pud_t *pudp, unsigned long addr) { pmd_t *table; @@ -1699,16 +1714,36 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr) } table = pmd_offset(pudp, addr); + + /* + * Our objective is to prevent ptdump from reading a PMD table which has + * been freed. In this race, if pud_free_pmd_page observes the key on + * (which got flipped by ptdump) then the mmap lock sequence here will, + * as a result of the mmap write lock/unlock sequence in ptdump, give + * us the correct synchronization. If not, this means that ptdump has + * yet not started walking the pagetables - the sequence of barriers + * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will + * observe an empty PUD. + */ + pud_clear(pudp); + __flush_tlb_kernel_pgtable(addr); + if (static_branch_unlikely(&arm64_ptdump_lock_key)) { + mmap_read_lock(&init_mm); + mmap_read_unlock(&init_mm); + } + pmdp = table; next = addr; end = addr + PUD_SIZE; do { if (pmd_present(pmdp_get(pmdp))) - pmd_free_pte_page(pmdp, next); + /* + * PMD has been isolated, so ptdump won't see it. No + * need to acquire init_mm.mmap_lock. 
+			 */
+			__pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false);
 	} while (pmdp++, next += PMD_SIZE, next != end);
 
-	pud_clear(pudp);
-	__flush_tlb_kernel_pgtable(addr);
 	pmd_free(NULL, table);
 	return 1;
 }
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 421a5de806c62d..ab9899ca1e5f21 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -283,6 +283,13 @@ void note_page_flush(struct ptdump_state *pt_st)
 	note_page(pt_st, 0, -1, pte_val(pte_zero));
 }
 
+static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm)
+{
+	static_branch_inc(&arm64_ptdump_lock_key);
+	ptdump_walk_pgd(st, mm, NULL);
+	static_branch_dec(&arm64_ptdump_lock_key);
+}
+
 void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 {
 	unsigned long end = ~0UL;
@@ -311,7 +318,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 		}
 	};
 
-	ptdump_walk_pgd(&st.ptdump, info->mm, NULL);
+	arm64_ptdump_walk_pgd(&st.ptdump, info->mm);
 }
 
 static void __init ptdump_initialize(void)
@@ -353,7 +360,7 @@ bool ptdump_check_wx(void)
 		}
 	};
 
-	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+	arm64_ptdump_walk_pgd(&st.ptdump, &init_mm);
 
 	if (st.wx_pages || st.uxn_pages) {
 		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",

From 42852fe57c6d2a0abb10429841cb1226b7186b7a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 19 Sep 2025 06:12:08 -0700
Subject: [PATCH 2466/3206] xfs: track the number of blocks in each buftarg

Add a bt_nr_sectors to track the number of sectors in each buftarg, and
replace the check that hard codes sb_dblock in xfs_buf_map_verify with
this new value so that it is correct for non-ddev buftargs.

The RT buftarg only has a superblock in the first block, so it is
unlikely to trigger this, nor are we likely to ever have enough blocks
in the in-memory buftargs, but we might as well get the check right.

Fixes: 10616b806d1d ("xfs: fix _xfs_buf_find oops on blocks beyond the filesystem end")
Signed-off-by: Christoph Hellwig
Reviewed-by: Darrick J. Wong
Signed-off-by: Carlos Maiolino
---
 fs/xfs/xfs_buf.c              | 42 +++++++++++++++++++----------------
 fs/xfs/xfs_buf.h              |  4 +++-
 fs/xfs/xfs_buf_item_recover.c | 10 +++++++++
 fs/xfs/xfs_super.c            |  7 +++---
 fs/xfs/xfs_trans.c            | 23 ++++++++++---------
 5 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8360e77b3215dd..773d959965dc29 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -387,8 +387,6 @@ xfs_buf_map_verify(
 	struct xfs_buftarg	*btp,
 	struct xfs_buf_map	*map)
 {
-	xfs_daddr_t		eofs;
-
 	/* Check for IOs smaller than the sector size / not sector aligned */
 	ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
 	ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
@@ -397,11 +395,10 @@
 	 * Corrupted block numbers can get through to here, unfortunately, so we
 	 * have to check that the buffer falls within the filesystem bounds. 
*/ - eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (map->bm_bn < 0 || map->bm_bn >= eofs) { + if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) { xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", - __func__, map->bm_bn, eofs); + __func__, map->bm_bn, btp->bt_nr_sectors); WARN_ON(1); return -EFSCORRUPTED; } @@ -1720,26 +1717,30 @@ xfs_configure_buftarg_atomic_writes( int xfs_configure_buftarg( struct xfs_buftarg *btp, - unsigned int sectorsize) + unsigned int sectorsize, + xfs_rfsblock_t nr_blocks) { - int error; + struct xfs_mount *mp = btp->bt_mount; - ASSERT(btp->bt_bdev != NULL); + if (btp->bt_bdev) { + int error; - /* Set up metadata sector size info */ - btp->bt_meta_sectorsize = sectorsize; - btp->bt_meta_sectormask = sectorsize - 1; + error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); + if (error) { + xfs_warn(mp, + "Cannot use blocksize %u on device %pg, err %d", + sectorsize, btp->bt_bdev, error); + return -EINVAL; + } - error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); - if (error) { - xfs_warn(btp->bt_mount, - "Cannot use blocksize %u on device %pg, err %d", - sectorsize, btp->bt_bdev, error); - return -EINVAL; + if (bdev_can_atomic_write(btp->bt_bdev)) + xfs_configure_buftarg_atomic_writes(btp); } - if (bdev_can_atomic_write(btp->bt_bdev)) - xfs_configure_buftarg_atomic_writes(btp); + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; + /* m_blkbb_log is not set up yet */ + btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT); return 0; } @@ -1749,6 +1750,9 @@ xfs_init_buftarg( size_t logical_sectorsize, const char *descr) { + /* The maximum size of the buftarg is only known once the sb is read. */ + btp->bt_nr_sectors = (xfs_daddr_t)-1; + /* Set up device logical sector size mask */ btp->bt_logical_sectorsize = logical_sectorsize; btp->bt_logical_sectormask = logical_sectorsize - 1; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b269e115d9ace0..8fa7bdf59c9110 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -103,6 +103,7 @@ struct xfs_buftarg { size_t bt_meta_sectormask; size_t bt_logical_sectorsize; size_t bt_logical_sectormask; + xfs_daddr_t bt_nr_sectors; /* LRU control structures */ struct shrinker *bt_shrinker; @@ -372,7 +373,8 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); -int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize); +int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize, + xfs_fsblock_t nr_blocks); #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 5d58e2ae4972da..e4c8af87363243 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -736,6 +736,16 @@ xlog_recover_do_primary_sb_buffer( */ xfs_sb_from_disk(&mp->m_sb, dsb); + /* + * Grow can change the device size. Mirror that into the buftarg. 
+ */ + mp->m_ddev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) { + mp->m_rtdev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + } + if (mp->m_sb.sb_agcount < orig_agcount) { xfs_alert(mp, "Shrinking AG count in log recovery not supported"); return -EFSCORRUPTED; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 77acb3e5a4eca1..9e759c5b1096ff 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -535,7 +535,8 @@ xfs_setup_devices( { int error; - error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize, + mp->m_sb.sb_dblocks); if (error) return error; @@ -545,7 +546,7 @@ xfs_setup_devices( if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; error = xfs_configure_buftarg(mp->m_logdev_targp, - log_sector_size); + log_sector_size, mp->m_sb.sb_logblocks); if (error) return error; } @@ -559,7 +560,7 @@ xfs_setup_devices( mp->m_rtdev_targp = mp->m_ddev_targp; } else if (mp->m_rtname) { error = xfs_configure_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_sectsize); + mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks); if (error) return error; } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 575e7028f423a8..474f5a04ec6367 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -452,19 +452,17 @@ xfs_trans_mod_sb( */ STATIC void xfs_trans_apply_sb_deltas( - xfs_trans_t *tp) + struct xfs_trans *tp) { - struct xfs_dsb *sbp; - struct xfs_buf *bp; - int whole = 0; - - bp = xfs_trans_getsb(tp); - sbp = bp->b_addr; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp = xfs_trans_getsb(tp); + struct xfs_dsb *sbp = bp->b_addr; + int whole = 0; /* * Only update the superblock counters if we are logging them */ - if (!xfs_has_lazysbcount((tp->t_mountp))) { + if (!xfs_has_lazysbcount(mp)) { if (tp->t_icount_delta) be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); if (tp->t_ifree_delta) @@ -491,8 +489,7 @@ xfs_trans_apply_sb_deltas( * write the correct value ondisk. */ if ((tp->t_frextents_delta || tp->t_res_frextents_delta) && - !xfs_has_rtgroups(tp->t_mountp)) { - struct xfs_mount *mp = tp->t_mountp; + !xfs_has_rtgroups(mp)) { int64_t rtxdelta; rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta; @@ -505,6 +502,8 @@ xfs_trans_apply_sb_deltas( if (tp->t_dblocks_delta) { be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); + mp->m_ddev_targp->bt_nr_sectors += + XFS_FSB_TO_BB(mp, tp->t_dblocks_delta); whole = 1; } if (tp->t_agcount_delta) { @@ -524,7 +523,7 @@ xfs_trans_apply_sb_deltas( * recompute the ondisk rtgroup block log. The incore values * will be recomputed in xfs_trans_unreserve_and_mod_sb. */ - if (xfs_has_rtgroups(tp->t_mountp)) { + if (xfs_has_rtgroups(mp)) { sbp->sb_rgblklog = xfs_compute_rgblklog( be32_to_cpu(sbp->sb_rgextents), be32_to_cpu(sbp->sb_rextsize)); @@ -537,6 +536,8 @@ xfs_trans_apply_sb_deltas( } if (tp->t_rblocks_delta) { be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta); + mp->m_rtdev_targp->bt_nr_sectors += + XFS_FSB_TO_BB(mp, tp->t_rblocks_delta); whole = 1; } if (tp->t_rextents_delta) { From 6ef2175fce30ccab80d519f2afcc93d8b138c16c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Sep 2025 06:12:09 -0700 Subject: [PATCH 2467/3206] xfs: use bt_nr_sectors in xfs_dax_translate_range Only ranges inside the file system can be translated, and the file system can be smaller than the containing device. 
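As a worked illustration (numbers invented for this note, not taken
from the patch): for a 100 GiB filesystem sitting on a 1 TiB device,
bt_nr_sectors holds the filesystem size in 512-byte units, so the
translatable range is bounded by the filesystem rather than the device:

	/* sketch only: BBTOB() converts 512-byte basic blocks to bytes */
	xfs_daddr_t nr_sectors = 209715200;	/* 100 GiB / 512 bytes */
	u64 dev_len = BBTOB(nr_sectors);	/* 107374182400 bytes */

Offsets in [dev_len, bdev_nr_bytes()) are still on the device but lie
beyond the filesystem, and must not be translated.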
Fixes: f4ed93037966 ("xfs: don't shut down the filesystem for media failures beyond end of log")
Signed-off-by: Christoph Hellwig
Reviewed-by: Darrick J. Wong
Signed-off-by: Carlos Maiolino
---
 fs/xfs/xfs_notify_failure.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index fbeddcac479208..b1767288994206 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -165,7 +165,7 @@ xfs_dax_translate_range(
 	uint64_t		*bblen)
 {
 	u64			dev_start = btp->bt_dax_part_off;
-	u64			dev_len = bdev_nr_bytes(btp->bt_bdev);
+	u64			dev_len = BBTOB(btp->bt_nr_sectors);
 	u64			dev_end = dev_start + dev_len - 1;
 
 	/* Notify failure on the whole device. */

From 14f158552eec700ae0e52b91aa17168a7b168c0c Mon Sep 17 00:00:00 2001
From: Anshuman Khandual
Date: Sun, 21 Sep 2025 06:22:58 +0530
Subject: [PATCH 2468/3206] arm64/sysreg: Update TCR_EL1 register

Update TCR_EL1 register fields as per the latest ARM ARM DDI 0487 L.B
and, while here, drop an explicit sysreg definition SYS_TCR_EL1 from
sysreg.h, which is now redundant.

Cc: Catalin Marinas
Cc: Will Deacon
Cc: Marc Zyngier
Cc: Mark Brown
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Mark Brown
Signed-off-by: Anshuman Khandual
Signed-off-by: Will Deacon
---
 arch/arm64/include/asm/sysreg.h |  2 --
 arch/arm64/tools/sysreg         | 52 ++++++++++++++++++++++++++++-----
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index d5b5f2ae1afaaa..ad5c901af22998 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -281,8 +281,6 @@
 #define SYS_RGSR_EL1			sys_reg(3, 0, 1, 0, 5)
 #define SYS_GCR_EL1			sys_reg(3, 0, 1, 0, 6)
 
-#define SYS_TCR_EL1			sys_reg(3, 0, 2, 0, 2)
-
 #define SYS_APIAKEYLO_EL1		sys_reg(3, 0, 2, 1, 0)
 #define SYS_APIAKEYHI_EL1		sys_reg(3, 0, 2, 1, 1)
 #define SYS_APIBKEYLO_EL1		sys_reg(3, 0, 2, 1, 2)
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index d396fa587ec182..d5eb5b67145f17 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -4748,17 +4748,53 @@ Field	37	TBI0
 Field	36	AS
 Res0	35
 Field	34:32	IPS
-Field	31:30	TG1
-Field	29:28	SH1
-Field	27:26	ORGN1
-Field	25:24	IRGN1
+Enum	31:30	TG1
+	0b01	16K
+	0b10	4K
+	0b11	64K
+EndEnum
+Enum	29:28	SH1
+	0b00	NONE
+	0b10	OUTER
+	0b11	INNER
+EndEnum
+Enum	27:26	ORGN1
+	0b00	NC
+	0b01	WBWA
+	0b10	WT
+	0b11	WBnWA
+EndEnum
+Enum	25:24	IRGN1
+	0b00	NC
+	0b01	WBWA
+	0b10	WT
+	0b11	WBnWA
+EndEnum
 Field	23	EPD1
 Field	22	A1
 Field	21:16	T1SZ
-Field	15:14	TG0
-Field	13:12	SH0
-Field	11:10	ORGN0
-Field	9:8	IRGN0
+Enum	15:14	TG0
+	0b00	4K
+	0b01	64K
+	0b10	16K
+EndEnum
+Enum	13:12	SH0
+	0b00	NONE
+	0b10	OUTER
+	0b11	INNER
+EndEnum
+Enum	11:10	ORGN0
+	0b00	NC
+	0b01	WBWA
+	0b10	WT
+	0b11	WBnWA
+EndEnum
+Enum	9:8	IRGN0
+	0b00	NC
+	0b01	WBWA
+	0b10	WT
+	0b11	WBnWA
+EndEnum
 Field	7	EPD0
 Res0	6
 Field	5:0	T0SZ

From fc0d192303bd385ac24dc52eb31ceb6ca7e027d0 Mon Sep 17 00:00:00 2001
From: Dmitry Antipov
Date: Thu, 18 Sep 2025 14:14:03 +0300
Subject: [PATCH 2469/3206] xfs: scrub: use kstrdup_const() for metapath scan setups

Except for the 'xchk_setup_metapath_rtginode()' case, the 'path' argument
of 'xchk_setup_metapath_scan()' is a compile-time constant. So it may be
reasonable to use 'kstrdup_const()' / 'kfree_const()' to manage the 'path'
field of 'struct xchk_metapath' in an attempt to reuse the .rodata
instance rather than making a copy.

Compile tested only.

Signed-off-by: Dmitry Antipov
Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/metapath.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index 14939d7de34966..378ec7c8d38eeb 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -79,7 +79,7 @@ xchk_metapath_cleanup( if (mpath->dp_ilock_flags) xfs_iunlock(mpath->dp, mpath->dp_ilock_flags); - kfree(mpath->path); + kfree_const(mpath->path); } /* Set up a metadir path scan. @path must be dynamically allocated. */ @@ -98,13 +98,13 @@ xchk_setup_metapath_scan( error = xchk_install_live_inode(sc, ip); if (error) { - kfree(path); + kfree_const(path); return error; } mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS); if (!mpath) { - kfree(path); + kfree_const(path); return -ENOMEM; } @@ -132,7 +132,7 @@ xchk_setup_metapath_rtdir( return -ENOENT; return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip); + kstrdup_const("rtgroups", GFP_KERNEL), sc->mp->m_rtdirip); } /* Scan a rtgroup inode under the /rtgroups directory. */ @@ -179,7 +179,7 @@ xchk_setup_metapath_quotadir( return -ENOENT; return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kstrdup("quota", GFP_KERNEL), qi->qi_dirip); + kstrdup_const("quota", GFP_KERNEL), qi->qi_dirip); } /* Scan a quota inode under the /quota directory. */ @@ -212,7 +212,7 @@ xchk_setup_metapath_dqinode( return -ENOENT; return xchk_setup_metapath_scan(sc, qi->qi_dirip, - kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); + kstrdup_const(xfs_dqinode_path(type), GFP_KERNEL), ip); } #else # define xchk_setup_metapath_quotadir(...) (-ENOENT) From 5973a62efa34c80c9a4e5eac1fca6f6209b902af Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 19 Sep 2025 14:27:51 -0700 Subject: [PATCH 2470/3206] arm64: map [_text, _stext) virtual address range non-executable+read-only Since the referenced fixes commit, the kernel's .text section is only mapped starting from _stext; the region [_text, _stext) is omitted. As a result, other vmalloc/vmap allocations may use the virtual addresses nominally in the range [_text, _stext). This address reuse confuses multiple things: 1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore mapping the entire range [_text, _end) to [__pa_symbol(_text), __pa_symbol(_end)). Reading an address in [_text, _stext) from /proc/vmcore therefore gives the incorrect result. 2. Tools doing symbolization (either by reading /proc/kallsyms or based on the vmlinux ELF file) will incorrectly identify vmalloc/vmap allocations in [_text, _stext) as kernel symbols. In practice, both of these issues affect the drgn debugger. Specifically, there were cases where the vmap IRQ stacks for some CPUs were allocated in [_text, _stext). As a result, drgn could not get the stack trace for a crash in an IRQ handler because the core dump contained invalid data for the IRQ stack address. The stack addresses were also symbolized as being in the _text symbol. Fix this by bringing back the mapping of [_text, _stext), but now make it non-executable and read-only. This prevents other allocations from using it while still achieving the original goal of not mapping unpredictable data as executable. Other than the changed protection, this is effectively a revert of the fixes commit. 
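A minimal sketch of the symbolization confusion being fixed (assumed,
simplified logic; not code from this patch): tools attribute any
address inside the image range to the kernel,

	/* kallsyms-style attribution, simplified */
	static bool in_kernel_image(unsigned long addr)
	{
		return addr >= (unsigned long)_text &&
		       addr < (unsigned long)_end;
	}

so once the unmapped hole in [_text, _stext) is handed out to
vmap/vmalloc users, addresses from those allocations pass this check
and are misreported against _text.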
Fixes: e2a073dde921 ("arm64: omit [_text, _stext) from permanent kernel mapping") Cc: stable@vger.kernel.org Signed-off-by: Omar Sandoval Signed-off-by: Will Deacon --- arch/arm64/kernel/pi/map_kernel.c | 6 ++++++ arch/arm64/kernel/setup.c | 4 ++-- arch/arm64/mm/init.c | 2 +- arch/arm64/mm/mmu.c | 14 +++++++++----- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index e6d35eff1486b0..e8ddbde31a833d 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -78,6 +78,12 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) twopass |= enable_scs; prot = twopass ? data_prot : text_prot; + /* + * [_stext, _text) isn't executed after boot and contains some + * non-executable, unpredictable data, so map it non-executable. + */ + map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot, + false, root_level); map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, !twopass, root_level); map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 77c7926a4df660..23c05dc7a8f2ac 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -214,7 +214,7 @@ static void __init request_standard_resources(void) unsigned long i = 0; size_t res_size; - kernel_code.start = __pa_symbol(_stext); + kernel_code.start = __pa_symbol(_text); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); @@ -280,7 +280,7 @@ u64 cpu_logical_map(unsigned int cpu) void __init __no_sanitize_address setup_arch(char **cmdline_p) { - setup_initial_init_mm(_stext, _etext, _edata, _end); + setup_initial_init_mm(_text, _etext, _edata, _end); *cmdline_p = boot_command_line; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 70c2ca813c1856..524d34a0e92198 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -279,7 +279,7 @@ void __init arm64_memblock_init(void) * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. 
*/ - memblock_reserve(__pa_symbol(_stext), _end - _stext); + memblock_reserve(__pa_symbol(_text), _end - _text); if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { /* the generic initrd code expects virtual addresses */ initrd_start = __phys_to_virt(phys_initrd_start); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 0ba1a15e7e744d..10c25809958144 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -965,8 +965,8 @@ void __init mark_linear_text_alias_ro(void) /* * Remove the write permissions from the linear alias of .text/.rodata */ - update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext), - (unsigned long)__init_begin - (unsigned long)_stext, + update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), + (unsigned long)__init_begin - (unsigned long)_text, PAGE_KERNEL_RO); } @@ -1037,7 +1037,7 @@ static inline bool force_pte_mapping(void) static void __init map_mem(pgd_t *pgdp) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); - phys_addr_t kernel_start = __pa_symbol(_stext); + phys_addr_t kernel_start = __pa_symbol(_text); phys_addr_t kernel_end = __pa_symbol(__init_begin); phys_addr_t start, end; phys_addr_t early_kfence_pool; @@ -1086,7 +1086,7 @@ static void __init map_mem(pgd_t *pgdp) } /* - * Map the linear alias of the [_stext, __init_begin) interval + * Map the linear alias of the [_text, __init_begin) interval * as non-executable now, and remove the write permission in * mark_linear_text_alias_ro() below (which will be called after * alternative patching has completed). This makes the contents @@ -1113,6 +1113,10 @@ void mark_rodata_ro(void) WRITE_ONCE(rodata_is_rw, false); update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); + /* mark the range between _text and _stext as read only. 
*/
+	update_mapping_prot(__pa_symbol(_text), (unsigned long)_text,
+			    (unsigned long)_stext - (unsigned long)_text,
+			    PAGE_KERNEL_RO);
 }
 
 static void __init declare_vma(struct vm_struct *vma,
@@ -1183,7 +1187,7 @@ static void __init declare_kernel_vmas(void)
 {
 	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
 
-	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);
 	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
 	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
 	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);

From a0ce874cfaaab9792d657440b9d050e2112f6e4d Mon Sep 17 00:00:00 2001
From: Niranjan H Y
Date: Fri, 12 Sep 2025 14:06:20 +0530
Subject: [PATCH 2471/3206] ASoC: ops: improve snd_soc_get_volsw

* clamp the values if the register value read is out of range

Signed-off-by: Niranjan H Y
[This patch originally had two changes in it, I removed a second buggy
one -- broonie]
--
v5:
 - remove clamp parameter
 - move the boundary check after sign-bit extension
Link: https://patch.msgid.link/20250912083624.804-1-niranjan.hy@ti.com
Signed-off-by: Mark Brown
---
 sound/soc/soc-ops.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c
index a629e0eacb20eb..d2b6fb8e0b6c69 100644
--- a/sound/soc/soc-ops.c
+++ b/sound/soc/soc-ops.c
@@ -118,6 +118,7 @@ static int soc_mixer_reg_to_ctl(struct soc_mixer_control *mc, unsigned int reg_v
 	if (mc->sign_bit)
 		val = sign_extend32(val, mc->sign_bit);
 
+	val = clamp(val, mc->min, mc->max);
 	val -= mc->min;
 
 	if (mc->invert)

From 4cc9bd8d7b32d59b86cb489a96aa8a7b9dd6a21b Mon Sep 17 00:00:00 2001
From: Niranjan H Y
Date: Fri, 12 Sep 2025 14:06:21 +0530
Subject: [PATCH 2472/3206] ASoC: tas2783A: Add SoundWire based codec driver

TAS2783 is a mono digital input Class-D smart amplifier based on the
MIPI Alliance SoundWire interface. The driver supports loading the
algorithm coefficients for one or more tas2783 chips.

Signed-off-by: Niranjan H Y
Reviewed-by: Bard Liao
Link: https://patch.msgid.link/20250912083624.804-2-niranjan.hy@ti.com
Signed-off-by: Mark Brown
---
 sound/soc/codecs/Kconfig       |   14 +
 sound/soc/codecs/Makefile      |    2 +
 sound/soc/codecs/tas2783-sdw.c | 1331 ++++++++++++++++++++++++++++++++
 sound/soc/codecs/tas2783.h     |  110 +++
 4 files changed, 1457 insertions(+)
 create mode 100644 sound/soc/codecs/tas2783-sdw.c
 create mode 100644 sound/soc/codecs/tas2783.h

diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig
index 59616f590a00b8..18131144ebbd30 100644
--- a/sound/soc/codecs/Kconfig
+++ b/sound/soc/codecs/Kconfig
@@ -268,6 +268,7 @@ config SND_SOC_ALL_CODECS
 	imply SND_SOC_TAS2770
 	imply SND_SOC_TAS2780
 	imply SND_SOC_TAS2781_I2C
+	imply SND_SOC_TAS2783_SDW
 	imply SND_SOC_TAS5086
 	imply SND_SOC_TAS571X
 	imply SND_SOC_TAS5720
@@ -2097,6 +2098,19 @@ config SND_SOC_TAS2781_I2C
 	  algo coefficient setting, for one, two or even multiple TAS2781
 	  chips.
 
+config SND_SOC_TAS2783_SDW
+	tristate "Texas Instruments TAS2783 speaker amplifier (sdw)"
+	depends on SOUNDWIRE
+	depends on EFI
+	select REGMAP_SOUNDWIRE
+	select REGMAP_SOUNDWIRE_MBQ
+	select CRC32
+	help
+	  Enable support for Texas Instruments TAS2783A digital input
+	  mono Class-D and DSP-inside audio power amplifiers. TAS2783
+	  driver implements a flexible and configurable algorithm
+	  coefficient setting, for one, two or multiple TAS2783 chips. 
+ config SND_SOC_TAS5086 tristate "Texas Instruments TAS5086 speaker amplifier" depends on I2C diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index 438eabd9418875..bd95a7c911d5c1 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -319,6 +319,7 @@ snd-soc-tas2781-comlib-y := tas2781-comlib.o snd-soc-tas2781-comlib-i2c-y := tas2781-comlib-i2c.o snd-soc-tas2781-fmwlib-y := tas2781-fmwlib.o snd-soc-tas2781-i2c-y := tas2781-i2c.o +snd-soc-tas2783-sdw-y := tas2783-sdw.o snd-soc-tfa9879-y := tfa9879.o snd-soc-tfa989x-y := tfa989x.o snd-soc-tlv320adc3xxx-y := tlv320adc3xxx.o @@ -743,6 +744,7 @@ obj-$(CONFIG_SND_SOC_TAS2781_COMLIB) += snd-soc-tas2781-comlib.o obj-$(CONFIG_SND_SOC_TAS2781_COMLIB_I2C) += snd-soc-tas2781-comlib-i2c.o obj-$(CONFIG_SND_SOC_TAS2781_FMWLIB) += snd-soc-tas2781-fmwlib.o obj-$(CONFIG_SND_SOC_TAS2781_I2C) += snd-soc-tas2781-i2c.o +obj-$(CONFIG_SND_SOC_TAS2783_SDW) += snd-soc-tas2783-sdw.o obj-$(CONFIG_SND_SOC_TAS5086) += snd-soc-tas5086.o obj-$(CONFIG_SND_SOC_TAS571X) += snd-soc-tas571x.o obj-$(CONFIG_SND_SOC_TAS5720) += snd-soc-tas5720.o diff --git a/sound/soc/codecs/tas2783-sdw.c b/sound/soc/codecs/tas2783-sdw.c new file mode 100644 index 00000000000000..3e03e68932f6b7 --- /dev/null +++ b/sound/soc/codecs/tas2783-sdw.c @@ -0,0 +1,1331 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// ALSA SoC Texas Instruments TAS2783 Audio Smart Amplifier +// +// Copyright (C) 2025 Texas Instruments Incorporated +// https://www.ti.com +// +// The TAS2783 driver implements a flexible and configurable +// algo coefficient setting for single TAS2783 chips. +// +// Author: Niranjan H Y +// Author: Baojun Xu +// Author: Kevin Lu + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tas2783.h" + +#define TIMEOUT_FW_DL_MS (3000) +#define FW_DL_OFFSET 36 +#define FW_FL_HDR 12 +#define TAS2783_PROBE_TIMEOUT 5000 +#define TAS2783_CALI_GUID EFI_GUID(0x1f52d2a1, 0xbb3a, 0x457d, 0xbc, \ + 0x09, 0x43, 0xa3, 0xf4, 0x31, 0x0a, 0x92) + +static const u32 tas2783_cali_reg[] = { + TAS2783_CAL_R0, + TAS2783_CAL_INVR0, + TAS2783_CAL_R0LOW, + TAS2783_CAL_POWER, + TAS2783_CAL_TLIM, +}; + +struct bin_header_t { + u16 vendor_id; + u16 version; + u32 file_id; + u32 length; +}; + +struct calibration_data { + u32 is_valid; + unsigned long read_sz; + u8 data[TAS2783_CALIB_DATA_SZ]; +}; + +struct tas2783_prv { + struct snd_soc_component *component; + struct calibration_data cali_data; + struct sdw_slave *sdw_peripheral; + enum sdw_slave_status status; + /* calibration */ + struct mutex calib_lock; + /* pde and firmware download */ + struct mutex pde_lock; + struct regmap *regmap; + struct device *dev; + struct class *class; + struct attribute_group *cal_attr_groups; + struct tm tm; + u8 rca_binaryname[64]; + u8 dev_name[32]; + bool hw_init; + /* wq for firmware download */ + wait_queue_head_t fw_wait; + bool fw_dl_task_done; + bool fw_dl_success; +}; + +static const struct reg_default tas2783_reg_default[] = { + {TAS2783_AMP_LEVEL, 0x28}, + {TASDEV_REG_SDW(0, 0, 0x03), 0x28}, + {TASDEV_REG_SDW(0, 0, 0x04), 0x21}, + {TASDEV_REG_SDW(0, 0, 0x05), 0x41}, + {TASDEV_REG_SDW(0, 0, 0x06), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x07), 0x20}, + {TASDEV_REG_SDW(0, 0, 0x08), 0x09}, + {TASDEV_REG_SDW(0, 0, 0x09), 0x02}, + {TASDEV_REG_SDW(0, 0, 0x0a), 0x0a}, + {TASDEV_REG_SDW(0, 0, 0x0c), 0x10}, + {TASDEV_REG_SDW(0, 0, 0x0d), 0x13}, + {TASDEV_REG_SDW(0, 
0, 0x0e), 0xc2}, + {TASDEV_REG_SDW(0, 0, 0x0f), 0x40}, + {TASDEV_REG_SDW(0, 0, 0x10), 0x04}, + {TASDEV_REG_SDW(0, 0, 0x13), 0x13}, + {TASDEV_REG_SDW(0, 0, 0x14), 0x12}, + {TASDEV_REG_SDW(0, 0, 0x15), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x16), 0x12}, + {TASDEV_REG_SDW(0, 0, 0x17), 0x80}, + {TAS2783_DVC_LVL, 0x00}, + {TASDEV_REG_SDW(0, 0, 0x1b), 0x61}, + {TASDEV_REG_SDW(0, 0, 0x1c), 0x36}, + {TASDEV_REG_SDW(0, 0, 0x1d), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x1f), 0x01}, + {TASDEV_REG_SDW(0, 0, 0x20), 0x2e}, + {TASDEV_REG_SDW(0, 0, 0x21), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x34), 0x06}, + {TASDEV_REG_SDW(0, 0, 0x35), 0xbd}, + {TASDEV_REG_SDW(0, 0, 0x36), 0xad}, + {TASDEV_REG_SDW(0, 0, 0x37), 0xa8}, + {TASDEV_REG_SDW(0, 0, 0x38), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x3b), 0xfc}, + {TASDEV_REG_SDW(0, 0, 0x3d), 0xdd}, + {TASDEV_REG_SDW(0, 0, 0x40), 0xf6}, + {TASDEV_REG_SDW(0, 0, 0x41), 0x14}, + {TASDEV_REG_SDW(0, 0, 0x5c), 0x19}, + {TASDEV_REG_SDW(0, 0, 0x5d), 0x80}, + {TASDEV_REG_SDW(0, 0, 0x63), 0x48}, + {TASDEV_REG_SDW(0, 0, 0x65), 0x08}, + {TASDEV_REG_SDW(0, 0, 0x66), 0xb2}, + {TASDEV_REG_SDW(0, 0, 0x67), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x6a), 0x12}, + {TASDEV_REG_SDW(0, 0, 0x6b), 0xfb}, + {TASDEV_REG_SDW(0, 0, 0x6c), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x6d), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x6e), 0x1a}, + {TASDEV_REG_SDW(0, 0, 0x6f), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x70), 0x96}, + {TASDEV_REG_SDW(0, 0, 0x71), 0x02}, + {TASDEV_REG_SDW(0, 0, 0x73), 0x08}, + {TASDEV_REG_SDW(0, 0, 0x75), 0xe0}, + {TASDEV_REG_SDW(0, 0, 0x7a), 0x60}, + {TASDEV_REG_SDW(0, 0, 0x60), 0x21}, + {TASDEV_REG_SDW(0, 1, 0x02), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x17), 0xc0}, + {TASDEV_REG_SDW(0, 1, 0x19), 0x60}, + {TASDEV_REG_SDW(0, 1, 0x35), 0x75}, + {TASDEV_REG_SDW(0, 1, 0x3d), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x3e), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x3f), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x40), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x41), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x42), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x43), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x44), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x45), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x47), 0xab}, + {TASDEV_REG_SDW(0, 0xfd, 0x0d), 0x0d}, + {TASDEV_REG_SDW(0, 0xfd, 0x39), 0x00}, + {TASDEV_REG_SDW(0, 0xfd, 0x3e), 0x00}, + {TASDEV_REG_SDW(0, 0xfd, 0x45), 0x00}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS21, 0x02, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS21, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS24, 0x02, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS24, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS26, 0x02, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS26, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS28, 0x02, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS28, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS127, 0x02, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS127, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU21, 0x01, 1), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU21, 0x02, 1), 0x9c00}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x01, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x01, 1), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x0b, 1), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x01, 1), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x01, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x0b, 1), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 1), 0x1}, + 
{SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 2), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 1), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 2), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x01, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x05, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x01, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x05, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 1), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 2), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 3), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 4), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 5), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 6), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 7), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x06, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT23, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT23, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 1), 0x0}, + {SDW_SDCA_CTL(1, 
TAS2783_SDCA_ENT_OT127, 0x12, 2), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 3), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 4), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 5), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 6), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 7), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 8), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 9), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xa), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xb), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xc), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xd), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xe), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xf), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23, 0x1, 0), 0x3}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23, 0x10, 0), 0x3}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x06, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x13, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x06, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x13, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x05, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x10, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_TG23, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x01, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x06, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x07, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x09, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x0a, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x10, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x13, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x14, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x15, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x16, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_UDMPU23, 0x10, 0), 0x0}, +}; + +static const struct reg_sequence tas2783_init_seq[] = { + REG_SEQ0(SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x10, 0x00), 0x04), + REG_SEQ0(0x00800418, 0x00), + REG_SEQ0(0x00800419, 0x00), + REG_SEQ0(0x0080041a, 0x00), + REG_SEQ0(0x0080041b, 0x00), + REG_SEQ0(0x00800428, 0x40), + REG_SEQ0(0x00800429, 0x00), + REG_SEQ0(0x0080042a, 0x00), + REG_SEQ0(0x0080042b, 0x00), + REG_SEQ0(SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x1, 0x00), 0x00), + REG_SEQ0(0x0080005c, 0xD9), + REG_SEQ0(0x00800082, 0x20), + REG_SEQ0(0x008000a1, 0x00), + REG_SEQ0(0x00800097, 0xc8), + REG_SEQ0(0x00800099, 0x20), + REG_SEQ0(0x008000c7, 0xaa), + REG_SEQ0(0x008000b5, 0x74), + REG_SEQ0(0x00800082, 0x20), + REG_SEQ0(0x00807e8d, 0x0d), + REG_SEQ0(0x00807eb9, 0x53), + REG_SEQ0(0x00807ebe, 0x42), + REG_SEQ0(0x00807ec5, 0x37), + REG_SEQ0(0x00800066, 0x92), + REG_SEQ0(0x00800003, 0x28), + REG_SEQ0(0x00800004, 0x21), + REG_SEQ0(0x00800005, 0x41), + REG_SEQ0(0x00800006, 0x00), + 
REG_SEQ0(0x00800007, 0x20), + REG_SEQ0(0x0080000c, 0x10), + REG_SEQ0(0x00800013, 0x08), + REG_SEQ0(0x00800015, 0x00), + REG_SEQ0(0x00800017, 0x80), + REG_SEQ0(0x0080001a, 0x00), + REG_SEQ0(0x0080001b, 0x22), + REG_SEQ0(0x0080001c, 0x36), + REG_SEQ0(0x0080001d, 0x01), + REG_SEQ0(0x0080001f, 0x00), + REG_SEQ0(0x00800020, 0x2e), + REG_SEQ0(0x00800034, 0x06), + REG_SEQ0(0x00800035, 0xb9), + REG_SEQ0(0x00800036, 0xad), + REG_SEQ0(0x00800037, 0xa8), + REG_SEQ0(0x00800038, 0x00), + REG_SEQ0(0x0080003b, 0xfc), + REG_SEQ0(0x0080003d, 0xdd), + REG_SEQ0(0x00800040, 0xf6), + REG_SEQ0(0x00800041, 0x14), + REG_SEQ0(0x0080005c, 0x19), + REG_SEQ0(0x0080005d, 0x80), + REG_SEQ0(0x00800063, 0x48), + REG_SEQ0(0x00800065, 0x08), + REG_SEQ0(0x00800067, 0x00), + REG_SEQ0(0x0080006a, 0x12), + REG_SEQ0(0x0080006b, 0x7b), + REG_SEQ0(0x0080006c, 0x00), + REG_SEQ0(0x0080006d, 0x00), + REG_SEQ0(0x0080006e, 0x1a), + REG_SEQ0(0x0080006f, 0x00), + REG_SEQ0(0x00800070, 0x96), + REG_SEQ0(0x00800071, 0x02), + REG_SEQ0(0x00800073, 0x08), + REG_SEQ0(0x00800075, 0xe0), + REG_SEQ0(0x0080007a, 0x60), + REG_SEQ0(0x008000bd, 0x00), + REG_SEQ0(0x008000be, 0x00), + REG_SEQ0(0x008000bf, 0x00), + REG_SEQ0(0x008000c0, 0x00), + REG_SEQ0(0x008000c1, 0x00), + REG_SEQ0(0x008000c2, 0x00), + REG_SEQ0(0x008000c3, 0x00), + REG_SEQ0(0x008000c4, 0x00), + REG_SEQ0(0x008000c5, 0x00), + REG_SEQ0(0x00800008, 0x49), + REG_SEQ0(0x00800009, 0x02), + REG_SEQ0(0x0080000a, 0x1a), + REG_SEQ0(0x0080000d, 0x93), + REG_SEQ0(0x0080000e, 0x82), + REG_SEQ0(0x0080000f, 0x42), + REG_SEQ0(0x00800010, 0x84), + REG_SEQ0(0x00800014, 0x0a), + REG_SEQ0(0x00800016, 0x00), + REG_SEQ0(0x00800060, 0x21), +}; + +static int tas2783_sdca_mbq_size(struct device *dev, u32 reg) +{ + switch (reg) { + case 0x000 ... 0x080: /* Data port 0. */ + case 0x100 ... 0x140: /* Data port 1. */ + case 0x200 ... 0x240: /* Data port 2. */ + case 0x300 ... 0x340: /* Data port 3. */ + case 0x400 ... 0x440: /* Data port 4. */ + case 0x500 ... 0x540: /* Data port 5. */ + case 0x800000 ... 0x803fff: /* Page 0 ~ 127. */ + case 0x807e80 ... 0x807eff: /* Page 253. 
*/ + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_UDMPU23, + TAS2783_SDCA_CTL_UDMPU_CLUSTER, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU21, TAS2783_SDCA_CTL_FU_MUTE, + TAS2783_DEVICE_CHANNEL_LEFT): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23, 0x1, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_TG23, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x0a, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x14, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x15, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x16, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT23, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 2): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 3): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 4): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 5): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 6): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 7): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 8): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 9): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xa): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xb): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xc): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xd): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xe): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xf): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS21, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS21, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS24, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS24, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS25, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS25, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS127, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS127, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS26, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS26, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS28, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS28, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x05, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 1): + 
case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 2): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x01, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x05, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x01, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x04, 0): + return 1; + + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 2): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 3): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 4): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 5): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 6): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 7): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU21, 0x02, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x0b, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 2): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x0b, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x0b, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x07, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x09, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x13, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x13, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x11, 0): + return 2; + + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT23, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x06, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x06, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x12, 0): + case SDW_SDCA_CTL(1, 
TAS2783_SDCA_ENT_XU22, 0x13, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x05, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x06, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x06, 0): + return 4; + + default: + return 0; + } +} + +static bool tas2783_readable_register(struct device *dev, unsigned int reg) +{ + return tas2783_sdca_mbq_size(dev, reg) > 0; +} + +static bool tas2783_volatile_register(struct device *dev, u32 reg) +{ + switch (reg) { + case 0x000 ... 0x080: /* Data port 0. */ + case 0x100 ... 0x140: /* Data port 1. */ + case 0x200 ... 0x240: /* Data port 2. */ + case 0x300 ... 0x340: /* Data port 3. */ + case 0x400 ... 0x440: /* Data port 4. */ + case 0x500 ... 0x540: /* Data port 5. */ + case 0x800001: + return true; + + default: + return false; + } +} + +static const struct regmap_config tas_regmap = { + .reg_bits = 32, + .val_bits = 8, + .readable_reg = tas2783_readable_register, + .volatile_reg = tas2783_volatile_register, + .reg_defaults = tas2783_reg_default, + .num_reg_defaults = ARRAY_SIZE(tas2783_reg_default), + .max_register = 0x41008000 + TASDEV_REG_SDW(0xa1, 0x60, 0x7f), + .cache_type = REGCACHE_MAPLE, + .use_single_read = true, + .use_single_write = true, +}; + +static const struct regmap_sdw_mbq_cfg tas2783_mbq_cfg = { + .mbq_size = tas2783_sdca_mbq_size, +}; + +static s32 tas2783_digital_getvol(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return snd_soc_get_volsw(kcontrol, ucontrol); +} + +static s32 tas2783_digital_putvol(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return snd_soc_put_volsw(kcontrol, ucontrol); +} + +static s32 tas2783_amp_getvol(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return snd_soc_get_volsw(kcontrol, ucontrol); +} + +static s32 tas2783_amp_putvol(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return snd_soc_put_volsw(kcontrol, ucontrol); +} + +static const struct snd_kcontrol_new tas2783_snd_controls[] = { + SOC_SINGLE_RANGE_EXT_TLV("Amp Volume", TAS2783_AMP_LEVEL, + 1, 0, 20, 0, tas2783_amp_getvol, + tas2783_amp_putvol, tas2781_amp_tlv), + SOC_SINGLE_RANGE_EXT_TLV("Speaker Volume", TAS2783_DVC_LVL, + 0, 0, 200, 1, tas2783_digital_getvol, + tas2783_digital_putvol, tas2781_dvc_tlv), +}; + +static s32 tas2783_validate_calibdata(struct tas2783_prv *tas_dev, + u8 *data, u32 size) +{ + u32 ts, spk_count, size_calculated; + u32 crc_calculated, crc_read, i; + u32 *tmp_val; + struct tm tm; + + i = 0; + tmp_val = (u32 *)data; + if (tmp_val[i++] != 2783) { + dev_err(tas_dev->dev, "cal data magic number mismatch\n"); + return -EINVAL; + } + + spk_count = tmp_val[i++]; + if (spk_count > TAS2783_CALIB_MAX_SPK_COUNT) { + dev_err(tas_dev->dev, "cal data spk_count too large\n"); + return -EINVAL; + } + + ts = tmp_val[i++]; + time64_to_tm(ts, 0, &tm); + dev_dbg(tas_dev->dev, "cal data timestamp: %ld-%d-%d %d:%d:%d\n", + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); + + size_calculated = + (spk_count * TAS2783_CALIB_PARAMS * sizeof(u32)) + + TAS2783_CALIB_HDR_SZ + TAS2783_CALIB_CRC_SZ; + if (size_calculated > TAS2783_CALIB_DATA_SZ) { + dev_err(tas_dev->dev, "cali data size too large\n"); + return -EINVAL; + } else if (size < size_calculated) { + dev_err(tas_dev->dev, "cali data size mismatch, size=%u vs calc=%u\n", + size, size_calculated); + return -EINVAL; + } + + crc_calculated =
crc32(~0, data, + size_calculated - TAS2783_CALIB_CRC_SZ) ^ ~0; + crc_read = tmp_val[(size_calculated - TAS2783_CALIB_CRC_SZ) / sizeof(u32)]; + if (crc_calculated != crc_read) { + dev_err(tas_dev->dev, + "calib data integrity check fail, 0x%08x vs 0x%08x\n", + crc_calculated, crc_read); + return -EINVAL; + } + + return 0; +} + +static void tas2783_set_calib_params_to_device(struct tas2783_prv *tas_dev, u32 *cali_data) +{ + u32 dev_count, offset, i, device_num; + u32 reg_value; + u8 buf[4]; + + dev_count = cali_data[1]; + offset = 3; + + for (device_num = 0; device_num < dev_count; device_num++) { + if (cali_data[offset] != tas_dev->sdw_peripheral->id.unique_id) { + offset += TAS2783_CALIB_PARAMS; + continue; + } + offset++; + + for (i = 0; i < ARRAY_SIZE(tas2783_cali_reg); i++) { + reg_value = cali_data[offset + i]; + buf[0] = reg_value >> 24; + buf[1] = reg_value >> 16; + buf[2] = reg_value >> 8; + buf[3] = reg_value & 0xff; + regmap_bulk_write(tas_dev->regmap, tas2783_cali_reg[i], + buf, sizeof(u32)); + } + break; + } + + if (device_num == dev_count) + dev_err(tas_dev->dev, "device not found\n"); + else + dev_dbg(tas_dev->dev, "calib data update done\n"); +} + +static s32 tas2783_update_calibdata(struct tas2783_prv *tas_dev) +{ + efi_guid_t efi_guid = TAS2783_CALI_GUID; + u32 attr, i, *tmp_val; + unsigned long size; + s32 ret; + efi_status_t status; + static efi_char16_t efi_names[][32] = { + L"SmartAmpCalibrationData", L"CALI_DATA"}; + + tmp_val = (u32 *)tas_dev->cali_data.data; + attr = 0; + i = 0; + + /* + * In some cases, the calibration is performed in Windows, + * and the data was saved in UEFI variables, which Linux can read. + */ + for (i = 0; i < ARRAY_SIZE(efi_names); i++) { + size = 0; + status = efi.get_variable(efi_names[i], &efi_guid, &attr, + &size, NULL); + if (size > TAS2783_CALIB_DATA_SZ) { + dev_err(tas_dev->dev, "cali data too large\n"); + break; + } + + tas_dev->cali_data.read_sz = size; + if (status == EFI_BUFFER_TOO_SMALL) { + status = efi.get_variable(efi_names[i], &efi_guid, &attr, + &tas_dev->cali_data.read_sz, + tas_dev->cali_data.data); + dev_dbg(tas_dev->dev, "cali get %lu bytes result:%ld\n", + tas_dev->cali_data.read_sz, status); + } + if (status == EFI_SUCCESS) + break; + } + + if (status != EFI_SUCCESS) { + /* Failed to get calibration data from EFI. */ + dev_dbg(tas_dev->dev, "No calibration data in UEFI."); + return 0; + } + + mutex_lock(&tas_dev->calib_lock); + ret = tas2783_validate_calibdata(tas_dev, tas_dev->cali_data.data, + tas_dev->cali_data.read_sz); + if (!ret) + tas2783_set_calib_params_to_device(tas_dev, tmp_val); + mutex_unlock(&tas_dev->calib_lock); + + return ret; +} + +static s32 read_header(const u8 *data, struct bin_header_t *hdr) +{ + hdr->vendor_id = get_unaligned_le16(&data[0]); + hdr->file_id = get_unaligned_le32(&data[2]); + hdr->version = get_unaligned_le16(&data[6]); + hdr->length = get_unaligned_le32(&data[8]); + return 12; +} + +static void tas2783_fw_ready(const struct firmware *fmw, void *context) +{ + struct tas2783_prv *tas_dev = + (struct tas2783_prv *)context; + const u8 *buf = NULL; + s32 offset = 0, img_sz, file_blk_size, ret = 0; + struct bin_header_t hdr; + + if (!fmw || !fmw->data) { + /* No firmware binary, devices will work in ROM mode. */ + dev_err(tas_dev->dev, + "Failed to read %s, no side effect on driver running\n", + tas_dev->rca_binaryname); + ret = -EINVAL; + goto out; + } + + mutex_lock(&tas_dev->pde_lock); + img_sz = fmw->size; + buf = fmw->data; + offset += FW_DL_OFFSET; + while (offset < (img_sz - FW_FL_HDR)) { + memset(&hdr, 0, sizeof(hdr)); + offset += read_header(&buf[offset], &hdr); + dev_dbg(tas_dev->dev, + "vndr=%d, file=%d, version=%d, len=%d, off=%d\n", + hdr.vendor_id, hdr.file_id, hdr.version, + hdr.length, offset); + /* size also includes the header */ + file_blk_size = hdr.length - FW_FL_HDR; + + switch (hdr.file_id) { + case 0: + ret = sdw_nwrite_no_pm(tas_dev->sdw_peripheral, + PRAM_ADDR_START, file_blk_size, + &buf[offset]); + if (ret < 0) + dev_err(tas_dev->dev, + "PRAM update failed: %d\n", ret); + break; + + case 1: + ret = sdw_nwrite_no_pm(tas_dev->sdw_peripheral, + YRAM_ADDR_START, file_blk_size, + &buf[offset]); + if (ret < 0) + dev_err(tas_dev->dev, + "YRAM update failed: %d\n", ret); + + break; + + default: + ret = -EINVAL; + dev_err(tas_dev->dev, "Unsupported file\n"); + break; + } + + if (ret == 0) + offset += file_blk_size; + else + break; + } + mutex_unlock(&tas_dev->pde_lock); + tas2783_update_calibdata(tas_dev); + +out: + if (!ret) + tas_dev->fw_dl_success = true; + tas_dev->fw_dl_task_done = true; + wake_up(&tas_dev->fw_wait); + if (fmw) + release_firmware(fmw); +} + +static inline s32 tas_clear_latch(struct tas2783_prv *priv) +{ + return regmap_update_bits(priv->regmap, + TASDEV_REG_SDW(0, 0, 0x5c), + 0x04, 0x04); +} + +static s32 tas_fu21_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *k, s32 event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct tas2783_prv *tas_dev = snd_soc_component_get_drvdata(component); + s32 mute; + + switch (event) { + case SND_SOC_DAPM_POST_PMU: + mute = 0; + break; + + case SND_SOC_DAPM_PRE_PMD: + mute = 1; + break; + + default: + return 0; + } + + return sdw_write_no_pm(tas_dev->sdw_peripheral, + SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU21, + TAS2783_SDCA_CTL_FU_MUTE, 1), mute); +} + +static s32 tas_fu23_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *k, s32 event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct tas2783_prv *tas_dev = snd_soc_component_get_drvdata(component); + s32 mute; + + switch (event) { + case SND_SOC_DAPM_POST_PMU: + mute = 0; + break; + + case SND_SOC_DAPM_PRE_PMD: + mute = 1; + break; + + default: + return 0; + } + + return sdw_write_no_pm(tas_dev->sdw_peripheral, + SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, + TAS2783_SDCA_CTL_FU_MUTE, 1), mute); +} + +static const struct snd_soc_dapm_widget tas_dapm_widgets[] = { + SND_SOC_DAPM_AIF_IN("ASI", "ASI Playback", 0, SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_AIF_OUT("ASI OUT", "ASI Capture", 0, SND_SOC_NOPM, + 0, 0), + SND_SOC_DAPM_DAC_E("FU21", NULL, SND_SOC_NOPM, 0, 0, tas_fu21_event, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), + SND_SOC_DAPM_DAC_E("FU23", NULL, SND_SOC_NOPM, 0, 0, tas_fu23_event, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), + SND_SOC_DAPM_OUTPUT("SPK"), + SND_SOC_DAPM_INPUT("DMIC"), +}; + +static const struct snd_soc_dapm_route tas_audio_map[] = { + {"FU21", NULL, "ASI"}, + {"SPK", NULL, "FU21"}, + {"FU23", NULL, "ASI"}, + {"SPK", NULL, "FU23"}, + {"ASI OUT", NULL, "DMIC"}, +}; + +static s32 tas_set_sdw_stream(struct snd_soc_dai *dai, + void *sdw_stream, s32 direction) +{ + if (!sdw_stream) + return 0; + + snd_soc_dai_dma_data_set(dai, direction, sdw_stream); + + return 0; +} + +static void tas_sdw_shutdown(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + snd_soc_dai_set_dma_data(dai, substream, NULL); +} + +static s32 tas_sdw_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct tas2783_prv *tas_dev = + snd_soc_component_get_drvdata(component); + struct sdw_stream_config stream_config = {0}; + struct sdw_port_config port_config = {0}; + struct sdw_stream_runtime *sdw_stream; + struct sdw_slave *sdw_peripheral = tas_dev->sdw_peripheral; + s32 ret, retry = 3; + + if (!tas_dev->fw_dl_success) { + dev_err(tas_dev->dev, "playback not allowed before successful fw download\n"); + return -EINVAL; + } + + sdw_stream = snd_soc_dai_get_dma_data(dai, substream); + if (!sdw_stream) + return -EINVAL; + + ret = tas_clear_latch(tas_dev); + if (ret) + dev_err(tas_dev->dev, + "clear latch failed, err=%d\n", ret); + + mutex_lock(&tas_dev->pde_lock); + /* + * Sometimes an error is returned during power-on, so retry + * to make sure the device powers on and the subsequent port + * prepare succeeds. + */ + do { + ret = regmap_write(tas_dev->regmap, + SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23, + TAS2783_SDCA_CTL_REQ_POW_STATE, 0), + TAS2783_SDCA_POW_STATE_ON); + if (!ret) + break; + usleep_range(2000, 2200); + } while (retry--); + mutex_unlock(&tas_dev->pde_lock); + if (ret) + return ret; + + /* SoundWire specific configuration */ + snd_sdw_params_to_config(substream, params, + &stream_config, &port_config); + /* port 1 for playback */ + if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) + port_config.num = 1; + else + port_config.num = 2; + + ret = sdw_stream_add_slave(sdw_peripheral, + &stream_config, &port_config, 1, sdw_stream); + if (ret) + dev_err(dai->dev, "Unable to configure port\n"); + + return ret; +} + +static s32 tas_sdw_pcm_hw_free(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + s32 ret; + struct snd_soc_component *component = dai->component; + struct tas2783_prv *tas_dev = + snd_soc_component_get_drvdata(component); + struct sdw_stream_runtime *sdw_stream = + snd_soc_dai_get_dma_data(dai, substream); + + sdw_stream_remove_slave(tas_dev->sdw_peripheral, sdw_stream); + + mutex_lock(&tas_dev->pde_lock); + ret = regmap_write(tas_dev->regmap, + SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23, + TAS2783_SDCA_CTL_REQ_POW_STATE, 0), + TAS2783_SDCA_POW_STATE_OFF); + mutex_unlock(&tas_dev->pde_lock); + + return ret; +} + +static const struct snd_soc_dai_ops tas_dai_ops = { + .hw_params = tas_sdw_hw_params, + .hw_free = tas_sdw_pcm_hw_free, + .set_stream = tas_set_sdw_stream, + .shutdown = tas_sdw_shutdown, +}; + +static struct snd_soc_dai_driver tas_dai_driver[] = { + { + .name = "tas2783-codec", + .id = 0, + .playback = { + .stream_name = "Playback", + .channels_min = 1, + .channels_max = 4, + .rates = TAS2783_DEVICE_RATES, + .formats = TAS2783_DEVICE_FORMATS, + }, + .capture = { + .stream_name = "Capture", + .channels_min = 1, + .channels_max = 4, + .rates = TAS2783_DEVICE_RATES, + .formats = TAS2783_DEVICE_FORMATS, + }, + .ops = &tas_dai_ops, + .symmetric_rate = 1, + }, +}; + +static s32 tas_component_probe(struct snd_soc_component *component) +{ + struct tas2783_prv *tas_dev = + snd_soc_component_get_drvdata(component); + + tas_dev->component = component; + tas25xx_register_misc(tas_dev->sdw_peripheral); + + return 0; +} + +static void tas_component_remove(struct snd_soc_component *codec) +{ + struct tas2783_prv *tas_dev = + snd_soc_component_get_drvdata(codec); + tas25xx_deregister_misc();
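+ /* The misc device was deregistered above; clear the cached component pointer so nothing uses it past this point. */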
+ tas_dev->component = NULL; +} + +static const struct snd_soc_component_driver soc_codec_driver_tasdevice = { + .probe = tas_component_probe, + .remove = tas_component_remove, + .controls = tas2783_snd_controls, + .num_controls = ARRAY_SIZE(tas2783_snd_controls), + .dapm_widgets = tas_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(tas_dapm_widgets), + .dapm_routes = tas_audio_map, + .num_dapm_routes = ARRAY_SIZE(tas_audio_map), + .idle_bias_on = 1, + .endianness = 1, +}; + +static s32 tas_init(struct tas2783_prv *tas_dev) +{ + s32 ret; + + dev_set_drvdata(tas_dev->dev, tas_dev); + ret = devm_snd_soc_register_component(tas_dev->dev, + &soc_codec_driver_tasdevice, + tas_dai_driver, + ARRAY_SIZE(tas_dai_driver)); + if (ret) { + dev_err(tas_dev->dev, "%s: codec register error:%d.\n", + __func__, ret); + return ret; + } + + /* set autosuspend parameters */ + pm_runtime_set_autosuspend_delay(tas_dev->dev, 3000); + pm_runtime_use_autosuspend(tas_dev->dev); + /* make sure the device does not suspend immediately */ + pm_runtime_mark_last_busy(tas_dev->dev); + pm_runtime_enable(tas_dev->dev); + + return ret; +} + +static s32 tas_read_prop(struct sdw_slave *slave) +{ + struct sdw_slave_prop *prop = &slave->prop; + s32 nval; + s32 i, j; + u32 bit; + unsigned long addr; + struct sdw_dpn_prop *dpn; + + prop->scp_int1_mask = + SDW_SCP_INT1_BUS_CLASH | SDW_SCP_INT1_PARITY; + prop->quirks = SDW_SLAVE_QUIRKS_INVALID_INITIAL_PARITY; + + prop->paging_support = true; + + /* first we need to allocate memory for set bits in port lists */ + prop->source_ports = 0x04; /* BITMAP: 00000100 */ + prop->sink_ports = 0x2; /* BITMAP: 00000010 */ + + nval = hweight32(prop->source_ports); + prop->src_dpn_prop = devm_kcalloc(&slave->dev, nval, + sizeof(*prop->src_dpn_prop), GFP_KERNEL); + if (!prop->src_dpn_prop) + return -ENOMEM; + + i = 0; + dpn = prop->src_dpn_prop; + addr = prop->source_ports; + for_each_set_bit(bit, &addr, 32) { + dpn[i].num = bit; + dpn[i].type = SDW_DPN_FULL; + dpn[i].simple_ch_prep_sm = false; + dpn[i].ch_prep_timeout = 10; + i++; + } + + /* do this again for sink now */ + nval = hweight32(prop->sink_ports); + prop->sink_dpn_prop = devm_kcalloc(&slave->dev, nval, + sizeof(*prop->sink_dpn_prop), GFP_KERNEL); + if (!prop->sink_dpn_prop) + return -ENOMEM; + + j = 0; + dpn = prop->sink_dpn_prop; + addr = prop->sink_ports; + for_each_set_bit(bit, &addr, 32) { + dpn[j].num = bit; + dpn[j].type = SDW_DPN_FULL; + dpn[j].simple_ch_prep_sm = false; + dpn[j].ch_prep_timeout = 10; + j++; + } + + /* set the timeout values */ + prop->clk_stop_timeout = 200; + + return 0; +} + +static s32 tas2783_sdca_dev_suspend(struct device *dev) +{ + struct tas2783_prv *tas_dev = dev_get_drvdata(dev); + + if (!tas_dev->hw_init) + return 0; + + regcache_cache_only(tas_dev->regmap, true); + return 0; +} + +static s32 tas2783_sdca_dev_system_suspend(struct device *dev) +{ + return tas2783_sdca_dev_suspend(dev); +} + +static s32 tas2783_sdca_dev_resume(struct device *dev) +{ + struct sdw_slave *slave = dev_to_sdw_dev(dev); + struct tas2783_prv *tas_dev = dev_get_drvdata(dev); + unsigned long t; + + if (!slave->unattach_request) + goto regmap_sync; + + t = wait_for_completion_timeout(&slave->initialization_complete, + msecs_to_jiffies(TAS2783_PROBE_TIMEOUT)); + if (!t) { + dev_err(&slave->dev, "resume: initialization timed out\n"); + sdw_show_ping_status(slave->bus, true); + return -ETIMEDOUT; + } + + slave->unattach_request = 0; + +regmap_sync: + regcache_cache_only(tas_dev->regmap, false); + regcache_sync(tas_dev->regmap); + 
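+ /* regcache_sync() above flushed any register writes cached while the device was suspended or detached back to the hardware. */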
return 0; +} + +static const struct dev_pm_ops tas2783_sdca_pm = { + SYSTEM_SLEEP_PM_OPS(tas2783_sdca_dev_system_suspend, tas2783_sdca_dev_resume) + RUNTIME_PM_OPS(tas2783_sdca_dev_suspend, tas2783_sdca_dev_resume, NULL) +}; + +static s32 tas_io_init(struct device *dev, struct sdw_slave *slave) +{ + struct tas2783_prv *tas_dev = dev_get_drvdata(dev); + s32 ret; + u8 unique_id = tas_dev->sdw_peripheral->id.unique_id; + + if (tas_dev->hw_init) + return 0; + + tas_dev->fw_dl_task_done = false; + tas_dev->fw_dl_success = false; + scnprintf(tas_dev->rca_binaryname, sizeof(tas_dev->rca_binaryname), + "tas2783-%01x.bin", unique_id); + + ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_UEVENT, + tas_dev->rca_binaryname, tas_dev->dev, + GFP_KERNEL, tas_dev, tas2783_fw_ready); + if (ret) { + dev_err(tas_dev->dev, + "firmware request failed for uid=%d, ret=%d\n", + unique_id, ret); + return ret; + } + + ret = wait_event_timeout(tas_dev->fw_wait, tas_dev->fw_dl_task_done, + msecs_to_jiffies(TIMEOUT_FW_DL_MS)); + if (!ret) { + dev_err(tas_dev->dev, "fw request, wait_event timeout\n"); + ret = -EAGAIN; + } else { + ret = regmap_multi_reg_write(tas_dev->regmap, tas2783_init_seq, + ARRAY_SIZE(tas2783_init_seq)); + tas_dev->hw_init = true; + } + + return ret; +} + +static s32 tas_update_status(struct sdw_slave *slave, + enum sdw_slave_status status) +{ + struct tas2783_prv *tas_dev = dev_get_drvdata(&slave->dev); + struct device *dev = &slave->dev; + + dev_dbg(dev, "Peripheral status = %s\n", + status == SDW_SLAVE_UNATTACHED ? "unattached" : + status == SDW_SLAVE_ATTACHED ? "attached" : "alert"); + + tas_dev->status = status; + if (status == SDW_SLAVE_UNATTACHED) + tas_dev->hw_init = false; + + /* Perform initialization only if the slave is attached + * and the hw_init flag is false. + */ + if (tas_dev->hw_init || tas_dev->status != SDW_SLAVE_ATTACHED) + return 0; + + /* write the cached register data back to the device */ + regcache_cache_only(tas_dev->regmap, false); + regcache_sync(tas_dev->regmap); + + /* perform I/O transfers required for Slave initialization */ + return tas_io_init(&slave->dev, slave); +} + +static const struct sdw_slave_ops tas_sdw_ops = { + .read_prop = tas_read_prop, + .update_status = tas_update_status, +}; + +static void tas_remove(struct tas2783_prv *tas_dev) +{ + snd_soc_unregister_component(tas_dev->dev); +} + +static s32 tas_sdw_probe(struct sdw_slave *peripheral, + const struct sdw_device_id *id) +{ + struct regmap *regmap; + struct device *dev = &peripheral->dev; + struct tas2783_prv *tas_dev; + + tas_dev = devm_kzalloc(dev, sizeof(*tas_dev), GFP_KERNEL); + if (!tas_dev) + return dev_err_probe(dev, -ENOMEM, + "Failed devm_kzalloc"); + + tas_dev->dev = dev; + tas_dev->sdw_peripheral = peripheral; + tas_dev->hw_init = false; + mutex_init(&tas_dev->calib_lock); + mutex_init(&tas_dev->pde_lock); + + init_waitqueue_head(&tas_dev->fw_wait); + dev_set_drvdata(dev, tas_dev); + regmap = devm_regmap_init_sdw_mbq_cfg(peripheral, + &tas_regmap, + &tas2783_mbq_cfg); + if (IS_ERR(regmap)) + return dev_err_probe(dev, PTR_ERR(regmap), + "Failed devm_regmap_init_sdw."); + + /* keep in cache until the device is fully initialized */ + regcache_cache_only(regmap, true); + tas_dev->regmap = regmap; + return tas_init(tas_dev); +} + +static s32 tas_sdw_remove(struct sdw_slave *peripheral) +{ + struct tas2783_prv *tas_dev = dev_get_drvdata(&peripheral->dev); + + pm_runtime_disable(tas_dev->dev); + tas_remove(tas_dev); + mutex_destroy(&tas_dev->calib_lock); + mutex_destroy(&tas_dev->pde_lock); +
dev_set_drvdata(&peripheral->dev, NULL); + + return 0; +} + +static const struct sdw_device_id tas_sdw_id[] = { + /* chipid for the TAS2783 is 0x0000 */ + SDW_SLAVE_ENTRY(0x0102, 0x0000, 0), + {}, +}; +MODULE_DEVICE_TABLE(sdw, tas_sdw_id); + +static struct sdw_driver tas_sdw_driver = { + .driver = { + .name = "slave-tas2783", + .pm = pm_ptr(&tas2783_sdca_pm), + }, + .probe = tas_sdw_probe, + .remove = tas_sdw_remove, + .ops = &tas_sdw_ops, + .id_table = tas_sdw_id, +}; +module_sdw_driver(tas_sdw_driver); + +MODULE_AUTHOR("Texas Instruments Inc."); +MODULE_DESCRIPTION("ASoC TAS2783 SoundWire Driver"); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/tas2783.h b/sound/soc/codecs/tas2783.h new file mode 100644 index 00000000000000..794333e0a35029 --- /dev/null +++ b/sound/soc/codecs/tas2783.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * ALSA SoC Texas Instruments TAS2783 Audio Smart Amplifier + * + * Copyright (C) 2025 Texas Instruments Incorporated + * https://www.ti.com + * + * The TAS2783 driver implements a flexible and configurable + * algo coefficient setting for single TAS2783 chips. + * + * Author: Niranjan H Y + * Author: Baojun Xu + */ +#include + +#ifndef __TAS2783_H__ +#define __TAS2783_H__ + +#define TAS2783_DEVICE_RATES (SNDRV_PCM_RATE_44100 | \ + SNDRV_PCM_RATE_48000 | \ + SNDRV_PCM_RATE_96000 | \ + SNDRV_PCM_RATE_88200) +#define TAS2783_DEVICE_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | \ + SNDRV_PCM_FMTBIT_S24_LE | \ + SNDRV_PCM_FMTBIT_S32_LE) + +/* book, page, register */ +#define TASDEV_REG_SDW(book, page, reg) (((book) * 256 * 128) + \ + 0x800000 + ((page) * 128) + (reg)) + +/* Volume control */ +#define TAS2783_DVC_LVL TASDEV_REG_SDW(0x0, 0x00, 0x1A) +#define TAS2783_AMP_LEVEL TASDEV_REG_SDW(0x0, 0x00, 0x03) +#define TAS2783_AMP_LEVEL_MASK GENMASK(5, 1) + +#define PRAM_ADDR_START TASDEV_REG_SDW(0x8c, 0x01, 0x8) +#define PRAM_ADDR_END TASDEV_REG_SDW(0x8c, 0xff, 0x7f) +#define YRAM_ADDR_START TASDEV_REG_SDW(0x00, 0x02, 0x8) +#define YRAM_ADDR_END TASDEV_REG_SDW(0x00, 0x37, 0x7f) + +/* Calibration data */ +#define TAS2783_CAL_R0 TASDEV_REG_SDW(0, 0x16, 0x4C) +#define TAS2783_CAL_INVR0 TASDEV_REG_SDW(0, 0x16, 0x5C) +#define TAS2783_CAL_R0LOW TASDEV_REG_SDW(0, 0x16, 0x64) +#define TAS2783_CAL_POWER TASDEV_REG_SDW(0, 0x15, 0x44) +#define TAS2783_CAL_TLIM TASDEV_REG_SDW(0, 0x17, 0x58) + +/* TAS2783 SDCA Control - function number */ +#define FUNC_NUM_SMART_AMP 0x01 + +/* TAS2783 SDCA entity */ + +#define TAS2783_SDCA_ENT_FU21 0x01 +#define TAS2783_SDCA_ENT_FU23 0x02 +#define TAS2783_SDCA_ENT_FU26 0x03 +#define TAS2783_SDCA_ENT_XU22 0x04 +#define TAS2783_SDCA_ENT_CS24 0x05 +#define TAS2783_SDCA_ENT_CS21 0x06 +#define TAS2783_SDCA_ENT_CS25 0x07 +#define TAS2783_SDCA_ENT_CS26 0x08 +#define TAS2783_SDCA_ENT_CS28 0x09 +#define TAS2783_SDCA_ENT_PDE23 0x0C +#define TAS2783_SDCA_ENT_UDMPU23 0x0E +#define TAS2783_SDCA_ENT_SAPU29 0x0F +#define TAS2783_SDCA_ENT_PPU21 0x10 +#define TAS2783_SDCA_ENT_PPU26 0x11 +#define TAS2783_SDCA_ENT_TG23 0x12 +#define TAS2783_SDCA_ENT_IT21 0x13 +#define TAS2783_SDCA_ENT_IT29 0x14 +#define TAS2783_SDCA_ENT_IT26 0x15 +#define TAS2783_SDCA_ENT_IT28 0x16 +#define TAS2783_SDCA_ENT_OT24 0x17 +#define TAS2783_SDCA_ENT_OT23 0x18 +#define TAS2783_SDCA_ENT_OT25 0x19 +#define TAS2783_SDCA_ENT_OT28 0x1A +#define TAS2783_SDCA_ENT_MU26 0x1b +#define TAS2783_SDCA_ENT_OT127 0x1E +#define TAS2783_SDCA_ENT_FU127 0x1F +#define TAS2783_SDCA_ENT_CS127 0x20 +#define TAS2783_SDCA_ENT_MFPU21 0x22 +#define TAS2783_SDCA_ENT_MFPU26 0x23 + +/* TAS2783 SDCA control */ 
+#define TAS2783_SDCA_CTL_REQ_POW_STATE 0x01 +#define TAS2783_SDCA_CTL_FU_MUTE 0x01 +#define TAS2783_SDCA_CTL_UDMPU_CLUSTER 0x10 + +#define TAS2783_DEVICE_CHANNEL_LEFT 1 +#define TAS2783_DEVICE_CHANNEL_RIGHT 2 + +#define TAS2783_SDCA_POW_STATE_ON 0 +#define TAS2783_SDCA_POW_STATE_OFF 3 + +/* calibration data */ +#define TAS2783_CALIB_PARAMS 6 /* 5 + 1 unique id */ +#define TAS2783_CALIB_MAX_SPK_COUNT 8 +#define TAS2783_CALIB_HDR_SZ 12 +#define TAS2783_CALIB_CRC_SZ 4 +#define TAS2783_CALIB_DATA_SZ ((TAS2783_CALIB_HDR_SZ) + TAS2783_CALIB_CRC_SZ + \ + ((TAS2783_CALIB_PARAMS) * 4 * (TAS2783_CALIB_MAX_SPK_COUNT))) + +#if IS_ENABLED(CONFIG_SND_SOC_TAS2783_UTIL) +int32_t tas25xx_register_misc(struct sdw_slave *peripheral); +int32_t tas25xx_deregister_misc(void); +#else +static inline int32_t tas25xx_register_misc(struct sdw_slave *peripheral) +{ + return 0; +} +static inline int32_t tas25xx_deregister_misc(void) +{ + return 0; +} +#endif + +#endif /*__TAS2783_H__ */ From 96384a34dd15b0e7357a34af5c848d1115a35e62 Mon Sep 17 00:00:00 2001 From: Niranjan H Y Date: Fri, 12 Sep 2025 14:06:22 +0530 Subject: [PATCH 2473/3206] ASoC: tas2783A: machine driver amp utility for TI devices A machine driver amp utility file to initialize and support multiple tas2783a devices is added. Reviewed-by: Bard Liao Signed-off-by: Niranjan H Y -- v5: - removed empty line in soc_sdw_ti_amp.c Link: https://patch.msgid.link/20250912083624.804-3-niranjan.hy@ti.com Signed-off-by: Mark Brown --- include/sound/soc_sdw_utils.h | 8 +++ sound/soc/sdw_utils/Makefile | 3 +- sound/soc/sdw_utils/soc_sdw_ti_amp.c | 92 ++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 sound/soc/sdw_utils/soc_sdw_ti_amp.c diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index 6049a5d0cfcd8e..3c5e9b2af7f1a6 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -248,5 +248,13 @@ int asoc_sdw_cs42l43_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_so int asoc_sdw_cs42l43_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_cs_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_maxim_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +/* TI */ +int asoc_sdw_ti_amp_init(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_links, + struct asoc_sdw_codec_info *info, + bool playback); +int asoc_sdw_ti_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +int asoc_sdw_ti_amp_initial_settings(struct snd_soc_card *card, + const char *name_prefix); #endif diff --git a/sound/soc/sdw_utils/Makefile b/sound/soc/sdw_utils/Makefile index daf01911355371..a87c53e1a2c18e 100644 --- a/sound/soc/sdw_utils/Makefile +++ b/sound/soc/sdw_utils/Makefile @@ -6,5 +6,6 @@ snd-soc-sdw-utils-y := soc_sdw_utils.o soc_sdw_dmic.o soc_sdw_rt_dmic.o \ soc_sdw_bridge_cs35l56.o \ soc_sdw_cs42l42.o soc_sdw_cs42l43.o \ soc_sdw_cs_amp.o \ - soc_sdw_maxim.o + soc_sdw_maxim.o \ + soc_sdw_ti_amp.o obj-$(CONFIG_SND_SOC_SDW_UTILS) += snd-soc-sdw-utils.o diff --git a/sound/soc/sdw_utils/soc_sdw_ti_amp.c b/sound/soc/sdw_utils/soc_sdw_ti_amp.c new file mode 100644 index 00000000000000..f0011360ae9b6c --- /dev/null +++ b/sound/soc/sdw_utils/soc_sdw_ti_amp.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2025 Texas Instruments Inc.
+ +/* + * soc_sdw_ti_amp - Helpers to handle TI's SoundWire based codecs + */ + +#include +#include +#include +#include +#include +#include + +#define TIAMP_SPK_VOLUME_0DB 200 + +int asoc_sdw_ti_amp_initial_settings(struct snd_soc_card *card, + const char *name_prefix) +{ + char *volume_ctl_name; + int ret; + + volume_ctl_name = kasprintf(GFP_KERNEL, "%s Speaker Volume", + name_prefix); + if (!volume_ctl_name) + return -ENOMEM; + + ret = snd_soc_limit_volume(card, volume_ctl_name, + TIAMP_SPK_VOLUME_0DB); + if (ret) + dev_err(card->dev, + "%s update failed %d\n", + volume_ctl_name, ret); + + kfree(volume_ctl_name); + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_ti_amp_initial_settings, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_ti_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, + struct snd_soc_dai *dai) +{ + struct snd_soc_card *card = rtd->card; + char widget_name[16]; + char speaker[16]; + struct snd_soc_dapm_route route = {speaker, NULL, widget_name}; + struct snd_soc_dai *codec_dai; + const char *prefix; + int i, ret = 0; + + for_each_rtd_codec_dais(rtd, i, codec_dai) { + if (!strstr(codec_dai->name, "tas2783")) + continue; + + prefix = codec_dai->component->name_prefix; + if (!strncmp(prefix, "tas2783-1", strlen("tas2783-1"))) { + strscpy(speaker, "Left Spk", sizeof(speaker)); + } else if (!strncmp(prefix, "tas2783-2", strlen("tas2783-2"))) { + strscpy(speaker, "Right Spk", sizeof(speaker)); + } else { + ret = -EINVAL; + dev_err(card->dev, "unhandled prefix %s\n", prefix); + break; + } + + snprintf(widget_name, sizeof(widget_name), "%s SPK", prefix); + ret = asoc_sdw_ti_amp_initial_settings(card, prefix); + if (ret) + return ret; + + ret = snd_soc_dapm_add_routes(&card->dapm, &route, 1); + if (ret) + return ret; + } + + return ret; +} +EXPORT_SYMBOL_NS(asoc_sdw_ti_spk_rtd_init, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_ti_amp_init(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_links, + struct asoc_sdw_codec_info *info, + bool playback) +{ + if (!playback) + return 0; + + info->amp_num++; + + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_ti_amp_init, "SND_SOC_SDW_UTILS"); From b41949a2109e49cb96a1dc292efa249933e5232e Mon Sep 17 00:00:00 2001 From: Niranjan H Y Date: Fri, 12 Sep 2025 14:06:23 +0530 Subject: [PATCH 2474/3206] ASoC: tas2783A: add machine driver changes Add a tas2783-codec entry to codec_info_list. Reviewed-by: Bard Liao Signed-off-by: Niranjan H Y Link: https://patch.msgid.link/20250912083624.804-4-niranjan.hy@ti.com Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_utils.c | 38 +++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index 1580331cd34c58..56c72ef27e7b1f 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -35,12 +35,12 @@ static const struct snd_kcontrol_new generic_spk_controls[] = { SOC_DAPM_PIN_SWITCH("Speaker"), }; -static const struct snd_soc_dapm_widget maxim_widgets[] = { +static const struct snd_soc_dapm_widget lr_spk_widgets[] = { SND_SOC_DAPM_SPK("Left Spk", NULL), SND_SOC_DAPM_SPK("Right Spk", NULL), }; -static const struct snd_kcontrol_new maxim_controls[] = { +static const struct snd_kcontrol_new lr_spk_controls[] = { SOC_DAPM_PIN_SWITCH("Left Spk"), SOC_DAPM_PIN_SWITCH("Right Spk"), }; @@ -58,6 +58,24 @@ static const struct snd_kcontrol_new rt700_controls[] = { }; struct asoc_sdw_codec_info codec_info_list[] = { + { + .part_id = 0x0000, /* TAS2783A */ + .dais = { + { + .direction = {true, true}, + .dai_name =
"tas2783-codec", + .dai_type = SOC_SDW_DAI_TYPE_AMP, + .dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_AMP_IN_DAI_ID}, + .init = asoc_sdw_ti_amp_init, + .rtd_init = asoc_sdw_ti_spk_rtd_init, + .controls = lr_spk_controls, + .num_controls = ARRAY_SIZE(lr_spk_controls), + .widgets = lr_spk_widgets, + .num_widgets = ARRAY_SIZE(lr_spk_widgets), + }, + }, + .dai_num = 1, + }, { .part_id = 0x700, .dais = { @@ -450,10 +468,10 @@ struct asoc_sdw_codec_info codec_info_list[] = { .dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_AMP_IN_DAI_ID}, .init = asoc_sdw_maxim_init, .rtd_init = asoc_sdw_maxim_spk_rtd_init, - .controls = maxim_controls, - .num_controls = ARRAY_SIZE(maxim_controls), - .widgets = maxim_widgets, - .num_widgets = ARRAY_SIZE(maxim_widgets), + .controls = lr_spk_controls, + .num_controls = ARRAY_SIZE(lr_spk_controls), + .widgets = lr_spk_widgets, + .num_widgets = ARRAY_SIZE(lr_spk_widgets), }, }, .dai_num = 1, @@ -469,10 +487,10 @@ struct asoc_sdw_codec_info codec_info_list[] = { .dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID}, .init = asoc_sdw_maxim_init, .rtd_init = asoc_sdw_maxim_spk_rtd_init, - .controls = maxim_controls, - .num_controls = ARRAY_SIZE(maxim_controls), - .widgets = maxim_widgets, - .num_widgets = ARRAY_SIZE(maxim_widgets), + .controls = lr_spk_controls, + .num_controls = ARRAY_SIZE(lr_spk_controls), + .widgets = lr_spk_widgets, + .num_widgets = ARRAY_SIZE(lr_spk_widgets), }, }, .dai_num = 1, From 63b4c34635cf32af023796b64c855dd1ed0f0a4f Mon Sep 17 00:00:00 2001 From: Niranjan H Y Date: Fri, 12 Sep 2025 14:06:24 +0530 Subject: [PATCH 2475/3206] tas2783A: Add acpi match changes for Intel MTL acpi match changes to support tas2783a on mtl on link 0 for 2 devices Reviewed-by: Bard Liao Signed-off-by: Niranjan H Y Link: https://patch.msgid.link/20250912083624.804-5-niranjan.hy@ti.com Signed-off-by: Mark Brown --- .../intel/common/soc-acpi-intel-mtl-match.c | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c index 75dc8935a79460..ec9fd8486c0534 100644 --- a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c @@ -948,6 +948,30 @@ static const struct snd_soc_acpi_adr_device cs42l42_0_adr[] = { } }; +static const struct snd_soc_acpi_adr_device tas2783_0_adr[] = { + { + .adr = 0x0000380102000001ull, + .num_endpoints = 1, + .endpoints = &spk_l_endpoint, + .name_prefix = "tas2783-1" + }, + { + .adr = 0x0000390102000001ull, + .num_endpoints = 1, + .endpoints = &spk_r_endpoint, + .name_prefix = "tas2783-2" + } +}; + +static const struct snd_soc_acpi_link_adr tas2783_link0[] = { + { + .mask = BIT(0), + .num_adr = ARRAY_SIZE(tas2783_0_adr), + .adr_d = tas2783_0_adr, + }, + {} +}; + static const struct snd_soc_acpi_link_adr cs42l42_link0_max98363_link2[] = { /* Expected order: jack -> amp */ { @@ -1080,6 +1104,12 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_mtl_sdw_machines[] = { .drv_name = "sof_sdw", .sof_tplg_filename = "sof-mtl-rt715-rt711-rt1308-mono.tplg", }, + { + .link_mask = BIT(0), + .links = tas2783_link0, + .drv_name = "sof_sdw", + .sof_tplg_filename = "sof-mtl-tas2783.tplg", + }, { .link_mask = GENMASK(3, 0), .links = mtl_rt713_l0_rt1316_l12_rt1713_l3, From bad11557eed2592c017a06752e58df49080b4a6a Mon Sep 17 00:00:00 2001 From: Koichi Okuno Date: Tue, 9 Sep 2025 12:02:50 +0900 Subject: [PATCH 2476/3206] perf: Fujitsu: Add the Uncore PMU driver This adds a new dynamic PMU to the Perf Events 
framework to program and control the Uncore PMUs in Fujitsu chips. This driver exports formatting and event information to sysfs so it can be used by the perf user space tools with the syntaxes: perf stat -e pci_iod0_pci0/ea-pci/ ls perf stat -e pci_iod0_pci0/event=0x80/ ls perf stat -e mac_iod0_mac0_ch0/ea-mac/ ls perf stat -e mac_iod0_mac0_ch0/event=0x80/ ls FUJITSU-MONAKA PMU Events Specification v1.1 URL: https://github.com/fujitsu/FUJITSU-MONAKA Reviewed-by: Yicong Yang Signed-off-by: Koichi Okuno Signed-off-by: Will Deacon --- .../admin-guide/perf/fujitsu_uncore_pmu.rst | 110 ++++ Documentation/admin-guide/perf/index.rst | 1 + drivers/perf/Kconfig | 9 + drivers/perf/Makefile | 1 + drivers/perf/fujitsu_uncore_pmu.c | 613 ++++++++++++++++++ 5 files changed, 734 insertions(+) create mode 100644 Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst create mode 100644 drivers/perf/fujitsu_uncore_pmu.c diff --git a/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst b/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst new file mode 100644 index 00000000000000..46595b788d3acd --- /dev/null +++ b/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst @@ -0,0 +1,110 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +================================================ +Fujitsu Uncore Performance Monitoring Unit (PMU) +================================================ + +This driver supports the Uncore MAC PMUs and the Uncore PCI PMUs found +in Fujitsu chips. +Each MAC PMU on these chips is exposed as an uncore perf PMU with device name +mac_iod<iod>_mac<mac>_ch<ch>. +Each PCI PMU on these chips is exposed as an uncore perf PMU with device name +pci_iod<iod>_pci<pci>. + +The driver provides a description of its available events and configuration +options in sysfs, see /sys/bus/event_source/devices/mac_iod<iod>_mac<mac>_ch<ch>/ +and /sys/bus/event_source/devices/pci_iod<iod>_pci<pci>/. +This driver exports: +- formats, used by perf user space and other tools to configure events +- events, used by perf user space and other tools to create events + symbolically, e.g.: + perf stat -a -e mac_iod0_mac0_ch0/event=0x21/ ls + perf stat -a -e pci_iod0_pci0/event=0x24/ ls +- cpumask, used by perf user space and other tools to know on which CPUs + to open the events + +This driver supports the following events for MAC: +- cycles + This event counts MAC cycles at MAC frequency. +- read-count + This event counts the number of read requests to MAC. +- read-count-request + This event counts the number of read requests including retry to MAC. +- read-count-return + This event counts the number of responses to read requests to MAC. +- read-count-request-pftgt + This event counts the number of read requests including retry with PFTGT + flag. +- read-count-request-normal + This event counts the number of read requests including retry without PFTGT + flag. +- read-count-return-pftgt-hit + This event counts the number of responses to read requests which hit the + PFTGT buffer. +- read-count-return-pftgt-miss + This event counts the number of responses to read requests which miss the + PFTGT buffer. +- read-wait + This event counts outstanding read requests issued by the DDR memory + controller per cycle. +- write-count + This event counts the number of write requests to MAC (including zero write, + full write, partial write, write cancel). +- write-count-write + This event counts the number of full write requests to MAC (not including + zero write). +- write-count-pwrite + This event counts the number of partial write requests to MAC.
+- memory-read-count + This event counts the number of read requests from MAC to memory. +- memory-write-count + This event counts the number of full write requests from MAC to memory. +- memory-pwrite-count + This event counts the number of partial write requests from MAC to memory. +- ea-mac + This event counts energy consumption of MAC. +- ea-memory + This event counts energy consumption of memory. +- ea-memory-mac-write + This event counts the number of write requests from MAC to memory. +- ea-ha + This event counts energy consumption of HA. + + 'ea' is the abbreviation for 'Energy Analyzer'. + +Examples for use with perf:: + + perf stat -e mac_iod0_mac0_ch0/ea-mac/ ls + +And, this driver supports the following events for PCI: +- pci-port0-cycles + This event counts PCI cycles at PCI frequency in port0. +- pci-port0-read-count + This event counts read transactions for data transfer in port0. +- pci-port0-read-count-bus + This event counts read transactions for bus usage in port0. +- pci-port0-write-count + This event counts write transactions for data transfer in port0. +- pci-port0-write-count-bus + This event counts write transactions for bus usage in port0. +- pci-port1-cycles + This event counts PCI cycles at PCI frequency in port1. +- pci-port1-read-count + This event counts read transactions for data transfer in port1. +- pci-port1-read-count-bus + This event counts read transactions for bus usage in port1. +- pci-port1-write-count + This event counts write transactions for data transfer in port1. +- pci-port1-write-count-bus + This event counts write transactions for bus usage in port1. +- ea-pci + This event counts energy consumption of PCI. + + 'ea' is the abbreviation for 'Energy Analyzer'. + +Examples for use with perf:: + + perf stat -e pci_iod0_pci0/ea-pci/ ls + +Given that these are uncore PMUs the driver does not support sampling, therefore +"perf record" will not work. Per-task perf sessions are not supported. diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 072b510385c417..47d9a3df6329bc 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -29,3 +29,4 @@ Performance monitor support cxl ampere_cspmu mrvl-pem-pmu + fujitsu_uncore_pmu diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index a9188dec36fe15..638321fc9800ca 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -178,6 +178,15 @@ config FSL_IMX9_DDR_PMU can give information about memory throughput and other related events. +config FUJITSU_UNCORE_PMU + tristate "Fujitsu Uncore PMU" + depends on (ARM64 && ACPI) || (COMPILE_TEST && 64BIT) + help + Provides support for the Uncore performance monitor unit (PMU) + in Fujitsu processors. + Adds the Uncore PMU into the perf events subsystem for + monitoring Uncore events. 
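The perfmon register map used by the driver below groups each counter's registers into three banks with an 8-byte stride: the 64-bit event counter, the per-counter control register, and the event select register. A minimal standalone C sketch (not taken from the patch; only the macro shapes are copied from the driver sources that follow) spells out the arithmetic for counter 3:

#include <assert.h>

/* Per-counter perfmon registers, 8-byte stride, mirroring the driver's layout. */
#define PM_EVCNTR(c) (0x000 + (c) * 8) /* 64-bit event counter */
#define PM_CNTCTL(c) (0x100 + (c) * 8) /* per-counter control */
#define PM_EVTYPE(c) (0x200 + (c) * 8) /* event select */

/* Counter 3 therefore lives at 0x018/0x118/0x218. */
static_assert(PM_EVCNTR(3) == 0x018, "counter 3 value register");
static_assert(PM_CNTCTL(3) == 0x118, "counter 3 control register");
static_assert(PM_EVTYPE(3) == 0x218, "counter 3 event select");

int main(void) { return 0; }

With an event code written to PM_EVTYPE(n) and the counter's enable bit set via PM_CNTENSET, the 64-bit register at PM_EVCNTR(n) accumulates that event; this is the sequence fujitsu_uncore_counter_start() below performs.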
+ config QCOM_L2_PMU bool "Qualcomm Technologies L2-cache PMU" depends on ARCH_QCOM && ARM64 && ACPI diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 192fc8b16204dc..ea52711a87e326 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_ARM_XSCALE_PMU) += arm_xscale_pmu.o obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o obj-$(CONFIG_FSL_IMX8_DDR_PMU) += fsl_imx8_ddr_perf.o obj-$(CONFIG_FSL_IMX9_DDR_PMU) += fsl_imx9_ddr_perf.o +obj-$(CONFIG_FUJITSU_UNCORE_PMU) += fujitsu_uncore_pmu.o obj-$(CONFIG_HISI_PMU) += hisilicon/ obj-$(CONFIG_QCOM_L2_PMU) += qcom_l2_pmu.o obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o diff --git a/drivers/perf/fujitsu_uncore_pmu.c b/drivers/perf/fujitsu_uncore_pmu.c new file mode 100644 index 00000000000000..c3c6f56474adda --- /dev/null +++ b/drivers/perf/fujitsu_uncore_pmu.c @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Driver for the Uncore PMUs in Fujitsu chips. + * + * See Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst for more details. + * + * Copyright (c) 2025 Fujitsu. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Number of counters on each PMU */ +#define MAC_NUM_COUNTERS 8 +#define PCI_NUM_COUNTERS 8 +/* Mask for the event type field within perf_event_attr.config and EVTYPE reg */ +#define UNCORE_EVTYPE_MASK 0xFF + +/* Perfmon registers */ +#define PM_EVCNTR(__cntr) (0x000 + (__cntr) * 8) +#define PM_CNTCTL(__cntr) (0x100 + (__cntr) * 8) +#define PM_CNTCTL_RESET 0 +#define PM_EVTYPE(__cntr) (0x200 + (__cntr) * 8) +#define PM_EVTYPE_EVSEL(__val) FIELD_GET(UNCORE_EVTYPE_MASK, __val) +#define PM_CR 0x400 +#define PM_CR_RESET BIT(1) +#define PM_CR_ENABLE BIT(0) +#define PM_CNTENSET 0x410 +#define PM_CNTENSET_IDX(__cntr) BIT(__cntr) +#define PM_CNTENCLR 0x418 +#define PM_CNTENCLR_IDX(__cntr) BIT(__cntr) +#define PM_CNTENCLR_RESET 0xFF +#define PM_INTENSET 0x420 +#define PM_INTENSET_IDX(__cntr) BIT(__cntr) +#define PM_INTENCLR 0x428 +#define PM_INTENCLR_IDX(__cntr) BIT(__cntr) +#define PM_INTENCLR_RESET 0xFF +#define PM_OVSR 0x440 +#define PM_OVSR_OVSRCLR_RESET 0xFF + +enum fujitsu_uncore_pmu { + FUJITSU_UNCORE_PMU_MAC = 1, + FUJITSU_UNCORE_PMU_PCI = 2, +}; + +struct uncore_pmu { + int num_counters; + struct pmu pmu; + struct hlist_node node; + void __iomem *regs; + struct perf_event **events; + unsigned long *used_mask; + int cpu; + int irq; + struct device *dev; +}; + +#define to_uncore_pmu(p) (container_of(p, struct uncore_pmu, pmu)) + +static int uncore_pmu_cpuhp_state; + +static void fujitsu_uncore_counter_start(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + int idx = event->hw.idx; + + /* Initialize the hardware counter and reset prev_count*/ + local64_set(&event->hw.prev_count, 0); + writeq_relaxed(0, uncorepmu->regs + PM_EVCNTR(idx)); + + /* Set the event type */ + writeq_relaxed(PM_EVTYPE_EVSEL(event->attr.config), uncorepmu->regs + PM_EVTYPE(idx)); + + /* Enable interrupt generation by this counter */ + writeq_relaxed(PM_INTENSET_IDX(idx), uncorepmu->regs + PM_INTENSET); + + /* Finally, enable the counter */ + writeq_relaxed(PM_CNTCTL_RESET, uncorepmu->regs + PM_CNTCTL(idx)); + writeq_relaxed(PM_CNTENSET_IDX(idx), uncorepmu->regs + PM_CNTENSET); +} + +static void fujitsu_uncore_counter_stop(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + int idx = event->hw.idx; + + /* Disable the counter */ + 
writeq_relaxed(PM_CNTENCLR_IDX(idx), uncorepmu->regs + PM_CNTENCLR); + + /* Disable interrupt generation by this counter */ + writeq_relaxed(PM_INTENCLR_IDX(idx), uncorepmu->regs + PM_INTENCLR); +} + +static void fujitsu_uncore_counter_update(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + int idx = event->hw.idx; + u64 prev, new; + + do { + prev = local64_read(&event->hw.prev_count); + new = readq_relaxed(uncorepmu->regs + PM_EVCNTR(idx)); + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + local64_add(new - prev, &event->count); +} + +static inline void fujitsu_uncore_init(struct uncore_pmu *uncorepmu) +{ + int i; + + writeq_relaxed(PM_CR_RESET, uncorepmu->regs + PM_CR); + + writeq_relaxed(PM_CNTENCLR_RESET, uncorepmu->regs + PM_CNTENCLR); + writeq_relaxed(PM_INTENCLR_RESET, uncorepmu->regs + PM_INTENCLR); + writeq_relaxed(PM_OVSR_OVSRCLR_RESET, uncorepmu->regs + PM_OVSR); + + for (i = 0; i < uncorepmu->num_counters; ++i) { + writeq_relaxed(PM_CNTCTL_RESET, uncorepmu->regs + PM_CNTCTL(i)); + writeq_relaxed(PM_EVTYPE_EVSEL(0), uncorepmu->regs + PM_EVTYPE(i)); + } + writeq_relaxed(PM_CR_ENABLE, uncorepmu->regs + PM_CR); +} + +static irqreturn_t fujitsu_uncore_handle_irq(int irq_num, void *data) +{ + struct uncore_pmu *uncorepmu = data; + /* Read the overflow status register */ + long status = readq_relaxed(uncorepmu->regs + PM_OVSR); + int idx; + + if (status == 0) + return IRQ_NONE; + + /* Clear the bits we read on the overflow status register */ + writeq_relaxed(status, uncorepmu->regs + PM_OVSR); + + for_each_set_bit(idx, &status, uncorepmu->num_counters) { + struct perf_event *event; + + event = uncorepmu->events[idx]; + if (!event) + continue; + + fujitsu_uncore_counter_update(event); + } + + return IRQ_HANDLED; +} + +static void fujitsu_uncore_pmu_enable(struct pmu *pmu) +{ + writeq_relaxed(PM_CR_ENABLE, to_uncore_pmu(pmu)->regs + PM_CR); +} + +static void fujitsu_uncore_pmu_disable(struct pmu *pmu) +{ + writeq_relaxed(0, to_uncore_pmu(pmu)->regs + PM_CR); +} + +static bool fujitsu_uncore_validate_event_group(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct perf_event *leader = event->group_leader; + struct perf_event *sibling; + int counters = 1; + + if (leader == event) + return true; + + if (leader->pmu == event->pmu) + counters++; + + for_each_sibling_event(sibling, leader) { + if (sibling->pmu == event->pmu) + counters++; + } + + /* + * If the group requires more counters than the HW has, it + * cannot ever be scheduled. + */ + return counters <= uncorepmu->num_counters; +} + +static int fujitsu_uncore_event_init(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + /* Is the event for this PMU? */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * Sampling not supported since these events are not + * core-attributable. + */ + if (is_sampling_event(event)) + return -EINVAL; + + /* + * Task mode not available, we run the counters as socket counters, + * not attributable to any CPU and therefore cannot attribute per-task. 
+ */ + if (event->cpu < 0) + return -EINVAL; + + /* Validate the group */ + if (!fujitsu_uncore_validate_event_group(event)) + return -EINVAL; + + hwc->idx = -1; + + event->cpu = uncorepmu->cpu; + + return 0; +} + +static void fujitsu_uncore_event_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->state = 0; + fujitsu_uncore_counter_start(event); +} + +static void fujitsu_uncore_event_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->state & PERF_HES_STOPPED) + return; + + fujitsu_uncore_counter_stop(event); + if (flags & PERF_EF_UPDATE) + fujitsu_uncore_counter_update(event); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; +} + +static int fujitsu_uncore_event_add(struct perf_event *event, int flags) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx; + + /* Try to allocate a counter. */ + idx = bitmap_find_free_region(uncorepmu->used_mask, uncorepmu->num_counters, 0); + if (idx < 0) + /* The counters are all in use. */ + return -EAGAIN; + + hwc->idx = idx; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + uncorepmu->events[idx] = event; + + if (flags & PERF_EF_START) + fujitsu_uncore_event_start(event, 0); + + /* Propagate changes to the userspace mapping. */ + perf_event_update_userpage(event); + + return 0; +} + +static void fujitsu_uncore_event_del(struct perf_event *event, int flags) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + /* Stop and clean up */ + fujitsu_uncore_event_stop(event, flags | PERF_EF_UPDATE); + uncorepmu->events[hwc->idx] = NULL; + bitmap_release_region(uncorepmu->used_mask, hwc->idx, 0); + + /* Propagate changes to the userspace mapping. 
*/ + perf_event_update_userpage(event); +} + +static void fujitsu_uncore_event_read(struct perf_event *event) +{ + fujitsu_uncore_counter_update(event); +} + +#define UNCORE_PMU_FORMAT_ATTR(_name, _config) \ + (&((struct dev_ext_attribute[]) { \ + { .attr = __ATTR(_name, 0444, device_show_string, NULL), \ + .var = (void *)_config, } \ + })[0].attr.attr) + +static struct attribute *fujitsu_uncore_pmu_formats[] = { + UNCORE_PMU_FORMAT_ATTR(event, "config:0-7"), + NULL +}; + +static const struct attribute_group fujitsu_uncore_pmu_format_group = { + .name = "format", + .attrs = fujitsu_uncore_pmu_formats, +}; + +static ssize_t fujitsu_uncore_pmu_event_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id); +} + +#define MAC_EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR_ID(_name, fujitsu_uncore_pmu_event_show, _id) + +static struct attribute *fujitsu_uncore_mac_pmu_events[] = { + MAC_EVENT_ATTR(cycles, 0x00), + MAC_EVENT_ATTR(read-count, 0x10), + MAC_EVENT_ATTR(read-count-request, 0x11), + MAC_EVENT_ATTR(read-count-return, 0x12), + MAC_EVENT_ATTR(read-count-request-pftgt, 0x13), + MAC_EVENT_ATTR(read-count-request-normal, 0x14), + MAC_EVENT_ATTR(read-count-return-pftgt-hit, 0x15), + MAC_EVENT_ATTR(read-count-return-pftgt-miss, 0x16), + MAC_EVENT_ATTR(read-wait, 0x17), + MAC_EVENT_ATTR(write-count, 0x20), + MAC_EVENT_ATTR(write-count-write, 0x21), + MAC_EVENT_ATTR(write-count-pwrite, 0x22), + MAC_EVENT_ATTR(memory-read-count, 0x40), + MAC_EVENT_ATTR(memory-write-count, 0x50), + MAC_EVENT_ATTR(memory-pwrite-count, 0x60), + MAC_EVENT_ATTR(ea-mac, 0x80), + MAC_EVENT_ATTR(ea-memory, 0x90), + MAC_EVENT_ATTR(ea-memory-mac-write, 0x92), + MAC_EVENT_ATTR(ea-ha, 0xa0), + NULL +}; + +#define PCI_EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR_ID(_name, fujitsu_uncore_pmu_event_show, _id) + +static struct attribute *fujitsu_uncore_pci_pmu_events[] = { + PCI_EVENT_ATTR(pci-port0-cycles, 0x00), + PCI_EVENT_ATTR(pci-port0-read-count, 0x10), + PCI_EVENT_ATTR(pci-port0-read-count-bus, 0x14), + PCI_EVENT_ATTR(pci-port0-write-count, 0x20), + PCI_EVENT_ATTR(pci-port0-write-count-bus, 0x24), + PCI_EVENT_ATTR(pci-port1-cycles, 0x40), + PCI_EVENT_ATTR(pci-port1-read-count, 0x50), + PCI_EVENT_ATTR(pci-port1-read-count-bus, 0x54), + PCI_EVENT_ATTR(pci-port1-write-count, 0x60), + PCI_EVENT_ATTR(pci-port1-write-count-bus, 0x64), + PCI_EVENT_ATTR(ea-pci, 0x80), + NULL +}; + +static const struct attribute_group fujitsu_uncore_mac_pmu_events_group = { + .name = "events", + .attrs = fujitsu_uncore_mac_pmu_events, +}; + +static const struct attribute_group fujitsu_uncore_pci_pmu_events_group = { + .name = "events", + .attrs = fujitsu_uncore_pci_pmu_events, +}; + +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(dev_get_drvdata(dev)); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(uncorepmu->cpu)); +} +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *fujitsu_uncore_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static const struct attribute_group fujitsu_uncore_pmu_cpumask_attr_group = { + .attrs = fujitsu_uncore_pmu_cpumask_attrs, +}; + +static const struct attribute_group *fujitsu_uncore_mac_pmu_attr_grps[] = { + &fujitsu_uncore_pmu_format_group, + &fujitsu_uncore_mac_pmu_events_group, + 
&fujitsu_uncore_pmu_cpumask_attr_group, + NULL +}; + +static const struct attribute_group *fujitsu_uncore_pci_pmu_attr_grps[] = { + &fujitsu_uncore_pmu_format_group, + &fujitsu_uncore_pci_pmu_events_group, + &fujitsu_uncore_pmu_cpumask_attr_group, + NULL +}; + +static void fujitsu_uncore_pmu_migrate(struct uncore_pmu *uncorepmu, unsigned int cpu) +{ + perf_pmu_migrate_context(&uncorepmu->pmu, uncorepmu->cpu, cpu); + irq_set_affinity(uncorepmu->irq, cpumask_of(cpu)); + uncorepmu->cpu = cpu; +} + +static int fujitsu_uncore_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct uncore_pmu *uncorepmu; + int node; + + uncorepmu = hlist_entry_safe(cpuhp_node, struct uncore_pmu, node); + node = dev_to_node(uncorepmu->dev); + if (cpu_to_node(uncorepmu->cpu) != node && cpu_to_node(cpu) == node) + fujitsu_uncore_pmu_migrate(uncorepmu, cpu); + + return 0; +} + +static int fujitsu_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct uncore_pmu *uncorepmu; + unsigned int target; + int node; + + uncorepmu = hlist_entry_safe(cpuhp_node, struct uncore_pmu, node); + if (cpu != uncorepmu->cpu) + return 0; + + node = dev_to_node(uncorepmu->dev); + target = cpumask_any_and_but(cpumask_of_node(node), cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + target = cpumask_any_but(cpu_online_mask, cpu); + + if (target < nr_cpu_ids) + fujitsu_uncore_pmu_migrate(uncorepmu, target); + + return 0; +} + +static int fujitsu_uncore_pmu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + unsigned long device_type = (unsigned long)device_get_match_data(dev); + const struct attribute_group **attr_groups; + struct uncore_pmu *uncorepmu; + struct resource *memrc; + size_t alloc_size; + char *name; + int ret; + int irq; + u64 uid; + + ret = acpi_dev_uid_to_integer(ACPI_COMPANION(dev), &uid); + if (ret) + return dev_err_probe(dev, ret, "unable to read ACPI uid\n"); + + uncorepmu = devm_kzalloc(dev, sizeof(*uncorepmu), GFP_KERNEL); + if (!uncorepmu) + return -ENOMEM; + uncorepmu->dev = dev; + uncorepmu->cpu = cpumask_local_spread(0, dev_to_node(dev)); + platform_set_drvdata(pdev, uncorepmu); + + switch (device_type) { + case FUJITSU_UNCORE_PMU_MAC: + uncorepmu->num_counters = MAC_NUM_COUNTERS; + attr_groups = fujitsu_uncore_mac_pmu_attr_grps; + name = devm_kasprintf(dev, GFP_KERNEL, "mac_iod%llu_mac%llu_ch%llu", + (uid >> 8) & 0xF, (uid >> 4) & 0xF, uid & 0xF); + break; + case FUJITSU_UNCORE_PMU_PCI: + uncorepmu->num_counters = PCI_NUM_COUNTERS; + attr_groups = fujitsu_uncore_pci_pmu_attr_grps; + name = devm_kasprintf(dev, GFP_KERNEL, "pci_iod%llu_pci%llu", + (uid >> 4) & 0xF, uid & 0xF); + break; + default: + return dev_err_probe(dev, -EINVAL, "illegal device type: %lu\n", device_type); + } + if (!name) + return -ENOMEM; + + uncorepmu->pmu = (struct pmu) { + .parent = dev, + .task_ctx_nr = perf_invalid_context, + + .attr_groups = attr_groups, + + .pmu_enable = fujitsu_uncore_pmu_enable, + .pmu_disable = fujitsu_uncore_pmu_disable, + .event_init = fujitsu_uncore_event_init, + .add = fujitsu_uncore_event_add, + .del = fujitsu_uncore_event_del, + .start = fujitsu_uncore_event_start, + .stop = fujitsu_uncore_event_stop, + .read = fujitsu_uncore_event_read, + + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + }; + + alloc_size = sizeof(uncorepmu->events[0]) * uncorepmu->num_counters; + uncorepmu->events = devm_kzalloc(dev, alloc_size, GFP_KERNEL); + if (!uncorepmu->events) + return -ENOMEM; + + alloc_size = 
sizeof(uncorepmu->used_mask[0]) * BITS_TO_LONGS(uncorepmu->num_counters); + uncorepmu->used_mask = devm_kzalloc(dev, alloc_size, GFP_KERNEL); + if (!uncorepmu->used_mask) + return -ENOMEM; + + uncorepmu->regs = devm_platform_get_and_ioremap_resource(pdev, 0, &memrc); + if (IS_ERR(uncorepmu->regs)) + return PTR_ERR(uncorepmu->regs); + + fujitsu_uncore_init(uncorepmu); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + ret = devm_request_irq(dev, irq, fujitsu_uncore_handle_irq, + IRQF_NOBALANCING | IRQF_NO_THREAD, + name, uncorepmu); + if (ret) + return dev_err_probe(dev, ret, "Failed to request IRQ:%d\n", irq); + + ret = irq_set_affinity(irq, cpumask_of(uncorepmu->cpu)); + if (ret) + return dev_err_probe(dev, ret, "Failed to set irq affinity:%d\n", irq); + + uncorepmu->irq = irq; + + /* Add this instance to the list used by the offline callback */ + ret = cpuhp_state_add_instance(uncore_pmu_cpuhp_state, &uncorepmu->node); + if (ret) + return dev_err_probe(dev, ret, "Error registering hotplug"); + + ret = perf_pmu_register(&uncorepmu->pmu, name, -1); + if (ret < 0) { + cpuhp_state_remove_instance_nocalls(uncore_pmu_cpuhp_state, &uncorepmu->node); + return dev_err_probe(dev, ret, "Failed to register %s PMU\n", name); + } + + dev_dbg(dev, "Registered %s, type: %d\n", name, uncorepmu->pmu.type); + + return 0; +} + +static void fujitsu_uncore_pmu_remove(struct platform_device *pdev) +{ + struct uncore_pmu *uncorepmu = platform_get_drvdata(pdev); + + writeq_relaxed(0, uncorepmu->regs + PM_CR); + + perf_pmu_unregister(&uncorepmu->pmu); + cpuhp_state_remove_instance_nocalls(uncore_pmu_cpuhp_state, &uncorepmu->node); +} + +static const struct acpi_device_id fujitsu_uncore_pmu_acpi_match[] = { + { "FUJI200C", FUJITSU_UNCORE_PMU_MAC }, + { "FUJI200D", FUJITSU_UNCORE_PMU_PCI }, + { } +}; +MODULE_DEVICE_TABLE(acpi, fujitsu_uncore_pmu_acpi_match); + +static struct platform_driver fujitsu_uncore_pmu_driver = { + .driver = { + .name = "fujitsu-uncore-pmu", + .acpi_match_table = fujitsu_uncore_pmu_acpi_match, + .suppress_bind_attrs = true, + }, + .probe = fujitsu_uncore_pmu_probe, + .remove = fujitsu_uncore_pmu_remove, +}; + +static int __init fujitsu_uncore_pmu_init(void) +{ + int ret; + + /* Install a hook to update the reader CPU in case it goes offline */ + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/fujitsu/uncore:online", + fujitsu_uncore_pmu_online_cpu, + fujitsu_uncore_pmu_offline_cpu); + if (ret < 0) + return ret; + + uncore_pmu_cpuhp_state = ret; + + ret = platform_driver_register(&fujitsu_uncore_pmu_driver); + if (ret) + cpuhp_remove_multi_state(uncore_pmu_cpuhp_state); + + return ret; +} + +static void __exit fujitsu_uncore_pmu_exit(void) +{ + platform_driver_unregister(&fujitsu_uncore_pmu_driver); + cpuhp_remove_multi_state(uncore_pmu_cpuhp_state); +} + +module_init(fujitsu_uncore_pmu_init); +module_exit(fujitsu_uncore_pmu_exit); + +MODULE_AUTHOR("Koichi Okuno "); +MODULE_DESCRIPTION("Fujitsu Uncore PMU driver"); +MODULE_LICENSE("GPL"); From 43de0ac332b815cf56dbdce63687de9acfd35d49 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 29 Aug 2025 18:14:19 +0800 Subject: [PATCH 2477/3206] drivers/perf: hisi: Relax the event ID check in the framework Event ID is only using the attr::config bit [7, 0] but we check the event range using the whole 64bit field. It blocks the usage of the rest field of attr::config. Relax the check by only using the bit [7, 0]. 
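For illustration (editorial sketch, not part of the patch), the relaxed check
in plain C; with check_event = 0x59, a config of 0x100000010 now passes
because only bits [7, 0] are compared:

    #include <stdbool.h>
    #include <stdint.h>

    #define HISI_EVENTID_MASK 0xffULL /* GENMASK(7, 0) */

    /* Mirrors the new hisi_uncore_pmu_event_init() range check. */
    static bool hisi_event_id_ok(uint64_t config, uint64_t check_event)
    {
            return (config & HISI_EVENTID_MASK) <= check_event;
    }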
Acked-by: Jonathan Cameron
Signed-off-by: Yicong Yang
Signed-off-by: Yushan Wang
Signed-off-by: Will Deacon
---
 drivers/perf/hisilicon/hisi_uncore_pmu.c | 2 +-
 drivers/perf/hisilicon/hisi_uncore_pmu.h | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c
index a449651f79c9f6..6594d64b03a9e8 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c
@@ -234,7 +234,7 @@ int hisi_uncore_pmu_event_init(struct perf_event *event)
 		return -EINVAL;
 
 	hisi_pmu = to_hisi_pmu(event->pmu);
-	if (event->attr.config > hisi_pmu->check_event)
+	if ((event->attr.config & HISI_EVENTID_MASK) > hisi_pmu->check_event)
 		return -EINVAL;
 
 	if (hisi_pmu->on_cpu == -1)
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h
index 777675838b8081..e69660f72be677 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.h
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h
@@ -43,7 +43,8 @@
 	return FIELD_GET(GENMASK_ULL(hi, lo), event->attr.config);	\
 }
 
-#define HISI_GET_EVENTID(ev) (ev->hw.config_base & 0xff)
+#define HISI_EVENTID_MASK		GENMASK(7, 0)
+#define HISI_GET_EVENTID(ev) ((ev)->hw.config_base & HISI_EVENTID_MASK)
 
 #define HISI_PMU_EVTYPE_BITS	8
 #define HISI_PMU_EVTYPE_SHIFT(idx)	((idx) % 4 * HISI_PMU_EVTYPE_BITS)

From 4550244b53b7ef81607c0d52c72f835718497218 Mon Sep 17 00:00:00 2001
From: Yicong Yang
Date: Fri, 29 Aug 2025 18:14:20 +0800
Subject: [PATCH 2478/3206] drivers/perf: hisi: Export hisi_uncore_pmu_isr()

Currently the Uncore PMU framework assumes one PMU device has only one
interrupt and registers the interrupt handler on the driver's behalf,
so it cannot support a PMU with multiple interrupt resources. An uncore
PMU may have multiple interrupts that can share the same handler.
Export hisi_uncore_pmu_isr() to allow drivers to register the IRQ
handler through their own routine.
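As a sketch of a consumer (hypothetical; the first in-tree user arrives with
the L3C PMU v3 patch later in this series), a driver with several interrupt
resources can now point them all at the exported handler:

    /* Kernel-context sketch; assumes <linux/interrupt.h> and the HISI PMU headers. */
    static int example_request_all_irqs(struct platform_device *pdev,
                                        struct hisi_pmu *hisi_pmu, int nr_irqs)
    {
            int i, irq, ret;

            for (i = 0; i < nr_irqs; i++) {
                    irq = platform_get_irq(pdev, i);
                    if (irq < 0)
                            return irq;

                    /* All lines share the one exported ISR and dev_id. */
                    ret = devm_request_irq(&pdev->dev, irq, hisi_uncore_pmu_isr,
                                           IRQF_NOBALANCING | IRQF_NO_THREAD,
                                           dev_name(&pdev->dev), hisi_pmu);
                    if (ret)
                            return ret;
            }

            return 0;
    }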
Acked-by: Jonathan Cameron
Signed-off-by: Yicong Yang
Signed-off-by: Yushan Wang
Reviewed-by: Jonathan Cameron
Signed-off-by: Will Deacon
---
 drivers/perf/hisilicon/hisi_uncore_pmu.c | 3 ++-
 drivers/perf/hisilicon/hisi_uncore_pmu.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c
index 6594d64b03a9e8..de71dcf116538b 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c
@@ -149,7 +149,7 @@ static void hisi_uncore_pmu_clear_event_idx(struct hisi_pmu *hisi_pmu, int idx)
 	clear_bit(idx, hisi_pmu->pmu_events.used_mask);
 }
 
-static irqreturn_t hisi_uncore_pmu_isr(int irq, void *data)
+irqreturn_t hisi_uncore_pmu_isr(int irq, void *data)
 {
 	struct hisi_pmu *hisi_pmu = data;
 	struct perf_event *event;
@@ -178,6 +178,7 @@ static irqreturn_t hisi_uncore_pmu_isr(int irq, void *data)
 
 	return IRQ_HANDLED;
 }
+EXPORT_SYMBOL_NS_GPL(hisi_uncore_pmu_isr, "HISI_PMU");
 
 int hisi_uncore_pmu_init_irq(struct hisi_pmu *hisi_pmu,
 			     struct platform_device *pdev)
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h
index e69660f72be677..8649be6f716ac9 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.h
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h
@@ -165,6 +165,7 @@ int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node);
 ssize_t hisi_uncore_pmu_identifier_attr_show(struct device *dev,
 					     struct device_attribute *attr,
 					     char *page);
+irqreturn_t hisi_uncore_pmu_isr(int irq, void *data);
 int hisi_uncore_pmu_init_irq(struct hisi_pmu *hisi_pmu,
 			     struct platform_device *pdev);
 void hisi_uncore_pmu_init_topology(struct hisi_pmu *hisi_pmu, struct device *dev);

From 0960e535be5403954701b06e722b68b53463cbe0 Mon Sep 17 00:00:00 2001
From: Yicong Yang
Date: Fri, 29 Aug 2025 18:14:21 +0800
Subject: [PATCH 2479/3206] drivers/perf: hisi: Simplify the probe process of
 each L3C PMU version

Versions 1 and 2 of the L3C PMU also use different HIDs. Make use of
struct acpi_device_id::driver_data for version-specific information
rather than judging by the version register. This helps to simplify the
probe process and also makes future extension a bit easier.
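The pattern in miniature (editorial sketch; only counter_bits is shown here,
while the real tables in the diff below also carry attr_groups and
check_event):

    static const struct hisi_pmu_dev_info example_v1 = { .counter_bits = 48 };
    static const struct hisi_pmu_dev_info example_v2 = { .counter_bits = 64 };

    static const struct acpi_device_id example_acpi_match[] = {
            { "HISI0213", (kernel_ulong_t)&example_v1 },
            { "HISI0214", (kernel_ulong_t)&example_v2 },
            {}
    };

    static int example_probe(struct platform_device *pdev)
    {
            /* Per-HID data replaces branching on the version register. */
            const struct hisi_pmu_dev_info *info = device_get_match_data(&pdev->dev);

            if (!info)
                    return -ENODEV;
            /* ... use info->counter_bits etc. directly ... */
            return 0;
    }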
Acked-by: Jonathan Cameron
Signed-off-by: Yicong Yang
Signed-off-by: Yushan Wang
Reviewed-by: Jonathan Cameron
Signed-off-by: Will Deacon
---
 drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 43 ++++++++++++--------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
index 412fc3a979639c..db683dd7375c79 100644
--- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
@@ -345,13 +345,6 @@ static void hisi_l3c_pmu_clear_int_status(struct hisi_pmu *l3c_pmu, int idx)
 	writel(1 << idx, l3c_pmu->base + L3C_INT_CLEAR);
 }
 
-static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = {
-	{ "HISI0213", },
-	{ "HISI0214", },
-	{}
-};
-MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match);
-
 static int hisi_l3c_pmu_init_data(struct platform_device *pdev,
 				  struct hisi_pmu *l3c_pmu)
 {
@@ -371,6 +364,10 @@ static int hisi_l3c_pmu_init_data(struct platform_device *pdev,
 		return -EINVAL;
 	}
 
+	l3c_pmu->dev_info = device_get_match_data(&pdev->dev);
+	if (!l3c_pmu->dev_info)
+		return -ENODEV;
+
 	l3c_pmu->base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(l3c_pmu->base)) {
 		dev_err(&pdev->dev, "ioremap failed for l3c_pmu resource\n");
@@ -457,6 +454,18 @@ static const struct attribute_group *hisi_l3c_pmu_v2_attr_groups[] = {
 	NULL
 };
 
+static const struct hisi_pmu_dev_info hisi_l3c_pmu_v1 = {
+	.attr_groups = hisi_l3c_pmu_v1_attr_groups,
+	.counter_bits = 48,
+	.check_event = L3C_V1_NR_EVENTS,
+};
+
+static const struct hisi_pmu_dev_info hisi_l3c_pmu_v2 = {
+	.attr_groups = hisi_l3c_pmu_v2_attr_groups,
+	.counter_bits = 64,
+	.check_event = L3C_V2_NR_EVENTS,
+};
+
 static const struct hisi_uncore_ops hisi_uncore_l3c_ops = {
 	.write_evtype = hisi_l3c_pmu_write_evtype,
 	.get_event_idx = hisi_uncore_pmu_get_event_idx,
@@ -487,16 +496,9 @@ static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev,
 	if (ret)
 		return ret;
 
-	if (l3c_pmu->identifier >= HISI_PMU_V2) {
-		l3c_pmu->counter_bits = 64;
-		l3c_pmu->check_event = L3C_V2_NR_EVENTS;
-		l3c_pmu->pmu_events.attr_groups = hisi_l3c_pmu_v2_attr_groups;
-	} else {
-		l3c_pmu->counter_bits = 48;
-		l3c_pmu->check_event = L3C_V1_NR_EVENTS;
-		l3c_pmu->pmu_events.attr_groups = hisi_l3c_pmu_v1_attr_groups;
-	}
-
+	l3c_pmu->pmu_events.attr_groups = l3c_pmu->dev_info->attr_groups;
+	l3c_pmu->counter_bits = l3c_pmu->dev_info->counter_bits;
+	l3c_pmu->check_event = l3c_pmu->dev_info->check_event;
 	l3c_pmu->num_counters = L3C_NR_COUNTERS;
 	l3c_pmu->ops = &hisi_uncore_l3c_ops;
 	l3c_pmu->dev = &pdev->dev;
@@ -554,6 +556,13 @@ static void hisi_l3c_pmu_remove(struct platform_device *pdev)
 					    &l3c_pmu->node);
 }
 
+static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = {
+	{ "HISI0213", (kernel_ulong_t)&hisi_l3c_pmu_v1 },
+	{ "HISI0214", (kernel_ulong_t)&hisi_l3c_pmu_v2 },
+	{}
+};
+MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match);
+
 static struct platform_driver hisi_l3c_pmu_driver = {
 	.driver = {
 		.name = "hisi_l3c_pmu",

From 2271f1634243897cf18763386994d613a0594d98 Mon Sep 17 00:00:00 2001
From: Yicong Yang
Date: Fri, 29 Aug 2025 18:14:22 +0800
Subject: [PATCH 2480/3206] drivers/perf: hisi: Extract the event filter check
 of L3C PMU

The L3C PMU has 4 filter options which share perf_event_attr::config1.
The driver checks config1 to see whether a certain event has a filter
setting. This becomes incorrect if other bits in config1 are used for
non-filter options.
So check directly, in a separate function, whether each filter option
is set instead.

Acked-by: Jonathan Cameron
Signed-off-by: Yicong Yang
Signed-off-by: Yushan Wang
Reviewed-by: Jonathan Cameron
Signed-off-by: Will Deacon
---
 drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
index db683dd7375c79..a372dd2c07b5de 100644
--- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
@@ -204,9 +204,15 @@ static void hisi_l3c_pmu_clear_core_tracetag(struct perf_event *event)
 	}
 }
 
+static bool hisi_l3c_pmu_have_filter(struct perf_event *event)
+{
+	return hisi_get_tt_req(event) || hisi_get_tt_core(event) ||
+	       hisi_get_datasrc_cfg(event) || hisi_get_datasrc_skt(event);
+}
+
 static void hisi_l3c_pmu_enable_filter(struct perf_event *event)
 {
-	if (event->attr.config1 != 0x0) {
+	if (hisi_l3c_pmu_have_filter(event)) {
 		hisi_l3c_pmu_config_req_tracetag(event);
 		hisi_l3c_pmu_config_core_tracetag(event);
 		hisi_l3c_pmu_config_ds(event);
@@ -215,7 +221,7 @@ static void hisi_l3c_pmu_enable_filter(struct perf_event *event)
 
 static void hisi_l3c_pmu_disable_filter(struct perf_event *event)
 {
-	if (event->attr.config1 != 0x0) {
+	if (hisi_l3c_pmu_have_filter(event)) {
 		hisi_l3c_pmu_clear_ds(event);
 		hisi_l3c_pmu_clear_core_tracetag(event);
 		hisi_l3c_pmu_clear_req_tracetag(event);

From ede339ff61c61a1ac3bedd01528e4d701d4aea22 Mon Sep 17 00:00:00 2001
From: Yicong Yang
Date: Fri, 29 Aug 2025 18:14:23 +0800
Subject: [PATCH 2481/3206] drivers/perf: hisi: Extend the field of tt_core

Currently tt_core uses config1's bits [7, 0] and cannot be extended.
On some platforms more than 8 CPUs share the L3 cache. So make tt_core
use config2's bits [15, 0]; the remaining bits in config2 are reserved
for extension.
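Roughly, the moved extractor line expands to the following helper (per the
HISI_PMU_EVENT_ATTR_EXTRACTOR body quoted in the earlier hunks), now reading
config2 instead of config1; a sketch for illustration:

    /* Sketch of the macro expansion; needs <linux/bitfield.h> in kernel context. */
    static inline u32 hisi_get_tt_core(struct perf_event *event)
    {
            return FIELD_GET(GENMASK_ULL(15, 0), event->attr.config2);
    }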
Acked-by: Jonathan Cameron
Signed-off-by: Yicong Yang
Signed-off-by: Yushan Wang
Reviewed-by: Jonathan Cameron
Signed-off-by: Will Deacon
---
 drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
index a372dd2c07b5de..39444f11cbad3a 100644
--- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
@@ -55,10 +55,10 @@
 #define L3C_V1_NR_EVENTS	0x59
 #define L3C_V2_NR_EVENTS	0xFF
 
-HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config1, 7, 0);
 HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_req, config1, 10, 8);
 HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_cfg, config1, 15, 11);
 HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_skt, config1, 16, 16);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config2, 15, 0);
 
 static void hisi_l3c_pmu_config_req_tracetag(struct perf_event *event)
 {
@@ -397,7 +397,7 @@ static const struct attribute_group hisi_l3c_pmu_v1_format_group = {
 
 static struct attribute *hisi_l3c_pmu_v2_format_attr[] = {
 	HISI_PMU_FORMAT_ATTR(event, "config:0-7"),
-	HISI_PMU_FORMAT_ATTR(tt_core, "config1:0-7"),
+	HISI_PMU_FORMAT_ATTR(tt_core, "config2:0-15"),
 	HISI_PMU_FORMAT_ATTR(tt_req, "config1:8-10"),
 	HISI_PMU_FORMAT_ATTR(datasrc_cfg, "config1:11-15"),
 	HISI_PMU_FORMAT_ATTR(datasrc_skt, "config1:16"),

From b3abb08d6f628a76c36bf7da9508e1a67bf186a0 Mon Sep 17 00:00:00 2001
From: Yicong Yang
Date: Fri, 29 Aug 2025 18:14:24 +0800
Subject: [PATCH 2482/3206] drivers/perf: hisi: Refactor the event
 configuration of L3C PMU

The event registers are configured using hisi_pmu::base directly, since
only one address space is supported for the L3C PMU. This needs
extending if the event configuration is located in a different address
space. In preparation for such hardware, extract the event register
configuration into separate functions that use
hw_perf_event::event_base as each event's base address. Implement a
private hisi_uncore_ops::get_event_idx() callback to initialize the
event_base besides getting the hardware index.

No functional changes intended.
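The core idea, in miniature (the diff below adds readl/writel/readq/writeq
variants of exactly this helper): every access goes through the per-event
base stashed in hw_perf_event::event_base, so an event whose registers live
in another address space only needs a different event_base, not different
accessors:

    /* Kernel-context sketch; event_base is an unsigned long in hw_perf_event. */
    static u32 example_event_readl(struct hw_perf_event *hwc, u32 reg)
    {
            return readl((void __iomem *)hwc->event_base + reg);
    }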
Acked-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Yushan Wang Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 129 ++++++++++++------- 1 file changed, 84 insertions(+), 45 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 39444f11cbad3a..7928b9bb3e7e68 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -60,51 +60,87 @@ HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_cfg, config1, 15, 11); HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_skt, config1, 16, 16); HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config2, 15, 0); -static void hisi_l3c_pmu_config_req_tracetag(struct perf_event *event) +static int hisi_l3c_pmu_get_event_idx(struct perf_event *event) { struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; + u32 num_counters = l3c_pmu->num_counters; + int idx; + + idx = find_first_zero_bit(used_mask, num_counters); + if (idx == num_counters) + return -EAGAIN; + + set_bit(idx, used_mask); + event->hw.event_base = (unsigned long)l3c_pmu->base; + + return idx; +} + +static u32 hisi_l3c_pmu_event_readl(struct hw_perf_event *hwc, u32 reg) +{ + return readl((void __iomem *)hwc->event_base + reg); +} + +static void hisi_l3c_pmu_event_writel(struct hw_perf_event *hwc, u32 reg, u32 val) +{ + writel(val, (void __iomem *)hwc->event_base + reg); +} + +static u64 hisi_l3c_pmu_event_readq(struct hw_perf_event *hwc, u32 reg) +{ + return readq((void __iomem *)hwc->event_base + reg); +} + +static void hisi_l3c_pmu_event_writeq(struct hw_perf_event *hwc, u32 reg, u64 val) +{ + writeq(val, (void __iomem *)hwc->event_base + reg); +} + +static void hisi_l3c_pmu_config_req_tracetag(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; u32 tt_req = hisi_get_tt_req(event); if (tt_req) { u32 val; /* Set request-type for tracetag */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val |= tt_req << L3C_TRACETAG_REQ_SHIFT; val |= L3C_TRACETAG_REQ_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); /* Enable request-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val |= L3C_TRACETAG_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); } } static void hisi_l3c_pmu_clear_req_tracetag(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 tt_req = hisi_get_tt_req(event); if (tt_req) { u32 val; /* Clear request-type */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val &= ~(tt_req << L3C_TRACETAG_REQ_SHIFT); val &= ~L3C_TRACETAG_REQ_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); /* Disable request-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val &= ~L3C_TRACETAG_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); } } static void hisi_l3c_pmu_write_ds(struct perf_event *event, u32 ds_cfg) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; u32 reg, reg_idx, shift, val; int idx = 
hwc->idx; @@ -120,15 +156,15 @@ static void hisi_l3c_pmu_write_ds(struct perf_event *event, u32 ds_cfg) reg_idx = idx % 4; shift = 8 * reg_idx; - val = readl(l3c_pmu->base + reg); + val = hisi_l3c_pmu_event_readl(hwc, reg); val &= ~(L3C_DATSRC_MASK << shift); val |= ds_cfg << shift; - writel(val, l3c_pmu->base + reg); + hisi_l3c_pmu_event_writel(hwc, reg, val); } static void hisi_l3c_pmu_config_ds(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 ds_cfg = hisi_get_datasrc_cfg(event); u32 ds_skt = hisi_get_datasrc_skt(event); @@ -138,15 +174,15 @@ static void hisi_l3c_pmu_config_ds(struct perf_event *event) if (ds_skt) { u32 val; - val = readl(l3c_pmu->base + L3C_DATSRC_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_DATSRC_CTRL); val |= L3C_DATSRC_SKT_EN; - writel(val, l3c_pmu->base + L3C_DATSRC_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_DATSRC_CTRL, val); } } static void hisi_l3c_pmu_clear_ds(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 ds_cfg = hisi_get_datasrc_cfg(event); u32 ds_skt = hisi_get_datasrc_skt(event); @@ -156,51 +192,51 @@ static void hisi_l3c_pmu_clear_ds(struct perf_event *event) if (ds_skt) { u32 val; - val = readl(l3c_pmu->base + L3C_DATSRC_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_DATSRC_CTRL); val &= ~L3C_DATSRC_SKT_EN; - writel(val, l3c_pmu->base + L3C_DATSRC_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_DATSRC_CTRL, val); } } static void hisi_l3c_pmu_config_core_tracetag(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 core = hisi_get_tt_core(event); if (core) { u32 val; /* Config and enable core information */ - writel(core, l3c_pmu->base + L3C_CORE_CTRL); - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_CORE_CTRL, core); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val |= L3C_CORE_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); /* Enable core-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val |= L3C_TRACETAG_CORE_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); } } static void hisi_l3c_pmu_clear_core_tracetag(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 core = hisi_get_tt_core(event); if (core) { u32 val; /* Clear core information */ - writel(L3C_COER_NONE, l3c_pmu->base + L3C_CORE_CTRL); - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_CORE_CTRL, L3C_COER_NONE); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val &= ~L3C_CORE_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); /* Disable core-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val &= ~L3C_TRACETAG_CORE_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); } } @@ -239,18 +275,19 @@ static u32 hisi_l3c_pmu_get_counter_offset(int cntr_idx) static u64 hisi_l3c_pmu_read_counter(struct hisi_pmu *l3c_pmu, struct hw_perf_event *hwc) { - return readq(l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(hwc->idx)); + 
return hisi_l3c_pmu_event_readq(hwc, hisi_l3c_pmu_get_counter_offset(hwc->idx)); } static void hisi_l3c_pmu_write_counter(struct hisi_pmu *l3c_pmu, struct hw_perf_event *hwc, u64 val) { - writeq(val, l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(hwc->idx)); + hisi_l3c_pmu_event_writeq(hwc, hisi_l3c_pmu_get_counter_offset(hwc->idx), val); } static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, u32 type) { + struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw; u32 reg, reg_idx, shift, val; /* @@ -265,10 +302,10 @@ static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, shift = 8 * reg_idx; /* Write event code to L3C_EVENT_TYPEx Register */ - val = readl(l3c_pmu->base + reg); + val = hisi_l3c_pmu_event_readl(hwc, reg); val &= ~(L3C_EVTYPE_NONE << shift); val |= (type << shift); - writel(val, l3c_pmu->base + reg); + hisi_l3c_pmu_event_writel(hwc, reg, val); } static void hisi_l3c_pmu_start_counters(struct hisi_pmu *l3c_pmu) @@ -303,9 +340,9 @@ static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu, u32 val; /* Enable counter index in L3C_EVENT_CTRL register */ - val = readl(l3c_pmu->base + L3C_EVENT_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL); val |= (1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_EVENT_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val); } static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu, @@ -314,9 +351,9 @@ static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu, u32 val; /* Clear counter index in L3C_EVENT_CTRL register */ - val = readl(l3c_pmu->base + L3C_EVENT_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL); val &= ~(1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_EVENT_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val); } static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu, @@ -324,10 +361,10 @@ static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu, { u32 val; - val = readl(l3c_pmu->base + L3C_INT_MASK); + val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK); /* Write 0 to enable interrupt */ val &= ~(1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_INT_MASK); + hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val); } static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu, @@ -335,10 +372,10 @@ static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu, { u32 val; - val = readl(l3c_pmu->base + L3C_INT_MASK); + val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK); /* Write 1 to mask interrupt */ val |= (1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_INT_MASK); + hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val); } static u32 hisi_l3c_pmu_get_int_status(struct hisi_pmu *l3c_pmu) @@ -348,7 +385,9 @@ static u32 hisi_l3c_pmu_get_int_status(struct hisi_pmu *l3c_pmu) static void hisi_l3c_pmu_clear_int_status(struct hisi_pmu *l3c_pmu, int idx) { - writel(1 << idx, l3c_pmu->base + L3C_INT_CLEAR); + struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw; + + hisi_l3c_pmu_event_writel(hwc, L3C_INT_CLEAR, 1 << idx); } static int hisi_l3c_pmu_init_data(struct platform_device *pdev, @@ -474,7 +513,7 @@ static const struct hisi_pmu_dev_info hisi_l3c_pmu_v2 = { static const struct hisi_uncore_ops hisi_uncore_l3c_ops = { .write_evtype = hisi_l3c_pmu_write_evtype, - .get_event_idx = hisi_uncore_pmu_get_event_idx, + .get_event_idx = hisi_l3c_pmu_get_event_idx, .start_counters = hisi_l3c_pmu_start_counters, .stop_counters = hisi_l3c_pmu_stop_counters, .enable_counter = 
hisi_l3c_pmu_enable_counter,

From 475d94dfe7c635b4311b6c9d87f49534eab6589c Mon Sep 17 00:00:00 2001
From: Yicong Yang
Date: Fri, 29 Aug 2025 18:14:25 +0800
Subject: [PATCH 2483/3206] drivers/perf: hisi: Add support for L3C PMU v3

This patch adds support for L3C PMU v3. The v3 L3C PMU supports an
extended event space which can be controlled in up to 2 extra address
spaces with separate overflow interrupts. The layout of the
control/event registers is kept the same. Together with the original
events, the extended events cover the monitoring of all transactions
on the L3C.

The extended events are specified with the `ext=[1|2]` option for the
driver to distinguish, like below:

  perf stat -e hisi_sccl0_l3c0_0/event=,ext=1/

Currently only the event option uses config bits [7, 0]. There's still
plenty of unused space. Make ext use config bits [17, 16] and reserve
bits [15, 8] for the event option for future extension.

With the capability of the extra counters, the number of counters for
a HiSilicon uncore PMU can reach up to 24, so the used_mask is extended
accordingly. The hw_perf_event::event_base is initialized to the base
MMIO address of the event and is used later for control, overflow
handling and count readout.

We still make use of the Uncore PMU framework for handling the events
and interrupt migration on CPU hotplug. The framework's cpuhp callback
handles the event and interrupt migration of the original events; if
the PMU supports extended events, the interrupts of the extended events
are migrated to the same CPU chosen by the framework.

A new HID of HISI0215 is used for this version of the L3C PMU.

Acked-by: Jonathan Cameron
Signed-off-by: Yicong Yang
Co-developed-by: Yushan Wang
Signed-off-by: Yushan Wang
Signed-off-by: Will Deacon
---
 drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 352 +++++++++++++++++--
 drivers/perf/hisilicon/hisi_uncore_pmu.h     |   2 +-
 2 files changed, 324 insertions(+), 30 deletions(-)

diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
index 7928b9bb3e7e68..bbd81a43047d28 100644
--- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
@@ -39,6 +39,7 @@
 
 /* L3C has 8-counters */
 #define L3C_NR_COUNTERS		0x8
+#define L3C_MAX_EXT		2
 
 #define L3C_PERF_CTRL_EN	0x10000
 #define L3C_TRACETAG_EN		BIT(31)
@@ -55,24 +56,81 @@
 #define L3C_V1_NR_EVENTS	0x59
 #define L3C_V2_NR_EVENTS	0xFF
 
+HISI_PMU_EVENT_ATTR_EXTRACTOR(ext, config, 17, 16);
 HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_req, config1, 10, 8);
 HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_cfg, config1, 15, 11);
 HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_skt, config1, 16, 16);
 HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config2, 15, 0);
 
+struct hisi_l3c_pmu {
+	struct hisi_pmu l3c_pmu;
+
+	/* MMIO and IRQ resources for extension events */
+	void __iomem *ext_base[L3C_MAX_EXT];
+	int ext_irq[L3C_MAX_EXT];
+	int ext_num;
+};
+
+#define to_hisi_l3c_pmu(_l3c_pmu) \
+	container_of(_l3c_pmu, struct hisi_l3c_pmu, l3c_pmu)
+
+/*
+ * The hardware counter idx used in counter enable/disable,
+ * interrupt enable/disable and status check, etc.
+ */
+#define L3C_HW_IDX(_cntr_idx)	((_cntr_idx) % L3C_NR_COUNTERS)
+
+/* Range of ext counters in used mask.
*/ +#define L3C_CNTR_EXT_L(_ext) (((_ext) + 1) * L3C_NR_COUNTERS) +#define L3C_CNTR_EXT_H(_ext) (((_ext) + 2) * L3C_NR_COUNTERS) + +struct hisi_l3c_pmu_ext { + bool support_ext; +}; + +static bool support_ext(struct hisi_l3c_pmu *pmu) +{ + struct hisi_l3c_pmu_ext *l3c_pmu_ext = pmu->l3c_pmu.dev_info->private; + + return l3c_pmu_ext->support_ext; +} + static int hisi_l3c_pmu_get_event_idx(struct perf_event *event) { struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; - u32 num_counters = l3c_pmu->num_counters; + int ext = hisi_get_ext(event); int idx; - idx = find_first_zero_bit(used_mask, num_counters); - if (idx == num_counters) + /* + * For an L3C PMU that supports extension events, we can monitor + * maximum 2 * num_counters to 3 * num_counters events, depending on + * the number of ext regions supported by hardware. Thus use bit + * [0, num_counters - 1] for normal events and bit + * [ext * num_counters, (ext + 1) * num_counters - 1] for extension + * events. The idx allocation will keep unchanged for normal events and + * we can also use the idx to distinguish whether it's an extension + * event or not. + * + * Since normal events and extension events locates on the different + * address space, save the base address to the event->hw.event_base. + */ + if (ext && !support_ext(hisi_l3c_pmu)) + return -EOPNOTSUPP; + + if (ext) + event->hw.event_base = (unsigned long)hisi_l3c_pmu->ext_base[ext - 1]; + else + event->hw.event_base = (unsigned long)l3c_pmu->base; + + ext -= 1; + idx = find_next_zero_bit(used_mask, L3C_CNTR_EXT_H(ext), L3C_CNTR_EXT_L(ext)); + + if (idx >= L3C_CNTR_EXT_H(ext)) return -EAGAIN; set_bit(idx, used_mask); - event->hw.event_base = (unsigned long)l3c_pmu->base; return idx; } @@ -143,7 +201,7 @@ static void hisi_l3c_pmu_write_ds(struct perf_event *event, u32 ds_cfg) { struct hw_perf_event *hwc = &event->hw; u32 reg, reg_idx, shift, val; - int idx = hwc->idx; + int idx = L3C_HW_IDX(hwc->idx); /* * Select the appropriate datasource register(L3C_DATSRC_TYPE0/1). @@ -264,12 +322,24 @@ static void hisi_l3c_pmu_disable_filter(struct perf_event *event) } } +static int hisi_l3c_pmu_check_filter(struct perf_event *event) +{ + struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + int ext = hisi_get_ext(event); + + if (ext < 0 || ext > hisi_l3c_pmu->ext_num) + return -EINVAL; + + return 0; +} + /* * Select the counter register offset using the counter index */ static u32 hisi_l3c_pmu_get_counter_offset(int cntr_idx) { - return (L3C_CNTR0_LOWER + (cntr_idx * 8)); + return L3C_CNTR0_LOWER + L3C_HW_IDX(cntr_idx) * 8; } static u64 hisi_l3c_pmu_read_counter(struct hisi_pmu *l3c_pmu, @@ -290,6 +360,8 @@ static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw; u32 reg, reg_idx, shift, val; + idx = L3C_HW_IDX(idx); + /* * Select the appropriate event select register(L3C_EVENT_TYPE0/1). * There are 2 event select registers for the 8 hardware counters. 
@@ -304,34 +376,70 @@ static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, /* Write event code to L3C_EVENT_TYPEx Register */ val = hisi_l3c_pmu_event_readl(hwc, reg); val &= ~(L3C_EVTYPE_NONE << shift); - val |= (type << shift); + val |= type << shift; hisi_l3c_pmu_event_writel(hwc, reg, val); } static void hisi_l3c_pmu_start_counters(struct hisi_pmu *l3c_pmu) { + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; + unsigned long used_cntr = find_first_bit(used_mask, l3c_pmu->num_counters); u32 val; + int i; /* - * Set perf_enable bit in L3C_PERF_CTRL register to start counting - * for all enabled counters. + * Check if any counter belongs to the normal range (instead of ext + * range). If so, enable it. */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); - val |= L3C_PERF_CTRL_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + if (used_cntr < L3C_NR_COUNTERS) { + val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val |= L3C_PERF_CTRL_EN; + writel(val, l3c_pmu->base + L3C_PERF_CTRL); + } + + /* If not, do enable it on ext ranges. */ + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) { + /* Find used counter in this ext range, skip the range if not. */ + used_cntr = find_next_bit(used_mask, L3C_CNTR_EXT_H(i), L3C_CNTR_EXT_L(i)); + if (used_cntr >= L3C_CNTR_EXT_H(i)) + continue; + + val = readl(hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL); + val |= L3C_PERF_CTRL_EN; + writel(val, hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL); + } } static void hisi_l3c_pmu_stop_counters(struct hisi_pmu *l3c_pmu) { + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; + unsigned long used_cntr = find_first_bit(used_mask, l3c_pmu->num_counters); u32 val; + int i; /* - * Clear perf_enable bit in L3C_PERF_CTRL register to stop counting - * for all enabled counters. + * Check if any counter belongs to the normal range (instead of ext + * range). If so, stop it. */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); - val &= ~(L3C_PERF_CTRL_EN); - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + if (used_cntr < L3C_NR_COUNTERS) { + val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val &= ~L3C_PERF_CTRL_EN; + writel(val, l3c_pmu->base + L3C_PERF_CTRL); + } + + /* If not, do stop it on ext ranges. */ + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) { + /* Find used counter in this ext range, skip the range if not. 
*/ + used_cntr = find_next_bit(used_mask, L3C_CNTR_EXT_H(i), L3C_CNTR_EXT_L(i)); + if (used_cntr >= L3C_CNTR_EXT_H(i)) + continue; + + val = readl(hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL); + val &= ~L3C_PERF_CTRL_EN; + writel(val, hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL); + } } static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu, @@ -341,7 +449,7 @@ static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu, /* Enable counter index in L3C_EVENT_CTRL register */ val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL); - val |= (1 << hwc->idx); + val |= 1 << L3C_HW_IDX(hwc->idx); hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val); } @@ -352,7 +460,7 @@ static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu, /* Clear counter index in L3C_EVENT_CTRL register */ val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL); - val &= ~(1 << hwc->idx); + val &= ~(1 << L3C_HW_IDX(hwc->idx)); hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val); } @@ -363,7 +471,7 @@ static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu, val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK); /* Write 0 to enable interrupt */ - val &= ~(1 << hwc->idx); + val &= ~(1 << L3C_HW_IDX(hwc->idx)); hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val); } @@ -374,20 +482,34 @@ static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu, val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK); /* Write 1 to mask interrupt */ - val |= (1 << hwc->idx); + val |= 1 << L3C_HW_IDX(hwc->idx); hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val); } static u32 hisi_l3c_pmu_get_int_status(struct hisi_pmu *l3c_pmu) { - return readl(l3c_pmu->base + L3C_INT_STATUS); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + u32 ext_int, status, status_ext = 0; + int i; + + status = readl(l3c_pmu->base + L3C_INT_STATUS); + + if (!support_ext(hisi_l3c_pmu)) + return status; + + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) { + ext_int = readl(hisi_l3c_pmu->ext_base[i] + L3C_INT_STATUS); + status_ext |= ext_int << (L3C_NR_COUNTERS * i); + } + + return status | (status_ext << L3C_NR_COUNTERS); } static void hisi_l3c_pmu_clear_int_status(struct hisi_pmu *l3c_pmu, int idx) { struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw; - hisi_l3c_pmu_event_writel(hwc, L3C_INT_CLEAR, 1 << idx); + hisi_l3c_pmu_event_writel(hwc, L3C_INT_CLEAR, 1 << L3C_HW_IDX(idx)); } static int hisi_l3c_pmu_init_data(struct platform_device *pdev, @@ -424,6 +546,50 @@ static int hisi_l3c_pmu_init_data(struct platform_device *pdev, return 0; } +static int hisi_l3c_pmu_init_ext(struct hisi_pmu *l3c_pmu, struct platform_device *pdev) +{ + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + int ret, irq, ext_num, i; + char *irqname; + + /* HiSilicon L3C PMU supporting ext should have more than 1 irq resources. */ + ext_num = platform_irq_count(pdev); + if (ext_num < L3C_MAX_EXT) + return -ENODEV; + + /* + * The number of ext supported equals the number of irq - 1, since one + * of the irqs belongs to the normal part of PMU. 
+ */ + hisi_l3c_pmu->ext_num = ext_num - 1; + + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) { + hisi_l3c_pmu->ext_base[i] = devm_platform_ioremap_resource(pdev, i + 1); + if (IS_ERR(hisi_l3c_pmu->ext_base[i])) + return PTR_ERR(hisi_l3c_pmu->ext_base[i]); + + irq = platform_get_irq(pdev, i + 1); + if (irq < 0) + return irq; + + irqname = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s ext%d", + dev_name(&pdev->dev), i + 1); + if (!irqname) + return -ENOMEM; + + ret = devm_request_irq(&pdev->dev, irq, hisi_uncore_pmu_isr, + IRQF_NOBALANCING | IRQF_NO_THREAD, + irqname, l3c_pmu); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "Fail to request EXT IRQ: %d.\n", irq); + + hisi_l3c_pmu->ext_irq[i] = irq; + } + + return 0; +} + static struct attribute *hisi_l3c_pmu_v1_format_attr[] = { HISI_PMU_FORMAT_ATTR(event, "config:0-7"), NULL, @@ -448,6 +614,19 @@ static const struct attribute_group hisi_l3c_pmu_v2_format_group = { .attrs = hisi_l3c_pmu_v2_format_attr, }; +static struct attribute *hisi_l3c_pmu_v3_format_attr[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-7"), + HISI_PMU_FORMAT_ATTR(ext, "config:16-17"), + HISI_PMU_FORMAT_ATTR(tt_req, "config1:8-10"), + HISI_PMU_FORMAT_ATTR(tt_core, "config2:0-15"), + NULL +}; + +static const struct attribute_group hisi_l3c_pmu_v3_format_group = { + .name = "format", + .attrs = hisi_l3c_pmu_v3_format_attr, +}; + static struct attribute *hisi_l3c_pmu_v1_events_attr[] = { HISI_PMU_EVENT_ATTR(rd_cpipe, 0x00), HISI_PMU_EVENT_ATTR(wr_cpipe, 0x01), @@ -483,6 +662,26 @@ static const struct attribute_group hisi_l3c_pmu_v2_events_group = { .attrs = hisi_l3c_pmu_v2_events_attr, }; +static struct attribute *hisi_l3c_pmu_v3_events_attr[] = { + HISI_PMU_EVENT_ATTR(rd_spipe, 0x18), + HISI_PMU_EVENT_ATTR(rd_hit_spipe, 0x19), + HISI_PMU_EVENT_ATTR(wr_spipe, 0x1a), + HISI_PMU_EVENT_ATTR(wr_hit_spipe, 0x1b), + HISI_PMU_EVENT_ATTR(io_rd_spipe, 0x1c), + HISI_PMU_EVENT_ATTR(io_rd_hit_spipe, 0x1d), + HISI_PMU_EVENT_ATTR(io_wr_spipe, 0x1e), + HISI_PMU_EVENT_ATTR(io_wr_hit_spipe, 0x1f), + HISI_PMU_EVENT_ATTR(cycles, 0x7f), + HISI_PMU_EVENT_ATTR(l3c_ref, 0xbc), + HISI_PMU_EVENT_ATTR(l3c2ring, 0xbd), + NULL +}; + +static const struct attribute_group hisi_l3c_pmu_v3_events_group = { + .name = "events", + .attrs = hisi_l3c_pmu_v3_events_attr, +}; + static const struct attribute_group *hisi_l3c_pmu_v1_attr_groups[] = { &hisi_l3c_pmu_v1_format_group, &hisi_l3c_pmu_v1_events_group, @@ -499,16 +698,41 @@ static const struct attribute_group *hisi_l3c_pmu_v2_attr_groups[] = { NULL }; +static const struct attribute_group *hisi_l3c_pmu_v3_attr_groups[] = { + &hisi_l3c_pmu_v3_format_group, + &hisi_l3c_pmu_v3_events_group, + &hisi_pmu_cpumask_attr_group, + &hisi_pmu_identifier_group, + NULL +}; + +static struct hisi_l3c_pmu_ext hisi_l3c_pmu_support_ext = { + .support_ext = true, +}; + +static struct hisi_l3c_pmu_ext hisi_l3c_pmu_not_support_ext = { + .support_ext = false, +}; + static const struct hisi_pmu_dev_info hisi_l3c_pmu_v1 = { .attr_groups = hisi_l3c_pmu_v1_attr_groups, .counter_bits = 48, .check_event = L3C_V1_NR_EVENTS, + .private = &hisi_l3c_pmu_not_support_ext, }; static const struct hisi_pmu_dev_info hisi_l3c_pmu_v2 = { .attr_groups = hisi_l3c_pmu_v2_attr_groups, .counter_bits = 64, .check_event = L3C_V2_NR_EVENTS, + .private = &hisi_l3c_pmu_not_support_ext, +}; + +static const struct hisi_pmu_dev_info hisi_l3c_pmu_v3 = { + .attr_groups = hisi_l3c_pmu_v3_attr_groups, + .counter_bits = 64, + .check_event = L3C_V2_NR_EVENTS, + .private = &hisi_l3c_pmu_support_ext, }; static const 
struct hisi_uncore_ops hisi_uncore_l3c_ops = { @@ -526,11 +750,14 @@ static const struct hisi_uncore_ops hisi_uncore_l3c_ops = { .clear_int_status = hisi_l3c_pmu_clear_int_status, .enable_filter = hisi_l3c_pmu_enable_filter, .disable_filter = hisi_l3c_pmu_disable_filter, + .check_filter = hisi_l3c_pmu_check_filter, }; static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev, struct hisi_pmu *l3c_pmu) { + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + struct hisi_l3c_pmu_ext *l3c_pmu_dev_ext; int ret; ret = hisi_l3c_pmu_init_data(pdev, l3c_pmu); @@ -549,27 +776,47 @@ static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev, l3c_pmu->dev = &pdev->dev; l3c_pmu->on_cpu = -1; + l3c_pmu_dev_ext = l3c_pmu->dev_info->private; + if (l3c_pmu_dev_ext->support_ext) { + ret = hisi_l3c_pmu_init_ext(l3c_pmu, pdev); + if (ret) + return ret; + /* + * The extension events have their own counters with the + * same number of the normal events counters. So we can + * have at maximum num_counters * ext events monitored. + */ + l3c_pmu->num_counters += hisi_l3c_pmu->ext_num * L3C_NR_COUNTERS; + } + return 0; } static int hisi_l3c_pmu_probe(struct platform_device *pdev) { + struct hisi_l3c_pmu *hisi_l3c_pmu; struct hisi_pmu *l3c_pmu; char *name; int ret; - l3c_pmu = devm_kzalloc(&pdev->dev, sizeof(*l3c_pmu), GFP_KERNEL); - if (!l3c_pmu) + hisi_l3c_pmu = devm_kzalloc(&pdev->dev, sizeof(*hisi_l3c_pmu), GFP_KERNEL); + if (!hisi_l3c_pmu) return -ENOMEM; + l3c_pmu = &hisi_l3c_pmu->l3c_pmu; platform_set_drvdata(pdev, l3c_pmu); ret = hisi_l3c_pmu_dev_probe(pdev, l3c_pmu); if (ret) return ret; - name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d", - l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id); + if (l3c_pmu->topo.sub_id >= 0) + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d_%d", + l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id, + l3c_pmu->topo.sub_id); + else + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d", + l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id); if (!name) return -ENOMEM; @@ -604,6 +851,7 @@ static void hisi_l3c_pmu_remove(struct platform_device *pdev) static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = { { "HISI0213", (kernel_ulong_t)&hisi_l3c_pmu_v1 }, { "HISI0214", (kernel_ulong_t)&hisi_l3c_pmu_v2 }, + { "HISI0215", (kernel_ulong_t)&hisi_l3c_pmu_v3 }, {} }; MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match); @@ -618,14 +866,60 @@ static struct platform_driver hisi_l3c_pmu_driver = { .remove = hisi_l3c_pmu_remove, }; +static int hisi_l3c_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct hisi_pmu *l3c_pmu = hlist_entry_safe(node, struct hisi_pmu, node); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + int ret, i; + + ret = hisi_uncore_pmu_online_cpu(cpu, node); + if (ret) + return ret; + + /* Avoid L3C pmu not supporting ext from ext irq migrating. */ + if (!support_ext(hisi_l3c_pmu)) + return 0; + + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) + WARN_ON(irq_set_affinity(hisi_l3c_pmu->ext_irq[i], + cpumask_of(l3c_pmu->on_cpu))); + + return 0; +} + +static int hisi_l3c_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct hisi_pmu *l3c_pmu = hlist_entry_safe(node, struct hisi_pmu, node); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + int ret, i; + + ret = hisi_uncore_pmu_offline_cpu(cpu, node); + if (ret) + return ret; + + /* If failed to find any available CPU, skip irq migration. 
*/ + if (l3c_pmu->on_cpu < 0) + return 0; + + /* Avoid L3C pmu not supporting ext from ext irq migrating. */ + if (!support_ext(hisi_l3c_pmu)) + return 0; + + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) + WARN_ON(irq_set_affinity(hisi_l3c_pmu->ext_irq[i], + cpumask_of(l3c_pmu->on_cpu))); + + return 0; +} + static int __init hisi_l3c_pmu_module_init(void) { int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, "AP_PERF_ARM_HISI_L3_ONLINE", - hisi_uncore_pmu_online_cpu, - hisi_uncore_pmu_offline_cpu); + hisi_l3c_pmu_online_cpu, + hisi_l3c_pmu_offline_cpu); if (ret) { pr_err("L3C PMU: Error setup hotplug, ret = %d\n", ret); return ret; diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h index 8649be6f716ac9..3ffe6acda65391 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.h +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h @@ -24,7 +24,7 @@ #define pr_fmt(fmt) "hisi_pmu: " fmt #define HISI_PMU_V2 0x30 -#define HISI_MAX_COUNTERS 0x10 +#define HISI_MAX_COUNTERS 0x18 #define to_hisi_pmu(p) (container_of(p, struct hisi_pmu, pmu)) #define HISI_PMU_ATTR(_name, _func, _config) \ From 272dd0e5e58d9c216771aa4a9dc1e36a662792da Mon Sep 17 00:00:00 2001 From: Yushan Wang Date: Fri, 29 Aug 2025 18:14:26 +0800 Subject: [PATCH 2484/3206] Documentation: hisi-pmu: Fix of minor format error The inline path of sysfs should be placed in literal blocks to make documentation look better. Acked-by: Jonathan Cameron Acked-by: Yicong Yang Signed-off-by: Yushan Wang Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/hisi-pmu.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst index 6f0ea4f641cc23..8df048c2649875 100644 --- a/Documentation/admin-guide/perf/hisi-pmu.rst +++ b/Documentation/admin-guide/perf/hisi-pmu.rst @@ -18,9 +18,10 @@ HiSilicon SoC uncore PMU driver Each device PMU has separate registers for event counting, control and interrupt, and the PMU driver shall register perf PMU drivers like L3C, HHA and DDRC etc. The available events and configuration options shall -be described in the sysfs, see: +be described in the sysfs, see:: + +/sys/bus/event_source/devices/hisi_sccl{X}_ -/sys/bus/event_source/devices/hisi_sccl{X}_. The "perf list" command shall list the available events from sysfs. Each L3C, HHA and DDRC is registered as a separate PMU with perf. The PMU From 6d2f913fda5683fbd4c3580262e10386c1263dfb Mon Sep 17 00:00:00 2001 From: Yushan Wang Date: Fri, 29 Aug 2025 18:14:27 +0800 Subject: [PATCH 2485/3206] Documentation: hisi-pmu: Add introduction to HiSilicon V3 PMU Some of HiSilicon V3 PMU hardware is divided into parts to fulfill the job of monitoring specific parts of a device. Add description on that as well as the newly added ext option for L3C PMU. Acked-by: Jonathan Cameron Signed-off-by: Yushan Wang Reviewed-by: Yicong Yang Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/hisi-pmu.rst | 33 +++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst index 8df048c2649875..c4c2cbbf88cb82 100644 --- a/Documentation/admin-guide/perf/hisi-pmu.rst +++ b/Documentation/admin-guide/perf/hisi-pmu.rst @@ -124,6 +124,39 @@ channel with this option. The current supported channels are as follows: 7. tt_en: NoC PMU supports counting only transactions that have tracetag set if this option is set. 
See the 2nd list for more information about tracetag.
 
+For HiSilicon uncore PMU v3, whose identifier is 0x40, some uncore PMUs are
+further divided into parts for a finer granularity of tracing. Each part has
+its own dedicated PMU, and all such PMUs together cover the monitoring of
+events on a particular uncore device. Such PMUs are described in sysfs with a
+slightly changed name format::
+
+/sys/bus/event_source/devices/hisi_sccl{X}_
+
+Z is the sub-id, indicating the different PMUs for parts of the hardware device.
+
+Usage of most PMUs with different sub-ids is identical. Specifically, the L3C
+PMU provides the ``ext`` option to allow exploration of even finer-grained
+statistics of the L3C PMU. The L3C PMU driver uses that as a hint of
+termination when delivering the perf command to hardware:
+
+- ext=0: Default, can be used with event names.
+- ext=1 and ext=2: Must be used with event codes; event names are not supported.
+
+An example perf command could be::
+
+  $# perf stat -a -e hisi_sccl0_l3c1_0/rd_spipe/ sleep 5
+
+or::
+
+  $# perf stat -a -e hisi_sccl0_l3c1_0/event=0x1,ext=1/ sleep 5
+
+As above, ``hisi_sccl0_l3c1_0`` locates the PMU of Super CPU CLuster 0, L3
+cache 1, pipe 0.
+
+The first command locates the first part of the L3C since ``ext=0`` is implied
+by default. The second command issues the counting on another part of the L3C
+with the event ``0x1``.
+
 Users could configure IDs to count data coming from a specific CCL/ICL, by setting
 srcid_cmd & srcid_msk, and data destined for a specific CCL/ICL by setting
 tgtid_cmd & tgtid_msk. A set bit in srcid_msk/tgtid_msk means the PMU will not

From 878702702dbbd933a5da601c75b8e58eadeec311 Mon Sep 17 00:00:00 2001
From: Sakari Ailus
Date: Mon, 22 Sep 2025 15:06:31 +0300
Subject: [PATCH 2486/3206] spi: ljca: Remove Wentong's e-mail address

Wentong's e-mail address no longer works, remove it.

Signed-off-by: Sakari Ailus
Link: https://patch.msgid.link/20250922120632.10460-4-sakari.ailus@linux.intel.com
Signed-off-by: Mark Brown
---
 drivers/spi/spi-ljca.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/spi/spi-ljca.c b/drivers/spi/spi-ljca.c
index 2cab79ad2b98f2..3f412cf8f1cd97 100644
--- a/drivers/spi/spi-ljca.c
+++ b/drivers/spi/spi-ljca.c
@@ -289,7 +289,7 @@ static struct auxiliary_driver ljca_spi_driver = {
 };
 module_auxiliary_driver(ljca_spi_driver);
 
-MODULE_AUTHOR("Wentong Wu ");
+MODULE_AUTHOR("Wentong Wu");
 MODULE_AUTHOR("Zhifeng Wang ");
 MODULE_AUTHOR("Lixu Zhang ");
 MODULE_DESCRIPTION("Intel La Jolla Cove Adapter USB-SPI driver");

From da9e5c04be589524101aac31746902b6803581e4 Mon Sep 17 00:00:00 2001
From: Can Peng
Date: Fri, 19 Sep 2025 18:00:42 +0800
Subject: [PATCH 2487/3206] arm/syscalls: mark syscall invocation as likely in
 invoke_syscall

The invoke_syscall() function is overwhelmingly called for valid system
call entries. Annotate the main path with likely() to help the compiler
generate better branch prediction hints, reducing CPU pipeline stalls
due to mispredictions.

This is a micro-optimization targeting syscall-heavy workloads [1].
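For reference, likely()/unlikely() reduce to compiler branch-prediction
hints; below is a minimal sketch of the mechanism, using the generic
definitions from include/linux/compiler.h (the demo dispatcher is
hypothetical, not the arm64 code):

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

/* Hypothetical dispatcher: the in-range case is the hot path. */
static long demo_invoke(unsigned int nr, long (*table[])(void),
			unsigned int nr_max)
{
	if (likely(nr < nr_max))
		return table[nr]();	/* laid out as the fall-through path */

	return -1;			/* cold path: invalid entry */
}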
Link: https://lore.kernel.org/r/20250922121730.986761-1-pengcan@kylinos.cn [1] Signed-off-by: Can Peng Signed-off-by: Will Deacon --- arch/arm64/kernel/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index c442fcec6b9e8c..aba7ca6bca2d1a 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -43,7 +43,7 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, add_random_kstack_offset(); - if (scno < sc_nr) { + if (likely(scno < sc_nr)) { syscall_fn_t syscall_fn; syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; ret = __invoke_syscall(regs, syscall_fn); From 4f115596133fa168bac06bb34c6efd8f4d84c22e Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Sun, 21 Sep 2025 23:58:15 +0300 Subject: [PATCH 2488/3206] x86/Kconfig: Reenable PTDUMP on i386 The commit f9aad622006bd64c ("mm: rename GENERIC_PTDUMP and PTDUMP_CORE") has broken PTDUMP and the Kconfig options that use it on ARCH=i386, including CONFIG_DEBUG_WX. CONFIG_GENERIC_PTDUMP was renamed into CONFIG_ARCH_HAS_PTDUMP, but it was mistakenly moved from "config X86" to "config X86_64". That made PTDUMP unavailable for i386. Move CONFIG_ARCH_HAS_PTDUMP back to "config X86" to fix it. [ bp: Massage commit message. ] Fixes: f9aad622006bd64c ("mm: rename GENERIC_PTDUMP and PTDUMP_CORE") Signed-off-by: Alexander Popov Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52c8910ba2efdd..05880301212e3a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -26,7 +26,6 @@ config X86_64 depends on 64BIT # Options that are inherently 64-bit kernel only: select ARCH_HAS_GIGANTIC_PAGE - select ARCH_HAS_PTDUMP select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK @@ -99,6 +98,7 @@ config X86 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PREEMPT_LAZY + select ARCH_HAS_PTDUMP select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_HW_PTE_YOUNG select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2 From 8535bd38b4d58a3d19bf8e7dfa66e1d8180b316a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:35 +0200 Subject: [PATCH 2489/3206] cgroup: add missing ns_common include Add the missing include of the ns_common header. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup_namespace.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h index b7dbf4d623d26b..81ccbdee425b79 100644 --- a/include/linux/cgroup_namespace.h +++ b/include/linux/cgroup_namespace.h @@ -2,6 +2,8 @@ #ifndef _LINUX_CGROUP_NAMESPACE_H #define _LINUX_CGROUP_NAMESPACE_H +#include + struct cgroup_namespace { struct ns_common ns; struct user_namespace *user_ns; From d7610cb7454bbd8bf6d58f71b0ed57155d3c545f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:36 +0200 Subject: [PATCH 2490/3206] ns: simplify ns_common_init() further Simply derive the ns operations from the namespace type. 
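For illustration, a minimal self-contained sketch of the _Generic-based
dispatch this relies on; the two namespace types and ops strings below
are stand-ins, not the kernel's:

#include <stdio.h>

struct mnt_ns { int id; };
struct net_ns { int id; };

static const char *const mnt_ops = "mnt ops";
static const char *const net_ops = "net ops";

/* Select the matching ops purely from the pointer type. */
#define ns_ops(ns) _Generic((ns),		\
	struct mnt_ns *: mnt_ops,		\
	struct net_ns *: net_ops)

int main(void)
{
	struct mnt_ns m = { 1 };

	puts(ns_ops(&m));	/* prints "mnt ops" */
	return 0;
}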
Acked-by: Thomas Gleixner Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- include/linux/ns_common.h | 30 ++++++++++++++++++++++++++---- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 9 +-------- 9 files changed, 35 insertions(+), 20 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 271cd6294c8a73..d65917ec5544fe 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4104,9 +4104,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } if (anon) - ret = ns_common_init_inum(new_ns, &mntns_operations, MNT_NS_ANON_INO); + ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); else - ret = ns_common_init(new_ns, &mntns_operations); + ret = ns_common_init(new_ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index aea8528d799a60..56492cd9ff8d43 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -25,6 +25,17 @@ extern struct time_namespace init_time_ns; extern struct user_namespace init_user_ns; extern struct uts_namespace init_uts_ns; +extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations utsns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -84,10 +95,21 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: &init_user_ns, \ struct uts_namespace *: &init_uts_ns) -#define ns_common_init(__ns, __ops) \ - __ns_common_init(to_ns_common(__ns), __ops, (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) - -#define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) + +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), (((__ns) == ns_init_ns(__ns)) ? 
ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) diff --git a/ipc/namespace.c b/ipc/namespace.c index bd85d1c9d2c213..d89dfd718d2b12 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -62,7 +62,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_common_init(ns, &ipcns_operations); + err = ns_common_init(ns); if (err) goto fail_free; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 16ead750837109..04c98338ac08d1 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -27,7 +27,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_common_init(new_ns, &cgroupns_operations); + ret = ns_common_init(new_ns); if (ret) return ERR_PTR(ret); ns_tree_add(new_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 162f5fb63d75a8..a262a3f1944321 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -103,7 +103,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_common_init(ns, &pidns_operations); + err = ns_common_init(ns); if (err) goto out_free_idr; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 7aa4d6fedd4961..9f26e61be044f3 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -97,7 +97,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns->vvar_page) goto fail_free; - err = ns_common_init(ns, &timens_operations); + err = ns_common_init(ns); if (err) goto fail_free_page; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f9df45c4623525..e1559e8a8a02c5 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -126,7 +126,7 @@ int create_user_ns(struct cred *new) ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_common_init(ns, &userns_operations); + ret = ns_common_init(ns); if (ret) goto fail_free; diff --git a/kernel/utsname.c b/kernel/utsname.c index 95d733eb2c985b..00001592ad134b 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -50,7 +50,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - err = ns_common_init(ns, &utsns_operations); + err = ns_common_init(ns); if (err) goto fail_free; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index d5e3fd81916316..bdea7d5fac5608 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -400,16 +400,9 @@ static __net_init void preinit_net_sysctl(struct net *net) /* init code that must occur even if setup_net() is not called. */ static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) { - const struct proc_ns_operations *ns_ops; int ret; -#ifdef CONFIG_NET_NS - ns_ops = &netns_operations; -#else - ns_ops = NULL; -#endif - - ret = ns_common_init(net, ns_ops); + ret = ns_common_init(net); if (ret) return ret; From 5890f504ef543190beae2a4e244bbfa7c3e0b57c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:37 +0200 Subject: [PATCH 2491/3206] ns: add ns_debug() Add ns_debug() that asserts that the correct operations are used for the namespace type. 
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/nscommon.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 7aa2be6a0c32a7..3cef89ddef41a3 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -2,6 +2,55 @@ #include #include +#include + +#ifdef CONFIG_DEBUG_VFS +static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) +{ + switch (ns->ops->type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + VFS_WARN_ON_ONCE(ops != &cgroupns_operations); + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + VFS_WARN_ON_ONCE(ops != &ipcns_operations); + break; +#endif + case CLONE_NEWNS: + VFS_WARN_ON_ONCE(ops != &mntns_operations); + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + VFS_WARN_ON_ONCE(ops != &netns_operations); + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + VFS_WARN_ON_ONCE(ops != &pidns_operations); + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + VFS_WARN_ON_ONCE(ops != &timens_operations); + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + VFS_WARN_ON_ONCE(ops != &userns_operations); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + VFS_WARN_ON_ONCE(ops != &utsns_operations); + break; +#endif + default: + VFS_WARN_ON_ONCE(true); + } +} +#endif int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { @@ -12,6 +61,10 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); +#ifdef CONFIG_DEBUG_VFS + ns_debug(ns, ops); +#endif + if (inum) { ns->inum = inum; return 0; From e11727b2b0ca23d147c4d42f494a59aba0749c89 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Thu, 18 Sep 2025 16:53:41 +0200 Subject: [PATCH 2492/3206] s390/configs: Enable additional network features Enable AF_XDP, kTLS, and Mellanox subfunctions to accelerate network packet processing. 
Signed-off-by: Hendrik Brueckner Signed-off-by: Alexander Gordeev --- arch/s390/configs/debug_defconfig | 6 ++++++ arch/s390/configs/defconfig | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 6b33429f1c4d4b..e68c56ff6fd835 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -114,8 +114,13 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +CONFIG_TLS_TOE=y CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y @@ -547,6 +552,7 @@ CONFIG_NLMON=m CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_SF=y # CONFIG_NET_VENDOR_META is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index b75eb277585072..d6be7372f41ebf 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -105,8 +105,13 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +CONFIG_TLS_TOE=y CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y @@ -537,6 +542,7 @@ CONFIG_NLMON=m CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_SF=y # CONFIG_NET_VENDOR_META is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set From c6ccc4dde17676dfe617b9a37bd9ba19a8fc87ee Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 20 Sep 2025 22:09:55 +0200 Subject: [PATCH 2493/3206] gpiolib: Extend software-node support to support secondary software-nodes When a software-node gets added to a device which already has another fwnode as primary node it will become the secondary fwnode for that device. Currently if a software-node with GPIO properties ends up as the secondary fwnode then gpiod_find_by_fwnode() will fail to find the GPIOs. Add a new gpiod_fwnode_lookup() helper which falls back to calling gpiod_find_by_fwnode() with the secondary fwnode if the GPIO was not found in the primary fwnode. 
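As context, a rough consumer-side sketch of the case being fixed. The
node name, con_id and GPIO offset are made-up values, and registration
of the controller node is omitted; the software node ends up as the
secondary fwnode because the device already has a primary one:

#include <dt-bindings/gpio/gpio.h>
#include <linux/err.h>
#include <linux/gpio/consumer.h>
#include <linux/gpio/property.h>
#include <linux/property.h>

/* Hypothetical node naming the GPIO controller by its gpiochip label. */
static const struct software_node demo_gpiochip_node = {
	.name = "gpio-demo-bank",
};

static const struct property_entry demo_props[] = {
	PROPERTY_ENTRY_GPIO("reset-gpios", &demo_gpiochip_node, 7,
			    GPIO_ACTIVE_LOW),
	{ }
};

static int demo_probe(struct device *dev)
{
	struct gpio_desc *reset;
	int ret;

	/*
	 * @dev already has a DT/ACPI fwnode, so this software node is
	 * attached as dev_fwnode(dev)->secondary.
	 */
	ret = device_create_managed_software_node(dev, demo_props, NULL);
	if (ret)
		return ret;

	/*
	 * Without the fix this lookup failed; now it falls back to the
	 * secondary fwnode.
	 */
	reset = gpiod_get(dev, "reset", GPIOD_OUT_HIGH);
	return PTR_ERR_OR_ZERO(reset);
}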
Fixes: e7f9ff5dc90c ("gpiolib: add support for software nodes")
Cc: stable@vger.kernel.org
Signed-off-by: Hans de Goede
Reviewed-by: Dmitry Torokhov
Link: https://lore.kernel.org/r/20250920200955.20403-1-hansg@kernel.org
Signed-off-by: Bartosz Golaszewski
---
 drivers/gpio/gpiolib.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 0d2b470a252eeb..74d54513730a70 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -4604,6 +4604,23 @@ static struct gpio_desc *gpiod_find_by_fwnode(struct fwnode_handle *fwnode,
 	return desc;
 }
 
+static struct gpio_desc *gpiod_fwnode_lookup(struct fwnode_handle *fwnode,
+					     struct device *consumer,
+					     const char *con_id,
+					     unsigned int idx,
+					     enum gpiod_flags *flags,
+					     unsigned long *lookupflags)
+{
+	struct gpio_desc *desc;
+
+	desc = gpiod_find_by_fwnode(fwnode, consumer, con_id, idx, flags, lookupflags);
+	if (gpiod_not_found(desc) && !IS_ERR_OR_NULL(fwnode))
+		desc = gpiod_find_by_fwnode(fwnode->secondary, consumer, con_id,
+					    idx, flags, lookupflags);
+
+	return desc;
+}
+
 struct gpio_desc *gpiod_find_and_request(struct device *consumer,
 					 struct fwnode_handle *fwnode,
 					 const char *con_id,
@@ -4622,8 +4639,8 @@ struct gpio_desc *gpiod_find_and_request(struct device *consumer,
 	int ret = 0;
 
 	scoped_guard(srcu, &gpio_devices_srcu) {
-		desc = gpiod_find_by_fwnode(fwnode, consumer, con_id, idx,
-					    &flags, &lookupflags);
+		desc = gpiod_fwnode_lookup(fwnode, consumer, con_id, idx,
+					   &flags, &lookupflags);
 		if (gpiod_not_found(desc) && platform_lookup_allowed) {
 			/*
 			 * Either we are not using DT or ACPI, or their lookup

From 302a1f674c00dd5581ab8e493ef44767c5101aab Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz
Date: Mon, 25 Aug 2025 10:03:07 -0400
Subject: [PATCH 2494/3206] Bluetooth: MGMT: Fix possible UAFs

This attempts to fix possible UAFs caused by a struct mgmt_pending being
freed while still being processed, as shown in the following trace. In
order to fix this, mgmt_pending_valid is introduced and used to check
that the mgmt_pending hasn't been removed from the pending list; in the
complete callbacks it is used to both check and remove the cmd from the
list while holding mgmt_pending_lock, to avoid TOCTOU problems, since if
the cmd is left on the list it can still be accessed and freed.
BUG: KASAN: slab-use-after-free in mgmt_add_adv_patterns_monitor_sync+0x35/0x50 net/bluetooth/mgmt.c:5223 Read of size 8 at addr ffff8880709d4dc0 by task kworker/u11:0/55 CPU: 0 UID: 0 PID: 55 Comm: kworker/u11:0 Not tainted 6.16.4 #2 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Workqueue: hci0 hci_cmd_sync_work Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x240 mm/kasan/report.c:482 kasan_report+0x118/0x150 mm/kasan/report.c:595 mgmt_add_adv_patterns_monitor_sync+0x35/0x50 net/bluetooth/mgmt.c:5223 hci_cmd_sync_work+0x210/0x3a0 net/bluetooth/hci_sync.c:332 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xade/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x711/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16.4/arch/x86/entry/entry_64.S:245 Allocated by task 12210: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4364 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] mgmt_pending_new+0x65/0x1e0 net/bluetooth/mgmt_util.c:269 mgmt_pending_add+0x35/0x140 net/bluetooth/mgmt_util.c:296 __add_adv_patterns_monitor+0x130/0x200 net/bluetooth/mgmt.c:5247 add_adv_patterns_monitor+0x214/0x360 net/bluetooth/mgmt.c:5364 hci_mgmt_cmd+0x9c9/0xef0 net/bluetooth/hci_sock.c:1719 hci_sock_sendmsg+0x6ca/0xef0 net/bluetooth/hci_sock.c:1839 sock_sendmsg_nosec net/socket.c:714 [inline] __sock_sendmsg+0x219/0x270 net/socket.c:729 sock_write_iter+0x258/0x330 net/socket.c:1133 new_sync_write fs/read_write.c:593 [inline] vfs_write+0x5c9/0xb30 fs/read_write.c:686 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 12221: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4648 [inline] kfree+0x18e/0x440 mm/slub.c:4847 mgmt_pending_free net/bluetooth/mgmt_util.c:311 [inline] mgmt_pending_foreach+0x30d/0x380 net/bluetooth/mgmt_util.c:257 __mgmt_power_off+0x169/0x350 net/bluetooth/mgmt.c:9444 hci_dev_close_sync+0x754/0x1330 net/bluetooth/hci_sync.c:5290 hci_dev_do_close net/bluetooth/hci_core.c:501 [inline] hci_dev_close+0x108/0x200 net/bluetooth/hci_core.c:526 sock_do_ioctl+0xd9/0x300 net/socket.c:1192 sock_ioctl+0x576/0x790 net/socket.c:1313 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xf9/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED") Fixes: 2bd1b237616b ("Bluetooth: hci_sync: Convert MGMT_OP_SET_DISCOVERABLE to use 
cmd_sync") Fixes: f056a65783cc ("Bluetooth: hci_sync: Convert MGMT_OP_SET_CONNECTABLE to use cmd_sync") Fixes: 3244845c6307 ("Bluetooth: hci_sync: Convert MGMT_OP_SSP") Fixes: d81a494c43df ("Bluetooth: hci_sync: Convert MGMT_OP_SET_LE") Fixes: b338d91703fa ("Bluetooth: Implement support for Mesh") Fixes: 6f6ff38a1e14 ("Bluetooth: hci_sync: Convert MGMT_OP_SET_LOCAL_NAME") Fixes: 71efbb08b538 ("Bluetooth: hci_sync: Convert MGMT_OP_SET_PHY_CONFIGURATION") Fixes: b747a83690c8 ("Bluetooth: hci_sync: Refactor add Adv Monitor") Fixes: abfeea476c68 ("Bluetooth: hci_sync: Convert MGMT_OP_START_DISCOVERY") Fixes: 26ac4c56f03f ("Bluetooth: hci_sync: Convert MGMT_OP_SET_ADVERTISING") Reported-by: cen zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 259 ++++++++++++++++++++++++++------------ net/bluetooth/mgmt_util.c | 46 +++++++ net/bluetooth/mgmt_util.h | 3 + 3 files changed, 231 insertions(+), 77 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 50634ef5c8b707..225140fcb3d6c8 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1323,8 +1323,7 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) struct mgmt_mode *cp; /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; cp = cmd->param; @@ -1351,23 +1350,29 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) mgmt_status(err)); } - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_powered_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp; + struct mgmt_mode cp; + + mutex_lock(&hdev->mgmt_pending_lock); /* Make sure cmd still outstanding. */ - if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); return -ECANCELED; + } - cp = cmd->param; + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); BT_DBG("%s", hdev->name); - return hci_set_powered_sync(hdev, cp->val); + return hci_set_powered_sync(hdev, cp.val); } static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, @@ -1516,8 +1521,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; hci_dev_lock(hdev); @@ -1539,12 +1543,15 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); } static int set_discoverable_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + BT_DBG("%s", hdev->name); return hci_update_discoverable_sync(hdev); @@ -1691,8 +1698,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding. 
*/ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; hci_dev_lock(hdev); @@ -1707,7 +1713,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); } @@ -1743,6 +1749,9 @@ static int set_connectable_update_settings(struct hci_dev *hdev, static int set_connectable_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + BT_DBG("%s", hdev->name); return hci_update_connectable_sync(hdev); @@ -1919,14 +1928,17 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) { struct cmd_lookup match = { NULL, hdev }; struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 enable = cp->val; + struct mgmt_mode *cp; + u8 enable; bool changed; /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || cmd != pending_find(MGMT_OP_SET_SSP, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; + cp = cmd->param; + enable = cp->val; + if (err) { u8 mgmt_err = mgmt_status(err); @@ -1935,8 +1947,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) new_settings(hdev, NULL); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, - cmd_status_rsp, &mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); return; } @@ -1946,7 +1957,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, settings_rsp, &match); + settings_rsp(cmd, &match); if (changed) new_settings(hdev, match.sk); @@ -1960,14 +1971,25 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) static int set_ssp_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; + struct mgmt_mode cp; bool changed = false; int err; - if (cp->val) + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + if (cp.val) changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED); - err = hci_write_ssp_mode_sync(hdev, cp->val); + err = hci_write_ssp_mode_sync(hdev, cp.val); if (!err && changed) hci_dev_clear_flag(hdev, HCI_SSP_ENABLED); @@ -2060,32 +2082,50 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) static void set_le_complete(struct hci_dev *hdev, void *data, int err) { + struct mgmt_pending_cmd *cmd = data; struct cmd_lookup match = { NULL, hdev }; u8 status = mgmt_status(err); bt_dev_dbg(hdev, "err %d", err); - if (status) { - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, cmd_status_rsp, - &status); + if (err == -ECANCELED || !mgmt_pending_valid(hdev, data)) return; + + if (status) { + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status); + goto done; } - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, settings_rsp, &match); + settings_rsp(cmd, &match); new_settings(hdev, match.sk); if (match.sk) sock_put(match.sk); + +done: + mgmt_pending_free(cmd); } static int set_le_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 val = !!cp->val; + struct mgmt_mode cp; + u8 val; int err; 
+ mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + val = !!cp.val; + + mutex_unlock(&hdev->mgmt_pending_lock); + if (!val) { hci_clear_adv_instance_sync(hdev, NULL, 0x00, true); @@ -2127,7 +2167,12 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; u8 status = mgmt_status(err); - struct sock *sk = cmd->sk; + struct sock *sk; + + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) + return; + + sk = cmd->sk; if (status) { mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true, @@ -2142,24 +2187,37 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) static int set_mesh_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_mesh *cp = cmd->param; - size_t len = cmd->param_len; + struct mgmt_cp_set_mesh cp; + size_t len; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + len = cmd->param_len; memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types)); - if (cp->enable) + if (cp.enable) hci_dev_set_flag(hdev, HCI_MESH); else hci_dev_clear_flag(hdev, HCI_MESH); - hdev->le_scan_interval = __le16_to_cpu(cp->period); - hdev->le_scan_window = __le16_to_cpu(cp->window); + hdev->le_scan_interval = __le16_to_cpu(cp.period); + hdev->le_scan_window = __le16_to_cpu(cp.window); - len -= sizeof(*cp); + len -= sizeof(cp); /* If filters don't fit, forward all adv pkts */ if (len <= sizeof(hdev->mesh_ad_types)) - memcpy(hdev->mesh_ad_types, cp->ad_types, len); + memcpy(hdev->mesh_ad_types, cp.ad_types, len); hci_update_passive_scan_sync(hdev); return 0; @@ -3867,15 +3925,16 @@ static int name_changed_sync(struct hci_dev *hdev, void *data) static void set_name_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_local_name *cp = cmd->param; + struct mgmt_cp_set_local_name *cp; u8 status = mgmt_status(err); bt_dev_dbg(hdev, "err %d", err); - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; + cp = cmd->param; + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, status); @@ -3887,16 +3946,27 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) hci_cmd_sync_queue(hdev, name_changed_sync, NULL, NULL); } - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_name_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_local_name *cp = cmd->param; + struct mgmt_cp_set_local_name cp; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); if (lmp_bredr_capable(hdev)) { - hci_update_name_sync(hdev, cp->name); + hci_update_name_sync(hdev, cp.name); hci_update_eir_sync(hdev); } @@ -4048,12 +4118,10 @@ int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip) static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct sk_buff *skb = 
cmd->skb; + struct sk_buff *skb; u8 status = mgmt_status(err); - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) - return; + skb = cmd->skb; if (!status) { if (!skb) @@ -4080,7 +4148,7 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) if (skb && !IS_ERR(skb)) kfree_skb(skb); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_default_phy_sync(struct hci_dev *hdev, void *data) @@ -4088,7 +4156,9 @@ static int set_default_phy_sync(struct hci_dev *hdev, void *data) struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_set_phy_configuration *cp = cmd->param; struct hci_cp_le_set_default_phy cp_phy; - u32 selected_phys = __le32_to_cpu(cp->selected_phys); + u32 selected_phys; + + selected_phys = __le32_to_cpu(cp->selected_phys); memset(&cp_phy, 0, sizeof(cp_phy)); @@ -4228,7 +4298,7 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, goto unlock; } - cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, + cmd = mgmt_pending_new(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, len); if (!cmd) err = -ENOMEM; @@ -5189,7 +5259,17 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, { struct mgmt_rp_add_adv_patterns_monitor rp; struct mgmt_pending_cmd *cmd = data; - struct adv_monitor *monitor = cmd->user_data; + struct adv_monitor *monitor; + + /* This is likely the result of hdev being closed and mgmt_index_removed + * is attempting to clean up any pending command so + * hci_adv_monitors_clear is about to be called which will take care of + * freeing the adv_monitor instances. + */ + if (status == -ECANCELED && !mgmt_pending_valid(hdev, cmd)) + return; + + monitor = cmd->user_data; hci_dev_lock(hdev); @@ -5215,9 +5295,20 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, static int mgmt_add_adv_patterns_monitor_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct adv_monitor *monitor = cmd->user_data; + struct adv_monitor *mon; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + mon = cmd->user_data; + + mutex_unlock(&hdev->mgmt_pending_lock); - return hci_add_adv_monitor(hdev, monitor); + return hci_add_adv_monitor(hdev, mon); } static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, @@ -5484,7 +5575,8 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, status); } -static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int err) +static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, + int err) { struct mgmt_rp_read_local_oob_data mgmt_rp; size_t rp_size = sizeof(mgmt_rp); @@ -5504,7 +5596,8 @@ static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int e bt_dev_dbg(hdev, "status %d", status); if (status) { - mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, status); + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, + status); goto remove; } @@ -5786,17 +5879,12 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - if (err == -ECANCELED) - return; - - if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) && - cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) && - cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; 
mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED: DISCOVERY_FINDING); @@ -5804,6 +5892,9 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) static int start_discovery_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + return hci_start_discovery_sync(hdev); } @@ -6009,15 +6100,14 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); if (!err) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); @@ -6025,6 +6115,9 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) static int stop_discovery_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + return hci_stop_discovery_sync(hdev); } @@ -6234,14 +6327,18 @@ static void enable_advertising_instance(struct hci_dev *hdev, int err) static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) { + struct mgmt_pending_cmd *cmd = data; struct cmd_lookup match = { NULL, hdev }; u8 instance; struct adv_info *adv_instance; u8 status = mgmt_status(err); + if (err == -ECANCELED || !mgmt_pending_valid(hdev, data)) + return; + if (status) { - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, - cmd_status_rsp, &status); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status); + mgmt_pending_free(cmd); return; } @@ -6250,8 +6347,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) else hci_dev_clear_flag(hdev, HCI_ADVERTISING); - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, settings_rsp, - &match); + settings_rsp(cmd, &match); new_settings(hdev, match.sk); @@ -6283,10 +6379,23 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) static int set_adv_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 val = !!cp->val; + struct mgmt_mode cp; + u8 val; - if (cp->val == 0x02) + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + val = !!cp.val; + + if (cp.val == 0x02) hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE); else hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE); @@ -8039,10 +8148,6 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, u8 status = mgmt_status(err); u16 eir_len; - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) - return; - if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -8149,7 +8254,7 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, kfree_skb(skb); kfree(mgmt_rp); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk, @@ -8158,7 +8263,7 @@ static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk, 
struct mgmt_pending_cmd *cmd; int err; - cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev, + cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev, cp, sizeof(*cp)); if (!cmd) return -ENOMEM; diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index a88a07da394734..aa7b5585cb268b 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -320,6 +320,52 @@ void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) mgmt_pending_free(cmd); } +bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + struct mgmt_pending_cmd *tmp; + + lockdep_assert_held(&hdev->mgmt_pending_lock); + + if (!cmd) + return false; + + list_for_each_entry(tmp, &hdev->mgmt_pending, list) { + if (cmd == tmp) + return true; + } + + return false; +} + +bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + bool listed; + + mutex_lock(&hdev->mgmt_pending_lock); + listed = __mgmt_pending_listed(hdev, cmd); + mutex_unlock(&hdev->mgmt_pending_lock); + + return listed; +} + +bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + bool listed; + + if (!cmd) + return false; + + mutex_lock(&hdev->mgmt_pending_lock); + + listed = __mgmt_pending_listed(hdev, cmd); + if (listed) + list_del(&cmd->list); + + mutex_unlock(&hdev->mgmt_pending_lock); + + return listed; +} + void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk) diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h index 024e51dd693756..bcba8c9d895285 100644 --- a/net/bluetooth/mgmt_util.h +++ b/net/bluetooth/mgmt_util.h @@ -65,6 +65,9 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, void *data, u16 len); void mgmt_pending_free(struct mgmt_pending_cmd *cmd); void mgmt_pending_remove(struct mgmt_pending_cmd *cmd); +bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); +bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); +bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk); From 3bd44edd6c55828fd4e11cb0efce5b7160bfa2de Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Mon, 22 Sep 2025 17:24:21 +0300 Subject: [PATCH 2495/3206] gpio: regmap: fix memory leak of gpio_regmap structure The gpio_regmap structure is leaked on the error path. Fix this by jumping to the appropriate kfree instead of returning directly. 
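The idiom being restored, as a minimal sketch (struct foo and
foo_setup() are placeholders, not the gpio-regmap symbols):

static struct foo *foo_register(void)
{
	struct foo *f;
	int ret;

	f = kzalloc(sizeof(*f), GFP_KERNEL);
	if (!f)
		return ERR_PTR(-ENOMEM);

	ret = foo_setup(f);	/* placeholder for a later init step */
	if (ret)
		goto err_free;	/* not "return ERR_PTR(ret);": that leaks f */

	return f;

err_free:
	kfree(f);		/* unwind everything allocated above */
	return ERR_PTR(ret);
}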
Fixes: db305161880a ("gpio: regmap: Allow ngpio to be read from the property") Signed-off-by: Ioana Ciornei Suggested-by: Michael Walle Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20250922142427.3310221-7-ioana.ciornei@nxp.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-regmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c index e8a32dfebdcb31..3f8b72311f8e84 100644 --- a/drivers/gpio/gpio-regmap.c +++ b/drivers/gpio/gpio-regmap.c @@ -274,7 +274,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config if (!chip->ngpio) { ret = gpiochip_get_ngpios(chip, chip->parent); if (ret) - return ERR_PTR(ret); + goto err_free_gpio; } /* if not set, assume there is only one register */ From ab073abf6d974d3fe998fc6731ca80e2b57ffd69 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Sep 2025 05:55:41 -0600 Subject: [PATCH 2496/3206] block: fix EOD return for device with nr_sectors == 0 A recent commit skipped dumping the usual "attempt to access beyond end of device" message if the device size is 0 sectors, as that's a common pattern for devices that have been hot removed. But while it stopped that message, it also prevented returning -EIO for that condition. Reinstate the -EIO return, while retaining the quiet operation for triggering EOD for a device with 0 sectors. Reported-by: syzbot+4b12286339fe4c2700c1@syzkaller.appspotmail.com Reported-by: Sahil Chandna Fixes: d0a2b527d8c3 ("block: tone down bio_check_eod") Tested-by: Sahil Chandna Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 4201504158a17e..a27185cd8edead 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -557,9 +557,11 @@ static inline int bio_check_eod(struct bio *bio) sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); - if (nr_sectors && maxsector && + if (nr_sectors && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { + if (!maxsector) + return -EIO; pr_info_ratelimited("%s: attempt to access beyond end of device\n" "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", current->comm, bio->bi_bdev, bio->bi_opf, From 0950c64ae38661bd97127e9aa0522f1624f82006 Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Mon, 22 Sep 2025 12:26:06 +0000 Subject: [PATCH 2497/3206] workqueue: fix texinfodocs warning for WQ_* flags reference Sphinx emitted a warning during make texinfodocs: WARNING: Inline literal start-string without end-string. This was caused by the trailing '*' in "%WQ_*" being parsed as reStructuredText markup in the kernel-doc comment. Escape the '*' in the comment so that Sphinx treats it as a literal character, resolving the warning. Signed-off-by: Kriish Sharma Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 71a9900c03c706..dabc351cc127d4 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -502,7 +502,7 @@ void workqueue_softirq_dead(unsigned int cpu); * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means * that the sum of per-node max_active's may be larger than @max_active. 
* - * For detailed information on %WQ_* flags, please refer to + * For detailed information on %WQ_\* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: From 500dad428e5b0de4c1bdfa893822a6e06ddad0b5 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Tue, 16 Sep 2025 19:00:28 +0200 Subject: [PATCH 2498/3206] drm/xe/vf: Don't expose sysfs attributes not applicable for VFs VFs can't read BMG_PCIE_CAP(0x138340) register nor access PCODE (already guarded by the info.skip_pcode flag) so we shouldn't expose attributes that require any of them to avoid errors like: [] xe 0000:03:00.1: [drm] Tile0: GT0: VF is trying to read an \ inaccessible register 0x138340+0x0 [] RIP: 0010:xe_gt_sriov_vf_read32+0x6c2/0x9a0 [xe] [] Call Trace: [] xe_mmio_read32+0x110/0x280 [xe] [] auto_link_downgrade_capable_show+0x2e/0x70 [xe] [] dev_attr_show+0x1a/0x70 [] sysfs_kf_seq_show+0xaa/0x120 [] kernfs_seq_show+0x41/0x60 Fixes: 0e414bf7ad01 ("drm/xe: Expose PCIe link downgrade attributes") Fixes: cdc36b66cd41 ("drm/xe: Expose fan control and voltage regulator version") Signed-off-by: Michal Wajdeczko Cc: Lucas De Marchi Cc: Lukasz Laguna Reviewed-by: Raag Jadav Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250916170029.3313-2-michal.wajdeczko@intel.com (cherry picked from commit a2d6223d224f333f705ed8495bf8bebfbc585c35) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c index 3e3b2d9033a7f8..927ee7991696ba 100644 --- a/drivers/gpu/drm/xe/xe_device_sysfs.c +++ b/drivers/gpu/drm/xe/xe_device_sysfs.c @@ -308,7 +308,7 @@ int xe_device_sysfs_init(struct xe_device *xe) return ret; } - if (xe->info.platform == XE_BATTLEMAGE) { + if (xe->info.platform == XE_BATTLEMAGE && !IS_SRIOV_VF(xe)) { ret = sysfs_create_files(&dev->kobj, auto_link_downgrade_attrs); if (ret) goto cleanup; From b67e7422d229dead0dddaad7e7c05558f24d552f Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 12 Sep 2025 14:54:51 -0700 Subject: [PATCH 2499/3206] drm/xe: Fix build with CONFIG_MODULES=n When building with CONFIG_MODULES=n, the __exit functions are dropped. However our init functions may call them for error handling, so they are not good candidates for the exit sections. Fix this error reported by 0day: ld.lld: error: relocation refers to a symbol in a discarded section: xe_configfs_exit >>> defined in vmlinux.a(drivers/gpu/drm/xe/xe_configfs.o) >>> referenced by xe_module.c >>> drivers/gpu/drm/xe/xe_module.o:(init_funcs) in archive vmlinux.a This is the only exit function using __exit. Drop it to fix the build. 
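The underlying rule, as a minimal sketch with hypothetical names: any
cleanup helper reachable from __init code must stay out of the __exit
section, because that section is discarded when CONFIG_MODULES=n:

int demo_setup_a(void);
int demo_setup_b(void);

/* Must NOT be __exit: it is also called on the init error path below. */
void demo_teardown(void)
{
	/* undo demo_setup_a() */
}

static int __init demo_init(void)
{
	int ret;

	ret = demo_setup_a();
	if (ret)
		return ret;

	ret = demo_setup_b();
	if (ret)
		demo_teardown();	/* this reference would point into a
					 * discarded section if the helper
					 * were marked __exit */
	return ret;
}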
Cc: Riana Tauro
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202506092221.1FmUQmI8-lkp@intel.com/
Fixes: 16280ded45fb ("drm/xe: Add configfs to enable survivability mode")
Reviewed-by: Balasubramani Vivekanandan
Link: https://lore.kernel.org/r/20250912-fix-nomodule-build-v1-1-d11b70a92516@intel.com
Signed-off-by: Lucas De Marchi
(cherry picked from commit d9b2623319fa20c2206754284291817488329648)
Signed-off-by: Rodrigo Vivi
---
 drivers/gpu/drm/xe/xe_configfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
index e9b46a2d00195a..58c1f397c68c94 100644
--- a/drivers/gpu/drm/xe/xe_configfs.c
+++ b/drivers/gpu/drm/xe/xe_configfs.c
@@ -404,7 +404,7 @@ int __init xe_configfs_init(void)
 	return 0;
 }
 
-void __exit xe_configfs_exit(void)
+void xe_configfs_exit(void)
 {
 	configfs_unregister_subsystem(&xe_configfs);
 }

From 77c8ede611c6a70a95f7b15648551d0121b40d6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?=
Date: Thu, 18 Sep 2025 11:22:05 +0200
Subject: [PATCH 2500/3206] drm/xe: Don't copy pinned kernel bos twice on
 suspend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We were copying the content of the bos on the list
"xe->pinned.late.kernel_bo_present" twice on suspend. Presumably the
intent is to copy the pinned external bos on the first pass.

This is harmless since we (currently) should have no pinned external
bos needing copy, since a) external system bos don't have compressed
content, and b) we do not (yet) allow pinning of VRAM bos.

Still, fix this up so that we copy pinned external bos on the first
pass. We're about to allow bos pinned in VRAM.

Fixes: c6a4d46ec1d7 ("drm/xe: evict user memory in PM notifier")
Cc: Matthew Auld
Cc: # v6.16+
Signed-off-by: Thomas Hellström
Reviewed-by: Matthew Auld
Link: https://lore.kernel.org/r/20250918092207.54472-2-thomas.hellstrom@linux.intel.com
(cherry picked from commit 9e69bafece43dcefec864f00b3ec7e088aa7fcbc)
Signed-off-by: Rodrigo Vivi
---
 drivers/gpu/drm/xe/xe_bo_evict.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_bo_evict.c b/drivers/gpu/drm/xe/xe_bo_evict.c
index 7484ce55a303d6..d5dbc51e8612d8 100644
--- a/drivers/gpu/drm/xe/xe_bo_evict.c
+++ b/drivers/gpu/drm/xe/xe_bo_evict.c
@@ -158,8 +158,8 @@ int xe_bo_evict_all(struct xe_device *xe)
 	if (ret)
 		return ret;
 
-	ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.kernel_bo_present,
-				    &xe->pinned.late.evicted, xe_bo_evict_pinned);
+	ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.external,
+				    &xe->pinned.late.external, xe_bo_evict_pinned);
 
 	if (!ret)
 		ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.kernel_bo_present,

From 55ed11b181c43d81ce03b50209e4e7c4a14ba099 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Sat, 20 Sep 2025 15:26:21 +0200
Subject: [PATCH 2501/3206] sched_ext: idle: Handle migration-disabled tasks in
 BPF code

When scx_bpf_select_cpu_dfl()/and() kfuncs are invoked outside of
ops.select_cpu() we can't rely on @p->migration_disabled to determine if
migration is disabled for the task @p.

In fact, migration is always disabled for the current task while running
BPF code: __bpf_prog_enter() disables migration and __bpf_prog_exit()
re-enables it.

To handle this, when @p->migration_disabled == 1, check whether @p is the
current task. If so, migration was not disabled before entering the
callback, otherwise migration was disabled.
This ensures correct idle CPU selection in all cases. The behavior of ops.select_cpu() remains unchanged, because this callback is never invoked for the current task and migration-disabled tasks are always excluded. Example: without this change scx_bpf_select_cpu_and() called from ops.enqueue() always returns -EBUSY; with this change applied, it correctly returns idle CPUs. Fixes: 06efc9fe0b8de ("sched_ext: idle: Handle migration-disabled tasks in idle selection") Cc: stable@vger.kernel.org # v6.16+ Signed-off-by: Andrea Righi Acked-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext_idle.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7174e1c1a392c7..537c6992bb63f7 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -856,6 +856,32 @@ static bool check_builtin_idle_enabled(void) return false; } +/* + * Determine whether @p is a migration-disabled task in the context of BPF + * code. + * + * We can't simply check whether @p->migration_disabled is set in a + * sched_ext callback, because migration is always disabled for the current + * task while running BPF code. + * + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively + * disable and re-enable migration. For this reason, the current task + * inside a sched_ext callback is always a migration-disabled task. + * + * Therefore, when @p->migration_disabled == 1, check whether @p is the + * current task or not: if it is, then migration was not disabled before + * entering the callback, otherwise migration was disabled. + * + * Returns true if @p is migration-disabled, false otherwise. + */ +static bool is_bpf_migration_disabled(const struct task_struct *p) +{ + if (p->migration_disabled == 1) + return p != current; + else + return p->migration_disabled; +} + static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *allowed, u64 flags) { @@ -898,7 +924,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f * selection optimizations and simply check whether the previously * used CPU is idle and within the allowed cpumask. */ - if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) { + if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) { if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && scx_idle_test_and_clear_cpu(prev_cpu)) cpu = prev_cpu; From 4d330fe54145ecfbb657ac01a554fdedf3c1927e Mon Sep 17 00:00:00 2001 From: Chen Pei Date: Sat, 13 Sep 2025 15:08:15 +0800 Subject: [PATCH 2502/3206] ACPI: SPCR: Support Precise Baud Rate field The Microsoft Serial Port Console Redirection (SPCR) specification revision 1.09 comprises additional field: Precise Baud Rate [1]. It is used to describe non-traditional baud rates (such as those used by high-speed UARTs). It contains a specific non-zero baud rate which overrides the value of the Configured Baud Rate field. If this field is zero or not present, Configured Baud Rate is used. Link: https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table [1] Signed-off-by: Chen Pei Link: https://patch.msgid.link/20250913070815.16758-1-cp0613@linux.alibaba.com [ rjw: Corrected typo in the subject ] Signed-off-by: Rafael J. 
Wysocki
---
 drivers/acpi/spcr.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/spcr.c b/drivers/acpi/spcr.c
index 208d6bbc65e037..d4d52d5e9016ca 100644
--- a/drivers/acpi/spcr.c
+++ b/drivers/acpi/spcr.c
@@ -149,7 +149,15 @@ int __init acpi_parse_spcr(bool enable_earlycon, bool enable_console)
 		goto done;
 	}
 
-	switch (table->baud_rate) {
+	/*
+	 * SPCR 1.09 defines that the Precise Baud Rate field contains a
+	 * specific non-zero baud rate which overrides the value of the
+	 * Configured Baud Rate field. If this field is zero or not present,
+	 * Configured Baud Rate is used.
+	 */
+	if (table->precise_baudrate)
+		baud_rate = table->precise_baudrate;
+	else switch (table->baud_rate) {
 	case 0:
 		/*
 		 * SPCR 1.04 defines 0 as a preconfigured state of UART.

From 5b87014e999903b63c6416183446bcc18aabe5ef Mon Sep 17 00:00:00 2001
From: "Mario Limonciello (AMD)"
Date: Mon, 15 Sep 2025 15:43:16 -0500
Subject: [PATCH 2503/3206] x86/acpi/cstate: Remove open coded check for
 cpu_feature_enabled()

acpi_processor_power_init_bm_check() has an open coded implementation of
cpu_feature_enabled() to detect X86_FEATURE_ZEN.

Switch to using cpu_feature_enabled().

Signed-off-by: Mario Limonciello (AMD)
Signed-off-by: Rafael J. Wysocki
---
 arch/x86/kernel/acpi/cstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 8698d66563ed64..0281703da5e262 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -89,7 +89,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
 		 */
 		flags->bm_control = 0;
 	}
-	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) {
+	if (cpu_feature_enabled(X86_FEATURE_ZEN)) {
 		/*
 		 * For all AMD Zen or newer CPUs that support C3, caches
 		 * should not be flushed by software while entering C3

From 6e6c88d85623dc0c5c3faf185c12bd723efde5ee Mon Sep 17 00:00:00 2001
From: Jacob Keller
Date: Thu, 18 Sep 2025 17:33:16 -0700
Subject: [PATCH 2504/3206] broadcom: fix support for PTP_PEROUT_DUTY_CYCLE

The bcm_ptp_perout_locked() function has support for handling
PTP_PEROUT_DUTY_CYCLE, but it's not listed in the supported_perout_flags.
Attempts to use the duty cycle support will be rejected since commit
d9f3e9ecc456 ("net: ptp: introduce .supported_perout_flags to
ptp_clock_info"), as this flag was accidentally missed while doing the
conversion.

Drop the unnecessary supported flags check from the
bcm_ptp_perout_locked() function and correctly set the
supported_perout_flags. This fixes use of the PTP_PEROUT_DUTY_CYCLE
support for the Broadcom driver.
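For reference, a userspace sketch of the kind of request this
re-enables: a 1 s periodic output that stays high for 250 ms of each
period on channel 0 (opening the /dev/ptpN fd is omitted):

#include <linux/ptp_clock.h>
#include <string.h>
#include <sys/ioctl.h>

static int request_duty_cycle(int ptp_fd)
{
	struct ptp_perout_request req;

	memset(&req, 0, sizeof(req));
	req.index = 0;			/* periodic output channel */
	req.period.sec = 1;		/* 1 s period */
	req.flags = PTP_PEROUT_DUTY_CYCLE;
	req.on.sec = 0;
	req.on.nsec = 250000000;	/* 250 ms "on" time per period */

	return ioctl(ptp_fd, PTP_PEROUT_REQUEST2, &req);
}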
Reported-by: James Clark Fixes: d9f3e9ecc456 ("net: ptp: introduce .supported_perout_flags to ptp_clock_info") Signed-off-by: Jacob Keller Reviewed-by: Vadim Fedorenko Acked-by: Richard Cochran Reviewed-by: Kory Maincent Tested-by: James Clark Link: https://patch.msgid.link/20250918-jk-fix-bcm-phy-supported-flags-v1-1-747b60407c9c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/bcm-phy-ptp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/phy/bcm-phy-ptp.c b/drivers/net/phy/bcm-phy-ptp.c index eba8b5fb1365f4..1cf695ac73cc57 100644 --- a/drivers/net/phy/bcm-phy-ptp.c +++ b/drivers/net/phy/bcm-phy-ptp.c @@ -597,10 +597,6 @@ static int bcm_ptp_perout_locked(struct bcm_ptp_private *priv, period = BCM_MAX_PERIOD_8NS; /* write nonzero value */ - /* Reject unsupported flags */ - if (req->flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - if (req->flags & PTP_PEROUT_DUTY_CYCLE) pulse = ktime_to_ns(ktime_set(req->on.sec, req->on.nsec)); else @@ -741,6 +737,7 @@ static const struct ptp_clock_info bcm_ptp_clock_info = { .n_pins = 1, .n_per_out = 1, .n_ext_ts = 1, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE, }; static void bcm_ptp_txtstamp(struct mii_timestamper *mii_ts, From 3200fdd4021de1d182fa3b6db5ad936d519f3848 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 18 Sep 2025 17:33:17 -0700 Subject: [PATCH 2505/3206] broadcom: fix support for PTP_EXTTS_REQUEST2 ioctl Commit 7c571ac57d9d ("net: ptp: introduce .supported_extts_flags to ptp_clock_info") modified the PTP core kernel logic to validate the supported flags for the PTP_EXTTS_REQUEST ioctls, rather than relying on each individual driver correctly checking its flags. The bcm_ptp_enable() function implements support for PTP_CLK_REQ_EXTTS, but does not check the flags, and does not forward the request structure into bcm_ptp_extts_locked(). When originally converting the bcm-phy-ptp.c code, it was unclear what edges the hardware actually timestamped. Thus, no flags were initialized in the .supported_extts_flags field. This results in the kernel automatically rejecting all userspace requests for the PTP_EXTTS_REQUEST2 ioctl. This occurs because PTP_STRICT_FLAGS is always assumed when operating under PTP_EXTTS_REQUEST2. This has been the case since the flag's introduction by commit 6138e687c7b6 ("ptp: Introduce strict checking of external time stamp options."). The bcm-phy-ptp.c logic never properly supported strict flag validation, as it previously ignored all flags including both PTP_STRICT_FLAGS and the PTP_FALLING_EDGE and PTP_RISING_EDGE flags. Reports from users in the field prove that the hardware timestamps the rising edge. Encode this in the .supported_extts_flags field. This re-enables support for the PTP_EXTTS_REQUEST2 ioctl.
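For reference, the userspace request that this re-enables looks roughly like the following sketch (fd open on the PHC device assumed). PTP_EXTTS_REQUEST2 implies PTP_STRICT_FLAGS, so the kernel checks the requested flags against .supported_extts_flags:

    struct ptp_extts_request req = {
        .index = 0,
        .flags = PTP_ENABLE_FEATURE | PTP_RISING_EDGE,
    };

    /* Accepted once rising-edge support is advertised; previously the
     * core rejected every REQUEST2 with -EOPNOTSUPP. */
    ioctl(fd, PTP_EXTTS_REQUEST2, &req);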
Reported-by: James Clark Fixes: 7c571ac57d9d ("net: ptp: introduce .supported_extts_flags to ptp_clock_info") Signed-off-by: Jacob Keller Reviewed-by: Vadim Fedorenko Acked-by: Richard Cochran Reviewed-by: Kory Maincent Tested-by: James Clark Link: https://patch.msgid.link/20250918-jk-fix-bcm-phy-supported-flags-v1-2-747b60407c9c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/bcm-phy-ptp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/bcm-phy-ptp.c b/drivers/net/phy/bcm-phy-ptp.c index 1cf695ac73cc57..d3501f8487d964 100644 --- a/drivers/net/phy/bcm-phy-ptp.c +++ b/drivers/net/phy/bcm-phy-ptp.c @@ -738,6 +738,7 @@ static const struct ptp_clock_info bcm_ptp_clock_info = { .n_per_out = 1, .n_ext_ts = 1, .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE, + .supported_extts_flags = PTP_STRICT_FLAGS | PTP_RISING_EDGE, }; static void bcm_ptp_txtstamp(struct mii_timestamper *mii_ts, From cd875625b475dc4e28ac302ccb3422cc9f678f89 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 18 Sep 2025 17:33:18 -0700 Subject: [PATCH 2506/3206] ptp: document behavior of PTP_STRICT_FLAGS Commit 6138e687c7b6 ("ptp: Introduce strict checking of external time stamp options.") added PTP_STRICT_FLAGS to the set of flags supported for the external timestamp request ioctl. It is only supported by PTP_EXTTS_REQUEST2, as it was introduced together with the new ioctls. Further, the kernel has always set this flag for PTP_EXTTS_REQUEST2 regardless of whether or not the user requested the behavior. This effectively means that the flag is not useful for userspace. If the user issues a PTP_EXTTS_REQUEST ioctl, the flag is ignored due to not being supported on the old ioctl. If the user issues a PTP_EXTTS_REQUEST2 ioctl, the flag will be set by the kernel regardless of whether the user set the flag in their structure. Add a comment documenting this behavior in the uAPI header file. Signed-off-by: Jacob Keller Reviewed-by: Vadim Fedorenko Acked-by: Richard Cochran Reviewed-by: Kory Maincent Tested-by: James Clark Link: https://patch.msgid.link/20250918-jk-fix-bcm-phy-supported-flags-v1-3-747b60407c9c@intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ptp_clock.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 18eefa6d93d62f..2c3346e91dbe59 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -37,6 +37,9 @@ /* * flag fields valid for the new PTP_EXTTS_REQUEST2 ioctl. + * + * Note: PTP_STRICT_FLAGS is always enabled by the kernel for + * PTP_EXTTS_REQUEST2 regardless of whether it is set by userspace. */ #define PTP_EXTTS_VALID_FLAGS (PTP_ENABLE_FEATURE | \ PTP_RISING_EDGE | \ From 9578c3906c7d9a6f7c1ffefc73a4e8fc3452f336 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 13 Aug 2025 11:45:05 -0400 Subject: [PATCH 2507/3206] rust: macros: reduce collections in `quote!` macro Remove a handful of unnecessary intermediate vectors and token streams; mainly the top-level stream can be directly extended, with the notable exception of groups. Remove an unnecessary `#[allow(dead_code)]` added in commit dbd5058ba60c ("rust: make pin-init its own crate").
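To make the shape of the change concrete, here is an editor's sketch (in the style of the macro internals, not literal patch content) of how a `#ident` interpolation is emitted before and after:

    // Before: collect into a scratch stream, then merge it.
    let mut ts = ::proc_macro::TokenStream::new();
    $crate::quote::ToTokens::to_tokens(&$id, &mut ts);
    $v.extend(ts);

    // After: write tokens directly into the output stream.
    $crate::quote::ToTokens::to_tokens(&$id, &mut $v);

Groups remain the exception because ::proc_macro::Group::new() takes the delimited TokenStream by value, so their inner tokens still have to be assembled in a separate stream first.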
Acked-by: Greg Kroah-Hartman Reviewed-by: Alice Ryhl Acked-by: Danilo Krummrich Signed-off-by: Tamir Duberstein Signed-off-by: Miguel Ojeda --- rust/macros/quote.rs | 104 ++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 55 deletions(-) diff --git a/rust/macros/quote.rs b/rust/macros/quote.rs index 92cacc4067c9ac..acc140c186538d 100644 --- a/rust/macros/quote.rs +++ b/rust/macros/quote.rs @@ -2,7 +2,6 @@ use proc_macro::{TokenStream, TokenTree}; -#[allow(dead_code)] pub(crate) trait ToTokens { fn to_tokens(&self, tokens: &mut TokenStream); } @@ -47,121 +46,116 @@ impl ToTokens for TokenStream { /// `quote` crate but provides only just enough functionality needed by the current `macros` crate. macro_rules! quote_spanned { ($span:expr => $($tt:tt)*) => {{ - let mut tokens: ::std::vec::Vec<::proc_macro::TokenTree>; - #[allow(clippy::vec_init_then_push)] + let mut tokens = ::proc_macro::TokenStream::new(); { - tokens = ::std::vec::Vec::new(); let span = $span; quote_spanned!(@proc tokens span $($tt)*); } - ::proc_macro::TokenStream::from_iter(tokens) + tokens }}; (@proc $v:ident $span:ident) => {}; (@proc $v:ident $span:ident #$id:ident $($tt:tt)*) => { - let mut ts = ::proc_macro::TokenStream::new(); - $crate::quote::ToTokens::to_tokens(&$id, &mut ts); - $v.extend(ts); + $crate::quote::ToTokens::to_tokens(&$id, &mut $v); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident #(#$id:ident)* $($tt:tt)*) => { for token in $id { - let mut ts = ::proc_macro::TokenStream::new(); - $crate::quote::ToTokens::to_tokens(&token, &mut ts); - $v.extend(ts); + $crate::quote::ToTokens::to_tokens(&token, &mut $v); } quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident ( $($inner:tt)* ) $($tt:tt)*) => { #[allow(unused_mut)] - let mut tokens = ::std::vec::Vec::<::proc_macro::TokenTree>::new(); + let mut tokens = ::proc_macro::TokenStream::new(); quote_spanned!(@proc tokens $span $($inner)*); - $v.push(::proc_macro::TokenTree::Group(::proc_macro::Group::new( + $v.extend([::proc_macro::TokenTree::Group(::proc_macro::Group::new( ::proc_macro::Delimiter::Parenthesis, - ::proc_macro::TokenStream::from_iter(tokens) - ))); + tokens, + ))]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident [ $($inner:tt)* ] $($tt:tt)*) => { - let mut tokens = ::std::vec::Vec::new(); + let mut tokens = ::proc_macro::TokenStream::new(); quote_spanned!(@proc tokens $span $($inner)*); - $v.push(::proc_macro::TokenTree::Group(::proc_macro::Group::new( + $v.extend([::proc_macro::TokenTree::Group(::proc_macro::Group::new( ::proc_macro::Delimiter::Bracket, - ::proc_macro::TokenStream::from_iter(tokens) - ))); + tokens, + ))]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident { $($inner:tt)* } $($tt:tt)*) => { - let mut tokens = ::std::vec::Vec::new(); + let mut tokens = ::proc_macro::TokenStream::new(); quote_spanned!(@proc tokens $span $($inner)*); - $v.push(::proc_macro::TokenTree::Group(::proc_macro::Group::new( + $v.extend([::proc_macro::TokenTree::Group(::proc_macro::Group::new( ::proc_macro::Delimiter::Brace, - ::proc_macro::TokenStream::from_iter(tokens) - ))); + tokens, + ))]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident :: $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new(':', ::proc_macro::Spacing::Joint) - )); - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new(':', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::Spacing::Joint, 
::proc_macro::Spacing::Alone].map(|spacing| { + ::proc_macro::TokenTree::Punct(::proc_macro::Punct::new(':', spacing)) + })); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident : $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new(':', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new(':', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident , $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new(',', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new(',', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident @ $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new('@', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new('@', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident ! $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new('!', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new('!', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident ; $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new(';', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new(';', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident + $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new('+', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new('+', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident = $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new('=', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new('=', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident # $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new('#', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new('#', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident _ $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Ident(::proc_macro::Ident::new("_", $span))); + $v.extend([::proc_macro::TokenTree::Ident( + ::proc_macro::Ident::new("_", $span), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident $id:ident $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Ident(::proc_macro::Ident::new(stringify!($id), $span))); + $v.extend([::proc_macro::TokenTree::Ident( + ::proc_macro::Ident::new(stringify!($id), $span), + )]); quote_spanned!(@proc $v $span $($tt)*); }; } From 2066f00e5b2dc061fb6d8c88fadaebc97f11feaa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 21 Sep 2025 10:56:40 +0200 Subject: [PATCH 2508/3206] x86/topology: Implement topology_is_core_online() to address SMT regression Christian reported that commit a430c11f4015 ("intel_idle: Rescan "dead" 
SMT siblings during initialization") broke the use case in which both 'nosmt' and 'maxcpus' are on the kernel command line because it onlines primary threads, which were offline due to the maxcpus limit. The initially proposed fix to skip primary threads in the loop is inconsistent. While it prevents the primary thread from being onlined, it then onlines the corresponding hyperthread(s), which does not really make sense. The CPU iterator in cpuhp_smt_enable() contains a check which excludes all threads of a core when the primary thread is offline. The default implementation is a NOOP and therefore not effective on x86. Implement topology_is_core_online() on x86 to address this issue. This makes the behaviour consistent between x86 and PowerPC. Fixes: a430c11f4015 ("intel_idle: Rescan "dead" SMT siblings during initialization") Fixes: f694481b1d31 ("ACPI: processor: Rescan "dead" SMT siblings during initialization") Closes: https://lore.kernel.org/linux-pm/724616a2-6374-4ba3-8ce3-ea9c45e2ae3b@arm.com/ Reported-by: Christian Loehle Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Rafael J. Wysocki (Intel) Tested-by: Christian Loehle Cc: stable@vger.kernel.org Link: https://lore.kernel.org/12740505.O9o76ZdvQC@rafael.j.wysocki --- arch/x86/include/asm/topology.h | 10 ++++++++++ arch/x86/kernel/cpu/topology.c | 13 +++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6c79ee7c0957a7..21041898157a1f 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -231,6 +231,16 @@ static inline bool topology_is_primary_thread(unsigned int cpu) } #define topology_is_primary_thread topology_is_primary_thread +int topology_get_primary_thread(unsigned int cpu); + +static inline bool topology_is_core_online(unsigned int cpu) +{ + int pcpu = topology_get_primary_thread(cpu); + + return pcpu >= 0 ? cpu_online(pcpu) : false; +} +#define topology_is_core_online topology_is_core_online + #else /* CONFIG_SMP */ static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } static inline int topology_max_smt_threads(void) { return 1; } diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index e35ccdc84910f5..6073a16628f9e4 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -372,6 +372,19 @@ unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_uni return topo_unit_count(lvlid, at_level, apic_maps[which_units].map); } +#ifdef CONFIG_SMP +int topology_get_primary_thread(unsigned int cpu) +{ + u32 apic_id = cpuid_to_apicid[cpu]; + + /* + * Get the core domain level APIC id, which is the primary thread + * and return the CPU number assigned to it. + */ + return topo_lookup_cpuid(topo_apicid(apic_id, TOPO_CORE_DOMAIN)); +} +#endif + #ifdef CONFIG_ACPI_HOTPLUG_CPU /** * topology_hotplug_apic - Handle a physical hotplugged APIC after boot From ea60cea07d8c632e8e593bb9de804abb0f6aee99 Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Mon, 8 Sep 2025 22:25:54 +0900 Subject: [PATCH 2509/3206] rust: add `Alignment` type Alignment operations are very common in the kernel. Since they are always performed using a power-of-two value, enforcing this invariant through a dedicated type leads to fewer bugs and can improve the generated code.
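As a quick usage sketch of the interface introduced below (mirroring the doctests in the new rust/kernel/ptr.rs):

    use kernel::ptr::{Alignable, Alignment};

    let page = Alignment::new::<4096>();
    assert_eq!(0x1234usize.align_down(page), 0x1000);
    assert_eq!(0x1234usize.align_up(page), Some(0x2000));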
Introduce the `Alignment` type, inspired by the nightly Rust type of the same name and providing the same interface, and a new `Alignable` trait allowing unsigned integers to be aligned up or down. Reviewed-by: Alice Ryhl Reviewed-by: Danilo Krummrich Signed-off-by: Alexandre Courbot [ Used `build_assert!`, added intra-doc link, `allow`ed `clippy::incompatible_msrv`, added `feature(const_option)`, capitalized safety comment. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/kernel/lib.rs | 3 + rust/kernel/ptr.rs | 228 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+) create mode 100644 rust/kernel/ptr.rs diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 733f4d4e9baeb0..f910a5ab80ba50 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -17,6 +17,7 @@ // the unstable features in use. // // Stable since Rust 1.79.0. +#![feature(generic_nonzero)] #![feature(inline_const)] // // Stable since Rust 1.81.0. @@ -28,6 +29,7 @@ // Stable since Rust 1.83.0. #![feature(const_maybe_uninit_as_mut_ptr)] #![feature(const_mut_refs)] +#![feature(const_option)] #![feature(const_ptr_write)] #![feature(const_refs_to_cell)] // @@ -110,6 +112,7 @@ pub mod pid_namespace; pub mod platform; pub mod prelude; pub mod print; +pub mod ptr; pub mod rbtree; pub mod regulator; pub mod revocable; diff --git a/rust/kernel/ptr.rs b/rust/kernel/ptr.rs new file mode 100644 index 00000000000000..2e5e2a090480aa --- /dev/null +++ b/rust/kernel/ptr.rs @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Types and functions to work with pointers and addresses. + +use core::fmt::Debug; +use core::mem::align_of; +use core::num::NonZero; + +use crate::build_assert; + +/// Type representing an alignment, which is always a power of two. +/// +/// It is used to validate that a given value is a valid alignment, and to perform masking and +/// alignment operations. +/// +/// This is a temporary substitute for the [`Alignment`] nightly type from the standard library, +/// and to be eventually replaced by it. +/// +/// [`Alignment`]: https://github.com/rust-lang/rust/issues/102070 +/// +/// # Invariants +/// +/// An alignment is always a power of two. +#[repr(transparent)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Alignment(NonZero<usize>); + +impl Alignment { + /// Validates that `ALIGN` is a power of two at build-time, and returns an [`Alignment`] of the + /// same value. + /// + /// A build error is triggered if `ALIGN` is not a power of two. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// let v = Alignment::new::<16>(); + /// assert_eq!(v.as_usize(), 16); + /// ``` + #[inline(always)] + pub const fn new<const ALIGN: usize>() -> Self { + build_assert!( + ALIGN.is_power_of_two(), + "Provided alignment is not a power of two." + ); + + // INVARIANT: `align` is a power of two. + // SAFETY: `align` is a power of two, and thus non-zero. + Self(unsafe { NonZero::new_unchecked(ALIGN) }) + } + + /// Validates that `align` is a power of two at runtime, and returns an + /// [`Alignment`] of the same value. + /// + /// Returns [`None`] if `align` is not a power of two.
+ /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// assert_eq!(Alignment::new_checked(16), Some(Alignment::new::<16>())); + /// assert_eq!(Alignment::new_checked(15), None); + /// assert_eq!(Alignment::new_checked(1), Some(Alignment::new::<1>())); + /// assert_eq!(Alignment::new_checked(0), None); + /// ``` + #[inline(always)] + pub const fn new_checked(align: usize) -> Option<Self> { + if align.is_power_of_two() { + // INVARIANT: `align` is a power of two. + // SAFETY: `align` is a power of two, and thus non-zero. + Some(Self(unsafe { NonZero::new_unchecked(align) })) + } else { + None + } + } + + /// Returns the alignment of `T`. + /// + /// This is equivalent to [`align_of`], but with the return value provided as an [`Alignment`]. + #[inline(always)] + pub const fn of<T>() -> Self { + #![allow(clippy::incompatible_msrv)] + // This cannot panic since alignments are always powers of two. + // + // We unfortunately cannot use `new` as it would require the `generic_const_exprs` feature. + const { Alignment::new_checked(align_of::<T>()).unwrap() } + } + + /// Returns this alignment as a [`usize`]. + /// + /// It is guaranteed to be a power of two. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// assert_eq!(Alignment::new::<16>().as_usize(), 16); + /// ``` + #[inline(always)] + pub const fn as_usize(self) -> usize { + self.as_nonzero().get() + } + + /// Returns this alignment as a [`NonZero<usize>`]. + /// + /// It is guaranteed to be a power of two. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// assert_eq!(Alignment::new::<16>().as_nonzero().get(), 16); + /// ``` + #[inline(always)] + pub const fn as_nonzero(self) -> NonZero<usize> { + // Allow the compiler to know that the value is indeed a power of two. This can help + // optimize some operations down the line, like e.g. replacing divisions by bit shifts. + if !self.0.is_power_of_two() { + // SAFETY: Per the invariants, `self.0` is always a power of two so this block will + // never be reached. + unsafe { core::hint::unreachable_unchecked() } + } + self.0 + } + + /// Returns the base-2 logarithm of the alignment. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// assert_eq!(Alignment::of::<u8>().log2(), 0); + /// assert_eq!(Alignment::new::<16>().log2(), 4); + /// ``` + #[inline(always)] + pub const fn log2(self) -> u32 { + self.0.ilog2() + } + + /// Returns the mask for this alignment. + /// + /// This is equivalent to `!(self.as_usize() - 1)`. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// assert_eq!(Alignment::new::<0x10>().mask(), !0xf); + /// ``` + #[inline(always)] + pub const fn mask(self) -> usize { + // No underflow can occur as the alignment is guaranteed to be a power of two, and thus is + // non-zero. + !(self.as_usize() - 1) + } +} + +/// Trait for items that can be aligned against an [`Alignment`]. +pub trait Alignable: Sized { + /// Aligns `self` down to `alignment`. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::{Alignable, Alignment}; + /// + /// assert_eq!(0x2f_usize.align_down(Alignment::new::<0x10>()), 0x20); + /// assert_eq!(0x30usize.align_down(Alignment::new::<0x10>()), 0x30); + /// assert_eq!(0xf0u8.align_down(Alignment::new::<0x1000>()), 0x0); + /// ``` + fn align_down(self, alignment: Alignment) -> Self; + + /// Aligns `self` up to `alignment`, returning `None` if aligning would result in an overflow.
+ /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::{Alignable, Alignment}; + /// + /// assert_eq!(0x4fusize.align_up(Alignment::new::<0x10>()), Some(0x50)); + /// assert_eq!(0x40usize.align_up(Alignment::new::<0x10>()), Some(0x40)); + /// assert_eq!(0x0usize.align_up(Alignment::new::<0x10>()), Some(0x0)); + /// assert_eq!(u8::MAX.align_up(Alignment::new::<0x10>()), None); + /// assert_eq!(0x10u8.align_up(Alignment::new::<0x100>()), None); + /// assert_eq!(0x0u8.align_up(Alignment::new::<0x100>()), Some(0x0)); + /// ``` + fn align_up(self, alignment: Alignment) -> Option<Self>; +} + +/// Implement [`Alignable`] for unsigned integer types. +macro_rules! impl_alignable_uint { + ($($t:ty),*) => { + $( + impl Alignable for $t { + #[inline(always)] + fn align_down(self, alignment: Alignment) -> Self { + // The operands of `&` need to be of the same type so convert the alignment to + // `Self`. This means we need to compute the mask ourselves. + ::core::num::NonZero::<Self>::try_from(alignment.as_nonzero()) + .map(|align| self & !(align.get() - 1)) + // An alignment larger than `Self` always aligns down to `0`. + .unwrap_or(0) + } + + #[inline(always)] + fn align_up(self, alignment: Alignment) -> Option<Self> { + let aligned_down = self.align_down(alignment); + if self == aligned_down { + Some(aligned_down) + } else { + Self::try_from(alignment.as_usize()) + .ok() + .and_then(|align| aligned_down.checked_add(align)) + } + } + } + )* + }; +} + +impl_alignable_uint!(u8, u16, u32, u64, usize); From f3f6b3664302e16ef1c6b91034a72df5564d6b8a Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Mon, 8 Sep 2025 22:25:55 +0900 Subject: [PATCH 2510/3206] gpu: nova-core: use Alignment for alignment-related operations Make use of the newly-available `Alignment` type and remove the corresponding TODO item. Signed-off-by: Alexandre Courbot Reviewed-by: Danilo Krummrich Acked-by: Alexandre Courbot Acked-by: Danilo Krummrich Signed-off-by: Miguel Ojeda --- Documentation/gpu/nova/core/todo.rst | 1 - drivers/gpu/nova-core/fb.rs | 6 +++--- drivers/gpu/nova-core/vbios.rs | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Documentation/gpu/nova/core/todo.rst b/Documentation/gpu/nova/core/todo.rst index 894a1e9c3741a4..8fdb5bced3460a 100644 --- a/Documentation/gpu/nova/core/todo.rst +++ b/Documentation/gpu/nova/core/todo.rst @@ -147,7 +147,6 @@ Numerical operations [NUMM] Nova uses integer operations that are not part of the standard library (or not implemented in an optimized way for the kernel). These include: -- Aligning up and down to a power of two, - The "Find Last Set Bit" (`fls` function of the C part of the kernel) operation. diff --git a/drivers/gpu/nova-core/fb.rs b/drivers/gpu/nova-core/fb.rs index 4a702525fff4f3..e4dc74f2f90a7b 100644 --- a/drivers/gpu/nova-core/fb.rs +++ b/drivers/gpu/nova-core/fb.rs @@ -3,6 +3,7 @@ use core::ops::Range; use kernel::prelude::*; +use kernel::ptr::{Alignable, Alignment}; use kernel::sizes::*; use kernel::types::ARef; use kernel::{dev_warn, device}; @@ -130,10 +131,9 @@ impl FbLayout { }; let frts = { - const FRTS_DOWN_ALIGN: u64 = SZ_128K as u64; + const FRTS_DOWN_ALIGN: Alignment = Alignment::new::<{ SZ_128K }>(); const FRTS_SIZE: u64 = SZ_1M as u64; - // TODO[NUMM]: replace with `align_down` once it lands.
- let frts_base = (vga_workspace.start & !(FRTS_DOWN_ALIGN - 1)) - FRTS_SIZE; + let frts_base = vga_workspace.start.align_down(FRTS_DOWN_ALIGN) - FRTS_SIZE; frts_base..frts_base + FRTS_SIZE }; diff --git a/drivers/gpu/nova-core/vbios.rs b/drivers/gpu/nova-core/vbios.rs index 5b5d9f38cbb3a6..091642d6a5a158 100644 --- a/drivers/gpu/nova-core/vbios.rs +++ b/drivers/gpu/nova-core/vbios.rs @@ -10,6 +10,7 @@ use kernel::device; use kernel::error::Result; use kernel::pci; use kernel::prelude::*; +use kernel::ptr::{Alignable, Alignment}; /// The offset of the VBIOS ROM in the BAR0 space. const ROM_OFFSET: usize = 0x300000; @@ -177,8 +178,7 @@ impl<'a> Iterator for VbiosIterator<'a> { // Advance to next image (aligned to 512 bytes). self.current_offset += image_size; - // TODO[NUMM]: replace with `align_up` once it lands. - self.current_offset = self.current_offset.next_multiple_of(512); + self.current_offset = self.current_offset.align_up(Alignment::new::<512>())?; Some(Ok(full_image)) } From 42520df65bf67189541a425f7d36b0b3e7bd7844 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Fri, 19 Sep 2025 12:12:44 -0700 Subject: [PATCH 2511/3206] hfsplus: fix slab-out-of-bounds read in hfsplus_strcasecmp() The hfsplus_strcasecmp() logic can trigger the issue: [ 117.317703][ T9855] ================================================================== [ 117.318353][ T9855] BUG: KASAN: slab-out-of-bounds in hfsplus_strcasecmp+0x1bc/0x490 [ 117.318991][ T9855] Read of size 2 at addr ffff88802160f40c by task repro/9855 [ 117.319577][ T9855] [ 117.319773][ T9855] CPU: 0 UID: 0 PID: 9855 Comm: repro Not tainted 6.17.0-rc6 #33 PREEMPT(full) [ 117.319780][ T9855] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 117.319783][ T9855] Call Trace: [ 117.319785][ T9855] [ 117.319788][ T9855] dump_stack_lvl+0x1c1/0x2a0 [ 117.319795][ T9855] ? __virt_addr_valid+0x1c8/0x5c0 [ 117.319803][ T9855] ? __pfx_dump_stack_lvl+0x10/0x10 [ 117.319808][ T9855] ? rcu_is_watching+0x15/0xb0 [ 117.319816][ T9855] ? lock_release+0x4b/0x3e0 [ 117.319821][ T9855] ? __kasan_check_byte+0x12/0x40 [ 117.319828][ T9855] ? __virt_addr_valid+0x1c8/0x5c0 [ 117.319835][ T9855] ? __virt_addr_valid+0x4a5/0x5c0 [ 117.319842][ T9855] print_report+0x17e/0x7e0 [ 117.319848][ T9855] ? __virt_addr_valid+0x1c8/0x5c0 [ 117.319855][ T9855] ? __virt_addr_valid+0x4a5/0x5c0 [ 117.319862][ T9855] ? __phys_addr+0xd3/0x180 [ 117.319869][ T9855] ? hfsplus_strcasecmp+0x1bc/0x490 [ 117.319876][ T9855] kasan_report+0x147/0x180 [ 117.319882][ T9855] ? hfsplus_strcasecmp+0x1bc/0x490 [ 117.319891][ T9855] hfsplus_strcasecmp+0x1bc/0x490 [ 117.319900][ T9855] ? __pfx_hfsplus_cat_case_cmp_key+0x10/0x10 [ 117.319906][ T9855] hfs_find_rec_by_key+0xa9/0x1e0 [ 117.319913][ T9855] __hfsplus_brec_find+0x18e/0x470 [ 117.319920][ T9855] ? __pfx_hfsplus_bnode_find+0x10/0x10 [ 117.319926][ T9855] ? __pfx_hfs_find_rec_by_key+0x10/0x10 [ 117.319933][ T9855] ? __pfx___hfsplus_brec_find+0x10/0x10 [ 117.319942][ T9855] hfsplus_brec_find+0x28f/0x510 [ 117.319949][ T9855] ? __pfx_hfs_find_rec_by_key+0x10/0x10 [ 117.319956][ T9855] ? __pfx_hfsplus_brec_find+0x10/0x10 [ 117.319963][ T9855] ? __kmalloc_noprof+0x2a9/0x510 [ 117.319969][ T9855] ? hfsplus_find_init+0x8c/0x1d0 [ 117.319976][ T9855] hfsplus_brec_read+0x2b/0x120 [ 117.319983][ T9855] hfsplus_lookup+0x2aa/0x890 [ 117.319990][ T9855] ? __pfx_hfsplus_lookup+0x10/0x10 [ 117.320003][ T9855] ? d_alloc_parallel+0x2f0/0x15e0 [ 117.320008][ T9855] ? 
__lock_acquire+0xaec/0xd80 [ 117.320013][ T9855] ? __pfx_d_alloc_parallel+0x10/0x10 [ 117.320019][ T9855] ? __raw_spin_lock_init+0x45/0x100 [ 117.320026][ T9855] ? __init_waitqueue_head+0xa9/0x150 [ 117.320034][ T9855] __lookup_slow+0x297/0x3d0 [ 117.320039][ T9855] ? __pfx___lookup_slow+0x10/0x10 [ 117.320045][ T9855] ? down_read+0x1ad/0x2e0 [ 117.320055][ T9855] lookup_slow+0x53/0x70 [ 117.320065][ T9855] walk_component+0x2f0/0x430 [ 117.320073][ T9855] path_lookupat+0x169/0x440 [ 117.320081][ T9855] filename_lookup+0x212/0x590 [ 117.320089][ T9855] ? __pfx_filename_lookup+0x10/0x10 [ 117.320098][ T9855] ? strncpy_from_user+0x150/0x290 [ 117.320105][ T9855] ? getname_flags+0x1e5/0x540 [ 117.320112][ T9855] user_path_at+0x3a/0x60 [ 117.320117][ T9855] __x64_sys_umount+0xee/0x160 [ 117.320123][ T9855] ? __pfx___x64_sys_umount+0x10/0x10 [ 117.320129][ T9855] ? do_syscall_64+0xb7/0x3a0 [ 117.320135][ T9855] ? entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 117.320141][ T9855] ? entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 117.320145][ T9855] do_syscall_64+0xf3/0x3a0 [ 117.320150][ T9855] ? exc_page_fault+0x9f/0xf0 [ 117.320154][ T9855] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 117.320158][ T9855] RIP: 0033:0x7f7dd7908b07 [ 117.320163][ T9855] Code: 23 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 31 f6 e9 09 00 00 00 66 0f 1f 84 00 00 08 [ 117.320167][ T9855] RSP: 002b:00007ffd5ebd9698 EFLAGS: 00000202 ORIG_RAX: 00000000000000a6 [ 117.320172][ T9855] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f7dd7908b07 [ 117.320176][ T9855] RDX: 0000000000000009 RSI: 0000000000000009 RDI: 00007ffd5ebd9740 [ 117.320179][ T9855] RBP: 00007ffd5ebda780 R08: 0000000000000005 R09: 00007ffd5ebd9530 [ 117.320181][ T9855] R10: 00007f7dd799bfc0 R11: 0000000000000202 R12: 000055e2008b32d0 [ 117.320184][ T9855] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 117.320189][ T9855] [ 117.320190][ T9855] [ 117.351311][ T9855] Allocated by task 9855: [ 117.351683][ T9855] kasan_save_track+0x3e/0x80 [ 117.352093][ T9855] __kasan_kmalloc+0x8d/0xa0 [ 117.352490][ T9855] __kmalloc_noprof+0x288/0x510 [ 117.352914][ T9855] hfsplus_find_init+0x8c/0x1d0 [ 117.353342][ T9855] hfsplus_lookup+0x19c/0x890 [ 117.353747][ T9855] __lookup_slow+0x297/0x3d0 [ 117.354148][ T9855] lookup_slow+0x53/0x70 [ 117.354514][ T9855] walk_component+0x2f0/0x430 [ 117.354921][ T9855] path_lookupat+0x169/0x440 [ 117.355325][ T9855] filename_lookup+0x212/0x590 [ 117.355740][ T9855] user_path_at+0x3a/0x60 [ 117.356115][ T9855] __x64_sys_umount+0xee/0x160 [ 117.356529][ T9855] do_syscall_64+0xf3/0x3a0 [ 117.356920][ T9855] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 117.357429][ T9855] [ 117.357636][ T9855] The buggy address belongs to the object at ffff88802160f000 [ 117.357636][ T9855] which belongs to the cache kmalloc-2k of size 2048 [ 117.358827][ T9855] The buggy address is located 0 bytes to the right of [ 117.358827][ T9855] allocated 1036-byte region [ffff88802160f000, ffff88802160f40c) [ 117.360061][ T9855] [ 117.360266][ T9855] The buggy address belongs to the physical page: [ 117.360813][ T9855] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x21608 [ 117.361562][ T9855] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [ 117.362285][ T9855] flags: 0xfff00000000040(head|node=0|zone=1|lastcpupid=0x7ff) [ 117.362929][ T9855] page_type: f5(slab) [ 117.363282][ T9855] raw: 00fff00000000040 ffff88801a842f00 ffffea0000932000 dead000000000002 [ 117.364015][ T9855] raw: 
0000000000000000 0000000080080008 00000000f5000000 0000000000000000 [ 117.364750][ T9855] head: 00fff00000000040 ffff88801a842f00 ffffea0000932000 dead000000000002 [ 117.365491][ T9855] head: 0000000000000000 0000000080080008 00000000f5000000 0000000000000000 [ 117.366232][ T9855] head: 00fff00000000003 ffffea0000858201 00000000ffffffff 00000000ffffffff [ 117.366968][ T9855] head: ffffffffffffffff 0000000000000000 00000000ffffffff 0000000000000008 [ 117.367711][ T9855] page dumped because: kasan: bad access detected [ 117.368259][ T9855] page_owner tracks the page as allocated [ 117.368745][ T9855] page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN1 [ 117.370541][ T9855] post_alloc_hook+0x240/0x2a0 [ 117.370954][ T9855] get_page_from_freelist+0x2101/0x21e0 [ 117.371435][ T9855] __alloc_frozen_pages_noprof+0x274/0x380 [ 117.371935][ T9855] alloc_pages_mpol+0x241/0x4b0 [ 117.372360][ T9855] allocate_slab+0x8d/0x380 [ 117.372752][ T9855] ___slab_alloc+0xbe3/0x1400 [ 117.373159][ T9855] __kmalloc_cache_noprof+0x296/0x3d0 [ 117.373621][ T9855] nexthop_net_init+0x75/0x100 [ 117.374038][ T9855] ops_init+0x35c/0x5c0 [ 117.374400][ T9855] setup_net+0x10c/0x320 [ 117.374768][ T9855] copy_net_ns+0x31b/0x4d0 [ 117.375156][ T9855] create_new_namespaces+0x3f3/0x720 [ 117.375613][ T9855] unshare_nsproxy_namespaces+0x11c/0x170 [ 117.376094][ T9855] ksys_unshare+0x4ca/0x8d0 [ 117.376477][ T9855] __x64_sys_unshare+0x38/0x50 [ 117.376879][ T9855] do_syscall_64+0xf3/0x3a0 [ 117.377265][ T9855] page last free pid 9110 tgid 9110 stack trace: [ 117.377795][ T9855] __free_frozen_pages+0xbeb/0xd50 [ 117.378229][ T9855] __put_partials+0x152/0x1a0 [ 117.378625][ T9855] put_cpu_partial+0x17c/0x250 [ 117.379026][ T9855] __slab_free+0x2d4/0x3c0 [ 117.379404][ T9855] qlist_free_all+0x97/0x140 [ 117.379790][ T9855] kasan_quarantine_reduce+0x148/0x160 [ 117.380250][ T9855] __kasan_slab_alloc+0x22/0x80 [ 117.380662][ T9855] __kmalloc_noprof+0x232/0x510 [ 117.381074][ T9855] tomoyo_supervisor+0xc0a/0x1360 [ 117.381498][ T9855] tomoyo_env_perm+0x149/0x1e0 [ 117.381903][ T9855] tomoyo_find_next_domain+0x15ad/0x1b90 [ 117.382378][ T9855] tomoyo_bprm_check_security+0x11c/0x180 [ 117.382859][ T9855] security_bprm_check+0x89/0x280 [ 117.383289][ T9855] bprm_execve+0x8f1/0x14a0 [ 117.383673][ T9855] do_execveat_common+0x528/0x6b0 [ 117.384103][ T9855] __x64_sys_execve+0x94/0xb0 [ 117.384500][ T9855] [ 117.384706][ T9855] Memory state around the buggy address: [ 117.385179][ T9855] ffff88802160f300: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 117.385854][ T9855] ffff88802160f380: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 117.386534][ T9855] >ffff88802160f400: 00 04 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 117.387204][ T9855] ^ [ 117.387566][ T9855] ffff88802160f480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 117.388243][ T9855] ffff88802160f500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 117.388918][ T9855] ================================================================== The issue takes place if the length field of struct hfsplus_unistr is bigger than HFSPLUS_MAX_STRLEN. The patch simply checks the length of comparing strings. And if the strings' length is bigger than HFSPLUS_MAX_STRLEN, then it is corrected to this value. v2 The string length correction has been added for hfsplus_strcmp(). 
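For context, the on-disk string record looks roughly like this (per fs/hfsplus/hfsplus_raw.h), which is why an unvalidated length walks off the end of the allocation:

    struct hfsplus_unistr {
        __be16 length;                  /* on-disk value, attacker-controlled */
        hfsplus_unichar unicode[255];   /* HFSPLUS_MAX_STRLEN == 255 */
    } __packed;

A crafted catalog key can claim length > 255, so the comparison loops index unicode[] beyond the kmalloc'ed object, producing the slab-out-of-bounds read shown above.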
Reported-by: Jiaming Zhang Signed-off-by: Viacheslav Dubeyko cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org cc: syzkaller@googlegroups.com Link: https://lore.kernel.org/r/20250919191243.1370388-1-slava@dubeyko.com Signed-off-by: Viacheslav Dubeyko --- fs/hfsplus/unicode.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 862ba27f1628a8..11e08a4a18b295 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -40,6 +40,18 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, p1 = s1->unicode; p2 = s2->unicode; + if (len1 > HFSPLUS_MAX_STRLEN) { + len1 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s1->length), len1); + } + + if (len2 > HFSPLUS_MAX_STRLEN) { + len2 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s2->length), len2); + } + while (1) { c1 = c2 = 0; @@ -74,6 +86,18 @@ int hfsplus_strcmp(const struct hfsplus_unistr *s1, p1 = s1->unicode; p2 = s2->unicode; + if (len1 > HFSPLUS_MAX_STRLEN) { + len1 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s1->length), len1); + } + + if (len2 > HFSPLUS_MAX_STRLEN) { + len2 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s2->length), len2); + } + for (len = min(len1, len2); len > 0; len--) { c1 = be16_to_cpu(*p1); c2 = be16_to_cpu(*p2); From b72af996b67ccb1dea988765bf046cba2d4c6898 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Mon, 22 Sep 2025 13:02:31 +0000 Subject: [PATCH 2512/3206] cpuset: remove redundant special case for null input in node mask update The nodelist_parse function already handles empty nodemask input appropriately, making it unnecessary to handle this case separately during the node mask update process. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 535174ed712682..20dface3c3e0ec 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2847,22 +2847,16 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. + * The validate_change() call ensures that cpusets with tasks have memory. */ - if (!*buf) { - nodes_clear(trialcs->mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs->mems_allowed); - if (retval < 0) - goto done; + retval = nodelist_parse(buf, trialcs->mems_allowed); + if (retval < 0) + goto done; - if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } + if (!nodes_subset(trialcs->mems_allowed, + top_cpuset.mems_allowed)) { + retval = -EINVAL; + goto done; } if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { From 39431592e93530d7f543e3d7204a8f9a062e6ac7 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Mon, 22 Sep 2025 13:02:32 +0000 Subject: [PATCH 2513/3206] cpuset: remove impossible warning in update_parent_effective_cpumask If the parent is not a valid partition, an error will be returned before any partition update command is processed. 
This means the WARN_ON_ONCE(!is_partition_valid(parent)) can never be triggered, so it is safe to remove. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 20dface3c3e0ec..196645b38b248b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1923,7 +1923,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * A partition error happens when parent has tasks and all * its effective CPUs will have to be distributed out. */ - WARN_ON_ONCE(!is_partition_valid(parent)); if (nocpu) { part_error = PERR_NOCPUS; if (is_partition_valid(cs)) From 8f0fdbd4a06bf795c68bc9839d9c349ab592654f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Mon, 22 Sep 2025 13:02:33 +0000 Subject: [PATCH 2514/3206] cpuset: remove is_prs_invalid helper The is_prs_invalid helper function is redundant as it serves a similar purpose to is_partition_invalid. It can be fully replaced by the existing is_partition_invalid function, so this patch removes the is_prs_invalid helper. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 196645b38b248b..52468d2c178a3e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -132,11 +132,6 @@ static bool force_sd_rebuild; #define PRS_INVALID_ROOT -1 #define PRS_INVALID_ISOLATED -2 -static inline bool is_prs_invalid(int prs_state) -{ - return prs_state < 0; -} - /* * Temporary cpumasks for working with partitions that are passed among * functions to avoid memory allocation in inner functions. @@ -1767,7 +1762,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, old_prs = new_prs = cs->partition_root_state; if (cmd == partcmd_invalidate) { - if (is_prs_invalid(old_prs)) + if (is_partition_invalid(cs)) return 0; /* @@ -1874,7 +1869,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * For invalid partition: * delmask = newmask & parent->effective_xcpus */ - if (is_prs_invalid(old_prs)) { + if (is_partition_invalid(cs)) { adding = false; deleting = cpumask_and(tmp->delmask, newmask, parent->effective_xcpus); @@ -2964,7 +2959,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) /* * Treat a previously invalid partition root as if it is a "member". */ - if (new_prs && is_prs_invalid(old_prs)) + if (new_prs && is_partition_invalid(cs)) old_prs = PRS_MEMBER; if (alloc_tmpmasks(&tmpmask)) From 3d3aa9472c6dd0704e9961ed4769caac5b1c8d52 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Sat, 20 Sep 2025 05:11:17 -0700 Subject: [PATCH 2515/3206] bnxt_en: correct offset handling for IPv6 destination address In bnxt_tc_parse_pedit(), the code incorrectly writes IPv6 destination values to the source address field (saddr) when processing pedit offsets within the destination address range. This patch corrects the assignment to use daddr instead of saddr, ensuring that pedit operations on IPv6 destination addresses are applied correctly. 
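As a worked example of the indexing involved (using offsetof(struct ipv6hdr, daddr) == 24): a pedit write at offset 28 must patch the second 32-bit word of the destination address:

    idx = (offset - offset_of_ip6_daddr) / 4;   /* (28 - 24) / 4 == 1 */
    actions->nat.l3.ipv6.daddr.s6_addr32[idx] = htonl(val);

Before this fix the same value was stored into saddr.s6_addr32[idx], silently corrupting the source-address rewrite instead.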
Fixes: 9b9eb518e338 ("bnxt_en: Add support for NAT(L3/L4 rewrite)") Signed-off-by: Alok Tiwari Reviewed-by: Somnath Kotur Link: https://patch.msgid.link/20250920121157.351921-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index d72fd248f3aa92..2d66bf59cd64d6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -244,7 +244,7 @@ bnxt_tc_parse_pedit(struct bnxt *bp, struct bnxt_tc_actions *actions, offset < offset_of_ip6_daddr + 16) { actions->nat.src_xlate = false; idx = (offset - offset_of_ip6_daddr) / 4; - actions->nat.l3.ipv6.saddr.s6_addr32[idx] = htonl(val); + actions->nat.l3.ipv6.daddr.s6_addr32[idx] = htonl(val); } else { netdev_err(bp->dev, "%s: IPv6_hdr: Invalid pedit field\n", From 349271568303695f0ac3563af153d2b4542f6986 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 21 Sep 2025 18:01:16 +0200 Subject: [PATCH 2516/3206] bpf: Implement signature verification for BPF programs This patch extends the BPF_PROG_LOAD command by adding three new fields to `union bpf_attr` in the user-space API: - signature: A pointer to the signature blob. - signature_size: The size of the signature blob. - keyring_id: The serial number of a loaded kernel keyring (e.g., the user or session keyring) containing the trusted public keys. When a BPF program is loaded with a signature, the kernel: 1. Retrieves the trusted keyring using the provided `keyring_id`. 2. Verifies the supplied signature against the BPF program's instruction buffer. 3. If the signature is valid and was generated by a key in the trusted keyring, the program load proceeds. 4. If no signature is provided, the load proceeds as before, allowing for backward compatibility. LSMs can choose to restrict unsigned programs and implement a security policy. 5. If signature verification fails for any reason, the program is not loaded.
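For illustration, a minimal userspace sketch of a signed load using the new fields (raw bpf(2) syscall; the insns/sig buffers and the session-keyring key are assumed to already exist):

    union bpf_attr attr = {};

    attr.prog_type = BPF_PROG_TYPE_KPROBE;
    attr.insns = (__u64)(unsigned long)insns;
    attr.insn_cnt = insn_cnt;
    attr.license = (__u64)(unsigned long)"GPL";
    attr.signature = (__u64)(unsigned long)sig;
    attr.signature_size = sig_len;
    attr.keyring_id = KEY_SPEC_SESSION_KEYRING; /* from <linux/keyctl.h> */

    int fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));

If the PKCS#7 signature does not verify against a key in that keyring, the load fails; with no signature attached, the load proceeds as before.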
Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250921160120.9711-2-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- crypto/asymmetric_keys/pkcs7_verify.c | 1 + include/linux/verification.h | 1 + include/uapi/linux/bpf.h | 10 ++++++ kernel/bpf/helpers.c | 2 +- kernel/bpf/syscall.c | 45 ++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 10 ++++++ tools/lib/bpf/bpf.c | 2 +- 7 files changed, 68 insertions(+), 3 deletions(-) diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index f0d4ff3c20a832..6d6475e3a9bf2b 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -429,6 +429,7 @@ int pkcs7_verify(struct pkcs7_message *pkcs7, /* Authattr presence checked in parser */ break; case VERIFYING_UNSPECIFIED_SIGNATURE: + case VERIFYING_BPF_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid unspecified sig (not pkcs7-data)\n"); return -EKEYREJECTED; diff --git a/include/linux/verification.h b/include/linux/verification.h index 4f3022d081c31b..dec7f2beabfd4b 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -36,6 +36,7 @@ enum key_being_used_for { VERIFYING_KEY_SIGNATURE, VERIFYING_KEY_SELF_SIGNATURE, VERIFYING_UNSPECIFIED_SIGNATURE, + VERIFYING_BPF_SIGNATURE, NR__KEY_BEING_USED_FOR }; #ifdef CONFIG_SYSTEM_DATA_VERIFICATION diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0987b52d564842..f3b173e48b0fa5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1611,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. 
+ */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ef4ede8bb74fd8..969f63f8ca28f7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3898,7 +3898,7 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + VERIFYING_BPF_SIGNATURE, NULL, NULL); #else return -EOPNOTSUPP; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf7173b1bb83dc..8a3c3d26f6e21b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2785,8 +2786,44 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, + bool is_kernel) +{ + bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); + struct bpf_dynptr_kern sig_ptr, insns_ptr; + struct bpf_key *key = NULL; + void *sig; + int err = 0; + + if (system_keyring_id_check(attr->keyring_id) == 0) + key = bpf_lookup_system_key(attr->keyring_id); + else + key = bpf_lookup_user_key(attr->keyring_id, 0); + + if (!key) + return -EINVAL; + + sig = kvmemdup_bpfptr(usig, attr->signature_size); + if (IS_ERR(sig)) { + bpf_key_put(key); + return -ENOMEM; + } + + bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, + attr->signature_size); + bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, + prog->len * sizeof(struct bpf_insn)); + + err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, + (struct bpf_dynptr *)&sig_ptr, key); + + bpf_key_put(key); + kvfree(sig); + return err; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt +#define BPF_PROG_LOAD_LAST_FIELD keyring_id static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { @@ -2950,6 +2987,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; + if (attr->signature) { + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + if (err) + goto free_prog; + } + prog->orig_prog = NULL; prog->jited = 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0987b52d564842..f3b173e48b0fa5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1611,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. 
+ */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 19ad7bcf0c2ff3..339b197972374f 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -240,7 +240,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insn_cnt, struct bpf_prog_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, fd_array_cnt); + const size_t attr_sz = offsetofend(union bpf_attr, keyring_id); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; From fb2b0e290147ba01a53dfd92cf91058c9d2ee254 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 21 Sep 2025 18:01:17 +0200 Subject: [PATCH 2517/3206] libbpf: Update light skeleton for signing * The metadata map is created as an exclusive map (with an excl_prog_hash). This restricts map access exclusively to the signed loader program, preventing tampering by other processes. * The map is then frozen, making it read-only from userspace. * BPF_OBJ_GET_INFO_BY_ID instructs the kernel to compute the hash of the metadata map (H') and store it in bpf_map->sha. * The loader is then loaded with the signature which is then verified by the kernel. Loading signed programs prebuilt into the kernel is not currently supported. This can be supported by enabling BPF_OBJ_GET_INFO_BY_ID to be called from the kernel. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250921160120.9711-3-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/skel_internal.h | 76 +++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index 4d5fa079b5d626..6a8f5c7a02eb97 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -13,10 +13,15 @@ #include #include #include +#include #include #include "bpf.h" #endif +#ifndef SHA256_DIGEST_LENGTH +#define SHA256_DIGEST_LENGTH 32 +#endif + #ifndef __NR_bpf # if defined(__mips__) && defined(_ABIO32) # define __NR_bpf 4355 @@ -64,6 +69,11 @@ struct bpf_load_and_run_opts { __u32 data_sz; __u32 insns_sz; const char *errstr; + void *signature; + __u32 signature_sz; + __s32 keyring_id; + void *excl_prog_hash; + __u32 excl_prog_hash_sz; }; long kern_sys_bpf(__u32 cmd, void *attr, __u32 attr_size); @@ -220,14 +230,19 @@ static inline int skel_map_create(enum bpf_map_type map_type, const char *map_name, __u32 key_size, __u32 value_size, - __u32 max_entries) + __u32 max_entries, + const void *excl_prog_hash, + __u32 excl_prog_hash_sz) { - const size_t attr_sz = offsetofend(union bpf_attr, map_extra); + const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size); union bpf_attr attr; memset(&attr, 0, attr_sz); attr.map_type = map_type; + attr.excl_prog_hash = (unsigned long) excl_prog_hash; + attr.excl_prog_hash_size = excl_prog_hash_sz; + strncpy(attr.map_name, map_name, sizeof(attr.map_name)); attr.key_size = key_size; attr.value_size = value_size; @@ -300,6 +315,35 @@ static inline int skel_link_create(int prog_fd, int target_fd, return skel_sys_bpf(BPF_LINK_CREATE, &attr, attr_sz); } +static inline int skel_obj_get_info_by_fd(int fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, info); + __u8 sha[SHA256_DIGEST_LENGTH]; + struct bpf_map_info info; + __u32 info_len = sizeof(info); + union bpf_attr attr; + + memset(&info, 0, sizeof(info)); + info.hash = (long) &sha; +
info.hash_size = SHA256_DIGEST_LENGTH; + + memset(&attr, 0, attr_sz); + attr.info.bpf_fd = fd; + attr.info.info = (long) &info; + attr.info.info_len = info_len; + return skel_sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, attr_sz); +} + +static inline int skel_map_freeze(int fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_fd); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + + return skel_sys_bpf(BPF_MAP_FREEZE, &attr, attr_sz); +} #ifdef __KERNEL__ #define set_err #else @@ -308,12 +352,13 @@ static inline int skel_link_create(int prog_fd, int target_fd, static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) { - const size_t prog_load_attr_sz = offsetofend(union bpf_attr, fd_array); + const size_t prog_load_attr_sz = offsetofend(union bpf_attr, keyring_id); const size_t test_run_attr_sz = offsetofend(union bpf_attr, test); int map_fd = -1, prog_fd = -1, key = 0, err; union bpf_attr attr; - err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1); + err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1, + opts->excl_prog_hash, opts->excl_prog_hash_sz); if (map_fd < 0) { opts->errstr = "failed to create loader map"; set_err; @@ -327,11 +372,34 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) goto out; } +#ifndef __KERNEL__ + err = skel_map_freeze(map_fd); + if (err < 0) { + opts->errstr = "failed to freeze map"; + set_err; + goto out; + } + err = skel_obj_get_info_by_fd(map_fd); + if (err < 0) { + opts->errstr = "failed to fetch obj info"; + set_err; + goto out; + } +#endif + memset(&attr, 0, prog_load_attr_sz); attr.prog_type = BPF_PROG_TYPE_SYSCALL; attr.insns = (long) opts->insns; attr.insn_cnt = opts->insns_sz / sizeof(struct bpf_insn); attr.license = (long) "Dual BSD/GPL"; +#ifndef __KERNEL__ + attr.signature = (long) opts->signature; + attr.signature_size = opts->signature_sz; +#else + if (opts->signature || opts->signature_sz) + pr_warn("signatures are not supported from bpf_preload\n"); +#endif + attr.keyring_id = opts->keyring_id; memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog")); attr.fd_array = (long) &map_fd; attr.log_level = opts->ctx->log_level; From ea923080c14578504c2e142760d9de547e38e87c Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 21 Sep 2025 18:01:18 +0200 Subject: [PATCH 2518/3206] libbpf: Embed and verify the metadata hash in the loader To fulfill the BPF signing contract, represented as Sig(I_loader || H_meta), the generated trusted loader program must verify the integrity of the metadata. This signature cryptographically binds the loader's instructions (I_loader) to a hash of the metadata (H_meta). The verification process is embedded directly into the loader program. Upon execution, the loader loads the runtime hash from struct bpf_map i.e. BPF_PSEUDO_MAP_IDX and compares this runtime hash against an expected hash value that has been hardcoded directly by bpf_obj__gen_loader. The load from bpf_map can be improved by calling BPF_OBJ_GET_INFO_BY_FD from the kernel context after BPF_OBJ_GET_INFO_BY_FD has been updated for being called from the kernel context. 
The following instructions are generated: ld_imm64 r1, const_ptr_to_map // insn[0].src_reg == BPF_PSEUDO_MAP_IDX r2 = *(u64 *)(r1 + 0); ld_imm64 r3, sha256_of_map_part1 // constant precomputed by bpftool (part of H_meta) if r2 != r3 goto out; r2 = *(u64 *)(r1 + 8); ld_imm64 r3, sha256_of_map_part2 // (part of H_meta) if r2 != r3 goto out; r2 = *(u64 *)(r1 + 16); ld_imm64 r3, sha256_of_map_part3 // (part of H_meta) if r2 != r3 goto out; r2 = *(u64 *)(r1 + 24); ld_imm64 r3, sha256_of_map_part4 // (part of H_meta) if r2 != r3 goto out; ... Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250921160120.9711-4-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf_gen_internal.h | 2 ++ tools/lib/bpf/gen_loader.c | 55 ++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 3 +- 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h index 6ff963a491d972..49af4260b8e6b7 100644 --- a/tools/lib/bpf/bpf_gen_internal.h +++ b/tools/lib/bpf/bpf_gen_internal.h @@ -4,6 +4,7 @@ #define __BPF_GEN_INTERNAL_H #include "bpf.h" +#include "libbpf_internal.h" struct ksym_relo_desc { const char *name; @@ -50,6 +51,7 @@ struct bpf_gen { __u32 nr_ksyms; int fd_array; int nr_fd_array; + int hash_insn_offset[SHA256_DWORD_SIZE]; }; void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps); diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 113ae4abd345f2..376eef292d3a8b 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -110,6 +110,7 @@ static void emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn in static int add_data(struct bpf_gen *gen, const void *data, __u32 size); static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off); +static void emit_signature_match(struct bpf_gen *gen); void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps) { @@ -152,6 +153,8 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps /* R7 contains the error code from sys_bpf. Copy it into R0 and exit. 
*/ emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7)); emit(gen, BPF_EXIT_INSN()); + if (OPTS_GET(gen->opts, gen_hash, false)) + emit_signature_match(gen); } static int add_data(struct bpf_gen *gen, const void *data, __u32 size) @@ -368,6 +371,8 @@ static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off) __emit_sys_close(gen); } +static int compute_sha_udpate_offsets(struct bpf_gen *gen); + int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) { int i; @@ -394,6 +399,12 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) blob_fd_array_off(gen, i)); emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0)); emit(gen, BPF_EXIT_INSN()); + if (OPTS_GET(gen->opts, gen_hash, false)) { + gen->error = compute_sha_udpate_offsets(gen); + if (gen->error) + return gen->error; + } + pr_debug("gen: finish %s\n", errstr(gen->error)); if (!gen->error) { struct gen_loader_opts *opts = gen->opts; @@ -446,6 +457,27 @@ void bpf_gen__free(struct bpf_gen *gen) _val; \ }) +static int compute_sha_udpate_offsets(struct bpf_gen *gen) +{ + __u64 sha[SHA256_DWORD_SIZE]; + __u64 sha_dw; + int i, err; + + err = libbpf_sha256(gen->data_start, gen->data_cur - gen->data_start, sha, SHA256_DIGEST_LENGTH); + if (err < 0) { + pr_warn("sha256 computation of the metadata failed"); + return err; + } + for (i = 0; i < SHA256_DWORD_SIZE; i++) { + struct bpf_insn *insn = + (struct bpf_insn *)(gen->insn_start + gen->hash_insn_offset[i]); + sha_dw = tgt_endian(sha[i]); + insn[0].imm = (__u32)sha_dw; + insn[1].imm = sha_dw >> 32; + } + return 0; +} + void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, __u32 btf_raw_size) { @@ -557,6 +589,29 @@ void bpf_gen__map_create(struct bpf_gen *gen, emit_sys_close_stack(gen, stack_off(inner_map_fd)); } +static void emit_signature_match(struct bpf_gen *gen) +{ + __s64 off; + int i; + + for (i = 0; i < SHA256_DWORD_SIZE; i++) { + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX, + 0, 0, 0, 0)); + emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, i * sizeof(__u64))); + gen->hash_insn_offset[i] = gen->insn_cur - gen->insn_start; + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_3, 0, 0, 0, 0, 0)); + + off = -(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 1; + if (is_simm16(off)) { + emit(gen, BPF_MOV64_IMM(BPF_REG_7, -EINVAL)); + emit(gen, BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, off)); + } else { + gen->error = -ERANGE; + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, -1)); + } + } +} + void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *attach_name, enum bpf_attach_type type) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e978bc093c3914..5118d0a90e243a 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1857,9 +1857,10 @@ struct gen_loader_opts { const char *insns; __u32 data_sz; __u32 insns_sz; + bool gen_hash; }; -#define gen_loader_opts__last_field insns_sz +#define gen_loader_opts__last_field gen_hash LIBBPF_API int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts); From 40863f4d6ef2c34bb00dd1070dfaf9d5f27a497e Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 21 Sep 2025 18:01:19 +0200 Subject: [PATCH 2519/3206] bpftool: Add support for signing BPF programs Add two modes of operation: * For prog load, allow signing a program immediately before loading. This is essential for command-line testing and administration. 
bpftool prog load -S -k -i fentry_test.bpf.o * For gen skeleton, embed a pre-generated signature into the C skeleton file. This supports the use of signed programs in compiled applications. bpftool gen skeleton -S -k -i fentry_test.bpf.o Generation of the loader program and its metadata map is implemented in libbpf (bpf_object__gen_loader). bpftool generates a skeleton that loads the program and automates the required steps: creating an exclusive map, freezing it, loading, and running. Users can use standard libbpf APIs directly or integrate loader program generation into their own toolchains. Signed-off-by: KP Singh Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20250921160120.9711-5-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/bpftool/Documentation/bpftool-gen.rst | 13 +- .../bpftool/Documentation/bpftool-prog.rst | 14 +- tools/bpf/bpftool/Makefile | 6 +- tools/bpf/bpftool/cgroup.c | 4 + tools/bpf/bpftool/gen.c | 68 +++++- tools/bpf/bpftool/main.c | 26 ++- tools/bpf/bpftool/main.h | 11 + tools/bpf/bpftool/prog.c | 29 ++- tools/bpf/bpftool/sign.c | 212 ++++++++++++++++++ 9 files changed, 372 insertions(+), 11 deletions(-) create mode 100644 tools/bpf/bpftool/sign.c diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index ca860fd97d8dfc..d0a36f442db72f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -16,7 +16,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **gen** *COMMAND* -*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } | [ { **-S** | **--sign** } {**-k** } **-i** ] } *COMMAND* := { **object** | **skeleton** | **help** } @@ -186,6 +186,17 @@ OPTIONS skeleton). A light skeleton contains a loader eBPF program. It does not use the majority of the libbpf infrastructure, and does not need libelf. +-S, --sign + For skeletons, generate a signed skeleton. This option must be used with + **-k** and **-i**. Using this flag implicitly enables **--use-loader**. + +-k + Path to the private key file in PEM format, required for signing. + +-i + Path to the X.509 certificate file in PEM or DER format, required for + signing. + EXAMPLES ======== **$ cat example1.bpf.c** diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index f69fd92df8d89d..009633294b0934 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -18,7 +18,7 @@ SYNOPSIS *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | -{ **-L** | **--use-loader** } } +{ **-L** | **--use-loader** } | [ { **-S** | **--sign** } **-k** **-i** ] } *COMMANDS* := { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | @@ -248,6 +248,18 @@ OPTIONS creating the maps, and loading the programs (see **bpftool prog tracelog** as a way to dump those messages). +-S, --sign + Enable signing of the BPF program before loading. This option must be + used with **-k** and **-i**. Using this flag implicitly enables + **--use-loader**. + +-k + Path to the private key file in PEM format, required when signing. + +-i + Path to the X.509 certificate file in PEM or DER format, required when + signing. 
+ EXAMPLES ======== **# bpftool prog show** diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 9e9a5f006cd2aa..586d1b2595d16b 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -130,8 +130,8 @@ include $(FEATURES_DUMP) endif endif -LIBS = $(LIBBPF) -lelf -lz -LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz +LIBS = $(LIBBPF) -lelf -lz -lcrypto +LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz -lcrypto ifeq ($(feature-libelf-zstd),1) LIBS += -lzstd @@ -194,7 +194,7 @@ endif BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool -BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o sign.o) $(BOOTSTRAP_OBJS): $(LIBBPF_BOOTSTRAP) OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 944ebe21a2169a..ec356deb27c9ec 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -2,6 +2,10 @@ // Copyright (C) 2017 Facebook // Author: Roman Gushchin +#undef GCC_VERSION +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #define _XOPEN_SOURCE 500 #include #include diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 67a60114368f50..993c7d9484a463 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -688,10 +688,17 @@ static void codegen_destroy(struct bpf_object *obj, const char *obj_name) static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *header_guard) { DECLARE_LIBBPF_OPTS(gen_loader_opts, opts); + struct bpf_load_and_run_opts sopts = {}; + char sig_buf[MAX_SIG_SIZE]; + __u8 prog_sha[SHA256_DIGEST_LENGTH]; struct bpf_map *map; + char ident[256]; int err = 0; + if (sign_progs) + opts.gen_hash = true; + err = bpf_object__gen_loader(obj, &opts); if (err) return err; @@ -701,6 +708,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h p_err("failed to load object file"); goto out; } + /* If there was no error during load then gen_loader_opts * are populated with the loader program. 
*/ @@ -780,8 +788,52 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h print_hex(opts.insns, opts.insns_sz); codegen("\ \n\ - \"; \n\ - \n\ + \";\n"); + + if (sign_progs) { + sopts.insns = opts.insns; + sopts.insns_sz = opts.insns_sz; + sopts.excl_prog_hash = prog_sha; + sopts.excl_prog_hash_sz = sizeof(prog_sha); + sopts.signature = sig_buf; + sopts.signature_sz = MAX_SIG_SIZE; + + err = bpftool_prog_sign(&sopts); + if (err < 0) { + p_err("failed to sign program"); + goto out; + } + + codegen("\ + \n\ + static const char opts_sig[] __attribute__((__aligned__(8))) = \"\\\n\ + "); + print_hex((const void *)sig_buf, sopts.signature_sz); + codegen("\ + \n\ + \";\n"); + + codegen("\ + \n\ + static const char opts_excl_hash[] __attribute__((__aligned__(8))) = \"\\\n\ + "); + print_hex((const void *)prog_sha, sizeof(prog_sha)); + codegen("\ + \n\ + \";\n"); + + codegen("\ + \n\ + opts.signature = (void *)opts_sig; \n\ + opts.signature_sz = sizeof(opts_sig) - 1; \n\ + opts.excl_prog_hash = (void *)opts_excl_hash; \n\ + opts.excl_prog_hash_sz = sizeof(opts_excl_hash) - 1; \n\ + opts.keyring_id = skel->keyring_id; \n\ + "); + } + + codegen("\ + \n\ opts.ctx = (struct bpf_loader_ctx *)skel; \n\ opts.data_sz = sizeof(opts_data) - 1; \n\ opts.data = (void *)opts_data; \n\ @@ -1240,7 +1292,7 @@ static int do_skeleton(int argc, char **argv) err = -errno; libbpf_strerror(err, err_buf, sizeof(err_buf)); p_err("failed to open BPF object file: %s", err_buf); - goto out; + goto out_obj; } bpf_object__for_each_map(map, obj) { @@ -1355,6 +1407,13 @@ static int do_skeleton(int argc, char **argv) printf("\t} links;\n"); } + if (sign_progs) { + codegen("\ + \n\ + __s32 keyring_id; \n\ + "); + } + if (btf) { err = codegen_datasecs(obj, obj_name); if (err) @@ -1552,6 +1611,7 @@ static int do_skeleton(int argc, char **argv) err = 0; out: bpf_object__close(obj); +out_obj: if (obj_data) munmap(obj_data, mmap_sz); close(fd); @@ -1930,7 +1990,7 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " " HELP_SPEC_OPTIONS " |\n" - " {-L|--use-loader} }\n" + " {-L|--use-loader} | [ {-S|--sign } {-k} {-i} ]}\n" "", bin_name, "gen"); diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 0f1183b2ed0a07..a829a6a49037ad 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -33,6 +33,9 @@ bool relaxed_maps; bool use_loader; struct btf *base_btf; struct hashmap *refs_table; +bool sign_progs; +const char *private_key_path; +const char *cert_path; static void __noreturn clean_and_exit(int i) { @@ -448,6 +451,7 @@ int main(int argc, char **argv) { "nomount", no_argument, NULL, 'n' }, { "debug", no_argument, NULL, 'd' }, { "use-loader", no_argument, NULL, 'L' }, + { "sign", no_argument, NULL, 'S' }, { "base-btf", required_argument, NULL, 'B' }, { 0 } }; @@ -474,7 +478,7 @@ int main(int argc, char **argv) bin_name = "bpftool"; opterr = 0; - while ((opt = getopt_long(argc, argv, "VhpjfLmndB:l", + while ((opt = getopt_long(argc, argv, "VhpjfLmndSi:k:B:l", options, NULL)) >= 0) { switch (opt) { case 'V': @@ -520,6 +524,16 @@ int main(int argc, char **argv) case 'L': use_loader = true; break; + case 'S': + sign_progs = true; + use_loader = true; + break; + case 'k': + private_key_path = optarg; + break; + case 'i': + cert_path = optarg; + break; default: p_err("unrecognized option '%s'", argv[optind - 1]); if (json_output) @@ -534,6 +548,16 @@ int main(int argc, char **argv) if (argc < 0) usage(); + if (sign_progs && (private_key_path == NULL || cert_path 
== NULL)) { + p_err("-i and -k must be supplied with -S for signing"); + return -EINVAL; + } + + if (!sign_progs && (private_key_path != NULL || cert_path != NULL)) { + p_err("--sign (or -S) must be explicitly passed with -i and -k to sign the programs"); + return -EINVAL; + } + if (version_requested) ret = do_version(argc, argv); else diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 374cac2a8c66ca..1130299cede0b8 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -6,9 +6,14 @@ /* BFD and kernel.h both define GCC_VERSION, differently */ #undef GCC_VERSION +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #include #include +#include #include +#include #include #include #include @@ -52,6 +57,7 @@ static inline void *u64_to_ptr(__u64 ptr) }) #define ERR_MAX_LEN 1024 +#define MAX_SIG_SIZE 4096 #define BPF_TAG_FMT "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx" @@ -85,6 +91,9 @@ extern bool relaxed_maps; extern bool use_loader; extern struct btf *base_btf; extern struct hashmap *refs_table; +extern bool sign_progs; +extern const char *private_key_path; +extern const char *cert_path; void __printf(1, 2) p_err(const char *fmt, ...); void __printf(1, 2) p_info(const char *fmt, ...); @@ -284,4 +293,6 @@ struct kernel_config_option { int read_kernel_config(const struct kernel_config_option *requested_options, size_t num_options, char **out_values, const char *define_prefix); +int bpftool_prog_sign(struct bpf_load_and_run_opts *opts); +__u32 register_session_key(const char *key_der_path); #endif diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 4dccc75b0bab01..6daf19809ca4a3 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -1930,6 +1931,8 @@ static int try_loader(struct gen_loader_opts *gen) { struct bpf_load_and_run_opts opts = {}; struct bpf_loader_ctx *ctx; + char sig_buf[MAX_SIG_SIZE]; + __u8 prog_sha[SHA256_DIGEST_LENGTH]; int ctx_sz = sizeof(*ctx) + 64 * max(sizeof(struct bpf_map_desc), sizeof(struct bpf_prog_desc)); int log_buf_sz = (1u << 24) - 1; @@ -1953,6 +1956,26 @@ static int try_loader(struct gen_loader_opts *gen) opts.insns = gen->insns; opts.insns_sz = gen->insns_sz; fds_before = count_open_fds(); + + if (sign_progs) { + opts.excl_prog_hash = prog_sha; + opts.excl_prog_hash_sz = sizeof(prog_sha); + opts.signature = sig_buf; + opts.signature_sz = MAX_SIG_SIZE; + opts.keyring_id = KEY_SPEC_SESSION_KEYRING; + + err = bpftool_prog_sign(&opts); + if (err < 0) { + p_err("failed to sign program"); + goto out; + } + + err = register_session_key(cert_path); + if (err < 0) { + p_err("failed to add session key"); + goto out; + } + } err = bpf_load_and_run(&opts); fd_delta = count_open_fds() - fds_before; if (err < 0 || verifier_logs) { @@ -1961,6 +1984,7 @@ static int try_loader(struct gen_loader_opts *gen) fprintf(stderr, "loader prog leaked %d FDs\n", fd_delta); } +out: free(log_buf); return err; } @@ -1988,6 +2012,9 @@ static int do_loader(int argc, char **argv) goto err_close_obj; } + if (sign_progs) + gen.gen_hash = true; + err = bpf_object__gen_loader(obj, &gen); if (err) goto err_close_obj; @@ -2562,7 +2589,7 @@ static int do_help(int argc, char **argv) " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} |\n" - " {-L|--use-loader} }\n" + " {-L|--use-loader} | [ {-S|--sign } {-k} {-i} ] \n" "", bin_name, 
argv[-2]); diff --git a/tools/bpf/bpftool/sign.c b/tools/bpf/bpftool/sign.c new file mode 100644 index 00000000000000..b29d825bb1d418 --- /dev/null +++ b/tools/bpf/bpftool/sign.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* + * Copyright (C) 2025 Google LLC. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "main.h" + +#define OPEN_SSL_ERR_BUF_LEN 256 + +static void display_openssl_errors(int l) +{ + char buf[OPEN_SSL_ERR_BUF_LEN]; + const char *file; + const char *data; + unsigned long e; + int flags; + int line; + + while ((e = ERR_get_error_all(&file, &line, NULL, &data, &flags))) { + ERR_error_string_n(e, buf, sizeof(buf)); + if (data && (flags & ERR_TXT_STRING)) { + p_err("OpenSSL %s: %s:%d: %s", buf, file, line, data); + } else { + p_err("OpenSSL %s: %s:%d", buf, file, line); + } + } +} + +#define DISPLAY_OSSL_ERR(cond) \ + do { \ + bool __cond = (cond); \ + if (__cond && ERR_peek_error()) \ + display_openssl_errors(__LINE__);\ + } while (0) + +static EVP_PKEY *read_private_key(const char *pkey_path) +{ + EVP_PKEY *private_key = NULL; + BIO *b; + + b = BIO_new_file(pkey_path, "rb"); + private_key = PEM_read_bio_PrivateKey(b, NULL, NULL, NULL); + BIO_free(b); + DISPLAY_OSSL_ERR(!private_key); + return private_key; +} + +static X509 *read_x509(const char *x509_name) +{ + unsigned char buf[2]; + X509 *x509 = NULL; + BIO *b; + int n; + + b = BIO_new_file(x509_name, "rb"); + if (!b) + goto cleanup; + + /* Look at the first two bytes of the file to determine the encoding */ + n = BIO_read(b, buf, 2); + if (n != 2) + goto cleanup; + + if (BIO_reset(b) != 0) + goto cleanup; + + if (buf[0] == 0x30 && buf[1] >= 0x81 && buf[1] <= 0x84) + /* Assume raw DER encoded X.509 */ + x509 = d2i_X509_bio(b, NULL); + else + /* Assume PEM encoded X.509 */ + x509 = PEM_read_bio_X509(b, NULL, NULL, NULL); + +cleanup: + BIO_free(b); + DISPLAY_OSSL_ERR(!x509); + return x509; +} + +__u32 register_session_key(const char *key_der_path) +{ + unsigned char *der_buf = NULL; + X509 *x509 = NULL; + int key_id = -1; + int der_len; + + if (!key_der_path) + return key_id; + x509 = read_x509(key_der_path); + if (!x509) + goto cleanup; + der_len = i2d_X509(x509, &der_buf); + if (der_len < 0) + goto cleanup; + key_id = syscall(__NR_add_key, "asymmetric", key_der_path, der_buf, + (size_t)der_len, KEY_SPEC_SESSION_KEYRING); +cleanup: + X509_free(x509); + OPENSSL_free(der_buf); + DISPLAY_OSSL_ERR(key_id == -1); + return key_id; +} + +int bpftool_prog_sign(struct bpf_load_and_run_opts *opts) +{ + BIO *bd_in = NULL, *bd_out = NULL; + EVP_PKEY *private_key = NULL; + CMS_ContentInfo *cms = NULL; + long actual_sig_len = 0; + X509 *x509 = NULL; + int err = 0; + + bd_in = BIO_new_mem_buf(opts->insns, opts->insns_sz); + if (!bd_in) { + err = -ENOMEM; + goto cleanup; + } + + private_key = read_private_key(private_key_path); + if (!private_key) { + err = -EINVAL; + goto cleanup; + } + + x509 = read_x509(cert_path); + if (!x509) { + err = -EINVAL; + goto cleanup; + } + + cms = CMS_sign(NULL, NULL, NULL, NULL, + CMS_NOCERTS | CMS_PARTIAL | CMS_BINARY | CMS_DETACHED | + CMS_STREAM); + if (!cms) { + err = -EINVAL; + goto cleanup; + } + + if (!CMS_add1_signer(cms, x509, private_key, EVP_sha256(), + CMS_NOCERTS | CMS_BINARY | CMS_NOSMIMECAP | + CMS_USE_KEYID | CMS_NOATTR)) { + err = -EINVAL; + goto cleanup; + } + + if 
(CMS_final(cms, bd_in, NULL, CMS_NOCERTS | CMS_BINARY) != 1) { + err = -EIO; + goto cleanup; + } + + EVP_Digest(opts->insns, opts->insns_sz, opts->excl_prog_hash, + &opts->excl_prog_hash_sz, EVP_sha256(), NULL); + + bd_out = BIO_new(BIO_s_mem()); + if (!bd_out) { + err = -ENOMEM; + goto cleanup; + } + + if (!i2d_CMS_bio_stream(bd_out, cms, NULL, 0)) { + err = -EIO; + goto cleanup; + } + + actual_sig_len = BIO_get_mem_data(bd_out, NULL); + if (actual_sig_len <= 0) { + err = -EIO; + goto cleanup; + } + + if ((size_t)actual_sig_len > opts->signature_sz) { + err = -ENOSPC; + goto cleanup; + } + + if (BIO_read(bd_out, opts->signature, actual_sig_len) != actual_sig_len) { + err = -EIO; + goto cleanup; + } + + opts->signature_sz = actual_sig_len; +cleanup: + BIO_free(bd_out); + CMS_ContentInfo_free(cms); + X509_free(x509); + EVP_PKEY_free(private_key); + BIO_free(bd_in); + DISPLAY_OSSL_ERR(err < 0); + return err; +} From b720903e2b14d319268e1348a32c46a6fcbfd327 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 21 Sep 2025 18:01:20 +0200 Subject: [PATCH 2520/3206] selftests/bpf: Enable signature verification for some lskel tests The test harness uses the verify_sig_setup.sh script to generate the required key material for program signing. Generate key material for signing some lskel programs and use xxd to convert the verification certificate into a C header file. Finally, update the main test runner to load this certificate into the session keyring via the add_key() syscall before executing any tests. Use the session keyring in the tests with signed programs. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250921160120.9711-6-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 35 ++++++++++++++++--- .../selftests/bpf/prog_tests/atomics.c | 10 ++++-- .../selftests/bpf/prog_tests/fentry_fexit.c | 15 ++++++-- .../selftests/bpf/prog_tests/fentry_test.c | 9 +++-- .../selftests/bpf/prog_tests/fexit_test.c | 9 +++-- tools/testing/selftests/bpf/test_progs.c | 13 +++++++ .../testing/selftests/bpf/verify_sig_setup.sh | 11 ++++-- 8 files changed, 89 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3d8378972d26cc..be1ee7ba7ce031 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -44,3 +44,4 @@ xdp_redirect_multi xdp_synproxy xdp_hw_metadata xdp_features +verification_cert.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 11d2a368db3eaa..0b6ee902bce512 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -496,15 +496,16 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ test_subskeleton.skel.h test_subskeleton_lib.skel.h \ test_usdt.skel.h -LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ - trace_printk.c trace_vprintk.c map_ptr_kern.c \ +LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c +LSKELS_SIGNED := fentry_test.c fexit_test.c atomics.c + # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ kfunc_call_test_subprog.c -SKEL_BLACKLIST += $$(LSKELS) +SKEL_BLACKLIST += $$(LSKELS) $$(LSKELS_SIGNED) test_static_linked.skel.h-deps := test_static_linked1.bpf.o 
test_static_linked2.bpf.o linked_funcs.skel.h-deps := linked_funcs1.bpf.o linked_funcs2.bpf.o @@ -535,6 +536,7 @@ HEADERS_FOR_BPF_OBJS := $(wildcard $(BPFDIR)/*.bpf.h) \ # $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER +LSKEL_SIGN := -S -k $(PRIVATE_KEY) -i $(VERIFICATION_CERT) TRUNNER_OUTPUT := $(OUTPUT)$(if $2,/)$2 TRUNNER_BINARY := $1$(if $2,-)$2 TRUNNER_TEST_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.test.o, \ @@ -550,6 +552,7 @@ TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ $$(TRUNNER_BPF_SRCS))) TRUNNER_BPF_LSKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.lskel.h, $$(LSKELS) $$(LSKELS_EXTRA)) TRUNNER_BPF_SKELS_LINKED := $$(addprefix $$(TRUNNER_OUTPUT)/,$(LINKED_SKELS)) +TRUNNER_BPF_LSKELS_SIGNED := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.lskel.h, $$(LSKELS_SIGNED)) TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) # Evaluate rules now with extra TRUNNER_XXX variables above already defined @@ -604,6 +607,15 @@ $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) +$(TRUNNER_BPF_LSKELS_SIGNED): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) + $$(call msg,GEN-SKEL,$(TRUNNER_BINARY) (signed),$$@) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< + $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) + $(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) + $(Q)$$(BPFTOOL) gen skeleton $(LSKEL_SIGN) $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ + $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) + $(LINKED_BPF_OBJS): %: $(TRUNNER_OUTPUT)/% # .SECONDEXPANSION here allows to correctly expand %-deps variables as prerequisites @@ -653,6 +665,7 @@ $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_BPF_SKELS) \ $(TRUNNER_BPF_LSKELS) \ + $(TRUNNER_BPF_LSKELS_SIGNED) \ $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) @@ -667,6 +680,7 @@ $(foreach N,$(patsubst $(TRUNNER_OUTPUT)/%.o,%,$(TRUNNER_EXTRA_OBJS)), \ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ $(TRUNNER_EXTRA_HDRS) \ + $(VERIFY_SIG_HDR) \ $(TRUNNER_TESTS_HDR) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@) @@ -697,6 +711,18 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ endef +VERIFY_SIG_SETUP := $(CURDIR)/verify_sig_setup.sh +VERIFY_SIG_HDR := verification_cert.h +VERIFICATION_CERT := $(BUILD_DIR)/signing_key.der +PRIVATE_KEY := $(BUILD_DIR)/signing_key.pem + +$(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) + $(Q)mkdir -p $(BUILD_DIR) + $(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR) + +$(VERIFY_SIG_HDR): $(VERIFICATION_CERT) + $(Q)xxd -i -n test_progs_verification_cert $< > $@ + # Define test_progs test runner. 
TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs @@ -716,6 +742,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ disasm.c \ disasm_helpers.c \ json_writer.c \ + $(VERIFY_SIG_HDR) \ flow_dissector_load.h \ ip_check_defrag_frags.h TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ @@ -725,7 +752,7 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/uprobe_multi \ $(TEST_KMOD_TARGETS) \ ima_setup.sh \ - verify_sig_setup.sh \ + $(VERIFY_SIG_SETUP) \ $(wildcard progs/btf_dump_test_case_*.c) \ $(wildcard progs/*.bpf.o) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c index 13e101f370a1d3..92b5f378bfb826 100644 --- a/tools/testing/selftests/bpf/prog_tests/atomics.c +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -165,11 +165,17 @@ static void test_xchg(struct atomics_lskel *skel) void test_atomics(void) { struct atomics_lskel *skel; + int err; - skel = atomics_lskel__open_and_load(); - if (!ASSERT_OK_PTR(skel, "atomics skeleton load")) + skel = atomics_lskel__open(); + if (!ASSERT_OK_PTR(skel, "atomics skeleton open")) return; + skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = atomics_lskel__load(skel); + if (!ASSERT_OK(err, "atomics skeleton load")) + goto cleanup; + if (skel->data->skip_tests) { printf("%s:SKIP:no ENABLE_ATOMICS_TESTS (missing Clang BPF atomics support)", __func__); diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 130f5b82d2e601..5ef1804e44dfd5 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -12,13 +12,24 @@ void test_fentry_fexit(void) int err, prog_fd, i; LIBBPF_OPTS(bpf_test_run_opts, topts); - fentry_skel = fentry_test_lskel__open_and_load(); + fentry_skel = fentry_test_lskel__open(); if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load")) goto close_prog; - fexit_skel = fexit_test_lskel__open_and_load(); + + fentry_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fentry_test_lskel__load(fentry_skel); + if (!ASSERT_OK(err, "fentry_skel_load")) + goto close_prog; + + fexit_skel = fexit_test_lskel__open(); if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load")) goto close_prog; + fexit_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fexit_test_lskel__load(fexit_skel); + if (!ASSERT_OK(err, "fexit_skel_load")) + goto close_prog; + err = fentry_test_lskel__attach(fentry_skel); if (!ASSERT_OK(err, "fentry_attach")) goto close_prog; diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index aee1bc77a17f7b..ec882328eb591d 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -43,8 +43,13 @@ static void fentry_test(void) struct fentry_test_lskel *fentry_skel = NULL; int err; - fentry_skel = fentry_test_lskel__open_and_load(); - if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load")) + fentry_skel = fentry_test_lskel__open(); + if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_open")) + goto cleanup; + + fentry_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fentry_test_lskel__load(fentry_skel); + if (!ASSERT_OK(err, "fentry_skel_load")) goto cleanup; err = fentry_test_common(fentry_skel); diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index 1c13007e37dd2c..94eed753560c1c 
100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -43,8 +43,13 @@ static void fexit_test(void) struct fexit_test_lskel *fexit_skel = NULL; int err; - fexit_skel = fexit_test_lskel__open_and_load(); - if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load")) + fexit_skel = fexit_test_lskel__open(); + if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_open")) + goto cleanup; + + fexit_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fexit_test_lskel__load(fexit_skel); + if (!ASSERT_OK(err, "fexit_skel_load")) goto cleanup; err = fexit_test_common(fexit_skel); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 309d9d4a8ace1d..02a85dda30e649 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -14,12 +14,14 @@ #include #include #include +#include #include #include #include #include "json_writer.h" #include "network_helpers.h" +#include "verification_cert.h" /* backtrace() and backtrace_symbols_fd() are glibc specific, * use header file when glibc is available and provide stub @@ -1928,6 +1930,13 @@ static void free_test_states(void) } } +static __u32 register_session_key(const char *key_data, size_t key_data_size) +{ + return syscall(__NR_add_key, "asymmetric", "libbpf_session_key", + (const void *)key_data, key_data_size, + KEY_SPEC_SESSION_KEYRING); +} + int main(int argc, char **argv) { static const struct argp argp = { @@ -1961,6 +1970,10 @@ int main(int argc, char **argv) /* Use libbpf 1.0 API mode */ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); libbpf_set_print(libbpf_print_fn); + err = register_session_key((const char *)test_progs_verification_cert, + test_progs_verification_cert_len); + if (err < 0) + return err; traffic_monitor_set_print(traffic_monitor_print_fn); diff --git a/tools/testing/selftests/bpf/verify_sig_setup.sh b/tools/testing/selftests/bpf/verify_sig_setup.sh index f2cac42298ba3b..09179fb551f09a 100755 --- a/tools/testing/selftests/bpf/verify_sig_setup.sh +++ b/tools/testing/selftests/bpf/verify_sig_setup.sh @@ -32,7 +32,7 @@ usage() exit 1 } -setup() +genkey() { local tmp_dir="$1" @@ -45,9 +45,14 @@ setup() openssl x509 -in ${tmp_dir}/signing_key.pem -out \ ${tmp_dir}/signing_key.der -outform der +} - key_id=$(cat ${tmp_dir}/signing_key.der | keyctl padd asymmetric ebpf_testing_key @s) +setup() +{ + local tmp_dir="$1" + genkey "${tmp_dir}" + key_id=$(cat ${tmp_dir}/signing_key.der | keyctl padd asymmetric ebpf_testing_key @s) keyring_id=$(keyctl newring ebpf_testing_keyring @s) keyctl link $key_id $keyring_id } @@ -105,6 +110,8 @@ main() if [[ "${action}" == "setup" ]]; then setup "${tmp_dir}" + elif [[ "${action}" == "genkey" ]]; then + genkey "${tmp_dir}" elif [[ "${action}" == "cleanup" ]]; then cleanup "${tmp_dir}" elif [[ "${action}" == "fsverity-create-sign" ]]; then From d1d6ad7f6686e208aba06b7af3feef7a7cba61cf Mon Sep 17 00:00:00 2001 From: Roy Vegard Ovesen Date: Mon, 22 Sep 2025 20:54:10 +0200 Subject: [PATCH 2521/3206] ALSA: usb-audio: don't apply interface quirk to Presonus S1824c Testing with a Presonus STUDIO 1824c together with a Behringer ultragain digital ADAT device shows that using all 3 altno settings works fine. When selecting sample rate, the driver sets the interface to the correct altno setting and the correct number of channels is set. 
Selecting the correct altno setting via Ardour, Reaper, or any other way of setting the sample rate is more convenient than reloading the driver module with device_setup to set altno. Signed-off-by: Roy Vegard Ovesen Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 766db7d00cbc95..4a35f962527e91 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1599,9 +1599,6 @@ int snd_usb_apply_interface_quirk(struct snd_usb_audio *chip, /* presonus studio 1810c: skip altsets incompatible with device_setup */ if (chip->usb_id == USB_ID(0x194f, 0x010c)) return s1810c_skip_setting_quirk(chip, iface, altno); - /* presonus studio 1824c: skip altsets incompatible with device_setup */ - if (chip->usb_id == USB_ID(0x194f, 0x010d)) - return s1810c_skip_setting_quirk(chip, iface, altno); return 0; } From 6a378edc9a61f0276e9c0261b0d65b2353c7f9e4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 28 Jul 2025 17:57:55 +0930 Subject: [PATCH 2522/3206] btrfs: enhance error messages for delalloc range failure When running emulated write error tests like generic/475, we can hit error messages like this: BTRFS error (device dm-12 state EA): run_delalloc_nocow failed, root=596 inode=264 start=1605632 len=73728: -5 BTRFS error (device dm-12 state EA): failed to run delalloc range, root=596 ino=264 folio=1605632 submit_bitmap=0-7 start=1605632 len=73728: -5 These are normally buried by direct IO error messages. However, the above error messages are not enough to determine the real range that caused the error. Considering we can have multiple different extents in one delalloc range (e.g. some COW extents along with some NOCOW extents), just outputting the error at the end of run_delalloc_nocow() is not enough. To enhance the error messages: - Remove the rate limit on the existing error messages In the generic/475 example, most error messages are from direct IO, not really from the delalloc range. Considering how useful the delalloc range error messages are, we don't want them to be rate limited. - Add extra @cur_offset output for cow_file_range() - Add extra variable output for run_delalloc_nocow() This is especially important for run_delalloc_nocow(), as there are extra error paths where we can hit an error without going into nocow_one_range() or fallback_to_cow(). - Add an error message for nocow_one_range() That's the missing part. For fallback_to_cow(), we already have the error message from cow_file_range(). - Constify the @len and @end local variables for nocow_one_range() This makes it much easier to make sure @len and @end are not modified at runtime. 
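For illustration, with the extra output a run_delalloc_nocow() failure would now be reported along these lines (the values below are invented; only the format follows the new btrfs_err() call in the diff):

  BTRFS error (device dm-12 state EA): run_delalloc_nocow failed, root=596 inode=264 start=1605632 len=73728 cur_offset=1654784 oe_cleanup=1605632 oe_cleanup_len=49152 untouched_start=1654784 untouched_len=24576: -5
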
Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4583d6331bfd9d..56f56206a83293 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1534,10 +1534,11 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, end - start - cur_alloc_size + 1, NULL); } - btrfs_err_rl(fs_info, - "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", - __func__, btrfs_root_id(inode->root), - btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); + btrfs_err(fs_info, +"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, + start, cur_alloc_size, ret); return ret; } @@ -1969,8 +1970,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio u64 file_pos, bool is_prealloc) { struct btrfs_ordered_extent *ordered; - u64 len = nocow_args->file_extent.num_bytes; - u64 end = file_pos + len - 1; + const u64 len = nocow_args->file_extent.num_bytes; + const u64 end = file_pos + len - 1; int ret = 0; btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); @@ -2017,8 +2018,13 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio * We do not clear the folio Dirty flags because they are set and * cleaered by the caller. */ - if (ret < 0) + if (ret < 0) { btrfs_cleanup_ordered_extents(inode, file_pos, len); + btrfs_err(inode->root->fs_info, + "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), btrfs_ino(inode), + file_pos, len, ret); + } return ret; } @@ -2306,10 +2312,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL); } btrfs_free_path(path); - btrfs_err_rl(fs_info, - "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", - __func__, btrfs_root_id(inode->root), - btrfs_ino(inode), start, end + 1 - start, ret); + btrfs_err(fs_info, +"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d", + __func__, btrfs_root_id(inode->root), btrfs_ino(inode), + start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len, + untouched_start, untouched_len, ret); return ret; } From 13141df705b7145e46b88b1147d67d7a43f2d9e8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 28 Jul 2025 17:57:56 +0930 Subject: [PATCH 2523/3206] btrfs: make nocow_one_range() do cleanup on error Currently if we hit an error inside nocow_one_range(), we do not clear the page dirty flag, and let the caller handle it. This is very different from fallback_to_cow(): when that function fails, everything is cleaned up by cow_file_range(). Enhance the situation by: - Use common error handling for nocow_one_range() If anything fails, use the same btrfs_cleanup_ordered_extents() and extent_clear_unlock_delalloc(). btrfs_cleanup_ordered_extents() is safe even if we haven't created a new ordered extent; in that case there should be no OE and that function will do nothing. 
The same applies to extent_clear_unlock_delalloc(), and since we're passing PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK, it will also clear folio dirty flag during error handling. - Avoid touching the failed range of nocow_one_range() As the failed range will be cleaned up and unlocked by that function. Here we introduce a new variable @nocow_end to record the failed range, so that we can skip it during the error handling of run_delalloc_nocow(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 60 +++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 56f56206a83293..80e02b49b3a945 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1982,8 +1982,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { - btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); - return PTR_ERR(em); + ret = PTR_ERR(em); + goto error; } btrfs_free_extent_map(em); } @@ -1995,8 +1995,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio if (IS_ERR(ordered)) { if (is_prealloc) btrfs_drop_extent_map_range(inode, file_pos, end, false); - btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); - return PTR_ERR(ordered); + ret = PTR_ERR(ordered); + goto error; } if (btrfs_is_data_reloc_root(inode->root)) @@ -2008,23 +2008,25 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio ret = btrfs_reloc_clone_csums(ordered); btrfs_put_ordered_extent(ordered); + if (ret < 0) + goto error; extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); - /* - * On error, we need to cleanup the ordered extents we created. - * - * We do not clear the folio Dirty flags because they are set and - * cleaered by the caller. - */ - if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, file_pos, len); - btrfs_err(inode->root->fs_info, - "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d", - __func__, btrfs_root_id(inode->root), btrfs_ino(inode), - file_pos, len, ret); - } + return ret; + +error: + btrfs_cleanup_ordered_extents(inode, file_pos, len); + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + btrfs_err(inode->root->fs_info, + "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), btrfs_ino(inode), + file_pos, len, ret); return ret; } @@ -2046,8 +2048,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, /* * If not 0, represents the inclusive end of the last fallback_to_cow() * range. Only for error handling. + * + * The same for nocow_end, it's to avoid double cleaning up the range + * already cleaned by nocow_one_range(). 
*/ u64 cow_end = 0; + u64 nocow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; @@ -2222,8 +2228,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, &nocow_args, cur_offset, extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (ret < 0) + if (ret < 0) { + nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; goto error; + } cur_offset = extent_end; } btrfs_release_path(path); @@ -2250,12 +2258,22 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, * start cur_offset end * | OE cleanup | Untouched | * - * We finished a fallback_to_cow() or nocow_one_range() call, but - * failed to check the next range. + * We finished a fallback_to_cow() or nocow_one_range() call, + * but failed to check the next range. + * + * or + * start cur_offset nocow_end end + * | OE cleanup | Skip | Untouched | + * + * nocow_one_range() failed, the range [cur_offset, nocow_end] is + * already cleaned up. */ oe_cleanup_start = start; oe_cleanup_len = cur_offset - start; - untouched_start = cur_offset; + if (nocow_end) + untouched_start = nocow_end + 1; + else + untouched_start = cur_offset; untouched_len = end + 1 - untouched_start; } else if (cow_start != (u64)-1 && cow_end == 0) { /* From 737852c060fb54e5c43c2843fc4bad3c78cef51a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 28 Jul 2025 17:57:57 +0930 Subject: [PATCH 2524/3206] btrfs: keep folios locked inside run_delalloc_nocow() [BUG] There is a very low chance that DEBUG_WARN() inside btrfs_writepage_cow_fixup() can be triggered when CONFIG_BTRFS_EXPERIMENTAL is enabled. This only happens after run_delalloc_nocow() has failed. Unfortunately I haven't hit it for a while, thus no real-world dmesg for now. [CAUSE] There is a race window where, after run_delalloc_nocow() has failed, error handling can race with the writeback thread. Before we hit run_delalloc_nocow(), there is an inode with the following dirty pages: (4K page size, 4K block size, no large folio) 0 4K 8K 12K 16K |/////////|///////////|///////////|////////////| The inode also has the NODATACOW flag, and the above dirty range will go through different extents during run_delalloc_range(): 0 4K 8K 12K 16K | NOCOW | COW | COW | NOCOW | The race happens like this: writeback thread A | writeback thread B ----------------------------------+-------------------------------------- Writeback for folio 0 | run_delalloc_nocow() | |- nocow_one_range() | | For range [0, 4K), ret = 0 | | | |- fallback_to_cow() | | For range [4K, 8K), ret = 0 | | Folio 4K *UNLOCKED* | | | Writeback for folio 4K |- fallback_to_cow() | extent_writepage() | For range [8K, 12K), failure | |- writepage_delalloc() | | | |- btrfs_cleanup_ordered_extents()| | |- btrfs_folio_clear_ordered() | | | Folio 0 still locked, safe | | | | | Ordered extent already allocated. | | | Nothing to do. | | |- extent_writepage_io() | | |- btrfs_writepage_cow_fixup() |- btrfs_folio_clear_ordered() | | Folio 4K held by thread B, | | UNSAFE! | |- btrfs_test_ordered() | | Cleared by thread A, | | | |- DEBUG_WARN(); This is only possible after run_delalloc_nocow() failure, as cow_file_range() will keep all folios and the io tree range locked, until everything is finished or after error handling. The root cause is that we allow fallback_to_cow() and nocow_one_range() to unlock the folios after a successful run, so during error handling it is no longer safe to use btrfs_cleanup_ordered_extents(), as the folios are already unlocked. 
[FIX] - Make fallback_to_cow() and nocow_one_range() keep folios locked after a successful run For fallback_to_cow() we can pass the COW_FILE_RANGE_KEEP_LOCKED flag into cow_file_range(). For nocow_one_range() we have to remove the PAGE_UNLOCK flag from extent_clear_unlock_delalloc(). - Unlock folios if everything is fine in run_delalloc_nocow() - Use extent_clear_unlock_delalloc() to handle range [@start, @cur_offset) inside run_delalloc_nocow() Since folios are still locked, we do not need cleanup_dirty_folios() to do the cleanup. extent_clear_unlock_delalloc() with "PAGE_START_WRITEBACK | PAGE_END_WRITEBACK" will clear the dirty flags. - Remove cleanup_dirty_folios() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 73 +++++++++++++++--------------------------------- 1 file changed, 22 insertions(+), 51 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 80e02b49b3a945..33cabe0a54a492 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1772,9 +1772,15 @@ static int fallback_to_cow(struct btrfs_inode *inode, * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. + * + * And here we do not unlock the folio after a successful run. + * The folios will be unlocked after everything is finished, or by error handling. + * + * This is to ensure error handling won't need to clear dirty/ordered flags without + * a locked folio, which can race with writeback. */ ret = cow_file_range(inode, locked_folio, start, end, NULL, - COW_FILE_RANGE_NO_INLINE); + COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED); ASSERT(ret != 1); return ret; } @@ -1917,53 +1923,6 @@ static int can_nocow_file_extent(struct btrfs_path *path, return ret < 0 ? ret : can_nocow; } -/* - * Cleanup the dirty folios which will never be submitted due to error. - * - * When running a delalloc range, we may need to split the ranges (due to - * fragmentation or NOCOW). If we hit an error in the later part, we will error - * out and previously successfully executed range will never be submitted, thus - * we have to cleanup those folios by clearing their dirty flag, starting and - * finishing the writeback. - */ -static void cleanup_dirty_folios(struct btrfs_inode *inode, - struct folio *locked_folio, - u64 start, u64 end, int error) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct address_space *mapping = inode->vfs_inode.i_mapping; - pgoff_t start_index = start >> PAGE_SHIFT; - pgoff_t end_index = end >> PAGE_SHIFT; - u32 len; - - ASSERT(end + 1 - start < U32_MAX); - ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(end + 1, fs_info->sectorsize)); - len = end + 1 - start; - - /* - * Handle the locked folio first. - * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. - */ - btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); - - for (pgoff_t index = start_index; index <= end_index; index++) { - struct folio *folio; - - /* Already handled at the beginning. */ - if (index == locked_folio->index) - continue; - folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); - /* Cache already dropped, no need to do any cleanup. 
*/ - if (IS_ERR(folio)) - continue; - btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); - folio_unlock(folio); - folio_put(folio); - } - mapping_set_error(mapping, error); -} - static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, struct extent_state **cached, struct can_nocow_file_extent_args *nocow_args, @@ -2013,7 +1972,7 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); + PAGE_SET_ORDERED); return ret; error: @@ -2248,6 +2207,14 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, cow_start = (u64)-1; } + /* + * Everything is finished without an error, can unlock the folios now. + * + * No need to touch the io tree range nor set folio ordered flag, as + * fallback_to_cow() and nocow_one_range() have already handled them. + */ + extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK); + btrfs_free_path(path); return 0; @@ -2306,9 +2273,13 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, } if (oe_cleanup_len) { + const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1; btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len); - cleanup_dirty_folios(inode, locked_folio, oe_cleanup_start, - oe_cleanup_start + oe_cleanup_len - 1, ret); + extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end, + locked_folio, NULL, + EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); } if (untouched_len) { From 2c5cca03c1738e1342cbe57671d463f275c00623 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 13 Aug 2025 14:33:38 +0930 Subject: [PATCH 2525/3206] btrfs: add an fs_info parameter for compression workspace manager [BACKGROUND] Currently btrfs shares workspaces and their managers across all filesystems. This is mostly fine, as all those workspaces use page size based buffers, and btrfs only supports block size (bs) <= page size (ps). This means even if bs < ps, we at most waste some buffer space in the workspace, but everything will still work fine. The problem is that this limits our support for bs > ps cases. A workspace may now need a larger buffer to handle bs > ps cases, but since the pool has no way to distinguish different workspaces, a regular workspace (still using a buffer size based on ps) can be passed to a btrfs whose bs > ps. In that case the buffer is not large enough, which will cause various problems. [ENHANCEMENT] To prepare for the per-fs workspace migration, add an fs_info parameter to all workspace related functions. For now this new fs_info parameter is not yet utilized. 
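As a rough sketch of where the migration is heading (hypothetical; nothing below is introduced by this patch, and the helper is an assumption), a per-fs manager would let buffer sizes follow the filesystem rather than the page size:

  /* Hypothetical follow-up, not part of this patch: size workspace
   * buffers from the filesystem so bs > ps gets a large enough buffer.
   */
  static size_t workspace_buf_len(const struct btrfs_fs_info *fs_info)
  {
	  if (fs_info && fs_info->sectorsize > PAGE_SIZE)
		  return fs_info->sectorsize;	/* bs > ps: block-sized buffer */
	  return PAGE_SIZE;			/* today's shared, ps-sized buffers */
  }
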
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 73 ++++++++++++++++++++++-------------------- fs/btrfs/compression.h | 23 +++++++------ fs/btrfs/lzo.c | 2 +- fs/btrfs/zlib.c | 6 ++-- fs/btrfs/zstd.c | 12 +++---- 5 files changed, 62 insertions(+), 54 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 839c8c33aa5aa4..d8ba6d0d4e8374 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -702,7 +702,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(void) +static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info) { struct heuristic_ws *ws; @@ -741,13 +741,13 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -static struct list_head *alloc_workspace(int type, int level) +static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); - case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); - case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(); - case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info); + case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(fs_info); + case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level); default: /* * This can't happen, the type is validated several times @@ -773,7 +773,7 @@ static void free_workspace(int type, struct list_head *ws) } } -static void btrfs_init_workspace_manager(int type) +static void btrfs_init_workspace_manager(struct btrfs_fs_info *fs_info, int type) { struct workspace_manager *wsm; struct list_head *workspace; @@ -788,9 +788,9 @@ static void btrfs_init_workspace_manager(int type) * Preallocate one workspace for each compression type so we can * guarantee forward progress in the worst case */ - workspace = alloc_workspace(type, 0); + workspace = alloc_workspace(fs_info, type, 0); if (IS_ERR(workspace)) { - btrfs_warn(NULL, + btrfs_warn(fs_info, "cannot preallocate compression workspace, will try later"); } else { atomic_set(&wsm->total_ws, 1); @@ -819,7 +819,7 @@ static void btrfs_cleanup_workspace_manager(int type) * Preallocation makes a forward progress guarantees and we do not return * errors. 
*/ -struct list_head *btrfs_get_workspace(int type, int level) +struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level) { struct workspace_manager *wsm; struct list_head *workspace; @@ -867,7 +867,7 @@ struct list_head *btrfs_get_workspace(int type, int level) * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - workspace = alloc_workspace(type, level); + workspace = alloc_workspace(fs_info, type, level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -890,7 +890,7 @@ struct list_head *btrfs_get_workspace(int type, int level) /* no burst */ 1); if (__ratelimit(&_rs)) - btrfs_warn(NULL, + btrfs_warn(fs_info, "no compression workspaces, low memory, retrying"); } goto again; @@ -898,13 +898,13 @@ struct list_head *btrfs_get_workspace(int type, int level) return workspace; } -static struct list_head *get_workspace(int type, int level) +static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level); - case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level); - case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level); - case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level); + case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level); + case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level); + case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(fs_info, type, level); + case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level); default: /* * This can't happen, the type is validated several times @@ -918,7 +918,7 @@ static struct list_head *get_workspace(int type, int level) * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -void btrfs_put_workspace(int type, struct list_head *ws) +void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws) { struct workspace_manager *wsm; struct list_head *idle_ws; @@ -949,13 +949,13 @@ void btrfs_put_workspace(int type, struct list_head *ws) cond_wake_up(ws_wait); } -static void put_workspace(int type, struct list_head *ws) +static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws) { switch (type) { - case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws); - case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws); - case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws); - case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws); + case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(fs_info, type, ws); + case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws); + case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(fs_info, type, ws); + case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws); default: /* * This can't happen, the type is validated several times @@ -1038,29 +1038,31 @@ int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inod u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; level = btrfs_compress_set_level(type, level); - workspace = get_workspace(type, level); + workspace = get_workspace(fs_info, type, level); ret = compression_compress_pages(type, workspace, inode, start, folios, out_folios, total_in, total_out); /* The 
total read-in bytes should be no larger than the input. */ ASSERT(*total_in <= orig_len); - put_workspace(type, workspace); + put_workspace(fs_info, type, workspace); return ret; } static int btrfs_decompress_bio(struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct list_head *workspace; int ret; int type = cb->compress_type; - workspace = get_workspace(type, 0); + workspace = get_workspace(fs_info, type, 0); ret = compression_decompress_bio(workspace, cb); - put_workspace(type, workspace); + put_workspace(fs_info, type, workspace); if (!ret) zero_fill_bio(&cb->orig_bbio->bio); @@ -1087,10 +1089,10 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, */ ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); - workspace = get_workspace(type, 0); + workspace = get_workspace(fs_info, type, 0); ret = compression_decompress(type, workspace, data_in, dest_folio, dest_pgoff, srclen, destlen); - put_workspace(type, workspace); + put_workspace(fs_info, type, workspace); return ret; } @@ -1106,10 +1108,10 @@ int __init btrfs_init_compress(void) if (!compr_pool.shrinker) return -ENOMEM; - btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); - btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); - btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); - zstd_init_workspace_manager(); + btrfs_init_workspace_manager(NULL, BTRFS_COMPRESS_NONE); + btrfs_init_workspace_manager(NULL, BTRFS_COMPRESS_ZLIB); + btrfs_init_workspace_manager(NULL, BTRFS_COMPRESS_LZO); + zstd_init_workspace_manager(NULL); spin_lock_init(&compr_pool.lock); INIT_LIST_HEAD(&compr_pool.list); @@ -1543,7 +1545,8 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, */ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) { - struct list_head *ws_list = get_workspace(0, 0); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct list_head *ws_list = get_workspace(fs_info, 0, 0); struct heuristic_ws *ws; u32 i; u8 byte; @@ -1612,7 +1615,7 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) } out: - put_workspace(0, ws_list); + put_workspace(fs_info, 0, ws_list); return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index e7282fb7c13b97..f600555b758388 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -75,6 +75,11 @@ struct compressed_bio { struct btrfs_bio bbio; }; +static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb) +{ + return cb->bbio.fs_info; +} + /* @range_end must be exclusive. 
*/ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur) { @@ -128,8 +133,8 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -struct list_head *btrfs_get_workspace(int type, int level); -void btrfs_put_workspace(int type, struct list_head *ws); +struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level); +void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws); struct btrfs_compress_op { struct workspace_manager *workspace_manager; @@ -162,9 +167,9 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *zlib_alloc_workspace(unsigned int level); +struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level); void zlib_free_workspace(struct list_head *ws); -struct list_head *zlib_get_workspace(unsigned int level); +struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level); int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, @@ -173,7 +178,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *lzo_alloc_workspace(void); +struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info); void lzo_free_workspace(struct list_head *ws); int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, @@ -183,11 +188,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -void zstd_init_workspace_manager(void); +void zstd_init_workspace_manager(struct btrfs_fs_info *fs_info); void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(int level); +struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level); void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(int level); -void zstd_put_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level); +void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws); #endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 7b46aef7dd93af..82407d7d9502eb 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *lzo_alloc_workspace(void) +struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info) { struct workspace *workspace; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 33dc7e7b5c364c..5fc045aeaa199e 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -36,9 +36,9 @@ struct workspace { static struct workspace_manager wsm; -struct list_head *zlib_get_workspace(unsigned int level) +struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { - struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level); + struct list_head *ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, level); struct workspace *workspace = list_entry(ws, struct workspace, list); workspace->level = level; @@ 
-55,7 +55,7 @@ void zlib_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zlib_alloc_workspace(unsigned int level) +struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { struct workspace *workspace; int workspacesize; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index d521187336a59f..24bf5cfaecec2a 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -182,7 +182,7 @@ static void zstd_calc_ws_mem_sizes(void) } } -void zstd_init_workspace_manager(void) +void zstd_init_workspace_manager(struct btrfs_fs_info *fs_info) { struct list_head *ws; int i; @@ -198,7 +198,7 @@ void zstd_init_workspace_manager(void) for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&wsm.idle_ws[i]); - ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); + ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { btrfs_warn(NULL, "cannot preallocate zstd compression workspace"); } else { @@ -276,7 +276,7 @@ static struct list_head *zstd_find_workspace(int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. */ -struct list_head *zstd_get_workspace(int level) +struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level) { struct list_head *ws; unsigned int nofs_flag; @@ -291,7 +291,7 @@ struct list_head *zstd_get_workspace(int level) return ws; nofs_flag = memalloc_nofs_save(); - ws = zstd_alloc_workspace(level); + ws = zstd_alloc_workspace(fs_info, level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ws)) { @@ -318,7 +318,7 @@ struct list_head *zstd_get_workspace(int level) * isn't set, it is also set here. Only the max level workspace tries and wakes * up waiting workspaces. */ -void zstd_put_workspace(struct list_head *ws) +void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws) { struct workspace *workspace = list_to_workspace(ws); @@ -357,7 +357,7 @@ void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zstd_alloc_workspace(int level) +struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) { struct workspace *workspace; From 330f02b136a8c2e025548683c265fc2be844614c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 13 Aug 2025 15:05:20 +0930 Subject: [PATCH 2526/3206] btrfs: add workspace manager initialization for zstd This involves: - Add zstd_alloc_workspace_manager() and zstd_free_workspace_manager() These two functions accept an fs_info pointer, and alloc/free the fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] pointer. - Add btrfs_alloc_compress_wsm() and btrfs_free_compress_wsm() These are helpers that allocate the workspace managers for all algorithms. For now only zstd is supported, and the timing is a little unusual: btrfs_alloc_compress_wsm() should only be called after the sectorsize has been initialized. Meanwhile btrfs_free_compress_wsm() is called in btrfs_free_fs_info(). - Move the definition of btrfs_compression_type to "fs.h" The reason is that "compression.h" has already included "fs.h", thus fs.h cannot just include "compression.h" to get the definition of BTRFS_NR_COMPRESS_TYPES needed to define fs_info::compr_wsm[]. For now the per-fs zstd workspace manager won't really have any effect, and all compression is still going through the global workspace manager.
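As a rough usage sketch (condensed from the diff below, error handling abbreviated; all names are from this series), the allocation is tied to mount and the release to fs_info teardown:

	/* In open_ctree(), once fs_info->sectorsize is known: */
	ret = btrfs_alloc_compress_wsm(fs_info);
	if (ret)
		goto fail_sb_buffer;

	/* In btrfs_free_fs_info(), alongside the other per-fs state: */
	btrfs_free_compress_wsm(fs_info);
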
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 18 ++++++++++++++ fs/btrfs/compression.h | 15 ++++-------- fs/btrfs/disk-io.c | 4 ++++ fs/btrfs/fs.h | 13 +++++++++++ fs/btrfs/zstd.c | 53 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 93 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d8ba6d0d4e8374..d019a00055fd40 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1097,6 +1097,24 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, return ret; } +int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info) +{ + int ret; + + ret = zstd_alloc_workspace_manager(fs_info); + if (ret < 0) + goto error; + return 0; +error: + btrfs_free_compress_wsm(fs_info); + return ret; +} + +void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info) +{ + zstd_free_workspace_manager(fs_info); +} + int __init btrfs_init_compress(void) { if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE, diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index f600555b758388..70a3294bd14a8d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -89,6 +89,9 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u6 return min(range_end, folio_end(folio)) - cur; } +int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info); +void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info); + int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); @@ -112,16 +115,6 @@ int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret) struct folio *btrfs_alloc_compr_folio(void); void btrfs_free_compr_folio(struct folio *folio); -enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LZO = 2, - BTRFS_COMPRESS_ZSTD = 3, - BTRFS_NR_COMPRESS_TYPES = 4, - - BTRFS_DEFRAG_DONT_COMPRESS, -}; - struct workspace_manager { struct list_head idle_ws; spinlock_t ws_lock; @@ -188,6 +181,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); +int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info); +void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info); void zstd_init_workspace_manager(struct btrfs_fs_info *fs_info); void zstd_cleanup_workspace_manager(void); struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b8ea63e444c724..eb55ecf4bf2582 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1248,6 +1248,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) if (fs_info->fs_devices) btrfs_close_devices(fs_info->fs_devices); + btrfs_free_compress_wsm(fs_info); percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); @@ -3407,6 +3408,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); + ret = btrfs_alloc_compress_wsm(fs_info); + if (ret) + goto fail_sb_buffer; ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a84128e991a9d4..2ccba95af06076 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -306,6 
+306,16 @@ enum { #define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) +enum btrfs_compression_type { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_ZSTD = 3, + BTRFS_NR_COMPRESS_TYPES = 4, + + BTRFS_DEFRAG_DONT_COMPRESS, +}; + struct btrfs_dev_replace { /* See #define above */ u64 replace_state; @@ -508,6 +518,9 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long long mount_opt; + /* Compress related structures. */ + void *compr_wsm[BTRFS_NR_COMPRESS_TYPES]; + int compress_type; int compress_level; u32 commit_interval; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 24bf5cfaecec2a..24898aee3ee00d 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -182,6 +182,36 @@ static void zstd_calc_ws_mem_sizes(void) } } +int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info) +{ + struct zstd_workspace_manager *zwsm; + struct list_head *ws; + + ASSERT(fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] == NULL); + zwsm = kzalloc(sizeof(*zwsm), GFP_KERNEL); + if (!zwsm) + return -ENOMEM; + zstd_calc_ws_mem_sizes(); + zwsm->ops = &btrfs_zstd_compress; + spin_lock_init(&zwsm->lock); + init_waitqueue_head(&zwsm->wait); + timer_setup(&zwsm->timer, zstd_reclaim_timer_fn, 0); + + INIT_LIST_HEAD(&zwsm->lru_list); + for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&zwsm->idle_ws[i]); + fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = zwsm; + + ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL); + if (IS_ERR(ws)) { + btrfs_warn(NULL, "cannot preallocate zstd compression workspace"); + } else { + set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &zwsm->active_map); + list_add(ws, &zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); + } + return 0; +} + void zstd_init_workspace_manager(struct btrfs_fs_info *fs_info) { struct list_head *ws; @@ -207,6 +237,29 @@ void zstd_init_workspace_manager(struct btrfs_fs_info *fs_info) } } +void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info) +{ + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; + struct workspace *workspace; + + if (!zwsm) + return; + fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = NULL; + spin_lock_bh(&zwsm->lock); + for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { + while (!list_empty(&zwsm->idle_ws[i])) { + workspace = container_of(zwsm->idle_ws[i].next, + struct workspace, list); + list_del(&workspace->list); + list_del(&workspace->lru_list); + zstd_free_workspace(&workspace->list); + } + } + spin_unlock_bh(&zwsm->lock); + timer_delete_sync(&zwsm->timer); + kfree(zwsm); +} + void zstd_cleanup_workspace_manager(void) { struct workspace *workspace; From 6f9c3f48acffaffe7bb643b3bc04f8a99021179a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 13 Aug 2025 16:36:57 +0930 Subject: [PATCH 2527/3206] btrfs: add generic workspace manager initialization This involves: - Add (alloc|free)_workspace_manager helpers. These are the helpers to alloc/free the workspace_manager structure. The allocator will allocate a workspace_manager structure, initialize it, and pre-allocate one workspace for it. The freer will do the cleanup and set the manager pointer to NULL. - Call alloc_workspace_manager() inside btrfs_alloc_compress_wsm() - Call free_workspace_manager() inside btrfs_free_compress_wsm() For none, zlib and lzo compression algorithms. For now the generic per-fs workspace managers won't really have any effect, and all compression is still going through the global workspace manager.
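A minimal sketch of the resulting init/teardown pairing (condensed from the diff below; note the free helpers tolerate managers that were never allocated, so a partial failure can simply reuse the full cleanup path):

	ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
	if (ret < 0)
		goto error;
	/* ... same for BTRFS_COMPRESS_ZLIB and BTRFS_COMPRESS_LZO, then zstd ... */
	return 0;
error:
	/* Frees whatever was set up; a NULL fs_info->compr_wsm[type] is skipped. */
	btrfs_free_compress_wsm(fs_info);
	return ret;
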
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 66 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d019a00055fd40..26c632f0a20cc5 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -773,6 +773,40 @@ static void free_workspace(int type, struct list_head *ws) } } +static int alloc_workspace_manager(struct btrfs_fs_info *fs_info, + enum btrfs_compression_type type) +{ + struct workspace_manager *gwsm; + struct list_head *workspace; + + ASSERT(fs_info->compr_wsm[type] == NULL); + gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL); + if (!gwsm) + return -ENOMEM; + + INIT_LIST_HEAD(&gwsm->idle_ws); + spin_lock_init(&gwsm->ws_lock); + atomic_set(&gwsm->total_ws, 0); + init_waitqueue_head(&gwsm->ws_wait); + fs_info->compr_wsm[type] = gwsm; + + /* + * Preallocate one workspace for each compression type so we can + * guarantee forward progress in the worst case + */ + workspace = alloc_workspace(fs_info, type, 0); + if (IS_ERR(workspace)) { + btrfs_warn(fs_info, + "cannot preallocate compression workspace for %s, will try later", + btrfs_compress_type2str(type)); + } else { + atomic_set(&gwsm->total_ws, 1); + gwsm->free_ws = 1; + list_add(workspace, &gwsm->idle_ws); + } + return 0; +} + static void btrfs_init_workspace_manager(struct btrfs_fs_info *fs_info, int type) { struct workspace_manager *wsm; @@ -799,6 +833,26 @@ static void btrfs_init_workspace_manager(struct btrfs_fs_info *fs_info, int type } } +static void free_workspace_manager(struct btrfs_fs_info *fs_info, + enum btrfs_compression_type type) +{ + struct list_head *ws; + struct workspace_manager *gwsm = fs_info->compr_wsm[type]; + + /* ZSTD uses its own workspace manager and should not enter here. */ + ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES); + if (!gwsm) + return; + fs_info->compr_wsm[type] = NULL; + while (!list_empty(&gwsm->idle_ws)) { + ws = gwsm->idle_ws.next; + list_del(ws); + free_workspace(type, ws); + atomic_dec(&gwsm->total_ws); + } + kfree(gwsm); +} + static void btrfs_cleanup_workspace_manager(int type) { struct workspace_manager *wsman; @@ -1101,6 +1155,15 @@ int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info) { int ret; + ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE); + if (ret < 0) + goto error; + ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB); + if (ret < 0) + goto error; + ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO); + if (ret < 0) + goto error; ret = zstd_alloc_workspace_manager(fs_info); if (ret < 0) goto error; @@ -1112,6 +1175,9 @@ int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info) void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info) { + free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE); + free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB); + free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO); zstd_free_workspace_manager(fs_info); } From 856d46c31343169a566f44035c4c74b2f0842438 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 14 Aug 2025 09:49:50 +0930 Subject: [PATCH 2528/3206] btrfs: migrate to use per-fs workspace manager There are several interfaces involved for each algorithm: - alloc workspace All algorithms allocate a workspace without the need for a workspace manager, so no change needs to be done. - get workspace This involves checking the workspace manager to find a free one and, if none is available, allocating a new one.
For none and lzo, they share the same generic btrfs_get_workspace() helper, so only that function needs to be updated to use the per-fs manager. For zlib it uses a wrapper around btrfs_get_workspace(), so no special work is needed. For zstd, update zstd_find_workspace() and zstd_get_workspace() to utilize the per-fs manager. - put workspace For none/zlib/lzo they share the same btrfs_put_workspace(); update that function to use the per-fs manager. For zstd, it's zstd_put_workspace(), the same update. - zstd specific timer This is the timer to reclaim workspaces; change it to grab the per-fs workspace manager instead. Now all workspaces are managed by the per-fs manager. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 18 +++++------ fs/btrfs/zstd.c | 71 +++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 26c632f0a20cc5..cd479016d73d86 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -875,7 +875,7 @@ static void btrfs_cleanup_workspace_manager(int type) */ struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level) { - struct workspace_manager *wsm; + struct workspace_manager *wsm = fs_info->compr_wsm[type]; struct list_head *workspace; int cpus = num_online_cpus(); unsigned nofs_flag; @@ -885,7 +885,7 @@ struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, i wait_queue_head_t *ws_wait; int *free_ws; - wsm = btrfs_compress_op[type]->workspace_manager; + ASSERT(wsm); idle_ws = &wsm->idle_ws; ws_lock = &wsm->ws_lock; total_ws = &wsm->total_ws; @@ -974,19 +974,19 @@ static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, */ void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws) { - struct workspace_manager *wsm; + struct workspace_manager *gwsm = fs_info->compr_wsm[type]; struct list_head *idle_ws; spinlock_t *ws_lock; atomic_t *total_ws; wait_queue_head_t *ws_wait; int *free_ws; - wsm = btrfs_compress_op[type]->workspace_manager; - idle_ws = &wsm->idle_ws; - ws_lock = &wsm->ws_lock; - total_ws = &wsm->total_ws; - ws_wait = &wsm->ws_wait; - free_ws = &wsm->free_ws; + ASSERT(gwsm); + idle_ws = &gwsm->idle_ws; + ws_lock = &gwsm->ws_lock; + total_ws = &gwsm->total_ws; + ws_wait = &gwsm->ws_wait; + free_ws = &gwsm->free_ws; spin_lock(ws_lock); if (*free_ws <= num_online_cpus()) { diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 24898aee3ee00d..41019fcebc13aa 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -112,19 +112,19 @@ static inline int clip_level(int level) */ static void zstd_reclaim_timer_fn(struct timer_list *timer) { + struct zstd_workspace_manager *zwsm = + container_of(timer, struct zstd_workspace_manager, timer); unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; - ASSERT(timer == &wsm.timer); + spin_lock(&zwsm->lock); - spin_lock(&wsm.lock); - - if (list_empty(&wsm.lru_list)) { - spin_unlock(&wsm.lock); + if (list_empty(&zwsm->lru_list)) { + spin_unlock(&zwsm->lock); return; } - list_for_each_prev_safe(pos, next, &wsm.lru_list) { + list_for_each_prev_safe(pos, next, &zwsm->lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); int level; @@ -141,15 +141,15 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_del(&victim->list); zstd_free_workspace(&victim->list); - if
(list_empty(&wsm.idle_ws[level])) - clear_bit(level, &wsm.active_map); + if (list_empty(&zwsm->idle_ws[level])) + clear_bit(level, &zwsm->active_map); } - if (!list_empty(&wsm.lru_list)) - mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); + if (!list_empty(&zwsm->lru_list)) + mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); - spin_unlock(&wsm.lock); + spin_unlock(&zwsm->lock); } /* @@ -292,29 +292,31 @@ void zstd_cleanup_workspace_manager(void) * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ -static struct list_head *zstd_find_workspace(int level) +static struct list_head *zstd_find_workspace(struct btrfs_fs_info *fs_info, int level) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct list_head *ws; struct workspace *workspace; int i = clip_level(level); - spin_lock_bh(&wsm.lock); - for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { - if (!list_empty(&wsm.idle_ws[i])) { - ws = wsm.idle_ws[i].next; + ASSERT(zwsm); + spin_lock_bh(&zwsm->lock); + for_each_set_bit_from(i, &zwsm->active_map, ZSTD_BTRFS_MAX_LEVEL) { + if (!list_empty(&zwsm->idle_ws[i])) { + ws = zwsm->idle_ws[i].next; workspace = list_to_workspace(ws); list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); - if (list_empty(&wsm.idle_ws[i])) - clear_bit(i, &wsm.active_map); - spin_unlock_bh(&wsm.lock); + if (list_empty(&zwsm->idle_ws[i])) + clear_bit(i, &zwsm->active_map); + spin_unlock_bh(&zwsm->lock); return ws; } } - spin_unlock_bh(&wsm.lock); + spin_unlock_bh(&zwsm->lock); return NULL; } @@ -331,15 +333,18 @@ static struct list_head *zstd_find_workspace(int level) */ struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct list_head *ws; unsigned int nofs_flag; + ASSERT(zwsm); + /* level == 0 means we can use any workspace */ if (!level) level = 1; again: - ws = zstd_find_workspace(level); + ws = zstd_find_workspace(fs_info, level); if (ws) return ws; @@ -350,9 +355,9 @@ struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level) if (IS_ERR(ws)) { DEFINE_WAIT(wait); - prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&zwsm->wait, &wait, TASK_UNINTERRUPTIBLE); schedule(); - finish_wait(&wsm.wait, &wait); + finish_wait(&zwsm->wait, &wait); goto again; } @@ -373,32 +378,34 @@ struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level) */ void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct workspace *workspace = list_to_workspace(ws); - spin_lock_bh(&wsm.lock); + ASSERT(zwsm); + spin_lock_bh(&zwsm->lock); /* A node is only taken off the lru if we are the corresponding level */ if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ - if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { + if (list_empty(&zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); } else { workspace->last_used = jiffies; - list_add(&workspace->lru_list, &wsm.lru_list); - if (!timer_pending(&wsm.timer)) - mod_timer(&wsm.timer, + list_add(&workspace->lru_list, &zwsm->lru_list); + if (!timer_pending(&zwsm->timer)) + 
mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); } } - set_bit(workspace->level, &wsm.active_map); - list_add(&workspace->list, &wsm.idle_ws[workspace->level]); + set_bit(workspace->level, &zwsm->active_map); + list_add(&workspace->list, &zwsm->idle_ws[workspace->level]); workspace->req_level = 0; - spin_unlock_bh(&wsm.lock); + spin_unlock_bh(&zwsm->lock); if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) - cond_wake_up(&wsm.wait); + cond_wake_up(&zwsm->wait); } void zstd_free_workspace(struct list_head *ws) From 9c8f4cf45651b8f07bd06df443871d526dc53cea Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 14 Aug 2025 10:29:58 +0930 Subject: [PATCH 2529/3206] btrfs: cleanup the per-module compression workspace managers Since all workspaces are handled by the per-fs workspace managers, we can safely remove the old per-module managers. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 55 +----------------------------------------- fs/btrfs/compression.h | 2 -- fs/btrfs/lzo.c | 3 --- fs/btrfs/zlib.c | 3 --- fs/btrfs/zstd.c | 49 ------------------------------------- 5 files changed, 1 insertion(+), 111 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index cd479016d73d86..ff2bf3f358a54e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -688,8 +688,6 @@ struct heuristic_ws { struct list_head list; }; -static struct workspace_manager heuristic_wsm; - static void free_heuristic_ws(struct list_head *ws) { struct heuristic_ws *workspace; @@ -729,9 +727,7 @@ static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info) return ERR_PTR(-ENOMEM); } -const struct btrfs_compress_op btrfs_heuristic_compress = { - .workspace_manager = &heuristic_wsm, -}; +const struct btrfs_compress_op btrfs_heuristic_compress = { 0 }; static const struct btrfs_compress_op * const btrfs_compress_op[] = { /* The heuristic is represented as compression type 0 */ @@ -807,32 +803,6 @@ static int alloc_workspace_manager(struct btrfs_fs_info *fs_info, return 0; } -static void btrfs_init_workspace_manager(struct btrfs_fs_info *fs_info, int type) -{ - struct workspace_manager *wsm; - struct list_head *workspace; - - wsm = btrfs_compress_op[type]->workspace_manager; - INIT_LIST_HEAD(&wsm->idle_ws); - spin_lock_init(&wsm->ws_lock); - atomic_set(&wsm->total_ws, 0); - init_waitqueue_head(&wsm->ws_wait); - - /* - * Preallocate one workspace for each compression type so we can - * guarantee forward progress in the worst case - */ - workspace = alloc_workspace(fs_info, type, 0); - if (IS_ERR(workspace)) { - btrfs_warn(fs_info, - "cannot preallocate compression workspace, will try later"); - } else { - atomic_set(&wsm->total_ws, 1); - wsm->free_ws = 1; - list_add(workspace, &wsm->idle_ws); - } -} - static void free_workspace_manager(struct btrfs_fs_info *fs_info, enum btrfs_compression_type type) { @@ -853,20 +823,6 @@ static void free_workspace_manager(struct btrfs_fs_info *fs_info, kfree(gwsm); } -static void btrfs_cleanup_workspace_manager(int type) -{ - struct workspace_manager *wsman; - struct list_head *ws; - - wsman = btrfs_compress_op[type]->workspace_manager; - while (!list_empty(&wsman->idle_ws)) { - ws = wsman->idle_ws.next; - list_del(ws); - free_workspace(type, ws); - atomic_dec(&wsman->total_ws); - } -} - /* * This finds an available workspace or allocates a new one. * If it's not possible to allocate a new one, waits until there's one. 
@@ -1192,11 +1148,6 @@ int __init btrfs_init_compress(void) if (!compr_pool.shrinker) return -ENOMEM; - btrfs_init_workspace_manager(NULL, BTRFS_COMPRESS_NONE); - btrfs_init_workspace_manager(NULL, BTRFS_COMPRESS_ZLIB); - btrfs_init_workspace_manager(NULL, BTRFS_COMPRESS_LZO); - zstd_init_workspace_manager(NULL); - spin_lock_init(&compr_pool.lock); INIT_LIST_HEAD(&compr_pool.list); compr_pool.count = 0; @@ -1217,10 +1168,6 @@ void __cold btrfs_exit_compress(void) btrfs_compr_pool_scan(NULL, NULL); shrinker_free(compr_pool.shrinker); - btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE); - btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); - btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); - zstd_cleanup_workspace_manager(); bioset_exit(&btrfs_compressed_bioset); } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 70a3294bd14a8d..4fa4d3b9fa570a 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -183,8 +183,6 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, size_t destlen); int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info); void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info); -void zstd_init_workspace_manager(struct btrfs_fs_info *fs_info); -void zstd_cleanup_workspace_manager(void); struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level); void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 82407d7d9502eb..3456a1dcd42073 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -68,8 +68,6 @@ struct workspace { struct list_head list; }; -static struct workspace_manager wsm; - void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -489,7 +487,6 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, } const struct btrfs_compress_op btrfs_lzo_compress = { - .workspace_manager = &wsm, .max_level = 1, .default_level = 1, }; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 5fc045aeaa199e..aecc5805404505 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -34,8 +34,6 @@ struct workspace { int level; }; -static struct workspace_manager wsm; - struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { struct list_head *ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, level); @@ -483,7 +481,6 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, } const struct btrfs_compress_op btrfs_zlib_compress = { - .workspace_manager = &wsm, .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 41019fcebc13aa..d514a73a4015f5 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -86,8 +86,6 @@ struct zstd_workspace_manager { struct timer_list timer; }; -static struct zstd_workspace_manager wsm; - static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL]; static inline struct workspace *list_to_workspace(struct list_head *list) @@ -212,31 +210,6 @@ int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info) return 0; } -void zstd_init_workspace_manager(struct btrfs_fs_info *fs_info) -{ - struct list_head *ws; - int i; - - zstd_calc_ws_mem_sizes(); - - wsm.ops = &btrfs_zstd_compress; - spin_lock_init(&wsm.lock); - init_waitqueue_head(&wsm.wait); - timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0); - - INIT_LIST_HEAD(&wsm.lru_list); - for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) - 
INIT_LIST_HEAD(&wsm.idle_ws[i]); - - ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL); - if (IS_ERR(ws)) { - btrfs_warn(NULL, "cannot preallocate zstd compression workspace"); - } else { - set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); - list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); - } -} - void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info) { struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; @@ -260,26 +233,6 @@ void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info) kfree(zwsm); } -void zstd_cleanup_workspace_manager(void) -{ - struct workspace *workspace; - int i; - - spin_lock_bh(&wsm.lock); - for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { - while (!list_empty(&wsm.idle_ws[i])) { - workspace = container_of(wsm.idle_ws[i].next, - struct workspace, list); - list_del(&workspace->list); - list_del(&workspace->lru_list); - zstd_free_workspace(&workspace->list); - } - } - spin_unlock_bh(&wsm.lock); - - timer_delete_sync(&wsm.timer); -} - /* * Find workspace for given level. * @@ -775,8 +728,6 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, } const struct btrfs_compress_op btrfs_zstd_compress = { - /* ZSTD uses own workspace manager */ - .workspace_manager = NULL, .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, From 0d0b80929eff93e7e0323060899d04905b8e6de9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 14 Aug 2025 10:35:23 +0930 Subject: [PATCH 2530/3206] btrfs: rename btrfs_compress_op to btrfs_compress_levels Since all workspace managers are per-fs, there is no need nor way to store them inside btrfs_compress_op::workspace_manager anymore. With that said, we can do the following modifications: - Remove zstd_workspace_manager::ops Zstd always grabs the global btrfs_compress_op[]. - Remove the btrfs_compress_op::workspace_manager member - Rename btrfs_compress_op to btrfs_compress_levels This should make it clearer that btrfs_compress_levels structures only indicate the levels of each compression algorithm.
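The net effect is easiest to see on the slimmed down structure and its only remaining use, level selection (a condensed sketch based on the diff below):

	struct btrfs_compress_levels {
		/* Levels supported by the compression algorithm. */
		int min_level;
		int max_level;
		int default_level;
	};

	/* btrfs_compress_set_level(): pick the effective level. */
	if (level == 0)
		level = levels->default_level;
	else
		level = clamp(level, levels->min_level, levels->max_level);
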
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 14 +++++++------- fs/btrfs/compression.h | 11 +++++------ fs/btrfs/lzo.c | 2 +- fs/btrfs/zlib.c | 2 +- fs/btrfs/zstd.c | 4 +--- 5 files changed, 15 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ff2bf3f358a54e..d1de0151a1540a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -727,9 +727,9 @@ static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info) return ERR_PTR(-ENOMEM); } -const struct btrfs_compress_op btrfs_heuristic_compress = { 0 }; +const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 }; -static const struct btrfs_compress_op * const btrfs_compress_op[] = { +static const struct btrfs_compress_levels * const btrfs_compress_levels[] = { /* The heuristic is represented as compression type 0 */ &btrfs_heuristic_compress, &btrfs_zlib_compress, @@ -981,12 +981,12 @@ static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_h */ static int btrfs_compress_set_level(unsigned int type, int level) { - const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + const struct btrfs_compress_levels *levels = btrfs_compress_levels[type]; if (level == 0) - level = ops->default_level; + level = levels->default_level; else - level = clamp(level, ops->min_level, ops->max_level); + level = clamp(level, levels->min_level, levels->max_level); return level; } @@ -996,9 +996,9 @@ static int btrfs_compress_set_level(unsigned int type, int level) */ bool btrfs_compress_level_valid(unsigned int type, int level) { - const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + const struct btrfs_compress_levels *levels = btrfs_compress_levels[type]; - return ops->min_level <= level && level <= ops->max_level; + return levels->min_level <= level && level <= levels->max_level; } /* Wrapper around find_get_page(), with extra error message. 
*/ diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 4fa4d3b9fa570a..243771ae7d6439 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -129,8 +129,7 @@ struct workspace_manager { struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level); void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws); -struct btrfs_compress_op { - struct workspace_manager *workspace_manager; +struct btrfs_compress_levels { /* Maximum level supported by the compression algorithm */ int min_level; int max_level; @@ -140,10 +139,10 @@ struct btrfs_compress_op { /* The heuristic workspaces are managed via the 0th workspace manager */ #define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES -extern const struct btrfs_compress_op btrfs_heuristic_compress; -extern const struct btrfs_compress_op btrfs_zlib_compress; -extern const struct btrfs_compress_op btrfs_lzo_compress; -extern const struct btrfs_compress_op btrfs_zstd_compress; +extern const struct btrfs_compress_levels btrfs_heuristic_compress; +extern const struct btrfs_compress_levels btrfs_zlib_compress; +extern const struct btrfs_compress_levels btrfs_lzo_compress; +extern const struct btrfs_compress_levels btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); bool btrfs_compress_is_valid_type(const char *str, size_t len); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 3456a1dcd42073..2983214643daf3 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -486,7 +486,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, return ret; } -const struct btrfs_compress_op btrfs_lzo_compress = { +const struct btrfs_compress_levels btrfs_lzo_compress = { .max_level = 1, .default_level = 1, }; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index aecc5805404505..8e0bf34e099813 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -480,7 +480,7 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, return ret; } -const struct btrfs_compress_op btrfs_zlib_compress = { +const struct btrfs_compress_levels btrfs_zlib_compress = { .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index d514a73a4015f5..63faeb3ed9acb8 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -77,7 +77,6 @@ struct workspace { */ struct zstd_workspace_manager { - const struct btrfs_compress_op *ops; spinlock_t lock; struct list_head lru_list; struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL]; @@ -190,7 +189,6 @@ int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info) if (!zwsm) return -ENOMEM; zstd_calc_ws_mem_sizes(); - zwsm->ops = &btrfs_zstd_compress; spin_lock_init(&zwsm->lock); init_waitqueue_head(&zwsm->wait); timer_setup(&zwsm->timer, zstd_reclaim_timer_fn, 0); @@ -727,7 +725,7 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, return ret; } -const struct btrfs_compress_op btrfs_zstd_compress = { +const struct btrfs_compress_levels btrfs_zstd_compress = { .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, From 74e8f002b772686408d62e420d9f70a4bcb1c2c4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 14 Aug 2025 10:44:48 +0930 Subject: [PATCH 2531/3206] btrfs: reduce compression workspace buffer space to block size Currently the compression workspace buffer size is always based on PAGE_SIZE, but btrfs has supported subpage block sizes for some time.
This means that for a one-shot compression algorithm like lzo, we're wasting quite some memory if the block size is smaller than the page size, as LZO only works on one block (thus one-shot). On 64K page sized systems with a 4K block size, we only need at most 8K of buffer space for lzo, but in reality we're allocating a 64K buffer. So to reduce the memory usage, change all workspace buffers to base their size on the block size rather than the page size. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/lzo.c | 20 +++++++++++++------- fs/btrfs/zlib.c | 5 +++-- fs/btrfs/zstd.c | 6 ++++-- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 2983214643daf3..047d90e216f6ae 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -58,9 +58,6 @@ * 0x1000 | SegHdr N+1| Data payload N+1 ... | */ -#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) -#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) - struct workspace { void *mem; void *buf; /* where decompressed data goes */ @@ -68,6 +65,15 @@ struct workspace { struct list_head list; }; +static u32 workspace_buf_length(const struct btrfs_fs_info *fs_info) +{ + return lzo1x_worst_compress(fs_info->sectorsize); +} +static u32 workspace_cbuf_length(const struct btrfs_fs_info *fs_info) +{ + return lzo1x_worst_compress(fs_info->sectorsize); +} + void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -87,8 +93,8 @@ struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info) return ERR_PTR(-ENOMEM); workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN); - workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); - workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); + workspace->buf = kvmalloc(workspace_buf_length(fs_info), GFP_KERNEL | __GFP_NOWARN); + workspace->cbuf = kvmalloc(workspace_cbuf_length(fs_info), GFP_KERNEL | __GFP_NOWARN); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -384,7 +390,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) kunmap_local(kaddr); cur_in += LZO_LEN; - if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) { + if (unlikely(seg_len > workspace_cbuf_length(fs_info))) { struct btrfs_inode *inode = cb->bbio.inode; /* @@ -444,7 +450,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; - size_t max_segment_len = WORKSPACE_BUF_LENGTH; + size_t max_segment_len = workspace_buf_length(fs_info); int ret = 0; if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8e0bf34e099813..d72566a87afa2c 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -55,6 +55,7 @@ void zlib_free_workspace(struct list_head *ws) struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { + const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; int workspacesize; @@ -78,8 +79,8 @@ struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned i workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; } if (!workspace->buf) { - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - workspace->buf_size = PAGE_SIZE; + workspace->buf = kmalloc(blocksize, GFP_KERNEL); + workspace->buf_size = blocksize; } if (!workspace->strm.workspace || !workspace->buf) goto fail; diff --git
a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 63faeb3ed9acb8..b11a87842cdaef 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -370,6 +370,7 @@ void zstd_free_workspace(struct list_head *ws) struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) { + const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); @@ -382,7 +383,7 @@ struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf = kmalloc(blocksize, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; @@ -590,6 +591,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; + const u32 blocksize = cb_to_fs_info(cb)->sectorsize; unsigned long folio_in_index = 0; unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; @@ -613,7 +615,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; + workspace->out_buf.size = blocksize; while (1) { size_t ret2; From 17dc82dc1e77a6fce07252ce894748190d1487d0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 22 Aug 2025 00:57:42 +0200 Subject: [PATCH 2532/3206] btrfs: fix typos in comments and strings Annual typo fixing pass. Strangely codespell found only about 30% of what is in this patch, the rest was done manually using text spellchecker with a custom dictionary of acceptable terms. Reviewed-by: Neal Gompa Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 2 +- fs/btrfs/backref.c | 2 +- fs/btrfs/backref.h | 4 ++-- fs/btrfs/block-group.c | 4 ++-- fs/btrfs/block-group.h | 2 +- fs/btrfs/compression.c | 2 +- fs/btrfs/defrag.c | 2 +- fs/btrfs/delayed-ref.c | 2 +- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/extent-tree.c | 8 ++++---- fs/btrfs/extent_io.c | 10 +++++----- fs/btrfs/fiemap.c | 2 +- fs/btrfs/file.c | 4 ++-- fs/btrfs/free-space-cache.c | 4 ++-- fs/btrfs/fs.c | 2 +- fs/btrfs/fs.h | 2 +- fs/btrfs/inode.c | 10 +++++----- fs/btrfs/ioctl.c | 2 +- fs/btrfs/locking.c | 2 +- fs/btrfs/locking.h | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/send.c | 6 +++--- fs/btrfs/space-info.c | 4 ++-- fs/btrfs/subpage.c | 2 +- fs/btrfs/subpage.h | 2 +- fs/btrfs/super.c | 2 +- fs/btrfs/tests/delayed-refs-tests.c | 4 ++-- fs/btrfs/tests/extent-map-tests.c | 2 +- fs/btrfs/transaction.c | 4 ++-- fs/btrfs/tree-checker.c | 2 +- fs/btrfs/tree-log.c | 4 ++-- fs/btrfs/volumes.c | 10 ++++------ fs/btrfs/volumes.h | 4 ++-- 35 files changed, 60 insertions(+), 62 deletions(-) diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 861c7d92c437aa..1248aa2535d306 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -44,7 +44,7 @@ static __always_inline void memcpy_split_src(char *dest, const char *src1, * gives us all the type checking. * * The extent buffer pages stored in the array folios may not form a contiguous - * phyusical range, but the API functions assume the linear offset to the range + * physical range, but the API functions assume the linear offset to the range * from 0 to metadata node size. 
*/ diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6a450be293b1cd..c6573e845e43f6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1690,7 +1690,7 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are * added to the ulist at @ctx->refs, and that ulist is allocated by this * function. The caller should free the ulist with free_leaf_list() if - * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is + * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is * enough. * * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated. diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 34b0193a181c88..25d51c2460703b 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -190,7 +190,7 @@ struct btrfs_backref_share_check_ctx { * It's very common to have several file extent items that point to the * same extent (bytenr) but with different offsets and lengths. This * typically happens for COW writes, partial writes into prealloc - * extents, NOCOW writes after snapshoting a root, hole punching or + * extents, NOCOW writes after snapshotting a root, hole punching or * reflinking within the same file (less common perhaps). * So keep a small cache with the lookup results for the extent pointed * by the last few file extent items. This cache is checked, with a @@ -414,7 +414,7 @@ struct btrfs_backref_cache { /* * Whether this cache is for relocation * - * Reloction backref cache require more info for reloc root compared + * Relocation backref cache require more info for reloc root compared * to generic backref cache. */ bool is_reloc; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b6328d50ffe78d..548483a84466b6 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1971,7 +1971,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) * called, which is where we will transfer a reserved extent's * size from the "reserved" counter to the "used" counter - this * happens when running delayed references. When we relocate the - * chunk below, relocation first flushes dellaloc, waits for + * chunk below, relocation first flushes delalloc, waits for * ordered extent completion (which is where we create delayed * references for data extents) and commits the current * transaction (which runs delayed references), and only after @@ -2839,7 +2839,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) * space or none at all (due to no need to COW, extent buffers * were already COWed in the current transaction and still * unwritten, tree heights lower than the maximum possible - * height, etc). For data we generally reserve the axact amount + * height, etc). 
For data we generally reserve the exact amount * of space we are going to allocate later, the exception is * when using compression, as we must reserve space based on the * uncompressed data size, because the compression is only done diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a8bb8429c96635..9172104a5889ec 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -63,7 +63,7 @@ enum btrfs_discard_state { * CHUNK_ALLOC_FORCE means it must try to allocate one * * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from - * find_free_extent() that also activaes the zone + * find_free_extent() that also activates the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d1de0151a1540a..c865e0f2a7e871 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1290,7 +1290,7 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, #define ENTROPY_LVL_HIGH (80) /* - * For increasead precision in shannon_entropy calculation, + * For increased precision in shannon_entropy calculation, * let's do pow(n, M) to save more digits after comma: * * - maximum int bit length is 64 diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 738179a5e17060..84ba9906f0d818 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -153,7 +153,7 @@ void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) } /* - * Pick the defragable inode that we want, if it doesn't exist, we will get the + * Pick the defraggable inode that we want, if it doesn't exist, we will get the * next one. */ static struct inode_defrag *btrfs_pick_defrag_inode( diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 5d8361bac497fc..6170803d8a1b44 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -895,7 +895,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } /* - * Initialize the structure which represents a modification to a an extent. + * Initialize the structure which represents a modification to an extent. * * @fs_info: Internal to the mounted filesystem mount structure. * diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4675bcd5f92efb..ed3b07fdaab87b 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -637,7 +637,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - DEBUG_WARN("unexpected STARTED ot SUSPENDED dev-replace state"); + DEBUG_WARN("unexpected STARTED or SUSPENDED dev-replace state"); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; up_write(&dev_replace->rwsem); goto leave; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb55ecf4bf2582..7b06bbc4089878 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3245,7 +3245,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) /* * Subpage runtime limitation on v1 cache. * - * V1 space cache still has some hard codeed PAGE_SIZE usage, while + * V1 space cache still has some hard coded PAGE_SIZE usage, while * we're already defaulting to v2 cache, no need to bother v1 as it's * going to be deprecated anyway. 
*/ diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 0c58342c6125e1..bb2ca1c9c7b026 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1237,7 +1237,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, state = next_search_state(inserted_state, end); /* * If there's a next state, whether contiguous or not, we don't - * need to unlock and start search agian. If it's not contiguous + * need to unlock and start search again. If it's not contiguous * we will end up here and try to allocate a prealloc state and insert. */ if (state) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e117b5cbefae73..a4416c451b250b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -325,7 +325,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, + * is_data == BTRFS_REF_TYPE_DATA, data type is required, * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -4316,7 +4316,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { /* - * No lock is OK here because avail is monotinically + * No lock is OK here because avail is monotonically * decreasing, and this is just a hint. */ u64 avail = block_group->zone_capacity - block_group->alloc_offset; @@ -5613,7 +5613,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. * * If we are DROP_REFERENCE this will figure out if we need to drop our current - * reference, skipping it if we dropped it from a previous incompleted drop, or + * reference, skipping it if we dropped it from a previous uncompleted drop, or * dropping it if we still have a reference to it. */ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5760,7 +5760,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, /* * We have to walk down into this node, and if we're currently at the - * DROP_REFERNCE stage and this block is shared then we need to switch + * DROP_REFERENCE stage and this block is shared then we need to switch * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF. */ if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 152c9d3e8cce8b..ca7174fa024056 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -418,7 +418,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1; - /* step two, lock all the folioss after the folios that has start */ + /* step two, lock all the folios after the folios that has start */ ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, delalloc_end); ASSERT(!ret || ret == -EAGAIN); @@ -772,7 +772,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, * * The will either add the page into the existing @bio_ctrl->bbio, or allocate a * new one in @bio_ctrl->bbio. - * The mirror number for this IO should already be initizlied in + * The mirror number for this IO should already be initialized in * @bio_ctrl->mirror_num. 
*/ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, @@ -2225,7 +2225,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * @fs_info: The fs_info for this file system. * @start: The offset of the range to start waiting on writeback. * @end: The end of the range, inclusive. This is meant to be used in - * conjuction with wait_marked_extents, so this will usually be + * conjunction with wait_marked_extents, so this will usually be * the_next_eb->start - 1. */ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, @@ -2495,7 +2495,7 @@ static int extent_write_cache_pages(struct address_space *mapping, * In above case, [32K, 96K) is asynchronously submitted * for compression, and [124K, 128K) needs to be written back. * - * If we didn't wait wrtiteback for page 64K, [128K, 128K) + * If we didn't wait writeback for page 64K, [128K, 128K) * won't be submitted as the page still has writeback flag * and will be skipped in the next check. * @@ -2979,7 +2979,7 @@ static void cleanup_extent_buffer_folios(struct extent_buffer *eb) { const int num_folios = num_extent_folios(eb); - /* We canont use num_extent_folios() as loop bound as eb->folios changes. */ + /* We cannot use num_extent_folios() as loop bound as eb->folios changes. */ for (int i = 0; i < num_folios; i++) { ASSERT(eb->folios[i]); detach_extent_buffer_folio(eb, eb->folios[i]); diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index 7935586a9dbd0f..f2eaaef8422bf3 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -153,7 +153,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, if (cache_end > offset) { if (offset == cache->offset) { /* - * We cached a dealloc range (found in the io tree) for + * We cached a delalloc range (found in the io tree) for * a hole or prealloc extent and we have now found a * file extent item for the same offset. What we have * now is more recent and up to date, so discard what diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 204674934795cb..4daec404fec621 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -970,7 +970,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, * Return: * > 0 If we can nocow, and updates @write_bytes. * 0 If we can't do a nocow write. - * -EAGAIN If we can't do a nocow write because snapshoting of the inode's + * -EAGAIN If we can't do a nocow write because snapshotting of the inode's * root is in progress or because we are in a non-blocking IO * context and need to block (@nowait is true). * < 0 If an error happened. @@ -3345,7 +3345,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * We could also use the extent map tree to find such delalloc that is * being flushed, but using the ordered extents tree is more efficient * because it's usually much smaller as ordered extents are removed from - * the tree once they complete. With the extent maps, we mau have them + * the tree once they complete. With the extent maps, we may have them * in the extent map tree for a very long time, and they were either * created by previous writes or loaded by read operations. 
*/ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5d8d1570a5c948..c2730740d92821 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2282,7 +2282,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want * to reserve them to larger extents, however if we have plenty - * of cache left then go ahead an dadd them, no sense in adding + * of cache left then go ahead and add them, no sense in adding * the overhead of a bitmap if we don't have to. */ if (info->bytes <= fs_info->sectorsize * 8) { @@ -3829,7 +3829,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, /* * If we break out of trimming a bitmap prematurely, we should reset the - * trimming bit. In a rather contrieved case, it's possible to race here so + * trimming bit. In a rather contrived case, it's possible to race here so * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. * * start = start of bitmap diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 8b118c03cdb881..335209fe37347f 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -58,7 +58,7 @@ size_t __attribute_const__ btrfs_get_num_csums(void) * We support the following block sizes for all systems: * * - 4K - * This is the most common block size. For PAGE SIZE > 4K cases the subage + * This is the most common block size. For PAGE SIZE > 4K cases the subpage * mode is used. * * - PAGE_SIZE diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 2ccba95af06076..5f0b185a7f21ab 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -283,7 +283,7 @@ enum { #ifdef CONFIG_BTRFS_EXPERIMENTAL /* - * Features under developmen like Extent tree v2 support is enabled + * Features under development like Extent tree v2 support is enabled * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 33cabe0a54a492..dd503dba33cf65 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -370,7 +370,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) } /* - * Unock inode i_rwsem. + * Unlock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. @@ -1990,7 +1990,7 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio } /* - * when nowcow writeback call back. This checks for snapshots or COW copies + * When nocow writeback calls back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * * If no cow copies or snapshots exist, we write directly to the existing @@ -2233,7 +2233,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, * | OE cleanup | Skip | Untouched | * * nocow_one_range() failed, the range [cur_offset, nocow_end] is - * alread cleaned up. + * already cleaned up. */ oe_cleanup_start = start; oe_cleanup_len = cur_offset - start; @@ -2986,7 +2986,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the * number of bytes only for that range containing the inline extent. 
- * The remaining of the range will be processed when clearning the + * The remaining of the range will be processed when clearing the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. */ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { @@ -4905,7 +4905,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e goto out; /* - * Skip the truncatioin if the range in the target block is already aligned. + * Skip the truncation if the range in the target block is already aligned. * The seemingly complex check will also handle the same block case. */ if (in_head_block && !IS_ALIGNED(start, blocksize)) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e10daf6631afd6..063291519b363b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -957,7 +957,7 @@ static noinline int btrfs_mksnapshot(struct dentry *parent, /* * Force new buffered writes to reserve space even when NOCOW is - * possible. This is to avoid later writeback (running dealloc) to + * possible. This is to avoid later writeback (running delalloc) to * fallback to COW mode and unexpectedly fail with ENOSPC. */ btrfs_drew_read_lock(&root->snapshot_lock); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index a3e6d9616e60bf..0035851d72b00f 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -361,7 +361,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) atomic_inc(&lock->readers); /* - * Ensure the pending reader count is perceieved BEFORE this reader + * Ensure the pending reader count is perceived BEFORE this reader * goes to sleep in case of active writers. This guarantees new writers * won't be allowed and that the current reader will be woken up when * the last active writer finishes its jobs. diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index af29df98ac1454..a4673e7d95d705 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -74,7 +74,7 @@ enum btrfs_lock_nesting { BTRFS_NESTING_NEW_ROOT, /* - * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so + * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over * the limit. As of this writing we're limited to 8, and we're * definitely using 8, hence this check to keep us from messing up in diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ce5f6732bfb585..d86020ace69c6d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -113,7 +113,7 @@ enum { /* Which blocks are covered by extent items. */ scrub_bitmap_nr_has_extent = 0, - /* Which blocks are meteadata. */ + /* Which blocks are metadata. */ scrub_bitmap_nr_is_metadata, /* diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index faa3710fa074fe..c5771df3a2c7c0 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1738,7 +1738,7 @@ static int read_symlink(struct btrfs_root *root, * An empty symlink inode. Can happen in rare error paths when * creating a symlink (transaction committed before the inode * eviction handler removed the symlink inode items and a crash - * happened in between or the subvol was snapshoted in between). + * happened in between or the subvol was snapshotted in between). * Print an informative message to dmesg/syslog so that the user * can delete the symlink. 
*/ @@ -2768,7 +2768,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) * processing an inode that is a directory and it just got renamed, and existing * entries in the cache may refer to inodes that have the directory in their * full path - in which case we would generate outdated paths (pre-rename) - * for the inodes that the cache entries point to. Instead of prunning the + * for the inodes that the cache entries point to. Instead of pruning the * cache when inserting, do it after we finish processing each inode at * finish_inode_if_needed(). */ @@ -7984,7 +7984,7 @@ static int ensure_commit_roots_uptodate(struct send_ctx *sctx) } /* - * Make sure any existing dellaloc is flushed for any root used by a send + * Make sure any existing delalloc is flushed for any root used by a send * operation so that we do not miss any data and we do not race with writeback * finishing and changing a tree while send is using the tree. This could * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 0481c693ac2eaf..0e5c0c80e0fe6a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -479,7 +479,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, /* * On the zoned mode, we always allocate one zone as one chunk. - * Returning non-zone size alingned bytes here will result in + * Returning non-zone size aligned bytes here will result in * less pressure for the async metadata reclaim process, and it * will over-commit too much leading to ENOSPC. Align down to the * zone size to avoid that. @@ -1528,7 +1528,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, * turned into error mode due to a transaction abort when flushing space * above, in that case fail with the abort error instead of returning * success to the caller if we can steal from the global rsv - this is - * just to have caller fail immeditelly instead of later when trying to + * just to have caller fail immediately instead of later when trying to * modify the fs, making it easier to debug -ENOSPC problems. */ if (BTRFS_FS_ERROR(fs_info)) { diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index cb4f97833dc34a..5ca8d4db67220c 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -690,7 +690,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, \ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ - "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ + "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ start, len, folio_pos(folio), \ blocks_per_folio, &bitmap); \ } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index ee0710eb13fd0a..ad0552db7c7dcb 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -13,7 +13,7 @@ struct address_space; struct folio; /* - * Extra info for subpapge bitmap. + * Extra info for subpage bitmap. * * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into * one larger bitmap. 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4951f50e9823e7..49893c8858556b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1283,7 +1283,7 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); /* - * We need to cleanup all defragable inodes if the autodefragment is + * We need to cleanup all defraggable inodes if the autodefragment is * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c index 265370e79a546d..e2248acb906b74 100644 --- a/fs/btrfs/tests/delayed-refs-tests.c +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -997,12 +997,12 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) ret = simple_tests(&trans); if (!ret) { - test_msg("running delayed refs merg tests on metadata refs"); + test_msg("running delayed refs merge tests on metadata refs"); ret = merge_tests(&trans, BTRFS_REF_METADATA); } if (!ret) { - test_msg("running delayed refs merg tests on data refs"); + test_msg("running delayed refs merge tests on data refs"); ret = merge_tests(&trans, BTRFS_REF_DATA); } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 3a86534c116f2f..42af6c737c6e6f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1095,7 +1095,7 @@ int btrfs_test_extent_map(void) /* * Test a chunk with 2 data stripes one of which * intersects the physical address of the super block - * is correctly recognised. + * is correctly recognized. */ .raid_type = BTRFS_BLOCK_GROUP_RAID1, .physical_start = SZ_64M - SZ_4M, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 25ee0183e17812..d04fa6ce83901e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -103,7 +103,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep; * | attached to transid N+1. | * | | * | To next stage: | - * | Until all tree blocks are super blocks are | + * | Until all tree blocks and super blocks are | * | written to block devices | * V | * Transaction N [[TRANS_STATE_COMPLETED]] V @@ -2423,7 +2423,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * them. * * We needn't worry that this operation will corrupt the snapshots, - * because all the tree which are snapshoted will be forced to COW + * because all the tree which are snapshotted will be forced to COW * the nodes and leaves. */ ret = btrfs_run_delayed_items(trans); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a997c7cc35a26f..c2aac08055fb84 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1209,7 +1209,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, /* * For legacy root item, the members starting at generation_v2 will be * all filled with 0. - * And since we allow geneartion_v2 as 0, it will still pass the check. + * And since we allow generation_v2 as 0, it will still pass the check. 
*/ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), btrfs_item_size(leaf, slot)); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c3cfffc2100854..861f96ef28cf66 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1816,7 +1816,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, /* * fixup on a directory may create new entries, - * make sure we always look for the highset possible + * make sure we always look for the highest possible * offset */ key.offset = (u64)-1; @@ -3619,7 +3619,7 @@ static int inode_logged(const struct btrfs_trans_handle *trans, /* * The inode was previously logged and then evicted, set logged_trans to - * the current transacion's ID, to avoid future tree searches as long as + * the current transaction's ID, to avoid future tree searches as long as * the inode is not evicted again. */ spin_lock(&inode->lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cc75b4e49662a8..4f20c25a39c81d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1377,8 +1377,8 @@ struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, } /* - * Make sure the last byte of label is properly NUL termiated. We use - * '%s' to print the label, if not properly NUL termiated we can access + * Make sure the last byte of label is properly NUL terminated. We use + * '%s' to print the label, if not properly NUL terminated we can access * beyond the label. */ if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1]) @@ -4463,7 +4463,7 @@ static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) } /* - * Should be called with balance mutexe held + * Should be called with balance mutex held */ int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, @@ -7486,7 +7486,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) /* * Lockdep complains about possible circular locking dependency between * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores - * used for freeze procection of a fs (struct super_block.s_writers), + * used for freeze protection of a fs (struct super_block.s_writers), * which we take when starting a transaction, and extent buffers of the * chunk tree if we call read_one_dev() while holding a lock on an * extent buffer of the chunk tree. Since we are mounting the filesystem @@ -7919,8 +7919,6 @@ int btrfs_bg_type_to_factor(u64 flags) return btrfs_raid_array[index].ncopies; } - - static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a56e873a30295e..2cbf8080eade06 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -34,7 +34,7 @@ struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) /* - * Arbitratry maximum size of one discard request to limit potentially long time + * Arbitrary maximum size of one discard request to limit potentially long time * spent in blkdev_issue_discard(). */ #define BTRFS_MAX_DISCARD_CHUNK_SIZE (SZ_1G) @@ -495,7 +495,7 @@ struct btrfs_discard_stripe { }; /* - * Context for IO subsmission for device stripe. + * Context for IO submission for device stripe. * * - Track the unfinished mirrors for mirror based profiles * Mirror based profiles are SINGLE/DUP/RAID1/RAID10. 
From a7f3dfb8293c4cee99743132d69863a92e8f4875 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 1 Sep 2025 17:01:44 +0200 Subject: [PATCH 2533/3206] btrfs: scrub: replace max_t()/min_t() with clamp() in scrub_throttle_dev_io() Replace max_t() followed by min_t() with a single clamp(). As was pointed out by David Laight in https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/ the calculation may overflow u32 when the input value is too large, so clamp_t() is not used. In practice the expected values are in the range of megabytes to gigabytes (throughput limit) so the bug would not happen. Signed-off-by: Thorsten Blum Reviewed-by: David Sterba [ Use clamp() and add explanation. ] Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d86020ace69c6d..2f10c65929dc97 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1369,8 +1369,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d * Slice is divided into intervals when the IO is submitted, adjust by * bwlimit and maximum of 64 intervals. */ - div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); - div = min_t(u32, 64, div); + div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); /* Start new epoch, set deadline */ now = ktime_get(); From 2ccfaf73690960e800c2dd7debea559de4b58010 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 9 Aug 2025 14:19:16 +0930 Subject: [PATCH 2534/3206] btrfs: support all block sizes which are no larger than page size Currently if block size < page size, btrfs only supports one single config, 4K. This is mostly to reduce the test configurations, as 4K is going to be the default block size for all architectures. However all other major filesystems have no artificial limits on the supported block size, and some already support block sizes > page size. Since the btrfs subpage block support has been there for a long time, it's time for us to enable support for all block sizes <= page size. So here enable support for all block sizes, as long as they are no larger than page size, for experimental builds. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 335209fe37347f..014fb8b12f96b9 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -78,6 +78,10 @@ bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize) if (blocksize == PAGE_SIZE || blocksize == SZ_4K || blocksize == BTRFS_MIN_BLOCKSIZE) return true; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (blocksize <= PAGE_SIZE) + return true; +#endif return false; } From 35aff706dccbc51d30df6fedba76a746a1322839 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 2 Sep 2025 10:51:25 +0930 Subject: [PATCH 2535/3206] btrfs: concentrate highmem handling for data verification Currently for btrfs checksum verification, we do it in the following pattern: kaddr = kmap_local_*(); ret = btrfs_check_sector_csum(kaddr); kunmap_local(kaddr); It's OK for now, but it's still not following the patterns of the helpers inside linux/highmem.h, which never require a virtual memory address. Those highmem helpers mostly accept a folio and some offset/length inside the folio, and in the implementation they check whether the folio needs a partial kmap and handle it accordingly.
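For illustration, here is a minimal sketch of that highmem-helper pattern (not part of this patch; the helper name and the consume() callback are hypothetical, while kmap_local_folio(), kunmap_local(), offset_in_page() and min_t() are real kernel APIs). It walks a folio range one page-sized chunk at a time, so at most one page is kmapped at once:

static void for_each_kmapped_chunk(struct folio *folio, size_t offset, size_t len,
				   void (*consume)(const void *kaddr, size_t len))
{
	while (len) {
		/* Never let a single kmap cross a page boundary. */
		size_t cur = min_t(size_t, len, PAGE_SIZE - offset_in_page(offset));
		void *kaddr = kmap_local_folio(folio, offset);

		consume(kaddr, cur);
		kunmap_local(kaddr);
		offset += cur;
		len -= cur;
	}
}

The diff below applies the same idea in btrfs_check_block_csum(): when folio_test_partial_kmap() is set it feeds each kmapped chunk into crypto_shash_update(), and otherwise it hashes the whole block directly through phys_to_virt().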
Inspired by those formal highmem helpers, enhance the highmem handling of data checksum verification by: - Rename btrfs_check_sector_csum() to btrfs_check_block_csum() To follow the more common term "block" used in all other major filesystems. - Pass a physical address into btrfs_check_block_csum() and btrfs_data_csum_ok() The physical address is always available even for a highmem page, since it is (page frame number << PAGE_SHIFT) + offset in page. And with that physical address, we can grab the folio covering the page, and do extra checks to ensure it covers at least one block. This also allows us to do the kmap inside btrfs_check_block_csum(). This means all the extra HIGHMEM handling will be concentrated into btrfs_check_block_csum(), and no callers will need to bother with highmem by themselves. - Properly zero out the block if the csum mismatches Since btrfs_data_csum_ok() only gets a paddr, we cannot and should not use memzero_bvec(), which only accepts a single-page bvec. Instead use paddr to grab the folio and call folio_zero_range(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 4 ++-- fs/btrfs/btrfs_inode.h | 6 +++--- fs/btrfs/inode.c | 47 +++++++++++++++++++++++++++------------- fs/btrfs/raid56.c | 13 +++--------- fs/btrfs/scrub.c | 18 ++++++++++++++-- 5 files changed, 57 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index ea7f7a17a3d5bb..493135bfa518f1 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -167,7 +167,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, int mirror = repair_bbio->mirror_num; if (repair_bbio->bio.bi_status || - !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { + !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); repair_bbio->bio.bi_iter = repair_bbio->saved_iter; @@ -280,7 +280,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); bv.bv_len = min(bv.bv_len, sectorsize); - if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) + if (status || !btrfs_data_csum_ok(bbio, dev, offset, bvec_phys(&bv))) fbio = repair_one_sector(bbio, offset, &bv, fbio); bio_advance_iter_single(&bbio->bio, iter, sectorsize); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index df3445448b7d64..077b2f178816f0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -542,10 +542,10 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, - const u8 * const csum_expected); +int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, + const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, struct bio_vec *bv); + u32 bio_offset, phys_addr_t paddr); noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd503dba33cf65..98877535f213e9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3334,13 +3334,35 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) * * @kaddr must be a properly kmapped address.
*/ -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, - const u8 * const csum_expected) +int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, + const u8 * const csum_expected) { + struct folio *folio = page_folio(phys_to_page(paddr)); + const u32 blocksize = fs_info->sectorsize; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); shash->tfm = fs_info->csum_shash; - crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); + /* The full block must be inside the folio. */ + ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); + + if (folio_test_partial_kmap(folio)) { + size_t cur = paddr; + + crypto_shash_init(shash); + while (cur < paddr + blocksize) { + void *kaddr; + size_t len = min(paddr + blocksize - cur, + PAGE_SIZE - offset_in_page(cur)); + + kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur)); + crypto_shash_update(shash, kaddr, len); + kunmap_local(kaddr); + cur += len; + } + crypto_shash_final(shash, csum); + } else { + crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, csum); + } if (memcmp(csum, csum_expected, fs_info->csum_size)) return -EIO; @@ -3361,17 +3383,16 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum * Return %true if the sector is ok or had no checksum to start with, else %false. */ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, struct bio_vec *bv) + u32 bio_offset, phys_addr_t paddr) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 blocksize = fs_info->sectorsize; + struct folio *folio; u64 file_offset = bbio->file_offset + bio_offset; - u64 end = file_offset + bv->bv_len - 1; + u64 end = file_offset + blocksize - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; - void *kaddr; - - ASSERT(bv->bv_len == fs_info->sectorsize); if (!bbio->csum) return true; @@ -3387,12 +3408,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - kaddr = bvec_kmap_local(bv); - if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) { - kunmap_local(kaddr); + if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected)) goto zeroit; - } - kunmap_local(kaddr); return true; zeroit: @@ -3400,7 +3417,9 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, bbio->mirror_num); if (dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); - memzero_bvec(bv); + folio = page_folio(phys_to_page(paddr)); + ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); + folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize); return false; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3ff2bedfb3a4c9..e88699460dda6a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1585,9 +1585,6 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, return; bio_for_each_segment_all(bvec, bio, iter_all) { - void *kaddr; - - kaddr = bvec_kmap_local(bvec); for (u32 off = 0; off < bvec->bv_len; off += fs_info->sectorsize, total_sector_nr++) { u8 csum_buf[BTRFS_CSUM_SIZE]; @@ -1599,12 +1596,11 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if (!test_bit(total_sector_nr, rbio->csum_bitmap)) continue; - ret = btrfs_check_sector_csum(fs_info, kaddr + off, - csum_buf, expected_csum); + ret = btrfs_check_block_csum(fs_info, bvec_phys(bvec) + off, + 
csum_buf, expected_csum); if (ret < 0) set_bit(total_sector_nr, rbio->error_bitmap); } - kunmap_local(kaddr); } } @@ -1802,7 +1798,6 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; - void *kaddr; int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1824,9 +1819,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - kaddr = kmap_local_sector(sector); - ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected); - kunmap_local(kaddr); + ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2f10c65929dc97..cef260ed854c15 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -696,6 +696,20 @@ static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) return page_address(page) + offset_in_page(offset); } +static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + u32 offset = (sector_nr << fs_info->sectorsize_bits); + const struct page *page = stripe->pages[offset >> PAGE_SHIFT]; + + /* stripe->pages[] is allocated by us and no highmem is allowed. */ + ASSERT(page); + ASSERT(!PageHighMem(page)); + /* And the range must be contained inside the page. */ + ASSERT(offset_in_page(offset) + fs_info->sectorsize <= PAGE_SIZE); + return page_to_phys(page) + offset_in_page(offset); +} + static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; @@ -788,7 +802,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; - void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr); u8 csum_buf[BTRFS_CSUM_SIZE]; int ret; @@ -833,7 +847,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) return; } - ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum); + ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum); if (ret < 0) { scrub_bitmap_set_bit_csum_error(stripe, sector_nr); scrub_bitmap_set_bit_error(stripe, sector_nr); From 9afc617265383f591614e94b702d558dfb1519c0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 2 Sep 2025 14:51:49 +0930 Subject: [PATCH 2536/3206] btrfs: introduce btrfs_bio_for_each_block() helper Currently if we want to iterate a bio in block unit, we do something like this: while (iter->bi_size) { struct bio_vec bv = bio_iter_iovec(); /* Do something with using the bv */ bio_advance_iter_single(&bbio->bio, iter, sectorsize); } That's fine for now, but it will not handle future bs > ps, as bio_iter_iovec() returns a single-page bvec, meaning the bv_len will not exceed page size. This means the code using that bv can only handle a block if bs <= ps. To address this problem and handle future bs > ps cases better: - Introduce a helper btrfs_bio_for_each_block() Instead of bio_vec, which has single and multiple page version and multiple page version has quite some limits, use my favorite way to represent a block, phys_addr_t. 
For bs <= ps cases, nothing is changed, except we incur a very small overhead to convert phys_addr_t to a folio, then use the proper folio helpers to handle the possible highmem cases. For bs > ps cases, all blocks will be backed by large folios, meaning every folio will cover at least one block. We still use the proper folio helpers to handle highmem cases. With phys_addr_t, we will handle both large folio and highmem properly. So there is no better single variable to represent a btrfs block than phys_addr_t. - Extract the data block csum calculation into a helper The new helper, btrfs_calculate_block_csum(), will be utilized by btrfs_csum_one_bio(). - Use btrfs_bio_for_each_block() to replace existing call sites Including: * index_one_bio() from raid56.c Very straightforward. * btrfs_check_read_bio() Also update repair_one_sector() to grab the folio using phys_addr_t, and do extra checks to make sure the folio covers at least one block. We do not need to bother with bv_len at all now. * btrfs_csum_one_bio() Now we can move the highmem handling into a dedicated helper, calculate_block_csum(), and use the btrfs_bio_for_each_block() helper. There is one exception in btrfs_decompress_buf2page(), which copies decompressed data into the original bio; it does not iterate using the block size, thus we don't need to bother. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 20 +++++++++----------- fs/btrfs/btrfs_inode.h | 2 ++ fs/btrfs/file-item.c | 26 ++++++-------------------- fs/btrfs/inode.c | 26 +++++++++++++++----------- fs/btrfs/misc.h | 25 +++++++++++++++++++++++++ fs/btrfs/raid56.c | 7 +++---- 6 files changed, 60 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 493135bfa518f1..909b208f9ef3c8 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -204,18 +204,21 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, */ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, u32 bio_offset, - struct bio_vec *bv, + phys_addr_t paddr, struct btrfs_failed_bio *fbio) { struct btrfs_inode *inode = failed_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio = page_folio(phys_to_page(paddr)); const u32 sectorsize = fs_info->sectorsize; + const u32 foff = offset_in_folio(folio, paddr); const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); struct btrfs_bio *repair_bbio; struct bio *repair_bio; int num_copies; int mirror; + ASSERT(foff + sectorsize <= folio_size(folio)); btrfs_debug(fs_info, "repair read error: read error at %llu", failed_bbio->file_offset + bio_offset); @@ -238,7 +241,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &btrfs_repair_bioset); repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; - __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + bio_add_folio_nofail(repair_bio, folio, sectorsize, foff); repair_bbio = btrfs_bio(repair_bio); btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); @@ -259,6 +262,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de struct bvec_iter *iter = &bbio->saved_iter; blk_status_t status = bbio->bio.bi_status; struct btrfs_failed_bio *fbio = NULL; + phys_addr_t paddr; u32 offset = 0; /* Read-repair requires the inode field to be set by the submitter.
*/ @@ -276,17 +280,11 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de /* Clear the I/O error. A failed repair will reset it. */ bbio->bio.bi_status = BLK_STS_OK; - while (iter->bi_size) { - struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); - - bv.bv_len = min(bv.bv_len, sectorsize); - if (status || !btrfs_data_csum_ok(bbio, dev, offset, bvec_phys(&bv))) - fbio = repair_one_sector(bbio, offset, &bv, fbio); - - bio_advance_iter_single(&bbio->bio, iter, sectorsize); + btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) { + if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr)) + fbio = repair_one_sector(bbio, offset, paddr, fbio); offset += sectorsize; } - if (bbio->csum != bbio->csum_inline) kfree(bbio->csum); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 077b2f178816f0..c40e99ec13bf2b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -542,6 +542,8 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes +void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, + u8 *dest); int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 4dd3d8a02519ec..7906aea75ee46e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -775,12 +775,10 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; struct btrfs_ordered_sum *sums; - char *data; - struct bvec_iter iter; - struct bio_vec bvec; + struct bvec_iter iter = bio->bi_iter; + phys_addr_t paddr; + const u32 blocksize = fs_info->sectorsize; int index; - unsigned int blockcount; - int i; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -799,21 +797,9 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) shash->tfm = fs_info->csum_shash; - bio_for_each_segment(bvec, bio, iter) { - blockcount = BTRFS_BYTES_TO_BLKS(fs_info, - bvec.bv_len + fs_info->sectorsize - - 1); - - for (i = 0; i < blockcount; i++) { - data = bvec_kmap_local(&bvec); - crypto_shash_digest(shash, - data + (i * fs_info->sectorsize), - fs_info->sectorsize, - sums->sums + index); - kunmap_local(data); - index += fs_info->csum_size; - } - + btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) { + btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index); + index += fs_info->csum_size; } bbio->sums = sums; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98877535f213e9..5fad6af5794466 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3328,14 +3328,8 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) return btrfs_finish_one_ordered(ordered); } -/* - * Verify the checksum for a single sector without any extra action that depend - * on the type of I/O. - * - * @kaddr must be a properly kmapped address. 
- */ -int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, - const u8 * const csum_expected) +void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, + u8 *dest) { struct folio *folio = page_folio(phys_to_page(paddr)); const u32 blocksize = fs_info->sectorsize; @@ -3359,11 +3353,21 @@ int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 kunmap_local(kaddr); cur += len; } - crypto_shash_final(shash, csum); + crypto_shash_final(shash, dest); } else { - crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, csum); + crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest); } - +} +/* + * Verify the checksum for a single sector without any extra action that depend + * on the type of I/O. + * + * @kaddr must be a properly kmapped address. + */ +int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, + const u8 * const csum_expected) +{ + btrfs_calculate_block_csum(fs_info, paddr, csum); if (memcmp(csum, csum_expected, fs_info->csum_size)) return -EIO; return 0; diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index ff5eac84d819d8..f8f055fdc55176 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. @@ -20,6 +21,30 @@ name = (1U << __ ## name ## _BIT), \ __ ## name ## _SEQ = __ ## name ## _BIT +static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter) +{ + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + return bvec_phys(&bv); +} + +/* + * Iterate bio using btrfs block size. + * + * This will handle large folio and highmem. + * + * @paddr: Physical memory address of each iteration + * @bio: The bio to iterate + * @iter: The bvec_iter (pointer) to use. + * @blocksize: The blocksize to iterate. + * + * This requires all folios in the bio to cover at least one block. 
+ */ +#define btrfs_bio_for_each_block(paddr, bio, iter, blocksize) \ + for (; (iter)->bi_size && \ + (paddr = bio_iter_phys((bio), (iter)), 1); \ + bio_advance_iter_single((bio), (iter), (blocksize))) + static inline void cond_wake_up(struct wait_queue_head *wq) { /* diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3ff2bedfb3a4c9..e88699460dda6a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1208,17 +1208,16 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) const u32 sectorsize = rbio->bioc->fs_info->sectorsize; const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; struct bvec_iter iter = bio->bi_iter; + phys_addr_t paddr; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - while (iter.bi_size) { + btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) { unsigned int index = (offset >> sectorsize_bits); struct sector_ptr *sector = &rbio->bio_sectors[index]; - struct bio_vec bv = bio_iter_iovec(bio, iter); sector->has_paddr = true; - sector->paddr = bvec_phys(&bv); - bio_advance_iter_single(bio, &iter, sectorsize); + sector->paddr = paddr; offset += sectorsize; } } From 7425a2894019778460e65d24e8adfaeddac3b3b1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 2 Sep 2025 16:45:02 +0930 Subject: [PATCH 2537/3206] btrfs: introduce btrfs_bio_for_each_block_all() helper Currently if we want to iterate all blocks inside a bio, we do something like this: bio_for_each_segment_all(bvec, bio, iter_all) { for (off = 0; off < bvec->bv_len; off += sectorsize) { /* Iterate blocks using bv + off */ } } That's fine for now, but it will not handle future bs > ps, as bio_for_each_segment_all() is a single-page iterator: it will always return a bvec that's no larger than a page. But for bs > ps cases, we need a full folio (which covers at least one block) so that we can work on the block. To address this problem and handle future bs > ps cases better: - Introduce a helper btrfs_bio_for_each_block_all() This helper creates a local bvec_iter sized to the target bio, grabs the physical address of the current location, then advances the iterator by block size. - Use btrfs_bio_for_each_block_all() to replace existing call sites Including: * set_bio_pages_uptodate() in raid56 * verify_bio_data_sectors() in raid56 Both result in much easier to read code. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/misc.h | 24 +++++++++++++++++++++++ fs/btrfs/raid56.c | 49 +++++++++++++++++++---------------------------- 2 files changed, 44 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index f8f055fdc55176..60f9b000d644bb 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -45,6 +45,30 @@ static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter) (paddr = bio_iter_phys((bio), (iter)), 1); \ bio_advance_iter_single((bio), (iter), (blocksize))) +/* Initialize a bvec_iter to the size of the specified bio.
*/ +static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio) +{ + struct bio_vec *bvec; + u32 bio_size = 0; + int i; + + bio_for_each_bvec_all(bvec, bio, i) + bio_size += bvec->bv_len; + + return (struct bvec_iter) { + .bi_sector = 0, + .bi_size = bio_size, + .bi_idx = 0, + .bi_bvec_done = 0, + }; +} + +#define btrfs_bio_for_each_block_all(paddr, bio, blocksize) \ + for (struct bvec_iter iter = init_bvec_iter_for_bio(bio); \ + (iter).bi_size && \ + (paddr = bio_iter_phys((bio), &(iter)), 1); \ + bio_advance_iter_single((bio), &(iter), (blocksize))) + static inline void cond_wake_up(struct wait_queue_head *wq) { /* diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 389f1b617fe795..2b4f577dcf39de 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1510,22 +1510,17 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + const u32 blocksize = rbio->bioc->fs_info->sectorsize; + phys_addr_t paddr; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct sector_ptr *sector; - phys_addr_t paddr = bvec_phys(bvec); + btrfs_bio_for_each_block_all(paddr, bio, blocksize) { + struct sector_ptr *sector = find_stripe_sector(rbio, paddr); - for (u32 off = 0; off < bvec->bv_len; off += sectorsize) { - sector = find_stripe_sector(rbio, paddr + off); - ASSERT(sector); - if (sector) - sector->uptodate = 1; - } + ASSERT(sector); + if (sector) + sector->uptodate = 1; } } @@ -1572,8 +1567,7 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; int total_sector_nr = get_bio_sector_nr(rbio, bio); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + phys_addr_t paddr; /* No data csum for the whole stripe, no need to verify. */ if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1583,23 +1577,20 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; - bio_for_each_segment_all(bvec, bio, iter_all) { - for (u32 off = 0; off < bvec->bv_len; - off += fs_info->sectorsize, total_sector_nr++) { - u8 csum_buf[BTRFS_CSUM_SIZE]; - u8 *expected_csum = rbio->csum_buf + - total_sector_nr * fs_info->csum_size; - int ret; + btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; + int ret; - /* No csum for this sector, skip to the next sector. */ - if (!test_bit(total_sector_nr, rbio->csum_bitmap)) - continue; + /* No csum for this sector, skip to the next sector. */ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; - ret = btrfs_check_block_csum(fs_info, bvec_phys(bvec) + off, - csum_buf, expected_csum); - if (ret < 0) - set_bit(total_sector_nr, rbio->error_bitmap); - } + ret = btrfs_check_block_csum(fs_info, paddr, + csum_buf, expected_csum); + if (ret < 0) + set_bit(total_sector_nr, rbio->error_bitmap); + total_sector_nr++; } } From ea77a1c1c78942578a2f6995d60757c34fe50980 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 1 Sep 2025 07:55:00 +0930 Subject: [PATCH 2538/3206] btrfs: cache max and min order inside btrfs_fs_info Inside btrfs_fs_info we cache several bits shift like sectorsize_bits. 
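This patch adds two more such cached values: the minimum and maximum folio order for inode mappings. As a worked example of the formulas in the diff below (assuming 4K blocks, 4K pages and 64-bit longs): block_min_order = ilog2(round_up(4096, 4096) >> PAGE_SHIFT) = ilog2(1) = 0, and block_max_order = ilog2((64 << 12) >> PAGE_SHIFT) = ilog2(64) = 6, i.e. folios of at most 64 pages (256K), matching the existing BITS_PER_LONG blocks-per-bitmap limit. For a hypothetical 16K block size on 4K pages, block_min_order would be ilog2(16384 >> PAGE_SHIFT) = 2, so every folio covers at least one whole block.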
Apply this to max and min folio orders so that every time mapping order needs to be applied we can skip the calculation. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 6 +++--- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/fs.h | 2 ++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c40e99ec13bf2b..f45dcdde0efc77 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -532,9 +532,9 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) /* We only allow BITS_PER_LONGS blocks for each bitmap. */ #ifdef CONFIG_BTRFS_EXPERIMENTAL - mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0, - ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits) - >> PAGE_SHIFT))); + mapping_set_folio_order_range(inode->vfs_inode.i_mapping, + inode->root->fs_info->block_min_order, + inode->root->fs_info->block_max_order); #endif } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7b06bbc4089878..a2eba8bc4336d3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3383,6 +3383,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize_bits = ilog2(nodesize); fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT); + fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 5f0b185a7f21ab..bf4a1b75b0cf48 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -825,6 +825,8 @@ struct btrfs_fs_info { u32 sectorsize; /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; + u32 block_min_order; + u32 block_max_order; u32 csum_size; u32 csums_per_leaf; u32 stripesize; From 6803bff896ef96c9b2ee436d9e4f17ba7357d601 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 26 Aug 2025 16:12:53 +0100 Subject: [PATCH 2539/3206] btrfs: use booleans in walk control structure for log replay The 'free' and 'pin' member of struct walk_control, used during log replay and when freeing a log tree, are defined as integers but in practice are used as booleans. Change their type to bool and while at it update their comments to be more detailed and comply with the preferred comment style (first word in a sentence is capitalized, sentences end with punctuation and the comment opening (/*) is on a line of its own). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 861f96ef28cf66..c5c5fc05eabb4e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -306,15 +306,20 @@ void btrfs_end_log_trans(struct btrfs_root *root) * are state fields used for that specific part */ struct walk_control { - /* should we free the extent on disk when done? This is used - * at transaction commit time while freeing a log tree + /* + * Signal that we are freeing the metadata extents of a log tree. + * This is used at transaction commit time while freeing a log tree. 
*/ - int free; + bool free; - /* pin only walk, we record which extents on disk belong to the - * log trees + /* + * Signal that we are pinning the metadata extents of a log tree and the + * data extents its leaves point to (if using mixed block groups). + * This happens in the first stage of log replay to ensure that during + * replay, while we are modifying subvolume trees, we don't overwrite + * the metadata extents of log trees. */ - int pin; + bool pin; /* what stage of the replay code we're currently in */ int stage; @@ -3415,7 +3420,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, { int ret; struct walk_control wc = { - .free = 1, + .free = true, .process_func = process_one_buffer }; @@ -7433,7 +7438,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) } wc.trans = trans; - wc.pin = 1; + wc.pin = true; ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { @@ -7557,7 +7562,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) /* step one is to pin it all, step two is to replay just inodes */ if (wc.pin) { - wc.pin = 0; + wc.pin = false; wc.process_func = replay_one_buffer; wc.stage = LOG_WALK_REPLAY_INODES; goto again; From 2c123db1f0e1fbba424263851973e2bb862c26fb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 26 Aug 2025 16:27:46 +0100 Subject: [PATCH 2540/3206] btrfs: rename replay_dest member of struct walk_control to root Everywhere else we refer to a subvolume root we are replaying to simply as 'root', so rename from 'replay_dest' to 'root' for consistency and having a more meaningful and shorter name. While at it also update the comment to be more detailed and comply to preferred style (first word in a sentence is capitalized and sentence ends with punctuation). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c5c5fc05eabb4e..c0cc94efbcaa22 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -330,8 +330,11 @@ struct walk_control { */ bool ignore_cur_inode; - /* the root we are currently replaying */ - struct btrfs_root *replay_dest; + /* + * The root we are currently replaying to. This is NULL for the replay + * stage LOG_WALK_PIN_ONLY. 
+ */ + struct btrfs_root *root; /* the trans handle for the current replay */ struct btrfs_trans_handle *trans; @@ -2575,7 +2578,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, .level = level }; struct btrfs_path *path; - struct btrfs_root *root = wc->replay_dest; + struct btrfs_root *root = wc->root; struct btrfs_trans_handle *trans = wc->trans; struct btrfs_key key; int i; @@ -7479,11 +7482,10 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) goto error; } - wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, - true); - if (IS_ERR(wc.replay_dest)) { - ret = PTR_ERR(wc.replay_dest); - wc.replay_dest = NULL; + wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true); + if (IS_ERR(wc.root)) { + ret = PTR_ERR(wc.root); + wc.root = NULL; if (ret != -ENOENT) { btrfs_put_root(log); btrfs_abort_transaction(trans, ret); goto error; } @@ -7510,8 +7512,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) goto next; } - wc.replay_dest->log_root = log; - ret = btrfs_record_root_in_trans(trans, wc.replay_dest); + wc.root->log_root = log; + ret = btrfs_record_root_in_trans(trans, wc.root); if (ret) { btrfs_abort_transaction(trans, ret); goto next; @@ -7524,9 +7526,9 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) if (wc.stage == LOG_WALK_REPLAY_ALL) { - struct btrfs_root *root = wc.replay_dest; + struct btrfs_root *root = wc.root; - ret = fixup_inode_link_counts(trans, wc.replay_dest, path); + ret = fixup_inode_link_counts(trans, root, path); if (ret) { btrfs_abort_transaction(trans, ret); goto next; @@ -7546,9 +7548,9 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) } } next: - if (wc.replay_dest) { - wc.replay_dest->log_root = NULL; - btrfs_put_root(wc.replay_dest); + if (wc.root) { + wc.root->log_root = NULL; + btrfs_put_root(wc.root); } btrfs_put_root(log); From 60ac80242be1ce5e75f7f1e79f0d02996702388c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 26 Aug 2025 16:43:48 +0100 Subject: [PATCH 2541/3206] btrfs: rename root to log in walk_down_log_tree() and walk_up_log_tree() Everywhere we have a log root we name it as 'log' or 'log_root' except in walk_down_log_tree() and walk_up_log_tree() where we name it as 'root', which is not only inconsistent but also confusing, since we typically use 'root' when naming variables that refer to a subvolume tree. So for clarity and consistency rename the 'root' argument to 'log'.
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c0cc94efbcaa22..321ec0de07336a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2779,11 +2779,11 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans, } static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_root *log, struct btrfs_path *path, int *level, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = log->fs_info; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; @@ -2821,8 +2821,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, } if (*level == 1) { - ret = wc->process_func(root, next, wc, ptr_gen, - *level - 1); + ret = wc->process_func(log, next, wc, ptr_gen, *level - 1); if (ret) { free_extent_buffer(next); return ret; @@ -2873,7 +2872,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, } static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_root *log, struct btrfs_path *path, int *level, struct walk_control *wc) { @@ -2889,7 +2888,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level == 0); return 0; } else { - ret = wc->process_func(root, path->nodes[*level], wc, + ret = wc->process_func(log, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level]), *level); if (ret) From efa44fc4fde3eb60da436d31707c00f6c182a6c8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 26 Aug 2025 17:25:56 +0100 Subject: [PATCH 2542/3206] btrfs: add and use a log root field to struct walk_control Instead of passing an extra log root parameter for the log tree walk functions and callbacks, add the log tree to struct walk_control and make those functions and callbacks extract the log root from that structure, reducing the number of parameters. This also simplifies further upcoming changes to report log tree replay failures. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 62 +++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 321ec0de07336a..7d1b21df698d9f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -336,6 +336,9 @@ struct walk_control { */ struct btrfs_root *root; + /* The log tree we are currently processing (not NULL for any stage). 
*/ + struct btrfs_root *log; + /* the trans handle for the current replay */ struct btrfs_trans_handle *trans; @@ -344,17 +347,17 @@ struct walk_control { * passed in, and it must be checked or read if you need the data * inside it */ - int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + int (*process_func)(struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level); }; /* * process_func used to pin down extents, write them or wait on them */ -static int process_one_buffer(struct btrfs_root *log, - struct extent_buffer *eb, +static int process_one_buffer(struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { + struct btrfs_root *log = wc->log; struct btrfs_trans_handle *trans = wc->trans; struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; @@ -2569,7 +2572,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, * only in the log (references come from either directory items or inode * back refs). */ -static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, +static int replay_one_buffer(struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { int nritems; @@ -2579,6 +2582,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, }; struct btrfs_path *path; struct btrfs_root *root = wc->root; + struct btrfs_root *log = wc->log; struct btrfs_trans_handle *trans = wc->trans; struct btrfs_key key; int i; @@ -2779,11 +2783,10 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans, } static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *log, struct btrfs_path *path, int *level, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = log->fs_info; + struct btrfs_fs_info *fs_info = wc->log->fs_info; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; @@ -2821,7 +2824,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, } if (*level == 1) { - ret = wc->process_func(log, next, wc, ptr_gen, *level - 1); + ret = wc->process_func(next, wc, ptr_gen, *level - 1); if (ret) { free_extent_buffer(next); return ret; @@ -2872,7 +2875,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, } static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *log, struct btrfs_path *path, int *level, struct walk_control *wc) { @@ -2888,7 +2890,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level == 0); return 0; } else { - ret = wc->process_func(log, path->nodes[*level], wc, + ret = wc->process_func(path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level]), *level); if (ret) @@ -2912,9 +2914,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, * the tree freeing any blocks that have a ref count of zero after being * decremented. 
*/ -static int walk_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *log, struct walk_control *wc) +static int walk_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { + struct btrfs_root *log = wc->log; int ret = 0; int wret; int level; @@ -2932,7 +2934,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, path->slots[level] = 0; while (1) { - wret = walk_down_log_tree(trans, log, path, &level, wc); + wret = walk_down_log_tree(trans, path, &level, wc); if (wret > 0) break; if (wret < 0) { @@ -2940,7 +2942,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, goto out; } - wret = walk_up_log_tree(trans, log, path, &level, wc); + wret = walk_up_log_tree(trans, path, &level, wc); if (wret > 0) break; if (wret < 0) { @@ -2951,7 +2953,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, /* was the root node processed? if not, catch it here */ if (path->nodes[orig_level]) { - ret = wc->process_func(log, path->nodes[orig_level], wc, + ret = wc->process_func(path->nodes[orig_level], wc, btrfs_header_generation(path->nodes[orig_level]), orig_level); if (ret) @@ -3423,11 +3425,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans, int ret; struct walk_control wc = { .free = true, - .process_func = process_one_buffer + .process_func = process_one_buffer, + .log = log, }; if (log->node) { - ret = walk_log_tree(trans, log, &wc); + ret = walk_log_tree(trans, &wc); if (ret) { /* * We weren't able to traverse the entire log tree, the @@ -7441,8 +7444,10 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) wc.trans = trans; wc.pin = true; + wc.log = log_root_tree; - ret = walk_log_tree(trans, log_root_tree, &wc); + ret = walk_log_tree(trans, &wc); + wc.log = NULL; if (ret) { btrfs_abort_transaction(trans, ret); goto error; @@ -7454,7 +7459,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) key.offset = (u64)-1; while (1) { - struct btrfs_root *log; struct btrfs_key found_key; ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); @@ -7474,9 +7478,10 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_tree_root(log_root_tree, &found_key); - if (IS_ERR(log)) { - ret = PTR_ERR(log); + wc.log = btrfs_read_tree_root(log_root_tree, &found_key); + if (IS_ERR(wc.log)) { + ret = PTR_ERR(wc.log); + wc.log = NULL; btrfs_abort_transaction(trans, ret); goto error; } @@ -7486,7 +7491,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = PTR_ERR(wc.root); wc.root = NULL; if (ret != -ENOENT) { - btrfs_put_root(log); + btrfs_put_root(wc.log); + wc.log = NULL; btrfs_abort_transaction(trans, ret); goto error; } @@ -7502,23 +7508,24 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) * block from being modified, and we'll just bail for * each subsequent pass. 
*/ - ret = btrfs_pin_extent_for_log_replay(trans, log->node); + ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node); if (ret) { - btrfs_put_root(log); + btrfs_put_root(wc.log); + wc.log = NULL; btrfs_abort_transaction(trans, ret); goto error; } goto next; } - wc.root->log_root = log; + wc.root->log_root = wc.log; ret = btrfs_record_root_in_trans(trans, wc.root); if (ret) { btrfs_abort_transaction(trans, ret); goto next; } - ret = walk_log_tree(trans, log, &wc); + ret = walk_log_tree(trans, &wc); if (ret) { btrfs_abort_transaction(trans, ret); goto next; @@ -7551,7 +7558,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) wc.root->log_root = NULL; btrfs_put_root(wc.root); } - btrfs_put_root(log); + btrfs_put_root(wc.log); + wc.log = NULL; if (ret) goto error; From 7f09699e5e6127db7bcb6d49d8fc674a74f23659 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 26 Aug 2025 17:30:16 +0100 Subject: [PATCH 2543/3206] btrfs: deduplicate log root free in error paths from btrfs_recover_log_trees() Instead of duplicating the dropping of a log tree in case we jump to the 'error' label, move the dropping under the 'error' label and get rid of the unnecessary setting of the log root to NULL since we return immediately after. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7d1b21df698d9f..f039f0613a38b7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7491,8 +7491,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = PTR_ERR(wc.root); wc.root = NULL; if (ret != -ENOENT) { - btrfs_put_root(wc.log); - wc.log = NULL; btrfs_abort_transaction(trans, ret); goto error; } @@ -7510,8 +7508,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) */ ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node); if (ret) { - btrfs_put_root(wc.log); - wc.log = NULL; btrfs_abort_transaction(trans, ret); goto error; } @@ -7597,6 +7593,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) error: if (wc.trans) btrfs_end_transaction(wc.trans); + btrfs_put_root(wc.log); clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); btrfs_free_path(path); return ret; From d73896a55c047d0e7269128346f163a8efc908aa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Aug 2025 11:48:43 +0100 Subject: [PATCH 2544/3206] btrfs: stop passing transaction parameter to log tree walk functions It's unnecessary to pass a transaction parameter since struct walk_control already has a member that points to the transaction, so we can make the functions access the structure.
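The same refactor pattern recurs throughout the rest of this series, so a minimal standalone C sketch of it may be useful (hypothetical names only, not the actual btrfs code): shared state such as the transaction handle is hoisted into a single context structure that is threaded down the call chain, and each helper unpacks what it needs instead of receiving the same parameters over and over.

    #include <stdio.h>

    struct trans { int id; };            /* stand-in for a transaction handle */
    struct root { const char *name; };   /* stand-in for a tree root */

    /* Analogous to struct walk_control: everything the walk needs, in one place. */
    struct walk_ctx {
            struct trans *trans;
            struct root *log;
            int stage;
    };

    /* Before: int process_node(struct trans *trans, struct root *log, int level); */

    /* After: one context pointer plus what is unique to this call site. */
    static int process_node(struct walk_ctx *wc, int level)
    {
            /* Unpack locally, as the patches do at the top of each helper. */
            struct trans *trans = wc->trans;

            printf("trans %d: %s, level %d, stage %d\n",
                   trans->id, wc->log->name, level, wc->stage);
            return 0;
    }

    int main(void)
    {
            struct trans t = { .id = 1 };
            struct root log = { .name = "log-tree" };
            struct walk_ctx wc = { .trans = &t, .log = &log, .stage = 0 };

            return process_node(&wc, 0);
    }

The benefit compounds as the call chain deepens: adding another piece of shared state later means touching only the structure and the sites that use it, not every function signature in between.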
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f039f0613a38b7..dee306101d8e77 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2782,10 +2782,10 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans, return 0; } -static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_path *path, int *level, - struct walk_control *wc) +static noinline int walk_down_log_tree(struct btrfs_path *path, int *level, + struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_fs_info *fs_info = wc->log->fs_info; u64 bytenr; u64 ptr_gen; @@ -2874,9 +2874,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return 0; } -static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_path *path, int *level, - struct walk_control *wc) +static noinline int walk_up_log_tree(struct btrfs_path *path, int *level, + struct walk_control *wc) { int i; int slot; @@ -2897,7 +2896,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, return ret; if (wc->free) { - ret = clean_log_buffer(trans, path->nodes[*level]); + ret = clean_log_buffer(wc->trans, path->nodes[*level]); if (ret) return ret; } @@ -2914,7 +2913,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, * the tree freeing any blocks that have a ref count of zero after being * decremented. */ -static int walk_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) +static int walk_log_tree(struct walk_control *wc) { struct btrfs_root *log = wc->log; int ret = 0; @@ -2934,7 +2933,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, struct walk_control * path->slots[level] = 0; while (1) { - wret = walk_down_log_tree(trans, path, &level, wc); + wret = walk_down_log_tree(path, &level, wc); if (wret > 0) break; if (wret < 0) { @@ -2942,7 +2941,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, struct walk_control * goto out; } - wret = walk_up_log_tree(trans, path, &level, wc); + wret = walk_up_log_tree(path, &level, wc); if (wret > 0) break; if (wret < 0) { @@ -2959,7 +2958,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, struct walk_control * if (ret) goto out; if (wc->free) - ret = clean_log_buffer(trans, path->nodes[orig_level]); + ret = clean_log_buffer(wc->trans, path->nodes[orig_level]); } out: @@ -3427,10 +3426,11 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .free = true, .process_func = process_one_buffer, .log = log, + .trans = trans, }; if (log->node) { - ret = walk_log_tree(trans, &wc); + ret = walk_log_tree(&wc); if (ret) { /* * We weren't able to traverse the entire log tree, the @@ -7446,7 +7446,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) wc.pin = true; wc.log = log_root_tree; - ret = walk_log_tree(trans, &wc); + ret = walk_log_tree(&wc); wc.log = NULL; if (ret) { btrfs_abort_transaction(trans, ret); @@ -7521,7 +7521,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) goto next; } - ret = walk_log_tree(trans, &wc); + ret = walk_log_tree(&wc); if (ret) { btrfs_abort_transaction(trans, ret); goto next; From 4b7699f406225ce13afc6cbb320d4b84e9c8fe0f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Aug 2025 12:03:52 +0100 Subject: [PATCH 2545/3206] btrfs: stop setting 
log_root_tree->log_root to NULL in btrfs_recover_log_trees() There's no point in setting log_root_tree->log_root to NULL as it is already NULL: we never assigned anything to it before, and it's meaningless since a log root never has a value other than NULL in the ->log_root field, which can be non-NULL only for non-log roots. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index dee306101d8e77..ab2f6bab096b87 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7585,7 +7585,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) if (ret) return ret; - log_root_tree->log_root = NULL; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); btrfs_put_root(log_root_tree); From 2f5b8095ea47b142c56c09755a8b1e14145a2d30 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Aug 2025 12:10:28 +0100 Subject: [PATCH 2546/3206] btrfs: always drop log root tree reference in btrfs_replay_log() Currently we have this odd behaviour: 1) At btrfs_replay_log() we drop the reference of the log root tree if the call to btrfs_recover_log_trees() failed; 2) But if the call to btrfs_recover_log_trees() did not fail, we don't drop the reference in btrfs_replay_log() - we expect that btrfs_recover_log_trees() does it in case it returns success. Let's simplify this and make btrfs_replay_log() always drop the reference on the log root tree. Not only does this simplify the code, it's also what makes sense, since it's btrfs_replay_log() that grabbed the reference in the first place. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/tree-log.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a2eba8bc4336d3..a9e82e062bd588 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2088,10 +2088,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); + btrfs_put_root(log_tree_root); if (ret) { btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); - btrfs_put_root(log_tree_root); return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ab2f6bab096b87..4d34aee0cafa17 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7586,7 +7586,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) return ret; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); - btrfs_put_root(log_root_tree); return 0; error: From 7790a882ca6bfe24392080605e5ba19f138ac29d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Aug 2025 15:24:08 +0100 Subject: [PATCH 2547/3206] btrfs: pass walk_control structure to replay_xattr_deletes() Instead of passing the transaction, subvolume root and log tree as arguments to replay_xattr_deletes(), pass the walk_control structure as we can grab all of those from the structure. This reduces the number of arguments passed and it's going to be needed by an incoming change that improves error reporting for log replay.
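A minimal sketch of the reference counting rule that the btrfs_replay_log() change above establishes (hypothetical names, not the btrfs API): the function that grabbed the reference is the one that drops it, and it does so on every path, success or failure, instead of delegating the drop to the callee on the success path only.

    #include <stdio.h>
    #include <stdlib.h>

    struct obj { int refs; };

    static struct obj *obj_new(void)
    {
            struct obj *o = malloc(sizeof(*o));

            if (o)
                    o->refs = 1;
            return o;
    }

    static void obj_put(struct obj *o)
    {
            if (--o->refs == 0) {
                    printf("freed\n");
                    free(o);
            }
    }

    /* Stand-in for btrfs_recover_log_trees(): may succeed or fail. */
    static int recover(struct obj *o)
    {
            (void)o;
            return 0;
    }

    static int replay(void)
    {
            struct obj *o = obj_new();   /* we grabbed the reference here... */
            int ret;

            if (!o)
                    return -1;
            ret = recover(o);
            obj_put(o);                  /* ...so we drop it unconditionally */
            return ret;
    }

    int main(void)
    {
            return replay();
    }

With a single owner and a single release point there is no path on which the reference can leak, and the callee no longer needs to know anything about the caller's reference counting.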
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4d34aee0cafa17..cd4c5ae3e0a30d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2336,12 +2336,13 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, return ret; } -static int replay_xattr_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - const u64 ino) +static int replay_xattr_deletes(struct walk_control *wc, + struct btrfs_path *path, + const u64 ino) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; + struct btrfs_root *log = wc->log; struct btrfs_key search_key; struct btrfs_path *log_path; int i; @@ -2645,7 +2646,7 @@ static int replay_one_buffer(struct extent_buffer *eb, wc->stage == LOG_WALK_REPLAY_INODES) { u32 mode; - ret = replay_xattr_deletes(trans, root, log, path, key.objectid); + ret = replay_xattr_deletes(wc, path, key.objectid); if (ret) break; mode = btrfs_inode_mode(eb, inode_item); From 94a5ac668a49c4fc1bd8e35a3c717ba0ae9f7cf8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Aug 2025 15:48:49 +0100 Subject: [PATCH 2548/3206] btrfs: move up the definition of struct walk_control In upcoming changes we need to pass struct walk_control as an argument to replay_dir_deletes() and link_to_fixup_dir() so we need to move its definition above the prototypes of those functions. So move it up right below the enum that defines log replay stages and before any functions and function prototypes are declared. Also fix up the comments while moving it so that they comply with the preferred code style (capitalize the first word in a sentence, end sentences with punctuation, make lines wider and closer to the 80 characters limit). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 103 ++++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cd4c5ae3e0a30d..2780f0e1db0195 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -101,6 +101,57 @@ enum { LOG_WALK_REPLAY_ALL, }; +/* + * The walk control struct is used to pass state down the chain when processing + * the log tree. The stage field tells us which part of the log tree processing + * we are currently doing. + */ +struct walk_control { + /* + * Signal that we are freeing the metadata extents of a log tree. + * This is used at transaction commit time while freeing a log tree. + */ + bool free; + + /* + * Signal that we are pinning the metadata extents of a log tree and the + * data extents its leaves point to (if using mixed block groups). + * This happens in the first stage of log replay to ensure that during + * replay, while we are modifying subvolume trees, we don't overwrite + * the metadata extents of log trees. + */ + bool pin; + + /* What stage of the replay code we're currently in. */ + int stage; + + /* + * Ignore any items from the inode currently being processed. Needs + * to be set every time we find a BTRFS_INODE_ITEM_KEY. + */ + bool ignore_cur_inode; + + /* + * The root we are currently replaying to. This is NULL for the replay + * stage LOG_WALK_PIN_ONLY. + */ + struct btrfs_root *root; + + /* The log tree we are currently processing (not NULL for any stage). 
*/ + struct btrfs_root *log; + + /* The transaction handle used for replaying all log trees. */ + struct btrfs_trans_handle *trans; + + /* + * The function that gets used to process blocks we find in the tree. + * Note the extent_buffer might not be up to date when it is passed in, + * and it must be checked or read if you need the data inside it. + */ + int (*process_func)(struct extent_buffer *eb, + struct walk_control *wc, u64 gen, int level); +}; + static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, int inode_only, @@ -299,58 +350,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -/* - * the walk control struct is used to pass state down the chain when - * processing the log tree. The stage field tells us which part - * of the log tree processing we are currently doing. The others - * are state fields used for that specific part - */ -struct walk_control { - /* - * Signal that we are freeing the metadata extents of a log tree. - * This is used at transaction commit time while freeing a log tree. - */ - bool free; - - /* - * Signal that we are pinning the metadata extents of a log tree and the - * data extents its leaves point to (if using mixed block groups). - * This happens in the first stage of log replay to ensure that during - * replay, while we are modifying subvolume trees, we don't overwrite - * the metadata extents of log trees. - */ - bool pin; - - /* what stage of the replay code we're currently in */ - int stage; - - /* - * Ignore any items from the inode currently being processed. Needs - * to be set every time we find a BTRFS_INODE_ITEM_KEY. - */ - bool ignore_cur_inode; - - /* - * The root we are currently replaying to. This is NULL for the replay - * stage LOG_WALK_PIN_ONLY. - */ - struct btrfs_root *root; - - /* The log tree we are currently processing (not NULL for any stage). */ - struct btrfs_root *log; - - /* the trans handle for the current replay */ - struct btrfs_trans_handle *trans; - - /* the function that gets used to process blocks we find in the - * tree. Note the extent_buffer might not be up to date when it is - * passed in, and it must be checked or read if you need the data - * inside it - */ - int (*process_func)(struct extent_buffer *eb, - struct walk_control *wc, u64 gen, int level); -}; - /* * process_func used to pin down extents, write them or wait on them */ From 82d1db6f465748d914cc95fb550ed786119f2925 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Aug 2025 16:10:39 +0100 Subject: [PATCH 2549/3206] btrfs: pass walk_control structure to replay_dir_deletes() Instead of passing the transaction, subvolume root and log tree as arguments to replay_dir_deletes(), pass the walk_control structure as we can grab all of those from the structure. This reduces the number of arguments passed and it's going to be needed by an incoming change that improves error reporting for log replay. This also requires changing fixup_inode_link_counts() and fixup_inode_link_count() to take that structure as an argument since fixup_inode_link_count() makes a call to replay_dir_deletes(). 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2780f0e1db0195..460dc51e8c820d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -159,9 +159,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, +static noinline int replay_dir_deletes(struct walk_control *wc, struct btrfs_path *path, u64 dirid, bool del_all); static void wait_log_commit(struct btrfs_root *root, int transid); @@ -1727,9 +1725,10 @@ static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path) * number of back refs found. If it goes down to zero, the iput * will free the inode. */ -static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, +static noinline int fixup_inode_link_count(struct walk_control *wc, struct btrfs_inode *inode) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = inode->root; struct btrfs_path *path; int ret; @@ -1765,7 +1764,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (inode->vfs_inode.i_nlink == 0) { if (S_ISDIR(inode->vfs_inode.i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, ino, true); + ret = replay_dir_deletes(wc, path, ino, true); if (ret) goto out; } @@ -1779,8 +1778,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, return ret; } -static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static noinline int fixup_inode_link_counts(struct walk_control *wc, struct btrfs_path *path) { int ret; @@ -1790,6 +1788,8 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_inode *inode; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); @@ -1819,7 +1819,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; } - ret = fixup_inode_link_count(trans, inode); + ret = fixup_inode_link_count(wc, inode); iput(&inode->vfs_inode); if (ret) break; @@ -2455,12 +2455,13 @@ static int replay_xattr_deletes(struct walk_control *wc, * Anything we don't find in the log is unlinked and removed from the * directory. */ -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, +static noinline int replay_dir_deletes(struct walk_control *wc, struct btrfs_path *path, u64 dirid, bool del_all) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; + struct btrfs_root *log = (del_all ? 
NULL : wc->log); u64 range_start; u64 range_end; int ret = 0; @@ -2650,8 +2651,7 @@ static int replay_one_buffer(struct extent_buffer *eb, break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { - ret = replay_dir_deletes(trans, root, log, path, - key.objectid, false); + ret = replay_dir_deletes(wc, path, key.objectid, false); if (ret) break; } @@ -7530,7 +7530,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) if (wc.stage == LOG_WALK_REPLAY_ALL) { struct btrfs_root *root = wc.root; - ret = fixup_inode_link_counts(trans, root, path); + ret = fixup_inode_link_counts(&wc, path); if (ret) { btrfs_abort_transaction(trans, ret); goto next; From b150f1c32138d6cdf1e6330521a5bbff3cdd9018 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Aug 2025 16:27:57 +0100 Subject: [PATCH 2550/3206] btrfs: pass walk_control structure to check_item_in_log() Instead of passing the transaction and log tree as arguments to check_item_in_log(), pass the walk_control structure as we can grab those from the structure. This reduces the number of arguments passed and it's going to be needed by an incoming change that improves error reporting for log replay. Notice that a NULL log root argument to check_item_in_log() makes it unconditionally delete a directory entry. Since the walk_control always has a non-NULL log root, we add an extra boolean to check_item_in_log() to tell it if it should unconditionally delete a directory entry, preserving the behaviour and also making it a bit more clear. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 460dc51e8c820d..ca0946f947dfa1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2254,13 +2254,14 @@ static noinline int find_dir_range(struct btrfs_root *root, * item is not in the log, the item is removed and the inode it points * to is unlinked */ -static noinline int check_item_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *log, +static noinline int check_item_in_log(struct walk_control *wc, struct btrfs_path *path, struct btrfs_path *log_path, struct btrfs_inode *dir, - struct btrfs_key *dir_key) + struct btrfs_key *dir_key, + bool force_remove) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; @@ -2287,10 +2288,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, goto out; } - if (log) { + if (!force_remove) { struct btrfs_dir_item *log_di; - log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path, dir_key->objectid, dir_key->offset, &name, 0); if (IS_ERR(log_di)) { @@ -2540,9 +2541,8 @@ static noinline int replay_dir_deletes(struct walk_control *wc, if (found_key.offset > range_end) break; - ret = check_item_in_log(trans, log, path, - log_path, dir, - &found_key); + ret = check_item_in_log(wc, path, log_path, dir, + &found_key, del_all); if (ret) goto out; if (found_key.offset == (u64)-1) From c7da72022bb2cfeb21951b875db1f029ff236a72 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 28 Aug 2025 13:05:52 +0100 Subject: [PATCH 2551/3206] btrfs: pass walk_control structure to replay_one_extent() Instead of passing the transaction and subvolume root as arguments to replay_one_extent(), pass the walk_control structure as we can grab all of those from the 
structure. This reduces the number of arguments passed and it's going to be needed by an incoming change that improves error reporting for log replay. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ca0946f947dfa1..aac648ae30fbb3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -642,12 +642,13 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * The extent is inserted into the file, dropping any existing extents * from the file that overlap the new one. */ -static noinline int replay_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = root->fs_info; int found_type; @@ -2728,7 +2729,7 @@ static int replay_one_buffer(struct extent_buffer *eb, if (ret) break; } else if (key.type == BTRFS_EXTENT_DATA_KEY) { - ret = replay_one_extent(trans, root, path, eb, i, &key); + ret = replay_one_extent(wc, path, eb, i, &key); if (ret) break; } From 44463eb0799226e60c8ff358da78e33edac83ee5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 28 Aug 2025 13:11:17 +0100 Subject: [PATCH 2552/3206] btrfs: pass walk_control structure to add_inode_ref() and helpers Instead of passing the transaction, subvolume root and log tree as arguments to add_inode_ref() and its helpers (__add_inode_ref(), unlink_refs_not_in_log(), unlink_extrefs_not_in_log() and unlink_old_inode_refs()), pass the walk_control structure as we can access all of those from the structure. This reduces the number of arguments passed and it's going to be needed by an incoming change that improves error reporting for log replay. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 47 ++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index aac648ae30fbb3..2ec9252115fd56 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1088,9 +1088,8 @@ static noinline int backref_in_log(struct btrfs_root *log, return ret; } -static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, +static int unlink_refs_not_in_log(struct walk_control *wc, struct btrfs_path *path, - struct btrfs_root *log_root, struct btrfs_key *search_key, struct btrfs_inode *dir, struct btrfs_inode *inode, @@ -1108,6 +1107,7 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { + struct btrfs_trans_handle *trans = wc->trans; struct fscrypt_str victim_name; struct btrfs_inode_ref *victim_ref; int ret; @@ -1121,7 +1121,7 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, return ret; } - ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + ret = backref_in_log(wc->log, search_key, parent_objectid, &victim_name); if (ret) { kfree(victim_name.name); if (ret < 0) { @@ -1145,10 +1145,8 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, return 0; } -static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, +static int unlink_extrefs_not_in_log(struct walk_control *wc, struct btrfs_path *path, - struct btrfs_root *root, - struct btrfs_root *log_root, struct btrfs_key *search_key, struct btrfs_inode *inode, u64 inode_objectid, @@ -1160,6 +1158,9 @@ static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, u32 cur_offset = 0; while (cur_offset < item_size) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; + struct btrfs_root *log_root = wc->log; struct btrfs_inode_extref *extref; struct btrfs_inode *victim_parent; struct fscrypt_str victim_name; @@ -1218,16 +1219,16 @@ static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, return 0; } -static inline int __add_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static inline int __add_inode_ref(struct walk_control *wc, struct btrfs_path *path, - struct btrfs_root *log_root, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, u64 ref_index, struct fscrypt_str *name) { int ret; + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_dir_item *di; struct btrfs_key search_key; struct btrfs_inode_extref *extref; @@ -1249,8 +1250,8 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, if (search_key.objectid == search_key.offset) return 1; - ret = unlink_refs_not_in_log(trans, path, log_root, &search_key, - dir, inode, parent_objectid); + ret = unlink_refs_not_in_log(wc, path, &search_key, dir, inode, + parent_objectid); if (ret == -EAGAIN) goto again; else if (ret) @@ -1263,8 +1264,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, if (IS_ERR(extref)) { return PTR_ERR(extref); } else if (extref) { - ret = unlink_extrefs_not_in_log(trans, path, root, log_root, - &search_key, inode, + ret = unlink_extrefs_not_in_log(wc, path, &search_key, inode, inode_objectid, parent_objectid); if (ret == -EAGAIN) goto again; @@ -1349,14 
+1349,15 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, * proper unlink of that name (that is, remove its entry from the inode * reference item and both dir index keys). */ -static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_path *path, struct btrfs_inode *inode, struct extent_buffer *log_eb, int log_slot, struct btrfs_key *key) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; int ret; unsigned long ref_ptr; unsigned long ref_end; @@ -1441,13 +1442,13 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, * root is the destination we are replaying into, and path is for temp * use by this function. (it should be released on return). */ -static noinline int add_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, +static noinline int add_inode_ref(struct walk_control *wc, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_inode *dir = NULL; struct btrfs_inode *inode = NULL; unsigned long ref_ptr; @@ -1559,9 +1560,8 @@ static noinline int add_inode_ref(struct walk_control *wc, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, dir, inode, - inode_objectid, parent_objectid, - ref_index, &name); + ret = __add_inode_ref(wc, path, dir, inode, inode_objectid, + parent_objectid, ref_index, &name); if (ret) { if (ret == 1) ret = 0; @@ -1601,7 +1601,7 @@ static noinline int add_inode_ref(struct walk_control *wc, * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); + ret = unlink_old_inode_refs(wc, path, inode, eb, slot, key); if (ret) goto out; @@ -2584,7 +2584,6 @@ static int replay_one_buffer(struct extent_buffer *eb, }; struct btrfs_path *path; struct btrfs_root *root = wc->root; - struct btrfs_root *log = wc->log; struct btrfs_trans_handle *trans = wc->trans; struct btrfs_key key; int i; @@ -2725,7 +2724,7 @@ static int replay_one_buffer(struct extent_buffer *eb, break; } else if (key.type == BTRFS_INODE_REF_KEY || key.type == BTRFS_INODE_EXTREF_KEY) { - ret = add_inode_ref(trans, root, log, path, eb, i, &key); + ret = add_inode_ref(wc, path, eb, i, &key); if (ret) break; } else if (key.type == BTRFS_EXTENT_DATA_KEY) { From 744e0cebb4fa51fcca2d9d916dd3a8d353e2e6d1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 28 Aug 2025 13:54:40 +0100 Subject: [PATCH 2553/3206] btrfs: pass walk_control structure to replay_one_dir_item() and replay_one_name() Instead of passing the transaction, subvolume root and log tree as arguments, pass the walk_control structure as we can grab all of those from the structure. This reduces the number of arguments passed and it's going to be needed by an incoming change that improves error reporting for log replay.
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2ec9252115fd56..c4c2fbf291a1d1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1961,13 +1961,14 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a * non-existing inode) and 1 if the name was replayed. */ -static noinline int replay_one_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static noinline int replay_one_name(struct walk_control *wc, struct btrfs_path *path, struct extent_buffer *eb, struct btrfs_dir_item *di, struct btrfs_key *key) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; @@ -2107,8 +2108,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, } /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ -static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static noinline int replay_one_dir_item(struct walk_control *wc, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) @@ -2120,7 +2120,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ASSERT(key->type == BTRFS_DIR_INDEX_KEY); di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); - ret = replay_one_name(trans, root, path, eb, di, key); + ret = replay_one_name(wc, path, eb, di, key); if (ret < 0) return ret; @@ -2156,12 +2156,12 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, fixup_path = btrfs_alloc_path(); if (!fixup_path) { - btrfs_abort_transaction(trans, -ENOMEM); + btrfs_abort_transaction(wc->trans, -ENOMEM); return -ENOMEM; } btrfs_dir_item_key_to_cpu(eb, di, &di_key); - ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); + ret = link_to_fixup_dir(wc->trans, wc->root, fixup_path, di_key.objectid); btrfs_free_path(fixup_path); } @@ -2709,7 +2709,7 @@ static int replay_one_buffer(struct extent_buffer *eb, if (key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { - ret = replay_one_dir_item(trans, root, path, eb, i, &key); + ret = replay_one_dir_item(wc, path, eb, i, &key); if (ret) break; } From aa5b6635b0e6e059badf8f16a431c5647105383e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 28 Aug 2025 16:40:48 +0100 Subject: [PATCH 2554/3206] btrfs: pass walk_control structure to drop_one_dir_item() and helpers Instead of passing the transaction as an argument to drop_one_dir_item() and its helpers (link_to_fixup_dir() and unlink_inode_for_log_replay()), pass the walk_control structure as we can access both the transaction and the subvolume root from it. This is going to be needed by an incoming change that improves error reporting for log replay, and it also reduces the number of arguments passed to link_to_fixup_dir().
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c4c2fbf291a1d1..01a0f7cbcd4b00 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -156,8 +156,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx); -static int link_to_fixup_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int link_to_fixup_dir(struct walk_control *wc, struct btrfs_path *path, u64 objectid); static noinline int replay_dir_deletes(struct walk_control *wc, struct btrfs_path *path, @@ -927,11 +926,12 @@ static noinline int replay_one_extent(struct walk_control *wc, return ret; } -static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, +static int unlink_inode_for_log_replay(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name) { + struct btrfs_trans_handle *trans = wc->trans; int ret; ret = btrfs_unlink_inode(trans, dir, inode, name); @@ -959,11 +959,12 @@ static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, * This is a helper function to do the unlink of a specific directory * item */ -static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, +static noinline int drop_one_dir_item(struct walk_control *wc, struct btrfs_path *path, struct btrfs_inode *dir, struct btrfs_dir_item *di) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = dir->root; struct btrfs_inode *inode; struct fscrypt_str name; @@ -990,11 +991,11 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, goto out; } - ret = link_to_fixup_dir(trans, root, path, location.objectid); + ret = link_to_fixup_dir(wc, path, location.objectid); if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); out: kfree(name.name); if (inode) @@ -1135,7 +1136,7 @@ static int unlink_refs_not_in_log(struct walk_control *wc, inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name); kfree(victim_name.name); if (ret) return ret; @@ -1207,7 +1208,7 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = unlink_inode_for_log_replay(trans, victim_parent, inode, + ret = unlink_inode_for_log_replay(wc, victim_parent, inode, &victim_name); iput(&victim_parent->vfs_inode); kfree(victim_name.name); @@ -1281,7 +1282,7 @@ static inline int __add_inode_ref(struct walk_control *wc, btrfs_abort_transaction(trans, ret); return ret; } else if (di) { - ret = drop_one_dir_item(trans, path, dir, di); + ret = drop_one_dir_item(wc, path, dir, di); if (ret) return ret; } @@ -1292,7 +1293,7 @@ static inline int __add_inode_ref(struct walk_control *wc, if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { - ret = drop_one_dir_item(trans, path, dir, di); + ret = drop_one_dir_item(wc, path, dir, di); if (ret) return ret; } @@ -1415,7 +1416,7 @@ static int unlink_old_inode_refs(struct walk_control *wc, btrfs_abort_transaction(trans, ret); goto out; } - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = 
unlink_inode_for_log_replay(wc, dir, inode, &name); kfree(name.name); iput(&dir->vfs_inode); if (ret) @@ -1842,11 +1843,12 @@ static noinline int fixup_inode_link_counts(struct walk_control *wc, * count when replay is done. The link count is incremented here * so the inode won't go away until we check it */ -static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static noinline int link_to_fixup_dir(struct walk_control *wc, struct btrfs_path *path, u64 objectid) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_key key; int ret = 0; struct btrfs_inode *inode; @@ -1917,7 +1919,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return ret; } -static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, +static int delete_conflicting_dir_entry(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_path *path, struct btrfs_dir_item *dst_di, @@ -1942,7 +1944,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, if (!exists) return 0; - return drop_one_dir_item(trans, path, dir, dst_di); + return drop_one_dir_item(wc, path, dir, dst_di); } /* @@ -2014,7 +2016,7 @@ static noinline int replay_one_name(struct walk_control *wc, btrfs_abort_transaction(trans, ret); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + ret = delete_conflicting_dir_entry(wc, dir, path, dir_dst_di, &log_key, log_flags, exists); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -2033,7 +2035,7 @@ static noinline int replay_one_name(struct walk_control *wc, btrfs_abort_transaction(trans, ret); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + ret = delete_conflicting_dir_entry(wc, dir, path, index_dst_di, &log_key, log_flags, exists); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -2161,7 +2163,7 @@ static noinline int replay_one_dir_item(struct walk_control *wc, } btrfs_dir_item_key_to_cpu(eb, di, &di_key); - ret = link_to_fixup_dir(wc->trans, wc->root, fixup_path, di_key.objectid); + ret = link_to_fixup_dir(wc, fixup_path, di_key.objectid); btrfs_free_path(fixup_path); } @@ -2317,12 +2319,12 @@ static noinline int check_item_in_log(struct walk_control *wc, goto out; } - ret = link_to_fixup_dir(trans, root, path, location.objectid); + ret = link_to_fixup_dir(wc, path, location.objectid); if (ret) goto out; inc_nlink(&inode->vfs_inode); - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2699,7 +2701,7 @@ static int replay_one_buffer(struct extent_buffer *eb, break; } - ret = link_to_fixup_dir(trans, root, path, key.objectid); + ret = link_to_fixup_dir(wc, path, key.objectid); if (ret) break; } From 266967c0e225fe164819fa655a0829db8506ddde Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 28 Aug 2025 17:04:40 +0100 Subject: [PATCH 2555/3206] btrfs: pass walk_control structure to overwrite_item() Instead of passing the transaction and subvolume root as arguments to overwrite_item(), pass the walk_control structure as we can grab them from the structure. This reduces the number of arguments passed and it's going to be needed by an incoming change that improves error reporting for log replay. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 01a0f7cbcd4b00..2060f0d99f6e10 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -410,12 +410,13 @@ static int process_one_buffer(struct extent_buffer *eb, * * If the key isn't in the destination yet, a new item is inserted. */ -static int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int overwrite_item(struct walk_control *wc, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; int ret; u32 item_size; u64 saved_i_size = 0; @@ -739,7 +740,7 @@ static noinline int replay_one_extent(struct walk_control *wc, if (found_type == BTRFS_FILE_EXTENT_INLINE) { /* inline extents are easy, we just overwrite them */ - ret = overwrite_item(trans, root, path, eb, slot, key); + ret = overwrite_item(wc, path, eb, slot, key); if (ret) goto out; goto update_inode; @@ -1607,7 +1608,7 @@ static noinline int add_inode_ref(struct walk_control *wc, goto out; /* finally write the back reference in the inode */ - ret = overwrite_item(trans, root, path, eb, slot, key); + ret = overwrite_item(wc, path, eb, slot, key); out: btrfs_release_path(path); kfree(name.name); @@ -2657,7 +2658,7 @@ static int replay_one_buffer(struct extent_buffer *eb, if (ret) break; } - ret = overwrite_item(trans, root, path, eb, i, &key); + ret = overwrite_item(wc, path, eb, i, &key); if (ret) break; @@ -2721,7 +2722,7 @@ static int replay_one_buffer(struct extent_buffer *eb, /* these keys are simply copied */ if (key.type == BTRFS_XATTR_ITEM_KEY) { - ret = overwrite_item(trans, root, path, eb, i, &key); + ret = overwrite_item(wc, path, eb, i, &key); if (ret) break; } else if (key.type == BTRFS_INODE_REF_KEY || From c2ef817b285fb893a2a7b410e2cdb46f9badf06b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 28 Aug 2025 17:20:24 +0100 Subject: [PATCH 2556/3206] btrfs: use level argument in log tree walk callback process_one_buffer() We already have the extent buffer's level as an argument, so there's no need to call btrfs_header_level(). Use the level argument and make the code shorter. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2060f0d99f6e10..166ceb003a1eb1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -386,8 +386,7 @@ static int process_one_buffer(struct extent_buffer *eb, return ret; } - if (btrfs_buffer_uptodate(eb, gen, false) && - btrfs_header_level(eb) == 0) { + if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) { ret = btrfs_exclude_logged_extents(eb); if (ret) btrfs_abort_transaction(trans, ret); From 6cb7f0b8c9b0d6a35682335fea88bd26f089306f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 28 Aug 2025 17:46:18 +0100 Subject: [PATCH 2557/3206] btrfs: use level argument in log tree walk callback replay_one_buffer() We already have the extent buffer's level as an argument, so there's no need to first ensure the extent buffer's data is loaded (by calling btrfs_read_extent_buffer()) and then call btrfs_header_level() to check the level. Use the level argument and do the check before calling btrfs_read_extent_buffer(). 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 166ceb003a1eb1..88e813bb28d8d3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2591,17 +2591,15 @@ static int replay_one_buffer(struct extent_buffer *eb, int i; int ret; + if (level != 0) + return 0; + ret = btrfs_read_extent_buffer(eb, &check); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } - level = btrfs_header_level(eb); - - if (level != 0) - return 0; - path = btrfs_alloc_path(); if (!path) { btrfs_abort_transaction(trans, -ENOMEM); From 2a13cfc949e591702f1288d62c691099c67e04ca Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Aug 2025 16:46:15 +0100 Subject: [PATCH 2558/3206] btrfs: use the inode item boolean everywhere in overwrite_item() We have this boolean 'inode_item' to tell if we are processing an inode item key. We use it in a couple of places, while in another two places we open code it by checking if the key type matches the inode item type. Make this consistent and use the boolean everywhere. Also rename it from 'inode_item' to 'is_inode_item', which makes it more clear that it's a boolean and not an instance of struct btrfs_inode_item, and make it const too. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 88e813bb28d8d3..d830c33be7c65b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -424,7 +424,7 @@ static int overwrite_item(struct walk_control *wc, unsigned long dst_ptr; struct extent_buffer *dst_eb; int dst_slot; - bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; + const bool is_inode_item = (key->type == BTRFS_INODE_ITEM_KEY); /* * This is only used during log replay, so the root is always from a @@ -486,7 +486,7 @@ static int overwrite_item(struct walk_control *wc, * We need to load the old nbytes into the inode so when we * replay the extents we've logged we get the right nbytes. */ - if (inode_item) { + if (is_inode_item) { struct btrfs_inode_item *item; u64 nbytes; u32 mode; @@ -507,7 +507,7 @@ static int overwrite_item(struct walk_control *wc, if (S_ISDIR(mode)) btrfs_set_inode_size(eb, item, 0); } - } else if (inode_item) { + } else if (is_inode_item) { struct btrfs_inode_item *item; u32 mode; @@ -561,7 +561,7 @@ static int overwrite_item(struct walk_control *wc, * state of the tree found in the subvolume, and i_size is modified * as it goes */ - if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + if (is_inode_item && ret == -EEXIST) { struct btrfs_inode_item *src_item; struct btrfs_inode_item *dst_item; @@ -602,7 +602,7 @@ static int overwrite_item(struct walk_control *wc, } /* make sure the generation is filled in */ - if (key->type == BTRFS_INODE_ITEM_KEY) { + if (is_inode_item) { struct btrfs_inode_item *dst_item; dst_item = (struct btrfs_inode_item *)dst_ptr; From 2ac70946621ff3d90eb84d2e3b74563f4154f24c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 1 Sep 2025 12:13:48 +0100 Subject: [PATCH 2559/3206] btrfs: add current log leaf, key and slot to struct walk_control A lot of the log replay functions get passed the current log leaf being processed as well as the current slot and the key at that slot. 
Instead of passing them as parameters, add them to struct walk_control so that we reduce the number of parameters. This is also going to be needed by further changes that improve error reporting during log replay. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 255 ++++++++++++++++++++++---------------------- 1 file changed, 126 insertions(+), 129 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d830c33be7c65b..b4e901da9e8ba0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -150,6 +150,18 @@ struct walk_control { */ int (*process_func)(struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level); + + /* + * The following are used only when stage is >= LOG_WALK_REPLAY_INODES + * and by the replay_one_buffer() callback. + */ + + /* The current log leaf being processed. */ + struct extent_buffer *log_leaf; + /* The key being processed of the current log leaf. */ + struct btrfs_key log_key; + /* The slot being processed of the current log leaf. */ + int log_slot; }; static int btrfs_log_inode(struct btrfs_trans_handle *trans, @@ -396,8 +408,9 @@ static int process_one_buffer(struct extent_buffer *eb, } /* - * Item overwrite used by log replay. The given eb, slot and key all refer to - * the source data we are copying out. + * Item overwrite used by log replay. The given log tree leaf, slot and key + * from the walk_control structure all refer to the source data we are copying + * out. * * The given root is for the tree we are copying into, and path is a scratch * path for use in this function (it should be released on entry and will be @@ -409,10 +422,7 @@ static int process_one_buffer(struct extent_buffer *eb, * * If the key isn't in the destination yet, a new item is inserted. */ -static int overwrite_item(struct walk_control *wc, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static int overwrite_item(struct walk_control *wc, struct btrfs_path *path) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -424,7 +434,7 @@ static int overwrite_item(struct walk_control *wc, unsigned long dst_ptr; struct extent_buffer *dst_eb; int dst_slot; - const bool is_inode_item = (key->type == BTRFS_INODE_ITEM_KEY); + const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY); /* * This is only used during log replay, so the root is always from a @@ -435,11 +445,11 @@ static int overwrite_item(struct walk_control *wc, */ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); - item_size = btrfs_item_size(eb, slot); - src_ptr = btrfs_item_ptr_offset(eb, slot); + item_size = btrfs_item_size(wc->log_leaf, wc->log_slot); + src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); /* Look for the key in the destination tree. 
*/ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &wc->log_key, path, 0, 0); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; @@ -466,7 +476,7 @@ static int overwrite_item(struct walk_control *wc, return -ENOMEM; } - read_extent_buffer(eb, src_copy, src_ptr, item_size); + read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size); dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size); @@ -494,18 +504,18 @@ static int overwrite_item(struct walk_control *wc, item = btrfs_item_ptr(dst_eb, dst_slot, struct btrfs_inode_item); nbytes = btrfs_inode_nbytes(dst_eb, item); - item = btrfs_item_ptr(eb, slot, + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item); - btrfs_set_inode_nbytes(eb, item, nbytes); + btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes); /* * If this is a directory we need to reset the i_size to * 0 so that we can set it up properly when replaying * the rest of the items in this log. */ - mode = btrfs_inode_mode(eb, item); + mode = btrfs_inode_mode(wc->log_leaf, item); if (S_ISDIR(mode)) - btrfs_set_inode_size(eb, item, 0); + btrfs_set_inode_size(wc->log_leaf, item, 0); } } else if (is_inode_item) { struct btrfs_inode_item *item; @@ -515,24 +525,23 @@ static int overwrite_item(struct walk_control *wc, * New inode, set nbytes to 0 so that the nbytes comes out * properly when we replay the extents. */ - item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); - btrfs_set_inode_nbytes(eb, item, 0); + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(wc->log_leaf, item, 0); /* * If this is a directory we need to reset the i_size to 0 so * that we can set it up properly when replaying the rest of * the items in this log. */ - mode = btrfs_inode_mode(eb, item); + mode = btrfs_inode_mode(wc->log_leaf, item); if (S_ISDIR(mode)) - btrfs_set_inode_size(eb, item, 0); + btrfs_set_inode_size(wc->log_leaf, item, 0); } insert: btrfs_release_path(path); /* try to insert the key into the destination tree */ path->skip_release_on_error = 1; - ret = btrfs_insert_empty_item(trans, root, path, - key, item_size); + ret = btrfs_insert_empty_item(trans, root, path, &wc->log_key, item_size); path->skip_release_on_error = 0; dst_eb = path->nodes[0]; @@ -568,8 +577,8 @@ static int overwrite_item(struct walk_control *wc, src_item = (struct btrfs_inode_item *)src_ptr; dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(eb, src_item) == 0) { - const u64 ino_size = btrfs_inode_size(eb, src_item); + if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) { + const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item); /* * For regular files an ino_size == 0 is used only when @@ -578,21 +587,21 @@ static int overwrite_item(struct walk_control *wc, * case don't set the size of the inode in the fs/subvol * tree, otherwise we would be throwing valid data away. 
*/ - if (S_ISREG(btrfs_inode_mode(eb, src_item)) && + if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) && S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && ino_size != 0) btrfs_set_inode_size(dst_eb, dst_item, ino_size); goto no_copy; } - if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && + if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) && S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) { save_old_i_size = 1; saved_i_size = btrfs_inode_size(dst_eb, dst_item); } } - copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size); + copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size); if (save_old_i_size) { struct btrfs_inode_item *dst_item; @@ -641,10 +650,7 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * The extent is inserted into the file, dropping any existing extents * from the file that overlap the new one. */ -static noinline int replay_one_extent(struct walk_control *wc, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_path *path) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -652,7 +658,7 @@ static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_fs_info *fs_info = root->fs_info; int found_type; u64 extent_end; - u64 start = key->offset; + const u64 start = wc->log_key.offset; u64 nbytes = 0; u64 csum_start; u64 csum_end; @@ -665,34 +671,35 @@ static noinline int replay_one_extent(struct walk_control *wc, unsigned long size; int ret = 0; - item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(eb, item); + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(wc->log_leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - nbytes = btrfs_file_extent_num_bytes(eb, item); + nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item); extent_end = start + nbytes; /* * We don't add to the inodes nbytes if we are prealloc or a * hole. 
*/ - if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) nbytes = 0; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_ram_bytes(eb, item); - nbytes = btrfs_file_extent_ram_bytes(eb, item); + size = btrfs_file_extent_ram_bytes(wc->log_leaf, item); + nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item); extent_end = ALIGN(start + size, fs_info->sectorsize); } else { btrfs_abort_transaction(trans, -EUCLEAN); btrfs_err(fs_info, "unexpected extent type=%d root=%llu inode=%llu offset=%llu", - found_type, btrfs_root_id(root), key->objectid, key->offset); + found_type, btrfs_root_id(root), wc->log_key.objectid, + wc->log_key.offset); return -EUCLEAN; } - inode = btrfs_iget_logging(key->objectid, root); + inode = btrfs_iget_logging(wc->log_key.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); btrfs_abort_transaction(trans, ret); @@ -719,7 +726,7 @@ static noinline int replay_one_extent(struct walk_control *wc, * we already have a pointer to this exact extent, * we don't have to do anything */ - if (memcmp_extent_buffer(eb, &existing, (unsigned long)item, + if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item, sizeof(existing)) == 0) { btrfs_release_path(path); goto out; @@ -739,7 +746,7 @@ static noinline int replay_one_extent(struct walk_control *wc, if (found_type == BTRFS_FILE_EXTENT_INLINE) { /* inline extents are easy, we just overwrite them */ - ret = overwrite_item(wc, path, eb, slot, key); + ret = overwrite_item(wc, path); if (ret) goto out; goto update_inode; @@ -751,18 +758,18 @@ static noinline int replay_one_extent(struct walk_control *wc, */ /* A hole and NO_HOLES feature enabled, nothing else to do. */ - if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) goto update_inode; - ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); + ret = btrfs_insert_empty_item(trans, root, path, &wc->log_key, sizeof(*item)); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } dest_offset = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - copy_extent_buffer(path->nodes[0], eb, dest_offset, (unsigned long)item, - sizeof(*item)); + copy_extent_buffer(path->nodes[0], wc->log_leaf, dest_offset, + (unsigned long)item, sizeof(*item)); /* * We have an explicit hole and NO_HOLES is not enabled. We have added @@ -770,15 +777,15 @@ static noinline int replay_one_extent(struct walk_control *wc, * anything else to do other than update the file extent item range and * update the inode item. 
*/ - if (btrfs_file_extent_disk_bytenr(eb, item) == 0) { + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) { btrfs_release_path(path); goto update_inode; } - ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item); ins.type = BTRFS_EXTENT_ITEM_KEY; - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); - offset = key->offset - btrfs_file_extent_offset(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item); + offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item); /* * Manually record dirty extent, as here we did a shallow file extent @@ -810,7 +817,7 @@ static noinline int replay_one_extent(struct walk_control *wc, .ref_root = btrfs_root_id(root), }; - btrfs_init_data_ref(&ref, key->objectid, offset, 0, false); + btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -819,7 +826,7 @@ static noinline int replay_one_extent(struct walk_control *wc, } else { /* Insert the extent pointer in the extent tree. */ ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root), - key->objectid, offset, &ins); + wc->log_key.objectid, offset, &ins); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -828,12 +835,12 @@ static noinline int replay_one_extent(struct walk_control *wc, btrfs_release_path(path); - if (btrfs_file_extent_compression(eb, item)) { + if (btrfs_file_extent_compression(wc->log_leaf, item)) { csum_start = ins.objectid; csum_end = csum_start + ins.offset; } else { - csum_start = ins.objectid + btrfs_file_extent_offset(eb, item); - csum_end = csum_start + btrfs_file_extent_num_bytes(eb, item); + csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item); + csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item); } ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, @@ -1352,10 +1359,7 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, */ static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_path *path, - struct btrfs_inode *inode, - struct extent_buffer *log_eb, - int log_slot, - struct btrfs_key *key) + struct btrfs_inode *inode) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -1366,7 +1370,7 @@ static int unlink_old_inode_refs(struct walk_control *wc, again: btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &wc->log_key, path, 0, 0); if (ret > 0) { ret = 0; goto out; @@ -1383,7 +1387,7 @@ static int unlink_old_inode_refs(struct walk_control *wc, struct fscrypt_str name; u64 parent_id; - if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) { ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); if (ret) { @@ -1391,7 +1395,7 @@ static int unlink_old_inode_refs(struct walk_control *wc, goto out; } } else { - parent_id = key->offset; + parent_id = wc->log_key.offset; ret = ref_get_fields(eb, ref_ptr, &name, NULL); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1399,11 +1403,12 @@ static int unlink_old_inode_refs(struct walk_control *wc, } } - if (key->type == BTRFS_INODE_EXTREF_KEY) - ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) + ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot, parent_id, &name); else - ret = 
!!btrfs_find_name_in_backref(log_eb, log_slot, &name); + ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot, + &name); if (!ret) { struct btrfs_inode *dir; @@ -1426,7 +1431,7 @@ static int unlink_old_inode_refs(struct walk_control *wc, kfree(name.name); ref_ptr += name.len; - if (key->type == BTRFS_INODE_EXTREF_KEY) + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) ref_ptr += sizeof(struct btrfs_inode_extref); else ref_ptr += sizeof(struct btrfs_inode_ref); @@ -1438,15 +1443,10 @@ static int unlink_old_inode_refs(struct walk_control *wc, } /* - * replay one inode back reference item found in the log tree. - * eb, slot and key refer to the buffer and key found in the log tree. - * root is the destination we are replaying into, and path is for temp - * use by this function. (it should be released on return). + * Replay one inode back reference item found in the log tree. + * Path is for temporary use by this function (it should be released on return). */ -static noinline int add_inode_ref(struct walk_control *wc, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int add_inode_ref(struct walk_control *wc, struct btrfs_path *path) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -1456,26 +1456,26 @@ static noinline int add_inode_ref(struct walk_control *wc, unsigned long ref_end; struct fscrypt_str name = { 0 }; int ret; - const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY); + const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY); u64 parent_objectid; u64 inode_objectid; u64 ref_index = 0; int ref_struct_size; - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size(eb, slot); + ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); + ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot); if (is_extref_item) { struct btrfs_inode_extref *r; ref_struct_size = sizeof(struct btrfs_inode_extref); r = (struct btrfs_inode_extref *)ref_ptr; - parent_objectid = btrfs_inode_extref_parent(eb, r); + parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r); } else { ref_struct_size = sizeof(struct btrfs_inode_ref); - parent_objectid = key->offset; + parent_objectid = wc->log_key.offset; } - inode_objectid = key->objectid; + inode_objectid = wc->log_key.objectid; /* * it is possible that we didn't log all the parent directories @@ -1504,7 +1504,7 @@ static noinline int add_inode_ref(struct walk_control *wc, while (ref_ptr < ref_end) { if (is_extref_item) { - ret = extref_get_fields(eb, ref_ptr, &name, + ret = extref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index, &parent_objectid); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1541,7 +1541,7 @@ static noinline int add_inode_ref(struct walk_control *wc, } } } else { - ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); + ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -1602,12 +1602,12 @@ static noinline int add_inode_ref(struct walk_control *wc, * dir index entries exist for a name but there is no inode reference * item with the same name. 
*/ - ret = unlink_old_inode_refs(wc, path, inode, eb, slot, key); + ret = unlink_old_inode_refs(wc, path, inode); if (ret) goto out; /* finally write the back reference in the inode */ - ret = overwrite_item(wc, path, eb, slot, key); + ret = overwrite_item(wc, path); out: btrfs_release_path(path); kfree(name.name); @@ -1965,9 +1965,7 @@ static int delete_conflicting_dir_entry(struct walk_control *wc, */ static noinline int replay_one_name(struct walk_control *wc, struct btrfs_path *path, - struct extent_buffer *eb, - struct btrfs_dir_item *di, - struct btrfs_key *key) + struct btrfs_dir_item *di) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -1985,21 +1983,22 @@ static noinline int replay_one_name(struct walk_control *wc, bool update_size = true; bool name_added = false; - dir = btrfs_iget_logging(key->objectid, root); + dir = btrfs_iget_logging(wc->log_key.objectid, root); if (IS_ERR(dir)) { ret = PTR_ERR(dir); btrfs_abort_transaction(trans, ret); return ret; } - ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + ret = read_alloc_one_name(wc->log_leaf, di + 1, + btrfs_dir_name_len(wc->log_leaf, di), &name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - log_flags = btrfs_dir_flags(eb, di); - btrfs_dir_item_key_to_cpu(eb, di, &log_key); + log_flags = btrfs_dir_flags(wc->log_leaf, di); + btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key); ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); if (ret < 0) { @@ -2009,7 +2008,7 @@ static noinline int replay_one_name(struct walk_control *wc, exists = (ret == 0); ret = 0; - dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + dir_dst_di = btrfs_lookup_dir_item(trans, root, path, wc->log_key.objectid, &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); @@ -2028,8 +2027,8 @@ static noinline int replay_one_name(struct walk_control *wc, btrfs_release_path(path); index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, - key->objectid, key->offset, - &name, 1); + wc->log_key.objectid, + wc->log_key.offset, &name, 1); if (IS_ERR(index_dst_di)) { ret = PTR_ERR(index_dst_di); btrfs_abort_transaction(trans, ret); @@ -2058,7 +2057,7 @@ static noinline int replay_one_name(struct walk_control *wc, */ search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = key->objectid; + search_key.offset = wc->log_key.objectid; ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -2072,8 +2071,8 @@ static noinline int replay_one_name(struct walk_control *wc, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len); - ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); + search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len); + ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -2084,7 +2083,7 @@ static noinline int replay_one_name(struct walk_control *wc, goto out; } btrfs_release_path(path); - ret = insert_one_name(trans, root, key->objectid, key->offset, + ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset, &name, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); @@ -2111,18 +2110,16 @@ static noinline int 
replay_one_name(struct walk_control *wc, /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ static noinline int replay_one_dir_item(struct walk_control *wc, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) + struct btrfs_path *path) { int ret; struct btrfs_dir_item *di; /* We only log dir index keys, which only contain a single dir item. */ - ASSERT(key->type == BTRFS_DIR_INDEX_KEY); + ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY); - di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); - ret = replay_one_name(wc, path, eb, di, key); + di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item); + ret = replay_one_name(wc, path, di); if (ret < 0) return ret; @@ -2152,7 +2149,7 @@ static noinline int replay_one_dir_item(struct walk_control *wc, * to ever delete the parent directory has it would result in stale * dentries that can never be deleted. */ - if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) { + if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) { struct btrfs_path *fixup_path; struct btrfs_key di_key; @@ -2162,7 +2159,7 @@ static noinline int replay_one_dir_item(struct walk_control *wc, return -ENOMEM; } - btrfs_dir_item_key_to_cpu(eb, di, &di_key); + btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key); ret = link_to_fixup_dir(wc, fixup_path, di_key.objectid); btrfs_free_path(fixup_path); } @@ -2339,15 +2336,14 @@ static noinline int check_item_in_log(struct walk_control *wc, return ret; } -static int replay_xattr_deletes(struct walk_control *wc, - struct btrfs_path *path, - const u64 ino) +static int replay_xattr_deletes(struct walk_control *wc, struct btrfs_path *path) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; struct btrfs_root *log = wc->log; struct btrfs_key search_key; struct btrfs_path *log_path; + const u64 ino = wc->log_key.objectid; int i; int nritems; int ret; @@ -2587,8 +2583,6 @@ static int replay_one_buffer(struct extent_buffer *eb, struct btrfs_path *path; struct btrfs_root *root = wc->root; struct btrfs_trans_handle *trans = wc->trans; - struct btrfs_key key; - int i; int ret; if (level != 0) @@ -2606,14 +2600,17 @@ static int replay_one_buffer(struct extent_buffer *eb, return -ENOMEM; } + wc->log_leaf = eb; + nritems = btrfs_header_nritems(eb); - for (i = 0; i < nritems; i++) { + for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) { struct btrfs_inode_item *inode_item; - btrfs_item_key_to_cpu(eb, &key, i); + btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot); - if (key.type == BTRFS_INODE_ITEM_KEY) { - inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(eb, wc->log_slot, + struct btrfs_inode_item); /* * An inode with no links is either: * @@ -2642,20 +2639,20 @@ static int replay_one_buffer(struct extent_buffer *eb, } /* Inode keys are done during the first stage. 
*/ - if (key.type == BTRFS_INODE_ITEM_KEY && + if (wc->log_key.type == BTRFS_INODE_ITEM_KEY && wc->stage == LOG_WALK_REPLAY_INODES) { u32 mode; - ret = replay_xattr_deletes(wc, path, key.objectid); + ret = replay_xattr_deletes(wc, path); if (ret) break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { - ret = replay_dir_deletes(wc, path, key.objectid, false); + ret = replay_dir_deletes(wc, path, wc->log_key.objectid, false); if (ret) break; } - ret = overwrite_item(wc, path, eb, i, &key); + ret = overwrite_item(wc, path); if (ret) break; @@ -2672,7 +2669,7 @@ static int replay_one_buffer(struct extent_buffer *eb, struct btrfs_inode *inode; u64 from; - inode = btrfs_iget_logging(key.objectid, root); + inode = btrfs_iget_logging(wc->log_key.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); btrfs_abort_transaction(trans, ret); @@ -2699,7 +2696,7 @@ static int replay_one_buffer(struct extent_buffer *eb, break; } - ret = link_to_fixup_dir(wc, path, key.objectid); + ret = link_to_fixup_dir(wc, path, wc->log_key.objectid); if (ret) break; } @@ -2707,9 +2704,9 @@ static int replay_one_buffer(struct extent_buffer *eb, if (wc->ignore_cur_inode) continue; - if (key.type == BTRFS_DIR_INDEX_KEY && + if (wc->log_key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { - ret = replay_one_dir_item(wc, path, eb, i, &key); + ret = replay_one_dir_item(wc, path); if (ret) break; } @@ -2718,17 +2715,17 @@ static int replay_one_buffer(struct extent_buffer *eb, continue; /* these keys are simply copied */ - if (key.type == BTRFS_XATTR_ITEM_KEY) { - ret = overwrite_item(wc, path, eb, i, &key); + if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc, path); if (ret) break; - } else if (key.type == BTRFS_INODE_REF_KEY || - key.type == BTRFS_INODE_EXTREF_KEY) { - ret = add_inode_ref(wc, path, eb, i, &key); + } else if (wc->log_key.type == BTRFS_INODE_REF_KEY || + wc->log_key.type == BTRFS_INODE_EXTREF_KEY) { + ret = add_inode_ref(wc, path); if (ret) break; - } else if (key.type == BTRFS_EXTENT_DATA_KEY) { - ret = replay_one_extent(wc, path, eb, i, &key); + } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc, path); if (ret) break; } From 29d9c5e0370593cec8298601ab3e9ce29799fb6f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 1 Sep 2025 12:43:42 +0100 Subject: [PATCH 2560/3206] btrfs: avoid unnecessary path allocation at fixup_inode_link_count() There's no need to allocate a path as our single caller already has a path that we can use. So pass the caller's path and use it. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b4e901da9e8ba0..5754333ae732df 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1728,19 +1728,15 @@ static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path) * will free the inode. 
*/ static noinline int fixup_inode_link_count(struct walk_control *wc, + struct btrfs_path *path, struct btrfs_inode *inode) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = inode->root; - struct btrfs_path *path; int ret; u64 nlink = 0; const u64 ino = btrfs_ino(inode); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = count_inode_refs(inode, path); if (ret < 0) goto out; @@ -1776,7 +1772,7 @@ static noinline int fixup_inode_link_count(struct walk_control *wc, } out: - btrfs_free_path(path); + btrfs_release_path(path); return ret; } @@ -1821,7 +1817,7 @@ static noinline int fixup_inode_link_counts(struct walk_control *wc, break; } - ret = fixup_inode_link_count(wc, inode); + ret = fixup_inode_link_count(wc, path, inode); iput(&inode->vfs_inode); if (ret) break; From b343047c1a08779efb0d2691a8c6391318025cc6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 1 Sep 2025 15:51:19 +0100 Subject: [PATCH 2561/3206] btrfs: avoid path allocations when dropping extents during log replay We can avoid a path allocation in the btrfs_drop_extents() calls we have at replay_one_extent() and replay_one_buffer() by passing the path we already have in those contexts as it's unused by the time they call btrfs_drop_extents(). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5754333ae732df..a912ccdf14855c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -738,6 +738,7 @@ static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_path drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; + drop_args.path = path; ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2676,6 +2677,7 @@ static int replay_one_buffer(struct extent_buffer *eb, drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; + drop_args.path = path; ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); From f366722f3370dd561d45e667484c458245977e79 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 1 Sep 2025 16:37:05 +0100 Subject: [PATCH 2562/3206] btrfs: avoid unnecessary path allocation when replaying a dir item There's no need to allocate 'fixup_path' at replay_one_dir_item(), as the path passed as an argument is unused by the time link_to_fixup_dir() is called (replay_one_name() releases the path before it returns). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a912ccdf14855c..de1f1c024dc0f9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2147,18 +2147,10 @@ static noinline int replay_one_dir_item(struct walk_control *wc, * dentries that can never be deleted. 
*/ if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) { - struct btrfs_path *fixup_path; struct btrfs_key di_key; - fixup_path = btrfs_alloc_path(); - if (!fixup_path) { - btrfs_abort_transaction(wc->trans, -ENOMEM); - return -ENOMEM; - } - btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key); - ret = link_to_fixup_dir(wc, fixup_path, di_key.objectid); - btrfs_free_path(fixup_path); + ret = link_to_fixup_dir(wc, path, di_key.objectid); } return ret; From 9bdfa3eddb67e73f57d1e527e1ba2dcca3bd08c4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 1 Sep 2025 17:03:46 +0100 Subject: [PATCH 2563/3206] btrfs: remove redundant path release when processing dentry during log replay At replay_one_name() we have a redundant btrfs_release_path() just before calling insert_one_name(), as some lines above we have already released the path with another btrfs_release_path() call. So remove it. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index de1f1c024dc0f9..65b8858e82d1d7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2079,7 +2079,6 @@ static noinline int replay_one_name(struct walk_control *wc, update_size = false; goto out; } - btrfs_release_path(path); ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset, &name, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); From f9c02e4b525d766bebfcbeac1b25b03aacfe62f1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 2 Sep 2025 17:08:19 +0100 Subject: [PATCH 2564/3206] btrfs: remove redundant path release when overwriting item during log replay At overwrite_item() we have a redundant btrfs_release_path() just before failing with -ENOMEM, as the caller who passed in the path will free it and therefore also release any refcounts and locks on the extent buffers of the path. So remove it. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 65b8858e82d1d7..66500ebdd35c77 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -471,7 +471,6 @@ static int overwrite_item(struct walk_control *wc, struct btrfs_path *path) } src_copy = kmalloc(item_size, GFP_NOFS); if (!src_copy) { - btrfs_release_path(path); btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; } From 1ebeee283a2ac8ea29a705d7f03a15e3aae30f22 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 2 Sep 2025 12:16:17 +0100 Subject: [PATCH 2565/3206] btrfs: add path for subvolume tree changes to struct walk_control While replaying log trees we need to do searches and updates to subvolume trees and for that we use a path that we allocate in replay_one_buffer() and then pass it as a parameter to other functions deeper in the log replay call chain. Instead of passing it as a parameter, add it to struct walk_control since we pass a pointer to that structure for every log replay function. This reduces the number of arguments passed to the functions and it will be needed and important for upcoming changes that improve error reporting for log replay. 
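As a loose sketch of that plan (hypothetical user-space C with invented names; the real code uses btrfs_alloc_path(), btrfs_release_path() and btrfs_free_path()), the context owns one scratch path that helpers reset for reuse instead of each allocating their own:

#include <stdlib.h>
#include <string.h>

/* Invented stand-in for struct btrfs_path. */
struct tree_path {
	void *nodes[8];		/* stand-ins for referenced tree blocks */
};

struct walk_ctx {
	struct tree_path *subvol_path;	/* shared scratch path */
};

/* Analogous to btrfs_release_path(): drop references, keep the allocation. */
static void path_release(struct tree_path *path)
{
	memset(path->nodes, 0, sizeof(path->nodes));
}

static int helper_search(struct walk_ctx *wc)
{
	wc->subvol_path->nodes[0] = (void *)0x1;	/* pretend search result */
	/* ... use the search result ... */
	path_release(wc->subvol_path);	/* leave it clean for the next helper */
	return 0;
}

int main(void)
{
	struct walk_ctx wc;

	wc.subvol_path = calloc(1, sizeof(*wc.subvol_path));
	if (!wc.subvol_path)
		return 1;
	helper_search(&wc);
	helper_search(&wc);	/* same path reused, no per-helper allocation */
	free(wc.subvol_path);	/* only the top level frees it */
	return 0;
}

Each helper releases the path before returning, so ownership stays with the top-level caller and no per-helper allocations are needed.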
Also name the new field in struct walk_control 'subvol_path' - while that is longer to type, the naming makes it clear it's used for subvolume tree operations, since many of the log replay functions operate both on subvolume and log trees, just like struct walk_control::log_leaf makes it obvious that member is an extent buffer from a log tree. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 323 +++++++++++++++++++++----------------------- 1 file changed, 156 insertions(+), 167 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 66500ebdd35c77..747daaaaf332a0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -162,16 +162,17 @@ struct walk_control { struct btrfs_key log_key; /* The slot being processed of the current log leaf. */ int log_slot; + + /* A path used for searches and modifications to subvolume trees. */ + struct btrfs_path *subvol_path; }; static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx); -static int link_to_fixup_dir(struct walk_control *wc, - struct btrfs_path *path, u64 objectid); +static int link_to_fixup_dir(struct walk_control *wc, u64 objectid); static noinline int replay_dir_deletes(struct walk_control *wc, - struct btrfs_path *path, u64 dirid, bool del_all); static void wait_log_commit(struct btrfs_root *root, int transid); @@ -422,7 +423,7 @@ static int process_one_buffer(struct extent_buffer *eb, * * If the key isn't in the destination yet, a new item is inserted. */ -static int overwrite_item(struct walk_control *wc, struct btrfs_path *path) +static int overwrite_item(struct walk_control *wc) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -449,14 +450,14 @@ static int overwrite_item(struct walk_control *wc, struct btrfs_path *path) src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); /* Look for the key in the destination tree. 
*/ - ret = btrfs_search_slot(NULL, root, &wc->log_key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; } - dst_eb = path->nodes[0]; - dst_slot = path->slots[0]; + dst_eb = wc->subvol_path->nodes[0]; + dst_slot = wc->subvol_path->slots[0]; if (ret == 0) { char *src_copy; @@ -466,7 +467,7 @@ static int overwrite_item(struct walk_control *wc, struct btrfs_path *path) goto insert; if (item_size == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } src_copy = kmalloc(item_size, GFP_NOFS); @@ -487,7 +488,7 @@ static int overwrite_item(struct walk_control *wc, struct btrfs_path *path) * sync */ if (ret == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -537,23 +538,23 @@ static int overwrite_item(struct walk_control *wc, struct btrfs_path *path) btrfs_set_inode_size(wc->log_leaf, item, 0); } insert: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* try to insert the key into the destination tree */ - path->skip_release_on_error = 1; - ret = btrfs_insert_empty_item(trans, root, path, &wc->log_key, item_size); - path->skip_release_on_error = 0; + wc->subvol_path->skip_release_on_error = 1; + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size); + wc->subvol_path->skip_release_on_error = 0; - dst_eb = path->nodes[0]; - dst_slot = path->slots[0]; + dst_eb = wc->subvol_path->nodes[0]; + dst_slot = wc->subvol_path->slots[0]; /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { const u32 found_size = btrfs_item_size(dst_eb, dst_slot); if (found_size > item_size) - btrfs_truncate_item(trans, path, item_size, 1); + btrfs_truncate_item(trans, wc->subvol_path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(trans, path, item_size - found_size); + btrfs_extend_item(trans, wc->subvol_path, item_size - found_size); } else if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -618,7 +619,7 @@ static int overwrite_item(struct walk_control *wc, struct btrfs_path *path) btrfs_set_inode_generation(dst_eb, dst_item, trans->transid); } no_copy: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -649,7 +650,7 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * The extent is inserted into the file, dropping any existing extents * from the file that overlap the new one. */ -static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_path *path) +static noinline int replay_one_extent(struct walk_control *wc) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -710,16 +711,18 @@ static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_path * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. 
*/ - ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); + ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path, + btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; struct btrfs_file_extent_item existing; unsigned long ptr; - ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing)); + ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + read_extent_buffer(leaf, &existing, ptr, sizeof(existing)); /* * we already have a pointer to this exact extent, @@ -727,17 +730,17 @@ static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_path */ if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item, sizeof(existing)) == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); goto out; } } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* drop any overlapping extents */ drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; - drop_args.path = path; + drop_args.path = wc->subvol_path; ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); @@ -746,7 +749,7 @@ static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_path if (found_type == BTRFS_FILE_EXTENT_INLINE) { /* inline extents are easy, we just overwrite them */ - ret = overwrite_item(wc, path); + ret = overwrite_item(wc); if (ret) goto out; goto update_inode; @@ -762,13 +765,15 @@ static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_path btrfs_fs_incompat(fs_info, NO_HOLES)) goto update_inode; - ret = btrfs_insert_empty_item(trans, root, path, &wc->log_key, sizeof(*item)); + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, + &wc->log_key, sizeof(*item)); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - dest_offset = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - copy_extent_buffer(path->nodes[0], wc->log_leaf, dest_offset, + dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0], + wc->subvol_path->slots[0]); + copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset, (unsigned long)item, sizeof(*item)); /* @@ -778,7 +783,7 @@ static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_path * update the inode item. 
*/ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); goto update_inode; } @@ -833,7 +838,7 @@ static noinline int replay_one_extent(struct walk_control *wc, struct btrfs_path } } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (btrfs_file_extent_compression(wc->log_leaf, item)) { csum_start = ins.objectid; @@ -967,7 +972,6 @@ static int unlink_inode_for_log_replay(struct walk_control *wc, * item */ static noinline int drop_one_dir_item(struct walk_control *wc, - struct btrfs_path *path, struct btrfs_inode *dir, struct btrfs_dir_item *di) { @@ -975,12 +979,10 @@ static noinline int drop_one_dir_item(struct walk_control *wc, struct btrfs_root *root = dir->root; struct btrfs_inode *inode; struct fscrypt_str name; - struct extent_buffer *leaf; + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; struct btrfs_key location; int ret; - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &location); ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); if (ret) { @@ -988,7 +990,7 @@ static noinline int drop_one_dir_item(struct walk_control *wc, return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); inode = btrfs_iget_logging(location.objectid, root); if (IS_ERR(inode)) { @@ -998,7 +1000,7 @@ static noinline int drop_one_dir_item(struct walk_control *wc, goto out; } - ret = link_to_fixup_dir(wc, path, location.objectid); + ret = link_to_fixup_dir(wc, location.objectid); if (ret) goto out; @@ -1097,13 +1099,12 @@ static noinline int backref_in_log(struct btrfs_root *log, } static int unlink_refs_not_in_log(struct walk_control *wc, - struct btrfs_path *path, struct btrfs_key *search_key, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 parent_objectid) { - struct extent_buffer *leaf = path->nodes[0]; + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; unsigned long ptr; unsigned long ptr_end; @@ -1112,8 +1113,8 @@ static int unlink_refs_not_in_log(struct walk_control *wc, * log. If so, we allow them to stay otherwise they must be unlinked as * a conflict. 
*/ - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]); while (ptr < ptr_end) { struct btrfs_trans_handle *trans = wc->trans; struct fscrypt_str victim_name; @@ -1141,7 +1142,7 @@ static int unlink_refs_not_in_log(struct walk_control *wc, } inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name); kfree(victim_name.name); @@ -1154,15 +1155,14 @@ static int unlink_refs_not_in_log(struct walk_control *wc, } static int unlink_extrefs_not_in_log(struct walk_control *wc, - struct btrfs_path *path, struct btrfs_key *search_key, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid) { - struct extent_buffer *leaf = path->nodes[0]; - const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]); - const u32 item_size = btrfs_item_size(leaf, path->slots[0]); + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; + const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]); u32 cur_offset = 0; while (cur_offset < item_size) { @@ -1213,7 +1213,7 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, } inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); ret = unlink_inode_for_log_replay(wc, victim_parent, inode, &victim_name); @@ -1228,7 +1228,6 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, } static inline int __add_inode_ref(struct walk_control *wc, - struct btrfs_path *path, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, @@ -1246,7 +1245,7 @@ static inline int __add_inode_ref(struct walk_control *wc, search_key.objectid = inode_objectid; search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = parent_objectid; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; @@ -1258,53 +1257,54 @@ static inline int __add_inode_ref(struct walk_control *wc, if (search_key.objectid == search_key.offset) return 1; - ret = unlink_refs_not_in_log(wc, path, &search_key, dir, inode, + ret = unlink_refs_not_in_log(wc, &search_key, dir, inode, parent_objectid); if (ret == -EAGAIN) goto again; else if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid); + extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name, + inode_objectid, parent_objectid); if (IS_ERR(extref)) { return PTR_ERR(extref); } else if (extref) { - ret = unlink_extrefs_not_in_log(wc, path, &search_key, inode, + ret = unlink_extrefs_not_in_log(wc, &search_key, inode, inode_objectid, parent_objectid); if (ret == -EAGAIN) goto again; else if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* look for a conflicting sequence number */ - di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir), ref_index, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); btrfs_abort_transaction(trans, ret); return 
ret; } else if (di) { - ret = drop_one_dir_item(wc, path, dir, di); + ret = drop_one_dir_item(wc, dir, di); if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); + di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { - ret = drop_one_dir_item(wc, path, dir, di); + ret = drop_one_dir_item(wc, dir, di); if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -1357,9 +1357,7 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, * proper unlink of that name (that is, remove its entry from the inode * reference item and both dir index keys). */ -static int unlink_old_inode_refs(struct walk_control *wc, - struct btrfs_path *path, - struct btrfs_inode *inode) +static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -1369,8 +1367,8 @@ static int unlink_old_inode_refs(struct walk_control *wc, struct extent_buffer *eb; again: - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &wc->log_key, path, 0, 0); + btrfs_release_path(wc->subvol_path); + ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); if (ret > 0) { ret = 0; goto out; @@ -1380,9 +1378,9 @@ static int unlink_old_inode_refs(struct walk_control *wc, goto out; } - eb = path->nodes[0]; - ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); + eb = wc->subvol_path->nodes[0]; + ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]); + ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]); while (ref_ptr < ref_end) { struct fscrypt_str name; u64 parent_id; @@ -1413,7 +1411,7 @@ static int unlink_old_inode_refs(struct walk_control *wc, if (!ret) { struct btrfs_inode *dir; - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); dir = btrfs_iget_logging(parent_id, root); if (IS_ERR(dir)) { ret = PTR_ERR(dir); @@ -1438,7 +1436,7 @@ static int unlink_old_inode_refs(struct walk_control *wc, } ret = 0; out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } @@ -1446,7 +1444,7 @@ static int unlink_old_inode_refs(struct walk_control *wc, * Replay one inode back reference item found in the log tree. * Path is for temporary use by this function (it should be released on return). */ -static noinline int add_inode_ref(struct walk_control *wc, struct btrfs_path *path) +static noinline int add_inode_ref(struct walk_control *wc) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -1548,8 +1546,8 @@ static noinline int add_inode_ref(struct walk_control *wc, struct btrfs_path *pa } } - ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - ref_index, &name); + ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir), + btrfs_ino(inode), ref_index, &name); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -1561,7 +1559,7 @@ static noinline int add_inode_ref(struct walk_control *wc, struct btrfs_path *pa * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. 
*/ - ret = __add_inode_ref(wc, path, dir, inode, inode_objectid, + ret = __add_inode_ref(wc, dir, inode, inode_objectid, parent_objectid, ref_index, &name); if (ret) { if (ret == 1) @@ -1602,14 +1600,14 @@ static noinline int add_inode_ref(struct walk_control *wc, struct btrfs_path *pa * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(wc, path, inode); + ret = unlink_old_inode_refs(wc, inode); if (ret) goto out; /* finally write the back reference in the inode */ - ret = overwrite_item(wc, path); + ret = overwrite_item(wc); out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); kfree(name.name); if (dir) iput(&dir->vfs_inode); @@ -1728,7 +1726,6 @@ static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path) * will free the inode. */ static noinline int fixup_inode_link_count(struct walk_control *wc, - struct btrfs_path *path, struct btrfs_inode *inode) { struct btrfs_trans_handle *trans = wc->trans; @@ -1737,13 +1734,13 @@ static noinline int fixup_inode_link_count(struct walk_control *wc, u64 nlink = 0; const u64 ino = btrfs_ino(inode); - ret = count_inode_refs(inode, path); + ret = count_inode_refs(inode, wc->subvol_path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(inode, path); + ret = count_inode_extrefs(inode, wc->subvol_path); if (ret < 0) goto out; @@ -1762,7 +1759,7 @@ static noinline int fixup_inode_link_count(struct walk_control *wc, if (inode->vfs_inode.i_nlink == 0) { if (S_ISDIR(inode->vfs_inode.i_mode)) { - ret = replay_dir_deletes(wc, path, ino, true); + ret = replay_dir_deletes(wc, ino, true); if (ret) goto out; } @@ -1772,12 +1769,11 @@ static noinline int fixup_inode_link_count(struct walk_control *wc, } out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } -static noinline int fixup_inode_link_counts(struct walk_control *wc, - struct btrfs_path *path) +static noinline int fixup_inode_link_counts(struct walk_control *wc) { int ret; struct btrfs_key key; @@ -1790,34 +1786,34 @@ static noinline int fixup_inode_link_counts(struct walk_control *wc, struct btrfs_root *root = wc->root; struct btrfs_inode *inode; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1); if (ret < 0) break; if (ret == 1) { ret = 0; - if (path->slots[0] == 0) + if (wc->subvol_path->slots[0] == 0) break; - path->slots[0]--; + wc->subvol_path->slots[0]--; } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]); if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || key.type != BTRFS_ORPHAN_ITEM_KEY) break; - ret = btrfs_del_item(trans, root, path); + ret = btrfs_del_item(trans, root, wc->subvol_path); if (ret) break; - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); inode = btrfs_iget_logging(key.offset, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); break; } - ret = fixup_inode_link_count(wc, path, inode); + ret = fixup_inode_link_count(wc, inode); iput(&inode->vfs_inode); if (ret) break; @@ -1829,7 +1825,7 @@ static noinline int fixup_inode_link_counts(struct walk_control *wc, */ key.offset = (u64)-1; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } @@ -1839,9 +1835,7 @@ static noinline int fixup_inode_link_counts(struct walk_control *wc, * count when replay is done. 
The link count is incremented here * so the inode won't go away until we check it */ -static noinline int link_to_fixup_dir(struct walk_control *wc, - struct btrfs_path *path, - u64 objectid) +static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -1862,9 +1856,9 @@ static noinline int link_to_fixup_dir(struct walk_control *wc, key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (ret == 0) { if (!vfs_inode->i_nlink) set_nlink(vfs_inode, 1); @@ -1917,7 +1911,6 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, static int delete_conflicting_dir_entry(struct walk_control *wc, struct btrfs_inode *dir, - struct btrfs_path *path, struct btrfs_dir_item *dst_di, const struct btrfs_key *log_key, u8 log_flags, @@ -1925,12 +1918,12 @@ static int delete_conflicting_dir_entry(struct walk_control *wc, { struct btrfs_key found_key; - btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key); /* The existing dentry points to the same inode, don't delete it. */ if (found_key.objectid == log_key->objectid && found_key.type == log_key->type && found_key.offset == log_key->offset && - btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) + btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags) return 1; /* @@ -1940,7 +1933,7 @@ static int delete_conflicting_dir_entry(struct walk_control *wc, if (!exists) return 0; - return drop_one_dir_item(wc, path, dir, dst_di); + return drop_one_dir_item(wc, dir, dst_di); } /* @@ -1959,9 +1952,7 @@ static int delete_conflicting_dir_entry(struct walk_control *wc, * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a * non-existing inode) and 1 if the name was replayed. 
*/ -static noinline int replay_one_name(struct walk_control *wc, - struct btrfs_path *path, - struct btrfs_dir_item *di) +static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -1995,8 +1986,8 @@ static noinline int replay_one_name(struct walk_control *wc, log_flags = btrfs_dir_flags(wc->log_leaf, di); btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key); - ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); - btrfs_release_path(path); + ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0); + btrfs_release_path(wc->subvol_path); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -2004,14 +1995,14 @@ static noinline int replay_one_name(struct walk_control *wc, exists = (ret == 0); ret = 0; - dir_dst_di = btrfs_lookup_dir_item(trans, root, path, wc->log_key.objectid, - &name, 1); + dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, + wc->log_key.objectid, &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); btrfs_abort_transaction(trans, ret); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(wc, dir, path, dir_dst_di, + ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di, &log_key, log_flags, exists); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -2020,9 +2011,9 @@ static noinline int replay_one_name(struct walk_control *wc, dir_dst_matches = (ret == 1); } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); - index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, + index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, wc->log_key.objectid, wc->log_key.offset, &name, 1); if (IS_ERR(index_dst_di)) { @@ -2030,7 +2021,7 @@ static noinline int replay_one_name(struct walk_control *wc, btrfs_abort_transaction(trans, ret); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(wc, dir, path, index_dst_di, + ret = delete_conflicting_dir_entry(wc, dir, index_dst_di, &log_key, log_flags, exists); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -2039,7 +2030,7 @@ static noinline int replay_one_name(struct walk_control *wc, index_dst_matches = (ret == 1); } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (dir_dst_matches && index_dst_matches) { ret = 0; @@ -2104,8 +2095,7 @@ static noinline int replay_one_name(struct walk_control *wc, } /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. 
*/ -static noinline int replay_one_dir_item(struct walk_control *wc, - struct btrfs_path *path) +static noinline int replay_one_dir_item(struct walk_control *wc) { int ret; struct btrfs_dir_item *di; @@ -2114,7 +2104,7 @@ static noinline int replay_one_dir_item(struct walk_control *wc, ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY); di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item); - ret = replay_one_name(wc, path, di); + ret = replay_one_name(wc, di); if (ret < 0) return ret; @@ -2148,7 +2138,7 @@ static noinline int replay_one_dir_item(struct walk_control *wc, struct btrfs_key di_key; btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key); - ret = link_to_fixup_dir(wc, path, di_key.objectid); + ret = link_to_fixup_dir(wc, di_key.objectid); } return ret; @@ -2242,7 +2232,6 @@ static noinline int find_dir_range(struct btrfs_root *root, * to is unlinked */ static noinline int check_item_in_log(struct walk_control *wc, - struct btrfs_path *path, struct btrfs_path *log_path, struct btrfs_inode *dir, struct btrfs_key *dir_key, @@ -2266,8 +2255,8 @@ static noinline int check_item_in_log(struct walk_control *wc, */ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); - eb = path->nodes[0]; - slot = path->slots[0]; + eb = wc->subvol_path->nodes[0]; + slot = wc->subvol_path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) { @@ -2293,7 +2282,7 @@ static noinline int check_item_in_log(struct walk_control *wc, } btrfs_dir_item_key_to_cpu(eb, di, &location); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_release_path(log_path); inode = btrfs_iget_logging(location.objectid, root); if (IS_ERR(inode)) { @@ -2303,7 +2292,7 @@ static noinline int check_item_in_log(struct walk_control *wc, goto out; } - ret = link_to_fixup_dir(wc, path, location.objectid); + ret = link_to_fixup_dir(wc, location.objectid); if (ret) goto out; @@ -2315,7 +2304,7 @@ static noinline int check_item_in_log(struct walk_control *wc, * (an index number), so we're done. 
*/ out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_release_path(log_path); kfree(name.name); if (inode) @@ -2323,7 +2312,7 @@ static noinline int check_item_in_log(struct walk_control *wc, return ret; } -static int replay_xattr_deletes(struct walk_control *wc, struct btrfs_path *path) +static int replay_xattr_deletes(struct walk_control *wc) { struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; @@ -2331,7 +2320,6 @@ static int replay_xattr_deletes(struct walk_control *wc, struct btrfs_path *path struct btrfs_key search_key; struct btrfs_path *log_path; const u64 ino = wc->log_key.objectid; - int i; int nritems; int ret; @@ -2345,32 +2333,32 @@ static int replay_xattr_deletes(struct walk_control *wc, struct btrfs_path *path search_key.type = BTRFS_XATTR_ITEM_KEY; search_key.offset = 0; again: - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } process_leaf: - nritems = btrfs_header_nritems(path->nodes[0]); - for (i = path->slots[0]; i < nritems; i++) { + nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]); + for (int i = wc->subvol_path->slots[0]; i < nritems; i++) { struct btrfs_key key; struct btrfs_dir_item *di; struct btrfs_dir_item *log_di; u32 total_size; u32 cur; - btrfs_item_key_to_cpu(path->nodes[0], &key, i); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i); if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { ret = 0; goto out; } - di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); - total_size = btrfs_item_size(path->nodes[0], i); + di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item); + total_size = btrfs_item_size(wc->subvol_path->nodes[0], i); cur = 0; while (cur < total_size) { - u16 name_len = btrfs_dir_name_len(path->nodes[0], di); - u16 data_len = btrfs_dir_data_len(path->nodes[0], di); + u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di); + u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di); u32 this_len = sizeof(*di) + name_len + data_len; char *name; @@ -2380,7 +2368,7 @@ static int replay_xattr_deletes(struct walk_control *wc, struct btrfs_path *path btrfs_abort_transaction(trans, ret); goto out; } - read_extent_buffer(path->nodes[0], name, + read_extent_buffer(wc->subvol_path->nodes[0], name, (unsigned long)(di + 1), name_len); log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, @@ -2388,8 +2376,8 @@ static int replay_xattr_deletes(struct walk_control *wc, struct btrfs_path *path btrfs_release_path(log_path); if (!log_di) { /* Doesn't exist in log tree, so delete it. 
*/ - btrfs_release_path(path); - di = btrfs_lookup_xattr(trans, root, path, ino, + btrfs_release_path(wc->subvol_path); + di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino, name, name_len, -1); kfree(name); if (IS_ERR(di)) { @@ -2399,12 +2387,12 @@ static int replay_xattr_deletes(struct walk_control *wc, struct btrfs_path *path } ASSERT(di); ret = btrfs_delete_one_dir_name(trans, root, - path, di); + wc->subvol_path, di); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); search_key = key; goto again; } @@ -2418,7 +2406,7 @@ static int replay_xattr_deletes(struct walk_control *wc, struct btrfs_path *path di = (struct btrfs_dir_item *)((char *)di + this_len); } } - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(root, wc->subvol_path); if (ret > 0) ret = 0; else if (ret == 0) @@ -2427,7 +2415,7 @@ static int replay_xattr_deletes(struct walk_control *wc, struct btrfs_path *path btrfs_abort_transaction(trans, ret); out: btrfs_free_path(log_path); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } @@ -2443,7 +2431,6 @@ static int replay_xattr_deletes(struct walk_control *wc, struct btrfs_path *path * directory. */ static noinline int replay_dir_deletes(struct walk_control *wc, - struct btrfs_path *path, u64 dirid, bool del_all) { struct btrfs_trans_handle *trans = wc->trans; @@ -2486,7 +2473,7 @@ static noinline int replay_dir_deletes(struct walk_control *wc, if (del_all) range_end = (u64)-1; else { - ret = find_dir_range(log, path, dirid, + ret = find_dir_range(log, wc->subvol_path, dirid, &range_start, &range_end); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -2499,16 +2486,16 @@ static noinline int replay_dir_deletes(struct walk_control *wc, dir_key.offset = range_start; while (1) { int nritems; - ret = btrfs_search_slot(NULL, root, &dir_key, path, - 0, 0); + ret = btrfs_search_slot(NULL, root, &dir_key, + wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); + nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]); + if (wc->subvol_path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, wc->subvol_path); if (ret == 1) { break; } else if (ret < 0) { @@ -2516,8 +2503,8 @@ static noinline int replay_dir_deletes(struct walk_control *wc, goto out; } } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key, + wc->subvol_path->slots[0]); if (found_key.objectid != dirid || found_key.type != dir_key.type) { ret = 0; @@ -2527,22 +2514,21 @@ static noinline int replay_dir_deletes(struct walk_control *wc, if (found_key.offset > range_end) break; - ret = check_item_in_log(wc, path, log_path, dir, - &found_key, del_all); + ret = check_item_in_log(wc, log_path, dir, &found_key, del_all); if (ret) goto out; if (found_key.offset == (u64)-1) break; dir_key.offset = found_key.offset + 1; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (range_end == (u64)-1) break; range_start = range_end + 1; } ret = 0; out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_free_path(log_path); iput(&dir->vfs_inode); return ret; @@ -2567,7 +2553,6 @@ static int replay_one_buffer(struct extent_buffer *eb, .transid = gen, .level = level }; - struct btrfs_path *path; struct btrfs_root *root = wc->root; struct 
btrfs_trans_handle *trans = wc->trans; int ret;
@@ -2581,8 +2566,9 @@ static int replay_one_buffer(struct extent_buffer *eb, return ret; } - path = btrfs_alloc_path(); - if (!path) { + ASSERT(wc->subvol_path == NULL); + wc->subvol_path = btrfs_alloc_path(); + if (!wc->subvol_path) { btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; }
@@ -2630,16 +2616,16 @@ static int replay_one_buffer(struct extent_buffer *eb, wc->stage == LOG_WALK_REPLAY_INODES) { u32 mode; - ret = replay_xattr_deletes(wc, path); + ret = replay_xattr_deletes(wc); if (ret) break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { - ret = replay_dir_deletes(wc, path, wc->log_key.objectid, false); + ret = replay_dir_deletes(wc, wc->log_key.objectid, false); if (ret) break; } - ret = overwrite_item(wc, path); + ret = overwrite_item(wc); if (ret) break;
@@ -2667,7 +2653,7 @@ static int replay_one_buffer(struct extent_buffer *eb, drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - drop_args.path = path; + drop_args.path = wc->subvol_path; ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret);
@@ -2684,7 +2670,7 @@ static int replay_one_buffer(struct extent_buffer *eb, break; } - ret = link_to_fixup_dir(wc, path, wc->log_key.objectid); + ret = link_to_fixup_dir(wc, wc->log_key.objectid); if (ret) break; }
@@ -2694,7 +2680,7 @@ static int replay_one_buffer(struct extent_buffer *eb, if (wc->log_key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { - ret = replay_one_dir_item(wc, path); + ret = replay_one_dir_item(wc); if (ret) break; }
@@ -2704,16 +2690,16 @@ static int replay_one_buffer(struct extent_buffer *eb, /* these keys are simply copied */ if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) { - ret = overwrite_item(wc, path); + ret = overwrite_item(wc); if (ret) break; } else if (wc->log_key.type == BTRFS_INODE_REF_KEY || wc->log_key.type == BTRFS_INODE_EXTREF_KEY) { - ret = add_inode_ref(wc, path); + ret = add_inode_ref(wc); if (ret) break; } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) { - ret = replay_one_extent(wc, path); + ret = replay_one_extent(wc); if (ret) break; }
@@ -2724,7 +2710,8 @@ static int replay_one_buffer(struct extent_buffer *eb, * older kernel with such keys, ignore them. */ } - btrfs_free_path(path); + btrfs_free_path(wc->subvol_path); + wc->subvol_path = NULL; return ret; }
@@ -7515,7 +7502,9 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) if (wc.stage == LOG_WALK_REPLAY_ALL) { struct btrfs_root *root = wc.root; - ret = fixup_inode_link_counts(&wc, path); + wc.subvol_path = path; + ret = fixup_inode_link_counts(&wc); + wc.subvol_path = NULL; if (ret) { btrfs_abort_transaction(trans, ret); goto next;

From bd9c063e6ff2657ae89de9838fd792c1157ec96c Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 2 Sep 2025 18:08:15 +0100
Subject: [PATCH 2566/3206] btrfs: stop passing inode object IDs to __add_inode_ref() in log replay

There's no point in passing the inode and parent inode object IDs to __add_inode_ref() and its helpers because we can get them by calling btrfs_ino() against the inode and the directory inode, and we pass both inodes to __add_inode_ref() and its helpers. So remove the object ID parameters to reduce the number of arguments passed and to make things less confusing.
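[ Illustration, not part of the patch: a minimal stand-alone C sketch of
  the shape of this cleanup, using hypothetical stub names; in the kernel,
  btrfs_ino() plays the role of stub_ino() below. ]

struct stub_inode {
        unsigned long long ino;
};

static unsigned long long stub_ino(const struct stub_inode *inode)
{
        return inode->ino;
}

/*
 * Before the cleanup the caller also passed inode_objectid and
 * parent_objectid; now the IDs are derived from the inodes on demand.
 */
static int add_ref(const struct stub_inode *dir, const struct stub_inode *inode)
{
        const unsigned long long inode_objectid = stub_ino(inode);
        const unsigned long long parent_objectid = stub_ino(dir);

        /* ... same logic as before, using the derived IDs ... */
        return (inode_objectid == parent_objectid) ? 1 : 0;
}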
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 747daaaaf332a0..6186300923b7c4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1101,8 +1101,7 @@ static noinline int backref_in_log(struct btrfs_root *log, static int unlink_refs_not_in_log(struct walk_control *wc, struct btrfs_key *search_key, struct btrfs_inode *dir, - struct btrfs_inode *inode, - u64 parent_objectid) + struct btrfs_inode *inode) { struct extent_buffer *leaf = wc->subvol_path->nodes[0]; unsigned long ptr; @@ -1130,7 +1129,7 @@ static int unlink_refs_not_in_log(struct walk_control *wc, return ret; } - ret = backref_in_log(wc->log, search_key, parent_objectid, &victim_name); + ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name); if (ret) { kfree(victim_name.name); if (ret < 0) { @@ -1156,9 +1155,8 @@ static int unlink_refs_not_in_log(struct walk_control *wc, static int unlink_extrefs_not_in_log(struct walk_control *wc, struct btrfs_key *search_key, - struct btrfs_inode *inode, - u64 inode_objectid, - u64 parent_objectid) + struct btrfs_inode *dir, + struct btrfs_inode *inode) { struct extent_buffer *leaf = wc->subvol_path->nodes[0]; const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); @@ -1177,7 +1175,7 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, extref = (struct btrfs_inode_extref *)(base + cur_offset); victim_name.len = btrfs_inode_extref_name_len(leaf, extref); - if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir)) goto next; ret = read_alloc_one_name(leaf, &extref->name, victim_name.len, @@ -1187,12 +1185,12 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, return ret; } - search_key->objectid = inode_objectid; + search_key->objectid = btrfs_ino(inode); search_key->type = BTRFS_INODE_EXTREF_KEY; - search_key->offset = btrfs_extref_hash(parent_objectid, + search_key->offset = btrfs_extref_hash(btrfs_ino(dir), victim_name.name, victim_name.len); - ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name); if (ret) { kfree(victim_name.name); if (ret < 0) { @@ -1204,7 +1202,7 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, continue; } - victim_parent = btrfs_iget_logging(parent_objectid, root); + victim_parent = btrfs_iget_logging(btrfs_ino(dir), root); if (IS_ERR(victim_parent)) { kfree(victim_name.name); ret = PTR_ERR(victim_parent); @@ -1230,7 +1228,6 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, static inline int __add_inode_ref(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_inode *inode, - u64 inode_objectid, u64 parent_objectid, u64 ref_index, struct fscrypt_str *name) { int ret; @@ -1242,9 +1239,9 @@ static inline int __add_inode_ref(struct walk_control *wc, again: /* Search old style refs */ - search_key.objectid = inode_objectid; + search_key.objectid = btrfs_ino(inode); search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = parent_objectid; + search_key.offset = btrfs_ino(dir); ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -1257,8 +1254,7 @@ static inline int __add_inode_ref(struct walk_control *wc, if 
(search_key.objectid == search_key.offset) return 1; - ret = unlink_refs_not_in_log(wc, &search_key, dir, inode, - parent_objectid); + ret = unlink_refs_not_in_log(wc, &search_key, dir, inode); if (ret == -EAGAIN) goto again; else if (ret)
@@ -1268,12 +1264,11 @@ static inline int __add_inode_ref(struct walk_control *wc, /* Same search but for extended refs */ extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name, - inode_objectid, parent_objectid); + btrfs_ino(inode), btrfs_ino(dir)); if (IS_ERR(extref)) { return PTR_ERR(extref); } else if (extref) { - ret = unlink_extrefs_not_in_log(wc, &search_key, inode, - inode_objectid, parent_objectid); + ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode); if (ret == -EAGAIN) goto again; else if (ret)
@@ -1559,8 +1554,7 @@ static noinline int add_inode_ref(struct walk_control *wc) * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(wc, dir, inode, inode_objectid, - parent_objectid, ref_index, &name); + ret = __add_inode_ref(wc, dir, inode, ref_index, &name); if (ret) { if (ret == 1) ret = 0;

From e41c5e611a65799747709b8fed461dd07647c40d Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 3 Sep 2025 12:10:01 +0100
Subject: [PATCH 2567/3206] btrfs: remove pointless inode lookup when processing extrefs during log replay

At unlink_extrefs_not_in_log() we do an inode lookup of the directory but we already have the directory inode accessible as a function argument, so the lookup is redundant. Remove it and use the directory inode passed in as an argument.

Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
fs/btrfs/tree-log.c | 14 +-------------
1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6186300923b7c4..86c595ef57f474 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1165,10 +1165,8 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, u32 cur_offset = 0; while (cur_offset < item_size) { struct btrfs_trans_handle *trans = wc->trans; - struct btrfs_root *root = wc->root; struct btrfs_root *log_root = wc->log; struct btrfs_inode_extref *extref; - struct btrfs_inode *victim_parent; struct fscrypt_str victim_name; int ret;
@@ -1202,20 +1200,10 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, continue; } - victim_parent = btrfs_iget_logging(btrfs_ino(dir), root); - if (IS_ERR(victim_parent)) { - kfree(victim_name.name); - ret = PTR_ERR(victim_parent); - btrfs_abort_transaction(trans, ret); - return ret; - } - inc_nlink(&inode->vfs_inode); btrfs_release_path(wc->subvol_path); - ret = unlink_inode_for_log_replay(wc, victim_parent, inode, - &victim_name); - iput(&victim_parent->vfs_inode); + ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name); kfree(victim_name.name); if (ret) return ret;

From 0b7453b7a1c1f8aa1570da2d0cc81bf7691eb5f1 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 3 Sep 2025 17:39:24 +0100
Subject: [PATCH 2568/3206] btrfs: abort transaction if we fail to find dir item during log replay

At __add_inode_ref(), if we get an error when trying to look up a dir item, we don't abort the transaction but just propagate the error up the call chain, so that the transaction is aborted somewhere else further up the call chain.
This, however, makes it hard to know that the failure comes from looking up a dir item, so add a transaction abort in case we fail there, so that we immediately pinpoint where the problem comes from during log replay.

Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
fs/btrfs/tree-log.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 86c595ef57f474..7b91248b38dc94 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1281,7 +1281,9 @@ static inline int __add_inode_ref(struct walk_control *wc, /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { - return PTR_ERR(di); + ret = PTR_ERR(di); + btrfs_abort_transaction(trans, ret); + return ret; } else if (di) { ret = drop_one_dir_item(wc, dir, di); if (ret)

From 5a0565cad3ef7cbf4cf43d1dd1e849b156205292 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 3 Sep 2025 17:43:04 +0100
Subject: [PATCH 2569/3206] btrfs: abort transaction if we fail to update inode in log replay dir fixup

If we fail to update the inode at link_to_fixup_dir(), we don't abort the transaction and propagate the error up the call chain, which makes it hard to pinpoint the error to the inode update. So abort the transaction if the inode update call fails, so that if it happens we know immediately.

Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
fs/btrfs/tree-log.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7b91248b38dc94..83b79023baaed5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1849,6 +1849,8 @@ static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid) else inc_nlink(vfs_inode); ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_transaction(trans, ret); } else if (ret == -EEXIST) { ret = 0; } else {

From 2753e49176240f21e4bb10e03514f99e732704bb Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 5 Sep 2025 12:58:19 +0100
Subject: [PATCH 2570/3206] btrfs: dump detailed info and specific messages on log replay failures

Currently debugging log replay failures can be harder than needed, since all we do now is abort a transaction, which gives us a line number, a stack trace and an error code. But that is most of the time not enough to give some clue about what went wrong. So add a new helper to abort log replay and provide contextual information:

1) Dump the current leaf of the log tree being processed and print the slot we are currently at and the key at that slot;

2) Dump the current subvolume tree leaf if we have any;

3) Print the current stage of log replay;

4) Print the id of the subvolume root associated with the log tree we are currently processing (as we can have multiple);

5) Print some error message to mention what we were trying to do when we got an error.

Replace all transaction abort calls (btrfs_abort_transaction()) with the new helper btrfs_abort_log_replay(), which, besides dumping all that extra information, also aborts the current transaction.
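[ Illustration, not part of the patch: a stand-alone user-space sketch of
  the report-once pattern the new helper is built around. Names here are
  made up; the kernel version uses test_and_set_bit() on fs_state and
  btrfs_crit(), and additionally dumps the log and subvolume leaves. ]

#include <stdarg.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag replay_aborted = ATOMIC_FLAG_INIT;

static void abort_log_replay(const char *function, unsigned int line,
                             int error, const char *fmt, ...)
{
        va_list args;

        /*
         * Report only the first failure: later errors are usually fallout
         * from the first one, and repeating the verbose dumps adds noise.
         */
        if (atomic_flag_test_and_set(&replay_aborted))
                return;

        va_start(args, fmt);
        fprintf(stderr, "log replay failed in %s:%u with error %d: ",
                function, line, error);
        vfprintf(stderr, fmt, args);
        fputc('\n', stderr);
        va_end(args);
}

#define ABORT_LOG_REPLAY(error, fmt, ...) \
        abort_log_replay(__func__, __LINE__, (error), fmt, ##__VA_ARGS__)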
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 2 + fs/btrfs/messages.c | 1 + fs/btrfs/tree-log.c | 432 +++++++++++++++++++++++++++++++++++--------- 3 files changed, 349 insertions(+), 86 deletions(-) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index bf4a1b75b0cf48..ba63a40b2bde1f 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -104,6 +104,8 @@ enum { BTRFS_FS_STATE_RO, /* Track if a transaction abort has been reported on this filesystem */ BTRFS_FS_STATE_TRANS_ABORTED, + /* Track if log replay has failed. */ + BTRFS_FS_STATE_LOG_REPLAY_ABORTED, /* * Bio operations should be blocked on this filesystem because a source * or target device is being destroyed as part of a device replace diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 363fd28c026880..a0cf8effe008e1 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -18,6 +18,7 @@ static const char fs_state_chars[] = { [BTRFS_FS_STATE_REMOUNTING] = 'M', [BTRFS_FS_STATE_RO] = 0, [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_LOG_REPLAY_ABORTED] = 'O', [BTRFS_FS_STATE_DEV_REPLACING] = 'R', [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C', diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83b79023baaed5..144b1272536506 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "file-item.h" #include "file.h" #include "orphan.h" +#include "print-tree.h" #include "tree-checker.h" #define MAX_CONFLICT_INODES 10 @@ -167,6 +168,62 @@ struct walk_control { struct btrfs_path *subvol_path; }; +static void do_abort_log_replay(struct walk_control *wc, const char *function, + unsigned int line, int error, const char *fmt, ...) +{ + struct btrfs_fs_info *fs_info = wc->trans->fs_info; + struct va_format vaf; + va_list args; + + /* + * Do nothing if we already aborted, to avoid dumping leaves again which + * can be verbose. Further more, only the first call is useful since it + * is where we have a problem. Note that we do not use the flag + * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that + * are outside of tree-log.c that can abort transactions (such as + * btrfs_add_link() for example), so if that happens we still want to + * dump all log replay specific information below. + */ + if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state)) + return; + + btrfs_abort_transaction(wc->trans, error); + + if (wc->subvol_path->nodes[0]) { + btrfs_crit(fs_info, + "subvolume (root %llu) leaf currently being processed:", + btrfs_root_id(wc->root)); + btrfs_print_leaf(wc->subvol_path->nodes[0]); + } + + if (wc->log_leaf) { + btrfs_crit(fs_info, + "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):", + btrfs_root_id(wc->root), wc->log_slot, + wc->log_key.objectid, wc->log_key.type, wc->log_key.offset); + btrfs_print_leaf(wc->log_leaf); + } + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV", + function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf); + + va_end(args); +} + +/* + * Use this for aborting a transaction during log replay while we are down the + * call chain of replay_one_buffer(), so that we get a lot more useful + * information for debugging issues when compared to a plain call to + * btrfs_abort_transaction(). + */ +#define btrfs_abort_log_replay(wc, error, fmt, args...) 
\ + do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args) + static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, int inode_only, @@ -452,7 +509,10 @@ static int overwrite_item(struct walk_control *wc) /* Look for the key in the destination tree. */ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); return ret; } @@ -472,7 +532,8 @@ static int overwrite_item(struct walk_control *wc) } src_copy = kmalloc(item_size, GFP_NOFS); if (!src_copy) { - btrfs_abort_transaction(trans, -ENOMEM); + btrfs_abort_log_replay(wc, -ENOMEM, + "failed to allocate memory for log leaf item"); return -ENOMEM; } @@ -556,7 +617,10 @@ static int overwrite_item(struct walk_control *wc) else if (found_size < item_size) btrfs_extend_item(trans, wc->subvol_path, item_size - found_size); } else if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to insert item for key (%llu %u %llu)", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset); return ret; } dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); @@ -691,18 +755,19 @@ static noinline int replay_one_extent(struct walk_control *wc) extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - btrfs_abort_transaction(trans, -EUCLEAN); - btrfs_err(fs_info, - "unexpected extent type=%d root=%llu inode=%llu offset=%llu", - found_type, btrfs_root_id(root), wc->log_key.objectid, - wc->log_key.offset); + btrfs_abort_log_replay(wc, -EUCLEAN, + "unexpected extent type=%d root=%llu inode=%llu offset=%llu", + found_type, btrfs_root_id(root), + wc->log_key.objectid, wc->log_key.offset); return -EUCLEAN; } inode = btrfs_iget_logging(wc->log_key.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to get inode %llu for root %llu", + wc->log_key.objectid, btrfs_root_id(root)); return ret; } @@ -743,7 +808,10 @@ static noinline int replay_one_extent(struct walk_control *wc) drop_args.path = wc->subvol_path; ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to drop extents for inode %llu range [%llu, %llu) root %llu", + wc->log_key.objectid, start, extent_end, + btrfs_root_id(root)); goto out; } @@ -768,7 +836,10 @@ static noinline int replay_one_extent(struct walk_control *wc) ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, sizeof(*item)); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to insert item with key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); goto out; } dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0], @@ -801,7 +872,10 @@ static noinline int replay_one_extent(struct walk_control *wc) */ ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, btrfs_root_id(root)); goto out; } @@ -811,7 +885,10 @@ static noinline int 
replay_one_extent(struct walk_control *wc) */ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, btrfs_root_id(root)); goto out; } else if (ret == 0) { struct btrfs_ref ref = { @@ -825,7 +902,11 @@ static noinline int replay_one_extent(struct walk_control *wc) btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, + btrfs_root_id(root)); goto out; } } else { @@ -833,7 +914,10 @@ static noinline int replay_one_extent(struct walk_control *wc) ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root), wc->log_key.objectid, offset, &ins); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu", + ins.objectid, ins.offset, offset, + wc->log_key.objectid, btrfs_root_id(root)); goto out; } } @@ -851,7 +935,10 @@ static noinline int replay_one_extent(struct walk_control *wc) ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, &ordered_sums, false); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookups csums for range [%llu, %llu) inode %llu root %llu", + csum_start, csum_end, wc->log_key.objectid, + btrfs_root_id(root)); goto out; } ret = 0; @@ -909,12 +996,22 @@ static noinline int replay_one_extent(struct walk_control *wc) ret = btrfs_del_csums(trans, csum_root, sums->logical, sums->len); if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to delete csums for range [%llu, %llu) inode %llu root %llu", + sums->logical, + sums->logical + sums->len, + wc->log_key.objectid, + btrfs_root_id(root)); } if (!ret) { ret = btrfs_csum_file_blocks(trans, csum_root, sums); if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to add csums for range [%llu, %llu) inode %llu root %llu", + sums->logical, + sums->logical + sums->len, + wc->log_key.objectid, + btrfs_root_id(root)); } list_del(&sums->list); kfree(sums); @@ -925,14 +1022,19 @@ static noinline int replay_one_extent(struct walk_control *wc) update_inode: ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to set file extent range [%llu, %llu) inode %llu root %llu", + start, extent_end, wc->log_key.objectid, + btrfs_root_id(root)); goto out; } btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + wc->log_key.objectid, btrfs_root_id(root)); out: iput(&inode->vfs_inode); return ret; @@ -948,7 +1050,10 @@ static int unlink_inode_for_log_replay(struct walk_control *wc, ret = btrfs_unlink_inode(trans, dir, inode, name); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to unlink inode %llu parent dir %llu name %.*s 
root %llu", + btrfs_ino(inode), btrfs_ino(dir), name->len, + name->name, btrfs_root_id(inode->root)); return ret; } /* @@ -959,7 +1064,11 @@ static int unlink_inode_for_log_replay(struct walk_control *wc, */ ret = btrfs_run_delayed_items(trans); if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), name->len, + name->name, btrfs_root_id(inode->root)); + return ret; } @@ -975,7 +1084,6 @@ static noinline int drop_one_dir_item(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_dir_item *di) { - struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = dir->root; struct btrfs_inode *inode; struct fscrypt_str name; @@ -986,7 +1094,9 @@ static noinline int drop_one_dir_item(struct walk_control *wc, btrfs_dir_item_key_to_cpu(leaf, di, &location); ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); return ret; } @@ -995,7 +1105,10 @@ static noinline int drop_one_dir_item(struct walk_control *wc, inode = btrfs_iget_logging(location.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to open inode %llu parent dir %llu name %.*s root %llu", + location.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); inode = NULL; goto out; } @@ -1115,7 +1228,6 @@ static int unlink_refs_not_in_log(struct walk_control *wc, ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]); while (ptr < ptr_end) { - struct btrfs_trans_handle *trans = wc->trans; struct fscrypt_str victim_name; struct btrfs_inode_ref *victim_ref; int ret; @@ -1125,17 +1237,25 @@ static int unlink_refs_not_in_log(struct walk_control *wc, btrfs_inode_ref_name_len(leaf, victim_ref), &victim_name); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for inode %llu parent dir %llu root %llu", + btrfs_ino(inode), btrfs_ino(dir), + btrfs_root_id(inode->root)); return ret; } ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name); if (ret) { - kfree(victim_name.name); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + victim_name.len, victim_name.name, + btrfs_root_id(inode->root)); + kfree(victim_name.name); return ret; } + kfree(victim_name.name); ptr = (unsigned long)(victim_ref + 1) + victim_name.len; continue; } @@ -1164,7 +1284,6 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, u32 cur_offset = 0; while (cur_offset < item_size) { - struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *log_root = wc->log; struct btrfs_inode_extref *extref; struct fscrypt_str victim_name; @@ -1179,7 +1298,10 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, ret = read_alloc_one_name(leaf, &extref->name, victim_name.len, &victim_name); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for inode %llu parent dir %llu root %llu", + btrfs_ino(inode), btrfs_ino(dir), + 
btrfs_root_id(inode->root)); return ret; } @@ -1190,11 +1312,16 @@ static int unlink_extrefs_not_in_log(struct walk_control *wc, victim_name.len); ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name); if (ret) { - kfree(victim_name.name); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + victim_name.len, victim_name.name, + btrfs_root_id(inode->root)); + kfree(victim_name.name); return ret; } + kfree(victim_name.name); next: cur_offset += victim_name.len + sizeof(*extref); continue; @@ -1232,7 +1359,10 @@ static inline int __add_inode_ref(struct walk_control *wc, search_key.offset = btrfs_ino(dir); ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + search_key.objectid, search_key.type, + search_key.offset, btrfs_root_id(root)); return ret; } else if (ret == 0) { /* @@ -1269,7 +1399,10 @@ static inline int __add_inode_ref(struct walk_control *wc, ref_index, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(dir), ref_index, name->len, + name->name, btrfs_root_id(root)); return ret; } else if (di) { ret = drop_one_dir_item(wc, dir, di); @@ -1282,7 +1415,10 @@ static inline int __add_inode_ref(struct walk_control *wc, di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir item for dir %llu name %.*s root %llu", + btrfs_ino(dir), name->len, name->name, + btrfs_root_id(root)); return ret; } else if (di) { ret = drop_one_dir_item(wc, dir, di); @@ -1344,7 +1480,6 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, */ static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode) { - struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; int ret; unsigned long ref_ptr; @@ -1359,7 +1494,10 @@ static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *in goto out; } if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); goto out; } @@ -1374,14 +1512,20 @@ static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *in ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to get extref details for inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); goto out; } } else { parent_id = wc->log_key.offset; ret = ref_get_fields(eb, ref_ptr, &name, NULL); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to get ref details for inode %llu parent_id %llu root %llu", + btrfs_ino(inode), parent_id, + btrfs_root_id(root)); goto out; } } @@ -1401,7 +1545,9 @@ static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *in if (IS_ERR(dir)) { ret = 
PTR_ERR(dir); kfree(name.name); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_id, btrfs_root_id(root)); goto out; } ret = unlink_inode_for_log_replay(wc, dir, inode, &name); @@ -1472,7 +1618,9 @@ static noinline int add_inode_ref(struct walk_control *wc) if (ret == -ENOENT) ret = 0; else - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_objectid, btrfs_root_id(root)); dir = NULL; goto out; } @@ -1480,7 +1628,9 @@ static noinline int add_inode_ref(struct walk_control *wc) inode = btrfs_iget_logging(inode_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + inode_objectid, btrfs_root_id(root)); inode = NULL; goto out; } @@ -1490,7 +1640,10 @@ static noinline int add_inode_ref(struct walk_control *wc) ret = extref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index, &parent_objectid); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to get extref details for inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); goto out; } /* @@ -1518,7 +1671,10 @@ static noinline int add_inode_ref(struct walk_control *wc) ret = 0; goto next; } else { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_objectid, + btrfs_root_id(root)); } goto out; } @@ -1526,7 +1682,11 @@ static noinline int add_inode_ref(struct walk_control *wc) } else { ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to get ref details for inode %llu parent_objectid %llu root %llu", + btrfs_ino(inode), + parent_objectid, + btrfs_root_id(root)); goto out; } } @@ -1534,7 +1694,11 @@ static noinline int add_inode_ref(struct walk_control *wc) ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir), btrfs_ino(inode), ref_index, &name); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + ref_index, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (ret == 0) { /* @@ -1554,13 +1718,21 @@ static noinline int add_inode_ref(struct walk_control *wc) /* insert our name */ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(inode), + btrfs_ino(dir), ref_index, + name.len, name.name, + btrfs_root_id(root)); goto out; } ret = btrfs_update_inode(trans, inode); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); goto out; } } @@ -1831,7 +2003,9 @@ static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid) inode = btrfs_iget_logging(objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + objectid, btrfs_root_id(root)); return ret; } @@ -1850,11 +2024,15 @@ static noinline int link_to_fixup_dir(struct walk_control *wc, u64 
objectid) inc_nlink(vfs_inode); ret = btrfs_update_inode(trans, inode); if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + objectid, btrfs_root_id(root)); } else if (ret == -EEXIST) { ret = 0; } else { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to insert fixup item for inode %llu root %llu", + objectid, btrfs_root_id(root)); } iput(vfs_inode); @@ -1959,14 +2137,18 @@ static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_it dir = btrfs_iget_logging(wc->log_key.objectid, root); if (IS_ERR(dir)) { ret = PTR_ERR(dir); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + wc->log_key.objectid, btrfs_root_id(root)); return ret; } ret = read_alloc_one_name(wc->log_leaf, di + 1, btrfs_dir_name_len(wc->log_leaf, di), &name); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); goto out; } @@ -1975,7 +2157,9 @@ static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_it ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0); btrfs_release_path(wc->subvol_path); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + log_key.objectid, btrfs_root_id(root)); goto out; } exists = (ret == 0); @@ -1985,13 +2169,19 @@ static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_it wc->log_key.objectid, &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir item for dir %llu name %.*s root %llu", + wc->log_key.objectid, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (dir_dst_di) { ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di, &log_key, log_flags, exists); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to delete conflicting entry for dir %llu name %.*s root %llu", + btrfs_ino(dir), name.len, name.name, + btrfs_root_id(root)); goto out; } dir_dst_matches = (ret == 1); @@ -2004,13 +2194,19 @@ static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_it wc->log_key.offset, &name, 1); if (IS_ERR(index_dst_di)) { ret = PTR_ERR(index_dst_di); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir index item for dir %llu name %.*s root %llu", + wc->log_key.objectid, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (index_dst_di) { ret = delete_conflicting_dir_entry(wc, dir, index_dst_di, &log_key, log_flags, exists); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to delete conflicting entry for dir %llu name %.*s root %llu", + btrfs_ino(dir), name.len, name.name, + btrfs_root_id(root)); goto out; } index_dst_matches = (ret == 1); @@ -2033,7 +2229,10 @@ static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_it search_key.offset = wc->log_key.objectid; ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu", + search_key.objectid, btrfs_ino(dir), + name.len, 
name.name, btrfs_root_id(root)); goto out; } else if (ret) { /* The dentry will be added later. */ @@ -2047,7 +2246,10 @@ static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_it search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len); ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, +"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu", + search_key.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); goto out; } else if (ret) { /* The dentry will be added later. */ @@ -2058,7 +2260,10 @@ static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_it ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset, &name, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to insert name %.*s for inode %llu dir %llu root %llu", + name.len, name.name, log_key.objectid, + btrfs_ino(dir), btrfs_root_id(root)); goto out; } if (!ret) @@ -2071,7 +2276,9 @@ static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_it btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); ret = btrfs_update_inode(trans, dir); if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to update dir inode %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); } kfree(name.name); iput(&dir->vfs_inode); @@ -2246,7 +2453,10 @@ static noinline int check_item_in_log(struct walk_control *wc, di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu index %llu root %llu", + btrfs_ino(dir), dir_key->offset, + btrfs_root_id(root)); goto out; } @@ -2258,7 +2468,11 @@ static noinline int check_item_in_log(struct walk_control *wc, dir_key->offset, &name, 0); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu", + btrfs_ino(dir), dir_key->offset, + name.len, name.name, + btrfs_root_id(root)); goto out; } else if (log_di) { /* The dentry exists in the log, we have nothing to do. 
*/ @@ -2274,7 +2488,9 @@ static noinline int check_item_in_log(struct walk_control *wc, if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + location.objectid, btrfs_root_id(root)); goto out; } @@ -2311,7 +2527,7 @@ static int replay_xattr_deletes(struct walk_control *wc) log_path = btrfs_alloc_path(); if (!log_path) { - btrfs_abort_transaction(trans, -ENOMEM); + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; } @@ -2321,7 +2537,9 @@ static int replay_xattr_deletes(struct walk_control *wc) again: ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to search xattrs for inode %llu root %llu", + ino, btrfs_root_id(root)); goto out; } process_leaf: @@ -2351,7 +2569,9 @@ static int replay_xattr_deletes(struct walk_control *wc) name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to allocate memory for name of length %u", + name_len); goto out; } read_extent_buffer(wc->subvol_path->nodes[0], name, @@ -2365,29 +2585,41 @@ static int replay_xattr_deletes(struct walk_control *wc) btrfs_release_path(wc->subvol_path); di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino, name, name_len, -1); - kfree(name); if (IS_ERR(di)) { ret = PTR_ERR(di); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup xattr with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; } ASSERT(di); ret = btrfs_delete_one_dir_name(trans, root, wc->subvol_path, di); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to delete xattr with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; } btrfs_release_path(wc->subvol_path); + kfree(name); search_key = key; goto again; } - kfree(name); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; } + kfree(name); cur += this_len; di = (struct btrfs_dir_item *)((char *)di + this_len); } @@ -2398,7 +2630,9 @@ static int replay_xattr_deletes(struct walk_control *wc) else if (ret == 0) goto process_leaf; else - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to get next leaf in subvolume root %llu", + btrfs_root_id(root)); out: btrfs_free_path(log_path); btrfs_release_path(wc->subvol_path); @@ -2419,7 +2653,6 @@ static int replay_xattr_deletes(struct walk_control *wc) static noinline int replay_dir_deletes(struct walk_control *wc, u64 dirid, bool del_all) { - struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = wc->root; struct btrfs_root *log = (del_all ? 
NULL : wc->log); u64 range_start; @@ -2434,7 +2667,7 @@ static noinline int replay_dir_deletes(struct walk_control *wc, dir_key.type = BTRFS_DIR_INDEX_KEY; log_path = btrfs_alloc_path(); if (!log_path) { - btrfs_abort_transaction(trans, -ENOMEM); + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; } @@ -2449,7 +2682,9 @@ static noinline int replay_dir_deletes(struct walk_control *wc, if (ret == -ENOENT) ret = 0; else - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + dirid, btrfs_root_id(root)); return ret; } @@ -2462,7 +2697,9 @@ static noinline int replay_dir_deletes(struct walk_control *wc, ret = find_dir_range(log, wc->subvol_path, dirid, &range_start, &range_end); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to find range for dir %llu in log tree root %llu", + dirid, btrfs_root_id(root)); goto out; } else if (ret > 0) { break; @@ -2475,7 +2712,11 @@ static noinline int replay_dir_deletes(struct walk_control *wc, ret = btrfs_search_slot(NULL, root, &dir_key, wc->subvol_path, 0, 0); if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to search root %llu for key (%llu %u %llu)", + btrfs_root_id(root), + dir_key.objectid, dir_key.type, + dir_key.offset); goto out; } @@ -2485,7 +2726,9 @@ static noinline int replay_dir_deletes(struct walk_control *wc, if (ret == 1) { break; } else if (ret < 0) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to get next leaf in subvolume root %llu", + btrfs_root_id(root)); goto out; } } @@ -2546,16 +2789,23 @@ static int replay_one_buffer(struct extent_buffer *eb, if (level != 0) return 0; + /* + * Set to NULL since it was not yet read and in case we abort log replay + * on error, we have no valid log tree leaf to dump. + */ + wc->log_leaf = NULL; ret = btrfs_read_extent_buffer(eb, &check); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to read log tree leaf %llu for root %llu", + eb->start, btrfs_root_id(root)); return ret; } ASSERT(wc->subvol_path == NULL); wc->subvol_path = btrfs_alloc_path(); if (!wc->subvol_path) { - btrfs_abort_transaction(trans, -ENOMEM); + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; } @@ -2631,7 +2881,10 @@ static int replay_one_buffer(struct extent_buffer *eb, inode = btrfs_iget_logging(wc->log_key.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + wc->log_key.objectid, + btrfs_root_id(root)); break; } from = ALIGN(i_size_read(&inode->vfs_inode), @@ -2642,14 +2895,21 @@ static int replay_one_buffer(struct extent_buffer *eb, drop_args.path = wc->subvol_path; ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to drop extents for inode %llu root %llu offset %llu", + btrfs_ino(inode), + btrfs_root_id(root), + from); } else { inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. 
*/ ret = btrfs_update_inode(trans, inode); if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); } iput(&inode->vfs_inode); if (ret)

From 0dc93e465289d3eee30abd4710bac8d9f475311f Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 8 Sep 2025 12:19:29 +0100
Subject: [PATCH 2571/3206] btrfs: send: index backref cache by node number instead of by sector number

We now have a nodesize_bits member in fs_info so we can index an extent buffer in the backref cache by node number instead of by sector number. While this allows for a denser index space with the possibility of using fewer maple tree nodes, in practice we are unlikely to get those benefits since we currently limit the maximum number of keys in the cache to 128, so unless all extent buffers are contiguous we are unlikely to see a memory usage reduction in the backing maple tree due to fewer nodes. Nevertheless it doesn't cost anything to index by node number and it's more logical.

Reviewed-by: Qu Wenruo
Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
fs/btrfs/send.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c5771df3a2c7c0..32653fc44a7583 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1388,7 +1388,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct backref_ctx *bctx = ctx; struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; - const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + const u64 key = leaf_bytenr >> fs_info->nodesize_bits; struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry;
@@ -1443,7 +1443,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, if (!new_entry) return; - new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.key = leaf_bytenr >> fs_info->nodesize_bits; new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter);

From aab9458b9f0019e97fae394c2d6d9d1a03addfb3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 16 Sep 2025 08:34:05 +0930
Subject: [PATCH 2572/3206] btrfs: tree-checker: add inode extref checks

Like inode refs, inode extrefs have a variable-length name, which means we have to do a proper check to make sure that neither the header nor the name exceeds the item limits. The check itself is very similar to check_inode_ref(), just a different structure (btrfs_inode_extref vs btrfs_inode_ref).
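[ Illustration, not part of the patch: a stand-alone C sketch of the
  bounds-checking walk being added. The record layout below is a
  simplification, not the on-disk btrfs_inode_extref definition. ]

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Simplified record: a fixed header followed by name_len bytes of name. */
struct extref_rec {
        uint64_t parent;
        uint64_t index;
        uint16_t name_len;
};

static int check_extrefs(const uint8_t *item, size_t item_size)
{
        size_t ptr = 0;

        while (ptr < item_size) {
                struct extref_rec rec;

                /* The fixed header must fit before name_len can be read. */
                if (ptr + sizeof(rec) > item_size)
                        return -1;
                memcpy(&rec, item + ptr, sizeof(rec));

                /* The variable-length name must fit as well. */
                if (ptr + sizeof(rec) + rec.name_len > item_size)
                        return -1;

                ptr += sizeof(rec) + rec.name_len;
        }
        return 0;
}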
Reviewed-by: Filipe Manana
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
fs/btrfs/tree-checker.c | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index c2aac08055fb84..ca30b15ea45234 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, /* Only these key->types needs to be checked */ ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY || key->type == BTRFS_DIR_INDEX_KEY || key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_EXTENT_DATA_KEY);
@@ -1782,6 +1783,39 @@ static int check_inode_ref(struct extent_buffer *leaf, return 0; } +static int check_inode_extref(struct extent_buffer *leaf, + struct btrfs_key *key, struct btrfs_key *prev_key, + int slot) +{ + unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); + unsigned long end = ptr + btrfs_item_size(leaf, slot); + + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) + return -EUCLEAN; + + while (ptr < end) { + struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr; + u16 namelen; + + if (unlikely(ptr + sizeof(*extref) > end)) { + inode_ref_err(leaf, slot, + "inode extref overflow, ptr %lu end %lu inode_extref size %zu", + ptr, end, sizeof(*extref)); + return -EUCLEAN; + } + + namelen = btrfs_inode_extref_name_len(leaf, extref); + if (unlikely(ptr + sizeof(*extref) + namelen > end)) { + inode_ref_err(leaf, slot, + "inode extref overflow, ptr %lu end %lu namelen %u", + ptr, end, namelen); + return -EUCLEAN; + } + ptr += sizeof(*extref) + namelen; + } + return 0; +} + static int check_raid_stripe_extent(const struct extent_buffer *leaf, const struct btrfs_key *key, int slot) {
@@ -1893,6 +1927,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_INODE_REF_KEY: ret = check_inode_ref(leaf, key, prev_key, slot); break; + case BTRFS_INODE_EXTREF_KEY: + ret = check_inode_extref(leaf, key, prev_key, slot); + break; case BTRFS_BLOCK_GROUP_ITEM_KEY: ret = check_block_group_item(leaf, key, slot); break;

From ac9affd89949cb7680e29fdb3f5d6d41e4997cd6 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 10 Sep 2025 15:11:41 +0100
Subject: [PATCH 2573/3206] btrfs: print-tree: print missing fields for inode items

We are not dumping a lot of inode item fields that are useful for debugging whenever we dump a leaf (a log replay failure, for example), so add them and make it as close as possible to the print tree implementation in btrfs-progs (things like converting timespecs to human-readable dates and converting flags to strings are missing since they are not so practical to do in the kernel).
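[ Illustration, not part of the patch: what a user-space tool like
  btrfs-progs can do with the raw seconds/nanoseconds, using libc
  facilities the kernel printer deliberately avoids. A minimal sketch,
  assuming the values were already read out of the inode item. ]

#include <stdio.h>
#include <time.h>

static void print_timespec_human(long long secs, unsigned int nsecs)
{
        char buf[64];
        const time_t t = (time_t)secs;
        struct tm tm;

        localtime_r(&t, &tm);
        strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &tm);
        printf("\t\tatime %lld.%u (%s)\n", secs, nsecs, buf);
}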
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 37 +++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 74e38da9bd39cd..ce89974f8fd4fb 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -227,6 +227,36 @@ static void print_eb_refs_lock(const struct extent_buffer *eb) #endif } +static void print_timespec(const struct extent_buffer *eb, + struct btrfs_timespec *timespec, + const char *prefix, const char *suffix) +{ + const u64 secs = btrfs_timespec_sec(eb, timespec); + const u32 nsecs = btrfs_timespec_nsec(eb, timespec); + + pr_info("%s%llu.%u%s", prefix, secs, nsecs, suffix); +} + +static void print_inode_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_inode_item *ii = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + + pr_info("\t\tinode generation %llu transid %llu size %llu nbytes %llu\n", + btrfs_inode_generation(eb, ii), btrfs_inode_transid(eb, ii), + btrfs_inode_size(eb, ii), btrfs_inode_nbytes(eb, ii)); + pr_info("\t\tblock group %llu mode %o links %u uid %u gid %u\n", + btrfs_inode_block_group(eb, ii), btrfs_inode_mode(eb, ii), + btrfs_inode_nlink(eb, ii), btrfs_inode_uid(eb, ii), + btrfs_inode_gid(eb, ii)); + pr_info("\t\trdev %llu sequence %llu flags 0x%llx\n", + btrfs_inode_rdev(eb, ii), btrfs_inode_sequence(eb, ii), + btrfs_inode_flags(eb, ii)); + print_timespec(eb, &ii->atime, "\t\tatime ", "\n"); + print_timespec(eb, &ii->ctime, "\t\tctime ", "\n"); + print_timespec(eb, &ii->mtime, "\t\tmtime ", "\n"); + print_timespec(eb, &ii->otime, "\t\totime ", "\n"); +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; @@ -234,7 +264,6 @@ void btrfs_print_leaf(const struct extent_buffer *l) u32 type, nr; struct btrfs_root_item *ri; struct btrfs_dir_item *di; - struct btrfs_inode_item *ii; struct btrfs_block_group_item *bi; struct btrfs_file_extent_item *fi; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; struct btrfs_dev_extent *dev_extent; struct btrfs_key key; @@ -262,11 +291,7 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_item_offset(l, i), btrfs_item_size(l, i)); switch (type) { case BTRFS_INODE_ITEM_KEY: - ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); - pr_info("\t\tinode generation %llu size %llu mode %o\n", - btrfs_inode_generation(l, ii), - btrfs_inode_size(l, ii), - btrfs_inode_mode(l, ii)); + print_inode_item(l, i); break; case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); From 96fb032238d9bede9d3ade60047b8d604fcb4363 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 10 Sep 2025 16:44:15 +0100 Subject: [PATCH 2574/3206] btrfs: print-tree: print more information about dir items Currently we only print the object id component of the location key from a dir item and the flags. We are missing the whole key, transid and the name and data lengths. We are also ignoring the fact that we can have multiple dir item objects encoded in a single item for a BTRFS_DIR_ITEM_KEY key, so what we print is only for the first item. Improve on this by iterating over all dir items and printing the missing information. This is done with the same format as in btrfs-progs; what we miss is printing the names and data, since not only would that require some processing and escaping like in btrfs-progs, but it would also reveal information that may be sensitive, and users may not want to share that in case they get a leaf dumped in dmesg.
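As a worked example of why the iteration is needed (assuming the usual 30-byte btrfs_dir_item header): a 73-byte item can pack two entries, one with name_len 5 and data_len 0 (entry length 30 + 5 = 35) followed by one with name_len 8 and data_len 0 (30 + 8 = 38, and 35 + 38 = 73), so the loop in the diff below advances cur by each entry's length and prints the location key, transid, data_len and name_len for every entry instead of only the first.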
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index ce89974f8fd4fb..df7fe10061ab6a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -257,20 +257,41 @@ static void print_inode_item(const struct extent_buffer *eb, int i) print_timespec(eb, &ii->otime, "\t\totime ", "\n"); } +static void print_dir_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_dir_item *di = btrfs_item_ptr(eb, i, struct btrfs_dir_item); + u32 cur = 0; + + while (cur < size) { + const u32 name_len = btrfs_dir_name_len(eb, di); + const u32 data_len = btrfs_dir_data_len(eb, di); + const u32 len = sizeof(*di) + name_len + data_len; + struct btrfs_key location; + + btrfs_dir_item_key_to_cpu(eb, di, &location); + pr_info("\t\tlocation key (%llu %u %llu) type %d\n", + location.objectid, location.type, location.offset, + btrfs_dir_ftype(eb, di)); + pr_info("\t\ttransid %llu data_len %u name_len %u\n", + btrfs_dir_transid(eb, di), data_len, name_len); + di = (struct btrfs_dir_item *)((char *)di + len); + cur += len; + } +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; int i; u32 type, nr; struct btrfs_root_item *ri; - struct btrfs_dir_item *di; struct btrfs_block_group_item *bi; struct btrfs_file_extent_item *fi; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; struct btrfs_dev_extent *dev_extent; struct btrfs_key key; - struct btrfs_key found_key; if (!l) return; @@ -294,11 +315,7 @@ void btrfs_print_leaf(const struct extent_buffer *l) print_inode_item(l, i); break; case BTRFS_DIR_ITEM_KEY: - di = btrfs_item_ptr(l, i, struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(l, di, &found_key); - pr_info("\t\tdir oid %llu flags %u\n", - found_key.objectid, - btrfs_dir_flags(l, di)); + print_dir_item(l, i); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); From 93f818e62a081c707e6833f3ba2b7fa25c90199c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 10 Sep 2025 17:10:37 +0100 Subject: [PATCH 2575/3206] btrfs: print-tree: print dir items for dir index and xattr keys too Currently we only print the dir items for BTRFS_DIR_ITEM_KEY keys, but we also have dir items for BTRFS_DIR_INDEX_KEY and BTRFS_XATTR_ITEM_KEY keys too. So print them for those keys too. 
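For instance, the inode ref item of a file hard-linked twice in the same directory would now be dumped as (hypothetical values):

		index 2 name_len 8
		index 5 name_len 11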
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index df7fe10061ab6a..5ae611cb3f2e89 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -315,6 +315,8 @@ void btrfs_print_leaf(const struct extent_buffer *l) print_inode_item(l, i); break; case BTRFS_DIR_ITEM_KEY: + case BTRFS_DIR_INDEX_KEY: + case BTRFS_XATTR_ITEM_KEY: print_dir_item(l, i); break; case BTRFS_ROOT_ITEM_KEY: From cee3aa138724fd354acd5947d96a3d295b606899 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 11 Sep 2025 17:36:28 +0100 Subject: [PATCH 2576/3206] btrfs: print-tree: print information about inode ref items Currently we ignore inode ref items, we just print their key, item offset in the leaf and their size, no information about their content like the index number, name length and name. Improve on this by printing the index and name length in the same format as btrfs-progs. Note that we don't print the name, as that would require some processing and escaping like we do in btrfs-progs, and that could expose sensitive information for some users in case they share their dmesg/syslog and it contains a leaf dump. So for now leave names out. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5ae611cb3f2e89..7a7156257d5930 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -280,6 +280,23 @@ static void print_dir_item(const struct extent_buffer *eb, int i) } } +static void print_inode_ref_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_inode_ref *ref = btrfs_item_ptr(eb, i, struct btrfs_inode_ref); + u32 cur = 0; + + while (cur < size) { + const u64 index = btrfs_inode_ref_index(eb, ref); + const u32 name_len = btrfs_inode_ref_name_len(eb, ref); + const u32 len = sizeof(*ref) + name_len; + + pr_info("\t\tindex %llu name_len %u\n", index, name_len); + ref = (struct btrfs_inode_ref *)((char *)ref + len); + cur += len; + } +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; @@ -314,6 +331,9 @@ void btrfs_print_leaf(const struct extent_buffer *l) case BTRFS_INODE_ITEM_KEY: print_inode_item(l, i); break; + case BTRFS_INODE_REF_KEY: + print_inode_ref_item(l, i); + break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: case BTRFS_XATTR_ITEM_KEY: From 7317555f4553aaec9dc732e24c88b8258b3c9deb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 12 Sep 2025 17:25:12 +0100 Subject: [PATCH 2577/3206] btrfs: print-tree: print information about inode extref items Currently we ignore inode extref items, we just print their key, item offset in the leaf and their size, no information about their content like the index number, parent inode, name length and name. Improve on this by printing the index, parent and name length in the same format as btrfs-progs. Note that we don't print the name, as that would require some processing and escaping like we do in btrfs-progs, and that could expose sensitive information for some users in case they share their dmesg/syslog and it contains a leaf dump. So for now leave names out. 
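A dump of an inode extref item would then contain, per packed entry (hypothetical values):

		index 3 parent 256 name_len 7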
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 7a7156257d5930..0a2a6e82a3bfc7 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -297,6 +297,26 @@ static void print_inode_ref_item(const struct extent_buffer *eb, int i) } } +static void print_inode_extref_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_inode_extref *extref; + u32 cur = 0; + + extref = btrfs_item_ptr(eb, i, struct btrfs_inode_extref); + while (cur < size) { + const u64 index = btrfs_inode_extref_index(eb, extref); + const u32 name_len = btrfs_inode_extref_name_len(eb, extref); + const u64 parent = btrfs_inode_extref_parent(eb, extref); + const u32 len = sizeof(*extref) + name_len; + + pr_info("\t\tindex %llu parent %llu name_len %u\n", + index, parent, name_len); + extref = (struct btrfs_inode_extref *)((char *)extref + len); + cur += len; + } +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; @@ -334,6 +354,9 @@ void btrfs_print_leaf(const struct extent_buffer *l) case BTRFS_INODE_REF_KEY: print_inode_ref_item(l, i); break; + case BTRFS_INODE_EXTREF_KEY: + print_inode_extref_item(l, i); + break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: case BTRFS_XATTR_ITEM_KEY: From 7d2197b5dc0cd02e9758fd4dd0d1e7e3707dfbe7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 12 Sep 2025 17:43:46 +0100 Subject: [PATCH 2578/3206] btrfs: print-tree: print information about dir log items We currently don't print information about dir log items (other than the key, item offset and item size), which is useful to look at when debugging problems with a log tree. So print their specific information (currently they only have an end index number) in a format similar to btrfs-progs. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0a2a6e82a3bfc7..a66aced1d29c6f 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -317,6 +317,14 @@ static void print_inode_extref_item(const struct extent_buffer *eb, int i) } } +static void print_dir_log_index_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_dir_log_item *dlog; + + dlog = btrfs_item_ptr(eb, i, struct btrfs_dir_log_item); + pr_info("\t\tdir log end %llu\n", btrfs_dir_log_end(eb, dlog)); +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; @@ -362,6 +370,9 @@ void btrfs_print_leaf(const struct extent_buffer *l) case BTRFS_XATTR_ITEM_KEY: print_dir_item(l, i); break; + case BTRFS_DIR_LOG_INDEX_KEY: + print_dir_log_index_item(l, i); + break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); pr_info("\t\troot data bytenr %llu refs %u\n", From 4dc1c3d0ae6fbfaf8ce5727d1ae098275eb276c6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Sep 2025 11:09:10 +0100 Subject: [PATCH 2579/3206] btrfs: print-tree: print range information for extent csum items Currently we don't print anything for extent csum items other than the generic line with the key, item offset and item size. 
While one can still determine the range the extent csum covers by doing a few simple computations, it makes it more time consuming to analyse a leaf dump. So add a line that prints information about the range covered by the checksum using the same format as btrfs-progs. This is useful when debugging log tree issues since we log extent csum items for new extents. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index a66aced1d29c6f..c2898fa6d4ba44 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -325,6 +325,18 @@ static void print_dir_log_index_item(const struct extent_buffer *eb, int i) pr_info("\t\tdir log end %llu\n", btrfs_dir_log_end(eb, dlog)); } +static void print_extent_csum(const struct extent_buffer *eb, int i) +{ + const struct btrfs_fs_info *fs_info = eb->fs_info; + const u32 size = btrfs_item_size(eb, i); + const u32 csum_bytes = (size / fs_info->csum_size) * fs_info->sectorsize; + struct btrfs_key key; + + btrfs_item_key_to_cpu(eb, &key, i); + pr_info("\t\trange start %llu end %llu length %u\n", + key.offset, key.offset + csum_bytes, csum_bytes); +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; @@ -373,6 +385,9 @@ void btrfs_print_leaf(const struct extent_buffer *l) case BTRFS_DIR_LOG_INDEX_KEY: print_dir_log_index_item(l, i); break; + case BTRFS_EXTENT_CSUM_KEY: + print_extent_csum(l, i); + break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); pr_info("\t\troot data bytenr %llu refs %u\n", From c1b9a4782bc6b9169be6b721b161781696cbd1a7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Sep 2025 11:37:34 +0100 Subject: [PATCH 2580/3206] btrfs: print-tree: print correct inline extent data size We are advertising the ram_bytes of an inline extent as its data size, but that is not true for compressed extents. The ram_bytes corresponds to the uncompressed data size while the data size (compressed data) is given by btrfs_file_extent_inline_item_len(). So fix this and print both values in the same format as in btrfs-progs. 
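For instance (hypothetical numbers), a 4096 byte file whose data compressed to a 413 byte inline extent is now printed as "inline extent data size 413 ram_bytes 4096", where the old code printed "inline extent data size 4096" and hid the compressed size entirely.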
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index c2898fa6d4ba44..835b41874a7aa8 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -6,6 +6,7 @@ #include "messages.h" #include "ctree.h" #include "disk-io.h" +#include "file-item.h" #include "print-tree.h" #include "accessors.h" #include "tree-checker.h" @@ -423,8 +424,9 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_file_extent_type(l, fi)); if (btrfs_file_extent_type(l, fi) == BTRFS_FILE_EXTENT_INLINE) { - pr_info("\t\tinline extent data size %llu\n", - btrfs_file_extent_ram_bytes(l, fi)); + pr_info("\t\tinline extent data size %u ram_bytes %llu\n", + btrfs_file_extent_inline_item_len(l, i), + btrfs_file_extent_ram_bytes(l, fi)); break; } pr_info("\t\textent data disk bytenr %llu nr %llu\n", From caac17073760d30a879517145f033a72eee507ae Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Sep 2025 11:54:49 +0100 Subject: [PATCH 2581/3206] btrfs: print-tree: print compression type for file extent items We are not printing anything about the compression type, so add that useful information in the same format as btrfs-progs. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 835b41874a7aa8..417e3860cd251e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -424,9 +424,10 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_file_extent_type(l, fi)); if (btrfs_file_extent_type(l, fi) == BTRFS_FILE_EXTENT_INLINE) { - pr_info("\t\tinline extent data size %u ram_bytes %llu\n", + pr_info("\t\tinline extent data size %u ram_bytes %llu compression %hhu\n", btrfs_file_extent_inline_item_len(l, i), - btrfs_file_extent_ram_bytes(l, fi)); + btrfs_file_extent_ram_bytes(l, fi), + btrfs_file_extent_compression(l, fi)); break; } pr_info("\t\textent data disk bytenr %llu nr %llu\n", @@ -436,6 +437,8 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_file_extent_offset(l, fi), btrfs_file_extent_num_bytes(l, fi), btrfs_file_extent_ram_bytes(l, fi)); + pr_info("\t\textent compression %hhu\n", + btrfs_file_extent_compression(l, fi)); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, From 00b7eaaaa5ac1d686f8f5edc2f999710304236dc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Sep 2025 15:23:43 +0100 Subject: [PATCH 2582/3206] btrfs: print-tree: move code for processing file extent item into helper The code for processing file extent items is quite large and it's better to have it in a dedicated helper rather than in a huge switch statement, just like we do in btrfs-progs. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 52 ++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 417e3860cd251e..02ad18abefb9f7 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -338,6 +338,34 @@ static void print_extent_csum(const struct extent_buffer *eb, int i) key.offset, key.offset + csum_bytes, csum_bytes); } +static void print_file_extent_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + pr_info("\t\tgeneration %llu type %hhu\n", + btrfs_file_extent_generation(eb, fi), + btrfs_file_extent_type(eb, fi)); + + if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE) { + pr_info("\t\tinline extent data size %u ram_bytes %llu compression %hhu\n", + btrfs_file_extent_inline_item_len(eb, i), + btrfs_file_extent_ram_bytes(eb, fi), + btrfs_file_extent_compression(eb, fi)); + return; + } + + pr_info("\t\textent data disk bytenr %llu nr %llu\n", + btrfs_file_extent_disk_bytenr(eb, fi), + btrfs_file_extent_disk_num_bytes(eb, fi)); + pr_info("\t\textent data offset %llu nr %llu ram %llu\n", + btrfs_file_extent_offset(eb, fi), + btrfs_file_extent_num_bytes(eb, fi), + btrfs_file_extent_ram_bytes(eb, fi)); + pr_info("\t\textent compression %hhu\n", + btrfs_file_extent_compression(eb, fi)); +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; @@ -345,7 +373,6 @@ void btrfs_print_leaf(const struct extent_buffer *l) u32 type, nr; struct btrfs_root_item *ri; struct btrfs_block_group_item *bi; - struct btrfs_file_extent_item *fi; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; struct btrfs_dev_extent *dev_extent; @@ -417,28 +444,7 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_shared_data_ref_count(l, sref)); break; case BTRFS_EXTENT_DATA_KEY: - fi = btrfs_item_ptr(l, i, - struct btrfs_file_extent_item); - pr_info("\t\tgeneration %llu type %hhu\n", - btrfs_file_extent_generation(l, fi), - btrfs_file_extent_type(l, fi)); - if (btrfs_file_extent_type(l, fi) == - BTRFS_FILE_EXTENT_INLINE) { - pr_info("\t\tinline extent data size %u ram_bytes %llu compression %hhu\n", - btrfs_file_extent_inline_item_len(l, i), - btrfs_file_extent_ram_bytes(l, fi), - btrfs_file_extent_compression(l, fi)); - break; - } - pr_info("\t\textent data disk bytenr %llu nr %llu\n", - btrfs_file_extent_disk_bytenr(l, fi), - btrfs_file_extent_disk_num_bytes(l, fi)); - pr_info("\t\textent data offset %llu nr %llu ram %llu\n", - btrfs_file_extent_offset(l, fi), - btrfs_file_extent_num_bytes(l, fi), - btrfs_file_extent_ram_bytes(l, fi)); - pr_info("\t\textent compression %hhu\n", - btrfs_file_extent_compression(l, fi)); + print_file_extent_item(l, i); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, From 26baec69ac85d6ce899ae9c43827250551e494f2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Sep 2025 16:23:04 +0100 Subject: [PATCH 2583/3206] btrfs: print-tree: print key types as human readable strings Looking at a leaf dump from the kernel's print-tree implementation is not so friendly to analyze since key types are printed as numbers. Improve on this by printing key types as strings that are a diminutive of the macro names for key types, just like we do in btrfs-progs. 
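As an example of the readability gain (hypothetical item, format per the diff that follows), a leaf dump line such as:

	item 0 key (256 1 0) itemoff 16123 itemsize 160

becomes:

	item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160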
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 68 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 02ad18abefb9f7..62b993fae54ff3 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -13,6 +13,12 @@ #include "volumes.h" #include "raid-stripe-tree.h" +/* + * Large enough buffer size for the stringification of any key type yet short + * enough to use the stack and avoid allocations. + */ +#define KEY_TYPE_BUF_SIZE 32 + struct root_name_map { u64 id; const char *name; @@ -366,6 +372,60 @@ static void print_file_extent_item(const struct extent_buffer *eb, int i) btrfs_file_extent_compression(eb, fi)); } +static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size) +{ + static const char *key_to_str[256] = { + [BTRFS_INODE_ITEM_KEY] = "INODE_ITEM", + [BTRFS_INODE_REF_KEY] = "INODE_REF", + [BTRFS_INODE_EXTREF_KEY] = "INODE_EXTREF", + [BTRFS_DIR_ITEM_KEY] = "DIR_ITEM", + [BTRFS_DIR_INDEX_KEY] = "DIR_INDEX", + [BTRFS_DIR_LOG_ITEM_KEY] = "DIR_LOG_ITEM", + [BTRFS_DIR_LOG_INDEX_KEY] = "DIR_LOG_INDEX", + [BTRFS_XATTR_ITEM_KEY] = "XATTR_ITEM", + [BTRFS_VERITY_DESC_ITEM_KEY] = "VERITY_DESC_ITEM", + [BTRFS_VERITY_MERKLE_ITEM_KEY] = "VERITY_MERKLE_ITEM", + [BTRFS_ORPHAN_ITEM_KEY] = "ORPHAN_ITEM", + [BTRFS_ROOT_ITEM_KEY] = "ROOT_ITEM", + [BTRFS_ROOT_REF_KEY] = "ROOT_REF", + [BTRFS_ROOT_BACKREF_KEY] = "ROOT_BACKREF", + [BTRFS_EXTENT_ITEM_KEY] = "EXTENT_ITEM", + [BTRFS_METADATA_ITEM_KEY] = "METADATA_ITEM", + [BTRFS_TREE_BLOCK_REF_KEY] = "TREE_BLOCK_REF", + [BTRFS_SHARED_BLOCK_REF_KEY] = "SHARED_BLOCK_REF", + [BTRFS_EXTENT_DATA_REF_KEY] = "EXTENT_DATA_REF", + [BTRFS_SHARED_DATA_REF_KEY] = "SHARED_DATA_REF", + [BTRFS_EXTENT_OWNER_REF_KEY] = "EXTENT_OWNER_REF", + [BTRFS_EXTENT_CSUM_KEY] = "EXTENT_CSUM", + [BTRFS_EXTENT_DATA_KEY] = "EXTENT_DATA", + [BTRFS_BLOCK_GROUP_ITEM_KEY] = "BLOCK_GROUP_ITEM", + [BTRFS_FREE_SPACE_INFO_KEY] = "FREE_SPACE_INFO", + [BTRFS_FREE_SPACE_EXTENT_KEY] = "FREE_SPACE_EXTENT", + [BTRFS_FREE_SPACE_BITMAP_KEY] = "FREE_SPACE_BITMAP", + [BTRFS_CHUNK_ITEM_KEY] = "CHUNK_ITEM", + [BTRFS_DEV_ITEM_KEY] = "DEV_ITEM", + [BTRFS_DEV_EXTENT_KEY] = "DEV_EXTENT", + [BTRFS_TEMPORARY_ITEM_KEY] = "TEMPORARY_ITEM", + [BTRFS_DEV_REPLACE_KEY] = "DEV_REPLACE", + [BTRFS_STRING_ITEM_KEY] = "STRING_ITEM", + [BTRFS_QGROUP_STATUS_KEY] = "QGROUP_STATUS", + [BTRFS_QGROUP_RELATION_KEY] = "QGROUP_RELATION", + [BTRFS_QGROUP_INFO_KEY] = "QGROUP_INFO", + [BTRFS_QGROUP_LIMIT_KEY] = "QGROUP_LIMIT", + [BTRFS_PERSISTENT_ITEM_KEY] = "PERSISTENT_ITEM", + [BTRFS_UUID_KEY_SUBVOL] = "UUID_KEY_SUBVOL", + [BTRFS_UUID_KEY_RECEIVED_SUBVOL] = "UUID_KEY_RECEIVED_SUBVOL", + [BTRFS_RAID_STRIPE_KEY] = "RAID_STRIPE", + }; + + if (key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID) + scnprintf(buf, buf_size, "UNTYPED"); + else if (key_to_str[key->type]) + scnprintf(buf, buf_size, "%s", key_to_str[key->type]); + else + scnprintf(buf, buf_size, "UNKNOWN.%d", key->type); +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; @@ -390,10 +450,14 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_leaf_free_space(l), btrfs_header_owner(l)); print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { + char key_buf[KEY_TYPE_BUF_SIZE]; + btrfs_item_key_to_cpu(l, &key, i); type = key.type; - pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize
%d\n", - i, key.objectid, type, key.offset, + key_type_string(&key, key_buf, KEY_TYPE_BUF_SIZE); + + pr_info("\titem %d key (%llu %s %llu) itemoff %d itemsize %d\n", + i, key.objectid, key_buf, key.offset, btrfs_item_offset(l, i), btrfs_item_size(l, i)); switch (type) { case BTRFS_INODE_ITEM_KEY: From 6a9e1d1a65fef95fcf3097a75ad399f38c7ecb64 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Sep 2025 12:13:39 +0100 Subject: [PATCH 2584/3206] btrfs: store and use node size in local variable in check_eb_alignment() Instead of dereferencing fs_info every time we need to access the node size, store in a local variable to make the code less verbose and avoid a line split too. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ca7174fa024056..681f4f2e44197a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3226,29 +3226,30 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, */ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) { + const u32 nodesize = fs_info->nodesize; + if (!IS_ALIGNED(start, fs_info->sectorsize)) { btrfs_err(fs_info, "bad tree block start %llu", start); return true; } - if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) { + if (nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize)) { btrfs_err(fs_info, "tree block is not nodesize aligned, start %llu nodesize %u", - start, fs_info->nodesize); + start, nodesize); return true; } - if (fs_info->nodesize >= PAGE_SIZE && - !PAGE_ALIGNED(start)) { + if (nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start)) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", - start, fs_info->nodesize); + start, nodesize); return true; } - if (!IS_ALIGNED(start, fs_info->nodesize) && + if (!IS_ALIGNED(start, nodesize) && !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { btrfs_warn(fs_info, "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", - start, fs_info->nodesize); + start, nodesize); } return false; } From 8f0534ec96e3cbba2ecefe560a482382198c044f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Sep 2025 15:50:49 +0100 Subject: [PATCH 2585/3206] btrfs: mark extent buffer alignment checks as unlikely We are not expecting to ever fail the extent buffer alignment checks, so mark them as unlikely to allow the compiler to potentially generate more optimized code. 
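For reference, likely()/unlikely() are plain branch-prediction hints; simplified from include/linux/compiler.h they boil down to the sketch below (user-space demo, hypothetical surrounding code):

#include <stdio.h>

/* Simplified form of the kernel's hint macros (include/linux/compiler.h). */
#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

int main(void)
{
	int err = 0;

	if (unlikely(err)) {	/* compiler lays the error path out of line */
		fprintf(stderr, "error path\n");
		return 1;
	}
	printf("fast path\n");	/* fall-through path stays hot */
	return 0;
}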
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 681f4f2e44197a..5f0cce1bb7c6f8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3228,25 +3228,25 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) { const u32 nodesize = fs_info->nodesize; - if (!IS_ALIGNED(start, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) { btrfs_err(fs_info, "bad tree block start %llu", start); return true; } - if (nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize)) { + if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) { btrfs_err(fs_info, "tree block is not nodesize aligned, start %llu nodesize %u", start, nodesize); return true; } - if (nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start)) { + if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", start, nodesize); return true; } - if (!IS_ALIGNED(start, nodesize) && - !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { + if (unlikely(!IS_ALIGNED(start, nodesize) && + !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) { btrfs_warn(fs_info, "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", start, nodesize); From b0e30e373e3752d4a59cb0f2b6a4aa8b3b54f041 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Sep 2025 16:09:50 +0100 Subject: [PATCH 2586/3206] btrfs: mark as unlikely not uptodate extent buffer checks when navigating btrees We expect that after attempting to read an extent buffer we had no errors and therefore the extent buffer is up to date, so mark the checks for a not up to date extent buffer as unlikely and allow the compiler to potentially generate better code. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6f9465d4ce54ce..f6d2a4a4b9eb82 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -844,7 +844,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, &check); if (IS_ERR(eb)) return eb; - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); return ERR_PTR(-EIO); } @@ -1571,7 +1571,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!extent_buffer_uptodate(tmp)) { + if (unlikely(!extent_buffer_uptodate(tmp))) { ret = -EIO; goto out; } @@ -1752,7 +1752,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * The root may have failed to write out at some point, and thus is no * longer valid, return an error in this case.
*/ - if (!extent_buffer_uptodate(b)) { + if (unlikely(!extent_buffer_uptodate(b))) { if (root_lock) btrfs_tree_unlock_rw(b, root_lock); free_extent_buffer(b); @@ -2260,7 +2260,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, again: b = btrfs_get_old_root(root, time_seq); - if (!b) { + if (unlikely(!b)) { ret = -EIO; goto done; } From 5afe85b771eefe91891cca332cb916f8da6d48e1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Sep 2025 16:33:00 +0100 Subject: [PATCH 2587/3206] btrfs: mark leaf space and overflow checks as unlikely on insert and extension We have several sanity checks when inserting or extending items in a btree that verify we didn't overflow the leaf or access a slot beyond the last one. These are cases that are never expected to be hit so mark them as unlikely, allowing the compiler to potentially generate better code. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f6d2a4a4b9eb82..dc185322362b37 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3086,7 +3086,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf) int ret; ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_crit(fs_info, "leaf free space ret %d, leaf data size %lu, used %d nritems %d", ret, @@ -4075,7 +4075,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_set_item_size(leaf, slot, new_size); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4108,7 +4108,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, old_data = btrfs_item_data_end(leaf, slot); BUG_ON(slot < 0); - if (slot >= nritems) { + if (unlikely(slot >= nritems)) { btrfs_print_leaf(leaf); btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", slot, nritems); @@ -4135,7 +4135,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, btrfs_set_item_size(leaf, slot, old_size + data_size); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4183,7 +4183,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, data_end = leaf_data_end(leaf); total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); - if (btrfs_leaf_free_space(leaf) < total_size) { + if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) { btrfs_print_leaf(leaf); btrfs_crit(fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(leaf)); @@ -4193,7 +4193,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, if (slot != nritems) { unsigned int old_data = btrfs_item_data_end(leaf, slot); - if (old_data < data_end) { + if (unlikely(old_data < data_end)) { btrfs_print_leaf(leaf); btrfs_crit(fs_info, "item at slot %d with data offset %u beyond data end of leaf %u", @@ -4232,7 +4232,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(leaf, nritems + batch->nr); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } From 2d83ed6c6c4607b42ee7927e92a9d2fa31d6f30b Mon Sep 
17 00:00:00 2001 From: Qu Wenruo Date: Thu, 18 Sep 2025 08:40:45 +0930 Subject: [PATCH 2588/3206] btrfs: return any hit error from extent_writepage_io() Since the introduction of bs < ps support, extent_writepage_io() will submit multiple blocks inside the folio. But if we hit an error submitting one sector while the next sector can still be submitted successfully, the function extent_writepage_io() will still return 0. This makes btrfs silently ignore the error without setting the error flag for the filemap. Fix it by recording the first error hit, and always returning that value. Fixes: 8bf334beb349 ("btrfs: fix double accounting race when extent_writepage_io() failed") Reviewed-by: Daniel Vacek Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5f0cce1bb7c6f8..0782533aad5174 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1679,7 +1679,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; bool submitted_io = false; - bool error = false; + int found_error = 0; const u64 folio_start = folio_pos(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; @@ -1743,7 +1743,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, */ btrfs_mark_ordered_io_finished(inode, folio, cur, fs_info->sectorsize, false); - error = true; + if (!found_error) + found_error = ret; continue; } submitted_io = true; @@ -1760,11 +1761,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * If we hit any error, the corresponding sector will have its dirty * flag cleared and writeback finished, thus no need to handle the error case. */ - if (!submitted_io && !error) { + if (!submitted_io && !found_error) { btrfs_folio_set_writeback(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } - return ret; + return found_error; } /* From a35b3dd59bf6fe2bf3c2777bdde623d722caa68c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 18 Sep 2025 12:58:53 +0100 Subject: [PATCH 2589/3206] btrfs: fix comment about nbytes increase at replay_one_extent() The comment is wrong about the part where it says a prealloc extent does not contribute to an inode's nbytes - it does. Only holes don't contribute and that's what we are checking for, as prealloc extents always have a disk_bytenr different from 0. So fix the comment and re-organize the code to not set nbytes twice and set it to the extent item's number of bytes only if it doesn't represent a hole - in case it's a hole we have already initialized nbytes to 0 when we declared it. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 144b1272536506..96492080fed85a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -740,15 +740,10 @@ static noinline int replay_one_extent(struct walk_control *wc) if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item); - extent_end = start + nbytes; - - /* - * We don't add to the inodes nbytes if we are prealloc or a - * hole.
- */ - if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) - nbytes = 0; + extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item); + /* Holes don't take up space. */ + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0) + nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size = btrfs_file_extent_ram_bytes(wc->log_leaf, item); nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item); From b7ff7b0d76e5478c319ab2066b287e156968231f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 18 Sep 2025 13:29:16 +0100 Subject: [PATCH 2590/3206] btrfs: simplify inline extent end calculation at replay_one_extent() There is no need to store the extent's ram_bytes in two variables; furthermore, one of them, named 'size', is used only for the extent's end offset calculation. So remove the 'size' variable and use 'nbytes' only. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 96492080fed85a..ac7805d40ab24a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -732,7 +732,6 @@ static noinline int replay_one_extent(struct walk_control *wc) struct btrfs_key ins; struct btrfs_file_extent_item *item; struct btrfs_inode *inode = NULL; - unsigned long size; int ret = 0; item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item); @@ -745,10 +744,8 @@ static noinline int replay_one_extent(struct walk_control *wc) if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0) nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_ram_bytes(wc->log_leaf, item); nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item); - extent_end = ALIGN(start + size, - fs_info->sectorsize); + extent_end = ALIGN(start + nbytes, fs_info->sectorsize); } else { btrfs_abort_log_replay(wc, -EUCLEAN, "unexpected extent type=%d root=%llu inode=%llu offset=%llu", From f07575bab632af8d665c8cc95be0e83ca5d3bd80 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 19 Sep 2025 09:16:20 +0100 Subject: [PATCH 2591/3206] btrfs: make the rule checking more readable for should_cow_block() The way the rule checks are organized in should_cow_block() is quite hard to read. We have a single if statement that returns 0 (false) and checks several conditions, with one of them being a negated compound condition that is particularly hard to reason about immediately. Improve on this by using multiple if statements, each checking a single condition and returning immediately. Also change the return type from an integer to a boolean, since all we need is to return true or false. At least on x86_64 with Debian's gcc 14.2.0-19, this also reduces the object code size by 64 bytes.
Before this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1913327 161567 15592 2090486 1fe5f6 fs/btrfs/btrfs.ko After this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1913263 161567 15592 2090422 1fe5b6 fs/btrfs/btrfs.ko Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dc185322362b37..13dc44e9076246 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -613,15 +613,12 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, return ret; } -static inline int should_cow_block(const struct btrfs_trans_handle *trans, - const struct btrfs_root *root, - const struct extent_buffer *buf) +static inline bool should_cow_block(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { if (btrfs_is_testing(root->fs_info)) - return 0; - - /* Ensure we can see the FORCE_COW bit */ - smp_mb__before_atomic(); + return false; /* * We do not need to cow a block if @@ -634,13 +631,25 @@ static inline int should_cow_block(const struct btrfs_trans_handle *trans, * after we've finished copying src root, we must COW the shared * block to ensure the metadata consistency. */ - if (btrfs_header_generation(buf) == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && - !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && - !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) - return 0; - return 1; + + if (btrfs_header_generation(buf) != trans->transid) + return true; + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) + return true; + + /* Ensure we can see the FORCE_COW bit. */ + smp_mb__before_atomic(); + if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) + return true; + + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return false; + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) + return true; + + return false; } /* From db524fd9802f7ec3281a7f1fdae66c67c9525ba3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 19 Sep 2025 09:55:12 +0100 Subject: [PATCH 2592/3206] btrfs: annotate btrfs_is_testing() as unlikely and make it return bool We can annotate btrfs_is_testing() as unlikely since that's the most expected scenario and it's desirable for the compiler to optimize for the case we are not running the self tests. So add the annotation to btrfs_is_testing() and while at it also make it return bool instead of int. Also make two of the existing callers use btrfs_is_testing() directly instead of storing its result in a local variable. On x86_64 with Debian's gcc 14.2.0-19 this resulted in a very tiny object code reduction. 
Before this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1913263 161567 15592 2090422 1fe5b6 fs/btrfs/btrfs.ko After this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1913257 161567 15592 2090416 1fe5b0 fs/btrfs/btrfs.ko Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 7 +++---- fs/btrfs/disk-io.c | 3 +-- fs/btrfs/fs.h | 8 ++++---- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6170803d8a1b44..481802efaa1436 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1251,7 +1251,6 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; - bool testing = btrfs_is_testing(fs_info); spin_lock(&delayed_refs->lock); while (true) { @@ -1281,7 +1280,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (!testing && pin_bytes) { + if (!btrfs_is_testing(fs_info) && pin_bytes) { struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, head->bytenr); @@ -1312,14 +1311,14 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) btrfs_error_unpin_extent_range(fs_info, head->bytenr, head->bytenr + head->num_bytes - 1); } - if (!testing) + if (!btrfs_is_testing(fs_info)) btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } - if (!testing) + if (!btrfs_is_testing(fs_info)) btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9e82e062bd588..5c57f523f4498d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -639,7 +639,6 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, u64 objectid, gfp_t flags) { struct btrfs_root *root; - bool dummy = btrfs_is_testing(fs_info); root = kzalloc(sizeof(*root), flags); if (!root) @@ -696,7 +695,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, root->log_transid_committed = -1; btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; - if (!dummy) { + if (!btrfs_is_testing(fs_info)) { btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES); btrfs_extent_io_tree_init(fs_info, &root->log_csum_range, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index ba63a40b2bde1f..26b91edf239efc 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -1126,9 +1126,9 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) #define EXPORT_FOR_TESTS -static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) +static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info) { - return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + return unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)); } void btrfs_test_destroy_inode(struct inode *inode); @@ -1137,9 +1137,9 @@ void btrfs_test_destroy_inode(struct inode *inode); #define EXPORT_FOR_TESTS static -static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) +static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info) { - return 0; + return false; } #endif From 62701f419027711b9c18a1fc1c20714ceeea9e81 Mon Sep 17 00:00:00 
2001 From: Filipe Manana Date: Fri, 19 Sep 2025 17:46:05 +0100 Subject: [PATCH 2593/3206] btrfs: remove pointless key offset setup in create_pending_snapshot() There's no point in setting the key's offset to (u64)-1 since we never use it before setting it to the current transaction's ID. So remove the assignment of (u64)-1 to the key's offset and move the remainder of the key initialization close to where it's used. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d04fa6ce83901e..febf456a9ab099 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1694,10 +1694,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto clear_skip_qgroup; } - key.objectid = objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; trans->bytes_reserved = trans->block_rsv->reserved; @@ -1810,6 +1806,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = trans->transid; ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); From 7b26da407420e5054e3f06c5d13271697add9423 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 19 Sep 2025 14:33:23 +0930 Subject: [PATCH 2594/3206] btrfs: fix the incorrect max_bytes value for find_lock_delalloc_range() [BUG] With my local branch to enable bs > ps support for btrfs, sometimes I hit the following ASSERT() inside submit_one_sector(): ASSERT(block_start != EXTENT_MAP_HOLE); Please note that it's not yet possible to hit this ASSERT() in the wild, as it requires btrfs bs > ps support, which is not even in the development branch. But on the other hand, there is also a very low chance to hit the above ASSERT() with bs < ps cases, so this is an existing bug affecting not only the incoming bs > ps support but also the existing bs < ps support. [CAUSE] Firstly that ASSERT() means we're trying to submit a dirty block but without a real extent map nor ordered extent map backing it. Furthermore with extra debugging, the folio triggering such ASSERT() is always larger than the fs block size in my bs > ps case. (8K block size, 4K page size) After some more debugging, the ASSERT() is triggered by the following sequence: extent_writepage() | We got a 32K folio (4 fs blocks) at file offset 0, and the fs block | size is 8K, page size is 4K. | And there is another 8K folio at file offset 32K, which is also | dirty. | So the filemap layout looks like the following: | | "||" is the folio boundary in the filemap. | "//" is the dirty range. | | 0 8K 16K 24K 32K 40K | |////////| |//////////////////////||////////| | |- writepage_delalloc() | |- find_lock_delalloc_range() for [0, 8K) | | Now range [0, 8K) is properly locked. | | | |- find_lock_delalloc_range() for [16K, 40K) | | |- btrfs_find_delalloc_range() returned range [16K, 40K) | | |- lock_delalloc_folios() locked folio 0 successfully | | | | | | The filemap range [32K, 40K) got dropped from filemap. | | | | | |- lock_delalloc_folios() failed with -EAGAIN on folio 32K | | | As the folio at 32K is dropped.
| | | | | |- loops = 1; | | |- max_bytes = PAGE_SIZE; | | |- goto again; | | | This will re-do the lookup for dirty delalloc ranges. | | | | | |- btrfs_find_delalloc_range() called with @max_bytes == 4K | | | This is smaller than block size, so | | | btrfs_find_delalloc_range() is unable to return any range. | | \- return false; | | | \- Now only range [0, 8K) has an OE for it, but for dirty range | [16K, 32K) it's dirty without an OE. | This breaks the assumption that writepage_delalloc() will find | and lock all dirty ranges inside the folio. | |- extent_writepage_io() |- submit_one_sector() for [0, 8K) | Succeeded | |- submit_one_sector() for [16K, 24K) Triggering the ASSERT(), as there is no OE, and the original extent map is a hole. Please note that this also exposed the same problem for bs < ps support, e.g. with 64K page size and 4K block size. If we fail to lock a folio and fall back into the "loops = 1;" branch, we will re-do the search using 64K as max_bytes, which may again fail to lock the next folio and exit early without handling all dirty blocks inside the folio. [FIX] Instead of using the fixed size PAGE_SIZE as @max_bytes, use @sectorsize, so that we are guaranteed to find and lock any remaining blocks inside the folio. And since we're here, add an extra ASSERT() before calling btrfs_find_delalloc_range() to make sure @max_bytes is no smaller than a block, to avoid a false negative. Cc: stable@vger.kernel.org # 5.15+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0782533aad5174..2b6027ebf26590 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -393,6 +393,13 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; + + /* + * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can + * return early without handling any dirty ranges. + */ + ASSERT(max_bytes >= fs_info->sectorsize); + found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, max_bytes, &cached_state); if (!found || delalloc_end <= *start || delalloc_start > orig_end) { @@ -423,13 +430,14 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { - /* some of the folios are gone, lets avoid looping by - * shortening the size of the delalloc range we're searching + /* + * Some of the folios are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching. */ btrfs_free_extent_state(cached_state); cached_state = NULL; if (!loops) { - max_bytes = PAGE_SIZE; + max_bytes = fs_info->sectorsize; loops = 1; goto again; } else { From c2ffb1ec1a7cfc754c1d2fe66d317f0aa4c0f1e6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Sep 2025 16:07:57 +0930 Subject: [PATCH 2595/3206] btrfs: prepare compression folio alloc/free for bs > ps cases This includes the following preparation for bs > ps cases: - Always alloc/free the folio directly if bs > ps This adds a new @fs_info parameter for btrfs_alloc_compr_folio(), thus affecting all compression algorithms. For btrfs_free_compr_folio() it needs no parameter for now, as we can use the folio size to skip the caching part.
For now the change is just passing a @fs_info into the function; all the folio size assumptions are still based on page size. - Properly zero the last folio in compress_file_range() Since the compressed folios can be larger than a page, we need to properly zero the whole folio. - Use correct folio size for btrfs_add_compressed_bio_folios() Instead of page size, use the correct folio size. - Use correct folio size/shift for btrfs_compress_filemap_get_folio() As we are no longer using only simple page sized folios. - Use correct folio size for btrfs_decompress() There is an ASSERT() making sure the decompressed range is no larger than a page, which will be triggered for bs > ps cases. - Skip readahead for compressed pages Similar to subpage cases. - Make btrfs_alloc_folio_array() accept a new @order parameter - Add a helper to calculate the minimal folio size All those changes should not affect the existing bs <= ps handling. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 42 ++++++++++++++++++++++++++++++------------ fs/btrfs/compression.h | 2 +- fs/btrfs/extent_io.c | 7 +++++-- fs/btrfs/extent_io.h | 3 ++- fs/btrfs/fs.h | 6 ++++++ fs/btrfs/inode.c | 16 +++++++++------- fs/btrfs/lzo.c | 18 ++++++++++-------- fs/btrfs/zlib.c | 13 +++++++------ fs/btrfs/zstd.c | 15 ++++++++------- 9 files changed, 78 insertions(+), 44 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c865e0f2a7e871..bacad18357b338 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -223,10 +223,14 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co /* * Common wrappers for page allocation from compression wrappers */ -struct folio *btrfs_alloc_compr_folio(void) +struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info) { struct folio *folio = NULL; + /* For bs > ps cases, no cached folio pool for now. */ + if (fs_info->block_min_order) + goto alloc; + spin_lock(&compr_pool.lock); if (compr_pool.count > 0) { folio = list_first_entry(&compr_pool.list, struct folio, lru); @@ -238,13 +242,18 @@ struct folio *btrfs_alloc_compr_folio(void) if (folio) return folio; - return folio_alloc(GFP_NOFS, 0); +alloc: + return folio_alloc(GFP_NOFS, fs_info->block_min_order); } void btrfs_free_compr_folio(struct folio *folio) { bool do_free = false; + /* The folio is from bs > ps fs, no cached pool for now. */ + if (folio_order(folio)) + goto free; + spin_lock(&compr_pool.lock); if (compr_pool.count > compr_pool.thresh) { do_free = true; @@ -257,6 +266,7 @@ void btrfs_free_compr_folio(struct folio *folio) if (!do_free) return; +free: ASSERT(folio_ref_count(folio) == 1); folio_put(folio); } @@ -344,16 +354,19 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio) static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb->bbio.fs_info; struct bio *bio = &cb->bbio.bio; u32 offset = 0; while (offset < cb->compressed_len) { + struct folio *folio; int ret; - u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); + u32 len = min_t(u32, cb->compressed_len - offset, + btrfs_min_folio_size(fs_info)); + folio = cb->compressed_folios[offset >> (PAGE_SHIFT + fs_info->block_min_order)]; /* Maximum compressed extent is smaller than bio size limit.
*/ - ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT], - len, 0); + ret = bio_add_folio(bio, folio, len, 0); ASSERT(ret); offset += len; } @@ -443,6 +456,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (fs_info->sectorsize < PAGE_SIZE) return 0; + /* For bs > ps cases, we don't support readahead for compressed folios for now. */ + if (fs_info->block_min_order) + return 0; + end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; while (cur < compressed_end) { @@ -606,14 +623,15 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) btrfs_free_extent_map(em); - cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info)); cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); if (!cb->compressed_folios) { status = BLK_STS_RESOURCE; goto out_free_bio; } - ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); + ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order, + cb->compressed_folios); if (ret) { status = BLK_STS_RESOURCE; goto out_free_compressed_pages; @@ -1033,12 +1051,12 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, * - compression algo are 0-3 * - the level are bits 4-7 * - * @out_pages is an in/out parameter, holds maximum number of pages to allocate - * and returns number of actually allocated pages + * @out_folios is an in/out parameter, holds maximum number of folios to allocate + * and returns number of actually allocated folios * * @total_in is used to return the number of bytes actually read. It * may be smaller than the input length if we had to exit early because we - * ran out of room in the pages array or because we cross the + * ran out of room in the folios array or because we cross the * max_out threshold. * * @total_out is an in/out parameter, must be set to the input length and will @@ -1093,11 +1111,11 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, int ret; /* - * The full destination page range should not exceed the page size. + * The full destination folio range should not exceed the folio size. * And the @destlen should not exceed sectorsize, as this is only called for * inline file extents, which should not exceed sectorsize. */ - ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); + ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize); workspace = get_workspace(fs_info, type, 0); ret = compression_decompress(type, workspace, data_in, dest_folio, diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 243771ae7d6439..eba188a9e3bb58 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -112,7 +112,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio); int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); -struct folio *btrfs_alloc_compr_folio(void); +struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info); void btrfs_free_compr_folio(struct folio *folio); struct workspace_manager { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2b6027ebf26590..4342d654a95403 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -626,6 +626,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Populate every free slot in a provided array with folios using GFP_NOFS. 
* * @nr_folios: number of folios to allocate + * @order: the order of the folios to be allocated * @folio_array: the array to fill with folios; any existing non-NULL entries in * the array will be skipped * @@ -633,12 +634,13 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * -ENOMEM otherwise, the partially allocated folios would be freed and * the array slots zeroed */ -int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array) +int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, + struct folio **folio_array) { for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) continue; - folio_array[i] = folio_alloc(GFP_NOFS, 0); + folio_array[i] = folio_alloc(GFP_NOFS, order); if (!folio_array[i]) goto error; } @@ -647,6 +649,7 @@ int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array) for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) folio_put(folio_array[i]); + folio_array[i] = NULL; } return -ENOMEM; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 61130786b9a3ad..5fcbfe44218c44 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -366,7 +366,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, bool nofail); -int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array); +int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, + struct folio **folio_array); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 26b91edf239efc..814bbc9417d2a2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -923,6 +923,12 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } +/* Return the minimal folio size of the fs. 
*/ +static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info) +{ + return 1U << (PAGE_SHIFT + fs_info->block_min_order); +} + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5fad6af5794466..9ba0be5d6db17d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -854,6 +854,8 @@ static void compress_file_range(struct btrfs_work *work) struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -864,7 +866,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned int poff; + unsigned int loff; int i; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; @@ -902,8 +904,8 @@ static void compress_file_range(struct btrfs_work *work) actual_end = min_t(u64, i_size, end + 1); again: folios = NULL; - nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); + nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift); /* * we don't want to send crud past the end of i_size through @@ -965,12 +967,12 @@ static void compress_file_range(struct btrfs_work *work) goto mark_incompressible; /* - * Zero the tail end of the last page, as we might be sending it down + * Zero the tail end of the last folio, as we might be sending it down * to disk. */ - poff = offset_in_page(total_compressed); - if (poff) - folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); + loff = (total_compressed & (min_folio_size - 1)); + if (loff) + folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff); /* * Try to create an inline extent. diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 047d90e216f6ae..c5a25fd872bdfc 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -132,13 +132,14 @@ static inline size_t read_compress_length(const char *buf) * * Will allocate new pages when needed. 
*/ -static int copy_compressed_data_to_page(char *compressed_data, +static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, + char *compressed_data, size_t compressed_size, struct folio **out_folios, unsigned long max_nr_folio, - u32 *cur_out, - const u32 sectorsize) + u32 *cur_out) { + const u32 sectorsize = fs_info->sectorsize; u32 sector_bytes_left; u32 orig_out; struct folio *cur_folio; @@ -156,7 +157,7 @@ static int copy_compressed_data_to_page(char *compressed_data, cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(); + cur_folio = btrfs_alloc_compr_folio(fs_info); if (!cur_folio) return -ENOMEM; out_folios[*cur_out / PAGE_SIZE] = cur_folio; @@ -182,7 +183,7 @@ static int copy_compressed_data_to_page(char *compressed_data, cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(); + cur_folio = btrfs_alloc_compr_folio(fs_info); if (!cur_folio) return -ENOMEM; out_folios[*cur_out / PAGE_SIZE] = cur_folio; @@ -217,8 +218,9 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = inode->root->fs_info->sectorsize; + const u32 sectorsize = fs_info->sectorsize; struct address_space *mapping = inode->vfs_inode.i_mapping; struct folio *folio_in = NULL; char *sizes_ptr; @@ -268,9 +270,9 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, goto out; } - ret = copy_compressed_data_to_page(workspace->cbuf, out_len, + ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len, folios, max_nr_folio, - &cur_out, sectorsize); + &cur_out); if (ret < 0) goto out; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index d72566a87afa2c..ccf77a0fa96c98 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -136,6 +136,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; @@ -147,7 +148,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; - const u32 blocksize = inode->root->fs_info->sectorsize; + const u32 blocksize = fs_info->sectorsize; const u64 orig_end = start + len; *out_folios = 0; @@ -156,7 +157,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret = zlib_deflateInit(&workspace->strm, workspace->level); if (unlikely(ret != Z_OK)) { - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zlib compression init failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; @@ -166,7 +167,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto 
out; @@ -224,7 +225,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (unlikely(ret != Z_OK)) { - btrfs_warn(inode->root->fs_info, + btrfs_warn(fs_info, "zlib compression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); @@ -249,7 +250,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -285,7 +286,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index b11a87842cdaef..28e2e99a24639b 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -400,6 +400,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); struct address_space *mapping = inode->vfs_inode.i_mapping; zstd_cstream *stream; @@ -412,7 +413,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; - const u32 blocksize = inode->root->fs_info->sectorsize; + const u32 blocksize = fs_info->sectorsize; unsigned long max_out = nr_dest_folios * PAGE_SIZE; unsigned int cur_len; @@ -425,7 +426,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zstd compression init level %d failed, root %llu inode %llu offset %llu", workspace->req_level, btrfs_root_id(inode->root), btrfs_ino(inode), start); @@ -443,7 +444,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, workspace->in_buf.size = cur_len; /* Allocate and map in the output buffer */ - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -459,7 +460,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret2))) { - btrfs_warn(inode->root->fs_info, + btrfs_warn(fs_info, "zstd compression level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), @@ -491,7 +492,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -532,7 +533,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret2 = zstd_end_stream(stream, &workspace->out_buf); if (unlikely(zstd_is_error(ret2))) { - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zstd compression end level %d failed, error %d root %llu inode %llu 
offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), @@ -556,7 +557,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; From a6452b85b3e55aafceb84c25784f25f63ba620b0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Sep 2025 16:28:16 +0930 Subject: [PATCH 2596/3206] btrfs: prepare zstd to support bs > ps cases

This involves converting the following functions to use proper folio sizes/shifts:

- zstd_compress_folios()
- zstd_decompress_bio()

The function zstd_decompress() is already using block size correctly without using page size, thus it needs no modification.

And since zstd compression is calling kmap_local_folio(), the existing code cannot handle large folios with HIGHMEM, as kmap_local_folio() requires us to handle one page range each time.

I do not really think it's worth spending time on a feature that will be deprecated eventually. So just add an extra explicit rejection for bs > ps on kernels with HIGHMEM enabled.

Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.c | 18 ++++++++++++++++++ fs/btrfs/zstd.c | 30 ++++++++++++++++-------------- 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 014fb8b12f96b9..29ad1c8591944b 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -79,6 +79,24 @@ bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize) if (blocksize == PAGE_SIZE || blocksize == SZ_4K || blocksize == BTRFS_MIN_BLOCKSIZE) return true; #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * For bs > ps support it's done by specifying a minimal folio order + * for filemap, thus implying large data folios. + * For HIGHMEM systems, we can not always access the content of a (large) + * folio in one go, but go through them page by page. + * + * A lot of features don't implement a proper PAGE sized loop for large + * folios, this includes: + * + * - compression + * - verity + * - encoded write + * + * Considering HIGHMEM is such a pain to deal with and it's going + * to be deprecated eventually, just reject HIGHMEM && bs > ps cases.
+ */ + if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE) + return false; if (blocksize <= PAGE_SIZE) return true; #endif diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 28e2e99a24639b..8f7c8d27c9ca8d 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -414,7 +414,8 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; const u32 blocksize = fs_info->sectorsize; - unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + unsigned long max_out = nr_dest_folios * min_folio_size; unsigned int cur_len; workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); @@ -452,7 +453,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); while (1) { size_t ret2; @@ -486,8 +487,8 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, /* Check if we need more output space */ if (workspace->out_buf.pos == workspace->out_buf.size) { - tot_out += PAGE_SIZE; - max_out -= PAGE_SIZE; + tot_out += min_folio_size; + max_out -= min_folio_size; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; @@ -500,8 +501,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, - PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); } /* We've reached the end of the input */ @@ -551,8 +551,8 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, goto out; } - tot_out += PAGE_SIZE; - max_out -= PAGE_SIZE; + tot_out += min_folio_size; + max_out -= min_folio_size; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; @@ -565,7 +565,7 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); } if (tot_out >= tot_in) { @@ -587,14 +587,16 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); struct folio **folios_in = cb->compressed_folios; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; - const u32 blocksize = cb_to_fs_info(cb)->sectorsize; + const u32 blocksize = fs_info->sectorsize; + const unsigned int min_folio_size = btrfs_min_folio_size(fs_info); unsigned long folio_in_index = 0; - unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; unsigned long total_out = 0; @@ -612,7 +614,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + 
workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; @@ -657,11 +659,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = -EIO; goto done; } - srclen -= PAGE_SIZE; + srclen -= min_folio_size; workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); } } ret = 0; From 4fd188a4fe5877a95e4812617a6c6ae66644427b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Sep 2025 18:21:47 +0930 Subject: [PATCH 2597/3206] btrfs: prepare lzo to support bs > ps cases

This involves converting the following functions to use correct folio sizes/shifts:

- copy_compressed_data_to_page()
- lzo_compress_folios()
- lzo_decompress_bio()

Just like zstd, lzo has some extra incorrect usage of kmap_local_folio() where the offset is always 0. This will not handle HIGHMEM large folios correctly, but those cases are already rejected explicitly so it should not cause problems when bs > ps support is enabled.

Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/lzo.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index c5a25fd872bdfc..b93bdac91b30e7 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -140,12 +140,13 @@ static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, u32 *cur_out) { const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 sector_bytes_left; u32 orig_out; struct folio *cur_folio; char *kaddr; - if ((*cur_out / PAGE_SIZE) >= max_nr_folio) + if ((*cur_out >> min_folio_shift) >= max_nr_folio) return -E2BIG; /* @@ -154,18 +155,17 @@ static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, */ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); - cur_folio = out_folios[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out >> min_folio_shift]; /* Allocate a new page */ if (!cur_folio) { cur_folio = btrfs_alloc_compr_folio(fs_info); if (!cur_folio) return -ENOMEM; - out_folios[*cur_out / PAGE_SIZE] = cur_folio; + out_folios[*cur_out >> min_folio_shift] = cur_folio; } - kaddr = kmap_local_folio(cur_folio, 0); - write_compress_length(kaddr + offset_in_page(*cur_out), - compressed_size); + kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out)); + write_compress_length(kaddr, compressed_size); *cur_out += LZO_LEN; orig_out = *cur_out; @@ -177,20 +177,20 @@ static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, kunmap_local(kaddr); - if ((*cur_out / PAGE_SIZE) >= max_nr_folio) + if ((*cur_out >> min_folio_shift) >= max_nr_folio) return -E2BIG; - cur_folio = out_folios[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out >> min_folio_shift]; /* Allocate a new page */ if (!cur_folio) { cur_folio = btrfs_alloc_compr_folio(fs_info); if (!cur_folio) return -ENOMEM; - out_folios[*cur_out / PAGE_SIZE] = cur_folio; + out_folios[*cur_out >> min_folio_shift] = cur_folio; } kaddr = kmap_local_folio(cur_folio, 0); - memcpy(kaddr + offset_in_page(*cur_out), + memcpy(kaddr + offset_in_folio(cur_folio, *cur_out), compressed_data + *cur_out - orig_out, copy_len); *cur_out += copy_len; @@ -221,6 +221,7 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode
*inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); struct address_space *mapping = inode->vfs_inode.i_mapping; struct folio *folio_in = NULL; char *sizes_ptr; @@ -287,8 +288,8 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, goto out; } - /* Check if we have reached page boundary */ - if (PAGE_ALIGNED(cur_in)) { + /* Check if we have reached folio boundary. */ + if (IS_ALIGNED(cur_in, min_folio_size)) { folio_put(folio_in); folio_in = NULL; } @@ -305,7 +306,7 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, out: if (folio_in) folio_put(folio_in); - *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE); + *out_folios = DIV_ROUND_UP(cur_out, min_folio_size); return ret; } @@ -317,15 +318,16 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, static void copy_compressed_segment(struct compressed_bio *cb, char *dest, u32 len, u32 *cur_in) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct folio *cur_folio; - u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), - orig_in + len - *cur_in); + struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift]; + u32 copy_len = min_t(u32, orig_in + len - *cur_in, + folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); ASSERT(copy_len); - cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE]; memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, offset_in_folio(cur_folio, *cur_in), copy_len); @@ -339,6 +341,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; char *kaddr; int ret; /* Compressed data length, can be unaligned */ @@ -385,10 +388,10 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE]; + cur_folio = cb->compressed_folios[cur_in >> min_folio_shift]; ASSERT(cur_folio); kaddr = kmap_local_folio(cur_folio, 0); - seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); + seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; From e88cb48e6709d73fe9f82f2b56a43ac9ba2a1fe2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Sep 2025 18:36:37 +0930 Subject: [PATCH 2598/3206] btrfs: prepare zlib to support bs > ps cases

This involves converting the following functions to use correct folio sizes/shifts:

- zlib_compress_folios()
- zlib_decompress_bio()

There is special handling for s390 hardware acceleration. With bs > ps, we can go with a 16K block size on s390 (which uses a fixed 4K page size). In that case we do not need to do the buffer copy, as our folio is large enough for hardware acceleration. So factor out the s390-specific check and the folio size check into a helper, need_special_buffer().
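As a rough worked example of when the bounce copy can now be skipped (a sketch only; it assumes ZLIB_DFLTCC_BUF_SIZE keeps its current in-tree definition of 4 * PAGE_SIZE, which this patch does not restate):

	/* s390 has a fixed 4K page size, so ZLIB_DFLTCC_BUF_SIZE would be 16K. */
	const u32 page_size = SZ_4K;
	const u32 block_min_order = 2;	/* bs = 16K on a 4K page system */
	const u32 min_folio_size = page_size << block_min_order;	/* 16K */

	/* min_folio_size >= ZLIB_DFLTCC_BUF_SIZE, so need_special_buffer()
	 * returns false: zlib reads input straight out of the folio and the
	 * copy_data_into_buffer() bounce copy is skipped. */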
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ccf77a0fa96c98..493e5872c6a33d 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -53,6 +53,22 @@ void zlib_free_workspace(struct list_head *ws) kfree(workspace); } +/* + * For s390 hardware acceleration, the buffer size should be at least + * ZLIB_DFLTCC_BUF_SIZE to achieve the best performance. + * + * But if bs > ps we can have large enough folios that meet the s390 hardware + * handling. + */ +static bool need_special_buffer(struct btrfs_fs_info *fs_info) +{ + if (!zlib_deflate_dfltcc_enabled()) + return false; + if (btrfs_min_folio_size(fs_info) >= ZLIB_DFLTCC_BUF_SIZE) + return false; + return true; +} + struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { const u32 blocksize = fs_info->sectorsize; @@ -68,11 +84,7 @@ struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned i workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN); workspace->level = level; workspace->buf = NULL; - /* - * In case of s390 zlib hardware support, allocate lager workspace - * buffer. If allocator fails, fall back to a single page buffer. - */ - if (zlib_deflate_dfltcc_enabled()) { + if (need_special_buffer(fs_info)) { workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE, __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | GFP_NOIO); @@ -139,6 +151,8 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); struct address_space *mapping = inode->vfs_inode.i_mapping; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret; char *data_in = NULL; char *cfolio_out; @@ -147,7 +161,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, struct folio *out_folio = NULL; unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; - const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const unsigned long max_out = nr_dest_folios << min_folio_shift; const u32 blocksize = fs_info->sectorsize; const u64 orig_end = start + len; @@ -179,7 +193,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; workspace->strm.next_out = cfolio_out; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; while (workspace->strm.total_in < len) { /* @@ -191,10 +205,11 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, unsigned int copy_length = min(bytes_left, workspace->buf_size); /* - * This can only happen when hardware zlib compression is - * enabled. + * For s390 hardware accelerated zlib, and our folio is smaller + * than the copy_length, we need to fill the buffer so that + * we can take full advantage of hardware acceleration. 
*/ - if (copy_length > PAGE_SIZE) { + if (need_special_buffer(fs_info)) { ret = copy_data_into_buffer(mapping, workspace, start, copy_length); if (ret < 0) @@ -258,7 +273,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, cfolio_out = folio_address(out_folio); folios[nr_folios] = out_folio; nr_folios++; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; workspace->strm.next_out = cfolio_out; } /* we're all done */ @@ -294,7 +309,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, cfolio_out = folio_address(out_folio); folios[nr_folios] = out_folio; nr_folios++; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; workspace->strm.next_out = cfolio_out; } } @@ -320,20 +335,22 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); + const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret = 0, ret2; int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; struct folio **folios_in = cb->compressed_folios; data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; - workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); + workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size); workspace->strm.total_in = 0; workspace->strm.total_out = 0; @@ -394,7 +411,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; - workspace->strm.avail_in = min(tmp, PAGE_SIZE); + workspace->strm.avail_in = min(tmp, min_folio_size); } } if (unlikely(ret != Z_STREAM_END)) { From 5fbaae4b8567b19cd65b847043588e10af86eb60 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 12 Sep 2025 13:12:54 +0930 Subject: [PATCH 2599/3206] btrfs: prepare scrub to support bs > ps cases

This involves:

- Migrate scrub_stripe::pages[] to folios[]
- Use btrfs_alloc_folio_array() and folio_put() to alloc above array.
- Migrate scrub_stripe_get_kaddr() and scrub_stripe_get_paddr() to use folio interfaces
- Migrate raid56_parity_cache_data_pages() to raid56_parity_cache_data_folios(), since scrub is the only caller still using pages. This helper will copy the folio array contents into rbio::stripe_pages, with sector uptodate flags updated. And add a new ASSERT() to make sure bs > ps cases will not hit this path.

Since most scrub code is based on kaddr/paddr, the migration itself is pretty straightforward.

And since we're here, also move the loop that sets stripe_sectors[].uptodate out of the copy loop. As we always mark all the sectors as uptodate for the data stripe, it's easier to do in one go, rather than doing it inside the copy loop.
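All of these conversions lean on the same indexing convention; a minimal sketch mirroring the scrub_stripe_get_kaddr() change in the diff below (an illustration, not a helper added by this patch):

	/* Map a byte offset within the stripe to a kernel address. */
	static void *stripe_offset_to_kaddr(struct scrub_stripe *stripe, u32 offset)
	{
		struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
		/* Each folio covers 1 << (PAGE_SHIFT + block_min_order) bytes. */
		const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
		struct folio *folio = stripe->folios[offset >> min_folio_shift];

		/* Blocks never cross folio boundaries and scrub folios are
		 * never highmem, so no kmap is needed. */
		return folio_address(folio) + offset_in_folio(folio, offset);
	}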
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 44 ++++++++++++++++++++++++++-------------- fs/btrfs/raid56.h | 4 ++-- fs/btrfs/scrub.c | 51 +++++++++++++++++++++++++++-------------------- 3 files changed, 60 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2b4f577dcf39de..c90612f6cc1b0a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2844,19 +2844,22 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) * This is for scrub call sites where we already have correct data contents. * This allows us to avoid reading data stripes again. * - * Unfortunately here we have to do page copy, other than reusing the pages. + * Unfortunately here we have to do folio copy, other than reusing the pages. * This is due to the fact rbio has its own page management for its cache. */ -void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, - struct page **data_pages, u64 data_logical) +void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, + struct folio **data_folios, u64 data_logical) { + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; const u64 offset_in_full_stripe = data_logical - rbio->bioc->full_stripe_logical; - const int page_index = offset_in_full_stripe >> PAGE_SHIFT; - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; + unsigned int findex = 0; + unsigned int foffset = 0; int ret; + /* We shouldn't hit RAID56 for bs > ps cases for now. */ + ASSERT(fs_info->sectorsize <= PAGE_SIZE); + /* * If we hit ENOMEM temporarily, but later at * raid56_parity_submit_scrub_rbio() time it succeeded, we just do @@ -2873,14 +2876,25 @@ void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); - for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { - struct page *dst = rbio->stripe_pages[page_nr + page_index]; - struct page *src = data_pages[page_nr]; - - memcpy_page(dst, 0, src, 0, PAGE_SIZE); - for (int sector_nr = sectors_per_page * page_index; - sector_nr < sectors_per_page * (page_index + 1); - sector_nr++) - rbio->stripe_sectors[sector_nr].uptodate = true; + for (unsigned int cur_off = offset_in_full_stripe; + cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN; + cur_off += PAGE_SIZE) { + const unsigned int pindex = cur_off >> PAGE_SHIFT; + void *kaddr; + + kaddr = kmap_local_page(rbio->stripe_pages[pindex]); + memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE); + kunmap_local(kaddr); + + foffset += PAGE_SIZE; + ASSERT(foffset <= folio_size(data_folios[findex])); + if (foffset == folio_size(data_folios[findex])) { + findex++; + foffset = 0; + } } + for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits; + sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits; + sector_nr++) + rbio->stripe_sectors[sector_nr].uptodate = true; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0d7b4c2fb6ae80..84c4d1d29c7a88 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -201,8 +201,8 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); -void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, - struct page **data_pages, u64 data_logical); +void 
raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, + struct folio **data_folios, u64 data_logical); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cef260ed854c15..ce28dcb60f7870 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -130,7 +130,7 @@ enum { scrub_bitmap_nr_last, }; -#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) +#define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE) /* * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. @@ -139,7 +139,7 @@ struct scrub_stripe { struct scrub_ctx *sctx; struct btrfs_block_group *bg; - struct page *pages[SCRUB_STRIPE_PAGES]; + struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS]; struct scrub_sector_verification *sectors; struct btrfs_device *dev; @@ -339,10 +339,10 @@ static void release_scrub_stripe(struct scrub_stripe *stripe) if (!stripe) return; - for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) { - if (stripe->pages[i]) - __free_page(stripe->pages[i]); - stripe->pages[i] = NULL; + for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) { + if (stripe->folios[i]) + folio_put(stripe->folios[i]); + stripe->folios[i] = NULL; } kfree(stripe->sectors); kfree(stripe->csums); @@ -355,6 +355,7 @@ static void release_scrub_stripe(struct scrub_stripe *stripe) static int init_scrub_stripe(struct btrfs_fs_info *fs_info, struct scrub_stripe *stripe) { + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; int ret; memset(stripe, 0, sizeof(*stripe)); @@ -367,7 +368,9 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info, atomic_set(&stripe->pending_io, 0); spin_lock_init(&stripe->write_error_lock); - ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false); + ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS); + ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift, + fs_info->block_min_order, stripe->folios); if (ret < 0) goto error; @@ -687,27 +690,30 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) { - u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits); - const struct page *page = stripe->pages[offset >> PAGE_SHIFT]; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + u32 offset = (sector_nr << fs_info->sectorsize_bits); + const struct folio *folio = stripe->folios[offset >> min_folio_shift]; - /* stripe->pages[] is allocated by us and no highmem is allowed. */ - ASSERT(page); - ASSERT(!PageHighMem(page)); - return page_address(page) + offset_in_page(offset); + /* stripe->folios[] is allocated by us and no highmem is allowed. */ + ASSERT(folio); + ASSERT(!folio_test_partial_kmap(folio)); + return folio_address(folio) + offset_in_folio(folio, offset); } static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 offset = (sector_nr << fs_info->sectorsize_bits); - const struct page *page = stripe->pages[offset >> PAGE_SHIFT]; + const struct folio *folio = stripe->folios[offset >> min_folio_shift]; - /* stripe->pages[] is allocated by us and no highmem is allowed. */ - ASSERT(page); - ASSERT(!PageHighMem(page)); - /* And the range must be contained inside the page. 
*/ - ASSERT(offset_in_page(offset) + fs_info->sectorsize <= PAGE_SIZE); - return page_to_phys(page) + offset_in_page(offset); + /* stripe->folios[] is allocated by us and no highmem is allowed. */ + ASSERT(folio); + ASSERT(!folio_test_partial_kmap(folio)); + /* And the range must be contained inside the folio. */ + ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); + return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); } static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) @@ -1872,6 +1878,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; int mirror = stripe->mirror_num; @@ -1884,7 +1891,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, return; } - bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, + bbio = btrfs_bio_alloc(BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, fs_info, scrub_read_endio, stripe); bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; @@ -2215,7 +2222,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; - raid56_parity_cache_data_pages(rbio, stripe->pages, + raid56_parity_cache_data_folios(rbio, stripe->folios, full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); } raid56_parity_submit_scrub_rbio(rbio); From 67378b754608a3524d125bfa5744508a49fe48be Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 15 Sep 2025 14:29:42 +0930 Subject: [PATCH 2600/3206] btrfs: fix symbolic link reading when bs > ps [BUG DURING BS > PS TEST] When running the following script on a btrfs whose block size is larger than page size, e.g. 8K block size and 4K page size, it will trigger a kernel BUG: # mkfs.btrfs -s 8k $dev # mount $dev $mnt # mkdir $mnt/dir # ln -s dir $mnt/link # ls $mnt/link The call trace looks like this: BTRFS warning (device dm-2): support for block size 8192 with page size 4096 is experimental, some features may be missing BTRFS info (device dm-2): checking UUID tree BTRFS info (device dm-2): enabling ssd optimizations BTRFS info (device dm-2): enabling free space tree ------------[ cut here ]------------ kernel BUG at /home/adam/linux/include/linux/highmem.h:275! Oops: invalid opcode: 0000 [#1] SMP CPU: 8 UID: 0 PID: 667 Comm: ls Tainted: G OE 6.17.0-rc4-custom+ #283 PREEMPT(full) Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 02/02/2022 RIP: 0010:zero_user_segments.constprop.0+0xdc/0xe0 [btrfs] Call Trace: btrfs_get_extent.cold+0x85/0x101 [btrfs 7453c70c03e631c8d8bfdd4264fa62d3e238da6f] btrfs_do_readpage+0x244/0x750 [btrfs 7453c70c03e631c8d8bfdd4264fa62d3e238da6f] btrfs_read_folio+0x9c/0x100 [btrfs 7453c70c03e631c8d8bfdd4264fa62d3e238da6f] filemap_read_folio+0x37/0xe0 do_read_cache_folio+0x94/0x3e0 __page_get_link.isra.0+0x20/0x90 page_get_link+0x16/0x40 step_into+0x69b/0x830 path_lookupat+0xa7/0x170 filename_lookup+0xf7/0x200 ? set_ptes.isra.0+0x36/0x70 vfs_statx+0x7a/0x160 do_statx+0x63/0xa0 __x64_sys_statx+0x90/0xe0 do_syscall_64+0x82/0xae0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Please note bs > ps support is still under development and the enablement patch is not even in btrfs development branch. 
[CAUSE] Btrfs reuses its data folio read path to handle symbolic links, as the symbolic link target is stored as an inline data extent. But for newly created inodes, btrfs only sets the minimal order if the target inode is a regular file. Thus the above newly created symbolic link doesn't properly respect the minimal folio order, which triggered the above crash.

[FIX] Call btrfs_set_inode_mapping_order() unconditionally inside btrfs_create_new_inode(). For symbolic links this will fix the crash, as now the folio will meet the minimal order. For regular files this brings no change. Directory/bdev/char and all the other types of inodes won't go through the data read path, so there is no effect either.

Fixes: cc38d178ff33 ("btrfs: enable large data folio support under CONFIG_BTRFS_EXPERIMENTAL") Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9ba0be5d6db17d..36a030a2a0a4fd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6505,6 +6505,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (!args->subvol) btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); + btrfs_set_inode_mapping_order(BTRFS_I(inode)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; @@ -6512,7 +6513,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; btrfs_update_inode_mapping_flags(BTRFS_I(inode)); - btrfs_set_inode_mapping_order(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); From e9bed72e883e7c6e6a2057ea29fcd4ba69225f91 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 15 Sep 2025 18:33:50 +0930 Subject: [PATCH 2601/3206] btrfs: add extra ASSERT()s to catch unaligned bios

Btrfs uses btrfs_bio to handle read/write of logical addresses. For the incoming bs > ps support, btrfs has extra requirements:

- One folio must contain at least one fs block
- No fs block can cross folio boundaries

This requirement is not hard to maintain, thanks to the address space's minimal folio order. But not all btrfs bios are generated through address space, e.g. compression and scrub.

To catch possible unaligned bios, introduce a helper, assert_bbio_alignment(), for each btrfs_bio in btrfs_submit_bbio(). This will check the following things:

- bv_offset is aligned to block size
- bv_len is aligned to block size

A btrfs bio passing the above checks, unless it is empty, will meet the requirements for bs > ps support.

Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+)

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 909b208f9ef3c8..db2deaa4aad4e2 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -779,11 +779,38 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) return true; } +static void assert_bbio_alignment(struct btrfs_bio *bbio) +{ +#ifdef CONFIG_BTRFS_ASSERT + struct btrfs_fs_info *fs_info = bbio->fs_info; + struct bio_vec bvec; + struct bvec_iter iter; + const u32 blocksize = fs_info->sectorsize; + + /* Metadata has no extra bs > ps alignment requirement.
*/ + if (!is_data_bbio(bbio)) + return; + + bio_for_each_bvec(bvec, &bbio->bio, iter) + ASSERT(IS_ALIGNED(bvec.bv_offset, blocksize) && + IS_ALIGNED(bvec.bv_len, blocksize), + "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u", + btrfs_root_id(bbio->inode->root), + btrfs_ino(bbio->inode), + bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT, + bbio->bio.bi_iter.bi_size, iter.bi_idx, + bvec.bv_offset, + bvec.bv_len); +#endif +} + void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) { /* If bbio->inode is not populated, its file_offset must be 0. */ ASSERT(bbio->inode || bbio->file_offset == 0); + assert_bbio_alignment(bbio); + while (!btrfs_submit_chunk(bbio, mirror_num)) ; } From 98077f7f2180fa996710452564ebe71adc66af59 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 9 Sep 2025 12:38:47 +0930 Subject: [PATCH 2602/3206] btrfs: enable experimental bs > ps support

With all the preparation patches, we're able to finally enable btrfs block size (sector size) larger than page size support and give it a full fstests run.

Obviously this new feature is hidden behind the experimental flag, and should not be considered a core feature yet, as btrfs' default block size is still 4K. But this is still a feature that will shine in a future where 16K block size devices are widely adopted.

For now there are some features explicitly disabled:

- Direct IO This is the most complex part to support; the root reason is that we cannot control the pages of the iov_iter passed in. User space programs can only ensure the virtual addresses are contiguous, but have no control over their physical addresses. Our bs > ps support heavily relies on large folios, and direct IO memory can easily break it. So direct IO is disabled and will always fall back to buffered IO.

- RAID56 In theory we can convert RAID56 to use large folios, but it will need to be converted back to page based if we want to support direct IO in the future. So just reject it for now.

- Encoded send
- Encoded read Both are utilizing btrfs_encoded_read_regular_fill_pages(), and send is utilizing vmallocated memory. Unfortunately for vmallocated memory we cannot guarantee the minimal folio order. For send, it will just always fall back to regular writes, which read from the page cache and follow the existing folio order requirement.

- Encoded write Encoded write allocates its own pages, and we could easily change it to follow the minimal order. But since encoded read is already disabled, there is no need to only enable encoded write.

Finally, just like what we did for bs < ps support in the past, add a warning message for bs > ps mounts.

Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/direct-io.c | 12 ++++++++++++ fs/btrfs/disk-io.c | 14 ++++++++++++-- fs/btrfs/fs.c | 3 +-- fs/btrfs/ioctl.c | 35 +++++++++++++++++++++++++---------- fs/btrfs/send.c | 9 ++++++++- 5 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index fe9a4bd7e6e683..802d4dbe5b3817 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -786,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, if (iov_iter_alignment(iter) & blocksize_mask) return -EINVAL; + /* + * For bs > ps support, we heavily rely on large folios to make sure no + * block will cross large folio boundaries.
+ * + * But memory provided by direct IO is only virtually contiguous, not + * physically contiguous, and will break the btrfs' large folio requirement. + * + * So for bs > ps support, all direct IOs should fallback to buffered ones. + */ + if (fs_info->sectorsize > PAGE_SIZE) + return -EINVAL; + return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5c57f523f4498d..0aee3239518d76 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3242,18 +3242,24 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) } /* - * Subpage runtime limitation on v1 cache. + * Subpage/bs > ps runtime limitation on v1 cache. * * V1 space cache still has some hard coded PAGE_SIZE usage, while * we're already defaulting to v2 cache, no need to bother v1 as it's * going to be deprecated anyway. */ - if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { btrfs_warn(fs_info, "v1 space cache is not supported for page size %lu with sectorsize %u", PAGE_SIZE, fs_info->sectorsize); return -EINVAL; } + if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) { + btrfs_err(fs_info, + "RAID56 is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + return -EINVAL; + } /* This can be called by remount, we need to protect the super block. */ spin_lock(&fs_info->super_lock); @@ -3388,6 +3394,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; + if (fs_info->sectorsize > PAGE_SIZE) + btrfs_warn(fs_info, + "support for block size %u with page size %zu is experimental, some features may be missing", + fs_info->sectorsize, PAGE_SIZE); /* * Handle the space caching options appropriately now that we have the * super block loaded and validated. 
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 29ad1c8591944b..feb0a2faa8379b 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -97,8 +97,7 @@ bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize) */ if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE) return false; - if (blocksize <= PAGE_SIZE) - return true; + return true; #endif return false; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 063291519b363b..0e9e2b99939232 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4418,6 +4418,10 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; @@ -4509,6 +4513,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) { + struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); struct btrfs_ioctl_encoded_io_args args; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -4522,6 +4527,11 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } + if (!(file->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out_acct; @@ -4780,14 +4790,14 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { + struct file *file = cmd->file; + struct btrfs_inode *inode = BTRFS_I(file->f_inode); + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; int ret; u64 disk_bytenr, disk_io_size; - struct file *file; - struct btrfs_inode *inode; - struct btrfs_fs_info *fs_info; - struct extent_io_tree *io_tree; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; @@ -4803,10 +4813,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = -EPERM; goto out_acct; } - file = cmd->file; - inode = BTRFS_I(file->f_inode); - fs_info = inode->root->fs_info; - io_tree = &inode->io_tree; + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (issue_flags & IO_URING_F_COMPAT) { @@ -4933,9 +4944,10 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) { + struct file *file = cmd->file; + struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); loff_t pos; struct kiocb kiocb; - struct file *file; ssize_t ret; void __user *sqe_addr; struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); @@ -4948,8 +4960,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu ret = -EPERM; goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } - file = cmd->file; sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (!(file->f_mode & FMODE_WRITE)) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 32653fc44a7583..5e073502b9e8e9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5654,7 +5654,14 @@ static int 
send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + /* + * Do not go through encoded read for bs > ps cases. + * + * Encoded send uses vmalloc'ed pages as its buffer, so we cannot + * ensure every folio is large enough to contain a block. + */ + if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE && + (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { bool is_inline = (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE); From f08d7147da5fb5ab4c26199dcbebb614283c99bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= Date: Fri, 19 Sep 2025 16:58:15 +0200 Subject: [PATCH 2603/3206] btrfs: use kmalloc_array() for open-coded arithmetic in kmalloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As pointed out in the documentation, calling 'kmalloc' with open-coded arithmetic can lead to unfortunate overflows, and this particular way of using it has been deprecated. Instead, it's preferred to use 'kmalloc_array' wherever it applies, so that an overflow check is performed. Note this is an API cleanup and is not fixing any overflows, because in all cases the multipliers are bounded small numbers derived from the number of items in leaves/nodes.
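As an illustration, a minimal sketch of the two forms (n, buf and struct item are hypothetical stand-ins, not names from the btrfs code):

	/* Open-coded arithmetic: a large enough n makes
	 * n * sizeof(struct item) wrap around, and kmalloc() returns a
	 * buffer that is too small for the intended use. */
	buf = kmalloc(n * sizeof(struct item), GFP_NOFS);

	/* kmalloc_array() performs the same multiplication with an
	 * overflow check and returns NULL instead of a short allocation. */
	buf = kmalloc_array(n, sizeof(struct item), GFP_NOFS);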
Signed-off-by: Miquel Sabaté Solà Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/tree-log.c | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6adfe62cd0c4dd..81577a0c601f58 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -738,8 +738,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, u32 *ins_sizes; int i = 0; - ins_data = kmalloc(batch.nr * sizeof(u32) + - batch.nr * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(batch.nr, + sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) { ret = -ENOMEM; goto out; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ac7805d40ab24a..78d59b63748bac 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4054,8 +4054,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, struct btrfs_key *ins_keys; u32 *ins_sizes; - ins_data = kmalloc(count * sizeof(u32) + - count * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) return -ENOMEM; @@ -4818,8 +4817,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src = src_path->nodes[0]; - ins_data = kmalloc(nr * sizeof(struct btrfs_key) + - nr * sizeof(u32), GFP_NOFS); + ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS); if (!ins_data) return -ENOMEM; @@ -6524,8 +6522,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, if (!first) return 0; - ins_data = kmalloc(max_batch_size * sizeof(u32) + - max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) return -ENOMEM; ins_sizes = (u32 *)ins_data; From c9ff83963a49a413b8ace36de21f61b47f461b98 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 18 Sep 2025 12:51:19 +0200 Subject: [PATCH 2604/3206] btrfs: zoned: don't fail mount needlessly due to too many active zones Previously BTRFS did not look at a device's reported max_open_zones limit, but starting with commit 04147d8394e8 ("btrfs: zoned: limit active zones to max_open_zones"), zoned BTRFS limits the number of concurrently used block groups to the max_open_zones a device reports, if the device does not also report max_active_zones; that is, open zones are now treated the same way as active zones. But this leads to mount failures on filesystems that were last used before 04147d8394e8, because too many zones may already be in an open state. Ignore the new limit on such filesystems, so that the zones can be finished or evacuated. Reported-by: Yuwei Han Link: https://lore.kernel.org/all/2F48A90AF7DDF380+1790bcfd-cb6f-456b-870d-7982f21b5eae@bupt.moe/ Fixes: 04147d8394e8 ("btrfs: zoned: limit active zones to max_open_zones") Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index ba444e412613f2..e21b57a68b3c3b 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -514,6 +514,11 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (max_active_zones) { if (nactive > max_active_zones) { + if (bdev_max_active_zones(bdev) == 0) { + max_active_zones = 0; + zone_info->max_active_zones = 0; + goto validate; + } btrfs_err(device->fs_info, "zoned: %u active zones on %s exceeds max_active_zones %u", nactive, rcu_dereference(device->name), @@ -526,6 +531,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); } +validate: /* Validate superblock log */ nr_zones = BTRFS_NR_SB_LOG_ZONES; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { From 4ca6f24a52c4e94bd09f70cf132d0a38db7996b0 Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Mon, 22 Sep 2025 19:30:07 +0800 Subject: [PATCH 2605/3206] btrfs: more trivial BTRFS_PATH_AUTO_FREE conversions Convert the trivial pattern to auto freeing, with goto -> return conversions where possible. The following cases are considered trivial in this patch: 1. Cases where there are no operations between btrfs_free_path() and the function return. 2. Cases where only simple cleanup operations (such as kfree(), kvfree(), clear_bit(), and fs_path_free()) are present between btrfs_free_path() and the function return. The shape of the conversion is sketched below.
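A minimal sketch of the conversion (root and key stand in for whatever the call site uses; this is not a specific function from this patch):

	/* Before: every exit, including errors, must reach the label. */
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;

	/* After: the scope-based cleanup frees the path automatically
	 * when it goes out of scope, so error paths return directly. */
	BTRFS_PATH_AUTO_FREE(path);
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	return 0;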
Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 15 +- fs/btrfs/ref-verify.c | 3 +- fs/btrfs/reflink.c | 3 +- fs/btrfs/relocation.c | 47 ++---- fs/btrfs/root-tree.c | 54 +++---- fs/btrfs/scrub.c | 11 +- fs/btrfs/send.c | 314 +++++++++++++----------------------- fs/btrfs/super.c | 5 +- fs/btrfs/transaction.c | 3 +- fs/btrfs/tree-log.c | 91 ++++------- 10 files changed, 195 insertions(+), 351 deletions(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index cab0b291088c67..a69eaf0cef47d9 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -67,7 +67,7 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *stripe_root = fs_info->stripe_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf; u64 found_start; @@ -260,7 +260,6 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le btrfs_release_path(path); } - btrfs_free_path(path); return ret; } @@ -269,7 +268,7 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, struct btrfs_stripe_extent *stripe_extent, const size_t item_size) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; int ret; int slot; @@ -288,7 +287,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), item_size); - btrfs_free_path(path); return ret; } @@ -376,7 +374,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, struct btrfs_stripe_extent *stripe_extent; struct btrfs_key stripe_key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; const u64 end = logical + *length; int num_stripes; @@ -402,7 +400,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0); if (ret < 0) - goto free_path; + return ret; if (ret) { if (path->slots[0] != 0) path->slots[0]--; @@ -459,8 +457,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, trace_btrfs_get_raid_extent_offset(fs_info, logical, *length, stripe->physical, devid); - ret = 0; - goto free_path; + return 0; } /* If we're here, we haven't found the requested devid in the stripe. 
*/ @@ -474,8 +471,6 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, logical, logical + *length, stripe->dev->devid, btrfs_bg_type_to_raid_name(map_type)); } -free_path: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 9f1858b42c0e21..de4cb0f3fbd046 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -971,7 +971,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *extent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int tree_block_level = 0; u64 bytenr = 0, num_bytes = 0; @@ -1021,6 +1021,5 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 0a85e1da97777a..f78ac494197f0e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -340,7 +340,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 destoff, bool no_time_update) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_trans_handle *trans; char *buf = NULL; @@ -611,7 +611,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, } out: - btrfs_free_path(path); kvfree(buf); clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8bd97b24f375fd..7a90a4ef8375d9 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -821,7 +821,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, u64 bytenr, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; int ret; @@ -834,11 +834,9 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + return ret; + if (ret > 0) + return -ENOENT; leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], @@ -849,16 +847,11 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)); - if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { - ret = -EINVAL; - goto out; - } + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) + return -EINVAL; *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -3158,7 +3151,7 @@ static int __add_tree_block(struct reloc_control *rc, struct rb_root *blocks) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret; bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); @@ -3186,7 +3179,7 @@ static int __add_tree_block(struct reloc_control *rc, path->skip_locking = 1; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && skinny) { if (path->slots[0]) { @@ -3213,14 +3206,10 @@ static int __add_tree_block(struct reloc_control *rc, "tree 
block extent item (%llu) is not found in extent tree", bytenr); WARN_ON(1); - ret = -EINVAL; - goto out; + return -EINVAL; } - ret = add_tree_block(rc, &key, path, blocks); -out: - btrfs_free_path(path); - return ret; + return add_tree_block(rc, &key, path, blocks); } static int delete_block_group_cache(struct btrfs_block_group *block_group, @@ -3510,7 +3499,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct rb_root blocks = RB_ROOT; struct btrfs_key key; struct btrfs_trans_handle *trans = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; u64 flags; int ret; @@ -3679,14 +3668,13 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) if (ret < 0 && !err) err = ret; btrfs_free_block_rsv(fs_info, rc->block_rsv); - btrfs_free_path(path); return err; } static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_item *item; struct extent_buffer *leaf; int ret; @@ -3697,7 +3685,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_inode(trans, root, path, objectid); if (ret) - goto out; + return ret; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3707,15 +3695,13 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); -out: - btrfs_free_path(path); - return ret; + return 0; } static void delete_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -3738,7 +3724,6 @@ static void delete_orphan_inode(struct btrfs_trans_handle *trans, out: if (ret) btrfs_abort_transaction(trans, ret); - btrfs_free_path(path); } /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index e22e6b06927ab3..d1f5f6c42ef30d 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -130,7 +130,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *item) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *l; int ret; int slot; @@ -143,7 +143,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret < 0) - goto out; + return ret; if (ret > 0) { btrfs_crit(fs_info, @@ -151,7 +151,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root key->objectid, key->type, key->offset, btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } l = path->nodes[0]; @@ -170,20 +170,20 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root -1, 1); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ret = btrfs_del_item(trans, root, path); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } l = path->nodes[0]; slot = path->slots[0]; @@ -197,8 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root 
btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); -out: - btrfs_free_path(path); return ret; } @@ -216,7 +214,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root; int err = 0; @@ -309,7 +307,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) btrfs_put_root(root); } - btrfs_free_path(path); return err; } @@ -318,7 +315,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key) { struct btrfs_root *root = trans->fs_info->tree_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; path = btrfs_alloc_path(); @@ -326,17 +323,12 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) - goto out; - if (ret != 0) { + return ret; + if (ret > 0) /* The root must exist but we did not find it by the key. */ - ret = -EUCLEAN; - goto out; - } + return -EUCLEAN; - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, @@ -344,7 +336,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root_ref *ref; struct extent_buffer *leaf; struct btrfs_key key; @@ -361,7 +353,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) { - goto out; + return ret; } else if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], @@ -369,18 +361,16 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, ptr = (unsigned long)(ref + 1); if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || (btrfs_root_ref_name_len(leaf, ref) != name->len) || - memcmp_extent_buffer(leaf, name->name, ptr, name->len)) { - ret = -ENOENT; - goto out; - } + memcmp_extent_buffer(leaf, name->name, ptr, name->len)) + return -ENOENT; + *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); if (ret) - goto out; + return ret; } else { - ret = -ENOENT; - goto out; + return -ENOENT; } if (key.type == BTRFS_ROOT_BACKREF_KEY) { @@ -391,8 +381,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, goto again; } -out: - btrfs_free_path(path); return ret; } @@ -418,7 +406,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root_ref *ref; struct extent_buffer *leaf; unsigned long ptr; @@ -435,7 +423,6 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, sizeof(*ref) + name->len); if (ret) { btrfs_abort_transaction(trans, ret); - btrfs_free_path(path); return ret; } @@ -455,7 +442,6 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, goto again; } - btrfs_free_path(path); return 0; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ce28dcb60f7870..e19e6bc5f07492 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -588,7 
+588,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * bool is_super, u64 logical, u64 physical) { struct btrfs_fs_info *fs_info = dev->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key found_key; struct extent_buffer *eb; struct btrfs_extent_item *ei; @@ -615,7 +615,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, &flags); if (ret < 0) - goto out; + return; swarn.extent_item_size = found_key.offset; @@ -661,9 +661,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); } - -out: - btrfs_free_path(path); } static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) @@ -2606,7 +2603,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root = fs_info->dev_root; u64 chunk_offset; @@ -2892,8 +2889,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_release_path(path); } - btrfs_free_path(path); - return ret; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5e073502b9e8e9..d493d32e8367ac 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -909,7 +909,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, struct btrfs_inode_info *info) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_item *ii; struct btrfs_key key; @@ -924,11 +924,11 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, if (ret) { if (ret > 0) ret = -ENOENT; - goto out; + return ret; } if (!info) - goto out; + return 0; ii = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -945,9 +945,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, */ info->fileattr = btrfs_inode_flags(path->nodes[0], ii); -out: - btrfs_free_path(path); - return ret; + return 0; } static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) @@ -979,7 +977,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb = path->nodes[0]; struct btrfs_inode_ref *iref; struct btrfs_inode_extref *extref; - struct btrfs_path *tmp_path; + BTRFS_PATH_AUTO_FREE(tmp_path); struct fs_path *p; u32 cur = 0; u32 total; @@ -1076,7 +1074,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } out: - btrfs_free_path(tmp_path); fs_path_free(p); return ret; } @@ -1224,7 +1221,7 @@ static int get_inode_path(struct btrfs_root *root, { int ret; struct btrfs_key key, found_key; - struct btrfs_path *p; + BTRFS_PATH_AUTO_FREE(p); p = alloc_path_for_send(); if (!p) @@ -1238,27 +1235,20 @@ static int get_inode_path(struct btrfs_root *root, ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); if (ret < 0) - goto out; - if (ret) { - ret = 1; - goto out; - } + return ret; + if (ret) + return 1; + btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); if (found_key.objectid != ino || (found_key.type != BTRFS_INODE_REF_KEY && - found_key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = -ENOENT; - goto out; - } + found_key.type != BTRFS_INODE_EXTREF_KEY)) + return -ENOENT; ret = iterate_inode_ref(root, p, &found_key, true, __copy_first_ref, path); if (ret < 0) - goto out; - ret = 0; - -out: - 
btrfs_free_path(p); - return ret; + return ret; + return 0; } struct backref_ctx { @@ -1715,7 +1705,7 @@ static int read_symlink(struct btrfs_root *root, struct fs_path *dest) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_file_extent_item *ei; u8 type; @@ -1732,7 +1722,7 @@ static int read_symlink(struct btrfs_root *root, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret) { /* * An empty symlink inode. Can happen in rare error paths when @@ -1745,8 +1735,7 @@ static int read_symlink(struct btrfs_root *root, btrfs_err(root->fs_info, "Found empty symlink inode %llu at root %llu", ino, btrfs_root_id(root)); - ret = -EIO; - goto out; + return -EIO; } ei = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -1757,7 +1746,7 @@ static int read_symlink(struct btrfs_root *root, btrfs_crit(root->fs_info, "send: found symlink extent that is not inline, ino %llu root %llu extent type %d", ino, btrfs_root_id(root), type); - goto out; + return ret; } compression = btrfs_file_extent_compression(path->nodes[0], ei); if (unlikely(compression != BTRFS_COMPRESS_NONE)) { @@ -1765,17 +1754,13 @@ static int read_symlink(struct btrfs_root *root, btrfs_crit(root->fs_info, "send: found symlink extent with compression, ino %llu root %llu compression type %d", ino, btrfs_root_id(root), compression); - goto out; + return ret; } off = btrfs_file_extent_inline_start(ei); len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); - -out: - btrfs_free_path(path); - return ret; + return fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); } /* @@ -1786,8 +1771,7 @@ static int gen_unique_name(struct send_ctx *sctx, u64 ino, u64 gen, struct fs_path *dest) { - int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; char tmp[64]; int len; @@ -1810,10 +1794,9 @@ static int gen_unique_name(struct send_ctx *sctx, path, BTRFS_FIRST_FREE_OBJECTID, &tmp_name, 0); btrfs_release_path(path); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } + if (IS_ERR(di)) + return PTR_ERR(di); + if (di) { /* not unique, try again */ idx++; @@ -1822,7 +1805,6 @@ static int gen_unique_name(struct send_ctx *sctx, if (!sctx->parent_root) { /* unique */ - ret = 0; break; } @@ -1830,10 +1812,9 @@ static int gen_unique_name(struct send_ctx *sctx, path, BTRFS_FIRST_FREE_OBJECTID, &tmp_name, 0); btrfs_release_path(path); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } + if (IS_ERR(di)) + return PTR_ERR(di); + if (di) { /* not unique, try again */ idx++; @@ -1843,11 +1824,7 @@ static int gen_unique_name(struct send_ctx *sctx, break; } - ret = fs_path_add(dest, tmp, len); - -out: - btrfs_free_path(path); - return ret; + return fs_path_add(dest, tmp, len); } enum inode_state { @@ -1959,7 +1936,7 @@ static int lookup_dir_item_inode(struct btrfs_root *root, int ret = 0; struct btrfs_dir_item *di; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); path = alloc_path_for_send(); @@ -1967,19 +1944,15 @@ static int lookup_dir_item_inode(struct btrfs_root *root, return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); - if (IS_ERR_OR_NULL(di)) { - ret = di ? PTR_ERR(di) : -ENOENT; - goto out; - } + if (IS_ERR_OR_NULL(di)) + return di ? 
PTR_ERR(di) : -ENOENT; + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.type == BTRFS_ROOT_ITEM_KEY) { - ret = -ENOENT; - goto out; - } + if (key.type == BTRFS_ROOT_ITEM_KEY) + return -ENOENT; + *found_inode = key.objectid; -out: - btrfs_free_path(path); return ret; } @@ -1993,7 +1966,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, int ret; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int len; u64 parent_dir; @@ -2007,16 +1980,14 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); if (ret < 0) - goto out; + return ret; if (!ret) btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); if (ret || found_key.objectid != ino || (found_key.type != BTRFS_INODE_REF_KEY && - found_key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = -ENOENT; - goto out; - } + found_key.type != BTRFS_INODE_EXTREF_KEY)) + return -ENOENT; if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -2037,19 +2008,17 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); } if (ret < 0) - goto out; + return ret; btrfs_release_path(path); if (dir_gen) { ret = get_inode_gen(root, parent_dir, dir_gen); if (ret < 0) - goto out; + return ret; } *dir = parent_dir; -out: - btrfs_free_path(path); return ret; } @@ -2485,7 +2454,7 @@ static int send_subvol_begin(struct send_ctx *sctx) int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_root *parent_root = sctx->parent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -2497,10 +2466,8 @@ static int send_subvol_begin(struct send_ctx *sctx) return -ENOMEM; name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); - if (!name) { - btrfs_free_path(path); + if (!name) return -ENOMEM; - } key.objectid = btrfs_root_id(send_root); key.type = BTRFS_ROOT_BACKREF_KEY; @@ -2563,7 +2530,6 @@ static int send_subvol_begin(struct send_ctx *sctx) tlv_put_failure: out: - btrfs_free_path(path); kfree(name); return ret; } @@ -2714,7 +2680,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) int ret = 0; struct fs_path *p = NULL; struct btrfs_inode_item *ii; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; struct btrfs_key key; int slot; @@ -2758,7 +2724,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) tlv_put_failure: out: free_path_for_command(sctx, p); - btrfs_free_path(path); return ret; } @@ -2929,7 +2894,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) { int ret = 0; int iter_ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; struct btrfs_key di_key; @@ -2969,7 +2934,6 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } @@ -3749,7 +3713,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key di_key; struct btrfs_dir_item *di; @@ -3770,19 +3734,15 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); - if 
(ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret > 0) + return 0; di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); - if (!di) { - ret = 0; - goto out; - } + if (!di) + return 0; /* * di_key.objectid has the number of the inode that has a dentry in the * parent directory with the same name that sctx->cur_ino is being @@ -3792,26 +3752,22 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, * that it happens after that other inode is renamed. */ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); - if (di_key.type != BTRFS_INODE_ITEM_KEY) { - ret = 0; - goto out; - } + if (di_key.type != BTRFS_INODE_ITEM_KEY) + return 0; ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); if (ret < 0) - goto out; + return ret; ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); if (ret < 0) { if (ret == -ENOENT) ret = 0; - goto out; + return ret; } /* Different inode, no need to delay the rename of sctx->cur_ino */ - if (right_gen != left_gen) { - ret = 0; - goto out; - } + if (right_gen != left_gen) + return 0; wdm = get_waiting_dir_move(sctx, di_key.objectid); if (wdm && !wdm->orphanized) { @@ -3825,8 +3781,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, if (!ret) ret = 1; } -out: - btrfs_free_path(path); return ret; } @@ -3876,7 +3830,7 @@ static int is_ancestor(struct btrfs_root *root, bool free_fs_path = false; int ret = 0; int iter_ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; if (!fs_path) { @@ -3944,7 +3898,6 @@ static int is_ancestor(struct btrfs_root *root, ret = iter_ret; out: - btrfs_free_path(path); if (free_fs_path) fs_path_free(fs_path); return ret; @@ -4801,7 +4754,7 @@ static int process_all_refs(struct send_ctx *sctx, int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; iterate_inode_ref_t cb; @@ -4820,8 +4773,7 @@ static int process_all_refs(struct send_ctx *sctx, } else { btrfs_err(sctx->send_root->fs_info, "Wrong command %d in process_all_refs", cmd); - ret = -EINVAL; - goto out; + return -EINVAL; } key.objectid = sctx->cmp_key->objectid; @@ -4835,13 +4787,12 @@ static int process_all_refs(struct send_ctx *sctx, ret = iterate_inode_ref(root, path, &found_key, false, cb, sctx); if (ret < 0) - goto out; + return ret; } /* Catch error found during iteration */ - if (iter_ret < 0) { - ret = iter_ret; - goto out; - } + if (iter_ret < 0) + return iter_ret; + btrfs_release_path(path); /* @@ -4849,10 +4800,7 @@ static int process_all_refs(struct send_ctx *sctx, * re-creating this inode and will be rename'ing it into place once we * rename the parent directory. 
*/ - ret = process_recorded_refs(sctx, &pending_move); -out: - btrfs_free_path(path); - return ret; + return process_recorded_refs(sctx, &pending_move); } static int send_set_xattr(struct send_ctx *sctx, @@ -5078,7 +5026,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; @@ -5106,7 +5054,6 @@ static int process_all_new_xattrs(struct send_ctx *sctx) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } @@ -5771,7 +5718,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, */ static int send_capabilities(struct send_ctx *sctx) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct extent_buffer *leaf; unsigned long data_ptr; @@ -5809,7 +5756,6 @@ static int send_capabilities(struct send_ctx *sctx) strlen(XATTR_NAME_CAPS), buf, buf_len); out: kfree(buf); - btrfs_free_path(path); return ret; } @@ -5817,7 +5763,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, struct clone_root *clone_root, const u64 disk_byte, u64 data_offset, u64 offset, u64 len) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret; struct btrfs_inode_info info; @@ -5853,7 +5799,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = get_inode_info(clone_root->root, clone_root->ino, &info); btrfs_release_path(path); if (ret < 0) - goto out; + return ret; clone_src_i_size = info.size; /* @@ -5883,7 +5829,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, key.offset = clone_root->offset; ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.objectid == clone_root->ino && @@ -5904,7 +5850,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); if (ret < 0) - goto out; + return ret; else if (ret > 0) break; continue; @@ -5941,7 +5887,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_extent_data(sctx, dst_path, offset, hole_len); if (ret < 0) - goto out; + return ret; len -= hole_len; if (len == 0) @@ -6012,7 +5958,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_clone(sctx, offset, slen, clone_root); if (ret < 0) - goto out; + return ret; } ret = send_extent_data(sctx, dst_path, offset + slen, @@ -6046,7 +5992,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, } if (ret < 0) - goto out; + return ret; len -= clone_len; if (len == 0) @@ -6077,8 +6023,6 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_extent_data(sctx, dst_path, offset, len); else ret = 0; -out: - btrfs_free_path(path); return ret; } @@ -6167,7 +6111,7 @@ static int is_extent_unchanged(struct send_ctx *sctx, { int ret = 0; struct btrfs_key key; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int slot; struct btrfs_key found_key; @@ -6193,10 +6137,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); left_type = btrfs_file_extent_type(eb, ei); - if (left_type != 
BTRFS_FILE_EXTENT_REG) { - ret = 0; - goto out; - } + if (left_type != BTRFS_FILE_EXTENT_REG) + return 0; + left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); left_len = btrfs_file_extent_num_bytes(eb, ei); left_offset = btrfs_file_extent_offset(eb, ei); @@ -6228,11 +6171,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, key.offset = ekey->offset; ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + return ret; + if (ret) + return 0; /* * Handle special case where the right side has no extents at all. @@ -6241,11 +6182,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || - found_key.type != key.type) { + found_key.type != key.type) /* If we're a hole then just pretend nothing changed */ - ret = (left_disknr) ? 0 : 1; - goto out; - } + return (left_disknr ? 0 : 1); /* * We're now on 2a, 2b or 7. @@ -6255,10 +6194,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); if (right_type != BTRFS_FILE_EXTENT_REG && - right_type != BTRFS_FILE_EXTENT_INLINE) { - ret = 0; - goto out; - } + right_type != BTRFS_FILE_EXTENT_INLINE) + return 0; if (right_type == BTRFS_FILE_EXTENT_INLINE) { right_len = btrfs_file_extent_ram_bytes(eb, ei); @@ -6271,11 +6208,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, * Are we at extent 8? If yes, we know the extent is changed. * This may only happen on the first iteration. */ - if (found_key.offset + right_len <= ekey->offset) { + if (found_key.offset + right_len <= ekey->offset) /* If we're a hole just pretend nothing changed */ - ret = (left_disknr) ? 0 : 1; - goto out; - } + return (left_disknr ? 0 : 1); /* * We just wanted to see if when we have an inline extent, what @@ -6285,10 +6220,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, * compressed extent representing data with a size matching * the page size (currently the same as sector size). */ - if (right_type == BTRFS_FILE_EXTENT_INLINE) { - ret = 0; - goto out; - } + if (right_type == BTRFS_FILE_EXTENT_INLINE) + return 0; right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); right_offset = btrfs_file_extent_offset(eb, ei); @@ -6308,17 +6241,15 @@ static int is_extent_unchanged(struct send_ctx *sctx, */ if (left_disknr != right_disknr || left_offset_fixed != right_offset || - left_gen != right_gen) { - ret = 0; - goto out; - } + left_gen != right_gen) + return 0; /* * Go to the next extent. 
*/ ret = btrfs_next_item(sctx->parent_root, path); if (ret < 0) - goto out; + return ret; if (!ret) { eb = path->nodes[0]; slot = path->slots[0]; @@ -6329,10 +6260,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, key.offset += right_len; break; } - if (found_key.offset != key.offset + right_len) { - ret = 0; - goto out; - } + if (found_key.offset != key.offset + right_len) + return 0; + key = found_key; } @@ -6345,15 +6275,12 @@ static int is_extent_unchanged(struct send_ctx *sctx, else ret = 0; - -out: - btrfs_free_path(path); return ret; } static int get_last_extent(struct send_ctx *sctx, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = sctx->send_root; struct btrfs_key key; int ret; @@ -6369,15 +6296,13 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) key.offset = offset; ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); if (ret < 0) - goto out; + return ret; ret = 0; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) - goto out; + return ret; sctx->cur_inode_last_extent = btrfs_file_extent_end(path); -out: - btrfs_free_path(path); return ret; } @@ -6385,7 +6310,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, const u64 start, const u64 end) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root = sctx->parent_root; u64 search_start = start; @@ -6400,7 +6325,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, key.offset = search_start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; @@ -6413,8 +6338,8 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -6436,15 +6361,11 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, search_start = extent_end; goto next; } - ret = 0; - goto out; + return 0; next: path->slots[0]++; } - ret = 1; -out: - btrfs_free_path(path); - return ret; + return 1; } static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, @@ -6552,7 +6473,7 @@ static int process_all_extents(struct send_ctx *sctx) int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; @@ -6579,7 +6500,6 @@ static int process_all_extents(struct send_ctx *sctx) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } @@ -7329,7 +7249,7 @@ static int full_send_tree(struct send_ctx *sctx) struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_fs_info *fs_info = send_root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = alloc_path_for_send(); if (!path) @@ -7346,7 +7266,7 @@ static int full_send_tree(struct send_ctx *sctx) ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) - goto out; + return ret; if (ret) goto out_finish; @@ -7356,7 +7276,7 @@ static int full_send_tree(struct send_ctx *sctx) ret = changed_cb(path, NULL, &key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) - goto out; + return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { @@ -7375,14 +7295,14 @@ static int full_send_tree(struct send_ctx *sctx) 
btrfs_release_path(path); ret = search_key_again(sctx, send_root, path, &key); if (ret < 0) - goto out; + return ret; } else { up_read(&fs_info->commit_root_sem); } ret = btrfs_next_item(send_root, path); if (ret < 0) - goto out; + return ret; if (ret) { ret = 0; break; @@ -7390,11 +7310,7 @@ static int full_send_tree(struct send_ctx *sctx) } out_finish: - ret = finish_inode_if_needed(sctx, 1); - -out: - btrfs_free_path(path); - return ret; + return finish_inode_if_needed(sctx, 1); } static int replace_node_with_clone(struct btrfs_path *path, int level) @@ -7649,8 +7565,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; int cmp; - struct btrfs_path *left_path = NULL; - struct btrfs_path *right_path = NULL; + BTRFS_PATH_AUTO_FREE(left_path); + BTRFS_PATH_AUTO_FREE(right_path); struct btrfs_key left_key; struct btrfs_key right_key; char *tmp_buf = NULL; @@ -7923,8 +7839,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, out_unlock: up_read(&fs_info->commit_root_sem); out: - btrfs_free_path(left_path); - btrfs_free_path(right_path); kvfree(tmp_buf); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 49893c8858556b..a2593acc49700b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -925,7 +925,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec { struct btrfs_root *root = fs_info->tree_root; struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key location; struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; @@ -942,7 +942,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { - btrfs_free_path(path); return PTR_ERR(di); } if (!di) { @@ -951,13 +950,11 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec * it's always been there, but don't freak out, just try and * mount the top-level subvolume. 
*/ - btrfs_free_path(path); *objectid = BTRFS_FS_TREE_OBJECTID; return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - btrfs_free_path(path); *objectid = location.objectid; return 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index febf456a9ab099..12558404f92767 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1641,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; struct btrfs_inode *parent_inode = pending->dir; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; struct extent_buffer *old; @@ -1905,7 +1905,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, free_pending: kfree(new_root_item); pending->root_item = NULL; - btrfs_free_path(path); pending->path = NULL; return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 78d59b63748bac..b34ae17c5d70f0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1176,7 +1176,7 @@ static noinline int backref_in_log(struct btrfs_root *log, u64 ref_objectid, const struct fscrypt_str *name) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; path = btrfs_alloc_path(); @@ -1184,12 +1184,10 @@ static noinline int backref_in_log(struct btrfs_root *log, return -ENOMEM; ret = btrfs_search_slot(NULL, log, key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret == 1) { - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret == 1) + return 0; if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], @@ -1198,8 +1196,6 @@ static noinline int backref_in_log(struct btrfs_root *log, else ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); -out: - btrfs_free_path(path); return ret; } @@ -2512,7 +2508,7 @@ static int replay_xattr_deletes(struct walk_control *wc) struct btrfs_root *root = wc->root; struct btrfs_root *log = wc->log; struct btrfs_key search_key; - struct btrfs_path *log_path; + BTRFS_PATH_AUTO_FREE(log_path); const u64 ino = wc->log_key.objectid; int nritems; int ret; @@ -2626,7 +2622,6 @@ static int replay_xattr_deletes(struct walk_control *wc) "failed to get next leaf in subvolume root %llu", btrfs_root_id(root)); out: - btrfs_free_path(log_path); btrfs_release_path(wc->subvol_path); return ret; } @@ -3129,7 +3124,7 @@ static int walk_log_tree(struct walk_control *wc) int ret = 0; int wret; int level; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int orig_level; path = btrfs_alloc_path(); @@ -3146,18 +3141,14 @@ static int walk_log_tree(struct walk_control *wc) wret = walk_down_log_tree(path, &level, wc); if (wret > 0) break; - if (wret < 0) { - ret = wret; - goto out; - } + if (wret < 0) + return wret; wret = walk_up_log_tree(path, &level, wc); if (wret > 0) break; - if (wret < 0) { - ret = wret; - goto out; - } + if (wret < 0) + return wret; } /* was the root node processed? 
if not, catch it here */ @@ -3166,13 +3157,11 @@ static int walk_log_tree(struct walk_control *wc) btrfs_header_generation(path->nodes[orig_level]), orig_level); if (ret) - goto out; + return ret; if (wc->free) ret = clean_log_buffer(wc->trans, path->nodes[orig_level]); } -out: - btrfs_free_path(path); return ret; } @@ -3910,13 +3899,13 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; ret = inode_logged(trans, dir, NULL); if (ret == 0) return; - else if (ret < 0) { + if (ret < 0) { btrfs_set_log_full_commit(trans); return; } @@ -3930,7 +3919,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, ret = join_running_log_trans(root); ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); if (WARN_ON(ret)) - goto out; + return; mutex_lock(&dir->log_mutex); @@ -3940,8 +3929,6 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); -out: - btrfs_free_path(path); } /* see comments for btrfs_del_dir_entries_in_log */ @@ -5218,7 +5205,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_key key; const u64 i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); - struct btrfs_path *dst_path = NULL; + BTRFS_PATH_AUTO_FREE(dst_path); bool dropped_extents = false; u64 truncate_offset = i_size; struct extent_buffer *leaf; @@ -5336,7 +5323,6 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, start_slot, ins_nr, 1, 0, ctx); out: btrfs_release_path(path); - btrfs_free_path(dst_path); return ret; } @@ -5709,7 +5695,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u64 *other_ino, u64 *other_parent) { int ret; - struct btrfs_path *search_path; + BTRFS_PATH_AUTO_FREE(search_path); char *name = NULL; u32 name_len = 0; u32 item_size = btrfs_item_size(eb, slot); @@ -5794,7 +5780,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, } ret = 0; out: - btrfs_free_path(search_path); kfree(name); return ret; } @@ -7176,7 +7161,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root = inode->root; const u64 ino = btrfs_ino(inode); @@ -7192,7 +7177,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; while (true) { struct extent_buffer *leaf = path->nodes[0]; @@ -7204,8 +7189,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -7263,10 +7248,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, * at both parents and the old parent B would still * exist. 
- if (IS_ERR(dir_inode)) { - ret = PTR_ERR(dir_inode); - goto out; - } + if (IS_ERR(dir_inode)) + return PTR_ERR(dir_inode); if (!need_log_inode(trans, dir_inode)) { btrfs_add_delayed_iput(dir_inode); @@ -7279,14 +7262,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ret = log_new_dir_dentries(trans, dir_inode, ctx); btrfs_add_delayed_iput(dir_inode); if (ret) - goto out; + return ret; path->slots[0]++; } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } static int log_new_ancestors(struct btrfs_trans_handle *trans, @@ -7397,7 +7377,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; const u64 ino = btrfs_ino(inode); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret; @@ -7418,7 +7398,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) path->slots[0]++; @@ -7430,8 +7410,8 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -7448,10 +7428,8 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, * this loop, etc). So just return some error to fallback to * a transaction commit. */ - if (found_key.type == BTRFS_INODE_EXTREF_KEY) { - ret = -EMLINK; - goto out; - } + if (found_key.type == BTRFS_INODE_EXTREF_KEY) + return -EMLINK; /* * Logging ancestors needs to do more searches on the fs/subvol @@ -7463,14 +7441,11 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, ret = log_new_ancestors(trans, root, path, ctx); if (ret) - goto out; + return ret; btrfs_release_path(path); goto again; } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* From 9264d004a6c9788354b45553b9e4fe910e71b387 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 17 Sep 2025 19:53:54 +0200 Subject: [PATCH 2606/3206] btrfs: add unlikely annotations to branches leading to EUCLEAN The unlikely() annotation is a static prediction hint that the compiler may use to reorder code out of the hot path. We use it elsewhere (namely tree-checker.c) for error branches that almost never happen, where EUCLEAN (a corruption) is one of them.
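In the kernel, unlikely(x) expands to __builtin_expect(!!(x), 0), which lets the compiler lay the annotated branch out of line. The converted branches all have the same shape, e.g. from the backref.c hunk below:

	/* Corruption check that almost never fires: hint the compiler to
	 * keep the EUCLEAN error path out of the straight-line code. */
	if (unlikely(type == BTRFS_REF_TYPE_INVALID))
		return -EUCLEAN;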
Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 20 ++++++++++---------- fs/btrfs/block-group.c | 14 +++++++------- fs/btrfs/ctree.c | 10 +++++----- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/disk-io.c | 28 ++++++++++++++-------------- fs/btrfs/export.c | 2 +- fs/btrfs/extent-tree.c | 32 ++++++++++++++++---------------- fs/btrfs/inode.c | 10 +++++----- fs/btrfs/ioctl.c | 6 +++--- fs/btrfs/lzo.c | 6 +++--- fs/btrfs/qgroup.c | 14 +++++++------- fs/btrfs/relocation.c | 8 ++++---- fs/btrfs/root-tree.c | 6 +++--- fs/btrfs/scrub.c | 4 ++-- fs/btrfs/send.c | 2 +- fs/btrfs/super.c | 6 +++--- fs/btrfs/verity.c | 4 ++-- fs/btrfs/volumes.c | 30 +++++++++++++++--------------- fs/btrfs/zoned.c | 14 +++++++------- 19 files changed, 110 insertions(+), 110 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index c6573e845e43f6..650083e0f1cb8b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1062,7 +1062,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); - if (type == BTRFS_REF_TYPE_INVALID) + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) return -EUCLEAN; offset = btrfs_extent_inline_ref_offset(leaf, iref); @@ -1422,7 +1422,7 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -1652,7 +1652,7 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, * case. */ ASSERT(eie); - if (!eie) { + if (unlikely(!eie)) { ret = -EUCLEAN; goto out; } @@ -2215,7 +2215,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -2312,7 +2312,7 @@ static int get_extent_inline_ref(unsigned long *ptr, *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); *out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref, BTRFS_REF_TYPE_ANY); - if (*out_type == BTRFS_REF_TYPE_INVALID) + if (unlikely(*out_type == BTRFS_REF_TYPE_INVALID)) return -EUCLEAN; *ptr += btrfs_extent_inline_ref_size(*out_type); @@ -2868,7 +2868,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -2876,7 +2876,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) ret = -EUCLEAN; goto release; } - if (path->slots[0] == 0) { + if (unlikely(path->slots[0] == 0)) { DEBUG_WARN(); ret = -EUCLEAN; goto release; @@ -3457,7 +3457,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, if (ret < 0) goto out; /* No extra backref? 
This means the tree block is corrupted */ - if (ret > 0) { + if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -3500,7 +3500,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, ((unsigned long)iter->cur_ptr); type = btrfs_get_extent_inline_ref_type(eb, iref, BTRFS_REF_TYPE_BLOCK); - if (type == BTRFS_REF_TYPE_INVALID) { + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) { ret = -EUCLEAN; goto out; } @@ -3612,7 +3612,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, } /* Sanity check, we shouldn't have any unchecked nodes */ - if (!upper->checked) { + if (unlikely(!upper->checked)) { DEBUG_WARN("we should not have any unchecked nodes"); return -EUCLEAN; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 548483a84466b6..6f7974060a1ab5 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2071,7 +2071,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key return -ENOENT; } - if (map->start != key->objectid || map->chunk_len != key->offset) { + if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) { btrfs_err(fs_info, "block group %llu len %llu mismatch with chunk %llu len %llu", key->objectid, key->offset, map->start, map->chunk_len); @@ -2084,7 +2084,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key flags = btrfs_stack_block_group_flags(&bg) & BTRFS_BLOCK_GROUP_TYPE_MASK; - if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) { btrfs_err(fs_info, "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", key->objectid, key->offset, flags, @@ -2245,7 +2245,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) return ret; /* Shouldn't have super stripes in sequential zones */ - if (zoned && nr) { + if (unlikely(zoned && nr)) { kfree(logical); btrfs_err(fs_info, "zoned: block group %llu must not contain super block", @@ -2336,7 +2336,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) break; bg = btrfs_lookup_block_group(fs_info, map->start); - if (!bg) { + if (unlikely(!bg)) { btrfs_err(fs_info, "chunk start=%llu len=%llu doesn't have corresponding block group", map->start, map->chunk_len); @@ -2344,9 +2344,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) btrfs_free_chunk_map(map); break; } - if (bg->start != map->start || bg->length != map->chunk_len || - (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != - (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + if (unlikely(bg->start != map->start || bg->length != map->chunk_len || + (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != + (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) { btrfs_err(fs_info, "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", map->start, map->chunk_len, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 13dc44e9076246..71256ed909f06f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1499,7 +1499,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * being cached, read from scrub, or have multiple * parents (shared tree blocks). 
*/ - if (btrfs_verify_level_key(tmp, &check)) { + if (unlikely(btrfs_verify_level_key(tmp, &check))) { ret = -EUCLEAN; goto out; } @@ -2731,7 +2731,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); /* dst is the left eb, src is the middle eb */ - if (check_sibling_keys(dst, src)) { + if (unlikely(check_sibling_keys(dst, src))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; @@ -2805,7 +2805,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, push_items = max_push; /* dst is the right eb, src is the middle eb */ - if (check_sibling_keys(src, dst)) { + if (unlikely(check_sibling_keys(src, dst))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; @@ -3287,7 +3287,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; - if (check_sibling_keys(left, right)) { + if (unlikely(check_sibling_keys(left, right))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); btrfs_tree_unlock(right); @@ -3503,7 +3503,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - if (check_sibling_keys(left, right)) { + if (unlikely(check_sibling_keys(left, right))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); goto out; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ed3b07fdaab87b..b37e9a893b918a 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -98,7 +98,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) * We don't have a replace item or it's corrupted. If there is * a replace target, fail the mount. */ - if (btrfs_find_device(fs_info->fs_devices, &args)) { + if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) { btrfs_err(fs_info, "found replace target device without a valid replace item"); return -EUCLEAN; @@ -158,7 +158,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) * We don't have an active replace item but if there is a * replace target, fail the mount. */ - if (btrfs_find_device(fs_info->fs_devices, &args)) { + if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) { btrfs_err(fs_info, "replace without active item, run 'device scan --forget' on the target device"); ret = -EUCLEAN; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0aee3239518d76..dbc72135b5e7a0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -404,7 +404,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ? 
", ignored" : ""); - if (!ignore_csum) { + if (unlikely(!ignore_csum)) { ret = -EUCLEAN; goto out; } @@ -1055,10 +1055,10 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, * For real fs, and not log/reloc trees, root owner must * match its root node owner */ - if (!btrfs_is_testing(fs_info) && - btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && - btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && - btrfs_root_id(root) != btrfs_header_owner(root->node)) { + if (unlikely(!btrfs_is_testing(fs_info) && + btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && + btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + btrfs_root_id(root) != btrfs_header_owner(root->node))) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", btrfs_root_id(root), root->node->start, @@ -2324,7 +2324,7 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, const u32 sectorsize = btrfs_super_sectorsize(sb); u32 sys_array_size = btrfs_super_sys_array_size(sb); - if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) { btrfs_err(fs_info, "system chunk array too big %u > %u", sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); return -EUCLEAN; @@ -2342,12 +2342,12 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); len = sizeof(*disk_key); - if (cur + len > sys_array_size) + if (unlikely(cur + len > sys_array_size)) goto short_read; cur += len; btrfs_disk_key_to_cpu(&key, disk_key); - if (key.type != BTRFS_CHUNK_ITEM_KEY) { + if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) { btrfs_err(fs_info, "unexpected item type %u in sys_array at offset %u", key.type, cur); @@ -2355,10 +2355,10 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, } chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); num_stripes = btrfs_stack_chunk_num_stripes(chunk); - if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)) goto short_read; type = btrfs_stack_chunk_type(chunk); - if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) { btrfs_err(fs_info, "invalid chunk type %llu in sys_array at offset %u", type, cur); @@ -2605,13 +2605,13 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, ret = btrfs_validate_super(fs_info, sb, -1); if (ret < 0) goto out; - if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { + if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); goto out; } - if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid incompat flags, has 0x%llx valid mask 0x%llx", @@ -4065,7 +4065,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); ret = btrfs_validate_write_super(fs_info, sb); - if (ret < 0) { + if (unlikely(ret < 0)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_handle_fs_error(fs_info, -EUCLEAN, "unexpected superblock corruption detected"); @@ -4881,7 +4881,7 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, 
root, &search_key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 7fc8a3200b4005..d062ac521051b8 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -174,7 +174,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto fail; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset of -1 found, there would have to exist an * inode with such number or a root with such id. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a4416c451b250b..5982683d8e4594 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -879,7 +879,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, ptr += btrfs_extent_inline_ref_size(type); continue; } - if (type == BTRFS_REF_TYPE_INVALID) { + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) { ret = -EUCLEAN; goto out; } @@ -1210,7 +1210,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, * We're adding refs to a tree block we already own, this * should not happen at all. */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (unlikely(owner < BTRFS_FIRST_FREE_OBJECTID)) { btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", @@ -2355,7 +2355,7 @@ static noinline int check_committed_ref(struct btrfs_inode *inode, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -2760,7 +2760,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - if (cache == NULL) { + if (unlikely(cache == NULL)) { /* Logic error, something removed the block group. 
*/ ret = -EUCLEAN; goto out; @@ -3162,7 +3162,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (!found_extent) { - if (iref) { + if (unlikely(iref)) { abort_and_dump(trans, path, "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", path->slots[0]); @@ -3254,7 +3254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - if (item_size < sizeof(*ei) + sizeof(*bi)) { + if (unlikely(item_size < sizeof(*ei) + sizeof(*bi))) { abort_and_dump(trans, path, "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", key.objectid, key.type, key.offset, @@ -3268,7 +3268,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } refs = btrfs_extent_refs(leaf, ei); - if (refs < refs_to_drop) { + if (unlikely(refs < refs_to_drop)) { abort_and_dump(trans, path, "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", refs_to_drop, refs, bytenr, path->slots[0]); @@ -3285,7 +3285,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - if (!found_extent) { + if (unlikely(!found_extent)) { abort_and_dump(trans, path, "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", path->slots[0]); @@ -3314,8 +3314,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* In this branch refs == 1 */ if (found_extent) { - if (is_data && refs_to_drop != - extent_data_ref_count(path, iref)) { + if (unlikely(is_data && refs_to_drop != + extent_data_ref_count(path, iref))) { abort_and_dump(trans, path, "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", extent_data_ref_count(path, iref), @@ -3324,7 +3324,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } if (iref) { - if (path->slots[0] != extent_slot) { + if (unlikely(path->slots[0] != extent_slot)) { abort_and_dump(trans, path, "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", key.objectid, key.type, @@ -3339,7 +3339,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * | extent_slot ||extent_slot + 1| * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ - if (path->slots[0] != extent_slot + 1) { + if (unlikely(path->slots[0] != extent_slot + 1)) { abort_and_dump(trans, path, "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", path->slots[0]); @@ -5063,7 +5063,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; - if (check_eb_lock_owner(buf)) { + if (unlikely(check_eb_lock_owner(buf))) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } @@ -5910,13 +5910,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else if (btrfs_root_id(root) != btrfs_header_owner(eb)) + else if (unlikely(btrfs_root_id(root) != btrfs_header_owner(eb))) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else if (btrfs_root_id(root) != - btrfs_header_owner(path->nodes[level + 1])) + else if (unlikely(btrfs_root_id(root) != + btrfs_header_owner(path->nodes[level + 1]))) goto owner_mismatch; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 36a030a2a0a4fd..3e0b699c938d62 100644 --- a/fs/btrfs/inode.c +++ 
b/fs/btrfs/inode.c @@ -4551,7 +4551,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. @@ -5626,8 +5626,8 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, } btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); - if (location->type != BTRFS_INODE_ITEM_KEY && - location->type != BTRFS_ROOT_ITEM_KEY) { + if (unlikely(location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY)) { ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", @@ -5918,7 +5918,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ - if (btrfs_inode_type(inode) != di_type) { + if (unlikely(btrfs_inode_type(inode) != di_type)) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", inode->vfs_inode.i_mode, btrfs_inode_type(inode), @@ -7102,7 +7102,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ - if (!S_ISREG(inode->vfs_inode.i_mode)) { + if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) { ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0e9e2b99939232..7f347baa13df1c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2133,7 +2133,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) ret = btrfs_next_leaf(fs_info->tree_root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -2216,7 +2216,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, ret = btrfs_next_leaf(root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -2245,7 +2245,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, ret = btrfs_next_item(root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b93bdac91b30e7..4758f66da449c0 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -458,16 +458,16 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, size_t max_segment_len = workspace_buf_length(fs_info); int ret = 0; - if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) + if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)) return -EUCLEAN; in_len = read_compress_length(data_in); - if (in_len != srclen) + if (unlikely(in_len != srclen)) return -EUCLEAN; data_in += LZO_LEN; in_len = read_compress_length(data_in); - if (in_len != srclen - LZO_LEN * 2) { + if (unlikely(in_len != srclen - LZO_LEN * 2)) { ret = -EUCLEAN; goto out; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2d8667cc609a2b..8154393449b841 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2426,9 +2426,9 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, int i; /* Level 
sanity check */ - if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || - root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || - root_level < cur_level) { + if (unlikely(cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || + root_level < cur_level)) { btrfs_err_rl(fs_info, "%s: bad levels, cur_level=%d root_level=%d", __func__, cur_level, root_level); @@ -2444,7 +2444,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, * dst_path->nodes[root_level] must be initialized before * calling this function. */ - if (cur_level == root_level) { + if (unlikely(cur_level == root_level)) { btrfs_err_rl(fs_info, "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", __func__, root_level, root_level, cur_level); @@ -2530,7 +2530,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, return 0; /* Wrong parameter order */ - if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { + if (unlikely(btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb))) { btrfs_err_rl(fs_info, "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, btrfs_header_generation(src_eb), @@ -4710,8 +4710,8 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, if (!btrfs_qgroup_full_accounting(fs_info)) return 0; - if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > - btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { + if (unlikely(btrfs_node_ptr_generation(subvol_parent, subvol_slot) > + btrfs_node_ptr_generation(reloc_parent, reloc_slot))) { btrfs_err_rl(fs_info, "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", __func__, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7a90a4ef8375d9..94cd55e97f5bd9 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1953,7 +1953,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root)); return PTR_ERR(root); } - if (root->reloc_root != reloc_root) { + if (unlikely(root->reloc_root != reloc_root)) { DEBUG_WARN("unexpected reloc root found"); btrfs_err(fs_info, "root %llu has two reloc roots associated with it", @@ -2024,7 +2024,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOENT); - if (next->new_bytenr) { + if (unlikely(next->new_bytenr)) { /* * We just created the reloc root, so we shouldn't have * ->new_bytenr set yet. If it is then we have multiple roots @@ -2083,7 +2083,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) * This can occur if we have incomplete extent refs leading all * the way up a particular path, in this case return -EUCLEAN. */ - if (!root) + if (unlikely(!root)) return ERR_PTR(-EUCLEAN); /* No other choice for non-shareable tree */ @@ -2512,7 +2512,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, * normal user in the case of corruption. 
*/ ASSERT(node->new_bytenr == 0); - if (node->new_bytenr) { + if (unlikely(node->new_bytenr)) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index d1f5f6c42ef30d..8d4dd48beb3a30 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -85,7 +85,7 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, * Key with offset -1 found, there would have to exist a root * with such id, but this is out of the valid range. */ - if (ret == 0) { + if (unlikely(ret == 0)) { ret = -EUCLEAN; goto out; } @@ -145,7 +145,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret < 0) return ret; - if (ret > 0) { + if (unlikely(ret > 0)) { btrfs_crit(fs_info, "unable to find root key (%llu %u %llu) in tree %llu", key->objectid, key->type, key->offset, btrfs_root_id(root)); @@ -324,7 +324,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) return ret; - if (ret > 0) + if (unlikely(ret > 0)) /* The root must exist but we did not find it by the key. */ return -EUCLEAN; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e19e6bc5f07492..641d7975b93634 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1529,7 +1529,7 @@ static int find_first_extent_item(struct btrfs_root *extent_root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -2910,7 +2910,7 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, physical, dev->devid); return -EIO; } - if (btrfs_super_generation(sb) != generation) { + if (unlikely(btrfs_super_generation(sb) != generation)) { btrfs_err_rl(fs_info, "scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d493d32e8367ac..be830d295a6bd8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7229,7 +7229,7 @@ static int search_key_again(const struct send_ctx *sctx, */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); ASSERT(ret <= 0); - if (ret > 0) { + if (unlikely(ret > 0)) { btrfs_print_tree(path->nodes[path->lowest_level], false); btrfs_err(root->fs_info, "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a2593acc49700b..d6e496436539d2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2325,14 +2325,14 @@ static int check_dev_super(struct btrfs_device *dev) /* Verify the checksum. 
*/ csum_type = btrfs_super_csum_type(sb); - if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) { + if (unlikely(csum_type != btrfs_super_csum_type(fs_info->super_copy))) { btrfs_err(fs_info, "csum type changed, has %u expect %u", csum_type, btrfs_super_csum_type(fs_info->super_copy)); ret = -EUCLEAN; goto out; } - if (btrfs_check_super_csum(fs_info, sb)) { + if (unlikely(btrfs_check_super_csum(fs_info, sb))) { btrfs_err(fs_info, "csum for on-disk super block no longer matches"); ret = -EUCLEAN; goto out; @@ -2344,7 +2344,7 @@ static int check_dev_super(struct btrfs_device *dev) goto out; last_trans = btrfs_get_last_trans_committed(fs_info); - if (btrfs_super_generation(sb) != last_trans) { + if (unlikely(btrfs_super_generation(sb) != last_trans)) { btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", btrfs_super_generation(sb), last_trans); ret = -EUCLEAN; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index b7a96a005487e1..0bd7ea2c58ec6a 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -676,11 +676,11 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) if (ret < 0) return ret; - if (item.reserved[0] != 0 || item.reserved[1] != 0) + if (unlikely(item.reserved[0] != 0 || item.reserved[1] != 0)) return -EUCLEAN; true_size = btrfs_stack_verity_descriptor_size(&item); - if (true_size > INT_MAX) + if (unlikely(true_size > INT_MAX)) return -EUCLEAN; if (buf_size == 0) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4f20c25a39c81d..897294519a99c0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1911,7 +1911,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, if (ret < 0) goto error; - if (ret == 0) { + if (unlikely(ret == 0)) { /* Corruption */ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); ret = -EUCLEAN; @@ -3049,7 +3049,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; - else if (ret > 0) { /* Logic error or corruption */ + else if (unlikely(ret > 0)) { /* Logic error or corruption */ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", chunk_offset); btrfs_abort_transaction(trans, -ENOENT); @@ -3527,7 +3527,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } - if (ret == 0) { + if (unlikely(ret == 0)) { /* * On the first search we would find chunk tree with * offset -1, which is not possible. 
On subsequent @@ -7932,7 +7932,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, int i; map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); - if (!map) { + if (unlikely(!map)) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", physical_offset, devid); @@ -7941,7 +7941,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } stripe_len = btrfs_calc_stripe_length(map); - if (physical_len != stripe_len) { + if (unlikely(physical_len != stripe_len)) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", physical_offset, devid, map->start, physical_len, @@ -7961,8 +7961,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, devid, physical_offset, physical_len); for (i = 0; i < map->num_stripes; i++) { - if (map->stripes[i].dev->devid == devid && - map->stripes[i].physical == physical_offset) { + if (unlikely(map->stripes[i].dev->devid == devid && + map->stripes[i].physical == physical_offset)) { found = true; if (map->verified_stripes >= map->num_stripes) { btrfs_err(fs_info, @@ -7975,7 +7975,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, break; } } - if (!found) { + if (unlikely(!found)) { btrfs_err(fs_info, "dev extent physical offset %llu devid %llu has no corresponding chunk", physical_offset, devid); @@ -7984,13 +7984,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* Make sure no dev extent is beyond device boundary */ dev = btrfs_find_device(fs_info->fs_devices, &args); - if (!dev) { + if (unlikely(!dev)) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; goto out; } - if (physical_offset + physical_len > dev->disk_total_bytes) { + if (unlikely(physical_offset + physical_len > dev->disk_total_bytes)) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", devid, physical_offset, physical_len, @@ -8002,8 +8002,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, if (dev->zone_info) { u64 zone_size = dev->zone_info->zone_size; - if (!IS_ALIGNED(physical_offset, zone_size) || - !IS_ALIGNED(physical_len, zone_size)) { + if (unlikely(!IS_ALIGNED(physical_offset, zone_size) || + !IS_ALIGNED(physical_len, zone_size))) { btrfs_err(fs_info, "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", devid, physical_offset, physical_len); @@ -8027,7 +8027,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) struct btrfs_chunk_map *map; map = rb_entry(node, struct btrfs_chunk_map, rb_node); - if (map->num_stripes != map->verified_stripes) { + if (unlikely(map->num_stripes != map->verified_stripes)) { btrfs_err(fs_info, "chunk %llu has missing dev extent, have %d expect %d", map->start, map->verified_stripes, map->num_stripes); @@ -8087,7 +8087,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) if (ret < 0) goto out; /* No dev extents at all? 
Not good */ - if (ret > 0) { + if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -8112,7 +8112,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) physical_len = btrfs_dev_extent_length(leaf, dext); /* Check if this dev extent overlaps with the previous one */ - if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + if (unlikely(devid == prev_devid && physical_offset < prev_dev_ext_end)) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", devid, physical_offset, prev_dev_ext_end); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index e21b57a68b3c3b..39c1f36df8f7b3 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -315,7 +315,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) if (ret < 0) return ret; /* No dev extents at all? Not good */ - if (ret > 0) + if (unlikely(ret > 0)) return -EUCLEAN; } @@ -550,7 +550,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (ret) goto out; - if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { + if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) { btrfs_err(device->fs_info, "zoned: failed to read super block log zone info at devid %llu zone %u", device->devid, sb_zone); @@ -568,7 +568,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) ret = sb_write_pointer(device->bdev, &zone_info->sb_zones[sb_pos], &sb_wp); - if (ret != -ENOENT && ret) { + if (unlikely(ret != -ENOENT && ret)) { btrfs_err(device->fs_info, "zoned: super block log zone corrupted devid %llu zone %u", device->devid, sb_zone); @@ -1253,7 +1253,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, root = btrfs_extent_root(fs_info, key.objectid); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); /* We should not find the exact match */ - if (!ret) + if (unlikely(!ret)) ret = -EUCLEAN; if (ret < 0) return ret; @@ -1274,8 +1274,8 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, else length = fs_info->nodesize; - if (!(found_key.objectid >= cache->start && - found_key.objectid + length <= cache->start + cache->length)) { + if (unlikely(!(found_key.objectid >= cache->start && + found_key.objectid + length <= cache->start + cache->length))) { return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; @@ -2145,7 +2145,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, if (physical_pos == wp) return 0; - if (physical_pos > wp) + if (unlikely(physical_pos > wp)) return -EUCLEAN; length = wp - physical_pos; From cc53bd2085c8fa7b199a9a8e10e634b62a6d3fa8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 17 Sep 2025 19:53:55 +0200 Subject: [PATCH 2607/3206] btrfs: add unlikely annotations to branches leading to EIO The unlikely() annotation is a static prediction hint that the compiler may use to reorder code out of the hot path. We use it elsewhere (namely tree-checker.c) for error branches that almost never happen, and EIO is one of them.
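For illustration, a minimal self-contained sketch of the pattern applied throughout this series (the read_one_block() helper and its integrity check are hypothetical, not btrfs code): the branch returning -EIO is wrapped in unlikely() so the compiler may keep the success path contiguous and move the error handling out of line.

  #include <errno.h>
  #include <stdbool.h>

  /* Stand-in for the kernel's hint from <linux/compiler.h>. */
  #ifndef unlikely
  #define unlikely(x) __builtin_expect(!!(x), 0)
  #endif

  /* Hypothetical integrity check, standing in for e.g. a checksum match. */
  static bool block_uptodate(const unsigned char *buf, unsigned char expected)
  {
          return buf[0] == expected;
  }

  static int read_one_block(const unsigned char *buf, unsigned char expected)
  {
          /* Error branch that almost never happens: hint it out of the hot path. */
          if (unlikely(!block_uptodate(buf, expected)))
                  return -EIO;
          return 0;
  }

  int main(void)
  {
          unsigned char good[1] = { 0xab };

          /* Success path: read_one_block() returns 0; a mismatch would yield -EIO. */
          return read_one_block(good, 0xab) ? 1 : 0;
  }

The annotation only influences code layout and the compiler's static branch prediction; it does not change behavior whether the condition is true or false.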
Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 4 ++-- fs/btrfs/bio.c | 4 ++-- fs/btrfs/defrag.c | 2 +- fs/btrfs/dev-replace.c | 6 ++---- fs/btrfs/disk-io.c | 24 ++++++++++++------------ fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_map.c | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/free-space-tree.c | 12 ++++++------ fs/btrfs/inode.c | 12 ++++++------ fs/btrfs/qgroup.c | 4 ++-- fs/btrfs/raid56.c | 14 +++++++------- fs/btrfs/relocation.c | 6 +++--- fs/btrfs/scrub.c | 14 +++++++------- fs/btrfs/send.c | 10 +++++----- fs/btrfs/zlib.c | 2 +- fs/btrfs/zoned.c | 38 +++++++++++++++++++------------------- fs/btrfs/zstd.c | 2 +- 19 files changed, 81 insertions(+), 83 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 650083e0f1cb8b..2ab550a1e715a7 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -859,7 +859,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, free_pref(ref); return PTR_ERR(eb); } - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_pref(ref); free_extent_buffer(eb); return -EIO; @@ -1614,7 +1614,7 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ret = PTR_ERR(eb); goto out; } - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); ret = -EIO; goto out; diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index db2deaa4aad4e2..21df48e6c4fa20 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -849,8 +849,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (ret < 0) goto out_counter_dec; - if (!smap.dev->bdev || - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) { + if (unlikely(!smap.dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) { ret = -EIO; goto out_counter_dec; } diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 84ba9906f0d818..7b277934f66f92 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -924,7 +924,7 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-EIO); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index b37e9a893b918a..a4eaef60549eed 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -177,8 +177,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) * allow 'btrfs dev replace_cancel' if src/tgt device is * missing */ - if (!dev_replace->srcdev && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely(!dev_replace->srcdev && !btrfs_test_opt(fs_info, DEGRADED))) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -186,8 +185,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", src_devid); } - if (!dev_replace->tgtdev && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely(!dev_replace->tgtdev && !btrfs_test_opt(fs_info, DEGRADED))) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dbc72135b5e7a0..21c2a19d690f58 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -370,21 +370,21 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, ASSERT(check); found_start = btrfs_header_bytenr(eb); - if (found_start 
!= eb->start) { + if (unlikely(found_start != eb->start)) { btrfs_err_rl(fs_info, "bad tree block start, mirror %u want %llu have %llu", eb->read_mirror, eb->start, found_start); ret = -EIO; goto out; } - if (check_tree_block_fsid(eb)) { + if (unlikely(check_tree_block_fsid(eb))) { btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", eb->start, eb->read_mirror); ret = -EIO; goto out; } found_level = btrfs_header_level(eb); - if (found_level >= BTRFS_MAX_LEVEL) { + if (unlikely(found_level >= BTRFS_MAX_LEVEL)) { btrfs_err(fs_info, "bad tree block level, mirror %u level %d on logical %llu", eb->read_mirror, btrfs_header_level(eb), eb->start); @@ -410,7 +410,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, } } - if (found_level != check->level) { + if (unlikely(found_level != check->level)) { btrfs_err(fs_info, "level verify failed on logical %llu mirror %u wanted %u found %u", eb->start, eb->read_mirror, check->level, found_level); @@ -1046,7 +1046,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, root->node = NULL; goto fail; } - if (!btrfs_buffer_uptodate(root->node, generation, false)) { + if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) { ret = -EIO; goto fail; } @@ -2058,7 +2058,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, u64 bytenr = btrfs_super_log_root(disk_super); int level = btrfs_super_log_root_level(disk_super); - if (fs_devices->rw_devices == 0) { + if (unlikely(fs_devices->rw_devices == 0)) { btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } @@ -2079,7 +2079,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, btrfs_put_root(log_tree_root); return ret; } - if (!extent_buffer_uptodate(log_tree_root->node)) { + if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; @@ -2641,7 +2641,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev root->node = NULL; return ret; } - if (!extent_buffer_uptodate(root->node)) { + if (unlikely(!extent_buffer_uptodate(root->node))) { free_extent_buffer(root->node); root->node = NULL; return -EIO; @@ -3469,7 +3469,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); - if (!fs_devices->latest_dev->bdev) { + if (unlikely(!fs_devices->latest_dev->bdev)) { btrfs_err(fs_info, "failed to read devices"); ret = -EIO; goto fail_tree_roots; @@ -3963,7 +3963,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) * Checks last_flush_error of disks in order to determine the device * state. 
*/ - if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) + if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL))) return -EIO; return 0; @@ -4076,7 +4076,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) if (ret) total_errors++; } - if (total_errors > max_errors) { + if (unlikely(total_errors > max_errors)) { btrfs_err(fs_info, "%d errors while writing supers", total_errors); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4101,7 +4101,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) total_errors++; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); - if (total_errors > max_errors) { + if (unlikely(total_errors > max_errors)) { btrfs_handle_fs_error(fs_info, -EIO, "%d errors while writing supers", total_errors); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5982683d8e4594..6a258bd8693e02 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5638,7 +5638,7 @@ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_r ref.parent = path->nodes[level]->start; } else { ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); - if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { + if (unlikely(btrfs_root_id(root) != btrfs_header_owner(path->nodes[level]))) { btrfs_err(root->fs_info, "mismatched block owner"); return -EIO; } @@ -5774,7 +5774,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, level--; ASSERT(level == btrfs_header_level(next)); - if (level != btrfs_header_level(next)) { + if (unlikely(level != btrfs_header_level(next))) { btrfs_err(root->fs_info, "mismatched level"); ret = -EIO; goto out_unlock; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4342d654a95403..c123a3ef154ae5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3880,7 +3880,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, return ret; wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) return -EIO; return 0; } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ac28eee7ae32c5..c1e5c7c158b80a 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1057,7 +1057,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); em = btrfs_lookup_extent_mapping(em_tree, start, len); - if (!em) { + if (unlikely(!em)) { ret = -EIO; goto out_unlock; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4daec404fec621..e7765578349b6d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -815,7 +815,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 if (ret) return ret; folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); return -EIO; } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index eba7f22ae49c67..2fee9f3b60d439 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -137,12 +137,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { DEBUG_WARN(); return -EIO; } - if (p->slots[0] == 0) { + if (unlikely(p->slots[0] == 0)) { DEBUG_WARN("no previous slot found"); return 
-EIO; } @@ -293,7 +293,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, expected_extent_count = btrfs_free_space_extent_count(leaf, info); btrfs_release_path(path); - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -465,7 +465,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, start_bit = find_next_bit_le(bitmap, nrbits, end_bit); } - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -1611,7 +1611,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, extent_count++; } - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -1672,7 +1672,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, extent_count++; } - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3e0b699c938d62..6403312505c4d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3104,7 +3104,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) if (!freespace_inode) btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); - if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { + if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) { ret = -EIO; goto out; } @@ -3370,7 +3370,7 @@ int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 const u8 * const csum_expected) { btrfs_calculate_block_csum(fs_info, paddr, csum); - if (memcmp(csum, csum_expected, fs_info->csum_size)) + if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) return -EIO; return 0; } @@ -4842,7 +4842,7 @@ static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start) folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto out_unlock; } @@ -4986,7 +4986,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto out_unlock; } @@ -7179,7 +7179,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, insert: ret = 0; btrfs_release_path(path); - if (em->start > start || btrfs_extent_map_end(em) <= start) { + if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); @@ -9298,7 +9298,7 @@ static ssize_t btrfs_encoded_read_inline( ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { - if (ret > 0) { + if (unlikely(ret > 0)) { /* The extent item disappeared? 
*/ return -EIO; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8154393449b841..4c6097b7e56afb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2538,7 +2538,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, return -EUCLEAN; } - if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { + if (unlikely(!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb))) { ret = -EIO; goto out; } @@ -4843,7 +4843,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, reloc_eb = NULL; goto free_out; } - if (!extent_buffer_uptodate(reloc_eb)) { + if (unlikely(!extent_buffer_uptodate(reloc_eb))) { ret = -EIO; goto free_out; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c90612f6cc1b0a..0135dceb7baaa0 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1167,7 +1167,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, /* Check if we have reached tolerance early. */ found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) + if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; return 0; } @@ -1847,7 +1847,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, if (!found_errors) return 0; - if (found_errors > rbio->bioc->max_errors) + if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; /* @@ -2399,7 +2399,7 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio) int found_errors; found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; } @@ -2688,7 +2688,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; goto out; } @@ -2712,7 +2712,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * data, so the capability of the repair is declined. (In the * case of RAID5, we can not repair anything.) */ - if (dfail > rbio->bioc->max_errors - 1) { + if (unlikely(dfail > rbio->bioc->max_errors - 1)) { ret = -EIO; goto out; } @@ -2729,7 +2729,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * scrubbing parity, luckily, use the other one to repair the * data, or we can not repair the data stripe. 
*/ - if (failp != rbio->scrubp) { + if (unlikely(failp != rbio->scrubp)) { ret = -EIO; goto out; } @@ -2820,7 +2820,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio) int found_errors; found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 94cd55e97f5bd9..7027572cb6c16a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2270,7 +2270,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(upper->eb, slot); if (lowest) { - if (bytenr != node->bytenr) { + if (unlikely(bytenr != node->bytenr)) { btrfs_err(root->fs_info, "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", bytenr, node->bytenr, slot, @@ -2447,7 +2447,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); return -EIO; } @@ -2832,7 +2832,7 @@ static int relocate_one_folio(struct reloc_control *rc, if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto release_folio; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 641d7975b93634..4691d0bdb2e86c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1987,7 +1987,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) * metadata, we should immediately abort. */ for (int i = 0; i < nr_stripes; i++) { - if (stripe_has_metadata_error(&sctx->stripes[i])) { + if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) { ret = -EIO; goto out; } @@ -2181,7 +2181,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, * As we may hit an empty data stripe while it's missing. 
*/ bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); - if (!bitmap_empty(&error, stripe->nr_sectors)) { + if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) { btrfs_err(fs_info, "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, @@ -2875,8 +2875,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_put_block_group(cache); if (ret) break; - if (sctx->is_dev_replace && - atomic64_read(&dev_replace->num_write_errors) > 0) { + if (unlikely(sctx->is_dev_replace && + atomic64_read(&dev_replace->num_write_errors) > 0)) { ret = -EIO; break; } @@ -2904,7 +2904,7 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, if (ret < 0) return ret; ret = btrfs_check_super_csum(fs_info, sb); - if (ret != 0) { + if (unlikely(ret != 0)) { btrfs_err_rl(fs_info, "scrub: super block at physical %llu devid %llu has bad csum", physical, dev->devid); @@ -3080,8 +3080,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } mutex_lock(&fs_info->scrub_lock); - if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { + if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EIO; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index be830d295a6bd8..9230e5066fc6b7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -646,7 +646,7 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) ret = kernel_write(filp, buf + pos, len - pos, off); if (ret < 0) return ret; - if (ret == 0) + if (unlikely(ret == 0)) return -EIO; pos += ret; } @@ -1723,7 +1723,7 @@ static int read_symlink(struct btrfs_root *root, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret) { + if (unlikely(ret)) { /* * An empty symlink inode. 
Can happen in rare error paths when * creating a symlink (transaction committed before the inode @@ -5199,7 +5199,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); btrfs_err(fs_info, "send: IO error at offset %llu for inode %llu root %llu", @@ -6961,7 +6961,7 @@ static int changed_ref(struct send_ctx *sctx, { int ret = 0; - if (sctx->cur_ino != sctx->cmp_key->objectid) { + if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) { inconsistent_snapshot_error(sctx, result, "reference"); return -EIO; } @@ -6989,7 +6989,7 @@ static int changed_xattr(struct send_ctx *sctx, { int ret = 0; - if (sctx->cur_ino != sctx->cmp_key->objectid) { + if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) { inconsistent_snapshot_error(sctx, result, "xattr"); return -EIO; } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 493e5872c6a33d..6caba8be7c845c 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -291,7 +291,7 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret = zlib_deflate(&workspace->strm, Z_FINISH); if (ret == Z_STREAM_END) break; - if (ret != Z_OK && ret != Z_BUF_ERROR) { + if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) { zlib_deflateEnd(&workspace->strm); ret = -EIO; goto out; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 39c1f36df8f7b3..e3341a84f4ab39 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -274,7 +274,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return ret; } *nr_zones = ret; - if (!ret) + if (unlikely(!ret)) return -EIO; /* Populate cache */ @@ -503,7 +503,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; } - if (nreported != zone_info->nr_zones) { + if (unlikely(nreported != zone_info->nr_zones)) { btrfs_err(device->fs_info, "inconsistent number of zones on %s (%u/%u)", rcu_dereference(device->name), nreported, @@ -513,7 +513,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } if (max_active_zones) { - if (nactive > max_active_zones) { + if (unlikely(nactive > max_active_zones)) { if (bdev_max_active_zones(bdev) == 0) { max_active_zones = 0; zone_info->max_active_zones = 0; @@ -901,7 +901,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, zones); if (ret < 0) return ret; - if (ret != BTRFS_NR_SB_LOG_ZONES) + if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES)) return -EIO; return sb_log_location(bdev, zones, rw, bytenr_ret); @@ -1357,7 +1357,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, return 0; } - if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) { btrfs_err(fs_info, "zoned: unexpected conventional zone %llu on device %s (devid %llu)", zone.start << SECTOR_SHIFT, rcu_dereference(device->name), @@ -1399,7 +1399,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, struct zone_info *info, unsigned long *active) { - if (info->alloc_offset == WP_MISSING_DEV) { + if (unlikely(info->alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", info->physical); @@ -1428,13 +1428,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, bg->zone_capacity = 
min_not_zero(zone_info[0].capacity, zone_info[1].capacity); - if (zone_info[0].alloc_offset == WP_MISSING_DEV) { + if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[0].physical); return -EIO; } - if (zone_info[1].alloc_offset == WP_MISSING_DEV) { + if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[1].physical); @@ -1447,14 +1447,14 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, if (zone_info[1].alloc_offset == WP_CONVENTIONAL) zone_info[1].alloc_offset = last_alloc; - if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { + if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); return -EIO; } if (test_bit(0, active) != test_bit(1, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else if (test_bit(0, active)) { set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); @@ -1489,16 +1489,16 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, if (zone_info[i].alloc_offset == WP_CONVENTIONAL) zone_info[i].alloc_offset = last_alloc; - if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && + !btrfs_test_opt(fs_info, DEGRADED))) { btrfs_err(fs_info, "zoned: write pointer offset mismatch of zones in %s profile", btrfs_bg_type_to_raid_name(map->type)); return -EIO; } if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_test_opt(fs_info, DEGRADED) && - !btrfs_zone_activate(bg)) { + if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) && + !btrfs_zone_activate(bg))) { return -EIO; } } else { @@ -1554,7 +1554,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else { if (test_bit(0, active)) @@ -1586,7 +1586,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, continue; if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else { if (test_bit(0, active)) @@ -1643,7 +1643,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return 0; /* Sanity check */ - if (!IS_ALIGNED(length, fs_info->zone_size)) { + if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) { btrfs_err(fs_info, "zoned: block group %llu len %llu unaligned to zone size %llu", logical, length, fs_info->zone_size); @@ -1756,7 +1756,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return -EINVAL; } - if (cache->alloc_offset > cache->zone_capacity) { + if (unlikely(cache->alloc_offset > cache->zone_capacity)) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", cache->alloc_offset, cache->zone_capacity, @@ -2087,7 +2087,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc, NULL, NULL); - if (ret || !bioc || mapped_length < PAGE_SIZE) { + if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) { ret = -EIO; goto 
out_put_bioc; } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 8f7c8d27c9ca8d..c9cddcfa337b91 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -654,7 +654,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap_local(workspace->in_buf.src); folio_in_index++; - if (folio_in_index >= total_folios_in) { + if (unlikely(folio_in_index >= total_folios_in)) { workspace->in_buf.src = NULL; ret = -EIO; goto done; From a929904cf73b650f49cc60941e6e618240096fcb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 17 Sep 2025 19:53:56 +0200 Subject: [PATCH 2608/3206] btrfs: add unlikely annotations to branches leading to transaction abort The unlikely() annotation is a static prediction hint that the compiler may use to reorder code out of the hot path. We use it elsewhere (namely tree-checker.c) for error branches that almost never happen. Transaction abort is one such error: btrfs_abort_transaction() inlines code to check the state and print a warning, and this ought to be out of the hot path. The most common pattern is when transaction abort is called after checking a return value and the control flow leads to a quick return. In other cases it may not be necessary to add unlikely(), e.g. when the function returns anyway or the control flow is not changed noticeably. Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 10 ++-- fs/btrfs/ctree.c | 50 +++++++++---------- fs/btrfs/delayed-inode.c | 6 +-- fs/btrfs/extent-tree.c | 38 +++++++------- fs/btrfs/file-item.c | 2 +- fs/btrfs/file.c | 43 ++++++++-------- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/free-space-tree.c | 48 +++++++++--------- fs/btrfs/inode-item.c | 10 ++-- fs/btrfs/inode.c | 98 ++++++++++++++++++------------------- fs/btrfs/ioctl.c | 16 +++--- fs/btrfs/qgroup.c | 24 ++++----- fs/btrfs/raid-stripe-tree.c | 2 +- fs/btrfs/reflink.c | 8 +-- fs/btrfs/relocation.c | 16 +++--- fs/btrfs/root-tree.c | 8 +-- fs/btrfs/transaction.c | 28 +++++------ fs/btrfs/tree-log.c | 22 ++++----- fs/btrfs/verity.c | 4 +- fs/btrfs/volumes.c | 28 +++++------ 20 files changed, 231 insertions(+), 232 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6f7974060a1ab5..4330f5ba02dd7b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3248,7 +3248,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group, */ BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { /* * So theoretically we could recover from this, simply set the * super cache generation to 0 so we know to invalidate the @@ -3995,7 +3995,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans struct btrfs_space_info *sys_space_info; sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags); - if (!sys_space_info) { + if (unlikely(!sys_space_info)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -4009,17 +4009,17 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans } ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } - } else if (ret) { + } else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/ctree.c 
b/fs/btrfs/ctree.c index 71256ed909f06f..561658aca018b4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -293,11 +293,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_inc_ref(trans, root, cow, 1); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } else { ret = btrfs_inc_ref(trans, root, cow, 0); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } if (ret) { @@ -536,14 +536,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -556,7 +556,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, parent_start = buf->start; ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -567,7 +567,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -575,7 +575,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, WARN_ON(trans->transid != btrfs_header_generation(parent)); ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -586,14 +586,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(trans, parent); if (last_ref) { ret = btrfs_tree_mod_log_free_eb(buf); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } } ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, parent_start, last_ref); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -922,7 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } ret = btrfs_tree_mod_log_insert_root(root->node, child, true); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_tree_unlock(child); free_extent_buffer(child); btrfs_abort_transaction(trans, ret); @@ -944,7 +944,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1019,7 +1019,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right, 0, 1); free_extent_buffer_stale(right); right = NULL; - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1028,7 +1028,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1080,7 +1080,7 @@ static noinline int balance_level(struct btrfs_trans_handle 
*trans, ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1090,7 +1090,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1195,7 +1195,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_tree_unlock(left); free_extent_buffer(left); btrfs_abort_transaction(trans, ret); @@ -1255,7 +1255,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_tree_unlock(right); free_extent_buffer(right); btrfs_abort_transaction(trans, ret); @@ -2737,7 +2737,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, return ret; } ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2822,7 +2822,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, push_items); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2892,7 +2892,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_clear_buffer_dirty(trans, c); ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); - if (ret2 < 0) + if (unlikely(ret2 < 0)) btrfs_abort_transaction(trans, ret2); btrfs_tree_unlock(c); free_extent_buffer(c); @@ -2937,7 +2937,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, slot, nritems - slot); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2950,7 +2950,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, BTRFS_MOD_LOG_KEY_ADD); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3026,7 +3026,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ASSERT(btrfs_header_level(c) == level); ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); - if (ret) { + if (unlikely(ret)) { btrfs_tree_unlock(split); free_extent_buffer(split); btrfs_abort_transaction(trans, ret); @@ -4383,7 +4383,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (level) { ret = btrfs_tree_mod_log_insert_move(parent, slot, slot + 1, nritems - slot - 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -4396,7 +4396,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, BTRFS_MOD_LOG_KEY_REMOVE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 
81577a0c601f58..41e37f7f67cc01 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1038,7 +1038,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, * transaction, because we could leave the inode with the * improper counts behind. */ - if (ret != -ENOENT) + if (unlikely(ret != -ENOENT)) btrfs_abort_transaction(trans, ret); goto out; } @@ -1066,7 +1066,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto err_out; } @@ -1175,7 +1175,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6a258bd8693e02..dc4ca98c37800a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2157,7 +2157,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif ret = __btrfs_run_delayed_refs(trans, min_bytes); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2982,26 +2982,26 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, csum_root = btrfs_csum_root(trans->fs_info, bytenr); ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } } ret = btrfs_record_squota_delta(trans->fs_info, delta); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3115,7 +3115,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - if (!is_data && refs_to_drop != 1) { + if (unlikely(!is_data && refs_to_drop != 1)) { btrfs_crit(info, "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", node->bytenr, refs_to_drop); @@ -3172,7 +3172,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3221,7 +3221,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "umm, got %d back from search, was looking for %llu, slot %d", ret, bytenr, path->slots[0]); } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3298,7 +3298,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3363,7 +3363,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - 
if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -5472,17 +5472,17 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, if (!(wc->flags[level] & flag)) { ASSERT(path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_set_disk_extent_flags(trans, eb, flag); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -5885,7 +5885,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } } else { ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -6180,13 +6180,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc while (1) { ret = walk_down_tree(trans, root, path, wc); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); break; } @@ -6213,7 +6213,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -6249,7 +6249,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc goto out_end_trans; ret = btrfs_del_root(trans, &root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -6257,7 +6257,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } else if (ret > 0) { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7906aea75ee46e..a42e6d54e7cd74 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1011,7 +1011,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, * item changed size or key */ ret = btrfs_split_item(trans, root, path, &key, offset); - if (ret && ret != -EAGAIN) { + if (unlikely(ret && ret != -EAGAIN)) { btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e7765578349b6d..7efd1f8a19121f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -327,7 +327,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -426,7 +426,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -443,7 +443,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -587,21 +587,20 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if 
(key.objectid != ino || - key.type != BTRFS_EXTENT_DATA_KEY) { + if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) { + if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (key.offset > start || extent_end < end) { + if (unlikely(key.offset > start || extent_end < end)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -676,7 +675,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_release_path(path); goto again; } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -704,7 +703,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ref.ref_root = btrfs_root_id(root); btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -712,7 +711,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, if (split == start) { key.offset = start; } else { - if (start != key.offset) { + if (unlikely(start != key.offset)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -744,7 +743,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -762,7 +761,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -783,7 +782,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, extent_end - key.offset); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -2460,9 +2459,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, * got EOPNOTSUPP via prealloc then we messed up and * need to abort. 
*/ - if (ret && - (ret != -EOPNOTSUPP || - (extent_info && extent_info->is_new_extent))) + if (unlikely(ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent)))) btrfs_abort_transaction(trans, ret); break; } @@ -2473,7 +2472,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, cur_offset < ino_size) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); - if (ret) { + if (unlikely(ret)) { /* * If we failed then we didn't insert our hole * entries for the area we dropped, so now the @@ -2493,7 +2492,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); - if (ret) { + if (unlikely(ret)) { /* * We couldn't clear our area, so we could * presumably adjust up and corrupt the fs, so @@ -2512,7 +2511,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, replace_len, drop_args.bytes_found); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -2607,7 +2606,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, cur_offset < drop_args.drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); - if (ret) { + if (unlikely(ret)) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2616,7 +2615,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_trans; } @@ -2626,7 +2625,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len, drop_args.bytes_found); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_trans; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index c2730740d92821..ab873bd6719209 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -4142,7 +4142,7 @@ int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool act if (!active) { set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); ret = cleanup_free_space_cache_v1(fs_info, trans); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 2fee9f3b60d439..dad0b492a66351 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -218,7 +218,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (!bitmap) { + if (unlikely(!bitmap)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -233,7 +233,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -271,7 +271,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -320,7 +320,7 @@ int 
btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, data_size); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -361,7 +361,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (!bitmap) { + if (unlikely(!bitmap)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -376,7 +376,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -420,7 +420,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -454,7 +454,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, key.offset = (end_bit - start_bit) * fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -848,14 +848,14 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, return 0; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); - if (!block_group) { + if (unlikely(!block_group)) { DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); @@ -1030,14 +1030,14 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, return 0; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); - if (!block_group) { + if (unlikely(!block_group)) { DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); @@ -1185,7 +1185,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) goto out_clear; } ret = btrfs_global_root_insert(free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_put_root(free_space_root); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -1197,7 +1197,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out_clear; @@ -1290,14 +1290,14 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } ret = btrfs_del_root(trans, &free_space_root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1315,7 +1315,7 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); 
btrfs_put_root(free_space_root); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1344,7 +1344,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1362,7 +1362,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) goto next; ret = populate_free_space_tree(trans, block_group); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1422,7 +1422,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, if (!path) { path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; } @@ -1430,7 +1430,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, } ret = add_new_free_space_info(trans, block_group, path); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1481,7 +1481,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -1496,7 +1496,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1527,7 +1527,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index f06cf701ae5ae0..1bd73b80f9fac8 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -137,7 +137,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, name); - if (!extref) { + if (unlikely(!extref)) { btrfs_abort_transaction(trans, -ENOENT); return -ENOENT; } @@ -627,7 +627,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (control->clear_extent_range) { ret = btrfs_inode_clear_file_extent_range(control->inode, clear_start, clear_len); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -666,7 +666,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, control->ino, extent_offset, btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -684,7 +684,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -720,7 +720,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int ret2; ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - if (ret2) { + if (unlikely(ret2)) { btrfs_abort_transaction(trans, ret2); ret = ret2; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6403312505c4d1..02cb081697fea4 
100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -634,7 +634,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, drop_args.replace_extent = true; drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -642,7 +642,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, compressed_folio, update_i_size); - if (ret && ret != -ENOSPC) { + if (unlikely(ret && ret != -ENOSPC)) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { @@ -652,7 +652,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); - if (ret && ret != -ENOSPC) { + if (unlikely(ret && ret != -ENOSPC)) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { @@ -3150,7 +3150,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3158,7 +3158,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { /* Logic error */ ASSERT(list_empty(&ordered_extent->list)); - if (!list_empty(&ordered_extent->list)) { + if (unlikely(!list_empty(&ordered_extent->list))) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -3166,7 +3166,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); - if (ret) { + if (unlikely(ret)) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); } @@ -3193,20 +3193,20 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } ret = add_pending_csums(trans, &ordered_extent->list); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3224,7 +3224,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); - if (ret) { /* -ENOMEM or corruption */ + if (unlikely(ret)) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } @@ -3539,7 +3539,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, int ret; ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); - if (ret && ret != -EEXIST) { + if (unlikely(ret && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -4288,7 +4288,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, } ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); - if (ret) { + if (unlikely(ret)) { btrfs_crit(fs_info, "failed to delete reference to %.*s, root %llu inode %llu parent %llu", name->len, 
name->name, btrfs_root_id(root), ino, dir_ino); @@ -4300,7 +4300,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, rename_ctx->index = index; ret = btrfs_delete_delayed_dir_index(trans, dir, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -4455,7 +4455,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4486,14 +4486,14 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_del_root_ref(trans, objectid, btrfs_root_id(root), dir_ino, &index, &fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_delete_delayed_dir_index(trans, dir, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4665,13 +4665,13 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, dir); ret = btrfs_unlink_subvol(trans, dir, dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } ret = btrfs_record_root_in_trans(trans, dest); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4685,7 +4685,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, btrfs_root_id(dest)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4693,7 +4693,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4702,7 +4702,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(dest)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -5106,7 +5106,7 @@ static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -6559,7 +6559,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); batch.nr = args->orphan ? 
1 : 2; ret = btrfs_insert_empty_items(trans, root, path, &batch); - if (ret != 0) { + if (unlikely(ret != 0)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6636,7 +6636,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, */ if (!args->subvol) { ret = btrfs_init_inode_security(trans, args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6656,14 +6656,14 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (args->orphan) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 0, BTRFS_I(inode)->dir_index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6727,7 +6727,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; - else if (ret) { + else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -6883,7 +6883,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, /* Link added now we update the inode item with the new link count. */ inc_nlink(inode); ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -6894,7 +6894,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, * open(2) O_TMPFILE flag. */ ret = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -8208,7 +8208,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { - if (need_abort) + if (unlikely(need_abort)) btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8256,7 +8256,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8264,12 +8264,12 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8278,7 +8278,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8286,12 +8286,12 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8299,14 +8299,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_name, 0, old_idx); - if (ret) { + if (unlikely(ret)) { 
btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), old_name, 0, new_idx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8547,7 +8547,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8555,12 +8555,12 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8571,7 +8571,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8580,7 +8580,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8588,7 +8588,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8597,7 +8597,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), &new_fname.disk_name, 0, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8611,7 +8611,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (flags & RENAME_WHITEOUT) { ret = btrfs_create_new_inode(trans, &whiteout_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } else { @@ -8905,7 +8905,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); discard_new_inode(inode); @@ -8917,7 +8917,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_free_path(path); discard_new_inode(inode); @@ -9130,7 +9130,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7f347baa13df1c..a454b5ba209750 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -376,13 +376,13 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (comp) { ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { ret = 
btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); - if (ret && ret != -ENODATA) { + if (unlikely(ret && ret != -ENODATA)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -633,7 +633,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1); - if (ret2 < 0) + if (unlikely(ret2 < 0)) btrfs_abort_transaction(trans, ret2); free_extent_buffer(leaf); goto out; @@ -654,14 +654,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap, /* ... and new_root is owned by new_inode_args.inode now. */ ret = btrfs_record_root_in_trans(trans, new_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -669,7 +669,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_record_new_subvolume(trans, BTRFS_I(dir)); ret = btrfs_create_new_inode(trans, &new_inode_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4008,7 +4008,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; @@ -4032,7 +4032,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); - if (ret < 0 && ret != -EEXIST) { + if (unlikely(ret < 0 && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4c6097b7e56afb..1175b8192cd7de 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1069,7 +1069,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, } path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_root; @@ -1081,7 +1081,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*ptr)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1111,7 +1111,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); if (ret > 0) goto out_add_root; - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1129,7 +1129,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, /* We should not have a stray @prealloc pointer. 
*/ ASSERT(prealloc == NULL); prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); - if (!prealloc) { + if (unlikely(!prealloc)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_path; @@ -1137,7 +1137,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, ret = add_qgroup_item(trans, quota_root, found_key.offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1145,13 +1145,13 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } ret = btrfs_search_slot_for_read(tree_root, &found_key, path, 1, 0); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1165,7 +1165,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, } } ret = btrfs_next_item(tree_root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1176,7 +1176,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, out_add_root: btrfs_release_path(path); ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1190,7 +1190,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1376,13 +1376,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_free_qgroup_config(fs_info); ret = btrfs_clean_quota_tree(trans, quota_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_root(trans, &quota_root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index a69eaf0cef47d9..cc6f6095cc9fd0 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -304,7 +304,7 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, int ret; stripe_extent = kzalloc(item_size, GFP_NOFS); - if (!stripe_extent) { + if (unlikely(!stripe_extent)) { btrfs_abort_transaction(trans, -ENOMEM); btrfs_end_transaction(trans); return -ENOMEM; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f78ac494197f0e..5465a5eae9b2d1 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -43,7 +43,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, } ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -268,12 +268,12 @@ static int clone_copy_inline_extent(struct btrfs_inode *inode, drop_args.end = aligned_end; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_insert_empty_item(trans, root, path, new_key, size); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -285,7 +285,7 @@ static int clone_copy_inline_extent(struct btrfs_inode *inode, btrfs_update_inode_bytes(inode, datal,
drop_args.bytes_found); btrfs_set_inode_full_sync(inode); ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); out: if (!ret && !trans) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7027572cb6c16a..8dd8de6b9fb89e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -967,7 +967,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -981,7 +981,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1192,7 +1192,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, ref.ref_root = btrfs_root_id(src); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1205,7 +1205,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, ref.ref_root = btrfs_root_id(dest); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1219,7 +1219,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, ref.ref_root = btrfs_root_id(src); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1233,7 +1233,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, ref.ref_root = btrfs_root_id(dest); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1784,7 +1784,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); if (!err) err = ret; @@ -2325,7 +2325,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!ret) ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } next: diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 8d4dd48beb3a30..d07eab70f759d9 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -168,20 +168,20 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_release_path(path); ret = btrfs_search_slot(trans, root, key, path, -1, 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_del_item(trans, root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -421,7 +421,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name->len); 
- if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 12558404f92767..23bcd3c0fc1c28 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1569,7 +1569,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * qgroup counters could end up wrong. */ ret = btrfs_run_delayed_refs(trans, U64_MAX); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -1710,7 +1710,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert the directory item */ ret = btrfs_set_inode_index(parent_inode, &index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1731,7 +1731,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_create_qgroup(trans, objectid); if (ret && ret != -EEXIST) { - if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) { + if (unlikely(ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info))) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1744,13 +1744,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * snapshot */ ret = btrfs_run_delayed_items(trans); - if (ret) { /* Transaction aborted */ + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } ret = record_root_in_trans(trans, root, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1785,7 +1785,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, BTRFS_NESTING_COW); - if (ret) { + if (unlikely(ret)) { btrfs_tree_unlock(old); free_extent_buffer(old); btrfs_abort_transaction(trans, ret); @@ -1796,7 +1796,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1812,7 +1812,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1824,7 +1824,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_root_id(parent_root), btrfs_ino(parent_inode), index, &fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1839,7 +1839,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } ret = btrfs_reloc_post_snapshot(trans, pending); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1862,7 +1862,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, &fname.disk_name, parent_inode, &key, BTRFS_FT_DIR, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1872,14 +1872,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, inode_set_mtime_to_ts(&parent_inode->vfs_inode, inode_set_ctime_current(&parent_inode->vfs_inode)); ret = btrfs_update_inode_fallback(trans, parent_inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } ret = 
btrfs_uuid_tree_add(trans, new_root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1887,7 +1887,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); - if (ret && ret != -EEXIST) { + if (unlikely(ret && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); goto fail; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b34ae17c5d70f0..6aad6b65522b21 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -439,7 +439,7 @@ static int process_one_buffer(struct extent_buffer *eb, }; ret = btrfs_read_extent_buffer(eb, &check); - if (ret) { + if (unlikely(ret)) { if (trans) btrfs_abort_transaction(trans, ret); else @@ -451,7 +451,7 @@ static int process_one_buffer(struct extent_buffer *eb, if (wc->pin) { ASSERT(trans != NULL); ret = btrfs_pin_extent_for_log_replay(trans, eb); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3570,7 +3570,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); ret = write_all_supers(fs_info, 1); mutex_unlock(&fs_info->tree_log_mutex); - if (ret) { + if (unlikely(ret)) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); goto out_wake_log_root; @@ -7630,7 +7630,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(&wc); wc.log = NULL; - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7645,7 +7645,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7672,7 +7672,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) if (IS_ERR(wc.root)) { ret = PTR_ERR(wc.root); wc.root = NULL; - if (ret != -ENOENT) { + if (unlikely(ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7689,7 +7689,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) * each subsequent pass. */ ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7698,13 +7698,13 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) wc.root->log_root = wc.log; ret = btrfs_record_root_in_trans(trans, wc.root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } ret = walk_log_tree(&wc); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } @@ -7715,7 +7715,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) wc.subvol_path = path; ret = fixup_inode_link_counts(&wc); wc.subvol_path = NULL; - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } @@ -7728,7 +7728,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) * could only happen during mount. 
*/ ret = btrfs_init_root_free_objectid(root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 0bd7ea2c58ec6a..df5bff34b1590b 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -487,12 +487,12 @@ static int rollback_verity(struct btrfs_inode *inode) inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = del_orphan(trans, inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 897294519a99c0..2bec544d8ba300 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2243,7 +2243,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } ret = btrfs_rm_dev_item(trans, device); - if (ret) { + if (unlikely(ret)) { /* Any error in dev item removal is critical */ btrfs_crit(fs_info, "failed to remove device item for devid %llu: %d", @@ -2843,21 +2843,21 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_lock(&fs_info->chunk_mutex); ret = init_first_rw_device(trans); mutex_unlock(&fs_info->chunk_mutex); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } } ret = btrfs_add_dev_item(trans, device); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } if (seeding_dev) { ret = btrfs_finish_sprout(trans); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } @@ -3058,7 +3058,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } ret = btrfs_del_item(trans, root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); btrfs_abort_transaction(trans, ret); goto out; @@ -3283,7 +3283,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); - if (ret) { + if (unlikely(ret)) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, ret); goto out; @@ -3353,7 +3353,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) struct btrfs_space_info *space_info; space_info = btrfs_find_space_info(fs_info, sys_flags); - if (!space_info) { + if (unlikely(!space_info)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -3367,17 +3367,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = remove_chunk_item(trans, map, chunk_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } - } else if (ret) { + } else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3386,7 +3386,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(fs_info, chunk_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3402,7 +3402,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) btrfs_trans_release_chunk_metadata(trans); ret = btrfs_remove_block_group(trans, map); - if (ret) { 
+ if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -5041,7 +5041,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); btrfs_trans_release_chunk_metadata(trans); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); } else { @@ -5701,7 +5701,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, item_size = btrfs_chunk_item_size(map->num_stripes); chunk = kzalloc(item_size, GFP_NOFS); - if (!chunk) { + if (unlikely(!chunk)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; From 45c222468d33202c07c41c113301a4b9c8451b8f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 22 Sep 2025 12:09:14 +0100 Subject: [PATCH 2609/3206] btrfs: use smp_mb__after_atomic() when forcing COW in create_pending_snapshot() After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a full write barrier, smp_wmb(), but we don't need to; all we need is smp_mb__after_atomic(). The use of the smp_wmb() is from the old days when we didn't use a bit and used instead an int field in the root to signal if cow is forced. After the int field was changed to a bit in the root's state (flags field), we forgot to update the memory barrier in create_pending_snapshot() to smp_mb__after_atomic(), but we did the change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer data type for the some variants in btrfs_root"). On the reader side, in should_cow_block(), we also use the counterpart smp_mb__before_atomic() which generates further confusion. So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't even need any barrier at all since create_pending_snapshot() is called in the critical section of a transaction commit and therefore no one can concurrently join/attach the transaction, or start a new one, until the transaction is unblocked. By the time someone starts a new transaction and enters should_cow_block(), a lot of implicit memory barriers already took place by having acquired several locks such as fs_info->trans_lock and extent buffer locks on the root node at least. Nevertheless, for consistency use smp_mb__after_atomic() after setting the force cow bit in create_pending_snapshot(). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 23bcd3c0fc1c28..89ae0c7a610aa5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1802,7 +1802,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_wmb(); + smp_mb__after_atomic(); btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ From c1e7254935c0414e7a31737bd357872d1b0f346b Mon Sep 17 00:00:00 2001 From: Taotao Chen Date: Fri, 22 Aug 2025 03:06:59 +0000 Subject: [PATCH 2610/3206] drm/i915: set O_LARGEFILE in __create_shmem() Without O_LARGEFILE, file->f_op->write_iter calls generic_write_check_limits(), which enforces a 2GB (MAX_NON_LFS) limit, causing -EFBIG on large writes.
In shmem_pwrite(), this error is later masked as -EIO due to the error handling order, leading to igt failures like gen9_exec_parse(bb-large). Set O_LARGEFILE in __create_shmem() to prevent -EFBIG on large writes. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202508081029.343192ec-lkp@intel.com Fixes: 048832a3f400 ("drm/i915: Refactor shmem_pwrite() to use kiocb and write_iter") Signed-off-by: Taotao Chen Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250822030651.28099-1-chentaotao@didiglobal.com (cherry picked from commit e296a2266c572a7537e638b0dbbfc66d11df46f9) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index e3d188455f6754..b9dae15c1d1667 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -514,6 +514,13 @@ static int __create_shmem(struct drm_i915_private *i915, if (IS_ERR(filp)) return PTR_ERR(filp); + /* + * Prevent -EFBIG by allowing large writes beyond MAX_NON_LFS on shmem + * objects by setting O_LARGEFILE. + */ + if (force_o_largefile()) + filp->f_flags |= O_LARGEFILE; + obj->filp = filp; return 0; } From 7f97a0a871d9532f2e1a5ee7d16d0e364215bcac Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Mon, 8 Sep 2025 09:52:08 +0530 Subject: [PATCH 2611/3206] drm/i915/ddi: Guard reg_val against an INVALID_TRANSCODER Currently we check if the master transcoder is INVALID (-1) and throw a WARN_ON, but we still end up writing the temp value, which will overflow and corrupt the whole programmed value. --v2 -Assign a bogus transcoder to master in case we get an INVALID_TRANSCODER [Jani] Fixes: 6671c367a9bea ("drm/i915/tgl: Select master transcoder for MST stream") Signed-off-by: Suraj Kandpal Reviewed-by: Jani Nikula Link: https://lore.kernel.org/r/20250908042208.1011144-1-suraj.kandpal@intel.com (cherry picked from commit c8e8e9ab14a6ea926641d161768e1e3ef286a853) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_ddi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 0405396c7750ea..9ecbb4b99c3786 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -596,8 +596,9 @@ intel_ddi_transcoder_func_reg_val_get(struct intel_encoder *encoder, enum transcoder master; master = crtc_state->mst_master_transcoder; - drm_WARN_ON(display->drm, - master == INVALID_TRANSCODER); + if (drm_WARN_ON(display->drm, + master == INVALID_TRANSCODER)) + master = TRANSCODER_A; temp |= TRANS_DDI_MST_TRANSPORT_SELECT(master); } } else { From 334c0e493c2aa3e843a80bb9f3862bb50360cb36 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 16 Sep 2025 16:48:51 +0800 Subject: [PATCH 2612/3206] erofs: avoid reading more for fragment maps All real encoded extents (directly handled by the decompression subsystem) have a sane, limited maximum decoded length (Z_EROFS_PCLUSTER_MAX_DSIZE), and the read-more policy is only applied if needed. However, it makes no sense to read more for non-encoded maps, such as fragment extents, since such extents can be huge (up to i_size) and there is no benefit to reading more at this layer.
For normal images, it does not really matter, but for crafted images generated by syzbot, excessively large fragment extents can cause read-more to run for an overly long time. Reported-and-tested-by: syzbot+1a9af3ef3c84c5e14dcc@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/68c8583d.050a0220.2ff435.03a3.GAE@google.com Fixes: b44686c8391b ("erofs: fix large fragment handling") Fixes: b15b2e307c3a ("erofs: support on-disk compressed fragments data") Reviewed-by: Hongbo Li Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2d73297003d25a..625b8ae8f67f09 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1835,7 +1835,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); - if (err) + if (err || !(map->m_flags & EROFS_MAP_ENCODED)) return; /* expand ra for the trailing edge if readahead */ @@ -1847,7 +1847,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, end = round_up(end, PAGE_SIZE); } else { end = round_up(map->m_la, PAGE_SIZE); - if (!map->m_llen) + if (!(map->m_flags & EROFS_MAP_ENCODED) || !map->m_llen) return; } From d6bfdeb4fde74c9a0e227cc1af9dda69c57489f4 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 2 Jun 2025 17:18:45 +0200 Subject: [PATCH 2613/3206] clocksource/drivers/scx200: Add module owner The conversion to modules requires correct handling of the module refcount in order to prevent the module from being unloaded while it is in use. That is especially true with the clockevents, where there is no function to unregister them. The core time framework correctly handles the module refcount for the different clocksources and clockevents if the module owner is set. Add the module owner to make sure the core framework will prevent stupid things from happening when the driver is converted into a module. Signed-off-by: Daniel Lezcano Reviewed-by: Will McVicker Link: https://lore.kernel.org/r/20250602151853.1942521-2-daniel.lezcano@linaro.org --- drivers/clocksource/scx200_hrt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clocksource/scx200_hrt.c b/drivers/clocksource/scx200_hrt.c index c3536fffbe9a0f..5a99801a165717 100644 --- a/drivers/clocksource/scx200_hrt.c +++ b/drivers/clocksource/scx200_hrt.c @@ -52,6 +52,7 @@ static struct clocksource cs_hrt = { .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, /* mult, shift are set based on mhz27 flag */ + .owner = THIS_MODULE, }; static int __init init_hrt_clocksource(void) From 4e65bda8273c938039403144730923e77916a3d7 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Tue, 23 Sep 2025 14:52:12 +0800 Subject: [PATCH 2614/3206] ASoC: wcd934x: fix error handling in wcd934x_codec_parse_data() wcd934x_codec_parse_data() contains a device reference count leak in of_slim_get_device() where device_find_child() increases the reference count of the device but this reference is not properly decreased in the success path. Add put_device() in wcd934x_codec_parse_data() and add devm_add_action_or_reset() in the probe function, which ensures that the reference count of the device is correctly managed. There is also a memory leak in regmap_init_slimbus(), as the allocated regmap is not released when the device is removed. Use devm_regmap_init_slimbus() instead of regmap_init_slimbus() to ensure automatic regmap cleanup on device removal.
Calling path: of_slim_get_device() -> of_find_slim_device() -> device_find_child(). As the comment of device_find_child() says, 'NOTE: you will need to drop the reference with put_device() after use.'. Found by code review. Cc: stable@vger.kernel.org Fixes: a61f3b4f476e ("ASoC: wcd934x: add support to wcd9340/wcd9341 codec") Signed-off-by: Ma Ke Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250923065212.26660-1-make24@iscas.ac.cn Signed-off-by: Mark Brown --- sound/soc/codecs/wcd934x.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c index 1bb7e1dc7e6b0a..e92939068bf75e 100644 --- a/sound/soc/codecs/wcd934x.c +++ b/sound/soc/codecs/wcd934x.c @@ -5831,6 +5831,13 @@ static const struct snd_soc_component_driver wcd934x_component_drv = { .endianness = 1, }; +static void wcd934x_put_device_action(void *data) +{ + struct device *dev = data; + + put_device(dev); +} + static int wcd934x_codec_parse_data(struct wcd934x_codec *wcd) { struct device *dev = &wcd->sdev->dev; @@ -5847,11 +5854,13 @@ static int wcd934x_codec_parse_data(struct wcd934x_codec *wcd) return dev_err_probe(dev, -EINVAL, "Unable to get SLIM Interface device\n"); slim_get_logical_addr(wcd->sidev); - wcd->if_regmap = regmap_init_slimbus(wcd->sidev, + wcd->if_regmap = devm_regmap_init_slimbus(wcd->sidev, &wcd934x_ifc_regmap_config); - if (IS_ERR(wcd->if_regmap)) + if (IS_ERR(wcd->if_regmap)) { + put_device(&wcd->sidev->dev); return dev_err_probe(dev, PTR_ERR(wcd->if_regmap), "Failed to allocate ifc register map\n"); + } of_property_read_u32(dev->parent->of_node, "qcom,dmic-sample-rate", &wcd->dmic_sample_rate); @@ -5893,6 +5902,10 @@ static int wcd934x_codec_probe(struct platform_device *pdev) if (ret) return ret; + ret = devm_add_action_or_reset(dev, wcd934x_put_device_action, &wcd->sidev->dev); + if (ret) + return ret; + /* set default rate 9P6MHz */ regmap_update_bits(wcd->regmap, WCD934X_CODEC_RPM_CLK_MCLK_CFG, WCD934X_CODEC_RPM_CLK_MCLK_CFG_MCLK_MASK, From 2cf51ab7f5c52f11dc12a11055f4b628121802d9 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 2 Jun 2025 17:18:46 +0200 Subject: [PATCH 2615/3206] clocksource/drivers/stm32-lp: Add module owner The conversion to modules requires correct handling of the module refcount in order to prevent the module from being unloaded while it is in use. That is especially true with the clockevents, where there is no function to unregister them. The core time framework correctly handles the module refcount for the different clocksources and clockevents if the module owner is set. Add the module owner to make sure the core framework will prevent stupid things from happening when the driver is converted into a module.
Signed-off-by: Daniel Lezcano Reviewed-by: Will McVicker Link: https://lore.kernel.org/r/20250602151853.1942521-3-daniel.lezcano@linaro.org --- drivers/clocksource/timer-stm32-lp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clocksource/timer-stm32-lp.c b/drivers/clocksource/timer-stm32-lp.c index 6e7944ffd7c032..c2a699f5c1dd72 100644 --- a/drivers/clocksource/timer-stm32-lp.c +++ b/drivers/clocksource/timer-stm32-lp.c @@ -211,6 +211,7 @@ static void stm32_clkevent_lp_init(struct stm32_lp_private *priv, priv->clkevt.rating = STM32_LP_RATING; priv->clkevt.suspend = stm32_clkevent_lp_suspend; priv->clkevt.resume = stm32_clkevent_lp_resume; + priv->clkevt.owner = THIS_MODULE; clockevents_config_and_register(&priv->clkevt, rate, 0x1, STM32_LPTIM_MAX_ARR); From 376d11d32718c4d965714278ac6e6605d944fca2 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 2 Jun 2025 17:18:47 +0200 Subject: [PATCH 2616/3206] clocksource/drivers/sun5i: Add module owner The conversion to modules requires correct handling of the module refcount in order to prevent the module from being unloaded while it is in use. That is especially true with the clockevents, where there is no function to unregister them. The core time framework correctly handles the module refcount for the different clocksources and clockevents if the module owner is set. Add the module owner to make sure the core framework will prevent stupid things from happening when the driver is converted into a module. Signed-off-by: Daniel Lezcano Reviewed-by: Will McVicker Acked-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20250602151853.1942521-4-daniel.lezcano@linaro.org --- drivers/clocksource/timer-sun5i.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c index 6b48a9006444dd..f827d3f98f60e6 100644 --- a/drivers/clocksource/timer-sun5i.c +++ b/drivers/clocksource/timer-sun5i.c @@ -185,6 +185,7 @@ static int sun5i_setup_clocksource(struct platform_device *pdev, cs->clksrc.read = sun5i_clksrc_read; cs->clksrc.mask = CLOCKSOURCE_MASK(32); cs->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS; + cs->clksrc.owner = THIS_MODULE; ret = clocksource_register_hz(&cs->clksrc, rate); if (ret) { @@ -214,6 +215,7 @@ static int sun5i_setup_clockevent(struct platform_device *pdev, ce->clkevt.rating = 340; ce->clkevt.irq = irq; ce->clkevt.cpumask = cpu_possible_mask; + ce->clkevt.owner = THIS_MODULE; /* Enable timer0 interrupt */ val = readl(base + TIMER_IRQ_EN_REG); From afe904f5091e2ceaa95b451f61a115a0224e8e38 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 2 Jun 2025 17:18:48 +0200 Subject: [PATCH 2617/3206] clocksource/drivers/tegra186: Add module owner The conversion to modules requires correct handling of the module refcount in order to prevent the module from being unloaded while it is in use. That is especially true with the clockevents, where there is no function to unregister them. The core time framework correctly handles the module refcount for the different clocksources and clockevents if the module owner is set. Add the module owner to make sure the core framework will prevent stupid things from happening when the driver is converted into a module.
Signed-off-by: Daniel Lezcano Reviewed-by: Will McVicker Link: https://lore.kernel.org/r/20250602151853.1942521-5-daniel.lezcano@linaro.org --- drivers/clocksource/timer-tegra186.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/clocksource/timer-tegra186.c b/drivers/clocksource/timer-tegra186.c index e5394f98a02e66..56a5342bcf7825 100644 --- a/drivers/clocksource/timer-tegra186.c +++ b/drivers/clocksource/timer-tegra186.c @@ -373,6 +373,7 @@ static int tegra186_timer_tsc_init(struct tegra186_timer *tegra) tegra->tsc.read = tegra186_timer_tsc_read; tegra->tsc.mask = CLOCKSOURCE_MASK(56); tegra->tsc.flags = CLOCK_SOURCE_IS_CONTINUOUS; + tegra->tsc.owner = THIS_MODULE; return clocksource_register_hz(&tegra->tsc, 31250000); } @@ -392,6 +393,7 @@ static int tegra186_timer_osc_init(struct tegra186_timer *tegra) tegra->osc.read = tegra186_timer_osc_read; tegra->osc.mask = CLOCKSOURCE_MASK(32); tegra->osc.flags = CLOCK_SOURCE_IS_CONTINUOUS; + tegra->osc.owner = THIS_MODULE; return clocksource_register_hz(&tegra->osc, 38400000); } @@ -411,6 +413,7 @@ static int tegra186_timer_usec_init(struct tegra186_timer *tegra) tegra->usec.read = tegra186_timer_usec_read; tegra->usec.mask = CLOCKSOURCE_MASK(32); tegra->usec.flags = CLOCK_SOURCE_IS_CONTINUOUS; + tegra->usec.owner = THIS_MODULE; return clocksource_register_hz(&tegra->usec, USEC_PER_SEC); } From edef59887b5c5a527ad4b287a51f8f9fc239d867 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 2 Jun 2025 17:18:49 +0200 Subject: [PATCH 2618/3206] clocksource/drivers/stm: Add module owner The conversion to modules requires correct handling of the module refcount in order to prevent the module from being unloaded while it is in use. That is especially true with the clockevents, where there is no function to unregister them. The core time framework correctly handles the module refcount for the different clocksources and clockevents if the module owner is set. Add the module owner to make sure the core framework will prevent stupid things from happening when the driver is converted into a module. Signed-off-by: Daniel Lezcano Reviewed-by: Will McVicker Link: https://lore.kernel.org/r/20250602151853.1942521-6-daniel.lezcano@linaro.org --- drivers/clocksource/timer-nxp-stm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/clocksource/timer-nxp-stm.c b/drivers/clocksource/timer-nxp-stm.c index d7ccf900172983..bbc40623728fa5 100644 --- a/drivers/clocksource/timer-nxp-stm.c +++ b/drivers/clocksource/timer-nxp-stm.c @@ -201,6 +201,7 @@ static int __init nxp_stm_clocksource_init(struct device *dev, struct stm_timer stm_timer->cs.resume = nxp_stm_clocksource_resume; stm_timer->cs.mask = CLOCKSOURCE_MASK(32); stm_timer->cs.flags = CLOCK_SOURCE_IS_CONTINUOUS; + stm_timer->cs.owner = THIS_MODULE; ret = clocksource_register_hz(&stm_timer->cs, stm_timer->rate); if (ret) @@ -314,6 +315,7 @@ static int __init nxp_stm_clockevent_per_cpu_init(struct device *dev, struct stm stm_timer->ced.cpumask = cpumask_of(cpu); stm_timer->ced.rating = 460; stm_timer->ced.irq = irq; + stm_timer->ced.owner = THIS_MODULE; per_cpu(stm_timers, cpu) = stm_timer; From eea65574e259f812e84080d56eca51f1a1889f8c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 2 Jun 2025 17:18:50 +0200 Subject: [PATCH 2619/3206] clocksource/drivers/cs5535: Add module owner The conversion to modules requires correct handling of the module refcount in order to prevent the module from being unloaded while it is in use. That is especially true with the clockevents, where there is no function to unregister them.
The core time framework correctly handles the module refcount for the different clocksources and clockevents if the module owner is set. Add the module owner to make sure the core framework will prevent stupid things from happening when the driver is converted into a module. Signed-off-by: Daniel Lezcano Reviewed-by: Will McVicker Link: https://lore.kernel.org/r/20250602151853.1942521-7-daniel.lezcano@linaro.org --- drivers/clocksource/timer-cs5535.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clocksource/timer-cs5535.c b/drivers/clocksource/timer-cs5535.c index d47acfe848ae45..8af666c398900b 100644 --- a/drivers/clocksource/timer-cs5535.c +++ b/drivers/clocksource/timer-cs5535.c @@ -101,6 +101,7 @@ static struct clock_event_device cs5535_clockevent = { .tick_resume = mfgpt_shutdown, .set_next_event = mfgpt_next_event, .rating = 250, + .owner = THIS_MODULE, }; static irqreturn_t mfgpt_tick(int irq, void *dev_id) From 84b1a903aed876a3fd6bb04786947f640d4d8e62 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 2 Jun 2025 17:18:51 +0200 Subject: [PATCH 2620/3206] time/sched_clock: Export symbol for sched_clock register function The timer drivers could be converted into modules. The different functions to register the clocksource or the clockevent are already exporting their symbols for modules, but the sched_clock_register() function is missing. Export the symbol so the drivers using this function can be converted into modules. Signed-off-by: Daniel Lezcano Reviewed-by: Thomas Gleixner Reviewed-by: Carlos Llamas Reviewed-by: Will McVicker Acked-by: John Stultz Link: https://lore.kernel.org/r/20250602151853.1942521-8-daniel.lezcano@linaro.org --- kernel/time/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cc15fe293719f5..cc1afec306b3f0 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -174,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) return HRTIMER_RESTART; } -void __init -sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -247,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pS as sched_clock source\n", read); } +EXPORT_SYMBOL_GPL(sched_clock_register); void __init generic_sched_clock_init(void) { From ef0e000cd162a58dba10608ef6959f172d1bc5f4 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 11 Jun 2025 18:26:20 -0500 Subject: [PATCH 2621/3206] dt-bindings: timer: Convert faraday,fttmr010 to DT schema Convert the Faraday fttmr010 Timer binding to DT schema format. Adjust the compatible string values to match what's in use. The number of interrupts can also be anywhere from 1 to 8. The clock-names order was reversed compared to what's used.
Signed-off-by: Rob Herring (Arm) Signed-off-by: Daniel Lezcano Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250611232621.1508116-1-robh@kernel.org --- .../bindings/timer/faraday,fttmr010.txt | 38 -------- .../bindings/timer/faraday,fttmr010.yaml | 89 +++++++++++++++++++ 2 files changed, 89 insertions(+), 38 deletions(-) delete mode 100644 Documentation/devicetree/bindings/timer/faraday,fttmr010.txt create mode 100644 Documentation/devicetree/bindings/timer/faraday,fttmr010.yaml diff --git a/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt b/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt deleted file mode 100644 index 3cb2f4c98d6436..00000000000000 --- a/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt +++ /dev/null @@ -1,38 +0,0 @@ -Faraday Technology timer - -This timer is a generic IP block from Faraday Technology, embedded in the -Cortina Systems Gemini SoCs and other designs. - -Required properties: - -- compatible : Must be one of - "faraday,fttmr010" - "cortina,gemini-timer", "faraday,fttmr010" - "moxa,moxart-timer", "faraday,fttmr010" - "aspeed,ast2400-timer" - "aspeed,ast2500-timer" - "aspeed,ast2600-timer" - -- reg : Should contain registers location and length -- interrupts : Should contain the three timer interrupts usually with - flags for falling edge - -Optionally required properties: - -- clocks : a clock to provide the tick rate for "faraday,fttmr010" -- clock-names : should be "EXTCLK" and "PCLK" for the external tick timer - and peripheral clock respectively, for "faraday,fttmr010" -- syscon : a phandle to the global Gemini system controller if the compatible - type is "cortina,gemini-timer" - -Example: - -timer@43000000 { - compatible = "faraday,fttmr010"; - reg = <0x43000000 0x1000>; - interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */ - <15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */ - <16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */ - clocks = <&extclk>, <&pclk>; - clock-names = "EXTCLK", "PCLK"; -}; diff --git a/Documentation/devicetree/bindings/timer/faraday,fttmr010.yaml b/Documentation/devicetree/bindings/timer/faraday,fttmr010.yaml new file mode 100644 index 00000000000000..39506323556c57 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/faraday,fttmr010.yaml @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/faraday,fttmr010.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Faraday FTTMR010 timer + +maintainers: + - Joel Stanley + - Linus Walleij + +description: + This timer is a generic IP block from Faraday Technology, embedded in the + Cortina Systems Gemini SoCs and other designs. 
+ +properties: + compatible: + oneOf: + - items: + - const: moxa,moxart-timer + - const: faraday,fttmr010 + - enum: + - aspeed,ast2400-timer + - aspeed,ast2500-timer + - aspeed,ast2600-timer + - cortina,gemini-timer + - faraday,fttmr010 + + reg: + maxItems: 1 + + interrupts: + minItems: 1 + maxItems: 8 + description: One interrupt per timer + + clocks: + minItems: 1 + items: + - description: Peripheral clock + - description: External tick clock + + clock-names: + minItems: 1 + items: + - const: PCLK + - const: EXTCLK + + resets: + maxItems: 1 + + syscon: + description: System controller phandle for Gemini systems + $ref: /schemas/types.yaml#/definitions/phandle + +required: + - compatible + - reg + - interrupts + +allOf: + - if: + properties: + compatible: + contains: + const: cortina,gemini-timer + then: + required: + - syscon + else: + properties: + syscon: false + +additionalProperties: false + +examples: + - | + #include + + timer@43000000 { + compatible = "faraday,fttmr010"; + reg = <0x43000000 0x1000>; + interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */ + <15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */ + <16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */ + clocks = <&pclk>, <&extclk>; + clock-names = "PCLK", "EXTCLK"; + }; From bb7bf8b44de1b13b8b7b10ca166b545ff22edac9 Mon Sep 17 00:00:00 2001 From: Max Shevchenko Date: Wed, 2 Jul 2025 13:50:40 +0300 Subject: [PATCH 2622/3206] dt-bindings: timer: mediatek: add MT6572 Add a compatible string for the timer on the MT6572 SoC. Signed-off-by: Max Shevchenko Signed-off-by: Daniel Lezcano Reviewed-by: AngeloGioacchino Del Regno Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250702-mt6572-v4-3-bde75b7ed445@proton.me --- Documentation/devicetree/bindings/timer/mediatek,timer.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/timer/mediatek,timer.yaml b/Documentation/devicetree/bindings/timer/mediatek,timer.yaml index f68fc7050c5687..d5b574bfd2caad 100644 --- a/Documentation/devicetree/bindings/timer/mediatek,timer.yaml +++ b/Documentation/devicetree/bindings/timer/mediatek,timer.yaml @@ -26,6 +26,7 @@ properties: - items: - enum: - mediatek,mt2701-timer + - mediatek,mt6572-timer - mediatek,mt6580-timer - mediatek,mt6582-timer - mediatek,mt6589-timer From c1ff9e919addb8cf0414b08bd996f11a4a2e7297 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 23 May 2025 10:14:37 -0400 Subject: [PATCH 2623/3206] dt-bindings: timer: fsl,ftm-timer: use items for reg The original txt binding doc is: reg : Specifies base physical address and size of the register sets for the clock event device and clock source device. And existing dts files provide two reg MMIO spaces. So change to use items to describe the reg property. Update the examples.
Fixes: 8fc30d8f8e86 ("dt-bindings: timer: fsl,ftm-timer: Convert to dtschema") Signed-off-by: Frank Li Signed-off-by: Daniel Lezcano Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250523141437.533643-1-Frank.Li@nxp.com --- Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml b/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml index 0e4a8ddc3de327..e3b61b62521e8b 100644 --- a/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml +++ b/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml @@ -14,7 +14,9 @@ properties: const: fsl,ftm-timer reg: - maxItems: 1 + items: + - description: clock event device + - description: clock source device interrupts: maxItems: 1 @@ -50,7 +52,8 @@ examples: ftm@400b8000 { compatible = "fsl,ftm-timer"; - reg = <0x400b8000 0x1000>; + reg = <0x400b8000 0x1000>, + <0x400b9000 0x1000>; interrupts = <0 44 IRQ_TYPE_LEVEL_HIGH>; clock-names = "ftm-evt", "ftm-src", "ftm-evt-counter-en", "ftm-src-counter-en"; clocks = <&clks VF610_CLK_FTM2>, <&clks VF610_CLK_FTM3>, From be26ec8b1479a0bb1888ed93d7bb5ce4d8eaee1e Mon Sep 17 00:00:00 2001 From: Will McVicker Date: Fri, 20 Jun 2025 11:17:04 -0700 Subject: [PATCH 2624/3206] of/irq: Export of_irq_count for modules Export `of_irq_count` in preparation for modularizing the Exynos MCT driver, which uses this API for setting up the timer IRQs. Signed-off-by: Will McVicker Signed-off-by: Daniel Lezcano Tested-by: Youngmin Nam Reviewed-by: Linus Walleij Reviewed-by: Youngmin Nam Acked-by: Rob Herring (Arm) Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250620181719.1399856-2-willmcvicker@google.com --- drivers/of/irq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 74aaea61de13c9..d2b690857e5885 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -519,6 +519,7 @@ int of_irq_count(struct device_node *dev) return nr; } +EXPORT_SYMBOL_GPL(of_irq_count); /** * of_irq_to_resource_table - Fill in resource table with node's IRQ info From 916aa36042db8ee230543ffe0d192f900e8b8c9f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 14 Jun 2025 10:55:55 -0700 Subject: [PATCH 2625/3206] clocksource/drivers/timer-tegra186: Avoid 64-bit divide operation Building the driver on xtensa fails with xtensa-linux-ld: drivers/clocksource/timer-tegra186.o: in function `tegra186_timer_remove': timer-tegra186.c:(.text+0x350): undefined reference to `__udivdi3' Avoid the problem by rearranging the offending code to avoid the 64-bit divide operation. Fixes: 28c842c8b0f5 ("clocksource/drivers/timer-tegra186: Add WDIOC_GETTIMELEFT support") Signed-off-by: Guenter Roeck Signed-off-by: Daniel Lezcano Reviewed-by: Jon Hunter Cc: Pohsun Su Cc: Robert Lin Link: https://lore.kernel.org/r/20250614175556.922159-1-linux@roeck-us.net --- drivers/clocksource/timer-tegra186.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-tegra186.c b/drivers/clocksource/timer-tegra186.c index 56a5342bcf7825..76a5481c8eb689 100644 --- a/drivers/clocksource/timer-tegra186.c +++ b/drivers/clocksource/timer-tegra186.c @@ -267,7 +267,7 @@ static unsigned int tegra186_wdt_get_timeleft(struct watchdog_device *wdd) * counter value to the time of the counter expirations that * remain.
*/ - timeleft += (((u64)wdt->base.timeout * USEC_PER_SEC) / 5) * (4 - expiration); + timeleft += ((u64)wdt->base.timeout * (USEC_PER_SEC / 5)) * (4 - expiration); /* * Convert the current counter value to seconds, From 7f3abae5b447a7f8458a0f58a003c11c46aade99 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 14 Jun 2025 10:55:56 -0700 Subject: [PATCH 2626/3206] clocksource/drivers/timer-tegra186: Simplify calculating timeleft It is not necessary to use 64-bit operations to calculate the remaining watchdog timeout. Simplify to use 32-bit operations, and add comments explaining why there will be no overflow. Signed-off-by: Guenter Roeck Signed-off-by: Daniel Lezcano Reviewed-by: Jon Hunter Cc: Pohsun Su Cc: Robert Lin Link: https://lore.kernel.org/r/20250614175556.922159-2-linux@roeck-us.net --- drivers/clocksource/timer-tegra186.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/clocksource/timer-tegra186.c b/drivers/clocksource/timer-tegra186.c index 76a5481c8eb689..d403a3fe25e76a 100644 --- a/drivers/clocksource/timer-tegra186.c +++ b/drivers/clocksource/timer-tegra186.c @@ -231,7 +231,7 @@ static unsigned int tegra186_wdt_get_timeleft(struct watchdog_device *wdd) { struct tegra186_wdt *wdt = to_tegra186_wdt(wdd); u32 expiration, val; - u64 timeleft; + u32 timeleft; if (!watchdog_active(&wdt->base)) { /* return zero if the watchdog timer is not activated. */ @@ -266,21 +266,26 @@ static unsigned int tegra186_wdt_get_timeleft(struct watchdog_device *wdd) * Calculate the time remaining by adding the time for the * counter value to the time of the counter expirations that * remain. + * Note: Since wdt->base.timeout is bound to 255, the maximum + * value added to timeleft is + * 255 * (1,000,000 / 5) * 4 + * = 255 * 200,000 * 4 + * = 204,000,000 + * TMRSR_PCV is a 29-bit field. + * Its maximum value is 0x1fffffff = 536,870,911. + * 204,000,000 + 536,870,911 = 740,870,911 = 0x2C28CAFF. + * timeleft can therefore not overflow, and 64-bit calculations + * are not necessary. */ - timeleft += ((u64)wdt->base.timeout * (USEC_PER_SEC / 5)) * (4 - expiration); + timeleft += (wdt->base.timeout * (USEC_PER_SEC / 5)) * (4 - expiration); /* * Convert the current counter value to seconds, - * rounding up to the nearest second. Cast u64 to - * u32 under the assumption that no overflow happens - * when coverting to seconds. + * rounding to the nearest second. */ - timeleft = DIV_ROUND_CLOSEST_ULL(timeleft, USEC_PER_SEC); + timeleft = DIV_ROUND_CLOSEST(timeleft, USEC_PER_SEC); - if (WARN_ON_ONCE(timeleft > U32_MAX)) - return U32_MAX; - - return lower_32_bits(timeleft); + return timeleft; } static const struct watchdog_ops tegra186_wdt_ops = { From 409f8fe03e08f92bf5be96cedbcd7a3e8fb2eeaf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 13:19:35 +0200 Subject: [PATCH 2627/3206] clocksource/drivers/tegra186: Avoid 64-bit division The newly added function causes a build failure on 32-bit targets with older compiler versions such as gcc-10: arm-linux-gnueabi-ld: drivers/clocksource/timer-tegra186.o: in function `tegra186_wdt_get_timeleft': timer-tegra186.c:(.text+0x3c2): undefined reference to `__aeabi_uldivmod' The calculation can trivially be changed to avoid the division entirely, as USEC_PER_SEC is a multiple of 5. Change both such calculations for consistency, even though gcc apparently managed to optimize the other one properly already.
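As an aside, the rearrangement is easy to check in isolation. The following stand-alone sketch (hypothetical helper names, not the driver code) shows the two forms: the first requests a runtime division of a 64-bit value, which 32-bit targets may lower to a libgcc call, while the second divides the constant at compile time because USEC_PER_SEC is a multiple of 5:

  #include <stdint.h>

  #define USEC_PER_SEC 1000000UL

  /* Runtime 64-bit division: 32-bit targets may emit a libgcc helper
   * call (__aeabi_uldivmod / __udivdi3) for this. */
  uint64_t scale_div(uint32_t timeout)
  {
          return ((uint64_t)timeout * USEC_PER_SEC) / 5;
  }

  /* Same result, but USEC_PER_SEC / 5 folds to the constant 200000 at
   * compile time, so only a multiplication remains at runtime. */
  uint64_t scale_mul(uint32_t timeout)
  {
          return (uint64_t)timeout * (USEC_PER_SEC / 5);
  }

Both functions return the same value for any 32-bit timeout, since the intermediate product fits comfortably in 64 bits.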
[dlezcano : Fixed conflict with 20250614175556.922159-2-linux@roeck-us.net ] Fixes: 28c842c8b0f5 ("clocksource/drivers/timer-tegra186: Add WDIOC_GETTIMELEFT support") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250620111939.3395525-1-arnd@kernel.org --- drivers/clocksource/timer-tegra186.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-tegra186.c b/drivers/clocksource/timer-tegra186.c index d403a3fe25e76a..1ec7b38ff8c6f2 100644 --- a/drivers/clocksource/timer-tegra186.c +++ b/drivers/clocksource/timer-tegra186.c @@ -159,7 +159,7 @@ static void tegra186_wdt_enable(struct tegra186_wdt *wdt) tmr_writel(wdt->tmr, TMRCSSR_SRC_USEC, TMRCSSR); /* configure timer (system reset happens on the fifth expiration) */ - value = TMRCR_PTV(wdt->base.timeout * USEC_PER_SEC / 5) | + value = TMRCR_PTV(wdt->base.timeout * (USEC_PER_SEC / 5)) | TMRCR_PERIODIC | TMRCR_ENABLE; tmr_writel(wdt->tmr, value, TMRCR); From ffc5870fc4e0ea72bf73ad5ab1d648aa0b4f7cbf Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 28 May 2025 12:53:50 -0400 Subject: [PATCH 2628/3206] dt-bindings: timer: Add fsl,timrot.yaml Add fsl,timrot.yaml for the i.MX23/i.MX28 timer. Also add a generic fallback compatible string "fsl,timrot" for legacy devices, which have existed for over 15 years. Signed-off-by: Frank Li Signed-off-by: Daniel Lezcano Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250528165351.691848-1-Frank.Li@nxp.com --- .../devicetree/bindings/timer/fsl,timrot.yaml | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 Documentation/devicetree/bindings/timer/fsl,timrot.yaml diff --git a/Documentation/devicetree/bindings/timer/fsl,timrot.yaml b/Documentation/devicetree/bindings/timer/fsl,timrot.yaml new file mode 100644 index 00000000000000..d181f274ef9f89 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/fsl,timrot.yaml @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/fsl,timrot.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Freescale MXS Timer + +maintainers: + - Frank Li + +properties: + compatible: + items: + - enum: + - fsl,imx23-timrot + - fsl,imx28-timrot + - const: fsl,timrot + + reg: + maxItems: 1 + + interrupts: + items: + - description: irq for timer0 + - description: irq for timer1 + - description: irq for timer2 + - description: irq for timer3 + + clocks: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + - clocks + +additionalProperties: false + +examples: + - | + timer: timer@80068000 { + compatible = "fsl,imx28-timrot", "fsl,timrot"; + reg = <0x80068000 0x2000>; + interrupts = <48>, <49>, <50>, <51>; + clocks = <&clks 26>; + }; From d27b4e33c954f5fa6b6e366736838c98d3996f02 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 3 Jun 2025 14:04:50 +0800 Subject: [PATCH 2629/3206] clocksource/timer-econet-en751221: Convert comma to semicolon Replace a comma between expressions with a semicolon. Using a ',' in place of a ';' can have unintended side effects. Although that is not the case here, it seems best to use ';' unless ',' is intended. Found by inspection. No functional change intended. Compile tested only.
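For background, here is a generic C illustration (not code from this driver) of why a stray ',' matters in expression context, while in statement position, as in the driver above, it happens to behave like a ';':

  #include <stdio.h>

  int main(void)
  {
          int a = 1;
          int b;

          /* Comma operator in an expression: a + 1 is evaluated and its
           * value discarded, so b becomes a + 2, i.e. 3. */
          b = (a + 1, a + 2);
          printf("%d\n", b);

          /* In statement position, "x = 1, y = 2;" acts like two separate
           * statements, which is why the change above is a readability
           * fix rather than a functional one. */
          return 0;
  }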
Signed-off-by: Chen Ni Signed-off-by: Daniel Lezcano Tested-by: Caleb James DeLisle Link: https://lore.kernel.org/r/20250603060450.1310204-1-nichen@iscas.ac.cn --- drivers/clocksource/timer-econet-en751221.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-econet-en751221.c b/drivers/clocksource/timer-econet-en751221.c index 3b449fdaafee03..4008076b1a2109 100644 --- a/drivers/clocksource/timer-econet-en751221.c +++ b/drivers/clocksource/timer-econet-en751221.c @@ -146,7 +146,7 @@ static int __init cevt_init(struct device_node *np) for_each_possible_cpu(i) { struct clock_event_device *cd = &per_cpu(econet_timer_pcpu, i); - cd->rating = 310, + cd->rating = 310; cd->features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_PERCPU; From 99d19715daf5553a28452a4ef95e9692277e2787 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 11 Jun 2025 13:07:58 +0200 Subject: [PATCH 2630/3206] dt-bindings: timer: mediatek,timer: Add MediaTek MT8196 compatible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new compatible for the MediaTek MT8196 SoC, fully compatible with MT6765. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Daniel Lezcano Reviewed-by: Nícolas F. R. A. Prado Acked-by: Rob Herring (Arm) Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250611110800.458164-2-angelogioacchino.delregno@collabora.com --- Documentation/devicetree/bindings/timer/mediatek,timer.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/timer/mediatek,timer.yaml b/Documentation/devicetree/bindings/timer/mediatek,timer.yaml index d5b574bfd2caad..e3e38066c2cb72 100644 --- a/Documentation/devicetree/bindings/timer/mediatek,timer.yaml +++ b/Documentation/devicetree/bindings/timer/mediatek,timer.yaml @@ -45,6 +45,7 @@ properties: - mediatek,mt8188-timer - mediatek,mt8192-timer - mediatek,mt8195-timer + - mediatek,mt8196-timer - mediatek,mt8365-systimer - const: mediatek,mt6765-timer From 5fa7d739f811bdffb5fc99696c2e821344fe0b88 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 21 Sep 2025 10:09:17 +0300 Subject: [PATCH 2631/3206] regulator: dt-bindings: qcom,sdm845-refgen-regulator: document more platforms Document refgen block being present on SDM670, Lemans and QCS8300 platforms. It should be used to provide reference voltage to DSI controller. 
Signed-off-by: Dmitry Baryshkov Acked-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250921-refgen-v1-1-9d93e64133ea@oss.qualcomm.com Signed-off-by: Mark Brown --- .../bindings/regulator/qcom,sdm845-refgen-regulator.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml b/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml index f02f97d4fdd215..40f9223d4c2721 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml @@ -23,11 +23,14 @@ properties: - enum: - qcom,sc7180-refgen-regulator - qcom,sc8180x-refgen-regulator + - qcom,sdm670-refgen-regulator - qcom,sm8150-refgen-regulator - const: qcom,sdm845-refgen-regulator - items: - enum: + - qcom,qcs8300-refgen-regulator + - qcom,sa8775p-refgen-regulator - qcom,sc7280-refgen-regulator - qcom,sc8280xp-refgen-regulator - qcom,sm6350-refgen-regulator From e609438851928381e39b5393f17156955a84122a Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 21 Sep 2025 10:09:17 +0300 Subject: [PATCH 2632/3206] regulator: dt-bindings: qcom,sdm845-refgen-regulator: document more platforms Document refgen block being present on SDM670, Lemans and QCS8300 platforms. It should be used to provide reference voltage to DSI controller. Signed-off-by: Dmitry Baryshkov Acked-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250921-refgen-v1-1-9d93e64133ea@oss.qualcomm.com Signed-off-by: Mark Brown --- .../bindings/regulator/qcom,sdm845-refgen-regulator.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml b/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml index f02f97d4fdd215..40f9223d4c2721 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml @@ -23,11 +23,14 @@ properties: - enum: - qcom,sc7180-refgen-regulator - qcom,sc8180x-refgen-regulator + - qcom,sdm670-refgen-regulator - qcom,sm8150-refgen-regulator - const: qcom,sdm845-refgen-regulator - items: - enum: + - qcom,qcs8300-refgen-regulator + - qcom,sa8775p-refgen-regulator - qcom,sc7280-refgen-regulator - qcom,sc8280xp-refgen-regulator - qcom,sm6350-refgen-regulator From 91daac8a6893c65e18f194946ad3ad9df5e9de8d Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Tue, 16 Sep 2025 08:10:07 +0200 Subject: [PATCH 2633/3206] genirq/msi: Remove msi_post_free() The only user of msi_post_free() - powerpc/pseries - has been changed to use msi_teardown(). Remove this unused callback. Signed-off-by: Nam Cao Reviewed-by: Thomas Gleixner Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250916061007.964005-1-namcao@linutronix.de --- include/linux/msi.h | 4 ---- kernel/irq/msi.c | 3 --- 2 files changed, 7 deletions(-) diff --git a/include/linux/msi.h b/include/linux/msi.h index e5e86a8529fb6f..faac634ac23077 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -431,8 +431,6 @@ struct msi_domain_info; * function. * @domain_free_irqs: Optional function to override the default free * function. - * @msi_post_free: Optional function which is invoked after freeing - * all interrupts. * @msi_translate: Optional translate callback to support the odd wire to * MSI bridges, e.g. 
MBIGEN * @@ -473,8 +471,6 @@ struct msi_domain_ops { struct device *dev, int nvec); void (*domain_free_irqs)(struct irq_domain *domain, struct device *dev); - void (*msi_post_free)(struct irq_domain *domain, - struct device *dev); int (*msi_translate)(struct irq_domain *domain, struct irq_fwspec *fwspec, irq_hw_number_t *hwirq, unsigned int *type); }; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9b09ad3f9914cb..e7ad992548416a 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1644,9 +1644,6 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) else __msi_domain_free_irqs(dev, domain, ctrl); - if (ops->msi_post_free) - ops->msi_post_free(domain, dev); - if (info->flags & MSI_FLAG_FREE_MSI_DESCS) msi_domain_free_descs(dev, ctrl); } From ef104054a312608deab266f95945057fa73eeaad Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Mon, 22 Sep 2025 02:11:08 -0700 Subject: [PATCH 2634/3206] powerpc/pseries: Define __u{8,32} types in papr_hvpipe_hdr struct Fix the following build errors with CONFIG_UAPI_HEADER_TEST: ./usr/include/asm/papr-hvpipe.h:16:9: error: unknown type name 'u8' 16 | u8 version; ./usr/include/asm/papr-hvpipe.h:17:9: error: unknown type name 'u8' 17 | u8 reserved[3]; ./usr/include/asm/papr-hvpipe.h:18:9: error: unknown type name 'u32' 18 | u32 flags; ./usr/include/asm/papr-hvpipe.h:19:9: error: unknown type name 'u8' 19 | u8 reserved2[40]; Fixes: 043439ad1a23c ("powerpc/pseries: Define papr-hvpipe ioctl") Reported-by: Venkat Rao Bagalkote Signed-off-by: Haren Myneni Tested-by: Venkat Rao Bagalkote Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250922091108.1483970-1-haren@linux.ibm.com --- arch/powerpc/include/uapi/asm/papr-hvpipe.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/uapi/asm/papr-hvpipe.h b/arch/powerpc/include/uapi/asm/papr-hvpipe.h index 459a7bb0e6c9c6..f8794139d06a4a 100644 --- a/arch/powerpc/include/uapi/asm/papr-hvpipe.h +++ b/arch/powerpc/include/uapi/asm/papr-hvpipe.h @@ -13,10 +13,10 @@ * closed or the buffer has the payload.
*/ struct papr_hvpipe_hdr { - u8 version; - u8 reserved[3]; - u32 flags; - u8 reserved2[40]; + __u8 version; + __u8 reserved[3]; + __u32 flags; + __u8 reserved2[40]; }; /* From 12a3dd4d2cd9232d4e4df3b9a5b3d745db559941 Mon Sep 17 00:00:00 2001 From: Christoffer Sandberg Date: Tue, 16 Sep 2025 18:46:49 +0200 Subject: [PATCH 2635/3206] platform/x86/amd/pmc: Add Stellaris Slim Gen6 AMD to spurious 8042 quirks list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevents instant wakeup ~1s after suspend Signed-off-by: Christoffer Sandberg Signed-off-by: Werner Sembach Link: https://patch.msgid.link/20250916164700.32896-1-wse@tuxedocomputers.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc-quirks.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index 4d0a38e06f0830..d63aaad7ef5998 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -256,6 +256,13 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Lafite Pro V 14M"), } }, + { + .ident = "TUXEDO Stellaris Slim 15 AMD Gen6", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GMxHGxx"), + } + }, { .ident = "TUXEDO InfinityBook Pro 14/15 AMD Gen10", .driver_data = &quirk_spurious_8042, From 2c61c45af153243baf591a77ec187be2b9cfe302 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 16 Sep 2025 17:21:42 +0530 Subject: [PATCH 2636/3206] platform/x86/dell: Set USTT mode according to BIOS after reboot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a reboot, if the user changes the thermal setting in the BIOS, the BIOS applies this change. However, the current `dell-pc` driver does not recognize the updated USTT value, resulting in inconsistent thermal profiles between Windows and Linux. To ensure alignment with Windows behavior, read the current USTT settings during driver initialization and update the dell-pc USTT profile accordingly whenever a change is detected. 
Cc: Yijun Shen Co-developed-by: Patil Rajesh Reddy Signed-off-by: Patil Rajesh Reddy Signed-off-by: Shyam Sundar S K Reviewed-by: Lyndon Sanche Reviewed-by: Mario Limonciello (AMD) Tested-By: Yijun Shen Link: https://patch.msgid.link/20250916115142.188535-1-Shyam-sundar.S-k@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/dell-pc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/dell/dell-pc.c b/drivers/platform/x86/dell/dell-pc.c index 48cc7511905a62..becdd9aaef2970 100644 --- a/drivers/platform/x86/dell/dell-pc.c +++ b/drivers/platform/x86/dell/dell-pc.c @@ -228,6 +228,8 @@ static int thermal_platform_profile_get(struct device *dev, static int thermal_platform_profile_probe(void *drvdata, unsigned long *choices) { + int current_mode; + if (supported_modes & DELL_QUIET) __set_bit(PLATFORM_PROFILE_QUIET, choices); if (supported_modes & DELL_COOL_BOTTOM) @@ -237,6 +239,13 @@ static int thermal_platform_profile_probe(void *drvdata, unsigned long *choices) if (supported_modes & DELL_PERFORMANCE) __set_bit(PLATFORM_PROFILE_PERFORMANCE, choices); + /* Make sure that ACPI is in sync with the profile set by USTT */ + current_mode = thermal_get_mode(); + if (current_mode < 0) + return current_mode; + + thermal_set_mode(current_mode); + return 0; } From a15b5aefa8178846ed614745569fed0d1fb6cb87 Mon Sep 17 00:00:00 2001 From: Nickolay Goppen Date: Wed, 17 Sep 2025 22:10:01 +0300 Subject: [PATCH 2637/3206] platform/x86: dell-lis3lv02d: Add Latitude E6530 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 0x29 as the accelerometer address for the Dell Latitude E6530 to lis3lv02d_devices[]. The address was verified as below: $ cd /sys/bus/pci/drivers/i801_smbus/0000:00:1f.3 $ ls -d i2c-* i2c-20 $ sudo modprobe i2c-dev $ sudo i2cdetect 20 WARNING! This program can confuse your I2C bus, cause data loss and worse! I will probe file /dev/i2c-20. I will probe address range 0x08-0x77. Continue? [Y/n] Y 0 1 2 3 4 5 6 7 8 9 a b c d e f 00: 08 -- -- -- -- -- -- -- 10: -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 20: -- -- -- -- -- -- -- -- -- UU -- 2b -- -- -- -- 30: -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 40: -- -- -- -- 44 -- -- -- -- -- -- -- -- -- -- -- 50: UU -- 52 -- -- -- -- -- -- -- -- -- -- -- -- -- 60: -- 61 -- -- -- -- -- -- -- -- -- -- -- -- -- -- 70: -- -- -- -- -- -- -- -- $ cat /proc/cmdline BOOT_IMAGE=/vmlinuz-linux-cachyos-bore root=UUID= rw loglevel=3 quiet dell_lis3lv02d.probe_i2c_addr=1 $ sudo dmesg [ 0.000000] Linux version 6.16.6-2-cachyos-bore (linux-cachyos-bore@cachyos) (gcc (GCC) 15.2.1 20250813, GNU ld (GNU Binutils) 2.45.0) #1 SMP PREEMPT_DYNAMIC Thu, 11 Sep 2025 16:01:12 +0000 […] [ 0.000000] DMI: Dell Inc. 
Latitude E6530/07Y85M, BIOS A22 11/30/2018 […] [ 5.166442] i2c i2c-20: Probing for lis3lv02d on address 0x29 [ 5.167854] i2c i2c-20: Detected lis3lv02d on address 0x29, please report this upstream to platform-driver-x86@vger.kernel.org so that a quirk can be added Signed-off-by: Nickolay Goppen Reviewed-by: Hans de Goede Link: https://patch.msgid.link/20250917-dell-lis3lv02d-latitude-e6530-v1-1-8a6dec4e51e9@mainlining.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/dell-lis3lv02d.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/dell/dell-lis3lv02d.c b/drivers/platform/x86/dell/dell-lis3lv02d.c index 732de5f556f83b..77905a9ddde9dd 100644 --- a/drivers/platform/x86/dell/dell-lis3lv02d.c +++ b/drivers/platform/x86/dell/dell-lis3lv02d.c @@ -48,6 +48,7 @@ static const struct dmi_system_id lis3lv02d_devices[] __initconst = { DELL_LIS3LV02D_DMI_ENTRY("Latitude 5500", 0x29), DELL_LIS3LV02D_DMI_ENTRY("Latitude E6330", 0x29), DELL_LIS3LV02D_DMI_ENTRY("Latitude E6430", 0x29), + DELL_LIS3LV02D_DMI_ENTRY("Latitude E6530", 0x29), DELL_LIS3LV02D_DMI_ENTRY("Precision 3540", 0x29), DELL_LIS3LV02D_DMI_ENTRY("Precision 3551", 0x29), DELL_LIS3LV02D_DMI_ENTRY("Precision M6800", 0x29), From 53de7ee4e28f6e866ac319b9db6e6c1b05664c32 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 18 Sep 2025 12:51:19 +0200 Subject: [PATCH 2638/3206] btrfs: zoned: don't fail mount needlessly due to too many active zones Previously BTRFS did not look at a device's reported max_open_zones limit, but starting with commit 04147d8394e8 ("btrfs: zoned: limit active zones to max_open_zones"), zoned BTRFS limited the number of concurrently used block-groups to the number of max_open_zones a device reported, if it hadn't already reported a number of max_active_zones. Starting with commit 04147d8394e8 the number of open zones is treated the same way as active zones. But this leads to mount failures on filesystems which have been used before 04147d8394e8 because too many zones are in an open state. Ignore the new limitations on these filesystems, so zones can be finished or evacuated. 
Reported-by: Yuwei Han Link: https://lore.kernel.org/all/2F48A90AF7DDF380+1790bcfd-cb6f-456b-870d-7982f21b5eae@bupt.moe/ Fixes: 04147d8394e8 ("btrfs: zoned: limit active zones to max_open_zones") Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index efc2a81f50e551..f426276e2b6bfe 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -514,6 +514,11 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (max_active_zones) { if (nactive > max_active_zones) { + if (bdev_max_active_zones(bdev) == 0) { + max_active_zones = 0; + zone_info->max_active_zones = 0; + goto validate; + } btrfs_err(device->fs_info, "zoned: %u active zones on %s exceeds max_active_zones %u", nactive, rcu_dereference(device->name), @@ -526,6 +531,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); } +validate: /* Validate superblock log */ nr_zones = BTRFS_NR_SB_LOG_ZONES; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { From 9b2f5ef00e852f8e8902a4d4f73aeedc60220c12 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 22 Sep 2025 15:45:54 +0200 Subject: [PATCH 2639/3206] fbcon: Fix OOB access in font allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1a194e6c8e1e ("fbcon: fix integer overflow in fbcon_do_set_font") introduced an out-of-bounds access by storing data and allocation sizes in the same variable. Restore the old size calculation and use the new variable 'alloc_size' for the allocation. Signed-off-by: Thomas Zimmermann Fixes: 1a194e6c8e1e ("fbcon: fix integer overflow in fbcon_do_set_font") Reported-by: Jani Nikula Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/15020 Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6201 Cc: Samasth Norway Ananda Cc: Thomas Zimmermann Cc: George Kennedy Cc: Greg Kroah-Hartman Cc: Simona Vetter Cc: Helge Deller Cc: "Ville Syrjälä" Cc: Sam Ravnborg Cc: Qianqiang Liu Cc: Shixiong Ou Cc: Kees Cook Cc: # v5.9+ Cc: Zsolt Kajtar Reviewed-by: Lucas De Marchi Reviewed-by: Qianqiang Liu Link: https://lore.kernel.org/r/20250922134619.257684-1-tzimmermann@suse.de --- drivers/video/fbdev/core/fbcon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index a507d05f8feaa8..5940e2eb92316c 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2504,7 +2504,7 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, unsigned charcount = font->charcount; int w = font->width; int h = font->height; - int size; + int size, alloc_size; int i, csum; u8 *new_data, *data = font->data; int pitch = PITCH(font->width); @@ -2537,10 +2537,10 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, return -EINVAL; /* Check for overflow in allocation size calculation */ - if (check_add_overflow(FONT_EXTRA_WORDS * sizeof(int), size, &size)) + if (check_add_overflow(FONT_EXTRA_WORDS * sizeof(int), size, &alloc_size)) return -EINVAL; - new_data = kmalloc(size, GFP_USER); + new_data = kmalloc(alloc_size, GFP_USER); if (!new_data) return -ENOMEM; From 0b781f527d6f99e68e5b3780ae03cd69a7cb5c0c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:19 +0200 Subject: [PATCH 
2640/3206] clocksource/drivers/vf-pit: Replace raw_readl/writel with readl/writel The driver uses the raw_readl() and raw_writel() functions. Those are not for MMIO devices. Replace them with readl() and writel(). [ dlezcano: Fixed typo in the subject s/reald/readl/ ] Signed-off-by: Daniel Lezcano Acked-by: Arnd Bergmann Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20250804152344.1109310-2-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 911c92146eca6d..8041a8f62d1fa4 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -35,30 +35,30 @@ static unsigned long cycle_per_jiffy; static inline void pit_timer_enable(void) { - __raw_writel(PITTCTRL_TEN | PITTCTRL_TIE, clkevt_base + PITTCTRL); + writel(PITTCTRL_TEN | PITTCTRL_TIE, clkevt_base + PITTCTRL); } static inline void pit_timer_disable(void) { - __raw_writel(0, clkevt_base + PITTCTRL); + writel(0, clkevt_base + PITTCTRL); } static inline void pit_irq_acknowledge(void) { - __raw_writel(PITTFLG_TIF, clkevt_base + PITTFLG); + writel(PITTFLG_TIF, clkevt_base + PITTFLG); } static u64 notrace pit_read_sched_clock(void) { - return ~__raw_readl(clksrc_base + PITCVAL); + return ~readl(clksrc_base + PITCVAL); } static int __init pit_clocksource_init(unsigned long rate) { /* set the max load value and start the clock source counter */ - __raw_writel(0, clksrc_base + PITTCTRL); - __raw_writel(~0UL, clksrc_base + PITLDVAL); - __raw_writel(PITTCTRL_TEN, clksrc_base + PITTCTRL); + writel(0, clksrc_base + PITTCTRL); + writel(~0UL, clksrc_base + PITLDVAL); + writel(PITTCTRL_TEN, clksrc_base + PITTCTRL); sched_clock_register(pit_read_sched_clock, 32, rate); return clocksource_mmio_init(clksrc_base + PITCVAL, "vf-pit", rate, @@ -76,7 +76,7 @@ static int pit_set_next_event(unsigned long delta, * hardware requirement. */ pit_timer_disable(); - __raw_writel(delta - 1, clkevt_base + PITLDVAL); + writel(delta - 1, clkevt_base + PITLDVAL); pit_timer_enable(); return 0; @@ -125,8 +125,8 @@ static struct clock_event_device clockevent_pit = { static int __init pit_clockevent_init(unsigned long rate, int irq) { - __raw_writel(0, clkevt_base + PITTCTRL); - __raw_writel(PITTFLG_TIF, clkevt_base + PITTFLG); + writel(0, clkevt_base + PITTCTRL); + writel(PITTFLG_TIF, clkevt_base + PITTFLG); BUG_ON(request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL, "VF pit timer", &clockevent_pit)); @@ -183,7 +183,7 @@ static int __init pit_timer_init(struct device_node *np) cycle_per_jiffy = clk_rate / (HZ); /* enable the pit module */ - __raw_writel(~PITMCR_MDIS, timer_base + PITMCR); + writel(~PITMCR_MDIS, timer_base + PITMCR); ret = pit_clocksource_init(clk_rate); if (ret)
From 2decd0b63eb0a6773e9a5541300cd92e469f541b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:20 +0200 Subject: [PATCH 2641/3206] clocksource/drivers/vf-pit: Add COMPILE_TEST option The VF PIT driver is a silent Kconfig option. In order to allow better compile-test coverage, let's add the COMPILE_TEST option so it can be selected on platforms other than the Vybrid Family.
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-3-daniel.lezcano@linaro.org --- drivers/clocksource/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 645f517a1ac26d..6f7d371904df44 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -475,7 +475,7 @@ config FSL_FTM_TIMER help Support for Freescale FlexTimer Module (FTM) timer. config VF_PIT_TIMER - bool + bool "Vybrid Family Programmable timer" if COMPILE_TEST select CLKSRC_MMIO help Support for Periodic Interrupt Timer on Freescale Vybrid Family SoCs.
From 3996232e6e7e34a1f783c778f6c7075293912365 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:21 +0200 Subject: [PATCH 2642/3206] clocksource/drivers/vf-pit: Set the scene for multiple timers The driver is implemented using a single timer and a single clocksource. In order to take advantage of the multiple timers supported by the PIT hardware and to introduce a different setup for a new platform, let's encapsulate the data into a structure and pass this structure around as a function parameter. The structure will become a per-timer instantiation in the next changes. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-4-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 121 +++++++++++++++++------------ 1 file changed, 72 insertions(+), 49 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 8041a8f62d1fa4..e4a8b32fff756d 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -15,7 +15,7 @@ */ #define PITMCR 0x00 #define PIT0_OFFSET 0x100 -#define PITn_OFFSET(n) (PIT0_OFFSET + 0x10 * (n)) +#define PIT_CH(n) (PIT0_OFFSET + 0x10 * (n)) #define PITLDVAL 0x00 #define PITCVAL 0x04 #define PITTCTRL 0x08 @@ -29,23 +29,36 @@ #define PITTFLG_TIF 0x1 +struct pit_timer { + void __iomem *clksrc_base; + void __iomem *clkevt_base; + unsigned long cycle_per_jiffy; + struct clock_event_device ced; + struct clocksource cs; +}; + +static struct pit_timer pit_timer; + static void __iomem *clksrc_base; -static void __iomem *clkevt_base; -static unsigned long cycle_per_jiffy; -static inline void pit_timer_enable(void) +static inline struct pit_timer *ced_to_pit(struct clock_event_device *ced) { - writel(PITTCTRL_TEN | PITTCTRL_TIE, clkevt_base + PITTCTRL); + return container_of(ced, struct pit_timer, ced); } -static inline void pit_timer_disable(void) +static inline void pit_timer_enable(struct pit_timer *pit) { - writel(0, clkevt_base + PITTCTRL); + writel(PITTCTRL_TEN | PITTCTRL_TIE, pit->clkevt_base + PITTCTRL); +} + +static inline void pit_timer_disable(struct pit_timer *pit) +{ + writel(0, pit->clkevt_base + PITTCTRL); +} + +static inline void pit_irq_acknowledge(struct pit_timer *pit) +{ + writel(PITTFLG_TIF, pit->clkevt_base + PITTFLG); } static u64 notrace pit_read_sched_clock(void) @@ -53,21 +66,24 @@ static u64 notrace pit_read_sched_clock(void) return ~readl(clksrc_base + PITCVAL); } -static int __init pit_clocksource_init(unsigned long rate) +static int __init pit_clocksource_init(struct pit_timer *pit, unsigned long rate) { /* set the max load value and start the clock source counter */ - writel(0, clksrc_base + PITTCTRL); - writel(~0UL, clksrc_base + PITLDVAL); - writel(PITTCTRL_TEN, clksrc_base + PITTCTRL); +
writel(0, pit->clksrc_base + PITTCTRL); + writel(~0, pit->clksrc_base + PITLDVAL); + writel(PITTCTRL_TEN, pit->clksrc_base + PITTCTRL); + + clksrc_base = pit->clksrc_base; sched_clock_register(pit_read_sched_clock, 32, rate); - return clocksource_mmio_init(clksrc_base + PITCVAL, "vf-pit", rate, + return clocksource_mmio_init(pit->clksrc_base + PITCVAL, "vf-pit", rate, 300, 32, clocksource_mmio_readl_down); } -static int pit_set_next_event(unsigned long delta, - struct clock_event_device *unused) +static int pit_set_next_event(unsigned long delta, struct clock_event_device *ced) { + struct pit_timer *pit = ced_to_pit(ced); + /* * set a new value to PITLDVAL register will not restart the timer, * to abort the current cycle and start a timer period with the new @@ -75,30 +91,37 @@ static int pit_set_next_event(unsigned long delta, * and the PITLAVAL should be set to delta minus one according to pit * hardware requirement. */ - pit_timer_disable(); - writel(delta - 1, clkevt_base + PITLDVAL); - pit_timer_enable(); + pit_timer_disable(pit); + writel(delta - 1, pit->clkevt_base + PITLDVAL); + pit_timer_enable(pit); return 0; } -static int pit_shutdown(struct clock_event_device *evt) +static int pit_shutdown(struct clock_event_device *ced) { - pit_timer_disable(); + struct pit_timer *pit = ced_to_pit(ced); + + pit_timer_disable(pit); + return 0; } -static int pit_set_periodic(struct clock_event_device *evt) +static int pit_set_periodic(struct clock_event_device *ced) { - pit_set_next_event(cycle_per_jiffy, evt); + struct pit_timer *pit = ced_to_pit(ced); + + pit_set_next_event(pit->cycle_per_jiffy, ced); + return 0; } static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) { - struct clock_event_device *evt = dev_id; + struct clock_event_device *ced = dev_id; + struct pit_timer *pit = ced_to_pit(ced); - pit_irq_acknowledge(); + pit_irq_acknowledge(pit); /* * pit hardware doesn't support oneshot, it will generate an interrupt @@ -106,33 +129,33 @@ static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) * and start the counter again. So software need to disable the timer * to stop the counter loop in ONESHOT mode. 
*/ - if (likely(clockevent_state_oneshot(evt))) - pit_timer_disable(); + if (likely(clockevent_state_oneshot(ced))) + pit_timer_disable(pit); - evt->event_handler(evt); + ced->event_handler(ced); return IRQ_HANDLED; } -static struct clock_event_device clockevent_pit = { - .name = "VF pit timer", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_state_shutdown = pit_shutdown, - .set_state_periodic = pit_set_periodic, - .set_next_event = pit_set_next_event, - .rating = 300, -}; - -static int __init pit_clockevent_init(unsigned long rate, int irq) +static int __init pit_clockevent_init(struct pit_timer *pit, unsigned long rate, int irq) { - writel(0, clkevt_base + PITTCTRL); - writel(PITTFLG_TIF, clkevt_base + PITTFLG); + writel(0, pit->clkevt_base + PITTCTRL); + + writel(PITTFLG_TIF, pit->clkevt_base + PITTFLG); BUG_ON(request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL, - "VF pit timer", &clockevent_pit)); + "VF pit timer", &pit->ced)); + + pit->ced.cpumask = cpumask_of(0); + pit->ced.irq = irq; + + pit->ced.name = "VF pit timer"; + pit->ced.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; + pit->ced.set_state_shutdown = pit_shutdown; + pit->ced.set_state_periodic = pit_set_periodic; + pit->ced.set_next_event = pit_set_next_event; + pit->ced.rating = 300; - clockevent_pit.cpumask = cpumask_of(0); - clockevent_pit.irq = irq; /* * The value for the LDVAL register trigger is calculated as: * LDVAL trigger = (period / clock period) - 1 @@ -141,7 +164,7 @@ static int __init pit_clockevent_init(unsigned long rate, int irq) * LDVAL trigger value is 1. And then the min_delta is * minimal LDVAL trigger value + 1, and the max_delta is full 32-bit. */ - clockevents_config_and_register(&clockevent_pit, rate, 2, 0xffffffff); + clockevents_config_and_register(&pit->ced, rate, 2, 0xffffffff); return 0; } @@ -164,8 +187,8 @@ static int __init pit_timer_init(struct device_node *np) * so choose PIT2 as clocksource, PIT3 as clockevent device, * and leave PIT0 and PIT1 unused for anyone else who needs them. */ - clksrc_base = timer_base + PITn_OFFSET(2); - clkevt_base = timer_base + PITn_OFFSET(3); + pit_timer.clksrc_base = timer_base + PIT_CH(2); + pit_timer.clkevt_base = timer_base + PIT_CH(3); irq = irq_of_parse_and_map(np, 0); if (irq <= 0) @@ -180,15 +203,15 @@ static int __init pit_timer_init(struct device_node *np) return ret; clk_rate = clk_get_rate(pit_clk); - cycle_per_jiffy = clk_rate / (HZ); + pit_timer.cycle_per_jiffy = clk_rate / (HZ); /* enable the pit module */ writel(~PITMCR_MDIS, timer_base + PITMCR); - ret = pit_clocksource_init(clk_rate); + ret = pit_clocksource_init(&pit_timer, clk_rate); if (ret) return ret; - return pit_clockevent_init(clk_rate, irq); + return pit_clockevent_init(&pit_timer, clk_rate, irq); } TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init);
From 361580317976ff25ce4eddbf207d06b0aca9ab22 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:22 +0200 Subject: [PATCH 2643/3206] clocksource/drivers/vf-pit: Rework the base address usage This change passes the base address to the clockevent and clocksource initialization functions in order to use different base addresses in the next changes. No functional changes intended.
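Condensed, the new call shape looks like this (a sketch distilled from the diff below, not the complete code):

    /* before: the caller computed each channel base itself */
    pit_timer.clksrc_base = timer_base + PIT_CH(2);
    ret = pit_clocksource_init(&pit_timer, clk_rate);

    /* after: the caller hands over the block base address and every
     * init function derives the channel base it needs */
    ret = pit_clocksource_init(&pit_timer, timer_base, clk_rate);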
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-5-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 35 +++++++++++++++++++----------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index e4a8b32fff756d..6a5f940ad0bcd2 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -66,8 +66,16 @@ static u64 notrace pit_read_sched_clock(void) return ~readl(clksrc_base + PITCVAL); } -static int __init pit_clocksource_init(struct pit_timer *pit, unsigned long rate) +static int __init pit_clocksource_init(struct pit_timer *pit, void __iomem *base, + unsigned long rate) { + /* + * The channels 0 and 1 can be chained to build a 64-bit + * timer. Let's use the channel 2 as a clocksource and leave + * the channels 0 and 1 unused for anyone else who needs them + */ + pit->clksrc_base = base + PIT_CH(2); + /* set the max load value and start the clock source counter */ writel(0, pit->clksrc_base + PITTCTRL); writel(~0, pit->clksrc_base + PITLDVAL); @@ -76,8 +84,9 @@ static int __init pit_clocksource_init(struct pit_timer *pit, unsigned long rate clksrc_base = pit->clksrc_base; sched_clock_register(pit_read_sched_clock, 32, rate); + return clocksource_mmio_init(pit->clksrc_base + PITCVAL, "vf-pit", rate, - 300, 32, clocksource_mmio_readl_down); + 300, 32, clocksource_mmio_readl_down); } static int pit_set_next_event(unsigned long delta, struct clock_event_device *ced) @@ -137,8 +146,16 @@ static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int __init pit_clockevent_init(struct pit_timer *pit, unsigned long rate, int irq) +static int __init pit_clockevent_init(struct pit_timer *pit, void __iomem *base, + unsigned long rate, int irq) { + /* + * The channels 0 and 1 can be chained to build a 64-bit + * timer. Let's use the channel 3 as a clockevent and leave + * the channels 0 and 1 unused for anyone else who needs them + */ + pit->clkevt_base = base + PIT_CH(3); + writel(0, pit->clkevt_base + PITTCTRL); writel(PITTFLG_TIF, pit->clkevt_base + PITTFLG); @@ -182,14 +199,6 @@ static int __init pit_timer_init(struct device_node *np) return -ENXIO; } - /* - * PIT0 and PIT1 can be chained to build a 64-bit timer, - * so choose PIT2 as clocksource, PIT3 as clockevent device, - * and leave PIT0 and PIT1 unused for anyone else who needs them. - */ - pit_timer.clksrc_base = timer_base + PIT_CH(2); - pit_timer.clkevt_base = timer_base + PIT_CH(3); - irq = irq_of_parse_and_map(np, 0); if (irq <= 0) return -EINVAL; @@ -208,10 +217,10 @@ static int __init pit_timer_init(struct device_node *np) /* enable the pit module */ writel(~PITMCR_MDIS, timer_base + PITMCR); - ret = pit_clocksource_init(&pit_timer, clk_rate); + ret = pit_clocksource_init(&pit_timer, timer_base, clk_rate); if (ret) return ret; - return pit_clockevent_init(&pit_timer, clk_rate, irq); + return pit_clockevent_init(&pit_timer, timer_base, clk_rate, irq); } TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init); From 995ebf16043392d131bd22ed8484c9c952bfb45f Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:23 +0200 Subject: [PATCH 2644/3206] clocksource/drivers/vf-pit: Pass the cpu number as parameter In order to initialize the timer with a cpumask tied to a cpu, let's pass it as a parameter instead of hardwiring it in the init function. No functional changes intended. 
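The gist of the change, as a two-line sketch taken from the diff below:

    pit->ced.cpumask = cpumask_of(0);     /* before: hardwired to the boot CPU */
    pit->ced.cpumask = cpumask_of(cpu);   /* after: the caller picks the CPU; the
                                             DT init path still passes 0 for now */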
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-6-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 6a5f940ad0bcd2..9f3b72be987ad8 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -147,7 +147,7 @@ static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) } static int __init pit_clockevent_init(struct pit_timer *pit, void __iomem *base, - unsigned long rate, int irq) + unsigned long rate, int irq, unsigned int cpu) { /* * The channels 0 and 1 can be chained to build a 64-bit @@ -163,7 +163,7 @@ static int __init pit_clockevent_init(struct pit_timer *pit, void __iomem *base, BUG_ON(request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL, "VF pit timer", &pit->ced)); - pit->ced.cpumask = cpumask_of(0); + pit->ced.cpumask = cpumask_of(cpu); pit->ced.irq = irq; @@ -221,6 +221,6 @@ static int __init pit_timer_init(struct device_node *np) if (ret) return ret; - return pit_clockevent_init(&pit_timer, timer_base, clk_rate, irq); + return pit_clockevent_init(&pit_timer, timer_base, clk_rate, irq, 0); } TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init);
From 8b0795e0fc0cf181ecab59d3a9a1618886ea995b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:24 +0200 Subject: [PATCH 2645/3206] clocksource/drivers/vf-pit: Encapsulate the initialization of the cycles_per_jiffy Move the cycles_per_jiffy initialization to the same place where the other pit timer fields are initialized. No functional changes intended. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-7-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 9f3b72be987ad8..a11e6a63c79f99 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -155,6 +155,7 @@ static int __init pit_clockevent_init(struct pit_timer *pit, void __iomem *base, * the channels 0 and 1 unused for anyone else who needs them */ pit->clkevt_base = base + PIT_CH(3); + pit->cycle_per_jiffy = rate / (HZ); writel(0, pit->clkevt_base + PITTCTRL); @@ -212,7 +213,6 @@ static int __init pit_timer_init(struct device_node *np) return ret; clk_rate = clk_get_rate(pit_clk); - pit_timer.cycle_per_jiffy = clk_rate / (HZ); /* enable the pit module */ writel(~PITMCR_MDIS, timer_base + PITMCR);
From 375fbfc66ca23873c3d9b1c5f6739bf0e2875a57 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:25 +0200 Subject: [PATCH 2646/3206] clocksource/drivers/vf-pit: Allocate the struct timer at init time Instead of having a static global structure for a timer, let's allocate it dynamically so we can create multiple instances in the future to support multiple timers. At the same time, add the rollback code in case of error.
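The allocation and unwinding follow the usual kernel goto-ladder pattern; roughly (condensed from the diff below):

    pit = kzalloc(sizeof(*pit), GFP_KERNEL);
    if (!pit)
        return -ENOMEM;

    timer_base = of_iomap(np, 0);
    if (!timer_base) {
        ret = -ENXIO;
        goto out_kfree;        /* undo only what has been done so far */
    }
    ...
    out_iounmap:
        iounmap(timer_base);
    out_kfree:
        kfree(pit);
        return ret;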
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-8-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 48 +++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index a11e6a63c79f99..d408dcddb4e94f 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -37,8 +37,6 @@ struct pit_timer { struct clocksource cs; }; -static struct pit_timer pit_timer; - static void __iomem *clksrc_base; static inline struct pit_timer *ced_to_pit(struct clock_event_device *ced) @@ -189,38 +187,66 @@ static int __init pit_clockevent_init(struct pit_timer *pit, void __iomem *base, static int __init pit_timer_init(struct device_node *np) { + struct pit_timer *pit; struct clk *pit_clk; void __iomem *timer_base; unsigned long clk_rate; int irq, ret; + pit = kzalloc(sizeof(*pit), GFP_KERNEL); + if (!pit) + return -ENOMEM; + + ret = -ENXIO; timer_base = of_iomap(np, 0); if (!timer_base) { pr_err("Failed to iomap\n"); - return -ENXIO; + goto out_kfree; } + ret = -EINVAL; irq = irq_of_parse_and_map(np, 0); - if (irq <= 0) - return -EINVAL; + if (irq <= 0) { + pr_err("Failed to irq_of_parse_and_map\n"); + goto out_iounmap; + } pit_clk = of_clk_get(np, 0); - if (IS_ERR(pit_clk)) - return PTR_ERR(pit_clk); + if (IS_ERR(pit_clk)) { + ret = PTR_ERR(pit_clk); + goto out_iounmap; + } ret = clk_prepare_enable(pit_clk); if (ret) - return ret; + goto out_clk_put; clk_rate = clk_get_rate(pit_clk); /* enable the pit module */ writel(~PITMCR_MDIS, timer_base + PITMCR); - ret = pit_clocksource_init(&pit_timer, timer_base, clk_rate); + ret = pit_clocksource_init(pit, timer_base, clk_rate); if (ret) - return ret; + goto out_disable_unprepare; + + ret = pit_clockevent_init(pit, timer_base, clk_rate, irq, 0); + if (ret) + goto out_pit_clocksource_unregister; + + return 0; - return pit_clockevent_init(&pit_timer, timer_base, clk_rate, irq, 0); +out_pit_clocksource_unregister: + clocksource_unregister(&pit->cs); +out_disable_unprepare: + clk_disable_unprepare(pit_clk); +out_clk_put: + clk_put(pit_clk); +out_iounmap: + iounmap(timer_base); +out_kfree: + kfree(pit); + + return ret; } TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init); From 0c063c9afc1b5243adde544637e273c1ac0a31d9 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:26 +0200 Subject: [PATCH 2647/3206] clocksource/drivers/vf-pit: Convert raw values to BIT macros Use the BIT macros instead of the shifting syntax. No functional change intended. 
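For reference, BIT(n) from <linux/bits.h> expands to (1UL << (n)), so the conversion is purely cosmetic:

    -#define PITMCR_MDIS (0x1 << 1)
    +#define PITMCR_MDIS BIT(1)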
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-9-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index d408dcddb4e94f..d1aec6aaeb02f9 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -21,11 +21,11 @@ #define PITTCTRL 0x08 #define PITTFLG 0x0c -#define PITMCR_MDIS (0x1 << 1) +#define PITMCR_MDIS BIT(1) -#define PITTCTRL_TEN (0x1 << 0) -#define PITTCTRL_TIE (0x1 << 1) -#define PITCTRL_CHN (0x1 << 2) +#define PITTCTRL_TEN BIT(0) +#define PITTCTRL_TIE BIT(1) +#define PITCTRL_CHN BIT(2) #define PITTFLG_TIF 0x1
From c106b698ab8d1899d62de880d73dd99edb319849 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:27 +0200 Subject: [PATCH 2648/3206] clocksource/drivers/vf-pit: Register the clocksource from the driver The function clocksource_mmio_init() uses its own global static clocksource variable, making it impossible to have several instances of a clocksource using this function. In order to support that, let's add the clocksource structure to the pit structure and use the clocksource_register_hz() function instead. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-10-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index d1aec6aaeb02f9..6a4043801eebda 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -44,6 +44,11 @@ static inline struct pit_timer *ced_to_pit(struct clock_event_device *ced) return container_of(ced, struct pit_timer, ced); } +static inline struct pit_timer *cs_to_pit(struct clocksource *cs) +{ + return container_of(cs, struct pit_timer, cs); +} + static inline void pit_timer_enable(struct pit_timer *pit) { writel(PITTCTRL_TEN | PITTCTRL_TIE, PITTCTRL(pit->clkevt_base)); @@ -64,6 +69,13 @@ static u64 notrace pit_read_sched_clock(void) return ~readl(clksrc_base + PITCVAL); } +static u64 pit_timer_clocksource_read(struct clocksource *cs) +{ + struct pit_timer *pit = cs_to_pit(cs); + + return (u64)~readl(pit->clksrc_base + PITCVAL); +} + static int __init pit_clocksource_init(struct pit_timer *pit, void __iomem *base, unsigned long rate) { @@ -73,6 +85,11 @@ static int __init pit_clocksource_init(struct pit_timer *pit, void __iomem *base * the channels 0 and 1 unused for anyone else who needs them */ pit->clksrc_base = base + PIT_CH(2); + pit->cs.name = "vf-pit"; + pit->cs.rating = 300; + pit->cs.read = pit_timer_clocksource_read; + pit->cs.mask = CLOCKSOURCE_MASK(32); + pit->cs.flags = CLOCK_SOURCE_IS_CONTINUOUS; /* set the max load value and start the clock source counter */ writel(0, pit->clksrc_base + PITTCTRL); @@ -83,8 +100,7 @@ static int __init pit_clocksource_init(struct pit_timer *pit, void __iomem *base sched_clock_register(pit_read_sched_clock, 32, rate); - return clocksource_mmio_init(pit->clksrc_base + PITCVAL, "vf-pit", rate, - 300, 32, clocksource_mmio_readl_down); + return clocksource_register_hz(&pit->cs, rate); } static int pit_set_next_event(unsigned long delta, struct clock_event_device *ced)
From 1ba63930e72356315e1a664952f52ee340edbbd3 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:28 +0200 Subject: [PATCH 2649/3206]
clocksource/drivers/vf-pit: Encapsulate the macros Pass the base address to the macro, so we can use the macro with multiple instances of the timer because we deal with different base addresses. At the same time, change the register writes to use the existing corresponding functions. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-11-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 34 ++++++++++++++++-------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 6a4043801eebda..c81c68b826a0c9 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -16,18 +16,20 @@ #define PITMCR 0x00 #define PIT0_OFFSET 0x100 #define PIT_CH(n) (PIT0_OFFSET + 0x10 * (n)) -#define PITLDVAL 0x00 + #define PITCVAL 0x04 -#define PITTCTRL 0x08 -#define PITTFLG 0x0c #define PITMCR_MDIS BIT(1) -#define PITTCTRL_TEN BIT(0) -#define PITTCTRL_TIE BIT(1) -#define PITCTRL_CHN BIT(2) +#define PITLDVAL(__base) (__base) +#define PITTCTRL(__base) ((__base) + 0x08) + +#define PITTCTRL_TEN BIT(0) +#define PITTCTRL_TIE BIT(1) + +#define PITTFLG(__base) ((__base) + 0x0c) -#define PITTFLG_TIF 0x1 +#define PITTFLG_TIF BIT(0) struct pit_timer { void __iomem *clksrc_base; @@ -51,17 +53,17 @@ static inline struct pit_timer *cs_to_pit(struct clocksource *cs) static inline void pit_timer_enable(struct pit_timer *pit) { - writel(PITTCTRL_TEN | PITTCTRL_TIE, pit->clkevt_base + PITTCTRL); + writel(PITTCTRL_TEN | PITTCTRL_TIE, PITTCTRL(pit->clkevt_base)); } static inline void pit_timer_disable(struct pit_timer *pit) { - writel(0, pit->clkevt_base + PITTCTRL); + writel(0, PITTCTRL(pit->clkevt_base)); } static inline void pit_irq_acknowledge(struct pit_timer *pit) { - writel(PITTFLG_TIF, pit->clkevt_base + PITTFLG); + writel(PITTFLG_TIF, PITTFLG(pit->clkevt_base)); } static u64 notrace pit_read_sched_clock(void) @@ -92,9 +94,9 @@ static int __init pit_clocksource_init(struct pit_timer *pit, void __iomem *base pit->cs.flags = CLOCK_SOURCE_IS_CONTINUOUS; /* set the max load value and start the clock source counter */ - writel(0, pit->clksrc_base + PITTCTRL); - writel(~0, pit->clksrc_base + PITLDVAL); - writel(PITTCTRL_TEN, pit->clksrc_base + PITTCTRL); + pit_timer_disable(pit); + writel(~0, PITLDVAL(pit->clksrc_base)); + writel(PITTCTRL_TEN, PITTCTRL(pit->clksrc_base)); clksrc_base = pit->clksrc_base; @@ -115,7 +117,7 @@ static int pit_set_next_event(unsigned long delta, struct clock_event_device *ce * hardware requirement. */ pit_timer_disable(pit); - writel(delta - 1, pit->clkevt_base + PITLDVAL); + writel(delta - 1, PITLDVAL(pit->clkevt_base)); pit_timer_enable(pit); return 0; @@ -171,9 +173,9 @@ static int __init pit_clockevent_init(struct pit_timer *pit, void __iomem *base, pit->clkevt_base = base + PIT_CH(3); pit->cycle_per_jiffy = rate / (HZ); - writel(0, pit->clkevt_base + PITTCTRL); + pit_timer_disable(pit); - writel(PITTFLG_TIF, pit->clkevt_base + PITTFLG); + pit_irq_acknowledge(pit); BUG_ON(request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL, "VF pit timer", &pit->ced));
From d8629b9b2c17a458ca504b10b604b8cfe95df3ab Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:29 +0200 Subject: [PATCH 2650/3206] clocksource/drivers/vf-pit: Encapsulate the PITCVAL macro Pass the channel and the base address to the PITCVAL macro so it is possible to use multiple instances of the timer with the macro.
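The macro changes from a plain register offset to an address constructor, roughly:

    /* before: an offset added at every call site */
    #define PITCVAL 0x04
    val = readl(pit->clksrc_base + PITCVAL);

    /* after: the macro builds the register address from any instance's base */
    #define PITCVAL(__base) ((__base) + 0x04)
    val = readl(PITCVAL(pit->clksrc_base));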
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-12-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index c81c68b826a0c9..4f1b85ba5de3aa 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -17,13 +17,14 @@ #define PIT0_OFFSET 0x100 #define PIT_CH(n) (PIT0_OFFSET + 0x10 * (n)) -#define PITCVAL 0x04 - #define PITMCR_MDIS BIT(1) #define PITLDVAL(__base) (__base) #define PITTCTRL(__base) ((__base) + 0x08) +#define PITCVAL_OFFSET 0x04 +#define PITCVAL(__base) ((__base) + 0x04) + #define PITTCTRL_TEN BIT(0) #define PITTCTRL_TIE BIT(1) @@ -39,7 +40,7 @@ struct pit_timer { struct clocksource cs; }; -static void __iomem *clksrc_base; +static void __iomem *sched_clock_base; static inline struct pit_timer *ced_to_pit(struct clock_event_device *ced) { @@ -68,14 +69,14 @@ static inline void pit_irq_acknowledge(struct pit_timer *pit) static u64 notrace pit_read_sched_clock(void) { - return ~readl(clksrc_base + PITCVAL); + return ~readl(sched_clock_base); } static u64 pit_timer_clocksource_read(struct clocksource *cs) { struct pit_timer *pit = cs_to_pit(cs); - return (u64)~readl(pit->clksrc_base + PITCVAL); + return (u64)~readl(PITCVAL(pit->clksrc_base)); } static int __init pit_clocksource_init(struct pit_timer *pit, void __iomem *base, @@ -98,8 +99,7 @@ static int __init pit_clocksource_init(struct pit_timer *pit, void __iomem *base writel(~0, PITLDVAL(pit->clksrc_base)); writel(PITTCTRL_TEN, PITTCTRL(pit->clksrc_base)); - clksrc_base = pit->clksrc_base; - + sched_clock_base = pit->clksrc_base + PITCVAL_OFFSET; sched_clock_register(pit_read_sched_clock, 32, rate);
From 7201c95c258936e32c950f447c59875780046848 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:30 +0200 Subject: [PATCH 2651/3206] clocksource/drivers/vf-pit: Use the node name for the interrupt and timer names In order to differentiate the PIT timers from userspace, given the device tree, let's use the node name for the interrupt and the timer names.
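In effect (a sketch; the device-tree path shown is hypothetical):

    const char *name = of_node_full_name(np);   /* e.g. "/soc/pit@40037000" */

    pit->cs.name = name;    /* shows up in the clocksource sysfs entries */
    pit->ced.name = name;   /* shows up in /proc/timer_list */
    request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL,
                name, &pit->ced);               /* shows up in /proc/interrupts */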
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-13-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 4f1b85ba5de3aa..2a255b45561d1a 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -79,8 +79,8 @@ static u64 pit_timer_clocksource_read(struct clocksource *cs) return (u64)~readl(PITCVAL(pit->clksrc_base)); } -static int __init pit_clocksource_init(struct pit_timer *pit, void __iomem *base, - unsigned long rate) +static int __init pit_clocksource_init(struct pit_timer *pit, const char *name, + void __iomem *base, unsigned long rate) { /* * The channels 0 and 1 can be chained to build a 64-bit @@ -88,7 +88,7 @@ static int __init pit_clocksource_init(struct pit_timer *pit, void __iomem *base * the channels 0 and 1 unused for anyone else who needs them */ pit->clksrc_base = base + PIT_CH(2); - pit->cs.name = "vf-pit"; + pit->cs.name = name; pit->cs.rating = 300; pit->cs.read = pit_timer_clocksource_read; pit->cs.mask = CLOCKSOURCE_MASK(32); @@ -162,8 +162,9 @@ static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int __init pit_clockevent_init(struct pit_timer *pit, void __iomem *base, - unsigned long rate, int irq, unsigned int cpu) +static int __init pit_clockevent_init(struct pit_timer *pit, const char *name, + void __iomem *base, unsigned long rate, + int irq, unsigned int cpu) { /* * The channels 0 and 1 can be chained to build a 64-bit @@ -178,12 +179,12 @@ static int __init pit_clockevent_init(struct pit_timer *pit, void __iomem *base, pit_irq_acknowledge(pit); BUG_ON(request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL, - "VF pit timer", &pit->ced)); + name, &pit->ced)); pit->ced.cpumask = cpumask_of(cpu); pit->ced.irq = irq; - pit->ced.name = "VF pit timer"; + pit->ced.name = name; pit->ced.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; pit->ced.set_state_shutdown = pit_shutdown; pit->ced.set_state_periodic = pit_set_periodic; @@ -208,6 +209,7 @@ static int __init pit_timer_init(struct device_node *np) struct pit_timer *pit; struct clk *pit_clk; void __iomem *timer_base; + const char *name = of_node_full_name(np); unsigned long clk_rate; int irq, ret; @@ -244,11 +246,11 @@ static int __init pit_timer_init(struct device_node *np) /* enable the pit module */ writel(~PITMCR_MDIS, timer_base + PITMCR); - ret = pit_clocksource_init(pit, timer_base, clk_rate); + ret = pit_clocksource_init(pit, name, timer_base, clk_rate); if (ret) goto out_disable_unprepare; - ret = pit_clockevent_init(pit, timer_base, clk_rate, irq, 0); + ret = pit_clockevent_init(pit, name, timer_base, clk_rate, irq, 0); if (ret) goto out_pit_clocksource_unregister;
From fcf25b4427c7d1edb91dd02753d6153fb25da094 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:31 +0200 Subject: [PATCH 2652/3206] clocksource/drivers/vf-pit: Encapsulate clocksource enable / disable For the sake of readability, let's encapsulate the writel calls to enable and disable the timer into a function with a self-explanatory name.
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-14-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 2a255b45561d1a..96377088a048e1 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -62,6 +62,16 @@ static inline void pit_timer_disable(struct pit_timer *pit) writel(0, PITTCTRL(pit->clkevt_base)); } +static inline void pit_clocksource_enable(struct pit_timer *pit) +{ + writel(PITTCTRL_TEN, PITTCTRL(pit->clksrc_base)); +} + +static inline void pit_clocksource_disable(struct pit_timer *pit) +{ + pit_timer_disable(pit); +} + static inline void pit_irq_acknowledge(struct pit_timer *pit) { writel(PITTFLG_TIF, PITTFLG(pit->clkevt_base)); @@ -95,9 +105,9 @@ static int __init pit_clocksource_init(struct pit_timer *pit, const char *name, pit->cs.flags = CLOCK_SOURCE_IS_CONTINUOUS; /* set the max load value and start the clock source counter */ - pit_timer_disable(pit); + pit_clocksource_disable(pit); writel(~0, PITLDVAL(pit->clksrc_base)); - writel(PITTCTRL_TEN, PITTCTRL(pit->clksrc_base)); + pit_clocksource_enable(pit); sched_clock_base = pit->clksrc_base + PITCVAL_OFFSET; sched_clock_register(pit_read_sched_clock, 32, rate); From 13cea8527c95e1359191347abe5d94cccc47a311 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:32 +0200 Subject: [PATCH 2653/3206] clocksource/drivers/vf-pit: Enable and disable module on error Encapsulate the calls to writel to enable and disable the PIT module and make use of them. Add the missing module disablement in case of error. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-15-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 96377088a048e1..609a4d9deb646d 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -13,10 +13,12 @@ /* * Each pit takes 0x10 Bytes register space */ -#define PITMCR 0x00 #define PIT0_OFFSET 0x100 #define PIT_CH(n) (PIT0_OFFSET + 0x10 * (n)) +#define PITMCR(__base) (__base) + +#define PITMCR_FRZ BIT(0) #define PITMCR_MDIS BIT(1) #define PITLDVAL(__base) (__base) @@ -52,6 +54,16 @@ static inline struct pit_timer *cs_to_pit(struct clocksource *cs) return container_of(cs, struct pit_timer, cs); } +static inline void pit_module_enable(void __iomem *base) +{ + writel(0, PITMCR(base)); +} + +static inline void pit_module_disable(void __iomem *base) +{ + writel(PITMCR_MDIS, PITMCR(base)); +} + static inline void pit_timer_enable(struct pit_timer *pit) { writel(PITTCTRL_TEN | PITTCTRL_TIE, PITTCTRL(pit->clkevt_base)); @@ -254,11 +266,11 @@ static int __init pit_timer_init(struct device_node *np) clk_rate = clk_get_rate(pit_clk); /* enable the pit module */ - writel(~PITMCR_MDIS, timer_base + PITMCR); + pit_module_enable(timer_base); ret = pit_clocksource_init(pit, name, timer_base, clk_rate); if (ret) - goto out_disable_unprepare; + goto out_pit_module_disable; ret = pit_clockevent_init(pit, name, timer_base, clk_rate, irq, 0); if (ret) @@ -268,7 +280,8 @@ static int __init pit_timer_init(struct device_node *np) out_pit_clocksource_unregister: clocksource_unregister(&pit->cs); -out_disable_unprepare: +out_pit_module_disable: + 
pit_module_disable(timer_base); clk_disable_unprepare(pit_clk); out_clk_put: clk_put(pit_clk);
From 46e83e4afc05c87b5edc2b0b4765283f101af0c6 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:33 +0200 Subject: [PATCH 2654/3206] clocksource/drivers/vf-pit: Encapsulate set counter function Encapsulate the writel() calls to set the counter into a self-explanatory function. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-16-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 609a4d9deb646d..5551b61483f89a 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -74,6 +74,11 @@ static inline void pit_timer_disable(struct pit_timer *pit) writel(0, PITTCTRL(pit->clkevt_base)); } +static inline void pit_timer_set_counter(void __iomem *base, unsigned int cnt) +{ + writel(cnt, PITLDVAL(base)); +} + static inline void pit_clocksource_enable(struct pit_timer *pit) { writel(PITTCTRL_TEN, PITTCTRL(pit->clksrc_base)); @@ -118,7 +123,7 @@ static int __init pit_clocksource_init(struct pit_timer *pit, const char *name, /* set the max load value and start the clock source counter */ pit_clocksource_disable(pit); - writel(~0, PITLDVAL(pit->clksrc_base)); + pit_timer_set_counter(pit->clksrc_base, ~0); pit_clocksource_enable(pit); sched_clock_base = pit->clksrc_base + PITCVAL_OFFSET; @@ -139,7 +144,7 @@ static int pit_set_next_event(unsigned long delta, struct clock_event_device *ce * hardware requirement. */ pit_timer_disable(pit); - writel(delta - 1, PITLDVAL(pit->clkevt_base)); + pit_timer_set_counter(pit->clkevt_base, delta - 1); pit_timer_enable(pit); return 0;
From 5ba405c719ce68a86308dd3bd90aeea59959030d Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:34 +0200 Subject: [PATCH 2655/3206] clocksource/drivers/vf-pit: Consolidate calls to pit_*_disable/enable The only difference between pit_clocksource_enable() and pit_timer_enable() is that the latter also sets the TIE flag for the clockevent. Let's group them and pass the TIE flag as a parameter to the function so we save some lines of code. But as the base address differs depending on whether it is a clocksource or a clockevent, we pass the base address as a parameter instead of the struct pit_timer. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-17-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 34 ++++++++++++++++-------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 5551b61483f89a..3825159a0ca755 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -64,14 +64,16 @@ static inline void pit_module_disable(void __iomem *base) writel(PITMCR_MDIS, PITMCR(base)); } -static inline void pit_timer_enable(struct pit_timer *pit) +static inline void pit_timer_enable(void __iomem *base, bool tie) { - writel(PITTCTRL_TEN | PITTCTRL_TIE, PITTCTRL(pit->clkevt_base)); + u32 val = PITTCTRL_TEN | (tie ?
PITTCTRL_TIE : 0); + + writel(val, PITTCTRL(base)); } -static inline void pit_timer_disable(struct pit_timer *pit) +static inline void pit_timer_disable(void __iomem *base) { - writel(0, PITTCTRL(pit->clkevt_base)); + writel(0, PITTCTRL(base)); } static inline void pit_timer_set_counter(void __iomem *base, unsigned int cnt) @@ -79,16 +81,6 @@ static inline void pit_timer_set_counter(void __iomem *base, unsigned int cnt) writel(cnt, PITLDVAL(base)); } -static inline void pit_clocksource_enable(struct pit_timer *pit) -{ - writel(PITTCTRL_TEN, PITTCTRL(pit->clksrc_base)); -} - -static inline void pit_clocksource_disable(struct pit_timer *pit) -{ - pit_timer_disable(pit); -} - static inline void pit_irq_acknowledge(struct pit_timer *pit) { writel(PITTFLG_TIF, PITTFLG(pit->clkevt_base)); @@ -122,9 +114,9 @@ static int __init pit_clocksource_init(struct pit_timer *pit, const char *name, pit->cs.flags = CLOCK_SOURCE_IS_CONTINUOUS; /* set the max load value and start the clock source counter */ - pit_clocksource_disable(pit); + pit_timer_disable(pit->clksrc_base); pit_timer_set_counter(pit->clksrc_base, ~0); - pit_clocksource_enable(pit); + pit_timer_enable(pit->clksrc_base, 0); sched_clock_base = pit->clksrc_base + PITCVAL_OFFSET; sched_clock_register(pit_read_sched_clock, 32, rate); @@ -143,9 +135,9 @@ static int pit_set_next_event(unsigned long delta, struct clock_event_device *ce * and the PITLAVAL should be set to delta minus one according to pit * hardware requirement. */ - pit_timer_disable(pit); + pit_timer_disable(pit->clkevt_base); pit_timer_set_counter(pit->clkevt_base, delta - 1); - pit_timer_enable(pit); + pit_timer_enable(pit->clkevt_base, true); return 0; } @@ -154,7 +146,7 @@ static int pit_shutdown(struct clock_event_device *ced) { struct pit_timer *pit = ced_to_pit(ced); - pit_timer_disable(pit); + pit_timer_disable(pit->clkevt_base); return 0; } @@ -182,7 +174,7 @@ static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) * to stop the counter loop in ONESHOT mode. */ if (likely(clockevent_state_oneshot(ced))) - pit_timer_disable(pit); + pit_timer_disable(pit->clkevt_base); ced->event_handler(ced); @@ -201,7 +193,7 @@ static int __init pit_clockevent_init(struct pit_timer *pit, const char *name, pit->clkevt_base = base + PIT_CH(3); pit->cycle_per_jiffy = rate / (HZ); - pit_timer_disable(pit); + pit_timer_disable(pit->clkevt_base); pit_irq_acknowledge(pit);
From 3c34321e9b5965fdbf51fd407a35322a64c9e9c6 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:35 +0200 Subject: [PATCH 2656/3206] clocksource/drivers/vf-pit: Unify the function name for irq ack Most of the functions are named pit_timer_*; let's change the interrupt acknowledgment function name to follow the same format. No functional changes intended.
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-18-daniel.lezcano@linaro.org --- drivers/clocksource/timer-vf-pit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 3825159a0ca755..2a0ee4109ead92 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -81,7 +81,7 @@ static inline void pit_timer_set_counter(void __iomem *base, unsigned int cnt) writel(cnt, PITLDVAL(base)); } -static inline void pit_irq_acknowledge(struct pit_timer *pit) +static inline void pit_timer_irqack(struct pit_timer *pit) { writel(PITTFLG_TIF, PITTFLG(pit->clkevt_base)); } @@ -165,7 +165,7 @@ static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) struct clock_event_device *ced = dev_id; struct pit_timer *pit = ced_to_pit(ced); - pit_irq_acknowledge(pit); + pit_timer_irqack(pit); /* * pit hardware doesn't support oneshot, it will generate an interrupt @@ -195,7 +195,7 @@ static int __init pit_clockevent_init(struct pit_timer *pit, const char *name, pit_timer_disable(pit->clkevt_base); - pit_irq_acknowledge(pit); + pit_timer_irqack(pit); BUG_ON(request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL, name, &pit->ced)); From fc346a155fe910a1cf4639b00b131f9a10284bdd Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:36 +0200 Subject: [PATCH 2657/3206] clocksource/drivers/vf-pit: Rename the VF PIT to NXP PIT The PIT acronym stands for Periodic Interrupt Timer which is found on different NXP platforms not only on the Vybrid Family. Change the name to be more generic for the NXP platforms in general. That will be consistent with the NXP STM driver naming convention. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-19-daniel.lezcano@linaro.org --- drivers/clocksource/Kconfig | 9 ++++++--- drivers/clocksource/Makefile | 2 +- drivers/clocksource/{timer-vf-pit.c => timer-nxp-pit.c} | 0 3 files changed, 7 insertions(+), 4 deletions(-) rename drivers/clocksource/{timer-vf-pit.c => timer-nxp-pit.c} (100%) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 6f7d371904df44..0fd662f67d296a 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -474,11 +474,14 @@ config FSL_FTM_TIMER help Support for Freescale FlexTimer Module (FTM) timer. -config VF_PIT_TIMER - bool "Vybrid Family Programmable timer" if COMPILE_TEST +config NXP_PIT_TIMER + bool "NXP Periodic Interrupt Timer" if COMPILE_TEST select CLKSRC_MMIO help - Support for Periodic Interrupt Timer on Freescale Vybrid Family SoCs. + Support for Periodic Interrupt Timer on Freescale / NXP + SoCs. This periodic timer is found on the Vybrid Family and + the Automotive S32G2/3 platforms. It contains 4 channels + where two can be coupled to form a 64 bits channel. 
config SYS_SUPPORTS_SH_CMT bool diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 205bf3b0a8f3f2..77a0f08eb43b75 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -49,7 +49,7 @@ obj-$(CONFIG_CLKSRC_LPC32XX) += timer-lpc32xx.o obj-$(CONFIG_CLKSRC_MPS2) += mps2-timer.o obj-$(CONFIG_CLKSRC_SAMSUNG_PWM) += samsung_pwm_timer.o obj-$(CONFIG_FSL_FTM_TIMER) += timer-fsl-ftm.o -obj-$(CONFIG_VF_PIT_TIMER) += timer-vf-pit.o +obj-$(CONFIG_NXP_PIT_TIMER) += timer-nxp-pit.o obj-$(CONFIG_CLKSRC_QCOM) += timer-qcom.o obj-$(CONFIG_MTK_TIMER) += timer-mediatek.o obj-$(CONFIG_MTK_CPUX_TIMER) += timer-mediatek-cpux.o diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-nxp-pit.c similarity index 100% rename from drivers/clocksource/timer-vf-pit.c rename to drivers/clocksource/timer-nxp-pit.c
From adaf5b248ff38f98ea1b3696b9a793db270f8c3e Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:37 +0200 Subject: [PATCH 2658/3206] dt: bindings: fsl,vf610-pit: Add compatible for s32g2 and s32g3 The Vybrid Family is an NXP (formerly Freescale) platform having a Periodic Interrupt Timer (PIT). This timer is an IP also found on the NXP automotive platforms S32G2 and S32G3. Add the compatibles for those platforms to describe the timer. Signed-off-by: Daniel Lezcano Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250804152344.1109310-20-daniel.lezcano@linaro.org --- .../devicetree/bindings/timer/fsl,vf610-pit.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml b/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml index bee2c35bd0e293..42e130654d58e0 100644 --- a/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml +++ b/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml @@ -15,8 +15,13 @@ description: properties: compatible: - enum: - - fsl,vf610-pit + oneOf: + - enum: + - fsl,vf610-pit + - nxp,s32g2-pit + - items: + - const: nxp,s32g3-pit + - const: nxp,s32g2-pit reg: maxItems: 1
From bee33f22d7c30626e711b4900e3f460b6e0e104f Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 4 Aug 2025 17:23:38 +0200 Subject: [PATCH 2659/3206] clocksource/drivers/nxp-pit: Add NXP Automotive s32g2 / s32g3 support The previous changes put in place the encapsulation of the code in order to allow multiple instances of the driver. The S32G platform has two Periodic Interrupt Timers (PIT). The IP is exactly the same as on the VF platform. Each PIT has four channels which are 32 bits wide and count down. The first two channels can be chained to implement a 64-bit counter. The channel usage is kept unchanged from the original driver: channel 2 is used as a clocksource, channel 3 is used as a clockevent. The other channels are unused. In order to support the S32G platform, which has two PITs, we initialize each timer and bind it to a CPU. The S32G platforms can have 2, 4 or 8 CPUs, and this kind of configuration can appear unusual, as we may end up with two PITs used as clockevents for the first two CPUs while the other CPUs use the architected timers. However, in an automotive context, the platform can be partitioned to assign two CPUs to Linux and the other CPUs to a third-party OS. The PITs are then used for their specific features, like the ability to freeze the timer, which is needed for instance for debugging purposes. The setup found for this platform binds the timer instances to CPU0 and CPU1.
A counter is incremented when a timer is successfully initialized and assigned to a CPU. This counter is used as an index for the CPU number and to detect when we reach the maximum possible instances for the platform. That in turn triggers the CPU hotplug callbacks to achieve the per-CPU setup. It is the exact same mechanism found in the NXP STM driver. If the timers must be bound to different CPUs, it would require an additional mechanism which is not part of these changes. Tested on an s32g274a-rdb2. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250804152344.1109310-21-daniel.lezcano@linaro.org --- drivers/clocksource/timer-nxp-pit.c | 128 +++++++++++++++++++++++----- 1 file changed, 109 insertions(+), 19 deletions(-) diff --git a/drivers/clocksource/timer-nxp-pit.c b/drivers/clocksource/timer-nxp-pit.c index 2a0ee4109ead92..2d0a3554b6bf7d 100644 --- a/drivers/clocksource/timer-nxp-pit.c +++ b/drivers/clocksource/timer-nxp-pit.c @@ -1,14 +1,16 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2012-2013 Freescale Semiconductor, Inc. + * Copyright 2018,2021-2025 NXP */ - #include #include +#include #include #include #include #include +#include /* * Each pit takes 0x10 Bytes register space */ @@ -37,11 +39,23 @@ struct pit_timer { void __iomem *clksrc_base; void __iomem *clkevt_base; - unsigned long cycle_per_jiffy; struct clock_event_device ced; struct clocksource cs; + int rate; +}; + +struct pit_timer_data { + int max_pit_instances; }; +static DEFINE_PER_CPU(struct pit_timer *, pit_timers); + +/* + * Global structure for multiple PITs initialization + */ +static int pit_instances; +static int max_pit_instances = 1; + static void __iomem *sched_clock_base; static inline struct pit_timer *ced_to_pit(struct clock_event_device *ced) @@ -98,8 +112,8 @@ static u64 pit_timer_clocksource_read(struct clocksource *cs) return (u64)~readl(PITCVAL(pit->clksrc_base)); } -static int __init pit_clocksource_init(struct pit_timer *pit, const char *name, - void __iomem *base, unsigned long rate) +static int pit_clocksource_init(struct pit_timer *pit, const char *name, + void __iomem *base, unsigned long rate) { /* * The channels 0 and 1 can be chained to build a 64-bit @@ -155,7 +169,7 @@ static int pit_set_periodic(struct clock_event_device *ced) { struct pit_timer *pit = ced_to_pit(ced); - pit_set_next_event(pit->cycle_per_jiffy, ced); + pit_set_next_event(pit->rate / HZ, ced); return 0; } @@ -181,24 +195,28 @@ static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int __init pit_clockevent_init(struct pit_timer *pit, const char *name, - void __iomem *base, unsigned long rate, - int irq, unsigned int cpu) +static int pit_clockevent_per_cpu_init(struct pit_timer *pit, const char *name, + void __iomem *base, unsigned long rate, + int irq, unsigned int cpu) { + int ret; + /* * The channels 0 and 1 can be chained to build a 64-bit * timer.
Let's use the channel 3 as a clockevent and leave * the channels 0 and 1 unused for anyone else who needs them */ pit->clkevt_base = base + PIT_CH(3); - pit->cycle_per_jiffy = rate / (HZ); + pit->rate = rate; pit_timer_disable(pit->clkevt_base); pit_timer_irqack(pit); - BUG_ON(request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL, - name, &pit->ced)); + ret = request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_NOBALANCING, + name, &pit->ced); + if (ret) + return ret; pit->ced.cpumask = cpumask_of(cpu); pit->ced.irq = irq; @@ -210,6 +228,32 @@ static int __init pit_clockevent_init(struct pit_timer *pit, const char *name, pit->ced.set_next_event = pit_set_next_event; pit->ced.rating = 300; + per_cpu(pit_timers, cpu) = pit; + + return 0; +} + +static void pit_clockevent_per_cpu_exit(struct pit_timer *pit, unsigned int cpu) +{ + pit_timer_disable(pit->clkevt_base); + free_irq(pit->ced.irq, &pit->ced); + per_cpu(pit_timers, cpu) = NULL; +} + +static int pit_clockevent_starting_cpu(unsigned int cpu) +{ + struct pit_timer *pit = per_cpu(pit_timers, cpu); + int ret; + + if (!pit) + return 0; + + ret = irq_force_affinity(pit->ced.irq, cpumask_of(cpu)); + if (ret) { + pit_clockevent_per_cpu_exit(pit, cpu); + return ret; + } + /* * The value for the LDVAL register trigger is calculated as: * LDVAL trigger = (period / clock period) - 1 @@ -218,12 +262,12 @@ static int __init pit_clockevent_init(struct pit_timer *pit, const char *name, * LDVAL trigger value is 1. And then the min_delta is * minimal LDVAL trigger value + 1, and the max_delta is full 32-bit. */ - clockevents_config_and_register(&pit->ced, rate, 2, 0xffffffff); + clockevents_config_and_register(&pit->ced, pit->rate, 2, 0xffffffff); return 0; } -static int __init pit_timer_init(struct device_node *np) +static int pit_timer_init(struct device_node *np) { struct pit_timer *pit; struct clk *pit_clk; @@ -253,7 +297,7 @@ static int __init pit_timer_init(struct device_node *np) pit_clk = of_clk_get(np, 0); if (IS_ERR(pit_clk)) { ret = PTR_ERR(pit_clk); - goto out_iounmap; + goto out_irq_dispose_mapping; } ret = clk_prepare_enable(pit_clk); @@ -262,16 +306,31 @@ static int __init pit_timer_init(struct device_node *np) clk_rate = clk_get_rate(pit_clk); - /* enable the pit module */ - pit_module_enable(timer_base); + pit_module_disable(timer_base); ret = pit_clocksource_init(pit, name, timer_base, clk_rate); - if (ret) + if (ret) { + pr_err("Failed to initialize clocksource '%pOF'\n", np); goto out_pit_module_disable; + } - ret = pit_clockevent_init(pit, name, timer_base, clk_rate, irq, 0); - if (ret) + ret = pit_clockevent_per_cpu_init(pit, name, timer_base, clk_rate, irq, pit_instances); + if (ret) { + pr_err("Failed to initialize clockevent '%pOF'\n", np); goto out_pit_clocksource_unregister; + } + + /* enable the pit module */ + pit_module_enable(timer_base); + + pit_instances++; + + if (pit_instances == max_pit_instances) { + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "PIT timer:starting", + pit_clockevent_starting_cpu, NULL); + if (ret < 0) + goto out_pit_clocksource_unregister; + } return 0; @@ -282,6 +341,8 @@ static int __init pit_timer_init(struct device_node *np) clk_disable_unprepare(pit_clk); out_clk_put: clk_put(pit_clk); +out_irq_dispose_mapping: + irq_dispose_mapping(irq); out_iounmap: iounmap(timer_base); out_kfree: @@ -289,4 +350,33 @@ static int __init pit_timer_init(struct device_node *np) return ret; } + +static int pit_timer_probe(struct platform_device *pdev) +{ + const struct pit_timer_data *pit_timer_data; + 
+ pit_timer_data = of_device_get_match_data(&pdev->dev); + if (pit_timer_data) + max_pit_instances = pit_timer_data->max_pit_instances; + + return pit_timer_init(pdev->dev.of_node); +} + +static struct pit_timer_data s32g2_data = { .max_pit_instances = 2 }; + +static const struct of_device_id pit_timer_of_match[] = { + { .compatible = "nxp,s32g2-pit", .data = &s32g2_data }, + { } +}; +MODULE_DEVICE_TABLE(of, pit_timer_of_match); + +static struct platform_driver nxp_pit_driver = { + .driver = { + .name = "nxp-pit", + .of_match_table = pit_timer_of_match, + }, + .probe = pit_timer_probe, +}; +module_platform_driver(nxp_pit_driver); + TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init); From 5669d92f3efa449c3906cbf15e676768a8f4d502 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 14 Aug 2025 16:46:19 +0100 Subject: [PATCH 2660/3206] ACPI: GTDT: Generate platform devices for MMIO timers In preparation for the MMIO timer support code becoming an actual driver, mimic what is done for the SBSA watchdog and expose a synthetic device for each MMIO timer block. Signed-off-by: Marc Zyngier Signed-off-by: Daniel Lezcano Tested-by: Sudeep Holla Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20250807160243.1970533-2-maz@kernel.org --- drivers/acpi/arm64/gtdt.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 70f8290b659de5..fd995a1d3d248b 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -388,11 +388,11 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd, return 0; } -static int __init gtdt_sbsa_gwdt_init(void) +static int __init gtdt_platform_timer_init(void) { void *platform_timer; struct acpi_table_header *table; - int ret, timer_count, gwdt_count = 0; + int ret, timer_count, gwdt_count = 0, mmio_timer_count = 0; if (acpi_disabled) return 0; @@ -414,20 +414,41 @@ static int __init gtdt_sbsa_gwdt_init(void) goto out_put_gtdt; for_each_platform_timer(platform_timer) { + ret = 0; + if (is_non_secure_watchdog(platform_timer)) { ret = gtdt_import_sbsa_gwdt(platform_timer, gwdt_count); if (ret) - break; + continue; gwdt_count++; + } else if (is_timer_block(platform_timer)) { + struct arch_timer_mem atm = {}; + struct platform_device *pdev; + + ret = gtdt_parse_timer_block(platform_timer, &atm); + if (ret) + continue; + + pdev = platform_device_register_data(NULL, "gtdt-arm-mmio-timer", + gwdt_count, &atm, + sizeof(atm)); + if (IS_ERR(pdev)) { + pr_err("Can't register timer %d\n", gwdt_count); + continue; + } + + mmio_timer_count++; } } if (gwdt_count) pr_info("found %d SBSA generic Watchdog(s).\n", gwdt_count); + if (mmio_timer_count) + pr_info("found %d Generic MMIO timer(s).\n", mmio_timer_count); out_put_gtdt: acpi_put_table(table); return ret; } -device_initcall(gtdt_sbsa_gwdt_init); +device_initcall(gtdt_platform_timer_init); From 4891f01527bbbb0cf0e515c803ade67a17e247bb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 14 Aug 2025 16:46:20 +0100 Subject: [PATCH 2661/3206] clocksource/drivers/arm_arch_timer: Add standalone MMIO driver Add a new driver for the MMIO side of the ARM architected timer. Most of it has been lifted from the existing arch timer code, massaged, and finally rewritten. It supports both DT and ACPI as firmware descriptions. 
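The DT and ACPI cases converge on a single probe routine: the driver
registers twice, once with an OF match table for "arm,armv7-timer-mem"
nodes and once under the "gtdt-arm-mmio-timer" name that the GTDT patch
above uses for its synthetic platform devices. A condensed sketch of
that dual-registration pattern (the probe body is a stub standing in
for the real frame parsing):

/* Sketch only: one probe shared by a DT-matched and an ACPI/GTDT-named
 * platform driver. The timer-frame parsing is stubbed out. */
#include <linux/mod_devicetable.h>
#include <linux/platform_device.h>

static int mmio_timer_probe(struct platform_device *pdev)
{
	if (pdev->dev.of_node) {
		/* DT path: parse the frames from the device node */
		return 0;
	}
	/* ACPI path: GTDT attached the parsed frames as platform data */
	return dev_get_platdata(&pdev->dev) ? 0 : -ENODEV;
}

static const struct of_device_id mmio_timer_of_match[] = {
	{ .compatible = "arm,armv7-timer-mem" },
	{ }
};

static struct platform_driver mmio_timer_dt_drv = {
	.driver = {
		.name = "arch-timer-mmio",
		.of_match_table = mmio_timer_of_match,
	},
	.probe = mmio_timer_probe,
};
builtin_platform_driver(mmio_timer_dt_drv);

/* Binds by name to the "gtdt-arm-mmio-timer" devices created from ACPI */
static struct platform_driver mmio_timer_acpi_drv = {
	.driver = { .name = "gtdt-arm-mmio-timer" },
	.probe = mmio_timer_probe,
};
builtin_platform_driver(mmio_timer_acpi_drv);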
Signed-off-by: Marc Zyngier Signed-off-by: Daniel Lezcano Tested-by: Sudeep Holla Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20250814154622.10193-3-maz@kernel.org --- MAINTAINERS | 1 + drivers/clocksource/arm_arch_timer_mmio.c | 421 ++++++++++++++++++++++ 2 files changed, 422 insertions(+) create mode 100644 drivers/clocksource/arm_arch_timer_mmio.c diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..2243b726edd7e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1990,6 +1990,7 @@ S: Maintained F: arch/arm/include/asm/arch_timer.h F: arch/arm64/include/asm/arch_timer.h F: drivers/clocksource/arm_arch_timer.c +F: drivers/clocksource/arm_arch_timer_mmio.c ARM GENERIC INTERRUPT CONTROLLER DRIVERS M: Marc Zyngier diff --git a/drivers/clocksource/arm_arch_timer_mmio.c b/drivers/clocksource/arm_arch_timer_mmio.c new file mode 100644 index 00000000000000..3522d1d8cb97bd --- /dev/null +++ b/drivers/clocksource/arm_arch_timer_mmio.c @@ -0,0 +1,421 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ARM Generic Memory Mapped Timer support + * + * Split from drivers/clocksource/arm_arch_timer.c + * + * Copyright (C) 2011 ARM Ltd. + * All Rights Reserved + */ + +#define pr_fmt(fmt) "arch_timer_mmio: " fmt + +#include +#include +#include +#include +#include +#include + +#include + +#define CNTTIDR 0x08 +#define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) + +#define CNTACR(n) (0x40 + ((n) * 4)) +#define CNTACR_RPCT BIT(0) +#define CNTACR_RVCT BIT(1) +#define CNTACR_RFRQ BIT(2) +#define CNTACR_RVOFF BIT(3) +#define CNTACR_RWVT BIT(4) +#define CNTACR_RWPT BIT(5) + +#define CNTPCT_LO 0x00 +#define CNTVCT_LO 0x08 +#define CNTFRQ 0x10 +#define CNTP_CVAL_LO 0x20 +#define CNTP_CTL 0x2c +#define CNTV_CVAL_LO 0x30 +#define CNTV_CTL 0x3c + +enum arch_timer_access { + PHYS_ACCESS, + VIRT_ACCESS, +}; + +struct arch_timer { + struct clock_event_device evt; + struct arch_timer_mem *gt_block; + void __iomem *base; + enum arch_timer_access access; + u32 rate; +}; + +#define evt_to_arch_timer(e) container_of(e, struct arch_timer, evt) + +static void arch_timer_mmio_write(struct arch_timer *timer, + enum arch_timer_reg reg, u64 val) +{ + switch (timer->access) { + case PHYS_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + writel_relaxed((u32)val, timer->base + CNTP_CTL); + return; + case ARCH_TIMER_REG_CVAL: + /* + * Not guaranteed to be atomic, so the timer + * must be disabled at this point. + */ + writeq_relaxed(val, timer->base + CNTP_CVAL_LO); + return; + } + break; + case VIRT_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + writel_relaxed((u32)val, timer->base + CNTV_CTL); + return; + case ARCH_TIMER_REG_CVAL: + /* Same restriction as above */ + writeq_relaxed(val, timer->base + CNTV_CVAL_LO); + return; + } + break; + } + + /* Should never be here */ + WARN_ON_ONCE(1); +} + +static u32 arch_timer_mmio_read(struct arch_timer *timer, enum arch_timer_reg reg) +{ + switch (timer->access) { + case PHYS_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + return readl_relaxed(timer->base + CNTP_CTL); + default: + break; + } + break; + case VIRT_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + return readl_relaxed(timer->base + CNTV_CTL); + default: + break; + } + break; + } + + /* Should never be here */ + WARN_ON_ONCE(1); + return 0; +} + +static noinstr u64 arch_counter_mmio_get_cnt(struct arch_timer *t) +{ + int offset_lo = t->access == VIRT_ACCESS ? 
CNTVCT_LO : CNTPCT_LO; + u32 cnt_lo, cnt_hi, tmp_hi; + + do { + cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); + cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); + tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); + } while (cnt_hi != tmp_hi); + + return ((u64) cnt_hi << 32) | cnt_lo; +} + +static int arch_timer_mmio_shutdown(struct clock_event_device *clk) +{ + struct arch_timer *at = evt_to_arch_timer(clk); + unsigned long ctrl; + + ctrl = arch_timer_mmio_read(at, ARCH_TIMER_REG_CTRL); + ctrl &= ~ARCH_TIMER_CTRL_ENABLE; + arch_timer_mmio_write(at, ARCH_TIMER_REG_CTRL, ctrl); + + return 0; +} + +static int arch_timer_mmio_set_next_event(unsigned long evt, + struct clock_event_device *clk) +{ + struct arch_timer *timer = evt_to_arch_timer(clk); + unsigned long ctrl; + u64 cnt; + + ctrl = arch_timer_mmio_read(timer, ARCH_TIMER_REG_CTRL); + + /* Timer must be disabled before programming CVAL */ + if (ctrl & ARCH_TIMER_CTRL_ENABLE) { + ctrl &= ~ARCH_TIMER_CTRL_ENABLE; + arch_timer_mmio_write(timer, ARCH_TIMER_REG_CTRL, ctrl); + } + + ctrl |= ARCH_TIMER_CTRL_ENABLE; + ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; + + cnt = arch_counter_mmio_get_cnt(timer); + + arch_timer_mmio_write(timer, ARCH_TIMER_REG_CVAL, evt + cnt); + arch_timer_mmio_write(timer, ARCH_TIMER_REG_CTRL, ctrl); + return 0; +} + +static irqreturn_t arch_timer_mmio_handler(int irq, void *dev_id) +{ + struct clock_event_device *evt = dev_id; + struct arch_timer *at = evt_to_arch_timer(evt); + unsigned long ctrl; + + ctrl = arch_timer_mmio_read(at, ARCH_TIMER_REG_CTRL); + if (ctrl & ARCH_TIMER_CTRL_IT_STAT) { + ctrl |= ARCH_TIMER_CTRL_IT_MASK; + arch_timer_mmio_write(at, ARCH_TIMER_REG_CTRL, ctrl); + evt->event_handler(evt); + return IRQ_HANDLED; + } + + return IRQ_NONE; +} + +static struct arch_timer_mem_frame *find_best_frame(struct platform_device *pdev) +{ + struct arch_timer_mem_frame *frame, *best_frame = NULL; + struct arch_timer *at = platform_get_drvdata(pdev); + void __iomem *cntctlbase; + u32 cnttidr; + + cntctlbase = ioremap(at->gt_block->cntctlbase, at->gt_block->size); + if (!cntctlbase) { + dev_err(&pdev->dev, "Can't map CNTCTLBase @ %pa\n", + &at->gt_block->cntctlbase); + return NULL; + } + + cnttidr = readl_relaxed(cntctlbase + CNTTIDR); + + /* + * Try to find a virtual capable frame. Otherwise fall back to a + * physical capable frame. 
+	 */
+	for (int i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) {
+		u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
+			     CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
+
+		frame = &at->gt_block->frame[i];
+		if (!frame->valid)
+			continue;
+
+		/* Try enabling everything, and see what sticks */
+		writel_relaxed(cntacr, cntctlbase + CNTACR(i));
+		cntacr = readl_relaxed(cntctlbase + CNTACR(i));
+
+		/* Pick a suitable frame for which we have an IRQ */
+		if ((cnttidr & CNTTIDR_VIRT(i)) &&
+		    !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT)) &&
+		    frame->virt_irq) {
+			best_frame = frame;
+			at->access = VIRT_ACCESS;
+			break;
+		}
+
+		if ((~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) ||
+		    !frame->phys_irq)
+			continue;
+
+		at->access = PHYS_ACCESS;
+		best_frame = frame;
+	}
+
+	iounmap(cntctlbase);
+
+	return best_frame;
+}
+
+static void arch_timer_mmio_setup(struct arch_timer *at, int irq)
+{
+	at->evt = (struct clock_event_device) {
+		.features = (CLOCK_EVT_FEAT_ONESHOT |
+			     CLOCK_EVT_FEAT_DYNIRQ),
+		.name = "arch_mem_timer",
+		.rating = 400,
+		.cpumask = cpu_possible_mask,
+		.irq = irq,
+		.set_next_event = arch_timer_mmio_set_next_event,
+		.set_state_oneshot_stopped = arch_timer_mmio_shutdown,
+		.set_state_shutdown = arch_timer_mmio_shutdown,
+	};
+
+	at->evt.set_state_shutdown(&at->evt);
+
+	clockevents_config_and_register(&at->evt, at->rate, 0xf,
+					(unsigned long)CLOCKSOURCE_MASK(56));
+
+	enable_irq(at->evt.irq);
+}
+
+static int arch_timer_mmio_frame_register(struct platform_device *pdev,
+					  struct arch_timer_mem_frame *frame)
+{
+	struct arch_timer *at = platform_get_drvdata(pdev);
+	struct device_node *np = pdev->dev.of_node;
+	int ret, irq;
+	u32 rate;
+
+	if (!devm_request_mem_region(&pdev->dev, frame->cntbase, frame->size,
+				     "arch_mem_timer"))
+		return -EBUSY;
+
+	at->base = devm_ioremap(&pdev->dev, frame->cntbase, frame->size);
+	if (!at->base) {
+		dev_err(&pdev->dev, "Can't map frame's registers\n");
+		return -ENXIO;
+	}
+
+	/*
+	 * Allow "clock-frequency" to override the probed rate. If neither
+	 * leads to something useful, use the CPU timer frequency as the
+	 * fallback. The nice thing about that last point is that we wouldn't
+	 * have made it here if we didn't have a valid frequency.
+	 */
+	rate = readl_relaxed(at->base + CNTFRQ);
+
+	if (!np || of_property_read_u32(np, "clock-frequency", &at->rate))
+		at->rate = rate;
+
+	if (!at->rate)
+		at->rate = arch_timer_get_rate();
+
+	irq = at->access == VIRT_ACCESS ? frame->virt_irq : frame->phys_irq;
+	ret = devm_request_irq(&pdev->dev, irq, arch_timer_mmio_handler,
+			       IRQF_TIMER | IRQF_NO_AUTOEN, "arch_mem_timer",
+			       &at->evt);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to request mem timer irq\n");
+		return ret;
+	}
+
+	/* After this point, we're not allowed to fail anymore */
+	arch_timer_mmio_setup(at, irq);
+	return 0;
+}
+
+static int of_populate_gt_block(struct platform_device *pdev,
+				struct arch_timer *at)
+{
+	struct resource res;
+
+	if (of_address_to_resource(pdev->dev.of_node, 0, &res))
+		return -EINVAL;
+
+	at->gt_block->cntctlbase = res.start;
+	at->gt_block->size = resource_size(&res);
+
+	for_each_available_child_of_node_scoped(pdev->dev.of_node, frame_node) {
+		struct arch_timer_mem_frame *frame;
+		u32 n;
+
+		if (of_property_read_u32(frame_node, "frame-number", &n)) {
+			dev_err(&pdev->dev, FW_BUG "Missing frame-number\n");
+			return -EINVAL;
+		}
+		if (n >= ARCH_TIMER_MEM_MAX_FRAMES) {
+			dev_err(&pdev->dev,
+				FW_BUG "Wrong frame-number, only 0-%u are permitted\n",
+				ARCH_TIMER_MEM_MAX_FRAMES - 1);
+			return -EINVAL;
+		}
+
+		frame = &at->gt_block->frame[n];
+
+		if (frame->valid) {
+			dev_err(&pdev->dev, FW_BUG "Duplicated frame-number\n");
+			return -EINVAL;
+		}
+
+		if (of_address_to_resource(frame_node, 0, &res))
+			return -EINVAL;
+
+		frame->cntbase = res.start;
+		frame->size = resource_size(&res);
+
+		frame->phys_irq = irq_of_parse_and_map(frame_node, 0);
+		frame->virt_irq = irq_of_parse_and_map(frame_node, 1);
+
+		frame->valid = true;
+	}
+
+	return 0;
+}
+
+static int arch_timer_mmio_probe(struct platform_device *pdev)
+{
+	struct arch_timer_mem_frame *frame;
+	struct arch_timer *at;
+	struct device_node *np;
+	int ret;
+
+	np = pdev->dev.of_node;
+
+	at = devm_kmalloc(&pdev->dev, sizeof(*at), GFP_KERNEL | __GFP_ZERO);
+	if (!at)
+		return -ENOMEM;
+
+	if (np) {
+		at->gt_block = devm_kmalloc(&pdev->dev, sizeof(*at->gt_block),
+					    GFP_KERNEL | __GFP_ZERO);
+		if (!at->gt_block)
+			return -ENOMEM;
+		ret = of_populate_gt_block(pdev, at);
+		if (ret)
+			return ret;
+	} else {
+		at->gt_block = dev_get_platdata(&pdev->dev);
+	}
+
+	platform_set_drvdata(pdev, at);
+
+	frame = find_best_frame(pdev);
+	if (!frame) {
+		dev_err(&pdev->dev,
+			"Unable to find a suitable frame in timer @ %pa\n",
+			&at->gt_block->cntctlbase);
+		return -EINVAL;
+	}
+
+	ret = arch_timer_mmio_frame_register(pdev, frame);
+	if (!ret)
+		dev_info(&pdev->dev,
+			 "mmio timer running at %lu.%02luMHz (%s)\n",
+			 (unsigned long)at->rate / 1000000,
+			 (unsigned long)(at->rate / 10000) % 100,
+			 at->access == VIRT_ACCESS ? "virt" : "phys");
+
+	return ret;
+}
+
+static const struct of_device_id arch_timer_mmio_of_table[] = {
+	{ .compatible = "arm,armv7-timer-mem", },
+	{}
+};
+
+static struct platform_driver arch_timer_mmio_drv = {
+	.driver = {
+		.name = "arch-timer-mmio",
+		.of_match_table = arch_timer_mmio_of_table,
+	},
+	.probe = arch_timer_mmio_probe,
+};
+builtin_platform_driver(arch_timer_mmio_drv);
+
+static struct platform_driver arch_timer_mmio_acpi_drv = {
+	.driver = {
+		.name = "gtdt-arm-mmio-timer",
+	},
+	.probe = arch_timer_mmio_probe,
+};
+builtin_platform_driver(arch_timer_mmio_acpi_drv);

From 0f67b56d84b4c49adfd61f19f81f84ec613ab51a Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 14 Aug 2025 16:46:21 +0100
Subject: [PATCH 2662/3206] clocksource/drivers/arm_arch_timer_mmio: Switch over to standalone driver

Remove all the MMIO support from the per-CPU timer driver, and
switch over to the standalone driver.
Signed-off-by: Marc Zyngier Signed-off-by: Daniel Lezcano Tested-by: Sudeep Holla Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20250814154622.10193-4-maz@kernel.org --- drivers/clocksource/Makefile | 1 + drivers/clocksource/arm_arch_timer.c | 686 +++------------------------ include/clocksource/arm_arch_timer.h | 5 - 3 files changed, 66 insertions(+), 626 deletions(-) diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 77a0f08eb43b75..ec4452ee958f1a 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_REALTEK_OTTO_TIMER) += timer-rtl-otto.o obj-$(CONFIG_ARC_TIMERS) += arc_timer.o obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer.o +obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer_mmio.o obj-$(CONFIG_ARM_GLOBAL_TIMER) += arm_global_timer.o obj-$(CONFIG_ARMV7M_SYSTICK) += armv7m_systick.o obj-$(CONFIG_ARM_TIMER_SP804) += timer-sp804.o diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 80ba6a54248c4c..90aeff44a2764c 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -34,42 +34,12 @@ #include -#define CNTTIDR 0x08 -#define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) - -#define CNTACR(n) (0x40 + ((n) * 4)) -#define CNTACR_RPCT BIT(0) -#define CNTACR_RVCT BIT(1) -#define CNTACR_RFRQ BIT(2) -#define CNTACR_RVOFF BIT(3) -#define CNTACR_RWVT BIT(4) -#define CNTACR_RWPT BIT(5) - -#define CNTPCT_LO 0x00 -#define CNTVCT_LO 0x08 -#define CNTFRQ 0x10 -#define CNTP_CVAL_LO 0x20 -#define CNTP_CTL 0x2c -#define CNTV_CVAL_LO 0x30 -#define CNTV_CTL 0x3c - /* * The minimum amount of time a generic counter is guaranteed to not roll over * (40 years) */ #define MIN_ROLLOVER_SECS (40ULL * 365 * 24 * 3600) -static unsigned arch_timers_present __initdata; - -struct arch_timer { - void __iomem *base; - struct clock_event_device evt; -}; - -static struct arch_timer *arch_timer_mem __ro_after_init; - -#define to_arch_timer(e) container_of(e, struct arch_timer, evt) - static u32 arch_timer_rate __ro_after_init; static int arch_timer_ppi[ARCH_TIMER_MAX_TIMER_PPI] __ro_after_init; @@ -85,7 +55,6 @@ static struct clock_event_device __percpu *arch_timer_evt; static enum arch_timer_ppi_nr arch_timer_uses_ppi __ro_after_init = ARCH_TIMER_VIRT_PPI; static bool arch_timer_c3stop __ro_after_init; -static bool arch_timer_mem_use_virtual __ro_after_init; static bool arch_counter_suspend_stop __ro_after_init; #ifdef CONFIG_GENERIC_GETTIMEOFDAY static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_ARCHTIMER; @@ -121,76 +90,6 @@ static int arch_counter_get_width(void) /* * Architected system timer support. */ - -static __always_inline -void arch_timer_reg_write(int access, enum arch_timer_reg reg, u64 val, - struct clock_event_device *clk) -{ - if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - writel_relaxed((u32)val, timer->base + CNTP_CTL); - break; - case ARCH_TIMER_REG_CVAL: - /* - * Not guaranteed to be atomic, so the timer - * must be disabled at this point. 
- */ - writeq_relaxed(val, timer->base + CNTP_CVAL_LO); - break; - default: - BUILD_BUG(); - } - } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - writel_relaxed((u32)val, timer->base + CNTV_CTL); - break; - case ARCH_TIMER_REG_CVAL: - /* Same restriction as above */ - writeq_relaxed(val, timer->base + CNTV_CVAL_LO); - break; - default: - BUILD_BUG(); - } - } else { - arch_timer_reg_write_cp15(access, reg, val); - } -} - -static __always_inline -u32 arch_timer_reg_read(int access, enum arch_timer_reg reg, - struct clock_event_device *clk) -{ - u32 val; - - if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - val = readl_relaxed(timer->base + CNTP_CTL); - break; - default: - BUILD_BUG(); - } - } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - val = readl_relaxed(timer->base + CNTV_CTL); - break; - default: - BUILD_BUG(); - } - } else { - val = arch_timer_reg_read_cp15(access, reg); - } - - return val; -} - static noinstr u64 raw_counter_get_cntpct_stable(void) { return __arch_counter_get_cntpct_stable(); @@ -424,7 +323,7 @@ void erratum_set_next_event_generic(const int access, unsigned long evt, unsigned long ctrl; u64 cval; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; @@ -436,7 +335,7 @@ void erratum_set_next_event_generic(const int access, unsigned long evt, write_sysreg(cval, cntv_cval_el0); } - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); } static __maybe_unused int erratum_set_next_event_virt(unsigned long evt, @@ -667,10 +566,10 @@ static __always_inline irqreturn_t timer_handler(const int access, { unsigned long ctrl; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, evt); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); if (ctrl & ARCH_TIMER_CTRL_IT_STAT) { ctrl |= ARCH_TIMER_CTRL_IT_MASK; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, evt); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); evt->event_handler(evt); return IRQ_HANDLED; } @@ -692,28 +591,14 @@ static irqreturn_t arch_timer_handler_phys(int irq, void *dev_id) return timer_handler(ARCH_TIMER_PHYS_ACCESS, evt); } -static irqreturn_t arch_timer_handler_phys_mem(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - - return timer_handler(ARCH_TIMER_MEM_PHYS_ACCESS, evt); -} - -static irqreturn_t arch_timer_handler_virt_mem(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - - return timer_handler(ARCH_TIMER_MEM_VIRT_ACCESS, evt); -} - static __always_inline int arch_timer_shutdown(const int access, struct clock_event_device *clk) { unsigned long ctrl; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl &= ~ARCH_TIMER_CTRL_ENABLE; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); return 0; } @@ -728,23 +613,13 @@ static int arch_timer_shutdown_phys(struct clock_event_device *clk) return arch_timer_shutdown(ARCH_TIMER_PHYS_ACCESS, clk); } -static int arch_timer_shutdown_virt_mem(struct 
clock_event_device *clk) -{ - return arch_timer_shutdown(ARCH_TIMER_MEM_VIRT_ACCESS, clk); -} - -static int arch_timer_shutdown_phys_mem(struct clock_event_device *clk) -{ - return arch_timer_shutdown(ARCH_TIMER_MEM_PHYS_ACCESS, clk); -} - static __always_inline void set_next_event(const int access, unsigned long evt, struct clock_event_device *clk) { unsigned long ctrl; u64 cnt; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; @@ -753,8 +628,8 @@ static __always_inline void set_next_event(const int access, unsigned long evt, else cnt = __arch_counter_get_cntvct(); - arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CVAL, evt + cnt); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); } static int arch_timer_set_next_event_virt(unsigned long evt, @@ -771,60 +646,6 @@ static int arch_timer_set_next_event_phys(unsigned long evt, return 0; } -static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) -{ - u32 cnt_lo, cnt_hi, tmp_hi; - - do { - cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); - cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); - tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); - } while (cnt_hi != tmp_hi); - - return ((u64) cnt_hi << 32) | cnt_lo; -} - -static __always_inline void set_next_event_mem(const int access, unsigned long evt, - struct clock_event_device *clk) -{ - struct arch_timer *timer = to_arch_timer(clk); - unsigned long ctrl; - u64 cnt; - - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); - - /* Timer must be disabled before programming CVAL */ - if (ctrl & ARCH_TIMER_CTRL_ENABLE) { - ctrl &= ~ARCH_TIMER_CTRL_ENABLE; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); - } - - ctrl |= ARCH_TIMER_CTRL_ENABLE; - ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; - - if (access == ARCH_TIMER_MEM_VIRT_ACCESS) - cnt = arch_counter_get_cnt_mem(timer, CNTVCT_LO); - else - cnt = arch_counter_get_cnt_mem(timer, CNTPCT_LO); - - arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); -} - -static int arch_timer_set_next_event_virt_mem(unsigned long evt, - struct clock_event_device *clk) -{ - set_next_event_mem(ARCH_TIMER_MEM_VIRT_ACCESS, evt, clk); - return 0; -} - -static int arch_timer_set_next_event_phys_mem(unsigned long evt, - struct clock_event_device *clk) -{ - set_next_event_mem(ARCH_TIMER_MEM_PHYS_ACCESS, evt, clk); - return 0; -} - static u64 __arch_timer_check_delta(void) { #ifdef CONFIG_ARM64 @@ -850,63 +671,41 @@ static u64 __arch_timer_check_delta(void) return CLOCKSOURCE_MASK(arch_counter_get_width()); } -static void __arch_timer_setup(unsigned type, - struct clock_event_device *clk) +static void __arch_timer_setup(struct clock_event_device *clk) { + typeof(clk->set_next_event) sne; u64 max_delta; clk->features = CLOCK_EVT_FEAT_ONESHOT; - if (type == ARCH_TIMER_TYPE_CP15) { - typeof(clk->set_next_event) sne; - - arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL); - - if (arch_timer_c3stop) - clk->features |= CLOCK_EVT_FEAT_C3STOP; - clk->name = "arch_sys_timer"; - clk->rating = 450; - clk->cpumask = cpumask_of(smp_processor_id()); - clk->irq = arch_timer_ppi[arch_timer_uses_ppi]; 
- switch (arch_timer_uses_ppi) { - case ARCH_TIMER_VIRT_PPI: - clk->set_state_shutdown = arch_timer_shutdown_virt; - clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; - sne = erratum_handler(set_next_event_virt); - break; - case ARCH_TIMER_PHYS_SECURE_PPI: - case ARCH_TIMER_PHYS_NONSECURE_PPI: - case ARCH_TIMER_HYP_PPI: - clk->set_state_shutdown = arch_timer_shutdown_phys; - clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; - sne = erratum_handler(set_next_event_phys); - break; - default: - BUG(); - } + arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL); - clk->set_next_event = sne; - max_delta = __arch_timer_check_delta(); - } else { - clk->features |= CLOCK_EVT_FEAT_DYNIRQ; - clk->name = "arch_mem_timer"; - clk->rating = 400; - clk->cpumask = cpu_possible_mask; - if (arch_timer_mem_use_virtual) { - clk->set_state_shutdown = arch_timer_shutdown_virt_mem; - clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem; - clk->set_next_event = - arch_timer_set_next_event_virt_mem; - } else { - clk->set_state_shutdown = arch_timer_shutdown_phys_mem; - clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem; - clk->set_next_event = - arch_timer_set_next_event_phys_mem; - } - - max_delta = CLOCKSOURCE_MASK(56); + if (arch_timer_c3stop) + clk->features |= CLOCK_EVT_FEAT_C3STOP; + clk->name = "arch_sys_timer"; + clk->rating = 450; + clk->cpumask = cpumask_of(smp_processor_id()); + clk->irq = arch_timer_ppi[arch_timer_uses_ppi]; + switch (arch_timer_uses_ppi) { + case ARCH_TIMER_VIRT_PPI: + clk->set_state_shutdown = arch_timer_shutdown_virt; + clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; + sne = erratum_handler(set_next_event_virt); + break; + case ARCH_TIMER_PHYS_SECURE_PPI: + case ARCH_TIMER_PHYS_NONSECURE_PPI: + case ARCH_TIMER_HYP_PPI: + clk->set_state_shutdown = arch_timer_shutdown_phys; + clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; + sne = erratum_handler(set_next_event_phys); + break; + default: + BUG(); } + clk->set_next_event = sne; + max_delta = __arch_timer_check_delta(); + clk->set_state_shutdown(clk); clockevents_config_and_register(clk, arch_timer_rate, 0xf, max_delta); @@ -1029,7 +828,7 @@ static int arch_timer_starting_cpu(unsigned int cpu) struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt); u32 flags; - __arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk); + __arch_timer_setup(clk); flags = check_ppi_trigger(arch_timer_ppi[arch_timer_uses_ppi]); enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], flags); @@ -1075,22 +874,12 @@ static void __init arch_timer_of_configure_rate(u32 rate, struct device_node *np pr_warn("frequency not available\n"); } -static void __init arch_timer_banner(unsigned type) +static void __init arch_timer_banner(void) { - pr_info("%s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n", - type & ARCH_TIMER_TYPE_CP15 ? "cp15" : "", - type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? - " and " : "", - type & ARCH_TIMER_TYPE_MEM ? "mmio" : "", + pr_info("cp15 timer running at %lu.%02luMHz (%s).\n", (unsigned long)arch_timer_rate / 1000000, (unsigned long)(arch_timer_rate / 10000) % 100, - type & ARCH_TIMER_TYPE_CP15 ? - (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? "virt" : "phys" : - "", - type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? "/" : "", - type & ARCH_TIMER_TYPE_MEM ? - arch_timer_mem_use_virtual ? "virt" : "phys" : - ""); + (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? 
"virt" : "phys"); } u32 arch_timer_get_rate(void) @@ -1108,11 +897,6 @@ bool arch_timer_evtstrm_available(void) return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available); } -static noinstr u64 arch_counter_get_cntvct_mem(void) -{ - return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO); -} - static struct arch_timer_kvm_info arch_timer_kvm_info; struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) @@ -1120,42 +904,35 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) return &arch_timer_kvm_info; } -static void __init arch_counter_register(unsigned type) +static void __init arch_counter_register(void) { u64 (*scr)(void); + u64 (*rd)(void); u64 start_count; int width; - /* Register the CP15 based counter if we have one */ - if (type & ARCH_TIMER_TYPE_CP15) { - u64 (*rd)(void); - - if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || - arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { - if (arch_timer_counter_has_wa()) { - rd = arch_counter_get_cntvct_stable; - scr = raw_counter_get_cntvct_stable; - } else { - rd = arch_counter_get_cntvct; - scr = arch_counter_get_cntvct; - } + if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || + arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { + if (arch_timer_counter_has_wa()) { + rd = arch_counter_get_cntvct_stable; + scr = raw_counter_get_cntvct_stable; } else { - if (arch_timer_counter_has_wa()) { - rd = arch_counter_get_cntpct_stable; - scr = raw_counter_get_cntpct_stable; - } else { - rd = arch_counter_get_cntpct; - scr = arch_counter_get_cntpct; - } + rd = arch_counter_get_cntvct; + scr = arch_counter_get_cntvct; } - - arch_timer_read_counter = rd; - clocksource_counter.vdso_clock_mode = vdso_default; } else { - arch_timer_read_counter = arch_counter_get_cntvct_mem; - scr = arch_counter_get_cntvct_mem; + if (arch_timer_counter_has_wa()) { + rd = arch_counter_get_cntpct_stable; + scr = raw_counter_get_cntpct_stable; + } else { + rd = arch_counter_get_cntpct; + scr = arch_counter_get_cntpct; + } } + arch_timer_read_counter = rd; + clocksource_counter.vdso_clock_mode = vdso_default; + width = arch_counter_get_width(); clocksource_counter.mask = CLOCKSOURCE_MASK(width); cyclecounter.mask = CLOCKSOURCE_MASK(width); @@ -1303,76 +1080,10 @@ static int __init arch_timer_register(void) return err; } -static int __init arch_timer_mem_register(void __iomem *base, unsigned int irq) -{ - int ret; - irq_handler_t func; - - arch_timer_mem = kzalloc(sizeof(*arch_timer_mem), GFP_KERNEL); - if (!arch_timer_mem) - return -ENOMEM; - - arch_timer_mem->base = base; - arch_timer_mem->evt.irq = irq; - __arch_timer_setup(ARCH_TIMER_TYPE_MEM, &arch_timer_mem->evt); - - if (arch_timer_mem_use_virtual) - func = arch_timer_handler_virt_mem; - else - func = arch_timer_handler_phys_mem; - - ret = request_irq(irq, func, IRQF_TIMER, "arch_mem_timer", &arch_timer_mem->evt); - if (ret) { - pr_err("Failed to request mem timer irq\n"); - kfree(arch_timer_mem); - arch_timer_mem = NULL; - } - - return ret; -} - -static const struct of_device_id arch_timer_of_match[] __initconst = { - { .compatible = "arm,armv7-timer", }, - { .compatible = "arm,armv8-timer", }, - {}, -}; - -static const struct of_device_id arch_timer_mem_of_match[] __initconst = { - { .compatible = "arm,armv7-timer-mem", }, - {}, -}; - -static bool __init arch_timer_needs_of_probing(void) -{ - struct device_node *dn; - bool needs_probing = false; - unsigned int mask = ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM; - - /* We have two timers, and both device-tree nodes are probed. 
*/ - if ((arch_timers_present & mask) == mask) - return false; - - /* - * Only one type of timer is probed, - * check if we have another type of timer node in device-tree. - */ - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) - dn = of_find_matching_node(NULL, arch_timer_mem_of_match); - else - dn = of_find_matching_node(NULL, arch_timer_of_match); - - if (dn && of_device_is_available(dn)) - needs_probing = true; - - of_node_put(dn); - - return needs_probing; -} - static int __init arch_timer_common_init(void) { - arch_timer_banner(arch_timers_present); - arch_counter_register(arch_timers_present); + arch_timer_banner(); + arch_counter_register(); return arch_timer_arch_init(); } @@ -1421,13 +1132,11 @@ static int __init arch_timer_of_init(struct device_node *np) u32 rate; bool has_names; - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { + if (arch_timer_evt) { pr_warn("multiple nodes in dt, skipping\n"); return 0; } - arch_timers_present |= ARCH_TIMER_TYPE_CP15; - has_names = of_property_present(np, "interrupt-names"); for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) { @@ -1472,283 +1181,22 @@ static int __init arch_timer_of_init(struct device_node *np) if (ret) return ret; - if (arch_timer_needs_of_probing()) - return 0; - return arch_timer_common_init(); } TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init); TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init); -static u32 __init -arch_timer_mem_frame_get_cntfrq(struct arch_timer_mem_frame *frame) -{ - void __iomem *base; - u32 rate; - - base = ioremap(frame->cntbase, frame->size); - if (!base) { - pr_err("Unable to map frame @ %pa\n", &frame->cntbase); - return 0; - } - - rate = readl_relaxed(base + CNTFRQ); - - iounmap(base); - - return rate; -} - -static struct arch_timer_mem_frame * __init -arch_timer_mem_find_best_frame(struct arch_timer_mem *timer_mem) -{ - struct arch_timer_mem_frame *frame, *best_frame = NULL; - void __iomem *cntctlbase; - u32 cnttidr; - int i; - - cntctlbase = ioremap(timer_mem->cntctlbase, timer_mem->size); - if (!cntctlbase) { - pr_err("Can't map CNTCTLBase @ %pa\n", - &timer_mem->cntctlbase); - return NULL; - } - - cnttidr = readl_relaxed(cntctlbase + CNTTIDR); - - /* - * Try to find a virtual capable frame. Otherwise fall back to a - * physical capable frame. - */ - for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { - u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT | - CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT; - - frame = &timer_mem->frame[i]; - if (!frame->valid) - continue; - - /* Try enabling everything, and see what sticks */ - writel_relaxed(cntacr, cntctlbase + CNTACR(i)); - cntacr = readl_relaxed(cntctlbase + CNTACR(i)); - - if ((cnttidr & CNTTIDR_VIRT(i)) && - !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) { - best_frame = frame; - arch_timer_mem_use_virtual = true; - break; - } - - if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) - continue; - - best_frame = frame; - } - - iounmap(cntctlbase); - - return best_frame; -} - -static int __init -arch_timer_mem_frame_register(struct arch_timer_mem_frame *frame) -{ - void __iomem *base; - int ret, irq; - - if (arch_timer_mem_use_virtual) - irq = frame->virt_irq; - else - irq = frame->phys_irq; - - if (!irq) { - pr_err("Frame missing %s irq.\n", - arch_timer_mem_use_virtual ? 
"virt" : "phys"); - return -EINVAL; - } - - if (!request_mem_region(frame->cntbase, frame->size, - "arch_mem_timer")) - return -EBUSY; - - base = ioremap(frame->cntbase, frame->size); - if (!base) { - pr_err("Can't map frame's registers\n"); - return -ENXIO; - } - - ret = arch_timer_mem_register(base, irq); - if (ret) { - iounmap(base); - return ret; - } - - arch_timers_present |= ARCH_TIMER_TYPE_MEM; - - return 0; -} - -static int __init arch_timer_mem_of_init(struct device_node *np) -{ - struct arch_timer_mem *timer_mem; - struct arch_timer_mem_frame *frame; - struct resource res; - int ret = -EINVAL; - u32 rate; - - timer_mem = kzalloc(sizeof(*timer_mem), GFP_KERNEL); - if (!timer_mem) - return -ENOMEM; - - if (of_address_to_resource(np, 0, &res)) - goto out; - timer_mem->cntctlbase = res.start; - timer_mem->size = resource_size(&res); - - for_each_available_child_of_node_scoped(np, frame_node) { - u32 n; - struct arch_timer_mem_frame *frame; - - if (of_property_read_u32(frame_node, "frame-number", &n)) { - pr_err(FW_BUG "Missing frame-number.\n"); - goto out; - } - if (n >= ARCH_TIMER_MEM_MAX_FRAMES) { - pr_err(FW_BUG "Wrong frame-number, only 0-%u are permitted.\n", - ARCH_TIMER_MEM_MAX_FRAMES - 1); - goto out; - } - frame = &timer_mem->frame[n]; - - if (frame->valid) { - pr_err(FW_BUG "Duplicated frame-number.\n"); - goto out; - } - - if (of_address_to_resource(frame_node, 0, &res)) - goto out; - - frame->cntbase = res.start; - frame->size = resource_size(&res); - - frame->virt_irq = irq_of_parse_and_map(frame_node, - ARCH_TIMER_VIRT_SPI); - frame->phys_irq = irq_of_parse_and_map(frame_node, - ARCH_TIMER_PHYS_SPI); - - frame->valid = true; - } - - frame = arch_timer_mem_find_best_frame(timer_mem); - if (!frame) { - pr_err("Unable to find a suitable frame in timer @ %pa\n", - &timer_mem->cntctlbase); - ret = -EINVAL; - goto out; - } - - rate = arch_timer_mem_frame_get_cntfrq(frame); - arch_timer_of_configure_rate(rate, np); - - ret = arch_timer_mem_frame_register(frame); - if (!ret && !arch_timer_needs_of_probing()) - ret = arch_timer_common_init(); -out: - kfree(timer_mem); - return ret; -} -TIMER_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", - arch_timer_mem_of_init); - #ifdef CONFIG_ACPI_GTDT -static int __init -arch_timer_mem_verify_cntfrq(struct arch_timer_mem *timer_mem) -{ - struct arch_timer_mem_frame *frame; - u32 rate; - int i; - - for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { - frame = &timer_mem->frame[i]; - - if (!frame->valid) - continue; - - rate = arch_timer_mem_frame_get_cntfrq(frame); - if (rate == arch_timer_rate) - continue; - - pr_err(FW_BUG "CNTFRQ mismatch: frame @ %pa: (0x%08lx), CPU: (0x%08lx)\n", - &frame->cntbase, - (unsigned long)rate, (unsigned long)arch_timer_rate); - - return -EINVAL; - } - - return 0; -} - -static int __init arch_timer_mem_acpi_init(int platform_timer_count) -{ - struct arch_timer_mem *timers, *timer; - struct arch_timer_mem_frame *frame, *best_frame = NULL; - int timer_count, i, ret = 0; - - timers = kcalloc(platform_timer_count, sizeof(*timers), - GFP_KERNEL); - if (!timers) - return -ENOMEM; - - ret = acpi_arch_timer_mem_init(timers, &timer_count); - if (ret || !timer_count) - goto out; - - /* - * While unlikely, it's theoretically possible that none of the frames - * in a timer expose the combination of feature we want. 
- */ - for (i = 0; i < timer_count; i++) { - timer = &timers[i]; - - frame = arch_timer_mem_find_best_frame(timer); - if (!best_frame) - best_frame = frame; - - ret = arch_timer_mem_verify_cntfrq(timer); - if (ret) { - pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n"); - goto out; - } - - if (!best_frame) /* implies !frame */ - /* - * Only complain about missing suitable frames if we - * haven't already found one in a previous iteration. - */ - pr_err("Unable to find a suitable frame in timer @ %pa\n", - &timer->cntctlbase); - } - - if (best_frame) - ret = arch_timer_mem_frame_register(best_frame); -out: - kfree(timers); - return ret; -} - -/* Initialize per-processor generic timer and memory-mapped timer(if present) */ static int __init arch_timer_acpi_init(struct acpi_table_header *table) { - int ret, platform_timer_count; + int ret; - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { + if (arch_timer_evt) { pr_warn("already initialized, skipping\n"); return -EINVAL; } - arch_timers_present |= ARCH_TIMER_TYPE_CP15; - - ret = acpi_gtdt_init(table, &platform_timer_count); + ret = acpi_gtdt_init(table, NULL); if (ret) return ret; @@ -1790,10 +1238,6 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table) if (ret) return ret; - if (platform_timer_count && - arch_timer_mem_acpi_init(platform_timer_count)) - pr_err("Failed to initialize memory-mapped timer.\n"); - return arch_timer_common_init(); } TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index ce6521ad04d121..2eda895f19f54c 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -9,9 +9,6 @@ #include #include -#define ARCH_TIMER_TYPE_CP15 BIT(0) -#define ARCH_TIMER_TYPE_MEM BIT(1) - #define ARCH_TIMER_CTRL_ENABLE (1 << 0) #define ARCH_TIMER_CTRL_IT_MASK (1 << 1) #define ARCH_TIMER_CTRL_IT_STAT (1 << 2) @@ -51,8 +48,6 @@ enum arch_timer_spi_nr { #define ARCH_TIMER_PHYS_ACCESS 0 #define ARCH_TIMER_VIRT_ACCESS 1 -#define ARCH_TIMER_MEM_PHYS_ACCESS 2 -#define ARCH_TIMER_MEM_VIRT_ACCESS 3 #define ARCH_TIMER_MEM_MAX_FRAMES 8 From 4e9bfe6969a768ef40c669fd100c9be70fb78a1f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 14 Aug 2025 16:46:22 +0100 Subject: [PATCH 2663/3206] clocksource/drivers/arm_arch_timer_mmio: Add MMIO clocksource The MMIO driver can also double as a clocksource, something that was missing in its previous incarnation. Add it for completeness. 
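The hookup in the hunk below is the usual embedded-clocksource pattern:
the read callback uses container_of() to recover the per-timer state
and samples the MMIO counter. A condensed sketch (readq_relaxed() is a
stand-in here; the real driver uses a hi/lo/hi read sequence because a
64-bit MMIO access is not guaranteed to be atomic):

#include <linux/clocksource.h>
#include <linux/container_of.h>
#include <linux/io.h>

struct mmio_timer {
	struct clocksource cs;		/* embedded so container_of() works */
	void __iomem *base;
	u32 rate;
};

static u64 mmio_counter_read(struct clocksource *cs)
{
	struct mmio_timer *t = container_of(cs, struct mmio_timer, cs);

	/* stand-in for the hi/lo/hi 64-bit counter read */
	return readq_relaxed(t->base);
}

static void mmio_clocksource_register(struct mmio_timer *t)
{
	t->cs = (struct clocksource) {
		.name	= "arch_mmio_counter",
		.rating	= 300,
		.read	= mmio_counter_read,
		.mask	= CLOCKSOURCE_MASK(56),	/* architected minimum width */
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	};
	clocksource_register_hz(&t->cs, t->rate);
}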
Signed-off-by: Marc Zyngier
Signed-off-by: Daniel Lezcano
Tested-by: Sudeep Holla
Reviewed-by: Sudeep Holla
Link: https://lore.kernel.org/r/20250814154622.10193-5-maz@kernel.org
---
 drivers/clocksource/arm_arch_timer_mmio.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/clocksource/arm_arch_timer_mmio.c b/drivers/clocksource/arm_arch_timer_mmio.c
index 3522d1d8cb97bd..ebe1987d651ebc 100644
--- a/drivers/clocksource/arm_arch_timer_mmio.c
+++ b/drivers/clocksource/arm_arch_timer_mmio.c
@@ -45,6 +45,7 @@ enum arch_timer_access {
 
 struct arch_timer {
 	struct clock_event_device evt;
+	struct clocksource cs;
 	struct arch_timer_mem *gt_block;
 	void __iomem *base;
 	enum arch_timer_access access;
@@ -52,6 +53,7 @@ struct arch_timer {
 };
 
 #define evt_to_arch_timer(e) container_of(e, struct arch_timer, evt)
+#define cs_to_arch_timer(c) container_of(c, struct arch_timer, cs)
 
 static void arch_timer_mmio_write(struct arch_timer *timer,
 				  enum arch_timer_reg reg, u64 val)
@@ -128,6 +130,13 @@ static noinstr u64 arch_counter_mmio_get_cnt(struct arch_timer *t)
 	return ((u64) cnt_hi << 32) | cnt_lo;
 }
 
+static u64 arch_mmio_counter_read(struct clocksource *cs)
+{
+	struct arch_timer *at = cs_to_arch_timer(cs);
+
+	return arch_counter_mmio_get_cnt(at);
+}
+
 static int arch_timer_mmio_shutdown(struct clock_event_device *clk)
 {
 	struct arch_timer *at = evt_to_arch_timer(clk);
@@ -256,6 +265,16 @@ static void arch_timer_mmio_setup(struct arch_timer *at, int irq)
 					(unsigned long)CLOCKSOURCE_MASK(56));
 
 	enable_irq(at->evt.irq);
+
+	at->cs = (struct clocksource) {
+		.name = "arch_mmio_counter",
+		.rating = 300,
+		.read = arch_mmio_counter_read,
+		.mask = CLOCKSOURCE_MASK(56),
+		.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+	};
+
+	clocksource_register_hz(&at->cs, at->rate);
 }
 
 static int arch_timer_mmio_frame_register(struct platform_device *pdev,

From 0494fc345b377d1207c2cbfef67dc51f6ec874c0 Mon Sep 17 00:00:00 2001
From: Gokul Praveen
Date: Tue, 12 Aug 2025 16:23:46 +0530
Subject: [PATCH 2664/3206] clocksource/drivers/timer-ti-dm: Capture functionality for OMAP DM timer

Add a PWM capture function to the DM timer driver.

The OMAP DM timer hardware supports a capture feature. It can be used
to timestamp events (falling/rising edges) detected on an input signal.

Signed-off-by: Gokul Praveen
Signed-off-by: Daniel Lezcano
Reviewed-by: Neha Malcom Francis
Link: https://lore.kernel.org/r/20250812105346.203541-1-g-praveen@ti.com
---
 drivers/clocksource/timer-ti-dm.c          | 119 ++++++++++++++++++++-
 include/linux/platform_data/dmtimer-omap.h |   4 +
 2 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c
index e9e32df6b56664..793e7cdcb1b16b 100644
--- a/drivers/clocksource/timer-ti-dm.c
+++ b/drivers/clocksource/timer-ti-dm.c
@@ -31,6 +31,7 @@
 #include
 #include
+#include
 
 /*
  * timer errata flags
@@ -836,6 +837,48 @@ static int omap_dm_timer_set_match(struct omap_dm_timer *cookie, int enable,
 	return 0;
 }
 
+static int omap_dm_timer_set_cap(struct omap_dm_timer *cookie,
+				 int autoreload, bool config_period)
+{
+	struct dmtimer *timer;
+	struct device *dev;
+	int rc;
+	u32 l;
+
+	timer = to_dmtimer(cookie);
+	if (unlikely(!timer))
+		return -EINVAL;
+
+	dev = &timer->pdev->dev;
+	rc = pm_runtime_resume_and_get(dev);
+	if (rc)
+		return rc;
+	/*
+	 * 1. Select autoreload mode. TIMER_TCLR[1] AR bit.
+	 * 2. TIMER_TCLR[14]: Sets the functionality of the TIMER IO pin.
+	 * 3. TIMER_TCLR[13]: Capture mode select bit.
+	 * 4. TIMER_TCLR[9-8]: Select transition capture mode.
+	 */
+
+	l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG);
+
+	if (autoreload)
+		l |= OMAP_TIMER_CTRL_AR;
+
+	l |= OMAP_TIMER_CTRL_CAPTMODE | OMAP_TIMER_CTRL_GPOCFG;
+
+	if (config_period == true)
+		l |= OMAP_TIMER_CTRL_TCM_LOWTOHIGH; /* Time Period config */
+	else
+		l |= OMAP_TIMER_CTRL_TCM_BOTHEDGES; /* Duty Cycle config */
+
+	dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l);
+
+	pm_runtime_put_sync(dev);
+
+	return 0;
+}
+
 static int omap_dm_timer_set_pwm(struct omap_dm_timer *cookie, int def_on,
 				 int toggle, int trigger, int autoreload)
 {
@@ -1023,23 +1066,92 @@ static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *cookie)
 	return __omap_dm_timer_read_counter(timer);
 }
 
+static inline unsigned int __omap_dm_timer_cap(struct dmtimer *timer, int idx)
+{
+	return idx == 0 ? dmtimer_read(timer, OMAP_TIMER_CAPTURE_REG) :
+			  dmtimer_read(timer, OMAP_TIMER_CAPTURE2_REG);
+}
+
 static int omap_dm_timer_write_counter(struct omap_dm_timer *cookie, unsigned int value)
 {
 	struct dmtimer *timer;
+	struct device *dev;
 
 	timer = to_dmtimer(cookie);
-	if (unlikely(!timer || !atomic_read(&timer->enabled))) {
-		pr_err("%s: timer not available or enabled.\n", __func__);
+	if (unlikely(!timer)) {
+		pr_err("%s: timer not available.\n", __func__);
 		return -EINVAL;
 	}
 
+	dev = &timer->pdev->dev;
+
+	pm_runtime_resume_and_get(dev);
 	dmtimer_write(timer, OMAP_TIMER_COUNTER_REG, value);
+	pm_runtime_put_sync(dev);
 
 	/* Save the context */
 	timer->context.tcrr = value;
 	return 0;
 }
 
+/**
+ * omap_dm_timer_cap_counter() - Calculate the high count or period count
+ * depending on the configuration.
+ * @cookie: Pointer to OMAP DM timer
+ * @is_period: Whether to configure timer in period or duty cycle mode
+ *
+ * Return: the high count or period count if the timer is enabled, else an
+ * appropriate error.
+ */
+static unsigned int omap_dm_timer_cap_counter(struct omap_dm_timer *cookie, bool is_period)
+{
+	struct dmtimer *timer;
+	unsigned int cap1 = 0;
+	unsigned int cap2 = 0;
+	u32 l, ret;
+
+	timer = to_dmtimer(cookie);
+	if (unlikely(!timer || !atomic_read(&timer->enabled))) {
+		pr_err("%s: timer is not available or enabled. %p\n", __func__, (void *)timer);
+		return -EINVAL;
+	}
+
+	/* Stop the timer */
+	omap_dm_timer_stop(cookie);
+
+	/* Clear the timer counter value to 0 */
+	ret = omap_dm_timer_write_counter(cookie, 0);
+	if (ret)
+		return ret;
+
+	/* Sets the timer capture configuration for period/duty cycle calculation */
+	ret = omap_dm_timer_set_cap(cookie, true, is_period);
+	if (ret) {
+		pr_err("%s: Failed to set timer capture configuration.\n", __func__);
+		return ret;
+	}
+	/* Start the timer */
+	omap_dm_timer_start(cookie);
+
+	/*
+	 * 1 sec delay is given so as to provide
+	 * enough time to capture low frequency signals.
+	 */
+	msleep(1000);
+
+	cap1 = __omap_dm_timer_cap(timer, 0);
+	cap2 = __omap_dm_timer_cap(timer, 1);
+
+	/*
+	 * Clears the TCLR configuration.
+	 * The start bit must be set to 1 as the timer is already in start mode.
+	 */
+	l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG);
+	l &= ~(0xffff) | 0x1;
+	dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l);
+
+	return (cap2 - cap1);
+}
+
 static int __maybe_unused omap_dm_timer_runtime_suspend(struct device *dev)
 {
 	struct dmtimer *timer = dev_get_drvdata(dev);
@@ -1246,6 +1358,9 @@ static const struct omap_dm_timer_ops dmtimer_ops = {
 	.write_counter = omap_dm_timer_write_counter,
 	.read_status = omap_dm_timer_read_status,
 	.write_status = omap_dm_timer_write_status,
+	.set_cap = omap_dm_timer_set_cap,
+	.get_cap_status = omap_dm_timer_get_pwm_status,
+	.read_cap = omap_dm_timer_cap_counter,
 };
 
 static const struct dmtimer_platform_data omap3plus_pdata = {

diff --git a/include/linux/platform_data/dmtimer-omap.h b/include/linux/platform_data/dmtimer-omap.h
index 95d852aef130e3..726d8914384293 100644
--- a/include/linux/platform_data/dmtimer-omap.h
+++ b/include/linux/platform_data/dmtimer-omap.h
@@ -36,9 +36,13 @@ struct omap_dm_timer_ops {
 	int	(*set_pwm)(struct omap_dm_timer *timer, int def_on,
 			   int toggle, int trigger, int autoreload);
 	int	(*get_pwm_status)(struct omap_dm_timer *timer);
+	int	(*set_cap)(struct omap_dm_timer *timer,
+			   int autoreload, bool config_period);
+	int	(*get_cap_status)(struct omap_dm_timer *timer);
 	int	(*set_prescaler)(struct omap_dm_timer *timer, int prescaler);
 
 	unsigned int (*read_counter)(struct omap_dm_timer *timer);
+	unsigned int (*read_cap)(struct omap_dm_timer *timer, bool is_period);
 	int	(*write_counter)(struct omap_dm_timer *timer,
 				 unsigned int value);
 	unsigned int (*read_status)(struct omap_dm_timer *timer);

From e7a25106335041aeca4fdf50a84804c90142c886 Mon Sep 17 00:00:00 2001
From: Markus Stockhausen
Date: Mon, 4 Aug 2025 04:03:25 -0400
Subject: [PATCH 2665/3206] clocksource/drivers/timer-rtl-otto: Work around dying timers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The OpenWrt distribution has switched from kernel longterm 6.6 to 6.12.
Reports show that devices with the Realtek Otto switch platform die
during operation and are rebooted by the watchdog. After ruling out
other possible causes, the Otto timer is to blame.

The platform currently consists of 4 targets with different hardware
revisions. It is not 100% clear which devices and revisions are
affected. Analysis shows that a more aggressive sched/deadline handling
leads to more timer starts with small intervals. This increases the
chance of hitting the bug.

See https://marc.info/?l=linux-kernel&m=175276556023276&w=2

Focusing on the real issue, a hardware limitation was found on some
devices: there is a small chance that a timer expires without firing an
interrupt if it is reprogrammed within the 5us before its expiration
time.

Work around this issue by introducing a bounce() function. It restarts
the timer directly before the normal restart and works as follows:

- Stop the timer.
- Restart the timer with a slow frequency, so the target time will be
  more than 5us away.
- The subsequent normal restart is then issued outside the critical
  window.

Downstream has already tested and confirmed a patch.
See
https://github.com/openwrt/openwrt/pull/19468
https://forum.openwrt.org/t/support-for-rtl838x-based-managed-switches/57875/3788

Signed-off-by: Markus Stockhausen
Signed-off-by: Daniel Lezcano
Tested-by: Stephen Howell
Tested-by: Bjørn Mork
Link: https://lore.kernel.org/r/20250804080328.2609287-2-markus.stockhausen@gmx.de
---
 drivers/clocksource/timer-rtl-otto.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/clocksource/timer-rtl-otto.c b/drivers/clocksource/timer-rtl-otto.c
index 8a3068b36e7529..8be45a11fb8b61 100644
--- a/drivers/clocksource/timer-rtl-otto.c
+++ b/drivers/clocksource/timer-rtl-otto.c
@@ -38,6 +38,7 @@
 #define RTTM_BIT_COUNT		28
 #define RTTM_MIN_DELTA		8
 #define RTTM_MAX_DELTA		CLOCKSOURCE_MASK(28)
+#define RTTM_MAX_DIVISOR	GENMASK(15, 0)
 
 /*
  * Timers are derived from the LXB clock frequency. Usually this is a fixed
@@ -112,6 +113,22 @@ static irqreturn_t rttm_timer_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static void rttm_bounce_timer(void __iomem *base, u32 mode)
+{
+	/*
+	 * When a running timer has less than ~5us left, a stop/start sequence
+	 * might fail. While the details are unknown, the most evident effect
+	 * is that the subsequent interrupt will not be fired.
+	 *
+	 * As a workaround, issue an intermediate restart with a very slow
+	 * frequency of ~3kHz, keeping the target counter (>=8). The follow-up
+	 * restart will then always be issued outside the critical window.
+	 */
+
+	rttm_disable_timer(base);
+	rttm_enable_timer(base, mode, RTTM_MAX_DIVISOR);
+}
+
 static void rttm_stop_timer(void __iomem *base)
 {
 	rttm_disable_timer(base);
@@ -129,6 +146,7 @@ static int rttm_next_event(unsigned long delta, struct clock_event_device *clkev
 	struct timer_of *to = to_timer_of(clkevt);
 
 	RTTM_DEBUG(to->of_base.base);
+	rttm_bounce_timer(to->of_base.base, RTTM_CTRL_COUNTER);
 	rttm_stop_timer(to->of_base.base);
 	rttm_set_period(to->of_base.base, delta);
 	rttm_start_timer(to, RTTM_CTRL_COUNTER);
@@ -141,6 +159,7 @@ static int rttm_state_oneshot(struct clock_event_device *clkevt)
 	struct timer_of *to = to_timer_of(clkevt);
 
 	RTTM_DEBUG(to->of_base.base);
+	rttm_bounce_timer(to->of_base.base, RTTM_CTRL_COUNTER);
 	rttm_stop_timer(to->of_base.base);
 	rttm_set_period(to->of_base.base, RTTM_TICKS_PER_SEC / HZ);
 	rttm_start_timer(to, RTTM_CTRL_COUNTER);
@@ -153,6 +172,7 @@ static int rttm_state_periodic(struct clock_event_device *clkevt)
 	struct timer_of *to = to_timer_of(clkevt);
 
 	RTTM_DEBUG(to->of_base.base);
+	rttm_bounce_timer(to->of_base.base, RTTM_CTRL_TIMER);
 	rttm_stop_timer(to->of_base.base);
 	rttm_set_period(to->of_base.base, RTTM_TICKS_PER_SEC / HZ);
 	rttm_start_timer(to, RTTM_CTRL_TIMER);

From ca90147e55a78441794aef5cb4a8d1cf8d0e209f Mon Sep 17 00:00:00 2001
From: Markus Stockhausen
Date: Mon, 4 Aug 2025 04:03:26 -0400
Subject: [PATCH 2666/3206] clocksource/drivers/timer-rtl-otto: Drop set_counter function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current counter value is a read-only register. It will be reset
when writing a new target timer value with rttm_set_period(), so
rttm_set_counter() is essentially a noop. Drop it.

While this makes rttm_start_timer() and rttm_enable_timer() identical,
keep both to preserve the established abstraction layers for register
and control functions.

Downstream has already tested and confirmed a patch.
See
https://github.com/openwrt/openwrt/pull/19468
https://forum.openwrt.org/t/support-for-rtl838x-based-managed-switches/57875/3788

Signed-off-by: Markus Stockhausen
Signed-off-by: Daniel Lezcano
Tested-by: Stephen Howell
Tested-by: Bjørn Mork
Link: https://lore.kernel.org/r/20250804080328.2609287-3-markus.stockhausen@gmx.de
---
 drivers/clocksource/timer-rtl-otto.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/clocksource/timer-rtl-otto.c b/drivers/clocksource/timer-rtl-otto.c
index 8be45a11fb8b61..48ba1164f3fbfb 100644
--- a/drivers/clocksource/timer-rtl-otto.c
+++ b/drivers/clocksource/timer-rtl-otto.c
@@ -56,11 +56,6 @@ struct rttm_cs {
 };
 
 /* Simple internal register functions */
-static inline void rttm_set_counter(void __iomem *base, unsigned int counter)
-{
-	iowrite32(counter, base + RTTM_CNT);
-}
-
 static inline unsigned int rttm_get_counter(void __iomem *base)
 {
 	return ioread32(base + RTTM_CNT);
@@ -137,7 +132,6 @@ static void rttm_stop_timer(void __iomem *base)
 
 static void rttm_start_timer(struct timer_of *to, u32 mode)
 {
-	rttm_set_counter(to->of_base.base, 0);
 	rttm_enable_timer(to->of_base.base, mode,
 			  to->of_clk.rate / RTTM_TICKS_PER_SEC);
 }

From 17eb98d6b517b6e7faaebed496fd688dbb1771d9 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 22 Sep 2025 14:29:48 +1000
Subject: [PATCH 2667/3206] VFS/ovl: add lookup_one_positive_killable()

ovl wants a lookup which won't block on a fatal signal. It currently
uses down_write_killable() and then repeatedly calls lookup_one().

The lock may not be needed if the name is already in the dcache, and it
aids proposed future changes if the locking is kept internal to
namei.c. So this patch adds lookup_one_positive_killable(), which is
like lookup_one_positive() but will abort in the face of a fatal
signal. overlayfs is changed to use this.

Note that instead of always taking an exclusive lock, ovl now only
takes a shared lock, and only sometimes. The exclusive lock was never
needed. However, down_read_killable() was only added in v4.15, while
overlayfs started using down_write_killable() here in v4.7.

Note that the linked list (->first_maybe_whiteout, ->next_maybe_whiteout)
is local to the thread, so there is no concurrency in that list which
could be threatened by removing the locking.
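To make the resulting contract concrete, a hypothetical in-kernel
caller (illustrative only, not part of the patch) would use the new
helper like this: the lookup either pins a positive dentry, maps
negatives to -ENOENT, or returns -EINTR when a fatal signal arrives
while the shared i_rwsem is still needed.

/* Illustrative caller, not part of the patch. */
static int example_probe_name(struct mnt_idmap *idmap, struct dentry *dir,
			      const char *name, unsigned int len, bool *exists)
{
	struct dentry *d;

	/* May take dir's i_rwsem shared; -EINTR on a fatal signal */
	d = lookup_one_positive_killable(idmap, &QSTR_LEN(name, len), dir);
	if (IS_ERR(d)) {
		if (PTR_ERR(d) == -ENOENT) {
			*exists = false;	/* negative: name is absent */
			return 0;
		}
		return PTR_ERR(d);		/* -EINTR or a real error */
	}

	*exists = true;		/* positive: ->d_inode is stable while pinned */
	dput(d);
	return 0;
}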
Reviewed-by: Amir Goldstein
Signed-off-by: NeilBrown
Signed-off-by: Christian Brauner
---
 fs/namei.c             | 55 ++++++++++++++++++++++++++++++++++++++++++
 fs/overlayfs/readdir.c | 28 ++++++++++-----------
 include/linux/namei.h  |  3 +++
 3 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index cd43ff89fbaa38..c7c6b255db2c55 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1827,6 +1827,20 @@ static struct dentry *lookup_slow(const struct qstr *name,
 	return res;
 }
 
+static struct dentry *lookup_slow_killable(const struct qstr *name,
+					   struct dentry *dir,
+					   unsigned int flags)
+{
+	struct inode *inode = dir->d_inode;
+	struct dentry *res;
+
+	if (inode_lock_shared_killable(inode))
+		return ERR_PTR(-EINTR);
+	res = __lookup_slow(name, dir, flags);
+	inode_unlock_shared(inode);
+	return res;
+}
+
 static inline int may_lookup(struct mnt_idmap *idmap,
 			     struct nameidata *restrict nd)
 {
@@ -3010,6 +3024,47 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
 }
 EXPORT_SYMBOL(lookup_one_unlocked);
 
+/**
+ * lookup_one_positive_killable - lookup single pathname component
+ * @idmap: idmap of the mount the lookup is performed from
+ * @name: qstr holding pathname component to lookup
+ * @base: base directory to lookup from
+ *
+ * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
+ * known positive or ERR_PTR(). This is what most of the users want.
+ *
+ * Note that pinned negative with unlocked parent _can_ become positive at any
+ * time, so callers of lookup_one_unlocked() need to be very careful; pinned
+ * positives have ->d_inode stable, so this one avoids such problems.
+ *
+ * This can be used for in-kernel filesystem clients such as file servers.
+ *
+ * It should be called without the parent i_rwsem held, and will take
+ * the i_rwsem itself if necessary. If a fatal signal is pending or
+ * delivered, it will return %-EINTR if the lock is needed.
+ */
+struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
+					    struct qstr *name,
+					    struct dentry *base)
+{
+	int err;
+	struct dentry *ret;
+
+	err = lookup_one_common(idmap, name, base);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = lookup_dcache(name, base, 0);
+	if (!ret)
+		ret = lookup_slow_killable(name, base, 0);
+	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
+		dput(ret);
+		ret = ERR_PTR(-ENOENT);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_positive_killable);
+
 /**
  * lookup_one_positive_unlocked - lookup single pathname component
  * @idmap: idmap of the mount the lookup is performed from

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index b65cdfce31ce27..15cb06fa0c9a10 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -270,26 +270,26 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
 static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
 {
-	int err;
+	int err = 0;
 	struct dentry *dentry, *dir = path->dentry;
 	const struct cred *old_cred;
 
 	old_cred = ovl_override_creds(rdd->dentry->d_sb);
 
-	err = down_write_killable(&dir->d_inode->i_rwsem);
-	if (!err) {
-		while (rdd->first_maybe_whiteout) {
-			struct ovl_cache_entry *p =
-				rdd->first_maybe_whiteout;
-			rdd->first_maybe_whiteout = p->next_maybe_whiteout;
-			dentry = lookup_one(mnt_idmap(path->mnt),
-					    &QSTR_LEN(p->name, p->len), dir);
-			if (!IS_ERR(dentry)) {
-				p->is_whiteout = ovl_is_whiteout(dentry);
-				dput(dentry);
-			}
+	while (rdd->first_maybe_whiteout) {
+		struct ovl_cache_entry *p =
+			rdd->first_maybe_whiteout;
+		rdd->first_maybe_whiteout = p->next_maybe_whiteout;
+		dentry = lookup_one_positive_killable(mnt_idmap(path->mnt),
+						      &QSTR_LEN(p->name, p->len),
+						      dir);
+		if (!IS_ERR(dentry)) {
+			p->is_whiteout = ovl_is_whiteout(dentry);
+			dput(dentry);
+		} else if (PTR_ERR(dentry) == -EINTR) {
+			err = -EINTR;
+			break;
 		}
-		inode_unlock(dir->d_inode);
 	}
 	ovl_revert_creds(old_cred);

diff --git a/include/linux/namei.h b/include/linux/namei.h
index 5d085428e471d9..551a1a01e5e72e 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -80,6 +80,9 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
 struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
 					    struct qstr *name,
 					    struct dentry *base);
+struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
+					    struct qstr *name,
+					    struct dentry *base);
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *path, unsigned int flags);

From e66ccd30dcdc2053388bd2ec20bd53e9897248d5 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 22 Sep 2025 14:29:49 +1000
Subject: [PATCH 2668/3206] VFS: discard err2 in filename_create()

Since 204a575e91f3 "VFS: add common error checks to
lookup_one_qstr_excl()" filename_create() does not need to stash the
error value from mnt_want_write() into a separate variable - the logic
that used to clobber 'error' after the call to mnt_want_write() has
migrated into lookup_one_qstr_excl(). So there is no need for two
different err variables.

This patch discards "err2" and uses "error" throughout.
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- fs/namei.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index c7c6b255db2c55..e2c2ab286bc032 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4169,7 +4169,6 @@ static struct dentry *filename_create(int dfd, struct filename *name, unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; - int err2; int error; error = filename_parentat(dfd, name, reval_flag, path, &last, &type); @@ -4184,7 +4183,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ - err2 = mnt_want_write(path->mnt); + error = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. @@ -4197,17 +4196,16 @@ static struct dentry *filename_create(int dfd, struct filename *name, if (IS_ERR(dentry)) goto unlock; - if (unlikely(err2)) { - error = err2; + if (unlikely(error)) goto fail; - } + return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: inode_unlock(path->dentry->d_inode); - if (!err2) + if (!error) mnt_drop_write(path->mnt); out: path_put(path); From d7fb2c410240348edee7867c29b60688175dcc11 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:50 +1000 Subject: [PATCH 2669/3206] VFS: unify old_mnt_idmap and new_mnt_idmap in renamedata A rename operation can only rename within a single mount. Callers of vfs_rename() must and do ensure this is the case. So there is no point in having two mnt_idmaps in renamedata as they are always the same. Only one of them is passed to ->rename in any case. This patch replaces both with a single "mnt_idmap" and changes all callers. 
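For illustration, a minimal caller-side sketch after this change (a
hypothetical wrapper, not taken from the patch; the field names match
the hunks below):

#include <linux/fs.h>

/*
 * Hypothetical caller: a single mnt_idmap now covers both ends of the
 * rename, since both ends are always on the same mount anyway.
 */
static int example_rename(struct mnt_idmap *idmap,
			  struct dentry *old_dir, struct dentry *old_dentry,
			  struct dentry *new_dir, struct dentry *new_dentry)
{
	struct renamedata rd = {
		.mnt_idmap  = idmap,	/* replaces .old_mnt_idmap/.new_mnt_idmap */
		.old_parent = old_dir,
		.old_dentry = old_dentry,
		.new_parent = new_dir,
		.new_dentry = new_dentry,
	};

	return vfs_rename(&rd);
}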
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- fs/cachefiles/namei.c | 3 +-- fs/ecryptfs/inode.c | 3 +-- fs/namei.c | 17 ++++++++--------- fs/nfsd/vfs.c | 3 +-- fs/overlayfs/overlayfs.h | 3 +-- fs/smb/server/vfs.c | 3 +-- include/linux/fs.h | 6 ++---- 7 files changed, 15 insertions(+), 23 deletions(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 91dfd02318772f..d1edb2ac38376c 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -387,10 +387,9 @@ int cachefiles_bury_object(struct cachefiles_cache *cache, cachefiles_io_error(cache, "Rename security error %d", ret); } else { struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, + .mnt_idmap = &nop_mnt_idmap, .old_parent = dir, .old_dentry = rep, - .new_mnt_idmap = &nop_mnt_idmap, .new_parent = cache->graveyard, .new_dentry = grave, }; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 72fbe1316ab883..abd954c6a14e95 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -634,10 +634,9 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_lock; } - rd.old_mnt_idmap = &nop_mnt_idmap; + rd.mnt_idmap = &nop_mnt_idmap; rd.old_parent = lower_old_dir_dentry; rd.old_dentry = lower_old_dentry; - rd.new_mnt_idmap = &nop_mnt_idmap; rd.new_parent = lower_new_dir_dentry; rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); diff --git a/fs/namei.c b/fs/namei.c index e2c2ab286bc032..180037b96956a2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5077,20 +5077,20 @@ int vfs_rename(struct renamedata *rd) if (source == target) return 0; - error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); + error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); + error = may_create(rd->mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(rd->new_mnt_idmap, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, is_dir); else - error = may_delete(rd->new_mnt_idmap, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) @@ -5105,13 +5105,13 @@ int vfs_rename(struct renamedata *rd) */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(rd->old_mnt_idmap, source, + error = inode_permission(rd->mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(rd->new_mnt_idmap, target, + error = inode_permission(rd->mnt_idmap, target, MAY_WRITE); if (error) return error; @@ -5179,7 +5179,7 @@ int vfs_rename(struct renamedata *rd) if (error) goto out; } - error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, + error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; @@ -5322,10 +5322,9 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, rd.old_parent = old_path.dentry; rd.old_dentry = old_dentry; - rd.old_mnt_idmap = mnt_idmap(old_path.mnt); + rd.mnt_idmap = mnt_idmap(old_path.mnt); rd.new_parent = new_path.dentry; rd.new_dentry = new_dentry; - rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 98ab55ba3ced76..5f3e99f956cab6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1943,10 +1943,9 @@ nfsd_rename(struct svc_rqst *rqstp, 
struct svc_fh *ffhp, char *fname, int flen,
 		goto out_dput_old;
 	} else {
 		struct renamedata rd = {
-			.old_mnt_idmap	= &nop_mnt_idmap,
+			.mnt_idmap	= &nop_mnt_idmap,
 			.old_parent	= fdentry,
 			.old_dentry	= odentry,
-			.new_mnt_idmap	= &nop_mnt_idmap,
 			.new_parent	= tdentry,
 			.new_dentry	= ndentry,
 		};
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index bb0d7ded8e763a..4f84abaa0d6805 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -361,10 +361,9 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir,
 {
 	int err;
 	struct renamedata rd = {
-		.old_mnt_idmap	= ovl_upper_mnt_idmap(ofs),
+		.mnt_idmap	= ovl_upper_mnt_idmap(ofs),
 		.old_parent	= olddir,
 		.old_dentry	= olddentry,
-		.new_mnt_idmap	= ovl_upper_mnt_idmap(ofs),
 		.new_parent	= newdir,
 		.new_dentry	= newdentry,
 		.flags		= flags,
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 04539037108c93..07739055ac9f33 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -770,10 +770,9 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
 		goto out4;
 	}
 
-	rd.old_mnt_idmap	= mnt_idmap(old_path->mnt),
+	rd.mnt_idmap		= mnt_idmap(old_path->mnt),
 	rd.old_parent		= old_parent,
 	rd.old_dentry		= old_child,
-	rd.new_mnt_idmap	= mnt_idmap(new_path.mnt),
 	rd.new_parent		= new_path.dentry,
 	rd.new_dentry		= new_dentry,
 	rd.flags		= flags,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d7ab4f96d7051f..73b39e5bb9e45a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2008,20 +2008,18 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
 
 /**
  * struct renamedata - contains all information required for renaming
- * @old_mnt_idmap:	idmap of the old mount the inode was found from
+ * @mnt_idmap:		idmap of the mount in which the rename is happening.
  * @old_parent:		parent of source
 * @old_dentry:		source
- * @new_mnt_idmap:	idmap of the new mount the inode was found from
 * @new_parent:		parent of destination
 * @new_dentry:		destination
 * @delegated_inode:	returns an inode needing a delegation break
 * @flags:		rename flags
 */
 struct renamedata {
-	struct mnt_idmap *old_mnt_idmap;
+	struct mnt_idmap *mnt_idmap;
 	struct dentry *old_parent;
 	struct dentry *old_dentry;
-	struct mnt_idmap *new_mnt_idmap;
 	struct dentry *new_parent;
 	struct dentry *new_dentry;
 	struct inode **delegated_inode;

From 76a53de6f7ff0641570364234fb4489f4d4fc8e9 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 22 Sep 2025 14:29:50 +1000
Subject: [PATCH 2670/3206] VFS/audit: introduce kern_path_parent() for audit

audit_alloc_mark() and audit_get_nd() both need to perform a path
lookup, getting the parent dentry (which must exist) and the final
target (following a LAST_NORM name), which sometimes doesn't need to
exist. They don't need the parent to be locked, but use
kern_path_locked() or kern_path_locked_negative() anyway. This is
somewhat misleading to the casual reader.

This patch introduces a more targeted function, kern_path_parent(),
which returns without holding any locks. On success the "path" will be
set to the parent, which must be found, and the return value is the
dentry of the target, which might be negative.

This will clear the way to rename kern_path_locked(), which is
otherwise only used to prepare for removing something. It also allows
us to remove kern_path_locked_negative(), which is transformed into the
new kern_path_parent().
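As a usage sketch (a hypothetical in-kernel caller, modeled on the
audit conversion in the diff below): the parent comes back pinned but
unlocked, and the returned dentry may be negative.

#include <linux/namei.h>
#include <linux/dcache.h>

/*
 * Hypothetical user of kern_path_parent(): no locks are held on return,
 * the parent in @parent is known positive, and the target dentry may be
 * negative, so check it before dereferencing its inode.
 */
static int example_lookup(const char *pathname)
{
	struct path parent;
	struct dentry *d;

	d = kern_path_parent(pathname, &parent);
	if (IS_ERR(d))
		return PTR_ERR(d);

	if (d_really_is_negative(d))
		pr_info("%s does not exist (yet)\n", pathname);
	else
		pr_info("%s: inode %lu\n", pathname, d_backing_inode(d)->i_ino);

	dput(d);
	path_put(&parent);
	return 0;
}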
Signed-off-by: NeilBrown
Signed-off-by: Christian Brauner
---
 fs/namei.c              | 23 +++++++++++++++++------
 include/linux/namei.h   |  2 +-
 kernel/audit_fsnotify.c | 11 ++++++-----
 kernel/audit_watch.c    |  3 +--
 4 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 180037b96956a2..4017bc8641d3b1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2781,7 +2781,20 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct
 	return d;
 }
 
-struct dentry *kern_path_locked_negative(const char *name, struct path *path)
+/**
+ * kern_path_parent: lookup path returning parent and target
+ * @name: path name
+ * @path: path to store parent in
+ *
+ * The path @name should end with a normal component, not "." or ".." or "/".
+ * A lookup is performed and if successful the parent information
+ * is stored in @path and the dentry is returned.
+ *
+ * The dentry may be negative, the parent will be positive.
+ *
+ * Returns: dentry or error.
+ */
+struct dentry *kern_path_parent(const char *name, struct path *path)
 {
 	struct path parent_path __free(path_put) = {};
 	struct filename *filename __free(putname) = getname_kernel(name);
@@ -2794,12 +2807,10 @@ struct dentry *kern_path_locked_negative(const char *name, struct path *path)
 		return ERR_PTR(error);
 	if (unlikely(type != LAST_NORM))
 		return ERR_PTR(-EINVAL);
-	inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
-	d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE);
-	if (IS_ERR(d)) {
-		inode_unlock(parent_path.dentry->d_inode);
+
+	d = lookup_noperm_unlocked(&last, parent_path.dentry);
+	if (IS_ERR(d))
 		return d;
-	}
 	path->dentry = no_free_ptr(parent_path.dentry);
 	path->mnt = no_free_ptr(parent_path.mnt);
 	return d;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 551a1a01e5e72e..1d5038c21c20ae 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -57,12 +57,12 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,
 				    struct dentry *base,
 				    unsigned int flags);
 extern int kern_path(const char *, unsigned, struct path *);
+struct dentry *kern_path_parent(const char *name, struct path *parent);
 
 extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int);
 extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
 extern void done_path_create(struct path *, struct dentry *);
 extern struct dentry *kern_path_locked(const char *, struct path *);
-extern struct dentry *kern_path_locked_negative(const char *, struct path *);
 extern struct dentry *user_path_locked_at(int , const char __user *, struct path *);
 int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
 			   struct path *parent, struct qstr *last, int *type,
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index c565fbf66ac876..b92805b317a2d4 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -76,17 +76,18 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
 	struct audit_fsnotify_mark *audit_mark;
 	struct path path;
 	struct dentry *dentry;
-	struct inode *inode;
 	int ret;
 
 	if (pathname[0] != '/' || pathname[len-1] == '/')
 		return ERR_PTR(-EINVAL);
 
-	dentry = kern_path_locked(pathname, &path);
+	dentry = kern_path_parent(pathname, &path);
 	if (IS_ERR(dentry))
 		return ERR_CAST(dentry); /* returning an error */
-	inode = path.dentry->d_inode;
-	inode_unlock(inode);
+	if (d_really_is_negative(dentry)) {
+		audit_mark = ERR_PTR(-ENOENT);
+		goto out;
+	}
 
 	audit_mark =
kzalloc(sizeof(*audit_mark), GFP_KERNEL);
 	if (unlikely(!audit_mark)) {
@@ -100,7 +101,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
 	audit_update_mark(audit_mark, dentry->d_inode);
 	audit_mark->rule = krule;
 
-	ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0);
+	ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0);
 	if (ret < 0) {
 		audit_mark->path = NULL;
 		fsnotify_put_mark(&audit_mark->mark);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0ebbbe37a60f02..a700e3c8925ff7 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -349,7 +349,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 {
 	struct dentry *d;
 
-	d = kern_path_locked_negative(watch->path, parent);
+	d = kern_path_parent(watch->path, parent);
 	if (IS_ERR(d))
 		return PTR_ERR(d);
 
@@ -359,7 +359,6 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 		watch->ino = d_backing_inode(d)->i_ino;
 	}
 
-	inode_unlock(d_backing_inode(parent->dentry));
 	dput(d);
 	return 0;
 }

From 3d18f80ce181ba27f37d0ec1c550b22acb01dd49 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 22 Sep 2025 14:29:52 +1000
Subject: [PATCH 2671/3206] VFS: rename kern_path_locked() and related
 functions.

kern_path_locked() is now only used to prepare for removing an object
from the filesystem (and that is the only credible reason for wanting a
positive locked dentry). Thus it corresponds to kern_path_create() and
so should have a corresponding name.

Unfortunately the name "kern_path_create" is somewhat misleading as it
doesn't actually create anything. The recently added
simple_start_creating() provides a better pattern, I believe. The
"start" can be matched with "end" to bracket the creating or removing.

So this patch changes names:

  kern_path_locked    -> start_removing_path
  kern_path_create    -> start_creating_path
  user_path_create    -> start_creating_user_path
  user_path_locked_at -> start_removing_user_path_at
  done_path_create    -> end_creating_path

and also introduces end_removing_path() which is identical to
end_creating_path().

__start_removing_path (which was __kern_path_locked) is enhanced to
call mnt_want_write() for consistency with start_creating_path().

Reviewed-by: Amir Goldstein
Signed-off-by: NeilBrown
Signed-off-by: Christian Brauner
---
 Documentation/filesystems/porting.rst        | 12 ++++
 arch/powerpc/platforms/cell/spufs/syscalls.c |  4 +-
 drivers/base/devtmpfs.c                      | 22 +++----
 fs/bcachefs/fs-ioctl.c                       | 10 ++--
 fs/init.c                                    | 17 +++---
 fs/namei.c                                   | 59 ++++++++++++--------
 fs/ocfs2/refcounttree.c                      |  4 +-
 fs/smb/server/vfs.c                          |  8 +--
 include/linux/namei.h                        | 14 +++--
 kernel/bpf/inode.c                           |  4 +-
 net/unix/af_unix.c                           |  6 +-
 11 files changed, 93 insertions(+), 67 deletions(-)

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 85f590254f0750..e0494860be6b86 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1285,3 +1285,15 @@ rather than a VMA, as the VMA at this stage is not yet valid.
 
 The vm_area_desc provides the minimum required information for a filesystem
 to initialise state upon memory mapping of a file-backed region, and output
 parameters for the file system to set this state.
+ +--- + +**mandatory** + +Several functions are renamed: + +- kern_path_locked -> start_removing_path +- kern_path_create -> start_creating_path +- user_path_create -> start_creating_user_path +- user_path_locked_at -> start_removing_user_path_at +- done_path_create -> end_creating_path diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 157e046e6e93cf..ea4ba1b6ce6a96 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -67,11 +67,11 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, struct dentry *dentry; int ret; - dentry = user_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_user_path(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); ret = PTR_ERR(dentry); if (!IS_ERR(dentry)) { ret = spufs_create(&path, dentry, flags, mode, neighbor); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); } return ret; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 31bfb3194b4c29..9d4e46ad835225 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -176,7 +176,7 @@ static int dev_mkdir(const char *name, umode_t mode) struct dentry *dentry; struct path path; - dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -184,7 +184,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return PTR_ERR_OR_ZERO(dentry); } @@ -222,10 +222,10 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, struct path path; int err; - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -246,7 +246,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -256,7 +256,7 @@ static int dev_rmdir(const char *name) struct dentry *dentry; int err; - dentry = kern_path_locked(name, &parent); + dentry = start_removing_path(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) @@ -265,9 +265,7 @@ static int dev_rmdir(const char *name) else err = -EPERM; - dput(dentry); - inode_unlock(d_inode(parent.dentry)); - path_put(&parent); + end_removing_path(&parent, dentry); return err; } @@ -325,7 +323,7 @@ static int handle_remove(const char *nodename, struct device *dev) int deleted = 0; int err = 0; - dentry = kern_path_locked(nodename, &parent); + dentry = start_removing_path(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -349,10 +347,8 @@ static int handle_remove(const char *nodename, struct device *dev) if (!err || err == -ENOENT) deleted = 1; } - dput(dentry); - inode_unlock(d_inode(parent.dentry)); + end_removing_path(&parent, dentry); - path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; diff --git 
a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 4e72e654da9666..43510da5e73440 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -255,7 +255,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); } - dst_dentry = user_path_create(arg.dirfd, + dst_dentry = start_creating_user_path(arg.dirfd, (const char __user *)(unsigned long)arg.dst_ptr, &dst_path, lookup_flags); error = PTR_ERR_OR_ZERO(dst_dentry); @@ -314,7 +314,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, d_instantiate(dst_dentry, &inode->v); fsnotify_mkdir(dir, dst_dentry); err3: - done_path_create(&dst_path, dst_dentry); + end_creating_path(&dst_path, dst_dentry); err2: if (arg.src_ptr) path_put(&src_path); @@ -334,7 +334,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, if (arg.flags) return -EINVAL; - victim = user_path_locked_at(arg.dirfd, name, &path); + victim = start_removing_user_path_at(arg.dirfd, name, &path); if (IS_ERR(victim)) return PTR_ERR(victim); @@ -351,9 +351,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, d_invalidate(victim); } err: - inode_unlock(dir); - dput(victim); - path_put(&path); + end_removing_path(&path, victim); return ret; } diff --git a/fs/init.c b/fs/init.c index eef5124885e372..07f592ccdba868 100644 --- a/fs/init.c +++ b/fs/init.c @@ -149,7 +149,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) else if (!(S_ISBLK(mode) || S_ISCHR(mode))) return -EINVAL; - dentry = kern_path_create(AT_FDCWD, filename, &path, 0); + dentry = start_creating_path(AT_FDCWD, filename, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -158,7 +158,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -173,7 +173,7 @@ int __init init_link(const char *oldname, const char *newname) if (error) return error; - new_dentry = kern_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out; @@ -191,7 +191,7 @@ int __init init_link(const char *oldname, const char *newname) error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, NULL); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); return error; @@ -203,14 +203,14 @@ int __init init_symlink(const char *oldname, const char *newname) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, newname, &path, 0); + dentry = start_creating_path(AT_FDCWD, newname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, oldname); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -225,7 +225,8 @@ int __init init_mkdir(const char *pathname, umode_t mode) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, pathname, &path, + LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); mode = mode_strip_umask(d_inode(path.dentry), 
mode); @@ -236,7 +237,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } diff --git a/fs/namei.c b/fs/namei.c index 4017bc8641d3b1..92973a7a809124 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2758,7 +2758,8 @@ static int filename_parentat(int dfd, struct filename *name, } /* does lookup, returns the object with parent locked */ -static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) +static struct dentry *__start_removing_path(int dfd, struct filename *name, + struct path *path) { struct path parent_path __free(path_put) = {}; struct dentry *d; @@ -2770,15 +2771,26 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); + /* don't fail immediately if it's r/o, at least try to report other errors */ + error = mnt_want_write(parent_path.mnt); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); - if (IS_ERR(d)) { - inode_unlock(parent_path.dentry->d_inode); - return d; - } + if (IS_ERR(d)) + goto unlock; + if (error) + goto fail; path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; + +fail: + dput(d); + d = ERR_PTR(error); +unlock: + inode_unlock(parent_path.dentry->d_inode); + if (!error) + mnt_drop_write(parent_path.mnt); + return d; } /** @@ -2816,24 +2828,26 @@ struct dentry *kern_path_parent(const char *name, struct path *path) return d; } -struct dentry *kern_path_locked(const char *name, struct path *path) +struct dentry *start_removing_path(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); - struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); + struct dentry *res = __start_removing_path(AT_FDCWD, filename, path); putname(filename); return res; } -struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) +struct dentry *start_removing_user_path_at(int dfd, + const char __user *name, + struct path *path) { struct filename *filename = getname(name); - struct dentry *res = __kern_path_locked(dfd, filename, path); + struct dentry *res = __start_removing_path(dfd, filename, path); putname(filename); return res; } -EXPORT_SYMBOL(user_path_locked_at); +EXPORT_SYMBOL(start_removing_user_path_at); int kern_path(const char *name, unsigned int flags, struct path *path) { @@ -4223,8 +4237,8 @@ static struct dentry *filename_create(int dfd, struct filename *name, return dentry; } -struct dentry *kern_path_create(int dfd, const char *pathname, - struct path *path, unsigned int lookup_flags) +struct dentry *start_creating_path(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4232,9 +4246,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, putname(filename); return res; } -EXPORT_SYMBOL(kern_path_create); +EXPORT_SYMBOL(start_creating_path); -void done_path_create(struct path *path, struct dentry *dentry) +void end_creating_path(struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); @@ -4242,10 +4256,11 @@ void done_path_create(struct path *path, struct dentry *dentry) mnt_drop_write(path->mnt); path_put(path); } 
-EXPORT_SYMBOL(done_path_create); +EXPORT_SYMBOL(end_creating_path); -inline struct dentry *user_path_create(int dfd, const char __user *pathname, - struct path *path, unsigned int lookup_flags) +inline struct dentry *start_creating_user_path( + int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4253,7 +4268,7 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, putname(filename); return res; } -EXPORT_SYMBOL(user_path_create); +EXPORT_SYMBOL(start_creating_user_path); /** * vfs_mknod - create device node or file @@ -4361,7 +4376,7 @@ static int do_mknodat(int dfd, struct filename *name, umode_t mode, break; } out2: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4465,7 +4480,7 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode) if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4819,7 +4834,7 @@ int do_symlinkat(struct filename *from, int newdfd, struct filename *to) if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4988,7 +5003,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8f732742b26e36..267b50e8e42e5c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4418,7 +4418,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, return error; } - new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) { mlog_errno(error); @@ -4435,7 +4435,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, d_inode(new_path.dentry), new_dentry, preserve); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 07739055ac9f33..1cfa688904b2c4 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -196,7 +196,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) pr_err("File(%s): creation failed (err:%d)\n", name, err); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -237,7 +237,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) if (!err && dentry != d) ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); return err; @@ -669,7 +669,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, ksmbd_debug(VFS, "vfs_link failed err %d\n", err); out3: - 
done_path_create(&newpath, dentry); + end_creating_path(&newpath, dentry); out2: path_put(&oldpath); out1: @@ -1325,7 +1325,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, if (!abs_name) return ERR_PTR(-ENOMEM); - dent = kern_path_create(AT_FDCWD, abs_name, path, flags); + dent = start_creating_path(AT_FDCWD, abs_name, path, flags); kfree(abs_name); return dent; } diff --git a/include/linux/namei.h b/include/linux/namei.h index 1d5038c21c20ae..a7800ef04e7619 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -59,11 +59,15 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, extern int kern_path(const char *, unsigned, struct path *); struct dentry *kern_path_parent(const char *name, struct path *parent); -extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); -extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); -extern void done_path_create(struct path *, struct dentry *); -extern struct dentry *kern_path_locked(const char *, struct path *); -extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); +extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int); +extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); +extern void end_creating_path(struct path *, struct dentry *); +extern struct dentry *start_removing_path(const char *, struct path *); +extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); +static inline void end_removing_path(struct path *path , struct dentry *dentry) +{ + end_creating_path(path, dentry); +} int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392ae..fadf3817a9c5e7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(path_fd, pathname, &path, 0); + dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, ret = -EPERM; } out: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return ret; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6d7c110814ffa7..768098dec23100 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1387,7 +1387,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); + dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; @@ -1417,7 +1417,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); return 0; out_unlock: @@ -1427,7 +1427,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, /* failed after successful mknod? unlink what we'd created... 
 */
 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
 out_path:
-	done_path_create(&parent, dentry);
+	end_creating_path(&parent, dentry);
 out:
 	unix_release_addr(addr);
 	return err == -EEXIST ? -EADDRINUSE : err;

From 0a2c70594704b199849e8add59263fc98e057a63 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 22 Sep 2025 14:29:53 +1000
Subject: [PATCH 2672/3206] debugfs: rename start_creating() to
 debugfs_start_creating()

start_creating() is a generic name which I would like to use for a
function similar to simple_start_creating(), only not quite so simple.

debugfs is using this name which, though static, will cause complaints
if the name is given a different signature in a header file. So rename
it to debugfs_start_creating().

Signed-off-by: NeilBrown
Signed-off-by: Christian Brauner
---
 fs/debugfs/inode.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a0357b0cf362d8..19aff6fb65991d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -353,7 +353,8 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
 }
 EXPORT_SYMBOL_GPL(debugfs_lookup);
 
-static struct dentry *start_creating(const char *name, struct dentry *parent)
+static struct dentry *debugfs_start_creating(const char *name,
+					     struct dentry *parent)
 {
 	struct dentry *dentry;
 	int error;
@@ -419,7 +420,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
 	if (!(mode & S_IFMT))
 		mode |= S_IFREG;
 	BUG_ON(!S_ISREG(mode));
-	dentry = start_creating(name, parent);
+	dentry = debugfs_start_creating(name, parent);
 
 	if (IS_ERR(dentry))
 		return dentry;
@@ -568,7 +569,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size);
 */
 struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
 {
-	struct dentry *dentry = start_creating(name, parent);
+	struct dentry *dentry = debugfs_start_creating(name, parent);
 	struct inode *inode;
 
 	if (IS_ERR(dentry))
@@ -615,7 +616,7 @@ struct dentry *debugfs_create_automount(const char *name,
 					debugfs_automount_t f,
 					void *data)
 {
-	struct dentry *dentry = start_creating(name, parent);
+	struct dentry *dentry = debugfs_start_creating(name, parent);
 	struct inode *inode;
 
 	if (IS_ERR(dentry))
@@ -678,7 +679,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 	if (!link)
 		return ERR_PTR(-ENOMEM);
 
-	dentry = start_creating(name, parent);
+	dentry = debugfs_start_creating(name, parent);
 	if (IS_ERR(dentry)) {
 		kfree(link);
 		return dentry;

From c445bffbf28f721e05d0ce06895045fc62aaff7c Mon Sep 17 00:00:00 2001
From: Markus Stockhausen
Date: Mon, 4 Aug 2025 04:03:27 -0400
Subject: [PATCH 2673/3206] clocksource/drivers/timer-rtl-otto: Do not
 interfere with interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During normal operation the timers are reprogrammed including an
interrupt acknowledgement. This has no effect, as the whole timer is
set up from scratch afterwards. Especially in an interrupt context this
has already been done by rttm_timer_interrupt().

Change the behaviour as follows:

- Use rttm_disable_timer() during reprogramming.
- Keep rttm_stop_timer() for all other use cases.

Downstream has already tested and confirmed this patch.
See https://github.com/openwrt/openwrt/pull/19468
https://forum.openwrt.org/t/support-for-rtl838x-based-managed-switches/57875/3788

Signed-off-by: Markus Stockhausen
Signed-off-by: Daniel Lezcano
Tested-by: Stephen Howell
Tested-by: Bjørn Mork
Link: https://lore.kernel.org/r/20250804080328.2609287-4-markus.stockhausen@gmx.de
---
 drivers/clocksource/timer-rtl-otto.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/clocksource/timer-rtl-otto.c b/drivers/clocksource/timer-rtl-otto.c
index 48ba1164f3fbfb..42f702aca68968 100644
--- a/drivers/clocksource/timer-rtl-otto.c
+++ b/drivers/clocksource/timer-rtl-otto.c
@@ -141,7 +141,7 @@ static int rttm_next_event(unsigned long delta, struct clock_event_device *clkev
 
 	RTTM_DEBUG(to->of_base.base);
 	rttm_bounce_timer(to->of_base.base, RTTM_CTRL_COUNTER);
-	rttm_stop_timer(to->of_base.base);
+	rttm_disable_timer(to->of_base.base);
 	rttm_set_period(to->of_base.base, delta);
 	rttm_start_timer(to, RTTM_CTRL_COUNTER);
 
@@ -154,7 +154,7 @@ static int rttm_state_oneshot(struct clock_event_device *clkevt)
 
 	RTTM_DEBUG(to->of_base.base);
 	rttm_bounce_timer(to->of_base.base, RTTM_CTRL_COUNTER);
-	rttm_stop_timer(to->of_base.base);
+	rttm_disable_timer(to->of_base.base);
 	rttm_set_period(to->of_base.base, RTTM_TICKS_PER_SEC / HZ);
 	rttm_start_timer(to, RTTM_CTRL_COUNTER);
 
@@ -167,7 +167,7 @@ static int rttm_state_periodic(struct clock_event_device *clkevt)
 
 	RTTM_DEBUG(to->of_base.base);
 	rttm_bounce_timer(to->of_base.base, RTTM_CTRL_TIMER);
-	rttm_stop_timer(to->of_base.base);
+	rttm_disable_timer(to->of_base.base);
 	rttm_set_period(to->of_base.base, RTTM_TICKS_PER_SEC / HZ);
 	rttm_start_timer(to, RTTM_CTRL_TIMER);

From 931bd9273848aca9dc40dd5cad3fcfe5d0818972 Mon Sep 17 00:00:00 2001
From: Markus Stockhausen
Date: Mon, 4 Aug 2025 04:03:28 -0400
Subject: [PATCH 2674/3206] clocksource/drivers/timer-rtl-otto: Simplify
 documentation

While the main SoC PLL is responsible for the lexra bus frequency, it
has no implications for the timer divisor. Update the comments
accordingly.

Signed-off-by: Markus Stockhausen
Signed-off-by: Daniel Lezcano
Link: https://lore.kernel.org/r/20250804080328.2609287-5-markus.stockhausen@gmx.de
---
 drivers/clocksource/timer-rtl-otto.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/clocksource/timer-rtl-otto.c b/drivers/clocksource/timer-rtl-otto.c
index 42f702aca68968..6113d2fdd4de19 100644
--- a/drivers/clocksource/timer-rtl-otto.c
+++ b/drivers/clocksource/timer-rtl-otto.c
@@ -41,12 +41,10 @@
 #define RTTM_MAX_DIVISOR	GENMASK(15, 0)
 
 /*
- * Timers are derived from the LXB clock frequency. Usually this is a fixed
- * multiple of the 25 MHz oscillator. The 930X SOC is an exception from that.
- * Its LXB clock has only dividers and uses the switch PLL of 2.45 GHz as its
- * base. The only meaningful frequencies we can achieve from that are 175.000
- * MHz and 153.125 MHz. The greatest common divisor of all explained possible
- * speeds is 3125000. Pin the timers to this 3.125 MHz reference frequency.
+ * Timers are derived from the lexra bus (LXB) clock frequency. This is 175 MHz
+ * on RTL930x and 200 MHz on the other platforms. With 3.125 MHz choose a common
+ * divisor to have enough range and detail. This provides comparability between
+ * the different platforms.
*/ #define RTTM_TICKS_PER_SEC 3125000 From 764d0654114b5ce52aafabdb2bf9ccee0e998651 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Aug 2025 21:06:58 +0200 Subject: [PATCH 2675/3206] clocksource/drivers/timer-tegra186: Don't print superfluous errors The watchdog core will handle error messages already. Signed-off-by: Wolfram Sang Signed-off-by: Daniel Lezcano Reviewed-by: Mikko Perttunen Link: https://lore.kernel.org/r/20250813190657.3628-2-wsa+renesas@sang-engineering.com --- drivers/clocksource/timer-tegra186.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/timer-tegra186.c b/drivers/clocksource/timer-tegra186.c index 1ec7b38ff8c6f2..355558893e5f3c 100644 --- a/drivers/clocksource/timer-tegra186.c +++ b/drivers/clocksource/timer-tegra186.c @@ -333,16 +333,12 @@ static struct tegra186_wdt *tegra186_wdt_create(struct tegra186_timer *tegra, wdt->base.parent = tegra->dev; err = watchdog_init_timeout(&wdt->base, 5, tegra->dev); - if (err < 0) { - dev_err(tegra->dev, "failed to initialize timeout: %d\n", err); + if (err < 0) return ERR_PTR(err); - } err = devm_watchdog_register_device(tegra->dev, &wdt->base); - if (err < 0) { - dev_err(tegra->dev, "failed to register WDT: %d\n", err); + if (err < 0) return ERR_PTR(err); - } return wdt; } From 21b8a635f3b3d6a165fa257808ed381c13c72e9b Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Sun, 10 Aug 2025 18:37:10 -0400 Subject: [PATCH 2676/3206] clocksource/drivers/ingenic-sysost: Convert from round_rate() to determine_rate() The round_rate() clk ops is deprecated, so migrate this driver from round_rate() to determine_rate() using the Coccinelle semantic patch appended to the "under-the-cut" portion of the patch. While changes are being made to 'struct clk_ops', let's also go ahead and fix the formatting of set_rate so that everything lines up as expected. 
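For reference, the general shape of this conversion (a schematic sketch
with generic names, not this driver's exact code; foo_pick_rate() is a
hypothetical rounding policy):

#include <linux/clk-provider.h>

/* Hypothetical rounding policy: never exceed the parent rate. */
static unsigned long foo_pick_rate(unsigned long rate, unsigned long parent_rate)
{
	return min(rate, parent_rate);
}

/* Deprecated form: return the rounded rate (or a negative errno). */
static long foo_round_rate(struct clk_hw *hw, unsigned long rate,
			   unsigned long *parent_rate)
{
	return foo_pick_rate(rate, *parent_rate);
}

/* New form: write the result into req->rate and return a status code,
 * reading the parent rate from req->best_parent_rate. */
static int foo_determine_rate(struct clk_hw *hw, struct clk_rate_request *req)
{
	req->rate = foo_pick_rate(req->rate, req->best_parent_rate);

	return 0;
}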
Signed-off-by: Brian Masney
Signed-off-by: Daniel Lezcano
Link: https://lore.kernel.org/r/20250810-clocksource-round-rate-v1-1-486ef53e45eb@redhat.com
---
 drivers/clocksource/ingenic-sysost.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/clocksource/ingenic-sysost.c b/drivers/clocksource/ingenic-sysost.c
index cb6fc2f152d467..e79cfb0b8e05f6 100644
--- a/drivers/clocksource/ingenic-sysost.c
+++ b/drivers/clocksource/ingenic-sysost.c
@@ -127,18 +127,23 @@ static u8 ingenic_ost_get_prescale(unsigned long rate, unsigned long req_rate)
 	return 2; /* /16 divider */
 }
 
-static long ingenic_ost_round_rate(struct clk_hw *hw, unsigned long req_rate,
-		unsigned long *parent_rate)
+static int ingenic_ost_determine_rate(struct clk_hw *hw,
+				      struct clk_rate_request *req)
 {
-	unsigned long rate = *parent_rate;
+	unsigned long rate = req->best_parent_rate;
 	u8 prescale;
 
-	if (req_rate > rate)
-		return rate;
+	if (req->rate > rate) {
+		req->rate = rate;
 
-	prescale = ingenic_ost_get_prescale(rate, req_rate);
+		return 0;
+	}
+
+	prescale = ingenic_ost_get_prescale(rate, req->rate);
 
-	return rate >> (prescale * 2);
+	req->rate = rate >> (prescale * 2);
+
+	return 0;
 }
 
 static int ingenic_ost_percpu_timer_set_rate(struct clk_hw *hw, unsigned long req_rate,
@@ -175,14 +180,14 @@ static int ingenic_ost_global_timer_set_rate(struct clk_hw *hw, unsigned long re
 
 static const struct clk_ops ingenic_ost_percpu_timer_ops = {
 	.recalc_rate	= ingenic_ost_percpu_timer_recalc_rate,
-	.round_rate	= ingenic_ost_round_rate,
-	.set_rate	= ingenic_ost_percpu_timer_set_rate,
+	.determine_rate	= ingenic_ost_determine_rate,
+	.set_rate	= ingenic_ost_percpu_timer_set_rate,
 };
 
 static const struct clk_ops ingenic_ost_global_timer_ops = {
 	.recalc_rate	= ingenic_ost_global_timer_recalc_rate,
-	.round_rate	= ingenic_ost_round_rate,
-	.set_rate	= ingenic_ost_global_timer_set_rate,
+	.determine_rate	= ingenic_ost_determine_rate,
+	.set_rate	= ingenic_ost_global_timer_set_rate,
 };
 
 static const char * const ingenic_ost_clk_parents[] = { "ext" };

From 1c4b87c921fb158d853adcb8fd48c2dc07fc6f91 Mon Sep 17 00:00:00 2001
From: Markus Schneider-Pargmann
Date: Tue, 19 Aug 2025 09:52:41 +0200
Subject: [PATCH 2677/3206] clocksource/drivers/arm_global_timer: Add
 auto-detection for initial prescaler values

am43xx has a clock tree where the global timer clock is an indirect
child of the CPU clock used for frequency scaling:

  dpll_mpu_ck -- CPU/cpufreq
     |
     v
  dpll_mpu_m2_ck -- divider
     |
     v
  mpu_periphclk -- fixed divider by 2 used for global timer

When CPU frequency changes, the global timer's clock notifier rejects
the change because the hardcoded prescaler (1 or 2) cannot accommodate
the frequency range across all CPU OPPs (300, 600, 720, 800, 1000 MHz).

Add platform-specific prescaler auto-detection to solve this issue:

- am43xx: prescaler = 50 (calculated as initial_freq/GCD of all OPP
  freqs). This allows the timer to work across all CPU frequencies
  after the fixed divider by 2. Tested on am4372-idk-evm.
- zynq-7000: prescaler = 2 (preserves previous Kconfig default)
- Other platforms: prescaler = 1 (previous default)

The Kconfig option now defaults to 0 (auto-detection) but can still
override the auto-detected value when set to a non-zero value,
preserving existing customization workflows.
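The prescaler arithmetic described above can be sanity-checked with a
short standalone sketch (plain C, frequencies in MHz; illustrative
only, not kernel code):

#include <stdio.h>

static unsigned long gcd(unsigned long a, unsigned long b)
{
	while (b) {
		unsigned long t = a % b;

		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	/* CPU OPPs in MHz, each halved by the fixed divider of 2 */
	const unsigned long opp[] = { 300, 600, 720, 800, 1000 };
	unsigned long g = opp[0] / 2;

	for (int i = 1; i < 5; i++)
		g = gcd(g, opp[i] / 2);

	/* g == 10; initial timer clock is 1000/2 = 500 MHz => prescaler 50 */
	printf("GCD = %lu MHz, prescaler = %lu\n", g, (1000 / 2) / g);
	return 0;
}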
Signed-off-by: Markus Schneider-Pargmann Signed-off-by: Daniel Lezcano Tested-by: Kevin Hilman Tested-by: Patrice Chotard Tested-by: Judith Mendez Reviewed-by: Kevin Hilman Link: https://lore.kernel.org/r/20250819-topic-am43-arm-global-timer-v6-16-v2-1-6d082e2a5161@baylibre.com --- drivers/clocksource/Kconfig | 4 +-- drivers/clocksource/arm_global_timer.c | 44 +++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 0fd662f67d296a..ffcd23668763fe 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -395,8 +395,7 @@ config ARM_GLOBAL_TIMER config ARM_GT_INITIAL_PRESCALER_VAL int "ARM global timer initial prescaler value" - default 2 if ARCH_ZYNQ - default 1 + default 0 depends on ARM_GLOBAL_TIMER help When the ARM global timer initializes, its current rate is declared @@ -406,6 +405,7 @@ config ARM_GT_INITIAL_PRESCALER_VAL bounds about how much the parent clock is allowed to decrease or increase wrt the initial clock value. This affects CPU_FREQ max delta from the initial frequency. + Use 0 to use auto-detection in the driver. config ARM_TIMER_SP804 bool "Support for Dual Timer SP804 module" diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index 2d86bbc2764a04..5e3d6bb7e437ba 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -263,14 +263,13 @@ static void __init gt_delay_timer_init(void) register_current_timer_delay(>_delay_timer); } -static int __init gt_clocksource_init(void) +static int __init gt_clocksource_init(unsigned int psv) { writel(0, gt_base + GT_CONTROL); writel(0, gt_base + GT_COUNTER0); writel(0, gt_base + GT_COUNTER1); /* set prescaler and enable timer on all the cores */ - writel(FIELD_PREP(GT_CONTROL_PRESCALER_MASK, - CONFIG_ARM_GT_INITIAL_PRESCALER_VAL - 1) | + writel(FIELD_PREP(GT_CONTROL_PRESCALER_MASK, psv - 1) | GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); #ifdef CONFIG_CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK @@ -338,11 +337,45 @@ static int gt_clk_rate_change_cb(struct notifier_block *nb, return NOTIFY_DONE; } +struct gt_prescaler_config { + const char *compatible; + unsigned long prescaler; +}; + +static const struct gt_prescaler_config gt_prescaler_configs[] = { + /* + * On am43 the global timer clock is a child of the clock used for CPU + * OPPs, so the initial prescaler has to be compatible with all OPPs + * which are 300, 600, 720, 800 and 1000 with a fixed divider of 2, this + * gives us a GCD of 10. Initial frequency is 1000, so the prescaler is + * 50. 
+ */ + { .compatible = "ti,am43", .prescaler = 50 }, + { .compatible = "xlnx,zynq-7000", .prescaler = 2 }, + { .compatible = NULL } +}; + +static unsigned long gt_get_initial_prescaler_value(struct device_node *np) +{ + const struct gt_prescaler_config *config; + + if (CONFIG_ARM_GT_INITIAL_PRESCALER_VAL != 0) + return CONFIG_ARM_GT_INITIAL_PRESCALER_VAL; + + for (config = gt_prescaler_configs; config->compatible; config++) { + if (of_machine_is_compatible(config->compatible)) + return config->prescaler; + } + + return 1; +} + static int __init global_timer_of_register(struct device_node *np) { struct clk *gt_clk; static unsigned long gt_clk_rate; int err; + unsigned long psv; /* * In A9 r2p0 the comparators for each processor with the global timer @@ -378,8 +411,9 @@ static int __init global_timer_of_register(struct device_node *np) goto out_unmap; } + psv = gt_get_initial_prescaler_value(np); gt_clk_rate = clk_get_rate(gt_clk); - gt_target_rate = gt_clk_rate / CONFIG_ARM_GT_INITIAL_PRESCALER_VAL; + gt_target_rate = gt_clk_rate / psv; gt_clk_rate_change_nb.notifier_call = gt_clk_rate_change_cb; err = clk_notifier_register(gt_clk, >_clk_rate_change_nb); @@ -404,7 +438,7 @@ static int __init global_timer_of_register(struct device_node *np) } /* Register and immediately configure the timer on the boot CPU */ - err = gt_clocksource_init(); + err = gt_clocksource_init(psv); if (err) goto out_irq; From cd32e596f02fc981674573402c1138f616df1728 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Thu, 14 Aug 2025 20:33:24 +0800 Subject: [PATCH 2678/3206] clocksource/drivers/clps711x: Fix resource leaks in error paths The current implementation of clps711x_timer_init() has multiple error paths that directly return without releasing the base I/O memory mapped via of_iomap(). Fix of_iomap leaks in error paths. 
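The shape of the fix is the usual goto-based unwind; a distilled sketch
with generic names (foo_setup() is a hypothetical stand-in, and only
the error paths are shown; the actual diff below funnels all exits
through the unmap label):

#include <linux/io.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>

/* Hypothetical stand-in for the real initialisation step. */
static int foo_setup(void __iomem *base, unsigned int irq)
{
	return 0;
}

/*
 * Generic unwind sketch: once of_iomap() has succeeded, every later
 * failure must release the mapping before returning.
 */
static int __init foo_timer_init(struct device_node *np)
{
	void __iomem *base = of_iomap(np, 0);
	unsigned int irq = irq_of_parse_and_map(np, 0);
	int ret;

	if (!base)
		return -ENOMEM;

	if (!irq) {
		ret = -EINVAL;
		goto unmap_io;
	}

	ret = foo_setup(base, irq);
	if (!ret)
		return 0;

unmap_io:
	iounmap(base);
	return ret;
}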
Fixes: 04410efbb6bc ("clocksource/drivers/clps711x: Convert init function to return error")
Fixes: 2a6a8e2d9004 ("clocksource/drivers/clps711x: Remove board support")
Signed-off-by: Zhen Ni
Signed-off-by: Daniel Lezcano
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20250814123324.1516495-1-zhen.ni@easystack.cn
---
 drivers/clocksource/clps711x-timer.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/clocksource/clps711x-timer.c b/drivers/clocksource/clps711x-timer.c
index e95fdc49c2269c..bbceb0289d457a 100644
--- a/drivers/clocksource/clps711x-timer.c
+++ b/drivers/clocksource/clps711x-timer.c
@@ -78,24 +78,33 @@ static int __init clps711x_timer_init(struct device_node *np)
 	unsigned int irq = irq_of_parse_and_map(np, 0);
 	struct clk *clock = of_clk_get(np, 0);
 	void __iomem *base = of_iomap(np, 0);
+	int ret = 0;
 
 	if (!base)
 		return -ENOMEM;
-	if (!irq)
-		return -EINVAL;
-	if (IS_ERR(clock))
-		return PTR_ERR(clock);
+	if (!irq) {
+		ret = -EINVAL;
+		goto unmap_io;
+	}
+	if (IS_ERR(clock)) {
+		ret = PTR_ERR(clock);
+		goto unmap_io;
+	}
 
 	switch (of_alias_get_id(np, "timer")) {
 	case CLPS711X_CLKSRC_CLOCKSOURCE:
 		clps711x_clksrc_init(clock, base);
 		break;
 	case CLPS711X_CLKSRC_CLOCKEVENT:
-		return _clps711x_clkevt_init(clock, base, irq);
+		ret = _clps711x_clkevt_init(clock, base, irq);
+		break;
 	default:
-		return -EINVAL;
+		ret = -EINVAL;
+		break;
 	}
 
-	return 0;
+unmap_io:
+	iounmap(base);
+	return ret;
 }
 
 TIMER_OF_DECLARE(clps711x, "cirrus,ep7209-timer", clps711x_timer_init);

From 0c617a3f62100ff25c291735ff907a7ca1c084ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20S=C3=B6derlund?=
Date: Wed, 10 Sep 2025 16:26:56 +0200
Subject: [PATCH 2679/3206] clocksource/drivers/sh_cmt: Split start/stop of
 clock source and events
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CMT does housekeeping such as dealing with runtime PM and
enabling/disabling clocks when either a clock source is enabled or when
a new clock event is registered.

Doing this type of housekeeping when a clock event is registered is not
always possible, as it can happen in contexts where holding spinlocks
is not possible. However, doing it when registering a clock source is
possible.

As a first step to address this design issue, break apart the CMT start
and stop functions. The path for clock sources need not change, while
the one for clock events needs to be reworked in future work.

There is no intended functional change, just breaking the two use-cases
controlled by a flag into two distinct functions.
Signed-off-by: Niklas Söderlund Signed-off-by: Daniel Lezcano Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250910142657.1148696-2-niklas.soderlund+renesas@ragnatech.se --- drivers/clocksource/sh_cmt.c | 84 +++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index b72b36e0abed86..385eb94bbe7ce5 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -578,37 +578,74 @@ static irqreturn_t sh_cmt_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int sh_cmt_start(struct sh_cmt_channel *ch, unsigned long flag) +static int sh_cmt_start_clocksource(struct sh_cmt_channel *ch) { int ret = 0; unsigned long flags; - if (flag & FLAG_CLOCKSOURCE) - pm_runtime_get_sync(&ch->cmt->pdev->dev); + pm_runtime_get_sync(&ch->cmt->pdev->dev); raw_spin_lock_irqsave(&ch->lock, flags); - if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { - if (flag & FLAG_CLOCKEVENT) - pm_runtime_get_sync(&ch->cmt->pdev->dev); + if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) ret = sh_cmt_enable(ch); - } if (ret) goto out; - ch->flags |= flag; + + ch->flags |= FLAG_CLOCKSOURCE; /* setup timeout if no clockevent */ - if (ch->cmt->num_channels == 1 && - flag == FLAG_CLOCKSOURCE && (!(ch->flags & FLAG_CLOCKEVENT))) + if (ch->cmt->num_channels == 1 && !(ch->flags & FLAG_CLOCKEVENT)) __sh_cmt_set_next(ch, ch->max_match_value); +out: + raw_spin_unlock_irqrestore(&ch->lock, flags); + + return ret; +} + +static void sh_cmt_stop_clocksource(struct sh_cmt_channel *ch) +{ + unsigned long flags; + unsigned long f; + + raw_spin_lock_irqsave(&ch->lock, flags); + + f = ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE); + + ch->flags &= ~FLAG_CLOCKSOURCE; + + if (f && !(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) + sh_cmt_disable(ch); + + raw_spin_unlock_irqrestore(&ch->lock, flags); + + pm_runtime_put(&ch->cmt->pdev->dev); +} + +static int sh_cmt_start_clockevent(struct sh_cmt_channel *ch) +{ + int ret = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&ch->lock, flags); + + if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { + pm_runtime_get_sync(&ch->cmt->pdev->dev); + ret = sh_cmt_enable(ch); + } + + if (ret) + goto out; + + ch->flags |= FLAG_CLOCKEVENT; out: raw_spin_unlock_irqrestore(&ch->lock, flags); return ret; } -static void sh_cmt_stop(struct sh_cmt_channel *ch, unsigned long flag) +static void sh_cmt_stop_clockevent(struct sh_cmt_channel *ch) { unsigned long flags; unsigned long f; @@ -616,22 +653,19 @@ static void sh_cmt_stop(struct sh_cmt_channel *ch, unsigned long flag) raw_spin_lock_irqsave(&ch->lock, flags); f = ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE); - ch->flags &= ~flag; + + ch->flags &= ~FLAG_CLOCKEVENT; if (f && !(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { sh_cmt_disable(ch); - if (flag & FLAG_CLOCKEVENT) - pm_runtime_put(&ch->cmt->pdev->dev); + pm_runtime_put(&ch->cmt->pdev->dev); } /* adjust the timeout to maximum if only clocksource left */ - if ((flag == FLAG_CLOCKEVENT) && (ch->flags & FLAG_CLOCKSOURCE)) + if (ch->flags & FLAG_CLOCKSOURCE) __sh_cmt_set_next(ch, ch->max_match_value); raw_spin_unlock_irqrestore(&ch->lock, flags); - - if (flag & FLAG_CLOCKSOURCE) - pm_runtime_put(&ch->cmt->pdev->dev); } static struct sh_cmt_channel *cs_to_sh_cmt(struct clocksource *cs) @@ -672,7 +706,7 @@ static int sh_cmt_clocksource_enable(struct clocksource *cs) ch->total_cycles = 0; - ret = sh_cmt_start(ch, FLAG_CLOCKSOURCE); 
+ ret = sh_cmt_start_clocksource(ch); if (!ret) ch->cs_enabled = true; @@ -685,7 +719,7 @@ static void sh_cmt_clocksource_disable(struct clocksource *cs) WARN_ON(!ch->cs_enabled); - sh_cmt_stop(ch, FLAG_CLOCKSOURCE); + sh_cmt_stop_clocksource(ch); ch->cs_enabled = false; } @@ -696,7 +730,7 @@ static void sh_cmt_clocksource_suspend(struct clocksource *cs) if (!ch->cs_enabled) return; - sh_cmt_stop(ch, FLAG_CLOCKSOURCE); + sh_cmt_stop_clocksource(ch); dev_pm_genpd_suspend(&ch->cmt->pdev->dev); } @@ -708,7 +742,7 @@ static void sh_cmt_clocksource_resume(struct clocksource *cs) return; dev_pm_genpd_resume(&ch->cmt->pdev->dev); - sh_cmt_start(ch, FLAG_CLOCKSOURCE); + sh_cmt_start_clocksource(ch); } static int sh_cmt_register_clocksource(struct sh_cmt_channel *ch, @@ -740,7 +774,7 @@ static struct sh_cmt_channel *ced_to_sh_cmt(struct clock_event_device *ced) static void sh_cmt_clock_event_start(struct sh_cmt_channel *ch, int periodic) { - sh_cmt_start(ch, FLAG_CLOCKEVENT); + sh_cmt_start_clockevent(ch); if (periodic) sh_cmt_set_next(ch, ((ch->cmt->rate + HZ/2) / HZ) - 1); @@ -752,7 +786,7 @@ static int sh_cmt_clock_event_shutdown(struct clock_event_device *ced) { struct sh_cmt_channel *ch = ced_to_sh_cmt(ced); - sh_cmt_stop(ch, FLAG_CLOCKEVENT); + sh_cmt_stop_clockevent(ch); return 0; } @@ -763,7 +797,7 @@ static int sh_cmt_clock_event_set_state(struct clock_event_device *ced, /* deal with old setting first */ if (clockevent_state_oneshot(ced) || clockevent_state_periodic(ced)) - sh_cmt_stop(ch, FLAG_CLOCKEVENT); + sh_cmt_stop_clockevent(ch); dev_info(&ch->cmt->pdev->dev, "ch%u: used for %s clock events\n", ch->index, periodic ? "periodic" : "oneshot"); From 285213a65e91d0295751d740e2320d8fcd75d56e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Sep 2025 05:19:16 -0600 Subject: [PATCH 2680/3206] MAINTAINERS: update io_uring and block tree git trees Move to using the git.kernel.org trees as the canonical trees for both the block and io_uring tree. Signed-off-by: Jens Axboe --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index b47daf498a97bc..60d29ff64a5576 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6224,7 +6224,7 @@ M: Josef Bacik M: Jens Axboe L: cgroups@vger.kernel.org L: linux-block@vger.kernel.org -T: git git://git.kernel.dk/linux-block +T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git F: Documentation/admin-guide/cgroup-v1/blkio-controller.rst F: block/bfq-cgroup.c F: block/blk-cgroup.c @@ -12860,8 +12860,8 @@ IO_URING M: Jens Axboe L: io-uring@vger.kernel.org S: Maintained -T: git git://git.kernel.dk/linux-block -T: git git://git.kernel.dk/liburing +T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/liburing.git F: include/linux/io_uring/ F: include/linux/io_uring.h F: include/linux/io_uring_types.h From bc7d684fea18cc48c3630d2b7f1789000ff2df5b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 18 Sep 2025 08:12:53 +1000 Subject: [PATCH 2681/3206] xfs: rearrange code in xfs_inode_item_precommit There are similar extsize checks and updates done inside and outside the inode item lock, which could all be done under a single top level logic branch outside the ili_lock. The COW extsize fixup can potentially miss updating the XFS_ILOG_CORE in ili_fsync_fields, so moving this code up above the ili_fsync_fields update could also be considered a fix. 
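To make the ordering concrete, here is a small standalone C program modelling the hazard described above; all names are illustrative stand-ins, not the XFS code. If the fdatasync field tracking runs before a fixup that ORs in a core-change flag, that flag is silently lost:

/* toy_order.c - build with: gcc -o toy_order toy_order.c */
#include <stdio.h>

#define ILOG_CORE      (1u << 0)   /* stands in for XFS_ILOG_CORE */
#define ILOG_IVERSION  (1u << 1)   /* stands in for XFS_ILOG_IVERSION */

static unsigned int fsync_fields;  /* stands in for ili_fsync_fields */

static void record_fsync_fields(unsigned int flags)
{
        fsync_fields |= (flags & ~ILOG_IVERSION);
}

int main(void)
{
        unsigned int flags = ILOG_IVERSION;

        /* Buggy order: record first, run the hint fixup afterwards. */
        record_fsync_fields(flags);
        flags |= ILOG_CORE;     /* fixup happens too late to be tracked */
        printf("buggy order: fsync_fields=%#x (core flag lost)\n", fsync_fields);

        /* Fixed order: all fixups complete before the fields are recorded. */
        fsync_fields = 0;
        flags = ILOG_IVERSION | ILOG_CORE;
        record_fsync_fields(flags);
        printf("fixed order: fsync_fields=%#x\n", fsync_fields);
        return 0;
}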
Further, to make the next change a bit cleaner, move where we calculate the on-disk flag mask to after we attach the cluster buffer to the inode log item. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_inode_item.c | 65 ++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 829675700fcdd4..678ca95793e0af 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -131,46 +131,28 @@ xfs_inode_item_precommit( } /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. + * Inode verifiers do not check that the extent size hints are an + * integer multiple of the rt extent size on a directory with + * rtinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the bad hints. */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) { + if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it - * to XFS_ILOG_CORE so that the actual on-disk dirty tracking - * (ili_fields) correctly tracks that the version has changed. - */ spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); - if (flags & XFS_ILOG_IVERSION) - flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); - - /* - * Inode verifiers do not check that the CoW extent size hint is an - * integer multiple of the rt extent size on a directory with both - * rtinherit and cowextsize flags set. If we're logging a directory - * that is misconfigured in this way, clear the hint. - */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { - ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; - ip->i_cowextsize = 0; - flags |= XFS_ILOG_CORE; - } - if (!iip->ili_item.li_buf) { struct xfs_buf *bp; int error; @@ -204,6 +186,17 @@ xfs_inode_item_precommit( xfs_trans_brelse(tp, bp); } + /* + * Record the specific change for fdatasync optimisation. This allows + * fdatasync to skip log forces for inodes that are only timestamp + * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it + * to XFS_ILOG_CORE so that the actual on-disk dirty tracking + * (ili_fields) correctly tracks that the version has changed.
+ */ + iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + /* * Always OR in the bits from the ili_last_fields field. This is to * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines From c91d38b57f2c4784d885c874b2a1234a01361afd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 18 Sep 2025 08:12:54 +1000 Subject: [PATCH 2682/3206] xfs: rework datasync tracking and execution Jan Kara reported that the shared ILOCK held across the journal flush during fdatasync operations slows down O_DSYNC DIO on unwritten extents significantly. The underlying issue is that unwritten extent conversion needs the ILOCK exclusive, whilst the datasync operation after the extent conversion holds it shared. Hence we cannot be flushing the journal for one IO completion whilst at the same time doing unwritten extent conversion on another IO completion on the same inode. This means that IO completions lock-step, and IO performance is dependent on the journal flush latency. Jan demonstrated that reducing the fdatasync lock hold time can improve O_DSYNC DIO to unwritten extents performance by 2.5x. Discussion on that patch found issues with the method, and we came to the conclusion that separately tracking datasync flush sequences was the best approach to solving the problem. The fsync code uses the ILOCK to serialise against concurrent modifications in the transaction commit phase. In a transaction commit, there are several disjoint updates to inode log item state that need to be considered atomically by the fsync code. These operations are all done under ILOCK_EXCL context: 1. ili_fsync_fields is updated in ->iop_precommit 2. i_pincount is updated in ->iop_pin before it is added to the CIL 3. ili_commit_seq is updated in ->iop_committing, after it has been added to the CIL In fsync, we need to: 1. check that the inode is dirty in the journal (ipincount) 2. check that ili_fsync_fields is set 3. grab the ili_commit_seq if a journal flush is needed 4. clear the ili_fsync_fields to ensure that new modifications that require fsync are tracked in ->iop_precommit correctly The serialisation of ipincount/ili_commit_seq is needed to ensure that we don't try to unnecessarily flush the journal. The serialisation of ili_fsync_fields being set in ->iop_precommit and cleared in fsync post journal flush is required for correctness. Hence holding the ILOCK_SHARED in xfs_file_fsync() performs all this serialisation for us. Ideally, we want to remove the need to hold the ILOCK_SHARED in xfs_file_fsync() for best performance. We start with the observation that fsync/fdatasync() only need to wait for operations that have been completed. Hence operations that are still being committed have not completed and datasync operations do not need to wait for them. This means we can use a single point in time in the commit process to signal "this modification is complete". This is what ->iop_committing is supposed to provide - it is the point at which the object is unlocked after the modification has been recorded in the CIL. Hence we could use ili_commit_seq to determine if we should flush the journal. In theory, we can already do this. However, in practice this will expose an internal global CIL lock to the IO path. The ipincount() checks optimise away the need to take this lock - if the inode is not pinned, then it is not in the CIL and we don't need to check if a journal flush at ili_commit_seq needs to be performed.
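A compilable toy model of that short-circuit (illustrative names only, not the XFS code): an unpinned inode is known not to be in the CIL, so the expensive sequence lookup is skipped entirely:

/* toy_pincount.c - build with: gcc -o toy_pincount toy_pincount.c */
#include <stdio.h>
#include <stdatomic.h>

struct toy_inode {
        atomic_int pincount;               /* stands in for i_pincount */
        unsigned long long commit_seq;     /* stands in for ili_commit_seq */
};

static unsigned long long fsync_flush_seq(struct toy_inode *ip)
{
        /* Unpinned => not in the CIL => nothing to flush, no lock needed. */
        if (atomic_load(&ip->pincount) == 0)
                return 0;
        /* Only now would the (lock-protected) sequence state be consulted. */
        return ip->commit_seq;
}

int main(void)
{
        struct toy_inode ip = { .commit_seq = 42 };

        printf("unpinned: flush to seq %llu\n", fsync_flush_seq(&ip));
        atomic_store(&ip.pincount, 1);
        printf("pinned:   flush to seq %llu\n", fsync_flush_seq(&ip));
        return 0;
}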
The reason this is needed is that the ili_commit_seq is never cleared. Once it is set, it remains set even once the journal has been committed and the object has been unpinned. Hence we have to look at the journal internal commit sequence state to determine if ili_commit_seq needs to be acted on or not. We can solve this by clearing ili_commit_seq when the inode is unpinned. If we clear it atomically with the last unpin going away, then we are guaranteed that new modifications will order correctly as they add new pin counts and we won't clear a sequence number for an active modification in the CIL. Further, we can then allow the per-transaction flag state to propagate into ->iop_committing (instead of clearing it in ->iop_precommit) and that will allow us to determine if the modification needs a full fsync or just a datasync, and so we can record a separate datasync sequence number (Jan's idea!) and then use that in the fdatasync path instead of the full fsync sequence number. With this infrastructure in place, we no longer need the ILOCK_SHARED in the fsync path. All serialisation is done against the commit sequence numbers - if the sequence number is set, then we have to flush the journal. If it is not set, then we have nothing to do. This greatly simplifies the fsync implementation.... Signed-off-by: Dave Chinner Tested-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_file.c | 75 +++++++++++++++++++---------------- fs/xfs/xfs_inode.c | 25 +++++++++----- fs/xfs/xfs_inode_item.c | 72 ++++++++++++++++++++++++++++--------- fs/xfs/xfs_inode_item.h | 10 +++++- fs/xfs/xfs_iomap.c | 15 +++++++-- 5 files changed, 128 insertions(+), 69 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f96fbf5c54c999..2702fef2c90cd2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -75,52 +75,47 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } -static xfs_csn_t -xfs_fsync_seq( - struct xfs_inode *ip, - bool datasync) -{ - if (!xfs_ipincount(ip)) - return 0; - if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - return 0; - return ip->i_itemp->ili_commit_seq; -} - /* - * All metadata updates are logged, which means that we just have to flush the - * log up to the latest LSN that touched the inode. + * All metadata updates are logged, which means that we just have to push the + * journal to the required sequence number that holds the updates. We track + * datasync commits separately to full sync commits, and hence only need to + * select the correct sequence number for the log force here. * - * If we have concurrent fsync/fdatasync() calls, we need them to all block on - * the log force before we clear the ili_fsync_fields field. This ensures that - * we don't get a racing sync operation that does not wait for the metadata to - * hit the journal before returning. If we race with clearing ili_fsync_fields, - * then all that will happen is the log force will do nothing as the lsn will - * already be on disk. We can't race with setting ili_fsync_fields because that - * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock - * shared until after the ili_fsync_fields is cleared. + * We don't have to serialise against concurrent modifications, as we do not + * have to wait for modifications that have not yet completed.
We define a + * transaction commit as completing when the commit sequence number is updated, + * hence if the sequence number has not been updated, the sync operation has been + * run before the commit completed and we don't have to wait for it. + * + * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain + * set on the log item until - at least - the journal flush completes. In + * reality, they are only cleared when the inode is fully unpinned (i.e. + * persistent in the journal and not dirty in the CIL), and so we rely on + * xfs_log_force_seq() either skipping sequences that have been persisted or + * waiting on sequences that are still in flight to correctly order concurrent + * sync operations. */ -static int +static int xfs_fsync_flush_log( struct xfs_inode *ip, bool datasync, int *log_flushed) { - int error = 0; - xfs_csn_t seq; + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - seq = xfs_fsync_seq(ip, datasync); - if (seq) { - error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, - log_flushed); + spin_lock(&iip->ili_lock); + if (datasync) + seq = iip->ili_datasync_seq; + else + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); - spin_lock(&ip->i_itemp->ili_lock); - ip->i_itemp->ili_fsync_fields = 0; - spin_unlock(&ip->i_itemp->ili_lock); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!seq) + return 0; + + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + log_flushed); } STATIC int @@ -158,12 +153,10 @@ xfs_file_fsync( error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* - * Any inode that has dirty modifications in the log is pinned. The - * racy check here for a pinned inode will not catch modifications - * that happen concurrently to the fsync call, but fsync semantics - * only require to sync previously completed I/O. + * If the inode has an inode log item attached, it may need the journal + * flushed to persist any changes the log item might be tracking.
*/ - if (xfs_ipincount(ip)) { + if (ip->i_itemp) { err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); if (err2 && !error) error = err2; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2bfe87b09c2ada..047a5260cf7084 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1667,7 +1667,6 @@ xfs_ifree_mark_inode_stale( spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); @@ -1832,12 +1831,20 @@ static void xfs_iunpin( struct xfs_inode *ip) { - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); + if (!seq) + return; /* Give the log a push to start the unpinning I/O */ - xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); + xfs_log_force_seq(ip->i_mount, seq, 0, NULL); } @@ -2504,7 +2511,6 @@ xfs_iflush( spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); spin_unlock(&iip->ili_lock); @@ -2663,12 +2669,15 @@ int xfs_log_force_inode( struct xfs_inode *ip) { + struct xfs_inode_log_item *iip = ip->i_itemp; xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - seq = ip->i_itemp->ili_commit_seq; - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!iip) + return 0; + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); if (!seq) return 0; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 678ca95793e0af..1bd411a1114c7e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -187,13 +187,16 @@ xfs_inode_item_precommit( } /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it - * to XFS_ILOG_CORE so that the actual on-disk dirty tracking - * (ili_fields) correctly tracks that the version has changed. + * Store the dirty flags back into the inode item as this state is used + * later on in xfs_inode_item_committing() to determine whether the + * transaction is relevant to fsync state or not. + */ + iip->ili_dirty_flags = flags; + + /* + * Convert the flags on-disk fields that have been modified in the + * transaction so that ili_fields tracks the changes correctly. */ - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); if (flags & XFS_ILOG_IVERSION) flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); @@ -207,12 +210,6 @@ xfs_inode_item_precommit( spin_unlock(&iip->ili_lock); xfs_inode_item_precommit_check(ip); - - /* - * We are done with the log item transaction dirty state, so clear it so - * that it doesn't pollute future transactions. 
- */ - iip->ili_dirty_flags = 0; return 0; } @@ -722,13 +719,24 @@ xfs_inode_item_unpin( struct xfs_log_item *lip, int remove) { - struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); - if (atomic_dec_and_test(&ip->i_pincount)) + + /* + * If this is the last unpin, then the inode no longer needs a journal + * flush to persist it. Hence we can clear the commit sequence numbers + * as a fsync/fdatasync operation on the inode at this point is a no-op. + */ + if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) { + iip->ili_commit_seq = 0; + iip->ili_datasync_seq = 0; + spin_unlock(&iip->ili_lock); wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + } } STATIC uint @@ -851,12 +859,45 @@ xfs_inode_item_committed( return lsn; } +/* + * The modification is now complete, so before we unlock the inode we need to + * update the commit sequence numbers for data integrity journal flushes. We + * always record the commit sequence number (ili_commit_seq) so that anything + * that needs a full journal sync will capture all of this modification. + * + * We then + * check if the changes will impact a datasync (O_DSYNC) journal flush. If the + * changes will require a datasync flush, then we also record the sequence in + * ili_datasync_seq. + * + * These commit sequence numbers will get cleared atomically with the inode being + * unpinned (i.e. pin count goes to zero), and so it will only be set when the + * inode is dirty in the journal. This removes the need for checking if the + * inode is pinned to determine if a journal flush is necessary, and hence + * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to + * serialise pin counts against commit sequence number updates. + * + */ STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, xfs_csn_t seq) { - INODE_ITEM(lip)->ili_commit_seq = seq; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + spin_lock(&iip->ili_lock); + iip->ili_commit_seq = seq; + if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP)) + iip->ili_datasync_seq = seq; + spin_unlock(&iip->ili_lock); + + /* + * Clear the per-transaction dirty flags now that we have finished + * recording the transaction's inode modifications in the CIL and are + * about to release and (maybe) unlock the inode. + */ + iip->ili_dirty_flags = 0; + return xfs_inode_item_release(lip); } @@ -1048,7 +1089,6 @@ xfs_iflush_abort_clean( { iip->ili_last_fields = 0; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; iip->ili_flush_lsn = 0; iip->ili_item.li_buf = NULL; list_del_init(&iip->ili_item.li_bio_list); diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index ba92ce11a01111..2ddcca41714f7a 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -32,9 +32,17 @@ struct xfs_inode_log_item { spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - unsigned int ili_fsync_fields; /* logged since last fsync */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + + /* + * We record the sequence number for every inode modification, as + * well as those that only require fdatasync operations for data + * integrity. 
This allows optimisation of the O_DSYNC/fdatasync path + * without needing to track what modifications the journal is currently + * carrying for the inode. These are protected by the above ili_lock. + */ xfs_csn_t ili_commit_seq; /* last transaction commit */ + xfs_csn_t ili_datasync_seq; /* for datasync optimisation */ }; static inline int xfs_inode_clean(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2570d0a6604738..d3f6e3e42a1191 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -149,9 +149,18 @@ xfs_bmbt_to_iomap( iomap->bdev = target->bt_bdev; iomap->flags = iomap_flags; - if (xfs_ipincount(ip) && - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; + /* + * If the inode is dirty for datasync purposes, let iomap know so it + * doesn't elide the IO completion journal flushes on O_DSYNC IO. + */ + if (ip->i_itemp) { + struct xfs_inode_log_item *iip = ip->i_itemp; + + spin_lock(&iip->ili_lock); + if (iip->ili_datasync_seq) + iomap->flags |= IOMAP_F_DIRTY; + spin_unlock(&iip->ili_lock); + } iomap->validity_cookie = sequence_cookie; return 0; From 559f2eacc8a23c7f44daac09d4f3efd958d497f2 Mon Sep 17 00:00:00 2001 From: Huisong Li Date: Tue, 23 Sep 2025 11:24:28 +0800 Subject: [PATCH 2683/3206] ACPI: processor: Do not expose global variable acpi_idle_driver Move the cpuidle driver check from __acpi_processor_start() to acpi_processor_power_init() which allows variable acpi_idle_driver to become static. No intentional functional impact. Signed-off-by: Huisong Li Link: https://patch.msgid.link/20250923032428.2656329-2-lihuisong@huawei.com [ rjw: Subject tweak, new changelog, adjustment of a new comment ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/processor_driver.c | 3 +-- drivers/acpi/processor_idle.c | 9 ++++++++- include/acpi/processor.h | 1 - 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index de17c141267809..5d824435b26be3 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -166,8 +166,7 @@ static int __acpi_processor_start(struct acpi_device *device) if (result && !IS_ENABLED(CONFIG_ACPI_CPU_FREQ_PSS)) dev_dbg(&device->dev, "CPPC data invalid or not present\n"); - if (cpuidle_get_driver() == &acpi_idle_driver) - acpi_processor_power_init(pr); + acpi_processor_power_init(pr); acpi_pss_perf_init(pr); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 698d14c1958737..22b051b94a86c1 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -51,7 +51,7 @@ module_param(latency_factor, uint, 0644); static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device); -struct cpuidle_driver acpi_idle_driver = { +static struct cpuidle_driver acpi_idle_driver = { .name = "acpi_idle", .owner = THIS_MODULE, }; @@ -1404,6 +1404,13 @@ void acpi_processor_power_init(struct acpi_processor *pr) { struct cpuidle_device *dev; + /* + * The code below only works if the current cpuidle driver is the ACPI + * idle driver. 
+ */ + if (cpuidle_get_driver() != &acpi_idle_driver) + return; + if (disabled_by_idle_boot_param()) return; diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 24fdaa3c28992e..7146a8e9e9c25b 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -417,7 +417,6 @@ static inline void acpi_processor_throttling_init(void) {} #endif /* CONFIG_ACPI_CPU_FREQ_PSS */ /* in processor_idle.c */ -extern struct cpuidle_driver acpi_idle_driver; #ifdef CONFIG_ACPI_PROCESSOR_IDLE void acpi_processor_power_init(struct acpi_processor *pr); void acpi_processor_power_exit(struct acpi_processor *pr); From 496f9372eae14775e0524e83e952814691fe850a Mon Sep 17 00:00:00 2001 From: Amir Mohammad Jahangirzad Date: Tue, 23 Sep 2025 05:01:13 +0330 Subject: [PATCH 2684/3206] ACPI: debug: fix signedness issues in read/write helpers In the ACPI debugger interface, the helper functions for read and write operations use "int" as the length parameter data type. When a large "size_t count" is passed from the file operations, this cast to "int" results in truncation and a negative value due to signed integer representation. Logically, this negative number propagates to the min() calculation, where it is selected over the positive buffer space value, leading to unexpected behavior. Subsequently, when this negative value is used in copy_to_user() or copy_from_user(), it is interpreted as a large positive value due to the unsigned nature of the size parameter in these functions, causing the copy operations to attempt handling sizes far beyond the intended buffer limits. Address the issue by: - Changing the length parameters in acpi_aml_read_user() and acpi_aml_write_user() from "int" to "size_t", aligning with the expected unsigned size semantics. - Updating return types and local variables in acpi_aml_read() and acpi_aml_write() to "ssize_t" for consistency with kernel file operation conventions. - Using "size_t" for the "n" variable to ensure calculations remain unsigned. - Using min_t() for circ_count_to_end() and circ_space_to_end() to ensure type-safe comparisons and prevent integer overflow. Signed-off-by: Amir Mohammad Jahangirzad Link: https://patch.msgid.link/20250923013113.20615-1-a.jahangirzad@gmail.com [ rjw: Changelog tweaks, local variable definitions ordering adjustments ] Fixes: 8cfb0cdf07e2 ("ACPI / debugger: Add IO interface to access debugger functionalities") Cc: 4.5+ # 4.5+ Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpi_dbg.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/acpi/acpi_dbg.c b/drivers/acpi/acpi_dbg.c index d50261d05f3a1a..515b20d0b698a4 100644 --- a/drivers/acpi/acpi_dbg.c +++ b/drivers/acpi/acpi_dbg.c @@ -569,11 +569,11 @@ static int acpi_aml_release(struct inode *inode, struct file *file) return 0; } -static int acpi_aml_read_user(char __user *buf, int len) +static ssize_t acpi_aml_read_user(char __user *buf, size_t len) { - int ret; struct circ_buf *crc = &acpi_aml_io.out_crc; - int n; + ssize_t ret; + size_t n; char *p; ret = acpi_aml_lock_read(crc, ACPI_AML_OUT_USER); @@ -582,7 +582,7 @@ static int acpi_aml_read_user(char __user *buf, int len) /* sync head before removing logs */ smp_rmb(); p = &crc->buf[crc->tail]; - n = min(len, circ_count_to_end(crc)); + n = min_t(size_t, len, circ_count_to_end(crc)); if (copy_to_user(buf, p, n)) { ret = -EFAULT; goto out; @@ -599,8 +599,8 @@ static ssize_t acpi_aml_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - int ret = 0; - int size = 0; + ssize_t ret = 0; + ssize_t size = 0; if (!count) return 0; @@ -639,11 +639,11 @@ static ssize_t acpi_aml_read(struct file *file, char __user *buf, return size > 0 ? size : ret; } -static int acpi_aml_write_user(const char __user *buf, int len) +static ssize_t acpi_aml_write_user(const char __user *buf, size_t len) { - int ret; struct circ_buf *crc = &acpi_aml_io.in_crc; - int n; + ssize_t ret; + size_t n; char *p; ret = acpi_aml_lock_write(crc, ACPI_AML_IN_USER); @@ -652,7 +652,7 @@ static int acpi_aml_write_user(const char __user *buf, int len) /* sync tail before inserting cmds */ smp_mb(); p = &crc->buf[crc->head]; - n = min(len, circ_space_to_end(crc)); + n = min_t(size_t, len, circ_space_to_end(crc)); if (copy_from_user(p, buf, n)) { ret = -EFAULT; goto out; @@ -663,14 +663,14 @@ static int acpi_aml_write_user(const char __user *buf, int len) ret = n; out: acpi_aml_unlock_fifo(ACPI_AML_IN_USER, ret >= 0); - return n; + return ret; } static ssize_t acpi_aml_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - int ret = 0; - int size = 0; + ssize_t ret = 0; + ssize_t size = 0; if (!count) return 0; From 5fc4ab3269dea6a0b00c7256cb6f6c0101b6a44b Mon Sep 17 00:00:00 2001 From: Louis-Alexis Eyraud Date: Tue, 23 Sep 2025 10:52:12 +0200 Subject: [PATCH 2685/3206] pmdomain: mediatek: set default off flag for MT8195 AUDIO power domain In the MT8195 power domain data array, set the KEEP_DEFAULT_OFF and ACTIVE_WAKEUP flags for the AUDIO power domain entry to avoid this domain being powered on during the boot sequence when it is not needed.
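As a compilable illustration of the capability flags being set (the macro values below are invented for the example; the real definitions live in the MediaTek pmdomain headers):

/* toy_caps.c - build with: gcc -o toy_caps toy_caps.c */
#include <stdio.h>

#define MTK_SCPD_ACTIVE_WAKEUP    (1u << 0)   /* illustrative value */
#define MTK_SCPD_KEEP_DEFAULT_OFF (1u << 1)   /* illustrative value */

struct toy_domain {
        const char *name;
        unsigned int caps;
};

static const struct toy_domain audio = {
        .name = "audio",
        .caps = MTK_SCPD_KEEP_DEFAULT_OFF | MTK_SCPD_ACTIVE_WAKEUP,
};

int main(void)
{
        /* A provider would leave such a domain off until a consumer needs it. */
        if (audio.caps & MTK_SCPD_KEEP_DEFAULT_OFF)
                printf("%s: not powered on by default at boot\n", audio.name);
        return 0;
}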
Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Signed-off-by: Louis-Alexis Eyraud Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Ulf Hansson --- drivers/pmdomain/mediatek/mt8195-pm-domains.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pmdomain/mediatek/mt8195-pm-domains.h b/drivers/pmdomain/mediatek/mt8195-pm-domains.h index 59aa031ae6323b..414dfe82361f20 100644 --- a/drivers/pmdomain/mediatek/mt8195-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8195-pm-domains.h @@ -123,6 +123,7 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8195[] = { MT8195_TOP_AXI_PROT_EN_2_CLR, MT8195_TOP_AXI_PROT_EN_2_STA1), }, + .caps = MTK_SCPD_KEEP_DEFAULT_OFF | MTK_SCPD_ACTIVE_WAKEUP, }, [MT8195_POWER_DOMAIN_MFG0] = { .name = "mfg0", From f90213261681268b6dc4c944503ddcc20e15f8fe Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:23:56 +0100 Subject: [PATCH 2686/3206] bpf: refactor special field-type detection Reduce code duplication in detection of the known special field types in map values. This refactoring helps to avoid copying a chunk of code in the next patch of the series. Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250923112404.668720-2-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 84 +++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 51 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 64739308902f7a..c51e16bbf0c183 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3478,60 +3478,44 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, return BTF_FIELD_FOUND; } -#define field_mask_test_name(field_type, field_type_str) \ - if (field_mask & field_type && !strcmp(name, field_type_str)) { \ - type = field_type; \ - goto end; \ - } - static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type, - u32 field_mask, u32 *seen_mask, - int *align, int *sz) -{ - int type = 0; + u32 field_mask, u32 *seen_mask, int *align, int *sz) +{ + const struct { + enum btf_field_type type; + const char *const name; + const bool is_unique; + } field_types[] = { + { BPF_SPIN_LOCK, "bpf_spin_lock", true }, + { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, + { BPF_TIMER, "bpf_timer", true }, + { BPF_WORKQUEUE, "bpf_wq", true }, + { BPF_LIST_HEAD, "bpf_list_head", false }, + { BPF_LIST_NODE, "bpf_list_node", false }, + { BPF_RB_ROOT, "bpf_rb_root", false }, + { BPF_RB_NODE, "bpf_rb_node", false }, + { BPF_REFCOUNT, "bpf_refcount", false }, + }; + int type = 0, i; const char *name = __btf_name_by_offset(btf, var_type->name_off); - - if (field_mask & BPF_SPIN_LOCK) { - if (!strcmp(name, "bpf_spin_lock")) { - if (*seen_mask & BPF_SPIN_LOCK) - return -E2BIG; - *seen_mask |= BPF_SPIN_LOCK; - type = BPF_SPIN_LOCK; - goto end; - } - } - if (field_mask & BPF_RES_SPIN_LOCK) { - if (!strcmp(name, "bpf_res_spin_lock")) { - if (*seen_mask & BPF_RES_SPIN_LOCK) - return -E2BIG; - *seen_mask |= BPF_RES_SPIN_LOCK; - type = BPF_RES_SPIN_LOCK; - goto end; - } - } - if (field_mask & BPF_TIMER) { - if (!strcmp(name, "bpf_timer")) { - if (*seen_mask & BPF_TIMER) - return -E2BIG; - *seen_mask |= BPF_TIMER; - type = BPF_TIMER; - goto end; - } - } - if (field_mask & BPF_WORKQUEUE) { - if (!strcmp(name, "bpf_wq")) { - if (*seen_mask & BPF_WORKQUEUE) + const char 
*field_type_name; + enum btf_field_type field_type; + bool is_unique; + + for (i = 0; i < ARRAY_SIZE(field_types); ++i) { + field_type = field_types[i].type; + field_type_name = field_types[i].name; + is_unique = field_types[i].is_unique; + if (!(field_mask & field_type) || strcmp(name, field_type_name)) + continue; + if (is_unique) { + if (*seen_mask & field_type) return -E2BIG; - *seen_mask |= BPF_WORKQUEUE; - type = BPF_WORKQUEUE; - goto end; + *seen_mask |= field_type; } + type = field_type; + goto end; } - field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); - field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); - field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); - field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); - field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { @@ -3545,8 +3529,6 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ return type; } -#undef field_mask_test_name - /* Repeat a number of fields for a specified number of times. * * Copy the fields starting from the first field and repeat them for From 5eab266b801f4b938fea6d112c560c84ab489627 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:23:57 +0100 Subject: [PATCH 2687/3206] bpf: extract generic helper from process_timer_func() Refactor the verifier by pulling the common logic from process_timer_func() into a dedicated helper. This allows reusing process_async_func() helper for verifying bpf_task_work struct in the next patch. Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Tested-by: syzbot@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20250923112404.668720-3-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 47 +++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d4183bc3cd1e9..c1b726fb22c8ee 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8431,34 +8431,59 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) return 0; } -static int process_timer_func(struct bpf_verifier_env *env, int regno, - struct bpf_call_arg_meta *meta) +/* Check if @regno is a pointer to a specific field in a map value */ +static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, + enum btf_field_type field_type) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; + const char *struct_name = btf_field_type_name(field_type); + int field_off = -1; if (!is_const) { verbose(env, - "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset. 
%s has to be at the constant offset\n", + regno, struct_name); return -EINVAL; } if (!map->btf) { - verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n", - map->name); + verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name, + struct_name); return -EINVAL; } - if (!btf_record_has_field(map->record, BPF_TIMER)) { - verbose(env, "map '%s' has no valid bpf_timer\n", map->name); + if (!btf_record_has_field(map->record, field_type)) { + verbose(env, "map '%s' has no valid %s\n", map->name, struct_name); return -EINVAL; } - if (map->record->timer_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", - val + reg->off, map->record->timer_off); + switch (field_type) { + case BPF_TIMER: + field_off = map->record->timer_off; + break; + default: + verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; } + if (field_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n", + val + reg->off, struct_name, field_off); + return -EINVAL; + } + return 0; +} + +static int process_timer_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_map_field_pointer(env, regno, BPF_TIMER); + if (err) + return err; + if (meta->map_ptr) { verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; From acc3a0d2506c1b8186e9190adcd5bee4a1932abc Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:23:58 +0100 Subject: [PATCH 2688/3206] bpf: htab: extract helper for freeing special structs Extract the cleanup of known embedded structs into the dedicated helper. Remove duplication and introduce a single source of truth for freeing special embedded structs in hashtab. 
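The shape of the refactor, as a self-contained toy (names are illustrative, not the hashtab code): both the preallocated and the malloced teardown paths funnel through one helper, so a new special field type only needs to be handled in one place:

/* toy_refactor.c - build with: gcc -o toy_refactor toy_refactor.c */
#include <stdio.h>

struct elem { int has_timer, has_wq; };

/* Single source of truth for tearing down special embedded structs. */
static void free_internal_structs(struct elem *e)
{
        if (e->has_timer)
                printf("cancel and free timer\n");
        if (e->has_wq)
                printf("cancel and free workqueue\n");
}

/* Both teardown paths share the helper instead of open-coding the checks. */
static void free_prealloced(struct elem *tab, int n)
{
        for (int i = 0; i < n; i++)
                free_internal_structs(&tab[i]);
}

static void free_malloced(struct elem *tab, int n)
{
        for (int i = 0; i < n; i++)     /* the real code walks hash buckets */
                free_internal_structs(&tab[i]);
}

int main(void)
{
        struct elem tab[2] = { { 1, 0 }, { 1, 1 } };

        free_prealloced(tab, 2);
        free_malloced(tab, 2);
        return 0;
}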
Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250923112404.668720-4-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 71f9931ac64cd4..2319f8f8fa3e98 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -215,6 +215,16 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } +static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem) +{ + if (btf_record_has_field(htab->map.record, BPF_TIMER)) + bpf_obj_free_timer(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); +} + static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; @@ -227,12 +237,7 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - if (btf_record_has_field(htab->map.record, BPF_TIMER)) - bpf_obj_free_timer(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); + htab_free_internal_structs(htab, elem); cond_resched(); } } @@ -1502,12 +1507,7 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) - bpf_obj_free_timer(htab->map.record, - htab_elem_value(l, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(htab->map.record, - htab_elem_value(l, htab->map.key_size)); + htab_free_internal_structs(htab, l); } cond_resched_rcu(); } From d2699bdb6ebad4fecd0f8765f35bd32a4a142b16 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:23:59 +0100 Subject: [PATCH 2689/3206] bpf: verifier: permit non-zero returns from async callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verifier currently enforces a zero return value for all async callbacks—a constraint originally introduced for bpf_timer. That restriction is too narrow for other async use cases. Relax the rule by allowing non-zero return codes from async callbacks in general, while preserving the zero-return requirement for bpf_timer to maintain its existing semantics. 
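A compilable toy of the range check being relaxed; retval_range mirrors the verifier's helper of the same name, everything else is illustrative:

/* toy_retval.c - build with: gcc -o toy_retval toy_retval.c */
#include <stdio.h>
#include <limits.h>

struct retval_range { int minval, maxval; };

static struct retval_range retval_range(int minval, int maxval)
{
        return (struct retval_range){ minval, maxval };
}

static int check_ret(struct retval_range r, int ret)
{
        return ret >= r.minval && ret <= r.maxval;
}

int main(void)
{
        /* bpf_timer callbacks keep the strict zero-only contract... */
        struct retval_range timer_cb = retval_range(0, 0);
        /* ...while other async callbacks may use the full s32 range. */
        struct retval_range task_work_cb = retval_range(INT_MIN, INT_MAX);

        printf("timer cb returning 1:     %s\n",
               check_ret(timer_cb, 1) ? "ok" : "rejected");
        printf("task_work cb returning 1: %s\n",
               check_ret(task_work_cb, 1) ? "ok" : "rejected");
        return 0;
}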
Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250923112404.668720-5-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c1b726fb22c8ee..02b93a54a4464a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10789,7 +10789,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; - callee->callback_ret_range = retval_range(0, 1); + callee->callback_ret_range = retval_range(0, 0); return 0; } @@ -17073,9 +17073,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char } if (frame->in_async_callback_fn) { - /* enforce return zero from async callbacks like timer */ exit_ctx = "At async callback return"; - range = retval_range(0, 0); + range = frame->callback_ret_range; goto enforce_retval; } From 5c8fd7e2b5b0a527cf88740da122166695382a78 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:24:00 +0100 Subject: [PATCH 2690/3206] bpf: bpf task work plumbing This patch adds necessary plumbing in verifier, syscall and maps to support handling new kfunc bpf_task_work_schedule and kernel structure bpf_task_work. The idea is similar to how we already handle bpf_wq and bpf_timer. verifier changes validate calls to bpf_task_work_schedule to make sure it is safe and expected invariants hold. btf part is required to detect bpf_task_work structure inside map value and store its offset, which will be used in the next patch to calculate key and value addresses. arraymap and hashtab changes are needed to handle freeing of the bpf_task_work: run code needed to deinitialize it, for example cancel task_work callback if possible. The use of bpf_task_work and proper implementation for kfuncs are introduced in the next patch. 
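A sketch of the intended BPF-side usage, based on the kfunc and callback signatures added in this patch; the map shape, attach point, and program logic are illustrative, and the kfuncs are still stubs at this point in the series:

// task_work_usage.bpf.c - illustrative sketch, not a selftest from this series
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

struct elem {
        struct bpf_task_work tw;        /* must be embedded in the map value */
        int counter;
};

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 128);
        __type(key, int);
        __type(value, struct elem);
} tw_map SEC(".maps");

/* Matches the series' bpf_task_work_callback_t signature. */
typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);

int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
                                  void *map__map, bpf_task_work_callback_t callback,
                                  void *aux__prog) __ksym;

static int tw_callback(struct bpf_map *map, void *key, void *value)
{
        struct elem *e = value;

        e->counter++;                   /* runs later, in the task's context */
        return 0;
}

SEC("tp_btf/sched_switch")              /* illustrative attach point */
int schedule_work(void *ctx)
{
        struct task_struct *task = bpf_get_current_task_btf();
        int key = 0;
        struct elem *e = bpf_map_lookup_elem(&tw_map, &key);

        if (e)
                bpf_task_work_schedule_signal(task, &e->tw, &tw_map,
                                              tw_callback, NULL);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";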
Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250923112404.668720-6-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++ include/uapi/linux/bpf.h | 4 ++ kernel/bpf/arraymap.c | 8 ++- kernel/bpf/btf.c | 7 ++ kernel/bpf/hashtab.c | 19 +++--- kernel/bpf/helpers.c | 40 +++++++++++ kernel/bpf/syscall.c | 16 ++++- kernel/bpf/verifier.c | 117 +++++++++++++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 4 ++ 9 files changed, 208 insertions(+), 18 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dfc1a27b56d550..a2ab51fa8b0ab9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,6 +209,7 @@ enum btf_field_type { BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), BPF_RES_SPIN_LOCK = (1 << 12), + BPF_TASK_WORK = (1 << 13), }; enum bpf_cgroup_storage_type { @@ -262,6 +263,7 @@ struct btf_record { int timer_off; int wq_off; int refcount_off; + int task_work_off; struct btf_field fields[]; }; @@ -363,6 +365,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; + case BPF_TASK_WORK: + return "bpf_task_work"; default: WARN_ON_ONCE(1); return "unknown"; @@ -401,6 +405,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); + case BPF_TASK_WORK: + return sizeof(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -433,6 +439,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); + case BPF_TASK_WORK: + return __alignof__(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -464,6 +472,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: + case BPF_TASK_WORK: break; default: WARN_ON_ONCE(1); @@ -600,6 +609,7 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_wq_cancel_and_free(void *timer); +void bpf_task_work_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, @@ -2426,6 +2436,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f3b173e48b0fa5..ae83d8649ef1cd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7436,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 26d5dda989bc1a..80b1765a315969 100644 --- a/kernel/bpf/arraymap.c +++ 
b/kernel/bpf/arraymap.c @@ -443,7 +443,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers_wq(struct bpf_map *map) +static void array_map_free_internal_structs(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -451,12 +451,14 @@ static void array_map_free_timers_wq(struct bpf_map *map) /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. */ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { for (i = 0; i < array->map.max_entries; i++) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); } } } @@ -795,7 +797,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers_wq, + .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c51e16bbf0c183..9f47a3aa7ff899 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3490,6 +3490,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, { BPF_TIMER, "bpf_timer", true }, { BPF_WORKQUEUE, "bpf_wq", true }, + { BPF_TASK_WORK, "bpf_task_work", true }, { BPF_LIST_HEAD, "bpf_list_head", false }, { BPF_LIST_NODE, "bpf_list_node", false }, { BPF_RB_ROOT, "bpf_rb_root", false }, @@ -3675,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf, case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: + case BPF_TASK_WORK: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? 
&info[0] : &tmp); if (ret < 0) @@ -3967,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; + rec->task_work_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { @@ -4006,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->wq_off = rec->fields[i].offset; break; + case BPF_TASK_WORK: + WARN_ON_ONCE(rec->task_work_off >= 0); + rec->task_work_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2319f8f8fa3e98..c2fcd0cd51e51b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -223,9 +223,12 @@ static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem * if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, htab_elem_value(elem, htab->map.key_size)); + if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) + bpf_obj_free_task_work(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); } -static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; @@ -1495,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } -static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_malloced_internal_structs(struct bpf_htab *htab) { int i; @@ -1514,16 +1517,16 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) rcu_read_unlock(); } -static void htab_map_free_timers_and_wq(struct bpf_map *map) +static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_and_wq(htab); + htab_free_malloced_internal_structs(htab); else - htab_free_prealloced_timers_and_wq(htab); + htab_free_prealloced_internal_structs(htab); } } @@ -2255,7 +2258,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2276,7 +2279,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 969f63f8ca28f7..7f5e528df13b6c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3906,8 
+3906,48 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, } #endif /* CONFIG_KEYS */ +typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); + +/** + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return 0; +} + +/** + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return 0; +} + __bpf_kfunc_end_defs(); +void bpf_task_work_cancel_and_free(void *val) +{ +} + BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a3c3d26f6e21b..adb05d235011ff 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -674,6 +674,7 @@ void btf_record_free(struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to release */ break; default: @@ -727,6 +728,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to acquire */ break; default: @@ -785,6 +787,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) bpf_wq_cancel_and_free(obj + rec->wq_off); } +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) + return; + bpf_task_work_cancel_and_free(obj + rec->task_work_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -809,6 +818,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_WORKQUEUE: bpf_wq_cancel_and_free(field_ptr); break; + case BPF_TASK_WORK: + bpf_task_work_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -1240,7 +1252,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | + BPF_TASK_WORK, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1272,6 +1285,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, break; 
case BPF_TIMER: case BPF_WORKQUEUE: + case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02b93a54a4464a..ceeb0ffe7d6743 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2224,10 +2224,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) - reg->map_uid = reg->id; - if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + if (btf_record_has_field(map->inner_map_meta->record, + BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { reg->map_uid = reg->id; + } } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -8461,6 +8461,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, case BPF_TIMER: field_off = map->record->timer_off; break; + case BPF_TASK_WORK: + field_off = map->record->task_work_off; + break; default: verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; @@ -8514,6 +8517,26 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_task_work_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_map_field_pointer(env, regno, BPF_TASK_WORK); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_task_work helper"); + return -EFAULT; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -10343,6 +10366,8 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); +static bool is_task_work_add_kfunc(u32 func_id); + static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); @@ -10561,7 +10586,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm)); + is_bpf_wq_set_callback_impl_kfunc(insn->imm) || + is_task_work_add_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -10876,6 +10902,36 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; + + /* + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + 
callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_async_callback_fn = true; + callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); + return 0; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id); /* Are we currently verifying the callback for a rbtree helper that must @@ -12000,6 +12056,7 @@ enum { KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, + KF_ARG_TASK_WORK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12010,6 +12067,7 @@ BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) +BTF_ID(struct, bpf_task_work) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12058,6 +12116,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); +} + static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); @@ -12145,6 +12208,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, + KF_ARG_PTR_TO_TASK_WORK, }; enum special_kfunc_type { @@ -12194,6 +12258,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, }; BTF_ID_LIST(special_kfunc_list) @@ -12262,6 +12328,14 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) + +static bool is_task_work_add_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; +} static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12352,6 +12426,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TASK_WORK; + if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; @@ -12695,7 +12772,8 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + is_task_work_add_kfunc(btf_id); } static bool is_bpf_throw_kfunc(struct bpf_insn *insn) @@ -13076,7 +13154,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } - if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || + reg->map_ptr->record->task_work_off >= 0)) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -13091,6 +13170,12 @@ static int check_kfunc_args(struct 
bpf_verifier_env *env, struct bpf_kfunc_call_ */ if (meta->map.ptr != reg->map_ptr || meta->map.uid != reg->map_uid) { + if (reg->map_ptr->record->task_work_off >= 0) { + verbose(env, + "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } verbose(env, "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", meta->map.uid, reg->map_uid); @@ -13129,6 +13214,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; @@ -13422,6 +13508,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_TASK_WORK: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_task_work_func(env, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); @@ -13788,6 +13883,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_task_work_add_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_task_work_schedule_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f3b173e48b0fa5..ae83d8649ef1cd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7436,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); From 5e8134f50d3041a9922a732f24d705a238b61aad Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:24:01 +0100 Subject: [PATCH 2691/3206] bpf: extract map key pointer calculation The calculation of a BPF map key, given a pointer to a value, is already duplicated in a couple of places in the helpers, and the next patch introduces yet another use case. Extract that functionality into a separate function.
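For illustration, the layout invariants behind this helper can be shown with a small standalone sketch. This is illustrative userspace C, not the kernel code: struct fake_array and the ROUND_UP macro are simplified stand-ins for the real bpf_array and kernel macro. Array map values live in one contiguous buffer, so the element index falls out of pointer arithmetic; hash/LRU map values are stored directly after their key, which is padded to 8 bytes.

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  #define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

  struct fake_array {
  	uint32_t elem_size;
  	char value[4 * 16];	/* 4 elements, 16 bytes each */
  };

  /* Array map: recover the element index from the pointer offset. */
  static void *array_key_from_value(struct fake_array *arr, void *value, uint32_t *idx)
  {
  	*idx = (uint32_t)(((char *)value - arr->value) / arr->elem_size);
  	return idx;
  }

  /* Hash map: the key sits immediately before the value, padded to 8 bytes. */
  static void *hash_key_from_value(void *value, uint32_t key_size)
  {
  	return (char *)value - ROUND_UP(key_size, 8);
  }

  int main(void)
  {
  	struct fake_array arr = { .elem_size = 16 };
  	_Alignas(8) char elem[8 + 16];	/* [key padded to 8][value] */
  	uint32_t idx, *key;

  	array_key_from_value(&arr, arr.value + 2 * 16, &idx);
  	printf("array index: %u\n", idx);	/* prints 2 */

  	memcpy(elem, &(uint32_t){7}, sizeof(uint32_t));
  	key = hash_key_from_value(elem + 8, sizeof(uint32_t));
  	printf("hash key: %u\n", *key);		/* prints 7 */
  	return 0;
  }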
Signed-off-by: Mykyta Yatsenko Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250923112404.668720-7-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 7f5e528df13b6c..c28f48a310e353 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1082,6 +1082,17 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) +{ + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + *arr_idx = ((char *)value - array->value) / array->elem_size; + return arr_idx; + } + return (void *)value - round_up(map->key_size, 8); +} + struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; @@ -1164,15 +1175,8 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) * bpf_map_delete_elem() on the same timer. */ this_cpu_write(hrtimer_running, t); - if (map->map_type == BPF_MAP_TYPE_ARRAY) { - struct bpf_array *array = container_of(map, struct bpf_array, map); - /* compute the key */ - idx = ((char *)value - array->value) / array->elem_size; - key = &idx; - } else { /* hash or lru */ - key = value - round_up(map->key_size, 8); - } + key = map_key_from_value(map, value, &idx); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); /* The verifier checked that return value is zero. */ @@ -1198,15 +1202,7 @@ static void bpf_wq_work(struct work_struct *work) if (!callback_fn) return; - if (map->map_type == BPF_MAP_TYPE_ARRAY) { - struct bpf_array *array = container_of(map, struct bpf_array, map); - - /* compute the key */ - idx = ((char *)value - array->value) / array->elem_size; - key = &idx; - } else { /* hash or lru */ - key = value - round_up(map->key_size, 8); - } + key = map_key_from_value(map, value, &idx); rcu_read_lock_trace(); migrate_disable(); From 38aa7003e369802f81a078f6673d10d97013f04f Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:24:02 +0100 Subject: [PATCH 2692/3206] bpf: task work scheduling kfuncs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implementation of the new bpf_task_work_schedule kfuncs, that let a BPF program schedule task_work callbacks for a target task: * bpf_task_work_schedule_signal() - schedules with TWA_SIGNAL * bpf_task_work_schedule_resume() - schedules with TWA_RESUME Each map value should embed a struct bpf_task_work, which the kernel side pairs with struct bpf_task_work_kern, containing a pointer to struct bpf_task_work_ctx, that maintains metadata relevant for the concrete callback scheduling. A small state machine and refcounting scheme ensures safe reuse and teardown. State transitions: _______________________________ | | v | [standby] ---> [pending] --> [scheduling] --> [scheduled] ^ |________________|_________ | | | v | [running] |_______________________________________________________| All states may transition into FREED state: [pending] [scheduling] [scheduled] [running] [standby] -> [freed] A FREED terminal state coordinates with map-value deletion (bpf_task_work_cancel_and_free()). Scheduling itself is deferred via irq_work to keep the kfunc callable from NMI context. Lifetime is guarded with refcount_t + RCU Tasks Trace. 
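To make the acquisition step concrete, here is a minimal userspace model of how a single compare-and-exchange on the state field arbitrates concurrent schedulers. C11 atomics stand in for the kernel's cmpxchg(); the type names and the -EBUSY-style return value are illustrative, not the kernel API.

  #include <stdatomic.h>
  #include <stdio.h>

  enum tw_state { TW_STANDBY, TW_PENDING, TW_SCHEDULING, TW_SCHEDULED, TW_RUNNING, TW_FREED };

  struct tw_ctx {
  	_Atomic enum tw_state state;
  };

  /* Only one scheduler can win the STANDBY -> PENDING transition; every
   * other concurrent caller observes a non-STANDBY state and backs off. */
  static int tw_try_acquire(struct tw_ctx *ctx)
  {
  	enum tw_state expected = TW_STANDBY;

  	return atomic_compare_exchange_strong(&ctx->state, &expected, TW_PENDING)
  		? 0 : -16 /* -EBUSY */;
  }

  int main(void)
  {
  	struct tw_ctx ctx = { .state = TW_STANDBY };

  	printf("first:  %d\n", tw_try_acquire(&ctx));	/* 0: won the race, owns scheduling */
  	printf("second: %d\n", tw_try_acquire(&ctx));	/* -16: context already pending */
  	return 0;
  }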
Main components: * struct bpf_task_work_ctx – Metadata and state management per task work. * enum bpf_task_work_state – A state machine to serialize work scheduling and execution. * bpf_task_work_schedule() – The central helper that initiates scheduling. * bpf_task_work_acquire_ctx() - Attempts to take ownership of the context pointed to by the passed struct bpf_task_work, allocating a new context if none exists yet. * bpf_task_work_callback() – Invoked when the actual task_work runs. * bpf_task_work_irq() – An intermediate step (runs in softirq context) to enqueue task work. * bpf_task_work_cancel_and_free() – Cleanup for deleted BPF map entries. Flow of successful task work scheduling: 1) bpf_task_work_schedule_* is called from BPF code. 2) Transition state from STANDBY to PENDING, mark context as owned by this task work scheduler. 3) irq_work_queue() schedules bpf_task_work_irq(). 4) Transition state from PENDING to SCHEDULING (noop if transition successful). 5) bpf_task_work_irq() attempts task_work_add(). If successful, state transitions to SCHEDULED. 6) Task work calls bpf_task_work_callback(), which transitions the state to RUNNING. 7) BPF callback is executed. 8) Context is cleaned up, refcounts released, context state set back to STANDBY. Signed-off-by: Mykyta Yatsenko Reviewed-by: Andrii Nakryiko Reviewed-by: Eduard Zingerman Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250923112404.668720-8-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 292 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 290 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c28f48a310e353..c9fab9a356dfc1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include "../../lib/kstrtox.h" @@ -3904,6 +3906,265 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); +enum bpf_task_work_state { + /* bpf_task_work is ready to be used */ + BPF_TW_STANDBY = 0, + /* irq work scheduling in progress */ + BPF_TW_PENDING, + /* task work scheduling in progress */ + BPF_TW_SCHEDULING, + /* task work is scheduled successfully */ + BPF_TW_SCHEDULED, + /* callback is running */ + BPF_TW_RUNNING, + /* associated BPF map value is deleted */ + BPF_TW_FREED, +}; + +struct bpf_task_work_ctx { + enum bpf_task_work_state state; + refcount_t refcnt; + struct callback_head work; + struct irq_work irq_work; + /* bpf_prog that schedules task work */ + struct bpf_prog *prog; + /* task for which callback is scheduled */ + struct task_struct *task; + /* the map and map value associated with this context */ + struct bpf_map *map; + void *map_val; + enum task_work_notify_mode mode; + bpf_task_work_callback_t callback_fn; + struct rcu_head rcu; +} __aligned(8); + +/* Actual type for struct bpf_task_work */ +struct bpf_task_work_kern { + struct bpf_task_work_ctx *ctx; +}; + +static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx) +{ + if (ctx->prog) { + bpf_prog_put(ctx->prog); + ctx->prog = NULL; + } + if (ctx->task) { + bpf_task_release(ctx->task); + ctx->task = NULL; + } +} + +static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) +{ + return refcount_inc_not_zero(&ctx->refcnt); +} + +static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) +{ + if (!refcount_dec_and_test(&ctx->refcnt)) + return; + + bpf_task_work_ctx_reset(ctx); + + /*
bpf_mem_free expects migration to be disabled */ + migrate_disable(); + bpf_mem_free(&bpf_global_ma, ctx); + migrate_enable(); +} + +static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) +{ + /* + * Scheduled task_work callback holds ctx ref, so if we successfully + * cancelled, we put that ref on callback's behalf. If we couldn't + * cancel, callback will inevitably run or has already completed + * running, and it would have taken care of its ctx ref itself. + */ + if (task_work_cancel(ctx->task, &ctx->work)) + bpf_task_work_ctx_put(ctx); +} + +static void bpf_task_work_callback(struct callback_head *cb) +{ + struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work); + enum bpf_task_work_state state; + u32 idx; + void *key; + + /* Read lock is needed to protect ctx and map key/value access */ + guard(rcu_tasks_trace)(); + /* + * This callback may start running before bpf_task_work_irq() switched to + * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING. + */ + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING); + if (state == BPF_TW_SCHEDULED) + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING); + if (state == BPF_TW_FREED) { + bpf_task_work_ctx_put(ctx); + return; + } + + key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx); + + migrate_disable(); + ctx->callback_fn(ctx->map, key, ctx->map_val); + migrate_enable(); + + bpf_task_work_ctx_reset(ctx); + (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY); + + bpf_task_work_ctx_put(ctx); +} + +static void bpf_task_work_irq(struct irq_work *irq_work) +{ + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); + enum bpf_task_work_state state; + int err; + + guard(rcu_tasks_trace)(); + + if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { + bpf_task_work_ctx_put(ctx); + return; + } + + err = task_work_add(ctx->task, &ctx->work, ctx->mode); + if (err) { + bpf_task_work_ctx_reset(ctx); + /* + * try to switch back to STANDBY for another task_work reuse, but we might have + * gone to FREED already, which is fine as we already cleaned up after ourselves + */ + (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY); + bpf_task_work_ctx_put(ctx); + return; + } + + /* + * It's technically possible for the just scheduled task_work callback to + * complete running by now, going SCHEDULING -> RUNNING and then + * dropping its ctx refcount. Instead of capturing an extra ref just to + * protect the below ctx->state access, we rely on RCU protection to + * perform the below SCHEDULING -> SCHEDULED attempt. + */ + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); + if (state == BPF_TW_FREED) + bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */ +} + +static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw, + struct bpf_map *map) +{ + struct bpf_task_work_kern *twk = (void *)tw; + struct bpf_task_work_ctx *ctx, *old_ctx; + + ctx = READ_ONCE(twk->ctx); + if (ctx) + return ctx; + + ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx)); + if (!ctx) + return ERR_PTR(-ENOMEM); + + memset(ctx, 0, sizeof(*ctx)); + refcount_set(&ctx->refcnt, 1); /* map's own ref */ + ctx->state = BPF_TW_STANDBY; + + old_ctx = cmpxchg(&twk->ctx, NULL, ctx); + if (old_ctx) { + /* + * tw->ctx is set by a concurrent BPF program, release allocated + * memory and try to reuse already set context.
+ */ + bpf_mem_free(&bpf_global_ma, ctx); + return old_ctx; + } + + return ctx; /* Success */ +} + +static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw, + struct bpf_map *map) +{ + struct bpf_task_work_ctx *ctx; + + ctx = bpf_task_work_fetch_ctx(tw, map); + if (IS_ERR(ctx)) + return ctx; + + /* try to get ref for task_work callback to hold */ + if (!bpf_task_work_ctx_tryget(ctx)) + return ERR_PTR(-EBUSY); + + if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { + /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ + bpf_task_work_ctx_put(ctx); + return ERR_PTR(-EBUSY); + } + + /* + * If no process or bpffs is holding a reference to the map, no new callbacks should be + * scheduled. This does not address any race or correctness issue, but rather is a policy + * choice: dropping user references should stop everything. + */ + if (!atomic64_read(&map->usercnt)) { + /* drop ref we just got for task_work callback itself */ + bpf_task_work_ctx_put(ctx); + /* transfer map's ref into cancel_and_free() */ + bpf_task_work_cancel_and_free(tw); + return ERR_PTR(-EBUSY); + } + + return ctx; +} + +static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw, + struct bpf_map *map, bpf_task_work_callback_t callback_fn, + struct bpf_prog_aux *aux, enum task_work_notify_mode mode) +{ + struct bpf_prog *prog; + struct bpf_task_work_ctx *ctx; + int err; + + BTF_TYPE_EMIT(struct bpf_task_work); + + prog = bpf_prog_inc_not_zero(aux->prog); + if (IS_ERR(prog)) + return -EBADF; + task = bpf_task_acquire(task); + if (!task) { + err = -EBADF; + goto release_prog; + } + + ctx = bpf_task_work_acquire_ctx(tw, map); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto release_all; + } + + ctx->task = task; + ctx->callback_fn = callback_fn; + ctx->prog = prog; + ctx->mode = mode; + ctx->map = map; + ctx->map_val = (void *)tw - map->record->task_work_off; + init_task_work(&ctx->work, bpf_task_work_callback); + init_irq_work(&ctx->irq_work, bpf_task_work_irq); + + irq_work_queue(&ctx->irq_work); + return 0; + +release_all: + bpf_task_release(task); +release_prog: + bpf_prog_put(prog); + return err; +} + /** * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode * @task: Task struct for which callback should be scheduled @@ -3918,7 +4179,7 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b void *map__map, bpf_task_work_callback_t callback, void *aux__prog) { - return 0; + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); } /** @@ -3935,13 +4196,38 @@ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct b void *map__map, bpf_task_work_callback_t callback, void *aux__prog) { - return 0; + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } __bpf_kfunc_end_defs(); +static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) +{ + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); + + bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */ + bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */ +} + void bpf_task_work_cancel_and_free(void *val) { + struct bpf_task_work_kern *twk = val; + struct bpf_task_work_ctx *ctx; + enum bpf_task_work_state state; + + ctx = xchg(&twk->ctx, NULL); + if (!ctx) + return; + + state = xchg(&ctx->state, BPF_TW_FREED); + 
if (state == BPF_TW_SCHEDULED) { + /* run in irq_work to avoid locks in NMI */ + init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled); + irq_work_queue(&ctx->irq_work); + return; + } + + bpf_task_work_ctx_put(ctx); /* put bpf map's ref */ } BTF_KFUNCS_START(generic_btf_ids) @@ -4086,6 +4372,8 @@ BTF_ID_FLAGS(func, bpf_strnstr); BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { From 39fd74dfd5d2ea9d1e94b2e03076284616798551 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:24:03 +0100 Subject: [PATCH 2693/3206] selftests/bpf: BPF task work scheduling tests Introduce selftests that check the BPF task work scheduling mechanism. Validate that the verifier does not accept incorrect calls to the bpf_task_work_schedule kfuncs. Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250923112404.668720-9-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/test_task_work.c | 150 ++++++++++++++++++ tools/testing/selftests/bpf/progs/task_work.c | 107 +++++++++++++ .../selftests/bpf/progs/task_work_fail.c | 96 +++++++++++ 3 files changed, 353 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_task_work.c create mode 100644 tools/testing/selftests/bpf/progs/task_work.c create mode 100644 tools/testing/selftests/bpf/progs/task_work_fail.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_work.c b/tools/testing/selftests/bpf/prog_tests/test_task_work.c new file mode 100644 index 00000000000000..666585270fbf99 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_task_work.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
*/ +#include +#include +#include +#include "task_work.skel.h" +#include "task_work_fail.skel.h" +#include +#include +#include +#include + +static int perf_event_open(__u32 type, __u64 config, int pid) +{ + struct perf_event_attr attr = { + .type = type, + .config = config, + .size = sizeof(struct perf_event_attr), + .sample_period = 100000, + }; + + return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0); +} + +struct elem { + char data[128]; + struct bpf_task_work tw; +}; + +static int verify_map(struct bpf_map *map, const char *expected_data) +{ + int err; + struct elem value; + int processed_values = 0; + int k, sz; + + sz = bpf_map__max_entries(map); + for (k = 0; k < sz; ++k) { + err = bpf_map__lookup_elem(map, &k, sizeof(int), &value, sizeof(struct elem), 0); + if (err) + continue; + if (!ASSERT_EQ(strcmp(expected_data, value.data), 0, "map data")) { + fprintf(stderr, "expected '%s', found '%s' in %s map", expected_data, + value.data, bpf_map__name(map)); + return 2; + } + processed_values++; + } + + return processed_values == 0; +} + +static void task_work_run(const char *prog_name, const char *map_name) +{ + struct task_work *skel; + struct bpf_program *prog; + struct bpf_map *map; + struct bpf_link *link; + int err, pe_fd = 0, pid, status, pipefd[2]; + char user_string[] = "hello world"; + + if (!ASSERT_NEQ(pipe(pipefd), -1, "pipe")) + return; + + pid = fork(); + if (pid == 0) { + __u64 num = 1; + int i; + char buf; + + close(pipefd[1]); + read(pipefd[0], &buf, sizeof(buf)); + close(pipefd[0]); + + for (i = 0; i < 10000; ++i) + num *= time(0) % 7; + (void)num; + exit(0); + } + ASSERT_GT(pid, 0, "fork() failed"); + + skel = task_work__open(); + if (!ASSERT_OK_PTR(skel, "task_work__open")) + return; + + bpf_object__for_each_program(prog, skel->obj) { + bpf_program__set_autoload(prog, false); + } + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "prog_name")) + goto cleanup; + bpf_program__set_autoload(prog, true); + skel->bss->user_ptr = (char *)user_string; + + err = task_work__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + pe_fd = perf_event_open(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, pid); + if (pe_fd == -1 && (errno == ENOENT || errno == EOPNOTSUPP)) { + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); + test__skip(); + goto cleanup; + } + if (!ASSERT_NEQ(pe_fd, -1, "pe_fd")) { + fprintf(stderr, "perf_event_open errno: %d, pid: %d\n", errno, pid); + goto cleanup; + } + + link = bpf_program__attach_perf_event(prog, pe_fd); + if (!ASSERT_OK_PTR(link, "attach_perf_event")) + goto cleanup; + + close(pipefd[0]); + write(pipefd[1], user_string, 1); + close(pipefd[1]); + /* Wait to collect some samples */ + waitpid(pid, &status, 0); + pid = 0; + map = bpf_object__find_map_by_name(skel->obj, map_name); + if (!ASSERT_OK_PTR(map, "find map_name")) + goto cleanup; + if (!ASSERT_OK(verify_map(map, user_string), "verify map")) + goto cleanup; +cleanup: + if (pe_fd >= 0) + close(pe_fd); + task_work__destroy(skel); + if (pid) { + close(pipefd[0]); + write(pipefd[1], user_string, 1); + close(pipefd[1]); + waitpid(pid, &status, 0); + } +} + +void test_task_work(void) +{ + if (test__start_subtest("test_task_work_hash_map")) + task_work_run("oncpu_hash_map", "hmap"); + + if (test__start_subtest("test_task_work_array_map")) + task_work_run("oncpu_array_map", "arrmap"); + + if (test__start_subtest("test_task_work_lru_map")) + task_work_run("oncpu_lru_map", "lrumap"); + + RUN_TESTS(task_work_fail); +} diff --git 
a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c new file mode 100644 index 00000000000000..23217f06a3ece6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include +#include "bpf_misc.h" +#include "errno.h" + +char _license[] SEC("license") = "GPL"; + +const void *user_ptr = NULL; + +struct elem { + char data[128]; + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} arrmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} lrumap SEC(".maps"); + +static int process_work(struct bpf_map *map, void *key, void *value) +{ + struct elem *work = value; + + bpf_copy_from_user_str(work->data, sizeof(work->data), (const void *)user_ptr, 0); + return 0; +} + +int key = 0; + +SEC("perf_event") +int oncpu_hash_map(struct pt_regs *args) +{ + struct elem empty_work = { .data = { 0 } }; + struct elem *work; + struct task_struct *task; + int err; + + task = bpf_get_current_task_btf(); + err = bpf_map_update_elem(&hmap, &key, &empty_work, BPF_NOEXIST); + if (err) + return 0; + work = bpf_map_lookup_elem(&hmap, &key); + if (!work) + return 0; + + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +int oncpu_array_map(struct pt_regs *args) +{ + struct elem *work; + struct task_struct *task; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) + return 0; + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +int oncpu_lru_map(struct pt_regs *args) +{ + struct elem empty_work = { .data = { 0 } }; + struct elem *work; + struct task_struct *task; + int err; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&lrumap, &key); + if (work) + return 0; + err = bpf_map_update_elem(&lrumap, &key, &empty_work, BPF_NOEXIST); + if (err) + return 0; + work = bpf_map_lookup_elem(&lrumap, &key); + if (!work || work->data[0]) + return 0; + bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c new file mode 100644 index 00000000000000..77fe8f28facdb6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +const void *user_ptr = NULL; + +struct elem { + char data[128]; + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} arrmap SEC(".maps"); + +static int process_work(struct bpf_map *map, void *key, void *value) +{ + struct elem *work = value; + + bpf_copy_from_user_str(work->data, sizeof(work->data), (const void *)user_ptr, 0); + return 0; +} + +int key = 0; + +SEC("perf_event") +__failure __msg("doesn't match map pointer in R3") +int mismatch_map(struct pt_regs *args) +{ + struct elem *work; + struct task_struct *task; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) + return 0; + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +__failure __msg("arg#1 doesn't point to a map value") +int no_map_task_work(struct pt_regs *args) +{ + struct task_struct *task; + struct bpf_task_work tw; + + task = bpf_get_current_task_btf(); + bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int task_work_null(struct pt_regs *args) +{ + struct task_struct *task; + + task = bpf_get_current_task_btf(); + bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +__failure __msg("Possibly NULL pointer passed to trusted arg2") +int map_null(struct pt_regs *args) +{ + struct elem *work; + struct task_struct *task; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) + return 0; + bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL); + return 0; +} From c6ae18e0af5e7c809ac26350043924e062dbb76f Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:24:04 +0100 Subject: [PATCH 2694/3206] selftests/bpf: add bpf task work stress tests Add stress tests for BPF task-work scheduling kfuncs. The tests spawn multiple threads that concurrently schedule task_work callbacks against the same and different map values to exercise the kfuncs under high contention. Verify callbacks are reliably enqueued and executed with no drops. Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20250923112404.668720-10-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/task_work_stress.c | 130 ++++++++++++++++++ .../selftests/bpf/progs/task_work_stress.c | 73 ++++++++++ 2 files changed, 203 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/task_work_stress.c create mode 100644 tools/testing/selftests/bpf/progs/task_work_stress.c diff --git a/tools/testing/selftests/bpf/prog_tests/task_work_stress.c b/tools/testing/selftests/bpf/prog_tests/task_work_stress.c new file mode 100644 index 00000000000000..450d17d91a56fb --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_work_stress.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include +#include +#include "task_work_stress.skel.h" +#include +#include +#include +#include +#include +#include + +struct test_data { + int prog_fd; + atomic_int exit; +}; + +void *runner(void *test_data) +{ + struct test_data *td = test_data; + int err = 0; + LIBBPF_OPTS(bpf_test_run_opts, opts); + + while (!err && !atomic_load(&td->exit)) + err = bpf_prog_test_run_opts(td->prog_fd, &opts); + + return NULL; +} + +static int get_env_int(const char *str, int def) +{ + const char *s = getenv(str); + char *end; + int retval; + + if (!s || !*s) + return def; + errno = 0; + retval = strtol(s, &end, 10); + if (errno || *end || retval < 0) + return def; + return retval; +} + +static void task_work_run(bool enable_delete) +{ + struct task_work_stress *skel; + struct bpf_program *scheduler, *deleter; + int nthreads = 16; + int test_time_s = get_env_int("BPF_TASK_WORK_TEST_TIME", 1); + pthread_t tid[nthreads], tid_del; + bool started[nthreads], started_del = false; + struct test_data td_sched = { .exit = 0 }, td_del = { .exit = 1 }; + int i, err; + + skel = task_work_stress__open(); + if (!ASSERT_OK_PTR(skel, "task_work__open")) + return; + + scheduler = bpf_object__find_program_by_name(skel->obj, "schedule_task_work"); + bpf_program__set_autoload(scheduler, true); + + deleter = bpf_object__find_program_by_name(skel->obj, "delete_task_work"); + bpf_program__set_autoload(deleter, true); + + err = task_work_stress__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + for (i = 0; i < nthreads; ++i) + started[i] = false; + + td_sched.prog_fd = bpf_program__fd(scheduler); + for (i = 0; i < nthreads; ++i) { + if (pthread_create(&tid[i], NULL, runner, &td_sched) != 0) { + fprintf(stderr, "could not start thread"); + goto cancel; + } + started[i] = true; + } + + if (enable_delete) + atomic_store(&td_del.exit, 0); + + td_del.prog_fd = bpf_program__fd(deleter); + if (pthread_create(&tid_del, NULL, runner, &td_del) != 0) { + fprintf(stderr, "could not start thread"); + goto cancel; + } + started_del = true; + + /* Run stress test for some time */ + sleep(test_time_s); + +cancel: + atomic_store(&td_sched.exit, 1); + atomic_store(&td_del.exit, 1); + for (i = 0; i < nthreads; ++i) { + if (started[i]) + pthread_join(tid[i], NULL); + } + + if (started_del) + pthread_join(tid_del, NULL); + + ASSERT_GT(skel->bss->callback_scheduled, 0, "work scheduled"); + /* Some scheduling attempts should have failed due to contention */ + ASSERT_GT(skel->bss->schedule_error, 0, "schedule error"); + + if (enable_delete) { + /* If delete thread is enabled, it has cancelled some callbacks */ + ASSERT_GT(skel->bss->delete_success, 0, "delete success"); + ASSERT_LT(skel->bss->callback_success, skel->bss->callback_scheduled, "callbacks"); + } else { + /* Without delete thread number of scheduled callbacks is the same as fired */ + ASSERT_EQ(skel->bss->callback_success, skel->bss->callback_scheduled, "callbacks"); + } + +cleanup: + task_work_stress__destroy(skel); +} + +void test_task_work_stress(void) +{ + if (test__start_subtest("no_delete")) + task_work_run(false); + if (test__start_subtest("with_delete")) + task_work_run(true); +} diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c new file mode 100644 index 00000000000000..90fca06fff56ca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include +#include +#include +#include "bpf_misc.h" + +#define ENTRIES 128 + +char _license[] SEC("license") = "GPL"; + +__u64 callback_scheduled = 0; +__u64 callback_success = 0; +__u64 schedule_error = 0; +__u64 delete_success = 0; + +struct elem { + __u32 count; + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, ENTRIES); + __type(key, int); + __type(value, struct elem); +} hmap SEC(".maps"); + +static int process_work(struct bpf_map *map, void *key, void *value) +{ + __sync_fetch_and_add(&callback_success, 1); + return 0; +} + +SEC("syscall") +int schedule_task_work(void *ctx) +{ + struct elem empty_work = {.count = 0}; + struct elem *work; + int key = 0, err; + + key = bpf_ktime_get_ns() % ENTRIES; + work = bpf_map_lookup_elem(&hmap, &key); + if (!work) { + bpf_map_update_elem(&hmap, &key, &empty_work, BPF_NOEXIST); + work = bpf_map_lookup_elem(&hmap, &key); + if (!work) + return 0; + } + err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work, NULL); + if (err) + __sync_fetch_and_add(&schedule_error, 1); + else + __sync_fetch_and_add(&callback_scheduled, 1); + return 0; +} + +SEC("syscall") +int delete_task_work(void *ctx) +{ + int key = 0, err; + + key = bpf_get_prandom_u32() % ENTRIES; + err = bpf_map_delete_elem(&hmap, &key); + if (!err) + __sync_fetch_and_add(&delete_success, 1); + return 0; +} From 7aada81cd75ad844c84fb1dcdce2d67ec41763f8 Mon Sep 17 00:00:00 2001 From: Ivaylo Ivanov Date: Sun, 14 Sep 2025 16:18:48 +0300 Subject: [PATCH 2695/3206] dt-bindings: mmc: samsung,exynos-dw-mshc: add specific compatible for exynos8890 Add samsung,exynos8890-dw-mshc-smu specific compatible to the bindings documentation. Since Samsung, as usual, likes reusing devices from older designs, use the samsung,exynos7-dw-mshc-smu compatible. 
Signed-off-by: Ivaylo Ivanov Acked-by: Rob Herring (Arm) Signed-off-by: Ulf Hansson --- .../devicetree/bindings/mmc/samsung,exynos-dw-mshc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/mmc/samsung,exynos-dw-mshc.yaml b/Documentation/devicetree/bindings/mmc/samsung,exynos-dw-mshc.yaml index e8bd49d46794ee..27c4060f2f9174 100644 --- a/Documentation/devicetree/bindings/mmc/samsung,exynos-dw-mshc.yaml +++ b/Documentation/devicetree/bindings/mmc/samsung,exynos-dw-mshc.yaml @@ -31,6 +31,7 @@ properties: - samsung,exynos5433-dw-mshc-smu - samsung,exynos7885-dw-mshc-smu - samsung,exynos850-dw-mshc-smu + - samsung,exynos8890-dw-mshc-smu - samsung,exynos8895-dw-mshc-smu - const: samsung,exynos7-dw-mshc-smu From a2501032de0d1bc7971b2e43c03da534ac10ee9b Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Tue, 16 Sep 2025 14:39:48 +0800 Subject: [PATCH 2696/3206] tracing/osnoise: Fix slab-out-of-bounds in _parse_integer_limit() When configuring osnoise cpus via the write() syscall, the following KASAN splat may be observed: BUG: KASAN: slab-out-of-bounds in _parse_integer_limit+0x103/0x130 Read of size 1 at addr ffff88810121e3a1 by task test/447 CPU: 1 UID: 0 PID: 447 Comm: test Not tainted 6.17.0-rc6-dirty #288 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: dump_stack_lvl+0x55/0x70 print_report+0xcb/0x610 kasan_report+0xb8/0xf0 _parse_integer_limit+0x103/0x130 bitmap_parselist+0x16d/0x6f0 osnoise_cpus_write+0x116/0x2d0 vfs_write+0x21e/0xcc0 ksys_write+0xee/0x1c0 do_syscall_64+0xa8/0x2a0 entry_SYSCALL_64_after_hwframe+0x77/0x7f This issue can be reproduced by the code below: const char *cpulist = "1"; int fd=open("/sys/kernel/debug/tracing/osnoise/cpus", O_WRONLY); write(fd, cpulist, strlen(cpulist)); The function bitmap_parselist() is called to parse the cpulist; it requires that the parameter 'buf' be terminated with a '\0' or '\n'. Fix this issue by adding a '\0' to 'buf' in osnoise_cpus_write(). Cc: Cc: Cc: Link: https://lore.kernel.org/20250916063948.3154627-1-wangliang74@huawei.com Fixes: 17f89102fe23 ("tracing/osnoise: Allow arbitrarily long CPU string") Signed-off-by: Wang Liang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 337bc0eb5d71bf..dc734867f0fc44 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2325,12 +2325,13 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, if (count < 1) return 0; - buf = kmalloc(count, GFP_KERNEL); + buf = kmalloc(count + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_user(buf, ubuf, count)) return -EFAULT; + buf[count] = '\0'; if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) return -ENOMEM; From 1da3f145ede481607a93472d2a1f597d6e998eb4 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 19 Sep 2025 10:15:56 +0900 Subject: [PATCH 2697/3206] tracing: dynevent: Add a missing lockdown check on dynevent Since the dynamic_events interface on tracefs is compatible with kprobe_events and uprobe_events, it should also check the lockdown status and reject access if it is set.
Cc: Mathieu Desnoyers Link: https://lore.kernel.org/175824455687.45175.3734166065458520748.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_dynevent.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 5d64a18cacacc6..d06854bd32b357 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -230,6 +230,10 @@ static int dyn_event_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = tracing_check_open_get_tr(NULL); if (ret) return ret; From 340de1f673ceb0ab46470cb19b7c773e3359a3e5 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 23 Sep 2025 11:16:34 +0200 Subject: [PATCH 2698/3206] sched_ext: Verify RCU protection in scx_bpf_cpu_curr() scx_bpf_cpu_curr() has been introduced to retrieve the current task of a given runqueue, allowing schedulers to interact with that task. The kfunc assumes that it is always called in an RCU context, but this is not always guaranteed and some BPF schedulers can trigger the following warning: WARNING: suspicious RCU usage sched_ext: BPF scheduler "cosmos_1.0.2_gd0e71ca_x86_64_unknown_linux_gnu_debug" enabled 6.17.0-rc1 #1-NixOS Not tainted ----------------------------- kernel/sched/ext.c:6415 suspicious rcu_dereference_check() usage! ... Call Trace: dump_stack_lvl+0x6f/0xb0 lockdep_rcu_suspicious.cold+0x4e/0x96 scx_bpf_cpu_curr+0x7e/0x80 bpf_prog_c68b2b6b6b1b0ff8_sched_timerfn+0xce/0x1dc bpf_timer_cb+0x7b/0x130 __hrtimer_run_queues+0x1ea/0x380 hrtimer_run_softirq+0x8c/0xd0 handle_softirqs+0xc9/0x3b0 __irq_exit_rcu+0x96/0xc0 irq_exit_rcu+0xe/0x20 sysvec_apic_timer_interrupt+0x73/0x80 To address this, mark the kfunc with KF_RCU_PROTECTED, so the verifier can enforce its usage only inside RCU-protected sections. Note: this also requires commit 1512231b6cc86 ("bpf: Enforce RCU protection for KF_RCU_PROTECTED"), currently in bpf-next, to enforce the proper KF_RCU_PROTECTED. Fixes: 20b158094a1ad ("sched_ext: Introduce scx_bpf_cpu_curr()") Cc: Christian Loehle Cc: Kumar Kartikeya Dwivedi Signed-off-by: Andrea Righi Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f5873f8ed669a3..f7e17dc0422e26 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6570,7 +6570,7 @@ BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) -BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) #ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif From 9a1aa642c1fb5a3776447182aaf37dfaafb36134 Mon Sep 17 00:00:00 2001 From: Zihuan Zhang Date: Tue, 23 Sep 2025 15:55:53 +0800 Subject: [PATCH 2699/3206] cpufreq: Replace pointer subtraction with iteration macro The cpufreq documentation suggests avoiding direct pointer subtraction when working with entries in driver_freq_table, as it is relatively costly. Instead, the recommended approach is to use the provided iteration macros, like cpufreq_for_each_valid_entry_idx(). 
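To make the recommendation concrete, here is a self-contained userspace sketch of such an iteration macro. The struct, the ~0u/~1u markers, and the macro body mirror the general shape of the kernel definitions in include/linux/cpufreq.h, but everything here is a simplified stand-in, not the kernel headers themselves.

  #include <stdio.h>

  #define CPUFREQ_ENTRY_INVALID	~0u
  #define CPUFREQ_TABLE_END	~1u

  struct cpufreq_frequency_table {
  	unsigned int frequency;	/* kHz, or one of the markers above */
  };

  /* Iterate while tracking the index explicitly, instead of deriving it
   * later with the comparatively costly (pos - table) pointer subtraction. */
  #define cpufreq_for_each_valid_entry_idx(pos, table, idx)		\
  	for (pos = table, idx = 0; pos->frequency != CPUFREQ_TABLE_END;	\
  	     pos++, idx++)						\
  		if (pos->frequency == CPUFREQ_ENTRY_INVALID)		\
  			continue;					\
  		else

  int main(void)
  {
  	struct cpufreq_frequency_table table[] = {
  		{ 400000 }, { CPUFREQ_ENTRY_INVALID }, { 800000 }, { CPUFREQ_TABLE_END },
  	};
  	struct cpufreq_frequency_table *pos;
  	unsigned int i;

  	cpufreq_for_each_valid_entry_idx(pos, table, i)
  		printf("table entry %u: %u kHz\n", i, pos->frequency);
  	return 0;
  }

Because the index advances in the loop header together with the cursor, it always matches the entry being visited, so callers never need to recompute it from pointers.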
Use cpufreq_for_each_valid_entry_idx() instead of pointer subtraction in cpufreq_frequency_table_cpuinfo(), which improves code clarity and follows the established cpufreq coding style. While at it, remove redundant local variable initialization from cpufreq_table_index_unsorted(). No functional change intended. Signed-off-by: Zihuan Zhang Link: https://patch.msgid.link/20250923075553.45532-1-zhangzihuan@kylinos.cn [ rjw: Subject tweak, changelog edits, local variable definition tweak ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/freq_table.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index d5111ee56e3803..7f251daf03ce32 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -33,16 +33,16 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy) struct cpufreq_frequency_table *pos, *table = policy->freq_table; unsigned int min_freq = ~0; unsigned int max_freq = 0; - unsigned int freq; + unsigned int freq, i; - cpufreq_for_each_valid_entry(pos, table) { + cpufreq_for_each_valid_entry_idx(pos, table, i) { freq = pos->frequency; if ((!cpufreq_boost_enabled() || !policy->boost_enabled) && (pos->flags & CPUFREQ_BOOST_FREQ)) continue; - pr_debug("table entry %u: %u kHz\n", (int)(pos - table), freq); + pr_debug("table entry %u: %u kHz\n", i, freq); if (freq < min_freq) min_freq = freq; if (freq > max_freq) @@ -126,7 +126,7 @@ int cpufreq_table_index_unsorted(struct cpufreq_policy *policy, }; struct cpufreq_frequency_table *pos; struct cpufreq_frequency_table *table = policy->freq_table; - unsigned int freq, diff, i = 0; + unsigned int freq, diff, i; int index; pr_debug("request for target %u kHz (relation: %u) for cpu %u\n", From 44b0fed0a5947f54fd14255cd0766df952267bc5 Mon Sep 17 00:00:00 2001 From: Matthew Schwartz Date: Thu, 11 Sep 2025 10:48:51 -0700 Subject: [PATCH 2700/3206] drm/amd/display: Only restore backlight after amdgpu_dm_init or dm_resume On clients that utilize AMD_PRIVATE_COLOR properties for HDR support, brightness sliders can include a hardware-controlled portion and a gamma-based portion. This is the case on the Steam Deck OLED when using gamescope with Steam as a client. When a user sets a brightness level while HDR is active, the gamma-based portion and/or hardware portion are adjusted to achieve the desired brightness. However, when a modeset takes place while the gamma-based portion is in-use, restoring the hardware brightness level overrides the user's overall brightness level and results in a mismatch between what the slider reports and the display's current brightness. To avoid overriding gamma-based brightness, only restore the HW backlight level after boot or resume. This ensures that the backlight level is set correctly after the DC layer resets it while avoiding interference with subsequent modesets.
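A toy model of this one-shot restore pattern, in plain userspace C (the names are illustrative, not the amdgpu_dm API): the flag is armed only at init/resume and consumed by the first commit, so later modesets leave a user-set, gamma-based level untouched.

  #include <stdbool.h>
  #include <stdio.h>

  struct dm_state {
  	bool restore_backlight;
  	int brightness;		/* last HW level cached by the driver */
  };

  static void hw_init_or_resume(struct dm_state *dm)
  {
  	dm->restore_backlight = true;	/* DC reset the backlight; restore it once */
  }

  static void commit_streams(struct dm_state *dm, bool mode_set)
  {
  	(void)mode_set;			/* a modeset alone no longer triggers a restore */
  	if (dm->restore_backlight) {
  		printf("restoring backlight to %d\n", dm->brightness);
  		dm->restore_backlight = false;
  	}
  }

  int main(void)
  {
  	struct dm_state dm = { .brightness = 180 };

  	hw_init_or_resume(&dm);
  	commit_streams(&dm, true);	/* restores once after boot/resume */
  	commit_streams(&dm, true);	/* subsequent modeset: no override */
  	return 0;
  }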
Fixes: 7875afafba84 ("drm/amd/display: Fix brightness level not retained over reboot") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4551 Signed-off-by: Matthew Schwartz Reviewed-by: Mario Limonciello Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit a490c8d77d500b5981e739be3d59c60cfe382536) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 ++++++++---- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 7 +++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 97d9eba1796357..ef026143dc1ca9 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2037,6 +2037,8 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) dc_hardware_init(adev->dm.dc); + adev->dm.restore_backlight = true; + adev->dm.hpd_rx_offload_wq = hpd_rx_irq_create_workqueue(adev); if (!adev->dm.hpd_rx_offload_wq) { drm_err(adev_to_drm(adev), "failed to create hpd rx offload workqueue.\n"); @@ -3399,6 +3401,7 @@ static int dm_resume(struct amdgpu_ip_block *ip_block) dc_set_power_state(dm->dc, DC_ACPI_CM_POWER_STATE_D0); dc_resume(dm->dc); + adev->dm.restore_backlight = true; amdgpu_dm_irq_resume_early(adev); @@ -9829,7 +9832,6 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state, bool mode_set_reset_required = false; u32 i; struct dc_commit_streams_params params = {dc_state->streams, dc_state->stream_count}; - bool set_backlight_level = false; /* Disable writeback */ for_each_old_connector_in_state(state, connector, old_con_state, i) { @@ -9949,7 +9951,6 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state, acrtc->hw_mode = new_crtc_state->mode; crtc->hwmode = new_crtc_state->mode; mode_set_reset_required = true; - set_backlight_level = true; } else if (modereset_required(new_crtc_state)) { drm_dbg_atomic(dev, "Atomic commit: RESET. crtc id %d:[%p]\n", @@ -10006,13 +10007,16 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state, * to fix a flicker issue. * It will cause the dm->actual_brightness is not the current panel brightness * level. (the dm->brightness is the correct panel level) - * So we set the backlight level with dm->brightness value after set mode + * So we set the backlight level with dm->brightness value after initial + * set mode. Use restore_backlight flag to avoid setting backlight level + * for every subsequent mode set. */ - if (set_backlight_level) { + if (dm->restore_backlight) { for (i = 0; i < dm->num_of_edps; i++) { if (dm->backlight_dev[i]) amdgpu_dm_backlight_set_level(dm, i, dm->brightness[i]); } + dm->restore_backlight = false; } } diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index b937da0a4e4a00..6aae51c1beb363 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -610,6 +610,13 @@ struct amdgpu_display_manager { */ u32 actual_brightness[AMDGPU_DM_MAX_NUM_EDP]; + /** + * @restore_backlight: + * + * Flag to indicate whether to restore backlight after modeset. 
+ */ + bool restore_backlight; + /** * @aux_hpd_discon_quirk: * From 1c3217dd557d4f04eb4c2afcd7c76de6b4d66ccb Mon Sep 17 00:00:00 2001 From: Alvin Lee Date: Tue, 9 Sep 2025 16:03:08 -0400 Subject: [PATCH 2701/3206] drm/amd/display: Use mpc.preblend flag to indicate preblend [Description] Modifications to the per-ASIC capabilities mean that the mpc.preblend flag should be used to indicate preblend. Update the relevant paths to use this flag. Fixes: 39923050615c ("drm/amd/display: Clear DPP 3DLUT Cap") Reviewed-by: Dillon Varone Signed-off-by: Alvin Lee Signed-off-by: Ivan Lipski Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 9e5d4a5e27c6dc4e1b4fc9d654d13de12b8ce156) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c | 2 +- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c index ebabfe3a512f49..c0dfe2d8b3becd 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c @@ -821,7 +821,7 @@ int amdgpu_dm_verify_lut3d_size(struct amdgpu_device *adev, struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state); const struct drm_color_lut *shaper = NULL, *lut3d = NULL; uint32_t exp_size, size, dim_size = MAX_COLOR_3DLUT_SIZE; - bool has_3dlut = adev->dm.dc->caps.color.dpp.hw_3d_lut; + bool has_3dlut = adev->dm.dc->caps.color.dpp.hw_3d_lut || adev->dm.dc->caps.color.mpc.preblend; /* shaper LUT is only available if 3D LUT color caps */ exp_size = has_3dlut ? MAX_COLOR_LUT_ENTRIES : 0; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c index eef51652ca3560..3d2f8eedeef23b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c @@ -1633,7 +1633,7 @@ dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm, drm_object_attach_property(&plane->base, dm->adev->mode_info.plane_ctm_property, 0); - if (dpp_color_caps.hw_3d_lut) { + if (dpp_color_caps.hw_3d_lut || dm->dc->caps.color.mpc.preblend) { drm_object_attach_property(&plane->base, mode_info.plane_shaper_lut_property, 0); drm_object_attach_property(&plane->base, From 361ee85e980c16c9b9e236ccfac33014e8602485 Mon Sep 17 00:00:00 2001 From: Leo Li Date: Fri, 12 Sep 2025 11:01:50 -0400 Subject: [PATCH 2702/3206] drm/amd/display: Init DCN35 clocks from pre-os HW values [Why] We did not initialize dc clocks with boot-time hw values during init. This led to incorrect clock values in dc, causing `dcn35_update_clocks` to make incorrect updates. [How] Correctly initialize DC with pre-os clk values from HW. s/dump/save/ as that accurately reflects the purpose of the functions.
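A small sketch of the arithmetic involved, with a stubbed register read standing in for REG_READ(): the /10 conversion, the *10 back to kHz, and the 59000 threshold mirror the diff below, while the raw counter value is invented purely for illustration.

  #include <stdbool.h>
  #include <stdio.h>

  /* Stand-in for REG_READ(CLK1_CLK4_CURRENT_CNT); the value is made up. */
  static unsigned int reg_read_dtbclk_current_cnt(void)
  {
  	return 600000;	/* pretend pre-OS firmware left dtbclk at 600 MHz */
  }

  int main(void)
  {
  	unsigned int dtbclk_snapshot = reg_read_dtbclk_current_cnt() / 10;
  	unsigned int ref_dtbclk_khz = dtbclk_snapshot * 10;
  	bool dtbclk_en = dtbclk_snapshot > 59000;	/* firmware left dtbclk running */

  	printf("ref_dtbclk_khz=%u dtbclk_en=%d\n", ref_dtbclk_khz, dtbclk_en);
  	return 0;
  }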
Fixes: 8774029f76b9 ("drm/amd/display: Add DCN35 CLK_MGR") Reviewed-by: Aurabindo Pillai Signed-off-by: Leo Li Signed-off-by: Fangzhi Zuo Signed-off-by: Ivan Lipski Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit d43cc4ea1f9d720ab4bf06806f79260bfe981508) --- .../display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 121 +++++++++++++++++- 1 file changed, 119 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index bb1ac12a2b0955..0e638bc6bf77bf 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -587,9 +587,118 @@ bool dcn35_are_clock_states_equal(struct dc_clocks *a, return true; } -static void dcn35_dump_clk_registers(struct clk_state_registers_and_bypass *regs_and_bypass, +static void dcn35_save_clk_registers_internal(struct dcn35_clk_internal *internal, struct clk_mgr *clk_mgr_base) +{ + struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base); + + // read dtbclk + internal->CLK1_CLK4_CURRENT_CNT = REG_READ(CLK1_CLK4_CURRENT_CNT); + internal->CLK1_CLK4_BYPASS_CNTL = REG_READ(CLK1_CLK4_BYPASS_CNTL); + + // read dcfclk + internal->CLK1_CLK3_CURRENT_CNT = REG_READ(CLK1_CLK3_CURRENT_CNT); + internal->CLK1_CLK3_BYPASS_CNTL = REG_READ(CLK1_CLK3_BYPASS_CNTL); + + // read dcf deep sleep divider + internal->CLK1_CLK3_DS_CNTL = REG_READ(CLK1_CLK3_DS_CNTL); + internal->CLK1_CLK3_ALLOW_DS = REG_READ(CLK1_CLK3_ALLOW_DS); + + // read dppclk + internal->CLK1_CLK1_CURRENT_CNT = REG_READ(CLK1_CLK1_CURRENT_CNT); + internal->CLK1_CLK1_BYPASS_CNTL = REG_READ(CLK1_CLK1_BYPASS_CNTL); + + // read dprefclk + internal->CLK1_CLK2_CURRENT_CNT = REG_READ(CLK1_CLK2_CURRENT_CNT); + internal->CLK1_CLK2_BYPASS_CNTL = REG_READ(CLK1_CLK2_BYPASS_CNTL); + + // read dispclk + internal->CLK1_CLK0_CURRENT_CNT = REG_READ(CLK1_CLK0_CURRENT_CNT); + internal->CLK1_CLK0_BYPASS_CNTL = REG_READ(CLK1_CLK0_BYPASS_CNTL); +} + +static void dcn35_save_clk_registers(struct clk_state_registers_and_bypass *regs_and_bypass, struct clk_mgr_dcn35 *clk_mgr) { + struct dcn35_clk_internal internal = {0}; + char *bypass_clks[5] = {"0x0 DFS", "0x1 REFCLK", "0x2 ERROR", "0x3 400 FCH", "0x4 600 FCH"}; + + dcn35_save_clk_registers_internal(&internal, &clk_mgr->base.base); + + regs_and_bypass->dcfclk = internal.CLK1_CLK3_CURRENT_CNT / 10; + regs_and_bypass->dcf_deep_sleep_divider = internal.CLK1_CLK3_DS_CNTL / 10; + regs_and_bypass->dcf_deep_sleep_allow = internal.CLK1_CLK3_ALLOW_DS; + regs_and_bypass->dprefclk = internal.CLK1_CLK2_CURRENT_CNT / 10; + regs_and_bypass->dispclk = internal.CLK1_CLK0_CURRENT_CNT / 10; + regs_and_bypass->dppclk = internal.CLK1_CLK1_CURRENT_CNT / 10; + regs_and_bypass->dtbclk = internal.CLK1_CLK4_CURRENT_CNT / 10; + + regs_and_bypass->dppclk_bypass = internal.CLK1_CLK1_BYPASS_CNTL & 0x0007; + if (regs_and_bypass->dppclk_bypass < 0 || regs_and_bypass->dppclk_bypass > 4) + regs_and_bypass->dppclk_bypass = 0; + regs_and_bypass->dcfclk_bypass = internal.CLK1_CLK3_BYPASS_CNTL & 0x0007; + if (regs_and_bypass->dcfclk_bypass < 0 || regs_and_bypass->dcfclk_bypass > 4) + regs_and_bypass->dcfclk_bypass = 0; + regs_and_bypass->dispclk_bypass = internal.CLK1_CLK0_BYPASS_CNTL & 0x0007; + if (regs_and_bypass->dispclk_bypass < 0 || regs_and_bypass->dispclk_bypass > 4) + regs_and_bypass->dispclk_bypass = 0; + regs_and_bypass->dprefclk_bypass = internal.CLK1_CLK2_BYPASS_CNTL & 0x0007; + if 
(regs_and_bypass->dprefclk_bypass < 0 || regs_and_bypass->dprefclk_bypass > 4) + regs_and_bypass->dprefclk_bypass = 0; + + if (clk_mgr->base.base.ctx->dc->debug.pstate_enabled) { + DC_LOG_SMU("clk_type,clk_value,deepsleep_cntl,deepsleep_allow,bypass\n"); + + DC_LOG_SMU("dcfclk,%d,%d,%d,%s\n", + regs_and_bypass->dcfclk, + regs_and_bypass->dcf_deep_sleep_divider, + regs_and_bypass->dcf_deep_sleep_allow, + bypass_clks[(int) regs_and_bypass->dcfclk_bypass]); + + DC_LOG_SMU("dprefclk,%d,N/A,N/A,%s\n", + regs_and_bypass->dprefclk, + bypass_clks[(int) regs_and_bypass->dprefclk_bypass]); + + DC_LOG_SMU("dispclk,%d,N/A,N/A,%s\n", + regs_and_bypass->dispclk, + bypass_clks[(int) regs_and_bypass->dispclk_bypass]); + + // REGISTER VALUES + DC_LOG_SMU("reg_name,value,clk_type"); + + DC_LOG_SMU("CLK1_CLK3_CURRENT_CNT,%d,dcfclk", + internal.CLK1_CLK3_CURRENT_CNT); + + DC_LOG_SMU("CLK1_CLK4_CURRENT_CNT,%d,dtbclk", + internal.CLK1_CLK4_CURRENT_CNT); + + DC_LOG_SMU("CLK1_CLK3_DS_CNTL,%d,dcf_deep_sleep_divider", + internal.CLK1_CLK3_DS_CNTL); + + DC_LOG_SMU("CLK1_CLK3_ALLOW_DS,%d,dcf_deep_sleep_allow", + internal.CLK1_CLK3_ALLOW_DS); + + DC_LOG_SMU("CLK1_CLK2_CURRENT_CNT,%d,dprefclk", + internal.CLK1_CLK2_CURRENT_CNT); + + DC_LOG_SMU("CLK1_CLK0_CURRENT_CNT,%d,dispclk", + internal.CLK1_CLK0_CURRENT_CNT); + + DC_LOG_SMU("CLK1_CLK1_CURRENT_CNT,%d,dppclk", + internal.CLK1_CLK1_CURRENT_CNT); + + DC_LOG_SMU("CLK1_CLK3_BYPASS_CNTL,%d,dcfclk_bypass", + internal.CLK1_CLK3_BYPASS_CNTL); + + DC_LOG_SMU("CLK1_CLK2_BYPASS_CNTL,%d,dprefclk_bypass", + internal.CLK1_CLK2_BYPASS_CNTL); + + DC_LOG_SMU("CLK1_CLK0_BYPASS_CNTL,%d,dispclk_bypass", + internal.CLK1_CLK0_BYPASS_CNTL); + + DC_LOG_SMU("CLK1_CLK1_BYPASS_CNTL,%d,dppclk_bypass", + internal.CLK1_CLK1_BYPASS_CNTL); + + } } static bool dcn35_is_spll_ssc_enabled(struct clk_mgr *clk_mgr_base) @@ -623,6 +732,7 @@ static void init_clk_states(struct clk_mgr *clk_mgr) void dcn35_init_clocks(struct clk_mgr *clk_mgr) { struct clk_mgr_internal *clk_mgr_int = TO_CLK_MGR_INTERNAL(clk_mgr); + struct clk_mgr_dcn35 *clk_mgr_dcn35 = TO_CLK_MGR_DCN35(clk_mgr_int); init_clk_states(clk_mgr); @@ -633,6 +743,13 @@ void dcn35_init_clocks(struct clk_mgr *clk_mgr) else clk_mgr->dp_dto_source_clock_in_khz = clk_mgr->dprefclk_khz; + dcn35_save_clk_registers(&clk_mgr->boot_snapshot, clk_mgr_dcn35); + + clk_mgr->clks.ref_dtbclk_khz = clk_mgr->boot_snapshot.dtbclk * 10; + if (clk_mgr->boot_snapshot.dtbclk > 59000) { + /*dtbclk enabled based on */ + clk_mgr->clks.dtbclk_en = true; + } } static struct clk_bw_params dcn35_bw_params = { .vram_type = Ddr4MemType, @@ -1323,7 +1440,7 @@ void dcn35_clk_mgr_construct( dcn35_bw_params.wm_table = ddr5_wm_table; } /* Saved clocks configured at boot for debug purposes */ - dcn35_dump_clk_registers(&clk_mgr->base.base.boot_snapshot, clk_mgr); + dcn35_save_clk_registers(&clk_mgr->base.base.boot_snapshot, clk_mgr); clk_mgr->base.base.dprefclk_khz = dcn35_smu_get_dprefclk(&clk_mgr->base); clk_mgr->base.base.clks.ref_dtbclk_khz = 600000; From 41b1f9fcba62b06195e625bb88c1031102892439 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Mon, 1 Sep 2025 18:51:05 -0300 Subject: [PATCH 2703/3206] drm/amd/display: remove output_tf_change flag Remove this flag as the driver stopped managing it individually since commit a4056c2a6344 ("drm/amd/display: use HW hdr mult for brightness boost"). After some back and forth it was reintroduced as a condition to `set_output_transfer_func()` in [1]. 
Without direct management, this flag only changes value when all surface
update flags are set to true on UPDATE_TYPE_FULL, so it carries no
meaning about the output TF status.

Fixes: bb622e0c0044 ("drm/amd/display: program output tf when required") [1]
Signed-off-by: Melissa Wen
Reviewed-by: Alex Hung
Signed-off-by: Alex Deucher
(cherry picked from commit 752e6f283ec59ae007aa15a93d5a4b2eefa8cec9)
---
 drivers/gpu/drm/amd/display/dc/dc.h                       | 1 -
 drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c   | 6 ++----
 drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c | 6 ++----
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h
index f24e1da68269e4..8c230cf8939b53 100644
--- a/drivers/gpu/drm/amd/display/dc/dc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc.h
@@ -1348,7 +1348,6 @@ union surface_update_flags {
 		uint32_t in_transfer_func_change:1;
 		uint32_t input_csc_change:1;
 		uint32_t coeff_reduction_change:1;
-		uint32_t output_tf_change:1;
 		uint32_t pixel_format_change:1;
 		uint32_t plane_size_change:1;
 		uint32_t gamut_remap_change:1;
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
index b7c2d3095b25c5..5e57bd1a08e73d 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
@@ -1982,10 +1982,8 @@ static void dcn20_program_pipe(
 	 * updating on slave planes
 	 */
 	if (pipe_ctx->update_flags.bits.enable ||
-			pipe_ctx->update_flags.bits.plane_changed ||
-			pipe_ctx->stream->update_flags.bits.out_tf ||
-			(pipe_ctx->plane_state &&
-			 pipe_ctx->plane_state->update_flags.bits.output_tf_change))
+	    pipe_ctx->update_flags.bits.plane_changed ||
+	    pipe_ctx->stream->update_flags.bits.out_tf)
 		hws->funcs.set_output_transfer_func(dc, pipe_ctx, pipe_ctx->stream);
 
 	/* If the pipe has been enabled or has a different opp, we
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
index cc9f40d97af2fb..61167c19359d57 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
@@ -2019,10 +2019,8 @@ void dcn401_program_pipe(
 	 * updating on slave planes
 	 */
 	if (pipe_ctx->update_flags.bits.enable ||
-			pipe_ctx->update_flags.bits.plane_changed ||
-			pipe_ctx->stream->update_flags.bits.out_tf ||
-			(pipe_ctx->plane_state &&
-			 pipe_ctx->plane_state->update_flags.bits.output_tf_change))
+	    pipe_ctx->update_flags.bits.plane_changed ||
+	    pipe_ctx->stream->update_flags.bits.out_tf)
 		hws->funcs.set_output_transfer_func(dc, pipe_ctx, pipe_ctx->stream);
 
 	/* If the pipe has been enabled or has a different opp, we

From e3f761be5a178bdd5cd80351c5ca0a0cf675ef7e Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Mon, 25 Aug 2025 04:09:02 +0800
Subject: [PATCH 2704/3206] tools/power/x86/amd_pstate_tracer: Fix python
 gnuplot package names

The prerequisites section listed the non-existent packages
"phython-gnuplot" and "phython3-gnuplot", which may mislead users and
cause installation failures. Update the names to the correct
distribution package names "python-gnuplot" and "python3-gnuplot",
helping users avoid confusion and saving time for those following the
instructions.
Signed-off-by: Kuan-Wei Chiu Reviewed-by: Mario Limonciello Signed-off-by: Mario Limonciello (AMD) --- tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py b/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py index feb9f9421c7bcd..875b086550d1ba 100755 --- a/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py +++ b/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py @@ -11,7 +11,7 @@ gnuplot 5.0 or higher gnuplot-py 1.8 or higher (Most of the distributions have these required packages. They may be called - gnuplot-py, phython-gnuplot or phython3-gnuplot, gnuplot-nox, ... ) + gnuplot-py, python-gnuplot or python3-gnuplot, gnuplot-nox, ... ) Kernel config for Linux trace is enabled From a91ae3c89311648cbaa9b46b860e4f76004a24b8 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Sep 2025 11:01:49 +0000 Subject: [PATCH 2705/3206] bpf, x86: Add support for signed arena loads Currently, signed load instructions into arena memory are unsupported. The compiler is free to generate these, and on GCC-14 we see a corresponding error when it happens. The hurdle in supporting them is deciding which unused opcode to use to mark them for the JIT's own consumption. After much thinking, it appears 0xc0 / BPF_NOSPEC can be combined with load instructions to identify signed arena loads. Use this to recognize and JIT them appropriately, and remove the verifier side limitation on the program if the JIT supports them. Co-developed-by: Puranjay Mohan Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20250923110157.18326-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 5 +++++ arch/riscv/net/bpf_jit_comp64.c | 5 +++++ arch/s390/net/bpf_jit_comp.c | 5 +++++ arch/x86/net/bpf_jit_comp.c | 40 ++++++++++++++++++++++++++++++--- include/linux/filter.h | 3 +++ kernel/bpf/verifier.c | 11 ++++++--- 6 files changed, 63 insertions(+), 6 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index e36261c6395296..796938b535cd1f 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -3064,6 +3064,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) if (!bpf_atomic_is_load_store(insn) && !cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) return false; + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } return true; } diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 14d7aab61fcb36..83672373d02640 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -2066,6 +2066,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) case BPF_STX | BPF_ATOMIC | BPF_DW: if (insn->imm == BPF_CMPXCHG) return rv_ext_enabled(ZACAS); + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 8b57d8532f362e..cf461d76e9da32 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2967,6 +2967,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) case BPF_STX | BPF_ATOMIC | BPF_DW: if (bpf_atomic_is_load_store(insn)) return false; + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | 
BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } return true; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8d34a9400a5e49..fc13306af15fa8 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1152,11 +1152,38 @@ static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 i *pprog = prog; } +static void emit_ldsx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* movsx rax, byte ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBE); + break; + case BPF_H: + /* movsx rax, word ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBF); + break; + case BPF_W: + /* movsx rax, dword ptr [rax + r12 + off] */ + EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x63); + break; + } + emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off); + *pprog = prog; +} + static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off); } +static void emit_ldsx_r12(u8 **prog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + emit_ldsx_index(prog, size, dst_reg, src_reg, X86_REG_R12, off); +} + /* STX: *(u8*)(dst_reg + off) = src_reg */ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -2109,15 +2136,22 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_B: case BPF_STX | BPF_PROBE_MEM32 | BPF_H: case BPF_STX | BPF_PROBE_MEM32 | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: start_of_ldx = prog; - if (BPF_CLASS(insn->code) == BPF_LDX) - emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); - else + if (BPF_CLASS(insn->code) == BPF_LDX) { + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) + emit_ldsx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + else + emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } else { emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } populate_extable: { struct exception_table_entry *ex; diff --git a/include/linux/filter.h b/include/linux/filter.h index 4241a885975fa1..f5c859b8131a3e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -78,6 +78,9 @@ struct ctl_table_header; /* unused opcode to mark special atomic instruction */ #define BPF_PROBE_ATOMIC 0xe0 +/* unused opcode to mark special ldsx instruction. 
Same as BPF_NOSPEC */
+#define BPF_PROBE_MEM32SX	0xc0
+
 /* unused opcode to mark call to interpreter with arguments */
 #define BPF_CALL_ARGS	0xe0
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ceeb0ffe7d6743..1641992371761f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21379,10 +21379,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			continue;
 		case PTR_TO_ARENA:
 			if (BPF_MODE(insn->code) == BPF_MEMSX) {
-				verbose(env, "sign extending loads from arena are not supported yet\n");
-				return -EOPNOTSUPP;
+				if (!bpf_jit_supports_insn(insn, true)) {
+					verbose(env, "sign extending loads from arena are not supported yet\n");
+					return -EOPNOTSUPP;
+				}
+				insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code);
+			} else {
+				insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
 			}
-			insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
 			env->prog->aux->num_exentries++;
 			continue;
 		default:
@@ -21588,6 +21592,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			if (BPF_CLASS(insn->code) == BPF_LDX &&
 			    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
 			     BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
+			     BPF_MODE(insn->code) == BPF_PROBE_MEM32SX ||
 			     BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
 				num_exentries++;
 			if ((BPF_CLASS(insn->code) == BPF_STX ||

From eab2a71f3a6a5c5f4d4acaffb962cb39c199421c Mon Sep 17 00:00:00 2001
From: Puranjay Mohan
Date: Tue, 23 Sep 2025 11:01:50 +0000
Subject: [PATCH 2706/3206] bpf, arm64: Add support for signed arena loads

Add support for signed loads from arena, which are internally converted
to loads with the mode set to BPF_PROBE_MEM32SX by the verifier.

The implementation is similar to BPF_PROBE_MEMSX and BPF_MEMSX, but for
BPF_PROBE_MEM32SX, arena_vm_base is added to the src register to form
the address.
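[ Illustration: a standalone C sketch of the semantics being JITed, not
  kernel code -- the array stands in for the arena mapping that
  arena_vm_base points at, and the helper name is invented. A signed
  arena load rebases the 32-bit arena offset onto the mapping and widens
  the loaded value using its sign bit: ]

#include <stdint.h>
#include <stdio.h>

static uint8_t arena[4096];		/* stand-in for the arena mapping */

/* models "rX = *(s8 *)(arena_ptr + off)" after the JIT adds the base */
static int64_t arena_ldsx8(uint32_t off)
{
	int8_t v = (int8_t)arena[off];	/* load one byte... */
	return (int64_t)v;		/* ...and sign-extend to 64 bits */
}

int main(void)
{
	arena[32] = 0xfe;		/* two's-complement bit pattern for -2 */

	/* an unsigned (zero-extending) load would yield 254 here */
	printf("ldsx8 -> %lld\n", (long long)arena_ldsx8(32));	/* prints -2 */
	return 0;
}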
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20250923110157.18326-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 796938b535cd1f..288d9cc73919ff 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1133,12 +1133,14 @@ static int add_exception_handler(const struct bpf_insn *insn, return 0; if (BPF_MODE(insn->code) != BPF_PROBE_MEM && - BPF_MODE(insn->code) != BPF_PROBE_MEMSX && - BPF_MODE(insn->code) != BPF_PROBE_MEM32 && - BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) + BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32 && + BPF_MODE(insn->code) != BPF_PROBE_MEM32SX && + BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; is_arena = (BPF_MODE(insn->code) == BPF_PROBE_MEM32) || + (BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) || (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC); if (!ctx->prog->aux->extable || @@ -1659,7 +1661,11 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: - if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_W: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32 || + BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) { emit(A64_ADD(1, tmp2, src, arena_vm_base), ctx); src = tmp2; } @@ -1671,7 +1677,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, off_adj = off; } sign_extend = (BPF_MODE(insn->code) == BPF_MEMSX || - BPF_MODE(insn->code) == BPF_PROBE_MEMSX); + BPF_MODE(insn->code) == BPF_PROBE_MEMSX || + BPF_MODE(insn->code) == BPF_PROBE_MEM32SX); switch (BPF_SIZE(code)) { case BPF_W: if (is_lsi_offset(off_adj, 2)) { @@ -1879,9 +1886,11 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (ret) return ret; - ret = add_exception_handler(insn, ctx, dst); - if (ret) - return ret; + if (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) { + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; + } break; default: @@ -3064,11 +3073,6 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) if (!bpf_atomic_is_load_store(insn) && !cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) return false; - break; - case BPF_LDX | BPF_MEMSX | BPF_B: - case BPF_LDX | BPF_MEMSX | BPF_H: - case BPF_LDX | BPF_MEMSX | BPF_W: - return false; } return true; } From f61654912404155e9097aaf88aa2445edf80f8d3 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 23 Sep 2025 11:01:51 +0000 Subject: [PATCH 2707/3206] selftests: bpf: Add tests for signed loads from arena Add tests for loading 8, 16, and 32 bits with sign extension from arena, also verify that exception handling is working correctly and correct assembly is being generated by the x86 and arm64 JITs. 
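[ Illustration: the expected return values follow from two's-complement
  sign extension. This little-endian userspace C sketch (the tests guard
  byte order with #ifdefs) shows what each signed load sees for the
  constants the tests store; the subsequent shift-and-check in the tests
  only passes if the upper bits were really filled from the sign bit: ]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* the constants the tests store at offset 0 of an arena page */
	uint64_t s8_src  = 0x3fe;	/* low byte:     0xfe       */
	uint64_t s16_src = 0x3fffe;	/* low halfword: 0xfffe     */
	uint64_t s32_src = 0xfffffffe;	/* low word:     0xfffffffe */

	/* each signed load sign-extends its operand to 64-bit -2 */
	printf("s8:  %lld\n", (long long)(int8_t)(s8_src & 0xff));
	printf("s16: %lld\n", (long long)(int16_t)(s16_src & 0xffff));
	printf("s32: %lld\n", (long long)(int32_t)(s32_src & 0xffffffff));
	return 0;
}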
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20250923110157.18326-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_ldsx.c | 176 ++++++++++++++++++ 1 file changed, 176 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index f087ffb79f203c..c8494b682c3193 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -3,6 +3,7 @@ #include #include #include "bpf_misc.h" +#include "bpf_arena_common.h" #if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ @@ -10,6 +11,12 @@ defined(__TARGET_ARCH_loongarch)) && \ __clang_major__ >= 18 +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1); +} arena SEC(".maps"); + SEC("socket") __description("LDSX, S8") __success __success_unpriv __retval(-2) @@ -256,6 +263,175 @@ __naked void ldsx_ctx_8(void) : __clobber_all); } +SEC("syscall") +__description("Arena LDSX Disasm") +__success +__arch_x86_64 +__jited("movslq 0x10(%rax,%r12), %r14") +__jited("movswq 0x18(%rax,%r12), %r14") +__jited("movsbq 0x20(%rax,%r12), %r14") +__jited("movslq 0x10(%rdi,%r12), %r15") +__jited("movswq 0x18(%rdi,%r12), %r15") +__jited("movsbq 0x20(%rdi,%r12), %r15") +__arch_arm64 +__jited("add x11, x7, x28") +__jited("ldrsw x21, [x11, #0x10]") +__jited("add x11, x7, x28") +__jited("ldrsh x21, [x11, #0x18]") +__jited("add x11, x7, x28") +__jited("ldrsb x21, [x11, #0x20]") +__jited("add x11, x0, x28") +__jited("ldrsw x22, [x11, #0x10]") +__jited("add x11, x0, x28") +__jited("ldrsh x22, [x11, #0x18]") +__jited("add x11, x0, x28") +__jited("ldrsb x22, [x11, #0x20]") +__naked void arena_ldsx_disasm(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = r0;" + "r8 = *(s32 *)(r0 + 16);" + "r8 = *(s16 *)(r0 + 24);" + "r8 = *(s8 *)(r0 + 32);" + "r9 = *(s32 *)(r1 + 16);" + "r9 = *(s16 *)(r1 + 24);" + "r9 = *(s8 *)(r1 + 32);" + "r0 = 0;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX Exception") +__success __retval(0) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_exception(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r0 = 0xdeadbeef;" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0x3fe;" + "*(u64 *)(r0 + 0) = r1;" + "r0 = *(s8 *)(r0 + 0);" + "exit;" + : + : __imm_addr(arena) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX, S8") +__success __retval(-1) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_s8(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0x3fe;" + "*(u64 *)(r0 + 0) = r1;" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r0 = *(s8 *)(r0 + 0);" +#else + "r0 = *(s8 *)(r0 + 7);" +#endif + "r0 >>= 1;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX, S16") +__success __retval(-1) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_s16(void *ctx) +{ + asm volatile ( + "r1 
= %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0x3fffe;" + "*(u64 *)(r0 + 0) = r1;" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r0 = *(s16 *)(r0 + 0);" +#else + "r0 = *(s16 *)(r0 + 6);" +#endif + "r0 >>= 1;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX, S32") +__success __retval(-1) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_s32(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0xfffffffe;" + "*(u64 *)(r0 + 0) = r1;" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r0 = *(s32 *)(r0 + 0);" +#else + "r0 = *(s32 *)(r0 + 4);" +#endif + "r0 >>= 1;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +/* to retain debug info for BTF generation */ +void kfunc_root(void) +{ + bpf_arena_alloc_pages(0, 0, 0, 0, 0); +} + #else SEC("socket") From c8191ee8e64a8c5c021a34e32868f2380965e82b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:25 -1000 Subject: [PATCH 2708/3206] sched_ext: Use rhashtable_lookup() instead of rhashtable_lookup_fast() The find_user_dsq() function is called from contexts that are already under RCU read lock protection. Switch from rhashtable_lookup_fast() to rhashtable_lookup() to avoid redundant RCU locking. Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f7e17dc0422e26..0f17b715461f18 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -207,7 +207,7 @@ static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) { - return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); + return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); } /* From edf005fa274a0c224e550a52726aa7a426384e36 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2709/3206] sched_ext: Improve SCX_KF_DISPATCH comment The comment for SCX_KF_DISPATCH was incomplete and didn't explain that ops.dispatch() may temporarily release the rq lock, allowing ENQUEUE and SELECT_CPU operations to be nested inside DISPATCH contexts. Update the comment to clarify this nesting behavior and provide better context for when these operations can occur within dispatch. Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 7047101dbf587a..d82b7a9b0658be 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -108,7 +108,11 @@ enum scx_kf_mask { SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ - /* ops.dequeue (in REST) may be nested inside DISPATCH */ + /* + * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and + * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be + * nested inside DISPATCH. 
+ */ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ From f75efc8f4c0d52f6fe53a0acd9629e3ac017fc3e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2710/3206] sched_ext: Fix stray scx_root usage in task_can_run_on_remote_rq() task_can_run_on_remote_rq() takes @sch but it is using scx_root when incrementing SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, which is inconsistent and gets in the way of implementing multiple scheduler support. Use @sch instead. As currently scx_root is the only possible scheduler instance, this doesn't cause any behavior changes. Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 0f17b715461f18..8769cfdc22e3ac 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1622,8 +1622,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch, if (!scx_rq_online(rq)) { if (enforce) - __scx_add_event(scx_root, - SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); + __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; } From c7e739746dbde9ae401fd88824f5656c5e2361fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2711/3206] sched_ext: Use bitfields for boolean warning flags Convert warned_zero_slice and warned_deprecated_rq in scx_sched struct to single-bit bitfields. While this doesn't reduce struct size immediately, it prepares for future bitfield additions. v2: Update patch description. Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext_internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 2e289931e567fe..1a80d01b1f0c43 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -871,8 +871,8 @@ struct scx_sched { struct scx_dispatch_q **global_dsqs; struct scx_sched_pcpu __percpu *pcpu; - bool warned_zero_slice; - bool warned_deprecated_rq; + bool warned_zero_slice:1; + bool warned_deprecated_rq:1; atomic_t exit_kind; struct scx_exit_info *exit_info; From f3aec2adce8dbe37dabff47a16bfb260b987e0b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2712/3206] sched_ext: Add SCX_EFLAG_INITIALIZED to indicate successful ops.init() ops.exit() may be called even if the loading failed before ops.init() finishes successfully. This is because ops.exit() allows rich exit info communication. Add SCX_EFLAG_INITIALIZED flag to scx_exit_info.flags to indicate whether ops.init() finished successfully. This enables BPF schedulers to distinguish between exit scenarios and handle cleanup appropriately based on initialization state. 
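[ Illustration: how a BPF scheduler might consume the new flag in its
  exit path. This is hypothetical scheduler code, assuming the usual scx
  headers; only the flag itself comes from this patch: ]

void BPF_STRUCT_OPS(mysched_exit, struct scx_exit_info *ei)
{
	if (ei->flags & SCX_EFLAG_INITIALIZED) {
		/* ops.init() completed: undo init-time setup here */
		bpf_printk("exiting after successful init: %s", ei->reason);
	} else {
		/* load aborted before ops.init() finished: skip teardown */
		bpf_printk("exit before init completed: %s", ei->reason);
	}
}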
Acked-by: Andrea Righi
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c          |  1 +
 kernel/sched/ext_internal.h | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8769cfdc22e3ac..7368075dbfd283 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4554,6 +4554,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 			scx_error(sch, "ops.init() failed (%d)", ret);
 			goto err_disable;
 		}
+		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
 	}
 
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 1a80d01b1f0c43..b3617abed51081 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -62,6 +62,16 @@ enum scx_exit_code {
 	SCX_ECODE_ACT_RESTART = 1LLU << 48,
 };
 
+enum scx_exit_flags {
+	/*
+	 * ops.exit() may be called even if the loading failed before ops.init()
+	 * finishes successfully. This is because ops.exit() allows rich exit
+	 * info communication. The following flag indicates whether ops.init()
+	 * finished successfully.
+	 */
+	SCX_EFLAG_INITIALIZED = 1LLU << 0,
+};
+
 /*
  * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
  * being disabled.
@@ -73,6 +83,9 @@ struct scx_exit_info {
 	/* exit code if gracefully exiting */
 	s64 exit_code;
 
+	/* %SCX_EFLAG_* */
+	u64 flags;
+
 	/* textual representation of the above */
 	const char *reason;

From d452972858e5cfa4262320ab74fe8f016460b96f Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 23 Sep 2025 09:03:25 -1000
Subject: [PATCH 2713/3206] sched_ext: Make qmap dump operation non-destructive

The qmap dump operation was destructively consuming queue entries while
displaying them. As dump can be triggered anytime, this can easily lead
to stalls.

Add a temporary dump_store queue and modify the dump logic to pop
entries, display them, and then restore them back to the original
queue. This allows dump operations to be performed without affecting
the scheduler's queue state.

Note that if racing against new enqueues during dump, ordering can get
mixed up, but this is acceptable for debugging purposes.

Acked-by: Andrea Righi
Signed-off-by: Tejun Heo
---
 tools/sched_ext/scx_qmap.bpf.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 69d877501cb727..cd50a94326e3a9 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -56,7 +56,8 @@ struct qmap {
 	queue1 SEC(".maps"),
 	queue2 SEC(".maps"),
 	queue3 SEC(".maps"),
-	queue4 SEC(".maps");
+	queue4 SEC(".maps"),
+	dump_store SEC(".maps");
 
 struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
@@ -578,11 +579,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
 			return;
 
 		scx_bpf_dump("QMAP FIFO[%d]:", i);
+
+		/*
+		 * Dump can be invoked anytime and there is no way to iterate in
+		 * a non-destructive way. Pop and store in dump_store and then
+		 * restore afterwards. If racing against new enqueues, ordering
+		 * can get mixed up.
+ */ bpf_repeat(4096) { if (bpf_map_pop_elem(fifo, &pid)) break; + bpf_map_push_elem(&dump_store, &pid, 0); scx_bpf_dump(" %d", pid); } + + bpf_repeat(4096) { + if (bpf_map_pop_elem(&dump_store, &pid)) + break; + bpf_map_push_elem(fifo, &pid, 0); + } + scx_bpf_dump("\n"); } } From 7852e0fd1de05efadcd8b6f72140cd208c79ba56 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2714/3206] tools/sched_ext: scx_qmap: Make debug output quieter by default scx_qmap currently outputs verbose debug messages including cgroup operations and CPU online/offline events by default, which can be noisy during normal operation. While the existing -P option controls DSQ dumps and event statistics, there's no way to suppress the other debug messages. Split the debug output controls to make scx_qmap quieter by default. The -P option continues to control DSQ dumps and event statistics (print_dsqs_and_events), while a new -M option controls debug messages like cgroup operations and CPU events (print_msgs). This allows users to run scx_qmap with minimal output and selectively enable debug information as needed. Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/scx_qmap.bpf.c | 80 +++++++++++++++++++--------------- tools/sched_ext/scx_qmap.c | 12 +++-- 2 files changed, 53 insertions(+), 39 deletions(-) diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index cd50a94326e3a9..3072b593f89816 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -39,7 +39,8 @@ const volatile u32 stall_kernel_nth; const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; const volatile bool highpri_boosting; -const volatile bool print_shared_dsq; +const volatile bool print_dsqs_and_events; +const volatile bool print_msgs; const volatile s32 disallow_tgid; const volatile bool suppress_dump; @@ -633,22 +634,25 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args) { - bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", - cgrp->kn->id, args->weight, args->bw_period_us, - args->bw_quota_us, args->bw_burst_us); + if (print_msgs) + bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", + cgrp->kn->id, args->weight, args->bw_period_us, + args->bw_quota_us, args->bw_burst_us); return 0; } void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight) { - bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); + if (print_msgs) + bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); } void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) { - bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id, - period_us, quota_us, burst_us); + if (print_msgs) + bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", + cgrp->kn->id, period_us, quota_us, burst_us); } /* @@ -692,16 +696,20 @@ static void print_cpus(void) void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) { - bpf_printk("CPU %d coming online", cpu); - /* @cpu is already online at this point */ - print_cpus(); + if (print_msgs) { + bpf_printk("CPU %d coming online", cpu); + /* @cpu is already online at this point */ + print_cpus(); + } } void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) { - bpf_printk("CPU %d going offline", cpu); - /* @cpu is still online at this point */ - print_cpus(); + if (print_msgs) { + bpf_printk("CPU 
%d going offline", cpu); + /* @cpu is still online at this point */ + print_cpus(); + } } struct monitor_timer { @@ -799,35 +807,36 @@ static void dump_shared_dsq(void) static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { - struct scx_event_stats events; - bpf_rcu_read_lock(); dispatch_highpri(true); bpf_rcu_read_unlock(); monitor_cpuperf(); - if (print_shared_dsq) + if (print_dsqs_and_events) { + struct scx_event_stats events; + dump_shared_dsq(); - __COMPAT_scx_bpf_events(&events, sizeof(events)); - - bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", - scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", - scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", - scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); - bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", - scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); - bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", - scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", - scx_read_event(&events, SCX_EV_BYPASS_DURATION)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", - scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", - scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + __COMPAT_scx_bpf_events(&events, sizeof(events)); + + bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", + scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", + scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + } bpf_timer_start(timer, ONE_SEC_IN_NS, 0); return 0; @@ -839,7 +848,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) struct bpf_timer *timer; s32 ret; - print_cpus(); + if (print_msgs) + print_cpus(); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); if (ret) diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index c4912ab2e76f21..ef701d45ba4358 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -" [-P] [-d PID] [-D LEN] [-p] [-v]\n" +" [-P] [-M] [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -28,7 +28,8 @@ const char help_fmt[] = " -T COUNT Stall every COUNT'th kernel thread\n" " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" -" -P Print out DSQ content to trace_pipe every second, use with -b\n" +" -P Print out DSQ content and event counters to trace_pipe every second\n" +" -M Print out debug messages to 
trace_pipe\n" " -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" @@ -66,7 +67,7 @@ int main(int argc, char **argv) skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -87,7 +88,10 @@ int main(int argc, char **argv) skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; case 'P': - skel->rodata->print_shared_dsq = true; + skel->rodata->print_dsqs_and_events = true; + break; + case 'M': + skel->rodata->print_msgs = true; break; case 'H': skel->rodata->highpri_boosting = true; From 9fc687edf205dbc45bbdec60ea31e934a05ab6bc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2715/3206] sched_ext: Separate out scx_kick_cpu() and add @sch to it In preparation for multiple scheduler support, separate out scx_kick_cpu() from scx_bpf_kick_cpu() and add the @sch parameter to it. scx_bpf_kick_cpu() now acquires an RCU read lock, reads $scx_root, and calls scx_kick_cpu() with it if non-NULL. The passed in @sch parameter is not used yet. Internal uses of scx_bpf_kick_cpu() are converted to scx_kick_cpu(). Where $sch is available, it's used. In the pick_task_scx() path where no associated scheduler can be identified, $scx_root is used directly. Note that $scx_root cannot be NULL in this case. Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7368075dbfd283..12759b9769a54f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -136,7 +136,7 @@ static struct kset *scx_kset; #include static void process_ddsp_deferred_locals(struct rq *rq); -static void scx_bpf_kick_cpu(s32 cpu, u64 flags); +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); @@ -2125,10 +2125,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * balance(), we want to complete this scheduling cycle and then * start a new one. IOW, we want to call resched_curr() on the * next, most likely idle, task, not the current one. Use - * scx_bpf_kick_cpu() for deferred kicking. + * scx_kick_cpu() for deferred kicking. */ if (unlikely(!--nr_loops)) { - scx_bpf_kick_cpu(cpu_of(rq), 0); + scx_kick_cpu(sch, cpu_of(rq), 0); break; } } while (dspc->nr_tasks); @@ -2417,7 +2417,8 @@ static struct task_struct *pick_task_scx(struct rq *rq) p = first_local_task(rq); if (!p) { if (kick_idle) - scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); + scx_kick_cpu(rcu_dereference_sched(scx_root), + cpu_of(rq), SCX_KICK_IDLE); return NULL; } @@ -3721,7 +3722,7 @@ static void scx_clear_softlockup(void) * * - pick_next_task() suppresses zero slice warning. * - * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. * * - scx_prio_less() reverts to the default core_sched_at order. 
@@ -5809,17 +5810,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { __bpf_kfunc_start_defs(); -/** - * scx_bpf_kick_cpu - Trigger reschedule on a CPU - * @cpu: cpu to kick - * @flags: %SCX_KICK_* flags - * - * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or - * trigger rescheduling on a busy CPU. This can be called from any online - * scx_ops operation and the actual kicking is performed asynchronously through - * an irq work. - */ -__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) { struct rq *this_rq; unsigned long irq_flags; @@ -5872,6 +5863,26 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) local_irq_restore(irq_flags); } +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (likely(sch)) + scx_kick_cpu(sch, cpu, flags); +} + /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ From fc6a93aa623f3e40bcddbacf5fc60d5aceda9cab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2716/3206] sched_ext: Add the @sch parameter to __bstr_format() In preparation for multiple scheduler support, add the @sch parameter to __bstr_format() and update the callers to read $scx_root, verify that it's not NULL and pass it in. The passed in @sch parameter is not used yet. 
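[ Illustration: these conversions all follow one idiom -- hold the RCU
  read lock for the rest of the scope with guard(rcu)(), dereference
  scx_root once, and fail soft if no scheduler is loaded. A condensed
  sketch of the shape; the kfunc name is invented, while guard(),
  rcu_dereference() and scx_root are the real facilities used throughout
  the series: ]

__bpf_kfunc void scx_bpf_example_kfunc(void)
{
	struct scx_sched *sch;

	guard(rcu)();		/* rcu_read_unlock() runs when the scope ends */

	sch = rcu_dereference(scx_root);
	if (unlikely(!sch))	/* no scheduler loaded: do nothing */
		return;

	/* @sch remains valid through the end of the function */
}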
Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 12759b9769a54f..7e8430ad06963b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6063,8 +6063,9 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __bpf_kfunc_end_defs(); -static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, - char *fmt, unsigned long long *data, u32 data__sz) +static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, + size_t line_size, char *fmt, unsigned long long *data, + u32 data__sz) { struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; s32 ret; @@ -6099,10 +6100,10 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, return ret; } -static s32 bstr_format(struct scx_bstr_buf *buf, +static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, char *fmt, unsigned long long *data, u32 data__sz) { - return __bstr_format(buf->data, buf->line, sizeof(buf->line), + return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), fmt, data, data__sz); } @@ -6121,10 +6122,13 @@ __bpf_kfunc_start_defs(); __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -6141,10 +6145,13 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -6164,17 +6171,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; struct scx_dump_data *dd = &scx_dump_data; struct scx_bstr_buf *buf = &dd->buf; s32 ret; + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + if (raw_smp_processor_id() != dd->cpu) { scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends"); return; } /* append the formatted string to the line buf */ - ret = __bstr_format(buf->data, buf->line + dd->cursor, + ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, sizeof(buf->line) - dd->cursor, fmt, data, data__sz); if (ret < 0) { dump_line(dd->s, "%s[!] 
(\"%s\", %p, %u) failed to format (%d)", From 2407bae23d1e93186afccb6a6a98d9bd6a7c8a74 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2717/3206] sched_ext: Add the @sch parameter to ext_idle helpers In preparation for multiple scheduler support, add the @sch parameter to validate_node(), check_builtin_idle_enabled() and select_cpu_from_kfunc(), and update their callers to read $scx_root, verify that it's not NULL and pass it in. The passed in @sch parameter is not used yet. Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext_idle.c | 109 ++++++++++++++++++++++++++++++++++------ 1 file changed, 94 insertions(+), 15 deletions(-) diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7174e1c1a392c7..6e2504ae73574d 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -819,7 +819,7 @@ void scx_idle_disable(void) * Helpers that can be called from the BPF scheduler. */ -static int validate_node(int node) +static int validate_node(struct scx_sched *sch, int node) { if (!static_branch_likely(&scx_builtin_idle_per_node)) { scx_kf_error("per-node idle tracking is disabled"); @@ -847,7 +847,7 @@ static int validate_node(int node) __bpf_kfunc_start_defs(); -static bool check_builtin_idle_enabled(void) +static bool check_builtin_idle_enabled(struct scx_sched *sch) { if (static_branch_likely(&scx_builtin_idle_enabled)) return true; @@ -856,7 +856,8 @@ static bool check_builtin_idle_enabled(void) return false; } -static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, +static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, + s32 prev_cpu, u64 wake_flags, const struct cpumask *allowed, u64 flags) { struct rq *rq; @@ -866,7 +867,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f if (!kf_cpu_valid(prev_cpu, NULL)) return -EINVAL; - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return -EBUSY; /* @@ -946,15 +947,21 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { + struct scx_sched *sch; s32 cpu; - cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0); + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { *is_idle = true; return cpu; } *is_idle = false; - return prev_cpu; } @@ -981,7 +988,16 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *cpus_allowed, u64 flags) { - return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, + cpus_allowed, flags); } /** @@ -995,7 +1011,15 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + + node = validate_node(sch, node); if (node < 0) return cpu_none_mask; @@ -1011,12 +1035,20 @@ __bpf_kfunc const struct cpumask 
*scx_bpf_get_idle_cpumask_node(int node) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return cpu_none_mask; return idle_cpumask(NUMA_NO_NODE)->cpu; @@ -1034,7 +1066,15 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + + node = validate_node(sch, node); if (node < 0) return cpu_none_mask; @@ -1054,12 +1094,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return cpu_none_mask; if (sched_smt_active()) @@ -1095,7 +1143,15 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) */ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) { - if (!check_builtin_idle_enabled()) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return false; + + if (!check_builtin_idle_enabled(sch)) return false; if (!kf_cpu_valid(cpu, NULL)) @@ -1126,7 +1182,15 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, int node, u64 flags) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + node = validate_node(sch, node); if (node < 0) return node; @@ -1158,12 +1222,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { scx_kf_error("per-node idle tracking is enabled"); return -EBUSY; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return -EBUSY; return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); @@ -1193,9 +1265,16 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, int node, u64 flags) { + struct scx_sched *sch; s32 cpu; - node = validate_node(node); + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + node = validate_node(sch, node); if (node < 0) return node; From 956f2b11a8a4fd2793aaa8a672c70206f0ce4655 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2718/3206] sched_ext: Drop kf_cpu_valid() The intention behind kf_cpu_valid() was that when called from kfuncs, kf_cpu_valid() 
would be able to implicitly determine the scx_sched instance being
operated on and thus wouldn't need @sch passed in explicitly. This
turned out to be unnecessarily complicated to implement and not have
justifiable practical benefits.

Replace kf_cpu_valid() usages with ops_cpu_valid() which takes explicit
@sch. Callers which don't have $sch available in the context are updated
to read $scx_root under RCU read lock, verify that it's not NULL and
pass it in.

scx_bpf_cpu_rq() is restructured to use guard(rcu)() instead of explicit
rcu_read_[un]lock().

Reviewed-by: Andrea Righi
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c      | 67 ++++++++++++++++++++++++-----------------
 kernel/sched/ext_idle.c | 12 +++++---
 2 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7e8430ad06963b..8a8fd57c62c639 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -736,23 +736,6 @@ static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
 	}
 }
 
-/**
- * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args
- * @cpu: cpu number which came from a BPF ops
- * @where: extra information reported on error
- *
- * The same as ops_cpu_valid() but @sch is implicit.
- */
-static bool kf_cpu_valid(u32 cpu, const char *where)
-{
-	if (__cpu_valid(cpu)) {
-		return true;
-	} else {
-		scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
-		return false;
-	}
-}
-
 /**
  * ops_sanitize_err - Sanitize a -errno value
  * @sch: scx_sched to error out on error
@@ -5815,7 +5798,7 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
 	struct rq *this_rq;
 	unsigned long irq_flags;
 
-	if (!kf_cpu_valid(cpu, NULL))
+	if (!ops_cpu_valid(sch, cpu, NULL))
 		return;
 
 	local_irq_save(irq_flags);
@@ -6224,7 +6207,12 @@
  */
 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
 {
-	if (kf_cpu_valid(cpu, NULL))
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
 		return arch_scale_cpu_capacity(cpu);
 	else
 		return SCX_CPUPERF_ONE;
@@ -6246,7 +6234,12 @@
  */
 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
 {
-	if (kf_cpu_valid(cpu, NULL))
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
 		return arch_scale_freq_capacity(cpu);
 	else
 		return SCX_CPUPERF_ONE;
@@ -6268,12 +6261,20 @@
  */
 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
 {
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (unlikely(!sch))
+		return;
+
 	if (unlikely(perf > SCX_CPUPERF_ONE)) {
 		scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
 		return;
 	}
 
-	if (kf_cpu_valid(cpu, NULL)) {
+	if (ops_cpu_valid(sch, cpu, NULL)) {
 		struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
 		struct rq_flags rf;
 
@@ -6379,18 +6380,21 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
 {
 	struct scx_sched *sch;
 
-	if (!kf_cpu_valid(cpu, NULL))
-		return NULL;
+	guard(rcu)();
 
-	rcu_read_lock();
 	sch = rcu_dereference(scx_root);
-	if (likely(sch) && !sch->warned_deprecated_rq) {
+	if (unlikely(!sch))
+		return NULL;
+
+	if (!ops_cpu_valid(sch, cpu, NULL))
+		return NULL;
+
+	if (!sch->warned_deprecated_rq) {
 		printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; "
 				"use scx_bpf_locked_rq() when holding rq lock "
 				"or scx_bpf_cpu_curr() to read remote
curr safely.\n", __func__); sch->warned_deprecated_rq = true; } - rcu_read_unlock(); return cpu_rq(cpu); } @@ -6425,8 +6429,17 @@ __bpf_kfunc struct rq *scx_bpf_locked_rq(void) */ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) { - if (!kf_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) return NULL; + + if (!ops_cpu_valid(sch, cpu, NULL)) + return NULL; + return rcu_dereference(cpu_rq(cpu)->curr); } diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 6e2504ae73574d..a576ec10522e4d 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -864,7 +864,7 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, struct rq_flags rf; s32 cpu; - if (!kf_cpu_valid(prev_cpu, NULL)) + if (!ops_cpu_valid(sch, prev_cpu, NULL)) return -EINVAL; if (!check_builtin_idle_enabled(sch)) @@ -923,9 +923,13 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, */ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) { - if (!kf_cpu_valid(cpu, NULL)) - return NUMA_NO_NODE; + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL)) + return NUMA_NO_NODE; return cpu_to_node(cpu); } @@ -1154,7 +1158,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) if (!check_builtin_idle_enabled(sch)) return false; - if (!kf_cpu_valid(cpu, NULL)) + if (!ops_cpu_valid(sch, cpu, NULL)) return false; return scx_idle_test_and_clear_cpu(cpu); From 4d9553fee3e278ca3d90c54c063ce2db01e93268 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2719/3206] sched_ext: Add the @sch parameter to scx_dsq_insert_preamble/commit() In preparation for multiple scheduler support, add the @sch parameter to scx_dsq_insert_preamble/commit() and update the callers to read $scx_root and pass it in. The passed in @sch parameter is not used yet. Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8a8fd57c62c639..ed72de7d43d381 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5240,7 +5240,8 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. 
 */
-static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
+static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
+				    u64 enq_flags)
 {
 	if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
 		return false;
@@ -5260,8 +5261,8 @@ static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
 	return true;
 }
 
-static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
-				  u64 enq_flags)
+static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
+				  u64 dsq_id, u64 enq_flags)
 {
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	struct task_struct *ddsp_task;
@@ -5325,7 +5326,14 @@ __bpf_kfunc_start_defs();
 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
 				    u64 enq_flags)
 {
-	if (!scx_dsq_insert_preamble(p, enq_flags))
+	struct scx_sched *sch;
+
+	guard(rcu)();
+	sch = rcu_dereference(scx_root);
+	if (unlikely(!sch))
+		return;
+
+	if (!scx_dsq_insert_preamble(sch, p, enq_flags))
 		return;
 
 	if (slice)
@@ -5333,7 +5341,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
 	else
 		p->scx.slice = p->scx.slice ?: 1;
 
-	scx_dsq_insert_commit(p, dsq_id, enq_flags);
+	scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
 }
 
 /**
@@ -5360,7 +5368,14 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
 __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
 					  u64 slice, u64 vtime, u64 enq_flags)
 {
-	if (!scx_dsq_insert_preamble(p, enq_flags))
+	struct scx_sched *sch;
+
+	guard(rcu)();
+	sch = rcu_dereference(scx_root);
+	if (unlikely(!sch))
+		return;
+
+	if (!scx_dsq_insert_preamble(sch, p, enq_flags))
 		return;
 
 	if (slice)
@@ -5370,7 +5385,7 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
 
 	p->scx.dsq_vtime = vtime;
 
-	scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+	scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
 }
 
 __bpf_kfunc_end_defs();

From d4f7d866667c32b097721a96ebf0b19e1c85a75a Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 23 Sep 2025 09:03:26 -1000
Subject: [PATCH 2720/3206] sched_ext: Drop scx_kf_exit() and scx_kf_error()

The intention behind scx_kf_exit/error() was that when called from kfuncs,
scx_kf_exit/error() would be able to implicitly determine the scx_sched
instance being operated on and thus wouldn't need the @sch parameter passed
in explicitly. This turned out to be unnecessarily complicated to implement
and did not have enough practical benefits.

Replace scx_kf_exit/error() usages with scx_exit/error() which take an
explicit @sch parameter.

- Add the @sch parameter to scx_kf_allowed(), scx_kf_allowed_on_arg_tasks(),
  mark_direct_dispatch() and other intermediate functions transitively.

- In callers that don't already have @sch available, grab RCU, read
  $scx_root, verify it's not NULL and use it.

Reviewed-by: Andrea Righi
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c      | 126 +++++++++++++++++++++++-----------------
 kernel/sched/ext_idle.c |  25 +++++---
 2 files changed, 88 insertions(+), 63 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ed72de7d43d381..ad25e939886887 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -151,24 +151,7 @@ static __printf(4, 5) void scx_exit(struct scx_sched *sch,
 	va_end(args);
 }
 
-static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code,
-				       const char *fmt, ...)
-{ - struct scx_sched *sch; - va_list args; - - rcu_read_lock(); - sch = rcu_dereference(scx_root); - if (sch) { - va_start(args, fmt); - scx_vexit(sch, kind, exit_code, fmt, args); - va_end(args); - } - rcu_read_unlock(); -} - #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) -#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args) #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) @@ -329,11 +312,11 @@ do { \ }) /* @mask is constant, always inline to cull unnecessary branches */ -static __always_inline bool scx_kf_allowed(u32 mask) +static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask) { if (unlikely(!(current->scx.kf_mask & mask))) { - scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", - mask, current->scx.kf_mask); + scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); return false; } @@ -346,13 +329,13 @@ static __always_inline bool scx_kf_allowed(u32 mask) */ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { - scx_kf_error("cpu_release kfunc called from a nested operation"); + scx_error(sch, "cpu_release kfunc called from a nested operation"); return false; } if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { - scx_kf_error("dispatch kfunc called from a nested operation"); + scx_error(sch, "dispatch kfunc called from a nested operation"); return false; } @@ -360,15 +343,16 @@ static __always_inline bool scx_kf_allowed(u32 mask) } /* see SCX_CALL_OP_TASK() */ -static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, +static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, + u32 mask, struct task_struct *p) { - if (!scx_kf_allowed(mask)) + if (!scx_kf_allowed(sch, mask)) return false; if (unlikely((p != current->scx.kf_tasks[0] && p != current->scx.kf_tasks[1]))) { - scx_kf_error("called on a task not being operated on"); + scx_error(sch, "called on a task not being operated on"); return false; } @@ -1115,7 +1099,8 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, return dsq; } -static void mark_direct_dispatch(struct task_struct *ddsp_task, +static void mark_direct_dispatch(struct scx_sched *sch, + struct task_struct *ddsp_task, struct task_struct *p, u64 dsq_id, u64 enq_flags) { @@ -1129,10 +1114,10 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, /* @p must match the task on the enqueue path */ if (unlikely(p != ddsp_task)) { if (IS_ERR(ddsp_task)) - scx_kf_error("%s[%d] already direct-dispatched", + scx_error(sch, "%s[%d] already direct-dispatched", p->comm, p->pid); else - scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", ddsp_task->comm, ddsp_task->pid, p->comm, p->pid); return; @@ -5243,18 +5228,18 @@ void __init init_sched_ext_class(void) static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, u64 enq_flags) { - if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) return false; lockdep_assert_irqs_disabled(); if (unlikely(!p)) { - scx_kf_error("called with NULL task"); + scx_error(sch, "called with NULL task"); return false; } if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { - scx_kf_error("invalid 
enq_flags 0x%llx", enq_flags); + scx_error(sch, "invalid enq_flags 0x%llx", enq_flags); return false; } @@ -5269,12 +5254,12 @@ static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, ddsp_task = __this_cpu_read(direct_dispatch_task); if (ddsp_task) { - mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); return; } if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { - scx_kf_error("dispatch buffer overflow"); + scx_error(sch, "dispatch buffer overflow"); return; } @@ -5410,7 +5395,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, bool in_balance; unsigned long flags; - if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) + if (!scx_kf_allowed_if_unlocked() && + !scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; /* @@ -5495,7 +5481,15 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) { - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return 0; return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); @@ -5510,14 +5504,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) __bpf_kfunc void scx_bpf_dispatch_cancel(void) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_sched *sch; - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return; if (dspc->cursor > 0) dspc->cursor--; else - scx_kf_error("dispatch buffer underflow"); + scx_error(sch, "dispatch buffer underflow"); } /** @@ -5540,7 +5541,7 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; flush_dispatch_buf(sch, dspc->rq); @@ -5687,12 +5688,18 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) { + struct scx_sched *sch; LIST_HEAD(tasks); u32 nr_enqueued = 0; struct rq *rq; struct task_struct *p, *n; - if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) return 0; rq = cpu_rq(smp_processor_id()); @@ -5837,7 +5844,7 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) struct rq *target_rq = cpu_rq(cpu); if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) - scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); if (raw_spin_rq_trylock(target_rq)) { if (can_skip_idle_kick(target_rq)) { @@ -6070,20 +6077,20 @@ static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || (data__sz && !data)) { - scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz); + scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); return -EINVAL; } ret = copy_from_kernel_nofault(data_buf, data, data__sz); if (ret < 0) { - scx_kf_error("failed to read data fields (%d)", ret); + scx_error(sch, "failed to read data fields (%d)", ret); return ret; } ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, &bprintf_data); if (ret < 0) { - scx_kf_error("format preparation failed 
(%d)", ret); + scx_error(sch, "format preparation failed (%d)", ret); return ret; } @@ -6091,7 +6098,7 @@ static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, bprintf_data.bin_args); bpf_bprintf_cleanup(&bprintf_data); if (ret < 0) { - scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz); + scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); return ret; } @@ -6127,7 +6134,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, sch = rcu_dereference_bh(scx_root); if (likely(sch) && bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); + scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -6150,7 +6157,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, sch = rcu_dereference_bh(scx_root); if (likely(sch) && bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); + scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -6181,7 +6188,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, return; if (raw_smp_processor_id() != dd->cpu) { - scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); return; } @@ -6285,7 +6292,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) return; if (unlikely(perf > SCX_CPUPERF_ONE)) { - scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); return; } @@ -6298,7 +6305,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) * to the corresponding CPU to prevent ABBA deadlocks. 
*/ if (locked_rq && rq != locked_rq) { - scx_kf_error("Invalid target CPU %d", cpu); + scx_error(sch, "Invalid target CPU %d", cpu); return; } @@ -6422,16 +6429,20 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) */ __bpf_kfunc struct rq *scx_bpf_locked_rq(void) { + struct scx_sched *sch; struct rq *rq; - preempt_disable(); + guard(preempt)(); + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return NULL; + rq = scx_locked_rq(); if (!rq) { - preempt_enable(); - scx_kf_error("accessing rq without holding rq lock"); + scx_error(sch, "accessing rq without holding rq lock"); return NULL; } - preempt_enable(); return rq; } @@ -6474,8 +6485,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) { struct task_group *tg = p->sched_task_group; struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + goto out; - if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p)) goto out; cgrp = tg_cgrp(tg); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index a576ec10522e4d..c57779f0ad5766 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -822,7 +822,7 @@ void scx_idle_disable(void) static int validate_node(struct scx_sched *sch, int node) { if (!static_branch_likely(&scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is disabled"); + scx_error(sch, "per-node idle tracking is disabled"); return -EOPNOTSUPP; } @@ -832,13 +832,13 @@ static int validate_node(struct scx_sched *sch, int node) /* Make sure node is in a valid range */ if (node < 0 || node >= nr_node_ids) { - scx_kf_error("invalid node %d", node); + scx_error(sch, "invalid node %d", node); return -EINVAL; } /* Make sure the node is part of the set of possible nodes */ if (!node_possible(node)) { - scx_kf_error("unavailable node %d", node); + scx_error(sch, "unavailable node %d", node); return -EINVAL; } @@ -852,7 +852,7 @@ static bool check_builtin_idle_enabled(struct scx_sched *sch) if (static_branch_likely(&scx_builtin_idle_enabled)) return true; - scx_kf_error("built-in idle tracking is disabled"); + scx_error(sch, "built-in idle tracking is disabled"); return false; } @@ -880,7 +880,7 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, if (scx_kf_allowed_if_unlocked()) { rq = task_rq_lock(p, &rf); } else { - if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) + if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) return -EPERM; rq = scx_locked_rq(); } @@ -1048,7 +1048,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) return cpu_none_mask; if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } @@ -1107,7 +1107,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) return cpu_none_mask; if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } @@ -1235,7 +1235,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, return -ENODEV; if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is enabled"); + scx_error(sch, "per-node idle tracking is enabled"); return -EBUSY; } @@ 
-1316,10 +1316,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) { + struct scx_sched *sch; s32 cpu; + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is enabled"); + scx_error(sch, "per-node idle tracking is enabled"); return -EBUSY; } From c0008a5632103eae31302e83d012e2d3b0cfad41 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: [PATCH 2721/3206] sched_ext: Misc updates around scx_sched instance pointer In preparation for multiple scheduler support: - Add the @sch parameter to find_global_dsq() and refill_task_slice_dfl(). - Restructure scx_allow_ttwu_queue() and make it read scx_root into $sch. - Make RCU protection in scx_dsq_move() and scx_bpf_dsq_move_to_local() explicit. v2: Add scx_root -> sch conversion in scx_allow_ttwu_queue(). Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 62 ++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ad25e939886887..fa3696f9e7d23a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -181,10 +181,9 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } -static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, + struct task_struct *p) { - struct scx_sched *sch = scx_root; - return sch->global_dsqs[cpu_to_node(task_cpu(p))]; } @@ -880,10 +879,10 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) WRITE_ONCE(dsq->nr, dsq->nr + delta); } -static void refill_task_slice_dfl(struct task_struct *p) +static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); + __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, @@ -901,7 +900,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); raw_spin_lock(&dsq->lock); } } @@ -1080,20 +1079,20 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return find_global_dsq(p); + return find_global_dsq(sch, p); return &cpu_rq(cpu)->scx.local_dsq; } if (dsq_id == SCX_DSQ_GLOBAL) - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); else dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); - return find_global_dsq(p); + return find_global_dsq(sch, p); } return dsq; @@ -1272,15 +1271,15 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, * higher priority it becomes from scx_prio_less()'s POV. 
*/ touch_core_sched(rq, p); - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); local_norefill: dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); return; global: touch_core_sched(rq, p); /* see the comment in local: */ - refill_task_slice_dfl(p); - dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags); + refill_task_slice_dfl(sch, p); + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -1692,7 +1691,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dst_dsq = find_global_dsq(p); + dst_dsq = find_global_dsq(sch, p); dst_rq = src_rq; } } else { @@ -1848,7 +1847,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dispatch_enqueue(sch, find_global_dsq(p), p, + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -2380,7 +2379,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (keep_prev) { p = prev; if (!p->scx.slice) - refill_task_slice_dfl(p); + refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); } else { p = first_local_task(rq); if (!p) { @@ -2391,14 +2390,14 @@ static struct task_struct *pick_task_scx(struct rq *rq) } if (unlikely(!p->scx.slice)) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = rcu_dereference_sched(scx_root); if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); sch->warned_zero_slice = true; } - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); } } @@ -2487,7 +2486,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; } else { cpu = prev_cpu; @@ -3572,9 +3571,22 @@ bool task_should_scx(int policy) bool scx_allow_ttwu_queue(const struct task_struct *p) { - return !scx_enabled() || - (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) || - p->sched_class != &ext_sched_class; + struct scx_sched *sch; + + if (!scx_enabled()) + return true; + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return true; + + if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + return true; + + if (unlikely(p->sched_class != &ext_sched_class)) + return true; + + return false; } /** @@ -5537,9 +5549,15 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) */ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) { - struct scx_sched *sch = scx_root; struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return false; if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; From ccb4f5d91ec43c05ba165ccfc7ed889eb9cdfd05 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Fri, 19 Sep 2025 12:41:09 +0800 Subject: [PATCH 2722/3206] bpf: Allow union argument in trampoline based programs Currently, functions with 'union' arguments cannot be traced with fentry/fexit: bpftrace -e 'fentry:release_pages { exit(); }' -v The function release_pages arg0 type UNION is unsupported. 
The type of the 'release_pages' arg0 is defined as:

  typedef union {
	struct page **pages;
	struct folio **folios;
	struct encoded_page **encoded_pages;
  } release_pages_arg __attribute__ ((__transparent_union__));

This patch relaxes the restriction by allowing function arguments of type
'union' to be traced in the verifier.

Reviewed-by: Amery Hung
Signed-off-by: Leon Hwang
Link: https://lore.kernel.org/r/20250919044110.23729-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 2 +-
 kernel/bpf/btf.c    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a2ab51fa8b0ab9..ba3a3be7eb2a8a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1128,7 +1128,7 @@ struct bpf_prog_offload {
  */
 #define MAX_BPF_FUNC_REG_ARGS 5
 
-/* The argument is a structure. */
+/* The argument is a structure or a union. */
 #define BTF_FMODEL_STRUCT_ARG		BIT(0)
 
 /* The argument is signed. */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9f47a3aa7ff899..0de8fc8a0e0b32 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6751,7 +6751,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	/* skip modifiers */
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
-	if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
+	if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
 		/* accessing a scalar */
 		return true;
 	if (!btf_type_is_ptr(t)) {
@@ -7323,7 +7323,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
 	if (btf_type_is_ptr(t))
 		/* kernel size of pointer. Not BPF's size of pointer*/
 		return sizeof(void *);
-	if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
+	if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
 		return t->size;
 	return -EINVAL;
 }
@@ -7332,7 +7332,7 @@ static u8 __get_type_fmodel_flags(const struct btf_type *t)
 {
 	u8 flags = 0;
 
-	if (__btf_type_is_struct(t))
+	if (btf_type_is_struct(t))
 		flags |= BTF_FMODEL_STRUCT_ARG;
 	if (btf_type_is_signed_int(t))
 		flags |= BTF_FMODEL_SIGNED_ARG;
@@ -7373,7 +7373,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 		return -EINVAL;
 	}
 	ret = __get_type_size(btf, func->type, &t);
-	if (ret < 0 || __btf_type_is_struct(t)) {
+	if (ret < 0 || btf_type_is_struct(t)) {
 		bpf_log(log,
 			"The function %s return type %s is unsupported.\n",
 			tname, btf_type_str(t));

From 1c6686bf7fc161ed87b44b523a96c5ae4bfed351 Mon Sep 17 00:00:00 2001
From: Leon Hwang
Date: Fri, 19 Sep 2025 12:41:10 +0800
Subject: [PATCH 2723/3206] selftests/bpf: Add union argument tests using
 fexit programs

Add test coverage for union argument support using fexit programs:

* 8B union argument - verify that the verifier accepts it and that fexit
  programs can trace such functions.

* 16B union argument - verify that the verifier accepts it and that fexit
  programs can access the argument, which is passed using two registers.
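
For reference, a minimal fexit program of the first kind could look roughly
like the sketch below. This is illustrative only: it mirrors the selftest
added in this patch, and the local union/struct definitions are assumed to
match the kernel-side bpf_testmod types.

  #include "vmlinux.h"
  #include <bpf/bpf_tracing.h>

  /* Assumed local copies of the bpf_testmod types. */
  struct bpf_testmod_struct_arg_1 { int a; };

  union bpf_testmod_union_arg_1 {
  	char a;
  	short b;
  	struct bpf_testmod_struct_arg_1 arg;
  };

  long ut1_a_a;

  SEC("fexit/bpf_testmod_test_union_arg_1")
  int BPF_PROG2(test_union_arg_1, union bpf_testmod_union_arg_1, a,
  	      int, b, int, c)
  {
  	/* BPF_PROG2 unpacks the by-value union from the saved argument
  	 * registers; members are then read like any local value. */
  	ut1_a_a = a.arg.a;
  	return 0;
  }

  char _license[] SEC("license") = "GPL";
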
Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20250919044110.23729-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tracing_struct.c | 29 ++++++++++++++++ .../selftests/bpf/progs/tracing_struct.c | 33 +++++++++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod.c | 31 +++++++++++++++++ 3 files changed, 93 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_struct.c b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c index 19e68d4b353278..6f8c0bfb041559 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_struct.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c @@ -112,10 +112,39 @@ static void test_struct_many_args(void) tracing_struct_many_args__destroy(skel); } +static void test_union_args(void) +{ + struct tracing_struct *skel; + int err; + + skel = tracing_struct__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_struct__open_and_load")) + return; + + err = tracing_struct__attach(skel); + if (!ASSERT_OK(err, "tracing_struct__attach")) + goto out; + + ASSERT_OK(trigger_module_test_read(256), "trigger_read"); + + ASSERT_EQ(skel->bss->ut1_a_a, 1, "ut1:a.arg.a"); + ASSERT_EQ(skel->bss->ut1_b, 4, "ut1:b"); + ASSERT_EQ(skel->bss->ut1_c, 5, "ut1:c"); + + ASSERT_EQ(skel->bss->ut2_a, 6, "ut2:a"); + ASSERT_EQ(skel->bss->ut2_b_a, 2, "ut2:b.arg.a"); + ASSERT_EQ(skel->bss->ut2_b_b, 3, "ut2:b.arg.b"); + +out: + tracing_struct__destroy(skel); +} + void test_tracing_struct(void) { if (test__start_subtest("struct_args")) test_struct_args(); if (test__start_subtest("struct_many_args")) test_struct_many_args(); + if (test__start_subtest("union_args")) + test_union_args(); } diff --git a/tools/testing/selftests/bpf/progs/tracing_struct.c b/tools/testing/selftests/bpf/progs/tracing_struct.c index c435a3a8328ab1..d460732e202395 100644 --- a/tools/testing/selftests/bpf/progs/tracing_struct.c +++ b/tools/testing/selftests/bpf/progs/tracing_struct.c @@ -18,6 +18,18 @@ struct bpf_testmod_struct_arg_3 { int b[]; }; +union bpf_testmod_union_arg_1 { + char a; + short b; + struct bpf_testmod_struct_arg_1 arg; +}; + +union bpf_testmod_union_arg_2 { + int a; + long b; + struct bpf_testmod_struct_arg_2 arg; +}; + long t1_a_a, t1_a_b, t1_b, t1_c, t1_ret, t1_nregs; __u64 t1_reg0, t1_reg1, t1_reg2, t1_reg3; long t2_a, t2_b_a, t2_b_b, t2_c, t2_ret; @@ -26,6 +38,9 @@ long t4_a_a, t4_b, t4_c, t4_d, t4_e_a, t4_e_b, t4_ret; long t5_ret; int t6; +long ut1_a_a, ut1_b, ut1_c; +long ut2_a, ut2_b_a, ut2_b_b; + SEC("fentry/bpf_testmod_test_struct_arg_1") int BPF_PROG2(test_struct_arg_1, struct bpf_testmod_struct_arg_2, a, int, b, int, c) { @@ -130,4 +145,22 @@ int BPF_PROG2(test_struct_arg_11, struct bpf_testmod_struct_arg_3 *, a) return 0; } +SEC("fexit/bpf_testmod_test_union_arg_1") +int BPF_PROG2(test_union_arg_1, union bpf_testmod_union_arg_1, a, int, b, int, c) +{ + ut1_a_a = a.arg.a; + ut1_b = b; + ut1_c = c; + return 0; +} + +SEC("fexit/bpf_testmod_test_union_arg_2") +int BPF_PROG2(test_union_arg_2, int, a, union bpf_testmod_union_arg_2, b) +{ + ut2_a = a; + ut2_b_a = b.arg.a; + ut2_b_b = b.arg.b; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index d6ce51df9ed4a2..6df6475f5dbc2c 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -62,6 +62,18 @@ struct bpf_testmod_struct_arg_5 { long d; }; +union 
bpf_testmod_union_arg_1 {
+	char a;
+	short b;
+	struct bpf_testmod_struct_arg_1 arg;
+};
+
+union bpf_testmod_union_arg_2 {
+	int a;
+	long b;
+	struct bpf_testmod_struct_arg_2 arg;
+};
+
 __bpf_hook_start();
 
 noinline int
@@ -128,6 +140,20 @@ bpf_testmod_test_struct_arg_9(u64 a, void *b, short c, int d, void *e, char f,
 	return bpf_testmod_test_struct_arg_result;
 }
 
+noinline int
+bpf_testmod_test_union_arg_1(union bpf_testmod_union_arg_1 a, int b, int c)
+{
+	bpf_testmod_test_struct_arg_result = a.arg.a + b + c;
+	return bpf_testmod_test_struct_arg_result;
+}
+
+noinline int
+bpf_testmod_test_union_arg_2(int a, union bpf_testmod_union_arg_2 b)
+{
+	bpf_testmod_test_struct_arg_result = a + b.arg.a + b.arg.b;
+	return bpf_testmod_test_struct_arg_result;
+}
+
 noinline int
 bpf_testmod_test_arg_ptr_to_struct(struct bpf_testmod_struct_arg_1 *a) {
 	bpf_testmod_test_struct_arg_result = a->a;
@@ -408,6 +434,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
 	struct bpf_testmod_struct_arg_3 *struct_arg3;
 	struct bpf_testmod_struct_arg_4 struct_arg4 = {21, 22};
 	struct bpf_testmod_struct_arg_5 struct_arg5 = {23, 24, 25, 26};
+	union bpf_testmod_union_arg_1 union_arg1 = { .arg = {1} };
+	union bpf_testmod_union_arg_2 union_arg2 = { .arg = {2, 3} };
 	int i = 1;
 
 	while (bpf_testmod_return_ptr(i))
@@ -425,6 +453,9 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
 	(void)bpf_testmod_test_struct_arg_9(16, (void *)17, 18, 19,
 					    (void *)20, 21, 22, struct_arg5, 27);
 
+	(void)bpf_testmod_test_union_arg_1(union_arg1, 4, 5);
+	(void)bpf_testmod_test_union_arg_2(6, union_arg2);
+
 	(void)bpf_testmod_test_arg_ptr_to_struct(&struct_arg1_2);
 
 	(void)trace_bpf_testmod_test_raw_tp_null_tp(NULL);

From 2f6aa0acdc5d09d1fee3049593c81affa9c58838 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula
Date: Tue, 23 Sep 2025 16:26:03 +0300
Subject: [PATCH 2724/3206] MAINTAINERS: Remove myself as Synopsys DesignWare
 I2C maintainer

My address is going to bounce soon and I won't have access to the Synopsys
datasheets, so I'm going to step down as a maintainer for this driver.

Signed-off-by: Jarkko Nikula
Signed-off-by: Wolfram Sang
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 967de62878b3dd..e2330f9a5d76f3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -24481,7 +24481,6 @@ F: Documentation/devicetree/bindings/media/snps,dw-hdmi-rx.yaml
 F:	drivers/media/platform/synopsys/hdmirx/*
 
 SYNOPSYS DESIGNWARE I2C DRIVER
-M:	Jarkko Nikula
 R:	Andy Shevchenko
 R:	Mika Westerberg
 R:	Jan Dabros

From dde9a38195b0c268ff55d3a8aa62faefd0e988c8 Mon Sep 17 00:00:00 2001
From: Claudiu Beznea
Date: Tue, 23 Sep 2025 17:18:26 +0300
Subject: [PATCH 2725/3206] i2c: riic: Allow setting frequencies lower than
 50KHz

The MR1.CKS field is 3 bits wide and all the possible values (from 0 to 7)
are valid. This is true for all the SoCs currently integrated in upstream
Linux. Take into account CKS=7, which allows setting bus frequencies lower
than 50KHz. This may be useful at least for debugging.
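
As a rough worked example (illustrative clock numbers, not taken from a
specific SoC datasheet): with a 50 MHz peripheral clock, a 10 kHz bus rate
needs total_ticks = 50 MHz / 10 kHz = 5000. The 60% low time must fit in
BRL + 2 + 1 <= 0x1F + 3 = 34 ticks of the divided clock, so the internal
clock has to be divided until 0.6 * total_ticks fits:

  5000 / 2^6 = 78  ->  0.6 * 78 ~= 47 > 34   (CKS = 6 is not enough)
  5000 / 2^7 = 39  ->  0.6 * 39 ~= 24 <= 34  (CKS = 7 works)

With the previous "cks < 7" loop bound the divider search stopped before
reaching CKS = 7, so such frequencies were rejected.
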
Fixes: d982d6651419 ("i2c: riic: remove clock and frequency restrictions")
Signed-off-by: Claudiu Beznea
Signed-off-by: Wolfram Sang
---
 drivers/i2c/busses/i2c-riic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/i2c/busses/i2c-riic.c b/drivers/i2c/busses/i2c-riic.c
index 9c164a4b9bb91c..b0ee9ac45a976d 100644
--- a/drivers/i2c/busses/i2c-riic.c
+++ b/drivers/i2c/busses/i2c-riic.c
@@ -386,7 +386,7 @@ static int riic_init_hw(struct riic_dev *riic)
 	 */
 	total_ticks = DIV_ROUND_UP(rate, t->bus_freq_hz ?: 1);
 
-	for (cks = 0; cks < 7; cks++) {
+	for (cks = 0; cks <= 7; cks++) {
 		/*
 		 * 60% low time must be less than BRL + 2 + 1
 		 * BRL max register value is 0x1F.

From 8d13f91d7fce4e842dfc7edbcba0690f7ed11c18 Mon Sep 17 00:00:00 2001
From: Yixun Lan
Date: Wed, 17 Sep 2025 11:38:13 +0800
Subject: [PATCH 2726/3206] dt-bindings: i2c: spacemit: extend and validate
 all properties

Extend the K1 I2C properties by including the generic i2c-controller
schema. This will enable it to do the DT validation check later.

Signed-off-by: Yixun Lan
Reviewed-by: Troy Mitchell
Tested-by: Alex Elder
Acked-by: Conor Dooley
Signed-off-by: Wolfram Sang
---
 Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml b/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml
index 3d6aefb0d0f185..226c600deae142 100644
--- a/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml
+++ b/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml
@@ -9,6 +9,9 @@ title: I2C controller embedded in SpacemiT's K1 SoC
 maintainers:
   - Troy Mitchell
 
+allOf:
+  - $ref: /schemas/i2c/i2c-controller.yaml#
+
 properties:
   compatible:
     const: spacemit,k1-i2c

From 0d3bf643b41bc339a02562a4aa382542d046bd0a Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Tue, 23 Sep 2025 11:38:02 +0100
Subject: [PATCH 2727/3206] bpftool: Add bash completion for program signing
 options

Commit 40863f4d6ef2 ("bpftool: Add support for signing BPF programs") added
new options for "bpftool prog load" and "bpftool gen skeleton". This commit
brings the relevant update to the bash completion file.

We slightly rework the processing of options to make completion more
resilient for options that take an argument.

Signed-off-by: Quentin Monnet
Link: https://lore.kernel.org/r/20250923103802.57695-1-qmo@kernel.org
Signed-off-by: Alexei Starovoitov
---
 tools/bpf/bpftool/bash-completion/bpftool | 26 +++++++++++++++--------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 527bb47ac4629d..53bcfeb1a76e62 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -262,7 +262,7 @@ _bpftool()
     # Deal with options
     if [[ ${words[cword]} == -* ]]; then
         local c='--version --json --pretty --bpffs --mapcompat --debug \
-            --use-loader --base-btf'
+            --use-loader --base-btf --sign -i -k'
         COMPREPLY=( $( compgen -W "$c" -- "$cur" ) )
         return 0
     fi
@@ -283,7 +283,7 @@ _bpftool()
             _sysfs_get_netdevs
             return 0
             ;;
-        file|pinned|-B|--base-btf)
+        file|pinned|-B|--base-btf|-i|-k)
             _filedir
             return 0
             ;;
@@ -296,13 +296,21 @@ _bpftool()
     # Remove all options so completions don't have to deal with them.
local i pprev for (( i=1; i < ${#words[@]}; )); do - if [[ ${words[i]::1} == - ]] && - [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then - words=( "${words[@]:0:i}" "${words[@]:i+1}" ) - [[ $i -le $cword ]] && cword=$(( cword - 1 )) - else - i=$(( ++i )) - fi + case ${words[i]} in + # Remove option and its argument + -B|--base-btf|-i|-k) + words=( "${words[@]:0:i}" "${words[@]:i+2}" ) + [[ $i -le $(($cword + 1)) ]] && cword=$(( cword - 2 )) + ;; + # No argument, remove option only + -*) + words=( "${words[@]:0:i}" "${words[@]:i+1}" ) + [[ $i -le $cword ]] && cword=$(( cword - 1 )) + ;; + *) + i=$(( ++i )) + ;; + esac done cur=${words[cword]} prev=${words[cword - 1]} From d0bf7cd5df18466d969bb60e8890b74cf96081ca Mon Sep 17 00:00:00 2001 From: Chenghao Duan Date: Mon, 22 Sep 2025 14:22:44 +0800 Subject: [PATCH 2728/3206] riscv: bpf: Fix uninitialized symbol 'retval_off' In the __arch_prepare_bpf_trampoline() function, retval_off is only meaningful when save_ret is true, so the current logic is correct. However, in the original logic, retval_off is only initialized under certain conditions; for example, in the fmod_ret logic, the compiler is not aware that the flags of the fmod_ret program (prog) have set BPF_TRAMP_F_CALL_ORIG, which results in an uninitialized symbol compilation warning. So initialize retval_off unconditionally to fix it. Signed-off-by: Chenghao Duan Reviewed-by: Pu Lehui Link: https://lore.kernel.org/r/20250922062244.822937-2-duanchenghao@kylinos.cn Signed-off-by: Alexei Starovoitov --- arch/riscv/net/bpf_jit_comp64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 83672373d02640..d0d2768d100d6d 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1061,10 +1061,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, stack_size += 16; save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); - if (save_ret) { + if (save_ret) stack_size += 16; /* Save both A5 (BPF R0) and A0 */ - retval_off = stack_size; - } + retval_off = stack_size; stack_size += nr_arg_slots * 8; args_off = stack_size; From 8ab3bd59f9fc210a1a88b509f6931cf3f5ea229a Mon Sep 17 00:00:00 2001 From: Ivaylo Ivanov Date: Sun, 14 Sep 2025 16:16:20 +0300 Subject: [PATCH 2729/3206] dt-bindings: i2c: exynos5: add samsung,exynos8890-hsi2c compatible Add samsung,exynos8890-hsi2c compatible, reusing the 8895 support since it's compatible with exynos8890's i2c controllers. 
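
A board or SoC dts can then pick up the new string with the 8895 fallback,
along these lines (hypothetical node name and unit address, shown only to
illustrate the compatible fallback):

  hsi2c_0: i2c@13640000 {
  	compatible = "samsung,exynos8890-hsi2c",
  		     "samsung,exynos8895-hsi2c";
  	reg = <0x13640000 0x1000>;
  	#address-cells = <1>;
  	#size-cells = <0>;
  };
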
Signed-off-by: Ivaylo Ivanov
Acked-by: Rob Herring (Arm)
Signed-off-by: Wolfram Sang
---
 Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml b/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml
index 6fc03281a8f844..32269239bae467 100644
--- a/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml
+++ b/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml
@@ -33,6 +33,10 @@ properties:
           - samsung,exynos7870-hsi2c
           - tesla,fsd-hsi2c
       - const: samsung,exynos7-hsi2c
+  - items:
+      - enum:
+          - samsung,exynos8890-hsi2c
+      - const: samsung,exynos8895-hsi2c
   - items:
       - enum:
           - google,gs101-hsi2c

From 958e55f90a01451de77e22063d37b60388219b96 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski
Date: Thu, 21 Aug 2025 10:32:06 +0200
Subject: [PATCH 2730/3206] dt-bindings: i2c: spacemit,k1-i2c: Minor
 whitespace cleanup in example

The DTS coding style expects exactly one space around the '=' character.

Signed-off-by: Krzysztof Kozlowski
Reviewed-by: Troy Mitchell
Acked-by: Rob Herring (Arm)
Signed-off-by: Wolfram Sang
---
 Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml b/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml
index 3d6aefb0d0f185..c1a3004df71d8d 100644
--- a/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml
+++ b/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml
@@ -53,7 +53,7 @@ examples:
         reg = <0xd4010800 0x38>;
         interrupt-parent = <&plic>;
         interrupts = <36>;
-        clocks =<&ccu 32>, <&ccu 84>;
+        clocks = <&ccu 32>, <&ccu 84>;
         clock-names = "func", "bus";
         clock-frequency = <100000>;
     };

From 8f12d1137c2382c80aada8e05d7cc650cd4e403c Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Mon, 22 Sep 2025 16:33:49 -0700
Subject: [PATCH 2731/3206] bpf: Clear pfmemalloc flag when freeing all
 fragments

It is possible for bpf_xdp_adjust_tail() to free all fragments. The kfunc
currently clears the XDP_FLAGS_HAS_FRAGS bit, but not
XDP_FLAGS_FRAGS_PF_MEMALLOC. So far, this has not caused an issue when
building an sk_buff from an xdp_buff since all readers of xdp_buff->flags
use the flag only when there are fragments.

Clear the XDP_FLAGS_FRAGS_PF_MEMALLOC bit as well to make the flags
correct.
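
For context, this is the kind of reader the paragraph above refers to; a
driver converting an xdp_buff into an sk_buff typically propagates the flag
roughly like this (sketch only, not a specific driver; the truesize
computation is a placeholder):

  static void example_finish_skb(struct xdp_buff *xdp, struct sk_buff *skb)
  {
  	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);

  	/* Today's readers consult the pfmemalloc flag only when frags are
  	 * present, which is why a stale XDP_FLAGS_FRAGS_PF_MEMALLOC bit
  	 * went unnoticed once all frags had been freed. */
  	if (unlikely(xdp_buff_has_frags(xdp)))
  		xdp_update_skb_shared_info(skb, sinfo->nr_frags,
  					   sinfo->xdp_frags_size,
  					   sinfo->nr_frags * PAGE_SIZE,
  					   xdp_buff_is_frag_pfmemalloc(xdp));
  }
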
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Reviewed-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250922233356.3356453-2-ameryhung@gmail.com --- include/net/xdp.h | 5 +++++ net/core/filter.c | 1 + 2 files changed, 6 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index b40f1f96cb1177..f288c348a6c132 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -115,6 +115,11 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } +static __always_inline void xdp_buff_clear_frag_pfmemalloc(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_FRAGS_PF_MEMALLOC; +} + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { diff --git a/net/core/filter.c b/net/core/filter.c index 63f3baee2dafa7..5837534f4352bf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4210,6 +4210,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); xdp->data_end -= offset; } From dea1526fbafb55099a788cde0b659530ee5b1c66 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:50 -0700 Subject: [PATCH 2732/3206] bpf: Allow bpf_xdp_shrink_data to shrink a frag from head and tail Move skb_frag_t adjustment into bpf_xdp_shrink_data() and extend its functionality to be able to shrink an xdp fragment from both head and tail. In a later patch, bpf_xdp_pull_data() will reuse it to shrink an xdp fragment from head. Additionally, in bpf_xdp_frags_shrink_tail(), breaking the loop when bpf_xdp_shrink_data() returns false (i.e., not releasing the current fragment) is not necessary as the loop condition, offset > 0, has the same effect. Remove the else branch to simplify the code. 
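
Condensed, the new non-release path differs only in which end of the
fragment moves (a restatement of the logic below, not the exact kernel
code):

  static void shrink_frag(skb_frag_t *frag, int shrink, bool tail)
  {
  	if (!tail)
  		skb_frag_off_add(frag, shrink);	/* drop leading bytes */
  	skb_frag_size_sub(frag, shrink);	/* size shrinks either way */
  }
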
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Reviewed-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250922233356.3356453-3-ameryhung@gmail.com --- include/net/xdp_sock_drv.h | 21 ++++++++++++++++--- net/core/filter.c | 41 ++++++++++++++++++++++---------------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 513c8e9704f657..4f2d3268a6769d 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -160,13 +160,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return ret; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) { - struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); list_del(&xskb->list_node); } +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_first_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + list_node); + return &frag->xdp; +} + static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); @@ -389,8 +399,13 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return NULL; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) +{ +} + +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) { + return NULL; } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) diff --git a/net/core/filter.c b/net/core/filter.c index 5837534f4352bf..8cae575ad43746 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4153,34 +4153,45 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) return 0; } -static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - enum xdp_mem_type mem_type, bool release) +static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, + bool tail, bool release) { - struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); + struct xdp_buff *zc_frag = tail ? 
xsk_buff_get_tail(xdp) : + xsk_buff_get_head(xdp); if (release) { - xsk_buff_del_tail(zc_frag); - __xdp_return(0, mem_type, false, zc_frag); + xsk_buff_del_frag(zc_frag); } else { - zc_frag->data_end -= shrink; + if (tail) + zc_frag->data_end -= shrink; + else + zc_frag->data += shrink; } + + return zc_frag; } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, - int shrink) + int shrink, bool tail) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; + netmem_ref netmem = skb_frag_netmem(frag); + struct xdp_buff *zc_frag = NULL; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); - goto out; + netmem = 0; + zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); } - if (release) - __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); + if (release) { + __xdp_return(netmem, mem_type, false, zc_frag); + } else { + if (!tail) + skb_frag_off_add(frag, shrink); + skb_frag_size_sub(frag, shrink); + } -out: return release; } @@ -4198,12 +4209,8 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) len_free += shrink; offset -= shrink; - if (bpf_xdp_shrink_data(xdp, frag, shrink)) { + if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) n_frags_free++; - } else { - skb_frag_size_sub(frag, shrink); - break; - } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; From 4dce1a0d7cf39575a5880414ea882890edd8d26f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:51 -0700 Subject: [PATCH 2733/3206] bpf: Support pulling non-linear xdp data Add kfunc, bpf_xdp_pull_data(), to support pulling data from xdp fragments. Similar to bpf_skb_pull_data(), bpf_xdp_pull_data() makes the first len bytes of data directly readable and writable in bpf programs. If the "len" argument is larger than the linear data size, data in fragments will be copied to the linear data area when there is enough room. Specifically, the kfunc will try to use the tailroom first. When the tailroom is not enough, metadata and data will be shifted down to make room for pulling data. A use case of the kfunc is to decapsulate headers residing in xdp fragments. It is possible for a NIC driver to place headers in xdp fragments. To keep using direct packet access for parsing and decapsulating headers, users can pull headers into the linear data area by calling bpf_xdp_pull_data() and then pop the header with bpf_xdp_adjust_head(). Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20250922233356.3356453-4-ameryhung@gmail.com --- net/core/filter.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 8cae575ad43746..6c8a075a3016a2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12214,6 +12214,98 @@ __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, return 0; } +/** + * bpf_xdp_pull_data() - Pull in non-linear xdp data. + * @x: &xdp_md associated with the XDP buffer + * @len: length of data to be made directly accessible in the linear part + * + * Pull in data in case the XDP buffer associated with @x is non-linear and + * not all @len are in the linear data area. + * + * Direct packet access allows reading and writing linear XDP data through + * packet pointers (i.e., &xdp_md->data + offsets). 
The amount of data which + * ends up in the linear part of the xdp_buff depends on the NIC and its + * configuration. When a frag-capable XDP program wants to directly access + * headers that may be in the non-linear area, call this kfunc to make sure + * the data is available in the linear area. Alternatively, use dynptr or + * bpf_xdp_{load,store}_bytes() to access data without pulling. + * + * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate + * headers in the non-linear data area. + * + * A call to this kfunc may reduce headroom. If there is not enough tailroom + * in the linear data area, metadata and data will be shifted down. + * + * A call to this kfunc is susceptible to change the buffer geometry. + * Therefore, at load time, all checks on pointers previously done by the + * verifier are invalidated and must be performed again, if the kfunc is used + * in combination with direct packet access. + * + * Return: + * * %0 - success + * * %-EINVAL - invalid len + */ +__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)x; + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, delta, shift, headroom, tailroom, n_frags_free = 0; + void *data_hard_end = xdp_data_hard_end(xdp); + int data_len = xdp->data_end - xdp->data; + void *start; + + if (len <= data_len) + return 0; + + if (unlikely(len > xdp_get_buff_len(xdp))) + return -EINVAL; + + start = xdp_data_meta_unsupported(xdp) ? xdp->data : xdp->data_meta; + + headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame); + tailroom = data_hard_end - xdp->data_end; + + delta = len - data_len; + if (unlikely(delta > tailroom + headroom)) + return -EINVAL; + + shift = delta - tailroom; + if (shift > 0) { + memmove(start - shift, start, xdp->data_end - start); + + xdp->data_meta -= shift; + xdp->data -= shift; + xdp->data_end -= shift; + } + + for (i = 0; i < sinfo->nr_frags && delta; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + u32 shrink = min_t(u32, delta, skb_frag_size(frag)); + + memcpy(xdp->data_end, skb_frag_address(frag), shrink); + + xdp->data_end += shrink; + sinfo->xdp_frags_size -= shrink; + delta -= shrink; + if (bpf_xdp_shrink_data(xdp, frag, shrink, false)) + n_frags_free++; + } + + if (unlikely(n_frags_free)) { + memmove(sinfo->frags, sinfo->frags + n_frags_free, + (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t)); + + sinfo->nr_frags -= n_frags_free; + + if (!sinfo->nr_frags) { + xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); + } + } + + return 0; +} + __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, @@ -12241,6 +12333,7 @@ BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) +BTF_ID_FLAGS(func, bpf_xdp_pull_data) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) From 0e7a733ab3d7be8d745e8ee38d637ea7a9b24343 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:52 -0700 Subject: [PATCH 2734/3206] bpf: Clear packet pointers after changing packet data in kfuncs bpf_xdp_pull_data() may change packet data and therefore packet pointers need to be invalidated. Add bpf_xdp_pull_data() to the special kfunc list instead of introducing a new KF_ flag until there are more kfuncs changing packet data. 
Signed-off-by: Amery Hung
Signed-off-by: Martin KaFai Lau
Link: https://patch.msgid.link/20250922233356.3356453-5-ameryhung@gmail.com
---
 kernel/bpf/verifier.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5964bed40ffbf0..a1dd3c31a09e63 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12235,6 +12235,7 @@ enum special_kfunc_type {
 	KF_bpf_dynptr_from_skb,
 	KF_bpf_dynptr_from_xdp,
 	KF_bpf_dynptr_from_skb_meta,
+	KF_bpf_xdp_pull_data,
 	KF_bpf_dynptr_slice,
 	KF_bpf_dynptr_slice_rdwr,
 	KF_bpf_dynptr_clone,
@@ -12285,10 +12286,12 @@ BTF_ID(func, bpf_rbtree_right)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_ID(func, bpf_dynptr_from_skb_meta)
+BTF_ID(func, bpf_xdp_pull_data)
 #else
 BTF_ID_UNUSED
 BTF_ID_UNUSED
 BTF_ID_UNUSED
+BTF_ID_UNUSED
 #endif
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
@@ -12358,6 +12361,11 @@ static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
 	return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
 }
 
+static bool is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data];
+}
+
 static enum kfunc_ptr_arg_type
 get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 		       struct bpf_kfunc_call_arg_meta *meta,
@@ -14077,6 +14085,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
+	if (is_kfunc_pkt_changing(&meta))
+		clear_all_pkt_pointers(env);
+
 	nargs = btf_type_vlen(meta.func_proto);
 	args = (const struct btf_param *)(meta.func_proto + 1);
 	for (i = 0; i < nargs; i++) {
@@ -17794,6 +17805,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 		 */
 		if (ret == 0 && is_kfunc_sleepable(&meta))
 			mark_subprog_might_sleep(env, t);
+		if (ret == 0 && is_kfunc_pkt_changing(&meta))
+			mark_subprog_changes_pkt_data(env, t);
 	}
 	return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);

From 7eb83bff02ad5e82e8c456c58717ef181c220870 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Mon, 22 Sep 2025 16:33:53 -0700
Subject: [PATCH 2735/3206] bpf: Make variables in bpf_prog_test_run_xdp less
 confusing

Change the variable naming in bpf_prog_test_run_xdp() to make the overall
logic less confusing. As different modes were added to the function over
time, some variables got overloaded, making the code hard to understand and
error-prone to change.

Replace "size" with "linear_sz" where it refers to the size of metadata and
data. If "size" refers to the input data size, use test.data_size_in
directly. Replace "max_data_sz" with "max_linear_sz" to better reflect the
fact that it is the maximum size of metadata and data (i.e., linear_sz).

Also, xdp_rxq.frag_size is always PAGE_SIZE, so just set it directly
instead of subtracting headroom and tailroom and adding them back.
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250922233356.3356453-6-ameryhung@gmail.com --- net/bpf/test_run.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 9728dbd4c66c51..82af47d8c12388 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1207,9 +1207,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, { bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 retval = 0, duration, max_linear_sz, size; + u32 linear_sz = kattr->test.data_size_in; u32 batch_size = kattr->test.batch_size; - u32 retval = 0, duration, max_data_sz; - u32 size = kattr->test.data_size_in; u32 headroom = XDP_PACKET_HEADROOM; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; @@ -1246,7 +1246,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (ctx) { /* There can't be user provided data before the meta data */ - if (ctx->data_meta || ctx->data_end != size || + if (ctx->data_meta || ctx->data_end != kattr->test.data_size_in || ctx->data > ctx->data_end || unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) @@ -1255,30 +1255,30 @@ headroom -= ctx->data; } - max_data_sz = PAGE_SIZE - headroom - tailroom; - if (size > max_data_sz) { - /* disallow live data mode for jumbo frames */ - if (do_live) - goto free_ctx; - size = max_data_sz; - } + max_linear_sz = PAGE_SIZE - headroom - tailroom; + linear_sz = min_t(u32, linear_sz, max_linear_sz); + + /* disallow live data mode for jumbo frames */ + if (do_live && kattr->test.data_size_in > linear_sz) + goto free_ctx; - data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom); + data = bpf_test_init(kattr, linear_sz, max_linear_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); goto free_ctx; } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); - rxqueue->xdp_rxq.frag_size = headroom + max_data_sz + tailroom; + rxqueue->xdp_rxq.frag_size = PAGE_SIZE; xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq); - xdp_prepare_buff(&xdp, data, headroom, size, true); + xdp_prepare_buff(&xdp, data, headroom, linear_sz, true); sinfo = xdp_get_shared_info_from_buff(&xdp); ret = xdp_convert_md_to_buff(ctx, &xdp); if (ret) goto free_data; + size = linear_sz; if (unlikely(kattr->test.data_size_in > size)) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); From fe9544ed1a2e9217b2c5285c3a4ac0dc5a38bd7b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:54 -0700 Subject: [PATCH 2736/3206] bpf: Support specifying linear xdp packet data size for BPF_PROG_TEST_RUN To test bpf_xdp_pull_data(), an xdp packet containing fragments as well as a free linear data area after xdp->data_end needs to be created. However, bpf_prog_test_run_xdp() always fills the linear area with data_in before creating fragments, leaving no space to pull data. This patch will allow users to specify the linear data size through ctx->data_end. Currently, ctx_in->data_end must match data_size_in and will not be the final ctx->data_end seen by xdp programs. This is because ctx->data_end is populated according to the xdp_buff passed to test_run.
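For a concrete picture of the intended usage, a hedged userspace sketch (mirroring the selftest added later in this series; buf and prog_fd are assumed to exist) that requests a 1024-byte linear area for a 9000-byte input, so the remaining bytes land in fragments:

LIBBPF_OPTS(bpf_test_run_opts, topts);
struct xdp_md ctx = {};
int err;

ctx.data = 0;              /* no metadata */
ctx.data_end = 1024;       /* requested linear data size */
topts.data_in = buf;
topts.data_size_in = 9000; /* bytes beyond 1024 are copied into fragments */
topts.ctx_in = &ctx;
topts.ctx_size_in = sizeof(ctx);
err = bpf_prog_test_run_opts(prog_fd, &topts);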
The linear data area available in an xdp_buff, max_linear_sz, is always filled up before copying data_in into fragments. This patch will allow users to specify the size of data that goes into the linear area. When ctx_in->data_end is different from data_size_in, only ctx_in->data_end bytes of data will be put into the linear area when creating the xdp_buff. While ctx_in->data_end will be allowed to be different from data_size_in, it cannot be larger than data_size_in, as there will be no data to copy from user space. If it is larger than the maximum linear data area size, the layout suggested by the user will not be honored. Data beyond max_linear_sz bytes will still be copied into fragments. Finally, since it is possible for a NIC to produce an xdp_buff with an empty linear data area, allow it when calling bpf_test_init() from bpf_prog_test_run_xdp() so that we can test XDP kfuncs with such an xdp_buff. This is done by moving the lower-bound check to the callers, as most of them already do, except bpf_prog_test_run_skb(). The change also fixes a bug that allows passing an xdp_buff with data < ETH_HLEN. This can happen when ctx is used and metadata is at least ETH_HLEN. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250922233356.3356453-7-ameryhung@gmail.com --- net/bpf/test_run.c | 15 ++++++++++++--- .../bpf/prog_tests/xdp_context_test_run.c | 4 +--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 82af47d8c12388..3df3fe46beb3ca 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -665,7 +665,7 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; - if (user_size < ETH_HLEN || user_size > PAGE_SIZE - headroom - tailroom) + if (user_size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); size = SKB_DATA_ALIGN(size); @@ -1001,6 +1001,9 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, kattr->test.cpu || kattr->test.batch_size) return -EINVAL; + if (size < ETH_HLEN) + return -EINVAL; + data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); @@ -1207,7 +1210,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, { bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - u32 retval = 0, duration, max_linear_sz, size; + u32 retval = 0, meta_sz = 0, duration, max_linear_sz, size; u32 linear_sz = kattr->test.data_size_in; u32 batch_size = kattr->test.batch_size; u32 headroom = XDP_PACKET_HEADROOM; @@ -1246,13 +1249,16 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (ctx) { /* There can't be user provided data before the meta data */ - if (ctx->data_meta || ctx->data_end != kattr->test.data_size_in || + if (ctx->data_meta || ctx->data_end > kattr->test.data_size_in || ctx->data > ctx->data_end || unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto free_ctx; /* Meta data is allocated from the headroom */ headroom -= ctx->data; + + meta_sz = ctx->data; + linear_sz = ctx->data_end; } max_linear_sz = PAGE_SIZE - headroom - tailroom; @@ -1262,6 +1268,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (do_live && kattr->test.data_size_in > linear_sz) goto free_ctx; + if
(kattr->test.data_size_in - meta_sz < ETH_HLEN) + return -EINVAL; + data = bpf_test_init(kattr, linear_sz, max_linear_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 46e0730174ed66..178292d1251a10 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -97,9 +97,7 @@ void test_xdp_context_test_run(void) /* Meta data must be 255 bytes or smaller */ test_xdp_context_error(prog_fd, opts, 0, 256, sizeof(data), 0, 0, 0); - /* Total size of data must match data_end - data_meta */ - test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), - sizeof(data) - 1, 0, 0, 0); + /* Total size of data must be data_end - data_meta or larger */ test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data) + 1, 0, 0, 0); From 323302f54db92dc1c80ff5b114c20f19ec0adf81 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:55 -0700 Subject: [PATCH 2737/3206] selftests/bpf: Test bpf_xdp_pull_data Test bpf_xdp_pull_data() with xdp packets of different layouts. The xdp bpf program first checks if the layout is as expected. Then, it calls bpf_xdp_pull_data(). Finally, it checks the 0xbb marker at offset 1024 using direct packet access. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250922233356.3356453-8-ameryhung@gmail.com --- .../selftests/bpf/prog_tests/xdp_pull_data.c | 179 ++++++++++++++++++ .../selftests/bpf/progs/test_xdp_pull_data.c | 48 +++++ 2 files changed, 227 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_pull_data.c diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c new file mode 100644 index 00000000000000..efa350d04ec5ff --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "test_xdp_pull_data.skel.h" + +#define PULL_MAX (1 << 31) +#define PULL_PLUS_ONE (1 << 30) + +#define XDP_PACKET_HEADROOM 256 + +/* Find headroom and tailroom occupied by struct xdp_frame and struct + * skb_shared_info so that we can calculate the maximum pull lengths for + * test cases. They might not be the real size of the structures due to + * cache alignment.
+ */ +static int find_xdp_sizes(struct test_xdp_pull_data *skel, int frame_sz) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct xdp_md ctx = {}; + int prog_fd, err; + __u8 *buf; + + buf = calloc(frame_sz, sizeof(__u8)); + if (!ASSERT_OK_PTR(buf, "calloc buf")) + return -ENOMEM; + + topts.data_in = buf; + topts.data_out = buf; + topts.data_size_in = frame_sz; + topts.data_size_out = frame_sz; + /* Pass a data_end larger than the linear space available to make sure + * bpf_prog_test_run_xdp() will fill the linear data area so that + * xdp_find_sizes can infer the size of struct skb_shared_info + */ + ctx.data_end = frame_sz; + topts.ctx_in = &ctx; + topts.ctx_out = &ctx; + topts.ctx_size_in = sizeof(ctx); + topts.ctx_size_out = sizeof(ctx); + + prog_fd = bpf_program__fd(skel->progs.xdp_find_sizes); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + free(buf); + + return err; +} + +/* xdp_pull_data_prog will directly read a marker 0xbb stored at buf[1024] + * so caller expecting XDP_PASS should always pass pull_len no less than 1024 + */ +static void run_test(struct test_xdp_pull_data *skel, int retval, + int frame_sz, int buff_len, int meta_len, int data_len, + int pull_len) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct xdp_md ctx = {}; + int prog_fd, err; + __u8 *buf; + + buf = calloc(buff_len, sizeof(__u8)); + if (!ASSERT_OK_PTR(buf, "calloc buf")) + return; + + buf[meta_len + 1023] = 0xaa; + buf[meta_len + 1024] = 0xbb; + buf[meta_len + 1025] = 0xcc; + + topts.data_in = buf; + topts.data_out = buf; + topts.data_size_in = buff_len; + topts.data_size_out = buff_len; + ctx.data = meta_len; + ctx.data_end = meta_len + data_len; + topts.ctx_in = &ctx; + topts.ctx_out = &ctx; + topts.ctx_size_in = sizeof(ctx); + topts.ctx_size_out = sizeof(ctx); + + skel->bss->data_len = data_len; + if (pull_len & PULL_MAX) { + int headroom = XDP_PACKET_HEADROOM - meta_len - skel->bss->xdpf_sz; + int tailroom = frame_sz - XDP_PACKET_HEADROOM - + data_len - skel->bss->sinfo_sz; + + pull_len = pull_len & PULL_PLUS_ONE ? 
1 : 0; + pull_len += headroom + tailroom + data_len; + } + skel->bss->pull_len = pull_len; + + prog_fd = bpf_program__fd(skel->progs.xdp_pull_data_prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_EQ(topts.retval, retval, "xdp_pull_data_prog retval"); + + if (retval == XDP_DROP) + goto out; + + ASSERT_EQ(ctx.data_end, meta_len + pull_len, "linear data size"); + ASSERT_EQ(topts.data_size_out, buff_len, "linear + non-linear data size"); + /* Make sure data around xdp->data_end was not messed up by + * bpf_xdp_pull_data() + */ + ASSERT_EQ(buf[meta_len + 1023], 0xaa, "data[1023]"); + ASSERT_EQ(buf[meta_len + 1024], 0xbb, "data[1024]"); + ASSERT_EQ(buf[meta_len + 1025], 0xcc, "data[1025]"); +out: + free(buf); +} + +static void test_xdp_pull_data_basic(void) +{ + u32 pg_sz, max_meta_len, max_data_len; + struct test_xdp_pull_data *skel; + + skel = test_xdp_pull_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_xdp_pull_data__open_and_load")) + return; + + pg_sz = sysconf(_SC_PAGE_SIZE); + + if (find_xdp_sizes(skel, pg_sz)) + goto out; + + max_meta_len = XDP_PACKET_HEADROOM - skel->bss->xdpf_sz; + max_data_len = pg_sz - XDP_PACKET_HEADROOM - skel->bss->sinfo_sz; + + /* linear xdp pkt, pull 0 byte */ + run_test(skel, XDP_PASS, pg_sz, 2048, 0, 2048, 2048); + + /* multi-buf pkt, pull results in linear xdp pkt */ + run_test(skel, XDP_PASS, pg_sz, 2048, 0, 1024, 2048); + + /* multi-buf pkt, pull 1 byte to linear data area */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1024, 1025); + + /* multi-buf pkt, pull 0 byte to linear data area */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1025, 1025); + + /* multi-buf pkt, empty linear data area, pull requires memmove */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, 0, PULL_MAX); + + /* multi-buf pkt, no headroom */ + run_test(skel, XDP_PASS, pg_sz, 9000, max_meta_len, 1024, PULL_MAX); + + /* multi-buf pkt, no tailroom, pull requires memmove */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, max_data_len, PULL_MAX); + + /* Test cases with invalid pull length */ + + /* linear xdp pkt, pull more than total data len */ + run_test(skel, XDP_DROP, pg_sz, 2048, 0, 2048, 2049); + + /* multi-buf pkt with no space left in linear data area */ + run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, max_data_len, + PULL_MAX | PULL_PLUS_ONE); + + /* multi-buf pkt, empty linear data area */ + run_test(skel, XDP_DROP, pg_sz, 9000, 0, 0, PULL_MAX | PULL_PLUS_ONE); + + /* multi-buf pkt, no headroom */ + run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, 1024, + PULL_MAX | PULL_PLUS_ONE); + + /* multi-buf pkt, no tailroom */ + run_test(skel, XDP_DROP, pg_sz, 9000, 0, max_data_len, + PULL_MAX | PULL_PLUS_ONE); + +out: + test_xdp_pull_data__destroy(skel); +} + +void test_xdp_pull_data(void) +{ + if (test__start_subtest("xdp_pull_data")) + test_xdp_pull_data_basic(); +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_pull_data.c b/tools/testing/selftests/bpf/progs/test_xdp_pull_data.c new file mode 100644 index 00000000000000..c41a21413eaa21 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_pull_data.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include + +int xdpf_sz; +int sinfo_sz; +int data_len; +int pull_len; + +#define XDP_PACKET_HEADROOM 256 + +SEC("xdp.frags") +int xdp_find_sizes(struct xdp_md *ctx) +{ + xdpf_sz = sizeof(struct xdp_frame); + sinfo_sz = __PAGE_SIZE - XDP_PACKET_HEADROOM - + (ctx->data_end - ctx->data); + + return XDP_PASS; +} + +SEC("xdp.frags") +int 
xdp_pull_data_prog(struct xdp_md *ctx) +{ + __u8 *data_end = (void *)(long)ctx->data_end; + __u8 *data = (void *)(long)ctx->data; + __u8 *val_p; + int err; + + if (data_len != data_end - data) + return XDP_DROP; + + err = bpf_xdp_pull_data(ctx, pull_len); + if (err) + return XDP_DROP; + + val_p = (void *)(long)ctx->data + 1024; + if (val_p + 1 > (void *)(long)ctx->data_end) + return XDP_DROP; + + if (*val_p != 0xbb) + return XDP_DROP; + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; From a40282dd3c484e6c882e93f4680e0a3ef3814453 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 20 Sep 2025 16:45:23 -0700 Subject: [PATCH 2738/3206] gcc-plugins: Remove TODO_verify_il for GCC >= 16 GCC now runs TODO_verify_il automatically[1], so it is no longer exposed to plugins. Only use the flag on GCC < 16. Link: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=9739ae9384dd7cd3bb1c7683d6b80b7a9116eaf8 [1] Suggested-by: Christopher Fore Link: https://lore.kernel.org/r/20250920234519.work.915-kees@kernel.org Signed-off-by: Kees Cook --- scripts/gcc-plugins/gcc-common.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h index 6cb6d105181520..8f1b3500f8e2dc 100644 --- a/scripts/gcc-plugins/gcc-common.h +++ b/scripts/gcc-plugins/gcc-common.h @@ -173,10 +173,17 @@ static inline opt_pass *get_pass_for_id(int id) return g->get_passes()->get_pass_for_id(id); } +#if BUILDING_GCC_VERSION < 16000 #define TODO_verify_ssa TODO_verify_il #define TODO_verify_flow TODO_verify_il #define TODO_verify_stmts TODO_verify_il #define TODO_verify_rtl_sharing TODO_verify_il +#else +#define TODO_verify_ssa 0 +#define TODO_verify_flow 0 +#define TODO_verify_stmts 0 +#define TODO_verify_rtl_sharing 0 +#endif #define INSN_DELETED_P(insn) (insn)->deleted() From efec2e55bdefb889639a6e7fe1f1f2431cdddc6a Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:56 -0700 Subject: [PATCH 2739/3206] selftests: drv-net: Pull data before parsing headers It is possible for drivers to generate xdp packets with data residing entirely in fragments. To keep parsing headers using direct packet access, call bpf_xdp_pull_data() to pull headers into the linear data area. 
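A note on the __weak ksym declaration in the hunks below: it lets the object load on kernels that do not provide the kfunc. A hedged sketch of guarding such a call, assuming libbpf's bpf_ksym_exists() helper from bpf_helpers.h:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern int bpf_xdp_pull_data(struct xdp_md *xdp, __u32 len) __ksym __weak;

SEC("xdp.frags")
int guarded_pull(struct xdp_md *ctx)
{
	/* Skip the pull entirely on kernels lacking the kfunc. */
	if (!bpf_ksym_exists(bpf_xdp_pull_data))
		return XDP_PASS;

	if (bpf_xdp_pull_data(ctx, sizeof(struct ethhdr)))
		return XDP_PASS;

	return XDP_PASS;
}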
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250922233356.3356453-9-ameryhung@gmail.com --- .../selftests/net/lib/xdp_native.bpf.c | 89 +++++++++++++++---- 1 file changed, 74 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/net/lib/xdp_native.bpf.c b/tools/testing/selftests/net/lib/xdp_native.bpf.c index 521ba38f2dddad..df4eea5c192b3b 100644 --- a/tools/testing/selftests/net/lib/xdp_native.bpf.c +++ b/tools/testing/selftests/net/lib/xdp_native.bpf.c @@ -14,6 +14,8 @@ #define MAX_PAYLOAD_LEN 5000 #define MAX_HDR_LEN 64 +extern int bpf_xdp_pull_data(struct xdp_md *xdp, __u32 len) __ksym __weak; + enum { XDP_MODE = 0, XDP_PORT = 1, @@ -68,30 +70,57 @@ static void record_stats(struct xdp_md *ctx, __u32 stat_type) static struct udphdr *filter_udphdr(struct xdp_md *ctx, __u16 port) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; struct udphdr *udph = NULL; - struct ethhdr *eth = data; + void *data, *data_end; + struct ethhdr *eth; + int err; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth)); + if (err) + return NULL; + + data_end = (void *)(long)ctx->data_end; + data = eth = (void *)(long)ctx->data; if (data + sizeof(*eth) > data_end) return NULL; if (eth->h_proto == bpf_htons(ETH_P_IP)) { - struct iphdr *iph = data + sizeof(*eth); + struct iphdr *iph; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*iph) + + sizeof(*udph)); + if (err) + return NULL; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + iph = data + sizeof(*eth); if (iph + 1 > (struct iphdr *)data_end || iph->protocol != IPPROTO_UDP) return NULL; - udph = (void *)eth + sizeof(*iph) + sizeof(*eth); - } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6h = data + sizeof(*eth); + udph = data + sizeof(*iph) + sizeof(*eth); + } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { + struct ipv6hdr *ipv6h; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*ipv6h) + + sizeof(*udph)); + if (err) + return NULL; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + ipv6h = data + sizeof(*eth); if (ipv6h + 1 > (struct ipv6hdr *)data_end || ipv6h->nexthdr != IPPROTO_UDP) return NULL; - udph = (void *)eth + sizeof(*ipv6h) + sizeof(*eth); + udph = data + sizeof(*ipv6h) + sizeof(*eth); } else { return NULL; } @@ -145,17 +174,34 @@ static void swap_machdr(void *data) static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; struct udphdr *udph = NULL; - struct ethhdr *eth = data; + void *data, *data_end; + struct ethhdr *eth; + int err; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth)); + if (err) + return XDP_PASS; + + data_end = (void *)(long)ctx->data_end; + data = eth = (void *)(long)ctx->data; if (data + sizeof(*eth) > data_end) return XDP_PASS; if (eth->h_proto == bpf_htons(ETH_P_IP)) { - struct iphdr *iph = data + sizeof(*eth); - __be32 tmp_ip = iph->saddr; + struct iphdr *iph; + __be32 tmp_ip; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*iph) + + sizeof(*udph)); + if (err) + return XDP_PASS; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + iph = data + sizeof(*eth); if (iph + 1 > (struct iphdr *)data_end || iph->protocol != IPPROTO_UDP) @@ -169,8 +215,10 @@ static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) return XDP_PASS; record_stats(ctx, STATS_RX); + eth = data; swap_machdr((void *)eth); + tmp_ip = 
iph->saddr; iph->saddr = iph->daddr; iph->daddr = tmp_ip; @@ -178,9 +226,19 @@ static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) return XDP_TX; - } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6h = data + sizeof(*eth); + } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { struct in6_addr tmp_ipv6; + struct ipv6hdr *ipv6h; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*ipv6h) + + sizeof(*udph)); + if (err) + return XDP_PASS; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + ipv6h = data + sizeof(*eth); if (ipv6h + 1 > (struct ipv6hdr *)data_end || ipv6h->nexthdr != IPPROTO_UDP) @@ -194,6 +252,7 @@ static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) return XDP_PASS; record_stats(ctx, STATS_RX); + eth = data; swap_machdr((void *)eth); __builtin_memcpy(&tmp_ipv6, &ipv6h->saddr, sizeof(tmp_ipv6)); From da3a88e9656c17a34daf49c9acc6d85f73b4d3d9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:25:46 +0200 Subject: [PATCH 2740/3206] pinctrl: use more common syntax for compound literals The (typeof(foo)) construct is unusual in the kernel; use a more typical syntax by explicitly spelling out the type. Link: https://lore.kernel.org/all/20250909-gpio-mmio-gpio-conv-part4-v1-13-9f723dc3524a@linaro.org/ Suggested-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Signed-off-by: Linus Walleij --- drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c | 2 +- drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c | 2 +- drivers/pinctrl/nuvoton/pinctrl-wpcm450.c | 2 +- drivers/pinctrl/pinctrl-equilibrium.c | 2 +- drivers/pinctrl/stm32/pinctrl-stm32-hdp.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c index 7f45c2897c3f27..4e8b5e6d1e4d48 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c @@ -1836,7 +1836,7 @@ static int npcm7xx_gpio_of(struct npcm7xx_pinctrl *pctrl) if (!pctrl->gpio_bank[id].base) return -EINVAL; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DIN, diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c index 920dd207792596..fd4270a8fb734e 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c @@ -2329,7 +2329,7 @@ static int npcm8xx_gpio_fw(struct npcm8xx_pinctrl *pctrl) if (!pctrl->gpio_bank[id].base) return dev_err_probe(dev, -ENXIO, "fwnode_iomap id %d failed\n", id); - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DIN, diff --git a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c index 4dd8a3daa83e44..ef569525e9c6b0 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c +++ b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c @@ -1064,7 +1064,7 @@ static int wpcm450_gpio_register(struct platform_device *pdev, flags = BGPIOF_NO_OUTPUT; } - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = dat, diff --git a/drivers/pinctrl/pinctrl-equilibrium.c b/drivers/pinctrl/pinctrl-equilibrium.c index 7e655b0444b359..2d04829b29c997 100644 --- a/drivers/pinctrl/pinctrl-equilibrium.c +++ b/drivers/pinctrl/pinctrl-equilibrium.c @@ -241,7 +241,7 @@ static int
gpiolib_reg(struct eqbr_pinctrl_drv_data *drvdata) } raw_spin_lock_init(&gctrl->lock); - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = gctrl->bank->nr_pins / 8, .dat = gctrl->membase + GPIO_IN, diff --git a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c index a8a4c2eee837ad..22d9104499af67 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c @@ -642,7 +642,7 @@ static int stm32_hdp_probe(struct platform_device *pdev) hdp->gpio_chip.gc.can_sleep = true; hdp->gpio_chip.gc.names = stm32_hdp_pins_group; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = dev, .sz = 4, .dat = hdp->base + HDP_GPOVAL, From 649764145b70a0328fc020fe31fc80594761a707 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 23 Sep 2025 22:35:58 +0200 Subject: [PATCH 2741/3206] i3c: Remove superfluous FIXME I2C adapters can already change timeout and retry parameters via IOCTL. This allows for better tuning to workloads compared to per-adapter defaults. So, the FIXME is not needed. Signed-off-by: Wolfram Sang Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250923203557.18298-2-wsa+renesas@sang-engineering.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index acaba4d5369712..d946db75df7068 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -2564,8 +2564,6 @@ static int i3c_master_i2c_adapter_init(struct i3c_master_controller *master) adap->owner = master->dev.parent->driver->owner; adap->algo = &i3c_master_i2c_algo; strscpy(adap->name, dev_name(master->dev.parent), sizeof(adap->name)); - - /* FIXME: Should we allow i3c masters to override these values? */ adap->timeout = HZ; adap->retries = 3; From ca9f9cdc4de97d0221100b11224738416696163c Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 22 Sep 2025 15:19:57 -0400 Subject: [PATCH 2742/3206] net: allow alloc_skb_with_frags() to use MAX_SKB_FRAGS Currently, alloc_skb_with_frags() will only fill (MAX_SKB_FRAGS - 1) slots. I think it should use all MAX_SKB_FRAGS slots, as callers of alloc_skb_with_frags() will size their allocation of frags based on MAX_SKB_FRAGS. This issue was discovered via a test patch that sets 'order' to 0 in alloc_skb_with_frags(), which effectively tests/simulates high fragmentation. In this case sendmsg() on unix sockets will fail every time for large allocations. If the PAGE_SIZE is 4K, then data_len will request 68K or 17 pages, but alloc_skb_with_frags() can only allocate 64K in this case or 16 pages. 
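To make the arithmetic concrete, a worked example (assuming the common MAX_SKB_FRAGS of 17 with 4K pages; illustrative, not taken from the patch):

/* With order forced to 0, each frag holds one 4K page:
 *   old: fail when nr_frags == MAX_SKB_FRAGS - 1 -> at most 16 frags -> 64K
 *   new: fail when nr_frags == MAX_SKB_FRAGS     -> all 17 frags     -> 68K
 */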
Fixes: 09c2c90705bb ("net: allow alloc_skb_with_frags() to allocate bigger packets") Signed-off-by: Jason Baron Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250922191957.2855612-1-jbaron@akamai.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ee0274417948e0..1c0279b9cb9f48 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6667,7 +6667,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, return NULL; while (data_len) { - if (nr_frags == MAX_SKB_FRAGS - 1) + if (nr_frags == MAX_SKB_FRAGS) goto failure; while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; From 390b3a300d7872cef9588f003b204398be69ce08 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 21 Sep 2025 18:08:22 +0300 Subject: [PATCH 2743/3206] nexthop: Forbid FDB status change while nexthop is in a group The kernel forbids the creation of non-FDB nexthop groups with FDB nexthops: # ip nexthop add id 1 via 192.0.2.1 fdb # ip nexthop add id 2 group 1 Error: Non FDB nexthop group cannot have fdb nexthops. And vice versa: # ip nexthop add id 3 via 192.0.2.2 dev dummy1 # ip nexthop add id 4 group 3 fdb Error: FDB nexthop group can only have fdb nexthops. However, as long as no routes are pointing to a non-FDB nexthop group, the kernel allows changing the type of a nexthop from FDB to non-FDB and vice versa: # ip nexthop add id 5 via 192.0.2.2 dev dummy1 # ip nexthop add id 6 group 5 # ip nexthop replace id 5 via 192.0.2.2 fdb # echo $? 0 This configuration is invalid and can result in an NPD [1] since FDB nexthops are not associated with a nexthop device: # ip route add 198.51.100.1/32 nhid 6 # ping 198.51.100.1 Fix by preventing nexthop FDB status change while the nexthop is in a group: # ip nexthop add id 7 via 192.0.2.2 dev dummy1 # ip nexthop add id 8 group 7 # ip nexthop replace id 7 via 192.0.2.2 fdb Error: Cannot change nexthop FDB status while in a group. [1] BUG: kernel NULL pointer dereference, address: 00000000000003c0 [...] Oops: Oops: 0000 [#1] SMP CPU: 6 UID: 0 PID: 367 Comm: ping Not tainted 6.17.0-rc6-virtme-gb65678cacc03 #1 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-4.fc41 04/01/2014 RIP: 0010:fib_lookup_good_nhc+0x1e/0x80 [...]
Call Trace: fib_table_lookup+0x541/0x650 ip_route_output_key_hash_rcu+0x2ea/0x970 ip_route_output_key_hash+0x55/0x80 __ip4_datagram_connect+0x250/0x330 udp_connect+0x2b/0x60 __sys_connect+0x9c/0xd0 __x64_sys_connect+0x18/0x20 do_syscall_64+0xa4/0x2a0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 38428d68719c ("nexthop: support for fdb ecmp nexthops") Reported-by: syzbot+6596516dd2b635ba2350@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68c9a4d2.050a0220.3c6139.0e63.GAE@google.com/ Tested-by: syzbot+6596516dd2b635ba2350@syzkaller.appspotmail.com Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20250921150824.149157-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 29118c43ebf5f1..34137768e7f9a2 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2399,6 +2399,13 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, return -EINVAL; } + if (!list_empty(&old->grp_list) && + rtnl_dereference(new->nh_info)->fdb_nh != + rtnl_dereference(old->nh_info)->fdb_nh) { + NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group"); + return -EINVAL; + } + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; From c29913109c70383cdf90b6fc792353e1009f24f5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 21 Sep 2025 18:08:23 +0300 Subject: [PATCH 2744/3206] selftests: fib_nexthops: Fix creation of non-FDB nexthops The test creates non-FDB nexthops without a nexthop device which leads to the expected failure, but for the wrong reason: # ./fib_nexthops.sh -t "ipv6_fdb_grp_fcnal ipv4_fdb_grp_fcnal" -v IPv6 fdb groups functional -------------------------- [...] COMMAND: ip -netns me-nRsN3E nexthop add id 63 via 2001:db8:91::4 Error: Device attribute required for non-blackhole and non-fdb nexthops. COMMAND: ip -netns me-nRsN3E nexthop add id 64 via 2001:db8:91::5 Error: Device attribute required for non-blackhole and non-fdb nexthops. COMMAND: ip -netns me-nRsN3E nexthop add id 103 group 63/64 fdb Error: Invalid nexthop id. TEST: Fdb Nexthop group with non-fdb nexthops [ OK ] [...] IPv4 fdb groups functional -------------------------- [...] COMMAND: ip -netns me-nRsN3E nexthop add id 14 via 172.16.1.2 Error: Device attribute required for non-blackhole and non-fdb nexthops. COMMAND: ip -netns me-nRsN3E nexthop add id 15 via 172.16.1.3 Error: Device attribute required for non-blackhole and non-fdb nexthops. COMMAND: ip -netns me-nRsN3E nexthop add id 103 group 14/15 fdb Error: Invalid nexthop id. TEST: Fdb Nexthop group with non-fdb nexthops [ OK ] COMMAND: ip -netns me-nRsN3E nexthop add id 16 via 172.16.1.2 fdb COMMAND: ip -netns me-nRsN3E nexthop add id 17 via 172.16.1.3 fdb COMMAND: ip -netns me-nRsN3E nexthop add id 104 group 14/15 Error: Invalid nexthop id. TEST: Non-Fdb Nexthop group with fdb nexthops [ OK ] [...] COMMAND: ip -netns me-0dlhyd ro add 172.16.0.0/22 nhid 15 Error: Nexthop id does not exist. TEST: Route add with fdb nexthop [ OK ] In addition, as can be seen in the above output, a couple of IPv4 test cases used the non-FDB nexthops (14 and 15) when they intended to use the FDB nexthops (16 and 17). These test cases only passed because failure was expected, but they failed for the wrong reason. 
Fix the test to create the non-FDB nexthops with a nexthop device and adjust the IPv4 test cases to use the FDB nexthops instead of the non-FDB nexthops. Output after the fix: # ./fib_nexthops.sh -t "ipv6_fdb_grp_fcnal ipv4_fdb_grp_fcnal" -v IPv6 fdb groups functional -------------------------- [...] COMMAND: ip -netns me-lNzfHP nexthop add id 63 via 2001:db8:91::4 dev veth1 COMMAND: ip -netns me-lNzfHP nexthop add id 64 via 2001:db8:91::5 dev veth1 COMMAND: ip -netns me-lNzfHP nexthop add id 103 group 63/64 fdb Error: FDB nexthop group can only have fdb nexthops. TEST: Fdb Nexthop group with non-fdb nexthops [ OK ] [...] IPv4 fdb groups functional -------------------------- [...] COMMAND: ip -netns me-lNzfHP nexthop add id 14 via 172.16.1.2 dev veth1 COMMAND: ip -netns me-lNzfHP nexthop add id 15 via 172.16.1.3 dev veth1 COMMAND: ip -netns me-lNzfHP nexthop add id 103 group 14/15 fdb Error: FDB nexthop group can only have fdb nexthops. TEST: Fdb Nexthop group with non-fdb nexthops [ OK ] COMMAND: ip -netns me-lNzfHP nexthop add id 16 via 172.16.1.2 fdb COMMAND: ip -netns me-lNzfHP nexthop add id 17 via 172.16.1.3 fdb COMMAND: ip -netns me-lNzfHP nexthop add id 104 group 16/17 Error: Non FDB nexthop group cannot have fdb nexthops. TEST: Non-Fdb Nexthop group with fdb nexthops [ OK ] [...] COMMAND: ip -netns me-lNzfHP ro add 172.16.0.0/22 nhid 16 Error: Route cannot point to a fdb nexthop. TEST: Route add with fdb nexthop [ OK ] [...] Tests passed: 30 Tests failed: 0 Tests skipped: 0 Fixes: 0534c5489c11 ("selftests: net: add fdb nexthop tests") Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20250921150824.149157-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_nexthops.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index b39f748c25722a..2ac394c99d0183 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -467,8 +467,8 @@ ipv6_fdb_grp_fcnal() log_test $? 0 "Get Fdb nexthop group by id" # fdb nexthop group can only contain fdb nexthops - run_cmd "$IP nexthop add id 63 via 2001:db8:91::4" - run_cmd "$IP nexthop add id 64 via 2001:db8:91::5" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::4 dev veth1" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::5 dev veth1" run_cmd "$IP nexthop add id 103 group 63/64 fdb" log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" @@ -547,15 +547,15 @@ ipv4_fdb_grp_fcnal() log_test $? 0 "Get Fdb nexthop group by id" # fdb nexthop group can only contain fdb nexthops - run_cmd "$IP nexthop add id 14 via 172.16.1.2" - run_cmd "$IP nexthop add id 15 via 172.16.1.3" + run_cmd "$IP nexthop add id 14 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 15 via 172.16.1.3 dev veth1" run_cmd "$IP nexthop add id 103 group 14/15 fdb" log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" # Non fdb nexthop group can not contain fdb nexthops run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb" run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb" - run_cmd "$IP nexthop add id 104 group 14/15" + run_cmd "$IP nexthop add id 104 group 16/17" log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" # fdb nexthop cannot have blackhole @@ -582,7 +582,7 @@ ipv4_fdb_grp_fcnal() run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self" log_test $? 
255 "Fdb mac add with nexthop" - run_cmd "$IP ro add 172.16.0.0/22 nhid 15" + run_cmd "$IP ro add 172.16.0.0/22 nhid 16" log_test $? 2 "Route add with fdb nexthop" run_cmd "$IP ro add 172.16.0.0/22 nhid 103" From 00af023d90f9087ed5a371302ab442ed5736c3b7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 21 Sep 2025 18:08:24 +0300 Subject: [PATCH 2745/3206] selftests: fib_nexthops: Add test cases for FDB status change Add the following test cases for both IPv4 and IPv6: * Can change from FDB nexthop to non-FDB nexthop and vice versa. * Can change FDB nexthop address while in a group. * Cannot change from FDB nexthop to non-FDB nexthop and vice versa while in a group. Output without "nexthop: Forbid FDB status change while nexthop is in a group": # ./fib_nexthops.sh -t "ipv6_fdb_grp_fcnal ipv4_fdb_grp_fcnal" IPv6 fdb groups functional -------------------------- [...] TEST: Replace FDB nexthop to non-FDB nexthop [ OK ] TEST: Replace non-FDB nexthop to FDB nexthop [ OK ] TEST: Replace FDB nexthop address while in a group [ OK ] TEST: Replace FDB nexthop to non-FDB nexthop while in a group [FAIL] TEST: Replace non-FDB nexthop to FDB nexthop while in a group [FAIL] [...] IPv4 fdb groups functional -------------------------- [...] TEST: Replace FDB nexthop to non-FDB nexthop [ OK ] TEST: Replace non-FDB nexthop to FDB nexthop [ OK ] TEST: Replace FDB nexthop address while in a group [ OK ] TEST: Replace FDB nexthop to non-FDB nexthop while in a group [FAIL] TEST: Replace non-FDB nexthop to FDB nexthop while in a group [FAIL] [...] Tests passed: 36 Tests failed: 4 Tests skipped: 0 Output with "nexthop: Forbid FDB status change while nexthop is in a group": # ./fib_nexthops.sh -t "ipv6_fdb_grp_fcnal ipv4_fdb_grp_fcnal" IPv6 fdb groups functional -------------------------- [...] TEST: Replace FDB nexthop to non-FDB nexthop [ OK ] TEST: Replace non-FDB nexthop to FDB nexthop [ OK ] TEST: Replace FDB nexthop address while in a group [ OK ] TEST: Replace FDB nexthop to non-FDB nexthop while in a group [ OK ] TEST: Replace non-FDB nexthop to FDB nexthop while in a group [ OK ] [...] IPv4 fdb groups functional -------------------------- [...] TEST: Replace FDB nexthop to non-FDB nexthop [ OK ] TEST: Replace non-FDB nexthop to FDB nexthop [ OK ] TEST: Replace FDB nexthop address while in a group [ OK ] TEST: Replace FDB nexthop to non-FDB nexthop while in a group [ OK ] TEST: Replace non-FDB nexthop to FDB nexthop while in a group [ OK ] [...] Tests passed: 40 Tests failed: 0 Tests skipped: 0 Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20250921150824.149157-4-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_nexthops.sh | 40 +++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 2ac394c99d0183..2b0a90581e2f15 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -494,6 +494,26 @@ ipv6_fdb_grp_fcnal() run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb" log_test $? 2 "Fdb Nexthop with encap" + # Replace FDB nexthop to non-FDB and vice versa + run_cmd "$IP nexthop add id 70 via 2001:db8:91::2 fdb" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 dev veth1" + log_test $? 0 "Replace FDB nexthop to non-FDB nexthop" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 fdb" + log_test $? 
0 "Replace non-FDB nexthop to FDB nexthop" + + # Replace FDB nexthop address while in a group + run_cmd "$IP nexthop add id 71 group 70 fdb" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::3 fdb" + log_test $? 0 "Replace FDB nexthop address while in a group" + + # Cannot replace FDB nexthop to non-FDB and vice versa while in a group + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 dev veth1" + log_test $? 2 "Replace FDB nexthop to non-FDB nexthop while in a group" + run_cmd "$IP nexthop add id 72 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 73 group 72" + run_cmd "$IP nexthop replace id 72 via 2001:db8:91::2 fdb" + log_test $? 2 "Replace non-FDB nexthop to FDB nexthop while in a group" + run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100" run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" log_test $? 0 "Fdb mac add with nexthop group" @@ -574,6 +594,26 @@ ipv4_fdb_grp_fcnal() run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb" log_test $? 2 "Fdb Nexthop with encap" + # Replace FDB nexthop to non-FDB and vice versa + run_cmd "$IP nexthop add id 18 via 172.16.1.2 fdb" + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 dev veth1" + log_test $? 0 "Replace FDB nexthop to non-FDB nexthop" + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 fdb" + log_test $? 0 "Replace non-FDB nexthop to FDB nexthop" + + # Replace FDB nexthop address while in a group + run_cmd "$IP nexthop add id 19 group 18 fdb" + run_cmd "$IP nexthop replace id 18 via 172.16.1.3 fdb" + log_test $? 0 "Replace FDB nexthop address while in a group" + + # Cannot replace FDB nexthop to non-FDB and vice versa while in a group + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 dev veth1" + log_test $? 2 "Replace FDB nexthop to non-FDB nexthop while in a group" + run_cmd "$IP nexthop add id 20 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 21 group 20" + run_cmd "$IP nexthop replace id 20 via 172.16.1.2 fdb" + log_test $? 2 "Replace non-FDB nexthop to FDB nexthop while in a group" + run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100" run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" log_test $? 0 "Fdb mac add with nexthop group" From 6043819e707cefb1c9e59d6e431dcfa735c4f975 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 22 Sep 2025 10:11:32 +0300 Subject: [PATCH 2746/3206] net/mlx5: fs, fix UAF in flow counter release Fix a kernel trace [1] caused by releasing an HWS action of a local flow counter in mlx5_cmd_hws_delete_fte(), where the HWS action refcount and mutex were not initialized and the counter struct could already be freed when deleting the rule. Fix it by adding the missing initializations and adding refcount for the local flow counter struct. [1] Kernel log: Call Trace: dump_stack_lvl+0x34/0x48 mlx5_fs_put_hws_action.part.0.cold+0x21/0x94 [mlx5_core] mlx5_fc_put_hws_action+0x96/0xad [mlx5_core] mlx5_fs_destroy_fs_actions+0x8b/0x152 [mlx5_core] mlx5_cmd_hws_delete_fte+0x5a/0xa0 [mlx5_core] del_hw_fte+0x1ce/0x260 [mlx5_core] mlx5_del_flow_rules+0x12d/0x240 [mlx5_core] ? 
ttwu_queue_wakelist+0xf4/0x110 mlx5_ib_destroy_flow+0x103/0x1b0 [mlx5_ib] uverbs_free_flow+0x20/0x50 [ib_uverbs] destroy_hw_idr_uobject+0x1b/0x50 [ib_uverbs] uverbs_destroy_uobject+0x34/0x1a0 [ib_uverbs] uobj_destroy+0x3c/0x80 [ib_uverbs] ib_uverbs_run_method+0x23e/0x360 [ib_uverbs] ? uverbs_finalize_object+0x60/0x60 [ib_uverbs] ib_uverbs_cmd_verbs+0x14f/0x2c0 [ib_uverbs] ? do_tty_write+0x1a9/0x270 ? file_tty_write.constprop.0+0x98/0xc0 ? new_sync_write+0xfc/0x190 ib_uverbs_ioctl+0xd7/0x160 [ib_uverbs] __x64_sys_ioctl+0x87/0xc0 do_syscall_64+0x59/0x90 Fixes: b581f4266928 ("net/mlx5: fs, manage flow counters HWS action sharing by refcount") Signed-off-by: Moshe Shemesh Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758525094-816583-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/fs_core.c | 2 +- .../net/ethernet/mellanox/mlx5/core/fs_core.h | 1 + .../ethernet/mellanox/mlx5/core/fs_counters.c | 25 ++++++++++++++++--- .../mlx5/core/steering/hws/fs_hws_pools.c | 8 +++++- include/linux/mlx5/fs.h | 2 ++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index db552c012b4f4f..80245c38dbad39 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -663,7 +663,7 @@ static void del_sw_hw_rule(struct fs_node *node) BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) | BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS); fte->act_dests.action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT; - mlx5_fc_local_destroy(rule->dest_attr.counter); + mlx5_fc_local_put(rule->dest_attr.counter); goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 500826229b0beb..e6a95b310b5554 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -343,6 +343,7 @@ struct mlx5_fc { enum mlx5_fc_type type; struct mlx5_fc_bulk *bulk; struct mlx5_fc_cache cache; + refcount_t fc_local_refcount; /* last{packets,bytes} are used for calculating deltas since last reading. 
*/ u64 lastpackets; u64 lastbytes; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 492775d3d193a3..83001eda38842a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -562,17 +562,36 @@ mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size) counter->id = counter_id; fc_bulk->base_id = counter_id - offset; fc_bulk->fs_bulk.bulk_len = bulk_size; + refcount_set(&fc_bulk->hws_data.hws_action_refcount, 0); + mutex_init(&fc_bulk->hws_data.lock); counter->bulk = fc_bulk; + refcount_set(&counter->fc_local_refcount, 1); return counter; } EXPORT_SYMBOL(mlx5_fc_local_create); void mlx5_fc_local_destroy(struct mlx5_fc *counter) { - if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) - return; - kfree(counter->bulk); kfree(counter); } EXPORT_SYMBOL(mlx5_fc_local_destroy); + +void mlx5_fc_local_get(struct mlx5_fc *counter) +{ + if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) + return; + + refcount_inc(&counter->fc_local_refcount); +} + +void mlx5_fc_local_put(struct mlx5_fc *counter) +{ + if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) + return; + + if (!refcount_dec_and_test(&counter->fc_local_refcount)) + return; + + mlx5_fc_local_destroy(counter); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c index f1ecdba74e1f46..839d71bd42164f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c @@ -407,15 +407,21 @@ struct mlx5hws_action *mlx5_fc_get_hws_action(struct mlx5hws_context *ctx, { struct mlx5_fs_hws_create_action_ctx create_ctx; struct mlx5_fc_bulk *fc_bulk = counter->bulk; + struct mlx5hws_action *hws_action; create_ctx.hws_ctx = ctx; create_ctx.id = fc_bulk->base_id; create_ctx.actions_type = MLX5HWS_ACTION_TYP_CTR; - return mlx5_fs_get_hws_action(&fc_bulk->hws_data, &create_ctx); + mlx5_fc_local_get(counter); + hws_action = mlx5_fs_get_hws_action(&fc_bulk->hws_data, &create_ctx); + if (!hws_action) + mlx5_fc_local_put(counter); + return hws_action; } void mlx5_fc_put_hws_action(struct mlx5_fc *counter) { mlx5_fs_put_hws_action(&counter->bulk->hws_data); + mlx5_fc_local_put(counter); } diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 86055d55836d94..6ac76a0c382773 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -308,6 +308,8 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); struct mlx5_fc *mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size); void mlx5_fc_local_destroy(struct mlx5_fc *counter); +void mlx5_fc_local_get(struct mlx5_fc *counter); +void mlx5_fc_local_put(struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); From efb877cf27e300e47e1c051f4e8fd80fc42325d5 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 22 Sep 2025 10:11:33 +0300 Subject: [PATCH 2747/3206] net/mlx5: HWS, ignore flow level for multi-dest table When HWS creates multi-dest FW table and adds rules to forward to other tables, ignore the flow level enforcement in FW, because HWS is responsible for table levels. 
This fixes the following error: mlx5_core 0000:08:00.0: mlx5_cmd_out_err:818:(pid 192306): SET_FLOW_TABLE_ENTRY(0x936) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x6ae84c), err(-22) Fixes: 504e536d9010 ("net/mlx5: HWS, added actions handling") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Moshe Shemesh Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758525094-816583-3-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/steering/hws/action.c | 4 ++-- .../ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c | 11 +++-------- .../mellanox/mlx5/core/steering/hws/mlx5hws.h | 3 +-- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index 6b36a4a7d895fc..fe56b59e24c59c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -1360,7 +1360,7 @@ mlx5hws_action_create_modify_header(struct mlx5hws_context *ctx, struct mlx5hws_action * mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest, struct mlx5hws_action_dest_attr *dests, - bool ignore_flow_level, u32 flags) + u32 flags) { struct mlx5hws_cmd_set_fte_dest *dest_list = NULL; struct mlx5hws_cmd_ft_create_attr ft_attr = {0}; @@ -1397,7 +1397,7 @@ mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest, MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest_list[i].destination_id = dests[i].dest->dest_obj.obj_id; fte_attr.action_flags |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - fte_attr.ignore_flow_level = ignore_flow_level; + fte_attr.ignore_flow_level = 1; if (dests[i].is_wire_ft) last_dest_idx = i; break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c index 131e74b2b77435..6a4c4cccd64342 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c @@ -572,12 +572,12 @@ static void mlx5_fs_put_dest_action_sampler(struct mlx5_fs_hws_context *fs_ctx, static struct mlx5hws_action * mlx5_fs_create_action_dest_array(struct mlx5hws_context *ctx, struct mlx5hws_action_dest_attr *dests, - u32 num_of_dests, bool ignore_flow_level) + u32 num_of_dests) { u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; return mlx5hws_action_create_dest_array(ctx, num_of_dests, dests, - ignore_flow_level, flags); + flags); } static struct mlx5hws_action * @@ -1014,19 +1014,14 @@ static int mlx5_fs_fte_get_hws_actions(struct mlx5_flow_root_namespace *ns, } (*ractions)[num_actions++].action = dest_actions->dest; } else if (num_dest_actions > 1) { - bool ignore_flow_level; - if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || num_fs_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { err = -EOPNOTSUPP; goto free_actions; } - ignore_flow_level = - !!(fte_action->flags & FLOW_ACT_IGNORE_FLOW_LEVEL); tmp_action = mlx5_fs_create_action_dest_array(ctx, dest_actions, - num_dest_actions, - ignore_flow_level); + num_dest_actions); if (!tmp_action) { err = -EOPNOTSUPP; goto free_actions; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h index 2498ceff2060fd..1ad7a50d938b6c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h @@ -735,7 +735,6 @@ mlx5hws_action_create_push_vlan(struct mlx5hws_context *ctx, u32 flags); * @num_dest: The number of dests attributes. * @dests: The destination array. Each contains a destination action and can * have additional actions. - * @ignore_flow_level: Whether to turn on 'ignore_flow_level' for this dest. * @flags: Action creation flags (enum mlx5hws_action_flags). * * Return: pointer to mlx5hws_action on success NULL otherwise. @@ -743,7 +742,7 @@ mlx5hws_action_create_push_vlan(struct mlx5hws_context *ctx, u32 flags); struct mlx5hws_action * mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest, struct mlx5hws_action_dest_attr *dests, - bool ignore_flow_level, u32 flags); + u32 flags); /** * mlx5hws_action_create_insert_header - Create insert header action. From 6d0477d0d067a53c1d48d0aff1fd52e151721871 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Mon, 22 Sep 2025 10:11:34 +0300 Subject: [PATCH 2748/3206] net/mlx5e: Fix missing FEC RS stats for RS_544_514_INTERLEAVED_QUAD Include MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD in the FEC RS stats handling. This addresses a gap introduced when adding support for 200G/lane link modes. Fixes: 4e343c11efbb ("net/mlx5e: Support FEC settings for 200G per lane link modes") Signed-off-by: Carolina Jubran Reviewed-by: Dragos Tatulea Reviewed-by: Yael Chemla Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758525094-816583-4-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 87536f158d07b4..c6185ddba04b84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -1466,6 +1466,7 @@ static void fec_set_block_stats(struct mlx5e_priv *priv, case MLX5E_FEC_RS_528_514: case MLX5E_FEC_RS_544_514: case MLX5E_FEC_LLRS_272_257_1: + case MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD: fec_set_rs_stats(fec_stats, out); return; case MLX5E_FEC_FIRECODE: From 546e42c8c6d9498d5eac14bf2aca0383a11b145a Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 23 Sep 2025 18:25:52 -0600 Subject: [PATCH 2749/3206] riscv: Use an atomic xchg in pudp_huge_get_and_clear() Make sure we return the right pud value and not a value that could have been overwritten in between by a different core. 
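For illustration, a sketch of the window the atomic exchange closes (illustrative, not from the patch):

/* Racy read-then-clear on SMP:
 *
 *   CPU A                       CPU B
 *   pud = *pudp;
 *                               updates *pudp
 *   pud_clear(pudp);
 *
 * CPU A returns the stale pud and silently discards CPU B's update.
 * pud = __pud(xchg(&pudp->pud, 0)) reads and clears the entry in a
 * single atomic step, so the returned value is exactly the one that
 * was cleared.
 */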
Fixes: c3cc2a4a3a23 ("riscv: Add support for PUD THP") Cc: stable@vger.kernel.org Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250814-dev-alex-thp_pud_xchg-v1-1-b4704dfae206@rivosinc.com [pjw@kernel.org: use xchg rather than atomic_long_xchg; avoid atomic op for !CONFIG_SMP like x86] Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/pgtable.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 91697fbf1f9013..81506774293980 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -942,6 +942,23 @@ static inline int pudp_test_and_clear_young(struct vm_area_struct *vma, return ptep_test_and_clear_young(vma, address, (pte_t *)pudp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pud_t *pudp) +{ +#ifdef CONFIG_SMP + pud_t pud = __pud(xchg(&pudp->pud, 0)); +#else + pud_t pud = *pudp; + + pud_clear(pudp); +#endif + + page_table_check_pud_clear(mm, pud); + + return pud; +} + static inline int pud_young(pud_t pud) { return pte_young(pud_pte(pud)); From e24108012ce9662d90093f91d5ffebcbf78da7de Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Tue, 23 Sep 2025 18:25:52 -0600 Subject: [PATCH 2750/3206] MAINTAINERS: Update Paul Walmsley's E-mail address My experiment with using corporate Gmail for Linux kernel list interaction has come to an end. For my MAINTAINERS entries that use that E-mail address, let's switch those to use the k.org E-mail forwarding. Signed-off-by: Paul Walmsley Signed-off-by: Paul Walmsley --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..9bde7a8e21d1dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1772,7 +1772,7 @@ F: drivers/staging/iio/*/ad* X: drivers/iio/*/adjd* ANALOGBITS PLL LIBRARIES -M: Paul Walmsley +M: Paul Walmsley M: Samuel Holland S: Supported F: drivers/clk/analogbits/* @@ -19265,7 +19265,7 @@ S: Maintained F: drivers/pci/controller/dwc/*layerscape* PCI DRIVER FOR FU740 -M: Paul Walmsley +M: Paul Walmsley M: Greentime Hu M: Samuel Holland L: linux-pci@vger.kernel.org @@ -21625,7 +21625,7 @@ F: Documentation/devicetree/bindings/timer/andestech,plmt0.yaml F: arch/riscv/boot/dts/andes/ RISC-V ARCHITECTURE -M: Paul Walmsley +M: Paul Walmsley M: Palmer Dabbelt M: Albert Ou R: Alexandre Ghiti @@ -23076,7 +23076,7 @@ S: Maintained F: drivers/watchdog/simatic-ipc-wdt.c SIFIVE DRIVERS -M: Paul Walmsley +M: Paul Walmsley M: Samuel Holland L: linux-riscv@lists.infradead.org S: Supported From df10932ad740ba1f871b6dd2ddafc7dc8cea944f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 20:38:23 -1000 Subject: [PATCH 2751/3206] Revert "sched_ext: Use rhashtable_lookup() instead of rhashtable_lookup_fast()" This reverts commit c8191ee8e64a8c5c021a34e32868f2380965e82b which triggers the following suspicious RCU usage warning: [ 6.647598] ============================= [ 6.647603] WARNING: suspicious RCU usage [ 6.647605] 6.17.0-rc7-virtme #1 Not tainted [ 6.647608] ----------------------------- [ 6.647608] ./include/linux/rhashtable.h:602 suspicious rcu_dereference_check() usage! 
[ 6.647610] [ 6.647610] other info that might help us debug this: [ 6.647610] [ 6.647612] [ 6.647612] rcu_scheduler_active = 2, debug_locks = 1 [ 6.647613] 1 lock held by swapper/10/0: [ 6.647614] #0: ffff8b14bbb3cc98 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x20/0x90 [ 6.647630] [ 6.647630] stack backtrace: [ 6.647633] CPU: 10 UID: 0 PID: 0 Comm: swapper/10 Not tainted 6.17.0-rc7-virtme #1 PREEMPT(full) [ 6.647643] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 6.647646] Sched_ext: beerland_1.0.2_g27d63fc3_x86_64_unknown_linux_gnu (enabled+all) [ 6.647648] Call Trace: [ 6.647652] [ 6.647655] dump_stack_lvl+0x78/0xe0 [ 6.647665] lockdep_rcu_suspicious+0x14a/0x1b0 [ 6.647672] __rhashtable_lookup.constprop.0+0x1d5/0x250 [ 6.647680] find_dsq_for_dispatch+0xbc/0x190 [ 6.647684] do_enqueue_task+0x25b/0x550 [ 6.647689] enqueue_task_scx+0x21d/0x360 [ 6.647692] ? trace_lock_acquire+0x22/0xb0 [ 6.647695] enqueue_task+0x2e/0xd0 [ 6.647698] ttwu_do_activate+0xa2/0x290 [ 6.647703] sched_ttwu_pending+0xfd/0x250 [ 6.647706] __flush_smp_call_function_queue+0x1cd/0x610 [ 6.647714] __sysvec_call_function_single+0x34/0x150 [ 6.647720] sysvec_call_function_single+0x6e/0x80 [ 6.647726] [ 6.647726] [ 6.647727] asm_sysvec_call_function_single+0x1a/0x20 Reported-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 99a5f6429207dc..2b0e88206d0768 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -189,7 +189,7 @@ static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) { - return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); + return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); } /* From 4ec3c15462b9f44562f45723a92e2807746ba7d1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Sep 2025 15:09:45 +0200 Subject: [PATCH 2752/3206] futex: Use correct exit on failure from futex_hash_allocate_default() copy_process() uses the wrong error exit path from futex_hash_allocate_default(). After exiting from futex_hash_allocate_default(), neither tasklist_lock nor siglock has been acquired. The exit label bad_fork_core_free unlocks both of these locks which is wrong. The next exit label, bad_fork_cancel_cgroup, is the correct exit. sched_cgroup_fork() did not allocate any resources that need to be freed. Use bad_fork_cancel_cgroup on error exit from futex_hash_allocate_default(). Fixes: 7c4f75a21f636 ("futex: Allow automatic allocation of process wide futex hash") Reported-by: syzbot+80cb3cc5c14fad191a10@syzkaller.appspotmail.com Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (Google) Closes: https://lore.kernel.org/all/68cb1cbd.050a0220.2ff435.0599.GAE@google.com --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index c4ada32598bd5e..6ca8689a83b5bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2295,7 +2295,7 @@ __latent_entropy struct task_struct *copy_process( if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) - goto bad_fork_core_free; + goto bad_fork_cancel_cgroup; /* * If we fail beyond this point we don't free the allocated * futex hash map. 
We assume that another thread will be created From e2ffa15b9baa447e444d654ffd47123ba6443ae4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Sep 2025 15:21:51 +0200 Subject: [PATCH 2753/3206] kbuild: Disable CC_HAS_ASM_GOTO_OUTPUT on clang < 17 clang < 17 fails to use scoped local labels with CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y: { __label__ local_lbl; ... unsafe_get_user(uval, uaddr, local_lbl); ... return 0; local_lbl: return -EFAULT; } when two such scopes exist in the same function: error: cannot jump from this asm goto statement to one of its possible targets There are other failure scenarios. Shuffling code around slightly makes it worse, failing even with one instance. That issue prevents using local labels for a cleanup-based user access mechanism. After failed attempts to provide a simple enough test case for the 'depends on' test in Kconfig, the initial cure was to mark ASM goto broken on clang versions < 17 to get this roadblock out of the way. But Nathan pointed out that this is a known clang issue and indeed affects clang < version 17 in combination with cleanup(). It's not even required to use local labels for that. The clang issue tracker has a small enough test case, which can be used as a test in the 'depends on' section of CC_HAS_ASM_GOTO_OUTPUT: void bar(void **); void* baz(void); int foo (void) { { asm goto("jmp %l0"::::l0); return 0; l0: return 1; } void *x __attribute__((cleanup(bar))) = baz(); { asm goto("jmp %l0"::::l1); return 42; l1: return 0xff; } } Add another dependency to config CC_HAS_ASM_GOTO_OUTPUT for it and use the clang issue tracker test case for detection by condensing it to obfuscated C-code contest format. This reliably catches the problem on clang < 17 and did not show any issues on the non-broken GCC versions. That test might be sufficient to catch all issues and therefore could replace the existing test, but keeping that around does no harm either. Thanks to Nathan for pointing to the relevant clang issue! Suggested-by: Nathan Chancellor Signed-off-by: Thomas Gleixner Cc: Nathan Chancellor Reviewed-by: Nathan Chancellor Link: https://github.com/ClangBuiltLinux/linux/issues/1886 Link: https://github.com/llvm/llvm-project/commit/f023f5cdb2e6c19026f04a15b5a935c041835d14 --- init/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index e3eb63eadc8757..ecddb94db8dc01 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -99,7 +99,10 @@ config GCC_ASM_GOTO_OUTPUT_BROKEN config CC_HAS_ASM_GOTO_OUTPUT def_bool y depends on !GCC_ASM_GOTO_OUTPUT_BROKEN + # Detect basic support depends on $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) + # Detect clang (< v17) scoped label issues + depends on $(success,echo 'void b(void **);void* c(void);int f(void){{asm goto("jmp %l0"::::l0);return 0;l0:return 1;}void *x __attribute__((cleanup(b)))=c();{asm goto("jmp %l0"::::l1);return 2;l1:return 3;}}' | $(CC) -x c - -c -o /dev/null) config CC_HAS_ASM_GOTO_TIED_OUTPUT depends on CC_HAS_ASM_GOTO_OUTPUT From 70a0bcde87512c269269443444c109740dcacc19 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 24 Sep 2025 10:05:40 +0800 Subject: [PATCH 2754/3206] ASoC: tas2783A: Remove unneeded semicolon Remove unnecessary semicolons reported by Coccinelle/coccicheck and the semantic patch at scripts/coccinelle/misc/semicolon.cocci. 
Signed-off-by: Chen Ni Link: https://patch.msgid.link/20250924020540.234560-1-nichen@iscas.ac.cn Signed-off-by: Mark Brown --- sound/soc/codecs/tas2783-sdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/tas2783-sdw.c b/sound/soc/codecs/tas2783-sdw.c index 3e03e68932f6b7..d5d33afdfa1016 100644 --- a/sound/soc/codecs/tas2783-sdw.c +++ b/sound/soc/codecs/tas2783-sdw.c @@ -806,7 +806,7 @@ static void tas2783_fw_ready(const struct firmware *fmw, void *context) offset += file_blk_size; else break; - }; + } mutex_unlock(&tas_dev->pde_lock); tas2783_update_calibdata(tas_dev); From 6be988660b474564c77cb6ff60776dafcd850a18 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 23 Sep 2025 19:50:06 +0100 Subject: [PATCH 2755/3206] ASoC: tas2783A: Fix spelling mistake "Perifpheral" -> "Peripheral" There is a spelling mistake in a dev_dbg debug message. Fix it. 
Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Rob Herring (Arm) Signed-off-by: Julien Massot Link: https://patch.msgid.link/20250826-mtk-dtb-warnings-v3-2-20e89886a20e@collabora.com Signed-off-by: Mark Brown --- .../bindings/sound/mediatek,mt8183-audio.yaml | 228 ++++++++++++++++++ .../bindings/sound/mt8183-afe-pcm.txt | 42 ---- 2 files changed, 228 insertions(+), 42 deletions(-) create mode 100644 Documentation/devicetree/bindings/sound/mediatek,mt8183-audio.yaml delete mode 100644 Documentation/devicetree/bindings/sound/mt8183-afe-pcm.txt diff --git a/Documentation/devicetree/bindings/sound/mediatek,mt8183-audio.yaml b/Documentation/devicetree/bindings/sound/mediatek,mt8183-audio.yaml new file mode 100644 index 00000000000000..031b0fa7b4dc1b --- /dev/null +++ b/Documentation/devicetree/bindings/sound/mediatek,mt8183-audio.yaml @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/mediatek,mt8183-audio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Mediatek AFE PCM controller for mt8183 + +maintainers: + - Julien Massot + +properties: + compatible: + const: mediatek,mt8183-audio + + interrupts: + maxItems: 1 + + resets: + maxItems: 1 + + reset-names: + const: audiosys + + power-domains: + maxItems: 1 + + memory-region: + maxItems: 1 + + clocks: + items: + - description: AFE clock + - description: ADDA DAC clock + - description: ADDA DAC pre-distortion clock + - description: ADDA ADC clock + - description: ADDA6 ADC clock + - description: Audio low-jitter 22.5792m clock + - description: Audio low-jitter 24.576m clock + - description: Audio PLL1 tuner clock + - description: Audio PLL2 tuner clock + - description: I2S1 bit clock + - description: I2S2 bit clock + - description: I2S3 bit clock + - description: I2S4 bit clock + - description: Audio Time-Division Multiplexing interface clock + - description: Powerdown Audio test model clock + - description: Audio infra sys clock + - description: Audio infra 26M clock + - description: Mux for audio clock + - description: Mux for audio internal bus clock + - description: Mux main divider by 4 + - description: Primary audio mux + - description: Primary audio PLL + - description: Secondary audio mux + - description: Secondary audio PLL + - description: Primary audio en-generator clock + - description: Primary PLL divider by 4 for IEC + - description: Secondary audio en-generator clock + - description: Secondary PLL divider by 8 for IEC + - description: Mux selector for I2S port 0 + - description: Mux selector for I2S port 1 + - description: Mux selector for I2S port 2 + - description: Mux selector for I2S port 3 + - description: Mux selector for I2S port 4 + - description: Mux selector for I2S port 5 + - description: APLL1 and APLL2 divider for I2S port 0 + - description: APLL1 and APLL2 divider for I2S port 1 + - description: APLL1 and APLL2 divider for I2S port 2 + - description: APLL1 and APLL2 divider for I2S port 3 + - description: APLL1 and APLL2 divider for I2S port 4 + - description: APLL1 and APLL2 divider for IEC + - description: 26MHz clock for audio subsystem + + clock-names: + items: + - const: aud_afe_clk + - const: aud_dac_clk + - const: aud_dac_predis_clk + - const: aud_adc_clk + - const: aud_adc_adda6_clk + - const: aud_apll22m_clk + - const: aud_apll24m_clk + - const: aud_apll1_tuner_clk + - const: aud_apll2_tuner_clk + - const: aud_i2s1_bclk_sw + - const: aud_i2s2_bclk_sw + - const: aud_i2s3_bclk_sw + - const: 
aud_i2s4_bclk_sw + - const: aud_tdm_clk + - const: aud_tml_clk + - const: aud_infra_clk + - const: mtkaif_26m_clk + - const: top_mux_audio + - const: top_mux_aud_intbus + - const: top_syspll_d2_d4 + - const: top_mux_aud_1 + - const: top_apll1_ck + - const: top_mux_aud_2 + - const: top_apll2_ck + - const: top_mux_aud_eng1 + - const: top_apll1_d8 + - const: top_mux_aud_eng2 + - const: top_apll2_d8 + - const: top_i2s0_m_sel + - const: top_i2s1_m_sel + - const: top_i2s2_m_sel + - const: top_i2s3_m_sel + - const: top_i2s4_m_sel + - const: top_i2s5_m_sel + - const: top_apll12_div0 + - const: top_apll12_div1 + - const: top_apll12_div2 + - const: top_apll12_div3 + - const: top_apll12_div4 + - const: top_apll12_divb + - const: top_clk26m_clk + +required: + - compatible + - interrupts + - resets + - reset-names + - power-domains + - clocks + - clock-names + +additionalProperties: false + +examples: + - | + #include + #include + #include + #include + #include + + audio-controller { + compatible = "mediatek,mt8183-audio"; + interrupts = ; + resets = <&watchdog MT8183_TOPRGU_AUDIO_SW_RST>; + reset-names = "audiosys"; + power-domains = <&spm MT8183_POWER_DOMAIN_AUDIO>; + clocks = <&audiosys CLK_AUDIO_AFE>, + <&audiosys CLK_AUDIO_DAC>, + <&audiosys CLK_AUDIO_DAC_PREDIS>, + <&audiosys CLK_AUDIO_ADC>, + <&audiosys CLK_AUDIO_PDN_ADDA6_ADC>, + <&audiosys CLK_AUDIO_22M>, + <&audiosys CLK_AUDIO_24M>, + <&audiosys CLK_AUDIO_APLL_TUNER>, + <&audiosys CLK_AUDIO_APLL2_TUNER>, + <&audiosys CLK_AUDIO_I2S1>, + <&audiosys CLK_AUDIO_I2S2>, + <&audiosys CLK_AUDIO_I2S3>, + <&audiosys CLK_AUDIO_I2S4>, + <&audiosys CLK_AUDIO_TDM>, + <&audiosys CLK_AUDIO_TML>, + <&infracfg CLK_INFRA_AUDIO>, + <&infracfg CLK_INFRA_AUDIO_26M_BCLK>, + <&topckgen CLK_TOP_MUX_AUDIO>, + <&topckgen CLK_TOP_MUX_AUD_INTBUS>, + <&topckgen CLK_TOP_SYSPLL_D2_D4>, + <&topckgen CLK_TOP_MUX_AUD_1>, + <&topckgen CLK_TOP_APLL1_CK>, + <&topckgen CLK_TOP_MUX_AUD_2>, + <&topckgen CLK_TOP_APLL2_CK>, + <&topckgen CLK_TOP_MUX_AUD_ENG1>, + <&topckgen CLK_TOP_APLL1_D8>, + <&topckgen CLK_TOP_MUX_AUD_ENG2>, + <&topckgen CLK_TOP_APLL2_D8>, + <&topckgen CLK_TOP_MUX_APLL_I2S0>, + <&topckgen CLK_TOP_MUX_APLL_I2S1>, + <&topckgen CLK_TOP_MUX_APLL_I2S2>, + <&topckgen CLK_TOP_MUX_APLL_I2S3>, + <&topckgen CLK_TOP_MUX_APLL_I2S4>, + <&topckgen CLK_TOP_MUX_APLL_I2S5>, + <&topckgen CLK_TOP_APLL12_DIV0>, + <&topckgen CLK_TOP_APLL12_DIV1>, + <&topckgen CLK_TOP_APLL12_DIV2>, + <&topckgen CLK_TOP_APLL12_DIV3>, + <&topckgen CLK_TOP_APLL12_DIV4>, + <&topckgen CLK_TOP_APLL12_DIVB>, + <&clk26m>; + clock-names = "aud_afe_clk", + "aud_dac_clk", + "aud_dac_predis_clk", + "aud_adc_clk", + "aud_adc_adda6_clk", + "aud_apll22m_clk", + "aud_apll24m_clk", + "aud_apll1_tuner_clk", + "aud_apll2_tuner_clk", + "aud_i2s1_bclk_sw", + "aud_i2s2_bclk_sw", + "aud_i2s3_bclk_sw", + "aud_i2s4_bclk_sw", + "aud_tdm_clk", + "aud_tml_clk", + "aud_infra_clk", + "mtkaif_26m_clk", + "top_mux_audio", + "top_mux_aud_intbus", + "top_syspll_d2_d4", + "top_mux_aud_1", + "top_apll1_ck", + "top_mux_aud_2", + "top_apll2_ck", + "top_mux_aud_eng1", + "top_apll1_d8", + "top_mux_aud_eng2", + "top_apll2_d8", + "top_i2s0_m_sel", + "top_i2s1_m_sel", + "top_i2s2_m_sel", + "top_i2s3_m_sel", + "top_i2s4_m_sel", + "top_i2s5_m_sel", + "top_apll12_div0", + "top_apll12_div1", + "top_apll12_div2", + "top_apll12_div3", + "top_apll12_div4", + "top_apll12_divb", + "top_clk26m_clk"; + }; + +... 
diff --git a/Documentation/devicetree/bindings/sound/mt8183-afe-pcm.txt b/Documentation/devicetree/bindings/sound/mt8183-afe-pcm.txt deleted file mode 100644 index 1f1cba4152ceec..00000000000000 --- a/Documentation/devicetree/bindings/sound/mt8183-afe-pcm.txt +++ /dev/null @@ -1,42 +0,0 @@ -Mediatek AFE PCM controller for mt8183 - -Required properties: -- compatible = "mediatek,mt68183-audio"; -- reg: register location and size -- interrupts: should contain AFE interrupt -- resets: Must contain an entry for each entry in reset-names - See ../reset/reset.txt for details. -- reset-names: should have these reset names: - "audiosys"; -- power-domains: should define the power domain -- clocks: Must contain an entry for each entry in clock-names -- clock-names: should have these clock names: - "infra_sys_audio_clk", - "mtkaif_26m_clk", - "top_mux_audio", - "top_mux_aud_intbus", - "top_sys_pll3_d4", - "top_clk26m_clk"; - -Example: - - afe: mt8183-afe-pcm@11220000 { - compatible = "mediatek,mt8183-audio"; - reg = <0 0x11220000 0 0x1000>; - interrupts = ; - resets = <&watchdog MT8183_TOPRGU_AUDIO_SW_RST>; - reset-names = "audiosys"; - power-domains = <&scpsys MT8183_POWER_DOMAIN_AUDIO>; - clocks = <&infrasys CLK_INFRA_AUDIO>, - <&infrasys CLK_INFRA_AUDIO_26M_BCLK>, - <&topckgen CLK_TOP_MUX_AUDIO>, - <&topckgen CLK_TOP_MUX_AUD_INTBUS>, - <&topckgen CLK_TOP_SYSPLL_D2_D4>, - <&clk26m>; - clock-names = "infra_sys_audio_clk", - "mtkaif_26m_clk", - "top_mux_audio", - "top_mux_aud_intbus", - "top_sys_pll_d2_d4", - "top_clk26m_clk"; - }; From cf5be90ee4dfa3c38dc64fbcc4fb70fa0180b7b7 Mon Sep 17 00:00:00 2001 From: Julien Massot Date: Tue, 26 Aug 2025 09:39:38 +0200 Subject: [PATCH 2757/3206] ASoC: Convert MT8183 DA7219 sound card to DT schema Convert the Device Tree binding for MT8183-based boards using the DA7219 headset codec and optional MAX98357, RT1015 or RT1015P speaker amplifiers from the legacy .txt format to DT schema. This improves binding validation and removes DT schema warnings for boards using these audio components. Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Rob Herring (Arm) Signed-off-by: Julien Massot Link: https://patch.msgid.link/20250826-mtk-dtb-warnings-v3-5-20e89886a20e@collabora.com Signed-off-by: Mark Brown --- .../sound/mediatek,mt8183_da7219.yaml | 49 +++++++++++++++++++ .../bindings/sound/mt8183-da7219-max98357.txt | 21 -------- 2 files changed, 49 insertions(+), 21 deletions(-) create mode 100644 Documentation/devicetree/bindings/sound/mediatek,mt8183_da7219.yaml delete mode 100644 Documentation/devicetree/bindings/sound/mt8183-da7219-max98357.txt diff --git a/Documentation/devicetree/bindings/sound/mediatek,mt8183_da7219.yaml b/Documentation/devicetree/bindings/sound/mediatek,mt8183_da7219.yaml new file mode 100644 index 00000000000000..b526e8123182bc --- /dev/null +++ b/Documentation/devicetree/bindings/sound/mediatek,mt8183_da7219.yaml @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/mediatek,mt8183_da7219.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek MT8183 sound card with external codecs + +maintainers: + - Julien Massot + +description: + MediaTek MT8183 SoC-based sound cards with DA7219 as headset codec, + and MAX98357A, RT1015 or RT1015P as speaker amplifiers. Optionally includes HDMI codec. 
+ +properties: + compatible: + enum: + - mediatek,mt8183_da7219_max98357 + - mediatek,mt8183_da7219_rt1015 + - mediatek,mt8183_da7219_rt1015p + + mediatek,headset-codec: + $ref: /schemas/types.yaml#/definitions/phandle + description: Phandle to the DA7219 headset codec. + + mediatek,platform: + $ref: /schemas/types.yaml#/definitions/phandle + description: Phandle to the MT8183 ASoC platform (e.g., AFE node). + + mediatek,hdmi-codec: + $ref: /schemas/types.yaml#/definitions/phandle + description: Optional phandle to the HDMI codec (e.g., IT6505). + +required: + - compatible + - mediatek,headset-codec + - mediatek,platform + +additionalProperties: false + +examples: + - | + sound { + compatible = "mediatek,mt8183_da7219_max98357"; + mediatek,headset-codec = <&da7219>; + mediatek,hdmi-codec = <&it6505dptx>; + mediatek,platform = <&afe>; + }; diff --git a/Documentation/devicetree/bindings/sound/mt8183-da7219-max98357.txt b/Documentation/devicetree/bindings/sound/mt8183-da7219-max98357.txt deleted file mode 100644 index f276dfc74b4654..00000000000000 --- a/Documentation/devicetree/bindings/sound/mt8183-da7219-max98357.txt +++ /dev/null @@ -1,21 +0,0 @@ -MT8183 with MT6358, DA7219, MAX98357, and RT1015 CODECS - -Required properties: -- compatible : "mediatek,mt8183_da7219_max98357" for MAX98357A codec - "mediatek,mt8183_da7219_rt1015" for RT1015 codec - "mediatek,mt8183_da7219_rt1015p" for RT1015P codec -- mediatek,headset-codec: the phandles of da7219 codecs -- mediatek,platform: the phandle of MT8183 ASoC platform - -Optional properties: -- mediatek,hdmi-codec: the phandles of HDMI codec - -Example: - - sound { - compatible = "mediatek,mt8183_da7219_max98357"; - mediatek,headset-codec = <&da7219>; - mediatek,hdmi-codec = <&it6505dptx>; - mediatek,platform = <&afe>; - }; - From 82fd5dc99d63f948c59ac3b08137ef49125938bc Mon Sep 17 00:00:00 2001 From: Julien Massot Date: Tue, 26 Aug 2025 09:39:39 +0200 Subject: [PATCH 2758/3206] ASoC: dt-binding: Convert MediaTek mt8183-mt6358 to DT schema Convert the existing text-based DT binding for MT8183 sound cards using MT6358 and various other codecs to a DT schema. Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Rob Herring (Arm) Signed-off-by: Julien Massot Link: https://patch.msgid.link/20250826-mtk-dtb-warnings-v3-6-20e89886a20e@collabora.com Signed-off-by: Mark Brown --- .../sound/mediatek,mt8183_mt6358_ts3a227.yaml | 59 +++++++++++++++++++ .../sound/mt8183-mt6358-ts3a227-max98357.txt | 25 -------- 2 files changed, 59 insertions(+), 25 deletions(-) create mode 100644 Documentation/devicetree/bindings/sound/mediatek,mt8183_mt6358_ts3a227.yaml delete mode 100644 Documentation/devicetree/bindings/sound/mt8183-mt6358-ts3a227-max98357.txt diff --git a/Documentation/devicetree/bindings/sound/mediatek,mt8183_mt6358_ts3a227.yaml b/Documentation/devicetree/bindings/sound/mediatek,mt8183_mt6358_ts3a227.yaml new file mode 100644 index 00000000000000..43a6f9d40644c2 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/mediatek,mt8183_mt6358_ts3a227.yaml @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/mediatek,mt8183_mt6358_ts3a227.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek MT8183 sound card with MT6358, TS3A227, and MAX98357/RT1015 codecs + +maintainers: + - Julien Massot + +description: + MediaTek MT8183 SoC-based sound cards using the MT6358 codec, + with optional TS3A227 headset codec, EC codec (via Chrome EC), and HDMI audio. 
+ Speaker amplifier can be one of MAX98357A/B, RT1015, or RT1015P. + +properties: + compatible: + enum: + - mediatek,mt8183_mt6358_ts3a227_max98357 + - mediatek,mt8183_mt6358_ts3a227_max98357b + - mediatek,mt8183_mt6358_ts3a227_rt1015 + - mediatek,mt8183_mt6358_ts3a227_rt1015p + + mediatek,platform: + $ref: /schemas/types.yaml#/definitions/phandle + description: Phandle to the MT8183 ASoC platform node (e.g., AFE). + + mediatek,headset-codec: + $ref: /schemas/types.yaml#/definitions/phandle + description: Phandle to the TS3A227 headset codec. + + mediatek,ec-codec: + $ref: /schemas/types.yaml#/definitions/phandle + description: | + Optional phandle to a ChromeOS EC codec node. + See bindings in google,cros-ec-codec.yaml. + + mediatek,hdmi-codec: + $ref: /schemas/types.yaml#/definitions/phandle + description: Optional phandle to an HDMI audio codec node. + +required: + - compatible + - mediatek,platform + +additionalProperties: false + +examples: + - | + sound { + compatible = "mediatek,mt8183_mt6358_ts3a227_max98357"; + mediatek,headset-codec = <&ts3a227>; + mediatek,ec-codec = <&ec_codec>; + mediatek,hdmi-codec = <&it6505dptx>; + mediatek,platform = <&afe>; + }; + +... diff --git a/Documentation/devicetree/bindings/sound/mt8183-mt6358-ts3a227-max98357.txt b/Documentation/devicetree/bindings/sound/mt8183-mt6358-ts3a227-max98357.txt deleted file mode 100644 index ecd46ed8eb98b9..00000000000000 --- a/Documentation/devicetree/bindings/sound/mt8183-mt6358-ts3a227-max98357.txt +++ /dev/null @@ -1,25 +0,0 @@ -MT8183 with MT6358, TS3A227, MAX98357, and RT1015 CODECS - -Required properties: -- compatible : "mediatek,mt8183_mt6358_ts3a227_max98357" for MAX98357A codec - "mediatek,mt8183_mt6358_ts3a227_max98357b" for MAX98357B codec - "mediatek,mt8183_mt6358_ts3a227_rt1015" for RT1015 codec - "mediatek,mt8183_mt6358_ts3a227_rt1015p" for RT1015P codec -- mediatek,platform: the phandle of MT8183 ASoC platform - -Optional properties: -- mediatek,headset-codec: the phandles of ts3a227 codecs -- mediatek,ec-codec: the phandle of EC codecs. - See google,cros-ec-codec.txt for more details. -- mediatek,hdmi-codec: the phandles of HDMI codec - -Example: - - sound { - compatible = "mediatek,mt8183_mt6358_ts3a227_max98357"; - mediatek,headset-codec = <&ts3a227>; - mediatek,ec-codec = <&ec_codec>; - mediatek,hdmi-codec = <&it6505dptx>; - mediatek,platform = <&afe>; - }; - From 7384893d970ea114952aef54ad7e3d7d2ca82d4f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 16 Sep 2025 23:52:56 +0200 Subject: [PATCH 2759/3206] bpf: Allow uprobe program to change context registers Currently uprobe (BPF_PROG_TYPE_KPROBE) program can't write to the context registers data. While this makes sense for kprobe attachments, for uprobe attachment it might make sense to be able to change user space registers to alter application execution. Since uprobe and kprobe programs share the same type (BPF_PROG_TYPE_KPROBE), we can't deny write access to context during the program load. We need to check on it during program attachment to see if it's going to be kprobe or uprobe. Storing the program's write attempt to context and checking on it during the attachment. 
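As a rough sketch (not code from this patch; the x86_64 register name follows the selftests added later in this series), a uprobe program that writes to its context might look like this, with the new aux flag recording the write so that a kprobe attachment of the same program can be rejected:

	SEC("uprobe")
	int change_regs(struct pt_regs *ctx)
	{
		/* valid at load time for BPF_PROG_TYPE_KPROBE; the verifier
		 * now sets prog->aux->kprobe_write_ctx for this access */
		ctx->ax = 0xc0ffee;	/* alter the user-space rax */
		return 0;
	}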
Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20250916215301.664963-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/events/core.c | 4 ++++ kernel/trace/bpf_trace.c | 9 +++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ba3a3be7eb2a8a..ea2ed6771cc60c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1639,6 +1639,7 @@ struct bpf_prog_aux { bool priv_stack_requested; bool changes_pkt_data; bool might_sleep; + bool kprobe_write_ctx; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; diff --git a/kernel/events/core.c b/kernel/events/core.c index 820127536e62b7..1d354778dcd420 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11232,6 +11232,10 @@ static int __perf_event_set_bpf_prog(struct perf_event *event, if (prog->kprobe_override && !is_kprobe) return -EINVAL; + /* Writing to context allowed only for uprobes. */ + if (prog->aux->kprobe_write_ctx && !is_uprobe) + return -EINVAL; + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f2360579658ead..8f23f5273babf9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1338,8 +1338,6 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type { if (off < 0 || off >= sizeof(struct pt_regs)) return false; - if (type != BPF_READ) - return false; if (off % size != 0) return false; /* @@ -1349,6 +1347,9 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type if (off + size > sizeof(struct pt_regs)) return false; + if (type == BPF_WRITE) + prog->aux->kprobe_write_ctx = true; + return true; } @@ -2735,6 +2736,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!is_kprobe_multi(prog)) return -EINVAL; + /* Writing to context is not allowed for kprobes. */ + if (prog->aux->kprobe_write_ctx) + return -EINVAL; + flags = attr->link_create.kprobe_multi.flags; if (flags & ~BPF_F_KPROBE_MULTI_RETURN) return -EINVAL; From 4363264111e1297fa37aa39b0598faa19298ecca Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 16 Sep 2025 23:52:57 +0200 Subject: [PATCH 2760/3206] uprobe: Do not emulate/sstep original instruction when ip is changed If the uprobe handler changes the instruction pointer, we still single-step or emulate the original instruction and increment the (new) ip with its length. This makes the new instruction pointer bogus and the application will likely crash on illegal instruction execution. If the user decided to take execution elsewhere, it makes little sense to execute the original instruction, so let's skip it. 
Acked-by: Oleg Nesterov Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20250916215301.664963-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/events/uprobes.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd83..2b32c32bcb776f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2741,6 +2741,13 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* + * If user decided to take execution elsewhere, it makes little sense + * to execute the original instruction, so let's skip it. + */ + if (instruction_pointer(regs) != bp_vaddr) + goto out; + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; From 7f8a05c5d388668d5631275b3e3a59bfc8669e06 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 16 Sep 2025 23:52:58 +0200 Subject: [PATCH 2761/3206] selftests/bpf: Add uprobe context registers changes test Adding test to check we can change common register values through uprobe program. It's x86_64 specific test. Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20250916215301.664963-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/uprobe.c | 114 +++++++++++++++++- .../testing/selftests/bpf/progs/test_uprobe.c | 24 ++++ 2 files changed, 137 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe.c b/tools/testing/selftests/bpf/prog_tests/uprobe.c index cf3e0e7a64fae2..19dd900df188a4 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe.c @@ -2,6 +2,7 @@ /* Copyright (c) 2023 Hengqi Chen */ #include +#include #include "test_uprobe.skel.h" static FILE *urand_spawn(int *pid) @@ -33,7 +34,7 @@ static int urand_trigger(FILE **urand_pipe) return exit_code; } -void test_uprobe(void) +static void test_uprobe_attach(void) { LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); struct test_uprobe *skel; @@ -93,3 +94,114 @@ void test_uprobe(void) pclose(urand_pipe); test_uprobe__destroy(skel); } + +#ifdef __x86_64__ +__naked __maybe_unused unsigned long uprobe_regs_change_trigger(void) +{ + asm volatile ( + "ret\n" + ); +} + +static __naked void uprobe_regs_change(struct pt_regs *before, struct pt_regs *after) +{ + asm volatile ( + "movq %r11, 48(%rdi)\n" + "movq %r10, 56(%rdi)\n" + "movq %r9, 64(%rdi)\n" + "movq %r8, 72(%rdi)\n" + "movq %rax, 80(%rdi)\n" + "movq %rcx, 88(%rdi)\n" + "movq %rdx, 96(%rdi)\n" + "movq %rsi, 104(%rdi)\n" + "movq %rdi, 112(%rdi)\n" + + /* save 2nd argument */ + "pushq %rsi\n" + "call uprobe_regs_change_trigger\n" + + /* save return value and load 2nd argument pointer to rax */ + "pushq %rax\n" + "movq 8(%rsp), %rax\n" + + "movq %r11, 48(%rax)\n" + "movq %r10, 56(%rax)\n" + "movq %r9, 64(%rax)\n" + "movq %r8, 72(%rax)\n" + "movq %rcx, 88(%rax)\n" + "movq %rdx, 96(%rax)\n" + "movq %rsi, 104(%rax)\n" + "movq %rdi, 112(%rax)\n" + + /* restore return value and 2nd argument */ + "pop %rax\n" + "pop %rsi\n" + + "movq %rax, 80(%rsi)\n" + "ret\n" + ); +} + +static void regs_common(void) +{ + struct pt_regs before = {}, after = {}, expected = { + .rax = 0xc0ffe, + .rcx = 0xbad, + .rdx = 0xdead, + .r8 = 0x8, + .r9 = 0x9, + .r10 = 0x10, + .r11 = 0x11, + .rdi = 0x12, + .rsi = 0x13, + }; + LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); + struct test_uprobe *skel; + + skel = test_uprobe__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + 
skel->bss->my_pid = getpid(); + skel->bss->regs = expected; + + uprobe_opts.func_name = "uprobe_regs_change_trigger"; + skel->links.test_regs_change = bpf_program__attach_uprobe_opts(skel->progs.test_regs_change, + -1, + "/proc/self/exe", + 0 /* offset */, + &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.test_regs_change, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + uprobe_regs_change(&before, &after); + + ASSERT_EQ(after.rax, expected.rax, "ax"); + ASSERT_EQ(after.rcx, expected.rcx, "cx"); + ASSERT_EQ(after.rdx, expected.rdx, "dx"); + ASSERT_EQ(after.r8, expected.r8, "r8"); + ASSERT_EQ(after.r9, expected.r9, "r9"); + ASSERT_EQ(after.r10, expected.r10, "r10"); + ASSERT_EQ(after.r11, expected.r11, "r11"); + ASSERT_EQ(after.rdi, expected.rdi, "rdi"); + ASSERT_EQ(after.rsi, expected.rsi, "rsi"); + +cleanup: + test_uprobe__destroy(skel); +} + +static void test_uprobe_regs_change(void) +{ + if (test__start_subtest("regs_change_common")) + regs_common(); +} +#else +static void test_uprobe_regs_change(void) { } +#endif + +void test_uprobe(void) +{ + if (test__start_subtest("attach")) + test_uprobe_attach(); + test_uprobe_regs_change(); +} diff --git a/tools/testing/selftests/bpf/progs/test_uprobe.c b/tools/testing/selftests/bpf/progs/test_uprobe.c index 896c88a4960df2..9437bd76a437c6 100644 --- a/tools/testing/selftests/bpf/progs/test_uprobe.c +++ b/tools/testing/selftests/bpf/progs/test_uprobe.c @@ -59,3 +59,27 @@ int BPF_UPROBE(test4) test4_result = 1; return 0; } + +#if defined(__TARGET_ARCH_x86) +struct pt_regs regs; + +SEC("uprobe") +int BPF_UPROBE(test_regs_change) +{ + pid_t pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + ctx->ax = regs.ax; + ctx->cx = regs.cx; + ctx->dx = regs.dx; + ctx->r8 = regs.r8; + ctx->r9 = regs.r9; + ctx->r10 = regs.r10; + ctx->r11 = regs.r11; + ctx->di = regs.di; + ctx->si = regs.si; + return 0; +} +#endif From 6a4ea0d1cb4408a7ca8ad0d12bd1d08a35838160 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 16 Sep 2025 23:52:59 +0200 Subject: [PATCH 2762/3206] selftests/bpf: Add uprobe context ip register change test Adding test to check we can change the application execution through instruction pointer change through uprobe program. It's x86_64 specific test. 
Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20250916215301.664963-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/uprobe.c | 42 +++++++++++++++++++ .../testing/selftests/bpf/progs/test_uprobe.c | 14 +++++++ 2 files changed, 56 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe.c b/tools/testing/selftests/bpf/prog_tests/uprobe.c index 19dd900df188a4..86404476c1dafe 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe.c @@ -190,10 +190,52 @@ static void regs_common(void) test_uprobe__destroy(skel); } +static noinline unsigned long uprobe_regs_change_ip_1(void) +{ + return 0xc0ffee; +} + +static noinline unsigned long uprobe_regs_change_ip_2(void) +{ + return 0xdeadbeef; +} + +static void regs_ip(void) +{ + LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); + struct test_uprobe *skel; + unsigned long ret; + + skel = test_uprobe__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->my_pid = getpid(); + skel->bss->ip = (unsigned long) uprobe_regs_change_ip_2; + + uprobe_opts.func_name = "uprobe_regs_change_ip_1"; + skel->links.test_regs_change_ip = bpf_program__attach_uprobe_opts( + skel->progs.test_regs_change_ip, + -1, + "/proc/self/exe", + 0 /* offset */, + &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.test_regs_change_ip, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + ret = uprobe_regs_change_ip_1(); + ASSERT_EQ(ret, 0xdeadbeef, "ret"); + +cleanup: + test_uprobe__destroy(skel); +} + static void test_uprobe_regs_change(void) { if (test__start_subtest("regs_change_common")) regs_common(); + if (test__start_subtest("regs_change_ip")) + regs_ip(); } #else static void test_uprobe_regs_change(void) { } diff --git a/tools/testing/selftests/bpf/progs/test_uprobe.c b/tools/testing/selftests/bpf/progs/test_uprobe.c index 9437bd76a437c6..12f4065fca2016 100644 --- a/tools/testing/selftests/bpf/progs/test_uprobe.c +++ b/tools/testing/selftests/bpf/progs/test_uprobe.c @@ -82,4 +82,18 @@ int BPF_UPROBE(test_regs_change) ctx->si = regs.si; return 0; } + +unsigned long ip; + +SEC("uprobe") +int BPF_UPROBE(test_regs_change_ip) +{ + pid_t pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + ctx->ip = ip; + return 0; +} #endif From 1b881ee294b2d8929a77b0e489047765d55f0191 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 16 Sep 2025 23:53:00 +0200 Subject: [PATCH 2763/3206] selftests/bpf: Add kprobe write ctx attach test Adding test to check we can't attach standard kprobe program that writes to the context. It's x86_64 specific test. 
Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20250916215301.664963-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/attach_probe.c | 28 +++++++++++++++++++ .../selftests/bpf/progs/kprobe_write_ctx.c | 15 ++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/kprobe_write_ctx.c diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index cabc51c2ca6bd5..9e77e5da7097c3 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -3,6 +3,7 @@ #include "test_attach_kprobe_sleepable.skel.h" #include "test_attach_probe_manual.skel.h" #include "test_attach_probe.skel.h" +#include "kprobe_write_ctx.skel.h" /* this is how USDT semaphore is actually defined, except volatile modifier */ volatile unsigned short uprobe_ref_ctr __attribute__((unused)) __attribute((section(".probes"))); @@ -201,6 +202,31 @@ static void test_attach_kprobe_long_event_name(void) test_attach_probe_manual__destroy(skel); } +#ifdef __x86_64__ +/* attach kprobe/kretprobe long event name testings */ +static void test_attach_kprobe_write_ctx(void) +{ + struct kprobe_write_ctx *skel = NULL; + struct bpf_link *link = NULL; + + skel = kprobe_write_ctx__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_write_ctx__open_and_load")) + return; + + link = bpf_program__attach_kprobe_opts(skel->progs.kprobe_write_ctx, + "bpf_fentry_test1", NULL); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_kprobe_opts")) + bpf_link__destroy(link); + + kprobe_write_ctx__destroy(skel); +} +#else +static void test_attach_kprobe_write_ctx(void) +{ + test__skip(); +} +#endif + static void test_attach_probe_auto(struct test_attach_probe *skel) { struct bpf_link *uprobe_err_link; @@ -406,6 +432,8 @@ void test_attach_probe(void) test_attach_uprobe_long_event_name(); if (test__start_subtest("kprobe-long_name")) test_attach_kprobe_long_event_name(); + if (test__start_subtest("kprobe-write-ctx")) + test_attach_kprobe_write_ctx(); cleanup: test_attach_probe__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c b/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c new file mode 100644 index 00000000000000..4621a5bef4e295 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +#if defined(__TARGET_ARCH_x86) +SEC("kprobe") +int kprobe_write_ctx(struct pt_regs *ctx) +{ + ctx->ax = 0; + return 0; +} +#endif From 3d237467a444d4f23b78ac72210fe5860cd7aed8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 16 Sep 2025 23:53:01 +0200 Subject: [PATCH 2764/3206] selftests/bpf: Add kprobe multi write ctx attach test Adding test to check we can't attach kprobe multi program that writes to the context. It's x86_64 specific test. 
Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20250916215301.664963-7-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/kprobe_multi_test.c | 27 +++++++++++++++++++ .../selftests/bpf/progs/kprobe_write_ctx.c | 7 +++++ 2 files changed, 34 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 171706e78da88c..6cfaa978bc9af2 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -7,6 +7,7 @@ #include "kprobe_multi_session.skel.h" #include "kprobe_multi_session_cookie.skel.h" #include "kprobe_multi_verifier.skel.h" +#include "kprobe_write_ctx.skel.h" #include "bpf/libbpf_internal.h" #include "bpf/hashmap.h" @@ -539,6 +540,30 @@ static void test_attach_override(void) kprobe_multi_override__destroy(skel); } +#ifdef __x86_64__ +static void test_attach_write_ctx(void) +{ + struct kprobe_write_ctx *skel = NULL; + struct bpf_link *link = NULL; + + skel = kprobe_write_ctx__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_write_ctx__open_and_load")) + return; + + link = bpf_program__attach_kprobe_opts(skel->progs.kprobe_multi_write_ctx, + "bpf_fentry_test1", NULL); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_kprobe_opts")) + bpf_link__destroy(link); + + kprobe_write_ctx__destroy(skel); +} +#else +static void test_attach_write_ctx(void) +{ + test__skip(); +} +#endif + void serial_test_kprobe_multi_bench_attach(void) { if (test__start_subtest("kernel")) @@ -578,5 +603,7 @@ void test_kprobe_multi_test(void) test_session_cookie_skel_api(); if (test__start_subtest("unique_match")) test_unique_match(); + if (test__start_subtest("attach_write_ctx")) + test_attach_write_ctx(); RUN_TESTS(kprobe_multi_verifier); } diff --git a/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c b/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c index 4621a5bef4e295..f77aef0474d3b6 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c +++ b/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c @@ -12,4 +12,11 @@ int kprobe_write_ctx(struct pt_regs *ctx) ctx->ax = 0; return 0; } + +SEC("kprobe.multi") +int kprobe_multi_write_ctx(struct pt_regs *ctx) +{ + ctx->ax = 0; + return 0; +} #endif From d4680a11e14c7baf683cb8453d91d71d2e0b9d3e Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 24 Sep 2025 10:14:26 +0200 Subject: [PATCH 2765/3206] bpf: Mark kfuncs as __noclone Some distributions (e.g., CachyOS) support building the kernel with -O3, but doing so may break kfuncs, resulting in their symbols not being properly exported. In fact, with gcc -O3, some kfuncs may be optimized away despite being annotated as noinline. This happens because gcc can still clone the function during IPA optimizations, e.g., by duplicating or inlining it into callers, and then dropping the standalone symbol. This breaks BTF ID resolution since resolve_btfids relies on the presence of a global symbol for each kfunc. Currently, this is not an issue for upstream, because we don't allow building the kernel with -O3, but it may be safer to address it anyway, to prevent potential issues in the future if compilers become more aggressive with optimizations. Therefore, add __noclone to __bpf_kfunc to ensure kfuncs are never cloned and remain distinct, globally visible symbols, regardless of the optimization level. 
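For illustration (a sketch with a hypothetical kfunc name), the strengthened annotation keeps each kfunc as a single, globally visible symbol:

	/* __bpf_kfunc now expands to: __used __retain __noclone noinline */
	__bpf_kfunc int bpf_example_kfunc(int x)
	{
		/* without __noclone, gcc -O3 may keep only an IPA clone
		 * (e.g. bpf_example_kfunc.constprop.0) and drop the
		 * standalone symbol that resolve_btfids relies on */
		return x + 1;
	}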
Fixes: 57e7c169cd6af ("bpf: Add __bpf_kfunc tag for marking kernel functions as kfuncs") Acked-by: David Vernet Acked-by: Yonghong Song Signed-off-by: Andrea Righi Link: https://lore.kernel.org/r/20250924081426.156934-1-arighi@nvidia.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 9eda6b113f9b48..f06976ffb63f94 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -86,7 +86,7 @@ * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ -#define __bpf_kfunc __used __retain noinline +#define __bpf_kfunc __used __retain __noclone noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ From 2084660ad288c998b6f0c885e266deb364f65fba Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Tue, 23 Sep 2025 14:31:36 -0700 Subject: [PATCH 2766/3206] perf/dwc_pcie: Fix use of uninitialized variable Fix use of uninitialized variable in group validation code. Fixes: 71396cfac97d ("perf/dwc_pcie: Support counting multiple lane events in parallel") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202509231223.gZsX6Eio-lkp@intel.com/ Signed-off-by: Ilkka Koskinen Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index d77f767cde8934..22f73ac894e959 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -402,7 +402,7 @@ static int dwc_pcie_pmu_validate_group(struct perf_event *event) { struct perf_event *sibling, *leader = event->group_leader; DECLARE_BITMAP(val_lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP); - bool time_event; + bool time_event = false; int type; type = DWC_PCIE_EVENT_TYPE(leader); From dc64b3d42cb361d4b39eb7cc73037fec52ef9676 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 Sep 2025 14:28:39 +0300 Subject: [PATCH 2767/3206] ASoC: codecs: wcd-common: fix signedness bug in wcd_dt_parse_micbias_info() The error handling does not work because common->micb_vout[] is an array of u32. We need a signed variable to store negative error codes. 
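The bug class, reduced to a sketch with illustrative names (not the driver's actual logic): a local assigned only on some paths is read unconditionally afterwards, so the read is undefined whenever no assignment ran. Initializing at declaration covers every path:

	static bool group_has_time_event(struct perf_event *leader)
	{
		bool time_event = false;	/* the fix: initialize up front */
		struct perf_event *sibling;

		for_each_sibling_event(sibling, leader)
			if (event_is_time_based(sibling))	/* hypothetical helper */
				time_event = true;

		/* read garbage before the fix if no sibling matched */
		return time_event;
	}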
Fixes: 4f16b6351bbf ("ASoC: codecs: wcd: add common helper for wcd codecs") Signed-off-by: Dan Carpenter Reviewed-by: Srinivas Kandagatla Link: https://patch.msgid.link/aNKEZ3VqJ8js208v@stanley.mountain Signed-off-by: Mark Brown --- sound/soc/codecs/wcd-common.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/wcd-common.c b/sound/soc/codecs/wcd-common.c index 9bbfda82837794..9016e974582f55 100644 --- a/sound/soc/codecs/wcd-common.c +++ b/sound/soc/codecs/wcd-common.c @@ -62,12 +62,13 @@ static int wcd_get_micbias_val(struct device *dev, int micb_num, u32 *micb_mv) int wcd_dt_parse_micbias_info(struct wcd_common *common) { - int i; + int ret, i; for (i = 0; i < common->max_bias; i++) { - common->micb_vout[i] = wcd_get_micbias_val(common->dev, i + 1, &common->micb_mv[i]); - if (common->micb_vout[i] < 0) - return -EINVAL; + ret = wcd_get_micbias_val(common->dev, i + 1, &common->micb_mv[i]); + if (ret < 0) + return ret; + common->micb_vout[i] = ret; } return 0; From 64f89f6e1f2b7f8f203d218a8c8d90922e1d4048 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 17 Sep 2025 10:54:05 +0200 Subject: [PATCH 2768/3206] gpio: generic: rename BGPIOF_ flags to GPIO_GENERIC_ Make the flags passed to gpio_generic_chip_init() use the same prefix as the rest of the modernized generic GPIO chip API. Link: https://lore.kernel.org/r/20250917-gpio-generic-flags-v1-1-69f51fee8c89@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-amdpt.c | 2 +- drivers/gpio/gpio-brcmstb.c | 2 +- drivers/gpio/gpio-cadence.c | 2 +- drivers/gpio/gpio-ge.c | 2 +- drivers/gpio/gpio-grgpio.c | 2 +- drivers/gpio/gpio-hisi.c | 3 ++- drivers/gpio/gpio-hlwd.c | 2 +- drivers/gpio/gpio-ixp4xx.c | 2 +- drivers/gpio/gpio-mmio.c | 28 +++++++++++------------ drivers/gpio/gpio-mpc8xxx.c | 4 ++-- drivers/gpio/gpio-mt7621.c | 2 +- drivers/gpio/gpio-mxc.c | 2 +- drivers/gpio/gpio-rda.c | 2 +- drivers/gpio/gpio-realtek-otto.c | 2 +- drivers/gpio/gpio-sifive.c | 2 +- drivers/gpio/gpio-spacemit-k1.c | 3 ++- drivers/gpio/gpio-vf610.c | 4 ++-- drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c | 2 +- drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c | 2 +- drivers/pinctrl/nuvoton/pinctrl-wpcm450.c | 2 +- drivers/pinctrl/stm32/pinctrl-stm32-hdp.c | 2 +- include/linux/gpio/driver.h | 18 +++++++-------- 22 files changed, 47 insertions(+), 45 deletions(-) diff --git a/drivers/gpio/gpio-amdpt.c b/drivers/gpio/gpio-amdpt.c index bbaf42307bc3d7..8458a6949c65d3 100644 --- a/drivers/gpio/gpio-amdpt.c +++ b/drivers/gpio/gpio-amdpt.c @@ -94,7 +94,7 @@ static int pt_gpio_probe(struct platform_device *pdev) .dat = pt_gpio->reg_base + PT_INPUTDATA_REG, .set = pt_gpio->reg_base + PT_OUTPUTDATA_REG, .dirout = pt_gpio->reg_base + PT_DIRECTION_REG, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&pt_gpio->chip, &config); diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c index be3ff916e134a6..f40c9472588bc7 100644 --- a/drivers/gpio/gpio-brcmstb.c +++ b/drivers/gpio/gpio-brcmstb.c @@ -630,7 +630,7 @@ static int brcmstb_gpio_probe(struct platform_device *pdev) * else leave I/O in little endian mode. 
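The underlying pitfall, as a minimal sketch (helper name is hypothetical): a negative errno stored in an unsigned type can never compare as less than zero, so the error check becomes dead code.

	u32 val;	/* unsigned: (val < 0) is always false */
	int ret;	/* signed: can carry -EINVAL and friends */

	ret = read_micbias_value();	/* returns a value or -errno */
	if (ret < 0)
		return ret;	/* reachable only with a signed type */
	val = ret;		/* safe: ret is known non-negative here */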
*/ #if defined(CONFIG_MIPS) && defined(__BIG_ENDIAN) - flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; #endif of_property_for_each_u32(np, "brcm,gpio-bank-widths", bank_width) { diff --git a/drivers/gpio/gpio-cadence.c b/drivers/gpio/gpio-cadence.c index c647953521c716..b75734ca22dd73 100644 --- a/drivers/gpio/gpio-cadence.c +++ b/drivers/gpio/gpio-cadence.c @@ -181,7 +181,7 @@ static int cdns_gpio_probe(struct platform_device *pdev) config.dat = cgpio->regs + CDNS_GPIO_INPUT_VALUE; config.set = cgpio->regs + CDNS_GPIO_OUTPUT_VALUE; config.dirin = cgpio->regs + CDNS_GPIO_DIRECTION_MODE; - config.flags = BGPIOF_READ_OUTPUT_REG_SET; + config.flags = GPIO_GENERIC_READ_OUTPUT_REG_SET; ret = gpio_generic_chip_init(&cgpio->gen_gc, &config); if (ret) { diff --git a/drivers/gpio/gpio-ge.c b/drivers/gpio/gpio-ge.c index b5cbf27b8f4422..66bdff36eb615e 100644 --- a/drivers/gpio/gpio-ge.c +++ b/drivers/gpio/gpio-ge.c @@ -73,7 +73,7 @@ static int __init gef_gpio_probe(struct platform_device *pdev) .dat = regs + GEF_GPIO_IN, .set = regs + GEF_GPIO_OUT, .dirin = regs + GEF_GPIO_DIRECT, - .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, }; ret = gpio_generic_chip_init(chip, &config); diff --git a/drivers/gpio/gpio-grgpio.c b/drivers/gpio/gpio-grgpio.c index 5930f4c6f2b578..0c0f97fa14fc9d 100644 --- a/drivers/gpio/gpio-grgpio.c +++ b/drivers/gpio/gpio-grgpio.c @@ -359,7 +359,7 @@ static int grgpio_probe(struct platform_device *ofdev) .dat = regs + GRGPIO_DATA, .set = regs + GRGPIO_OUTPUT, .dirout = regs + GRGPIO_DIR, - .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, }; gc = &priv->chip.gc; diff --git a/drivers/gpio/gpio-hisi.c b/drivers/gpio/gpio-hisi.c index d8c4ab02ceaef7..d26298c8351b71 100644 --- a/drivers/gpio/gpio-hisi.c +++ b/drivers/gpio/gpio-hisi.c @@ -300,7 +300,8 @@ static int hisi_gpio_probe(struct platform_device *pdev) .clr = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_CLR_WX, .dirout = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_SET_WX, .dirin = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_CLR_WX, - .flags = BGPIOF_NO_SET_ON_INPUT | BGPIOF_UNREADABLE_REG_DIR, + .flags = GPIO_GENERIC_NO_SET_ON_INPUT | + GPIO_GENERIC_UNREADABLE_REG_DIR, }; ret = gpio_generic_chip_init(&hisi_gpio->chip, &config); diff --git a/drivers/gpio/gpio-hlwd.c b/drivers/gpio/gpio-hlwd.c index a395f87436ac4d..043ce5ef3b07e9 100644 --- a/drivers/gpio/gpio-hlwd.c +++ b/drivers/gpio/gpio-hlwd.c @@ -253,7 +253,7 @@ static int hlwd_gpio_probe(struct platform_device *pdev) .dat = hlwd->regs + HW_GPIOB_IN, .set = hlwd->regs + HW_GPIOB_OUT, .dirout = hlwd->regs + HW_GPIOB_DIR, - .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, }; res = gpio_generic_chip_init(&hlwd->gpioc, &config); diff --git a/drivers/gpio/gpio-ixp4xx.c b/drivers/gpio/gpio-ixp4xx.c index 8a3b6b192288c8..f34d87869c8b04 100644 --- a/drivers/gpio/gpio-ixp4xx.c +++ b/drivers/gpio/gpio-ixp4xx.c @@ -289,7 +289,7 @@ static int ixp4xx_gpio_probe(struct platform_device *pdev) * for big endian. 
*/ #if defined(CONFIG_CPU_BIG_ENDIAN) - flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; #else flags = 0; #endif diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index a3df14d672a92a..7d6dd36cf1aeff 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -554,7 +554,7 @@ static int bgpio_setup_io(struct gpio_generic_chip *chip, chip->reg_set = cfg->set; gc->set = bgpio_set_set; gc->set_multiple = bgpio_set_multiple_set; - } else if (cfg->flags & BGPIOF_NO_OUTPUT) { + } else if (cfg->flags & GPIO_GENERIC_NO_OUTPUT) { gc->set = bgpio_set_none; gc->set_multiple = NULL; } else { @@ -562,8 +562,8 @@ static int bgpio_setup_io(struct gpio_generic_chip *chip, gc->set_multiple = bgpio_set_multiple; } - if (!(cfg->flags & BGPIOF_UNREADABLE_REG_SET) && - (cfg->flags & BGPIOF_READ_OUTPUT_REG_SET)) { + if (!(cfg->flags & GPIO_GENERIC_UNREADABLE_REG_SET) && + (cfg->flags & GPIO_GENERIC_READ_OUTPUT_REG_SET)) { gc->get = bgpio_get_set; if (!chip->be_bits) gc->get_multiple = bgpio_get_set_multiple; @@ -593,19 +593,19 @@ static int bgpio_setup_direction(struct gpio_generic_chip *chip, if (cfg->dirout || cfg->dirin) { chip->reg_dir_out = cfg->dirout; chip->reg_dir_in = cfg->dirin; - if (cfg->flags & BGPIOF_NO_SET_ON_INPUT) + if (cfg->flags & GPIO_GENERIC_NO_SET_ON_INPUT) gc->direction_output = bgpio_dir_out_dir_first; else gc->direction_output = bgpio_dir_out_val_first; gc->direction_input = bgpio_dir_in; gc->get_direction = bgpio_get_dir; } else { - if (cfg->flags & BGPIOF_NO_OUTPUT) + if (cfg->flags & GPIO_GENERIC_NO_OUTPUT) gc->direction_output = bgpio_dir_out_err; else gc->direction_output = bgpio_simple_dir_out; - if (cfg->flags & BGPIOF_NO_INPUT) + if (cfg->flags & GPIO_GENERIC_NO_INPUT) gc->direction_input = bgpio_dir_in_err; else gc->direction_input = bgpio_simple_dir_in; @@ -654,7 +654,7 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, gc->label = dev_name(dev); gc->base = -1; gc->request = bgpio_request; - chip->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); + chip->be_bits = !!(flags & GPIO_GENERIC_BIG_ENDIAN); ret = gpiochip_get_ngpios(gc, dev); if (ret) @@ -665,7 +665,7 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, return ret; ret = bgpio_setup_accessors(dev, chip, - flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); + flags & GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER); if (ret) return ret; @@ -673,7 +673,7 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, if (ret) return ret; - if (flags & BGPIOF_PINCTRL_BACKEND) { + if (flags & GPIO_GENERIC_PINCTRL_BACKEND) { chip->pinctrl = true; /* Currently this callback is only used for pincontrol */ gc->free = gpiochip_generic_free; @@ -681,17 +681,17 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, chip->sdata = chip->read_reg(chip->reg_dat); if (gc->set == bgpio_set_set && - !(flags & BGPIOF_UNREADABLE_REG_SET)) + !(flags & GPIO_GENERIC_UNREADABLE_REG_SET)) chip->sdata = chip->read_reg(chip->reg_set); - if (flags & BGPIOF_UNREADABLE_REG_DIR) + if (flags & GPIO_GENERIC_UNREADABLE_REG_DIR) chip->dir_unreadable = true; /* * Inspect hardware to find initial direction setting. 
*/ if ((chip->reg_dir_out || chip->reg_dir_in) && - !(flags & BGPIOF_UNREADABLE_REG_DIR)) { + !(flags & GPIO_GENERIC_UNREADABLE_REG_DIR)) { if (chip->reg_dir_out) chip->sdir = chip->read_reg(chip->reg_dir_out); else if (chip->reg_dir_in) @@ -787,10 +787,10 @@ static int bgpio_pdev_probe(struct platform_device *pdev) return -ENOMEM; if (device_is_big_endian(dev)) - flags |= BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags |= GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; if (device_property_read_bool(dev, "no-output")) - flags |= BGPIOF_NO_OUTPUT; + flags |= GPIO_GENERIC_NO_OUTPUT; config = (struct gpio_generic_chip_config) { .dev = dev, diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index a2a83afb41bbb9..bfe828734ee1ba 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -350,13 +350,13 @@ static int mpc8xxx_probe(struct platform_device *pdev) .sz = 4, .dat = mpc8xxx_gc->regs + GPIO_DAT, .dirout = mpc8xxx_gc->regs + GPIO_DIR, - .flags = BGPIOF_BIG_ENDIAN + .flags = GPIO_GENERIC_BIG_ENDIAN }; if (device_property_read_bool(dev, "little-endian")) { dev_dbg(dev, "GPIO registers are LITTLE endian\n"); } else { - config.flags |= BGPIOF_BIG_ENDIAN_BYTE_ORDER; + config.flags |= GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; dev_dbg(dev, "GPIO registers are BIG endian\n"); } diff --git a/drivers/gpio/gpio-mt7621.c b/drivers/gpio/gpio-mt7621.c index e7bb9b2cd6cf32..91230be5158790 100644 --- a/drivers/gpio/gpio-mt7621.c +++ b/drivers/gpio/gpio-mt7621.c @@ -242,7 +242,7 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) .set = set, .clr = ctrl, .dirout = diro, - .flags = BGPIOF_NO_SET_ON_INPUT, + .flags = GPIO_GENERIC_NO_SET_ON_INPUT, }; ret = gpio_generic_chip_init(&rg->chip, &config); diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c index 433cbadc3a4cc6..52060b3ec7458a 100644 --- a/drivers/gpio/gpio-mxc.c +++ b/drivers/gpio/gpio-mxc.c @@ -481,7 +481,7 @@ static int mxc_gpio_probe(struct platform_device *pdev) config.dat = port->base + GPIO_PSR; config.set = port->base + GPIO_DR; config.dirout = port->base + GPIO_GDIR; - config.flags = BGPIOF_READ_OUTPUT_REG_SET; + config.flags = GPIO_GENERIC_READ_OUTPUT_REG_SET; err = gpio_generic_chip_init(&port->gen_gc, &config); if (err) diff --git a/drivers/gpio/gpio-rda.c b/drivers/gpio/gpio-rda.c index fb479d13eb01a4..7bbc6f0ce4c8a7 100644 --- a/drivers/gpio/gpio-rda.c +++ b/drivers/gpio/gpio-rda.c @@ -245,7 +245,7 @@ static int rda_gpio_probe(struct platform_device *pdev) .clr = rda_gpio->base + RDA_GPIO_CLR, .dirout = rda_gpio->base + RDA_GPIO_OEN_SET_OUT, .dirin = rda_gpio->base + RDA_GPIO_OEN_SET_IN, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&rda_gpio->chip, &config); diff --git a/drivers/gpio/gpio-realtek-otto.c b/drivers/gpio/gpio-realtek-otto.c index 37b4f73771e651..de527f4fc6c2ad 100644 --- a/drivers/gpio/gpio-realtek-otto.c +++ b/drivers/gpio/gpio-realtek-otto.c @@ -395,7 +395,7 @@ static int realtek_gpio_probe(struct platform_device *pdev) ctrl->bank_write = realtek_gpio_bank_write; ctrl->line_imr_pos = realtek_gpio_line_imr_pos; } else { - gen_gc_flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + gen_gc_flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; ctrl->bank_read = realtek_gpio_bank_read_swapped; ctrl->bank_write = realtek_gpio_bank_write_swapped; ctrl->line_imr_pos = realtek_gpio_line_imr_pos_swapped; diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 2ced87ffd3bbf2..94ef2efbd14f57 100644 --- a/drivers/gpio/gpio-sifive.c +++ 
b/drivers/gpio/gpio-sifive.c @@ -223,7 +223,7 @@ static int sifive_gpio_probe(struct platform_device *pdev) .set = chip->base + SIFIVE_GPIO_OUTPUT_VAL, .dirout = chip->base + SIFIVE_GPIO_OUTPUT_EN, .dirin = chip->base + SIFIVE_GPIO_INPUT_EN, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&chip->gen_gc, &config); diff --git a/drivers/gpio/gpio-spacemit-k1.c b/drivers/gpio/gpio-spacemit-k1.c index a0af23f732819b..eb66a15c002fc3 100644 --- a/drivers/gpio/gpio-spacemit-k1.c +++ b/drivers/gpio/gpio-spacemit-k1.c @@ -197,7 +197,8 @@ static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, .clr = clr, .dirout = dirout, .dirin = dirin, - .flags = BGPIOF_UNREADABLE_REG_SET | BGPIOF_UNREADABLE_REG_DIR, + .flags = GPIO_GENERIC_UNREADABLE_REG_SET | + GPIO_GENERIC_UNREADABLE_REG_DIR, }; /* This registers 32 GPIO lines per bank */ diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index f3590db72b1412..aa8586d8a787f0 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -296,14 +296,14 @@ static int vf610_gpio_probe(struct platform_device *pdev) } gc = &port->chip.gc; - flags = BGPIOF_PINCTRL_BACKEND; + flags = GPIO_GENERIC_PINCTRL_BACKEND; /* * We only read the output register for current value on output * lines if the direction register is available so we can switch * direction. */ if (port->sdata->have_paddr) - flags |= BGPIOF_READ_OUTPUT_REG_SET; + flags |= GPIO_GENERIC_READ_OUTPUT_REG_SET; config = (struct gpio_generic_chip_config) { .dev = dev, diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c index c2ca71ebb9736d..10765f19b48b61 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c @@ -1842,7 +1842,7 @@ static int npcm7xx_gpio_of(struct npcm7xx_pinctrl *pctrl) .dat = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DIN, .set = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DOUT, .dirin = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_IEM, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&pctrl->gpio_bank[id].chip, &config); diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c index 0f155a685bbae7..1005b464a46964 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c @@ -2335,7 +2335,7 @@ static int npcm8xx_gpio_fw(struct npcm8xx_pinctrl *pctrl) .dat = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DIN, .set = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DOUT, .dirin = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_IEM, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&pctrl->gpio_bank[id].chip, &config); diff --git a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c index 4dd8a3daa83e44..c575949e42e67b 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c +++ b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c @@ -1061,7 +1061,7 @@ static int wpcm450_gpio_register(struct platform_device *pdev, set = pctrl->gpio_base + bank->dataout; dirout = pctrl->gpio_base + bank->cfg0; } else { - flags = BGPIOF_NO_OUTPUT; + flags = GPIO_GENERIC_NO_OUTPUT; } config = (typeof(config)){ diff --git a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c index dea49b9aabf2ae..971959a75b0c29 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c +++ 
b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c @@ -648,7 +648,7 @@ static int stm32_hdp_probe(struct platform_device *pdev) .dat = hdp->base + HDP_GPOVAL, .set = hdp->base + HDP_GPOSET, .clr = hdp->base + HDP_GPOCLR, - .flags = BGPIOF_NO_INPUT, + .flags = GPIO_GENERIC_NO_INPUT, }; err = gpio_generic_chip_init(&hdp->gpio_chip, &config); diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 9b14fd20f13eee..e62622e42cad37 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -684,15 +684,15 @@ int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -#define BGPIOF_BIG_ENDIAN BIT(0) -#define BGPIOF_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ -#define BGPIOF_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ -#define BGPIOF_BIG_ENDIAN_BYTE_ORDER BIT(3) -#define BGPIOF_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ -#define BGPIOF_NO_OUTPUT BIT(5) /* only input */ -#define BGPIOF_NO_SET_ON_INPUT BIT(6) -#define BGPIOF_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ -#define BGPIOF_NO_INPUT BIT(8) /* only output */ +#define GPIO_GENERIC_BIG_ENDIAN BIT(0) +#define GPIO_GENERIC_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ +#define GPIO_GENERIC_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ +#define GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER BIT(3) +#define GPIO_GENERIC_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ +#define GPIO_GENERIC_NO_OUTPUT BIT(5) /* only input */ +#define GPIO_GENERIC_NO_SET_ON_INPUT BIT(6) +#define GPIO_GENERIC_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ +#define GPIO_GENERIC_NO_INPUT BIT(8) /* only output */ #ifdef CONFIG_GPIOLIB_IRQCHIP int gpiochip_irqchip_add_domain(struct gpio_chip *gc, From 2235b26c1b25daf253748acff501af3ea85faaa8 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 17 Sep 2025 10:54:06 +0200 Subject: [PATCH 2769/3206] gpio: generic: move GPIO_GENERIC_ flags to the correct header These flags are specific to gpio-mmio and belong in linux/gpio/generic.h so move them there. 
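For drivers nothing changes beyond where the flags come from; a minimal
user of the generic-chip API then looks roughly like this (a sketch:
foo_gpio_probe(), the FOO_* register offsets and the base/chip locals
are made up for illustration, while the config fields and
gpio_generic_chip_init() are the real interface):

	#include <linux/gpio/generic.h>

	static int foo_gpio_probe(struct platform_device *pdev)
	{
		struct gpio_generic_chip_config config = {
			.dev = &pdev->dev,
			.sz = 4,
			.dat = base + FOO_DAT,
			.set = base + FOO_SET,
			.dirout = base + FOO_DIR,
			.flags = GPIO_GENERIC_READ_OUTPUT_REG_SET,
		};

		/* the generic core wires up the get/set/direction callbacks */
		return gpio_generic_chip_init(&chip->gen_gc, &config);
	}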
Link: https://lore.kernel.org/r/20250917-gpio-generic-flags-v1-2-69f51fee8c89@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 10 ---------- include/linux/gpio/generic.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index e62622e42cad37..fabe2baf7b5090 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -684,16 +684,6 @@ int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -#define GPIO_GENERIC_BIG_ENDIAN BIT(0) -#define GPIO_GENERIC_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ -#define GPIO_GENERIC_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ -#define GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER BIT(3) -#define GPIO_GENERIC_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ -#define GPIO_GENERIC_NO_OUTPUT BIT(5) /* only input */ -#define GPIO_GENERIC_NO_SET_ON_INPUT BIT(6) -#define GPIO_GENERIC_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ -#define GPIO_GENERIC_NO_INPUT BIT(8) /* only output */ - #ifdef CONFIG_GPIOLIB_IRQCHIP int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain); diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index 162430d96660e9..ff566dc9c3cbed 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -9,6 +9,16 @@ struct device; +#define GPIO_GENERIC_BIG_ENDIAN BIT(0) +#define GPIO_GENERIC_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ +#define GPIO_GENERIC_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ +#define GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER BIT(3) +#define GPIO_GENERIC_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ +#define GPIO_GENERIC_NO_OUTPUT BIT(5) /* only input */ +#define GPIO_GENERIC_NO_SET_ON_INPUT BIT(6) +#define GPIO_GENERIC_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ +#define GPIO_GENERIC_NO_INPUT BIT(8) /* only output */ + /** * struct gpio_generic_chip_config - Generic GPIO chip configuration data * @dev: Parent device of the new GPIO chip (compulsory). From 8ec6a8ec23b9529d6203cab50a22fab3a5fd0d80 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Wed, 24 Sep 2025 22:11:40 +0900 Subject: [PATCH 2770/3206] firewire: core: suppress overflow warning when computing jiffies from isochronous cycle The multiplication by USEC_PER_SEC (=1000000L) may trigger an overflow warning with 32 bit storage. In the case of the subsystem the input value ranges between 800 and 16000, thus the result always fits within 32 bit storage. This commit suppresses the warning by using widening conversion to 64 bit storage before multiplication, then using narrowing conversion to 32 bit storage. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509170136.b5ZHaNAV-lkp@intel.com/ Fixes: 379b870c28c6 ("firewire: core: use helper macros instead of direct access to HZ") Link: https://lore.kernel.org/r/20250924131140.261686-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 7f2ca93406cedf..2dd715a580ac8b 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -30,7 +30,7 @@ struct fw_packet; // This is the arbitrary value we use to indicate a mismatched gap count. 
 #define GAP_COUNT_MISMATCHED 0
 
-#define isoc_cycles_to_jiffies(cycles) usecs_to_jiffies(cycles * USEC_PER_SEC / 8000)
+#define isoc_cycles_to_jiffies(cycles) usecs_to_jiffies((u32)((u64)(cycles) * USEC_PER_SEC / 8000))
 
 extern __printf(2, 3)
 void fw_err(const struct fw_card *card, const char *fmt, ...);

From a6176b7b2a025f050740887238a7fbd3916ab6b5 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto
Date: Wed, 24 Sep 2025 22:18:22 +0900
Subject: [PATCH 2771/3206] Revert "firewire: core: shrink critical section of fw_card spinlock in bm_work"

This reverts commit 582310376d6e9a8d261b682178713cdc4b251af6.

The bus manager work has a race condition against fw_destroy_nodes()
called by fw_core_remove_card(). The acquisition of the spin lock of
fw_card is left as-is again.

Link: https://lore.kernel.org/r/20250924131823.262136-2-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto
---
 drivers/firewire/core-card.c | 38 ++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 4a5459696093a1..e5e0174a0335c2 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -299,6 +299,7 @@ enum bm_contention_outcome {
 };
 
 static enum bm_contention_outcome contend_for_bm(struct fw_card *card)
+__must_hold(&card->lock)
 {
 	int generation = card->generation;
 	int local_id = card->local_node->node_id;
@@ -311,8 +312,11 @@ static enum bm_contention_outcome contend_for_bm(struct fw_card *card)
 	bool keep_this_irm = false;
 	struct fw_node *irm_node;
 	struct fw_device *irm_device;
+	int irm_node_id;
 	int rcode;
 
+	lockdep_assert_held(&card->lock);
+
 	if (!grace) {
 		if (!is_next_generation(generation, card->bm_generation) || card->bm_abdicate)
 			return BM_CONTENTION_OUTCOME_WITHIN_WINDOW;
@@ -338,10 +342,16 @@ static enum bm_contention_outcome contend_for_bm(struct fw_card *card)
 		return BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY;
 	}
 
-	rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, irm_node->node_id, generation,
+	irm_node_id = irm_node->node_id;
+
+	spin_unlock_irq(&card->lock);
+
+	rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, irm_node_id, generation,
 				   SCODE_100, CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, data, sizeof(data));
 
+	spin_lock_irq(&card->lock);
+
 	switch (rcode) {
 	case RCODE_GENERATION:
 		return BM_CONTENTION_OUTCOME_AT_NEW_GENERATION;
@@ -352,12 +362,10 @@ static enum bm_contention_outcome contend_for_bm(struct fw_card *card)
 		int bm_id = be32_to_cpu(data[0]);
 
 		// Used by cdev layer for "struct fw_cdev_event_bus_reset".
- scoped_guard(spinlock, &card->lock) { - if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) - card->bm_node_id = 0xffc0 & bm_id; - else - card->bm_node_id = local_id; - } + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) + card->bm_node_id = 0xffc0 & bm_id; + else + card->bm_node_id = local_id; if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) return BM_CONTENTION_OUTCOME_IRM_HOLDS_ANOTHER_NODE_AS_BM; @@ -389,8 +397,12 @@ static void bm_work(struct work_struct *work) int expected_gap_count, generation; bool stand_for_root = false; - if (card->local_node == NULL) + spin_lock_irq(&card->lock); + + if (card->local_node == NULL) { + spin_unlock_irq(&card->lock); return; + } generation = card->generation; @@ -405,6 +417,7 @@ static void bm_work(struct work_struct *work) switch (result) { case BM_CONTENTION_OUTCOME_WITHIN_WINDOW: + spin_unlock_irq(&card->lock); fw_schedule_bm_work(card, msecs_to_jiffies(125)); return; case BM_CONTENTION_OUTCOME_IRM_HAS_LINK_OFF: @@ -415,10 +428,12 @@ static void bm_work(struct work_struct *work) break; case BM_CONTENTION_OUTCOME_AT_NEW_GENERATION: // BM work has been rescheduled. + spin_unlock_irq(&card->lock); return; case BM_CONTENTION_OUTCOME_LOCAL_PROBLEM_AT_TRANSACTION: // Let's try again later and hope that the local problem has gone away by // then. + spin_unlock_irq(&card->lock); fw_schedule_bm_work(card, msecs_to_jiffies(125)); return; case BM_CONTENTION_OUTCOME_IRM_IS_NOT_CAPABLE_FOR_IRM: @@ -428,7 +443,9 @@ static void bm_work(struct work_struct *work) case BM_CONTENTION_OUTCOME_IRM_HOLDS_ANOTHER_NODE_AS_BM: if (local_id == irm_id) { // Only acts as IRM. + spin_unlock_irq(&card->lock); allocate_broadcast_channel(card, generation); + spin_lock_irq(&card->lock); } fallthrough; case BM_CONTENTION_OUTCOME_IRM_HOLDS_LOCAL_NODE_AS_BM: @@ -469,6 +486,7 @@ static void bm_work(struct work_struct *work) if (!root_device_is_running) { // If we haven't probed this device yet, bail out now // and let's try again once that's done. + spin_unlock_irq(&card->lock); return; } else if (!root_device->cmc) { // Current root has an active link layer and we @@ -504,6 +522,8 @@ static void bm_work(struct work_struct *work) if (card->bm_retries++ < 5 && (card->gap_count != expected_gap_count || new_root_id != root_id)) { int card_gap_count = card->gap_count; + spin_unlock_irq(&card->lock); + fw_notice(card, "phy config: new root=%x, gap_count=%d\n", new_root_id, expected_gap_count); fw_send_phy_config(card, new_root_id, generation, expected_gap_count); @@ -524,6 +544,8 @@ static void bm_work(struct work_struct *work) } else { struct fw_device *root_device = fw_node_get_device(root_node); + spin_unlock_irq(&card->lock); + if (root_device && root_device->cmc) { // Make sure that the cycle master sends cycle start packets. __be32 data = cpu_to_be32(CSR_STATE_BIT_CMSTR); From e216c49b3ebb0729c50870ebfb8b798376dc1edf Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Wed, 24 Sep 2025 22:18:23 +0900 Subject: [PATCH 2772/3206] Revert "firewire: core: disable bus management work temporarily during updating topology" This reverts commit abe7159125702c734e851bc0c52b51cd446298a5. The bus manager work item acquires the spin lock of fw_card again, thus no need to serialize it against fw_core_handle_bus_reset(). 
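With both reverts applied, the discipline in bm_work() and its helpers
is plain spin_lock_irq()/spin_unlock_irq() pairs again, with the lock
dropped only around sleeping operations; roughly (a simplified sketch of
the pattern restored by the previous revert, not the full function):

	spin_lock_irq(&card->lock);
	/* inspect and update fw_card state under the lock */
	spin_unlock_irq(&card->lock);

	rcode = fw_run_transaction(card, ...);	/* may sleep */

	spin_lock_irq(&card->lock);
	/* re-evaluate state; the generation may have moved on */
	spin_unlock_irq(&card->lock);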
Link: https://lore.kernel.org/r/20250924131823.262136-3-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto
---
 drivers/firewire/core-topology.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index 90b988035a2afa..2f73bcd5696f2b 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -460,14 +460,8 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation,
 {
 	struct fw_node *local_node;
 
-	might_sleep();
-
 	trace_bus_reset_handle(card->index, generation, node_id, bm_abdicate, self_ids, self_id_count);
 
-	// Disable bus management work during updating the cache of bus topology, since the work
-	// accesses to some members of fw_card.
-	disable_delayed_work_sync(&card->bm_work);
-
 	scoped_guard(spinlock, &card->lock) {
 		// If the selfID buffer is not the immediate successor of the
 		// previously processed one, we cannot reliably compare the
@@ -501,8 +495,6 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation,
 		}
 	}
 
-	enable_delayed_work(&card->bm_work);
-
 	fw_schedule_bm_work(card, 0);
 
 	// Just used by transaction layer.

From 45d78cd0bf2c40e74c31f70340484e20aae45b07 Mon Sep 17 00:00:00 2001
From: SungMin Park
Date: Wed, 17 Sep 2025 12:43:11 +0530
Subject: [PATCH 2773/3206] dt-bindings: timer: exynos4210-mct: Add compatible for ARTPEC-9 SoC

Add Axis ARTPEC-9 mct compatible to the bindings documentation.
The design for the timer is reused from previous Samsung SoCs like
exynos4210 and ARTPEC-8.

Signed-off-by: SungMin Park
Signed-off-by: Ravi Patel
Signed-off-by: Daniel Lezcano
Acked-by: Conor Dooley
Link: https://lore.kernel.org/r/20250917071311.1404-1-ravi.patel@samsung.com
---
 .../devicetree/bindings/timer/samsung,exynos4210-mct.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml
index 10578f54458115..a4b229e0e78aa7 100644
--- a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml
+++ b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml
@@ -26,6 +26,7 @@ properties:
       - items:
           - enum:
               - axis,artpec8-mct
+              - axis,artpec9-mct
               - google,gs101-mct
               - samsung,exynos2200-mct-peris
               - samsung,exynos3250-mct
@@ -131,6 +132,7 @@ allOf:
           contains:
             enum:
               - axis,artpec8-mct
+              - axis,artpec9-mct
               - google,gs101-mct
               - samsung,exynos2200-mct-peris
               - samsung,exynos5260-mct

From c539feff3c8f8c86213eee2b237410714712c326 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Wed, 24 Sep 2025 09:26:39 +0900
Subject: [PATCH 2774/3206] tracing: fprobe: Fix to remove recorded module addresses from filter

Even if there is a memory allocation failure in fprobe_addr_list_add(),
there is still a partial list of module addresses. So remove the
recorded addresses from the filter, if any exist. This also removes the
redundant ret local variable.
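A full list is now reported as -ENOSPC, so callers can tell it apart
from a fresh allocation failure; a sketch of the resulting contract:

	ret = fprobe_addr_list_add(&alist, addr);
	if (ret == -ENOSPC) {
		/*
		 * An earlier attempt to expand the list failed; the
		 * addresses recorded so far stay valid and are still
		 * removed from the filter afterwards.
		 */
	}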
Fixes: a3dc2983ca7b ("tracing: fprobe: Cleanup fprobe hash when module unloading") Signed-off-by: Masami Hiramatsu (Google) Cc: stable@vger.kernel.org Reviewed-by: Menglong Dong --- kernel/trace/fprobe.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index c8034dfc1070d6..5a807d62e76dce 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -428,8 +428,9 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad { unsigned long *addrs; - if (alist->index >= alist->size) - return -ENOMEM; + /* Previously we failed to expand the list. */ + if (alist->index == alist->size) + return -ENOSPC; alist->addrs[alist->index++] = addr; if (alist->index < alist->size) @@ -489,7 +490,7 @@ static int fprobe_module_callback(struct notifier_block *nb, for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); - if (alist.index < alist.size && alist.index > 0) + if (alist.index > 0) ftrace_set_filter_ips(&fprobe_graph_ops.ops, alist.addrs, alist.index, 1, 0); mutex_unlock(&fprobe_mutex); From 5671ce2a1fc6b4a16cff962423bc416b92cac3c8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Sep 2025 17:24:05 +0200 Subject: [PATCH 2775/3206] s390/mm: Use __GFP_ACCOUNT for user page table allocations Add missing kmemcg accounting of user page table allocations. Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/mm/pgalloc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d2f6f1f6d2fcb9..ad3e0f7f7fc1f9 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -16,9 +16,13 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; unsigned long *table; + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc(gfp, CRST_ALLOC_ORDER); if (!ptdesc) return NULL; table = ptdesc_to_virt(ptdesc); @@ -117,7 +121,7 @@ struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) struct ptdesc *ptdesc; u64 *table; - ptdesc = pagetable_alloc(GFP_KERNEL, 0); + ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, 0); if (ptdesc) { table = (u64 *)ptdesc_to_virt(ptdesc); __arch_set_page_dat(table, 1); @@ -136,10 +140,13 @@ void page_table_free_pgste(struct ptdesc *ptdesc) unsigned long *page_table_alloc(struct mm_struct *mm) { + gfp_t gfp = GFP_KERNEL_ACCOUNT; struct ptdesc *ptdesc; unsigned long *table; - ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(mm, ptdesc)) { From 7b80a23c0e33ae5a3ae68e0cf5b5a59e8a368c37 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Sep 2025 13:40:19 +0200 Subject: [PATCH 2776/3206] s390/bitops: Switch to generic fls(), fls64(), etc. Switch to generic fls(), fls64(), etc. which are implemented with __builtin_ctzl(), __builtin_clzl(). Those builtins are available for all supported compilers. Kernel image size is reduced by ~10kb (gcc 15.1.0 + defconfig). 
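The generic helpers reduce to compiler builtins; simplified from the
asm-generic builtin variants, they amount to:

	static inline unsigned long __ffs(unsigned long word)
	{
		return __builtin_ctzl(word);	/* undefined for word == 0 */
	}

	static inline unsigned long __fls(unsigned long word)
	{
		return BITS_PER_LONG - 1 - __builtin_clzl(word);
	}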
Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 56 +++------------------------------- 1 file changed, 5 insertions(+), 51 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index e643f24ebc05e5..8b9060c26c5261 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -180,17 +180,6 @@ static __always_inline __attribute_const__ unsigned long __flogr(unsigned long w } } -/** - * __ffs - find first bit in word. - * @word: The word to search - * - * Undefined if no bit exists, so code should check against 0 first. - */ -static __always_inline __flatten unsigned long __ffs(unsigned long word) -{ - return __flogr(-word & word) ^ (BITS_PER_LONG - 1); -} - /** * ffs - find first bit set * @word: the word to search @@ -205,48 +194,13 @@ static __always_inline __flatten int ffs(int word) return BITS_PER_LONG - __flogr(-val & val); } -/** - * __fls - find last (most-significant) set bit in a long word - * @word: the word to search - * - * Undefined if no set bit exists, so code should check against 0 first. - */ -static __always_inline __flatten unsigned long __fls(unsigned long word) -{ - return __flogr(word) ^ (BITS_PER_LONG - 1); -} - -/** - * fls64 - find last set bit in a 64-bit word - * @word: the word to search - * - * This is defined in a similar way as the libc and compiler builtin - * ffsll, but returns the position of the most significant set bit. - * - * fls64(value) returns 0 if value is 0 or the position of the last - * set bit if value is nonzero. The last (most significant) bit is - * at position 64. - */ -static __always_inline __flatten int fls64(unsigned long word) -{ - return BITS_PER_LONG - __flogr(word); -} - -/** - * fls - find last (most-significant) bit set - * @word: the word to search - * - * This is defined the same way as ffs. - * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. - */ -static __always_inline __flatten int fls(unsigned int word) -{ - return fls64(word); -} - +#include +#include +#include +#include +#include #include #include -#include #include #include #include From 6c4e0cb3d87ad63a30e05e7624a45a6f01240e70 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Sep 2025 13:40:20 +0200 Subject: [PATCH 2777/3206] s390/bitops: Switch to generic ffs() if supported by compiler Use generic ffs() / __builtin_ffs() if supported by the compiler. GCC 16 will have support for __builtin_ffs(). See gcc commit f50cff9766c5 ("s390: Implement clz and ctz for SI mode"). In the distant future when GCC 16 becomes the minimum supported version, this allows to get rid of the flogr inline assembly. Kernel image size is reduced by ~500 bytes (gcc 16 beta + defconfig). Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/Kconfig | 7 +++++++ arch/s390/include/asm/bitops.h | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf680c26a33cf7..22862ce7ec68d6 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -49,6 +49,13 @@ config KASAN_SHADOW_OFFSET depends on KASAN default 0x1C000000000000 +config CC_HAS_BUILTIN_FFS + def_bool !(CC_IS_GCC && GCC_VERSION < 160000) + help + GCC versions before 16.0.0 generate library calls to ffs() + for __builtin_ffs() even when __has_builtin(__builtin_ffs) + is true. 
+ config CC_ASM_FLAG_OUTPUT_BROKEN def_bool CC_IS_GCC && GCC_VERSION < 140200 help diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 8b9060c26c5261..1564dd3a5a826a 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -122,6 +122,8 @@ static inline bool test_bit_inv(unsigned long nr, return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); } +#ifndef CONFIG_CC_HAS_BUILTIN_FFS + /** * __flogr - find leftmost one * @word - The word to search @@ -194,6 +196,12 @@ static __always_inline __flatten int ffs(int word) return BITS_PER_LONG - __flogr(-val & val); } +#else /* CONFIG_CC_HAS_BUILTIN_FFS */ + +#include + +#endif /* CONFIG_CC_HAS_BUILTIN_FFS */ + #include #include #include From bca9b6633fb92ff37247442d08f3889a2e87881c Mon Sep 17 00:00:00 2001 From: Cosmo Chou Date: Tue, 16 Sep 2025 17:50:25 +0800 Subject: [PATCH 2778/3206] dt-bindings: trivial-devices: add mps,mp5998 Add dt-bindings for MPS mp5998 hot-swap controller. Signed-off-by: Cosmo Chou Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250916095026.800164-1-chou.cosmo@gmail.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/trivial-devices.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index edaba5cd8434fa..12cb7f64b98819 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -315,6 +315,8 @@ properties: - mps,mp5920 # Monolithic Power Systems Inc. multi-phase hot-swap controller mp5990 - mps,mp5990 + # Monolithic Power Systems Inc. multi-phase hot-swap controller mp5998 + - mps,mp5998 # Monolithic Power Systems Inc. digital step-down converter mp9941 - mps,mp9941 # Temperature sensor with integrated fan control From bef3c793542b132ab1dc19076ce4f91594b5fbdc Mon Sep 17 00:00:00 2001 From: Cosmo Chou Date: Tue, 16 Sep 2025 17:50:26 +0800 Subject: [PATCH 2779/3206] hwmon: (pmbus/mp5990) add support for MP5998 Add support for the MPS MP5998 hot-swap controller. Like MP5990, MP5998 uses EFUSE_CFG (0xC4) bit 9 to indicate linear/direct data format. Signed-off-by: Cosmo Chou Link: https://lore.kernel.org/r/20250916095026.800164-2-chou.cosmo@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/mp5990.rst | 30 +++++++++++++-- drivers/hwmon/pmbus/mp5990.c | 67 ++++++++++++++++++++++++++++++---- 2 files changed, 85 insertions(+), 12 deletions(-) diff --git a/Documentation/hwmon/mp5990.rst b/Documentation/hwmon/mp5990.rst index 6f2f0c099d449d..7fd536757ff2b2 100644 --- a/Documentation/hwmon/mp5990.rst +++ b/Documentation/hwmon/mp5990.rst @@ -9,9 +9,13 @@ Supported chips: Prefix: 'mp5990' - * Datasheet + Datasheet: Publicly available at the MPS website: https://www.monolithicpower.com/en/mp5990.html - Publicly available at the MPS website : https://www.monolithicpower.com/en/mp5990.html + * MPS MP5998 + + Prefix: 'mp5998' + + Datasheet: Not publicly available Author: @@ -21,7 +25,7 @@ Description ----------- This driver implements support for Monolithic Power Systems, Inc. (MPS) -MP5990 Hot-Swap Controller. +MP5990 and MP5998 Hot-Swap Controller. 
Device compliant with: @@ -53,7 +57,7 @@ The driver provides the following attributes for output voltage: **in2_alarm** -The driver provides the following attributes for output current: +The driver provides the following attributes for current: **curr1_input** @@ -63,6 +67,14 @@ The driver provides the following attributes for output current: **curr1_max** +**curr2_input** + +**curr2_label** + +**curr2_max** + +**curr2_max_alarm** + The driver provides the following attributes for input power: **power1_input** @@ -71,6 +83,16 @@ The driver provides the following attributes for input power: **power1_alarm** +The driver provides the following attributes for output power: + +**power2_input** + +**power2_label** + +**power2_max** + +**power2_max_alarm** + The driver provides the following attributes for temperature: **temp1_input** diff --git a/drivers/hwmon/pmbus/mp5990.c b/drivers/hwmon/pmbus/mp5990.c index 4ce381a394807e..9a4ee79712cfcf 100644 --- a/drivers/hwmon/pmbus/mp5990.c +++ b/drivers/hwmon/pmbus/mp5990.c @@ -8,6 +8,8 @@ #include #include "pmbus.h" +enum chips { mp5990, mp5998 }; + #define MP5990_EFUSE_CFG (0xC4) #define MP5990_VOUT_FORMAT BIT(9) @@ -110,10 +112,53 @@ static struct pmbus_driver_info mp5990_info = { .read_word_data = mp5990_read_word_data, }; +static struct pmbus_driver_info mp5998_info = { + .pages = 1, + .format[PSC_VOLTAGE_IN] = direct, + .format[PSC_VOLTAGE_OUT] = direct, + .format[PSC_CURRENT_IN] = direct, + .format[PSC_CURRENT_OUT] = direct, + .format[PSC_POWER] = direct, + .format[PSC_TEMPERATURE] = direct, + .m[PSC_VOLTAGE_IN] = 64, + .b[PSC_VOLTAGE_IN] = 0, + .R[PSC_VOLTAGE_IN] = 0, + .m[PSC_VOLTAGE_OUT] = 64, + .b[PSC_VOLTAGE_OUT] = 0, + .R[PSC_VOLTAGE_OUT] = 0, + .m[PSC_CURRENT_IN] = 16, + .b[PSC_CURRENT_IN] = 0, + .R[PSC_CURRENT_IN] = 0, + .m[PSC_CURRENT_OUT] = 16, + .b[PSC_CURRENT_OUT] = 0, + .R[PSC_CURRENT_OUT] = 0, + .m[PSC_POWER] = 2, + .b[PSC_POWER] = 0, + .R[PSC_POWER] = 0, + .m[PSC_TEMPERATURE] = 1, + .b[PSC_TEMPERATURE] = 0, + .R[PSC_TEMPERATURE] = 0, + .func[0] = + PMBUS_HAVE_VIN | PMBUS_HAVE_VOUT | PMBUS_HAVE_IOUT | + PMBUS_HAVE_IIN | PMBUS_HAVE_PIN | PMBUS_HAVE_POUT | + PMBUS_HAVE_TEMP | PMBUS_HAVE_STATUS_IOUT | + PMBUS_HAVE_STATUS_INPUT | PMBUS_HAVE_STATUS_TEMP, + .read_byte_data = mp5990_read_byte_data, + .read_word_data = mp5990_read_word_data, +}; + +static const struct i2c_device_id mp5990_id[] = { + {"mp5990", mp5990}, + {"mp5998", mp5998}, + { } +}; +MODULE_DEVICE_TABLE(i2c, mp5990_id); + static int mp5990_probe(struct i2c_client *client) { struct pmbus_driver_info *info; struct mp5990_data *data; + enum chips chip; int ret; data = devm_kzalloc(&client->dev, sizeof(struct mp5990_data), @@ -121,7 +166,15 @@ static int mp5990_probe(struct i2c_client *client) if (!data) return -ENOMEM; - memcpy(&data->info, &mp5990_info, sizeof(*info)); + if (client->dev.of_node) + chip = (uintptr_t)of_device_get_match_data(&client->dev); + else + chip = i2c_match_id(mp5990_id, client)->driver_data; + + if (chip == mp5990) + memcpy(&data->info, &mp5990_info, sizeof(*info)); + else + memcpy(&data->info, &mp5998_info, sizeof(*info)); info = &data->info; /* Read Vout Config */ @@ -140,6 +193,9 @@ static int mp5990_probe(struct i2c_client *client) data->info.format[PSC_VOLTAGE_OUT] = linear; data->info.format[PSC_CURRENT_OUT] = linear; data->info.format[PSC_POWER] = linear; + if (chip == mp5998) + data->info.format[PSC_CURRENT_IN] = linear; + ret = i2c_smbus_read_word_data(client, PMBUS_READ_VOUT); if (ret < 0) { dev_err(&client->dev, "Can't get vout 
exponent."); @@ -153,16 +209,11 @@ static int mp5990_probe(struct i2c_client *client) } static const struct of_device_id mp5990_of_match[] = { - { .compatible = "mps,mp5990" }, + { .compatible = "mps,mp5990", .data = (void *)mp5990 }, + { .compatible = "mps,mp5998", .data = (void *)mp5998 }, {} }; -static const struct i2c_device_id mp5990_id[] = { - {"mp5990"}, - { } -}; -MODULE_DEVICE_TABLE(i2c, mp5990_id); - static struct i2c_driver mp5990_driver = { .driver = { .name = "mp5990", From 1fac317b6cae373d5d1a5695a5175c65c745922f Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Fri, 19 Sep 2025 16:38:49 +0800 Subject: [PATCH 2780/3206] hwmon: (gpd-fan) Fix range check for pwm input Fixed the maximum value in the PWM input range check, allowing the input to be set to 255. Fixes: 0ab88e239439 ("hwmon: add GPD devices sensor driver") Reported-by: Chenx Dust Link: https://github.com/Cryolitia/gpd-fan-driver/pull/18 Co-developed-by: Chenx Dust Signed-off-by: Chenx Dust Signed-off-by: Cryolitia PukNgae Link: https://lore.kernel.org/r/20250919-hwmon-v1-1-2b69c8b9c062@uniontech.com Signed-off-by: Guenter Roeck --- drivers/hwmon/gpd-fan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/gpd-fan.c b/drivers/hwmon/gpd-fan.c index e0b3b46e1bf12a..644dc3ca9df7da 100644 --- a/drivers/hwmon/gpd-fan.c +++ b/drivers/hwmon/gpd-fan.c @@ -571,7 +571,7 @@ static int gpd_fan_hwmon_write(__always_unused struct device *dev, ret = 0; goto OUT; case hwmon_pwm_input: - if (!in_range(val, 0, 255)) { + if (!in_range(val, 0, 256)) { ret = -ERANGE; goto OUT; } From 5d5ec7c81c372d33875e0176f9f6d68e3e813e32 Mon Sep 17 00:00:00 2001 From: Shane Fagan Date: Sun, 14 Sep 2025 09:41:14 +0200 Subject: [PATCH 2781/3206] hwmon: (asus-ec-sensors) add ROG STRIX X670E-E GAMING WIFI Add support for ROG STRIX X670E-E GAMING WIFI Signed-off-by: Shane Fagan Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250914074125.135656-1-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index 6a9c5f30e016a3..836d4137384839 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -33,6 +33,7 @@ Supported boards: * ROG STRIX X570-E GAMING WIFI II * ROG STRIX X570-F GAMING * ROG STRIX X570-I GAMING + * ROG STRIX X670E-E GAMING WIFI * ROG STRIX X670E-I GAMING WIFI * ROG STRIX X870-I GAMING WIFI * ROG STRIX Z390-F GAMING diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 32d4dd26aa84da..3f6d89bcc8a22a 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -614,6 +614,13 @@ static const struct ec_board_info board_info_strix_x570_i_gaming = { .family = family_amd_500_series, }; +static const struct ec_board_info board_info_strix_x670e_e_gaming_wifi = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0, + .family = family_amd_600_series, +}; + static const struct ec_board_info board_info_strix_x670e_i_gaming_wifi = { .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | SENSOR_TEMP_MB | SENSOR_TEMP_VRM, @@ -764,6 +771,8 @@ static const struct dmi_system_id dmi_table[] = { &board_info_strix_x570_f_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-I GAMING", 
 					&board_info_strix_x570_i_gaming),
+	DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X670E-E GAMING WIFI",
+					&board_info_strix_x670e_e_gaming_wifi),
 	DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X670E-I GAMING WIFI",
 					&board_info_strix_x670e_i_gaming_wifi),
 	DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X870-I GAMING WIFI",

From ddb61e737f04e3c6c8299c1e00bf17a42a7f05cf Mon Sep 17 00:00:00 2001
From: Armin Wolf
Date: Wed, 17 Sep 2025 20:10:33 +0200
Subject: [PATCH 2782/3206] hwmon: (dell-smm) Remove Dell Precision 490 custom config data

It turns out the second fan on the Dell Precision 490 does not really
support I8K_FAN_TURBO. Setting the fan state to 3 enables automatic fan
control, just like on the other two fans.

The reason why this was misinterpreted as turbo mode was that the second
fan normally spins faster in automatic mode than in the previous fan
states. Yet when in state 3, the fan speed reacts to heat exposure,
exposing the automatic mode setting.

Link: https://github.com/lm-sensors/lm-sensors/pull/383
Signed-off-by: Armin Wolf
Link: https://lore.kernel.org/r/20250917181036.10972-2-W_Armin@gmx.de
Signed-off-by: Guenter Roeck
---
 drivers/hwmon/dell-smm-hwmon.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c
index 1e2c8e2840015a..3f61b2d7935e4a 100644
--- a/drivers/hwmon/dell-smm-hwmon.c
+++ b/drivers/hwmon/dell-smm-hwmon.c
@@ -1331,7 +1331,6 @@ struct i8k_config_data {
 
 enum i8k_configs {
 	DELL_LATITUDE_D520,
-	DELL_PRECISION_490,
 	DELL_STUDIO,
 	DELL_XPS,
 };
@@ -1341,10 +1340,6 @@ static const struct i8k_config_data i8k_config_data[] __initconst = {
 		.fan_mult = 1,
 		.fan_max = I8K_FAN_TURBO,
 	},
-	[DELL_PRECISION_490] = {
-		.fan_mult = 1,
-		.fan_max = I8K_FAN_TURBO,
-	},
 	[DELL_STUDIO] = {
 		.fan_mult = 1,
 		.fan_max = I8K_FAN_HIGH,
@@ -1364,15 +1359,6 @@ static const struct dmi_system_id i8k_config_dmi_table[] __initconst = {
 	},
-	{
-		.ident = "Dell Precision 490",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-			DMI_MATCH(DMI_PRODUCT_NAME,
-				  "Precision WorkStation 490"),
-		},
-		.driver_data = (void *)&i8k_config_data[DELL_PRECISION_490],
-	},
 	{
 		.ident = "Dell Studio",
 		.matches = {

From b3499883c6d5f968f44e87d021ff2bd47ab5d094 Mon Sep 17 00:00:00 2001
From: Armin Wolf
Date: Wed, 17 Sep 2025 20:10:34 +0200
Subject: [PATCH 2783/3206] hwmon: (dell-smm) Move clamping of fan speed out of i8k_set_fan()

Currently i8k_set_fan() clamps the fan speed before performing the SMM
call to ensure that the speed is not negative and not greater than
i8k_fan_max. This however is mostly unnecessary, as the hwmon and
thermal interfaces already ensure this. Only the legacy ioctl interface
does not ensure that the fan speed passed to i8k_set_fan() meets the
above criteria.

Move the clamping out of i8k_set_fan() and into the legacy ioctl
handler to prepare for future changes.
Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20250917181036.10972-3-W_Armin@gmx.de Signed-off-by: Guenter Roeck --- drivers/hwmon/dell-smm-hwmon.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 3f61b2d7935e4a..36576db097067f 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -446,7 +447,6 @@ static int i8k_set_fan(const struct dell_smm_data *data, u8 fan, int speed) if (disallow_fan_support) return -EINVAL; - speed = (speed < 0) ? 0 : ((speed > data->i8k_fan_max) ? data->i8k_fan_max : speed); regs.ebx = fan | (speed << 8); return dell_smm_call(data->ops, ®s); @@ -637,6 +637,8 @@ static long i8k_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) if (copy_from_user(&speed, argp + 1, sizeof(int))) return -EFAULT; + speed = clamp_val(speed, 0, data->i8k_fan_max); + mutex_lock(&data->i8k_mutex); err = i8k_set_fan(data, val, speed); if (err < 0) From 2c8ac03aad7a8dd649de9080503c68319afb43f9 Mon Sep 17 00:00:00 2001 From: Ben Copeland Date: Tue, 23 Sep 2025 21:26:55 +0200 Subject: [PATCH 2784/3206] hwmon: (asus-ec-sensors) add ROG STRIX X870E-E GAMING WIFI Add support for ROG STRIX X870E-E GAMING WIFI This board uses the same sensor configuration as the ProArt X870E-CREATOR WIFI motherboard. Signed-off-by: Ben Copeland Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20250923192935.11339-2-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_ec_sensors.rst | 1 + drivers/hwmon/asus-ec-sensors.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst index 836d4137384839..5801bc4279c0c6 100644 --- a/Documentation/hwmon/asus_ec_sensors.rst +++ b/Documentation/hwmon/asus_ec_sensors.rst @@ -36,6 +36,7 @@ Supported boards: * ROG STRIX X670E-E GAMING WIFI * ROG STRIX X670E-I GAMING WIFI * ROG STRIX X870-I GAMING WIFI + * ROG STRIX X870E-E GAMING WIFI * ROG STRIX Z390-F GAMING * ROG STRIX Z490-F GAMING * ROG STRIX Z690-A GAMING WIFI D4 diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 3f6d89bcc8a22a..7bb554c48143fd 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -635,6 +635,14 @@ static const struct ec_board_info board_info_strix_x870_i_gaming_wifi = { .family = family_amd_800_series, }; +static const struct ec_board_info board_info_strix_x870e_e_gaming_wifi = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM | + SENSOR_FAN_CPU_OPT, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0, + .family = family_amd_800_series, +}; + static const struct ec_board_info board_info_strix_z390_f_gaming = { .sensors = SENSOR_TEMP_CHIPSET | SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR | @@ -777,6 +785,8 @@ static const struct dmi_system_id dmi_table[] = { &board_info_strix_x670e_i_gaming_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X870-I GAMING WIFI", &board_info_strix_x870_i_gaming_wifi), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X870E-E GAMING WIFI", + &board_info_strix_x870e_e_gaming_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z390-F GAMING", &board_info_strix_z390_f_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z490-F GAMING", From 584d55be66ef151e6ef9ccb3dcbc0a2155559be1 Mon Sep 17 00:00:00 2001 From: Ben Copeland Date: 
Tue, 23 Sep 2025 21:26:56 +0200
Subject: [PATCH 2785/3206] hwmon: (asus-ec-sensors) increase timeout for locking ACPI mutex

Some motherboards require more time to acquire the ACPI mutex, causing
"Failed to acquire mutex" messages to appear in the kernel log. Increase
the timeout from 500ms to 800ms to accommodate these cases.

Signed-off-by: Ben Copeland
Signed-off-by: Eugene Shalygin
Link: https://lore.kernel.org/r/20250923192935.11339-3-eugene.shalygin@gmail.com
Signed-off-by: Guenter Roeck
---
 drivers/hwmon/asus-ec-sensors.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c
index 7bb554c48143fd..e27a7b08e7b777 100644
--- a/drivers/hwmon/asus-ec-sensors.c
+++ b/drivers/hwmon/asus-ec-sensors.c
@@ -49,7 +49,7 @@ static char *mutex_path_override;
  */
 #define ASUS_EC_MAX_BANK	3
 
-#define ACPI_LOCK_DELAY_MS	500
+#define ACPI_LOCK_DELAY_MS	800
 
 /* ACPI mutex for locking access to the EC for the firmware */
 #define ASUS_HW_ACCESS_MUTEX_ASMX	"\\AMW0.ASMX"

From 205c730262215fe1940668394a856f69ece55c66 Mon Sep 17 00:00:00 2001
From: Cryolitia PukNgae
Date: Wed, 24 Sep 2025 15:48:38 +0800
Subject: [PATCH 2786/3206] hwmon: (gpd-fan) complete Kconfig dependencies

DMI and HAS_IOPORT are also needed.

Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202509200214.i2QX7iwD-lkp@intel.com/
Signed-off-by: Cryolitia PukNgae
Link: https://lore.kernel.org/r/20250924-hwmon2-v1-1-fc529865a325@uniontech.com
Signed-off-by: Guenter Roeck
---
 drivers/hwmon/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index d6769288a76e81..d61014d7968a4a 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -771,7 +771,7 @@ config SENSORS_GL520SM
 
 config SENSORS_GPD
 	tristate "GPD handhelds"
-	depends on X86
+	depends on X86 && DMI && HAS_IOPORT
 	help
 	  If you say yes here you get support for fan readings and control over
 	  GPD handheld devices.

From 1c1658058c99bcfd3b2347e587a556986037f80a Mon Sep 17 00:00:00 2001
From: Armin Wolf
Date: Wed, 17 Sep 2025 20:10:35 +0200
Subject: [PATCH 2787/3206] hwmon: (dell-smm) Add support for automatic fan mode

Many machines treat fan state 3 as some sort of automatic mode, which
is superior to the separate SMM calls for switching to automatic fan
mode for two reasons:

- the fan control mode can be controlled for each fan separately
- the current fan control mode can be retrieved from the BIOS

On some machines however, this special fan state does not exist. Fan
state 3 acts like a regular fan state on such machines or does not
exist at all. Such machines usually use separate SMM calls for
enabling/disabling automatic fan control. Add support for it.

If the machine supports separate SMM calls for changing the fan control
mode, then the other interface is ignored.
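In code, the heuristic boils down to the following visibility check (a
condensed sketch; the full logic lives in dell_smm_is_visible() below):

	if (auto_fan)
		/* whitelisted SMM commands: one global, write-only pwm1_enable */
		return channel == 1 ? 0200 : 0;

	if (data->fan[channel] && data->i8k_fan_max < I8K_FAN_AUTO)
		/* fan state 3 is free to act as a per-fan automatic mode */
		return 0644;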
Signed-off-by: Armin Wolf
Link: https://lore.kernel.org/r/20250917181036.10972-4-W_Armin@gmx.de
Signed-off-by: Guenter Roeck
---
 Documentation/hwmon/dell-smm-hwmon.rst | 56 +++++++++++--------
 drivers/hwmon/dell-smm-hwmon.c         | 74 +++++++++++++++++++++-----
 include/uapi/linux/i8k.h               |  2 +
 3 files changed, 98 insertions(+), 34 deletions(-)

diff --git a/Documentation/hwmon/dell-smm-hwmon.rst b/Documentation/hwmon/dell-smm-hwmon.rst
index 5a4edb6565cf95..3e4e2d916ac523 100644
--- a/Documentation/hwmon/dell-smm-hwmon.rst
+++ b/Documentation/hwmon/dell-smm-hwmon.rst
@@ -38,7 +38,7 @@ fan[1-4]_min		RO	Minimal Fan speed in RPM
 fan[1-4]_max		RO	Maximal Fan speed in RPM
 fan[1-4]_target	RO	Expected Fan speed in RPM
 pwm[1-4]		RW	Control the fan PWM duty-cycle.
-pwm1_enable		WO	Enable or disable automatic BIOS fan
+pwm[1-4]_enable	RW/WO	Enable or disable automatic BIOS fan
 			control (not supported on all laptops,
 			see below for details).
 temp[1-10]_input	RO	Temperature reading in milli-degrees
@@ -49,26 +49,40 @@ temp[1-10]_label	RO	Temperature sensor label.
 Due to the nature of the SMM interface, each pwmX attribute controls
 fan number X.
 
-Disabling automatic BIOS fan control
-------------------------------------
-
-On some laptops the BIOS automatically sets fan speed every few
-seconds. Therefore the fan speed set by mean of this driver is quickly
-overwritten.
-
-There is experimental support for disabling automatic BIOS fan
-control, at least on laptops where the corresponding SMM command is
-known, by writing the value ``1`` in the attribute ``pwm1_enable``
-(writing ``2`` enables automatic BIOS control again). Even if you have
-more than one fan, all of them are set to either enabled or disabled
-automatic fan control at the same time and, notwithstanding the name,
-``pwm1_enable`` sets automatic control for all fans.
-
-If ``pwm1_enable`` is not available, then it means that SMM codes for
-enabling and disabling automatic BIOS fan control are not whitelisted
-for your hardware. It is possible that codes that work for other
-laptops actually work for yours as well, or that you have to discover
-new codes.
+Enabling/Disabling automatic BIOS fan control
+---------------------------------------------
+
+There exist two methods for enabling/disabling automatic BIOS fan control:
+
+1. Separate SMM commands to enable/disable automatic BIOS fan control for all fans.
+
+2. A special fan state that enables automatic BIOS fan control for an individual fan.
+
+The driver cannot reliably detect what method should be used on a given
+device, so instead the following heuristic is used:
+
+- use fan state 3 for enabling BIOS fan control if the maximum fan state
+  settable by the user is smaller than 3 (default setting).
+
+- use separate SMM commands if the device is whitelisted to support them.
+
+When using the first method, each fan will have a standard ``pwmX_enable``
+sysfs attribute. Writing ``1`` into this attribute will disable automatic
+BIOS fan control for the associated fan and set it to maximum speed. Enabling
+BIOS fan control again can be achieved by writing ``2`` into this attribute.
+Reading this sysfs attribute returns the current setting as reported by
+the underlying hardware.
+
+ +If no ``pwmX_enable`` attributes are available, then it means that the driver +cannot use the first method and the SMM codes for enabling and disabling automatic +BIOS fan control are not whitelisted for your device. It is possible that codes +that work for other laptops actually work for yours as well, or that you have to +discover new codes. Check the list ``i8k_whitelist_fan_control`` in file ``drivers/hwmon/dell-smm-hwmon.c`` in the kernel tree: as a first diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 36576db097067f..79befa13b6990b 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -764,6 +764,13 @@ static int dell_smm_get_cur_state(struct thermal_cooling_device *dev, unsigned l if (ret < 0) return ret; + /* + * A fan state bigger than i8k_fan_max might indicate that + * the fan is currently in automatic mode. + */ + if (ret > cdata->data->i8k_fan_max) + return -ENODATA; + *state = ret; return 0; @@ -851,7 +858,14 @@ static umode_t dell_smm_is_visible(const void *drvdata, enum hwmon_sensor_types break; case hwmon_pwm_enable: - if (auto_fan) + if (auto_fan) { + /* + * The setting affects all fans, so only create a + * single attribute. + */ + if (channel != 1) + return 0; + /* * There is no command for retrieve the current status * from BIOS, and userspace/firmware itself can change @@ -859,6 +873,10 @@ static umode_t dell_smm_is_visible(const void *drvdata, enum hwmon_sensor_types * Thus we can only provide write-only access for now. */ return 0200; + } + + if (data->fan[channel] && data->i8k_fan_max < I8K_FAN_AUTO) + return 0644; break; default: @@ -928,14 +946,28 @@ static int dell_smm_read(struct device *dev, enum hwmon_sensor_types type, u32 a } break; case hwmon_pwm: + ret = i8k_get_fan_status(data, channel); + if (ret < 0) + return ret; + switch (attr) { case hwmon_pwm_input: - ret = i8k_get_fan_status(data, channel); - if (ret < 0) - return ret; + /* + * A fan state bigger than i8k_fan_max might indicate that + * the fan is currently in automatic mode. + */ + if (ret > data->i8k_fan_max) + return -ENODATA; *val = clamp_val(ret * data->i8k_pwm_mult, 0, 255); + return 0; + case hwmon_pwm_enable: + if (ret == I8K_FAN_AUTO) + *val = 2; + else + *val = 1; + return 0; default: break; @@ -1022,16 +1054,32 @@ static int dell_smm_write(struct device *dev, enum hwmon_sensor_types type, u32 return 0; case hwmon_pwm_enable: - if (!val) - return -EINVAL; - - if (val == 1) + switch (val) { + case 1: enable = false; - else + break; + case 2: enable = true; + break; + default: + return -EINVAL; + } mutex_lock(&data->i8k_mutex); - err = i8k_enable_fan_auto_mode(data, enable); + if (auto_fan) { + err = i8k_enable_fan_auto_mode(data, enable); + } else { + /* + * When putting the fan into manual control mode we have to ensure + * that the device does not overheat until the userspace fan control + * software takes over. Because of this we set the fan speed to + * i8k_fan_max when disabling automatic fan control. 
+ */ + if (enable) + err = i8k_set_fan(data, channel, I8K_FAN_AUTO); + else + err = i8k_set_fan(data, channel, data->i8k_fan_max); + } mutex_unlock(&data->i8k_mutex); if (err < 0) @@ -1082,9 +1130,9 @@ static const struct hwmon_channel_info * const dell_smm_info[] = { ), HWMON_CHANNEL_INFO(pwm, HWMON_PWM_INPUT | HWMON_PWM_ENABLE, - HWMON_PWM_INPUT, - HWMON_PWM_INPUT, - HWMON_PWM_INPUT + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE ), NULL }; diff --git a/include/uapi/linux/i8k.h b/include/uapi/linux/i8k.h index 268e6268f6c808..a16e4049710fcc 100644 --- a/include/uapi/linux/i8k.h +++ b/include/uapi/linux/i8k.h @@ -36,6 +36,8 @@ #define I8K_FAN_LOW 1 #define I8K_FAN_HIGH 2 #define I8K_FAN_TURBO 3 +/* Many machines treat this mode as some sort of automatic mode */ +#define I8K_FAN_AUTO 3 #define I8K_FAN_MAX I8K_FAN_TURBO #define I8K_VOL_UP 1 From 53d3bd48ef6ff1567a75ca77728968f5ab493cb4 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Wed, 17 Sep 2025 20:10:36 +0200 Subject: [PATCH 2788/3206] hwmon: (dell-smm) Add support for Dell OptiPlex 7040 The Dell OptiPlex 7040 supports the legacy SMM interface for reading sensors and performing fan control. Whitelist this machine so that this driver loads automatically. Closes: https://github.com/Wer-Wolf/i8kutils/issues/15 Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20250917181036.10972-5-W_Armin@gmx.de Signed-off-by: Guenter Roeck --- drivers/hwmon/dell-smm-hwmon.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 79befa13b6990b..cbe1a74a3deea3 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -1330,6 +1330,13 @@ static const struct dmi_system_id i8k_dmi_table[] __initconst = { DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7050"), }, }, + { + .ident = "Dell OptiPlex 7040", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7040"), + }, + }, { .ident = "Dell Precision", .matches = { From 456c32e3c4316654f95f9d49c12cbecfb77d5660 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 19 Sep 2025 10:15:56 +0900 Subject: [PATCH 2789/3206] tracing: dynevent: Add a missing lockdown check on dynevent Since dynamic_events interface on tracefs is compatible with kprobe_events and uprobe_events, it should also check the lockdown status and reject if it is set. 
Link: https://lore.kernel.org/all/175824455687.45175.3734166065458520748.stgit@devnote2/ Fixes: 17911ff38aa5 ("tracing: Add locked_down checks to the open calls of files created for tracefs") Signed-off-by: Masami Hiramatsu (Google) Cc: stable@vger.kernel.org --- kernel/trace/trace_dynevent.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 5d64a18cacacc6..d06854bd32b357 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -230,6 +230,10 @@ static int dyn_event_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = tracing_check_open_get_tr(NULL); if (ret) return ret; From 1cf89b6bf660c2e9fa137b3e160c7b1001937a78 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 19 Sep 2025 19:40:25 +0100 Subject: [PATCH 2790/3206] arm64: Kconfig: Make CPU_BIG_ENDIAN depend on BROKEN Big-endian arm64 configurations are vanishingly rare, yet we still claim to support them in Linux despite very limited testing or visible interest. Supporting big-endian adds unnecessary burden to reviewers and contributors which, without any known active users, is hard to justify. For example, recent work to improve our futex routines and to implement nested virtualisation support is non-trivially complicated by having to support both big- and little-endianness. Back in 2019 [1], it was claimed that Huawei were using arm64 big-endian machines in their telecommunication products but I don't know whether that's still the case and certainly haven't seen any patch contributions to help support or maintain it. Make CPU_BIG_ENDIAN depend on BROKEN as an initial deprecation step towards its removal. Cc: Catalin Marinas Cc: Marc Zyngier Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Hanjun Guo Cc: Jonathan Cameron Cc: Guenter Roeck Link: https://lore.kernel.org/linux-arm-kernel/73701e9f-bee1-7ae8-2277-7a3576171cd4@huawei.com/ [1] Acked-by: Catalin Marinas Acked-by: Marc Zyngier Acked-by: Ard Biesheuvel Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 514038b18ebaea..0633520a85f2a5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1493,7 +1493,7 @@ choice config CPU_BIG_ENDIAN bool "Build big-endian kernel" # https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c - depends on AS_IS_GNU || AS_VERSION >= 150000 + depends on (AS_IS_GNU || AS_VERSION >= 150000) && BROKEN help Say Y if you plan on running a kernel with a big-endian userspace. From ea0b39168d3a2313eabd145fb3440c946ccff4d1 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 23 Sep 2025 11:50:23 +0800 Subject: [PATCH 2791/3206] arm64: cpufeature: Remove duplicate asm/mmu.h header ./arch/arm64/kernel/cpufeature.c: asm/mmu.h is included more than once. 
Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5dabb349986c7c..10362b296f2d62 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -86,7 +86,6 @@ #include #include #include -#include #include #include #include From 1f6113ae5ac4927fe80256154ebb0461e670fa85 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 24 Sep 2025 17:53:11 +0200 Subject: [PATCH 2792/3206] x86/boot: Drop erroneous __init annotation from early_set_pages_state() The kexec code will call set_pages_state() after tearing down all the GHCBs, which will therefore result in a call to early_set_pages_state(). This means the __init annotation is wrong, and must be dropped. Fixes: c5c30a373693 ("x86/boot: Move startup code out of __head section") Reported-by: Srikanth Aithal Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Tested-by: Srikanth Aithal --- arch/x86/boot/startup/sev-startup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index a9b0a9c32d8ff8..09725428d3e657 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -44,7 +44,7 @@ /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -void __init +void early_set_pages_state(unsigned long vaddr, unsigned long paddr, unsigned long npages, const struct psc_desc *desc) { From 5cf952c54f3546f904ded645aecc1788d1090bd2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 23 Sep 2025 13:56:31 -0700 Subject: [PATCH 2793/3206] thermal: intel: int340x: Power Slider: Validate slider_balance range When the module parameter slider_balance is set to the performance slider value of 0, the SoC slider profile switches to the performance mode. This can cause the Linux power-profiles-daemon to change the system power mode to performance from balanced mode. This happens when there is only one platform profile registered as there will be no conflict with other platform profiles. Same issue occurs when the slider_balance is set to the power-saver slider value. Prevent module parameter slider_balance from overlapping with performance and power-saver slider values by adding range validation. Return an error when an invalid value is provided. Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20250923205631.3056590-1-srinivas.pandruvada@linux.intel.com [ rjw: Changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- .../intel/int340x_thermal/processor_thermal_soc_slider.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_soc_slider.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_soc_slider.c index 20d70cb0154201..49ff3bae727109 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_soc_slider.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_soc_slider.c @@ -67,7 +67,8 @@ static int slider_def_balance_set(const char *arg, const struct kernel_param *kp ret = kstrtou8(arg, 16, &slider_val); if (!ret) { - if (slider_val > SOC_SLIDER_VALUE_MAXIMUM) + if (slider_val <= slider_values[SOC_POWER_SLIDER_PERFORMANCE] || + slider_val >= slider_values[SOC_POWER_SLIDER_POWERSAVE]) return -EINVAL; slider_balanced_param = slider_val; From 64f4ea200eca05a248533b8374d7b5f0756a3990 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 23 Sep 2025 14:34:17 -0700 Subject: [PATCH 2794/3206] kconfig: Fix BrokenPipeError warnings in selftests The kconfig test harness ("make testconfig") was generating BrokenPipeError warnings when running interactive tests like oldaskconfig and oldconfig: /usr/lib/python3/dist-packages/_pytest/unraisableexception.py:85: PytestUnraisableExceptionWarning: Exception ignored in: <_io.BufferedWriter name=12> Traceback (most recent call last): File "/srv/code/scripts/kconfig/tests/conftest.py", line 127, in oldaskconfig return self._run_conf('--oldaskconfig', dot_config=dot_config, ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ interactive=True, in_keys=in_keys) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ BrokenPipeError: [Errno 32] Broken pipe The issue occurred when the test framework attempted to write to stdin after the conf subprocess had already exited. Wrap stdin write operations in try/except to catch BrokenPipeError and stop sending more input. Add explicit flush() after writes so we can see delivery errors immediately. Ignore BrokenPipeError when closing stdin. Explicitly call wait() to validate subprocess termination. Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250923213422.1105654-1-kees@kernel.org Signed-off-by: Kees Cook --- scripts/kconfig/tests/conftest.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scripts/kconfig/tests/conftest.py b/scripts/kconfig/tests/conftest.py index 2a2a7e2da06087..d94b79e012c04b 100644 --- a/scripts/kconfig/tests/conftest.py +++ b/scripts/kconfig/tests/conftest.py @@ -81,7 +81,22 @@ def _run_conf(self, mode, dot_config=None, out_file='.config', # For interactive modes such as oldaskconfig, oldconfig, # send 'Enter' key until the program finishes. if interactive: - ps.stdin.write(b'\n') + try: + ps.stdin.write(b'\n') + ps.stdin.flush() + except (BrokenPipeError, OSError): + # Process has exited, stop sending input + break + + # Close stdin gracefully + try: + ps.stdin.close() + except (BrokenPipeError, OSError): + # Ignore broken pipe on close + pass + + # Wait for process to complete + ps.wait() self.retcode = ps.returncode self.stdout = ps.stdout.read().decode() From f9afce4f32e9a120fc902fa6c9e0b90ad799a6ec Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 23 Sep 2025 14:34:18 -0700 Subject: [PATCH 2795/3206] kconfig: Add transitional symbol attribute for migration support During kernel option migrations (e.g. 
CONFIG_CFI_CLANG to CONFIG_CFI), existing .config files need to maintain backward compatibility while preventing deprecated options from appearing in newly generated configurations. This is challenging with existing Kconfig mechanisms because: 1. Simply removing old options breaks existing .config files. 2. Manually listing an option as "deprecated" leaves it needlessly visible and still writes it to new .config files. 3. Using any method to remove visibility (e.g. no 'prompt', 'if n', etc.) prevents the option from being processed at all. Add a "transitional" attribute that creates symbols which are: - Processed during configuration (can influence other symbols' defaults) - Hidden from user menus (no prompts appear) - Omitted from newly written .config files (gets migrated) - Restricted to only having help sections (no defaults, selects, etc.), making it truly just a "prior value pass-through" option. The transitional syntax requires a type argument and prevents type redefinition: config NEW_OPTION bool "New option" default OLD_OPTION config OLD_OPTION bool transitional help Transitional config for OLD_OPTION migration. This allows seamless migration: olddefconfig processes existing CONFIG_OLD_OPTION=y settings to enable CONFIG_NEW_OPTION=y, while CONFIG_OLD_OPTION is omitted from newly generated .config files. Added positive and negative testing via the "testconfig" make target. Co-developed-by: Vegard Nossum Signed-off-by: Vegard Nossum Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250923213422.1105654-2-kees@kernel.org Signed-off-by: Kees Cook --- Documentation/kbuild/kconfig-language.rst | 32 ++++++ scripts/kconfig/expr.h | 1 + scripts/kconfig/lexer.l | 1 + scripts/kconfig/parser.y | 47 ++++++++ scripts/kconfig/symbol.c | 7 +- .../kconfig/tests/err_transitional/Kconfig | 52 +++++++++ .../tests/err_transitional/__init__.py | 14 +++ .../tests/err_transitional/expected_stderr | 7 ++ scripts/kconfig/tests/transitional/Kconfig | 100 ++++++++++++++++++ .../kconfig/tests/transitional/__init__.py | 18 ++++ .../tests/transitional/expected_config | 12 +++ .../kconfig/tests/transitional/initial_config | 16 +++ 12 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 scripts/kconfig/tests/err_transitional/Kconfig create mode 100644 scripts/kconfig/tests/err_transitional/__init__.py create mode 100644 scripts/kconfig/tests/err_transitional/expected_stderr create mode 100644 scripts/kconfig/tests/transitional/Kconfig create mode 100644 scripts/kconfig/tests/transitional/__init__.py create mode 100644 scripts/kconfig/tests/transitional/expected_config create mode 100644 scripts/kconfig/tests/transitional/initial_config diff --git a/Documentation/kbuild/kconfig-language.rst b/Documentation/kbuild/kconfig-language.rst index a91abb8f6840f7..abce88f15d7cb3 100644 --- a/Documentation/kbuild/kconfig-language.rst +++ b/Documentation/kbuild/kconfig-language.rst @@ -232,6 +232,38 @@ applicable everywhere (see syntax). enables the third modular state for all config symbols. At most one symbol may have the "modules" option set. +- transitional attribute: "transitional" + This declares the symbol as transitional, meaning it should be processed + during configuration but omitted from newly written .config files. + Transitional symbols are useful for backward compatibility during config + option migrations - they allow olddefconfig to process existing .config + files while ensuring the old option doesn't appear in new configurations.
+ + A transitional symbol: + - Has no prompt (is not visible to users in menus) + - Is processed normally during configuration (values are read and used) + - Can be referenced in default expressions of other symbols + - Is not written to new .config files + - Cannot have any other properties (it is a pass-through option) + + Example migration from OLD_NAME to NEW_NAME:: + + config NEW_NAME + bool "New option name" + default OLD_NAME + help + This replaces the old CONFIG_OLD_NAME option. + + config OLD_NAME + bool + transitional + help + Transitional config for OLD_NAME to NEW_NAME migration. + + With this setup, existing .config files with "CONFIG_OLD_NAME=y" will + result in "CONFIG_NEW_NAME=y" being set, while CONFIG_OLD_NAME will be + omitted from newly written .config files. + Menu dependencies ----------------- diff --git a/scripts/kconfig/expr.h b/scripts/kconfig/expr.h index fe2231e0e6a4cd..5f900d18dae019 100644 --- a/scripts/kconfig/expr.h +++ b/scripts/kconfig/expr.h @@ -145,6 +145,7 @@ struct symbol { #define SYMBOL_CONST 0x0001 /* symbol is const */ #define SYMBOL_CHECK 0x0008 /* used during dependency checking */ #define SYMBOL_VALID 0x0080 /* set when symbol.curr is calculated */ +#define SYMBOL_TRANS 0x0100 /* symbol is transitional only (not visible)*/ #define SYMBOL_WRITE 0x0200 /* write symbol to file (KCONFIG_CONFIG) */ #define SYMBOL_WRITTEN 0x0800 /* track info to avoid double-write to .config */ #define SYMBOL_CHECKED 0x2000 /* used during dependency checking */ diff --git a/scripts/kconfig/lexer.l b/scripts/kconfig/lexer.l index 9c2cdfc33c6f06..6d2c92c6095dd8 100644 --- a/scripts/kconfig/lexer.l +++ b/scripts/kconfig/lexer.l @@ -126,6 +126,7 @@ n [A-Za-z0-9_-] "select" return T_SELECT; "source" return T_SOURCE; "string" return T_STRING; +"transitional" return T_TRANSITIONAL; "tristate" return T_TRISTATE; "visible" return T_VISIBLE; "||" return T_OR; diff --git a/scripts/kconfig/parser.y b/scripts/kconfig/parser.y index e9c3c664e92511..49b79dde1725ab 100644 --- a/scripts/kconfig/parser.y +++ b/scripts/kconfig/parser.y @@ -75,6 +75,7 @@ struct menu *current_menu, *current_entry, *current_choice; %token T_SELECT %token T_SOURCE %token T_STRING +%token T_TRANSITIONAL %token T_TRISTATE %token T_VISIBLE %token T_EOL @@ -205,6 +206,12 @@ config_option: T_PROMPT T_WORD_QUOTE if_expr T_EOL printd(DEBUG_PARSE, "%s:%d:prompt\n", cur_filename, cur_lineno); }; +config_option: T_TRANSITIONAL T_EOL +{ + current_entry->sym->flags |= SYMBOL_TRANS; + printd(DEBUG_PARSE, "%s:%d:transitional\n", cur_filename, cur_lineno); +}; + config_option: default expr if_expr T_EOL { menu_add_expr(P_DEFAULT, $2, $3); @@ -482,6 +489,43 @@ assign_val: %% +/** + * transitional_check_sanity - check transitional symbols have no other + * properties + * + * @menu: menu of the potentially transitional symbol + * + * Return: -1 if an error is found, 0 otherwise. + */ +static int transitional_check_sanity(const struct menu *menu) +{ + struct property *prop; + + if (!menu->sym || !(menu->sym->flags & SYMBOL_TRANS)) + return 0; + + /* Check for depends and visible conditions. */ + if ((menu->dep && !expr_is_yes(menu->dep)) || + (menu->visibility && !expr_is_yes(menu->visibility))) { + fprintf(stderr, "%s:%d: error: %s", + menu->filename, menu->lineno, + "transitional symbols can only have help sections\n"); + return -1; + } + + /* Check for any property other than "help". 
*/ + for (prop = menu->sym->prop; prop; prop = prop->next) { + if (prop->type != P_COMMENT) { + fprintf(stderr, "%s:%d: error: %s", + prop->filename, prop->lineno, + "transitional symbols can only have help sections\n"); + return -1; + } + } + + return 0; +} + /** * choice_check_sanity - check sanity of a choice member * @@ -558,6 +602,9 @@ void conf_parse(const char *name) if (menu->sym && sym_check_deps(menu->sym)) yynerrs++; + if (transitional_check_sanity(menu)) + yynerrs++; + if (menu->sym && sym_is_choice(menu->sym)) { menu_for_each_sub_entry(child, menu) if (child->sym && choice_check_sanity(child)) diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index 26ab10c0fd768f..760cac99838196 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -214,6 +214,11 @@ static void sym_calc_visibility(struct symbol *sym) struct property *prop; tristate tri; + if (sym->flags & SYMBOL_TRANS) { + sym->visible = yes; + return; + } + /* any prompt visible? */ tri = no; for_all_prompts(sym, prop) { @@ -526,7 +531,7 @@ void sym_calc_value(struct symbol *sym) } } - if (sym_is_choice(sym)) + if (sym_is_choice(sym) || sym->flags & SYMBOL_TRANS) sym->flags &= ~SYMBOL_WRITE; } diff --git a/scripts/kconfig/tests/err_transitional/Kconfig b/scripts/kconfig/tests/err_transitional/Kconfig new file mode 100644 index 00000000000000..a75ed3b2fe5e43 --- /dev/null +++ b/scripts/kconfig/tests/err_transitional/Kconfig @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0 +# Test that transitional symbols cannot have properties other than help + +config BAD_DEFAULT + bool + transitional + default y + help + This transitional symbol illegally has a default property. + +config BAD_PROMPT + bool + transitional + prompt "Bad prompt" + help + This transitional symbol illegally has a prompt. + +config BAD_SELECT + bool + transitional + select OTHER_SYMBOL + help + This transitional symbol illegally has a select. + +config BAD_IMPLY + bool + transitional + imply OTHER_SYMBOL + help + This transitional symbol illegally has an imply. + +config BAD_DEPENDS + bool + transitional + depends on OTHER_SYMBOL + help + This transitional symbol illegally has a depends. + +config BAD_RANGE + int + transitional + range 1 10 + help + This transitional symbol illegally has a range. + +config BAD_NO_TYPE + transitional + help + This transitional symbol illegally has no type specified. + +config OTHER_SYMBOL + bool diff --git a/scripts/kconfig/tests/err_transitional/__init__.py b/scripts/kconfig/tests/err_transitional/__init__.py new file mode 100644 index 00000000000000..7dffb5b0833f0e --- /dev/null +++ b/scripts/kconfig/tests/err_transitional/__init__.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +""" +Test that transitional symbols with invalid properties are rejected. + +Transitional symbols can only have help sections. Any other properties +(default, select, depends, etc.) should cause a parser error. 
+""" + +def test(conf): + # This should fail with exit code 1 due to invalid transitional symbol + assert conf.olddefconfig() == 1 + + # Check that the error message is about transitional symbols + assert conf.stderr_contains('expected_stderr') diff --git a/scripts/kconfig/tests/err_transitional/expected_stderr b/scripts/kconfig/tests/err_transitional/expected_stderr new file mode 100644 index 00000000000000..b52db4f680f430 --- /dev/null +++ b/scripts/kconfig/tests/err_transitional/expected_stderr @@ -0,0 +1,7 @@ +Kconfig:46:warning: config symbol defined without type +Kconfig:7: error: transitional symbols can only have help sections +Kconfig:14: error: transitional symbols can only have help sections +Kconfig:21: error: transitional symbols can only have help sections +Kconfig:28: error: transitional symbols can only have help sections +Kconfig:32: error: transitional symbols can only have help sections +Kconfig:42: error: transitional symbols can only have help sections diff --git a/scripts/kconfig/tests/transitional/Kconfig b/scripts/kconfig/tests/transitional/Kconfig new file mode 100644 index 00000000000000..62c3b24665b992 --- /dev/null +++ b/scripts/kconfig/tests/transitional/Kconfig @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: GPL-2.0 +# Test transitional symbols for config migration with all Kconfig types + +# Enable module support for tristate testing +config MODULES + bool "Enable loadable module support" + modules + default y + +# Basic migration tests for all types +config NEW_BOOL + bool "New bool option" + default OLD_BOOL + +config OLD_BOOL + bool + transitional + +config NEW_TRISTATE + tristate "New tristate option" + default OLD_TRISTATE + +config OLD_TRISTATE + tristate + transitional + +config NEW_STRING + string "New string option" + default OLD_STRING + +config OLD_STRING + string + transitional + +config NEW_HEX + hex "New hex option" + default OLD_HEX + +config OLD_HEX + hex + transitional + +config NEW_INT + int "New int option" + default OLD_INT + +config OLD_INT + int + transitional + +# Precedence tests for all types +config NEW_BOOL_PRECEDENCE + bool "New bool option with precedence" + default OLD_BOOL_PRECEDENCE + +config OLD_BOOL_PRECEDENCE + bool + transitional + +config NEW_STRING_PRECEDENCE + string "New string option with precedence" + default OLD_STRING_PRECEDENCE + +config OLD_STRING_PRECEDENCE + string + transitional + +config NEW_TRISTATE_PRECEDENCE + tristate "New tristate option with precedence" + default OLD_TRISTATE_PRECEDENCE + +config OLD_TRISTATE_PRECEDENCE + tristate + transitional + +config NEW_HEX_PRECEDENCE + hex "New hex option with precedence" + default OLD_HEX_PRECEDENCE + +config OLD_HEX_PRECEDENCE + hex + transitional + +config NEW_INT_PRECEDENCE + int "New int option with precedence" + default OLD_INT_PRECEDENCE + +config OLD_INT_PRECEDENCE + int + transitional + +# Test that help sections are allowed for transitional symbols +config OLD_WITH_HELP + bool + transitional + help + This transitional symbol has a help section to validate that help is allowed. + +config REGULAR_OPTION + bool "Regular option" diff --git a/scripts/kconfig/tests/transitional/__init__.py b/scripts/kconfig/tests/transitional/__init__.py new file mode 100644 index 00000000000000..61937d10edf1ee --- /dev/null +++ b/scripts/kconfig/tests/transitional/__init__.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +""" +Test transitional symbol migration functionality for all Kconfig types. 
+ +This tests that: +- OLD_* options in existing .config cause NEW_* options to be set +- OLD_* options are not written to the new .config file +- NEW_* options appear in the new .config file with correct values +- All Kconfig types work correctly: bool, tristate, string, hex, int +- User-set NEW values take precedence over conflicting OLD transitional values +""" + +def test(conf): + # Run olddefconfig to process the migration with the initial config + assert conf.olddefconfig(dot_config='initial_config') == 0 + + # Check that the configuration matches expected output + assert conf.config_contains('expected_config') diff --git a/scripts/kconfig/tests/transitional/expected_config b/scripts/kconfig/tests/transitional/expected_config new file mode 100644 index 00000000000000..846e9ddcab9103 --- /dev/null +++ b/scripts/kconfig/tests/transitional/expected_config @@ -0,0 +1,12 @@ +CONFIG_MODULES=y +CONFIG_NEW_BOOL=y +CONFIG_NEW_TRISTATE=m +CONFIG_NEW_STRING="test string" +CONFIG_NEW_HEX=0x1234 +CONFIG_NEW_INT=42 +# CONFIG_NEW_BOOL_PRECEDENCE is not set +CONFIG_NEW_STRING_PRECEDENCE="user value" +CONFIG_NEW_TRISTATE_PRECEDENCE=y +CONFIG_NEW_HEX_PRECEDENCE=0xABCD +CONFIG_NEW_INT_PRECEDENCE=100 +# CONFIG_REGULAR_OPTION is not set diff --git a/scripts/kconfig/tests/transitional/initial_config b/scripts/kconfig/tests/transitional/initial_config new file mode 100644 index 00000000000000..e648a65e504c08 --- /dev/null +++ b/scripts/kconfig/tests/transitional/initial_config @@ -0,0 +1,16 @@ +CONFIG_MODULES=y +CONFIG_OLD_BOOL=y +CONFIG_OLD_TRISTATE=m +CONFIG_OLD_STRING="test string" +CONFIG_OLD_HEX=0x1234 +CONFIG_OLD_INT=42 +# CONFIG_NEW_BOOL_PRECEDENCE is not set +CONFIG_OLD_BOOL_PRECEDENCE=y +CONFIG_NEW_STRING_PRECEDENCE="user value" +CONFIG_OLD_STRING_PRECEDENCE="old value" +CONFIG_NEW_TRISTATE_PRECEDENCE=y +CONFIG_OLD_TRISTATE_PRECEDENCE=m +CONFIG_NEW_HEX_PRECEDENCE=0xABCD +CONFIG_OLD_HEX_PRECEDENCE=0x5678 +CONFIG_NEW_INT_PRECEDENCE=100 +CONFIG_OLD_INT_PRECEDENCE=200 From 23ef9d439769d5f35353650e771c63d13824235b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 23 Sep 2025 14:34:19 -0700 Subject: [PATCH 2796/3206] kcfi: Rename CONFIG_CFI_CLANG to CONFIG_CFI The kernel's CFI implementation uses the KCFI ABI specifically, and is not strictly tied to a particular compiler. In preparation for GCC supporting KCFI, rename CONFIG_CFI_CLANG to CONFIG_CFI (along with associated options). Use new "transitional" Kconfig option for old CONFIG_CFI_CLANG that will enable CONFIG_CFI during olddefconfig. 
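For out-of-tree or backported code that must build both before and after this rename, a small compatibility guard is one way to cope. This is a hypothetical sketch, not part of the patch: HAVE_KERNEL_CFI is an invented name, and in-tree code should simply test CONFIG_CFI, since the transitional CONFIG_CFI_CLANG is no longer written out by newer kernels.

  /* Hypothetical compat shim: older kernels define CONFIG_CFI_CLANG,
   * newer kernels define CONFIG_CFI. */
  #if defined(CONFIG_CFI) || defined(CONFIG_CFI_CLANG)
  #define HAVE_KERNEL_CFI 1
  #else
  #define HAVE_KERNEL_CFI 0
  #endif
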
Reviewed-by: Linus Walleij Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250923213422.1105654-3-kees@kernel.org Signed-off-by: Kees Cook --- Makefile | 2 +- arch/Kconfig | 36 ++++++++++++++++--------- arch/arm/Kconfig | 2 +- arch/arm/kernel/hw_breakpoint.c | 2 +- arch/arm/mm/Makefile | 2 +- arch/arm/mm/cache-fa.S | 2 +- arch/arm/mm/cache-v4.S | 2 +- arch/arm/mm/cache-v4wb.S | 4 +-- arch/arm/mm/cache-v4wt.S | 2 +- arch/arm/mm/cache-v6.S | 2 +- arch/arm/mm/cache-v7.S | 2 +- arch/arm/mm/cache-v7m.S | 2 +- arch/arm/mm/proc-arm1020.S | 2 +- arch/arm/mm/proc-arm1020e.S | 2 +- arch/arm/mm/proc-arm1022.S | 2 +- arch/arm/mm/proc-arm1026.S | 2 +- arch/arm/mm/proc-arm920.S | 2 +- arch/arm/mm/proc-arm922.S | 2 +- arch/arm/mm/proc-arm925.S | 2 +- arch/arm/mm/proc-arm926.S | 2 +- arch/arm/mm/proc-arm940.S | 2 +- arch/arm/mm/proc-arm946.S | 2 +- arch/arm/mm/proc-feroceon.S | 2 +- arch/arm/mm/proc-mohawk.S | 2 +- arch/arm/mm/proc-xsc3.S | 2 +- arch/arm/mm/tlb-v4.S | 2 +- arch/arm64/Kconfig | 4 +-- arch/arm64/kernel/debug-monitors.c | 2 +- arch/arm64/kernel/traps.c | 4 +-- arch/arm64/kvm/handle_exit.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/riscv/Kconfig | 6 ++--- arch/riscv/include/asm/cfi.h | 4 +-- arch/riscv/kernel/Makefile | 2 +- arch/riscv/net/bpf_jit_comp64.c | 4 +-- arch/riscv/purgatory/Makefile | 2 +- arch/x86/Kconfig | 12 ++++----- arch/x86/include/asm/cfi.h | 4 +-- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/alternative.c | 4 +-- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/purgatory/Makefile | 2 +- drivers/misc/lkdtm/cfi.c | 2 +- include/asm-generic/vmlinux.lds.h | 2 +- include/linux/cfi.h | 6 ++--- include/linux/cfi_types.h | 8 +++--- include/linux/compiler.h | 2 +- init/Kconfig | 4 +-- kernel/Makefile | 2 +- kernel/configs/hardening.config | 4 +-- kernel/module/Kconfig | 2 +- kernel/module/tree_lookup.c | 2 +- lib/Kconfig.debug | 2 +- tools/include/linux/cfi_types.h | 6 ++--- tools/perf/util/include/linux/linkage.h | 2 +- 55 files changed, 100 insertions(+), 90 deletions(-) diff --git a/Makefile b/Makefile index d1adb78c3596a5..437989d6e0be62 100644 --- a/Makefile +++ b/Makefile @@ -1020,7 +1020,7 @@ KBUILD_AFLAGS += -fno-lto export CC_FLAGS_LTO endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI CC_FLAGS_CFI := -fsanitize=kcfi ifdef CONFIG_CFI_ICALL_NORMALIZE_INTEGERS CC_FLAGS_CFI += -fsanitize-cfi-icall-experimental-normalize-integers diff --git a/arch/Kconfig b/arch/Kconfig index d1b4ffd6e08564..97642c08a12458 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -867,22 +867,26 @@ config PROPELLER_CLANG If unsure, say N. -config ARCH_SUPPORTS_CFI_CLANG +config ARCH_SUPPORTS_CFI bool help - An architecture should select this option if it can support Clang's - Control-Flow Integrity (CFI) checking. + An architecture should select this option if it can support Kernel + Control-Flow Integrity (CFI) checking (-fsanitize=kcfi). config ARCH_USES_CFI_TRAPS bool + help + An architecture should select this option if it requires the + .kcfi_traps section for KCFI trap handling. 
-config CFI_CLANG - bool "Use Clang's Control Flow Integrity (CFI)" - depends on ARCH_SUPPORTS_CFI_CLANG +config CFI + bool "Use Kernel Control Flow Integrity (kCFI)" + default CFI_CLANG + depends on ARCH_SUPPORTS_CFI depends on $(cc-option,-fsanitize=kcfi) help - This option enables Clang's forward-edge Control Flow Integrity - (CFI) checking, where the compiler injects a runtime check to each + This option enables forward-edge Control Flow Integrity (CFI) + checking, where the compiler injects a runtime check to each indirect function call to ensure the target is a valid function with the correct static type. This restricts possible call targets and makes it more difficult for an attacker to exploit bugs that allow @@ -891,10 +895,16 @@ config CFI_CLANG https://clang.llvm.org/docs/ControlFlowIntegrity.html +config CFI_CLANG + bool + transitional + help + Transitional config for CFI_CLANG to CFI migration. + config CFI_ICALL_NORMALIZE_INTEGERS bool "Normalize CFI tags for integers" - depends on CFI_CLANG - depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG + depends on CFI + depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS help This option normalizes the CFI tags for integer types so that all integer types of the same size and signedness receive the same CFI @@ -907,7 +917,7 @@ config CFI_ICALL_NORMALIZE_INTEGERS This option is necessary for using CFI with Rust. If unsure, say N. -config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG +config HAVE_CFI_ICALL_NORMALIZE_INTEGERS def_bool y depends on $(cc-option,-fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers) # With GCOV/KASAN we need this fix: https://github.com/llvm/llvm-project/pull/104826 @@ -915,7 +925,7 @@ config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC def_bool y - depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG + depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS depends on RUSTC_VERSION >= 107900 # With GCOV/KASAN we need this fix: https://github.com/rust-lang/rust/pull/129373 depends on (RUSTC_LLVM_VERSION >= 190103 && RUSTC_VERSION >= 108200) || \ @@ -923,7 +933,7 @@ config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC config CFI_PERMISSIVE bool "Use CFI in permissive mode" - depends on CFI_CLANG + depends on CFI help When selected, Control Flow Integrity (CFI) violations result in a warning instead of a kernel panic. 
This option should only be used diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b1f3df39ed4068..36ab8625be7211 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -38,7 +38,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_NEED_CMPXCHG_1_EMU if CPU_V6 select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_CFI_CLANG + select ARCH_SUPPORTS_CFI select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_USE_BUILTIN_BSWAP diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index a12efd0f43e81a..cd4b34c96e35e9 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -904,7 +904,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs) watchpoint_single_step_handler(addr); } -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI static void hw_breakpoint_cfi_handler(struct pt_regs *regs) { /* diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile index a195cd1d3e6dc4..1e220101337148 100644 --- a/arch/arm/mm/Makefile +++ b/arch/arm/mm/Makefile @@ -89,7 +89,7 @@ obj-$(CONFIG_CPU_V6) += proc-v6.o obj-$(CONFIG_CPU_V6K) += proc-v6.o obj-$(CONFIG_CPU_V7) += proc-v7.o proc-v7-bugs.o obj-$(CONFIG_CPU_V7M) += proc-v7m.o -obj-$(CONFIG_CFI_CLANG) += proc.o +obj-$(CONFIG_CFI) += proc.o obj-$(CONFIG_OUTER_CACHE) += l2c-common.o obj-$(CONFIG_CACHE_B15_RAC) += cache-b15-rac.o diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S index 4a3668b52a2db0..e1641799569bc0 100644 --- a/arch/arm/mm/cache-fa.S +++ b/arch/arm/mm/cache-fa.S @@ -112,7 +112,7 @@ SYM_FUNC_END(fa_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(fa_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b fa_coherent_user_range #endif SYM_FUNC_END(fa_coherent_kern_range) diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S index 0e94e5193dbd41..001d7042bd4696 100644 --- a/arch/arm/mm/cache-v4.S +++ b/arch/arm/mm/cache-v4.S @@ -104,7 +104,7 @@ SYM_FUNC_END(v4_coherent_user_range) * - size - region size */ SYM_TYPED_FUNC_START(v4_flush_kern_dcache_area) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4_dma_flush_range #endif SYM_FUNC_END(v4_flush_kern_dcache_area) diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S index ce55a2eef5da40..874fe5310f9a01 100644 --- a/arch/arm/mm/cache-v4wb.S +++ b/arch/arm/mm/cache-v4wb.S @@ -136,7 +136,7 @@ SYM_FUNC_END(v4wb_flush_user_cache_range) */ SYM_TYPED_FUNC_START(v4wb_flush_kern_dcache_area) add r1, r0, r1 -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wb_coherent_user_range #endif SYM_FUNC_END(v4wb_flush_kern_dcache_area) @@ -152,7 +152,7 @@ SYM_FUNC_END(v4wb_flush_kern_dcache_area) * - end - virtual end address */ SYM_TYPED_FUNC_START(v4wb_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wb_coherent_user_range #endif SYM_FUNC_END(v4wb_coherent_kern_range) diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S index a97dc267b3b0d7..2ee62e4b2b0753 100644 --- a/arch/arm/mm/cache-v4wt.S +++ b/arch/arm/mm/cache-v4wt.S @@ -108,7 +108,7 @@ SYM_FUNC_END(v4wt_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(v4wt_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b 
v4wt_coherent_user_range #endif SYM_FUNC_END(v4wt_coherent_kern_range) diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S index 9f415476e2183d..5ceea8965ea19d 100644 --- a/arch/arm/mm/cache-v6.S +++ b/arch/arm/mm/cache-v6.S @@ -117,7 +117,7 @@ SYM_FUNC_END(v6_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v6_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v6_coherent_user_range #endif SYM_FUNC_END(v6_coherent_kern_range) diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 201ca05436fad5..726681fb7d4de9 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -261,7 +261,7 @@ SYM_FUNC_END(v7_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v7_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v7_coherent_user_range #endif SYM_FUNC_END(v7_coherent_kern_range) diff --git a/arch/arm/mm/cache-v7m.S b/arch/arm/mm/cache-v7m.S index 14d719eba729de..7f9cfad2ea2105 100644 --- a/arch/arm/mm/cache-v7m.S +++ b/arch/arm/mm/cache-v7m.S @@ -286,7 +286,7 @@ SYM_FUNC_END(v7m_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v7m_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v7m_coherent_user_range #endif SYM_FUNC_END(v7m_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S index d0ce3414a13e26..4612a4961e817a 100644 --- a/arch/arm/mm/proc-arm1020.S +++ b/arch/arm/mm/proc-arm1020.S @@ -203,7 +203,7 @@ SYM_FUNC_END(arm1020_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1020_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1020_coherent_user_range #endif SYM_FUNC_END(arm1020_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S index 64f031bf6eff5a..b4a8a3a8eda3d4 100644 --- a/arch/arm/mm/proc-arm1020e.S +++ b/arch/arm/mm/proc-arm1020e.S @@ -200,7 +200,7 @@ SYM_FUNC_END(arm1020e_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1020e_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1020e_coherent_user_range #endif SYM_FUNC_END(arm1020e_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S index 42ed5ed0725285..709870e99e1913 100644 --- a/arch/arm/mm/proc-arm1022.S +++ b/arch/arm/mm/proc-arm1022.S @@ -199,7 +199,7 @@ SYM_FUNC_END(arm1022_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1022_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1022_coherent_user_range #endif SYM_FUNC_END(arm1022_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S index b3ae62cd553aac..02f7370a8c5cbf 100644 --- a/arch/arm/mm/proc-arm1026.S +++ b/arch/arm/mm/proc-arm1026.S @@ -194,7 +194,7 @@ SYM_FUNC_END(arm1026_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1026_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1026_coherent_user_range #endif 
SYM_FUNC_END(arm1026_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S index a30df54ad5fae2..4727f4b5b6e8da 100644 --- a/arch/arm/mm/proc-arm920.S +++ b/arch/arm/mm/proc-arm920.S @@ -180,7 +180,7 @@ SYM_FUNC_END(arm920_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm920_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm920_coherent_user_range #endif SYM_FUNC_END(arm920_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S index aac4e048100d01..5a4a3f4f2683b8 100644 --- a/arch/arm/mm/proc-arm922.S +++ b/arch/arm/mm/proc-arm922.S @@ -182,7 +182,7 @@ SYM_FUNC_END(arm922_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm922_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm922_coherent_user_range #endif SYM_FUNC_END(arm922_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S index 035941faeb2ed4..1c4830afe1d395 100644 --- a/arch/arm/mm/proc-arm925.S +++ b/arch/arm/mm/proc-arm925.S @@ -229,7 +229,7 @@ SYM_FUNC_END(arm925_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm925_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm925_coherent_user_range #endif SYM_FUNC_END(arm925_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S index 6f43d6af2d9a7a..a09cc3e02efda4 100644 --- a/arch/arm/mm/proc-arm926.S +++ b/arch/arm/mm/proc-arm926.S @@ -192,7 +192,7 @@ SYM_FUNC_END(arm926_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm926_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm926_coherent_user_range #endif SYM_FUNC_END(arm926_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S index 0d30bb25c42bf1..545c076c36d241 100644 --- a/arch/arm/mm/proc-arm940.S +++ b/arch/arm/mm/proc-arm940.S @@ -153,7 +153,7 @@ SYM_FUNC_END(arm940_coherent_kern_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm940_coherent_user_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm940_flush_kern_dcache_area #endif SYM_FUNC_END(arm940_coherent_user_range) diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S index 27750ace2cedaa..f3d4e18c3fba5a 100644 --- a/arch/arm/mm/proc-arm946.S +++ b/arch/arm/mm/proc-arm946.S @@ -173,7 +173,7 @@ SYM_FUNC_END(arm946_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm946_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm946_coherent_user_range #endif SYM_FUNC_END(arm946_coherent_kern_range) diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S index f67b2ffac85411..7f08d06c962539 100644 --- a/arch/arm/mm/proc-feroceon.S +++ b/arch/arm/mm/proc-feroceon.S @@ -208,7 +208,7 @@ SYM_FUNC_END(feroceon_flush_user_cache_range) */ .align 5 SYM_TYPED_FUNC_START(feroceon_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b feroceon_coherent_user_range #endif SYM_FUNC_END(feroceon_coherent_kern_range) diff --git a/arch/arm/mm/proc-mohawk.S 
b/arch/arm/mm/proc-mohawk.S index 8e9f38da863a52..4669c63e3121d0 100644 --- a/arch/arm/mm/proc-mohawk.S +++ b/arch/arm/mm/proc-mohawk.S @@ -163,7 +163,7 @@ SYM_FUNC_END(mohawk_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(mohawk_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b mohawk_coherent_user_range #endif SYM_FUNC_END(mohawk_coherent_kern_range) diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S index 14927b38045244..fd25634a2ed5dc 100644 --- a/arch/arm/mm/proc-xsc3.S +++ b/arch/arm/mm/proc-xsc3.S @@ -223,7 +223,7 @@ SYM_FUNC_END(xsc3_flush_user_cache_range) * it also trashes the mini I-cache used by JTAG debuggers. */ SYM_TYPED_FUNC_START(xsc3_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b xsc3_coherent_user_range #endif SYM_FUNC_END(xsc3_coherent_kern_range) diff --git a/arch/arm/mm/tlb-v4.S b/arch/arm/mm/tlb-v4.S index 09ff69008d94d2..079774a02be631 100644 --- a/arch/arm/mm/tlb-v4.S +++ b/arch/arm/mm/tlb-v4.S @@ -52,7 +52,7 @@ SYM_FUNC_END(v4_flush_user_tlb_range) * - start - virtual address (may not be aligned) * - end - virtual address (may not be aligned) */ -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI SYM_TYPED_FUNC_START(v4_flush_kern_tlb_range) b .v4_flush_kern_tlb_range SYM_FUNC_END(v4_flush_kern_tlb_range) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64d..1e38b8885a4627 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -100,7 +100,7 @@ config ARM64 select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK select ARCH_SUPPORTS_LTO_CLANG if CPU_LITTLE_ENDIAN select ARCH_SUPPORTS_LTO_CLANG_THIN - select ARCH_SUPPORTS_CFI_CLANG + select ARCH_SUPPORTS_CFI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING @@ -212,7 +212,7 @@ config ARM64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS \ if DYNAMIC_FTRACE_WITH_ARGS && DYNAMIC_FTRACE_WITH_CALL_OPS select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS \ - if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG && \ + if (DYNAMIC_FTRACE_WITH_ARGS && !CFI && \ (CC_IS_CLANG || !CC_OPTIMIZE_FOR_SIZE)) select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 110d9ff54174f7..ebf010443e22bc 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -212,7 +212,7 @@ static int call_el1_break_hook(struct pt_regs *regs, unsigned long esr) if (esr_brk_comment(esr) == BUG_BRK_IMM) return bug_brk_handler(regs, esr); - if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) + if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) return cfi_brk_handler(regs, esr); if (esr_brk_comment(esr) == FAULT_BRK_IMM) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f528b6041f6a80..5041817af267f8 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -1015,7 +1015,7 @@ int bug_brk_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_HANDLED; } -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) { unsigned long target; @@ -1039,7 +1039,7 @@ int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ int 
reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr) { diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a598072f36d2ca..8bdb1eed090abf 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -545,7 +545,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else print_nvhe_hyp_panic("BUG", panic_addr); - } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) { + } else if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) { kvm_nvhe_report_cfi_failure(panic_addr); } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) && ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 52ffe115a8c47c..28996e0a9b00f2 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -185,7 +185,7 @@ static inline void emit_bti(u32 insn, struct jit_ctx *ctx) static inline void emit_kcfi(u32 hash, struct jit_ctx *ctx) { - if (IS_ENABLED(CONFIG_CFI_CLANG)) + if (IS_ENABLED(CONFIG_CFI)) emit_u32_data(hash, ctx); } diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659ed8..6043ad82b73cc8 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -60,7 +60,7 @@ config RISCV select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW # clang >= 17: https://github.com/llvm/llvm-project/commit/62fa708ceb027713b386c7e0efda994f8bdc27e2 - select ARCH_SUPPORTS_CFI_CLANG if CLANG_VERSION >= 170000 + select ARCH_SUPPORTS_CFI if (!CC_IS_CLANG || CLANG_VERSION >= 170000) select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_HUGETLBFS if MMU @@ -76,7 +76,7 @@ config RISCV select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_SYM_ANNOTATIONS - select ARCH_USES_CFI_TRAPS if CFI_CLANG + select ARCH_USES_CFI_TRAPS if CFI select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if MMU select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS @@ -154,7 +154,7 @@ config RISCV select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE) select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS - select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG) + select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI) select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/riscv/include/asm/cfi.h b/arch/riscv/include/asm/cfi.h index 4508aaa7a2fdb1..710aa8192edd97 100644 --- a/arch/riscv/include/asm/cfi.h +++ b/arch/riscv/include/asm/cfi.h @@ -11,7 +11,7 @@ struct pt_regs; -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); #define __bpfcall #else @@ -19,6 +19,6 @@ static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { return BUG_TRAP_TYPE_NONE; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #endif /* _ASM_RISCV_CFI_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index c7b542573407c8..f60fce69b7259f 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -113,7 +113,7 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o -obj-$(CONFIG_CFI_CLANG) += cfi.o 
+obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_EFI) += efi.o obj-$(CONFIG_COMPAT) += compat_syscall_table.o diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 10e01ff06312d9..24ba546a1c0eb1 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -18,7 +18,7 @@ #define RV_MAX_REG_ARGS 8 #define RV_FENTRY_NINSNS 2 #define RV_FENTRY_NBYTES (RV_FENTRY_NINSNS * 4) -#define RV_KCFI_NINSNS (IS_ENABLED(CONFIG_CFI_CLANG) ? 1 : 0) +#define RV_KCFI_NINSNS (IS_ENABLED(CONFIG_CFI) ? 1 : 0) /* imm that allows emit_imm to emit max count insns */ #define RV_MAX_COUNT_IMM 0x7FFF7FF7FF7FF7FF @@ -469,7 +469,7 @@ static int emit_call(u64 addr, bool fixed_addr, struct rv_jit_context *ctx) static inline void emit_kcfi(u32 hash, struct rv_jit_context *ctx) { - if (IS_ENABLED(CONFIG_CFI_CLANG)) + if (IS_ENABLED(CONFIG_CFI)) emit(hash, ctx); } diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile index 240592e3f5c2f5..530e497ca2f9ce 100644 --- a/arch/riscv/purgatory/Makefile +++ b/arch/riscv/purgatory/Makefile @@ -71,7 +71,7 @@ ifdef CONFIG_STACKPROTECTOR_STRONG PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI) endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100eb..b6da2d37cfd166 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -127,8 +127,8 @@ config X86 select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 - select ARCH_SUPPORTS_CFI_CLANG if X86_64 - select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG + select ARCH_SUPPORTS_CFI if X86_64 + select ARCH_USES_CFI_TRAPS if X86_64 && CFI select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_SUPPORTS_RT @@ -2396,11 +2396,11 @@ config FUNCTION_PADDING_CFI default 3 if FUNCTION_ALIGNMENT_8B default 0 -# Basically: FUNCTION_ALIGNMENT - 5*CFI_CLANG +# Basically: FUNCTION_ALIGNMENT - 5*CFI # except Kconfig can't do arithmetic :/ config FUNCTION_PADDING_BYTES int - default FUNCTION_PADDING_CFI if CFI_CLANG + default FUNCTION_PADDING_CFI if CFI default FUNCTION_ALIGNMENT config CALL_PADDING @@ -2410,7 +2410,7 @@ config CALL_PADDING config FINEIBT def_bool y - depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE + depends on X86_KERNEL_IBT && CFI && MITIGATION_RETPOLINE select CALL_PADDING config FINEIBT_BHI @@ -2427,7 +2427,7 @@ config CALL_THUNKS config PREFIX_SYMBOLS def_bool y - depends on CALL_PADDING && !CFI_CLANG + depends on CALL_PADDING && !CFI menuconfig CPU_MITIGATIONS bool "Mitigations for CPU vulnerabilities" diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 1751f1eb95ef67..976b90a3d190ea 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -113,7 +113,7 @@ extern bhi_thunk __bhi_args_end[]; struct pt_regs; -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); #define __bpfcall @@ -157,7 +157,7 @@ static inline int cfi_get_func_arity(void *func) { return 0; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #if HAS_KERNEL_IBT == 1 #define CFI_NOSEAL(x) asm(IBT_NOSEAL(__stringify(x))) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0d2a6d953be911..bc184dd38d993b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -148,7 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o 
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7bde68247b5fc5..79ae9cb5001906 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1170,7 +1170,7 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #ifdef CONFIG_CFI_AUTO_DEFAULT # define __CFI_DEFAULT CFI_AUTO -#elif defined(CONFIG_CFI_CLANG) +#elif defined(CONFIG_CFI) # define __CFI_DEFAULT CFI_KCFI #else # define __CFI_DEFAULT CFI_OFF @@ -1182,7 +1182,7 @@ enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; bool cfi_bhi __ro_after_init = false; #endif -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI u32 cfi_get_func_hash(void *func) { u32 hash; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6079d15dab8ca7..3863d7709386fc 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -339,7 +339,7 @@ static bool can_probe(unsigned long paddr) if (is_exception_insn(&insn)) return false; - if (IS_ENABLED(CONFIG_CFI_CLANG)) { + if (IS_ENABLED(CONFIG_CFI)) { /* * The compiler generates the following instruction sequence * for indirect call checks and cfi.c decodes this; diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index e0a607a14e7ed7..5ce1d426300004 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -57,7 +57,7 @@ ifdef CONFIG_MITIGATION_RETPOLINE PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS) endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI) endif diff --git a/drivers/misc/lkdtm/cfi.c b/drivers/misc/lkdtm/cfi.c index 6a33889d0902af..c3971f7caa65ed 100644 --- a/drivers/misc/lkdtm/cfi.c +++ b/drivers/misc/lkdtm/cfi.c @@ -43,7 +43,7 @@ static void lkdtm_CFI_FORWARD_PROTO(void) lkdtm_indirect_call((void *)lkdtm_increment_int); pr_err("FAIL: survived mismatched prototype function call!\n"); - pr_expected_config(CONFIG_CFI_CLANG); + pr_expected_config(CONFIG_CFI); } /* diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ae2d2359b79e9e..a65a87366c48bc 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -157,7 +157,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define PATCHABLE_DISCARDS *(__patchable_function_entries) #endif -#ifndef CONFIG_ARCH_SUPPORTS_CFI_CLANG +#ifndef CONFIG_ARCH_SUPPORTS_CFI /* * Simply points to ftrace_stub, but with the proper protocol. 
* Defined by the linker script in linux/vmlinux.lds.h diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 52a98886a455d0..1fd22ea6eba4fe 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,7 +11,7 @@ #include #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI extern bool cfi_warn; enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, @@ -52,7 +52,7 @@ static inline u32 cfi_get_func_hash(void *func) extern u32 cfi_bpf_hash; extern u32 cfi_bpf_subprog_hash; -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ static inline int cfi_get_offset(void) { return 0; } static inline u32 cfi_get_func_hash(void *func) { return 0; } @@ -60,7 +60,7 @@ static inline u32 cfi_get_func_hash(void *func) { return 0; } #define cfi_bpf_hash 0U #define cfi_bpf_subprog_hash 0U -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifdef CONFIG_ARCH_USES_CFI_TRAPS bool is_cfi_trap(unsigned long addr); diff --git a/include/linux/cfi_types.h b/include/linux/cfi_types.h index 685f7181780f92..a86af9bc8bdc47 100644 --- a/include/linux/cfi_types.h +++ b/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_ type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,12 +29,12 @@ #define SYM_TYPED_START(name, linkage, align...) \ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) \ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ @@ -43,7 +43,7 @@ #else /* __ASSEMBLY__ */ -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI #define DEFINE_CFI_TYPE(name, func) \ /* \ * Force a reference to the function so the compiler generates \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 6f04a1d8c72094..fb27da2221ee49 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -248,7 +248,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ -#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_CFI) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) /* * Force a reference to the external symbol so the compiler generates * __kcfi_typid. 
diff --git a/init/Kconfig b/init/Kconfig index 83632025121937..67f10d8a33b77d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2063,8 +2063,8 @@ config RUST depends on !GCC_PLUGIN_RANDSTRUCT depends on !RANDSTRUCT depends on !DEBUG_INFO_BTF || (PAHOLE_HAS_LANG_EXCLUDE && !LTO) - depends on !CFI_CLANG || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC - select CFI_ICALL_NORMALIZE_INTEGERS if CFI_CLANG + depends on !CFI || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC + select CFI_ICALL_NORMALIZE_INTEGERS if CFI depends on !CALL_PADDING || RUSTC_VERSION >= 108100 depends on !KASAN_SW_TAGS depends on !(MITIGATION_RETHUNK && KASAN) || RUSTC_VERSION >= 108300 diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235f2..27e0e6a336107b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -122,7 +122,7 @@ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 64caaf997fc089..7c3924614e01de 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -93,8 +93,8 @@ CONFIG_SECCOMP_FILTER=y # Provides some protections against SYN flooding. CONFIG_SYN_COOKIES=y -# Enable Kernel Control Flow Integrity (currently Clang only). -CONFIG_CFI_CLANG=y +# Enable Kernel Control Flow Integrity. +CONFIG_CFI=y # CONFIG_CFI_PERMISSIVE is not set # Attack surface reduction: do not autoload TTY line disciplines. diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 39278737bb68fd..2a1beebf1d37a1 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -460,6 +460,6 @@ config UNUSED_KSYMS_WHITELIST config MODULES_TREE_LOOKUP def_bool y - depends on PERF_EVENTS || TRACING || CFI_CLANG + depends on PERF_EVENTS || TRACING || CFI endif # MODULES diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index d3204c5c74eb7c..f8e8c126705cbd 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -14,7 +14,7 @@ * Use a latched RB-tree for __module_address(); this allows us to use * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * This is conditional on PERF_EVENTS || TRACING || CFI because those can * really hit __module_address() hard by doing a lot of stack unwinding; * potentially from NMI context. */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dc0e0c6ed075e9..e3e69df19e7820 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2894,7 +2894,7 @@ config FORTIFY_KUNIT_TEST config LONGEST_SYM_KUNIT_TEST tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS depends on KUNIT && KPROBES - depends on !PREFIX_SYMBOLS && !CFI_CLANG && !GCOV_KERNEL + depends on !PREFIX_SYMBOLS && !CFI && !GCOV_KERNEL default KUNIT_ALL_TESTS help Tests the longest symbol possible diff --git a/tools/include/linux/cfi_types.h b/tools/include/linux/cfi_types.h index 6b87136757655c..2e098274e45cd5 100644 --- a/tools/include/linux/cfi_types.h +++ b/tools/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_ type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,12 +29,12 @@ #define SYM_TYPED_START(name, linkage, align...) 
\ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) \ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h index 89979ca23c3f1c..34e2fdfe7300ea 100644 --- a/tools/perf/util/include/linux/linkage.h +++ b/tools/perf/util/include/linux/linkage.h @@ -120,7 +120,7 @@ #endif // In the kernel sources (include/linux/cfi_types.h), this has a different -// definition when CONFIG_CFI_CLANG is used, for tools/ just use the !clang +// definition when CONFIG_CFI is used, for tools/ just use the !cfi // definition: #ifndef SYM_TYPED_START #define SYM_TYPED_START(name, linkage, align...) \ From d0ca0df179c4b21e2a6c4a4fb637aa8fa14575cb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 24 Sep 2025 13:18:22 -0700 Subject: [PATCH 2797/3206] crypto: af_alg - Fix incorrect boolean values in af_alg_ctx Commit 1b34cbbf4f01 ("crypto: af_alg - Disallow concurrent writes in af_alg_sendmsg") changed some fields from bool to 1-bit bitfields of type u32. However, some assignments to these fields, specifically 'more' and 'merge', assign values greater than 1. These relied on C's implicit conversion to bool, such that zero becomes false and nonzero becomes true. With 1-bit bitfields of type u32, however, the value is instead taken mod 2, resulting in 0 being assigned in some cases where 1 was intended. Fix this by restoring the bool type. Fixes: 1b34cbbf4f01 ("crypto: af_alg - Disallow concurrent writes in af_alg_sendmsg") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Linus Torvalds --- include/crypto/if_alg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index 0c70f3a5557505..107b797c33ecf7 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -152,7 +152,7 @@ struct af_alg_ctx { size_t used; atomic_t rcvused; - u32 more:1, + bool more:1, merge:1, enc:1, write:1, From 23199d2aa6dcaf6dd2da772f93d2c94317d71459 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Wed, 17 Sep 2025 10:38:20 +0530 Subject: [PATCH 2798/3206] tools/cpupower: Fix incorrect size in cpuidle_state_disable() Fix the incorrect size parameter passed to cpuidle_state_write_file() in cpuidle_state_disable(). The function was incorrectly using sizeof(disable), which returns the size of the unsigned int variable (4 bytes) instead of the actual length of the string stored in the 'value' buffer. Since 'value' is populated with snprintf() to contain the string representation of the disable value, we should use the length returned by snprintf() to get the correct string length for writing to the sysfs file. This ensures the correct number of bytes is written to the cpuidle state disable file in sysfs.
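To see the sizeof() confusion in isolation, here is a minimal standalone userspace sketch (illustrative only, not cpupower code; the buffer name and size are invented):

  #include <stdio.h>

  int main(void)
  {
          unsigned int disable = 1;
          char value[32];
          /* snprintf() returns the string length, here 1 for "1" */
          int len = snprintf(value, sizeof(value), "%u", disable);

          /* sizeof(disable) is 4 on common ABIs no matter how long the
           * string is, so writing that many bytes would also send the
           * NUL terminator and stale bytes beyond it to sysfs. */
          printf("len=%d, sizeof(disable)=%zu\n", len, sizeof(disable));
          return 0;
  }
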
Link: https://lore.kernel.org/r/20250917050820.1785377-1-kaushlendra.kumar@intel.com
Signed-off-by: Kaushlendra Kumar
Signed-off-by: Shuah Khan
---
 tools/power/cpupower/lib/cpuidle.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/power/cpupower/lib/cpuidle.c b/tools/power/cpupower/lib/cpuidle.c
index 0ecac009273ce8..f2c1139adf7169 100644
--- a/tools/power/cpupower/lib/cpuidle.c
+++ b/tools/power/cpupower/lib/cpuidle.c
@@ -233,6 +233,7 @@ int cpuidle_state_disable(unsigned int cpu,
 {
 	char value[SYSFS_PATH_MAX];
 	int bytes_written;
+	int len;

 	if (cpuidle_state_count(cpu) <= idlestate)
 		return -1;
@@ -241,10 +242,10 @@ int cpuidle_state_disable(unsigned int cpu,
 	    idlestate_value_files[IDLESTATE_DISABLE]))
 		return -2;

-	snprintf(value, SYSFS_PATH_MAX, "%u", disable);
+	len = snprintf(value, SYSFS_PATH_MAX, "%u", disable);

 	bytes_written = cpuidle_state_write_file(cpu, idlestate, "disable",
-						 value, sizeof(disable));
+						 value, len);
 	if (bytes_written)
 		return 0;
 	return -3;

From f32a26fab3672e60f622bd7461bf978fc72f29ec Mon Sep 17 00:00:00 2001
From: Viacheslav Dubeyko
Date: Wed, 24 Sep 2025 16:24:41 -0700
Subject: [PATCH 2799/3206] hfs/hfsplus: rework debug output subsystem

Currently, HFS/HFS+ has a very obsolete and inconvenient debug output
subsystem, and the code is duplicated between the HFS and HFS+
drivers.

This patch introduces linux/hfs_common.h for gathering common
declarations, inline functions, and common short methods. Currently,
this file contains only the hfs_dbg() macro, which employs pr_debug()
to print debug-level messages conditionally.

So, now, it is possible to enable the debug output by means of:

echo 'file extent.c +p' > /proc/dynamic_debug/control
echo 'func hfsplus_evict_inode +p' > /proc/dynamic_debug/control

And debug output looks like this:

hfs: pid 5831:fs/hfs/catalog.c:228 hfs_cat_delete(): delete_cat: 00,48
hfs: pid 5831:fs/hfs/extent.c:484 hfs_file_truncate(): truncate: 48, 409600 -> 0
hfs: pid 5831:fs/hfs/extent.c:212 hfs_dump_extent():
hfs: pid 5831:fs/hfs/extent.c:214 hfs_dump_extent(): 78:4
hfs: pid 5831:fs/hfs/extent.c:214 hfs_dump_extent(): 0:0
hfs: pid 5831:fs/hfs/extent.c:214 hfs_dump_extent(): 0:0

v4: Reworked the debug messages and added the new HFS/HFS+ shared
declarations file to the MAINTAINERS file.

v5: Cleaned up the debug output and fixed several typos, as suggested
by Yangtao Li.
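
For reference, the whole module's output can presumably also be toggled
at once, rather than per file or per function (standard dynamic-debug
syntax; 'hfsplus' here stands for whichever of the two modules is
loaded):

echo 'module hfsplus +p' > /proc/dynamic_debug/control
echo 'module hfsplus -p' > /proc/dynamic_debug/control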
Signed-off-by: Viacheslav Dubeyko cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org cc: Johannes Thumshirn Signed-off-by: Viacheslav Dubeyko --- MAINTAINERS | 2 ++ fs/hfs/bfind.c | 4 ++-- fs/hfs/bitmap.c | 4 ++-- fs/hfs/bnode.c | 28 ++++++++++++++-------------- fs/hfs/brec.c | 8 ++++---- fs/hfs/btree.c | 2 +- fs/hfs/catalog.c | 14 +++++++------- fs/hfs/extent.c | 19 ++++++++++--------- fs/hfs/hfs_fs.h | 33 +-------------------------------- fs/hfs/inode.c | 4 ++-- fs/hfsplus/attributes.c | 8 ++++---- fs/hfsplus/bfind.c | 4 ++-- fs/hfsplus/bitmap.c | 10 +++++----- fs/hfsplus/bnode.c | 28 ++++++++++++++-------------- fs/hfsplus/brec.c | 10 +++++----- fs/hfsplus/btree.c | 4 ++-- fs/hfsplus/catalog.c | 6 +++--- fs/hfsplus/extents.c | 27 +++++++++++++-------------- fs/hfsplus/hfsplus_fs.h | 35 +---------------------------------- fs/hfsplus/super.c | 16 ++++++++++++---- fs/hfsplus/xattr.c | 4 ++-- include/linux/hfs_common.h | 20 ++++++++++++++++++++ 22 files changed, 128 insertions(+), 162 deletions(-) create mode 100644 include/linux/hfs_common.h diff --git a/MAINTAINERS b/MAINTAINERS index f5e1540cb70244..83579b15ba43ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10794,6 +10794,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git F: Documentation/filesystems/hfs.rst F: fs/hfs/ +F: include/linux/hfs_common.h HFSPLUS FILESYSTEM M: Viacheslav Dubeyko @@ -10804,6 +10805,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git F: Documentation/filesystems/hfsplus.rst F: fs/hfsplus/ +F: include/linux/hfs_common.h HGA FRAMEBUFFER DRIVER M: Ferenc Bakonyi diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index e46f650b5e9c26..c2f840c49e60b5 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -26,7 +26,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFS_CAT_CNID: @@ -48,7 +48,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index 28307bc9ec1ee1..5e84833a4743ad 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) } } - hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + hfs_dbg("pos %u, num_bits %u\n", pos, *num_bits); HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: @@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if (!count) return 0; - hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count); + hfs_dbg("start %u, count %u\n", start, count); /* are all of the bits in range? 
*/ if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index e8cd1a31f2470c..fcfffe75d84e70 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -200,7 +200,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, { struct page *src_page, *dst_page; - hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -221,7 +221,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page *page; void *ptr; - hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -243,16 +243,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - hfs_dbg_cont(BNODE_MOD, " %d", key_off); + hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -260,18 +260,18 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; else tmp = node->tree->max_key_len + 1; - hfs_dbg_cont(BNODE_MOD, " (%d,%d", - tmp, hfs_bnode_read_u8(node, key_off)); + hfs_dbg(" (%d,%d", + tmp, hfs_bnode_read_u8(node, key_off)); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u8(node, key_off); - hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + hfs_dbg(" (%d)", tmp); } } - hfs_dbg_cont(BNODE_MOD, "\n"); + hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -361,7 +361,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + hfs_dbg("cnid %d, node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); @@ -401,7 +401,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -546,7 +546,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -559,7 +559,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index b01db1fae147cd..e49a141c87e517 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -94,7 +94,7 @@ int hfs_brec_insert(struct hfs_find_data 
*fd, void *entry, int entry_len) end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -192,7 +192,7 @@ int hfs_brec_remove(struct hfs_find_data *fd) mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -261,7 +261,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg("this %d, new %d, next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -397,7 +397,7 @@ static int hfs_brec_update_parent(struct hfs_find_data *fd) newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1; else fd->keylength = newkeylen = tree->max_key_len + 1; - hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index e86e1e235658fa..22e62fe7448bf8 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -364,7 +364,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg("node %u\n", node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index b6b18ee6813510..caebabb6642f16 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -87,7 +87,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i int entry_size; int err; - hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); if (dir->i_size >= HFS_MAX_VALENCE) return -ENOSPC; @@ -238,7 +238,7 @@ int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) s64 leaf_tail; s64 node_id; - hfs_dbg(CAT_MOD, "correct next unused CNID: cnid %u, next_id %lld\n", + hfs_dbg("cnid %u, next_id %lld\n", cnid, atomic64_read(&HFS_SB(sb)->next_id)); if ((cnid + 1) < atomic64_read(&HFS_SB(sb)->next_id)) { @@ -274,7 +274,7 @@ int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) return -ENOENT; } - hfs_dbg(CAT_MOD, "node_id %lld, leaf_tail %lld, leaf_head %lld\n", + hfs_dbg("node %lld, leaf_tail %lld, leaf_head %lld\n", node_id, leaf_tail, leaf_head); hfs_bnode_dump(node); @@ -309,13 +309,13 @@ int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) if (rec.type == HFS_CDR_DIR) { found_cnid = be32_to_cpu(rec.dir.DirID); - hfs_dbg(CAT_MOD, "found_cnid %u\n", found_cnid); + hfs_dbg("found_cnid %u\n", found_cnid); hfs_set_next_unused_CNID(sb, cnid, found_cnid); hfs_bnode_put(node); return 0; } else if (rec.type == HFS_CDR_FIL) { found_cnid = be32_to_cpu(rec.file.FlNum); - hfs_dbg(CAT_MOD, "found_cnid %u\n", found_cnid); + hfs_dbg("found_cnid %u\n", found_cnid); hfs_set_next_unused_CNID(sb, cnid, found_cnid); hfs_bnode_put(node); return 0; @@ -343,7 +343,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) struct hfs_readdir_data *rd; int res, type; - hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? 
str->name : NULL, cnid); + hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid); sb = dir->i_sb; res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (res) @@ -417,7 +417,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, int entry_size, type; int err; - hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg("cnid %u - (ino %lu, name %s) - (ino %lu, name %s)\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 580c62981dbd3d..a097908b269d0a 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -209,12 +209,12 @@ static void hfs_dump_extent(struct hfs_extent *extent) { int i; - hfs_dbg(EXTENT, " "); + hfs_dbg("extent: "); for (i = 0; i < 3; i++) - hfs_dbg_cont(EXTENT, " %u:%u", - be16_to_cpu(extent[i].block), - be16_to_cpu(extent[i].count)); - hfs_dbg_cont(EXTENT, "\n"); + hfs_dbg(" block %u, count %u", + be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + hfs_dbg("\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, @@ -411,10 +411,11 @@ int hfs_extend_file(struct inode *inode) goto out; } - hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { - hfs_dbg(EXTENT, "first extents\n"); + hfs_dbg("first_extent: start %u, len %u\n", + start, len); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); @@ -456,7 +457,7 @@ int hfs_extend_file(struct inode *inode) return res; insert_extent: - hfs_dbg(EXTENT, "insert new extent\n"); + hfs_dbg("insert new extent\n"); res = hfs_ext_write_extent(inode); if (res) goto out; @@ -481,7 +482,7 @@ void hfs_file_truncate(struct inode *inode) u32 size; int res; - hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", + hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)HFS_I(inode)->phys_size, inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index c732a066de787b..fff149af89da31 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -9,12 +9,6 @@ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include @@ -24,35 +18,10 @@ #include #include +#include #include "hfs.h" -#define DBG_BNODE_REFS 0x00000001 -#define DBG_BNODE_MOD 0x00000002 -#define DBG_CAT_MOD 0x00000004 -#define DBG_INODE 0x00000008 -#define DBG_SUPER 0x00000010 -#define DBG_EXTENT 0x00000020 -#define DBG_BITMAP 0x00000040 - -//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP) -//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) -#define DBG_MASK (0) - -#define hfs_dbg(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - -#define hfs_dbg_cont(flg, fmt, ...) 
\ -do { \ - if (DBG_##flg & DBG_MASK) \ - pr_cont(fmt, ##__VA_ARGS__); \ -} while (0) - - /* * struct hfs_inode_info * diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 490b6ad5d9817a..9cd449913dc82a 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -249,7 +249,7 @@ void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; - hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX); atomic64_dec(&HFS_SB(sb)->folder_count); @@ -436,7 +436,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_cat_rec rec; int res; - hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); res = hfs_ext_write_extent(inode); if (res) return res; diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index eeebe80c6be4aa..ba26980cc5035c 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -139,7 +139,7 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid, { int err = 0; - hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + hfs_dbg("name %s, cnid %d\n", name ? name : NULL, cnid); if (!HFSPLUS_SB(sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); @@ -201,7 +201,7 @@ int hfsplus_create_attr(struct inode *inode, int entry_size; int err; - hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n", + hfs_dbg("name %s, ino %ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -310,7 +310,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) struct super_block *sb = inode->i_sb; struct hfs_find_data fd; - hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n", + hfs_dbg("name %s, ino %ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -356,7 +356,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) int err = 0; struct hfs_find_data fd; - hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid); + hfs_dbg("cnid %d\n", cnid); if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 26ebac4c604242..afc9c89e8c6aff 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); mutex_lock_nested(&tree->tree_lock, hfsplus_btree_lock_class(tree)); @@ -34,7 +34,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index bd8dcea8558800..1b3af8c87cadb5 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -31,7 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, if (!len) return size; - hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + hfs_dbg("size %u, offset %u, len %u\n", size, offset, len); mutex_lock(&sbi->alloc_mutex); mapping = sbi->alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -90,14 +90,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, else end = pptr + ((size + 31) & 
(PAGE_CACHE_BITS - 1)) / 32; } - hfs_dbg(BITMAP, "bitmap full\n"); + hfs_dbg("bitmap full\n"); start = size; goto out; found: start = offset + (curr - pptr) * 32 + i; if (start >= size) { - hfs_dbg(BITMAP, "bitmap full\n"); + hfs_dbg("bitmap full\n"); goto out; } /* do any partial u32 at the start */ @@ -155,7 +155,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); - hfs_dbg(BITMAP, "-> %u,%u\n", start, *max); + hfs_dbg("start %u, max %u\n", start, *max); out: mutex_unlock(&sbi->alloc_mutex); return start; @@ -174,7 +174,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) return 0; - hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count); + hfs_dbg("offset %u, count %u\n", offset, count); /* are all of the bits in range? */ if ((offset + count) > sbi->total_blocks) return -ENOENT; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 407d5152eb411e..63e652ad1e0def 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -173,7 +173,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct page **src_page, **dst_page; int l; - hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -231,7 +231,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) void *src_ptr, *dst_ptr; int l; - hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -351,16 +351,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - hfs_dbg(BNODE_MOD, " %d", key_off); + hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -369,17 +369,17 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; - hfs_dbg_cont(BNODE_MOD, " (%d", tmp); + hfs_dbg(" (%d", tmp); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u16(node, key_off); - hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + hfs_dbg(" (%d)", tmp); } } - hfs_dbg_cont(BNODE_MOD, "\n"); + hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -415,7 +415,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node) /* move down? 
*/ if (!node->prev && !node->next) - hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n"); + hfs_dbg("btree delete level\n"); if (!node->parent) { tree->root = 0; tree->depth = 0; @@ -470,7 +470,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + hfs_dbg("cnid %d, node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); @@ -510,7 +510,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -656,7 +656,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -669,7 +669,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 1918544a78716e..b4645102feecd4 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -92,7 +92,7 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -193,7 +193,7 @@ int hfs_brec_remove(struct hfs_find_data *fd) mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -246,7 +246,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg("this %d - new %d - next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -383,7 +383,7 @@ static int hfs_brec_update_parent(struct hfs_find_data *fd) newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; - hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; @@ -395,7 +395,7 @@ static int hfs_brec_update_parent(struct hfs_find_data *fd) end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - hfs_dbg(BNODE_MOD, "splitting index node\n"); + hfs_dbg("splitting index node\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index fe6a54c4083c34..7cc5aea145720c 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -434,7 +434,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap_local(data); nidx = 
node->next; if (!nidx) { - hfs_dbg(BNODE_MOD, "create new bmap node\n"); + hfs_dbg("create new bmap node\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -460,7 +460,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg("node %u\n", node->this); BUG_ON(!node->this); tree = node->tree; nidx = node->this; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 1995bafee83901..02c1eee4a4b860 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -259,7 +259,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, int entry_size; int err; - hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) @@ -336,7 +336,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) int err, off; u16 type; - hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; @@ -441,7 +441,7 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err; - hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg("cnid %u - ino %lu, name %s - ino %lu, name %s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index b1699b3c246ae4..8e886514d27f1e 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -275,7 +275,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, mutex_unlock(&hip->extents_lock); done: - hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n", + hfs_dbg("ino %lu, iblock %llu - dblock %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; @@ -298,12 +298,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent) { int i; - hfs_dbg(EXTENT, " "); + hfs_dbg("extent "); for (i = 0; i < 8; i++) - hfs_dbg_cont(EXTENT, " %u:%u", - be32_to_cpu(extent[i].start_block), - be32_to_cpu(extent[i].block_count)); - hfs_dbg_cont(EXTENT, "\n"); + hfs_dbg(" start_block %u, block_count %u", + be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + hfs_dbg("\n"); } static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, @@ -359,8 +359,7 @@ static int hfsplus_free_extents(struct super_block *sb, if (count <= block_nr) { err = hfsplus_block_free(sb, start, count); if (err) { - pr_err("can't free extent\n"); - hfs_dbg(EXTENT, " start: %u count: %u\n", + pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = 0; @@ -370,8 +369,7 @@ static int hfsplus_free_extents(struct super_block *sb, count -= block_nr; err = hfsplus_block_free(sb, start + count, block_nr); if (err) { - pr_err("can't free extent\n"); - hfs_dbg(EXTENT, " start: %u count: %u\n", + pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = cpu_to_be32(count); @@ -478,11 +476,12 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) goto out; } - hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { - hfs_dbg(EXTENT, "first extents\n"); + hfs_dbg("first_extent: start %u, 
len %u\n", + start, len); /* no extents yet */ hip->first_extents[0].start_block = cpu_to_be32(start); hip->first_extents[0].block_count = cpu_to_be32(len); @@ -521,7 +520,7 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) return res; insert_extent: - hfs_dbg(EXTENT, "insert new extent\n"); + hfs_dbg("insert new extent\n"); res = hfsplus_ext_write_extent_locked(inode); if (res) goto out; @@ -546,7 +545,7 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n", + hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 9dd18de0bc891b..89e8b19c127b0a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -11,47 +11,14 @@ #ifndef _LINUX_HFSPLUS_FS_H #define _LINUX_HFSPLUS_FS_H -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include #include #include +#include #include "hfsplus_raw.h" -#define DBG_BNODE_REFS 0x00000001 -#define DBG_BNODE_MOD 0x00000002 -#define DBG_CAT_MOD 0x00000004 -#define DBG_INODE 0x00000008 -#define DBG_SUPER 0x00000010 -#define DBG_EXTENT 0x00000020 -#define DBG_BITMAP 0x00000040 -#define DBG_ATTR_MOD 0x00000080 - -#if 0 -#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) -#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) -#endif -#define DBG_MASK (0) - -#define hfs_dbg(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - -#define hfs_dbg_cont(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - pr_cont(fmt, ##__VA_ARGS__); \ -} while (0) - /* Runtime config options */ #define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */ diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 77ec048021a01c..16bc4abc67e08f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -163,7 +163,7 @@ static int hfsplus_write_inode(struct inode *inode, { int err; - hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); err = hfsplus_ext_write_extent(inode); if (err) @@ -178,7 +178,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { - hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { @@ -197,7 +197,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - hfs_dbg(SUPER, "hfsplus_sync_fs\n"); + hfs_dbg("starting...\n"); /* * Explicitly write out the special metadata inodes. 
@@ -228,6 +228,10 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
 	vhdr->folder_count = cpu_to_be32(sbi->folder_count);
 	vhdr->file_count = cpu_to_be32(sbi->file_count);

+	hfs_dbg("free_blocks %u, next_cnid %u, folder_count %u, file_count %u\n",
+		sbi->free_blocks, sbi->next_cnid,
+		sbi->folder_count, sbi->file_count);
+
 	if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
 		memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr));
 		write_backup = 1;
@@ -253,6 +257,8 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
 	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
 		blkdev_issue_flush(sb->s_bdev);

+	hfs_dbg("finished: err %d\n", error);
+
 	return error;
 }

@@ -301,7 +307,7 @@ static void hfsplus_put_super(struct super_block *sb)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);

-	hfs_dbg(SUPER, "hfsplus_put_super\n");
+	hfs_dbg("starting...\n");

 	cancel_delayed_work_sync(&sbi->sync_work);

@@ -323,6 +329,8 @@ static void hfsplus_put_super(struct super_block *sb)
 	kfree(sbi->s_vhdr_buf);
 	kfree(sbi->s_backup_vhdr_buf);
 	call_rcu(&sbi->rcu, delayed_free);
+
+	hfs_dbg("finished\n");
 }

 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index c951fa9835aa12..ece4d29c0ab9c2 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -64,7 +64,7 @@ static void hfsplus_init_header_node(struct inode *attr_file,
 	u32 used_bmp_bytes;
 	u64 tmp;

-	hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n",
+	hfs_dbg("clump %u, node_size %u\n",
 		clump_size, node_size);

 	/* The end of the node contains list of record offsets */
@@ -132,7 +132,7 @@ static int hfsplus_create_attributes_file(struct super_block *sb)
 	struct page *page;
 	int old_state = HFSPLUS_EMPTY_ATTR_TREE;

-	hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID);
+	hfs_dbg("ino %d\n", HFSPLUS_ATTR_CNID);

 check_attr_tree_state_again:
 	switch (atomic_read(&sbi->attr_tree_state)) {
diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h
new file mode 100644
index 00000000000000..8838ca2f3d0893
--- /dev/null
+++ b/include/linux/hfs_common.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * HFS/HFS+ common definitions, inline functions,
+ * and shared functionality.
+ */
+
+#ifndef _HFS_COMMON_H_
+#define _HFS_COMMON_H_
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define hfs_dbg(fmt, ...) \
+	pr_debug("pid %d:%s:%d %s(): " fmt, \
+		 current->pid, __FILE__, __LINE__, __func__, ##__VA_ARGS__) \
+
+#endif /* _HFS_COMMON_H_ */

From b7e32ae6664285e156e9f0cd821e63e19798baf7 Mon Sep 17 00:00:00 2001
From: Jacob Keller
Date: Tue, 23 Sep 2025 13:56:56 -0700
Subject: [PATCH 2800/3206] libie: fix string names for AQ error codes

The LIBIE_AQ_STR() macro introduced by commit 5feaa7a07b85 ("libie: add
adminq helper for converting err to str") is used to generate strings
for printing human-readable error codes. Its definition is missing the
separating underscore ('_') character, which makes the resulting
strings difficult to read. Additionally, the strings won't match the
source code, preventing search tools from working properly.

Add the missing underscore character, fixing the error string names.
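
A condensed, hand-written sketch of the macro behaviour (the enum
values are declared locally here purely for illustration; in the driver
they come from the libie headers):

	/* '##' pastes tokens to form the array index; '#' stringifies
	 * the argument. The old form produced "LIBIE_AQ_RCOK"; with the
	 * added underscore it produces "LIBIE_AQ_RC_OK", matching the
	 * enum name in the source.
	 */
	enum { LIBIE_AQ_RC_OK, LIBIE_AQ_RC_EPERM };

	#define LIBIE_AQ_STR(x) \
		[LIBIE_AQ_RC_##x] = "LIBIE_AQ_RC_" #x

	static const char * const libie_aq_str_arr[] = {
		LIBIE_AQ_STR(OK),    /* [LIBIE_AQ_RC_OK] = "LIBIE_AQ_RC_OK" */
		LIBIE_AQ_STR(EPERM), /* [LIBIE_AQ_RC_EPERM] = "LIBIE_AQ_RC_EPERM" */
	};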
Signed-off-by: Jacob Keller Fixes: 5feaa7a07b85 ("libie: add adminq helper for converting err to str") Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250923205657.846759-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/libie/adminq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/libie/adminq.c b/drivers/net/ethernet/intel/libie/adminq.c index 55356548e3f0a1..7b4ff479e7e57f 100644 --- a/drivers/net/ethernet/intel/libie/adminq.c +++ b/drivers/net/ethernet/intel/libie/adminq.c @@ -6,7 +6,7 @@ static const char * const libie_aq_str_arr[] = { #define LIBIE_AQ_STR(x) \ - [LIBIE_AQ_RC_##x] = "LIBIE_AQ_RC" #x + [LIBIE_AQ_RC_##x] = "LIBIE_AQ_RC_" #x LIBIE_AQ_STR(OK), LIBIE_AQ_STR(EPERM), LIBIE_AQ_STR(ENOENT), From f2b6b51d21ce7b84bc01d252b451684b86ad1cdc Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Aug 2025 12:36:02 +0200 Subject: [PATCH 2801/3206] i2c: s3c2410: Drop S3C2410 OF support Samsung S3C2410 SoC was removed from the Linux kernel in the commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support"), in January 2023. There are no in-kernel users of "samsung,s3c2410-i2c" compatible. However, there is still a user of "s3c2410-i2c" platform device ID, S3C64xx platform, so that part needs to stay. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250830103601.82046-3-krzysztof.kozlowski@linaro.org --- drivers/i2c/busses/i2c-s3c2410.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c index f4fa4703acbd0e..8138f5ef40f06b 100644 --- a/drivers/i2c/busses/i2c-s3c2410.c +++ b/drivers/i2c/busses/i2c-s3c2410.c @@ -138,7 +138,6 @@ static void i2c_s3c_irq_nextbyte(struct s3c24xx_i2c *i2c, unsigned long iicstat) #ifdef CONFIG_OF static const struct of_device_id s3c24xx_i2c_match[] = { - { .compatible = "samsung,s3c2410-i2c", .data = (void *)0 }, { .compatible = "samsung,s3c2440-i2c", .data = (void *)QUIRK_S3C2440 }, { .compatible = "samsung,s3c2440-hdmiphy-i2c", .data = (void *)(QUIRK_S3C2440 | QUIRK_HDMIPHY | QUIRK_NO_GPIO) }, From 25aab8f3ce56fd3cdf921c206ce42d1cdede5259 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Aug 2025 12:36:03 +0200 Subject: [PATCH 2802/3206] dt-bindings: i2c: samsung,s3c2410-i2c: Drop S3C2410 Samsung S3C2410 SoC was removed from the Linux kernel in the commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support"), in January 2023. There are no in-kernel users of "samsung,s3c2410-i2c" compatible. 
Signed-off-by: Krzysztof Kozlowski
Acked-by: Rob Herring (Arm)
Reviewed-by: Andi Shyti
Signed-off-by: Andi Shyti
Link: https://lore.kernel.org/r/20250830103601.82046-4-krzysztof.kozlowski@linaro.org
---
 Documentation/devicetree/bindings/i2c/samsung,s3c2410-i2c.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/i2c/samsung,s3c2410-i2c.yaml b/Documentation/devicetree/bindings/i2c/samsung,s3c2410-i2c.yaml
index 6ba7d793504c8c..a2ddc680361769 100644
--- a/Documentation/devicetree/bindings/i2c/samsung,s3c2410-i2c.yaml
+++ b/Documentation/devicetree/bindings/i2c/samsung,s3c2410-i2c.yaml
@@ -13,7 +13,6 @@ properties:
   compatible:
     oneOf:
       - enum:
-          - samsung,s3c2410-i2c
           - samsung,s3c2440-i2c
           # For s3c2440-like I2C used inside HDMIPHY block found on several SoCs:
           - samsung,s3c2440-hdmiphy-i2c
@@ -93,7 +92,6 @@ allOf:
         compatible:
           contains:
             enum:
-              - samsung,s3c2410-i2c
               - samsung,s3c2440-i2c
               - samsung,s3c2440-hdmiphy-i2c
     then:

From 217f92d91c9faeb6b78bd6205b3585944cbcb433 Mon Sep 17 00:00:00 2001
From: Igor Belwon
Date: Sat, 20 Sep 2025 19:22:52 +0200
Subject: [PATCH 2803/3206] dt-bindings: i2c: i2c-mt65xx: Document MediaTek
 MT6878 I2C

Document the I2C controllers found in the MediaTek MT6878 SoC by adding
a new compatible string for them. Their design is compatible with the
design from the MediaTek MT8188 SoC.

Signed-off-by: Igor Belwon
Reviewed-by: AngeloGioacchino Del Regno
Link: https://lore.kernel.org/r/20250920-mt6878-i2c-bringup-v2-1-70a951f10be9@mentallysanemainliners.org
Signed-off-by: Andi Shyti
---
 Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml b/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml
index 23fe8ff76645e4..bd6811cbde701c 100644
--- a/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml
+++ b/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml
@@ -50,6 +50,10 @@ properties:
           - enum:
               - mediatek,mt6795-i2c
           - const: mediatek,mt8173-i2c
+      - items:
+          - enum:
+              - mediatek,mt6878-i2c
+          - const: mediatek,mt8188-i2c
       - items:
           - enum:
               - mediatek,mt6893-i2c

From 1cf12c7177410afcb53f815315d1247ea57fae4f Mon Sep 17 00:00:00 2001
From: "Bo Liu (OpenAnolis)"
Date: Tue, 23 Sep 2025 15:01:12 +0800
Subject: [PATCH 2804/3206] erofs: Add support for FS_IOC_GETFSLABEL

Add support for reading the erofs volume label via the
FS_IOC_GETFSLABEL ioctl.
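
From user space, the new ioctl can be exercised with something like the
following untested sketch (FS_IOC_GETFSLABEL and FSLABEL_MAX are the
generic definitions from <linux/fs.h>):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		char label[FSLABEL_MAX] = "";	/* zero-filled */
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);	/* any file on the mount */
		if (fd < 0 || ioctl(fd, FS_IOC_GETFSLABEL, label) < 0) {
			perror("FS_IOC_GETFSLABEL");
			return 1;
		}
		printf("label: %s\n", label);
		close(fd);
		return 0;
	}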
Signed-off-by: Bo Liu (OpenAnolis) Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Reviewed-by: Hongbo Li Signed-off-by: Gao Xiang --- fs/erofs/data.c | 4 ++++ fs/erofs/dir.c | 4 ++++ fs/erofs/inode.c | 40 ++++++++++++++++++++++++++++++++++++---- fs/erofs/internal.h | 5 +++++ fs/erofs/super.c | 8 ++++++++ 5 files changed, 57 insertions(+), 4 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 3b1ba571c7286b..8ca29962a3ddef 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -475,6 +475,10 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) const struct file_operations erofs_file_fops = { .llseek = erofs_file_llseek, .read_iter = erofs_file_read_iter, + .unlocked_ioctl = erofs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = erofs_compat_ioctl, +#endif .mmap_prepare = erofs_file_mmap_prepare, .get_unmapped_area = thp_get_unmapped_area, .splice_read = filemap_splice_read, diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index debf469ad6bd56..32b4f5aa60c986 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -123,4 +123,8 @@ const struct file_operations erofs_dir_fops = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = erofs_readdir, + .unlocked_ioctl = erofs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = erofs_compat_ioctl, +#endif }; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 9a2f5972152257..cb780c095d282a 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -5,6 +5,7 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" +#include #include static int erofs_fill_symlink(struct inode *inode, void *kaddr, @@ -213,10 +214,7 @@ static int erofs_fill_inode(struct inode *inode) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; - if (erofs_inode_is_data_compressed(vi->datalayout)) - inode->i_fop = &generic_ro_fops; - else - inode->i_fop = &erofs_file_fops; + inode->i_fop = &erofs_file_fops; break; case S_IFDIR: inode->i_op = &erofs_dir_iops; @@ -341,6 +339,40 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, return 0; } +static int erofs_ioctl_get_volume_label(struct inode *inode, void __user *arg) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + int ret; + + if (!sbi->volume_name) + ret = clear_user(arg, 1); + else + ret = copy_to_user(arg, sbi->volume_name, + strlen(sbi->volume_name)); + return ret ? 
-EFAULT : 0; +} + +long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFSLABEL: + return erofs_ioctl_get_volume_label(inode, argp); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long erofs_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + return erofs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 9319c66e86c3c3..f7f622836198da 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -153,6 +153,7 @@ struct erofs_sb_info { /* used for statfs, f_files - f_favail */ u64 inos; + char *volume_name; u32 feature_compat; u32 feature_incompat; @@ -536,6 +537,10 @@ static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { static inline void erofs_fscache_submit_bio(struct bio *bio) {} #endif +long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long erofs_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index db13b40a78e07d..f3f8d8c066e4e6 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -343,6 +343,13 @@ static int erofs_read_superblock(struct super_block *sb) sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); + if (dsb->volume_name[0]) { + sbi->volume_name = kstrndup(dsb->volume_name, + sizeof(dsb->volume_name), GFP_KERNEL); + if (!sbi->volume_name) + return -ENOMEM; + } + /* parse on-disk compression configurations */ ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) @@ -822,6 +829,7 @@ static void erofs_sb_free(struct erofs_sb_info *sbi) kfree(sbi->domain_id); if (sbi->dif0.file) fput(sbi->dif0.file); + kfree(sbi->volume_name); kfree(sbi); } From e2d3af0d64e5fe2ee269e8f082642f82bcca3903 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 24 Sep 2025 11:28:26 +0800 Subject: [PATCH 2805/3206] erofs: drop redundant sanity check for ztailpacking inline It is already performed in z_erofs_map_blocks_fo(). Also align the error message with that used for the uncompressed inline layout. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 3 --- fs/erofs/zmap.c | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 625b8ae8f67f09..bc80cfe482f73b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -823,9 +823,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) } rcu_read_unlock(); } - } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; } if (pcl) { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 798223e6da9ce1..e5581dbeb4c2bc 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -462,8 +462,8 @@ static int z_erofs_map_blocks_fo(struct inode *inode, map->m_pa = vi->z_fragmentoff; map->m_plen = vi->z_idata_size; if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map->m_plen); + erofs_err(sb, "ztailpacking inline data across blocks @ nid %llu", + vi->nid); err = -EFSCORRUPTED; goto unmap_out; } From 9158c6bb245113d4966df9b2ba602197a379412e Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 23 Sep 2025 15:51:04 +0800 Subject: [PATCH 2806/3206] afs: Fix potential null pointer dereference in afs_put_server afs_put_server() accessed server->debug_id before the NULL check, which could lead to a null pointer dereference. Move the debug_id assignment, ensuring we never dereference a NULL server pointer. Fixes: 2757a4dc1849 ("afs: Fix access after dec in put functions") Cc: stable@vger.kernel.org Signed-off-by: Zhen Ni Acked-by: David Howells Reviewed-by: Jeffrey Altman Signed-off-by: Christian Brauner --- fs/afs/server.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/afs/server.c b/fs/afs/server.c index a97562f831eb5a..c4428ebddb1da6 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -331,13 +331,14 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate, void afs_put_server(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - unsigned int a, debug_id = server->debug_id; + unsigned int a, debug_id; bool zero; int r; if (!server) return; + debug_id = server->debug_id; a = atomic_read(&server->active); zero = __refcount_dec_and_test(&server->ref, &r); trace_afs_server(debug_id, r - 1, a, reason); From a19239ba14525c26ad097d59fd52cd9198b5bcdb Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 24 Sep 2025 13:49:49 +0100 Subject: [PATCH 2807/3206] afs: Add support for RENAME_NOREPLACE and RENAME_EXCHANGE Add support for RENAME_NOREPLACE and RENAME_EXCHANGE, if the server supports them. The default is translated to YFS.Rename_Replace, falling back to YFS.Rename; RENAME_NOREPLACE is translated to YFS.Rename_NoReplace and RENAME_EXCHANGE to YFS.Rename_Exchange, both of which fall back to reporting EINVAL. 
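
At the system-call level these are the flags accepted by renameat2(2);
a hypothetical caller on an AFS mount might exercise both like this
(assuming a glibc that exposes renameat2() and the RENAME_* flags via
<stdio.h> under _GNU_SOURCE):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <fcntl.h>

	int main(void)
	{
		/* Fails with EEXIST rather than displacing "new" */
		if (renameat2(AT_FDCWD, "old", AT_FDCWD, "new",
			      RENAME_NOREPLACE))
			perror("RENAME_NOREPLACE");

		/* Atomically swaps "old" and "new"; both must exist */
		if (renameat2(AT_FDCWD, "old", AT_FDCWD, "new",
			      RENAME_EXCHANGE))
			perror("RENAME_EXCHANGE");
		return 0;
	}

Against a server that lacks the new YFS RPCs, both calls would now fail
with EINVAL, per the fallback described above.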
Signed-off-by: David Howells Link: https://lore.kernel.org/740476.1758718189@warthog.procyon.org.uk cc: Marc Dionne cc: Dan Carpenter cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/dir.c | 223 ++++++++++++++++++++++++++------- fs/afs/dir_edit.c | 18 +-- fs/afs/dir_silly.c | 11 ++ fs/afs/internal.h | 15 ++- fs/afs/misc.c | 1 + fs/afs/protocol_yfs.h | 3 + fs/afs/rotate.c | 17 ++- fs/afs/yfsclient.c | 249 +++++++++++++++++++++++++++++++++++++ fs/dcache.c | 1 + include/trace/events/afs.h | 6 + 10 files changed, 480 insertions(+), 64 deletions(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index bfb69e0666728b..89d36e3e5c7999 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1823,7 +1823,8 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, static void afs_rename_success(struct afs_operation *op) { - struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + struct afs_vnode *vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; _enter("op=%08x", op->debug_id); @@ -1834,22 +1835,40 @@ static void afs_rename_success(struct afs_operation *op) op->ctime = op->file[1].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[1]); } + if (op->more_files[0].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[0]); + if (op->more_files[1].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[1]); /* If we're moving a subdir between dirs, we need to update * its DV counter too as the ".." will be altered. */ - if (S_ISDIR(vnode->netfs.inode.i_mode) && - op->file[0].vnode != op->file[1].vnode) { - u64 new_dv; + if (op->file[0].vnode != op->file[1].vnode) { + if (S_ISDIR(vnode->netfs.inode.i_mode)) { + u64 new_dv; - write_seqlock(&vnode->cb_lock); + write_seqlock(&vnode->cb_lock); - new_dv = vnode->status.data_version + 1; - trace_afs_set_dv(vnode, new_dv); - vnode->status.data_version = new_dv; - inode_set_iversion_raw(&vnode->netfs.inode, new_dv); + new_dv = vnode->status.data_version + 1; + trace_afs_set_dv(vnode, new_dv); + vnode->status.data_version = new_dv; + inode_set_iversion_raw(&vnode->netfs.inode, new_dv); - write_sequnlock(&vnode->cb_lock); + write_sequnlock(&vnode->cb_lock); + } + + if ((op->rename.rename_flags & RENAME_EXCHANGE) && + S_ISDIR(new_vnode->netfs.inode.i_mode)) { + u64 new_dv; + + write_seqlock(&new_vnode->cb_lock); + + new_dv = new_vnode->status.data_version + 1; + new_vnode->status.data_version = new_dv; + inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv); + + write_sequnlock(&new_vnode->cb_lock); + } } } @@ -1900,8 +1919,8 @@ static void afs_rename_edit_dir(struct afs_operation *op) if (S_ISDIR(vnode->netfs.inode.i_mode) && new_dvnode != orig_dvnode && test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - afs_edit_dir_update_dotdot(vnode, new_dvnode, - afs_edit_dir_for_rename_sub); + afs_edit_dir_update(vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); new_inode = d_inode(new_dentry); if (new_inode) { @@ -1915,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op) /* Now we can update d_fsdata on the dentries to reflect their * new parent's data_version. - * - * Note that if we ever implement RENAME_EXCHANGE, we'll have - * to update both dentries with opposing dir versions. 
*/ afs_update_dentry_version(op, new_dvp, op->dentry); afs_update_dentry_version(op, new_dvp, op->dentry_2); @@ -1930,6 +1946,67 @@ static void afs_rename_edit_dir(struct afs_operation *op) fscache_end_operation(&new_cres); } +static void afs_rename_exchange_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode *orig_dvnode = orig_dvp->vnode; + struct afs_vnode *new_dvnode = new_dvp->vnode; + struct afs_vnode *old_vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; + struct dentry *old_dentry = op->dentry; + struct dentry *new_dentry = op->dentry_2; + + _enter("op=%08x", op->debug_id); + + if (new_dvnode == orig_dvnode) { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) { + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + afs_edit_dir_update(orig_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + } + + d_exchange(old_dentry, new_dentry); + up_write(&orig_dvnode->validate_lock); + } else { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + + up_write(&orig_dvnode->validate_lock); + down_write(&new_dvnode->validate_lock); + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) && + new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta) + afs_edit_dir_update(new_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + + if (S_ISDIR(old_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags)) + afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); + + if (S_ISDIR(new_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags)) + afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode, + afs_edit_dir_for_rename_sub); + + /* Now we can update d_fsdata on the dentries to reflect their + * new parents' data_version. + */ + afs_update_dentry_version(op, new_dvp, old_dentry); + afs_update_dentry_version(op, orig_dvp, new_dentry); + + d_exchange(old_dentry, new_dentry); + up_write(&new_dvnode->validate_lock); + } +} + static void afs_rename_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); @@ -1948,6 +2025,32 @@ static const struct afs_operation_ops afs_rename_operation = { .put = afs_rename_put, }; +#if 0 /* Autoswitched in yfs_fs_rename_replace(). 
*/ +static const struct afs_operation_ops afs_rename_replace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_replace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; +#endif + +static const struct afs_operation_ops afs_rename_noreplace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_noreplace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; + +static const struct afs_operation_ops afs_rename_exchange_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_exchange, + .success = afs_rename_success, + .edit_dir = afs_rename_exchange_edit_dir, + .put = afs_rename_put, +}; + /* * rename a file in an AFS filesystem and/or move it between directories */ @@ -1956,10 +2059,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *new_dentry, unsigned int flags) { struct afs_operation *op; - struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; + struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL; int ret; - if (flags) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; /* Don't allow silly-rename files be moved around. */ @@ -1969,6 +2072,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); + if (d_is_positive(new_dentry)) + new_vnode = AFS_FS_I(d_inode(new_dentry)); _enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}", orig_dvnode->fid.vid, orig_dvnode->fid.vnode, @@ -1989,6 +2094,11 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (ret < 0) goto error; + ret = -ENOMEM; + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) + goto error; + afs_op_set_vnode(op, 0, orig_dvnode); afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; @@ -1997,46 +2107,63 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = vnode; + op->more_files[0].speculative = true; + op->more_files[1].vnode = new_vnode; + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old_dentry; op->dentry_2 = new_dentry; + op->rename.rename_flags = flags; op->rename.new_negative = d_is_negative(new_dentry); - op->ops = &afs_rename_operation; - /* For non-directories, check whether the target is busy and if so, - * make a copy of the dentry and then do a silly-rename. If the - * silly-rename succeeds, the copied dentry is hashed and becomes the - * new target. - */ - if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { - /* To prevent any new references to the target during the - * rename, we unhash the dentry in advance. + if (flags & RENAME_NOREPLACE) { + op->ops = &afs_rename_noreplace_operation; + } else if (flags & RENAME_EXCHANGE) { + op->ops = &afs_rename_exchange_operation; + d_drop(new_dentry); + } else { + /* If we might displace the target, we might need to do silly + * rename. 
*/ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - op->rename.rehash = new_dentry; - } + op->ops = &afs_rename_operation; - if (d_count(new_dentry) > 2) { - /* copy the target dentry's name */ - op->rename.tmp = d_alloc(new_dentry->d_parent, - &new_dentry->d_name); - if (!op->rename.tmp) { - afs_op_nomem(op); - goto error; + /* For non-directories, check whether the target is busy and if + * so, make a copy of the dentry and then do a silly-rename. + * If the silly-rename succeeds, the copied dentry is hashed + * and becomes the new target. + */ + if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { + /* To prevent any new references to the target during + * the rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + op->rename.rehash = new_dentry; } - ret = afs_sillyrename(new_dvnode, - AFS_FS_I(d_inode(new_dentry)), - new_dentry, op->key); - if (ret) { - afs_op_set_error(op, ret); - goto error; + if (d_count(new_dentry) > 2) { + /* copy the target dentry's name */ + op->rename.tmp = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!op->rename.tmp) { + afs_op_nomem(op); + goto error; + } + + ret = afs_sillyrename(new_dvnode, + AFS_FS_I(d_inode(new_dentry)), + new_dentry, op->key); + if (ret) { + afs_op_set_error(op, ret); + goto error; + } + + op->dentry_2 = op->rename.tmp; + op->rename.rehash = NULL; + op->rename.new_negative = true; } - - op->dentry_2 = op->rename.tmp; - op->rename.rehash = NULL; - op->rename.new_negative = true; } } @@ -2052,6 +2179,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, d_drop(old_dentry); ret = afs_do_sync_operation(op); + if (ret == -ENOTSUPP) + ret = -EINVAL; out: afs_dir_unuse_cookie(orig_dvnode, ret); if (new_dvnode != orig_dvnode) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 60a549f1d9c5fd..4b1342c72089ac 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -522,11 +522,11 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, } /* - * Edit a subdirectory that has been moved between directories to update the - * ".." entry. + * Edit an entry in a directory to update the vnode it refers to. This is also + * used to update the ".." entry in a directory. */ -void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, - enum afs_edit_dir_reason why) +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *block; union afs_xdr_dirent *de; @@ -557,7 +557,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) goto already_invalidated; - slot = afs_dir_scan_block(block, &dotdot_name, b); + slot = afs_dir_scan_block(block, name, b); if (slot >= 0) goto found_dirent; @@ -566,7 +566,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d /* Didn't find the dirent to clobber. Download the directory again. 
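 * The local copy is invalidated below (afs_invalidate_dir()) so that the
 * next directory read refetches the contents from the server rather than
 * trusting a stale edit.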
*/ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd); goto out; @@ -576,7 +576,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d de->u.unique = htonl(new_dvnode->fid.unique); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot, - ntohl(de->u.vnode), ntohl(de->u.unique), ".."); + ntohl(de->u.vnode), ntohl(de->u.unique), name->name); kunmap_local(block); netfs_single_mark_inode_dirty(&vnode->netfs.inode); @@ -589,12 +589,12 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d already_invalidated: kunmap_local(block); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); goto out; error: trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); goto out; } diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 0b80eb93fa40b8..014495d4b8684d 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode if (IS_ERR(op)) return PTR_ERR(op); + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) { + afs_put_operation(op); + return -ENOMEM; + } + afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, dvnode); op->file[0].dv_delta = 1; @@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = AFS_FS_I(d_inode(old)); + op->more_files[0].speculative = true; + op->more_files[1].vnode = AFS_FS_I(d_inode(new)); + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old; op->dentry_2 = new; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1124ea4000cb1b..444a3ea4fdf65f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -562,6 +562,7 @@ struct afs_server { #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ +#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */ refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ @@ -891,9 +892,10 @@ struct afs_operation { bool need_rehash; } unlink; struct { - struct dentry *rehash; - struct dentry *tmp; - bool new_negative; + struct dentry *rehash; + struct dentry *tmp; + unsigned int rename_flags; + bool new_negative; } rename; struct { struct netfs_io_subrequest *subreq; @@ -1100,8 +1102,8 @@ int afs_single_writepages(struct address_space *mapping, extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); -void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, - enum afs_edit_dir_reason why); +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); /* @@ -1693,6 +1695,9 @@ extern void yfs_fs_remove_dir(struct 
afs_operation *); extern void yfs_fs_link(struct afs_operation *); extern void yfs_fs_symlink(struct afs_operation *); extern void yfs_fs_rename(struct afs_operation *); +void yfs_fs_rename_replace(struct afs_operation *op); +void yfs_fs_rename_noreplace(struct afs_operation *op); +void yfs_fs_rename_exchange(struct afs_operation *op); extern void yfs_fs_store_data(struct afs_operation *); extern void yfs_fs_setattr(struct afs_operation *); extern void yfs_fs_get_volume_status(struct afs_operation *); diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 8f2b3a17769082..c8a7f266080d92 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -131,6 +131,7 @@ int afs_abort_to_error(u32 abort_code) case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; case RXGEN_OPCODE: return -ENOTSUPP; + case RX_INVALID_OPERATION: return -ENOTSUPP; default: return -EREMOTEIO; } diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index e4cd89c44c4654..b2f06c1917c2e2 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -50,6 +50,9 @@ enum YFS_FS_Operations { YFSREMOVEACL = 64171, YFSREMOVEFILE2 = 64173, YFSSTOREOPAQUEACL2 = 64174, + YFSRENAME_REPLACE = 64176, + YFSRENAME_NOREPLACE = 64177, + YFSRENAME_EXCHANGE = 64187, YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */ YFSFETCHDATA64 = 64537, /* YFS Fetch file data */ YFSSTOREDATA64 = 64538, /* YFS Store file data */ diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index a1c24f589d9e13..6a4e7da10fc495 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -432,6 +432,16 @@ bool afs_select_fileserver(struct afs_operation *op) afs_op_set_error(op, -EDQUOT); goto failed_but_online; + case RX_INVALID_OPERATION: + case RXGEN_OPCODE: + /* Handle downgrading to an older operation. */ + afs_op_set_error(op, -ENOTSUPP); + if (op->flags & AFS_OPERATION_DOWNGRADE) { + op->flags &= ~AFS_OPERATION_DOWNGRADE; + goto go_again; + } + goto failed_but_online; + default: afs_op_accumulate_error(op, error, abort_code); failed_but_online: @@ -620,12 +630,13 @@ bool afs_select_fileserver(struct afs_operation *op) op->addr_index = addr_index; set_bit(addr_index, &op->addr_tried); - op->volsync.creation = TIME64_MIN; - op->volsync.update = TIME64_MIN; - op->call_responded = false; _debug("address [%u] %u/%u %pISp", op->server_index, addr_index, alist->nr_addrs, rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer)); +go_again: + op->volsync.creation = TIME64_MIN; + op->volsync.update = TIME64_MIN; + op->call_responded = false; _leave(" = t"); return true; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 257af259c04a6b..febf13a49f0bf6 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -1042,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op) _enter(""); + if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags)) + return yfs_fs_rename_replace(op); + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + @@ -1070,6 +1073,252 @@ void yfs_fs_rename(struct afs_operation *op) afs_make_op_call(op, call, GFP_NOFS); } +/* + * Deliver reply data to a YFS.Rename_NoReplace operation. This does not + * return the status of a displaced target inode as there cannot be one. 
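+ * The reply therefore carries one fewer fid/status pair than the
+ * Rename_Replace and Rename_Exchange replies decoded by
+ * yfs_deliver_fs_rename_2() below.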
+ */ +static int yfs_deliver_fs_rename_1(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +/* + * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange + * operation. These return the status of the displaced target inode if there + * was one. + */ +static int yfs_deliver_fs_rename_2(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + struct afs_vnode_param *new_vp = &op->more_files[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSFid(&bp, &new_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +static void yfs_done_fs_rename_replace(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags); + call->op->flags |= AFS_OPERATION_DOWNGRADE; + } +} + +/* + * YFS.Rename_Replace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Replace = { + .name = "FS.Rename_Replace", + .op = yfs_FS_Rename_Replace, + .deliver = yfs_deliver_fs_rename_2, + .done = yfs_done_fs_rename_replace, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_NoReplace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_NoReplace = { + .name = "FS.Rename_NoReplace", + .op = yfs_FS_Rename_NoReplace, + .deliver = yfs_deliver_fs_rename_1, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_Exchange operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Exchange = { + .name = "FS.Rename_Exchange", + .op = yfs_FS_Rename_Exchange, + .deliver = yfs_deliver_fs_rename_2, + .destructor = afs_flat_call_destructor, +}; + +/* + * Rename a file or directory, replacing the target if it exists. The status + * of a displaced target is returned. 
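+ * Should the server abort with RX_INVALID_OPERATION or RXGEN_OPCODE,
+ * yfs_done_fs_rename_replace() marks it with AFS_SERVER_FL_NO_RENAME2 and
+ * sets AFS_OPERATION_DOWNGRADE so that the rotation code retries the
+ * operation with the old YFS.Rename RPC.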
+ */
+void yfs_fs_rename_replace(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace,
+        sizeof(__be32) +
+        sizeof(struct yfs_xdr_RPCFlags) +
+        sizeof(struct yfs_xdr_YFSFid) +
+        xdr_strlen(orig_name->len) +
+        sizeof(struct yfs_xdr_YFSFid) +
+        xdr_strlen(new_name->len),
+        sizeof(struct yfs_xdr_YFSFetchStatus) +
+        sizeof(struct yfs_xdr_YFSFid) +
+        sizeof(struct yfs_xdr_YFSFetchStatus) +
+        sizeof(struct yfs_xdr_YFSFetchStatus) +
+        sizeof(struct yfs_xdr_YFSFid) +
+        sizeof(struct yfs_xdr_YFSFetchStatus) +
+        sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+  return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_REPLACE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Rename a file or directory, failing if the target dirent exists.
+ */
+void yfs_fs_rename_noreplace(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace,
+        sizeof(__be32) +
+        sizeof(struct yfs_xdr_RPCFlags) +
+        sizeof(struct yfs_xdr_YFSFid) +
+        xdr_strlen(orig_name->len) +
+        sizeof(struct yfs_xdr_YFSFid) +
+        xdr_strlen(new_name->len),
+        sizeof(struct yfs_xdr_YFSFetchStatus) +
+        sizeof(struct yfs_xdr_YFSFid) +
+        sizeof(struct yfs_xdr_YFSFetchStatus) +
+        sizeof(struct yfs_xdr_YFSFetchStatus) +
+        sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+  return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Exchange a pair of files or directories.
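+ * Both parent directories and both exchanged vnodes have status records in
+ * the reply, so delivery is shared with Rename_Replace via
+ * yfs_deliver_fs_rename_2().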
+ */ +void yfs_fs_rename_exchange(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + /* * YFS.StoreData64 operation type. */ diff --git a/fs/dcache.c b/fs/dcache.c index 60046ae23d5148..9e2bd890b72890 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2922,6 +2922,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) write_sequnlock(&rename_lock); } +EXPORT_SYMBOL(d_exchange); /** * d_ancestor - search for an ancestor diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 7f83d242c8e9f8..1b3c48b5591dfd 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -69,6 +69,9 @@ enum afs_fs_operation { yfs_FS_RemoveACL = 64171, yfs_FS_RemoveFile2 = 64173, yfs_FS_StoreOpaqueACL2 = 64174, + yfs_FS_Rename_Replace = 64176, + yfs_FS_Rename_NoReplace = 64177, + yfs_FS_Rename_Exchange = 64187, yfs_FS_InlineBulkStatus = 64536, /* YFS Fetch multiple file statuses with errors */ yfs_FS_FetchData64 = 64537, /* YFS Fetch file data */ yfs_FS_StoreData64 = 64538, /* YFS Store file data */ @@ -300,6 +303,9 @@ enum yfs_cm_operation { EM(yfs_FS_RemoveACL, "YFS.RemoveACL") \ EM(yfs_FS_RemoveFile2, "YFS.RemoveFile2") \ EM(yfs_FS_StoreOpaqueACL2, "YFS.StoreOpaqueACL2") \ + EM(yfs_FS_Rename_Replace, "YFS.Rename_Replace") \ + EM(yfs_FS_Rename_NoReplace, "YFS.Rename_NoReplace") \ + EM(yfs_FS_Rename_Exchange, "YFS.Rename_Exchange") \ EM(yfs_FS_InlineBulkStatus, "YFS.InlineBulkStatus") \ EM(yfs_FS_FetchData64, "YFS.FetchData64") \ EM(yfs_FS_StoreData64, "YFS.StoreData64") \ From 10cdfcd37ade7ce736bc4a1927680f390a6b1f7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:33:58 +0200 Subject: [PATCH 2808/3206] nstree: make struct ns_tree private Don't expose it directly. There's no need to do that. 
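As an illustrative sketch only (the caller below is hypothetical; the
lookup and refcount helpers are the ones used elsewhere in this
series), out-of-file users are expected to resolve namespaces through
the helpers rather than walking the rbtree themselves:

	/* Hypothetical caller resolving a mount namespace by id. */
	struct ns_common *ns;

	guard(rcu)();
	ns = ns_tree_lookup_rcu(ns_id, CLONE_NEWNS);
	if (ns && !__ns_ref_get(ns))	/* may race with teardown */
		ns = NULL;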
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/nstree.h | 13 ------------- kernel/nstree.c | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 29ad6402260cc2..8b863669047390 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -9,19 +9,6 @@ #include #include -/** - * struct ns_tree - Namespace tree - * @ns_tree: Rbtree of namespaces of a particular type - * @ns_list: Sequentially walkable list of all namespaces of this type - * @ns_tree_lock: Seqlock to protect the tree and list - */ -struct ns_tree { - struct rb_root ns_tree; - struct list_head ns_list; - seqlock_t ns_tree_lock; - int type; -}; - extern struct ns_tree cgroup_ns_tree; extern struct ns_tree ipc_ns_tree; extern struct ns_tree mnt_ns_tree; diff --git a/kernel/nstree.c b/kernel/nstree.c index bbe8bedc924ca9..ecc88b013eff2f 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -4,6 +4,20 @@ #include #include +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + * @type: type of namespaces in this tree + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + struct ns_tree mnt_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), From 4055526d35746ce8b04bfa5e14e14f28bb163186 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:33:59 +0200 Subject: [PATCH 2809/3206] ns: move ns type into struct ns_common It's misplaced in struct proc_ns_operations and ns->ops might be NULL if the namespace is compiled out but we still want to know the type of the namespace for the initial namespace struct. 
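A minimal sketch of what this enables (the helper below is hypothetical
and not part of the patch): type checks keep working even when the
namespace type is compiled out and ns->ops is NULL:

	static inline bool ns_is_user_ns(const struct ns_common *ns)
	{
		/* Safe even if ns->ops == NULL: */
		return ns->ns_type == CLONE_NEWUSER;
	}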
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 6 +++--- fs/nsfs.c | 18 +++++++++--------- include/linux/ns_common.h | 30 +++++++++++++++++++++++++----- include/linux/proc_ns.h | 1 - init/version-timestamp.c | 1 + ipc/msgutil.c | 1 + ipc/namespace.c | 1 - kernel/cgroup/cgroup.c | 1 + kernel/cgroup/namespace.c | 1 - kernel/nscommon.c | 5 +++-- kernel/nsproxy.c | 4 ++-- kernel/nstree.c | 8 ++++---- kernel/pid.c | 1 + kernel/pid_namespace.c | 2 -- kernel/time/namespace.c | 3 +-- kernel/user.c | 1 + kernel/user_namespace.c | 1 - kernel/utsname.c | 1 - net/core/net_namespace.c | 1 - 19 files changed, 52 insertions(+), 35 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index d65917ec5544fe..01334d5038a271 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4927,7 +4927,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* @@ -5830,7 +5830,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); @@ -6016,6 +6016,7 @@ struct mnt_namespace init_mnt_ns = { .ns.ops = &mntns_operations, .user_ns = &init_user_ns, .ns.__ns_ref = REFCOUNT_INIT(1), + .ns.ns_type = ns_common_type(&init_mnt_ns), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), @@ -6333,7 +6334,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns) const struct proc_ns_operations mntns_operations = { .name = "mnt", - .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, diff --git a/fs/nsfs.c b/fs/nsfs.c index dc0a4404b9718e..e7fd8a790aaa4b 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -219,9 +219,9 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); case NS_GET_NSTYPE: - return ns->ops->type; + return ns->ns_type; case NS_GET_OWNER_UID: - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; user_ns = container_of(ns, struct user_namespace, ns); argp = (uid_t __user *) arg; @@ -234,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, case NS_GET_PID_IN_PIDNS: fallthrough; case NS_GET_TGID_IN_PIDNS: { - if (ns->ops->type != CLONE_NEWPID) + if (ns->ns_type != CLONE_NEWPID) return -EINVAL; ret = -ESRCH; @@ -273,7 +273,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return ret; } case NS_GET_MNTNS_ID: - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; fallthrough; case NS_GET_ID: { @@ -293,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (!uinfo) @@ -314,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct file *f __free(fput) = NULL; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (usize < MNT_NS_INFO_SIZE_VER0) @@ -453,7 +453,7 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, } fid->ns_id = ns->ns_id; - fid->ns_type 
= ns->ops->type; + fid->ns_type = ns->ns_type; fid->ns_inum = inode->i_ino; return FILEID_NSFS; } @@ -489,14 +489,14 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return NULL; VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); - VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); if (!__ns_ref_get(ns)) return NULL; } - switch (ns->ops->type) { + switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: if (!current_in_namespace(to_cg_ns(ns))) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 56492cd9ff8d43..f5b68b8abb5433 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -4,6 +4,7 @@ #include #include +#include struct proc_ns_operations; @@ -37,6 +38,7 @@ extern const struct proc_ns_operations timens_operations; extern const struct proc_ns_operations timens_for_children_operations; struct ns_common { + u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; @@ -51,7 +53,7 @@ struct ns_common { }; }; -int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); #define to_ns_common(__ns) \ @@ -106,10 +108,28 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) -#define ns_common_init(__ns) \ - __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) - -#define ns_common_init_inum(__ns, __inum) __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), __inum) +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) + +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? 
ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 08016f6e0e6fb1..e81b8e596e4f19 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -17,7 +17,6 @@ struct inode; struct proc_ns_operations { const char *name; const char *real_ns_name; - int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); int (*install)(struct nsset *nsset, struct ns_common *ns); diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 376b7c856d4d03..d071835121c2c4 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,6 +8,7 @@ #include struct uts_namespace init_uts_ns = { + .ns.ns_type = ns_common_type(&init_uts_ns), .ns.__ns_ref = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, diff --git a/ipc/msgutil.c b/ipc/msgutil.c index dca6c8ec8f5ff9..7a03f6d03de3ad 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -33,6 +33,7 @@ struct ipc_namespace init_ipc_ns = { #ifdef CONFIG_IPC_NS .ns.ops = &ipcns_operations, #endif + .ns.ns_type = ns_common_type(&init_ipc_ns), }; struct msg_msgseg { diff --git a/ipc/namespace.c b/ipc/namespace.c index d89dfd718d2b12..76abac74a5c3bb 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -248,7 +248,6 @@ static struct user_namespace *ipcns_owner(struct ns_common *ns) const struct proc_ns_operations ipcns_operations = { .name = "ipc", - .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 245b43ff2fa481..9b75102e81cb9c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -224,6 +224,7 @@ struct cgroup_namespace init_cgroup_ns = { .ns.ops = &cgroupns_operations, .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, + .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 04c98338ac08d1..241ca05f07c81c 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -137,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns) const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", - .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 3cef89ddef41a3..92c9df1e877413 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -7,7 +7,7 @@ #ifdef CONFIG_DEBUG_VFS static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) { - switch (ns->ops->type) { + switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: VFS_WARN_ON_ONCE(ops != &cgroupns_operations); @@ -52,12 +52,13 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) } #endif -int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; + ns->ns_type = ns_type; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38f5..8d62449237b6fb 100644 --- 
a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(fd_file(f))) { ns = get_proc_ns(file_inode(fd_file(f))); - if (flags && (ns->ops->type != flags)) + if (flags && (ns->ns_type != flags)) err = -EINVAL; - flags = ns->ops->type; + flags = ns->ns_type; } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { err = check_setns_flags(flags); } else { diff --git a/kernel/nstree.c b/kernel/nstree.c index ecc88b013eff2f..b24a320a11a683 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -106,7 +106,7 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_seqlock(&ns_tree->ns_tree_lock); - VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); /* @@ -128,7 +128,7 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) { VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); - VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); write_seqlock(&ns_tree->ns_tree_lock); rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); @@ -197,7 +197,7 @@ struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) if (!node) return NULL; - VFS_WARN_ON_ONCE(node_to_ns(node)->ops->type != ns_type); + VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); return node_to_ns(node); } @@ -225,7 +225,7 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, if (list_is_head(list, &ns_tree->ns_list)) return ERR_PTR(-ENOENT); - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); return list_entry_rcu(list, struct ns_common, ns_list_node); } diff --git a/kernel/pid.c b/kernel/pid.c index 7e8c66e0bf676f..0c2dcddb317a30 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = { #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif + .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a262a3f1944321..f5b222c8ac39ad 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -443,7 +443,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns) const struct proc_ns_operations pidns_operations = { .name = "pid", - .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, @@ -454,7 +453,6 @@ const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", - .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 9f26e61be044f3..530cf99c221286 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -462,7 +462,6 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -472,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", 
.real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = timens_install, @@ -480,6 +478,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { + .ns.ns_type = ns_common_type(&init_time_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = ns_init_inum(&init_time_ns), diff --git a/kernel/user.c b/kernel/user.c index b2a53674d506f8..0163665914c97c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,6 +65,7 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, + .ns.ns_type = ns_common_type(&init_user_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e1559e8a8a02c5..03cb63883d041a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1400,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns) const struct proc_ns_operations userns_operations = { .name = "user", - .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, diff --git a/kernel/utsname.c b/kernel/utsname.c index 00001592ad134b..a8cdc84648ee79 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -146,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns) const struct proc_ns_operations utsns_operations = { .name = "uts", - .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index bdea7d5fac5608..dfe84bd35f9811 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1543,7 +1543,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns) const struct proc_ns_operations netns_operations = { .name = "net", - .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, From af075603f27b0f6e05f1bdf64bad42fa7cfb033b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:34:00 +0200 Subject: [PATCH 2810/3206] ns: drop assert Otherwise we warn when e.g., no namespaces are configured but the initial namespace for is still around. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/nscommon.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 92c9df1e877413..c1fb2bad6d7295 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -46,8 +46,6 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) VFS_WARN_ON_ONCE(ops != &utsns_operations); break; #endif - default: - VFS_WARN_ON_ONCE(true); } } #endif From 4ae8d9aa9f9dc7137ea5e564d79c5aa5af1bc45c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Sep 2025 23:02:41 +0200 Subject: [PATCH 2811/3206] sched/deadline: Fix dl_server getting stuck John found it was easy to hit lockup warnings when running locktorture on a 2 CPU VM, which he bisected down to: commit cccb45d7c429 ("sched/deadline: Less agressive dl_server handling"). While debugging it seems there is a chance where we end up with the dl_server dequeued, with dl_se->dl_server_active. This causes dl_server_start() to return without enqueueing the dl_server, thus it fails to run when RT tasks starve the cpu. When this happens, dl_server_timer() catches the '!dl_se->server_has_tasks(dl_se)' case, which then calls replenish_dl_entity() and dl_server_stopped() and finally return HRTIMER_NO_RESTART. 
This ends in no new timer and also no enqueue, leaving the dl_server 'dead', allowing starvation. What should have happened is for the bandwidth timer to start the zero-laxity timer, which in turn would enqueue the dl_server and cause dl_se->server_pick_task() to be called -- which will stop the dl_server if no fair tasks are observed for a whole period. IOW, it is totally irrelevant if there are fair tasks at the moment of bandwidth refresh. This removes all dl_se->server_has_tasks() users, so remove the whole thing. Fixes: cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling") Reported-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Tested-by: John Stultz --- include/linux/sched.h | 1 - kernel/sched/deadline.c | 12 +----------- kernel/sched/fair.c | 7 +------ kernel/sched/sched.h | 4 ---- 4 files changed, 2 insertions(+), 22 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f8188b8333503c..f89313b150e6ca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -733,7 +733,6 @@ struct sched_dl_entity { * runnable task. */ struct rq *rq; - dl_server_has_tasks_f server_has_tasks; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f25301267e4714..5a5080b3a670e5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_se->dl_defer && !dl_se->dl_defer_running && dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { - if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + if (!is_dl_boosted(dl_se)) { /* * Set dl_se->dl_defer_armed and dl_throttled variables to @@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; -static bool dl_server_stopped(struct sched_dl_entity *dl_se); - static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) { struct rq *rq = rq_of_dl_se(dl_se); @@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ if (!dl_se->dl_runtime) return HRTIMER_NORESTART; - if (!dl_se->server_has_tasks(dl_se)) { - replenish_dl_entity(dl_se); - dl_server_stopped(dl_se); - return HRTIMER_NORESTART; - } - if (dl_se->dl_defer_armed) { /* * First check if the server could consume runtime in background. 
@@ -1625,11 +1617,9 @@ static bool dl_server_stopped(struct sched_dl_entity *dl_se)
 }

 void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
-       dl_server_has_tasks_f has_tasks,
        dl_server_pick_f pick_task)
 {
 dl_se->rq = rq;
- dl_se->server_has_tasks = has_tasks;
 dl_se->server_pick_task = pick_task;
 }

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b173a059315c2e..8ce56a8d507f98 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8859,11 +8859,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
 return pick_next_task_fair(rq, prev, NULL);
 }

-static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
-{
- return !!dl_se->rq->cfs.nr_queued;
-}
-
 static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
 {
 return pick_task_fair(dl_se->rq);
@@ -8875,7 +8870,7 @@ void fair_server_init(struct rq *rq)

 init_dl_entity(dl_se);

- dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
+ dl_server_init(dl_se, rq, fair_server_pick_task);
 }

 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be9745d104f751..f10d6277dca16d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -365,9 +365,6 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
 *
 * dl_se::rq -- runqueue we belong to.
 *
- * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
- * server when it runs out of tasks to run.
- *
 * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
 * returns NULL.
 *
@@ -383,7 +380,6 @@ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
 extern void dl_server_start(struct sched_dl_entity *dl_se);
 extern void dl_server_stop(struct sched_dl_entity *dl_se);
 extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
-       dl_server_has_tasks_f has_tasks,
        dl_server_pick_f pick_task);
 extern void sched_init_dl_servers(void);

From a3a70caf7906708bf9bbc80018752a6b36543808 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 17 Sep 2025 12:03:20 +0200
Subject: [PATCH 2812/3206] sched/deadline: Fix dl_server behaviour

John reported undesirable behaviour with the dl_server since commit:
cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling").

When starving fair tasks on purpose (starting spinning FIFO tasks), his
fair workload, which often goes (briefly) idle, would delay fair
invocations for a second; running one invocation per second was both
unexpected and terribly slow.

The reason this happens is that when dl_se->server_pick_task() returns
NULL, indicating no runnable tasks, it would yield, pushing any later
jobs out a whole period (1 second).

Instead simply stop the server. This should restore behaviour in that a
later wakeup (which restarts the server) will be able to continue
running (subject to the CBS wakeup rules).

Notably, this does not re-introduce the behaviour cccb45d7c4295 set out
to solve: any start/stop cycle is naturally throttled by the timer
period (no active cancel).
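The resulting pick path, condensed from the hunk below, is simply:

	p = dl_se->server_pick_task(dl_se);
	if (!p) {
		dl_server_stop(dl_se);	/* bandwidth is retained for the next wakeup */
		goto again;
	}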
Fixes: cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling") Reported-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Tested-by: John Stultz --- include/linux/sched.h | 1 - kernel/sched/deadline.c | 23 ++--------------------- kernel/sched/sched.h | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f89313b150e6ca..e4ce0a76831e5f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -706,7 +706,6 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; - unsigned int dl_server_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5a5080b3a670e5..72c1f72463c758 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1571,10 +1571,8 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) { - dl_se->dl_server_idle = 0; + if (dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); - } } void dl_server_start(struct sched_dl_entity *dl_se) @@ -1602,20 +1600,6 @@ void dl_server_stop(struct sched_dl_entity *dl_se) dl_se->dl_server_active = 0; } -static bool dl_server_stopped(struct sched_dl_entity *dl_se) -{ - if (!dl_se->dl_server_active) - return true; - - if (dl_se->dl_server_idle) { - dl_server_stop(dl_se); - return true; - } - - dl_se->dl_server_idle = 1; - return false; -} - void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_pick_f pick_task) { @@ -2384,10 +2368,7 @@ static struct task_struct *__pick_task_dl(struct rq *rq) if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - if (!dl_server_stopped(dl_se)) { - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); - } + dl_server_stop(dl_se); goto again; } rq->dl_server = dl_se; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f10d6277dca16d..cf2109b67f9a36 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -371,10 +371,39 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * dl_server_update() -- called from update_curr_common(), propagates runtime * to the server. * - * dl_server_start() - * dl_server_stop() -- start/stop the server when it has (no) tasks. + * dl_server_start() -- start the server when it has tasks; it will stop + * automatically when there are no more tasks, per + * dl_se::server_pick() returning NULL. + * + * dl_server_stop() -- (force) stop the server; use when updating + * parameters. * * dl_server_init() -- initializes the server. + * + * When started the dl_server will (per dl_defer) schedule a timer for its + * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a + * server will run at the very end of its period. + * + * This is done such that any runtime from the target class can be accounted + * against the server -- through dl_server_update() above -- such that when it + * becomes time to run, it might already be out of runtime and get deferred + * until the next period. In this case dl_server_timer() will alternate + * between defer and replenish but never actually enqueue the server. 
+ *
+ * Only when the target class does not manage to exhaust the server's runtime
+ * (there's actually starvation in the given period), will the dl_server get on
+ * the runqueue. Once queued it will pick tasks from the target class and run
+ * them until either its runtime is exhausted, at which point it's back to
+ * dl_server_timer, or until there are no more tasks to run, at which point
+ * the dl_server stops itself.
+ *
+ * By stopping at this point the dl_server retains bandwidth, which, if a new
+ * task wakes up imminently (starting the server again), can be used --
+ * subject to CBS wakeup rules -- without having to wait for the next period.
+ *
+ * Additionally, because of the dl_defer behaviour the start/stop behaviour is
+ * naturally throttled to once per period, avoiding high context switch
+ * workloads from spamming the hrtimer program/cancel paths.
 */
 extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
 extern void dl_server_start(struct sched_dl_entity *dl_se);

From 35561bab768977c9e05f1f1a9bc00134c85f3e28 Mon Sep 17 00:00:00 2001
From: Menglong Dong
Date: Wed, 17 Sep 2025 14:09:13 +0800
Subject: [PATCH 2813/3206] arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c

The include/generated/asm-offsets.h is generated in Kbuild during
compilation from arch/SRCARCH/kernel/asm-offsets.c. When we want to
generate another similar offset header file, a circular dependency can
happen.

For example, say we want to generate an offset file
include/generated/test.h, which is included in include/sched/sched.h.
If we generate asm-offsets.h first, it will fail, as
include/sched/sched.h is included in asm-offsets.c and
include/generated/test.h doesn't exist yet; if we generate test.h
first, it can't succeed either, as include/generated/asm-offsets.h is
included by it.

On x86_64, the macro COMPILE_OFFSETS is used to avoid such a circular
dependency: we can generate asm-offsets.h first, and if COMPILE_OFFSETS
is defined, we don't include the "generated/test.h". So we define the
macro COMPILE_OFFSETS in all the asm-offsets.c files for this purpose.

Signed-off-by: Menglong Dong
Signed-off-by: Peter Zijlstra (Intel)
---
 arch/alpha/kernel/asm-offsets.c | 1 +
 arch/arc/kernel/asm-offsets.c | 1 +
 arch/arm/kernel/asm-offsets.c | 2 ++
 arch/arm64/kernel/asm-offsets.c | 1 +
 arch/csky/kernel/asm-offsets.c | 1 +
 arch/hexagon/kernel/asm-offsets.c | 1 +
 arch/loongarch/kernel/asm-offsets.c | 2 ++
 arch/m68k/kernel/asm-offsets.c | 1 +
 arch/microblaze/kernel/asm-offsets.c | 1 +
 arch/mips/kernel/asm-offsets.c | 2 ++
 arch/nios2/kernel/asm-offsets.c | 1 +
 arch/openrisc/kernel/asm-offsets.c | 1 +
 arch/parisc/kernel/asm-offsets.c | 1 +
 arch/powerpc/kernel/asm-offsets.c | 1 +
 arch/riscv/kernel/asm-offsets.c | 1 +
 arch/s390/kernel/asm-offsets.c | 1 +
 arch/sh/kernel/asm-offsets.c | 1 +
 arch/sparc/kernel/asm-offsets.c | 1 +
 arch/um/kernel/asm-offsets.c | 2 ++
 arch/xtensa/kernel/asm-offsets.c | 1 +
 20 files changed, 24 insertions(+)

diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c
index e9dad60b147f33..1ebb058904992b 100644
--- a/arch/alpha/kernel/asm-offsets.c
+++ b/arch/alpha/kernel/asm-offsets.c
@@ -4,6 +4,7 @@
 * This code generates raw asm output which is post-processed to extract
 * and format the required data.
*/ +#define COMPILE_OFFSETS #include #include diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c index f77deb7991757e..2978da85fcb65b 100644 --- a/arch/arc/kernel/asm-offsets.c +++ b/arch/arc/kernel/asm-offsets.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 123f4a8ef44660..2101938d27fcbc 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -7,6 +7,8 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS + #include #include #include diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 30d4bbe68661f4..b6367ff3a49ca1 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -6,6 +6,7 @@ * 2001-2002 Keith Owens * Copyright (C) 2012 ARM Ltd. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c index d1e9035794733d..5525c8e7e1d9ea 100644 --- a/arch/csky/kernel/asm-offsets.c +++ b/arch/csky/kernel/asm-offsets.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. +#define COMPILE_OFFSETS #include #include diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c index 03a7063f945614..50eea9fa6f1375 100644 --- a/arch/hexagon/kernel/asm-offsets.c +++ b/arch/hexagon/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c index db1e4bb26b6a01..3017c715760099 100644 --- a/arch/loongarch/kernel/asm-offsets.c +++ b/arch/loongarch/kernel/asm-offsets.c @@ -4,6 +4,8 @@ * * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ +#define COMPILE_OFFSETS + #include #include #include diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c index 906d7323053744..67a1990f9d748f 100644 --- a/arch/m68k/kernel/asm-offsets.c +++ b/arch/m68k/kernel/asm-offsets.c @@ -9,6 +9,7 @@ * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #define ASM_OFFSETS_C #include diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c index 104c3ac5f30c88..b4b67d58e7f6ae 100644 --- a/arch/microblaze/kernel/asm-offsets.c +++ b/arch/microblaze/kernel/asm-offsets.c @@ -7,6 +7,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 1e29efcba46e57..5debd9a3854a9e 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -9,6 +9,8 @@ * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com * Copyright (C) 2000 MIPS Technologies, Inc. 
*/ +#define COMPILE_OFFSETS + #include #include #include diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c index e3d9b7b6fb48aa..88190b503ce5de 100644 --- a/arch/nios2/kernel/asm-offsets.c +++ b/arch/nios2/kernel/asm-offsets.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2011 Tobias Klauser */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c index 710651d5aaae10..3cc826f2216b10 100644 --- a/arch/openrisc/kernel/asm-offsets.c +++ b/arch/openrisc/kernel/asm-offsets.c @@ -18,6 +18,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c index 757816a7bd4b28..9abfe65492c65e 100644 --- a/arch/parisc/kernel/asm-offsets.c +++ b/arch/parisc/kernel/asm-offsets.c @@ -13,6 +13,7 @@ * Copyright (C) 2002 Randolph Chung * Copyright (C) 2003 James Bottomley */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b3048f6d3822c0..a4bc80b30410ae 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 6e8c0d6feae9e9..7d42d3b8a32a75 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -3,6 +3,7 @@ * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2017 SiFive */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 95ecad9c7d7d27..a8915663e917fa 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -4,6 +4,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c index a0322e8328456e..429b6a76314684 100644 --- a/arch/sh/kernel/asm-offsets.c +++ b/arch/sh/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c index 3d9b9855dce917..6e660bde48dd89 100644 --- a/arch/sparc/kernel/asm-offsets.c +++ b/arch/sparc/kernel/asm-offsets.c @@ -10,6 +10,7 @@ * * On sparc, thread_info data is static and TI_XXX offsets are computed by hand. 
 */
+#define COMPILE_OFFSETS

 #include
 #include
diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c
index 1fb12235ab9c84..a69873aa697f4f 100644
--- a/arch/um/kernel/asm-offsets.c
+++ b/arch/um/kernel/asm-offsets.c
@@ -1 +1,3 @@
+#define COMPILE_OFFSETS
+
 #include
diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c
index da38de20ae598b..cfbced95e944a4 100644
--- a/arch/xtensa/kernel/asm-offsets.c
+++ b/arch/xtensa/kernel/asm-offsets.c
@@ -11,6 +11,7 @@
 *
 * Chris Zankel
 */
+#define COMPILE_OFFSETS

 #include
 #include

From 88a90315a99a9120cd471bf681515cc77cd7cdb8 Mon Sep 17 00:00:00 2001
From: Menglong Dong
Date: Wed, 17 Sep 2025 14:09:14 +0800
Subject: [PATCH 2814/3206] rcu: Replace preempt.h with sched.h in include/linux/rcupdate.h

In the next commit, we will move the definition of migrate_enable()
and migrate_disable() to linux/sched.h. However,
migrate_enable/migrate_disable will be used in commit 1b93c03fb319
("rcu: add rcu_read_lock_dont_migrate()") in the bpf-next tree.

In order to fix a potential compile error, replace linux/preempt.h
with linux/sched.h in include/linux/rcupdate.h.

Signed-off-by: Menglong Dong
Signed-off-by: Peter Zijlstra (Intel)
---
 include/linux/rcupdate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 120536f4c6eb1d..8f346c847ee513 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -24,7 +24,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include

From 378b7708194fff77c9020392067329931c3fcc04 Mon Sep 17 00:00:00 2001
From: Menglong Dong
Date: Wed, 17 Sep 2025 14:09:15 +0800
Subject: [PATCH 2815/3206] sched: Make migrate_{en,dis}able() inline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For now, migrate_enable and migrate_disable are global, which makes
them hotspots in some cases. Take BPF for example: the calls to
migrate_enable and migrate_disable in the BPF trampoline can introduce
significant overhead, and the following is the 'perf top' of FENTRY's
benchmark (./tools/testing/selftests/bpf/bench trig-fentry):

 54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k] bpf_prog_2dcccf652aac1793_bench_trigger_fentry
 10.43% [kernel] [k] migrate_enable
 10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
 8.06% [kernel] [k] __bpf_prog_exit_recur
 4.11% libc.so.6 [.] syscall
 2.15% [kernel] [k] entry_SYSCALL_64
 1.48% [kernel] [k] memchr_inv
 1.32% [kernel] [k] fput
 1.16% [kernel] [k] _copy_to_user
 0.73% [kernel] [k] bpf_prog_test_run_raw_tp

So in this commit, we make migrate_enable/migrate_disable inline to
obtain better performance.

The struct rq is defined internally in kernel/sched/sched.h, and the
field "nr_pinned" is accessed in migrate_enable/migrate_disable, which
makes it hard to make them inline. Alexei Starovoitov suggested
generating the offset of "nr_pinned" in [1], so we can define
migrate_enable/migrate_disable in include/linux/sched.h and access
"this_rq()->nr_pinned" with "(void *)this_rq() + RQ_nr_pinned".

The offset of "nr_pinned" is generated in include/generated/rq-offsets.h
by kernel/sched/rq-offsets.c.

Generally speaking, we move the definition of migrate_enable and
migrate_disable to include/linux/sched.h from kernel/sched/core.c. The
call to __set_cpus_allowed_ptr() is left in ___migrate_enable().
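For illustration, the generated include/generated/rq-offsets.h is
expected to contain little more than the offset constant; a made-up
example (the actual value is configuration dependent):

	#ifndef __RQ_OFFSETS_H__
	#define __RQ_OFFSETS_H__
	/* offsetof(struct rq, nr_pinned); the value below is invented. */
	#define RQ_nr_pinned 2944
	#endif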
The "struct rq" is not available in include/linux/sched.h, so we can't
access the "runqueues" with this_cpu_ptr(), as the compilation will
fail in this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():

  typeof((ptr) + 0)

So we introduce this_rq_raw() and access the runqueues with
arch_raw_cpu_ptr/PERCPU_PTR directly.

The variable "runqueues" is not visible to kernel modules, and
exporting it is not a good idea. As Peter Zijlstra advised in [2], we
define and export migrate_enable/migrate_disable in kernel/sched/core.c
too, and use them for the modules.

Before this patch, the performance of BPF FENTRY is:

  fentry : 113.030 ± 0.149M/s
  fentry : 112.501 ± 0.187M/s
  fentry : 112.828 ± 0.267M/s
  fentry : 115.287 ± 0.241M/s

After this patch, the performance of BPF FENTRY increases to:

  fentry : 143.644 ± 0.670M/s
  fentry : 149.764 ± 0.362M/s
  fentry : 149.642 ± 0.156M/s
  fentry : 145.263 ± 0.221M/s

Signed-off-by: Menglong Dong
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/bpf/CAADnVQ+5sEDKHdsJY5ZsfGDO_1SEhhQWHrt2SMBG5SYyQ+jt7w@mail.gmail.com/ [1]
Link: https://lore.kernel.org/all/20250819123214.GH4067720@noisy.programming.kicks-ass.net/ [2]
---
 Kbuild                    |  13 ++++-
 include/linux/preempt.h   |   3 -
 include/linux/sched.h     | 113 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c     |   1 +
 kernel/sched/core.c       |  63 +++++----------------
 kernel/sched/rq-offsets.c |  12 ++++
 6 files changed, 152 insertions(+), 53 deletions(-)
 create mode 100644 kernel/sched/rq-offsets.c

diff --git a/Kbuild b/Kbuild
index f327ca86990cca..13324b4bbe236a 100644
--- a/Kbuild
+++ b/Kbuild
@@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s: $(timeconst-file) $(bounds-file)
 $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
 	$(call filechk,offsets,__ASM_OFFSETS_H__)
 
+# Generate rq-offsets.h
+
+rq-offsets-file := include/generated/rq-offsets.h
+
+targets += kernel/sched/rq-offsets.s
+
+kernel/sched/rq-offsets.s: $(offsets-file)
+
+$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
+	$(call filechk,offsets,__RQ_OFFSETS_H__)
+
 # Check for missing system calls
 
 quiet_cmd_syscalls = CALL    $<
       cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags)
 
 PHONY += missing-syscalls
-missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
+missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
 	$(call cmd,syscalls)
 
 # Check the manual modification of atomic headers
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1fad1c8a4c76a6..92237c31903578 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -424,8 +424,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
  * work-conserving schedulers.
* */ -extern void migrate_disable(void); -extern void migrate_enable(void); /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section @@ -471,7 +469,6 @@ static __always_inline void preempt_enable_nested(void) DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) -DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #ifdef CONFIG_PREEMPT_DYNAMIC diff --git a/include/linux/sched.h b/include/linux/sched.h index 644a01bdae700d..d60ecaccdffc67 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -49,6 +49,9 @@ #include #include #include +#ifndef COMPILE_OFFSETS +#include +#endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -2317,4 +2320,114 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +#ifndef MODULE +#ifndef COMPILE_OFFSETS + +extern void ___migrate_enable(void); + +struct rq; +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +/* + * The "struct rq" is not available here, so we can't access the + * "runqueues" with this_cpu_ptr(), as the compilation will fail in + * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): + * typeof((ptr) + 0) + * + * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. + */ +#ifdef CONFIG_SMP +#define this_rq_raw() arch_raw_cpu_ptr(&runqueues) +#else +#define this_rq_raw() PERCPU_PTR(&runqueues) +#endif +#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) + +static inline void __migrate_enable(void) +{ + struct task_struct *p = current; + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + guard(preempt)(); + if (unlikely(p->cpus_ptr != &p->cpus_mask)) + ___migrate_enable(); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq_pinned()--; +} + +static inline void __migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif + p->migration_disabled++; + return; + } + + guard(preempt)(); + this_rq_pinned()++; + p->migration_disabled = 1; +} +#else /* !COMPILE_OFFSETS */ +static inline void __migrate_disable(void) { } +static inline void __migrate_enable(void) { } +#endif /* !COMPILE_OFFSETS */ + +/* + * So that it is possible to not export the runqueues variable, define and + * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use + * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will + * be defined in kernel/sched/core.c. 
+ */ +#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE +static inline void migrate_disable(void) +{ + __migrate_disable(); +} + +static inline void migrate_enable(void) +{ + __migrate_enable(); +} +#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ + +#else /* MODULE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* MODULE */ + +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af690..de9078a9df3a28 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23855,6 +23855,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, BTF_SET_START(btf_id_deny) BTF_ID_UNUSED #ifdef CONFIG_SMP +BTF_ID(func, ___migrate_enable) BTF_ID(func, migrate_disable) BTF_ID(func, migrate_enable) #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index feb750aae71b64..7f1e5cb94c536e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,8 @@ * Copyright (C) 1991-2002 Linus Torvalds * Copyright (C) 1998-2024 Ingo Molnar, Red Hat */ +#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE +#include #include #include #include @@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) __do_set_cpus_allowed(p, &ac); } -void migrate_disable(void) -{ - struct task_struct *p = current; - - if (p->migration_disabled) { -#ifdef CONFIG_DEBUG_PREEMPT - /* - *Warn about overflow half-way through the range. - */ - WARN_ON_ONCE((s16)p->migration_disabled < 0); -#endif - p->migration_disabled++; - return; - } - - guard(preempt)(); - this_rq()->nr_pinned++; - p->migration_disabled = 1; -} -EXPORT_SYMBOL_GPL(migrate_disable); - -void migrate_enable(void) +void ___migrate_enable(void) { struct task_struct *p = current; struct affinity_context ac = { @@ -2410,35 +2391,19 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Check both overflow from migrate_disable() and superfluous - * migrate_enable(). - */ - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) - return; -#endif + __set_cpus_allowed_ptr(p, &ac); +} +EXPORT_SYMBOL_GPL(___migrate_enable); - if (p->migration_disabled > 1) { - p->migration_disabled--; - return; - } +void migrate_disable(void) +{ + __migrate_disable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); - /* - * Ensure stop_task runs either before or after this, and that - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). - */ - guard(preempt)(); - if (p->cpus_ptr != &p->cpus_mask) - __set_cpus_allowed_ptr(p, &ac); - /* - * Mustn't clear migration_disabled() until cpus_ptr points back at the - * regular cpus_mask, otherwise things that race (eg. - * select_fallback_rq) get confused. 
- */ - barrier(); - p->migration_disabled = 0; - this_rq()->nr_pinned--; +void migrate_enable(void) +{ + __migrate_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c new file mode 100644 index 00000000000000..a23747bbe25b4d --- /dev/null +++ b/kernel/sched/rq-offsets.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#define COMPILE_OFFSETS +#include +#include +#include "sched.h" + +int main(void) +{ + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned)); + + return 0; +} From 45b7f780739a3145aeef24d2dfa02517a6c82ed6 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 17 Sep 2025 14:09:16 +0800 Subject: [PATCH 2816/3206] sched: Fix some typos in include/linux/preempt.h There are some typos in the comments of migrate in include/linux/preempt.h: elegible -> eligible it's -> its migirate_disable -> migrate_disable abritrary -> arbitrary Just fix them. Signed-off-by: Menglong Dong Signed-off-by: Peter Zijlstra (Intel) --- include/linux/preempt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 92237c31903578..102202185d7a2c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -372,7 +372,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, /* * Migrate-Disable and why it is undesired. * - * When a preempted task becomes elegible to run under the ideal model (IOW it + * When a preempted task becomes eligible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() @@ -387,7 +387,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be - * in a migrate_disable() section, reducing it's available bandwidth. + * in a migrate_disable() section, reducing its available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully @@ -399,7 +399,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, - * all these primitives employ migirate_disable() to restore this implicit + * all these primitives employ migrate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. The correct solution is getting @@ -407,7 +407,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need - * a schedulability theory that does not depend on abritrary migration. + * a schedulability theory that does not depend on arbitrary migration. * * * Notes on the implementation. 
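The DEFINE_LOCK_GUARD_0(migrate, ...) definition moved into sched.h by
the series above keeps the scope-based form working unchanged; a
minimal usage sketch (the function and its body are illustrative only):

  /* sketch: pair migrate_disable()/migrate_enable() by scope */
  static void touch_percpu_state(void)
  {
          guard(migrate)();       /* migrate_disable() here */
          /* ... per-CPU work that must not migrate ... */
  }                               /* migrate_enable() at scope exit */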
From c0054b25e2f1045f47b4954cf13a539e5e6047df Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 18 Sep 2025 10:21:41 +0300 Subject: [PATCH 2817/3206] net: dsa: lantiq_gswip: move gswip_add_single_port_br() call to port_setup() A port added to a "single port bridge" operates as standalone, and this is mutually exclusive to being part of a Linux bridge. In fact, gswip_port_bridge_join() calls gswip_add_single_port_br() with add=false, i.e. removes the port from the "single port bridge" to enable autonomous forwarding. The blamed commit seems to have incorrectly thought that ds->ops->port_enable() is called one time per port, during the setup phase of the switch. However, it is actually called during the ndo_open() implementation of DSA user ports, which is to say that this sequence of events: 1. ip link set swp0 down 2. ip link add br0 type bridge 3. ip link set swp0 master br0 4. ip link set swp0 up would cause swp0 to join back the "single port bridge" which step 3 had just removed it from. The correct DSA hook for one-time actions per port at switch init time is ds->ops->port_setup(). This is what seems to match the coder's intention; also see the comment at the beginning of the file: * At the initialization the driver allocates one bridge table entry for ~~~~~~~~~~~~~~~~~~~~~ * each switch port which is used when the port is used without an * explicit bridge. Fixes: 8206e0ce96b3 ("net: dsa: lantiq: Add VLAN unaware bridge offloading") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250918072142.894692-2-vladimir.oltean@nxp.com Tested-by: Daniel Golle Reviewed-by: Daniel Golle Signed-off-by: Paolo Abeni --- drivers/net/dsa/lantiq_gswip.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 6eb3140d404449..d416c072dd28cb 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -685,18 +685,27 @@ static int gswip_add_single_port_br(struct gswip_priv *priv, int port, bool add) return 0; } -static int gswip_port_enable(struct dsa_switch *ds, int port, - struct phy_device *phydev) +static int gswip_port_setup(struct dsa_switch *ds, int port) { struct gswip_priv *priv = ds->priv; int err; if (!dsa_is_cpu_port(ds, port)) { - u32 mdio_phy = 0; - err = gswip_add_single_port_br(priv, port, true); if (err) return err; + } + + return 0; +} + +static int gswip_port_enable(struct dsa_switch *ds, int port, + struct phy_device *phydev) +{ + struct gswip_priv *priv = ds->priv; + + if (!dsa_is_cpu_port(ds, port)) { + u32 mdio_phy = 0; if (phydev) mdio_phy = phydev->mdio.addr & GSWIP_MDIO_PHY_ADDR_MASK; @@ -1829,6 +1838,7 @@ static const struct phylink_mac_ops gswip_phylink_mac_ops = { static const struct dsa_switch_ops gswip_xrx200_switch_ops = { .get_tag_protocol = gswip_get_tag_protocol, .setup = gswip_setup, + .port_setup = gswip_port_setup, .port_enable = gswip_port_enable, .port_disable = gswip_port_disable, .port_bridge_join = gswip_port_bridge_join, From 987afe147965ef7a8e7d144ffef0d70af14bb1d4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 18 Sep 2025 10:21:42 +0300 Subject: [PATCH 2818/3206] net: dsa: lantiq_gswip: suppress -EINVAL errors for bridge FDB entries added to the CPU port The blamed commit and others in that patch set started the trend of reusing existing DSA driver API for a new purpose: calling ds->ops->port_fdb_add() on the CPU port. 
The lantiq_gswip driver was not prepared to handle that, as can be seen
from the many errors that Daniel presents in the logs:

  [  174.050000] gswip 1e108000.switch: port 2 failed to add fa:aa:72:f4:8b:1e vid 1 to fdb: -22
  [  174.060000] gswip 1e108000.switch lan2: entered promiscuous mode
  [  174.070000] gswip 1e108000.switch: port 2 failed to add 00:01:02:03:04:02 vid 0 to fdb: -22
  [  174.090000] gswip 1e108000.switch: port 2 failed to add 00:01:02:03:04:02 vid 1 to fdb: -22
  [  174.090000] gswip 1e108000.switch: port 2 failed to delete fa:aa:72:f4:8b:1e vid 1 from fdb: -2

The errors are because gswip_port_fdb() wants to get a handle to the
bridge that originated these FDB events, to associate it with a FID.
An absolutely honourable purpose; however, this only works for user
ports.

To get the bridge that generated an FDB entry for the CPU port, one
would need to look at the db.bridge.dev argument. But this was
introduced in commit c26933639b54 ("net: dsa: request drivers to
perform FDB isolation"), which first appeared in v5.18; when the blamed
commit was introduced in v5.14, no such API existed.

So the core DSA feature was introduced way too soon for lantiq_gswip.
Not acting on these host FDB entries and suppressing any errors has no
other negative effect, and practically returns us to not supporting the
host filtering feature at all - peacefully, this time.

Fixes: 10fae4ac89ce ("net: dsa: include bridge addresses which are local in the host fdb list")
Reported-by: Daniel Golle
Closes: https://lore.kernel.org/netdev/aJfNMLNoi1VOsPrN@pidgin.makrotopia.org/
Signed-off-by: Vladimir Oltean
Link: https://patch.msgid.link/20250918072142.894692-3-vladimir.oltean@nxp.com
Tested-by: Daniel Golle
Reviewed-by: Daniel Golle
Signed-off-by: Paolo Abeni
---
 drivers/net/dsa/lantiq_gswip.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c
index d416c072dd28cb..84dc6e517acf94 100644
--- a/drivers/net/dsa/lantiq_gswip.c
+++ b/drivers/net/dsa/lantiq_gswip.c
@@ -1368,8 +1368,9 @@ static int gswip_port_fdb(struct dsa_switch *ds, int port,
 	int i;
 	int err;
 
+	/* Operation not supported on the CPU port, don't throw errors */
 	if (!bridge)
-		return -EINVAL;
+		return 0;
 
 	for (i = max_ports; i < ARRAY_SIZE(priv->vlans); i++) {
 		if (priv->vlans[i].bridge == bridge) {

From 7d9c3442b02ab7dd3c44e20095a178fd57d2eccb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Larumbe?=
Date: Fri, 19 Sep 2025 17:43:48 +0100
Subject: [PATCH 2819/3206] drm/panthor: Defer scheduler entity destruction to
 queue release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit de8548813824 ("drm/panthor: Add the scheduler logical block")
handled destruction of a group's queues' drm scheduler entities early
into the group destruction procedure. However, that races with the
group submit ioctl, because by the time entities are destroyed (through
the group destroy ioctl), the submission procedure might've already
obtained a group handle, and therefore the ability to push jobs into
entities. This is met with a DRM error message within the drm scheduler
core as a situation that should never occur.

Fix by deferring drm scheduler entity destruction to queue release
time.
Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
Signed-off-by: Adrián Larumbe
Reviewed-by: Steven Price
Reviewed-by: Boris Brezillon
Signed-off-by: Steven Price
Link: https://lore.kernel.org/r/20250919164436.531930-1-adrian.larumbe@collabora.com
---
 drivers/gpu/drm/panthor/panthor_sched.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 8f17394cc82aad..df76653e649a30 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -886,8 +886,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
 	if (IS_ERR_OR_NULL(queue))
 		return;
 
-	if (queue->entity.fence_context)
-		drm_sched_entity_destroy(&queue->entity);
+	drm_sched_entity_destroy(&queue->entity);
 
 	if (queue->scheduler.ops)
 		drm_sched_fini(&queue->scheduler);
@@ -3558,11 +3557,6 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle)
 	if (!group)
 		return -EINVAL;
 
-	for (u32 i = 0; i < group->queue_count; i++) {
-		if (group->queues[i])
-			drm_sched_entity_destroy(&group->queues[i]->entity);
-	}
-
 	mutex_lock(&sched->reset.lock);
 	mutex_lock(&sched->lock);
 	group->destroyed = true;

From d9c70e93ec5988ab07ad2a92d9f9d12867f02c56 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Tue, 23 Sep 2025 14:19:11 +0300
Subject: [PATCH 2820/3206] octeontx2-pf: Fix potential use after free in
 otx2_tc_add_flow()

This code calls kfree_rcu(new_node, rcu) and then dereferences
"new_node" on the next line. Two lines later, we take a mutex so I
don't think this is an RCU safe region. Re-order it to do the
dereferences before queuing up the free.

Fixes: 68fbff68dbea ("octeontx2-pf: Add police action for TC flower")
Signed-off-by: Dan Carpenter
Reviewed-by: Vadim Fedorenko
Link: https://patch.msgid.link/aNKCL1jKwK8GRJHh@stanley.mountain
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
index 5f80b23c5335cd..26a08d2cfbb1b6 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
@@ -1326,7 +1326,6 @@ static int otx2_tc_add_flow(struct otx2_nic *nic,
 
 free_leaf:
 	otx2_tc_del_from_flow_list(flow_cfg, new_node);
-	kfree_rcu(new_node, rcu);
 	if (new_node->is_act_police) {
 		mutex_lock(&nic->mbox.lock);
 
@@ -1346,6 +1345,7 @@ static int otx2_tc_add_flow(struct otx2_nic *nic,
 
 		mutex_unlock(&nic->mbox.lock);
 	}
+	kfree_rcu(new_node, rcu);
 
 	return rc;
 }

From 3ed17349f18774c24505b0c21dfbd3cc4f126518 Mon Sep 17 00:00:00 2001
From: Daniel Lee
Date: Wed, 24 Sep 2025 14:17:17 -0400
Subject: [PATCH 2821/3206] platform/x86: lg-laptop: Fix WMAB call in
 fan_mode_store()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When WMAB is called to set the fan mode, the new mode is read from
either bits 0-1 or bits 4-5 (depending on the value of some other EC
register). Thus when WMAB is called with bits 4-5 zeroed and called
again with bits 0-1 zeroed, the second call undoes the effect of the
first call. This causes writes to
/sys/devices/platform/lg-laptop/fan_mode to have no effect (and causes
reads to always report a status of zero).

Fix this by calling WMAB once, with the mode set in bits 0,1 and 4,5.
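As a worked example of that bit layout, using the GENMASK()/FIELD_PREP()
helpers the patch introduces (the local variable is hypothetical):

  #define FAN_MODE_LOWER GENMASK(1, 0)
  #define FAN_MODE_UPPER GENMASK(5, 4)

  /* mode 2 ("Performance"): 0x02 in the low copy, 0x20 in the high copy */
  u32 arg = FIELD_PREP(FAN_MODE_LOWER, 2) | FIELD_PREP(FAN_MODE_UPPER, 2); /* 0x22 */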
When the fan mode is returned from WMAB it always has this form, so there is no need to preserve the other bits. As a bonus, the driver now supports the "Performance" fan mode seen in the LG-provided Windows control app, which provides less aggressive CPU throttling but louder fan noise and shorter battery life. Also, correct the documentation to reflect that 0 corresponds to the default mode (what the Windows app calls "Optimal") and 1 corresponds to the silent mode. Fixes: dbf0c5a6b1f8 ("platform/x86: Add LG Gram laptop special features driver") Link: https://bugzilla.kernel.org/show_bug.cgi?id=204913#c4 Signed-off-by: Daniel Lee Link: https://patch.msgid.link/MN2PR06MB55989CB10E91C8DA00EE868DDC1CA@MN2PR06MB5598.namprd06.prod.outlook.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../admin-guide/laptops/lg-laptop.rst | 4 +-- drivers/platform/x86/lg-laptop.c | 34 ++++++++----------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/laptops/lg-laptop.rst b/Documentation/admin-guide/laptops/lg-laptop.rst index 67fd6932cef4ff..c4dd534f91edd1 100644 --- a/Documentation/admin-guide/laptops/lg-laptop.rst +++ b/Documentation/admin-guide/laptops/lg-laptop.rst @@ -48,8 +48,8 @@ This value is reset to 100 when the kernel boots. Fan mode -------- -Writing 1/0 to /sys/devices/platform/lg-laptop/fan_mode disables/enables -the fan silent mode. +Writing 0/1/2 to /sys/devices/platform/lg-laptop/fan_mode sets fan mode to +Optimal/Silent/Performance respectively. USB charge diff --git a/drivers/platform/x86/lg-laptop.c b/drivers/platform/x86/lg-laptop.c index 4b57102c7f6270..6af6cf477c5b5b 100644 --- a/drivers/platform/x86/lg-laptop.c +++ b/drivers/platform/x86/lg-laptop.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -75,6 +76,9 @@ MODULE_PARM_DESC(fw_debug, "Enable printing of firmware debug messages"); #define WMBB_USB_CHARGE 0x10B #define WMBB_BATT_LIMIT 0x10C +#define FAN_MODE_LOWER GENMASK(1, 0) +#define FAN_MODE_UPPER GENMASK(5, 4) + #define PLATFORM_NAME "lg-laptop" MODULE_ALIAS("wmi:" WMI_EVENT_GUID0); @@ -274,29 +278,19 @@ static ssize_t fan_mode_store(struct device *dev, struct device_attribute *attr, const char *buffer, size_t count) { - bool value; + unsigned long value; union acpi_object *r; - u32 m; int ret; - ret = kstrtobool(buffer, &value); + ret = kstrtoul(buffer, 10, &value); if (ret) return ret; + if (value >= 3) + return -EINVAL; - r = lg_wmab(dev, WM_FAN_MODE, WM_GET, 0); - if (!r) - return -EIO; - - if (r->type != ACPI_TYPE_INTEGER) { - kfree(r); - return -EIO; - } - - m = r->integer.value; - kfree(r); - r = lg_wmab(dev, WM_FAN_MODE, WM_SET, (m & 0xffffff0f) | (value << 4)); - kfree(r); - r = lg_wmab(dev, WM_FAN_MODE, WM_SET, (m & 0xfffffff0) | value); + r = lg_wmab(dev, WM_FAN_MODE, WM_SET, + FIELD_PREP(FAN_MODE_LOWER, value) | + FIELD_PREP(FAN_MODE_UPPER, value)); kfree(r); return count; @@ -305,7 +299,7 @@ static ssize_t fan_mode_store(struct device *dev, static ssize_t fan_mode_show(struct device *dev, struct device_attribute *attr, char *buffer) { - unsigned int status; + unsigned int mode; union acpi_object *r; r = lg_wmab(dev, WM_FAN_MODE, WM_GET, 0); @@ -317,10 +311,10 @@ static ssize_t fan_mode_show(struct device *dev, return -EIO; } - status = r->integer.value & 0x01; + mode = FIELD_GET(FAN_MODE_LOWER, r->integer.value); kfree(r); - return sysfs_emit(buffer, "%d\n", status); + return sysfs_emit(buffer, "%d\n", mode); } static ssize_t 
usb_charge_store(struct device *dev, From dd948aa63ee48e3032804bd10c87a0f4edaa3515 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 15 Sep 2025 14:01:44 +0200 Subject: [PATCH 2822/3206] MAINTAINERS: Delete inactive maintainers from AF_XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Delete Björn Töpel and Jonathan Lemon as maintainer and reviewer, respectively, as they have not been contributing towards AF_XDP for several years. I have spoken to Björn and he is ok with his removal. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Reviewed-by: Jason Xing Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20250915120148.2922-1-magnus.karlsson@gmail.com --- CREDITS | 6 ++++++ MAINTAINERS | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index a687c3c35c4c23..e47df5e74abee1 100644 --- a/CREDITS +++ b/CREDITS @@ -3912,6 +3912,12 @@ S: C/ Federico Garcia Lorca 1 10-A S: Sevilla 41005 S: Spain +N: Björn Töpel +E: bjorn@kernel.org +D: AF_XDP +S: Gothenburg +S: Sweden + N: Linus Torvalds E: torvalds@linux-foundation.org D: Original kernel hacker diff --git a/MAINTAINERS b/MAINTAINERS index feefebfdb60a38..0760971fe43072 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27443,10 +27443,8 @@ F: tools/testing/selftests/bpf/*xdp* K: (?:\b|_)xdp(?:\b|_) XDP SOCKETS (AF_XDP) -M: Björn Töpel M: Magnus Karlsson M: Maciej Fijalkowski -R: Jonathan Lemon R: Stanislav Fomichev L: netdev@vger.kernel.org L: bpf@vger.kernel.org From a727bc0588e77efb502c52d7356e8ed036b6a83e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 17 Sep 2025 10:09:00 -0700 Subject: [PATCH 2823/3206] tpm: loongson: Add bufsiz parameter to tpm_loongson_send() Commit 5c83b07df9c5 ("tpm: Add a driver for Loongson TPM device") has a semantic conflict with commit 07d8004d6fb9 ("tpm: add bufsiz parameter in the .send callback"), as the former change was developed against a tree without the latter change. This results in a build error: drivers/char/tpm/tpm_loongson.c:48:17: error: initialization of 'int (*)(struct tpm_chip *, u8 *, size_t, size_t)' {aka 'int (*)(struct tpm_chip *, unsigned char *, long unsigned int, long unsigned int)'} from incompatible pointer type 'int (*)(struct tpm_chip *, u8 *, size_t)' {aka 'int (*)(struct tpm_chip *, unsigned char *, long unsigned int)'} [-Wincompatible-pointer-types] 48 | .send = tpm_loongson_send, | ^~~~~~~~~~~~~~~~~ drivers/char/tpm/tpm_loongson.c:48:17: note: (near initialization for 'tpm_loongson_ops.send') drivers/char/tpm/tpm_loongson.c:31:12: note: 'tpm_loongson_send' declared here 31 | static int tpm_loongson_send(struct tpm_chip *chip, u8 *buf, size_t count) | ^~~~~~~~~~~~~~~~~ Add the expected bufsiz parameter to tpm_loongson_send() to resolve the error. 
Fixes: 5c83b07df9c5 ("tpm: Add a driver for Loongson TPM device") Signed-off-by: Nathan Chancellor Reviewed-by: Jarkko Sakkinen Signed-off-by: Lee Jones --- drivers/char/tpm/tpm_loongson.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_loongson.c b/drivers/char/tpm/tpm_loongson.c index a4ec23639911ce..9e50250763d16b 100644 --- a/drivers/char/tpm/tpm_loongson.c +++ b/drivers/char/tpm/tpm_loongson.c @@ -28,7 +28,7 @@ static int tpm_loongson_recv(struct tpm_chip *chip, u8 *buf, size_t count) return cmd_ret->data_len; } -static int tpm_loongson_send(struct tpm_chip *chip, u8 *buf, size_t count) +static int tpm_loongson_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz, size_t count) { struct loongson_se_engine *tpm_engine = dev_get_drvdata(&chip->dev); struct tpm_loongson_cmd *cmd = tpm_engine->command; From 017bdcdb3af941ca0070580a16162047090edd07 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 12 Sep 2025 09:46:38 +0200 Subject: [PATCH 2824/3206] MAINTAINERS: Adjust file entry in LOONGSON SECURITY ENGINE DRIVERS Commit 5c83b07df9c5 ("tpm: Add a driver for Loongson TPM device") adds a driver at drivers/char/tpm/tpm_loongson.c, and commit 74fddd5fbab8 ("MAINTAINERS: Add entry for Loongson Security Engine drivers") adds a new section LOONGSON SECURITY ENGINE DRIVERS intending to refer to that driver. It however adds the entry drivers/char/tpm_loongson.c; note that it misses the tpm subdirectory. Adjust the entry to refer to the intended file. Fixes: 74fddd5fbab8 ("MAINTAINERS: Add entry for Loongson Security Engine drivers") Signed-off-by: Lukas Bulwahn Signed-off-by: Lee Jones --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ba3b03fd79d571..c5d9354772e246 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14357,7 +14357,7 @@ LOONGSON SECURITY ENGINE DRIVERS M: Qunqin Zhao L: linux-crypto@vger.kernel.org S: Maintained -F: drivers/char/tpm_loongson.c +F: drivers/char/tpm/tpm_loongson.c F: drivers/crypto/loongson/ F: drivers/mfd/loongson-se.c F: include/linux/mfd/loongson-se.h From 64826db1e2e177b58dcbc7cf1e1379527be2185a Mon Sep 17 00:00:00 2001 From: Harrison Carter Date: Tue, 16 Sep 2025 16:11:01 +0100 Subject: [PATCH 2825/3206] dt-bindings: leds: as3645: Convert to DT schema Convert the ams,as3645a.txt to DT Schema format. Signed-off-by: Harrison Carter Reviewed-by: "Rob Herring (Arm)" Signed-off-by: Lee Jones --- .../devicetree/bindings/leds/ams,as3645a.txt | 85 ------------ .../devicetree/bindings/leds/ams,as3645a.yaml | 130 ++++++++++++++++++ 2 files changed, 130 insertions(+), 85 deletions(-) delete mode 100644 Documentation/devicetree/bindings/leds/ams,as3645a.txt create mode 100644 Documentation/devicetree/bindings/leds/ams,as3645a.yaml diff --git a/Documentation/devicetree/bindings/leds/ams,as3645a.txt b/Documentation/devicetree/bindings/leds/ams,as3645a.txt deleted file mode 100644 index 4af2987b25e923..00000000000000 --- a/Documentation/devicetree/bindings/leds/ams,as3645a.txt +++ /dev/null @@ -1,85 +0,0 @@ -Analog devices AS3645A device tree bindings - -The AS3645A flash LED controller can drive two LEDs, one high current -flash LED and one indicator LED. The high current flash LED can be -used in torch mode as well. - -Ranges below noted as [a, b] are closed ranges between a and b, i.e. a -and b are included in the range. - -Please also see common.txt in the same directory. - - -Required properties -=================== - -compatible : Must be "ams,as3645a". 
-reg : The I2C address of the device. Typically 0x30. -#address-cells : 1 -#size-cells : 0 - - -Required properties of the flash child node (0) -=============================================== - -reg: 0 -flash-timeout-us: Flash timeout in microseconds. The value must be in - the range [100000, 850000] and divisible by 50000. -flash-max-microamp: Maximum flash current in microamperes. Has to be - in the range between [200000, 500000] and - divisible by 20000. -led-max-microamp: Maximum torch (assist) current in microamperes. The - value must be in the range between [20000, 160000] and - divisible by 20000. -ams,input-max-microamp: Maximum flash controller input current. The - value must be in the range [1250000, 2000000] - and divisible by 50000. - - -Optional properties of the flash child node -=========================================== - -function : See Documentation/devicetree/bindings/leds/common.txt. -color : See Documentation/devicetree/bindings/leds/common.txt. -label : See Documentation/devicetree/bindings/leds/common.txt (deprecated). - - -Required properties of the indicator child node (1) -=================================================== - -reg: 1 -led-max-microamp: Maximum indicator current. The allowed values are - 2500, 5000, 7500 and 10000. - -Optional properties of the indicator child node -=============================================== - -function : See Documentation/devicetree/bindings/leds/common.txt. -color : See Documentation/devicetree/bindings/leds/common.txt. -label : See Documentation/devicetree/bindings/leds/common.txt (deprecated). - - -Example -======= - -#include - - as3645a@30 { - #address-cells = <1>; - #size-cells = <0>; - reg = <0x30>; - compatible = "ams,as3645a"; - led@0 { - reg = <0x0>; - flash-timeout-us = <150000>; - flash-max-microamp = <320000>; - led-max-microamp = <60000>; - ams,input-max-microamp = <1750000>; - function = LED_FUNCTION_FLASH; - }; - led@1 { - reg = <0x1>; - led-max-microamp = <10000>; - function = LED_FUNCTION_INDICATOR; - }; - }; diff --git a/Documentation/devicetree/bindings/leds/ams,as3645a.yaml b/Documentation/devicetree/bindings/leds/ams,as3645a.yaml new file mode 100644 index 00000000000000..250a4b275d8a8a --- /dev/null +++ b/Documentation/devicetree/bindings/leds/ams,as3645a.yaml @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/leds/ams,as3645a.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices AS3645A LED Controller + +maintainers: + - Sakari Ailus + +description: + The AS3645A flash LED controller can drive two LEDs, one + high current flash LED and one indicator LED. The high + current flash LED can be used in torch mode as well. + +properties: + compatible: + const: ams,as3645a + + "#address-cells": + const: 1 + + "#size-cells": + const: 0 + + reg: + maxItems: 1 + + led@0: + description: led0 describes the 'flash' feature + type: object + $ref: common.yaml# + unevaluatedProperties: false + + properties: + reg: + const: 0 + + flash-timeout-us: + minimum: 100000 + maximum: 850000 + multipleOf: 50000 + + flash-max-microamp: + minimum: 200000 + maximum: 500000 + multipleOf: 20000 + + led-max-microamp: + minimum: 20000 + maximum: 160000 + multipleOf: 20000 + description: + Maximum current when in torch (assist) mode. 
+      ams,input-max-microamp:
+        minimum: 1250000
+        maximum: 2000000
+        multipleOf: 50000
+
+    required:
+      - reg
+      - flash-timeout-us
+      - flash-max-microamp
+      - led-max-microamp
+      - ams,input-max-microamp
+
+  led@1:
+    description: led1 describes the 'indicator' feature
+    type: object
+    $ref: common.yaml#
+    unevaluatedProperties: false
+
+    properties:
+      reg:
+        const: 1
+
+      led-max-microamp:
+        enum:
+          - 2500
+          - 5000
+          - 7500
+          - 10000
+        description:
+          Maximum indicator current.
+
+    required:
+      - reg
+      - led-max-microamp
+
+required:
+  - compatible
+  - reg
+  - "#size-cells"
+  - "#address-cells"
+
+additionalProperties: false
+
+examples:
+  - |
+    #include
+
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        led-controller@30 {
+            compatible = "ams,as3645a";
+            #address-cells = <1>;
+            #size-cells = <0>;
+            reg = <0x30>;
+
+            led@0 {
+                reg = <0>;
+                flash-timeout-us = <150000>;
+                flash-max-microamp = <320000>;
+                led-max-microamp = <60000>;
+                ams,input-max-microamp = <1750000>;
+                function = LED_FUNCTION_FLASH;
+            };
+
+            led@1 {
+                reg = <1>;
+                led-max-microamp = <10000>;
+                function = LED_FUNCTION_INDICATOR;
+            };
+        };
+    };
+...

From f707d2f7a0c7793406daf0e223bad01bb748343e Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Wed, 17 Sep 2025 17:38:57 +0200
Subject: [PATCH 2826/3206] s390/tape: Add WQ_PERCPU to alloc_workqueue users
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (per-cpu wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again makes use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

alloc_workqueue() treats all queues as per-CPU by default, while
unbound workqueues must opt-in via WQ_UNBOUND. This default is
suboptimal: most workloads benefit from unbound queues, allowing the
scheduler to place worker threads where they’re needed and reducing
noise when CPUs are isolated.

This patch adds a new WQ_PERCPU flag to explicitly request the use of
the per-CPU behavior. Both flags coexist for one release cycle to allow
callers to transition their calls. Once migration is complete,
WQ_UNBOUND can be removed and unbound will become the implicit default.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU. All existing users have been updated
accordingly.
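A sketch of the convention change (the queue name is hypothetical):

  /* before: per-CPU behaviour was the silent default */
  wq = alloc_workqueue("my_wq", 0, 0);

  /* after: per-CPU behaviour is requested explicitly */
  wq = alloc_workqueue("my_wq", WQ_PERCPU, 0);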
Suggested-by: Tejun Heo
Signed-off-by: Marco Crivellari
Acked-by: Alexander Gordeev
Acked-by: Heiko Carstens
Signed-off-by: Alexander Gordeev
---
 drivers/s390/char/tape_3590.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/char/tape_3590.c b/drivers/s390/char/tape_3590.c
index a1bafaf73f87ac..2a2931d303cb94 100644
--- a/drivers/s390/char/tape_3590.c
+++ b/drivers/s390/char/tape_3590.c
@@ -1671,7 +1671,7 @@ tape_3590_init(void)
 
 	DBF_EVENT(3, "3590 init\n");
 
-	tape_3590_wq = alloc_workqueue("tape_3590", 0, 0);
+	tape_3590_wq = alloc_workqueue("tape_3590", WQ_PERCPU, 0);
 	if (!tape_3590_wq)
 		return -ENOMEM;

From dbfe205a344a865b9c36706738f45bc554a040c7 Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Wed, 17 Sep 2025 17:38:58 +0200
Subject: [PATCH 2827/3206] s390/diag324: Replace use of system_wq with
 system_percpu_wq

Currently if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (per-cpu wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again makes use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

system_wq is a per-CPU workqueue, yet nothing in its name tells about
that CPU affinity constraint, which is very often not required by
users. Make it clear by renaming system_wq to system_percpu_wq.

queue_work() / queue_delayed_work() / mod_delayed_work() will now use
the new per-cpu wq. The old wq will be kept for a few release cycles.

Suggested-by: Tejun Heo
Signed-off-by: Marco Crivellari
Acked-by: Heiko Carstens
Signed-off-by: Alexander Gordeev
---
 arch/s390/kernel/diag/diag324.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kernel/diag/diag324.c b/arch/s390/kernel/diag/diag324.c
index 7fa4c0b7eb6c71..f0a8b4841fb964 100644
--- a/arch/s390/kernel/diag/diag324.c
+++ b/arch/s390/kernel/diag/diag324.c
@@ -116,7 +116,7 @@ static void pibwork_handler(struct work_struct *work)
 	mutex_lock(&pibmutex);
 	timedout = ktime_add_ns(data->expire, PIBWORK_DELAY);
 	if (ktime_before(ktime_get(), timedout)) {
-		mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY));
+		mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY));
 		goto out;
 	}
 	vfree(data->pib);
@@ -174,7 +174,7 @@ long diag324_pibbuf(unsigned long arg)
 		pib_update(data);
 		data->sequence++;
 		data->expire = ktime_add_ns(ktime_get(), tod_to_ns(data->pib->intv));
-		mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY));
+		mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY));
 		first = false;
 	}
 	rc = data->rc;

From 72105fc1c1cb67e779fe2da9d22ffae189c00cfc Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Wed, 17 Sep 2025 17:38:59 +0200
Subject: [PATCH 2828/3206] s390: Replace use of system_wq with system_dfl_wq

Currently if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (per-cpu wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again makes use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

system_wq is a per-CPU workqueue (replaced by system_percpu_wq), but
the current code does not benefit from it. Because of that, system_wq
has been replaced by system_dfl_wq, the new unbound workqueue. The old
wq will be kept for a few release cycles.
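Illustrating the resulting choice between the two default queues (the
work item is hypothetical):

  /* CPU-local by design: use the explicit per-CPU default queue */
  queue_work(system_percpu_wq, &my_work);

  /* no locality requirement: prefer the unbound default queue */
  queue_work(system_dfl_wq, &my_work);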
The old wq will be kept for a few release cylces. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Acked-by: Heiko Carstens Reviewed-by: Mete Durlu Signed-off-by: Alexander Gordeev --- arch/s390/kernel/hiperdispatch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c index e7b66d046e8d38..2507bc3f775741 100644 --- a/arch/s390/kernel/hiperdispatch.c +++ b/arch/s390/kernel/hiperdispatch.c @@ -191,7 +191,7 @@ int hd_enable_hiperdispatch(void) return 0; if (hd_online_cores <= hd_entitled_cores) return 0; - mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); + mod_delayed_work(system_dfl_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); hd_update_capacities(); return 1; } From 088bb10e37252034ec58a6152f20bfdc8a837f54 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 Sep 2025 17:34:30 +0200 Subject: [PATCH 2829/3206] s390/mm: Add memory allocation profiling hooks Similar to common code changes [1] add alloc_hook() wrappers to page table allocation functions to allow for memory allocation profiling. If CONFIG_MEM_ALLOC_PROFILING is enabled call sites of page table allocations are accounted, instead of e.g. only crst_table_alloc() and page_table_alloc(). This allows for slightly better profiling data, and the output of /proc/allocinfo is similar to other architectures. Without alloc_hook() wrappers the output of /proc/allocinfo looks like this: 17096704 4174 mm/memory.c:1061 func:folio_prealloc 17809408 4348 mm/memory.c:1063 func:folio_prealloc 0 0 mm/memory.c:4422 func:alloc_swap_folio 0 0 mm/memory.c:4286 func:__alloc_swap_folio 0 0 mm/memory.c:4971 func:alloc_anon_folio ... 1589248 97 arch/s390/mm/pgalloc.c:25 func:crst_table_alloc 0 0 arch/s390/mm/pgalloc.c:124 func:page_table_alloc_pgste 4280320 1045 arch/s390/mm/pgalloc.c:149 func:page_table_alloc With alloc_hook() wrappers: 1097728 268 mm/memory.c:5147 func:__do_fault 20119552 4912 mm/memory.c:1061 func:folio_prealloc 17534976 4281 mm/memory.c:1063 func:folio_prealloc 0 0 mm/memory.c:4422 func:alloc_swap_folio 0 0 mm/memory.c:4286 func:__alloc_swap_folio 786432 192 mm/memory.c:452 func:__pte_alloc 405504 99 mm/memory.c:464 func:__pte_alloc_kernel 1880064 459 mm/memory.c:5525 func:do_fault_around 0 0 mm/memory.c:6403 func:__p4d_alloc 0 0 mm/memory.c:6426 func:__pud_alloc 1064960 65 mm/memory.c:6450 func:__pmd_alloc 0 0 mm/memory.c:4971 func:alloc_anon_folio 0 0 mm/memory.c:5233 func:do_set_pmd [1] commit 2c321f3f70bc ("mm: change inlined allocation helpers to account at the call site") Signed-off-by: Heiko Carstens Acked-by: Alexander Gordeev Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/pgalloc.h | 30 +++++++++++++++++++----------- arch/s390/mm/pgalloc.c | 12 ++++++------ 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 5345398df65342..a16e650723719a 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -19,12 +19,16 @@ #define CRST_ALLOC_ORDER 2 -unsigned long *crst_table_alloc(struct mm_struct *); +unsigned long *crst_table_alloc_noprof(struct mm_struct *); +#define crst_table_alloc(...) 
alloc_hooks(crst_table_alloc_noprof(__VA_ARGS__)) void crst_table_free(struct mm_struct *, unsigned long *); -unsigned long *page_table_alloc(struct mm_struct *); -struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm); +unsigned long *page_table_alloc_noprof(struct mm_struct *); +#define page_table_alloc(...) alloc_hooks(page_table_alloc_noprof(__VA_ARGS__)) void page_table_free(struct mm_struct *, unsigned long *); + +struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm); +#define page_table_alloc_pgste(...) alloc_hooks(page_table_alloc_pgste_noprof(__VA_ARGS__)) void page_table_free_pgste(struct ptdesc *ptdesc); static inline void crst_table_init(unsigned long *crst, unsigned long entry) @@ -48,9 +52,9 @@ static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long return addr; } -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) +static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long address) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -59,6 +63,7 @@ static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) return (p4d_t *) table; } +#define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { @@ -69,9 +74,9 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) crst_table_free(mm, (unsigned long *) p4d); } -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long address) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -80,6 +85,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) return (pud_t *) table; } +#define pud_alloc_one(...) alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__)) static inline void pud_free(struct mm_struct *mm, pud_t *pud) { @@ -90,9 +96,9 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) crst_table_free(mm, (unsigned long *) pud); } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -103,6 +109,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) } return (pmd_t *) table; } +#define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { @@ -127,9 +134,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) set_pud(pud, __pud(_REGION3_ENTRY | __pa(pmd))); } -static inline pgd_t *pgd_alloc(struct mm_struct *mm) +static inline pgd_t *pgd_alloc_noprof(struct mm_struct *mm) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -137,6 +144,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return (pgd_t *) table; } +#define pgd_alloc(...) 
alloc_hooks(pgd_alloc_noprof(__VA_ARGS__)) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index ad3e0f7f7fc1f9..36700384fe6bdd 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -14,7 +14,7 @@ #include #include -unsigned long *crst_table_alloc(struct mm_struct *mm) +unsigned long *crst_table_alloc_noprof(struct mm_struct *mm) { gfp_t gfp = GFP_KERNEL_ACCOUNT; struct ptdesc *ptdesc; @@ -22,7 +22,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - ptdesc = pagetable_alloc(gfp, CRST_ALLOC_ORDER); + ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER); if (!ptdesc) return NULL; table = ptdesc_to_virt(ptdesc); @@ -116,12 +116,12 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) #ifdef CONFIG_PGSTE -struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) +struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm) { struct ptdesc *ptdesc; u64 *table; - ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, 0); + ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0); if (ptdesc) { table = (u64 *)ptdesc_to_virt(ptdesc); __arch_set_page_dat(table, 1); @@ -138,7 +138,7 @@ void page_table_free_pgste(struct ptdesc *ptdesc) #endif /* CONFIG_PGSTE */ -unsigned long *page_table_alloc(struct mm_struct *mm) +unsigned long *page_table_alloc_noprof(struct mm_struct *mm) { gfp_t gfp = GFP_KERNEL_ACCOUNT; struct ptdesc *ptdesc; @@ -146,7 +146,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - ptdesc = pagetable_alloc(gfp, 0); + ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(mm, ptdesc)) { From d53ea977adf913a6e5024323e6b7e02326d4453c Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 24 Sep 2025 18:33:58 -0700 Subject: [PATCH 2830/3206] rust: pci: display symbolic PCI class names The Display implementation for Class was forwarding directly to Debug printing, resulting in raw hex values instead of PCI Class strings. Improve things by doing a stringify!() call for each PCI Class item. This now prints symbolic names such as "DISPLAY_VGA", instead of "Class(0x030000)". It still falls back to Debug formatting for unknown class values. Suggested-by: Danilo Krummrich Signed-off-by: John Hubbard Reviewed-by: Alexandre Courbot Signed-off-by: Danilo Krummrich --- rust/kernel/pci/id.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/rust/kernel/pci/id.rs b/rust/kernel/pci/id.rs index 8ee1dc5c3057f5..6e081de30faf03 100644 --- a/rust/kernel/pci/id.rs +++ b/rust/kernel/pci/id.rs @@ -50,6 +50,17 @@ macro_rules! define_all_pci_classes { pub const $variant: Self = Self(Self::to_24bit_class($binding)); )+ } + + impl fmt::Display for Class { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + $( + &Self::$variant => write!(f, stringify!($variant)), + )+ + _ => ::fmt(self, f), + } + } + } }; } @@ -87,12 +98,6 @@ impl fmt::Debug for Class { } } -impl fmt::Display for Class { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - ::fmt(self, f) - } -} - impl ClassMask { /// Get the raw mask value. 
 #[inline]

From 6d97171ac6585de698df019b0bfea3f123fd8385 Mon Sep 17 00:00:00 2001
From: John Hubbard
Date: Wed, 24 Sep 2025 18:33:59 -0700
Subject: [PATCH 2831/3206] rust: pci: display symbolic PCI vendor names

The Display implementation for Vendor was forwarding directly to Debug
printing, resulting in raw hex values instead of PCI Vendor strings.

Improve things by doing a stringify!() call for each PCI Vendor item.
This now prints symbolic names such as "NVIDIA", instead of
"Vendor(0x10de)". It still falls back to Debug formatting for unknown
vendor values.

Suggested-by: Danilo Krummrich
Signed-off-by: John Hubbard
Reviewed-by: Alexandre Courbot
[ Remove #[inline] for Vendor::fmt(). - Danilo ]
Signed-off-by: Danilo Krummrich
---
 rust/kernel/pci/id.rs | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/rust/kernel/pci/id.rs b/rust/kernel/pci/id.rs
index 6e081de30faf03..7f2a7f57507f24 100644
--- a/rust/kernel/pci/id.rs
+++ b/rust/kernel/pci/id.rs
@@ -135,6 +135,17 @@ macro_rules! define_all_pci_vendors {
                 pub const $variant: Self = Self($binding as u16);
             )+
         }
+
+        impl fmt::Display for Vendor {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                match self {
+                    $(
+                        &Self::$variant => write!(f, stringify!($variant)),
+                    )+
+                    _ => ::fmt(self, f),
+                }
+            }
+        }
     };
 }
@@ -160,13 +171,6 @@ impl fmt::Debug for Vendor {
     }
 }
 
-impl fmt::Display for Vendor {
-    #[inline]
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        ::fmt(self, f)
-    }
-}
-
 define_all_pci_classes! {
     NOT_DEFINED = bindings::PCI_CLASS_NOT_DEFINED, // 0x000000
     NOT_DEFINED_VGA = bindings::PCI_CLASS_NOT_DEFINED_VGA, // 0x000100

From 31c092baea5a1c5473d2183c2f2a36c76d083c59 Mon Sep 17 00:00:00 2001
From: Mohamad Kamal
Date: Sun, 14 Sep 2025 10:40:10 +0200
Subject: [PATCH 2832/3206] hwmon: (asus-ec-sensors) add TUF GAMING X670E PLUS
 WIFI

Add support for the TUF GAMING X670E PLUS WIFI.

Signed-off-by: Mohamad Kamal
Signed-off-by: Eugene Shalygin
Link: https://lore.kernel.org/r/20250914084019.1108941-1-eugene.shalygin@gmail.com
Signed-off-by: Guenter Roeck
---
 Documentation/hwmon/asus_ec_sensors.rst | 1 +
 drivers/hwmon/asus-ec-sensors.c         | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst
index 5801bc4279c0c6..a5a58c00c32234 100644
--- a/Documentation/hwmon/asus_ec_sensors.rst
+++ b/Documentation/hwmon/asus_ec_sensors.rst
@@ -46,6 +46,7 @@ Supported boards:
  * ROG ZENITH II EXTREME
  * ROG ZENITH II EXTREME ALPHA
  * TUF GAMING X670E PLUS
+ * TUF GAMING X670E PLUS WIFI
 
 Authors:
     - Eugene Shalygin
diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c
index e27a7b08e7b777..34a8f6b834c97f 100644
--- a/drivers/hwmon/asus-ec-sensors.c
+++ b/drivers/hwmon/asus-ec-sensors.c
@@ -805,6 +805,8 @@ static const struct dmi_system_id dmi_table[] = {
 					&board_info_zenith_ii_extreme),
 	DMI_EXACT_MATCH_ASUS_BOARD_NAME("TUF GAMING X670E-PLUS",
 					&board_info_tuf_gaming_x670e_plus),
+	DMI_EXACT_MATCH_ASUS_BOARD_NAME("TUF GAMING X670E-PLUS WIFI",
+					&board_info_tuf_gaming_x670e_plus),
 	{},
 };

From 0f6eae86e626e0561d2f545a3183be2e12108410 Mon Sep 17 00:00:00 2001
From: Michael Walle
Date: Fri, 12 Sep 2025 14:07:42 +0200
Subject: [PATCH 2833/3206] dt-bindings: hwmon: sl28cpld: add sa67mcu
 compatible

The Kontron SMARC-sAM67 module features an on-board housekeeping uC. It
is designed to be compatible with the older sl28cpld implementation,
but has different sensors, like voltage and temperature monitoring.
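A hypothetical device-tree node using the new compatible might look
like this (the unit address and reg value are illustrative only; the
driver reads the offset via the "reg" property, as the driver patch
below shows):

  hwmon@40 {
      compatible = "kontron,sa67mcu-hwmon";
      reg = <0x40>;
  };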
Add a new compatible for that board.

Signed-off-by: Michael Walle
Acked-by: Krzysztof Kozlowski
Link: https://lore.kernel.org/r/20250912120745.2295115-5-mwalle@kernel.org
Signed-off-by: Guenter Roeck
---
 .../devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml b/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml
index 010333cb25c0e1..ec20625db4545e 100644
--- a/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml
+++ b/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml
@@ -16,6 +16,7 @@ description: |
 properties:
   compatible:
     enum:
+      - kontron,sa67mcu-hwmon
      - kontron,sl28cpld-fan
 
   reg:

From 443b39c82c322c9f3c38bea0389fe927ba00b3b4 Mon Sep 17 00:00:00 2001
From: Michael Walle
Date: Fri, 12 Sep 2025 14:07:44 +0200
Subject: [PATCH 2834/3206] hwmon: add SMARC-sAM67 support

Add a new driver for the Kontron SMARC-sAM67 board management
controller. It has two voltage sensors and one temperature sensor.

Signed-off-by: Michael Walle
Link: https://lore.kernel.org/r/20250912120745.2295115-7-mwalle@kernel.org
[groeck: Added sa67 to index.rst]
Signed-off-by: Guenter Roeck
---
 Documentation/hwmon/index.rst |   1 +
 Documentation/hwmon/sa67.rst  |  41 +++++++++
 MAINTAINERS                   |   1 +
 drivers/hwmon/Kconfig         |  10 +++
 drivers/hwmon/Makefile        |   1 +
 drivers/hwmon/sa67mcu-hwmon.c | 161 ++++++++++++++++++++++++++++++++++
 6 files changed, 215 insertions(+)
 create mode 100644 Documentation/hwmon/sa67.rst
 create mode 100644 drivers/hwmon/sa67mcu-hwmon.c

diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst
index 1803a43f6216b2..51a5bdf75b0865 100644
--- a/Documentation/hwmon/index.rst
+++ b/Documentation/hwmon/index.rst
@@ -214,6 +214,7 @@ Hardware Monitoring Kernel Drivers
    q54sj108a2
    qnap-mcu-hwmon
    raspberrypi-hwmon
+   sa67
    sbrmi
    sbtsi_temp
    sch5627
diff --git a/Documentation/hwmon/sa67.rst b/Documentation/hwmon/sa67.rst
new file mode 100644
index 00000000000000..029c7c169b7fd2
--- /dev/null
+++ b/Documentation/hwmon/sa67.rst
@@ -0,0 +1,41 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+Kernel driver sa67mcu
+=====================
+
+Supported chips:
+
+  * Kontron sa67mcu
+
+    Prefix: 'sa67mcu'
+
+    Datasheet: not available
+
+Authors: Michael Walle
+
+Description
+-----------
+
+The sa67mcu is a board management controller which also exposes a hardware
+monitoring controller.
+
+The controller has two voltage and one temperature sensor. The values are
+held in two 8 bit registers to form one 16 bit value. Reading the lower byte
+will also capture the high byte to make the access atomic. The unit of the
+voltage sensors is 1mV and the unit of the temperature sensor is 0.1degC.
+
+Sysfs entries
+-------------
+
+The following attributes are supported.
+
+======================= ========================================================
+in0_label               "VDDIN"
+in0_input               Measured VDDIN voltage.
+
+in1_label               "VDD_RTC"
+in1_input               Measured VDD_RTC voltage.
+
+temp1_input             MCU temperature. Roughly the board temperature.
+======================= ======================================================== + diff --git a/MAINTAINERS b/MAINTAINERS index 5642fd647326b5..8e314837eaa416 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23231,6 +23231,7 @@ F: Documentation/devicetree/bindings/mfd/kontron,sl28cpld.yaml F: Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml F: Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml F: drivers/gpio/gpio-sl28cpld.c +F: drivers/hwmon/sa67mcu-hwmon.c F: drivers/hwmon/sl28cpld-hwmon.c F: drivers/irqchip/irq-sl28cpld.c F: drivers/pwm/pwm-sl28cpld.c diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index d61014d7968a4a..374590fbadac0b 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -1905,6 +1905,16 @@ config SENSORS_RASPBERRYPI_HWMON This driver can also be built as a module. If so, the module will be called raspberrypi-hwmon. +config SENSORS_SA67MCU + tristate "Kontron sa67mcu hardware monitoring driver" + depends on MFD_SL28CPLD || COMPILE_TEST + help + If you say yes here you get support for the voltage and temperature + monitor of the sa67 board management controller. + + This driver can also be built as a module. If so, the module + will be called sa67mcu-hwmon. + config SENSORS_SL28CPLD tristate "Kontron sl28cpld hardware monitoring driver" depends on MFD_SL28CPLD || COMPILE_TEST diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 051981eb8a5089..2956898776bb17 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -197,6 +197,7 @@ obj-$(CONFIG_SENSORS_PT5161L) += pt5161l.o obj-$(CONFIG_SENSORS_PWM_FAN) += pwm-fan.o obj-$(CONFIG_SENSORS_QNAP_MCU_HWMON) += qnap-mcu-hwmon.o obj-$(CONFIG_SENSORS_RASPBERRYPI_HWMON) += raspberrypi-hwmon.o +obj-$(CONFIG_SENSORS_SA67MCU) += sa67mcu-hwmon.o obj-$(CONFIG_SENSORS_SBTSI) += sbtsi_temp.o obj-$(CONFIG_SENSORS_SBRMI) += sbrmi.o obj-$(CONFIG_SENSORS_SCH56XX_COMMON)+= sch56xx-common.o diff --git a/drivers/hwmon/sa67mcu-hwmon.c b/drivers/hwmon/sa67mcu-hwmon.c new file mode 100644 index 00000000000000..22f703b7b25609 --- /dev/null +++ b/drivers/hwmon/sa67mcu-hwmon.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sa67mcu hardware monitoring driver + * + * Copyright 2025 Kontron Europe GmbH + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define SA67MCU_VOLTAGE(n) (0x00 + ((n) * 2)) +#define SA67MCU_TEMP(n) (0x04 + ((n) * 2)) + +struct sa67mcu_hwmon { + struct regmap *regmap; + u32 offset; +}; + +static int sa67mcu_hwmon_read(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, long *input) +{ + struct sa67mcu_hwmon *hwmon = dev_get_drvdata(dev); + unsigned int offset; + u8 reg[2]; + int ret; + + switch (type) { + case hwmon_in: + switch (attr) { + case hwmon_in_input: + offset = hwmon->offset + SA67MCU_VOLTAGE(channel); + break; + default: + return -EOPNOTSUPP; + } + break; + case hwmon_temp: + switch (attr) { + case hwmon_temp_input: + offset = hwmon->offset + SA67MCU_TEMP(channel); + break; + default: + return -EOPNOTSUPP; + } + break; + default: + return -EOPNOTSUPP; + } + + /* Reading the low byte will capture the value */ + ret = regmap_bulk_read(hwmon->regmap, offset, reg, ARRAY_SIZE(reg)); + if (ret) + return ret; + + *input = reg[1] << 8 | reg[0]; + + /* Temperatures are s16 and in 0.1degC steps.
*/ + if (type == hwmon_temp) + *input = sign_extend32(*input, 15) * 100; + + return 0; +} + +static const struct hwmon_channel_info * const sa67mcu_hwmon_info[] = { + HWMON_CHANNEL_INFO(in, + HWMON_I_INPUT | HWMON_I_LABEL, + HWMON_I_INPUT | HWMON_I_LABEL), + HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT), + NULL +}; + +static const char *const sa67mcu_hwmon_in_labels[] = { + "VDDIN", + "VDD_RTC", +}; + +static int sa67mcu_hwmon_read_string(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, const char **str) +{ + switch (type) { + case hwmon_in: + switch (attr) { + case hwmon_in_label: + *str = sa67mcu_hwmon_in_labels[channel]; + return 0; + default: + return -EOPNOTSUPP; + } + default: + return -EOPNOTSUPP; + } +} + +static const struct hwmon_ops sa67mcu_hwmon_ops = { + .visible = 0444, + .read = sa67mcu_hwmon_read, + .read_string = sa67mcu_hwmon_read_string, +}; + +static const struct hwmon_chip_info sa67mcu_hwmon_chip_info = { + .ops = &sa67mcu_hwmon_ops, + .info = sa67mcu_hwmon_info, +}; + +static int sa67mcu_hwmon_probe(struct platform_device *pdev) +{ + struct sa67mcu_hwmon *hwmon; + struct device *hwmon_dev; + int ret; + + if (!pdev->dev.parent) + return -ENODEV; + + hwmon = devm_kzalloc(&pdev->dev, sizeof(*hwmon), GFP_KERNEL); + if (!hwmon) + return -ENOMEM; + + hwmon->regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!hwmon->regmap) + return -ENODEV; + + ret = device_property_read_u32(&pdev->dev, "reg", &hwmon->offset); + if (ret) + return -EINVAL; + + hwmon_dev = devm_hwmon_device_register_with_info(&pdev->dev, + "sa67mcu_hwmon", hwmon, + &sa67mcu_hwmon_chip_info, + NULL); + if (IS_ERR(hwmon_dev)) + dev_err(&pdev->dev, "failed to register as hwmon device"); + + return PTR_ERR_OR_ZERO(hwmon_dev); +} + +static const struct of_device_id sa67mcu_hwmon_of_match[] = { + { .compatible = "kontron,sa67mcu-hwmon", }, + {} +}; +MODULE_DEVICE_TABLE(of, sa67mcu_hwmon_of_match); + +static struct platform_driver sa67mcu_hwmon_driver = { + .probe = sa67mcu_hwmon_probe, + .driver = { + .name = "sa67mcu-hwmon", + .of_match_table = sa67mcu_hwmon_of_match, + }, +}; +module_platform_driver(sa67mcu_hwmon_driver); + +MODULE_DESCRIPTION("sa67mcu Hardware Monitoring Driver"); +MODULE_AUTHOR("Michael Walle "); +MODULE_LICENSE("GPL"); From 60ac65a31041b0e5dfd736a79027314b9d533ef5 Mon Sep 17 00:00:00 2001 From: Sung-Chi Li Date: Thu, 11 Sep 2025 06:56:34 +0000 Subject: [PATCH 2835/3206] platform/chrome: update pwm fan control host commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update cros_ec_commands.h to include definitions for getting PWM fan duty, getting and setting the fan control mode. 
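For illustration (a sketch, not part of the patch: the wrapper name is hypothetical, and command version 0 is assumed, as probed by the hwmon driver in the follow-up patch), a kernel-side caller could read one fan's duty cycle through the existing cros_ec_cmd() helper roughly like this:

static int example_read_fan_duty(struct cros_ec_device *cros_ec,
				 u8 fan_idx, u32 *percent)
{
	struct ec_params_pwm_get_fan_duty req = {
		.fan_idx = fan_idx,
	};
	struct ec_response_pwm_get_fan_duty resp;
	int ret;

	/* version 0 of EC_CMD_PWM_GET_FAN_DUTY */
	ret = cros_ec_cmd(cros_ec, 0, EC_CMD_PWM_GET_FAN_DUTY,
			  &req, sizeof(req), &resp, sizeof(resp));
	if (ret < 0)
		return ret;

	*percent = le32_to_cpu(resp.percent);	/* duty cycle, 0..100 */
	return 0;
}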
Signed-off-by: Sung-Chi Li Acked-by: Tzung-Bi Shih Reviewed-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250911-cros_ec_fan-v6-1-a1446cc098af@google.com Signed-off-by: Guenter Roeck --- .../linux/platform_data/cros_ec_commands.h | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index c19b404e3d8d9e..69294f79cc88aa 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -1825,6 +1825,16 @@ struct ec_response_pwm_get_duty { uint16_t duty; /* Duty cycle, EC_PWM_MAX_DUTY = 100% */ } __ec_align2; +#define EC_CMD_PWM_GET_FAN_DUTY 0x0027 + +struct ec_params_pwm_get_fan_duty { + uint8_t fan_idx; +} __ec_align1; + +struct ec_response_pwm_get_fan_duty { + uint32_t percent; /* Percentage of duty cycle, ranging from 0 ~ 100 */ +} __ec_align4; + /*****************************************************************************/ /* * Lightbar commands. This looks worse than it is. Since we only use one HOST @@ -3127,14 +3137,31 @@ struct ec_params_thermal_set_threshold_v1 { /****************************************************************************/ -/* Toggle automatic fan control */ +/* Set or get fan control mode */ #define EC_CMD_THERMAL_AUTO_FAN_CTRL 0x0052 +enum ec_auto_fan_ctrl_cmd { + EC_AUTO_FAN_CONTROL_CMD_SET = 0, + EC_AUTO_FAN_CONTROL_CMD_GET, +}; + /* Version 1 of input params */ struct ec_params_auto_fan_ctrl_v1 { uint8_t fan_idx; } __ec_align1; +/* Version 2 of input params */ +struct ec_params_auto_fan_ctrl_v2 { + uint8_t fan_idx; + uint8_t cmd; /* enum ec_auto_fan_ctrl_cmd */ + uint8_t set_auto; /* only used with EC_AUTO_FAN_CONTROL_CMD_SET - bool + */ +} __ec_align4; + +struct ec_response_auto_fan_control { + uint8_t is_auto; /* bool */ +} __ec_align1; + /* Get/Set TMP006 calibration data */ #define EC_CMD_TMP006_GET_CALIBRATION 0x0053 #define EC_CMD_TMP006_SET_CALIBRATION 0x0054 From fb8e659309f72e54ed011c6bfe98597b9236805d Mon Sep 17 00:00:00 2001 From: Sung-Chi Li Date: Thu, 11 Sep 2025 06:56:35 +0000 Subject: [PATCH 2836/3206] hwmon: (cros_ec) add PWM control over fans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer EC firmware supports controlling fans through host commands, so add corresponding implementations to the driver so that other kernel services and userspace can control these fans. The driver first probes the supported host command versions (get and set of fan PWM values, get and set of fan control mode) to see if the connected EC fulfills the requirements for controlling the fan, then exposes corresponding sysfs nodes, with matching read and write implementations, for userspace to control the fan. As the EC will automatically change the fan mode to auto when the device is suspended, power management hooks are added as well to keep the fan control mode and fan PWM value consistent across suspend and resume. As we need to access the hwmon device in the power management hook, update the driver to store the hwmon device in the driver data as well.
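As a sketch of the version probing described above (mirroring the helper the driver patch below adds; cros_ec_get_cmd_versions() returns a bitmap of supported versions, or a negative error), a command/version pair can be checked like this:

static bool example_cmd_available(struct cros_ec_device *cros_ec,
				  u16 cmd, u8 version)
{
	int ret = cros_ec_get_cmd_versions(cros_ec, cmd);

	return ret >= 0 && (ret & EC_VER_MASK(version));
}

Fan control is only advertised when the get-duty, set-duty and auto-fan-control commands all pass this check at the required versions.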
Signed-off-by: Sung-Chi Li Acked-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250911-cros_ec_fan-v6-2-a1446cc098af@google.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/cros_ec_hwmon.rst | 5 +- drivers/hwmon/cros_ec_hwmon.c | 230 ++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 1 deletion(-) diff --git a/Documentation/hwmon/cros_ec_hwmon.rst b/Documentation/hwmon/cros_ec_hwmon.rst index 47ecae983bdbef..355557a08c9a54 100644 --- a/Documentation/hwmon/cros_ec_hwmon.rst +++ b/Documentation/hwmon/cros_ec_hwmon.rst @@ -23,4 +23,7 @@ ChromeOS embedded controller used in Chromebooks and other devices. The channel labels exposed via hwmon are retrieved from the EC itself. -Fan and temperature readings are supported. +Fan and temperature readings are supported. PWM fan control is also supported if +the EC also supports setting fan PWM values and fan mode. Note that EC will +switch fan control mode back to auto when suspended. This driver will restore +the fan state to what they were before suspended when resumed. diff --git a/drivers/hwmon/cros_ec_hwmon.c b/drivers/hwmon/cros_ec_hwmon.c index 9991c3fa020ac8..9eddc554ddefde 100644 --- a/drivers/hwmon/cros_ec_hwmon.c +++ b/drivers/hwmon/cros_ec_hwmon.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -17,10 +18,17 @@ #define DRV_NAME "cros-ec-hwmon" +#define CROS_EC_HWMON_PWM_GET_FAN_DUTY_CMD_VERSION 0 +#define CROS_EC_HWMON_PWM_SET_FAN_DUTY_CMD_VERSION 1 +#define CROS_EC_HWMON_THERMAL_AUTO_FAN_CTRL_CMD_VERSION 2 + struct cros_ec_hwmon_priv { struct cros_ec_device *cros_ec; const char *temp_sensor_names[EC_TEMP_SENSOR_ENTRIES + EC_TEMP_SENSOR_B_ENTRIES]; u8 usable_fans; + bool fan_control_supported; + u8 manual_fans; /* bits to indicate whether the fan is set to manual */ + u8 manual_fan_pwm[EC_FAN_SPEED_ENTRIES]; }; static int cros_ec_hwmon_read_fan_speed(struct cros_ec_device *cros_ec, u8 index, u16 *speed) @@ -36,6 +44,42 @@ static int cros_ec_hwmon_read_fan_speed(struct cros_ec_device *cros_ec, u8 index return 0; } +static int cros_ec_hwmon_read_pwm_value(struct cros_ec_device *cros_ec, u8 index, u8 *pwm_value) +{ + struct ec_params_pwm_get_fan_duty req = { + .fan_idx = index, + }; + struct ec_response_pwm_get_fan_duty resp; + int ret; + + ret = cros_ec_cmd(cros_ec, CROS_EC_HWMON_PWM_GET_FAN_DUTY_CMD_VERSION, + EC_CMD_PWM_GET_FAN_DUTY, &req, sizeof(req), &resp, sizeof(resp)); + if (ret < 0) + return ret; + + *pwm_value = (u8)DIV_ROUND_CLOSEST(le32_to_cpu(resp.percent) * 255, 100); + return 0; +} + +static int cros_ec_hwmon_read_pwm_enable(struct cros_ec_device *cros_ec, u8 index, + u8 *control_method) +{ + struct ec_params_auto_fan_ctrl_v2 req = { + .cmd = EC_AUTO_FAN_CONTROL_CMD_GET, + .fan_idx = index, + }; + struct ec_response_auto_fan_control resp; + int ret; + + ret = cros_ec_cmd(cros_ec, CROS_EC_HWMON_THERMAL_AUTO_FAN_CTRL_CMD_VERSION, + EC_CMD_THERMAL_AUTO_FAN_CTRL, &req, sizeof(req), &resp, sizeof(resp)); + if (ret < 0) + return ret; + + *control_method = resp.is_auto ? 
2 : 1; + return 0; +} + static int cros_ec_hwmon_read_temp(struct cros_ec_device *cros_ec, u8 index, u8 *temp) { unsigned int offset; @@ -75,6 +119,8 @@ static int cros_ec_hwmon_read(struct device *dev, enum hwmon_sensor_types type, { struct cros_ec_hwmon_priv *priv = dev_get_drvdata(dev); int ret = -EOPNOTSUPP; + u8 control_method; + u8 pwm_value; u16 speed; u8 temp; @@ -92,6 +138,17 @@ static int cros_ec_hwmon_read(struct device *dev, enum hwmon_sensor_types type, if (ret == 0) *val = cros_ec_hwmon_is_error_fan(speed); } + } else if (type == hwmon_pwm) { + if (attr == hwmon_pwm_enable) { + ret = cros_ec_hwmon_read_pwm_enable(priv->cros_ec, channel, + &control_method); + if (ret == 0) + *val = control_method; + } else if (attr == hwmon_pwm_input) { + ret = cros_ec_hwmon_read_pwm_value(priv->cros_ec, channel, &pwm_value); + if (ret == 0) + *val = pwm_value; + } } else if (type == hwmon_temp) { if (attr == hwmon_temp_input) { ret = cros_ec_hwmon_read_temp(priv->cros_ec, channel, &temp); @@ -124,6 +181,74 @@ static int cros_ec_hwmon_read_string(struct device *dev, enum hwmon_sensor_types return -EOPNOTSUPP; } +static int cros_ec_hwmon_set_fan_pwm_val(struct cros_ec_device *cros_ec, u8 index, u8 val) +{ + struct ec_params_pwm_set_fan_duty_v1 req = { + .fan_idx = index, + .percent = DIV_ROUND_CLOSEST((uint32_t)val * 100, 255), + }; + int ret; + + ret = cros_ec_cmd(cros_ec, CROS_EC_HWMON_PWM_SET_FAN_DUTY_CMD_VERSION, + EC_CMD_PWM_SET_FAN_DUTY, &req, sizeof(req), NULL, 0); + if (ret < 0) + return ret; + return 0; +} + +static int cros_ec_hwmon_write_pwm_input(struct cros_ec_device *cros_ec, u8 index, u8 val) +{ + u8 control_method; + int ret; + + ret = cros_ec_hwmon_read_pwm_enable(cros_ec, index, &control_method); + if (ret) + return ret; + if (control_method != 1) + return -EOPNOTSUPP; + + return cros_ec_hwmon_set_fan_pwm_val(cros_ec, index, val); +} + +static int cros_ec_hwmon_write_pwm_enable(struct cros_ec_device *cros_ec, u8 index, u8 val) +{ + struct ec_params_auto_fan_ctrl_v2 req = { + .fan_idx = index, + .cmd = EC_AUTO_FAN_CONTROL_CMD_SET, + }; + int ret; + + /* No CrOS EC supports no fan speed control */ + if (val == 0) + return -EOPNOTSUPP; + + req.set_auto = (val != 1) ? 
true : false; + ret = cros_ec_cmd(cros_ec, CROS_EC_HWMON_THERMAL_AUTO_FAN_CTRL_CMD_VERSION, + EC_CMD_THERMAL_AUTO_FAN_CTRL, &req, sizeof(req), NULL, 0); + if (ret < 0) + return ret; + return 0; +} + +static int cros_ec_hwmon_write(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, long val) +{ + struct cros_ec_hwmon_priv *priv = dev_get_drvdata(dev); + + if (type == hwmon_pwm) { + switch (attr) { + case hwmon_pwm_input: + return cros_ec_hwmon_write_pwm_input(priv->cros_ec, channel, val); + case hwmon_pwm_enable: + return cros_ec_hwmon_write_pwm_enable(priv->cros_ec, channel, val); + default: + return -EOPNOTSUPP; + } + } + + return -EOPNOTSUPP; +} + static umode_t cros_ec_hwmon_is_visible(const void *data, enum hwmon_sensor_types type, u32 attr, int channel) { @@ -132,6 +257,9 @@ static umode_t cros_ec_hwmon_is_visible(const void *data, enum hwmon_sensor_type if (type == hwmon_fan) { if (priv->usable_fans & BIT(channel)) return 0444; + } else if (type == hwmon_pwm) { + if (priv->fan_control_supported && priv->usable_fans & BIT(channel)) + return 0644; } else if (type == hwmon_temp) { if (priv->temp_sensor_names[channel]) return 0444; @@ -147,6 +275,11 @@ static const struct hwmon_channel_info * const cros_ec_hwmon_info[] = { HWMON_F_INPUT | HWMON_F_FAULT, HWMON_F_INPUT | HWMON_F_FAULT, HWMON_F_INPUT | HWMON_F_FAULT), + HWMON_CHANNEL_INFO(pwm, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE), HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_FAULT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_FAULT | HWMON_T_LABEL, @@ -178,6 +311,7 @@ static const struct hwmon_channel_info * const cros_ec_hwmon_info[] = { static const struct hwmon_ops cros_ec_hwmon_ops = { .read = cros_ec_hwmon_read, .read_string = cros_ec_hwmon_read_string, + .write = cros_ec_hwmon_write, .is_visible = cros_ec_hwmon_is_visible, }; @@ -233,6 +367,25 @@ static void cros_ec_hwmon_probe_fans(struct cros_ec_hwmon_priv *priv) } } +static inline bool is_cros_ec_cmd_available(struct cros_ec_device *cros_ec, + u16 cmd, u8 version) +{ + int ret; + + ret = cros_ec_get_cmd_versions(cros_ec, cmd); + return ret >= 0 && (ret & EC_VER_MASK(version)); +} + +static bool cros_ec_hwmon_probe_fan_control_supported(struct cros_ec_device *cros_ec) +{ + return is_cros_ec_cmd_available(cros_ec, EC_CMD_PWM_GET_FAN_DUTY, + CROS_EC_HWMON_PWM_GET_FAN_DUTY_CMD_VERSION) && + is_cros_ec_cmd_available(cros_ec, EC_CMD_PWM_SET_FAN_DUTY, + CROS_EC_HWMON_PWM_SET_FAN_DUTY_CMD_VERSION) && + is_cros_ec_cmd_available(cros_ec, EC_CMD_THERMAL_AUTO_FAN_CTRL, + CROS_EC_HWMON_THERMAL_AUTO_FAN_CTRL_CMD_VERSION); +} + static int cros_ec_hwmon_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -259,13 +412,88 @@ static int cros_ec_hwmon_probe(struct platform_device *pdev) cros_ec_hwmon_probe_temp_sensors(dev, priv, thermal_version); cros_ec_hwmon_probe_fans(priv); + priv->fan_control_supported = cros_ec_hwmon_probe_fan_control_supported(priv->cros_ec); hwmon_dev = devm_hwmon_device_register_with_info(dev, "cros_ec", priv, &cros_ec_hwmon_chip_info, NULL); + platform_set_drvdata(pdev, priv); return PTR_ERR_OR_ZERO(hwmon_dev); } +static int cros_ec_hwmon_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct cros_ec_hwmon_priv *priv = platform_get_drvdata(pdev); + u8 control_method; + size_t i; + int ret; + + if (!priv->fan_control_supported) + return 0; + + /* EC sets fan control to auto after suspended, store 
settings before suspending. */ + for (i = 0; i < EC_FAN_SPEED_ENTRIES; i++) { + if (!(priv->usable_fans & BIT(i))) + continue; + + ret = cros_ec_hwmon_read_pwm_enable(priv->cros_ec, i, &control_method); + if (ret) { + dev_warn(&pdev->dev, "failed to get mode setting for fan %zu: %d\n", i, + ret); + continue; + } + + if (control_method != 1) { + priv->manual_fans &= ~BIT(i); + continue; + } else { + priv->manual_fans |= BIT(i); + } + + ret = cros_ec_hwmon_read_pwm_value(priv->cros_ec, i, &priv->manual_fan_pwm[i]); + /* + * If storing the value failed, invalidate the stored mode value by setting it + * to auto control. EC will automatically switch to auto mode for that fan after + * suspended. + */ + if (ret) { + dev_warn(&pdev->dev, "failed to get PWM setting for fan %zu: %pe\n", i, + ERR_PTR(ret)); + priv->manual_fans &= ~BIT(i); + continue; + } + } + + return 0; +} + +static int cros_ec_hwmon_resume(struct platform_device *pdev) +{ + const struct cros_ec_hwmon_priv *priv = platform_get_drvdata(pdev); + size_t i; + int ret; + + if (!priv->fan_control_supported) + return 0; + + /* EC sets fan control to auto after suspend, restore to settings before suspend. */ + for (i = 0; i < EC_FAN_SPEED_ENTRIES; i++) { + if (!(priv->manual_fans & BIT(i))) + continue; + + /* + * Setting fan PWM value to EC will change the mode to manual for that fan in EC as + * well, so we do not need to issue a separate fan mode to manual call. + */ + ret = cros_ec_hwmon_set_fan_pwm_val(priv->cros_ec, i, priv->manual_fan_pwm[i]); + if (ret) + dev_warn(&pdev->dev, "failed to restore settings for fan %zu: %pe\n", i, + ERR_PTR(ret)); + } + + return 0; +} + static const struct platform_device_id cros_ec_hwmon_id[] = { { DRV_NAME, 0 }, {} @@ -274,6 +502,8 @@ static const struct platform_device_id cros_ec_hwmon_id[] = { static struct platform_driver cros_ec_hwmon_driver = { .driver.name = DRV_NAME, .probe = cros_ec_hwmon_probe, + .suspend = pm_ptr(cros_ec_hwmon_suspend), + .resume = pm_ptr(cros_ec_hwmon_resume), .id_table = cros_ec_hwmon_id, }; module_platform_driver(cros_ec_hwmon_driver); From 5798b62867b47b6ace287d31172ce748ad70d869 Mon Sep 17 00:00:00 2001 From: Sung-Chi Li Date: Thu, 11 Sep 2025 06:56:36 +0000 Subject: [PATCH 2837/3206] hwmon: (cros_ec) register fans into thermal framework cooling devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register fans connected under EC as thermal cooling devices as well, so these fans can then work with the thermal framework. During the driver probing phase, we will also try to register each fan as a thermal cooling device based on the previous probe results (whether there are fans connected on that channel, and whether the EC supports fan control). The basic get max state, get current state, and set current state methods are then implemented as well. Signed-off-by: Sung-Chi Li Acked-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250911-cros_ec_fan-v6-3-a1446cc098af@google.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/cros_ec_hwmon.rst | 2 + drivers/hwmon/cros_ec_hwmon.c | 83 +++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/Documentation/hwmon/cros_ec_hwmon.rst b/Documentation/hwmon/cros_ec_hwmon.rst index 355557a08c9a54..6db812708325f7 100644 --- a/Documentation/hwmon/cros_ec_hwmon.rst +++ b/Documentation/hwmon/cros_ec_hwmon.rst @@ -27,3 +27,5 @@ Fan and temperature readings are supported. PWM fan control is also supported if the EC also supports setting fan PWM values and fan mode.
Note that EC will switch fan control mode back to auto when suspended. This driver will restore the fan state to what they were before suspended when resumed. +If a fan is controllable, this driver will register that fan as a cooling device +in the thermal framework as well. diff --git a/drivers/hwmon/cros_ec_hwmon.c b/drivers/hwmon/cros_ec_hwmon.c index 9eddc554ddefde..48331703f2f50d 100644 --- a/drivers/hwmon/cros_ec_hwmon.c +++ b/drivers/hwmon/cros_ec_hwmon.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,11 @@ struct cros_ec_hwmon_priv { u8 manual_fan_pwm[EC_FAN_SPEED_ENTRIES]; }; +struct cros_ec_hwmon_cooling_priv { + struct cros_ec_hwmon_priv *hwmon_priv; + u8 index; +}; + static int cros_ec_hwmon_read_fan_speed(struct cros_ec_device *cros_ec, u8 index, u16 *speed) { int ret; @@ -308,6 +314,42 @@ static const struct hwmon_channel_info * const cros_ec_hwmon_info[] = { NULL }; +static int cros_ec_hwmon_cooling_get_max_state(struct thermal_cooling_device *cdev, + unsigned long *val) +{ + *val = 255; + return 0; +} + +static int cros_ec_hwmon_cooling_get_cur_state(struct thermal_cooling_device *cdev, + unsigned long *val) +{ + const struct cros_ec_hwmon_cooling_priv *priv = cdev->devdata; + u8 read_val; + int ret; + + ret = cros_ec_hwmon_read_pwm_value(priv->hwmon_priv->cros_ec, priv->index, &read_val); + if (ret) + return ret; + + *val = read_val; + return 0; +} + +static int cros_ec_hwmon_cooling_set_cur_state(struct thermal_cooling_device *cdev, + unsigned long val) +{ + const struct cros_ec_hwmon_cooling_priv *priv = cdev->devdata; + + return cros_ec_hwmon_write_pwm_input(priv->hwmon_priv->cros_ec, priv->index, val); +} + +static const struct thermal_cooling_device_ops cros_ec_thermal_cooling_ops = { + .get_max_state = cros_ec_hwmon_cooling_get_max_state, + .get_cur_state = cros_ec_hwmon_cooling_get_cur_state, + .set_cur_state = cros_ec_hwmon_cooling_set_cur_state, +}; + static const struct hwmon_ops cros_ec_hwmon_ops = { .read = cros_ec_hwmon_read, .read_string = cros_ec_hwmon_read_string, @@ -386,6 +428,46 @@ static bool cros_ec_hwmon_probe_fan_control_supported(struct cros_ec_device *cro CROS_EC_HWMON_THERMAL_AUTO_FAN_CTRL_CMD_VERSION); } +static void cros_ec_hwmon_register_fan_cooling_devices(struct device *dev, + struct cros_ec_hwmon_priv *priv) +{ + struct cros_ec_hwmon_cooling_priv *cpriv; + struct thermal_cooling_device *cdev; + const char *type; + size_t i; + + if (!IS_ENABLED(CONFIG_THERMAL)) + return; + + if (!priv->fan_control_supported) + return; + + for (i = 0; i < EC_FAN_SPEED_ENTRIES; i++) { + if (!(priv->usable_fans & BIT(i))) + continue; + + cpriv = devm_kzalloc(dev, sizeof(*cpriv), GFP_KERNEL); + if (!cpriv) + continue; + + type = devm_kasprintf(dev, GFP_KERNEL, "%s-fan%zu", dev_name(dev), i); + if (!type) { + dev_warn(dev, "no memory to compose cooling device type for fan %zu\n", i); + continue; + } + + cpriv->hwmon_priv = priv; + cpriv->index = i; + cdev = devm_thermal_of_cooling_device_register(dev, NULL, type, cpriv, + &cros_ec_thermal_cooling_ops); + if (IS_ERR(cdev)) { + dev_warn(dev, "failed to register fan %zu as a cooling device: %pe\n", i, + cdev); + continue; + } + } +} + static int cros_ec_hwmon_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -413,6 +495,7 @@ static int cros_ec_hwmon_probe(struct platform_device *pdev) cros_ec_hwmon_probe_temp_sensors(dev, priv, thermal_version); cros_ec_hwmon_probe_fans(priv); priv->fan_control_supported = 
cros_ec_hwmon_probe_fan_control_supported(priv->cros_ec); + cros_ec_hwmon_register_fan_cooling_devices(dev, priv); hwmon_dev = devm_hwmon_device_register_with_info(dev, "cros_ec", priv, &cros_ec_hwmon_chip_info, NULL); From c02e4644f8ac9c501077ef5ac53ae7fc51472d49 Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Mon, 13 Jan 2025 10:48:58 +0200 Subject: [PATCH 2838/3206] hwmon: (mlxreg-fan) Separate methods of fan setting coming from different subsystems Distinguish between fan speed setting requests coming from the hwmon and thermal subsystems. There are fields 'last_hwmon_state' and 'last_thermal_state' in the structure 'mlxreg_fan_pwm', which respectively store the cooling state set by the 'hwmon' and 'thermal' subsystems. The purpose is to arbitrate fan speed setting. For example, if the fan speed is required to be not lower than some limit, such a setting is to be performed through the 'hwmon' subsystem, so that the 'thermal' subsystem will not set the fan below this limit. Currently, 'last_thermal_state' is also updated by 'hwmon', causing the cooling state to never be set to a lower value. Eliminate the update of 'last_thermal_state' when the request comes from the 'hwmon' subsystem. Fixes: da74944d3a46 ("hwmon: (mlxreg-fan) Use pwm attribute for setting fan speed low limit") Signed-off-by: Vadim Pasternak Link: https://lore.kernel.org/r/20250113084859.27064-2-vadimp@nvidia.com Signed-off-by: Guenter Roeck --- drivers/hwmon/mlxreg-fan.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/hwmon/mlxreg-fan.c b/drivers/hwmon/mlxreg-fan.c index c25a54d5b39ad5..0ba9195c9d713e 100644 --- a/drivers/hwmon/mlxreg-fan.c +++ b/drivers/hwmon/mlxreg-fan.c @@ -113,8 +113,8 @@ struct mlxreg_fan { int divider; }; -static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, - unsigned long state); +static int _mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, + unsigned long state, bool thermal); static int mlxreg_fan_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, @@ -224,8 +224,9 @@ mlxreg_fan_write(struct device *dev, enum hwmon_sensor_types type, u32 attr, * last thermal state. */ if (pwm->last_hwmon_state >= pwm->last_thermal_state) - return mlxreg_fan_set_cur_state(pwm->cdev, - pwm->last_hwmon_state); + return _mlxreg_fan_set_cur_state(pwm->cdev, + pwm->last_hwmon_state, + false); return 0; } return regmap_write(fan->regmap, pwm->reg, val); @@ -357,9 +358,8 @@ static int mlxreg_fan_get_cur_state(struct thermal_cooling_device *cdev, return 0; } -static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, - unsigned long state) - +static int _mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, + unsigned long state, bool thermal) { struct mlxreg_fan_pwm *pwm = cdev->devdata; struct mlxreg_fan *fan = pwm->fan; @@ -369,7 +369,8 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, return -EINVAL; /* Save thermal state.
*/ - pwm->last_thermal_state = state; + if (thermal) + pwm->last_thermal_state = state; state = max_t(unsigned long, state, pwm->last_hwmon_state); err = regmap_write(fan->regmap, pwm->reg, @@ -381,6 +382,13 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, return 0; } +static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, + unsigned long state) + +{ + return _mlxreg_fan_set_cur_state(cdev, state, true); +} + static const struct thermal_cooling_device_ops mlxreg_fan_cooling_ops = { .get_max_state = mlxreg_fan_get_max_state, .get_cur_state = mlxreg_fan_get_cur_state, From 1e11552ee54d10c0b602c76b94db602e2581ce57 Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Mon, 13 Jan 2025 10:48:59 +0200 Subject: [PATCH 2839/3206] hwmon: (mlxreg-fan) Add support for new flavour of capability register FAN platform data is common across the various systems, while the fan driver should be able to apply only the fan instances relevant to a specific system. For example, platform data might contain descriptions for fan1, fan2, ..., fan{n}, while some systems are equipped with all 'n' fans and others with fewer. Also, on some systems a fan drawer can be equipped with several tachometers and on others with only one. For detection of the real number of equipped drawers and tachometers, special capability registers are used. These registers indicate the presence of drawers and tachometers through a bitmap. For some new big modular systems this register will provide presence data as a counter. Use the slot parameter to distinguish whether the capability register contains a bitmask or a counter. Signed-off-by: Vadim Pasternak Link: https://lore.kernel.org/r/20250113084859.27064-3-vadimp@nvidia.com Signed-off-by: Guenter Roeck --- drivers/hwmon/mlxreg-fan.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/mlxreg-fan.c b/drivers/hwmon/mlxreg-fan.c index 0ba9195c9d713e..137a90dd207523 100644 --- a/drivers/hwmon/mlxreg-fan.c +++ b/drivers/hwmon/mlxreg-fan.c @@ -63,12 +63,14 @@ struct mlxreg_fan; * @reg: register offset; * @mask: fault mask; * @prsnt: present register offset; + * @shift: tacho presence bit shift; */ struct mlxreg_fan_tacho { bool connected; u32 reg; u32 mask; u32 prsnt; + u32 shift; }; /* @@ -143,8 +145,10 @@ mlxreg_fan_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, /* * Map channel to presence bit - drawer can be equipped with * one or few FANs, while presence is indicated per drawer. + * Shift channel value if necessary to align with register value. */ - if (BIT(channel / fan->tachos_per_drwr) & regval) { + if (BIT(rol32(channel, tacho->shift) / fan->tachos_per_drwr) & + regval) { /* FAN is not connected - return zero for FAN speed. */ *val = 0; return 0; @@ -408,7 +412,7 @@ static int mlxreg_fan_connect_verify(struct mlxreg_fan *fan, return err; } - return !!(regval & data->bit); + return data->slot ? (data->slot <= regval ? 1 : 0) : !!(regval & data->bit); } static int mlxreg_pwm_connect_verify(struct mlxreg_fan *fan, @@ -545,7 +549,15 @@ static int mlxreg_fan_config(struct mlxreg_fan *fan, return err; } - drwr_avail = hweight32(regval); + /* + * The number of drawers could be specified in registers by counters for newer + * systems, or by bitmasks for older systems. In case the data is provided by + * counter, it is indicated through 'version' field.
+ */ + if (pdata->version) + drwr_avail = regval; + else + drwr_avail = hweight32(regval); if (!tacho_avail || !drwr_avail || tacho_avail < drwr_avail) { dev_err(fan->dev, "Configuration is invalid: drawers num %d tachos num %d\n", drwr_avail, tacho_avail); From 030c59df83b448fc873d1caabb9ea077ae25268e Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 25 Sep 2025 05:17:04 +0000 Subject: [PATCH 2840/3206] ASoC: renesas: msiof: add unique NOTE name MSIOF will have many NOTEs on top of the driver; give each a unique NOTE name. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87ecrvyuuo.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/msiof.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/renesas/rcar/msiof.c b/sound/soc/renesas/rcar/msiof.c index f5338bbb037c55..6819471fb5b2b1 100644 --- a/sound/soc/renesas/rcar/msiof.c +++ b/sound/soc/renesas/rcar/msiof.c @@ -7,7 +7,7 @@ // /* - * [NOTE] + * [NOTE-CLOCK-MODE] * * This driver doesn't support Clock/Frame Provider Mode * @@ -121,7 +121,7 @@ static int msiof_hw_start(struct snd_soc_component *component, /* * see - * [NOTE] on top of this driver + * [NOTE-CLOCK-MODE] on top of this driver */ /* * see From 25226abc1affd4bf4f6dd415d475b76e7a273fa8 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 25 Sep 2025 05:17:11 +0000 Subject: [PATCH 2841/3206] ASoC: renesas: msiof: use reset controller MSIOF has TXRST/RXRST to reset the FIFOs, but they shouldn't be used while the SYNC signal is asserted, because that can cause a HW issue. When MSIOF is used as a sound driver, it is assumed to be in clock consumer mode (= the Codec is the clock provider). This means it can't control the SYNC signal by itself. We need to use SW reset (= reset_control_xxx()) instead of TXRST/RXRST. Signed-off-by: Kuninori Morimoto Tested-by: Yusuke Goda Link: https://patch.msgid.link/87cy7fyuug.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/msiof.c | 39 +++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/sound/soc/renesas/rcar/msiof.c b/sound/soc/renesas/rcar/msiof.c index 6819471fb5b2b1..87bec248d5c1b1 100644 --- a/sound/soc/renesas/rcar/msiof.c +++ b/sound/soc/renesas/rcar/msiof.c @@ -24,12 +24,25 @@ * Clock/Frame Consumer Mode. */ +/* + * [NOTE-RESET] + * + * MSIOF has TXRST/RXRST to reset the FIFOs, but they shouldn't be used while the SYNC signal is + * asserted, because that can cause a HW issue. + * + * When MSIOF is used as a sound driver, it is assumed to be in clock consumer mode + * (= the Codec is the clock provider). This means it can't control the SYNC signal by itself. + * + * We need to use SW reset (= reset_control_xxx()) instead of TXRST/RXRST. + */ + #include #include #include #include #include #include +#include #include #include #include @@ -60,10 +73,13 @@ struct msiof_priv { struct device *dev; struct snd_pcm_substream *substream[SNDRV_PCM_STREAM_LAST + 1]; + struct reset_control *reset; spinlock_t lock; void __iomem *base; resource_size_t phy_addr; + int count; + /* for error */ int err_syc[SNDRV_PCM_STREAM_LAST + 1]; int err_ovf[SNDRV_PCM_STREAM_LAST + 1]; @@ -131,6 +147,16 @@ static int msiof_hw_start(struct snd_soc_component *component, * RX: Fig 109.15 */ + /* + * Use reset_control_xx() instead of TXRST/RXRST.
+ * see + * [NOTE-RESET] + */ + if (!priv->count) + reset_control_deassert(priv->reset); + + priv->count++; + /* reset errors */ priv->err_syc[substream->stream] = priv->err_ovf[substream->stream] = @@ -152,7 +178,6 @@ static int msiof_hw_start(struct snd_soc_component *component, val = FIELD_PREP(SIMDR2_BITLEN1, width - 1); msiof_write(priv, SITMDR2, val | FIELD_PREP(SIMDR2_GRP, 1)); msiof_write(priv, SITMDR3, val); - } /* SIRMDRx */ else { @@ -227,6 +252,11 @@ static int msiof_hw_stop(struct snd_soc_component *component, priv->err_ovf[substream->stream], priv->err_udf[substream->stream]); + priv->count--; + + if (!priv->count) + reset_control_assert(priv->reset); + return 0; } @@ -490,12 +520,19 @@ static int msiof_probe(struct platform_device *pdev) if (IS_ERR(priv->base)) return PTR_ERR(priv->base); + priv->reset = devm_reset_control_get_exclusive(dev, NULL); + if (IS_ERR(priv->reset)) + return PTR_ERR(priv->reset); + + reset_control_assert(priv->reset); + ret = devm_request_irq(dev, irq, msiof_interrupt, 0, dev_name(dev), priv); if (ret) return ret; priv->dev = dev; priv->phy_addr = res->start; + priv->count = 0; spin_lock_init(&priv->lock); platform_set_drvdata(pdev, priv); From 130947b4681c515a5e5a7961244b502de2de85ca Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 25 Sep 2025 05:17:17 +0000 Subject: [PATCH 2842/3206] ASoC: renesas: msiof: set SIFCTR register Because it uses DMAC, we would like to transfer data if there is any data. Set SIFCTR for it. Signed-off-by: Kuninori Morimoto Tested-by: Yusuke Goda Link: https://patch.msgid.link/87bjmzyuub.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/msiof.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/soc/renesas/rcar/msiof.c b/sound/soc/renesas/rcar/msiof.c index 87bec248d5c1b1..a239a88543ee62 100644 --- a/sound/soc/renesas/rcar/msiof.c +++ b/sound/soc/renesas/rcar/msiof.c @@ -193,6 +193,12 @@ static int msiof_hw_start(struct snd_soc_component *component, msiof_write(priv, SIRMDR3, val); } + /* SIFCTR */ + if (is_play) + msiof_update(priv, SIFCTR, SIFCTR_TFWM, FIELD_PREP(SIFCTR_TFWM, SIFCTR_TFWM_1)); + else + msiof_update(priv, SIFCTR, SIFCTR_RFWM, FIELD_PREP(SIFCTR_RFWM, SIFCTR_RFWM_1)); + /* SIIER */ if (is_play) val = SIIER_TDREQE | SIIER_TDMAE | SISTR_ERR_TX; From ab77fa5533e4d1dcfdd2711b9b1e166e4ed57dab Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 25 Sep 2025 05:17:21 +0000 Subject: [PATCH 2843/3206] ASoC: renesas: msiof: add .symmetric_xxx on snd_soc_dai_driver MSIOF TX/RX are sharing same clock. Adds .symmetric_xxx flags. 
Signed-off-by: Kuninori Morimoto Tested-by: Yusuke Goda Link: https://patch.msgid.link/87a52jyuu6.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/msiof.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/renesas/rcar/msiof.c b/sound/soc/renesas/rcar/msiof.c index a239a88543ee62..d501ad9d71413b 100644 --- a/sound/soc/renesas/rcar/msiof.c +++ b/sound/soc/renesas/rcar/msiof.c @@ -338,6 +338,9 @@ static struct snd_soc_dai_driver msiof_dai_driver = { .channels_max = 2, }, .ops = &msiof_dai_ops, + .symmetric_rate = 1, + .symmetric_channels = 1, + .symmetric_sample_bits = 1, }; static struct snd_pcm_hardware msiof_pcm_hardware = { From 25aa058b5c83a3c455a2a288bb3295c0b234f093 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 25 Sep 2025 05:17:27 +0000 Subject: [PATCH 2844/3206] ASoC: renesas: msiof: tidyup DMAC stop timing Currently the DMAC is stopped before the HW is stopped, but that might cause a sync error. Stop the HW first. Signed-off-by: Kuninori Morimoto Tested-by: Yusuke Goda Link: https://patch.msgid.link/878qi3yuu0.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/msiof.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/renesas/rcar/msiof.c b/sound/soc/renesas/rcar/msiof.c index d501ad9d71413b..5b59d4bcd67eb8 100644 --- a/sound/soc/renesas/rcar/msiof.c +++ b/sound/soc/renesas/rcar/msiof.c @@ -238,9 +238,6 @@ static int msiof_hw_stop(struct snd_soc_component *component, val = SIIER_RDREQE | SIIER_RDMAE | SISTR_ERR_RX; msiof_update(priv, SIIER, val, 0); - /* Stop DMAC */ - snd_dmaengine_pcm_trigger(substream, cmd); - /* SICTR */ if (is_play) val = SICTR_TXE; @@ -248,6 +245,9 @@ static int msiof_hw_stop(struct snd_soc_component *component, val = SICTR_RXE; msiof_update_and_wait(priv, SICTR, val, 0, 0); + /* Stop DMAC */ + snd_dmaengine_pcm_trigger(substream, cmd); + /* indicate error status if exist */ if (priv->err_syc[substream->stream] || priv->err_ovf[substream->stream] || From dc7473e6372ee36ff232af10c910ee3a8bad6447 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 25 Sep 2025 05:17:34 +0000 Subject: [PATCH 2845/3206] ASoC: renesas: msiof: setup both (Playback/Capture) in the same time SITMDRn / SIRMDRn and some other registers should not be updated while the device is working, even if the update is not related to the target direction (for example, doing TX settings while RX is working); otherwise it causes an FSERR. Set up both directions (Playback/Capture) at the same time. Signed-off-by: Kuninori Morimoto Tested-by: Yusuke Goda Link: https://patch.msgid.link/877bxnyutt.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/msiof.c | 70 ++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/sound/soc/renesas/rcar/msiof.c b/sound/soc/renesas/rcar/msiof.c index 5b59d4bcd67eb8..df664800bf6013 100644 --- a/sound/soc/renesas/rcar/msiof.c +++ b/sound/soc/renesas/rcar/msiof.c @@ -36,6 +36,16 @@ * We need to use SW reset (= reset_control_xxx()) instead of TXRST/RXRST. */ +/* + * [NOTE-BOTH-SETTING] + * + * SITMDRn / SIRMDRn and some other registers should not be updated while the device is working, + * even if the update is not related to the target direction (for example, doing TX settings + * while RX is working); otherwise it causes an FSERR. + * + * Set up both directions (Playback/Capture) at the same time.
+ */ + #include #include #include @@ -165,39 +175,40 @@ static int msiof_hw_start(struct snd_soc_component *component, /* Start DMAC */ snd_dmaengine_pcm_trigger(substream, cmd); + /* + * setup both direction (Playback/Capture) in the same time. + * see + * above [NOTE-BOTH-SETTING] + */ + /* SITMDRx */ - if (is_play) { - val = SITMDR1_PCON | - FIELD_PREP(SIMDR1_SYNCMD, SIMDR1_SYNCMD_LR) | - SIMDR1_SYNCAC | SIMDR1_XXSTP; - if (msiof_flag_has(priv, MSIOF_FLAGS_NEED_DELAY)) - val |= FIELD_PREP(SIMDR1_DTDL, 1); - - msiof_write(priv, SITMDR1, val); - - val = FIELD_PREP(SIMDR2_BITLEN1, width - 1); - msiof_write(priv, SITMDR2, val | FIELD_PREP(SIMDR2_GRP, 1)); - msiof_write(priv, SITMDR3, val); - } + val = SITMDR1_PCON | SIMDR1_SYNCAC | SIMDR1_XXSTP | + FIELD_PREP(SIMDR1_SYNCMD, SIMDR1_SYNCMD_LR); + if (msiof_flag_has(priv, MSIOF_FLAGS_NEED_DELAY)) + val |= FIELD_PREP(SIMDR1_DTDL, 1); + + msiof_write(priv, SITMDR1, val); + + val = FIELD_PREP(SIMDR2_BITLEN1, width - 1); + msiof_write(priv, SITMDR2, val | FIELD_PREP(SIMDR2_GRP, 1)); + msiof_write(priv, SITMDR3, val); + /* SIRMDRx */ - else { - val = FIELD_PREP(SIMDR1_SYNCMD, SIMDR1_SYNCMD_LR) | - SIMDR1_SYNCAC; - if (msiof_flag_has(priv, MSIOF_FLAGS_NEED_DELAY)) - val |= FIELD_PREP(SIMDR1_DTDL, 1); + val = SIMDR1_SYNCAC | + FIELD_PREP(SIMDR1_SYNCMD, SIMDR1_SYNCMD_LR); + if (msiof_flag_has(priv, MSIOF_FLAGS_NEED_DELAY)) + val |= FIELD_PREP(SIMDR1_DTDL, 1); - msiof_write(priv, SIRMDR1, val); + msiof_write(priv, SIRMDR1, val); - val = FIELD_PREP(SIMDR2_BITLEN1, width - 1); - msiof_write(priv, SIRMDR2, val | FIELD_PREP(SIMDR2_GRP, 1)); - msiof_write(priv, SIRMDR3, val); - } + val = FIELD_PREP(SIMDR2_BITLEN1, width - 1); + msiof_write(priv, SIRMDR2, val | FIELD_PREP(SIMDR2_GRP, 1)); + msiof_write(priv, SIRMDR3, val); /* SIFCTR */ - if (is_play) - msiof_update(priv, SIFCTR, SIFCTR_TFWM, FIELD_PREP(SIFCTR_TFWM, SIFCTR_TFWM_1)); - else - msiof_update(priv, SIFCTR, SIFCTR_RFWM, FIELD_PREP(SIFCTR_RFWM, SIFCTR_RFWM_1)); + msiof_write(priv, SIFCTR, + FIELD_PREP(SIFCTR_TFWM, SIFCTR_TFWM_1) | + FIELD_PREP(SIFCTR_RFWM, SIFCTR_RFWM_1)); /* SIIER */ if (is_play) @@ -214,10 +225,11 @@ static int msiof_hw_start(struct snd_soc_component *component, msiof_update(priv, SISTR, val, val); /* SICTR */ + val = SICTR_TEDG | SICTR_REDG; if (is_play) - val = SICTR_TXE | SICTR_TEDG; + val |= SICTR_TXE; else - val = SICTR_RXE | SICTR_REDG; + val |= SICTR_RXE; msiof_update_and_wait(priv, SICTR, val, val, val); return 0; From 8c363f61e5bcb92d5e88ca1b47be74be2683b212 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 25 Sep 2025 05:17:41 +0000 Subject: [PATCH 2846/3206] ASoC: renesas: msiof: Add note for The possibility of R/L opposite Capture This driver is assuming MSIOF is used as Clock/Frame Consumer Mode, and there is a case that some Codec (= Clock/Frame Provider) might output Clock/Frame before setup MSIOF. And, MSIOF will capture data without checking SYNC signal Hi/Low (= R/L). This means, if MSIOF RXE bit was set as 1 in case of SYNC signal was Hi (= R) timing, it will start capture data since next SYNC low signal (= L). Because Linux assumes sound data is lined up as R->L->R->L->..., the data R/L might be opposite. The only solution in this case is start CLK/SYNC *after* MSIOF settings, but it depends when and how Codec driver start it. 
Signed-off-by: Kuninori Morimoto Tested-by: Yusuke Goda Link: https://patch.msgid.link/875xd7yutm.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/msiof.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sound/soc/renesas/rcar/msiof.c b/sound/soc/renesas/rcar/msiof.c index df664800bf6013..330b65b295976d 100644 --- a/sound/soc/renesas/rcar/msiof.c +++ b/sound/soc/renesas/rcar/msiof.c @@ -46,6 +46,25 @@ * Set up both directions (Playback/Capture) at the same time. */ +/* + * [NOTE-R/L] + * + * The captured data might be R/L opposite. + * + * This driver is assuming MSIOF is used as Clock/Frame Consumer Mode, and there is a case that some + * Codec (= Clock/Frame Provider) might output Clock/Frame before setup MSIOF. It depends on Codec + * driver implementation. + * + * MSIOF will capture data without checking SYNC signal Hi/Low (= R/L). + * + * This means, if MSIOF RXE bit was set as 1 in case of SYNC signal was Hi (= R) timing, it will + * start capture data since next SYNC low signal (= L). Because Linux assumes sound data is lined + * up as R->L->R->L->..., the data R/L will be opposite. + * + * The only solution in this case is start CLK/SYNC *after* MSIOF settings, but it depends when and + * how Codec driver start it. + */ + #include #include #include From e26387e950ee4486b4ed5728b5d3c1430c33ba67 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 25 Sep 2025 05:17:47 +0000 Subject: [PATCH 2847/3206] ASoC: renesas: msiof: ignore 1st FSERR Renesas have tried to minimize the occurrence of FSERR errors as much as possible, but unfortunately we cannot remove them completely, because MSIOF might set up its registers while CLK/SYNC are being input. This can happen because MSIOF is working as Clock/Frame Consumer. Ignore the 1st FSERR, about which we can do nothing. Signed-off-by: Kuninori Morimoto Tested-by: Yusuke Goda Link: https://patch.msgid.link/874isryutg.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/msiof.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/sound/soc/renesas/rcar/msiof.c b/sound/soc/renesas/rcar/msiof.c index 330b65b295976d..f2addfbac92374 100644 --- a/sound/soc/renesas/rcar/msiof.c +++ b/sound/soc/renesas/rcar/msiof.c @@ -65,6 +65,16 @@ * how Codec driver start it. */ +/* + * [NOTE-FSERR] + * + * We can't remove all FSERR. + * + * Renesas have tried to minimize the occurrence of FSERR errors as much as possible, but + * unfortunately we cannot remove them completely, because MSIOF might set up its registers while + * CLK/SYNC are being input. This can happen because MSIOF is working as Clock/Frame Consumer. + */ + #include #include #include @@ -186,8 +196,13 @@ static int msiof_hw_start(struct snd_soc_component *component, priv->count++; - /* reset errors */ - priv->err_syc[substream->stream] = + /* + * Reset errors.
ignore 1st FSERR + * + * see + * [NOTE-FSERR] + */ + priv->err_syc[substream->stream] = -1; priv->err_ovf[substream->stream] = priv->err_udf[substream->stream] = 0; @@ -279,6 +294,15 @@ static int msiof_hw_stop(struct snd_soc_component *component, /* Stop DMAC */ snd_dmaengine_pcm_trigger(substream, cmd); + /* + * Ignore 1st FSERR + * + * see + * [NOTE-FSERR] + */ + if (priv->err_syc[substream->stream] < 0) + priv->err_syc[substream->stream] = 0; + /* indicate error status if exist */ if (priv->err_syc[substream->stream] || priv->err_ovf[substream->stream] || From 2d965c1ae4135ed6f505661458f6dabd39488dac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 25 Sep 2025 11:14:23 -0300 Subject: [PATCH 2848/3206] tools/nolibc: add stdbool.h to nolibc includes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise tests compiled with only "-include nolibc.h" will fail with "error: unknown type name 'bool'", even though a stdbool.h is available from nolibc. Fixes: ae1f550efc11 ("tools/nolibc: add stdbool.h header") Fixes: f2662ec26b26 ("selftests: kselftest: Create ksft_print_dbg_msg()") Reported-by: Mark Brown Closes: https://lore.kernel.org/lkml/833f5ae5-190e-47ec-9ad9-127ad166c80c@sirena.org.uk/ Signed-off-by: André Almeida [Thomas: add Fixes tags and massage commit message a bit] Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/nolibc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index c199ade200c240..d2f5aa085f8e36 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -116,6 +116,7 @@ #include "sched.h" #include "signal.h" #include "unistd.h" +#include "stdbool.h" #include "stdio.h" #include "stdlib.h" #include "string.h" From 5730dacb3f172858ca47b8b1aeab083b5713f24b Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 24 Sep 2025 15:29:54 +0100 Subject: [PATCH 2849/3206] selftests/bpf: Task_work selftest cleanup fixes task_work selftest does not properly handle cleanup during failures: * destroy bpf_link * perf event fd is passed to bpf_link, no need to close it if link was created successfully * goto cleanup if fork() failed, close pipe. 
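The general pattern behind the fixes listed above, as an illustrative sketch rather than the selftest code itself (assumes libbpf 1.0 semantics, where attach returns NULL on failure and bpf_link__destroy() accepts NULL):

#include <unistd.h>
#include <bpf/libbpf.h>

static int example_run_once(struct bpf_program *prog, int pe_fd)
{
	struct bpf_link *link = NULL;
	int err = -1;

	link = bpf_program__attach_perf_event(prog, pe_fd);
	if (!link)
		goto cleanup;
	/* the link now owns pe_fd; forget our copy so it is not closed twice */
	pe_fd = -1;

	/* ... trigger the program and check results here ... */
	err = 0;

cleanup:
	if (pe_fd >= 0)
		close(pe_fd);
	bpf_link__destroy(link);	/* NULL-safe */
	return err;
}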
Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250924142954.129519-2-mykyta.yatsenko5@gmail.com --- .../selftests/bpf/prog_tests/test_task_work.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_work.c b/tools/testing/selftests/bpf/prog_tests/test_task_work.c index 666585270fbf99..774b31a5f6ca12 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_task_work.c +++ b/tools/testing/selftests/bpf/prog_tests/test_task_work.c @@ -55,8 +55,8 @@ static void task_work_run(const char *prog_name, const char *map_name) struct task_work *skel; struct bpf_program *prog; struct bpf_map *map; - struct bpf_link *link; - int err, pe_fd = 0, pid, status, pipefd[2]; + struct bpf_link *link = NULL; + int err, pe_fd = -1, pid, status, pipefd[2]; char user_string[] = "hello world"; if (!ASSERT_NEQ(pipe(pipefd), -1, "pipe")) @@ -77,7 +77,11 @@ static void task_work_run(const char *prog_name, const char *map_name) (void)num; exit(0); } - ASSERT_GT(pid, 0, "fork() failed"); + if (!ASSERT_GT(pid, 0, "fork() failed")) { + close(pipefd[0]); + close(pipefd[1]); + return; + } skel = task_work__open(); if (!ASSERT_OK_PTR(skel, "task_work__open")) @@ -112,6 +116,8 @@ static void task_work_run(const char *prog_name, const char *map_name) if (!ASSERT_OK_PTR(link, "attach_perf_event")) goto cleanup; + /* perf event fd ownership is passed to bpf_link */ + pe_fd = -1; close(pipefd[0]); write(pipefd[1], user_string, 1); close(pipefd[1]); @@ -126,8 +132,9 @@ static void task_work_run(const char *prog_name, const char *map_name) cleanup: if (pe_fd >= 0) close(pe_fd); + bpf_link__destroy(link); task_work__destroy(skel); - if (pid) { + if (pid > 0) { close(pipefd[0]); write(pipefd[1], user_string, 1); close(pipefd[1]); From 469d80a3712c66a00b5bb888e62e809db8887ba7 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 25 Sep 2025 13:51:06 -0500 Subject: [PATCH 2850/3206] PM: hibernate: Fix hybrid-sleep Hybrid sleep will hibernate the system followed by running through the suspend routine. Since both the hibernate and the suspend routine will call pm_restrict_gfp_mask(), pm_restore_gfp_mask() must be called before starting the suspend sequence. Add an explicit call to pm_restore_gfp_mask() to power_down() before the suspend sequence starts. Add an extra call for pm_restrict_gfp_mask() when exiting suspend so that the pm_restore_gfp_mask() call in hibernate() is balanced. Reported-by: Ionut Nechita Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4573 Tested-by: Ionut Nechita Fixes: 12ffc3b1513eb ("PM: Restrict swap use to later in the suspend sequence") Tested-by: Kenneth Crudup Acked-by: Alex Deucher Signed-off-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20250925185108.2968494-2-superm1@kernel.org [ rjw: Add comment explainig the new pm_restrict_gfp_mask() call purpose ] Cc: 6.16+ # 6.16+ Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2f66ab45382319..e0cdb8d9fd45c8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -695,12 +695,16 @@ static void power_down(void) #ifdef CONFIG_SUSPEND if (hibernation_mode == HIBERNATION_SUSPEND) { + pm_restore_gfp_mask(); error = suspend_devices_and_enter(mem_sleep_current); if (error) { hibernation_mode = hibernation_ops ? 
HIBERNATION_PLATFORM : HIBERNATION_SHUTDOWN; } else { + /* Match pm_restore_gfp_mask() call in hibernate() */ + pm_restrict_gfp_mask(); + /* Restore swap signature. */ error = swsusp_unmark(); if (error) From 495c8d35035edb66e3284113bef01f3b1b843832 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 25 Sep 2025 13:51:07 -0500 Subject: [PATCH 2851/3206] PM: hibernate: Add pm_hibernation_mode_is_suspend() Some drivers have different flows for hibernation and suspend. If the driver opportunistically will skip thaw() then it needs a hint to know what is happening after the hibernate. Introduce a new symbol pm_hibernation_mode_is_suspend() that drivers can call to determine if suspending the system for this purpose. Tested-by: Ionut Nechita Tested-by: Kenneth Crudup Acked-by: Alex Deucher Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 2 ++ kernel/power/hibernate.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 317ae31e89b374..0664c685f0b24e 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -276,6 +276,7 @@ extern void arch_suspend_enable_irqs(void); extern int pm_suspend(suspend_state_t state); extern bool sync_on_suspend_enabled; +bool pm_hibernation_mode_is_suspend(void); #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL @@ -288,6 +289,7 @@ static inline bool pm_suspend_via_firmware(void) { return false; } static inline bool pm_resume_via_firmware(void) { return false; } static inline bool pm_suspend_no_platform(void) { return false; } static inline bool pm_suspend_default_s2idle(void) { return false; } +static inline bool pm_hibernation_mode_is_suspend(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e0cdb8d9fd45c8..5d146218cae86f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -80,6 +80,17 @@ static const struct platform_hibernation_ops *hibernation_ops; static atomic_t hibernate_atomic = ATOMIC_INIT(1); +#ifdef CONFIG_SUSPEND +/** + * pm_hibernation_mode_is_suspend - Check if hibernation has been set to suspend + */ +bool pm_hibernation_mode_is_suspend(void) +{ + return hibernation_mode == HIBERNATION_SUSPEND; +} +EXPORT_SYMBOL_GPL(pm_hibernation_mode_is_suspend); +#endif + bool hibernate_acquire(void) { return atomic_add_unless(&hibernate_atomic, -1, 0); From 0a6e9e098fcc318fec0f45a05a5c4743a81a60d9 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 25 Sep 2025 13:51:08 -0500 Subject: [PATCH 2852/3206] drm/amd: Fix hybrid sleep [Why] commit 530694f54dd5e ("drm/amdgpu: do not resume device in thaw for normal hibernation") optimized the flow for systems that are going into S4 where the power would be turned off. Basically the thaw() callback wouldn't resume the device if the hibernation image was successfully created since the system would be powered off. This however isn't the correct flow for a system entering into s0i3 after the hibernation image is created. Some of the amdgpu callbacks have different behavior depending upon the intended state of the suspend. [How] Use pm_hibernation_mode_is_suspend() as an input to decide whether to run resume during thaw() callback. 
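A sketch of how a driver might consume the new helper (illustrative only; example_resume_hw() is a hypothetical stand-in, and the actual amdgpu change follows below):

static int example_thaw(struct device *dev)
{
	/*
	 * Resume the HW when recovering from a failed hibernation, or when
	 * hybrid sleep will suspend next and needs the device functional.
	 */
	if (pm_hibernate_is_recovering() || pm_hibernation_mode_is_suspend())
		return example_resume_hw(dev);

	/* normal hibernation: the system powers off next, skip the resume */
	return 0;
}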
Reported-by: Ionut Nechita Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4573 Tested-by: Ionut Nechita Fixes: 530694f54dd5e ("drm/amdgpu: do not resume device in thaw for normal hibernation") Acked-by: Alex Deucher Tested-by: Kenneth Crudup Signed-off-by: Mario Limonciello (AMD) Cc: 6.17+ # 6.17+: 495c8d35035e: PM: hibernate: Add pm_hibernation_mode_is_suspend() Signed-off-by: Rafael J. Wysocki --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 395c6be901ce7a..dcea66aadfa335 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2665,7 +2665,7 @@ static int amdgpu_pmops_thaw(struct device *dev) struct drm_device *drm_dev = dev_get_drvdata(dev); /* do not resume device if it's normal hibernation */ - if (!pm_hibernate_is_recovering()) + if (!pm_hibernate_is_recovering() && !pm_hibernation_mode_is_suspend()) return 0; return amdgpu_device_resume(drm_dev, true); From 4ae50c82a5ab9071b928648da8723153004170bb Mon Sep 17 00:00:00 2001 From: Gaurav Kohli Date: Tue, 24 Jun 2025 12:19:44 +0530 Subject: [PATCH 2853/3206] dt-bindings: thermal: tsens: Add QCS615 compatible Add compatibility string for the thermal sensors on QCS615 platform. Acked-by: Rob Herring (Arm) Signed-off-by: Gaurav Kohli Link: https://lore.kernel.org/r/20250624064945.764245-2-quic_gkohli@quicinc.com Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/thermal/qcom-tsens.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index 94311ebd7652d4..f65dc829574c70 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -54,6 +54,7 @@ properties: - qcom,msm8996-tsens - qcom,msm8998-tsens - qcom,qcm2290-tsens + - qcom,qcs615-tsens - qcom,sa8255p-tsens - qcom,sa8775p-tsens - qcom,sar2130p-tsens From 84fd9e4a6b996ca1ac00eb46fcb4ef3246b6f6a3 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Wed, 25 Jun 2025 20:16:57 +0200 Subject: [PATCH 2854/3206] thermal/drivers/rcar_gen3: Add support for per-SoC default trim values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Working Sample R-Car SoCs may not yet have thermal sensor trimming values programmed into fuses, those fuses are blank instead. For such SoCs, the driver includes fallback trimming values. Those values are currently applied to all SoCs which use this driver. Introduce support for per-SoC fallback trimming values in preparation for SoCs which do not use these current trimming values. No functional change is intended here. 
Signed-off-by: Marek Vasut Signed-off-by: Daniel Lezcano Reviewed-by: Niklas Söderlund Link: https://lore.kernel.org/r/20250625181739.28391-1-marek.vasut+renesas@mailbox.org --- drivers/thermal/renesas/rcar_gen3_thermal.c | 41 ++++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/thermal/renesas/rcar_gen3_thermal.c b/drivers/thermal/renesas/rcar_gen3_thermal.c index 24a702ee4c1fb8..413b373523e4be 100644 --- a/drivers/thermal/renesas/rcar_gen3_thermal.c +++ b/drivers/thermal/renesas/rcar_gen3_thermal.c @@ -73,11 +73,17 @@ struct rcar_gen3_thermal_fuse_info { u32 mask; }; +struct rcar_gen3_thermal_fuse_default { + u32 ptat[3]; + u32 thcodes[TSC_MAX_NUM][3]; +}; + struct rcar_thermal_info { int scale; int adj_below; int adj_above; const struct rcar_gen3_thermal_fuse_info *fuses; + const struct rcar_gen3_thermal_fuse_default *fuse_defaults; }; struct equation_set_coef { @@ -289,6 +295,7 @@ static void rcar_gen3_thermal_fetch_fuses(struct rcar_gen3_thermal_priv *priv) static bool rcar_gen3_thermal_read_fuses(struct rcar_gen3_thermal_priv *priv) { + const struct rcar_gen3_thermal_fuse_default *fuse_defaults = priv->info->fuse_defaults; unsigned int i; u32 thscp; @@ -297,24 +304,16 @@ static bool rcar_gen3_thermal_read_fuses(struct rcar_gen3_thermal_priv *priv) if (!priv->info->fuses || (thscp & THSCP_COR_PARA_VLD) != THSCP_COR_PARA_VLD) { /* Default THCODE values in case FUSEs are not set. */ - static const int thcodes[TSC_MAX_NUM][3] = { - { 3397, 2800, 2221 }, - { 3393, 2795, 2216 }, - { 3389, 2805, 2237 }, - { 3415, 2694, 2195 }, - { 3356, 2724, 2244 }, - }; - - priv->ptat[0] = 2631; - priv->ptat[1] = 1509; - priv->ptat[2] = 435; + priv->ptat[0] = fuse_defaults->ptat[0]; + priv->ptat[1] = fuse_defaults->ptat[1]; + priv->ptat[2] = fuse_defaults->ptat[2]; for (i = 0; i < priv->num_tscs; i++) { struct rcar_gen3_thermal_tsc *tsc = priv->tscs[i]; - tsc->thcode[0] = thcodes[i][0]; - tsc->thcode[1] = thcodes[i][1]; - tsc->thcode[2] = thcodes[i][2]; + tsc->thcode[0] = fuse_defaults->thcodes[i][0]; + tsc->thcode[1] = fuse_defaults->thcodes[i][1]; + tsc->thcode[2] = fuse_defaults->thcodes[i][2]; } return false; @@ -361,11 +360,23 @@ static const struct rcar_gen3_thermal_fuse_info rcar_gen3_thermal_fuse_info_gen4 .mask = GEN4_FUSE_MASK, }; +static const struct rcar_gen3_thermal_fuse_default rcar_gen3_thermal_fuse_default_info_gen3 = { + .ptat = { 2631, 1509, 435 }, + .thcodes = { + { 3397, 2800, 2221 }, + { 3393, 2795, 2216 }, + { 3389, 2805, 2237 }, + { 3415, 2694, 2195 }, + { 3356, 2724, 2244 }, + }, +}; + static const struct rcar_thermal_info rcar_m3w_thermal_info = { .scale = 157, .adj_below = -41, .adj_above = 116, .fuses = &rcar_gen3_thermal_fuse_info_gen3, + .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen3, }; static const struct rcar_thermal_info rcar_gen3_thermal_info = { @@ -373,6 +384,7 @@ static const struct rcar_thermal_info rcar_gen3_thermal_info = { .adj_below = -41, .adj_above = 126, .fuses = &rcar_gen3_thermal_fuse_info_gen3, + .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen3, }; static const struct rcar_thermal_info rcar_gen4_thermal_info = { @@ -380,6 +392,7 @@ static const struct rcar_thermal_info rcar_gen4_thermal_info = { .adj_below = -41, .adj_above = 126, .fuses = &rcar_gen3_thermal_fuse_info_gen4, + .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen3, }; static const struct of_device_id rcar_gen3_thermal_dt_ids[] = { From 48bc3b3317b9e67103b82cebbc1475b57abde38b Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: 
Wed, 25 Jun 2025 20:16:58 +0200 Subject: [PATCH 2855/3206] thermal/drivers/rcar_gen3: Add support for R-Car V4H default trim values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add default trimming values for the four temperature sensors located in the Renesas R-Car V4H Working Sample SoC. The trimming values are identical for all four THS temperature sensors. Signed-off-by: Marek Vasut Signed-off-by: Daniel Lezcano Reviewed-by: Niklas Söderlund Link: https://lore.kernel.org/r/20250625181739.28391-2-marek.vasut+renesas@mailbox.org --- drivers/thermal/renesas/rcar_gen3_thermal.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/renesas/rcar_gen3_thermal.c b/drivers/thermal/renesas/rcar_gen3_thermal.c index 413b373523e4be..01858e72f4e0ab 100644 --- a/drivers/thermal/renesas/rcar_gen3_thermal.c +++ b/drivers/thermal/renesas/rcar_gen3_thermal.c @@ -371,6 +371,16 @@ static const struct rcar_gen3_thermal_fuse_default rcar_gen3_thermal_fuse_defaul }, }; +static const struct rcar_gen3_thermal_fuse_default rcar_gen3_thermal_fuse_default_info_v4h = { + .ptat = { 3274, 2164, 985 }, + .thcodes = { /* All four THS units share the same trimming */ + { 3218, 2617, 1980 }, + { 3218, 2617, 1980 }, + { 3218, 2617, 1980 }, + { 3218, 2617, 1980 }, + } +}; + static const struct rcar_thermal_info rcar_m3w_thermal_info = { .scale = 157, .adj_below = -41, @@ -395,6 +405,14 @@ static const struct rcar_thermal_info rcar_gen4_thermal_info = { .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen3, }; +static const struct rcar_thermal_info rcar_v4h_thermal_info = { + .scale = 167, + .adj_below = -41, + .adj_above = 126, + .fuses = &rcar_gen3_thermal_fuse_info_gen4, + .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_v4h, +}; + static const struct of_device_id rcar_gen3_thermal_dt_ids[] = { { .compatible = "renesas,r8a774a1-thermal", @@ -438,7 +456,7 @@ static const struct of_device_id rcar_gen3_thermal_dt_ids[] = { }, { .compatible = "renesas,r8a779g0-thermal", - .data = &rcar_gen4_thermal_info, + .data = &rcar_v4h_thermal_info, }, { .compatible = "renesas,r8a779h0-thermal", From 5ea75f3479ee7c97a54c8660ce2cd980c9e9afb6 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 1 Aug 2025 14:35:40 +0800 Subject: [PATCH 2856/3206] thermal/drivers/mediatek/lvts_thermal: Remove unneeded semicolon ./drivers/thermal/mediatek/lvts_thermal.c:642:2-3: Unneeded semicolon. A semicolon is present after the closing brace of the loop; remove it. No functional change intended.
[ dlezcano : Reworded the description and reordered the tags ] Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=23244 Signed-off-by: Jiapeng Chong Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250801063540.2959610-1-jiapeng.chong@linux.alibaba.com --- drivers/thermal/mediatek/lvts_thermal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c index f4d1e66d7db9ef..ab55b20cda479c 100644 --- a/drivers/thermal/mediatek/lvts_thermal.c +++ b/drivers/thermal/mediatek/lvts_thermal.c @@ -639,7 +639,7 @@ static int lvts_sensor_init(struct device *dev, struct lvts_ctrl *lvts_ctrl, lvts_sensor[i].low_thresh = INT_MIN; lvts_sensor[i].high_thresh = INT_MIN; - }; + } lvts_ctrl->valid_sensor_mask = lvts_ctrl_data->valid_sensor_mask; From 57eda47bd14b0c2876f2db42e757c57b7a671965 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Mon, 28 Jul 2025 15:18:23 +0300 Subject: [PATCH 2857/3206] thermal/drivers/qcom: Make LMH select QCOM_SCM The QCOM_SCM symbol is not user-visible, so it makes little sense to depend on it. Make the LMH driver select QCOM_SCM as all other drivers do and, as the dependency is now correctly handled, enable || COMPILE_TEST in order to include the driver in a broader set of build tests. Fixes: 9e5a4fb84230 ("thermal/drivers/qcom/lmh: make QCOM_LMH depends on QCOM_SCM") Signed-off-by: Dmitry Baryshkov Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250728-lmh-scm-v2-1-33bc58388ca5@oss.qualcomm.com --- drivers/thermal/qcom/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/qcom/Kconfig b/drivers/thermal/qcom/Kconfig index 2c7f3f9a26ebbb..a6bb01082ec697 100644 --- a/drivers/thermal/qcom/Kconfig +++ b/drivers/thermal/qcom/Kconfig @@ -34,7 +34,8 @@ config QCOM_SPMI_TEMP_ALARM config QCOM_LMH tristate "Qualcomm Limits Management Hardware" - depends on ARCH_QCOM && QCOM_SCM + depends on ARCH_QCOM || COMPILE_TEST + select QCOM_SCM help This enables initialization of Qualcomm limits management hardware(LMh). LMh allows for hardware-enforced mitigation for cpus based on From b50b2c53f98fcdb6957e184eb488c16502db9575 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Mon, 28 Jul 2025 15:18:24 +0300 Subject: [PATCH 2858/3206] thermal/drivers/qcom/lmh: Add missing IRQ includes As reported by LKP, the Qualcomm LMH driver needs to include several IRQ-related headers, which declare the necessary IRQ functionality. Currently the driver builds on ARM64 platforms, where the headers are pulled in implicitly by other headers, but fails to build on other platforms.
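The IRQ-related headers in question are, presumably, the kernel's generic IRQ declarations, along the lines of (the exact set is an assumption here; the upstream commit is authoritative):

#include <linux/irq.h>        /* irq_* accessors and irq_data helpers */
#include <linux/irqdomain.h>  /* IRQ domain API used by the LMh driver */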
Fixes: 53bca371cdf7 ("thermal/drivers/qcom: Add support for LMh driver") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507270042.KdK0KKht-lkp@intel.com/ Signed-off-by: Dmitry Baryshkov Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250728-lmh-scm-v2-2-33bc58388ca5@oss.qualcomm.com --- drivers/thermal/qcom/lmh.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/thermal/qcom/lmh.c b/drivers/thermal/qcom/lmh.c index 75eaa9a68ab8aa..c681a3c89ffa0b 100644 --- a/drivers/thermal/qcom/lmh.c +++ b/drivers/thermal/qcom/lmh.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #include #include #include From 14b7ea27bd0fcbaf06f3df1544dcbced43e9fb1b Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Thu, 10 Jul 2025 23:24:25 +0530 Subject: [PATCH 2859/3206] drivers/thermal/qcom/lmh: Fix incorrect error message The driver was printing the wrong error message, referring to the ARM threshold thermal trip, when setting the LOW threshold thermal trip failed. Fix this incorrect error message. Reviewed-by: Konrad Dybcio Signed-off-by: Sumeet Pawnikar Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20250710175426.5789-1-sumeet4linux@gmail.com --- drivers/thermal/qcom/lmh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/qcom/lmh.c b/drivers/thermal/qcom/lmh.c index c681a3c89ffa0b..ddadcfada5136c 100644 --- a/drivers/thermal/qcom/lmh.c +++ b/drivers/thermal/qcom/lmh.c @@ -206,7 +206,7 @@ static int lmh_probe(struct platform_device *pdev) ret = qcom_scm_lmh_dcvsh(LMH_SUB_FN_THERMAL, LMH_TH_LOW_THRESHOLD, temp_low, LMH_NODE_DCVS, node_id, 0); if (ret) { - dev_err(dev, "Error setting thermal ARM threshold%d\n", ret); + dev_err(dev, "Error setting thermal LOW threshold%d\n", ret); return ret; } From 13eac80a2db125c56a13938216c23202cf7f59ac Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 7 Sep 2025 17:41:38 +0200 Subject: [PATCH 2860/3206] thermal/drivers/rcar_gen3: Fix comment typo Fix a comment typo: "mili Celsius" should be "millidegree Celsius". This aligns the comment with another comment later in the same function. No functional change. Signed-off-by: Marek Vasut Link: https://lore.kernel.org/r/20250907154148.171496-1-marek.vasut+renesas@mailbox.org Signed-off-by: Daniel Lezcano --- drivers/thermal/renesas/rcar_gen3_thermal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/renesas/rcar_gen3_thermal.c b/drivers/thermal/renesas/rcar_gen3_thermal.c index 01858e72f4e0ab..e4201098556b28 100644 --- a/drivers/thermal/renesas/rcar_gen3_thermal.c +++ b/drivers/thermal/renesas/rcar_gen3_thermal.c @@ -171,7 +171,7 @@ static int rcar_gen3_thermal_get_temp(struct thermal_zone_device *tz, int *temp) const struct equation_set_coef *coef; int adj, decicelsius, reg, thcode; - /* Read register and convert to mili Celsius */ + /* Read register and convert to millidegree Celsius */ reg = rcar_gen3_thermal_read(tsc, REG_GEN3_TEMP) & CTEMP_MASK; if (reg < tsc->thcode[1]) { From ec4be3165e4c6af3f03c8a603af5ea118c95dfb4 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 5 Sep 2025 21:32:56 +0200 Subject: [PATCH 2861/3206] thermal/drivers/rcar_gen3: Document Gen4 support in Kconfig entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The R-Car Gen3 thermal driver supports both R-Car Gen3 and Gen4 SoCs. Update the Kconfig entry.
Signed-off-by: Marek Vasut Reviewed-by: Niklas Söderlund Link: https://lore.kernel.org/r/20250905193322.148115-1-marek.vasut+renesas@mailbox.org Signed-off-by: Daniel Lezcano --- drivers/thermal/renesas/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/renesas/Kconfig b/drivers/thermal/renesas/Kconfig index dcf5fc5ae08e47..f4af8c7f28b05d 100644 --- a/drivers/thermal/renesas/Kconfig +++ b/drivers/thermal/renesas/Kconfig @@ -10,13 +10,13 @@ config RCAR_THERMAL thermal framework. config RCAR_GEN3_THERMAL - tristate "Renesas R-Car Gen3 and RZ/G2 thermal driver" + tristate "Renesas R-Car Gen3/Gen4 and RZ/G2 thermal driver" depends on ARCH_RENESAS || COMPILE_TEST depends on HAS_IOMEM depends on OF help - Enable this to plug the R-Car Gen3 or RZ/G2 thermal sensor driver into - the Linux thermal framework. + Enable this to plug the R-Car Gen3/Gen4 or RZ/G2 thermal sensor + driver into the Linux thermal framework. config RZG2L_THERMAL tristate "Renesas RZ/G2L thermal driver" From aa0025154855b10b68ada4e60fd99623d51252cc Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Thu, 28 Aug 2025 08:51:00 +0300 Subject: [PATCH 2862/3206] dt-bindings: thermal: Document Tegra114 SOCTHERM Thermal Management System Document the SOCTHERM Thermal Management System found in the Tegra114 (Tegra 4) SoC. Signed-off-by: Svyatoslav Ryhel Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250828055104.8073-3-clamor95@gmail.com Signed-off-by: Daniel Lezcano --- .../devicetree/bindings/thermal/nvidia,tegra124-soctherm.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.yaml b/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.yaml index cf47a1f3b3847d..25efedced58424 100644 --- a/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.yaml +++ b/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.yaml @@ -18,6 +18,7 @@ description: The SOCTHERM IP block contains thermal sensors, support for properties: compatible: enum: + - nvidia,tegra114-soctherm - nvidia,tegra124-soctherm - nvidia,tegra132-soctherm - nvidia,tegra210-soctherm @@ -206,6 +207,7 @@ allOf: compatible: contains: enum: + - nvidia,tegra114-soctherm - nvidia,tegra124-soctherm - nvidia,tegra210-soctherm - nvidia,tegra210b01-soctherm From 48fc33b95159badfca03d0c48aced60bf9f9325a Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Thu, 28 Aug 2025 08:51:01 +0300 Subject: [PATCH 2863/3206] thermal/drivers/tegra/soctherm-fuse: Prepare calibration for Tegra114 support The Tegra114 has a different fuse calibration register layout and address compared to other Tegra SoCs, requiring the SOCTHERM shift, mask, register address, and nominal FT calibration value to be configurable.
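Concretely, extracting one signed trim field with a per-SoC mask/shift pair looks roughly like this (a sketch; the descriptor fields mirror the ones added in the diff below, and the function name is invented):

static s32 example_extract_shift_cp(const struct tegra_soctherm_fuse *tfuse, u32 val)
{
	/* Isolate the field, then sign-extend from its 6-bit width. */
	u32 raw = (val & tfuse->fuse_shift_cp_mask) >> tfuse->fuse_shift_cp_shift;

	return sign_extend32(raw, 5);
}

With the Tegra114 layout, where SHIFT_CP sits at bits [15:10], fuse_shift_cp_mask would be 0x3f << 10 and fuse_shift_cp_shift would be 10.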
Signed-off-by: Svyatoslav Ryhel Reviewed-by: Mikko Perttunen Link: https://lore.kernel.org/r/20250828055104.8073-4-clamor95@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/tegra/soctherm-fuse.c | 18 ++++++++++++------ drivers/thermal/tegra/soctherm.h | 7 ++++++- drivers/thermal/tegra/tegra124-soctherm.c | 4 ++++ drivers/thermal/tegra/tegra132-soctherm.c | 4 ++++ drivers/thermal/tegra/tegra210-soctherm.c | 4 ++++ 5 files changed, 30 insertions(+), 7 deletions(-) diff --git a/drivers/thermal/tegra/soctherm-fuse.c b/drivers/thermal/tegra/soctherm-fuse.c index 190f95280e0b82..8d37cd8c9122b3 100644 --- a/drivers/thermal/tegra/soctherm-fuse.c +++ b/drivers/thermal/tegra/soctherm-fuse.c @@ -9,15 +9,12 @@ #include "soctherm.h" -#define NOMINAL_CALIB_FT 105 #define NOMINAL_CALIB_CP 25 #define FUSE_TSENSOR_CALIB_CP_TS_BASE_MASK 0x1fff #define FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK (0x1fff << 13) #define FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT 13 -#define FUSE_TSENSOR_COMMON 0x180 /* * Tegra210: Layout of bits in FUSE_TSENSOR_COMMON: * 3 2 1 0 * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | BASE_FT | BASE_CP | SHFT_FT | SHIFT_CP | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * - * Tegra12x, etc: + * Tegra124: * In chips prior to Tegra210, this fuse was incorrectly sized as 26 bits, * and didn't hold SHIFT_CP in [31:26]. Therefore these missing six bits * were obtained via the FUSE_SPARE_REALIGNMENT_REG register [5:0]. * @@ -44,6 +41,13 @@ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |---------------------------------------------------| SHIFT_CP | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Tegra114: Layout of bits in FUSE_TSENSOR_COMMON aka FUSE_VSENSOR_CALIB: + * 3 2 1 0 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHFT_FT | BASE_FT | SHIFT_CP | BASE_CP | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ #define CALIB_COEFFICIENT 1000000LL @@ -77,7 +81,7 @@ int tegra_calc_shared_calib(const struct tegra_soctherm_fuse *tfuse, s32 shifted_cp, shifted_ft; int err; - err = tegra_fuse_readl(FUSE_TSENSOR_COMMON, &val); + err = tegra_fuse_readl(tfuse->fuse_common_reg, &val); if (err) return err; @@ -96,10 +100,12 @@ int tegra_calc_shared_calib(const struct tegra_soctherm_fuse *tfuse, return err; } - shifted_cp = sign_extend32(val, 5); + shifted_cp = (val & tfuse->fuse_shift_cp_mask) >> + tfuse->fuse_shift_cp_shift; + shifted_cp = sign_extend32(shifted_cp, 5); shared->actual_temp_cp = 2 * NOMINAL_CALIB_CP + shifted_cp; - shared->actual_temp_ft = 2 * NOMINAL_CALIB_FT + shifted_ft; + shared->actual_temp_ft = 2 * tfuse->nominal_calib_ft + shifted_ft; return 0; } diff --git a/drivers/thermal/tegra/soctherm.h b/drivers/thermal/tegra/soctherm.h index 70501e73d58623..083388094fd4ce 100644 --- a/drivers/thermal/tegra/soctherm.h +++ b/drivers/thermal/tegra/soctherm.h @@ -56,6 +56,9 @@ #define SENSOR_TEMP2_MEM_TEMP_MASK (0xffff << 16) #define SENSOR_TEMP2_PLLX_TEMP_MASK 0xffff +#define FUSE_VSENSOR_CALIB 0x08c +#define FUSE_TSENSOR_COMMON 0x180 + /** * struct tegra_tsensor_group - SOC_THERM sensor group data * @name: short name of the temperature sensor group @@ -109,9 +112,11 @@ struct tsensor_group_thermtrips { struct tegra_soctherm_fuse { u32 fuse_base_cp_mask, fuse_base_cp_shift; + u32 fuse_shift_cp_mask, fuse_shift_cp_shift; u32 fuse_base_ft_mask, fuse_base_ft_shift; u32 fuse_shift_ft_mask, fuse_shift_ft_shift; - u32 fuse_spare_realignment; + u32 fuse_common_reg,
fuse_spare_realignment; + u32 nominal_calib_ft; }; struct tsensor_shared_calib { diff --git a/drivers/thermal/tegra/tegra124-soctherm.c b/drivers/thermal/tegra/tegra124-soctherm.c index 20ad27f4d1a161..d86acff1b234de 100644 --- a/drivers/thermal/tegra/tegra124-soctherm.c +++ b/drivers/thermal/tegra/tegra124-soctherm.c @@ -200,11 +200,15 @@ static const struct tegra_tsensor tegra124_tsensors[] = { static const struct tegra_soctherm_fuse tegra124_soctherm_fuse = { .fuse_base_cp_mask = 0x3ff, .fuse_base_cp_shift = 0, + .fuse_shift_cp_mask = 0x3f, + .fuse_shift_cp_shift = 0, .fuse_base_ft_mask = 0x7ff << 10, .fuse_base_ft_shift = 10, .fuse_shift_ft_mask = 0x1f << 21, .fuse_shift_ft_shift = 21, + .fuse_common_reg = FUSE_TSENSOR_COMMON, .fuse_spare_realignment = 0x1fc, + .nominal_calib_ft = 105, }; const struct tegra_soctherm_soc tegra124_soctherm = { diff --git a/drivers/thermal/tegra/tegra132-soctherm.c b/drivers/thermal/tegra/tegra132-soctherm.c index b76308fdad9e26..64c0363b97171f 100644 --- a/drivers/thermal/tegra/tegra132-soctherm.c +++ b/drivers/thermal/tegra/tegra132-soctherm.c @@ -200,11 +200,15 @@ static struct tegra_tsensor tegra132_tsensors[] = { static const struct tegra_soctherm_fuse tegra132_soctherm_fuse = { .fuse_base_cp_mask = 0x3ff, .fuse_base_cp_shift = 0, + .fuse_shift_cp_mask = 0x3f, + .fuse_shift_cp_shift = 0, .fuse_base_ft_mask = 0x7ff << 10, .fuse_base_ft_shift = 10, .fuse_shift_ft_mask = 0x1f << 21, .fuse_shift_ft_shift = 21, + .fuse_common_reg = FUSE_TSENSOR_COMMON, .fuse_spare_realignment = 0x1fc, + .nominal_calib_ft = 105, }; const struct tegra_soctherm_soc tegra132_soctherm = { diff --git a/drivers/thermal/tegra/tegra210-soctherm.c b/drivers/thermal/tegra/tegra210-soctherm.c index d0ff793f18c561..f6e1493f0202ff 100644 --- a/drivers/thermal/tegra/tegra210-soctherm.c +++ b/drivers/thermal/tegra/tegra210-soctherm.c @@ -201,11 +201,15 @@ static const struct tegra_tsensor tegra210_tsensors[] = { static const struct tegra_soctherm_fuse tegra210_soctherm_fuse = { .fuse_base_cp_mask = 0x3ff << 11, .fuse_base_cp_shift = 11, + .fuse_shift_cp_mask = 0x3f, + .fuse_shift_cp_shift = 0, .fuse_base_ft_mask = 0x7ff << 21, .fuse_base_ft_shift = 21, .fuse_shift_ft_mask = 0x1f << 6, .fuse_shift_ft_shift = 6, + .fuse_common_reg = FUSE_TSENSOR_COMMON, .fuse_spare_realignment = 0, + .nominal_calib_ft = 105, }; static struct tsensor_group_thermtrips tegra210_tsensor_thermtrips[] = { From 10e1dcb62a7e874af43e7dfbef13ef8e3a2ad4a9 Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Thu, 28 Aug 2025 08:51:02 +0300 Subject: [PATCH 2864/3206] dt-bindings: thermal: add Tegra114 soctherm header Add a header for the Tegra114 SOCTHERM device tree node. Signed-off-by: Svyatoslav Ryhel Acked-by: Conor Dooley Reviewed-by: Mikko Perttunen Link: https://lore.kernel.org/r/20250828055104.8073-5-clamor95@gmail.com Signed-off-by: Daniel Lezcano --- .../dt-bindings/thermal/tegra114-soctherm.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/dt-bindings/thermal/tegra114-soctherm.h diff --git a/include/dt-bindings/thermal/tegra114-soctherm.h b/include/dt-bindings/thermal/tegra114-soctherm.h new file mode 100644 index 00000000000000..b766a61cd1ce74 --- /dev/null +++ b/include/dt-bindings/thermal/tegra114-soctherm.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * This header provides constants for binding nvidia,tegra114-soctherm.
+ */ + +#ifndef _DT_BINDINGS_THERMAL_TEGRA114_SOCTHERM_H +#define _DT_BINDINGS_THERMAL_TEGRA114_SOCTHERM_H + +#define TEGRA114_SOCTHERM_SENSOR_CPU 0 +#define TEGRA114_SOCTHERM_SENSOR_MEM 1 +#define TEGRA114_SOCTHERM_SENSOR_GPU 2 +#define TEGRA114_SOCTHERM_SENSOR_PLLX 3 + +#define TEGRA114_SOCTHERM_THROT_LEVEL_NONE 0 +#define TEGRA114_SOCTHERM_THROT_LEVEL_LOW 1 +#define TEGRA114_SOCTHERM_THROT_LEVEL_MED 2 +#define TEGRA114_SOCTHERM_THROT_LEVEL_HIGH 3 + +#endif From 9d522a877b6f1a8a2675826d14d2edaa9712db9a Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Thu, 28 Aug 2025 08:51:03 +0300 Subject: [PATCH 2865/3206] thermal/drivers/tegra: Add Tegra114 specific SOCTHERM driver Add Tegra114 specific SOCTHERM driver. Signed-off-by: Svyatoslav Ryhel Reviewed-by: Mikko Perttunen Link: https://lore.kernel.org/r/20250828055104.8073-6-clamor95@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/tegra/Makefile | 1 + drivers/thermal/tegra/soctherm.c | 13 ++ drivers/thermal/tegra/soctherm.h | 4 + drivers/thermal/tegra/tegra114-soctherm.c | 209 ++++++++++++++++++++++ 4 files changed, 227 insertions(+) create mode 100644 drivers/thermal/tegra/tegra114-soctherm.c diff --git a/drivers/thermal/tegra/Makefile b/drivers/thermal/tegra/Makefile index eb27d194c58358..9b3e91f7fb97bd 100644 --- a/drivers/thermal/tegra/Makefile +++ b/drivers/thermal/tegra/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_TEGRA_BPMP_THERMAL) += tegra-bpmp-thermal.o obj-$(CONFIG_TEGRA30_TSENSOR) += tegra30-tsensor.o tegra-soctherm-y := soctherm.o soctherm-fuse.o +tegra-soctherm-$(CONFIG_ARCH_TEGRA_114_SOC) += tegra114-soctherm.o tegra-soctherm-$(CONFIG_ARCH_TEGRA_124_SOC) += tegra124-soctherm.o tegra-soctherm-$(CONFIG_ARCH_TEGRA_132_SOC) += tegra132-soctherm.o tegra-soctherm-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-soctherm.o diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c index 53a5c649f4b146..5d26b52beaba7a 100644 --- a/drivers/thermal/tegra/soctherm.c +++ b/drivers/thermal/tegra/soctherm.c @@ -31,6 +31,7 @@ #include #include +#include #include #include "../thermal_core.h" @@ -357,6 +358,12 @@ struct soctherm_oc_irq_chip_data { static struct soctherm_oc_irq_chip_data soc_irq_cdata; +/* Ensure that TEGRA114_* and TEGRA124_* counterparts are equal */ +static_assert(TEGRA114_SOCTHERM_SENSOR_CPU == TEGRA124_SOCTHERM_SENSOR_CPU); +static_assert(TEGRA114_SOCTHERM_SENSOR_MEM == TEGRA124_SOCTHERM_SENSOR_MEM); +static_assert(TEGRA114_SOCTHERM_SENSOR_GPU == TEGRA124_SOCTHERM_SENSOR_GPU); +static_assert(TEGRA114_SOCTHERM_SENSOR_PLLX == TEGRA124_SOCTHERM_SENSOR_PLLX); + /** * ccroc_writel() - writes a value to a CCROC register * @ts: pointer to a struct tegra_soctherm @@ -2045,6 +2052,12 @@ static void soctherm_init(struct platform_device *pdev) } static const struct of_device_id tegra_soctherm_of_match[] = { +#ifdef CONFIG_ARCH_TEGRA_114_SOC + { + .compatible = "nvidia,tegra114-soctherm", + .data = &tegra114_soctherm, + }, +#endif #ifdef CONFIG_ARCH_TEGRA_124_SOC { .compatible = "nvidia,tegra124-soctherm", diff --git a/drivers/thermal/tegra/soctherm.h b/drivers/thermal/tegra/soctherm.h index 083388094fd4ce..aa4af9268b05d8 100644 --- a/drivers/thermal/tegra/soctherm.h +++ b/drivers/thermal/tegra/soctherm.h @@ -142,6 +142,10 @@ int tegra_calc_tsensor_calib(const struct tegra_tsensor *sensor, const struct tsensor_shared_calib *shared, u32 *calib); +#ifdef CONFIG_ARCH_TEGRA_114_SOC +extern const struct tegra_soctherm_soc tegra114_soctherm; +#endif + #ifdef CONFIG_ARCH_TEGRA_124_SOC extern const struct 
tegra_soctherm_soc tegra124_soctherm; #endif diff --git a/drivers/thermal/tegra/tegra114-soctherm.c b/drivers/thermal/tegra/tegra114-soctherm.c new file mode 100644 index 00000000000000..688104f2805280 --- /dev/null +++ b/drivers/thermal/tegra/tegra114-soctherm.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, Svyatoslav Ryhel + */ + +#include +#include + +#include + +#include "soctherm.h" + +#define TEGRA114_THERMTRIP_ANY_EN_MASK (0x1 << 28) +#define TEGRA114_THERMTRIP_MEM_EN_MASK (0x1 << 27) +#define TEGRA114_THERMTRIP_GPU_EN_MASK (0x1 << 26) +#define TEGRA114_THERMTRIP_CPU_EN_MASK (0x1 << 25) +#define TEGRA114_THERMTRIP_TSENSE_EN_MASK (0x1 << 24) +#define TEGRA114_THERMTRIP_GPUMEM_THRESH_MASK (0xff << 16) +#define TEGRA114_THERMTRIP_CPU_THRESH_MASK (0xff << 8) +#define TEGRA114_THERMTRIP_TSENSE_THRESH_MASK 0xff + +#define TEGRA114_THERMCTL_LVL0_UP_THRESH_MASK (0xff << 17) +#define TEGRA114_THERMCTL_LVL0_DN_THRESH_MASK (0xff << 9) + +#define TEGRA114_THRESH_GRAIN 1000 +#define TEGRA114_BPTT 8 + +static const struct tegra_tsensor_configuration tegra114_tsensor_config = { + .tall = 16300, + .tiddq_en = 1, + .ten_count = 1, + .tsample = 163, + .tsample_ate = 655, +}; + +static const struct tegra_tsensor_group tegra114_tsensor_group_cpu = { + .id = TEGRA114_SOCTHERM_SENSOR_CPU, + .name = "cpu", + .sensor_temp_offset = SENSOR_TEMP1, + .sensor_temp_mask = SENSOR_TEMP1_CPU_TEMP_MASK, + .pdiv = 10, + .pdiv_ate = 10, + .pdiv_mask = SENSOR_PDIV_CPU_MASK, + .pllx_hotspot_diff = 6, + .pllx_hotspot_mask = SENSOR_HOTSPOT_CPU_MASK, + .thermtrip_any_en_mask = TEGRA114_THERMTRIP_ANY_EN_MASK, + .thermtrip_enable_mask = TEGRA114_THERMTRIP_CPU_EN_MASK, + .thermtrip_threshold_mask = TEGRA114_THERMTRIP_CPU_THRESH_MASK, + .thermctl_isr_mask = THERM_IRQ_CPU_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_CPU, + .thermctl_lvl0_up_thresh_mask = TEGRA114_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA114_THERMCTL_LVL0_DN_THRESH_MASK, +}; + +static const struct tegra_tsensor_group tegra114_tsensor_group_gpu = { + .id = TEGRA114_SOCTHERM_SENSOR_GPU, + .name = "gpu", + .sensor_temp_offset = SENSOR_TEMP1, + .sensor_temp_mask = SENSOR_TEMP1_GPU_TEMP_MASK, + .pdiv = 10, + .pdiv_ate = 10, + .pdiv_mask = SENSOR_PDIV_GPU_MASK, + .pllx_hotspot_diff = 6, + .pllx_hotspot_mask = SENSOR_HOTSPOT_GPU_MASK, + .thermtrip_any_en_mask = TEGRA114_THERMTRIP_ANY_EN_MASK, + .thermtrip_enable_mask = TEGRA114_THERMTRIP_GPU_EN_MASK, + .thermtrip_threshold_mask = TEGRA114_THERMTRIP_GPUMEM_THRESH_MASK, + .thermctl_isr_mask = THERM_IRQ_GPU_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_GPU, + .thermctl_lvl0_up_thresh_mask = TEGRA114_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA114_THERMCTL_LVL0_DN_THRESH_MASK, +}; + +static const struct tegra_tsensor_group tegra114_tsensor_group_pll = { + .id = TEGRA114_SOCTHERM_SENSOR_PLLX, + .name = "pll", + .sensor_temp_offset = SENSOR_TEMP2, + .sensor_temp_mask = SENSOR_TEMP2_PLLX_TEMP_MASK, + .pdiv = 10, + .pdiv_ate = 10, + .pdiv_mask = SENSOR_PDIV_PLLX_MASK, + .thermtrip_any_en_mask = TEGRA114_THERMTRIP_ANY_EN_MASK, + .thermtrip_enable_mask = TEGRA114_THERMTRIP_TSENSE_EN_MASK, + .thermtrip_threshold_mask = TEGRA114_THERMTRIP_TSENSE_THRESH_MASK, + .thermctl_isr_mask = THERM_IRQ_TSENSE_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_TSENSE, + .thermctl_lvl0_up_thresh_mask = TEGRA114_THERMCTL_LVL0_UP_THRESH_MASK, + 
.thermctl_lvl0_dn_thresh_mask = TEGRA114_THERMCTL_LVL0_DN_THRESH_MASK, +}; + +static const struct tegra_tsensor_group tegra114_tsensor_group_mem = { + .id = TEGRA114_SOCTHERM_SENSOR_MEM, + .name = "mem", + .sensor_temp_offset = SENSOR_TEMP2, + .sensor_temp_mask = SENSOR_TEMP2_MEM_TEMP_MASK, + .pdiv = 10, + .pdiv_ate = 10, + .pdiv_mask = SENSOR_PDIV_MEM_MASK, + .pllx_hotspot_diff = 0, + .pllx_hotspot_mask = SENSOR_HOTSPOT_MEM_MASK, + .thermtrip_any_en_mask = TEGRA114_THERMTRIP_ANY_EN_MASK, + .thermtrip_enable_mask = TEGRA114_THERMTRIP_MEM_EN_MASK, + .thermtrip_threshold_mask = TEGRA114_THERMTRIP_GPUMEM_THRESH_MASK, + .thermctl_isr_mask = THERM_IRQ_MEM_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_MEM, + .thermctl_lvl0_up_thresh_mask = TEGRA114_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA114_THERMCTL_LVL0_DN_THRESH_MASK, +}; + +static const struct tegra_tsensor_group *tegra114_tsensor_groups[] = { + &tegra114_tsensor_group_cpu, + &tegra114_tsensor_group_gpu, + &tegra114_tsensor_group_pll, + &tegra114_tsensor_group_mem, +}; + +static const struct tegra_tsensor tegra114_tsensors[] = { + { + .name = "cpu0", + .base = 0xc0, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x098, + .fuse_corr_alpha = 1196400, + .fuse_corr_beta = -13600000, + .group = &tegra114_tsensor_group_cpu, + }, { + .name = "cpu1", + .base = 0xe0, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x084, + .fuse_corr_alpha = 1196400, + .fuse_corr_beta = -13600000, + .group = &tegra114_tsensor_group_cpu, + }, { + .name = "cpu2", + .base = 0x100, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x088, + .fuse_corr_alpha = 1196400, + .fuse_corr_beta = -13600000, + .group = &tegra114_tsensor_group_cpu, + }, { + .name = "cpu3", + .base = 0x120, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x12c, + .fuse_corr_alpha = 1196400, + .fuse_corr_beta = -13600000, + .group = &tegra114_tsensor_group_cpu, + }, { + .name = "mem0", + .base = 0x140, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x158, + .fuse_corr_alpha = 1000000, + .fuse_corr_beta = 0, + .group = &tegra114_tsensor_group_mem, + }, { + .name = "mem1", + .base = 0x160, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x15c, + .fuse_corr_alpha = 1000000, + .fuse_corr_beta = 0, + .group = &tegra114_tsensor_group_mem, + }, { + .name = "gpu", + .base = 0x180, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x154, + .fuse_corr_alpha = 1124500, + .fuse_corr_beta = -9793100, + .group = &tegra114_tsensor_group_gpu, + }, { + .name = "pllx", + .base = 0x1a0, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x160, + .fuse_corr_alpha = 1224200, + .fuse_corr_beta = -14665000, + .group = &tegra114_tsensor_group_pll, + }, +}; + +static const struct tegra_soctherm_fuse tegra114_soctherm_fuse = { + .fuse_base_cp_mask = 0x3ff, + .fuse_base_cp_shift = 0, + .fuse_shift_cp_mask = 0x3f << 10, + .fuse_shift_cp_shift = 10, + .fuse_base_ft_mask = 0x7ff << 16, + .fuse_base_ft_shift = 16, + .fuse_shift_ft_mask = 0x1f << 27, + .fuse_shift_ft_shift = 27, + .fuse_common_reg = FUSE_VSENSOR_CALIB, + .fuse_spare_realignment = 0, + .nominal_calib_ft = 90, +}; + +const struct tegra_soctherm_soc tegra114_soctherm = { + .tsensors = tegra114_tsensors, + .num_tsensors = ARRAY_SIZE(tegra114_tsensors), + .ttgs = tegra114_tsensor_groups, + .num_ttgs = ARRAY_SIZE(tegra114_tsensor_groups), + .tfuse = &tegra114_soctherm_fuse, + .thresh_grain = TEGRA114_THRESH_GRAIN, + .bptt = TEGRA114_BPTT, 
+ .use_ccroc = false, +}; From 117bdda24d68be7a61d460e1ee372458c41e787e Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 11 Sep 2025 09:00:13 +0200 Subject: [PATCH 2866/3206] thermal/drivers/rcar_gen3: Fix mapping SoCs to generic Gen4 entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit S4 was added first, so it was assumed to be the blueprint for R-Car Gen4. It has now turned out that S4 is a special mix of Gen3 and Gen4, while V4H and V4M are the truly similar ones, as confirmed by HW engineers. So, rename the S4 entry to be specific instead of generic. Rename the V4H entry to be the new generic one, so that V4M will now use it as well. Signed-off-by: Wolfram Sang Reviewed-by: Niklas Söderlund Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250911070254.2214-2-wsa+renesas@sang-engineering.com Signed-off-by: Daniel Lezcano --- drivers/thermal/renesas/rcar_gen3_thermal.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/thermal/renesas/rcar_gen3_thermal.c b/drivers/thermal/renesas/rcar_gen3_thermal.c index e4201098556b28..3223de238d0144 100644 --- a/drivers/thermal/renesas/rcar_gen3_thermal.c +++ b/drivers/thermal/renesas/rcar_gen3_thermal.c @@ -371,7 +371,7 @@ static const struct rcar_gen3_thermal_fuse_default rcar_gen3_thermal_fuse_defaul }, }; -static const struct rcar_gen3_thermal_fuse_default rcar_gen3_thermal_fuse_default_info_v4h = { +static const struct rcar_gen3_thermal_fuse_default rcar_gen3_thermal_fuse_default_info_gen4 = { .ptat = { 3274, 2164, 985 }, .thcodes = { /* All four THS units share the same trimming */ { 3218, 2617, 1980 }, @@ -397,7 +397,7 @@ static const struct rcar_thermal_info rcar_gen3_thermal_info = { .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen3, }; -static const struct rcar_thermal_info rcar_gen4_thermal_info = { +static const struct rcar_thermal_info rcar_s4_thermal_info = { .scale = 167, .adj_below = -41, .adj_above = 126, @@ -405,12 +405,12 @@ static const struct rcar_thermal_info rcar_gen4_thermal_info = { .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen3, }; -static const struct rcar_thermal_info rcar_v4h_thermal_info = { +static const struct rcar_thermal_info rcar_gen4_thermal_info = { .scale = 167, .adj_below = -41, .adj_above = 126, .fuses = &rcar_gen3_thermal_fuse_info_gen4, - .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_v4h, + .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen4, }; static const struct of_device_id rcar_gen3_thermal_dt_ids[] = { @@ -452,11 +452,11 @@ static const struct of_device_id rcar_gen3_thermal_dt_ids[] = { }, { .compatible = "renesas,r8a779f0-thermal", - .data = &rcar_gen4_thermal_info, + .data = &rcar_s4_thermal_info, }, { .compatible = "renesas,r8a779g0-thermal", - .data = &rcar_v4h_thermal_info, + .data = &rcar_gen4_thermal_info, }, { .compatible = "renesas,r8a779h0-thermal", From 55173287e7a2dbab8c920f7d5db8ac515ab82b4e Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 28 Aug 2025 14:40:42 +0200 Subject: [PATCH 2867/3206] thermal/drivers/k3_j72xx_bandgap: Register sensors with hwmon Make the sensors available in the hwmon subsystem (if CONFIG_THERMAL_HWMON is enabled).
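The wiring is the usual one-liner after zone registration, roughly (a sketch; variable names follow the diff below, and zone_ops stands in for the driver's thermal_zone_device_ops):

ti_thermal = devm_thermal_of_zone_register(bgp->dev, id, data, &zone_ops);
if (!IS_ERR(ti_thermal))
	devm_thermal_add_hwmon_sysfs(bgp->dev, ti_thermal);

devm_thermal_add_hwmon_sysfs() is a no-op stub when CONFIG_THERMAL_HWMON is disabled, so no guard is needed at the call site.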
Signed-off-by: Michael Walle Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20250828124042.1680853-1-mwalle@kernel.org Signed-off-by: Daniel Lezcano --- drivers/thermal/k3_j72xx_bandgap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c index a36289e61315a2..d9ec3bf194966c 100644 --- a/drivers/thermal/k3_j72xx_bandgap.c +++ b/drivers/thermal/k3_j72xx_bandgap.c @@ -20,6 +20,8 @@ #include #include +#include "thermal_hwmon.h" + #define K3_VTM_DEVINFO_PWR0_OFFSET 0x4 #define K3_VTM_DEVINFO_PWR0_TEMPSENS_CT_MASK 0xf0 #define K3_VTM_TMPSENS0_CTRL_OFFSET 0x300 @@ -513,6 +515,8 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) ret = PTR_ERR(ti_thermal); goto err_free_ref_table; } + + devm_thermal_add_hwmon_sysfs(bgp->dev, ti_thermal); } platform_set_drvdata(pdev, bgp); From a3152e5c742ca2557c5cd0e5a9e6ca6b9a92df93 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Sun, 10 Aug 2025 15:21:22 +0300 Subject: [PATCH 2868/3206] dt-bindings: thermal: r9a08g045-tsu: Document the TSU unit The Renesas RZ/G3S SoC includes a Thermal Sensor Unit (TSU) block designed to measure the junction temperature. The temperature is measured using the RZ/G3S ADC, with a dedicated ADC channel directly connected to the TSU. Add documentation for it. Reviewed-by: Rob Herring (Arm) Reviewed-by: Geert Uytterhoeven Signed-off-by: Claudiu Beznea Tested-by: Wolfram Sang Link: https://lore.kernel.org/r/20250810122125.792966-2-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Daniel Lezcano --- .../thermal/renesas,r9a08g045-tsu.yaml | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml diff --git a/Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml b/Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml new file mode 100644 index 00000000000000..573e2b9d37524b --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/thermal/renesas,r9a08g045-tsu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas RZ/G3S Thermal Sensor Unit + +description: + The thermal sensor unit (TSU) measures the temperature(Tj) inside + the LSI. 
+ +maintainers: + - Claudiu Beznea + +$ref: thermal-sensor.yaml# + +properties: + compatible: + const: renesas,r9a08g045-tsu + + reg: + maxItems: 1 + + clocks: + items: + - description: TSU module clock + + power-domains: + maxItems: 1 + + resets: + items: + - description: TSU module reset + + io-channels: + items: + - description: ADC channel which reports the TSU temperature + + io-channel-names: + items: + - const: tsu + + "#thermal-sensor-cells": + const: 0 + +required: + - compatible + - reg + - clocks + - power-domains + - resets + - io-channels + - io-channel-names + - '#thermal-sensor-cells' + +additionalProperties: false + +examples: + - | + #include + + tsu: thermal@10059000 { + compatible = "renesas,r9a08g045-tsu"; + reg = <0x10059000 0x1000>; + clocks = <&cpg CPG_MOD R9A08G045_TSU_PCLK>; + resets = <&cpg R9A08G045_TSU_PRESETN>; + power-domains = <&cpg>; + #thermal-sensor-cells = <0>; + io-channels = <&adc 8>; + io-channel-names = "tsu"; + }; + + thermal-zones { + cpu-thermal { + polling-delay-passive = <250>; + polling-delay = <1000>; + thermal-sensors = <&tsu>; + + trips { + sensor_crit: sensor-crit { + temperature = <125000>; + hysteresis = <1000>; + type = "critical"; + }; + target: trip-point { + temperature = <100000>; + hysteresis = <1000>; + type = "passive"; + }; + }; + }; + }; From dc095b37b09e1abbdb6daf5e69dbd0cd5265087e Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Sun, 10 Aug 2025 15:21:23 +0300 Subject: [PATCH 2869/3206] thermal/drivers/renesas/rzg3s: Add thermal driver for the Renesas RZ/G3S SoC The Renesas RZ/G3S SoC features a Thermal Sensor Unit (TSU) that reports the junction temperature. The temperature is reported through a dedicated ADC channel. Add a driver for the Renesas RZ/G3S TSU. Signed-off-by: Claudiu Beznea Tested-by: Wolfram Sang Link: https://lore.kernel.org/r/20250810122125.792966-3-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Daniel Lezcano --- MAINTAINERS | 7 + drivers/thermal/renesas/Kconfig | 8 + drivers/thermal/renesas/Makefile | 1 + drivers/thermal/renesas/rzg3s_thermal.c | 272 ++++++++++++++++++++++++ 4 files changed, 288 insertions(+) create mode 100644 drivers/thermal/renesas/rzg3s_thermal.c diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa457..7597e8fbc842c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21541,6 +21541,13 @@ S: Maintained F: Documentation/devicetree/bindings/iio/potentiometer/renesas,x9250.yaml F: drivers/iio/potentiometer/x9250.c +RENESAS RZ/G3S THERMAL SENSOR UNIT DRIVER +M: Claudiu Beznea +L: linux-pm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml +F: drivers/thermal/renesas/rzg3s_thermal.c + RESET CONTROLLER FRAMEWORK M: Philipp Zabel S: Maintained diff --git a/drivers/thermal/renesas/Kconfig b/drivers/thermal/renesas/Kconfig index f4af8c7f28b05d..57ea987e231d77 100644 --- a/drivers/thermal/renesas/Kconfig +++ b/drivers/thermal/renesas/Kconfig @@ -26,3 +26,11 @@ config RZG2L_THERMAL help Enable this to plug the RZ/G2L thermal sensor driver into the Linux thermal framework. + +config RZG3S_THERMAL + tristate "Renesas RZ/G3S thermal driver" + depends on ARCH_R9A08G045 || COMPILE_TEST + depends on OF && IIO && RZG2L_ADC + help + Enable this to plug the RZ/G3S thermal sensor driver into the Linux + thermal framework. 
diff --git a/drivers/thermal/renesas/Makefile b/drivers/thermal/renesas/Makefile index bf9cb3cb94d678..1feb5ab78827e2 100644 --- a/drivers/thermal/renesas/Makefile +++ b/drivers/thermal/renesas/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_RCAR_GEN3_THERMAL) += rcar_gen3_thermal.o obj-$(CONFIG_RCAR_THERMAL) += rcar_thermal.o obj-$(CONFIG_RZG2L_THERMAL) += rzg2l_thermal.o +obj-$(CONFIG_RZG3S_THERMAL) += rzg3s_thermal.o diff --git a/drivers/thermal/renesas/rzg3s_thermal.c b/drivers/thermal/renesas/rzg3s_thermal.c new file mode 100644 index 00000000000000..e25e36c99a8866 --- /dev/null +++ b/drivers/thermal/renesas/rzg3s_thermal.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas RZ/G3S TSU Thermal Sensor Driver + * + * Copyright (C) 2024 Renesas Electronics Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../thermal_hwmon.h" + +#define TSU_SM 0x0 +#define TSU_SM_EN BIT(0) +#define TSU_SM_OE BIT(1) +#define OTPTSUTRIM_REG(n) (0x18 + (n) * 0x4) +#define OTPTSUTRIM_EN_MASK BIT(31) +#define OTPTSUTRIM_MASK GENMASK(11, 0) + +#define TSU_READ_STEPS 8 + +/* Default calibration values, if FUSE values are missing. */ +#define SW_CALIB0_VAL 1297 +#define SW_CALIB1_VAL 751 + +#define MCELSIUS(temp) ((temp) * MILLIDEGREE_PER_DEGREE) + +/** + * struct rzg3s_thermal_priv - RZ/G3S thermal private data structure + * @base: TSU base address + * @dev: device pointer + * @tz: thermal zone pointer + * @rstc: reset control + * @channel: IIO channel to read the TSU + * @mode: current device mode + * @calib0: calibration value + * @calib1: calibration value + */ +struct rzg3s_thermal_priv { + void __iomem *base; + struct device *dev; + struct thermal_zone_device *tz; + struct reset_control *rstc; + struct iio_channel *channel; + enum thermal_device_mode mode; + u16 calib0; + u16 calib1; +}; + +static int rzg3s_thermal_get_temp(struct thermal_zone_device *tz, int *temp) +{ + struct rzg3s_thermal_priv *priv = thermal_zone_device_priv(tz); + int ts_code_ave = 0; + + if (priv->mode != THERMAL_DEVICE_ENABLED) + return -EAGAIN; + + for (u8 i = 0; i < TSU_READ_STEPS; i++) { + int ret, val; + + ret = iio_read_channel_raw(priv->channel, &val); + if (ret < 0) + return ret; + + ts_code_ave += val; + /* + * According to the HW manual (Rev.1.10, section 40.4.4 Procedure for Measuring + * the Temperature) we need to wait here at least 3 us. + */ + usleep_range(5, 10); + } + + ts_code_ave = DIV_ROUND_CLOSEST(MCELSIUS(ts_code_ave), TSU_READ_STEPS); + + /* + * According to the HW manual (Rev.1.10, section 40.4.4 Procedure for Measuring the + * Temperature) the computation formula is as follows: + * + * Tj = (ts_code_ave - priv->calib1) * 165 / (priv->calib0 - priv->calib1) - 40 + * + * Convert everything to millidegrees Celsius before applying the formula to avoid + * losing precision. + */ + + *temp = div_s64((s64)(ts_code_ave - MCELSIUS(priv->calib1)) * MCELSIUS(165), + MCELSIUS(priv->calib0 - priv->calib1)) - MCELSIUS(40); + + /* Report it in millidegrees Celsius and round it up to 0.5 degrees Celsius.
*/ + *temp = roundup(*temp, 500); + + return 0; +} + +static void rzg3s_thermal_set_mode(struct rzg3s_thermal_priv *priv, + enum thermal_device_mode mode) +{ + struct device *dev = priv->dev; + int ret; + + ret = pm_runtime_resume_and_get(dev); + if (ret) + return; + + if (mode == THERMAL_DEVICE_DISABLED) { + writel(0, priv->base + TSU_SM); + } else { + writel(TSU_SM_EN, priv->base + TSU_SM); + /* + * According to the HW manual (Rev.1.10, section 40.4.1 Procedure for + * Starting the TSU) we need to wait here 30us or more. + */ + usleep_range(30, 40); + + writel(TSU_SM_OE | TSU_SM_EN, priv->base + TSU_SM); + /* + * According to the HW manual (Rev.1.10, section 40.4.1 Procedure for + * Starting the TSU) we need to wait here 50us or more. + */ + usleep_range(50, 60); + } + + pm_runtime_put_autosuspend(dev); +} + +static int rzg3s_thermal_change_mode(struct thermal_zone_device *tz, + enum thermal_device_mode mode) +{ + struct rzg3s_thermal_priv *priv = thermal_zone_device_priv(tz); + + if (priv->mode == mode) + return 0; + + rzg3s_thermal_set_mode(priv, mode); + priv->mode = mode; + + return 0; +} + +static const struct thermal_zone_device_ops rzg3s_tz_of_ops = { + .get_temp = rzg3s_thermal_get_temp, + .change_mode = rzg3s_thermal_change_mode, +}; + +static int rzg3s_thermal_read_calib(struct rzg3s_thermal_priv *priv) +{ + struct device *dev = priv->dev; + u32 val; + int ret; + + ret = pm_runtime_resume_and_get(dev); + if (ret) + return ret; + + val = readl(priv->base + OTPTSUTRIM_REG(0)); + if (val & OTPTSUTRIM_EN_MASK) + priv->calib0 = FIELD_GET(OTPTSUTRIM_MASK, val); + else + priv->calib0 = SW_CALIB0_VAL; + + val = readl(priv->base + OTPTSUTRIM_REG(1)); + if (val & OTPTSUTRIM_EN_MASK) + priv->calib1 = FIELD_GET(OTPTSUTRIM_MASK, val); + else + priv->calib1 = SW_CALIB1_VAL; + + pm_runtime_put_autosuspend(dev); + + return 0; +} + +static int rzg3s_thermal_probe(struct platform_device *pdev) +{ + struct rzg3s_thermal_priv *priv; + struct device *dev = &pdev->dev; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + priv->channel = devm_iio_channel_get(dev, "tsu"); + if (IS_ERR(priv->channel)) + return dev_err_probe(dev, PTR_ERR(priv->channel), "Failed to get IIO channel!\n"); + + priv->rstc = devm_reset_control_get_exclusive_deasserted(dev, NULL); + if (IS_ERR(priv->rstc)) + return dev_err_probe(dev, PTR_ERR(priv->rstc), "Failed to get reset!\n"); + + priv->dev = dev; + priv->mode = THERMAL_DEVICE_DISABLED; + platform_set_drvdata(pdev, priv); + + pm_runtime_set_autosuspend_delay(dev, 300); + pm_runtime_use_autosuspend(dev); + ret = devm_pm_runtime_enable(dev); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable runtime PM!\n"); + + ret = rzg3s_thermal_read_calib(priv); + if (ret) + return dev_err_probe(dev, ret, "Failed to read calibration data!\n"); + + priv->tz = devm_thermal_of_zone_register(dev, 0, priv, &rzg3s_tz_of_ops); + if (IS_ERR(priv->tz)) + return dev_err_probe(dev, PTR_ERR(priv->tz), "Failed to register thermal zone!\n"); + + ret = devm_thermal_add_hwmon_sysfs(dev, priv->tz); + if (ret) + return dev_err_probe(dev, ret, "Failed to add hwmon sysfs!\n"); + + return 0; +} + +static int rzg3s_thermal_suspend(struct device *dev) +{ + struct rzg3s_thermal_priv *priv = dev_get_drvdata(dev); + + rzg3s_thermal_set_mode(priv, THERMAL_DEVICE_DISABLED); + + return reset_control_assert(priv->rstc); +} + +static int 
rzg3s_thermal_resume(struct device *dev) +{ + struct rzg3s_thermal_priv *priv = dev_get_drvdata(dev); + int ret; + + ret = reset_control_deassert(priv->rstc); + if (ret) + return ret; + + if (priv->mode != THERMAL_DEVICE_DISABLED) + rzg3s_thermal_set_mode(priv, priv->mode); + + return 0; +} + +static const struct dev_pm_ops rzg3s_thermal_pm_ops = { + SYSTEM_SLEEP_PM_OPS(rzg3s_thermal_suspend, rzg3s_thermal_resume) +}; + +static const struct of_device_id rzg3s_thermal_dt_ids[] = { + { .compatible = "renesas,r9a08g045-tsu" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, rzg3s_thermal_dt_ids); + +static struct platform_driver rzg3s_thermal_driver = { + .driver = { + .name = "rzg3s-thermal", + .of_match_table = rzg3s_thermal_dt_ids, + .pm = pm_ptr(&rzg3s_thermal_pm_ops), + }, + .probe = rzg3s_thermal_probe, +}; +module_platform_driver(rzg3s_thermal_driver); + +MODULE_DESCRIPTION("Renesas RZ/G3S Thermal Sensor Unit Driver"); +MODULE_AUTHOR("Claudiu Beznea "); +MODULE_LICENSE("GPL"); From 6f769708d53ab029f09f78660680b4ad161092f2 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 20 Aug 2025 19:40:47 +0200 Subject: [PATCH 2870/3206] thermal/drivers/rockchip: Unify struct rockchip_tsadc_chip format Unify all chip descriptions to the version without any empty lines. Suggested-by: Heiko Stuebner Signed-off-by: Sebastian Reichel Reviewed-by: Dragan Simic Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250820-thermal-rockchip-grf-warning-v2-1-c7e2d35017b8@kernel.org Signed-off-by: Daniel Lezcano --- drivers/thermal/rockchip_thermal.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c index 3beff9b6fac3ab..7b18a705dfade6 100644 --- a/drivers/thermal/rockchip_thermal.c +++ b/drivers/thermal/rockchip_thermal.c @@ -1098,10 +1098,8 @@ static const struct rockchip_tsadc_chip px30_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* 2 channels for tsadc */ - .tshut_mode = TSHUT_MODE_CRU, /* default TSHUT via CRU */ .tshut_temp = 95000, - .initialize = rk_tsadcv4_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1109,7 +1107,6 @@ static const struct rockchip_tsadc_chip px30_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3328_code_table, .length = ARRAY_SIZE(rk3328_code_table), @@ -1122,11 +1119,9 @@ static const struct rockchip_tsadc_chip rv1108_tsadc_data = { /* cpu */ .chn_offset = 0, .chn_num = 1, /* one channel for tsadc */ - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv2_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1134,7 +1129,6 @@ static const struct rockchip_tsadc_chip rv1108_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rv1108_table, .length = ARRAY_SIZE(rv1108_table), @@ -1147,11 +1141,9 @@ static const struct rockchip_tsadc_chip rk3228_tsadc_data = { /* cpu */ .chn_offset = 0, .chn_num = 1, /* one channel for tsadc */ - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv2_initialize, .irq_ack = rk_tsadcv3_irq_ack, 
.control = rk_tsadcv3_control, @@ -1159,7 +1151,6 @@ static const struct rockchip_tsadc_chip rk3228_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3228_code_table, .length = ARRAY_SIZE(rk3228_code_table), @@ -1172,11 +1163,9 @@ static const struct rockchip_tsadc_chip rk3288_tsadc_data = { /* cpu, gpu */ .chn_offset = 1, .chn_num = 2, /* two channels for tsadc */ - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv2_initialize, .irq_ack = rk_tsadcv2_irq_ack, .control = rk_tsadcv2_control, @@ -1184,7 +1173,6 @@ static const struct rockchip_tsadc_chip rk3288_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3288_code_table, .length = ARRAY_SIZE(rk3288_code_table), @@ -1197,10 +1185,8 @@ static const struct rockchip_tsadc_chip rk3328_tsadc_data = { /* cpu */ .chn_offset = 0, .chn_num = 1, /* one channels for tsadc */ - .tshut_mode = TSHUT_MODE_CRU, /* default TSHUT via CRU */ .tshut_temp = 95000, - .initialize = rk_tsadcv2_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1208,7 +1194,6 @@ static const struct rockchip_tsadc_chip rk3328_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3328_code_table, .length = ARRAY_SIZE(rk3328_code_table), @@ -1221,11 +1206,9 @@ static const struct rockchip_tsadc_chip rk3366_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv3_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1233,7 +1216,6 @@ static const struct rockchip_tsadc_chip rk3366_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3228_code_table, .length = ARRAY_SIZE(rk3228_code_table), @@ -1246,11 +1228,9 @@ static const struct rockchip_tsadc_chip rk3368_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv2_initialize, .irq_ack = rk_tsadcv2_irq_ack, .control = rk_tsadcv2_control, @@ -1258,7 +1238,6 @@ static const struct rockchip_tsadc_chip rk3368_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3368_code_table, .length = ARRAY_SIZE(rk3368_code_table), @@ -1271,11 +1250,9 @@ static const struct rockchip_tsadc_chip rk3399_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv3_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1283,7 +1260,6 @@ static const struct rockchip_tsadc_chip 
rk3399_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3399_code_table, .length = ARRAY_SIZE(rk3399_code_table), @@ -1296,11 +1272,9 @@ static const struct rockchip_tsadc_chip rk3568_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv7_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1308,7 +1282,6 @@ static const struct rockchip_tsadc_chip rk3568_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3568_code_table, .length = ARRAY_SIZE(rk3568_code_table), From c268a9d8c18d594418dcf16762bfe9caf6e1bbb6 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 20 Aug 2025 19:40:48 +0200 Subject: [PATCH 2871/3206] thermal/drivers/rockchip: Shut up GRF warning Most of the recent Rockchip devices do not have a GRF associated with the tsadc IP. Let's avoid printing a warning on those devices. Signed-off-by: Sebastian Reichel Reviewed-by: Dragan Simic Tested-by: Diederik de Haas Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250820-thermal-rockchip-grf-warning-v2-2-c7e2d35017b8@kernel.org Signed-off-by: Daniel Lezcano --- drivers/thermal/rockchip_thermal.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c index 7b18a705dfade6..c49ddf70f86e7b 100644 --- a/drivers/thermal/rockchip_thermal.c +++ b/drivers/thermal/rockchip_thermal.c @@ -74,6 +74,7 @@ struct chip_tsadc_table { * @tshut_temp: the hardware-controlled shutdown temperature value, with no trim * @tshut_mode: the hardware-controlled shutdown mode (0:CRU 1:GPIO) * @tshut_polarity: the hardware-controlled active polarity (0:LOW 1:HIGH) + * @grf_required: true, if a GRF is required for proper functionality * @initialize: SoC special initialize tsadc controller method * @irq_ack: clear the interrupt * @control: enable/disable method for the tsadc controller @@ -97,6 +98,9 @@ struct rockchip_tsadc_chip { enum tshut_mode tshut_mode; enum tshut_polarity tshut_polarity; + /* GRF availability */ + bool grf_required; + /* Chip-wide methods */ void (*initialize)(struct regmap *grf, void __iomem *reg, enum tshut_polarity p); @@ -1098,6 +1102,7 @@ static const struct rockchip_tsadc_chip px30_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* 2 channels for tsadc */ + .grf_required = true, .tshut_mode = TSHUT_MODE_CRU, /* default TSHUT via CRU */ .tshut_temp = 95000, .initialize = rk_tsadcv4_initialize, @@ -1119,6 +1124,7 @@ static const struct rockchip_tsadc_chip rv1108_tsadc_data = { /* cpu */ .chn_offset = 0, .chn_num = 1, /* one channel for tsadc */ + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, @@ -1141,6 +1147,7 @@ static const struct rockchip_tsadc_chip rk3228_tsadc_data = { /* cpu */ .chn_offset = 0, .chn_num = 1, /* one channel for tsadc */ + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp 
= 95000, @@ -1163,6 +1170,7 @@ static const struct rockchip_tsadc_chip rk3288_tsadc_data = { /* cpu, gpu */ .chn_offset = 1, .chn_num = 2, /* two channels for tsadc */ + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, @@ -1185,6 +1193,7 @@ static const struct rockchip_tsadc_chip rk3328_tsadc_data = { /* cpu */ .chn_offset = 0, .chn_num = 1, /* one channels for tsadc */ + .grf_required = false, .tshut_mode = TSHUT_MODE_CRU, /* default TSHUT via CRU */ .tshut_temp = 95000, .initialize = rk_tsadcv2_initialize, @@ -1206,6 +1215,7 @@ static const struct rockchip_tsadc_chip rk3366_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ + .grf_required = true, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, @@ -1228,6 +1238,7 @@ static const struct rockchip_tsadc_chip rk3368_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, @@ -1250,6 +1261,7 @@ static const struct rockchip_tsadc_chip rk3399_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ + .grf_required = true, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, @@ -1272,6 +1284,7 @@ static const struct rockchip_tsadc_chip rk3568_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ + .grf_required = true, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, @@ -1294,6 +1307,7 @@ static const struct rockchip_tsadc_chip rk3576_tsadc_data = { /* top, big_core, little_core, ddr, npu, gpu */ .chn_offset = 0, .chn_num = 6, /* six channels for tsadc */ + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, @@ -1318,6 +1332,7 @@ static const struct rockchip_tsadc_chip rk3588_tsadc_data = { /* top, big_core0, big_core1, little_core, center, gpu, npu */ .chn_offset = 0, .chn_num = 7, /* seven channels for tsadc */ + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, @@ -1594,12 +1609,10 @@ static int rockchip_configure_from_dt(struct device *dev, return -EINVAL; } - /* The tsadc wont to handle the error in here since some SoCs didn't - * need this property. 
- */ thermal->grf = syscon_regmap_lookup_by_phandle(np, "rockchip,grf"); - if (IS_ERR(thermal->grf)) - dev_warn(dev, "Missing rockchip,grf property\n"); + if (IS_ERR(thermal->grf) && thermal->chip->grf_required) + return dev_err_probe(dev, PTR_ERR(thermal->grf), + "Missing rockchip,grf property\n"); rockchip_get_trim_configuration(dev, np, thermal); From e881662aa06af65021c2fa255dd7530235a3d195 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 20 Aug 2025 19:40:49 +0200 Subject: [PATCH 2872/3206] dt-bindings: thermal: rockchip: Tighten grf requirements Instead of having an optional rockchip,grf property everywhere, forbid it on platforms that have no thermal-related registers in a GRF, and make it mandatory on the platforms that actually need it. Signed-off-by: Sebastian Reichel Reviewed-by: Dragan Simic Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250820-thermal-rockchip-grf-warning-v2-3-c7e2d35017b8@kernel.org Signed-off-by: Daniel Lezcano --- .../bindings/thermal/rockchip-thermal.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/devicetree/bindings/thermal/rockchip-thermal.yaml b/Documentation/devicetree/bindings/thermal/rockchip-thermal.yaml index 573f447cc26ed7..9fa5c4c49d76e3 100644 --- a/Documentation/devicetree/bindings/thermal/rockchip-thermal.yaml +++ b/Documentation/devicetree/bindings/thermal/rockchip-thermal.yaml @@ -119,6 +119,21 @@ required: - resets allOf: + - if: + properties: + compatible: + contains: + enum: + - rockchip,px30-tsadc + - rockchip,rk3366-tsadc + - rockchip,rk3399-tsadc + - rockchip,rk3568-tsadc + then: + required: + - rockchip,grf + else: + properties: + rockchip,grf: false - if: not: properties: From 3762f5851ac5a65dcccadf73dbe853b1b346f561 Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Wed, 3 Sep 2025 19:27:49 +0300 Subject: [PATCH 2873/3206] thermal/drivers/thermal-generic-adc: Add temperature sensor channel To avoid duplicating sensor functionality and conversion tables, this design allows converting an ADC IIO channel's output directly into a temperature IIO channel. This is particularly useful for devices where hwmon isn't suitable or where temperature data must be accessible through IIO. One such device is, for example, the MAX17040 fuel gauge.
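For illustration, a minimal consumer-side sketch of how a client driver could read the converted temperature once such a channel is registered (the device and channel name are hypothetical, not part of this patch; the IIO consumer API calls are the existing kernel interfaces):

        #include <linux/iio/consumer.h>

        static int example_read_temp(struct device *dev)
        {
                struct iio_channel *chan;
                int temp, ret;

                /* "temp" is an illustrative io-channel-names entry */
                chan = devm_iio_channel_get(dev, "temp");
                if (IS_ERR(chan))
                        return PTR_ERR(chan);

                /* processed value, i.e. millidegrees Celsius */
                ret = iio_read_channel_processed(chan, &temp);
                if (ret < 0)
                        return ret;

                dev_info(dev, "temperature: %d\n", temp);
                return 0;
        }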
Signed-off-by: Svyatoslav Ryhel Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20250903162749.109910-2-clamor95@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal-generic-adc.c | 55 ++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/thermal-generic-adc.c b/drivers/thermal/thermal-generic-adc.c index ee3d0aa31406cd..7c844589b153b5 100644 --- a/drivers/thermal/thermal-generic-adc.c +++ b/drivers/thermal/thermal-generic-adc.c @@ -7,6 +7,7 @@ * Author: Laxman Dewangan */ #include +#include #include #include #include @@ -73,6 +74,58 @@ static const struct thermal_zone_device_ops gadc_thermal_ops = { .get_temp = gadc_thermal_get_temp, }; +static const struct iio_chan_spec gadc_thermal_iio_channels[] = { + { + .type = IIO_TEMP, + .info_mask_separate = BIT(IIO_CHAN_INFO_PROCESSED), + } +}; + +static int gadc_thermal_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + struct gadc_thermal_info *gtinfo = iio_priv(indio_dev); + int ret; + + switch (mask) { + case IIO_CHAN_INFO_PROCESSED: + ret = gadc_thermal_get_temp(gtinfo->tz_dev, val); + if (ret) + return ret; + + return IIO_VAL_INT; + + default: + return -EINVAL; + } +} + +static const struct iio_info gadc_thermal_iio_info = { + .read_raw = gadc_thermal_read_raw, +}; + +static int gadc_iio_register(struct device *dev, struct gadc_thermal_info *gti) +{ + struct gadc_thermal_info *gtinfo; + struct iio_dev *indio_dev; + + indio_dev = devm_iio_device_alloc(dev, sizeof(*gtinfo)); + if (!indio_dev) + return -ENOMEM; + + gtinfo = iio_priv(indio_dev); + memcpy(gtinfo, gti, sizeof(*gtinfo)); + + indio_dev->name = dev_name(dev); + indio_dev->info = &gadc_thermal_iio_info; + indio_dev->modes = INDIO_DIRECT_MODE; + indio_dev->channels = gadc_thermal_iio_channels; + indio_dev->num_channels = ARRAY_SIZE(gadc_thermal_iio_channels); + + return devm_iio_device_register(dev, indio_dev); +} + static int gadc_thermal_read_linear_lookup_table(struct device *dev, struct gadc_thermal_info *gti) { @@ -153,7 +206,7 @@ static int gadc_thermal_probe(struct platform_device *pdev) devm_thermal_add_hwmon_sysfs(dev, gti->tz_dev); - return 0; + return gadc_iio_register(&pdev->dev, gti); } static const struct of_device_id of_adc_thermal_match[] = { From caf41eb4575ddac61dd0f6d4a6108bb7ab9fc408 Mon Sep 17 00:00:00 2001 From: John Madieu Date: Wed, 17 Sep 2025 19:01:55 +0200 Subject: [PATCH 2874/3206] dt-bindings: thermal: r9a09g047-tsu: Document the TSU unit The Renesas RZ/G3E SoC includes a Thermal Sensor Unit (TSU) block designed to measure the junction temperature. The device provides real-time temperature measurements for thermal management, utilizing a single dedicated channel (channel 1) for temperature sensing. 
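On the driver side, the trim lookup this binding implies could look like the following sketch (the syscon helper is the existing kernel API; the surrounding variables are illustrative):

        unsigned int args[1], trim0, trim1;
        struct regmap *sys;

        /* one phandle argument: the offset of the first trim register */
        sys = syscon_regmap_lookup_by_phandle_args(np, "renesas,tsu-trim",
                                                   1, args);
        if (IS_ERR(sys))
                return PTR_ERR(sys);

        regmap_read(sys, args[0], &trim0);     /* OTPTSU1TRMVAL0 */
        regmap_read(sys, args[0] + 4, &trim1); /* OTPTSU1TRMVAL1 */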
Reviewed-by: Rob Herring (Arm) Signed-off-by: John Madieu Link: https://lore.kernel.org/r/20250917170202.197929-2-john.madieu.xa@bp.renesas.com Signed-off-by: Daniel Lezcano --- .../thermal/renesas,r9a09g047-tsu.yaml | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml diff --git a/Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml b/Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml new file mode 100644 index 00000000000000..8d3f3c24f0f270 --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/thermal/renesas,r9a09g047-tsu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas RZ/G3E Temperature Sensor Unit (TSU) + +maintainers: + - John Madieu + +description: + The Temperature Sensor Unit (TSU) is an integrated thermal sensor that + monitors the chip temperature on the Renesas RZ/G3E SoC. The TSU provides + real-time temperature measurements for thermal management. + +properties: + compatible: + const: renesas,r9a09g047-tsu + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + resets: + maxItems: 1 + + power-domains: + maxItems: 1 + + interrupts: + items: + - description: Conversion complete interrupt signal (pulse) + - description: Comparison result interrupt signal (level) + + interrupt-names: + items: + - const: adi + - const: adcmpi + + "#thermal-sensor-cells": + const: 0 + + renesas,tsu-trim: + $ref: /schemas/types.yaml#/definitions/phandle-array + items: + - items: + - description: phandle to system controller + - description: offset of trim registers + description: + Phandle and offset to the system controller containing the TSU + calibration trim values. The offset points to the first trim register + (OTPTSU1TRMVAL0), with the second trim register (OTPTSU1TRMVAL1) located + at offset + 4. + +required: + - compatible + - reg + - clocks + - resets + - power-domains + - interrupts + - interrupt-names + - "#thermal-sensor-cells" + - renesas,tsu-trim + +additionalProperties: false + +examples: + - | + #include + #include + + thermal-sensor@14002000 { + compatible = "renesas,r9a09g047-tsu"; + reg = <0x14002000 0x1000>; + clocks = <&cpg CPG_MOD 0x10a>; + resets = <&cpg 0xf8>; + power-domains = <&cpg>; + interrupts = , + ; + interrupt-names = "adi", "adcmpi"; + #thermal-sensor-cells = <0>; + renesas,tsu-trim = <&sys 0x330>; + }; From 19d3a401a617c68e9487f55b9f2efe213f8f949d Mon Sep 17 00:00:00 2001 From: John Madieu Date: Wed, 17 Sep 2025 19:01:56 +0200 Subject: [PATCH 2875/3206] thermal/drivers/renesas/rzg3e: Add thermal driver for the Renesas RZ/G3E SoC The RZ/G3E SoC integrates a Temperature Sensor Unit (TSU) block designed to monitor the chip's junction temperature. This sensor is connected to channel 1 of the APB port clock/reset and provides temperature measurements. It also requires calibration values stored in the system controller registers for accurate temperature measurement. Add a driver for the Renesas RZ/G3E TSU. 
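For orientation, a hedged sketch of the usual shape of such a read path in the thermal framework; every identifier below is hypothetical and only illustrates the approach, not the actual rzg3e_thermal.c code:

        static int rzg3e_thermal_get_temp(struct thermal_zone_device *tz,
                                          int *temp)
        {
                struct rzg3e_thermal_priv *priv = thermal_zone_device_priv(tz);
                u32 code;

                /* hypothetical offset of the raw ADC code register */
                code = readl(priv->base + 0x20);

                /* apply the OTP trim values, scale to millidegrees Celsius */
                *temp = rzg3e_code_to_mcelsius(priv, code);

                return 0;
        }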
[ dlezcano: Fixed conflict with "renesas: Add support for RZ/G3S" ] Signed-off-by: John Madieu Reviewed-by: Biju Das Link: https://lore.kernel.org/r/20250917170202.197929-3-john.madieu.xa@bp.renesas.com Signed-off-by: Daniel Lezcano --- MAINTAINERS | 7 +++++++ drivers/thermal/renesas/Kconfig | 7 +++++++ drivers/thermal/renesas/Makefile | 2 ++ 3 files changed, 16 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7597e8fbc842c6..5797c5efd300ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21548,6 +21548,13 @@ S: Maintained F: Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml F: drivers/thermal/renesas/rzg3s_thermal.c +RENESAS RZ/G3E THERMAL SENSOR UNIT DRIVER +M: John Madieu +L: linux-pm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml +F: drivers/thermal/renesas/rzg3e_thermal.c + RESET CONTROLLER FRAMEWORK M: Philipp Zabel S: Maintained diff --git a/drivers/thermal/renesas/Kconfig b/drivers/thermal/renesas/Kconfig index 57ea987e231d77..c762c1c30d5a21 100644 --- a/drivers/thermal/renesas/Kconfig +++ b/drivers/thermal/renesas/Kconfig @@ -34,3 +34,10 @@ config RZG3S_THERMAL help Enable this to plug the RZ/G3S thermal sensor driver into the Linux thermal framework. + +config RZG3E_THERMAL + tristate "Renesas RZ/G3E thermal driver" + depends on ARCH_RENESAS || COMPILE_TEST + help + Enable this to plug the RZ/G3E thermal sensor driver into the Linux + thermal framework. diff --git a/drivers/thermal/renesas/Makefile b/drivers/thermal/renesas/Makefile index 1feb5ab78827e2..0ea59224757226 100644 --- a/drivers/thermal/renesas/Makefile +++ b/drivers/thermal/renesas/Makefile @@ -3,4 +3,6 @@ obj-$(CONFIG_RCAR_GEN3_THERMAL) += rcar_gen3_thermal.o obj-$(CONFIG_RCAR_THERMAL) += rcar_thermal.o obj-$(CONFIG_RZG2L_THERMAL) += rzg2l_thermal.o +obj-$(CONFIG_RZG3E_THERMAL) += rzg3e_thermal.o obj-$(CONFIG_RZG3S_THERMAL) += rzg3s_thermal.o + From 1a2b423be6a89dd07d5fc27ea042be68697a6a49 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 14 Sep 2025 22:24:13 +0200 Subject: [PATCH 2876/3206] i2c: boardinfo: Annotate code used in init phase only Annotate two places in boardinfo code: - __i2c_first_dynamic_bus_num is set in init phase. Annotate it as __ro_after_init to prevent later changes. - i2c_register_board_info() is used in init phase only, so annotate it as __init, allowing to free the memory after init phase. This is safe, see comment: "done in board-specific init code near arch_initcall() time" Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-boardinfo.c | 4 ++-- include/linux/i2c.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/i2c-boardinfo.c b/drivers/i2c/i2c-boardinfo.c index 4df8ad092df383..338800321f8b66 100644 --- a/drivers/i2c/i2c-boardinfo.c +++ b/drivers/i2c/i2c-boardinfo.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL_GPL(__i2c_board_lock); LIST_HEAD(__i2c_board_list); EXPORT_SYMBOL_GPL(__i2c_board_list); -int __i2c_first_dynamic_bus_num; +int __i2c_first_dynamic_bus_num __ro_after_init; EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); @@ -48,7 +48,7 @@ EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); * The board info passed can safely be __initdata, but be careful of embedded * pointers (for platform_data, functions, etc) since that won't be copied. 
*/ -int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) +int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) { int status; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 20fd41b51d5c85..11a19241e360be 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -499,7 +499,7 @@ static inline struct i2c_client *i2c_verify_client(struct device *dev) * Modules for add-on boards must use other calls. */ #ifdef CONFIG_I2C_BOARDINFO -int +int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n); #else From b492183652808e0f389272bf63dc836241b287ff Mon Sep 17 00:00:00 2001 From: "Leilk.Liu" Date: Sat, 6 Sep 2025 16:24:06 +0800 Subject: [PATCH 2877/3206] i2c: mediatek: fix potential incorrect use of I2C_MASTER_WRRD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old IC does not support the I2C_MASTER_WRRD (write-then-read) function, but the current code’s handling of i2c->auto_restart may potentially lead to entering the I2C_MASTER_WRRD software flow, resulting in unexpected bugs. Instead of repurposing the auto_restart flag, add a separate flag to signal I2C_MASTER_WRRD operations. Also fix handling of msgs. If the operation (i2c->op) is I2C_MASTER_WRRD, then the msgs pointer is incremented by 2. For all other operations, msgs is simply incremented by 1. Fixes: b2ed11e224a2 ("I2C: mediatek: Add driver for MediaTek MT8173 I2C controller") Signed-off-by: Leilk.Liu Suggested-by: Chen-Yu Tsai Reviewed-by: Chen-Yu Tsai Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mt65xx.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index ab456c3717db18..dee40704825cb4 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -1243,6 +1243,7 @@ static int mtk_i2c_transfer(struct i2c_adapter *adap, { int ret; int left_num = num; + bool write_then_read_en = false; struct mtk_i2c *i2c = i2c_get_adapdata(adap); ret = clk_bulk_enable(I2C_MT65XX_CLK_MAX, i2c->clocks); @@ -1256,6 +1257,7 @@ static int mtk_i2c_transfer(struct i2c_adapter *adap, if (!(msgs[0].flags & I2C_M_RD) && (msgs[1].flags & I2C_M_RD) && msgs[0].addr == msgs[1].addr) { i2c->auto_restart = 0; + write_then_read_en = true; } } @@ -1280,12 +1282,10 @@ static int mtk_i2c_transfer(struct i2c_adapter *adap, else i2c->op = I2C_MASTER_WR; - if (!i2c->auto_restart) { - if (num > 1) { - /* combined two messages into one transaction */ - i2c->op = I2C_MASTER_WRRD; - left_num--; - } + if (write_then_read_en) { + /* combined two messages into one transaction */ + i2c->op = I2C_MASTER_WRRD; + left_num--; } /* always use DMA mode. */ @@ -1293,7 +1293,10 @@ static int mtk_i2c_transfer(struct i2c_adapter *adap, if (ret < 0) goto err_exit; - msgs++; + if (i2c->op == I2C_MASTER_WRRD) + msgs += 2; + else + msgs++; } /* the return value is number of executed messages */ ret = num; From 41d6f90ef5dc2841bdd09817c63a3d6188473b9b Mon Sep 17 00:00:00 2001 From: Troy Mitchell Date: Thu, 25 Sep 2025 10:02:25 +0800 Subject: [PATCH 2878/3206] i2c: spacemit: ensure bus release check runs when wait_bus_idle() fails spacemit_i2c_wait_bus_idle() only returns 0 on success or a negative error code on failure. Since 'ret' can never be positive, the final 'else' branch was unreachable, and spacemit_i2c_check_bus_release() was never called. 
This commit guarantees we attempt to release the bus whenever waiting for an idle bus fails. Fixes: 5ea558473fa31 ("i2c: spacemit: add support for SpacemiT K1 SoC") Reviewed-by: Aurelien Jarno Signed-off-by: Troy Mitchell Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-k1.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-k1.c b/drivers/i2c/busses/i2c-k1.c index b68a21fff0b56b..ee08811f4087c8 100644 --- a/drivers/i2c/busses/i2c-k1.c +++ b/drivers/i2c/busses/i2c-k1.c @@ -476,12 +476,13 @@ static int spacemit_i2c_xfer(struct i2c_adapter *adapt, struct i2c_msg *msgs, in spacemit_i2c_enable(i2c); ret = spacemit_i2c_wait_bus_idle(i2c); - if (!ret) + if (!ret) { ret = spacemit_i2c_xfer_msg(i2c); - else if (ret < 0) - dev_dbg(i2c->dev, "i2c transfer error: %d\n", ret); - else + if (ret < 0) + dev_dbg(i2c->dev, "i2c transfer error: %d\n", ret); + } else { spacemit_i2c_check_bus_release(i2c); + } spacemit_i2c_disable(i2c); From 445522fe7aad6131b2747ae8c76f77266054cd84 Mon Sep 17 00:00:00 2001 From: Troy Mitchell Date: Thu, 25 Sep 2025 10:02:26 +0800 Subject: [PATCH 2879/3206] i2c: spacemit: remove stop function to avoid bus error Previously, STOP handling was split into two separate steps: 1) clear TB/STOP/START/ACK bits 2) issue STOP by calling spacemit_i2c_stop() This left a small window where the control register was updated twice, which can confuse the controller. While this race has not been observed with interrupt-driven transfers, it reliably causes bus errors in PIO mode. Inline the STOP sequence into the IRQ handler and ensure that control register bits are updated atomically in a single writel(). Fixes: 5ea558473fa31 ("i2c: spacemit: add support for SpacemiT K1 SoC") Signed-off-by: Troy Mitchell Reviewed-by: Aurelien Jarno Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-k1.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/drivers/i2c/busses/i2c-k1.c b/drivers/i2c/busses/i2c-k1.c index ee08811f4087c8..84f132d0504dc9 100644 --- a/drivers/i2c/busses/i2c-k1.c +++ b/drivers/i2c/busses/i2c-k1.c @@ -267,19 +267,6 @@ static void spacemit_i2c_start(struct spacemit_i2c_dev *i2c) writel(val, i2c->base + SPACEMIT_ICR); } -static void spacemit_i2c_stop(struct spacemit_i2c_dev *i2c) -{ - u32 val; - - val = readl(i2c->base + SPACEMIT_ICR); - val |= SPACEMIT_CR_STOP | SPACEMIT_CR_ALDIE | SPACEMIT_CR_TB; - - if (i2c->read) - val |= SPACEMIT_CR_ACKNAK; - - writel(val, i2c->base + SPACEMIT_ICR); -} - static int spacemit_i2c_xfer_msg(struct spacemit_i2c_dev *i2c) { unsigned long time_left; @@ -412,7 +399,6 @@ static irqreturn_t spacemit_i2c_irq_handler(int irq, void *devid) val = readl(i2c->base + SPACEMIT_ICR); val &= ~(SPACEMIT_CR_TB | SPACEMIT_CR_ACKNAK | SPACEMIT_CR_STOP | SPACEMIT_CR_START); - writel(val, i2c->base + SPACEMIT_ICR); switch (i2c->state) { case SPACEMIT_STATE_START: @@ -429,14 +415,16 @@ static irqreturn_t spacemit_i2c_irq_handler(int irq, void *devid) } if (i2c->state != SPACEMIT_STATE_IDLE) { + val |= SPACEMIT_CR_TB | SPACEMIT_CR_ALDIE; + if (spacemit_i2c_is_last_msg(i2c)) { /* trigger next byte with stop */ - spacemit_i2c_stop(i2c); - } else { - /* trigger next byte */ - val |= SPACEMIT_CR_ALDIE | SPACEMIT_CR_TB; - writel(val, i2c->base + SPACEMIT_ICR); + val |= SPACEMIT_CR_STOP; + + if (i2c->read) + val |= SPACEMIT_CR_ACKNAK; } + writel(val, i2c->base + SPACEMIT_ICR); } err_out: From bc8712f2b5250825968e6b0c3d2709a4b9d5d570 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 25 
Sep 2025 10:00:12 -0700 Subject: [PATCH 2880/3206] bpf: Emit struct bpf_xdp_sock type in vmlinux BTF Similar to other BPF UAPI struct, force emit BTF of struct bpf_xdp_sock so that it is defined in vmlinux.h. In a later patch, a selftest will use vmlinux.h to get the definition of struct bpf_xdp_sock instead of bpf.h. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250925170013.1752561-1-ameryhung@gmail.com --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index b20d59bb19b8c1..2af0a5f1d7489c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7439,6 +7439,8 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_sock, FIELD)); \ } while (0) + BTF_TYPE_EMIT(struct bpf_xdp_sock); + switch (si->off) { case offsetof(struct bpf_xdp_sock, queue_id): BPF_XDP_SOCK_GET(queue_id); From 11f40684ccd84e792eced110f0a5d3d6adbdf90d Mon Sep 17 00:00:00 2001 From: Troy Mitchell Date: Thu, 25 Sep 2025 10:02:27 +0800 Subject: [PATCH 2881/3206] i2c: spacemit: disable SDA glitch fix to avoid restart delay The K1 I2C controller has an SDA glitch fix that introduces a small delay on restart signals. While this feature can suppress glitches on SDA when SCL = 0, it also delays the restart signal, which may cause unexpected behavior in some transfers. The glitch itself does not affect normal I2C operation, because the I2C specification allows SDA to change while SCL is low. To ensure correct transmission for every message, we disable the SDA glitch fix by setting the RCR.SDA_GLITCH_NOFIX bit during initialization. This guarantees that restarts are issued promptly without unintended delays. Fixes: 5ea558473fa31 ("i2c: spacemit: add support for SpacemiT K1 SoC") Reviewed-by: Aurelien Jarno Signed-off-by: Troy Mitchell Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-k1.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/i2c/busses/i2c-k1.c b/drivers/i2c/busses/i2c-k1.c index 84f132d0504dc9..9bf9f01aa68bde 100644 --- a/drivers/i2c/busses/i2c-k1.c +++ b/drivers/i2c/busses/i2c-k1.c @@ -14,6 +14,7 @@ #define SPACEMIT_ICR 0x0 /* Control register */ #define SPACEMIT_ISR 0x4 /* Status register */ #define SPACEMIT_IDBR 0xc /* Data buffer register */ +#define SPACEMIT_IRCR 0x18 /* Reset cycle counter */ #define SPACEMIT_IBMR 0x1c /* Bus monitor register */ /* SPACEMIT_ICR register fields */ @@ -76,6 +77,8 @@ SPACEMIT_SR_GCAD | SPACEMIT_SR_IRF | SPACEMIT_SR_ITE | \ SPACEMIT_SR_ALD) +#define SPACEMIT_RCR_SDA_GLITCH_NOFIX BIT(7) /* bypass the SDA glitch fix */ + /* SPACEMIT_IBMR register fields */ #define SPACEMIT_BMR_SDA BIT(0) /* SDA line level */ #define SPACEMIT_BMR_SCL BIT(1) /* SCL line level */ @@ -237,6 +240,14 @@ static void spacemit_i2c_init(struct spacemit_i2c_dev *i2c) val |= SPACEMIT_CR_MSDE | SPACEMIT_CR_MSDIE; writel(val, i2c->base + SPACEMIT_ICR); + + /* + * The glitch fix in the K1 I2C controller introduces a delay + * on restart signals, so we disable the fix here. 
+ */ + val = readl(i2c->base + SPACEMIT_IRCR); + val |= SPACEMIT_RCR_SDA_GLITCH_NOFIX; + writel(val, i2c->base + SPACEMIT_IRCR); } static inline void From db7720ef50e0103be70a3887bc66e9c909933ad9 Mon Sep 17 00:00:00 2001 From: Troy Mitchell Date: Thu, 25 Sep 2025 10:02:28 +0800 Subject: [PATCH 2882/3206] i2c: spacemit: check SDA instead of SCL after bus reset After calling spacemit_i2c_conditionally_reset_bus(), the controller should ensure that the SDA line is released before proceeding. Previously, the driver checked the SCL line instead, which does not guarantee that the bus is truly idle. This patch changes the check to verify SDA. This ensures proper bus recovery and avoids potential communication errors after a conditional reset. Fixes: 5ea558473fa31 ("i2c: spacemit: add support for SpacemiT K1 SoC") Reviewed-by: Aurelien Jarno Signed-off-by: Troy Mitchell Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-k1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-k1.c b/drivers/i2c/busses/i2c-k1.c index 9bf9f01aa68bde..848dfaf634f630 100644 --- a/drivers/i2c/busses/i2c-k1.c +++ b/drivers/i2c/busses/i2c-k1.c @@ -172,9 +172,9 @@ static void spacemit_i2c_conditionally_reset_bus(struct spacemit_i2c_dev *i2c) spacemit_i2c_reset(i2c); usleep_range(10, 20); - /* check scl status again */ + /* check sda again here */ status = readl(i2c->base + SPACEMIT_IBMR); - if (!(status & SPACEMIT_BMR_SCL)) + if (!(status & SPACEMIT_BMR_SDA)) dev_warn_ratelimited(i2c->dev, "unit reset failed\n") } From 0de61943244dec418d396633a587adca1c350b55 Mon Sep 17 00:00:00 2001 From: Troy Mitchell Date: Thu, 25 Sep 2025 10:02:29 +0800 Subject: [PATCH 2883/3206] i2c: spacemit: ensure SDA is released after bus reset After performing a conditional bus reset, the controller must ensure that the SDA line is actually released. Previously, the reset routine only performed a single check, which could leave the bus in a locked state in some situations. This patch introduces a loop that toggles the reset cycle and issues a reset request up to SPACEMIT_BUS_RESET_CLK_CNT_MAX times, checking SDA after each attempt. If SDA is released before the maximum count, the function returns early. Otherwise, a warning is emitted. This change improves bus recovery reliability.
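For context, nine SCL cycles is the classic I2C bus-recovery sequence: a target stuck mid-byte releases SDA after at most nine clocks. Adapters whose lines are accessible can get equivalent behaviour from the core's generic helper; a minimal sketch, assuming get_scl/set_scl callbacks are provided (omitted here):

        static struct i2c_bus_recovery_info example_rinfo = {
                /* pulses SCL up to nine times until SDA is released */
                .recover_bus = i2c_generic_scl_recovery,
        };

        adap->bus_recovery_info = &example_rinfo; /* before i2c_add_adapter() */

        i2c_recover_bus(adap); /* when SDA is found stuck low */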
Fixes: 5ea558473fa31 ("i2c: spacemit: add support for SpacemiT K1 SoC") Signed-off-by: Troy Mitchell Reviewed-by: Aurelien Jarno Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-k1.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-k1.c b/drivers/i2c/busses/i2c-k1.c index 848dfaf634f630..6b918770e612e0 100644 --- a/drivers/i2c/busses/i2c-k1.c +++ b/drivers/i2c/busses/i2c-k1.c @@ -3,6 +3,7 @@ * Copyright (C) 2024-2025 Troy Mitchell */ +#include #include #include #include @@ -26,7 +27,8 @@ #define SPACEMIT_CR_MODE_FAST BIT(8) /* bus mode (master operation) */ /* Bit 9 is reserved */ #define SPACEMIT_CR_UR BIT(10) /* unit reset */ -/* Bits 11-12 are reserved */ +#define SPACEMIT_CR_RSTREQ BIT(11) /* i2c bus reset request */ +/* Bit 12 is reserved */ #define SPACEMIT_CR_SCLE BIT(13) /* master clock enable */ #define SPACEMIT_CR_IUE BIT(14) /* unit enable */ /* Bits 15-17 are reserved */ @@ -78,6 +80,8 @@ SPACEMIT_SR_ALD) #define SPACEMIT_RCR_SDA_GLITCH_NOFIX BIT(7) /* bypass the SDA glitch fix */ +/* the cycles of SCL during bus reset */ +#define SPACEMIT_RCR_FIELD_RST_CYC GENMASK(3, 0) /* SPACEMIT_IBMR register fields */ #define SPACEMIT_BMR_SDA BIT(0) /* SDA line level */ @@ -91,6 +95,8 @@ #define SPACEMIT_SR_ERR (SPACEMIT_SR_BED | SPACEMIT_SR_RXOV | SPACEMIT_SR_ALD) +#define SPACEMIT_BUS_RESET_CLK_CNT_MAX 9 + enum spacemit_i2c_state { SPACEMIT_STATE_IDLE, SPACEMIT_STATE_START, @@ -163,6 +169,7 @@ static int spacemit_i2c_handle_err(struct spacemit_i2c_dev *i2c) static void spacemit_i2c_conditionally_reset_bus(struct spacemit_i2c_dev *i2c) { u32 status; + u8 clk_cnt; /* if bus is locked, reset unit. 0: locked */ status = readl(i2c->base + SPACEMIT_IBMR); @@ -172,6 +179,18 @@ static void spacemit_i2c_conditionally_reset_bus(struct spacemit_i2c_dev *i2c) spacemit_i2c_reset(i2c); usleep_range(10, 20); + for (clk_cnt = 0; clk_cnt < SPACEMIT_BUS_RESET_CLK_CNT_MAX; clk_cnt++) { + status = readl(i2c->base + SPACEMIT_IBMR); + if (status & SPACEMIT_BMR_SDA) + return; + + /* There's nothing left to save here, we are about to exit */ + writel(FIELD_PREP(SPACEMIT_RCR_FIELD_RST_CYC, 1), + i2c->base + SPACEMIT_IRCR); + writel(SPACEMIT_CR_RSTREQ, i2c->base + SPACEMIT_ICR); + usleep_range(20, 30); + } + /* check sda again here */ status = readl(i2c->base + SPACEMIT_IBMR); if (!(status & SPACEMIT_BMR_SDA)) From 437e6c3e3175e40c07987be87eea1cf9d7b8f30f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Monin?= Date: Thu, 4 Sep 2025 16:31:06 +0200 Subject: [PATCH 2884/3206] i2c: designware: convert to dev_err_probe() on request IRQ error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify the error handling of devm_request_irq() in i2c_dw_probe_master() and i2c_dw_probe_slave() by converting to: return dev_err_probe(); instead of calling: dev_err(); return ret; This also handles deferred probe errors without spamming the log.
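For reference, the helper's contract is what makes it a drop-in replacement (a sketch of the semantics, simplified from the driver core):

        /*
         * dev_err_probe() always returns err, so callers can write
         * 'return dev_err_probe(...);'. For -EPROBE_DEFER it logs at
         * debug level only and records the reason (visible in debugfs
         * in devices_deferred); any other error is logged at error level.
         */
        int dev_err_probe(const struct device *dev, int err, const char *fmt, ...);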
Signed-off-by: Benoît Monin Reviewed-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-master.c | 9 ++++----- drivers/i2c/busses/i2c-designware-slave.c | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index cbd88ffa561010..c7a72c28786c2b 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -1068,11 +1068,10 @@ int i2c_dw_probe_master(struct dw_i2c_dev *dev) if (!(dev->flags & ACCESS_POLLING)) { ret = devm_request_irq(dev->dev, dev->irq, i2c_dw_isr, irq_flags, dev_name(dev->dev), dev); - if (ret) { - dev_err(dev->dev, "failure requesting irq %i: %d\n", - dev->irq, ret); - return ret; - } + if (ret) + return dev_err_probe(dev->dev, ret, + "failure requesting irq %i: %d\n", + dev->irq, ret); } ret = i2c_dw_init_recovery_info(dev); diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index b936a240db0a93..6eb16b7d75a6d0 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -266,11 +266,10 @@ int i2c_dw_probe_slave(struct dw_i2c_dev *dev) ret = devm_request_irq(dev->dev, dev->irq, i2c_dw_isr_slave, IRQF_SHARED, dev_name(dev->dev), dev); - if (ret) { - dev_err(dev->dev, "failure requesting IRQ %i: %d\n", - dev->irq, ret); - return ret; - } + if (ret) + return dev_err_probe(dev->dev, ret, + "failure requesting IRQ %i: %d\n", + dev->irq, ret); ret = i2c_add_numbered_adapter(adap); if (ret) From 2b7a2003ba01cde9a4958a50c55207f820766816 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Monin?= Date: Thu, 4 Sep 2025 16:31:07 +0200 Subject: [PATCH 2885/3206] i2c: designware: use dev_err_probe() when probing platform device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add calls to dev_err_probe() on error paths that can return -EPROBE_DEFER when probing the platform device. Namely when requesting the reset controller, when probing for lock support and when requesting the clocks. PCI device probing already uses dev_err_probe().
Signed-off-by: Benoît Monin Reviewed-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index a35e4c64a1d46f..be2f9533cef94e 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -238,7 +238,7 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) dev->rst = devm_reset_control_get_optional_exclusive(device, NULL); if (IS_ERR(dev->rst)) - return PTR_ERR(dev->rst); + return dev_err_probe(device, PTR_ERR(dev->rst), "failed to acquire reset\n"); reset_control_deassert(dev->rst); @@ -247,21 +247,23 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) goto exit_reset; ret = i2c_dw_probe_lock_support(dev); - if (ret) + if (ret) { + ret = dev_err_probe(device, ret, "failed to probe lock support\n"); goto exit_reset; + } i2c_dw_configure(dev); /* Optional interface clock */ dev->pclk = devm_clk_get_optional(device, "pclk"); if (IS_ERR(dev->pclk)) { - ret = PTR_ERR(dev->pclk); + ret = dev_err_probe(device, PTR_ERR(dev->pclk), "failed to acquire pclk\n"); goto exit_reset; } dev->clk = devm_clk_get_optional(device, NULL); if (IS_ERR(dev->clk)) { - ret = PTR_ERR(dev->clk); + ret = dev_err_probe(device, PTR_ERR(dev->clk), "failed to acquire clock\n"); goto exit_reset; } From 59ccb8176bd7e826d47962e891b460284f6978f0 Mon Sep 17 00:00:00 2001 From: I Viswanath Date: Mon, 25 Aug 2025 08:44:30 +0530 Subject: [PATCH 2886/3206] i2c: mux: Simplify boolean assignment in i2c_mux_alloc Refactor boolean field assignments of the form `if (a) b = true` to `b = !!(a)` in i2c_mux_alloc. Signed-off-by: I Viswanath Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-mux.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c index 4d8690981a55dc..d59644e50f14d0 100644 --- a/drivers/i2c/i2c-mux.c +++ b/drivers/i2c/i2c-mux.c @@ -241,12 +241,9 @@ struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent, muxc->parent = parent; muxc->dev = dev; - if (flags & I2C_MUX_LOCKED) - muxc->mux_locked = true; - if (flags & I2C_MUX_ARBITRATOR) - muxc->arbitrator = true; - if (flags & I2C_MUX_GATE) - muxc->gate = true; + muxc->mux_locked = !!(flags & I2C_MUX_LOCKED); + muxc->arbitrator = !!(flags & I2C_MUX_ARBITRATOR); + muxc->gate = !!(flags & I2C_MUX_GATE); muxc->select = select; muxc->deselect = deselect; muxc->max_adapters = max_adapters; From 12aad2960e9d6a32d7371e43cabcb02531ae3704 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Wed, 27 Aug 2025 17:43:44 +0800 Subject: [PATCH 2887/3206] i2c: busses: Fix some spelling errors Fix spelling errors in some comments. 
Signed-off-by: Xichao Zhao Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-hix5hd2.c | 2 +- drivers/i2c/busses/i2c-sprd.c | 2 +- drivers/i2c/busses/i2c-st.c | 2 +- drivers/i2c/busses/i2c-viperboard.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-hix5hd2.c b/drivers/i2c/busses/i2c-hix5hd2.c index 370f329747637c..5358f5ddf924b8 100644 --- a/drivers/i2c/busses/i2c-hix5hd2.c +++ b/drivers/i2c/busses/i2c-hix5hd2.c @@ -339,7 +339,7 @@ static int hix5hd2_i2c_xfer_msg(struct hix5hd2_i2c_priv *priv, ret = priv->state; /* - * If this is the last message to be transfered (stop == 1) + * If this is the last message to be transferred (stop == 1) * Then check if the bus can be brought back to idle. */ if (priv->state == HIX5I2C_STAT_RW_SUCCESS && stop) diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index 56b2e5c5fb49aa..26ec34b19ad51a 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -425,7 +425,7 @@ static irqreturn_t sprd_i2c_isr(int irq, void *dev_id) * If we did not get one ACK from target when writing data, then we * should finish this transmission since we got some errors. * - * When writing data, if i2c_tran == 0 which means we have writen + * When writing data, if i2c_tran == 0 which means we have written * done all data, then we can finish this transmission. * * When reading data, if conut < rx fifo full threshold, which diff --git a/drivers/i2c/busses/i2c-st.c b/drivers/i2c/busses/i2c-st.c index bf28f8e3ee6bda..97d70e66722706 100644 --- a/drivers/i2c/busses/i2c-st.c +++ b/drivers/i2c/busses/i2c-st.c @@ -152,7 +152,7 @@ struct st_i2c_timings { /** * struct st_i2c_client - client specific data * @addr: 8-bit target addr, including r/w bit - * @count: number of bytes to be transfered + * @count: number of bytes to be transferred * @xfered: number of bytes already transferred * @buf: data buffer * @result: result of the transfer diff --git a/drivers/i2c/busses/i2c-viperboard.c b/drivers/i2c/busses/i2c-viperboard.c index 1bd602852e35c1..f596efcc291c22 100644 --- a/drivers/i2c/busses/i2c-viperboard.c +++ b/drivers/i2c/busses/i2c-viperboard.c @@ -204,7 +204,7 @@ static int vprbrd_i2c_read(struct vprbrd *vb, struct i2c_msg *msg) /* copy the received data */ memcpy(msg->buf + start, rmsg, len1); - /* second read transfer if neccessary */ + /* second read transfer if necessary */ if (len2 > 0) { ret = vprbrd_i2c_receive(vb->usb_dev, rmsg, len2); if (ret < 0) From 1193c46c1745cf809dead55ece4f3baa728f316c Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 25 Sep 2025 10:00:13 -0700 Subject: [PATCH 2888/3206] selftests/bpf: Test changing packet data from global functions with a kfunc The verifier should invalidate all packet pointers after a packet data changing kfunc is called. So, similar to commit 3f23ee5590d9 ("selftests/bpf: test for changing packet data from global functions"), test changing packet data from global functions to make sure packet pointers are indeed invalidated. 
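For contrast, the safe pattern is to re-derive the packet pointers after any data-changing kfunc; a minimal sketch:

        void *data = (void *)(long)xdp->data;
        void *data_end = (void *)(long)xdp->data_end;

        bpf_xdp_pull_data(xdp, 0);      /* may move packet data */

        /* old pointers are invalidated: reload and re-check them */
        data = (void *)(long)xdp->data;
        data_end = (void *)(long)xdp->data_end;
        if (data + sizeof(int) > data_end)
                return XDP_DROP;
        *(int *)data = 42;              /* now safe */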
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250925170013.1752561-2-ameryhung@gmail.com --- .../selftests/bpf/progs/verifier_sock.c | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index bf88c644eb3038..b4d31f10a97687 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Converted from tools/testing/selftests/bpf/verifier/sock.c */ -#include +#include "vmlinux.h" #include #include "bpf_misc.h" @@ -1068,6 +1068,34 @@ int invalidate_pkt_pointers_from_global_func(struct __sk_buff *sk) return TCX_PASS; } +__noinline +long xdp_pull_data2(struct xdp_md *x, __u32 len) +{ + return bpf_xdp_pull_data(x, len); +} + +__noinline +long xdp_pull_data1(struct xdp_md *x, __u32 len) +{ + return xdp_pull_data2(x, len); +} + +/* global function calls bpf_xdp_pull_data(), which invalidates packet + * pointers established before global function call. + */ +SEC("xdp") +__failure __msg("invalid mem access") +int invalidate_xdp_pkt_pointers_from_global_func(struct xdp_md *x) +{ + int *p = (void *)(long)x->data; + + if ((void *)(p + 1) > (void *)(long)x->data_end) + return XDP_DROP; + xdp_pull_data1(x, 0); + *p = 42; /* this is unsafe */ + return XDP_PASS; +} + __noinline int tail_call(struct __sk_buff *sk) { From 105eb5dc74109a9f53c2f26c9a918d9347a73595 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Thu, 25 Sep 2025 22:52:30 +0100 Subject: [PATCH 2889/3206] selftests/bpf: Fix flaky bpf_cookie selftest bpf_cookie can fail on perf_event_open(), when it runs after the task_work selftest. The task_work test causes perf to lower sysctl_perf_event_sample_rate, and bpf_cookie uses sample_freq, which is validated against that sysctl. As a result, perf_event_open() rejects the attr if the (now tighter) limit is exceeded. >From perf_event_open(): if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; } else { if (attr.sample_period & (1ULL << 63)) return -EINVAL; } Switch bpf_cookie to use sample_period, which is not checked against sysctl_perf_event_sample_rate. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250925215230.265501-1-mykyta.yatsenko5@gmail.com --- tools/testing/selftests/bpf/prog_tests/bpf_cookie.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 4a0670c056bad6..75f4dff7d04220 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -450,8 +450,7 @@ static void pe_subtest(struct test_bpf_cookie *skel) attr.size = sizeof(attr); attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_CPU_CLOCK; - attr.freq = 1; - attr.sample_freq = 10000; + attr.sample_period = 100000; pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); if (!ASSERT_GE(pfd, 0, "perf_fd")) goto cleanup; From 7b7387650dcf2881fd8bb55bcf3c8bd6c9542dd7 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Fri, 12 Sep 2025 15:41:39 +0800 Subject: [PATCH 2890/3206] mm/hugetlb: fix folio is still mapped when deleted Migration may be raced with fallocating hole. remove_inode_single_folio will unmap the folio if the folio is still mapped. 
However, it's called without the folio lock. If the folio is migrated and the mapped pte has been converted to a migration entry, folio_mapped() returns false and the folio won't be unmapped. Due to the extra refcount held by remove_inode_single_folio(), migration fails, restores the migration entry to a normal pte, and the folio is mapped again. As a result, we triggered a BUG in filemap_unaccount_folio(). The log is as follows: BUG: Bad page cache in process hugetlb pfn:156c00 page: refcount:515 mapcount:0 mapping:0000000099fef6e1 index:0x0 pfn:0x156c00 head: order:9 mapcount:1 entire_mapcount:1 nr_pages_mapped:0 pincount:0 aops:hugetlbfs_aops ino:dcc dentry name(?):"my_hugepage_file" flags: 0x17ffffc00000c1(locked|waiters|head|node=0|zone=2|lastcpupid=0x1fffff) page_type: f4(hugetlb) page dumped because: still mapped when deleted CPU: 1 UID: 0 PID: 395 Comm: hugetlb Not tainted 6.17.0-rc5-00044-g7aac71907bde-dirty #484 NONE Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl+0x4f/0x70 filemap_unaccount_folio+0xc4/0x1c0 __filemap_remove_folio+0x38/0x1c0 filemap_remove_folio+0x41/0xd0 remove_inode_hugepages+0x142/0x250 hugetlbfs_fallocate+0x471/0x5a0 vfs_fallocate+0x149/0x380 Hold the folio lock before checking if the folio is mapped, to avoid racing with migration. Link: https://lkml.kernel.org/r/20250912074139.3575005-1-tujinjiang@huawei.com Fixes: 4aae8d1c051e ("mm/hugetlbfs: unmap pages if page fault raced with hole punch") Signed-off-by: Jinjiang Tu Cc: David Hildenbrand Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 09d4baef29cf9e..be4be99304bc01 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -517,14 +517,16 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, /* * If folio is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) while holding - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. + * unmapped in caller or hugetlb_vmdelete_list() skips + * unmapping it due to fail to grab lock. Unmap (again) + * while holding the fault mutex. The mutex will prevent + * faults until we finish removing the folio. Hold folio + * lock to guarantee no concurrent migration. */ + folio_lock(folio); if (unlikely(folio_mapped(folio))) hugetlb_unmap_file_folio(h, mapping, folio, index); - folio_lock(folio); /* * We must remove the folio from page cache before removing * the region/ reserve map (hugetlb_unreserve_pages). In From 14967a9c7d247841b0312c48dcf8cd29e55a4cc8 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 15 Sep 2025 18:45:20 -0600 Subject: [PATCH 2891/3206] mm/hugetlb: fix copy_hugetlb_page_range() to use ->pt_share_count commit 59d9094df3d79 ("mm: hugetlb: independent PMD page table shared count") introduced ->pt_share_count dedicated to hugetlb PMD share count tracking, but omitted fixing copy_hugetlb_page_range(), leaving the function relying on page_count() for tracking, which no longer works. When lazy page table copying for hugetlb is disabled, that is, with commit bcd51a3c679d ("hugetlb: lazy page table copies in fork()") reverted, fork()'ing with hugetlb PMD sharing quickly locks up - [ 239.446559] watchdog: BUG: soft lockup - CPU#75 stuck for 27s!
[ 239.446611] RIP: 0010:native_queued_spin_lock_slowpath+0x7e/0x2e0 [ 239.446631] Call Trace: [ 239.446633] [ 239.446636] _raw_spin_lock+0x3f/0x60 [ 239.446639] copy_hugetlb_page_range+0x258/0xb50 [ 239.446645] copy_page_range+0x22b/0x2c0 [ 239.446651] dup_mmap+0x3e2/0x770 [ 239.446654] dup_mm.constprop.0+0x5e/0x230 [ 239.446657] copy_process+0xd17/0x1760 [ 239.446660] kernel_clone+0xc0/0x3e0 [ 239.446661] __do_sys_clone+0x65/0xa0 [ 239.446664] do_syscall_64+0x82/0x930 [ 239.446668] ? count_memcg_events+0xd2/0x190 [ 239.446671] ? syscall_trace_enter+0x14e/0x1f0 [ 239.446676] ? syscall_exit_work+0x118/0x150 [ 239.446677] ? arch_exit_to_user_mode_prepare.constprop.0+0x9/0xb0 [ 239.446681] ? clear_bhb_loop+0x30/0x80 [ 239.446684] ? clear_bhb_loop+0x30/0x80 [ 239.446686] entry_SYSCALL_64_after_hwframe+0x76/0x7e There are two options to resolve the potential latent issue: 1. warn against PMD sharing in copy_hugetlb_page_range(), 2. fix it. This patch opts for the second option. While at it, simplify the comment, the details are not actually relevant anymore. Link: https://lkml.kernel.org/r/20250916004520.1604530-1-jane.chu@oracle.com Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: Jane Chu Reviewed-by: Harry Yoo Acked-by: Oscar Salvador Acked-by: David Hildenbrand Cc: Jann Horn Cc: Liu Shixin Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++++ mm/hugetlb.c | 15 +++++---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db9348..a643fae8a3494a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -631,6 +631,11 @@ static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) { return atomic_read(&ptdesc->pt_share_count); } + +static inline bool ptdesc_pmd_is_shared(struct ptdesc *ptdesc) +{ + return !!ptdesc_pmd_pts_count(ptdesc); +} #else static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eed59cfb5d218a..6cfe0b43ab8f96 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5594,18 +5594,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, break; } - /* - * If the pagetables are shared don't copy or take references. - * - * dst_pte == src_pte is the common case of src/dest sharing. - * However, src could have 'unshared' and dst shares with - * another vma. So page_count of ptep page is checked instead - * to reliably determine whether pte is shared. 
- */ - if (page_count(virt_to_page(dst_pte)) > 1) { +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + /* If the pagetables are shared, there is nothing to do */ + if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) { addr |= last_addr_mask; continue; } +#endif dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); @@ -7602,7 +7597,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, hugetlb_vma_assert_locked(vma); if (sz != PMD_SIZE) return 0; - if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) + if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); From 85e1ff61060a765d91ee62dc5606d4d547d9d105 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Sep 2025 12:58:58 -0700 Subject: [PATCH 2892/3206] kmsan: fix out-of-bounds access to shadow memory Running sha224_kunit on a KMSAN-enabled kernel results in a crash in kmsan_internal_set_shadow_origin(): BUG: unable to handle page fault for address: ffffbc3840291000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1810067 P4D 1810067 PUD 192d067 PMD 3c17067 PTE 0 Oops: 0000 [#1] SMP NOPTI CPU: 0 UID: 0 PID: 81 Comm: kunit_try_catch Tainted: G N 6.17.0-rc3 #10 PREEMPT(voluntary) Tainted: [N]=TEST Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.17.0-0-gb52ca86e094d-prebuilt.qemu.org 04/01/2014 RIP: 0010:kmsan_internal_set_shadow_origin+0x91/0x100 [...] Call Trace: __msan_memset+0xee/0x1a0 sha224_final+0x9e/0x350 test_hash_buffer_overruns+0x46f/0x5f0 ? kmsan_get_shadow_origin_ptr+0x46/0xa0 ? __pfx_test_hash_buffer_overruns+0x10/0x10 kunit_try_run_case+0x198/0xa00 This occurs when memset() is called on a buffer that is not 4-byte aligned and extends to the end of a guard page, i.e. the next page is unmapped. The bug is that the loop at the end of kmsan_internal_set_shadow_origin() accesses the wrong shadow memory bytes when the address is not 4-byte aligned. Since each 4 bytes are associated with an origin, it rounds the address and size so that it can access all the origins that contain the buffer. However, when it checks the corresponding shadow bytes for a particular origin, it incorrectly uses the original unrounded shadow address. This results in reads from shadow memory beyond the end of the buffer's shadow memory, which crashes when that memory is not mapped. To fix this, correctly align the shadow address before accessing the 4 shadow bytes corresponding to each origin. 
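A worked example, assuming KMSAN_ORIGIN_SIZE == 4: a memset() at address 0x1001 of size 8 gives pad = 1, so the origin walk covers the aligned range [0x1000, 0x100c) in three 4-byte slots. The shadow bytes for slot i must be read starting at the shadow of 0x1000 + 4*i; reading from the unaligned shadow of 0x1001 + 4*i instead makes the last read end at the shadow of 0x100d, one byte past the aligned range, which faults when that byte falls on an unmapped guard page.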
Link: https://lkml.kernel.org/r/20250911195858.394235-1-ebiggers@kernel.org Fixes: 2ef3cec44c60 ("kmsan: do not wipe out origin when doing partial unpoisoning") Signed-off-by: Eric Biggers Tested-by: Alexander Potapenko Reviewed-by: Alexander Potapenko Cc: Dmitriy Vyukov Cc: Marco Elver Cc: Signed-off-by: Andrew Morton --- mm/kmsan/core.c | 10 +++++++--- mm/kmsan/kmsan_test.c | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 1ea711786c522d..8bca7fece47f0e 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -195,7 +195,8 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, u32 origin, bool checked) { u64 address = (u64)addr; - u32 *shadow_start, *origin_start; + void *shadow_start; + u32 *aligned_shadow, *origin_start; size_t pad = 0; KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); @@ -214,9 +215,12 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, } __memset(shadow_start, b, size); - if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + if (IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + aligned_shadow = shadow_start; + } else { pad = address % KMSAN_ORIGIN_SIZE; address -= pad; + aligned_shadow = shadow_start - pad; size += pad; } size = ALIGN(size, KMSAN_ORIGIN_SIZE); @@ -230,7 +234,7 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, * corresponding shadow slot is zero. */ for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) { - if (origin || !shadow_start[i]) + if (origin || !aligned_shadow[i]) origin_start[i] = origin; } } diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index c6c5b2bbede0cc..902ec48b1e3e6a 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -556,6 +556,21 @@ DEFINE_TEST_MEMSETXX(16) DEFINE_TEST_MEMSETXX(32) DEFINE_TEST_MEMSETXX(64) +/* Test case: ensure that KMSAN does not access shadow memory out of bounds. 
*/ +static void test_memset_on_guarded_buffer(struct kunit *test) +{ + void *buf = vmalloc(PAGE_SIZE); + + kunit_info(test, + "memset() on ends of guarded buffer should not crash\n"); + + for (size_t size = 0; size <= 128; size++) { + memset(buf, 0xff, size); + memset(buf + PAGE_SIZE - size, 0xff, size); + } + vfree(buf); +} + static noinline void fibonacci(int *array, int size, int start) { if (start < 2 || (start == size)) @@ -677,6 +692,7 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_memset16), KUNIT_CASE(test_memset32), KUNIT_CASE(test_memset64), + KUNIT_CASE(test_memset_on_guarded_buffer), KUNIT_CASE(test_long_origin_chain), KUNIT_CASE(test_stackdepot_roundtrip), KUNIT_CASE(test_unpoison_memory), From 28aa29986dde79e8466bc87569141291053833f5 Mon Sep 17 00:00:00 2001 From: Jakub Acs Date: Mon, 22 Sep 2025 08:22:05 +0000 Subject: [PATCH 2893/3206] fs/proc/task_mmu: check p->vec_buf for NULL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the PAGEMAP_SCAN ioctl is invoked with vec_len = 0 reaches pagemap_scan_backout_range(), kernel panics with null-ptr-deref: [ 44.936808] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN NOPTI [ 44.937797] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] [ 44.938391] CPU: 1 UID: 0 PID: 2480 Comm: reproducer Not tainted 6.17.0-rc6 #22 PREEMPT(none) [ 44.939062] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 44.939935] RIP: 0010:pagemap_scan_thp_entry.isra.0+0x741/0xa80 [ 44.946828] Call Trace: [ 44.947030] [ 44.949219] pagemap_scan_pmd_entry+0xec/0xfa0 [ 44.952593] walk_pmd_range.isra.0+0x302/0x910 [ 44.954069] walk_pud_range.isra.0+0x419/0x790 [ 44.954427] walk_p4d_range+0x41e/0x620 [ 44.954743] walk_pgd_range+0x31e/0x630 [ 44.955057] __walk_page_range+0x160/0x670 [ 44.956883] walk_page_range_mm+0x408/0x980 [ 44.958677] walk_page_range+0x66/0x90 [ 44.958984] do_pagemap_scan+0x28d/0x9c0 [ 44.961833] do_pagemap_cmd+0x59/0x80 [ 44.962484] __x64_sys_ioctl+0x18d/0x210 [ 44.962804] do_syscall_64+0x5b/0x290 [ 44.963111] entry_SYSCALL_64_after_hwframe+0x76/0x7e vec_len = 0 in pagemap_scan_init_bounce_buffer() means no buffers are allocated and p->vec_buf remains set to NULL. This breaks an assumption made later in pagemap_scan_backout_range(), that page_region is always allocated for p->vec_buf_index. Fix it by explicitly checking p->vec_buf for NULL before dereferencing. Other sites that might run into same deref-issue are already (directly or transitively) protected by checking p->vec_buf. Note: From PAGEMAP_SCAN man page, it seems vec_len = 0 is valid when no output is requested and it's only the side effects caller is interested in, hence it passes check in pagemap_scan_get_args(). This issue was found by syzkaller. 
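For reference, a hedged user-space sketch of the triggering shape, invoking the ioctl purely for its side effects (field values are illustrative; PM_SCAN_WP_MATCHING additionally assumes async userfaultfd write-protection has been set up on the range):

        struct pm_scan_arg arg = {
                .size = sizeof(arg),
                .start = start,                 /* region to scan */
                .end = end,
                .vec = 0,
                .vec_len = 0,                   /* no output buffer: valid */
                .flags = PM_SCAN_WP_MATCHING,   /* side effect only */
                .category_mask = PAGE_IS_WRITTEN,
                .return_mask = PAGE_IS_WRITTEN,
        };

        ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);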
Link: https://lkml.kernel.org/r/20250922082206.6889-1-acsjakub@amazon.de Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs") Signed-off-by: Jakub Acs Reviewed-by: Muhammad Usama Anjum Acked-by: David Hildenbrand Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Jinjiang Tu Cc: Suren Baghdasaryan Cc: Penglei Jiang Cc: Mark Brown Cc: Baolin Wang Cc: Ryan Roberts Cc: Andrei Vagin Cc: "Michał Mirosław" Cc: Stephen Rothwell Cc: Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 29cca0e6d0ff5a..b26ae556b4463e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2417,6 +2417,9 @@ static void pagemap_scan_backout_range(struct pagemap_scan_private *p, { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + if (!p->vec_buf) + return; + if (cur_buf->start != addr) cur_buf->end = addr; else From 87e1c7c9a09604ac6a8f6ccf3086aec24a76e5a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bence=20Cs=C3=B3k=C3=A1s?= Date: Mon, 15 Sep 2025 11:05:42 +0200 Subject: [PATCH 2894/3206] =?UTF-8?q?mailmap:=20add=20entry=20for=20Bence?= =?UTF-8?q?=20Cs=C3=B3k=C3=A1s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I will be leaving Prolan this week. You can reach me by my personal email for now. Link: https://lkml.kernel.org/r/20250915-mailmap-v1-1-9ebdea93c6a7@prolan.hu Signed-off-by: Bence Csókás Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index a124aeed52a2a3..82a1773253aad3 100644 --- a/.mailmap +++ b/.mailmap @@ -134,6 +134,7 @@ Ben M Cahill Ben Widawsky Ben Widawsky Ben Widawsky +Bence Csókás Benjamin Poirier Benjamin Tissoires Benjamin Tissoires From 06195ee967d06ead757f9291bbaf1a0b30fa10b8 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 20 Sep 2025 22:25:46 +0900 Subject: [PATCH 2895/3206] mm/damon/sysfs: do not ignore callback's return value in damon_sysfs_damon_call() The callback return value is ignored in damon_sysfs_damon_call(), which means that it is not possible to detect invalid user input when writing commands such as 'commit' to /sys/kernel/mm/damon/admin/kdamonds//state. Fix it. Link: https://lkml.kernel.org/r/20250920132546.5822-1-akinobu.mita@gmail.com Fixes: f64539dcdb87 ("mm/damon/sysfs: use damon_call() for update_schemes_stats") Signed-off-by: Akinobu Mita Reviewed-by: SeongJae Park Cc: [6.14+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index c96c2154128faa..7308dee97b2109 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1592,12 +1592,14 @@ static int damon_sysfs_damon_call(int (*fn)(void *data), struct damon_sysfs_kdamond *kdamond) { struct damon_call_control call_control = {}; + int err; if (!kdamond->damon_ctx) return -EINVAL; call_control.fn = fn; call_control.data = kdamond; - return damon_call(kdamond->damon_ctx, &call_control); + err = damon_call(kdamond->damon_ctx, &call_control); + return err ? 
err : call_control.return_code;
 }
 
 struct damon_sysfs_schemes_walk_data {

From 7e89979f6695fb56e8739b7d19614256e637131d Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sat, 13 Sep 2025 17:03:39 -0700
Subject: [PATCH 2896/3206] include/linux/pgtable.h: convert
 arch_enter_lazy_mmu_mode() and friends to static inlines

commit c519c3c0a113 ("mm/kasan: avoid lazy MMU mode hazards") introduced
the use of arch_enter_lazy_mmu_mode(), which results in the compiler
complaining about "statement has no effect" when
__HAVE_ARCH_ENTER_LAZY_MMU_MODE is not defined in include/linux/pgtable.h.

The exact warning/error is:

In file included from ./include/linux/kasan.h:37,
                 from mm/kasan/shadow.c:14:
mm/kasan/shadow.c: In function 'kasan_populate_vmalloc_pte':
./include/linux/pgtable.h:247:41: error: statement with no effect [-Werror=unused-value]
  247 | #define arch_enter_lazy_mmu_mode()	(LAZY_MMU_DEFAULT)
      |                                     	^
mm/kasan/shadow.c:322:9: note: in expansion of macro 'arch_enter_lazy_mmu_mode'
  322 |         arch_enter_lazy_mmu_mode();
      |         ^~~~~~~~~~~~~~~~~~~~~~~~

Switching these "functions" to static inlines fixes this up.

Fixes: c519c3c0a113 ("mm/kasan: avoid lazy MMU mode hazards")
Reported-by: Balbir Singh
Closes: https://lkml.kernel.org/r/20250912235515.367061-1-balbirs@nvidia.com
Cc: Alexander Gordeev
Cc: Andrey Ryabinin
Signed-off-by: Andrew Morton
---
 include/linux/pgtable.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2b80fd456c8b55..25a7257052ff94 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -232,9 +232,9 @@ static inline int pmd_dirty(pmd_t pmd)
  * and the mode cannot be used in interrupt context.
  */
 #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
-#define arch_enter_lazy_mmu_mode()	do {} while (0)
-#define arch_leave_lazy_mmu_mode()	do {} while (0)
-#define arch_flush_lazy_mmu_mode()	do {} while (0)
+static inline void arch_enter_lazy_mmu_mode(void) {}
+static inline void arch_leave_lazy_mmu_mode(void) {}
+static inline void arch_flush_lazy_mmu_mode(void) {}
 #endif
 
 #ifndef pte_batch_hint

From 17f0d1f6321caa95699b8f96baf12e654d7b8d60 Mon Sep 17 00:00:00 2001
From: Tao Chen
Date: Fri, 26 Sep 2025 01:50:28 +0800
Subject: [PATCH 2897/3206] bpf: Add lookup_and_delete_elem for
 BPF_MAP_STACK_TRACE

The stacktrace map can easily become full, which leads to failures when
obtaining new stack traces. Besides increasing the size of the map,
another solution is to delete a stack_id once user space has looked it
up, so extend the existing bpf_map_lookup_and_delete_elem() functionality
to stacktrace map types.
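As a usage sketch (illustrative only; map_fd is assumed to be the fd of a
BPF_MAP_TYPE_STACK_TRACE map, and PERF_MAX_STACK_DEPTH is taken from
<linux/perf_event.h>):

#include <errno.h>
#include <bpf/bpf.h>

#define PERF_MAX_STACK_DEPTH	127	/* value from <linux/perf_event.h> */

/* Illustrative only: read one stack trace and free its map slot in a
 * single syscall, so the stacktrace map does not stay full. */
static int drain_stack_id(int map_fd, __u32 stack_id)
{
	__u64 stack[PERF_MAX_STACK_DEPTH] = {};

	/* copies the trace into 'stack' and deletes 'stack_id';
	 * a second call fails with ENOENT once the id is gone */
	if (bpf_map_lookup_and_delete_elem(map_fd, &stack_id, stack))
		return -errno;
	/* ... symbolize or print 'stack' here ... */
	return 0;
}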
Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250925175030.1615837-1-chen.dylane@linux.dev --- include/linux/bpf.h | 2 +- kernel/bpf/stackmap.c | 16 ++++++++++++++-- kernel/bpf/syscall.c | 8 +++++--- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ea2ed6771cc60c..6338e54a9b1f8d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2724,7 +2724,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); -int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); +int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, bool delete); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa98..2e182a3ac4ce54 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -646,7 +646,15 @@ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall */ -int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return bpf_stackmap_extract(map, key, value, true); +} + +/* Called from syscall */ +int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, + bool delete) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *old_bucket; @@ -663,7 +671,10 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); - old_bucket = xchg(&smap->buckets[id], bucket); + if (delete) + old_bucket = bucket; + else + old_bucket = xchg(&smap->buckets[id], bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; @@ -754,6 +765,7 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, .map_lookup_elem = stack_map_lookup_elem, + .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index adb05d235011ff..a48fa86f82a7fc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -320,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { - err = bpf_stackmap_copy(map, key, value); + err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { @@ -1666,7 +1666,8 @@ struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); -int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, + bool delete) { return -ENOTSUPP; } @@ -2197,7 +2198,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_HASH || map->map_type == 
BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_STACK_TRACE) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); rcu_read_lock(); From 363b17e273f0929ba7791231a0bbb5424204d93a Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Fri, 26 Sep 2025 01:50:29 +0800 Subject: [PATCH 2898/3206] selftests/bpf: Refactor stacktrace_map case with skeleton The loading method of the stacktrace_map test case looks too outdated, refactor it with skeleton, and we can use global variable feature in the next patch. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250925175030.1615837-2-chen.dylane@linux.dev --- .../selftests/bpf/prog_tests/stacktrace_map.c | 56 ++++++------------- .../bpf/prog_tests/stacktrace_map_raw_tp.c | 2 +- ...test_stacktrace_map.c => stacktrace_map.c} | 0 3 files changed, 18 insertions(+), 40 deletions(-) rename tools/testing/selftests/bpf/progs/{test_stacktrace_map.c => stacktrace_map.c} (100%) diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c index 84a7e405e9129d..2e3da2030a2df1 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c @@ -1,46 +1,26 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "stacktrace_map.skel.h" void test_stacktrace_map(void) { + struct stacktrace_map *skel; int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; - const char *prog_name = "oncpu"; - int err, prog_fd, stack_trace_len; - const char *file = "./test_stacktrace_map.bpf.o"; + int err, stack_trace_len; __u32 key, val, duration = 0; - struct bpf_program *prog; - struct bpf_object *obj; - struct bpf_link *link; - err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); - if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) + skel = stacktrace_map__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) return; - prog = bpf_object__find_program_by_name(obj, prog_name); - if (CHECK(!prog, "find_prog", "prog '%s' not found\n", prog_name)) - goto close_prog; - - link = bpf_program__attach_tracepoint(prog, "sched", "sched_switch"); - if (!ASSERT_OK_PTR(link, "attach_tp")) - goto close_prog; - - /* find map fds */ - control_map_fd = bpf_find_map(__func__, obj, "control_map"); - if (CHECK_FAIL(control_map_fd < 0)) - goto disable_pmu; - - stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); - if (CHECK_FAIL(stackid_hmap_fd < 0)) - goto disable_pmu; - - stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); - if (CHECK_FAIL(stackmap_fd < 0)) - goto disable_pmu; - - stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap"); - if (CHECK_FAIL(stack_amap_fd < 0)) - goto disable_pmu; + control_map_fd = bpf_map__fd(skel->maps.control_map); + stackid_hmap_fd = bpf_map__fd(skel->maps.stackid_hmap); + stackmap_fd = bpf_map__fd(skel->maps.stackmap); + stack_amap_fd = bpf_map__fd(skel->maps.stack_amap); + err = stacktrace_map__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; /* give some time for bpf program run */ sleep(1); @@ -55,21 +35,19 @@ void test_stacktrace_map(void) err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. 
stackmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto out; err = compare_map_keys(stackmap_fd, stackid_hmap_fd); if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto out; stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64); err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len); if (CHECK(err, "compare_stack_ips stackmap vs. stack_amap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto out; -disable_pmu: - bpf_link__destroy(link); -close_prog: - bpf_object__close(obj); +out: + stacktrace_map__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c index e0cb4697b4b3cc..e985d51d3d4789 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c @@ -5,7 +5,7 @@ void test_stacktrace_map_raw_tp(void) { const char *prog_name = "oncpu"; int control_map_fd, stackid_hmap_fd, stackmap_fd; - const char *file = "./test_stacktrace_map.bpf.o"; + const char *file = "./stacktrace_map.bpf.o"; __u32 key, val, duration = 0; int err, prog_fd; struct bpf_program *prog; diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/stacktrace_map.c similarity index 100% rename from tools/testing/selftests/bpf/progs/test_stacktrace_map.c rename to tools/testing/selftests/bpf/progs/stacktrace_map.c From d43029ff7d1b7183dc0cf11b6cc2c12a0b810ad8 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Fri, 26 Sep 2025 01:50:30 +0800 Subject: [PATCH 2899/3206] selftests/bpf: Add stacktrace map lookup_and_delete_elem test case Add tests for stacktrace map lookup and delete: 1. use bpf_map_lookup_and_delete_elem to lookup and delete the target stack_id, 2. lookup the deleted stack_id again to double check. 
Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250925175030.1615837-3-chen.dylane@linux.dev --- .../testing/selftests/bpf/prog_tests/stacktrace_map.c | 11 ++++++++++- tools/testing/selftests/bpf/progs/stacktrace_map.c | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c index 2e3da2030a2df1..c23b97414813ab 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c @@ -7,7 +7,8 @@ void test_stacktrace_map(void) struct stacktrace_map *skel; int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; int err, stack_trace_len; - __u32 key, val, duration = 0; + __u32 key, val, stack_id, duration = 0; + __u64 stack[PERF_MAX_STACK_DEPTH]; skel = stacktrace_map__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) @@ -48,6 +49,14 @@ void test_stacktrace_map(void) "err %d errno %d\n", err, errno)) goto out; + stack_id = skel->bss->stack_id; + err = bpf_map_lookup_and_delete_elem(stackmap_fd, &stack_id, stack); + if (!ASSERT_OK(err, "lookup and delete target stack_id")) + goto out; + + err = bpf_map_lookup_elem(stackmap_fd, &stack_id, stack); + if (!ASSERT_EQ(err, -ENOENT, "lookup deleted stack_id")) + goto out; out: stacktrace_map__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/stacktrace_map.c b/tools/testing/selftests/bpf/progs/stacktrace_map.c index 47568007b6683b..0c77df05be7fce 100644 --- a/tools/testing/selftests/bpf/progs/stacktrace_map.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_map.c @@ -50,6 +50,7 @@ struct sched_switch_args { int next_prio; }; +__u32 stack_id; SEC("tracepoint/sched/sched_switch") int oncpu(struct sched_switch_args *ctx) { @@ -64,6 +65,7 @@ int oncpu(struct sched_switch_args *ctx) /* The size of stackmap and stackid_hmap should be the same */ key = bpf_get_stackid(ctx, &stackmap, 0); if ((int)key >= 0) { + stack_id = key; bpf_map_update_elem(&stackid_hmap, &key, &val, 0); stack_p = bpf_map_lookup_elem(&stack_amap, &key); if (stack_p) From 27fa1a8b2803dfd88c39f03b0969c55f667cdc43 Mon Sep 17 00:00:00 2001 From: Olivier Moysan Date: Tue, 16 Sep 2025 14:31:18 +0200 Subject: [PATCH 2900/3206] ASoC: stm32: sai: manage context in set_sysclk callback The mclk direction now needs to be specified in endpoint node with "system-clock-direction-out" property. However some calls to the set_sysclk callback, related to CPU DAI clock, result in unbalanced calls to clock API. The set_sysclk callback in STM32 SAI driver is intended only for mclk management. So it is relevant to ensure that calls to set_sysclk are related to mclk only. Since the master clock is handled only at runtime, skip the calls to set_sysclk in the initialization phase. Signed-off-by: Olivier Moysan Link: https://patch.msgid.link/20250916123118.84175-1-olivier.moysan@foss.st.com Signed-off-by: Mark Brown --- sound/soc/stm/stm32_sai_sub.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sound/soc/stm/stm32_sai_sub.c b/sound/soc/stm/stm32_sai_sub.c index 463a2b7d023b9c..0ae1eae2a59e2f 100644 --- a/sound/soc/stm/stm32_sai_sub.c +++ b/sound/soc/stm/stm32_sai_sub.c @@ -672,6 +672,14 @@ static int stm32_sai_set_sysclk(struct snd_soc_dai *cpu_dai, struct stm32_sai_sub_data *sai = snd_soc_dai_get_drvdata(cpu_dai); int ret; + /* + * The mclk rate is determined at runtime from the audio stream rate. 
+	 * Skip calls to the set_sysclk callback that are not relevant during the
+	 * initialization phase.
+	 */
+	if (!snd_soc_card_is_instantiated(cpu_dai->component->card))
+		return 0;
+
 	if (dir == SND_SOC_CLOCK_OUT && sai->sai_mclk) {
 		ret = stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX,
 					   SAI_XCR1_NODIV,

From ba0c67d3c4b0ce5ec5e6de35e6433b22eecb1f6a Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov
Date: Wed, 24 Sep 2025 22:34:02 +0300
Subject: [PATCH 2901/3206] ASoC: qcom: sc8280xp: use sa8775p/ subdir for
 QCS9100 / QCS9075

All firmware for the Lemans platform aka QCS9100 aka QCS9075 is for
historical reasons located in the qcom/sa8775p/ subdir inside
linux-firmware. The only exceptions to this rule are audio topology
files. While it's not too late, change the subdir to point to the
sa8775p/ subdir, so that all firmware for that platform is present at
the same location.

Fixes: 5b5bf5922f4c ("ASoC: qcom: sc8280xp: Add sound card support for QCS9100 and QCS9075")
Signed-off-by: Dmitry Baryshkov
Reviewed-by: Srinivas Kandagatla
Link: https://patch.msgid.link/20250924-lemans-evk-topo-v2-1-7d44909a5758@oss.qualcomm.com
Signed-off-by: Mark Brown
---
 sound/soc/qcom/sc8280xp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sound/soc/qcom/sc8280xp.c b/sound/soc/qcom/sc8280xp.c
index 288ccd7f8866a6..6847ae4acbd183 100644
--- a/sound/soc/qcom/sc8280xp.c
+++ b/sound/soc/qcom/sc8280xp.c
@@ -191,8 +191,8 @@ static const struct of_device_id snd_sc8280xp_dt_match[] = {
 	{.compatible = "qcom,qcm6490-idp-sndcard", "qcm6490"},
 	{.compatible = "qcom,qcs6490-rb3gen2-sndcard", "qcs6490"},
 	{.compatible = "qcom,qcs8275-sndcard", "qcs8300"},
-	{.compatible = "qcom,qcs9075-sndcard", "qcs9075"},
-	{.compatible = "qcom,qcs9100-sndcard", "qcs9100"},
+	{.compatible = "qcom,qcs9075-sndcard", "sa8775p"},
+	{.compatible = "qcom,qcs9100-sndcard", "sa8775p"},
 	{.compatible = "qcom,sc8280xp-sndcard", "sc8280xp"},
 	{.compatible = "qcom,sm8450-sndcard", "sm8450"},
 	{.compatible = "qcom,sm8550-sndcard", "sm8550"},

From 733a763dd8b3ac2858dd238a91bb3a2fdff4739e Mon Sep 17 00:00:00 2001
From: Primoz Fiser
Date: Thu, 25 Sep 2025 10:59:29 +0200
Subject: [PATCH 2902/3206] ASoC: tlv320aic3x: Fix class-D initialization for
 tlv320aic3007

The problem with having the class-D initialization sequence in probe via
regmap_register_patch() is that it does hardware register writes
immediately when called, as it bypasses the regcache. Afterwards, in
aic3x_init(), we also perform a codec soft reset, rendering the class-D
init sequence pointless.

This issue is even more apparent when using a reset GPIO line, since in
that case class-D amplifier initialization fails with the "Failed to
init class D: -5" message: the codec is already held in reset after
requesting the reset GPIO, and hence hardware I/O fails with -EIO.

Thus move the class-D amplifier initialization sequence from the probe
function to aic3x_set_power(), just before the usual regcache sync. Use
the bypassing regmap_multi_reg_write_bypassed() function to make sure
the class-D init sequence is performed in the proper order described in
the datasheet.
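A condensed sketch of the bypassed-write pattern (the register/value
pairs below are hypothetical placeholders, not the aic3007 sequence):

#include <linux/kernel.h>
#include <linux/regmap.h>

/* Illustrative sketch: regmap_multi_reg_write_bypassed() writes the
 * sequence to the hardware immediately and in order, without recording
 * it in the regcache, which suits init sequences that must run at
 * power-up time, right before regcache_sync(). */
static int example_apply_init_seq(struct regmap *map)
{
	static const struct reg_sequence init_seq[] = {
		{ 0x0d, 0x0d },	/* hypothetical register/value pairs */
		{ 0x08, 0x5c },
		{ 0x08, 0x5d },
		{ 0x08, 0x5c },
	};

	return regmap_multi_reg_write_bypassed(map, init_seq,
					       ARRAY_SIZE(init_seq));
}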
Signed-off-by: Primoz Fiser Link: https://patch.msgid.link/20250925085929.2581749-1-primoz.fiser@norik.com Signed-off-by: Mark Brown --- sound/soc/codecs/tlv320aic3x.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/sound/soc/codecs/tlv320aic3x.c b/sound/soc/codecs/tlv320aic3x.c index f1649df197389d..eea8ca285f8e0f 100644 --- a/sound/soc/codecs/tlv320aic3x.c +++ b/sound/soc/codecs/tlv320aic3x.c @@ -121,6 +121,16 @@ static const struct reg_default aic3x_reg[] = { { 108, 0x00 }, { 109, 0x00 }, }; +static const struct reg_sequence aic3007_class_d[] = { + /* Class-D speaker driver init; datasheet p. 46 */ + { AIC3X_PAGE_SELECT, 0x0D }, + { 0xD, 0x0D }, + { 0x8, 0x5C }, + { 0x8, 0x5D }, + { 0x8, 0x5C }, + { AIC3X_PAGE_SELECT, 0x00 }, +}; + static bool aic3x_volatile_reg(struct device *dev, unsigned int reg) { switch (reg) { @@ -1393,6 +1403,10 @@ static int aic3x_set_power(struct snd_soc_component *component, int power) gpiod_set_value(aic3x->gpio_reset, 0); } + if (aic3x->model == AIC3X_MODEL_3007) + regmap_multi_reg_write_bypassed(aic3x->regmap, aic3007_class_d, + ARRAY_SIZE(aic3007_class_d)); + /* Sync reg_cache with the hardware */ regcache_cache_only(aic3x->regmap, false); regcache_sync(aic3x->regmap); @@ -1723,17 +1737,6 @@ static void aic3x_configure_ocmv(struct device *dev, struct aic3x_priv *aic3x) } } - -static const struct reg_sequence aic3007_class_d[] = { - /* Class-D speaker driver init; datasheet p. 46 */ - { AIC3X_PAGE_SELECT, 0x0D }, - { 0xD, 0x0D }, - { 0x8, 0x5C }, - { 0x8, 0x5D }, - { 0x8, 0x5C }, - { AIC3X_PAGE_SELECT, 0x00 }, -}; - int aic3x_probe(struct device *dev, struct regmap *regmap, kernel_ulong_t driver_data) { struct aic3x_priv *aic3x; @@ -1823,13 +1826,6 @@ int aic3x_probe(struct device *dev, struct regmap *regmap, kernel_ulong_t driver aic3x_configure_ocmv(dev, aic3x); - if (aic3x->model == AIC3X_MODEL_3007) { - ret = regmap_register_patch(aic3x->regmap, aic3007_class_d, - ARRAY_SIZE(aic3007_class_d)); - if (ret != 0) - dev_err(dev, "Failed to init class D: %d\n", ret); - } - ret = devm_snd_soc_register_component(dev, &soc_component_dev_aic3x, &aic3x_dai, 1); if (ret) return ret; From 76bb6969a8cfc5e00ca142fdad86ffd0a6ed9ecd Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 25 Sep 2025 15:22:19 -0400 Subject: [PATCH 2903/3206] dt-bindings: hwmon: (lm75) allow interrupt for ti,tmp75 Allow interrupt for ti,tmp75 because chip has open drain ALERT signal. Signed-off-by: Frank Li Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250925192219.303825-1-Frank.Li@nxp.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/lm75.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/hwmon/lm75.yaml b/Documentation/devicetree/bindings/hwmon/lm75.yaml index ecdd09a032e512..0b9fda81e3ec50 100644 --- a/Documentation/devicetree/bindings/hwmon/lm75.yaml +++ b/Documentation/devicetree/bindings/hwmon/lm75.yaml @@ -70,6 +70,7 @@ allOf: - ti,tmp100 - ti,tmp101 - ti,tmp112 + - ti,tmp75 then: properties: interrupts: false From cafb47be3f38ad81306bf894e743bebc2ccf66ab Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Sat, 9 Aug 2025 10:35:15 +0530 Subject: [PATCH 2904/3206] tools/power turbostat: Fix incorrect sorting of PMT telemetry The pmt_telemdir_sort() comparison function was returning a boolean value (0 or 1) instead of the required negative, zero, or positive value for proper sorting. 
This caused unpredictable and incorrect ordering of telemetry directories named telem0, telem1, ..., telemN. Update the comparison logic to return -1, 0, or 1 based on the numerical value extracted from the directory name, ensuring correct numerical ordering when using scandir. This change improves stability and correctness when iterating PMT telemetry directories. Signed-off-by: Kaushlendra Kumar Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5230e072e414c6..b391196c3e90fb 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1649,7 +1649,7 @@ int pmt_telemdir_sort(const struct dirent **a, const struct dirent **b) sscanf((*a)->d_name, "telem%u", &aidx); sscanf((*b)->d_name, "telem%u", &bidx); - return aidx >= bidx; + return (aidx > bidx) ? 1 : (aidx < bidx) ? -1 : 0; } const struct dirent *pmt_diriter_next(struct pmt_diriter_t *iter) From 62127655b7ab7b8c2997041aca48a81bf5c6da0c Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Wed, 13 Aug 2025 12:32:08 +0530 Subject: [PATCH 2905/3206] tools/power x86_energy_perf_policy: Fix incorrect fopen mode usage The fopen_or_die() function was previously hardcoded to open files in read-only mode ("r"), ignoring the mode parameter passed to it. This patch corrects fopen_or_die() to use the provided mode argument, allowing for flexible file access as intended. Additionally, the call to fopen_or_die() in err_on_hypervisor() incorrectly used the mode "ro", which is not a valid fopen mode. This is fixed to use the correct "r" mode. Signed-off-by: Kaushlendra Kumar Signed-off-by: Len Brown --- .../power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index ebda9c366b2ba3..c883f211dbcc9b 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -630,7 +630,7 @@ void cmdline(int argc, char **argv) */ FILE *fopen_or_die(const char *path, const char *mode) { - FILE *filep = fopen(path, "r"); + FILE *filep = fopen(path, mode); if (!filep) err(1, "%s: open failed", path); @@ -644,7 +644,7 @@ void err_on_hypervisor(void) char *buffer; /* On VMs /proc/cpuinfo contains a "flags" entry for hypervisor */ - cpuinfo = fopen_or_die("/proc/cpuinfo", "ro"); + cpuinfo = fopen_or_die("/proc/cpuinfo", "r"); buffer = malloc(4096); if (!buffer) { From b6b42a6051b203bb1d78bf6518007e5dc8b62fc4 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 19 Sep 2025 13:46:55 -0400 Subject: [PATCH 2906/3206] tools/power x86_energy_perf_policy: Enhance HWP enabled check Verify that all CPUs have HWP enabled, not just cpu0. Signed-off-by: Len Brown --- .../x86_energy_perf_policy.c | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index c883f211dbcc9b..2bf0f878441edb 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -4,7 +4,7 @@ * policy preference bias on recent X86 processors. 
*/ /* - * Copyright (c) 2010 - 2017 Intel Corporation. + * Copyright (c) 2010 - 2025 Intel Corporation. * Len Brown */ @@ -517,7 +517,7 @@ void for_packages(unsigned long long pkg_set, int (func)(int)) void print_version(void) { - printf("x86_energy_perf_policy 17.05.11 (C) Len Brown \n"); + printf("x86_energy_perf_policy 2025.9.19 Len Brown \n"); } void cmdline(int argc, char **argv) @@ -1312,6 +1312,17 @@ void for_all_cpus_in_set(size_t set_size, cpu_set_t *cpu_set, int (func)(int)) if (CPU_ISSET_S(cpu_num, set_size, cpu_set)) func(cpu_num); } +int for_all_cpus_in_set_and(size_t set_size, cpu_set_t *cpu_set, int (func)(int)) +{ + int cpu_num; + int retval = 1; + + for (cpu_num = 0; cpu_num <= max_cpu_num; ++cpu_num) + if (CPU_ISSET_S(cpu_num, set_size, cpu_set)) + retval &= func(cpu_num); + + return retval; +} void init_data_structures(void) { @@ -1326,21 +1337,38 @@ void init_data_structures(void) for_all_proc_cpus(mark_cpu_present); } -/* clear has_hwp if it is not enable (or being enabled) */ +int is_hwp_enabled_on_cpu(int cpu_num) +{ + unsigned long long msr; + int retval; + + /* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */ + get_msr(cpu_num, MSR_PM_ENABLE, &msr); + retval = (msr & 1); + if (verbose) + fprintf(stderr, "cpu%d: %sHWP\n", cpu_num, retval ? "" : "No-"); + + return retval; +} + +/* + * verify_hwp_is_enabled() + * + * Set (has_hwp=0) if no HWP feature or any of selected CPU set does not have HWP enabled + */ void verify_hwp_is_enabled(void) { - unsigned long long msr; + int retval; if (!has_hwp) /* set in early_cpuid() */ return; - /* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */ - get_msr(base_cpu, MSR_PM_ENABLE, &msr); - if ((msr & 1) == 0) { + retval = for_all_cpus_in_set_and(cpu_setsize, cpu_selected_set, is_hwp_enabled_on_cpu); + + if (retval == 0) { fprintf(stderr, "HWP can be enabled using '--hwp-enable'\n"); has_hwp = 0; - return; } } From c97c057d357c4b39b153e9e430bbf8976e05bd4e Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 19 Sep 2025 14:07:02 -0400 Subject: [PATCH 2907/3206] tools/power x86_energy_perf_policy: Enhance HWP enable On enabling HWP, preserve the reserved bits in MSR_PM_ENABLE. Also, skip writing the MSR_PM_ENABLE if HWP is already enabled. 
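The read-modify-write idea behind the fix, as a hedged userspace sketch
(MSR_PM_ENABLE is assumed to be IA32_PM_ENABLE at offset 0x770, and the
msr driver is assumed to be loaded):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_PM_ENABLE	0x770	/* IA32_PM_ENABLE (assumed offset) */

/* Illustrative sketch: enable HWP on one CPU while preserving the
 * reserved bits, and skip the write if bit 0 is already set. */
static int enable_hwp_sketch(int cpu)
{
	char path[64];
	uint64_t msr;
	int fd;

	snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
	fd = open(path, O_RDWR);
	if (fd < 0)
		return -1;
	if (pread(fd, &msr, sizeof(msr), MSR_PM_ENABLE) != sizeof(msr))
		goto fail;
	if (!(msr & 1)) {		/* only touch the MSR when needed */
		msr |= 1;		/* set HWP-enable, keep reserved bits */
		if (pwrite(fd, &msr, sizeof(msr), MSR_PM_ENABLE) != sizeof(msr))
			goto fail;
	}
	close(fd);
	return 0;
fail:
	close(fd);
	return -1;
}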
Signed-off-by: Len Brown --- .../x86_energy_perf_policy/x86_energy_perf_policy.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index 2bf0f878441edb..70d7cdf0b07fcc 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -1166,13 +1166,18 @@ int update_hwp_request_pkg(int pkg) int enable_hwp_on_cpu(int cpu) { - unsigned long long msr; + unsigned long long old_msr, new_msr; + + get_msr(cpu, MSR_PM_ENABLE, &old_msr); + + if (old_msr & 1) + return 0; /* already enabled */ - get_msr(cpu, MSR_PM_ENABLE, &msr); - put_msr(cpu, MSR_PM_ENABLE, 1); + new_msr = old_msr | 1; + put_msr(cpu, MSR_PM_ENABLE, new_msr); if (verbose) - printf("cpu%d: MSR_PM_ENABLE old: %d new: %d\n", cpu, (unsigned int) msr, 1); + printf("cpu%d: MSR_PM_ENABLE old: %llX new: %llX\n", cpu, old_msr, new_msr); return 0; } From 8ef8fa829f8ad313f18d694c15874f85d693215d Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 19 Sep 2025 15:05:48 -0400 Subject: [PATCH 2908/3206] tools/power x86_energy_perf_policy: Prepare for MSR/sysfs refactoring Rename routines to make "_msr" and "_sysfs" access methods clear No functional change Signed-off-by: Len Brown --- .../x86_energy_perf_policy.c | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index 70d7cdf0b07fcc..ce6fb808010513 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -809,7 +809,7 @@ void print_hwp_request_pkg(int pkg, struct msr_hwp_request *h, char *str) h->hwp_min, h->hwp_max, h->hwp_desired, h->hwp_epp, h->hwp_window, h->hwp_window & 0x7F, (h->hwp_window >> 7) & 0x7); } -void read_hwp_request(int cpu, struct msr_hwp_request *hwp_req, unsigned int msr_offset) +void read_hwp_request_msr(int cpu, struct msr_hwp_request *hwp_req, unsigned int msr_offset) { unsigned long long msr; @@ -823,7 +823,7 @@ void read_hwp_request(int cpu, struct msr_hwp_request *hwp_req, unsigned int msr hwp_req->hwp_use_pkg = (((msr) >> 42) & 0x1); } -void write_hwp_request(int cpu, struct msr_hwp_request *hwp_req, unsigned int msr_offset) +void write_hwp_request_msr(int cpu, struct msr_hwp_request *hwp_req, unsigned int msr_offset) { unsigned long long msr = 0; @@ -843,7 +843,7 @@ void write_hwp_request(int cpu, struct msr_hwp_request *hwp_req, unsigned int ms put_msr(cpu, msr_offset, msr); } -static int get_epb(int cpu) +static int get_epb_sysfs(int cpu) { char path[SYSFS_PATH_MAX]; char linebuf[3]; @@ -865,7 +865,7 @@ static int get_epb(int cpu) return (int)val; } -static int set_epb(int cpu, int val) +static int set_epb_sysfs(int cpu, int val) { char path[SYSFS_PATH_MAX]; char linebuf[3]; @@ -895,14 +895,14 @@ int print_cpu_msrs(int cpu) struct msr_hwp_cap cap; int epb; - epb = get_epb(cpu); + epb = get_epb_sysfs(cpu); if (epb >= 0) printf("cpu%d: EPB %u\n", cpu, (unsigned int) epb); if (!has_hwp) return 0; - read_hwp_request(cpu, &req, MSR_HWP_REQUEST); + read_hwp_request_msr(cpu, &req, MSR_HWP_REQUEST); print_hwp_request(cpu, &req, ""); read_hwp_cap(cpu, &cap, MSR_HWP_CAPABILITIES); @@ -919,7 +919,7 @@ int print_pkg_msrs(int pkg) if (!has_hwp) return 0; - 
read_hwp_request(first_cpu_in_pkg[pkg], &req, MSR_HWP_REQUEST_PKG); + read_hwp_request_msr(first_cpu_in_pkg[pkg], &req, MSR_HWP_REQUEST_PKG); print_hwp_request_pkg(pkg, &req, ""); if (has_hwp_notify) { @@ -1074,14 +1074,14 @@ int check_hwp_request_v_hwp_capabilities(int cpu, struct msr_hwp_request *req, s return 0; } -int update_hwp_request(int cpu) +int update_hwp_request_msr(int cpu) { struct msr_hwp_request req; struct msr_hwp_cap cap; int msr_offset = MSR_HWP_REQUEST; - read_hwp_request(cpu, &req, msr_offset); + read_hwp_request_msr(cpu, &req, msr_offset); if (debug) print_hwp_request(cpu, &req, "old: "); @@ -1111,15 +1111,15 @@ int update_hwp_request(int cpu) verify_hwp_req_self_consistency(cpu, &req); - write_hwp_request(cpu, &req, msr_offset); + write_hwp_request_msr(cpu, &req, msr_offset); if (debug) { - read_hwp_request(cpu, &req, msr_offset); + read_hwp_request_msr(cpu, &req, msr_offset); print_hwp_request(cpu, &req, "new: "); } return 0; } -int update_hwp_request_pkg(int pkg) +int update_hwp_request_pkg_msr(int pkg) { struct msr_hwp_request req; struct msr_hwp_cap cap; @@ -1127,7 +1127,7 @@ int update_hwp_request_pkg(int pkg) int msr_offset = MSR_HWP_REQUEST_PKG; - read_hwp_request(cpu, &req, msr_offset); + read_hwp_request_msr(cpu, &req, msr_offset); if (debug) print_hwp_request_pkg(pkg, &req, "old: "); @@ -1155,10 +1155,10 @@ int update_hwp_request_pkg(int pkg) verify_hwp_req_self_consistency(cpu, &req); - write_hwp_request(cpu, &req, msr_offset); + write_hwp_request_msr(cpu, &req, msr_offset); if (debug) { - read_hwp_request(cpu, &req, msr_offset); + read_hwp_request_msr(cpu, &req, msr_offset); print_hwp_request_pkg(pkg, &req, "new: "); } return 0; @@ -1188,8 +1188,8 @@ int update_cpu_msrs(int cpu) int epb; if (update_epb) { - epb = get_epb(cpu); - set_epb(cpu, new_epb); + epb = get_epb_sysfs(cpu); + set_epb_sysfs(cpu, new_epb); if (verbose) printf("cpu%d: ENERGY_PERF_BIAS old: %d new: %d\n", @@ -1229,7 +1229,7 @@ int update_cpu_msrs(int cpu) if (!hwp_update_enabled()) return 0; - update_hwp_request(cpu); + update_hwp_request_msr(cpu); return 0; } @@ -1587,7 +1587,7 @@ int main(int argc, char **argv) for_all_cpus_in_set(cpu_setsize, cpu_selected_set, update_sysfs); for_all_cpus_in_set(cpu_setsize, cpu_selected_set, update_cpu_msrs); } else if (pkg_selected_set) - for_packages(pkg_selected_set, update_hwp_request_pkg); + for_packages(pkg_selected_set, update_hwp_request_pkg_msr); return 0; } From f8241f5426eb9feb46a007341edd221ceaf73073 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 19 Sep 2025 15:35:07 -0400 Subject: [PATCH 2909/3206] tools/power x86_energy_perf_policy: EPB access is only via sysfs Comprehend that EPB writes are now only via sysfs by moving it out of the _msr specific path. No functional change. 
Signed-off-by: Len Brown --- .../x86_energy_perf_policy.c | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index ce6fb808010513..fe10057bf9b774 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -1182,19 +1182,23 @@ int enable_hwp_on_cpu(int cpu) return 0; } -int update_cpu_msrs(int cpu) +int update_cpu_epb_sysfs(int cpu) { - unsigned long long msr; int epb; - if (update_epb) { - epb = get_epb_sysfs(cpu); - set_epb_sysfs(cpu, new_epb); + epb = get_epb_sysfs(cpu); + set_epb_sysfs(cpu, new_epb); - if (verbose) - printf("cpu%d: ENERGY_PERF_BIAS old: %d new: %d\n", - cpu, epb, (unsigned int) new_epb); - } + if (verbose) + printf("cpu%d: ENERGY_PERF_BIAS old: %d new: %d\n", + cpu, epb, (unsigned int) new_epb); + + return 0; +} + +int update_cpu_msrs(int cpu) +{ + unsigned long long msr; if (update_turbo) { int turbo_is_present_and_disabled; @@ -1584,8 +1588,11 @@ int main(int argc, char **argv) /* update CPU set */ if (cpu_selected_set) { + if (update_epb) + for_all_cpus_in_set(cpu_setsize, cpu_selected_set, update_cpu_epb_sysfs); for_all_cpus_in_set(cpu_setsize, cpu_selected_set, update_sysfs); for_all_cpus_in_set(cpu_setsize, cpu_selected_set, update_cpu_msrs); + } else if (pkg_selected_set) for_packages(pkg_selected_set, update_hwp_request_pkg_msr); From 2734fdbc9bb8a3aeb309ba0d62212d7f53f30bc7 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 19 Sep 2025 15:56:46 -0400 Subject: [PATCH 2910/3206] tools/power x86_energy_perf_policy: Prefer driver HWP limits When we are successful in using cpufreq min/max limits, skip setting the raw MSR limits entirely. This is necessary to avoid undoing any modification that the cpufreq driver makes to our sysfs request. eg. intel_pstate may take our request for a limit that is valid according to HWP.CAP.MIN/MAX and clip it to be within the range available in PLATFORM_INFO. Signed-off-by: Len Brown --- .../x86_energy_perf_policy/x86_energy_perf_policy.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index fe10057bf9b774..884a4c746f32e8 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -62,6 +62,7 @@ unsigned char turbo_update_value; unsigned char update_hwp_epp; unsigned char update_hwp_min; unsigned char update_hwp_max; +unsigned char hwp_limits_done_via_sysfs; unsigned char update_hwp_desired; unsigned char update_hwp_window; unsigned char update_hwp_use_pkg; @@ -951,8 +952,10 @@ int ratio_2_sysfs_khz(int ratio) } /* * If HWP is enabled and cpufreq sysfs attribtes are present, - * then update sysfs, so that it will not become - * stale when we write to MSRs. + * then update via sysfs. The intel_pstate driver may modify (clip) + * this request, say, when HWP_CAP is outside of PLATFORM_INFO limits, + * and the driver-chosen value takes precidence. + * * (intel_pstate's max_perf_pct and min_perf_pct will follow cpufreq, * so we don't have to touch that.) 
*/ @@ -1007,6 +1010,8 @@ int update_sysfs(int cpu) if (update_hwp_max) update_cpufreq_scaling_freq(1, cpu, req_update.hwp_max); + hwp_limits_done_via_sysfs = 1; + return 0; } @@ -1085,10 +1090,10 @@ int update_hwp_request_msr(int cpu) if (debug) print_hwp_request(cpu, &req, "old: "); - if (update_hwp_min) + if (update_hwp_min && !hwp_limits_done_via_sysfs) req.hwp_min = req_update.hwp_min; - if (update_hwp_max) + if (update_hwp_max && !hwp_limits_done_via_sysfs) req.hwp_max = req_update.hwp_max; if (update_hwp_desired) From a648e0892ccd8c2168c2deae7e12e18a8d596730 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 19 Sep 2025 22:30:18 -0400 Subject: [PATCH 2911/3206] tools/power x86_energy_perf_policy: Add make snapshot target $ make snapshot creates x86_energy_perf_policy-$(DATE).tar.gz Which can be transported to a target machine without needing a kernel tree to build on the target. Useful for creating debug versions. Signed-off-by: Len Brown --- .../power/x86/x86_energy_perf_policy/Makefile | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/x86_energy_perf_policy/Makefile b/tools/power/x86/x86_energy_perf_policy/Makefile index 666b325a62a229..d182846674008e 100644 --- a/tools/power/x86/x86_energy_perf_policy/Makefile +++ b/tools/power/x86/x86_energy_perf_policy/Makefile @@ -1,8 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 CC = $(CROSS_COMPILE)gcc -BUILD_OUTPUT := $(CURDIR) +BUILD_OUTPUT := $(CURDIR) PREFIX := /usr DESTDIR := +DAY := $(shell date +%Y.%m.%d) +SNAPSHOT = x86_energy_perf_policy-$(DAY) + + ifeq ("$(origin O)", "command line") BUILD_OUTPUT := $(O) @@ -27,3 +31,26 @@ install : x86_energy_perf_policy install -d $(DESTDIR)$(PREFIX)/share/man/man8 install -m 644 x86_energy_perf_policy.8 $(DESTDIR)$(PREFIX)/share/man/man8 +snapshot: x86_energy_perf_policy + @rm -rf $(SNAPSHOT) + @mkdir $(SNAPSHOT) + @cp x86_energy_perf_policy Makefile x86_energy_perf_policy.c x86_energy_perf_policy.8 $(SNAPSHOT) + + @sed -e 's/^#include /#include "bits.h"/' -e 's/u64/unsigned long long/' ../../../../arch/x86/include/asm/msr-index.h > $(SNAPSHOT)/msr-index.h + @echo '#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))' >> $(SNAPSHOT)/msr-index.h + @echo "#define BIT(x) (1 << (x))" > $(SNAPSHOT)/bits.h + @echo "#define BIT_ULL(nr) (1ULL << (nr))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (sizeof(long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + + @echo '#define BUILD_BUG_ON(cond) do { enum { compile_time_check ## __COUNTER__ = 1/(!(cond)) }; } while (0)' > $(SNAPSHOT)/build_bug.h + @echo '#define __must_be_array(arr) 0' >> $(SNAPSHOT)/build_bug.h + + @echo PWD=. > $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DBUILD_BUG_HEADER='\"build_bug.h\"'" >> $(SNAPSHOT)/Makefile + @sed -e's/.*MSRHEADER.*//' Makefile >> $(SNAPSHOT)/Makefile + + @rm -f $(SNAPSHOT).tar.gz + tar cvzf $(SNAPSHOT).tar.gz $(SNAPSHOT) + From 66f430522452fe1a8a0fd2198cf9f335125acbfc Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 24 Sep 2025 11:50:29 -0400 Subject: [PATCH 2912/3206] tools/power x86_energy_perf_policy.8: Emphasize preference for SW interfaces This tool was originally written when Linux had no standard interface for EPB, or HWP support. 
Retain the capability to manage a system w/o any kernel PM support, but prefer the standard kernel interfaces, when avaialble. (not doing so led to a confusing conflict between a p-state limit request made via cpufreq and modified by the intel-pstate driver, versus the raw MSR write made by this utility) Signed-off-by: Len Brown --- .../x86_energy_perf_policy.8 | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.8 b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.8 index 78c6361898b189..0aa981c18e56f7 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.8 +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.8 @@ -2,7 +2,7 @@ .\" Distributed under the GPL, Copyleft 1994. .TH X86_ENERGY_PERF_POLICY 8 .SH NAME -x86_energy_perf_policy \- Manage Energy vs. Performance Policy via x86 Model Specific Registers +x86_energy_perf_policy \- Manage Energy vs. Performance Policy .SH SYNOPSIS .B x86_energy_perf_policy .RB "[ options ] [ scope ] [field \ value]" @@ -19,9 +19,14 @@ x86_energy_perf_policy \- Manage Energy vs. Performance Policy via x86 Model Spe .SH DESCRIPTION \fBx86_energy_perf_policy\fP displays and updates energy-performance policy settings specific to -Intel Architecture Processors. Settings are accessed via Model Specific Register (MSR) -updates, no matter if the Linux cpufreq sub-system is enabled or not. +Intel Architecture Processors. It summarizes settings available +in standard Linux interfaces (eg. cpufreq), +and also decodes underlying Model Specific Register (MSRs). +While \fBx86_energy_perf_policy\fP can manage energy-performance policy +using only MSR access, it prefers standard +Linux kernel interfaces, when they are available. +.SH BACKGROUND Policy in MSR_IA32_ENERGY_PERF_BIAS (EPB) may affect a wide range of hardware decisions, such as how aggressively the hardware enters and exits CPU idle states (C-states) @@ -200,7 +205,9 @@ runs only as root. .SH FILES .ta .nf -/dev/cpu/*/msr +EPB: /sys/devices/system/cpu/cpu*/power/energy_perf_bias +EPP: /sys/devices/system/cpu/cpu*/cpufreq/energy_performance_preference +MSR: /dev/cpu/*/msr .fi .SH "SEE ALSO" .nf From 70e633bedeeb4a7290d3b1dd9d49cc2bae25a46f Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 24 Jul 2025 13:22:10 +0900 Subject: [PATCH 2913/3206] i2c: designware: Fix clock issue when PM is disabled When the driver is removed, the clocks are first enabled by calling pm_runtime_get_sync(), and then disabled with pm_runtime_put_sync(). If CONFIG_PM=y, clocks for this controller are disabled when it's in the idle state. So the clocks are properly disabled when the driver exits. Othewise, the clocks are always enabled and the PM functions have no effect. Therefore, the driver exits without disabling the clocks. # cat /sys/kernel/debug/clk/clk-pclk/clk_enable_count 18 # echo 1214a000.i2c > /sys/bus/platform/drivers/i2c_designware/bind # cat /sys/kernel/debug/clk/clk-pclk/clk_enable_count 20 # echo 1214a000.i2c > /sys/bus/platform/drivers/i2c_designware/unbind # cat /sys/kernel/debug/clk/clk-pclk/clk_enable_count 20 To ensure that the clocks can be disabled correctly even without CONFIG_PM=y, should add the following fixes: - Replace with pm_runtime_put_noidle(), which only decrements the runtime PM usage count. - Call i2c_dw_prepare_clk(false) to explicitly disable the clocks. 
Fixes: 7272194ed391f ("i2c-designware: add minimal support for runtime PM") Co-developed-by: Kohei Ito Signed-off-by: Kohei Ito Signed-off-by: Kunihiko Hayashi Tested-by: Jarkko Nikula Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index be2f9533cef94e..3223da70180699 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -333,9 +333,11 @@ static void dw_i2c_plat_remove(struct platform_device *pdev) i2c_dw_disable(dev); pm_runtime_dont_use_autosuspend(device); - pm_runtime_put_sync(device); + pm_runtime_put_noidle(device); dw_i2c_plat_pm_cleanup(dev); + i2c_dw_prepare_clk(dev, false); + i2c_dw_remove_lock_support(dev); reset_control_assert(dev->rst); From c149841b069ccc6e480b00e11f35a57b5d88c7bb Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 24 Jul 2025 13:22:11 +0900 Subject: [PATCH 2914/3206] i2c: designware: Add disabling clocks when probe fails After an error occurs during probing state, dw_i2c_plat_pm_cleanup() is called. However, this function doesn't disable clocks and the clock-enable count keeps increasing. Should disable these clocks explicitly. Fixes: 7272194ed391f ("i2c-designware: add minimal support for runtime PM") Co-developed-by: Kohei Ito Signed-off-by: Kohei Ito Signed-off-by: Kunihiko Hayashi Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 3223da70180699..34d881572351cc 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -316,6 +316,7 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) exit_probe: dw_i2c_plat_pm_cleanup(dev); + i2c_dw_prepare_clk(dev, false); exit_reset: reset_control_assert(dev->rst); return ret; From 4d428dca252c858bfac691c31fa95d26cd008706 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 25 Sep 2025 14:08:20 +0100 Subject: [PATCH 2915/3206] netfs: fix reference leak Commit 20d72b00ca81 ("netfs: Fix the request's work item to not require a ref") modified netfs_alloc_request() to initialize the reference counter to 2 instead of 1. The rationale was that the requet's "work" would release the second reference after completion (via netfs_{read,write}_collection_worker()). That works most of the time if all goes well. However, it leaks this additional reference if the request is released before the I/O operation has been submitted: the error code path only decrements the reference counter once and the work item will never be queued because there will never be a completion. This has caused outages of our whole server cluster today because tasks were blocked in netfs_wait_for_outstanding_io(), leading to deadlocks in Ceph (another bug that I will address soon in another patch). This was caused by a netfs_pgpriv2_begin_copy_to_cache() call which failed in fscache_begin_write_operation(). The leaked netfs_io_request was never completed, leaving `netfs_inode.io_count` with a positive value forever. All of this is super-fragile code. 
Finding out which code paths will lead to an eventual completion and which do not is hard to see: - Some functions like netfs_create_write_req() allocate a request, but will never submit any I/O. - netfs_unbuffered_read_iter_locked() calls netfs_unbuffered_read() and then netfs_put_request(); however, netfs_unbuffered_read() can also fail early before submitting the I/O request, therefore another netfs_put_request() call must be added there. A rule of thumb is that functions that return a `netfs_io_request` do not submit I/O, and all of their callers must be checked. For my taste, the whole netfs code needs an overhaul to make reference counting easier to understand and less fragile & obscure. But to fix this bug here and now and produce a patch that is adequate for a stable backport, I tried a minimal approach that quickly frees the request object upon early failure. I decided against adding a second netfs_put_request() each time because that would cause code duplication which obscures the code further. Instead, I added the function netfs_put_failed_request() which frees such a failed request synchronously under the assumption that the reference count is exactly 2 (as initially set by netfs_alloc_request() and never touched), verified by a WARN_ON_ONCE(). It then deinitializes the request object (without going through the "cleanup_work" indirection) and frees the allocation (with RCU protection to protect against concurrent access by netfs_requests_seq_start()). All code paths that fail early have been changed to call netfs_put_failed_request() instead of netfs_put_request(). Additionally, I have added a netfs_put_request() call to netfs_unbuffered_read() as explained above because the netfs_put_failed_request() approach does not work there. Fixes: 20d72b00ca81 ("netfs: Fix the request's work item to not require a ref") Signed-off-by: Max Kellermann Signed-off-by: David Howells cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 10 +++++----- fs/netfs/direct_read.c | 7 ++++++- fs/netfs/direct_write.c | 6 +++++- fs/netfs/internal.h | 1 + fs/netfs/objects.c | 30 +++++++++++++++++++++++++++--- fs/netfs/read_pgpriv2.c | 2 +- fs/netfs/read_single.c | 2 +- fs/netfs/write_issue.c | 3 +-- 8 files changed, 47 insertions(+), 14 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 18b3dc74c70e41..37ab6f28b5ad0e 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -369,7 +369,7 @@ void netfs_readahead(struct readahead_control *ractl) return netfs_put_request(rreq, netfs_rreq_trace_put_return); cleanup_free: - return netfs_put_request(rreq, netfs_rreq_trace_put_failed); + return netfs_put_failed_request(rreq); } EXPORT_SYMBOL(netfs_readahead); @@ -472,7 +472,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); alloc_error: folio_unlock(folio); return ret; @@ -532,7 +532,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) return ret < 0 ? 
ret : 0; discard: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); alloc_error: folio_unlock(folio); return ret; @@ -699,7 +699,7 @@ int netfs_write_begin(struct netfs_inode *ctx, return 0; error_put: - netfs_put_request(rreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(rreq); error: if (folio) { folio_unlock(folio); @@ -754,7 +754,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, return ret < 0 ? ret : 0; error_put: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); error: _leave(" = %d", ret); return ret; diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index a05e13472bafb2..a498ee8d66745f 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -131,6 +131,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) if (rreq->len == 0) { pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); return -EIO; } @@ -205,7 +206,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i if (user_backed_iter(iter)) { ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0); if (ret < 0) - goto out; + goto error_put; rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec; rreq->direct_bv_count = ret; rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); @@ -238,6 +239,10 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i if (ret > 0) orig_count -= ret; return ret; + +error_put: + netfs_put_failed_request(rreq); + return ret; } EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index a16660ab7f8385..a9d1c3b2c08426 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -57,7 +57,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0); if (n < 0) { ret = n; - goto out; + goto error_put; } wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec; wreq->direct_bv_count = n; @@ -101,6 +101,10 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * out: netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; + +error_put: + netfs_put_failed_request(wreq); + return ret; } EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index d4f16fefd96518..4319611f535449 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -87,6 +87,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); void netfs_clear_subrequests(struct netfs_io_request *rreq); void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); +void netfs_put_failed_request(struct netfs_io_request *rreq); struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq); static inline void netfs_see_request(struct netfs_io_request *rreq, diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e8c99738b5bbf2..40a1c7d6f6e03e 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -116,10 +116,8 @@ static void netfs_free_request_rcu(struct rcu_head *rcu) netfs_stat_d(&netfs_n_rh_rreq); } -static void netfs_free_request(struct work_struct *work) +static void netfs_deinit_request(struct netfs_io_request *rreq) { - struct netfs_io_request 
*rreq = - container_of(work, struct netfs_io_request, cleanup_work); struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; @@ -149,6 +147,14 @@ static void netfs_free_request(struct work_struct *work) if (atomic_dec_and_test(&ictx->io_count)) wake_up_var(&ictx->io_count); +} + +static void netfs_free_request(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, cleanup_work); + + netfs_deinit_request(rreq); call_rcu(&rreq->rcu, netfs_free_request_rcu); } @@ -167,6 +173,24 @@ void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace } } +/* + * Free a request (synchronously) that was just allocated but has + * failed before it could be submitted. + */ +void netfs_put_failed_request(struct netfs_io_request *rreq) +{ + int r = refcount_read(&rreq->ref); + + /* new requests have two references (see + * netfs_alloc_request(), and this function is only allowed on + * new request objects + */ + WARN_ON_ONCE(r != 2); + + trace_netfs_rreq_ref(rreq->debug_id, r, netfs_rreq_trace_put_failed); + netfs_free_request(&rreq->cleanup_work); +} + /* * Allocate and partially initialise an I/O request structure. */ diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index 8097bc069c1de6..a1489aa29f782a 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -118,7 +118,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache( return creq; cancel_put: - netfs_put_request(creq, netfs_rreq_trace_put_return); + netfs_put_failed_request(creq); cancel: rreq->copy_to_cache = ERR_PTR(-ENOBUFS); clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index fa622a6cd56da3..5c0dc4efc79227 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -189,7 +189,7 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite return ret; cleanup_free: - netfs_put_request(rreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(rreq); return ret; } EXPORT_SYMBOL(netfs_read_single); diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 0584cba1a04392..dd8743bc8d7fe3 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -133,8 +133,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, return wreq; nomem: - wreq->error = -ENOMEM; - netfs_put_request(wreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(wreq); return ERR_PTR(-ENOMEM); } From e8c84e2082e69335f66c8ade4895e80ec270d7c4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Sep 2025 17:03:51 +0200 Subject: [PATCH 2916/3206] statmount: don't call path_put() under namespace semaphore Massage statmount() and make sure we don't call path_put() under the namespace semaphore. If we put the last reference we're fscked. 
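The general shape of the fix, as an illustrative sketch (the names below
are placeholders, not the fs/namespace.c code): the reference acquired
under the semaphore is stashed in caller-visible state, and the final
path_put(), which may tear down mounts and dentries, runs only after the
lock is released.

#include <linux/path.h>
#include <linux/rwsem.h>

static DECLARE_RWSEM(example_sem);	/* stands in for namespace_sem */

struct example_state {
	struct path root;	/* reference pinned under the lock */
};

/* Take and use the reference with the semaphore held... */
static void example_locked(struct example_state *s)
{
	down_read(&example_sem);
	/* ... fill s->root and walk mounts here ... */
	up_read(&example_sem);
}

/* ... but drop what may be the last reference only after unlocking. */
static void example(struct example_state *s)
{
	example_locked(s);
	path_put(&s->root);
}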
Fixes: 46eae99ef733 ("add statmount(2) syscall") Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Christian Brauner --- fs/namespace.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ba3e5215b6362d..993983a7685361 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5708,7 +5708,6 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { - struct path root __free(path_put) = {}; struct mount *m; int err; @@ -5720,7 +5719,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!s->mnt) return -ENOENT; - err = grab_requested_root(ns, &root); + err = grab_requested_root(ns, &s->root); if (err) return err; @@ -5729,7 +5728,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, * mounts to show users. */ m = real_mount(s->mnt); - if (!is_path_reachable(m, m->mnt.mnt_root, &root) && + if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -5737,8 +5736,6 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (err) return err; - s->root = root; - /* * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap * can change concurrently as we only hold the read-side of the @@ -5960,6 +5957,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, if (!ret) ret = copy_statmount_to_user(ks); kvfree(ks->seq.buf); + path_put(&ks->root); if (retry_statmount(ret, &seq_size)) goto retry; return ret; From c1f86d0ac322c7e77f6f8dbd216c65d39358ffc0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Sep 2025 17:33:47 +0200 Subject: [PATCH 2917/3206] listmount: don't call path_put() under namespace semaphore Massage listmount() and make sure we don't call path_put() under the namespace semaphore. If we put the last reference we're fscked. Fixes: b4c2bea8ceaa ("add listmount(2) syscall") Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Christian Brauner --- fs/namespace.c | 87 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 28 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 993983a7685361..6686c9f54b4005 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5963,23 +5963,34 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, return ret; } -static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, - u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, - bool reverse) +struct klistmount { + u64 last_mnt_id; + u64 mnt_parent_id; + u64 *kmnt_ids; + u32 nr_mnt_ids; + struct mnt_namespace *ns; + struct path root; +}; + +static ssize_t do_listmount(struct klistmount *kls, bool reverse) { - struct path root __free(path_put) = {}; + struct mnt_namespace *ns = kls->ns; + u64 mnt_parent_id = kls->mnt_parent_id; + u64 last_mnt_id = kls->last_mnt_id; + u64 *mnt_ids = kls->kmnt_ids; + size_t nr_mnt_ids = kls->nr_mnt_ids; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); - ret = grab_requested_root(ns, &root); + ret = grab_requested_root(ns, &kls->root); if (ret) return ret; if (mnt_parent_id == LSMT_ROOT) { - orig = root; + orig = kls->root; } else { orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) @@ -5991,7 +6002,7 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, * Don't trigger audit denials. 
We just want to determine what * mounts to show users. */ - if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) && + if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -6024,14 +6035,45 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, return ret; } +static void __free_klistmount_free(const struct klistmount *kls) +{ + path_put(&kls->root); + kvfree(kls->kmnt_ids); + mnt_ns_release(kls->ns); +} + +static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, + size_t nr_mnt_ids) +{ + + u64 last_mnt_id = kreq->param; + + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ + if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) + return -EINVAL; + + kls->last_mnt_id = last_mnt_id; + + kls->nr_mnt_ids = nr_mnt_ids; + kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids), + GFP_KERNEL_ACCOUNT); + if (!kls->kmnt_ids) + return -ENOMEM; + + kls->ns = grab_requested_mnt_ns(kreq); + if (!kls->ns) + return -ENOENT; + + kls->mnt_parent_id = kreq->mnt_id; + return 0; +} + SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { - u64 *kmnt_ids __free(kvfree) = NULL; + struct klistmount kls __free(klistmount_free) = {}; const size_t maxcount = 1000000; - struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct mnt_id_req kreq; - u64 last_mnt_id; ssize_t ret; if (flags & ~LISTMOUNT_REVERSE) @@ -6052,22 +6094,12 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (ret) return ret; - last_mnt_id = kreq.param; - /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ - if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) - return -EINVAL; - - kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids), - GFP_KERNEL_ACCOUNT); - if (!kmnt_ids) - return -ENOMEM; - - ns = grab_requested_mnt_ns(&kreq); - if (!ns) - return -ENOENT; + ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids); + if (ret) + return ret; - if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && - !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; /* @@ -6075,12 +6107,11 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, * listmount() doesn't care about any mount properties. */ scoped_guard(rwsem_read, &namespace_sem) - ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, - nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); + ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; - if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids))) + if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids))) return -EFAULT; return ret; From 28986dd7e38fb5ba2f180f9eb3ff330798719369 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Mon, 22 Sep 2025 17:00:46 +0530 Subject: [PATCH 2918/3206] fcntl: trim arguments Remove superfluous argument from fcntl_{get/set}_rw_hint. No functional change. 
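The in-kernel signature change has no effect on the userspace interface. For context, a minimal sketch of how the hints are used from userspace; depending on the libc, the F_{GET,SET}_RW_HINT and RWH_* constants may have to come from <linux/fcntl.h>:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		uint64_t hint = RWH_WRITE_LIFE_SHORT;
		int fd = open("testfile", O_RDWR | O_CREAT, 0644);

		if (fd < 0)
			return 1;
		/* The hint argument is a pointer to a 64-bit value. */
		if (fcntl(fd, F_SET_RW_HINT, &hint) < 0)
			perror("F_SET_RW_HINT");
		hint = 0;
		if (fcntl(fd, F_GET_RW_HINT, &hint) == 0)
			printf("write hint: %llu\n", (unsigned long long)hint);
		close(fd);
		return 0;
	}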
Signed-off-by: Kanchan Joshi Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fcntl.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 5598e4d5742299..72f8433d910988 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -355,8 +355,7 @@ static bool rw_hint_valid(u64 hint) } } -static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_get_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; @@ -367,8 +366,7 @@ static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, return 0; } -static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_set_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; @@ -547,10 +545,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: - err = fcntl_get_rw_hint(filp, cmd, arg); + err = fcntl_get_rw_hint(filp, arg); break; case F_SET_RW_HINT: - err = fcntl_set_rw_hint(filp, cmd, arg); + err = fcntl_set_rw_hint(filp, arg); break; default: break; From bc061143637532c08d9fc657eec93fdc2588068e Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 25 Sep 2025 16:39:18 +0100 Subject: [PATCH 2919/3206] gpio: mpfs: fix setting gpio direction to output mpfs_gpio_direction_output() actually sets the line to input mode. Use the correct register settings for output mode so that this function actually works as intended. This was a copy-paste mistake made when converting to regmap during the driver submission process. It went unnoticed because my test for output mode is toggling LEDs on an Icicle kit which functions with the incorrect code. The internal reporter has yet to test the patch, but on their system the incorrect setting may be the reason for failures to drive the GPIO lines on the BeagleV-fire board. CC: stable@vger.kernel.org Fixes: a987b78f3615e ("gpio: mpfs: add polarfire soc gpio support") Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20250925-boogieman-carrot-82989ff75d10@spud Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mpfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-mpfs.c b/drivers/gpio/gpio-mpfs.c index 82d557a7e5d8d5..9468795b96348a 100644 --- a/drivers/gpio/gpio-mpfs.c +++ b/drivers/gpio/gpio-mpfs.c @@ -69,7 +69,7 @@ static int mpfs_gpio_direction_output(struct gpio_chip *gc, unsigned int gpio_in struct mpfs_gpio_chip *mpfs_gpio = gpiochip_get_data(gc); regmap_update_bits(mpfs_gpio->regs, MPFS_GPIO_CTRL(gpio_index), - MPFS_GPIO_DIR_MASK, MPFS_GPIO_EN_IN); + MPFS_GPIO_DIR_MASK, MPFS_GPIO_EN_OUT | MPFS_GPIO_EN_OUT_BUF); regmap_update_bits(mpfs_gpio->regs, mpfs_gpio->offsets->outp, BIT(gpio_index), value << gpio_index); From 79428e60897916401c9ed326f6ada4d7c7c997a3 Mon Sep 17 00:00:00 2001 From: Manaf Meethalavalappu Pallikunhi Date: Sat, 20 Sep 2025 18:06:31 +0530 Subject: [PATCH 2920/3206] dt-bindings: thermal: qcom-tsens: Document the Glymur temperature Sensor Document the Temperature Sensor (TSENS) on Glymur Platform. 
Signed-off-by: Manaf Meethalavalappu Pallikunhi Signed-off-by: Pankaj Patil Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250920123631.281153-1-pankaj.patil@oss.qualcomm.com Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/thermal/qcom-tsens.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index f65dc829574c70..78e2f6573b96f2 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -49,6 +49,7 @@ properties: - description: v2 of TSENS items: - enum: + - qcom,glymur-tsens - qcom,milos-tsens - qcom,msm8953-tsens - qcom,msm8996-tsens From f8b9c819ea20d1101656a91ced843d9e47ba0630 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 26 Sep 2025 07:03:11 +0300 Subject: [PATCH 2921/3206] ASoc: tas2783A: Fix an error code in probe() This code returns the wrong variable "tas_dev->regmap" instead of "regmap", so it returns success instead of a negative error code. Return the correct variable. Fixes: 4cc9bd8d7b32 ("ASoc: tas2783A: Add soundwire based codec driver") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/aNYQf4cyavnku5Nt@stanley.mountain Signed-off-by: Mark Brown --- sound/soc/codecs/tas2783-sdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/tas2783-sdw.c b/sound/soc/codecs/tas2783-sdw.c index c2a7be51bdc601..1fb4227b711e6b 100644 --- a/sound/soc/codecs/tas2783-sdw.c +++ b/sound/soc/codecs/tas2783-sdw.c @@ -1285,7 +1285,7 @@ static s32 tas_sdw_probe(struct sdw_slave *peripheral, &tas_regmap, &tas2783_mbq_cfg); if (IS_ERR(regmap)) - return dev_err_probe(dev, PTR_ERR(tas_dev->regmap), + return dev_err_probe(dev, PTR_ERR(regmap), "Failed devm_regmap_init_sdw."); /* keep in cache until the device is fully initialized */ From bbc3110823eca23b066e75a920bdc8118adda0d2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 26 Sep 2025 07:03:29 +0300 Subject: [PATCH 2922/3206] pmdomain: thead: Fix error pointer vs NULL bug in th1520_pd_reboot_init() The devm_auxiliary_device_create() returns NULL on error. It never returns error pointers. Using PTR_ERR_OR_ZERO() here means the function always returns success. Replace the PTR_ERR_OR_ZERO() call with a NULL check. Fixes: 64581f41f4c4 ("pmdomain: thead: create auxiliary device for rebooting") Signed-off-by: Dan Carpenter Acked-by: Icenowy Zheng Signed-off-by: Ulf Hansson --- drivers/pmdomain/thead/th1520-pm-domains.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pmdomain/thead/th1520-pm-domains.c b/drivers/pmdomain/thead/th1520-pm-domains.c index 5213994101a591..d7cb9633c7c8a3 100644 --- a/drivers/pmdomain/thead/th1520-pm-domains.c +++ b/drivers/pmdomain/thead/th1520-pm-domains.c @@ -179,8 +179,10 @@ static int th1520_pd_reboot_init(struct device *dev, struct auxiliary_device *adev; adev = devm_auxiliary_device_create(dev, "reboot", aon_chan); + if (!adev) + return -ENODEV; - return PTR_ERR_OR_ZERO(adev); + return 0; } static int th1520_pd_probe(struct platform_device *pdev) From fbe2dc6a9c7318f7263f5e4d50f6272b931c5756 Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Tue, 23 Sep 2025 17:16:45 +0900 Subject: [PATCH 2923/3206] smb: client: fix wrong index reference in smb2_compound_op() In smb2_compound_op(), the loop that processes each command's response uses wrong indices when accessing response buffers.
This incorrect indexing leads to improper handling of command results. Also, if the incorrectly computed index is greater than or equal to MAX_COMPOUND, it can cause out-of-bounds accesses. Fixes: 3681c74d342d ("smb: client: handle lack of EA support in smb2_query_path_info()") # 6.14 Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Sang-Heon Jeon Signed-off-by: Steve French --- fs/smb/client/smb2inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index e32a3f33879339..0985db9f86e510 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -687,7 +687,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, } for (i = 0; i < num_cmds; i++) { - char *buf = rsp_iov[i + i].iov_base; + char *buf = rsp_iov[i + 1].iov_base; if (buf && resp_buftype[i + 1] != CIFS_NO_BUFFER) rc = server->ops->map_error(buf, false); From 927f3e85015285ff68789bd00aa66b071016b27f Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 23 Sep 2025 15:06:26 -0700 Subject: [PATCH 2924/3206] PM: runtime: Documentation: ABI: Document time units for *_time Many .../power/... time-related attributes have an "_ms" suffix and also include language in their ABI description to make clear that their time is measured in milliseconds. However, 'runtime_suspended_time' and 'runtime_active_time' have neither, and it takes me a nontrivial amount of time to poke through the source to confirm that they are also measured in milliseconds. Update the doc with "millisecond" units. Signed-off-by: Brian Norris Reviewed-by: Dhruva Gole Link: https://patch.msgid.link/20250923150625.1.If11a14e33d578369db48d678395d0323bdb01915@changeid [ rjw: Subject edits, changelog tweak ] Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-devices-power | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power index e4ec5de9a5dd23..9bf7c8a267c587 100644 --- a/Documentation/ABI/testing/sysfs-devices-power +++ b/Documentation/ABI/testing/sysfs-devices-power @@ -274,15 +274,15 @@ What: /sys/devices/.../power/runtime_active_time Date: Jul 2010 Contact: Arjan van de Ven Description: - Reports the total time that the device has been active. - Used for runtime PM statistics. + Reports the total time that the device has been active, in + milliseconds. Used for runtime PM statistics. What: /sys/devices/.../power/runtime_suspended_time Date: Jul 2010 Contact: Arjan van de Ven Description: - Reports total time that the device has been suspended. - Used for runtime PM statistics. + Reports total time that the device has been suspended, in + milliseconds. Used for runtime PM statistics. What: /sys/devices/.../power/runtime_usage Date: Apr 2010 From bbfe987c5a2854705393ad79813074e5eadcbde6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 13:10:22 +0200 Subject: [PATCH 2925/3206] PM: hibernate: Fix pm_hibernation_mode_is_suspend() build breakage Commit 495c8d35035e ("PM: hibernate: Add pm_hibernation_mode_is_suspend()") that introduced pm_hibernation_mode_is_suspend() did not define it in the case when CONFIG_HIBERNATION is unset, but CONFIG_SUSPEND is set. Subsequent commit 0a6e9e098fcc ("drm/amd: Fix hybrid sleep") made the amdgpu driver use that function, which led to kernel build breakage in the case mentioned above [1].
Address this by using appropriate #ifdeffery around the definition of pm_hibernation_mode_is_suspend(). Fixes: 0a6e9e098fcc ("drm/amd: Fix hybrid sleep") Reported-by: KernelCI bot Closes: https://groups.io/g/kernelci-results/topic/regression_pm_testing/115439919 [1] Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) --- include/linux/suspend.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 0664c685f0b24e..b02876f1ae38ac 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -276,7 +276,6 @@ extern void arch_suspend_enable_irqs(void); extern int pm_suspend(suspend_state_t state); extern bool sync_on_suspend_enabled; -bool pm_hibernation_mode_is_suspend(void); #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL @@ -289,7 +288,6 @@ static inline bool pm_suspend_via_firmware(void) { return false; } static inline bool pm_resume_via_firmware(void) { return false; } static inline bool pm_suspend_no_platform(void) { return false; } static inline bool pm_suspend_default_s2idle(void) { return false; } -static inline bool pm_hibernation_mode_is_suspend(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } @@ -420,6 +418,12 @@ static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) { } #endif /* CONFIG_HIBERNATION */ +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_SUSPEND) +bool pm_hibernation_mode_is_suspend(void); +#else +static inline bool pm_hibernation_mode_is_suspend(void) { return false; } +#endif + int arch_resume_nosmt(void); #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV From c1f86d0ac322c7e77f6f8dbd216c65d39358ffc0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 18:40:25 +0200 Subject: [PATCH 2926/3206] PM: hibernate: Restrict GFP mask in power_down() Commit 12ffc3b1513e ("PM: Restrict swap use to later in the suspend sequence") caused hibernation_platform_enter() to call pm_restore_gfp_mask() via dpm_resume_end(), so when power_down() returns after aborting hibernation_platform_enter(), it needs to match the pm_restore_gfp_mask() call in hibernate() that will occur subsequently. Address this by adding a pm_restrict_gfp_mask() call to the relevant error path in power_down(). Fixes: 12ffc3b1513e ("PM: Restrict swap use to later in the suspend sequence") Cc: 6.16+ # 6.16+ Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) --- kernel/power/hibernate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 5d146218cae86f..728328c51b6499 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -733,6 +733,8 @@ static void power_down(void) case HIBERNATION_PLATFORM: error = hibernation_platform_enter(); if (error == -EAGAIN || error == -EBUSY) { + /* Match pm_restore_gfp_mask() in hibernate(). */ + pm_restrict_gfp_mask(); swsusp_unmark(); events_check_enabled = false; pr_info("Wakeup event detected during hibernation, rolling back.\n"); From 991e555efffde314e94be9c28fc0ea5504195df8 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 26 Sep 2025 09:41:42 -0700 Subject: [PATCH 2927/3206] selftests/bpf: Test changing packet data from kfunc bpf_xdp_pull_data() is the first kfunc that changes packet data. Make sure the verifier clears all packet pointers after calling a packet-data-changing kfunc.
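The selftest added below exercises the unsafe pattern. For contrast, a sketch of the safe counterpart, assuming bpf_xdp_pull_data() is declared as a kfunc (the exact declaration may differ; selftests typically get it via vmlinux.h or bpf_kfuncs.h): after the kfunc call, reload and re-bound every packet pointer before dereferencing it:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	extern int bpf_xdp_pull_data(struct xdp_md *xdp, __u32 len) __ksym;

	SEC("xdp")
	int revalidate_xdp_pkt_pointers(struct xdp_md *x)
	{
		int *p = (void *)(long)x->data;

		if ((void *)(p + 1) > (void *)(long)x->data_end)
			return XDP_DROP;
		bpf_xdp_pull_data(x, 0); /* invalidates all packet pointers */

		/* Re-derive and re-check the pointer before using it. */
		p = (void *)(long)x->data;
		if ((void *)(p + 1) > (void *)(long)x->data_end)
			return XDP_DROP;
		*p = 42; /* now safe */
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";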
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250926164142.1850176-1-ameryhung@gmail.com --- tools/testing/selftests/bpf/progs/verifier_sock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index b4d31f10a97687..2b4610b53382d0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -1096,6 +1096,20 @@ int invalidate_xdp_pkt_pointers_from_global_func(struct xdp_md *x) return XDP_PASS; } +/* XDP packet changing kfunc calls invalidate packet pointers */ +SEC("xdp") +__failure __msg("invalid mem access") +int invalidate_xdp_pkt_pointers(struct xdp_md *x) +{ + int *p = (void *)(long)x->data; + + if ((void *)(p + 1) > (void *)(long)x->data_end) + return XDP_DROP; + bpf_xdp_pull_data(x, 0); + *p = 42; /* this is unsafe */ + return XDP_PASS; +} + __noinline int tail_call(struct __sk_buff *sk) { From 1f5bcfe91ffce71bdd1022648b9d501d46d20c09 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 18:41:21 +0200 Subject: [PATCH 2928/3206] PM: hibernate: Combine return paths in power_down() To avoid code duplication and improve clarity, combine the code paths in power_down() leading to a return from that function. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/3571055.QJadu78ljV@rafael.j.wysocki [ rjw: Changed the new label name to "exit" ] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 728328c51b6499..14e85ff2355123 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -708,21 +708,11 @@ static void power_down(void) if (hibernation_mode == HIBERNATION_SUSPEND) { pm_restore_gfp_mask(); error = suspend_devices_and_enter(mem_sleep_current); - if (error) { - hibernation_mode = hibernation_ops ? - HIBERNATION_PLATFORM : - HIBERNATION_SHUTDOWN; - } else { - /* Match pm_restore_gfp_mask() call in hibernate() */ - pm_restrict_gfp_mask(); - - /* Restore swap signature. */ - error = swsusp_unmark(); - if (error) - pr_err("Swap will be unusable! Try swapon -a.\n"); + if (!error) + goto exit; - return; - } + hibernation_mode = hibernation_ops ? HIBERNATION_PLATFORM : + HIBERNATION_SHUTDOWN; } #endif @@ -733,12 +723,9 @@ static void power_down(void) case HIBERNATION_PLATFORM: error = hibernation_platform_enter(); if (error == -EAGAIN || error == -EBUSY) { - /* Match pm_restore_gfp_mask() in hibernate(). */ - pm_restrict_gfp_mask(); - swsusp_unmark(); events_check_enabled = false; pr_info("Wakeup event detected during hibernation, rolling back.\n"); - return; + goto exit; } fallthrough; case HIBERNATION_SHUTDOWN: @@ -757,6 +744,15 @@ static void power_down(void) pr_crit("Power down manually\n"); while (1) cpu_relax(); + +exit: + /* Match the pm_restore_gfp_mask() call in hibernate(). */ + pm_restrict_gfp_mask(); + + /* Restore swap signature. */ + error = swsusp_unmark(); + if (error) + pr_err("Swap will be unusable! Try swapon -a.\n"); } static int load_image_and_restore(void) From 0cc114dc358cf8da2ca23a366e761e89a46ca277 Mon Sep 17 00:00:00 2001 From: "D. 
Wythe" Date: Fri, 26 Sep 2025 15:17:51 +0800 Subject: [PATCH 2929/3206] libbpf: Fix error when st-prefix_ops and ops from differ btf When a module registers a struct_ops, the struct_ops type and its corresponding map_value type ("bpf_struct_ops_") may reside in different btf objects, here are four possible case: +--------+---------------+-------------+---------------------------------+ | |bpf_struct_ops_| xxx_ops | | +--------+---------------+-------------+---------------------------------+ | case 0 | btf_vmlinux | btf_vmlinux | be used and reg only in vmlinux | +--------+---------------+-------------+---------------------------------+ | case 1 | btf_vmlinux | mod_btf | INVALID | +--------+---------------+-------------+---------------------------------+ | case 2 | mod_btf | btf_vmlinux | reg in mod but be used both in | | | | | vmlinux and mod. | +--------+---------------+-------------+---------------------------------+ | case 3 | mod_btf | mod_btf | be used and reg only in mod | +--------+---------------+-------------+---------------------------------+ Currently we figure out the mod_btf by searching with the struct_ops type, which makes it impossible to figure out the mod_btf when the struct_ops type is in btf_vmlinux while it's corresponding map_value type is in mod_btf (case 2). The fix is to use the corresponding map_value type ("bpf_struct_ops_") as the lookup anchor instead of the struct_ops type to figure out the `btf` and `mod_btf` via find_ksym_btf_id(), and then we can locate the kern_type_id via btf__find_by_name_kind() with the `btf` we just obtained from find_ksym_btf_id(). With this change the lookup obtains the correct btf and mod_btf for case 2, preserves correct behavior for other valid cases, and still fails as expected for the invalid scenario (case 1). Fixes: 590a00888250 ("bpf: libbpf: Add STRUCT_OPS support") Signed-off-by: D. Wythe Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20250926071751.108293-1-alibuda@linux.alibaba.com --- tools/lib/bpf/libbpf.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 5161c2b398753d..7edb36aa88e1dc 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1018,35 +1018,33 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw, const struct btf_member *kern_data_member; struct btf *btf = NULL; __s32 kern_vtype_id, kern_type_id; - char tname[256]; + char tname[192], stname[256]; __u32 i; snprintf(tname, sizeof(tname), "%.*s", (int)bpf_core_essential_name_len(tname_raw), tname_raw); - kern_type_id = find_ksym_btf_id(obj, tname, BTF_KIND_STRUCT, - &btf, mod_btf); - if (kern_type_id < 0) { - pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", - tname); - return kern_type_id; - } - kern_type = btf__type_by_id(btf, kern_type_id); + snprintf(stname, sizeof(stname), "%s%s", STRUCT_OPS_VALUE_PREFIX, tname); - /* Find the corresponding "map_value" type that will be used - * in map_update(BPF_MAP_TYPE_STRUCT_OPS). For example, - * find "struct bpf_struct_ops_tcp_congestion_ops" from the - * btf_vmlinux. + /* Look for the corresponding "map_value" type that will be used + * in map_update(BPF_MAP_TYPE_STRUCT_OPS) first, figure out the btf + * and the mod_btf. + * For example, find "struct bpf_struct_ops_tcp_congestion_ops". 
*/ - kern_vtype_id = find_btf_by_prefix_kind(btf, STRUCT_OPS_VALUE_PREFIX, - tname, BTF_KIND_STRUCT); + kern_vtype_id = find_ksym_btf_id(obj, stname, BTF_KIND_STRUCT, &btf, mod_btf); if (kern_vtype_id < 0) { - pr_warn("struct_ops init_kern: struct %s%s is not found in kernel BTF\n", - STRUCT_OPS_VALUE_PREFIX, tname); + pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", stname); return kern_vtype_id; } kern_vtype = btf__type_by_id(btf, kern_vtype_id); + kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT); + if (kern_type_id < 0) { + pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", tname); + return kern_type_id; + } + kern_type = btf__type_by_id(btf, kern_type_id); + /* Find "struct tcp_congestion_ops" from * struct bpf_struct_ops_tcp_congestion_ops { * [ ... ] @@ -1059,8 +1057,8 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw, break; } if (i == btf_vlen(kern_vtype)) { - pr_warn("struct_ops init_kern: struct %s data is not found in struct %s%s\n", - tname, STRUCT_OPS_VALUE_PREFIX, tname); + pr_warn("struct_ops init_kern: struct %s data is not found in struct %s\n", + tname, stname); return -EINVAL; } From b1e0ff7209e952bdb4f9a85a8450546700b4affa Mon Sep 17 00:00:00 2001 From: Ivan Pravdin Date: Sun, 7 Sep 2025 22:05:57 -0400 Subject: [PATCH 2930/3206] rtla: Fix buffer overflow in actions_parse Currently, tests 3 and 13-22 in tests/timerlat.t fail with the error: *** buffer overflow detected ***: terminated timeout: the monitored command dumped core The result of running `sudo make check` is tests/timerlat.t (Wstat: 0 Tests: 22 Failed: 11) Failed tests: 3, 13-22 Files=3, Tests=34, 140 wallclock secs ( 0.07 usr 0.01 sys + 27.63 cusr 27.96 csys = 55.67 CPU) Result: FAIL Fix the buffer overflow in actions_parse() to avoid this error. After this change, the test results are tests/hwnoise.t ... ok tests/osnoise.t ... ok tests/timerlat.t .. ok All tests successful. Files=3, Tests=34, 186 wallclock secs ( 0.06 usr 0.01 sys + 41.10 cusr 44.38 csys = 85.55 CPU) Result: PASS Link: https://lore.kernel.org/164ffc2ec8edacaf1295789dad82a07817b6263d.1757034919.git.ipravdin.official@gmail.com Fixes: 6ea082b171e0 ("rtla/timerlat: Add action on threshold feature") Signed-off-by: Ivan Pravdin Reviewed-by: Tomas Glozar Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/actions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c index aaf0808125d723..eab51c0c0ce2c9 100644 --- a/tools/tracing/rtla/src/actions.c +++ b/tools/tracing/rtla/src/actions.c @@ -131,7 +131,7 @@ actions_parse(struct actions *self, const char *trigger) { enum action_type type = ACTION_NONE; char *token; - char trigger_c[strlen(trigger)]; + char trigger_c[strlen(trigger) + 1]; /* For ACTION_SIGNAL */ int signal = 0, pid = 0; From 2227f273b7dc25a791ae6b152550098aa6934b2f Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Mon, 15 Sep 2025 15:10:56 -0300 Subject: [PATCH 2931/3206] rtla/actions: Fix condition for buffer reallocation The condition to check if the actions buffer needs to be resized was incorrect. The check `self->size >= self->len` would evaluate to true on almost every call to `actions_new()`, causing the buffer to be reallocated unnecessarily each time an action was added. Fix the condition to `self->len >= self->size`, ensuring that the buffer is only resized when it is actually full.
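Both rtla fixes are instances of classic C array bugs: a VLA sized with strlen() and no room for the NUL terminator, and an inverted growth check. A standalone sketch of the corrected growth pattern; the struct layout is a hypothetical simplification and, unlike the rtla code, the realloc() result is checked:

	#include <stdlib.h>

	struct action { int type; };		/* placeholder payload */

	struct actions {
		struct action *list;
		size_t len;			/* entries in use */
		size_t size;			/* entries allocated */
	};

	static struct action *actions_new(struct actions *self)
	{
		if (self->len >= self->size) {	/* grow only when full */
			size_t nsize = self->size ? self->size * 2 : 8;
			struct action *tmp = realloc(self->list,
						     nsize * sizeof(*tmp));

			if (!tmp)
				return NULL;
			self->list = tmp;
			self->size = nsize;
		}
		return &self->list[self->len++];
	}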
Cc: John Kacur Cc: Luis Goncalves Cc: Arnaldo Carvalho de Melo Cc: Chang Yin Cc: Costa Shulyupin Cc: Crystal Wood Cc: Gabriele Monaco Link: https://lore.kernel.org/20250915181101.52513-1-wander@redhat.com Fixes: 6ea082b171e00 ("rtla/timerlat: Add action on threshold feature") Signed-off-by: Wander Lairson Costa Reviewed-by: Tomas Glozar Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/actions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c index eab51c0c0ce2c9..13ff1934d47c94 100644 --- a/tools/tracing/rtla/src/actions.c +++ b/tools/tracing/rtla/src/actions.c @@ -49,7 +49,7 @@ actions_destroy(struct actions *self) static struct action * actions_new(struct actions *self) { - if (self->size >= self->len) { + if (self->len >= self->size) { self->size *= 2; self->list = realloc(self->list, self->size * sizeof(struct action)); } From 87608c2a7718dcac5deef801fb3c18cf36fb0233 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 26 Sep 2025 17:52:39 +0800 Subject: [PATCH 2932/3206] bpf: Remove duplicate crypto/sha2.h header ./include/linux/bpf.h: crypto/sha2.h is included more than once. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=25501 Signed-off-by: Jiapeng Chong Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20250926095240.3397539-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6338e54a9b1f8d..fe2a396d8ac6bb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -32,7 +32,6 @@ #include #include #include -#include struct bpf_verifier_env; struct bpf_verifier_log; From 4b2113413e76213cb36e30e6d5299208254d9369 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 26 Sep 2025 17:52:40 +0800 Subject: [PATCH 2933/3206] bpftool: Remove duplicate string.h header ./tools/bpf/bpftool/sign.c: string.h is included more than once. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=25502 Signed-off-by: Jiapeng Chong Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20250926095240.3397539-2-jiapeng.chong@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/sign.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/bpf/bpftool/sign.c b/tools/bpf/bpftool/sign.c index b29d825bb1d418..b34f74d210e9cd 100644 --- a/tools/bpf/bpftool/sign.c +++ b/tools/bpf/bpftool/sign.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include From 7f7acd193ba8aaa8ed07cfadc335bb17a991fd42 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 25 Sep 2025 12:42:14 -0700 Subject: [PATCH 2934/3206] PM: runtime: Add basic kunit tests for API contracts In exploring the various return codes and failure modes of runtime PM APIs, I found it helpful to verify and codify many of them in unit tests, especially given that even the kerneldoc can be rather complex to reason through, and it also has had subtle errors of its own. Notably, I avoid testing the return codes for pm_runtime_put() and pm_runtime_put_autosuspend(), since code that checks them is probably wrong, and we're considering making them return 'void' altogether. I still test the sync() variants, since those have a bit more meaning to them. Signed-off-by: Brian Norris Signed-off-by: Rafael J. 
Wysocki --- drivers/base/Kconfig | 6 + drivers/base/power/Makefile | 1 + drivers/base/power/runtime-test.c | 253 ++++++++++++++++++++++++++++++ 3 files changed, 260 insertions(+) create mode 100644 drivers/base/power/runtime-test.c diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 064eb52ff7e2d4..1786d87b29e227 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -167,6 +167,12 @@ config PM_QOS_KUNIT_TEST depends on KUNIT=y default KUNIT_ALL_TESTS +config PM_RUNTIME_KUNIT_TEST + tristate "KUnit Tests for runtime PM" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on PM + default KUNIT_ALL_TESTS + config HMEM_REPORTING bool default n diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 01f11629d241cd..2989e42d01611a 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_PM_SLEEP) += main.o wakeup.o wakeup_stats.o obj-$(CONFIG_PM_TRACE_RTC) += trace.o obj-$(CONFIG_HAVE_CLK) += clock_ops.o obj-$(CONFIG_PM_QOS_KUNIT_TEST) += qos-test.o +obj-$(CONFIG_PM_RUNTIME_KUNIT_TEST) += runtime-test.o ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG diff --git a/drivers/base/power/runtime-test.c b/drivers/base/power/runtime-test.c new file mode 100644 index 00000000000000..2e966fd966642b --- /dev/null +++ b/drivers/base/power/runtime-test.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 Google, Inc. + */ + +#include +#include +#include +#include + +#define DEVICE_NAME "pm_runtime_test_device" + +static void pm_runtime_depth_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_get_sync(dev)); /* "already active" */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +/* Test pm_runtime_put() and friends when already suspended. */ +static void pm_runtime_already_suspended_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + pm_runtime_get_noresume(dev); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_barrier(dev)); /* no wakeup needed */ + pm_runtime_put(dev); + + pm_runtime_get_noresume(dev); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, 1, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_request_autosuspend(dev)); + + pm_runtime_get_noresume(dev); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_put_sync_autosuspend(dev)); + + pm_runtime_get_noresume(dev); + pm_runtime_put_autosuspend(dev); + + /* Grab 2 refcounts */ + pm_runtime_get_noresume(dev); + pm_runtime_get_noresume(dev); + /* The first put() sees usage_count 1 */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync_autosuspend(dev)); + /* The second put() sees usage_count 0 but tells us "already suspended". */ + KUNIT_EXPECT_EQ(test, 1, pm_runtime_put_sync_autosuspend(dev)); + + /* Should have remained suspended the whole time. 
*/ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +static void pm_runtime_idle_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_idle(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + pm_runtime_put_noidle(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_idle(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_request_idle(dev)); +} + +static void pm_runtime_disabled_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + /* Never called pm_runtime_enable() */ + KUNIT_EXPECT_FALSE(test, pm_runtime_enabled(dev)); + + /* "disabled" is treated as "active" */ + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_FALSE(test, pm_runtime_suspended(dev)); + + /* + * Note: these "fail", but they still acquire/release refcounts, so + * keep them balanced. + */ + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_get(dev)); + pm_runtime_put(dev); + + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_get(dev)); + pm_runtime_put_autosuspend(dev); + + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_resume_and_get(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_request_idle(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_request_resume(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_request_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_resume(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_autosuspend(dev)); + + /* Still disabled */ + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_FALSE(test, pm_runtime_enabled(dev)); +} + +static void pm_runtime_error_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + /* Fake a .runtime_resume() error */ + dev->power.runtime_error = -EIO; + + /* + * Note: these "fail", but they still acquire/release refcounts, so + * keep them balanced. 
+ */ + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get(dev)); + pm_runtime_put(dev); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get(dev)); + pm_runtime_put_autosuspend(dev); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_put_sync_autosuspend(dev)); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_resume_and_get(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_request_idle(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_request_resume(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_request_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_resume(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_autosuspend(dev)); + + /* Error is still pending */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, -EIO, dev->power.runtime_error); + /* Clear error */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_set_suspended(dev)); + KUNIT_EXPECT_EQ(test, 0, dev->power.runtime_error); + /* Still suspended */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_barrier(dev)); /* resume was pending */ + pm_runtime_put(dev); + pm_runtime_suspend(dev); /* flush the put(), to suspend */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + pm_runtime_put_autosuspend(dev); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_resume_and_get(dev)); + + /* + * The following should all return -EAGAIN (usage is non-zero) or 1 + * (already resumed). + */ + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_request_idle(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_request_resume(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_request_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_resume(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_autosuspend(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + + /* Suspended again */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +/* + * Explore a typical probe() sequence in which a device marks itself powered, + * but doesn't hold any runtime PM reference, so it suspends as soon as it goes + * idle. + */ +static void pm_runtime_probe_active_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + KUNIT_EXPECT_TRUE(test, pm_runtime_status_suspended(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_set_active(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + + pm_runtime_enable(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + + /* Nothing to flush. We stay active. */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_barrier(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + + /* Ask for idle? Now we suspend. 
*/ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_idle(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +static struct kunit_case pm_runtime_test_cases[] = { + KUNIT_CASE(pm_runtime_depth_test), + KUNIT_CASE(pm_runtime_already_suspended_test), + KUNIT_CASE(pm_runtime_idle_test), + KUNIT_CASE(pm_runtime_disabled_test), + KUNIT_CASE(pm_runtime_error_test), + KUNIT_CASE(pm_runtime_probe_active_test), + {} +}; + +static struct kunit_suite pm_runtime_test_suite = { + .name = "pm_runtime_test_cases", + .test_cases = pm_runtime_test_cases, +}; + +kunit_test_suite(pm_runtime_test_suite); +MODULE_DESCRIPTION("Runtime power management unit test suite"); +MODULE_LICENSE("GPL"); From d0b8651a026125d58b50b464aeb78f2c5956179f Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 25 Sep 2025 12:42:15 -0700 Subject: [PATCH 2935/3206] PM: runtime: Make put{,_sync}() return 1 when already suspended The pm_runtime.h docs say pm_runtime_put() and pm_runtime_put_sync() return 1 when already suspended, but this is not true -- they return -EAGAIN. On the other hand, pm_runtime_put_sync_suspend() and pm_runtime_put_sync_autosuspend() *do* return 1. This is an artifact of the fact that the former are built on rpm_idle(), whereas the latter are built on rpm_suspend(). There are precious few pm_runtime_put()/pm_runtime_put_sync() callers that check the return code at all, but most of them only log errors, and usually only for negative error codes. None of them should be treating this as an error, so: * at best, this may fix some case where a driver treats this condition as an error, when it shouldn't; * at worst, this should make no effect; and * somewhere in between, we could potentially clear up non-fatal log messages. Fix the pm_runtime_already_suspended_test() while tweaking the behavior. The test makes a lot more sense when these all return 1 when the device is already suspended: pm_runtime_put_sync(dev); pm_runtime_suspend(dev); pm_runtime_autosuspend(dev); pm_request_autosuspend(dev); pm_runtime_put_sync_autosuspend(dev); Notably, I've avoided testing the return codes for these, since they really should be ignored by callers, and we may make them 'void' altogether: pm_runtime_put(dev); pm_runtime_put_autosuspend(dev); Signed-off-by: Brian Norris Reviewed-by: Dhruva Gole Reviewed-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki --- drivers/base/power/runtime-test.c | 2 +- drivers/base/power/runtime.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/base/power/runtime-test.c b/drivers/base/power/runtime-test.c index 2e966fd966642b..eca9885e807d1f 100644 --- a/drivers/base/power/runtime-test.c +++ b/drivers/base/power/runtime-test.c @@ -42,7 +42,7 @@ static void pm_runtime_already_suspended_test(struct kunit *test) pm_runtime_put(dev); pm_runtime_get_noresume(dev); - KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_put_sync(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_put_sync(dev)); KUNIT_EXPECT_EQ(test, 1, pm_runtime_suspend(dev)); KUNIT_EXPECT_EQ(test, 1, pm_runtime_autosuspend(dev)); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 3e84dc4122defe..faa68bf9ef3d06 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -498,6 +498,9 @@ static int rpm_idle(struct device *dev, int rpmflags) if (retval < 0) ; /* Conditions are wrong. */ + else if ((rpmflags & RPM_GET_PUT) && retval == 1) + ; /* put() is allowed in RPM_SUSPENDED */ + /* Idle notifications are allowed only in the RPM_ACTIVE state. 
*/ else if (dev->power.runtime_status != RPM_ACTIVE) retval = -EAGAIN; From fed7eaa4f037361fe4f3d4170649d6849a25998d Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 25 Sep 2025 12:42:16 -0700 Subject: [PATCH 2936/3206] PM: runtime: Update kerneldoc return codes APIs based on __pm_runtime_idle() (pm_runtime_idle(), pm_request_idle()) do not return 1 when already suspended. They return -EAGAIN. This is already covered in the docs, so the entry for "1" is redundant and conflicting. (pm_runtime_put() and pm_runtime_put_sync() were previously incorrect, but that's fixed in "PM: runtime: pm_runtime_put{,_sync}() returns 1 when already suspended", to ensure consistency with APIs like pm_runtime_put_autosuspend().) RPM_GET_PUT APIs based on __pm_runtime_suspend() do return 1 when already suspended, but the language is a little unclear -- it's not really an "error", so it seems better to list as a clarification before the 0/success case. Additionally, they only actually return 1 when the refcount makes it to 0; if the usage counter is still non-zero, we return 0. pm_runtime_put(), etc., also don't appear at first like they can ever see "-EAGAIN: Runtime PM usage_count non-zero", because in non-racy conditions, pm_runtime_put() would drop its reference count, see it's non-zero, and return early (in __pm_runtime_idle()). However, it's possible to race with another actor that increments the usage_count afterward, since rpm_idle() is protected by a separate lock; in such a case, we may see -EAGAIN. Because this case is only seen in the presence of concurrent actors, it makes sense to clarify that this is when "usage_count **became** non-zero", by way of some racing actor. Lastly, pm_runtime_put_sync_suspend() duplicated some -EAGAIN language. Fix that. Fixes: 271ff96d6066 ("PM: runtime: Document return values of suspend-related API functions") Link: https://lore.kernel.org/linux-pm/aJ5pkEJuixTaybV4@google.com/ Signed-off-by: Brian Norris Reviewed-by: Sakari Ailus Cc: 6.17+ # 6.17+ Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 56 +++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d88d6b6ccf5b20..d1ff76e0e2d077 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -350,13 +350,12 @@ static inline int pm_runtime_force_resume(struct device *dev) { return -ENXIO; } * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing - * or device not in %RPM_ACTIVE state. + * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change + * ongoing or device not in %RPM_ACTIVE state. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM idle and suspend callbacks. */ @@ -370,14 +369,15 @@ static inline int pm_runtime_idle(struct device *dev) * @dev: Target device. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. 
* * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -396,14 +396,15 @@ static inline int pm_runtime_suspend(struct device *dev) * engaging its "idle check" callback. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -433,13 +434,12 @@ static inline int pm_runtime_resume(struct device *dev) * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing - * or device not in %RPM_ACTIVE state. + * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change + * ongoing or device not in %RPM_ACTIVE state. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_request_idle(struct device *dev) { @@ -464,15 +464,16 @@ static inline int pm_request_resume(struct device *dev) * equivalent pm_runtime_autosuspend() for @dev asynchronously. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_request_autosuspend(struct device *dev) { @@ -540,15 +541,16 @@ static inline int pm_runtime_resume_and_get(struct device *dev) * equal to 0, queue up a work item for @dev like in pm_request_idle(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_runtime_put(struct device *dev) { @@ -565,15 +567,16 @@ DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T)) * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. 
- * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int __pm_runtime_put_autosuspend(struct device *dev) { @@ -590,15 +593,16 @@ static inline int __pm_runtime_put_autosuspend(struct device *dev) * in pm_request_autosuspend(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_runtime_put_autosuspend(struct device *dev) { @@ -619,14 +623,15 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -646,15 +651,15 @@ static inline int pm_runtime_put_sync(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. - * * -EAGAIN: usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -677,15 +682,16 @@ static inline int pm_runtime_put_sync_suspend(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. 
* Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ From 50a098e3e9b1bb30cefc43cdfba3c7b9b32e14a7 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sat, 27 Sep 2025 11:47:23 +0200 Subject: [PATCH 2937/3206] ALSA: hda/realtek: Add quirk for HP Spectre 14t-ea100 The HP Spectre 14t-ea100 model has no speaker output unless Windows was previously booted on a dual-boot setup; a reboot while on Linux stops the speakers from working. Applying the existing quirk for the HP Spectre X360 EU0xxx seems to fix this speaker problem. Reported-by: Kaden Berger Cc: Link: https://lore.kernel.org/aMxdGAmfOQ6VPNU8@archlinux Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index f267437c96981e..07ea76efa5de8f 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -6487,6 +6487,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x89c6, "Zbook Fury 17 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89ca, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x89d3, "HP EliteBook 645 G9 (MB 89D2)", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x89da, "HP Spectre x360 14t-ea100", ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX), SND_PCI_QUIRK(0x103c, 0x89e7, "HP Elite x2 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8a0f, "HP Pavilion 14-ec1xxx", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8a20, "HP Laptop 15s-fq5xxx", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), From 632d31067be2f414c57955efcf29c79290cc749b Mon Sep 17 00:00:00 2001 From: Pin-yen Lin Date: Fri, 26 Sep 2025 18:23:18 +0800 Subject: [PATCH 2938/3206] PM: sleep: Do not wait on SYNC_STATE_ONLY device links Device links with DL_FLAG_SYNC_STATE_ONLY should not affect system suspend and resume, and functions like device_reorder_to_tail() and device_link_add() don't try to reorder the consumers with that flag. However, dpm_wait_for_consumers() and dpm_wait_for_suppliers() don't check that flag before triggering dpm_wait(), leading to a potential hang during suspend/resume. This can be reproduced on the MT8186 Corsola Chromebook with a devicetree like: usb-a-connector { compatible = "usb-a-connector"; port { usb_a_con: endpoint { remote-endpoint = <&usb_hs>; }; }; }; usb_host { compatible = "mediatek,mt8186-xhci", "mediatek,mtk-xhci"; port { usb_hs: endpoint { remote-endpoint = <&usb_a_con>; }; }; }; In this case, the two nodes form a cycle and a SYNC_STATE_ONLY devlink between usb_host (supplier) and usb-a-connector (consumer) is created. Address this by exporting device_link_flag_is_sync_state_only() and making dpm_wait_for_consumers() and dpm_wait_for_suppliers() use it when deciding if dpm_wait() should be called. Fixes: 05ef983e0d65a ("driver core: Add device link support for SYNC_STATE_ONLY flag") Signed-off-by: Pin-yen Lin Link: https://patch.msgid.link/20250926102320.4053167-1-treapking@chromium.org [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J.
Wysocki --- drivers/base/base.h | 1 + drivers/base/core.c | 2 +- drivers/base/power/main.c | 6 ++++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/base/base.h b/drivers/base/base.h index 700aecd22fd34a..86fa7fbb354891 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -248,6 +248,7 @@ void device_links_driver_cleanup(struct device *dev); void device_links_no_driver(struct device *dev); bool device_links_busy(struct device *dev); void device_links_unbind_consumers(struct device *dev); +bool device_link_flag_is_sync_state_only(u32 flags); void fw_devlink_drivers_done(void); void fw_devlink_probing_done(void); diff --git a/drivers/base/core.c b/drivers/base/core.c index d22d6b23e75898..a54ec6df1058fb 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -287,7 +287,7 @@ static bool device_is_ancestor(struct device *dev, struct device *target) #define DL_MARKER_FLAGS (DL_FLAG_INFERRED | \ DL_FLAG_CYCLE | \ DL_FLAG_MANAGED) -static inline bool device_link_flag_is_sync_state_only(u32 flags) +bool device_link_flag_is_sync_state_only(u32 flags) { return (flags & ~DL_MARKER_FLAGS) == DL_FLAG_SYNC_STATE_ONLY; } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e3a7f25bf8d39b..c97f8b139dc202 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -278,7 +278,8 @@ static void dpm_wait_for_suppliers(struct device *dev, bool async) * walking. */ dev_for_each_link_to_supplier(link, dev) - if (READ_ONCE(link->status) != DL_STATE_DORMANT) + if (READ_ONCE(link->status) != DL_STATE_DORMANT && + !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->supplier, async); device_links_read_unlock(idx); @@ -335,7 +336,8 @@ static void dpm_wait_for_consumers(struct device *dev, bool async) * unregistration). */ dev_for_each_link_to_consumer(link, dev) - if (READ_ONCE(link->status) != DL_STATE_DORMANT) + if (READ_ONCE(link->status) != DL_STATE_DORMANT && + !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->consumer, async); device_links_read_unlock(idx); From 0db0934e7f9bb624ed98a665890dbe249f65b8fd Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 22 Sep 2025 15:35:22 +0900 Subject: [PATCH 2939/3206] tracing: fgraph: Protect return handler from recursion loop function_graph_enter_regs() protects itself from recursion with ftrace_test_recursion_trylock(), but __ftrace_return_to_handler(), which is called at function exit, does not prevent such recursion. Therefore, while it can prevent recursive calls from fgraph_ops::entryfunc(), it is not able to prevent recursive calls to fgraph from fgraph_ops::retfunc(), resulting in a recursive loop. This can lead to an unexpected recursion bug, as reported by Menglong: is_endbr() is called in __ftrace_return_to_handler -> fprobe_return -> kprobe_multi_link_exit_handler -> is_endbr. To fix this issue, acquire ftrace_test_recursion_trylock() in __ftrace_return_to_handler() after unwinding the shadow stack, to mark that this section must not be re-entered through a recursive call into fgraph from a user-defined fgraph_ops::retfunc(). This is essentially a fix to commit 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer"), because before that commit fgraph was only used by the function graph tracer; since then, fprobe has allowed users to run arbitrary callbacks from fgraph.
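To see the guard in isolation, the sketch below is a minimal userspace model of the depth-limited trylock/unlock pattern this fix applies; recursion_trylock(), recursion_unlock() and MAX_NEST are simplified stand-ins for the kernel's ftrace_test_recursion_trylock()/ftrace_test_recursion_unlock(), not the real implementation:

#include <stdio.h>

#define MAX_NEST 2 /* one normal entry plus one allowed nested call */

static _Thread_local int recursion_depth;

/* Stand-in for ftrace_test_recursion_trylock(): returns a nesting
 * "bit" on success, or -1 if we are already nested too deep. */
static int recursion_trylock(void)
{
	if (recursion_depth >= MAX_NEST)
		return -1;
	return recursion_depth++;
}

/* Stand-in for ftrace_test_recursion_unlock(). */
static void recursion_unlock(int bit)
{
	recursion_depth = bit;
}

/* Models a retfunc that re-enters the return handler, like the
 * fprobe -> is_endbr chain described in the commit message. */
static void return_to_handler(int depth)
{
	int bit = recursion_trylock();

	if (bit < 0) {
		printf("depth %d: recursion blocked, callback skipped\n", depth);
		return;
	}
	printf("depth %d: running retfunc\n", depth);
	return_to_handler(depth + 1);
	recursion_unlock(bit);
}

int main(void)
{
	return_to_handler(0); /* two levels run, the third is blocked */
	return 0;
}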
Reported-by: Menglong Dong Closes: https://lore.kernel.org/all/20250918120939.1706585-1-dongml2@chinatelecom.cn/ Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer") Cc: stable@vger.kernel.org Cc: Peter Zijlstra Link: https://lore.kernel.org/175852292275.307379.9040117316112640553.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Acked-by: Jiri Olsa Tested-by: Menglong Dong Acked-by: Menglong Dong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 1e3b32b1e82cd5..484ad7a18463a9 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -815,6 +815,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe unsigned long bitmap; unsigned long ret; int offset; + int bit; int i; ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset); @@ -829,6 +830,15 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe if (fregs) ftrace_regs_set_instruction_pointer(fregs, ret); + bit = ftrace_test_recursion_trylock(trace.func, ret); + /* + * This can fail because ftrace_test_recursion_trylock() allows one nest + * call. If we are already in a nested call, then we don't probe this and + * just return the original return address. + */ + if (unlikely(bit < 0)) + goto out; + #ifdef CONFIG_FUNCTION_GRAPH_RETVAL trace.retval = ftrace_regs_get_return_value(fregs); #endif @@ -852,6 +862,8 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe } } + ftrace_test_recursion_unlock(bit); +out: /* * The ftrace_graph_return() may still access the current * ret_stack structure, we need to make sure the update of From 4540aed51b12bc13364149bf95f6ecef013197c0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Sep 2025 19:12:00 +0200 Subject: [PATCH 2940/3206] bpf: Enforce expected_attach_type for tailcall compatibility Yinhao et al. recently reported: Our fuzzer tool discovered an uninitialized pointer issue in the bpf_prog_test_run_xdp() function within the Linux kernel's BPF subsystem. This leads to a NULL pointer dereference when a BPF program attempts to dereference the txq member of a struct xdp_buff object. The test initializes two programs of BPF_PROG_TYPE_XDP: progA acts as the entry point for bpf_prog_test_run_xdp(), and its expected_attach_type can be neither BPF_XDP_DEVMAP nor BPF_XDP_CPUMAP. progA calls into a slot of a tailcall map it owns. progB's expected_attach_type must be BPF_XDP_DEVMAP to pass xdp_is_valid_access() validation. The program returns struct xdp_md's egress_ifindex, and the latter is only allowed to be accessed under the mentioned expected_attach_type. progB is then inserted into the tailcall map which progA calls. The underlying issue goes beyond XDP though. Another example is programs of type BPF_PROG_TYPE_CGROUP_SOCK_ADDR. sock_addr_is_valid_access() as well as sock_addr_func_proto() have different logic depending on the programs' expected_attach_type. Similarly, a program attached to BPF_CGROUP_INET4_GETPEERNAME should not be allowed to do a tailcall into a program which calls bpf_bind() out of BPF, since the latter is only enabled for BPF_CGROUP_INET4_CONNECT. In short, specifying expected_attach_type opens up additional functionality or restrictions beyond what the basic bpf_prog_type enables. The use of tailcalls must not violate these constraints. Fix it by enforcing expected_attach_type in __bpf_prog_map_compatible().
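As a rough illustration of the check this patch adds, here is a userspace model of the map-owner compatibility test; struct map, struct prog and the enums are simplified stand-ins for the kernel types, and only the shape of __bpf_prog_map_compatible()'s logic is reproduced:

#include <stdbool.h>
#include <stdio.h>

enum prog_type { PROG_TYPE_XDP };
enum attach_type { ATTACH_NONE, ATTACH_XDP_DEVMAP, ATTACH_XDP_CPUMAP };
enum map_type { MAP_TYPE_PROG_ARRAY, MAP_TYPE_DEVMAP };

struct prog {
	enum prog_type type;
	enum attach_type expected_attach_type;
};

struct map {
	enum map_type type;
	bool has_owner; /* set when the first program uses the map */
	struct prog owner;
};

static bool prog_map_compatible(struct map *map, const struct prog *fp)
{
	if (!map->has_owner) {
		map->owner = *fp; /* the first user fixes the contract */
		map->has_owner = true;
		return true;
	}
	if (map->owner.type != fp->type)
		return false;
	/* The fix: tailcall maps must also agree on expected_attach_type. */
	if (map->type == MAP_TYPE_PROG_ARRAY &&
	    map->owner.expected_attach_type != fp->expected_attach_type)
		return false;
	return true;
}

int main(void)
{
	struct map tailcalls = { .type = MAP_TYPE_PROG_ARRAY };
	struct prog a = { PROG_TYPE_XDP, ATTACH_NONE };
	struct prog b = { PROG_TYPE_XDP, ATTACH_XDP_DEVMAP };

	printf("progA accepted: %d\n", prog_map_compatible(&tailcalls, &a)); /* 1 */
	printf("progB accepted: %d\n", prog_map_compatible(&tailcalls, &b)); /* 0 */
	return 0;
}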
Note that we only enforce this for tailcall maps, but not for BPF devmaps or cpumaps: There, the programs are invoked through dev_map_bpf_prog_run*() and cpu_map_bpf_prog_run*() which set up a new environment / context and therefore these situations are not prone to this issue. Fixes: 5e43f899b03a ("bpf: Check attach type at prog load time") Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Reviewed-by: Dongliang Mu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250926171201.188490-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fe2a396d8ac6bb..a98c8334613474 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -289,6 +289,7 @@ struct bpf_map_owner { bool xdp_has_frags; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; + enum bpf_attach_type expected_attach_type; }; struct bpf_map { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9b64674df16b88..d595fe512498cc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2361,6 +2361,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->expected_attach_type = fp->expected_attach_type; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { map->owner->storage_cookie[i] = @@ -2372,6 +2373,10 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, ret = map->owner->type == prog_type && map->owner->jited == fp->jited && map->owner->xdp_has_frags == aux->xdp_has_frags; + if (ret && + map->map_type == BPF_MAP_TYPE_PROG_ARRAY && + map->owner->expected_attach_type != fp->expected_attach_type) + ret = false; for_each_cgroup_storage_type(i) { if (!ret) break; From 0e8e60e86cf3292e747a0fa7cc13127f290323ad Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Sep 2025 19:12:01 +0200 Subject: [PATCH 2941/3206] selftests/bpf: Add test case for different expected_attach_type Add a small test case which adds two programs - one calling the other through a tailcall - and check that BPF rejects them in case of different expected_attach_type values: # ./vmtest.sh -- ./test_progs -t xdp_devmap [...] 
#641/1 xdp_devmap_attach/DEVMAP with programs in entries:OK #641/2 xdp_devmap_attach/DEVMAP with frags programs in entries:OK #641/3 xdp_devmap_attach/Verifier check of DEVMAP programs:OK #641/4 xdp_devmap_attach/DEVMAP with programs in entries on veth:OK #641 xdp_devmap_attach:OK Summary: 2/4 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250926171201.188490-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/xdp_devmap_attach.c | 31 ++++++++++++++++++- .../bpf/progs/test_xdp_devmap_tailcall.c | 29 +++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_devmap_tailcall.c diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c index 461ab18705d5c0..a8ab05216c3853 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -7,6 +7,7 @@ #include #include "test_xdp_devmap_helpers.skel.h" +#include "test_xdp_devmap_tailcall.skel.h" #include "test_xdp_with_devmap_frags_helpers.skel.h" #include "test_xdp_with_devmap_helpers.skel.h" @@ -107,6 +108,29 @@ static void test_neg_xdp_devmap_helpers(void) } } +static void test_xdp_devmap_tailcall(enum bpf_attach_type prog_dev, + enum bpf_attach_type prog_tail, + bool expect_reject) +{ + struct test_xdp_devmap_tailcall *skel; + int err; + + skel = test_xdp_devmap_tailcall__open(); + if (!ASSERT_OK_PTR(skel, "test_xdp_devmap_tailcall__open")) + return; + + bpf_program__set_expected_attach_type(skel->progs.xdp_devmap, prog_dev); + bpf_program__set_expected_attach_type(skel->progs.xdp_entry, prog_tail); + + err = test_xdp_devmap_tailcall__load(skel); + if (expect_reject) + ASSERT_ERR(err, "test_xdp_devmap_tailcall__load"); + else + ASSERT_OK(err, "test_xdp_devmap_tailcall__load"); + + test_xdp_devmap_tailcall__destroy(skel); +} + static void test_xdp_with_devmap_frags_helpers(void) { struct test_xdp_with_devmap_frags_helpers *skel; @@ -238,8 +262,13 @@ void serial_test_xdp_devmap_attach(void) if (test__start_subtest("DEVMAP with frags programs in entries")) test_xdp_with_devmap_frags_helpers(); - if (test__start_subtest("Verifier check of DEVMAP programs")) + if (test__start_subtest("Verifier check of DEVMAP programs")) { test_neg_xdp_devmap_helpers(); + test_xdp_devmap_tailcall(BPF_XDP_DEVMAP, BPF_XDP_DEVMAP, false); + test_xdp_devmap_tailcall(0, 0, true); + test_xdp_devmap_tailcall(BPF_XDP_DEVMAP, 0, true); + test_xdp_devmap_tailcall(0, BPF_XDP_DEVMAP, true); + } if (test__start_subtest("DEVMAP with programs in entries on veth")) test_xdp_with_devmap_helpers_veth(); diff --git a/tools/testing/selftests/bpf/progs/test_xdp_devmap_tailcall.c b/tools/testing/selftests/bpf/progs/test_xdp_devmap_tailcall.c new file mode 100644 index 00000000000000..814e2a980e97e9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_devmap_tailcall.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +SEC("xdp") +int xdp_devmap(struct xdp_md *ctx) +{ + return ctx->egress_ifindex; +} + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, int (void *)); +} xdp_map SEC(".maps") = { + .values = { + [0] = (void *)&xdp_devmap, + }, +}; + +SEC("xdp") +int xdp_entry(struct xdp_md *ctx) +{ + bpf_tail_call(ctx, &xdp_map, 0); + return 0; +} From 
b49dde7aa4e0a85f104faa6c6987e0d9ca8d9a0d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 25 Sep 2025 22:39:44 +0200 Subject: [PATCH 2942/3206] MAINTAINERS: delete email for Tharun Kumar P The email address bounced. I couldn't find a newer one in recent git history, so delete this email entry. Signed-off-by: Wolfram Sang --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index e2330f9a5d76f3..4de0f5faa49bc5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16689,7 +16689,6 @@ F: drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c F: drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c MICROCHIP PCI1XXXX I2C DRIVER -M: Tharun Kumar P M: Kumaravel Thiagarajan M: Microchip Linux Driver Support L: linux-i2c@vger.kernel.org @@ -16698,7 +16697,6 @@ F: drivers/i2c/busses/i2c-mchp-pci1xxxx.c MICROCHIP PCIe UART DRIVER M: Kumaravel Thiagarajan -M: Tharun Kumar P L: linux-serial@vger.kernel.org S: Maintained F: drivers/tty/serial/8250/8250_pci1xxxx.c From 9036d0882cdc366644ec1c70c37701631837f3b8 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 26 Sep 2025 13:50:57 +0200 Subject: [PATCH 2943/3206] MAINTAINERS: Add me as maintainer of Synopsys DesignWare I2C driver I volunteered as maintainer of the DesignWare I2C driver, so update my entry from reviewer to maintainer. Signed-off-by: Mika Westerberg Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4de0f5faa49bc5..c58e450e081dd0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24479,8 +24479,8 @@ F: Documentation/devicetree/bindings/media/snps,dw-hdmi-rx.yaml F: drivers/media/platform/synopsys/hdmirx/* SYNOPSYS DESIGNWARE I2C DRIVER +M: Mika Westerberg R: Andy Shevchenko -R: Mika Westerberg R: Jan Dabros L: linux-i2c@vger.kernel.org S: Supported From ed45b7a4da175539cd7ede514e4626e12a6d50ca Mon Sep 17 00:00:00 2001 From: Troy Mitchell Date: Fri, 26 Sep 2025 22:51:52 +0800 Subject: [PATCH 2944/3206] MAINTAINERS: add entry for SpacemiT K1 I2C driver Add a MAINTAINERS entry for the SpacemiT K1 I2C driver and its DT binding. Signed-off-by: Troy Mitchell Signed-off-by: Wolfram Sang --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c58e450e081dd0..b106e1ba3bed6d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23707,6 +23707,12 @@ W: https://linuxtv.org Q: http://patchwork.linuxtv.org/project/linux-media/list/ F: drivers/media/dvb-frontends/sp2* +SPACEMIT K1 I2C DRIVER +M: Troy Mitchell +S: Maintained +F: Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml +F: drivers/i2c/busses/i2c-k1.c + SPANISH DOCUMENTATION M: Carlos Bilbao R: Avadhut Naik From dc67521c20b701b5237ff486ae078829dc1f8fea Mon Sep 17 00:00:00 2001 From: John Madieu Date: Sat, 27 Sep 2025 23:08:24 +0200 Subject: [PATCH 2945/3206] thermal/drivers/renesas/rzg3e: Fix add thermal driver for the Renesas RZ/G3E SoC When applying commit 19d3a401a617, a conflict appeared that required a manual fix. However, the new file rzg3e_thermal.c was not added; it stayed locally in the source tree and was missed when merging the entire change. Fix this by adding the file back.
Fixes: 19d3a401a617 ("Add thermal driver for the Renesas RZ/G3E SoC") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509272225.sARVqv2G-lkp@intel.com Signed-off-by: John Madieu Signed-off-by: Daniel Lezcano --- drivers/thermal/renesas/rzg3e_thermal.c | 547 ++++++++++++++++++++++++ 1 file changed, 547 insertions(+) create mode 100644 drivers/thermal/renesas/rzg3e_thermal.c diff --git a/drivers/thermal/renesas/rzg3e_thermal.c b/drivers/thermal/renesas/rzg3e_thermal.c new file mode 100644 index 00000000000000..e66d73ca675277 --- /dev/null +++ b/drivers/thermal/renesas/rzg3e_thermal.c @@ -0,0 +1,547 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas RZ/G3E TSU Temperature Sensor Unit + * + * Copyright (C) 2025 Renesas Electronics Corporation + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../thermal_hwmon.h" + +/* TSU Register offsets and bits */ +#define TSU_SSUSR 0x00 +#define TSU_SSUSR_EN_TS BIT(0) +#define TSU_SSUSR_ADC_PD_TS BIT(1) +#define TSU_SSUSR_SOC_TS_EN BIT(2) + +#define TSU_STRGR 0x04 +#define TSU_STRGR_ADST BIT(0) + +#define TSU_SOSR1 0x08 +#define TSU_SOSR1_ADCT_8 0x03 +#define TSU_SOSR1_ADCS BIT(4) +#define TSU_SOSR1_OUTSEL BIT(9) + +#define TSU_SCRR 0x10 +#define TSU_SCRR_OUT12BIT_TS GENMASK(11, 0) + +#define TSU_SSR 0x14 +#define TSU_SSR_CONV BIT(0) + +#define TSU_CMSR 0x18 +#define TSU_CMSR_CMPEN BIT(0) + +#define TSU_LLSR 0x1C +#define TSU_ULSR 0x20 + +#define TSU_SISR 0x30 +#define TSU_SISR_ADF BIT(0) +#define TSU_SISR_CMPF BIT(1) + +#define TSU_SIER 0x34 +#define TSU_SIER_CMPIE BIT(1) + +#define TSU_SICR 0x38 +#define TSU_SICR_ADCLR BIT(0) +#define TSU_SICR_CMPCLR BIT(1) + +/* Temperature calculation constants from datasheet */ +#define TSU_TEMP_D (-41) +#define TSU_TEMP_E 126 +#define TSU_CODE_MAX 0xFFF + +/* Timing specifications from datasheet */ +#define TSU_POWERUP_TIME_US 120 /* 120T at 1MHz sensor clock per datasheet */ +#define TSU_CONV_TIME_US 50 /* Per sample conversion time */ +#define TSU_POLL_DELAY_US 10 /* Polling interval */ +#define TSU_MIN_CLOCK_RATE 24000000 /* TSU_PCLK minimum 24MHz */ + +/** + * struct rzg3e_thermal_priv - RZ/G3E TSU private data + * @base: TSU register base + * @dev: device pointer + * @syscon: regmap for calibration values + * @zone: thermal zone device + * @rstc: reset control + * @trmval0: calibration value 0 (b) + * @trmval1: calibration value 1 (c) + * @trim_offset: offset for trim registers in syscon + * @lock: protects hardware access during conversions + */ +struct rzg3e_thermal_priv { + void __iomem *base; + struct device *dev; + struct regmap *syscon; + struct thermal_zone_device *zone; + struct reset_control *rstc; + u16 trmval0; + u16 trmval1; + u32 trim_offset; + struct mutex lock; +}; + +static int rzg3e_thermal_power_on(struct rzg3e_thermal_priv *priv) +{ + u32 val; + int ret; + + /* Clear any pending interrupts */ + writel(TSU_SICR_ADCLR | TSU_SICR_CMPCLR, priv->base + TSU_SICR); + + /* Disable all interrupts during setup */ + writel(0, priv->base + TSU_SIER); + + /* + * Power-on sequence per datasheet 7.11.9.1: + * SOC_TS_EN must be set at same time or before EN_TS and ADC_PD_TS + */ + val = TSU_SSUSR_SOC_TS_EN | TSU_SSUSR_EN_TS; + writel(val, priv->base + TSU_SSUSR); + + /* Wait for sensor stabilization per datasheet 7.11.7.1 */ + usleep_range(TSU_POWERUP_TIME_US, TSU_POWERUP_TIME_US + 10); + + /* Configure for average mode with 8 samples */ 
+ val = TSU_SOSR1_OUTSEL | TSU_SOSR1_ADCT_8; + writel(val, priv->base + TSU_SOSR1); + + /* Ensure we're in single scan mode (default) */ + val = readl(priv->base + TSU_SOSR1); + if (val & TSU_SOSR1_ADCS) { + dev_err(priv->dev, "Invalid scan mode setting\n"); + return -EINVAL; + } + + /* Wait for any ongoing conversion to complete */ + ret = readl_poll_timeout(priv->base + TSU_SSR, val, + !(val & TSU_SSR_CONV), + TSU_POLL_DELAY_US, + USEC_PER_MSEC); + if (ret) { + dev_err(priv->dev, "Timeout waiting for conversion\n"); + return ret; + } + + return 0; +} + +static void rzg3e_thermal_power_off(struct rzg3e_thermal_priv *priv) +{ + /* Disable all interrupts */ + writel(0, priv->base + TSU_SIER); + + /* Clear pending interrupts */ + writel(TSU_SICR_ADCLR | TSU_SICR_CMPCLR, priv->base + TSU_SICR); + + /* Power down sequence per datasheet */ + writel(TSU_SSUSR_ADC_PD_TS, priv->base + TSU_SSUSR); +} + +/* + * Convert 12-bit sensor code to temperature in millicelsius + * Formula from datasheet 7.11.7.8: + * T(°C) = ((e - d) / (c - b)) * (a - b) + d + * where: a = sensor code, b = trmval0, c = trmval1, d = -41, e = 126 + */ +static int rzg3e_thermal_code_to_temp(struct rzg3e_thermal_priv *priv, u16 code) +{ + int temp_e_mc = TSU_TEMP_E * MILLIDEGREE_PER_DEGREE; + int temp_d_mc = TSU_TEMP_D * MILLIDEGREE_PER_DEGREE; + s64 numerator, denominator; + int temp_mc; + + numerator = (temp_e_mc - temp_d_mc) * (s64)(code - priv->trmval0); + denominator = priv->trmval1 - priv->trmval0; + + temp_mc = div64_s64(numerator, denominator) + temp_d_mc; + + return clamp(temp_mc, temp_d_mc, temp_e_mc); +} + +/* + * Convert temperature in millicelsius to 12-bit sensor code + * Formula from datasheet 7.11.7.9 (inverse of above) + */ +static u16 rzg3e_thermal_temp_to_code(struct rzg3e_thermal_priv *priv, int temp_mc) +{ + int temp_e_mc = TSU_TEMP_E * MILLIDEGREE_PER_DEGREE; + int temp_d_mc = TSU_TEMP_D * MILLIDEGREE_PER_DEGREE; + s64 numerator, denominator; + s64 code; + + numerator = (temp_mc - temp_d_mc) * (priv->trmval1 - priv->trmval0); + denominator = temp_e_mc - temp_d_mc; + + code = div64_s64(numerator, denominator) + priv->trmval0; + + return clamp_val(code, 0, TSU_CODE_MAX); +} + +static int rzg3e_thermal_get_temp(struct thermal_zone_device *tz, int *temp) +{ + struct rzg3e_thermal_priv *priv = thermal_zone_device_priv(tz); + u32 status, code; + int ret, timeout; + + ret = pm_runtime_resume_and_get(priv->dev); + if (ret < 0) + return ret; + + guard(mutex)(&priv->lock); + + /* Clear any previous conversion status */ + writel(TSU_SICR_ADCLR, priv->base + TSU_SICR); + + /* Start single conversion */ + writel(TSU_STRGR_ADST, priv->base + TSU_STRGR); + + /* Wait for conversion completion - 8 samples at ~50us each */ + timeout = TSU_CONV_TIME_US * 8 * 2; /* Double for margin */ + ret = readl_poll_timeout(priv->base + TSU_SISR, status, + status & TSU_SISR_ADF, + TSU_POLL_DELAY_US, timeout); + if (ret) { + dev_err(priv->dev, "Conversion timeout (status=0x%08x)\n", status); + goto out; + } + + /* Read the averaged result and clear the complete flag */ + code = readl(priv->base + TSU_SCRR) & TSU_SCRR_OUT12BIT_TS; + writel(TSU_SICR_ADCLR, priv->base + TSU_SICR); + + /* Convert to temperature */ + *temp = rzg3e_thermal_code_to_temp(priv, code); + + dev_dbg(priv->dev, "temp=%d mC (%d.%03d°C), code=0x%03x\n", + *temp, *temp / 1000, abs(*temp) % 1000, code); + +out: + pm_runtime_mark_last_busy(priv->dev); + pm_runtime_put_autosuspend(priv->dev); + return ret; +} + +static int rzg3e_thermal_set_trips(struct thermal_zone_device 
*tz, + int low, int high) +{ + struct rzg3e_thermal_priv *priv = thermal_zone_device_priv(tz); + u16 low_code, high_code; + u32 val; + int ret; + + /* Hardware requires low < high */ + if (low >= high) + return -EINVAL; + + ret = pm_runtime_resume_and_get(priv->dev); + if (ret < 0) + return ret; + + guard(mutex)(&priv->lock); + + /* Convert temperatures to codes */ + low_code = rzg3e_thermal_temp_to_code(priv, low); + high_code = rzg3e_thermal_temp_to_code(priv, high); + + dev_dbg(priv->dev, "set_trips: low=%d high=%d (codes: 0x%03x/0x%03x)\n", + low, high, low_code, high_code); + + /* Disable comparison during reconfiguration */ + writel(0, priv->base + TSU_SIER); + writel(0, priv->base + TSU_CMSR); + + /* Clear any pending comparison interrupts */ + writel(TSU_SICR_CMPCLR, priv->base + TSU_SICR); + + /* Set trip points */ + writel(low_code, priv->base + TSU_LLSR); + writel(high_code, priv->base + TSU_ULSR); + + /* + * Ensure OUTSEL is set for comparison per datasheet 7.11.7.4 + * Comparison uses averaged data + */ + val = readl(priv->base + TSU_SOSR1); + val |= TSU_SOSR1_OUTSEL; + writel(val, priv->base + TSU_SOSR1); + + /* Enable comparison with "out of range" mode (CMPCOND=0) */ + writel(TSU_CMSR_CMPEN, priv->base + TSU_CMSR); + + /* Unmask compare IRQ and start a conversion to evaluate window */ + writel(TSU_SIER_CMPIE, priv->base + TSU_SIER); + writel(TSU_STRGR_ADST, priv->base + TSU_STRGR); + + pm_runtime_mark_last_busy(priv->dev); + pm_runtime_put_autosuspend(priv->dev); + + return 0; +} + +static irqreturn_t rzg3e_thermal_irq_thread(int irq, void *data) +{ + struct rzg3e_thermal_priv *priv = data; + + dev_dbg(priv->dev, "Temperature threshold crossed\n"); + + /* Notify thermal framework to re-evaluate trip points */ + thermal_zone_device_update(priv->zone, THERMAL_TRIP_VIOLATED); + + return IRQ_HANDLED; +} + +static irqreturn_t rzg3e_thermal_irq(int irq, void *data) +{ + struct rzg3e_thermal_priv *priv = data; + u32 status; + + status = readl(priv->base + TSU_SISR); + + /* Check if comparison interrupt occurred */ + if (status & TSU_SISR_CMPF) { + /* Clear irq flag and disable interrupt until reconfigured */ + writel(TSU_SICR_CMPCLR, priv->base + TSU_SICR); + writel(0, priv->base + TSU_SIER); + + return IRQ_WAKE_THREAD; + } + + return IRQ_NONE; +} + +static const struct thermal_zone_device_ops rzg3e_tz_ops = { + .get_temp = rzg3e_thermal_get_temp, + .set_trips = rzg3e_thermal_set_trips, +}; + +static int rzg3e_thermal_get_calibration(struct rzg3e_thermal_priv *priv) +{ + u32 val; + int ret; + + /* Read calibration values from syscon */ + ret = regmap_read(priv->syscon, priv->trim_offset, &val); + if (ret) + return ret; + priv->trmval0 = val & GENMASK(11, 0); + + ret = regmap_read(priv->syscon, priv->trim_offset + 4, &val); + if (ret) + return ret; + priv->trmval1 = val & GENMASK(11, 0); + + /* Validate calibration data */ + if (!priv->trmval0 || !priv->trmval1 || + priv->trmval0 == priv->trmval1 || + priv->trmval0 == 0xFFF || priv->trmval1 == 0xFFF) { + dev_err(priv->dev, "Invalid calibration: b=0x%03x, c=0x%03x\n", + priv->trmval0, priv->trmval1); + return -EINVAL; + } + + dev_dbg(priv->dev, "Calibration: b=0x%03x (%u), c=0x%03x (%u)\n", + priv->trmval0, priv->trmval0, priv->trmval1, priv->trmval1); + + return 0; +} + +static int rzg3e_thermal_parse_dt(struct rzg3e_thermal_priv *priv) +{ + struct device_node *np = priv->dev->of_node; + u32 offset; + + priv->syscon = syscon_regmap_lookup_by_phandle_args(np, "renesas,tsu-trim", 1, &offset); + if (IS_ERR(priv->syscon)) + return 
dev_err_probe(priv->dev, PTR_ERR(priv->syscon), + "Failed to parse renesas,tsu-trim\n"); + + priv->trim_offset = offset; + return 0; +} + +static int rzg3e_thermal_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rzg3e_thermal_priv *priv; + struct clk *clk; + int irq, ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = dev; + ret = devm_mutex_init(dev, &priv->lock); + if (ret) + return ret; + platform_set_drvdata(pdev, priv); + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + /* Parse device tree for trim register info */ + ret = rzg3e_thermal_parse_dt(priv); + if (ret) + return ret; + + /* Get clock to verify frequency - clock is managed by power domain */ + clk = devm_clk_get(dev, NULL); + if (IS_ERR(clk)) + return dev_err_probe(dev, PTR_ERR(clk), + "Failed to get clock\n"); + + if (clk_get_rate(clk) < TSU_MIN_CLOCK_RATE) + return dev_err_probe(dev, -EINVAL, + "Clock rate %lu Hz too low (min %u Hz)\n", + clk_get_rate(clk), TSU_MIN_CLOCK_RATE); + + priv->rstc = devm_reset_control_get_exclusive_deasserted(dev, NULL); + if (IS_ERR(priv->rstc)) + return dev_err_probe(dev, PTR_ERR(priv->rstc), + "Failed to get/deassert reset control\n"); + + /* Get calibration data */ + ret = rzg3e_thermal_get_calibration(priv); + if (ret) + return dev_err_probe(dev, ret, + "Failed to get valid calibration data\n"); + + /* Get comparison interrupt */ + irq = platform_get_irq_byname(pdev, "adcmpi"); + if (irq < 0) + return irq; + + /* Enable runtime PM */ + pm_runtime_set_autosuspend_delay(dev, 1000); + pm_runtime_use_autosuspend(dev); + devm_pm_runtime_enable(dev); + + /* Initial hardware setup */ + ret = pm_runtime_resume_and_get(dev); + if (ret < 0) + return dev_err_probe(dev, ret, "Runtime resume failed\n"); + + /* Register thermal zone - this will trigger DT parsing */ + priv->zone = devm_thermal_of_zone_register(dev, 0, priv, &rzg3e_tz_ops); + if (IS_ERR(priv->zone)) { + ret = PTR_ERR(priv->zone); + dev_err(dev, "Failed to register thermal zone: %d\n", ret); + goto err_pm_put; + } + + /* Request threaded IRQ for comparison interrupt */ + ret = devm_request_threaded_irq(dev, irq, rzg3e_thermal_irq, + rzg3e_thermal_irq_thread, + IRQF_ONESHOT, "rzg3e_thermal", priv); + if (ret) { + dev_err(dev, "Failed to request IRQ: %d\n", ret); + goto err_pm_put; + } + + /* Add hwmon sysfs interface */ + ret = devm_thermal_add_hwmon_sysfs(dev, priv->zone); + if (ret) + dev_warn(dev, "Failed to add hwmon sysfs attributes\n"); + + pm_runtime_mark_last_busy(dev); + pm_runtime_put_autosuspend(dev); + + dev_info(dev, "RZ/G3E thermal sensor registered\n"); + + return 0; + +err_pm_put: + pm_runtime_put_sync(dev); + return ret; +} + +static int rzg3e_thermal_runtime_suspend(struct device *dev) +{ + struct rzg3e_thermal_priv *priv = dev_get_drvdata(dev); + + rzg3e_thermal_power_off(priv); + return 0; +} + +static int rzg3e_thermal_runtime_resume(struct device *dev) +{ + struct rzg3e_thermal_priv *priv = dev_get_drvdata(dev); + + return rzg3e_thermal_power_on(priv); +} + +static int rzg3e_thermal_suspend(struct device *dev) +{ + struct rzg3e_thermal_priv *priv = dev_get_drvdata(dev); + + /* If device is active, power it off */ + if (pm_runtime_active(dev)) + rzg3e_thermal_power_off(priv); + + /* Assert reset to ensure clean state after resume */ + reset_control_assert(priv->rstc); + + return 0; +} + +static int rzg3e_thermal_resume(struct device *dev) +{ + struct 
rzg3e_thermal_priv *priv = dev_get_drvdata(dev); + int ret; + + /* Deassert reset */ + ret = reset_control_deassert(priv->rstc); + if (ret) { + dev_err(dev, "Failed to deassert reset: %d\n", ret); + return ret; + } + + /* If device was active before suspend, power it back on */ + if (pm_runtime_active(dev)) + return rzg3e_thermal_power_on(priv); + + return 0; +} + +static const struct dev_pm_ops rzg3e_thermal_pm_ops = { + RUNTIME_PM_OPS(rzg3e_thermal_runtime_suspend, + rzg3e_thermal_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(rzg3e_thermal_suspend, rzg3e_thermal_resume) +}; + +static const struct of_device_id rzg3e_thermal_dt_ids[] = { + { .compatible = "renesas,r9a09g047-tsu" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, rzg3e_thermal_dt_ids); + +static struct platform_driver rzg3e_thermal_driver = { + .driver = { + .name = "rzg3e_thermal", + .of_match_table = rzg3e_thermal_dt_ids, + .pm = pm_ptr(&rzg3e_thermal_pm_ops), + }, + .probe = rzg3e_thermal_probe, +}; +module_platform_driver(rzg3e_thermal_driver); + +MODULE_DESCRIPTION("Renesas RZ/G3E TSU Thermal Sensor Driver"); +MODULE_AUTHOR("John Madieu "); +MODULE_LICENSE("GPL"); From 095530512152e6811278de9c30f170f0ac9705eb Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 27 Sep 2025 11:52:16 +0200 Subject: [PATCH 2946/3206] i2c: rtl9300: Drop unsupported I2C_FUNC_SMBUS_I2C_BLOCK While applying the patch for commit ede965fd555a ("i2c: rtl9300: remove broken SMBus Quick operation support"), a conflict was incorrectly solved by adding the I2C_FUNC_SMBUS_I2C_BLOCK feature flag. But the code to handle I2C_SMBUS_I2C_BLOCK_DATA requests will be added by a separate commit. Fixes: ede965fd555a ("i2c: rtl9300: remove broken SMBus Quick operation support") Signed-off-by: Sven Eckelmann Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-rtl9300.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-rtl9300.c b/drivers/i2c/busses/i2c-rtl9300.c index 9e1f71fed0feac..af991b28e4f835 100644 --- a/drivers/i2c/busses/i2c-rtl9300.c +++ b/drivers/i2c/busses/i2c-rtl9300.c @@ -307,8 +307,7 @@ static int rtl9300_i2c_smbus_xfer(struct i2c_adapter *adap, u16 addr, unsigned s static u32 rtl9300_i2c_func(struct i2c_adapter *a) { return I2C_FUNC_SMBUS_BYTE | I2C_FUNC_SMBUS_BYTE_DATA | - I2C_FUNC_SMBUS_WORD_DATA | I2C_FUNC_SMBUS_BLOCK_DATA | - I2C_FUNC_SMBUS_I2C_BLOCK; + I2C_FUNC_SMBUS_WORD_DATA | I2C_FUNC_SMBUS_BLOCK_DATA; } static const struct i2c_algorithm rtl9300_i2c_algo = { From ec67ef0a501d7dc915d84c0ada4ab3697adfa1d9 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 11 Jun 2025 13:08:00 +0200 Subject: [PATCH 2947/3206] dt-bindings: i2c: i2c-mt65xx: Add MediaTek MT8196/6991 compatibles Add support for the MediaTek MT8196 Chromebook SoC and for its close relative, the MediaTek Dimensity 9400 MT6991 SoC. Those chips' multiple I2C controller instances are compatible with the ones found in the MT8188 SoC. 
Signed-off-by: AngeloGioacchino Del Regno Acked-by: Conor Dooley [wsa: rebased] Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml b/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml index bd6811cbde701c..3562ce0c0f7e48 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml +++ b/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml @@ -53,6 +53,8 @@ properties: - items: - enum: - mediatek,mt6878-i2c + - mediatek,mt6991-i2c + - mediatek,mt8196-i2c - const: mediatek,mt8188-i2c - items: - enum: From cb3005d4c490fe2489accd5408592683d705b455 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 8 Aug 2025 16:09:56 +0300 Subject: [PATCH 2948/3206] i2c: i801: Add support for Intel Wildcat Lake-U Add SMBus IDs on Intel Wildcat Lake-U. Signed-off-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- Documentation/i2c/busses/i2c-i801.rst | 1 + drivers/i2c/busses/Kconfig | 1 + drivers/i2c/busses/i2c-i801.c | 3 +++ 3 files changed, 5 insertions(+) diff --git a/Documentation/i2c/busses/i2c-i801.rst b/Documentation/i2c/busses/i2c-i801.rst index 47e8ac5b7099f7..36c563ad3f068f 100644 --- a/Documentation/i2c/busses/i2c-i801.rst +++ b/Documentation/i2c/busses/i2c-i801.rst @@ -50,6 +50,7 @@ Supported adapters: * Intel Birch Stream (SOC) * Intel Arrow Lake (SOC) * Intel Panther Lake (SOC) + * Intel Wildcat Lake (SOC) Datasheets: Publicly available at the Intel website diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 070d014fdc5d5d..0c77b1d4c260de 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -165,6 +165,7 @@ config I2C_I801 Birch Stream (SOC) Arrow Lake (SOC) Panther Lake (SOC) + Wildcat Lake (SOC) This driver can also be built as a module. If so, the module will be called i2c-i801. diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index e94ac746a741af..cba992fa655791 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -83,6 +83,7 @@ * Arrow Lake-H (SOC) 0x7722 32 hard yes yes yes * Panther Lake-H (SOC) 0xe322 32 hard yes yes yes * Panther Lake-P (SOC) 0xe422 32 hard yes yes yes + * Wildcat Lake-U (SOC) 0x4d22 32 hard yes yes yes * * Features supported by this driver: * Software PEC no @@ -236,6 +237,7 @@ #define PCI_DEVICE_ID_INTEL_5_3400_SERIES_SMBUS 0x3b30 #define PCI_DEVICE_ID_INTEL_TIGERLAKE_H_SMBUS 0x43a3 #define PCI_DEVICE_ID_INTEL_ELKHART_LAKE_SMBUS 0x4b23 +#define PCI_DEVICE_ID_INTEL_WILDCAT_LAKE_U_SMBUS 0x4d22 #define PCI_DEVICE_ID_INTEL_JASPER_LAKE_SMBUS 0x4da3 #define PCI_DEVICE_ID_INTEL_ALDER_LAKE_P_SMBUS 0x51a3 #define PCI_DEVICE_ID_INTEL_ALDER_LAKE_M_SMBUS 0x54a3 @@ -1056,6 +1058,7 @@ static const struct pci_device_id i801_ids[] = { { PCI_DEVICE_DATA(INTEL, ARROW_LAKE_H_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) }, { PCI_DEVICE_DATA(INTEL, PANTHER_LAKE_H_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) }, { PCI_DEVICE_DATA(INTEL, PANTHER_LAKE_P_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) }, + { PCI_DEVICE_DATA(INTEL, WILDCAT_LAKE_U_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) }, { 0, } }; From 40d4c761200b796a44bf2c7675ae09c87b17d4af Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 28 Sep 2025 10:19:10 +0900 Subject: [PATCH 2949/3206] firewire: core: fix undefined reference error in ARM EABI For ARM EABI, GCC generates a reference to __aeabi_uldivmod when compiling a division of 64-bit integer with 32-bit integer. 
This function is not available in Linux kernel. In such cases, helper macros are defined in include/linux/math64.h. This commit replaces the division with div_u64(). Fixes: 8ec6a8ec23b9 ("firewire: core: suppress overflow warning when computing jiffies from isochronous cycle") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509270428.FZaO2PPq-lkp@intel.com/ Link: https://lore.kernel.org/r/20250928011910.581475-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 2dd715a580ac8b..e67395ce26b5e3 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -30,7 +30,7 @@ struct fw_packet; // This is the arbitrary value we use to indicate a mismatched gap count. #define GAP_COUNT_MISMATCHED 0 -#define isoc_cycles_to_jiffies(cycles) usecs_to_jiffies((u32)((u64)(cycles) * USEC_PER_SEC / 8000)) +#define isoc_cycles_to_jiffies(cycles) usecs_to_jiffies((u32)div_u64((u64)cycles * USEC_PER_SEC, 8000)) extern __printf(2, 3) void fw_err(const struct fw_card *card, const char *fmt, ...); From ab91835e61ab56d3964f51480955c9661678c269 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 23 Sep 2025 14:03:25 +0100 Subject: [PATCH 2950/3206] ASoC: cs35l56: Set fw_regs table after getting REVID Defer setting the cs35l56_base.fw_regs pointer until after the REVID has been read in cs35l56_hw_init(). Also make the corresponding change to the cs35l56_hda drivers to prevent a build break. This is preparing for firmware registers that change address between revisions of the same device. Signed-off-by: Richard Fitzgerald Signed-off-by: Takashi Iwai --- include/sound/cs35l56.h | 3 - sound/hda/codecs/side-codecs/cs35l56_hda.c | 1 + .../hda/codecs/side-codecs/cs35l56_hda_i2c.c | 2 - .../hda/codecs/side-codecs/cs35l56_hda_spi.c | 2 - sound/soc/codecs/cs35l56-i2c.c | 4 +- sound/soc/codecs/cs35l56-sdw.c | 4 +- sound/soc/codecs/cs35l56-shared.c | 59 +++++++++++-------- sound/soc/codecs/cs35l56-spi.c | 2 +- 8 files changed, 41 insertions(+), 36 deletions(-) diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 7c8bbe8ad1e2de..20dc3ee6378da1 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -337,9 +337,6 @@ extern const struct regmap_config cs35l56_regmap_sdw; extern const struct regmap_config cs35l63_regmap_i2c; extern const struct regmap_config cs35l63_regmap_sdw; -extern const struct cs35l56_fw_reg cs35l56_fw_reg; -extern const struct cs35l56_fw_reg cs35l63_fw_reg; - extern const struct cirrus_amp_cal_controls cs35l56_calibration_controls; extern const char * const cs35l56_tx_input_texts[CS35L56_NUM_INPUT_SRC]; diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda.c b/sound/hda/codecs/side-codecs/cs35l56_hda.c index 36fa62a41984b9..5bb1c4ebeaf3cb 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda.c @@ -1049,6 +1049,7 @@ int cs35l56_hda_common_probe(struct cs35l56_hda *cs35l56, int hid, int id) goto err; } + cs35l56->base.type = hid & 0xff; cs35l56->base.cal_index = -1; cs35l56_init_cs_dsp(&cs35l56->base, &cs35l56->cs_dsp); diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c b/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c index d10209e4eddd5b..1072f17385ac52 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c @@ -27,8 +27,6 @@ static int cs35l56_hda_i2c_probe(struct 
i2c_client *clt) cs35l56->base.can_hibernate = true; #endif - cs35l56->base.fw_reg = &cs35l56_fw_reg; - cs35l56->base.regmap = devm_regmap_init_i2c(clt, &cs35l56_regmap_i2c); if (IS_ERR(cs35l56->base.regmap)) { ret = PTR_ERR(cs35l56->base.regmap); diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c b/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c index f57533d3d728f5..f802c83c57b4ef 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c @@ -30,8 +30,6 @@ static int cs35l56_hda_spi_probe(struct spi_device *spi) cs35l56->base.can_hibernate = true; #endif - cs35l56->base.fw_reg = &cs35l56_fw_reg; - cs35l56->base.regmap = devm_regmap_init_spi(spi, &cs35l56_regmap_spi); if (IS_ERR(cs35l56->base.regmap)) { ret = PTR_ERR(cs35l56->base.regmap); diff --git a/sound/soc/codecs/cs35l56-i2c.c b/sound/soc/codecs/cs35l56-i2c.c index 073f1796ae2910..0492ddc4102d80 100644 --- a/sound/soc/codecs/cs35l56-i2c.c +++ b/sound/soc/codecs/cs35l56-i2c.c @@ -35,11 +35,11 @@ static int cs35l56_i2c_probe(struct i2c_client *client) switch (id) { case 0x3556: regmap_config = &cs35l56_regmap_i2c; - cs35l56->base.fw_reg = &cs35l56_fw_reg; + cs35l56->base.type = 0x56; break; case 0x3563: regmap_config = &cs35l63_regmap_i2c; - cs35l56->base.fw_reg = &cs35l63_fw_reg; + cs35l56->base.type = 0x63; break; default: return -ENODEV; diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c index 3905c9cb188a8c..42d24ac2977fc8 100644 --- a/sound/soc/codecs/cs35l56-sdw.c +++ b/sound/soc/codecs/cs35l56-sdw.c @@ -527,16 +527,16 @@ static int cs35l56_sdw_probe(struct sdw_slave *peripheral, const struct sdw_devi case 0x3556: case 0x3557: regmap_config = &cs35l56_regmap_sdw; - cs35l56->base.fw_reg = &cs35l56_fw_reg; break; case 0x3563: regmap_config = &cs35l63_regmap_sdw; - cs35l56->base.fw_reg = &cs35l63_fw_reg; break; default: return -ENODEV; } + cs35l56->base.type = ((unsigned int)id->driver_data) & 0xff; + cs35l56->base.regmap = devm_regmap_init(dev, &cs35l56_regmap_bus_sdw, peripheral, regmap_config); if (IS_ERR(cs35l56->base.regmap)) { diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index 95d018ecb95386..03ea66f08787ee 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -309,6 +309,40 @@ static bool cs35l63_volatile_reg(struct device *dev, unsigned int reg) } } +static const struct cs35l56_fw_reg cs35l56_fw_reg = { + .fw_ver = CS35L56_DSP1_FW_VER, + .halo_state = CS35L56_DSP1_HALO_STATE, + .pm_cur_stat = CS35L56_DSP1_PM_CUR_STATE, + .prot_sts = CS35L56_PROTECTION_STATUS, + .transducer_actual_ps = CS35L56_TRANSDUCER_ACTUAL_PS, + .user_mute = CS35L56_MAIN_RENDER_USER_MUTE, + .user_volume = CS35L56_MAIN_RENDER_USER_VOLUME, + .posture_number = CS35L56_MAIN_POSTURE_NUMBER, +}; + +static const struct cs35l56_fw_reg cs35l63_fw_reg = { + .fw_ver = CS35L63_DSP1_FW_VER, + .halo_state = CS35L63_DSP1_HALO_STATE, + .pm_cur_stat = CS35L63_DSP1_PM_CUR_STATE, + .prot_sts = CS35L63_PROTECTION_STATUS, + .transducer_actual_ps = CS35L63_TRANSDUCER_ACTUAL_PS, + .user_mute = CS35L63_MAIN_RENDER_USER_MUTE, + .user_volume = CS35L63_MAIN_RENDER_USER_VOLUME, + .posture_number = CS35L63_MAIN_POSTURE_NUMBER, +}; + +static void cs35l56_set_fw_reg_table(struct cs35l56_base *cs35l56_base) +{ + switch (cs35l56_base->type) { + default: + cs35l56_base->fw_reg = &cs35l56_fw_reg; + break; + case 0x63: + cs35l56_base->fw_reg = &cs35l63_fw_reg; + break; + } +} + int cs35l56_mbox_send(struct cs35l56_base 
*cs35l56_base, unsigned int command) { unsigned int val; @@ -979,6 +1013,7 @@ int cs35l56_hw_init(struct cs35l56_base *cs35l56_base) return ret; } cs35l56_base->rev = revid & (CS35L56_AREVID_MASK | CS35L56_MTLREVID_MASK); + cs35l56_set_fw_reg_table(cs35l56_base); ret = cs35l56_wait_for_firmware_boot(cs35l56_base); if (ret) @@ -1280,30 +1315,6 @@ const struct regmap_config cs35l63_regmap_sdw = { }; EXPORT_SYMBOL_NS_GPL(cs35l63_regmap_sdw, "SND_SOC_CS35L56_SHARED"); -const struct cs35l56_fw_reg cs35l56_fw_reg = { - .fw_ver = CS35L56_DSP1_FW_VER, - .halo_state = CS35L56_DSP1_HALO_STATE, - .pm_cur_stat = CS35L56_DSP1_PM_CUR_STATE, - .prot_sts = CS35L56_PROTECTION_STATUS, - .transducer_actual_ps = CS35L56_TRANSDUCER_ACTUAL_PS, - .user_mute = CS35L56_MAIN_RENDER_USER_MUTE, - .user_volume = CS35L56_MAIN_RENDER_USER_VOLUME, - .posture_number = CS35L56_MAIN_POSTURE_NUMBER, -}; -EXPORT_SYMBOL_NS_GPL(cs35l56_fw_reg, "SND_SOC_CS35L56_SHARED"); - -const struct cs35l56_fw_reg cs35l63_fw_reg = { - .fw_ver = CS35L63_DSP1_FW_VER, - .halo_state = CS35L63_DSP1_HALO_STATE, - .pm_cur_stat = CS35L63_DSP1_PM_CUR_STATE, - .prot_sts = CS35L63_PROTECTION_STATUS, - .transducer_actual_ps = CS35L63_TRANSDUCER_ACTUAL_PS, - .user_mute = CS35L63_MAIN_RENDER_USER_MUTE, - .user_volume = CS35L63_MAIN_RENDER_USER_VOLUME, - .posture_number = CS35L63_MAIN_POSTURE_NUMBER, -}; -EXPORT_SYMBOL_NS_GPL(cs35l63_fw_reg, "SND_SOC_CS35L56_SHARED"); - MODULE_DESCRIPTION("ASoC CS35L56 Shared"); MODULE_AUTHOR("Richard Fitzgerald "); MODULE_AUTHOR("Simon Trimmer "); diff --git a/sound/soc/codecs/cs35l56-spi.c b/sound/soc/codecs/cs35l56-spi.c index c2ddee22cd231b..9bc9b7c98390dc 100644 --- a/sound/soc/codecs/cs35l56-spi.c +++ b/sound/soc/codecs/cs35l56-spi.c @@ -26,7 +26,7 @@ static int cs35l56_spi_probe(struct spi_device *spi) spi_set_drvdata(spi, cs35l56); - cs35l56->base.fw_reg = &cs35l56_fw_reg; + cs35l56->base.type = 0x56; cs35l56->base.regmap = devm_regmap_init_spi(spi, regmap_config); if (IS_ERR(cs35l56->base.regmap)) { From 33da2d892b6241a3e71f96acdd0e64de5d70b7f3 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 23 Sep 2025 14:03:26 +0100 Subject: [PATCH 2951/3206] ASoC: cs35l56: Add support for CS35L56 B2 silicon This adds support for changed firmware addresses on the B2 revision of CS35L56 silicon. 
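The pattern here, keying the firmware register table off the silicon revision read at init, can be sketched compactly; in the sketch below select_fw_reg() is an illustrative stand-in for the driver's cs35l56_set_fw_reg_table(), while the two address pairs mirror the CS35L56_DSP1_* and CS35L56_B2_DSP1_* defines added by this patch:

#include <stdio.h>

struct fw_reg {
	unsigned int halo_state;
	unsigned int pm_cur_state;
};

/* Pre-B2 (B0) addresses vs. the relocated B2 addresses. */
static const struct fw_reg fw_reg_b0 = { 0x28021E0, 0x2804308 };
static const struct fw_reg fw_reg_b2 = { 0x2803D20, 0x2804678 };

/* Choose the table only after REVID has been read from the chip.
 * Unknown (newer) revisions fall back to the B2 layout, matching
 * the default case in the patch. */
static const struct fw_reg *select_fw_reg(unsigned int rev)
{
	switch (rev) {
	case 0xb0:
		return &fw_reg_b0;
	default:
		return &fw_reg_b2;
	}
}

int main(void)
{
	const struct fw_reg *regs = select_fw_reg(0xb2);

	printf("HALO_STATE @ 0x%07X, PM_CUR_STATE @ 0x%07X\n",
	       regs->halo_state, regs->pm_cur_state);
	return 0;
}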
Signed-off-by: Richard Fitzgerald Signed-off-by: Takashi Iwai --- include/sound/cs35l56.h | 2 ++ sound/soc/codecs/cs35l56-shared.c | 40 +++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 20dc3ee6378da1..ab044ce2aa8b35 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -85,7 +85,9 @@ #define CS35L56_DSP1_XMEM_UNPACKED24_0 0x2800000 #define CS35L56_DSP1_FW_VER 0x2800010 #define CS35L56_DSP1_HALO_STATE 0x28021E0 +#define CS35L56_B2_DSP1_HALO_STATE 0x2803D20 #define CS35L56_DSP1_PM_CUR_STATE 0x2804308 +#define CS35L56_B2_DSP1_PM_CUR_STATE 0x2804678 #define CS35L56_DSP1_XMEM_UNPACKED24_8191 0x2807FFC #define CS35L56_DSP1_CORE_BASE 0x2B80000 #define CS35L56_DSP1_SCRATCH1 0x2B805C0 diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index 03ea66f08787ee..9e6b9ca2f3547d 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -320,6 +320,17 @@ static const struct cs35l56_fw_reg cs35l56_fw_reg = { .posture_number = CS35L56_MAIN_POSTURE_NUMBER, }; +static const struct cs35l56_fw_reg cs35l56_b2_fw_reg = { + .fw_ver = CS35L56_DSP1_FW_VER, + .halo_state = CS35L56_B2_DSP1_HALO_STATE, + .pm_cur_stat = CS35L56_B2_DSP1_PM_CUR_STATE, + .prot_sts = CS35L56_PROTECTION_STATUS, + .transducer_actual_ps = CS35L56_TRANSDUCER_ACTUAL_PS, + .user_mute = CS35L56_MAIN_RENDER_USER_MUTE, + .user_volume = CS35L56_MAIN_RENDER_USER_VOLUME, + .posture_number = CS35L56_MAIN_POSTURE_NUMBER, +}; + static const struct cs35l56_fw_reg cs35l63_fw_reg = { .fw_ver = CS35L63_DSP1_FW_VER, .halo_state = CS35L63_DSP1_HALO_STATE, @@ -335,7 +346,14 @@ static void cs35l56_set_fw_reg_table(struct cs35l56_base *cs35l56_base) { switch (cs35l56_base->type) { default: - cs35l56_base->fw_reg = &cs35l56_fw_reg; + switch (cs35l56_base->rev) { + case 0xb0: + cs35l56_base->fw_reg = &cs35l56_fw_reg; + break; + default: + cs35l56_base->fw_reg = &cs35l56_b2_fw_reg; + break; + } break; case 0x63: cs35l56_base->fw_reg = &cs35l63_fw_reg; @@ -502,6 +520,11 @@ static const struct reg_sequence cs35l56_system_reset_seq[] = { REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET), }; +static const struct reg_sequence cs35l56_b2_system_reset_seq[] = { + REG_SEQ0(CS35L56_B2_DSP1_HALO_STATE, 0), + REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET), +}; + static const struct reg_sequence cs35l63_system_reset_seq[] = { REG_SEQ0(CS35L63_DSP1_HALO_STATE, 0), REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET), @@ -524,9 +547,18 @@ void cs35l56_system_reset(struct cs35l56_base *cs35l56_base, bool is_soundwire) case 0x54: case 0x56: case 0x57: - regmap_multi_reg_write_bypassed(cs35l56_base->regmap, - cs35l56_system_reset_seq, - ARRAY_SIZE(cs35l56_system_reset_seq)); + switch (cs35l56_base->rev) { + case 0xb0: + regmap_multi_reg_write_bypassed(cs35l56_base->regmap, + cs35l56_system_reset_seq, + ARRAY_SIZE(cs35l56_system_reset_seq)); + break; + default: + regmap_multi_reg_write_bypassed(cs35l56_base->regmap, + cs35l56_b2_system_reset_seq, + ARRAY_SIZE(cs35l56_b2_system_reset_seq)); + break; + } break; case 0x63: regmap_multi_reg_write_bypassed(cs35l56_base->regmap, From fa7d16734f9606c396681648618dd76a5af861e6 Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Sat, 27 Sep 2025 14:27:08 +0000 Subject: [PATCH 2952/3206] ALSA: compress: document 'chan_map' member in snd_dec_opus When building kernel docs, the following warning appeared: WARNING: 
./include/uapi/sound/compress_params.h:364 struct member 'chan_map' not described in 'snd_dec_opus' The inline struct 'snd_dec_opus_ch_map' inside 'snd_dec_opus' was not properly documented. This patch documents the 'chan_map' member and its fields (stream_count, coupled_count, channel_map), resolving the warning. Fixes: 5d36370f3431 ("ALSA: compress: add raw opus codec define and opus decoder structs") Suggested-by: Bagas Sanjaya Signed-off-by: Kriish Sharma Signed-off-by: Takashi Iwai --- include/uapi/sound/compress_params.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/include/uapi/sound/compress_params.h b/include/uapi/sound/compress_params.h index faf4fa911f7fc2..d7db6b4e116663 100644 --- a/include/uapi/sound/compress_params.h +++ b/include/uapi/sound/compress_params.h @@ -336,16 +336,14 @@ struct snd_dec_ape { * @mapping_family: Order and meaning of output channels. Only values 0 and 1 * are expected; values 2..255 are not recommended for playback. * - * Optional channel mapping table. Describes mapping of opus streams to decoded - * channels. - * @struct snd_dec_opus_ch_map - * @stream_count: Number of streams encoded in each Ogg packet. - * @coupled_count: Number of streams whose decoders are used for two - * channels. - * @channel_map: describes which decoded channel to be used for each one. - * See RFC doc for details. - * This supports only mapping families 0 and 1, therefore max - * number of channels is 8. + * @chan_map: Optional channel mapping table. Describes mapping of opus streams + * to decoded channels. Fields: + * @chan_map.stream_count: Number of streams encoded in each Ogg packet. + * @chan_map.coupled_count: Number of streams whose decoders are used + * for two channels. + * @chan_map.channel_map: Which decoded channel to be used for each one. + * Supports only mapping families 0 and 1, + * max number of channels is 8. * * These options were extracted from RFC7845 Section 5. */ From 659169c4eb21f8d9646044a4f4e1bc314f6f9d0c Mon Sep 17 00:00:00 2001 From: Roy Vegard Ovesen Date: Sat, 27 Sep 2025 17:27:30 +0200 Subject: [PATCH 2953/3206] ALSA: usb-audio: add mono main switch to Presonus S1824c The 1824c does not have the A/B switch that the 1810c has, but instead it has a mono main switch that sums the two main output channels to mono. 
Signed-off-by: Roy Vegard Ovesen Signed-off-by: Takashi Iwai --- sound/usb/mixer_s1810c.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/sound/usb/mixer_s1810c.c b/sound/usb/mixer_s1810c.c index fac4bbc6b27577..bd24556f6a7fbe 100644 --- a/sound/usb/mixer_s1810c.c +++ b/sound/usb/mixer_s1810c.c @@ -93,6 +93,7 @@ struct s1810c_ctl_packet { #define SC1810C_CTL_LINE_SW 0 #define SC1810C_CTL_MUTE_SW 1 +#define SC1824C_CTL_MONO_SW 2 #define SC1810C_CTL_AB_SW 3 #define SC1810C_CTL_48V_SW 4 @@ -123,6 +124,7 @@ struct s1810c_state_packet { #define SC1810C_STATE_48V_SW 58 #define SC1810C_STATE_LINE_SW 59 #define SC1810C_STATE_MUTE_SW 60 +#define SC1824C_STATE_MONO_SW 61 #define SC1810C_STATE_AB_SW 62 struct s1810_mixer_state { @@ -502,6 +504,15 @@ static const struct snd_kcontrol_new snd_s1810c_mute_sw = { .private_value = (SC1810C_STATE_MUTE_SW | SC1810C_CTL_MUTE_SW << 8) }; +static const struct snd_kcontrol_new snd_s1824c_mono_sw = { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "Mono Main Out Switch", + .info = snd_ctl_boolean_mono_info, + .get = snd_s1810c_switch_get, + .put = snd_s1810c_switch_set, + .private_value = (SC1824C_STATE_MONO_SW | SC1824C_CTL_MONO_SW << 8) +}; + static const struct snd_kcontrol_new snd_s1810c_48v_sw = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "48V Phantom Power On Mic Inputs Switch", @@ -588,8 +599,17 @@ int snd_sc1810_init_mixer(struct usb_mixer_interface *mixer) if (ret < 0) return ret; - ret = snd_s1810c_switch_init(mixer, &snd_s1810c_ab_sw); - if (ret < 0) - return ret; + // The 1824c has a Mono Main switch instead of a + // A/B select switch. + if (mixer->chip->usb_id == USB_ID(0x194f, 0x010d)) { + ret = snd_s1810c_switch_init(mixer, &snd_s1824c_mono_sw); + if (ret < 0) + return ret; + } else if (mixer->chip->usb_id == USB_ID(0x194f, 0x010c)) { + ret = snd_s1810c_switch_init(mixer, &snd_s1810c_ab_sw); + if (ret < 0) + return ret; + } + return ret; } From 9f2c0ac1423d5f267e7f1d1940780fc764b0fee3 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Sun, 28 Sep 2025 02:39:24 +0900 Subject: [PATCH 2954/3206] ALSA: usb-audio: fix race condition to UAF in snd_usbmidi_free The previous commit 0718a78f6a9f ("ALSA: usb-audio: Kill timer properly at removal") patched a UAF issue caused by the error timer. However, because the error timer kill added in that patch occurs after the endpoint delete, a race condition leading to a UAF still occurs, albeit rarely. Additionally, since kill/cleanup handling for the URBs is also missing, freed memory can be accessed in the URB-related interrupt context, which can also cause a UAF. Therefore, to prevent this, the error timer and the URBs must be killed before freeing the heap memory.
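The ordering rule enforced by the fix, quiesce every asynchronous path before freeing the memory it touches, can be modeled standalone; the pthread-based midi_dev below is an illustrative analogue of the timer/URB completion paths, not the ALSA code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct midi_dev {
	pthread_t async_worker;   /* models the error timer / URB irq path */
	atomic_bool stop;
	int *endpoint_state;      /* memory the async path dereferences */
};

static void *async_path(void *arg)
{
	struct midi_dev *dev = arg;

	/* Freeing endpoint_state before this thread stops would be a UAF. */
	while (!atomic_load(&dev->stop))
		(void)*dev->endpoint_state;
	return NULL;
}

static void midi_free(struct midi_dev *dev)
{
	/* 1. Kill the asynchronous path and wait until it has finished. */
	atomic_store(&dev->stop, true);
	pthread_join(dev->async_worker, NULL);
	/* 2. Only then free the memory it was using. */
	free(dev->endpoint_state);
	free(dev);
}

int main(void)
{
	struct midi_dev *dev = calloc(1, sizeof(*dev));

	dev->endpoint_state = calloc(1, sizeof(*dev->endpoint_state));
	pthread_create(&dev->async_worker, NULL, async_path, dev);
	midi_free(dev);
	puts("clean teardown");
	return 0;
}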
Cc: Reported-by: syzbot+f02665daa2abeef4a947@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f02665daa2abeef4a947 Fixes: 0718a78f6a9f ("ALSA: usb-audio: Kill timer properly at removal") Signed-off-by: Jeongjun Park Signed-off-by: Takashi Iwai --- sound/usb/midi.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sound/usb/midi.c b/sound/usb/midi.c index acb3bf92857c10..97e7e7662b12de 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1522,15 +1522,14 @@ static void snd_usbmidi_free(struct snd_usb_midi *umidi) { int i; + if (!umidi->disconnected) + snd_usbmidi_disconnect(&umidi->list); + for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) { struct snd_usb_midi_endpoint *ep = &umidi->endpoints[i]; - if (ep->out) - snd_usbmidi_out_endpoint_delete(ep->out); - if (ep->in) - snd_usbmidi_in_endpoint_delete(ep->in); + kfree(ep->out); } mutex_destroy(&umidi->mutex); - timer_shutdown_sync(&umidi->error_timer); kfree(umidi); } From ce0172627390c3c3885c60093904c79fbe19c543 Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Sun, 28 Sep 2025 11:07:59 +0800 Subject: [PATCH 2955/3206] ALSA: usb-audio: add two-way conversion between name and bit for QUIRK_FLAG_* Define a quirk flags enum and the corresponding flag names. Add helpers for converting between a quirk flag's name and its bit: - snd_usb_quirk_flag_find_name() - snd_usb_quirk_flags_from_name() Add a helper for printing debug logs: snd_usb_apply_flag_dbg() Co-developed-by: Takashi Iwai Signed-off-by: Cryolitia PukNgae Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 83 ++++++++++++++++++++++++++++++++++++++++--- sound/usb/quirks.h | 7 ++++ sound/usb/usbaudio.h | 84 ++++++++++++++++++++++++++++++-------------- 3 files changed, 144 insertions(+), 30 deletions(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 7e8609ac6822fe..6263163c5c7b61 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2443,6 +2443,84 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { {} /* terminator */ }; +#define QUIRK_STRING_ENTRY(x) \ + [QUIRK_TYPE_ ## x] = __stringify(x) + +static const char *const snd_usb_audio_quirk_flag_names[] = { + QUIRK_STRING_ENTRY(GET_SAMPLE_RATE), + QUIRK_STRING_ENTRY(SHARE_MEDIA_DEVICE), + QUIRK_STRING_ENTRY(ALIGN_TRANSFER), + QUIRK_STRING_ENTRY(TX_LENGTH), + QUIRK_STRING_ENTRY(PLAYBACK_FIRST), + QUIRK_STRING_ENTRY(SKIP_CLOCK_SELECTOR), + QUIRK_STRING_ENTRY(IGNORE_CLOCK_SOURCE), + QUIRK_STRING_ENTRY(ITF_USB_DSD_DAC), + QUIRK_STRING_ENTRY(CTL_MSG_DELAY), + QUIRK_STRING_ENTRY(CTL_MSG_DELAY_1M), + QUIRK_STRING_ENTRY(CTL_MSG_DELAY_5M), + QUIRK_STRING_ENTRY(IFACE_DELAY), + QUIRK_STRING_ENTRY(VALIDATE_RATES), + QUIRK_STRING_ENTRY(DISABLE_AUTOSUSPEND), + QUIRK_STRING_ENTRY(IGNORE_CTL_ERROR), + QUIRK_STRING_ENTRY(DSD_RAW), + QUIRK_STRING_ENTRY(SET_IFACE_FIRST), + QUIRK_STRING_ENTRY(GENERIC_IMPLICIT_FB), + QUIRK_STRING_ENTRY(SKIP_IMPLICIT_FB), + QUIRK_STRING_ENTRY(IFACE_SKIP_CLOSE), + QUIRK_STRING_ENTRY(FORCE_IFACE_RESET), + QUIRK_STRING_ENTRY(FIXED_RATE), + QUIRK_STRING_ENTRY(MIC_RES_16), + QUIRK_STRING_ENTRY(MIC_RES_384), + QUIRK_STRING_ENTRY(MIXER_PLAYBACK_MIN_MUTE), + QUIRK_STRING_ENTRY(MIXER_CAPTURE_MIN_MUTE), + NULL +}; + +const char *snd_usb_quirk_flag_find_name(unsigned long index) +{ + if (index >= ARRAY_SIZE(snd_usb_audio_quirk_flag_names)) + return NULL; + + return snd_usb_audio_quirk_flag_names[index]; +} + +u32 snd_usb_quirk_flags_from_name(const char *name) +{ + int i; + + if (!name || !*name) + return 0; + + for (i = 0; snd_usb_audio_quirk_flag_names[i]; i++) { + if
(strcasecmp(name, snd_usb_audio_quirk_flag_names[i]) == 0) + return BIT_U32(i); + } + + return 0; +} + +void snd_usb_apply_flag_dbg(const char *reason, + struct snd_usb_audio *chip, + unsigned long flag) +{ + unsigned long bit; + + for_each_set_bit(bit, &flag, BYTES_TO_BITS(sizeof(flag))) { + const char *name = snd_usb_audio_quirk_flag_names[bit]; + + if (name) + usb_audio_dbg(chip, + "From %s apply quirk flag %s for device %04x:%04x\n", + reason, name, USB_ID_VENDOR(chip->usb_id), + USB_ID_PRODUCT(chip->usb_id)); + else + usb_audio_warn(chip, + "From %s apply unknown quirk flag 0x%lx for device %04x:%04x\n", + reason, bit, USB_ID_VENDOR(chip->usb_id), + USB_ID_PRODUCT(chip->usb_id)); + } +} + void snd_usb_init_quirk_flags(struct snd_usb_audio *chip) { const struct usb_audio_quirk_flags_table *p; @@ -2451,10 +2529,7 @@ void snd_usb_init_quirk_flags(struct snd_usb_audio *chip) if (chip->usb_id == p->id || (!USB_ID_PRODUCT(p->id) && USB_ID_VENDOR(chip->usb_id) == USB_ID_VENDOR(p->id))) { - usb_audio_dbg(chip, - "Set quirk_flags 0x%x for device %04x:%04x\n", - p->flags, USB_ID_VENDOR(chip->usb_id), - USB_ID_PRODUCT(chip->usb_id)); + snd_usb_apply_flag_dbg("builtin table", chip, p->flags); chip->quirk_flags |= p->flags; return; } diff --git a/sound/usb/quirks.h b/sound/usb/quirks.h index f9bfd5ac7bab01..1b727d3c35c703 100644 --- a/sound/usb/quirks.h +++ b/sound/usb/quirks.h @@ -48,6 +48,13 @@ void snd_usb_audioformat_attributes_quirk(struct snd_usb_audio *chip, struct audioformat *fp, int stream); +void snd_usb_apply_flag_dbg(const char *reason, + struct snd_usb_audio *chip, + unsigned long flag); + void snd_usb_init_quirk_flags(struct snd_usb_audio *chip); +const char *snd_usb_quirk_flag_find_name(unsigned long flag); +u32 snd_usb_quirk_flags_from_name(const char *name); + #endif /* __USBAUDIO_QUIRKS_H */ diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index 30b5102e3caed0..79978cae9799cd 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -226,31 +226,63 @@ extern bool snd_usb_skip_validation; * Similar to QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE, but for capture streams */ -#define QUIRK_FLAG_GET_SAMPLE_RATE (1U << 0) -#define QUIRK_FLAG_SHARE_MEDIA_DEVICE (1U << 1) -#define QUIRK_FLAG_ALIGN_TRANSFER (1U << 2) -#define QUIRK_FLAG_TX_LENGTH (1U << 3) -#define QUIRK_FLAG_PLAYBACK_FIRST (1U << 4) -#define QUIRK_FLAG_SKIP_CLOCK_SELECTOR (1U << 5) -#define QUIRK_FLAG_IGNORE_CLOCK_SOURCE (1U << 6) -#define QUIRK_FLAG_ITF_USB_DSD_DAC (1U << 7) -#define QUIRK_FLAG_CTL_MSG_DELAY (1U << 8) -#define QUIRK_FLAG_CTL_MSG_DELAY_1M (1U << 9) -#define QUIRK_FLAG_CTL_MSG_DELAY_5M (1U << 10) -#define QUIRK_FLAG_IFACE_DELAY (1U << 11) -#define QUIRK_FLAG_VALIDATE_RATES (1U << 12) -#define QUIRK_FLAG_DISABLE_AUTOSUSPEND (1U << 13) -#define QUIRK_FLAG_IGNORE_CTL_ERROR (1U << 14) -#define QUIRK_FLAG_DSD_RAW (1U << 15) -#define QUIRK_FLAG_SET_IFACE_FIRST (1U << 16) -#define QUIRK_FLAG_GENERIC_IMPLICIT_FB (1U << 17) -#define QUIRK_FLAG_SKIP_IMPLICIT_FB (1U << 18) -#define QUIRK_FLAG_IFACE_SKIP_CLOSE (1U << 19) -#define QUIRK_FLAG_FORCE_IFACE_RESET (1U << 20) -#define QUIRK_FLAG_FIXED_RATE (1U << 21) -#define QUIRK_FLAG_MIC_RES_16 (1U << 22) -#define QUIRK_FLAG_MIC_RES_384 (1U << 23) -#define QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE (1U << 24) -#define QUIRK_FLAG_MIXER_CAPTURE_MIN_MUTE (1U << 25) +enum { + QUIRK_TYPE_GET_SAMPLE_RATE = 0, + QUIRK_TYPE_SHARE_MEDIA_DEVICE = 1, + QUIRK_TYPE_ALIGN_TRANSFER = 2, + QUIRK_TYPE_TX_LENGTH = 3, + QUIRK_TYPE_PLAYBACK_FIRST = 4, + QUIRK_TYPE_SKIP_CLOCK_SELECTOR = 5, + 
QUIRK_TYPE_IGNORE_CLOCK_SOURCE = 6, + QUIRK_TYPE_ITF_USB_DSD_DAC = 7, + QUIRK_TYPE_CTL_MSG_DELAY = 8, + QUIRK_TYPE_CTL_MSG_DELAY_1M = 9, + QUIRK_TYPE_CTL_MSG_DELAY_5M = 10, + QUIRK_TYPE_IFACE_DELAY = 11, + QUIRK_TYPE_VALIDATE_RATES = 12, + QUIRK_TYPE_DISABLE_AUTOSUSPEND = 13, + QUIRK_TYPE_IGNORE_CTL_ERROR = 14, + QUIRK_TYPE_DSD_RAW = 15, + QUIRK_TYPE_SET_IFACE_FIRST = 16, + QUIRK_TYPE_GENERIC_IMPLICIT_FB = 17, + QUIRK_TYPE_SKIP_IMPLICIT_FB = 18, + QUIRK_TYPE_IFACE_SKIP_CLOSE = 19, + QUIRK_TYPE_FORCE_IFACE_RESET = 20, + QUIRK_TYPE_FIXED_RATE = 21, + QUIRK_TYPE_MIC_RES_16 = 22, + QUIRK_TYPE_MIC_RES_384 = 23, + QUIRK_TYPE_MIXER_PLAYBACK_MIN_MUTE = 24, + QUIRK_TYPE_MIXER_CAPTURE_MIN_MUTE = 25, +/* Please also edit snd_usb_audio_quirk_flag_names */ +}; + +#define QUIRK_FLAG(x) BIT_U32(QUIRK_TYPE_ ## x) + +#define QUIRK_FLAG_GET_SAMPLE_RATE QUIRK_FLAG(GET_SAMPLE_RATE) +#define QUIRK_FLAG_SHARE_MEDIA_DEVICE QUIRK_FLAG(SHARE_MEDIA_DEVICE) +#define QUIRK_FLAG_ALIGN_TRANSFER QUIRK_FLAG(ALIGN_TRANSFER) +#define QUIRK_FLAG_TX_LENGTH QUIRK_FLAG(TX_LENGTH) +#define QUIRK_FLAG_PLAYBACK_FIRST QUIRK_FLAG(PLAYBACK_FIRST) +#define QUIRK_FLAG_SKIP_CLOCK_SELECTOR QUIRK_FLAG(SKIP_CLOCK_SELECTOR) +#define QUIRK_FLAG_IGNORE_CLOCK_SOURCE QUIRK_FLAG(IGNORE_CLOCK_SOURCE) +#define QUIRK_FLAG_ITF_USB_DSD_DAC QUIRK_FLAG(ITF_USB_DSD_DAC) +#define QUIRK_FLAG_CTL_MSG_DELAY QUIRK_FLAG(CTL_MSG_DELAY) +#define QUIRK_FLAG_CTL_MSG_DELAY_1M QUIRK_FLAG(CTL_MSG_DELAY_1M) +#define QUIRK_FLAG_CTL_MSG_DELAY_5M QUIRK_FLAG(CTL_MSG_DELAY_5M) +#define QUIRK_FLAG_IFACE_DELAY QUIRK_FLAG(IFACE_DELAY) +#define QUIRK_FLAG_VALIDATE_RATES QUIRK_FLAG(VALIDATE_RATES) +#define QUIRK_FLAG_DISABLE_AUTOSUSPEND QUIRK_FLAG(DISABLE_AUTOSUSPEND) +#define QUIRK_FLAG_IGNORE_CTL_ERROR QUIRK_FLAG(IGNORE_CTL_ERROR) +#define QUIRK_FLAG_DSD_RAW QUIRK_FLAG(DSD_RAW) +#define QUIRK_FLAG_SET_IFACE_FIRST QUIRK_FLAG(SET_IFACE_FIRST) +#define QUIRK_FLAG_GENERIC_IMPLICIT_FB QUIRK_FLAG(GENERIC_IMPLICIT_FB) +#define QUIRK_FLAG_SKIP_IMPLICIT_FB QUIRK_FLAG(SKIP_IMPLICIT_FB) +#define QUIRK_FLAG_IFACE_SKIP_CLOSE QUIRK_FLAG(IFACE_SKIP_CLOSE) +#define QUIRK_FLAG_FORCE_IFACE_RESET QUIRK_FLAG(FORCE_IFACE_RESET) +#define QUIRK_FLAG_FIXED_RATE QUIRK_FLAG(FIXED_RATE) +#define QUIRK_FLAG_MIC_RES_16 QUIRK_FLAG(MIC_RES_16) +#define QUIRK_FLAG_MIC_RES_384 QUIRK_FLAG(MIC_RES_384) +#define QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE QUIRK_FLAG(MIXER_PLAYBACK_MIN_MUTE) +#define QUIRK_FLAG_MIXER_CAPTURE_MIN_MUTE QUIRK_FLAG(MIXER_CAPTURE_MIN_MUTE) #endif /* __USBAUDIO_H */ From ffd586126a82e497e16336a1006430584d0570c4 Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Sun, 28 Sep 2025 11:08:00 +0800 Subject: [PATCH 2956/3206] ALSA: usb-audio: improve module param quirk_flags It accepts strings like `VID:PID:quirk_flag_name1|quirk_flag_name2;...` from now on, so that we can use it to debug USB audio devices more intuitively and flexibly. Compatibility with the previous form is kept.
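A hedged usage sketch (the VID:PID below is illustrative; the flag names are the case-insensitive forms of the QUIRK_TYPE_* entries defined in the previous patch):

  # set get_sample_rate and clear ignore_ctl_error for one device,
  # and apply the raw hex bit flags 0x8 to every device
  modprobe snd-usb-audio 'quirk_flags=194f:010c:get_sample_rate|!ignore_ctl_error;*:*:8'

Per the parser added below, entries are separated by ';', flags within an entry by '|', a leading '!' clears a flag, '*' matches any VID or PID, and a bare value is parsed as hexadecimal bit flags.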
Co-developed-by: Takashi Iwai Signed-off-by: Cryolitia PukNgae Signed-off-by: Takashi Iwai --- sound/usb/card.c | 32 +++++++++++++---- sound/usb/quirks.c | 90 +++++++++++++++++++++++++++++++++++++++++++++- sound/usb/quirks.h | 4 ++- 3 files changed, 118 insertions(+), 8 deletions(-) diff --git a/sound/usb/card.c b/sound/usb/card.c index 0265206a8e8cf3..66c6c92fa9e8d2 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -73,7 +73,7 @@ static bool lowlatency = true; static char *quirk_alias[SNDRV_CARDS]; static char *delayed_register[SNDRV_CARDS]; static bool implicit_fb[SNDRV_CARDS]; -static unsigned int quirk_flags[SNDRV_CARDS]; +static char *quirk_flags[SNDRV_CARDS]; bool snd_usb_use_vmalloc = true; bool snd_usb_skip_validation; @@ -103,7 +103,7 @@ module_param_array(delayed_register, charp, NULL, 0444); MODULE_PARM_DESC(delayed_register, "Quirk for delayed registration, given by id:iface, e.g. 0123abcd:4."); module_param_array(implicit_fb, bool, NULL, 0444); MODULE_PARM_DESC(implicit_fb, "Apply generic implicit feedback sync mode."); -module_param_array(quirk_flags, uint, NULL, 0444); +module_param_array(quirk_flags, charp, NULL, 0444); MODULE_PARM_DESC(quirk_flags, "Driver quirk bit flags."); module_param_named(use_vmalloc, snd_usb_use_vmalloc, bool, 0444); MODULE_PARM_DESC(use_vmalloc, "Use vmalloc for PCM intermediate buffers (default: yes)."); @@ -692,6 +692,29 @@ static void usb_audio_make_longname(struct usb_device *dev, } } +static void snd_usb_init_quirk_flags(int idx, struct snd_usb_audio *chip) +{ + size_t i; + + /* old style option found: the position-based integer value */ + if (quirk_flags[idx] && + !kstrtou32(quirk_flags[idx], 0, &chip->quirk_flags)) { + snd_usb_apply_flag_dbg("module param", chip, chip->quirk_flags); + return; + } + + /* take the default quirk from the quirk table */ + snd_usb_init_quirk_flags_table(chip); + + /* add or correct quirk bits from options */ + for (i = 0; i < ARRAY_SIZE(quirk_flags); i++) { + if (!quirk_flags[i] || !*quirk_flags[i]) + break; + + snd_usb_init_quirk_flags_parse_string(chip, quirk_flags[i]); + } +} + /* * create a chip instance and set its names. 
*/ @@ -750,10 +773,7 @@ static int snd_usb_audio_create(struct usb_interface *intf, INIT_LIST_HEAD(&chip->midi_v2_list); INIT_LIST_HEAD(&chip->mixer_list); - if (quirk_flags[idx]) - chip->quirk_flags = quirk_flags[idx]; - else - snd_usb_init_quirk_flags(chip); + snd_usb_init_quirk_flags(idx, chip); card->private_free = snd_usb_audio_free; diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 6263163c5c7b61..634cb4fb586f91 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2521,7 +2521,7 @@ void snd_usb_apply_flag_dbg(const char *reason, } } -void snd_usb_init_quirk_flags(struct snd_usb_audio *chip) +void snd_usb_init_quirk_flags_table(struct snd_usb_audio *chip) { const struct usb_audio_quirk_flags_table *p; @@ -2535,3 +2535,91 @@ void snd_usb_init_quirk_flags(struct snd_usb_audio *chip) } } } + +void snd_usb_init_quirk_flags_parse_string(struct snd_usb_audio *chip, + const char *str) +{ + u16 chip_vid = USB_ID_VENDOR(chip->usb_id); + u16 chip_pid = USB_ID_PRODUCT(chip->usb_id); + u32 mask_flags, unmask_flags, bit; + char *p, *field, *flag; + bool is_unmask; + u16 vid, pid; + + char *val __free(kfree) = kstrdup(str, GFP_KERNEL); + + if (!val) + return; + + for (p = val; p && *p;) { + /* Each entry consists of VID:PID:flags */ + field = strsep(&p, ":"); + if (!field) + break; + + if (strcmp(field, "*") == 0) + vid = 0; + else if (kstrtou16(field, 16, &vid)) + break; + + field = strsep(&p, ":"); + if (!field) + break; + + if (strcmp(field, "*") == 0) + pid = 0; + else if (kstrtou16(field, 16, &pid)) + break; + + field = strsep(&p, ";"); + if (!field || !*field) + break; + + if ((vid != 0 && vid != chip_vid) || + (pid != 0 && pid != chip_pid)) + continue; + + /* Collect the flags */ + mask_flags = 0; + unmask_flags = 0; + while (field && *field) { + flag = strsep(&field, "|"); + + if (!flag) + break; + + if (*flag == '!') { + is_unmask = true; + flag++; + } else { + is_unmask = false; + } + + if (!kstrtou32(flag, 16, &bit)) { + if (is_unmask) + unmask_flags |= bit; + else + mask_flags |= bit; + + break; + } + + bit = snd_usb_quirk_flags_from_name(flag); + + if (bit) { + if (is_unmask) + unmask_flags |= bit; + else + mask_flags |= bit; + } else { + pr_warn("snd_usb_audio: unknown flag %s while parsing param quirk_flags\n", + flag); + } + } + + chip->quirk_flags &= ~unmask_flags; + chip->quirk_flags |= mask_flags; + snd_usb_apply_flag_dbg("module param", chip, + chip->quirk_flags); + } } diff --git a/sound/usb/quirks.h b/sound/usb/quirks.h index 1b727d3c35c703..f24d6a5a197a64 100644 --- a/sound/usb/quirks.h +++ b/sound/usb/quirks.h @@ -52,7 +52,9 @@ void snd_usb_apply_flag_dbg(const char *reason, struct snd_usb_audio *chip, unsigned long flag); -void snd_usb_init_quirk_flags(struct snd_usb_audio *chip); +void snd_usb_init_quirk_flags_table(struct snd_usb_audio *chip); +void snd_usb_init_quirk_flags_parse_string(struct snd_usb_audio *chip, + const char *str); const char *snd_usb_quirk_flag_find_name(unsigned long flag); u32 snd_usb_quirk_flags_from_name(const char *name); From 98b5427bb64fbc039b7715992e137a572ac14ff1 Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Sun, 28 Sep 2025 11:08:01 +0800 Subject: [PATCH 2957/3206] ALSA: usb-audio: make param quirk_flags changeable at runtime Change its permissions from 0444 to 0644, and add runtime processing. Developers can now change it via sysfs, without rebooting, for debugging new buggy devices.
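A hedged usage sketch (the standard module-parameter sysfs path is assumed, and the VID:PID is illustrative):

  # adjust a quirk on a running system; it is picked up when the
  # device is (re)probed, e.g. after replugging it
  echo '194f:010c:dsd_raw' > /sys/module/snd_usb_audio/parameters/quirk_flags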
Co-developed-by: Takashi Iwai Signed-off-by: Cryolitia PukNgae Signed-off-by: Takashi Iwai --- sound/usb/card.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/sound/usb/card.c b/sound/usb/card.c index 66c6c92fa9e8d2..1d5a65eac93350 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -103,13 +103,32 @@ module_param_array(delayed_register, charp, NULL, 0444); MODULE_PARM_DESC(delayed_register, "Quirk for delayed registration, given by id:iface, e.g. 0123abcd:4."); module_param_array(implicit_fb, bool, NULL, 0444); MODULE_PARM_DESC(implicit_fb, "Apply generic implicit feedback sync mode."); -module_param_array(quirk_flags, charp, NULL, 0444); -MODULE_PARM_DESC(quirk_flags, "Driver quirk bit flags."); module_param_named(use_vmalloc, snd_usb_use_vmalloc, bool, 0444); MODULE_PARM_DESC(use_vmalloc, "Use vmalloc for PCM intermediate buffers (default: yes)."); module_param_named(skip_validation, snd_usb_skip_validation, bool, 0444); MODULE_PARM_DESC(skip_validation, "Skip unit descriptor validation (default: no)."); +/* protects quirk_flags */ +static DEFINE_MUTEX(quirk_flags_mutex); + +static int param_set_quirkp(const char *val, + const struct kernel_param *kp) +{ + guard(mutex)(&quirk_flags_mutex); + return param_set_charp(val, kp); +} + +static const struct kernel_param_ops param_ops_quirkp = { + .set = param_set_quirkp, + .get = param_get_charp, + .free = param_free_charp, +}; + +#define param_check_quirkp param_check_charp + +module_param_array(quirk_flags, quirkp, NULL, 0644); +MODULE_PARM_DESC(quirk_flags, "Add/modify USB audio quirks"); + /* * we keep the snd_usb_audio_t instances by ourselves for merging * the all interfaces on the same card as one sound device. @@ -696,6 +715,8 @@ static void snd_usb_init_quirk_flags(int idx, struct snd_usb_audio *chip) { size_t i; + guard(mutex)(&quirk_flags_mutex); + /* old style option found: the position-based integer value */ if (quirk_flags[idx] && From a767d3583d4e800d6a2d0378d7e57513bb4d4bba Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Sun, 28 Sep 2025 11:08:02 +0800 Subject: [PATCH 2958/3206] ALSA: doc: improved docs about quirk_flags in snd-usb-audio Briefly describe the option's change and its new usage. Signed-off-by: Cryolitia PukNgae Signed-off-by: Takashi Iwai --- Documentation/sound/alsa-configuration.rst | 108 ++++++++++++++------- 1 file changed, 75 insertions(+), 33 deletions(-) diff --git a/Documentation/sound/alsa-configuration.rst b/Documentation/sound/alsa-configuration.rst index a2fb8ed251dd02..0a4eaa7d66ddd0 100644 --- a/Documentation/sound/alsa-configuration.rst +++ b/Documentation/sound/alsa-configuration.rst @@ -2297,39 +2297,81 @@ skip_validation of the unit descriptor instead of a driver probe error, so that we can check its details. quirk_flags - Contains the bit flags for various device specific workarounds. - Applied to the corresponding card index.
- - * bit 0: Skip reading sample rate for devices - * bit 1: Create Media Controller API entries - * bit 2: Allow alignment on audio sub-slot at transfer - * bit 3: Add length specifier to transfers - * bit 4: Start playback stream at first in implement feedback mode - * bit 5: Skip clock selector setup - * bit 6: Ignore errors from clock source search - * bit 7: Indicates ITF-USB DSD based DACs - * bit 8: Add a delay of 20ms at each control message handling - * bit 9: Add a delay of 1-2ms at each control message handling - * bit 10: Add a delay of 5-6ms at each control message handling - * bit 11: Add a delay of 50ms at each interface setup - * bit 12: Perform sample rate validations at probe - * bit 13: Disable runtime PM autosuspend - * bit 14: Ignore errors for mixer access - * bit 15: Support generic DSD raw U32_BE format - * bit 16: Set up the interface at first like UAC1 - * bit 17: Apply the generic implicit feedback sync mode - * bit 18: Don't apply implicit feedback sync mode - * bit 19: Don't closed interface during setting sample rate - * bit 20: Force an interface reset whenever stopping & restarting - a stream - * bit 21: Do not set PCM rate (frequency) when only one rate is - available for the given endpoint. - * bit 22: Set the fixed resolution 16 for Mic Capture Volume - * bit 23: Set the fixed resolution 384 for Mic Capture Volume - * bit 24: Set minimum volume control value as mute for devices - where the lowest playback value represents muted state instead - of minimum audible volume - * bit 25: Be similar to bit 24 but for capture streams + The option provides a refined and flexible control for applying quirk + flags. It allows to specify the quirk flags for each device, and can + be modified dynamically via sysfs. + The old usage accepts an array of integers, each of which applies quirk + flags on the device in the order of probing. + E.g., ``quirk_flags=0x01,0x02`` applies get_sample_rate to the first + device, and share_media_device to the second device. + The new usage accepts a string in the format of + ``VID1:PID1:FLAGS1;VID2:PID2:FLAGS2;...``, where ``VIDx`` and ``PIDx`` + specify the device, and ``FLAGSx`` specify the flags to be applied. + ``VIDx`` and ``PIDx`` are 4-digit hexadecimal numbers, and can be + specified as ``*`` to match any value. ``FLAGSx`` can be a set of + flags given by name, separated by ``|``, or a hexadecimal number + representing the bit flags. The available flag names are listed below. + An exclamation mark can be prefixed to a flag name to negate the flag. + For example, ``1234:abcd:mixer_playback_min_mute|!ignore_ctl_error;*:*:0x01;`` + applies the ``mixer_playback_min_mute`` flag and clears the + ``ignore_ctl_error`` flag for the device 1234:abcd, and applies the + ``skip_sample_rate`` flag for all devices. 
+ + * bit 0: ``get_sample_rate`` + Skip reading sample rate for devices + * bit 1: ``share_media_device`` + Create Media Controller API entries + * bit 2: ``align_transfer`` + Allow alignment on audio sub-slot at transfer + * bit 3: ``tx_length`` + Add length specifier to transfers + * bit 4: ``playback_first`` + Start playback stream at first in implement feedback mode + * bit 5: ``skip_clock_selector`` + Skip clock selector setup + * bit 6: ``ignore_clock_source`` + Ignore errors from clock source search + * bit 7: ``itf_usb_dsd_dac`` + Indicates ITF-USB DSD-based DACs + * bit 8: ``ctl_msg_delay`` + Add a delay of 20ms at each control message handling + * bit 9: ``ctl_msg_delay_1m`` + Add a delay of 1-2ms at each control message handling + * bit 10: ``ctl_msg_delay_5m`` + Add a delay of 5-6ms at each control message handling + * bit 11: ``iface_delay`` + Add a delay of 50ms at each interface setup + * bit 12: ``validate_rates`` + Perform sample rate validations at probe + * bit 13: ``disable_autosuspend`` + Disable runtime PM autosuspend + * bit 14: ``ignore_ctl_error`` + Ignore errors for mixer access + * bit 15: ``dsd_raw`` + Support generic DSD raw U32_BE format + * bit 16: ``set_iface_first`` + Set up the interface at first like UAC1 + * bit 17: ``generic_implicit_fb`` + Apply the generic implicit feedback sync mode + * bit 18: ``skip_implicit_fb`` + Don't apply implicit feedback sync mode + * bit 19: ``iface_skip_close`` + Don't close interface during setting sample rate + * bit 20: ``force_iface_reset`` + Force an interface reset whenever stopping & restarting a stream + * bit 21: ``fixed_rate`` + Do not set PCM rate (frequency) when only one rate is available + for the given endpoint + * bit 22: ``mic_res_16`` + Set the fixed resolution 16 for Mic Capture Volume + * bit 23: ``mic_res_384`` + Set the fixed resolution 384 for Mic Capture Volume + * bit 24: ``mixer_playback_min_mute`` + Set minimum volume control value as mute for devices where the + lowest playback value represents muted state instead of minimum + audible volume + * bit 25: ``mixer_capture_min_mute`` + Similar to bit 24 but for capture streams This module supports multiple devices, autoprobe and hotplugging. From 15cf39221e89ad413c3cc2cb8f15a2487db2ba2f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 27 Sep 2025 20:53:04 +0000 Subject: [PATCH 2959/3206] selftests/bpf: Add stress test for rqspinlock in NMI Introduce a kernel module that will exercise lock acquisition in the NMI path, and bias toward creating contention such that NMI waiters end up being non-head waiters. Prior to the rqspinlock fix made in the commit 0d80e7f951be ("rqspinlock: Choose trylock fallback for NMI waiters"), it was possible for the queueing path of non-head waiters to get stuck in NMI, which this stress test reproduces fairly easily with just 3 CPUs. Both AA and ABBA flavors are supported, and it will serve as a test case for future fixes that address this corner case. More information about the problem in question is available in the commit cited above. When the fix is reverted, this stress test will lock up the system. To enable this test automatically through the test_progs infrastructure, add a load_module_params API to exercise both AA and ABBA cases when running the test. Note that the test runs for at most 5 seconds, and becomes a noop after that, in order to allow the system to make forward progress. In addition, CPU 0 is always kept untouched by the created threads and NMIs. 
The test will automatically scale to the number of available online CPUs. Note that at least 3 CPUs are necessary to run this test, hence skip the selftest in case the environment has less than 3 CPUs available. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250927205304.199760-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 +- .../selftests/bpf/prog_tests/res_spin_lock.c | 16 ++ .../testing/selftests/bpf/test_kmods/Makefile | 2 +- .../bpf/test_kmods/bpf_test_rqspinlock.c | 209 ++++++++++++++++++ tools/testing/selftests/bpf/testing_helpers.c | 14 +- tools/testing/selftests/bpf/testing_helpers.h | 1 + 6 files changed, 240 insertions(+), 4 deletions(-) create mode 100644 tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 0b6ee902bce512..f00587d4ede68e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -120,7 +120,7 @@ TEST_PROGS_EXTENDED := \ test_bpftool.py TEST_KMODS := bpf_testmod.ko bpf_test_no_cfi.ko bpf_test_modorder_x.ko \ - bpf_test_modorder_y.ko + bpf_test_modorder_y.ko bpf_test_rqspinlock.ko TEST_KMOD_TARGETS = $(addprefix $(OUTPUT)/,$(TEST_KMODS)) # Compile but not part of 'make run_tests' diff --git a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c index 0703e987df8997..8c6c2043a43275 100644 --- a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c @@ -99,3 +99,19 @@ void test_res_spin_lock_success(void) res_spin_lock__destroy(skel); return; } + +void serial_test_res_spin_lock_stress(void) +{ + if (libbpf_num_possible_cpus() < 3) { + test__skip(); + return; + } + + ASSERT_OK(load_module("bpf_test_rqspinlock.ko", false), "load module AA"); + sleep(5); + unload_module("bpf_test_rqspinlock", false); + + ASSERT_OK(load_module_params("bpf_test_rqspinlock.ko", "test_ab=1", false), "load module ABBA"); + sleep(5); + unload_module("bpf_test_rqspinlock", false); +} diff --git a/tools/testing/selftests/bpf/test_kmods/Makefile b/tools/testing/selftests/bpf/test_kmods/Makefile index d4e50c4509c93a..63c4d3f6a12f6b 100644 --- a/tools/testing/selftests/bpf/test_kmods/Makefile +++ b/tools/testing/selftests/bpf/test_kmods/Makefile @@ -8,7 +8,7 @@ Q = @ endif MODULES = bpf_testmod.ko bpf_test_no_cfi.ko bpf_test_modorder_x.ko \ - bpf_test_modorder_y.ko + bpf_test_modorder_y.ko bpf_test_rqspinlock.ko $(foreach m,$(MODULES),$(eval obj-m += $(m:.ko=.o))) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c new file mode 100644 index 00000000000000..769206fc70e485 --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct perf_event_attr hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, + .sample_period = 100000, +}; + +static rqspinlock_t lock_a; +static rqspinlock_t lock_b; + +static struct perf_event **rqsl_evts; +static int rqsl_nevts; + +static bool test_ab = false; +module_param(test_ab, bool, 0644); +MODULE_PARM_DESC(test_ab, "Test ABBA situations instead of AA situations"); + +static struct task_struct **rqsl_threads; +static int rqsl_nthreads; +static atomic_t rqsl_ready_cpus = ATOMIC_INIT(0); + +static int pause = 0; + +static bool nmi_locks_a(int cpu) +{ + return (cpu & 1) && test_ab; +} + +static int rqspinlock_worker_fn(void *arg) +{ + int cpu = smp_processor_id(); + unsigned long flags; + int ret; + + if (cpu) { + atomic_inc(&rqsl_ready_cpus); + + while (!kthread_should_stop()) { + if (READ_ONCE(pause)) { + msleep(1000); + continue; + } + if (nmi_locks_a(cpu)) + ret = raw_res_spin_lock_irqsave(&lock_b, flags); + else + ret = raw_res_spin_lock_irqsave(&lock_a, flags); + mdelay(20); + if (nmi_locks_a(cpu) && !ret) + raw_res_spin_unlock_irqrestore(&lock_b, flags); + else if (!ret) + raw_res_spin_unlock_irqrestore(&lock_a, flags); + cpu_relax(); + } + return 0; + } + + while (!kthread_should_stop()) { + int expected = rqsl_nthreads > 0 ? rqsl_nthreads - 1 : 0; + int ready = atomic_read(&rqsl_ready_cpus); + + if (ready == expected && !READ_ONCE(pause)) { + for (int i = 0; i < rqsl_nevts; i++) + perf_event_enable(rqsl_evts[i]); + pr_err("Waiting 5 secs to pause the test\n"); + msleep(1000 * 5); + WRITE_ONCE(pause, 1); + pr_err("Paused the test\n"); + } else { + msleep(1000); + cpu_relax(); + } + } + return 0; +} + +static void nmi_cb(struct perf_event *event, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + unsigned long flags; + int ret; + + if (!cpu || READ_ONCE(pause)) + return; + + if (nmi_locks_a(cpu)) + ret = raw_res_spin_lock_irqsave(&lock_a, flags); + else + ret = raw_res_spin_lock_irqsave(test_ab ? &lock_b : &lock_a, flags); + + mdelay(10); + + if (nmi_locks_a(cpu) && !ret) + raw_res_spin_unlock_irqrestore(&lock_a, flags); + else if (!ret) + raw_res_spin_unlock_irqrestore(test_ab ? &lock_b : &lock_a, flags); +} + +static void free_rqsl_threads(void) +{ + int i; + + if (rqsl_threads) { + for_each_online_cpu(i) { + if (rqsl_threads[i]) + kthread_stop(rqsl_threads[i]); + } + kfree(rqsl_threads); + } +} + +static void free_rqsl_evts(void) +{ + int i; + + if (rqsl_evts) { + for (i = 0; i < rqsl_nevts; i++) { + if (rqsl_evts[i]) + perf_event_release_kernel(rqsl_evts[i]); + } + kfree(rqsl_evts); + } +} + +static int bpf_test_rqspinlock_init(void) +{ + int i, ret; + int ncpus = num_online_cpus(); + + pr_err("Mode = %s\n", test_ab ? 
"ABBA" : "AA"); + + if (ncpus < 3) + return -ENOTSUPP; + + raw_res_spin_lock_init(&lock_a); + raw_res_spin_lock_init(&lock_b); + + rqsl_evts = kcalloc(ncpus - 1, sizeof(*rqsl_evts), GFP_KERNEL); + if (!rqsl_evts) + return -ENOMEM; + rqsl_nevts = ncpus - 1; + + for (i = 1; i < ncpus; i++) { + struct perf_event *e; + + e = perf_event_create_kernel_counter(&hw_attr, i, NULL, nmi_cb, NULL); + if (IS_ERR(e)) { + ret = PTR_ERR(e); + goto err_perf_events; + } + rqsl_evts[i - 1] = e; + } + + rqsl_threads = kcalloc(ncpus, sizeof(*rqsl_threads), GFP_KERNEL); + if (!rqsl_threads) { + ret = -ENOMEM; + goto err_perf_events; + } + rqsl_nthreads = ncpus; + + for_each_online_cpu(i) { + struct task_struct *t; + + t = kthread_create(rqspinlock_worker_fn, NULL, "rqsl_w/%d", i); + if (IS_ERR(t)) { + ret = PTR_ERR(t); + goto err_threads_create; + } + kthread_bind(t, i); + rqsl_threads[i] = t; + wake_up_process(t); + } + return 0; + +err_threads_create: + free_rqsl_threads(); +err_perf_events: + free_rqsl_evts(); + return ret; +} + +module_init(bpf_test_rqspinlock_init); + +static void bpf_test_rqspinlock_exit(void) +{ + free_rqsl_threads(); + free_rqsl_evts(); +} + +module_exit(bpf_test_rqspinlock_exit); + +MODULE_AUTHOR("Kumar Kartikeya Dwivedi"); +MODULE_DESCRIPTION("BPF rqspinlock stress test module"); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 5e9f16683be546..16eb37e5bad697 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -399,7 +399,7 @@ int unload_module(const char *name, bool verbose) return 0; } -int load_module(const char *path, bool verbose) +static int __load_module(const char *path, const char *param_values, bool verbose) { int fd; @@ -411,7 +411,7 @@ int load_module(const char *path, bool verbose) fprintf(stdout, "Can't find %s kernel module: %d\n", path, -errno); return -ENOENT; } - if (finit_module(fd, "", 0)) { + if (finit_module(fd, param_values, 0)) { fprintf(stdout, "Failed to load %s into the kernel: %d\n", path, -errno); close(fd); return -EINVAL; @@ -423,6 +423,16 @@ int load_module(const char *path, bool verbose) return 0; } +int load_module_params(const char *path, const char *param_values, bool verbose) +{ + return __load_module(path, param_values, verbose); +} + +int load_module(const char *path, bool verbose) +{ + return __load_module(path, "", verbose); +} + int unload_bpf_testmod(bool verbose) { return unload_module("bpf_testmod", verbose); diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h index 46d7f7089f636b..eb20d377221854 100644 --- a/tools/testing/selftests/bpf/testing_helpers.h +++ b/tools/testing/selftests/bpf/testing_helpers.h @@ -39,6 +39,7 @@ int kern_sync_rcu(void); int finit_module(int fd, const char *param_values, int flags); int delete_module(const char *name, int flags); int load_module(const char *path, bool verbose); +int load_module_params(const char *path, const char *param_values, bool verbose); int unload_module(const char *name, bool verbose); static inline __u64 get_time_ns(void) From 399dbcadc01ebf0035f325eaa8c264f8b5cd0a14 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Sun, 28 Sep 2025 12:18:29 +0200 Subject: [PATCH 2960/3206] ACPI: battery: Add synchronization between interface updates There is no synchronization between different code paths in the ACPI battery driver that update its sysfs interface or its power supply class device interface. In some cases this results in functional failures due to race conditions. One example of this is when two ACPI notifications: - ACPI_BATTERY_NOTIFY_STATUS (0x80) - ACPI_BATTERY_NOTIFY_INFO (0x81) are triggered (by the platform firmware) in a row with a little delay in between after removing and reinserting a laptop battery. Both notifications cause acpi_battery_update() to be called, and if the delay between them is sufficiently small, sysfs_add_battery() can be re-entered before battery->bat is set, which leads to a duplicate sysfs entry error: sysfs: cannot create duplicate filename '/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C0A:00/power_supply/BAT1' CPU: 1 UID: 0 PID: 185 Comm: kworker/1:4 Kdump: loaded Not tainted 6.12.38+deb13-amd64 #1 Debian 6.12.38-1 Hardware name: Gateway NV44 /SJV40-MV , BIOS V1.3121 04/08/2009 Workqueue: kacpi_notify acpi_os_execute_deferred Call Trace: dump_stack_lvl+0x5d/0x80 sysfs_warn_dup.cold+0x17/0x23 sysfs_create_dir_ns+0xce/0xe0 kobject_add_internal+0xba/0x250 kobject_add+0x96/0xc0 ? get_device_parent+0xde/0x1e0 device_add+0xe2/0x870 __power_supply_register.part.0+0x20f/0x3f0 ? wake_up_q+0x4e/0x90 sysfs_add_battery+0xa4/0x1d0 [battery] acpi_battery_update+0x19e/0x290 [battery] acpi_battery_notify+0x50/0x120 [battery] acpi_ev_notify_dispatch+0x49/0x70 acpi_os_execute_deferred+0x1a/0x30 process_one_work+0x177/0x330 worker_thread+0x251/0x390 ? __pfx_worker_thread+0x10/0x10 kthread+0xd2/0x100 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 kobject: kobject_add_internal failed for BAT1 with -EEXIST, don't try to register things with the same name in the same directory. There are also other scenarios in which analogous issues may occur. Address this by using a common lock in all of the code paths leading to updates of driver interfaces: ACPI Notify () handler, system resume callback and post-resume notification, device addition and removal. This new lock replaces sysfs_lock, which was used only in sysfs_remove_battery(); that function is now always called under the new lock, so it no longer needs any internal locking. Fixes: 10666251554c ("ACPI: battery: Install Notify() handler directly") Closes: https://lore.kernel.org/linux-acpi/20250910142653.313360-1-luogf2025@163.com/ Reported-by: GuangFei Luo Tested-by: GuangFei Luo Cc: 6.6+ # 6.6+ Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/battery.c | 43 ++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 6905b56bf3e458..67b76492c839c4 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -92,7 +92,7 @@ enum { struct acpi_battery { struct mutex lock; - struct mutex sysfs_lock; + struct mutex update_lock; struct power_supply *bat; struct power_supply_desc bat_desc; struct acpi_device *device; @@ -904,15 +904,12 @@ static int sysfs_add_battery(struct acpi_battery *battery) static void sysfs_remove_battery(struct acpi_battery *battery) { - mutex_lock(&battery->sysfs_lock); - if (!battery->bat) { - mutex_unlock(&battery->sysfs_lock); + if (!battery->bat) return; - } + battery_hook_remove_battery(battery); power_supply_unregister(battery->bat); battery->bat = NULL; - mutex_unlock(&battery->sysfs_lock); } static void find_battery(const struct dmi_header *dm, void *private) @@ -1072,6 +1069,9 @@ static void acpi_battery_notify(acpi_handle handle, u32 event, void *data) if (!battery) return; + + guard(mutex)(&battery->update_lock); + old = battery->bat; /* * On Acer Aspire V5-573G notifications are sometimes triggered too @@ -1094,21 +1094,22 @@ static void acpi_battery_notify(acpi_handle handle, u32 event, void *data) } static int battery_notify(struct notifier_block *nb, - unsigned long mode, void *_unused) + unsigned long mode, void *_unused) { struct acpi_battery *battery = container_of(nb, struct acpi_battery, pm_nb); - int result; - switch (mode) { - case PM_POST_HIBERNATION: - case PM_POST_SUSPEND: + if (mode == PM_POST_SUSPEND || mode == PM_POST_HIBERNATION) { + guard(mutex)(&battery->update_lock); + if (!acpi_battery_present(battery)) return 0; if (battery->bat) { acpi_battery_refresh(battery); } else { + int result; + result = acpi_battery_get_info(battery); if (result) return result; @@ -1120,7 +1121,6 @@ static int battery_notify(struct notifier_block *nb, acpi_battery_init_alarm(battery); acpi_battery_get_state(battery); - break; } return 0; @@ -1198,6 +1198,8 @@ static int acpi_battery_update_retry(struct acpi_battery *battery) { int retry, ret; + guard(mutex)(&battery->update_lock); + for (retry = 5; retry; retry--) { ret = acpi_battery_update(battery, false); if (!ret) @@ -1208,6 +1210,13 @@ static int acpi_battery_update_retry(struct acpi_battery *battery) return ret; } +static void sysfs_battery_cleanup(struct acpi_battery *battery) +{ + guard(mutex)(&battery->update_lock); + + sysfs_remove_battery(battery); +} + static int acpi_battery_add(struct acpi_device *device) { int result = 0; @@ -1230,7 +1239,7 @@ static int acpi_battery_add(struct acpi_device *device) if (result) return result; - result = devm_mutex_init(&device->dev, &battery->sysfs_lock); + result = devm_mutex_init(&device->dev, &battery->update_lock); if (result) return result; @@ -1262,7 +1271,7 @@ static int acpi_battery_add(struct acpi_device *device) device_init_wakeup(&device->dev, 0); unregister_pm_notifier(&battery->pm_nb); fail: - sysfs_remove_battery(battery); + sysfs_battery_cleanup(battery); return result; } @@ -1281,6 +1290,9 @@ static void acpi_battery_remove(struct acpi_device *device) device_init_wakeup(&device->dev, 0); unregister_pm_notifier(&battery->pm_nb); + + guard(mutex)(&battery->update_lock); + sysfs_remove_battery(battery); } @@ -1297,6 +1309,9 @@ static int acpi_battery_resume(struct device *dev) return -EINVAL; battery->update_time = 0; + + guard(mutex)(&battery->update_lock); + 
acpi_battery_update(battery, true); return 0; } From c7bc7e9070d6bd17fad1a3fc6a7de097834beff8 Mon Sep 17 00:00:00 2001 From: pengdonglin Date: Tue, 16 Sep 2025 12:47:22 +0800 Subject: [PATCH 2961/3206] ACPI: APEI: Remove redundant rcu_read_lock/unlock() under spinlock Since commit a8bb74acd8efe ("rcu: Consolidate RCU-sched update-side function definitions") there is no difference between rcu_read_lock(), rcu_read_lock_bh() and rcu_read_lock_sched() in terms of the RCU read section and the relevant grace period. That means that spin_lock(), which implies rcu_read_lock_sched(), also implies rcu_read_lock(). There is no need to explicitly start an RCU read section if one has already been started implicitly by spin_lock(). Simplify the code and remove the inner rcu_read_lock() invocation. Signed-off-by: pengdonglin Signed-off-by: pengdonglin Reviewed-by: Hanjun Guo Link: https://patch.msgid.link/20250916044735.2316171-2-dolinux.peng@gmail.com [ rjw: Subject adjustment ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index a0d54993edb3b6..97ee19f2cae060 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1207,12 +1207,10 @@ static int ghes_notify_hed(struct notifier_block *this, unsigned long event, int ret = NOTIFY_DONE; spin_lock_irqsave(&ghes_notify_lock_irq, flags); - rcu_read_lock(); list_for_each_entry_rcu(ghes, &ghes_hed, list) { if (!ghes_proc(ghes)) ret = NOTIFY_OK; } - rcu_read_unlock(); spin_unlock_irqrestore(&ghes_notify_lock_irq, flags); return ret; From 4ef77dd584cfd915526328f516fec59e3a54d66e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 27 Sep 2025 17:38:33 -0700 Subject: [PATCH 2962/3206] libbpf: Replace AF_ALG with open coded SHA-256 Reimplement libbpf_sha256() using some basic SHA-256 C code. This eliminates the newly-added dependency on AF_ALG, which is a problematic UAPI that is not supported by all kernels. Make libbpf_sha256() return void, since it can no longer fail. This simplifies some callers. Also drop the unnecessary 'sha_out_sz' parameter. Finally, also fix the typo in "compute_sha_udpate_offsets".
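After this change a caller only needs something like the following sketch ('data' and 'data_sz' are placeholder names), since the function can no longer report an error:

  __u8 digest[SHA256_DIGEST_LENGTH];

  libbpf_sha256(data, data_sz, digest); /* returns void: no error to check */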
Fixes: c297fe3e9f99 ("libbpf: Implement SHA256 internal helper") Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20250928003833.138407-1-ebiggers@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/gen_loader.c | 20 ++--- tools/lib/bpf/libbpf.c | 143 ++++++++++++++++++++------------ tools/lib/bpf/libbpf_internal.h | 2 +- 3 files changed, 98 insertions(+), 67 deletions(-) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 376eef292d3a8b..6945dd99a84693 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -371,7 +371,7 @@ static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off) __emit_sys_close(gen); } -static int compute_sha_udpate_offsets(struct bpf_gen *gen); +static void compute_sha_update_offsets(struct bpf_gen *gen); int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) { @@ -399,11 +399,8 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) blob_fd_array_off(gen, i)); emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0)); emit(gen, BPF_EXIT_INSN()); - if (OPTS_GET(gen->opts, gen_hash, false)) { - gen->error = compute_sha_udpate_offsets(gen); - if (gen->error) - return gen->error; - } + if (OPTS_GET(gen->opts, gen_hash, false)) + compute_sha_update_offsets(gen); pr_debug("gen: finish %s\n", errstr(gen->error)); if (!gen->error) { @@ -457,17 +454,13 @@ void bpf_gen__free(struct bpf_gen *gen) _val; \ }) -static int compute_sha_udpate_offsets(struct bpf_gen *gen) +static void compute_sha_update_offsets(struct bpf_gen *gen) { __u64 sha[SHA256_DWORD_SIZE]; __u64 sha_dw; - int i, err; + int i; - err = libbpf_sha256(gen->data_start, gen->data_cur - gen->data_start, sha, SHA256_DIGEST_LENGTH); - if (err < 0) { - pr_warn("sha256 computation of the metadata failed"); - return err; - } + libbpf_sha256(gen->data_start, gen->data_cur - gen->data_start, (__u8 *)sha); for (i = 0; i < SHA256_DWORD_SIZE; i++) { struct bpf_insn *insn = (struct bpf_insn *)(gen->insn_start + gen->hash_insn_offset[i]); @@ -475,7 +468,6 @@ static int compute_sha_udpate_offsets(struct bpf_gen *gen) insn[0].imm = (__u32)sha_dw; insn[1].imm = sha_dw >> 32; } - return 0; } void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7edb36aa88e1dc..f92083f51bdb36 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -43,9 +44,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -4491,7 +4489,7 @@ bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) static int bpf_prog_compute_hash(struct bpf_program *prog) { struct bpf_insn *purged; - int i, err; + int i, err = 0; purged = calloc(prog->insns_cnt, BPF_INSN_SZ); if (!purged) @@ -4519,8 +4517,8 @@ static int bpf_prog_compute_hash(struct bpf_program *prog) purged[i].imm = 0; } } - err = libbpf_sha256(purged, prog->insns_cnt * sizeof(struct bpf_insn), - prog->hash, SHA256_DIGEST_LENGTH); + libbpf_sha256(purged, prog->insns_cnt * sizeof(struct bpf_insn), + prog->hash); out: free(purged); return err; @@ -14288,58 +14286,99 @@ void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) free(s); } -int libbpf_sha256(const void *data, size_t data_sz, void *sha_out, size_t sha_out_sz) +static inline __u32 ror32(__u32 v, int bits) { - struct sockaddr_alg sa = { - .salg_family = AF_ALG, - .salg_type = "hash", - .salg_name = "sha256" - }; - int sock_fd = -1; 
- int op_fd = -1; - int err = 0; + return (v >> bits) | (v << (32 - bits)); +} - if (sha_out_sz != SHA256_DIGEST_LENGTH) { - pr_warn("sha_out_sz should be exactly 32 bytes for a SHA256 digest"); - return -EINVAL; - } +#define SHA256_BLOCK_LENGTH 64 +#define Ch(x, y, z) (((x) & (y)) ^ (~(x) & (z))) +#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) +#define Sigma_0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22)) +#define Sigma_1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25)) +#define sigma_0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3)) +#define sigma_1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10)) - sock_fd = socket(AF_ALG, SOCK_SEQPACKET, 0); - if (sock_fd < 0) { - err = -errno; - pr_warn("failed to create AF_ALG socket for SHA256: %s\n", errstr(err)); - return err; - } +static const __u32 sha256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, + 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, + 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, + 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, + 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, + 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; - if (bind(sock_fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - err = -errno; - pr_warn("failed to bind to AF_ALG socket for SHA256: %s\n", errstr(err)); - goto out; - } +#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \ + { \ + __u32 tmp = h + Sigma_1(e) + Ch(e, f, g) + sha256_K[i] + w[i]; \ + d += tmp; \ + h = tmp + Sigma_0(a) + Maj(a, b, c); \ + } + +static void sha256_blocks(__u32 state[8], const __u8 *data, size_t nblocks) +{ + while (nblocks--) { + __u32 a = state[0]; + __u32 b = state[1]; + __u32 c = state[2]; + __u32 d = state[3]; + __u32 e = state[4]; + __u32 f = state[5]; + __u32 g = state[6]; + __u32 h = state[7]; + __u32 w[64]; + int i; + + for (i = 0; i < 16; i++) + w[i] = get_unaligned_be32(&data[4 * i]); + for (; i < ARRAY_SIZE(w); i++) + w[i] = sigma_1(w[i - 2]) + w[i - 7] + + sigma_0(w[i - 15]) + w[i - 16]; + for (i = 0; i < ARRAY_SIZE(w); i += 8) { + SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); + SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); + SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); + SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); + SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); + SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); + SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); + SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); + } + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + data += SHA256_BLOCK_LENGTH; + } +} + +void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]) +{ + __u32 state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; + const __be64 bitcount = cpu_to_be64((__u64)len * 8); + __u8 final_data[2 * SHA256_BLOCK_LENGTH] = { 0 }; + size_t final_len = len % SHA256_BLOCK_LENGTH; + int i; - op_fd = accept(sock_fd, NULL, 0); - if (op_fd < 0) { - err = -errno; - pr_warn("failed to accept from 
AF_ALG socket for SHA256: %s\n", errstr(err)); - goto out; - } + sha256_blocks(state, data, len / SHA256_BLOCK_LENGTH); - if (write(op_fd, data, data_sz) != data_sz) { - err = -errno; - pr_warn("failed to write data to AF_ALG socket for SHA256: %s\n", errstr(err)); - goto out; - } + memcpy(final_data, data + len - final_len, final_len); + final_data[final_len] = 0x80; + final_len = round_up(final_len + 9, SHA256_BLOCK_LENGTH); + memcpy(&final_data[final_len - 8], &bitcount, 8); - if (read(op_fd, sha_out, SHA256_DIGEST_LENGTH) != SHA256_DIGEST_LENGTH) { - err = -errno; - pr_warn("failed to read SHA256 from AF_ALG socket: %s\n", errstr(err)); - goto out; - } + sha256_blocks(state, final_data, final_len / SHA256_BLOCK_LENGTH); -out: - if (op_fd >= 0) - close(op_fd); - if (sock_fd >= 0) - close(sock_fd); - return err; + for (i = 0; i < ARRAY_SIZE(state); i++) + put_unaligned_be32(state[i], &out[4 * i]); } diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 8a055de0d32483..c93797dcaf5bcf 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -739,5 +739,5 @@ int probe_fd(int fd); #define SHA256_DIGEST_LENGTH 32 #define SHA256_DWORD_SIZE SHA256_DIGEST_LENGTH / sizeof(__u64) -int libbpf_sha256(const void *data, size_t data_sz, void *sha_out, size_t sha_out_sz); +void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]); #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ From 0ca29010d426ed5dd08dbd2d03074d1673d963c8 Mon Sep 17 00:00:00 2001 From: Roy Vegard Ovesen Date: Sun, 28 Sep 2025 13:38:07 +0200 Subject: [PATCH 2963/3206] ALSA: usb-audio: add the initial mix for Presonus Studio 1824c A reasonable initial mix for the 1824c is the one that Presonus Universal Control calls bypass. It mutes all the physical inputs, and connects Daw channel 1 to Line out channel 1 (left) Daw channel 2 to Line out channel 2 (right) Daw channel 3 to Line out channel 3 (left) etc. To get the most out of the 1824c, a mixer application like Universal Control is needed. One is available for Linux. Link: https://github.com/royvegard/baton Signed-off-by: Roy Vegard Ovesen Signed-off-by: Takashi Iwai --- sound/usb/mixer_s1810c.c | 227 ++++++++++++++++++++++++--------------- 1 file changed, 138 insertions(+), 89 deletions(-) diff --git a/sound/usb/mixer_s1810c.c b/sound/usb/mixer_s1810c.c index 656f0b4bfdf0e8..b2658f922d0717 100644 --- a/sound/usb/mixer_s1810c.c +++ b/sound/usb/mixer_s1810c.c @@ -215,115 +215,164 @@ snd_sc1810c_get_status_field(struct usb_device *dev, */ static int snd_s1810c_init_mixer_maps(struct snd_usb_audio *chip) { - u32 a, b, c, e, n, off; + u32 a, b, c, e, n, off, left, right; struct usb_device *dev = chip->dev; - /* Set initial volume levels ? */ - a = 0x64; - e = 0xbc; - for (n = 0; n < 2; n++) { - off = n * 18; - for (b = off; b < 18 + off; b++) { - /* This channel to all outputs ? */ - for (c = 0; c <= 8; c++) { + switch (chip->usb_id) { + case USB_ID(0x194f, 0x010c): /* 1810c */ + /* Set initial volume levels ? */ + a = 0x64; + e = 0xbc; + for (n = 0; n < 2; n++) { + off = n * 18; + for (b = off; b < 18 + off; b++) { + /* This channel to all outputs ?
*/ + for (c = 0; c <= 8; c++) { + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); + } + /* This channel to main output (again) */ + snd_s1810c_send_ctl_packet(dev, a, b, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, b, 0, 1, e); + } + /* + * I noticed on UC that DAW channels have different + * initial volumes, so this makes sense. + */ + e = 0xb53bf0; + } + + /* Connect analog outputs ? */ + a = 0x65; + e = 0x01000000; + for (b = 1; b < 3; b++) { + snd_s1810c_send_ctl_packet(dev, a, b, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, b, 0, 1, e); + } + snd_s1810c_send_ctl_packet(dev, a, 0, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, 0, 0, 1, e); + + /* Set initial volume levels for S/PDIF mappings ? */ + a = 0x64; + e = 0xbc; + c = 3; + for (n = 0; n < 2; n++) { + off = n * 18; + for (b = off; b < 18 + off; b++) { snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); } - /* This channel to main output (again) */ + e = 0xb53bf0; + } + + /* Connect S/PDIF output ? */ + a = 0x65; + e = 0x01000000; + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); + + /* Connect all outputs (again) ? */ + a = 0x65; + e = 0x01000000; + for (b = 0; b < 4; b++) { snd_s1810c_send_ctl_packet(dev, a, b, 0, 0, e); snd_s1810c_send_ctl_packet(dev, a, b, 0, 1, e); } - /* - * I noticed on UC that DAW channels have different - * initial volumes, so this makes sense. - */ - e = 0xb53bf0; - } - /* Connect analog outputs ? */ - a = 0x65; - e = 0x01000000; - for (b = 1; b < 3; b++) { - snd_s1810c_send_ctl_packet(dev, a, b, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, b, 0, 1, e); - } - snd_s1810c_send_ctl_packet(dev, a, 0, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, 0, 0, 1, e); - - /* Set initial volume levels for S/PDIF mappings ? */ - a = 0x64; - e = 0xbc; - c = 3; - for (n = 0; n < 2; n++) { - off = n * 18; - for (b = off; b < 18 + off; b++) { - snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); - snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); + /* Basic routing to get sound out of the device */ + a = 0x64; + e = 0x01000000; + for (c = 0; c < 4; c++) { + for (b = 0; b < 36; b++) { + if ((c == 0 && b == 18) || /* DAW1/2 -> Main */ + (c == 1 && b == 20) || /* DAW3/4 -> Line3/4 */ + (c == 2 && b == 22) || /* DAW4/5 -> Line5/6 */ + (c == 3 && b == 24)) { /* DAW5/6 -> S/PDIF */ + /* Left */ + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0); + b++; + /* Right */ + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); + } else { + /* Leave the rest disconnected */ + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0); + } + } } - e = 0xb53bf0; - } - /* Connect S/PDIF output ? */ - a = 0x65; - e = 0x01000000; - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); - - /* Connect all outputs (again) ? 
*/ - a = 0x65; - e = 0x01000000; - for (b = 0; b < 4; b++) { - snd_s1810c_send_ctl_packet(dev, a, b, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, b, 0, 1, e); - } - - /* Basic routing to get sound out of the device */ - a = 0x64; - e = 0x01000000; - for (c = 0; c < 4; c++) { - for (b = 0; b < 36; b++) { - if ((c == 0 && b == 18) || /* DAW1/2 -> Main */ - (c == 1 && b == 20) || /* DAW3/4 -> Line3/4 */ - (c == 2 && b == 22) || /* DAW4/5 -> Line5/6 */ - (c == 3 && b == 24)) { /* DAW5/6 -> S/PDIF */ - /* Left */ + /* Set initial volume levels for S/PDIF (again) ? */ + a = 0x64; + e = 0xbc; + c = 3; + for (n = 0; n < 2; n++) { + off = n * 18; + for (b = off; b < 18 + off; b++) { snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); - snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0); - b++; - /* Right */ - snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0); snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); - } else { - /* Leave the rest disconnected */ - snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0); - snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0); } + e = 0xb53bf0; } - } - /* Set initial volume levels for S/PDIF (again) ? */ - a = 0x64; - e = 0xbc; - c = 3; - for (n = 0; n < 2; n++) { - off = n * 18; - for (b = off; b < 18 + off; b++) { + /* Connect S/PDIF outputs (again) ? */ + a = 0x65; + e = 0x01000000; + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); + + /* Again ? */ + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); + break; + + case USB_ID(0x194f, 0x010d): /* 1824c */ + /* Set all output faders to unity gain */ + a = 0x65; + c = 0x00; + e = 0x01000000; + + for (b = 0; b < 9; b++) { snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); } - e = 0xb53bf0; - } - - /* Connect S/PDIF outputs (again) ? */ - a = 0x65; - e = 0x01000000; - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); - - /* Again ? 
*/ - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); + /* Set + * Daw 1 -> Line out 1 (left), Daw 2 -> Line out 2 (right) + * Daw 3 -> Line out 3, (left) Daw 4 -> Line out 4 (right) + * Daw 5 -> Line out 5, (left) Daw 6 -> Line out 6 (right) + * Daw 7 -> Line out 7, (left) Daw 8 -> Line out 8 (right) + * Daw 9 -> SPDIF out 1, (left) Daw 10 -> SPDIF out 2 (right) + * Daw 11 -> ADAT out 1, (left) Daw 12 -> ADAT out 2 (right) + * Daw 13 -> ADAT out 3, (left) Daw 14 -> ADAT out 4 (right) + * Daw 15 -> ADAT out 5, (left) Daw 16 -> ADAT out 6 (right) + * Daw 17 -> ADAT out 7, (left) Daw 18 -> ADAT out 8 (right) + * Everything else muted + */ + a = 0x64; + /* The first Daw channel is channel 18 */ + left = 18; + + for (c = 0; c < 9; c++) { + right = left + 1; + + for (b = 0; b < 36; b++) { + if (b == left) { + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0x01000000); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0x00); + } else if (b == right) { + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0x00); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0x01000000); + } else { + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0x00); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0x00); + } + } + left += 2; + } + break; + } return 0; } From f65dc3b1ab145c9b8b36301256d703c1dd153f71 Mon Sep 17 00:00:00 2001 From: Roy Vegard Ovesen Date: Sun, 28 Sep 2025 13:38:27 +0200 Subject: [PATCH 2964/3206] ALSA: usb-audio: don't hardcode gain for output channel of Presonus Studio On the 1824c output channel 4 left/right is the S/PDIF output, so this needs to be able to be set. I'm guessing that for the 1810c since it has no output channel 4, it does not matter what the gain value of this channel is set to. Signed-off-by: Roy Vegard Ovesen Signed-off-by: Takashi Iwai --- sound/usb/mixer_s1810c.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sound/usb/mixer_s1810c.c b/sound/usb/mixer_s1810c.c index b2658f922d0717..15960d25e74882 100644 --- a/sound/usb/mixer_s1810c.c +++ b/sound/usb/mixer_s1810c.c @@ -147,12 +147,7 @@ snd_s1810c_send_ctl_packet(struct usb_device *dev, u32 a, pkt.b = b; pkt.c = c; pkt.d = d; - /* - * Value for settings 0/1 for this - * output channel is always 0 (probably because - * there is no ADAT output on 1810c) - */ - pkt.e = (c == 4) ? 0 : e; + pkt.e = e; ret = snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), SC1810C_CMD_REQ, From e5f0a698b34ed76002dc5cff3804a61c80233a7a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Sep 2025 14:39:22 -0700 Subject: [PATCH 2965/3206] Linux 6.17 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 10355ecf32cb47..82bb9cdf73a32b 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Baby Opossum Posse # *DOCUMENTATION* From 8a1f3fd1a89cd1d4acccb0181346ad212a275a69 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 24 Sep 2025 21:56:00 +0200 Subject: [PATCH 2966/3206] i3c: master: adi: fix number of bytes written to fifo adi_i3c_master_wr_to_tx_fifo computes the maximum number of bytes that can be sent to the fifo but never makes use of it, actually limit the number of bytes sent. 
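For illustration, the intended clamping pattern in adi_i3c_master_wr_to_tx_fifo() looks
like this (a simplified sketch, assuming REG_SDO_FIFO_ROOM reports the free space in
bytes, as the min() against nbytes suggests):

    n = readl(master->regs + REG_SDO_FIFO_ROOM); /* free space in the FIFO */
    m = min(n, nbytes);                          /* clamp to what fits */
    /* pushing 'nbytes' here could overrun the FIFO; 'm' cannot */
    i3c_writel_fifo(master->regs + REG_SDO_FIFO, buf, m);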
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202509190505.fKGvEJRa-lkp@intel.com/
Reviewed-by: Jorge Marques
Reviewed-by: Frank Li
Link: https://lore.kernel.org/r/20250924195600.122142-1-alexandre.belloni@bootlin.com
Signed-off-by: Alexandre Belloni
---
 drivers/i3c/master/adi-i3c-master.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/i3c/master/adi-i3c-master.c b/drivers/i3c/master/adi-i3c-master.c
index 18597ba1f1c3e7..82ac0b3d057abd 100644
--- a/drivers/i3c/master/adi-i3c-master.c
+++ b/drivers/i3c/master/adi-i3c-master.c
@@ -135,7 +135,7 @@ static void adi_i3c_master_wr_to_tx_fifo(struct adi_i3c_master *master,
 	n = readl(master->regs + REG_SDO_FIFO_ROOM);
 	m = min(n, nbytes);

-	i3c_writel_fifo(master->regs + REG_SDO_FIFO, buf, nbytes);
+	i3c_writel_fifo(master->regs + REG_SDO_FIFO, buf, m);
 }

 static void adi_i3c_master_rd_from_rx_fifo(struct adi_i3c_master *master,

From d6ddd9beb1a5c32acb9b80f5c2cd8b17f41371d1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Wed, 24 Sep 2025 22:18:33 +0200
Subject: [PATCH 2967/3206] i3c: fix big-endian FIFO transfers

Short MMIO transfers that are not a multiple of four bytes in size need
a special case for the final bytes. However, the existing implementation
is not endian-safe and introduces an incorrect byteswap on big-endian
kernels. This usually does not cause problems because most systems are
little-endian and most transfers are a multiple of four bytes long, but
it still needs to be fixed to avoid the extra byteswap.

Change the special case for both i3c_writel_fifo() and i3c_readl_fifo()
to use the non-byteswapping writesl() and readsl() with a single element
instead of the byteswapping writel()/readl() that are meant for
individual MMIO registers. As data is copied between a FIFO and a memory
buffer, the writesl()/readsl() loops are typically based on
__raw_readl()/__raw_writel(), so the order of bytes in the FIFO matches
the order in the buffer, regardless of the CPU endianness.

The earlier versions in the dw-i3c and i3c-master-cdns drivers had a
correct implementation, but the generic version that was recently added
broke it.
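For illustration, here is why the two accessors differ for the trailing 1-3 bytes on a
big-endian kernel (a sketch of generic kernel MMIO semantics, not code from this patch):

    u32 tmp;

    /* readl() is roughly le32_to_cpu(__raw_readl(addr)): it byteswaps on BE */
    tmp = readl(addr);
    memcpy(buf, &tmp, nbytes & 3);     /* BE: trailing bytes land reversed */

    /* readsl() is a raw copy loop: no byteswap on any endianness */
    readsl(addr, &tmp, 1);
    memcpy(buf, &tmp, nbytes & 3);     /* bytes keep their FIFO order */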
Fixes: 733b439375b4 ("i3c: master: Add inline i3c_readl_fifo() and i3c_writel_fifo()") Cc: Manikanta Guntupalli Signed-off-by: Arnd Bergmann Reviewed-by: Jorge Marques Link: https://lore.kernel.org/r/20250924201837.3691486-1-arnd@kernel.org Signed-off-by: Alexandre Belloni --- drivers/i3c/internals.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/i3c/internals.h b/drivers/i3c/internals.h index 0d857cc68cc5d4..79ceaa5f5afd6f 100644 --- a/drivers/i3c/internals.h +++ b/drivers/i3c/internals.h @@ -38,7 +38,11 @@ static inline void i3c_writel_fifo(void __iomem *addr, const void *buf, u32 tmp = 0; memcpy(&tmp, buf + (nbytes & ~3), nbytes & 3); - writel(tmp, addr); + /* + * writesl() instead of writel() to keep FIFO + * byteorder on big-endian targets + */ + writesl(addr, &tmp, 1); } } @@ -55,7 +59,11 @@ static inline void i3c_readl_fifo(const void __iomem *addr, void *buf, if (nbytes & 3) { u32 tmp; - tmp = readl(addr); + /* + * readsl() instead of readl() to keep FIFO + * byteorder on big-endian targets + */ + readsl(addr, &tmp, 1); memcpy(buf + (nbytes & ~3), &tmp, nbytes & 3); } } From 1e2f487584ed426a744b86e04f96662488c7aa0e Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 8 Sep 2025 12:06:18 +0200 Subject: [PATCH 2968/3206] smb: smbdirect: introduce smbdirect_socket_status_string() This will be used for more useful debug messages. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/common/smbdirect/smbdirect_socket.h | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index 3c4a8d627aa3f5..8484c6f09317ba 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -16,6 +16,29 @@ enum smbdirect_socket_status { SMBDIRECT_SOCKET_DESTROYED }; +static __always_inline +const char *smbdirect_socket_status_string(enum smbdirect_socket_status status) +{ + switch (status) { + case SMBDIRECT_SOCKET_CREATED: + return "CREATED"; + case SMBDIRECT_SOCKET_CONNECTING: + return "CONNECTING"; + case SMBDIRECT_SOCKET_CONNECTED: + return "CONNECTED"; + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + return "NEGOTIATE_FAILED"; + case SMBDIRECT_SOCKET_DISCONNECTING: + return "DISCONNECTING"; + case SMBDIRECT_SOCKET_DISCONNECTED: + return "DISCONNECTED"; + case SMBDIRECT_SOCKET_DESTROYED: + return "DESTROYED"; + } + + return ""; +} + struct smbdirect_socket { enum smbdirect_socket_status status; From 371d3ab51ce44c8a7c0f9b835b74b7340a922059 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 18:48:16 +0200 Subject: [PATCH 2969/3206] smb: smbdirect: introduce smbdirect_socket.status_wait This will be used by server and client soon. 
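For illustration, the usual usage pattern of such a wait queue (a hypothetical sketch;
later patches in this series add the real callers):

    /* waiter side: sleep until the connection state changes */
    wait_event_interruptible(sc->status_wait,
                             sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING);

    /* state-changing side: update the status, then wake any waiters */
    sc->status = SMBDIRECT_SOCKET_CONNECTED;
    wake_up_interruptible(&sc->status_wait);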
Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 8484c6f09317ba..a69b9649b88e2b 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -41,6 +41,7 @@ const char *smbdirect_socket_status_string(enum smbdirect_socket_status status)

 struct smbdirect_socket {
 	enum smbdirect_socket_status status;
+	wait_queue_head_t status_wait;

 	/* RDMA related */
 	struct {

From 17e1d07430e4bd6a14e2d194791f545f8548ff58 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Fri, 8 Aug 2025 15:08:33 +0200
Subject: [PATCH 2970/3206] smb: smbdirect: introduce smbdirect_socket_init()

This will make it easier to keep the initialization in a single place.

For now it's an __always_inline function as we only share the header
files. Once we move to common functions we'll have a dedicated
smbdirect.ko that exports functions...

Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index a69b9649b88e2b..4e2b87208d9182 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -136,6 +136,24 @@ struct smbdirect_socket {
 	} recv_io;
 };

+static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
+{
+	/*
+	 * This also sets status = SMBDIRECT_SOCKET_CREATED
+	 */
+	BUILD_BUG_ON(SMBDIRECT_SOCKET_CREATED != 0);
+	memset(sc, 0, sizeof(*sc));
+
+	init_waitqueue_head(&sc->status_wait);
+
+	INIT_LIST_HEAD(&sc->recv_io.free.list);
+	spin_lock_init(&sc->recv_io.free.lock);
+
+	INIT_LIST_HEAD(&sc->recv_io.reassembly.list);
+	spin_lock_init(&sc->recv_io.reassembly.lock);
+	init_waitqueue_head(&sc->recv_io.reassembly.wait_queue);
+}
+
 struct smbdirect_send_io {
 	struct smbdirect_socket *socket;
 	struct ib_cqe cqe;

From de32f33c0365d06962675dc2c2269723dd917d0f Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Fri, 8 Aug 2025 19:05:39 +0200
Subject: [PATCH 2971/3206] smb: smbdirect: introduce smbdirect_socket.disconnect_work

Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 4e2b87208d9182..ab36c2271e3eca 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -43,6 +43,8 @@ struct smbdirect_socket {
 	enum smbdirect_socket_status status;
 	wait_queue_head_t status_wait;

+	struct work_struct disconnect_work;
+
 	/* RDMA related */
 	struct {
 		struct rdma_cm_id *cm_id;

From 3b7be44a0eb18b2e51190fd1d26997dc7bddc32d Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Mon, 11 Aug 2025 15:19:51 +0200
Subject: [PATCH 2972/3206] smb: smbdirect: introduce
 smbdirect_socket.send_io.pending.{count,{dec,zero}_wait_queue}

This will be shared between client and server soon.

Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index ab36c2271e3eca..1df43b7a87f46d 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -77,6 +77,21 @@ struct smbdirect_socket {
 			struct kmem_cache *cache;
 			mempool_t *pool;
 		} mem;
+
+		/*
+		 * The state about posted/pending sends
+		 */
+		struct {
+			atomic_t count;
+			/*
+			 * woken when count is decremented
+			 */
+			wait_queue_head_t dec_wait_queue;
+			/*
+			 * woken when count reached zero
+			 */
+			wait_queue_head_t zero_wait_queue;
+		} pending;
 	} send_io;

 	/*
@@ -148,6 +163,10 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)

 	init_waitqueue_head(&sc->status_wait);

+	atomic_set(&sc->send_io.pending.count, 0);
+	init_waitqueue_head(&sc->send_io.pending.dec_wait_queue);
+	init_waitqueue_head(&sc->send_io.pending.zero_wait_queue);
+
 	INIT_LIST_HEAD(&sc->recv_io.free.list);
 	spin_lock_init(&sc->recv_io.free.lock);

From 2191b3471df53d87a46179b0c302c77c9ccfec11 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Mon, 11 Aug 2025 16:03:00 +0200
Subject: [PATCH 2973/3206] smb: smbdirect: introduce
 smbdirect_socket.send_io.credits.{count,wait_queue}

This will be shared between client and server soon.

Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 1df43b7a87f46d..7c2147a0244a4f 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -78,6 +78,14 @@ struct smbdirect_socket {
 			mempool_t *pool;
 		} mem;

+		/*
+		 * The credit state for the send side
+		 */
+		struct {
+			atomic_t count;
+			wait_queue_head_t wait_queue;
+		} credits;
+
 		/*
 		 * The state about posted/pending sends
 		 */
@@ -163,6 +171,9 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)

 	init_waitqueue_head(&sc->status_wait);

+	atomic_set(&sc->send_io.credits.count, 0);
+	init_waitqueue_head(&sc->send_io.credits.wait_queue);
+
 	atomic_set(&sc->send_io.pending.count, 0);
 	init_waitqueue_head(&sc->send_io.pending.dec_wait_queue);
 	init_waitqueue_head(&sc->send_io.pending.zero_wait_queue);

From cce93d2d0ccc88d2486e220e4a802c9edc48849b Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Mon, 18 Aug 2025 21:13:03 +0200
Subject: [PATCH 2974/3206] smb: smbdirect: introduce struct smbdirect_send_batch

This will replace struct smb_direct_send_ctx in the server and allow us
to move code to the common smbdirect layer soon.
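For illustration, a hypothetical initializer for the new structure (not part of this
patch; the server-side conversion comes later in the series):

    static inline void smbdirect_send_batch_init(struct smbdirect_send_batch *batch)
    {
        INIT_LIST_HEAD(&batch->msg_list);
        batch->wr_cnt = 0;
        batch->need_invalidate_rkey = false;
        batch->remote_key = 0;
    }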
Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/common/smbdirect/smbdirect_socket.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index 7c2147a0244a4f..e1454ccf849b98 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -210,6 +210,23 @@ struct smbdirect_send_io { u8 packet[]; }; +struct smbdirect_send_batch { + /* + * List of smbdirect_send_io messages + */ + struct list_head msg_list; + /* + * Number of list entries + */ + size_t wr_cnt; + + /* + * Possible remote key invalidation state + */ + bool need_invalidate_rkey; + u32 remote_key; +}; + struct smbdirect_recv_io { struct smbdirect_socket *socket; struct ib_cqe cqe; From 90255ed46774bda93a945a6dbf9228d9ecfcd79f Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 11:41:59 +0200 Subject: [PATCH 2975/3206] smb: smbdirect: introduce smbdirect_socket.rw_io.credits This will be used by the server to manage the state for RDMA read/write requests. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/common/smbdirect/smbdirect_socket.h | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index e1454ccf849b98..148e8dbf1254c6 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -159,6 +159,27 @@ struct smbdirect_socket { bool full_packet_received; } reassembly; } recv_io; + + /* + * The state for RDMA read/write requests on the server + */ + struct { + /* + * The credit state for the send side + */ + struct { + /* + * The maximum number of rw credits + */ + size_t max; + /* + * The number of pages per credit + */ + size_t num_pages; + atomic_t count; + wait_queue_head_t wait_queue; + } credits; + } rw_io; }; static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) @@ -184,6 +205,9 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) INIT_LIST_HEAD(&sc->recv_io.reassembly.list); spin_lock_init(&sc->recv_io.reassembly.lock); init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); + + atomic_set(&sc->rw_io.credits.count, 0); + init_waitqueue_head(&sc->rw_io.credits.wait_queue); } struct smbdirect_send_io { From b5a4242fb902f3780ef85d4027ba5987b7ce1e2d Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 18 Aug 2025 21:04:14 +0200 Subject: [PATCH 2976/3206] smb: smbdirect: introduce struct smbdirect_rw_io This will be used by the server in order to manage RDMA reads and writes. 
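For illustration, a hypothetical allocation sketch; the trailing sg_list[] flexible
array lets the structure be sized per request (npages is an assumed caller-provided
page count, sc an assumed socket pointer):

    struct smbdirect_rw_io *io;

    io = kzalloc(struct_size(io, sg_list, npages), GFP_KERNEL);
    if (!io)
        return -ENOMEM;
    io->socket = sc;
    io->sgt.sgl = io->sg_list;  /* scatterlist storage lives inline */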
Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/common/smbdirect/smbdirect_socket.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index 148e8dbf1254c6..d7f3b25c022df7 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -6,6 +6,8 @@ #ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ #define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ +#include + enum smbdirect_socket_status { SMBDIRECT_SOCKET_CREATED, SMBDIRECT_SOCKET_CONNECTING, @@ -273,4 +275,18 @@ struct smbdirect_recv_io { u8 packet[]; }; +struct smbdirect_rw_io { + struct smbdirect_socket *socket; + struct ib_cqe cqe; + + struct list_head list; + + int error; + struct completion *completion; + + struct rdma_rw_ctx rdma_ctx; + struct sg_table sgt; + struct scatterlist sg_list[]; +}; + #endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */ From ec76e3cc6bbcb17815a4bfa6450d4bae1e708c9b Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 11 Aug 2025 15:57:04 +0200 Subject: [PATCH 2977/3206] smb: smbdirect: introduce smbdirect_socket.recv_io.{posted,credits} This will be used by client and server soon in order to maintain the state of posted recv_io messages and granted credits. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/common/smbdirect/smbdirect_socket.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index d7f3b25c022df7..1abfcd230f9a3f 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -135,6 +135,23 @@ struct smbdirect_socket { spinlock_t lock; } free; + /* + * The state for posted recv_io messages + * and the refill work struct. + */ + struct { + atomic_t count; + struct work_struct refill_work; + } posted; + + /* + * The credit state for the recv side + */ + struct { + u16 target; + atomic_t count; + } credits; + /* * The list of arrived non-empty smbdirect_recv_io * structures @@ -204,6 +221,10 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) INIT_LIST_HEAD(&sc->recv_io.free.list); spin_lock_init(&sc->recv_io.free.lock); + atomic_set(&sc->recv_io.posted.count, 0); + + atomic_set(&sc->recv_io.credits.count, 0); + INIT_LIST_HEAD(&sc->recv_io.reassembly.list); spin_lock_init(&sc->recv_io.reassembly.lock); init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); From 932fa15c3726459d0f996893262af33b9f7dc121 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 19:12:38 +0200 Subject: [PATCH 2978/3206] smb: smbdirect: introduce smbdirect_socket_parameters.{resolve_{addr,route},rdma_connect,negotiate}_timeout_msec These will be used instead of hardcoded values in client and server. 
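For illustration, how such a parameter would replace a hardcoded constant in a
hypothetical caller:

    rc = wait_event_interruptible_timeout(sc->status_wait,
            sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
            msecs_to_jiffies(sp->negotiate_timeout_msec));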
Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/common/smbdirect/smbdirect.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h index b9a385344ff31c..17aa08dd6aba43 100644 --- a/fs/smb/common/smbdirect/smbdirect.h +++ b/fs/smb/common/smbdirect/smbdirect.h @@ -23,6 +23,10 @@ struct smbdirect_buffer_descriptor_v1 { * Some values are important for the upper layer. */ struct smbdirect_socket_parameters { + __u32 resolve_addr_timeout_msec; + __u32 resolve_route_timeout_msec; + __u32 rdma_connect_timeout_msec; + __u32 negotiate_timeout_msec; __u16 recv_credit_max; __u16 send_credit_target; __u32 max_send_size; From 8e34a763870dc48b69049c2a4ec032a21d66feb6 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 19 Aug 2025 23:18:21 +0200 Subject: [PATCH 2979/3206] smb: smbdirect: introduce smbdirect_socket_parameters.{initiator_depth,responder_resources} This will make it easier to specify these from the outside of the core code later. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/common/smbdirect/smbdirect.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h index 17aa08dd6aba43..c3274bbb3c02d8 100644 --- a/fs/smb/common/smbdirect/smbdirect.h +++ b/fs/smb/common/smbdirect/smbdirect.h @@ -27,6 +27,8 @@ struct smbdirect_socket_parameters { __u32 resolve_route_timeout_msec; __u32 rdma_connect_timeout_msec; __u32 negotiate_timeout_msec; + __u8 initiator_depth; + __u8 responder_resources; __u16 recv_credit_max; __u16 send_credit_target; __u32 max_send_size; From 2baedb2026a5c01e79e9691540eea14d65f7277a Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 21 Aug 2025 12:13:16 +0200 Subject: [PATCH 2980/3206] smb: smbdirect: introduce smbdirect_socket.rdma.legacy_iwarp This will be used by client and server soon. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/common/smbdirect/smbdirect_socket.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index 1abfcd230f9a3f..3aaee6ca293459 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -50,6 +50,10 @@ struct smbdirect_socket { /* RDMA related */ struct { struct rdma_cm_id *cm_id; + /* + * This is for iWarp MPA v1 + */ + bool legacy_iwarp; } rdma; /* IB verbs related */ From 89bae05f9df68f827de0106760de53933083a391 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 17:47:58 +0200 Subject: [PATCH 2981/3206] smb: smbdirect: introduce smbdirect_socket.idle.{keepalive,immediate_work,timer_work} This will allow client and server to use the common structures in order to share common functions later. 
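For illustration, a hypothetical sketch of (re)arming the idle timer from the
keepalive interval (system_wq is a placeholder here; a per-socket workqueue pointer is
introduced later in the series):

    sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
    mod_delayed_work(system_wq, &sc->idle.timer_work,
                     msecs_to_jiffies(sp->keepalive_interval_msec));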
Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 3aaee6ca293459..11e9b365ac9673 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -41,6 +41,12 @@ const char *smbdirect_socket_status_string(enum smbdirect_socket_status status)
 	return "";
 }

+enum smbdirect_keepalive_status {
+	SMBDIRECT_KEEPALIVE_NONE,
+	SMBDIRECT_KEEPALIVE_PENDING,
+	SMBDIRECT_KEEPALIVE_SENT
+};
+
 struct smbdirect_socket {
 	enum smbdirect_socket_status status;
 	wait_queue_head_t status_wait;
@@ -71,6 +77,15 @@ struct smbdirect_socket {

 	struct smbdirect_socket_parameters parameters;

+	/*
+	 * The state for keepalive and timeout handling
+	 */
+	struct {
+		enum smbdirect_keepalive_status keepalive;
+		struct work_struct immediate_work;
+		struct delayed_work timer_work;
+	} idle;
+
 	/*
 	 * The state for posted send buffers
 	 */

From 6889d2f5caa69ff4f523ef11a207fcaedaff8c7e Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Tue, 19 Aug 2025 13:52:29 +0200
Subject: [PATCH 2982/3206] smb: smbdirect: introduce smbdirect_socket.statistics

These will be used by the client and maybe shared code in the future.

Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 11e9b365ac9673..649d66cd4a83ad 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -218,6 +218,17 @@ struct smbdirect_socket {
 			wait_queue_head_t wait_queue;
 		} credits;
 	} rw_io;
+
+	/*
+	 * For debug purposes
+	 */
+	struct {
+		u64 get_receive_buffer;
+		u64 put_receive_buffer;
+		u64 enqueue_reassembly_queue;
+		u64 dequeue_reassembly_queue;
+		u64 send_empty;
+	} statistics;
 };

From 6c5b0f9253d7f15722db22da28cd49ab34252214 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Fri, 22 Aug 2025 12:22:24 +0200
Subject: [PATCH 2983/3206] smb: smbdirect: introduce smbdirect_socket.workqueue

The client currently uses a per-socket workqueue because it can block
in a work function waiting for credits. So we use a per-socket pointer
in order to prepare for common code.

Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 649d66cd4a83ad..b0efa5a26016e4 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -51,6 +51,14 @@ struct smbdirect_socket {
 	enum smbdirect_socket_status status;
 	wait_queue_head_t status_wait;

+	/*
+	 * This points to the workqueue to
+	 * be used for this socket.
+ * It can be per socket (on the client) + * or point to a global workqueue (on the server) + */ + struct workqueue_struct *workqueue; + struct work_struct disconnect_work; /* RDMA related */ From 02c39c0121d209117cb10b76df0b960f546b72e3 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 21 Aug 2025 16:16:47 +0200 Subject: [PATCH 2984/3206] smb: smbdirect: introduce struct smbdirect_mr_io This will be used by the client in order to maintain memory registrations. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/common/smbdirect/smbdirect_socket.h | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index b0efa5a26016e4..f220891ab290db 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -334,6 +334,32 @@ struct smbdirect_recv_io { u8 packet[]; }; +enum smbdirect_mr_state { + SMBDIRECT_MR_READY, + SMBDIRECT_MR_REGISTERED, + SMBDIRECT_MR_INVALIDATED, + SMBDIRECT_MR_ERROR +}; + +struct smbdirect_mr_io { + struct smbdirect_socket *socket; + struct ib_cqe cqe; + + struct list_head list; + + enum smbdirect_mr_state state; + struct ib_mr *mr; + struct sg_table sgt; + enum dma_data_direction dir; + union { + struct ib_reg_wr wr; + struct ib_send_wr inv_wr; + }; + + bool need_invalidate; + struct completion invalidate_done; +}; + struct smbdirect_rw_io { struct smbdirect_socket *socket; struct ib_cqe cqe; From 41e5086905a3c37c3e5b716192b2dcd719ad8ef5 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 21 Aug 2025 23:50:42 +0200 Subject: [PATCH 2985/3206] smb: smbdirect: introduce smbdirect_socket_parameters.max_frmr_depth This will be used by the client... Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/common/smbdirect/smbdirect.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h index c3274bbb3c02d8..05cc6a9d0ccd3e 100644 --- a/fs/smb/common/smbdirect/smbdirect.h +++ b/fs/smb/common/smbdirect/smbdirect.h @@ -36,6 +36,7 @@ struct smbdirect_socket_parameters { __u32 max_recv_size; __u32 max_fragmented_recv_size; __u32 max_read_write_size; + __u32 max_frmr_depth; __u32 keepalive_interval_msec; __u32 keepalive_timeout_msec; } __packed; From ed3350f09d2c0234c59910ef1655cb9cbe17b0ca Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 19 Aug 2025 09:34:27 +0200 Subject: [PATCH 2986/3206] smb: smbdirect: introduce smbdirect_socket.mr_io.* This will be used by the client and will allow us to move to common code... 
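For illustration, a hypothetical sketch of claiming a ready MR using the new state
fields (not from this patch; the real client conversion follows later):

    struct smbdirect_mr_io *mr = NULL, *cur;

    spin_lock(&sc->mr_io.all.lock);
    list_for_each_entry(cur, &sc->mr_io.all.list, list) {
        if (cur->state == SMBDIRECT_MR_READY) {
            cur->state = SMBDIRECT_MR_REGISTERED;
            mr = cur;
            atomic_dec(&sc->mr_io.ready.count);
            atomic_inc(&sc->mr_io.used.count);
            break;
        }
    }
    spin_unlock(&sc->mr_io.all.lock);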
Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 45 ++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index f220891ab290db..5d403b1beceab5 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -206,6 +206,44 @@ struct smbdirect_socket {
 		} reassembly;
 	} recv_io;

+	/*
+	 * The state for Memory registrations on the client
+	 */
+	struct {
+		enum ib_mr_type type;
+
+		/*
+		 * The list of free smbdirect_mr_io
+		 * structures
+		 */
+		struct {
+			struct list_head list;
+			spinlock_t lock;
+		} all;
+
+		/*
+		 * The number of available MRs ready for memory registration
+		 */
+		struct {
+			atomic_t count;
+			wait_queue_head_t wait_queue;
+		} ready;
+
+		/*
+		 * The number of used MRs
+		 */
+		struct {
+			atomic_t count;
+		} used;
+
+		struct work_struct recovery_work;
+
+		/* Used by transport to wait until all MRs are returned */
+		struct {
+			wait_queue_head_t wait_queue;
+		} cleanup;
+	} mr_io;
+
 	/*
 	 * The state for RDMA read/write requests on the server
 	 */
@@ -269,6 +307,13 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)

 	atomic_set(&sc->rw_io.credits.count, 0);
 	init_waitqueue_head(&sc->rw_io.credits.wait_queue);
+
+	spin_lock_init(&sc->mr_io.all.lock);
+	INIT_LIST_HEAD(&sc->mr_io.all.list);
+	atomic_set(&sc->mr_io.ready.count, 0);
+	init_waitqueue_head(&sc->mr_io.ready.wait_queue);
+	atomic_set(&sc->mr_io.used.count, 0);
+	init_waitqueue_head(&sc->mr_io.cleanup.wait_queue);
 }

 struct smbdirect_send_io {

From 6920b4ad49fc7ff5b99a0dcff8e9612753a7a876 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Tue, 26 Aug 2025 16:30:32 +0200
Subject: [PATCH 2987/3206] smb: smbdirect: let smbdirect_socket_init()
 initialize all [delayed_]work_structs as disabled

This is safer to start with and allows common code not to care about
whether the caller uses these or not. E.g. sc->mr_io.recovery_work is
only used on the client...

Note disable_[delayed_]work_sync() requires a valid function pointer in
INIT_[DELAYED_]WORK(). The non-_sync() versions don't require it, but as
we need to use the _sync() version on cleanup we had better use it here
too; it won't block here anyway...

Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 5d403b1beceab5..14f6e6d2035fb0 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -277,6 +277,14 @@ struct smbdirect_socket {
 	} statistics;
 };

+static void __smbdirect_socket_disabled_work(struct work_struct *work)
+{
+	/*
+	 * Should never be called as disable_[delayed_]work_sync() was used.
+	 */
+	WARN_ON_ONCE(1);
+}
+
 static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
 {
 	/*
@@ -287,6 +295,14 @@

 	init_waitqueue_head(&sc->status_wait);

+	INIT_WORK(&sc->disconnect_work, __smbdirect_socket_disabled_work);
+	disable_work_sync(&sc->disconnect_work);
+
+	INIT_WORK(&sc->idle.immediate_work, __smbdirect_socket_disabled_work);
+	disable_work_sync(&sc->idle.immediate_work);
+	INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work);
+	disable_delayed_work_sync(&sc->idle.timer_work);
+
 	atomic_set(&sc->send_io.credits.count, 0);
 	init_waitqueue_head(&sc->send_io.credits.wait_queue);

@@ -298,6 +314,8 @@
 	spin_lock_init(&sc->recv_io.free.lock);

 	atomic_set(&sc->recv_io.posted.count, 0);
+	INIT_WORK(&sc->recv_io.posted.refill_work, __smbdirect_socket_disabled_work);
+	disable_work_sync(&sc->recv_io.posted.refill_work);

 	atomic_set(&sc->recv_io.credits.count, 0);

@@ -313,6 +331,8 @@
 	atomic_set(&sc->mr_io.ready.count, 0);
 	init_waitqueue_head(&sc->mr_io.ready.wait_queue);
 	atomic_set(&sc->mr_io.used.count, 0);
+	INIT_WORK(&sc->mr_io.recovery_work, __smbdirect_socket_disabled_work);
+	disable_work_sync(&sc->mr_io.recovery_work);
 	init_waitqueue_head(&sc->mr_io.cleanup.wait_queue);
 }

From ef71f1e046489c77a2f7d012edc762fba0a7aadc Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Wed, 20 Aug 2025 11:25:06 +0200
Subject: [PATCH 2988/3206] smb: client: fix sending the iWarp custom IRD/ORD
 negotiation messages

Do a real negotiation and check the server's initiator_depth and
responder_resources.

This should use big endian in order to be useful. I have captures of
Windows clients showing this. The fact that we used little endian up to
now means that we sent very large numbers and the negotiation with the
server truncated them to the server limits.

Note the reason why this uses u8 for initiator_depth and
responder_resources is that the rdma layer also uses it.

The inconsistency regarding the initiator_depth and responder_resources
values being reversed for iWarp devices in RDMA_CM_EVENT_ESTABLISHED
should also be fixed later, but for now we work around it here.
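For reference, the corrected private-data blob as it goes on the wire, mirroring the
hunk below (two big-endian 32-bit words for ORD and IRD):

    __be32 ird_ord_hdr[2];

    ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
    ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
    conn_param.private_data = ird_ord_hdr;
    conn_param.private_data_len = sizeof(ird_ord_hdr);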
Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: linux-rdma@vger.kernel.org Fixes: c7398583340a ("CIFS: SMBD: Implement RDMA memory registration") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 110 ++++++++++++++++++++++++++++++++++---- fs/smb/client/smbdirect.h | 4 +- 2 files changed, 103 insertions(+), 11 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index e0fce5033004c7..6480945c245923 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -179,6 +179,8 @@ static int smbd_conn_upcall( struct smbd_connection *info = id->context; struct smbdirect_socket *sc = &info->socket; const char *event_name = rdma_event_msg(event->event); + u8 peer_initiator_depth; + u8 peer_responder_resources; log_rdma_event(INFO, "event=%s status=%d\n", event_name, event->status); @@ -204,6 +206,85 @@ static int smbd_conn_upcall( case RDMA_CM_EVENT_ESTABLISHED: log_rdma_event(INFO, "connected event=%s\n", event_name); + + /* + * Here we work around an inconsistency between + * iWarp and other devices (at least rxe and irdma using RoCEv2) + */ + if (rdma_protocol_iwarp(id->device, id->port_num)) { + /* + * iWarp devices report the peer's values + * with the perspective of the peer here. + * Tested with siw and irdma (in iwarp mode) + * We need to change to our perspective here, + * so we need to switch the values. + */ + peer_initiator_depth = event->param.conn.responder_resources; + peer_responder_resources = event->param.conn.initiator_depth; + } else { + /* + * Non iWarp devices report the peer's values + * already changed to our perspective here. + * Tested with rxe and irdma (in roce mode). + */ + peer_initiator_depth = event->param.conn.initiator_depth; + peer_responder_resources = event->param.conn.responder_resources; + } + if (rdma_protocol_iwarp(id->device, id->port_num) && + event->param.conn.private_data_len == 8) { + /* + * Legacy clients with only iWarp MPA v1 support + * need a private blob in order to negotiate + * the IRD/ORD values. + */ + const __be32 *ird_ord_hdr = event->param.conn.private_data; + u32 ird32 = be32_to_cpu(ird_ord_hdr[0]); + u32 ord32 = be32_to_cpu(ird_ord_hdr[1]); + + /* + * cifs.ko sends the legacy IRD/ORD negotiation + * event if iWarp MPA v2 was used. + * + * Here we check that the values match and only + * mark the client as legacy if they don't match. + */ + if ((u32)event->param.conn.initiator_depth != ird32 || + (u32)event->param.conn.responder_resources != ord32) { + /* + * There are broken clients (old cifs.ko) + * using little endian and also + * struct rdma_conn_param only uses u8 + * for initiator_depth and responder_resources, + * so we truncate the value to U8_MAX. + * + * smb_direct_accept_client() will then + * do the real negotiation in order to + * select the minimum between client and + * server. + */ + ird32 = min_t(u32, ird32, U8_MAX); + ord32 = min_t(u32, ord32, U8_MAX); + + info->legacy_iwarp = true; + peer_initiator_depth = (u8)ird32; + peer_responder_resources = (u8)ord32; + } + } + + /* + * negotiate the value by using the minimum + * between client and server if the client provided + * non 0 values. 
+ */ + if (peer_initiator_depth != 0) + info->initiator_depth = + min_t(u8, info->initiator_depth, + peer_initiator_depth); + if (peer_responder_resources != 0) + info->responder_resources = + min_t(u8, info->responder_resources, + peer_responder_resources); + sc->status = SMBDIRECT_SOCKET_CONNECTED; wake_up_interruptible(&info->status_wait); break; @@ -1551,7 +1632,7 @@ static struct smbd_connection *_smbd_get_connection( struct ib_qp_init_attr qp_attr; struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; struct ib_port_immutable port_immutable; - u32 ird_ord_hdr[2]; + __be32 ird_ord_hdr[2]; info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL); if (!info) @@ -1559,6 +1640,9 @@ static struct smbd_connection *_smbd_get_connection( sc = &info->socket; sp = &sc->parameters; + info->initiator_depth = 1; + info->responder_resources = SMBD_CM_RESPONDER_RESOURCES; + sc->status = SMBDIRECT_SOCKET_CONNECTING; rc = smbd_ia_open(info, dstaddr, port); if (rc) { @@ -1639,22 +1723,22 @@ static struct smbd_connection *_smbd_get_connection( } sc->ib.qp = sc->rdma.cm_id->qp; - memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = 0; - - conn_param.responder_resources = - min(sc->ib.dev->attrs.max_qp_rd_atom, - SMBD_CM_RESPONDER_RESOURCES); - info->responder_resources = conn_param.responder_resources; + info->responder_resources = + min_t(u8, info->responder_resources, + sc->ib.dev->attrs.max_qp_rd_atom); log_rdma_mr(INFO, "responder_resources=%d\n", info->responder_resources); + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.initiator_depth = info->initiator_depth; + conn_param.responder_resources = info->responder_resources; + /* Need to send IRD/ORD in private data for iWARP */ sc->ib.dev->ops.get_port_immutable( sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable); if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { - ird_ord_hdr[0] = info->responder_resources; - ird_ord_hdr[1] = 1; + ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); + ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); conn_param.private_data = ird_ord_hdr; conn_param.private_data_len = sizeof(ird_ord_hdr); } else { @@ -2121,6 +2205,12 @@ static int allocate_mr_list(struct smbd_connection *info) atomic_set(&info->mr_used_count, 0); init_waitqueue_head(&info->wait_for_mr_cleanup); INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); + + if (info->responder_resources == 0) { + log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); + return -EINVAL; + } + /* Allocate more MRs (2x) than hardware responder_resources */ for (i = 0; i < info->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index e45aa9ddd71da5..4ca9b2b2c57f93 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -67,7 +67,9 @@ struct smbd_connection { /* Memory registrations */ /* Maximum number of RDMA read/write outstanding on this connection */ - int responder_resources; + bool legacy_iwarp; + u8 initiator_depth; + u8 responder_resources; /* Maximum number of pages in a single RDMA write/read on this connection */ int max_frmr_depth; /* From 58dfba8a2d4eb6defc6e710196b053abdaf6cd79 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 18:48:13 +0200 Subject: [PATCH 2989/3206] smb: client/smbdirect: replace SMBDIRECT_SOCKET_CONNECTING with more detailed states The process of reaching a functional connection represented by 
SMBDIRECT_SOCKET_CONNECTED, is more complex than using a single SMBDIRECT_SOCKET_CONNECTING state. This will allow us to remove a lot of special variables and completions in the following commits. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 73 ++++++++++++++++++++-- fs/smb/common/smbdirect/smbdirect_socket.h | 42 +++++++++++-- 2 files changed, 103 insertions(+), 12 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 6480945c245923..e4abc1a9564ada 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -161,9 +161,36 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) container_of(work, struct smbd_connection, disconnect_work); struct smbdirect_socket *sc = &info->socket; - if (sc->status == SMBDIRECT_SOCKET_CONNECTED) { + switch (sc->status) { + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_CONNECTED: sc->status = SMBDIRECT_SOCKET_DISCONNECTING; rdma_disconnect(sc->rdma.cm_id); + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + /* + * rdma_connect() never reached + * RDMA_CM_EVENT_ESTABLISHED + */ + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + break; } } @@ -187,19 +214,31 @@ static int smbd_conn_upcall( switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED; + info->ri_rc = 0; + complete(&info->ri_done); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; info->ri_rc = 0; complete(&info->ri_done); break; case RDMA_CM_EVENT_ADDR_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; info->ri_rc = -EHOSTUNREACH; complete(&info->ri_done); break; case RDMA_CM_EVENT_ROUTE_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; info->ri_rc = -ENETUNREACH; complete(&info->ri_done); break; @@ -285,7 +324,8 @@ static int smbd_conn_upcall( min_t(u8, info->responder_resources, peer_responder_resources); - sc->status = SMBDIRECT_SOCKET_CONNECTED; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; wake_up_interruptible(&info->status_wait); break; @@ -293,7 +333,8 @@ static int smbd_conn_upcall( case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); - sc->status = 
SMBDIRECT_SOCKET_DISCONNECTED; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; wake_up_interruptible(&info->status_wait); break; @@ -565,6 +606,12 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) info->negotiate_done = process_negotiation_response(response, wc->byte_len); put_receive_buffer(info, response); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING); + if (!info->negotiate_done) + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; + else + sc->status = SMBDIRECT_SOCKET_CONNECTED; + complete(&info->negotiate_completion); return; @@ -655,6 +702,7 @@ static struct rdma_cm_id *smbd_create_id( struct smbd_connection *info, struct sockaddr *dstaddr, int port) { + struct smbdirect_socket *sc = &info->socket; struct rdma_cm_id *id; int rc; __be16 *sport; @@ -677,6 +725,8 @@ static struct rdma_cm_id *smbd_create_id( init_completion(&info->ri_done); info->ri_rc = -ETIMEDOUT; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING; rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr, RDMA_RESOLVE_TIMEOUT); if (rc) { @@ -697,6 +747,8 @@ static struct rdma_cm_id *smbd_create_id( } info->ri_rc = -ETIMEDOUT; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) { log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); @@ -743,6 +795,9 @@ static int smbd_ia_open( struct smbdirect_socket *sc = &info->socket; int rc; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED; + sc->rdma.cm_id = smbd_create_id(info, dstaddr, port); if (IS_ERR(sc->rdma.cm_id)) { rc = PTR_ERR(sc->rdma.cm_id); @@ -1184,6 +1239,9 @@ static int smbd_negotiate(struct smbd_connection *info) int rc; struct smbdirect_recv_io *response = get_receive_buffer(info); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; + sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP; rc = smbd_post_recv(info, response); log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", @@ -1643,7 +1701,7 @@ static struct smbd_connection *_smbd_get_connection( info->initiator_depth = 1; info->responder_resources = SMBD_CM_RESPONDER_RESOURCES; - sc->status = SMBDIRECT_SOCKET_CONNECTING; + sc->status = SMBDIRECT_SOCKET_CREATED; rc = smbd_ia_open(info, dstaddr, port); if (rc) { log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); @@ -1755,6 +1813,9 @@ static struct smbd_connection *_smbd_get_connection( init_waitqueue_head(&info->status_wait); init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); + + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; rc = rdma_connect(sc->rdma.cm_id, &conn_param); if (rc) { log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc); @@ -1763,10 +1824,10 @@ static struct smbd_connection *_smbd_get_connection( wait_event_interruptible_timeout( info->status_wait, - sc->status != SMBDIRECT_SOCKET_CONNECTING, + sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) { log_rdma_event(ERR, "rdma_connect failed port=%d\n", port); goto rdma_connect_failed; } diff --git 
a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index 14f6e6d2035fb0..ced8c9f20cf55f 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -10,9 +10,19 @@ enum smbdirect_socket_status { SMBDIRECT_SOCKET_CREATED, - SMBDIRECT_SOCKET_CONNECTING, - SMBDIRECT_SOCKET_CONNECTED, + SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED, + SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING, + SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED, + SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED, + SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING, + SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED, + SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED, + SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING, + SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED, + SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, + SMBDIRECT_SOCKET_NEGOTIATE_RUNNING, SMBDIRECT_SOCKET_NEGOTIATE_FAILED, + SMBDIRECT_SOCKET_CONNECTED, SMBDIRECT_SOCKET_DISCONNECTING, SMBDIRECT_SOCKET_DISCONNECTED, SMBDIRECT_SOCKET_DESTROYED @@ -24,12 +34,32 @@ const char *smbdirect_socket_status_string(enum smbdirect_socket_status status) switch (status) { case SMBDIRECT_SOCKET_CREATED: return "CREATED"; - case SMBDIRECT_SOCKET_CONNECTING: - return "CONNECTING"; - case SMBDIRECT_SOCKET_CONNECTED: - return "CONNECTED"; + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + return "RESOLVE_ADDR_NEEDED"; + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + return "RESOLVE_ADDR_RUNNING"; + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + return "RESOLVE_ADDR_FAILED"; + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + return "RESOLVE_ROUTE_NEEDED"; + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + return "RESOLVE_ROUTE_RUNNING"; + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + return "RESOLVE_ROUTE_FAILED"; + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + return "RDMA_CONNECT_NEEDED"; + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + return "RDMA_CONNECT_RUNNING"; + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + return "RDMA_CONNECT_FAILED"; + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + return "NEGOTIATE_NEEDED"; + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + return "NEGOTIATE_RUNNING"; case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: return "NEGOTIATE_FAILED"; + case SMBDIRECT_SOCKET_CONNECTED: + return "CONNECTED"; case SMBDIRECT_SOCKET_DISCONNECTING: return "DISCONNECTING"; case SMBDIRECT_SOCKET_DISCONNECTED: From cc678b8a89291ef55c8dfc01391506f1234c6cae Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 28 Aug 2025 10:39:56 +0200 Subject: [PATCH 2990/3206] smb: client/smbdirect: introduce SMBDIRECT_SOCKET_ERROR This will be used to turn SMBDIRECT_SOCKET_CONNECTED into an error within smbd_disconnect_rdma_connection() on the client and smb_direct_disconnect_rdma_connection() on the server. We do this in a single commit with the client as otherwise it won't build because the enum value is not handled in the switch statement. 
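For illustration, a hypothetical sketch of the planned transition (the real disconnect
functions are adapted in follow-up patches):

    /* on a fatal completion error: mark the socket failed, then tear down */
    if (sc->status == SMBDIRECT_SOCKET_CONNECTED)
        sc->status = SMBDIRECT_SOCKET_ERROR;
    queue_work(sc->workqueue, &sc->disconnect_work);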
Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/client/smbdirect.c                  | 1 +
 fs/smb/common/smbdirect/smbdirect_socket.h | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index e4abc1a9564ada..09f8b12dd4f109 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -166,6 +166,7 @@ static void smbd_disconnect_rdma_work(struct work_struct *work)
 	case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
 	case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
 	case SMBDIRECT_SOCKET_CONNECTED:
+	case SMBDIRECT_SOCKET_ERROR:
 		sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
 		rdma_disconnect(sc->rdma.cm_id);
 		break;
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index ced8c9f20cf55f..5e25abc0236490 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -23,6 +23,7 @@ enum smbdirect_socket_status {
 	SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
 	SMBDIRECT_SOCKET_NEGOTIATE_FAILED,
 	SMBDIRECT_SOCKET_CONNECTED,
+	SMBDIRECT_SOCKET_ERROR,
 	SMBDIRECT_SOCKET_DISCONNECTING,
 	SMBDIRECT_SOCKET_DISCONNECTED,
 	SMBDIRECT_SOCKET_DESTROYED
@@ -60,6 +61,8 @@ const char *smbdirect_socket_status_string(enum smbdirect_socket_status status)
 		return "NEGOTIATE_FAILED";
 	case SMBDIRECT_SOCKET_CONNECTED:
 		return "CONNECTED";
+	case SMBDIRECT_SOCKET_ERROR:
+		return "ERROR";
 	case SMBDIRECT_SOCKET_DISCONNECTING:
 		return "DISCONNECTING";
 	case SMBDIRECT_SOCKET_DISCONNECTED:

From f5b893edde7bcd15246dfeefd25082fe637db647 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Thu, 28 Aug 2025 12:11:02 +0200
Subject: [PATCH 2991/3206] smb: smbdirect: introduce smbdirect_socket.first_error

This will be used when a connected connection gets the first error.

Cc: Steve French
Cc: Tom Talpey
Cc: Long Li
Acked-by: Namjae Jeon
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Steve French
---
 fs/smb/common/smbdirect/smbdirect_socket.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 5e25abc0236490..db22a1d0546b4e 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -83,6 +83,7 @@ enum smbdirect_keepalive_status {
 struct smbdirect_socket {
 	enum smbdirect_socket_status status;
 	wait_queue_head_t status_wait;
+	int first_error;

 	/*
 	 * This points to the workqueue to

From afff34dc025b5f0476645f44b4d9cbc83dd54e71 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Mon, 8 Sep 2025 12:14:43 +0200
Subject: [PATCH 2992/3206] smb: client: adjust smbdirect related output of
 cifs_debug_data_proc_show()

For the reader it is not obvious that the counter values are displayed
in hex as there's no leading '0x' before '%x'. So change them to %u
instead and add '0x' for non-counter values and also a string for the
status.

Note this generates some checkpatch warnings like this:

WARNING: quoted string split across lines
#45: FILE: fs/smb/client/cifs_debug.c:460:
+	seq_printf(m, "\nSMBDirect protocol version: 0x%x "
+		"transport status: %s (%u)",

But I'll leave it like this, as it is the current style in the
related code...
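For illustration, the readability problem the format change addresses (plain C printf
semantics, not code from the patch):

    seq_printf(m, "count: %x", 26);     /* prints "count: 1a" - reads like a typo */
    seq_printf(m, "count: %u", 26);     /* prints "count: 26" */
    seq_printf(m, "mr_type: 0x%x", 26); /* prints "mr_type: 0x1a" - clearly hex */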
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 47 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 2337cf795db3f8..265e063e95bd06 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -456,55 +456,56 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) sc = &server->smbd_conn->socket; sp = &sc->parameters; - seq_printf(m, "\nSMBDirect (in hex) protocol version: %x " - "transport status: %x", + seq_printf(m, "\nSMBDirect protocol version: 0x%x " + "transport status: %s (%u)", server->smbd_conn->protocol, - server->smbd_conn->socket.status); - seq_printf(m, "\nConn receive_credit_max: %x " - "send_credit_target: %x max_send_size: %x", + smbdirect_socket_status_string(sc->status), + sc->status); + seq_printf(m, "\nConn receive_credit_max: %u " + "send_credit_target: %u max_send_size: %u", sp->recv_credit_max, sp->send_credit_target, sp->max_send_size); - seq_printf(m, "\nConn max_fragmented_recv_size: %x " - "max_fragmented_send_size: %x max_receive_size:%x", + seq_printf(m, "\nConn max_fragmented_recv_size: %u " + "max_fragmented_send_size: %u max_receive_size:%u", sp->max_fragmented_recv_size, sp->max_fragmented_send_size, sp->max_recv_size); - seq_printf(m, "\nConn keep_alive_interval: %x " - "max_readwrite_size: %x rdma_readwrite_threshold: %x", + seq_printf(m, "\nConn keep_alive_interval: %u " + "max_readwrite_size: %u rdma_readwrite_threshold: %u", sp->keepalive_interval_msec * 1000, sp->max_read_write_size, server->smbd_conn->rdma_readwrite_threshold); - seq_printf(m, "\nDebug count_get_receive_buffer: %x " - "count_put_receive_buffer: %x count_send_empty: %x", + seq_printf(m, "\nDebug count_get_receive_buffer: %u " + "count_put_receive_buffer: %u count_send_empty: %u", server->smbd_conn->count_get_receive_buffer, server->smbd_conn->count_put_receive_buffer, server->smbd_conn->count_send_empty); - seq_printf(m, "\nRead Queue count_reassembly_queue: %x " - "count_enqueue_reassembly_queue: %x " - "count_dequeue_reassembly_queue: %x " - "reassembly_data_length: %x " - "reassembly_queue_length: %x", + seq_printf(m, "\nRead Queue count_reassembly_queue: %u " + "count_enqueue_reassembly_queue: %u " + "count_dequeue_reassembly_queue: %u " + "reassembly_data_length: %u " + "reassembly_queue_length: %u", server->smbd_conn->count_reassembly_queue, server->smbd_conn->count_enqueue_reassembly_queue, server->smbd_conn->count_dequeue_reassembly_queue, sc->recv_io.reassembly.data_length, sc->recv_io.reassembly.queue_length); - seq_printf(m, "\nCurrent Credits send_credits: %x " - "receive_credits: %x receive_credit_target: %x", + seq_printf(m, "\nCurrent Credits send_credits: %u " + "receive_credits: %u receive_credit_target: %u", atomic_read(&server->smbd_conn->send_credits), atomic_read(&server->smbd_conn->receive_credits), server->smbd_conn->receive_credit_target); - seq_printf(m, "\nPending send_pending: %x ", + seq_printf(m, "\nPending send_pending: %u ", atomic_read(&server->smbd_conn->send_pending)); - seq_printf(m, "\nReceive buffers count_receive_queue: %x ", + seq_printf(m, "\nReceive buffers count_receive_queue: %u ", server->smbd_conn->count_receive_queue); - seq_printf(m, "\nMR responder_resources: %x " - "max_frmr_depth: %x mr_type: %x", + seq_printf(m, "\nMR 
responder_resources: %u " + "max_frmr_depth: %u mr_type: 0x%x", server->smbd_conn->responder_resources, server->smbd_conn->max_frmr_depth, server->smbd_conn->mr_type); - seq_printf(m, "\nMR mr_ready_count: %x mr_used_count: %x", + seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u", atomic_read(&server->smbd_conn->mr_ready_count), atomic_read(&server->smbd_conn->mr_used_count)); skip_rdma: From 00e4c7a87d1fa8d5668e2922578ff4aff6efb8a2 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 18:48:14 +0200 Subject: [PATCH 2993/3206] smb: client: use status_wait and SMBDIRECT_SOCKET_NEGOTIATE_RUNNING for completion We can use the state change from SMBDIRECT_SOCKET_NEGOTIATE_RUNNING to SMBDIRECT_SOCKET_CONNECTED or SMBDIRECT_SOCKET_NEGOTIATE_FAILED in order to notify the caller if the negotiation is over. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 19 ++++++++++--------- fs/smb/client/smbdirect.h | 3 --- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 09f8b12dd4f109..cfb45bc9937666 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -582,6 +582,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) u32 data_offset = 0; u32 data_length = 0; u32 remaining_data_length = 0; + bool negotiate_done = false; log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", response, sc->recv_io.expected, wc->status, wc->opcode, @@ -604,16 +605,16 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) case SMBDIRECT_EXPECT_NEGOTIATE_REP: dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response)); sc->recv_io.reassembly.full_packet_received = true; - info->negotiate_done = + negotiate_done = process_negotiation_response(response, wc->byte_len); put_receive_buffer(info, response); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING); - if (!info->negotiate_done) + if (!negotiate_done) sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; else sc->status = SMBDIRECT_SOCKET_CONNECTED; - complete(&info->negotiate_completion); + wake_up_interruptible(&info->status_wait); return; /* SMBD data transfer packet */ @@ -1253,17 +1254,17 @@ static int smbd_negotiate(struct smbd_connection *info) return rc; } - init_completion(&info->negotiate_completion); - info->negotiate_done = false; rc = smbd_post_send_negotiate_req(info); if (rc) return rc; - rc = wait_for_completion_interruptible_timeout( - &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ); - log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc); + rc = wait_event_interruptible_timeout( + info->status_wait, + sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING, + secs_to_jiffies(SMBD_NEGOTIATE_TIMEOUT)); + log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc); - if (info->negotiate_done) + if (sc->status == SMBDIRECT_SOCKET_CONNECTED) return 0; if (rc == 0) diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 4ca9b2b2c57f93..c9b0c6b61e7e5c 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -49,9 +49,6 @@ struct smbd_connection { struct completion ri_done; wait_queue_head_t status_wait; - struct completion negotiate_completion; - bool negotiate_done; - struct work_struct disconnect_work; struct work_struct post_send_credits_work; From 
4ff3fa4e4aa78c3aa9083a8d448f8af4f0740df7 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 18:48:15 +0200 Subject: [PATCH 2994/3206] smb: client: use status_wait and SMBDIRECT_SOCKET_RESOLVE_{ADDR,ROUTE}_RUNNING for completion We can use the state change from SMBDIRECT_SOCKET_RESOLVE_{ADDR,ROUTE}_RUNNING to the next state in order to wake the caller to do the next step. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 49 ++++++++++++++++++++++----------------- fs/smb/client/smbdirect.h | 2 -- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index cfb45bc9937666..556beec3f430fd 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -217,31 +217,27 @@ static int smbd_conn_upcall( case RDMA_CM_EVENT_ADDR_RESOLVED: WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED; - info->ri_rc = 0; - complete(&info->ri_done); + wake_up_interruptible(&info->status_wait); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; - info->ri_rc = 0; - complete(&info->ri_done); + wake_up_interruptible(&info->status_wait); break; case RDMA_CM_EVENT_ADDR_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; - info->ri_rc = -EHOSTUNREACH; - complete(&info->ri_done); + wake_up_interruptible(&info->status_wait); break; case RDMA_CM_EVENT_ROUTE_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; - info->ri_rc = -ENETUNREACH; - complete(&info->ri_done); + wake_up_interruptible(&info->status_wait); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -724,9 +720,6 @@ static struct rdma_cm_id *smbd_create_id( *sport = htons(port); - init_completion(&info->ri_done); - info->ri_rc = -ETIMEDOUT; - WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED); sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING; rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr, @@ -735,20 +728,26 @@ static struct rdma_cm_id *smbd_create_id( log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc); goto out; } - rc = wait_for_completion_interruptible_timeout( - &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + rc = wait_event_interruptible_timeout( + info->status_wait, + sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); /* e.g. 
if interrupted returns -ERESTARTSYS */ if (rc < 0) { log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); goto out; } - rc = info->ri_rc; - if (rc) { + if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) { + rc = -ETIMEDOUT; + log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); + goto out; + } + if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) { + rc = -EHOSTUNREACH; log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); goto out; } - info->ri_rc = -ETIMEDOUT; WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED); sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); @@ -756,15 +755,22 @@ static struct rdma_cm_id *smbd_create_id( log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); goto out; } - rc = wait_for_completion_interruptible_timeout( - &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + rc = wait_event_interruptible_timeout( + info->status_wait, + sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); /* e.g. if interrupted returns -ERESTARTSYS */ if (rc < 0) { log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); goto out; } - rc = info->ri_rc; - if (rc) { + if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) { + rc = -ETIMEDOUT; + log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); + goto out; + } + if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) { + rc = -ENETUNREACH; log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); goto out; } @@ -1703,6 +1709,8 @@ static struct smbd_connection *_smbd_get_connection( info->initiator_depth = 1; info->responder_resources = SMBD_CM_RESPONDER_RESOURCES; + init_waitqueue_head(&info->status_wait); + sc->status = SMBDIRECT_SOCKET_CREATED; rc = smbd_ia_open(info, dstaddr, port); if (rc) { @@ -1813,7 +1821,6 @@ static struct smbd_connection *_smbd_get_connection( log_rdma_event(INFO, "connecting to IP %pI4 port %d\n", &addr_in->sin_addr, port); - init_waitqueue_head(&info->status_wait); init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index c9b0c6b61e7e5c..82b1d936e80084 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -45,8 +45,6 @@ enum keep_alive_status { struct smbd_connection { struct smbdirect_socket socket; - int ri_rc; - struct completion ri_done; wait_queue_head_t status_wait; struct work_struct disconnect_work; From d9140ef074239408a1d991819811fadbbe781c79 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 18:48:17 +0200 Subject: [PATCH 2995/3206] smb: client: make use of smbdirect_socket.status_wait This will allow us to have common helper functions soon. 
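As an aside, a minimal sketch of the kind of shared helper this state/waitqueue pairing enables; the helper name and exact shape are hypothetical and not part of this series:

static int smbdirect_wait_for_status_change(struct smbdirect_socket *sc,
					    enum smbdirect_socket_status waited,
					    unsigned long timeout)
{
	int rc;

	/* wait until the socket leaves the given state */
	rc = wait_event_interruptible_timeout(sc->status_wait,
					      sc->status != waited,
					      timeout);
	if (rc < 0)
		return rc; /* -ERESTARTSYS if interrupted */
	if (sc->status == waited)
		return -ETIMEDOUT; /* rc == 0: timed out, state unchanged */
	return 0;
}

Callers would then inspect sc->status to see whether the state machine moved on to the expected next state or to a *_FAILED state, exactly as the open-coded waits in the diff below do.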
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 32 ++++++++++++++++---------------- fs/smb/client/smbdirect.h | 2 -- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 556beec3f430fd..501c515ec04f47 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -217,27 +217,27 @@ static int smbd_conn_upcall( case RDMA_CM_EVENT_ADDR_RESOLVED: WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED; - wake_up_interruptible(&info->status_wait); + wake_up_interruptible(&sc->status_wait); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; - wake_up_interruptible(&info->status_wait); + wake_up_interruptible(&sc->status_wait); break; case RDMA_CM_EVENT_ADDR_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; - wake_up_interruptible(&info->status_wait); + wake_up_interruptible(&sc->status_wait); break; case RDMA_CM_EVENT_ROUTE_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; - wake_up_interruptible(&info->status_wait); + wake_up_interruptible(&sc->status_wait); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -323,7 +323,7 @@ static int smbd_conn_upcall( WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; - wake_up_interruptible(&info->status_wait); + wake_up_interruptible(&sc->status_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: @@ -332,7 +332,7 @@ static int smbd_conn_upcall( log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; - wake_up_interruptible(&info->status_wait); + wake_up_interruptible(&sc->status_wait); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: @@ -341,12 +341,12 @@ static int smbd_conn_upcall( if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) { log_rdma_event(ERR, "event=%s during negotiation\n", event_name); sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up(&info->status_wait); + wake_up(&sc->status_wait); break; } sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_interruptible(&info->status_wait); + wake_up_interruptible(&sc->status_wait); wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); wake_up_interruptible_all(&info->wait_send_queue); break; @@ -610,7 +610,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) else sc->status = SMBDIRECT_SOCKET_CONNECTED; - wake_up_interruptible(&info->status_wait); + wake_up_interruptible(&sc->status_wait); return; /* SMBD data transfer packet */ @@ -729,7 +729,7 @@ static struct rdma_cm_id *smbd_create_id( goto out; } rc = wait_event_interruptible_timeout( - info->status_wait, + sc->status_wait, sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); /* e.g. 
if interrupted returns -ERESTARTSYS */ @@ -756,7 +756,7 @@ static struct rdma_cm_id *smbd_create_id( goto out; } rc = wait_event_interruptible_timeout( - info->status_wait, + sc->status_wait, sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); /* e.g. if interrupted returns -ERESTARTSYS */ @@ -1265,7 +1265,7 @@ static int smbd_negotiate(struct smbd_connection *info) return rc; rc = wait_event_interruptible_timeout( - info->status_wait, + sc->status_wait, sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING, secs_to_jiffies(SMBD_NEGOTIATE_TIMEOUT)); log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc); @@ -1492,7 +1492,7 @@ void smbd_destroy(struct TCP_Server_Info *server) rdma_disconnect(sc->rdma.cm_id); log_rdma_event(INFO, "wait for transport being disconnected\n"); wait_event_interruptible( - info->status_wait, + sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } @@ -1709,7 +1709,7 @@ static struct smbd_connection *_smbd_get_connection( info->initiator_depth = 1; info->responder_resources = SMBD_CM_RESPONDER_RESOURCES; - init_waitqueue_head(&info->status_wait); + init_waitqueue_head(&sc->status_wait); sc->status = SMBDIRECT_SOCKET_CREATED; rc = smbd_ia_open(info, dstaddr, port); @@ -1832,7 +1832,7 @@ static struct smbd_connection *_smbd_get_connection( } wait_event_interruptible_timeout( - info->status_wait, + sc->status_wait, sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); @@ -1889,7 +1889,7 @@ static struct smbd_connection *_smbd_get_connection( destroy_caches_and_workqueue(info); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; rdma_disconnect(sc->rdma.cm_id); - wait_event(info->status_wait, + wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); allocate_cache_failed: diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 82b1d936e80084..f250241d2d24bb 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -45,8 +45,6 @@ enum keep_alive_status { struct smbd_connection { struct smbdirect_socket socket; - wait_queue_head_t status_wait; - struct work_struct disconnect_work; struct work_struct post_send_credits_work; From 5a0d5ae65c121decfac03c68a20343e75f636957 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 10:06:50 +0200 Subject: [PATCH 2996/3206] smb: client: make only use of wake_up[_all]() in smbdirect.c wake_up_interruptible[_all]() doesn't wake up tasks waiting with wait_event(). So we had better use wake_up[_all]() in order to wake up all tasks and to simplify the logic. As we currently don't use any wait_event_*_exclusive() it doesn't really matter whether we use wake_up() or wake_up_all(). But in this patch I try to use wake_up() for expected situations and wake_up_all() for situations where the connection is broken. That way we won't need to adjust things in the future when we may use wait_event_*_exclusive() in order to wake up only one process that should make progress. Whether to keep wait_event(), wait_event_interruptible() and wait_event_interruptible_timeout(), or to change them to wait_event_killable(), wait_event_killable_timeout() or wait_event_killable_exclusive(), is something to think about in a future patch. The goal here is to avoid tasks that are never woken and therefore freeze forever. Also note that this patch only changes the existing wake_up*() calls. Adding more wake_up*() calls for other wait queues is also deferred to a future patch. 
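The rule this is based on, in a minimal illustrative sketch: wait_event() sleeps in TASK_UNINTERRUPTIBLE, wake_up_interruptible() only wakes TASK_INTERRUPTIBLE sleepers, and plain wake_up() uses TASK_NORMAL, which covers both:

	/* waiter side: sleeps in TASK_UNINTERRUPTIBLE, ignores signals */
	wait_event(sc->status_wait,
		   sc->status == SMBDIRECT_SOCKET_DISCONNECTED);

	/*
	 * waker side: wake_up_interruptible(&sc->status_wait) would never
	 * wake the waiter above; wake_up_all(&sc->status_wait) wakes both
	 * interruptible and uninterruptible sleepers.
	 */
	wake_up_all(&sc->status_wait);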
Link: https://lore.kernel.org/linux-cifs/13851363-0dc9-465c-9ced-3ede4904eef0@samba.org/T/#t Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 40 ++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 501c515ec04f47..82d36fb248fd4f 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -217,27 +217,27 @@ static int smbd_conn_upcall( case RDMA_CM_EVENT_ADDR_RESOLVED: WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED; - wake_up_interruptible(&sc->status_wait); + wake_up(&sc->status_wait); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; - wake_up_interruptible(&sc->status_wait); + wake_up(&sc->status_wait); break; case RDMA_CM_EVENT_ADDR_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; - wake_up_interruptible(&sc->status_wait); + wake_up_all(&sc->status_wait); break; case RDMA_CM_EVENT_ROUTE_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; - wake_up_interruptible(&sc->status_wait); + wake_up_all(&sc->status_wait); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -323,7 +323,7 @@ static int smbd_conn_upcall( WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; - wake_up_interruptible(&sc->status_wait); + wake_up(&sc->status_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: @@ -332,7 +332,7 @@ static int smbd_conn_upcall( log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; - wake_up_interruptible(&sc->status_wait); + wake_up_all(&sc->status_wait); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: @@ -341,14 +341,14 @@ static int smbd_conn_upcall( if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) { log_rdma_event(ERR, "event=%s during negotiation\n", event_name); sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up(&sc->status_wait); + wake_up_all(&sc->status_wait); break; } sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_interruptible(&sc->status_wait); - wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); - wake_up_interruptible_all(&info->wait_send_queue); + wake_up_all(&sc->status_wait); + wake_up_all(&sc->recv_io.reassembly.wait_queue); + wake_up_all(&info->wait_send_queue); break; default: @@ -525,7 +525,7 @@ static void smbd_post_send_credits(struct work_struct *work) struct smbdirect_socket *sc = &info->socket; if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { - wake_up(&info->wait_receive_queues); + wake_up_all(&info->wait_receive_queues); return; } @@ -605,12 +605,14 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) process_negotiation_response(response, wc->byte_len); put_receive_buffer(info, response); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING); - if (!negotiate_done) + if (!negotiate_done) { sc->status = 
SMBDIRECT_SOCKET_NEGOTIATE_FAILED; - else + wake_up_all(&sc->status_wait); + } else { sc->status = SMBDIRECT_SOCKET_CONNECTED; + wake_up(&sc->status_wait); + } - wake_up_interruptible(&sc->status_wait); return; /* SMBD data transfer packet */ @@ -653,7 +655,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) * We have new send credits granted from remote peer * If any sender is waiting for credits, unblock it */ - wake_up_interruptible(&info->wait_send_queue); + wake_up(&info->wait_send_queue); } log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n", @@ -675,7 +677,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) */ if (data_length) { enqueue_reassembly(info, response, data_length); - wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); + wake_up(&sc->recv_io.reassembly.wait_queue); } else put_receive_buffer(info, response); @@ -1536,7 +1538,7 @@ void smbd_destroy(struct TCP_Server_Info *server) * path when sending data, and then release memory registrations. */ log_rdma_event(INFO, "freeing mr list\n"); - wake_up_interruptible_all(&info->wait_mr); + wake_up_all(&info->wait_mr); while (atomic_read(&info->mr_used_count)) { cifs_server_unlock(server); msleep(1000); @@ -2235,7 +2237,7 @@ static void smbd_mr_recovery_work(struct work_struct *work) * get_mr() from the I/O issuing CPUs */ if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up_interruptible(&info->wait_mr); + wake_up(&info->wait_mr); } } @@ -2546,7 +2548,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) smbdirect_mr->dir); smbdirect_mr->state = MR_READY; if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up_interruptible(&info->wait_mr); + wake_up(&info->wait_mr); } else /* * Schedule the work to do MR recovery for future I/Os MR From 7360778b6f962bd87472238204b5cf76d98e8122 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 15:10:52 +0200 Subject: [PATCH 2997/3206] smb: client: make use of smbdirect_socket_init() It's much safer to initialize the whole structure in one place at the beginning than to do it piecemeal all over the place and risk forgetting to move an initialization when the code changes. 
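As a rough sketch of the shape such a helper takes (the real smbdirect_socket_init() lives in the shared smbdirect code and may differ), it gathers the initializations that the hunks below remove from the callers:

static void smbdirect_socket_init(struct smbdirect_socket *sc)
{
	memset(sc, 0, sizeof(*sc));

	sc->status = SMBDIRECT_SOCKET_CREATED;
	init_waitqueue_head(&sc->status_wait);

	INIT_LIST_HEAD(&sc->recv_io.reassembly.list);
	spin_lock_init(&sc->recv_io.reassembly.lock);
	init_waitqueue_head(&sc->recv_io.reassembly.wait_queue);

	INIT_LIST_HEAD(&sc->recv_io.free.list);
	spin_lock_init(&sc->recv_io.free.lock);
}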
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 82d36fb248fd4f..fcef8f879bb364 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1398,13 +1398,6 @@ static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) struct smbdirect_recv_io *response; int i; - INIT_LIST_HEAD(&sc->recv_io.reassembly.list); - spin_lock_init(&sc->recv_io.reassembly.lock); - sc->recv_io.reassembly.data_length = 0; - sc->recv_io.reassembly.queue_length = 0; - - INIT_LIST_HEAD(&sc->recv_io.free.list); - spin_lock_init(&sc->recv_io.free.lock); info->count_receive_queue = 0; init_waitqueue_head(&info->wait_receive_queues); @@ -1706,14 +1699,12 @@ static struct smbd_connection *_smbd_get_connection( if (!info) return NULL; sc = &info->socket; + smbdirect_socket_init(sc); sp = &sc->parameters; info->initiator_depth = 1; info->responder_resources = SMBD_CM_RESPONDER_RESOURCES; - init_waitqueue_head(&sc->status_wait); - - sc->status = SMBDIRECT_SOCKET_CREATED; rc = smbd_ia_open(info, dstaddr, port); if (rc) { log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); @@ -1823,8 +1814,6 @@ static struct smbd_connection *_smbd_get_connection( log_rdma_event(INFO, "connecting to IP %pI4 port %d\n", &addr_in->sin_addr, port); - init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); - WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; rc = rdma_connect(sc->rdma.cm_id, &conn_param); From a51c67db2c9e3f1539f5c441d8db2058ed8644f9 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 19:06:49 +0200 Subject: [PATCH 2998/3206] smb: client: make use of smbdirect_socket.disconnect_work Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 12 +++++++----- fs/smb/client/smbdirect.h | 1 - 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index fcef8f879bb364..89375f3bd30a08 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -157,9 +157,8 @@ do { \ static void smbd_disconnect_rdma_work(struct work_struct *work) { - struct smbd_connection *info = - container_of(work, struct smbd_connection, disconnect_work); - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, disconnect_work); switch (sc->status) { case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: @@ -197,7 +196,9 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) static void smbd_disconnect_rdma_connection(struct smbd_connection *info) { - queue_work(info->workqueue, &info->disconnect_work); + struct smbdirect_socket *sc = &info->socket; + + queue_work(info->workqueue, &sc->disconnect_work); } /* Upcall from RDMA CM */ @@ -1705,6 +1706,8 @@ static struct smbd_connection *_smbd_get_connection( info->initiator_depth = 1; info->responder_resources = SMBD_CM_RESPONDER_RESOURCES; + INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work); + rc = smbd_ia_open(info, dstaddr, port); if (rc) { log_rdma_event(INFO, "smbd_ia_open rc=%d\n", 
rc); @@ -1850,7 +1853,6 @@ static struct smbd_connection *_smbd_get_connection( init_waitqueue_head(&info->wait_post_send); - INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work); INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); info->new_credits_offered = 0; spin_lock_init(&info->lock_new_credits_offered); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index f250241d2d24bb..1c63188664df95 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -45,7 +45,6 @@ enum keep_alive_status { struct smbd_connection { struct smbdirect_socket socket; - struct work_struct disconnect_work; struct work_struct post_send_credits_work; spinlock_t lock_new_credits_offered; From ca48841de93cbb911d2d85e84a329bbe0fac412d Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 11 Aug 2025 15:19:51 +0200 Subject: [PATCH 2999/3206] smb: client: make use of smbdirect_socket.send_io.pending.{count,{dec,zero}_wait_queue} This will be used by the server too and will allow to create common helper functions. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 +- fs/smb/client/smbdirect.c | 31 +++++++++++++------------------ fs/smb/client/smbdirect.h | 5 ----- 3 files changed, 14 insertions(+), 24 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 265e063e95bd06..d48438e6efdb79 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -497,7 +497,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) atomic_read(&server->smbd_conn->receive_credits), server->smbd_conn->receive_credit_target); seq_printf(m, "\nPending send_pending: %u ", - atomic_read(&server->smbd_conn->send_pending)); + atomic_read(&sc->send_io.pending.count)); seq_printf(m, "\nReceive buffers count_receive_queue: %u ", server->smbd_conn->count_receive_queue); seq_printf(m, "\nMR responder_resources: %u " diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 89375f3bd30a08..947fd301c70bb1 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -418,10 +418,10 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) return; } - if (atomic_dec_and_test(&info->send_pending)) - wake_up(&info->wait_send_pending); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); - wake_up(&info->wait_post_send); + wake_up(&sc->send_io.pending.dec_wait_queue); mempool_free(request, sc->send_io.mem.pool); } @@ -908,14 +908,14 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) request->sge[0].addr, request->sge[0].length, request->sge[0].lkey); - atomic_inc(&info->send_pending); + atomic_inc(&sc->send_io.pending.count); rc = ib_post_send(sc->ib.qp, &send_wr, NULL); if (!rc) return 0; /* if we reach here, post send failed */ log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); - atomic_dec(&info->send_pending); + atomic_dec(&sc->send_io.pending.count); ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr, request->sge[0].length, DMA_TO_DEVICE); @@ -1038,8 +1038,8 @@ static int smbd_post_send_iter(struct smbd_connection *info, } wait_send_queue: - wait_event(info->wait_post_send, - atomic_read(&info->send_pending) < sp->send_credit_target || + wait_event(sc->send_io.pending.dec_wait_queue, + atomic_read(&sc->send_io.pending.count) < 
sp->send_credit_target || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { @@ -1048,9 +1048,9 @@ static int smbd_post_send_iter(struct smbd_connection *info, goto err_wait_send_queue; } - if (unlikely(atomic_inc_return(&info->send_pending) > + if (unlikely(atomic_inc_return(&sc->send_io.pending.count) > sp->send_credit_target)) { - atomic_dec(&info->send_pending); + atomic_dec(&sc->send_io.pending.count); goto wait_send_queue; } @@ -1157,8 +1157,8 @@ static int smbd_post_send_iter(struct smbd_connection *info, atomic_sub(new_credits, &info->receive_credits); err_alloc: - if (atomic_dec_and_test(&info->send_pending)) - wake_up(&info->wait_send_pending); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); err_wait_send_queue: /* roll back send credits and pending */ @@ -1848,11 +1848,6 @@ static struct smbd_connection *_smbd_get_connection( queue_delayed_work(info->workqueue, &info->idle_timer_work, msecs_to_jiffies(sp->keepalive_interval_msec)); - init_waitqueue_head(&info->wait_send_pending); - atomic_set(&info->send_pending, 0); - - init_waitqueue_head(&info->wait_post_send); - INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); info->new_credits_offered = 0; spin_lock_init(&info->lock_new_credits_offered); @@ -2151,8 +2146,8 @@ int smbd_send(struct TCP_Server_Info *server, * that means all the I/Os have been out and we are good to return */ - wait_event(info->wait_send_pending, - atomic_read(&info->send_pending) == 0 || + wait_event(sc->send_io.pending.zero_wait_queue, + atomic_read(&sc->send_io.pending.count) == 0 || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0) diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 1c63188664df95..2ce6b7ab898a38 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -82,11 +82,6 @@ struct smbd_connection { /* Used by transport to wait until all MRs are returned */ wait_queue_head_t wait_for_mr_cleanup; - /* Activity accounting */ - atomic_t send_pending; - wait_queue_head_t wait_send_pending; - wait_queue_head_t wait_post_send; - /* Receive queue */ int count_receive_queue; wait_queue_head_t wait_receive_queues; From 9b1a6b7583cb71e101a93e518e2cbc32911a2b89 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 11 Aug 2025 17:11:08 +0200 Subject: [PATCH 3000/3206] smb: client: make use of smbdirect_socket.send_io.credits.{count,wait_queue} This will be used by the server too and will allow to create common helper functions. 
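For illustration, the credit-take pattern that now operates purely on struct smbdirect_socket and could become a shared helper; the helper name is hypothetical, and the loop mirrors the wait_credit logic in smbd_post_send_iter() in the diff below:

static int smbdirect_take_send_credit(struct smbdirect_socket *sc)
{
	int rc;

	for (;;) {
		rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
				atomic_read(&sc->send_io.credits.count) > 0 ||
				sc->status != SMBDIRECT_SOCKET_CONNECTED);
		if (rc)
			return rc;
		if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
			return -EAGAIN;
		if (atomic_dec_return(&sc->send_io.credits.count) >= 0)
			return 0;
		/* raced with another sender for the last credit: undo, retry */
		atomic_inc(&sc->send_io.credits.count);
	}
}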
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 +- fs/smb/client/smbdirect.c | 19 +++++++++---------- fs/smb/client/smbdirect.h | 3 --- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index d48438e6efdb79..01a582e172cbf9 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -493,7 +493,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) sc->recv_io.reassembly.queue_length); seq_printf(m, "\nCurrent Credits send_credits: %u " "receive_credits: %u receive_credit_target: %u", - atomic_read(&server->smbd_conn->send_credits), + atomic_read(&sc->send_io.credits.count), atomic_read(&server->smbd_conn->receive_credits), server->smbd_conn->receive_credit_target); seq_printf(m, "\nPending send_pending: %u ", diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 947fd301c70bb1..a4508f8b7499cc 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -349,7 +349,7 @@ static int smbd_conn_upcall( sc->status = SMBDIRECT_SOCKET_DISCONNECTED; wake_up_all(&sc->status_wait); wake_up_all(&sc->recv_io.reassembly.wait_queue); - wake_up_all(&info->wait_send_queue); + wake_up_all(&sc->send_io.credits.wait_queue); break; default: @@ -473,7 +473,7 @@ static bool process_negotiation_response( log_rdma_event(ERR, "error: credits_granted==0\n"); return false; } - atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted)); + atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted)); atomic_set(&info->receive_credits, 0); @@ -651,12 +651,12 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) le16_to_cpu(data_transfer->credits_requested); if (le16_to_cpu(data_transfer->credits_granted)) { atomic_add(le16_to_cpu(data_transfer->credits_granted), - &info->send_credits); + &sc->send_io.credits.count); /* * We have new send credits granted from remote peer * If any sender is waiting for credits, unblock it */ - wake_up(&info->wait_send_queue); + wake_up(&sc->send_io.credits.wait_queue); } log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n", @@ -1021,8 +1021,8 @@ static int smbd_post_send_iter(struct smbd_connection *info, wait_credit: /* Wait for send credits. 
A SMBD packet needs one credit */ - rc = wait_event_interruptible(info->wait_send_queue, - atomic_read(&info->send_credits) > 0 || + rc = wait_event_interruptible(sc->send_io.credits.wait_queue, + atomic_read(&sc->send_io.credits.count) > 0 || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) goto err_wait_credit; @@ -1032,8 +1032,8 @@ rc = -EAGAIN; goto err_wait_credit; } - if (unlikely(atomic_dec_return(&info->send_credits) < 0)) { - atomic_inc(&info->send_credits); + if (unlikely(atomic_dec_return(&sc->send_io.credits.count) < 0)) { + atomic_inc(&sc->send_io.credits.count); goto wait_credit; } @@ -1162,7 +1162,7 @@ static int smbd_post_send_iter(struct smbd_connection *info, err_wait_send_queue: /* roll back send credits and pending */ - atomic_inc(&info->send_credits); + atomic_inc(&sc->send_io.credits.count); err_wait_credit: return rc; @@ -1843,7 +1843,6 @@ static struct smbd_connection *_smbd_get_connection( goto allocate_cache_failed; } - init_waitqueue_head(&info->wait_send_queue); INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); queue_delayed_work(info->workqueue, &info->idle_timer_work, msecs_to_jiffies(sp->keepalive_interval_msec)); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 2ce6b7ab898a38..7075bdc92193b9 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -53,7 +53,6 @@ struct smbd_connection { /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */ enum keep_alive_status keep_alive_requested; int protocol; - atomic_t send_credits; atomic_t receive_credits; int receive_credit_target; @@ -88,8 +87,6 @@ struct smbd_connection { bool send_immediate; - wait_queue_head_t wait_send_queue; - struct workqueue_struct *workqueue; struct delayed_work idle_timer_work; From b0aa92a229ab9e6de3839e06f3a8494bce5b1cd2 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 12 Aug 2025 09:10:07 +0200 Subject: [PATCH 3001/3206] smb: client: make sure smbd_disconnect_rdma_work() doesn't run after smbd_destroy() took over If we're already disconnecting we don't need to queue the disconnect_work again. disable_work() turns the next queue_work() into a no-op. Also let smbd_destroy() cancel (and disable) the queued disconnect_work and call smbd_disconnect_rdma_work() inline. This makes it more obvious that disconnect_work is never queued again after smbd_destroy() called smbd_disconnect_rdma_work(). It also means we have a single place to call rdma_disconnect(). While at it, we also disable all other [delayed_]work. 
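For reference, the workqueue semantics relied on here, in a short sketch (illustrative; see include/linux/workqueue.h for the authoritative API): disable_work() marks the item disabled so any later queue_work() on it is a no-op, without waiting for a running instance, while disable_work_sync() additionally waits for a running instance to finish:

	/* from the work handler itself: must not block */
	disable_work(&sc->disconnect_work);

	/* from smbd_destroy(): may block, then run the handler inline */
	disable_work_sync(&sc->disconnect_work);
	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
		smbd_disconnect_rdma_work(&sc->disconnect_work);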
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index a4508f8b7499cc..44140fb5f2ad32 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -159,6 +159,18 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) { struct smbdirect_socket *sc = container_of(work, struct smbdirect_socket, disconnect_work); + struct smbd_connection *info = + container_of(sc, struct smbd_connection, socket); + + /* + * make sure this and other work is not queued again + * but here we don't block and avoid + * disable[_delayed]_work_sync() + */ + disable_work(&sc->disconnect_work); + disable_work(&info->post_send_credits_work); + disable_work(&info->mr_recovery_work); + disable_delayed_work(&info->idle_timer_work); switch (sc->status) { case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: @@ -342,11 +354,13 @@ static int smbd_conn_upcall( if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) { log_rdma_event(ERR, "event=%s during negotiation\n", event_name); sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + smbd_disconnect_rdma_work(&sc->disconnect_work); wake_up_all(&sc->status_wait); break; } sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + smbd_disconnect_rdma_work(&sc->disconnect_work); wake_up_all(&sc->status_wait); wake_up_all(&sc->recv_io.reassembly.wait_queue); wake_up_all(&sc->send_io.credits.wait_queue); @@ -1483,9 +1497,12 @@ void smbd_destroy(struct TCP_Server_Info *server) sc = &info->socket; sp = &sc->parameters; + log_rdma_event(INFO, "cancelling and disable disconnect_work\n"); + disable_work_sync(&sc->disconnect_work); + log_rdma_event(INFO, "destroying rdma session\n"); - if (sc->status != SMBDIRECT_SOCKET_DISCONNECTED) { - rdma_disconnect(sc->rdma.cm_id); + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { + smbd_disconnect_rdma_work(&sc->disconnect_work); log_rdma_event(INFO, "wait for transport being disconnected\n"); wait_event_interruptible( sc->status_wait, From 02548c477a90481c1fd0d6e7c84b4504ec2fcc12 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 11 Aug 2025 17:53:55 +0200 Subject: [PATCH 3002/3206] smb: client: queue post_recv_credits_work also if the peer raises the credit target This is already handled in the server, but currently it is done in a very complex way there. Here we do it in a much simpler way. Note that put_receive_buffer() will take care of it in case data_length is 0. 
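To spell out the two receive-completion paths (an annotated recap of the hunk below, not new behaviour): for an empty keep-alive packet the buffer is returned via put_receive_buffer(), which already queues post_send_credits_work; for a data packet the buffer moves to the reassembly queue instead, so a raised credit target has to queue the refill work explicitly:

	if (data_length) {
		/* buffer goes to reassembly, put_receive_buffer() won't run */
		if (info->receive_credit_target > old_recv_credit_target)
			queue_work(info->workqueue,
				   &info->post_send_credits_work);
		enqueue_reassembly(info, response, data_length);
	} else {
		/* put_receive_buffer() queues post_send_credits_work */
		put_receive_buffer(info, response);
	}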
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 44140fb5f2ad32..d8a8382624ada0 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -590,6 +590,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbd_connection *info = container_of(sc, struct smbd_connection, socket); + int old_recv_credit_target; u32 data_offset = 0; u32 data_length = 0; u32 remaining_data_length = 0; @@ -661,6 +662,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } atomic_dec(&info->receive_credits); + old_recv_credit_target = info->receive_credit_target; info->receive_credit_target = le16_to_cpu(data_transfer->credits_requested); if (le16_to_cpu(data_transfer->credits_granted)) { @@ -691,6 +693,9 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) * reassembly queue and wake up the reading thread */ if (data_length) { + if (info->receive_credit_target > old_recv_credit_target) + queue_work(info->workqueue, &info->post_send_credits_work); + enqueue_reassembly(info, response, data_length); wake_up(&sc->recv_io.reassembly.wait_queue); } else From a8e970358b31a5abba8b5737a67ba7b8d26f4258 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 12 Aug 2025 09:44:07 +0200 Subject: [PATCH 3003/3206] smb: client: make use of ib_wc_status_msg() and skip IB_WC_WR_FLUSH_ERR logging There's no need to log a message for every IB_WC_WR_FLUSH_ERR completion, but any other error should be logged at level ERR. 
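Background, for illustration: once a QP is moved to the error state, e.g. by a disconnect or by ib_drain_qp(), every still-posted work request completes with IB_WC_WR_FLUSH_ERR, so logging each of those at ERR level would flood the log on every ordinary teardown. The resulting pattern (expected_opcode stands in for IB_WC_SEND or IB_WC_RECV):

	if (wc->status != IB_WC_SUCCESS || wc->opcode != expected_opcode) {
		/* flush errors are the expected result of draining the QP */
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			log_rdma_recv(ERR, "wc->status=%s opcode=%d\n",
				      ib_wc_status_msg(wc->status), wc->opcode);
		goto error;
	}

where ib_wc_status_msg() turns the numeric status into a human-readable string.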
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index d8a8382624ada0..0ec09841362f5c 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -415,8 +415,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) struct smbd_connection *info = container_of(sc, struct smbd_connection, socket); - log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%d\n", - request, wc->status); + log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n", + request, ib_wc_status_msg(wc->status)); for (i = 0; i < request->num_sge; i++) ib_dma_unmap_single(sc->ib.dev, @@ -425,8 +425,9 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) DMA_TO_DEVICE); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { - log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n", - wc->status, wc->opcode); + if (wc->status != IB_WC_WR_FLUSH_ERR) + log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n", + ib_wc_status_msg(wc->status), wc->opcode); mempool_free(request, sc->send_io.mem.pool); smbd_disconnect_rdma_connection(info); return; @@ -596,13 +597,16 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) u32 remaining_data_length = 0; bool negotiate_done = false; - log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", - response, sc->recv_io.expected, wc->status, wc->opcode, + log_rdma_recv(INFO, + "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n", + response, sc->recv_io.expected, + ib_wc_status_msg(wc->status), wc->opcode, wc->byte_len, wc->pkey_index); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { - log_rdma_recv(INFO, "wc->status=%d opcode=%d\n", - wc->status, wc->opcode); + if (wc->status != IB_WC_WR_FLUSH_ERR) + log_rdma_recv(ERR, "wc->status=%s opcode=%d\n", + ib_wc_status_msg(wc->status), wc->opcode); goto error; } From 1a07031fdd56754747a27b48d555152e1daf19cd Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 12 Aug 2025 09:24:57 +0200 Subject: [PATCH 3004/3206] smb: client: remove info->wait_receive_queues handling in smbd_destroy() We already call ib_drain_qp() before this point, which is a synchronous operation that only returns once the queues are fully drained. ib_drain_qp() completes pending requests with IB_WC_WR_FLUSH_ERR, so put_receive_buffer() has already been called for them. So all smbdirect_recv_io objects are either in the smbdirect_socket.recv_io.free.list or smbdirect_socket.recv_io.reassembly.list. Then we explicitly iterate smbdirect_socket.recv_io.reassembly.list and call put_receive_buffer(), so every object is in smbdirect_socket.recv_io.free.list. This means info->count_receive_queue == sp->recv_credit_max was already true and calling wait_event(info->wait_receive_queues... is pointless. 
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 -- fs/smb/client/smbdirect.c | 13 ------------- fs/smb/client/smbdirect.h | 4 ---- 3 files changed, 19 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 01a582e172cbf9..a40846500de6b8 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -498,8 +498,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) server->smbd_conn->receive_credit_target); seq_printf(m, "\nPending send_pending: %u ", atomic_read(&sc->send_io.pending.count)); - seq_printf(m, "\nReceive buffers count_receive_queue: %u ", - server->smbd_conn->count_receive_queue); seq_printf(m, "\nMR responder_resources: %u " "max_frmr_depth: %u mr_type: 0x%x", server->smbd_conn->responder_resources, diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 0ec09841362f5c..df032ed70af56b 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -541,7 +541,6 @@ static void smbd_post_send_credits(struct work_struct *work) struct smbdirect_socket *sc = &info->socket; if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { - wake_up_all(&info->wait_receive_queues); return; } @@ -1378,7 +1377,6 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info &sc->recv_io.free.list, struct smbdirect_recv_io, list); list_del(&ret->list); - info->count_receive_queue--; info->count_get_receive_buffer++; } spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); @@ -1408,7 +1406,6 @@ static void put_receive_buffer( spin_lock_irqsave(&sc->recv_io.free.lock, flags); list_add_tail(&response->list, &sc->recv_io.free.list); - info->count_receive_queue++; info->count_put_receive_buffer++; spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); @@ -1422,10 +1419,6 @@ static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) struct smbdirect_recv_io *response; int i; - info->count_receive_queue = 0; - - init_waitqueue_head(&info->wait_receive_queues); - for (i = 0; i < num_buf; i++) { response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL); if (!response) @@ -1434,7 +1427,6 @@ static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) response->socket = sc; response->sge.length = 0; list_add_tail(&response->list, &sc->recv_io.free.list); - info->count_receive_queue++; } return 0; @@ -1445,7 +1437,6 @@ static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) &sc->recv_io.free.list, struct smbdirect_recv_io, list); list_del(&response->list); - info->count_receive_queue--; mempool_free(response, sc->recv_io.mem.pool); } @@ -1495,7 +1486,6 @@ void smbd_destroy(struct TCP_Server_Info *server) { struct smbd_connection *info = server->smbd_conn; struct smbdirect_socket *sc; - struct smbdirect_socket_parameters *sp; struct smbdirect_recv_io *response; unsigned long flags; @@ -1504,7 +1494,6 @@ void smbd_destroy(struct TCP_Server_Info *server) return; } sc = &info->socket; - sp = &sc->parameters; log_rdma_event(INFO, "cancelling and disable disconnect_work\n"); disable_work_sync(&sc->disconnect_work); @@ -1546,8 +1535,6 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->recv_io.reassembly.data_length = 0; log_rdma_event(INFO, "free receive buffers\n"); - wait_event(info->wait_receive_queues, - info->count_receive_queue == sp->recv_credit_max); 
destroy_receive_buffers(info); /* diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 7075bdc92193b9..0b6e968c3d917d 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -81,10 +81,6 @@ struct smbd_connection { /* Used by transport to wait until all MRs are returned */ wait_queue_head_t wait_for_mr_cleanup; - /* Receive queue */ - int count_receive_queue; - wait_queue_head_t wait_receive_queues; - bool send_immediate; struct workqueue_struct *workqueue; From 9219f8cac296769324bbe8a28c289586114244c4 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 15:01:35 +0200 Subject: [PATCH 3005/3206] smb: client: limit the range of info->receive_credit_target This simplifies further changes... Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 7 ++++++- fs/smb/client/smbdirect.h | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index df032ed70af56b..92d4d75dc38d13 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -483,6 +483,7 @@ static bool process_negotiation_response( return false; } info->receive_credit_target = le16_to_cpu(packet->credits_requested); + info->receive_credit_target = min_t(u16, info->receive_credit_target, sp->recv_credit_max); if (packet->credits_granted == 0) { log_rdma_event(ERR, "error: credits_granted==0\n"); @@ -590,7 +591,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbd_connection *info = container_of(sc, struct smbd_connection, socket); - int old_recv_credit_target; + u16 old_recv_credit_target; u32 data_offset = 0; u32 data_length = 0; u32 remaining_data_length = 0; @@ -668,6 +669,10 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) old_recv_credit_target = info->receive_credit_target; info->receive_credit_target = le16_to_cpu(data_transfer->credits_requested); + info->receive_credit_target = + min_t(u16, info->receive_credit_target, sp->recv_credit_max); + info->receive_credit_target = + max_t(u16, info->receive_credit_target, 1); if (le16_to_cpu(data_transfer->credits_granted)) { atomic_add(le16_to_cpu(data_transfer->credits_granted), &sc->send_io.credits.count); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 0b6e968c3d917d..d5ccb92fb618bb 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -54,7 +54,7 @@ struct smbd_connection { enum keep_alive_status keep_alive_requested; int protocol; atomic_t receive_credits; - int receive_credit_target; + u16 receive_credit_target; /* Memory registrations */ /* Maximum number of RDMA read/write outstanding on this connection */ From 5fb9b459b3686e366640edd4e62805ef7b4de927 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 13:57:37 +0200 Subject: [PATCH 3006/3206] smb: client: count the number of posted recv_io messages in order to calculate credits (At least for me) the logic is much easier to understand when it maintains both the count of posted recv_io messages and the count of granted credits. From there we can easily calculate the number of new_credits we'll grant to the peer in outgoing send_io messages. This will simplify the move to common logic that can be shared between client and server in the following patches. 
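A worked example of the new credit arithmetic (illustrative numbers, mirroring manage_credits_prior_sending() in the diff below):

	/*
	 * receive_posted  = 16  receives currently posted
	 * receive_credits = 10  credits already granted to the peer
	 * target          = 16
	 *
	 * credits(10) < target(16)               -> we may grant more
	 * new_credits = posted(16) - credits(10) = 6
	 *
	 * The next send grants 6 credits and receive_credits becomes
	 * 16 == receive_posted: we only ever announce receives that
	 * are actually posted.
	 */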
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 30 ++++++++++++++---------------- fs/smb/client/smbdirect.h | 4 +--- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 92d4d75dc38d13..6332230ad4cab4 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -492,6 +492,7 @@ static bool process_negotiation_response( atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted)); atomic_set(&info->receive_credits, 0); + atomic_set(&info->receive_posted, 0); if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) { log_rdma_event(ERR, "error: preferred_send_size=%d\n", @@ -533,7 +534,6 @@ static bool process_negotiation_response( static void smbd_post_send_credits(struct work_struct *work) { - int ret = 0; int rc; struct smbdirect_recv_io *response; struct smbd_connection *info = @@ -561,14 +561,10 @@ static void smbd_post_send_credits(struct work_struct *work) break; } - ret++; + atomic_inc(&info->receive_posted); } } - spin_lock(&info->lock_new_credits_offered); - info->new_credits_offered += ret; - spin_unlock(&info->lock_new_credits_offered); - /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */ info->send_immediate = true; if (atomic_read(&info->receive_credits) < @@ -665,6 +661,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = true; } + atomic_dec(&info->receive_posted); atomic_dec(&info->receive_credits); old_recv_credit_target = info->receive_credit_target; info->receive_credit_target = @@ -965,10 +962,16 @@ static int manage_credits_prior_sending(struct smbd_connection *info) { int new_credits; - spin_lock(&info->lock_new_credits_offered); - new_credits = info->new_credits_offered; - info->new_credits_offered = 0; - spin_unlock(&info->lock_new_credits_offered); + if (atomic_read(&info->receive_credits) >= info->receive_credit_target) + return 0; + + new_credits = atomic_read(&info->receive_posted); + if (new_credits == 0) + return 0; + + new_credits -= atomic_read(&info->receive_credits); + if (new_credits <= 0) + return 0; return new_credits; } @@ -1177,10 +1180,7 @@ static int smbd_post_send_iter(struct smbd_connection *info, DMA_TO_DEVICE); mempool_free(request, sc->send_io.mem.pool); - /* roll back receive credits and credits to be offered */ - spin_lock(&info->lock_new_credits_offered); - info->new_credits_offered += new_credits; - spin_unlock(&info->lock_new_credits_offered); + /* roll back the granted receive credits */ atomic_sub(new_credits, &info->receive_credits); err_alloc: @@ -1866,8 +1866,6 @@ static struct smbd_connection *_smbd_get_connection( msecs_to_jiffies(sp->keepalive_interval_msec)); INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); - info->new_credits_offered = 0; - spin_lock_init(&info->lock_new_credits_offered); rc = smbd_negotiate(info); if (rc) { diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index d5ccb92fb618bb..a10f69fab57341 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -46,9 +46,7 @@ struct smbd_connection { struct smbdirect_socket socket; struct work_struct post_send_credits_work; - - spinlock_t lock_new_credits_offered; - int new_credits_offered; + atomic_t receive_posted; /* dynamic connection parameters defined in 
[MS-SMBD] 3.1.1.1 */ enum keep_alive_status keep_alive_requested; From c7316ec2d542e27ea717a0d7493902098329f908 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 16:32:30 +0200 Subject: [PATCH 3007/3206] smb: client: make use of smbdirect_socket.recv_io.{posted,credits} This will make it possible to introduce common helper functions in future. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 4 +-- fs/smb/client/smbdirect.c | 64 ++++++++++++++++++-------------------- fs/smb/client/smbdirect.h | 5 --- 3 files changed, 33 insertions(+), 40 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index a40846500de6b8..10bfcb57e19635 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -494,8 +494,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_printf(m, "\nCurrent Credits send_credits: %u " "receive_credits: %u receive_credit_target: %u", atomic_read(&sc->send_io.credits.count), - atomic_read(&server->smbd_conn->receive_credits), - server->smbd_conn->receive_credit_target); + atomic_read(&sc->recv_io.credits.count), + sc->recv_io.credits.target); seq_printf(m, "\nPending send_pending: %u ", atomic_read(&sc->send_io.pending.count)); seq_printf(m, "\nMR responder_resources: %u " diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 6332230ad4cab4..512f1cdd3e91b6 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -168,7 +168,7 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) * disable[_delayed]_work_sync() */ disable_work(&sc->disconnect_work); - disable_work(&info->post_send_credits_work); + disable_work(&sc->recv_io.posted.refill_work); disable_work(&info->mr_recovery_work); disable_delayed_work(&info->idle_timer_work); @@ -482,8 +482,8 @@ static bool process_negotiation_response( log_rdma_event(ERR, "error: credits_requested==0\n"); return false; } - info->receive_credit_target = le16_to_cpu(packet->credits_requested); - info->receive_credit_target = min_t(u16, info->receive_credit_target, sp->recv_credit_max); + sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested); + sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); if (packet->credits_granted == 0) { log_rdma_event(ERR, "error: credits_granted==0\n"); @@ -491,9 +491,6 @@ static bool process_negotiation_response( } atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted)); - atomic_set(&info->receive_credits, 0); - atomic_set(&info->receive_posted, 0); - if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) { log_rdma_event(ERR, "error: preferred_send_size=%d\n", le32_to_cpu(packet->preferred_send_size)); @@ -536,17 +533,17 @@ static void smbd_post_send_credits(struct work_struct *work) { int rc; struct smbdirect_recv_io *response; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); struct smbd_connection *info = - container_of(work, struct smbd_connection, - post_send_credits_work); - struct smbdirect_socket *sc = &info->socket; + container_of(sc, struct smbd_connection, socket); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { return; } - if (info->receive_credit_target > - atomic_read(&info->receive_credits)) { + if (sc->recv_io.credits.target > + 
atomic_read(&sc->recv_io.credits.count)) { while (true) { response = get_receive_buffer(info); if (!response) @@ -561,14 +558,14 @@ static void smbd_post_send_credits(struct work_struct *work) break; } - atomic_inc(&info->receive_posted); + atomic_inc(&sc->recv_io.posted.count); } } /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */ info->send_immediate = true; - if (atomic_read(&info->receive_credits) < - info->receive_credit_target - 1) { + if (atomic_read(&sc->recv_io.credits.count) < + sc->recv_io.credits.target - 1) { if (info->keep_alive_requested == KEEP_ALIVE_PENDING || info->send_immediate) { log_keep_alive(INFO, "send an empty message\n"); @@ -661,15 +658,15 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = true; } - atomic_dec(&info->receive_posted); - atomic_dec(&info->receive_credits); - old_recv_credit_target = info->receive_credit_target; - info->receive_credit_target = + atomic_dec(&sc->recv_io.posted.count); + atomic_dec(&sc->recv_io.credits.count); + old_recv_credit_target = sc->recv_io.credits.target; + sc->recv_io.credits.target = le16_to_cpu(data_transfer->credits_requested); - info->receive_credit_target = - min_t(u16, info->receive_credit_target, sp->recv_credit_max); - info->receive_credit_target = - max_t(u16, info->receive_credit_target, 1); + sc->recv_io.credits.target = + min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); + sc->recv_io.credits.target = + max_t(u16, sc->recv_io.credits.target, 1); if (le16_to_cpu(data_transfer->credits_granted)) { atomic_add(le16_to_cpu(data_transfer->credits_granted), &sc->send_io.credits.count); @@ -698,8 +695,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) * reassembly queue and wake up the reading thread */ if (data_length) { - if (info->receive_credit_target > old_recv_credit_target) - queue_work(info->workqueue, &info->post_send_credits_work); + if (sc->recv_io.credits.target > old_recv_credit_target) + queue_work(info->workqueue, &sc->recv_io.posted.refill_work); enqueue_reassembly(info, response, data_length); wake_up(&sc->recv_io.reassembly.wait_queue); @@ -960,16 +957,17 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) */ static int manage_credits_prior_sending(struct smbd_connection *info) { + struct smbdirect_socket *sc = &info->socket; int new_credits; - if (atomic_read(&info->receive_credits) >= info->receive_credit_target) + if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) return 0; - new_credits = atomic_read(&info->receive_posted); + new_credits = atomic_read(&sc->recv_io.posted.count); if (new_credits == 0) return 0; - new_credits -= atomic_read(&info->receive_credits); + new_credits -= atomic_read(&sc->recv_io.credits.count); if (new_credits <= 0) return 0; @@ -1123,7 +1121,7 @@ static int smbd_post_send_iter(struct smbd_connection *info, packet->credits_requested = cpu_to_le16(sp->send_credit_target); new_credits = manage_credits_prior_sending(info); - atomic_add(new_credits, &info->receive_credits); + atomic_add(new_credits, &sc->recv_io.credits.count); packet->credits_granted = cpu_to_le16(new_credits); info->send_immediate = false; @@ -1181,7 +1179,7 @@ static int smbd_post_send_iter(struct smbd_connection *info, mempool_free(request, sc->send_io.mem.pool); /* roll back the granted receive credits */ - atomic_sub(new_credits, &info->receive_credits); + atomic_sub(new_credits, &sc->recv_io.credits.count); err_alloc: if 
(atomic_dec_and_test(&sc->send_io.pending.count)) @@ -1414,7 +1412,7 @@ static void put_receive_buffer( info->count_put_receive_buffer++; spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); - queue_work(info->workqueue, &info->post_send_credits_work); + queue_work(info->workqueue, &sc->recv_io.posted.refill_work); } /* Preallocate all receive buffer on transport establishment */ @@ -1512,8 +1510,8 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } - log_rdma_event(INFO, "cancelling post_send_credits_work\n"); - disable_work_sync(&info->post_send_credits_work); + log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n"); + disable_work_sync(&sc->recv_io.posted.refill_work); log_rdma_event(INFO, "destroying qp\n"); ib_drain_qp(sc->ib.qp); @@ -1865,7 +1863,7 @@ static struct smbd_connection *_smbd_get_connection( queue_delayed_work(info->workqueue, &info->idle_timer_work, msecs_to_jiffies(sp->keepalive_interval_msec)); - INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); + INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits); rc = smbd_negotiate(info); if (rc) { diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index a10f69fab57341..fb681605ea375a 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -45,14 +45,9 @@ enum keep_alive_status { struct smbd_connection { struct smbdirect_socket socket; - struct work_struct post_send_credits_work; - atomic_t receive_posted; - /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */ enum keep_alive_status keep_alive_requested; int protocol; - atomic_t receive_credits; - u16 receive_credit_target; /* Memory registrations */ /* Maximum number of RDMA read/write outstanding on this connection */ From d3e743b514c20bfe26e340835cebb1f8301997ab Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 18:34:02 +0200 Subject: [PATCH 3008/3206] smb: client: remove useless smbd_connection.send_immediate We always set it to true immediately before the only if statement that checks it, so that check can never be false and the flag carries no information...
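For clarity, here is the dead-flag pattern being removed, condensed from the diff below (before vs. after; only the relevant statements are shown):

	/* Before: the flag is set unconditionally right before the only
	 * branch that reads it, so the OR-condition can never be false. */
	info->send_immediate = true;
	if (atomic_read(&sc->recv_io.credits.count) <
	    sc->recv_io.credits.target - 1) {
		if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
		    info->send_immediate)
			smbd_post_send_empty(info);
	}

	/* After: the flag and the inner test collapse away. */
	if (atomic_read(&sc->recv_io.credits.count) <
	    sc->recv_io.credits.target - 1)
		smbd_post_send_empty(info);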
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 10 ++-------- fs/smb/client/smbdirect.h | 2 -- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 512f1cdd3e91b6..21e4a10d39991c 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -563,14 +563,10 @@ static void smbd_post_send_credits(struct work_struct *work) } /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */ - info->send_immediate = true; if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target - 1) { - if (info->keep_alive_requested == KEEP_ALIVE_PENDING || - info->send_immediate) { - log_keep_alive(INFO, "send an empty message\n"); - smbd_post_send_empty(info); - } + log_keep_alive(INFO, "send an empty message\n"); + smbd_post_send_empty(info); } } @@ -1124,8 +1120,6 @@ static int smbd_post_send_iter(struct smbd_connection *info, atomic_add(new_credits, &sc->recv_io.credits.count); packet->credits_granted = cpu_to_le16(new_credits); - info->send_immediate = false; - packet->flags = 0; if (manage_keep_alive_before_sending(info)) packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index fb681605ea375a..2666f39ef26bdb 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -74,8 +74,6 @@ struct smbd_connection { /* Used by transport to wait until all MRs are returned */ wait_queue_head_t wait_for_mr_cleanup; - bool send_immediate; - struct workqueue_struct *workqueue; struct delayed_work idle_timer_work; From 14b6088dd97b5332c018a2e87c46be1e8cfdabc8 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 11:03:04 +0200 Subject: [PATCH 3009/3206] smb: client: fill smbdirect_socket_parameters at the beginning and use the values from there This is what we should do and it also simplifies the following changes. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 21e4a10d39991c..bed9d86ec85da0 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1718,37 +1718,37 @@ static struct smbd_connection *_smbd_get_connection( INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work); + sp->recv_credit_max = smbd_receive_credit_max; + sp->send_credit_target = smbd_send_credit_target; + sp->max_send_size = smbd_max_send_size; + sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; + sp->max_recv_size = smbd_max_receive_size; + sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; + rc = smbd_ia_open(info, dstaddr, port); if (rc) { log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); goto create_id_failed; } - if (smbd_send_credit_target > sc->ib.dev->attrs.max_cqe || - smbd_send_credit_target > sc->ib.dev->attrs.max_qp_wr) { + if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe || + sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) { log_rdma_event(ERR, "consider lowering send_credit_target = %d. 
Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", - smbd_send_credit_target, + sp->send_credit_target, sc->ib.dev->attrs.max_cqe, sc->ib.dev->attrs.max_qp_wr); goto config_failed; } - if (smbd_receive_credit_max > sc->ib.dev->attrs.max_cqe || - smbd_receive_credit_max > sc->ib.dev->attrs.max_qp_wr) { + if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe || + sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) { log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", - smbd_receive_credit_max, + sp->recv_credit_max, sc->ib.dev->attrs.max_cqe, sc->ib.dev->attrs.max_qp_wr); goto config_failed; } - sp->recv_credit_max = smbd_receive_credit_max; - sp->send_credit_target = smbd_send_credit_target; - sp->max_send_size = smbd_max_send_size; - sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; - sp->max_recv_size = smbd_max_receive_size; - sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; - if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE || sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) { log_rdma_event(ERR, From 1f2ff73a233005cb1f8e7bc02477603e2f9f4c69 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 19:39:22 +0200 Subject: [PATCH 3010/3206] smb: client: make use of smbdirect_socket_parameters.{resolve_{addr,route},rdma_connect,negotiate}_timeout_msec This will make future changes to these values much saner. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index bed9d86ec85da0..b7b54a36a46e57 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -57,6 +57,9 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, /* SMBD negotiation timeout in seconds */ #define SMBD_NEGOTIATE_TIMEOUT 120 +/* The timeout to wait for a keepalive message from peer in seconds */ +#define KEEPALIVE_RECV_TIMEOUT 5 + /* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */ #define SMBD_MIN_RECEIVE_SIZE 128 #define SMBD_MIN_FRAGMENTED_SIZE 131072 @@ -721,6 +724,7 @@ static struct rdma_cm_id *smbd_create_id( struct sockaddr *dstaddr, int port) { struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct rdma_cm_id *id; int rc; __be16 *sport; @@ -743,7 +747,7 @@ static struct rdma_cm_id *smbd_create_id( WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED); sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING; rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr, - RDMA_RESOLVE_TIMEOUT); + sp->resolve_addr_timeout_msec); if (rc) { log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc); goto out; @@ -751,7 +755,7 @@ static struct rdma_cm_id *smbd_create_id( rc = wait_event_interruptible_timeout( sc->status_wait, sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING, - msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + msecs_to_jiffies(sp->resolve_addr_timeout_msec)); /* e.g. 
if interrupted returns -ERESTARTSYS */ if (rc < 0) { log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); @@ -770,7 +774,7 @@ static struct rdma_cm_id *smbd_create_id( WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED); sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING; - rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); + rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec); if (rc) { log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); goto out; @@ -778,7 +782,7 @@ static struct rdma_cm_id *smbd_create_id( rc = wait_event_interruptible_timeout( sc->status_wait, sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING, - msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + msecs_to_jiffies(sp->resolve_route_timeout_msec)); /* e.g. if interrupted returns -ERESTARTSYS */ if (rc < 0) { log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); @@ -1266,6 +1270,7 @@ static int smbd_post_recv( static int smbd_negotiate(struct smbd_connection *info) { struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int rc; struct smbdirect_recv_io *response = get_receive_buffer(info); @@ -1289,7 +1294,7 @@ static int smbd_negotiate(struct smbd_connection *info) rc = wait_event_interruptible_timeout( sc->status_wait, sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING, - secs_to_jiffies(SMBD_NEGOTIATE_TIMEOUT)); + msecs_to_jiffies(sp->negotiate_timeout_msec)); log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc); if (sc->status == SMBDIRECT_SOCKET_CONNECTED) @@ -1718,12 +1723,17 @@ static struct smbd_connection *_smbd_get_connection( INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work); + sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT; + sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT; + sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT; + sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000; sp->recv_credit_max = smbd_receive_credit_max; sp->send_credit_target = smbd_send_credit_target; sp->max_send_size = smbd_max_send_size; sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; sp->max_recv_size = smbd_max_receive_size; sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; + sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000; rc = smbd_ia_open(info, dstaddr, port); if (rc) { @@ -1838,7 +1848,7 @@ static struct smbd_connection *_smbd_get_connection( wait_event_interruptible_timeout( sc->status_wait, sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING, - msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + msecs_to_jiffies(sp->rdma_connect_timeout_msec)); if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) { log_rdma_event(ERR, "rdma_connect failed port=%d\n", port); From dd53b45fc871cd9eb8ce90b215378c683aae783b Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 19 Aug 2025 23:18:21 +0200 Subject: [PATCH 3011/3206] smb: client: make use of smbdirect_socket_parameters.{initiator_depth,responder_resources} This will make it easier to specify these from the outside of the core code first and then negotiate the value with the peer. 
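The negotiation itself is just a clamp: start from the locally configured values and, once the peer advertises non-zero IRD/ORD values, take the minimum of both sides. A condensed sketch of that logic, with the names taken from the diff below:

	/* Local defaults, now carried in the shared parameters struct. */
	sp->initiator_depth = 1;
	sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;

	/* In the connection upcall: clamp against the peer's advertised
	 * values, but only when the peer reported non-zero ones. */
	if (peer_initiator_depth != 0)
		sp->initiator_depth =
			min_t(u8, sp->initiator_depth, peer_initiator_depth);
	if (peer_responder_resources != 0)
		sp->responder_resources =
			min_t(u8, sp->responder_resources, peer_responder_resources);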
Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 +- fs/smb/client/smbdirect.c | 29 +++++++++++++++-------------- fs/smb/client/smbdirect.h | 2 -- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 10bfcb57e19635..3086ab2622baac 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -500,7 +500,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) atomic_read(&sc->send_io.pending.count)); seq_printf(m, "\nMR responder_resources: %u " "max_frmr_depth: %u mr_type: 0x%x", - server->smbd_conn->responder_resources, + sp->responder_resources, server->smbd_conn->max_frmr_depth, server->smbd_conn->mr_type); seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u", diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index b7b54a36a46e57..a55163e2876b7a 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -222,6 +222,7 @@ static int smbd_conn_upcall( { struct smbd_connection *info = id->context; struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; const char *event_name = rdma_event_msg(event->event); u8 peer_initiator_depth; u8 peer_responder_resources; @@ -329,12 +330,12 @@ static int smbd_conn_upcall( * non 0 values. */ if (peer_initiator_depth != 0) - info->initiator_depth = - min_t(u8, info->initiator_depth, + sp->initiator_depth = + min_t(u8, sp->initiator_depth, peer_initiator_depth); if (peer_responder_resources != 0) - info->responder_resources = - min_t(u8, info->responder_resources, + sp->responder_resources = + min_t(u8, sp->responder_resources, peer_responder_resources); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); @@ -1718,15 +1719,14 @@ static struct smbd_connection *_smbd_get_connection( smbdirect_socket_init(sc); sp = &sc->parameters; - info->initiator_depth = 1; - info->responder_resources = SMBD_CM_RESPONDER_RESOURCES; - INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work); sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT; sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT; sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT; sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000; + sp->initiator_depth = 1; + sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES; sp->recv_credit_max = smbd_receive_credit_max; sp->send_credit_target = smbd_send_credit_target; sp->max_send_size = smbd_max_send_size; @@ -1807,15 +1807,15 @@ static struct smbd_connection *_smbd_get_connection( } sc->ib.qp = sc->rdma.cm_id->qp; - info->responder_resources = - min_t(u8, info->responder_resources, + sp->responder_resources = + min_t(u8, sp->responder_resources, sc->ib.dev->attrs.max_qp_rd_atom); log_rdma_mr(INFO, "responder_resources=%d\n", - info->responder_resources); + sp->responder_resources); memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = info->initiator_depth; - conn_param.responder_resources = info->responder_resources; + conn_param.initiator_depth = sp->initiator_depth; + conn_param.responder_resources = sp->responder_resources; /* Need to send IRD/ORD in private data for iWARP */ sc->ib.dev->ops.get_port_immutable( @@ -2270,6 +2270,7 @@ static void destroy_mr_list(struct smbd_connection *info) static int allocate_mr_list(struct smbd_connection *info) { 
struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int i; struct smbd_mr *smbdirect_mr, *tmp; @@ -2281,13 +2282,13 @@ static int allocate_mr_list(struct smbd_connection *info) init_waitqueue_head(&info->wait_for_mr_cleanup); INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); - if (info->responder_resources == 0) { + if (sp->responder_resources == 0) { log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); return -EINVAL; } /* Allocate more MRs (2x) than hardware responder_resources */ - for (i = 0; i < info->responder_resources * 2; i++) { + for (i = 0; i < sp->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); if (!smbdirect_mr) goto cleanup_entries; diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 2666f39ef26bdb..a5a70b3c63cb83 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -52,8 +52,6 @@ struct smbd_connection { /* Memory registrations */ /* Maximum number of RDMA read/write outstanding on this connection */ bool legacy_iwarp; - u8 initiator_depth; - u8 responder_resources; /* Maximum number of pages in a single RDMA write/read on this connection */ int max_frmr_depth; /* From ea5a4e31ab4757273dacaa7e05368892516b53d6 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 21 Aug 2025 12:22:46 +0200 Subject: [PATCH 3012/3206] smb: client: make use of smbdirect_socket.rdma.legacy_iwarp Currently it's write-only for the client, but it will likely be used for debugging later. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 2 +- fs/smb/client/smbdirect.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index a55163e2876b7a..b910d62d66c584 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -318,7 +318,7 @@ static int smbd_conn_upcall( ird32 = min_t(u32, ird32, U8_MAX); ord32 = min_t(u32, ord32, U8_MAX); - info->legacy_iwarp = true; + sc->rdma.legacy_iwarp = true; peer_initiator_depth = (u8)ird32; peer_responder_resources = (u8)ord32; } diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index a5a70b3c63cb83..1bb9b9412685bd 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -50,8 +50,6 @@ struct smbd_connection { int protocol; /* Memory registrations */ - /* Maximum number of RDMA read/write outstanding on this connection */ - bool legacy_iwarp; /* Maximum number of pages in a single RDMA write/read on this connection */ int max_frmr_depth; /* From 1b2c46cdb7c0127711cd72028072d85f6ebd1b53 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 12:53:35 +0200 Subject: [PATCH 3013/3206] smb: client: send empty packets via send_immediate_work This is what the server already does, and it makes refactoring toward common structures and functions much easier.
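The mechanism is an ordinary work item: instead of calling smbd_post_send_empty() inline, callers queue work that performs the send from the connection's workqueue. A condensed sketch, with names taken from the diff below:

	static void send_immediate_empty_message(struct work_struct *work)
	{
		struct smbd_connection *info =
			container_of(work, struct smbd_connection, send_immediate_work);

		/* Sending only makes sense on a live connection. */
		if (info->socket.status != SMBDIRECT_SOCKET_CONNECTED)
			return;

		log_keep_alive(INFO, "send an empty message\n");
		smbd_post_send_empty(info);
	}

	/* Producers (credit refill path, idle timer) then only do: */
	queue_work(info->workqueue, &info->send_immediate_work);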
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 25 +++++++++++++++++++++---- fs/smb/client/smbdirect.h | 1 + 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index b910d62d66c584..d53e39e91e1056 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -173,6 +173,7 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) disable_work(&sc->disconnect_work); disable_work(&sc->recv_io.posted.refill_work); disable_work(&info->mr_recovery_work); + disable_work(&info->send_immediate_work); disable_delayed_work(&info->idle_timer_work); switch (sc->status) { @@ -569,8 +570,8 @@ static void smbd_post_send_credits(struct work_struct *work) /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */ if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target - 1) { - log_keep_alive(INFO, "send an empty message\n"); - smbd_post_send_empty(info); + log_keep_alive(INFO, "schedule send of an empty message\n"); + queue_work(info->workqueue, &info->send_immediate_work); } } @@ -1455,6 +1456,19 @@ static void destroy_receive_buffers(struct smbd_connection *info) mempool_free(response, sc->recv_io.mem.pool); } +static void send_immediate_empty_message(struct work_struct *work) +{ + struct smbd_connection *info = + container_of(work, struct smbd_connection, send_immediate_work); + struct smbdirect_socket *sc = &info->socket; + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; + + log_keep_alive(INFO, "send an empty message\n"); + smbd_post_send_empty(info); +} + /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */ static void idle_connection_timer(struct work_struct *work) { @@ -1472,8 +1486,8 @@ static void idle_connection_timer(struct work_struct *work) return; } - log_keep_alive(INFO, "about to send an empty idle message\n"); - smbd_post_send_empty(info); + log_keep_alive(INFO, "schedule send of empty idle message\n"); + queue_work(info->workqueue, &info->send_immediate_work); /* Setup the next idle timeout work */ queue_delayed_work(info->workqueue, &info->idle_timer_work, @@ -1520,6 +1534,8 @@ void smbd_destroy(struct TCP_Server_Info *server) log_rdma_event(INFO, "cancelling idle timer\n"); disable_delayed_work_sync(&info->idle_timer_work); + log_rdma_event(INFO, "cancelling send immediate work\n"); + disable_work_sync(&info->send_immediate_work); /* It's not possible for upper layer to get to reassembly */ log_rdma_event(INFO, "drain the reassembly queue\n"); @@ -1863,6 +1879,7 @@ static struct smbd_connection *_smbd_get_connection( goto allocate_cache_failed; } + INIT_WORK(&info->send_immediate_work, send_immediate_empty_message); INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); queue_delayed_work(info->workqueue, &info->idle_timer_work, msecs_to_jiffies(sp->keepalive_interval_msec)); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 1bb9b9412685bd..07c7fba2f340b9 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -71,6 +71,7 @@ struct smbd_connection { wait_queue_head_t wait_for_mr_cleanup; struct workqueue_struct *workqueue; + struct work_struct send_immediate_work; struct delayed_work idle_timer_work; /* for debug purposes */ From ac31755c7a649786a9e11c6288513f5f4c1d30c9 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 
Aug 2025 19:41:12 +0200 Subject: [PATCH 3014/3206] smb: client: fix smbdirect keep alive handling to match the documentation We set up the first timer with the negotiate timeout and set KEEP_ALIVE_PENDING, so that the expired timer disconnects. On every incoming message we need to reset the timer to the keepalive interval (120s). On SMBDIRECT_FLAG_RESPONSE_REQUESTED we need to schedule a response instead of setting KEEP_ALIVE_PENDING. Doing both would mean we would also set SMBDIRECT_FLAG_RESPONSE_REQUESTED in that response; if both ends did that, we'd play ping-pong in a busy loop. If we move to KEEP_ALIVE_SENT and send the keepalive request with SMBDIRECT_FLAG_RESPONSE_REQUESTED, we need to set up the timer with the keepalive timeout (5s) so that we disconnect if no incoming message resets the timer. The fired timer sets KEEP_ALIVE_PENDING and also sets up the timer with the keepalive timeout (5s), again in order to disconnect if no incoming message resets the timer. We do that before queueing the send_immediate_work, so the timer is armed even if we never reach the send code that normally sets it to the keepalive timeout. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 52 ++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index d53e39e91e1056..5863dc7e021a5c 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -610,6 +610,14 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) response->sge.length, DMA_FROM_DEVICE); + /* + * Reset timer to the keepalive interval in + * order to trigger our next keepalive message.
+ */ + info->keep_alive_requested = KEEP_ALIVE_NONE; + mod_delayed_work(info->workqueue, &info->idle_timer_work, + msecs_to_jiffies(sp->keepalive_interval_msec)); + switch (sc->recv_io.expected) { /* SMBD negotiation response */ case SMBDIRECT_EXPECT_NEGOTIATE_REP: @@ -684,11 +692,11 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) le32_to_cpu(data_transfer->data_length), le32_to_cpu(data_transfer->remaining_data_length)); - /* Send a KEEP_ALIVE response right away if requested */ - info->keep_alive_requested = KEEP_ALIVE_NONE; + /* Send an immediate response right away if requested */ if (le16_to_cpu(data_transfer->flags) & SMBDIRECT_FLAG_RESPONSE_REQUESTED) { - info->keep_alive_requested = KEEP_ALIVE_PENDING; + log_keep_alive(INFO, "schedule send of immediate response\n"); + queue_work(info->workqueue, &info->send_immediate_work); } /* @@ -987,8 +995,17 @@ static int manage_credits_prior_sending(struct smbd_connection *info) */ static int manage_keep_alive_before_sending(struct smbd_connection *info) { + struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; + if (info->keep_alive_requested == KEEP_ALIVE_PENDING) { info->keep_alive_requested = KEEP_ALIVE_SENT; + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + mod_delayed_work(info->workqueue, &info->idle_timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); return 1; } return 0; @@ -999,7 +1016,6 @@ static int smbd_post_send(struct smbd_connection *info, struct smbdirect_send_io *request) { struct smbdirect_socket *sc = &info->socket; - struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_send_wr send_wr; int rc, i; @@ -1028,10 +1044,7 @@ static int smbd_post_send(struct smbd_connection *info, log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); smbd_disconnect_rdma_connection(info); rc = -EAGAIN; - } else - /* Reset timer for idle connection after packet is sent */ - mod_delayed_work(info->workqueue, &info->idle_timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); + } return rc; } @@ -1486,12 +1499,18 @@ static void idle_connection_timer(struct work_struct *work) return; } + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; + + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + info->keep_alive_requested = KEEP_ALIVE_PENDING; + mod_delayed_work(info->workqueue, &info->idle_timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); log_keep_alive(INFO, "schedule send of empty idle message\n"); queue_work(info->workqueue, &info->send_immediate_work); - - /* Setup the next idle timeout work */ - queue_delayed_work(info->workqueue, &info->idle_timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); } /* @@ -1881,8 +1900,13 @@ static struct smbd_connection *_smbd_get_connection( INIT_WORK(&info->send_immediate_work, send_immediate_empty_message); INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); - queue_delayed_work(info->workqueue, &info->idle_timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); + /* + * start with the negotiate timeout and KEEP_ALIVE_PENDING + * so that the timer will cause a disconnect. 
+ */ + info->keep_alive_requested = KEEP_ALIVE_PENDING; + mod_delayed_work(info->workqueue, &info->idle_timer_work, + msecs_to_jiffies(sp->negotiate_timeout_msec)); INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits); From 4dc536a135e0ef862ccc25f5d68ad5c062dc01a0 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 17:57:32 +0200 Subject: [PATCH 3015/3206] smb: client: make use of smbdirect_socket.idle.{keepalive,immediate_work,timer_work} This will allow client and server to use the common structures in order to share common functions later. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 59 ++++++++++++++++++++------------------- fs/smb/client/smbdirect.h | 9 ------ 2 files changed, 30 insertions(+), 38 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 5863dc7e021a5c..efebc4aa297fa9 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -173,8 +173,8 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) disable_work(&sc->disconnect_work); disable_work(&sc->recv_io.posted.refill_work); disable_work(&info->mr_recovery_work); - disable_work(&info->send_immediate_work); - disable_delayed_work(&info->idle_timer_work); + disable_work(&sc->idle.immediate_work); + disable_delayed_work(&sc->idle.timer_work); switch (sc->status) { case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: @@ -571,7 +571,7 @@ static void smbd_post_send_credits(struct work_struct *work) if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target - 1) { log_keep_alive(INFO, "schedule send of an empty message\n"); - queue_work(info->workqueue, &info->send_immediate_work); + queue_work(info->workqueue, &sc->idle.immediate_work); } } @@ -614,8 +614,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) * Reset timer to the keepalive interval in * order to trigger our next keepalive message. 
*/ - info->keep_alive_requested = KEEP_ALIVE_NONE; - mod_delayed_work(info->workqueue, &info->idle_timer_work, + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; + mod_delayed_work(info->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->keepalive_interval_msec)); switch (sc->recv_io.expected) { @@ -696,7 +696,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if (le16_to_cpu(data_transfer->flags) & SMBDIRECT_FLAG_RESPONSE_REQUESTED) { log_keep_alive(INFO, "schedule send of immediate response\n"); - queue_work(info->workqueue, &info->send_immediate_work); + queue_work(info->workqueue, &sc->idle.immediate_work); } /* @@ -998,13 +998,13 @@ static int manage_keep_alive_before_sending(struct smbd_connection *info) struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; - if (info->keep_alive_requested == KEEP_ALIVE_PENDING) { - info->keep_alive_requested = KEEP_ALIVE_SENT; + if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; /* * Now use the keepalive timeout (instead of keepalive interval) * in order to wait for a response */ - mod_delayed_work(info->workqueue, &info->idle_timer_work, + mod_delayed_work(info->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->keepalive_timeout_msec)); return 1; } @@ -1471,9 +1471,10 @@ static void destroy_receive_buffers(struct smbd_connection *info) static void send_immediate_empty_message(struct work_struct *work) { + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.immediate_work); struct smbd_connection *info = - container_of(work, struct smbd_connection, send_immediate_work); - struct smbdirect_socket *sc = &info->socket; + container_of(sc, struct smbd_connection, socket); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return; @@ -1485,16 +1486,16 @@ static void send_immediate_empty_message(struct work_struct *work) /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */ static void idle_connection_timer(struct work_struct *work) { - struct smbd_connection *info = container_of( - work, struct smbd_connection, - idle_timer_work.work); - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.timer_work.work); struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbd_connection *info = + container_of(sc, struct smbd_connection, socket); - if (info->keep_alive_requested != KEEP_ALIVE_NONE) { + if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { log_keep_alive(ERR, - "error status info->keep_alive_requested=%d\n", - info->keep_alive_requested); + "error status sc->idle.keepalive=%d\n", + sc->idle.keepalive); smbd_disconnect_rdma_connection(info); return; } @@ -1506,11 +1507,11 @@ static void idle_connection_timer(struct work_struct *work) * Now use the keepalive timeout (instead of keepalive interval) * in order to wait for a response */ - info->keep_alive_requested = KEEP_ALIVE_PENDING; - mod_delayed_work(info->workqueue, &info->idle_timer_work, + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(info->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->keepalive_timeout_msec)); log_keep_alive(INFO, "schedule send of empty idle message\n"); - queue_work(info->workqueue, &info->send_immediate_work); + queue_work(info->workqueue, &sc->idle.immediate_work); } /* @@ -1552,9 +1553,9 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->ib.qp = NULL; log_rdma_event(INFO, "cancelling idle timer\n"); - 
disable_delayed_work_sync(&info->idle_timer_work); + disable_delayed_work_sync(&sc->idle.timer_work); log_rdma_event(INFO, "cancelling send immediate work\n"); - disable_work_sync(&info->send_immediate_work); + disable_work_sync(&sc->idle.immediate_work); /* It's not possible for upper layer to get to reassembly */ log_rdma_event(INFO, "drain the reassembly queue\n"); @@ -1898,14 +1899,14 @@ static struct smbd_connection *_smbd_get_connection( goto allocate_cache_failed; } - INIT_WORK(&info->send_immediate_work, send_immediate_empty_message); - INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); + INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message); + INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer); /* - * start with the negotiate timeout and KEEP_ALIVE_PENDING + * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING * so that the timer will cause a disconnect. */ - info->keep_alive_requested = KEEP_ALIVE_PENDING; - mod_delayed_work(info->workqueue, &info->idle_timer_work, + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(info->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->negotiate_timeout_msec)); INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits); @@ -1931,7 +1932,7 @@ static struct smbd_connection *_smbd_get_connection( return NULL; negotiation_failed: - disable_delayed_work_sync(&info->idle_timer_work); + disable_delayed_work_sync(&sc->idle.timer_work); destroy_caches_and_workqueue(info); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; rdma_disconnect(sc->rdma.cm_id); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 07c7fba2f340b9..5f9ba769623b03 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -27,12 +27,6 @@ extern int smbd_max_send_size; extern int smbd_send_credit_target; extern int smbd_receive_credit_max; -enum keep_alive_status { - KEEP_ALIVE_NONE, - KEEP_ALIVE_PENDING, - KEEP_ALIVE_SENT, -}; - /* * The context for the SMBDirect transport * Everything related to the transport is here. It has several logical parts @@ -46,7 +40,6 @@ struct smbd_connection { struct smbdirect_socket socket; /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */ - enum keep_alive_status keep_alive_requested; int protocol; /* Memory registrations */ @@ -71,8 +64,6 @@ struct smbd_connection { wait_queue_head_t wait_for_mr_cleanup; struct workqueue_struct *workqueue; - struct work_struct send_immediate_work; - struct delayed_work idle_timer_work; /* for debug purposes */ unsigned int count_get_receive_buffer; From 87d03d6c829aea8c6cf632ac24ee5227c3b8b00b Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 18 Aug 2025 10:42:42 +0200 Subject: [PATCH 3016/3206] smb: client: remove unused smbd_connection->protocol There is only one smbdirect protocol version so far, and this variable is write-only.
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 3 ++- fs/smb/client/smbdirect.c | 1 - fs/smb/client/smbdirect.h | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 3086ab2622baac..e407bedd4aa761 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -24,6 +24,7 @@ #endif #ifdef CONFIG_CIFS_SMB_DIRECT #include "smbdirect.h" +#include "../common/smbdirect/smbdirect_pdu.h" #endif #include "cifs_swn.h" #include "cached_dir.h" @@ -458,7 +459,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_printf(m, "\nSMBDirect protocol version: 0x%x " "transport status: %s (%u)", - server->smbd_conn->protocol, + SMBDIRECT_V1, smbdirect_socket_status_string(sc->status), sc->status); seq_printf(m, "\nConn receive_credit_max: %u " diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index efebc4aa297fa9..6c1bec769e88fd 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -481,7 +481,6 @@ static bool process_negotiation_response( le16_to_cpu(packet->negotiated_version)); return false; } - info->protocol = le16_to_cpu(packet->negotiated_version); if (packet->credits_requested == 0) { log_rdma_event(ERR, "error: credits_requested==0\n"); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 5f9ba769623b03..90238a917256da 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -39,8 +39,6 @@ extern int smbd_receive_credit_max; struct smbd_connection { struct smbdirect_socket socket; - /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */ - int protocol; /* Memory registrations */ /* Maximum number of pages in a single RDMA write/read on this connection */ From 2449c7cc9bc26f8c80560f376f26c6f50b346c0f Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 19 Aug 2025 12:03:59 +0200 Subject: [PATCH 3017/3206] smb: client: remove unused smbd_connection.count_reassembly_queue This basically represents the same information as sc->recv_io.reassembly.queue_length. The only thing that's different is that smbd_connection.count_reassembly_queue was updated in each loop step, while sc->recv_io.reassembly.queue_length is only updated once after the loop in smbd_recv. Also sc->recv_io.reassembly.queue_length is updated under a spinlock. 
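In other words, only the per-step increments and decrements go away; the surviving counter is maintained in bulk under the reassembly lock. A rough sketch of the difference, assuming a dequeue loop like the one in smbd_recv (illustrative only, not the exact code):

	/* removed: bumped on every enqueue/dequeue step, without a lock */
	info->count_reassembly_queue++;		/* or -- on dequeue */

	/* kept: updated once after the loop, under the lock */
	spin_lock(&sc->recv_io.reassembly.lock);
	sc->recv_io.reassembly.queue_length -= queue_removed;
	spin_unlock(&sc->recv_io.reassembly.lock);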
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 3 +-- fs/smb/client/smbdirect.c | 2 -- fs/smb/client/smbdirect.h | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index e407bedd4aa761..7cf3b045d3fbb0 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -482,12 +482,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) server->smbd_conn->count_get_receive_buffer, server->smbd_conn->count_put_receive_buffer, server->smbd_conn->count_send_empty); - seq_printf(m, "\nRead Queue count_reassembly_queue: %u " + seq_printf(m, "\nRead Queue " "count_enqueue_reassembly_queue: %u " "count_dequeue_reassembly_queue: %u " "reassembly_data_length: %u " "reassembly_queue_length: %u", - server->smbd_conn->count_reassembly_queue, server->smbd_conn->count_enqueue_reassembly_queue, server->smbd_conn->count_dequeue_reassembly_queue, sc->recv_io.reassembly.data_length, sc->recv_io.reassembly.queue_length); diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 6c1bec769e88fd..40a423de0e8785 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1353,7 +1353,6 @@ static void enqueue_reassembly( virt_wmb(); sc->recv_io.reassembly.data_length += data_length; spin_unlock(&sc->recv_io.reassembly.lock); - info->count_reassembly_queue++; info->count_enqueue_reassembly_queue++; } @@ -2079,7 +2078,6 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) &sc->recv_io.reassembly.lock); } queue_removed++; - info->count_reassembly_queue--; info->count_dequeue_reassembly_queue++; put_receive_buffer(info, response); offset = 0; diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 90238a917256da..2b13fa92f2d926 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -66,7 +66,6 @@ struct smbd_connection { /* for debug purposes */ unsigned int count_get_receive_buffer; unsigned int count_put_receive_buffer; - unsigned int count_reassembly_queue; unsigned int count_enqueue_reassembly_queue; unsigned int count_dequeue_reassembly_queue; unsigned int count_send_empty; From ddfcb069c1dd1c909c34605e025e0aeae1b496e3 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 19 Aug 2025 13:54:35 +0200 Subject: [PATCH 3018/3206] smb: client: make use of smbdirect_socket.statistics This will allow us to use common functions soon. Note this generates the following warnings from scripts/checkpatch.pl --quiet: WARNING: quoted string split across lines #59: FILE: fs/smb/client/cifs_debug.c:481: + seq_printf(m, "\nDebug count_get_receive_buffer: %llu " + "count_put_receive_buffer: %llu count_send_empty: %llu", WARNING: quoted string split across lines #66: FILE: fs/smb/client/cifs_debug.c:486: seq_printf(m, "\nRead Queue " + "count_enqueue_reassembly_queue: %llu " WARNING: quoted string split across lines #67: FILE: fs/smb/client/cifs_debug.c:487: + "count_enqueue_reassembly_queue: %llu " + "count_dequeue_reassembly_queue: %llu " total: 0 errors, 3 warnings, 83 lines checked scripts/checkpatch.pl: FAILED But I left them in there, because they match the code around them...
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 18 +++++++++--------- fs/smb/client/smbdirect.c | 11 ++++++----- fs/smb/client/smbdirect.h | 7 ------- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 7cf3b045d3fbb0..55bf867be0e2bb 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -477,18 +477,18 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) sp->keepalive_interval_msec * 1000, sp->max_read_write_size, server->smbd_conn->rdma_readwrite_threshold); - seq_printf(m, "\nDebug count_get_receive_buffer: %u " - "count_put_receive_buffer: %u count_send_empty: %u", - server->smbd_conn->count_get_receive_buffer, - server->smbd_conn->count_put_receive_buffer, - server->smbd_conn->count_send_empty); + seq_printf(m, "\nDebug count_get_receive_buffer: %llu " + "count_put_receive_buffer: %llu count_send_empty: %llu", + sc->statistics.get_receive_buffer, + sc->statistics.put_receive_buffer, + sc->statistics.send_empty); seq_printf(m, "\nRead Queue " - "count_enqueue_reassembly_queue: %u " - "count_dequeue_reassembly_queue: %u " + "count_enqueue_reassembly_queue: %llu " + "count_dequeue_reassembly_queue: %llu " "reassembly_data_length: %u " "reassembly_queue_length: %u", - server->smbd_conn->count_enqueue_reassembly_queue, - server->smbd_conn->count_dequeue_reassembly_queue, + sc->statistics.enqueue_reassembly_queue, + sc->statistics.dequeue_reassembly_queue, sc->recv_io.reassembly.data_length, sc->recv_io.reassembly.queue_length); seq_printf(m, "\nCurrent Credits send_credits: %u " diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 40a423de0e8785..fe4127b53013a1 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1212,9 +1212,10 @@ static int smbd_post_send_iter(struct smbd_connection *info, */ static int smbd_post_send_empty(struct smbd_connection *info) { + struct smbdirect_socket *sc = &info->socket; int remaining_data_length = 0; - info->count_send_empty++; + sc->statistics.send_empty++; return smbd_post_send_iter(info, NULL, &remaining_data_length); } @@ -1353,7 +1354,7 @@ static void enqueue_reassembly( virt_wmb(); sc->recv_io.reassembly.data_length += data_length; spin_unlock(&sc->recv_io.reassembly.lock); - info->count_enqueue_reassembly_queue++; + sc->statistics.enqueue_reassembly_queue++; } /* @@ -1392,7 +1393,7 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info &sc->recv_io.free.list, struct smbdirect_recv_io, list); list_del(&ret->list); - info->count_get_receive_buffer++; + sc->statistics.get_receive_buffer++; } spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); @@ -1421,7 +1422,7 @@ static void put_receive_buffer( spin_lock_irqsave(&sc->recv_io.free.lock, flags); list_add_tail(&response->list, &sc->recv_io.free.list); - info->count_put_receive_buffer++; + sc->statistics.put_receive_buffer++; spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); queue_work(info->workqueue, &sc->recv_io.posted.refill_work); @@ -2078,7 +2079,7 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) &sc->recv_io.reassembly.lock); } queue_removed++; - info->count_dequeue_reassembly_queue++; + sc->statistics.dequeue_reassembly_queue++; put_receive_buffer(info, response); offset = 0; log_read(INFO, "put_receive_buffer 
offset=0\n"); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 2b13fa92f2d926..8ebbbc0b0499aa 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -62,13 +62,6 @@ struct smbd_connection { wait_queue_head_t wait_for_mr_cleanup; struct workqueue_struct *workqueue; - - /* for debug purposes */ - unsigned int count_get_receive_buffer; - unsigned int count_put_receive_buffer; - unsigned int count_enqueue_reassembly_queue; - unsigned int count_dequeue_reassembly_queue; - unsigned int count_send_empty; }; /* Create a SMBDirect session */ From a8efb796db30288fab498156ef1fa03a34d88817 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 21 Aug 2025 14:50:56 +0200 Subject: [PATCH 3019/3206] smb: client: move rdma_readwrite_threshold from smbd_connection to TCP_Server_Info This belongs to the SMB layer, not the transport layer; it just uses the negotiated transport parameters to adjust the value if needed. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 +- fs/smb/client/cifsglob.h | 7 +++++++ fs/smb/client/smb2pdu.c | 2 +- fs/smb/client/smbdirect.c | 15 +++++++++++---- fs/smb/client/smbdirect.h | 7 ------- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 55bf867be0e2bb..9c9f15871c2395 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -476,7 +476,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) "max_readwrite_size: %u rdma_readwrite_threshold: %u", sp->keepalive_interval_msec * 1000, sp->max_read_write_size, - server->smbd_conn->rdma_readwrite_threshold); + server->rdma_readwrite_threshold); seq_printf(m, "\nDebug count_get_receive_buffer: %llu " "count_put_receive_buffer: %llu count_send_empty: %llu", sc->statistics.get_receive_buffer, diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 0fae95cf81c434..80664f937fc737 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -814,6 +814,13 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; unsigned int min_offload; + /* + * If payload is less than or equal to the threshold, + * use RDMA send/recv to send upper layer I/O. + * If payload is more than the threshold, + * use RDMA read/write through memory registration for I/O. + */ + unsigned int rdma_readwrite_threshold; unsigned int retrans; struct { bool requested; /* "compress" mount option set*/ diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c3b9d3f6210ff9..1c63d2c9cc9c82 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4411,7 +4411,7 @@ static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms) return false; /* offload also has its overhead, so only do it if desired */ - if (io_parms->length < server->smbd_conn->rdma_readwrite_threshold) + if (io_parms->length < server->rdma_readwrite_threshold) return false; return true; diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index fe4127b53013a1..09bf47c854a8ad 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -518,10 +518,6 @@ static bool process_negotiation_response( } sp->max_fragmented_send_size = le32_to_cpu(packet->max_fragmented_size); - info->rdma_readwrite_threshold = - rdma_readwrite_threshold > sp->max_fragmented_send_size ?
- sp->max_fragmented_send_size : - rdma_readwrite_threshold; sp->max_read_write_size = min_t(u32, @@ -1962,6 +1958,7 @@ struct smbd_connection *smbd_get_connection( struct TCP_Server_Info *server, struct sockaddr *dstaddr) { struct smbd_connection *ret; + const struct smbdirect_socket_parameters *sp; int port = SMBD_PORT; try_again: @@ -1972,6 +1969,16 @@ struct smbd_connection *smbd_get_connection( port = SMB_PORT; goto try_again; } + if (!ret) + return NULL; + + sp = &ret->socket.parameters; + + server->rdma_readwrite_threshold = + rdma_readwrite_threshold > sp->max_fragmented_send_size ? + sp->max_fragmented_send_size : + rdma_readwrite_threshold; + return ret; } diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 8ebbbc0b0499aa..4eec2ac4ba80f6 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -43,13 +43,6 @@ struct smbd_connection { /* Memory registrations */ /* Maximum number of pages in a single RDMA write/read on this connection */ int max_frmr_depth; - /* - * If payload is less than or equal to the threshold, - * use RDMA send/recv to send upper layer I/O. - * If payload is more than the threshold, - * use RDMA read/write through memory registration for I/O. - */ - int rdma_readwrite_threshold; enum ib_mr_type mr_type; struct list_head mr_list; spinlock_t mr_list_lock; From 40212a27c75ff22457d958097a27a3d8815b0a70 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 12:29:03 +0200 Subject: [PATCH 3020/3206] smb: client: make use of smbdirect_socket.workqueue This will simplify the move to common code... Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 32 ++++++++++++++++---------------- fs/smb/client/smbdirect.h | 2 -- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 09bf47c854a8ad..9ea0383a3ae727 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -214,7 +214,7 @@ static void smbd_disconnect_rdma_connection(struct smbd_connection *info) { struct smbdirect_socket *sc = &info->socket; - queue_work(info->workqueue, &sc->disconnect_work); + queue_work(sc->workqueue, &sc->disconnect_work); } /* Upcall from RDMA CM */ @@ -566,7 +566,7 @@ static void smbd_post_send_credits(struct work_struct *work) if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target - 1) { log_keep_alive(INFO, "schedule send of an empty message\n"); - queue_work(info->workqueue, &sc->idle.immediate_work); + queue_work(sc->workqueue, &sc->idle.immediate_work); } } @@ -610,7 +610,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) * order to trigger our next keepalive message. 
*/ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; - mod_delayed_work(info->workqueue, &sc->idle.timer_work, + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->keepalive_interval_msec)); switch (sc->recv_io.expected) { @@ -691,7 +691,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if (le16_to_cpu(data_transfer->flags) & SMBDIRECT_FLAG_RESPONSE_REQUESTED) { log_keep_alive(INFO, "schedule send of immediate response\n"); - queue_work(info->workqueue, &sc->idle.immediate_work); + queue_work(sc->workqueue, &sc->idle.immediate_work); } /* @@ -700,7 +700,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) */ if (data_length) { if (sc->recv_io.credits.target > old_recv_credit_target) - queue_work(info->workqueue, &sc->recv_io.posted.refill_work); + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); enqueue_reassembly(info, response, data_length); wake_up(&sc->recv_io.reassembly.wait_queue); @@ -999,7 +999,7 @@ static int manage_keep_alive_before_sending(struct smbd_connection *info) * Now use the keepalive timeout (instead of keepalive interval) * in order to wait for a response */ - mod_delayed_work(info->workqueue, &sc->idle.timer_work, + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->keepalive_timeout_msec)); return 1; } @@ -1421,7 +1421,7 @@ static void put_receive_buffer( sc->statistics.put_receive_buffer++; spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); - queue_work(info->workqueue, &sc->recv_io.posted.refill_work); + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); } /* Preallocate all receive buffer on transport establishment */ @@ -1503,10 +1503,10 @@ static void idle_connection_timer(struct work_struct *work) * in order to wait for a response */ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; - mod_delayed_work(info->workqueue, &sc->idle.timer_work, + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->keepalive_timeout_msec)); log_keep_alive(INFO, "schedule send of empty idle message\n"); - queue_work(info->workqueue, &sc->idle.immediate_work); + queue_work(sc->workqueue, &sc->idle.immediate_work); } /* @@ -1601,7 +1601,7 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->status = SMBDIRECT_SOCKET_DESTROYED; - destroy_workqueue(info->workqueue); + destroy_workqueue(sc->workqueue); log_rdma_event(INFO, "rdma session destroyed\n"); kfree(info); server->smbd_conn = NULL; @@ -1648,7 +1648,7 @@ static void destroy_caches_and_workqueue(struct smbd_connection *info) struct smbdirect_socket *sc = &info->socket; destroy_receive_buffers(info); - destroy_workqueue(info->workqueue); + destroy_workqueue(sc->workqueue); mempool_destroy(sc->recv_io.mem.pool); kmem_cache_destroy(sc->recv_io.mem.cache); mempool_destroy(sc->send_io.mem.pool); @@ -1704,8 +1704,8 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) goto out3; scnprintf(name, MAX_NAME_LEN, "smbd_%p", info); - info->workqueue = create_workqueue(name); - if (!info->workqueue) + sc->workqueue = create_workqueue(name); + if (!sc->workqueue) goto out4; rc = allocate_receive_buffers(info, sp->recv_credit_max); @@ -1717,7 +1717,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) return 0; out5: - destroy_workqueue(info->workqueue); + destroy_workqueue(sc->workqueue); out4: mempool_destroy(sc->recv_io.mem.pool); out3: @@ -1901,7 +1901,7 @@ static struct smbd_connection *_smbd_get_connection( * so that the timer will cause a disconnect. 
*/ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; - mod_delayed_work(info->workqueue, &sc->idle.timer_work, + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->negotiate_timeout_msec)); INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits); @@ -2605,7 +2605,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) * Schedule the work to do MR recovery for future I/Os MR * recovery is slow and don't want it to block current I/O */ - queue_work(info->workqueue, &info->mr_recovery_work); + queue_work(sc->workqueue, &info->mr_recovery_work); done: if (atomic_dec_and_test(&info->mr_used_count)) diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 4eec2ac4ba80f6..455618e676f51a 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -53,8 +53,6 @@ struct smbd_connection { struct work_struct mr_recovery_work; /* Used by transport to wait until all MRs are returned */ wait_queue_head_t wait_for_mr_cleanup; - - struct workqueue_struct *workqueue; }; /* Create a SMBDirect session */ From 5f84b0819a7e678d1004df9aea05d8484ca68ce8 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 00:23:53 +0200 Subject: [PATCH 3021/3206] smb: client: add and use smbd_get_parameters() In the future, struct smbdirect_socket_parameters will be the only public structure for the smb layer. This prepares for that. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 8 ++++---- fs/smb/client/smbdirect.c | 7 +++++++ fs/smb/client/smbdirect.h | 2 ++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index e586f3f4b5c937..4711a23c5b380d 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -504,8 +504,8 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { - struct smbdirect_socket_parameters *sp = - &server->smbd_conn->socket.parameters; + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); if (server->sign) /* @@ -555,8 +555,8 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { - struct smbdirect_socket_parameters *sp = - &server->smbd_conn->socket.parameters; + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); if (server->sign) /* diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 9ea0383a3ae727..3aa13771037587 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -13,6 +13,13 @@ #include "cifsproto.h" #include "smb2proto.h" +const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn) +{ + struct smbdirect_socket *sc = &conn->socket; + + return &sc->parameters; +} + static struct smbdirect_recv_io *get_receive_buffer( struct smbd_connection *info); static void put_receive_buffer( diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 455618e676f51a..7773939db5f2c0 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -59,6 +59,8 @@ struct smbd_connection { struct smbd_connection *smbd_get_connection( struct TCP_Server_Info *server, struct sockaddr
*dstaddr); +const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn); + /* Reconnect SMBDirect session */ int smbd_reconnect(struct TCP_Server_Info *server); /* Destroy SMBDirect session */ From f454f36cd0b9ee3b7b6c7181238ef20f1326063a Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 21 Aug 2025 23:36:33 +0200 Subject: [PATCH 3022/3206] smb: client: make use of struct smbdirect_mr_io This will allow us to move to common functions in future too. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 2 +- fs/smb/client/smbdirect.c | 66 ++++++++++++++++++++------------------- fs/smb/client/smbdirect.h | 27 ++-------------- 3 files changed, 37 insertions(+), 58 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 80664f937fc737..3ac254e123dcac 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1547,7 +1547,7 @@ struct cifs_io_subrequest { struct kvec iov[2]; struct TCP_Server_Info *server; #ifdef CONFIG_CIFS_SMB_DIRECT - struct smbd_mr *mr; + struct smbdirect_mr_io *mr; #endif struct cifs_credits credits; }; diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 3aa13771037587..138d8837a12315 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -2229,14 +2229,15 @@ int smbd_send(struct TCP_Server_Info *server, static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smbd_mr *mr; - struct ib_cqe *cqe; + struct smbdirect_mr_io *mr = + container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe); + struct smbdirect_socket *sc = mr->socket; + struct smbd_connection *info = + container_of(sc, struct smbd_connection, socket); if (wc->status) { log_rdma_mr(ERR, "status=%d\n", wc->status); - cqe = wc->wr_cqe; - mr = container_of(cqe, struct smbd_mr, cqe); - smbd_disconnect_rdma_connection(mr->conn); + smbd_disconnect_rdma_connection(info); } } @@ -2254,11 +2255,11 @@ static void smbd_mr_recovery_work(struct work_struct *work) struct smbd_connection *info = container_of(work, struct smbd_connection, mr_recovery_work); struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *smbdirect_mr; + struct smbdirect_mr_io *smbdirect_mr; int rc; list_for_each_entry(smbdirect_mr, &info->mr_list, list) { - if (smbdirect_mr->state == MR_ERROR) { + if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) { /* recover this MR entry */ rc = ib_dereg_mr(smbdirect_mr->mr); @@ -2284,7 +2285,7 @@ static void smbd_mr_recovery_work(struct work_struct *work) /* This MR is being used, don't recover it */ continue; - smbdirect_mr->state = MR_READY; + smbdirect_mr->state = SMBDIRECT_MR_READY; /* smbdirect_mr->state is updated by this function * and is read and updated by I/O issuing CPUs trying @@ -2301,11 +2302,11 @@ static void smbd_mr_recovery_work(struct work_struct *work) static void destroy_mr_list(struct smbd_connection *info) { struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *mr, *tmp; + struct smbdirect_mr_io *mr, *tmp; disable_work_sync(&info->mr_recovery_work); list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { - if (mr->state == MR_INVALIDATED) + if (mr->state == SMBDIRECT_MR_INVALIDATED) ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); ib_dereg_mr(mr->mr); @@ -2326,7 +2327,7 @@ static int allocate_mr_list(struct smbd_connection *info) struct smbdirect_socket *sc = 
&info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; int i; - struct smbd_mr *smbdirect_mr, *tmp; + struct smbdirect_mr_io *smbdirect_mr, *tmp; INIT_LIST_HEAD(&info->mr_list); init_waitqueue_head(&info->wait_mr); @@ -2361,8 +2362,8 @@ static int allocate_mr_list(struct smbd_connection *info) ib_dereg_mr(smbdirect_mr->mr); goto out; } - smbdirect_mr->state = MR_READY; - smbdirect_mr->conn = info; + smbdirect_mr->state = SMBDIRECT_MR_READY; + smbdirect_mr->socket = sc; list_add_tail(&smbdirect_mr->list, &info->mr_list); atomic_inc(&info->mr_ready_count); @@ -2389,10 +2390,10 @@ static int allocate_mr_list(struct smbd_connection *info) * issuing I/O trying to get MR at the same time, mr_list_lock is used to * protect this situation. */ -static struct smbd_mr *get_mr(struct smbd_connection *info) +static struct smbdirect_mr_io *get_mr(struct smbd_connection *info) { struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *ret; + struct smbdirect_mr_io *ret; int rc; again: rc = wait_event_interruptible(info->wait_mr, @@ -2410,8 +2411,8 @@ static struct smbd_mr *get_mr(struct smbd_connection *info) spin_lock(&info->mr_list_lock); list_for_each_entry(ret, &info->mr_list, list) { - if (ret->state == MR_READY) { - ret->state = MR_REGISTERED; + if (ret->state == SMBDIRECT_MR_READY) { + ret->state = SMBDIRECT_MR_REGISTERED; spin_unlock(&info->mr_list_lock); atomic_dec(&info->mr_ready_count); atomic_inc(&info->mr_used_count); @@ -2453,12 +2454,12 @@ static int smbd_iter_to_mr(struct smbd_connection *info, * need_invalidate: true if this MR needs to be locally invalidated after I/O * return value: the MR registered, NULL if failed. */ -struct smbd_mr *smbd_register_mr(struct smbd_connection *info, +struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate) { struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *smbdirect_mr; + struct smbdirect_mr_io *smbdirect_mr; int rc, num_pages; enum dma_data_direction dir; struct ib_reg_wr *reg_wr; @@ -2530,13 +2531,13 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n", rc, reg_wr->key); - /* If all failed, attempt to recover this MR by setting it MR_ERROR*/ + /* If all failed, attempt to recover this MR by setting it SMBDIRECT_MR_ERROR*/ map_mr_error: ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, smbdirect_mr->dir); dma_map_error: - smbdirect_mr->state = MR_ERROR; + smbdirect_mr->state = SMBDIRECT_MR_ERROR; if (atomic_dec_and_test(&info->mr_used_count)) wake_up(&info->wait_for_mr_cleanup); @@ -2547,15 +2548,15 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smbd_mr *smbdirect_mr; + struct smbdirect_mr_io *smbdirect_mr; struct ib_cqe *cqe; cqe = wc->wr_cqe; - smbdirect_mr = container_of(cqe, struct smbd_mr, cqe); - smbdirect_mr->state = MR_INVALIDATED; + smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe); + smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED; if (wc->status != IB_WC_SUCCESS) { log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status); - smbdirect_mr->state = MR_ERROR; + smbdirect_mr->state = SMBDIRECT_MR_ERROR; } complete(&smbdirect_mr->invalidate_done); } @@ -2566,11 +2567,12 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) * and we have to locally invalidate the buffer to prevent data is being * modified by remote peer 
after upper layer consumes it */ -int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) +int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr) { struct ib_send_wr *wr; - struct smbd_connection *info = smbdirect_mr->conn; - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = smbdirect_mr->socket; + struct smbd_connection *info = + container_of(sc, struct smbd_connection, socket); int rc = 0; if (smbdirect_mr->need_invalidate) { @@ -2594,17 +2596,17 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) smbdirect_mr->need_invalidate = false; } else /* - * For remote invalidation, just set it to MR_INVALIDATED + * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED * and defer to mr_recovery_work to recover the MR for next use */ - smbdirect_mr->state = MR_INVALIDATED; + smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED; - if (smbdirect_mr->state == MR_INVALIDATED) { + if (smbdirect_mr->state == SMBDIRECT_MR_INVALIDATED) { ib_dma_unmap_sg( sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, smbdirect_mr->dir); - smbdirect_mr->state = MR_READY; + smbdirect_mr->state = SMBDIRECT_MR_READY; if (atomic_inc_return(&info->mr_ready_count) == 1) wake_up(&info->wait_mr); } else diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 7773939db5f2c0..83e726967b2f83 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -71,34 +71,11 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg); int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst); -enum mr_state { - MR_READY, - MR_REGISTERED, - MR_INVALIDATED, - MR_ERROR -}; - -struct smbd_mr { - struct smbd_connection *conn; - struct list_head list; - enum mr_state state; - struct ib_mr *mr; - struct sg_table sgt; - enum dma_data_direction dir; - union { - struct ib_reg_wr wr; - struct ib_send_wr inv_wr; - }; - struct ib_cqe cqe; - bool need_invalidate; - struct completion invalidate_done; -}; - /* Interfaces to register and deregister MR for RDMA read/write */ -struct smbd_mr *smbd_register_mr( +struct smbdirect_mr_io *smbd_register_mr( struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate); -int smbd_deregister_mr(struct smbd_mr *mr); +int smbd_deregister_mr(struct smbdirect_mr_io *mr); #else #define cifs_rdma_enabled(server) 0 From 9a52e3b0d63fc5eaefee3707136fa3c5258ae39e Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 00:32:34 +0200 Subject: [PATCH 3023/3206] smb: client: make use of smbdirect_socket_parameters.max_frmr_depth This makes it easier to have common code later.
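As a rough illustration of the access pattern this change converges on, a minimal sketch (not part of the patch; the helper name example_max_rw_pages is hypothetical, the other names come from the surrounding diffs):

	/* Hypothetical helper: read the FRMR depth through the socket
	 * parameters instead of through struct smbd_connection.
	 */
	static u32 example_max_rw_pages(struct smbd_connection *conn)
	{
		const struct smbdirect_socket_parameters *sp =
			smbd_get_parameters(conn);

		/* maximum number of pages in a single RDMA read/write */
		return sp->max_frmr_depth;
	}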
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 +- fs/smb/client/file.c | 16 ++++++++++++---- fs/smb/client/smbdirect.c | 34 ++++++++++++++++++---------------- fs/smb/client/smbdirect.h | 2 -- 4 files changed, 31 insertions(+), 23 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 9c9f15871c2395..72d1ae56290fe0 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -501,7 +501,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_printf(m, "\nMR responder_resources: %u " "max_frmr_depth: %u mr_type: 0x%x", sp->responder_resources, - server->smbd_conn->max_frmr_depth, + sp->max_frmr_depth, server->smbd_conn->mr_type); seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u", atomic_read(&server->smbd_conn->mr_ready_count), diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index cb907e18cc3589..a5ed742afa007a 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -97,8 +97,12 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) cifs_trace_rw_credits_write_prepare); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->smbd_conn) - stream->sreq_max_segs = server->smbd_conn->max_frmr_depth; + if (server->smbd_conn) { + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); + + stream->sreq_max_segs = sp->max_frmr_depth; + } #endif } @@ -187,8 +191,12 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) cifs_trace_rw_credits_read_submit); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->smbd_conn) - rreq->io_streams[0].sreq_max_segs = server->smbd_conn->max_frmr_depth; + if (server->smbd_conn) { + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); + + rreq->io_streams[0].sreq_max_segs = sp->max_frmr_depth; + } #endif return 0; } diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 138d8837a12315..f85c14261c5947 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -472,8 +472,6 @@ static bool process_negotiation_response( struct smbdirect_recv_io *response, int packet_length) { struct smbdirect_socket *sc = response->socket; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response); @@ -529,8 +527,8 @@ static bool process_negotiation_response( sp->max_read_write_size = min_t(u32, le32_to_cpu(packet->max_readwrite_size), - info->max_frmr_depth * PAGE_SIZE); - info->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE; + sp->max_frmr_depth * PAGE_SIZE); + sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE; sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; return true; @@ -837,6 +835,7 @@ static int smbd_ia_open( struct sockaddr *dstaddr, int port) { struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int rc; WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); @@ -857,8 +856,8 @@ static int smbd_ia_open( rc = -EPROTONOSUPPORT; goto out2; } - info->max_frmr_depth = min_t(int, - smbd_max_frmr_depth, + sp->max_frmr_depth = min_t(u32, + sp->max_frmr_depth, sc->ib.dev->attrs.max_fast_reg_page_list_len); info->mr_type = IB_MR_TYPE_MEM_REG; if (sc->ib.dev->attrs.kernel_cap_flags & 
IBK_SG_GAPS_REG) @@ -1770,6 +1769,7 @@ static struct smbd_connection *_smbd_get_connection( sp->max_send_size = smbd_max_send_size; sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; sp->max_recv_size = smbd_max_receive_size; + sp->max_frmr_depth = smbd_max_frmr_depth; sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000; @@ -2255,6 +2255,7 @@ static void smbd_mr_recovery_work(struct work_struct *work) struct smbd_connection *info = container_of(work, struct smbd_connection, mr_recovery_work); struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_mr_io *smbdirect_mr; int rc; @@ -2273,11 +2274,11 @@ static void smbd_mr_recovery_work(struct work_struct *work) smbdirect_mr->mr = ib_alloc_mr( sc->ib.pd, info->mr_type, - info->max_frmr_depth); + sp->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", info->mr_type, - info->max_frmr_depth); + sp->max_frmr_depth); smbd_disconnect_rdma_connection(info); continue; } @@ -2348,13 +2349,13 @@ static int allocate_mr_list(struct smbd_connection *info) if (!smbdirect_mr) goto cleanup_entries; smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, info->mr_type, - info->max_frmr_depth); + sp->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", - info->mr_type, info->max_frmr_depth); + info->mr_type, sp->max_frmr_depth); goto out; } - smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth, + smbdirect_mr->sgt.sgl = kcalloc(sp->max_frmr_depth, sizeof(struct scatterlist), GFP_KERNEL); if (!smbdirect_mr->sgt.sgl) { @@ -2459,15 +2460,16 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, bool writing, bool need_invalidate) { struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_mr_io *smbdirect_mr; int rc, num_pages; enum dma_data_direction dir; struct ib_reg_wr *reg_wr; - num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1); - if (num_pages > info->max_frmr_depth) { + num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1); + if (num_pages > sp->max_frmr_depth) { log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", - num_pages, info->max_frmr_depth); + num_pages, sp->max_frmr_depth); WARN_ON_ONCE(1); return NULL; } @@ -2485,8 +2487,8 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, smbdirect_mr->sgt.orig_nents = 0; log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n", - num_pages, iov_iter_count(iter), info->max_frmr_depth); - smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth); + num_pages, iov_iter_count(iter), sp->max_frmr_depth); + smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, sp->max_frmr_depth); rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, dir); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 83e726967b2f83..c88ba6e11dd146 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -41,8 +41,6 @@ struct smbd_connection { /* Memory registrations */ - /* Maximum number of pages in a single RDMA write/read on this connection */ - int max_frmr_depth; enum ib_mr_type mr_type; struct list_head mr_list; spinlock_t mr_list_lock; From 02e6f092c7aa1fa12337978837ad64d1b75bbc82 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 00:57:19 +0200 Subject: [PATCH 
3024/3206] smb: client: make use of smbdirect_socket.mr_io Now struct smbd_connection only contains struct smbdirect_socket; this is an important step towards having common functions as well. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 6 +-- fs/smb/client/smbdirect.c | 81 +++++++++++++++++--------------------- fs/smb/client/smbdirect.h | 13 ------ 3 files changed, 40 insertions(+), 60 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 72d1ae56290fe0..35c4d27d2cc0ec 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -502,10 +502,10 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) "max_frmr_depth: %u mr_type: 0x%x", sp->responder_resources, sp->max_frmr_depth, - server->smbd_conn->mr_type); + sc->mr_io.type); seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u", - atomic_read(&server->smbd_conn->mr_ready_count), - atomic_read(&server->smbd_conn->mr_used_count)); + atomic_read(&sc->mr_io.ready.count), + atomic_read(&sc->mr_io.used.count)); skip_rdma: #endif seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x", diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index f85c14261c5947..12235aa6e5556c 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -169,8 +169,6 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) { struct smbdirect_socket *sc = container_of(work, struct smbdirect_socket, disconnect_work); - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); /* * make sure this and other work is not queued again * but here we don't block and avoid * disable_work_sync(), because it might cancel */ disable_work(&sc->disconnect_work); disable_work(&sc->recv_io.posted.refill_work); - disable_work(&info->mr_recovery_work); + disable_work(&sc->mr_io.recovery_work); disable_work(&sc->idle.immediate_work); disable_delayed_work(&sc->idle.timer_work); @@ -859,9 +857,9 @@ static int smbd_ia_open( sp->max_frmr_depth = min_t(u32, sp->max_frmr_depth, sc->ib.dev->attrs.max_fast_reg_page_list_len); - info->mr_type = IB_MR_TYPE_MEM_REG; + sc->mr_io.type = IB_MR_TYPE_MEM_REG; if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) - info->mr_type = IB_MR_TYPE_SG_GAPS; + sc->mr_io.type = IB_MR_TYPE_SG_GAPS; sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); if (IS_ERR(sc->ib.pd)) { @@ -1585,8 +1583,8 @@ void smbd_destroy(struct TCP_Server_Info *server) * path when sending data, and then release memory registrations.
*/ log_rdma_event(INFO, "freeing mr list\n"); - wake_up_all(&info->wait_mr); - while (atomic_read(&info->mr_used_count)) { + wake_up_all(&sc->mr_io.ready.wait_queue); + while (atomic_read(&sc->mr_io.used.count)) { cifs_server_unlock(server); msleep(1000); cifs_server_lock(server); @@ -2252,14 +2250,15 @@ static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) */ static void smbd_mr_recovery_work(struct work_struct *work) { - struct smbd_connection *info = - container_of(work, struct smbd_connection, mr_recovery_work); - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, mr_io.recovery_work); struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbd_connection *info = + container_of(sc, struct smbd_connection, socket); struct smbdirect_mr_io *smbdirect_mr; int rc; - list_for_each_entry(smbdirect_mr, &info->mr_list, list) { + list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) { if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) { /* recover this MR entry */ @@ -2273,11 +2272,11 @@ static void smbd_mr_recovery_work(struct work_struct *work) } smbdirect_mr->mr = ib_alloc_mr( - sc->ib.pd, info->mr_type, + sc->ib.pd, sc->mr_io.type, sp->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", - info->mr_type, + sc->mr_io.type, sp->max_frmr_depth); smbd_disconnect_rdma_connection(info); continue; @@ -2295,8 +2294,8 @@ static void smbd_mr_recovery_work(struct work_struct *work) * value is updated before waking up any calls to * get_mr() from the I/O issuing CPUs */ - if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up(&info->wait_mr); + if (atomic_inc_return(&sc->mr_io.ready.count) == 1) + wake_up(&sc->mr_io.ready.wait_queue); } } @@ -2305,8 +2304,8 @@ static void destroy_mr_list(struct smbd_connection *info) struct smbdirect_socket *sc = &info->socket; struct smbdirect_mr_io *mr, *tmp; - disable_work_sync(&info->mr_recovery_work); - list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { + disable_work_sync(&sc->mr_io.recovery_work); + list_for_each_entry_safe(mr, tmp, &sc->mr_io.all.list, list) { if (mr->state == SMBDIRECT_MR_INVALIDATED) ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); @@ -2330,13 +2329,7 @@ static int allocate_mr_list(struct smbd_connection *info) int i; struct smbdirect_mr_io *smbdirect_mr, *tmp; - INIT_LIST_HEAD(&info->mr_list); - init_waitqueue_head(&info->wait_mr); - spin_lock_init(&info->mr_list_lock); - atomic_set(&info->mr_ready_count, 0); - atomic_set(&info->mr_used_count, 0); - init_waitqueue_head(&info->wait_for_mr_cleanup); - INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); + INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); if (sp->responder_resources == 0) { log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); @@ -2348,11 +2341,11 @@ static int allocate_mr_list(struct smbd_connection *info) smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); if (!smbdirect_mr) goto cleanup_entries; - smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, info->mr_type, + smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, sc->mr_io.type, sp->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", - info->mr_type, sp->max_frmr_depth); + sc->mr_io.type, sp->max_frmr_depth); goto out; } smbdirect_mr->sgt.sgl = kcalloc(sp->max_frmr_depth, @@ -2366,15 +2359,15 @@ static int allocate_mr_list(struct smbd_connection *info) 
smbdirect_mr->state = SMBDIRECT_MR_READY; smbdirect_mr->socket = sc; - list_add_tail(&smbdirect_mr->list, &info->mr_list); - atomic_inc(&info->mr_ready_count); + list_add_tail(&smbdirect_mr->list, &sc->mr_io.all.list); + atomic_inc(&sc->mr_io.ready.count); } return 0; out: kfree(smbdirect_mr); cleanup_entries: - list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { + list_for_each_entry_safe(smbdirect_mr, tmp, &sc->mr_io.all.list, list) { list_del(&smbdirect_mr->list); ib_dereg_mr(smbdirect_mr->mr); kfree(smbdirect_mr->sgt.sgl); @@ -2397,8 +2390,8 @@ static struct smbdirect_mr_io *get_mr(struct smbd_connection *info) struct smbdirect_mr_io *ret; int rc; again: - rc = wait_event_interruptible(info->wait_mr, - atomic_read(&info->mr_ready_count) || + rc = wait_event_interruptible(sc->mr_io.ready.wait_queue, + atomic_read(&sc->mr_io.ready.count) || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) { log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc); @@ -2410,18 +2403,18 @@ static struct smbdirect_mr_io *get_mr(struct smbd_connection *info) return NULL; } - spin_lock(&info->mr_list_lock); - list_for_each_entry(ret, &info->mr_list, list) { + spin_lock(&sc->mr_io.all.lock); + list_for_each_entry(ret, &sc->mr_io.all.list, list) { if (ret->state == SMBDIRECT_MR_READY) { ret->state = SMBDIRECT_MR_REGISTERED; - spin_unlock(&info->mr_list_lock); - atomic_dec(&info->mr_ready_count); - atomic_inc(&info->mr_used_count); + spin_unlock(&sc->mr_io.all.lock); + atomic_dec(&sc->mr_io.ready.count); + atomic_inc(&sc->mr_io.used.count); return ret; } } - spin_unlock(&info->mr_list_lock); + spin_unlock(&sc->mr_io.all.lock); /* * It is possible that we could fail to get MR because other processes may * try to acquire a MR at the same time. If this is the case, retry it. 
@@ -2540,8 +2533,8 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, dma_map_error: smbdirect_mr->state = SMBDIRECT_MR_ERROR; - if (atomic_dec_and_test(&info->mr_used_count)) - wake_up(&info->wait_for_mr_cleanup); + if (atomic_dec_and_test(&sc->mr_io.used.count)) + wake_up(&sc->mr_io.cleanup.wait_queue); smbd_disconnect_rdma_connection(info); @@ -2609,18 +2602,18 @@ int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr) smbdirect_mr->sgt.nents, smbdirect_mr->dir); smbdirect_mr->state = SMBDIRECT_MR_READY; - if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up(&info->wait_mr); + if (atomic_inc_return(&sc->mr_io.ready.count) == 1) + wake_up(&sc->mr_io.ready.wait_queue); } else /* * Schedule the work to do MR recovery for future I/Os MR * recovery is slow and don't want it to block current I/O */ - queue_work(sc->workqueue, &info->mr_recovery_work); + queue_work(sc->workqueue, &sc->mr_io.recovery_work); done: - if (atomic_dec_and_test(&info->mr_used_count)) - wake_up(&info->wait_for_mr_cleanup); + if (atomic_dec_and_test(&sc->mr_io.used.count)) + wake_up(&sc->mr_io.cleanup.wait_queue); return rc; } diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index c88ba6e11dd146..d67ac5ddaff4e5 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -38,19 +38,6 @@ extern int smbd_receive_credit_max; */ struct smbd_connection { struct smbdirect_socket socket; - - - /* Memory registrations */ - enum ib_mr_type mr_type; - struct list_head mr_list; - spinlock_t mr_list_lock; - /* The number of available MRs ready for memory registration */ - atomic_t mr_ready_count; - atomic_t mr_used_count; - wait_queue_head_t wait_mr; - struct work_struct mr_recovery_work; - /* Used by transport to wait until all MRs are returned */ - wait_queue_head_t wait_for_mr_cleanup; }; /* Create a SMBDirect session */ From f6de7482b42941056f2b3cef0e831abce62ebfa7 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 01:07:11 +0200 Subject: [PATCH 3025/3206] smb: client: pass struct smbdirect_socket to {get,put}_receive_buffer() This will make it easier to move these functions to the common code in future.
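A minimal sketch of the resulting calling convention (assembled from the diffs below; callers that still hold a struct smbd_connection derive the socket once and pass it down):

	struct smbdirect_socket *sc = &info->socket;
	struct smbdirect_recv_io *response;

	/* both helpers now operate on the socket only */
	response = get_receive_buffer(sc);
	if (response)
		put_receive_buffer(sc, response);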
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 12235aa6e5556c..db37676ced7eed 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -21,9 +21,9 @@ const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connec } static struct smbdirect_recv_io *get_receive_buffer( - struct smbd_connection *info); + struct smbdirect_socket *sc); static void put_receive_buffer( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response); static int allocate_receive_buffers(struct smbd_connection *info, int num_buf); static void destroy_receive_buffers(struct smbd_connection *info); @@ -548,7 +548,7 @@ static void smbd_post_send_credits(struct work_struct *work) if (sc->recv_io.credits.target > atomic_read(&sc->recv_io.credits.count)) { while (true) { - response = get_receive_buffer(info); + response = get_receive_buffer(sc); if (!response) break; @@ -557,7 +557,7 @@ static void smbd_post_send_credits(struct work_struct *work) if (rc) { log_rdma_recv(ERR, "post_recv failed rc=%d\n", rc); - put_receive_buffer(info, response); + put_receive_buffer(sc, response); break; } @@ -623,7 +623,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = true; negotiate_done = process_negotiation_response(response, wc->byte_len); - put_receive_buffer(info, response); + put_receive_buffer(sc, response); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING); if (!negotiate_done) { sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; @@ -708,7 +708,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) enqueue_reassembly(info, response, data_length); wake_up(&sc->recv_io.reassembly.wait_queue); } else - put_receive_buffer(info, response); + put_receive_buffer(sc, response); return; @@ -723,7 +723,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected); WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); error: - put_receive_buffer(info, response); + put_receive_buffer(sc, response); smbd_disconnect_rdma_connection(info); } @@ -1287,7 +1287,7 @@ static int smbd_negotiate(struct smbd_connection *info) struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; int rc; - struct smbdirect_recv_io *response = get_receive_buffer(info); + struct smbdirect_recv_io *response = get_receive_buffer(sc); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; @@ -1298,7 +1298,7 @@ static int smbd_negotiate(struct smbd_connection *info) rc, response->sge.addr, response->sge.length, response->sge.lkey); if (rc) { - put_receive_buffer(info, response); + put_receive_buffer(sc, response); return rc; } @@ -1381,9 +1381,8 @@ static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *i * pre-allocated in advance. 
* return value: the receive buffer, NULL if none is available */ -static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info) +static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *ret = NULL; unsigned long flags; @@ -1407,9 +1406,8 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info * receive buffer is returned. */ static void put_receive_buffer( - struct smbd_connection *info, struct smbdirect_recv_io *response) + struct smbdirect_socket *sc, struct smbdirect_recv_io *response) { - struct smbdirect_socket *sc = &info->socket; unsigned long flags; if (likely(response->sge.length != 0)) { @@ -1464,7 +1462,7 @@ static void destroy_receive_buffers(struct smbd_connection *info) struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *response; - while ((response = get_receive_buffer(info))) + while ((response = get_receive_buffer(sc))) mempool_free(response, sc->recv_io.mem.pool); } @@ -1565,7 +1563,7 @@ void smbd_destroy(struct TCP_Server_Info *server) list_del(&response->list); spin_unlock_irqrestore( &sc->recv_io.reassembly.lock, flags); - put_receive_buffer(info, response); + put_receive_buffer(sc, response); } else spin_unlock_irqrestore( &sc->recv_io.reassembly.lock, flags); @@ -2092,7 +2090,7 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) } queue_removed++; sc->statistics.dequeue_reassembly_queue++; - put_receive_buffer(info, response); + put_receive_buffer(sc, response); offset = 0; log_read(INFO, "put_receive_buffer offset=0\n"); } else From 163e0ff8c6e1ba5d436e125291d05951dcc6ba1c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 01:11:36 +0200 Subject: [PATCH 3026/3206] smb: client: pass struct smbdirect_socket to {allocate,destroy}_receive_buffers() This will make it easier to move these functions to the common code in future.
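Where common code ends up holding only the socket but client-specific state is still needed, the series recovers the wrapper via container_of(); a sketch of that pattern (the helper name smbd_conn_from_socket is hypothetical, the cast itself appears inline throughout the diffs):

	static struct smbd_connection *
	smbd_conn_from_socket(struct smbdirect_socket *sc)
	{
		/* the socket is embedded in the client connection */
		return container_of(sc, struct smbd_connection, socket);
	}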
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index db37676ced7eed..91bdf1e34f8cc1 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -25,8 +25,8 @@ static struct smbdirect_recv_io *get_receive_buffer( static void put_receive_buffer( struct smbdirect_socket *sc, struct smbdirect_recv_io *response); -static int allocate_receive_buffers(struct smbd_connection *info, int num_buf); -static void destroy_receive_buffers(struct smbd_connection *info); +static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf); +static void destroy_receive_buffers(struct smbdirect_socket *sc); static void enqueue_reassembly( struct smbd_connection *info, @@ -1427,9 +1427,8 @@ static void put_receive_buffer( } /* Preallocate all receive buffer on transport establishment */ -static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) +static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *response; int i; @@ -1457,9 +1456,8 @@ static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) return -ENOMEM; } -static void destroy_receive_buffers(struct smbd_connection *info) +static void destroy_receive_buffers(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *response; while ((response = get_receive_buffer(sc))) @@ -1571,7 +1569,7 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->recv_io.reassembly.data_length = 0; log_rdma_event(INFO, "free receive buffers\n"); - destroy_receive_buffers(info); + destroy_receive_buffers(sc); /* * For performance reasons, memory registration and deregistration @@ -1649,7 +1647,7 @@ static void destroy_caches_and_workqueue(struct smbd_connection *info) { struct smbdirect_socket *sc = &info->socket; - destroy_receive_buffers(info); + destroy_receive_buffers(sc); destroy_workqueue(sc->workqueue); mempool_destroy(sc->recv_io.mem.pool); kmem_cache_destroy(sc->recv_io.mem.cache); @@ -1710,7 +1708,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (!sc->workqueue) goto out4; - rc = allocate_receive_buffers(info, sp->recv_credit_max); + rc = allocate_receive_buffers(sc, sp->recv_credit_max); if (rc) { log_rdma_event(ERR, "failed to allocate receive buffers\n"); goto out5; From b8a5d5d1aa00728a4604cc2c2775c0664e821173 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 01:14:38 +0200 Subject: [PATCH 3027/3206] smb: client: pass struct smbdirect_socket to {allocate,destroy}_caches_and_workqueue() This will make it easier to move these functions to the common code in future.
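The conversion keeps setup and teardown symmetric on the socket; a condensed sketch of the pairing as used in the connection setup path (error unwinding abbreviated, names from the diffs):

	rc = allocate_caches_and_workqueue(sc);
	if (rc) {
		log_rdma_event(ERR, "cache allocation failed\n");
		goto allocate_cache_failed;
	}
	/* ... later, on shutdown or negotiate failure ... */
	destroy_caches_and_workqueue(sc);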
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 91bdf1e34f8cc1..0ac4b76ed76b72 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1643,10 +1643,8 @@ int smbd_reconnect(struct TCP_Server_Info *server) return -ENOENT; } -static void destroy_caches_and_workqueue(struct smbd_connection *info) +static void destroy_caches_and_workqueue(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; - destroy_receive_buffers(sc); destroy_workqueue(sc->workqueue); mempool_destroy(sc->recv_io.mem.pool); @@ -1656,9 +1654,8 @@ static void destroy_caches_and_workqueue(struct smbd_connection *info) } #define MAX_NAME_LEN 80 -static int allocate_caches_and_workqueue(struct smbd_connection *info) +static int allocate_caches_and_workqueue(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; char name[MAX_NAME_LEN]; int rc; @@ -1666,7 +1663,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer))) return -ENOMEM; - scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc); sc->send_io.mem.cache = kmem_cache_create( name, @@ -1682,7 +1679,7 @@ if (!sc->send_io.mem.pool) goto out1; - scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc); struct kmem_cache_args response_args = { .align = __alignof__(struct smbdirect_recv_io), @@ -1703,7 +1700,7 @@ if (!sc->recv_io.mem.pool) goto out3; - scnprintf(name, MAX_NAME_LEN, "smbd_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbd_%p", sc); sc->workqueue = create_workqueue(name); if (!sc->workqueue) goto out4; @@ -1889,7 +1886,7 @@ static struct smbd_connection *_smbd_get_connection( log_rdma_event(INFO, "rdma_connect connected\n"); - rc = allocate_caches_and_workqueue(info); + rc = allocate_caches_and_workqueue(sc); if (rc) { log_rdma_event(ERR, "cache allocation failed\n"); goto allocate_cache_failed; @@ -1929,7 +1926,7 @@ negotiation_failed: disable_delayed_work_sync(&sc->idle.timer_work); - destroy_caches_and_workqueue(info); + destroy_caches_and_workqueue(sc); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; rdma_disconnect(sc->rdma.cm_id); wait_event(sc->status_wait, From 419fc78fd81ffa6d6da847ba46050db1da64b874 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 01:17:35 +0200 Subject: [PATCH 3028/3206] smb: client: pass struct smbdirect_socket to {enqueue,_get_first}_reassembly() This will make it easier to move these functions to the common code in future.
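A sketch of how the reassembly queue is drained with the socket-based helpers (locking pattern as in smbd_destroy() in the diffs; abbreviated):

	spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
	response = _get_first_reassembly(sc);
	if (response)
		list_del(&response->list);
	spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);

	/* the buffer is returned outside the lock */
	if (response)
		put_receive_buffer(sc, response);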
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 0ac4b76ed76b72..8e354d10038a0a 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -29,10 +29,10 @@ static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf); static void destroy_receive_buffers(struct smbdirect_socket *sc); static void enqueue_reassembly( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response, int data_length); static struct smbdirect_recv_io *_get_first_reassembly( - struct smbd_connection *info); + struct smbdirect_socket *sc); static int smbd_post_recv( struct smbd_connection *info, @@ -705,7 +705,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if (sc->recv_io.credits.target > old_recv_credit_target) queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); - enqueue_reassembly(info, response, data_length); + enqueue_reassembly(sc, response, data_length); wake_up(&sc->recv_io.reassembly.wait_queue); } else put_receive_buffer(sc, response); @@ -1336,12 +1336,10 @@ static int smbd_negotiate(struct smbd_connection *info) * data_length: the size of payload in this packet */ static void enqueue_reassembly( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response, int data_length) { - struct smbdirect_socket *sc = &info->socket; spin_lock(&sc->recv_io.reassembly.lock); list_add_tail(&response->list, &sc->recv_io.reassembly.list); sc->recv_io.reassembly.queue_length++; @@ -1362,9 +1360,8 @@ static void enqueue_reassembly( * Caller is responsible for locking * return value: the first entry if any, NULL if queue is empty */ -static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *info) +static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *ret = NULL; if (!list_empty(&sc->recv_io.reassembly.list)) { @@ -1556,7 +1553,7 @@ void smbd_destroy(struct TCP_Server_Info *server) log_rdma_event(INFO, "drain the reassembly queue\n"); do { spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); - response = _get_first_reassembly(info); + response = _get_first_reassembly(sc); if (response) { list_del(&response->list); spin_unlock_irqrestore( &sc->recv_io.reassembly.lock, flags); @@ -2032,7 +2029,7 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) to_read = size; offset = sc->recv_io.reassembly.first_entry_offset; while (data_read < size) { - response = _get_first_reassembly(info); + response = _get_first_reassembly(sc); data_transfer = smbdirect_recv_io_payload(response); data_length = le32_to_cpu(data_transfer->data_length); remaining_data_length = From a8a45d4c9597ff5357a44e88ce9910ccb52add06 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 10:10:10 +0200 Subject: [PATCH 3029/3206] smb: client: pass struct smbdirect_socket to {allocate,destroy}_mr_list() This will make it easier to move these functions to the common code in future.
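For reference, the shape of the list teardown that now operates purely on the socket (condensed from the diffs; the final kfree() of each entry is assumed, as it is not visible above):

	list_for_each_entry_safe(mr, tmp, &sc->mr_io.all.list, list) {
		if (mr->state == SMBDIRECT_MR_INVALIDATED)
			ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl,
					mr->sgt.nents, mr->dir);
		ib_dereg_mr(mr->mr);
		kfree(mr->sgt.sgl);
		kfree(mr);	/* assumed: frees the entry itself */
	}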
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 8e354d10038a0a..de0ac75b975b38 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -40,8 +40,8 @@ static int smbd_post_recv( static int smbd_post_send_empty(struct smbd_connection *info); -static void destroy_mr_list(struct smbd_connection *info); -static int allocate_mr_list(struct smbd_connection *info); +static void destroy_mr_list(struct smbdirect_socket *sc); +static int allocate_mr_list(struct smbdirect_socket *sc); struct smb_extract_to_rdma { struct ib_sge *sge; @@ -1582,7 +1582,7 @@ void smbd_destroy(struct TCP_Server_Info *server) msleep(1000); cifs_server_lock(server); } - destroy_mr_list(info); + destroy_mr_list(sc); ib_free_cq(sc->ib.send_cq); ib_free_cq(sc->ib.recv_cq); @@ -1907,7 +1907,7 @@ static struct smbd_connection *_smbd_get_connection( goto negotiation_failed; } - rc = allocate_mr_list(info); + rc = allocate_mr_list(sc); if (rc) { log_rdma_mr(ERR, "memory registration allocation failed\n"); goto allocate_mr_failed; @@ -2289,9 +2289,8 @@ static void smbd_mr_recovery_work(struct work_struct *work) } } -static void destroy_mr_list(struct smbd_connection *info) +static void destroy_mr_list(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_mr_io *mr, *tmp; disable_work_sync(&sc->mr_io.recovery_work); @@ -2312,9 +2311,8 @@ * Recovery is done in smbd_mr_recovery_work. The content of list entry changes * as MRs are used and recovered for I/O, but the list links will not change */ -static int allocate_mr_list(struct smbd_connection *info) +static int allocate_mr_list(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; int i; struct smbdirect_mr_io *smbdirect_mr, *tmp; From c7e4d5facb47698411d144ea1c968ef3e1794be9 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 10:12:35 +0200 Subject: [PATCH 3030/3206] smb: client: pass struct smbdirect_socket to smbd_disconnect_rdma_connection() This will make it easier to move the function to the common code in future.
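After this change an error path only needs the socket to trigger teardown; a minimal sketch of the pattern (taken from the smbd_post_send() hunk in the diffs below):

	rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
	if (rc) {
		log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
		/* queues sc->disconnect_work on sc->workqueue */
		smbd_disconnect_rdma_connection(sc);
		rc = -EAGAIN;
	}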
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 41 +++++++++++++-------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index de0ac75b975b38..9c2fb138ce1f41 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -215,10 +215,8 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) } } -static void smbd_disconnect_rdma_connection(struct smbd_connection *info) +static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; - queue_work(sc->workqueue, &sc->disconnect_work); } @@ -390,6 +388,7 @@ static void smbd_qp_async_error_upcall(struct ib_event *event, void *context) { struct smbd_connection *info = context; + struct smbdirect_socket *sc = &info->socket; log_rdma_event(ERR, "%s on device %s info %p\n", ib_event_msg(event->event), event->device->name, info); @@ -397,7 +396,7 @@ smbd_qp_async_error_upcall(struct ib_event *event, void *context) switch (event->event) { case IB_EVENT_CQ_ERR: case IB_EVENT_QP_FATAL: - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); break; default: @@ -422,8 +421,6 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_send_io *request = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); struct smbdirect_socket *sc = request->socket; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n", request, ib_wc_status_msg(wc->status)); @@ -439,7 +436,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n", ib_wc_status_msg(wc->status), wc->opcode); mempool_free(request, sc->send_io.mem.pool); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); return; } @@ -581,8 +578,6 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); struct smbdirect_socket *sc = response->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); u16 old_recv_credit_target; u32 data_offset = 0; u32 data_length = 0; @@ -724,7 +719,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); error: put_receive_buffer(sc, response); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); } static struct rdma_cm_id *smbd_create_id( @@ -949,7 +944,7 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr, request->sge[0].length, DMA_TO_DEVICE); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); dma_mapping_failed: mempool_free(request, sc->send_io.mem.pool); @@ -1041,7 +1036,7 @@ static int smbd_post_send(struct smbd_connection *info, rc = ib_post_send(sc->ib.qp, &send_wr, NULL); if (rc) { log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); rc = -EAGAIN; } @@ -1274,7 +1269,7 @@ static int smbd_post_recv( ib_dma_unmap_single(sc->ib.dev, response->sge.addr, response->sge.length, 
DMA_FROM_DEVICE); response->sge.length = 0; - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc); } @@ -1481,14 +1476,12 @@ static void idle_connection_timer(struct work_struct *work) struct smbdirect_socket *sc = container_of(work, struct smbdirect_socket, idle.timer_work.work); struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { log_keep_alive(ERR, "error status sc->idle.keepalive=%d\n", sc->idle.keepalive); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); return; } @@ -2220,12 +2213,10 @@ static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_mr_io *mr = container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe); struct smbdirect_socket *sc = mr->socket; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); if (wc->status) { log_rdma_mr(ERR, "status=%d\n", wc->status); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); } } @@ -2243,8 +2234,6 @@ static void smbd_mr_recovery_work(struct work_struct *work) struct smbdirect_socket *sc = container_of(work, struct smbdirect_socket, mr_io.recovery_work); struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); struct smbdirect_mr_io *smbdirect_mr; int rc; @@ -2257,7 +2246,7 @@ static void smbd_mr_recovery_work(struct work_struct *work) log_rdma_mr(ERR, "ib_dereg_mr failed rc=%x\n", rc); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); continue; } @@ -2268,7 +2257,7 @@ static void smbd_mr_recovery_work(struct work_struct *work) log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", sc->mr_io.type, sp->max_frmr_depth); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); continue; } } else @@ -2524,7 +2513,7 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, if (atomic_dec_and_test(&sc->mr_io.used.count)) wake_up(&sc->mr_io.cleanup.wait_queue); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); return NULL; } @@ -2554,8 +2543,6 @@ int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr) { struct ib_send_wr *wr; struct smbdirect_socket *sc = smbdirect_mr->socket; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); int rc = 0; if (smbdirect_mr->need_invalidate) { @@ -2572,7 +2559,7 @@ int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr) rc = ib_post_send(sc->ib.qp, wr, NULL); if (rc) { log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); goto done; } wait_for_completion(&smbdirect_mr->invalidate_done); From be8602d47d4116da477cba0e4284295892887b24 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 10:14:29 +0200 Subject: [PATCH 3031/3206] smb: client: pass struct smbdirect_socket to smbd_post_recv() This will make it easier to move the function to the common code in future.
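A sketch of the receive-credit refill path with the new signature (as in the smbd_post_send_credits() loop in the diffs; the snippet runs inside a while (true) loop and is abbreviated):

	response = get_receive_buffer(sc);
	if (!response)
		break;

	response->first_segment = false;
	rc = smbd_post_recv(sc, response);
	if (rc) {
		log_rdma_recv(ERR, "post_recv failed rc=%d\n", rc);
		put_receive_buffer(sc, response);
		break;
	}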
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 9c2fb138ce1f41..809c529b1b3c15 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -35,7 +35,7 @@ static struct smbdirect_recv_io *_get_first_reassembly( struct smbdirect_socket *sc); static int smbd_post_recv( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response); static int smbd_post_send_empty(struct smbd_connection *info); @@ -535,8 +535,6 @@ static void smbd_post_send_credits(struct work_struct *work) struct smbdirect_recv_io *response; struct smbdirect_socket *sc = container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { return; @@ -550,7 +548,7 @@ response->first_segment = false; - rc = smbd_post_recv(info, response); + rc = smbd_post_recv(sc, response); if (rc) { log_rdma_recv(ERR, "post_recv failed rc=%d\n", rc); @@ -1241,9 +1239,8 @@ static int smbd_post_send_full_iter(struct smbd_connection *info, * The interaction is controlled by send/receive credit system */ static int smbd_post_recv( - struct smbd_connection *info, struct smbdirect_recv_io *response) + struct smbdirect_socket *sc, struct smbdirect_recv_io *response) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_recv_wr recv_wr; int rc = -EIO; @@ -1288,7 +1285,7 @@ static int smbd_negotiate(struct smbd_connection *info) sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP; - rc = smbd_post_recv(info, response); + rc = smbd_post_recv(sc, response); log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", rc, response->sge.addr, response->sge.length, response->sge.lkey); From 869bb7284fb2976b59ad29455cddd44bce0976f7 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 10:18:12 +0200 Subject: [PATCH 3032/3206] smb: client: pass struct smbdirect_socket to manage_credits_prior_sending() This will make it easier to move the function to the common code in future. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 809c529b1b3c15..adebee32442aac 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -957,9 +957,8 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) * buffer as possible, and extend the receive credits to remote peer * return value: the new credtis being granted.
*/ -static int manage_credits_prior_sending(struct smbd_connection *info) +static int manage_credits_prior_sending(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; int new_credits; if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) @@ -1127,7 +1126,7 @@ static int smbd_post_send_iter(struct smbd_connection *info, packet = smbdirect_send_io_payload(request); packet->credits_requested = cpu_to_le16(sp->send_credit_target); - new_credits = manage_credits_prior_sending(info); + new_credits = manage_credits_prior_sending(sc); atomic_add(new_credits, &sc->recv_io.credits.count); packet->credits_granted = cpu_to_le16(new_credits); From 8a9919b2bf8199206e5c5e223e2dd3d04b93a4e2 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 10:19:28 +0200 Subject: [PATCH 3033/3206] smb: client: pass struct smbdirect_socket to smbd_post_send() This will make it easier to move the function to common code in the future. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index adebee32442aac..dc3b2f0aab0695 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1003,10 +1003,9 @@ static int manage_keep_alive_before_sending(struct smbd_connection *info) } /* Post the send request */ -static int smbd_post_send(struct smbd_connection *info, +static int smbd_post_send(struct smbdirect_socket *sc, struct smbdirect_send_io *request) { - struct smbdirect_socket *sc = &info->socket; struct ib_send_wr send_wr; int rc, i; @@ -1169,7 +1168,7 @@ static int smbd_post_send_iter(struct smbd_connection *info, request->sge[0].length = header_length; request->sge[0].lkey = sc->ib.pd->local_dma_lkey; - rc = smbd_post_send(info, request); + rc = smbd_post_send(sc, request); if (!rc) return 0; From 8cead970e37c40727d9b93ce49e7c445c2caf7d0 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 10:21:16 +0200 Subject: [PATCH 3034/3206] smb: client: pass struct smbdirect_socket to manage_keep_alive_before_sending() This will make it easier to move the function to common code in the future.
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index dc3b2f0aab0695..7990c0409193c9 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -984,9 +984,8 @@ static int manage_credits_prior_sending(struct smbdirect_socket *sc) * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set * 0: otherwise */ -static int manage_keep_alive_before_sending(struct smbd_connection *info) +static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { @@ -1130,7 +1129,7 @@ static int smbd_post_send_iter(struct smbd_connection *info, packet->credits_granted = cpu_to_le16(new_credits); packet->flags = 0; - if (manage_keep_alive_before_sending(info)) + if (manage_keep_alive_before_sending(sc)) packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); packet->reserved = 0; From 46b8fe2bd18c487aa4fd5c45e6f9c23a1c4c859a Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 10:27:01 +0200 Subject: [PATCH 3035/3206] smb: client: pass struct smbdirect_socket to smbd_post_send_iter() This will make it easier to move the function to common code in the future. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 7990c0409193c9..211cf391edb5ab 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1038,11 +1038,10 @@ static int smbd_post_send(struct smbdirect_socket *sc, return rc; } -static int smbd_post_send_iter(struct smbd_connection *info, +static int smbd_post_send_iter(struct smbdirect_socket *sc, struct iov_iter *iter, int *_remaining_data_length) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; int i, rc; int header_length; @@ -1206,13 +1205,14 @@ static int smbd_post_send_empty(struct smbd_connection *info) int remaining_data_length = 0; sc->statistics.send_empty++; - return smbd_post_send_iter(info, NULL, &remaining_data_length); + return smbd_post_send_iter(sc, NULL, &remaining_data_length); } static int smbd_post_send_full_iter(struct smbd_connection *info, struct iov_iter *iter, int *_remaining_data_length) { + struct smbdirect_socket *sc = &info->socket; int rc = 0; /* @@ -1222,7 +1222,7 @@ static int smbd_post_send_full_iter(struct smbd_connection *info, */ while (iov_iter_count(iter) > 0) { - rc = smbd_post_send_iter(info, iter, _remaining_data_length); + rc = smbd_post_send_iter(sc, iter, _remaining_data_length); if (rc < 0) break; } From d76e8c7f546b2d2a49149b4ee59583aad0282c93 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 10:47:06 +0200 Subject: [PATCH 3036/3206] smb: client: pass struct smbdirect_socket to smbd_post_send_empty() This will make it easier to move the function to common code in the future.
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 211cf391edb5ab..5784e82f79115b 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -38,7 +38,7 @@ static int smbd_post_recv( struct smbdirect_socket *sc, struct smbdirect_recv_io *response); -static int smbd_post_send_empty(struct smbd_connection *info); +static int smbd_post_send_empty(struct smbdirect_socket *sc); static void destroy_mr_list(struct smbdirect_socket *sc); static int allocate_mr_list(struct smbdirect_socket *sc); @@ -1199,9 +1199,8 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc, * Empty message is used to extend credits to peer to for keep live * while there is no upper layer payload to send at the time */ -static int smbd_post_send_empty(struct smbd_connection *info) +static int smbd_post_send_empty(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; int remaining_data_length = 0; sc->statistics.send_empty++; @@ -1454,14 +1453,12 @@ static void send_immediate_empty_message(struct work_struct *work) { struct smbdirect_socket *sc = container_of(work, struct smbdirect_socket, idle.immediate_work); - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return; log_keep_alive(INFO, "send an empty message\n"); - smbd_post_send_empty(info); + smbd_post_send_empty(sc); } /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */ From 05bd1378c821f4cea2818b0bef6b28415ffca56f Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 11:03:52 +0200 Subject: [PATCH 3037/3206] smb: client: pass struct smbdirect_socket to smbd_post_send_full_iter() This will make it easier to move the function to common code in the future.
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 5784e82f79115b..951e70403aa559 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1207,11 +1207,10 @@ static int smbd_post_send_empty(struct smbdirect_socket *sc) return smbd_post_send_iter(sc, NULL, &remaining_data_length); } -static int smbd_post_send_full_iter(struct smbd_connection *info, +static int smbd_post_send_full_iter(struct smbdirect_socket *sc, struct iov_iter *iter, int *_remaining_data_length) { - struct smbdirect_socket *sc = &info->socket; int rc = 0; /* @@ -2168,13 +2167,13 @@ int smbd_send(struct TCP_Server_Info *server, klen += rqst->rq_iov[i].iov_len; iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); - rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length); + rc = smbd_post_send_full_iter(sc, &iter, &remaining_data_length); if (rc < 0) break; if (iov_iter_count(&rqst->rq_iter) > 0) { /* And then the data pages if there are any */ - rc = smbd_post_send_full_iter(info, &rqst->rq_iter, + rc = smbd_post_send_full_iter(sc, &rqst->rq_iter, &remaining_data_length); if (rc < 0) break; From e3f095cca6563e6aaf9163690e22800f765cd193 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 11:07:48 +0200 Subject: [PATCH 3038/3206] smb: client: pass struct smbdirect_socket to smbd_conn_upcall() This will make it easier to move the function to common code in the future. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 951e70403aa559..90bfb998684898 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -224,8 +224,7 @@ static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) static int smbd_conn_upcall( struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct smbd_connection *info = id->context; - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = id->context; struct smbdirect_socket_parameters *sp = &sc->parameters; const char *event_name = rdma_event_msg(event->event); u8 peer_initiator_depth; @@ -730,7 +729,7 @@ static struct rdma_cm_id *smbd_create_id( int rc; __be16 *sport; - id = rdma_create_id(&init_net, smbd_conn_upcall, info, + id = rdma_create_id(&init_net, smbd_conn_upcall, sc, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); From 94a0e794b77f3300c99fcac65d1a4fc83fd4684f Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 11:08:13 +0200 Subject: [PATCH 3039/3206] smb: client: pass struct smbdirect_socket to smbd_qp_async_error_upcall() This will make it easier to move the function to common code in the future.
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 90bfb998684898..dd1722443365eb 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -386,11 +386,10 @@ static int smbd_conn_upcall( static void smbd_qp_async_error_upcall(struct ib_event *event, void *context) { - struct smbd_connection *info = context; - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = context; - log_rdma_event(ERR, "%s on device %s info %p\n", - ib_event_msg(event->event), event->device->name, info); + log_rdma_event(ERR, "%s on device %s socket %p\n", + ib_event_msg(event->event), event->device->name, sc); switch (event->event) { case IB_EVENT_CQ_ERR: @@ -1779,7 +1778,7 @@ static struct smbd_connection *_smbd_get_connection( } sc->ib.send_cq = - ib_alloc_cq_any(sc->ib.dev, info, + ib_alloc_cq_any(sc->ib.dev, sc, sp->send_credit_target, IB_POLL_SOFTIRQ); if (IS_ERR(sc->ib.send_cq)) { sc->ib.send_cq = NULL; @@ -1787,7 +1786,7 @@ static struct smbd_connection *_smbd_get_connection( } sc->ib.recv_cq = - ib_alloc_cq_any(sc->ib.dev, info, + ib_alloc_cq_any(sc->ib.dev, sc, sp->recv_credit_max, IB_POLL_SOFTIRQ); if (IS_ERR(sc->ib.recv_cq)) { sc->ib.recv_cq = NULL; @@ -1796,7 +1795,7 @@ static struct smbd_connection *_smbd_get_connection( memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.event_handler = smbd_qp_async_error_upcall; - qp_attr.qp_context = info; + qp_attr.qp_context = sc; qp_attr.cap.max_send_wr = sp->send_credit_target; qp_attr.cap.max_recv_wr = sp->recv_credit_max; qp_attr.cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; From 2569536b0c69eb3c38deacf2b9ce82cd5f8b4633 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 11:10:57 +0200 Subject: [PATCH 3040/3206] smb: client: pass struct smbdirect_socket to smbd_create_id() This will make it easier to move the function to common code in the future.
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index dd1722443365eb..414127b3bf659f 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -719,10 +719,9 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } static struct rdma_cm_id *smbd_create_id( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct sockaddr *dstaddr, int port) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct rdma_cm_id *id; int rc; @@ -830,7 +829,7 @@ static int smbd_ia_open( WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED; - sc->rdma.cm_id = smbd_create_id(info, dstaddr, port); + sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port); if (IS_ERR(sc->rdma.cm_id)) { rc = PTR_ERR(sc->rdma.cm_id); goto out1; From 0a5dc5fc7c014d88c31aff327a65d610d8006454 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 11:34:05 +0200 Subject: [PATCH 3041/3206] smb: client: pass struct smbdirect_socket to smbd_ia_open() This will make it easier to move the function to common code in the future. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 414127b3bf659f..b9eb3ab52ac3f6 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -819,10 +819,9 @@ static bool frwr_is_supported(struct ib_device_attr *attrs) } static int smbd_ia_open( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct sockaddr *dstaddr, int port) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; int rc; @@ -1741,7 +1740,7 @@ static struct smbd_connection *_smbd_get_connection( sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000; - rc = smbd_ia_open(info, dstaddr, port); + rc = smbd_ia_open(sc, dstaddr, port); if (rc) { log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); goto create_id_failed; From c612e60de5aa0b1cb60ac6c9e7d96eb3c4528b13 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 11:52:24 +0200 Subject: [PATCH 3042/3206] smb: client: pass struct smbdirect_socket to smbd_post_send_negotiate_req() This will make it easier to move the function to common code in the future.
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index b9eb3ab52ac3f6..b4f257b55d99b6 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -873,9 +873,8 @@ static int smbd_ia_open( * After negotiation, the transport is connected and ready for * carrying upper layer SMB payload */ -static int smbd_post_send_negotiate_req(struct smbd_connection *info) +static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_send_wr send_wr; int rc = -ENOMEM; @@ -1285,7 +1284,7 @@ static int smbd_negotiate(struct smbd_connection *info) return rc; } - rc = smbd_post_send_negotiate_req(info); + rc = smbd_post_send_negotiate_req(sc); if (rc) return rc; From 2c6b999a90987a86285865569eac8a5a51ce18a1 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 11:54:30 +0200 Subject: [PATCH 3043/3206] smb: client: pass struct smbdirect_socket to smbd_negotiate() This will make it easier to move the function to common code in the future. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index b4f257b55d99b6..2304a5c6f42684 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1264,9 +1264,8 @@ static int smbd_post_recv( } /* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */ -static int smbd_negotiate(struct smbd_connection *info) +static int smbd_negotiate(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; int rc; struct smbdirect_recv_io *response = get_receive_buffer(sc); @@ -1879,7 +1878,7 @@ static struct smbd_connection *_smbd_get_connection( INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits); - rc = smbd_negotiate(info); + rc = smbd_negotiate(sc); if (rc) { log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc); goto negotiation_failed; From 4c4b1d1122fb49a207b86318b0940e09898d8e62 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 01:14:38 +0200 Subject: [PATCH 3044/3206] smb: client: pass struct smbdirect_socket to get_mr() This will make it easier to move the function to common code in the future. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 2304a5c6f42684..e453b998a85952 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -2345,9 +2345,8 @@ static int allocate_mr_list(struct smbdirect_socket *sc) * issuing I/O trying to get MR at the same time, mr_list_lock is used to * protect this situation.
*/ -static struct smbdirect_mr_io *get_mr(struct smbd_connection *info) +static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_mr_io *ret; int rc; again: @@ -2428,7 +2427,7 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, return NULL; } - smbdirect_mr = get_mr(info); + smbdirect_mr = get_mr(sc); if (!smbdirect_mr) { log_rdma_mr(ERR, "get_mr returning NULL\n"); return NULL; From 114347dad6e7955b25aa707d62936023e501d8ed Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 01:14:38 +0200 Subject: [PATCH 3045/3206] smb: client: remove unused struct smbd_connection argument of smbd_iter_to_mr() This will make it easier to move the function to common code in the future. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index e453b998a85952..dba3f461b5cf9b 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -2385,8 +2385,7 @@ static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc) /* * Transcribe the pages from an iterator into an MR scatterlist. */ -static int smbd_iter_to_mr(struct smbd_connection *info, - struct iov_iter *iter, +static int smbd_iter_to_mr(struct iov_iter *iter, struct sg_table *sgt, unsigned int max_sg) { @@ -2441,7 +2440,7 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n", num_pages, iov_iter_count(iter), sp->max_frmr_depth); - smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, sp->max_frmr_depth); + smbd_iter_to_mr(iter, &smbdirect_mr->sgt, sp->max_frmr_depth); rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, dir); From ffbfc73e84eb829f7e2f901594271557de860827 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 28 Aug 2025 10:39:56 +0200 Subject: [PATCH 3046/3206] smb: client: let smbd_disconnect_rdma_connection() set SMBDIRECT_SOCKET_ERROR... smbd_disconnect_rdma_connection() should turn the status into an error state instead of leaving it as-is until smbd_disconnect_rdma_work() is running.
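The transition rule is easiest to see in isolation. An illustrative reduction of the switch in the diff below (not the kernel enum, which has more states):

  enum sock_status {
          RESOLVE_ROUTE_RUNNING,
          RESOLVE_ROUTE_FAILED,
          CONNECTED,
          ERROR,
  };

  /* an in-progress state degrades to its matching *_FAILED state
   * right away; states that already carry an error are kept, so
   * the first failure wins */
  static enum sock_status on_failure(enum sock_status s)
  {
          switch (s) {
          case RESOLVE_ROUTE_RUNNING:
                  return RESOLVE_ROUTE_FAILED;
          case CONNECTED:
                  return ERROR;
          default:
                  return s;
          }
  }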
Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 40 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index dba3f461b5cf9b..45f137487ba1fc 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -217,6 +217,46 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) { + switch (sc->status) { + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_ERROR: + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + /* + * Keep the current error status + */ + break; + + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; + break; + + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; + break; + + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; + break; + + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_CONNECTED: + sc->status = SMBDIRECT_SOCKET_ERROR; + break; + } + queue_work(sc->workqueue, &sc->disconnect_work); } From 98a1cdca35eac55407c76067cdffd19aade0026c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 28 Aug 2025 12:12:09 +0200 Subject: [PATCH 3047/3206] smb: client: fill in smbdirect_socket.first_error on error For now we just use -ECONNABORTED, but it will get more detailed later. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 45f137487ba1fc..86d32bd5f11366 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -181,6 +181,9 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) disable_work(&sc->idle.immediate_work); disable_delayed_work(&sc->idle.timer_work); + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + switch (sc->status) { case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: @@ -217,6 +220,9 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) { + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + switch (sc->status) { case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: case SMBDIRECT_SOCKET_ERROR: case SMBDIRECT_SOCKET_DISCONNECTING: case SMBDIRECT_SOCKET_DISCONNECTED: case SMBDIRECT_SOCKET_DESTROYED: /* * Keep the current error status */ break; From 0cb7ed37af614bbda39dab3d13f0639458d77cae Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 28 Aug 2025 12:15:11 +0200 Subject: [PATCH 3048/3206] smb: client: let smbd_disconnect_rdma_connection() disable all work but disconnect_work There's no point in running these if we already know the connection is broken.
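The distinction between the plain and the *_sync() disable variants matters here, because smbd_disconnect_rdma_connection() can be reached from completion handlers that must not sleep. A toy userspace analogue of the semantics (assuming the workqueue API behaves as its kernel-doc describes):

  #include <stdbool.h>
  #include <stdio.h>

  struct work { bool disabled; bool queued; };

  /* disable_work(): mark the item disabled, never wait */
  static void disable_work(struct work *w)
  {
          w->disabled = true;
  }

  /* queue_work(): becomes a no-op once the item is disabled */
  static bool queue_work(struct work *w)
  {
          if (w->disabled)
                  return false;
          w->queued = true;
          return true;
  }

  int main(void)
  {
          struct work refill = { false, false };

          disable_work(&refill);
          printf("queued=%d\n", queue_work(&refill)); /* prints 0 */
          return 0;
  }

The blocking disable_work_sync() variants, which also wait for a running instance to finish, remain reserved for teardown paths such as smbd_destroy() that are allowed to sleep.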
Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 86d32bd5f11366..155d69745adf19 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -220,6 +220,16 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) { + /* + * make sure other work (than disconnect_work) is + * not queued again but here we don't block and avoid + * disable[_delayed]_work_sync() + */ + disable_work(&sc->recv_io.posted.refill_work); + disable_work(&sc->mr_io.recovery_work); + disable_work(&sc->idle.immediate_work); + disable_delayed_work(&sc->idle.timer_work); + if (sc->first_error == 0) sc->first_error = -ECONNABORTED; From 1b128ec1c76dcf08b752b5f5268da813fea58a01 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 2 Sep 2025 12:17:56 +0200 Subject: [PATCH 3049/3206] smb: client: let smbd_{destroy,disconnect_rdma_{work,connection}}() wake up all wait queues This is important in order to let all waiters notice a broken connection. We also go via smbd_disconnect_rdma_{work,connection}() for broken connections. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 52 ++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 155d69745adf19..6d9c9e68c76505 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -165,6 +165,21 @@ do { \ #define log_rdma_mr(level, fmt, args...) \ log_rdma(level, LOG_RDMA_MR, fmt, ##args) +static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc) +{ + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + */ + wake_up_all(&sc->status_wait); + wake_up_all(&sc->send_io.credits.wait_queue); + wake_up_all(&sc->send_io.pending.dec_wait_queue); + wake_up_all(&sc->send_io.pending.zero_wait_queue); + wake_up_all(&sc->recv_io.reassembly.wait_queue); + wake_up_all(&sc->mr_io.ready.wait_queue); + wake_up_all(&sc->mr_io.cleanup.wait_queue); +} + static void smbd_disconnect_rdma_work(struct work_struct *work) { struct smbdirect_socket *sc = @@ -216,6 +231,12 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) case SMBDIRECT_SOCKET_DESTROYED: break; } + + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + */ + smbd_disconnect_wake_up_all(sc); } static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) @@ -273,6 +294,12 @@ static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) break; } + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
+ */ + smbd_disconnect_wake_up_all(sc); + queue_work(sc->workqueue, &sc->disconnect_work); } @@ -306,14 +333,14 @@ static int smbd_conn_upcall( log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; - wake_up_all(&sc->status_wait); + smbd_disconnect_rdma_work(&sc->disconnect_work); break; case RDMA_CM_EVENT_ROUTE_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; - wake_up_all(&sc->status_wait); + smbd_disconnect_rdma_work(&sc->disconnect_work); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -408,7 +435,7 @@ static int smbd_conn_upcall( log_rdma_event(ERR, "connecting failed event=%s\n", event_name); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; - wake_up_all(&sc->status_wait); + smbd_disconnect_rdma_work(&sc->disconnect_work); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: @@ -416,17 +443,10 @@ static int smbd_conn_upcall( /* This happens when we fail the negotiation */ if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) { log_rdma_event(ERR, "event=%s during negotiation\n", event_name); - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - smbd_disconnect_rdma_work(&sc->disconnect_work); - wake_up_all(&sc->status_wait); - break; } sc->status = SMBDIRECT_SOCKET_DISCONNECTED; smbd_disconnect_rdma_work(&sc->disconnect_work); - wake_up_all(&sc->status_wait); - wake_up_all(&sc->recv_io.reassembly.wait_queue); - wake_up_all(&sc->send_io.credits.wait_queue); break; default: @@ -674,7 +694,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING); if (!negotiate_done) { sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; - wake_up_all(&sc->status_wait); + smbd_disconnect_rdma_connection(sc); } else { sc->status = SMBDIRECT_SOCKET_CONNECTED; wake_up(&sc->status_wait); @@ -1569,6 +1589,15 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + * + * Most likely this was already called via + * smbd_disconnect_rdma_work(), but call it again... + */ + smbd_disconnect_wake_up_all(sc); + log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n"); disable_work_sync(&sc->recv_io.posted.refill_work); @@ -1609,7 +1638,6 @@ void smbd_destroy(struct TCP_Server_Info *server) * path when sending data, and then release memory registrations. */ log_rdma_event(INFO, "freeing mr list\n"); - wake_up_all(&sc->mr_io.ready.wait_queue); while (atomic_read(&sc->mr_io.used.count)) { cifs_server_unlock(server); msleep(1000); From a437c9cebcc3c4193fe0143e6fbc72c6b7529ad2 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 27 Aug 2025 15:34:09 +0200 Subject: [PATCH 3050/3206] smb: client: make consistent use of spin_lock_irq{save,restore}() in smbdirect.c There is a mix of using spin_lock(), spin_lock_irq() and spin_lock_irqsave(), and it seems that at least enqueue_reassembly() was wrong to use plain spin_lock(), as it's called via recv_done() from a SOFTIRQ because we're using IB_POLL_SOFTIRQ. And Documentation/kernel-hacking/locking.rst section "Cheat Sheet For Locking" says: - Otherwise (== data can be touched in an interrupt), use spin_lock_irqsave() and spin_unlock_irqrestore().
So in order to keep it simple and safe, we use that version now. It will help when merging functions into common code and gives consistent locking in all cases. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 26 +++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 6d9c9e68c76505..8e10a43c8cb135 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1397,7 +1397,9 @@ static void enqueue_reassembly( struct smbdirect_recv_io *response, int data_length) { - spin_lock(&sc->recv_io.reassembly.lock); + unsigned long flags; + + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); list_add_tail(&response->list, &sc->recv_io.reassembly.list); sc->recv_io.reassembly.queue_length++; /* @@ -1408,7 +1410,7 @@ static void enqueue_reassembly( */ virt_wmb(); sc->recv_io.reassembly.data_length += data_length; - spin_unlock(&sc->recv_io.reassembly.lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); sc->statistics.enqueue_reassembly_queue++; } @@ -2076,6 +2078,7 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) if (sc->recv_io.reassembly.data_length >= size) { int queue_length; int queue_removed = 0; + unsigned long flags; /* * Need to make sure reassembly_data_length is read before @@ -2135,11 +2138,11 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) if (queue_length) list_del(&response->list); else { - spin_lock_irq( - &sc->recv_io.reassembly.lock); + spin_lock_irqsave( + &sc->recv_io.reassembly.lock, flags); list_del(&response->list); - spin_unlock_irq( - &sc->recv_io.reassembly.lock); + spin_unlock_irqrestore( + &sc->recv_io.reassembly.lock, flags); } queue_removed++; sc->statistics.dequeue_reassembly_queue++; @@ -2157,10 +2160,10 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) to_read, data_read, offset); } - spin_lock_irq(&sc->recv_io.reassembly.lock); + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.data_length -= data_read; sc->recv_io.reassembly.queue_length -= queue_removed; - spin_unlock_irq(&sc->recv_io.reassembly.lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.first_entry_offset = offset; log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", @@ -2432,6 +2435,7 @@ static int allocate_mr_list(struct smbdirect_socket *sc) static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc) { struct smbdirect_mr_io *ret; + unsigned long flags; int rc; again: rc = wait_event_interruptible(sc->mr_io.ready.wait_queue, @@ -2447,18 +2451,18 @@ static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc) return NULL; } - spin_lock(&sc->mr_io.all.lock); + spin_lock_irqsave(&sc->mr_io.all.lock, flags); list_for_each_entry(ret, &sc->mr_io.all.list, list) { if (ret->state == SMBDIRECT_MR_READY) { ret->state = SMBDIRECT_MR_REGISTERED; - spin_unlock(&sc->mr_io.all.lock); + spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); atomic_dec(&sc->mr_io.ready.count); atomic_inc(&sc->mr_io.used.count); return ret; } } - spin_unlock(&sc->mr_io.all.lock); + spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); /* * It is possible that we could fail to get MR because other processes may * try to acquire a MR at the same time.
If this is the case, retry it. From 17a6bc60f47b344b831da8cf66adef5c2ea3d9c1 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 26 Aug 2025 15:54:10 +0200 Subject: [PATCH 3051/3206] smb: client: allocate smbdirect workqueue at the beginning of _smbd_get_connection() This will simplify further changes when moving to common code. Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 8e10a43c8cb135..788b378fa1f496 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1703,10 +1703,9 @@ int smbd_reconnect(struct TCP_Server_Info *server) return -ENOENT; } -static void destroy_caches_and_workqueue(struct smbdirect_socket *sc) +static void destroy_caches(struct smbdirect_socket *sc) { destroy_receive_buffers(sc); - destroy_workqueue(sc->workqueue); mempool_destroy(sc->recv_io.mem.pool); kmem_cache_destroy(sc->recv_io.mem.cache); mempool_destroy(sc->send_io.mem.pool); @@ -1714,7 +1713,7 @@ static void destroy_caches_and_workqueue(struct smbdirect_socket *sc) } #define MAX_NAME_LEN 80 -static int allocate_caches_and_workqueue(struct smbdirect_socket *sc) +static int allocate_caches(struct smbdirect_socket *sc) { struct smbdirect_socket_parameters *sp = &sc->parameters; char name[MAX_NAME_LEN]; @@ -1760,21 +1759,14 @@ static int allocate_caches_and_workqueue(struct smbdirect_socket *sc) if (!sc->recv_io.mem.pool) goto out3; - scnprintf(name, MAX_NAME_LEN, "smbd_%p", sc); - sc->workqueue = create_workqueue(name); - if (!sc->workqueue) - goto out4; - rc = allocate_receive_buffers(sc, sp->recv_credit_max); if (rc) { log_rdma_event(ERR, "failed to allocate receive buffers\n"); - goto out5; + goto out4; } return 0; -out5: - destroy_workqueue(sc->workqueue); out4: mempool_destroy(sc->recv_io.mem.pool); out3: @@ -1799,12 +1791,19 @@ static struct smbd_connection *_smbd_get_connection( struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; struct ib_port_immutable port_immutable; __be32 ird_ord_hdr[2]; + char wq_name[80]; + struct workqueue_struct *workqueue; info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL); if (!info) return NULL; sc = &info->socket; + scnprintf(wq_name, ARRAY_SIZE(wq_name), "smbd_%p", sc); + workqueue = create_workqueue(wq_name); + if (!workqueue) + goto create_wq_failed; smbdirect_socket_init(sc); + sc->workqueue = workqueue; sp = &sc->parameters; INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work); @@ -1946,7 +1945,7 @@ static struct smbd_connection *_smbd_get_connection( log_rdma_event(INFO, "rdma_connect connected\n"); - rc = allocate_caches_and_workqueue(sc); + rc = allocate_caches(sc); if (rc) { log_rdma_event(ERR, "cache allocation failed\n"); goto allocate_cache_failed; @@ -1986,7 +1985,7 @@ static struct smbd_connection *_smbd_get_connection( negotiation_failed: disable_delayed_work_sync(&sc->idle.timer_work); - destroy_caches_and_workqueue(sc); + destroy_caches(sc); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; rdma_disconnect(sc->rdma.cm_id); wait_event(sc->status_wait, @@ -2008,6 +2007,8 @@ static struct smbd_connection *_smbd_get_connection( rdma_destroy_id(sc->rdma.cm_id); create_id_failed: + destroy_workqueue(sc->workqueue); +create_wq_failed: kfree(info); return NULL; } From 
c4709e61ef253eed4257328b614e7c57fed259ec Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 15 Sep 2025 23:32:15 +0200 Subject: [PATCH 3052/3206] smb: client: defer calling ib_alloc_pd() until we are connected The protection domain is not needed until we're connected. This makes further changes easier to follow... Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 788b378fa1f496..1d81ead875d0aa 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -926,13 +926,6 @@ static int smbd_ia_open( if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) sc->mr_io.type = IB_MR_TYPE_SG_GAPS; - sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); - if (IS_ERR(sc->ib.pd)) { - rc = PTR_ERR(sc->ib.pd); - log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); - goto out2; - } - return 0; out2: @@ -1858,6 +1851,14 @@ static struct smbd_connection *_smbd_get_connection( goto config_failed; } + sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); + if (IS_ERR(sc->ib.pd)) { + rc = PTR_ERR(sc->ib.pd); + sc->ib.pd = NULL; + log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); + goto alloc_pd_failed; + } + sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, sp->send_credit_target, IB_POLL_SOFTIRQ); @@ -2002,8 +2003,10 @@ static struct smbd_connection *_smbd_get_connection( if (sc->ib.recv_cq) ib_free_cq(sc->ib.recv_cq); -config_failed: ib_dealloc_pd(sc->ib.pd); + +alloc_pd_failed: +config_failed: rdma_destroy_id(sc->rdma.cm_id); create_id_failed: From 2dad0b15bbe86ffa6d2f234c40609aa59894e9ec Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 15 Sep 2025 00:55:31 +0200 Subject: [PATCH 3053/3206] smb: client: let smbd_post_send_iter() call ib_dma_map_single() for the header first This will simplify further changes; the important part is that request->num_sge >= 1 is only set if request->sge[0].* is valid. Note that ib_dma_sync_single_for_device() is called in smbd_post_send() for each sge, so the device will still see the packet header even if it's modified after calling ib_dma_map_single().
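Condensed, the ordering this patch establishes looks like this (taken from the hunk in the diff that follows):

  /* map the header first ... */
  request->sge[0].addr = ib_dma_map_single(sc->ib.dev, (void *)packet,
                                           header_length, DMA_TO_DEVICE);
  if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
          rc = -EIO;
          goto err_dma;
  }
  request->sge[0].length = header_length;
  request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
  /* ... and only then publish it: num_sge >= 1 now implies that
   * sge[0] holds a valid mapping, which the err_dma unwind path
   * can rely on */
  request->num_sge = 1;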
Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 43 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 1d81ead875d0aa..316f398c70f4b5 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1158,10 +1158,30 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc, request->socket = sc; memset(request->sge, 0, sizeof(request->sge)); + /* Map the packet to DMA */ + header_length = sizeof(struct smbdirect_data_transfer); + /* If this is a packet without payload, don't send padding */ + if (!iter) + header_length = offsetof(struct smbdirect_data_transfer, padding); + + packet = smbdirect_send_io_payload(request); + request->sge[0].addr = ib_dma_map_single(sc->ib.dev, + (void *)packet, + header_length, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) { + rc = -EIO; + goto err_dma; + } + + request->sge[0].length = header_length; + request->sge[0].lkey = sc->ib.pd->local_dma_lkey; + request->num_sge = 1; + /* Fill in the data payload to find out how much data we can add */ if (iter) { struct smb_extract_to_rdma extract = { - .nr_sge = 1, + .nr_sge = request->num_sge, .max_sge = SMBDIRECT_SEND_IO_MAX_SGE, .sge = request->sge, .device = sc->ib.dev, @@ -1180,11 +1200,9 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc, *_remaining_data_length -= data_length; } else { data_length = 0; - request->num_sge = 1; } /* Fill in the packet header */ - packet = smbdirect_send_io_payload(request); packet->credits_requested = cpu_to_le16(sp->send_credit_target); new_credits = manage_credits_prior_sending(sc); @@ -1211,25 +1229,6 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc, le32_to_cpu(packet->data_length), le32_to_cpu(packet->remaining_data_length)); - /* Map the packet to DMA */ - header_length = sizeof(struct smbdirect_data_transfer); - /* If this is a packet without payload, don't send padding */ - if (!data_length) - header_length = offsetof(struct smbdirect_data_transfer, padding); - - request->sge[0].addr = ib_dma_map_single(sc->ib.dev, - (void *)packet, - header_length, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) { - rc = -EIO; - request->sge[0].addr = 0; - goto err_dma; - } - - request->sge[0].length = header_length; - request->sge[0].lkey = sc->ib.pd->local_dma_lkey; - rc = smbd_post_send(sc, request); if (!rc) return 0; From fad988a2158d743da7971884b93482a73735b25e Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 20 Aug 2025 15:34:58 +0200 Subject: [PATCH 3054/3206] smb: server: fix IRD/ORD negotiation with the client Do the real negotiation already in smb_direct_handle_connect_request(), where we see the requested initiator_depth and responder_resources from the client. We should detect legacy iWarp clients that use MPA v1 with the custom IRD/ORD negotiation. We need to send the custom IRD/ORD in big endian, but we also try to let clients that send broken little-endian requests (older cifs.ko) still work. Note: the reason this uses u8 for initiator_depth and responder_resources is that the rdma layer also uses u8.
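The little-endian tolerance relies on the clamping: a legacy client that wrongly encodes IRD/ORD = 1 in little endian is read back as 0x01000000, which min_t(u32, ..., U8_MAX) reduces to 255, and the later minimum against the server's own limits still yields a usable value. A small endianness illustration (userspace sketch; a little-endian host is assumed):

  #include <stdint.h>
  #include <stdio.h>
  #include <arpa/inet.h>  /* ntohl() behaves like be32_to_cpu() on LE hosts */

  int main(void)
  {
          uint32_t wire = 1;            /* host-endian 1: bytes 01 00 00 00 */
          uint32_t ird = ntohl(wire);   /* decoded as be32: 0x01000000 */
          uint32_t clamped = ird > UINT8_MAX ? UINT8_MAX : ird;

          printf("ird=0x%08x clamped=%u\n", ird, clamped);
          return 0;
  }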
Acked-by: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: linux-rdma@vger.kernel.org Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 99 +++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 14 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 74dfb6496095db..e1f659d3b4cf57 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -153,6 +153,10 @@ struct smb_direct_transport { struct work_struct disconnect_work; bool negotiation_requested; + + bool legacy_iwarp; + u8 initiator_depth; + u8 responder_resources; }; #define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport)) @@ -347,6 +351,9 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) t->cm_id = cm_id; cm_id->context = t; + t->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH; + t->responder_resources = 1; + t->status = SMB_DIRECT_CS_NEW; init_waitqueue_head(&t->wait_status); @@ -1676,21 +1683,21 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, static int smb_direct_accept_client(struct smb_direct_transport *t) { struct rdma_conn_param conn_param; - struct ib_port_immutable port_immutable; - u32 ird_ord_hdr[2]; + __be32 ird_ord_hdr[2]; int ret; + /* + * smb_direct_handle_connect_request() + * already negotiated t->initiator_depth + * and t->responder_resources + */ memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = min_t(u8, t->cm_id->device->attrs.max_qp_rd_atom, - SMB_DIRECT_CM_INITIATOR_DEPTH); - conn_param.responder_resources = 0; - - t->cm_id->device->ops.get_port_immutable(t->cm_id->device, - t->cm_id->port_num, - &port_immutable); - if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { - ird_ord_hdr[0] = conn_param.responder_resources; - ird_ord_hdr[1] = 1; + conn_param.initiator_depth = t->initiator_depth; + conn_param.responder_resources = t->responder_resources; + + if (t->legacy_iwarp) { + ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); + ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); conn_param.private_data = ird_ord_hdr; conn_param.private_data_len = sizeof(ird_ord_hdr); } else { @@ -2081,10 +2088,13 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) return true; } -static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) +static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, + struct rdma_cm_event *event) { struct smb_direct_transport *t; struct task_struct *handler; + u8 peer_initiator_depth; + u8 peer_responder_resources; int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { @@ -2098,6 +2108,67 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) if (!t) return -ENOMEM; + peer_initiator_depth = event->param.conn.initiator_depth; + peer_responder_resources = event->param.conn.responder_resources; + if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) && + event->param.conn.private_data_len == 8) { + /* + * Legacy clients with only iWarp MPA v1 support + * need a private blob in order to negotiate + * the IRD/ORD values. 
+ */ + const __be32 *ird_ord_hdr = event->param.conn.private_data; + u32 ird32 = be32_to_cpu(ird_ord_hdr[0]); + u32 ord32 = be32_to_cpu(ird_ord_hdr[1]); + + /* + * cifs.ko sends the legacy IRD/ORD negotiation + * even if iWarp MPA v2 was used. + * + * Here we check that the values match and only + * mark the client as legacy if they don't match. + */ + if ((u32)event->param.conn.initiator_depth != ird32 || + (u32)event->param.conn.responder_resources != ord32) { + /* + * There are broken clients (old cifs.ko) + * using little endian and also + * struct rdma_conn_param only uses u8 + * for initiator_depth and responder_resources, + * so we truncate the value to U8_MAX. + * + * smb_direct_accept_client() will then + * do the real negotiation in order to + * select the minimum between client and + * server. + */ + ird32 = min_t(u32, ird32, U8_MAX); + ord32 = min_t(u32, ord32, U8_MAX); + + t->legacy_iwarp = true; + peer_initiator_depth = (u8)ird32; + peer_responder_resources = (u8)ord32; + } + } + + /* + * First set what we as the server are able to support + */ + t->initiator_depth = min_t(u8, t->initiator_depth, + new_cm_id->device->attrs.max_qp_rd_atom); + + /* + * negotiate the value by using the minimum + * between client and server if the client provided + * non-zero values. + */ + if (peer_initiator_depth != 0) + t->initiator_depth = min_t(u8, t->initiator_depth, + peer_initiator_depth); + if (peer_responder_resources != 0) + t->responder_resources = min_t(u8, t->responder_resources, + peer_responder_resources); + ret = smb_direct_connect(t); if (ret) goto out_err; @@ -2122,7 +2193,7 @@ static int smb_direct_listen_handler(struct rdma_cm_id *cm_id, { switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: { - int ret = smb_direct_handle_connect_request(cm_id); + int ret = smb_direct_handle_connect_request(cm_id, event); if (ret) { pr_err("Can't create transport: %d\n", ret); From 575c1af4dc64434a42c4b00e5b7fd298e8c68c3c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Sat, 16 Aug 2025 10:24:30 +0900 Subject: [PATCH 3055/3206] smb: server: make use of common smbdirect_pdu.h Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: Hyunchul Lee Cc: Meetakshi Setiya Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 47 +++++++++++++++++----------------- fs/smb/server/transport_rdma.h | 41 ----------------------------- 2 files changed, 24 insertions(+), 64 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index e1f659d3b4cf57..9401ba09310a15 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -23,12 +23,13 @@ #include "connection.h" #include "smb_common.h" #include "../common/smb2status.h" +#include "../common/smbdirect/smbdirect_pdu.h" #include "transport_rdma.h" #define SMB_DIRECT_PORT_IWARP 5445 #define SMB_DIRECT_PORT_INFINIBAND 445 -#define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100) +#define SMB_DIRECT_VERSION_LE cpu_to_le16(SMBDIRECT_V1) /* SMB_DIRECT negotiation timeout in seconds */ #define SMB_DIRECT_NEGOTIATE_TIMEOUT 120 @@ -479,8 +480,8 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) { switch (recvmsg->type) { case SMB_DIRECT_MSG_DATA_TRANSFER: { - struct smb_direct_data_transfer *req = - (struct smb_direct_data_transfer *)recvmsg->packet; + struct smbdirect_data_transfer *req = + (struct smbdirect_data_transfer *)recvmsg->packet;
struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet + le32_to_cpu(req->data_offset)); ksmbd_debug(RDMA, @@ -492,8 +493,8 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) break; } case SMB_DIRECT_MSG_NEGOTIATE_REQ: { - struct smb_direct_negotiate_req *req = - (struct smb_direct_negotiate_req *)recvmsg->packet; + struct smbdirect_negotiate_req *req = + (struct smbdirect_negotiate_req *)recvmsg->packet; ksmbd_debug(RDMA, "MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n", le16_to_cpu(req->min_version), @@ -547,7 +548,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) switch (recvmsg->type) { case SMB_DIRECT_MSG_NEGOTIATE_REQ: - if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) { + if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) { put_recvmsg(t, recvmsg); smb_direct_disconnect_rdma_connection(t); return; @@ -559,13 +560,13 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) wake_up_interruptible(&t->wait_status); return; case SMB_DIRECT_MSG_DATA_TRANSFER: { - struct smb_direct_data_transfer *data_transfer = - (struct smb_direct_data_transfer *)recvmsg->packet; + struct smbdirect_data_transfer *data_transfer = + (struct smbdirect_data_transfer *)recvmsg->packet; u32 remaining_data_length, data_offset, data_length; int avail_recvmsg_count, receive_credits; if (wc->byte_len < - offsetof(struct smb_direct_data_transfer, padding)) { + offsetof(struct smbdirect_data_transfer, padding)) { put_recvmsg(t, recvmsg); smb_direct_disconnect_rdma_connection(t); return; @@ -615,7 +616,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) &t->send_credits); if (le16_to_cpu(data_transfer->flags) & - SMB_DIRECT_RESPONSE_REQUESTED) + SMBDIRECT_FLAG_RESPONSE_REQUESTED) queue_work(smb_direct_wq, &t->send_immediate_work); if (atomic_read(&t->send_credits) > 0) @@ -680,7 +681,7 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, unsigned int size, int unused) { struct smb_direct_recvmsg *recvmsg; - struct smb_direct_data_transfer *data_transfer; + struct smbdirect_data_transfer *data_transfer; int to_copy, to_read, data_read, offset; u32 data_length, remaining_data_length, data_offset; int rc; @@ -1016,7 +1017,7 @@ static int smb_direct_create_header(struct smb_direct_transport *t, struct smb_direct_sendmsg **sendmsg_out) { struct smb_direct_sendmsg *sendmsg; - struct smb_direct_data_transfer *packet; + struct smbdirect_data_transfer *packet; int header_length; int ret; @@ -1025,7 +1026,7 @@ static int smb_direct_create_header(struct smb_direct_transport *t, return PTR_ERR(sendmsg); /* Fill in the packet header */ - packet = (struct smb_direct_data_transfer *)sendmsg->packet; + packet = (struct smbdirect_data_transfer *)sendmsg->packet; packet->credits_requested = cpu_to_le16(t->send_credit_target); packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); @@ -1048,11 +1049,11 @@ static int smb_direct_create_header(struct smb_direct_transport *t, le32_to_cpu(packet->remaining_data_length)); /* Map the packet to DMA */ - header_length = sizeof(struct smb_direct_data_transfer); + header_length = sizeof(struct smbdirect_data_transfer); /* If this is a packet without payload, don't send padding */ if (!size) header_length = - offsetof(struct smb_direct_data_transfer, padding); + offsetof(struct smbdirect_data_transfer, padding); sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device, (void *)packet, @@ -1228,7 +1229,7 @@ static int 
smb_direct_writev(struct ksmbd_transport *t, size_t iov_idx; size_t iov_ofs; size_t max_iov_size = st->max_send_size - - sizeof(struct smb_direct_data_transfer); + sizeof(struct smbdirect_data_transfer); int ret; struct smb_direct_send_ctx send_ctx; int error = 0; @@ -1627,18 +1628,18 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, int failed) { struct smb_direct_sendmsg *sendmsg; - struct smb_direct_negotiate_resp *resp; + struct smbdirect_negotiate_resp *resp; int ret; sendmsg = smb_direct_alloc_sendmsg(t); if (IS_ERR(sendmsg)) return -ENOMEM; - resp = (struct smb_direct_negotiate_resp *)sendmsg->packet; + resp = (struct smbdirect_negotiate_resp *)sendmsg->packet; if (failed) { memset(resp, 0, sizeof(*resp)); - resp->min_version = cpu_to_le16(0x0100); - resp->max_version = cpu_to_le16(0x0100); + resp->min_version = SMB_DIRECT_VERSION_LE; + resp->max_version = SMB_DIRECT_VERSION_LE; resp->status = STATUS_NOT_SUPPORTED; } else { resp->status = STATUS_SUCCESS; @@ -1875,7 +1876,7 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) snprintf(name, sizeof(name), "smb_direct_rqst_pool_%p", t); t->sendmsg_cache = kmem_cache_create(name, sizeof(struct smb_direct_sendmsg) + - sizeof(struct smb_direct_negotiate_resp), + sizeof(struct smbdirect_negotiate_resp), 0, SLAB_HWCACHE_ALIGN, NULL); if (!t->sendmsg_cache) return -ENOMEM; @@ -2008,7 +2009,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); struct smb_direct_recvmsg *recvmsg; - struct smb_direct_negotiate_req *req; + struct smbdirect_negotiate_req *req; int ret; ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); @@ -2027,7 +2028,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) if (ret == -ECONNABORTED) goto out; - req = (struct smb_direct_negotiate_req *)recvmsg->packet; + req = (struct smbdirect_negotiate_req *)recvmsg->packet; st->max_recv_size = min_t(int, st->max_recv_size, le32_to_cpu(req->preferred_send_size)); st->max_send_size = min_t(int, st->max_send_size, diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h index a2291b77488a15..63eab9f8f13dff 100644 --- a/fs/smb/server/transport_rdma.h +++ b/fs/smb/server/transport_rdma.h @@ -11,47 +11,6 @@ #define SMBD_MIN_IOSIZE (512 * 1024) #define SMBD_MAX_IOSIZE (16 * 1024 * 1024) -/* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ -struct smb_direct_negotiate_req { - __le16 min_version; - __le16 max_version; - __le16 reserved; - __le16 credits_requested; - __le32 preferred_send_size; - __le32 max_receive_size; - __le32 max_fragmented_size; -} __packed; - -/* SMB DIRECT negotiation response packet [MS-SMBD] 2.2.2 */ -struct smb_direct_negotiate_resp { - __le16 min_version; - __le16 max_version; - __le16 negotiated_version; - __le16 reserved; - __le16 credits_requested; - __le16 credits_granted; - __le32 status; - __le32 max_readwrite_size; - __le32 preferred_send_size; - __le32 max_receive_size; - __le32 max_fragmented_size; -} __packed; - -#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001 - -/* SMB DIRECT data transfer packet with payload [MS-SMBD] 2.2.3 */ -struct smb_direct_data_transfer { - __le16 credits_requested; - __le16 credits_granted; - __le16 flags; - __le16 reserved; - __le32 remaining_data_length; - __le32 data_offset; - __le32 data_length; - __le32 padding; - __u8 buffer[]; -} __packed; - #ifdef CONFIG_SMB_SERVER_SMBDIRECT int ksmbd_rdma_init(void); void ksmbd_rdma_stop_listening(void); From 
e7de2c4521ce86eb23de133cdf48cb460cff5a91 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 6 Aug 2025 19:35:55 +0200 Subject: [PATCH 3056/3206] smb: server: make use of common smbdirect.h Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: Hyunchul Lee Cc: Meetakshi Setiya Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 4 ++-- fs/smb/server/connection.h | 10 ++++++---- fs/smb/server/smb2pdu.c | 11 ++++++----- fs/smb/server/smb2pdu.h | 6 ------ fs/smb/server/transport_rdma.c | 7 ++++--- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 67c4f73398dfee..91a9344111348a 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -243,7 +243,7 @@ int ksmbd_conn_write(struct ksmbd_work *work) int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { int ret = -EINVAL; @@ -257,7 +257,7 @@ int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { int ret = -EINVAL; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 2aa8084bb59302..07b43634262a19 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -19,6 +19,8 @@ #include "smb_common.h" #include "ksmbd_work.h" +struct smbdirect_buffer_descriptor_v1; + #define KSMBD_SOCKET_BACKLOG 16 enum { @@ -133,11 +135,11 @@ struct ksmbd_transport_ops { unsigned int remote_key); int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); int (*rdma_write)(struct ksmbd_transport *t, void *buf, unsigned int len, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); void (*free_transport)(struct ksmbd_transport *kt); }; @@ -163,11 +165,11 @@ bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); int ksmbd_conn_write(struct ksmbd_work *work); int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); void ksmbd_conn_enqueue_request(struct ksmbd_work *work); void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index a565fc36cee6df..00e644935f2864 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -23,6 +23,7 @@ #include "asn1.h" #include "connection.h" #include "transport_ipc.h" +#include "../common/smbdirect/smbdirect.h" #include "transport_rdma.h" #include "vfs.h" #include "vfs_cache.h" @@ -6665,7 +6666,7 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) } static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, __le32 Channel, __le16 ChannelInfoLength) { @@ -6701,7 +6702,7 @@ static ssize_t 
smb2_read_rdma_channel(struct ksmbd_work *work, int err; err = ksmbd_conn_rdma_write(work->conn, data_buf, length, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)), le16_to_cpu(req->ReadChannelInfoLength)); if (err) @@ -6772,7 +6773,7 @@ int smb2_read(struct ksmbd_work *work) goto out; } err = smb2_set_remote_key_for_rdma(work, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + ch_offset), req->Channel, req->ReadChannelInfoLength); @@ -6967,7 +6968,7 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, return -ENOMEM; ret = ksmbd_conn_rdma_read(work->conn, data_buf, length, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)), le16_to_cpu(req->WriteChannelInfoLength)); if (ret < 0) { @@ -7032,7 +7033,7 @@ int smb2_write(struct ksmbd_work *work) goto out; } err = smb2_set_remote_key_for_rdma(work, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + ch_offset), req->Channel, req->WriteChannelInfoLength); diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 16ae8a10490beb..5163d5241b90d5 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -136,12 +136,6 @@ struct create_posix_rsp { u8 SidBuffer[44]; } __packed; -struct smb2_buffer_desc_v1 { - __le64 offset; - __le32 token; - __le32 length; -} __packed; - #define SMB2_0_IOCTL_IS_FSCTL 0x00000001 struct smb_sockaddr_in { diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 9401ba09310a15..72ae3c9714a6b1 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -23,6 +23,7 @@ #include "connection.h" #include "smb_common.h" #include "../common/smb2status.h" +#include "../common/smbdirect/smbdirect.h" #include "../common/smbdirect/smbdirect_pdu.h" #include "transport_rdma.h" @@ -1402,7 +1403,7 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc) static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf, int buf_len, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len, bool is_read) { @@ -1532,7 +1533,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, @@ -1541,7 +1542,7 @@ static int smb_direct_rdma_write(struct ksmbd_transport *t, static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, From 4c564f03e23b1ea55b77679192a4cb07f5b3e7f3 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 6 Aug 2025 19:35:56 +0200 Subject: [PATCH 3057/3206] smb: server: make use of common smbdirect_socket This is the next step in the direction of a common smbdirect layer. Currently only structures are shared, but that will change over time until everything is shared. 
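As a rough illustration of the pattern this series builds on (not part of the diff below; the helper name smb_direct_from_socket() is invented here), the server transport embeds the shared struct smbdirect_socket, common code operates on the socket, and ksmbd-only code recovers its wrapper with container_of(), the same way recv_done() does once the conversion is complete:

	struct smb_direct_transport {
		struct ksmbd_transport transport;	/* ksmbd-specific state */
		struct smbdirect_socket socket;		/* state shared with the client code */
		/* ... more server-only fields ... */
	};

	static struct smb_direct_transport *
	smb_direct_from_socket(struct smbdirect_socket *sc)
	{
		return container_of(sc, struct smb_direct_transport, socket);
	}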
Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: Hyunchul Lee Cc: Meetakshi Setiya Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 241 ++++++++++++++++++--------------- 1 file changed, 133 insertions(+), 108 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 72ae3c9714a6b1..e2d8ac08734442 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -25,6 +25,7 @@ #include "../common/smb2status.h" #include "../common/smbdirect/smbdirect.h" #include "../common/smbdirect/smbdirect_pdu.h" +#include "../common/smbdirect/smbdirect_socket.h" #include "transport_rdma.h" #define SMB_DIRECT_PORT_IWARP 5445 @@ -89,26 +90,14 @@ static struct smb_direct_listener { static struct workqueue_struct *smb_direct_wq; -enum smb_direct_status { - SMB_DIRECT_CS_NEW = 0, - SMB_DIRECT_CS_CONNECTED, - SMB_DIRECT_CS_DISCONNECTING, - SMB_DIRECT_CS_DISCONNECTED, -}; - struct smb_direct_transport { struct ksmbd_transport transport; - enum smb_direct_status status; + struct smbdirect_socket socket; + bool full_packet_received; wait_queue_head_t wait_status; - struct rdma_cm_id *cm_id; - struct ib_cq *send_cq; - struct ib_cq *recv_cq; - struct ib_pd *pd; - struct ib_qp *qp; - int max_send_size; int max_recv_size; int max_fragmented_send_size; @@ -271,8 +260,10 @@ smb_direct_recvmsg *get_free_recvmsg(struct smb_direct_transport *t) static void put_recvmsg(struct smb_direct_transport *t, struct smb_direct_recvmsg *recvmsg) { + struct smbdirect_socket *sc = &t->socket; + if (likely(recvmsg->sge.length != 0)) { - ib_dma_unmap_single(t->cm_id->device, + ib_dma_unmap_single(sc->ib.dev, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); @@ -316,17 +307,20 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) struct smb_direct_transport *t = container_of(work, struct smb_direct_transport, disconnect_work); + struct smbdirect_socket *sc = &t->socket; - if (t->status == SMB_DIRECT_CS_CONNECTED) { - t->status = SMB_DIRECT_CS_DISCONNECTING; - rdma_disconnect(t->cm_id); + if (sc->status == SMBDIRECT_SOCKET_CONNECTED) { + sc->status = SMBDIRECT_SOCKET_DISCONNECTING; + rdma_disconnect(sc->rdma.cm_id); } } static void smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t) { - if (t->status == SMB_DIRECT_CS_CONNECTED) + struct smbdirect_socket *sc = &t->socket; + + if (sc->status == SMBDIRECT_SOCKET_CONNECTED) queue_work(smb_direct_wq, &t->disconnect_work); } @@ -334,8 +328,9 @@ static void smb_direct_send_immediate_work(struct work_struct *work) { struct smb_direct_transport *t = container_of(work, struct smb_direct_transport, send_immediate_work); + struct smbdirect_socket *sc = &t->socket; - if (t->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return; smb_direct_post_send_data(t, NULL, NULL, 0, 0); @@ -344,19 +339,23 @@ static void smb_direct_send_immediate_work(struct work_struct *work) static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) { struct smb_direct_transport *t; + struct smbdirect_socket *sc; struct ksmbd_conn *conn; t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP); if (!t) return NULL; + sc = &t->socket; - t->cm_id = cm_id; + sc->rdma.cm_id = cm_id; cm_id->context = t; t->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH; t->responder_resources = 1; - t->status = SMB_DIRECT_CS_NEW; + sc->ib.dev = 
sc->rdma.cm_id->device; + + sc->status = SMBDIRECT_SOCKET_CREATED; init_waitqueue_head(&t->wait_status); spin_lock_init(&t->reassembly_queue_lock); @@ -400,6 +399,7 @@ static void smb_direct_free_transport(struct ksmbd_transport *kt) static void free_transport(struct smb_direct_transport *t) { + struct smbdirect_socket *sc = &t->socket; struct smb_direct_recvmsg *recvmsg; wake_up_interruptible(&t->wait_send_credits); @@ -412,11 +412,11 @@ static void free_transport(struct smb_direct_transport *t) disable_work_sync(&t->post_recv_credits_work); disable_work_sync(&t->send_immediate_work); - if (t->qp) { - ib_drain_qp(t->qp); - ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs); - t->qp = NULL; - rdma_destroy_qp(t->cm_id); + if (sc->ib.qp) { + ib_drain_qp(sc->ib.qp); + ib_mr_pool_destroy(sc->ib.qp, &sc->ib.qp->rdma_mrs); + sc->ib.qp = NULL; + rdma_destroy_qp(sc->rdma.cm_id); } ksmbd_debug(RDMA, "drain the reassembly queue\n"); @@ -433,14 +433,14 @@ static void free_transport(struct smb_direct_transport *t) } while (recvmsg); t->reassembly_data_length = 0; - if (t->send_cq) - ib_free_cq(t->send_cq); - if (t->recv_cq) - ib_free_cq(t->recv_cq); - if (t->pd) - ib_dealloc_pd(t->pd); - if (t->cm_id) - rdma_destroy_id(t->cm_id); + if (sc->ib.send_cq) + ib_free_cq(sc->ib.send_cq); + if (sc->ib.recv_cq) + ib_free_cq(sc->ib.recv_cq); + if (sc->ib.pd) + ib_dealloc_pd(sc->ib.pd); + if (sc->rdma.cm_id) + rdma_destroy_id(sc->rdma.cm_id); smb_direct_destroy_pools(t); ksmbd_conn_free(KSMBD_TRANS(t)->conn); @@ -463,14 +463,15 @@ static struct smb_direct_sendmsg static void smb_direct_free_sendmsg(struct smb_direct_transport *t, struct smb_direct_sendmsg *msg) { + struct smbdirect_socket *sc = &t->socket; int i; if (msg->num_sge > 0) { - ib_dma_unmap_single(t->cm_id->device, + ib_dma_unmap_single(sc->ib.dev, msg->sge[0].addr, msg->sge[0].length, DMA_TO_DEVICE); for (i = 1; i < msg->num_sge; i++) - ib_dma_unmap_page(t->cm_id->device, + ib_dma_unmap_page(sc->ib.dev, msg->sge[i].addr, msg->sge[i].length, DMA_TO_DEVICE); } @@ -525,9 +526,11 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct smb_direct_recvmsg *recvmsg; struct smb_direct_transport *t; + struct smbdirect_socket *sc; recvmsg = container_of(wc->wr_cqe, struct smb_direct_recvmsg, cqe); t = recvmsg->transport; + sc = &t->socket; if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { put_recvmsg(t, recvmsg); @@ -556,7 +559,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } t->negotiation_requested = true; t->full_packet_received = true; - t->status = SMB_DIRECT_CS_CONNECTED; + sc->status = SMBDIRECT_SOCKET_CONNECTED; enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); return; @@ -647,17 +650,18 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) static int smb_direct_post_recv(struct smb_direct_transport *t, struct smb_direct_recvmsg *recvmsg) { + struct smbdirect_socket *sc = &t->socket; struct ib_recv_wr wr; int ret; - recvmsg->sge.addr = ib_dma_map_single(t->cm_id->device, + recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev, recvmsg->packet, t->max_recv_size, DMA_FROM_DEVICE); - ret = ib_dma_mapping_error(t->cm_id->device, recvmsg->sge.addr); + ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr); if (ret) return ret; recvmsg->sge.length = t->max_recv_size; - recvmsg->sge.lkey = t->pd->local_dma_lkey; + recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey; recvmsg->cqe.done = recv_done; wr.wr_cqe = &recvmsg->cqe; @@ -665,10 +669,10 @@ static int smb_direct_post_recv(struct 
smb_direct_transport *t, wr.sg_list = &recvmsg->sge; wr.num_sge = 1; - ret = ib_post_recv(t->qp, &wr, NULL); + ret = ib_post_recv(sc->ib.qp, &wr, NULL); if (ret) { pr_err("Can't post recv: %d\n", ret); - ib_dma_unmap_single(t->cm_id->device, + ib_dma_unmap_single(sc->ib.dev, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); recvmsg->sge.length = 0; @@ -687,9 +691,10 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, u32 data_length, remaining_data_length, data_offset; int rc; struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smbdirect_socket *sc = &st->socket; again: - if (st->status != SMB_DIRECT_CS_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { pr_err("disconnected\n"); return -ENOTCONN; } @@ -798,7 +803,7 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, ksmbd_debug(RDMA, "wait_event on more data\n"); rc = wait_event_interruptible(st->wait_reassembly_queue, st->reassembly_data_length >= size || - st->status != SMB_DIRECT_CS_CONNECTED); + sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) return -EINTR; @@ -900,10 +905,11 @@ static int manage_credits_prior_sending(struct smb_direct_transport *t) static int smb_direct_post_send(struct smb_direct_transport *t, struct ib_send_wr *wr) { + struct smbdirect_socket *sc = &t->socket; int ret; atomic_inc(&t->send_pending); - ret = ib_post_send(t->qp, wr, NULL); + ret = ib_post_send(sc->ib.qp, wr, NULL); if (ret) { pr_err("failed to post send: %d\n", ret); if (atomic_dec_and_test(&t->send_pending)) @@ -968,6 +974,7 @@ static int wait_for_credits(struct smb_direct_transport *t, wait_queue_head_t *waitq, atomic_t *total_credits, int needed) { + struct smbdirect_socket *sc = &t->socket; int ret; do { @@ -977,9 +984,9 @@ static int wait_for_credits(struct smb_direct_transport *t, atomic_add(needed, total_credits); ret = wait_event_interruptible(*waitq, atomic_read(total_credits) >= needed || - t->status != SMB_DIRECT_CS_CONNECTED); + sc->status != SMBDIRECT_SOCKET_CONNECTED); - if (t->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; else if (ret < 0) return ret; @@ -1017,6 +1024,7 @@ static int smb_direct_create_header(struct smb_direct_transport *t, int size, int remaining_data_length, struct smb_direct_sendmsg **sendmsg_out) { + struct smbdirect_socket *sc = &t->socket; struct smb_direct_sendmsg *sendmsg; struct smbdirect_data_transfer *packet; int header_length; @@ -1056,11 +1064,11 @@ static int smb_direct_create_header(struct smb_direct_transport *t, header_length = offsetof(struct smbdirect_data_transfer, padding); - sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device, + sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, (void *)packet, header_length, DMA_TO_DEVICE); - ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr); + ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); if (ret) { smb_direct_free_sendmsg(t, sendmsg); return ret; @@ -1068,7 +1076,7 @@ static int smb_direct_create_header(struct smb_direct_transport *t, sendmsg->num_sge = 1; sendmsg->sge[0].length = header_length; - sendmsg->sge[0].lkey = t->pd->local_dma_lkey; + sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; *sendmsg_out = sendmsg; return 0; @@ -1122,10 +1130,11 @@ static int post_sendmsg(struct smb_direct_transport *t, struct smb_direct_send_ctx *send_ctx, struct smb_direct_sendmsg *msg) { + struct smbdirect_socket *sc = &t->socket; int i; for (i = 0; i < msg->num_sge; i++) - 
ib_dma_sync_single_for_device(t->cm_id->device, + ib_dma_sync_single_for_device(sc->ib.dev, msg->sge[i].addr, msg->sge[i].length, DMA_TO_DEVICE); @@ -1161,6 +1170,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, struct kvec *iov, int niov, int remaining_data_length) { + struct smbdirect_socket *sc = &t->socket; int i, j, ret; struct smb_direct_sendmsg *msg; int data_length; @@ -1186,7 +1196,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, int sg_cnt; sg_init_table(sg, SMB_DIRECT_MAX_SEND_SGES - 1); - sg_cnt = get_mapped_sg_list(t->cm_id->device, + sg_cnt = get_mapped_sg_list(sc->ib.dev, iov[i].iov_base, iov[i].iov_len, sg, SMB_DIRECT_MAX_SEND_SGES - 1, DMA_TO_DEVICE); @@ -1197,7 +1207,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES) { pr_err("buffer not fitted into sges\n"); ret = -E2BIG; - ib_dma_unmap_sg(t->cm_id->device, sg, sg_cnt, + ib_dma_unmap_sg(sc->ib.dev, sg, sg_cnt, DMA_TO_DEVICE); goto err; } @@ -1206,7 +1216,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, sge = &msg->sge[msg->num_sge]; sge->addr = sg_dma_address(&sg[j]); sge->length = sg_dma_len(&sg[j]); - sge->lkey = t->pd->local_dma_lkey; + sge->lkey = sc->ib.pd->local_dma_lkey; msg->num_sge++; } } @@ -1226,6 +1236,7 @@ static int smb_direct_writev(struct ksmbd_transport *t, bool need_invalidate, unsigned int remote_key) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smbdirect_socket *sc = &st->socket; size_t remaining_data_length; size_t iov_idx; size_t iov_ofs; @@ -1235,7 +1246,7 @@ static int smb_direct_writev(struct ksmbd_transport *t, struct smb_direct_send_ctx send_ctx; int error = 0; - if (st->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; //FIXME: skip RFC1002 header.. 
@@ -1367,7 +1378,9 @@ static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, struct smb_direct_rdma_rw_msg *msg, enum dma_data_direction dir) { - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + struct smbdirect_socket *sc = &t->socket; + + rdma_rw_ctx_destroy(&msg->rw_ctx, sc->ib.qp, sc->ib.qp->port, msg->sgt.sgl, msg->sgt.nents, dir); sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); kfree(msg); @@ -1407,6 +1420,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, unsigned int desc_len, bool is_read) { + struct smbdirect_socket *sc = &t->socket; struct smb_direct_rdma_rw_msg *msg, *next_msg; int i, ret; DECLARE_COMPLETION_ONSTACK(completion); @@ -1416,7 +1430,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, int credits_needed; unsigned int desc_buf_len, desc_num = 0; - if (t->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; if (buf_len > t->max_rdma_rw_size) @@ -1486,7 +1500,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, goto out; } - ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, + ret = rdma_rw_ctx_init(&msg->rw_ctx, sc->ib.qp, sc->ib.qp->port, msg->sgt.sgl, get_buf_page_count(desc_buf, desc_buf_len), 0, @@ -1507,11 +1521,11 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, /* concatenate work requests of rdma_rw_ctxs */ first_wr = NULL; list_for_each_entry_reverse(msg, &msg_list, list) { - first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, + first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, sc->ib.qp, sc->ib.qp->port, &msg->cqe, first_wr); } - ret = ib_post_send(t->qp, first_wr, NULL); + ret = ib_post_send(sc->ib.qp, first_wr, NULL); if (ret) { pr_err("failed to post send wr for RDMA R/W: %d\n", ret); goto out; @@ -1552,20 +1566,22 @@ static int smb_direct_rdma_read(struct ksmbd_transport *t, static void smb_direct_disconnect(struct ksmbd_transport *t) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smbdirect_socket *sc = &st->socket; - ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", st->cm_id); + ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id); smb_direct_disconnect_rdma_work(&st->disconnect_work); wait_event_interruptible(st->wait_status, - st->status == SMB_DIRECT_CS_DISCONNECTED); + sc->status == SMBDIRECT_SOCKET_DISCONNECTED); free_transport(st); } static void smb_direct_shutdown(struct ksmbd_transport *t) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smbdirect_socket *sc = &st->socket; - ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", st->cm_id); + ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id); smb_direct_disconnect_rdma_work(&st->disconnect_work); } @@ -1574,28 +1590,29 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct smb_direct_transport *t = cm_id->context; + struct smbdirect_socket *sc = &t->socket; ksmbd_debug(RDMA, "RDMA CM event. 
cm_id=%p event=%s (%d)\n", cm_id, rdma_event_msg(event->event), event->event); switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: { - t->status = SMB_DIRECT_CS_CONNECTED; + sc->status = SMBDIRECT_SOCKET_CONNECTED; wake_up_interruptible(&t->wait_status); break; } case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_DISCONNECTED: { - ib_drain_qp(t->qp); + ib_drain_qp(sc->ib.qp); - t->status = SMB_DIRECT_CS_DISCONNECTED; + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; wake_up_interruptible(&t->wait_status); wake_up_interruptible(&t->wait_reassembly_queue); wake_up(&t->wait_send_credits); break; } case RDMA_CM_EVENT_CONNECT_ERROR: { - t->status = SMB_DIRECT_CS_DISCONNECTED; + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; wake_up_interruptible(&t->wait_status); break; } @@ -1611,9 +1628,10 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, static void smb_direct_qpair_handler(struct ib_event *event, void *context) { struct smb_direct_transport *t = context; + struct smbdirect_socket *sc = &t->socket; ksmbd_debug(RDMA, "Received QP event. cm_id=%p, event=%s (%d)\n", - t->cm_id, ib_event_msg(event->event), event->event); + sc->rdma.cm_id, ib_event_msg(event->event), event->event); switch (event->event) { case IB_EVENT_CQ_ERR: @@ -1628,6 +1646,7 @@ static void smb_direct_qpair_handler(struct ib_event *event, void *context) static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, int failed) { + struct smbdirect_socket *sc = &t->socket; struct smb_direct_sendmsg *sendmsg; struct smbdirect_negotiate_resp *resp; int ret; @@ -1658,10 +1677,10 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, cpu_to_le32(t->max_fragmented_recv_size); } - sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device, + sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, (void *)resp, sizeof(*resp), DMA_TO_DEVICE); - ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr); + ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); if (ret) { smb_direct_free_sendmsg(t, sendmsg); return ret; @@ -1669,7 +1688,7 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, sendmsg->num_sge = 1; sendmsg->sge[0].length = sizeof(*resp); - sendmsg->sge[0].lkey = t->pd->local_dma_lkey; + sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; ret = post_sendmsg(t, NULL, sendmsg); if (ret) { @@ -1684,6 +1703,7 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, static int smb_direct_accept_client(struct smb_direct_transport *t) { + struct smbdirect_socket *sc = &t->socket; struct rdma_conn_param conn_param; __be32 ird_ord_hdr[2]; int ret; @@ -1710,7 +1730,7 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY; conn_param.flow_control = 0; - ret = rdma_accept(t->cm_id, &conn_param); + ret = rdma_accept(sc->rdma.cm_id, &conn_param); if (ret) { pr_err("error at rdma_accept: %d\n", ret); return ret; @@ -1750,15 +1770,18 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t) { + struct smbdirect_socket *sc = &t->socket; + return min_t(unsigned int, - t->cm_id->device->attrs.max_fast_reg_page_list_len, + sc->ib.dev->attrs.max_fast_reg_page_list_len, 256); } static int smb_direct_init_params(struct smb_direct_transport *t, struct ib_qp_cap *cap) { - struct ib_device *device = t->cm_id->device; + struct smbdirect_socket *sc = &t->socket; + struct 
ib_device *device = sc->ib.dev; int max_send_sges, max_rw_wrs, max_send_wrs; unsigned int max_sge_per_wr, wrs_per_credit; @@ -1923,34 +1946,35 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) static int smb_direct_create_qpair(struct smb_direct_transport *t, struct ib_qp_cap *cap) { + struct smbdirect_socket *sc = &t->socket; int ret; struct ib_qp_init_attr qp_attr; int pages_per_rw; - t->pd = ib_alloc_pd(t->cm_id->device, 0); - if (IS_ERR(t->pd)) { + sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); + if (IS_ERR(sc->ib.pd)) { pr_err("Can't create RDMA PD\n"); - ret = PTR_ERR(t->pd); - t->pd = NULL; + ret = PTR_ERR(sc->ib.pd); + sc->ib.pd = NULL; return ret; } - t->send_cq = ib_alloc_cq(t->cm_id->device, t, + sc->ib.send_cq = ib_alloc_cq(sc->ib.dev, t, smb_direct_send_credit_target + cap->max_rdma_ctxs, 0, IB_POLL_WORKQUEUE); - if (IS_ERR(t->send_cq)) { + if (IS_ERR(sc->ib.send_cq)) { pr_err("Can't create RDMA send CQ\n"); - ret = PTR_ERR(t->send_cq); - t->send_cq = NULL; + ret = PTR_ERR(sc->ib.send_cq); + sc->ib.send_cq = NULL; goto err; } - t->recv_cq = ib_alloc_cq(t->cm_id->device, t, + sc->ib.recv_cq = ib_alloc_cq(sc->ib.dev, t, t->recv_credit_max, 0, IB_POLL_WORKQUEUE); - if (IS_ERR(t->recv_cq)) { + if (IS_ERR(sc->ib.recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); - ret = PTR_ERR(t->recv_cq); - t->recv_cq = NULL; + ret = PTR_ERR(sc->ib.recv_cq); + sc->ib.recv_cq = NULL; goto err; } @@ -1960,22 +1984,22 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, qp_attr.cap = *cap; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; - qp_attr.send_cq = t->send_cq; - qp_attr.recv_cq = t->recv_cq; + qp_attr.send_cq = sc->ib.send_cq; + qp_attr.recv_cq = sc->ib.recv_cq; qp_attr.port_num = ~0; - ret = rdma_create_qp(t->cm_id, t->pd, &qp_attr); + ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr); if (ret) { pr_err("Can't create RDMA QP: %d\n", ret); goto err; } - t->qp = t->cm_id->qp; - t->cm_id->event_handler = smb_direct_cm_handler; + sc->ib.qp = sc->rdma.cm_id->qp; + sc->rdma.cm_id->event_handler = smb_direct_cm_handler; pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; - if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) { - ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, + if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) { + ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs, t->max_rw_credits, IB_MR_TYPE_MEM_REG, t->pages_per_rw_credit, 0); if (ret) { @@ -1987,21 +2011,21 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, return 0; err: - if (t->qp) { - t->qp = NULL; - rdma_destroy_qp(t->cm_id); + if (sc->ib.qp) { + sc->ib.qp = NULL; + rdma_destroy_qp(sc->rdma.cm_id); } - if (t->recv_cq) { - ib_destroy_cq(t->recv_cq); - t->recv_cq = NULL; + if (sc->ib.recv_cq) { + ib_destroy_cq(sc->ib.recv_cq); + sc->ib.recv_cq = NULL; } - if (t->send_cq) { - ib_destroy_cq(t->send_cq); - t->send_cq = NULL; + if (sc->ib.send_cq) { + ib_destroy_cq(sc->ib.send_cq); + sc->ib.send_cq = NULL; } - if (t->pd) { - ib_dealloc_pd(t->pd); - t->pd = NULL; + if (sc->ib.pd) { + ib_dealloc_pd(sc->ib.pd); + sc->ib.pd = NULL; } return ret; } @@ -2009,6 +2033,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, static int smb_direct_prepare(struct ksmbd_transport *t) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smbdirect_socket *sc = &st->socket; struct smb_direct_recvmsg *recvmsg; struct smbdirect_negotiate_req *req; int ret; @@ -2016,9 +2041,9 @@ static int smb_direct_prepare(struct 
ksmbd_transport *t) ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); ret = wait_event_interruptible_timeout(st->wait_status, st->negotiation_requested || - st->status == SMB_DIRECT_CS_DISCONNECTED, + sc->status == SMBDIRECT_SOCKET_DISCONNECTED, SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); - if (ret <= 0 || st->status == SMB_DIRECT_CS_DISCONNECTED) + if (ret <= 0 || sc->status == SMBDIRECT_SOCKET_DISCONNECTED) return ret < 0 ? ret : -ETIMEDOUT; recvmsg = get_first_reassembly(st); From 177368b9924314bde7d2ea6dc93de0d9ba728b61 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 6 Aug 2025 19:35:57 +0200 Subject: [PATCH 3058/3206] smb: server: make use of common smbdirect_socket_parameters Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: Hyunchul Lee Cc: Meetakshi Setiya Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 93 ++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 44 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index e2d8ac08734442..499bb2c0bd79be 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -98,12 +98,6 @@ struct smb_direct_transport { bool full_packet_received; wait_queue_head_t wait_status; - int max_send_size; - int max_recv_size; - int max_fragmented_send_size; - int max_fragmented_recv_size; - int max_rdma_rw_size; - spinlock_t reassembly_queue_lock; struct list_head reassembly_queue; int reassembly_data_length; @@ -114,13 +108,11 @@ struct smb_direct_transport { spinlock_t receive_credit_lock; int recv_credits; int count_avail_recvmsg; - int recv_credit_max; int recv_credit_target; spinlock_t recvmsg_queue_lock; struct list_head recvmsg_queue; - int send_credit_target; atomic_t send_credits; spinlock_t lock_new_recv_credits; int new_recv_credits; @@ -527,10 +519,12 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smb_direct_recvmsg *recvmsg; struct smb_direct_transport *t; struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; recvmsg = container_of(wc->wr_cqe, struct smb_direct_recvmsg, cqe); t = recvmsg->transport; sc = &t->socket; + sp = &sc->parameters; if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { put_recvmsg(t, recvmsg); @@ -585,10 +579,10 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(t); return; } - if (remaining_data_length > t->max_fragmented_recv_size || - data_length > t->max_fragmented_recv_size || + if (remaining_data_length > sp->max_fragmented_recv_size || + data_length > sp->max_fragmented_recv_size || (u64)remaining_data_length + (u64)data_length > - (u64)t->max_fragmented_recv_size) { + (u64)sp->max_fragmented_recv_size) { put_recvmsg(t, recvmsg); smb_direct_disconnect_rdma_connection(t); return; @@ -651,16 +645,18 @@ static int smb_direct_post_recv(struct smb_direct_transport *t, struct smb_direct_recvmsg *recvmsg) { struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_recv_wr wr; int ret; recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev, - recvmsg->packet, t->max_recv_size, + recvmsg->packet, + sp->max_recv_size, DMA_FROM_DEVICE); ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr); if (ret) return ret; - recvmsg->sge.length = t->max_recv_size; + recvmsg->sge.length = sp->max_recv_size; recvmsg->sge.lkey = 
sc->ib.pd->local_dma_lkey; recvmsg->cqe.done = recv_done; @@ -1025,6 +1021,7 @@ static int smb_direct_create_header(struct smb_direct_transport *t, struct smb_direct_sendmsg **sendmsg_out) { struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct smb_direct_sendmsg *sendmsg; struct smbdirect_data_transfer *packet; int header_length; @@ -1036,7 +1033,7 @@ static int smb_direct_create_header(struct smb_direct_transport *t, /* Fill in the packet header */ packet = (struct smbdirect_data_transfer *)sendmsg->packet; - packet->credits_requested = cpu_to_le16(t->send_credit_target); + packet->credits_requested = cpu_to_le16(sp->send_credit_target); packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); packet->flags = 0; @@ -1237,10 +1234,11 @@ static int smb_direct_writev(struct ksmbd_transport *t, { struct smb_direct_transport *st = smb_trans_direct_transfort(t); struct smbdirect_socket *sc = &st->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; size_t remaining_data_length; size_t iov_idx; size_t iov_ofs; - size_t max_iov_size = st->max_send_size - + size_t max_iov_size = sp->max_send_size - sizeof(struct smbdirect_data_transfer); int ret; struct smb_direct_send_ctx send_ctx; @@ -1421,6 +1419,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, bool is_read) { struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct smb_direct_rdma_rw_msg *msg, *next_msg; int i, ret; DECLARE_COMPLETION_ONSTACK(completion); @@ -1433,7 +1432,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; - if (buf_len > t->max_rdma_rw_size) + if (buf_len > sp->max_read_write_size) return -EINVAL; /* calculate needed credits */ @@ -1647,6 +1646,7 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, int failed) { struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct smb_direct_sendmsg *sendmsg; struct smbdirect_negotiate_resp *resp; int ret; @@ -1668,13 +1668,13 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, resp->negotiated_version = SMB_DIRECT_VERSION_LE; resp->reserved = 0; resp->credits_requested = - cpu_to_le16(t->send_credit_target); + cpu_to_le16(sp->send_credit_target); resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); - resp->max_readwrite_size = cpu_to_le32(t->max_rdma_rw_size); - resp->preferred_send_size = cpu_to_le32(t->max_send_size); - resp->max_receive_size = cpu_to_le32(t->max_recv_size); + resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size); + resp->preferred_send_size = cpu_to_le32(sp->max_send_size); + resp->max_receive_size = cpu_to_le32(sp->max_recv_size); resp->max_fragmented_size = - cpu_to_le32(t->max_fragmented_recv_size); + cpu_to_le32(sp->max_fragmented_recv_size); } sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, @@ -1781,6 +1781,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, struct ib_qp_cap *cap) { struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_device *device = sc->ib.dev; int max_send_sges, max_rw_wrs, max_send_wrs; unsigned int max_sge_per_wr, wrs_per_credit; @@ -1788,10 +1789,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t, /* need 3 more sge. 
because a SMB_DIRECT header, SMB2 header, * SMB2 response could be mapped. */ - t->max_send_size = smb_direct_max_send_size; - max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3; + sp->max_send_size = smb_direct_max_send_size; + max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3; if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) { - pr_err("max_send_size %d is too large\n", t->max_send_size); + pr_err("max_send_size %d is too large\n", sp->max_send_size); return -EINVAL; } @@ -1802,9 +1803,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t, * are needed for MR registration, RDMA R/W, local & remote * MR invalidation. */ - t->max_rdma_rw_size = smb_direct_max_read_write_size; + sp->max_read_write_size = smb_direct_max_read_write_size; t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t); - t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size, + t->max_rw_credits = DIV_ROUND_UP(sp->max_read_write_size, (t->pages_per_rw_credit - 1) * PAGE_SIZE); @@ -1850,20 +1851,20 @@ static int smb_direct_init_params(struct smb_direct_transport *t, t->recv_credits = 0; t->count_avail_recvmsg = 0; - t->recv_credit_max = smb_direct_receive_credit_max; + sp->recv_credit_max = smb_direct_receive_credit_max; t->recv_credit_target = 10; t->new_recv_credits = 0; - t->send_credit_target = smb_direct_send_credit_target; + sp->send_credit_target = smb_direct_send_credit_target; atomic_set(&t->send_credits, 0); atomic_set(&t->rw_credits, t->max_rw_credits); - t->max_send_size = smb_direct_max_send_size; - t->max_recv_size = smb_direct_max_receive_size; - t->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; + sp->max_send_size = smb_direct_max_send_size; + sp->max_recv_size = smb_direct_max_receive_size; + sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; cap->max_send_wr = max_send_wrs; - cap->max_recv_wr = t->recv_credit_max; + cap->max_recv_wr = sp->recv_credit_max; cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; @@ -1893,6 +1894,8 @@ static void smb_direct_destroy_pools(struct smb_direct_transport *t) static int smb_direct_create_pools(struct smb_direct_transport *t) { + struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; char name[80]; int i; struct smb_direct_recvmsg *recvmsg; @@ -1905,7 +1908,7 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) if (!t->sendmsg_cache) return -ENOMEM; - t->sendmsg_mempool = mempool_create(t->send_credit_target, + t->sendmsg_mempool = mempool_create(sp->send_credit_target, mempool_alloc_slab, mempool_free_slab, t->sendmsg_cache); if (!t->sendmsg_mempool) @@ -1914,20 +1917,20 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) snprintf(name, sizeof(name), "smb_direct_resp_%p", t); t->recvmsg_cache = kmem_cache_create(name, sizeof(struct smb_direct_recvmsg) + - t->max_recv_size, + sp->max_recv_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!t->recvmsg_cache) goto err; t->recvmsg_mempool = - mempool_create(t->recv_credit_max, mempool_alloc_slab, + mempool_create(sp->recv_credit_max, mempool_alloc_slab, mempool_free_slab, t->recvmsg_cache); if (!t->recvmsg_mempool) goto err; INIT_LIST_HEAD(&t->recvmsg_queue); - for (i = 0; i < t->recv_credit_max; i++) { + for (i = 0; i < sp->recv_credit_max; i++) { recvmsg = mempool_alloc(t->recvmsg_mempool, KSMBD_DEFAULT_GFP); if (!recvmsg) goto err; @@ -1935,7 +1938,7 @@ static int smb_direct_create_pools(struct 
smb_direct_transport *t) recvmsg->sge.length = 0; list_add(&recvmsg->list, &t->recvmsg_queue); } - t->count_avail_recvmsg = t->recv_credit_max; + t->count_avail_recvmsg = sp->recv_credit_max; return 0; err: @@ -1947,6 +1950,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, struct ib_qp_cap *cap) { struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int ret; struct ib_qp_init_attr qp_attr; int pages_per_rw; @@ -1970,7 +1974,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, } sc->ib.recv_cq = ib_alloc_cq(sc->ib.dev, t, - t->recv_credit_max, 0, IB_POLL_WORKQUEUE); + sp->recv_credit_max, 0, IB_POLL_WORKQUEUE); if (IS_ERR(sc->ib.recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); ret = PTR_ERR(sc->ib.recv_cq); @@ -1997,7 +2001,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, sc->ib.qp = sc->rdma.cm_id->qp; sc->rdma.cm_id->event_handler = smb_direct_cm_handler; - pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; + pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1; if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) { ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs, t->max_rw_credits, IB_MR_TYPE_MEM_REG, @@ -2034,6 +2038,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); struct smbdirect_socket *sc = &st->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct smb_direct_recvmsg *recvmsg; struct smbdirect_negotiate_req *req; int ret; @@ -2055,14 +2060,14 @@ static int smb_direct_prepare(struct ksmbd_transport *t) goto out; req = (struct smbdirect_negotiate_req *)recvmsg->packet; - st->max_recv_size = min_t(int, st->max_recv_size, + sp->max_recv_size = min_t(int, sp->max_recv_size, le32_to_cpu(req->preferred_send_size)); - st->max_send_size = min_t(int, st->max_send_size, + sp->max_send_size = min_t(int, sp->max_send_size, le32_to_cpu(req->max_receive_size)); - st->max_fragmented_send_size = + sp->max_fragmented_send_size = le32_to_cpu(req->max_fragmented_size); - st->max_fragmented_recv_size = - (st->recv_credit_max * st->max_recv_size) / 2; + sp->max_fragmented_recv_size = + (sp->recv_credit_max * sp->max_recv_size) / 2; ret = smb_direct_send_negotiate_response(st, ret); out: From 3e691b1d16dbd30186d71ca707efeb3a98cc542c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 6 Aug 2025 19:35:58 +0200 Subject: [PATCH 3059/3206] smb: server: make use of smbdirect_socket->recv_io.expected The expected incoming message type is now tracked per connection.
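Sketched roughly (assignments taken from the diff below), the expectation becomes a small per-connection state machine instead of a tag on each receive buffer:

	/* set before posting the first receive, in smb_direct_prepare_negotiation(): */
	sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ;

	/* advanced once the negotiate response is queued, in smb_direct_send_negotiate_response(): */
	sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;

	/* SMBDIRECT_EXPECT_NEGOTIATE_REP is only ever reached on the client side. */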
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 44 +++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 499bb2c0bd79be..7892dd634c82e5 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -145,10 +145,6 @@ struct smb_direct_transport { #define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport)) #define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \ struct smb_direct_transport, transport)) -enum { - SMB_DIRECT_MSG_NEGOTIATE_REQ = 0, - SMB_DIRECT_MSG_DATA_TRANSFER -}; static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops; @@ -172,7 +168,6 @@ struct smb_direct_sendmsg { struct smb_direct_recvmsg { struct smb_direct_transport *transport; struct list_head list; - int type; struct ib_sge sge; struct ib_cqe cqe; bool first_segment; @@ -472,8 +467,10 @@ static void smb_direct_free_sendmsg(struct smb_direct_transport *t, static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) { - switch (recvmsg->type) { - case SMB_DIRECT_MSG_DATA_TRANSFER: { + struct smbdirect_socket *sc = &recvmsg->transport->socket; + + switch (sc->recv_io.expected) { + case SMBDIRECT_EXPECT_DATA_TRANSFER: { struct smbdirect_data_transfer *req = (struct smbdirect_data_transfer *)recvmsg->packet; struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet @@ -484,9 +481,9 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) le16_to_cpu(req->credits_requested), req->data_length, req->remaining_data_length, hdr->ProtocolId, hdr->Command); - break; + return 0; } - case SMB_DIRECT_MSG_NEGOTIATE_REQ: { + case SMBDIRECT_EXPECT_NEGOTIATE_REQ: { struct smbdirect_negotiate_req *req = (struct smbdirect_negotiate_req *)recvmsg->packet; ksmbd_debug(RDMA, @@ -506,12 +503,15 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) 128 * 1024) return -ECONNABORTED; - break; + return 0; } - default: - return -EINVAL; + case SMBDIRECT_EXPECT_NEGOTIATE_REP: + /* client only */ + break; } - return 0; + + /* This is an internal error */ + return -EINVAL; } static void recv_done(struct ib_cq *cq, struct ib_wc *wc) @@ -544,8 +544,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); - switch (recvmsg->type) { - case SMB_DIRECT_MSG_NEGOTIATE_REQ: + switch (sc->recv_io.expected) { + case SMBDIRECT_EXPECT_NEGOTIATE_REQ: if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) { put_recvmsg(t, recvmsg); smb_direct_disconnect_rdma_connection(t); @@ -557,7 +557,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); return; - case SMB_DIRECT_MSG_DATA_TRANSFER: { + case SMBDIRECT_EXPECT_DATA_TRANSFER: { struct smbdirect_data_transfer *data_transfer = (struct smbdirect_data_transfer *)recvmsg->packet; u32 remaining_data_length, data_offset, data_length; @@ -631,12 +631,15 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) return; } + case SMBDIRECT_EXPECT_NEGOTIATE_REP: + /* client only */ + break; } /* * This is an internal error! 
*/ - WARN_ON_ONCE(recvmsg->type != SMB_DIRECT_MSG_DATA_TRANSFER); + WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); put_recvmsg(t, recvmsg); smb_direct_disconnect_rdma_connection(t); } @@ -824,7 +827,6 @@ static void smb_direct_post_recv_credits(struct work_struct *work) if (!recvmsg) break; - recvmsg->type = SMB_DIRECT_MSG_DATA_TRANSFER; recvmsg->first_segment = false; ret = smb_direct_post_recv(t, recvmsg); @@ -1675,6 +1677,8 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, resp->max_receive_size = cpu_to_le32(sp->max_recv_size); resp->max_fragmented_size = cpu_to_le32(sp->max_fragmented_recv_size); + + sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; } sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, @@ -1740,13 +1744,15 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) { + struct smbdirect_socket *sc = &t->socket; int ret; struct smb_direct_recvmsg *recvmsg; + sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ; + recvmsg = get_free_recvmsg(t); if (!recvmsg) return -ENOMEM; - recvmsg->type = SMB_DIRECT_MSG_NEGOTIATE_REQ; ret = smb_direct_post_recv(t, recvmsg); if (ret) { From 9883a142f1bba5bdc3a3091608aef8b1600cdd11 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 6 Aug 2025 19:35:59 +0200 Subject: [PATCH 3060/3206] smb: server: make use of struct smbdirect_recv_io This will allow us to move helper functions into common code soon, as the client already uses smbdirect_recv_io. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 61 +++++++++++++++------------------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 7892dd634c82e5..c4e786f2e7e7c7 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -165,15 +165,6 @@ struct smb_direct_sendmsg { u8 packet[]; }; -struct smb_direct_recvmsg { - struct smb_direct_transport *transport; - struct list_head list; - int type; - struct ib_sge sge; - struct ib_cqe cqe; - bool first_segment; - u8 packet[]; -}; - struct smb_direct_rdma_rw_msg { struct smb_direct_transport *t; struct ib_cqe cqe; @@ -216,7 +207,7 @@ smb_trans_direct_transfort(struct ksmbd_transport *t) } static inline void -*smb_direct_recvmsg_payload(struct smb_direct_recvmsg *recvmsg) +*smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg) { return (void *)recvmsg->packet; } @@ -229,14 +220,14 @@ static inline bool is_receive_credit_post_required(int receive_credits, } static struct -smb_direct_recvmsg *get_free_recvmsg(struct smb_direct_transport *t) +smbdirect_recv_io *get_free_recvmsg(struct smb_direct_transport *t) { - struct smb_direct_recvmsg *recvmsg = NULL; + struct smbdirect_recv_io *recvmsg = NULL; spin_lock(&t->recvmsg_queue_lock); if (!list_empty(&t->recvmsg_queue)) { recvmsg = list_first_entry(&t->recvmsg_queue, - struct smb_direct_recvmsg, + struct smbdirect_recv_io, list); list_del(&recvmsg->list); } @@ -245,7 +236,7 @@ smb_direct_recvmsg *get_free_recvmsg(struct smb_direct_transport *t) } static void put_recvmsg(struct smb_direct_transport *t, - struct smb_direct_recvmsg *recvmsg) + struct smbdirect_recv_io *recvmsg) { struct smbdirect_socket *sc = &t->socket; @@ -263,7 +254,7 @@ static void put_recvmsg(struct
smb_direct_transport *t, } static void enqueue_reassembly(struct smb_direct_transport *t, - struct smb_direct_recvmsg *recvmsg, + struct smbdirect_recv_io *recvmsg, int data_length) { spin_lock(&t->reassembly_queue_lock); @@ -280,11 +271,11 @@ static void enqueue_reassembly(struct smb_direct_transport *t, spin_unlock(&t->reassembly_queue_lock); } -static struct smb_direct_recvmsg *get_first_reassembly(struct smb_direct_transport *t) +static struct smbdirect_recv_io *get_first_reassembly(struct smb_direct_transport *t) { if (!list_empty(&t->reassembly_queue)) return list_first_entry(&t->reassembly_queue, - struct smb_direct_recvmsg, list); + struct smbdirect_recv_io, list); else return NULL; } @@ -387,7 +378,7 @@ static void smb_direct_free_transport(struct ksmbd_transport *kt) static void free_transport(struct smb_direct_transport *t) { struct smbdirect_socket *sc = &t->socket; - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; wake_up_interruptible(&t->wait_send_credits); @@ -465,9 +456,9 @@ static void smb_direct_free_sendmsg(struct smb_direct_transport *t, mempool_free(msg, t->sendmsg_mempool); } -static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) +static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg) { - struct smbdirect_socket *sc = &recvmsg->transport->socket; + struct smbdirect_socket *sc = recvmsg->socket; switch (sc->recv_io.expected) { case SMBDIRECT_EXPECT_DATA_TRANSFER: { @@ -516,15 +507,15 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) static void recv_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; struct smb_direct_transport *t; struct smbdirect_socket *sc; struct smbdirect_socket_parameters *sp; - recvmsg = container_of(wc->wr_cqe, struct smb_direct_recvmsg, cqe); - t = recvmsg->transport; - sc = &t->socket; + recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); + sc = recvmsg->socket; sp = &sc->parameters; + t = container_of(sc, struct smb_direct_transport, socket); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { put_recvmsg(t, recvmsg); @@ -645,7 +636,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } static int smb_direct_post_recv(struct smb_direct_transport *t, - struct smb_direct_recvmsg *recvmsg) + struct smbdirect_recv_io *recvmsg) { struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; @@ -684,7 +675,7 @@ static int smb_direct_post_recv(struct smb_direct_transport *t, static int smb_direct_read(struct ksmbd_transport *t, char *buf, unsigned int size, int unused) { - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; struct smbdirect_data_transfer *data_transfer; int to_copy, to_read, data_read, offset; u32 data_length, remaining_data_length, data_offset; @@ -721,7 +712,7 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, offset = st->first_entry_offset; while (data_read < size) { recvmsg = get_first_reassembly(st); - data_transfer = smb_direct_recvmsg_payload(recvmsg); + data_transfer = smbdirect_recv_io_payload(recvmsg); data_length = le32_to_cpu(data_transfer->data_length); remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); @@ -813,7 +804,7 @@ static void smb_direct_post_recv_credits(struct work_struct *work) { struct smb_direct_transport *t = container_of(work, struct smb_direct_transport, post_recv_credits_work); - struct smb_direct_recvmsg *recvmsg; + 
struct smbdirect_recv_io *recvmsg; int receive_credits, credits = 0; int ret; @@ -1746,7 +1737,7 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) { struct smbdirect_socket *sc = &t->socket; int ret; - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ; @@ -1880,7 +1871,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, static void smb_direct_destroy_pools(struct smb_direct_transport *t) { - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; while ((recvmsg = get_free_recvmsg(t))) mempool_free(recvmsg, t->recvmsg_mempool); @@ -1904,7 +1895,7 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) struct smbdirect_socket_parameters *sp = &sc->parameters; char name[80]; int i; - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; snprintf(name, sizeof(name), "smb_direct_rqst_pool_%p", t); t->sendmsg_cache = kmem_cache_create(name, @@ -1920,9 +1911,9 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) if (!t->sendmsg_mempool) goto err; - snprintf(name, sizeof(name), "smb_direct_resp_%p", t); + snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", t); t->recvmsg_cache = kmem_cache_create(name, - sizeof(struct smb_direct_recvmsg) + + sizeof(struct smbdirect_recv_io) + sp->max_recv_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!t->recvmsg_cache) @@ -1940,7 +1931,7 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) recvmsg = mempool_alloc(t->recvmsg_mempool, KSMBD_DEFAULT_GFP); if (!recvmsg) goto err; - recvmsg->transport = t; + recvmsg->socket = sc; recvmsg->sge.length = 0; list_add(&recvmsg->list, &t->recvmsg_queue); } @@ -2045,7 +2036,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) struct smb_direct_transport *st = smb_trans_direct_transfort(t); struct smbdirect_socket *sc = &st->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; struct smbdirect_negotiate_req *req; int ret; From d9989207b78c2fa3008285c8b1e53e8618544de7 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 6 Aug 2025 19:36:00 +0200 Subject: [PATCH 3061/3206] smb: server: make use of smbdirect_socket.recv_io.free.{list,lock} This is already used by the client and will allow us to add common helper functions soon. 
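For illustration, the access pattern the shared fields enable looks like this (a sketch of hypothetical common helpers; the diff below keeps the existing get_free_recvmsg()/put_recvmsg() names):

	static struct smbdirect_recv_io *
	recv_io_get(struct smbdirect_socket *sc)
	{
		struct smbdirect_recv_io *msg = NULL;

		spin_lock(&sc->recv_io.free.lock);
		if (!list_empty(&sc->recv_io.free.list)) {
			msg = list_first_entry(&sc->recv_io.free.list,
					       struct smbdirect_recv_io, list);
			list_del(&msg->list);
		}
		spin_unlock(&sc->recv_io.free.lock);
		return msg;
	}

	static void recv_io_put(struct smbdirect_socket *sc,
				struct smbdirect_recv_io *msg)
	{
		spin_lock(&sc->recv_io.free.lock);
		list_add(&msg->list, &sc->recv_io.free.list);
		spin_unlock(&sc->recv_io.free.lock);
	}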
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c4e786f2e7e7c7..8eec651e2f9eaa 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -110,9 +110,6 @@ struct smb_direct_transport { int count_avail_recvmsg; int recv_credit_target; - spinlock_t recvmsg_queue_lock; - struct list_head recvmsg_queue; - atomic_t send_credits; spinlock_t lock_new_recv_credits; int new_recv_credits; @@ -222,16 +219,17 @@ static inline bool is_receive_credit_post_required(int receive_credits, static struct smbdirect_recv_io *get_free_recvmsg(struct smb_direct_transport *t) { + struct smbdirect_socket *sc = &t->socket; struct smbdirect_recv_io *recvmsg = NULL; - spin_lock(&t->recvmsg_queue_lock); - if (!list_empty(&t->recvmsg_queue)) { - recvmsg = list_first_entry(&t->recvmsg_queue, + spin_lock(&sc->recv_io.free.lock); + if (!list_empty(&sc->recv_io.free.list)) { + recvmsg = list_first_entry(&sc->recv_io.free.list, struct smbdirect_recv_io, list); list_del(&recvmsg->list); } - spin_unlock(&t->recvmsg_queue_lock); + spin_unlock(&sc->recv_io.free.lock); return recvmsg; } @@ -248,9 +246,9 @@ static void put_recvmsg(struct smb_direct_transport *t, recvmsg->sge.length = 0; } - spin_lock(&t->recvmsg_queue_lock); - list_add(&recvmsg->list, &t->recvmsg_queue); - spin_unlock(&t->recvmsg_queue_lock); + spin_lock(&sc->recv_io.free.lock); + list_add(&recvmsg->list, &sc->recv_io.free.list); + spin_unlock(&sc->recv_io.free.lock); } static void enqueue_reassembly(struct smb_direct_transport *t, @@ -333,6 +331,9 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc->ib.dev = sc->rdma.cm_id->device; + INIT_LIST_HEAD(&sc->recv_io.free.list); + spin_lock_init(&sc->recv_io.free.lock); + sc->status = SMBDIRECT_SOCKET_CREATED; init_waitqueue_head(&t->wait_status); @@ -345,8 +346,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) init_waitqueue_head(&t->wait_rw_credits); spin_lock_init(&t->receive_credit_lock); - spin_lock_init(&t->recvmsg_queue_lock); - INIT_LIST_HEAD(&t->recvmsg_queue); init_waitqueue_head(&t->wait_send_pending); atomic_set(&t->send_pending, 0); @@ -1925,15 +1924,13 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) if (!t->recvmsg_mempool) goto err; - INIT_LIST_HEAD(&t->recvmsg_queue); - for (i = 0; i < sp->recv_credit_max; i++) { recvmsg = mempool_alloc(t->recvmsg_mempool, KSMBD_DEFAULT_GFP); if (!recvmsg) goto err; recvmsg->socket = sc; recvmsg->sge.length = 0; - list_add(&recvmsg->list, &t->recvmsg_queue); + list_add(&recvmsg->list, &sc->recv_io.free.list); } t->count_avail_recvmsg = sp->recv_credit_max; From bdb0f1596ae537535a0cebc2911b07ff264c3099 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 6 Aug 2025 19:36:01 +0200 Subject: [PATCH 3062/3206] smb: server: make use of smbdirect_socket.recv_io.reassembly.* This is also used by the client and will allow us to introduce common helper functions soon. 
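The part worth calling out is the ordering contract on the shared reassembly fields: the receive path publishes data_length last, so the reader may test it locklessly before pairing with a read barrier. A condensed sketch (fragments adapted from the diff below, not compilable on their own):

	/* producer, enqueue_reassembly(): */
	spin_lock(&sc->recv_io.reassembly.lock);
	list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list);
	sc->recv_io.reassembly.queue_length++;
	virt_wmb();	/* make list/queue_length visible before data_length */
	sc->recv_io.reassembly.data_length += data_length;
	spin_unlock(&sc->recv_io.reassembly.lock);

	/* consumer, smb_direct_read(): */
	if (sc->recv_io.reassembly.data_length >= size) {
		virt_rmb();	/* pairs with the virt_wmb() above */
		queue_length = sc->recv_io.reassembly.queue_length;
		/* ... dequeue entries, mostly without taking the lock ... */
	}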
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 90 ++++++++++++++++------------------ 1 file changed, 43 insertions(+), 47 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 8eec651e2f9eaa..7853b3b4d8daac 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -95,16 +95,8 @@ struct smb_direct_transport { struct smbdirect_socket socket; - bool full_packet_received; wait_queue_head_t wait_status; - spinlock_t reassembly_queue_lock; - struct list_head reassembly_queue; - int reassembly_data_length; - int reassembly_queue_length; - int first_entry_offset; - wait_queue_head_t wait_reassembly_queue; - spinlock_t receive_credit_lock; int recv_credits; int count_avail_recvmsg; @@ -255,9 +247,11 @@ static void enqueue_reassembly(struct smb_direct_transport *t, struct smbdirect_recv_io *recvmsg, int data_length) { - spin_lock(&t->reassembly_queue_lock); - list_add_tail(&recvmsg->list, &t->reassembly_queue); - t->reassembly_queue_length++; + struct smbdirect_socket *sc = &t->socket; + + spin_lock(&sc->recv_io.reassembly.lock); + list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list); + sc->recv_io.reassembly.queue_length++; /* * Make sure reassembly_data_length is updated after list and * reassembly_queue_length are updated. On the dequeue side @@ -265,14 +259,16 @@ static void enqueue_reassembly(struct smb_direct_transport *t, * if reassembly_queue_length and list is up to date */ virt_wmb(); - t->reassembly_data_length += data_length; - spin_unlock(&t->reassembly_queue_lock); + sc->recv_io.reassembly.data_length += data_length; + spin_unlock(&sc->recv_io.reassembly.lock); } static struct smbdirect_recv_io *get_first_reassembly(struct smb_direct_transport *t) { - if (!list_empty(&t->reassembly_queue)) - return list_first_entry(&t->reassembly_queue, + struct smbdirect_socket *sc = &t->socket; + + if (!list_empty(&sc->recv_io.reassembly.list)) + return list_first_entry(&sc->recv_io.reassembly.list, struct smbdirect_recv_io, list); else return NULL; @@ -337,11 +333,11 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc->status = SMBDIRECT_SOCKET_CREATED; init_waitqueue_head(&t->wait_status); - spin_lock_init(&t->reassembly_queue_lock); - INIT_LIST_HEAD(&t->reassembly_queue); - t->reassembly_data_length = 0; - t->reassembly_queue_length = 0; - init_waitqueue_head(&t->wait_reassembly_queue); + spin_lock_init(&sc->recv_io.reassembly.lock); + INIT_LIST_HEAD(&sc->recv_io.reassembly.list); + sc->recv_io.reassembly.data_length = 0; + sc->recv_io.reassembly.queue_length = 0; + init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); init_waitqueue_head(&t->wait_send_credits); init_waitqueue_head(&t->wait_rw_credits); @@ -398,17 +394,17 @@ static void free_transport(struct smb_direct_transport *t) ksmbd_debug(RDMA, "drain the reassembly queue\n"); do { - spin_lock(&t->reassembly_queue_lock); + spin_lock(&sc->recv_io.reassembly.lock); recvmsg = get_first_reassembly(t); if (recvmsg) { list_del(&recvmsg->list); - spin_unlock(&t->reassembly_queue_lock); + spin_unlock(&sc->recv_io.reassembly.lock); put_recvmsg(t, recvmsg); } else { - spin_unlock(&t->reassembly_queue_lock); + spin_unlock(&sc->recv_io.reassembly.lock); } } while (recvmsg); - t->reassembly_data_length = 0; + sc->recv_io.reassembly.data_length = 0; if 
(sc->ib.send_cq) ib_free_cq(sc->ib.send_cq); @@ -542,7 +538,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) return; } t->negotiation_requested = true; - t->full_packet_received = true; + sc->recv_io.reassembly.full_packet_received = true; sc->status = SMBDIRECT_SOCKET_CONNECTED; enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); @@ -579,13 +575,13 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } if (data_length) { - if (t->full_packet_received) + if (sc->recv_io.reassembly.full_packet_received) recvmsg->first_segment = true; if (le32_to_cpu(data_transfer->remaining_data_length)) - t->full_packet_received = false; + sc->recv_io.reassembly.full_packet_received = false; else - t->full_packet_received = true; + sc->recv_io.reassembly.full_packet_received = true; spin_lock(&t->receive_credit_lock); receive_credits = --(t->recv_credits); @@ -615,7 +611,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if (data_length) { enqueue_reassembly(t, recvmsg, (int)data_length); - wake_up_interruptible(&t->wait_reassembly_queue); + wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); } else put_recvmsg(t, recvmsg); @@ -693,7 +689,7 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, * the only one reading from the front of the queue. The transport * may add more entries to the back of the queue at the same time */ - if (st->reassembly_data_length >= size) { + if (sc->recv_io.reassembly.data_length >= size) { int queue_length; int queue_removed = 0; @@ -705,10 +701,10 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, * updated in SOFTIRQ as more data is received */ virt_rmb(); - queue_length = st->reassembly_queue_length; + queue_length = sc->recv_io.reassembly.queue_length; data_read = 0; to_read = size; - offset = st->first_entry_offset; + offset = sc->recv_io.reassembly.first_entry_offset; while (data_read < size) { recvmsg = get_first_reassembly(st); data_transfer = smbdirect_recv_io_payload(recvmsg); @@ -751,9 +747,9 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, if (queue_length) { list_del(&recvmsg->list); } else { - spin_lock_irq(&st->reassembly_queue_lock); + spin_lock_irq(&sc->recv_io.reassembly.lock); list_del(&recvmsg->list); - spin_unlock_irq(&st->reassembly_queue_lock); + spin_unlock_irq(&sc->recv_io.reassembly.lock); } queue_removed++; put_recvmsg(st, recvmsg); @@ -766,10 +762,10 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, data_read += to_copy; } - spin_lock_irq(&st->reassembly_queue_lock); - st->reassembly_data_length -= data_read; - st->reassembly_queue_length -= queue_removed; - spin_unlock_irq(&st->reassembly_queue_lock); + spin_lock_irq(&sc->recv_io.reassembly.lock); + sc->recv_io.reassembly.data_length -= data_read; + sc->recv_io.reassembly.queue_length -= queue_removed; + spin_unlock_irq(&sc->recv_io.reassembly.lock); spin_lock(&st->receive_credit_lock); st->count_avail_recvmsg += queue_removed; @@ -780,18 +776,18 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, spin_unlock(&st->receive_credit_lock); } - st->first_entry_offset = offset; + sc->recv_io.reassembly.first_entry_offset = offset; ksmbd_debug(RDMA, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", - data_read, st->reassembly_data_length, - st->first_entry_offset); + data_read, sc->recv_io.reassembly.data_length, + sc->recv_io.reassembly.first_entry_offset); read_rfc1002_done: return data_read; } ksmbd_debug(RDMA, 
"wait_event on more data\n"); - rc = wait_event_interruptible(st->wait_reassembly_queue, - st->reassembly_data_length >= size || + rc = wait_event_interruptible(sc->recv_io.reassembly.wait_queue, + sc->recv_io.reassembly.data_length >= size || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) return -EINTR; @@ -1598,7 +1594,7 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, sc->status = SMBDIRECT_SOCKET_DISCONNECTED; wake_up_interruptible(&t->wait_status); - wake_up_interruptible(&t->wait_reassembly_queue); + wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); wake_up(&t->wait_send_credits); break; } @@ -2065,10 +2061,10 @@ static int smb_direct_prepare(struct ksmbd_transport *t) ret = smb_direct_send_negotiate_response(st, ret); out: - spin_lock_irq(&st->reassembly_queue_lock); - st->reassembly_queue_length--; + spin_lock_irq(&sc->recv_io.reassembly.lock); + sc->recv_io.reassembly.queue_length--; list_del(&recvmsg->list); - spin_unlock_irq(&st->reassembly_queue_lock); + spin_unlock_irq(&sc->recv_io.reassembly.lock); put_recvmsg(st, recvmsg); return ret; From ea20d02842c8f38ed36045b46d8e95bcfa0514f2 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 6 Aug 2025 19:36:02 +0200 Subject: [PATCH 3063/3206] smb: server: make use of SMBDIRECT_RECV_IO_MAX_SGE Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 7853b3b4d8daac..b027382c554e29 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -37,7 +37,6 @@ #define SMB_DIRECT_NEGOTIATE_TIMEOUT 120 #define SMB_DIRECT_MAX_SEND_SGES 6 -#define SMB_DIRECT_MAX_RECV_SGES 1 /* * Default maximum number of RDMA read/write outstanding on this connection @@ -1834,7 +1833,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, device->attrs.max_send_sge); return -EINVAL; } - if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { + if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) { pr_err("warning: device max_recv_sge = %d too small\n", device->attrs.max_recv_sge); return -EINVAL; @@ -1858,7 +1857,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_wr = max_send_wrs; cap->max_recv_wr = sp->recv_credit_max; cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; - cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; + cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; cap->max_inline_data = 0; cap->max_rdma_ctxs = t->max_rw_credits; return 0; From 442959002fcce509069025aab2aaab0344709e6f Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 6 Aug 2025 19:36:03 +0200 Subject: [PATCH 3064/3206] smb: server: make use of struct smbdirect_send_io This is already used by the client and will allow us to use common helper functions soon. 
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 87 +++++++++++++++------------------- 1 file changed, 39 insertions(+), 48 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index b027382c554e29..b4951ecf5c88b3 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -36,8 +36,6 @@ /* SMB_DIRECT negotiation timeout in seconds */ #define SMB_DIRECT_NEGOTIATE_TIMEOUT 120 -#define SMB_DIRECT_MAX_SEND_SGES 6 - /* * Default maximum number of RDMA read/write outstanding on this connection * This value is possibly decreased during QP creation on hardware limit @@ -143,16 +141,6 @@ struct smb_direct_send_ctx { unsigned int remote_key; }; -struct smb_direct_sendmsg { - struct smb_direct_transport *transport; - struct ib_send_wr wr; - struct list_head list; - int num_sge; - struct ib_sge sge[SMB_DIRECT_MAX_SEND_SGES]; - struct ib_cqe cqe; - u8 packet[]; -}; - struct smb_direct_rdma_rw_msg { struct smb_direct_transport *t; struct ib_cqe cqe; @@ -418,22 +406,23 @@ static void free_transport(struct smb_direct_transport *t) ksmbd_conn_free(KSMBD_TRANS(t)->conn); } -static struct smb_direct_sendmsg +static struct smbdirect_send_io *smb_direct_alloc_sendmsg(struct smb_direct_transport *t) { - struct smb_direct_sendmsg *msg; + struct smbdirect_socket *sc = &t->socket; + struct smbdirect_send_io *msg; msg = mempool_alloc(t->sendmsg_mempool, KSMBD_DEFAULT_GFP); if (!msg) return ERR_PTR(-ENOMEM); - msg->transport = t; - INIT_LIST_HEAD(&msg->list); + msg->socket = sc; + INIT_LIST_HEAD(&msg->sibling_list); msg->num_sge = 0; return msg; } static void smb_direct_free_sendmsg(struct smb_direct_transport *t, - struct smb_direct_sendmsg *msg) + struct smbdirect_send_io *msg) { struct smbdirect_socket *sc = &t->socket; int i; @@ -839,12 +828,14 @@ static void smb_direct_post_recv_credits(struct work_struct *work) static void send_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smb_direct_sendmsg *sendmsg, *sibling; + struct smbdirect_send_io *sendmsg, *sibling; struct smb_direct_transport *t; + struct smbdirect_socket *sc; struct list_head *pos, *prev, *end; - sendmsg = container_of(wc->wr_cqe, struct smb_direct_sendmsg, cqe); - t = sendmsg->transport; + sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); + sc = sendmsg->socket; + t = container_of(sc, struct smb_direct_transport, socket); ksmbd_debug(RDMA, "Send completed. status='%s (%d)', opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, @@ -863,13 +854,13 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) /* iterate and free the list of messages in reverse. the list's head * is invalid. 
*/ - for (pos = &sendmsg->list, prev = pos->prev, end = sendmsg->list.next; + for (pos = &sendmsg->sibling_list, prev = pos->prev, end = sendmsg->sibling_list.next; prev != end; pos = prev, prev = prev->prev) { - sibling = container_of(pos, struct smb_direct_sendmsg, list); + sibling = container_of(pos, struct smbdirect_send_io, sibling_list); smb_direct_free_sendmsg(t, sibling); } - sibling = container_of(pos, struct smb_direct_sendmsg, list); + sibling = container_of(pos, struct smbdirect_send_io, sibling_list); smb_direct_free_sendmsg(t, sibling); } @@ -917,18 +908,18 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, struct smb_direct_send_ctx *send_ctx, bool is_last) { - struct smb_direct_sendmsg *first, *last; + struct smbdirect_send_io *first, *last; int ret; if (list_empty(&send_ctx->msg_list)) return 0; first = list_first_entry(&send_ctx->msg_list, - struct smb_direct_sendmsg, - list); + struct smbdirect_send_io, + sibling_list); last = list_last_entry(&send_ctx->msg_list, - struct smb_direct_sendmsg, - list); + struct smbdirect_send_io, + sibling_list); last->wr.send_flags = IB_SEND_SIGNALED; last->wr.wr_cqe = &last->cqe; @@ -946,7 +937,7 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, atomic_add(send_ctx->wr_cnt, &t->send_credits); wake_up(&t->wait_send_credits); list_for_each_entry_safe(first, last, &send_ctx->msg_list, - list) { + sibling_list) { smb_direct_free_sendmsg(t, first); } } @@ -1005,11 +996,11 @@ static int calc_rw_credits(struct smb_direct_transport *t, static int smb_direct_create_header(struct smb_direct_transport *t, int size, int remaining_data_length, - struct smb_direct_sendmsg **sendmsg_out) + struct smbdirect_send_io **sendmsg_out) { struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smb_direct_sendmsg *sendmsg; + struct smbdirect_send_io *sendmsg; struct smbdirect_data_transfer *packet; int header_length; int ret; @@ -1112,7 +1103,7 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, static int post_sendmsg(struct smb_direct_transport *t, struct smb_direct_send_ctx *send_ctx, - struct smb_direct_sendmsg *msg) + struct smbdirect_send_io *msg) { struct smbdirect_socket *sc = &t->socket; int i; @@ -1132,14 +1123,14 @@ static int post_sendmsg(struct smb_direct_transport *t, msg->wr.wr_cqe = NULL; msg->wr.send_flags = 0; if (!list_empty(&send_ctx->msg_list)) { - struct smb_direct_sendmsg *last; + struct smbdirect_send_io *last; last = list_last_entry(&send_ctx->msg_list, - struct smb_direct_sendmsg, - list); + struct smbdirect_send_io, + sibling_list); last->wr.next = &msg->wr; } - list_add_tail(&msg->list, &send_ctx->msg_list); + list_add_tail(&msg->sibling_list, &send_ctx->msg_list); send_ctx->wr_cnt++; return 0; } @@ -1156,9 +1147,9 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, { struct smbdirect_socket *sc = &t->socket; int i, j, ret; - struct smb_direct_sendmsg *msg; + struct smbdirect_send_io *msg; int data_length; - struct scatterlist sg[SMB_DIRECT_MAX_SEND_SGES - 1]; + struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1]; ret = wait_for_send_credits(t, send_ctx); if (ret) @@ -1179,16 +1170,16 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, struct ib_sge *sge; int sg_cnt; - sg_init_table(sg, SMB_DIRECT_MAX_SEND_SGES - 1); + sg_init_table(sg, SMBDIRECT_SEND_IO_MAX_SGE - 1); sg_cnt = get_mapped_sg_list(sc->ib.dev, iov[i].iov_base, iov[i].iov_len, - sg, SMB_DIRECT_MAX_SEND_SGES 
- 1, + sg, SMBDIRECT_SEND_IO_MAX_SGE - 1, DMA_TO_DEVICE); if (sg_cnt <= 0) { pr_err("failed to map buffer\n"); ret = -ENOMEM; goto err; - } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES) { + } else if (sg_cnt + msg->num_sge > SMBDIRECT_SEND_IO_MAX_SGE) { pr_err("buffer not fitted into sges\n"); ret = -E2BIG; ib_dma_unmap_sg(sc->ib.dev, sg, sg_cnt, @@ -1246,7 +1237,7 @@ static int smb_direct_writev(struct ksmbd_transport *t, smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key); while (remaining_data_length) { - struct kvec vecs[SMB_DIRECT_MAX_SEND_SGES - 1]; /* minus smbdirect hdr */ + struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */ size_t possible_bytes = max_iov_size; size_t possible_vecs; size_t bytes = 0; @@ -1634,7 +1625,7 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, { struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smb_direct_sendmsg *sendmsg; + struct smbdirect_send_io *sendmsg; struct smbdirect_negotiate_resp *resp; int ret; @@ -1782,7 +1773,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, */ sp->max_send_size = smb_direct_max_send_size; max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3; - if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) { + if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) { pr_err("max_send_size %d is too large\n", sp->max_send_size); return -EINVAL; } @@ -1828,7 +1819,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } - if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) { + if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) { pr_err("warning: device max_send_sge = %d too small\n", device->attrs.max_send_sge); return -EINVAL; @@ -1856,7 +1847,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_wr = max_send_wrs; cap->max_recv_wr = sp->recv_credit_max; - cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; + cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; cap->max_inline_data = 0; cap->max_rdma_ctxs = t->max_rw_credits; @@ -1891,9 +1882,9 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) int i; struct smbdirect_recv_io *recvmsg; - snprintf(name, sizeof(name), "smb_direct_rqst_pool_%p", t); + snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", t); t->sendmsg_cache = kmem_cache_create(name, - sizeof(struct smb_direct_sendmsg) + + sizeof(struct smbdirect_send_io) + sizeof(struct smbdirect_negotiate_resp), 0, SLAB_HWCACHE_ALIGN, NULL); if (!t->sendmsg_cache) From 98dc77b310644b6716ed2da829697c128b2e5d51 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 6 Aug 2025 19:36:04 +0200 Subject: [PATCH 3065/3206] smb: server: make use of smbdirect_socket.{send,recv}_io.mem.{cache,pool} This will allow common helper functions to be created later. 
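For orientation, a sketch of what a shared pool-setup helper could look like once both sides keep their caches and pools in struct smbdirect_socket. The function name and exact signature are hypothetical; the allocation pattern follows the server code in the diff below:

static int smbdirect_create_recv_pool(struct smbdirect_socket *sc,
				      size_t max_recv_size,
				      unsigned int credits)
{
	char name[80];

	snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc);
	sc->recv_io.mem.cache =
		kmem_cache_create(name,
				  sizeof(struct smbdirect_recv_io) + max_recv_size,
				  0, SLAB_HWCACHE_ALIGN, NULL);
	if (!sc->recv_io.mem.cache)
		return -ENOMEM;

	/* a pre-sized pool keeps receive allocation reliable under pressure */
	sc->recv_io.mem.pool = mempool_create(credits, mempool_alloc_slab,
					      mempool_free_slab,
					      sc->recv_io.mem.cache);
	if (!sc->recv_io.mem.pool) {
		kmem_cache_destroy(sc->recv_io.mem.cache);
		sc->recv_io.mem.cache = NULL;
		return -ENOMEM;
	}
	return 0;
}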
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 50 ++++++++++++++++------------------ 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index b4951ecf5c88b3..c400833a8c1681 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -109,11 +109,6 @@ struct smb_direct_transport { wait_queue_head_t wait_send_credits; wait_queue_head_t wait_rw_credits; - mempool_t *sendmsg_mempool; - struct kmem_cache *sendmsg_cache; - mempool_t *recvmsg_mempool; - struct kmem_cache *recvmsg_cache; - wait_queue_head_t wait_send_pending; atomic_t send_pending; @@ -412,7 +407,7 @@ static struct smbdirect_send_io struct smbdirect_socket *sc = &t->socket; struct smbdirect_send_io *msg; - msg = mempool_alloc(t->sendmsg_mempool, KSMBD_DEFAULT_GFP); + msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP); if (!msg) return ERR_PTR(-ENOMEM); msg->socket = sc; @@ -436,7 +431,7 @@ static void smb_direct_free_sendmsg(struct smb_direct_transport *t, msg->sge[i].addr, msg->sge[i].length, DMA_TO_DEVICE); } - mempool_free(msg, t->sendmsg_mempool); + mempool_free(msg, sc->send_io.mem.pool); } static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg) @@ -1856,22 +1851,23 @@ static int smb_direct_init_params(struct smb_direct_transport *t, static void smb_direct_destroy_pools(struct smb_direct_transport *t) { + struct smbdirect_socket *sc = &t->socket; struct smbdirect_recv_io *recvmsg; while ((recvmsg = get_free_recvmsg(t))) - mempool_free(recvmsg, t->recvmsg_mempool); + mempool_free(recvmsg, sc->recv_io.mem.pool); - mempool_destroy(t->recvmsg_mempool); - t->recvmsg_mempool = NULL; + mempool_destroy(sc->recv_io.mem.pool); + sc->recv_io.mem.pool = NULL; - kmem_cache_destroy(t->recvmsg_cache); - t->recvmsg_cache = NULL; + kmem_cache_destroy(sc->recv_io.mem.cache); + sc->recv_io.mem.cache = NULL; - mempool_destroy(t->sendmsg_mempool); - t->sendmsg_mempool = NULL; + mempool_destroy(sc->send_io.mem.pool); + sc->send_io.mem.pool = NULL; - kmem_cache_destroy(t->sendmsg_cache); - t->sendmsg_cache = NULL; + kmem_cache_destroy(sc->send_io.mem.cache); + sc->send_io.mem.cache = NULL; } static int smb_direct_create_pools(struct smb_direct_transport *t) @@ -1883,35 +1879,35 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) struct smbdirect_recv_io *recvmsg; snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", t); - t->sendmsg_cache = kmem_cache_create(name, + sc->send_io.mem.cache = kmem_cache_create(name, sizeof(struct smbdirect_send_io) + sizeof(struct smbdirect_negotiate_resp), 0, SLAB_HWCACHE_ALIGN, NULL); - if (!t->sendmsg_cache) + if (!sc->send_io.mem.cache) return -ENOMEM; - t->sendmsg_mempool = mempool_create(sp->send_credit_target, + sc->send_io.mem.pool = mempool_create(sp->send_credit_target, mempool_alloc_slab, mempool_free_slab, - t->sendmsg_cache); - if (!t->sendmsg_mempool) + sc->send_io.mem.cache); + if (!sc->send_io.mem.pool) goto err; snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", t); - t->recvmsg_cache = kmem_cache_create(name, + sc->recv_io.mem.cache = kmem_cache_create(name, sizeof(struct smbdirect_recv_io) + sp->max_recv_size, 0, SLAB_HWCACHE_ALIGN, NULL); - if (!t->recvmsg_cache) + if (!sc->recv_io.mem.cache) goto err; - t->recvmsg_mempool = + sc->recv_io.mem.pool = 
mempool_create(sp->recv_credit_max, mempool_alloc_slab, - mempool_free_slab, t->recvmsg_cache); - if (!t->recvmsg_mempool) + mempool_free_slab, sc->recv_io.mem.cache); + if (!sc->recv_io.mem.pool) goto err; for (i = 0; i < sp->recv_credit_max; i++) { - recvmsg = mempool_alloc(t->recvmsg_mempool, KSMBD_DEFAULT_GFP); + recvmsg = mempool_alloc(sc->recv_io.mem.pool, KSMBD_DEFAULT_GFP); if (!recvmsg) goto err; recvmsg->socket = sc; From 27bc4c57f094917c450f1a2d3097bfea728fc1a5 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 10:06:50 +0200 Subject: [PATCH 3066/3206] smb: server: make only use of wake_up[_all]() in transport_rdma.c wake_up_interruptible() doesn't wake up tasks waiting with wait_event(). So we use wake_up[_all]() instead, in order to wake up all tasks and simplify the logic. As we currently don't use any wait_event_*_exclusive() it doesn't really matter if we use wake_up() or wake_up_all(). But in this patch I try to use wake_up() for expected situations and wake_up_all() for broken connection situations. That way we won't need to adjust things in the future if we ever use wait_event_*_exclusive() in order to wake up only one process that should make progress. Changing the wait_event_*() code in order to keep wait_event(), wait_event_interruptible() and wait_event_interruptible_timeout(), or changing them to wait_event_killable(), wait_event_killable_timeout() or wait_event_killable_exclusive(), is something to think about in a future patch. The goal here is to avoid tasks never being woken up and freezing forever. Also note that this patch only changes the existing wake_up*() calls. Adding more wake_up*() calls for other wait queues is also deferred to a future patch. Link: https://lore.kernel.org/linux-cifs/13851363-0dc9-465c-9ced-3ede4904eef0@samba.org/T/#t Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c400833a8c1681..67a0a2974dcbb8 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -357,7 +357,7 @@ static void free_transport(struct smb_direct_transport *t) struct smbdirect_socket *sc = &t->socket; struct smbdirect_recv_io *recvmsg; - wake_up_interruptible(&t->wait_send_credits); + wake_up_all(&t->wait_send_credits); ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n"); wait_event(t->wait_send_pending, @@ -524,7 +524,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = true; sc->status = SMBDIRECT_SOCKET_CONNECTED; enqueue_reassembly(t, recvmsg, 0); - wake_up_interruptible(&t->wait_status); + wake_up(&t->wait_status); return; case SMBDIRECT_EXPECT_DATA_TRANSFER: { struct smbdirect_data_transfer *data_transfer = @@ -587,14 +587,14 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) queue_work(smb_direct_wq, &t->send_immediate_work); if (atomic_read(&t->send_credits) > 0) - wake_up_interruptible(&t->wait_send_credits); + wake_up(&t->wait_send_credits); if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count)) queue_work(smb_direct_wq, &t->post_recv_credits_work); if (data_length) { enqueue_reassembly(t, recvmsg, (int)data_length); - wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); +
wake_up(&sc->recv_io.reassembly.wait_queue); } else put_recvmsg(t, recvmsg); @@ -1570,7 +1570,7 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: { sc->status = SMBDIRECT_SOCKET_CONNECTED; - wake_up_interruptible(&t->wait_status); + wake_up(&t->wait_status); break; } case RDMA_CM_EVENT_DEVICE_REMOVAL: @@ -1578,14 +1578,14 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, ib_drain_qp(sc->ib.qp); sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_interruptible(&t->wait_status); - wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); - wake_up(&t->wait_send_credits); + wake_up_all(&t->wait_status); + wake_up_all(&sc->recv_io.reassembly.wait_queue); + wake_up_all(&t->wait_send_credits); break; } case RDMA_CM_EVENT_CONNECT_ERROR: { sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_interruptible(&t->wait_status); + wake_up_all(&t->wait_status); break; } default: From d4b86b49093cbe98b68461a1b1af958f1a0ee16d Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 18:23:59 +0200 Subject: [PATCH 3067/3206] smb: server: add a pr_info() when the server starts running We already have a message like: ksmbd: kill command received when the server stops running. This makes it easier, when debugging, to match any possible warnings/errors in dmesg with a restarted server. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/server.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 8c9c49c3a0a473..40420544cc25a2 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -365,6 +365,7 @@ static void server_ctrl_handle_init(struct server_ctrl_struct *ctrl) return; } + pr_info("running\n"); WRITE_ONCE(server_conf.state, SERVER_STATE_RUNNING); } From a7eef6144c97bd7031d40ebc6e8fdd038ea3f46f Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 11 Aug 2025 18:23:03 +0200 Subject: [PATCH 3068/3206] smb: server: queue post_recv_credits_work in put_recvmsg() and avoid count_avail_recvmsg This is basically what the client is doing in put_receive_buffer(). It means we don't need complicated work to maintain count_avail_recvmsg. But we keep the logic to queue post_recv_credits_work if the peer raises the requested credit_target and put_receive_buffer() is not called.
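The resulting put path is short enough to show in full. This is a condensed view of the code after this patch (DMA unmapping of the sge is elided), not new logic:

static void put_recvmsg(struct smb_direct_transport *t,
			struct smbdirect_recv_io *recvmsg)
{
	struct smbdirect_socket *sc = &t->socket;

	/* DMA unmap of recvmsg->sge elided for brevity */

	spin_lock(&sc->recv_io.free.lock);
	list_add(&recvmsg->list, &sc->recv_io.free.list);
	spin_unlock(&sc->recv_io.free.lock);

	/* every returned buffer can immediately back a fresh receive */
	queue_work(smb_direct_wq, &t->post_recv_credits_work);
}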
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 45 +++++++++------------------------- 1 file changed, 11 insertions(+), 34 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 67a0a2974dcbb8..6c34a271ae849d 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -96,7 +96,6 @@ struct smb_direct_transport { spinlock_t receive_credit_lock; int recv_credits; - int count_avail_recvmsg; int recv_credit_target; atomic_t send_credits; @@ -183,13 +182,6 @@ static inline void return (void *)recvmsg->packet; } -static inline bool is_receive_credit_post_required(int receive_credits, - int avail_recvmsg_count) -{ - return receive_credits <= (smb_direct_receive_credit_max >> 3) && - avail_recvmsg_count >= (receive_credits >> 2); -} - static struct smbdirect_recv_io *get_free_recvmsg(struct smb_direct_transport *t) { @@ -223,6 +215,8 @@ static void put_recvmsg(struct smb_direct_transport *t, spin_lock(&sc->recv_io.free.lock); list_add(&recvmsg->list, &sc->recv_io.free.list); spin_unlock(&sc->recv_io.free.lock); + + queue_work(smb_direct_wq, &t->post_recv_credits_work); } static void enqueue_reassembly(struct smb_direct_transport *t, @@ -530,7 +524,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_data_transfer *data_transfer = (struct smbdirect_data_transfer *)recvmsg->packet; u32 remaining_data_length, data_offset, data_length; - int avail_recvmsg_count, receive_credits; + int old_recv_credit_target; if (wc->byte_len < offsetof(struct smbdirect_data_transfer, padding)) { @@ -565,18 +559,13 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = false; else sc->recv_io.reassembly.full_packet_received = true; - - spin_lock(&t->receive_credit_lock); - receive_credits = --(t->recv_credits); - avail_recvmsg_count = t->count_avail_recvmsg; - spin_unlock(&t->receive_credit_lock); - } else { - spin_lock(&t->receive_credit_lock); - receive_credits = --(t->recv_credits); - avail_recvmsg_count = ++(t->count_avail_recvmsg); - spin_unlock(&t->receive_credit_lock); } + spin_lock(&t->receive_credit_lock); + t->recv_credits -= 1; + spin_unlock(&t->receive_credit_lock); + + old_recv_credit_target = t->recv_credit_target; t->recv_credit_target = le16_to_cpu(data_transfer->credits_requested); atomic_add(le16_to_cpu(data_transfer->credits_granted), @@ -589,10 +578,10 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if (atomic_read(&t->send_credits) > 0) wake_up(&t->wait_send_credits); - if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count)) - queue_work(smb_direct_wq, &t->post_recv_credits_work); - if (data_length) { + if (t->recv_credit_target > old_recv_credit_target) + queue_work(smb_direct_wq, &t->post_recv_credits_work); + enqueue_reassembly(t, recvmsg, (int)data_length); wake_up(&sc->recv_io.reassembly.wait_queue); } else @@ -750,15 +739,6 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, sc->recv_io.reassembly.queue_length -= queue_removed; spin_unlock_irq(&sc->recv_io.reassembly.lock); - spin_lock(&st->receive_credit_lock); - st->count_avail_recvmsg += queue_removed; - if (is_receive_credit_post_required(st->recv_credits, st->count_avail_recvmsg)) { - spin_unlock(&st->receive_credit_lock); - queue_work(smb_direct_wq, 
&st->post_recv_credits_work); - } else { - spin_unlock(&st->receive_credit_lock); - } - sc->recv_io.reassembly.first_entry_offset = offset; ksmbd_debug(RDMA, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", @@ -810,7 +790,6 @@ static void smb_direct_post_recv_credits(struct work_struct *work) spin_lock(&t->receive_credit_lock); t->recv_credits += credits; - t->count_avail_recvmsg -= credits; spin_unlock(&t->receive_credit_lock); spin_lock(&t->lock_new_recv_credits); @@ -1826,7 +1805,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t, } t->recv_credits = 0; - t->count_avail_recvmsg = 0; sp->recv_credit_max = smb_direct_receive_credit_max; t->recv_credit_target = 10; @@ -1914,7 +1892,6 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) recvmsg->sge.length = 0; list_add(&recvmsg->list, &sc->recv_io.free.list); } - t->count_avail_recvmsg = sp->recv_credit_max; return 0; err: From c82a53211a378e70866aac8ec49f4df64905ccc2 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 7 Aug 2025 11:20:55 +0200 Subject: [PATCH 3069/3206] smb: server: make use of smbdirect_socket.status_wait This will allow us to have common helper functions soon. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 6c34a271ae849d..a338f9496deaf6 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -92,8 +92,6 @@ struct smb_direct_transport { struct smbdirect_socket socket; - wait_queue_head_t wait_status; - spinlock_t receive_credit_lock; int recv_credits; int recv_credit_target; @@ -307,7 +305,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) spin_lock_init(&sc->recv_io.free.lock); sc->status = SMBDIRECT_SOCKET_CREATED; - init_waitqueue_head(&t->wait_status); + init_waitqueue_head(&sc->status_wait); spin_lock_init(&sc->recv_io.reassembly.lock); INIT_LIST_HEAD(&sc->recv_io.reassembly.list); @@ -518,7 +516,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = true; sc->status = SMBDIRECT_SOCKET_CONNECTED; enqueue_reassembly(t, recvmsg, 0); - wake_up(&t->wait_status); + wake_up(&sc->status_wait); return; case SMBDIRECT_EXPECT_DATA_TRANSFER: { struct smbdirect_data_transfer *data_transfer = @@ -1522,7 +1520,7 @@ static void smb_direct_disconnect(struct ksmbd_transport *t) ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id); smb_direct_disconnect_rdma_work(&st->disconnect_work); - wait_event_interruptible(st->wait_status, + wait_event_interruptible(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); free_transport(st); } @@ -1549,7 +1547,7 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: { sc->status = SMBDIRECT_SOCKET_CONNECTED; - wake_up(&t->wait_status); + wake_up(&sc->status_wait); break; } case RDMA_CM_EVENT_DEVICE_REMOVAL: @@ -1557,14 +1555,14 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, ib_drain_qp(sc->ib.qp); sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_all(&t->wait_status); + wake_up_all(&sc->status_wait); wake_up_all(&sc->recv_io.reassembly.wait_queue); 
wake_up_all(&t->wait_send_credits); break; } case RDMA_CM_EVENT_CONNECT_ERROR: { sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_all(&t->wait_status); + wake_up_all(&sc->status_wait); break; } default: @@ -1997,7 +1995,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) int ret; ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); - ret = wait_event_interruptible_timeout(st->wait_status, + ret = wait_event_interruptible_timeout(sc->status_wait, st->negotiation_requested || sc->status == SMBDIRECT_SOCKET_DISCONNECTED, SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); From e2d5e516c66383bed098b60be4925f9cbe9e3530 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 12:46:25 +0200 Subject: [PATCH 3070/3206] smb: server: only turn into SMBDIRECT_SOCKET_CONNECTED when negotiation is done From SMBDIRECT_SOCKET_CREATED we now go via the following stages: 1. SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED This indicates rdma_accept() needs to be called 2. SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING This waits for RDMA_CM_EVENT_ESTABLISHED to arrive 3. SMBDIRECT_SOCKET_NEGOTIATE_NEEDED This waits for the negotiate request to arrive 4. SMBDIRECT_SOCKET_NEGOTIATE_RUNNING This indicates the negotiate request arrived and needs to be processed 5. SMBDIRECT_SOCKET_CONNECTED The connection is ready to use This avoids the extra 'bool negotiation_requested' and makes the steps clearer. In the future we may want to add trace points when changing the states, which would be useful for debugging. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 84 ++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 15 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index a338f9496deaf6..c5850dcf2e19c3 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -113,8 +113,6 @@ struct smb_direct_transport { struct work_struct send_immediate_work; struct work_struct disconnect_work; - bool negotiation_requested; - bool legacy_iwarp; u8 initiator_depth; u8 responder_resources; @@ -255,19 +253,53 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) disconnect_work); struct smbdirect_socket *sc = &t->socket; - if (sc->status == SMBDIRECT_SOCKET_CONNECTED) { + /* + * make sure this and other work is not queued again + * but here we don't block and avoid + * disable[_delayed]_work_sync() + */ + disable_work(&t->disconnect_work); + disable_work(&t->post_recv_credits_work); + disable_work(&t->send_immediate_work); + + switch (sc->status) { + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_CONNECTED: + case SMBDIRECT_SOCKET_ERROR: sc->status = SMBDIRECT_SOCKET_DISCONNECTING; rdma_disconnect(sc->rdma.cm_id); + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + /* + * rdma_accept() never reached + * RDMA_CM_EVENT_ESTABLISHED + */ + sc->status =
SMBDIRECT_SOCKET_DISCONNECTED; + break; + + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + break; } } static void smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t) { - struct smbdirect_socket *sc = &t->socket; - - if (sc->status == SMBDIRECT_SOCKET_CONNECTED) - queue_work(smb_direct_wq, &t->disconnect_work); + queue_work(smb_direct_wq, &t->disconnect_work); } static void smb_direct_send_immediate_work(struct work_struct *work) @@ -512,9 +544,9 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(t); return; } - t->negotiation_requested = true; sc->recv_io.reassembly.full_packet_received = true; - sc->status = SMBDIRECT_SOCKET_CONNECTED; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; enqueue_reassembly(t, recvmsg, 0); wake_up(&sc->status_wait); return; @@ -1546,7 +1578,8 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: { - sc->status = SMBDIRECT_SOCKET_CONNECTED; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; wake_up(&sc->status_wait); break; } @@ -1555,6 +1588,7 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, ib_drain_qp(sc->ib.qp); sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + smb_direct_disconnect_rdma_work(&sc->disconnect_work); wake_up_all(&sc->status_wait); wake_up_all(&sc->recv_io.reassembly.wait_queue); wake_up_all(&t->wait_send_credits); @@ -1611,6 +1645,8 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, resp->min_version = SMB_DIRECT_VERSION_LE; resp->max_version = SMB_DIRECT_VERSION_LE; resp->status = STATUS_NOT_SUPPORTED; + + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; } else { resp->status = STATUS_SUCCESS; resp->min_version = SMB_DIRECT_VERSION_LE; @@ -1627,6 +1663,7 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, cpu_to_le32(sp->max_fragmented_recv_size); sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; + sc->status = SMBDIRECT_SOCKET_CONNECTED; } sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, @@ -1682,6 +1719,8 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY; conn_param.flow_control = 0; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; ret = rdma_accept(sc->rdma.cm_id, &conn_param); if (ret) { pr_err("error at rdma_accept: %d\n", ret); @@ -1696,6 +1735,9 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) int ret; struct smbdirect_recv_io *recvmsg; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; + sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ; recvmsg = get_free_recvmsg(t); @@ -1708,7 +1750,6 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) goto out_err; } - t->negotiation_requested = false; ret = smb_direct_accept_client(t); if (ret) { pr_err("Can't accept client\n"); @@ -1994,12 +2035,25 @@ static int smb_direct_prepare(struct ksmbd_transport *t) struct smbdirect_negotiate_req *req; int ret; + /* + * We are waiting to pass the following states: + * + * SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED + * SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING + * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED + * + * To 
finally get to SMBDIRECT_SOCKET_NEGOTIATE_RUNNING + * in order to continue below. + * + * Everything else is unexpected and an error. + */ ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); ret = wait_event_interruptible_timeout(sc->status_wait, - st->negotiation_requested || - sc->status == SMBDIRECT_SOCKET_DISCONNECTED, - SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); - if (ret <= 0 || sc->status == SMBDIRECT_SOCKET_DISCONNECTED) + sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED && + sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING && + sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, + SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); + if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING) return ret < 0 ? ret : -ETIMEDOUT; recvmsg = get_first_reassembly(st); From b31606097de8393c8adac4161a4d3a6098af10e0 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 12 Aug 2025 10:55:31 +0200 Subject: [PATCH 3071/3206] smb: server: move smb_direct_disconnect_rdma_work() into free_transport() The logic is also needed when smb_direct_handle_connect_request() calls free_transport(), because rdma_accept() and RDMA_CM_EVENT_ESTABLISHED could already have been reached. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c5850dcf2e19c3..298c7e060db655 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -381,13 +381,19 @@ static void free_transport(struct smb_direct_transport *t) struct smbdirect_socket *sc = &t->socket; struct smbdirect_recv_io *recvmsg; + disable_work_sync(&t->disconnect_work); + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { + smb_direct_disconnect_rdma_work(&t->disconnect_work); + wait_event_interruptible(sc->status_wait, + sc->status == SMBDIRECT_SOCKET_DISCONNECTED); + } + wake_up_all(&t->wait_send_credits); ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n"); wait_event(t->wait_send_pending, atomic_read(&t->send_pending) == 0); - disable_work_sync(&t->disconnect_work); disable_work_sync(&t->post_recv_credits_work); disable_work_sync(&t->send_immediate_work); @@ -1551,9 +1557,6 @@ static void smb_direct_disconnect(struct ksmbd_transport *t) ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id); - smb_direct_disconnect_rdma_work(&st->disconnect_work); - wait_event_interruptible(sc->status_wait, - sc->status == SMBDIRECT_SOCKET_DISCONNECTED); free_transport(st); } From b4d56ced9e9dc353484dcb6355c1fd7074609124 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 13 Aug 2025 11:39:19 +0200 Subject: [PATCH 3072/3206] smb: server: don't wait for info->send_pending == 0 on error Instead we just wake up the waiters and let them return -ENOTCONN.
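The wait pattern this introduces ties into the earlier wake_up[_all]() cleanup: wait_event() sleeps uninterruptibly and is only woken by plain wake_up*(), so the error path must both make the condition true and wake the queue. A sketch of the pattern, wrapped in a hypothetical helper that uses the names from the diff below:

static int smb_direct_wait_send_drain(struct smb_direct_transport *t)
{
	struct smbdirect_socket *sc = &t->socket;

	/*
	 * Either all posted sends completed, or the connection dropped
	 * and free_transport() woke us via wake_up_all().
	 */
	wait_event(t->wait_send_pending,
		   atomic_read(&t->send_pending) == 0 ||
		   sc->status != SMBDIRECT_SOCKET_CONNECTED);

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
		return -ENOTCONN;
	return 0;
}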
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 298c7e060db655..4199a4f9663569 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -389,10 +389,7 @@ static void free_transport(struct smb_direct_transport *t) } wake_up_all(&t->wait_send_credits); - - ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n"); - wait_event(t->wait_send_pending, - atomic_read(&t->send_pending) == 0); + wake_up_all(&t->wait_send_pending); disable_work_sync(&t->post_recv_credits_work); disable_work_sync(&t->send_immediate_work); @@ -1356,7 +1353,11 @@ static int smb_direct_writev(struct ksmbd_transport *t, */ wait_event(st->wait_send_pending, - atomic_read(&st->send_pending) == 0); + atomic_read(&st->send_pending) == 0 || + sc->status != SMBDIRECT_SOCKET_CONNECTED); + if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0) + ret = -ENOTCONN; + return ret; } @@ -1689,7 +1690,11 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, } wait_event(t->wait_send_pending, - atomic_read(&t->send_pending) == 0); + atomic_read(&t->send_pending) == 0 || + sc->status != SMBDIRECT_SOCKET_CONNECTED); + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return -ENOTCONN; + return 0; } From 16ba90e52bc7c367b62386fa4f61e4dc318c3583 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 15:10:52 +0200 Subject: [PATCH 3073/3206] smb: server: make use of smbdirect_socket_init() Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 4199a4f9663569..297345c50f9596 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -324,6 +324,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) if (!t) return NULL; sc = &t->socket; + smbdirect_socket_init(sc); sc->rdma.cm_id = cm_id; cm_id->context = t; @@ -333,17 +334,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc->ib.dev = sc->rdma.cm_id->device; - INIT_LIST_HEAD(&sc->recv_io.free.list); - spin_lock_init(&sc->recv_io.free.lock); - - sc->status = SMBDIRECT_SOCKET_CREATED; - init_waitqueue_head(&sc->status_wait); - - spin_lock_init(&sc->recv_io.reassembly.lock); - INIT_LIST_HEAD(&sc->recv_io.reassembly.list); - sc->recv_io.reassembly.data_length = 0; - sc->recv_io.reassembly.queue_length = 0; - init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); init_waitqueue_head(&t->wait_send_credits); init_waitqueue_head(&t->wait_rw_credits); From bb12617ecb3af1e1c8ba36ef2c3496f95521a605 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 8 Aug 2025 19:07:37 +0200 Subject: [PATCH 3074/3206] smb: server: make use of smbdirect_socket.disconnect_work Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- 
fs/smb/server/transport_rdma.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 297345c50f9596..1e14b038a77fbe 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -111,7 +111,6 @@ struct smb_direct_transport { struct work_struct post_recv_credits_work; struct work_struct send_immediate_work; - struct work_struct disconnect_work; bool legacy_iwarp; u8 initiator_depth; @@ -248,17 +247,17 @@ static struct smbdirect_recv_io *get_first_reassembly(struct smb_direct_transpor static void smb_direct_disconnect_rdma_work(struct work_struct *work) { + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, disconnect_work); struct smb_direct_transport *t = - container_of(work, struct smb_direct_transport, - disconnect_work); - struct smbdirect_socket *sc = &t->socket; + container_of(sc, struct smb_direct_transport, socket); /* * make sure this and other work is not queued again * but here we don't block and avoid * disable[_delayed]_work_sync() */ - disable_work(&t->disconnect_work); + disable_work(&sc->disconnect_work); disable_work(&t->post_recv_credits_work); disable_work(&t->send_immediate_work); @@ -299,7 +298,9 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) static void smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t) { - queue_work(smb_direct_wq, &t->disconnect_work); + struct smbdirect_socket *sc = &t->socket; + + queue_work(smb_direct_wq, &sc->disconnect_work); } static void smb_direct_send_immediate_work(struct work_struct *work) @@ -326,6 +327,8 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc = &t->socket; smbdirect_socket_init(sc); + INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); + sc->rdma.cm_id = cm_id; cm_id->context = t; @@ -347,7 +350,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) INIT_WORK(&t->post_recv_credits_work, smb_direct_post_recv_credits); INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work); - INIT_WORK(&t->disconnect_work, smb_direct_disconnect_rdma_work); conn = ksmbd_conn_alloc(); if (!conn) @@ -371,9 +373,9 @@ static void free_transport(struct smb_direct_transport *t) struct smbdirect_socket *sc = &t->socket; struct smbdirect_recv_io *recvmsg; - disable_work_sync(&t->disconnect_work); + disable_work_sync(&sc->disconnect_work); if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { - smb_direct_disconnect_rdma_work(&t->disconnect_work); + smb_direct_disconnect_rdma_work(&sc->disconnect_work); wait_event_interruptible(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } @@ -1558,7 +1560,7 @@ static void smb_direct_shutdown(struct ksmbd_transport *t) ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id); - smb_direct_disconnect_rdma_work(&st->disconnect_work); + smb_direct_disconnect_rdma_work(&sc->disconnect_work); } static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, From fd0ad9c521db6acf20fb24699830138481c07f8a Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 11 Aug 2025 15:19:51 +0200 Subject: [PATCH 3075/3206] smb: server: make use of smbdirect_socket.send_io.pending.{count,zero_wait_queue} This is already used by the client and will allow us to create common helper functions.
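The fields being adopted implement a classic inflight counter. Condensed from the diff below into a single hypothetical helper (the function name is illustrative):

static int smbdirect_post_send_io(struct smbdirect_socket *sc,
				  struct ib_send_wr *wr)
{
	int ret;

	atomic_inc(&sc->send_io.pending.count);
	ret = ib_post_send(sc->ib.qp, wr, NULL);
	if (ret) {
		/* undo the increment; wake drain waiters on zero */
		if (atomic_dec_and_test(&sc->send_io.pending.count))
			wake_up(&sc->send_io.pending.zero_wait_queue);
	}
	return ret;
}

The send completion handler performs the matching atomic_dec_and_test() plus wake_up(), as the diff shows.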
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 1e14b038a77fbe..d8f0f252e40115 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -106,9 +106,6 @@ struct smb_direct_transport { wait_queue_head_t wait_send_credits; wait_queue_head_t wait_rw_credits; - wait_queue_head_t wait_send_pending; - atomic_t send_pending; - struct work_struct post_recv_credits_work; struct work_struct send_immediate_work; @@ -342,9 +339,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) spin_lock_init(&t->receive_credit_lock); - init_waitqueue_head(&t->wait_send_pending); - atomic_set(&t->send_pending, 0); - spin_lock_init(&t->lock_new_recv_credits); INIT_WORK(&t->post_recv_credits_work, @@ -381,7 +375,7 @@ static void free_transport(struct smb_direct_transport *t) } wake_up_all(&t->wait_send_credits); - wake_up_all(&t->wait_send_pending); + wake_up_all(&sc->send_io.pending.zero_wait_queue); disable_work_sync(&t->post_recv_credits_work); disable_work_sync(&t->send_immediate_work); @@ -847,8 +841,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(t); } - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); /* iterate and free the list of messages in reverse. the list's head * is invalid. @@ -881,12 +875,12 @@ static int smb_direct_post_send(struct smb_direct_transport *t, struct smbdirect_socket *sc = &t->socket; int ret; - atomic_inc(&t->send_pending); + atomic_inc(&sc->send_io.pending.count); ret = ib_post_send(sc->ib.qp, wr, NULL); if (ret) { pr_err("failed to post send: %d\n", ret); - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); smb_direct_disconnect_rdma_connection(t); } return ret; @@ -1344,8 +1338,8 @@ static int smb_direct_writev(struct ksmbd_transport *t, * that means all the I/Os have been out and we are good to return */ - wait_event(st->wait_send_pending, - atomic_read(&st->send_pending) == 0 || + wait_event(sc->send_io.pending.zero_wait_queue, + atomic_read(&sc->send_io.pending.count) == 0 || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0) ret = -ENOTCONN; @@ -1681,8 +1675,8 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, return ret; } - wait_event(t->wait_send_pending, - atomic_read(&t->send_pending) == 0 || + wait_event(sc->send_io.pending.zero_wait_queue, - atomic_read(&t->send_pending) == 0 || + atomic_read(&sc->send_io.pending.count) == 0 || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; From 2def28338a19f03a15a99d7df46930e6be6944e8 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 11 Aug 2025 17:12:58 +0200 Subject: [PATCH 3076/3206] smb: server: make use of smbdirect_socket.send_io.credits.{count,wait_queue} This is already used by the client and will allow us to create common helper functions.
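For context, the credit flow these fields carry: each posted send consumes one credit, and the peer replenishes credits via the credits_granted field of its data-transfer headers. A condensed sketch of the grant side (the helper name is illustrative; the operations match recv_done() in the diff below):

static void smbdirect_grant_send_credits(struct smbdirect_socket *sc,
					 u16 granted)
{
	atomic_add(granted, &sc->send_io.credits.count);
	if (atomic_read(&sc->send_io.credits.count) > 0)
		wake_up(&sc->send_io.credits.wait_queue);
}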
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index d8f0f252e40115..256970c7258615 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -96,14 +96,12 @@ struct smb_direct_transport { int recv_credits; int recv_credit_target; - atomic_t send_credits; spinlock_t lock_new_recv_credits; int new_recv_credits; int max_rw_credits; int pages_per_rw_credit; atomic_t rw_credits; - wait_queue_head_t wait_send_credits; wait_queue_head_t wait_rw_credits; struct work_struct post_recv_credits_work; @@ -334,7 +332,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc->ib.dev = sc->rdma.cm_id->device; - init_waitqueue_head(&t->wait_send_credits); init_waitqueue_head(&t->wait_rw_credits); spin_lock_init(&t->receive_credit_lock); @@ -374,7 +371,7 @@ static void free_transport(struct smb_direct_transport *t) sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } - wake_up_all(&t->wait_send_credits); + wake_up_all(&sc->send_io.credits.wait_queue); wake_up_all(&sc->send_io.pending.zero_wait_queue); disable_work_sync(&t->post_recv_credits_work); @@ -588,14 +585,14 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) t->recv_credit_target = le16_to_cpu(data_transfer->credits_requested); atomic_add(le16_to_cpu(data_transfer->credits_granted), - &t->send_credits); + &sc->send_io.credits.count); if (le16_to_cpu(data_transfer->flags) & SMBDIRECT_FLAG_RESPONSE_REQUESTED) queue_work(smb_direct_wq, &t->send_immediate_work); - if (atomic_read(&t->send_credits) > 0) - wake_up(&t->wait_send_credits); + if (atomic_read(&sc->send_io.credits.count) > 0) + wake_up(&sc->send_io.credits.wait_queue); if (data_length) { if (t->recv_credit_target > old_recv_credit_target) @@ -901,6 +898,7 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, struct smb_direct_send_ctx *send_ctx, bool is_last) { + struct smbdirect_socket *sc = &t->socket; struct smbdirect_send_io *first, *last; int ret; @@ -927,8 +925,8 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, send_ctx->need_invalidate_rkey, send_ctx->remote_key); } else { - atomic_add(send_ctx->wr_cnt, &t->send_credits); - wake_up(&t->wait_send_credits); + atomic_add(send_ctx->wr_cnt, &sc->send_io.credits.count); + wake_up(&sc->send_io.credits.wait_queue); list_for_each_entry_safe(first, last, &send_ctx->msg_list, sibling_list) { smb_direct_free_sendmsg(t, first); @@ -963,16 +961,17 @@ static int wait_for_credits(struct smb_direct_transport *t, static int wait_for_send_credits(struct smb_direct_transport *t, struct smb_direct_send_ctx *send_ctx) { + struct smbdirect_socket *sc = &t->socket; int ret; if (send_ctx && - (send_ctx->wr_cnt >= 16 || atomic_read(&t->send_credits) <= 1)) { + (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) { ret = smb_direct_flush_send_list(t, send_ctx, false); if (ret) return ret; } - return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1); + return wait_for_credits(t, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1); } static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) @@ -1155,7 +1154,7 @@ static int smb_direct_post_send_data(struct 
smb_direct_transport *t, ret = smb_direct_create_header(t, data_length, remaining_data_length, &msg); if (ret) { - atomic_inc(&t->send_credits); + atomic_inc(&sc->send_io.credits.count); return ret; } @@ -1195,7 +1194,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, return 0; err: smb_direct_free_sendmsg(t, msg); - atomic_inc(&t->send_credits); + atomic_inc(&sc->send_io.credits.count); return ret; } @@ -1581,7 +1580,7 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, smb_direct_disconnect_rdma_work(&sc->disconnect_work); wake_up_all(&sc->status_wait); wake_up_all(&sc->recv_io.reassembly.wait_queue); - wake_up_all(&t->wait_send_credits); + wake_up_all(&sc->send_io.credits.wait_queue); break; } case RDMA_CM_EVENT_CONNECT_ERROR: { @@ -1844,7 +1843,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t, t->new_recv_credits = 0; sp->send_credit_target = smb_direct_send_credit_target; - atomic_set(&t->send_credits, 0); atomic_set(&t->rw_credits, t->max_rw_credits); sp->max_send_size = smb_direct_max_send_size; From 73430a64469c86c027518014e63ca637d9a53b9a Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 18 Aug 2025 21:16:48 +0200 Subject: [PATCH 3077/3206] smb: server: make use of struct smbdirect_send_batch This makes it easier to move functions to the common smbdirect code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 256970c7258615..4f6e8e552bcda9 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -118,13 +118,6 @@ struct smb_direct_transport { static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops; -struct smb_direct_send_ctx { - struct list_head msg_list; - int wr_cnt; - bool need_invalidate_rkey; - unsigned int remote_key; -}; - struct smb_direct_rdma_rw_msg { struct smb_direct_transport *t; struct ib_cqe cqe; @@ -156,7 +149,7 @@ static inline int get_buf_page_count(void *buf, int size) static void smb_direct_destroy_pools(struct smb_direct_transport *transport); static void smb_direct_post_recv_credits(struct work_struct *work); static int smb_direct_post_send_data(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, + struct smbdirect_send_batch *send_ctx, struct kvec *iov, int niov, int remaining_data_length); @@ -884,7 +877,7 @@ static int smb_direct_post_send(struct smb_direct_transport *t, } static void smb_direct_send_ctx_init(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, + struct smbdirect_send_batch *send_ctx, bool need_invalidate_rkey, unsigned int remote_key) { @@ -895,7 +888,7 @@ static void smb_direct_send_ctx_init(struct smb_direct_transport *t, } static int smb_direct_flush_send_list(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, + struct smbdirect_send_batch *send_ctx, bool is_last) { struct smbdirect_socket *sc = &t->socket; @@ -959,7 +952,7 @@ static int wait_for_credits(struct smb_direct_transport *t, } static int wait_for_send_credits(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx) + struct smbdirect_send_batch *send_ctx) { struct smbdirect_socket *sc = &t->socket; int ret; @@ -1094,7 +1087,7 @@ static int 
get_mapped_sg_list(struct ib_device *device, void *buf, int size, } static int post_sendmsg(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, + struct smbdirect_send_batch *send_ctx, struct smbdirect_send_io *msg) { struct smbdirect_socket *sc = &t->socket; @@ -1133,7 +1126,7 @@ static int post_sendmsg(struct smb_direct_transport *t, } static int smb_direct_post_send_data(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, + struct smbdirect_send_batch *send_ctx, struct kvec *iov, int niov, int remaining_data_length) { @@ -1211,7 +1204,7 @@ static int smb_direct_writev(struct ksmbd_transport *t, size_t max_iov_size = sp->max_send_size - sizeof(struct smbdirect_data_transfer); int ret; - struct smb_direct_send_ctx send_ctx; + struct smbdirect_send_batch send_ctx; int error = 0; if (sc->status != SMBDIRECT_SOCKET_CONNECTED) From 95475d8886bd7e60860f7aa8657683a089f21db5 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 12:01:50 +0200 Subject: [PATCH 3078/3206] smb: server: make use of smbdirect_socket.rw_io.credits This will allow us to have functions moved into common code in future (even if it's only used by the server). Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 44 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 4f6e8e552bcda9..87fb66304ee599 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -98,11 +98,6 @@ struct smb_direct_transport { spinlock_t lock_new_recv_credits; int new_recv_credits; - int max_rw_credits; - int pages_per_rw_credit; - atomic_t rw_credits; - - wait_queue_head_t wait_rw_credits; struct work_struct post_recv_credits_work; struct work_struct send_immediate_work; @@ -325,8 +320,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc->ib.dev = sc->rdma.cm_id->device; - init_waitqueue_head(&t->wait_rw_credits); - spin_lock_init(&t->receive_credit_lock); spin_lock_init(&t->lock_new_recv_credits); @@ -969,14 +962,21 @@ static int wait_for_send_credits(struct smb_direct_transport *t, static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) { - return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits); + struct smbdirect_socket *sc = &t->socket; + + return wait_for_credits(t, + &sc->rw_io.credits.wait_queue, + &sc->rw_io.credits.count, + credits); } static int calc_rw_credits(struct smb_direct_transport *t, char *buf, unsigned int len) { + struct smbdirect_socket *sc = &t->socket; + return DIV_ROUND_UP(get_buf_page_count(buf, len), - t->pages_per_rw_credit); + sc->rw_io.credits.num_pages); } static int smb_direct_create_header(struct smb_direct_transport *t, @@ -1506,8 +1506,8 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, smb_direct_free_rdma_rw_msg(t, msg, is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } - atomic_add(credits_needed, &t->rw_credits); - wake_up(&t->wait_rw_credits); + atomic_add(credits_needed, &sc->rw_io.credits.count); + wake_up(&sc->rw_io.credits.wait_queue); return ret; } @@ -1785,9 +1785,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t, * MR invalidation.
*/ sp->max_read_write_size = smb_direct_max_read_write_size; - t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t); - t->max_rw_credits = DIV_ROUND_UP(sp->max_read_write_size, - (t->pages_per_rw_credit - 1) * + sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(t); + sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size, + (sc->rw_io.credits.num_pages - 1) * PAGE_SIZE); max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, @@ -1795,9 +1795,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t, max_sge_per_wr = max_t(unsigned int, max_sge_per_wr, max_send_sges); wrs_per_credit = max_t(unsigned int, 4, - DIV_ROUND_UP(t->pages_per_rw_credit, + DIV_ROUND_UP(sc->rw_io.credits.num_pages, max_sge_per_wr) + 1); - max_rw_wrs = t->max_rw_credits * wrs_per_credit; + max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit; max_send_wrs = smb_direct_send_credit_target + max_rw_wrs; if (max_send_wrs > device->attrs.max_cqe || @@ -1836,7 +1836,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, t->new_recv_credits = 0; sp->send_credit_target = smb_direct_send_credit_target; - atomic_set(&t->rw_credits, t->max_rw_credits); + atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); sp->max_send_size = smb_direct_max_send_size; sp->max_recv_size = smb_direct_max_receive_size; @@ -1847,7 +1847,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; cap->max_inline_data = 0; - cap->max_rdma_ctxs = t->max_rw_credits; + cap->max_rdma_ctxs = sc->rw_io.credits.max; return 0; } @@ -1981,11 +1981,11 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1; if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) { ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs, - t->max_rw_credits, IB_MR_TYPE_MEM_REG, - t->pages_per_rw_credit, 0); + sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG, + sc->rw_io.credits.num_pages, 0); if (ret) { - pr_err("failed to init mr pool count %d pages %d\n", - t->max_rw_credits, t->pages_per_rw_credit); + pr_err("failed to init mr pool count %zu pages %zu\n", + sc->rw_io.credits.max, sc->rw_io.credits.num_pages); goto err; } } From 780ccb0fb624600278ef4a65df30e563fd015a2a Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 18 Aug 2025 21:07:06 +0200 Subject: [PATCH 3079/3206] smb: server: make use of struct smbdirect_rw_io This will allow us to create functions in the common smbdirect code to be used by the server in future. 
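For orientation, this is the shape of struct smbdirect_rw_io as it can be inferred from the conversions that follow; the authoritative definition lives in the shared smbdirect headers, so field order and any members not visible here are guesses:

	struct smbdirect_rw_io {
		struct smbdirect_socket *socket;	/* replaces the old ->t back pointer */
		struct ib_cqe cqe;			/* completed via read_done()/write_done() */
		int error;				/* replaces the old ->status */
		struct completion *completion;
		struct list_head list;
		struct rdma_rw_ctx rdma_ctx;		/* replaces the old ->rw_ctx */
		struct sg_table sgt;
		struct scatterlist sg_list[];
	};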
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 37 +++++++++++++--------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 87fb66304ee599..33ce90b15de524 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -113,17 +113,6 @@ struct smb_direct_transport { static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops; -struct smb_direct_rdma_rw_msg { - struct smb_direct_transport *t; - struct ib_cqe cqe; - int status; - struct completion *completion; - struct list_head list; - struct rdma_rw_ctx rw_ctx; - struct sg_table sgt; - struct scatterlist sg_list[]; -}; - void init_smbd_max_io_size(unsigned int sz) { sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); @@ -1340,12 +1329,12 @@ static int smb_direct_writev(struct ksmbd_transport *t, } static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, - struct smb_direct_rdma_rw_msg *msg, + struct smbdirect_rw_io *msg, enum dma_data_direction dir) { struct smbdirect_socket *sc = &t->socket; - rdma_rw_ctx_destroy(&msg->rw_ctx, sc->ib.qp, sc->ib.qp->port, + rdma_rw_ctx_destroy(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, msg->sgt.sgl, msg->sgt.nents, dir); sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); kfree(msg); @@ -1354,12 +1343,14 @@ static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, enum dma_data_direction dir) { - struct smb_direct_rdma_rw_msg *msg = container_of(wc->wr_cqe, - struct smb_direct_rdma_rw_msg, cqe); - struct smb_direct_transport *t = msg->t; + struct smbdirect_rw_io *msg = + container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe); + struct smbdirect_socket *sc = msg->socket; + struct smb_direct_transport *t = + container_of(sc, struct smb_direct_transport, socket); if (wc->status != IB_WC_SUCCESS) { - msg->status = -EIO; + msg->error = -EIO; pr_err("read/write error. opcode = %d, status = %s(%d)\n", wc->opcode, ib_wc_status_msg(wc->status), wc->status); if (wc->status != IB_WC_WR_FLUSH_ERR) @@ -1387,7 +1378,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, { struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smb_direct_rdma_rw_msg *msg, *next_msg; + struct smbdirect_rw_io *msg, *next_msg; int i, ret; DECLARE_COMPLETION_ONSTACK(completion); struct ib_send_wr *first_wr; @@ -1444,7 +1435,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, desc_buf_len = le32_to_cpu(desc[i].length); - msg->t = t; + msg->socket = sc; msg->cqe.done = is_read ? 
read_done : write_done; msg->completion = &completion; @@ -1466,7 +1457,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, goto out; } - ret = rdma_rw_ctx_init(&msg->rw_ctx, sc->ib.qp, sc->ib.qp->port, + ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, msg->sgt.sgl, get_buf_page_count(desc_buf, desc_buf_len), 0, @@ -1487,7 +1478,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, /* concatenate work requests of rdma_rw_ctxs */ first_wr = NULL; list_for_each_entry_reverse(msg, &msg_list, list) { - first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, sc->ib.qp, sc->ib.qp->port, + first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, &msg->cqe, first_wr); } @@ -1497,9 +1488,9 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, goto out; } - msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list); + msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list); wait_for_completion(&completion); - ret = msg->status; + ret = msg->error; out: list_for_each_entry_safe(msg, next_msg, &msg_list, list) { list_del(&msg->list); From 113ed9d56f63d2e4eb3d74002c3d91de1e7a3485 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 15:50:05 +0200 Subject: [PATCH 3080/3206] smb: server: take the recv_credit_target from the negotiate req and always limit the range The client sends the initial recv_credit_target in the negotiate req, so we should use that. We also limit the range between 1 and our locally defined sp->recv_credit_max. This will simplify further logic changes. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 33ce90b15de524..283f06cf493851 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -94,7 +94,7 @@ struct smb_direct_transport { spinlock_t receive_credit_lock; int recv_credits; - int recv_credit_target; + u16 recv_credit_target; spinlock_t lock_new_recv_credits; int new_recv_credits; @@ -515,7 +515,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_data_transfer *data_transfer = (struct smbdirect_data_transfer *)recvmsg->packet; u32 remaining_data_length, data_offset, data_length; - int old_recv_credit_target; + u16 old_recv_credit_target; if (wc->byte_len < offsetof(struct smbdirect_data_transfer, padding)) { @@ -559,6 +559,10 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) old_recv_credit_target = t->recv_credit_target; t->recv_credit_target = le16_to_cpu(data_transfer->credits_requested); + t->recv_credit_target = + min_t(u16, t->recv_credit_target, sp->recv_credit_max); + t->recv_credit_target = + max_t(u16, t->recv_credit_target, 1); atomic_add(le16_to_cpu(data_transfer->credits_granted), &sc->send_io.credits.count); @@ -1823,7 +1827,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, t->recv_credits = 0; sp->recv_credit_max = smb_direct_receive_credit_max; - t->recv_credit_target = 10; + t->recv_credit_target = 1; t->new_recv_credits = 0; sp->send_credit_target = smb_direct_send_credit_target; @@ -2049,6 +2053,9 @@ static int smb_direct_prepare(struct ksmbd_transport *t) le32_to_cpu(req->max_fragmented_size); sp->max_fragmented_recv_size = (sp->recv_credit_max *
sp->max_recv_size) / 2; + st->recv_credit_target = le16_to_cpu(req->credits_requested); + st->recv_credit_target = min_t(u16, st->recv_credit_target, sp->recv_credit_max); + st->recv_credit_target = max_t(u16, st->recv_credit_target, 1); ret = smb_direct_send_negotiate_response(st, ret); out: From 89b021a72663c4d96d8a8b85272bb42d991a1c6f Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 16:05:47 +0200 Subject: [PATCH 3081/3206] smb: server: manage recv credits by counting posted recv_io and granted credits (At least for me) the logic maintaining the count of posted recv_io messages and the count of granted credits is much easier to understand. From there we can easily calculate the number of new_credits we'll grant to the peer in outgoing send_io messages. This will simplify the move to common logic that can be shared between client and server in the following patches. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 53 ++++++++++++++-------------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 283f06cf493851..bd6d927e9656ac 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -92,13 +92,10 @@ struct smb_direct_transport { struct smbdirect_socket socket; - spinlock_t receive_credit_lock; - int recv_credits; + atomic_t recv_credits; u16 recv_credit_target; - spinlock_t lock_new_recv_credits; - int new_recv_credits; - + atomic_t recv_posted; struct work_struct post_recv_credits_work; struct work_struct send_immediate_work; @@ -309,9 +306,8 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc->ib.dev = sc->rdma.cm_id->device; - spin_lock_init(&t->receive_credit_lock); - - spin_lock_init(&t->lock_new_recv_credits); + atomic_set(&t->recv_posted, 0); + atomic_set(&t->recv_credits, 0); INIT_WORK(&t->post_recv_credits_work, smb_direct_post_recv_credits); @@ -552,9 +548,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = true; } - spin_lock(&t->receive_credit_lock); - t->recv_credits -= 1; - spin_unlock(&t->receive_credit_lock); + atomic_dec(&t->recv_posted); + atomic_dec(&t->recv_credits); old_recv_credit_target = t->recv_credit_target; t->recv_credit_target = @@ -758,14 +753,10 @@ static void smb_direct_post_recv_credits(struct work_struct *work) struct smb_direct_transport *t = container_of(work, struct smb_direct_transport, post_recv_credits_work); struct smbdirect_recv_io *recvmsg; - int receive_credits, credits = 0; + int credits = 0; int ret; - spin_lock(&t->receive_credit_lock); - receive_credits = t->recv_credits; - spin_unlock(&t->receive_credit_lock); - - if (receive_credits < t->recv_credit_target) { + if (atomic_read(&t->recv_credits) < t->recv_credit_target) { while (true) { recvmsg = get_free_recvmsg(t); if (!recvmsg) @@ -780,17 +771,11 @@ static void smb_direct_post_recv_credits(struct work_struct *work) break; } credits++; + + atomic_inc(&t->recv_posted); } } - spin_lock(&t->receive_credit_lock); - t->recv_credits += credits; - spin_unlock(&t->receive_credit_lock); - - spin_lock(&t->lock_new_recv_credits); - t->new_recv_credits += credits; - spin_unlock(&t->lock_new_recv_credits); - if (credits) queue_work(smb_direct_wq, &t->send_immediate_work); } @@ -837,11 +822,18 @@ 
static int manage_credits_prior_sending(struct smb_direct_transport *t) { int new_credits; - spin_lock(&t->lock_new_recv_credits); - new_credits = t->new_recv_credits; - t->new_recv_credits = 0; - spin_unlock(&t->lock_new_recv_credits); + if (atomic_read(&t->recv_credits) >= t->recv_credit_target) + return 0; + + new_credits = atomic_read(&t->recv_posted); + if (new_credits == 0) + return 0; + new_credits -= atomic_read(&t->recv_credits); + if (new_credits <= 0) + return 0; + + atomic_add(new_credits, &t->recv_credits); return new_credits; } @@ -1824,11 +1816,8 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } - t->recv_credits = 0; - sp->recv_credit_max = smb_direct_receive_credit_max; t->recv_credit_target = 1; - t->new_recv_credits = 0; sp->send_credit_target = smb_direct_send_credit_target; atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); From e8bc71986cbd2d2d5991a2fc6afbac0311ff3769 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 14 Aug 2025 16:33:30 +0200 Subject: [PATCH 3082/3206] smb: server: make use of smbdirect_socket.recv_io.{posted,credits} This will make it possible to introduce common helper functions in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 65 ++++++++++++++++------------------ 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index bd6d927e9656ac..b2de5311a06644 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -92,11 +92,6 @@ struct smb_direct_transport { struct smbdirect_socket socket; - atomic_t recv_credits; - u16 recv_credit_target; - - atomic_t recv_posted; - struct work_struct post_recv_credits_work; struct work_struct send_immediate_work; bool legacy_iwarp; @@ -180,7 +175,7 @@ static void put_recvmsg(struct smb_direct_transport *t, list_add(&recvmsg->list, &sc->recv_io.free.list); spin_unlock(&sc->recv_io.free.lock); - queue_work(smb_direct_wq, &t->post_recv_credits_work); + queue_work(smb_direct_wq, &sc->recv_io.posted.refill_work); } static void enqueue_reassembly(struct smb_direct_transport *t, @@ -227,7 +222,7 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) * disable[_delayed]_work_sync() */ disable_work(&sc->disconnect_work); - disable_work(&t->post_recv_credits_work); + disable_work(&sc->recv_io.posted.refill_work); disable_work(&t->send_immediate_work); switch (sc->status) { @@ -306,10 +301,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc->ib.dev = sc->rdma.cm_id->device; - atomic_set(&t->recv_posted, 0); - atomic_set(&t->recv_credits, 0); - - INIT_WORK(&t->post_recv_credits_work, + INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits); INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work); @@ -345,7 +337,7 @@ static void free_transport(struct smb_direct_transport *t) wake_up_all(&sc->send_io.credits.wait_queue); wake_up_all(&sc->send_io.pending.zero_wait_queue); - disable_work_sync(&t->post_recv_credits_work); + disable_work_sync(&sc->recv_io.posted.refill_work); disable_work_sync(&t->send_immediate_work); if (sc->ib.qp) { @@ -548,16 +540,16 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = true; } - atomic_dec(&t->recv_posted); - 
atomic_dec(&t->recv_credits); + atomic_dec(&sc->recv_io.posted.count); + atomic_dec(&sc->recv_io.credits.count); - old_recv_credit_target = t->recv_credit_target; - t->recv_credit_target = + old_recv_credit_target = sc->recv_io.credits.target; + sc->recv_io.credits.target = le16_to_cpu(data_transfer->credits_requested); - t->recv_credit_target = - min_t(u16, t->recv_credit_target, sp->recv_credit_max); - t->recv_credit_target = - max_t(u16, t->recv_credit_target, 1); + sc->recv_io.credits.target = + min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); + sc->recv_io.credits.target = + max_t(u16, sc->recv_io.credits.target, 1); atomic_add(le16_to_cpu(data_transfer->credits_granted), &sc->send_io.credits.count); @@ -569,8 +561,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) wake_up(&sc->send_io.credits.wait_queue); if (data_length) { - if (t->recv_credit_target > old_recv_credit_target) - queue_work(smb_direct_wq, &t->post_recv_credits_work); + if (sc->recv_io.credits.target > old_recv_credit_target) + queue_work(smb_direct_wq, &sc->recv_io.posted.refill_work); enqueue_reassembly(t, recvmsg, (int)data_length); wake_up(&sc->recv_io.reassembly.wait_queue); @@ -750,13 +742,15 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, static void smb_direct_post_recv_credits(struct work_struct *work) { - struct smb_direct_transport *t = container_of(work, - struct smb_direct_transport, post_recv_credits_work); + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); + struct smb_direct_transport *t = + container_of(sc, struct smb_direct_transport, socket); struct smbdirect_recv_io *recvmsg; int credits = 0; int ret; - if (atomic_read(&t->recv_credits) < t->recv_credit_target) { + if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) { while (true) { recvmsg = get_free_recvmsg(t); if (!recvmsg) @@ -772,7 +766,7 @@ static void smb_direct_post_recv_credits(struct work_struct *work) } credits++; - atomic_inc(&t->recv_posted); + atomic_inc(&sc->recv_io.posted.count); } } @@ -820,20 +814,21 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) static int manage_credits_prior_sending(struct smb_direct_transport *t) { + struct smbdirect_socket *sc = &t->socket; int new_credits; - if (atomic_read(&t->recv_credits) >= t->recv_credit_target) + if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) return 0; - new_credits = atomic_read(&t->recv_posted); + new_credits = atomic_read(&sc->recv_io.posted.count); if (new_credits == 0) return 0; - new_credits -= atomic_read(&t->recv_credits); + new_credits -= atomic_read(&sc->recv_io.credits.count); if (new_credits <= 0) return 0; - atomic_add(new_credits, &t->recv_credits); + atomic_add(new_credits, &sc->recv_io.credits.count); return new_credits; } @@ -1729,7 +1724,7 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) goto out_err; } - smb_direct_post_recv_credits(&t->post_recv_credits_work); + smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); return 0; out_err: put_recvmsg(t, recvmsg); @@ -1817,7 +1812,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, } sp->recv_credit_max = smb_direct_receive_credit_max; - t->recv_credit_target = 1; + sc->recv_io.credits.target = 1; sp->send_credit_target = smb_direct_send_credit_target; atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); @@ -2042,9 +2037,9 @@ static int smb_direct_prepare(struct ksmbd_transport *t) 
le32_to_cpu(req->max_fragmented_size); sp->max_fragmented_recv_size = (sp->recv_credit_max * sp->max_recv_size) / 2; - st->recv_credit_target = le16_to_cpu(req->credits_requested); - st->recv_credit_target = min_t(u16, st->recv_credit_target, sp->recv_credit_max); - st->recv_credit_target = max_t(u16, st->recv_credit_target, 1); + sc->recv_io.credits.target = le16_to_cpu(req->credits_requested); + sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); + sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1); ret = smb_direct_send_negotiate_response(st, ret); out: From 4d2e333dabe74349d257ca2f02225e4bc39f23b8 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 13:01:30 +0200 Subject: [PATCH 3083/3206] smb: server: replace smb_trans_direct_transfort() with SMBD_TRANS() The spelling of smb_trans_direct_transfort was wrong anyway and we don't need the logic twice. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index b2de5311a06644..11fed588fcc086 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -129,12 +129,6 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, struct kvec *iov, int niov, int remaining_data_length); -static inline struct smb_direct_transport * -smb_trans_direct_transfort(struct ksmbd_transport *t) -{ - return container_of(t, struct smb_direct_transport, transport); -} - static inline void *smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg) { @@ -629,7 +623,7 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, int to_copy, to_read, data_read, offset; u32 data_length, remaining_data_length, data_offset; int rc; - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); struct smbdirect_socket *sc = &st->socket; again: @@ -1175,7 +1169,7 @@ static int smb_direct_writev(struct ksmbd_transport *t, struct kvec *iov, int niovs, int buflen, bool need_invalidate, unsigned int remote_key) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); struct smbdirect_socket *sc = &st->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; size_t remaining_data_length; @@ -1498,7 +1492,7 @@ static int smb_direct_rdma_write(struct ksmbd_transport *t, struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { - return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, + return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen, desc, desc_len, false); } @@ -1507,13 +1501,13 @@ static int smb_direct_rdma_read(struct ksmbd_transport *t, struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { - return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, + return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen, desc, desc_len, true); } static void smb_direct_disconnect(struct ksmbd_transport *t) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); struct smbdirect_socket *sc = &st->socket; ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id); @@ -1523,7 +1517,7 @@ 
static void smb_direct_disconnect(struct ksmbd_transport *t) static void smb_direct_shutdown(struct ksmbd_transport *t) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); struct smbdirect_socket *sc = &st->socket; ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id); @@ -1992,7 +1986,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, static int smb_direct_prepare(struct ksmbd_transport *t) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); struct smbdirect_socket *sc = &st->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_recv_io *recvmsg; From 341b6c69b959a901b167a33b7e62e8b2ff8a1e74 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 13:04:31 +0200 Subject: [PATCH 3084/3206] smb: server: remove useless casts from KSMBD_TRANS/SMBD_TRANS At best they gain nothing, at worst we produce real bugs. Note container_of() already casts to a pointer of the given type. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 11fed588fcc086..0abcf2991c3c0d 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -99,8 +99,8 @@ struct smb_direct_transport { u8 responder_resources; }; -#define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport)) -#define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \ +#define KSMBD_TRANS(t) (&(t)->transport) +#define SMBD_TRANS(t) (container_of(t, \ struct smb_direct_transport, transport)) static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops; From 03a38d846750f06d4fb0aac2213192bdcc2a8435 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 12:03:40 +0200 Subject: [PATCH 3085/3206] smb: server: pass ksmbd_transport to get_smbd_max_read_write_size() We should use the per-connection value. And for TCP, return NT_STATUS_INVALID_PARAMETER if any SMB2_CHANNEL_RDMA_V1* is used.
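The ops-pointer comparison is what makes the container_of() downcast safe; the same guarded-downcast pattern would generalize to any future per-transport getter, e.g. (hypothetical helper, not part of this series):

	static struct smb_direct_transport *smbd_trans_or_null(struct ksmbd_transport *kt)
	{
		/* not an smbdirect connection (e.g. TCP): no RDMA parameters exist */
		if (kt->ops != &ksmbd_smb_direct_transport_ops)
			return NULL;

		return SMBD_TRANS(kt);
	}

Returning 0 from get_smbd_max_read_write_size() for non-RDMA transports is what lets smb2_read()/smb2_write() fail the request with -EINVAL, i.e. NT_STATUS_INVALID_PARAMETER on the wire.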
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 12 ++++++++++-- fs/smb/server/transport_rdma.c | 15 +++++++++++++-- fs/smb/server/transport_rdma.h | 4 ++-- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 00e644935f2864..0c069eff80b771 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -6762,7 +6762,11 @@ int smb2_read(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { is_rdma_channel = true; - max_read_size = get_smbd_max_read_write_size(); + max_read_size = get_smbd_max_read_write_size(work->conn->transport); + if (max_read_size == 0) { + err = -EINVAL; + goto out; + } } if (is_rdma_channel == true) { @@ -7020,7 +7024,11 @@ int smb2_write(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1 || req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { is_rdma_channel = true; - max_write_size = get_smbd_max_read_write_size(); + max_write_size = get_smbd_max_read_write_size(work->conn->transport); + if (max_write_size == 0) { + err = -EINVAL; + goto out; + } length = le32_to_cpu(req->RemainingBytes); } diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 0abcf2991c3c0d..d970a495653376 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -111,9 +111,20 @@ void init_smbd_max_io_size(unsigned int sz) smb_direct_max_read_write_size = sz; } -unsigned int get_smbd_max_read_write_size(void) +unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { - return smb_direct_max_read_write_size; + struct smb_direct_transport *t; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; + + if (kt->ops != &ksmbd_smb_direct_transport_ops) + return 0; + + t = SMBD_TRANS(kt); + sc = &t->socket; + sp = &sc->parameters; + + return sp->max_read_write_size; } static inline int get_buf_page_count(void *buf, int size) diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h index 63eab9f8f13dff..3f93c6a9f7e4ad 100644 --- a/fs/smb/server/transport_rdma.h +++ b/fs/smb/server/transport_rdma.h @@ -17,14 +17,14 @@ void ksmbd_rdma_stop_listening(void); void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); void init_smbd_max_io_size(unsigned int sz); -unsigned int get_smbd_max_read_write_size(void); +unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt); #else static inline int ksmbd_rdma_init(void) { return 0; } static inline void ksmbd_rdma_stop_listening(void) { } static inline void ksmbd_rdma_destroy(void) { } static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } static inline void init_smbd_max_io_size(unsigned int sz) { } -static inline unsigned int get_smbd_max_read_write_size(void) { return 0; } +static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; } #endif #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ From d70e45823961e13af10f44a6a16c5c5a357ccdfb Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 12:22:09 +0200 Subject: [PATCH 3086/3206] smb: server: fill smbdirect_socket_parameters at the beginning and use the values from there This is what we should do and it also simplifies the following changes. 
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index d970a495653376..0367db232ea890 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -288,6 +288,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) { struct smb_direct_transport *t; struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; struct ksmbd_conn *conn; t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP); @@ -295,9 +296,17 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) return NULL; sc = &t->socket; smbdirect_socket_init(sc); + sp = &sc->parameters; INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); + sp->recv_credit_max = smb_direct_receive_credit_max; + sp->send_credit_target = smb_direct_send_credit_target; + sp->max_send_size = smb_direct_max_send_size; + sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; + sp->max_recv_size = smb_direct_max_receive_size; + sp->max_read_write_size = smb_direct_max_read_write_size; + sc->rdma.cm_id = cm_id; cm_id->context = t; @@ -1757,7 +1766,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t, /* need 3 more sge. because a SMB_DIRECT header, SMB2 header, * SMB2 response could be mapped. */ - sp->max_send_size = smb_direct_max_send_size; max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3; if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) { pr_err("max_send_size %d is too large\n", sp->max_send_size); @@ -1771,7 +1779,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t, * are needed for MR registration, RDMA R/W, local & remote * MR invalidation. 
*/ - sp->max_read_write_size = smb_direct_max_read_write_size; sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(t); sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size, (sc->rw_io.credits.num_pages - 1) * @@ -1786,20 +1793,20 @@ static int smb_direct_init_params(struct smb_direct_transport *t, max_sge_per_wr) + 1); max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit; - max_send_wrs = smb_direct_send_credit_target + max_rw_wrs; + max_send_wrs = sp->send_credit_target + max_rw_wrs; if (max_send_wrs > device->attrs.max_cqe || max_send_wrs > device->attrs.max_qp_wr) { pr_err("consider lowering send_credit_target = %d\n", - smb_direct_send_credit_target); + sp->send_credit_target); pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", device->attrs.max_cqe, device->attrs.max_qp_wr); return -EINVAL; } - if (smb_direct_receive_credit_max > device->attrs.max_cqe || - smb_direct_receive_credit_max > device->attrs.max_qp_wr) { + if (sp->recv_credit_max > device->attrs.max_cqe || + sp->recv_credit_max > device->attrs.max_qp_wr) { pr_err("consider lowering receive_credit_max = %d\n", - smb_direct_receive_credit_max); + sp->recv_credit_max); pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", device->attrs.max_cqe, device->attrs.max_qp_wr); return -EINVAL; @@ -1816,16 +1823,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } - sp->recv_credit_max = smb_direct_receive_credit_max; sc->recv_io.credits.target = 1; - sp->send_credit_target = smb_direct_send_credit_target; atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); - sp->max_send_size = smb_direct_max_send_size; - sp->max_recv_size = smb_direct_max_receive_size; - sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; cap->max_send_wr = max_send_wrs; cap->max_recv_wr = sp->recv_credit_max; cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; cap->max_inline_data = 0; cap->max_rdma_ctxs = sc->rw_io.credits.max; return 0; } @@ -1925,7 +1926,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, } sc->ib.send_cq = ib_alloc_cq(sc->ib.dev, t, - smb_direct_send_credit_target + cap->max_rdma_ctxs, + sp->send_credit_target + cap->max_rdma_ctxs, 0, IB_POLL_WORKQUEUE); if (IS_ERR(sc->ib.send_cq)) { pr_err("Can't create RDMA send CQ\n"); From be3c1d032fad608f48b428854694422633c4b004 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 12:31:50 +0200 Subject: [PATCH 3087/3206] smb: server: make use of smbdirect_socket_parameters.negotiate_timeout_msec and change to 5s The server negotiation timer is just 5 seconds in Windows, so use the same. See [MS-SMBD] 3.1.7.2 Connection Arrival.
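A simplified sketch of how the new parameter is consumed (the real wait in smb_direct_prepare(), in the hunk that follows, checks several intermediate connection states; this collapses them for readability):

	/* wait at most 5s (the Windows value per [MS-SMBD]) for the negotiate request */
	ret = wait_event_interruptible_timeout(sc->status_wait,
			sc->status == SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
			msecs_to_jiffies(sp->negotiate_timeout_msec));
	if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING)
		return ret < 0 ? ret : -ETIMEDOUT;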
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 0367db232ea890..7d3cf1d22af08b 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -33,8 +33,8 @@ #define SMB_DIRECT_VERSION_LE cpu_to_le16(SMBDIRECT_V1) -/* SMB_DIRECT negotiation timeout in seconds */ -#define SMB_DIRECT_NEGOTIATE_TIMEOUT 120 +/* SMB_DIRECT negotiation timeout (for the server) in seconds */ +#define SMB_DIRECT_NEGOTIATE_TIMEOUT 5 /* * Default maximum number of RDMA read/write outstanding on this connection @@ -300,6 +300,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); + sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000; sp->recv_credit_max = smb_direct_receive_credit_max; sp->send_credit_target = smb_direct_send_credit_target; sp->max_send_size = smb_direct_max_send_size; @@ -2022,7 +2023,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED && sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING && sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, - SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); + msecs_to_jiffies(sp->negotiate_timeout_msec)); if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING) return ret < 0 ? ret : -ETIMEDOUT; From 49635103e8e68020d80122b9ec408dbbd295d351 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 21 Aug 2025 12:30:32 +0200 Subject: [PATCH 3088/3206] smb: server: make use of smbdirect_socket_parameters.{initiator_depth,responder_resources} This will make it easier to specify these from the outside of the core code first and then negotiate the value with the peer. 
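Consolidated for clarity, the negotiation rule implemented in the hunks that follow could be written as one helper (hypothetical; the series keeps it inline in smb_direct_handle_connect_request()):

	static void smbd_negotiate_rd_atomic(struct smbdirect_socket_parameters *sp,
					     struct ib_device *device,
					     u8 peer_initiator_depth,
					     u8 peer_responder_resources)
	{
		/* first cap by what our own device supports */
		sp->initiator_depth = min_t(u8, sp->initiator_depth,
					    device->attrs.max_qp_rd_atom);

		/* a peer value of 0 means "no information", so keep ours */
		if (peer_initiator_depth != 0)
			sp->initiator_depth = min_t(u8, sp->initiator_depth,
						    peer_initiator_depth);
		if (peer_responder_resources != 0)
			sp->responder_resources = min_t(u8, sp->responder_resources,
							peer_responder_resources);
	}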
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 7d3cf1d22af08b..98b1d6453df896 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -95,8 +95,6 @@ struct smb_direct_transport { struct work_struct send_immediate_work; bool legacy_iwarp; - u8 initiator_depth; - u8 responder_resources; }; #define KSMBD_TRANS(t) (&(t)->transport) @@ -301,6 +299,8 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000; + sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH; + sp->responder_resources = 1; sp->recv_credit_max = smb_direct_receive_credit_max; sp->send_credit_target = smb_direct_send_credit_target; sp->max_send_size = smb_direct_max_send_size; @@ -311,9 +311,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc->rdma.cm_id = cm_id; cm_id->context = t; - t->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH; - t->responder_resources = 1; - sc->ib.dev = sc->rdma.cm_id->device; INIT_WORK(&sc->recv_io.posted.refill_work, @@ -1676,18 +1673,19 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, static int smb_direct_accept_client(struct smb_direct_transport *t) { struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct rdma_conn_param conn_param; __be32 ird_ord_hdr[2]; int ret; /* * smb_direct_handle_connect_request() - * already negotiated t->initiator_depth - * and t->responder_resources + * already negotiated sp->initiator_depth + * and sp->responder_resources */ memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = t->initiator_depth; - conn_param.responder_resources = t->responder_resources; + conn_param.initiator_depth = sp->initiator_depth; + conn_param.responder_resources = sp->responder_resources; if (t->legacy_iwarp) { ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); @@ -2103,6 +2101,8 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, struct rdma_cm_event *event) { struct smb_direct_transport *t; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; struct task_struct *handler; u8 peer_initiator_depth; u8 peer_responder_resources; @@ -2118,6 +2118,8 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, t = alloc_transport(new_cm_id); if (!t) return -ENOMEM; + sc = &t->socket; + sp = &sc->parameters; peer_initiator_depth = event->param.conn.initiator_depth; peer_responder_resources = event->param.conn.responder_resources; @@ -2165,7 +2167,7 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, /* * First set what the we as server are able to support */ - t->initiator_depth = min_t(u8, t->initiator_depth, + sp->initiator_depth = min_t(u8, sp->initiator_depth, new_cm_id->device->attrs.max_qp_rd_atom); /* @@ -2174,10 +2176,10 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, * non 0 values. 
*/ if (peer_initiator_depth != 0) - t->initiator_depth = min_t(u8, t->initiator_depth, + sp->initiator_depth = min_t(u8, sp->initiator_depth, peer_initiator_depth); if (peer_responder_resources != 0) - t->responder_resources = min_t(u8, t->responder_resources, + sp->responder_resources = min_t(u8, sp->responder_resources, peer_responder_resources); ret = smb_direct_connect(t); From d0b9b967b35584379fd0421f408c77647e5d6b12 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 21 Aug 2025 12:32:05 +0200 Subject: [PATCH 3089/3206] smb: server: make use of smbdirect_socket.rdma.legacy_iwarp Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 98b1d6453df896..8cbbebb9f1690c 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -93,8 +93,6 @@ struct smb_direct_transport { struct smbdirect_socket socket; struct work_struct send_immediate_work; - - bool legacy_iwarp; }; #define KSMBD_TRANS(t) (&(t)->transport) @@ -1687,7 +1685,7 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) conn_param.initiator_depth = sp->initiator_depth; conn_param.responder_resources = sp->responder_resources; - if (t->legacy_iwarp) { + if (sc->rdma.legacy_iwarp) { ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); conn_param.private_data = ird_ord_hdr; @@ -2158,7 +2156,7 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, ird32 = min_t(u32, ird32, U8_MAX); ord32 = min_t(u32, ord32, U8_MAX); - t->legacy_iwarp = true; + sc->rdma.legacy_iwarp = true; peer_initiator_depth = (u8)ird32; peer_responder_resources = (u8)ord32; } From b8c7776583e72f1fc50f0ed561346954c5d2fe82 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 18:00:36 +0200 Subject: [PATCH 3090/3206] smb: server: make use of smbdirect_socket.idle.immediate_work With this commit the server only uses struct smbdirect_socket! It doesn't use the idle timer yet, but it will be added soon; from there we'll be ready to split out common functions.
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 8cbbebb9f1690c..3bb19b17abae03 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -91,8 +91,6 @@ struct smb_direct_transport { struct ksmbd_transport transport; struct smbdirect_socket socket; - - struct work_struct send_immediate_work; }; #define KSMBD_TRANS(t) (&(t)->transport) @@ -214,8 +212,6 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) { struct smbdirect_socket *sc = container_of(work, struct smbdirect_socket, disconnect_work); - struct smb_direct_transport *t = - container_of(sc, struct smb_direct_transport, socket); /* * make sure this and other work is not queued again @@ -224,7 +220,7 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) */ disable_work(&sc->disconnect_work); disable_work(&sc->recv_io.posted.refill_work); - disable_work(&t->send_immediate_work); + disable_work(&sc->idle.immediate_work); switch (sc->status) { case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: @@ -270,9 +266,10 @@ smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t) static void smb_direct_send_immediate_work(struct work_struct *work) { - struct smb_direct_transport *t = container_of(work, - struct smb_direct_transport, send_immediate_work); - struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.immediate_work); + struct smb_direct_transport *t = + container_of(sc, struct smb_direct_transport, socket); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return; @@ -313,7 +310,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits); - INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work); + INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); conn = ksmbd_conn_alloc(); if (!conn) @@ -348,7 +345,7 @@ static void free_transport(struct smb_direct_transport *t) wake_up_all(&sc->send_io.pending.zero_wait_queue); disable_work_sync(&sc->recv_io.posted.refill_work); - disable_work_sync(&t->send_immediate_work); + disable_work_sync(&sc->idle.immediate_work); if (sc->ib.qp) { ib_drain_qp(sc->ib.qp); @@ -565,7 +562,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if (le16_to_cpu(data_transfer->flags) & SMBDIRECT_FLAG_RESPONSE_REQUESTED) - queue_work(smb_direct_wq, &t->send_immediate_work); + queue_work(smb_direct_wq, &sc->idle.immediate_work); if (atomic_read(&sc->send_io.credits.count) > 0) wake_up(&sc->send_io.credits.wait_queue); @@ -781,7 +778,7 @@ static void smb_direct_post_recv_credits(struct work_struct *work) } if (credits) - queue_work(smb_direct_wq, &t->send_immediate_work); + queue_work(smb_direct_wq, &sc->idle.immediate_work); } static void send_done(struct ib_cq *cq, struct ib_wc *wc) From 8ad3a97eab240dabbe41b4452d37a5b6b64edf72 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 15 Aug 2025 12:36:33 +0200 Subject: [PATCH 3091/3206] smb: server: implement correct keepalive and timeout handling for smbdirect Now client and server behave in the same way and we can start to share common functions. 
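The resulting keepalive flow, summarized (state names come from the shared smbdirect headers; the interval and timeout values are the defaults set in alloc_transport() in the hunks that follow):

	/*
	 * SMBDIRECT_KEEPALIVE_NONE:  idle; timer armed with
	 *                            keepalive_interval_msec (120s).
	 * timer fires             -> SMBDIRECT_KEEPALIVE_PENDING; timer
	 *                            re-armed with keepalive_timeout_msec (5s)
	 *                            and an immediate send is queued.
	 * next header built       -> SMBDIRECT_KEEPALIVE_SENT; that header
	 *                            carries SMBDIRECT_FLAG_RESPONSE_REQUESTED.
	 * any receive             -> back to SMBDIRECT_KEEPALIVE_NONE and the
	 *                            interval timer.
	 * timer fires in PENDING/SENT -> smb_direct_disconnect_rdma_connection().
	 */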
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 74 ++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 3bb19b17abae03..0865ebb943d320 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -36,6 +36,12 @@ /* SMB_DIRECT negotiation timeout (for the server) in seconds */ #define SMB_DIRECT_NEGOTIATE_TIMEOUT 5 +/* The timeout to wait for a keepalive message from peer in seconds */ +#define SMB_DIRECT_KEEPALIVE_SEND_INTERVAL 120 + +/* The timeout to wait for a keepalive message from peer in seconds */ +#define SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT 5 + /* * Default maximum number of RDMA read/write outstanding on this connection * This value is possibly decreased during QP creation on hardware limit @@ -220,6 +226,7 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) */ disable_work(&sc->disconnect_work); disable_work(&sc->recv_io.posted.refill_work); + disable_delayed_work(&sc->idle.timer_work); disable_work(&sc->idle.immediate_work); switch (sc->status) { @@ -277,6 +284,32 @@ static void smb_direct_send_immediate_work(struct work_struct *work) smb_direct_post_send_data(t, NULL, NULL, 0, 0); } +static void smb_direct_idle_connection_timer(struct work_struct *work) +{ + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.timer_work.work); + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smb_direct_transport *t = + container_of(sc, struct smb_direct_transport, socket); + + if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { + smb_direct_disconnect_rdma_connection(t); + return; + } + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; + + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(smb_direct_wq, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); + queue_work(smb_direct_wq, &sc->idle.immediate_work); +} + static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) { struct smb_direct_transport *t; @@ -302,6 +335,8 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; sp->max_recv_size = smb_direct_max_receive_size; sp->max_read_write_size = smb_direct_max_read_write_size; + sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000; + sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000; sc->rdma.cm_id = cm_id; cm_id->context = t; @@ -311,6 +346,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits); INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); + INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer); conn = ksmbd_conn_alloc(); if (!conn) @@ -345,6 +381,7 @@ static void free_transport(struct smb_direct_transport *t) wake_up_all(&sc->send_io.pending.zero_wait_queue); disable_work_sync(&sc->recv_io.posted.refill_work); + disable_delayed_work_sync(&sc->idle.timer_work); disable_work_sync(&sc->idle.immediate_work); if (sc->ib.qp) { @@ -493,6 +530,14 @@ static void recv_done(struct 
ib_cq *cq, struct ib_wc *wc) ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); + /* + * Reset timer to the keepalive interval in + * order to trigger our next keepalive message. + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; + mod_delayed_work(smb_direct_wq, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_interval_msec)); + switch (sc->recv_io.expected) { case SMBDIRECT_EXPECT_NEGOTIATE_REQ: if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) { @@ -839,6 +884,24 @@ static int manage_credits_prior_sending(struct smb_direct_transport *t) return new_credits; } +static int manage_keep_alive_before_sending(struct smb_direct_transport *t) +{ + struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; + + if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + mod_delayed_work(smb_direct_wq, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); + return 1; + } + return 0; +} + static int smb_direct_post_send(struct smb_direct_transport *t, struct ib_send_wr *wr) { @@ -987,6 +1050,9 @@ static int smb_direct_create_header(struct smb_direct_transport *t, packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); packet->flags = 0; + if (manage_keep_alive_before_sending(t)) + packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); + packet->reserved = 0; if (!size) packet->data_offset = 0; @@ -1695,6 +1761,14 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY; conn_param.flow_control = 0; + /* + * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING + * so that the timer will cause a disconnect. + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(smb_direct_wq, &sc->idle.timer_work, + msecs_to_jiffies(sp->negotiate_timeout_msec)); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; ret = rdma_accept(sc->rdma.cm_id, &conn_param); From da7d45b9a3858abc89030cf08368e59b2ab2bdd4 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 12:32:48 +0200 Subject: [PATCH 3092/3206] smb: server: make use of smbdirect_socket.workqueue We still use the single global workqueue, but this will allow us to share common code soon. 
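Beyond sharing code, the indirection leaves room to give each connection its own queue later; a hypothetical follow-up (not part of this series) could do:

	/* per-connection ordered workqueue instead of the shared smb_direct_wq */
	sc->workqueue = alloc_ordered_workqueue("ksmbd-smbd-%p", 0, sc);
	if (!sc->workqueue)
		return -ENOMEM;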
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 0865ebb943d320..bc05ac7366e690 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -180,7 +180,7 @@ static void put_recvmsg(struct smb_direct_transport *t, list_add(&recvmsg->list, &sc->recv_io.free.list); spin_unlock(&sc->recv_io.free.lock); - queue_work(smb_direct_wq, &sc->recv_io.posted.refill_work); + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); } static void enqueue_reassembly(struct smb_direct_transport *t, @@ -268,7 +268,7 @@ smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t) { struct smbdirect_socket *sc = &t->socket; - queue_work(smb_direct_wq, &sc->disconnect_work); + queue_work(sc->workqueue, &sc->disconnect_work); } static void smb_direct_send_immediate_work(struct work_struct *work) @@ -305,9 +305,9 @@ static void smb_direct_idle_connection_timer(struct work_struct *work) * in order to wait for a response */ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; - mod_delayed_work(smb_direct_wq, &sc->idle.timer_work, + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->keepalive_timeout_msec)); - queue_work(smb_direct_wq, &sc->idle.immediate_work); + queue_work(sc->workqueue, &sc->idle.immediate_work); } static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) @@ -324,6 +324,8 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) smbdirect_socket_init(sc); sp = &sc->parameters; + sc->workqueue = smb_direct_wq; + INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000; @@ -535,7 +537,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) * order to trigger our next keepalive message. 
*/ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; - mod_delayed_work(smb_direct_wq, &sc->idle.timer_work, + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->keepalive_interval_msec)); switch (sc->recv_io.expected) { @@ -607,14 +609,14 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if (le16_to_cpu(data_transfer->flags) & SMBDIRECT_FLAG_RESPONSE_REQUESTED) - queue_work(smb_direct_wq, &sc->idle.immediate_work); + queue_work(sc->workqueue, &sc->idle.immediate_work); if (atomic_read(&sc->send_io.credits.count) > 0) wake_up(&sc->send_io.credits.wait_queue); if (data_length) { if (sc->recv_io.credits.target > old_recv_credit_target) - queue_work(smb_direct_wq, &sc->recv_io.posted.refill_work); + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); enqueue_reassembly(t, recvmsg, (int)data_length); wake_up(&sc->recv_io.reassembly.wait_queue); @@ -823,7 +825,7 @@ static void smb_direct_post_recv_credits(struct work_struct *work) } if (credits) - queue_work(smb_direct_wq, &sc->idle.immediate_work); + queue_work(sc->workqueue, &sc->idle.immediate_work); } static void send_done(struct ib_cq *cq, struct ib_wc *wc) @@ -895,7 +897,7 @@ static int manage_keep_alive_before_sending(struct smb_direct_transport *t) * Now use the keepalive timeout (instead of keepalive interval) * in order to wait for a response */ - mod_delayed_work(smb_direct_wq, &sc->idle.timer_work, + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->keepalive_timeout_msec)); return 1; } @@ -1766,7 +1768,7 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) * so that the timer will cause a disconnect. */ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; - mod_delayed_work(smb_direct_wq, &sc->idle.timer_work, + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, msecs_to_jiffies(sp->negotiate_timeout_msec)); WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); From 01721df4e309e76866dd4d28b78e8683581e9e87 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 01:14:38 +0200 Subject: [PATCH 3093/3206] smb: server: pass struct smbdirect_socket to {get_free,put}_recvmsg() This will make it easier to move function to the common code in future. 
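The long series of conversions that starts here all follows one mechanical pattern: helpers that only touch the socket take struct smbdirect_socket directly, while code that still needs the server-specific transport recovers it with container_of(). A compilable sketch of the pattern (the struct layout is illustrative):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct smbdirect_socket { int status; };
    struct smb_direct_transport { struct smbdirect_socket socket; };

    /* After the refactor, common code only sees the embedded socket... */
    static void sock_helper(struct smbdirect_socket *sc) { sc->status = 1; }

    /* ...and server-only code can still get back to its transport. */
    static struct smb_direct_transport *to_transport(struct smbdirect_socket *sc)
    {
        return container_of(sc, struct smb_direct_transport, socket);
    }

    int main(void)
    {
        struct smb_direct_transport t = { { 0 } };

        sock_helper(&t.socket);
        printf("status=%d same=%d\n", t.socket.status,
               to_transport(&t.socket) == &t);
        return 0;
    }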
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 37 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index bc05ac7366e690..2e14905c15038c 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -147,9 +147,8 @@ static inline void } static struct -smbdirect_recv_io *get_free_recvmsg(struct smb_direct_transport *t) +smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_recv_io *recvmsg = NULL; spin_lock(&sc->recv_io.free.lock); @@ -163,11 +162,9 @@ smbdirect_recv_io *get_free_recvmsg(struct smb_direct_transport *t) return recvmsg; } -static void put_recvmsg(struct smb_direct_transport *t, +static void put_recvmsg(struct smbdirect_socket *sc, struct smbdirect_recv_io *recvmsg) { - struct smbdirect_socket *sc = &t->socket; - if (likely(recvmsg->sge.length != 0)) { ib_dma_unmap_single(sc->ib.dev, recvmsg->sge.addr, @@ -400,7 +397,7 @@ static void free_transport(struct smb_direct_transport *t) if (recvmsg) { list_del(&recvmsg->list); spin_unlock(&sc->recv_io.reassembly.lock); - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); } else { spin_unlock(&sc->recv_io.reassembly.lock); } @@ -515,7 +512,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) t = container_of(sc, struct smb_direct_transport, socket); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); if (wc->status != IB_WC_WR_FLUSH_ERR) { pr_err("Recv error. status='%s (%d)' opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, @@ -543,7 +540,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) switch (sc->recv_io.expected) { case SMBDIRECT_EXPECT_NEGOTIATE_REQ: if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) { - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); smb_direct_disconnect_rdma_connection(t); return; } @@ -561,7 +558,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if (wc->byte_len < offsetof(struct smbdirect_data_transfer, padding)) { - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); smb_direct_disconnect_rdma_connection(t); return; } @@ -571,7 +568,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) data_offset = le32_to_cpu(data_transfer->data_offset); if (wc->byte_len < data_offset || wc->byte_len < (u64)data_offset + data_length) { - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); smb_direct_disconnect_rdma_connection(t); return; } @@ -579,7 +576,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) data_length > sp->max_fragmented_recv_size || (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size) { - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); smb_direct_disconnect_rdma_connection(t); return; } @@ -621,7 +618,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) enqueue_reassembly(t, recvmsg, (int)data_length); wake_up(&sc->recv_io.reassembly.wait_queue); } else - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); return; } @@ -634,7 +631,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) * This is an internal error! 
*/ WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); smb_direct_disconnect_rdma_connection(t); } @@ -760,7 +757,7 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, spin_unlock_irq(&sc->recv_io.reassembly.lock); } queue_removed++; - put_recvmsg(st, recvmsg); + put_recvmsg(sc, recvmsg); offset = 0; } else { offset += to_copy; @@ -806,7 +803,7 @@ static void smb_direct_post_recv_credits(struct work_struct *work) if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) { while (true) { - recvmsg = get_free_recvmsg(t); + recvmsg = get_free_recvmsg(sc); if (!recvmsg) break; @@ -815,7 +812,7 @@ static void smb_direct_post_recv_credits(struct work_struct *work) ret = smb_direct_post_recv(t, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); break; } credits++; @@ -1792,7 +1789,7 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ; - recvmsg = get_free_recvmsg(t); + recvmsg = get_free_recvmsg(sc); if (!recvmsg) return -ENOMEM; @@ -1811,7 +1808,7 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); return 0; out_err: - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); return ret; } @@ -1911,7 +1908,7 @@ static void smb_direct_destroy_pools(struct smb_direct_transport *t) struct smbdirect_socket *sc = &t->socket; struct smbdirect_recv_io *recvmsg; - while ((recvmsg = get_free_recvmsg(t))) + while ((recvmsg = get_free_recvmsg(sc))) mempool_free(recvmsg, sc->recv_io.mem.pool); mempool_destroy(sc->recv_io.mem.pool); @@ -2123,7 +2120,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) sc->recv_io.reassembly.queue_length--; list_del(&recvmsg->list); spin_unlock_irq(&sc->recv_io.reassembly.lock); - put_recvmsg(st, recvmsg); + put_recvmsg(sc, recvmsg); return ret; } From ce85071d2cd63badaa71651033aa9f702ed36734 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 13:50:42 +0200 Subject: [PATCH 3094/3206] smb: server: pass struct smbdirect_socket to smb_direct_{create,destroy}_pools() This will make it easier to move function to the common code in future. 
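The get_free_recvmsg()/put_recvmsg() helpers converted above are a plain lock-protected free list of preallocated receive buffers. A minimal model with a pthread mutex standing in for the spinlock (list layout is illustrative):

    #include <pthread.h>
    #include <stddef.h>

    struct recv_io {
        struct recv_io *next;   /* the payload buffer would follow in memory */
    };

    struct recv_free_list {
        pthread_mutex_t lock;   /* spin_lock(&sc->recv_io.free.lock) in the kernel */
        struct recv_io *head;
    };

    static struct recv_io *get_free_recvmsg(struct recv_free_list *fl)
    {
        struct recv_io *msg;

        pthread_mutex_lock(&fl->lock);
        msg = fl->head;
        if (msg)
            fl->head = msg->next;
        pthread_mutex_unlock(&fl->lock);
        return msg;             /* NULL means the pool is exhausted */
    }

    static void put_recvmsg(struct recv_free_list *fl, struct recv_io *msg)
    {
        pthread_mutex_lock(&fl->lock);
        msg->next = fl->head;
        fl->head = msg;
        pthread_mutex_unlock(&fl->lock);
        /* the real put_recvmsg() additionally unmaps the buffer and queues
         * recv_io.posted.refill_work to repost receives */
    }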
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 2e14905c15038c..7e1afe30ed3caf 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -133,7 +133,7 @@ static inline int get_buf_page_count(void *buf, int size) (uintptr_t)buf / PAGE_SIZE; } -static void smb_direct_destroy_pools(struct smb_direct_transport *transport); +static void smb_direct_destroy_pools(struct smbdirect_socket *sc); static void smb_direct_post_recv_credits(struct work_struct *work); static int smb_direct_post_send_data(struct smb_direct_transport *t, struct smbdirect_send_batch *send_ctx, @@ -413,7 +413,7 @@ static void free_transport(struct smb_direct_transport *t) if (sc->rdma.cm_id) rdma_destroy_id(sc->rdma.cm_id); - smb_direct_destroy_pools(t); + smb_direct_destroy_pools(sc); ksmbd_conn_free(KSMBD_TRANS(t)->conn); } @@ -1903,9 +1903,8 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return 0; } -static void smb_direct_destroy_pools(struct smb_direct_transport *t) +static void smb_direct_destroy_pools(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_recv_io *recvmsg; while ((recvmsg = get_free_recvmsg(sc))) @@ -1924,15 +1923,14 @@ static void smb_direct_destroy_pools(struct smb_direct_transport *t) sc->send_io.mem.cache = NULL; } -static int smb_direct_create_pools(struct smb_direct_transport *t) +static int smb_direct_create_pools(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; char name[80]; int i; struct smbdirect_recv_io *recvmsg; - snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", t); + snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", sc); sc->send_io.mem.cache = kmem_cache_create(name, sizeof(struct smbdirect_send_io) + sizeof(struct smbdirect_negotiate_resp), @@ -1946,7 +1944,7 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) if (!sc->send_io.mem.pool) goto err; - snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", t); + snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc); sc->recv_io.mem.cache = kmem_cache_create(name, sizeof(struct smbdirect_recv_io) + sp->max_recv_size, @@ -1971,7 +1969,7 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) return 0; err: - smb_direct_destroy_pools(t); + smb_direct_destroy_pools(sc); return -ENOMEM; } @@ -2127,6 +2125,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) static int smb_direct_connect(struct smb_direct_transport *st) { + struct smbdirect_socket *sc = &st->socket; int ret; struct ib_qp_cap qp_cap; @@ -2136,7 +2135,7 @@ static int smb_direct_connect(struct smb_direct_transport *st) return ret; } - ret = smb_direct_create_pools(st); + ret = smb_direct_create_pools(sc); if (ret) { pr_err("Can't init RDMA pool: %d\n", ret); return ret; From 57131bf9e87677dcb64c8ce517c220b1fc403022 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 13:55:17 +0200 Subject: [PATCH 3095/3206] smb: server: pass struct smbdirect_socket to smb_direct_get_max_fr_pages() This will make it easier to move function to the common code in future. 
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 7e1afe30ed3caf..547657e14544a8 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1812,10 +1812,8 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) return ret; } -static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t) +static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; - return min_t(unsigned int, sc->ib.dev->attrs.max_fast_reg_page_list_len, 256); @@ -1846,7 +1844,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, * are needed for MR registration, RDMA R/W, local & remote * MR invalidation. */ - sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(t); + sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc); sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size, (sc->rw_io.credits.num_pages - 1) * PAGE_SIZE); From a25075f112b4bc02335d4df930409982405987f6 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 14:04:34 +0200 Subject: [PATCH 3096/3206] smb: server: pass struct smbdirect_socket to smb_direct_init_params() This will make it easier to move function to the common code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 547657e14544a8..ce2426ec5ec6f2 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1819,10 +1819,9 @@ static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc) 256); } -static int smb_direct_init_params(struct smb_direct_transport *t, +static int smb_direct_init_params(struct smbdirect_socket *sc, struct ib_qp_cap *cap) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_device *device = sc->ib.dev; int max_send_sges, max_rw_wrs, max_send_wrs; @@ -2127,7 +2126,7 @@ static int smb_direct_connect(struct smb_direct_transport *st) int ret; struct ib_qp_cap qp_cap; - ret = smb_direct_init_params(st, &qp_cap); + ret = smb_direct_init_params(sc, &qp_cap); if (ret) { pr_err("Can't configure RDMA parameters\n"); return ret; From ece37ea815a70cd170d6922d33b7c9a7e8f2aa9e Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 14:48:01 +0200 Subject: [PATCH 3097/3206] smb: server: pass struct smbdirect_socket to smb_direct_disconnect_rdma_connection() This will make it easier to move function to the common code in future. 
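The rw-credit sizing in smb_direct_init_params() above is simple arithmetic on the fast-registration limit. A standalone sketch with example numbers (PAGE_SIZE and the rationale for the minus-one page are assumptions for illustration):

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* Mirrors smb_direct_get_max_fr_pages(): device limit clamped to 256. */
    static unsigned int max_fr_pages(unsigned int dev_fast_reg_page_list_len)
    {
        return dev_fast_reg_page_list_len < 256 ?
               dev_fast_reg_page_list_len : 256;
    }

    int main(void)
    {
        unsigned int page_size = 4096;                       /* assumed PAGE_SIZE */
        unsigned int num_pages = max_fr_pages(512);          /* example device limit */
        unsigned int max_read_write_size = 8 * 1024 * 1024;  /* example parameter */

        /* One page is subtracted, presumably to leave room for an unaligned
         * first page, giving (num_pages - 1) usable pages per rw credit. */
        unsigned int max_credits =
            DIV_ROUND_UP(max_read_write_size, (num_pages - 1) * page_size);

        printf("num_pages=%u rw_credits_max=%u\n", num_pages, max_credits);
        return 0;
    }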
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index ce2426ec5ec6f2..acf3f8bba22376 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -261,10 +261,8 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) } static void -smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t) +smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; - queue_work(sc->workqueue, &sc->disconnect_work); } @@ -286,11 +284,9 @@ static void smb_direct_idle_connection_timer(struct work_struct *work) struct smbdirect_socket *sc = container_of(work, struct smbdirect_socket, idle.timer_work.work); struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smb_direct_transport *t = - container_of(sc, struct smb_direct_transport, socket); if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); return; } @@ -517,7 +513,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) pr_err("Recv error. status='%s (%d)' opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, wc->opcode); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } return; } @@ -541,7 +537,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) case SMBDIRECT_EXPECT_NEGOTIATE_REQ: if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) { put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); return; } sc->recv_io.reassembly.full_packet_received = true; @@ -559,7 +555,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if (wc->byte_len < offsetof(struct smbdirect_data_transfer, padding)) { put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); return; } @@ -569,7 +565,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if (wc->byte_len < data_offset || wc->byte_len < (u64)data_offset + data_length) { put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); return; } if (remaining_data_length > sp->max_fragmented_recv_size || @@ -577,7 +573,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size) { put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); return; } @@ -632,7 +628,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) */ WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } static int smb_direct_post_recv(struct smb_direct_transport *t, @@ -666,7 +662,7 @@ static int smb_direct_post_recv(struct smb_direct_transport *t, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); recvmsg->sge.length = 0; - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); return ret; } return ret; @@ -844,7 +840,7 @@ static void 
send_done(struct ib_cq *cq, struct ib_wc *wc) pr_err("Send error. status='%s (%d)', opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, wc->opcode); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } if (atomic_dec_and_test(&sc->send_io.pending.count)) @@ -913,7 +909,7 @@ static int smb_direct_post_send(struct smb_direct_transport *t, pr_err("failed to post send: %d\n", ret); if (atomic_dec_and_test(&sc->send_io.pending.count)) wake_up(&sc->send_io.pending.zero_wait_queue); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } return ret; } @@ -1409,15 +1405,13 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, struct smbdirect_rw_io *msg = container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe); struct smbdirect_socket *sc = msg->socket; - struct smb_direct_transport *t = - container_of(sc, struct smb_direct_transport, socket); if (wc->status != IB_WC_SUCCESS) { msg->error = -EIO; pr_err("read/write error. opcode = %d, status = %s(%d)\n", wc->opcode, ib_wc_status_msg(wc->status), wc->status); if (wc->status != IB_WC_WR_FLUSH_ERR) - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } complete(msg->completion); @@ -1655,7 +1649,7 @@ static void smb_direct_qpair_handler(struct ib_event *event, void *context) switch (event->event) { case IB_EVENT_CQ_ERR: case IB_EVENT_QP_FATAL: - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); break; default: break; From 56227a7aa5c6e159205d79444833c0e2874dcc94 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 14:44:16 +0200 Subject: [PATCH 3098/3206] smb: server: pass struct smbdirect_socket to smb_direct_cm_handler() This will make it easier to move function to the common code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index acf3f8bba22376..8ca75afe082c98 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -334,7 +334,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000; sc->rdma.cm_id = cm_id; - cm_id->context = t; + cm_id->context = sc; sc->ib.dev = sc->rdma.cm_id->device; @@ -1600,8 +1600,7 @@ static void smb_direct_shutdown(struct ksmbd_transport *t) static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { - struct smb_direct_transport *t = cm_id->context; - struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket *sc = cm_id->context; ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n", cm_id, rdma_event_msg(event->event), event->event); From f75c226db0d29ee422f9b3ffd0a4730e6c552fbe Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 14:49:18 +0200 Subject: [PATCH 3099/3206] smb: server: pass struct smbdirect_socket to smb_direct_qpair_handler() This will make it easier to move function to the common code in future. 
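Storing the socket instead of the transport in cm_id->context (and, two patches later, in qp_context) is what lets the callbacks above lose their container_of() step. A small model of the context-pointer change (types are illustrative stand-ins for the RDMA CM structures):

    #include <assert.h>

    struct rdma_cm_id_model { void *context; };
    struct smbdirect_socket_model { int status; };

    /* The handler now unwraps the socket directly, with no transport hop. */
    static int cm_handler(struct rdma_cm_id_model *cm_id)
    {
        struct smbdirect_socket_model *sc = cm_id->context;   /* was: transport */
        return sc->status;
    }

    int main(void)
    {
        struct smbdirect_socket_model sc = { 42 };
        struct rdma_cm_id_model cm_id = { &sc };

        assert(cm_handler(&cm_id) == 42);
        return 0;
    }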
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 8ca75afe082c98..9ccc0930e99e50 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1639,8 +1639,7 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, static void smb_direct_qpair_handler(struct ib_event *event, void *context) { - struct smb_direct_transport *t = context; - struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket *sc = context; ksmbd_debug(RDMA, "Received QP event. cm_id=%p, event=%s (%d)\n", sc->rdma.cm_id, ib_event_msg(event->event), event->event); @@ -1980,7 +1979,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, return ret; } - sc->ib.send_cq = ib_alloc_cq(sc->ib.dev, t, + sc->ib.send_cq = ib_alloc_cq(sc->ib.dev, sc, sp->send_credit_target + cap->max_rdma_ctxs, 0, IB_POLL_WORKQUEUE); if (IS_ERR(sc->ib.send_cq)) { @@ -1990,7 +1989,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, goto err; } - sc->ib.recv_cq = ib_alloc_cq(sc->ib.dev, t, + sc->ib.recv_cq = ib_alloc_cq(sc->ib.dev, sc, sp->recv_credit_max, 0, IB_POLL_WORKQUEUE); if (IS_ERR(sc->ib.recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); @@ -2001,7 +2000,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.event_handler = smb_direct_qpair_handler; - qp_attr.qp_context = t; + qp_attr.qp_context = sc; qp_attr.cap = *cap; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; From cf789396479593d4929bdfdc775a45c9b7482360 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 14:50:32 +0200 Subject: [PATCH 3100/3206] smb: server: pass struct smbdirect_socket to smb_direct_create_qpair() This will make it easier to move function to the common code in future. 
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 9ccc0930e99e50..6d1a39afd2a2e5 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1962,10 +1962,9 @@ static int smb_direct_create_pools(struct smbdirect_socket *sc) return -ENOMEM; } -static int smb_direct_create_qpair(struct smb_direct_transport *t, +static int smb_direct_create_qpair(struct smbdirect_socket *sc, struct ib_qp_cap *cap) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; int ret; struct ib_qp_init_attr qp_attr; @@ -2130,7 +2129,7 @@ static int smb_direct_connect(struct smb_direct_transport *st) return ret; } - ret = smb_direct_create_qpair(st, &qp_cap); + ret = smb_direct_create_qpair(sc, &qp_cap); if (ret) { pr_err("Can't accept RDMA client: %d\n", ret); return ret; From 9221b12b356729cb9cb2a0e0ca3062b73aafb04c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 14:52:04 +0200 Subject: [PATCH 3101/3206] smb: server: pass struct smbdirect_socket to smb_direct_post_recv() This will make it easier to move function to the common code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 6d1a39afd2a2e5..01b1d35a0b1c6e 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -631,10 +631,9 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(sc); } -static int smb_direct_post_recv(struct smb_direct_transport *t, +static int smb_direct_post_recv(struct smbdirect_socket *sc, struct smbdirect_recv_io *recvmsg) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_recv_wr wr; int ret; @@ -791,8 +790,6 @@ static void smb_direct_post_recv_credits(struct work_struct *work) { struct smbdirect_socket *sc = container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); - struct smb_direct_transport *t = - container_of(sc, struct smb_direct_transport, socket); struct smbdirect_recv_io *recvmsg; int credits = 0; int ret; @@ -805,7 +802,7 @@ static void smb_direct_post_recv_credits(struct work_struct *work) recvmsg->first_segment = false; - ret = smb_direct_post_recv(t, recvmsg); + ret = smb_direct_post_recv(sc, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); put_recvmsg(sc, recvmsg); @@ -1785,7 +1782,7 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) if (!recvmsg) return -ENOMEM; - ret = smb_direct_post_recv(t, recvmsg); + ret = smb_direct_post_recv(sc, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); goto out_err; From 56bcc18d24f73f66b4211d4d028a626c0a5c56af Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 14:53:13 +0200 Subject: [PATCH 3102/3206] smb: server: pass struct smbdirect_socket to smb_direct_accept_client() This will make it easier to move function to the common code 
in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 01b1d35a0b1c6e..d2836780308e37 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1719,9 +1719,8 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, return 0; } -static int smb_direct_accept_client(struct smb_direct_transport *t) +static int smb_direct_accept_client(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct rdma_conn_param conn_param; __be32 ird_ord_hdr[2]; @@ -1788,7 +1787,7 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) goto out_err; } - ret = smb_direct_accept_client(t); + ret = smb_direct_accept_client(sc); if (ret) { pr_err("Can't accept client\n"); goto out_err; From 7cb0ab73f82f19a7d88d01a2949fc1275f56be39 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 14:54:17 +0200 Subject: [PATCH 3103/3206] smb: server: pass struct smbdirect_socket to smb_direct_prepare_negotiation() This will make it easier to move function to the common code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index d2836780308e37..4ae94b67461fa9 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1766,11 +1766,10 @@ static int smb_direct_accept_client(struct smbdirect_socket *sc) return 0; } -static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) +static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; - int ret; struct smbdirect_recv_io *recvmsg; + int ret; WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; @@ -2131,7 +2130,7 @@ static int smb_direct_connect(struct smb_direct_transport *st) return ret; } - ret = smb_direct_prepare_negotiation(st); + ret = smb_direct_prepare_negotiation(sc); if (ret) { pr_err("Can't negotiate: %d\n", ret); return ret; From 100aec545ad170ac9139dd66e3475501f4b5bda4 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 14:55:50 +0200 Subject: [PATCH 3104/3206] smb: server: pass struct smbdirect_socket to smb_direct_connect() This will make it easier to move function to the common code in future. 
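smb_direct_prepare_negotiation() above also carries an ordering invariant through the conversion: the first receive buffer is posted before rdma_accept() runs, so the client's negotiate request cannot arrive with nothing posted. A toy model of that invariant (the race rationale is an inference; the call order matches the code above):

    #include <assert.h>
    #include <stdbool.h>

    struct negotiate_model { bool recv_posted, accepted; };

    static int post_recv(struct negotiate_model *m)
    {
        m->recv_posted = true;    /* smb_direct_post_recv(sc, recvmsg) */
        return 0;
    }

    static int accept_client(struct negotiate_model *m)
    {
        assert(m->recv_posted);   /* the invariant the ordering preserves */
        m->accepted = true;       /* rdma_accept() happens here */
        return 0;
    }

    static int prepare_negotiation(struct negotiate_model *m)
    {
        if (post_recv(m))
            return -1;
        return accept_client(m);  /* accept only after the recv is posted */
    }

    int main(void)
    {
        struct negotiate_model m = { false, false };
        return prepare_negotiation(&m);
    }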
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 4ae94b67461fa9..f8d08a329153bb 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2106,11 +2106,10 @@ static int smb_direct_prepare(struct ksmbd_transport *t) return ret; } -static int smb_direct_connect(struct smb_direct_transport *st) +static int smb_direct_connect(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &st->socket; - int ret; struct ib_qp_cap qp_cap; + int ret; ret = smb_direct_init_params(sc, &qp_cap); if (ret) { @@ -2232,7 +2231,7 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, sp->responder_resources = min_t(u8, sp->responder_resources, peer_responder_resources); - ret = smb_direct_connect(t); + ret = smb_direct_connect(sc); if (ret) goto out_err; From 789cfc2ffa72d75394cee78abf3fd4d57d623399 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 15:07:35 +0200 Subject: [PATCH 3105/3206] smb: server: pass struct smbdirect_socket to smb_direct_{alloc,free}_sendmsg() This will make it easier to move function to the common code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index f8d08a329153bb..05900f62f2e7ab 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -414,9 +414,8 @@ static void free_transport(struct smb_direct_transport *t) } static struct smbdirect_send_io -*smb_direct_alloc_sendmsg(struct smb_direct_transport *t) +*smb_direct_alloc_sendmsg(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_send_io *msg; msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP); @@ -428,10 +427,9 @@ static struct smbdirect_send_io return msg; } -static void smb_direct_free_sendmsg(struct smb_direct_transport *t, +static void smb_direct_free_sendmsg(struct smbdirect_socket *sc, struct smbdirect_send_io *msg) { - struct smbdirect_socket *sc = &t->socket; int i; if (msg->num_sge > 0) { @@ -821,13 +819,11 @@ static void smb_direct_post_recv_credits(struct work_struct *work) static void send_done(struct ib_cq *cq, struct ib_wc *wc) { struct smbdirect_send_io *sendmsg, *sibling; - struct smb_direct_transport *t; struct smbdirect_socket *sc; struct list_head *pos, *prev, *end; sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); sc = sendmsg->socket; - t = container_of(sc, struct smb_direct_transport, socket); ksmbd_debug(RDMA, "Send completed. 
status='%s (%d)', opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, @@ -849,11 +845,11 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) for (pos = &sendmsg->sibling_list, prev = pos->prev, end = sendmsg->sibling_list.next; prev != end; pos = prev, prev = prev->prev) { sibling = container_of(pos, struct smbdirect_send_io, sibling_list); - smb_direct_free_sendmsg(t, sibling); + smb_direct_free_sendmsg(sc, sibling); } sibling = container_of(pos, struct smbdirect_send_io, sibling_list); - smb_direct_free_sendmsg(t, sibling); + smb_direct_free_sendmsg(sc, sibling); } static int manage_credits_prior_sending(struct smb_direct_transport *t) @@ -957,7 +953,7 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, wake_up(&sc->send_io.credits.wait_queue); list_for_each_entry_safe(first, last, &send_ctx->msg_list, sibling_list) { - smb_direct_free_sendmsg(t, first); + smb_direct_free_sendmsg(sc, first); } } return ret; } @@ -1032,7 +1028,7 @@ static int smb_direct_create_header(struct smb_direct_transport *t, int header_length; int ret; - sendmsg = smb_direct_alloc_sendmsg(t); + sendmsg = smb_direct_alloc_sendmsg(sc); if (IS_ERR(sendmsg)) return PTR_ERR(sendmsg); @@ -1075,7 +1071,7 @@ static int smb_direct_create_header(struct smb_direct_transport *t, DMA_TO_DEVICE); ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); if (ret) { - smb_direct_free_sendmsg(t, sendmsg); + smb_direct_free_sendmsg(sc, sendmsg); return ret; } @@ -1231,7 +1227,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, goto err; return 0; err: - smb_direct_free_sendmsg(t, msg); + smb_direct_free_sendmsg(sc, msg); atomic_inc(&sc->send_io.credits.count); return ret; } @@ -1660,7 +1656,7 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, struct smbdirect_negotiate_resp *resp; int ret; - sendmsg = smb_direct_alloc_sendmsg(t); + sendmsg = smb_direct_alloc_sendmsg(sc); if (IS_ERR(sendmsg)) return -ENOMEM; @@ -1696,7 +1692,7 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, DMA_TO_DEVICE); ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); if (ret) { - smb_direct_free_sendmsg(t, sendmsg); + smb_direct_free_sendmsg(sc, sendmsg); return ret; } @@ -1706,7 +1702,7 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, ret = post_sendmsg(t, NULL, sendmsg); if (ret) { - smb_direct_free_sendmsg(t, sendmsg); + smb_direct_free_sendmsg(sc, sendmsg); return ret; } From 56b442a248b18e20b7a1b17d8bce2a97c62c8063 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 15:08:27 +0200 Subject: [PATCH 3106/3206] smb: server: remove unused struct smb_direct_transport argument from smb_direct_send_ctx_init() This will make it easier to move function to the common code in future.
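The send_done() hunk above frees a whole batch at once: only the last work request in a chain is signaled, and its completion walks the sibling list releasing every message posted with it. A simplified model with a singly linked list in place of the kernel's circular list_head walk:

    #include <stdio.h>
    #include <stdlib.h>

    struct send_io { struct send_io *sibling; int id; };

    /* Completion of the one signaled send releases all of its siblings. */
    static void send_done(struct send_io *signaled)
    {
        struct send_io *cur = signaled;

        while (cur) {
            struct send_io *next = cur->sibling;
            printf("freeing send_io %d\n", cur->id);
            free(cur);
            cur = next;
        }
    }

    int main(void)
    {
        struct send_io *head = NULL;

        for (int i = 2; i >= 0; i--) {   /* build a batch of three messages */
            struct send_io *m = malloc(sizeof(*m));
            if (!m)
                return 1;
            m->sibling = head;
            m->id = i;
            head = m;
        }
        send_done(head);
        return 0;
    }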
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 05900f62f2e7ab..a9cee7f3da856f 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -907,8 +907,7 @@ static int smb_direct_post_send(struct smb_direct_transport *t, return ret; } -static void smb_direct_send_ctx_init(struct smb_direct_transport *t, - struct smbdirect_send_batch *send_ctx, +static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx, bool need_invalidate_rkey, unsigned int remote_key) { @@ -945,7 +944,7 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, ret = smb_direct_post_send(t, &first->wr); if (!ret) { - smb_direct_send_ctx_init(t, send_ctx, + smb_direct_send_ctx_init(send_ctx, send_ctx->need_invalidate_rkey, send_ctx->remote_key); } else { @@ -1261,7 +1260,7 @@ static int smb_direct_writev(struct ksmbd_transport *t, remaining_data_length = buflen; ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); - smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key); + smb_direct_send_ctx_init(&send_ctx, need_invalidate, remote_key); while (remaining_data_length) { struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */ size_t possible_bytes = max_iov_size; From ecb56dbc93c67a0e03b6c4877ac1ead4cde0e9ca Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 15:46:53 +0200 Subject: [PATCH 3107/3206] smb: server: pass struct smbdirect_socket to smb_direct_post_send() This will make it easier to move function to the common code in future. 
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index a9cee7f3da856f..c5bc4a850e6b8c 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -890,10 +890,9 @@ static int manage_keep_alive_before_sending(struct smb_direct_transport *t) return 0; } -static int smb_direct_post_send(struct smb_direct_transport *t, +static int smb_direct_post_send(struct smbdirect_socket *sc, struct ib_send_wr *wr) { - struct smbdirect_socket *sc = &t->socket; int ret; atomic_inc(&sc->send_io.pending.count); @@ -942,7 +941,7 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, last->wr.ex.invalidate_rkey = send_ctx->remote_key; } - ret = smb_direct_post_send(t, &first->wr); + ret = smb_direct_post_send(sc, &first->wr); if (!ret) { smb_direct_send_ctx_init(send_ctx, send_ctx->need_invalidate_rkey, @@ -1162,7 +1161,7 @@ static int post_sendmsg(struct smb_direct_transport *t, msg->wr.wr_cqe = &msg->cqe; msg->wr.send_flags = IB_SEND_SIGNALED; - return smb_direct_post_send(t, &msg->wr); + return smb_direct_post_send(sc, &msg->wr); } static int smb_direct_post_send_data(struct smb_direct_transport *t, From d14910bbf64404cb74f35419ca2cb427a558a43c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 15:48:05 +0200 Subject: [PATCH 3108/3206] smb: server: pass struct smbdirect_socket to smb_direct_flush_send_list() This will make it easier to move function to the common code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c5bc4a850e6b8c..dc5a3cf9d0c764 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -916,11 +916,10 @@ static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx, send_ctx->remote_key = remote_key; } -static int smb_direct_flush_send_list(struct smb_direct_transport *t, +static int smb_direct_flush_send_list(struct smbdirect_socket *sc, struct smbdirect_send_batch *send_ctx, bool is_last) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_send_io *first, *last; int ret; @@ -988,7 +987,7 @@ static int wait_for_send_credits(struct smb_direct_transport *t, if (send_ctx && (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) { - ret = smb_direct_flush_send_list(t, send_ctx, false); + ret = smb_direct_flush_send_list(sc, send_ctx, false); if (ret) return ret; } @@ -1358,7 +1357,7 @@ static int smb_direct_writev(struct ksmbd_transport *t, } done: - ret = smb_direct_flush_send_list(st, &send_ctx, true); + ret = smb_direct_flush_send_list(sc, &send_ctx, true); if (unlikely(!ret && error)) ret = error; From 7f4805b7db94b75003d08a97f5125617b42c625b Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 15:49:40 +0200 Subject: [PATCH 3109/3206] smb: server: pass struct smbdirect_socket to wait_for_credits() This will make it easier to move function to the common code in future. 
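wait_for_send_credits() above encodes the batching policy: a pending batch is flushed early once it reaches 16 work requests or send credits are nearly exhausted, rather than waiting for the caller's final flush. The thresholds, extracted as a standalone predicate:

    #include <stdbool.h>
    #include <stdio.h>

    /* Mirrors the flush condition in wait_for_send_credits(). */
    static bool should_flush_early(int batched_wrs, int send_credits)
    {
        return batched_wrs >= 16 || send_credits <= 1;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               should_flush_early(16, 10),   /* 1: batch is full */
               should_flush_early(3, 1),     /* 1: credits nearly gone */
               should_flush_early(3, 10));   /* 0: keep batching */
        return 0;
    }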
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index dc5a3cf9d0c764..2ae6500dad2c37 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -956,11 +956,10 @@ static int smb_direct_flush_send_list(struct smbdirect_socket *sc, return ret; } -static int wait_for_credits(struct smb_direct_transport *t, +static int wait_for_credits(struct smbdirect_socket *sc, wait_queue_head_t *waitq, atomic_t *total_credits, int needed) { - struct smbdirect_socket *sc = &t->socket; int ret; do { @@ -992,14 +991,14 @@ static int wait_for_send_credits(struct smb_direct_transport *t, return ret; } - return wait_for_credits(t, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1); + return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1); } static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) { struct smbdirect_socket *sc = &t->socket; - return wait_for_credits(t, + return wait_for_credits(sc, &sc->rw_io.credits.wait_queue, &sc->rw_io.credits.count, credits); From 2dc6c7e8ba3f348b8bcf348c0f736898170225fd Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 15:51:53 +0200 Subject: [PATCH 3110/3206] smb: server: pass struct smbdirect_socket to wait_for_send_credits() This will make it easier to move function to the common code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 2ae6500dad2c37..8a9bf68a0ef93f 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -978,10 +978,9 @@ static int wait_for_credits(struct smbdirect_socket *sc, } while (true); } -static int wait_for_send_credits(struct smb_direct_transport *t, +static int wait_for_send_credits(struct smbdirect_socket *sc, struct smbdirect_send_batch *send_ctx) { - struct smbdirect_socket *sc = &t->socket; int ret; if (send_ctx && @@ -1173,7 +1172,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, int data_length; struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1]; - ret = wait_for_send_credits(t, send_ctx); + ret = wait_for_send_credits(sc, send_ctx); if (ret) return ret; From cae2d9a2298d04c237c6b7c57b9584068a2223df Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 15:52:47 +0200 Subject: [PATCH 3111/3206] smb: server: pass struct smbdirect_socket to wait_for_rw_credits() This will make it easier to move function to the common code in future. 
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 8a9bf68a0ef93f..02982cadd30482 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -993,10 +993,8 @@ static int wait_for_send_credits(struct smbdirect_socket *sc, return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1); } -static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) +static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits) { - struct smbdirect_socket *sc = &t->socket; - return wait_for_credits(sc, &sc->rw_io.credits.wait_queue, &sc->rw_io.credits.count, @@ -1464,7 +1462,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", str_read_write(is_read), buf_len, credits_needed); - ret = wait_for_rw_credits(t, credits_needed); + ret = wait_for_rw_credits(sc, credits_needed); if (ret < 0) return ret; From b156d2c559b0ac9b410effd081262d96aa9bf00e Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 15:55:20 +0200 Subject: [PATCH 3112/3206] smb: server: pass struct smbdirect_socket to calc_rw_credits() This will make it easier to move function to the common code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 02982cadd30482..9f47faf0ce09fb 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1001,11 +1001,9 @@ static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits) credits); } -static int calc_rw_credits(struct smb_direct_transport *t, +static int calc_rw_credits(struct smbdirect_socket *sc, char *buf, unsigned int len) { - struct smbdirect_socket *sc = &t->socket; - return DIV_ROUND_UP(get_buf_page_count(buf, len), sc->rw_io.credits.num_pages); } @@ -1453,7 +1451,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, buf_len = 0; } - credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len); + credits_needed += calc_rw_credits(sc, desc_buf, desc_buf_len); desc_buf += desc_buf_len; buf_len -= desc_buf_len; desc_num++; From 663b3c3c864c906373ef231cbb1755b353cb5f9f Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 15:57:41 +0200 Subject: [PATCH 3113/3206] smb: server: pass struct smbdirect_socket to manage_credits_prior_sending() This will make it easier to move function to the common code in future. 
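calc_rw_credits() above is pure page arithmetic; together with get_buf_page_count() from earlier in the file, it can be reproduced standalone (PAGE_SIZE is assumed to be 4096 here):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096u
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* Mirrors get_buf_page_count(): pages touched by [buf, buf + size),
     * which exceeds size / PAGE_SIZE when the buffer is unaligned. */
    static unsigned int get_buf_page_count(uintptr_t buf, unsigned int size)
    {
        return DIV_ROUND_UP(buf + size, PAGE_SIZE) - buf / PAGE_SIZE;
    }

    /* Mirrors calc_rw_credits(): one credit covers num_pages pages of MR. */
    static unsigned int calc_rw_credits(unsigned int num_pages,
                                        uintptr_t buf, unsigned int len)
    {
        return DIV_ROUND_UP(get_buf_page_count(buf, len), num_pages);
    }

    int main(void)
    {
        /* 64KiB starting 100 bytes into a page spans 17 pages, not 16. */
        printf("pages=%u credits=%u\n",
               get_buf_page_count(100, 65536),
               calc_rw_credits(256, 100, 65536));
        return 0;
    }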
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 9f47faf0ce09fb..d7cc593705ddd4 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -852,9 +852,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_free_sendmsg(sc, sibling); } -static int manage_credits_prior_sending(struct smb_direct_transport *t) +static int manage_credits_prior_sending(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; int new_credits; if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) @@ -1026,7 +1025,7 @@ static int smb_direct_create_header(struct smb_direct_transport *t, /* Fill in the packet header */ packet = (struct smbdirect_data_transfer *)sendmsg->packet; packet->credits_requested = cpu_to_le16(sp->send_credit_target); - packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); + packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); packet->flags = 0; if (manage_keep_alive_before_sending(t)) @@ -1667,7 +1666,7 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, resp->reserved = 0; resp->credits_requested = cpu_to_le16(sp->send_credit_target); - resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); + resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size); resp->preferred_send_size = cpu_to_le32(sp->max_send_size); resp->max_receive_size = cpu_to_le32(sp->max_recv_size); From 4d1dffe0a08d87eb4fb1a62e4d76f541849db94a Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 15:59:35 +0200 Subject: [PATCH 3114/3206] smb: server: pass struct smbdirect_socket to manage_keep_alive_before_sending() This will make it easier to move function to the common code in future. 
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index d7cc593705ddd4..4bf5903fadb6a0 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -871,9 +871,8 @@ static int manage_credits_prior_sending(struct smbdirect_socket *sc) return new_credits; } -static int manage_keep_alive_before_sending(struct smb_direct_transport *t) +static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { @@ -1028,7 +1027,7 @@ static int smb_direct_create_header(struct smb_direct_transport *t, packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); packet->flags = 0; - if (manage_keep_alive_before_sending(t)) + if (manage_keep_alive_before_sending(sc)) packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); packet->reserved = 0; From 0a715db380444089e2907fa72ed712d2150aae6c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 16:02:21 +0200 Subject: [PATCH 3115/3206] smb: server: pass struct smbdirect_socket to smb_direct_create_header() This will make it easier to move function to the common code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 4bf5903fadb6a0..5f4d3ebac07c3d 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1006,11 +1006,10 @@ static int calc_rw_credits(struct smbdirect_socket *sc, sc->rw_io.credits.num_pages); } -static int smb_direct_create_header(struct smb_direct_transport *t, +static int smb_direct_create_header(struct smbdirect_socket *sc, int size, int remaining_data_length, struct smbdirect_send_io **sendmsg_out) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_send_io *sendmsg; struct smbdirect_data_transfer *packet; @@ -1174,7 +1173,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, for (i = 0; i < niov; i++) data_length += iov[i].iov_len; - ret = smb_direct_create_header(t, data_length, remaining_data_length, + ret = smb_direct_create_header(sc, data_length, remaining_data_length, &msg); if (ret) { atomic_inc(&sc->send_io.credits.count); From ab83128e65a1f56db292a6db9b96750a39ba074b Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 16:03:52 +0200 Subject: [PATCH 3116/3206] smb: server: pass struct smbdirect_socket to post_sendmsg() This will make it easier to move function to the common code in future. 
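For reference, the header that smb_direct_create_header() fills corresponds to the SMB-Direct data transfer message from MS-SMBD. A sketch of the layout (field packing per the spec; the cpu_to_le16()/cpu_to_le32() conversions done by the kernel are elided in this model):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* 20-byte SMB-Direct data transfer header; all fields little-endian
     * on the wire (conversions elided here). */
    struct smbdirect_data_transfer_model {
        uint16_t credits_requested;
        uint16_t credits_granted;
        uint16_t flags;              /* 0x0001 = response requested (keepalive) */
        uint16_t reserved;
        uint32_t remaining_data_length;
        uint32_t data_offset;        /* 0 when the message carries no payload */
        uint32_t data_length;
    } __attribute__((packed));

    int main(void)
    {
        struct smbdirect_data_transfer_model pkt;

        memset(&pkt, 0, sizeof(pkt));
        pkt.credits_requested = 255; /* e.g. sp->send_credit_target */
        pkt.credits_granted = 32;    /* result of manage_credits_prior_sending() */
        pkt.flags = 0x0001;          /* set when a keepalive is pending */
        printf("header is %zu bytes\n", sizeof(pkt));
        return 0;
    }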
Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 5f4d3ebac07c3d..4fc1da244bd3f2 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1115,11 +1115,10 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, return ib_dma_map_sg(device, sg_list, npages, dir); } -static int post_sendmsg(struct smb_direct_transport *t, +static int post_sendmsg(struct smbdirect_socket *sc, struct smbdirect_send_batch *send_ctx, struct smbdirect_send_io *msg) { - struct smbdirect_socket *sc = &t->socket; int i; for (i = 0; i < msg->num_sge; i++) @@ -1210,7 +1209,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, } } - ret = post_sendmsg(t, send_ctx, msg); + ret = post_sendmsg(sc, send_ctx, msg); if (ret) goto err; return 0; @@ -1688,7 +1687,7 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, sendmsg->sge[0].length = sizeof(*resp); sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; - ret = post_sendmsg(t, NULL, sendmsg); + ret = post_sendmsg(sc, NULL, sendmsg); if (ret) { smb_direct_free_sendmsg(sc, sendmsg); return ret; From c0cb9823ac52be33a974b3bfe698e62507f456db Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 16:06:09 +0200 Subject: [PATCH 3117/3206] smb: server: pass struct smbdirect_socket to smb_direct_post_send_data() This will make it easier to move function to the common code in future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 4fc1da244bd3f2..4b0147c3b3aabe 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -135,7 +135,7 @@ static inline int get_buf_page_count(void *buf, int size) static void smb_direct_destroy_pools(struct smbdirect_socket *sc); static void smb_direct_post_recv_credits(struct work_struct *work); -static int smb_direct_post_send_data(struct smb_direct_transport *t, +static int smb_direct_post_send_data(struct smbdirect_socket *sc, struct smbdirect_send_batch *send_ctx, struct kvec *iov, int niov, int remaining_data_length); @@ -270,13 +270,11 @@ static void smb_direct_send_immediate_work(struct work_struct *work) { struct smbdirect_socket *sc = container_of(work, struct smbdirect_socket, idle.immediate_work); - struct smb_direct_transport *t = - container_of(sc, struct smb_direct_transport, socket); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return; - smb_direct_post_send_data(t, NULL, NULL, 0, 0); + smb_direct_post_send_data(sc, NULL, NULL, 0, 0); } static void smb_direct_idle_connection_timer(struct work_struct *work) @@ -1153,12 +1151,11 @@ static int post_sendmsg(struct smbdirect_socket *sc, return smb_direct_post_send(sc, &msg->wr); } -static int smb_direct_post_send_data(struct smb_direct_transport *t, +static int smb_direct_post_send_data(struct smbdirect_socket *sc, struct smbdirect_send_batch *send_ctx, struct kvec *iov, int niov, int remaining_data_length) 
{ - struct smbdirect_socket *sc = &t->socket; int i, j, ret; struct smbdirect_send_io *msg; int data_length; @@ -1337,7 +1334,7 @@ static int smb_direct_writev(struct ksmbd_transport *t, remaining_data_length -= bytes; - ret = smb_direct_post_send_data(st, &send_ctx, + ret = smb_direct_post_send_data(sc, &send_ctx, vecs, nvecs, remaining_data_length); if (unlikely(ret)) { From 5e90c56e0e8b787b3a7ab1ff565c93c918f27f1c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 16:08:49 +0200 Subject: [PATCH 3118/3206] smb: server: pass struct smbdirect_socket to {enqueue,get_first}_reassembly() This will make it easier to move the functions to the common code in the future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 4b0147c3b3aabe..afd03b6a0313ff 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -180,12 +180,10 @@ static void put_recvmsg(struct smbdirect_socket *sc, queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); } -static void enqueue_reassembly(struct smb_direct_transport *t, +static void enqueue_reassembly(struct smbdirect_socket *sc, struct smbdirect_recv_io *recvmsg, int data_length) { - struct smbdirect_socket *sc = &t->socket; - spin_lock(&sc->recv_io.reassembly.lock); list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list); sc->recv_io.reassembly.queue_length++; @@ -200,10 +198,8 @@ static void enqueue_reassembly(struct smb_direct_transport *t, spin_unlock(&sc->recv_io.reassembly.lock); } -static struct smbdirect_recv_io *get_first_reassembly(struct smb_direct_transport *t) +static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &t->socket; - if (!list_empty(&sc->recv_io.reassembly.list)) return list_first_entry(&sc->recv_io.reassembly.list, struct smbdirect_recv_io, list); @@ -387,7 +383,7 @@ static void free_transport(struct smb_direct_transport *t) ksmbd_debug(RDMA, "drain the reassembly queue\n"); do { spin_lock(&sc->recv_io.reassembly.lock); - recvmsg = get_first_reassembly(t); + recvmsg = get_first_reassembly(sc); if (recvmsg) { list_del(&recvmsg->list); spin_unlock(&sc->recv_io.reassembly.lock); @@ -494,14 +490,12 @@ static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg) static void recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct smbdirect_recv_io *recvmsg; - struct smb_direct_transport *t; struct smbdirect_socket *sc; struct smbdirect_socket_parameters *sp; recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); sc = recvmsg->socket; sp = &sc->parameters; - t = container_of(sc, struct smb_direct_transport, socket); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { put_recvmsg(sc, recvmsg); @@ -539,7 +533,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = true; WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; - enqueue_reassembly(t, recvmsg, 0); + enqueue_reassembly(sc, recvmsg, 0); wake_up(&sc->status_wait); return; case SMBDIRECT_EXPECT_DATA_TRANSFER: { @@ -607,7 +601,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) if
(sc->recv_io.credits.target > old_recv_credit_target) queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); - enqueue_reassembly(t, recvmsg, (int)data_length); + enqueue_reassembly(sc, recvmsg, (int)data_length); wake_up(&sc->recv_io.reassembly.wait_queue); } else put_recvmsg(sc, recvmsg); @@ -702,7 +696,7 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, to_read = size; offset = sc->recv_io.reassembly.first_entry_offset; while (data_read < size) { - recvmsg = get_first_reassembly(st); + recvmsg = get_first_reassembly(sc); data_transfer = smbdirect_recv_io_payload(recvmsg); data_length = le32_to_cpu(data_transfer->data_length); remaining_data_length = @@ -2054,7 +2048,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING) return ret < 0 ? ret : -ETIMEDOUT; - recvmsg = get_first_reassembly(st); + recvmsg = get_first_reassembly(sc); if (!recvmsg) return -ECONNABORTED; From 9d0050874767ec5977843d4bfca3aeb9a43a8f74 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Fri, 22 Aug 2025 16:11:34 +0200 Subject: [PATCH 3119/3206] smb: server: pass struct smbdirect_socket to smb_direct_send_negotiate_response() This will make it easier to move the function to the common code in the future. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index afd03b6a0313ff..a61898ea2a3f5b 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1625,10 +1625,9 @@ static void smb_direct_qpair_handler(struct ib_event *event, void *context) } } -static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, +static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc, int failed) { - struct smbdirect_socket *sc = &t->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_send_io *sendmsg; struct smbdirect_negotiate_resp *resp; @@ -2069,7 +2068,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1); - ret = smb_direct_send_negotiate_response(st, ret); + ret = smb_direct_send_negotiate_response(sc, ret); out: spin_lock_irq(&sc->recv_io.reassembly.lock); sc->recv_io.reassembly.queue_length--; From b3fd52a0d85c688a381e2ce72912db0a73727ce9 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 28 Aug 2025 10:39:56 +0200 Subject: [PATCH 3120/3206] smb: server: let smb_direct_disconnect_rdma_connection() set SMBDIRECT_SOCKET_ERROR... smb_direct_disconnect_rdma_connection() should turn the status into an error state instead of leaving it as is until smb_direct_disconnect_rdma_work() is running.
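In condensed form, the mapping the hunk below implements looks like this (a sketch only: the SMBDIRECT_SOCKET_* states are the existing ones, the helper name is made up, and the cases elided here behave analogously):

    static void sketch_mark_status_failed(struct smbdirect_socket *sc)
    {
            switch (sc->status) {
            case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
            case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
                    /* an in-progress phase fails as that phase */
                    sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
                    break;
            case SMBDIRECT_SOCKET_CREATED:
            case SMBDIRECT_SOCKET_CONNECTED:
                    /* a healthy socket degrades to a generic error */
                    sc->status = SMBDIRECT_SOCKET_ERROR;
                    break;
            default:
                    /* already failed/disconnecting/destroyed: keep it */
                    break;
            }
    }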
Cc: Steve French Cc: Tom Talpey Cc: Long Li Acked-by: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index a61898ea2a3f5b..67345c58bfe9ab 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -259,6 +259,46 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) static void smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) { + switch (sc->status) { + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_ERROR: + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + /* + * Keep the current error status + */ + break; + + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; + break; + + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; + break; + + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; + break; + + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_CONNECTED: + sc->status = SMBDIRECT_SOCKET_ERROR; + break; + } + queue_work(sc->workqueue, &sc->disconnect_work); } From 3d71e7cdbcf056a0b8ca3e9634f161b9a7b1e634 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 28 Aug 2025 12:14:07 +0200 Subject: [PATCH 3121/3206] smb: server: fill in smbdirect_socket.first_error on error For now we just use -ECONNABORTED, but it will get more detailed later. 
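The idiom is write-once; a minimal sketch (the helper name is hypothetical, the patch open-codes this test at each site):

    static void sketch_note_first_error(struct smbdirect_socket *sc, int err)
    {
            /* only the first recorded error sticks; later ones are ignored */
            if (sc->first_error == 0)
                    sc->first_error = err;
    }

Every call site below passes -ECONNABORTED for now.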
Acked-by: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 67345c58bfe9ab..44fa0af21b453d 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -222,6 +222,9 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) disable_delayed_work(&sc->idle.timer_work); disable_work(&sc->idle.immediate_work); + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + switch (sc->status) { case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: @@ -259,6 +262,9 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) static void smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) { + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + switch (sc->status) { case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: From 7d729df9945a5ff9d4947bbc3c1297040ae3c507 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 28 Aug 2025 12:17:23 +0200 Subject: [PATCH 3122/3206] smb: server: let smb_direct_disconnect_rdma_connection() disable all work but disconnect_work There's no point in running these if we already know the connection is broken. Acked-by: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 44fa0af21b453d..cd4398ae8b9837 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -262,6 +262,15 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) static void smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) { + /* + * make sure other work (than disconnect_work) is + * not queued again but here we don't block and avoid + * disable[_delayed]_work_sync() + */ + disable_work(&sc->recv_io.posted.refill_work); + disable_work(&sc->idle.immediate_work); + disable_delayed_work(&sc->idle.timer_work); + if (sc->first_error == 0) sc->first_error = -ECONNABORTED; From 0491f26f8980d7a4028d3528df09e23f5a64adbe Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 2 Sep 2025 12:36:50 +0200 Subject: [PATCH 3123/3206] smb: server: let {free_transport,smb_direct_disconnect_rdma_{work,connection}}() wake up all wait queues This is important in order to let all waiters notice a broken connection. We also go via smb_direct_disconnect_rdma_{work,connection}() for broken connections.
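For context, a sketch of the waiter side this is meant to unblock (the wait queue and status names are the existing ones; the exact wait condition differs per call site):

    /* without a wake_up_all() on disconnect, a waiter like this could
     * sleep forever on a connection that is already dead */
    wait_event(sc->status_wait,
               sc->status != SMBDIRECT_SOCKET_CONNECTED);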
Acked-by: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 40 +++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index cd4398ae8b9837..ba4dfdcb321a37 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -207,6 +207,19 @@ static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *s +static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc) +{ + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + */ + wake_up_all(&sc->status_wait); + wake_up_all(&sc->send_io.credits.wait_queue); + wake_up_all(&sc->send_io.pending.zero_wait_queue); + wake_up_all(&sc->recv_io.reassembly.wait_queue); + wake_up_all(&sc->rw_io.credits.wait_queue); +} + static void smb_direct_disconnect_rdma_work(struct work_struct *work) { struct smbdirect_socket *sc = @@ -257,6 +270,12 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) case SMBDIRECT_SOCKET_DESTROYED: break; } + + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + */ + smb_direct_disconnect_wake_up_all(sc); } static void @@ -314,6 +333,12 @@ smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) break; } + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + */ + smb_direct_disconnect_wake_up_all(sc); + queue_work(sc->workqueue, &sc->disconnect_work); } @@ -421,8 +446,14 @@ static void free_transport(struct smb_direct_transport *t) sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } - wake_up_all(&sc->send_io.credits.wait_queue); - wake_up_all(&sc->send_io.pending.zero_wait_queue); + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + * + * Most likely this was already called via + * smb_direct_disconnect_rdma_work(), but call it again... + */ + smb_direct_disconnect_wake_up_all(sc); disable_work_sync(&sc->recv_io.posted.refill_work); disable_delayed_work_sync(&sc->idle.timer_work); @@ -1644,14 +1675,11 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, sc->status = SMBDIRECT_SOCKET_DISCONNECTED; smb_direct_disconnect_rdma_work(&sc->disconnect_work); - wake_up_all(&sc->status_wait); - wake_up_all(&sc->recv_io.reassembly.wait_queue); - wake_up_all(&sc->send_io.credits.wait_queue); break; } case RDMA_CM_EVENT_CONNECT_ERROR: { sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_all(&sc->status_wait); + smb_direct_disconnect_rdma_work(&sc->disconnect_work); break; } default: From 8aa23bae607e3bd5ced892afc26bfbe8945f531b Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 27 Aug 2025 15:54:35 +0200 Subject: [PATCH 3124/3206] smb: server: make consistent use of spin_lock_irq{save,restore}() in transport_rdma.c There is a mix of using spin_lock() and spin_lock_irq(), which is confusing as IB_POLL_WORKQUEUE is used and no code would be called from any interrupt. So using spin_lock() or even mutexes would be ok. But we'll soon share common code with the client, which uses IB_POLL_SOFTIRQ. And Documentation/kernel-hacking/locking.rst section "Cheat Sheet For Locking" says: - Otherwise (== data can be touched in an interrupt), use spin_lock_irqsave() and spin_unlock_irqrestore().
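Concretely, the shape this file converges on is (taken from the hunks below):

    unsigned long flags;

    spin_lock_irqsave(&sc->recv_io.free.lock, flags);
    list_add(&recvmsg->list, &sc->recv_io.free.list);
    spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);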
So in order to keep it simple and safe we use that version now. It will help merging functions into common code and have consistent locking in all cases. Acked-by: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 39 +++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index ba4dfdcb321a37..f9734d7025b43d 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -150,21 +150,24 @@ static struct smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc) { struct smbdirect_recv_io *recvmsg = NULL; + unsigned long flags; - spin_lock(&sc->recv_io.free.lock); + spin_lock_irqsave(&sc->recv_io.free.lock, flags); if (!list_empty(&sc->recv_io.free.list)) { recvmsg = list_first_entry(&sc->recv_io.free.list, struct smbdirect_recv_io, list); list_del(&recvmsg->list); } - spin_unlock(&sc->recv_io.free.lock); + spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); return recvmsg; } static void put_recvmsg(struct smbdirect_socket *sc, struct smbdirect_recv_io *recvmsg) { + unsigned long flags; + if (likely(recvmsg->sge.length != 0)) { ib_dma_unmap_single(sc->ib.dev, recvmsg->sge.addr, @@ -173,9 +176,9 @@ static void put_recvmsg(struct smbdirect_socket *sc, recvmsg->sge.length = 0; } - spin_lock(&sc->recv_io.free.lock); + spin_lock_irqsave(&sc->recv_io.free.lock, flags); list_add(&recvmsg->list, &sc->recv_io.free.list); - spin_unlock(&sc->recv_io.free.lock); + spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); } @@ -184,7 +187,9 @@ static void enqueue_reassembly(struct smbdirect_socket *sc, struct smbdirect_recv_io *recvmsg, int data_length) { - spin_lock(&sc->recv_io.reassembly.lock); + unsigned long flags; + + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list); sc->recv_io.reassembly.queue_length++; /* @@ -195,7 +200,7 @@ static void enqueue_reassembly(struct smbdirect_socket *sc, */ virt_wmb(); sc->recv_io.reassembly.data_length += data_length; - spin_unlock(&sc->recv_io.reassembly.lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); } static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc) @@ -468,14 +473,16 @@ static void free_transport(struct smb_direct_transport *t) ksmbd_debug(RDMA, "drain the reassembly queue\n"); do { - spin_lock(&sc->recv_io.reassembly.lock); + unsigned long flags; + + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); recvmsg = get_first_reassembly(sc); if (recvmsg) { list_del(&recvmsg->list); - spin_unlock(&sc->recv_io.reassembly.lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); put_recvmsg(sc, recvmsg); } else { - spin_unlock(&sc->recv_io.reassembly.lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); } } while (recvmsg); sc->recv_io.reassembly.data_length = 0; @@ -768,6 +775,7 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, if (sc->recv_io.reassembly.data_length >= size) { int queue_length; int queue_removed = 0; + unsigned long flags; /* * Need to make sure reassembly_data_length is read before @@ -823,9 +831,9 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, if (queue_length) { list_del(&recvmsg->list); } else { - 
spin_lock_irq(&sc->recv_io.reassembly.lock); + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); list_del(&recvmsg->list); - spin_unlock_irq(&sc->recv_io.reassembly.lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); } queue_removed++; put_recvmsg(sc, recvmsg); @@ -838,10 +846,10 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, data_read += to_copy; } - spin_lock_irq(&sc->recv_io.reassembly.lock); + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.data_length -= data_read; sc->recv_io.reassembly.queue_length -= queue_removed; - spin_unlock_irq(&sc->recv_io.reassembly.lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.first_entry_offset = offset; ksmbd_debug(RDMA, @@ -2107,6 +2115,7 @@ static int smb_direct_prepare(struct ksmbd_transport *t) struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_recv_io *recvmsg; struct smbdirect_negotiate_req *req; + unsigned long flags; int ret; /* @@ -2153,10 +2162,10 @@ static int smb_direct_prepare(struct ksmbd_transport *t) ret = smb_direct_send_negotiate_response(sc, ret); out: - spin_lock_irq(&sc->recv_io.reassembly.lock); + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.queue_length--; list_del(&recvmsg->list); - spin_unlock_irq(&sc->recv_io.reassembly.lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); put_recvmsg(sc, recvmsg); return ret; From 942ce74ab9a3f1e18fe00bffa80ecf6294e9e6b5 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 15 Sep 2025 08:05:31 +0200 Subject: [PATCH 3125/3206] smb: server: make use of ib_alloc_cq_any() instead of ib_alloc_cq() commit 20cf4e026730 ("rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors") happened before ksmbd was upstreamed, but after the out of tree ksmbd (a.k.a. cifsd) was developed. So we still used ib_alloc_cq(). 
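The call-site difference is just the dropped comp_vector argument (signatures as used in the hunks below; nr_cqe stands in for the computed queue depth):

    /* before: every CQ pinned to comp_vector 0 */
    cq = ib_alloc_cq(sc->ib.dev, sc, nr_cqe, 0, IB_POLL_WORKQUEUE);

    /* after: the core spreads CQs over the device's comp_vectors */
    cq = ib_alloc_cq_any(sc->ib.dev, sc, nr_cqe, IB_POLL_WORKQUEUE);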
Acked-by: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index f9734d7025b43d..e78347831d2ffe 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2037,9 +2037,10 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc, return ret; } - sc->ib.send_cq = ib_alloc_cq(sc->ib.dev, sc, - sp->send_credit_target + cap->max_rdma_ctxs, - 0, IB_POLL_WORKQUEUE); + sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, + sp->send_credit_target + + cap->max_rdma_ctxs, + IB_POLL_WORKQUEUE); if (IS_ERR(sc->ib.send_cq)) { pr_err("Can't create RDMA send CQ\n"); ret = PTR_ERR(sc->ib.send_cq); @@ -2047,8 +2048,9 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc, goto err; } - sc->ib.recv_cq = ib_alloc_cq(sc->ib.dev, sc, - sp->recv_credit_max, 0, IB_POLL_WORKQUEUE); + sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc, + sp->recv_credit_max, + IB_POLL_WORKQUEUE); if (IS_ERR(sc->ib.recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); ret = PTR_ERR(sc->ib.recv_cq); From 1b53426334c3c942db47e0959a2527a4f815af50 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 8 Sep 2025 22:22:35 +0200 Subject: [PATCH 3126/3206] smb: server: let smb_direct_flush_send_list() invalidate a remote key first If we want to invalidate a remote key we should do that as soon as possible, so do it in the first send work request. Acked-by: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index e78347831d2ffe..27e3fc5139cc71 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1017,12 +1017,15 @@ static int smb_direct_flush_send_list(struct smbdirect_socket *sc, struct smbdirect_send_io, sibling_list); + if (send_ctx->need_invalidate_rkey) { + first->wr.opcode = IB_WR_SEND_WITH_INV; + first->wr.ex.invalidate_rkey = send_ctx->remote_key; + send_ctx->need_invalidate_rkey = false; + send_ctx->remote_key = 0; + } + last->wr.send_flags = IB_SEND_SIGNALED; last->wr.wr_cqe = &last->cqe; - if (is_last && send_ctx->need_invalidate_rkey) { - last->wr.opcode = IB_WR_SEND_WITH_INV; - last->wr.ex.invalidate_rkey = send_ctx->remote_key; - } ret = smb_direct_post_send(sc, &first->wr); if (!ret) { From c8a935a31bc787db52296944890f300ba9479088 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Tue, 23 Sep 2025 10:52:29 +0100 Subject: [PATCH 3127/3206] lib/string_choices: Add str_assert_deassert() helper Add str_assert_deassert() helper to return "assert" or "deassert" string literal depending on the boolean argument. Also add the inverse variant str_deassert_assert().
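A hypothetical caller, to illustrate the intended use (the device context, flag names and messages are made up; dev_dbg() is the usual logging helper):

    /* prints "assert reset" or "deassert reset" */
    dev_dbg(dev, "%s reset\n", str_assert_deassert(asserted));

    /* inverse variant, for a flag with the opposite sense */
    dev_dbg(dev, "%s reset\n", str_deassert_assert(released));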
Suggested-by: Philipp Zabel Signed-off-by: Lad Prabhakar Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250923095229.2149740-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Kees Cook --- include/linux/string_choices.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index f3ba4f52ff2601..6c4077be774229 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -17,6 +17,12 @@ #include +static inline const char *str_assert_deassert(bool v) +{ + return v ? "assert" : "deassert"; +} +#define str_deassert_assert(v) str_assert_deassert(!(v)) + static inline const char *str_enable_disable(bool v) { return v ? "enable" : "disable"; From ebc4ed14a4dbf51307102bb7ffc82ed6c16a37c2 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 9 Sep 2025 09:38:19 +0200 Subject: [PATCH 3128/3206] cpufreq: mediatek: fix device leak on probe failure Make sure to drop the reference to the cci device taken by of_find_device_by_node() on probe failure (e.g. probe deferral). Fixes: 0daa47325bae ("cpufreq: mediatek: Link CCI device to CPU") Cc: Jia-Wei Chang Cc: Rex-BC Chen Signed-off-by: Johan Hovold Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Chen-Yu Tsai Signed-off-by: Viresh Kumar --- drivers/cpufreq/mediatek-cpufreq.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c index 00de1166188ab7..5d50a231f94441 100644 --- a/drivers/cpufreq/mediatek-cpufreq.c +++ b/drivers/cpufreq/mediatek-cpufreq.c @@ -403,9 +403,11 @@ static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu) } info->cpu_clk = clk_get(cpu_dev, "cpu"); - if (IS_ERR(info->cpu_clk)) - return dev_err_probe(cpu_dev, PTR_ERR(info->cpu_clk), - "cpu%d: failed to get cpu clk\n", cpu); + if (IS_ERR(info->cpu_clk)) { + ret = PTR_ERR(info->cpu_clk); + dev_err_probe(cpu_dev, ret, "cpu%d: failed to get cpu clk\n", cpu); + goto out_put_cci_dev; + } info->inter_clk = clk_get(cpu_dev, "intermediate"); if (IS_ERR(info->inter_clk)) { @@ -551,6 +553,10 @@ static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu) out_free_mux_clock: clk_put(info->cpu_clk); +out_put_cci_dev: + if (info->soc_data->ccifreq_supported) + put_device(info->cci_dev); + return ret; } @@ -568,6 +574,8 @@ static void mtk_cpu_dvfs_info_release(struct mtk_cpu_dvfs_info *info) clk_put(info->inter_clk); dev_pm_opp_of_cpumask_remove_table(&info->cpus); dev_pm_opp_unregister_notifier(info->cpu_dev, &info->opp_nb); + if (info->soc_data->ccifreq_supported) + put_device(info->cci_dev); } static int mtk_cpufreq_init(struct cpufreq_policy *policy) From 24287f902095d845c6af9c2c369ba96877f5eb79 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 15 Sep 2025 15:59:54 +0200 Subject: [PATCH 3129/3206] rust: cpufreq: streamline find_supply_names Remove local variables from find_supply_names() and use .and_then() with the more concise kernel::kvec![] macro, instead of KVec::with_capacity() followed by .push() and Some(). No functional changes intended. 
Signed-off-by: Thorsten Blum Signed-off-by: Viresh Kumar --- drivers/cpufreq/rcpufreq_dt.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/rcpufreq_dt.rs b/drivers/cpufreq/rcpufreq_dt.rs index 7e1fbf9a091f74..224d063c7cec17 100644 --- a/drivers/cpufreq/rcpufreq_dt.rs +++ b/drivers/cpufreq/rcpufreq_dt.rs @@ -28,15 +28,11 @@ fn find_supply_name_exact(dev: &Device, name: &str) -> Option { /// Finds supply name for the CPU from DT. fn find_supply_names(dev: &Device, cpu: cpu::CpuId) -> Option> { // Try "cpu0" for older DTs, fallback to "cpu". - let name = (cpu.as_u32() == 0) + (cpu.as_u32() == 0) .then(|| find_supply_name_exact(dev, "cpu0")) .flatten() - .or_else(|| find_supply_name_exact(dev, "cpu"))?; - - let mut list = KVec::with_capacity(1, GFP_KERNEL).ok()?; - list.push(name, GFP_KERNEL).ok()?; - - Some(list) + .or_else(|| find_supply_name_exact(dev, "cpu")) + .and_then(|name| kernel::kvec![name].ok()) } /// Represents the cpufreq dt device. From 0b1bb980fd7cae126ee3d59f817068a13e321b07 Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Thu, 28 Aug 2025 21:48:12 -0500 Subject: [PATCH 3130/3206] cpufreq: tegra186: Set target frequency for all cpus in policy The original commit set all cores in a cluster to a shared policy, but did not update set_target to apply a frequency change to all cores for the policy. This caused most cores to remain stuck at their boot frequency. Fixes: be4ae8c19492 ("cpufreq: tegra186: Share policy per cluster") Signed-off-by: Aaron Kling Reviewed-by: Mikko Perttunen Signed-off-by: Viresh Kumar --- drivers/cpufreq/tegra186-cpufreq.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index 4270686fc3e3eb..a9e1f9bd2a4e9b 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -93,10 +93,14 @@ static int tegra186_cpufreq_set_target(struct cpufreq_policy *policy, { struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); struct cpufreq_frequency_table *tbl = policy->freq_table + index; - unsigned int edvd_offset = data->cpus[policy->cpu].edvd_offset; + unsigned int edvd_offset; u32 edvd_val = tbl->driver_data; + u32 cpu; - writel(edvd_val, data->regs + edvd_offset); + for_each_cpu(cpu, policy->cpus) { + edvd_offset = data->cpus[cpu].edvd_offset; + writel(edvd_val, data->regs + edvd_offset); + } return 0; } From ba6018929165fc914c665f071f8e8cdbac844a49 Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Thu, 28 Aug 2025 21:48:13 -0500 Subject: [PATCH 3131/3206] cpufreq: tegra186: Initialize all cores to max frequencies During initialization, the EDVD_COREx_VOLT_FREQ registers for some cores are still at reset values and not reflecting the actual frequency. This causes get calls to fail. Set all cores to their respective max frequency during probe to initialize the registers to working values. 
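The essence of the probe-time initialization is two lines per core (lifted from the hunk below; num_rates - 1 indexes the last real entry that init_vhint_table() now reports back to the caller):

    /* program the core's EDVD register with its cluster's top entry */
    edvd_val = cluster->table[num_rates - 1].driver_data;
    writel(edvd_val, data->regs + data->cpus[cpu].edvd_offset);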
Suggested-by: Mikko Perttunen Signed-off-by: Aaron Kling Reviewed-by: Mikko Perttunen Signed-off-by: Viresh Kumar --- drivers/cpufreq/tegra186-cpufreq.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index a9e1f9bd2a4e9b..136ab102f636aa 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -136,13 +136,14 @@ static struct cpufreq_driver tegra186_cpufreq_driver = { static struct cpufreq_frequency_table *init_vhint_table( struct platform_device *pdev, struct tegra_bpmp *bpmp, - struct tegra186_cpufreq_cluster *cluster, unsigned int cluster_id) + struct tegra186_cpufreq_cluster *cluster, unsigned int cluster_id, + int *num_rates) { struct cpufreq_frequency_table *table; struct mrq_cpu_vhint_request req; struct tegra_bpmp_message msg; struct cpu_vhint_data *data; - int err, i, j, num_rates = 0; + int err, i, j; dma_addr_t phys; void *virt; @@ -172,6 +173,7 @@ static struct cpufreq_frequency_table *init_vhint_table( goto free; } + *num_rates = 0; for (i = data->vfloor; i <= data->vceil; i++) { u16 ndiv = data->ndiv[i]; @@ -182,10 +184,10 @@ static struct cpufreq_frequency_table *init_vhint_table( if (i > 0 && ndiv == data->ndiv[i - 1]) continue; - num_rates++; + (*num_rates)++; } - table = devm_kcalloc(&pdev->dev, num_rates + 1, sizeof(*table), + table = devm_kcalloc(&pdev->dev, *num_rates + 1, sizeof(*table), GFP_KERNEL); if (!table) { table = ERR_PTR(-ENOMEM); @@ -227,7 +229,9 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) { struct tegra186_cpufreq_data *data; struct tegra_bpmp *bpmp; - unsigned int i = 0, err; + unsigned int i = 0, err, edvd_offset; + int num_rates = 0; + u32 edvd_val, cpu; data = devm_kzalloc(&pdev->dev, struct_size(data, clusters, TEGRA186_NUM_CLUSTERS), @@ -250,10 +254,21 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) for (i = 0; i < TEGRA186_NUM_CLUSTERS; i++) { struct tegra186_cpufreq_cluster *cluster = &data->clusters[i]; - cluster->table = init_vhint_table(pdev, bpmp, cluster, i); + cluster->table = init_vhint_table(pdev, bpmp, cluster, i, &num_rates); if (IS_ERR(cluster->table)) { err = PTR_ERR(cluster->table); goto put_bpmp; + } else if (!num_rates) { + err = -EINVAL; + goto put_bpmp; + } + + for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) { + if (data->cpus[cpu].bpmp_cluster_id == i) { + edvd_val = cluster->table[num_rates - 1].driver_data; + edvd_offset = data->cpus[cpu].edvd_offset; + writel(edvd_val, data->regs + edvd_offset); + } } } From 9a0abc39450a3123fd52533a662fbd37e0d1508c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 17:47:14 +0200 Subject: [PATCH 3132/3206] PM: runtime: Add auto-cleanup macros for "resume and get" operations It is generally useful to be able to automatically drop a device's runtime PM usage counter incremented by runtime PM operations that resume a device and bump up its usage counter [1]. To that end, add guard definition macros allowing pm_runtime_put() and pm_runtime_put_autosuspend() to be used for the auto-cleanup in those cases. Simply put, a piece of code like below: pm_runtime_get_sync(dev); ..... pm_runtime_put(dev); return 0; can be transformed with guard() like: guard(pm_runtime_active)(dev); ..... return 0; (see the pm_runtime_put() call is gone). 
However, it is better to do proper error handling in the majority of cases, so doing something like this instead of the above is recommended: ACQUIRE(pm_runtime_active_try, pm)(dev); if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) return -ENXIO; ..... return 0; In all of the cases in which runtime PM is known to be enabled for the given device or the device can be regarded as operational (and so it can be accessed) with runtime PM disabled, a piece of code like: ret = pm_runtime_resume_and_get(dev); if (ret < 0) return ret; ..... pm_runtime_put(dev); return 0; can be changed as follows: ACQUIRE(pm_runtime_active_try, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_try, &pm); if (ret < 0) return ret; ..... return 0; (again, see the pm_runtime_put() call is gone). Still, if the device cannot be accessed unless runtime PM has been enabled for it, the pm_runtime_active_try_enabled guard variant needs to be used, that is (in the context of the example above): ACQUIRE(pm_runtime_active_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; When the original code calls pm_runtime_put_autosuspend(), use one of the "auto" guard variants, pm_runtime_active_auto/_try/_enabled, so for example, a piece of code like: ret = pm_runtime_resume_and_get(dev); if (ret < 0) return ret; ..... pm_runtime_put_autosuspend(dev); return 0; will become: ACQUIRE(pm_runtime_active_auto_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_auto_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; Note that the cases in which the return value of pm_runtime_get_sync() is checked can also be handled with the help of the new guard macros. For example, a piece of code like: ret = pm_runtime_get_sync(dev); if (ret < 0) { pm_runtime_put(dev); return ret; } ..... pm_runtime_put(dev); return 0; can be rewritten as: ACQUIRE(pm_runtime_active_auto_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_auto_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; or pm_runtime_get_active_try can be used if transparent handling of disabled runtime PM is desirable. Link: https://lore.kernel.org/linux-pm/878qimv24u.wl-tiwai@suse.de/ [1] Link: https://lore.kernel.org/linux-pm/20250926150613.000073a4@huawei.com/ Signed-off-by: Rafael J. Wysocki Acked-by: Dan Williams Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/2238241.irdbgypaU6@rafael.j.wysocki [ rjw: Fixed leftovers from the previous version in the changelog ] Reviewed-by: Jonathan Cameron Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/runtime.c | 2 ++ include/linux/pm_runtime.h | 44 ++++++++++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index faa68bf9ef3d06..0fc825066ede26 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -799,6 +799,8 @@ static int rpm_resume(struct device *dev, int rpmflags) if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; + else if (rpmflags & RPM_TRANSPARENT) + goto out; else retval = -EACCES; } diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d1ff76e0e2d077..e5426bdd0c9fc0 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -21,6 +21,7 @@ #define RPM_GET_PUT 0x04 /* Increment/decrement the usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ +#define RPM_TRANSPARENT 0x10 /* Succeed if runtime PM is disabled */ /* * Use this for defining a set of PM operations to be used in all situations @@ -512,6 +513,19 @@ static inline int pm_runtime_get_sync(struct device *dev) return __pm_runtime_resume(dev, RPM_GET_PUT); } +static inline int pm_runtime_get_active(struct device *dev, int rpmflags) +{ + int ret; + + ret = __pm_runtime_resume(dev, RPM_GET_PUT | rpmflags); + if (ret < 0) { + pm_runtime_put_noidle(dev); + return ret; + } + + return 0; +} + /** * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. * @dev: Target device. @@ -522,15 +536,7 @@ static inline int pm_runtime_get_sync(struct device *dev) */ static inline int pm_runtime_resume_and_get(struct device *dev) { - int ret; - - ret = __pm_runtime_resume(dev, RPM_GET_PUT); - if (ret < 0) { - pm_runtime_put_noidle(dev); - return ret; - } - - return 0; + return pm_runtime_get_active(dev, 0); } /** @@ -610,6 +616,26 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) return __pm_runtime_put_autosuspend(dev); } +DEFINE_GUARD(pm_runtime_active, struct device *, + pm_runtime_get_sync(_T), pm_runtime_put(_T)); +DEFINE_GUARD(pm_runtime_active_auto, struct device *, + pm_runtime_get_sync(_T), pm_runtime_put_autosuspend(_T)); +/* + * Use the following guards with ACQUIRE()/ACQUIRE_ERR(). + * + * The difference between the "_try" and "_try_enabled" variants is that the + * former do not produce an error when runtime PM is disabled for the given + * device. + */ +DEFINE_GUARD_COND(pm_runtime_active, _try, + pm_runtime_get_active(_T, RPM_TRANSPARENT)) +DEFINE_GUARD_COND(pm_runtime_active, _try_enabled, + pm_runtime_resume_and_get(_T)) +DEFINE_GUARD_COND(pm_runtime_active_auto, _try, + pm_runtime_get_active(_T, RPM_TRANSPARENT)) +DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, + pm_runtime_resume_and_get(_T)) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. From 8ff5aaa7b8c9fb3e66df6f440ee109d00f8d7a1b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 18:24:05 +0200 Subject: [PATCH 3133/3206] PCI/sysfs: Use runtime PM guard macro for auto-cleanup Use the newly introduced pm_runtime_active_try guard to simplify the code and add the proper error handling for PM runtime resume errors. Based on an earlier patch from Takashi Iwai [1]. Link: https://patch.msgid.link/20250919163147.4743-3-tiwai@suse.de [1] Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Takashi Iwai Reviewed-by: Jonathan Cameron --- drivers/pci/pci-sysfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 5eea14c1f7f5f7..c96301026f014e 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1475,8 +1475,9 @@ static ssize_t reset_method_store(struct device *dev, return count; } - pm_runtime_get_sync(dev); - struct device *pmdev __free(pm_runtime_put) = dev; + ACQUIRE(pm_runtime_active_try, pm)(dev); + if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) + return -ENXIO; if (sysfs_streq(buf, "default")) { pci_init_reset_methods(pdev); From d5e58ce1fb0f13a9a0845851f267ede3551cd9fe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 18:26:40 +0200 Subject: [PATCH 3134/3206] PM: runtime: Drop DEFINE_FREE() for pm_runtime_put() The DEFINE_FREE() for pm_runtime_put has been superseded by recently introduced runtime PM auto-cleanup macros and its only user has been converted to using one of the new macros, so drop it. Signed-off-by: Rafael J. Wysocki Reviewed-by: Dhruva Gole Reviewed-by: Takashi Iwai Reviewed-by: Jonathan Cameron --- include/linux/pm_runtime.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index e5426bdd0c9fc0..edb8aed5ef62eb 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -563,8 +563,6 @@ static inline int pm_runtime_put(struct device *dev) return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } -DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T)) - /** * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. From fde0ab43b9a30d08817adc5402b69fec83a61cb8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Sep 2025 08:54:12 -0700 Subject: [PATCH 3135/3206] Fix CC_HAS_ASM_GOTO_OUTPUT on non-x86 architectures There's a silly problem with the CC_HAS_ASM_GOTO_OUTPUT test: even with a working compiler it will fail on some architectures simply because it uses the mnemonic "jmp" for testing the inline asm. And as reported by Geert, not all architectures use that mnemonic, so the test fails spuriously on such platforms (including arm and riscv, but also several other architectures). This issue avoided any obvious test failures because the build still works thanks to falling back on the old non-asm-goto code, which just generates worse code. Just use an empty asm statement instead. 
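For reference, a condensed form of what the fixed probe compiles (an empty asm goto template assembles on every architecture, unlike the x86-only "jmp" mnemonic; the cleanup-attribute scaffolding of the real test is elided here):

    int f(void)
    {
            asm goto("" : : : : l0);
            return 0;
    l0:
            return 1;
    }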
Reported-and-tested-by: Geert Uytterhoeven Suggested-by: Peter Zijlstra Fixes: e2ffa15b9baa ("kbuild: Disable CC_HAS_ASM_GOTO_OUTPUT on clang < 17") Cc: Nathan Chancellor Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index ecddb94db8dc01..6a91ec74218695 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -102,7 +102,7 @@ config CC_HAS_ASM_GOTO_OUTPUT # Detect basic support depends on $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) # Detect clang (< v17) scoped label issues - depends on $(success,echo 'void b(void **);void* c(void);int f(void){{asm goto("jmp %l0"::::l0);return 0;l0:return 1;}void *x __attribute__((cleanup(b)))=c();{asm goto("jmp %l0"::::l1);return 2;l1:return 3;}}' | $(CC) -x c - -c -o /dev/null) + depends on $(success,echo 'void b(void **);void* c(void);int f(void){{asm goto(""::::l0);return 0;l0:return 1;}void *x __attribute__((cleanup(b)))=c();{asm goto(""::::l1);return 2;l1:return 3;}}' | $(CC) -x c - -c -o /dev/null) config CC_HAS_ASM_GOTO_TIED_OUTPUT depends on CC_HAS_ASM_GOTO_OUTPUT From 6c7ca6a02f8f9549a438a08a23c6327580ecf3d6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Sep 2025 11:41:16 +0200 Subject: [PATCH 3136/3206] mount: handle NULL values in mnt_ns_release() When calling in listmount() mnt_ns_release() may be passed a NULL pointer. Handle that case gracefully. Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 38609066cf3301..1ee17a00c311b6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -180,7 +180,7 @@ static void mnt_ns_tree_add(struct mnt_namespace *ns) static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ - if (refcount_dec_and_test(&ns->passive)) { + if (ns && refcount_dec_and_test(&ns->passive)) { fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); From ee916dccd4df6e2fd19c3606c4735282b72f1473 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Sep 2025 12:24:20 -0700 Subject: [PATCH 3137/3206] Unbreak 'make tools/*' for user-space targets This pattern isn't very documented, and apparently not used much outside of 'make tools/help', but it has existed for over a decade (since commit ea01fa9f63ae: "tools: Connect to the kernel build system"). However, it doesn't work very well for most cases, particularly the useful "tools/all" target, because it overrides the LDFLAGS value with an empty one. And once overridden, 'make' will then not honor the tooling makefiles trying to change it - which then makes any LDFLAGS use in the tooling directory break, typically causing odd link errors. Remove that LDFLAGS override, since it seems to be entirely historical. The core kernel makefiles no longer modify LDFLAGS as part of the build, and use kernel-specific link flags instead (eg 'KBUILD_LDFLAGS' and friends). This allows more of the 'make tools/*' cases to work. I say 'more', because some of the tooling build rules make various other assumptions or have other issues, so it's still a bit hit-or-miss. But those issues tend to show up with the 'make -C tools xyz' pattern too, so now it's no longer an issue of this particular 'tools/*' build rule being special. 
Acked-by: Nathan Chancellor Cc: Nicolas Schier Cc: Borislav Petkov Signed-off-by: Linus Torvalds --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 82bb9cdf73a32b..76dddefde05404 100644 --- a/Makefile +++ b/Makefile @@ -1444,11 +1444,11 @@ endif tools/: FORCE $(Q)mkdir -p $(objtree)/tools - $(Q)$(MAKE) LDFLAGS= O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ + $(Q)$(MAKE) O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ tools/%: FORCE $(Q)mkdir -p $(objtree)/tools - $(Q)$(MAKE) LDFLAGS= O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $* + $(Q)$(MAKE) O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $* # --------------------------------------------------------------------------- # Kernel selftest From f2c61db29f277b9c80de92102fc532cc247495cd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Sep 2025 13:43:52 -0700 Subject: [PATCH 3138/3206] Remove bcachefs core code bcachefs was marked 'externally maintained' in 6.17 but the code remained to make the transition smoother. It's now a DKMS module, making the in-kernel code stale, so remove it to avoid any version confusion. Link: https://lore.kernel.org/linux-bcachefs/yokpt2d2g2lluyomtqrdvmkl3amv3kgnipmenobkpgx537kay7@xgcgjviv3n7x/T/ Signed-off-by: Linus Torvalds --- .../filesystems/bcachefs/CodingStyle.rst | 186 - .../bcachefs/SubmittingPatches.rst | 105 - .../filesystems/bcachefs/casefolding.rst | 108 - .../filesystems/bcachefs/errorcodes.rst | 30 - .../filesystems/bcachefs/future/idle_work.rst | 78 - Documentation/filesystems/bcachefs/index.rst | 38 - Documentation/filesystems/index.rst | 1 - MAINTAINERS | 3 - arch/m68k/configs/amiga_defconfig | 1 - arch/m68k/configs/apollo_defconfig | 1 - arch/m68k/configs/atari_defconfig | 1 - arch/m68k/configs/bvme6000_defconfig | 1 - arch/m68k/configs/hp300_defconfig | 1 - arch/m68k/configs/mac_defconfig | 1 - arch/m68k/configs/multi_defconfig | 1 - arch/m68k/configs/mvme147_defconfig | 1 - arch/m68k/configs/mvme16x_defconfig | 1 - arch/m68k/configs/q40_defconfig | 1 - arch/m68k/configs/sun3_defconfig | 1 - arch/m68k/configs/sun3x_defconfig | 1 - arch/s390/configs/debug_defconfig | 3 - arch/s390/configs/defconfig | 3 - fs/Kconfig | 1 - fs/Makefile | 1 - fs/bcachefs/Kconfig | 121 - fs/bcachefs/Makefile | 107 - fs/bcachefs/acl.c | 445 -- fs/bcachefs/acl.h | 60 - fs/bcachefs/alloc_background.c | 2680 ------------ fs/bcachefs/alloc_background.h | 361 -- fs/bcachefs/alloc_background_format.h | 95 - fs/bcachefs/alloc_foreground.c | 1683 -------- fs/bcachefs/alloc_foreground.h | 318 -- fs/bcachefs/alloc_types.h | 121 - fs/bcachefs/async_objs.c | 132 - fs/bcachefs/async_objs.h | 44 - fs/bcachefs/async_objs_types.h | 25 - fs/bcachefs/backpointers.c | 1391 ------ fs/bcachefs/backpointers.h | 200 - fs/bcachefs/bbpos.h | 37 - fs/bcachefs/bbpos_types.h | 18 - fs/bcachefs/bcachefs.h | 1295 ------ fs/bcachefs/bcachefs_format.h | 1545 ------- fs/bcachefs/bcachefs_ioctl.h | 473 -- fs/bcachefs/bkey.c | 1112 ----- fs/bcachefs/bkey.h | 605 --- fs/bcachefs/bkey_buf.h | 61 - fs/bcachefs/bkey_cmp.h | 129 - fs/bcachefs/bkey_methods.c | 497 --- fs/bcachefs/bkey_methods.h | 139 - fs/bcachefs/bkey_sort.c | 214 - fs/bcachefs/bkey_sort.h | 54 - fs/bcachefs/bkey_types.h | 241 -- fs/bcachefs/bset.c | 1576 ------- fs/bcachefs/bset.h | 536 --- fs/bcachefs/btree_cache.c | 1516 ------- fs/bcachefs/btree_cache.h | 157 - fs/bcachefs/btree_gc.c | 1308 ------ fs/bcachefs/btree_gc.h | 88 - fs/bcachefs/btree_gc_types.h | 34 - 
fs/bcachefs/btree_io.c | 2742 ------------ fs/bcachefs/btree_io.h | 239 -- fs/bcachefs/btree_iter.c | 3804 ----------------- fs/bcachefs/btree_iter.h | 1010 ----- fs/bcachefs/btree_journal_iter.c | 830 ---- fs/bcachefs/btree_journal_iter.h | 102 - fs/bcachefs/btree_journal_iter_types.h | 37 - fs/bcachefs/btree_key_cache.c | 880 ---- fs/bcachefs/btree_key_cache.h | 59 - fs/bcachefs/btree_key_cache_types.h | 34 - fs/bcachefs/btree_locking.c | 936 ---- fs/bcachefs/btree_locking.h | 466 -- fs/bcachefs/btree_node_scan.c | 611 --- fs/bcachefs/btree_node_scan.h | 11 - fs/bcachefs/btree_node_scan_types.h | 31 - fs/bcachefs/btree_trans_commit.c | 1121 ----- fs/bcachefs/btree_types.h | 937 ---- fs/bcachefs/btree_update.c | 916 ---- fs/bcachefs/btree_update.h | 429 -- fs/bcachefs/btree_update_interior.c | 2854 ------------- fs/bcachefs/btree_update_interior.h | 364 -- fs/bcachefs/btree_write_buffer.c | 893 ---- fs/bcachefs/btree_write_buffer.h | 113 - fs/bcachefs/btree_write_buffer_types.h | 59 - fs/bcachefs/buckets.c | 1395 ------ fs/bcachefs/buckets.h | 369 -- fs/bcachefs/buckets_types.h | 100 - fs/bcachefs/buckets_waiting_for_journal.c | 174 - fs/bcachefs/buckets_waiting_for_journal.h | 15 - .../buckets_waiting_for_journal_types.h | 23 - fs/bcachefs/chardev.c | 843 ---- fs/bcachefs/chardev.h | 31 - fs/bcachefs/checksum.c | 698 --- fs/bcachefs/checksum.h | 240 -- fs/bcachefs/clock.c | 181 - fs/bcachefs/clock.h | 29 - fs/bcachefs/clock_types.h | 38 - fs/bcachefs/compress.c | 773 ---- fs/bcachefs/compress.h | 73 - fs/bcachefs/darray.c | 38 - fs/bcachefs/darray.h | 158 - fs/bcachefs/data_update.c | 1021 ----- fs/bcachefs/data_update.h | 93 - fs/bcachefs/debug.c | 996 ----- fs/bcachefs/debug.h | 50 - fs/bcachefs/dirent.c | 766 ---- fs/bcachefs/dirent.h | 119 - fs/bcachefs/dirent_format.h | 58 - fs/bcachefs/disk_accounting.c | 1074 ----- fs/bcachefs/disk_accounting.h | 301 -- fs/bcachefs/disk_accounting_format.h | 225 - fs/bcachefs/disk_accounting_types.h | 19 - fs/bcachefs/disk_groups.c | 591 --- fs/bcachefs/disk_groups.h | 111 - fs/bcachefs/disk_groups_format.h | 21 - fs/bcachefs/disk_groups_types.h | 18 - fs/bcachefs/ec.c | 2405 ----------- fs/bcachefs/ec.h | 309 -- fs/bcachefs/ec_format.h | 43 - fs/bcachefs/ec_types.h | 35 - fs/bcachefs/enumerated_ref.c | 144 - fs/bcachefs/enumerated_ref.h | 66 - fs/bcachefs/enumerated_ref_types.h | 19 - fs/bcachefs/errcode.c | 73 - fs/bcachefs/errcode.h | 387 -- fs/bcachefs/error.c | 771 ---- fs/bcachefs/error.h | 258 -- fs/bcachefs/extent_update.c | 155 - fs/bcachefs/extent_update.h | 12 - fs/bcachefs/extents.c | 1735 -------- fs/bcachefs/extents.h | 768 ---- fs/bcachefs/extents_format.h | 304 -- fs/bcachefs/extents_types.h | 42 - fs/bcachefs/eytzinger.c | 315 -- fs/bcachefs/eytzinger.h | 300 -- fs/bcachefs/fast_list.c | 156 - fs/bcachefs/fast_list.h | 41 - fs/bcachefs/fifo.h | 127 - fs/bcachefs/fs-io-buffered.c | 1109 ----- fs/bcachefs/fs-io-buffered.h | 27 - fs/bcachefs/fs-io-direct.c | 704 --- fs/bcachefs/fs-io-direct.h | 16 - fs/bcachefs/fs-io-pagecache.c | 827 ---- fs/bcachefs/fs-io-pagecache.h | 176 - fs/bcachefs/fs-io.c | 1102 ----- fs/bcachefs/fs-io.h | 184 - fs/bcachefs/fs-ioctl.c | 440 -- fs/bcachefs/fs-ioctl.h | 8 - fs/bcachefs/fs.c | 2768 ------------ fs/bcachefs/fs.h | 215 - fs/bcachefs/fsck.c | 3363 --------------- fs/bcachefs/fsck.h | 34 - fs/bcachefs/inode.c | 1566 ------- fs/bcachefs/inode.h | 319 -- fs/bcachefs/inode_format.h | 185 - fs/bcachefs/io_misc.c | 570 --- fs/bcachefs/io_misc.h | 36 - fs/bcachefs/io_read.c | 1543 ------- 
fs/bcachefs/io_read.h | 216 - fs/bcachefs/io_write.c | 1780 -------- fs/bcachefs/io_write.h | 77 - fs/bcachefs/io_write_types.h | 129 - fs/bcachefs/journal.c | 1832 -------- fs/bcachefs/journal.h | 465 -- fs/bcachefs/journal_io.c | 2242 ---------- fs/bcachefs/journal_io.h | 94 - fs/bcachefs/journal_reclaim.c | 1037 ----- fs/bcachefs/journal_reclaim.h | 84 - fs/bcachefs/journal_sb.c | 232 - fs/bcachefs/journal_sb.h | 24 - fs/bcachefs/journal_seq_blacklist.c | 264 -- fs/bcachefs/journal_seq_blacklist.h | 23 - fs/bcachefs/journal_seq_blacklist_format.h | 15 - fs/bcachefs/journal_types.h | 342 -- fs/bcachefs/keylist.c | 50 - fs/bcachefs/keylist.h | 72 - fs/bcachefs/keylist_types.h | 16 - fs/bcachefs/logged_ops.c | 119 - fs/bcachefs/logged_ops.h | 20 - fs/bcachefs/logged_ops_format.h | 35 - fs/bcachefs/lru.c | 223 - fs/bcachefs/lru.h | 70 - fs/bcachefs/lru_format.h | 27 - fs/bcachefs/mean_and_variance.c | 173 - fs/bcachefs/mean_and_variance.h | 203 - fs/bcachefs/mean_and_variance_test.c | 221 - fs/bcachefs/migrate.c | 277 -- fs/bcachefs/migrate.h | 8 - fs/bcachefs/move.c | 1494 ------- fs/bcachefs/move.h | 165 - fs/bcachefs/move_types.h | 46 - fs/bcachefs/movinggc.c | 476 --- fs/bcachefs/movinggc.h | 20 - fs/bcachefs/namei.c | 1034 ----- fs/bcachefs/namei.h | 79 - fs/bcachefs/nocow_locking.c | 142 - fs/bcachefs/nocow_locking.h | 50 - fs/bcachefs/nocow_locking_types.h | 20 - fs/bcachefs/opts.c | 844 ---- fs/bcachefs/opts.h | 693 --- fs/bcachefs/printbuf.c | 528 --- fs/bcachefs/printbuf.h | 298 -- fs/bcachefs/progress.c | 61 - fs/bcachefs/progress.h | 29 - fs/bcachefs/quota.c | 892 ---- fs/bcachefs/quota.h | 73 - fs/bcachefs/quota_format.h | 47 - fs/bcachefs/quota_types.h | 43 - fs/bcachefs/rcu_pending.c | 666 --- fs/bcachefs/rcu_pending.h | 27 - fs/bcachefs/rebalance.c | 889 ---- fs/bcachefs/rebalance.h | 59 - fs/bcachefs/rebalance_format.h | 53 - fs/bcachefs/rebalance_types.h | 41 - fs/bcachefs/recovery.c | 1306 ------ fs/bcachefs/recovery.h | 13 - fs/bcachefs/recovery_passes.c | 646 --- fs/bcachefs/recovery_passes.h | 48 - fs/bcachefs/recovery_passes_format.h | 106 - fs/bcachefs/recovery_passes_types.h | 27 - fs/bcachefs/reflink.c | 865 ---- fs/bcachefs/reflink.h | 87 - fs/bcachefs/reflink_format.h | 38 - fs/bcachefs/replicas.c | 918 ---- fs/bcachefs/replicas.h | 83 - fs/bcachefs/replicas_format.h | 36 - fs/bcachefs/replicas_types.h | 11 - fs/bcachefs/sb-clean.c | 340 -- fs/bcachefs/sb-clean.h | 16 - fs/bcachefs/sb-counters.c | 147 - fs/bcachefs/sb-counters.h | 20 - fs/bcachefs/sb-counters_format.h | 117 - fs/bcachefs/sb-downgrade.c | 457 -- fs/bcachefs/sb-downgrade.h | 12 - fs/bcachefs/sb-downgrade_format.h | 17 - fs/bcachefs/sb-errors.c | 198 - fs/bcachefs/sb-errors.h | 22 - fs/bcachefs/sb-errors_format.h | 353 -- fs/bcachefs/sb-errors_types.h | 15 - fs/bcachefs/sb-members.c | 606 --- fs/bcachefs/sb-members.h | 377 -- fs/bcachefs/sb-members_format.h | 128 - fs/bcachefs/sb-members_types.h | 22 - fs/bcachefs/seqmutex.h | 45 - fs/bcachefs/siphash.c | 173 - fs/bcachefs/siphash.h | 87 - fs/bcachefs/six.c | 878 ---- fs/bcachefs/six.h | 388 -- fs/bcachefs/snapshot.c | 2043 --------- fs/bcachefs/snapshot.h | 275 -- fs/bcachefs/snapshot_format.h | 36 - fs/bcachefs/snapshot_types.h | 57 - fs/bcachefs/str_hash.c | 400 -- fs/bcachefs/str_hash.h | 431 -- fs/bcachefs/subvolume.c | 752 ---- fs/bcachefs/subvolume.h | 88 - fs/bcachefs/subvolume_format.h | 35 - fs/bcachefs/subvolume_types.h | 11 - fs/bcachefs/super-io.c | 1562 ------- fs/bcachefs/super-io.h | 119 - fs/bcachefs/super.c | 2547 ----------- 
fs/bcachefs/super.h | 55 - fs/bcachefs/super_types.h | 35 - fs/bcachefs/sysfs.c | 914 ---- fs/bcachefs/sysfs.h | 49 - fs/bcachefs/tests.c | 891 ---- fs/bcachefs/tests.h | 15 - fs/bcachefs/thread_with_file.c | 494 --- fs/bcachefs/thread_with_file.h | 81 - fs/bcachefs/thread_with_file_types.h | 20 - fs/bcachefs/time_stats.c | 191 - fs/bcachefs/time_stats.h | 161 - fs/bcachefs/trace.c | 18 - fs/bcachefs/trace.h | 1883 -------- fs/bcachefs/two_state_shared_lock.c | 8 - fs/bcachefs/two_state_shared_lock.h | 58 - fs/bcachefs/util.c | 1047 ----- fs/bcachefs/util.h | 782 ---- fs/bcachefs/varint.c | 130 - fs/bcachefs/varint.h | 11 - fs/bcachefs/vstructs.h | 63 - fs/bcachefs/xattr.c | 642 --- fs/bcachefs/xattr.h | 50 - fs/bcachefs/xattr_format.h | 25 - 284 files changed, 117483 deletions(-) delete mode 100644 Documentation/filesystems/bcachefs/CodingStyle.rst delete mode 100644 Documentation/filesystems/bcachefs/SubmittingPatches.rst delete mode 100644 Documentation/filesystems/bcachefs/casefolding.rst delete mode 100644 Documentation/filesystems/bcachefs/errorcodes.rst delete mode 100644 Documentation/filesystems/bcachefs/future/idle_work.rst delete mode 100644 Documentation/filesystems/bcachefs/index.rst delete mode 100644 fs/bcachefs/Kconfig delete mode 100644 fs/bcachefs/Makefile delete mode 100644 fs/bcachefs/acl.c delete mode 100644 fs/bcachefs/acl.h delete mode 100644 fs/bcachefs/alloc_background.c delete mode 100644 fs/bcachefs/alloc_background.h delete mode 100644 fs/bcachefs/alloc_background_format.h delete mode 100644 fs/bcachefs/alloc_foreground.c delete mode 100644 fs/bcachefs/alloc_foreground.h delete mode 100644 fs/bcachefs/alloc_types.h delete mode 100644 fs/bcachefs/async_objs.c delete mode 100644 fs/bcachefs/async_objs.h delete mode 100644 fs/bcachefs/async_objs_types.h delete mode 100644 fs/bcachefs/backpointers.c delete mode 100644 fs/bcachefs/backpointers.h delete mode 100644 fs/bcachefs/bbpos.h delete mode 100644 fs/bcachefs/bbpos_types.h delete mode 100644 fs/bcachefs/bcachefs.h delete mode 100644 fs/bcachefs/bcachefs_format.h delete mode 100644 fs/bcachefs/bcachefs_ioctl.h delete mode 100644 fs/bcachefs/bkey.c delete mode 100644 fs/bcachefs/bkey.h delete mode 100644 fs/bcachefs/bkey_buf.h delete mode 100644 fs/bcachefs/bkey_cmp.h delete mode 100644 fs/bcachefs/bkey_methods.c delete mode 100644 fs/bcachefs/bkey_methods.h delete mode 100644 fs/bcachefs/bkey_sort.c delete mode 100644 fs/bcachefs/bkey_sort.h delete mode 100644 fs/bcachefs/bkey_types.h delete mode 100644 fs/bcachefs/bset.c delete mode 100644 fs/bcachefs/bset.h delete mode 100644 fs/bcachefs/btree_cache.c delete mode 100644 fs/bcachefs/btree_cache.h delete mode 100644 fs/bcachefs/btree_gc.c delete mode 100644 fs/bcachefs/btree_gc.h delete mode 100644 fs/bcachefs/btree_gc_types.h delete mode 100644 fs/bcachefs/btree_io.c delete mode 100644 fs/bcachefs/btree_io.h delete mode 100644 fs/bcachefs/btree_iter.c delete mode 100644 fs/bcachefs/btree_iter.h delete mode 100644 fs/bcachefs/btree_journal_iter.c delete mode 100644 fs/bcachefs/btree_journal_iter.h delete mode 100644 fs/bcachefs/btree_journal_iter_types.h delete mode 100644 fs/bcachefs/btree_key_cache.c delete mode 100644 fs/bcachefs/btree_key_cache.h delete mode 100644 fs/bcachefs/btree_key_cache_types.h delete mode 100644 fs/bcachefs/btree_locking.c delete mode 100644 fs/bcachefs/btree_locking.h delete mode 100644 fs/bcachefs/btree_node_scan.c delete mode 100644 fs/bcachefs/btree_node_scan.h delete mode 100644 fs/bcachefs/btree_node_scan_types.h delete mode 
100644 fs/bcachefs/btree_trans_commit.c delete mode 100644 fs/bcachefs/btree_types.h delete mode 100644 fs/bcachefs/btree_update.c delete mode 100644 fs/bcachefs/btree_update.h delete mode 100644 fs/bcachefs/btree_update_interior.c delete mode 100644 fs/bcachefs/btree_update_interior.h delete mode 100644 fs/bcachefs/btree_write_buffer.c delete mode 100644 fs/bcachefs/btree_write_buffer.h delete mode 100644 fs/bcachefs/btree_write_buffer_types.h delete mode 100644 fs/bcachefs/buckets.c delete mode 100644 fs/bcachefs/buckets.h delete mode 100644 fs/bcachefs/buckets_types.h delete mode 100644 fs/bcachefs/buckets_waiting_for_journal.c delete mode 100644 fs/bcachefs/buckets_waiting_for_journal.h delete mode 100644 fs/bcachefs/buckets_waiting_for_journal_types.h delete mode 100644 fs/bcachefs/chardev.c delete mode 100644 fs/bcachefs/chardev.h delete mode 100644 fs/bcachefs/checksum.c delete mode 100644 fs/bcachefs/checksum.h delete mode 100644 fs/bcachefs/clock.c delete mode 100644 fs/bcachefs/clock.h delete mode 100644 fs/bcachefs/clock_types.h delete mode 100644 fs/bcachefs/compress.c delete mode 100644 fs/bcachefs/compress.h delete mode 100644 fs/bcachefs/darray.c delete mode 100644 fs/bcachefs/darray.h delete mode 100644 fs/bcachefs/data_update.c delete mode 100644 fs/bcachefs/data_update.h delete mode 100644 fs/bcachefs/debug.c delete mode 100644 fs/bcachefs/debug.h delete mode 100644 fs/bcachefs/dirent.c delete mode 100644 fs/bcachefs/dirent.h delete mode 100644 fs/bcachefs/dirent_format.h delete mode 100644 fs/bcachefs/disk_accounting.c delete mode 100644 fs/bcachefs/disk_accounting.h delete mode 100644 fs/bcachefs/disk_accounting_format.h delete mode 100644 fs/bcachefs/disk_accounting_types.h delete mode 100644 fs/bcachefs/disk_groups.c delete mode 100644 fs/bcachefs/disk_groups.h delete mode 100644 fs/bcachefs/disk_groups_format.h delete mode 100644 fs/bcachefs/disk_groups_types.h delete mode 100644 fs/bcachefs/ec.c delete mode 100644 fs/bcachefs/ec.h delete mode 100644 fs/bcachefs/ec_format.h delete mode 100644 fs/bcachefs/ec_types.h delete mode 100644 fs/bcachefs/enumerated_ref.c delete mode 100644 fs/bcachefs/enumerated_ref.h delete mode 100644 fs/bcachefs/enumerated_ref_types.h delete mode 100644 fs/bcachefs/errcode.c delete mode 100644 fs/bcachefs/errcode.h delete mode 100644 fs/bcachefs/error.c delete mode 100644 fs/bcachefs/error.h delete mode 100644 fs/bcachefs/extent_update.c delete mode 100644 fs/bcachefs/extent_update.h delete mode 100644 fs/bcachefs/extents.c delete mode 100644 fs/bcachefs/extents.h delete mode 100644 fs/bcachefs/extents_format.h delete mode 100644 fs/bcachefs/extents_types.h delete mode 100644 fs/bcachefs/eytzinger.c delete mode 100644 fs/bcachefs/eytzinger.h delete mode 100644 fs/bcachefs/fast_list.c delete mode 100644 fs/bcachefs/fast_list.h delete mode 100644 fs/bcachefs/fifo.h delete mode 100644 fs/bcachefs/fs-io-buffered.c delete mode 100644 fs/bcachefs/fs-io-buffered.h delete mode 100644 fs/bcachefs/fs-io-direct.c delete mode 100644 fs/bcachefs/fs-io-direct.h delete mode 100644 fs/bcachefs/fs-io-pagecache.c delete mode 100644 fs/bcachefs/fs-io-pagecache.h delete mode 100644 fs/bcachefs/fs-io.c delete mode 100644 fs/bcachefs/fs-io.h delete mode 100644 fs/bcachefs/fs-ioctl.c delete mode 100644 fs/bcachefs/fs-ioctl.h delete mode 100644 fs/bcachefs/fs.c delete mode 100644 fs/bcachefs/fs.h delete mode 100644 fs/bcachefs/fsck.c delete mode 100644 fs/bcachefs/fsck.h delete mode 100644 fs/bcachefs/inode.c delete mode 100644 fs/bcachefs/inode.h delete mode 
100644 fs/bcachefs/inode_format.h delete mode 100644 fs/bcachefs/io_misc.c delete mode 100644 fs/bcachefs/io_misc.h delete mode 100644 fs/bcachefs/io_read.c delete mode 100644 fs/bcachefs/io_read.h delete mode 100644 fs/bcachefs/io_write.c delete mode 100644 fs/bcachefs/io_write.h delete mode 100644 fs/bcachefs/io_write_types.h delete mode 100644 fs/bcachefs/journal.c delete mode 100644 fs/bcachefs/journal.h delete mode 100644 fs/bcachefs/journal_io.c delete mode 100644 fs/bcachefs/journal_io.h delete mode 100644 fs/bcachefs/journal_reclaim.c delete mode 100644 fs/bcachefs/journal_reclaim.h delete mode 100644 fs/bcachefs/journal_sb.c delete mode 100644 fs/bcachefs/journal_sb.h delete mode 100644 fs/bcachefs/journal_seq_blacklist.c delete mode 100644 fs/bcachefs/journal_seq_blacklist.h delete mode 100644 fs/bcachefs/journal_seq_blacklist_format.h delete mode 100644 fs/bcachefs/journal_types.h delete mode 100644 fs/bcachefs/keylist.c delete mode 100644 fs/bcachefs/keylist.h delete mode 100644 fs/bcachefs/keylist_types.h delete mode 100644 fs/bcachefs/logged_ops.c delete mode 100644 fs/bcachefs/logged_ops.h delete mode 100644 fs/bcachefs/logged_ops_format.h delete mode 100644 fs/bcachefs/lru.c delete mode 100644 fs/bcachefs/lru.h delete mode 100644 fs/bcachefs/lru_format.h delete mode 100644 fs/bcachefs/mean_and_variance.c delete mode 100644 fs/bcachefs/mean_and_variance.h delete mode 100644 fs/bcachefs/mean_and_variance_test.c delete mode 100644 fs/bcachefs/migrate.c delete mode 100644 fs/bcachefs/migrate.h delete mode 100644 fs/bcachefs/move.c delete mode 100644 fs/bcachefs/move.h delete mode 100644 fs/bcachefs/move_types.h delete mode 100644 fs/bcachefs/movinggc.c delete mode 100644 fs/bcachefs/movinggc.h delete mode 100644 fs/bcachefs/namei.c delete mode 100644 fs/bcachefs/namei.h delete mode 100644 fs/bcachefs/nocow_locking.c delete mode 100644 fs/bcachefs/nocow_locking.h delete mode 100644 fs/bcachefs/nocow_locking_types.h delete mode 100644 fs/bcachefs/opts.c delete mode 100644 fs/bcachefs/opts.h delete mode 100644 fs/bcachefs/printbuf.c delete mode 100644 fs/bcachefs/printbuf.h delete mode 100644 fs/bcachefs/progress.c delete mode 100644 fs/bcachefs/progress.h delete mode 100644 fs/bcachefs/quota.c delete mode 100644 fs/bcachefs/quota.h delete mode 100644 fs/bcachefs/quota_format.h delete mode 100644 fs/bcachefs/quota_types.h delete mode 100644 fs/bcachefs/rcu_pending.c delete mode 100644 fs/bcachefs/rcu_pending.h delete mode 100644 fs/bcachefs/rebalance.c delete mode 100644 fs/bcachefs/rebalance.h delete mode 100644 fs/bcachefs/rebalance_format.h delete mode 100644 fs/bcachefs/rebalance_types.h delete mode 100644 fs/bcachefs/recovery.c delete mode 100644 fs/bcachefs/recovery.h delete mode 100644 fs/bcachefs/recovery_passes.c delete mode 100644 fs/bcachefs/recovery_passes.h delete mode 100644 fs/bcachefs/recovery_passes_format.h delete mode 100644 fs/bcachefs/recovery_passes_types.h delete mode 100644 fs/bcachefs/reflink.c delete mode 100644 fs/bcachefs/reflink.h delete mode 100644 fs/bcachefs/reflink_format.h delete mode 100644 fs/bcachefs/replicas.c delete mode 100644 fs/bcachefs/replicas.h delete mode 100644 fs/bcachefs/replicas_format.h delete mode 100644 fs/bcachefs/replicas_types.h delete mode 100644 fs/bcachefs/sb-clean.c delete mode 100644 fs/bcachefs/sb-clean.h delete mode 100644 fs/bcachefs/sb-counters.c delete mode 100644 fs/bcachefs/sb-counters.h delete mode 100644 fs/bcachefs/sb-counters_format.h delete mode 100644 fs/bcachefs/sb-downgrade.c delete mode 100644 
fs/bcachefs/sb-downgrade.h delete mode 100644 fs/bcachefs/sb-downgrade_format.h delete mode 100644 fs/bcachefs/sb-errors.c delete mode 100644 fs/bcachefs/sb-errors.h delete mode 100644 fs/bcachefs/sb-errors_format.h delete mode 100644 fs/bcachefs/sb-errors_types.h delete mode 100644 fs/bcachefs/sb-members.c delete mode 100644 fs/bcachefs/sb-members.h delete mode 100644 fs/bcachefs/sb-members_format.h delete mode 100644 fs/bcachefs/sb-members_types.h delete mode 100644 fs/bcachefs/seqmutex.h delete mode 100644 fs/bcachefs/siphash.c delete mode 100644 fs/bcachefs/siphash.h delete mode 100644 fs/bcachefs/six.c delete mode 100644 fs/bcachefs/six.h delete mode 100644 fs/bcachefs/snapshot.c delete mode 100644 fs/bcachefs/snapshot.h delete mode 100644 fs/bcachefs/snapshot_format.h delete mode 100644 fs/bcachefs/snapshot_types.h delete mode 100644 fs/bcachefs/str_hash.c delete mode 100644 fs/bcachefs/str_hash.h delete mode 100644 fs/bcachefs/subvolume.c delete mode 100644 fs/bcachefs/subvolume.h delete mode 100644 fs/bcachefs/subvolume_format.h delete mode 100644 fs/bcachefs/subvolume_types.h delete mode 100644 fs/bcachefs/super-io.c delete mode 100644 fs/bcachefs/super-io.h delete mode 100644 fs/bcachefs/super.c delete mode 100644 fs/bcachefs/super.h delete mode 100644 fs/bcachefs/super_types.h delete mode 100644 fs/bcachefs/sysfs.c delete mode 100644 fs/bcachefs/sysfs.h delete mode 100644 fs/bcachefs/tests.c delete mode 100644 fs/bcachefs/tests.h delete mode 100644 fs/bcachefs/thread_with_file.c delete mode 100644 fs/bcachefs/thread_with_file.h delete mode 100644 fs/bcachefs/thread_with_file_types.h delete mode 100644 fs/bcachefs/time_stats.c delete mode 100644 fs/bcachefs/time_stats.h delete mode 100644 fs/bcachefs/trace.c delete mode 100644 fs/bcachefs/trace.h delete mode 100644 fs/bcachefs/two_state_shared_lock.c delete mode 100644 fs/bcachefs/two_state_shared_lock.h delete mode 100644 fs/bcachefs/util.c delete mode 100644 fs/bcachefs/util.h delete mode 100644 fs/bcachefs/varint.c delete mode 100644 fs/bcachefs/varint.h delete mode 100644 fs/bcachefs/vstructs.h delete mode 100644 fs/bcachefs/xattr.c delete mode 100644 fs/bcachefs/xattr.h delete mode 100644 fs/bcachefs/xattr_format.h diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst deleted file mode 100644 index b29562a6bf555c..00000000000000 --- a/Documentation/filesystems/bcachefs/CodingStyle.rst +++ /dev/null @@ -1,186 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -bcachefs coding style -===================== - -Good development is like gardening, and codebases are our gardens. Tend to them -every day; look for little things that are out of place or in need of tidying. -A little weeding here and there goes a long way; don't wait until things have -spiraled out of control. - -Things don't always have to be perfect - nitpicking often does more harm than -good. But appreciate beauty when you see it - and let people know. - -The code that you are afraid to touch is the code most in need of refactoring. - -A little organizing here and there goes a long way. - -Put real thought into how you organize things. - -Good code is readable code, where the structure is simple and leaves nowhere -for bugs to hide. - -Assertions are one of our most important tools for writing reliable code. 
If in -the course of writing a patchset you encounter a condition that shouldn't -happen (and will have unpredictable or undefined behaviour if it does), or -you're not sure if it can happen and not sure how to handle it yet - make it a -BUG_ON(). Don't leave undefined or unspecified behavior lurking in the codebase. - -By the time you finish the patchset, you should understand better which -assertions need to be handled and turned into checks with error paths, and -which should be logically impossible. Leave the BUG_ON()s in for the ones which -are logically impossible. (Or, make them debug mode assertions if they're -expensive - but don't turn everything into a debug mode assertion, so that -we're not stuck debugging undefined behaviour should it turn out that you were -wrong). - -Assertions are documentation that can't go out of date. Good assertions are -wonderful. - -Good assertions drastically and dramatically reduce the amount of testing -required to shake out bugs. - -Good assertions are based on state, not logic. To write good assertions, you -have to think about what the invariants on your state are. - -Good invariants and assertions will hold everywhere in your codebase. This -means that you can run them in only a few places in the checked in version, but -should you need to debug something that caused the assertion to fail, you can -quickly shotgun them everywhere to find the codepath that broke the invariant. - -A good assertion checks something that the compiler could check for us, and -elide - if we were working in a language with embedded correctness proofs that -the compiler could check. This is something that exists today, but it'll likely -still be a few decades before it comes to systems programming languages. But we -can still incorporate that kind of thinking into our code and document the -invariants with runtime checks - much like the way people working in -dynamically typed languages may add type annotations, gradually making their -code statically typed. - -Looking for ways to make your assertions simpler - and higher level - will -often nudge you towards making the entire system simpler and more robust. - -Good code is code where you can poke around and see what it's doing - -introspection. We can't debug anything if we can't see what's going on. - -Whenever we're debugging, and the solution isn't immediately obvious, if the -issue is that we don't know where the issue is because we can't see what's -going on - fix that first. - -We have the tools to make anything visible at runtime, efficiently - RCU and -percpu data structures among them. Don't let things stay hidden. - -The most important tool for introspection is the humble pretty printer - in -bcachefs, this means `*_to_text()` functions, which output to printbufs. - -Pretty printers are wonderful, because they compose and you can use them -everywhere. Having functions to print whatever object you're working with will -make your error messages much easier to write (therefore they will actually -exist) and much more informative. And they can be used from sysfs/debugfs, as -well as tracepoints. - -Runtime info and debugging tools should come with clear descriptions and -labels, and good structure - we don't want files with a list of bare integers, -like in procfs. Part of the job of the debugging tools is to educate users and -new developers as to how the system works. - -Error messages should, whenever possible, tell you everything you need to debug -the issue. It's worth putting effort into them. 
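As a concrete illustration of the pretty-printer convention described above - a minimal sketch only, where `my_obj` and its fields are invented for the example, while `struct printbuf` and `prt_printf()` are the real interfaces::

    /* Hypothetical object; only printbuf and prt_printf() are real: */
    struct my_obj {
        u32 state;
        u64 nr_ops;
    };

    /* The *_to_text() convention: one composable printer per object,
     * reusable from sysfs, debugfs, error messages and tracepoints. */
    static void my_obj_to_text(struct printbuf *out, const struct my_obj *o)
    {
        prt_printf(out, "state %u nr_ops %llu", o->state, o->nr_ops);
    }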
- -Tracepoints shouldn't be the first thing you reach for. They're an important -tool, but always look for more immediate ways to make things visible. When we -have to rely on tracing, we have to know which tracepoints we're looking for, -and then we have to run the troublesome workload, and then we have to sift -through logs. This is a lot of steps to go through when a user is hitting -something, and if it's intermittent it may not even be possible. - -The humble counter is an incredibly useful tool. Counters are cheap and simple to -use, and many complicated internal operations with lots of things that can -behave weirdly (anything involving memory reclaim, for example) become -shockingly easy to debug once you have counters on every distinct codepath. - -Persistent counters are even better. - -When debugging, try to get the most out of every bug you come across; don't -rush to fix the initial issue. Look for things that will make related bugs -easier the next time around - introspection, new assertions, better error -messages, new debug tools - and do those first. Look for ways to make the system -better behaved; often one bug will uncover several other bugs through -downstream effects. - -Fix all that first, and then the original bug last - even if that means keeping -a user waiting. They'll thank you in the long run, and when they understand -what you're doing you'll be amazed at how patient they're happy to be. Users -like to help - otherwise they wouldn't be reporting the bug in the first place. - -Talk to your users. Don't isolate yourself. - -Users notice all sorts of interesting things, and by just talking to them and -interacting with them you can benefit from their experience. - -Spend time doing support and helpdesk stuff. Don't just write code - code isn't -finished until it's being used trouble-free. - -This will also motivate you to make your debugging tools as good as possible, -and perhaps even your documentation, too. Like anything else in life, the more -time you spend at it the better you'll get, and you the developer are the -person most able to improve the tools to make debugging quick and easy. - -Be wary of how you take on and commit to big projects. Don't let development -become product-manager focused. Oftentimes an idea is a good one but needs to -wait for its proper time - but you won't know if it's the proper time for an -idea until you start writing code. - -Expect to throw a lot of things away, or leave them half finished for later. -Nobody writes all perfect code that all gets shipped, and you'll be much more -productive in the long run if you notice this early and shift to something -else. The experience gained and lessons learned will be valuable for all the -other work you do. - -But don't be afraid to tackle projects that require significant rework of -existing code. Sometimes these can be the best projects, because they can lead -us to make existing code more general, more flexible, more multipurpose and -perhaps more robust. Just don't hesitate to abandon the idea if it looks like -it's going to make a mess of things. - -Complicated features can often be done as a series of refactorings, with the -final change that actually implements the feature as quite a small patch at the -end. It's wonderful when this happens, especially when those refactorings are -things that improve the codebase in their own right. When that happens there's -much less risk of wasted effort if the feature you were going for doesn't work -out. - -Always strive to work incrementally.
Always strive to turn the big projects -into little bite-sized projects that can prove their own merits. - -Instead of always tackling those big projects, look for little things that -will be useful, and make the big projects easier. - -The question of what's likely to be useful is where junior developers most -often go astray - doing something because it seems like it'll be useful often -leads to overengineering. Knowing what's useful comes from many years of -experience, or talking with people who have that experience - or from simply -reading lots of code and looking for common patterns and issues. Don't be -afraid to throw things away and do something simpler. - -Talk about your ideas with your fellow developers; oftentimes the best things -come from relaxed conversations where people aren't afraid to say "what if?". - -Don't neglect your tools. - -The most important tools (besides the compiler and our text editor) are the -tools we use for testing. The shortest possible edit/test/debug cycle is -essential for working productively. We learn, gain experience, and discover the -errors in our thinking by running our code and seeing what happens. If your -time is being wasted because your tools are bad or too slow - don't accept it, -fix it. - -Put effort into your documentation, commit messages, and code comments - but -don't go overboard. A good commit message is wonderful - but if the information -was important enough to go in a commit message, ask yourself if it would be -even better as a code comment. - -A good code comment is wonderful, but even better is the comment that didn't -need to exist because the code was so straightforward as to be obvious; -organized into small clean and tidy modules, with clear and descriptive names -for functions and variables, where every line of code has a clear purpose. diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst deleted file mode 100644 index 18c79d5483911d..00000000000000 --- a/Documentation/filesystems/bcachefs/SubmittingPatches.rst +++ /dev/null @@ -1,105 +0,0 @@ -Submitting patches to bcachefs -============================== - -Here are suggestions for submitting patches to the bcachefs subsystem. - -Submission checklist -------------------- - -Patches must be tested before being submitted, either with the xfstests suite -[0]_, or the full bcachefs test suite in ktest [1]_, depending on what's being -touched. Note that ktest wraps xfstests and will be an easier way to run -it for most users; it includes single-command wrappers for all the mainstream -in-kernel local filesystems. - -Patches will undergo more testing after being merged (including -lockdep/kasan/preempt/etc. variants); these are not generally required to be -run by the submitter - but do put some thought into what you're changing and -which tests might be relevant: e.g. if you're dealing with tricky memory layout -work, run kasan; if you're doing locking work, run lockdep. ktest includes -single-command variants for the debug build types you'll most likely need. - -The exception to this rule is incomplete WIP/RFC patches: if you're working on -something nontrivial, it's encouraged to send out a WIP patch to let people -know what you're doing and make sure you're on the right track. Just make sure -it includes a brief note as to what's done and what's incomplete, to avoid -confusion.
- -Rigorous checkpatch.pl adherence is not required (many of its warnings are -considered out of date), but try not to deviate too much without reason. - -Focus on writing code that reads well and is organized well; code should be -aesthetically pleasing. - -CI -- - -Instead of running your tests locally, when running the full test suite it's -preferable to let a server farm do it in parallel, and then have the results -in a nice test dashboard (which can tell you which failures are new, and -presents results in a git log view, avoiding the need for most bisecting). - -That exists [2]_, and community members may request an account. If you work for -a big tech company, you'll need to help out with server costs to get access - -but the CI is not restricted to running bcachefs tests: it runs any ktest test -(which generally makes it easy to wrap other tests that can run in qemu). - -Other things to think about ---------------------------- - -- How will we debug this code? Is there sufficient introspection to diagnose - when something starts acting wonky on a user machine? - - We don't necessarily need every single field of every data structure visible - with introspection, but having the important fields of all the core data - types wired up makes debugging drastically easier - a bit of thoughtful - foresight greatly reduces the need to have people build custom kernels with - debug patches. - - More broadly, think about all the debug tooling that might be needed. - -- Does it make the codebase more or less of a mess? Can we try to do some - organizing, too? - -- Do new tests need to be written? New assertions? How do we know and verify - that the code is correct, and what happens if something goes wrong? - - We don't yet have automated code coverage analysis or easy fault injection - - but for now, pretend we did and ask what they might tell us. - - Assertions are hugely important, given that we don't yet have a systems - language that can do ergonomic embedded correctness proofs. Hitting an assert - in testing is much better than wandering off into undefined behaviour la-la - land - use them. Use them judiciously, and not as a replacement for proper - error handling, but use them. - -- Does it need to be performance tested? Should we add new performance counters? - - bcachefs has a set of persistent runtime counters which can be viewed with - the 'bcachefs fs top' command; this should give users a basic idea of what - their filesystem is currently doing. If you're doing a new feature or looking - at old code, think about whether anything should be added. - -- If it's a new on-disk format feature - have upgrades and downgrades been - tested? (Automated tests exist but aren't in the CI, due to the hassle of - disk image management; coordinate to have them run.) - -Mailing list, IRC ----------------- - -Patches should hit the list [3]_, but much discussion and code review happens -on IRC as well [4]_; many people appreciate the more conversational approach -and quicker feedback. - -Additionally, we have a lively user community doing excellent QA work, which -exists primarily on IRC. Please make use of that resource; user feedback is -important for any nontrivial feature, and documenting it in commit messages -would be a good idea. - -.. rubric:: References - -.. [0] git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git -.. [1] https://evilpiepirate.org/git/ktest.git/ -.. [2] https://evilpiepirate.org/~testdashboard/ci/ -.. [3] linux-bcachefs@vger.kernel.org -..
[4] irc.oftc.net#bcache, #bcachefs-dev diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst deleted file mode 100644 index 871a38f557e8e4..00000000000000 --- a/Documentation/filesystems/bcachefs/casefolding.rst +++ /dev/null @@ -1,108 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Casefolding -=========== - -bcachefs has support for case-insensitive file and directory -lookups using the regular `chattr +F` (`S_CASEFOLD`, `FS_CASEFOLD_FL`) -casefolding attributes. - -The main use case for casefolding is compatibility with software written -against other filesystems that rely on casefolded lookups -(e.g. NTFS and Wine/Proton). -Taking advantage of filesystem-level casefolding can lead to great -loading time gains in many applications and games. - -Casefolding support requires a kernel with `CONFIG_UNICODE` enabled. -Once a directory has been flagged for casefolding, a feature bit -is enabled on the superblock which marks the filesystem as using -casefolding. -When the feature bit for casefolding is enabled, it is no longer possible -to mount that filesystem on kernels without `CONFIG_UNICODE` enabled. - -On the lookup/query side: casefolding is implemented by allocating a new -string of `BCH_NAME_MAX` length and using the `utf8_casefold` function to -casefold the query string. - -On the dirent side: casefolding is implemented by ensuring the `bkey`'s -hash is made from the casefolded string and storing the cached casefolded -name with the regular name in the dirent. - -The structure looks like this: - -* Regular: [dirent data][regular name][nul][nul]... -* Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]... - -(Do note, the number of NULs here is merely for illustration; their count can -vary per-key, and they may not even be present if the key is aligned to -`sizeof(u64)`.) - -This is efficient, as it means that a lookup in a casefolded directory has -identical performance to a regular lookup: -a hash comparison and a `memcmp` of the name. - -Rationale --------- - -Several designs were considered for this system: -One was to introduce a dirent_v2; however, that would be painful, especially as -the hash system only has support for a single key type. This would also need -`BCH_NAME_MAX` to change between versions, and a new feature bit. - -Another option was to store without the two lengths, and just take the length of -the regular name and casefolded name contiguously / 2 as the length. This would -assume that the regular length == casefolded length, but that could potentially -not be true if the uppercase unicode glyph had a different UTF-8 encoding than -the lowercase unicode glyph. -It would be possible to disregard the casefold cache for those cases, but it was -decided to simply encode the two string lengths in the key to avoid random -performance issues if this edge case were ever hit. - -The option settled on was to use a free bit in d_type to mark a dirent as having -a casefold cache, and then treat the first 4 bytes of the name block as lengths. -You can see this in the `d_cf_name_block` member of the union in `bch_dirent`. - -The feature bit approach allows casefolding support to be enabled for the majority -of users, while still allowing users who have no need for the feature to use bcachefs, as -`CONFIG_UNICODE` can increase the kernel size by a significant amount due to the tables used, -which may be the deciding factor for using bcachefs on e.g. embedded platforms.
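For illustration, the casefolded name block described above can be sketched roughly as follows (a paraphrase under stated assumptions - the field names approximate the `d_cf_name_block` union member rather than quoting `bch_dirent` verbatim)::

    struct cf_name_block {
        __le16 d_name_len;    /* [reg len]: length of the regular name */
        __le16 d_cf_name_len; /* [cf len]: length of the casefolded name */
        __u8   d_names[];     /* [regular name][casefolded name], then
                               * NUL padding up to u64 alignment */
    };

A lookup hashes the casefolded query string, finds the key, and memcmp()s against the cached casefolded name - the same cost as a regular lookup.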
- -Other filesystems like ext4 and f2fs have a superblock-level option for the casefolding -encoding, but bcachefs currently does not provide this. ext4 and f2fs do not expose -any encodings other than a single UTF-8 version. When future encodings are desirable, -they will be added trivially using the opts mechanism. - -dentry/dcache considerations ---------------------------- - -Currently, in casefolded directories, bcachefs (like other filesystems) will not cache -negative dentries. - -This is because currently doing so presents a problem in the following scenario: - - - Lookup file "blAH" in a casefolded directory - - Creation of file "BLAH" in a casefolded directory - - Lookup file "blAH" in a casefolded directory - -This would fail if negative dentries were cached. - -This is slightly suboptimal, but could be fixed in the future with some VFS work. - - -References ---------- - -(from Peter Anvin, on the list) - -It is worth noting that Microsoft has basically declared their -"recommended" case folding (upcase) table to be permanently frozen (for -new filesystem instances in the case where they use an on-disk -translation table created at format time.) As far as I know they have -never supported anything other than 1:1 conversion of BMP code points, -nor normalization. - -The exFAT specification enumerates the full recommended upcase table, -although in a somewhat annoying format (basically a hex dump of -compressed data): - -https://learn.microsoft.com/en-us/windows/win32/fileio/exfat-specification diff --git a/Documentation/filesystems/bcachefs/errorcodes.rst b/Documentation/filesystems/bcachefs/errorcodes.rst deleted file mode 100644 index 2cccaa0ba7cd4d..00000000000000 --- a/Documentation/filesystems/bcachefs/errorcodes.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -bcachefs private error codes ---------------------------- - -In bcachefs, as a hard rule we do not throw or directly use standard error -codes (-EINVAL, -EBUSY, etc.). Instead, we define private error codes as needed -in fs/bcachefs/errcode.h. - -This gives us much better error messages and makes debugging much easier. Any -direct uses of standard error codes you see in the source code are simply old -code that has yet to be converted - feel free to clean it up! - -Private error codes may subtype another error code; this allows for grouping of -related errors that should be handled similarly (e.g. transaction restart -errors), as well as specifying which standard error code should be returned at -the bcachefs module boundary. - -At the module boundary, we use bch2_err_class() to convert to a standard error -code; this also emits a trace event so that the original error code can be -recovered even if it wasn't logged. - -Do not reuse error codes! Generally speaking, a private error code should only -be thrown in one place. That means that when we see it in a log message we can -see, unambiguously, exactly which file and line number it was returned from. - -Try to give error codes names that are as reasonably descriptive of the error -as possible. Frequently, the error will be logged at a place far removed from -where the error was generated; good names for error codes mean much more -descriptive and useful error messages.
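To make the pattern concrete, here is a hedged sketch of how a private error code flows through the system (the x() macro shape follows fs/bcachefs/errcode.h, but `ENOENT_no_such_example` is an invented entry for illustration)::

    /* errcode.h: each x(class, name) entry subtypes a standard errno (or
     * another private code), producing BCH_ERR_ENOENT_no_such_example: */
    #define BCH_ERRCODES()                          \
        x(ENOENT,       ENOENT_no_such_example)     \
        /* ... */

    /* At the (single) throw site: */
    return -BCH_ERR_ENOENT_no_such_example;

    /* Callers can match on the private code or any of its parents: */
    if (bch2_err_matches(ret, ENOENT))
        return 0;

    /* At the module boundary, convert back to the standard errno (this
     * also emits the trace event mentioned above): */
    return bch2_err_class(ret);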
diff --git a/Documentation/filesystems/bcachefs/future/idle_work.rst b/Documentation/filesystems/bcachefs/future/idle_work.rst deleted file mode 100644 index 59a332509dcd97..00000000000000 --- a/Documentation/filesystems/bcachefs/future/idle_work.rst +++ /dev/null @@ -1,78 +0,0 @@ -Idle/background work classes design doc: - -Right now, our behaviour at idle isn't ideal: it was designed for servers that -would be under sustained load, to keep pending work at a "medium" level, to -let work build up so we can process it in more efficient batches, while also -giving headroom for bursts in load. - -But for desktops or mobile - scenarios where work is less sustained and power -usage is more important - we want to operate differently, with a "rush to -idle" so the system can go to sleep. We don't want to be dribbling out -background work while the system should be idle. - -The complicating factor is that there are a number of background tasks, which -form a hierarchy (or a digraph, depending on how you divide it up) - one -background task may generate work for another. - -Thus proper idle detection needs to model this hierarchy. - -- Foreground writes -- Page cache writeback -- Copygc, rebalance -- Journal reclaim - -When we implement idle detection and rush to idle, we need to be careful not -to disturb the existing behaviour too much, since it works reasonably well when the -system is under sustained load (or perhaps improve it in the case of -rebalance, which currently does not actively attempt to let work batch up). - -SUSTAINED LOAD REGIME --------------------- - -When the system is under continuous load, we want these jobs to run -continuously - this is perhaps best modelled with a P/D controller, where -they'll be trying to keep a target value (i.e. fragmented disk space, -available journal space) roughly in the middle of some range. - -The goal under sustained load is to balance our ability to handle load spikes -without running out of resource x (free disk space, free space in the -journal), while also letting some work accumulate to be batched (or become -unnecessary). - -For example, we don't want to run copygc too aggressively, because then it -will be evacuating buckets that would have become empty (been overwritten or -deleted) anyway, and we don't want to wait until we're almost out of free -space, because then the system will behave unpredictably - suddenly we're doing -a lot more work to service each write and the system becomes much slower. - -IDLE REGIME ----------- - -When the system becomes idle, we should start flushing our pending work -more quickly so the system can go to sleep. - -Note that the definition of "idle" depends on where in the hierarchy a task -is - a task should start flushing work more quickly when the task above it has -stopped generating new work. - -e.g. rebalance should start flushing more quickly when page cache writeback is -idle, and journal reclaim should only start flushing more quickly when both -copygc and rebalance are idle. - -It's important to let work accumulate when more work is still incoming and we -still have room, because flushing is always more efficient if we let it batch -up. New writes may overwrite data before rebalance moves it, and tasks may be -generating more updates for the btree nodes that journal reclaim needs to flush. - -On idle, how much work we do at each interval should be proportional to the -length of time we have been idle for.
If we're idle only for a short duration, -we shouldn't flush everything right away; the system might wake up and start -generating new work soon, and flushing immediately might end up doing a lot of -work that would have been unnecessary if we'd allowed things to batch more. - -To summarize, we will need: - - - A list of classes for background tasks that generate work, which will - include one "foreground" class. - - Tracking for each class - "Am I doing work, or have I gone to sleep?" - - And each class should check the class above it when deciding how much work to issue. diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst deleted file mode 100644 index e5c4c2120b93e8..00000000000000 --- a/Documentation/filesystems/bcachefs/index.rst +++ /dev/null @@ -1,38 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================== -bcachefs Documentation -====================== - -Subsystem-specific development process notes --------------------------------------------- - -Development notes specific to bcachefs. These are intended to supplement -:doc:`general kernel development handbook `. - -.. toctree:: - :maxdepth: 1 - :numbered: - - CodingStyle - SubmittingPatches - -Filesystem implementation -------------------------- - -Documentation for filesystem features and their implementation details. -At this moment, only a few of these are described here. - -.. toctree:: - :maxdepth: 1 - :numbered: - - casefolding - errorcodes - -Future design -------------- -.. toctree:: - :maxdepth: 1 - - future/idle_work diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 11a599387266a4..622187a96bdc6c 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -72,7 +72,6 @@ Documentation for filesystem implementations. 
afs autofs autofs-mount-control - bcachefs/index befs bfs btrfs diff --git a/MAINTAINERS b/MAINTAINERS index 8eaecbb2cc2baf..4fd5b2ec4f85c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4217,10 +4217,7 @@ M: Kent Overstreet L: linux-bcachefs@vger.kernel.org S: Externally maintained C: irc://irc.oftc.net/bcache -P: Documentation/filesystems/bcachefs/SubmittingPatches.rst T: git https://evilpiepirate.org/git/bcachefs.git -F: fs/bcachefs/ -F: Documentation/filesystems/bcachefs/ BDISP ST MEDIA DRIVER M: Fabien Dessenne diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 5171bb183967b9..b5546a3ac9c5bd 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -454,7 +454,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 16f343ae48c675..4ea0d686e28eaa 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -411,7 +411,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index c08788728ea962..0698d9d4b04e8c 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -431,7 +431,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 962497e7c53fd6..45d1ee0860e531 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -403,7 +403,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index ec28650189e406..e5794b906b655f 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -413,7 +413,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 0afb3ad180dee3..fb84ba4c135006 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -430,7 +430,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index b311e953995d6d..9a05e05523fc65 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -517,7 +517,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index f4e6224f137f99..0e30aa574a2b49 100644 --- 
a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -403,7 +403,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 498e167222f18c..d6f5600d941065 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -404,7 +404,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 8c6b1eef853423..16f0adff4ada7f 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -420,7 +420,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index c34648f299efb9..5b0e273be74bdb 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -401,7 +401,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 73810d14660f21..3851d6720fac45 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -401,7 +401,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 5e616bc988ac35..8ceef305f13875 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -658,9 +658,6 @@ CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_BTRFS_DEBUG=y CONFIG_BTRFS_ASSERT=y CONFIG_NILFS2_FS=m -CONFIG_BCACHEFS_FS=y -CONFIG_BCACHEFS_QUOTA=y -CONFIG_BCACHEFS_POSIX_ACL=y CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 094599cdaf4d9b..3fc12b0af55b25 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -645,9 +645,6 @@ CONFIG_OCFS2_FS=m CONFIG_BTRFS_FS=y CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m -CONFIG_BCACHEFS_FS=m -CONFIG_BCACHEFS_QUOTA=y -CONFIG_BCACHEFS_POSIX_ACL=y CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y diff --git a/fs/Kconfig b/fs/Kconfig index c654a364289700..7815379032dacb 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -51,7 +51,6 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" source "fs/f2fs/Kconfig" -source "fs/bcachefs/Kconfig" source "fs/zonefs/Kconfig" endif # BLOCK diff --git a/fs/Makefile b/fs/Makefile index 334654f9584b94..e3523ab2e58713 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -121,7 +121,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_F2FS_FS) += f2fs/ -obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git 
a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig deleted file mode 100644 index 8cb2b9d5da96f3..00000000000000 --- a/fs/bcachefs/Kconfig +++ /dev/null @@ -1,121 +0,0 @@ - -config BCACHEFS_FS - tristate "bcachefs filesystem support (EXPERIMENTAL)" - depends on BLOCK - select EXPORTFS - select CLOSURES - select CRC32 - select CRC64 - select FS_POSIX_ACL - select LZ4_COMPRESS - select LZ4_DECOMPRESS - select LZ4HC_COMPRESS - select LZ4HC_DECOMPRESS - select ZLIB_DEFLATE - select ZLIB_INFLATE - select ZSTD_COMPRESS - select ZSTD_DECOMPRESS - select CRYPTO_LIB_SHA256 - select CRYPTO_LIB_CHACHA - select CRYPTO_LIB_POLY1305 - select KEYS - select RAID6_PQ - select XOR_BLOCKS - select XXHASH - select SRCU - select SYMBOLIC_ERRNAME - select MIN_HEAP - select XARRAY_MULTI - help - The bcachefs filesystem - a modern, copy-on-write filesystem, with - support for multiple devices, compression, checksumming, etc. - -config BCACHEFS_QUOTA - bool "bcachefs quota support" - depends on BCACHEFS_FS - select QUOTACTL - -config BCACHEFS_ERASURE_CODING - bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)" - depends on BCACHEFS_FS - select QUOTACTL - help - This enables the "erasure_code" filesystem and inode option, which - organizes data into Reed-Solomon stripes instead of ordinary - replication. - - WARNING: this feature is still undergoing on-disk format changes, and - should only be enabled for testing purposes. - -config BCACHEFS_POSIX_ACL - bool "bcachefs POSIX ACL support" - depends on BCACHEFS_FS - select FS_POSIX_ACL - -config BCACHEFS_DEBUG - bool "bcachefs debugging" - depends on BCACHEFS_FS - help - Enables many extra debugging checks and assertions. - - The resulting code will be significantly slower than normal; you - probably shouldn't select this option unless you're a developer. - -config BCACHEFS_INJECT_TRANSACTION_RESTARTS - bool "Randomly inject transaction restarts" - depends on BCACHEFS_DEBUG - help - Randomly inject transaction restarts in a few core paths - may have a - significant performance penalty - -config BCACHEFS_TESTS - bool "bcachefs unit and performance tests" - depends on BCACHEFS_FS - help - Include some unit and performance tests for the core btree code - -config BCACHEFS_LOCK_TIME_STATS - bool "bcachefs lock time statistics" - depends on BCACHEFS_FS - help - Expose statistics for how long we held a lock in debugfs - -config BCACHEFS_NO_LATENCY_ACCT - bool "disable latency accounting and time stats" - depends on BCACHEFS_FS - help - This disables device latency tracking and time stats, only for performance testing - -config BCACHEFS_SIX_OPTIMISTIC_SPIN - bool "Optimistic spinning for six locks" - depends on BCACHEFS_FS - depends on SMP - default y - help - Instead of immediately sleeping when attempting to take a six lock that - is held by another thread, spin for a short while, as long as the - thread owning the lock is running. - -config BCACHEFS_PATH_TRACEPOINTS - bool "Extra btree_path tracepoints" - depends on BCACHEFS_FS && TRACING - help - Enable extra tracepoints for debugging btree_path operations; we don't - normally want these enabled because they happen at very high rates.
- -config BCACHEFS_TRANS_KMALLOC_TRACE - bool "Trace bch2_trans_kmalloc() calls" - depends on BCACHEFS_FS - -config BCACHEFS_ASYNC_OBJECT_LISTS - bool "Keep async objects on fast_lists for debugfs visibility" - depends on BCACHEFS_FS && DEBUG_FS - -config MEAN_AND_VARIANCE_UNIT_TEST - tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS - depends on KUNIT - depends on BCACHEFS_FS - default KUNIT_ALL_TESTS - help - This option enables the kunit tests for the mean_and_variance module. - If unsure, say N. diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile deleted file mode 100644 index 93c8ee5425c8dd..00000000000000 --- a/fs/bcachefs/Makefile +++ /dev/null @@ -1,107 +0,0 @@ - -obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o - -bcachefs-y := \ - acl.o \ - alloc_background.o \ - alloc_foreground.o \ - backpointers.o \ - bkey.o \ - bkey_methods.o \ - bkey_sort.o \ - bset.o \ - btree_cache.o \ - btree_gc.o \ - btree_io.o \ - btree_iter.o \ - btree_journal_iter.o \ - btree_key_cache.o \ - btree_locking.o \ - btree_node_scan.o \ - btree_trans_commit.o \ - btree_update.o \ - btree_update_interior.o \ - btree_write_buffer.o \ - buckets.o \ - buckets_waiting_for_journal.o \ - chardev.o \ - checksum.o \ - clock.o \ - compress.o \ - darray.o \ - data_update.o \ - debug.o \ - dirent.o \ - disk_accounting.o \ - disk_groups.o \ - ec.o \ - enumerated_ref.o \ - errcode.o \ - error.o \ - extents.o \ - extent_update.o \ - eytzinger.o \ - fast_list.o \ - fs.o \ - fs-ioctl.o \ - fs-io.o \ - fs-io-buffered.o \ - fs-io-direct.o \ - fs-io-pagecache.o \ - fsck.o \ - inode.o \ - io_read.o \ - io_misc.o \ - io_write.o \ - journal.o \ - journal_io.o \ - journal_reclaim.o \ - journal_sb.o \ - journal_seq_blacklist.o \ - keylist.o \ - logged_ops.o \ - lru.o \ - mean_and_variance.o \ - migrate.o \ - move.o \ - movinggc.o \ - namei.o \ - nocow_locking.o \ - opts.o \ - printbuf.o \ - progress.o \ - quota.o \ - rebalance.o \ - rcu_pending.o \ - recovery.o \ - recovery_passes.o \ - reflink.o \ - replicas.o \ - sb-clean.o \ - sb-counters.o \ - sb-downgrade.o \ - sb-errors.o \ - sb-members.o \ - siphash.o \ - six.o \ - snapshot.o \ - str_hash.o \ - subvolume.o \ - super.o \ - super-io.o \ - sysfs.o \ - tests.o \ - time_stats.o \ - thread_with_file.o \ - trace.o \ - two_state_shared_lock.o \ - util.o \ - varint.o \ - xattr.o - -bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o - -obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o - -# Silence "note: xyz changed in GCC X.X" messages -subdir-ccflags-y += $(call cc-disable-warning, psabi) diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c deleted file mode 100644 index d03adc36100eb8..00000000000000 --- a/fs/bcachefs/acl.c +++ /dev/null @@ -1,445 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" - -#include "acl.h" -#include "xattr.h" - -#include - -static const char * const acl_types[] = { - [ACL_USER_OBJ] = "user_obj", - [ACL_USER] = "user", - [ACL_GROUP_OBJ] = "group_obj", - [ACL_GROUP] = "group", - [ACL_MASK] = "mask", - [ACL_OTHER] = "other", - NULL, -}; - -void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size) -{ - const void *p, *end = value + size; - - if (!value || - size < sizeof(bch_acl_header) || - ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION)) - return; - - p = value + sizeof(bch_acl_header); - while (p < end) { - const bch_acl_entry *in = p; - unsigned tag = le16_to_cpu(in->e_tag); - - prt_str(out, acl_types[tag]); - - switch (tag) { - case ACL_USER_OBJ: - case
ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - p += sizeof(bch_acl_entry_short); - break; - case ACL_USER: - prt_printf(out, " uid %u", le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - case ACL_GROUP: - prt_printf(out, " gid %u", le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - } - - prt_printf(out, " %o", le16_to_cpu(in->e_perm)); - - if (p != end) - prt_char(out, ' '); - } -} - -#ifdef CONFIG_BCACHEFS_POSIX_ACL - -#include "fs.h" - -#include -#include -#include -#include - -static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) -{ - return sizeof(bch_acl_header) + - sizeof(bch_acl_entry_short) * nr_short + - sizeof(bch_acl_entry) * nr_long; -} - -static inline int acl_to_xattr_type(int type) -{ - switch (type) { - case ACL_TYPE_ACCESS: - return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; - case ACL_TYPE_DEFAULT: - return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; - default: - BUG(); - } -} - -/* - * Convert from filesystem to in-memory representation. - */ -static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, - const void *value, size_t size) -{ - const void *p, *end = value + size; - struct posix_acl *acl; - struct posix_acl_entry *out; - unsigned count = 0; - int ret; - - if (!value) - return NULL; - if (size < sizeof(bch_acl_header)) - goto invalid; - if (((bch_acl_header *)value)->a_version != - cpu_to_le32(BCH_ACL_VERSION)) - goto invalid; - - p = value + sizeof(bch_acl_header); - while (p < end) { - const bch_acl_entry *entry = p; - - if (p + sizeof(bch_acl_entry_short) > end) - goto invalid; - - switch (le16_to_cpu(entry->e_tag)) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - p += sizeof(bch_acl_entry_short); - break; - case ACL_USER: - case ACL_GROUP: - p += sizeof(bch_acl_entry); - break; - default: - goto invalid; - } - - count++; - } - - if (p > end) - goto invalid; - - if (!count) - return NULL; - - acl = allocate_dropping_locks(trans, ret, - posix_acl_alloc(count, _gfp)); - if (!acl) - return ERR_PTR(-ENOMEM); - if (ret) { - kfree(acl); - return ERR_PTR(ret); - } - - out = acl->a_entries; - - p = value + sizeof(bch_acl_header); - while (p < end) { - const bch_acl_entry *in = p; - - out->e_tag = le16_to_cpu(in->e_tag); - out->e_perm = le16_to_cpu(in->e_perm); - - switch (out->e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - p += sizeof(bch_acl_entry_short); - break; - case ACL_USER: - out->e_uid = make_kuid(&init_user_ns, - le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - case ACL_GROUP: - out->e_gid = make_kgid(&init_user_ns, - le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - } - - out++; - } - - BUG_ON(out != acl->a_entries + acl->a_count); - - return acl; -invalid: - pr_err("invalid acl entry"); - return ERR_PTR(-EINVAL); -} - -/* - * Convert from in-memory to filesystem representation. 
- */ -static struct bkey_i_xattr * -bch2_acl_to_xattr(struct btree_trans *trans, - const struct posix_acl *acl, - int type) -{ - struct bkey_i_xattr *xattr; - bch_acl_header *acl_header; - const struct posix_acl_entry *acl_e, *pe; - void *outptr; - unsigned nr_short = 0, nr_long = 0, acl_len, u64s; - - FOREACH_ACL_ENTRY(acl_e, acl, pe) { - switch (acl_e->e_tag) { - case ACL_USER: - case ACL_GROUP: - nr_long++; - break; - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - nr_short++; - break; - default: - return ERR_PTR(-EINVAL); - } - } - - acl_len = bch2_acl_size(nr_short, nr_long); - u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); - - if (u64s > U8_MAX) - return ERR_PTR(-E2BIG); - - xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(xattr)) - return xattr; - - bkey_xattr_init(&xattr->k_i); - xattr->k.u64s = u64s; - xattr->v.x_type = acl_to_xattr_type(type); - xattr->v.x_name_len = 0; - xattr->v.x_val_len = cpu_to_le16(acl_len); - - acl_header = xattr_val(&xattr->v); - acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); - - outptr = (void *) acl_header + sizeof(*acl_header); - - FOREACH_ACL_ENTRY(acl_e, acl, pe) { - bch_acl_entry *entry = outptr; - - entry->e_tag = cpu_to_le16(acl_e->e_tag); - entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch (acl_e->e_tag) { - case ACL_USER: - entry->e_id = cpu_to_le32( - from_kuid(&init_user_ns, acl_e->e_uid)); - outptr += sizeof(bch_acl_entry); - break; - case ACL_GROUP: - entry->e_id = cpu_to_le32( - from_kgid(&init_user_ns, acl_e->e_gid)); - outptr += sizeof(bch_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - outptr += sizeof(bch_acl_entry_short); - break; - } - } - - BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); - - return xattr; -} - -struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_iter iter = {}; - struct posix_acl *acl = NULL; - - if (rcu) - return ERR_PTR(-ECHILD); - - struct btree_trans *trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash, inode_inum(inode), &search, 0); - int ret = bkey_err(k); - if (ret) - goto err; - - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - ret = PTR_ERR_OR_ZERO(acl); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) - acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL; - - if (!IS_ERR_OR_NULL(acl)) - set_cached_acl(&inode->v, type, acl); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return acl; -} - -int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode_u, - struct posix_acl *acl, int type) -{ - struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); - int ret; - - if (type == ACL_TYPE_DEFAULT && - !S_ISDIR(inode_u->bi_mode)) - return acl ? 
-EACCES : 0; - - if (acl) { - struct bkey_i_xattr *xattr = - bch2_acl_to_xattr(trans, acl, type); - if (IS_ERR(xattr)) - return PTR_ERR(xattr); - - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, - inum, &xattr->k_i, 0); - } else { - struct xattr_search_key search = - X_SEARCH(acl_to_xattr_type(type), "", 0); - - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, - inum, &search); - } - - return bch2_err_matches(ret, ENOENT) ? 0 : ret; -} - -int bch2_set_acl(struct mnt_idmap *idmap, - struct dentry *dentry, - struct posix_acl *_acl, int type) -{ - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode_u; - struct posix_acl *acl; - umode_t mode; - int ret; - - mutex_lock(&inode->ei_update_lock); - struct btree_trans *trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - acl = _acl; - - ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?: - bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_intent); - if (ret) - goto btree_err; - - mode = inode_u.bi_mode; - - if (type == ACL_TYPE_ACCESS) { - ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); - if (ret) - goto btree_err; - } - - ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type); - if (ret) - goto btree_err; - - inode_u.bi_ctime = bch2_current_time(c); - inode_u.bi_mode = mode; - - ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, 0); -btree_err: - bch2_trans_iter_exit(trans, &inode_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (unlikely(ret)) - goto err; - - bch2_inode_update_after_write(trans, inode, &inode_u, - ATTR_CTIME|ATTR_MODE); - - set_cached_acl(&inode->v, type, acl); -err: - bch2_trans_put(trans); - mutex_unlock(&inode->ei_update_lock); - - return ret; -} - -int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode, - umode_t mode, - struct posix_acl **new_acl) -{ - struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); - struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); - struct btree_iter iter; - struct posix_acl *acl = NULL; - - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inum, &search, BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - return bch2_err_matches(ret, ENOENT) ? 
0 : ret; - - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - - acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - ret = PTR_ERR_OR_ZERO(acl); - if (ret) - goto err; - - ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode)); - if (ret) - goto err; - - struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - new->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, &new->k_i, 0); - *new_acl = acl; - acl = NULL; -err: - bch2_trans_iter_exit(trans, &iter); - if (!IS_ERR_OR_NULL(acl)) - kfree(acl); - return ret; -} - -#endif /* CONFIG_BCACHEFS_POSIX_ACL */ diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h deleted file mode 100644 index fe730a6bf0c18c..00000000000000 --- a/fs/bcachefs/acl.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ACL_H -#define _BCACHEFS_ACL_H - -struct bch_inode_unpacked; -struct bch_hash_info; -struct bch_inode_info; -struct posix_acl; - -#define BCH_ACL_VERSION 0x0001 - -typedef struct { - __le16 e_tag; - __le16 e_perm; - __le32 e_id; -} bch_acl_entry; - -typedef struct { - __le16 e_tag; - __le16 e_perm; -} bch_acl_entry_short; - -typedef struct { - __le32 a_version; -} bch_acl_header; - -void bch2_acl_to_text(struct printbuf *, const void *, size_t); - -#ifdef CONFIG_BCACHEFS_POSIX_ACL - -struct posix_acl *bch2_get_acl(struct inode *, int, bool); - -int bch2_set_acl_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - struct posix_acl *, int); -int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - umode_t, struct posix_acl **); - -#else - -static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode_u, - struct posix_acl *acl, int type) -{ - return 0; -} - -static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode, - umode_t mode, - struct posix_acl **new_acl) -{ - return 0; -} - -#endif /* CONFIG_BCACHEFS_POSIX_ACL */ - -#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c deleted file mode 100644 index 66de463186209c..00000000000000 --- a/fs/bcachefs/alloc_background.c +++ /dev/null @@ -1,2680 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "buckets_waiting_for_journal.h" -#include "clock.h" -#include "debug.h" -#include "disk_accounting.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "lru.h" -#include "recovery.h" -#include "varint.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); - -/* Persistent alloc info: */ - -static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { -#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, - BCH_ALLOC_FIELDS_V1() -#undef x -}; - -struct bkey_alloc_unpacked { - u64 journal_seq; - u8 gen; - u8 oldest_gen; - u8 data_type; - bool need_discard:1; - bool 
need_inc_gen:1; -#define x(_name, _bits) u##_bits _name; - BCH_ALLOC_FIELDS_V2() -#undef x -}; - -static inline u64 alloc_field_v1_get(const struct bch_alloc *a, - const void **p, unsigned field) -{ - unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; - u64 v; - - if (!(a->fields & (1 << field))) - return 0; - - switch (bytes) { - case 1: - v = *((const u8 *) *p); - break; - case 2: - v = le16_to_cpup(*p); - break; - case 4: - v = le32_to_cpup(*p); - break; - case 8: - v = le64_to_cpup(*p); - break; - default: - BUG(); - } - - *p += bytes; - return v; -} - -static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, - struct bkey_s_c k) -{ - const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; - const void *d = in->data; - unsigned idx = 0; - - out->gen = in->gen; - -#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); - BCH_ALLOC_FIELDS_V1() -#undef x -} - -static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, - struct bkey_s_c k) -{ - struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); - const u8 *in = a.v->data; - const u8 *end = bkey_val_end(a); - unsigned fieldnr = 0; - int ret; - u64 v; - - out->gen = a.v->gen; - out->oldest_gen = a.v->oldest_gen; - out->data_type = a.v->data_type; - -#define x(_name, _bits) \ - if (fieldnr < a.v->nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v = 0; \ - } \ - out->_name = v; \ - if (v != out->_name) \ - return -1; \ - fieldnr++; - - BCH_ALLOC_FIELDS_V2() -#undef x - return 0; -} - -static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, - struct bkey_s_c k) -{ - struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); - const u8 *in = a.v->data; - const u8 *end = bkey_val_end(a); - unsigned fieldnr = 0; - int ret; - u64 v; - - out->gen = a.v->gen; - out->oldest_gen = a.v->oldest_gen; - out->data_type = a.v->data_type; - out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); - out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); - out->journal_seq = le64_to_cpu(a.v->journal_seq); - -#define x(_name, _bits) \ - if (fieldnr < a.v->nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v = 0; \ - } \ - out->_name = v; \ - if (v != out->_name) \ - return -1; \ - fieldnr++; - - BCH_ALLOC_FIELDS_V2() -#undef x - return 0; -} - -static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) -{ - struct bkey_alloc_unpacked ret = { .gen = 0 }; - - switch (k.k->type) { - case KEY_TYPE_alloc: - bch2_alloc_unpack_v1(&ret, k); - break; - case KEY_TYPE_alloc_v2: - bch2_alloc_unpack_v2(&ret, k); - break; - case KEY_TYPE_alloc_v3: - bch2_alloc_unpack_v3(&ret, k); - break; - } - - return ret; -} - -static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) -{ - unsigned i, bytes = offsetof(struct bch_alloc, data); - - for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) - if (a->fields & (1 << i)) - bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; - - return DIV_ROUND_UP(bytes, sizeof(u64)); -} - -int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - int ret = 0; - - /* allow for unknown fields */ - bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), - c, alloc_v1_val_size_bad, - "incorrect value size (%zu < %u)", - bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); -fsck_err: - return ret; -} - -int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, - 
struct bkey_validate_context from) -{ - struct bkey_alloc_unpacked u; - int ret = 0; - - bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), - c, alloc_v2_unpack_error, - "unpack error"); -fsck_err: - return ret; -} - -int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_alloc_unpacked u; - int ret = 0; - - bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), - c, alloc_v3_unpack_error, - "unpack error"); -fsck_err: - return ret; -} - -int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bch_alloc_v4 a; - int ret = 0; - - bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k)); - - bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k), - c, alloc_v4_val_size_bad, - "bad val size (%u > %zu)", - alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k)); - - bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) && - BCH_ALLOC_V4_NR_BACKPOINTERS(&a), - c, alloc_v4_backpointers_start_bad, - "invalid backpointers_start"); - - bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type, - c, alloc_key_data_type_bad, - "invalid data type (got %u should be %u)", - a.data_type, alloc_data_type(a, a.data_type)); - - for (unsigned i = 0; i < 2; i++) - bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX, - c, alloc_key_io_time_bad, - "invalid io_time[%s]: %llu, max %llu", - i == READ ? "read" : "write", - a.io_time[i], LRU_TIME_MAX); - - unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) > - offsetof(struct bch_alloc_v4, stripe_sectors) - ? a.stripe_sectors - : 0; - - switch (a.data_type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - bkey_fsck_err_on(stripe_sectors || - a.dirty_sectors || - a.cached_sectors || - a.stripe, - c, alloc_key_empty_but_have_data, - "empty data type free but have data %u.%u.%u %u", - stripe_sectors, - a.dirty_sectors, - a.cached_sectors, - a.stripe); - break; - case BCH_DATA_sb: - case BCH_DATA_journal: - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - bkey_fsck_err_on(!a.dirty_sectors && - !stripe_sectors, - c, alloc_key_dirty_sectors_0, - "data_type %s but dirty_sectors==0", - bch2_data_type_str(a.data_type)); - break; - case BCH_DATA_cached: - bkey_fsck_err_on(!a.cached_sectors || - a.dirty_sectors || - stripe_sectors || - a.stripe, - c, alloc_key_cached_inconsistency, - "data type inconsistency"); - - bkey_fsck_err_on(!a.io_time[READ] && - !(c->recovery.passes_to_run & - BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)), - c, alloc_key_cached_but_read_time_zero, - "cached bucket with read_time == 0"); - break; - case BCH_DATA_stripe: - break; - } -fsck_err: - return ret; -} - -void bch2_alloc_v4_swab(struct bkey_s k) -{ - struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; - - a->journal_seq_nonempty = swab64(a->journal_seq_nonempty); - a->journal_seq_empty = swab64(a->journal_seq_empty); - a->flags = swab32(a->flags); - a->dirty_sectors = swab32(a->dirty_sectors); - a->cached_sectors = swab32(a->cached_sectors); - a->io_time[0] = swab64(a->io_time[0]); - a->io_time[1] = swab64(a->io_time[1]); - a->stripe = swab32(a->stripe); - a->nr_external_backpointers = swab32(a->nr_external_backpointers); - a->stripe_sectors = swab32(a->stripe_sectors); -} - -static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, - unsigned dev, const struct bch_alloc_v4 *a) -{ - struct bch_dev *ca = c ? 
bch2_dev_tryget_noerror(c, dev) : NULL; - - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); - bch2_prt_data_type(out, a->data_type); - prt_newline(out); - prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); - prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); - prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); - prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); - prt_printf(out, "cached_sectors %u\n", a->cached_sectors); - prt_printf(out, "stripe %u\n", a->stripe); - prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); - prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); - prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); - - if (ca) - prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); - prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a)); - printbuf_indent_sub(out, 2); - - bch2_dev_put(ca); -} - -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - - __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a); -} - -void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v); -} - -void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) -{ - if (k.k->type == KEY_TYPE_alloc_v4) { - void *src, *dst; - - *out = *bkey_s_c_to_alloc_v4(k).v; - - src = alloc_v4_backpointers(out); - SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); - dst = alloc_v4_backpointers(out); - - if (src < dst) - memset(src, 0, dst - src); - - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); - } else { - struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - - *out = (struct bch_alloc_v4) { - .journal_seq_nonempty = u.journal_seq, - .flags = u.need_discard, - .gen = u.gen, - .oldest_gen = u.oldest_gen, - .data_type = u.data_type, - .stripe_redundancy = u.stripe_redundancy, - .dirty_sectors = u.dirty_sectors, - .cached_sectors = u.cached_sectors, - .io_time[READ] = u.read_time, - .io_time[WRITE] = u.write_time, - .stripe = u.stripe, - }; - - SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); - } -} - -static noinline struct bkey_i_alloc_v4 * -__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bkey_i_alloc_v4 *ret; - - ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); - if (IS_ERR(ret)) - return ret; - - if (k.k->type == KEY_TYPE_alloc_v4) { - void *src, *dst; - - bkey_reassemble(&ret->k_i, k); - - src = alloc_v4_backpointers(&ret->v); - SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); - dst = alloc_v4_backpointers(&ret->v); - - if (src < dst) - memset(src, 0, dst - src); - - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); - set_alloc_v4_u64s(ret); - } else { - bkey_alloc_v4_init(&ret->k_i); - ret->k.p = k.k->p; - bch2_alloc_to_v4(k, &ret->v); - } - return ret; -} - -static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bkey_s_c_alloc_v4 a; - - if (likely(k.k->type == KEY_TYPE_alloc_v4) && - ((a = bkey_s_c_to_alloc_v4(k), true) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) - return 
bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); - - return __bch2_alloc_to_v4_mut(trans, k); -} - -struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) -{ - return bch2_alloc_to_v4_mut_inlined(trans, k); -} - -struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) -{ - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, - BTREE_ITER_with_updates| - BTREE_ITER_cached| - BTREE_ITER_intent); - int ret = bkey_err(k); - if (unlikely(ret)) - return ERR_PTR(ret); - - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (unlikely(ret)) - goto err; - return a; -err: - bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret); -} - -__flatten -struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos, - BTREE_ITER_with_updates| - BTREE_ITER_cached| - BTREE_ITER_intent); - int ret = bkey_err(k); - if (unlikely(ret)) - return ERR_PTR(ret); - - if ((void *) k.v >= trans->mem && - (void *) k.v < trans->mem + trans->mem_top) { - bch2_trans_iter_exit(trans, &iter); - return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v); - } - - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); - if (IS_ERR(a)) { - bch2_trans_iter_exit(trans, &iter); - return a; - } - - ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); - bch2_trans_iter_exit(trans, &iter); - return unlikely(ret) ? ERR_PTR(ret) : a; -} - -static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) -{ - *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; - - pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; - return pos; -} - -static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) -{ - pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; - pos.offset += offset; - return pos; -} - -static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) -{ - return k.k->type == KEY_TYPE_bucket_gens - ? 
bkey_s_c_to_bucket_gens(k).v->gens[offset] - : 0; -} - -int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), - c, bucket_gens_val_size_bad, - "bad val size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); -fsck_err: - return ret; -} - -void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); - unsigned i; - - for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { - if (i) - prt_char(out, ' '); - prt_printf(out, "%u", g.v->gens[i]); - } -} - -int bch2_bucket_gens_init(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bkey_i_bucket_gens g; - bool have_bucket_gens_key = false; - int ret; - - ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch, k, ({ - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!bch2_dev_bucket_exists(c, k.k->p)) - continue; - - struct bch_alloc_v4 a; - u8 gen = bch2_alloc_to_v4(k, &a)->gen; - unsigned offset; - struct bpos pos = alloc_gens_pos(iter.pos, &offset); - int ret2 = 0; - - if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) { - ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret2) - goto iter_err; - have_bucket_gens_key = false; - } - - if (!have_bucket_gens_key) { - bkey_bucket_gens_init(&g.k_i); - g.k.p = pos; - have_bucket_gens_key = true; - } - - g.v.gens[offset] = gen; -iter_err: - ret2; - })); - - if (have_bucket_gens_key && !ret) - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); - - bch2_trans_put(trans); - - bch_err_fn(c, ret); - return ret; -} - -int bch2_alloc_read(struct bch_fs *c) -{ - down_read(&c->state_lock); - - struct btree_trans *trans = bch2_trans_get(c); - struct bch_dev *ca = NULL; - int ret; - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { - ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_prefetch, k, ({ - u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; - u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; - - if (k.k->type != KEY_TYPE_bucket_gens) - continue; - - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; - - for (u64 b = max_t(u64, ca->mi.first_bucket, start); - b < min_t(u64, ca->mi.nbuckets, end); - b++) - *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; - 0; - })); - } else { - ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch, k, ({ - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - if (k.k->p.offset < ca->mi.first_bucket) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket)); - continue; - } - - if (k.k->p.offset >= 
ca->mi.nbuckets) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - struct bch_alloc_v4 a; - *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; - 0; - })); - } - - bch2_dev_put(ca); - bch2_trans_put(trans); - - up_read(&c->state_lock); - bch_err_fn(c, ret); - return ret; -} - -/* Free space/discard btree: */ - -static int __need_discard_or_freespace_err(struct btree_trans *trans, - struct bkey_s_c alloc_k, - bool set, bool discard, bool repair) -{ - struct bch_fs *c = trans->c; - enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0); - enum bch_sb_error_id err_id = discard - ? BCH_FSCK_ERR_need_discard_key_wrong - : BCH_FSCK_ERR_freespace_key_wrong; - enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace; - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, alloc_k); - - int ret = __bch2_fsck_err(NULL, trans, flags, err_id, - "bucket incorrectly %sset in %s btree\n%s", - set ? "" : "un", - bch2_btree_id_str(btree), - buf.buf); - if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) || - bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed)) - ret = 0; - - printbuf_exit(&buf); - return ret; -} - -#define need_discard_or_freespace_err(...) \ - fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__)) - -#define need_discard_or_freespace_err_on(cond, ...) \ - (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false) - -static int bch2_bucket_do_index(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c alloc_k, - const struct bch_alloc_v4 *a, - bool set) -{ - enum btree_id btree; - struct bpos pos; - - if (a->data_type != BCH_DATA_free && - a->data_type != BCH_DATA_need_discard) - return 0; - - switch (a->data_type) { - case BCH_DATA_free: - btree = BTREE_ID_freespace; - pos = alloc_freespace_pos(alloc_k.k->p, *a); - break; - case BCH_DATA_need_discard: - btree = BTREE_ID_need_discard; - pos = alloc_k.k->p; - break; - default: - return 0; - } - - struct btree_iter iter; - struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); - int ret = bkey_err(old); - if (ret) - return ret; - - need_discard_or_freespace_err_on(ca->mi.freespace_initialized && - !old.k->type != set, - trans, alloc_k, set, - btree == BTREE_ID_need_discard, false); - - ret = bch2_btree_bit_mod_iter(trans, &iter, set); -fsck_err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline int bch2_bucket_gen_update(struct btree_trans *trans, - struct bpos bucket, u8 gen) -{ - struct btree_iter iter; - unsigned offset; - struct bpos pos = alloc_gens_pos(bucket, &offset); - struct bkey_i_bucket_gens *g; - struct bkey_s_c k; - int ret; - - g = bch2_trans_kmalloc(trans, sizeof(*g)); - ret = PTR_ERR_OR_ZERO(g); - if (ret) - return ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, - BTREE_ITER_intent| - BTREE_ITER_with_updates); - ret = bkey_err(k); - if (ret) - return ret; - - if (k.k->type != KEY_TYPE_bucket_gens) { - bkey_bucket_gens_init(&g->k_i); - g->k.p = iter.pos; - } else { - bkey_reassemble(&g->k_i, k); - } - - g->v.gens[offset] = gen; - - ret = bch2_trans_update(trans, &iter, &g->k_i, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca, - enum bch_data_type data_type, - s64 delta_buckets, - s64 delta_sectors, - s64 delta_fragmented, unsigned flags) -{ - s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - - return 
bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
-				      d, dev_data_type,
-				      .dev		= ca->dev_idx,
-				      .data_type	= data_type);
-}
-
-int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
-				   const struct bch_alloc_v4 *old,
-				   const struct bch_alloc_v4 *new,
-				   unsigned flags)
-{
-	s64 old_sectors = bch2_bucket_sectors(*old);
-	s64 new_sectors = bch2_bucket_sectors(*new);
-	if (old->data_type != new->data_type) {
-		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
-				1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
-			bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
-				-1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
-		if (ret)
-			return ret;
-	} else if (old_sectors != new_sectors) {
-		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
-					0,
-					new_sectors - old_sectors,
-					bch2_bucket_sectors_fragmented(ca, *new) -
-					bch2_bucket_sectors_fragmented(ca, *old), flags);
-		if (ret)
-			return ret;
-	}
-
-	s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
-	s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
-	if (old_unstriped != new_unstriped) {
-		int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
-					 !!new_unstriped - !!old_unstriped,
-					 new_unstriped - old_unstriped,
-					 0,
-					 flags);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-int bch2_trigger_alloc(struct btree_trans *trans,
-		       enum btree_id btree, unsigned level,
-		       struct bkey_s_c old, struct bkey_s new,
-		       enum btree_iter_update_trigger_flags flags)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
-	if (!ca)
-		return bch_err_throw(c, trigger_alloc);
-
-	struct bch_alloc_v4 old_a_convert;
-	const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
-
-	struct bch_alloc_v4 *new_a;
-	if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
-		new_a = bkey_s_to_alloc_v4(new).v;
-	} else {
-		BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
-
-		struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
-		ret = PTR_ERR_OR_ZERO(new_ka);
-		if (unlikely(ret))
-			goto err;
-		new_a = &new_ka->v;
-	}
-
-	if (flags & BTREE_TRIGGER_transactional) {
-		alloc_data_type_set(new_a, new_a->data_type);
-
-		int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
-				     (int) data_type_is_empty(old_a->data_type);
-
-		if (is_empty_delta < 0) {
-			new_a->io_time[READ] = bch2_current_io_time(c, READ);
-			new_a->io_time[WRITE] = bch2_current_io_time(c, WRITE);
-			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
-			SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
-		}
-
-		if (data_type_is_empty(new_a->data_type) &&
-		    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
-		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
-			if (new_a->oldest_gen == new_a->gen &&
-			    !bch2_bucket_sectors_total(*new_a))
-				new_a->oldest_gen++;
-			new_a->gen++;
-			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
-			alloc_data_type_set(new_a, new_a->data_type);
-		}
-
-		if (old_a->data_type != new_a->data_type ||
-		    (new_a->data_type == BCH_DATA_free &&
-		     alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
-			ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
-			      bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
-			if (ret)
-				goto err;
-		}
-
-		if (new_a->data_type == BCH_DATA_cached &&
-		    !new_a->io_time[READ])
-			new_a->io_time[READ] = bch2_current_io_time(c, READ);
-
-		ret = bch2_lru_change(trans,
-				      new.k->p.inode,
-				      bucket_to_u64(new.k->p),
-				      alloc_lru_idx_read(*old_a),
-				      alloc_lru_idx_read(*new_a));
-		if (ret)
-			goto err;
-
-		ret = bch2_lru_change(trans,
-				      BCH_LRU_BUCKET_FRAGMENTATION,
-				      bucket_to_u64(new.k->p),
-				      alloc_lru_idx_fragmentation(*old_a, ca),
-				      alloc_lru_idx_fragmentation(*new_a, ca));
-		if (ret)
-			goto err;
-
-		if (old_a->gen != new_a->gen) {
-			ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
-			if (ret)
-				goto err;
-		}
-
-		ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
-		if (ret)
-			goto err;
-	}
-
-	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
-		u64 transaction_seq = trans->journal_res.seq;
-		BUG_ON(!transaction_seq);
-
-		if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
-				    trans, alloc_key_journal_seq_in_future,
-				    "bucket journal seq in future (currently at %llu)\n%s",
-				    journal_cur_seq(&c->journal),
-				    (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
-			new_a->journal_seq_nonempty = transaction_seq;
-
-		int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
-				     (int) data_type_is_empty(old_a->data_type);
-
-		/*
-		 * Record the journal sequence number of the empty -> nonempty
-		 * transition: note that there may be multiple empty -> nonempty
-		 * transitions, since data in a bucket may be overwritten while
-		 * we're still writing to it - so be careful to only record the
-		 * first:
-		 */
-		if (is_empty_delta < 0 &&
-		    new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
-			new_a->journal_seq_nonempty = transaction_seq;
-			new_a->journal_seq_empty = 0;
-		}
-
-		/*
-		 * Bucket becomes empty: mark it as waiting for a journal flush,
-		 * unless updates since the empty -> nonempty transition were
-		 * never flushed - we may need to ask the journal not to flush
-		 * intermediate sequence numbers:
-		 */
-		if (is_empty_delta > 0) {
-			if (new_a->journal_seq_nonempty == transaction_seq ||
-			    bch2_journal_noflush_seq(&c->journal,
-						     new_a->journal_seq_nonempty,
-						     transaction_seq)) {
-				new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
-			} else {
-				new_a->journal_seq_empty = transaction_seq;
-
-				ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-									   c->journal.flushed_seq_ondisk,
-									   new.k->p.inode, new.k->p.offset,
-									   transaction_seq);
-				if (bch2_fs_fatal_err_on(ret, c,
-							 "setting bucket_needs_journal_commit: %s",
-							 bch2_err_str(ret)))
-					goto err;
-			}
-		}
-
-		if (new_a->gen != old_a->gen) {
-			guard(rcu)();
-			u8 *gen = bucket_gen(ca, new.k->p.offset);
-			if (unlikely(!gen))
-				goto invalid_bucket;
-			*gen = new_a->gen;
-		}
-
-#define eval_state(_a, expr)	({ const struct bch_alloc_v4 *a = _a; expr; })
-#define statechange(expr)	!eval_state(old_a, expr) && eval_state(new_a, expr)
-#define bucket_flushed(a)	(a->journal_seq_empty <= c->journal.flushed_seq_ondisk)
-
-		if (statechange(a->data_type == BCH_DATA_free) &&
-		    bucket_flushed(new_a))
-			closure_wake_up(&c->freelist_wait);
-
-		if (statechange(a->data_type == BCH_DATA_need_discard) &&
-		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
-		    bucket_flushed(new_a))
-			bch2_discard_one_bucket_fast(ca, new.k->p.offset);
-
-		if (statechange(a->data_type == BCH_DATA_cached) &&
-		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
-		    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
-			bch2_dev_do_invalidates(ca);
-
-		if (statechange(a->data_type == BCH_DATA_need_gc_gens))
-			bch2_gc_gens_async(c);
-	}
-
-	if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
-		guard(rcu)();
-		struct bucket *g = gc_bucket(ca, new.k->p.offset);
-		if (unlikely(!g))
-			goto invalid_bucket;
-		g->gen_valid = 1;
-		g->gen = new_a->gen;
-	}
-err:
-fsck_err:
-	printbuf_exit(&buf);
-	bch2_dev_put(ca);
-	return ret;
-invalid_bucket:
-	bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
-			     (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
-	ret = bch_err_throw(c, trigger_alloc);
-	goto err;
-}
-
-/*
- * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
- * extents style btrees, but works on non-extents btrees:
- */
-static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter,
-					    struct bpos end, struct bkey *hole)
-{
-	struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
-
-	if (bkey_err(k))
-		return k;
-
-	if (k.k->type) {
-		return k;
-	} else {
-		struct btree_iter iter2;
-		struct bpos next;
-
-		bch2_trans_copy_iter(trans, &iter2, iter);
-
-		struct btree_path *path = btree_iter_path(trans, iter);
-		if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
-			end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
-
-		end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
-
-		/*
-		 * btree node min/max is a closed interval, while the end
-		 * position we pass here is a half-open interval:
-		 */
-		k = bch2_btree_iter_peek_max(trans, &iter2, end);
-		next = iter2.pos;
-		bch2_trans_iter_exit(trans, &iter2);
-
-		BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
-
-		if (bkey_err(k))
-			return k;
-
-		bkey_init(hole);
-		hole->p = iter->pos;
-
-		bch2_key_resize(hole, next.offset - iter->pos.offset);
-		return (struct bkey_s_c) { hole, NULL };
-	}
-}
-
-static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
-{
-	if (*ca) {
-		if (bucket->offset < (*ca)->mi.first_bucket)
-			bucket->offset = (*ca)->mi.first_bucket;
-
-		if (bucket->offset < (*ca)->mi.nbuckets)
-			return true;
-
-		bch2_dev_put(*ca);
-		*ca = NULL;
-		bucket->inode++;
-		bucket->offset = 0;
-	}
-
-	guard(rcu)();
-	*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
-	if (*ca) {
-		*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
-		bch2_dev_get(*ca);
-	}
-
-	return *ca != NULL;
-}
-
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans,
-							struct btree_iter *iter,
-							struct bch_dev **ca, struct bkey *hole)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_s_c k;
-again:
-	k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole);
-	if (bkey_err(k))
-		return k;
-
-	*ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
-
-	if (!k.k->type) {
-		struct bpos hole_start = bkey_start_pos(k.k);
-
-		if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
-			if (!next_bucket(c, ca, &hole_start))
-				return bkey_s_c_null;
-
-			bch2_btree_iter_set_pos(trans, iter, hole_start);
-			goto again;
-		}
-
-		if (k.k->p.offset > (*ca)->mi.nbuckets)
-			bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
-	}
-
-	return k;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_key(struct btree_trans *trans,
-			 struct bkey_s_c alloc_k,
-			 struct btree_iter *alloc_iter,
-			 struct btree_iter *discard_iter,
-			 struct btree_iter *freespace_iter,
-			 struct btree_iter *bucket_gens_iter)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_alloc_v4 a_convert;
-	const struct bch_alloc_v4 *a;
-	unsigned gens_offset;
-	struct bkey_s_c k;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
-	if (fsck_err_on(!ca,
-			trans, alloc_key_to_missing_dev_bucket,
-			"alloc key for invalid device:bucket %llu:%llu",
-			alloc_k.k->p.inode,
alloc_k.k->p.offset)) - ret = bch2_btree_delete_at(trans, alloc_iter, 0); - if (!ca) - return ret; - - if (!ca->mi.freespace_initialized) - goto out; - - a = bch2_alloc_to_v4(alloc_k, &a_convert); - - bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p); - k = bch2_btree_iter_peek_slot(trans, discard_iter); - ret = bkey_err(k); - if (ret) - goto err; - - bool is_discarded = a->data_type == BCH_DATA_need_discard; - if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, - trans, alloc_k, !is_discarded, true, true)) { - ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); - if (ret) - goto err; - } - - bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); - k = bch2_btree_iter_peek_slot(trans, freespace_iter); - ret = bkey_err(k); - if (ret) - goto err; - - bool is_free = a->data_type == BCH_DATA_free; - if (need_discard_or_freespace_err_on(!!k.k->type != is_free, - trans, alloc_k, !is_free, false, true)) { - ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); - if (ret) - goto err; - } - - bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); - k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), - trans, bucket_gens_key_wrong, - "incorrect gen in bucket_gens btree (got %u should be %u)\n%s", - alloc_gen(k, gens_offset), a->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i_bucket_gens *g = - bch2_trans_kmalloc(trans, sizeof(*g)); - - ret = PTR_ERR_OR_ZERO(g); - if (ret) - goto err; - - if (k.k->type == KEY_TYPE_bucket_gens) { - bkey_reassemble(&g->k_i, k); - } else { - bkey_bucket_gens_init(&g->k_i); - g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); - } - - g->v.gens[gens_offset] = a->gen; - - ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); - if (ret) - goto err; - } -out: -err: -fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -static noinline_for_stack -int bch2_check_alloc_hole_freespace(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos start, - struct bpos *end, - struct btree_iter *freespace_iter) -{ - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - int ret; - - if (!ca->mi.freespace_initialized) - return 0; - - bch2_btree_iter_set_pos(trans, freespace_iter, start); - - k = bch2_btree_iter_peek_slot(trans, freespace_iter); - ret = bkey_err(k); - if (ret) - goto err; - - *end = bkey_min(k.k->p, *end); - - if (fsck_err_on(k.k->type != KEY_TYPE_set, - trans, freespace_hole_missing, - "hole in alloc btree missing in freespace btree\n" - "device %llu buckets %llu-%llu", - freespace_iter->pos.inode, - freespace_iter->pos.offset, - end->offset)) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = KEY_TYPE_set; - update->k.p = freespace_iter->pos; - bch2_key_resize(&update->k, - min_t(u64, U32_MAX, end->offset - - freespace_iter->pos.offset)); - - ret = bch2_trans_update(trans, freespace_iter, update, 0); - if (ret) - goto err; - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static noinline_for_stack -int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, - struct bpos start, - struct bpos *end, - struct btree_iter *bucket_gens_iter) -{ - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - unsigned i, gens_offset, 
gens_end_offset; - int ret; - - bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); - - k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (bkey_cmp(alloc_gens_pos(start, &gens_offset), - alloc_gens_pos(*end, &gens_end_offset))) - gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; - - if (k.k->type == KEY_TYPE_bucket_gens) { - struct bkey_i_bucket_gens g; - bool need_update = false; - - bkey_reassemble(&g.k_i, k); - - for (i = gens_offset; i < gens_end_offset; i++) { - if (fsck_err_on(g.v.gens[i], trans, - bucket_gens_hole_wrong, - "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", - bucket_gens_pos_to_alloc(k.k->p, i).inode, - bucket_gens_pos_to_alloc(k.k->p, i).offset, - g.v.gens[i])) { - g.v.gens[i] = 0; - need_update = true; - } - } - - if (need_update) { - struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - memcpy(u, &g, sizeof(g)); - - ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); - if (ret) - goto err; - } - } - - *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -struct check_discard_freespace_key_async { - struct work_struct work; - struct bch_fs *c; - struct bbpos pos; -}; - -static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); - int ret = bkey_err(k); - if (ret) - return ret; - - u8 gen; - ret = k.k->type != KEY_TYPE_set - ? bch2_check_discard_freespace_key(trans, &iter, &gen, false) - : 0; - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static void check_discard_freespace_key_work(struct work_struct *work) -{ - struct check_discard_freespace_key_async *w = - container_of(work, struct check_discard_freespace_key_async, work); - - bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); - enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key); - kfree(w); -} - -int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, - bool async_repair) -{ - struct bch_fs *c = trans->c; - enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard - ? BCH_DATA_need_discard - : BCH_DATA_free; - struct printbuf buf = PRINTBUF; - - unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)| - FSCK_CAN_FIX|FSCK_CAN_IGNORE; - - struct bpos bucket = iter->pos; - bucket.offset &= ~(~0ULL << 56); - u64 genbits = iter->pos.offset & (~0ULL << 56); - - struct btree_iter alloc_iter; - struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, - async_repair ? 
BTREE_ITER_cached : 0);
-	int ret = bkey_err(alloc_k);
-	if (ret)
-		return ret;
-
-	if (!bch2_dev_bucket_exists(c, bucket)) {
-		if (__fsck_err(trans, fsck_flags,
-			       need_discard_freespace_key_to_invalid_dev_bucket,
-			       "entry in %s btree for nonexistent dev:bucket %llu:%llu",
-			       bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
-			goto delete;
-		ret = 1;
-		goto out;
-	}
-
-	struct bch_alloc_v4 a_convert;
-	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
-	if (a->data_type != state ||
-	    (state == BCH_DATA_free &&
-	     genbits != alloc_freespace_genbits(*a))) {
-		if (__fsck_err(trans, fsck_flags,
-			       need_discard_freespace_key_bad,
-			       "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
-			       (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
-			       bch2_btree_id_str(iter->btree_id),
-			       iter->pos.inode,
-			       iter->pos.offset,
-			       a->data_type == state,
-			       genbits >> 56, alloc_freespace_genbits(*a) >> 56))
-			goto delete;
-		ret = 1;
-		goto out;
-	}
-
-	*gen = a->gen;
-out:
-fsck_err:
-	bch2_set_btree_iter_dontneed(trans, &alloc_iter);
-	bch2_trans_iter_exit(trans, &alloc_iter);
-	printbuf_exit(&buf);
-	return ret;
-delete:
-	if (!async_repair) {
-		ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
-			bch2_trans_commit(trans, NULL, NULL,
-					  BCH_TRANS_COMMIT_no_enospc) ?:
-			bch_err_throw(c, transaction_restart_commit);
-		goto out;
-	} else {
-		/*
-		 * We can't repair here when called from the allocator path: the
-		 * commit will recurse back into the allocator
-		 */
-		struct check_discard_freespace_key_async *w =
-			kzalloc(sizeof(*w), GFP_KERNEL);
-		if (!w)
-			goto out;
-
-		if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) {
-			kfree(w);
-			goto out;
-		}
-
-		INIT_WORK(&w->work, check_discard_freespace_key_work);
-		w->c = c;
-		w->pos = BBPOS(iter->btree_id, iter->pos);
-		queue_work(c->write_ref_wq, &w->work);
-
-		ret = 1; /* don't allocate from this bucket */
-		goto out;
-	}
-}
-
-static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
-{
-	u8 gen;
-	int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
-	return ret < 0 ? ret : 0;
-}
-
-/*
- * We've already checked that generation numbers in the bucket_gens btree are
- * valid for buckets that exist; this just checks for keys for nonexistent
- * buckets.
- */ -static noinline_for_stack -int bch2_check_bucket_gens_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_i_bucket_gens g; - u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; - u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; - u64 b; - bool need_update = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(k.k->type != KEY_TYPE_bucket_gens); - bkey_reassemble(&g.k_i, k); - - struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); - if (!ca) { - if (fsck_err(trans, bucket_gens_to_invalid_dev, - "bucket_gens key for invalid device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; - } - - if (fsck_err_on(end <= ca->mi.first_bucket || - start >= ca->mi.nbuckets, - trans, bucket_gens_to_invalid_buckets, - "bucket_gens key for invalid buckets:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; - } - - for (b = start; b < ca->mi.first_bucket; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], - trans, bucket_gens_nonzero_for_invalid_buckets, - "bucket_gens key has nonzero gen for invalid bucket")) { - g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; - need_update = true; - } - - for (b = ca->mi.nbuckets; b < end; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], - trans, bucket_gens_nonzero_for_invalid_buckets, - "bucket_gens key has nonzero gen for invalid bucket")) { - g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; - need_update = true; - } - - if (need_update) { - struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto out; - - memcpy(u, &g, sizeof(g)); - ret = bch2_trans_update(trans, iter, u, 0); - } -out: -fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_alloc_info(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; - struct bch_dev *ca = NULL; - struct bkey hole; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_prefetch); - - while (1) { - struct bpos next; - - bch2_trans_begin(trans); - - k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole); - ret = bkey_err(k); - if (ret) - goto bkey_err; - - if (!k.k) - break; - - if (k.k->type) { - next = bpos_nosnap_successor(k.k->p); - - ret = bch2_check_alloc_key(trans, - k, &iter, - &discard_iter, - &freespace_iter, - &bucket_gens_iter); - if (ret) - goto bkey_err; - } else { - next = k.k->p; - - ret = bch2_check_alloc_hole_freespace(trans, ca, - bkey_start_pos(k.k), - &next, - &freespace_iter) ?: - bch2_check_alloc_hole_bucket_gens(trans, - bkey_start_pos(k.k), - &next, - &bucket_gens_iter); - if (ret) - goto bkey_err; - } - - ret = bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto bkey_err; - - bch2_btree_iter_set_pos(trans, &iter, next); -bkey_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - } - bch2_trans_iter_exit(trans, 
&bucket_gens_iter); - bch2_trans_iter_exit(trans, &freespace_iter); - bch2_trans_iter_exit(trans, &discard_iter); - bch2_trans_iter_exit(trans, &iter); - bch2_dev_put(ca); - ca = NULL; - - if (ret < 0) - goto err; - - ret = for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_prefetch, k, - bch2_check_discard_freespace_key_fsck(trans, &iter)); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_prefetch); - while (1) { - bch2_trans_begin(trans); - k = bch2_btree_iter_peek(trans, &iter); - if (!k.k) - break; - - ret = bkey_err(k) ?: - bch2_check_discard_freespace_key_fsck(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - if (ret) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - - bch_err(c, "while checking %s", buf.buf); - printbuf_exit(&buf); - break; - } - - bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos)); - } - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; - - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_bucket_gens_key(trans, &iter, k)); -err: - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - struct btree_iter *alloc_iter, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - struct bkey_s_c alloc_k; - struct printbuf buf = PRINTBUF; - int ret; - - alloc_k = bch2_btree_iter_peek(trans, alloc_iter); - if (!alloc_k.k) - return 0; - - ret = bkey_err(alloc_k); - if (ret) - return ret; - - struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode); - if (!ca) - return 0; - - a = bch2_alloc_to_v4(alloc_k, &a_convert); - - u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - if (lru_idx) { - ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION, - bucket_to_u64(alloc_k.k->p), - lru_idx, alloc_k, last_flushed); - if (ret) - goto err; - } - - if (a->data_type != BCH_DATA_cached) - goto err; - - if (fsck_err_on(!a->io_time[READ], - trans, alloc_key_cached_but_read_time_zero, - "cached bucket with read_time 0\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i_alloc_v4 *a_mut = - bch2_alloc_to_v4_mut(trans, alloc_k); - ret = PTR_ERR_OR_ZERO(a_mut); - if (ret) - goto err; - - a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); - ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_norun); - if (ret) - goto err; - - a = &a_mut->v; - } - - ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ], - alloc_k, last_flushed); - if (ret) - goto err; -err: -fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_alloc_to_lru_refs(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?: - bch2_check_stripe_to_lru_refs(c); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; -} - -static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) 
-{ - struct bch_fs *c = ca->fs; - int ret; - - mutex_lock(&ca->discard_buckets_in_flight_lock); - struct discard_in_flight *i = - darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); - if (i) { - ret = bch_err_throw(c, EEXIST_discard_in_flight_add); - goto out; - } - - ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { - .in_progress = in_progress, - .bucket = bucket, - })); -out: - mutex_unlock(&ca->discard_buckets_in_flight_lock); - return ret; -} - -static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) -{ - mutex_lock(&ca->discard_buckets_in_flight_lock); - struct discard_in_flight *i = - darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); - BUG_ON(!i || !i->in_progress); - - darray_remove_item(&ca->discard_buckets_in_flight, i); - mutex_unlock(&ca->discard_buckets_in_flight_lock); -} - -struct discard_buckets_state { - u64 seen; - u64 open; - u64 need_journal_commit; - u64 discarded; -}; - -static int bch2_discard_one_bucket(struct btree_trans *trans, - struct bch_dev *ca, - struct btree_iter *need_discard_iter, - struct bpos *discard_pos_done, - struct discard_buckets_state *s, - bool fastpath) -{ - struct bch_fs *c = trans->c; - struct bpos pos = need_discard_iter->pos; - struct btree_iter iter = {}; - struct bkey_s_c k; - struct bkey_i_alloc_v4 *a; - struct printbuf buf = PRINTBUF; - bool discard_locked = false; - int ret = 0; - - if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { - s->open++; - goto out; - } - - u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, - pos.inode, pos.offset); - if (seq_ready > c->journal.flushed_seq_ondisk) { - if (seq_ready > c->journal.flushing_seq) - s->need_journal_commit++; - goto out; - } - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - need_discard_iter->pos, - BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - goto out; - - a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto out; - - if (a->v.data_type != BCH_DATA_need_discard) { - if (need_discard_or_freespace_err(trans, k, true, true, true)) { - ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); - if (ret) - goto out; - goto commit; - } - - goto out; - } - - if (!fastpath) { - if (discard_in_flight_add(ca, iter.pos.offset, true)) - goto out; - - discard_locked = true; - } - - if (!bkey_eq(*discard_pos_done, iter.pos)) { - s->discarded++; - *discard_pos_done = iter.pos; - - if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) { - /* - * This works without any other locks because this is the only - * thread that removes items from the need_discard tree - */ - bch2_trans_unlock_long(trans); - blkdev_issue_discard(ca->disk_sb.bdev, - k.k->p.offset * ca->mi.bucket_size, - ca->mi.bucket_size, - GFP_KERNEL); - ret = bch2_trans_relock_notrace(trans); - if (ret) - goto out; - } - } - - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto out; -commit: - ret = bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto out; - - if (!fastpath) - count_event(c, bucket_discard); - else - count_event(c, bucket_discard_fast); -out: -fsck_err: - if (discard_locked) - discard_in_flight_remove(ca, iter.pos.offset); - if (!ret) - s->seen++; - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -} - -static void bch2_do_discards_work(struct work_struct *work) -{ 
- struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); - struct bch_fs *c = ca->fs; - struct discard_buckets_state s = {}; - struct bpos discard_pos_done = POS_MAX; - int ret; - - /* - * We're doing the commit in bch2_discard_one_bucket instead of using - * for_each_btree_key_commit() so that we can increment counters after - * successful commit: - */ - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, - BTREE_ID_need_discard, - POS(ca->dev_idx, 0), - POS(ca->dev_idx, U64_MAX), 0, k, - bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); - - if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) - bch2_journal_flush_async(&c->journal, NULL); - - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, - bch2_err_str(ret)); - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); -} - -void bch2_dev_do_discards(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard)) - return; - - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) - goto put_write_ref; - - if (queue_work(c->write_ref_wq, &ca->discard_work)) - return; - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); -put_write_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); -} - -void bch2_do_discards(struct bch_fs *c) -{ - for_each_member_device(c, ca) - bch2_dev_do_discards(ca); -} - -static int bch2_do_discards_fast_one(struct btree_trans *trans, - struct bch_dev *ca, - u64 bucket, - struct bpos *discard_pos_done, - struct discard_buckets_state *s) -{ - struct btree_iter need_discard_iter; - struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, - BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); - int ret = bkey_err(discard_k); - if (ret) - return ret; - - if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, - trans, discarding_bucket_not_in_need_discard_btree, - "attempting to discard bucket %u:%llu not in need_discard btree", - ca->dev_idx, bucket)) - goto out; - - ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); -out: -fsck_err: - bch2_trans_iter_exit(trans, &need_discard_iter); - return ret; -} - -static void bch2_do_discards_fast_work(struct work_struct *work) -{ - struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); - struct bch_fs *c = ca->fs; - struct discard_buckets_state s = {}; - struct bpos discard_pos_done = POS_MAX; - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - - while (1) { - bool got_bucket = false; - u64 bucket; - - mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) { - if (i->in_progress) - continue; - - got_bucket = true; - bucket = i->bucket; - i->in_progress = true; - break; - } - mutex_unlock(&ca->discard_buckets_in_flight_lock); - - if (!got_bucket) - break; - - ret = lockrestart_do(trans, - bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s)); - bch_err_fn(c, ret); - - discard_in_flight_remove(ca, bucket); - - if (ret) - break; - } - - trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - - bch2_trans_put(trans); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); -} - -static void 
bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) -{ - struct bch_fs *c = ca->fs; - - if (discard_in_flight_add(ca, bucket, false)) - return; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast)) - return; - - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast)) - goto put_ref; - - if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) - return; - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); -put_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); -} - -static int invalidate_one_bp(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c_backpointer bp, - struct bkey_buf *last_flushed) -{ - struct btree_iter extent_iter; - struct bkey_s_c extent_k = - bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed); - int ret = bkey_err(extent_k); - if (ret) - return ret; - - if (!extent_k.k) - return 0; - - struct bkey_i *n = - bch2_bkey_make_mut(trans, &extent_iter, &extent_k, - BTREE_UPDATE_internal_snapshot_node); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx); -err: - bch2_trans_iter_exit(trans, &extent_iter); - return ret; -} - -static int invalidate_one_bucket_by_bps(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, - u8 gen, - struct bkey_buf *last_flushed) -{ - struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket); - struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket); - - return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, - bp_start, bp_end, 0, k, - NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, ({ - if (k.k->type != KEY_TYPE_backpointer) - continue; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - - if (bp.v->bucket_gen != gen) - continue; - - /* filter out bps with gens that don't match */ - - invalidate_one_bp(trans, ca, bp, last_flushed); - })); -} - -noinline_for_stack -static int invalidate_one_bucket(struct btree_trans *trans, - struct bch_dev *ca, - struct btree_iter *lru_iter, - struct bkey_s_c lru_k, - struct bkey_buf *last_flushed, - s64 *nr_to_invalidate) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); - struct btree_iter alloc_iter = {}; - int ret = 0; - - if (*nr_to_invalidate <= 0) - return 1; - - if (!bch2_dev_bucket_exists(c, bucket)) { - if (fsck_err(trans, lru_entry_to_invalid_bucket, - "lru key points to nonexistent device:bucket %llu:%llu", - bucket.inode, bucket.offset)) - return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); - goto out; - } - - if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) - return 0; - - struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, - BTREE_ITER_cached); - ret = bkey_err(alloc_k); - if (ret) - return ret; - - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - - /* We expect harmless races here due to the btree write buffer: */ - if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) - goto out; - - /* - * Impossible since alloc_lru_idx_read() only returns nonzero if the - * bucket is supposed to be on the cached bucket LRU (i.e. 
- * BCH_DATA_cached) - * - * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 - */ - BUG_ON(a->data_type != BCH_DATA_cached); - BUG_ON(a->dirty_sectors); - - if (!a->cached_sectors) { - bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset, - true, last_flushed); - goto out; - } - - unsigned cached_sectors = a->cached_sectors; - u8 gen = a->gen; - - ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); - if (ret) - goto out; - - trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); - --*nr_to_invalidate; -out: -fsck_err: - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); - return ret; -} - -static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, - struct bch_dev *ca, bool *wrapped) -{ - struct bkey_s_c k; -again: - k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); - if (!k.k && !*wrapped) { - bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0)); - *wrapped = true; - goto again; - } - - return k; -} - -static void bch2_do_invalidates_work(struct work_struct *work) -{ - struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); - struct bch_fs *c = ca->fs; - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - ret = bch2_btree_write_buffer_tryflush(trans); - if (ret) - goto err; - - s64 nr_to_invalidate = - should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - struct btree_iter iter; - bool wrapped = false; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, - ((bch2_current_io_time(c, READ) + U32_MAX) & - LRU_TIME_MAX)), 0); - - while (true) { - bch2_trans_begin(trans); - - struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); - ret = bkey_err(k); - if (ret) - goto restart_err; - if (!k.k) - break; - - ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate); -restart_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - bch2_btree_iter_advance(trans, &iter); - } - bch2_trans_iter_exit(trans, &iter); -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&last_flushed, c); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); -} - -void bch2_dev_do_invalidates(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate)) - return; - - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates)) - goto put_ref; - - if (queue_work(c->write_ref_wq, &ca->invalidate_work)) - return; - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); -put_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); -} - -void bch2_do_invalidates(struct bch_fs *c) -{ - for_each_member_device(c, ca) - bch2_dev_do_invalidates(ca); -} - -int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, - u64 bucket_start, u64 bucket_end) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bkey hole; - struct bpos end = POS(ca->dev_idx, bucket_end); - struct bch_member *m; - unsigned long last_updated = jiffies; - int ret; - - BUG_ON(bucket_start > bucket_end); - BUG_ON(bucket_end > ca->mi.nbuckets); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - 
POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), - BTREE_ITER_prefetch); - /* - * Scan the alloc btree for every bucket on @ca, and add buckets to the - * freespace/need_discard/need_gc_gens btrees as needed: - */ - while (1) { - if (time_after(jiffies, last_updated + HZ * 10)) { - bch_info(ca, "%s: currently at %llu/%llu", - __func__, iter.pos.offset, ca->mi.nbuckets); - last_updated = jiffies; - } - - bch2_trans_begin(trans); - - if (bkey_ge(iter.pos, end)) { - ret = 0; - break; - } - - k = bch2_get_key_or_hole(trans, &iter, end, &hole); - ret = bkey_err(k); - if (ret) - goto bkey_err; - - if (k.k->type) { - /* - * We process live keys in the alloc btree one at a - * time: - */ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - - ret = bch2_bucket_do_index(trans, ca, k, a, true) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto bkey_err; - - bch2_btree_iter_advance(trans, &iter); - } else { - struct bkey_i *freespace; - - freespace = bch2_trans_kmalloc(trans, sizeof(*freespace)); - ret = PTR_ERR_OR_ZERO(freespace); - if (ret) - goto bkey_err; - - bkey_init(&freespace->k); - freespace->k.type = KEY_TYPE_set; - freespace->k.p = k.k->p; - freespace->k.size = k.k->size; - - ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto bkey_err; - - bch2_btree_iter_set_pos(trans, &iter, k.k->p); - } -bkey_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - - if (ret < 0) { - bch_err_msg(ca, ret, "initializing free space"); - return ret; - } - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); - mutex_unlock(&c->sb_lock); - - return 0; -} - -int bch2_fs_freespace_init(struct bch_fs *c) -{ - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) - return 0; - - - /* - * We can crash during the device add path, so we need to check this on - * every mount: - */ - - bool doing_init = false; - for_each_member_device(c, ca) { - if (ca->mi.freespace_initialized) - continue; - - if (!doing_init) { - bch_info(c, "initializing freespace"); - doing_init = true; - } - - int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); - if (ret) { - bch2_dev_put(ca); - bch_err_fn(c, ret); - return ret; - } - } - - if (doing_init) { - mutex_lock(&c->sb_lock); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - bch_verbose(c, "done initializing freespace"); - } - - return 0; -} - -/* device removal */ - -int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) -{ - struct bpos start = POS(ca->dev_idx, 0); - struct bpos end = POS(ca->dev_idx, U64_MAX); - int ret; - - /* - * We clear the LRU and need_discard btrees first so that we don't race - * with bch2_do_invalidates() and bch2_do_discards() - */ - ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, 
BTREE_ID_alloc, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_dev_usage_remove(c, ca->dev_idx); - bch_err_msg(ca, ret, "removing dev alloc info"); - return ret; -} - -/* Bucket IO clocks: */ - -static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, - size_t bucket_nr, int rw) -{ - struct bch_fs *c = trans->c; - - struct btree_iter iter; - struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; - - u64 now = bch2_current_io_time(c, rw); - if (a->v.io_time[rw] == now) - goto out; - - a->v.io_time[rw] = now; - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, 0); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, - size_t bucket_nr, int rw) -{ - if (bch2_trans_relock(trans)) - bch2_trans_begin(trans); - - return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw)); -} - -/* Startup/shutdown (ro/rw): */ - -void bch2_recalc_capacity(struct bch_fs *c) -{ - u64 capacity = 0, reserved_sectors = 0, gc_reserve; - unsigned bucket_size_max = 0; - unsigned long ra_pages = 0; - - lockdep_assert_held(&c->state_lock); - - guard(rcu)(); - for_each_member_device_rcu(c, ca, NULL) { - struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); - if (bdev) - ra_pages += bdev->bd_disk->bdi->ra_pages; - - if (ca->mi.state != BCH_MEMBER_STATE_rw) - continue; - - u64 dev_reserve = 0; - - /* - * We need to reserve buckets (from the number - * of currently available buckets) against - * foreground writes so that mainly copygc can - * make forward progress. - * - * We need enough to refill the various reserves - * from scratch - copygc will use its entire - * reserve all at once, then run again when - * its reserve is refilled (from the formerly - * available buckets). - * - * This reserve is just used when considering if - * allocations for foreground writes must wait - - * not -ENOSPC calculations. - */ - - dev_reserve += ca->nr_btree_reserve * 2; - dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ - - dev_reserve += 1; /* btree write point */ - dev_reserve += 1; /* copygc write point */ - dev_reserve += 1; /* rebalance write point */ - - dev_reserve *= ca->mi.bucket_size; - - capacity += bucket_to_sector(ca, ca->mi.nbuckets - - ca->mi.first_bucket); - - reserved_sectors += dev_reserve * 2; - - bucket_size_max = max_t(unsigned, bucket_size_max, - ca->mi.bucket_size); - } - - bch2_set_ra_pages(c, ra_pages); - - gc_reserve = c->opts.gc_reserve_bytes - ? c->opts.gc_reserve_bytes >> 9 - : div64_u64(capacity * c->opts.gc_reserve_percent, 100); - - reserved_sectors = max(gc_reserve, reserved_sectors); - - reserved_sectors = min(reserved_sectors, capacity); - - c->reserved = reserved_sectors; - c->capacity = capacity - reserved_sectors; - - c->bucket_size_max = bucket_size_max; - - /* Wake up in case someone was waiting for buckets */ - closure_wake_up(&c->freelist_wait); -}
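To make the reserve and capacity arithmetic in bch2_recalc_capacity() above concrete, here is a minimal worked sketch; all numbers are invented for illustration and none of them come from this patch:

	/* Hypothetical member device: 2M buckets of 1024 sectors (512 KiB). */
	u64 nbuckets         = 2 * 1024 * 1024;
	u64 bucket_size      = 1024;			/* sectors */
	u64 nr_btree_reserve = 512;			/* assumed value */

	u64 dev_reserve = nr_btree_reserve * 2		/* btree */
			+ (nbuckets >> 6)		/* copygc: 32768 buckets */
			+ 3;				/* three write points */
	dev_reserve *= bucket_size;			/* ~34.6M sectors, ~16.5 GiB */

	/* reserved_sectors accumulates dev_reserve * 2 for each rw device,
	 * then gets clamped between gc_reserve and the total capacity. */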
-u64 bch2_min_rw_member_capacity(struct bch_fs *c) -{ - u64 ret = U64_MAX; - - guard(rcu)(); - for_each_rw_member_rcu(c, ca) - ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); - return ret; -} - -static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) -{ - struct open_bucket *ob; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - scoped_guard(spinlock, &ob->lock) { - if (ob->valid && !ob->on_partial_list && - ob->dev == ca->dev_idx) - return true; - } - } - - return false; -} - -void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) -{ - /* BCH_DATA_free == all rw devs */ - - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (rw && - (i == BCH_DATA_free || - (ca->mi.data_allowed & BIT(i)))) - set_bit(ca->dev_idx, c->rw_devs[i].d); - else - clear_bit(ca->dev_idx, c->rw_devs[i].d); -} - -/* device goes ro: */ -void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) -{ - lockdep_assert_held(&c->state_lock); - - /* First, remove device from allocation groups: */ - bch2_dev_allocator_set_rw(c, ca, false); - - c->rw_devs_change_count++; - - /* - * Capacity is calculated based on devices in allocation groups: - */ - bch2_recalc_capacity(c); - - bch2_open_buckets_stop(c, ca, false); - - /* - * Wake up threads that were blocked on allocation, so they can notice - * the device can no longer be removed and the capacity has changed: - */ - closure_wake_up(&c->freelist_wait); - - /* - * journal_res_get() can block waiting for free space in the journal - - * it needs to notice there may not be devices to allocate from anymore: - */ - wake_up(&c->journal.wait); - - /* Now wait for any in flight writes: */ - - closure_wait_event(&c->open_buckets_wait, - !bch2_dev_has_open_write_point(c, ca)); -} - -/* device goes rw: */ -void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) -{ - lockdep_assert_held(&c->state_lock); - - bch2_dev_allocator_set_rw(c, ca, true); - c->rw_devs_change_count++; -} - -void bch2_dev_allocator_background_exit(struct bch_dev *ca) -{ - darray_exit(&ca->discard_buckets_in_flight); -} - -void bch2_dev_allocator_background_init(struct bch_dev *ca) -{ - mutex_init(&ca->discard_buckets_in_flight_lock); - INIT_WORK(&ca->discard_work, bch2_do_discards_work); - INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work); - INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work); -} - -void bch2_fs_allocator_background_init(struct bch_fs *c) -{ - spin_lock_init(&c->freelist_lock); -} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h deleted file mode 100644 index 0cc5adc55b6f1e..00000000000000 --- a/fs/bcachefs/alloc_background.h +++ /dev/null @@ -1,361 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_BACKGROUND_H -#define _BCACHEFS_ALLOC_BACKGROUND_H - -#include "bcachefs.h" -#include "alloc_types.h" -#include "buckets.h" -#include "debug.h" -#include "super.h" - -/* How out of date a pointer gen is allowed to be: */ -#define BUCKET_GC_GEN_MAX 96U -
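Since BUCKET_GC_GEN_MAX caps how stale a bucket's generation may become, a quick sketch of the wrapping u8 arithmetic used by alloc_gc_gen() below may help (values invented for illustration):

	u8 gen = 10, oldest_gen = 200;
	u8 age = gen - oldest_gen;	/* (u8)(10 - 200) == 66, still < 96 */
	/* Once the age reaches BUCKET_GC_GEN_MAX, alloc_data_type() flags
	 * the bucket BCH_DATA_need_gc_gens so generations get rewritten. */

-static inline bool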
bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); - return ca && bucket_valid(ca, pos.offset); -} - -static inline u64 bucket_to_u64(struct bpos bucket) -{ - return (bucket.inode << 48) | bucket.offset; -} - -static inline struct bpos u64_to_bucket(u64 bucket) -{ - return POS(bucket >> 48, bucket & ~(~0ULL << 48)); -} - -static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) -{ - return a.gen - a.oldest_gen; -} - -static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src) -{ - dst->gen = src.gen; - dst->data_type = src.data_type; - dst->stripe_sectors = src.stripe_sectors; - dst->dirty_sectors = src.dirty_sectors; - dst->cached_sectors = src.cached_sectors; - dst->stripe = src.stripe; -} - -static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src) -{ - dst->gen = src.gen; - dst->data_type = src.data_type; - dst->stripe_sectors = src.stripe_sectors; - dst->dirty_sectors = src.dirty_sectors; - dst->cached_sectors = src.cached_sectors; - dst->stripe = src.stripe; -} - -static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) -{ - struct bch_alloc_v4 ret = {}; - __bucket_m_to_alloc(&ret, b); - return ret; -} - -static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) -{ - switch (data_type) { - case BCH_DATA_cached: - case BCH_DATA_stripe: - return BCH_DATA_user; - default: - return data_type; - } -} - -static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, - enum bch_data_type ptr) -{ - return !data_type_is_empty(bucket) && - bucket_data_type(bucket) != bucket_data_type(ptr); -} - -/* - * It is my general preference to use unsigned types for unsigned quantities - - * however, these helpers are used in disk accounting calculations run by - * triggers where the output will be negated and added to an s64. unsigned is - * right out even though all these quantities will fit in 32 bits, since it - * won't be sign extended correctly; u64 will negate "correctly", but s64 is the - * simpler option here. - */ -static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) -{ - return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; -} - -static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) -{ - return a.stripe_sectors + a.dirty_sectors; -} - -static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a) -{ - return a.data_type == BCH_DATA_cached - ? a.cached_sectors - : bch2_bucket_sectors_dirty(a); -} - -static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca, - struct bch_alloc_v4 a) -{ - int d = bch2_bucket_sectors(a); - - return d ? max(0, ca->mi.bucket_size - d) : 0; -} - -static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a) -{ - int d = a.stripe_sectors + a.dirty_sectors; - - return d ? max(0, ca->mi.bucket_size - d) : 0; -} - -static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a) -{ - return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0; -} - -static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, - enum bch_data_type data_type) -{ - if (a.stripe) - return data_type == BCH_DATA_parity ? 
data_type : BCH_DATA_stripe; - if (bch2_bucket_sectors_dirty(a)) - return bucket_data_type(data_type); - if (a.cached_sectors) - return BCH_DATA_cached; - if (BCH_ALLOC_V4_NEED_DISCARD(&a)) - return BCH_DATA_need_discard; - if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) - return BCH_DATA_need_gc_gens; - return BCH_DATA_free; -} - -static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type) -{ - a->data_type = alloc_data_type(*a, data_type); -} - -static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) -{ - return a.data_type == BCH_DATA_cached - ? a.io_time[READ] & LRU_TIME_MAX - : 0; -} - -#define DATA_TYPES_MOVABLE \ - ((1U << BCH_DATA_btree)| \ - (1U << BCH_DATA_user)| \ - (1U << BCH_DATA_stripe)) - -static inline bool data_type_movable(enum bch_data_type type) -{ - return (1U << type) & DATA_TYPES_MOVABLE; -} - -static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, - struct bch_dev *ca) -{ - if (a.data_type >= BCH_DATA_NR) - return 0; - - if (!data_type_movable(a.data_type) || - !bch2_bucket_sectors_fragmented(ca, a)) - return 0; - - /* - * avoid overflowing LRU_TIME_BITS on a corrupted fs, when - * bucket_sectors_dirty is (much) bigger than bucket_size - */ - u64 d = min_t(s64, bch2_bucket_sectors_dirty(a), - ca->mi.bucket_size); - - return div_u64(d * (1ULL << 31), ca->mi.bucket_size); -} - -static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) -{ - return ((u64) alloc_gc_gen(a) >> 4) << 56; -} - -static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) -{ - pos.offset |= alloc_freespace_genbits(a); - return pos; -} - -static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a) -{ - return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: - BCH_ALLOC_V4_U64s_V0) + - BCH_ALLOC_V4_NR_BACKPOINTERS(a) * - (sizeof(struct bch_backpointer) / sizeof(u64)); -} - -static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) -{ - unsigned ret = alloc_v4_u64s_noerror(a); - BUG_ON(ret > U8_MAX - BKEY_U64s); - return ret; -} - -static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) -{ - set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); -} - -struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); -struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *, struct bpos, - enum btree_iter_update_trigger_flags); - -void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); - -static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert) -{ - const struct bch_alloc_v4 *ret; - - if (unlikely(k.k->type != KEY_TYPE_alloc_v4)) - goto slowpath; - - ret = bkey_s_c_to_alloc_v4(k).v; - if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s) - goto slowpath; - - return ret; -slowpath: - __bch2_alloc_to_v4(k, convert); - return convert; -} - -struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); - -int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); - -int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_alloc_v4_swab(struct bkey_s); -void bch2_alloc_to_text(struct printbuf *, struct 
bch_fs *, struct bkey_s_c); -void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_alloc ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v1_validate, \ - .val_to_text = bch2_alloc_to_text, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 8, \ -}) - -#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v2_validate, \ - .val_to_text = bch2_alloc_to_text, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 8, \ -}) - -#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v3_validate, \ - .val_to_text = bch2_alloc_to_text, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 16, \ -}) - -#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v4_validate, \ - .val_to_text = bch2_alloc_v4_to_text, \ - .swab = bch2_alloc_v4_swab, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 48, \ -}) - -int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ - .key_validate = bch2_bucket_gens_validate, \ - .val_to_text = bch2_bucket_gens_to_text, \ -}) - -int bch2_bucket_gens_init(struct bch_fs *); - -static inline bool bkey_is_alloc(const struct bkey *k) -{ - return k->type == KEY_TYPE_alloc || - k->type == KEY_TYPE_alloc_v2 || - k->type == KEY_TYPE_alloc_v3; -} - -int bch2_alloc_read(struct bch_fs *); - -int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, - const struct bch_alloc_v4 *, - const struct bch_alloc_v4 *, unsigned); -int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); -int bch2_check_alloc_info(struct bch_fs *); -int bch2_check_alloc_to_lru_refs(struct bch_fs *); -void bch2_dev_do_discards(struct bch_dev *); -void bch2_do_discards(struct bch_fs *); - -static inline u64 should_invalidate_buckets(struct bch_dev *ca, - struct bch_dev_usage u) -{ - u64 want_free = ca->mi.nbuckets >> 7; - u64 free = max_t(s64, 0, - u.buckets[BCH_DATA_free] - + u.buckets[BCH_DATA_need_discard] - - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); - - return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]); -} - -void bch2_dev_do_invalidates(struct bch_dev *); -void bch2_do_invalidates(struct bch_fs *); - -static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) -{ - return (void *) ((u64 *) &a->v + - (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: - BCH_ALLOC_V4_U64s_V0)); -} - -static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) -{ - return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); -} - -int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); -int bch2_fs_freespace_init(struct bch_fs *); -int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); - -void bch2_recalc_capacity(struct bch_fs *); -u64 bch2_min_rw_member_capacity(struct bch_fs *); - -void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool); -void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); - -void bch2_dev_allocator_background_exit(struct bch_dev *); -void 
bch2_dev_allocator_background_init(struct bch_dev *); - -void bch2_fs_allocator_background_init(struct bch_fs *); - -#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h deleted file mode 100644 index 740238369a5a21..00000000000000 --- a/fs/bcachefs/alloc_background_format.h +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H -#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H - -struct bch_alloc { - struct bch_val v; - __u8 fields; - __u8 gen; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V1() \ - x(read_time, 16) \ - x(write_time, 16) \ - x(data_type, 8) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ - x(oldest_gen, 8) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -enum { -#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x -}; - -struct bch_alloc_v2 { - struct bch_val v; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V2() \ - x(read_time, 64) \ - x(write_time, 64) \ - x(dirty_sectors, 32) \ - x(cached_sectors, 32) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -struct bch_alloc_v3 { - struct bch_val v; - __le64 journal_seq; - __le32 flags; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) -LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) - -struct bch_alloc_v4 { - struct bch_val v; - __u64 journal_seq_nonempty; - __u32 flags; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 stripe_redundancy; - __u32 dirty_sectors; - __u32 cached_sectors; - __u64 io_time[2]; - __u32 stripe; - __u32 nr_external_backpointers; - /* end of fields in original version of alloc_v4 */ - __u64 journal_seq_empty; - __u32 stripe_sectors; - __u32 pad; -} __packed __aligned(8); - -#define BCH_ALLOC_V4_U64s_V0 6 -#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) - -BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) -BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) -BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) -BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) - -#define KEY_TYPE_BUCKET_GENS_BITS 8 -#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) -#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) - -struct bch_bucket_gens { - struct bch_val v; - u8 gens[KEY_TYPE_BUCKET_GENS_NR]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c deleted file mode 100644 index b58525ec7b4d3c..00000000000000 --- a/fs/bcachefs/alloc_foreground.c +++ /dev/null @@ -1,1683 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2012 Google, Inc. - * - * Foreground allocator code: allocate buckets from freelist, and allocate in - * sector granularity from writepoints. - * - * bch2_bucket_alloc() allocates a single bucket from a specific device. - * - * bch2_bucket_alloc_set() allocates one or more buckets from different devices - * in a given filesystem. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_gc.h" -#include "buckets.h" -#include "buckets_waiting_for_journal.h" -#include "clock.h" -#include "debug.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "io_write.h" -#include "journal.h" -#include "movinggc.h" -#include "nocow_locking.h" -#include "trace.h" - -#include -#include -#include - -static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, - struct mutex *lock) -{ - if (!mutex_trylock(lock)) { - bch2_trans_unlock(trans); - mutex_lock(lock); - } -} - -const char * const bch2_watermarks[] = { -#define x(t) #t, - BCH_WATERMARKS() -#undef x - NULL -}; - -/* - * Open buckets represent a bucket that's currently being allocated from. They - * serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. - */ - -void bch2_reset_alloc_cursors(struct bch_fs *c) -{ - guard(rcu)(); - for_each_member_device_rcu(c, ca, NULL) - memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); -} - -static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) -{ - open_bucket_idx_t idx = ob - c->open_buckets; - open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); - - ob->hash = *slot; - *slot = idx; -} - -static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) -{ - open_bucket_idx_t idx = ob - c->open_buckets; - open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); - - while (*slot != idx) { - BUG_ON(!*slot); - slot = &c->open_buckets[*slot].hash; - } - - *slot = ob->hash; - ob->hash = 0; -} - -void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - if (ob->ec) { - ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); - return; - } - - spin_lock(&ob->lock); - ob->valid = false; - ob->data_type = 0; - spin_unlock(&ob->lock); - - spin_lock(&c->freelist_lock); - bch2_open_bucket_hash_remove(c, ob); - - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - - c->open_buckets_nr_free++; - ca->nr_open_buckets--; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); -} - -void bch2_open_bucket_write_error(struct bch_fs *c, - struct open_buckets *obs, - unsigned dev, int err) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, obs, ob, i) - if (ob->dev == dev && ob->ec) - bch2_ec_bucket_cancel(c, ob, err); -} - -static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) -{ - struct open_bucket *ob; - - BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); - - ob = c->open_buckets + c->open_buckets_freelist; - c->open_buckets_freelist = ob->freelist; - atomic_set(&ob->pin, 1); - ob->data_type = 0; - - c->open_buckets_nr_free--; - return ob; -} - -static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -{ - if 
(c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs)) - return false; - - return bch2_is_superblock_bucket(ca, b); -} - -static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) -{ - BUG_ON(c->open_buckets_partial_nr >= - ARRAY_SIZE(c->open_buckets_partial)); - - spin_lock(&c->freelist_lock); - scoped_guard(rcu) - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; - - ob->on_partial_list = true; - c->open_buckets_partial[c->open_buckets_partial_nr++] = - ob - c->open_buckets; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); - closure_wake_up(&c->freelist_wait); -} - -static inline bool may_alloc_bucket(struct bch_fs *c, - struct alloc_request *req, - struct bpos bucket) -{ - if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { - req->counters.skipped_open++; - return false; - } - - u64 journal_seq_ready = - bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, - bucket.inode, bucket.offset); - if (journal_seq_ready > c->journal.flushed_seq_ondisk) { - if (journal_seq_ready > c->journal.flushing_seq) - req->counters.need_journal_commit++; - req->counters.skipped_need_journal_commit++; - return false; - } - - if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { - req->counters.skipped_nocow++; - return false; - } - - return true; -} - -static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, - struct alloc_request *req, - u64 bucket, u8 gen, - struct closure *cl) -{ - struct bch_dev *ca = req->ca; - - if (unlikely(is_superblock_bucket(c, ca, bucket))) - return NULL; - - if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - req->counters.skipped_nouse++; - return NULL; - } - - spin_lock(&c->freelist_lock); - - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) { - if (cl) - closure_wait(&c->open_buckets_wait, cl); - - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); - spin_unlock(&c->freelist_lock); - return ERR_PTR(bch_err_throw(c, open_buckets_empty)); - } - - /* Recheck under lock: */ - if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { - spin_unlock(&c->freelist_lock); - req->counters.skipped_open++; - return NULL; - } - - struct open_bucket *ob = bch2_open_bucket_alloc(c); - - spin_lock(&ob->lock); - ob->valid = true; - ob->sectors_free = ca->mi.bucket_size; - ob->dev = ca->dev_idx; - ob->gen = gen; - ob->bucket = bucket; - spin_unlock(&ob->lock); - - ca->nr_open_buckets++; - bch2_open_bucket_hash_add(c, ob); - - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); - track_event_change(&c->times[BCH_TIME_blocked_allocate], false); - - spin_unlock(&c->freelist_lock); - return ob; -} - -static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, - struct alloc_request *req, - struct btree_iter *freespace_iter, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - - if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b))) - return NULL; - - u8 gen; - int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); - if (ret < 0) - return ERR_PTR(ret); - if (ret) - return NULL; - - return __try_alloc_bucket(c, req, b, gen, cl); -} - -/* - * This path is for before the freespace btree is initialized: - */ -static noinline struct open_bucket * -bch2_bucket_alloc_early(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = 
req->ca; - struct btree_iter iter, citer; - struct bkey_s_c k, ck; - struct open_bucket *ob = NULL; - u64 first_bucket = ca->mi.first_bucket; - u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; - u64 alloc_start = max(first_bucket, *dev_alloc_cursor); - u64 alloc_cursor = alloc_start; - int ret; - - /* - * Scan with an uncached iterator to avoid polluting the key cache. An - * uncached iter will return a cached key if one exists, but if not - * there is no other underlying protection for the associated key cache - * slot. To avoid racing bucket allocations, look up the cached key slot - * of any likely allocation candidate before attempting to proceed with - * the allocation. This provides proper exclusion on the associated - * bucket. - */ -again: - for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), - BTREE_ITER_slots, k, ret) { - u64 bucket = k.k->p.offset; - - if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) - break; - - if (req->btree_bitmap != BTREE_BITMAP_ANY && - req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, - bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (req->btree_bitmap == BTREE_BITMAP_YES && - bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) - break; - - bucket = sector_to_bucket(ca, - round_up(bucket_to_sector(ca, bucket) + 1, - 1ULL << ca->mi.btree_bitmap_shift)); - bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); - req->counters.buckets_seen++; - req->counters.skipped_mi_btree_bitmap++; - continue; - } - - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - if (a->data_type != BCH_DATA_free) - continue; - - /* now check the cached key to serialize concurrent allocs of the bucket */ - ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached); - ret = bkey_err(ck); - if (ret) - break; - - a = bch2_alloc_to_v4(ck, &a_convert); - if (a->data_type != BCH_DATA_free) - goto next; - - req->counters.buckets_seen++; - - ob = may_alloc_bucket(c, req, k.k->p) - ? 
__try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl) - : NULL; -next: - bch2_set_btree_iter_dontneed(trans, &citer); - bch2_trans_iter_exit(trans, &citer); - if (ob) - break; - } - bch2_trans_iter_exit(trans, &iter); - - alloc_cursor = iter.pos.offset; - - if (!ob && ret) - ob = ERR_PTR(ret); - - if (!ob && alloc_start > first_bucket) { - alloc_cursor = alloc_start = first_bucket; - goto again; - } - - *dev_alloc_cursor = alloc_cursor; - - return ob; -} - -static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - struct bch_dev *ca = req->ca; - struct btree_iter iter; - struct bkey_s_c k; - struct open_bucket *ob = NULL; - u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); - u64 alloc_cursor = alloc_start; - int ret; -again: - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, - POS(ca->dev_idx, alloc_cursor), - POS(ca->dev_idx, U64_MAX), - 0, k, ret) { - /* - * peek normally doesn't trim extents - they can span iter.pos, - * which is not what we want here: - */ - iter.k.size = iter.k.p.offset - iter.pos.offset; - - while (iter.k.size) { - req->counters.buckets_seen++; - - u64 bucket = iter.pos.offset & ~(~0ULL << 56); - if (req->btree_bitmap != BTREE_BITMAP_ANY && - req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, - bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (req->btree_bitmap == BTREE_BITMAP_YES && - bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) - goto fail; - - bucket = sector_to_bucket(ca, - round_up(bucket_to_sector(ca, bucket + 1), - 1ULL << ca->mi.btree_bitmap_shift)); - alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - - bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor)); - req->counters.skipped_mi_btree_bitmap++; - goto next; - } - - ob = try_alloc_bucket(trans, req, &iter, cl); - if (ob) { - if (!IS_ERR(ob)) - *dev_alloc_cursor = iter.pos.offset; - bch2_set_btree_iter_dontneed(trans, &iter); - break; - } - - iter.k.size--; - iter.pos.offset++; - } -next: - if (ob || ret) - break; - } -fail: - bch2_trans_iter_exit(trans, &iter); - - BUG_ON(ob && ret); - - if (ret) - ob = ERR_PTR(ret); - - if (!ob && alloc_start > ca->mi.first_bucket) { - alloc_cursor = alloc_start = ca->mi.first_bucket; - goto again; - } - - return ob; -} - -static noinline void trace_bucket_alloc2(struct bch_fs *c, - struct alloc_request *req, - struct closure *cl, - struct open_bucket *ob) -{ - struct printbuf buf = PRINTBUF; - - printbuf_tabstop_push(&buf, 24); - - prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx); - prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]); - prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]); - prt_printf(&buf, "blocking\t%u\n", cl != NULL); - prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); - prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); - prt_printf(&buf, "copygc_wait\t%llu/%lli\n", - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); - prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); - prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open); - prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit); - prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow); - prt_printf(&buf, "nouse\t%llu\n",
req->counters.skipped_nouse); - prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap); - - if (!IS_ERR(ob)) { - prt_printf(&buf, "allocated\t%llu\n", ob->bucket); - trace_bucket_alloc(c, buf.buf); - } else { - prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob))); - trace_bucket_alloc_fail(c, buf.buf); - } - - printbuf_exit(&buf); -} - -/** - * bch2_bucket_alloc_trans - allocate a single bucket from a specific device - * @trans: transaction object - * @req: state for the entire allocation - * @cl: if not NULL, closure to be used to wait if buckets not available - * @nowait: if true, do not wait for buckets to become available - * - * Returns: an open_bucket on success, or an ERR_PTR() on failure. - */ -static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl, - bool nowait) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = req->ca; - struct open_bucket *ob = NULL; - bool freespace = READ_ONCE(ca->mi.freespace_initialized); - u64 avail; - bool waiting = nowait; - - req->btree_bitmap = req->data_type == BCH_DATA_btree; - memset(&req->counters, 0, sizeof(req->counters)); -again: - bch2_dev_usage_read_fast(ca, &req->usage); - avail = dev_buckets_free(ca, req->usage, req->watermark); - - if (req->usage.buckets[BCH_DATA_need_discard] > - min(avail, ca->mi.nbuckets >> 7)) - bch2_dev_do_discards(ca); - - if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) - bch2_gc_gens_async(c); - - if (should_invalidate_buckets(ca, req->usage)) - bch2_dev_do_invalidates(ca); - - if (!avail) { - if (req->watermark > BCH_WATERMARK_normal && - c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) - goto alloc; - - if (cl && !waiting) { - closure_wait(&c->freelist_wait, cl); - waiting = true; - goto again; - } - - track_event_change(&c->times[BCH_TIME_blocked_allocate], true); - - ob = ERR_PTR(bch_err_throw(c, freelist_empty)); - goto err; - } - - if (waiting) - closure_wake_up(&c->freelist_wait); -alloc: - ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, req, cl) - : bch2_bucket_alloc_early(trans, req, cl); - - if (req->counters.need_journal_commit * 2 > avail) - bch2_journal_flush_async(&c->journal, NULL); - - if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) { - req->btree_bitmap = BTREE_BITMAP_ANY; - goto alloc; - } - - if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) { - freespace = false; - goto alloc; - } -err: - if (!ob) - ob = ERR_PTR(bch_err_throw(c, no_buckets_found)); - - if (!IS_ERR(ob)) - ob->data_type = req->data_type; - - if (!IS_ERR(ob)) - count_event(c, bucket_alloc); - else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) - count_event(c, bucket_alloc_fail); - - if (!IS_ERR(ob) - ? 
trace_bucket_alloc_enabled() - : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, req, cl, ob); - - return ob; -} - -struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, - struct closure *cl) -{ - struct open_bucket *ob; - struct alloc_request req = { - .watermark = watermark, - .data_type = data_type, - .ca = ca, - }; - - bch2_trans_do(c, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false))); - return ob; -} - -static int __dev_stripe_cmp(struct dev_stripe_state *stripe, - unsigned l, unsigned r) -{ - return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]); -} - -#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) - -void bch2_dev_alloc_list(struct bch_fs *c, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs, - struct dev_alloc_list *ret) -{ - ret->nr = 0; - - unsigned i; - for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret->data[ret->nr++] = i; - - bubble_sort(ret->data, ret->nr, dev_stripe_cmp); -} - -static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ -static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */ -static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */ - -static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe) -{ - /* - * Avoid zeroing clock hands if at all possible: if clock hands go - * to 0 we lose information. Clock hands can be in a wide range if we - * have devices we rarely try to allocate from - e.g. if we generally - * allocate from a specified target but only sometimes have to fall - * back to the whole filesystem. - */ - u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */ - u64 scale_min = 0; /* minimum we must subtract to avoid overflow */ - - for (u64 *v = stripe->next_alloc; - v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) { - if (*v) - scale_max = min(scale_max, *v); - if (*v > stripe_clock_hand_max) - scale_min = max(scale_min, *v - stripe_clock_hand_max); - } - - u64 scale = max(scale_min, scale_max); - - for (u64 *v = stripe->next_alloc; - v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) - *v = *v < scale ? 0 : *v - scale; -} -
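The clock-hand increment implemented just below adds the inverse of a device's free space to that device's hand; a hedged numeric sketch (values invented for illustration):

	/* stripe_clock_hand_inv is 1ULL << 52 (from the constants above). */
	u64 inv_roomy = (1ULL << 52) >> 20;	/* ~1M free buckets -> 2^32 */
	u64 inv_tight = (1ULL << 52) >> 10;	/* ~1K free buckets -> 2^42 */
	/* The tight device's hand advances 1024x faster per allocation, so
	 * it sorts last 1024x more often: allocations stay roughly
	 * proportional to free space. */

-static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, - struct dev_stripe_state *stripe, - struct bch_dev_usage *usage) -{ - /* - * Stripe state has a per device clock hand: we allocate from the device - * with the smallest clock hand. - * - * When we allocate, we don't do a simple increment; we add the inverse - * of the device's free space. This results in round robin behavior that - * biases in favor of the device(s) with more free space. - */ - - u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal); - u64 free_space_inv = free_space - ? div64_u64(stripe_clock_hand_inv, free_space) - : stripe_clock_hand_inv; - - /* Saturating add, avoid overflow: */ - u64 sum = *v + free_space_inv; - *v = sum >= *v ?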
sum : U64_MAX; - - if (unlikely(*v > stripe_clock_hand_rescale)) - bch2_stripe_state_rescale(stripe); -} - -void bch2_dev_stripe_increment(struct bch_dev *ca, - struct dev_stripe_state *stripe) -{ - struct bch_dev_usage usage; - - bch2_dev_usage_read_fast(ca, &usage); - bch2_dev_stripe_increment_inlined(ca, stripe, &usage); -} - -static int add_new_bucket(struct bch_fs *c, - struct alloc_request *req, - struct open_bucket *ob) -{ - unsigned durability = ob_dev(c, ob)->mi.durability; - - BUG_ON(req->nr_effective >= req->nr_replicas); - - __clear_bit(ob->dev, req->devs_may_alloc.d); - req->nr_effective += durability; - req->have_cache |= !durability; - - ob_push(c, &req->ptrs, ob); - - if (req->nr_effective >= req->nr_replicas) - return 1; - if (ob->ec) - return 1; - return 0; -} - -inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct alloc_request *req, - struct dev_stripe_state *stripe, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - BUG_ON(req->nr_effective >= req->nr_replicas); - - bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted); - - darray_for_each(req->devs_sorted, i) { - req->ca = bch2_dev_tryget_noerror(c, *i); - if (!req->ca) - continue; - - if (!req->ca->mi.durability && req->have_cache) { - bch2_dev_put(req->ca); - continue; - } - - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl, - req->flags & BCH_WRITE_alloc_nowait); - if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage); - bch2_dev_put(req->ca); - - if (IS_ERR(ob)) { - ret = PTR_ERR(ob); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl) - break; - continue; - } - - ret = add_new_bucket(c, req, ob); - if (ret) - break; - } - - if (ret == 1) - return 0; - if (ret) - return ret; - return bch_err_throw(c, insufficient_devices); -} - -/* Allocate from stripes: */ - -/* - * if we can't allocate a new stripe because there are already too many - * partially filled stripes, force allocating from an existing stripe even when - * it's to a device we don't want: - */ - -static int bucket_alloc_from_stripe(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - if (req->nr_replicas < 2) - return 0; - - if (ec_open_bucket(c, &req->ptrs)) - return 0; - - struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, req, 0, cl); - if (IS_ERR(h)) - return PTR_ERR(h); - if (!h) - return 0; - - bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted); - - darray_for_each(req->devs_sorted, i) - for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { - if (!h->s->blocks[ec_idx]) - continue; - - struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { - ob->ec_idx = ec_idx; - ob->ec = h->s; - ec_stripe_new_get(h->s, STRIPE_REF_io); - - ret = add_new_bucket(c, req, ob); - goto out; - } - } -out: - bch2_ec_stripe_head_put(c, h); - return ret; -} - -/* Sector allocator */ - -static bool want_bucket(struct bch_fs *c, - struct alloc_request *req, - struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - if (!test_bit(ob->dev, req->devs_may_alloc.d)) - return false; - - if (ob->data_type != req->wp->data_type) - return false; - - if (!ca->mi.durability && - (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) - return false; - - if (req->ec != (ob->ec != NULL)) - return false; - - return true; -} - -static int 
bucket_alloc_set_writepoint(struct bch_fs *c, - struct alloc_request *req) -{ - struct open_bucket *ob; - unsigned i; - int ret = 0; - - req->scratch_ptrs.nr = 0; - - open_bucket_for_each(c, &req->wp->ptrs, ob, i) { - if (!ret && want_bucket(c, req, ob)) - ret = add_new_bucket(c, req, ob); - else - ob_push(c, &req->scratch_ptrs, ob); - } - req->wp->ptrs = req->scratch_ptrs; - - return ret; -} - -static int bucket_alloc_set_partial(struct bch_fs *c, - struct alloc_request *req) -{ - int i, ret = 0; - - if (!c->open_buckets_partial_nr) - return 0; - - spin_lock(&c->freelist_lock); - - if (!c->open_buckets_partial_nr) - goto unlock; - - for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { - struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; - - if (want_bucket(c, req, ob)) { - struct bch_dev *ca = ob_dev(c, ob); - u64 avail; - - bch2_dev_usage_read_fast(ca, &req->usage); - avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets; - if (!avail) - continue; - - array_remove_item(c->open_buckets_partial, - c->open_buckets_partial_nr, - i); - ob->on_partial_list = false; - - scoped_guard(rcu) - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - - ret = add_new_bucket(c, req, ob); - if (ret) - break; - } - } -unlock: - spin_unlock(&c->freelist_lock); - return ret; -} - -static int __open_bucket_add_buckets(struct btree_trans *trans, - struct alloc_request *req, - struct closure *_cl) -{ - struct bch_fs *c = trans->c; - struct open_bucket *ob; - struct closure *cl = NULL; - unsigned i; - int ret; - - req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); - - /* Don't allocate from devices we already have pointers to: */ - darray_for_each(*req->devs_have, i) - __clear_bit(*i, req->devs_may_alloc.d); - - open_bucket_for_each(c, &req->ptrs, ob, i) - __clear_bit(ob->dev, req->devs_may_alloc.d); - - ret = bucket_alloc_set_writepoint(c, req); - if (ret) - return ret; - - ret = bucket_alloc_set_partial(c, req); - if (ret) - return ret; - - if (req->ec) { - ret = bucket_alloc_from_stripe(trans, req, _cl); - } else { -retry_blocking: - /* - * Try nonblocking first, so that if one device is full we'll try from - * other devices: - */ - ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && - !cl && _cl) { - cl = _cl; - goto retry_blocking; - } - } - - return ret; -} - -static int open_bucket_add_buckets(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - int ret; - - if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) { - ret = __open_bucket_add_buckets(trans, req, cl); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_operation_blocked) || - bch2_err_matches(ret, BCH_ERR_freelist_empty) || - bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - return ret; - if (req->nr_effective >= req->nr_replicas) - return 0; - } - - bool ec = false; - swap(ec, req->ec); - ret = __open_bucket_add_buckets(trans, req, cl); - swap(ec, req->ec); - - return ret < 0 ? 
ret : 0; -} - -/** - * should_drop_bucket - check if this open_bucket should go away - * @ob: open_bucket to predicate on - * @c: filesystem handle - * @ca: if set, we're killing buckets for a particular device - * @ec: if true, we're shutting down erasure coding and killing all ec - * open_buckets - * if neither @ca nor @ec is set, every open_bucket matches - * Returns: true if we should kill this open_bucket - * - * We're killing open_buckets because we're shutting down a device, erasure - * coding, or the entire filesystem - check if this open_bucket matches: - */ -static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, - struct bch_dev *ca, bool ec) -{ - if (ec) { - return ob->ec != NULL; - } else if (ca) { - bool drop = ob->dev == ca->dev_idx; - struct open_bucket *ob2; - unsigned i; - - if (!drop && ob->ec) { - unsigned nr_blocks; - - mutex_lock(&ob->ec->lock); - nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; - - for (i = 0; i < nr_blocks; i++) { - if (!ob->ec->blocks[i]) - continue; - - ob2 = c->open_buckets + ob->ec->blocks[i]; - drop |= ob2->dev == ca->dev_idx; - } - mutex_unlock(&ob->ec->lock); - } - - return drop; - } else { - return true; - } -} - -static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, - bool ec, struct write_point *wp) -{ - struct open_buckets ptrs = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; - - mutex_lock(&wp->lock); - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (should_drop_bucket(ob, c, ca, ec)) - bch2_open_bucket_put(c, ob); - else - ob_push(c, &ptrs, ob); - wp->ptrs = ptrs; - mutex_unlock(&wp->lock); -} - -void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, - bool ec) -{ - unsigned i; - - /* Close write points that point to this device... */ - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch2_writepoint_stop(c, ca, ec, &c->write_points[i]); - - bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point); - bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point); - bch2_writepoint_stop(c, ca, ec, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch2_open_buckets_put(c, &a->ob); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - spin_lock(&c->freelist_lock); - i = 0; - while (i < c->open_buckets_partial_nr) { - struct open_bucket *ob = - c->open_buckets + c->open_buckets_partial[i]; - - if (should_drop_bucket(ob, c, ca, ec)) { - --c->open_buckets_partial_nr; - swap(c->open_buckets_partial[i], - c->open_buckets_partial[c->open_buckets_partial_nr]); - - ob->on_partial_list = false; - - scoped_guard(rcu) - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - - spin_unlock(&c->freelist_lock); - bch2_open_bucket_put(c, ob); - spin_lock(&c->freelist_lock); - } else { - i++; - } - } - spin_unlock(&c->freelist_lock); - - bch2_ec_stop_dev(c, ca); -} - -static inline struct hlist_head *writepoint_hash(struct bch_fs *c, - unsigned long write_point) -{ - unsigned hash = - hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); - - return &c->write_points_hash[hash]; -} - -static struct write_point *__writepoint_find(struct hlist_head *head, - unsigned long write_point) -{ - struct write_point *wp; - - guard(rcu)(); - hlist_for_each_entry_rcu(wp, head, node) - if (wp->write_point == write_point) - return wp; - return NULL; -} -
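The write point stranding check defined just below guards both try_increase_writepoints() and try_decrease_writepoints(); a hedged numeric sketch (values invented for illustration):

	/* 32 write points, bucket_size_max = 1024 sectors (512 KiB): */
	u64 stranded = 32 * 1024;	/* 32768 sectors, i.e. 16 MiB */
	/* try_increase_writepoints() (factor 32) refuses a new write point
	 * once free space falls below 512 MiB; try_decrease_writepoints()
	 * (factor 8) starts retiring write points below 128 MiB. */

-static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) -{ - u64 stranded = c->write_points_nr *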
c->bucket_size_max; - u64 free = bch2_fs_usage_read_short(c).free; - - return stranded * factor > free; -} - -static noinline bool try_increase_writepoints(struct bch_fs *c) -{ - struct write_point *wp; - - if (c->write_points_nr == ARRAY_SIZE(c->write_points) || - too_many_writepoints(c, 32)) - return false; - - wp = c->write_points + c->write_points_nr++; - hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); - return true; -} - -static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) -{ - struct bch_fs *c = trans->c; - struct write_point *wp; - struct open_bucket *ob; - unsigned i; - - mutex_lock(&c->write_points_hash_lock); - if (c->write_points_nr < old_nr) { - mutex_unlock(&c->write_points_hash_lock); - return true; - } - - if (c->write_points_nr == 1 || - !too_many_writepoints(c, 8)) { - mutex_unlock(&c->write_points_hash_lock); - return false; - } - - wp = c->write_points + --c->write_points_nr; - - hlist_del_rcu(&wp->node); - mutex_unlock(&c->write_points_hash_lock); - - bch2_trans_mutex_lock_norelock(trans, &wp->lock); - open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, ob); - wp->ptrs.nr = 0; - mutex_unlock(&wp->lock); - return true; -} - -static struct write_point *writepoint_find(struct btree_trans *trans, - unsigned long write_point) -{ - struct bch_fs *c = trans->c; - struct write_point *wp, *oldest; - struct hlist_head *head; - - if (!(write_point & 1UL)) { - wp = (struct write_point *) write_point; - bch2_trans_mutex_lock_norelock(trans, &wp->lock); - return wp; - } - - head = writepoint_hash(c, write_point); -restart_find: - wp = __writepoint_find(head, write_point); - if (wp) { -lock_wp: - bch2_trans_mutex_lock_norelock(trans, &wp->lock); - if (wp->write_point == write_point) - goto out; - mutex_unlock(&wp->lock); - goto restart_find; - } -restart_find_oldest: - oldest = NULL; - for (wp = c->write_points; - wp < c->write_points + c->write_points_nr; wp++) - if (!oldest || time_before64(wp->last_used, oldest->last_used)) - oldest = wp; - - bch2_trans_mutex_lock_norelock(trans, &oldest->lock); - bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock); - if (oldest >= c->write_points + c->write_points_nr || - try_increase_writepoints(c)) { - mutex_unlock(&c->write_points_hash_lock); - mutex_unlock(&oldest->lock); - goto restart_find_oldest; - } - - wp = __writepoint_find(head, write_point); - if (wp && wp != oldest) { - mutex_unlock(&c->write_points_hash_lock); - mutex_unlock(&oldest->lock); - goto lock_wp; - } - - wp = oldest; - hlist_del_rcu(&wp->node); - wp->write_point = write_point; - hlist_add_head_rcu(&wp->node, head); - mutex_unlock(&c->write_points_hash_lock); -out: - wp->last_used = local_clock(); - return wp; -} - -static noinline void -deallocate_extra_replicas(struct bch_fs *c, - struct alloc_request *req) -{ - struct open_bucket *ob; - unsigned extra_replicas = req->nr_effective - req->nr_replicas; - unsigned i; - - req->scratch_ptrs.nr = 0; - - open_bucket_for_each(c, &req->ptrs, ob, i) { - unsigned d = ob_dev(c, ob)->mi.durability; - - if (d && d <= extra_replicas) { - extra_replicas -= d; - ob_push(c, &req->wp->ptrs, ob); - } else { - ob_push(c, &req->scratch_ptrs, ob); - } - } - - req->ptrs = req->scratch_ptrs; -} - -/* - * Get us an open_bucket we can allocate from, return with it locked: - */ -int bch2_alloc_sectors_start_trans(struct btree_trans *trans, - unsigned target, - unsigned erasure_code, - struct write_point_specifier write_point, - struct bch_devs_list *devs_have, 
- unsigned nr_replicas, - unsigned nr_replicas_required, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl, - struct write_point **wp_ret) -{ - struct bch_fs *c = trans->c; - struct open_bucket *ob; - unsigned write_points_nr; - int i; - - struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req)); - int ret = PTR_ERR_OR_ZERO(req); - if (unlikely(ret)) - return ret; - - if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) - erasure_code = false; - - req->nr_replicas = nr_replicas; - req->target = target; - req->ec = erasure_code; - req->watermark = watermark; - req->flags = flags; - req->devs_have = devs_have; - - BUG_ON(!nr_replicas || !nr_replicas_required); -retry: - req->ptrs.nr = 0; - req->nr_effective = 0; - req->have_cache = false; - write_points_nr = c->write_points_nr; - - *wp_ret = req->wp = writepoint_find(trans, write_point.v); - - req->data_type = req->wp->data_type; - - ret = bch2_trans_relock(trans); - if (ret) - goto err; - - /* metadata may not allocate on cache devices: */ - if (req->data_type != BCH_DATA_user) - req->have_cache = true; - - if (target && !(flags & BCH_WRITE_only_specified_devs)) { - ret = open_bucket_add_buckets(trans, req, NULL); - if (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto alloc_done; - - /* Don't retry from all devices if we're out of open buckets: */ - if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret2 = open_bucket_add_buckets(trans, req, cl); - if (!ret2 || - bch2_err_matches(ret2, BCH_ERR_transaction_restart) || - bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { - ret = ret2; - goto alloc_done; - } - } - - /* - * Only try to allocate cache (durability = 0 devices) from the - * specified target: - */ - req->have_cache = true; - req->target = 0; - - ret = open_bucket_add_buckets(trans, req, cl); - } else { - ret = open_bucket_add_buckets(trans, req, cl); - } -alloc_done: - BUG_ON(!ret && req->nr_effective < req->nr_replicas); - - if (erasure_code && !ec_open_bucket(c, &req->ptrs)) - pr_debug("failed to get ec bucket: ret %u", ret); - - if (ret == -BCH_ERR_insufficient_devices && - req->nr_effective >= nr_replicas_required) - ret = 0; - - if (ret) - goto err; - - if (req->nr_effective > req->nr_replicas) - deallocate_extra_replicas(c, req); - - /* Free buckets we didn't use: */ - open_bucket_for_each(c, &req->wp->ptrs, ob, i) - open_bucket_free_unused(c, ob); - - req->wp->ptrs = req->ptrs; - - req->wp->sectors_free = UINT_MAX; - - open_bucket_for_each(c, &req->wp->ptrs, ob, i) { - /* - * Ensure proper write alignment - either due to misaligned - * bucket sizes (from buggy bcachefs-tools), or writes that mix - * logical/physical alignment: - */ - struct bch_dev *ca = ob_dev(c, ob); - u64 offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free; - unsigned align = round_up(offset, block_sectors(c)) - offset; - - ob->sectors_free = max_t(int, 0, ob->sectors_free - align); - - req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free); - } - - req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c)); - - /* Did alignment use up space in an open_bucket? 
*/ - if (unlikely(!req->wp->sectors_free)) { - bch2_alloc_sectors_done(c, req->wp); - goto retry; - } - - BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX); - - return 0; -err: - open_bucket_for_each(c, &req->wp->ptrs, ob, i) - if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v)) - ob_push(c, &req->ptrs, ob); - else - open_bucket_free_unused(c, ob); - req->wp->ptrs = req->ptrs; - - mutex_unlock(&req->wp->lock); - - if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && - try_decrease_writepoints(trans, write_points_nr)) - goto retry; - - if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - ret = bch_err_throw(c, bucket_alloc_blocked); - - if (cl && !(flags & BCH_WRITE_alloc_nowait) && - bch2_err_matches(ret, BCH_ERR_freelist_empty)) - ret = bch_err_throw(c, bucket_alloc_blocked); - - return ret; -} - -void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, - struct bkey_i *k, unsigned sectors, - bool cached) -{ - bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached); -} - -/* - * Finished allocating from @wp: unlock it, dropping any open buckets that no - * longer have room for a full block - */ -void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) -{ - bch2_alloc_sectors_done_inlined(c, wp); -} - -static inline void writepoint_init(struct write_point *wp, - enum bch_data_type type) -{ - mutex_init(&wp->lock); - wp->data_type = type; - - INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates); - INIT_LIST_HEAD(&wp->writes); - spin_lock_init(&wp->writes_lock); -} - -void bch2_fs_allocator_foreground_init(struct bch_fs *c) -{ - struct open_bucket *ob; - struct write_point *wp; - - mutex_init(&c->write_points_hash_lock); - c->write_points_nr = ARRAY_SIZE(c->write_points); - - /* open bucket 0 is a sentinel NULL: */ - spin_lock_init(&c->open_buckets[0].lock); - - for (ob = c->open_buckets + 1; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock_init(&ob->lock); - c->open_buckets_nr_free++; - - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - } - - writepoint_init(&c->btree_write_point, BCH_DATA_btree); - writepoint_init(&c->rebalance_write_point, BCH_DATA_user); - writepoint_init(&c->copygc_write_point, BCH_DATA_user); - - for (wp = c->write_points; - wp < c->write_points + c->write_points_nr; wp++) { - writepoint_init(wp, BCH_DATA_user); - - wp->last_used = local_clock(); - wp->write_point = (unsigned long) wp; - hlist_add_head_rcu(&wp->node, - writepoint_hash(c, wp->write_point)); - } -} - -void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - unsigned data_type = ob->data_type; - barrier(); /* READ_ONCE() doesn't work on bitfields */ - - prt_printf(out, "%zu ref %u ", - ob - c->open_buckets, - atomic_read(&ob->pin)); - bch2_prt_data_type(out, data_type); - prt_printf(out, " %u:%llu gen %u allocated %u/%u", - ob->dev, ob->bucket, ob->gen, - ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); - if (ob->ec) - prt_printf(out, " ec idx %llu", ob->ec->idx); - if (ob->on_partial_list) - prt_str(out, " partial"); - prt_newline(out); -} - -void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_dev *ca) -{ - struct open_bucket *ob; - - out->atomic++; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - if (ob->valid && (!ca || ob->dev == ca->dev_idx)) -
bch2_open_bucket_to_text(out, c, ob); - spin_unlock(&ob->lock); - } - - --out->atomic; -} - -void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) -{ - unsigned i; - - out->atomic++; - spin_lock(&c->freelist_lock); - - for (i = 0; i < c->open_buckets_partial_nr; i++) - bch2_open_bucket_to_text(out, c, - c->open_buckets + c->open_buckets_partial[i]); - - spin_unlock(&c->freelist_lock); - --out->atomic; -} - -static const char * const bch2_write_point_states[] = { -#define x(n) #n, - WRITE_POINT_STATES() -#undef x - NULL -}; - -static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, - struct write_point *wp) -{ - struct open_bucket *ob; - unsigned i; - - mutex_lock(&wp->lock); - - prt_printf(out, "%lu: ", wp->write_point); - prt_human_readable_u64(out, wp->sectors_allocated << 9); - - prt_printf(out, " last wrote: "); - bch2_pr_time_units(out, sched_clock() - wp->last_used); - - for (i = 0; i < WRITE_POINT_STATE_NR; i++) { - prt_printf(out, " %s: ", bch2_write_point_states[i]); - bch2_pr_time_units(out, wp->time[i]); - } - - prt_newline(out); - - printbuf_indent_add(out, 2); - open_bucket_for_each(c, &wp->ptrs, ob, i) - bch2_open_bucket_to_text(out, c, ob); - printbuf_indent_sub(out, 2); - - mutex_unlock(&wp->lock); -} - -void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct write_point *wp; - - prt_str(out, "Foreground write points\n"); - for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) - bch2_write_point_to_text(out, c, wp); - - prt_str(out, "Copygc write point\n"); - bch2_write_point_to_text(out, c, &c->copygc_write_point); - - prt_str(out, "Rebalance write point\n"); - bch2_write_point_to_text(out, c, &c->rebalance_write_point); - - prt_str(out, "Btree write point\n"); - bch2_write_point_to_text(out, c, &c->btree_write_point); -} - -void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) -{ - unsigned nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); - - for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].data_type]++; - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 24); - - prt_printf(out, "capacity\t%llu\n", c->capacity); - prt_printf(out, "reserved\t%llu\n", c->reserved); - prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden)); - prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree)); - prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data)); - prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached)); - prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved)); - prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); - prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes)); - - prt_newline(out); - prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty"); - prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); - prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT); - prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? 
"waiting" : "empty"); - prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]); - prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]); - prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr); -} - -void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca); - unsigned nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); - - for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].data_type]++; - - bch2_dev_usage_to_text(out, ca, &stats); - - prt_newline(out); - - prt_printf(out, "reserves:\n"); - for (unsigned i = 0; i < BCH_WATERMARK_NR; i++) - prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i)); - - prt_newline(out); - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 12); - printbuf_tabstop_push(out, 16); - - prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); - prt_printf(out, "buckets to invalidate\t%llu\r\n", - should_invalidate_buckets(ca, bch2_dev_usage_read(ca))); -} - -static noinline void bch2_print_allocator_stuck(struct bch_fs *c) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n", - c->opts.allocator_stuck_timeout); - - prt_printf(&buf, "Allocator debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_fs_alloc_debug_to_text(&buf, c); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - - bch2_printbuf_make_room(&buf, 4096); - - buf.atomic++; - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) { - prt_printf(&buf, "Dev %u:\n", ca->dev_idx); - printbuf_indent_add(&buf, 2); - bch2_dev_alloc_debug_to_text(&buf, ca); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - --buf.atomic; - - prt_printf(&buf, "Copygc debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_copygc_wait_to_text(&buf, c); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - - prt_printf(&buf, "Journal debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_journal_debug_to_text(&buf, &c->journal); - printbuf_indent_sub(&buf, 2); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - -static inline unsigned allocator_wait_timeout(struct bch_fs *c) -{ - if (c->allocator_last_stuck && - time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies)) - return 0; - - return c->opts.allocator_stuck_timeout * HZ; -} - -void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) -{ - unsigned t = allocator_wait_timeout(c); - - if (t && closure_sync_timeout(cl, t)) { - c->allocator_last_stuck = jiffies; - bch2_print_allocator_stuck(c); - } - - closure_sync(cl); -} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h deleted file mode 100644 index 1b3fc84600963c..00000000000000 --- a/fs/bcachefs/alloc_foreground.h +++ /dev/null @@ -1,318 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_FOREGROUND_H -#define _BCACHEFS_ALLOC_FOREGROUND_H - -#include "bcachefs.h" -#include "buckets.h" -#include "alloc_types.h" -#include "extents.h" -#include "io_write_types.h" -#include "sb-members.h" - -#include - -struct bkey; -struct bch_dev; -struct bch_fs; -struct bch_devs_List; - -extern const char * const bch2_watermarks[]; - -void bch2_reset_alloc_cursors(struct bch_fs *); - -struct dev_alloc_list { - unsigned nr; - u8 data[BCH_SB_MEMBERS_MAX]; -}; - -struct alloc_request { - unsigned nr_replicas; - unsigned target; - bool ec; - enum bch_watermark watermark; - enum bch_write_flags flags; - 
enum bch_data_type data_type; - struct bch_devs_list *devs_have; - struct write_point *wp; - - /* These fields are used primarily by open_bucket_add_buckets */ - struct open_buckets ptrs; - unsigned nr_effective; /* sum of @ptrs durability */ - bool have_cache; /* have we allocated from a 0 durability dev */ - struct bch_devs_mask devs_may_alloc; - - /* bch2_bucket_alloc_set_trans(): */ - struct dev_alloc_list devs_sorted; - struct bch_dev_usage usage; - - /* bch2_bucket_alloc_trans(): */ - struct bch_dev *ca; - - enum { - BTREE_BITMAP_NO, - BTREE_BITMAP_YES, - BTREE_BITMAP_ANY, - } btree_bitmap; - - struct { - u64 buckets_seen; - u64 skipped_open; - u64 skipped_need_journal_commit; - u64 need_journal_commit; - u64 skipped_nocow; - u64 skipped_nouse; - u64 skipped_mi_btree_bitmap; - } counters; - - unsigned scratch_nr_replicas; - unsigned scratch_nr_effective; - bool scratch_have_cache; - enum bch_data_type scratch_data_type; - struct open_buckets scratch_ptrs; - struct bch_devs_mask scratch_devs_may_alloc; -}; - -void bch2_dev_alloc_list(struct bch_fs *, - struct dev_stripe_state *, - struct bch_devs_mask *, - struct dev_alloc_list *); -void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); - -static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) -{ - return bch2_dev_have_ref(c, ob->dev); -} - -static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) -{ - switch (watermark) { - case BCH_WATERMARK_interior_updates: - return 0; - case BCH_WATERMARK_reclaim: - return OPEN_BUCKETS_COUNT / 6; - case BCH_WATERMARK_btree: - case BCH_WATERMARK_btree_copygc: - return OPEN_BUCKETS_COUNT / 4; - case BCH_WATERMARK_copygc: - return OPEN_BUCKETS_COUNT / 3; - default: - return OPEN_BUCKETS_COUNT / 2; - } -} - -struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum bch_watermark, enum bch_data_type, - struct closure *); - -static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, - struct open_bucket *ob) -{ - BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); - - obs->v[obs->nr++] = ob - c->open_buckets; -} - -#define open_bucket_for_each(_c, _obs, _ob, _i) \ - for ((_i) = 0; \ - (_i) < (_obs)->nr && \ - ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ - (_i)++) - -static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, - struct open_buckets *obs) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, obs, ob, i) - if (ob->ec) - return ob; - - return NULL; -} - -void bch2_open_bucket_write_error(struct bch_fs *, - struct open_buckets *, unsigned, int); - -void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); - -static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - if (atomic_dec_and_test(&ob->pin)) - __bch2_open_bucket_put(c, ob); -} - -static inline void bch2_open_buckets_put(struct bch_fs *c, - struct open_buckets *ptrs) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, ptrs, ob, i) - bch2_open_bucket_put(c, ob); - ptrs->nr = 0; -} - -static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp) -{ - struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) - ob_push(c, ob->sectors_free < block_sectors(c) - ? 
&ptrs - : &keep, ob); - wp->ptrs = keep; - - mutex_unlock(&wp->lock); - - bch2_open_buckets_put(c, &ptrs); -} - -static inline void bch2_open_bucket_get(struct bch_fs *c, - struct write_point *wp, - struct open_buckets *ptrs) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) { - ob->data_type = wp->data_type; - atomic_inc(&ob->pin); - ob_push(c, ptrs, ob); - } -} - -static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, - unsigned dev, u64 bucket) -{ - return c->open_buckets_hash + - (jhash_3words(dev, bucket, bucket >> 32, 0) & - (OPEN_BUCKETS_COUNT - 1)); -} - -static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) -{ - open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); - - while (slot) { - struct open_bucket *ob = &c->open_buckets[slot]; - - if (ob->dev == dev && ob->bucket == bucket) - return true; - - slot = ob->hash; - } - - return false; -} - -static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) -{ - bool ret; - - if (bch2_bucket_is_open(c, dev, bucket)) - return true; - - spin_lock(&c->freelist_lock); - ret = bch2_bucket_is_open(c, dev, bucket); - spin_unlock(&c->freelist_lock); - - return ret; -} - -enum bch_write_flags; -int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, - struct dev_stripe_state *, struct closure *); - -int bch2_alloc_sectors_start_trans(struct btree_trans *, - unsigned, unsigned, - struct write_point_specifier, - struct bch_devs_list *, - unsigned, unsigned, - enum bch_watermark, - enum bch_write_flags, - struct closure *, - struct write_point **); - -static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - return (struct bch_extent_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = ob->gen, - .dev = ob->dev, - .offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free, - }; -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -static inline void -bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, - struct bkey_i *k, unsigned sectors, - bool cached) -{ - struct open_bucket *ob; - unsigned i; - - BUG_ON(sectors > wp->sectors_free); - wp->sectors_free -= sectors; - wp->sectors_allocated += sectors; - - open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = ob_dev(c, ob); - struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); - - ptr.cached = cached || - (!ca->mi.durability && - wp->data_type == BCH_DATA_user); - - bch2_bkey_append_ptr(k, ptr); - - BUG_ON(sectors > ob->sectors_free); - ob->sectors_free -= sectors; - } -} - -void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, - struct bkey_i *, unsigned, bool); -void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); - -void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool); - -static inline struct write_point_specifier writepoint_hashed(unsigned long v) -{ - return (struct write_point_specifier) { .v = v | 1 }; -} - -static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) -{ - return (struct write_point_specifier) { .v = (unsigned long) wp }; -} - -void bch2_fs_allocator_foreground_init(struct bch_fs *); - -void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *); -void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct 
bch_dev *); -void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); - -void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); -void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); - -void __bch2_wait_on_allocator(struct bch_fs *, struct closure *); -static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) -{ - if (cl->closure_get_happened) - __bch2_wait_on_allocator(c, cl); -} - -#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h deleted file mode 100644 index e7becdf22cbafd..00000000000000 --- a/fs/bcachefs/alloc_types.h +++ /dev/null @@ -1,121 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_TYPES_H -#define _BCACHEFS_ALLOC_TYPES_H - -#include <linux/mutex.h> -#include <linux/spinlock.h> - -#include "clock_types.h" -#include "fifo.h" - -#define BCH_WATERMARKS() \ - x(stripe) \ - x(normal) \ - x(copygc) \ - x(btree) \ - x(btree_copygc) \ - x(reclaim) \ - x(interior_updates) - -enum bch_watermark { -#define x(name) BCH_WATERMARK_##name, - BCH_WATERMARKS() -#undef x - BCH_WATERMARK_NR, -}; - -#define BCH_WATERMARK_BITS 3 -#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS) - -#define OPEN_BUCKETS_COUNT 1024 - -#define WRITE_POINT_HASH_NR 32 -#define WRITE_POINT_MAX 32 - -/* - * 0 is never a valid open_bucket_idx_t: - */ -typedef u16 open_bucket_idx_t; - -struct open_bucket { - spinlock_t lock; - atomic_t pin; - open_bucket_idx_t freelist; - open_bucket_idx_t hash; - - /* - * When an open bucket has an ec_stripe attached, this is the index of - * the block in the stripe this open_bucket corresponds to: - */ - u8 ec_idx; - enum bch_data_type data_type:6; - unsigned valid:1; - unsigned on_partial_list:1; - - u8 dev; - u8 gen; - u32 sectors_free; - u64 bucket; - struct ec_stripe_new *ec; -}; - -#define OPEN_BUCKET_LIST_MAX 15 - -struct open_buckets { - open_bucket_idx_t nr; - open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; -}; - -struct dev_stripe_state { - u64 next_alloc[BCH_SB_MEMBERS_MAX]; -}; - -#define WRITE_POINT_STATES() \ - x(stopped) \ - x(waiting_io) \ - x(waiting_work) \ - x(runnable) \ - x(running) - -enum write_point_state { -#define x(n) WRITE_POINT_##n, - WRITE_POINT_STATES() -#undef x - WRITE_POINT_STATE_NR -}; - -struct write_point { - struct { - struct hlist_node node; - struct mutex lock; - u64 last_used; - unsigned long write_point; - enum bch_data_type data_type; - - /* calculated based on how many pointers we're actually going to use: */ - unsigned sectors_free; - - struct open_buckets ptrs; - struct dev_stripe_state stripe; - - u64 sectors_allocated; - } __aligned(SMP_CACHE_BYTES); - - struct { - struct work_struct index_update_work; - - struct list_head writes; - spinlock_t writes_lock; - - enum write_point_state state; - u64 last_state_change; - u64 time[WRITE_POINT_STATE_NR]; - u64 last_runtime; - } __aligned(SMP_CACHE_BYTES); -}; - -struct write_point_specifier { - unsigned long v; -}; - -#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c deleted file mode 100644 index a7cd1f0f09647a..00000000000000 --- a/fs/bcachefs/async_objs.c +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Async obj debugging: keep asynchronous objects on (very fast) lists, make - * them visible in debugfs: - */ - -#include "bcachefs.h" -#include "async_objs.h" -#include "btree_io.h" -#include "debug.h" -#include "io_read.h"
-#include "io_write.h" - -#include - -static void promote_obj_to_text(struct printbuf *out, void *obj) -{ - bch2_promote_op_to_text(out, obj); -} - -static void rbio_obj_to_text(struct printbuf *out, void *obj) -{ - bch2_read_bio_to_text(out, obj); -} - -static void write_op_obj_to_text(struct printbuf *out, void *obj) -{ - bch2_write_op_to_text(out, obj); -} - -static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj) -{ - struct btree_read_bio *rbio = obj; - bch2_btree_read_bio_to_text(out, rbio); -} - -static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj) -{ - struct btree_write_bio *wbio = obj; - bch2_bio_to_text(out, &wbio->wbio.bio); -} - -static int bch2_async_obj_list_open(struct inode *inode, struct file *file) -{ - struct async_obj_list *list = inode->i_private; - struct dump_iter *i; - - i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) - return -ENOMEM; - - file->private_data = i; - i->from = POS_MIN; - i->iter = 0; - i->c = container_of(list, struct bch_fs, async_objs[list->idx]); - i->list = list; - i->buf = PRINTBUF; - return 0; -} - -static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct async_obj_list *list = i->list; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - struct genradix_iter iter; - void *obj; - fast_list_for_each_from(&list->list, iter, obj, i->iter) { - ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - - if (!i->size) - break; - - list->obj_to_text(&i->buf, obj); - } - - if (i->buf.allocation_failure) - ret = -ENOMEM; - else - i->iter = iter.pos; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static const struct file_operations async_obj_ops = { - .owner = THIS_MODULE, - .open = bch2_async_obj_list_open, - .release = bch2_dump_release, - .read = bch2_async_obj_list_read, -}; - -void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) -{ - c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir); - -#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \ - &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops); - BCH_ASYNC_OBJ_LISTS() -#undef x -} - -void bch2_fs_async_obj_exit(struct bch_fs *c) -{ - for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) - fast_list_exit(&c->async_objs[i].list); -} - -int bch2_fs_async_obj_init(struct bch_fs *c) -{ - for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) { - if (fast_list_init(&c->async_objs[i].list)) - return -BCH_ERR_ENOMEM_async_obj_init; - c->async_objs[i].idx = i; - } - -#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text; - BCH_ASYNC_OBJ_LISTS() -#undef x - - return 0; -} diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h deleted file mode 100644 index cd6489b8cf7645..00000000000000 --- a/fs/bcachefs/async_objs.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ASYNC_OBJS_H -#define _BCACHEFS_ASYNC_OBJS_H - -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -static inline void __async_object_list_del(struct fast_list *head, unsigned idx) -{ - fast_list_remove(head, idx); -} - -static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx) -{ - int ret = fast_list_add(head, obj); - *idx = ret > 0 ? ret : 0; - return ret < 0 ? 
ret : 0; -} - -#define async_object_list_del(_c, _list, idx) \ - __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx) - -#define async_object_list_add(_c, _list, obj, idx) \ - __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx) - -void bch2_fs_async_obj_debugfs_init(struct bch_fs *); -void bch2_fs_async_obj_exit(struct bch_fs *); -int bch2_fs_async_obj_init(struct bch_fs *); - -#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ - -#define async_object_list_del(_c, _n, idx) do {} while (0) - -static inline int __async_object_list_add(void) -{ - return 0; -} -#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add() - -static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {} -static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {} -static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; } - -#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ - -#endif /* _BCACHEFS_ASYNC_OBJS_H */ diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h deleted file mode 100644 index 8d713c0f5841d7..00000000000000 --- a/fs/bcachefs/async_objs_types.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H -#define _BCACHEFS_ASYNC_OBJS_TYPES_H - -#define BCH_ASYNC_OBJ_LISTS() \ - x(promote) \ - x(rbio) \ - x(write_op) \ - x(btree_read_bio) \ - x(btree_write_bio) - -enum bch_async_obj_lists { -#define x(n) BCH_ASYNC_OBJ_LIST_##n, - BCH_ASYNC_OBJ_LISTS() -#undef x - BCH_ASYNC_OBJ_NR -}; - -struct async_obj_list { - struct fast_list list; - void (*obj_to_text)(struct printbuf *, void *); - unsigned idx; -}; - -#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c deleted file mode 100644 index 77d93beb3c8f50..00000000000000 --- a/fs/bcachefs/backpointers.c +++ /dev/null @@ -1,1391 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bbpos.h" -#include "alloc_background.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "checksum.h" -#include "disk_accounting.h" -#include "error.h" -#include "progress.h" -#include "recovery_passes.h" - -#include <linux/mm.h> - -static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64); - -static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) -{ - return (struct bbpos) { - .btree = bp.btree_id, - .pos = bp.pos, - }; -} - -int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - int ret = 0; - - bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH, - c, backpointer_level_bad, - "backpointer level bad: %u >= %u", - bp.v->level, BTREE_MAX_DEPTH); - - bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, - c, backpointer_dev_bad, - "backpointer for BCH_SB_MEMBER_INVALID"); -fsck_err: - return ret; -} - -void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - - struct bch_dev *ca; - u32 bucket_offset; - struct bpos bucket; - scoped_guard(rcu) { - ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (ca) - bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); - } - - if (ca) - prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset,
bucket_offset); - else - prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); - - bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); - prt_str(out, " data_type="); - bch2_prt_data_type(out, bp.v->data_type); - prt_printf(out, " suboffset=%u len=%u gen=%u pos=", - (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp.v->bucket_len, - bp.v->bucket_gen); - bch2_bpos_to_text(out, bp.v->pos); -} - -void bch2_backpointer_swab(struct bkey_s k) -{ - struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); - - bp.v->bucket_len = swab32(bp.v->bucket_len); - bch2_bpos_swab(&bp.v->pos); -} - -static bool extent_matches_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - struct bkey_s_c_backpointer bp) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bkey_i_backpointer bp2; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); - - if (bpos_eq(bp.k->p, bp2.k.p) && - !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) - return true; - } - - return false; -} - -static noinline int backpointer_mod_err(struct btree_trans *trans, - struct bkey_s_c orig_k, - struct bkey_i_backpointer *new_bp, - struct bkey_s_c found_bp, - bool insert) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - bool will_check = c->recovery.passes_to_run & - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); - int ret = 0; - - if (insert) { - prt_printf(&buf, "existing backpointer found when inserting "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "found "); - bch2_bkey_val_to_text(&buf, c, found_bp); - prt_newline(&buf); - - prt_printf(&buf, "for "); - bch2_bkey_val_to_text(&buf, c, orig_k); - } else if (!will_check) { - prt_printf(&buf, "backpointer not found when deleting\n"); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "searching for "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); - prt_newline(&buf); - - prt_printf(&buf, "got "); - bch2_bkey_val_to_text(&buf, c, found_bp); - prt_newline(&buf); - - prt_printf(&buf, "for "); - bch2_bkey_val_to_text(&buf, c, orig_k); - } - - if (!will_check && __bch2_inconsistent_error(c, &buf)) - ret = bch_err_throw(c, erofs_unfixed_errors); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - return ret; -} - -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bkey_s_c orig_k, - struct bkey_i_backpointer *bp, - bool insert) -{ - struct btree_iter bp_iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bp->k.p, - BTREE_ITER_intent| - BTREE_ITER_slots| - BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - - if (insert - ? k.k->type - : (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { - ret = backpointer_mod_err(trans, orig_k, bp, k, insert); - if (ret) - goto err; - } - - if (!insert) { - bp->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp->k, 0); - } - - ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); -err: - bch2_trans_iter_exit(trans, &bp_iter); - return ret; -} - -static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) -{ - return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) - ? 
bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) - : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - -static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, - struct bkey_s_c visiting_k, - struct bkey_buf *last_flushed) -{ - return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) - ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) - : 0; -} - -static int backpointer_target_not_found(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct bkey_s_c target_k, - struct bkey_buf *last_flushed, - bool commit) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - /* - * If we're using the btree write buffer, the backpointer we were - * looking at may have already been deleted - failure to find what it - * pointed to is not an error: - */ - ret = last_flushed - ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) - : 0; - if (ret) - return ret; - - prt_printf(&buf, "backpointer doesn't match %s it points to:\n", - bp.v->level ? "btree node" : "extent"); - bch2_bkey_val_to_text(&buf, c, bp.s_c); - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, target_k); - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) - if (p.ptr.dev == bp.k->p.inode) { - prt_newline(&buf); - struct bkey_i_backpointer bp2; - bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); - } - - if (fsck_err(trans, backpointer_to_missing_ptr, - "%s", buf.buf)) { - ret = bch2_backpointer_del(trans, bp.k->p); - if (ret || !commit) - goto out; - - /* - * Normally, on transaction commit from inside a transaction, - * we'll return -BCH_ERR_transaction_restart_nested, since a - * transaction commit invalidates pointers given out by peek(). - * - * However, since we're updating a write buffer btree, if we - * return a transaction restart and loop we won't see that the - * backpointer has been deleted without an additional write - * buffer flush - and those are expensive. - * - * So we're relying on the caller immediately advancing to the - * next backpointer and starting a new transaction immediately - * after backpointer_get_key() returns NULL: - */ - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - } -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - struct bkey_buf *last_flushed, - bool commit) -{ - struct bch_fs *c = trans->c; - - BUG_ON(!bp.v->level); - - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, - bp.v->level - 1, - 0); - struct btree *b = bch2_btree_iter_peek_node(trans, iter); - if (IS_ERR_OR_NULL(b)) - goto err; - - BUG_ON(b->c.level != bp.v->level - 1); - - if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, - bkey_i_to_s_c(&b->key), bp)) - return b; - - if (btree_node_will_make_reachable(b)) { - b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)); - } else { - int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), - last_flushed, commit); - b = ret ? 
ERR_PTR(ret) : NULL; - } -err: - bch2_trans_iter_exit(trans, iter); - return b; -} - -static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - unsigned iter_flags, - struct bkey_buf *last_flushed, - bool commit) -{ - struct bch_fs *c = trans->c; - - if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) - return bkey_s_c_null; - - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, - bp.v->level, - iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); - if (bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - /* - * peek_slot() doesn't normally return NULL - except when we ask for a - * key at a btree level that doesn't exist. - * - * We may want to revisit this and change peek_slot(): - */ - if (!k.k) { - bkey_init(&iter->k); - iter->k.p = bp.v->pos; - k.k = &iter->k; - } - - if (k.k && - extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) - return k; - - bch2_trans_iter_exit(trans, iter); - - if (!bp.v->level) { - int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit); - return ret ? bkey_s_c_err(ret) : bkey_s_c_null; - } else { - struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); - if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) - return bkey_s_c_null; - if (IS_ERR_OR_NULL(b)) - return ((struct bkey_s_c) { .k = ERR_CAST(b) }); - - return bkey_i_to_s_c(&b->key); - } -} - -struct btree *bch2_backpointer_get_node(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - struct bkey_buf *last_flushed) -{ - return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true); -} - -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - unsigned iter_flags, - struct bkey_buf *last_flushed) -{ - return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true); -} - -static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, - struct bkey_buf *last_flushed) -{ - if (k.k->type != KEY_TYPE_backpointer) - return 0; - - struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = {}; - struct bkey_s_c alloc_k; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bpos bucket; - if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { - ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); - if (ret) - goto out; - - if (fsck_err(trans, backpointer_to_missing_device, - "backpointer for missing device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_backpointer_del(trans, k.k->p); - goto out; - } - - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); - ret = bkey_err(alloc_k); - if (ret) - goto out; - - if (alloc_k.k->type != KEY_TYPE_alloc_v4) { - ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); - if (ret) - goto out; - - if (fsck_err(trans, backpointer_to_missing_alloc, - "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", - alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_backpointer_del(trans, k.k->p); - } -out: -fsck_err: - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); - return ret; -} - -/* verify that every backpointer has a corresponding alloc key */ -int bch2_check_btree_backpointers(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - 
bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_backpointers, POS_MIN, 0, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; -} - -struct extents_to_bp_state { - struct bpos bp_start; - struct bpos bp_end; - struct bkey_buf last_flushed; -}; - -static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, - struct bkey_s_c extent, unsigned dev) -{ - struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - bch2_bkey_drop_device(bkey_i_to_s(n), dev); - return bch2_btree_insert_trans(trans, btree, n, 0); -} - -static int check_extent_checksum(struct btree_trans *trans, - enum btree_id btree, struct bkey_s_c extent, - enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct printbuf buf = PRINTBUF; - void *data_buf = NULL; - struct bio *bio = NULL; - size_t bytes; - int ret = 0; - - if (bkey_is_btree_ptr(extent.k)) - return false; - - bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) - if (p.ptr.dev == dev) - goto found; - BUG(); -found: - if (!p.crc.csum_type) - return false; - - bytes = p.crc.compressed_size << 9; - - struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ, - BCH_DEV_READ_REF_check_extent_checksums); - if (!ca) - return false; - - data_buf = kvmalloc(bytes, GFP_KERNEL); - if (!data_buf) { - ret = -ENOMEM; - goto err; - } - - bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL); - bio->bi_iter.bi_sector = p.ptr.offset; - bch2_bio_map(bio, data_buf, bytes); - ret = submit_bio_wait(bio); - if (ret) - goto err; - - prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n"); - bch2_btree_id_to_text(&buf, btree); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, extent); - prt_newline(&buf); - bch2_btree_id_to_text(&buf, o_btree); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, extent2); - - struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); - struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); - if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), - trans, dup_backpointer_to_bad_csum_extent, - "%s", buf.buf)) - ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; -fsck_err: -err: - if (bio) - bio_put(bio); - kvfree(data_buf); - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_check_extent_checksums); - printbuf_exit(&buf); - return ret; -} - -static int check_bp_exists(struct btree_trans *trans, - struct extents_to_bp_state *s, - struct bkey_i_backpointer *bp, - struct bkey_s_c orig_k) -{ - struct bch_fs *c = trans->c; - struct btree_iter other_extent_iter = {}; - struct printbuf buf = PRINTBUF; - - if (bpos_lt(bp->k.p, s->bp_start) || - bpos_gt(bp->k.p, s->bp_end)) - return 0; - - struct btree_iter bp_iter; - struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); - int ret = bkey_err(bp_k); - if (ret) - goto err; - - if (bp_k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { - ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); - if (ret) - goto err; - - 
goto check_existing_bp; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &other_extent_iter); - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - return ret; -check_existing_bp: - /* Do we have a backpointer for a different extent? */ - if (bp_k.k->type != KEY_TYPE_backpointer) - goto missing; - - struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); - - struct bkey_s_c other_extent = - __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false); - ret = bkey_err(other_extent); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - ret = 0; - if (ret) - goto err; - - if (!other_extent.k) - goto missing; - - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode); - if (ca) { - struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent); - bkey_for_each_ptr(other_extent_ptrs, ptr) - if (ptr->dev == bp->k.p.inode && - dev_ptr_stale_rcu(ca, ptr)) { - rcu_read_unlock(); - ret = drop_dev_and_update(trans, other_bp.v->btree_id, - other_extent, bp->k.p.inode); - if (ret) - goto err; - goto out; - } - } - rcu_read_unlock(); - - if (bch2_extents_match(orig_k, other_extent)) { - printbuf_reset(&buf); - prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n"); - bch2_bkey_val_to_text(&buf, c, orig_k); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, other_extent); - bch_err(c, "%s", buf.buf); - - if (other_extent.k->size <= orig_k.k->size) { - ret = drop_dev_and_update(trans, other_bp.v->btree_id, - other_extent, bp->k.p.inode); - if (ret) - goto err; - goto out; - } else { - ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); - if (ret) - goto err; - goto missing; - } - } - - ret = check_extent_checksum(trans, - other_bp.v->btree_id, other_extent, - bp->v.btree_id, orig_k, - bp->k.p.inode); - if (ret < 0) - goto err; - if (ret) { - ret = 0; - goto missing; - } - - ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, - other_bp.v->btree_id, other_extent, bp->k.p.inode); - if (ret < 0) - goto err; - if (ret) { - ret = 0; - goto out; - } - - printbuf_reset(&buf); - prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode); - bch2_bkey_val_to_text(&buf, c, orig_k); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, other_extent); - bch_err(c, "%s", buf.buf); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; -missing: - printbuf_reset(&buf); - prt_str(&buf, "missing backpointer\nfor: "); - bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\nwant: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); - prt_printf(&buf, "\ngot: "); - bch2_bkey_val_to_text(&buf, c, bp_k); - - if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); - - goto out; -} - -static int check_extent_to_backpointers(struct btree_trans *trans, - struct extents_to_bp_state *s, - enum btree_id btree, unsigned level, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.dev == BCH_SB_MEMBER_INVALID) - continue; - - bool empty; - { - /* scoped_guard() is a loop, so it breaks continue */ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - if (!ca) - continue; - - if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) - continue; - - u64 b = 
PTR_BUCKET_NR(ca, &p.ptr); - if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) - continue; - - empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); - } - - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); - - int ret = !empty - ? check_bp_exists(trans, s, &bp, k) - : bch2_bucket_backpointer_mod(trans, k, &bp, true); - if (ret) - return ret; - } - - return 0; -} - -static int check_btree_root_to_backpointers(struct btree_trans *trans, - struct extents_to_bp_state *s, - enum btree_id btree_id, - int *level) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct btree *b; - struct bkey_s_c k; - int ret; -retry: - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, - 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); - b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err; - - if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); - goto retry; - } - - *level = b->c.level; - - k = bkey_i_to_s_c(&b->key); - ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static u64 mem_may_pin_bytes(struct bch_fs *c) -{ - struct sysinfo i; - si_meminfo(&i); - - u64 mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); -} - -static size_t btree_nodes_fit_in_ram(struct bch_fs *c) -{ - return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); -} - -static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - u64 btree_leaf_mask, - u64 btree_interior_mask, - struct bbpos start, struct bbpos *end) -{ - struct bch_fs *c = trans->c; - s64 mem_may_pin = mem_may_pin_bytes(c); - int ret = 0; - - bch2_btree_cache_unpin(c); - - btree_interior_mask |= btree_leaf_mask; - - c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask; - c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask; - c->btree_cache.pinned_nodes_start = start; - c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; - - for (enum btree_id btree = start.btree; - btree < BTREE_ID_NR && !ret; - btree++) { - unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; - - if (!(BIT_ULL(btree) & btree_leaf_mask) && - !(BIT_ULL(btree) & btree_interior_mask)) - continue; - - ret = __for_each_btree_node(trans, iter, btree, - btree == start.btree ? start.pos : POS_MIN, - 0, depth, BTREE_ITER_prefetch, b, ({ - mem_may_pin -= btree_buf_bytes(b); - if (mem_may_pin <= 0) { - c->btree_cache.pinned_nodes_end = *end = - BBPOS(btree, b->key.k.p); - break; - } - bch2_node_pin(c, b); - 0; - })); - } - - return ret; -} - -static inline int bch2_fs_going_ro(struct bch_fs *c) -{ - return test_bit(BCH_FS_going_ro, &c->flags) - ? -EROFS - : 0; -} - -static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct extents_to_bp_state *s) -{ - struct bch_fs *c = trans->c; - struct progress_indicator_state progress; - int ret = 0; - - bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); - - for (enum btree_id btree_id = 0; - btree_id < btree_id_nr_alive(c); - btree_id++) { - int level, depth = btree_type_has_ptrs(btree_id) ? 
0 : 1; - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - check_btree_root_to_backpointers(trans, s, btree_id, &level)); - if (ret) - return ret; - - while (level >= depth) { - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); - bch2_fs_going_ro(c) ?: - check_extent_to_backpointers(trans, s, btree_id, level, k) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - })); - if (ret) - return ret; - - --level; - } - } - - return 0; -} - -enum alloc_sector_counter { - ALLOC_dirty, - ALLOC_cached, - ALLOC_stripe, - ALLOC_SECTORS_NR -}; - -static int data_type_to_alloc_counter(enum bch_data_type t) -{ - switch (t) { - case BCH_DATA_btree: - case BCH_DATA_user: - return ALLOC_dirty; - case BCH_DATA_cached: - return ALLOC_cached; - case BCH_DATA_stripe: - case BCH_DATA_parity: - return ALLOC_stripe; - default: - return -1; - } -} - -static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); - -static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, - bool *had_mismatch, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - bool need_commit = false; - - *had_mismatch = false; - - if (a->data_type == BCH_DATA_sb || - a->data_type == BCH_DATA_journal || - a->data_type == BCH_DATA_parity) - return 0; - - u32 sectors[ALLOC_SECTORS_NR]; - memset(sectors, 0, sizeof(sectors)); - - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); - if (!ca) - return 0; - - struct btree_iter iter; - struct bkey_s_c bp_k; - int ret = 0; - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, alloc_k.k->p), - bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { - if (bp_k.k->type != KEY_TYPE_backpointer) - continue; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && - (bp.v->bucket_gen != a->gen || - bp.v->pad)) { - ret = bch2_backpointer_del(trans, bp_k.k->p); - if (ret) - break; - - need_commit = true; - continue; - } - - if (bp.v->bucket_gen != a->gen) - continue; - - int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); - if (alloc_counter < 0) - continue; - - sectors[alloc_counter] += bp.v->bucket_len; - }; - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; - - if (need_commit) { - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - } - - if (sectors[ALLOC_dirty] != a->dirty_sectors || - sectors[ALLOC_cached] != a->cached_sectors || - sectors[ALLOC_stripe] != a->stripe_sectors) { - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { - ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); - if (ret) - goto err; - } - - if (sectors[ALLOC_dirty] > a->dirty_sectors || - sectors[ALLOC_cached] > a->cached_sectors || - sectors[ALLOC_stripe] > a->stripe_sectors) { - ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: - bch_err_throw(c, transaction_restart_nested); - goto err; - } - - bool empty = (sectors[ALLOC_dirty] + - sectors[ALLOC_stripe] + - sectors[ALLOC_cached]) 
== 0; - - ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch, - alloc_k.k->p.offset) ?: - (empty - ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty, - alloc_k.k->p.offset) - : 0); - - *had_mismatch = true; - } -err: - bch2_dev_put(ca); - return ret; -} - -static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr_v2: { - bool ret = false; - - guard(rcu)(); - struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; - while (pos.inode <= k.k->p.inode) { - if (pos.inode >= c->sb.nr_devices) - break; - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); - if (!ca) - goto next; - - struct bpos bucket = bp_pos_to_bucket(ca, pos); - u64 next = ca->mi.nbuckets; - - unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); - if (bitmap) - next = min_t(u64, next, - find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset)); - - bucket.offset = next; - if (bucket.offset == ca->mi.nbuckets) - goto next; - - ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); - if (ret) - break; -next: - pos = SPOS(pos.inode + 1, 0, 0); - } - - return ret; - } - case KEY_TYPE_btree_ptr: - return true; - default: - return false; - } -} - -static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, - enum btree_id btree, unsigned level) -{ - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err; - - if (b) - bch2_node_pin(trans->c, b); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, - struct bpos start, struct bpos *end) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - struct bkey_buf tmp; - bch2_bkey_buf_init(&tmp); - - bch2_btree_cache_unpin(c); - - *end = SPOS_MAX; - - s64 mem_may_pin = mem_may_pin_bytes(c); - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, - 0, 1, BTREE_ITER_prefetch); - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - if (!backpointer_node_has_missing(c, k)) - continue; - - mem_may_pin -= c->opts.btree_node_size; - if (mem_may_pin <= 0) - break; - - bch2_bkey_buf_reassemble(&tmp, c, k); - struct btree_path *path = btree_iter_path(trans, &iter); - - BUG_ON(path->level != 1); - - bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); - })); - if (ret) - return ret; - - struct bpos pinned = SPOS_MAX; - mem_may_pin = mem_may_pin_bytes(c); - bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, - 0, 1, BTREE_ITER_prefetch); - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - if (!backpointer_node_has_missing(c, k)) - continue; - - mem_may_pin -= c->opts.btree_node_size; - if (mem_may_pin <= 0) { - *end = pinned; - break; - } - - bch2_bkey_buf_reassemble(&tmp, c, k); - struct btree_path *path = btree_iter_path(trans, &iter); - - BUG_ON(path->level != 1); - - int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); - - if (!ret2) - pinned = tmp.k->k.p; - - ret; - })); - if (ret) - return ret; - - return ret; -} - -int bch2_check_extents_to_backpointers(struct bch_fs *c) -{ - int ret = 0; - - struct btree_trans *trans = bch2_trans_get(c); - struct extents_to_bp_state s = { .bp_start = POS_MIN }; - - bch2_bkey_buf_init(&s.last_flushed); - bkey_init(&s.last_flushed.k->k); - - ret = 
for_each_btree_key(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_prefetch, k, ({ - bool had_mismatch; - bch2_fs_going_ro(c) ?: - check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed); - })); - if (ret) - goto err; - - u64 nr_buckets = 0, nr_mismatches = 0; - for_each_member_device(c, ca) { - nr_buckets += ca->mi.nbuckets; - nr_mismatches += ca->bucket_backpointer_mismatch.nr; - } - - if (!nr_mismatches) - goto err; - - bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", - nr_mismatches, nr_buckets); - - while (1) { - ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); - if (ret) - break; - - if ( bpos_eq(s.bp_start, POS_MIN) && - !bpos_eq(s.bp_end, SPOS_MAX)) - bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", - __func__, btree_nodes_fit_in_ram(c)); - - if (!bpos_eq(s.bp_start, POS_MIN) || - !bpos_eq(s.bp_end, SPOS_MAX)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, s.bp_start); - prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, s.bp_end); - - bch_verbose(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_check_extents_to_backpointers_pass(trans, &s); - if (ret || bpos_eq(s.bp_end, SPOS_MAX)) - break; - - s.bp_start = bpos_successor(s.bp_end); - } - - for_each_member_device(c, ca) { - bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); - bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); - } -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&s.last_flushed, c); - bch2_btree_cache_unpin(c); - - bch_err_fn(c, ret); - return ret; -} - -static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, - struct bpos bucket, - bool *had_mismatch, - struct bkey_buf *last_flushed) -{ - struct btree_iter alloc_iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, - BTREE_ITER_cached); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - -int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, - struct bch_dev *ca, u64 bucket, - bool copygc, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - bool had_mismatch; - int ret = lockrestart_do(trans, - check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket), - &had_mismatch, last_flushed)); - if (ret || !had_mismatch) - return ret; - - u64 nr = ca->bucket_backpointer_mismatch.nr; - u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0; - - struct printbuf buf = PRINTBUF; - __bch2_log_msg_start(ca->name, &buf); - - prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n", - bucket, nr, ca->mi.nbuckets); - - bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_extents_to_backpointers, - nr < allowed ? 
RUN_RECOVERY_PASS_ratelimit : 0); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return 0; -} - -/* backpointers -> extents */ - -static int check_one_backpointer(struct btree_trans *trans, - struct bbpos start, - struct bbpos end, - struct bkey_s_c bp_k, - struct bkey_buf *last_flushed) -{ - if (bp_k.k->type != KEY_TYPE_backpointer) - return 0; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - struct bbpos pos = bp_to_bbpos(*bp.v); - - if (bbpos_cmp(pos, start) < 0 || - bbpos_cmp(pos, end) > 0) - return 0; - - struct btree_iter iter; - struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); - int ret = bkey_err(k); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - return 0; - if (ret) - return ret; - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int check_bucket_backpointers_to_extents(struct btree_trans *trans, - struct bch_dev *ca, struct bpos bucket) -{ - u32 restart_count = trans->restart_count; - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, bucket), - bucket_pos_to_bp_end(ca, bucket), - 0, k, - check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) - ); - - bch2_bkey_buf_exit(&last_flushed, trans->c); - return ret ?: trans_was_restarted(trans, restart_count); -} - -static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, - struct bbpos start, - struct bbpos end) -{ - struct bch_fs *c = trans->c; - struct bkey_buf last_flushed; - struct progress_indicator_state progress; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); - - int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_prefetch, k, ({ - bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); - check_one_backpointer(trans, start, end, k, &last_flushed); - })); - - bch2_bkey_buf_exit(&last_flushed, c); - return ret; -} - -int bch2_check_backpointers_to_extents(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; - int ret; - - while (1) { - ret = bch2_get_btree_in_memory_pos(trans, - BIT_ULL(BTREE_ID_extents)| - BIT_ULL(BTREE_ID_reflink), - ~0, - start, &end); - if (ret) - break; - - if (!bbpos_cmp(start, BBPOS_MIN) && - bbpos_cmp(end, BBPOS_MAX)) - bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", - __func__, btree_nodes_fit_in_ram(c)); - - if (bbpos_cmp(start, BBPOS_MIN) || - bbpos_cmp(end, BBPOS_MAX)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "check_backpointers_to_extents(): "); - bch2_bbpos_to_text(&buf, start); - prt_str(&buf, "-"); - bch2_bbpos_to_text(&buf, end); - - bch_verbose(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_check_backpointers_to_extents_pass(trans, start, end); - if (ret || !bbpos_cmp(end, BBPOS_MAX)) - break; - - start = bbpos_successor(end); - } - bch2_trans_put(trans); - - bch2_btree_cache_unpin(c); - - bch_err_fn(c, ret); - return ret; -} - -static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit) -{ - scoped_guard(mutex, &b->lock) { - if (!b->buckets) { - b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), GFP_KERNEL); - if (!b->buckets) - 
return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); - } - - b->nr += !__test_and_set_bit(bit, b->buckets); - } - - return 0; -} - -int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b, - u64 old_size, u64 new_size) -{ - scoped_guard(mutex, &b->lock) { - if (!b->buckets) - return 0; - - unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), - sizeof(unsigned long), GFP_KERNEL); - if (!n) - return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); - - memcpy(n, b->buckets, - BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); - kvfree(b->buckets); - b->buckets = n; - } - - return 0; -} - -void bch2_bucket_bitmap_free(struct bucket_bitmap *b) -{ - mutex_lock(&b->lock); - kvfree(b->buckets); - b->buckets = NULL; - b->nr = 0; - mutex_unlock(&b->lock); -} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h deleted file mode 100644 index 7e71afee1ac053..00000000000000 --- a/fs/bcachefs/backpointers.h +++ /dev/null @@ -1,200 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BACKPOINTERS_H -#define _BCACHEFS_BACKPOINTERS_H - -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "buckets.h" -#include "error.h" -#include "super.h" - -static inline u64 swab40(u64 x) -{ - return (((x & 0x00000000ffULL) << 32)| - ((x & 0x000000ff00ULL) << 16)| - ((x & 0x0000ff0000ULL) >> 0)| - ((x & 0x00ff000000ULL) >> 16)| - ((x & 0xff00000000ULL) >> 32)); -} - -int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, - struct bkey_validate_context); -void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_backpointer_swab(struct bkey_s); - -#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ - .key_validate = bch2_backpointer_validate, \ - .val_to_text = bch2_backpointer_to_text, \ - .swab = bch2_backpointer_swab, \ - .min_val_size = 32, \ -}) - -#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 - -/* - * Convert from pos in backpointer btree to pos of corresponding bucket in alloc - * btree: - */ -static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos) -{ - u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - - return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); -} - -static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos, - u32 *bucket_offset) -{ - u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - - return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset)); -} - -static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); - if (ca) - *bucket = bp_pos_to_bucket(ca, bp_pos); - return ca != NULL; -} - -static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, - struct bpos bucket, - u64 bucket_offset) -{ - return POS(bucket.inode, - (bucket_to_sector(ca, bucket.offset) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); -} - -/* - * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: - */ -static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, - struct bpos bucket, - u64 bucket_offset) -{ - struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset); - EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret))); - return ret; -} - -static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev 
*ca, struct bpos bucket) -{ - return bucket_pos_to_bp(ca, bucket, 0); -} - -static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket) -{ - return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); -} - -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, - struct bkey_s_c, - struct bkey_i_backpointer *, - bool); - -static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, - struct bkey_s_c orig_k, - struct bkey_i_backpointer *bp, - bool insert) -{ - if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); - - if (!insert) { - bp->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp->k, 0); - } - - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i); -} - -static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, - struct extent_ptr_decoded p, - const union bch_extent_entry *entry) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return BCH_DATA_btree; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - if (p.has_ec) - return BCH_DATA_stripe; - if (p.ptr.cached) - return BCH_DATA_cached; - else - return BCH_DATA_user; - case KEY_TYPE_stripe: { - const struct bch_extent_ptr *ptr = &entry->ptr; - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - BUG_ON(ptr < s.v->ptrs || - ptr >= s.v->ptrs + s.v->nr_blocks); - - return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity - : BCH_DATA_user; - } - default: - BUG(); - } -} - -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - struct bkey_i_backpointer *bp) -{ - bkey_backpointer_init(&bp->k_i); - bp->k.p.inode = p.ptr.dev; - - if (k.k->type != KEY_TYPE_stripe) - bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; - else { - /* - * Put stripe backpointers where they won't collide with the - * extent backpointers within the stripe: - */ - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1; - } - - bp->v = (struct bch_backpointer) { - .btree_id = btree_id, - .level = level, - .data_type = bch2_bkey_ptr_data_type(k, p, entry), - .bucket_gen = p.ptr.gen, - .bucket_len = ptr_disk_sectors(level ? 
btree_sectors(c) : k.k->size, p), - .pos = k.k->p, - }; -} - -struct bkey_buf; -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, unsigned, struct bkey_buf *); -struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, struct bkey_buf *); - -int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64, - bool, struct bkey_buf *); - -int bch2_check_btree_backpointers(struct bch_fs *); -int bch2_check_extents_to_backpointers(struct bch_fs *); -int bch2_check_backpointers_to_extents(struct bch_fs *); - -static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) -{ - unsigned long *bitmap = READ_ONCE(b->buckets); - return bitmap && test_bit(i, bitmap); -} - -int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64); -void bch2_bucket_bitmap_free(struct bucket_bitmap *); - -#endif /* _BCACHEFS_BACKPOINTERS_H */ diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h deleted file mode 100644 index 63abe17f35eaa9..00000000000000 --- a/fs/bcachefs/bbpos.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BBPOS_H -#define _BCACHEFS_BBPOS_H - -#include "bbpos_types.h" -#include "bkey_methods.h" -#include "btree_cache.h" - -static inline int bbpos_cmp(struct bbpos l, struct bbpos r) -{ - return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos); -} - -static inline struct bbpos bbpos_successor(struct bbpos pos) -{ - if (bpos_cmp(pos.pos, SPOS_MAX)) { - pos.pos = bpos_successor(pos.pos); - return pos; - } - - if (pos.btree != BTREE_ID_NR) { - pos.btree++; - pos.pos = POS_MIN; - return pos; - } - - BUG(); -} - -static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) -{ - bch2_btree_id_to_text(out, pos.btree); - prt_char(out, ':'); - bch2_bpos_to_text(out, pos.pos); -} - -#endif /* _BCACHEFS_BBPOS_H */ diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h deleted file mode 100644 index f63893344f80aa..00000000000000 --- a/fs/bcachefs/bbpos_types.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BBPOS_TYPES_H -#define _BCACHEFS_BBPOS_TYPES_H - -struct bbpos { - enum btree_id btree; - struct bpos pos; -}; - -static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) -{ - return (struct bbpos) { btree, pos }; -} - -#define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) - -#endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h deleted file mode 100644 index ddfacad0f70cf4..00000000000000 --- a/fs/bcachefs/bcachefs.h +++ /dev/null @@ -1,1295 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_H -#define _BCACHEFS_H - -/* - * SOME HIGH LEVEL CODE DOCUMENTATION: - * - * Bcache mostly works with cache sets, cache devices, and backing devices. - * - * Support for multiple cache devices hasn't quite been finished off yet, but - * it's about 95% plumbed through. A cache set and its cache devices are sort of - * like an md raid array and its component devices. Most of the code doesn't care - * about individual cache devices; the main abstraction is the cache set. - * - * Multiple cache devices are intended to give us the ability to mirror dirty - * cached data and metadata, without mirroring clean cached data. - * - * Backing devices are different, in that they have a lifetime independent of a - * cache set.
When you register a newly formatted backing device it'll come up - * in passthrough mode, and then you can attach and detach a backing device from - * a cache set at runtime - while it's mounted and in use. Detaching implicitly - * invalidates any cached data for that backing device. - * - * A cache set can have multiple (many) backing devices attached to it. - * - * There's also flash only volumes - this is the reason for the distinction - * between struct cached_dev and struct bcache_device. A flash only volume - * works much like a bcache device that has a backing device, except the - * "cached" data is always dirty. The end result is that we get thin - * provisioning with very little additional code. - * - * Flash only volumes work but they're not production ready because the moving - * garbage collector needs more work. More on that later. - * - * BUCKETS/ALLOCATION: - * - * Bcache is primarily designed for caching, which means that in normal - * operation all of our available space will be allocated. Thus, we need an - * efficient way of deleting things from the cache so we can write new things to - * it. - * - * To do this, we first divide the cache device up into buckets. A bucket is the - * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ - * works efficiently. - * - * Each bucket has a 16 bit priority, and an 8 bit generation associated with - * it. The gens and priorities for all the buckets are stored contiguously and - * packed on disk (in a linked list of buckets - aside from the superblock, all - * of bcache's metadata is stored in buckets). - * - * The priority is used to implement an LRU. We reset a bucket's priority when - * we allocate it or on a cache hit, and every so often we decrement the priority - * of each bucket. It could be used to implement something more sophisticated, - * if anyone ever gets around to it. - * - * The generation is used for invalidating buckets. Each pointer also has an 8 - * bit generation embedded in it; for a pointer to be considered valid, its gen - * must match the gen of the bucket it points into. Thus, to reuse a bucket all - * we have to do is increment its gen (and write its new gen to disk; we batch - * this up). - * - * Bcache is entirely COW - we never write twice to a bucket, even buckets that - * contain metadata (including btree nodes). - * - * THE BTREE: - * - * Bcache is in large part designed around the btree. - * - * At a high level, the btree is just an index of key -> ptr tuples. - * - * Keys represent extents, and thus have a size field. Keys also have a variable - * number of pointers attached to them (potentially zero, which is handy for - * invalidating the cache). - * - * The key itself is an inode:offset pair. The inode number corresponds to a - * backing device or a flash only volume. The offset is the ending offset of the - * extent within the inode - not the starting offset; this makes lookups - * slightly more convenient. - * - * Pointers contain the cache device id, the offset on that device, and an 8 bit - * generation number. More on the gen later. - * - * Index lookups are not fully abstracted - cache lookups in particular are - * still somewhat mixed in with the btree code, but things are headed in that - * direction. - * - * Updates are fairly well abstracted, though. There are two different ways of - * updating the btree; insert and replace.
- * - * BTREE_INSERT will just take a list of keys and insert them into the btree - - * overwriting (possibly only partially) any extents they overlap with. This is - * used to update the index after a write. - * - * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is - * overwriting a key that matches another given key. This is used for inserting - * data into the cache after a cache miss, and for background writeback, and for - * the moving garbage collector. - * - * There is no "delete" operation; deleting things from the index is - * accomplished either by invalidating pointers (by incrementing a bucket's - * gen) or by inserting a key with 0 pointers - which will overwrite anything - * previously present at that location in the index. - * - * This means that there are always stale/invalid keys in the btree. They're - * filtered out by the code that iterates through a btree node, and removed when - * a btree node is rewritten. - * - * BTREE NODES: - * - * Our unit of allocation is a bucket, and we can't arbitrarily allocate and - * free smaller than a bucket - so, that's how big our btree nodes are. - * - * (If buckets are really big we'll only use part of the bucket for a btree node - * - no less than 1/4th - but a bucket still contains no more than a single - * btree node. I'd actually like to change this, but for now we rely on the - * bucket's gen for deleting btree nodes when we rewrite/split a node.) - * - * Anyways, btree nodes are big - big enough to be inefficient with a textbook - * btree implementation. - * - * The way this is solved is that btree nodes are internally log structured; we - * can append new keys to an existing btree node without rewriting it. This - * means each set of keys we write is sorted, but the node is not. - * - * We maintain this log structure in memory - keeping 1Mb of keys sorted would - * be expensive, and we have to distinguish between the keys we have written and - * the keys we haven't. So to do a lookup in a btree node, we have to search - * each sorted set. But we do merge written sets together lazily, so the cost of - * these extra searches is quite low (normally most of the keys in a btree node - * will be in one big set, and then there'll be one or two sets that are much - * smaller). - * - * This log structure makes bcache's btree more of a hybrid between a - * conventional btree and a compacting data structure, with some of the - * advantages of both. - * - * GARBAGE COLLECTION: - * - * We can't just invalidate any bucket - it might contain dirty data or - * metadata. If it once contained dirty data, other writes might overwrite it - * later, leaving no valid pointers into that bucket in the index. - * - * Thus, the primary purpose of garbage collection is to find buckets to reuse. - * It also counts how much valid data each bucket currently contains, so that - * allocation can reuse buckets sooner when they've been mostly overwritten. - * - * It also does some things that are really internal to the btree - * implementation. If a btree node contains pointers that are stale by more than - * some threshold, it rewrites the btree node to avoid the bucket's generation - * wrapping around. It also merges adjacent btree nodes if they're empty enough.
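- *
- * To make the generation scheme above concrete, here is a minimal
- * sketch (toy types for illustration - not the real structures, whose
- * gens live in packed per-device arrays):
- *
- *	struct toy_bucket { u8 gen; };
- *	struct toy_ptr    { u64 bucket; u8 gen; };
- *
- *	static bool toy_ptr_stale(const struct toy_bucket *buckets,
- *				  struct toy_ptr p)
- *	{
- *		return p.gen != buckets[p.bucket].gen;
- *	}
- *
- * Invalidating a bucket is then just buckets[b].gen++ (plus writing
- * the new gen out); every existing pointer into that bucket now fails
- * the check, and gc filters such stale pointers out as it walks the
- * btree.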
- * - * THE JOURNAL: - * - * Bcache's journal is not necessary for consistency; we always strictly - * order metadata writes so that the btree and everything else is consistent on - * disk in the event of an unclean shutdown, and in fact bcache had writeback - * caching (with recovery from unclean shutdown) before journalling was - * implemented. - * - * Rather, the journal is purely a performance optimization; we can't complete a - * write until we've updated the index on disk, otherwise the cache would be - * inconsistent in the event of an unclean shutdown. This means that without the - * journal, on random write workloads we constantly have to update all the leaf - * nodes in the btree, and those writes will be mostly empty (appending at most - * a few keys each) - highly inefficient in terms of amount of metadata writes, - * and it puts more strain on the various btree resorting/compacting code. - * - * The journal is just a log of keys we've inserted; on startup we just reinsert - * all the keys in the open journal entries. That means that when we're updating - * a node in the btree, we can wait until a 4k block of keys fills up before - * writing them out. - * - * For simplicity, we only journal updates to leaf nodes; updates to parent - * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth - * the complexity to deal with journalling them (in particular, journal replay) - * - updates to non leaf nodes just happen synchronously (see btree_split()). - */ - -#undef pr_fmt -#ifdef __KERNEL__ -#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ -#else -#define pr_fmt(fmt) "%s() " fmt "\n", __func__ -#endif - -#ifdef CONFIG_BCACHEFS_DEBUG -#define ENUMERATED_REF_DEBUG -#endif - -#ifndef dynamic_fault -#define dynamic_fault(...) 0 -#endif - -#define race_fault(...) dynamic_fault("bcachefs:race") - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "bcachefs_format.h" -#include "btree_journal_iter_types.h" -#include "disk_accounting_types.h" -#include "errcode.h" -#include "fast_list.h" -#include "fifo.h" -#include "nocow_locking_types.h" -#include "opts.h" -#include "sb-errors_types.h" -#include "seqmutex.h" -#include "snapshot_types.h" -#include "time_stats.h" -#include "util.h" - -#include "alloc_types.h" -#include "async_objs_types.h" -#include "btree_gc_types.h" -#include "btree_types.h" -#include "btree_node_scan_types.h" -#include "btree_write_buffer_types.h" -#include "buckets_types.h" -#include "buckets_waiting_for_journal_types.h" -#include "clock_types.h" -#include "disk_groups_types.h" -#include "ec_types.h" -#include "enumerated_ref_types.h" -#include "journal_types.h" -#include "keylist_types.h" -#include "quota_types.h" -#include "rebalance_types.h" -#include "recovery_passes_types.h" -#include "replicas_types.h" -#include "sb-members_types.h" -#include "subvolume_types.h" -#include "super_types.h" -#include "thread_with_file_types.h" - -#include "trace.h" - -#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) - -#define trace_and_count(_c, _name, ...) 
\ -do { \ - count_event(_c, _name); \ - trace_##_name(__VA_ARGS__); \ -} while (0) - -#define bch2_fs_init_fault(name) \ - dynamic_fault("bcachefs:bch_fs_init:" name) -#define bch2_meta_read_fault(name) \ - dynamic_fault("bcachefs:meta:read:" name) -#define bch2_meta_write_fault(name) \ - dynamic_fault("bcachefs:meta:write:" name) - -#ifdef __KERNEL__ -#define BCACHEFS_LOG_PREFIX -#endif - -#ifdef BCACHEFS_LOG_PREFIX - -#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) -#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name) -#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset) -#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) -#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ - "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset) - -#else - -#define bch2_log_msg(_c, fmt) fmt -#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name) -#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset) -#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) -#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ - "inum %llu offset %llu: " fmt "\n", (_inum), (_offset) - -#endif - -#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") - -void bch2_print_str(struct bch_fs *, const char *, const char *); - -__printf(2, 3) -void bch2_print_opts(struct bch_opts *, const char *, ...); - -__printf(2, 3) -void __bch2_print(struct bch_fs *c, const char *fmt, ...); - -#define maybe_dev_to_fs(_c) _Generic((_c), \ - struct bch_dev *: ((struct bch_dev *) (_c))->fs, \ - struct bch_fs *: (_c)) - -#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__) - -#define bch2_print_ratelimited(_c, ...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - \ - if (__ratelimit(&_rs)) \ - bch2_print(_c, __VA_ARGS__); \ -} while (0) - -#define bch2_print_str_ratelimited(_c, ...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - \ - if (__ratelimit(&_rs)) \ - bch2_print_str(_c, __VA_ARGS__); \ -} while (0) - -#define bch_info(c, fmt, ...) \ - bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_info_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_notice(c, fmt, ...) \ - bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_warn(c, fmt, ...) \ - bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_warn_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) - -#define bch_err(c, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_err_dev(ca, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) -#define bch_err_dev_offset(ca, _offset, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) -#define bch_err_inum(c, _inum, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) -#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) - -#define bch_err_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_err_dev_ratelimited(ca, fmt, ...) 
\ - bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) -#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ - bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) -#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ - bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) -#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ - bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) - -static inline bool should_print_err(int err) -{ - return err && !bch2_err_matches(err, BCH_ERR_transaction_restart); -} - -#define bch_err_fn(_c, _ret) \ -do { \ - if (should_print_err(_ret)) \ - bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ -} while (0) - -#define bch_err_fn_ratelimited(_c, _ret) \ -do { \ - if (should_print_err(_ret)) \ - bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ -} while (0) - -#define bch_err_msg(_c, _ret, _msg, ...) \ -do { \ - if (should_print_err(_ret)) \ - bch_err(_c, "%s(): error " _msg " %s", __func__, \ - ##__VA_ARGS__, bch2_err_str(_ret)); \ -} while (0) - -#define bch_verbose(c, fmt, ...) \ -do { \ - if ((c)->opts.verbose) \ - bch_info(c, fmt, ##__VA_ARGS__); \ -} while (0) - -#define bch_verbose_ratelimited(c, fmt, ...) \ -do { \ - if ((c)->opts.verbose) \ - bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \ -} while (0) - -#define pr_verbose_init(opts, fmt, ...) \ -do { \ - if (opt_get(opts, verbose)) \ - pr_info(fmt, ##__VA_ARGS__); \ -} while (0) - -static inline int __bch2_err_trace(struct bch_fs *c, int err) -{ - trace_error_throw(c, err, _THIS_IP_); - return err; -} - -#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err) - -/* Parameters that are useful for debugging, but should always be compiled in: */ -#define BCH_DEBUG_PARAMS_ALWAYS() \ - BCH_DEBUG_PARAM(key_merging_disabled, \ - "Disables merging of extents") \ - BCH_DEBUG_PARAM(btree_node_merging_disabled, \ - "Disables merging of btree nodes") \ - BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ - "Causes mark and sweep to compact and rewrite every " \ - "btree node it traverses") \ - BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ - "Disables rewriting of btree nodes during mark and sweep")\ - BCH_DEBUG_PARAM(btree_shrinker_disabled, \ - "Disables the shrinker callback for the btree node cache")\ - BCH_DEBUG_PARAM(verify_btree_ondisk, \ - "Reread btree nodes at various points to verify the " \ - "mergesort in the read path against modifications " \ - "done in memory") \ - BCH_DEBUG_PARAM(verify_all_btree_replicas, \ - "When reading btree nodes, read all replicas and " \ - "compare them") \ - BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ - "Don't use the write buffer for backpointers, enabling "\ - "extra runtime checks") \ - BCH_DEBUG_PARAM(debug_check_btree_locking, \ - "Enable additional asserts for btree locking") \ - BCH_DEBUG_PARAM(debug_check_iterators, \ - "Enables extra verification for btree iterators") \ - BCH_DEBUG_PARAM(debug_check_bset_lookups, \ - "Enables extra verification for bset lookups") \ - BCH_DEBUG_PARAM(debug_check_btree_accounting, \ - "Verify btree accounting for keys within a node") \ - BCH_DEBUG_PARAM(debug_check_bkey_unpack, \ - "Enables extra verification for bkey unpack") - -/* Parameters that should only be compiled in debug mode: */ -#define BCH_DEBUG_PARAMS_DEBUG() \ - BCH_DEBUG_PARAM(journal_seq_verify, \ - "Store the journal sequence number in the version " \ - 
"number of every btree key, and verify that btree " \ - "update ordering is preserved during recovery") \ - BCH_DEBUG_PARAM(inject_invalid_keys, \ - "Store the journal sequence number in the version " \ - "number of every btree key, and verify that btree " \ - "update ordering is preserved during recovery") \ - BCH_DEBUG_PARAM(test_alloc_startup, \ - "Force allocator startup to use the slowpath where it" \ - "can't find enough free buckets without invalidating" \ - "cached data") \ - BCH_DEBUG_PARAM(force_reconstruct_read, \ - "Force reads to use the reconstruct path, when reading" \ - "from erasure coded extents") \ - BCH_DEBUG_PARAM(test_restart_gc, \ - "Test restarting mark and sweep gc when bucket gens change") - -#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() - -#ifdef CONFIG_BCACHEFS_DEBUG -#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() -#else -#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() -#endif - -#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name; -BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - -#define BCH_TIME_STATS() \ - x(btree_node_mem_alloc) \ - x(btree_node_split) \ - x(btree_node_compact) \ - x(btree_node_merge) \ - x(btree_node_sort) \ - x(btree_node_get) \ - x(btree_node_read) \ - x(btree_node_read_done) \ - x(btree_node_write) \ - x(btree_interior_update_foreground) \ - x(btree_interior_update_total) \ - x(btree_gc) \ - x(data_write) \ - x(data_write_to_submit) \ - x(data_write_to_queue) \ - x(data_write_to_btree_update) \ - x(data_write_btree_update) \ - x(data_read) \ - x(data_promote) \ - x(journal_flush_write) \ - x(journal_noflush_write) \ - x(journal_flush_seq) \ - x(blocked_journal_low_on_space) \ - x(blocked_journal_low_on_pin) \ - x(blocked_journal_max_in_flight) \ - x(blocked_journal_max_open) \ - x(blocked_key_cache_flush) \ - x(blocked_allocate) \ - x(blocked_allocate_open_bucket) \ - x(blocked_write_buffer_full) \ - x(nocow_lock_contended) - -enum bch_time_stats { -#define x(name) BCH_TIME_##name, - BCH_TIME_STATS() -#undef x - BCH_TIME_STAT_NR -}; - -/* Number of nodes btree coalesce will try to coalesce at once */ -#define GC_MERGE_NODES 4U - -/* Maximum number of nodes we might need to allocate atomically: */ -#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) - -/* Size of the freelist we allocate btree nodes from: */ -#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) - -#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) - -struct btree; - -struct io_count { - u64 sectors[2][BCH_DATA_NR]; -}; - -struct discard_in_flight { - bool in_progress:1; - u64 bucket:63; -}; - -#define BCH_DEV_READ_REFS() \ - x(bch2_online_devs) \ - x(trans_mark_dev_sbs) \ - x(read_fua_test) \ - x(sb_field_resize) \ - x(write_super) \ - x(journal_read) \ - x(fs_journal_alloc) \ - x(fs_resize_on_mount) \ - x(btree_node_read) \ - x(btree_node_read_all_replicas) \ - x(btree_node_scrub) \ - x(btree_node_write) \ - x(btree_node_scan) \ - x(btree_verify_replicas) \ - x(btree_node_ondisk_to_text) \ - x(io_read) \ - x(check_extent_checksums) \ - x(ec_block) - -enum bch_dev_read_ref { -#define x(n) BCH_DEV_READ_REF_##n, - BCH_DEV_READ_REFS() -#undef x - BCH_DEV_READ_REF_NR, -}; - -#define BCH_DEV_WRITE_REFS() \ - x(journal_write) \ - x(journal_do_discards) \ - x(dev_do_discards) \ - x(discard_one_bucket_fast) \ - x(do_invalidates) \ - x(nocow_flush) \ - x(io_write) \ - x(ec_block) \ - x(ec_bucket_zero) - -enum bch_dev_write_ref { -#define x(n) 
BCH_DEV_WRITE_REF_##n, - BCH_DEV_WRITE_REFS() -#undef x - BCH_DEV_WRITE_REF_NR, -}; - -struct bucket_bitmap { - unsigned long *buckets; - u64 nr; - struct mutex lock; -}; - -struct bch_dev { - struct kobject kobj; -#ifdef CONFIG_BCACHEFS_DEBUG - atomic_long_t ref; - bool dying; - unsigned long last_put; -#else - struct percpu_ref ref; -#endif - struct completion ref_completion; - struct enumerated_ref io_ref[2]; - - struct bch_fs *fs; - - u8 dev_idx; - /* - * Cached version of this device's member info from superblock - * Committed by bch2_write_super() -> bch_fs_mi_update() - */ - struct bch_member_cpu mi; - atomic64_t errors[BCH_MEMBER_ERROR_NR]; - unsigned long write_errors_start; - - __uuid_t uuid; - char name[BDEVNAME_SIZE]; - - struct bch_sb_handle disk_sb; - struct bch_sb *sb_read_scratch; - int sb_write_error; - dev_t dev; - atomic_t flush_seq; - - struct bch_devs_mask self; - - /* - * Buckets: - * Per-bucket arrays are protected by either rcu_read_lock or - * state_lock, for device resize. - */ - GENRADIX(struct bucket) buckets_gc; - struct bucket_gens __rcu *bucket_gens; - u8 *oldest_gen; - unsigned long *buckets_nouse; - - struct bucket_bitmap bucket_backpointer_mismatch; - struct bucket_bitmap bucket_backpointer_empty; - - struct bch_dev_usage_full __percpu - *usage; - - /* Allocator: */ - u64 alloc_cursor[3]; - - unsigned nr_open_buckets; - unsigned nr_partial_buckets; - unsigned nr_btree_reserve; - - struct work_struct invalidate_work; - struct work_struct discard_work; - struct mutex discard_buckets_in_flight_lock; - DARRAY(struct discard_in_flight) discard_buckets_in_flight; - struct work_struct discard_fast_work; - - atomic64_t rebalance_work; - - struct journal_device journal; - u64 prev_journal_sector; - - struct work_struct io_error_work; - - /* The rest of this all shows up in sysfs */ - atomic64_t cur_latency[2]; - struct bch2_time_stats_quantiles io_latency[2]; - -#define CONGESTED_MAX 1024 - atomic_t congested; - u64 congested_last; - - struct io_count __percpu *io_done; -}; - -/* - * initial_gc_unfixed - * error - * topology error - */ - -#define BCH_FS_FLAGS() \ - x(new_fs) \ - x(started) \ - x(clean_recovery) \ - x(btree_running) \ - x(accounting_replay_done) \ - x(may_go_rw) \ - x(rw) \ - x(rw_init_done) \ - x(was_rw) \ - x(stopping) \ - x(emergency_ro) \ - x(going_ro) \ - x(write_disable_complete) \ - x(clean_shutdown) \ - x(in_recovery) \ - x(in_fsck) \ - x(initial_gc_unfixed) \ - x(need_delete_dead_snapshots) \ - x(error) \ - x(topology_error) \ - x(errors_fixed) \ - x(errors_not_fixed) \ - x(no_invalid_checks) \ - x(discard_mount_opt_set) \ - -enum bch_fs_flags { -#define x(n) BCH_FS_##n, - BCH_FS_FLAGS() -#undef x -}; - -struct btree_debug { - unsigned id; -}; - -#define BCH_TRANSACTIONS_NR 128 - -struct btree_transaction_stats { - struct bch2_time_stats duration; - struct bch2_time_stats lock_hold_times; - struct mutex lock; - unsigned nr_max_paths; - unsigned max_mem; -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_trans_kmalloc_trace trans_kmalloc_trace; -#endif - char *max_paths_text; -}; - -struct bch_fs_pcpu { - u64 sectors_available; -}; - -struct journal_seq_blacklist_table { - size_t nr; - struct journal_seq_blacklist_table_entry { - u64 start; - u64 end; - bool dirty; - } entries[]; -}; - -struct btree_trans_buf { - struct btree_trans *trans; -}; - -#define BCH_WRITE_REFS() \ - x(journal) \ - x(trans) \ - x(write) \ - x(promote) \ - x(node_rewrite) \ - x(stripe_create) \ - x(stripe_delete) \ - x(reflink) \ - x(fallocate) \ - x(fsync) \ - 
x(dio_write) \ - x(discard) \ - x(discard_fast) \ - x(check_discard_freespace_key) \ - x(invalidate) \ - x(delete_dead_snapshots) \ - x(gc_gens) \ - x(snapshot_delete_pagecache) \ - x(sysfs) \ - x(btree_write_buffer) \ - x(btree_node_scrub) \ - x(async_recovery_passes) \ - x(ioctl_data) - -enum bch_write_ref { -#define x(n) BCH_WRITE_REF_##n, - BCH_WRITE_REFS() -#undef x - BCH_WRITE_REF_NR, -}; - -#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0) - -struct bch_fs { - struct closure cl; - - struct list_head list; - struct kobject kobj; - struct kobject counters_kobj; - struct kobject internal; - struct kobject opts_dir; - struct kobject time_stats; - unsigned long flags; - - int minor; - struct device *chardev; - struct super_block *vfs_sb; - dev_t dev; - char name[40]; - struct stdio_redirect *stdio; - struct task_struct *stdio_filter; - - /* ro/rw, add/remove/resize devices: */ - struct rw_semaphore state_lock; - - /* Counts outstanding writes, for clean transition to read-only */ - struct enumerated_ref writes; - /* - * Certain operations are only allowed in single threaded mode, during - * recovery, and we want to assert that this is the case: - */ - struct task_struct *recovery_task; - - /* - * Analogous to c->writes, for asynchronous ops that don't necessarily - * need fs to be read-write - */ - refcount_t ro_ref; - wait_queue_head_t ro_ref_wait; - - struct work_struct read_only_work; - - struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; - - struct bch_accounting_mem accounting; - - struct bch_replicas_cpu replicas; - struct bch_replicas_cpu replicas_gc; - struct mutex replicas_gc_lock; - - struct journal_entry_res btree_root_journal_res; - struct journal_entry_res clock_journal_res; - - struct bch_disk_groups_cpu __rcu *disk_groups; - - struct bch_opts opts; - - /* Updated by bch2_sb_update(): */ - struct { - __uuid_t uuid; - __uuid_t user_uuid; - - u16 version; - u16 version_incompat; - u16 version_incompat_allowed; - u16 version_min; - u16 version_upgrade_complete; - - u8 nr_devices; - u8 clean; - bool multi_device; /* true if we've ever had more than one device */ - - u8 encryption_type; - - u64 time_base_lo; - u32 time_base_hi; - unsigned time_units_per_sec; - unsigned nsec_per_time_unit; - u64 features; - u64 compat; - u64 recovery_passes_required; - unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; - u64 btrees_lost_data; - } sb; - DARRAY(enum bcachefs_metadata_version) - incompat_versions_requested; - - struct unicode_map *cf_encoding; - - struct bch_sb_handle disk_sb; - - unsigned short block_bits; /* ilog2(block_size) */ - - u16 btree_foreground_merge_threshold; - - struct closure sb_write; - struct mutex sb_lock; - - /* snapshot.c: */ - struct snapshot_table __rcu *snapshots; - struct mutex snapshot_table_lock; - struct rw_semaphore snapshot_create_lock; - - struct snapshot_delete snapshot_delete; - struct work_struct snapshot_wait_for_pagecache_and_delete_work; - snapshot_id_list snapshots_unlinked; - struct mutex snapshots_unlinked_lock; - - /* BTREE CACHE */ - struct bio_set btree_bio; - struct workqueue_struct *btree_read_complete_wq; - struct workqueue_struct *btree_write_submit_wq; - - struct btree_root btree_roots_known[BTREE_ID_NR]; - DARRAY(struct btree_root) btree_roots_extra; - struct mutex btree_root_lock; - - struct btree_cache btree_cache; - - /* - * Cache of allocated btree nodes - if we allocate a btree node and - * don't use it, if we free it that space can't be reused until going - * _all_ the way through the allocator (which exposes us
to a livelock - * when allocating btree reserves fail halfway through) - instead, we - * can stick them here: - */ - struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; - unsigned btree_reserve_cache_nr; - struct mutex btree_reserve_cache_lock; - - mempool_t btree_interior_update_pool; - struct list_head btree_interior_update_list; - struct list_head btree_interior_updates_unwritten; - struct mutex btree_interior_update_lock; - struct closure_waitlist btree_interior_update_wait; - - struct workqueue_struct *btree_interior_update_worker; - struct work_struct btree_interior_update_work; - - struct workqueue_struct *btree_node_rewrite_worker; - struct list_head btree_node_rewrites; - struct list_head btree_node_rewrites_pending; - spinlock_t btree_node_rewrites_lock; - struct closure_waitlist btree_node_rewrites_wait; - - /* btree_io.c: */ - spinlock_t btree_write_error_lock; - struct btree_write_stats { - atomic64_t nr; - atomic64_t bytes; - } btree_write_stats[BTREE_WRITE_TYPE_NR]; - - /* btree_iter.c: */ - struct seqmutex btree_trans_lock; - struct list_head btree_trans_list; - mempool_t btree_trans_pool; - mempool_t btree_trans_mem_pool; - struct btree_trans_buf __percpu *btree_trans_bufs; - - struct srcu_struct btree_trans_barrier; - bool btree_trans_barrier_initialized; - - struct btree_key_cache btree_key_cache; - unsigned btree_key_cache_btrees; - - struct btree_write_buffer btree_write_buffer; - - struct workqueue_struct *btree_update_wq; - struct workqueue_struct *btree_write_complete_wq; - /* copygc needs its own workqueue for index updates.. */ - struct workqueue_struct *copygc_wq; - /* - * Use a dedicated wq for write ref holder tasks. Required to avoid - * dependency problems with other wq tasks that can block on ref - * draining, such as read-only transition. 
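- *
- * Sketch of the pattern (illustrative names, not the real call
- * chain): a work item that takes one of these refs must be queued
- * here rather than on a shared wq -
- *
- *	static void ref_holding_work(struct work_struct *work)
- *	{
- *		... do work, then drop the write ref ...
- *	}
- *
- *	INIT_WORK(&my_work, ref_holding_work);
- *	queue_work(c->write_ref_wq, &my_work);
- *
- * because on a shared wq it could sit behind an item that blocks on
- * ref draining (the read-only transition does exactly that), leaving
- * a drain waiting on a ref holder that never gets to run.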
- */ - struct workqueue_struct *write_ref_wq; - - /* ALLOCATION */ - struct bch_devs_mask online_devs; - struct bch_devs_mask rw_devs[BCH_DATA_NR]; - unsigned long rw_devs_change_count; - - u64 capacity; /* sectors */ - u64 reserved; /* sectors */ - - /* - * When capacity _decreases_ (due to a disk being removed), we - * increment capacity_gen - this invalidates outstanding reservations - * and forces them to be revalidated - */ - u32 capacity_gen; - unsigned bucket_size_max; - - atomic64_t sectors_available; - struct mutex sectors_available_lock; - - struct bch_fs_pcpu __percpu *pcpu; - - struct percpu_rw_semaphore mark_lock; - - seqcount_t usage_lock; - struct bch_fs_usage_base __percpu *usage; - u64 __percpu *online_reserved; - - unsigned long allocator_last_stuck; - - struct io_clock io_clock[2]; - - /* JOURNAL SEQ BLACKLIST */ - struct journal_seq_blacklist_table * - journal_seq_blacklist_table; - - /* ALLOCATOR */ - spinlock_t freelist_lock; - struct closure_waitlist freelist_wait; - - open_bucket_idx_t open_buckets_freelist; - open_bucket_idx_t open_buckets_nr_free; - struct closure_waitlist open_buckets_wait; - struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; - open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; - - open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; - open_bucket_idx_t open_buckets_partial_nr; - - struct write_point btree_write_point; - struct write_point rebalance_write_point; - - struct write_point write_points[WRITE_POINT_MAX]; - struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; - struct mutex write_points_hash_lock; - unsigned write_points_nr; - - struct buckets_waiting_for_journal buckets_waiting_for_journal; - - /* GARBAGE COLLECTION */ - struct work_struct gc_gens_work; - unsigned long gc_count; - - enum btree_id gc_gens_btree; - struct bpos gc_gens_pos; - - /* - * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] - * has been marked by GC. - * - * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) - * - * Protected by gc_pos_lock. Only written to by GC thread, so GC thread - * can read without a lock. - */ - seqcount_t gc_pos_lock; - struct gc_pos gc_pos; - - /* - * The allocation code needs gc_mark in struct bucket to be correct, but - * it's not while a gc is in progress. 
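- *
- * I.e. the standard rwsem pattern (a sketch; the real lock sites are
- * spread across the allocator and gc paths):
- *
- *	down_read(&c->gc_lock);		allocator: marks are stable
- *	...
- *	up_read(&c->gc_lock);
- *
- *	down_write(&c->gc_lock);	gc: excludes allocation while
- *	...				marks are being rebuilt
- *	up_write(&c->gc_lock);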
- */ - struct rw_semaphore gc_lock; - struct mutex gc_gens_lock; - - /* IO PATH */ - struct semaphore io_in_flight; - struct bio_set bio_read; - struct bio_set bio_read_split; - struct bio_set bio_write; - struct bio_set replica_set; - struct mutex bio_bounce_pages_lock; - mempool_t bio_bounce_pages; - struct bucket_nocow_lock_table - nocow_locks; - struct rhashtable promote_table; - -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR]; -#endif - - mempool_t compression_bounce[2]; - mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; - size_t zstd_workspace_size; - - struct bch_key chacha20_key; - bool chacha20_key_set; - - atomic64_t key_version; - - mempool_t large_bkey_pool; - - /* MOVE.C */ - struct list_head moving_context_list; - struct mutex moving_context_lock; - - /* REBALANCE */ - struct bch_fs_rebalance rebalance; - - /* COPYGC */ - struct task_struct *copygc_thread; - struct write_point copygc_write_point; - s64 copygc_wait_at; - s64 copygc_wait; - bool copygc_running; - wait_queue_head_t copygc_running_wq; - - /* STRIPES: */ - GENRADIX(struct gc_stripe) gc_stripes; - - struct hlist_head ec_stripes_new[32]; - spinlock_t ec_stripes_new_lock; - - /* ERASURE CODING */ - struct list_head ec_stripe_head_list; - struct mutex ec_stripe_head_lock; - - struct list_head ec_stripe_new_list; - struct mutex ec_stripe_new_lock; - wait_queue_head_t ec_stripe_new_wait; - - struct work_struct ec_stripe_create_work; - u64 ec_stripe_hint; - - struct work_struct ec_stripe_delete_work; - - struct bio_set ec_bioset; - - /* REFLINK */ - reflink_gc_table reflink_gc_table; - size_t reflink_gc_nr; - - /* fs.c */ - struct list_head vfs_inodes_list; - struct mutex vfs_inodes_lock; - struct rhashtable vfs_inodes_table; - struct rhltable vfs_inodes_by_inum_table; - - /* VFS IO PATH - fs-io.c */ - struct bio_set writepage_bioset; - struct bio_set dio_write_bioset; - struct bio_set dio_read_bioset; - struct bio_set nocow_flush_bioset; - - /* QUOTAS */ - struct bch_memquota_type quotas[QTYP_NR]; - - /* RECOVERY */ - u64 journal_replay_seq_start; - u64 journal_replay_seq_end; - struct bch_fs_recovery recovery; - - /* DEBUG JUNK */ - struct dentry *fs_debug_dir; - struct dentry *btree_debug_dir; - struct dentry *async_obj_dir; - struct btree_debug btree_debug[BTREE_ID_NR]; - struct btree *verify_data; - struct btree_node *verify_ondisk; - struct mutex verify_lock; - - /* - * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - have to dynamically allocate them - */ - mempool_t fill_iter; - - mempool_t btree_bounce_pool; - - struct journal journal; - GENRADIX(struct journal_replay *) journal_entries; - u64 journal_entries_base_seq; - struct journal_keys journal_keys; - struct list_head journal_iters; - - struct find_btree_nodes found_btree_nodes; - - u64 last_bucket_seq_cleanup; - - u64 counters_on_mount[BCH_COUNTER_NR]; - u64 __percpu *counters; - - struct bch2_time_stats times[BCH_TIME_STAT_NR]; - - struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; - - /* ERRORS */ - struct list_head fsck_error_msgs; - struct mutex fsck_error_msgs_lock; - bool fsck_alloc_msgs_err; - - bch_sb_errors_cpu fsck_error_counts; - struct mutex fsck_error_counts_lock; -}; - -extern struct wait_queue_head bch2_read_only_wait; - -static inline bool bch2_ro_ref_tryget(struct bch_fs *c) -{ - if (test_bit(BCH_FS_stopping, &c->flags)) - return false; - - return refcount_inc_not_zero(&c->ro_ref); -} - -static inline void 
bch2_ro_ref_put(struct bch_fs *c) -{ - if (refcount_dec_and_test(&c->ro_ref)) - wake_up(&c->ro_ref_wait); -} - -static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) -{ -#ifndef NO_BCACHEFS_FS - if (c->vfs_sb) - c->vfs_sb->s_bdi->ra_pages = ra_pages; -#endif -} - -static inline unsigned bucket_bytes(const struct bch_dev *ca) -{ - return ca->mi.bucket_size << 9; -} - -static inline unsigned block_bytes(const struct bch_fs *c) -{ - return c->opts.block_size; -} - -static inline unsigned block_sectors(const struct bch_fs *c) -{ - return c->opts.block_size >> 9; -} - -static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) -{ - return c->btree_key_cache_btrees & (1U << btree); -} - -static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) -{ - struct timespec64 t; - s64 sec; - s32 rem; - - time += c->sb.time_base_lo; - - sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); - - set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit); - - return t; -} - -static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) -{ - return (ts.tv_sec * c->sb.time_units_per_sec + - (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; -} - -static inline s64 bch2_current_time(const struct bch_fs *c) -{ - struct timespec64 now; - - ktime_get_coarse_real_ts64(&now); - return timespec_to_bch2_time(c, now); -} - -static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw) -{ - return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX); -} - -static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) -{ - struct stdio_redirect *stdio = c->stdio; - - if (c->stdio_filter && c->stdio_filter != current) - stdio = NULL; - return stdio; -} - -static inline unsigned metadata_replicas_required(struct bch_fs *c) -{ - return min(c->opts.metadata_replicas, - c->opts.metadata_replicas_required); -} - -static inline unsigned data_replicas_required(struct bch_fs *c) -{ - return min(c->opts.data_replicas, - c->opts.data_replicas_required); -} - -#define BKEY_PADDED_ONSTACK(key, pad) \ - struct { struct bkey_i key; __u64 key ## _pad[pad]; } - -/* - * This is needed because discard is both a filesystem option and a device - * option, and mount options are supposed to apply to that mount and not be - * persisted, i.e. if it's set as a mount option we can't propagate it to the - * device. - */ -static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) -{ - return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) - ? c->opts.discard - : ca->mi.discard; -} - -static inline bool bch2_fs_casefold_enabled(struct bch_fs *c) -{ -#ifdef CONFIG_UNICODE - return !c->opts.casefold_disabled; -#else - return false; -#endif -} - -#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h deleted file mode 100644 index b4a04df5ea9555..00000000000000 --- a/fs/bcachefs/bcachefs_format.h +++ /dev/null @@ -1,1545 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FORMAT_H -#define _BCACHEFS_FORMAT_H - -/* - * bcachefs on disk data structures - * - * OVERVIEW: - * - * There are three main types of on disk data structures in bcachefs (this is - * reduced from 5 in bcache) - * - * - superblock - * - journal - * - btree - * - * The btree is the primary structure; most metadata exists as keys in the - * various btrees. 
There are only a small number of btrees; they're not
- * sharded - we have one btree for extents, another for inodes, et cetera.
- *
- * SUPERBLOCK:
- *
- * The superblock contains the location of the journal, the list of devices in
- * the filesystem, and in general any metadata we need in order to decide
- * whether we can start the filesystem, or that we need prior to reading the
- * journal/btree roots.
- *
- * The superblock is extensible, and most of the contents of the superblock are
- * in variable length, type tagged fields; see struct bch_sb_field.
- *
- * Backup superblocks do not reside in a fixed location; also, superblocks do
- * not have a fixed size. To locate backup superblocks we have struct
- * bch_sb_layout; we store a copy of this inside every superblock, and also
- * before the first superblock.
- *
- * JOURNAL:
- *
- * The journal primarily records btree updates in the order they occurred;
- * journal replay consists of just iterating over all the keys in the open
- * journal entries and re-inserting them into the btrees.
- *
- * The journal also contains entry types for the btree roots, and blacklisted
- * journal sequence numbers (see journal_seq_blacklist.c).
- *
- * BTREE:
- *
- * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
- * 128k-256k) and log structured. We use struct btree_node for writing the first
- * entry in a given node (offset 0), and struct btree_node_entry for all
- * subsequent writes.
- *
- * After the header, btree node entries contain a list of keys in sorted order.
- * Values are stored inline with the keys; since values are variable length (and
- * keys effectively are variable length too, due to packing) we can't do random
- * access without building up additional in memory tables in the btree node read
- * path.
- *
- * BTREE KEYS (struct bkey):
- *
- * The various btrees share a common format for the key - so as to avoid
- * switching in fastpath lookup/comparison code - but define their own
- * structures for the key values.
- *
- * The size of a key/value pair is stored as a u8 in units of u64s, so the max
- * size is just under 2k. The common part also contains a type tag for the
- * value, and a format field indicating whether the key is packed or not (and
- * also meant to allow adding new key fields in the future, if desired).
- *
- * bkeys, when stored within a btree node, may also be packed. In that case, the
- * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
- * be generous with field sizes in the common part of the key format (64 bit
- * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
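To make the ordering discussion above concrete, here is a minimal standalone sketch, with illustrative names rather than the kernel's, of the lexicographic (inode, offset, snapshot) comparison that the bpos layout below is optimized for - on little endian, storing the fields low-to-high lets the btree code compare a bpos as one large integer:

    #include <stdint.h>

    /* Illustrative stand-in for struct bpos; not the kernel's definition. */
    struct pos {
        uint64_t inode;
        uint64_t offset;
        uint32_t snapshot;
    };

    /* Compare as one big (inode, offset, snapshot) integer. */
    static int pos_cmp(struct pos a, struct pos b)
    {
        if (a.inode != b.inode)
            return a.inode < b.inode ? -1 : 1;
        if (a.offset != b.offset)
            return a.offset < b.offset ? -1 : 1;
        if (a.snapshot != b.snapshot)
            return a.snapshot < b.snapshot ? -1 : 1;
        return 0;
    }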
- */
-
-#include <asm/types.h>
-#include <asm/byteorder.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <uapi/linux/magic.h>
-#include "vstructs.h"
-
-#ifdef __KERNEL__
-typedef uuid_t __uuid_t;
-#endif
-
-#define BITMASK(name, type, field, offset, end) \
-static const __maybe_unused unsigned name##_OFFSET = offset; \
-static const __maybe_unused unsigned name##_BITS = (end - offset); \
- \
-static inline __u64 name(const type *k) \
-{ \
- return (k->field >> offset) & ~(~0ULL << (end - offset)); \
-} \
- \
-static inline void SET_##name(type *k, __u64 v) \
-{ \
- k->field &= ~(~(~0ULL << (end - offset)) << offset); \
- k->field |= (v & ~(~0ULL << (end - offset))) << offset; \
-}
-
-#define LE_BITMASK(_bits, name, type, field, offset, end) \
-static const __maybe_unused unsigned name##_OFFSET = offset; \
-static const __maybe_unused unsigned name##_BITS = (end - offset); \
-static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
- \
-static inline __u64 name(const type *k) \
-{ \
- return (__le##_bits##_to_cpu(k->field) >> offset) & \
- ~(~0ULL << (end - offset)); \
-} \
- \
-static inline void SET_##name(type *k, __u64 v) \
-{ \
- __u##_bits new = __le##_bits##_to_cpu(k->field); \
- \
- new &= ~(~(~0ULL << (end - offset)) << offset); \
- new |= (v & ~(~0ULL << (end - offset))) << offset; \
- k->field = __cpu_to_le##_bits(new); \
-}
-
-#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e)
-#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e)
-#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e)
-
-struct bkey_format {
- __u8 key_u64s;
- __u8 nr_fields;
- /* One unused slot for now: */
- __u8 bits_per_field[6];
- __le64 field_offset[6];
-};
-
-/* Btree keys - all units are in sectors */
-
-struct bpos {
- /*
- * Word order matches machine byte order - btree code treats a bpos as a
- * single large integer, for search/comparison purposes
- *
- * Note that wherever a bpos is embedded in another on disk data
- * structure, it has to be byte swabbed when reading in metadata that
- * wasn't written in native endian order:
- */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u32 snapshot;
- __u64 offset;
- __u64 inode;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- __u64 inode;
- __u64 offset; /* Points to end of extent - sectors */
- __u32 snapshot;
-#else
-#error edit for your odd byteorder.
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-#define KEY_INODE_MAX ((__u64)~0ULL)
-#define KEY_OFFSET_MAX ((__u64)~0ULL)
-#define KEY_SNAPSHOT_MAX ((__u32)~0U)
-#define KEY_SIZE_MAX ((__u32)~0U)
-
-static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
-{
- return (struct bpos) {
- .inode = inode,
- .offset = offset,
- .snapshot = snapshot,
- };
-}
-
-#define POS_MIN SPOS(0, 0, 0)
-#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
-#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
-#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
-
-/* Empty placeholder struct, for container_of() */
-struct bch_val {
- __u64 __nothing[0];
-};
-
-struct bversion {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u64 lo;
- __u32 hi;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- __u32 hi;
- __u64 lo;
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-struct bkey {
- /* Size of combined key and value, in u64s */
- __u8 u64s;
-
- /* Format of key (0 for format local to btree node) */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 format:7,
- needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u8 needs_whiteout:1,
- format:7;
-#else
-#error edit for your odd byteorder.
-#endif
-
- /* Type of the value */
- __u8 type;
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u8 pad[1];
-
- struct bversion bversion;
- __u32 size; /* extent size, in sectors */
- struct bpos p;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- struct bpos p;
- __u32 size; /* extent size, in sectors */
- struct bversion bversion;
-
- __u8 pad[1];
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-/*
- * The big-endian version of bkey can't be compiled by rustc with the "aligned"
- * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
- * So for Rust compatibility, don't include this. It can be included in the LE
- * version because the "packed" attr is redundant in that case.
- *
- * History: (quoting Kent)
- *
- * Specifically, when I was designing bkey, I wanted the header to be no
- * bigger than necessary so that bkey_packed could use the rest. That means that
- * decently often extent keys will fit into only 8 bytes, instead of spilling over
- * to 16.
- *
- * But packed_bkey treats the part after the header - the packed section -
- * as a single multi word, variable length integer. And bkey, the unpacked
- * version, is just a special case version of a bkey_packed; all the packed
- * bkey code will work on keys in any packed format, the in-memory
- * representation of an unpacked key also is just one type of packed key...
- *
- * So that constrains the key part of a big endian bkey to start right
- * after the header.
- *
- * If we ever do a bkey_v2 and need to expand the header by another byte for
- * some reason - that will clean up this wart.
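As a side note, the BITMASK()/LE_BITMASK() helpers defined earlier in this header generate paired getter/setter functions for such packed fields. A minimal standalone sketch of what one expansion boils down to, for a hypothetical 4-bit EXAMPLE_FIELD occupying bits [4, 8) of a flags word:

    #include <assert.h>
    #include <stdint.h>

    struct sb { uint64_t flags; };

    /* Equivalent to the BITMASK(EXAMPLE_FIELD, struct sb, flags, 4, 8) expansion. */
    static inline uint64_t EXAMPLE_FIELD(const struct sb *k)
    {
        return (k->flags >> 4) & ~(~0ULL << (8 - 4));
    }

    static inline void SET_EXAMPLE_FIELD(struct sb *k, uint64_t v)
    {
        k->flags &= ~(~(~0ULL << (8 - 4)) << 4);
        k->flags |= (v & ~(~0ULL << (8 - 4))) << 4;
    }

    int main(void)
    {
        struct sb sb = { 0 };

        SET_EXAMPLE_FIELD(&sb, 0xa);
        assert(EXAMPLE_FIELD(&sb) == 0xa);
        return 0;
    }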
- */ -__aligned(8) -#endif -; - -struct bkey_packed { - __u64 _data[0]; - - /* Size of combined key and value, in u64s */ - __u8 u64s; - - /* Format of key (0 for format local to btree node) */ - - /* - * XXX: next incompat on disk format change, switch format and - * needs_whiteout - bkey_packed() will be cheaper if format is the high - * bits of the bitfield - */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 format:7, - needs_whiteout:1; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u8 needs_whiteout:1, - format:7; -#endif - - /* Type of the value */ - __u8 type; - __u8 key_start[0]; - - /* - * We copy bkeys with struct assignment in various places, and while - * that shouldn't be done with packed bkeys we can't disallow it in C, - * and it's legal to cast a bkey to a bkey_packed - so padding it out - * to the same size as struct bkey should hopefully be safest. - */ - __u8 pad[sizeof(struct bkey) - 3]; -} __packed __aligned(8); - -typedef struct { - __le64 lo; - __le64 hi; -} bch_le128; - -#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) -#define BKEY_U64s_MAX U8_MAX -#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) - -#define KEY_PACKED_BITS_START 24 - -#define KEY_FORMAT_LOCAL_BTREE 0 -#define KEY_FORMAT_CURRENT 1 - -enum bch_bkey_fields { - BKEY_FIELD_INODE, - BKEY_FIELD_OFFSET, - BKEY_FIELD_SNAPSHOT, - BKEY_FIELD_SIZE, - BKEY_FIELD_VERSION_HI, - BKEY_FIELD_VERSION_LO, - BKEY_NR_FIELDS, -}; - -#define bkey_format_field(name, field) \ - [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) - -#define BKEY_FORMAT_CURRENT \ -((struct bkey_format) { \ - .key_u64s = BKEY_U64s, \ - .nr_fields = BKEY_NR_FIELDS, \ - .bits_per_field = { \ - bkey_format_field(INODE, p.inode), \ - bkey_format_field(OFFSET, p.offset), \ - bkey_format_field(SNAPSHOT, p.snapshot), \ - bkey_format_field(SIZE, size), \ - bkey_format_field(VERSION_HI, bversion.hi), \ - bkey_format_field(VERSION_LO, bversion.lo), \ - }, \ -}) - -/* bkey with inline value */ -struct bkey_i { - __u64 _data[0]; - - struct bkey k; - struct bch_val v; -}; - -#define POS_KEY(_pos) \ -((struct bkey) { \ - .u64s = BKEY_U64s, \ - .format = KEY_FORMAT_CURRENT, \ - .p = _pos, \ -}) - -#define KEY(_inode, _offset, _size) \ -((struct bkey) { \ - .u64s = BKEY_U64s, \ - .format = KEY_FORMAT_CURRENT, \ - .p = POS(_inode, _offset), \ - .size = _size, \ -}) - -static inline void bkey_init(struct bkey *k) -{ - *k = KEY(0, 0, 0); -} - -#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) - -#define __BKEY_PADDED(key, pad) \ - struct bkey_i key; __u64 key ## _pad[pad] - -enum bch_bkey_type_flags { - BKEY_TYPE_strict_btree_checks = BIT(0), -}; - -/* - * - DELETED keys are used internally to mark keys that should be ignored but - * override keys in composition order. Their version number is ignored. - * - * - DISCARDED keys indicate that the data is all 0s because it has been - * discarded. DISCARDs may have a version; if the version is nonzero the key - * will be persistent, otherwise the key will be dropped whenever the btree - * node is rewritten (like DELETED keys). - * - * - ERROR: any read of the data returns a read error, as the data was lost due - * to a failing device. Like DISCARDED keys, they can be removed (overridden) - * by new writes or cluster-wide GC. Node repair can also overwrite them with - * the same or a more recent version number, but not with an older version - * number. 
- * - * - WHITEOUT: for hash table btrees - */ -#define BCH_BKEY_TYPES() \ - x(deleted, 0, 0) \ - x(whiteout, 1, 0) \ - x(error, 2, 0) \ - x(cookie, 3, 0) \ - x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \ - x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \ - x(extent, 6, BKEY_TYPE_strict_btree_checks) \ - x(reservation, 7, BKEY_TYPE_strict_btree_checks) \ - x(inode, 8, BKEY_TYPE_strict_btree_checks) \ - x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \ - x(dirent, 10, BKEY_TYPE_strict_btree_checks) \ - x(xattr, 11, BKEY_TYPE_strict_btree_checks) \ - x(alloc, 12, BKEY_TYPE_strict_btree_checks) \ - x(quota, 13, BKEY_TYPE_strict_btree_checks) \ - x(stripe, 14, BKEY_TYPE_strict_btree_checks) \ - x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \ - x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \ - x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \ - x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \ - x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \ - x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \ - x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \ - x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \ - x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \ - x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \ - x(set, 25, 0) \ - x(lru, 26, BKEY_TYPE_strict_btree_checks) \ - x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \ - x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \ - x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \ - x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \ - x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \ - x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \ - x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \ - x(accounting, 34, BKEY_TYPE_strict_btree_checks) \ - x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) - -enum bch_bkey_type { -#define x(name, nr, ...) 
KEY_TYPE_##name = nr, - BCH_BKEY_TYPES() -#undef x - KEY_TYPE_MAX, -}; - -struct bch_deleted { - struct bch_val v; -}; - -struct bch_whiteout { - struct bch_val v; -}; - -struct bch_error { - struct bch_val v; -}; - -struct bch_cookie { - struct bch_val v; - __le64 cookie; -}; - -struct bch_hash_whiteout { - struct bch_val v; -}; - -struct bch_set { - struct bch_val v; -}; - -/* 128 bits, sufficient for cryptographic MACs: */ -struct bch_csum { - __le64 lo; - __le64 hi; -} __packed __aligned(8); - -struct bch_backpointer { - struct bch_val v; - __u8 btree_id; - __u8 level; - __u8 data_type; - __u8 bucket_gen; - __u32 pad; - __u32 bucket_len; - struct bpos pos; -} __packed __aligned(8); - -/* Optional/variable size superblock sections: */ - -struct bch_sb_field { - __u64 _data[0]; - __le32 u64s; - __le32 type; -}; - -#define BCH_SB_FIELDS() \ - x(journal, 0) \ - x(members_v1, 1) \ - x(crypt, 2) \ - x(replicas_v0, 3) \ - x(quota, 4) \ - x(disk_groups, 5) \ - x(clean, 6) \ - x(replicas, 7) \ - x(journal_seq_blacklist, 8) \ - x(journal_v2, 9) \ - x(counters, 10) \ - x(members_v2, 11) \ - x(errors, 12) \ - x(ext, 13) \ - x(downgrade, 14) \ - x(recovery_passes, 15) - -#include "alloc_background_format.h" -#include "dirent_format.h" -#include "disk_accounting_format.h" -#include "disk_groups_format.h" -#include "extents_format.h" -#include "ec_format.h" -#include "inode_format.h" -#include "journal_seq_blacklist_format.h" -#include "logged_ops_format.h" -#include "lru_format.h" -#include "quota_format.h" -#include "recovery_passes_format.h" -#include "reflink_format.h" -#include "replicas_format.h" -#include "snapshot_format.h" -#include "subvolume_format.h" -#include "sb-counters_format.h" -#include "sb-downgrade_format.h" -#include "sb-errors_format.h" -#include "sb-members_format.h" -#include "xattr_format.h" - -enum bch_sb_field_type { -#define x(f, nr) BCH_SB_FIELD_##f = nr, - BCH_SB_FIELDS() -#undef x - BCH_SB_FIELD_NR -}; - -/* - * Most superblock fields are replicated in all device's superblocks - a few are - * not: - */ -#define BCH_SINGLE_DEVICE_SB_FIELDS \ - ((1U << BCH_SB_FIELD_journal)| \ - (1U << BCH_SB_FIELD_journal_v2)) - -/* BCH_SB_FIELD_journal: */ - -struct bch_sb_field_journal { - struct bch_sb_field field; - __le64 buckets[]; -}; - -struct bch_sb_field_journal_v2 { - struct bch_sb_field field; - - struct bch_sb_field_journal_v2_entry { - __le64 start; - __le64 nr; - } d[]; -}; - -/* BCH_SB_FIELD_crypt: */ - -struct nonce { - __le32 d[4]; -}; - -struct bch_key { - __le64 key[4]; -}; - -#define BCH_KEY_MAGIC \ - (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \ - ((__u64) 'h' << 16)|((__u64) '*' << 24)| \ - ((__u64) '*' << 32)|((__u64) 'k' << 40)| \ - ((__u64) 'e' << 48)|((__u64) 'y' << 56)) - -struct bch_encrypted_key { - __le64 magic; - struct bch_key key; -}; - -/* - * If this field is present in the superblock, it stores an encryption key which - * is used encrypt all other data/metadata. The key will normally be encrypted - * with the key userspace provides, but if encryption has been turned off we'll - * just store the master key unencrypted in the superblock so we can access the - * previously encrypted data. 
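To illustrate the scheme: after decrypting struct bch_encrypted_key with a key derived from the user's passphrase, checking the magic tells us whether the passphrase was correct. A hedged sketch only - the decryption step and the kernel's actual helper are omitted:

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    /*
     * BCH_KEY_MAGIC is the ASCII bytes "bch**key" packed into a u64;
     * the memcpy comparison below assumes a little endian host.
     */
    static bool key_magic_ok(uint64_t decrypted_magic)
    {
        uint64_t want;

        memcpy(&want, "bch**key", sizeof(want));
        return decrypted_magic == want;
    }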
- */ -struct bch_sb_field_crypt { - struct bch_sb_field field; - - __le64 flags; - __le64 kdf_flags; - struct bch_encrypted_key key; -}; - -LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); - -enum bch_kdf_types { - BCH_KDF_SCRYPT = 0, - BCH_KDF_NR = 1, -}; - -/* stored as base 2 log of scrypt params: */ -LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); -LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); -LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); - -/* - * On clean shutdown, store btree roots and current journal sequence number in - * the superblock: - */ -struct jset_entry { - __le16 u64s; - __u8 btree_id; - __u8 level; - __u8 type; /* designates what this jset holds */ - __u8 pad[3]; - - struct bkey_i start[0]; - __u64 _data[]; -}; - -struct bch_sb_field_clean { - struct bch_sb_field field; - - __le32 flags; - __le16 _read_clock; /* no longer used */ - __le16 _write_clock; - __le64 journal_seq; - - struct jset_entry start[0]; - __u64 _data[]; -}; - -struct bch_sb_field_ext { - struct bch_sb_field field; - __le64 recovery_passes_required[2]; - __le64 errors_silent[8]; - __le64 btrees_lost_data; -}; - -/* Superblock: */ - -/* - * New versioning scheme: - * One common version number for all on disk data structures - superblock, btree - * nodes, journal entries - */ -#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10)) -#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) -#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) - -/* - * field 1: version name - * field 2: BCH_VERSION(major, minor) - * field 3: recovery passess required on upgrade - */ -#define BCH_METADATA_VERSIONS() \ - x(bkey_renumber, BCH_VERSION(0, 10)) \ - x(inode_btree_change, BCH_VERSION(0, 11)) \ - x(snapshot, BCH_VERSION(0, 12)) \ - x(inode_backpointers, BCH_VERSION(0, 13)) \ - x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \ - x(snapshot_2, BCH_VERSION(0, 15)) \ - x(reflink_p_fix, BCH_VERSION(0, 16)) \ - x(subvol_dirent, BCH_VERSION(0, 17)) \ - x(inode_v2, BCH_VERSION(0, 18)) \ - x(freespace, BCH_VERSION(0, 19)) \ - x(alloc_v4, BCH_VERSION(0, 20)) \ - x(new_data_types, BCH_VERSION(0, 21)) \ - x(backpointers, BCH_VERSION(0, 22)) \ - x(inode_v3, BCH_VERSION(0, 23)) \ - x(unwritten_extents, BCH_VERSION(0, 24)) \ - x(bucket_gens, BCH_VERSION(0, 25)) \ - x(lru_v2, BCH_VERSION(0, 26)) \ - x(fragmentation_lru, BCH_VERSION(0, 27)) \ - x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \ - x(snapshot_trees, BCH_VERSION(0, 29)) \ - x(major_minor, BCH_VERSION(1, 0)) \ - x(snapshot_skiplists, BCH_VERSION(1, 1)) \ - x(deleted_inodes, BCH_VERSION(1, 2)) \ - x(rebalance_work, BCH_VERSION(1, 3)) \ - x(member_seq, BCH_VERSION(1, 4)) \ - x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ - x(btree_subvolume_children, BCH_VERSION(1, 6)) \ - x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ - x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ - x(disk_accounting_v2, BCH_VERSION(1, 9)) \ - x(disk_accounting_v3, BCH_VERSION(1, 10)) \ - x(disk_accounting_inum, BCH_VERSION(1, 11)) \ - x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ - x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ - x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ - x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ - x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ - x(inode_depth, BCH_VERSION(1, 17)) \ - x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ - x(autofix_errors, BCH_VERSION(1, 19)) \ - x(directory_size, BCH_VERSION(1, 20)) \ - 
x(cached_backpointers, BCH_VERSION(1, 21)) \ - x(stripe_backpointers, BCH_VERSION(1, 22)) \ - x(stripe_lru, BCH_VERSION(1, 23)) \ - x(casefolding, BCH_VERSION(1, 24)) \ - x(extent_flags, BCH_VERSION(1, 25)) \ - x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ - x(fast_device_removal, BCH_VERSION(1, 27)) \ - x(inode_has_case_insensitive, BCH_VERSION(1, 28)) - -enum bcachefs_metadata_version { - bcachefs_metadata_version_min = 9, -#define x(t, n) bcachefs_metadata_version_##t = n, - BCH_METADATA_VERSIONS() -#undef x - bcachefs_metadata_version_max -}; - -static const __maybe_unused -unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work; - -#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) - -#define BCH_SB_SECTOR 8 - -#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */ - -struct bch_sb_layout { - __uuid_t magic; /* bcachefs superblock UUID */ - __u8 layout_type; - __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ - __u8 nr_superblocks; - __u8 pad[5]; - __le64 sb_offset[61]; -} __packed __aligned(8); - -#define BCH_SB_LAYOUT_SECTOR 7 - -/* - * @offset - sector where this sb was written - * @version - on disk format version - * @version_min - Oldest metadata version this filesystem contains; so we can - * safely drop compatibility code and refuse to mount filesystems - * we'd need it for - * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC) - * @seq - incremented each time superblock is written - * @uuid - used for generating various magic numbers and identifying - * member devices, never changes - * @user_uuid - user visible UUID, may be changed - * @label - filesystem label - * @seq - identifies most recent superblock, incremented each time - * superblock is written - * @features - enabled incompatible features - */ -struct bch_sb { - struct bch_csum csum; - __le16 version; - __le16 version_min; - __le16 pad[2]; - __uuid_t magic; - __uuid_t uuid; - __uuid_t user_uuid; - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 offset; - __le64 seq; - - __le16 block_size; - __u8 dev_idx; - __u8 nr_devices; - __le32 u64s; - - __le64 time_base_lo; - __le32 time_base_hi; - __le32 time_precision; - - __le64 flags[7]; - __le64 write_time; - __le64 features[2]; - __le64 compat[2]; - - struct bch_sb_layout layout; - - struct bch_sb_field start[0]; - __le64 _data[]; -} __packed __aligned(8); - -/* - * Flags: - * BCH_SB_INITALIZED - set on first mount - * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect - * behaviour of mount/recovery path: - * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits - * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 - * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides - * DATA/META_CSUM_TYPE. 
Also indicates encryption - * algorithm in use, if/when we get more than one - */ - -LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); - -LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); -LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); -LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); -LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); - -LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); - -LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); -LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); - -LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); -LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); - -LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); - -LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); -LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); -LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); -LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); - -LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); -LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); - -LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); -LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS, - struct bch_sb, flags[0], 63, 64); - -LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); -LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); - -LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); -LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); - -/* - * Max size of an extent that may require bouncing to read or write - * (checksummed, compressed): 64k - */ -LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, - struct bch_sb, flags[1], 14, 20); - -LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); - -LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); -LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); -LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); - -LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO, - struct bch_sb, flags[2], 0, 4); -LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); - -LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); -LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); -LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); -LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); -LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); -LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); -LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64); -LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); -LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); -LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); -LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); -LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56); - -LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60); 
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, - struct bch_sb, flags[4], 60, 64); - -LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, - struct bch_sb, flags[5], 0, 16); -LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, - struct bch_sb, flags[5], 16, 32); -LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); -LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, - struct bch_sb, flags[5], 48, 64); -LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); -LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); -LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); -LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); -LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); -LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24); - -static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) -{ - return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4); -} - -static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) -{ - SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v); - SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4); -} - -static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb) -{ - return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) | - (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4); -} - -static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) -{ - SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v); - SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4); -} - -/* - * Features: - * - * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist - * reflink: gates KEY_TYPE_reflink - * inline_data: gates KEY_TYPE_inline_data - * new_siphash: gates BCH_STR_HASH_siphash - * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE - */ -#define BCH_SB_FEATURES() \ - x(lz4, 0) \ - x(gzip, 1) \ - x(zstd, 2) \ - x(atomic_nlink, 3) \ - x(ec, 4) \ - x(journal_seq_blacklist_v3, 5) \ - x(reflink, 6) \ - x(new_siphash, 7) \ - x(inline_data, 8) \ - x(new_extent_overwrite, 9) \ - x(incompressible, 10) \ - x(btree_ptr_v2, 11) \ - x(extents_above_btree_updates, 12) \ - x(btree_updates_journalled, 13) \ - x(reflink_inline_data, 14) \ - x(new_varint, 15) \ - x(journal_no_flush, 16) \ - x(alloc_v2, 17) \ - x(extents_across_btree_nodes, 18) \ - x(incompat_version_field, 19) \ - x(casefolding, 20) \ - x(no_alloc_info, 21) \ - x(small_image, 22) - -#define BCH_SB_FEATURES_ALWAYS \ - (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ - BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\ - BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\ - BIT_ULL(BCH_FEATURE_alloc_v2)|\ - BIT_ULL(BCH_FEATURE_extents_across_btree_nodes)) - -#define BCH_SB_FEATURES_ALL \ - (BCH_SB_FEATURES_ALWAYS| \ - BIT_ULL(BCH_FEATURE_new_siphash)| \ - BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ - BIT_ULL(BCH_FEATURE_new_varint)| \ - BIT_ULL(BCH_FEATURE_journal_no_flush)| \ - BIT_ULL(BCH_FEATURE_incompat_version_field)) - -enum bch_sb_feature { -#define x(f, n) BCH_FEATURE_##f, - BCH_SB_FEATURES() -#undef x - BCH_FEATURE_NR, -}; - -#define BCH_SB_COMPAT() \ - x(alloc_info, 0) \ - x(alloc_metadata, 1) \ - x(extents_above_btree_updates_done, 2) \ - x(bformat_overflow_done, 3) - -enum bch_sb_compat { -#define x(f, n) BCH_COMPAT_##f, - BCH_SB_COMPAT() -#undef x - BCH_COMPAT_NR, -}; - -/* options: */ - -#define BCH_VERSION_UPGRADE_OPTS() \ - x(compatible, 0) \ - x(incompatible, 1) \ - x(none, 2) - -enum bch_version_upgrade_opts { -#define x(t, n) 
BCH_VERSION_UPGRADE_##t = n, - BCH_VERSION_UPGRADE_OPTS() -#undef x -}; - -#define BCH_REPLICAS_MAX 4U - -#define BCH_BKEY_PTRS_MAX 16U - -#define BCH_ERROR_ACTIONS() \ - x(continue, 0) \ - x(fix_safe, 1) \ - x(panic, 2) \ - x(ro, 3) - -enum bch_error_actions { -#define x(t, n) BCH_ON_ERROR_##t = n, - BCH_ERROR_ACTIONS() -#undef x - BCH_ON_ERROR_NR -}; - -#define BCH_DEGRADED_ACTIONS() \ - x(ask, 0) \ - x(yes, 1) \ - x(very, 2) \ - x(no, 3) - -enum bch_degraded_actions { -#define x(t, n) BCH_DEGRADED_##t = n, - BCH_DEGRADED_ACTIONS() -#undef x - BCH_DEGRADED_ACTIONS_NR -}; - -#define BCH_STR_HASH_TYPES() \ - x(crc32c, 0) \ - x(crc64, 1) \ - x(siphash_old, 2) \ - x(siphash, 3) - -enum bch_str_hash_type { -#define x(t, n) BCH_STR_HASH_##t = n, - BCH_STR_HASH_TYPES() -#undef x - BCH_STR_HASH_NR -}; - -#define BCH_STR_HASH_OPTS() \ - x(crc32c, 0) \ - x(crc64, 1) \ - x(siphash, 2) - -enum bch_str_hash_opts { -#define x(t, n) BCH_STR_HASH_OPT_##t = n, - BCH_STR_HASH_OPTS() -#undef x - BCH_STR_HASH_OPT_NR -}; - -#define BCH_CSUM_TYPES() \ - x(none, 0) \ - x(crc32c_nonzero, 1) \ - x(crc64_nonzero, 2) \ - x(chacha20_poly1305_80, 3) \ - x(chacha20_poly1305_128, 4) \ - x(crc32c, 5) \ - x(crc64, 6) \ - x(xxhash, 7) - -enum bch_csum_type { -#define x(t, n) BCH_CSUM_##t = n, - BCH_CSUM_TYPES() -#undef x - BCH_CSUM_NR -}; - -static const __maybe_unused unsigned bch_crc_bytes[] = { - [BCH_CSUM_none] = 0, - [BCH_CSUM_crc32c_nonzero] = 4, - [BCH_CSUM_crc32c] = 4, - [BCH_CSUM_crc64_nonzero] = 8, - [BCH_CSUM_crc64] = 8, - [BCH_CSUM_xxhash] = 8, - [BCH_CSUM_chacha20_poly1305_80] = 10, - [BCH_CSUM_chacha20_poly1305_128] = 16, -}; - -static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) -{ - switch (type) { - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: - return true; - default: - return false; - } -} - -#define BCH_CSUM_OPTS() \ - x(none, 0) \ - x(crc32c, 1) \ - x(crc64, 2) \ - x(xxhash, 3) - -enum bch_csum_opt { -#define x(t, n) BCH_CSUM_OPT_##t = n, - BCH_CSUM_OPTS() -#undef x - BCH_CSUM_OPT_NR -}; - -#define BCH_COMPRESSION_TYPES() \ - x(none, 0) \ - x(lz4_old, 1) \ - x(gzip, 2) \ - x(lz4, 3) \ - x(zstd, 4) \ - x(incompressible, 5) - -enum bch_compression_type { -#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, - BCH_COMPRESSION_TYPES() -#undef x - BCH_COMPRESSION_TYPE_NR -}; - -#define BCH_COMPRESSION_OPTS() \ - x(none, 0) \ - x(lz4, 1) \ - x(gzip, 2) \ - x(zstd, 3) - -enum bch_compression_opts { -#define x(t, n) BCH_COMPRESSION_OPT_##t = n, - BCH_COMPRESSION_OPTS() -#undef x - BCH_COMPRESSION_OPT_NR -}; - -/* - * Magic numbers - * - * The various other data structures have their own magic numbers, which are - * xored with the first part of the cache set's UUID - */ - -#define BCACHE_MAGIC \ - UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \ - 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) -#define BCHFS_MAGIC \ - UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ - 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) - -#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC - -#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) -#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) - -static inline __le64 __bch2_sb_magic(struct bch_sb *sb) -{ - __le64 ret; - - memcpy(&ret, &sb->uuid, sizeof(ret)); - return ret; -} - -static inline __u64 __jset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); -} - -static inline __u64 __bset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); -} - -/* Journal */ - -#define 
JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
-
-#define BCH_JSET_ENTRY_TYPES() \
- x(btree_keys, 0) \
- x(btree_root, 1) \
- x(prio_ptrs, 2) \
- x(blacklist, 3) \
- x(blacklist_v2, 4) \
- x(usage, 5) \
- x(data_usage, 6) \
- x(clock, 7) \
- x(dev_usage, 8) \
- x(log, 9) \
- x(overwrite, 10) \
- x(write_buffer_keys, 11) \
- x(datetime, 12) \
- x(log_bkey, 13)
-
-enum bch_jset_entry_type {
-#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
- BCH_JSET_ENTRY_TYPES()
-#undef x
- BCH_JSET_ENTRY_NR
-};
-
-static inline bool jset_entry_is_key(struct jset_entry *e)
-{
- switch (e->type) {
- case BCH_JSET_ENTRY_btree_keys:
- case BCH_JSET_ENTRY_btree_root:
- case BCH_JSET_ENTRY_write_buffer_keys:
- return true;
- }
-
- return false;
-}
-
-/*
- * Journal sequence numbers can be blacklisted: bsets record the max sequence
- * number of all the journal entries they contain updates for, so that on
- * recovery we can ignore those bsets that contain index updates newer than what
- * made it into the journal.
- *
- * This means that we can't reuse that journal_seq - we have to skip it, and
- * then record that we skipped it so that the next time we crash and recover we
- * don't think there was a missing journal entry.
- */
-struct jset_entry_blacklist {
- struct jset_entry entry;
- __le64 seq;
-};
-
-struct jset_entry_blacklist_v2 {
- struct jset_entry entry;
- __le64 start;
- __le64 end;
-};
-
-#define BCH_FS_USAGE_TYPES() \
- x(reserved, 0) \
- x(inodes, 1) \
- x(key_version, 2)
-
-enum bch_fs_usage_type {
-#define x(f, nr) BCH_FS_USAGE_##f = nr,
- BCH_FS_USAGE_TYPES()
-#undef x
- BCH_FS_USAGE_NR
-};
-
-struct jset_entry_usage {
- struct jset_entry entry;
- __le64 v;
-} __packed;
-
-struct jset_entry_data_usage {
- struct jset_entry entry;
- __le64 v;
- struct bch_replicas_entry_v1 r;
-} __packed;
-
-struct jset_entry_clock {
- struct jset_entry entry;
- __u8 rw;
- __u8 pad[7];
- __le64 time;
-} __packed;
-
-struct jset_entry_dev_usage_type {
- __le64 buckets;
- __le64 sectors;
- __le64 fragmented;
-} __packed;
-
-struct jset_entry_dev_usage {
- struct jset_entry entry;
- __le32 dev;
- __u32 pad;
-
- __le64 _buckets_ec; /* No longer used */
- __le64 _buckets_unavailable; /* No longer used */
-
- struct jset_entry_dev_usage_type d[];
-};
-
-static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
-{
- return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
- sizeof(struct jset_entry_dev_usage_type);
-}
-
-struct jset_entry_log {
- struct jset_entry entry;
- u8 d[];
-} __packed __aligned(8);
-
-static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
-{
- unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);
-
- while (b && !l->d[b - 1])
- --b;
- return b;
-}
-
-struct jset_entry_datetime {
- struct jset_entry entry;
- __le64 seconds;
-} __packed __aligned(8);
-
-/*
- * On disk format for a journal entry:
- * seq is monotonically increasing; every journal entry has its own unique
- * sequence number.
- *
- * last_seq is the oldest journal entry that still has keys the btree hasn't
- * flushed to disk yet.
- *
- * version is for on disk format changes.
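A sketch of how the blacklist is meant to be applied at recovery time (illustrative names, not the code in journal_seq_blacklist.c):

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * A bset whose journal_seq is newer than anything actually recovered
     * from the journal never made it into the journal, and a blacklisted
     * seq was deliberately skipped - either way the bset's updates must
     * be ignored.
     */
    static bool bset_usable(uint64_t bset_journal_seq,
                            uint64_t newest_journal_seq,
                            const uint64_t *blacklist, unsigned nr)
    {
        if (bset_journal_seq > newest_journal_seq)
            return false;

        for (unsigned i = 0; i < nr; i++)
            if (bset_journal_seq == blacklist[i])
                return false;

        return true;
    }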
- */ -struct jset { - struct bch_csum csum; - - __le64 magic; - __le64 seq; - __le32 version; - __le32 flags; - - __le32 u64s; /* size of d[] in u64s */ - - __u8 encrypted_start[0]; - - __le16 _read_clock; /* no longer used */ - __le16 _write_clock; - - /* Sequence number of oldest dirty journal entry */ - __le64 last_seq; - - - struct jset_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); -LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); -LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); - -#define BCH_JOURNAL_BUCKETS_MIN 8 - -/* Btree: */ - -enum btree_id_flags { - BTREE_IS_extents = BIT(0), - BTREE_IS_snapshots = BIT(1), - BTREE_IS_snapshot_field = BIT(2), - BTREE_IS_data = BIT(3), - BTREE_IS_write_buffer = BIT(4), -}; - -#define BCH_BTREE_IDS() \ - x(extents, 0, \ - BTREE_IS_extents| \ - BTREE_IS_snapshots| \ - BTREE_IS_data, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_error)| \ - BIT_ULL(KEY_TYPE_cookie)| \ - BIT_ULL(KEY_TYPE_extent)| \ - BIT_ULL(KEY_TYPE_reservation)| \ - BIT_ULL(KEY_TYPE_reflink_p)| \ - BIT_ULL(KEY_TYPE_inline_data)) \ - x(inodes, 1, \ - BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_inode)| \ - BIT_ULL(KEY_TYPE_inode_v2)| \ - BIT_ULL(KEY_TYPE_inode_v3)| \ - BIT_ULL(KEY_TYPE_inode_generation)) \ - x(dirents, 2, \ - BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_hash_whiteout)| \ - BIT_ULL(KEY_TYPE_dirent)) \ - x(xattrs, 3, \ - BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_cookie)| \ - BIT_ULL(KEY_TYPE_hash_whiteout)| \ - BIT_ULL(KEY_TYPE_xattr)) \ - x(alloc, 4, 0, \ - BIT_ULL(KEY_TYPE_alloc)| \ - BIT_ULL(KEY_TYPE_alloc_v2)| \ - BIT_ULL(KEY_TYPE_alloc_v3)| \ - BIT_ULL(KEY_TYPE_alloc_v4)) \ - x(quotas, 5, 0, \ - BIT_ULL(KEY_TYPE_quota)) \ - x(stripes, 6, 0, \ - BIT_ULL(KEY_TYPE_stripe)) \ - x(reflink, 7, \ - BTREE_IS_extents| \ - BTREE_IS_data, \ - BIT_ULL(KEY_TYPE_reflink_v)| \ - BIT_ULL(KEY_TYPE_indirect_inline_data)| \ - BIT_ULL(KEY_TYPE_error)) \ - x(subvolumes, 8, 0, \ - BIT_ULL(KEY_TYPE_subvolume)) \ - x(snapshots, 9, 0, \ - BIT_ULL(KEY_TYPE_snapshot)) \ - x(lru, 10, \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)) \ - x(freespace, 11, \ - BTREE_IS_extents, \ - BIT_ULL(KEY_TYPE_set)) \ - x(need_discard, 12, 0, \ - BIT_ULL(KEY_TYPE_set)) \ - x(backpointers, 13, \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_backpointer)) \ - x(bucket_gens, 14, 0, \ - BIT_ULL(KEY_TYPE_bucket_gens)) \ - x(snapshot_trees, 15, 0, \ - BIT_ULL(KEY_TYPE_snapshot_tree)) \ - x(deleted_inodes, 16, \ - BTREE_IS_snapshot_field| \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)) \ - x(logged_ops, 17, 0, \ - BIT_ULL(KEY_TYPE_logged_op_truncate)| \ - BIT_ULL(KEY_TYPE_logged_op_finsert)| \ - BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ - x(rebalance_work, 18, \ - BTREE_IS_snapshot_field| \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ - x(subvolume_children, 19, 0, \ - BIT_ULL(KEY_TYPE_set)) \ - x(accounting, 20, \ - BTREE_IS_snapshot_field| \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_accounting)) \ - -enum btree_id { -#define x(name, nr, ...) 
BTREE_ID_##name = nr, - BCH_BTREE_IDS() -#undef x - BTREE_ID_NR -}; - -/* - * Maximum number of btrees that we will _ever_ have under the current scheme, - * where we refer to them with 64 bit bitfields - and we also need a bit for - * the interior btree node type: - */ -#define BTREE_ID_NR_MAX 63 - -static inline bool btree_id_is_alloc(enum btree_id id) -{ - switch (id) { - case BTREE_ID_alloc: - case BTREE_ID_backpointers: - case BTREE_ID_need_discard: - case BTREE_ID_freespace: - case BTREE_ID_bucket_gens: - case BTREE_ID_lru: - case BTREE_ID_accounting: - return true; - default: - return false; - } -} - -#define BTREE_MAX_DEPTH 4U - -/* Btree nodes */ - -/* - * Btree nodes - * - * On disk a btree node is a list/log of these; within each set the keys are - * sorted - */ -struct bset { - __le64 seq; - - /* - * Highest journal entry this bset contains keys for. - * If on recovery we don't see that journal entry, this bset is ignored: - * this allows us to preserve the order of all index updates after a - * crash, since the journal records a total order of all index updates - * and anything that didn't make it to the journal doesn't get used. - */ - __le64 journal_seq; - - __le32 flags; - __le16 version; - __le16 u64s; /* count of d[] in u64s */ - - struct bkey_packed start[0]; - __u64 _data[]; -} __packed __aligned(8); - -LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); - -LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); -LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, - struct bset, flags, 5, 6); - -/* Sector offset within the btree node: */ -LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); - -struct btree_node { - struct bch_csum csum; - __le64 magic; - - /* this flags field is encrypted, unlike bset->flags: */ - __le64 flags; - - /* Closed interval: */ - struct bpos min_key; - struct bpos max_key; - struct bch_extent_ptr _ptr; /* not used anymore */ - struct bkey_format format; - - union { - struct bset keys; - struct { - __u8 pad[22]; - __le16 u64s; - __u64 _data[0]; - - }; - }; -} __packed __aligned(8); - -LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4); -LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); -LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, - struct btree_node, flags, 8, 9); -LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25); -/* 25-32 unused */ -LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); - -static inline __u64 BTREE_NODE_ID(struct btree_node *n) -{ - return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4); -} - -static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v) -{ - SET_BTREE_NODE_ID_LO(n, v); - SET_BTREE_NODE_ID_HI(n, v >> 4); -} - -struct btree_node_entry { - struct bch_csum csum; - - union { - struct bset keys; - struct { - __u8 pad[22]; - __le16 u64s; - __u64 _data[0]; - }; - }; -} __packed __aligned(8); - -#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h deleted file mode 100644 index 52594e925eb7ea..00000000000000 --- a/fs/bcachefs/bcachefs_ioctl.h +++ /dev/null @@ -1,473 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IOCTL_H -#define _BCACHEFS_IOCTL_H - -#include -#include -#include "bcachefs_format.h" -#include "bkey_types.h" - -/* - * Flags common to multiple ioctls: - */ -#define BCH_FORCE_IF_DATA_LOST (1 << 0) -#define BCH_FORCE_IF_METADATA_LOST (1 << 1) -#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) -#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) - -#define BCH_FORCE_IF_LOST \ - 
(BCH_FORCE_IF_DATA_LOST| \
- BCH_FORCE_IF_METADATA_LOST)
-#define BCH_FORCE_IF_DEGRADED \
- (BCH_FORCE_IF_DATA_DEGRADED| \
- BCH_FORCE_IF_METADATA_DEGRADED)
-
-/*
- * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
- * (e.g. /dev/sda1); if set, the dev field is the device's index within the
- * filesystem:
- */
-#define BCH_BY_INDEX (1 << 4)
-
-/*
- * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
- * wide superblock:
- */
-#define BCH_READ_DEV (1 << 5)
-
-/* global control dev: */
-
-/* These are currently broken, and probably unnecessary: */
-#if 0
-#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
-#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
-
-struct bch_ioctl_assemble {
- __u32 flags;
- __u32 nr_devs;
- __u64 pad;
- __u64 devs[];
-};
-
-struct bch_ioctl_incremental {
- __u32 flags;
- __u64 pad;
- __u64 dev;
-};
-#endif
-
-/* filesystem ioctls: */
-
-#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
-
-/* These only make sense when we also have incremental assembly */
-#if 0
-#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
-#define BCH_IOCTL_STOP _IO(0xbc, 3)
-#endif
-
-#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
-#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
-#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
-#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
-#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
-#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
-#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
-
-#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
-#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
-
-#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
-
-#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
-#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
-#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
-#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
-
-/* ioctls below act on a particular file, not the filesystem as a whole: */
-
-#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
-
-/*
- * BCH_IOCTL_QUERY_UUID: get filesystem UUID
- *
- * Returns user visible UUID, not internal UUID (which may not ever be changed);
- * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
- * this UUID.
- */
-struct bch_ioctl_query_uuid {
- __uuid_t uuid;
-};
-
-#if 0
-struct bch_ioctl_start {
- __u32 flags;
- __u32 pad;
-};
-#endif
-
-/*
- * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
- *
- * The specified device must not be open or in use. On success, the new device
- * will be an online member of the filesystem just like any other member.
- *
- * The device must first be prepared by userspace by formatting with a bcachefs
- * superblock, which is only used for passing in superblock options/parameters
- * for that device (in struct bch_member). The new device's superblock should
- * not claim to be a member of any existing filesystem - UUIDs on it will be
- * ignored.
- */
-
-/*
- * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
- *
- * Any data present on @dev will be permanently deleted, and @dev will be
- * removed from its slot in the filesystem's list of member devices. The device
- * may be either online or offline.
- *
- * Will fail if removing @dev would leave us with insufficient read write
- * devices or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_*
- * flags are set.
- */
-
-/*
- * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
- * but is not open (e.g. because we started in degraded mode), bring it online
- *
- * all existing data on @dev will be available once the device is online,
- * exactly as if @dev was present when the filesystem was first mounted
- */
-
-/*
- * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
- * block device, without removing it from the filesystem (so it can be brought
- * back online later)
- *
- * Data present on @dev will be unavailable while @dev is offline (unless
- * replicated), but will still be intact and untouched if @dev is brought back
- * online
- *
- * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
- * leave us with insufficient read write devices or degraded/unavailable data,
- * unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-
-struct bch_ioctl_disk {
- __u32 flags;
- __u32 pad;
- __u64 dev;
-};
-
-/*
- * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
- *
- * @new_state - one of the bch_member_state states (rw, ro, failed,
- * spare)
- *
- * Will refuse to change member state if we would then have insufficient devices
- * to write to, or if it would result in degraded data (when @new_state is
- * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-struct bch_ioctl_disk_set_state {
- __u32 flags;
- __u8 new_state;
- __u8 pad[3];
- __u64 dev;
-};
-
-#define BCH_DATA_OPS() \
- x(scrub, 0) \
- x(rereplicate, 1) \
- x(migrate, 2) \
- x(rewrite_old_nodes, 3) \
- x(drop_extra_replicas, 4)
-
-enum bch_data_ops {
-#define x(t, n) BCH_DATA_OP_##t = n,
- BCH_DATA_OPS()
-#undef x
- BCH_DATA_OP_NR
-};
-
-/*
- * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
- * scrub, rereplicate, migrate).
- *
- * This ioctl kicks off a job in the background, and returns a file descriptor.
- * Reading from the file descriptor returns a struct bch_ioctl_data_event,
- * indicating current progress, and closing the file descriptor will stop the
- * job. The file descriptor is O_CLOEXEC.
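A hedged userspace sketch of that lifecycle - it assumes this header is available for inclusion, error handling is trimmed, and the start/end bounds are illustrative:

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "bcachefs_ioctl.h"    /* hypothetical install path */

    /* Kick off a background rereplicate job and poll its progress. */
    static int run_rereplicate(int fs_fd)
    {
        struct bch_ioctl_data op;
        struct bch_ioctl_data_event ev;
        int job_fd;

        memset(&op, 0, sizeof(op));
        op.op          = BCH_DATA_OP_rereplicate;
        op.start_btree = 0;
        op.end_btree   = BTREE_ID_NR;    /* bounds illustrative */
        op.start_pos   = POS_MIN;
        op.end_pos     = SPOS_MAX;

        job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &op);
        if (job_fd < 0)
            return -1;

        while (read(job_fd, &ev, sizeof(ev)) == (ssize_t) sizeof(ev) &&
               ev.type == BCH_DATA_EVENT_PROGRESS &&
               ev.p.data_type != DATA_PROGRESS_DATA_TYPE_done)
            fprintf(stderr, "\r%llu/%llu sectors",
                    (unsigned long long) ev.p.sectors_done,
                    (unsigned long long) ev.p.sectors_total);

        close(job_fd);    /* closing the fd stops the job */
        return 0;
    }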
- */ -struct bch_ioctl_data { - __u16 op; - __u8 start_btree; - __u8 end_btree; - __u32 flags; - - struct bpos start_pos; - struct bpos end_pos; - - union { - struct { - __u32 dev; - __u32 data_types; - } scrub; - struct { - __u32 dev; - __u32 pad; - } migrate; - struct { - __u64 pad[8]; - }; - }; -} __packed __aligned(8); - -enum bch_data_event { - BCH_DATA_EVENT_PROGRESS = 0, - /* XXX: add an event for reporting errors */ - BCH_DATA_EVENT_NR = 1, -}; - -enum data_progress_data_type_special { - DATA_PROGRESS_DATA_TYPE_phys = 254, - DATA_PROGRESS_DATA_TYPE_done = 255, -}; - -struct bch_ioctl_data_progress { - __u8 data_type; - __u8 btree_id; - __u8 pad[2]; - struct bpos pos; - - __u64 sectors_done; - __u64 sectors_total; - __u64 sectors_error_corrected; - __u64 sectors_error_uncorrected; -} __packed __aligned(8); - -enum bch_ioctl_data_event_ret { - BCH_IOCTL_DATA_EVENT_RET_done = 1, - BCH_IOCTL_DATA_EVENT_RET_device_offline = 2, -}; - -struct bch_ioctl_data_event { - __u8 type; - __u8 ret; - __u8 pad[6]; - union { - struct bch_ioctl_data_progress p; - __u64 pad2[15]; - }; -} __packed __aligned(8); - -struct bch_replicas_usage { - __u64 sectors; - struct bch_replicas_entry_v1 r; -} __packed; - -static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u) -{ - return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r); -} - -static inline struct bch_replicas_usage * -replicas_usage_next(struct bch_replicas_usage *u) -{ - return (void *) u + replicas_usage_bytes(u); -} - -/* Obsolete */ -/* - * BCH_IOCTL_FS_USAGE: query filesystem disk space usage - * - * Returns disk space usage broken out by data type, number of replicas, and - * by component device - * - * @replica_entries_bytes - size, in bytes, allocated for replica usage entries - * - * On success, @replica_entries_bytes will be changed to indicate the number of - * bytes actually used. - * - * Returns -ERANGE if @replica_entries_bytes was too small - */ -struct bch_ioctl_fs_usage { - __u64 capacity; - __u64 used; - __u64 online_reserved; - __u64 persistent_reserved[BCH_REPLICAS_MAX]; - - __u32 replica_entries_bytes; - __u32 pad; - - struct bch_replicas_usage replicas[]; -}; - -/* Obsolete */ -/* - * BCH_IOCTL_DEV_USAGE: query device disk space usage - * - * Returns disk space usage broken out by data type - both by buckets and - * sectors. 
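Before the device usage structures: walking the variable length replicas list returned by the (obsolete) BCH_IOCTL_FS_USAGE above is done with replicas_usage_next(). A sketch, assuming the ioctl already succeeded, the kernel updated replica_entries_bytes, and struct bch_replicas_entry_v1 (from replicas_format.h) carries its usual data_type and nr_devs fields:

    #include <stdio.h>
    #include "bcachefs_ioctl.h"    /* hypothetical install path */

    static void print_replicas(struct bch_ioctl_fs_usage *u)
    {
        struct bch_replicas_usage *r = u->replicas;
        struct bch_replicas_usage *end =
            (void *) u->replicas + u->replica_entries_bytes;

        /* Entries are variable length: step with replicas_usage_next(). */
        for (; r < end; r = replicas_usage_next(r))
            printf("data_type %u, %u devs: %llu sectors\n",
                   r->r.data_type, r->r.nr_devs,
                   (unsigned long long) r->sectors);
    }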
- */ -struct bch_ioctl_dev_usage { - __u64 dev; - __u32 flags; - __u8 state; - __u8 pad[7]; - - __u32 bucket_size; - __u64 nr_buckets; - - __u64 buckets_ec; - - struct bch_ioctl_dev_usage_type { - __u64 buckets; - __u64 sectors; - __u64 fragmented; - } d[10]; -}; - -/* Obsolete */ -struct bch_ioctl_dev_usage_v2 { - __u64 dev; - __u32 flags; - __u8 state; - __u8 nr_data_types; - __u8 pad[6]; - - __u32 bucket_size; - __u64 nr_buckets; - - struct bch_ioctl_dev_usage_type d[]; -}; - -/* - * BCH_IOCTL_READ_SUPER: read filesystem superblock - * - * Equivalent to reading the superblock directly from the block device, except - * avoids racing with the kernel writing the superblock or having to figure out - * which block device to read - * - * @sb - buffer to read into - * @size - size of userspace allocated buffer - * @dev - device to read superblock for, if BCH_READ_DEV flag is - * specified - * - * Returns -ERANGE if buffer provided is too small - */ -struct bch_ioctl_read_super { - __u32 flags; - __u32 pad; - __u64 dev; - __u64 size; - __u64 sb; -}; - -/* - * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to - * determine if disk is a (online) member - if so, returns device's index - * - * Returns -ENOENT if not found - */ -struct bch_ioctl_disk_get_idx { - __u64 dev; -}; - -/* - * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device - * - * @dev - member to resize - * @nbuckets - new number of buckets - */ -struct bch_ioctl_disk_resize { - __u32 flags; - __u32 pad; - __u64 dev; - __u64 nbuckets; -}; - -/* - * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device - * - * @dev - member to resize - * @nbuckets - new number of buckets - */ -struct bch_ioctl_disk_resize_journal { - __u32 flags; - __u32 pad; - __u64 dev; - __u64 nbuckets; -}; - -struct bch_ioctl_subvolume { - __u32 flags; - __u32 dirfd; - __u16 mode; - __u16 pad[3]; - __u64 dst_ptr; - __u64 src_ptr; -}; - -#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) -#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) - -/* - * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command, - * but with the kernel's implementation of fsck: - */ -struct bch_ioctl_fsck_offline { - __u64 flags; - __u64 opts; /* string */ - __u64 nr_devs; - __u64 devs[] __counted_by(nr_devs); -}; - -/* - * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command, - * but with the kernel's implementation of fsck: - */ -struct bch_ioctl_fsck_online { - __u64 flags; - __u64 opts; /* string */ -}; - -/* - * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting - * - * Returns disk space usage broken out by data type, number of replicas, and - * by component device - * - * @replica_entries_bytes - size, in bytes, allocated for replica usage entries - * - * On success, @replica_entries_bytes will be changed to indicate the number of - * bytes actually used. 
- * - * Returns -ERANGE if @replica_entries_bytes was too small - */ -struct bch_ioctl_query_accounting { - __u64 capacity; - __u64 used; - __u64 online_reserved; - - __u32 accounting_u64s; /* input parameter */ - __u32 accounting_types_mask; /* input parameter */ - - struct bkey_i_accounting accounting[]; -}; - -#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0) - -struct bch_ioctl_query_counters { - __u16 nr; - __u16 flags; - __u32 pad; - __u64 d[]; -}; - -#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c deleted file mode 100644 index ee823c640642b4..00000000000000 --- a/fs/bcachefs/bkey.c +++ /dev/null @@ -1,1112 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey.h" -#include "bkey_cmp.h" -#include "bkey_methods.h" -#include "bset.h" -#include "util.h" - -const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; - -void bch2_bkey_packed_to_binary_text(struct printbuf *out, - const struct bkey_format *f, - const struct bkey_packed *k) -{ - const u64 *p = high_word(f, k); - unsigned word_bits = 64 - high_bit_offset; - unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset; - u64 v = *p & (~0ULL >> high_bit_offset); - - if (!nr_key_bits) { - prt_str(out, "(empty)"); - return; - } - - while (1) { - unsigned next_key_bits = nr_key_bits; - - if (nr_key_bits < 64) { - v >>= 64 - nr_key_bits; - next_key_bits = 0; - } else { - next_key_bits -= 64; - } - - bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits)); - - if (!next_key_bits) - break; - - prt_char(out, ' '); - - p = next_word(p); - v = *p; - word_bits = 64; - nr_key_bits = next_key_bits; - } -} - -static void __bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) -{ - struct bkey tmp; - - BUG_ON(bkeyp_val_u64s(format, packed) != - bkey_val_u64s(unpacked)); - - BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); - - tmp = __bch2_bkey_unpack_key(format, packed); - - if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n", - format->key_u64s, - format->bits_per_field[0], - format->bits_per_field[1], - format->bits_per_field[2], - format->bits_per_field[3], - format->bits_per_field[4]); - - prt_printf(&buf, "compiled unpack: "); - bch2_bkey_to_text(&buf, unpacked); - prt_newline(&buf); - - prt_printf(&buf, "c unpack: "); - bch2_bkey_to_text(&buf, &tmp); - prt_newline(&buf); - - prt_printf(&buf, "compiled unpack: "); - bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, - (struct bkey_packed *) unpacked); - prt_newline(&buf); - - prt_printf(&buf, "c unpack: "); - bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, - (struct bkey_packed *) &tmp); - prt_newline(&buf); - - panic("%s", buf.buf); - } -} - -static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) -{ - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) - __bch2_bkey_pack_verify(packed, unpacked, format); -} - -struct pack_state { - const struct bkey_format *format; - unsigned bits; /* bits remaining in current word */ - u64 w; /* current word */ - u64 *p; /* pointer to next word */ -}; - -__always_inline -static struct pack_state pack_state_init(const struct bkey_format *format, - struct bkey_packed *k) -{ - u64 *p = high_word(format, k); - - return (struct pack_state) { - .format = format, - 
.bits = 64 - high_bit_offset, - .w = 0, - .p = p, - }; -} - -__always_inline -static void pack_state_finish(struct pack_state *state, - struct bkey_packed *k) -{ - EBUG_ON(state->p < k->_data); - EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s); - - *state->p = state->w; -} - -struct unpack_state { - const struct bkey_format *format; - unsigned bits; /* bits remaining in current word */ - u64 w; /* current word */ - const u64 *p; /* pointer to next word */ -}; - -__always_inline -static struct unpack_state unpack_state_init(const struct bkey_format *format, - const struct bkey_packed *k) -{ - const u64 *p = high_word(format, k); - - return (struct unpack_state) { - .format = format, - .bits = 64 - high_bit_offset, - .w = *p << high_bit_offset, - .p = p, - }; -} - -__always_inline -static u64 get_inc_field(struct unpack_state *state, unsigned field) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); - - if (bits >= state->bits) { - v = state->w >> (64 - bits); - bits -= state->bits; - - state->p = next_word(state->p); - state->w = *state->p; - state->bits = 64; - } - - /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ - v |= (state->w >> 1) >> (63 - bits); - state->w <<= bits; - state->bits -= bits; - - return v + offset; -} - -__always_inline -static void __set_inc_field(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - - if (bits) { - if (bits > state->bits) { - bits -= state->bits; - /* avoid shift by 64 if bits is 64 - bits is never 0 here: */ - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - } -} - -__always_inline -static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 offset = le64_to_cpu(state->format->field_offset[field]); - - if (v < offset) - return false; - - v -= offset; - - if (fls64(v) > bits) - return false; - - __set_inc_field(state, field, v); - return true; -} - -/* - * Note: does NOT set out->format (we don't know what it should be here!) 
- * - * Also: doesn't work on extents - it doesn't preserve the invariant that - * if k is packed bkey_start_pos(k) will successfully pack - */ -static bool bch2_bkey_transform_key(const struct bkey_format *out_f, - struct bkey_packed *out, - const struct bkey_format *in_f, - const struct bkey_packed *in) -{ - struct pack_state out_s = pack_state_init(out_f, out); - struct unpack_state in_s = unpack_state_init(in_f, in); - u64 *w = out->_data; - unsigned i; - - *w = 0; - - for (i = 0; i < BKEY_NR_FIELDS; i++) - if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) - return false; - - /* Can't happen because the val would be too big to unpack: */ - EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); - - pack_state_finish(&out_s, out); - out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; - out->needs_whiteout = in->needs_whiteout; - out->type = in->type; - - return true; -} - -bool bch2_bkey_transform(const struct bkey_format *out_f, - struct bkey_packed *out, - const struct bkey_format *in_f, - const struct bkey_packed *in) -{ - if (!bch2_bkey_transform_key(out_f, out, in_f, in)) - return false; - - memcpy_u64s((u64 *) out + out_f->key_u64s, - (u64 *) in + in_f->key_u64s, - (in->u64s - in_f->key_u64s)); - return true; -} - -struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, - const struct bkey_packed *in) -{ - struct unpack_state state = unpack_state_init(format, in); - struct bkey out; - - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->u64s < format->key_u64s); - EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); - EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); - - out.u64s = BKEY_U64s + in->u64s - format->key_u64s; - out.format = KEY_FORMAT_CURRENT; - out.needs_whiteout = in->needs_whiteout; - out.type = in->type; - out.pad[0] = 0; - -#define x(id, field) out.field = get_inc_field(&state, id); - bkey_fields() -#undef x - - return out; -} - -#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -struct bpos __bkey_unpack_pos(const struct bkey_format *format, - const struct bkey_packed *in) -{ - struct unpack_state state = unpack_state_init(format, in); - struct bpos out; - - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->u64s < format->key_u64s); - EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); - - out.inode = get_inc_field(&state, BKEY_FIELD_INODE); - out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); - out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); - - return out; -} -#endif - -/** - * bch2_bkey_pack_key -- pack just the key, not the value - * @out: packed result - * @in: key to pack - * @format: format of packed result - * - * Returns: true on success, false on failure - */ -bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, - const struct bkey_format *format) -{ - struct pack_state state = pack_state_init(format, out); - u64 *w = out->_data; - - EBUG_ON((void *) in == (void *) out); - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->format != KEY_FORMAT_CURRENT); - - *w = 0; - -#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; - bkey_fields() -#undef x - pack_state_finish(&state, out); - out->u64s = format->key_u64s + in->u64s - BKEY_U64s; - out->format = KEY_FORMAT_LOCAL_BTREE; - out->needs_whiteout = in->needs_whiteout; - out->type = in->type; - - bch2_bkey_pack_verify(out, in, format); - return true; -} - -/** - * bch2_bkey_unpack -- unpack the key and the value - * @b: btree node of @src key (for packed format) - * @dst: unpacked result - * @src: packed 
input - */ -void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, - const struct bkey_packed *src) -{ - __bkey_unpack_key(b, &dst->k, src); - - memcpy_u64s(&dst->v, - bkeyp_val(&b->format, src), - bkeyp_val_u64s(&b->format, src)); -} - -/** - * bch2_bkey_pack -- pack the key and the value - * @dst: packed result - * @src: unpacked input - * @format: format of packed result - * - * Returns: true on success, false on failure - */ -bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src, - const struct bkey_format *format) -{ - struct bkey_packed tmp; - - if (!bch2_bkey_pack_key(&tmp, &src->k, format)) - return false; - - memmove_u64s((u64 *) dst + format->key_u64s, - &src->v, - bkey_val_u64s(&src->k)); - memcpy_u64s_small(dst, &tmp, format->key_u64s); - - return true; -} - -__always_inline -static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 offset = le64_to_cpu(state->format->field_offset[field]); - bool ret = true; - - EBUG_ON(v < offset); - v -= offset; - - if (fls64(v) > bits) { - v = ~(~0ULL << bits); - ret = false; - } - - __set_inc_field(state, field, v); - return ret; -} - -static bool bkey_packed_successor(struct bkey_packed *out, - const struct btree *b, - struct bkey_packed k) -{ - const struct bkey_format *f = &b->format; - unsigned nr_key_bits = b->nr_key_bits; - unsigned first_bit, offset; - u64 *p; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); - - if (!nr_key_bits) - return false; - - *out = k; - - first_bit = high_bit_offset + nr_key_bits - 1; - p = nth_word(high_word(f, out), first_bit >> 6); - offset = 63 - (first_bit & 63); - - while (nr_key_bits) { - unsigned bits = min(64 - offset, nr_key_bits); - u64 mask = (~0ULL >> (64 - bits)) << offset; - - if ((*p & mask) != mask) { - *p += 1ULL << offset; - EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); - return true; - } - - *p &= ~mask; - p = prev_word(p); - nr_key_bits -= bits; - offset = 0; - } - - return false; -} - -static bool bkey_format_has_too_big_fields(const struct bkey_format *f) -{ - for (unsigned i = 0; i < f->nr_fields; i++) { - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) - : 0; - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) - return true; - } - - return false; -} - -/* - * Returns a packed key that compares <= in - * - * This is used in bset_search_tree(), where we need a packed pos in order to be - * able to compare against the keys in the auxiliary search tree - and it's - * legal to use a packed pos that isn't equivalent to the original pos, - * _provided_ it compares <= to the original pos. 
- */ -enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, - struct bpos in, - const struct btree *b) -{ - const struct bkey_format *f = &b->format; - struct pack_state state = pack_state_init(f, out); - u64 *w = out->_data; - struct bpos orig = in; - bool exact = true; - unsigned i; - - /* - * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3 - * byte header, but pack_pos() won't if the len/version fields are big - * enough - we need to make sure to zero them out: - */ - for (i = 0; i < f->key_u64s; i++) - w[i] = 0; - - if (unlikely(in.snapshot < - le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { - if (!in.offset-- && - !in.inode--) - return BKEY_PACK_POS_FAIL; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(in.offset < - le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { - if (!in.inode--) - return BKEY_PACK_POS_FAIL; - in.offset = KEY_OFFSET_MAX; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(in.inode < - le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) - return BKEY_PACK_POS_FAIL; - - if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) { - in.offset = KEY_OFFSET_MAX; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) { - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))) - exact = false; - - pack_state_finish(&state, out); - out->u64s = f->key_u64s; - out->format = KEY_FORMAT_LOCAL_BTREE; - out->type = KEY_TYPE_deleted; - - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - if (exact) { - BUG_ON(bkey_cmp_left_packed(b, out, &orig)); - } else { - struct bkey_packed successor; - - BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); - BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0 && - !bkey_format_has_too_big_fields(f)); - } - } - - return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; -} - -void bch2_bkey_format_init(struct bkey_format_state *s) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(s->field_min); i++) - s->field_min[i] = U64_MAX; - - for (i = 0; i < ARRAY_SIZE(s->field_max); i++) - s->field_max[i] = 0; - - /* Make sure we can store a size of 0: */ - s->field_min[BKEY_FIELD_SIZE] = 0; -} - -void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) -{ - unsigned field = 0; - - __bkey_format_add(s, field++, p.inode); - __bkey_format_add(s, field++, p.offset); - __bkey_format_add(s, field++, p.snapshot); -} - -/* - * We don't want it to be possible for the packed format to represent fields - * bigger than a u64... that will cause confusion and issues (like with - * bkey_packed_successor()) - */ -static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, - unsigned bits, u64 offset) -{ - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - - bits = min(bits, unpacked_bits); - - offset = bits == unpacked_bits ? 
0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); - - f->bits_per_field[i] = bits; - f->field_offset[i] = cpu_to_le64(offset); -} - -struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) -{ - unsigned i, bits = KEY_PACKED_BITS_START; - struct bkey_format ret = { - .nr_fields = BKEY_NR_FIELDS, - }; - - for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { - s->field_min[i] = min(s->field_min[i], s->field_max[i]); - - set_format_field(&ret, i, - fls64(s->field_max[i] - s->field_min[i]), - s->field_min[i]); - - bits += ret.bits_per_field[i]; - } - - /* allow for extent merging: */ - if (ret.bits_per_field[BKEY_FIELD_SIZE]) { - unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]); - - ret.bits_per_field[BKEY_FIELD_SIZE] += b; - bits += b; - } - - ret.key_u64s = DIV_ROUND_UP(bits, 64); - - /* if we have enough spare bits, round fields up to nearest byte */ - bits = ret.key_u64s * 64 - bits; - - for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { - unsigned r = round_up(ret.bits_per_field[i], 8) - - ret.bits_per_field[i]; - - if (r <= bits) { - set_format_field(&ret, i, - ret.bits_per_field[i] + r, - le64_to_cpu(ret.field_offset[i])); - bits -= r; - } - } - - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - struct printbuf buf = PRINTBUF; - - BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); - printbuf_exit(&buf); - } - - return ret; -} - -int bch2_bkey_format_invalid(struct bch_fs *c, - struct bkey_format *f, - enum bch_validate_flags flags, - struct printbuf *err) -{ - unsigned bits = KEY_PACKED_BITS_START; - - if (f->nr_fields != BKEY_NR_FIELDS) { - prt_printf(err, "incorrect number of fields: got %u, should be %u", - f->nr_fields, BKEY_NR_FIELDS); - return -BCH_ERR_invalid; - } - - /* - * Verify that the packed format can't represent fields larger than the - * unpacked format: - */ - for (unsigned i = 0; i < f->nr_fields; i++) { - if (bch2_bkey_format_field_overflows(f, i)) { - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - unsigned packed_bits = min(64, f->bits_per_field[i]); - u64 packed_max = packed_bits - ? 
~((~0ULL << 1) << (packed_bits - 1)) - : 0; - - prt_printf(err, "field %u too large: %llu + %llu > %llu", - i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max); - return -BCH_ERR_invalid; - } - - bits += f->bits_per_field[i]; - } - - if (f->key_u64s != DIV_ROUND_UP(bits, 64)) { - prt_printf(err, "incorrect key_u64s: got %u, should be %u", - f->key_u64s, DIV_ROUND_UP(bits, 64)); - return -BCH_ERR_invalid; - } - - return 0; -} - -void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f) -{ - prt_printf(out, "u64s %u fields ", f->key_u64s); - - for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) { - if (i) - prt_str(out, ", "); - prt_printf(out, "%u:%llu", - f->bits_per_field[i], - le64_to_cpu(f->field_offset[i])); - } -} - -/* - * Most significant differing bit - * Bits are indexed from 0 - return is [0, nr_key_bits) - */ -__pure -unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, - const struct bkey_packed *l_k, - const struct bkey_packed *r_k) -{ - const u64 *l = high_word(&b->format, l_k); - const u64 *r = high_word(&b->format, r_k); - unsigned nr_key_bits = b->nr_key_bits; - unsigned word_bits = 64 - high_bit_offset; - u64 l_v, r_v; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); - - /* for big endian, skip past header */ - l_v = *l & (~0ULL >> high_bit_offset); - r_v = *r & (~0ULL >> high_bit_offset); - - while (nr_key_bits) { - if (nr_key_bits < word_bits) { - l_v >>= word_bits - nr_key_bits; - r_v >>= word_bits - nr_key_bits; - nr_key_bits = 0; - } else { - nr_key_bits -= word_bits; - } - - if (l_v != r_v) - return fls64(l_v ^ r_v) - 1 + nr_key_bits; - - l = next_word(l); - r = next_word(r); - - l_v = *l; - r_v = *r; - word_bits = 64; - } - - return 0; -} - -/* - * First set bit - * Bits are indexed from 0 - return is [0, nr_key_bits) - */ -__pure -unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) -{ - const u64 *p = high_word(&b->format, k); - unsigned nr_key_bits = b->nr_key_bits; - unsigned ret = 0, offset; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); - - offset = nr_key_bits; - while (offset > 64) { - p = next_word(p); - offset -= 64; - } - - offset = 64 - offset; - - while (nr_key_bits) { - unsigned bits = nr_key_bits + offset < 64 - ? 
nr_key_bits - : 64 - offset; - - u64 mask = (~0ULL >> (64 - bits)) << offset; - - if (*p & mask) - return ret + __ffs64(*p & mask) - offset; - - p = prev_word(p); - nr_key_bits -= bits; - ret += bits; - offset = 0; - } - - return 0; -} - -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - -#define I(_x) (*(out)++ = (_x)) -#define I1(i0) I(i0) -#define I2(i0, i1) (I1(i0), I(i1)) -#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) -#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) -#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) - -static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, - enum bch_bkey_fields field, - unsigned dst_offset, unsigned dst_size, - bool *eax_zeroed) -{ - unsigned bits = format->bits_per_field[field]; - u64 offset = le64_to_cpu(format->field_offset[field]); - unsigned i, byte, bit_offset, align, shl, shr; - - if (!bits && !offset) { - if (!*eax_zeroed) { - /* xor eax, eax */ - I2(0x31, 0xc0); - } - - *eax_zeroed = true; - goto set_field; - } - - if (!bits) { - /* just return offset: */ - - switch (dst_size) { - case 8: - if (offset > S32_MAX) { - /* mov [rdi + dst_offset], offset */ - I3(0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - - I3(0xc7, 0x47, dst_offset + 4); - memcpy(out, (void *) &offset + 4, 4); - out += 4; - } else { - /* mov [rdi + dst_offset], offset */ - /* sign extended */ - I4(0x48, 0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - } - break; - case 4: - /* mov [rdi + dst_offset], offset */ - I3(0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - break; - default: - BUG(); - } - - return out; - } - - bit_offset = format->key_u64s * 64; - for (i = 0; i <= field; i++) - bit_offset -= format->bits_per_field[i]; - - byte = bit_offset / 8; - bit_offset -= byte * 8; - - *eax_zeroed = false; - - if (bit_offset == 0 && bits == 8) { - /* movzx eax, BYTE PTR [rsi + imm8] */ - I4(0x0f, 0xb6, 0x46, byte); - } else if (bit_offset == 0 && bits == 16) { - /* movzx eax, WORD PTR [rsi + imm8] */ - I4(0x0f, 0xb7, 0x46, byte); - } else if (bit_offset + bits <= 32) { - align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); - byte -= align; - bit_offset += align * 8; - - BUG_ON(bit_offset + bits > 32); - - /* mov eax, [rsi + imm8] */ - I3(0x8b, 0x46, byte); - - if (bit_offset) { - /* shr eax, imm8 */ - I3(0xc1, 0xe8, bit_offset); - } - - if (bit_offset + bits < 32) { - unsigned mask = ~0U >> (32 - bits); - - /* and eax, imm32 */ - I1(0x25); - memcpy(out, &mask, 4); - out += 4; - } - } else if (bit_offset + bits <= 64) { - align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); - byte -= align; - bit_offset += align * 8; - - BUG_ON(bit_offset + bits > 64); - - /* mov rax, [rsi + imm8] */ - I4(0x48, 0x8b, 0x46, byte); - - shl = 64 - bit_offset - bits; - shr = bit_offset + shl; - - if (shl) { - /* shl rax, imm8 */ - I4(0x48, 0xc1, 0xe0, shl); - } - - if (shr) { - /* shr rax, imm8 */ - I4(0x48, 0xc1, 0xe8, shr); - } - } else { - align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); - byte -= align; - bit_offset += align * 8; - - BUG_ON(bit_offset + bits > 96); - - /* mov rax, [rsi + byte] */ - I4(0x48, 0x8b, 0x46, byte); - - /* mov edx, [rsi + byte + 8] */ - I3(0x8b, 0x56, byte + 8); - - /* bits from next word: */ - shr = bit_offset + bits - 64; - BUG_ON(shr > bit_offset); - - /* shr rax, bit_offset */ - I4(0x48, 0xc1, 0xe8, shr); - - /* shl rdx, imm8 */ - I4(0x48, 0xc1, 0xe2, 64 - shr); - - /* or rax, rdx */ - I3(0x48, 0x09, 0xd0); - - shr = bit_offset - shr; - - if (shr) { - /* shr rax, 
imm8 */ - I4(0x48, 0xc1, 0xe8, shr); - } - } - - /* rax += offset: */ - if (offset > S32_MAX) { - /* mov rdx, imm64 */ - I2(0x48, 0xba); - memcpy(out, &offset, 8); - out += 8; - /* add %rdx, %rax */ - I3(0x48, 0x01, 0xd0); - } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { - /* add rax, imm32 */ - I2(0x48, 0x05); - memcpy(out, &offset, 4); - out += 4; - } else if (offset) { - /* add eax, imm32 */ - I1(0x05); - memcpy(out, &offset, 4); - out += 4; - } -set_field: - switch (dst_size) { - case 8: - /* mov [rdi + dst_offset], rax */ - I4(0x48, 0x89, 0x47, dst_offset); - break; - case 4: - /* mov [rdi + dst_offset], eax */ - I3(0x89, 0x47, dst_offset); - break; - default: - BUG(); - } - - return out; -} - -int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) -{ - bool eax_zeroed = false; - u8 *out = _out; - - /* - * rdi: dst - unpacked key - * rsi: src - packed key - */ - - /* k->u64s, k->format, k->type */ - - /* mov eax, [rsi] */ - I2(0x8b, 0x06); - - /* add eax, BKEY_U64s - format->key_u64s */ - I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); - - /* and eax, imm32: mask out k->pad: */ - I5(0x25, 0xff, 0xff, 0xff, 0); - - /* mov [rdi], eax */ - I2(0x89, 0x07); - -#define x(id, field) \ - out = compile_bkey_field(format, out, id, \ - offsetof(struct bkey, field), \ - sizeof(((struct bkey *) NULL)->field), \ - &eax_zeroed); - bkey_fields() -#undef x - - /* retq */ - I1(0xc3); - - return (void *) out - _out; -} - -#else -#endif - -__pure -int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) -{ - return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); -} - -__pure __flatten -int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, - const struct bkey_packed *l, - const struct bpos *r) -{ - return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); -} - -__pure __flatten -int bch2_bkey_cmp_packed(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed_inlined(b, l, r); -} - -__pure __flatten -int __bch2_bkey_cmp_left_packed(const struct btree *b, - const struct bkey_packed *l, - const struct bpos *r) -{ - const struct bkey *l_unpacked; - - return unlikely(l_unpacked = packed_to_bkey_c(l)) - ? bpos_cmp(l_unpacked->p, *r) - : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -} - -void bch2_bpos_swab(struct bpos *p) -{ - u8 *l = (u8 *) p; - u8 *h = ((u8 *) &p[1]) - 1; - - while (l < h) { - swap(*l, *h); - l++; - --h; - } -} - -void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) -{ - const struct bkey_format *f = bkey_packed(k) ? 
_f : &bch2_bkey_format_current; - u8 *l = k->key_start; - u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1; - - while (l < h) { - swap(*l, *h); - l++; - --h; - } -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_bkey_pack_test(void) -{ - struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); - struct bkey_packed p; - - struct bkey_format test_format = { - .key_u64s = 3, - .nr_fields = BKEY_NR_FIELDS, - .bits_per_field = { - 13, - 64, - 32, - }, - }; - - struct unpack_state in_s = - unpack_state_init(&bch2_bkey_format_current, (void *) &t); - struct pack_state out_s = pack_state_init(&test_format, &p); - unsigned i; - - for (i = 0; i < out_s.format->nr_fields; i++) { - u64 a, v = get_inc_field(&in_s, i); - - switch (i) { -#define x(id, field) case id: a = t.field; break; - bkey_fields() -#undef x - default: - BUG(); - } - - if (a != v) - panic("got %llu actual %llu i %u\n", v, a, i); - - if (!set_inc_field(&out_s, i, v)) - panic("failed at %u\n", i); - } - - BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); -} -#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h deleted file mode 100644 index 3ccd521c190ac7..00000000000000 --- a/fs/bcachefs/bkey.h +++ /dev/null @@ -1,605 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_H -#define _BCACHEFS_BKEY_H - -#include -#include "bcachefs_format.h" -#include "bkey_types.h" -#include "btree_types.h" -#include "util.h" -#include "vstructs.h" - -#if 0 - -/* - * compiled unpack functions are disabled, pending a new interface for - * dynamically allocating executable memory: - */ - -#ifdef CONFIG_X86_64 -#define HAVE_BCACHEFS_COMPILED_UNPACK 1 -#endif -#endif - -void bch2_bkey_packed_to_binary_text(struct printbuf *, - const struct bkey_format *, - const struct bkey_packed *); - -enum bkey_lr_packed { - BKEY_PACKED_BOTH, - BKEY_PACKED_RIGHT, - BKEY_PACKED_LEFT, - BKEY_PACKED_NONE, -}; - -#define bkey_lr_packed(_l, _r) \ - ((_l)->format + ((_r)->format << 1)) - -static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src) -{ - memcpy_u64s_small(dst, src, src->u64s); -} - -static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src) -{ - memcpy_u64s_small(dst, src, src->k.u64s); -} - -struct btree; - -__pure -unsigned bch2_bkey_greatest_differing_bit(const struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); -__pure -unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); - -__pure -int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, - const struct bkey_packed *, - const struct btree *); - -__pure -int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, - const struct bkey_packed *, - const struct bpos *); - -__pure -int bch2_bkey_cmp_packed(const struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); - -__pure -int __bch2_bkey_cmp_left_packed(const struct btree *, - const struct bkey_packed *, - const struct bpos *); - -static inline __pure -int bkey_cmp_left_packed(const struct btree *b, - const struct bkey_packed *l, const struct bpos *r) -{ - return __bch2_bkey_cmp_left_packed(b, l, r); -} - -/* - * The compiler generates better code when we pass bpos by ref, but it's often - * enough terribly convenient to pass it by val... 
as much as I hate c++, const - * ref would be nice here: - */ -__pure __flatten -static inline int bkey_cmp_left_packed_byval(const struct btree *b, - const struct bkey_packed *l, - struct bpos r) -{ - return bkey_cmp_left_packed(b, l, &r); -} - -static __always_inline bool bpos_eq(struct bpos l, struct bpos r) -{ - return !((l.inode ^ r.inode) | - (l.offset ^ r.offset) | - (l.snapshot ^ r.snapshot)); -} - -static __always_inline bool bpos_lt(struct bpos l, struct bpos r) -{ - return l.inode != r.inode ? l.inode < r.inode : - l.offset != r.offset ? l.offset < r.offset : - l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false; -} - -static __always_inline bool bpos_le(struct bpos l, struct bpos r) -{ - return l.inode != r.inode ? l.inode < r.inode : - l.offset != r.offset ? l.offset < r.offset : - l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true; -} - -static __always_inline bool bpos_gt(struct bpos l, struct bpos r) -{ - return bpos_lt(r, l); -} - -static __always_inline bool bpos_ge(struct bpos l, struct bpos r) -{ - return bpos_le(r, l); -} - -static __always_inline int bpos_cmp(struct bpos l, struct bpos r) -{ - return cmp_int(l.inode, r.inode) ?: - cmp_int(l.offset, r.offset) ?: - cmp_int(l.snapshot, r.snapshot); -} - -static inline struct bpos bpos_min(struct bpos l, struct bpos r) -{ - return bpos_lt(l, r) ? l : r; -} - -static inline struct bpos bpos_max(struct bpos l, struct bpos r) -{ - return bpos_gt(l, r) ? l : r; -} - -static __always_inline bool bkey_eq(struct bpos l, struct bpos r) -{ - return !((l.inode ^ r.inode) | - (l.offset ^ r.offset)); -} - -static __always_inline bool bkey_lt(struct bpos l, struct bpos r) -{ - return l.inode != r.inode - ? l.inode < r.inode - : l.offset < r.offset; -} - -static __always_inline bool bkey_le(struct bpos l, struct bpos r) -{ - return l.inode != r.inode - ? l.inode < r.inode - : l.offset <= r.offset; -} - -static __always_inline bool bkey_gt(struct bpos l, struct bpos r) -{ - return bkey_lt(r, l); -} - -static __always_inline bool bkey_ge(struct bpos l, struct bpos r) -{ - return bkey_le(r, l); -} - -static __always_inline int bkey_cmp(struct bpos l, struct bpos r) -{ - return cmp_int(l.inode, r.inode) ?: - cmp_int(l.offset, r.offset); -} - -static inline struct bpos bkey_min(struct bpos l, struct bpos r) -{ - return bkey_lt(l, r) ? l : r; -} - -static inline struct bpos bkey_max(struct bpos l, struct bpos r) -{ - return bkey_gt(l, r) ? l : r; -} - -static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) -{ - return bpos_eq(l.k->p, r.k->p) && - l.k->size == r.k->size && - bkey_bytes(l.k) == bkey_bytes(r.k) && - !memcmp(l.v, r.v, bkey_val_bytes(l.k)); -} - -void bch2_bpos_swab(struct bpos *); -void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); - -static __always_inline int bversion_cmp(struct bversion l, struct bversion r) -{ - return cmp_int(l.hi, r.hi) ?: - cmp_int(l.lo, r.lo); -} - -#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) -#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) - -static __always_inline bool bversion_zero(struct bversion v) -{ - return bversion_cmp(v, ZERO_VERSION) == 0; -} - -#ifdef CONFIG_BCACHEFS_DEBUG -/* statement expressions confusing unlikely()? 
*/ -#define bkey_packed(_k) \ - ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ - (_k)->format != KEY_FORMAT_CURRENT; }) -#else -#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) -#endif - -/* - * It's safe to treat an unpacked bkey as a packed one, but not the reverse - */ -static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) -{ - return (struct bkey_packed *) k; -} - -static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) -{ - return (const struct bkey_packed *) k; -} - -static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) -{ - return bkey_packed(k) ? NULL : (struct bkey_i *) k; -} - -static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) -{ - return bkey_packed(k) ? NULL : (const struct bkey *) k; -} - -static inline unsigned bkey_format_key_bits(const struct bkey_format *format) -{ - return format->bits_per_field[BKEY_FIELD_INODE] + - format->bits_per_field[BKEY_FIELD_OFFSET] + - format->bits_per_field[BKEY_FIELD_SNAPSHOT]; -} - -static inline struct bpos bpos_successor(struct bpos p) -{ - if (!++p.snapshot && - !++p.offset && - !++p.inode) - BUG(); - - return p; -} - -static inline struct bpos bpos_predecessor(struct bpos p) -{ - if (!p.snapshot-- && - !p.offset-- && - !p.inode--) - BUG(); - - return p; -} - -static inline struct bpos bpos_nosnap_successor(struct bpos p) -{ - p.snapshot = 0; - - if (!++p.offset && - !++p.inode) - BUG(); - - return p; -} - -static inline struct bpos bpos_nosnap_predecessor(struct bpos p) -{ - p.snapshot = 0; - - if (!p.offset-- && - !p.inode--) - BUG(); - - return p; -} - -static inline u64 bkey_start_offset(const struct bkey *k) -{ - return k->p.offset - k->size; -} - -static inline struct bpos bkey_start_pos(const struct bkey *k) -{ - return (struct bpos) { - .inode = k->p.inode, - .offset = bkey_start_offset(k), - .snapshot = k->p.snapshot, - }; -} - -/* Packed helpers */ - -static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; -} - -static inline bool bkeyp_u64s_valid(const struct bkey_format *f, - const struct bkey_packed *k) -{ - return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s); -} - -static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkeyp_key_u64s(format, k) * sizeof(u64); -} - -static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return k->u64s - bkeyp_key_u64s(format, k); -} - -static inline size_t bkeyp_val_bytes(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkeyp_val_u64s(format, k) * sizeof(u64); -} - -static inline void set_bkeyp_val_u64s(const struct bkey_format *format, - struct bkey_packed *k, unsigned val_u64s) -{ - k->u64s = bkeyp_key_u64s(format, k) + val_u64s; -} - -#define bkeyp_val(_format, _k) \ - ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k))) - -extern const struct bkey_format bch2_bkey_format_current; - -bool bch2_bkey_transform(const struct bkey_format *, - struct bkey_packed *, - const struct bkey_format *, - const struct bkey_packed *); - -struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, - const struct bkey_packed *); - -#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -struct bpos __bkey_unpack_pos(const struct bkey_format *, - const struct bkey_packed *); -#endif - -bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, - const struct bkey_format *); - -enum bkey_pack_pos_ret { - BKEY_PACK_POS_EXACT, - BKEY_PACK_POS_SMALLER, - BKEY_PACK_POS_FAIL, -}; - -enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, - const struct btree *); - -static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, - const struct btree *b) -{ - return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; -} - -void bch2_bkey_unpack(const struct btree *, struct bkey_i *, - const struct bkey_packed *); -bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, - const struct bkey_format *); - -typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); - -static inline void -__bkey_unpack_key_format_checked(const struct btree *b, - struct bkey *dst, - const struct bkey_packed *src) -{ - if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { - compiled_unpack_fn unpack_fn = b->aux_data; - unpack_fn(dst, src); - - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); - - BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); - } - } else { - *dst = __bch2_bkey_unpack_key(&b->format, src); - } -} - -static inline struct bkey -bkey_unpack_key_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ - struct bkey dst; - - __bkey_unpack_key_format_checked(b, &dst, src); - return dst; -} - -static inline void __bkey_unpack_key(const struct btree *b, - struct bkey *dst, - const struct bkey_packed *src) -{ - if (likely(bkey_packed(src))) - __bkey_unpack_key_format_checked(b, dst, src); - else - *dst = *packed_to_bkey_c(src); -} - -/** - * bkey_unpack_key -- unpack just the key, not the value - */ -static inline struct bkey bkey_unpack_key(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? 
bkey_unpack_key_format_checked(b, src) - : *packed_to_bkey_c(src); -} - -static inline struct bpos -bkey_unpack_pos_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - return bkey_unpack_key_format_checked(b, src).p; -#else - return __bkey_unpack_pos(&b->format, src); -#endif -} - -static inline struct bpos bkey_unpack_pos(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? bkey_unpack_pos_format_checked(b, src) - : packed_to_bkey_c(src)->p; -} - -/* Disassembled bkeys */ - -static inline struct bkey_s_c bkey_disassemble(const struct btree *b, - const struct bkey_packed *k, - struct bkey *u) -{ - __bkey_unpack_key(b, u, k); - - return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -} - -/* non const version: */ -static inline struct bkey_s __bkey_disassemble(const struct btree *b, - struct bkey_packed *k, - struct bkey *u) -{ - __bkey_unpack_key(b, u, k); - - return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -} - -static inline u64 bkey_field_max(const struct bkey_format *f, - enum bch_bkey_fields nr) -{ - return f->bits_per_field[nr] < 64 - ? (le64_to_cpu(f->field_offset[nr]) + - ~(~0ULL << f->bits_per_field[nr])) - : U64_MAX; -} - -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - -int bch2_compile_bkey_format(const struct bkey_format *, void *); - -#else - -static inline int bch2_compile_bkey_format(const struct bkey_format *format, - void *out) { return 0; } - -#endif - -static inline void bkey_reassemble(struct bkey_i *dst, - struct bkey_s_c src) -{ - dst->k = *src.k; - memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); -} - -/* byte order helpers */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - -static inline unsigned high_word_offset(const struct bkey_format *f) -{ - return f->key_u64s - 1; -} - -#define high_bit_offset 0 -#define nth_word(p, n) ((p) - (n)) - -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - -static inline unsigned high_word_offset(const struct bkey_format *f) -{ - return 0; -} - -#define high_bit_offset KEY_PACKED_BITS_START -#define nth_word(p, n) ((p) + (n)) - -#else -#error edit for your odd byteorder. 
-#endif - -#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f)) -#define next_word(p) nth_word(p, 1) -#define prev_word(p) nth_word(p, -1) - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_bkey_pack_test(void); -#else -static inline void bch2_bkey_pack_test(void) {} -#endif - -#define bkey_fields() \ - x(BKEY_FIELD_INODE, p.inode) \ - x(BKEY_FIELD_OFFSET, p.offset) \ - x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ - x(BKEY_FIELD_SIZE, size) \ - x(BKEY_FIELD_VERSION_HI, bversion.hi) \ - x(BKEY_FIELD_VERSION_LO, bversion.lo) - -struct bkey_format_state { - u64 field_min[BKEY_NR_FIELDS]; - u64 field_max[BKEY_NR_FIELDS]; -}; - -void bch2_bkey_format_init(struct bkey_format_state *); - -static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v) -{ - s->field_min[field] = min(s->field_min[field], v); - s->field_max[field] = max(s->field_max[field], v); -} - -/* - * Changes @format so that @k can be successfully packed with @format - */ -static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) -{ -#define x(id, field) __bkey_format_add(s, id, k->field); - bkey_fields() -#undef x -} - -void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); -struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); - -static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i) -{ - unsigned f_bits = f->bits_per_field[i]; - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (f_bits > unpacked_bits) - return true; - - if ((f_bits == unpacked_bits) && field_offset) - return true; - - u64 f_mask = f_bits - ? ~((~0ULL << (f_bits - 1)) << 1) - : 0; - - if (((field_offset + f_mask) & unpacked_mask) < field_offset) - return true; - return false; -} - -int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, - enum bch_validate_flags, struct printbuf *); -void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); - -#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h deleted file mode 100644 index a30c4ae8eb369d..00000000000000 --- a/fs/bcachefs/bkey_buf.h +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_BUF_H -#define _BCACHEFS_BKEY_BUF_H - -#include "bcachefs.h" -#include "bkey.h" - -struct bkey_buf { - struct bkey_i *k; - u64 onstack[12]; -}; - -static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, - struct bch_fs *c, unsigned u64s) -{ - if (s->k == (void *) s->onstack && - u64s > ARRAY_SIZE(s->onstack)) { - s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); - memcpy(s->k, s->onstack, sizeof(s->onstack)); - } -} - -static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, - struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_buf_realloc(s, c, k.k->u64s); - bkey_reassemble(s->k, k); -} - -static inline void bch2_bkey_buf_copy(struct bkey_buf *s, - struct bch_fs *c, - struct bkey_i *src) -{ - bch2_bkey_buf_realloc(s, c, src->k.u64s); - bkey_copy(s->k, src); -} - -static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, - struct bch_fs *c, - struct btree *b, - struct bkey_packed *src) -{ - bch2_bkey_buf_realloc(s, c, BKEY_U64s + - bkeyp_val_u64s(&b->format, src)); - bch2_bkey_unpack(b, s->k, src); -} - -static inline void bch2_bkey_buf_init(struct bkey_buf *s) -{ - s->k = (void *) s->onstack; -} - -static inline void 
bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) -{ - if (s->k != (void *) s->onstack) - mempool_free(s->k, &c->large_bkey_pool); - s->k = NULL; -} - -#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h deleted file mode 100644 index 5f42a6e693606b..00000000000000 --- a/fs/bcachefs/bkey_cmp.h +++ /dev/null @@ -1,129 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_CMP_H -#define _BCACHEFS_BKEY_CMP_H - -#include "bkey.h" - -#ifdef CONFIG_X86_64 -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - long d0, d1, d2, d3; - int cmp; - - /* we shouldn't need asm for this, but gcc is being retarded: */ - - asm(".intel_syntax noprefix;" - "xor eax, eax;" - "xor edx, edx;" - "1:;" - "mov r8, [rdi];" - "mov r9, [rsi];" - "sub ecx, 64;" - "jl 2f;" - - "cmp r8, r9;" - "jnz 3f;" - - "lea rdi, [rdi - 8];" - "lea rsi, [rsi - 8];" - "jmp 1b;" - - "2:;" - "not ecx;" - "shr r8, 1;" - "shr r9, 1;" - "shr r8, cl;" - "shr r9, cl;" - "cmp r8, r9;" - - "3:\n" - "seta al;" - "setb dl;" - "sub eax, edx;" - ".att_syntax prefix;" - : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) - : "0" (l), "1" (r), "3" (nr_key_bits) - : "r8", "r9", "cc", "memory"); - - return cmp; -} -#else -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - u64 l_v, r_v; - - if (!nr_key_bits) - return 0; - - /* for big endian, skip past header */ - nr_key_bits += high_bit_offset; - l_v = *l & (~0ULL >> high_bit_offset); - r_v = *r & (~0ULL >> high_bit_offset); - - while (1) { - if (nr_key_bits < 64) { - l_v >>= 64 - nr_key_bits; - r_v >>= 64 - nr_key_bits; - nr_key_bits = 0; - } else { - nr_key_bits -= 64; - } - - if (!nr_key_bits || l_v != r_v) - break; - - l = next_word(l); - r = next_word(r); - - l_v = *l; - r_v = *r; - } - - return cmp_int(l_v, r_v); -} -#endif - -static inline __pure __flatten -int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) -{ - const struct bkey_format *f = &b->format; - int ret; - - EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); - - ret = __bkey_cmp_bits(high_word(f, l), - high_word(f, r), - b->nr_key_bits); - - EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), - bkey_unpack_pos(b, r))); - return ret; -} - -static inline __pure __flatten -int bch2_bkey_cmp_packed_inlined(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - struct bkey unpacked; - - if (likely(bkey_packed(l) && bkey_packed(r))) - return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); - - if (bkey_packed(l)) { - __bkey_unpack_key_format_checked(b, &unpacked, l); - l = (void *) &unpacked; - } else if (bkey_packed(r)) { - __bkey_unpack_key_format_checked(b, &unpacked, r); - r = (void *) &unpacked; - } - - return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); -} - -#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c deleted file mode 100644 index fcd8c82cba4f6f..00000000000000 --- a/fs/bcachefs/bkey_methods.c +++ /dev/null @@ -1,497 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "backpointers.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_types.h" -#include "alloc_background.h" -#include "dirent.h" -#include "disk_accounting.h" -#include "ec.h" -#include "error.h" -#include "extents.h" -#include 
"inode.h" -#include "io_misc.h" -#include "lru.h" -#include "quota.h" -#include "reflink.h" -#include "snapshot.h" -#include "subvolume.h" -#include "xattr.h" - -const char * const bch2_bkey_types[] = { -#define x(name, nr, ...) #name, - BCH_BKEY_TYPES() -#undef x - NULL -}; - -static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -#define bch2_bkey_ops_deleted ((struct bkey_ops) { \ - .key_validate = deleted_key_validate, \ -}) - -#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ - .key_validate = deleted_key_validate, \ -}) - -static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_val_bytes(k.k), - c, bkey_val_size_nonzero, - "incorrect value size (%zu != 0)", - bkey_val_bytes(k.k)); -fsck_err: - return ret; -} - -#define bch2_bkey_ops_error ((struct bkey_ops) { \ - .key_validate = empty_val_key_validate, \ -}) - -static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k); - - prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie)); -} - -#define bch2_bkey_ops_cookie ((struct bkey_ops) { \ - .key_validate = key_type_cookie_validate, \ - .val_to_text = key_type_cookie_to_text, \ - .min_val_size = 8, \ -}) - -#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ - .key_validate = empty_val_key_validate, \ -}) - -static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); - unsigned datalen = bkey_inline_data_bytes(k.k); - - prt_printf(out, "datalen %u: %*phN", - datalen, min(datalen, 32U), d.v->data); -} - -#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ - .key_validate = key_type_inline_data_validate, \ - .val_to_text = key_type_inline_data_to_text, \ -}) - -static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -{ - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; -} - -#define bch2_bkey_ops_set ((struct bkey_ops) { \ - .key_validate = empty_val_key_validate, \ - .key_merge = key_type_set_merge, \ -}) - -const struct bkey_ops bch2_bkey_ops[] = { -#define x(name, nr, ...) 
[KEY_TYPE_##name] = bch2_bkey_ops_##name, - BCH_BKEY_TYPES() -#undef x -}; - -const struct bkey_ops bch2_bkey_null_ops = { -}; - -int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) - return 0; - - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - int ret = 0; - - bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, - c, bkey_val_size_too_small, - "bad val size (%zu < %u)", - bkey_val_bytes(k.k), ops->min_val_size); - - if (!ops->key_validate) - return 0; - - ret = ops->key_validate(c, k, from); -fsck_err: - return ret; -} - -static u64 bch2_key_types_allowed[] = { - [BKEY_TYPE_btree] = - BIT_ULL(KEY_TYPE_deleted)| - BIT_ULL(KEY_TYPE_btree_ptr)| - BIT_ULL(KEY_TYPE_btree_ptr_v2), -#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, - BCH_BTREE_IDS() -#undef x -}; - -static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = { -#define x(name, nr, flags) [KEY_TYPE_##name] = flags, - BCH_BKEY_TYPES() -#undef x -}; - -const char *bch2_btree_node_type_str(enum btree_node_type type) -{ - return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); -} - -int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - enum btree_node_type type = __btree_node_type(from.level, from.btree); - - if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) - return 0; - - int ret = 0; - - bkey_fsck_err_on(k.k->u64s < BKEY_U64s, - c, bkey_u64s_too_small, - "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); - - if (type >= BKEY_TYPE_NR) - return 0; - - enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX - ? bch2_bkey_type_flags[k.k->type] - : 0; - - bool strict_key_type_allowed = - (from.flags & BCH_VALIDATE_commit) || - type == BKEY_TYPE_btree || - (from.btree < BTREE_ID_NR && - (bkey_flags & BKEY_TYPE_strict_btree_checks)); - - bkey_fsck_err_on(strict_key_type_allowed && - k.k->type < KEY_TYPE_MAX && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), - c, bkey_invalid_type_for_btree, - "invalid key type for btree %s (%s)", - bch2_btree_node_type_str(type), - k.k->type < KEY_TYPE_MAX - ? 
bch2_bkey_types[k.k->type] - : "(unknown)"); - - if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - bkey_fsck_err_on(k.k->size == 0, - c, bkey_extent_size_zero, - "size == 0"); - - bkey_fsck_err_on(k.k->size > k.k->p.offset, - c, bkey_extent_size_greater_than_offset, - "size greater than offset (%u > %llu)", - k.k->size, k.k->p.offset); - } else { - bkey_fsck_err_on(k.k->size, - c, bkey_size_nonzero, - "size != 0"); - } - - if (type != BKEY_TYPE_btree) { - enum btree_id btree = type - 1; - - if (btree_type_has_snapshots(btree)) { - bkey_fsck_err_on(!k.k->p.snapshot, - c, bkey_snapshot_zero, - "snapshot == 0"); - } else if (!btree_type_has_snapshot_field(btree)) { - bkey_fsck_err_on(k.k->p.snapshot, - c, bkey_snapshot_nonzero, - "nonzero snapshot"); - } else { - /* - * btree uses snapshot field but it's not required to be - * nonzero - */ - } - - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), - c, bkey_at_pos_max, - "key at POS_MAX"); - } -fsck_err: - return ret; -} - -int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return __bch2_bkey_validate(c, k, from) ?: - bch2_bkey_val_validate(c, k, from); -} - -int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), - c, bkey_before_start_of_btree_node, - "key before start of btree node"); - - bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), - c, bkey_after_end_of_btree_node, - "key past end of btree node"); -fsck_err: - return ret; -} - -void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) -{ - if (bpos_eq(pos, POS_MIN)) - prt_printf(out, "POS_MIN"); - else if (bpos_eq(pos, POS_MAX)) - prt_printf(out, "POS_MAX"); - else if (bpos_eq(pos, SPOS_MAX)) - prt_printf(out, "SPOS_MAX"); - else { - if (pos.inode == U64_MAX) - prt_printf(out, "U64_MAX"); - else - prt_printf(out, "%llu", pos.inode); - prt_printf(out, ":"); - if (pos.offset == U64_MAX) - prt_printf(out, "U64_MAX"); - else - prt_printf(out, "%llu", pos.offset); - prt_printf(out, ":"); - if (pos.snapshot == U32_MAX) - prt_printf(out, "U32_MAX"); - else - prt_printf(out, "%u", pos.snapshot); - } -} - -void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) -{ - if (k) { - prt_printf(out, "u64s %u type ", k->u64s); - - if (k->type < KEY_TYPE_MAX) - prt_printf(out, "%s ", bch2_bkey_types[k->type]); - else - prt_printf(out, "%u ", k->type); - - bch2_bpos_to_text(out, k->p); - - prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo); - } else { - prt_printf(out, "(null)"); - } -} - -void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - - if (likely(ops->val_to_text)) - ops->val_to_text(out, c, k); -} - -void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_to_text(out, k.k); - - if (bkey_val_bytes(k.k)) { - prt_printf(out, ": "); - bch2_val_to_text(out, c, k); - } -} - -void bch2_bkey_swab_val(struct bkey_s k) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - - if (ops->swab) - ops->swab(k); -} - -bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - - return ops->key_normalize - ? 
ops->key_normalize(c, k) - : false; -} - -bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type); - - return ops->key_merge && - bch2_bkey_maybe_mergable(l.k, r.k) && - (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && - !static_branch_unlikely(&bch2_key_merging_disabled) && - ops->key_merge(c, l, r); -} - -static const struct old_bkey_type { - u8 btree_node_type; - u8 old; - u8 new; -} bkey_renumber_table[] = { - {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, - {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, - {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, - {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, - {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, - {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, - {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, - {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, - {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, - {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, - {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, - {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, -}; - -void bch2_bkey_renumber(enum btree_node_type btree_node_type, - struct bkey_packed *k, - int write) -{ - const struct old_bkey_type *i; - - for (i = bkey_renumber_table; - i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); - i++) - if (btree_node_type == i->btree_node_type && - k->type == (write ? i->new : i->old)) { - k->type = write ? i->old : i->new; - break; - } -} - -void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, - struct bkey_format *f, - struct bkey_packed *k) -{ - const struct bkey_ops *ops; - struct bkey uk; - unsigned nr_compat = 5; - int i; - - /* - * Do these operations in reverse order in the write path: - */ - - for (i = 0; i < nr_compat; i++) - switch (!write ? i : nr_compat - 1 - i) { - case 0: - if (big_endian != CPU_BIG_ENDIAN) { - bch2_bkey_swab_key(f, k); - } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - bch2_bkey_swab_key(f, k); - bch2_bkey_swab_key(f, k); - } - break; - case 1: - if (version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); - break; - case 2: - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_inodes) { - if (!bkey_packed(k)) { - struct bkey_i *u = packed_to_bkey(k); - - swap(u->k.p.inode, u->k.p.offset); - } else if (f->bits_per_field[BKEY_FIELD_INODE] && - f->bits_per_field[BKEY_FIELD_OFFSET]) { - struct bkey_format tmp = *f, *in = f, *out = &tmp; - - swap(tmp.bits_per_field[BKEY_FIELD_INODE], - tmp.bits_per_field[BKEY_FIELD_OFFSET]); - swap(tmp.field_offset[BKEY_FIELD_INODE], - tmp.field_offset[BKEY_FIELD_OFFSET]); - - if (!write) - swap(in, out); - - uk = __bch2_bkey_unpack_key(in, k); - swap(uk.p.inode, uk.p.offset); - BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); - } - } - break; - case 3: - if (version < bcachefs_metadata_version_snapshot && - (level || btree_type_has_snapshots(btree_id))) { - struct bkey_i *u = packed_to_bkey(k); - - if (u) { - u->k.p.snapshot = write - ? 0 : U32_MAX; - } else { - u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]); - u64 max_packed = min_packed + - ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); - - uk = __bch2_bkey_unpack_key(f, k); - uk.p.snapshot = write - ? 
min_packed : min_t(u64, U32_MAX, max_packed); - - BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); - } - } - - break; - case 4: { - struct bkey_s u; - - if (!bkey_packed(k)) { - u = bkey_i_to_s(packed_to_bkey(k)); - } else { - uk = __bch2_bkey_unpack_key(f, k); - u.k = &uk; - u.v = bkeyp_val(f, k); - } - - if (big_endian != CPU_BIG_ENDIAN) - bch2_bkey_swab_val(u); - - ops = bch2_bkey_type_ops(k->type); - - if (ops->compat) - ops->compat(btree_id, version, big_endian, write, u); - break; - } - default: - BUG(); - } -} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h deleted file mode 100644 index bf34111cdf008d..00000000000000 --- a/fs/bcachefs/bkey_methods.h +++ /dev/null @@ -1,139 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_METHODS_H -#define _BCACHEFS_BKEY_METHODS_H - -#include "bkey.h" - -struct bch_fs; -struct btree; -struct btree_trans; -struct bkey; -enum btree_node_type; - -extern const char * const bch2_bkey_types[]; -extern const struct bkey_ops bch2_bkey_null_ops; - -/* - * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If - * invalid, entire key will be deleted. - * - * When invalid, error string is returned via @err. @rw indicates whether key is - * being read or written; more aggressive checks can be enabled when rw == WRITE. - */ -struct bkey_ops { - int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from); - void (*val_to_text)(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - void (*swab)(struct bkey_s); - bool (*key_normalize)(struct bch_fs *, struct bkey_s); - bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); - int (*trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - void (*compat)(enum btree_id id, unsigned version, - unsigned big_endian, int write, - struct bkey_s); - - /* Size of value type when first created: */ - unsigned min_val_size; -}; - -extern const struct bkey_ops bch2_bkey_ops[]; - -static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) -{ - return likely(type < KEY_TYPE_MAX) - ? 
&bch2_bkey_ops[type] - : &bch2_bkey_null_ops; -} - -int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, - struct bkey_validate_context from); - -void bch2_bpos_to_text(struct printbuf *, struct bpos); -void bch2_bkey_to_text(struct printbuf *, const struct bkey *); -void bch2_val_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - -void bch2_bkey_swab_val(struct bkey_s); - -bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); - -static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) -{ - return l->type == r->type && - !bversion_cmp(l->bversion, r->bversion) && - bpos_eq(l->p, bkey_start_pos(r)); -} - -bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - -static inline int bch2_key_trigger(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); - - return ops->trigger - ? ops->trigger(trans, btree, level, old, new, flags) - : 0; -} - -static inline int bch2_key_trigger_old(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i deleted; - - bkey_init(&deleted.k); - deleted.k.p = old.k->p; - - return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted), - BTREE_TRIGGER_overwrite|flags); -} - -static inline int bch2_key_trigger_new(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i deleted; - - bkey_init(&deleted.k); - deleted.k.p = new.k->p; - - return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_insert|flags); -} - -void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); - -void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, - int, struct bkey_format *, struct bkey_packed *); - -static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, - struct bkey_format *f, - struct bkey_packed *k) -{ - if (version < bcachefs_metadata_version_current || - big_endian != CPU_BIG_ENDIAN || - IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - __bch2_bkey_compat(level, btree_id, version, - big_endian, write, f, k); - -} - -#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c deleted file mode 100644 index 4536eb50fc4064..00000000000000 --- a/fs/bcachefs/bkey_sort.c +++ /dev/null @@ -1,214 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bkey_buf.h" -#include "bkey_cmp.h" -#include "bkey_sort.h" -#include "bset.h" -#include "extents.h" - -typedef int (*sort_cmp_fn)(const struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); - -static inline bool sort_iter_end(struct sort_iter *iter) -{ - return !iter->used; -} - -static inline void sort_iter_sift(struct sort_iter *iter, unsigned from, - sort_cmp_fn cmp) -{ - unsigned i; - - for (i = 
from; - i + 1 < iter->used && - cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; - i++) - swap(iter->data[i], iter->data[i + 1]); -} - -static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -{ - unsigned i = iter->used; - - while (i--) - sort_iter_sift(iter, i, cmp); -} - -static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -{ - return !sort_iter_end(iter) ? iter->data->k : NULL; -} - -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -{ - struct sort_iter_set *i = iter->data; - - BUG_ON(!iter->used); - - i->k = bkey_p_next(i->k); - - BUG_ON(i->k > i->end); - - if (i->k == i->end) - array_remove_item(iter->data, iter->used, 0); - else - sort_iter_sift(iter, 0, cmp); -} - -static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, - sort_cmp_fn cmp) -{ - struct bkey_packed *ret = sort_iter_peek(iter); - - if (ret) - sort_iter_advance(iter, cmp); - - return ret; -} - -/* - * If keys compare equal, compare by pointer order: - */ -static inline int key_sort_fix_overlapping_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed(b, l, r) ?: - cmp_int((unsigned long) l, (unsigned long) r); -} - -static inline bool should_drop_next_key(struct sort_iter *iter) -{ - /* - * key_sort_cmp() ensures that when keys compare equal the older key - * comes first; so if l->k compares equal to r->k then l->k is older - * and should be dropped. - */ - return iter->used >= 2 && - !bch2_bkey_cmp_packed(iter->b, - iter->data[0].k, - iter->data[1].k); -} - -struct btree_nr_keys -bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, - struct sort_iter *iter) -{ - struct bkey_packed *out = dst->start; - struct bkey_packed *k; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - sort_iter_sort(iter, key_sort_fix_overlapping_cmp); - - while ((k = sort_iter_peek(iter))) { - if (!bkey_deleted(k) && - !should_drop_next_key(iter)) { - bkey_p_copy(out, k); - btree_keys_account_key_add(&nr, 0, out); - out = bkey_p_next(out); - } - - sort_iter_advance(iter, key_sort_fix_overlapping_cmp); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -/* Sort + repack in a new format: */ -struct btree_nr_keys -bch2_sort_repack(struct bset *dst, struct btree *src, - struct btree_node_iter *src_iter, - struct bkey_format *out_f, - bool filter_whiteouts) -{ - struct bkey_format *in_f = &src->format; - struct bkey_packed *in, *out = vstruct_last(dst); - struct btree_nr_keys nr; - bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); - - memset(&nr, 0, sizeof(nr)); - - while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { - if (filter_whiteouts && bkey_deleted(in)) - continue; - - if (!transform) - bkey_p_copy(out, in); - else if (bch2_bkey_transform(out_f, out, bkey_packed(in) - ? 
in_f : &bch2_bkey_format_current, in)) - out->format = KEY_FORMAT_LOCAL_BTREE; - else - bch2_bkey_unpack(src, (void *) out, in); - - out->needs_whiteout = false; - - btree_keys_account_key_add(&nr, 0, out); - out = bkey_p_next(out); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -static inline int keep_unwritten_whiteouts_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed_inlined(b, l, r) ?: - (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: - (long) l - (long) r; -} - -#include "btree_update_interior.h" - -/* - * For sorting in the btree node write path: whiteouts not in the unwritten - * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are - * dropped if overwritten by real keys: - */ -unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter) -{ - struct bkey_packed *in, *next, *out = dst; - - sort_iter_sort(iter, keep_unwritten_whiteouts_cmp); - - while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) { - if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b)) - continue; - - if ((next = sort_iter_peek(iter)) && - !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) - continue; - - bkey_p_copy(out, in); - out = bkey_p_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -/* - * Main sort routine for compacting a btree node in memory: we always drop - * whiteouts because any whiteouts that need to be written are in the unwritten - * whiteouts area: - */ -unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined); - - while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) { - if (bkey_deleted(in)) - continue; - - bkey_p_copy(out, in); - out = bkey_p_next(out); - } - - return (u64 *) out - (u64 *) dst; -} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h deleted file mode 100644 index 9be969d4689066..00000000000000 --- a/fs/bcachefs/bkey_sort.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_SORT_H -#define _BCACHEFS_BKEY_SORT_H - -struct sort_iter { - struct btree *b; - unsigned used; - unsigned size; - - struct sort_iter_set { - struct bkey_packed *k, *end; - } data[]; -}; - -static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size) -{ - iter->b = b; - iter->used = 0; - iter->size = size; -} - -struct sort_iter_stack { - struct sort_iter iter; - struct sort_iter_set sets[MAX_BSETS + 1]; -}; - -static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b) -{ - sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets)); -} - -static inline void sort_iter_add(struct sort_iter *iter, - struct bkey_packed *k, - struct bkey_packed *end) -{ - BUG_ON(iter->used >= iter->size); - - if (k != end) - iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -} - -struct btree_nr_keys -bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, - struct sort_iter *); - -struct btree_nr_keys -bch2_sort_repack(struct bset *, struct btree *, - struct btree_node_iter *, - struct bkey_format *, bool); - -unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *); -unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *); - -#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h deleted file mode 100644 
index b4f328f9853c60..00000000000000 --- a/fs/bcachefs/bkey_types.h +++ /dev/null @@ -1,241 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_TYPES_H -#define _BCACHEFS_BKEY_TYPES_H - -#include "bcachefs_format.h" - -/* - * bkey_i - bkey with inline value - * bkey_s - bkey with split value - * bkey_s_c - bkey with split value, const - */ - -#define bkey_p_next(_k) vstruct_next(_k) - -static inline struct bkey_i *bkey_next(struct bkey_i *k) -{ - return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); -} - -#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) - -static inline size_t bkey_val_bytes(const struct bkey *k) -{ - return bkey_val_u64s(k) * sizeof(u64); -} - -static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -{ - unsigned u64s = BKEY_U64s + val_u64s; - - BUG_ON(u64s > U8_MAX); - k->u64s = u64s; -} - -static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -{ - set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); -} - -#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) - -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) - -#define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) - -/* bkey with split value, const */ -struct bkey_s_c { - const struct bkey *k; - const struct bch_val *v; -}; - -/* bkey with split value */ -struct bkey_s { - union { - struct { - struct bkey *k; - struct bch_val *v; - }; - struct bkey_s_c s_c; - }; -}; - -#define bkey_s_null ((struct bkey_s) { .k = NULL }) -#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) - -#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) - -static inline struct bkey_s bkey_to_s(struct bkey *k) -{ - return (struct bkey_s) { .k = k, .v = NULL }; -} - -static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -{ - return (struct bkey_s_c) { .k = k, .v = NULL }; -} - -static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -{ - return (struct bkey_s) { .k = &k->k, .v = &k->v }; -} - -static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -{ - return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -} - -/* - * For a given type of value (e.g. struct bch_extent), generates the types for - * bkey + bch_extent - inline, split, split const - and also all the conversion - * functions, which also check that the value is of the correct type. - * - * We use anonymous unions for upcasting - e.g. converting from e.g. a - * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion - * functions. - */ -#define x(name, ...) 
\ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -}; \ - \ -struct bkey_s_c_##name { \ - union { \ - struct { \ - const struct bkey *k; \ - const struct bch_##name *v; \ - }; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -struct bkey_s_##name { \ - union { \ - struct { \ - struct bkey *k; \ - struct bch_##name *v; \ - }; \ - struct bkey_s_c_##name c; \ - struct bkey_s s; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline const struct bkey_i_##name * \ -bkey_i_to_##name##_c(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -{ \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -name##_i_to_s_c(const struct bkey_i_##name *k) \ -{ \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -bkey_i_to_s_c_##name(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -{ \ - struct bkey_i_##name *k = \ - container_of(&_k->k, struct bkey_i_##name, k); \ - \ - bkey_init(&k->k); \ - memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = KEY_TYPE_##name; \ - set_bkey_val_bytes(&k->k, sizeof(k->v)); \ - \ - return k; \ -} - -BCH_BKEY_TYPES(); -#undef x - -enum bch_validate_flags { - BCH_VALIDATE_write = BIT(0), - BCH_VALIDATE_commit = BIT(1), - BCH_VALIDATE_silent = BIT(2), -}; - -#define BKEY_VALIDATE_CONTEXTS() \ - x(unknown) \ - x(superblock) \ - x(journal) \ - x(btree_root) \ - x(btree_node) \ - x(commit) - -struct bkey_validate_context { - enum { -#define x(n) BKEY_VALIDATE_##n, - BKEY_VALIDATE_CONTEXTS() -#undef x - } from:8; - enum bch_validate_flags flags:8; - u8 level; - enum btree_id btree; - bool root:1; - unsigned journal_offset; - u64 journal_seq; -}; - -#endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c deleted file mode 100644 index 32841f762eb2e0..00000000000000 --- a/fs/bcachefs/bset.c +++ /dev/null @@ -1,1576 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Code for working with individual keys, and sorted sets of keys with in a - * btree 
node
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "bset.h"
-#include "eytzinger.h"
-#include "trace.h"
-#include "util.h"
-
-#include <linux/unaligned.h>
-#include <linux/console.h>
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
-						  struct btree *);
-
-static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
-{
-	unsigned n = ARRAY_SIZE(iter->data);
-
-	while (n && __btree_node_iter_set_end(iter, n - 1))
-		--n;
-
-	return n;
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
-{
-	return bch2_bkey_to_bset_inlined(b, k);
-}
-
-/*
- * There are never duplicate live keys in the btree - but including keys that
- * have been flagged as deleted (and will be cleaned up later) we _will_ see
- * duplicates.
- *
- * Thus the sort order is: usual key comparison first, but for keys that compare
- * equal the deleted key(s) come first, and the (at most one) live version comes
- * last.
- *
- * The main reason for this is insertion: to handle overwrites, we first iterate
- * over keys that compare equal to our insert key, and then insert immediately
- * prior to the first key greater than the key we're inserting - our insert
- * position will be after all keys that compare equal to our insert key, which
- * by the time we actually do the insert will all be deleted.
- */
-
-void bch2_dump_bset(struct bch_fs *c, struct btree *b,
-		    struct bset *i, unsigned set)
-{
-	struct bkey_packed *_k, *_n;
-	struct bkey uk, n;
-	struct bkey_s_c k;
-	struct printbuf buf = PRINTBUF;
-
-	if (!i->u64s)
-		return;
-
-	for (_k = i->start;
-	     _k < vstruct_last(i);
-	     _k = _n) {
-		_n = bkey_p_next(_k);
-
-		if (!_k->u64s) {
-			printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
-			       _k->_data - i->_data);
-			break;
-		}
-
-		k = bkey_disassemble(b, _k, &uk);
-
-		printbuf_reset(&buf);
-		if (c)
-			bch2_bkey_val_to_text(&buf, c, k);
-		else
-			bch2_bkey_to_text(&buf, k.k);
-		printk(KERN_ERR "block %u key %5zu: %s\n", set,
-		       _k->_data - i->_data, buf.buf);
-
-		if (_n == vstruct_last(i))
-			continue;
-
-		n = bkey_unpack_key(b, _n);
-
-		if (bpos_lt(n.p, k.k->p)) {
-			printk(KERN_ERR "Key skipped backwards\n");
-			continue;
-		}
-
-		if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
-			printk(KERN_ERR "Duplicate keys\n");
-	}
-
-	printbuf_exit(&buf);
-}
-
-void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
-{
-	console_lock();
-	for_each_bset(b, t)
-		bch2_dump_bset(c, b, bset(b, t), t - b->set);
-	console_unlock();
-}
-
-void bch2_dump_btree_node_iter(struct btree *b,
-			       struct btree_node_iter *iter)
-{
-	struct btree_node_iter_set *set;
-	struct printbuf buf = PRINTBUF;
-
-	printk(KERN_ERR "btree node iter with %u/%u sets:\n",
-	       __btree_node_iter_used(iter), b->nsets);
-
-	btree_node_iter_for_each(iter, set) {
-		struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
-		struct bset_tree *t = bch2_bkey_to_bset(b, k);
-		struct bkey uk = bkey_unpack_key(b, k);
-
-		printbuf_reset(&buf);
-		bch2_bkey_to_text(&buf, &uk);
-		printk(KERN_ERR "set %zu key %u: %s\n",
-		       t - b->set, set->k, buf.buf);
-	}
-
-	printbuf_exit(&buf);
-}
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
-{
-	struct bkey_packed *k;
-	struct btree_nr_keys nr = {};
-
-	for_each_bset(b, t)
-		bset_tree_for_each_key(b, t, k)
-			if (!bkey_deleted(k))
-				btree_keys_account_key_add(&nr, t - b->set, k);
-	return nr;
-}
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
-{
-	struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
-
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); -} - -static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, - struct btree *b) -{ - struct btree_node_iter iter = *_iter; - const struct bkey_packed *k, *n; - - k = bch2_btree_node_iter_peek_all(&iter, b); - __bch2_btree_node_iter_advance(&iter, b); - n = bch2_btree_node_iter_peek_all(&iter, b); - - bkey_unpack_key(b, k); - - if (n && - bkey_iter_cmp(b, k, n) > 0) { - struct btree_node_iter_set *set; - struct bkey ku = bkey_unpack_key(b, k); - struct bkey nu = bkey_unpack_key(b, n); - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&buf1, &ku); - bch2_bkey_to_text(&buf2, &nu); - printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", - buf1.buf, buf2.buf); - printk(KERN_ERR "iter was:"); - - btree_node_iter_for_each(_iter, set) { - struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k); - struct bset_tree *t = bch2_bkey_to_bset(b, k2); - printk(" [%zi %zi]", t - b->set, - k2->_data - bset(b, t)->_data); - } - panic("\n"); - } -} - -void __bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) -{ - struct btree_node_iter_set *set, *s2; - struct bkey_packed *k, *p; - - if (bch2_btree_node_iter_end(iter)) - return; - - /* Verify no duplicates: */ - btree_node_iter_for_each(iter, set) { - BUG_ON(set->k > set->end); - btree_node_iter_for_each(iter, s2) - BUG_ON(set != s2 && set->end == s2->end); - } - - /* Verify that set->end is correct: */ - btree_node_iter_for_each(iter, set) { - for_each_bset(b, t) - if (set->end == t->end_offset) { - BUG_ON(set->k < btree_bkey_first_offset(t) || - set->k >= t->end_offset); - goto found; - } - BUG(); -found: - do {} while (0); - } - - /* Verify iterator is sorted: */ - btree_node_iter_for_each(iter, set) - BUG_ON(set != iter->data && - btree_node_iter_cmp(b, set[-1], set[0]) > 0); - - k = bch2_btree_node_iter_peek_all(iter, b); - - for_each_bset(b, t) { - if (iter->data[0].end == t->end_offset) - continue; - - p = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t)); - - BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); - } -} - -static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, - struct bkey_packed *insert, unsigned clobber_u64s) -{ - struct bset_tree *t = bch2_bkey_to_bset(b, where); - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); - struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s); - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; -#if 0 - BUG_ON(prev && - bkey_iter_cmp(b, prev, insert) > 0); -#else - if (prev && - bkey_iter_cmp(b, prev, insert) > 0) { - struct bkey k1 = bkey_unpack_key(b, prev); - struct bkey k2 = bkey_unpack_key(b, insert); - - bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&buf1, &k1); - bch2_bkey_to_text(&buf2, &k2); - - panic("prev > insert:\n" - "prev key %s\n" - "insert key %s\n", - buf1.buf, buf2.buf); - } -#endif -#if 0 - BUG_ON(next != btree_bkey_last(b, t) && - bkey_iter_cmp(b, insert, next) > 0); -#else - if (next != btree_bkey_last(b, t) && - bkey_iter_cmp(b, insert, next) > 0) { - struct bkey k1 = bkey_unpack_key(b, insert); - struct bkey k2 = bkey_unpack_key(b, next); - - bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&buf1, &k1); - bch2_bkey_to_text(&buf2, &k2); - - panic("insert > next:\n" - "insert key %s\n" - "next key %s\n", - buf1.buf, buf2.buf); - } -#endif -} - -static inline void bch2_verify_insert_pos(struct btree *b, - struct bkey_packed 
*where, - struct bkey_packed *insert, - unsigned clobber_u64s) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bch2_verify_insert_pos(b, where, insert, clobber_u64s); -} - - -/* Auxiliary search trees */ - -#define BFLOAT_FAILED_UNPACKED U8_MAX -#define BFLOAT_FAILED U8_MAX - -struct bkey_float { - u8 exponent; - u8 key_offset; - u16 mantissa; -}; -#define BKEY_MANTISSA_BITS 16 - -struct ro_aux_tree { - u8 nothing[0]; - struct bkey_float f[]; -}; - -struct rw_aux_tree { - u16 offset; - struct bpos k; -}; - -static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) -{ - BUG_ON(t->aux_data_offset == U16_MAX); - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - return t->aux_data_offset; - case BSET_RO_AUX_TREE: - return t->aux_data_offset + - DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8); - case BSET_RW_AUX_TREE: - return t->aux_data_offset + - DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); - default: - BUG(); - } -} - -static unsigned bset_aux_tree_buf_start(const struct btree *b, - const struct bset_tree *t) -{ - return t == b->set - ? DIV_ROUND_UP(b->unpack_fn_len, 8) - : bset_aux_tree_buf_end(t - 1); -} - -static void *__aux_tree_base(const struct btree *b, - const struct bset_tree *t) -{ - return b->aux_data + t->aux_data_offset * 8; -} - -static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - return __aux_tree_base(b, t); -} - -static struct bkey_float *bkey_float(const struct btree *b, - const struct bset_tree *t, - unsigned idx) -{ - return ro_aux_tree_base(b, t)->f + idx; -} - -static void __bset_aux_tree_verify(struct btree *b) -{ - for_each_bset(b, t) { - if (t->aux_data_offset == U16_MAX) - continue; - - BUG_ON(t != b->set && - t[-1].aux_data_offset == U16_MAX); - - BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); - BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); - BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); - } -} - -static inline void bset_aux_tree_verify(struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bset_aux_tree_verify(b); -} - -void bch2_btree_keys_init(struct btree *b) -{ - unsigned i; - - b->nsets = 0; - memset(&b->nr, 0, sizeof(b->nr)); - - for (i = 0; i < MAX_BSETS; i++) - b->set[i].data_offset = U16_MAX; - - bch2_bset_set_no_aux_tree(b, b->set); -} - -/* Binary tree stuff for auxiliary search trees */ - -/* - * Cacheline/offset <-> bkey pointer arithmetic: - * - * t->tree is a binary search tree in an array; each node corresponds to a key - * in one cacheline in t->set (BSET_CACHELINE bytes). - * - * This means we don't have to store the full index of the key that a node in - * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and - * then bkey_float->m gives us the offset within that cacheline, in units of 8 - * bytes. - * - * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to - * make this work. - * - * To construct the bfloat for an arbitrary key we need to know what the key - * immediately preceding it is: we have to check if the two keys differ in the - * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size - * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
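
As an aside to make that mapping concrete: for the simple case of a complete tree, the eytzinger-to-inorder step reduces to a couple of shifts. The snippet below is a hypothetical, self-contained toy, not kernel code; the kernel's __eytzinger1_to_inorder() additionally handles incomplete trees via the precomputed t->extra fixup.

#include <stdio.h>

/*
 * Toy model of the mapping described above, for complete trees only:
 * depth d has n = 2^d - 1 nodes, 1-indexed, with the children of node
 * i at 2i and 2i + 1. Returns the 1-based in-order position of node i.
 */
static unsigned toy_eytzinger1_to_inorder(unsigned i, unsigned depth)
{
	unsigned shift = depth - 1 - (31 - __builtin_clz(i));

	return (((i << 1) | 1) << shift) & ((1U << depth) - 1);
}

int main(void)
{
	/* For depth 3 (7 nodes) this prints: 4 2 5 1 6 3 7 */
	for (unsigned i = 1; i <= 7; i++)
		printf("%u ", toy_eytzinger1_to_inorder(i, 3));
	printf("\n");
	return 0;
}

The in-order position selects the cacheline, and bkey_float->key_offset then locates the key within it in units of 8 bytes, which is why only those low bits of the pointer need to be stored per node.
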
- */ - -static inline void *bset_cacheline(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline) -{ - return (void *) round_down((unsigned long) btree_bkey_first(b, t), - L1_CACHE_BYTES) + - cacheline * BSET_CACHELINE; -} - -static struct bkey_packed *cacheline_to_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - unsigned offset) -{ - return bset_cacheline(b, t, cacheline) + offset * 8; -} - -static unsigned bkey_to_cacheline(const struct btree *b, - const struct bset_tree *t, - const struct bkey_packed *k) -{ - return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; -} - -static ssize_t __bkey_to_cacheline_offset(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - const struct bkey_packed *k) -{ - return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); -} - -static unsigned bkey_to_cacheline_offset(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - const struct bkey_packed *k) -{ - size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); - - EBUG_ON(m > U8_MAX); - return m; -} - -static inline struct bkey_packed *tree_to_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned j) -{ - return cacheline_to_bkey(b, t, - __eytzinger1_to_inorder(j, t->size - 1, t->extra), - bkey_float(b, t, j)->key_offset); -} - -static struct rw_aux_tree *rw_aux_tree(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); - - return __aux_tree_base(b, t); -} - -/* - * For the write set - the one we're currently inserting keys into - we don't - * maintain a full search tree, we just keep a simple lookup table in t->prev. - */ -static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, - struct bset_tree *t, - unsigned j) -{ - return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); -} - -static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, - unsigned j, struct bkey_packed *k) -{ - EBUG_ON(k >= btree_bkey_last(b, t)); - - rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { - .offset = __btree_node_key_to_offset(b, k), - .k = bkey_unpack_pos(b, k), - }; -} - -static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *k = btree_bkey_first(b, t); - unsigned j = 0; - - BUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) - return; - - BUG_ON(t->size < 1); - BUG_ON(rw_aux_to_bkey(b, t, j) != k); - - goto start; - while (1) { - if (rw_aux_to_bkey(b, t, j) == k) { - BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k, - bkey_unpack_pos(b, k))); -start: - if (++j == t->size) - break; - - BUG_ON(rw_aux_tree(b, t)[j].offset <= - rw_aux_tree(b, t)[j - 1].offset); - } - - k = bkey_p_next(k); - BUG_ON(k >= btree_bkey_last(b, t)); - } -} - -static inline void bch2_bset_verify_rw_aux_tree(struct btree *b, - struct bset_tree *t) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bch2_bset_verify_rw_aux_tree(b, t); -} - -/* returns idx of first entry >= offset: */ -static unsigned rw_aux_tree_bsearch(struct btree *b, - struct bset_tree *t, - unsigned offset) -{ - unsigned bset_offs = offset - btree_bkey_first_offset(t); - unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); - unsigned idx = bset_u64s ? 
bset_offs * t->size / bset_u64s : 0; - - EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); - EBUG_ON(!t->size); - EBUG_ON(idx > t->size); - - while (idx < t->size && - rw_aux_tree(b, t)[idx].offset < offset) - idx++; - - while (idx && - rw_aux_tree(b, t)[idx - 1].offset >= offset) - idx--; - - EBUG_ON(idx < t->size && - rw_aux_tree(b, t)[idx].offset < offset); - EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); - EBUG_ON(idx + 1 < t->size && - rw_aux_tree(b, t)[idx].offset == - rw_aux_tree(b, t)[idx + 1].offset); - - return idx; -} - -static inline unsigned bkey_mantissa(const struct bkey_packed *k, - const struct bkey_float *f) -{ - u64 v; - - EBUG_ON(!bkey_packed(k)); - - v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); - - /* - * In little endian, we're shifting off low bits (and then the bits we - * want are at the low end), in big endian we're shifting off high bits - * (and then the bits we want are at the high end, so we shift them - * back down): - */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - v >>= f->exponent & 7; -#else - v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; -#endif - return (u16) v; -} - -static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) -{ - struct bkey_float *f = bkey_float(b, t, j); - struct bkey_packed *m = tree_to_bkey(b, t, j); - struct bkey_packed *l = is_power_of_2(j) - ? min_key - : tree_to_bkey(b, t, j >> ffs(j)); - struct bkey_packed *r = is_power_of_2(j + 1) - ? max_key - : tree_to_bkey(b, t, j >> (ffz(j) + 1)); - unsigned mantissa; - int shift, exponent, high_bit; - - /* - * for failed bfloats, the lookup code falls back to comparing against - * the original key. - */ - - if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || - !b->nr_key_bits) { - f->exponent = BFLOAT_FAILED_UNPACKED; - return; - } - - /* - * The greatest differing bit of l and r is the first bit we must - * include in the bfloat mantissa we're creating in order to do - * comparisons - that bit always becomes the high bit of - * bfloat->mantissa, and thus the exponent we're calculating here is - * the position of what will become the low bit in bfloat->mantissa: - * - * Note that this may be negative - we may be running off the low end - * of the key: we handle this later: - */ - high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), - min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); - exponent = high_bit - (BKEY_MANTISSA_BITS - 1); - - /* - * Then we calculate the actual shift value, from the start of the key - * (k->_data), to get the key bits starting at exponent: - */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; - - EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); -#else - shift = high_bit_offset + - b->nr_key_bits - - exponent - - BKEY_MANTISSA_BITS; - - EBUG_ON(shift < KEY_PACKED_BITS_START); -#endif - EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); - - f->exponent = shift; - mantissa = bkey_mantissa(m, f); - - /* - * If we've got garbage bits, set them to all 1s - it's legal for the - * bfloat to compare larger than the original key, but not smaller: - */ - if (exponent < 0) - mantissa |= ~(~0U << -exponent); - - f->mantissa = mantissa; -} - -/* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t) -{ - bset_aux_tree_verify(b); - - return btree_aux_data_bytes(b) 
- t->aux_data_offset * sizeof(u64); -} - -static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t) -{ - return __bset_tree_capacity(b, t) / sizeof(struct bkey_float); -} - -static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t) -{ - return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); -} - -static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *k; - - t->size = 1; - t->extra = BSET_RW_AUX_TREE_VAL; - rw_aux_tree(b, t)[0].offset = - __btree_node_key_to_offset(b, btree_bkey_first(b, t)); - - bset_tree_for_each_key(b, t, k) { - if (t->size == bset_rw_tree_capacity(b, t)) - break; - - if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > - L1_CACHE_BYTES) - rw_aux_tree_set(b, t, t->size++, k); - } -} - -static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *k = btree_bkey_first(b, t); - struct bkey_i min_key, max_key; - unsigned cacheline = 1; - - t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), - bset_ro_tree_capacity(b, t)); -retry: - if (t->size < 2) { - t->size = 0; - t->extra = BSET_NO_AUX_TREE_VAL; - return; - } - - t->extra = eytzinger1_extra(t->size - 1); - - /* First we figure out where the first key in each cacheline is */ - eytzinger1_for_each(j, t->size - 1) { - while (bkey_to_cacheline(b, t, k) < cacheline) - k = bkey_p_next(k); - - if (k >= btree_bkey_last(b, t)) { - /* XXX: this path sucks */ - t->size--; - goto retry; - } - - bkey_float(b, t, j)->key_offset = - bkey_to_cacheline_offset(b, t, cacheline++, k); - - EBUG_ON(tree_to_bkey(b, t, j) != k); - } - - if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { - bkey_init(&min_key.k); - min_key.k.p = b->data->min_key; - } - - if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { - bkey_init(&max_key.k); - max_key.k.p = b->data->max_key; - } - - /* Then we build the tree */ - eytzinger1_for_each(j, t->size - 1) - make_bfloat(b, t, j, - bkey_to_packed(&min_key), - bkey_to_packed(&max_key)); -} - -static void bset_alloc_tree(struct btree *b, struct bset_tree *t) -{ - struct bset_tree *i; - - for (i = b->set; i != t; i++) - BUG_ON(bset_has_rw_aux_tree(i)); - - bch2_bset_set_no_aux_tree(b, t); - - /* round up to next cacheline: */ - t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), - SMP_CACHE_BYTES / sizeof(u64)); - - bset_aux_tree_verify(b); -} - -void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, - bool writeable) -{ - if (writeable - ? 
bset_has_rw_aux_tree(t) - : bset_has_ro_aux_tree(t)) - return; - - bset_alloc_tree(b, t); - - if (!__bset_tree_capacity(b, t)) - return; - - if (writeable) - __build_rw_aux_tree(b, t); - else - __build_ro_aux_tree(b, t); - - bset_aux_tree_verify(b); -} - -void bch2_bset_init_first(struct btree *b, struct bset *i) -{ - struct bset_tree *t; - - BUG_ON(b->nsets); - - memset(i, 0, sizeof(*i)); - get_random_bytes(&i->seq, sizeof(i->seq)); - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - t = &b->set[b->nsets++]; - set_btree_bset(b, t, i); -} - -void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne) -{ - struct bset *i = &bne->keys; - struct bset_tree *t; - - BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b)); - BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); - BUG_ON(b->nsets >= MAX_BSETS); - - memset(i, 0, sizeof(*i)); - i->seq = btree_bset_first(b)->seq; - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - t = &b->set[b->nsets++]; - set_btree_bset(b, t, i); -} - -/* - * find _some_ key in the same bset as @k that precedes @k - not necessarily the - * immediate predecessor: - */ -static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - struct bkey_packed *p; - unsigned offset; - int j; - - EBUG_ON(k < btree_bkey_first(b, t) || - k > btree_bkey_last(b, t)); - - if (k == btree_bkey_first(b, t)) - return NULL; - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - p = btree_bkey_first(b, t); - break; - case BSET_RO_AUX_TREE: - j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); - - do { - p = j ? tree_to_bkey(b, t, - __inorder_to_eytzinger1(j--, - t->size - 1, t->extra)) - : btree_bkey_first(b, t); - } while (p >= k); - break; - case BSET_RW_AUX_TREE: - offset = __btree_node_key_to_offset(b, k); - j = rw_aux_tree_bsearch(b, t, offset); - p = j ? rw_aux_to_bkey(b, t, j - 1) - : btree_bkey_first(b, t); - break; - } - - return p; -} - -struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k, - unsigned min_key_type) -{ - struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; - - while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_p_next(i)) - if (i->type >= min_key_type) - ret = i; - - k = p; - } - - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - BUG_ON(ret >= orig_k); - - for (i = ret - ? bkey_p_next(ret) - : btree_bkey_first(b, t); - i != orig_k; - i = bkey_p_next(i)) - BUG_ON(i->type >= min_key_type); - } - - return ret; -} - -/* Insert */ - -static void rw_aux_tree_insert_entry(struct btree *b, - struct bset_tree *t, - unsigned idx) -{ - EBUG_ON(!idx || idx > t->size); - struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1); - struct bkey_packed *end = idx < t->size - ? 
rw_aux_to_bkey(b, t, idx) - : btree_bkey_last(b, t); - - if (t->size < bset_rw_tree_capacity(b, t) && - (void *) end - (void *) start > L1_CACHE_BYTES) { - struct bkey_packed *k = start; - - while (1) { - k = bkey_p_next(k); - if (k == end) - break; - - if ((void *) k - (void *) start >= L1_CACHE_BYTES) { - memmove(&rw_aux_tree(b, t)[idx + 1], - &rw_aux_tree(b, t)[idx], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[idx]); - t->size++; - rw_aux_tree_set(b, t, idx, k); - break; - } - } - } -} - -static void bch2_bset_fix_lookup_table(struct btree *b, - struct bset_tree *t, - struct bkey_packed *_where, - unsigned clobber_u64s, - unsigned new_u64s) -{ - int shift = new_u64s - clobber_u64s; - unsigned idx, j, where = __btree_node_key_to_offset(b, _where); - - EBUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) - return; - - if (where > rw_aux_tree(b, t)[t->size - 1].offset) { - rw_aux_tree_insert_entry(b, t, t->size); - goto verify; - } - - /* returns first entry >= where */ - idx = rw_aux_tree_bsearch(b, t, where); - - if (rw_aux_tree(b, t)[idx].offset == where) { - if (!idx) { /* never delete first entry */ - idx++; - } else if (where < t->end_offset) { - rw_aux_tree_set(b, t, idx++, _where); - } else { - EBUG_ON(where != t->end_offset); - rw_aux_tree_insert_entry(b, t, --t->size); - goto verify; - } - } - - EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where); - if (idx < t->size && - rw_aux_tree(b, t)[idx].offset + shift == - rw_aux_tree(b, t)[idx - 1].offset) { - memmove(&rw_aux_tree(b, t)[idx], - &rw_aux_tree(b, t)[idx + 1], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[idx + 1]); - t->size -= 1; - } - - for (j = idx; j < t->size; j++) - rw_aux_tree(b, t)[j].offset += shift; - - EBUG_ON(idx < t->size && - rw_aux_tree(b, t)[idx].offset == - rw_aux_tree(b, t)[idx - 1].offset); - - rw_aux_tree_insert_entry(b, t, idx); - -verify: - bch2_bset_verify_rw_aux_tree(b, t); - bset_aux_tree_verify(b); -} - -void bch2_bset_insert(struct btree *b, - struct bkey_packed *where, - struct bkey_i *insert, - unsigned clobber_u64s) -{ - struct bkey_format *f = &b->format; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed packed, *src = bkey_to_packed(insert); - - bch2_bset_verify_rw_aux_tree(b, t); - bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); - - if (bch2_bkey_pack_key(&packed, &insert->k, f)) - src = &packed; - - if (!bkey_deleted(&insert->k)) - btree_keys_account_key_add(&b->nr, t - b->set, src); - - if (src->u64s != clobber_u64s) { - u64 *src_p = (u64 *) where->_data + clobber_u64s; - u64 *dst_p = (u64 *) where->_data + src->u64s; - - EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < - (int) clobber_u64s - src->u64s); - - memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); - le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); - set_btree_bset_end(b, t); - } - - memcpy_u64s_small(where, src, - bkeyp_key_u64s(f, src)); - memcpy_u64s(bkeyp_val(f, where), &insert->v, - bkeyp_val_u64s(f, src)); - - if (src->u64s != clobber_u64s) - bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); - - bch2_verify_btree_nr_keys(b); -} - -void bch2_bset_delete(struct btree *b, - struct bkey_packed *where, - unsigned clobber_u64s) -{ - struct bset_tree *t = bset_tree_last(b); - u64 *src_p = (u64 *) where->_data + clobber_u64s; - u64 *dst_p = where->_data; - - bch2_bset_verify_rw_aux_tree(b, t); - - EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); - - 
memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); - le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); - set_btree_bset_end(b, t); - - bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); -} - -/* Lookup */ - -__flatten -static struct bkey_packed *bset_search_write_set(const struct btree *b, - struct bset_tree *t, - struct bpos *search) -{ - unsigned l = 0, r = t->size; - - while (l + 1 != r) { - unsigned m = (l + r) >> 1; - - if (bpos_lt(rw_aux_tree(b, t)[m].k, *search)) - l = m; - else - r = m; - } - - return rw_aux_to_bkey(b, t, l); -} - -static inline void prefetch_four_cachelines(void *p) -{ -#ifdef CONFIG_X86_64 - asm("prefetcht0 (-127 + 64 * 0)(%0);" - "prefetcht0 (-127 + 64 * 1)(%0);" - "prefetcht0 (-127 + 64 * 2)(%0);" - "prefetcht0 (-127 + 64 * 3)(%0);" - : - : "r" (p + 127)); -#else - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - prefetch(p + L1_CACHE_BYTES * 3); -#endif -} - -static inline bool bkey_mantissa_bits_dropped(const struct btree *b, - const struct bkey_float *f) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; - - return f->exponent > key_bits_start; -#else - unsigned key_bits_end = high_bit_offset + b->nr_key_bits; - - return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; -#endif -} - -__flatten -static struct bkey_packed *bset_search_tree(const struct btree *b, - const struct bset_tree *t, - const struct bpos *search, - const struct bkey_packed *packed_search) -{ - struct ro_aux_tree *base = ro_aux_tree_base(b, t); - struct bkey_float *f; - struct bkey_packed *k; - unsigned inorder, n = 1, l, r; - int cmp; - - do { - if (likely(n << 4 < t->size)) - prefetch(&base->f[n << 4]); - - f = &base->f[n]; - if (unlikely(f->exponent >= BFLOAT_FAILED)) - goto slowpath; - - l = f->mantissa; - r = bkey_mantissa(packed_search, f); - - if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f)) - goto slowpath; - - n = n * 2 + (l < r); - continue; -slowpath: - k = tree_to_bkey(b, t, n); - cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); - if (!cmp) - return k; - - n = n * 2 + (cmp < 0); - } while (n < t->size); - - inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); - - /* - * n would have been the node we recursed to - the low bit tells us if - * we recursed left or recursed right. - */ - if (likely(!(n & 1))) { - --inorder; - if (unlikely(!inorder)) - return btree_bkey_first(b, t); - - f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; - } - - return cacheline_to_bkey(b, t, inorder, f->key_offset); -} - -static __always_inline __flatten -struct bkey_packed *__bch2_bset_search(struct btree *b, - struct bset_tree *t, - struct bpos *search, - const struct bkey_packed *lossy_packed_search) -{ - - /* - * First, we search for a cacheline, then lastly we do a linear search - * within that cacheline. - * - * To search for the cacheline, there's three different possibilities: - * * The set is too small to have a search tree, so we just do a linear - * search over the whole set. - * * The set is the one we're currently inserting into; keeping a full - * auxiliary search tree up to date would be too expensive, so we - * use a much simpler lookup table to do a binary search - - * bset_search_write_set(). 
- * * Or we use the auxiliary search tree we constructed earlier - - * bset_search_tree() - */ - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - return btree_bkey_first(b, t); - case BSET_RW_AUX_TREE: - return bset_search_write_set(b, t, search); - case BSET_RO_AUX_TREE: - return bset_search_tree(b, t, search, lossy_packed_search); - default: - BUG(); - } -} - -static __always_inline __flatten -struct bkey_packed *bch2_bset_search_linear(struct btree *b, - struct bset_tree *t, - struct bpos *search, - struct bkey_packed *packed_search, - const struct bkey_packed *lossy_packed_search, - struct bkey_packed *m) -{ - if (lossy_packed_search) - while (m != btree_bkey_last(b, t) && - bkey_iter_cmp_p_or_unp(b, m, - lossy_packed_search, search) < 0) - m = bkey_p_next(m); - - if (!packed_search) - while (m != btree_bkey_last(b, t) && - bkey_iter_pos_cmp(b, m, search) < 0) - m = bkey_p_next(m); - - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - - BUG_ON(prev && - bkey_iter_cmp_p_or_unp(b, prev, - packed_search, search) >= 0); - } - - return m; -} - -/* Btree node iterator */ - -static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - if (k != end) { - struct btree_node_iter_set *pos; - - btree_node_iter_for_each(iter, pos) - ; - - BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); - *pos = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }; - } -} - -void bch2_btree_node_iter_push(struct btree_node_iter *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - __bch2_btree_node_iter_push(iter, b, k, end); - bch2_btree_node_iter_sort(iter, b); -} - -noinline __flatten __cold -static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, - struct btree *b, struct bpos *search) -{ - struct bkey_packed *k; - - trace_bkey_pack_pos_fail(search); - - bch2_btree_node_iter_init_from_start(iter, b); - - while ((k = bch2_btree_node_iter_peek(iter, b)) && - bkey_iter_pos_cmp(b, k, search) < 0) - bch2_btree_node_iter_advance(iter, b); -} - -/** - * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a - * given position - * - * @iter: iterator to initialize - * @b: btree node to search - * @search: search key - * - * Main entry point to the lookup code for individual btree nodes: - * - * NOTE: - * - * When you don't filter out deleted keys, btree nodes _do_ contain duplicate - * keys. This doesn't matter for most code, but it does matter for lookups. - * - * Some adjacent keys with a string of equal keys: - * i j k k k k l m - * - * If you search for k, the lookup code isn't guaranteed to return you any - * specific k. The lookup code is conceptually doing a binary search and - * iterating backwards is very expensive so if the pivot happens to land at the - * last k that's what you'll get. - * - * This works out ok, but it's something to be aware of: - * - * - For non extents, we guarantee that the live key comes last - see - * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't - * see will only be deleted keys you don't care about. - * - * - For extents, deleted keys sort last (see the comment at the top of this - * file). 
But when you're searching for extents, you actually want the first - * key strictly greater than your search key - an extent that compares equal - * to the search key is going to have 0 sectors after the search key. - * - * But this does mean that we can't just search for - * bpos_successor(start_of_range) to get the first extent that overlaps with - * the range we want - if we're unlucky and there's an extent that ends - * exactly where we searched, then there could be a deleted key at the same - * position and we'd get that when we search instead of the preceding extent - * we needed. - * - * So we've got to search for start_of_range, then after the lookup iterate - * past any extents that compare equal to the position we searched for. - */ -__flatten -void bch2_btree_node_iter_init(struct btree_node_iter *iter, - struct btree *b, struct bpos *search) -{ - struct bkey_packed p, *packed_search = NULL; - struct btree_node_iter_set *pos = iter->data; - struct bkey_packed *k[MAX_BSETS]; - unsigned i; - - EBUG_ON(bpos_lt(*search, b->data->min_key)); - EBUG_ON(bpos_gt(*search, b->data->max_key)); - bset_aux_tree_verify(b); - - memset(iter, 0, sizeof(*iter)); - - switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { - case BKEY_PACK_POS_EXACT: - packed_search = &p; - break; - case BKEY_PACK_POS_SMALLER: - packed_search = NULL; - break; - case BKEY_PACK_POS_FAIL: - btree_node_iter_init_pack_failed(iter, b, search); - return; - } - - for (i = 0; i < b->nsets; i++) { - k[i] = __bch2_bset_search(b, b->set + i, search, &p); - prefetch_four_cachelines(k[i]); - } - - for (i = 0; i < b->nsets; i++) { - struct bset_tree *t = b->set + i; - struct bkey_packed *end = btree_bkey_last(b, t); - - k[i] = bch2_bset_search_linear(b, t, search, - packed_search, &p, k[i]); - if (k[i] != end) - *pos++ = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k[i]), - __btree_node_key_to_offset(b, end) - }; - } - - bch2_btree_node_iter_sort(iter, b); -} - -void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, - struct btree *b) -{ - memset(iter, 0, sizeof(*iter)); - - for_each_bset(b, t) - __bch2_btree_node_iter_push(iter, b, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - bch2_btree_node_iter_sort(iter, b); -} - -struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, - struct btree *b, - struct bset_tree *t) -{ - struct btree_node_iter_set *set; - - btree_node_iter_for_each(iter, set) - if (set->end == t->end_offset) - return __btree_node_offset_to_key(b, set->k); - - return btree_bkey_last(b, t); -} - -static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, - struct btree *b, - unsigned first) -{ - bool ret; - - if ((ret = (btree_node_iter_cmp(b, - iter->data[first], - iter->data[first + 1]) > 0))) - swap(iter->data[first], iter->data[first + 1]); - return ret; -} - -void bch2_btree_node_iter_sort(struct btree_node_iter *iter, - struct btree *b) -{ - /* unrolled bubble sort: */ - - if (!__btree_node_iter_set_end(iter, 2)) { - btree_node_iter_sort_two(iter, b, 0); - btree_node_iter_sort_two(iter, b, 1); - } - - if (!__btree_node_iter_set_end(iter, 1)) - btree_node_iter_sort_two(iter, b, 0); -} - -void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, - struct btree_node_iter_set *set) -{ - struct btree_node_iter_set *last = - iter->data + ARRAY_SIZE(iter->data) - 1; - - memmove(&set[0], &set[1], (void *) last - (void *) set); - *last = (struct btree_node_iter_set) { 0, 0 }; -} - -static inline void 
__bch2_btree_node_iter_advance(struct btree_node_iter *iter, - struct btree *b) -{ - iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; - - EBUG_ON(iter->data->k > iter->data->end); - - if (unlikely(__btree_node_iter_set_end(iter, 0))) { - /* avoid an expensive memmove call: */ - iter->data[0] = iter->data[1]; - iter->data[1] = iter->data[2]; - iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; - return; - } - - if (__btree_node_iter_set_end(iter, 1)) - return; - - if (!btree_node_iter_sort_two(iter, b, 0)) - return; - - if (__btree_node_iter_set_end(iter, 2)) - return; - - btree_node_iter_sort_two(iter, b, 1); -} - -void bch2_btree_node_iter_advance(struct btree_node_iter *iter, - struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - __bch2_btree_node_iter_verify(iter, b); - __bch2_btree_node_iter_next_check(iter, b); - } - - __bch2_btree_node_iter_advance(iter, b); -} - -/* - * Expensive: - */ -struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *k, *prev = NULL; - struct btree_node_iter_set *set; - unsigned end = 0; - - bch2_btree_node_iter_verify(iter, b); - - for_each_bset(b, t) { - k = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t)); - if (k && - (!prev || bkey_iter_cmp(b, k, prev) > 0)) { - prev = k; - end = t->end_offset; - } - } - - if (!prev) - return NULL; - - /* - * We're manually memmoving instead of just calling sort() to ensure the - * prev we picked ends up in slot 0 - sort won't necessarily put it - * there because of duplicate deleted keys: - */ - btree_node_iter_for_each(iter, set) - if (set->end == end) - goto found; - - BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); -found: - BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); - - memmove(&iter->data[1], - &iter->data[0], - (void *) set - (void *) &iter->data[0]); - - iter->data[0].k = __btree_node_key_to_offset(b, prev); - iter->data[0].end = end; - - bch2_btree_node_iter_verify(iter, b); - return prev; -} - -struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *prev; - - do { - prev = bch2_btree_node_iter_prev_all(iter, b); - } while (prev && bkey_deleted(prev)); - - return prev; -} - -struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, - struct btree *b, - struct bkey *u) -{ - struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); - - return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null;
-}
-
-/* Mergesort */
-
-void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
-{
-	for_each_bset_c(b, t) {
-		enum bset_aux_tree_type type = bset_aux_tree_type(t);
-		size_t j;
-
-		stats->sets[type].nr++;
-		stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
-			sizeof(u64);
-
-		if (bset_has_ro_aux_tree(t)) {
-			stats->floats += t->size - 1;
-
-			for (j = 1; j < t->size; j++)
-				stats->failed +=
-					bkey_float(b, t, j)->exponent ==
-					BFLOAT_FAILED;
-		}
-	}
-}
-
-void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
-			 struct bkey_packed *k)
-{
-	struct bset_tree *t = bch2_bkey_to_bset(b, k);
-	struct bkey uk;
-	unsigned j, inorder;
-
-	if (!bset_has_ro_aux_tree(t))
-		return;
-
-	inorder = bkey_to_cacheline(b, t, k);
-	if (!inorder || inorder >= t->size)
-		return;
-
-	j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
-	if (k != tree_to_bkey(b, t, j))
-		return;
-
-	switch (bkey_float(b, t, j)->exponent) {
-	case BFLOAT_FAILED:
-		uk = bkey_unpack_key(b, k);
-		prt_printf(out,
-			   " failed unpacked at depth %u\n"
-			   "\t",
-			   ilog2(j));
-		bch2_bpos_to_text(out, uk.p);
-		prt_printf(out, "\n");
-		break;
-	}
-}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
deleted file mode 100644
index a15ecf9d006e05..00000000000000
--- a/fs/bcachefs/bset.h
+++ /dev/null
@@ -1,536 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BSET_H
-#define _BCACHEFS_BSET_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "btree_types.h"
-#include "util.h" /* for time_stats */
-#include "vstructs.h"
-
-/*
- * BKEYS:
- *
- * A bkey contains a key, a size field, a variable number of pointers, and some
- * ancillary flag bits.
- *
- * We use two different functions for validating bkeys, bkey_invalid and
- * bkey_deleted().
- *
- * The one exception to the rule that ptr_invalid() filters out invalid keys is
- * that it also filters out keys of size 0 - these are keys that have been
- * completely overwritten. It'd be safe to delete these in memory while leaving
- * them on disk, just unnecessary work - so we filter them out when resorting
- * instead.
- *
- * We can't filter out stale keys when we're resorting, because garbage
- * collection needs to find them to ensure bucket gens don't wrap around -
- * unless we're rewriting the btree node those stale keys still exist on disk.
- *
- * We also implement functions here for removing some number of sectors from the
- * front or the back of a bkey - this is mainly used for fixing overlapping
- * extents, by removing the overlapping sectors from the older key.
- *
- * BSETS:
- *
- * A bset is an array of bkeys laid out contiguously in memory in sorted order,
- * along with a header. A btree node is made up of a number of these, written at
- * different times.
- *
- * There could be many of them on disk, but we never allow there to be more than
- * 4 in memory - we lazily resort as needed.
- *
- * We implement code here for creating and maintaining auxiliary search trees
- * (described below) for searching an individual bset, and on top of that we
- * implement a btree iterator.
- *
- * BTREE ITERATOR:
- *
- * Most of the code in bcache doesn't care about an individual bset - it needs
- * to search entire btree nodes and iterate over them in sorted order.
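
To make the iterator's role concrete before the header comment continues, a minimal consumer of the node iterator defined in bset.c above might look like the following sketch. This is hypothetical example code, not part of the file: it assumes the helper signatures shown earlier in this patch and omits locking and error handling.

/* Sketch only: walk every key of a node, across all bsets, in sorted order */
static void example_dump_positions(struct btree *b)
{
	struct btree_node_iter iter;
	struct bkey_packed *k;

	bch2_btree_node_iter_init_from_start(&iter, b);

	while ((k = bch2_btree_node_iter_peek_all(&iter, b))) {
		struct bkey uk = bkey_unpack_key(b, k);

		/* peek_all also returns whiteouts; filter them here */
		if (!bkey_deleted(&uk))
			pr_info("live key at %llu:%llu\n",
				uk.p.inode, uk.p.offset);

		bch2_btree_node_iter_advance(&iter, b);
	}
}
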
- *
- * The btree iterator code serves both functions; it iterates through the keys
- * in a btree node in sorted order, starting from either keys after a specific
- * point (if you pass it a search key) or the start of the btree node.
- *
- * AUXILIARY SEARCH TREES:
- *
- * Since keys are variable length, we can't use a binary search on a bset - we
- * wouldn't be able to find the start of the next key. But binary searches are
- * slow anyway, due to terrible cache behaviour; bcache originally used binary
- * searches and that code topped out at under 50k lookups/second.
- *
- * So we need to construct some sort of lookup table. Since we only insert keys
- * into the last (unwritten) set, most of the keys within a given btree node are
- * usually in sets that are mostly constant. We use two different types of
- * lookup tables to take advantage of this.
- *
- * Both lookup tables share in common that they don't index every key in the
- * set; they index one key every BSET_CACHELINE bytes, and then a linear search
- * is used for the rest.
- *
- * For sets that have been written to disk and are no longer being inserted
- * into, we construct a binary search tree in an array - traversing a binary
- * search tree in an array gives excellent locality of reference and is very
- * fast, since both children of any node are adjacent to each other in memory
- * (and their grandchildren, and great grandchildren...) - this means
- * prefetching can be used to great effect.
- *
- * It's quite useful performance-wise to keep these nodes small - not just
- * because they're more likely to be in L2, but also because we can prefetch
- * more nodes on a single cacheline and thus prefetch more iterations in advance
- * when traversing this tree.
- *
- * Nodes in the auxiliary search tree must contain both a key to compare against
- * (we don't want to fetch the key from the set, that would defeat the purpose),
- * and a pointer to the key. We use a few tricks to compress both of these.
- *
- * To compress the pointer, we take advantage of the fact that one node in the
- * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
- * a function (to_inorder()) that takes the index of a node in a binary tree and
- * returns what its index would be in an inorder traversal, so we only have to
- * store the low bits of the offset.
- *
- * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
- * compress that, we take advantage of the fact that when we're traversing the
- * search tree at every iteration we know that both our search key and the key
- * we're looking for lie within some range - bounded by our previous
- * comparisons. (We special case the start of a search so that this is true even
- * at the root of the tree).
- *
- * So if we know the key we're looking for is between a and b, and a and b don't
- * differ above bit 50, we don't need to check anything higher than bit 50.
- *
- * We don't usually need the rest of the bits, either; we only need enough bits
- * to partition the key range we're currently checking. Consider key n - the
- * key our auxiliary search tree node corresponds to, and key p, the key
- * immediately preceding n. The lowest bit we need to store in the auxiliary
- * search tree is the highest bit that differs between n and p.
- *
- * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
- * comparison. But we'd really like our nodes in the auxiliary search tree to be
- * of fixed size.
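- *
- * (A worked example: if our search bounds agree on everything above bit 50,
- * then a 22 bit mantissa taken from bits 50..29 of the key is enough to
- * order this node against any key in that range.)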
- *
- * The solution is to make them fixed size, and when we're constructing a node
- * check if p and n differed in the bits we needed them to. If they don't we
- * flag that node, and when doing lookups we fall back to comparing against the
- * real key. As long as this doesn't happen too often (and it seems to reliably
- * happen a bit less than 1% of the time), we win - even on failures, that key
- * is then more likely to be in cache than if we were doing binary searches all
- * the way, since we're touching so much less memory.
- *
- * The keys in the auxiliary search tree are stored in (software) floating
- * point, with an exponent and a mantissa. The exponent needs to be big enough
- * to address all the bits in the original key, but the number of bits in the
- * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
- *
- * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
- * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
- * We need one node per 128 bytes in the btree node, which means the auxiliary
- * search trees take up 3% as much memory as the btree itself.
- *
- * Constructing these auxiliary search trees is moderately expensive, and we
- * don't want to be constantly rebuilding the search tree for the last set
- * whenever we insert another key into it. For the unwritten set, we use a much
- * simpler lookup table - it's just a flat array, so index i in the lookup table
- * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
- * within each byte range works the same as with the auxiliary search trees.
- *
- * These are much easier to keep up to date when we insert a key - we do it
- * somewhat lazily; when we shift a key up we usually just increment the pointer
- * to it, only when it would overflow do we go to the trouble of finding the
- * first key in that range of bytes again.
- */
-
-enum bset_aux_tree_type {
-	BSET_NO_AUX_TREE,
-	BSET_RO_AUX_TREE,
-	BSET_RW_AUX_TREE,
-};
-
-#define BSET_TREE_NR_TYPES	3
-
-#define BSET_NO_AUX_TREE_VAL	(U16_MAX)
-#define BSET_RW_AUX_TREE_VAL	(U16_MAX - 1)
-
-static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
-{
-	switch (t->extra) {
-	case BSET_NO_AUX_TREE_VAL:
-		EBUG_ON(t->size);
-		return BSET_NO_AUX_TREE;
-	case BSET_RW_AUX_TREE_VAL:
-		EBUG_ON(!t->size);
-		return BSET_RW_AUX_TREE;
-	default:
-		EBUG_ON(!t->size);
-		return BSET_RO_AUX_TREE;
-	}
-}
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It defines the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliary search tree - when we're done searching the bset_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
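- *
- * At the current value of 256, that works out to one 4-byte bkey_float per
- * 256 bytes of bset, i.e. about 1.6% memory overhead for the ro aux trees
- * (the 3% figure above assumed the older value of 128).
- *
- * The lookup itself is the classic implicit-binary-tree descent, with a
- * prefetch of both children (adjacent in memory) at each step - an
- * illustrative sketch, with cmp() standing in for the real packed-key
- * comparison and base[] for the bkey_float array:
- *
- *	unsigned j = 1;
- *
- *	while (j < t->size) {
- *		prefetch(&base[2 * j]);
- *		j = 2 * j + (cmp(search, &base[j]) > 0);
- *	}
- *
- * followed by a linear search over the BSET_CACHELINE bytes that j maps to.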
- */ - -#define BSET_CACHELINE 256 - -static inline size_t btree_keys_cachelines(const struct btree *b) -{ - return (1U << b->byte_order) / BSET_CACHELINE; -} - -static inline size_t btree_aux_data_bytes(const struct btree *b) -{ - return btree_keys_cachelines(b) * 8; -} - -static inline size_t btree_aux_data_u64s(const struct btree *b) -{ - return btree_aux_data_bytes(b) / sizeof(u64); -} - -#define for_each_bset(_b, _t) \ - for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) - -#define for_each_bset_c(_b, _t) \ - for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) - -#define bset_tree_for_each_key(_b, _t, _k) \ - for (_k = btree_bkey_first(_b, _t); \ - _k != btree_bkey_last(_b, _t); \ - _k = bkey_p_next(_k)) - -static inline bool bset_has_ro_aux_tree(const struct bset_tree *t) -{ - return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; -} - -static inline bool bset_has_rw_aux_tree(struct bset_tree *t) -{ - return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; -} - -static inline void bch2_bset_set_no_aux_tree(struct btree *b, - struct bset_tree *t) -{ - BUG_ON(t < b->set); - - for (; t < b->set + ARRAY_SIZE(b->set); t++) { - t->size = 0; - t->extra = BSET_NO_AUX_TREE_VAL; - t->aux_data_offset = U16_MAX; - } -} - -static inline void btree_node_set_format(struct btree *b, - struct bkey_format f) -{ - int len; - - b->format = f; - b->nr_key_bits = bkey_format_key_bits(&f); - - len = bch2_compile_bkey_format(&b->format, b->aux_data); - BUG_ON(len < 0 || len > U8_MAX); - - b->unpack_fn_len = len; - - bch2_bset_set_no_aux_tree(b, b->set); -} - -static inline struct bset *bset_next_set(struct btree *b, - unsigned block_bytes) -{ - struct bset *i = btree_bset_last(b); - - EBUG_ON(!is_power_of_2(block_bytes)); - - return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); -} - -void bch2_btree_keys_init(struct btree *); - -void bch2_bset_init_first(struct btree *, struct bset *); -void bch2_bset_init_next(struct btree *, struct btree_node_entry *); -void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); - -void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *, - unsigned); -void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); - -/* Bkey utility code */ - -/* packed or unpacked */ -static inline int bkey_cmp_p_or_unp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r_packed, - const struct bpos *r) -{ - EBUG_ON(r_packed && !bkey_packed(r_packed)); - - if (unlikely(!bkey_packed(l))) - return bpos_cmp(packed_to_bkey_c(l)->p, *r); - - if (likely(r_packed)) - return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); - - return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -} - -static inline struct bset_tree * -bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) -{ - unsigned offset = __btree_node_key_to_offset(b, k); - - for_each_bset(b, t) - if (offset <= t->end_offset) { - EBUG_ON(offset < btree_bkey_first_offset(t)); - return t; - } - - BUG(); -} - -struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); - -struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, - struct bkey_packed *, unsigned); - -static inline struct bkey_packed * -bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -{ - return bch2_bkey_prev_filter(b, t, k, 0); -} - -static inline struct bkey_packed * -bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -{ - return 
bch2_bkey_prev_filter(b, t, k, 1); -} - -/* Btree key iteration */ - -void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); -void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, - struct bpos *); -void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, - struct btree *); -struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, - struct btree *, - struct bset_tree *); - -void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); -void bch2_btree_node_iter_set_drop(struct btree_node_iter *, - struct btree_node_iter_set *); -void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); - -#define btree_node_iter_for_each(_iter, _set) \ - for (_set = (_iter)->data; \ - _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ - (_set)->k != (_set)->end; \ - _set++) - -static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, - unsigned i) -{ - return iter->data[i].k == iter->data[i].end; -} - -static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) -{ - return __btree_node_iter_set_end(iter, 0); -} - -/* - * When keys compare equal, deleted keys compare first: - * - * XXX: only need to compare pointers for keys that are both within a - * btree_node_iterator - we need to break ties for prev() to work correctly - */ -static inline int bkey_iter_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed(b, l, r) - ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) - ?: cmp_int(l, r); -} - -static inline int btree_node_iter_cmp(const struct btree *b, - struct btree_node_iter_set l, - struct btree_node_iter_set r) -{ - return bkey_iter_cmp(b, - __btree_node_offset_to_key(b, l.k), - __btree_node_offset_to_key(b, r.k)); -} - -/* These assume r (the search key) is not a deleted key: */ -static inline int bkey_iter_pos_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bpos *r) -{ - return bkey_cmp_left_packed(b, l, r) - ?: -((int) bkey_deleted(l)); -} - -static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r_packed, - const struct bpos *r) -{ - return bkey_cmp_p_or_unp(b, l, r_packed, r) - ?: -((int) bkey_deleted(l)); -} - -static inline struct bkey_packed * -__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, - struct btree *b) -{ - return __btree_node_offset_to_key(b, iter->data->k); -} - -static inline struct bkey_packed * -bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) -{ - return !bch2_btree_node_iter_end(iter) - ? 
__btree_node_offset_to_key(b, iter->data->k) - : NULL; -} - -static inline struct bkey_packed * -bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) -{ - struct bkey_packed *k; - - while ((k = bch2_btree_node_iter_peek_all(iter, b)) && - bkey_deleted(k)) - bch2_btree_node_iter_advance(iter, b); - - return k; -} - -static inline struct bkey_packed * -bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) -{ - struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); - - if (ret) - bch2_btree_node_iter_advance(iter, b); - - return ret; -} - -struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, - struct btree *); -struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, - struct btree *); - -struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, - struct btree *, - struct bkey *); - -#define for_each_btree_node_key(b, k, iter) \ - for (bch2_btree_node_iter_init_from_start((iter), (b)); \ - (k = bch2_btree_node_iter_peek((iter), (b))); \ - bch2_btree_node_iter_advance(iter, b)) - -#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ - for (bch2_btree_node_iter_init_from_start((iter), (b)); \ - (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ - bch2_btree_node_iter_advance(iter, b)) - -/* Accounting: */ - -struct btree_nr_keys bch2_btree_node_count_keys(struct btree *); - -static inline void btree_keys_account_key(struct btree_nr_keys *n, - unsigned bset, - struct bkey_packed *k, - int sign) -{ - n->live_u64s += k->u64s * sign; - n->bset_u64s[bset] += k->u64s * sign; - - if (bkey_packed(k)) - n->packed_keys += sign; - else - n->unpacked_keys += sign; -} - -static inline void btree_keys_account_val_delta(struct btree *b, - struct bkey_packed *k, - int delta) -{ - struct bset_tree *t = bch2_bkey_to_bset(b, k); - - b->nr.live_u64s += delta; - b->nr.bset_u64s[t - b->set] += delta; -} - -#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ - btree_keys_account_key(_nr, _bset_idx, _k, 1) -#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ - btree_keys_account_key(_nr, _bset_idx, _k, -1) - -#define btree_account_key_add(_b, _k) \ - btree_keys_account_key(&(_b)->nr, \ - bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) -#define btree_account_key_drop(_b, _k) \ - btree_keys_account_key(&(_b)->nr, \ - bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) - -struct bset_stats { - struct { - size_t nr, bytes; - } sets[BSET_TREE_NR_TYPES]; - - size_t floats; - size_t failed; -}; - -void bch2_btree_keys_stats(const struct btree *, struct bset_stats *); -void bch2_bfloat_to_text(struct printbuf *, struct btree *, - struct bkey_packed *); - -/* Debug stuff */ - -void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); -void bch2_dump_btree_node(struct bch_fs *, struct btree *); -void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); - -void __bch2_verify_btree_nr_keys(struct btree *); -void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); - -static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bch2_btree_node_iter_verify(iter, b); -} - -static inline void bch2_verify_btree_nr_keys(struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_btree_accounting)) - __bch2_verify_btree_nr_keys(b); -} - -#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c 
deleted file mode 100644 index 83c9860e6b82cb..00000000000000 --- a/fs/bcachefs/btree_cache.c +++ /dev/null @@ -1,1516 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bbpos.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "debug.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "trace.h" - -#include -#include -#include - -const char * const bch2_btree_node_flags[] = { - "typebit", - "typebit", - "typebit", -#define x(f) [BTREE_NODE_##f] = #f, - BTREE_FLAGS() -#undef x - NULL -}; - -void bch2_recalc_btree_reserve(struct bch_fs *c) -{ - unsigned reserve = 16; - - if (!c->btree_roots_known[0].b) - reserve += 8; - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (r->b) - reserve += min_t(unsigned, 1, r->b->c.level) * 8; - } - - c->btree_cache.nr_reserve = reserve; -} - -static inline size_t btree_cache_can_free(struct btree_cache_list *list) -{ - struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); - - size_t can_free = list->nr; - if (!list->idx) - can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve); - return can_free; -} - -static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - - if (b->c.lock.readers) - list_add(&b->list, &bc->freed_pcpu); - else - list_add(&b->list, &bc->freed_nonpcpu); -} - -static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - BUG_ON(!b->data); - - bc->nr_freeable++; - list_add(&b->list, &bc->freeable); -} - -void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) -{ - struct btree_cache *bc = &c->btree_cache; - - mutex_lock(&bc->lock); - __bch2_btree_node_to_freelist(bc, b); - mutex_unlock(&bc->lock); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -} - -void __btree_node_data_free(struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - BUG_ON(btree_node_hashed(b)); - - /* - * This should really be done in slub/vmalloc, but we're using the - * kmalloc_large() path, so we're working around a slub bug by doing - * this here: - */ - if (b->data) - mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE); - if (b->aux_data) - mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE); - - EBUG_ON(btree_node_write_in_flight(b)); - - clear_btree_node_just_written(b); - - kvfree(b->data); - b->data = NULL; -#ifdef __KERNEL__ - kvfree(b->aux_data); -#else - munmap(b->aux_data, btree_aux_data_bytes(b)); -#endif - b->aux_data = NULL; -} - -static void btree_node_data_free(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(list_empty(&b->list)); - list_del_init(&b->list); - - __btree_node_data_free(b); - - --bc->nr_freeable; - btree_node_to_freedlist(bc, b); -} - -static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct btree *b = obj; - const u64 *v = arg->key; - - return b->hash_val == *v ? 
0 : 1; -} - -static const struct rhashtable_params bch_btree_cache_params = { - .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, hash_val), - .key_len = sizeof(u64), - .obj_cmpfn = bch2_btree_cache_cmp_fn, - .automatic_shrinking = true, -}; - -static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) -{ - BUG_ON(b->data || b->aux_data); - - gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; - - b->data = kvmalloc(btree_buf_bytes(b), gfp); - if (!b->data) - return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); -#ifdef __KERNEL__ - b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); -#else - b->aux_data = mmap(NULL, btree_aux_data_bytes(b), - PROT_READ|PROT_WRITE|PROT_EXEC, - MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); - if (b->aux_data == MAP_FAILED) - b->aux_data = NULL; -#endif - if (!b->aux_data) { - kvfree(b->data); - b->data = NULL; - return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); - } - - return 0; -} - -static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) -{ - struct btree *b; - - b = kzalloc(sizeof(struct btree), gfp); - if (!b) - return NULL; - - bkey_btree_ptr_init(&b->key); - INIT_LIST_HEAD(&b->list); - INIT_LIST_HEAD(&b->write_blocked); - b->byte_order = ilog2(c->opts.btree_node_size); - return b; -} - -struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) -{ - struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); - if (!b) - return NULL; - - if (btree_node_data_alloc(c, b, GFP_KERNEL)) { - kfree(b); - return NULL; - } - - bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); - return b; -} - -static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) -{ - struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); - - u64 mask = bc->pinned_nodes_mask[!!b->c.level]; - - return ((mask & BIT_ULL(b->c.btree_id)) && - bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && - bbpos_cmp(bc->pinned_nodes_end, pos) >= 0); -} - -void bch2_node_pin(struct bch_fs *c, struct btree *b) -{ - struct btree_cache *bc = &c->btree_cache; - - mutex_lock(&bc->lock); - if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { - set_btree_node_pinned(b); - list_move(&b->list, &bc->live[1].list); - bc->live[0].nr--; - bc->live[1].nr++; - } - mutex_unlock(&bc->lock); -} - -void bch2_btree_cache_unpin(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct btree *b, *n; - - mutex_lock(&bc->lock); - c->btree_cache.pinned_nodes_mask[0] = 0; - c->btree_cache.pinned_nodes_mask[1] = 0; - - list_for_each_entry_safe(b, n, &bc->live[1].list, list) { - clear_btree_node_pinned(b); - list_move(&b->list, &bc->live[0].list); - bc->live[0].nr++; - bc->live[1].nr--; - } - - mutex_unlock(&bc->lock); -} - -/* Btree in memory cache - hash table */ - -void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) -{ - lockdep_assert_held(&bc->lock); - - int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); - BUG_ON(ret); - - /* Cause future lookups for this node to fail: */ - b->hash_val = 0; - - if (b->c.btree_id < BTREE_ID_NR) - --bc->nr_by_btree[b->c.btree_id]; - --bc->live[btree_node_pinned(b)].nr; - list_del_init(&b->list); -} - -void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) -{ - __bch2_btree_node_hash_remove(bc, b); - __bch2_btree_node_to_freelist(bc, b); -} - -int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - BUG_ON(b->hash_val); - - b->hash_val = btree_ptr_hash_val(&b->key); - int ret = 
rhashtable_lookup_insert_fast(&bc->table, &b->hash, - bch_btree_cache_params); - if (ret) - return ret; - - if (b->c.btree_id < BTREE_ID_NR) - bc->nr_by_btree[b->c.btree_id]++; - - bool p = __btree_node_pinned(bc, b); - mod_bit(BTREE_NODE_pinned, &b->flags, p); - - list_add_tail(&b->list, &bc->live[p].list); - bc->live[p].nr++; - return 0; -} - -int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, - unsigned level, enum btree_id id) -{ - b->c.level = level; - b->c.btree_id = id; - - mutex_lock(&bc->lock); - int ret = __bch2_btree_node_hash_insert(bc, b); - mutex_unlock(&bc->lock); - - return ret; -} - -void bch2_btree_node_update_key_early(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_i *new) -{ - struct bch_fs *c = trans->c; - struct btree *b; - struct bkey_buf tmp; - int ret; - - bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_reassemble(&tmp, c, old); - - b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); - if (!IS_ERR_OR_NULL(b)) { - mutex_lock(&c->btree_cache.lock); - - __bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, new); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - - mutex_unlock(&c->btree_cache.lock); - six_unlock_read(&b->c.lock); - } - - bch2_bkey_buf_exit(&tmp, c); -} - -__flatten -static inline struct btree *btree_cache_find(struct btree_cache *bc, - const struct bkey_i *k) -{ - u64 v = btree_ptr_hash_val(k); - - return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); -} - -static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, - bool flush, bool locked) -{ - struct btree_cache *bc = &c->btree_cache; - - lockdep_assert_held(&bc->lock); - - if (btree_node_noevict(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - if (btree_node_write_blocked(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - if (btree_node_will_make_reachable(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (btree_node_dirty(b)) { - if (!flush) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (locked) { - /* - * Using the underscore version because we don't want to compact - * bsets after the write, since this node is about to be evicted - * - unless btree verify mode is enabled, since it runs out of - * the post write cleanup: - */ - if (static_branch_unlikely(&bch2_verify_btree_ondisk)) - bch2_btree_node_write(c, b, SIX_LOCK_intent, - BTREE_WRITE_cache_reclaim); - else - __bch2_btree_node_write(c, b, - BTREE_WRITE_cache_reclaim); - } - } - - if (b->flags & ((1U << BTREE_NODE_read_in_flight)| - (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_read_in_flight(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; - else if (btree_node_write_in_flight(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (locked) - return -EINTR; - - /* XXX: waiting on IO with btree cache lock held */ - bch2_btree_node_wait_on_read(b); - bch2_btree_node_wait_on_write(b); - } - - return 0; -} - -/* - * this version is for btree nodes that have already been freed (we're not - * reaping a real btree node) - */ -static int __btree_node_reclaim(struct bch_fs 
*c, struct btree *b, bool flush) -{ - struct btree_cache *bc = &c->btree_cache; - int ret = 0; - - lockdep_assert_held(&bc->lock); -retry_unlocked: - ret = __btree_node_reclaim_checks(c, b, flush, false); - if (ret) - return ret; - - if (!six_trylock_intent(&b->c.lock)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (!six_trylock_write(&b->c.lock)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; - six_unlock_intent(&b->c.lock); - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - /* recheck under lock */ - ret = __btree_node_reclaim_checks(c, b, flush, true); - if (ret) { - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - if (ret == -EINTR) - goto retry_unlocked; - return ret; - } - - if (b->hash_val && !ret) - trace_and_count(c, btree_cache_reap, c, b); - return 0; -} - -static int btree_node_reclaim(struct bch_fs *c, struct btree *b) -{ - return __btree_node_reclaim(c, b, false); -} - -static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) -{ - return __btree_node_reclaim(c, b, true); -} - -static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct btree_cache_list *list = shrink->private_data; - struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); - struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); - struct btree *b, *t; - unsigned long nr = sc->nr_to_scan; - unsigned long can_free = 0; - unsigned long freed = 0; - unsigned long touched = 0; - unsigned i, flags; - unsigned long ret = SHRINK_STOP; - bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; - - if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) - return SHRINK_STOP; - - mutex_lock(&bc->lock); - flags = memalloc_nofs_save(); - - /* - * It's _really_ critical that we don't free too many btree nodes - we - * have to always leave ourselves a reserve. 
The reserve is how we - * guarantee that allocating memory for a new btree node can always - * succeed, so that inserting keys into the btree can always succeed and - * IO can always make forward progress: - */ - can_free = btree_cache_can_free(list); - if (nr > can_free) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free; - nr = can_free; - } - - i = 0; - list_for_each_entry_safe(b, t, &bc->freeable, list) { - /* - * Leave a few nodes on the freeable list, so that a btree split - * won't have to hit the system allocator: - */ - if (++i <= 3) - continue; - - touched++; - - if (touched >= nr) - goto out; - - if (!btree_node_reclaim(c, b)) { - btree_node_data_free(bc, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - freed++; - bc->nr_freed++; - } - } -restart: - list_for_each_entry_safe(b, t, &list->list, list) { - touched++; - - if (btree_node_accessed(b)) { - clear_btree_node_accessed(b); - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; - --touched;; - } else if (!btree_node_reclaim(c, b)) { - __bch2_btree_node_hash_remove(bc, b); - __btree_node_data_free(b); - btree_node_to_freedlist(bc, b); - - freed++; - bc->nr_freed++; - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - if (freed == nr) - goto out_rotate; - } else if (trigger_writes && - btree_node_dirty(b) && - !btree_node_will_make_reachable(b) && - !btree_node_write_blocked(b) && - six_trylock_read(&b->c.lock)) { - list_move(&list->list, &b->list); - mutex_unlock(&bc->lock); - __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); - six_unlock_read(&b->c.lock); - if (touched >= nr) - goto out_nounlock; - mutex_lock(&bc->lock); - goto restart; - } - - if (touched >= nr) - break; - } -out_rotate: - if (&t->list != &list->list) - list_move_tail(&list->list, &t->list); -out: - mutex_unlock(&bc->lock); -out_nounlock: - ret = freed; - memalloc_nofs_restore(flags); - trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); - return ret; -} - -static unsigned long bch2_btree_cache_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct btree_cache_list *list = shrink->private_data; - - if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) - return 0; - - return btree_cache_can_free(list); -} - -void bch2_fs_btree_cache_exit(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct btree *b, *t; - unsigned long flags; - - shrinker_free(bc->live[1].shrink); - shrinker_free(bc->live[0].shrink); - - /* vfree() can allocate memory: */ - flags = memalloc_nofs_save(); - mutex_lock(&bc->lock); - - if (c->verify_data) - list_move(&c->verify_data->list, &bc->live[0].list); - - kvfree(c->verify_ondisk); - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (r->b) - list_add(&r->b->list, &bc->live[0].list); - } - - list_for_each_entry_safe(b, t, &bc->live[1].list, list) - bch2_btree_node_hash_remove(bc, b); - list_for_each_entry_safe(b, t, &bc->live[0].list, list) - bch2_btree_node_hash_remove(bc, b); - - list_for_each_entry_safe(b, t, &bc->freeable, list) { - BUG_ON(btree_node_read_in_flight(b) || - btree_node_write_in_flight(b)); - - btree_node_data_free(bc, b); - cond_resched(); - } - - BUG_ON(!bch2_journal_error(&c->journal) && - atomic_long_read(&c->btree_cache.nr_dirty)); - - list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); - - list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) { - list_del(&b->list); - six_lock_exit(&b->c.lock); - kfree(b); - } - - 
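-	/*
-	 * Every node has been freed at this point; the checks below verify
-	 * that the counters agree before the hash table is torn down:
-	 */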
mutex_unlock(&bc->lock); - memalloc_nofs_restore(flags); - - for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) - BUG_ON(bc->nr_by_btree[i]); - BUG_ON(bc->live[0].nr); - BUG_ON(bc->live[1].nr); - BUG_ON(bc->nr_freeable); - - if (bc->table_init_done) - rhashtable_destroy(&bc->table); -} - -int bch2_fs_btree_cache_init(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct shrinker *shrink; - unsigned i; - int ret = 0; - - ret = rhashtable_init(&bc->table, &bch_btree_cache_params); - if (ret) - goto err; - - bc->table_init_done = true; - - bch2_recalc_btree_reserve(c); - - for (i = 0; i < bc->nr_reserve; i++) { - struct btree *b = __bch2_btree_node_mem_alloc(c); - if (!b) - goto err; - __bch2_btree_node_to_freelist(bc, b); - } - - list_splice_init(&bc->live[0].list, &bc->freeable); - - mutex_init(&c->verify_lock); - - shrink = shrinker_alloc(0, "%s-btree_cache", c->name); - if (!shrink) - goto err; - bc->live[0].shrink = shrink; - shrink->count_objects = bch2_btree_cache_count; - shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 2; - shrink->private_data = &bc->live[0]; - shrinker_register(shrink); - - shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name); - if (!shrink) - goto err; - bc->live[1].shrink = shrink; - shrink->count_objects = bch2_btree_cache_count; - shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 8; - shrink->private_data = &bc->live[1]; - shrinker_register(shrink); - - return 0; -err: - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); -} - -void bch2_fs_btree_cache_init_early(struct btree_cache *bc) -{ - mutex_init(&bc->lock); - for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) { - bc->live[i].idx = i; - INIT_LIST_HEAD(&bc->live[i].list); - } - INIT_LIST_HEAD(&bc->freeable); - INIT_LIST_HEAD(&bc->freed_pcpu); - INIT_LIST_HEAD(&bc->freed_nonpcpu); -} - -/* - * We can only have one thread cannibalizing other cached btree nodes at a time, - * or we'll deadlock. We use an open coded mutex to ensure that, which a - * cannibalize_bucket() will take. This means every time we unlock the root of - * the btree, we need to release this lock if we have it held. 
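- *
- * A typical caller that must not fail retries with a closure until the lock
- * is acquired - an illustrative sketch of the pattern, error handling
- * elided:
- *
- *	struct closure cl;
- *
- *	closure_init_stack(&cl);
- *	while (bch2_btree_cache_cannibalize_lock(trans, &cl))
- *		closure_sync(&cl);
- *
- *	b = bch2_btree_node_mem_alloc(trans, false);
- *	bch2_btree_cache_cannibalize_unlock(trans);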
- */ -void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - - if (bc->alloc_lock == current) { - trace_and_count(c, btree_cache_cannibalize_unlock, trans); - bc->alloc_lock = NULL; - closure_wake_up(&bc->alloc_wait); - } -} - -int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct task_struct *old; - - old = NULL; - if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) - goto success; - - if (!cl) { - trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock); - } - - closure_wait(&bc->alloc_wait, cl); - - /* Try again, after adding ourselves to waitlist */ - old = NULL; - if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) { - /* We raced */ - closure_wake_up(&bc->alloc_wait); - goto success; - } - - trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return bch_err_throw(c, btree_cache_cannibalize_lock_blocked); - -success: - trace_and_count(c, btree_cache_cannibalize_lock, trans); - return 0; -} - -static struct btree *btree_node_cannibalize(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) - list_for_each_entry_reverse(b, &bc->live[i].list, list) - if (!btree_node_reclaim(c, b)) - return b; - - while (1) { - for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) - list_for_each_entry_reverse(b, &bc->live[i].list, list) - if (!btree_node_write_and_reclaim(c, b)) - return b; - - /* - * Rare case: all nodes were intent-locked. - * Just busy-wait. - */ - WARN_ONCE(1, "btree cache cannibalize failed\n"); - cond_resched(); - } -} - -struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct list_head *freed = pcpu_read_locks - ? &bc->freed_pcpu - : &bc->freed_nonpcpu; - struct btree *b, *b2; - u64 start_time = local_clock(); - - mutex_lock(&bc->lock); - - /* - * We never free struct btree itself, just the memory that holds the on - * disk node. Check the freed list before allocating a new one: - */ - list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b)) { - list_del_init(&b->list); - goto got_node; - } - - b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); - if (b) { - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); - } else { - mutex_unlock(&bc->lock); - bch2_trans_unlock(trans); - b = __btree_node_mem_alloc(c, GFP_KERNEL); - if (!b) - goto err; - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); - mutex_lock(&bc->lock); - } - - BUG_ON(!six_trylock_intent(&b->c.lock)); - BUG_ON(!six_trylock_write(&b->c.lock)); - -got_node: - /* - * btree_free() doesn't free memory; it sticks the node on the end of - * the list. 
Check if there's any freed nodes there: - */ - list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2)) { - swap(b->data, b2->data); - swap(b->aux_data, b2->aux_data); - - list_del_init(&b2->list); - --bc->nr_freeable; - btree_node_to_freedlist(bc, b2); - mutex_unlock(&bc->lock); - - six_unlock_write(&b2->c.lock); - six_unlock_intent(&b2->c.lock); - goto got_mem; - } - - mutex_unlock(&bc->lock); - - if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { - bch2_trans_unlock(trans); - if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) - goto err; - } - -got_mem: - BUG_ON(!list_empty(&b->list)); - BUG_ON(btree_node_hashed(b)); - BUG_ON(btree_node_dirty(b)); - BUG_ON(btree_node_write_in_flight(b)); -out: - b->flags = 0; - b->written = 0; - b->nsets = 0; - b->sib_u64s[0] = 0; - b->sib_u64s[1] = 0; - b->whiteout_u64s = 0; - bch2_btree_keys_init(b); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], - start_time); - - int ret = bch2_trans_relock(trans); - if (unlikely(ret)) { - bch2_btree_node_to_freelist(c, b); - return ERR_PTR(ret); - } - - return b; -err: - mutex_lock(&bc->lock); - - /* Try to cannibalize another cached btree node: */ - if (bc->alloc_lock == current) { - b2 = btree_node_cannibalize(c); - clear_btree_node_just_written(b2); - __bch2_btree_node_hash_remove(bc, b2); - - if (b) { - swap(b->data, b2->data); - swap(b->aux_data, b2->aux_data); - btree_node_to_freedlist(bc, b2); - six_unlock_write(&b2->c.lock); - six_unlock_intent(&b2->c.lock); - } else { - b = b2; - } - - BUG_ON(!list_empty(&b->list)); - mutex_unlock(&bc->lock); - - trace_and_count(c, btree_cache_cannibalize, trans); - goto out; - } - - mutex_unlock(&bc->lock); - return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); -} - -/* Slowpath, don't want it inlined into btree_iter_traverse() */ -static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, - struct btree_path *path, - const struct bkey_i *k, - enum btree_id btree_id, - unsigned level, - enum six_lock_type lock_type, - bool sync) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - if (unlikely(level >= BTREE_MAX_DEPTH)) { - int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", - level, BTREE_MAX_DEPTH); - return ERR_PTR(ret); - } - - if (unlikely(!bkey_is_btree_ptr(&k->k))) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - - int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); - printbuf_exit(&buf); - return ERR_PTR(ret); - } - - if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - - int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); - printbuf_exit(&buf); - return ERR_PTR(ret); - } - - /* - * Parent node must be locked, else we could read in a btree node that's - * been freed: - */ - if (path && !bch2_btree_node_relock(trans, path, level + 1)) { - trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); - } - - b = bch2_btree_node_mem_alloc(trans, level != 0); - - if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { - if (!path) - return b; - - trans->memory_allocation_failure = true; - trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); - 
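-		/*
-		 * Restarting the transaction drops our locks; the retry path
-		 * sees ->memory_allocation_failure and can take the
-		 * cannibalize lock before trying again:
-		 */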
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); - } - - if (IS_ERR(b)) - return b; - - bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { - /* raced with another fill: */ - - /* mark as unhashed... */ - b->hash_val = 0; - - mutex_lock(&bc->lock); - __bch2_btree_node_to_freelist(bc, b); - mutex_unlock(&bc->lock); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - return NULL; - } - - set_btree_node_read_in_flight(b); - six_unlock_write(&b->c.lock); - - if (path) { - u32 seq = six_lock_seq(&b->c.lock); - - /* Unlock before doing IO: */ - six_unlock_intent(&b->c.lock); - bch2_trans_unlock(trans); - - bch2_btree_node_read(trans, b, sync); - - int ret = bch2_trans_relock(trans); - if (ret) - return ERR_PTR(ret); - - if (!sync) - return NULL; - - if (!six_relock_type(&b->c.lock, lock_type, seq)) - b = NULL; - } else { - bch2_btree_node_read(trans, b, sync); - if (lock_type == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); - } - - return b; -} - -static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) -{ - struct printbuf buf = PRINTBUF; - - if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) - return; - - prt_printf(&buf, - "btree node header doesn't match ptr: "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, "\nptr: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - prt_str(&buf, "\nheader: "); - bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data)); - prt_str(&buf, "\nmin "); - bch2_bpos_to_text(&buf, b->data->min_key); - - prt_printf(&buf, "\nmax "); - bch2_bpos_to_text(&buf, b->data->max_key); - - bch2_fs_topology_error(c, "%s", buf.buf); - - printbuf_exit(&buf); -} - -static inline void btree_check_header(struct bch_fs *c, struct btree *b) -{ - if (b->c.btree_id != BTREE_NODE_ID(b->data) || - b->c.level != BTREE_NODE_LEVEL(b->data) || - !bpos_eq(b->data->max_key, b->key.k.p) || - (b->key.k.type == KEY_TYPE_btree_ptr_v2 && - !bpos_eq(b->data->min_key, - bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) - btree_bad_header(c, b); -} - -static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, - const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - bool need_relock = false; - int ret; - - EBUG_ON(level >= BTREE_MAX_DEPTH); -retry: - b = btree_cache_find(bc, k); - if (unlikely(!b)) { - /* - * We must have the parent locked to call bch2_btree_node_fill(), - * else we could read in a btree node from disk that's been - * freed: - */ - b = bch2_btree_node_fill(trans, path, k, path->btree_id, - level, lock_type, true); - need_relock = true; - - /* We raced and found the btree node in the cache */ - if (!b) - goto retry; - - if (IS_ERR(b)) - return b; - } else { - if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(trans, path, level + 1); - - ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - - BUG_ON(ret); - - if (unlikely(b->hash_val != btree_ptr_hash_val(k) || - b->c.level != level || - race_fault())) { - six_unlock_type(&b->c.lock, lock_type); - if (bch2_btree_node_relock(trans, path, level + 1)) - goto retry; - - trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); - return 
ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); - } - - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - } - - if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = six_lock_seq(&b->c.lock); - - six_unlock_type(&b->c.lock, lock_type); - bch2_trans_unlock(trans); - need_relock = true; - - bch2_btree_node_wait_on_read(b); - - ret = bch2_trans_relock(trans); - if (ret) - return ERR_PTR(ret); - - /* - * should_be_locked is not set on this path yet, so we need to - * relock it specifically: - */ - if (!six_relock_type(&b->c.lock, lock_type, seq)) - goto retry; - } - - if (unlikely(need_relock)) { - ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(ret); - } - } - - prefetch(b->aux_data); - - for_each_bset(b, t) { - void *p = (u64 *) b->aux_data + t->aux_data_offset; - - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - } - - if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - } - - EBUG_ON(b->c.btree_id != path->btree_id); - EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - btree_check_header(c, b); - - return b; -} - -/** - * bch2_btree_node_get - find a btree node in the cache and lock it, reading it - * in from disk if necessary. - * - * @trans: btree transaction object - * @path: btree_path being traversed - * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2) - * @level: level of btree node being looked up (0 == leaf node) - * @lock_type: SIX_LOCK_read or SIX_LOCK_intent - * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek()) - * - * The btree node will have either a read or a write lock held, depending on - * the @write parameter. 
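- * (in the current signature this is the @lock_type parameter).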
- * - * Returns: btree node or ERR_PTR() - */ -struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, - const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree *b; - int ret; - - EBUG_ON(level >= BTREE_MAX_DEPTH); - - b = btree_node_mem_ptr(k); - - /* - * Check b->hash_val _before_ calling btree_node_lock() - this might not - * be the node we want anymore, and trying to lock the wrong node could - * cause an unneccessary transaction restart: - */ - if (unlikely(!c->opts.btree_node_mem_ptr_optimization || - !b || - b->hash_val != btree_ptr_hash_val(k))) - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); - - if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(trans, path, level + 1); - - ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - - BUG_ON(ret); - - if (unlikely(b->hash_val != btree_ptr_hash_val(k) || - b->c.level != level || - race_fault())) { - six_unlock_type(&b->c.lock, lock_type); - if (bch2_btree_node_relock(trans, path, level + 1)) - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); - - trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); - } - - if (unlikely(btree_node_read_in_flight(b))) { - six_unlock_type(&b->c.lock, lock_type); - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); - } - - prefetch(b->aux_data); - - for_each_bset(b, t) { - void *p = (u64 *) b->aux_data + t->aux_data_offset; - - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - } - - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - - if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - } - - EBUG_ON(b->c.btree_id != path->btree_id); - EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - btree_check_header(c, b); - - return b; -} - -struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, - const struct bkey_i *k, - enum btree_id btree_id, - unsigned level, - bool nofill) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - int ret; - - EBUG_ON(level >= BTREE_MAX_DEPTH); - - if (c->opts.btree_node_mem_ptr_optimization) { - b = btree_node_mem_ptr(k); - if (b) - goto lock_node; - } -retry: - b = btree_cache_find(bc, k); - if (unlikely(!b)) { - if (nofill) - goto out; - - b = bch2_btree_node_fill(trans, NULL, k, btree_id, - level, SIX_LOCK_read, true); - - /* We raced and found the btree node in the cache */ - if (!b) - goto retry; - - if (IS_ERR(b) && - !bch2_btree_cache_cannibalize_lock(trans, NULL)) - goto retry; - - if (IS_ERR(b)) - goto out; - } else { -lock_node: - ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - - BUG_ON(ret); - - if (unlikely(b->hash_val != btree_ptr_hash_val(k) || - b->c.btree_id != btree_id || - b->c.level != level)) { - six_unlock_read(&b->c.lock); - goto retry; - } - - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - } - - /* XXX: waiting on IO with 
btree locks held: */ - __bch2_btree_node_wait_on_read(b); - - prefetch(b->aux_data); - - for_each_bset(b, t) { - void *p = (u64 *) b->aux_data + t->aux_data_offset; - - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - } - - if (unlikely(btree_node_read_error(b))) { - six_unlock_read(&b->c.lock); - b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - goto out; - } - - EBUG_ON(b->c.btree_id != btree_id); - EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - btree_check_header(c, b); -out: - bch2_btree_cache_cannibalize_unlock(trans); - return b; -} - -int bch2_btree_node_prefetch(struct btree_trans *trans, - struct btree_path *path, - const struct bkey_i *k, - enum btree_id btree_id, unsigned level) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - - BUG_ON(path && !btree_node_locked(path, level + 1)); - BUG_ON(level >= BTREE_MAX_DEPTH); - - struct btree *b = btree_cache_find(bc, k); - if (b) - return 0; - - b = bch2_btree_node_fill(trans, path, k, btree_id, - level, SIX_LOCK_read, false); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - return ret; - if (b) - six_unlock_read(&b->c.lock); - return 0; -} - -void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - b = btree_cache_find(bc, k); - if (!b) - return; - - BUG_ON(b == btree_node_root(trans->c, b)); -wait_on_io: - /* not allowed to wait on io with btree locks held: */ - - /* XXX we're called from btree_gc which will be holding other btree - * nodes locked - */ - __bch2_btree_node_wait_on_read(b); - __bch2_btree_node_wait_on_write(b); - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - if (unlikely(b->hash_val != btree_ptr_hash_val(k))) - goto out; - - if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - goto wait_on_io; - } - - BUG_ON(btree_node_dirty(b)); - - mutex_lock(&bc->lock); - bch2_btree_node_hash_remove(bc, b); - btree_node_data_free(bc, b); - mutex_unlock(&bc->lock); -out: - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -} - -const char *bch2_btree_id_str(enum btree_id btree) -{ - return btree < BTREE_ID_NR ? 
__bch2_btree_ids[btree] : "(unknown)"; -} - -void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) -{ - if (btree < BTREE_ID_NR) - prt_str(out, __bch2_btree_ids[btree]); - else - prt_printf(out, "(unknown btree %u)", btree); -} - -void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level) -{ - prt_str(out, "btree="); - bch2_btree_id_to_text(out, btree); - prt_printf(out, " level=%u", level); -} - -void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, - enum btree_id btree, unsigned level, struct bkey_s_c k) -{ - bch2_btree_id_to_text(out, btree); - prt_printf(out, " level %u/", level); - struct btree_root *r = bch2_btree_id_root(c, btree); - if (r) - prt_printf(out, "%u", r->level); - else - prt_printf(out, "(unknown)"); - prt_newline(out); - - bch2_bkey_val_to_text(out, c, k); -} - -void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) -{ - __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key)); -} - -void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) -{ - struct bset_stats stats; - - memset(&stats, 0, sizeof(stats)); - - bch2_btree_keys_stats(b, &stats); - - prt_printf(out, "l %u ", b->c.level); - bch2_bpos_to_text(out, b->data->min_key); - prt_printf(out, " - "); - bch2_bpos_to_text(out, b->data->max_key); - prt_printf(out, ":\n" - " ptrs: "); - bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); - prt_newline(out); - - prt_printf(out, - " format: "); - bch2_bkey_format_to_text(out, &b->format); - - prt_printf(out, - " unpack fn len: %u\n" - " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %u)\n" - " nr packed keys %u\n" - " nr unpacked keys %u\n" - " floats %zu\n" - " failed unpacked %zu\n", - b->unpack_fn_len, - b->nr.live_u64s * sizeof(u64), - btree_buf_bytes(b) - sizeof(struct btree_node), - b->nr.live_u64s * 100 / btree_max_u64s(c), - b->sib_u64s[0], - b->sib_u64s[1], - c->btree_foreground_merge_threshold, - b->nr.packed_keys, - b->nr.unpacked_keys, - stats.floats, - stats.failed); -} - -static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, - const char *label, size_t nr) -{ - prt_printf(out, "%s\t", label); - prt_human_readable_u64(out, nr * c->opts.btree_node_size); - prt_printf(out, " (%zu)\n", nr); -} - -static const char * const bch2_btree_cache_not_freed_reasons_strs[] = { -#define x(n) #n, - BCH_BTREE_CACHE_NOT_FREED_REASONS() -#undef x - NULL -}; - -void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); - - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - prt_btree_cache_line(out, c, "live:", bc->live[0].nr); - prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); - prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); - prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); - prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? 
"held" : "not held"); - prt_newline(out); - - for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { - bch2_btree_id_to_text(out, i); - prt_printf(out, "\t"); - prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); - prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); - } - - prt_newline(out); - prt_printf(out, "counters since mount:\n"); - prt_printf(out, "freed:\t%zu\n", bc->nr_freed); - prt_printf(out, "not freed:\n"); - - for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) - prt_printf(out, " %s\t%llu\n", - bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); -} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h deleted file mode 100644 index be275f87a60e04..00000000000000 --- a/fs/bcachefs/btree_cache.h +++ /dev/null @@ -1,157 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_CACHE_H -#define _BCACHEFS_BTREE_CACHE_H - -#include "bcachefs.h" -#include "btree_types.h" -#include "bkey_methods.h" - -extern const char * const bch2_btree_node_flags[]; - -struct btree_iter; - -void bch2_recalc_btree_reserve(struct bch_fs *); - -void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); - -void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); -void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); - -int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); -int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, - unsigned, enum btree_id); - -void bch2_node_pin(struct bch_fs *, struct btree *); -void bch2_btree_cache_unpin(struct bch_fs *); - -void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *); - -void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); -int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); - -void __btree_node_data_free(struct btree *); -struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); - -struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, - const struct bkey_i *, unsigned, - enum six_lock_type, unsigned long); - -struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, - enum btree_id, unsigned, bool); - -int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *, - const struct bkey_i *, enum btree_id, unsigned); - -void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); - -void bch2_fs_btree_cache_exit(struct bch_fs *); -int bch2_fs_btree_cache_init(struct bch_fs *); -void bch2_fs_btree_cache_init_early(struct btree_cache *); - -static inline u64 btree_ptr_hash_val(const struct bkey_i *k) -{ - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); - case KEY_TYPE_btree_ptr_v2: - /* - * The cast/deref is only necessary to avoid sparse endianness - * warnings: - */ - return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq); - default: - return 0; - } -} - -static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) -{ - return k->k.type == KEY_TYPE_btree_ptr_v2 - ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr - : NULL; -} - -/* is btree node in hash table? 
*/ -static inline bool btree_node_hashed(struct btree *b) -{ - return b->hash_val != 0; -} - -#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ - for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ - &(_c)->btree_cache.table), \ - _iter = 0; _iter < (_tbl)->size; _iter++) \ - rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) - -static inline size_t btree_buf_bytes(const struct btree *b) -{ - return 1UL << b->byte_order; -} - -static inline size_t btree_buf_max_u64s(const struct btree *b) -{ - return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64); -} - -static inline size_t btree_max_u64s(const struct bch_fs *c) -{ - return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64); -} - -static inline size_t btree_sectors(const struct bch_fs *c) -{ - return c->opts.btree_node_size >> SECTOR_SHIFT; -} - -static inline unsigned btree_blocks(const struct bch_fs *c) -{ - return btree_sectors(c) >> c->block_bits; -} - -#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) - -#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) -#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) - -static inline unsigned btree_id_nr_alive(struct bch_fs *c) -{ - return BTREE_ID_NR + c->btree_roots_extra.nr; -} - -static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id) -{ - if (likely(id < BTREE_ID_NR)) { - return &c->btree_roots_known[id]; - } else { - unsigned idx = id - BTREE_ID_NR; - - /* This can happen when we're called from btree_node_scan */ - if (idx >= c->btree_roots_extra.nr) - return NULL; - - return &c->btree_roots_extra.data[idx]; - } -} - -static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) -{ - struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); - - return r ? r->b : NULL; -} - -const char *bch2_btree_id_str(enum btree_id); /* avoid */ -void bch2_btree_id_to_text(struct printbuf *, enum btree_id); -void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); - -void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, - enum btree_id, unsigned, struct bkey_s_c); -void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); - -#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c deleted file mode 100644 index bac108e93823c2..00000000000000 --- a/fs/bcachefs/btree_gc.c +++ /dev/null @@ -1,1308 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2010 Kent Overstreet - * Copyright (C) 2014 Datera Inc. 
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "progress.h"
-#include "recovery_passes.h"
-#include "reflink.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/slab.h>
-#include <linux/bitops.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-
-#define DROP_THIS_NODE		10
-#define DROP_PREV_NODE		11
-#define DID_FILL_FROM_SCAN	12
-
-/*
- * Returns true if it's a btree we can easily reconstruct, or otherwise won't
- * cause data loss if it's missing:
- */
-static bool btree_id_important(enum btree_id btree)
-{
-	if (btree_id_is_alloc(btree))
-		return false;
-
-	switch (btree) {
-	case BTREE_ID_quotas:
-	case BTREE_ID_snapshot_trees:
-	case BTREE_ID_logged_ops:
-	case BTREE_ID_rebalance_work:
-	case BTREE_ID_subvolume_children:
-		return false;
-	default:
-		return true;
-	}
-}
-
-static const char * const bch2_gc_phase_strs[] = {
-#define x(n)	#n,
-	GC_PHASES()
-#undef x
-	NULL
-};
-
-void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
-{
-	prt_str(out, bch2_gc_phase_strs[p->phase]);
-	prt_char(out, ' ');
-	bch2_btree_id_level_to_text(out, p->btree, p->level);
-	prt_char(out, ' ');
-	bch2_bpos_to_text(out, p->pos);
-}
-
-static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
-{
-	return (struct bkey_s) {{{
-		(struct bkey *) k.k,
-		(struct bch_val *) k.v
-	}}};
-}
-
-static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
-	preempt_disable();
-	write_seqcount_begin(&c->gc_pos_lock);
-	c->gc_pos = new_pos;
-	write_seqcount_end(&c->gc_pos_lock);
-	preempt_enable();
-}
-
-static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
-	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
-	__gc_pos_set(c, new_pos);
-}
-
-static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
-{
-	switch (b->key.k.type) {
-	case KEY_TYPE_btree_ptr: {
-		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
-
-		dst->k.p		= src->k.p;
-		dst->v.mem_ptr		= 0;
-		dst->v.seq		= b->data->keys.seq;
-		dst->v.sectors_written	= 0;
-		dst->v.flags		= 0;
-		dst->v.min_key		= b->data->min_key;
-		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
-		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
-		break;
-	}
-	case KEY_TYPE_btree_ptr_v2:
-		bkey_copy(&dst->k_i, &b->key);
-		break;
-	default:
-		BUG();
-	}
-}
-
-static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
-{
-	struct bkey_i_btree_ptr_v2 *new;
-	int ret;
-
-	if (c->opts.verbose) {
-		struct printbuf buf = PRINTBUF;
-
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-		prt_str(&buf, " -> ");
-		bch2_bpos_to_text(&buf, new_min);
-
-		bch_info(c, "%s(): %s", __func__, buf.buf);
-		printbuf_exit(&buf);
-	}
-
-	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
-	if (!new)
-		return bch_err_throw(c, ENOMEM_gc_repair_key);
-
-	btree_ptr_to_v2(b, new);
-	b->data->min_key	= new_min;
-	new->v.min_key		= new_min;
-
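/*
 * [Editorial note -- not part of the original file.] Setting
 * BTREE_PTR_RANGE_UPDATED below tells the read path that the min/max keys
 * carried in the pointer override the ones in the node header:
 * validate_bset() (deleted further down in this patch) copies bp->min_key
 * and the pointer's max key into b->data when it sees the flag, so a node
 * whose boundaries were repaired here reads back consistently.
 */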
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - - ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); - if (ret) { - kfree(new); - return ret; - } - - bch2_btree_node_drop_keys_outside_node(b); - bkey_copy(&b->key, &new->k_i); - return 0; -} - -static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) -{ - struct bkey_i_btree_ptr_v2 *new; - int ret; - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, " -> "); - bch2_bpos_to_text(&buf, new_max); - - bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); - if (ret) - return ret; - - new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); - if (!new) - return bch_err_throw(c, ENOMEM_gc_repair_key); - - btree_ptr_to_v2(b, new); - b->data->max_key = new_max; - new->k.p = new_max; - SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - - ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); - if (ret) { - kfree(new); - return ret; - } - - bch2_btree_node_drop_keys_outside_node(b); - - mutex_lock(&c->btree_cache.lock); - __bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, &new->k_i); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); - return 0; -} - -static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b, - struct btree *prev, struct btree *cur, - struct bpos *pulled_from_scan) -{ - struct bch_fs *c = trans->c; - struct bpos expected_start = !prev - ? b->data->min_key - : bpos_successor(prev->key.k.p); - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, - b->data->min_key)); - - if (bpos_eq(expected_start, cur->data->min_key)) - return 0; - - prt_printf(&buf, " at "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_printf(&buf, ":\nparent: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - if (prev) { - prt_printf(&buf, "\nprev: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); - } - - prt_str(&buf, "\nnext: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); - - if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ - if (b->c.level == 1 && - bpos_lt(*pulled_from_scan, cur->data->min_key)) { - ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, - expected_start, - bpos_predecessor(cur->data->min_key)); - if (ret) - goto err; - - *pulled_from_scan = cur->data->min_key; - ret = DID_FILL_FROM_SCAN; - } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, - "btree node with incorrect min_key%s", buf.buf)) - ret = set_node_min(c, cur, expected_start); - } - } else { /* overlap */ - if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ - if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ - if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node, - "btree node overwritten by next node%s", buf.buf)) - ret = DROP_PREV_NODE; - } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, - "btree node with incorrect max_key%s", buf.buf)) - ret = set_node_max(c, prev, - bpos_predecessor(cur->data->min_key)); - } - } else { - if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? 
*/ - if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node, - "btree node overwritten by prev node%s", buf.buf)) - ret = DROP_THIS_NODE; - } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, - "btree node with incorrect min_key%s", buf.buf)) - ret = set_node_min(c, cur, expected_start); - } - } - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, - struct btree *child, struct bpos *pulled_from_scan) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (bpos_eq(child->key.k.p, b->key.k.p)) - return 0; - - prt_printf(&buf, "\nat: "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_printf(&buf, "\nparent: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - prt_str(&buf, "\nchild: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); - - if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, - "btree node with incorrect max_key%s", buf.buf)) { - if (b->c.level == 1 && - bpos_lt(*pulled_from_scan, b->key.k.p)) { - ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, - bpos_successor(child->key.k.p), b->key.k.p); - if (ret) - goto err; - - *pulled_from_scan = b->key.k.p; - ret = DID_FILL_FROM_SCAN; - } else { - ret = set_node_max(c, child, b->key.k.p); - } - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b, - struct bpos *pulled_from_scan) -{ - struct bch_fs *c = trans->c; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf prev_k, cur_k; - struct btree *prev = NULL, *cur = NULL; - bool have_child, new_pass = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (!b->c.level) - return 0; - - bch2_bkey_buf_init(&prev_k); - bch2_bkey_buf_init(&cur_k); -again: - cur = prev = NULL; - have_child = new_pass = false; - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - iter.prefetch = true; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_lt(k.k->p, b->data->min_key)); - BUG_ON(bpos_gt(k.k->p, b->data->max_key)); - - bch2_btree_and_journal_iter_advance(&iter); - bch2_bkey_buf_reassemble(&cur_k, c, k); - - cur = bch2_btree_node_get_noiter(trans, cur_k.k, - b->c.btree_id, b->c.level - 1, - false); - ret = PTR_ERR_OR_ZERO(cur); - - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - - if (bch2_err_matches(ret, EIO)) { - bch2_btree_node_evict(trans, cur_k.k); - cur = NULL; - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - if (ret) - break; - continue; - } - - bch_err_msg(c, ret, "getting btree node"); - if (ret) - break; - - if (bch2_btree_node_is_stale(c, cur)) { - bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf); - six_unlock_read(&cur->c.lock); - bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - cur = NULL; - if (ret) - break; - continue; - } - - ret = lockrestart_do(trans, - btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan)); - if (ret < 0) - goto err; - - if (ret == DID_FILL_FROM_SCAN) { - new_pass = true; - ret = 0; - } - - if (ret == DROP_THIS_NODE) { - six_unlock_read(&cur->c.lock); - bch2_btree_node_evict(trans, cur_k.k); - ret = 
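/*
 * [Editorial note -- not part of the original file.] DROP_THIS_NODE: the
 * unreadable child has already been evicted from the btree cache above; the
 * call below also deletes its pointer from the journal-keys overlay, so the
 * dropped node is never handed out again by this iterator or by later
 * recovery passes.
 */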
bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - cur = NULL; - if (ret) - break; - continue; - } - - if (prev) - six_unlock_read(&prev->c.lock); - prev = NULL; - - if (ret == DROP_PREV_NODE) { - bch_info(c, "dropped prev node"); - bch2_btree_node_evict(trans, prev_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, prev_k.k->k.p); - if (ret) - break; - - bch2_btree_and_journal_iter_exit(&iter); - goto again; - } else if (ret) - break; - - prev = cur; - cur = NULL; - bch2_bkey_buf_copy(&prev_k, c, cur_k.k); - } - - if (!ret && !IS_ERR_OR_NULL(prev)) { - BUG_ON(cur); - ret = lockrestart_do(trans, - btree_repair_node_end(trans, b, prev, pulled_from_scan)); - if (ret == DID_FILL_FROM_SCAN) { - new_pass = true; - ret = 0; - } - } - - if (!IS_ERR_OR_NULL(prev)) - six_unlock_read(&prev->c.lock); - prev = NULL; - if (!IS_ERR_OR_NULL(cur)) - six_unlock_read(&cur->c.lock); - cur = NULL; - - if (ret) - goto err; - - bch2_btree_and_journal_iter_exit(&iter); - - if (new_pass) - goto again; - - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - iter.prefetch = true; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_buf_reassemble(&cur_k, c, k); - bch2_btree_and_journal_iter_advance(&iter); - - cur = bch2_btree_node_get_noiter(trans, cur_k.k, - b->c.btree_id, b->c.level - 1, - false); - ret = PTR_ERR_OR_ZERO(cur); - - bch_err_msg(c, ret, "getting btree node"); - if (ret) - goto err; - - ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan); - six_unlock_read(&cur->c.lock); - cur = NULL; - - if (ret == DROP_THIS_NODE) { - bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - new_pass = true; - } - - if (ret) - goto err; - - have_child = true; - } - - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - /* - * XXX: we're not passing the trans object here because we're not set up - * to handle a transaction restart - this code needs to be rewritten - * when we start doing online topology repair - */ - bch2_trans_unlock_long(trans); - if (mustfix_fsck_err_on(!have_child, - c, btree_node_topology_interior_node_empty, - "empty interior btree node at %s", buf.buf)) - ret = DROP_THIS_NODE; -err: -fsck_err: - if (!IS_ERR_OR_NULL(prev)) - six_unlock_read(&prev->c.lock); - if (!IS_ERR_OR_NULL(cur)) - six_unlock_read(&cur->c.lock); - - bch2_btree_and_journal_iter_exit(&iter); - - if (!ret && new_pass) - goto again; - - BUG_ON(!ret && bch2_btree_node_check_topology(trans, b)); - - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_check_root(struct btree_trans *trans, enum btree_id btree, - bool *reconstructed_root) -{ - struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, btree); - struct printbuf buf = PRINTBUF; - int ret = 0; - - bch2_btree_id_to_text(&buf, btree); - - if (r->error) { - bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - - ret = bch2_btree_has_scanned_nodes(c, btree); - if (ret < 0) - goto err; - - if (!ret) { - __fsck_err(trans, - FSCK_CAN_FIX|(!btree_id_important(btree) ? 
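/*
 * [Editorial note -- not part of the original file.] For btrees that
 * btree_id_important() classifies as reconstructible (or safe to lose), the
 * FSCK_AUTOFIX flag lets recovery continue without prompting; an unreadable
 * root of an important btree still requires explicit confirmation.
 */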
FSCK_AUTOFIX : 0), - btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", buf.buf); - - r->alive = false; - r->error = 0; - bch2_btree_root_alloc_fake_trans(trans, btree, 0); - } else { - r->alive = false; - r->error = 0; - bch2_btree_root_alloc_fake_trans(trans, btree, 1); - - bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX); - if (ret) - goto err; - } - - *reconstructed_root = true; - } -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -int bch2_check_topology(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bpos pulled_from_scan = POS_MIN; - int ret = 0; - - bch2_trans_srcu_unlock(trans); - - for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - bool reconstructed_root = false; -recover: - ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root)); - if (ret) - break; - - struct btree_root *r = bch2_btree_id_root(c, i); - struct btree *b = r->b; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); - six_unlock_read(&b->c.lock); - - if (ret == DROP_THIS_NODE) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - r->b = NULL; - - if (!reconstructed_root) { - r->error = -EIO; - goto recover; - } - - struct printbuf buf = PRINTBUF; - bch2_btree_id_to_text(&buf, i); - bch_err(c, "empty btree root %s", buf.buf); - printbuf_exit(&buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); - r->alive = false; - ret = 0; - } - } - - bch2_trans_put(trans); - return ret; -} - -/* marking of btree keys/nodes: */ - -static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, struct btree **prev, - struct btree_iter *iter, struct bkey_s_c k, - bool initial) -{ - struct bch_fs *c = trans->c; - - if (iter) { - struct btree_path *path = btree_iter_path(trans, iter); - struct btree *b = path_l(path)->b; - - if (*prev != b) { - int ret = bch2_btree_node_check_topology(trans, b); - if (ret) - return ret; - } - *prev = b; - } - - struct bkey deleted = KEY(0, 0, 0); - struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - struct printbuf buf = PRINTBUF; - int ret = 0; - - deleted.p = k.k->p; - - if (initial) { - BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) && - k.k->bversion.lo > atomic64_read(&c->journal.seq)); - - if (fsck_err_on(btree_id != BTREE_ID_accounting && - k.k->bversion.lo > atomic64_read(&c->key_version), - trans, bkey_version_in_future, - "key version number higher than recorded %llu\n%s", - atomic64_read(&c->key_version), - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - atomic64_set(&c->key_version, k.k->bversion.lo); - } - - if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), - trans, btree_bitmap_not_marked, - "btree ptr not marked in member info btree allocated bitmap\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - mutex_lock(&c->sb_lock); - bch2_dev_btree_bitmap_mark(c, k); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - - /* - * We require a commit before key_trigger() because - * key_trigger(BTREE_TRIGGER_GC) is not idempotant; we'll calculate the - * wrong result if we run it multiple times. - */ - unsigned flags = !iter ? 
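/*
 * [Editorial note -- not part of the original file.] A NULL iterator means
 * we were called on a btree root (see the root walk in bch2_gc_btree()
 * below), hence BTREE_TRIGGER_is_root. The commit a few lines down enforces
 * the rule from the comment above: pending updates are flushed, and the
 * whole operation restarted, before the non-idempotent BTREE_TRIGGER_gc
 * trigger is allowed to run.
 */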
BTREE_TRIGGER_is_root : 0; - - ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), - BTREE_TRIGGER_check_repair|flags); - if (ret) - goto out; - - if (trans->nr_updates) { - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } - - ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), - BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags); -out: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_gc_btree(struct btree_trans *trans, - struct progress_indicator_state *progress, - enum btree_id btree, bool initial) -{ - struct bch_fs *c = trans->c; - unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1; - int ret = 0; - - /* We need to make sure every leaf node is readable before going RW */ - if (initial) - target_depth = 0; - - for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) { - struct btree *prev = NULL; - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level, - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - bch2_progress_update_iter(trans, progress, &iter, "check_allocations"); - gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); - bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); - })); - if (ret) - goto err; - } - - /* root */ - do { -retry_root: - bch2_trans_begin(trans); - - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, - 0, bch2_btree_id_root(c, btree)->b->c.level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err_root; - - if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); - goto retry_root; - } - - gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial); -err_root: - bch2_trans_iter_exit(trans, &iter); - } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); -err: - bch_err_fn(c, ret); - return ret; -} - -static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) -{ - return cmp_int(gc_btree_order(l), gc_btree_order(r)); -} - -static int bch2_gc_btrees(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct progress_indicator_state progress; - bch2_progress_init(&progress, c, ~0ULL); - - enum btree_id ids[BTREE_ID_NR]; - for (unsigned i = 0; i < BTREE_ID_NR; i++) - ids[i] = i; - bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - - for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - unsigned btree = i < BTREE_ID_NR ? 
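/*
 * [Editorial note -- not part of the original file.] The first BTREE_ID_NR
 * iterations walk the known btrees in gc_btree_order() (alloc first, then
 * stripes, then everything else -- see btree_gc.h below); indices past
 * BTREE_ID_NR pick up the extra roots discovered by btree node scan.
 */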
ids[i] : i; - - if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) - continue; - - ret = bch2_gc_btree(trans, &progress, btree, true); - } - - printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_mark_superblocks(struct bch_fs *c) -{ - gc_pos_set(c, gc_phase(GC_PHASE_sb)); - - return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); -} - -static void bch2_gc_free(struct bch_fs *c) -{ - bch2_accounting_gc_free(c); - - genradix_free(&c->reflink_gc_table); - genradix_free(&c->gc_stripes); - - for_each_member_device(c, ca) - genradix_free(&ca->buckets_gc); -} - -static int bch2_gc_start(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - int ret = bch2_dev_usage_init(ca, true); - if (ret) { - bch2_dev_put(ca); - return ret; - } - } - - return 0; -} - -/* returns true if not equal */ -static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, - struct bch_alloc_v4 r) -{ - return l.gen != r.gen || - l.oldest_gen != r.oldest_gen || - l.data_type != r.data_type || - l.dirty_sectors != r.dirty_sectors || - l.stripe_sectors != r.stripe_sectors || - l.cached_sectors != r.cached_sectors || - l.stripe_redundancy != r.stripe_redundancy || - l.stripe != r.stripe; -} - -static int bch2_alloc_write_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_dev *ca, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_i_alloc_v4 *a; - struct bch_alloc_v4 old_gc, gc, old_convert, new; - const struct bch_alloc_v4 *old; - int ret; - - if (!bucket_valid(ca, k.k->p.offset)) - return 0; - - old = bch2_alloc_to_v4(k, &old_convert); - gc = new = *old; - - __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); - - old_gc = gc; - - if ((old->data_type == BCH_DATA_sb || - old->data_type == BCH_DATA_journal) && - !bch2_dev_is_online(ca)) { - gc.data_type = old->data_type; - gc.dirty_sectors = old->dirty_sectors; - } - - /* - * gc.data_type doesn't yet include need_discard & need_gc_gen states - - * fix that here: - */ - alloc_data_type_set(&gc, gc.data_type); - if (gc.data_type != old_gc.data_type || - gc.dirty_sectors != old_gc.dirty_sectors) { - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc); - if (ret) - return ret; - - /* - * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not - * safe w.r.t. 
transaction restarts, so fixup the gc_bucket so - * we don't run it twice: - */ - struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); - gc_m->data_type = gc.data_type; - gc_m->dirty_sectors = gc.dirty_sectors; - } - - if (fsck_err_on(new.data_type != gc.data_type, - trans, alloc_key_data_type_wrong, - "bucket %llu:%llu gen %u has wrong data_type" - ": got %s, should be %s", - iter->pos.inode, iter->pos.offset, - gc.gen, - bch2_data_type_str(new.data_type), - bch2_data_type_str(gc.data_type))) - new.data_type = gc.data_type; - -#define copy_bucket_field(_errtype, _f) \ - if (fsck_err_on(new._f != gc._f, \ - trans, _errtype, \ - "bucket %llu:%llu gen %u data type %s has wrong " #_f \ - ": got %llu, should be %llu", \ - iter->pos.inode, iter->pos.offset, \ - gc.gen, \ - bch2_data_type_str(gc.data_type), \ - (u64) new._f, (u64) gc._f)) \ - new._f = gc._f; \ - - copy_bucket_field(alloc_key_gen_wrong, gen); - copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors); - copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors); - copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors); - copy_bucket_field(alloc_key_stripe_wrong, stripe); - copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy); -#undef copy_bucket_field - - if (!bch2_alloc_v4_cmp(*old, new)) - return 0; - - a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; - - a->v = new; - - /* - * The trigger normally makes sure these are set, but we're not running - * triggers: - */ - if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) - a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - - ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun); -fsck_err: - return ret; -} - -static int bch2_gc_alloc_done(struct bch_fs *c) -{ - int ret = 0; - - for_each_member_device(c, ca) { - ret = bch2_trans_run(c, - for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - POS(ca->dev_idx, ca->mi.nbuckets - 1), - BTREE_ITER_slots|BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_alloc_write_key(trans, &iter, ca, k))); - if (ret) { - bch2_dev_put(ca); - break; - } - } - - bch_err_fn(c, ret); - return ret; -} - -static int bch2_gc_alloc_start(struct bch_fs *c) -{ - int ret = 0; - - for_each_member_device(c, ca) { - ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); - if (ret) { - bch2_dev_put(ca); - ret = bch_err_throw(c, ENOMEM_gc_alloc_start); - break; - } - } - - bch_err_fn(c, ret); - return ret; -} - -static int bch2_gc_write_stripes_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - const struct bch_stripe *s; - struct gc_stripe *m; - bool bad = false; - unsigned i; - int ret = 0; - - if (k.k->type != KEY_TYPE_stripe) - return 0; - - s = bkey_s_c_to_stripe(k).v; - m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - - for (i = 0; i < s->nr_blocks; i++) { - u32 old = stripe_blockcount_get(s, i); - u32 new = (m ? 
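/*
 * [Editorial note -- not part of the original file.] m is this stripe's
 * entry in the in-memory gc_stripes radix tree; if GC never saw a reference
 * to the stripe, there is no entry and the recomputed sector count for every
 * block is taken to be zero.
 */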
m->block_sectors[i] : 0); - - if (old != new) { - prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n", - i, old, new); - bad = true; - } - } - - if (bad) - bch2_bkey_val_to_text(&buf, c, k); - - if (fsck_err_on(bad, - trans, stripe_sector_count_wrong, - "%s", buf.buf)) { - struct bkey_i_stripe *new; - - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bkey_reassemble(&new->k_i, k); - - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); - - ret = bch2_trans_update(trans, iter, &new->k_i, 0); - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int bch2_gc_stripes_done(struct bch_fs *c) -{ - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_stripes, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_stripes_key(trans, &iter, k))); -} - -/** - * bch2_check_allocations - walk all references to buckets, and recompute them: - * - * @c: filesystem object - * - * Returns: 0 on success, or standard errcode on failure - * - * Order matters here: - * - Concurrent GC relies on the fact that we have a total ordering for - * everything that GC walks - see gc_will_visit_node(), - * gc_will_visit_root() - * - * - also, references move around in the course of index updates and - * various other crap: everything needs to agree on the ordering - * references are allowed to move around in - e.g., we're allowed to - * start with a reference owned by an open_bucket (the allocator) and - * move it to the btree, but not the reverse. - * - * This is necessary to ensure that gc doesn't miss references that - * move around - if references move backwards in the ordering GC - * uses, GC could skip past them - */ -int bch2_check_allocations(struct bch_fs *c) -{ - int ret; - - down_read(&c->state_lock); - down_write(&c->gc_lock); - - bch2_btree_interior_updates_flush(c); - - ret = bch2_gc_accounting_start(c) ?: - bch2_gc_start(c) ?: - bch2_gc_alloc_start(c) ?: - bch2_gc_reflink_start(c); - if (ret) - goto out; - - gc_pos_set(c, gc_phase(GC_PHASE_start)); - - ret = bch2_mark_superblocks(c); - bch_err_msg(c, ret, "marking superblocks"); - if (ret) - goto out; - - ret = bch2_gc_btrees(c); - if (ret) - goto out; - - c->gc_count++; - - ret = bch2_gc_alloc_done(c) ?: - bch2_gc_accounting_done(c) ?: - bch2_gc_stripes_done(c) ?: - bch2_gc_reflink_done(c); -out: - percpu_down_write(&c->mark_lock); - /* Indicates that gc is no longer in progress: */ - __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); - - bch2_gc_free(c); - percpu_up_write(&c->mark_lock); - - up_write(&c->gc_lock); - up_read(&c->state_lock); - - /* - * At startup, allocations can happen directly instead of via the - * allocator thread - issue wakeup in case they blocked on gc_lock: - */ - closure_wake_up(&c->freelist_wait); - - if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) - bch2_sb_members_clean_deleted(c); - - bch_err_fn(c, ret); - return ret; -} - -static int gc_btree_gens_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) - return -EROFS; - - bool too_stale = false; - scoped_guard(rcu) { - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; - - too_stale |= dev_ptr_stale(ca, ptr) > 16; - } - - if (!too_stale) - 
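/*
 * [Editorial note -- not part of the original file.] Second pass, reached
 * only when no pointer was excessively stale: for each bucket this key
 * points into, pull ca->oldest_gen down to the pointer's gen, so that after
 * the walk each bucket's oldest_gen records the oldest generation still
 * referenced.
 */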
bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; - - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(*gen, ptr->gen)) - *gen = ptr->gen; - } - } - - if (too_stale) { - struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - bch2_extent_normalize(c, bkey_i_to_s(u)); - } - - return 0; -} - -static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca, - struct btree_iter *iter, struct bkey_s_c k) -{ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - struct bkey_i_alloc_v4 *a_mut; - int ret; - - if (a->oldest_gen == ca->oldest_gen[iter->pos.offset]) - return 0; - - a_mut = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a_mut); - if (ret) - return ret; - - a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - - return bch2_trans_update(trans, iter, &a_mut->k_i, 0); -} - -int bch2_gc_gens(struct bch_fs *c) -{ - u64 b, start_time = local_clock(); - int ret; - - if (!mutex_trylock(&c->gc_gens_lock)) - return 0; - - trace_and_count(c, gc_gens_start, c); - - /* - * We have to use trylock here. Otherwise, we would - * introduce a deadlock in the RO path - we take the - * state lock at the start of going RO. - */ - if (!down_read_trylock(&c->state_lock)) { - mutex_unlock(&c->gc_gens_lock); - return 0; - } - - for_each_member_device(c, ca) { - struct bucket_gens *gens = bucket_gens(ca); - - BUG_ON(ca->oldest_gen); - - ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); - if (!ca->oldest_gen) { - bch2_dev_put(ca); - ret = bch_err_throw(c, ENOMEM_gc_gens); - goto err; - } - - for (b = gens->first_bucket; - b < gens->nbuckets; b++) - ca->oldest_gen[b] = gens->b[b]; - } - - for (unsigned i = 0; i < BTREE_ID_NR; i++) - if (btree_type_has_ptrs(i)) { - c->gc_gens_btree = i; - c->gc_gens_pos = POS_MIN; - - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, i, - POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - gc_btree_gens_key(trans, &iter, k))); - if (ret) - goto err; - } - - struct bch_dev *ca = NULL; - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, - BTREE_ITER_prefetch, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, ({ - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - bch2_alloc_write_oldest_gen(trans, ca, &iter, k); - }))); - bch2_dev_put(ca); - - if (ret) - goto err; - - c->gc_gens_btree = 0; - c->gc_gens_pos = POS_MIN; - - c->gc_count++; - - bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); - trace_and_count(c, gc_gens_end, c); -err: - for_each_member_device(c, ca) { - kvfree(ca->oldest_gen); - ca->oldest_gen = NULL; - } - - up_read(&c->state_lock); - mutex_unlock(&c->gc_gens_lock); - if (!bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); - return ret; -} - -static void bch2_gc_gens_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); - bch2_gc_gens(c); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); -} - -void bch2_gc_gens_async(struct bch_fs *c) -{ - if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) && - !queue_work(c->write_ref_wq, &c->gc_gens_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); -} - -void bch2_fs_btree_gc_init_early(struct bch_fs *c) -{ - 
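/*
 * [Editorial note -- not part of the original file.] Presumably "early"
 * because the gc position seqcount and these locks must exist before the
 * first btree operations run: gc_pos_set()/gc_visited() can be exercised as
 * soon as marking begins.
 */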
seqcount_init(&c->gc_pos_lock); - INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); - - init_rwsem(&c->gc_lock); - mutex_init(&c->gc_gens_lock); -} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h deleted file mode 100644 index ec77662369a2f5..00000000000000 --- a/fs/bcachefs/btree_gc.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_GC_H -#define _BCACHEFS_BTREE_GC_H - -#include "bkey.h" -#include "btree_gc_types.h" -#include "btree_types.h" - -int bch2_check_topology(struct bch_fs *); -int bch2_check_allocations(struct bch_fs *); - -/* - * For concurrent mark and sweep (with other index updates), we define a total - * ordering of _all_ references GC walks: - * - * Note that some references will have the same GC position as others - e.g. - * everything within the same btree node; in those cases we're relying on - * whatever locking exists for where those references live, i.e. the write lock - * on a btree node. - * - * That locking is also required to ensure GC doesn't pass the updater in - * between the updater adding/removing the reference and updating the GC marks; - * without that, we would at best double count sometimes. - * - * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ - * be held that prevents GC from passing the position the updater is at. - * - * (What about the start of gc, when we're clearing all the marks? GC clears the - * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc - * position inside its cmpxchg loop, so crap magically works). - */ - -/* Position of (the start of) a gc phase: */ -static inline struct gc_pos gc_phase(enum gc_phase phase) -{ - return (struct gc_pos) { .phase = phase, }; -} - -static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, - struct bpos pos) -{ - return (struct gc_pos) { - .phase = GC_PHASE_btree, - .btree = btree, - .level = level, - .pos = pos, - }; -} - -static inline int gc_btree_order(enum btree_id btree) -{ - if (btree == BTREE_ID_alloc) - return -2; - if (btree == BTREE_ID_stripes) - return -1; - return btree; -} - -static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -{ - return cmp_int(l.phase, r.phase) ?: - cmp_int(gc_btree_order(l.btree), - gc_btree_order(r.btree)) ?: - cmp_int(l.level, r.level) ?: - bpos_cmp(l.pos, r.pos); -} - -static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) -{ - unsigned seq; - bool ret; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - ret = gc_pos_cmp(pos, c->gc_pos) <= 0; - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - return ret; -} - -void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); - -int bch2_gc_gens(struct bch_fs *); -void bch2_gc_gens_async(struct bch_fs *); - -void bch2_fs_btree_gc_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h deleted file mode 100644 index c24dd6edf3773a..00000000000000 --- a/fs/bcachefs/btree_gc_types.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_GC_TYPES_H -#define _BCACHEFS_BTREE_GC_TYPES_H - -#include - -#define GC_PHASES() \ - x(not_running) \ - x(start) \ - x(sb) \ - x(btree) - -enum gc_phase { -#define x(n) GC_PHASE_##n, - GC_PHASES() -#undef x -}; - -struct gc_pos { - enum gc_phase phase:8; - enum btree_id btree:8; - u16 level; - struct bpos pos; -}; - -struct reflink_gc { - u64 offset; - u32 size; - u32 refcount; -}; - -typedef 
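/*
 * [Editorial note -- not part of the original file.] A generic radix tree of
 * reflink_gc entries (offset/size/refcount), one per reflink extent seen
 * during the GC walk; used by bch2_gc_reflink_start()/done() from
 * bch2_check_allocations() and freed again in bch2_gc_free().
 */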
GENRADIX(struct reflink_gc) reflink_gc_table; - -#endif /* _BCACHEFS_BTREE_GC_TYPES_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c deleted file mode 100644 index 590cd29f3e86cb..00000000000000 --- a/fs/bcachefs/btree_io.c +++ /dev/null @@ -1,2742 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "async_objs.h" -#include "bkey_buf.h" -#include "bkey_methods.h" -#include "bkey_sort.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "checksum.h" -#include "debug.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "recovery.h" -#include "super-io.h" -#include "trace.h" - -#include - -static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) -{ - bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); - prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); - prt_str(out, "min: "); - bch2_bpos_to_text(out, bn->min_key); - prt_newline(out); - prt_str(out, "max: "); - bch2_bpos_to_text(out, bn->max_key); -} - -void bch2_btree_node_io_unlock(struct btree *b) -{ - EBUG_ON(!btree_node_write_in_flight(b)); - - clear_btree_node_write_in_flight_inner(b); - clear_btree_node_write_in_flight(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -} - -void bch2_btree_node_io_lock(struct btree *b) -{ - wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void __bch2_btree_node_wait_on_read(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void __bch2_btree_node_wait_on_write(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void bch2_btree_node_wait_on_read(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void bch2_btree_node_wait_on_write(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -static void verify_no_dups(struct btree *b, - struct bkey_packed *start, - struct bkey_packed *end) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_packed *k, *p; - - if (start == end) - return; - - for (p = start, k = bkey_p_next(start); - k != end; - p = k, k = bkey_p_next(k)) { - struct bkey l = bkey_unpack_key(b, p); - struct bkey r = bkey_unpack_key(b, k); - - BUG_ON(bpos_ge(l.p, bkey_start_pos(&r))); - } -#endif -} - -static void set_needs_whiteout(struct bset *i, int v) -{ - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) - k->needs_whiteout = v; -} - -static void btree_bounce_free(struct bch_fs *c, size_t size, - bool used_mempool, void *p) -{ - if (used_mempool) - mempool_free(p, &c->btree_bounce_pool); - else - kvfree(p); -} - -static void *btree_bounce_alloc(struct bch_fs *c, size_t size, - bool *used_mempool) -{ - unsigned flags = memalloc_nofs_save(); - void *p; - - BUG_ON(size > c->opts.btree_node_size); - - *used_mempool = false; - p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); - if (!p) { - *used_mempool = true; - p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); - } - memalloc_nofs_restore(flags); - return p; -} - -static void sort_bkey_ptrs(const struct btree *bt, - struct bkey_packed **ptrs, unsigned 
nr) -{ - unsigned n = nr, a = nr / 2, b, c, d; - - if (!a) - return; - - /* Heap sort: see lib/sort.c: */ - while (1) { - if (a) - a--; - else if (--n) - swap(ptrs[0], ptrs[n]); - else - break; - - for (b = a; c = 2 * b + 1, (d = c + 1) < n;) - b = bch2_bkey_cmp_packed(bt, - ptrs[c], - ptrs[d]) >= 0 ? c : d; - if (d == n) - b = c; - - while (b != a && - bch2_bkey_cmp_packed(bt, - ptrs[a], - ptrs[b]) >= 0) - b = (b - 1) / 2; - c = b; - while (b != a) { - b = (b - 1) / 2; - swap(ptrs[b], ptrs[c]); - } - } -} - -static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) -{ - struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; - bool used_mempool = false; - size_t bytes = b->whiteout_u64s * sizeof(u64); - - if (!b->whiteout_u64s) - return; - - new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); - - ptrs = ptrs_end = ((void *) new_whiteouts + bytes); - - for (k = unwritten_whiteouts_start(b); - k != unwritten_whiteouts_end(b); - k = bkey_p_next(k)) - *--ptrs = k; - - sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); - - k = new_whiteouts; - - while (ptrs != ptrs_end) { - bkey_p_copy(k, *ptrs); - k = bkey_p_next(k); - ptrs++; - } - - verify_no_dups(b, new_whiteouts, - (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); - - memcpy_u64s(unwritten_whiteouts_start(b), - new_whiteouts, b->whiteout_u64s); - - btree_bounce_free(c, bytes, used_mempool, new_whiteouts); -} - -static bool should_compact_bset(struct btree *b, struct bset_tree *t, - bool compacting, enum compact_mode mode) -{ - if (!bset_dead_u64s(b, t)) - return false; - - switch (mode) { - case COMPACT_LAZY: - return should_compact_bset_lazy(b, t) || - (compacting && !bset_written(b, bset(b, t))); - case COMPACT_ALL: - return true; - default: - BUG(); - } -} - -static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) -{ - bool ret = false; - - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k, *n, *out, *start, *end; - struct btree_node_entry *src = NULL, *dst = NULL; - - if (t != b->set && !bset_written(b, i)) { - src = container_of(i, struct btree_node_entry, keys); - dst = max(write_block(b), - (void *) btree_bkey_last(b, t - 1)); - } - - if (src != dst) - ret = true; - - if (!should_compact_bset(b, t, ret, mode)) { - if (src != dst) { - memmove(dst, src, sizeof(*src) + - le16_to_cpu(src->keys.u64s) * - sizeof(u64)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - continue; - } - - start = btree_bkey_first(b, t); - end = btree_bkey_last(b, t); - - if (src != dst) { - memmove(dst, src, sizeof(*src)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - - out = i->start; - - for (k = start; k != end; k = n) { - n = bkey_p_next(k); - - if (!bkey_deleted(k)) { - bkey_p_copy(out, k); - out = bkey_p_next(out); - } else { - BUG_ON(k->needs_whiteout); - } - } - - i->u64s = cpu_to_le16((u64 *) out - i->_data); - set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); - ret = true; - } - - bch2_verify_btree_nr_keys(b); - - bch2_btree_build_aux_trees(b); - - return ret; -} - -bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, - enum compact_mode mode) -{ - return bch2_drop_whiteouts(b, mode); -} - -static void btree_node_sort(struct bch_fs *c, struct btree *b, - unsigned start_idx, - unsigned end_idx) -{ - struct btree_node *out; - struct sort_iter_stack sort_iter; - struct bset_tree *t; - struct bset *start_bset = bset(b, &b->set[start_idx]); - bool used_mempool = false; - u64 start_time, seq = 0; - unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; - bool 
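/*
 * [Editorial note -- not part of the original file.] True when every bset in
 * the node is sorted together; in that case the bounce buffer is node-sized
 * and, as noted further down, the sorted result can simply be swapped with
 * b->data rather than copied back.
 */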
sorting_entire_node = start_idx == 0 && - end_idx == b->nsets; - - sort_iter_stack_init(&sort_iter, b); - - for (t = b->set + start_idx; - t < b->set + end_idx; - t++) { - u64s += le16_to_cpu(bset(b, t)->u64s); - sort_iter_add(&sort_iter.iter, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - } - - bytes = sorting_entire_node - ? btree_buf_bytes(b) - : __vstruct_bytes(struct btree_node, u64s); - - out = btree_bounce_alloc(c, bytes, &used_mempool); - - start_time = local_clock(); - - u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter); - - out->keys.u64s = cpu_to_le16(u64s); - - BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); - - if (sorting_entire_node) - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], - start_time); - - /* Make sure we preserve bset journal_seq: */ - for (t = b->set + start_idx; t < b->set + end_idx; t++) - seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); - start_bset->journal_seq = cpu_to_le64(seq); - - if (sorting_entire_node) { - u64s = le16_to_cpu(out->keys.u64s); - - BUG_ON(bytes != btree_buf_bytes(b)); - - /* - * Our temporary buffer is the same size as the btree node's - * buffer, we can just swap buffers instead of doing a big - * memcpy() - */ - *out = *b->data; - out->keys.u64s = cpu_to_le16(u64s); - swap(out, b->data); - set_btree_bset(b, b->set, &b->data->keys); - } else { - start_bset->u64s = out->keys.u64s; - memcpy_u64s(start_bset->start, - out->keys.start, - le16_to_cpu(out->keys.u64s)); - } - - for (i = start_idx + 1; i < end_idx; i++) - b->nr.bset_u64s[start_idx] += - b->nr.bset_u64s[i]; - - b->nsets -= shift; - - for (i = start_idx + 1; i < b->nsets; i++) { - b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; - b->set[i] = b->set[i + shift]; - } - - for (i = b->nsets; i < MAX_BSETS; i++) - b->nr.bset_u64s[i] = 0; - - set_btree_bset_end(b, &b->set[start_idx]); - bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); - - btree_bounce_free(c, bytes, used_mempool, out); - - bch2_verify_btree_nr_keys(b); -} - -void bch2_btree_sort_into(struct bch_fs *c, - struct btree *dst, - struct btree *src) -{ - struct btree_nr_keys nr; - struct btree_node_iter src_iter; - u64 start_time = local_clock(); - - BUG_ON(dst->nsets != 1); - - bch2_bset_set_no_aux_tree(dst, dst->set); - - bch2_btree_node_iter_init_from_start(&src_iter, src); - - nr = bch2_sort_repack(btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], - start_time); - - set_btree_bset_end(dst, dst->set); - - dst->nr.live_u64s += nr.live_u64s; - dst->nr.bset_u64s[0] += nr.bset_u64s[0]; - dst->nr.packed_keys += nr.packed_keys; - dst->nr.unpacked_keys += nr.unpacked_keys; - - bch2_verify_btree_nr_keys(dst); -} - -/* - * We're about to add another bset to the btree node, so if there's currently - * too many bsets - sort some of them together: - */ -static bool btree_node_compact(struct bch_fs *c, struct btree *b) -{ - unsigned unwritten_idx; - bool ret = false; - - for (unwritten_idx = 0; - unwritten_idx < b->nsets; - unwritten_idx++) - if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) - break; - - if (b->nsets - unwritten_idx > 1) { - btree_node_sort(c, b, unwritten_idx, b->nsets); - ret = true; - } - - if (unwritten_idx > 1) { - btree_node_sort(c, b, 0, unwritten_idx); - ret = true; - } - - return ret; -} - -void bch2_btree_build_aux_trees(struct btree *b) -{ - for_each_bset(b, t) - bch2_bset_build_aux_tree(b, t, - !bset_written(b, bset(b, t)) && - t == bset_tree_last(b)); -} - -/* - * If we have MAX_BSETS 
(3) bsets, should we sort them all down to just one? - * - * The first bset is going to be of similar order to the size of the node, the - * last bset is bounded by btree_write_set_buffer(), which is set to keep the - * memmove on insert from being too expensive: the middle bset should, ideally, - * be the geometric mean of the first and the last. - * - * Returns true if the middle bset is greater than that geometric mean: - */ -static inline bool should_compact_all(struct bch_fs *c, struct btree *b) -{ - unsigned mid_u64s_bits = - (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2; - - return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits; -} - -/* - * @bch_btree_init_next - initialize a new (unwritten) bset that can then be - * inserted into - * - * Safe to call if there already is an unwritten bset - will only add a new bset - * if @b doesn't already have one. - * - * Returns true if we sorted (i.e. invalidated iterators - */ -void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) -{ - struct bch_fs *c = trans->c; - struct btree_node_entry *bne; - bool reinit_iter = false; - - EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]); - BUG_ON(bset_written(b, bset(b, &b->set[1]))); - BUG_ON(btree_node_just_written(b)); - - if (b->nsets == MAX_BSETS && - !btree_node_write_in_flight(b) && - should_compact_all(c, b)) { - bch2_btree_node_write_trans(trans, b, SIX_LOCK_write, - BTREE_WRITE_init_next_bset); - reinit_iter = true; - } - - if (b->nsets == MAX_BSETS && - btree_node_compact(c, b)) - reinit_iter = true; - - BUG_ON(b->nsets >= MAX_BSETS); - - bne = want_new_bset(c, b); - if (bne) - bch2_bset_init_next(b, bne); - - bch2_btree_build_aux_trees(b); - - if (reinit_iter) - bch2_trans_node_reinit_iter(trans, b); -} - -static void btree_err_msg(struct printbuf *out, struct bch_fs *c, - struct bch_dev *ca, - bool print_pos, - struct btree *b, struct bset *i, struct bkey_packed *k, - unsigned offset, int rw) -{ - if (print_pos) { - prt_str(out, rw == READ - ? "error validating btree node " - : "corrupt btree node before write "); - prt_printf(out, "at btree "); - bch2_btree_pos_to_text(out, c, b); - prt_newline(out); - } - - if (ca) - prt_printf(out, "%s ", ca->name); - - prt_printf(out, "node offset %u/%u", - b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); - if (i) - prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); - if (k) - prt_printf(out, " bset byte offset %lu", - (unsigned long)(void *)k - - ((unsigned long)(void *)i & ~511UL)); - prt_str(out, ": "); -} - -__printf(11, 12) -static int __btree_err(int ret, - struct bch_fs *c, - struct bch_dev *ca, - struct btree *b, - struct bset *i, - struct bkey_packed *k, - int rw, - enum bch_sb_error_id err_type, - struct bch_io_failures *failed, - struct printbuf *err_msg, - const char *fmt, ...) -{ - if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) - return ret == -BCH_ERR_btree_node_read_err_fixable - ? 
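/*
 * [Editorial note -- not part of the original file.] While scanning for
 * btree nodes we expect to run into damaged ones, so fixable read errors are
 * reported as fixed without any of the printing/fsck dialogue below, and
 * everything else is passed through unchanged.
 */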
bch_err_throw(c, fsck_fix) - : ret; - - bool have_retry = false; - int ret2; - - if (ca) { - bch2_mark_btree_validate_failure(failed, ca->dev_idx); - - struct extent_ptr_decoded pick; - have_retry = bch2_bkey_pick_read_device(c, - bkey_i_to_s_c(&b->key), - failed, &pick, -1) == 1; - } - - if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) - ret = bch_err_throw(c, btree_node_read_err_fixable); - if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) - ret = bch_err_throw(c, btree_node_read_err_bad_node); - - bch2_sb_error_count(c, err_type); - - bool print_deferred = err_msg && - rw == READ && - !(test_bit(BCH_FS_in_fsck, &c->flags) && - c->opts.fix_errors == FSCK_FIX_ask); - - struct printbuf out = PRINTBUF; - bch2_log_msg_start(c, &out); - - if (!print_deferred) - err_msg = &out; - - btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw); - - va_list args; - va_start(args, fmt); - prt_vprintf(err_msg, fmt, args); - va_end(args); - - if (print_deferred) { - prt_newline(err_msg); - - switch (ret) { - case -BCH_ERR_btree_node_read_err_fixable: - ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type); - if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && - !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { - ret = ret2; - goto fsck_err; - } - - if (!have_retry) - ret = bch_err_throw(c, fsck_fix); - goto out; - case -BCH_ERR_btree_node_read_err_bad_node: - prt_str(&out, ", "); - break; - } - - goto out; - } - - if (rw == WRITE) { - prt_str(&out, ", "); - ret = __bch2_inconsistent_error(c, &out) - ? -BCH_ERR_fsck_errors_not_fixed - : 0; - goto print; - } - - switch (ret) { - case -BCH_ERR_btree_node_read_err_fixable: - ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf); - if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && - !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { - ret = ret2; - goto fsck_err; - } - - if (!have_retry) - ret = bch_err_throw(c, fsck_fix); - goto out; - case -BCH_ERR_btree_node_read_err_bad_node: - prt_str(&out, ", "); - break; - } -print: - bch2_print_str(c, KERN_ERR, out.buf); -out: -fsck_err: - printbuf_exit(&out); - return ret; -} - -#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \ -({ \ - int _ret = __btree_err(type, c, ca, b, i, k, write, \ - BCH_FSCK_ERR_##_err_type, \ - failed, err_msg, \ - msg, ##__VA_ARGS__); \ - \ - if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \ - ret = _ret; \ - goto fsck_err; \ - } \ - \ - true; \ -}) - -#define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) - -/* - * When btree topology repair changes the start or end of a node, that might - * mean we have to drop keys that are no longer inside the node: - */ -__cold -void bch2_btree_node_drop_keys_outside_node(struct btree *b) -{ - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) - if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) - break; - - if (k != i->start) { - unsigned shift = (u64 *) k - (u64 *) i->start; - - memmove_u64s_down(i->start, k, - (u64 *) vstruct_end(i) - (u64 *) k); - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); - set_btree_bset_end(b, t); - } - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) - if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) - break; - - if (k != vstruct_last(i)) { - i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); - set_btree_bset_end(b, t); - } - } - - /* - * Always rebuild search trees: eytzinger search tree nodes directly - * depend on the values of min/max key: - */ - bch2_bset_set_no_aux_tree(b, b->set); - bch2_btree_build_aux_trees(b); - b->nr = bch2_btree_node_count_keys(b); - - struct bkey_s_c k; - struct bkey unpacked; - struct btree_node_iter iter; - for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { - BUG_ON(bpos_lt(k.k->p, b->data->min_key)); - BUG_ON(bpos_gt(k.k->p, b->data->max_key)); - } -} - -static int validate_bset(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, struct bset *i, - unsigned offset, int write, - struct bch_io_failures *failed, - struct printbuf *err_msg) -{ - unsigned version = le16_to_cpu(i->version); - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - int ret = 0; - - btree_err_on(!bch2_version_compatible(version), - -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, NULL, - btree_node_unsupported_version, - "unsupported bset version %u.%u", - BCH_VERSION_MAJOR(version), - BCH_VERSION_MINOR(version)); - - if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes && - btree_err_on(version < c->sb.version_min, - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, NULL, - btree_node_bset_older_than_sb_min, - "bset version %u older than superblock version_min %u", - version, c->sb.version_min)) { - if (bch2_version_compatible(version)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version_min = cpu_to_le16(version); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } else { - /* We have no idea what's going on: */ - i->version = cpu_to_le16(c->sb.version); - } - } - - if (btree_err_on(BCH_VERSION_MAJOR(version) > - BCH_VERSION_MAJOR(c->sb.version), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, NULL, - btree_node_bset_newer_than_sb, - "bset version %u newer than superblock version %u", - version, c->sb.version)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version = cpu_to_le16(version); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - - btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, NULL, - btree_node_unsupported_version, - "BSET_SEPARATE_WHITEOUTS no longer supported"); - - btree_err_on(offset && !i->u64s, - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_empty, - "empty bset"); - - btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_wrong_sector_offset, - "bset at wrong sector offset"); - - if (!offset) { - struct btree_node *bn = - 
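/*
 * [Editorial note -- not part of the original file.] Offset 0 means this
 * bset is the one embedded in the btree_node header, so container_of()
 * recovers the surrounding node, and the identity fields that exist only
 * there (seq, btree id, level, min/max keys) can be checked against the
 * pointer we used to read the node.
 */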
container_of(i, struct btree_node, keys); - /* These indicate that we read the wrong btree node: */ - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *bp = - &bkey_i_to_btree_ptr_v2(&b->key)->v; - - /* XXX endianness */ - btree_err_on(bp->seq != bn->keys.seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - bset_bad_seq, - "incorrect sequence number (wrong btree node)"); - } - - btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, NULL, - btree_node_bad_btree, - "incorrect btree id"); - - btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, NULL, - btree_node_bad_level, - "incorrect level"); - - if (!write) - compat_btree_node(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, bn); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *bp = - &bkey_i_to_btree_ptr_v2(&b->key)->v; - - if (BTREE_PTR_RANGE_UPDATED(bp)) { - b->data->min_key = bp->min_key; - b->data->max_key = b->key.k.p; - } - - btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_min_key, - "incorrect min_key: got %s should be %s", - (printbuf_reset(&buf1), - bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), - (printbuf_reset(&buf2), - bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); - } - - btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, NULL, - btree_node_bad_max_key, - "incorrect max key %s", - (printbuf_reset(&buf1), - bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); - - if (write) - compat_btree_node(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, bn); - - btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), - -BCH_ERR_btree_node_read_err_bad_node, - c, ca, b, i, NULL, - btree_node_bad_format, - "invalid bkey format: %s\n%s", buf1.buf, - (printbuf_reset(&buf2), - bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); - printbuf_reset(&buf1); - - compat_bformat(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, - &bn->format); - } -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - enum bch_validate_flags flags) -{ - return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = flags - }); -} - -static int bset_key_validate(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - bool updated_range, - enum bch_validate_flags flags) -{ - struct bkey_validate_context from = (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = flags, - }; - return __bch2_bkey_validate(c, k, from) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?: - (flags & BCH_VALIDATE_write ? 
btree_node_bkey_val_validate(c, b, k, flags) : 0); -} - -static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, - struct bset *i, struct bkey_packed *k) -{ - if (bkey_p_next(k) > vstruct_last(i)) - return false; - - if (k->format > KEY_FORMAT_CURRENT) - return false; - - if (!bkeyp_u64s_valid(&b->format, k)) - return false; - - struct bkey tmp; - struct bkey_s u = __bkey_disassemble(b, k, &tmp); - return !__bch2_bkey_validate(c, u.s_c, - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = BCH_VALIDATE_silent - }); -} - -static inline int btree_node_read_bkey_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed(b, l, r) - ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); -} - -static int validate_bset_keys(struct bch_fs *c, struct btree *b, - struct bset *i, int write, - struct bch_io_failures *failed, - struct printbuf *err_msg) -{ - unsigned version = le16_to_cpu(i->version); - struct bkey_packed *k, *prev = NULL; - struct printbuf buf = PRINTBUF; - bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && - BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); - int ret = 0; - - for (k = i->start; - k != vstruct_last(i);) { - struct bkey_s u; - struct bkey tmp; - unsigned next_good_key; - - if (btree_err_on(bkey_p_next(k) > vstruct_last(i), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_past_bset_end, - "key extends past end of bset")) { - i->u64s = cpu_to_le16((u64 *) k - i->_data); - break; - } - - if (btree_err_on(k->format > KEY_FORMAT_CURRENT, - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_bad_format, - "invalid bkey format %u", k->format)) - goto drop_this_key; - - if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_bad_u64s, - "bad k->u64s %u (min %u max %zu)", k->u64s, - bkeyp_key_u64s(&b->format, k), - U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k))) - goto drop_this_key; - - if (!write) - bch2_bkey_compat(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, - &b->format, k); - - u = __bkey_disassemble(b, k, &tmp); - - ret = bset_key_validate(c, b, u.s_c, updated_range, write); - if (ret == -BCH_ERR_fsck_delete_bkey) - goto drop_this_key; - if (ret) - goto fsck_err; - - if (write) - bch2_bkey_compat(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, - &b->format, k); - - if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { - struct bkey up = bkey_unpack_key(b, prev); - - printbuf_reset(&buf); - prt_printf(&buf, "keys out of order: "); - bch2_bkey_to_text(&buf, &up); - prt_printf(&buf, " > "); - bch2_bkey_to_text(&buf, u.k); - - if (btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_out_of_order, - "%s", buf.buf)) - goto drop_this_key; - } - - prev = k; - k = bkey_p_next(k); - continue; -drop_this_key: - next_good_key = k->u64s; - - if (!next_good_key || - (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN && - version >= bcachefs_metadata_version_snapshot)) { - /* - * only do scanning if bch2_bkey_compat() has nothing to - * do - */ - - if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { - for (next_good_key = 1; - next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; - next_good_key++) - if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) - goto got_good_key; - } - - /* - * didn't 
find a good key, have to truncate the rest of - * the bset - */ - next_good_key = (u64 *) vstruct_last(i) - (u64 *) k; - } -got_good_key: - le16_add_cpu(&i->u64s, -next_good_key); - memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); - set_btree_node_need_rewrite(b); - set_btree_node_need_rewrite_error(b); - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, - struct bch_io_failures *failed, - struct printbuf *err_msg) -{ - struct btree_node_entry *bne; - struct sort_iter *iter; - struct btree_node *sorted; - struct bkey_packed *k; - struct bset *i; - bool used_mempool, blacklisted; - bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && - BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); - unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); - u64 max_journal_seq = 0; - struct printbuf buf = PRINTBUF; - int ret = 0, write = READ; - u64 start_time = local_clock(); - - b->version_ondisk = U16_MAX; - /* We might get called multiple times on read retry: */ - b->written = 0; - - iter = mempool_alloc(&c->fill_iter, GFP_NOFS); - sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2); - - if (bch2_meta_read_fault("btree")) - btree_err(-BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_fault_injected, - "dynamic fault"); - - btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_magic, - "bad magic: want %llx, got %llx", - bset_magic(c), le64_to_cpu(b->data->magic)); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *bp = - &bkey_i_to_btree_ptr_v2(&b->key)->v; - - bch2_bpos_to_text(&buf, b->data->min_key); - prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, b->data->max_key); - - btree_err_on(b->data->keys.seq != bp->seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_seq, - "got wrong btree node: got\n%s", - (printbuf_reset(&buf), - bch2_btree_node_header_to_text(&buf, b->data), - buf.buf)); - } else { - btree_err_on(!b->data->keys.seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_seq, - "bad btree header: seq 0\n%s", - (printbuf_reset(&buf), - bch2_btree_node_header_to_text(&buf, b->data), - buf.buf)); - } - - while (b->written < (ptr_written ?: btree_sectors(c))) { - unsigned sectors; - bool first = !b->written; - - if (first) { - bne = NULL; - i = &b->data->keys; - } else { - bne = write_block(b); - i = &bne->keys; - - if (i->seq != b->data->keys.seq) - break; - } - - struct nonce nonce = btree_nonce(i, b->written << 9); - bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - - btree_err_on(!good_csum_type, - bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) - ? 
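/*
 * Standalone model of the drop_this_key recovery in validate_bset_keys()
 * above: when a key's advertised length can't be trusted, probe forward one
 * u64 at a time until some offset validates again, then memmove the good
 * tail down over the corrupt range. The validity test here is a toy (a word
 * is a "key" if its low byte is a plausible length); only the
 * scan-and-splice shape matches the real code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static bool starts_valid_key(const uint64_t *w, const uint64_t *end)
{
	uint64_t len = *w & 0xff;

	return len && w + len <= end;
}

static size_t drop_bad_key(uint64_t *buf, size_t nr, size_t bad)
{
	size_t next = bad + 1;

	/* probe forward, one word at a time, for the next plausible key */
	while (next < nr && !starts_valid_key(buf + next, buf + nr))
		next++;

	/* splice the corrupt range out */
	memmove(buf + bad, buf + next, (nr - next) * sizeof(*buf));
	return nr - (next - bad);
}

int main(void)
{
	/* each key is {len, payload}; the two zero words are corruption */
	uint64_t buf[] = { 2, 42, 0, 0, 2, 43 };
	size_t nr = drop_bad_key(buf, 6, 2);

	printf("%zu words left; recovered key: len=%llu payload=%llu\n", nr,
	       (unsigned long long)buf[2], (unsigned long long)buf[3]);
	return 0;
}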
-BCH_ERR_btree_node_read_err_must_retry - : -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - if (first) { - sectors = vstruct_sectors(b->data, c->block_bits); - if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_past_end_of_btree_node, - "bset past end of btree node (offset %u len %u but written %zu)", - b->written, sectors, ptr_written ?: btree_sectors(c))) - i->u64s = 0; - if (good_csum_type) { - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - bool csum_bad = bch2_crc_cmp(b->data->csum, csum); - if (csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; - } - - btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - -BCH_ERR_btree_node_read_err_incompatible, - c, NULL, b, NULL, NULL, - btree_node_unsupported_version, - "btree node does not have NEW_EXTENT_OVERWRITE set"); - } else { - sectors = vstruct_sectors(bne, c->block_bits); - if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_past_end_of_btree_node, - "bset past end of btree node (offset %u len %u but written %zu)", - b->written, sectors, ptr_written ?: btree_sectors(c))) - i->u64s = 0; - if (good_csum_type) { - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - bool csum_bad = bch2_crc_cmp(bne->csum, csum); - if (ca && csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; - } - } - - b->version_ondisk = min(b->version_ondisk, - le16_to_cpu(i->version)); - - ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg); - if (ret) - goto fsck_err; - - if (!b->written) - btree_node_set_format(b, b->data->format); - - ret = validate_bset_keys(c, b, i, READ, failed, err_msg); - if (ret) - goto fsck_err; - - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - blacklisted = bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(i->journal_seq), - true); - - btree_err_on(blacklisted && first, - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_blacklisted_journal_seq, - "first btree node bset has blacklisted journal seq (%llu)", - le64_to_cpu(i->journal_seq)); - - btree_err_on(blacklisted && ptr_written, - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - first_bset_blacklisted_journal_seq, - "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", - le64_to_cpu(i->journal_seq), - b->written, b->written + sectors, ptr_written); - - b->written = min(b->written + sectors, btree_sectors(c)); - - if (blacklisted && !first) - continue; - - sort_iter_add(iter, - vstruct_idx(i, 0), - vstruct_last(i)); 
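/*
 * The read loop above walks a node's bsets by their self-described sizes:
 * each entry's byte length is rounded up to whole sectors, and the walk
 * stops when an entry's sequence number no longer matches the node's first
 * bset. A toy userspace model of that walk; the layout and names
 * (entry_hdr, the 512-byte sector) are simplified stand-ins for
 * btree_node/btree_node_entry and vstruct_sectors().
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SECTOR_SIZE 512u

struct entry_hdr {
	uint64_t seq;
	uint32_t u64s;	/* payload words following the header */
};

static unsigned entry_sectors(const struct entry_hdr *e)
{
	size_t bytes = sizeof(*e) + (size_t)e->u64s * 8;

	return (unsigned)((bytes + SECTOR_SIZE - 1) / SECTOR_SIZE);
}

static unsigned sectors_written(const uint8_t *buf, unsigned total_sectors)
{
	const struct entry_hdr *first = (const void *)buf;
	unsigned offset = 0;

	while (offset < total_sectors) {
		const struct entry_hdr *e =
			(const void *)(buf + (size_t)offset * SECTOR_SIZE);

		if (offset && e->seq != first->seq)
			break;	/* ran past this node's entries */
		offset += entry_sectors(e);
	}
	return offset;
}

int main(void)
{
	static uint8_t buf[4 * SECTOR_SIZE];
	struct entry_hdr h = { .seq = 7, .u64s = 10 };

	memcpy(buf, &h, sizeof(h));			/* entry at sector 0 */
	memcpy(buf + SECTOR_SIZE, &h, sizeof(h));	/* entry at sector 1 */
	h.seq = 8;					/* a different node */
	memcpy(buf + 2 * SECTOR_SIZE, &h, sizeof(h));

	printf("sectors written: %u\n", sectors_written(buf, 4));	/* 2 */
	return 0;
}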
- - max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq)); - } - - if (ptr_written) { - btree_err_on(b->written < ptr_written, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, NULL, - btree_node_data_missing, - "btree node data missing: expected %u sectors, found %u", - ptr_written, b->written); - } else { - for (bne = write_block(b); - bset_byte_offset(b, bne) < btree_buf_bytes(b); - bne = (void *) bne + block_bytes(c)) - btree_err_on(bne->keys.seq == b->data->keys.seq && - !bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(bne->keys.journal_seq), - true), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, NULL, - btree_node_bset_after_end, - "found bset signature after last bset"); - } - - sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); - sorted->keys.u64s = 0; - - b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); - memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, - btree_buf_bytes(b) - - sizeof(struct btree_node) - - b->nr.live_u64s * sizeof(u64)); - - b->data->keys.u64s = sorted->keys.u64s; - *sorted = *b->data; - swap(sorted, b->data); - set_btree_bset(b, b->set, &b->data->keys); - b->nsets = 1; - b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); - - BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s)); - - btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); - - i = &b->data->keys; - for (k = i->start; k != vstruct_last(i);) { - struct bkey tmp; - struct bkey_s u = __bkey_disassemble(b, k, &tmp); - - ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); - if (ret == -BCH_ERR_fsck_delete_bkey || - (static_branch_unlikely(&bch2_inject_invalid_keys) && - !bversion_cmp(u.k->bversion, MAX_VERSION))) { - btree_keys_account_key_drop(&b->nr, 0, k); - - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - set_btree_bset_end(b, b->set); - set_btree_node_need_rewrite(b); - set_btree_node_need_rewrite_error(b); - continue; - } - if (ret) - goto fsck_err; - - if (u.k->type == KEY_TYPE_btree_ptr_v2) { - struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); - - bp.v->mem_ptr = 0; - } - - k = bkey_p_next(k); - } - - bch2_bset_build_aux_tree(b, b->set, false); - - set_needs_whiteout(btree_bset_first(b), true); - - btree_node_reset_sib_u64s(b); - - if (updated_range) - bch2_btree_node_drop_keys_outside_node(b); - - /* - * XXX: - * - * We deadlock if too many btree updates require node rewrites while - * we're still in journal replay. - * - * This is because btree node rewrites generate more updates for the - * interior updates (alloc, backpointers), and if those updates touch - * new nodes and generate more rewrites - well, you see the problem. - * - * The biggest cause is that we don't use the btree write buffer (for - * the backpointer updates - this needs some real thought on locking in - * order to fix. - * - * The problem with this workaround (not doing the rewrite for degraded - * nodes in journal replay) is that those degraded nodes persist, and we - * don't want that (this is a real bug when a btree node write completes - * with fewer replicas than we wanted and leaves a degraded node due to - * device _removal_, i.e. the device went away mid write). 
- * It's less of a bug here, but still a problem because we don't yet
- * have a way of tracking degraded data - we need another index (all
- * extents/btree nodes, by replicas entry) in order to fix properly
- * (re-replicate degraded data at the earliest possible time).
- */
-	if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
-		scoped_guard(rcu)
-			bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-				struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
-				if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
-					set_btree_node_need_rewrite(b);
-					set_btree_node_need_rewrite_degraded(b);
-				}
-			}
-	}
-
-	if (!ptr_written) {
-		set_btree_node_need_rewrite(b);
-		set_btree_node_need_rewrite_ptr_written_zero(b);
-	}
-fsck_err:
-	mempool_free(iter, &c->fill_iter);
-	printbuf_exit(&buf);
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
-	return ret;
-}
-
-static void btree_node_read_work(struct work_struct *work)
-{
-	struct btree_read_bio *rb =
-		container_of(work, struct btree_read_bio, work);
-	struct bch_fs *c = rb->c;
-	struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-	struct btree *b = rb->b;
-	struct bio *bio = &rb->bio;
-	struct bch_io_failures failed = { .nr = 0 };
-	int ret = 0;
-
-	struct printbuf buf = PRINTBUF;
-	bch2_log_msg_start(c, &buf);
-
-	prt_printf(&buf, "btree node read error at btree ");
-	bch2_btree_pos_to_text(&buf, c, b);
-	prt_newline(&buf);
-
-	goto start;
-	while (1) {
-		ret = bch2_bkey_pick_read_device(c,
-					bkey_i_to_s_c(&b->key),
-					&failed, &rb->pick, -1);
-		if (ret <= 0) {
-			set_btree_node_read_error(b);
-			break;
-		}
-
-		ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
-		rb->have_ioref		= ca != NULL;
-		rb->start_time		= local_clock();
-		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
-		bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
-		bio->bi_iter.bi_size	= btree_buf_bytes(b);
-
-		if (rb->have_ioref) {
-			bio_set_dev(bio, ca->disk_sb.bdev);
-			submit_bio_wait(bio);
-		} else {
-			bio->bi_status = BLK_STS_REMOVED;
-		}
-
-		bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
-					   rb->start_time, !bio->bi_status);
-start:
-		if (rb->have_ioref)
-			enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
-		rb->have_ioref = false;
-
-		if (bio->bi_status) {
-			bch2_mark_io_failure(&failed, &rb->pick, false);
-			continue;
-		}
-
-		ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
-		if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
-		    ret == -BCH_ERR_btree_node_read_err_must_retry)
-			continue;
-
-		if (ret)
-			set_btree_node_read_error(b);
-
-		break;
-	}
-
-	bch2_io_failures_to_text(&buf, c, &failed);
-
-	if (btree_node_read_error(b))
-		bch2_btree_lost_data(c, &buf, b->c.btree_id);
-
-	/*
-	 * only print retry success if we read from a replica with no errors
-	 */
-	if (btree_node_read_error(b))
-		prt_printf(&buf, "ret %s", bch2_err_str(ret));
-	else if (failed.nr) {
-		if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
-			prt_printf(&buf, "retry success");
-		else
-			prt_printf(&buf, "repair success");
-	}
-
-	if ((failed.nr ||
-	     btree_node_need_rewrite(b)) &&
-	    !btree_node_read_error(b) &&
-	    c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
-		prt_printf(&buf, " (rewriting node)");
-		bch2_btree_node_rewrite_async(c, b);
-	}
-	prt_newline(&buf);
-
-	if (failed.nr)
-		bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
-
-	async_object_list_del(c, btree_read_bio, rb->list_idx);
-
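/*
 * The retry loop in btree_node_read_work() above has this shape: pick a
 * replica that hasn't failed yet, try it, record a failure and loop, and
 * stop on the first clean read or when no replica remains. A compressed
 * single-threaded model; pick_replica() and read_replica() are hypothetical
 * stand-ins for bch2_bkey_pick_read_device() and for the bio submit plus
 * bch2_btree_node_read_done() sequence.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_REPLICAS 3

static bool replica_failed[NR_REPLICAS];

static int pick_replica(void)
{
	for (int i = 0; i < NR_REPLICAS; i++)
		if (!replica_failed[i])
			return i;
	return -1;		/* nothing left to try */
}

static bool read_replica(int i)
{
	return i == 2;		/* pretend only the last copy is good */
}

int main(void)
{
	int i;

	while ((i = pick_replica()) >= 0) {
		if (read_replica(i)) {
			printf("read ok from replica %d\n", i);
			return 0;
		}
		replica_failed[i] = true;	/* like bch2_mark_io_failure() */
	}
	printf("all replicas failed\n");
	return 1;
}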
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
-			       rb->start_time);
-	bio_put(&rb->bio);
-	printbuf_exit(&buf);
-	clear_btree_node_read_in_flight(b);
-	smp_mb__after_atomic();
-	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_endio(struct bio *bio)
-{
-	struct btree_read_bio *rb =
-		container_of(bio, struct btree_read_bio, bio);
-	struct bch_fs *c = rb->c;
-	struct bch_dev *ca = rb->have_ioref
-		? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-
-	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
-				   rb->start_time, !bio->bi_status);
-
-	queue_work(c->btree_read_complete_wq, &rb->work);
-}
-
-void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
-{
-	bch2_bio_to_text(out, &rbio->bio);
-}
-
-struct btree_node_read_all {
-	struct closure		cl;
-	struct bch_fs		*c;
-	struct btree		*b;
-	unsigned		nr;
-	void			*buf[BCH_REPLICAS_MAX];
-	struct bio		*bio[BCH_REPLICAS_MAX];
-	blk_status_t		err[BCH_REPLICAS_MAX];
-};
-
-static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
-{
-	struct btree_node *bn = data;
-	struct btree_node_entry *bne;
-	unsigned offset = 0;
-
-	if (le64_to_cpu(bn->magic) != bset_magic(c))
-		return 0;
-
-	while (offset < btree_sectors(c)) {
-		if (!offset) {
-			offset += vstruct_sectors(bn, c->block_bits);
-		} else {
-			bne = data + (offset << 9);
-			if (bne->keys.seq != bn->keys.seq)
-				break;
-			offset += vstruct_sectors(bne, c->block_bits);
-		}
-	}
-
-	return offset;
-}
-
-static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
-{
-	struct btree_node *bn = data;
-	struct btree_node_entry *bne;
-
-	if (!offset)
-		return false;
-
-	while (offset < btree_sectors(c)) {
-		bne = data + (offset << 9);
-		if (bne->keys.seq == bn->keys.seq)
-			return true;
-		offset++;
-	}
-
-	return false;
-}
-
-static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
-{
-	closure_type(ra, struct btree_node_read_all, cl);
-	struct bch_fs *c = ra->c;
-	struct btree *b = ra->b;
-	struct printbuf buf = PRINTBUF;
-	bool dump_bset_maps = false;
-	int ret = 0, best = -1, write = READ;
-	unsigned i, written = 0, written2 = 0;
-	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
-		?
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; - bool _saw_error = false, *saw_error = &_saw_error; - struct printbuf *err_msg = NULL; - struct bch_io_failures *failed = NULL; - - for (i = 0; i < ra->nr; i++) { - struct btree_node *bn = ra->buf[i]; - - if (ra->err[i]) - continue; - - if (le64_to_cpu(bn->magic) != bset_magic(c) || - (seq && seq != bn->keys.seq)) - continue; - - if (best < 0) { - best = i; - written = btree_node_sectors_written(c, bn); - continue; - } - - written2 = btree_node_sectors_written(c, ra->buf[i]); - if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, NULL, - btree_node_replicas_sectors_written_mismatch, - "btree node sectors written mismatch: %u != %u", - written, written2) || - btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, NULL, - btree_node_bset_after_end, - "found bset signature after last bset") || - btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, NULL, - btree_node_replicas_data_mismatch, - "btree node replicas content mismatch")) - dump_bset_maps = true; - - if (written2 > written) { - written = written2; - best = i; - } - } -fsck_err: - if (dump_bset_maps) { - for (i = 0; i < ra->nr; i++) { - struct btree_node *bn = ra->buf[i]; - struct btree_node_entry *bne = NULL; - unsigned offset = 0, sectors; - bool gap = false; - - if (ra->err[i]) - continue; - - printbuf_reset(&buf); - - while (offset < btree_sectors(c)) { - if (!offset) { - sectors = vstruct_sectors(bn, c->block_bits); - } else { - bne = ra->buf[i] + (offset << 9); - if (bne->keys.seq != bn->keys.seq) - break; - sectors = vstruct_sectors(bne, c->block_bits); - } - - prt_printf(&buf, " %u-%u", offset, offset + sectors); - if (bne && bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(bne->keys.journal_seq), false)) - prt_printf(&buf, "*"); - offset += sectors; - } - - while (offset < btree_sectors(c)) { - bne = ra->buf[i] + (offset << 9); - if (bne->keys.seq == bn->keys.seq) { - if (!gap) - prt_printf(&buf, " GAP"); - gap = true; - - sectors = vstruct_sectors(bne, c->block_bits); - prt_printf(&buf, " %u-%u", offset, offset + sectors); - if (bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(bne->keys.journal_seq), false)) - prt_printf(&buf, "*"); - } - offset++; - } - - bch_err(c, "replica %u:%s", i, buf.buf); - } - } - - if (best >= 0) { - memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); - ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL); - } else { - ret = -1; - } - - if (ret) { - set_btree_node_read_error(b); - - struct printbuf buf = PRINTBUF; - bch2_btree_lost_data(c, &buf, b->c.btree_id); - if (buf.pos) - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - } else if (*saw_error) - bch2_btree_node_rewrite_async(c, b); - - for (i = 0; i < ra->nr; i++) { - mempool_free(ra->buf[i], &c->btree_bounce_pool); - bio_put(ra->bio[i]); - } - - closure_debug_destroy(&ra->cl); - kfree(ra); - printbuf_exit(&buf); - - clear_btree_node_read_in_flight(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -} - -static void btree_node_read_all_replicas_endio(struct bio *bio) -{ - struct btree_read_bio *rb = - container_of(bio, struct btree_read_bio, bio); - struct bch_fs *c = rb->c; - struct btree_node_read_all *ra = rb->ra; - - if (rb->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); - - bch2_latency_acct(ca, rb->start_time, READ); - 
enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_node_read_all_replicas); - } - - ra->err[rb->idx] = bio->bi_status; - closure_put(&ra->cl); -} - -/* - * XXX This allocates multiple times from the same mempools, and can deadlock - * under sufficient memory pressure (but is only a debug path) - */ -static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) -{ - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded pick; - struct btree_node_read_all *ra; - unsigned i; - - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas); - - closure_init(&ra->cl, NULL); - ra->c = c; - ra->b = b; - ra->nr = bch2_bkey_nr_ptrs(k); - - for (i = 0; i < ra->nr; i++) { - ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); - ra->bio[i] = bio_alloc_bioset(NULL, - buf_pages(ra->buf[i], btree_buf_bytes(b)), - REQ_OP_READ|REQ_SYNC|REQ_META, - GFP_NOFS, - &c->btree_bio); - } - - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_btree_node_read_all_replicas); - struct btree_read_bio *rb = - container_of(ra->bio[i], struct btree_read_bio, bio); - rb->c = c; - rb->b = b; - rb->ra = ra; - rb->start_time = local_clock(); - rb->have_ioref = ca != NULL; - rb->idx = i; - rb->pick = pick; - rb->bio.bi_iter.bi_sector = pick.ptr.offset; - rb->bio.bi_end_io = btree_node_read_all_replicas_endio; - bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b)); - - if (rb->have_ioref) { - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], - bio_sectors(&rb->bio)); - bio_set_dev(&rb->bio, ca->disk_sb.bdev); - - closure_get(&ra->cl); - submit_bio(&rb->bio); - } else { - ra->err[i] = BLK_STS_REMOVED; - } - - i++; - } - - if (sync) { - closure_sync(&ra->cl); - btree_node_read_all_replicas_done(&ra->cl.work); - } else { - continue_at(&ra->cl, btree_node_read_all_replicas_done, - c->btree_read_complete_wq); - } - - return 0; -} - -void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, - bool sync) -{ - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct btree_read_bio *rb; - struct bch_dev *ca; - struct bio *bio; - int ret; - - trace_and_count(c, btree_node_read, trans, b); - - if (static_branch_unlikely(&bch2_verify_all_btree_replicas) && - !btree_node_read_all_replicas(c, b, sync)) - return; - - ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - NULL, &pick, -1); - - if (ret <= 0) { - bool ratelimit = true; - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_str(&buf, "btree node read error: no device to read from\n at "); - bch2_btree_pos_to_text(&buf, c, b); - prt_newline(&buf); - bch2_btree_lost_data(c, &buf, b->c.btree_id); - - if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - bch2_fs_emergency_read_only2(c, &buf)) - ratelimit = false; - - static DEFINE_RATELIMIT_STATE(rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - if (!ratelimit || __ratelimit(&rs)) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - set_btree_node_read_error(b); - clear_btree_node_read_in_flight(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); - return; - } - - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); - - bio = bio_alloc_bioset(NULL, - buf_pages(b->data, 
btree_buf_bytes(b)), - REQ_OP_READ|REQ_SYNC|REQ_META, - GFP_NOFS, - &c->btree_bio); - rb = container_of(bio, struct btree_read_bio, bio); - rb->c = c; - rb->b = b; - rb->ra = NULL; - rb->start_time = local_clock(); - rb->have_ioref = ca != NULL; - rb->pick = pick; - INIT_WORK(&rb->work, btree_node_read_work); - bio->bi_iter.bi_sector = pick.ptr.offset; - bio->bi_end_io = btree_node_read_endio; - bch2_bio_map(bio, b->data, btree_buf_bytes(b)); - - async_object_list_add(c, btree_read_bio, rb, &rb->list_idx); - - if (rb->have_ioref) { - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], - bio_sectors(bio)); - bio_set_dev(bio, ca->disk_sb.bdev); - - if (sync) { - submit_bio_wait(bio); - bch2_latency_acct(ca, rb->start_time, READ); - btree_node_read_work(&rb->work); - } else { - submit_bio(bio); - } - } else { - bio->bi_status = BLK_STS_REMOVED; - - if (sync) - btree_node_read_work(&rb->work); - else - queue_work(c->btree_read_complete_wq, &rb->work); - } -} - -static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, - const struct bkey_i *k, unsigned level) -{ - struct bch_fs *c = trans->c; - struct closure cl; - struct btree *b; - int ret; - - closure_init_stack(&cl); - - do { - ret = bch2_btree_cache_cannibalize_lock(trans, &cl); - closure_sync(&cl); - } while (ret); - - b = bch2_btree_node_mem_alloc(trans, level != 0); - bch2_btree_cache_cannibalize_unlock(trans); - - BUG_ON(IS_ERR(b)); - - bkey_copy(&b->key, k); - BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); - - set_btree_node_read_in_flight(b); - - /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */ - bch2_trans_unlock(trans); - bch2_btree_node_read(trans, b, true); - - if (btree_node_read_error(b)) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - ret = bch_err_throw(c, btree_node_read_error); - goto err; - } - - bch2_btree_set_root_for_read(c, b); -err: - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - return ret; -} - -int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, - const struct bkey_i *k, unsigned level) -{ - return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); -} - -struct btree_node_scrub { - struct bch_fs *c; - struct bch_dev *ca; - void *buf; - bool used_mempool; - unsigned written; - - enum btree_id btree; - unsigned level; - struct bkey_buf key; - __le64 seq; - - struct work_struct work; - struct bio bio; -}; - -static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, - struct printbuf *err) -{ - unsigned written = 0; - - if (le64_to_cpu(data->magic) != bset_magic(c)) { - prt_printf(err, "bad magic: want %llx, got %llx", - bset_magic(c), le64_to_cpu(data->magic)); - return false; - } - - while (written < (ptr_written ?: btree_sectors(c))) { - struct btree_node_entry *bne; - struct bset *i; - bool first = !written; - - if (first) { - bne = NULL; - i = &data->keys; - } else { - bne = (void *) data + (written << 9); - i = &bne->keys; - - if (!ptr_written && i->seq != data->keys.seq) - break; - } - - struct nonce nonce = btree_nonce(i, written << 9); - bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - - if (first) { - if (good_csum_type) { - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); - if (bch2_crc_cmp(data->csum, csum)) { - bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); - return false; - } - } - - written += 
vstruct_sectors(data, c->block_bits);
-		} else {
-			if (good_csum_type) {
-				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-				if (bch2_crc_cmp(bne->csum, csum)) {
-					bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
-					return false;
-				}
-			}
-
-			written += vstruct_sectors(bne, c->block_bits);
-		}
-	}
-
-	return true;
-}
-
-static void btree_node_scrub_work(struct work_struct *work)
-{
-	struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
-	struct bch_fs *c = scrub->c;
-	struct printbuf err = PRINTBUF;
-
-	__bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
-				 bkey_i_to_s_c(scrub->key.k));
-	prt_newline(&err);
-
-	if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
-		int ret = bch2_trans_do(c,
-			bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
-						    scrub->key.k, 0));
-		if (!bch2_err_matches(ret, ENOENT) &&
-		    !bch2_err_matches(ret, EROFS))
-			bch_err_fn_ratelimited(c, ret);
-	}
-
-	printbuf_exit(&err);
-	bch2_bkey_buf_exit(&scrub->key, c);
-	btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
-	enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
-	kfree(scrub);
-	enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
-}
-
-static void btree_node_scrub_endio(struct bio *bio)
-{
-	struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
-
-	queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
-}
-
-int bch2_btree_node_scrub(struct btree_trans *trans,
-			  enum btree_id btree, unsigned level,
-			  struct bkey_s_c k, unsigned dev)
-{
-	if (k.k->type != KEY_TYPE_btree_ptr_v2)
-		return 0;
-
-	struct bch_fs *c = trans->c;
-
-	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
-		return bch_err_throw(c, erofs_no_writes);
-
-	struct extent_ptr_decoded pick;
-	int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
-	if (ret <= 0)
-		goto err;
-
-	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
-						BCH_DEV_READ_REF_btree_node_scrub);
-	if (!ca) {
-		ret = bch_err_throw(c, device_offline);
-		goto err;
-	}
-
-	bool used_mempool = false;
-	void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
-
-	unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
-
-	struct btree_node_scrub *scrub =
-		kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
-	if (!scrub) {
-		ret = -ENOMEM;
-		goto err_free;
-	}
-
-	scrub->c		= c;
-	scrub->ca		= ca;
-	scrub->buf		= buf;
-	scrub->used_mempool	= used_mempool;
-	scrub->written		= btree_ptr_sectors_written(k);
-
-	scrub->btree		= btree;
-	scrub->level		= level;
-	bch2_bkey_buf_init(&scrub->key);
-	bch2_bkey_buf_reassemble(&scrub->key, c, k);
-	scrub->seq		= bkey_s_c_to_btree_ptr_v2(k).v->seq;
-
-	INIT_WORK(&scrub->work, btree_node_scrub_work);
-
-	bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
-	bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
-	scrub->bio.bi_iter.bi_sector	= pick.ptr.offset;
-	scrub->bio.bi_end_io		= btree_node_scrub_endio;
-	submit_bio(&scrub->bio);
-	return 0;
-err_free:
-	btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
-	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
-err:
-	enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
-	return ret;
-}
-
-static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
-				      struct btree_write *w)
-{
-	unsigned long old, new;
-
-	old =
READ_ONCE(b->will_make_reachable); - do { - new = old; - if (!(old & 1)) - break; - - new &= ~1UL; - } while (!try_cmpxchg(&b->will_make_reachable, &old, new)); - - if (old & 1) - closure_put(&((struct btree_update *) new)->cl); - - bch2_journal_pin_drop(&c->journal, &w->journal); -} - -static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) -{ - struct btree_write *w = btree_prev_write(b); - unsigned long old, new; - unsigned type = 0; - - bch2_btree_complete_write(c, b, w); - - if (start_time) - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); - - old = READ_ONCE(b->flags); - do { - new = old; - - if ((old & (1U << BTREE_NODE_dirty)) && - (old & (1U << BTREE_NODE_need_write)) && - !(old & (1U << BTREE_NODE_never_write)) && - !(old & (1U << BTREE_NODE_write_blocked)) && - !(old & (1U << BTREE_NODE_will_make_reachable))) { - new &= ~(1U << BTREE_NODE_dirty); - new &= ~(1U << BTREE_NODE_need_write); - new |= (1U << BTREE_NODE_write_in_flight); - new |= (1U << BTREE_NODE_write_in_flight_inner); - new |= (1U << BTREE_NODE_just_written); - new ^= (1U << BTREE_NODE_write_idx); - - type = new & BTREE_WRITE_TYPE_MASK; - new &= ~BTREE_WRITE_TYPE_MASK; - } else { - new &= ~(1U << BTREE_NODE_write_in_flight); - new &= ~(1U << BTREE_NODE_write_in_flight_inner); - } - } while (!try_cmpxchg(&b->flags, &old, new)); - - if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); - else { - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); - } -} - -static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) -{ - struct btree_trans *trans = bch2_trans_get(c); - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - - /* we don't need transaction context anymore after we got the lock. */ - bch2_trans_put(trans); - __btree_node_write_done(c, b, start_time); - six_unlock_read(&b->c.lock); -} - -static void btree_node_write_work(struct work_struct *work) -{ - struct btree_write_bio *wbio = - container_of(work, struct btree_write_bio, work); - struct bch_fs *c = wbio->wbio.c; - struct btree *b = wbio->wbio.bio.bi_private; - u64 start_time = wbio->start_time; - int ret = 0; - - btree_bounce_free(c, - wbio->data_bytes, - wbio->wbio.used_mempool, - wbio->data); - - bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, - bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = bch_err_throw(c, btree_node_write_all_failed); - goto err; - } - - if (wbio->wbio.first_btree_write) { - if (wbio->wbio.failed.nr) { - - } - } else { - ret = bch2_trans_do(c, - bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, - BCH_WATERMARK_interior_updates| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw, - !wbio->wbio.failed.nr)); - if (ret) - goto err; - } -out: - async_object_list_del(c, btree_write_bio, wbio->list_idx); - bio_put(&wbio->wbio.bio); - btree_node_write_done(c, b, start_time); - return; -err: - set_btree_node_noevict(b); - - if (!bch2_err_matches(ret, EROFS)) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); - bch2_btree_pos_to_text(&buf, c, b); - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - } - goto out; -} - -static void btree_node_write_endio(struct bio *bio) -{ - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_write_bio *parent = wbio->split ? 
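/*
 * The flag transitions in __btree_node_write_done() above follow the
 * standard lock-free read/compute/compare-exchange loop: snapshot the
 * flags, derive the successor state, and retry the CAS if another thread
 * changed the flags in between. A minimal C11 model with hypothetical flag
 * names; if the node was redirtied while its write was in flight, the
 * caller is told to start another write.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum {
	F_DIRTY		= 1u << 0,
	F_NEED_WRITE	= 1u << 1,
	F_IN_FLIGHT	= 1u << 2,
};

static bool write_done(atomic_uint *flags)
{
	unsigned old = atomic_load(flags), new;

	do {
		new = old;
		if ((old & F_DIRTY) && (old & F_NEED_WRITE)) {
			/* redirtied mid-write: immediately start another */
			new &= ~(unsigned)(F_DIRTY | F_NEED_WRITE);
			new |= F_IN_FLIGHT;
		} else {
			new &= ~(unsigned)F_IN_FLIGHT;
		}
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return new & F_IN_FLIGHT;	/* caller restarts the write if set */
}

int main(void)
{
	atomic_uint flags = F_IN_FLIGHT | F_DIRTY | F_NEED_WRITE;

	printf("restart write: %d\n", write_done(&flags));	/* 1 */
	printf("restart write: %d\n", write_done(&flags));	/* 0 */
	return 0;
}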
wbio->parent : NULL; - struct bch_write_bio *orig = parent ?: wbio; - struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); - struct bch_fs *c = wbio->c; - struct btree *b = wbio->bio.bi_private; - struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, - wbio->submit_time, !bio->bi_status); - - if (ca && bio->bi_status) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "btree write error: %s\n ", - bch2_blk_status_to_str(bio->bi_status)); - bch2_btree_pos_to_text(&buf, c, b); - bch_err_dev_ratelimited(ca, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (bio->bi_status) { - unsigned long flags; - spin_lock_irqsave(&c->btree_write_error_lock, flags); - bch2_dev_list_add_dev(&orig->failed, wbio->dev); - spin_unlock_irqrestore(&c->btree_write_error_lock, flags); - } - - /* - * XXX: we should be using io_ref[WRITE], but we aren't retrying failed - * btree writes yet (due to device removal/ro): - */ - if (wbio->have_ioref) - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_node_write); - - if (parent) { - bio_put(bio); - bio_endio(&parent->bio); - return; - } - - clear_btree_node_write_in_flight_inner(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); - INIT_WORK(&wb->work, btree_node_write_work); - queue_work(c->btree_write_complete_wq, &wb->work); -} - -static int validate_bset_for_write(struct bch_fs *c, struct btree *b, - struct bset *i) -{ - int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level + 1, - .btree = b->c.btree_id, - .flags = BCH_VALIDATE_write, - }); - if (ret) { - bch2_fs_inconsistent(c, "invalid btree node key before write"); - return ret; - } - - ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: - validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL); - if (ret) { - bch2_inconsistent_error(c); - dump_stack(); - } - - return ret; -} - -static void btree_write_submit(struct work_struct *work) -{ - struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); - BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - - bkey_copy(&tmp.k, &wbio->key); - - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) - ptr->offset += wbio->sector_offset; - - bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, - &tmp.k, false); -} - -void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) -{ - struct btree_write_bio *wbio; - struct bset *i; - struct btree_node *bn = NULL; - struct btree_node_entry *bne = NULL; - struct sort_iter_stack sort_iter; - struct nonce nonce; - unsigned bytes_to_write, sectors_to_write, bytes, u64s; - u64 seq = 0; - bool used_mempool; - unsigned long old, new; - bool validate_before_checksum = false; - enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; - void *data; - u64 start_time = local_clock(); - int ret; - - if (flags & BTREE_WRITE_ALREADY_STARTED) - goto do_write; - - /* - * We may only have a read lock on the btree node - the dirty bit is our - * "lock" against racing with other threads that may be trying to start - * a write, we do a write iff we clear the dirty bit. 
Since setting the - * dirty bit requires a write lock, we can't race with other threads - * redirtying it: - */ - old = READ_ONCE(b->flags); - do { - new = old; - - if (!(old & (1 << BTREE_NODE_dirty))) - return; - - if ((flags & BTREE_WRITE_ONLY_IF_NEED) && - !(old & (1 << BTREE_NODE_need_write))) - return; - - if (old & - ((1 << BTREE_NODE_never_write)| - (1 << BTREE_NODE_write_blocked))) - return; - - if (b->written && - (old & (1 << BTREE_NODE_will_make_reachable))) - return; - - if (old & (1 << BTREE_NODE_write_in_flight)) - return; - - if (flags & BTREE_WRITE_ONLY_IF_NEED) - type = new & BTREE_WRITE_TYPE_MASK; - new &= ~BTREE_WRITE_TYPE_MASK; - - new &= ~(1 << BTREE_NODE_dirty); - new &= ~(1 << BTREE_NODE_need_write); - new |= (1 << BTREE_NODE_write_in_flight); - new |= (1 << BTREE_NODE_write_in_flight_inner); - new |= (1 << BTREE_NODE_just_written); - new ^= (1 << BTREE_NODE_write_idx); - } while (!try_cmpxchg_acquire(&b->flags, &old, new)); - - if (new & (1U << BTREE_NODE_need_write)) - return; -do_write: - BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); - - atomic_long_dec(&c->btree_cache.nr_dirty); - - BUG_ON(btree_node_fake(b)); - BUG_ON((b->will_make_reachable != 0) != !b->written); - - BUG_ON(b->written >= btree_sectors(c)); - BUG_ON(b->written & (block_sectors(c) - 1)); - BUG_ON(bset_written(b, btree_bset_last(b))); - BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); - BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); - - bch2_sort_whiteouts(c, b); - - sort_iter_stack_init(&sort_iter, b); - - bytes = !b->written - ? sizeof(struct btree_node) - : sizeof(struct btree_node_entry); - - bytes += b->whiteout_u64s * sizeof(u64); - - for_each_bset(b, t) { - i = bset(b, t); - - if (bset_written(b, i)) - continue; - - bytes += le16_to_cpu(i->u64s) * sizeof(u64); - sort_iter_add(&sort_iter.iter, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - seq = max(seq, le64_to_cpu(i->journal_seq)); - } - - BUG_ON(b->written && !seq); - - /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ - bytes += 8; - - /* buffer must be a multiple of the block size */ - bytes = round_up(bytes, block_bytes(c)); - - data = btree_bounce_alloc(c, bytes, &used_mempool); - - if (!b->written) { - bn = data; - *bn = *b->data; - i = &bn->keys; - } else { - bne = data; - bne->keys = b->data->keys; - i = &bne->keys; - } - - i->journal_seq = cpu_to_le64(seq); - i->u64s = 0; - - sort_iter_add(&sort_iter.iter, - unwritten_whiteouts_start(b), - unwritten_whiteouts_end(b)); - SET_BSET_SEPARATE_WHITEOUTS(i, false); - - u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter); - le16_add_cpu(&i->u64s, u64s); - - b->whiteout_u64s = 0; - - BUG_ON(!b->written && i->u64s != b->data->keys.u64s); - - set_needs_whiteout(i, false); - - /* do we have data to write? 
*/ - if (b->written && !i->u64s) - goto nowrite; - - bytes_to_write = vstruct_end(i) - data; - sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; - - if (!b->written && - b->key.k.type == KEY_TYPE_btree_ptr_v2) - BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write); - - memset(data + bytes_to_write, 0, - (sectors_to_write << 9) - bytes_to_write); - - BUG_ON(b->written + sectors_to_write > btree_sectors(c)); - BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); - BUG_ON(i->seq != b->data->keys.seq); - - i->version = cpu_to_le16(c->sb.version); - SET_BSET_OFFSET(i, b->written); - SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); - - if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) - validate_before_checksum = true; - - /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) - validate_before_checksum = true; - - /* if we're going to be encrypting, check metadata validity first: */ - if (validate_before_checksum && - validate_bset_for_write(c, b, i)) - goto err; - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "encrypting btree node: %s", bch2_err_str(ret))) - goto err; - - nonce = btree_nonce(i, b->written << 9); - - if (bn) - bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); - else - bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - - /* if we're not encrypting, check metadata after checksumming: */ - if (!validate_before_checksum && - validate_bset_for_write(c, b, i)) - goto err; - - /* - * We handle btree write errors by immediately halting the journal - - * after we've done that, we can't issue any subsequent btree writes - * because they might have pointers to new nodes that failed to write. - * - * Furthermore, there's no point in doing any more btree writes because - * with the journal stopped, we're never going to update the journal to - * reflect that those writes were done and the data flushed from the - * journal: - * - * Also on journal error, the pending write may have updates that were - * never journalled (interior nodes, see btree_update_nodes_written()) - - * it's critical that we don't do the write in that case otherwise we - * will have updates visible that weren't in the journal: - * - * Make sure to update b->written so bch2_btree_init_next() doesn't - * break: - */ - if (bch2_journal_error(&c->journal) || - c->opts.nochanges) - goto err; - - trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); - - wbio = container_of(bio_alloc_bioset(NULL, - buf_pages(data, sectors_to_write << 9), - REQ_OP_WRITE|REQ_META, - GFP_NOFS, - &c->btree_bio), - struct btree_write_bio, wbio.bio); - wbio_init(&wbio->wbio.bio); - wbio->data = data; - wbio->data_bytes = bytes; - wbio->sector_offset = b->written; - wbio->start_time = start_time; - wbio->wbio.c = c; - wbio->wbio.used_mempool = used_mempool; - wbio->wbio.first_btree_write = !b->written; - wbio->wbio.bio.bi_end_io = btree_node_write_endio; - wbio->wbio.bio.bi_private = b; - - bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); - - bkey_copy(&wbio->key, &b->key); - - b->written += sectors_to_write; - - if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = - cpu_to_le16(b->written); - - atomic64_inc(&c->btree_write_stats[type].nr); - atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); - - async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx); - - INIT_WORK(&wbio->work, btree_write_submit); - 
queue_work(c->btree_write_submit_wq, &wbio->work); - return; -err: - set_btree_node_noevict(b); - b->written += sectors_to_write; -nowrite: - btree_bounce_free(c, bytes, used_mempool, data); - __btree_node_write_done(c, b, 0); -} - -/* - * Work that must be done with write lock held: - */ -bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) -{ - bool invalidated_iter = false; - struct btree_node_entry *bne; - - if (!btree_node_just_written(b)) - return false; - - BUG_ON(b->whiteout_u64s); - - clear_btree_node_just_written(b); - - /* - * Note: immediately after write, bset_written() doesn't work - the - * amount of data we had to write after compaction might have been - * smaller than the offset of the last bset. - * - * However, we know that all bsets have been written here, as long as - * we're still holding the write lock: - */ - - /* - * XXX: decide if we really want to unconditionally sort down to a - * single bset: - */ - if (b->nsets > 1) { - btree_node_sort(c, b, 0, b->nsets); - invalidated_iter = true; - } else { - invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); - } - - for_each_bset(b, t) - set_needs_whiteout(bset(b, t), true); - - bch2_btree_verify(c, b); - - /* - * If later we don't unconditionally sort down to a single bset, we have - * to ensure this is still true: - */ - BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); - - bne = want_new_bset(c, b); - if (bne) - bch2_bset_init_next(b, bne); - - bch2_btree_build_aux_trees(b); - - return invalidated_iter; -} - -/* - * Use this one if the node is intent locked: - */ -void bch2_btree_node_write(struct bch_fs *c, struct btree *b, - enum six_lock_type lock_type_held, - unsigned flags) -{ - if (lock_type_held == SIX_LOCK_intent || - (lock_type_held == SIX_LOCK_read && - six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b, flags); - - /* don't cycle lock unnecessarily: */ - if (btree_node_just_written(b) && - six_trylock_write(&b->c.lock)) { - bch2_btree_post_write_cleanup(c, b); - six_unlock_write(&b->c.lock); - } - - if (lock_type_held == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); - } else { - __bch2_btree_node_write(c, b, flags); - if (lock_type_held == SIX_LOCK_write && - btree_node_just_written(b)) - bch2_btree_post_write_cleanup(c, b); - } -} - -void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b, - enum six_lock_type lock_type_held, - unsigned flags) -{ - struct bch_fs *c = trans->c; - - if (lock_type_held == SIX_LOCK_intent || - (lock_type_held == SIX_LOCK_read && - six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b, flags); - - /* don't cycle lock unnecessarily: */ - if (btree_node_just_written(b) && - six_trylock_write(&b->c.lock)) { - bch2_btree_post_write_cleanup(c, b); - __bch2_btree_node_unlock_write(trans, b); - } - - if (lock_type_held == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); - } else { - __bch2_btree_node_write(c, b, flags); - if (lock_type_held == SIX_LOCK_write && - btree_node_just_written(b)) - bch2_btree_post_write_cleanup(c, b); - } -} - -static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - unsigned i; - bool ret = false; -restart: - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - if (test_bit(flag, &b->flags)) { - rcu_read_unlock(); - wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); - ret = true; - goto restart; - } - rcu_read_unlock(); - - return ret; -} - -bool 
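/*
 * __bch2_btree_flush_all() above uses a scan-and-restart pattern: walk every
 * cached node under RCU, and as soon as one is found with the in-flight bit
 * set, drop RCU, sleep until the bit clears, and restart the walk from the
 * top, since the set of cached nodes may have changed while sleeping. A toy
 * single-threaded model of that control flow; wait_for_bit() stands in for
 * wait_on_bit_io().
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

static bool in_flight[NR_NODES];

static void wait_for_bit(int i)
{
	in_flight[i] = false;	/* stand-in for wait_on_bit_io() */
}

static bool flush_all(void)
{
	bool waited = false;
restart:
	for (int i = 0; i < NR_NODES; i++)
		if (in_flight[i]) {
			wait_for_bit(i);
			waited = true;
			goto restart;	/* re-scan: the set may have changed */
		}
	return waited;
}

int main(void)
{
	in_flight[1] = in_flight[3] = true;
	printf("waited: %d\n", flush_all());	/* 1 */
	return 0;
}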
bch2_btree_flush_all_reads(struct bch_fs *c) -{ - return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); -} - -bool bch2_btree_flush_all_writes(struct bch_fs *c) -{ - return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); -} - -static const char * const bch2_btree_write_types[] = { -#define x(t, n) [n] = #t, - BCH_BTREE_WRITE_TYPES() - NULL -}; - -void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) -{ - printbuf_tabstop_push(out, 20); - printbuf_tabstop_push(out, 10); - - prt_printf(out, "\tnr\tsize\n"); - - for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { - u64 nr = atomic64_read(&c->btree_write_stats[i].nr); - u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); - - prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr); - prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); - prt_newline(out); - } -} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h deleted file mode 100644 index 30a5180532c8d8..00000000000000 --- a/fs/bcachefs/btree_io.h +++ /dev/null @@ -1,239 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_IO_H -#define _BCACHEFS_BTREE_IO_H - -#include "bkey_methods.h" -#include "bset.h" -#include "btree_locking.h" -#include "checksum.h" -#include "extents.h" -#include "io_write_types.h" - -struct bch_fs; -struct btree_write; -struct btree; -struct btree_iter; -struct btree_node_read_all; - -static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) -{ - if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) - atomic_long_inc(&c->btree_cache.nr_dirty); -} - -static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) -{ - if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) - atomic_long_dec(&c->btree_cache.nr_dirty); -} - -static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k) -{ - return k.k->type == KEY_TYPE_btree_ptr_v2 - ? 
le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written) - : 0; -} - -struct btree_read_bio { - struct bch_fs *c; - struct btree *b; - struct btree_node_read_all *ra; - u64 start_time; - unsigned have_ioref:1; - unsigned idx:7; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - struct extent_ptr_decoded pick; - struct work_struct work; - struct bio bio; -}; - -struct btree_write_bio { - struct work_struct work; - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - void *data; - unsigned data_bytes; - unsigned sector_offset; - u64 start_time; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - struct bch_write_bio wbio; -}; - -void bch2_btree_node_io_unlock(struct btree *); -void bch2_btree_node_io_lock(struct btree *); -void __bch2_btree_node_wait_on_read(struct btree *); -void __bch2_btree_node_wait_on_write(struct btree *); -void bch2_btree_node_wait_on_read(struct btree *); -void bch2_btree_node_wait_on_write(struct btree *); - -enum compact_mode { - COMPACT_LAZY, - COMPACT_ALL, -}; - -bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, - enum compact_mode); - -static inline bool should_compact_bset_lazy(struct btree *b, - struct bset_tree *t) -{ - unsigned total_u64s = bset_u64s(t); - unsigned dead_u64s = bset_dead_u64s(b, t); - - return dead_u64s > 64 && dead_u64s * 3 > total_u64s; -} - -static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) -{ - for_each_bset(b, t) - if (should_compact_bset_lazy(b, t)) - return bch2_compact_whiteouts(c, b, COMPACT_LAZY); - - return false; -} - -static inline struct nonce btree_nonce(struct bset *i, unsigned offset) -{ - return (struct nonce) {{ - [0] = cpu_to_le32(offset), - [1] = ((__le32 *) &i->seq)[0], - [2] = ((__le32 *) &i->seq)[1], - [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, - }}; -} - -static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) -{ - struct nonce nonce = btree_nonce(i, offset); - int ret; - - if (!offset) { - struct btree_node *bn = container_of(i, struct btree_node, keys); - unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - - ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, - &bn->flags, bytes); - if (ret) - return ret; - - nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); - } - - return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, - vstruct_end(i) - (void *) i->_data); -} - -void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); - -void bch2_btree_node_drop_keys_outside_node(struct btree *); - -void bch2_btree_build_aux_trees(struct btree *); -void bch2_btree_init_next(struct btree_trans *, struct btree *); - -int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, - struct btree *, - struct bch_io_failures *, - struct printbuf *); -void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); -int bch2_btree_root_read(struct bch_fs *, enum btree_id, - const struct bkey_i *, unsigned); - -void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *); - -int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, unsigned); - -bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); - -enum btree_write_flags { - __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, - __BTREE_WRITE_ALREADY_STARTED, -}; -#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED) -#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED) - -void __bch2_btree_node_write(struct bch_fs *, 
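/*
 * btree_nonce() above builds a per-bset nonce from the bset's byte offset
 * within the node plus its seq and journal_seq fields, so identical
 * plaintext at different offsets (or in different nodes) never encrypts
 * under the same nonce. A standalone model; the field layout is simplified
 * and NONCE_TYPE_BTREE is a hypothetical stand-in for BCH_NONCE_BTREE (the
 * real code also keeps the words little-endian).
 */
#include <stdint.h>
#include <stdio.h>

#define NONCE_TYPE_BTREE 0x10000000u

struct nonce128 {
	uint32_t d[4];
};

static struct nonce128 make_btree_nonce(unsigned offset, uint64_t seq,
					uint64_t journal_seq)
{
	return (struct nonce128) {
		.d = {
			offset,				/* position in node */
			(uint32_t)seq,			/* node seq, low */
			(uint32_t)(seq >> 32),		/* node seq, high */
			(uint32_t)journal_seq ^ NONCE_TYPE_BTREE,
		},
	};
}

int main(void)
{
	struct nonce128 a = make_btree_nonce(0,   42, 7);
	struct nonce128 b = make_btree_nonce(512, 42, 7);

	printf("nonces differ: %d\n", a.d[0] != b.d[0]);	/* 1 */
	return 0;
}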
-void bch2_btree_node_write(struct bch_fs *, struct btree *,
-			   enum six_lock_type, unsigned);
-void bch2_btree_node_write_trans(struct btree_trans *, struct btree *,
-				 enum six_lock_type, unsigned);
-
-static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b,
-					    enum six_lock_type lock_held)
-{
-	bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
-}
-
-bool bch2_btree_flush_all_reads(struct bch_fs *);
-bool bch2_btree_flush_all_writes(struct bch_fs *);
-
-static inline void compat_bformat(unsigned level, enum btree_id btree_id,
-				  unsigned version, unsigned big_endian,
-				  int write, struct bkey_format *f)
-{
-	if (version < bcachefs_metadata_version_inode_btree_change &&
-	    btree_id == BTREE_ID_inodes) {
-		swap(f->bits_per_field[BKEY_FIELD_INODE],
-		     f->bits_per_field[BKEY_FIELD_OFFSET]);
-		swap(f->field_offset[BKEY_FIELD_INODE],
-		     f->field_offset[BKEY_FIELD_OFFSET]);
-	}
-
-	if (version < bcachefs_metadata_version_snapshot &&
-	    (level || btree_type_has_snapshots(btree_id))) {
-		u64 max_packed =
-			~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
-		f->field_offset[BKEY_FIELD_SNAPSHOT] = write
-			? 0
-			: cpu_to_le64(U32_MAX - max_packed);
-	}
-}
-
-static inline void compat_bpos(unsigned level, enum btree_id btree_id,
-			       unsigned version, unsigned big_endian,
-			       int write, struct bpos *p)
-{
-	if (big_endian != CPU_BIG_ENDIAN)
-		bch2_bpos_swab(p);
-
-	if (version < bcachefs_metadata_version_inode_btree_change &&
-	    btree_id == BTREE_ID_inodes)
-		swap(p->inode, p->offset);
-}
-
-static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
-				     unsigned version, unsigned big_endian,
-				     int write,
-				     struct btree_node *bn)
-{
-	if (version < bcachefs_metadata_version_inode_btree_change &&
-	    btree_id_is_extents(btree_id) &&
-	    !bpos_eq(bn->min_key, POS_MIN) &&
-	    write)
-		bn->min_key = bpos_nosnap_predecessor(bn->min_key);
-
-	if (version < bcachefs_metadata_version_snapshot &&
-	    write)
-		bn->max_key.snapshot = 0;
-
-	compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
-	compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
-
-	if (version < bcachefs_metadata_version_snapshot &&
-	    !write)
-		bn->max_key.snapshot = U32_MAX;
-
-	if (version < bcachefs_metadata_version_inode_btree_change &&
-	    btree_id_is_extents(btree_id) &&
-	    !bpos_eq(bn->min_key, POS_MIN) &&
-	    !write)
-		bn->min_key = bpos_nosnap_successor(bn->min_key);
-}
-
-void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
deleted file mode 100644
index f8829b667ad35e..00000000000000
--- a/fs/bcachefs/btree_iter.c
+++ /dev/null
@@ -1,3804 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "super.h"
-#include "trace.h"
-
-#include <linux/prandom.h>
-#include <linux/prefetch.h>
-
-static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
-static inline void btree_path_list_add(struct btree_trans *,
-				       btree_path_idx_t, btree_path_idx_t);
-
-static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
-{
-#ifdef TRACK_PATH_ALLOCATED
-	return iter->ip_allocated;
-#else
-	return 0;
-#endif
-}
-
-static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
-static void bch2_trans_srcu_lock(struct btree_trans *);
-
-static inline int __btree_path_cmp(const struct btree_path *l,
-				   enum btree_id r_btree_id,
-				   bool r_cached,
-				   struct bpos r_pos,
-				   unsigned r_level)
-{
-	/*
-	 * Must match lock ordering as defined by __bch2_btree_node_lock:
-	 */
-	return cmp_int(l->btree_id, r_btree_id) ?:
-		cmp_int((int) l->cached, (int) r_cached) ?:
-		bpos_cmp(l->pos, r_pos) ?:
-		-cmp_int(l->level, r_level);
-}
-
-static inline int btree_path_cmp(const struct btree_path *l,
-				 const struct btree_path *r)
-{
-	return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
-}
-
-static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
-{
-	/* Are we iterating over keys in all snapshots? */
-	if (iter->flags & BTREE_ITER_all_snapshots) {
-		p = bpos_successor(p);
-	} else {
-		p = bpos_nosnap_successor(p);
-		p.snapshot = iter->snapshot;
-	}
-
-	return p;
-}
-
-static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
-{
-	/* Are we iterating over keys in all snapshots? */
-	if (iter->flags & BTREE_ITER_all_snapshots) {
-		p = bpos_predecessor(p);
-	} else {
-		p = bpos_nosnap_predecessor(p);
-		p.snapshot = iter->snapshot;
-	}
-
-	return p;
-}
-
-static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
-{
-	struct bpos pos = iter->pos;
-
-	if ((iter->flags & BTREE_ITER_is_extents) &&
-	    !bkey_eq(pos, POS_MAX))
-		pos = bkey_successor(iter, pos);
-	return pos;
-}
-
-static inline bool btree_path_pos_before_node(struct btree_path *path,
-					      struct btree *b)
-{
-	return bpos_lt(path->pos, b->data->min_key);
-}
-
-static inline bool btree_path_pos_after_node(struct btree_path *path,
-					     struct btree *b)
-{
-	return bpos_gt(path->pos, b->key.k.p);
-}
-
-static inline bool btree_path_pos_in_node(struct btree_path *path,
-					  struct btree *b)
-{
-	return path->btree_id == b->c.btree_id &&
-		!btree_path_pos_before_node(path, b) &&
-		!btree_path_pos_after_node(path, b);
-}
-
-/* Debug: */
-
-static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
-					    struct btree_path *path)
-{
-	struct bkey_cached *ck;
-	bool locked = btree_node_locked(path, 0);
-
-	if (!bch2_btree_node_relock(trans, path, 0))
-		return;
-
-	ck = (void *) path->l[0].b;
-	BUG_ON(ck->key.btree_id != path->btree_id ||
-	       !bkey_eq(ck->key.pos, path->pos));
-
-	if (!locked)
-		btree_node_unlock(trans, path, 0);
-}
-
-static void __bch2_btree_path_verify_level(struct btree_trans *trans,
-					   struct btree_path *path, unsigned level)
-{
-	struct btree_path_level *l;
-	struct btree_node_iter tmp;
-	bool locked;
-	struct bkey_packed *p, *k;
-	struct printbuf buf1 = PRINTBUF;
-	struct printbuf buf2 = PRINTBUF;
-	struct printbuf buf3 = PRINTBUF;
-	const char *msg;
-
-	l = &path->l[level];
-	tmp = l->iter;
-	locked = btree_node_locked(path, level);
-
-	if (path->cached) {
-		if (!level)
-			__bch2_btree_path_verify_cached(trans, path);
-		return;
-	}
-
-	if (!btree_path_node(path, level))
-		return;
-
-	if (!bch2_btree_node_relock_notrace(trans, path, level))
-		return;
-
-	BUG_ON(!btree_path_pos_in_node(path, l->b));
-
-	bch2_btree_node_iter_verify(&l->iter, l->b);
-
-	/*
-	 * For interior nodes, the iterator will have skipped past deleted keys:
-	 */
-	p = level
-		? bch2_btree_node_iter_prev(&tmp, l->b)
-		: bch2_btree_node_iter_prev_all(&tmp, l->b);
-	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-
-	if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
-		msg = "before";
-		goto err;
-	}
-
-	if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
-		msg = "after";
-		goto err;
-	}
-
-	if (!locked)
-		btree_node_unlock(trans, path, level);
-	return;
-err:
-	bch2_bpos_to_text(&buf1, path->pos);
-
-	if (p) {
-		struct bkey uk = bkey_unpack_key(l->b, p);
-
-		bch2_bkey_to_text(&buf2, &uk);
-	} else {
-		prt_printf(&buf2, "(none)");
-	}
-
-	if (k) {
-		struct bkey uk = bkey_unpack_key(l->b, k);
-
-		bch2_bkey_to_text(&buf3, &uk);
-	} else {
-		prt_printf(&buf3, "(none)");
-	}
-
-	panic("path should be %s key at level %u:\n"
-	      "path pos %s\n"
-	      "prev key %s\n"
-	      "cur key %s\n",
-	      msg, level, buf1.buf, buf2.buf, buf3.buf);
-}
-
-static void __bch2_btree_path_verify(struct btree_trans *trans,
-				     struct btree_path *path)
-{
-	struct bch_fs *c = trans->c;
-
-	for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
-		if (!path->l[i].b) {
-			BUG_ON(!path->cached &&
-			       bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
-			break;
-		}
-
-		__bch2_btree_path_verify_level(trans, path, i);
-	}
-
-	bch2_btree_path_verify_locks(trans, path);
-}
-
-void __bch2_trans_verify_paths(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned iter;
-
-	trans_for_each_path(trans, path, iter)
-		__bch2_btree_path_verify(trans, path);
-}
-
-static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
-{
-	BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
-
-	BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
-	       (iter->flags & BTREE_ITER_all_snapshots));
-
-	BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
-	       (iter->flags & BTREE_ITER_all_snapshots) &&
-	       !btree_type_has_snapshot_field(iter->btree_id));
-
-	if (iter->update_path)
-		__bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
-	__bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
-}
-
-static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
-	BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
-	       !iter->pos.snapshot);
-
-	BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
-	       iter->pos.snapshot != iter->snapshot);
-
-	BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) :
-	       !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) :
-	       (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
-		bkey_gt(iter->pos, iter->k.p)));
-}
-
-static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
-					struct btree_iter *iter, struct bkey_s_c k)
-{
-	struct btree_iter copy;
-	struct bkey_s_c prev;
-	int ret = 0;
-
-	if (!(iter->flags & BTREE_ITER_filter_snapshots))
-		return 0;
-
-	if (bkey_err(k) || !k.k)
-		return 0;
-
-	BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
-					  iter->snapshot,
-					  k.k->p.snapshot));
-
-	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
-			     BTREE_ITER_nopreserve|
-			     BTREE_ITER_all_snapshots);
-	prev = bch2_btree_iter_prev(trans, &copy);
-	if (!prev.k)
-		goto out;
-
-	ret = bkey_err(prev);
-	if (ret)
-		goto out;
-
-	if (bkey_eq(prev.k->p, k.k->p) &&
-	    bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
-				      prev.k->p.snapshot) > 0) {
-		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
-		bch2_bkey_to_text(&buf1, k.k);
-		bch2_bkey_to_text(&buf2, prev.k);
-
-		panic("iter snap %u\n"
-		      "k %s\n"
-		      "prev %s\n",
-		      iter->snapshot,
-		      buf1.buf, buf2.buf);
-	}
-out:
-	bch2_trans_iter_exit(trans, &copy);
-	return ret;
-}
-
-void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
-			      struct bpos pos)
-{
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-	struct btree_path *path;
-	struct trans_for_each_path_inorder_iter iter;
-	struct printbuf buf = PRINTBUF;
-
-	btree_trans_sort_paths(trans);
-
-	trans_for_each_path_inorder(trans, path, iter) {
-		if (path->btree_id != id ||
-		    !btree_node_locked(path, 0) ||
-		    !path->should_be_locked)
-			continue;
-
-		if (!path->cached) {
-			if (bkey_ge(pos, path->l[0].b->data->min_key) &&
-			    bkey_le(pos, path->l[0].b->key.k.p))
-				return;
-		} else {
-			if (bkey_eq(pos, path->pos))
-				return;
-		}
-	}
-
-	bch2_dump_trans_paths_updates(trans);
-	bch2_bpos_to_text(&buf, pos);
-
-	panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
-}
-
-static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
-						struct btree_path *path, unsigned l)
-{
-	if (static_branch_unlikely(&bch2_debug_check_iterators))
-		__bch2_btree_path_verify_level(trans, path, l);
-}
-
-static inline void bch2_btree_path_verify(struct btree_trans *trans,
-					  struct btree_path *path)
-{
-	if (static_branch_unlikely(&bch2_debug_check_iterators))
-		__bch2_btree_path_verify(trans, path);
-}
-
-static inline void bch2_btree_iter_verify(struct btree_trans *trans,
-					  struct btree_iter *iter)
-{
-	if (static_branch_unlikely(&bch2_debug_check_iterators))
-		__bch2_btree_iter_verify(trans, iter);
-}
-
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
-	if (static_branch_unlikely(&bch2_debug_check_iterators))
-		__bch2_btree_iter_verify_entry_exit(iter);
-}
-
-static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
-					     struct bkey_s_c k)
-{
-	return static_branch_unlikely(&bch2_debug_check_iterators)
-		? __bch2_btree_iter_verify_ret(trans, iter, k)
-		: 0;
-}
-
-/* Btree path: fixups after btree updates */
-
-static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
-					struct btree *b,
-					struct bset_tree *t,
-					struct bkey_packed *k)
-{
-	struct btree_node_iter_set *set;
-
-	btree_node_iter_for_each(iter, set)
-		if (set->end == t->end_offset) {
-			set->k = __btree_node_key_to_offset(b, k);
-			bch2_btree_node_iter_sort(iter, b);
-			return;
-		}
-
-	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
-}
-
-static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
-					       struct btree *b,
-					       struct bkey_packed *where)
-{
-	struct btree_path_level *l = &path->l[b->c.level];
-
-	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
-		return;
-
-	if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
-		bch2_btree_node_iter_advance(&l->iter, l->b);
-}
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
-				      struct btree *b,
-				      struct bkey_packed *where)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path_with_node(trans, b, path, i) {
-		__bch2_btree_path_fix_key_modified(path, b, where);
-		bch2_btree_path_verify_level(trans, path, b->c.level);
-	}
-}
-
-static void __bch2_btree_node_iter_fix(struct btree_path *path,
-				       struct btree *b,
-				       struct btree_node_iter *node_iter,
-				       struct bset_tree *t,
-				       struct bkey_packed *where,
-				       unsigned clobber_u64s,
-				       unsigned new_u64s)
-{
-	const struct bkey_packed *end = btree_bkey_last(b, t);
-	struct btree_node_iter_set *set;
-	unsigned offset = __btree_node_key_to_offset(b, where);
-	int shift = new_u64s - clobber_u64s;
-	unsigned old_end = t->end_offset - shift;
-	unsigned orig_iter_pos = node_iter->data[0].k;
-	bool iter_current_key_modified =
-		orig_iter_pos >= offset &&
-		orig_iter_pos <= offset + clobber_u64s;
-
-	btree_node_iter_for_each(node_iter, set)
-		if (set->end == old_end)
-			goto found;
-
-	/* didn't find the bset in the iterator - might have to re-add it: */
-	if (new_u64s &&
-	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
-		bch2_btree_node_iter_push(node_iter, b, where, end);
-		goto fixup_done;
-	} else {
-		/* Iterator is after key that changed */
-		return;
-	}
-found:
-	set->end = t->end_offset;
-
-	/* Iterator hasn't gotten to the key that changed yet: */
-	if (set->k < offset)
-		return;
-
-	if (new_u64s &&
-	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
-		set->k = offset;
-	} else if (set->k < offset + clobber_u64s) {
-		set->k = offset + new_u64s;
-		if (set->k == set->end)
-			bch2_btree_node_iter_set_drop(node_iter, set);
-	} else {
-		/* Iterator is after key that changed */
-		set->k = (int) set->k + shift;
-		return;
-	}
-
-	bch2_btree_node_iter_sort(node_iter, b);
-fixup_done:
-	if (node_iter->data[0].k != orig_iter_pos)
-		iter_current_key_modified = true;
-
-	/*
-	 * When a new key is added, and the node iterator now points to that
-	 * key, the iterator might have skipped past deleted keys that should
-	 * come after the key the iterator now points to. We have to rewind to
-	 * before those deleted keys - otherwise
-	 * bch2_btree_node_iter_prev_all() breaks:
-	 */
-	if (!bch2_btree_node_iter_end(node_iter) &&
-	    iter_current_key_modified &&
-	    b->c.level) {
-		struct bkey_packed *k, *k2, *p;
-
-		k = bch2_btree_node_iter_peek_all(node_iter, b);
-
-		for_each_bset(b, t) {
-			bool set_pos = false;
-
-			if (node_iter->data[0].end == t->end_offset)
-				continue;
-
-			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
-
-			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
-			       bkey_iter_cmp(b, k, p) < 0) {
-				k2 = p;
-				set_pos = true;
-			}
-
-			if (set_pos)
-				btree_node_iter_set_set_pos(node_iter,
-							    b, t, k2);
-		}
-	}
-}
-
-void bch2_btree_node_iter_fix(struct btree_trans *trans,
-			      struct btree_path *path,
-			      struct btree *b,
-			      struct btree_node_iter *node_iter,
-			      struct bkey_packed *where,
-			      unsigned clobber_u64s,
-			      unsigned new_u64s)
-{
-	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
-	struct btree_path *linked;
-	unsigned i;
-
-	if (node_iter != &path->l[b->c.level].iter) {
-		__bch2_btree_node_iter_fix(path, b, node_iter, t,
-					   where, clobber_u64s, new_u64s);
-
-		if (static_branch_unlikely(&bch2_debug_check_iterators))
-			bch2_btree_node_iter_verify(node_iter, b);
-	}
-
-	trans_for_each_path_with_node(trans, b, linked, i) {
-		__bch2_btree_node_iter_fix(linked, b,
-					   &linked->l[b->c.level].iter, t,
-					   where, clobber_u64s, new_u64s);
-		bch2_btree_path_verify_level(trans, linked, b->c.level);
-	}
-}
-
-/* Btree path level: pointer to a particular btree node and node iter */
-
-static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
-						  struct btree_path_level *l,
-						  struct bkey *u,
-						  struct bkey_packed *k)
-{
-	if (unlikely(!k)) {
-		/*
-		 * signal to bch2_btree_iter_peek_slot() that we're currently at
-		 * a hole
-		 */
-		u->type = KEY_TYPE_deleted;
-		return bkey_s_c_null;
-	}
-
-	return bkey_disassemble(l->b, k, u);
-}
-
-static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
-							struct btree_path_level *l,
-							struct bkey *u)
-{
-	return __btree_iter_unpack(c, l, u,
-			bch2_btree_node_iter_peek_all(&l->iter, l->b));
-}
-
-static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
-						    struct btree_path *path,
-						    struct btree_path_level *l,
-						    struct bkey *u)
-{
-	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
-			bch2_btree_node_iter_prev(&l->iter, l->b));
-
-	path->pos = k.k ? k.k->p : l->b->data->min_key;
-	trans->paths_sorted = false;
-	bch2_btree_path_verify_level(trans, path, l - path->l);
-	return k;
-}
-
-static inline bool btree_path_advance_to_pos(struct btree_path *path,
-					     struct btree_path_level *l,
-					     int max_advance)
-{
-	struct bkey_packed *k;
-	int nr_advanced = 0;
-
-	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
-	       bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
-		if (max_advance > 0 && nr_advanced >= max_advance)
-			return false;
-
-		bch2_btree_node_iter_advance(&l->iter, l->b);
-		nr_advanced++;
-	}
-
-	return true;
-}
-
-static inline void __btree_path_level_init(struct btree_path *path,
-					   unsigned level)
-{
-	struct btree_path_level *l = &path->l[level];
-
-	bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
-	/*
-	 * Iterators to interior nodes should always be pointed at the first non
-	 * whiteout:
-	 */
-	if (level)
-		bch2_btree_node_iter_peek(&l->iter, l->b);
-}
-
-void bch2_btree_path_level_init(struct btree_trans *trans,
-				struct btree_path *path,
-				struct btree *b)
-{
-	BUG_ON(path->cached);
-
-	EBUG_ON(!btree_path_pos_in_node(path, b));
-
-	path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
-	path->l[b->c.level].b = b;
-	__btree_path_level_init(path, b->c.level);
-}
-
-/* Btree path: fixups after btree node updates: */
-
-static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
-{
-	struct bch_fs *c = trans->c;
-
-	trans_for_each_update(trans, i)
-		if (!i->cached &&
-		    i->level == b->c.level &&
-		    i->btree_id == b->c.btree_id &&
-		    bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
-		    bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
-			i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
-
-			if (unlikely(trans->journal_replay_not_finished)) {
-				struct bkey_i *j_k =
-					bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
-								    i->k->k.p);
-
-				if (j_k) {
-					i->old_k = j_k->k;
-					i->old_v = &j_k->v;
-				}
-			}
-		}
-}
-
-/*
- * A btree node is being replaced - update the iterator to point to the new
- * node:
- */
-void bch2_trans_node_add(struct btree_trans *trans,
-			 struct btree_path *path,
-			 struct btree *b)
-{
-	struct btree_path *prev;
-
-	BUG_ON(!btree_path_pos_in_node(path, b));
-
-	while ((prev = prev_btree_path(trans, path)) &&
-	       btree_path_pos_in_node(prev, b))
-		path = prev;
-
-	for (;
-	     path && btree_path_pos_in_node(path, b);
-	     path = next_btree_path(trans, path))
-		if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
-			enum btree_node_locked_type t =
-				btree_lock_want(path, b->c.level);
-
-			if (t != BTREE_NODE_UNLOCKED) {
-				btree_node_unlock(trans, path, b->c.level);
-				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
-				mark_btree_node_locked(trans, path, b->c.level, t);
-			}
-
-			bch2_btree_path_level_init(trans, path, b);
-		}
-
-	bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-void bch2_trans_node_drop(struct btree_trans *trans,
-			  struct btree *b)
-{
-	struct btree_path *path;
-	unsigned i, level = b->c.level;
-
-	trans_for_each_path(trans, path, i)
-		if (path->l[level].b == b) {
-			btree_node_unlock(trans, path, level);
-			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
-		}
-}
-
-/*
- * A btree node has been modified in such a way as to invalidate iterators - fix
- * them:
- */
-void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path_with_node(trans, b, path, i)
-		__btree_path_level_init(path, b->c.level);
-
-	bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-/* Btree path: traverse, set_pos: */
-
-static inline int btree_path_lock_root(struct btree_trans *trans,
-				       struct btree_path *path,
-				       unsigned depth_want,
-				       unsigned long trace_ip)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
-	enum six_lock_type lock_type;
-	unsigned i;
-	int ret;
-
-	EBUG_ON(path->nodes_locked);
-
-	while (1) {
-		struct btree *b = READ_ONCE(r->b);
-		if (unlikely(!b)) {
-			BUG_ON(!r->error);
-			return r->error;
-		}
-
-		path->level = READ_ONCE(b->c.level);
-
-		if (unlikely(path->level < depth_want)) {
-			/*
-			 * the root is at a lower depth than the depth we want:
-			 * got to the end of the btree, or we're walking nodes
-			 * greater than some depth and there are no nodes >=
-			 * that depth
-			 */
-			path->level = depth_want;
-			for (i = path->level; i < BTREE_MAX_DEPTH; i++)
-				path->l[i].b = NULL;
-			return 1;
-		}
-
-		lock_type = __btree_lock_want(path, path->level);
-		ret = btree_node_lock(trans, path, &b->c,
-				      path->level, lock_type, trace_ip);
-		if (unlikely(ret)) {
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-				return ret;
-			BUG();
-		}
-
-		if (likely(b == READ_ONCE(r->b) &&
-			   b->c.level == path->level &&
-			   !race_fault())) {
-			for (i = 0; i < path->level; i++)
-				path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
-			path->l[path->level].b = b;
-			for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
-				path->l[i].b = NULL;
-
-			mark_btree_node_locked(trans, path, path->level,
-					       (enum btree_node_locked_type) lock_type);
-			bch2_btree_path_level_init(trans, path, b);
-			return 0;
-		}
-
-		six_unlock_type(&b->c.lock, lock_type);
-	}
-}
-
-noinline
-static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_path_level *l = path_l(path);
-	struct btree_node_iter node_iter = l->iter;
-	struct bkey_packed *k;
-	struct bkey_buf tmp;
-	unsigned nr = test_bit(BCH_FS_started, &c->flags)
-		? (path->level > 1 ? 0 : 2)
-		: (path->level > 1 ? 1 : 16);
-	bool was_locked = btree_node_locked(path, path->level);
-	int ret = 0;
-
-	bch2_bkey_buf_init(&tmp);
-
-	while (nr-- && !ret) {
-		if (!bch2_btree_node_relock(trans, path, path->level))
-			break;
-
-		bch2_btree_node_iter_advance(&node_iter, l->b);
-		k = bch2_btree_node_iter_peek(&node_iter, l->b);
-		if (!k)
-			break;
-
-		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
-		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
-					       path->level - 1);
-	}
-
-	if (!was_locked)
-		btree_node_unlock(trans, path, path->level);
-
-	bch2_bkey_buf_exit(&tmp, c);
-	return ret;
-}
-
-static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
-				 struct btree_and_journal_iter *jiter)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_s_c k;
-	struct bkey_buf tmp;
-	unsigned nr = test_bit(BCH_FS_started, &c->flags)
-		? (path->level > 1 ? 0 : 2)
-		: (path->level > 1 ? 1 : 16);
-	bool was_locked = btree_node_locked(path, path->level);
-	int ret = 0;
-
-	bch2_bkey_buf_init(&tmp);
-
-	jiter->fail_if_too_many_whiteouts = true;
-
-	while (nr-- && !ret) {
-		if (!bch2_btree_node_relock(trans, path, path->level))
-			break;
-
-		bch2_btree_and_journal_iter_advance(jiter);
-		k = bch2_btree_and_journal_iter_peek(jiter);
-		if (!k.k)
-			break;
-
-		bch2_bkey_buf_reassemble(&tmp, c, k);
-		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
-					       path->level - 1);
-	}
-
-	if (!was_locked)
-		btree_node_unlock(trans, path, path->level);
-
-	bch2_bkey_buf_exit(&tmp, c);
-	return ret;
-}
-
-static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
-					    struct btree_path *path,
-					    unsigned plevel, struct btree *b)
-{
-	struct btree_path_level *l = &path->l[plevel];
-	bool locked = btree_node_locked(path, plevel);
-	struct bkey_packed *k;
-	struct bch_btree_ptr_v2 *bp;
-
-	if (!bch2_btree_node_relock(trans, path, plevel))
-		return;
-
-	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
-
-	bp = (void *) bkeyp_val(&l->b->format, k);
-	bp->mem_ptr = (unsigned long)b;
-
-	if (!locked)
-		btree_node_unlock(trans, path, plevel);
-}
-
-static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
-						     struct btree_path *path,
-						     unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_path_level *l = path_l(path);
-	struct btree_and_journal_iter jiter;
-	struct bkey_s_c k;
-	int ret = 0;
-
-	__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
-
-	k = bch2_btree_and_journal_iter_peek(&jiter);
-	if (!k.k) {
-		struct printbuf buf = PRINTBUF;
-
-		prt_str(&buf, "node not found at pos ");
-		bch2_bpos_to_text(&buf, path->pos);
-		prt_str(&buf, " at btree ");
-		bch2_btree_pos_to_text(&buf, c, l->b);
-
-		ret = bch2_fs_topology_error(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-		goto err;
-	}
-
-	bkey_reassemble(&trans->btree_path_down, k);
-
-	if ((flags & BTREE_ITER_prefetch) &&
-	    c->opts.btree_node_prefetch)
-		ret = btree_path_prefetch_j(trans, path, &jiter);
-
-err:
-	bch2_btree_and_journal_iter_exit(&jiter);
-	return ret;
-}
-
-static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
-						     struct btree_path *path)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-
-	prt_str(&buf, "node not found at pos ");
-	bch2_bpos_to_text(&buf, path->pos);
-	prt_str(&buf, " within parent node ");
-	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
-
-	bch2_fs_fatal_error(c, "%s", buf.buf);
-	printbuf_exit(&buf);
-	return bch_err_throw(c, btree_need_topology_repair);
-}
-
-static __always_inline int btree_path_down(struct btree_trans *trans,
-					   struct btree_path *path,
-					   unsigned flags,
-					   unsigned long trace_ip)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_path_level *l = path_l(path);
-	struct btree *b;
-	unsigned level = path->level - 1;
-	enum six_lock_type lock_type = __btree_lock_want(path, level);
-	int ret;
-
-	EBUG_ON(!btree_node_locked(path, path->level));
-
-	if (unlikely(trans->journal_replay_not_finished)) {
-		ret = btree_node_iter_and_journal_peek(trans, path, flags);
-		if (ret)
-			return ret;
-	} else {
-		struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
-		if (unlikely(!k))
-			return btree_node_missing_err(trans, path);
-
-		bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
-
-		if (unlikely((flags & BTREE_ITER_prefetch)) &&
-		    c->opts.btree_node_prefetch) {
-			ret = btree_path_prefetch(trans, path);
-			if (ret)
-				return ret;
-		}
-	}
-
-	b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
-				level, lock_type, trace_ip);
-	ret = PTR_ERR_OR_ZERO(b);
-	if (unlikely(ret))
-		return ret;
-
-	if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
-	    likely(!trans->journal_replay_not_finished &&
-		   trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
-		btree_node_mem_ptr_set(trans, path, level + 1, b);
-
-	if (btree_node_read_locked(path, level + 1))
-		btree_node_unlock(trans, path, level + 1);
-
-	mark_btree_node_locked(trans, path, level,
-			       (enum btree_node_locked_type) lock_type);
-	path->level = level;
-	bch2_btree_path_level_init(trans, path, b);
-
-	bch2_btree_path_verify_locks(trans, path);
-	return 0;
-}
-
-static int bch2_btree_path_traverse_all(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_path *path;
-	unsigned long trace_ip = _RET_IP_;
-	unsigned i;
-	int ret = 0;
-
-	if (trans->in_traverse_all)
-		return bch_err_throw(c, transaction_restart_in_traverse_all);
-
-	trans->in_traverse_all = true;
-retry_all:
-	trans->restarted = 0;
-	trans->last_restarted_ip = 0;
-
-	trans_for_each_path(trans, path, i)
-		path->should_be_locked = false;
-
-	btree_trans_sort_paths(trans);
-
-	bch2_trans_unlock(trans);
-	cond_resched();
-	trans_set_locked(trans, false);
-
-	if (unlikely(trans->memory_allocation_failure)) {
-		struct closure cl;
-
-		closure_init_stack(&cl);
-
-		do {
-			ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-			closure_sync(&cl);
-		} while (ret);
-	}
-
-	/* Now, redo traversals in correct order: */
-	i = 0;
-	while (i < trans->nr_sorted) {
-		btree_path_idx_t idx = trans->sorted[i];
-
-		/*
-		 * Traversing a path can cause another path to be added at about
-		 * the same position:
-		 */
-		if (trans->paths[idx].uptodate) {
-			__btree_path_get(trans, &trans->paths[idx], false);
-			ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
-			__btree_path_put(trans, &trans->paths[idx], false);
-
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-			    bch2_err_matches(ret, ENOMEM))
-				goto retry_all;
-			if (ret)
-				goto err;
-		} else {
-			i++;
-		}
-	}
-
-	/*
-	 * We used to assert that all paths had been traversed here
-	 * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
-	 * path->should_be_locked is not set yet, we might have unlocked and
-	 * then failed to relock a path - that's fine.
-	 */
-err:
-	bch2_btree_cache_cannibalize_unlock(trans);
-
-	trans->in_traverse_all = false;
-
-	trace_and_count(c, trans_traverse_all, trans, trace_ip);
-	return ret;
-}
-
-static inline bool btree_path_check_pos_in_node(struct btree_path *path,
-						unsigned l, int check_pos)
-{
-	if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
-		return false;
-	if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
-		return false;
-	return true;
-}
-
-static inline bool btree_path_good_node(struct btree_trans *trans,
-					struct btree_path *path,
-					unsigned l, int check_pos)
-{
-	return is_btree_node(path, l) &&
-		bch2_btree_node_relock(trans, path, l) &&
-		btree_path_check_pos_in_node(path, l, check_pos);
-}
-
-static void btree_path_set_level_down(struct btree_trans *trans,
-				      struct btree_path *path,
-				      unsigned new_level)
-{
-	unsigned l;
-
-	path->level = new_level;
-
-	for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
-		if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
-			btree_node_unlock(trans, path, l);
-
-	btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-	bch2_btree_path_verify(trans, path);
-}
-
-static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
-							 struct btree_path *path,
-							 int check_pos)
-{
-	unsigned i, l = path->level;
-again:
-	while (btree_path_node(path, l) &&
-	       !btree_path_good_node(trans, path, l, check_pos))
-		__btree_path_set_level_up(trans, path, l++);
-
-	/* If we need intent locks, take them too: */
-	for (i = l + 1;
-	     i < path->locks_want && btree_path_node(path, i);
-	     i++)
-		if (!bch2_btree_node_relock(trans, path, i)) {
-			while (l <= i)
-				__btree_path_set_level_up(trans, path, l++);
-			goto again;
-		}
-
-	return l;
-}
-
-static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
-						     struct btree_path *path,
-						     int check_pos)
-{
-	return likely(btree_node_locked(path, path->level) &&
-		      btree_path_check_pos_in_node(path, path->level, check_pos))
-		? path->level
-		: __btree_path_up_until_good_node(trans, path, check_pos);
-}
-
-/*
- * This is the main state machine for walking down the btree - walks down to a
- * specified depth
- *
- * Returns 0 on success, -EIO on error (error reading in a btree node).
- *
- * On error, caller (peek_node()/peek_key()) must return NULL; the error is
- * stashed in the iterator and returned from bch2_trans_exit().
- */
-int bch2_btree_path_traverse_one(struct btree_trans *trans,
-				 btree_path_idx_t path_idx,
-				 unsigned flags,
-				 unsigned long trace_ip)
-{
-	struct btree_path *path = &trans->paths[path_idx];
-	unsigned depth_want = path->level;
-	int ret = -((int) trans->restarted);
-
-	if (unlikely(ret))
-		goto out;
-
-	if (unlikely(!trans->srcu_held))
-		bch2_trans_srcu_lock(trans);
-
-	trace_btree_path_traverse_start(trans, path);
-
-	/*
-	 * Ensure we obey path->should_be_locked: if it's set, we can't unlock
-	 * and re-traverse the path without a transaction restart:
-	 */
-	if (path->should_be_locked) {
-		ret = bch2_btree_path_relock(trans, path, trace_ip);
-		goto out;
-	}
-
-	if (path->cached) {
-		ret = bch2_btree_path_traverse_cached(trans, path_idx, flags);
-		goto out;
-	}
-
-	path = &trans->paths[path_idx];
-
-	if (unlikely(path->level >= BTREE_MAX_DEPTH))
-		goto out_uptodate;
-
-	path->level = btree_path_up_until_good_node(trans, path, 0);
-	unsigned max_level = path->level;
-
-	EBUG_ON(btree_path_node(path, path->level) &&
-		!btree_node_locked(path, path->level));
-
-	/*
-	 * Note: path->nodes[path->level] may be temporarily NULL here - that
-	 * would indicate to other code that we got to the end of the btree,
-	 * here it indicates that relocking the root failed - it's critical that
-	 * btree_path_lock_root() comes next and that it can't fail
-	 */
-	while (path->level > depth_want) {
-		ret = btree_path_node(path, path->level)
-			? btree_path_down(trans, path, flags, trace_ip)
-			: btree_path_lock_root(trans, path, depth_want, trace_ip);
-		if (unlikely(ret)) {
-			if (ret == 1) {
-				/*
-				 * No nodes at this level - got to the end of
-				 * the btree:
-				 */
-				ret = 0;
-				goto out;
-			}
-
-			__bch2_btree_path_unlock(trans, path);
-			path->level = depth_want;
-			path->l[path->level].b = ERR_PTR(ret);
-			goto out;
-		}
-	}
-
-	if (unlikely(max_level > path->level)) {
-		struct btree_path *linked;
-		unsigned iter;
-
-		trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
-			for (unsigned j = path->level + 1; j < max_level; j++)
-				linked->l[j] = path->l[j];
-	}
-
-out_uptodate:
-	path->uptodate = BTREE_ITER_UPTODATE;
-	trace_btree_path_traverse_end(trans, path);
-out:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
-		panic("ret %s (%i) trans->restarted %s (%i)\n",
-		      bch2_err_str(ret), ret,
-		      bch2_err_str(trans->restarted), trans->restarted);
-	bch2_btree_path_verify(trans, path);
-	return ret;
-}
-
-static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
-				   struct btree_path *src)
-{
-	unsigned i, offset = offsetof(struct btree_path, pos);
-
-	memcpy((void *) dst + offset,
-	       (void *) src + offset,
-	       sizeof(struct btree_path) - offset);
-
-	for (i = 0; i < BTREE_MAX_DEPTH; i++) {
-		unsigned t = btree_node_locked_type(dst, i);
-
-		if (t != BTREE_NODE_UNLOCKED)
-			six_lock_increment(&dst->l[i].b->c.lock, t);
-	}
-}
-
-static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
-					 bool intent, unsigned long ip)
-{
-	btree_path_idx_t new = btree_path_alloc(trans, src);
-	btree_path_copy(trans, trans->paths + new, trans->paths + src);
-	__btree_path_get(trans, trans->paths + new, intent);
-#ifdef TRACK_PATH_ALLOCATED
-	trans->paths[new].ip_allocated = ip;
-#endif
-	return new;
-}
-
-__flatten
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
-					    btree_path_idx_t path, bool intent, unsigned long ip)
-{
-	struct btree_path *old = trans->paths + path;
-	__btree_path_put(trans, trans->paths + path, intent);
-	path = btree_path_clone(trans, path, intent, ip);
-	trace_btree_path_clone(trans, old, trans->paths + path);
-	trans->paths[path].preserve = false;
-	return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *trans,
-			  btree_path_idx_t path_idx, struct bpos new_pos,
-			  bool intent, unsigned long ip)
-{
-	int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	EBUG_ON(!trans->paths[path_idx].ref);
-
-	trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
-
-	path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
-
-	struct btree_path *path = trans->paths + path_idx;
-	path->pos = new_pos;
-	trans->paths_sorted = false;
-
-	if (unlikely(path->cached)) {
-		btree_node_unlock(trans, path, 0);
-		path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
-		btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-		goto out;
-	}
-
-	unsigned level = btree_path_up_until_good_node(trans, path, cmp);
-
-	if (btree_path_node(path, level)) {
-		struct btree_path_level *l = &path->l[level];
-
-		BUG_ON(!btree_node_locked(path, level));
-		/*
-		 * We might have to skip over many keys, or just a few: try
-		 * advancing the node iterator, and if we have to skip over too
-		 * many keys just reinit it (or if we're rewinding, since that
-		 * is expensive).
-		 */
-		if (cmp < 0 ||
-		    !btree_path_advance_to_pos(path, l, 8))
-			bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
-		/*
-		 * Iterators to interior nodes should always be pointed at the first non
-		 * whiteout:
-		 */
-		if (unlikely(level))
-			bch2_btree_node_iter_peek(&l->iter, l->b);
-	}
-
-	if (unlikely(level != path->level)) {
-		btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-		__bch2_btree_path_unlock(trans, path);
-	}
-out:
-	bch2_btree_path_verify(trans, path);
-	return path_idx;
-}
-
-/* Btree path: main interface: */
-
-static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
-	struct btree_path *sib;
-
-	sib = prev_btree_path(trans, path);
-	if (sib && !btree_path_cmp(sib, path))
-		return sib;
-
-	sib = next_btree_path(trans, path);
-	if (sib && !btree_path_cmp(sib, path))
-		return sib;
-
-	return NULL;
-}
-
-static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
-	struct btree_path *sib;
-
-	sib = prev_btree_path(trans, path);
-	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
-		return sib;
-
-	sib = next_btree_path(trans, path);
-	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
-		return sib;
-
-	return NULL;
-}
-
-static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
-{
-	__bch2_btree_path_unlock(trans, trans->paths + path);
-	btree_path_list_remove(trans, trans->paths + path);
-	__clear_bit(path, trans->paths_allocated);
-}
-
-static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
-{
-	unsigned l = path->level;
-
-	do {
-		if (!btree_path_node(path, l))
-			break;
-
-		if (!is_btree_node(path, l))
-			return false;
-
-		if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
-			return false;
-
-		l++;
-	} while (l < path->locks_want);
-
-	return true;
-}
-
-void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
-{
-	struct btree_path *path = trans->paths + path_idx, *dup = NULL;
-
-	if (!__btree_path_put(trans, path, intent))
-		return;
-
-	if (!path->preserve && !path->should_be_locked)
-		goto free;
-
-	dup = path->preserve
-		? have_path_at_pos(trans, path)
-		: have_node_at_pos(trans, path);
-	if (!dup)
-		return;
-
-	/*
-	 * If we need this path locked, the duplicate also has to be locked
-	 * before we free this one:
-	 */
-	if (path->should_be_locked &&
-	    !dup->should_be_locked &&
-	    !trans->restarted) {
-		if (!(trans->locked
-		      ? bch2_btree_path_relock_norestart(trans, dup)
-		      : bch2_btree_path_can_relock(trans, dup)))
-			return;
-
-		dup->should_be_locked = true;
-	}
-
-	BUG_ON(path->should_be_locked &&
-	       !trans->restarted &&
-	       trans->locked &&
-	       !btree_node_locked(dup, dup->level));
-
-	path->should_be_locked = false;
-	dup->preserve |= path->preserve;
-free:
-	trace_btree_path_free(trans, path_idx, dup);
-	__bch2_path_free(trans, path_idx);
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
-{
-	panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
-	      trans->restart_count, restart_count,
-	      (void *) trans->last_begin_ip);
-}
-
-static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-	struct printbuf buf = PRINTBUF;
-	bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
-	panic("in transaction restart: %s, last restarted by\n%s",
-	      bch2_err_str(trans->restarted),
-	      buf.buf);
-#else
-	panic("in transaction restart: %s, last restarted by %pS\n",
-	      bch2_err_str(trans->restarted),
-	      (void *) trans->last_restarted_ip);
-#endif
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
-{
-	if (trans->restarted)
-		bch2_trans_in_restart_error(trans);
-
-	if (!trans->locked)
-		panic("trans should be locked, unlocked by %pS\n",
-		      (void *) trans->last_unlock_ip);
-
-	BUG();
-}
-
-noinline __cold
-void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
-{
-	prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
-		   trans->nr_updates, trans->fn, trans->journal_res.seq);
-	printbuf_indent_add(buf, 2);
-
-	trans_for_each_update(trans, i) {
-		struct bkey_s_c old = { &i->old_k, i->old_v };
-
-		prt_str(buf, "update: btree=");
-		bch2_btree_id_to_text(buf, i->btree_id);
-		prt_printf(buf, " cached=%u %pS\n",
-			   i->cached,
-			   (void *) i->ip_allocated);
-
-		prt_printf(buf, " old ");
-		bch2_bkey_val_to_text(buf, trans->c, old);
-		prt_newline(buf);
-
-		prt_printf(buf, " new ");
-		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
-		prt_newline(buf);
-	}
-
-	for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
-	     e != btree_trans_journal_entries_top(trans);
-	     e = vstruct_next(e)) {
-		bch2_journal_entry_to_text(buf, trans->c, e);
-		prt_newline(buf);
-	}
-
-	printbuf_indent_sub(buf, 2);
-}
-
-static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
-	struct btree_path *path = trans->paths + path_idx;
-
-	prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
-		   path_idx, path->ref, path->intent_ref,
-		   path->preserve ? 'P' : ' ',
-		   path->should_be_locked ? 'S' : ' ',
-		   path->cached ? 'C' : 'B');
-	bch2_btree_id_level_to_text(out, path->btree_id, path->level);
-	prt_str(out, " pos ");
-	bch2_bpos_to_text(out, path->pos);
-
-	if (!path->cached && btree_node_locked(path, path->level)) {
-		prt_char(out, ' ');
-		struct btree *b = path_l(path)->b;
-		bch2_bpos_to_text(out, b->data->min_key);
-		prt_char(out, '-');
-		bch2_bpos_to_text(out, b->key.k.p);
-	}
-
-#ifdef TRACK_PATH_ALLOCATED
-	prt_printf(out, " %pS", (void *) path->ip_allocated);
-#endif
-}
-
-static const char *btree_node_locked_str(enum btree_node_locked_type t)
-{
-	switch (t) {
-	case BTREE_NODE_UNLOCKED:
-		return "unlocked";
-	case BTREE_NODE_READ_LOCKED:
-		return "read";
-	case BTREE_NODE_INTENT_LOCKED:
-		return "intent";
-	case BTREE_NODE_WRITE_LOCKED:
-		return "write";
-	default:
-		return NULL;
-	}
-}
-
-void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
-	bch2_btree_path_to_text_short(out, trans, path_idx);
-
-	struct btree_path *path = trans->paths + path_idx;
-
-	prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
-	prt_newline(out);
-
-	printbuf_indent_add(out, 2);
-	for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
-		prt_printf(out, "l=%u locks %s seq %u node ", l,
-			   btree_node_locked_str(btree_node_locked_type(path, l)),
-			   path->l[l].lock_seq);
-
-		int ret = PTR_ERR_OR_ZERO(path->l[l].b);
-		if (ret)
-			prt_str(out, bch2_err_str(ret));
-		else
-			prt_printf(out, "%px", path->l[l].b);
-		prt_newline(out);
-	}
-	printbuf_indent_sub(out, 2);
-}
-
-static noinline __cold
-void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
-				bool nosort)
-{
-	struct trans_for_each_path_inorder_iter iter;
-
-	if (!nosort)
-		btree_trans_sort_paths(trans);
-
-	trans_for_each_path_idx_inorder(trans, iter) {
-		bch2_btree_path_to_text_short(out, trans, iter.path_idx);
-		prt_newline(out);
-	}
-}
-
-noinline __cold
-void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
-{
-	__bch2_trans_paths_to_text(out, trans, false);
-}
-
-static noinline __cold
-void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
-{
-	struct printbuf buf = PRINTBUF;
-
-	__bch2_trans_paths_to_text(&buf, trans, nosort);
-	bch2_trans_updates_to_text(&buf, trans);
-
-	bch2_print_str(trans->c, KERN_ERR, buf.buf);
-	printbuf_exit(&buf);
-}
-
-noinline __cold
-void bch2_dump_trans_paths_updates(struct btree_trans *trans)
-{
-	__bch2_dump_trans_paths_updates(trans, false);
-}
-
-noinline __cold
-static void bch2_trans_update_max_paths(struct btree_trans *trans)
-{
-	struct btree_transaction_stats *s = btree_trans_stats(trans);
-	struct printbuf buf = PRINTBUF;
-	size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
-
-	bch2_trans_paths_to_text(&buf, trans);
-
-	if (!buf.allocation_failure) {
-		mutex_lock(&s->lock);
-		if (nr > s->nr_max_paths) {
-			s->nr_max_paths = nr;
-			swap(s->max_paths_text, buf.buf);
-		}
-		mutex_unlock(&s->lock);
-	}
-
-	printbuf_exit(&buf);
-
-	trans->nr_paths_max = nr;
-}
-
-noinline __cold
-int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
-{
-	if (trace_trans_restart_too_many_iters_enabled()) {
-		struct printbuf buf = PRINTBUF;
-
-		bch2_trans_paths_to_text(&buf, trans);
-		trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
-		printbuf_exit(&buf);
-	}
-
-	count_event(trans->c, trans_restart_too_many_iters);
-
-	return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
-}
-
-static noinline void btree_path_overflow(struct btree_trans *trans)
-{
-	bch2_dump_trans_paths_updates(trans);
-	bch_err(trans->c, "trans path overflow");
-}
-
-static noinline void btree_paths_realloc(struct btree_trans *trans)
-{
-	unsigned nr = trans->nr_paths * 2;
-
-	void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
-			   sizeof(struct btree_trans_paths) +
-			   nr * sizeof(struct btree_path) +
-			   nr * sizeof(btree_path_idx_t) + 8 +
-			   nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
-
-	unsigned long *paths_allocated = p;
-	memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
-	p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
-
-	p += sizeof(struct btree_trans_paths);
-	struct btree_path *paths = p;
-	*trans_paths_nr(paths) = nr;
-	memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
-	p += nr * sizeof(struct btree_path);
-
-	btree_path_idx_t *sorted = p;
-	memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
-	p += nr * sizeof(btree_path_idx_t) + 8;
-
-	struct btree_insert_entry *updates = p;
-	memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
-
-	unsigned long *old = trans->paths_allocated;
-
-	rcu_assign_pointer(trans->paths_allocated, paths_allocated);
-	rcu_assign_pointer(trans->paths, paths);
-	rcu_assign_pointer(trans->sorted, sorted);
-	rcu_assign_pointer(trans->updates, updates);
-
-	trans->nr_paths = nr;
-
-	if (old != trans->_paths_allocated)
-		kfree_rcu_mightsleep(old);
-}
-
-static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
-						btree_path_idx_t pos)
-{
-	btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
-
-	if (unlikely(idx == trans->nr_paths)) {
-		if (trans->nr_paths == BTREE_ITER_MAX) {
-			btree_path_overflow(trans);
-			return 0;
-		}
-
-		btree_paths_realloc(trans);
-	}
-
-	/*
-	 * Do this before marking the new path as allocated, since it won't be
-	 * initialized yet:
-	 */
-	if (unlikely(idx > trans->nr_paths_max))
-		bch2_trans_update_max_paths(trans);
-
-	__set_bit(idx, trans->paths_allocated);
-
-	struct btree_path *path = &trans->paths[idx];
-	path->ref = 0;
-	path->intent_ref = 0;
-	path->nodes_locked = 0;
-
-	btree_path_list_add(trans, pos, idx);
-	trans->paths_sorted = false;
-	return idx;
-}
-
-btree_path_idx_t bch2_path_get(struct btree_trans *trans,
-			       enum btree_id btree_id, struct bpos pos,
-			       unsigned locks_want, unsigned level,
-			       unsigned flags, unsigned long ip)
-{
-	struct btree_path *path;
-	bool cached = flags & BTREE_ITER_cached;
-	bool intent = flags & BTREE_ITER_intent;
-	struct trans_for_each_path_inorder_iter iter;
-	btree_path_idx_t path_pos = 0, path_idx;
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	bch2_trans_verify_locks(trans);
-
-	btree_trans_sort_paths(trans);
-
-	if (intent)
-		locks_want = max(locks_want, level + 1);
-	locks_want = min(locks_want, BTREE_MAX_DEPTH);
-
-	trans_for_each_path_inorder(trans, path, iter) {
-		if (__btree_path_cmp(path,
-				     btree_id,
-				     cached,
-				     pos,
-				     level) > 0)
-			break;
-
-		path_pos = iter.path_idx;
-	}
-
-	if (path_pos &&
-	    trans->paths[path_pos].cached == cached &&
-	    trans->paths[path_pos].btree_id == btree_id &&
-	    trans->paths[path_pos].level == level &&
-	    bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) {
-		trace_btree_path_get(trans, trans->paths + path_pos, &pos);
-
-		__btree_path_get(trans, trans->paths + path_pos, intent);
-		path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
-		path = trans->paths + path_idx;
-	} else {
-		path_idx = btree_path_alloc(trans, path_pos);
-		path = trans->paths + path_idx;
-
-		__btree_path_get(trans, path, intent);
-		path->pos = pos;
-		path->btree_id = btree_id;
-		path->cached = cached;
-		path->uptodate = BTREE_ITER_NEED_TRAVERSE;
-		path->should_be_locked = false;
-		path->level = level;
-		path->locks_want = locks_want;
-		path->nodes_locked = 0;
-		for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
-			path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
-#ifdef TRACK_PATH_ALLOCATED
-		path->ip_allocated = ip;
-#endif
-		trans->paths_sorted = false;
-
-		trace_btree_path_alloc(trans, path);
-	}
-
-	if (!(flags & BTREE_ITER_nopreserve))
-		path->preserve = true;
-
-	/*
-	 * If the path has locks_want greater than requested, we don't downgrade
-	 * it here - on transaction restart because btree node split needs to
-	 * upgrade locks, we might be putting/getting the iterator again.
-	 * Downgrading iterators only happens via bch2_trans_downgrade(), after
-	 * a successful transaction commit.
-	 */
-
-	return path_idx;
-}
-
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
-					    enum btree_id btree_id,
-					    unsigned level,
-					    struct bpos pos)
-{
-	btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
-						  BTREE_ITER_nopreserve|
-						  BTREE_ITER_intent, _RET_IP_);
-	path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
-
-	struct btree_path *path = trans->paths + path_idx;
-	bch2_btree_path_downgrade(trans, path);
-	__bch2_btree_path_unlock(trans, path);
-	return path_idx;
-}
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
-{
-
-	struct btree_path_level *l = path_l(path);
-	struct bkey_packed *_k;
-	struct bkey_s_c k;
-
-	if (unlikely(!l->b))
-		return bkey_s_c_null;
-
-	EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
-	EBUG_ON(!btree_node_locked(path, path->level));
-
-	if (!path->cached) {
-		_k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-		k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
-
-		EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
-
-		if (!k.k || !bpos_eq(path->pos, k.k->p))
-			goto hole;
-	} else {
-		struct bkey_cached *ck = (void *) path->l[0].b;
-		if (!ck)
-			return bkey_s_c_null;
-
-		EBUG_ON(path->btree_id != ck->key.btree_id ||
-			!bkey_eq(path->pos, ck->key.pos));
-
-		*u = ck->k->k;
-		k = (struct bkey_s_c) { u, &ck->k->v };
-	}
-
-	return k;
-hole:
-	bkey_init(u);
-	u->p = path->pos;
-	return (struct bkey_s_c) { u, NULL };
-}
-
-void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
-{
-	if (!iter->path || trans->restarted)
-		return;
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	path->preserve = false;
-	if (path->ref == 1)
-		path->should_be_locked = false;
-}
-/* Btree iterators: */
-
-int __must_check
-__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
-{
-	return bch2_btree_path_traverse(trans, iter->path, iter->flags);
-}
-
-int __must_check
-bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
-{
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-	iter->path = bch2_btree_path_set_pos(trans, iter->path,
-					     btree_iter_search_key(iter),
-					     iter->flags & BTREE_ITER_intent,
-					     btree_iter_ip_allocated(iter));
-
-	int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-	if (ret)
-		return ret;
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	if (btree_path_node(path, path->level))
-		btree_path_set_should_be_locked(trans, path);
-	return 0;
-}
-
-/* Iterate across nodes (leaf and interior nodes) */
-
-struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
-					struct btree_iter *iter)
-{
-	struct btree *b = NULL;
-	int ret;
-
-	EBUG_ON(trans->paths[iter->path].cached);
-	bch2_btree_iter_verify(trans, iter);
-
-	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-	if (ret)
-		goto err;
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	b = btree_path_node(path, path->level);
-	if (!b)
-		goto out;
-
-	BUG_ON(bpos_lt(b->key.k.p, iter->pos));
-
-	bkey_init(&iter->k);
-	iter->k.p = iter->pos = b->key.k.p;
-
-	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
-					     iter->flags & BTREE_ITER_intent,
-					     btree_iter_ip_allocated(iter));
-	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out:
-	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(trans, iter);
-
-	return b;
-err:
-	b = ERR_PTR(ret);
-	goto out;
-}
-
-/* Only kept for -tools */
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
-						    struct btree_iter *iter)
-{
-	struct btree *b;
-
-	while (b = bch2_btree_iter_peek_node(trans, iter),
-	       bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
-		bch2_trans_begin(trans);
-
-	return b;
-}
-
-struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
-{
-	struct btree *b = NULL;
-	int ret;
-
-	EBUG_ON(trans->paths[iter->path].cached);
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	bch2_btree_iter_verify(trans, iter);
-
-	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-	if (ret)
-		goto err;
-
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-
-	/* already at end? */
-	if (!btree_path_node(path, path->level))
-		return NULL;
-
-	/* got to end? */
-	if (!btree_path_node(path, path->level + 1)) {
-		path->should_be_locked = false;
-		btree_path_set_level_up(trans, path);
-		return NULL;
-	}
-
-	/*
-	 * We don't correctly handle nodes with extra intent locks here:
-	 * downgrade so we don't violate locking invariants
-	 */
-	bch2_btree_path_downgrade(trans, path);
-
-	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
-		trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
-		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
-		__bch2_btree_path_unlock(trans, path);
-		path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
-		path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
-		btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-		goto err;
-	}
-
-	b = btree_path_node(path, path->level + 1);
-
-	if (bpos_eq(iter->pos, b->key.k.p)) {
-		__btree_path_set_level_up(trans, path, path->level++);
-	} else {
-		if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
-			btree_node_unlock(trans, path, path->level + 1);
-
-		/*
-		 * Haven't gotten to the end of the parent node: go back down to
-		 * the next child node
-		 */
-		iter->path = bch2_btree_path_set_pos(trans, iter->path,
-						     bpos_successor(iter->pos),
-						     iter->flags & BTREE_ITER_intent,
-						     btree_iter_ip_allocated(iter));
-
-		path = btree_iter_path(trans, iter);
-		btree_path_set_level_down(trans, path, iter->min_depth);
-
-		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-		if (ret)
-			goto err;
-
-		path = btree_iter_path(trans, iter);
-		b = path->l[path->level].b;
-	}
-
-	bkey_init(&iter->k);
-	iter->k.p = iter->pos = b->key.k.p;
-
-	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
-					     iter->flags & BTREE_ITER_intent,
-					     btree_iter_ip_allocated(iter));
-	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-	EBUG_ON(btree_iter_path(trans, iter)->uptodate);
-out:
-	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(trans, iter);
-
-	return b;
-err:
-	b = ERR_PTR(ret);
-	goto out;
-}
-
-/* Iterate across keys (in leaf nodes only) */
-
-inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
-{
-	struct bpos pos = iter->k.p;
-	bool ret = !(iter->flags & BTREE_ITER_all_snapshots
-		     ? bpos_eq(pos, SPOS_MAX)
-		     : bkey_eq(pos, SPOS_MAX));
-
-	if (ret && !(iter->flags & BTREE_ITER_is_extents))
-		pos = bkey_successor(iter, pos);
-	bch2_btree_iter_set_pos(trans, iter, pos);
-	return ret;
-}
-
-inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
-{
-	struct bpos pos = bkey_start_pos(&iter->k);
-	bool ret = !(iter->flags & BTREE_ITER_all_snapshots
-		     ? bpos_eq(pos, POS_MIN)
-		     : bkey_eq(pos, POS_MIN));
-
-	if (ret && !(iter->flags & BTREE_ITER_is_extents))
-		pos = bkey_predecessor(iter, pos);
-	bch2_btree_iter_set_pos(trans, iter, pos);
-	return ret;
-}
-
-static noinline
-void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
-					struct bpos search_key, struct bkey_s_c *k)
-{
-	struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
-
-	trans_for_each_update(trans, i)
-		if (!i->key_cache_already_flushed &&
-		    i->btree_id == iter->btree_id &&
-		    bpos_le(i->k->k.p, search_key) &&
-		    bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
-			iter->k = i->k->k;
-			*k = bkey_i_to_s_c(i->k);
-		}
-}
-
-static noinline
-void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
-				   struct bpos search_key,
-				   struct bkey_s_c *k)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-	struct bpos end = path_l(path)->b->key.k.p;
-
-	trans_for_each_update(trans, i)
-		if (!i->key_cache_already_flushed &&
-		    i->btree_id == iter->btree_id &&
-		    bpos_ge(i->k->k.p, search_key) &&
-		    bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
-			iter->k = i->k->k;
-			*k = bkey_i_to_s_c(i->k);
-		}
-}
-
-static noinline
-void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
-					struct bkey_s_c *k)
-{
-	trans_for_each_update(trans, i)
-		if (!i->key_cache_already_flushed &&
-		    i->btree_id == iter->btree_id &&
-		    bpos_eq(i->k->k.p, iter->pos)) {
-			iter->k = i->k->k;
-			*k = bkey_i_to_s_c(i->k);
-		}
-}
-
-static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
-					      struct btree_iter *iter,
-					      struct bpos search_pos,
-					      struct bpos end_pos)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-
-	return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
-					  path->level,
-					  search_pos,
-					  end_pos,
-					  &iter->journal_idx);
-}
-
-static noinline
-struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
-					      struct btree_iter *iter)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-	struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
-
-	if (k) {
-		iter->k = k->k;
-		return bkey_i_to_s_c(k);
-	} else {
-		return bkey_s_c_null;
-	}
-}
-
-static noinline
-void btree_trans_peek_journal(struct btree_trans *trans,
-			      struct btree_iter *iter,
-			      struct bpos search_key,
-			      struct bkey_s_c *k)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-	struct bkey_i *next_journal =
-		bch2_btree_journal_peek(trans, iter, search_key,
-					k->k ? k->k->p : path_l(path)->b->key.k.p);
-	if (next_journal) {
-		iter->k = next_journal->k;
-		*k = bkey_i_to_s_c(next_journal);
-	}
-}
-
-static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
-						   struct btree_iter *iter,
-						   struct bpos search_key,
-						   struct bpos end_pos)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-
-	return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
-					       path->level,
-					       search_key,
-					       end_pos,
-					       &iter->journal_idx);
-}
-
-static noinline
-void btree_trans_peek_prev_journal(struct btree_trans *trans,
-				   struct btree_iter *iter,
-				   struct bpos search_key,
-				   struct bkey_s_c *k)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-	struct bkey_i *next_journal =
-		bch2_btree_journal_peek_prev(trans, iter, search_key,
-					     k->k ? k->k->p : path_l(path)->b->data->min_key);
-
-	if (next_journal) {
-		iter->k = next_journal->k;
-		*k = bkey_i_to_s_c(next_journal);
-	}
-}
-
-/*
- * Checks btree key cache for key at iter->pos and returns it if present, or
- * bkey_s_c_null:
- */
-static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
-					   struct bpos pos)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey u;
-	struct bkey_s_c k;
-	int ret;
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-	if ((iter->flags & BTREE_ITER_key_cache_fill) &&
-	    bpos_eq(iter->pos, pos))
-		return bkey_s_c_null;
-
-	if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
-		return bkey_s_c_null;
-
-	if (!iter->key_cache_path)
-		iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
-						     iter->flags & BTREE_ITER_intent, 0,
-						     iter->flags|BTREE_ITER_cached|
-						     BTREE_ITER_cached_nofill,
-						     _THIS_IP_);
-
-	iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
-						       iter->flags & BTREE_ITER_intent,
-						       btree_iter_ip_allocated(iter));
-
-	ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
-				       iter->flags|BTREE_ITER_cached) ?:
-		bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
-	if (unlikely(ret))
-		return bkey_s_c_err(ret);
-
-	k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
-	if (!k.k)
-		return k;
-
-	if ((iter->flags & BTREE_ITER_all_snapshots) &&
-	    !bpos_eq(pos, k.k->p))
-		return bkey_s_c_null;
-
-	iter->k = u;
-	k.k = &iter->k;
-	btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
-	return k;
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
-					      struct bpos search_key)
-{
-	struct bkey_s_c k, k2;
-	int ret;
-
-	EBUG_ON(btree_iter_path(trans, iter)->cached);
-	bch2_btree_iter_verify(trans, iter);
-
-	while (1) {
-		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-						     iter->flags & BTREE_ITER_intent,
-						     btree_iter_ip_allocated(iter));
-
-		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-		if (unlikely(ret)) {
-			/* ensure that iter->k is consistent with iter->pos: */
-			bch2_btree_iter_set_pos(trans, iter, iter->pos);
-			k = bkey_s_c_err(ret);
-			break;
-		}
-
-		struct btree_path *path = btree_iter_path(trans, iter);
-		struct btree_path_level *l = path_l(path);
-
-		if (unlikely(!l->b)) {
-			/* No btree nodes at requested level: */
-			bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
-			k = bkey_s_c_null;
-			break;
-		}
-
-		btree_path_set_should_be_locked(trans, path);
-
-		k = btree_path_level_peek_all(trans->c, l, &iter->k);
-
-		if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
-		    k.k &&
-		    (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
-			k = k2;
-			if (bkey_err(k)) {
-				bch2_btree_iter_set_pos(trans, iter, iter->pos);
-				break;
-			}
-		}
-
-		if (unlikely(iter->flags & BTREE_ITER_with_journal))
-			btree_trans_peek_journal(trans, iter, search_key, &k);
-
-		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
-			     trans->nr_updates))
-			bch2_btree_trans_peek_updates(trans, iter, search_key, &k);
-
-		if (k.k && bkey_deleted(k.k)) {
-			/*
-			 * If we've got a whiteout, and it's after the search
-			 * key, advance the search key to the whiteout instead
-			 * of just after the whiteout - it might be a btree
-			 * whiteout, with a real key at the same position, since
-			 * in the btree deleted keys sort before non deleted.
-			 */
-			search_key = !bpos_eq(search_key, k.k->p)
-				? 
-				? k.k->p
-				: bpos_successor(k.k->p);
-			continue;
-		}
-
-		if (likely(k.k)) {
-			break;
-		} else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
-			/* Advance to next leaf node: */
-			search_key = bpos_successor(l->b->key.k.p);
-		} else {
-			/* End of btree: */
-			bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
-			k = bkey_s_c_null;
-			break;
-		}
-	}
-
-	bch2_btree_iter_verify(trans, iter);
-
-	if (trace___btree_iter_peek_enabled()) {
-		CLASS(printbuf, buf)();
-
-		int ret = bkey_err(k);
-		if (ret)
-			prt_str(&buf, bch2_err_str(ret));
-		else if (k.k)
-			bch2_bkey_val_to_text(&buf, trans->c, k);
-		else
-			prt_str(&buf, "(null)");
-		trace___btree_iter_peek(trans->c, buf.buf);
-	}
-
-	return k;
-}
-
-/**
- * bch2_btree_iter_peek_max() - returns first key greater than or equal to
- * iterator's current position
- * @trans:	btree transaction object
- * @iter:	iterator to peek from
- * @end:	search limit: returns keys less than or equal to @end
- *
- * Returns:	key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
-					 struct bpos end)
-{
-	struct bpos search_key = btree_iter_search_key(iter);
-	struct bkey_s_c k;
-	struct bpos iter_pos = iter->pos;
-	int ret;
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	bch2_btree_iter_verify_entry_exit(iter);
-	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
-
-	ret = trans_maybe_inject_restart(trans, _RET_IP_);
-	if (unlikely(ret)) {
-		k = bkey_s_c_err(ret);
-		goto out_no_locked;
-	}
-
-	if (iter->update_path) {
-		bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent);
-		iter->update_path = 0;
-	}
-
-	while (1) {
-		k = __bch2_btree_iter_peek(trans, iter, search_key);
-		if (unlikely(!k.k))
-			goto end;
-		if (unlikely(bkey_err(k)))
-			goto out_no_locked;
-
-		if (iter->flags & BTREE_ITER_filter_snapshots) {
-			/*
-			 * We need to check against @end before FILTER_SNAPSHOTS because
-			 * if we get to a different inode than requested we might be
-			 * seeing keys for a different snapshot tree that will all be
-			 * filtered out.
-			 *
-			 * But we can't do the full check here, because bkey_start_pos()
-			 * isn't monotonically increasing before FILTER_SNAPSHOTS, and
-			 * that's what we check against in extents mode:
-			 */
-			if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
-				     ? bkey_gt(k.k->p, end)
-				     : k.k->p.inode > end.inode))
-				goto end;
-
-			if (iter->update_path &&
-			    !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
-				bch2_path_put(trans, iter->update_path,
-					      iter->flags & BTREE_ITER_intent);
-				iter->update_path = 0;
-			}
-
-			if ((iter->flags & BTREE_ITER_intent) &&
-			    !(iter->flags & BTREE_ITER_is_extents) &&
-			    !iter->update_path) {
-				struct bpos pos = k.k->p;
-
-				if (pos.snapshot < iter->snapshot) {
-					search_key = bpos_successor(k.k->p);
-					continue;
-				}
-
-				pos.snapshot = iter->snapshot;
-
-				/*
-				 * advance, same as on exit for iter->path, but only up
-				 * to snapshot
-				 */
-				__btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
-				iter->update_path = iter->path;
-
-				iter->update_path = bch2_btree_path_set_pos(trans,
-							iter->update_path, pos,
-							iter->flags & BTREE_ITER_intent,
-							_THIS_IP_);
-				ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
-				if (unlikely(ret)) {
-					k = bkey_s_c_err(ret);
-					goto out_no_locked;
-				}
-			}
-
-			/*
-			 * We can never have a key in a leaf node at POS_MAX, so
-			 * we don't have to check these successor() calls:
-			 */
-			if (!bch2_snapshot_is_ancestor(trans->c,
-						       iter->snapshot,
-						       k.k->p.snapshot)) {
-				search_key = bpos_successor(k.k->p);
-				continue;
-			}
-
-			if (bkey_whiteout(k.k) &&
-			    !(iter->flags & BTREE_ITER_key_cache_fill)) {
-				search_key = bkey_successor(iter, k.k->p);
-				continue;
-			}
-		}
-
-		/*
-		 * iter->pos should be monotonically increasing, and always be
-		 * equal to the key we just returned - except extents can
-		 * straddle iter->pos:
-		 */
-		if (!(iter->flags & BTREE_ITER_is_extents))
-			iter_pos = k.k->p;
-		else
-			iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
-
-		if (unlikely(iter->flags & BTREE_ITER_all_snapshots	? bpos_gt(iter_pos, end) :
-			     iter->flags & BTREE_ITER_is_extents	? bkey_ge(iter_pos, end) :
-									  bkey_gt(iter_pos, end)))
-			goto end;
-
-		break;
-	}
-
-	iter->pos = iter_pos;
-
-	iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
-				iter->flags & BTREE_ITER_intent,
-				btree_iter_ip_allocated(iter));
-
-	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
-	if (iter->update_path) {
-		ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
-		if (unlikely(ret))
-			k = bkey_s_c_err(ret);
-		else
-			btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
-	}
-
-	if (!(iter->flags & BTREE_ITER_all_snapshots))
-		iter->pos.snapshot = iter->snapshot;
-
-	ret = bch2_btree_iter_verify_ret(trans, iter, k);
-	if (unlikely(ret)) {
-		bch2_btree_iter_set_pos(trans, iter, iter->pos);
-		k = bkey_s_c_err(ret);
-	}
-
-	bch2_btree_iter_verify_entry_exit(iter);
-
-	if (trace_btree_iter_peek_max_enabled()) {
-		CLASS(printbuf, buf)();
-
-		int ret = bkey_err(k);
-		if (ret)
-			prt_str(&buf, bch2_err_str(ret));
-		else if (k.k)
-			bch2_bkey_val_to_text(&buf, trans->c, k);
-		else
-			prt_str(&buf, "(null)");
-		trace_btree_iter_peek_max(trans->c, buf.buf);
-	}
-
-	return k;
-end:
-	bch2_btree_iter_set_pos(trans, iter, end);
-	k = bkey_s_c_null;
-	goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_next() - returns first key greater than iterator's current
- * position
- * @trans:	btree transaction object
- * @iter:	iterator to peek from
- *
- * Returns:	key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
-{
-	if (!bch2_btree_iter_advance(trans, iter))
-		return bkey_s_c_null;
-
-	return bch2_btree_iter_peek(trans, iter);
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
-						   struct bpos search_key)
-{
-	struct bkey_s_c k, k2;
-
-	bch2_btree_iter_verify(trans, iter);
-
-	while (1) {
-		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-					iter->flags & BTREE_ITER_intent,
-					btree_iter_ip_allocated(iter));
-
-		int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-		if (unlikely(ret)) {
-			/* ensure that iter->k is consistent with iter->pos: */
-			bch2_btree_iter_set_pos(trans, iter, iter->pos);
-			k = bkey_s_c_err(ret);
-			break;
-		}
-
-		struct btree_path *path = btree_iter_path(trans, iter);
-		struct btree_path_level *l = path_l(path);
-
-		if (unlikely(!l->b)) {
-			/* No btree nodes at requested level: */
-			bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
-			k = bkey_s_c_null;
-			break;
-		}
-
-		btree_path_set_should_be_locked(trans, path);
-
-		k = btree_path_level_peek_all(trans->c, l, &iter->k);
-		if (!k.k || bpos_gt(k.k->p, search_key)) {
-			k = btree_path_level_prev(trans, path, l, &iter->k);
-
-			BUG_ON(k.k && bpos_gt(k.k->p, search_key));
-		}
-
-		if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
-		    k.k &&
-		    (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
-			k = k2;
-			if (bkey_err(k2)) {
-				bch2_btree_iter_set_pos(trans, iter, iter->pos);
-				break;
-			}
-		}
-
-		if (unlikely(iter->flags & BTREE_ITER_with_journal))
-			btree_trans_peek_prev_journal(trans, iter, search_key, &k);
-
-		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
-			     trans->nr_updates))
-			bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k);
-
-		if (likely(k.k && !bkey_deleted(k.k))) {
-			break;
-		} else if (k.k) {
-			search_key = bpos_predecessor(k.k->p);
-		} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
-			/* Advance to previous leaf node: */
-			search_key = bpos_predecessor(path->l[0].b->data->min_key);
-		} else {
-			/* Start of btree: */
-			bch2_btree_iter_set_pos(trans, iter, POS_MIN);
-			k = bkey_s_c_null;
-			break;
-		}
-	}
-
-	bch2_btree_iter_verify(trans, iter);
-	return k;
-}
-
-/**
- * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
- * iterator's current position
- * @trans:	btree transaction object
- * @iter:	iterator to peek from
- * @end:	search limit: returns keys greater than or equal to @end
- *
- * Returns:	key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
-					      struct bpos end)
-{
-	if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
-	    !bkey_eq(iter->pos, POS_MAX) &&
-	    !((iter->flags & BTREE_ITER_is_extents) &&
-	      iter->pos.offset == U64_MAX)) {
-
-		/*
-		 * bkey_start_pos(), for extents, is not monotonically
-		 * increasing until after filtering for snapshots:
-		 *
-		 * Thus, for extents we need to search forward until we find a
-		 * real visible extent - easiest to just use peek_slot() (which
-		 * internally uses peek() for extents)
-		 */
-		struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
-		if (bkey_err(k))
-			return k;
-
-		if (!bkey_deleted(k.k) &&
-		    (!(iter->flags & BTREE_ITER_is_extents) ||
-		     bkey_lt(bkey_start_pos(k.k), iter->pos)))
-			return k;
-	}
-
-	struct bpos search_key = iter->pos;
-	struct bkey_s_c k;
-	btree_path_idx_t saved_path = 0;
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	bch2_btree_iter_verify_entry_exit(iter);
-	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
-
-	int ret = trans_maybe_inject_restart(trans, _RET_IP_);
-	if (unlikely(ret)) {
-		k = bkey_s_c_err(ret);
-		goto out_no_locked;
-	}
-
-	while (1) {
-		k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
-		if (unlikely(!k.k))
-			goto end;
-		if (unlikely(bkey_err(k)))
-			goto out_no_locked;
-
-		if (iter->flags & BTREE_ITER_filter_snapshots) {
-			struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
-			if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
-				/*
-				 * If we have a saved candidate, and we're past
-				 * the last possible snapshot overwrite, return
-				 * it:
-				 */
-				bch2_path_put(trans, iter->path,
-					      iter->flags & BTREE_ITER_intent);
-				iter->path = saved_path;
-				saved_path = 0;
-				k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
-				break;
-			}
-
-			/*
-			 * We need to check against @end before FILTER_SNAPSHOTS because
-			 * if we get to a different inode than requested we might be
-			 * seeing keys for a different snapshot tree that will all be
-			 * filtered out.
-			 */
-			if (unlikely(bkey_lt(k.k->p, end)))
-				goto end;
-
-			if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
-				search_key = bpos_predecessor(k.k->p);
-				continue;
-			}
-
-			if (k.k->p.snapshot != iter->snapshot) {
-				/*
-				 * Have a key visible in iter->snapshot, but
-				 * might have overwrites: - save it and keep
-				 * searching. Unless it's a whiteout - then drop
-				 * our previous saved candidate:
-				 */
-				if (saved_path) {
-					bch2_path_put(trans, saved_path,
-						      iter->flags & BTREE_ITER_intent);
-					saved_path = 0;
-				}
-
-				if (!bkey_whiteout(k.k)) {
-					saved_path = btree_path_clone(trans, iter->path,
-								iter->flags & BTREE_ITER_intent,
-								_THIS_IP_);
-					trace_btree_path_save_pos(trans,
-								  trans->paths + iter->path,
-								  trans->paths + saved_path);
-				}
-
-				search_key = bpos_predecessor(k.k->p);
-				continue;
-			}
-
-			if (bkey_whiteout(k.k)) {
-				search_key = bkey_predecessor(iter, k.k->p);
-				search_key.snapshot = U32_MAX;
-				continue;
-			}
-		}
-
-		EBUG_ON(iter->flags & BTREE_ITER_all_snapshots		? bpos_gt(k.k->p, iter->pos) :
-			iter->flags & BTREE_ITER_is_extents		? bkey_ge(bkey_start_pos(k.k), iter->pos) :
-									  bkey_gt(k.k->p, iter->pos));
-
-		if (unlikely(iter->flags & BTREE_ITER_all_snapshots	? bpos_lt(k.k->p, end) :
-			     iter->flags & BTREE_ITER_is_extents	? bkey_le(k.k->p, end) :
-									  bkey_lt(k.k->p, end)))
-			goto end;
-
-		break;
-	}
-
-	/* Extents can straddle iter->pos: */
-	iter->pos = bpos_min(iter->pos, k.k->p);
-
-	if (iter->flags & BTREE_ITER_filter_snapshots)
-		iter->pos.snapshot = iter->snapshot;
-out_no_locked:
-	if (saved_path)
-		bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent);
-
-	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(trans, iter);
-
-	if (trace_btree_iter_peek_prev_min_enabled()) {
-		CLASS(printbuf, buf)();
-
-		int ret = bkey_err(k);
-		if (ret)
-			prt_str(&buf, bch2_err_str(ret));
-		else if (k.k)
-			bch2_bkey_val_to_text(&buf, trans->c, k);
-		else
-			prt_str(&buf, "(null)");
-		trace_btree_iter_peek_prev_min(trans->c, buf.buf);
-	}
-	return k;
-end:
-	bch2_btree_iter_set_pos(trans, iter, end);
-	k = bkey_s_c_null;
-	goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_prev() - returns first key less than iterator's current
- * position
- * @trans:	btree transaction object
- * @iter:	iterator to peek from
- *
- * Returns:	key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
-{
-	if (!bch2_btree_iter_rewind(trans, iter))
-		return bkey_s_c_null;
-
-	return bch2_btree_iter_peek_prev(trans, iter);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
-	struct bpos search_key;
-	struct bkey_s_c k;
-	int ret;
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	bch2_btree_iter_verify(trans, iter);
-	bch2_btree_iter_verify_entry_exit(iter);
-	EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
-
-	ret = trans_maybe_inject_restart(trans, _RET_IP_);
-	if (unlikely(ret)) {
-		k = bkey_s_c_err(ret);
-		goto out;
-	}
-
-	/* extents can't span inode numbers: */
-	if ((iter->flags & BTREE_ITER_is_extents) &&
-	    unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
-		if (iter->pos.inode == KEY_INODE_MAX) {
-			k = bkey_s_c_null;
-			goto out2;
-		}
-
-		bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
-	}
-
-	search_key = btree_iter_search_key(iter);
-	iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-					iter->flags & BTREE_ITER_intent,
-					btree_iter_ip_allocated(iter));
-
-	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-	if (unlikely(ret)) {
-		k = bkey_s_c_err(ret);
-		goto out;
-	}
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	if (unlikely(!btree_path_node(path, path->level))) {
-		k = bkey_s_c_null;
-		goto out2;
-	}
-
-	btree_path_set_should_be_locked(trans, path);
-
-	if ((iter->flags & BTREE_ITER_cached) ||
-	    !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
-		k = bkey_s_c_null;
-
-		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
-			     trans->nr_updates)) {
-			bch2_btree_trans_peek_slot_updates(trans, iter, &k);
-			if (k.k)
-				goto out;
-		}
-
-		if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
-		    (k = btree_trans_peek_slot_journal(trans, iter)).k)
-			goto out;
-
-		if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
-		    (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
-			if (!bkey_err(k))
-				iter->k = *k.k;
-			/* We're not returning a key from iter->path: */
-			goto out;
-		}
-
-		k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
-		if (unlikely(!k.k))
-			goto out;
-
-		if (unlikely(k.k->type == KEY_TYPE_whiteout &&
-			     (iter->flags & BTREE_ITER_filter_snapshots) &&
-			     !(iter->flags & BTREE_ITER_key_cache_fill)))
-			iter->k.type = KEY_TYPE_deleted;
-	} else {
-		struct bpos next;
-		struct bpos end = iter->pos;
-
-		if (iter->flags & BTREE_ITER_is_extents)
-			end.offset = U64_MAX;
-
-		EBUG_ON(btree_iter_path(trans, iter)->level);
-
-		if (iter->flags & BTREE_ITER_intent) {
-			struct btree_iter iter2;
-
-			bch2_trans_copy_iter(trans, &iter2, iter);
-			k = bch2_btree_iter_peek_max(trans, &iter2, end);
-
-			if (k.k && !bkey_err(k)) {
-				swap(iter->key_cache_path, iter2.key_cache_path);
-				iter->k = iter2.k;
-				k.k = &iter->k;
-			}
-			bch2_trans_iter_exit(trans, &iter2);
-		} else {
-			struct bpos pos = iter->pos;
-
-			k = bch2_btree_iter_peek_max(trans, iter, end);
-			if (unlikely(bkey_err(k)))
-				bch2_btree_iter_set_pos(trans, iter, pos);
-			else
-				iter->pos = pos;
-		}
-
-		if (unlikely(bkey_err(k)))
-			goto out;
-
-		next = k.k ? bkey_start_pos(k.k) : POS_MAX;
-
-		if (bkey_lt(iter->pos, next)) {
-			bkey_init(&iter->k);
-			iter->k.p = iter->pos;
-
-			if (iter->flags & BTREE_ITER_is_extents) {
-				bch2_key_resize(&iter->k,
-						min_t(u64, KEY_SIZE_MAX,
-						      (next.inode == iter->pos.inode
-						       ? next.offset
-						       : KEY_OFFSET_MAX) -
-						      iter->pos.offset));
-				EBUG_ON(!iter->k.size);
-			}
-
-			k = (struct bkey_s_c) { &iter->k, NULL };
-		}
-	}
-out:
-	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(trans, iter);
-	ret = bch2_btree_iter_verify_ret(trans, iter, k);
-	if (unlikely(ret))
-		k = bkey_s_c_err(ret);
-out2:
-	if (trace_btree_iter_peek_slot_enabled()) {
-		CLASS(printbuf, buf)();
-
-		int ret = bkey_err(k);
-		if (ret)
-			prt_str(&buf, bch2_err_str(ret));
-		else if (k.k)
-			bch2_bkey_val_to_text(&buf, trans->c, k);
-		else
-			prt_str(&buf, "(null)");
-		trace_btree_iter_peek_slot(trans->c, buf.buf);
-	}
-
-	return k;
-}
-
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
-	if (!bch2_btree_iter_advance(trans, iter))
-		return bkey_s_c_null;
-
-	return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
-	if (!bch2_btree_iter_rewind(trans, iter))
-		return bkey_s_c_null;
-
-	return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-/* Obsolete, but still used by rust wrapper in -tools */
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
-{
-	struct bkey_s_c k;
-
-	while (btree_trans_too_many_iters(trans) ||
-	       (k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
-		bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
-		bch2_trans_begin(trans);
-
-	return k;
-}
-
-/* new transactional stuff: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
-
-	trans_for_each_path(trans, path, i) {
-		BUG_ON(path->sorted_idx >= trans->nr_sorted);
-		BUG_ON(trans->sorted[path->sorted_idx] != i);
-	}
-
-	for (i = 0; i < trans->nr_sorted; i++) {
-		unsigned idx = trans->sorted[i];
-
-		BUG_ON(!test_bit(idx, trans->paths_allocated));
-		BUG_ON(trans->paths[idx].sorted_idx != i);
-	}
-}
-
-static void btree_trans_verify_sorted(struct btree_trans *trans)
-{
-	struct btree_path *path, *prev = NULL;
-	struct trans_for_each_path_inorder_iter iter;
-
-	if (!static_branch_unlikely(&bch2_debug_check_iterators))
-		return;
-
-	trans_for_each_path_inorder(trans, path, iter) {
-		if (prev && btree_path_cmp(prev, path) > 0) {
-			__bch2_dump_trans_paths_updates(trans, true);
-			panic("trans paths out of order!\n");
-		}
-		prev = path;
-	}
-}
-#else
-static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
-static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
-#endif
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
-{
-	int i, l = 0, r = trans->nr_sorted, inc = 1;
-	bool swapped;
-
-	btree_trans_verify_sorted_refs(trans);
-
-	if (trans->paths_sorted)
-		goto out;
-
-	/*
-	 * Cocktail shaker sort: this is efficient because iterators will be
-	 * mostly sorted.
-	 */
-	do {
-		swapped = false;
-
-		for (i = inc > 0 ? l : r - 2;
-		     i + 1 < r && i >= l;
-		     i += inc) {
-			if (btree_path_cmp(trans->paths + trans->sorted[i],
-					   trans->paths + trans->sorted[i + 1]) > 0) {
-				swap(trans->sorted[i], trans->sorted[i + 1]);
-				trans->paths[trans->sorted[i]].sorted_idx = i;
-				trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
-				swapped = true;
-			}
-		}
-
-		if (inc > 0)
-			--r;
-		else
-			l++;
-		inc = -inc;
-	} while (swapped);
-
-	trans->paths_sorted = true;
-out:
-	btree_trans_verify_sorted(trans);
-}
-
-static inline void btree_path_list_remove(struct btree_trans *trans,
-					  struct btree_path *path)
-{
-	EBUG_ON(path->sorted_idx >= trans->nr_sorted);
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-	trans->nr_sorted--;
-	memmove_u64s_down_small(trans->sorted + path->sorted_idx,
-				trans->sorted + path->sorted_idx + 1,
-				DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
-					     sizeof(u64) / sizeof(btree_path_idx_t)));
-#else
-	array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
-#endif
-	for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
-		trans->paths[trans->sorted[i]].sorted_idx = i;
-}
-
-static inline void btree_path_list_add(struct btree_trans *trans,
-				       btree_path_idx_t pos,
-				       btree_path_idx_t path_idx)
-{
-	struct btree_path *path = trans->paths + path_idx;
-
-	path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
-
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-	memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
-			      trans->sorted + path->sorted_idx,
-			      DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
-					   sizeof(u64) / sizeof(btree_path_idx_t)));
-	trans->nr_sorted++;
-	trans->sorted[path->sorted_idx] = path_idx;
-#else
-	array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
-#endif
-
-	for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
-		trans->paths[trans->sorted[i]].sorted_idx = i;
-
-	btree_trans_verify_sorted_refs(trans);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
-{
-	if (iter->update_path)
-		bch2_path_put(trans, iter->update_path,
-			      iter->flags & BTREE_ITER_intent);
-	if (iter->path)
-		bch2_path_put(trans, iter->path,
-			      iter->flags & BTREE_ITER_intent);
-	if (iter->key_cache_path)
-		bch2_path_put(trans, iter->key_cache_path,
-			      iter->flags & BTREE_ITER_intent);
-	iter->path = 0;
-	iter->update_path = 0;
-	iter->key_cache_path = 0;
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *trans,
-			  struct btree_iter *iter,
-			  enum btree_id btree_id, struct bpos pos,
-			  unsigned flags)
-{
-	bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
-			       bch2_btree_iter_flags(trans, btree_id, 0, flags),
-			       _RET_IP_);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *trans,
-			       struct btree_iter *iter,
-			       enum btree_id btree_id,
-			       struct bpos pos,
-			       unsigned locks_want,
-			       unsigned depth,
-			       unsigned flags)
-{
-	flags |= BTREE_ITER_not_extents;
-	flags |= BTREE_ITER_snapshot_field;
-	flags |= BTREE_ITER_all_snapshots;
-
-	if (!depth && btree_id_cached(trans->c, btree_id))
-		flags |= BTREE_ITER_with_key_cache;
-
-	bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
-			       bch2_btree_iter_flags(trans, btree_id, depth, flags),
-			       _RET_IP_);
-
-	iter->min_depth = depth;
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
-	BUG_ON(path->level != depth);
-	BUG_ON(iter->min_depth != depth);
-}
-
-void bch2_trans_copy_iter(struct btree_trans *trans,
-			  struct btree_iter *dst, struct btree_iter *src)
-{
-	*dst = *src;
-#ifdef TRACK_PATH_ALLOCATED
-	dst->ip_allocated = _RET_IP_;
-#endif
-	if (src->path)
-		__btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
-	if (src->update_path)
-		__btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
-	dst->key_cache_path = 0;
-}
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-void bch2_trans_kmalloc_trace_to_text(struct printbuf *out,
-				      darray_trans_kmalloc_trace *trace)
-{
-	printbuf_tabstops_reset(out);
-	printbuf_tabstop_push(out, 60);
-
-	darray_for_each(*trace, i)
-		prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes);
-}
-#endif
-
-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip)
-{
-	struct bch_fs *c = trans->c;
-	unsigned new_top = trans->mem_top + size;
-	unsigned old_bytes = trans->mem_bytes;
-	unsigned new_bytes = roundup_pow_of_two(new_top);
-	int ret;
-	void *new_mem;
-	void *p;
-
-	if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-		struct printbuf buf = PRINTBUF;
-		bch2_log_msg_start(c, &buf);
-		prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n",
-			   BTREE_TRANS_MEM_MAX);
-
-		bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
-		bch2_print_str(c, KERN_ERR, buf.buf);
-		printbuf_exit(&buf);
-#endif
-	}
-
-	ret = trans_maybe_inject_restart(trans, _RET_IP_);
-	if (ret)
-		return ERR_PTR(ret);
-
-	struct btree_transaction_stats *s = btree_trans_stats(trans);
-	if (new_bytes > s->max_mem) {
-		mutex_lock(&s->lock);
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-		darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
-		s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
-						trans->trans_kmalloc_trace.nr);
-
-		memcpy(s->trans_kmalloc_trace.data,
-		       trans->trans_kmalloc_trace.data,
-		       sizeof(s->trans_kmalloc_trace.data[0]) *
-		       s->trans_kmalloc_trace.nr);
-#endif
-		s->max_mem = new_bytes;
-		mutex_unlock(&s->lock);
-	}
-
-	if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) {
-		EBUG_ON(trans->mem_bytes >= new_bytes);
-		return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
-	}
-
-	if (old_bytes) {
-		trans->realloc_bytes_required = new_bytes;
-		trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
-		return ERR_PTR(btree_trans_restart_ip(trans,
-					BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
-	}
-
-	EBUG_ON(trans->mem);
-
-	new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
-	if (unlikely(!new_mem)) {
-		bch2_trans_unlock(trans);
-
-		new_mem = kmalloc(new_bytes, GFP_KERNEL);
-		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
-			new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
-			new_bytes = BTREE_TRANS_MEM_MAX;
-			trans->used_mempool = true;
-		}
-
-		EBUG_ON(!new_mem);
-
-		trans->mem = new_mem;
-		trans->mem_bytes = new_bytes;
-
-		ret = bch2_trans_relock(trans);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-
-	trans->mem = new_mem;
-	trans->mem_bytes = new_bytes;
-
-	p = trans->mem + trans->mem_top;
-	trans->mem_top += size;
-	memset(p, 0, size);
-	return p;
-}
-
-static inline void check_srcu_held_too_long(struct btree_trans *trans)
-{
-	WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
-	     "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
-	     (jiffies - trans->srcu_lock_time) / HZ);
-}
-
-void bch2_trans_srcu_unlock(struct btree_trans *trans)
-{
-	if (trans->srcu_held) {
-		struct bch_fs *c = trans->c;
-		struct btree_path *path;
-		unsigned i;
-
-		trans_for_each_path(trans, path, i)
-			if (path->cached && !btree_node_locked(path, 0))
-				path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
-
-		check_srcu_held_too_long(trans);
-		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-		trans->srcu_held = false;
-	}
-}
-
-static void bch2_trans_srcu_lock(struct btree_trans *trans)
-{
-	if (!trans->srcu_held) {
-		trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
-		trans->srcu_lock_time = jiffies;
-		trans->srcu_held = true;
-	}
-}
-
-/**
- * bch2_trans_begin() - reset a transaction after an interrupted attempt
- * @trans: transaction to reset
- *
- * Returns:	current restart counter, to be used with trans_was_restarted()
- *
- * While iterating over nodes or updating nodes an attempt to lock a btree node
- * may return BCH_ERR_transaction_restart when the trylock fails. When this
- * occurs bch2_trans_begin() should be called and the transaction retried.
- */
-u32 bch2_trans_begin(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned i;
-	u64 now;
-
-	bch2_trans_reset_updates(trans);
-
-	trans->restart_count++;
-	trans->mem_top = 0;
-
-	if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) {
-		EBUG_ON(!trans->mem || !trans->mem_bytes);
-		unsigned new_bytes = trans->realloc_bytes_required;
-		void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
-		if (unlikely(!new_mem)) {
-			bch2_trans_unlock(trans);
-			new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
-
-			EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX);
-
-			if (!new_mem) {
-				new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
-				new_bytes = BTREE_TRANS_MEM_MAX;
-				trans->used_mempool = true;
-				kfree(trans->mem);
-			}
-		}
-		trans->mem = new_mem;
-		trans->mem_bytes = new_bytes;
-	}
-
-	trans_for_each_path(trans, path, i) {
-		path->should_be_locked = false;
-
-		/*
-		 * If the transaction wasn't restarted, we're presuming to be
-		 * doing something new: don't keep iterators except the ones
-		 * that are in use - except for the subvolumes btree:
-		 */
-		if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
-			path->preserve = false;
-
-		/*
-		 * XXX: we probably shouldn't be doing this if the transaction
-		 * was restarted, but currently we still overflow transaction
-		 * iterators if we do that
-		 */
-		if (!path->ref && !path->preserve)
-			__bch2_path_free(trans, i);
-		else
-			path->preserve = false;
-	}
-
-	now = local_clock();
-
-	if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
-	    time_after64(now, trans->last_begin_time + 10))
-		__bch2_time_stats_update(&btree_trans_stats(trans)->duration,
-					 trans->last_begin_time, now);
-
-	if (!trans->restarted &&
-	    (need_resched() ||
-	     time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
-		bch2_trans_unlock(trans);
-		cond_resched();
-		now = local_clock();
-	}
-	trans->last_begin_time = now;
-
-	if (unlikely(trans->srcu_held &&
-		     time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
-		bch2_trans_srcu_unlock(trans);
-
-	trans->last_begin_ip = _RET_IP_;
-
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
-	if (trans->restarted) {
-		trans->restart_count_this_trans++;
-	} else {
-		trans->restart_count_this_trans = 0;
-	}
-#endif
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-	trans->trans_kmalloc_trace.nr = 0;
-#endif
-
-	trans_set_locked(trans, false);
-
-	if (trans->restarted) {
-		bch2_btree_path_traverse_all(trans);
-		trans->notrace_relock_fail = false;
-	}
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	return trans->restart_count;
-}
-
-const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
-
-unsigned bch2_trans_get_fn_idx(const char *fn)
-{
-	for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
-		if (!bch2_btree_transaction_fns[i] ||
-		    bch2_btree_transaction_fns[i] == fn) {
-			bch2_btree_transaction_fns[i] = fn;
-			return i;
-		}
-
-	pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
-	return 0;
-}
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
-	__acquires(&c->btree_trans_barrier)
-{
-	struct btree_trans *trans;
-
-	if (IS_ENABLED(__KERNEL__)) {
-		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
-		if (trans) {
-			memset(trans, 0, offsetof(struct btree_trans, list));
-			goto got_trans;
-		}
-	}
-
-	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
-	memset(trans, 0, sizeof(*trans));
-
-	seqmutex_lock(&c->btree_trans_lock);
-	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
-		struct btree_trans *pos;
-		pid_t pid = current->pid;
-
-		trans->locking_wait.task = current;
-
-		list_for_each_entry(pos, &c->btree_trans_list, list) {
-			struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
-			/*
-			 * We'd much prefer to be stricter here and completely
-			 * disallow multiple btree_trans in the same thread -
-			 * but the data move path calls bch2_write when we
-			 * already have a btree_trans initialized.
-			 */
-			BUG_ON(pos_task &&
-			       pid == pos_task->pid &&
-			       pos->locked);
-		}
-	}
-
-	list_add(&trans->list, &c->btree_trans_list);
-	seqmutex_unlock(&c->btree_trans_lock);
-got_trans:
-	trans->c = c;
-	trans->last_begin_time = local_clock();
-	trans->fn_idx = fn_idx;
-	trans->locking_wait.task = current;
-	trans->journal_replay_not_finished =
-		unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
-		atomic_inc_not_zero(&c->journal_keys.ref);
-	trans->nr_paths = ARRAY_SIZE(trans->_paths);
-	trans->paths_allocated = trans->_paths_allocated;
-	trans->sorted = trans->_sorted;
-	trans->paths = trans->_paths;
-	trans->updates = trans->_updates;
-
-	*trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
-
-	trans->paths_allocated[0] = 1;
-
-	static struct lock_class_key lockdep_key;
-	lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0);
-
-	if (fn_idx < BCH_TRANSACTIONS_NR) {
-		trans->fn = bch2_btree_transaction_fns[fn_idx];
-
-		struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
-
-		if (s->max_mem) {
-			unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
-
-			trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
-			if (likely(trans->mem))
-				trans->mem_bytes = expected_mem_bytes;
-		}
-
-		trans->nr_paths_max = s->nr_max_paths;
-	}
-
-	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-	trans->srcu_lock_time = jiffies;
-	trans->srcu_held = true;
-	trans_set_locked(trans, false);
-
-	closure_init_stack_release(&trans->ref);
-	return trans;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static bool btree_paths_leaked(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path(trans, path, i)
-		if (path->ref)
-			return true;
-	return false;
-}
-
-static void check_btree_paths_leaked(struct btree_trans *trans)
-{
-	if (btree_paths_leaked(trans)) {
-		struct bch_fs *c = trans->c;
-		struct btree_path *path;
-		unsigned i;
-
-		struct printbuf buf = PRINTBUF;
-		bch2_log_msg_start(c, &buf);
-
-		prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn);
-		trans_for_each_path(trans, path, i)
-			if (path->ref)
-				prt_printf(&buf, "btree %s %pS\n",
-					   bch2_btree_id_str(path->btree_id),
-					   (void *) path->ip_allocated);
-
-		bch2_fs_emergency_read_only2(c, &buf);
-		bch2_print_str(c, KERN_ERR, buf.buf);
-		printbuf_exit(&buf);
-	}
-}
-#else
-static inline void check_btree_paths_leaked(struct btree_trans *trans) {}
-#endif
-
-void bch2_trans_put(struct btree_trans *trans)
-	__releases(&c->btree_trans_barrier)
-{
-	struct bch_fs *c = trans->c;
-
-	if (trans->restarted)
-		bch2_trans_in_restart_error(trans);
-
-	bch2_trans_unlock(trans);
-
-	trans_for_each_update(trans, i)
-		__btree_path_put(trans, trans->paths + i->path, true);
-	trans->nr_updates = 0;
-
-	check_btree_paths_leaked(trans);
-
-	if (trans->srcu_held) {
-		check_srcu_held_too_long(trans);
-		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-	}
-
-	if (unlikely(trans->journal_replay_not_finished))
-		bch2_journal_keys_put(c);
-
-	/*
-	 * trans->ref protects trans->locking_wait.task, btree_paths array; used
-	 * by cycle detector
-	 */
-	closure_return_sync(&trans->ref);
-	trans->locking_wait.task = NULL;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-	darray_exit(&trans->last_restarted_trace);
-#endif
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-	darray_exit(&trans->trans_kmalloc_trace);
-#endif
-
-	unsigned long *paths_allocated = trans->paths_allocated;
-	trans->paths_allocated = NULL;
-	trans->paths = NULL;
-
-	if (paths_allocated != trans->_paths_allocated)
-		kvfree_rcu_mightsleep(paths_allocated);
-
-	if (trans->used_mempool)
-		mempool_free(trans->mem, &c->btree_trans_mem_pool);
-	else
-		kfree(trans->mem);
-
-	/* Userspace doesn't have a real percpu implementation: */
-	if (IS_ENABLED(__KERNEL__))
-		trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
-
-	if (trans) {
-		seqmutex_lock(&c->btree_trans_lock);
-		list_del(&trans->list);
-		seqmutex_unlock(&c->btree_trans_lock);
-
-		mempool_free(trans, &c->btree_trans_pool);
-	}
-}
-
-bool bch2_current_has_btree_trans(struct bch_fs *c)
-{
-	seqmutex_lock(&c->btree_trans_lock);
-	struct btree_trans *trans;
-	bool ret = false;
-	list_for_each_entry(trans, &c->btree_trans_list, list)
-		if (trans->locking_wait.task == current &&
-		    trans->locked) {
-			ret = true;
-			break;
-		}
-	seqmutex_unlock(&c->btree_trans_lock);
-	return ret;
-}
-
-static void __maybe_unused
-bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
-				      struct btree_bkey_cached_common *b)
-{
-	struct six_lock_count c = six_lock_counts(&b->lock);
-	pid_t pid;
-
-	scoped_guard(rcu) {
-		struct task_struct *owner = READ_ONCE(b->lock.owner);
-		pid = owner ? owner->pid : 0;
-	}
-
-	prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
-	bch2_btree_id_to_text(out, b->btree_id);
-	prt_printf(out, " l=%u:", b->level);
-	bch2_bpos_to_text(out, btree_node_pos(b));
-
-	prt_printf(out, "\t locks %u:%u:%u held by pid %u",
-		   c.n[0], c.n[1], c.n[2], pid);
-}
-
-void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
-{
-	struct btree_bkey_cached_common *b;
-	static char lock_types[] = { 'r', 'i', 'w' };
-	struct task_struct *task = READ_ONCE(trans->locking_wait.task);
-	unsigned l, idx;
-
-	/* before rcu_read_lock(): */
-	bch2_printbuf_make_room(out, 4096);
-
-	if (!out->nr_tabstops) {
-		printbuf_tabstop_push(out, 16);
-		printbuf_tabstop_push(out, 32);
-	}
-
-	prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
-
-	/* trans->paths is rcu protected vs. freeing */
-	guard(rcu)();
-	out->atomic++;
-
-	struct btree_path *paths = rcu_dereference(trans->paths);
-	if (!paths)
-		goto out;
-
-	unsigned long *paths_allocated = trans_paths_allocated(paths);
-
-	trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
-		struct btree_path *path = paths + idx;
-		if (!path->nodes_locked)
-			continue;
-
-		prt_printf(out, "  path %u %c ",
-			   idx,
-			   path->cached ? 'c' : 'b');
-		bch2_btree_id_to_text(out, path->btree_id);
-		prt_printf(out, " l=%u:", path->level);
-		bch2_bpos_to_text(out, path->pos);
-		prt_newline(out);
-
-		for (l = 0; l < BTREE_MAX_DEPTH; l++) {
-			if (btree_node_locked(path, l) &&
-			    !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
-				prt_printf(out, "    %c l=%u ",
-					   lock_types[btree_node_locked_type(path, l)], l);
-				bch2_btree_bkey_cached_common_to_text(out, b);
-				prt_newline(out);
-			}
-		}
-	}
-
-	b = READ_ONCE(trans->locking);
-	if (b) {
-		prt_printf(out, "  blocked for %lluus on\n",
-			   div_u64(local_clock() - trans->locking_wait.start_time, 1000));
-		prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
-		bch2_btree_bkey_cached_common_to_text(out, b);
-		prt_newline(out);
-	}
-out:
-	--out->atomic;
-}
-
-void bch2_fs_btree_iter_exit(struct bch_fs *c)
-{
-	struct btree_transaction_stats *s;
-	struct btree_trans *trans;
-	int cpu;
-
-	if (c->btree_trans_bufs)
-		for_each_possible_cpu(cpu) {
-			struct btree_trans *trans =
-				per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
-
-			if (trans) {
-				seqmutex_lock(&c->btree_trans_lock);
-				list_del(&trans->list);
-				seqmutex_unlock(&c->btree_trans_lock);
-			}
-			kfree(trans);
-		}
-	free_percpu(c->btree_trans_bufs);
-
-	trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
-	if (trans)
-		panic("%s leaked btree_trans\n", trans->fn);
-
-	for (s = c->btree_transaction_stats;
-	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
-	     s++) {
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-		darray_exit(&s->trans_kmalloc_trace);
-#endif
-		kfree(s->max_paths_text);
-		bch2_time_stats_exit(&s->lock_hold_times);
-	}
-
-	if (c->btree_trans_barrier_initialized) {
-		synchronize_srcu_expedited(&c->btree_trans_barrier);
-		cleanup_srcu_struct(&c->btree_trans_barrier);
-	}
-	mempool_exit(&c->btree_trans_mem_pool);
-	mempool_exit(&c->btree_trans_pool);
-}
-
-void bch2_fs_btree_iter_init_early(struct bch_fs *c)
-{
-	struct btree_transaction_stats *s;
-
-	for (s = c->btree_transaction_stats;
-	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
-	     s++) {
-		bch2_time_stats_init(&s->duration);
-		bch2_time_stats_init(&s->lock_hold_times);
-		mutex_init(&s->lock);
-	}
-
-	INIT_LIST_HEAD(&c->btree_trans_list);
-	seqmutex_init(&c->btree_trans_lock);
-}
-
-int bch2_fs_btree_iter_init(struct bch_fs *c)
-{
-	int ret;
-
-	c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
-	if (!c->btree_trans_bufs)
-		return -ENOMEM;
-
-	ret   = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
-					  sizeof(struct btree_trans)) ?:
-		mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
-					  BTREE_TRANS_MEM_MAX) ?:
-		init_srcu_struct(&c->btree_trans_barrier);
-	if (ret)
-		return ret;
-
-	/*
-	 * static annotation (hackily done) for lock ordering of reclaim vs.
-	 * btree node locks:
-	 */
-#ifdef CONFIG_LOCKDEP
-	fs_reclaim_acquire(GFP_KERNEL);
-	struct btree_trans *trans = bch2_trans_get(c);
-	trans_set_locked(trans, false);
-	bch2_trans_put(trans);
-	fs_reclaim_release(GFP_KERNEL);
-#endif
-
-	c->btree_trans_barrier_initialized = true;
-	return 0;
-
-}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
deleted file mode 100644
index 09dd3e52622e48..00000000000000
--- a/fs/bcachefs/btree_iter.h
+++ /dev/null
@@ -1,1010 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_ITER_H
-#define _BCACHEFS_BTREE_ITER_H
-
-#include "bset.h"
-#include "btree_types.h"
-#include "trace.h"
-
-void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
-void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_paths_updates(struct btree_trans *);
-
-static inline int __bkey_err(const struct bkey *k)
-{
-	return PTR_ERR_OR_ZERO(k);
-}
-
-#define bkey_err(_k) __bkey_err((_k).k)
-
-static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
-{
-	unsigned idx = path - trans->paths;
-
-	EBUG_ON(idx >= trans->nr_paths);
-	EBUG_ON(!test_bit(idx, trans->paths_allocated));
-	if (unlikely(path->ref == U8_MAX)) {
-		bch2_dump_trans_paths_updates(trans);
-		panic("path %u refcount overflow\n", idx);
-	}
-
-	path->ref++;
-	path->intent_ref += intent;
-	trace_btree_path_get_ll(trans, path);
-}
-
-static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
-{
-	EBUG_ON(path - trans->paths >= trans->nr_paths);
-	EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
-	EBUG_ON(!path->ref);
-	EBUG_ON(!path->intent_ref && intent);
-
-	trace_btree_path_put_ll(trans, path);
-	path->intent_ref -= intent;
-	return --path->ref == 0;
-}
-
-static inline void btree_path_set_dirty(struct btree_trans *trans,
-					struct btree_path *path,
-					enum btree_path_uptodate u)
-{
-	BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
-	path->uptodate = max_t(unsigned, path->uptodate, u);
-}
-
-static inline struct btree *btree_path_node(struct btree_path *path,
-					    unsigned level)
-{
-	return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
-}
-
-static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
-					const struct btree *b, unsigned level)
-{
-	return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
-}
-
-static inline struct btree *btree_node_parent(struct btree_path *path,
-					      struct btree *b)
-{
-	return btree_path_node(path, b->c.level + 1);
-}
-
-/* Iterate over paths within a transaction: */
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *);
-
-static inline void btree_trans_sort_paths(struct btree_trans *trans)
-{
-	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-	    trans->paths_sorted)
-		return;
-	__bch2_btree_trans_sort_paths(trans);
-}
-
-static inline unsigned long *trans_paths_nr(struct btree_path *paths)
-{
-	return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths;
-}
-
-static inline unsigned long *trans_paths_allocated(struct btree_path *paths)
-{
-	unsigned long *v = trans_paths_nr(paths);
-	return v - BITS_TO_LONGS(*v);
-}
-
-#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\
-	for (_idx = _start;						\
-	     (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr;	\
-	     _idx++)
-
-static inline struct btree_path *
-__trans_next_path(struct btree_trans *trans, unsigned *idx)
-{
-	unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG;
-	/*
-	 * Open coded find_next_bit(), because
-	 *  - this is a fast path, we can't afford the function call
-	 *  - and we know that nr_paths is a multiple of BITS_PER_LONG,
-	 */
-	while (*idx < trans->nr_paths) {
-		unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1));
-		if (v) {
-			*idx += __ffs(v);
-			return trans->paths + *idx;
-		}
-
-		*idx += BITS_PER_LONG;
-		*idx &= ~(BITS_PER_LONG - 1);
-		w++;
-	}
-
-	return NULL;
-}
-
-/*
- * This version is intended to be safe for use on a btree_trans that is owned by
- * another thread, for bch2_btree_trans_to_text();
- */
-#define trans_for_each_path_from(_trans, _path, _idx, _start)		\
-	for (_idx = _start;						\
-	     (_path = __trans_next_path((_trans), &_idx));		\
-	     _idx++)
-
-#define trans_for_each_path(_trans, _path, _idx)			\
-	trans_for_each_path_from(_trans, _path, _idx, 1)
-
-static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
-	unsigned idx = path ? path->sorted_idx + 1 : 0;
-
-	EBUG_ON(idx > trans->nr_sorted);
-
-	return idx < trans->nr_sorted
-		? trans->paths + trans->sorted[idx]
-		: NULL;
-}
-
-static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
-	unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
-
-	return idx
-		? trans->paths + trans->sorted[idx - 1]
-		: NULL;
-}
-
-#define trans_for_each_path_idx_inorder(_trans, _iter)			\
-	for (_iter = (struct trans_for_each_path_inorder_iter) { 0 };	\
-	     (_iter.path_idx = trans->sorted[_iter.sorted_idx],		\
-	      _iter.sorted_idx < (_trans)->nr_sorted);			\
-	     _iter.sorted_idx++)
-
-struct trans_for_each_path_inorder_iter {
-	btree_path_idx_t	sorted_idx;
-	btree_path_idx_t	path_idx;
-};
-
-#define trans_for_each_path_inorder(_trans, _path, _iter)		\
-	for (_iter = (struct trans_for_each_path_inorder_iter) { 0 };	\
-	     (_iter.path_idx = trans->sorted[_iter.sorted_idx],		\
-	      _path = (_trans)->paths + _iter.path_idx,			\
-	      _iter.sorted_idx < (_trans)->nr_sorted);			\
-	     _iter.sorted_idx++)
-
-#define trans_for_each_path_inorder_reverse(_trans, _path, _i)		\
-	for (_i = trans->nr_sorted - 1;					\
-	     ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\
-	     --_i)
-
-static inline bool __path_has_node(const struct btree_path *path,
-				   const struct btree *b)
-{
-	return path->l[b->c.level].b == b &&
-		btree_node_lock_seq_matches(path, b, b->c.level);
-}
-
-static inline struct btree_path *
-__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
-			    unsigned *idx)
-{
-	struct btree_path *path;
-
-	while ((path = __trans_next_path(trans, idx)) &&
-		!__path_has_node(path, b))
-		(*idx)++;
-
-	return path;
-}
-
-#define trans_for_each_path_with_node(_trans, _b, _path, _iter)	\
-	for (_iter = 1;							\
-	     (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\
-	     _iter++)
-
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t,
-					    bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_make_mut(struct btree_trans *trans,
-			 btree_path_idx_t path, bool intent,
-			 unsigned long ip)
-{
-	if (trans->paths[path].ref > 1 ||
-	    trans->paths[path].preserve)
-		path = __bch2_btree_path_make_mut(trans, path, intent, ip);
-	trans->paths[path].should_be_locked = false;
-	return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t,
-			  struct bpos, bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_set_pos(struct btree_trans *trans,
-			btree_path_idx_t path, struct bpos new_pos,
-			bool intent, unsigned long ip)
-{
-	return !bpos_eq(new_pos, trans->paths[path].pos)
-		? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip)
-		: path;
-}
-
-int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
-					      btree_path_idx_t,
-					      unsigned, unsigned long);
-
-static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *);
-
-static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
-					  btree_path_idx_t path, unsigned flags)
-{
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-	if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
-		return 0;
-
-	return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
-}
-
-btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
-				 unsigned, unsigned, unsigned, unsigned long);
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id,
-					    unsigned, struct bpos);
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
-
-/*
- * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
- * different snapshot:
- */
-static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
-{
-	struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
-
-	if (k.k && bpos_eq(path->pos, k.k->p))
-		return k;
-
-	bkey_init(u);
-	u->p = path->pos;
-	return (struct bkey_s_c) { u, NULL };
-}
-
-struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
-					struct btree_iter *, struct bpos);
-
-void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
-
-int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
-
-static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
-{
-	return mutex_trylock(lock)
-		? 0
-		: __bch2_trans_mutex_lock(trans, lock);
-}
-
-/* Debug: */
-
-void __bch2_trans_verify_paths(struct btree_trans *);
-void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
-
-static inline void bch2_trans_verify_paths(struct btree_trans *trans)
-{
-	if (static_branch_unlikely(&bch2_debug_check_iterators))
-		__bch2_trans_verify_paths(trans);
-}
-
-static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree,
-					  struct bpos pos)
-{
-	if (static_branch_unlikely(&bch2_debug_check_iterators))
-		__bch2_assert_pos_locked(trans, btree, pos);
-}
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
-				      struct btree *, struct bkey_packed *);
-void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
-			      struct btree *, struct btree_node_iter *,
-			      struct bkey_packed *, unsigned, unsigned);
-
-int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
-
-void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool);
-
-int bch2_trans_relock(struct btree_trans *);
-int bch2_trans_relock_notrace(struct btree_trans *);
-void bch2_trans_unlock(struct btree_trans *);
-void bch2_trans_unlock_long(struct btree_trans *);
-
-static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
-{
-	return restart_count != trans->restart_count
		? -BCH_ERR_transaction_restart_nested
-		: 0;
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
-
-static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
-						   u32 restart_count)
-{
-	if (trans_was_restarted(trans, restart_count))
-		bch2_trans_restart_error(trans, restart_count);
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *);
-
-static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans)
-{
-	if (trans->restarted || !trans->locked)
-		bch2_trans_unlocked_or_in_restart_error(trans);
-}
-
-__always_inline
-static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip)
-{
-	BUG_ON(err <= 0);
-	BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
-
-	trans->restarted = err;
-	trans->last_restarted_ip = ip;
-	return -err;
-}
-
-__always_inline
-static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
-{
-	btree_trans_restart_foreign_task(trans, err, ip);
-#ifdef CONFIG_BCACHEFS_DEBUG
-	darray_exit(&trans->last_restarted_trace);
-	bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
-#endif
-	return -err;
-}
-
-__always_inline
-static int btree_trans_restart(struct btree_trans *trans, int err)
-{
-	return btree_trans_restart_ip(trans, err, _THIS_IP_);
-}
-
-static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
-{
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
-	if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) {
-		trace_and_count(trans->c, trans_restart_injected, trans, ip);
-		return btree_trans_restart_ip(trans,
-					BCH_ERR_transaction_restart_fault_inject, ip);
-	}
-#endif
-	return 0;
-}
-
-bool bch2_btree_node_upgrade(struct btree_trans *,
-			     struct btree_path *, unsigned);
-
-void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
-
-static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
-					     struct btree_path *path)
-{
-	unsigned new_locks_want = path->level + !!path->intent_ref;
-
-	if (path->locks_want > new_locks_want)
-		__bch2_btree_path_downgrade(trans, path, new_locks_want);
-}
-
-void bch2_trans_downgrade(struct btree_trans *);
-
-void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
-void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
-void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
-
-int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-
-struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos);
-struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *);
-
-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans,
-						   struct btree_iter *iter)
-{
-	return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos);
-
-static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter)
-{
-	return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN);
-}
-
-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *);
-
-bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *);
-bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *);
-
-static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
-{
-	iter->k.type = KEY_TYPE_deleted;
-	iter->k.p.inode		= iter->pos.inode	= new_pos.inode;
-	iter->k.p.offset	= iter->pos.offset	= new_pos.offset;
-	iter->k.p.snapshot	= iter->pos.snapshot	= new_pos.snapshot;
-	iter->k.size = 0;
-}
-
-static inline void bch2_btree_iter_set_pos(struct btree_trans *trans,
-					   struct btree_iter *iter, struct bpos new_pos)
-{
-	if (unlikely(iter->update_path))
-		bch2_path_put(trans, iter->update_path,
-			      iter->flags & BTREE_ITER_intent);
-	iter->update_path = 0;
-
-	if (!(iter->flags & BTREE_ITER_all_snapshots))
-		new_pos.snapshot = iter->snapshot;
-
-	__bch2_btree_iter_set_pos(iter, new_pos);
-}
-
-static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
-{
-	BUG_ON(!(iter->flags & BTREE_ITER_is_extents));
-	iter->pos = bkey_start_pos(&iter->k);
-}
-
-static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans,
-						struct btree_iter *iter, u32 snapshot)
-{
-	struct bpos pos = iter->pos;
-
-	iter->snapshot = snapshot;
-	pos.snapshot = snapshot;
-	bch2_btree_iter_set_pos(trans, iter, pos);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
-
-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
-					     unsigned btree_id,
-					     unsigned level,
-					     unsigned flags)
-{
-	if (level || !btree_id_cached(trans->c, btree_id)) {
-		flags &= ~BTREE_ITER_cached;
-		flags &= ~BTREE_ITER_with_key_cache;
-	} else if (!(flags & BTREE_ITER_cached))
-		flags |= BTREE_ITER_with_key_cache;
-
-	if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
-	    btree_id_is_extents(btree_id))
-		flags |= BTREE_ITER_is_extents;
-
-	if (!(flags & BTREE_ITER_snapshot_field) &&
-	    !btree_type_has_snapshot_field(btree_id))
-		flags &= ~BTREE_ITER_all_snapshots;
-
-	if (!(flags & BTREE_ITER_all_snapshots) &&
-	    btree_type_has_snapshots(btree_id))
-		flags |= BTREE_ITER_filter_snapshots;
-
-	if (trans->journal_replay_not_finished)
-		flags |= BTREE_ITER_with_journal;
-
-	return flags;
-}
-
-static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
-					  struct btree_iter *iter,
-					  unsigned btree_id, struct bpos pos,
-					  unsigned locks_want,
-					  unsigned depth,
-					  unsigned flags,
-					  unsigned long ip)
-{
-	iter->update_path	= 0;
-	iter->key_cache_path	= 0;
-	iter->btree_id		= btree_id;
-	iter->min_depth		= 0;
-	iter->flags		= flags;
-	iter->snapshot		= pos.snapshot;
-	iter->pos		= pos;
-	iter->k			= POS_KEY(pos);
-	iter->journal_idx	= 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
-	iter->ip_allocated = ip;
-#endif
-	iter->path = bch2_path_get(trans, btree_id, iter->pos,
-				   locks_want, depth, flags, ip);
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
-			  enum btree_id, struct bpos, unsigned);
-
-static inline void bch2_trans_iter_init(struct btree_trans *trans,
-					struct btree_iter *iter,
-					unsigned btree_id, struct bpos pos,
-					unsigned flags)
-{
-	if (__builtin_constant_p(btree_id) &&
-	    __builtin_constant_p(flags))
-		bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
-				bch2_btree_iter_flags(trans, btree_id, 0, flags),
-				_THIS_IP_);
-	else
-		bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
-			       enum btree_id, struct bpos,
-			       unsigned, unsigned, unsigned);
-void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *);
-
-void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *);
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-void bch2_trans_kmalloc_trace_to_text(struct printbuf *,
-				      darray_trans_kmalloc_trace *);
-#endif
-
-void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long);
-
-static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size,
-					    unsigned long ip)
-{
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-	darray_push(&trans->trans_kmalloc_trace,
-		    ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size }));
-#endif
-}
-
-static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size,
-							     unsigned long ip)
-{
-	size = roundup(size, 8);
-
-	bch2_trans_kmalloc_trace(trans, size, ip);
-
-	if (likely(trans->mem_top + size <= trans->mem_bytes)) {
-		void *p = trans->mem + trans->mem_top;
-
-		trans->mem_top += size;
-		return p;
-	} else {
-		return __bch2_trans_kmalloc(trans, size, ip);
-	}
-}
-
-static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size,
-						   unsigned long ip)
-{
-	size = roundup(size, 8);
-
-	bch2_trans_kmalloc_trace(trans, size, ip);
-
-	if (likely(trans->mem_top + size <= trans->mem_bytes)) {
-		void *p = trans->mem + trans->mem_top;
-
-		trans->mem_top += size;
-		memset(p, 0, size);
-		return p;
-	} else {
-		return __bch2_trans_kmalloc(trans, size, ip);
-	}
-}
-
-/**
- * bch2_trans_kmalloc - allocate memory for use by the current transaction
- *
- * Must be called after bch2_trans_begin, which on second and further calls
- * frees all memory allocated in this transaction
- */
-static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
-{
-	return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_);
-}
-
-static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
-{
-	return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_);
-}
-
-static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
-				struct btree_iter *iter,
-				unsigned btree_id, struct bpos pos,
-				unsigned flags, unsigned type)
-{
-	struct bkey_s_c k;
-
-	bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
-	k = bch2_btree_iter_peek_slot(trans, iter);
-
-	if (!bkey_err(k) && type && k.k->type != type)
-		k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
-	if (unlikely(bkey_err(k)))
-		bch2_trans_iter_exit(trans, iter);
-	return k;
-}
-
-static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
-						 struct btree_iter *iter,
-						 unsigned btree_id, struct bpos pos,
-						 unsigned flags)
-{
-	return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0);
-}
-
-#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
-	bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter,		\
-				       _btree_id, _pos, _flags, KEY_TYPE_##_type))
-
-static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k)
-{
-	unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k));
-
memcpy(dst_v, src_k.v, b); - if (unlikely(b < dst_size)) - memset(dst_v + b, 0, dst_size - b); -} - -#define bkey_val_copy(_dst_v, _src_k) \ -do { \ - BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \ - __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \ -} while (0) - -static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, - unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, - unsigned val_size, void *val) -{ - struct btree_iter iter; - struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); - int ret = bkey_err(k); - if (!ret) { - __bkey_val_copy(val, val_size, k); - bch2_trans_iter_exit(trans, &iter); - } - - return ret; -} - -#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\ - __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ - KEY_TYPE_##_type, sizeof(*_val), _val) - -void bch2_trans_srcu_unlock(struct btree_trans *); - -u32 bch2_trans_begin(struct btree_trans *); - -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b, _do) \ -({ \ - bch2_trans_begin((_trans)); \ - \ - struct btree_iter _iter; \ - bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \ - _start, _locks_want, _depth, _flags); \ - int _ret3 = 0; \ - do { \ - _ret3 = lockrestart_do((_trans), ({ \ - struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\ - if (!_b) \ - break; \ - \ - PTR_ERR_OR_ZERO(_b) ?: (_do); \ - })) ?: \ - lockrestart_do((_trans), \ - PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\ - } while (!_ret3); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b, _do) \ - __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b, _do) - -static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) -{ - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : - bch2_btree_iter_peek_prev(trans, iter); -} - -static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) -{ - return flags & BTREE_ITER_slots ? 
bch2_btree_iter_peek_slot(trans, iter) : - bch2_btree_iter_peek(trans, iter); -} - -static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end, - unsigned flags) -{ - if (!(flags & BTREE_ITER_slots)) - return bch2_btree_iter_peek_max(trans, iter, end); - - if (bkey_gt(iter->pos, end)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_slot(trans, iter); -} - -int __bch2_btree_trans_too_many_iters(struct btree_trans *); - -static inline int btree_trans_too_many_iters(struct btree_trans *trans) -{ - if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8) - return __bch2_btree_trans_too_many_iters(trans); - - return 0; -} - -/* - * goto instead of loop, so that when used inside for_each_btree_key2() - * break/continue work correctly - */ -#define lockrestart_do(_trans, _do) \ -({ \ - __label__ transaction_restart; \ - u32 _restart_count; \ - int _ret2; \ -transaction_restart: \ - _restart_count = bch2_trans_begin(_trans); \ - _ret2 = (_do); \ - \ - if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \ - goto transaction_restart; \ - \ - if (!_ret2) \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - _ret2; \ -}) - -/* - * nested_lockrestart_do(), nested_commit_do(): - * - * These are like lockrestart_do() and commit_do(), with two differences: - * - * - We don't call bch2_trans_begin() unless we had a transaction restart - * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a - * transaction restart - */ -#define nested_lockrestart_do(_trans, _do) \ -({ \ - u32 _restart_count, _orig_restart_count; \ - int _ret2; \ - \ - _restart_count = _orig_restart_count = (_trans)->restart_count; \ - \ - while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\ - _restart_count = bch2_trans_begin(_trans); \ - \ - if (!_ret2) \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - \ - _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ -}) - -#define for_each_btree_key_max_continue(_trans, _iter, \ - _end, _flags, _k, _do) \ -({ \ - struct bkey_s_c _k; \ - int _ret3 = 0; \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \ - _end, (_flags)); \ - if (!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ - } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ - for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) - -#define for_each_btree_key_max(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ -({ \ - bch2_trans_begin(trans); \ - \ - struct btree_iter _iter; \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ -}) - -#define for_each_btree_key(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ - for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ - SPOS_MAX, _flags, _k, _do) - -#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ -({ \ - struct btree_iter _iter; \ - struct bkey_s_c _k; \ - int _ret3 = 0; \ - \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \ - (_flags)); \ - if 
(!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ - } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ - _start, _iter_flags, _k, \ - _disk_res, _journal_seq, _commit_flags,\ - _do) \ - for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - -#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \ - _start, _iter_flags, _k, \ - _disk_res, _journal_seq, _commit_flags,\ - _do) \ - for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - -#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \ - _start, _end, _iter_flags, _k, \ - _disk_res, _journal_seq, _commit_flags,\ - _do) \ - for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, - struct btree_iter *); - -#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(_trans, &(_iter))) - -#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\ - for (; \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(_trans, &(_iter))) - -#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ - SPOS_MAX, _flags, _k, _ret) - -#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_rewind(_trans, &(_iter))) - -#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \ - for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret) - -/* - * This should not be used in a fastpath, without first trying _do in - * nonblocking mode - it will cause excessive transaction restarts and - * potentially livelocking: - */ -#define drop_locks_do(_trans, _do) \ -({ \ - bch2_trans_unlock(_trans); \ - (_do) ?: bch2_trans_relock(_trans); \ -}) - -#define allocate_dropping_locks_errcode(_trans, _do) \ -({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ - int _ret = _do; \ - \ - if (bch2_err_matches(_ret, ENOMEM)) { \ - _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(_trans, _do); \ - } \ - _ret; \ -}) - -#define allocate_dropping_locks(_trans, _ret, _do) \ -({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ - typeof(_do) _p = _do; \ - \ - _ret = 0; \ - if (unlikely(!_p)) { \ - _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(_trans, ((_p = _do), 0)); \ - } \ - _p; \ -}) - -struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); -void bch2_trans_put(struct btree_trans *); - -bool 
bch2_current_has_btree_trans(struct bch_fs *); - -extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; -unsigned bch2_trans_get_fn_idx(const char *); - -#define bch2_trans_get(_c) \ -({ \ - static unsigned trans_fn_idx; \ - \ - if (unlikely(!trans_fn_idx)) \ - trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ - __bch2_trans_get(_c, trans_fn_idx); \ -}) - -/* - * We don't use DEFINE_CLASS() because using a function for the constructor - * breaks bch2_trans_get()'s use of __func__ - */ -typedef struct btree_trans * class_btree_trans_t; -static inline void class_btree_trans_destructor(struct btree_trans **p) -{ - struct btree_trans *trans = *p; - bch2_trans_put(trans); -} - -#define class_btree_trans_constructor(_c) bch2_trans_get(_c) - -#define bch2_trans_run(_c, _do) \ -({ \ - CLASS(btree_trans, trans)(_c); \ - (_do); \ -}) - -#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) - -void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); - -void bch2_fs_btree_iter_exit(struct bch_fs *); -void bch2_fs_btree_iter_init_early(struct bch_fs *); -int bch2_fs_btree_iter_init(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c deleted file mode 100644 index ea839560a13639..00000000000000 --- a/fs/bcachefs/btree_journal_iter.c +++ /dev/null @@ -1,830 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_cache.h" -#include "btree_journal_iter.h" -#include "journal_io.h" - -#include - -/* - * For managing keys we read from the journal: until journal replay works normal - * btree lookups need to be able to find and return keys from the journal where - * they overwrite what's in the btree, so we have a special iterator and - * operations for the regular btree iter code to use: - */ - -static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos) -{ - size_t gap_size = keys->size - keys->nr; - - BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size); - - if (pos >= keys->gap) - pos -= gap_size; - return pos; -} - -static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) -{ - size_t gap_size = keys->size - keys->nr; - - if (idx >= keys->gap) - idx += gap_size; - return idx; -} - -static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) -{ - return keys->data + idx_to_pos(keys, idx); -} - -static size_t __bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - size_t l = 0, r = keys->nr, m; - - while (l < r) { - m = l + ((r - l) >> 1); - if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) - l = m + 1; - else - r = m; - } - - BUG_ON(l < keys->nr && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); - - BUG_ON(l && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); - - return l; -} - -static size_t bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); -} - -/* Returns first non-overwritten key >= search key: */ -struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) -{ - struct journal_keys *keys = &c->journal_keys; - unsigned iters = 0; - struct journal_key *k; - - BUG_ON(*idx > keys->nr); -search: - if (!*idx) - *idx = 
__bch2_journal_key_search(keys, btree_id, level, pos); - - while (*idx && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { - --(*idx); - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - struct bkey_i *ret = NULL; - rcu_read_lock(); /* for overwritten_ranges */ - - while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { - if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - break; - - if (k->overwritten) { - if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->end; - else - *idx += 1; - continue; - } - - if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { - ret = k->k; - break; - } - - (*idx)++; - iters++; - if (iters == 10) { - *idx = 0; - rcu_read_unlock(); - goto search; - } - } - - rcu_read_unlock(); - return ret; -} - -struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) -{ - struct journal_keys *keys = &c->journal_keys; - unsigned iters = 0; - struct journal_key *k; - - BUG_ON(*idx > keys->nr); - - if (!keys->nr) - return NULL; -search: - if (!*idx) - *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - - while (*idx < keys->nr && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { - (*idx)++; - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - if (*idx == keys->nr) - --(*idx); - - struct bkey_i *ret = NULL; - rcu_read_lock(); /* for overwritten_ranges */ - - while (true) { - k = idx_to_key(keys, *idx); - if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) - break; - - if (k->overwritten) { - if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->start; - if (!*idx) - break; - --(*idx); - continue; - } - - if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { - ret = k->k; - break; - } - - if (!*idx) - break; - --(*idx); - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - rcu_read_unlock(); - return ret; -} - -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos) -{ - size_t idx = 0; - - return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx); -} - -static void journal_iter_verify(struct journal_iter *iter) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct journal_keys *keys = iter->keys; - size_t gap_size = keys->size - keys->nr; - - BUG_ON(iter->idx >= keys->gap && - iter->idx < keys->gap + gap_size); - - if (iter->idx < keys->size) { - struct journal_key *k = keys->data + iter->idx; - - int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); - BUG_ON(cmp > 0); - } -#endif -} - -static void journal_iters_fix(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - /* The key we just inserted is immediately before the gap: */ - size_t gap_end = keys->gap + (keys->size - keys->nr); - struct journal_key *new_key = &keys->data[keys->gap - 1]; - struct journal_iter *iter; - - /* - * If an iterator points one after the key we just inserted, decrement - * the iterator so it points at the key we just inserted - if the - * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will - * handle that: - */ - list_for_each_entry(iter, &c->journal_iters, list) { - journal_iter_verify(iter); - if (iter->idx == gap_end && - new_key->btree_id == iter->btree_id && - new_key->level == iter->level) - iter->idx = keys->gap - 1; - journal_iter_verify(iter); - } -} - -static void journal_iters_move_gap(struct 
bch_fs *c, size_t old_gap, size_t new_gap) -{ - struct journal_keys *keys = &c->journal_keys; - struct journal_iter *iter; - size_t gap_size = keys->size - keys->nr; - - list_for_each_entry(iter, &c->journal_iters, list) { - if (iter->idx > old_gap) - iter->idx -= gap_size; - if (iter->idx >= new_gap) - iter->idx += gap_size; - } -} - -int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct journal_key n = { - .btree_id = id, - .level = level, - .k = k, - .allocated = true, - /* - * Ensure these keys are done last by journal replay, to unblock - * journal reclaim: - */ - .journal_seq = U64_MAX, - }; - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); - - BUG_ON(test_bit(BCH_FS_rw, &c->flags)); - - if (idx < keys->size && - journal_key_cmp(&n, &keys->data[idx]) == 0) { - if (keys->data[idx].allocated) - kfree(keys->data[idx].k); - keys->data[idx] = n; - return 0; - } - - if (idx > keys->gap) - idx -= keys->size - keys->nr; - - size_t old_gap = keys->gap; - - if (keys->nr == keys->size) { - journal_iters_move_gap(c, old_gap, keys->size); - old_gap = keys->size; - - struct journal_keys new_keys = { - .nr = keys->nr, - .size = max_t(size_t, keys->size, 8) * 2, - }; - - new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL); - if (!new_keys.data) { - bch_err(c, "%s: error allocating new key array (size %zu)", - __func__, new_keys.size); - return bch_err_throw(c, ENOMEM_journal_key_insert); - } - - /* Since @keys was full, there was no gap: */ - memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr); - kvfree(keys->data); - keys->data = new_keys.data; - keys->nr = new_keys.nr; - keys->size = new_keys.size; - - /* And now the gap is at the end: */ - keys->gap = keys->nr; - } - - journal_iters_move_gap(c, old_gap, idx); - - move_gap(keys, idx); - - keys->nr++; - keys->data[keys->gap++] = n; - - journal_iters_fix(c); - - return 0; -} - -/* - * Can only be used from the recovery thread while we're still RO - can't be - * used once we've got RW, as journal_keys is at that point used by multiple - * threads: - */ -int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct bkey_i *n; - int ret; - - n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); - if (!n) - return bch_err_throw(c, ENOMEM_journal_key_insert); - - bkey_copy(n, k); - ret = bch2_journal_key_insert_take(c, id, level, n); - if (ret) - kfree(n); - return ret; -} - -int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, - unsigned level, struct bpos pos) -{ - struct bkey_i whiteout; - - bkey_init(&whiteout.k); - whiteout.k.p = pos; - - return bch2_journal_key_insert(c, id, level, &whiteout); -} - -bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, - unsigned level, struct bpos pos) -{ - struct journal_keys *keys = &trans->c->journal_keys; - size_t idx = bch2_journal_key_search(keys, btree, level, pos); - - if (!trans->journal_replay_not_finished) - return false; - - return (idx < keys->size && - keys->data[idx].btree_id == btree && - keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos) && - bkey_deleted(&keys->data[idx].k->k)); -} - -static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) -{ - struct journal_key *k = keys->data + pos; - size_t idx = pos_to_idx(keys, pos); - - k->overwritten = true; - - struct journal_key *prev = idx > 0 ? 
keys->data + idx_to_pos(keys, idx - 1) : NULL; - struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL; - - bool prev_overwritten = prev && prev->overwritten; - bool next_overwritten = next && next->overwritten; - - struct journal_key_range_overwritten *prev_range = - prev_overwritten ? prev->overwritten_range : NULL; - struct journal_key_range_overwritten *next_range = - next_overwritten ? next->overwritten_range : NULL; - - BUG_ON(prev_range && prev_range->end != idx); - BUG_ON(next_range && next_range->start != idx + 1); - - if (prev_range && next_range) { - prev_range->end = next_range->end; - - keys->data[pos].overwritten_range = prev_range; - for (size_t i = next_range->start; i < next_range->end; i++) { - struct journal_key *ip = keys->data + idx_to_pos(keys, i); - BUG_ON(ip->overwritten_range != next_range); - ip->overwritten_range = prev_range; - } - - kfree_rcu_mightsleep(next_range); - } else if (prev_range) { - prev_range->end++; - k->overwritten_range = prev_range; - if (next_overwritten) { - prev_range->end++; - next->overwritten_range = prev_range; - } - } else if (next_range) { - next_range->start--; - k->overwritten_range = next_range; - if (prev_overwritten) { - next_range->start--; - prev->overwritten_range = next_range; - } - } else if (prev_overwritten || next_overwritten) { - struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); - if (!r) - return; - - r->start = idx - (size_t) prev_overwritten; - r->end = idx + 1 + (size_t) next_overwritten; - - rcu_assign_pointer(k->overwritten_range, r); - if (prev_overwritten) - prev->overwritten_range = r; - if (next_overwritten) - next->overwritten_range = r; - } -} - -void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, - unsigned level, struct bpos pos) -{ - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, btree, level, pos); - - if (idx < keys->size && - keys->data[idx].btree_id == btree && - keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos) && - !keys->data[idx].overwritten) { - mutex_lock(&keys->overwrite_lock); - __bch2_journal_key_overwritten(keys, idx); - mutex_unlock(&keys->overwrite_lock); - } -} - -static void bch2_journal_iter_advance(struct journal_iter *iter) -{ - if (iter->idx < iter->keys->size) { - iter->idx++; - if (iter->idx == iter->keys->gap) - iter->idx += iter->keys->size - iter->keys->nr; - } -} - -static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) -{ - journal_iter_verify(iter); - - guard(rcu)(); - while (iter->idx < iter->keys->size) { - struct journal_key *k = iter->keys->data + iter->idx; - - int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); - if (cmp < 0) - break; - BUG_ON(cmp); - - if (!k->overwritten) - return bkey_i_to_s_c(k->k); - - if (k->overwritten_range) - iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); - else - bch2_journal_iter_advance(iter); - } - - return bkey_s_c_null; -} - -static void bch2_journal_iter_exit(struct journal_iter *iter) -{ - list_del(&iter->list); -} - -static void bch2_journal_iter_init(struct bch_fs *c, - struct journal_iter *iter, - enum btree_id id, unsigned level, - struct bpos pos) -{ - iter->btree_id = id; - iter->level = level; - iter->keys = &c->journal_keys; - iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); - - journal_iter_verify(iter); -} - -static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter 
*iter) -{ - return bch2_btree_node_iter_peek_unpack(&iter->node_iter, - iter->b, &iter->unpacked); -} - -static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -{ - bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -} - -void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -{ - if (bpos_eq(iter->pos, SPOS_MAX)) - iter->at_end = true; - else - iter->pos = bpos_successor(iter->pos); -} - -static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) -{ - struct btree_and_journal_iter iter = *_iter; - struct bch_fs *c = iter.trans->c; - unsigned level = iter.journal.level; - struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_started, &c->flags) - ? (level > 1 ? 0 : 2) - : (level > 1 ? 1 : 16); - - iter.prefetch = false; - iter.fail_if_too_many_whiteouts = true; - bch2_bkey_buf_init(&tmp); - - while (nr--) { - bch2_btree_and_journal_iter_advance(&iter); - struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); - if (!k.k) - break; - - bch2_bkey_buf_reassemble(&tmp, c, k); - bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); - } - - bch2_bkey_buf_exit(&tmp, c); -} - -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -{ - struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; - size_t iters = 0; - - if (iter->prefetch && iter->journal.level) - btree_and_journal_iter_prefetch(iter); -again: - if (iter->at_end) - return bkey_s_c_null; - - iters++; - - if (iters > 20 && iter->fail_if_too_many_whiteouts) - return bkey_s_c_null; - - while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && - bpos_lt(btree_k.k->p, iter->pos)) - bch2_journal_iter_advance_btree(iter); - - if (iter->trans->journal_replay_not_finished) - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_lt(journal_k.k->p, iter->pos)) - bch2_journal_iter_advance(&iter->journal); - - ret = journal_k.k && - (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) - ? 
journal_k
-		: btree_k;
-
-	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
-		ret = bkey_s_c_null;
-
-	if (ret.k) {
-		iter->pos = ret.k->p;
-		if (bkey_deleted(ret.k)) {
-			bch2_btree_and_journal_iter_advance(iter);
-			goto again;
-		}
-	} else {
-		iter->pos = SPOS_MAX;
-		iter->at_end = true;
-	}
-
-	return ret;
-}
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
-{
-	bch2_journal_iter_exit(&iter->journal);
-}
-
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
-				struct btree_and_journal_iter *iter,
-				struct btree *b,
-				struct btree_node_iter node_iter,
-				struct bpos pos)
-{
-	memset(iter, 0, sizeof(*iter));
-
-	iter->trans = trans;
-	iter->b = b;
-	iter->node_iter = node_iter;
-	iter->pos = b->data->min_key;
-	iter->at_end = false;
-	INIT_LIST_HEAD(&iter->journal.list);
-
-	if (trans->journal_replay_not_finished) {
-		bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
-		if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
-			list_add(&iter->journal.list, &trans->c->journal_iters);
-	}
-}
-
-/*
- * this version is used by btree_gc before filesystem has gone RW and
- * multithreaded, so uses the journal_iters list:
- */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
-				struct btree_and_journal_iter *iter,
-				struct btree *b)
-{
-	struct btree_node_iter node_iter;
-
-	bch2_btree_node_iter_init_from_start(&node_iter, b);
-	__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
-}
-
-/* sort and dedup all keys in the journal: */
-
-/*
- * When keys compare equal, oldest compares first:
- */
-static int journal_sort_key_cmp(const void *_l, const void *_r)
-{
-	const struct journal_key *l = _l;
-	const struct journal_key *r = _r;
-	int rewind = l->rewind && r->rewind ?
-1 : 1; - - return journal_key_cmp(l, r) ?: - ((cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset)) * rewind); -} - -void bch2_journal_keys_put(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - BUG_ON(atomic_read(&keys->ref) <= 0); - - if (!atomic_dec_and_test(&keys->ref)) - return; - - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) { - if (i->overwritten_range && - (i == &darray_last(*keys) || - i->overwritten_range != i[1].overwritten_range)) - kfree(i->overwritten_range); - - if (i->allocated) - kfree(i->k); - } - - kvfree(keys->data); - keys->data = NULL; - keys->nr = keys->gap = keys->size = 0; - - struct journal_replay **i; - struct genradix_iter iter; - - genradix_for_each(&c->journal_entries, iter, i) - kvfree(*i); - genradix_free(&c->journal_entries); -} - -static void __journal_keys_sort(struct journal_keys *keys) -{ - sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]), - journal_sort_key_cmp, NULL); - - cond_resched(); - - struct journal_key *dst = keys->data; - - darray_for_each(*keys, src) { - /* - * We don't accumulate accounting keys here because we have to - * compare each individual accounting key against the version in - * the btree during replay: - */ - if (src->k->k.type != KEY_TYPE_accounting && - src + 1 < &darray_top(*keys) && - !journal_key_cmp(src, src + 1)) - continue; - - *dst++ = *src; - } - - keys->nr = dst - keys->data; -} - -int bch2_journal_keys_sort(struct bch_fs *c) -{ - struct genradix_iter iter; - struct journal_replay *i, **_i; - struct journal_keys *keys = &c->journal_keys; - size_t nr_read = 0; - - u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - cond_resched(); - - vstruct_for_each(&i->j, entry) { - bool rewind = !entry->level && - !btree_id_is_alloc(entry->btree_id) && - le64_to_cpu(i->j.seq) >= rewind_seq; - - if (entry->type != (rewind - ? 
BCH_JSET_ENTRY_overwrite - : BCH_JSET_ENTRY_btree_keys)) - continue; - - if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start) - continue; - - jset_entry_for_each_key(entry, k) { - struct journal_key n = (struct journal_key) { - .btree_id = entry->btree_id, - .level = entry->level, - .rewind = rewind, - .k = k, - .journal_seq = le64_to_cpu(i->j.seq), - .journal_offset = k->_data - i->j._data, - }; - - if (darray_push(keys, n)) { - __journal_keys_sort(keys); - - if (keys->nr * 8 > keys->size * 7) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", - keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); - return bch_err_throw(c, ENOMEM_journal_keys_sort); - } - - BUG_ON(darray_push(keys, n)); - } - - nr_read++; - } - } - } - - __journal_keys_sort(keys); - keys->gap = keys->nr; - - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); - return 0; -} - -void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, - unsigned level_min, unsigned level_max, - struct bpos start, struct bpos end) -{ - struct journal_keys *keys = &c->journal_keys; - size_t dst = 0; - - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) - if (!(i->btree_id == btree && - i->level >= level_min && - i->level <= level_max && - bpos_ge(i->k->k.p, start) && - bpos_le(i->k->k.p, end))) - keys->data[dst++] = *i; - keys->nr = keys->gap = dst; -} - -void bch2_journal_keys_dump(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - struct printbuf buf = PRINTBUF; - - pr_info("%zu keys:", keys->nr); - - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) { - printbuf_reset(&buf); - prt_printf(&buf, "btree="); - bch2_btree_id_to_text(&buf, i->btree_id); - prt_printf(&buf, " l=%u ", i->level); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - pr_err("%s", buf.buf); - } - printbuf_exit(&buf); -} - -void bch2_fs_journal_keys_init(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - atomic_set(&keys->ref, 1); - keys->initial_ref_held = true; - mutex_init(&keys->overwrite_lock); -} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h deleted file mode 100644 index 2a3082919b8d3e..00000000000000 --- a/fs/bcachefs/btree_journal_iter.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H -#define _BCACHEFS_BTREE_JOURNAL_ITER_H - -#include "bkey.h" - -struct journal_iter { - struct list_head list; - enum btree_id btree_id; - unsigned level; - size_t idx; - struct journal_keys *keys; -}; - -/* - * Iterate over keys in the btree, with keys from the journal overlaid on top: - */ - -struct btree_and_journal_iter { - struct btree_trans *trans; - struct btree *b; - struct btree_node_iter node_iter; - struct bkey unpacked; - - struct journal_iter journal; - struct bpos pos; - bool at_end; - bool prefetch; - bool fail_if_too_many_whiteouts; -}; - -static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, - unsigned l_level, - const struct journal_key *r) -{ - return -cmp_int(l_level, r->level) ?: - cmp_int(l_btree_id, r->btree_id); -} - -static inline int __journal_key_cmp(enum btree_id l_btree_id, - unsigned l_level, - struct bpos l_pos, - const struct journal_key *r) -{ - return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: - bpos_cmp(l_pos, r->k->k.p); -} - -static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -{ - return 
__journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
-}
-
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
-					  unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
-					  unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
-					   unsigned, struct bpos);
-
-int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
-					 struct btree_and_journal_iter *);
-
-int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
-				 unsigned, struct bkey_i *);
-int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
-			    unsigned, struct bkey_i *);
-int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
-			    unsigned, struct bpos);
-bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
-			struct btree_and_journal_iter *, struct btree *,
-			struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
-			struct btree_and_journal_iter *, struct btree *);
-
-void bch2_journal_keys_put(struct bch_fs *);
-
-static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
-{
-	if (c->journal_keys.initial_ref_held)
-		bch2_journal_keys_put(c);
-	c->journal_keys.initial_ref_held = false;
-}
-
-int bch2_journal_keys_sort(struct bch_fs *);
-
-void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
-				  unsigned, unsigned,
-				  struct bpos, struct bpos);
-
-void bch2_journal_keys_dump(struct bch_fs *);
-
-void bch2_fs_journal_keys_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
deleted file mode 100644
index 86aacb254fb2dd..00000000000000
--- a/fs/bcachefs/btree_journal_iter_types.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
-#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
-
-struct journal_key_range_overwritten {
-	size_t start, end;
-};
-
-struct journal_key {
-	u64 journal_seq;
-	u32 journal_offset;
-	enum btree_id btree_id:8;
-	unsigned level:8;
-	bool allocated:1;
-	bool overwritten:1;
-	bool rewind:1;
-	struct journal_key_range_overwritten __rcu *
-		overwritten_range;
-	struct bkey_i *k;
-};
-
-struct journal_keys {
-	/* must match layout in darray_types.h */
-	size_t nr, size;
-	struct journal_key *data;
-	/*
-	 * Gap buffer: instead of all the empty space in the array being at the
-	 * end of the buffer - from @nr to @size - the empty space is at @gap.
-	 * This means that sequential insertions are O(n) instead of O(n^2).
-	 */
-	size_t gap;
-	atomic_t ref;
-	bool initial_ref_held;
-	struct mutex overwrite_lock;
-};
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
deleted file mode 100644
index d96188b92db236..00000000000000
--- a/fs/bcachefs/btree_key_cache.c
+++ /dev/null
@@ -1,880 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static inline bool btree_uses_pcpu_readers(enum btree_id id)
-{
-	return id == BTREE_ID_subvolumes;
-}
-
-static struct kmem_cache *bch2_key_cache;
-
-static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
-				       const void *obj)
-{
-	const struct bkey_cached *ck = obj;
-	const struct bkey_cached_key *key = arg->key;
-
-	return ck->key.btree_id != key->btree_id ||
-		!bpos_eq(ck->key.pos, key->pos);
-}
-
-static const struct rhashtable_params bch2_btree_key_cache_params = {
-	.head_offset		= offsetof(struct bkey_cached, hash),
-	.key_offset		= offsetof(struct bkey_cached, key),
-	.key_len		= sizeof(struct bkey_cached_key),
-	.obj_cmpfn		= bch2_btree_key_cache_cmp_fn,
-	.automatic_shrinking	= true,
-};
-
-static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path,
-					 struct bkey_cached *ck,
-					 enum btree_node_locked_type lock_held)
-{
-	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
-	path->l[0].b		= (void *) ck;
-	mark_btree_node_locked(trans, path, 0, lock_held);
-}
-
-__flatten
-inline struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
-{
-	struct bkey_cached_key key = {
-		.btree_id	= btree_id,
-		.pos		= pos,
-	};
-
-	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
-				      bch2_btree_key_cache_params);
-}
-
-static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
-{
-	if (!six_trylock_intent(&ck->c.lock))
-		return false;
-
-	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-		six_unlock_intent(&ck->c.lock);
-		return false;
-	}
-
-	if (!six_trylock_write(&ck->c.lock)) {
-		six_unlock_intent(&ck->c.lock);
-		return false;
-	}
-
-	return true;
-}
-
-static bool bkey_cached_evict(struct btree_key_cache *c,
-			      struct bkey_cached *ck)
-{
-	bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
-				      bch2_btree_key_cache_params);
-	if (ret) {
-		memset(&ck->key, ~0, sizeof(ck->key));
-		atomic_long_dec(&c->nr_keys);
-	}
-
-	return ret;
-}
-
-static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
-{
-	struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
-	struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
-
-	this_cpu_dec(*c->btree_key_cache.nr_pending);
-	kmem_cache_free(bch2_key_cache, ck);
-}
-
-static inline void bkey_cached_free_noassert(struct btree_key_cache *bc,
-					     struct bkey_cached *ck)
-{
-	kfree(ck->k);
-	ck->k		= NULL;
-	ck->u64s	= 0;
-
-	six_unlock_write(&ck->c.lock);
-	six_unlock_intent(&ck->c.lock);
-
-	bool pcpu_readers = ck->c.lock.readers != NULL;
-	rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
-	this_cpu_inc(*bc->nr_pending);
-}
-
-static void bkey_cached_free(struct btree_trans *trans,
-			     struct btree_key_cache *bc,
-			     struct bkey_cached *ck)
-{
-	/*
-	 * we'll hit strange issues in the SRCU code if we aren't holding an
SRCU read lock... - */ - EBUG_ON(!trans->srcu_held); - - bkey_cached_free_noassert(bc, ck); -} - -static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) -{ - gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; - - struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); - if (unlikely(!ck)) - return NULL; - ck->k = kmalloc(key_u64s * sizeof(u64), gfp); - if (unlikely(!ck->k)) { - kmem_cache_free(bch2_key_cache, ck); - return NULL; - } - ck->u64s = key_u64s; - return ck; -} - -static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) -{ - struct bch_fs *c = trans->c; - struct btree_key_cache *bc = &c->btree_key_cache; - bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); - int ret; - - struct bkey_cached *ck = container_of_or_null( - rcu_pending_dequeue(&bc->pending[pcpu_readers]), - struct bkey_cached, rcu); - if (ck) - goto lock; - - ck = allocate_dropping_locks(trans, ret, - __bkey_cached_alloc(key_u64s, _gfp)); - if (ret) { - if (ck) - kfree(ck->k); - kmem_cache_free(bch2_key_cache, ck); - return ERR_PTR(ret); - } - - if (ck) { - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); - ck->c.cached = true; - goto lock; - } - - ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]), - struct bkey_cached, rcu); - if (ck) - goto lock; -lock: - six_lock_intent(&ck->c.lock, NULL, NULL); - six_lock_write(&ck->c.lock, NULL, NULL); - return ck; -} - -static struct bkey_cached * -bkey_cached_reuse(struct btree_key_cache *c) -{ - - guard(rcu)(); - struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table); - struct rhash_head *pos; - struct bkey_cached *ck; - - for (unsigned i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bkey_cached_lock_for_evict(ck)) { - if (bkey_cached_evict(c, ck)) - return ck; - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - } - } - return NULL; -} - -static int btree_key_cache_create(struct btree_trans *trans, - struct btree_path *path, - struct btree_path *ck_path, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct btree_key_cache *bc = &c->btree_key_cache; - - /* - * bch2_varint_decode can read past the end of the buffer by at - * most 7 bytes (it won't be used): - */ - unsigned key_u64s = k.k->u64s + 1; - - /* - * Allocate some extra space so that the transaction commit path is less - * likely to have to reallocate, since that requires a transaction - * restart: - */ - key_u64s = min(256U, (key_u64s * 3) / 2); - key_u64s = roundup_pow_of_two(key_u64s); - - struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); - int ret = PTR_ERR_OR_ZERO(ck); - if (ret) - return ret; - - if (unlikely(!ck)) { - ck = bkey_cached_reuse(bc); - if (unlikely(!ck)) { - bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_id_str(ck_path->btree_id)); - return bch_err_throw(c, ENOMEM_btree_key_cache_create); - } - } - - ck->c.level = 0; - ck->c.btree_id = ck_path->btree_id; - ck->key.btree_id = ck_path->btree_id; - ck->key.pos = ck_path->pos; - ck->flags = 1U << BKEY_CACHED_ACCESSED; - - if (unlikely(key_u64s > ck->u64s)) { - mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); - - struct bkey_i *new_k = allocate_dropping_locks(trans, ret, - kmalloc(key_u64s * sizeof(u64), _gfp)); - if (unlikely(!new_k)) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s 
u64s %u", - bch2_btree_id_str(ck->key.btree_id), key_u64s); - ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill); - } else if (ret) { - kfree(new_k); - goto err; - } - - kfree(ck->k); - ck->k = new_k; - ck->u64s = key_u64s; - } - - bkey_reassemble(ck->k, k); - - ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); - if (unlikely(ret)) - goto err; - - ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); - - bch2_btree_node_unlock_write(trans, path, path_l(path)->b); - - if (unlikely(ret)) /* raced with another fill? */ - goto err; - - atomic_long_inc(&bc->nr_keys); - six_unlock_write(&ck->c.lock); - - enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); - if (lock_want == SIX_LOCK_read) - six_lock_downgrade(&ck->c.lock); - btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); - ck_path->uptodate = BTREE_ITER_UPTODATE; - return 0; -err: - bkey_cached_free(trans, bc, ck); - mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); - - return ret; -} - -static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans, - struct btree_path *ck_path, - struct bkey_s_c k) -{ - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, ck_path->pos); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, trans->c, k); - trace_key_cache_fill(trans, buf.buf); - printbuf_exit(&buf); -} - -static noinline int btree_key_cache_fill(struct btree_trans *trans, - btree_path_idx_t ck_path_idx, - unsigned flags) -{ - struct btree_path *ck_path = trans->paths + ck_path_idx; - - if (flags & BTREE_ITER_cached_nofill) { - ck_path->l[0].b = NULL; - return 0; - } - - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, - BTREE_ITER_intent| - BTREE_ITER_key_cache_fill| - BTREE_ITER_cached_nofill); - iter.flags &= ~BTREE_ITER_with_journal; - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - /* Recheck after btree lookup, before allocating: */ - ck_path = trans->paths + ck_path_idx; - ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? 
-EEXIST : 0; - if (unlikely(ret)) - goto out; - - ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); - if (ret) - goto err; - - if (trace_key_cache_fill_enabled()) - do_trace_key_cache_fill(trans, ck_path, k); -out: - /* We're not likely to need this iterator again: */ - bch2_set_btree_iter_dontneed(trans, &iter); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, - btree_path_idx_t path_idx) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck; - struct btree_path *path = trans->paths + path_idx; -retry: - ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) - return -ENOENT; - - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); - if (ret) - return ret; - - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; - } - - if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); - path->uptodate = BTREE_ITER_UPTODATE; - return 0; -} - -int bch2_btree_path_traverse_cached(struct btree_trans *trans, - btree_path_idx_t path_idx, - unsigned flags) -{ - EBUG_ON(trans->paths[path_idx].level); - - int ret; - do { - ret = btree_path_traverse_cached_fast(trans, path_idx); - if (unlikely(ret == -ENOENT)) - ret = btree_key_cache_fill(trans, path_idx, flags); - } while (ret == -EEXIST); - - struct btree_path *path = trans->paths + path_idx; - - if (unlikely(ret)) { - path->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = ERR_PTR(ret); - } - } else { - BUG_ON(path->uptodate); - BUG_ON(!path->nodes_locked); - } - - return ret; -} - -static int btree_key_cache_flush_pos(struct btree_trans *trans, - struct bkey_cached_key key, - u64 journal_seq, - unsigned commit_flags, - bool evict) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct btree_iter c_iter, b_iter; - struct bkey_cached *ck = NULL; - int ret; - - bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, - BTREE_ITER_slots| - BTREE_ITER_intent| - BTREE_ITER_all_snapshots); - bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, - BTREE_ITER_cached| - BTREE_ITER_intent); - b_iter.flags &= ~BTREE_ITER_with_key_cache; - - ret = bch2_btree_iter_traverse(trans, &c_iter); - if (ret) - goto out; - - ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; - if (!ck) - goto out; - - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - if (evict) - goto evict; - goto out; - } - - if (journal_seq && ck->journal.seq != journal_seq) - goto out; - - trans->journal_res.seq = ck->journal.seq; - - /* - * If we're at the end of the journal, we really want to free up space - * in the journal right away - we don't want to pin that old journal - * sequence number with a new btree node write, we want to re-journal - * the update - */ - if (ck->journal.seq == journal_last_seq(j)) - commit_flags |= BCH_WATERMARK_reclaim; - - if (ck->journal.seq != journal_last_seq(j) || - !test_bit(JOURNAL_space_low, &c->journal.flags)) - commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - - struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter); - ret = bkey_err(btree_k); - if (ret) - goto err; - - /* * Check 
that we're not violating cache coherency rules:
-	 */
-	BUG_ON(bkey_deleted(btree_k.k));
-
-	ret = bch2_trans_update(trans, &b_iter, ck->k,
-				BTREE_UPDATE_key_cache_reclaim|
-				BTREE_UPDATE_internal_snapshot_node|
-				BTREE_TRIGGER_norun) ?:
-		bch2_trans_commit(trans, NULL, NULL,
-				  BCH_TRANS_COMMIT_no_check_rw|
-				  BCH_TRANS_COMMIT_no_enospc|
-				  commit_flags);
-err:
-	bch2_fs_fatal_err_on(ret &&
-			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
-			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
-			     !bch2_journal_error(j), c,
-			     "flushing key cache: %s", bch2_err_str(ret));
-	if (ret)
-		goto out;
-
-	bch2_journal_pin_drop(j, &ck->journal);
-
-	struct btree_path *path = btree_iter_path(trans, &c_iter);
-	BUG_ON(!btree_node_locked(path, 0));
-
-	if (!evict) {
-		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
-			atomic_long_dec(&c->btree_key_cache.nr_dirty);
-		}
-	} else {
-		struct btree_path *path2;
-		unsigned i;
-evict:
-		trans_for_each_path(trans, path2, i)
-			if (path2 != path)
-				__bch2_btree_path_unlock(trans, path2);
-
-		bch2_btree_node_lock_write_nofail(trans, path, &ck->c);
-
-		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
-			atomic_long_dec(&c->btree_key_cache.nr_dirty);
-		}
-
-		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
-		if (bkey_cached_evict(&c->btree_key_cache, ck)) {
-			bkey_cached_free(trans, &c->btree_key_cache, ck);
-		} else {
-			six_unlock_write(&ck->c.lock);
-			six_unlock_intent(&ck->c.lock);
-		}
-	}
-out:
-	bch2_trans_iter_exit(trans, &b_iter);
-	bch2_trans_iter_exit(trans, &c_iter);
-	return ret;
-}
-
-int bch2_btree_key_cache_journal_flush(struct journal *j,
-				struct journal_entry_pin *pin, u64 seq)
-{
-	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct bkey_cached *ck =
-		container_of(pin, struct bkey_cached, journal);
-	struct bkey_cached_key key;
-	struct btree_trans *trans = bch2_trans_get(c);
-	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-	int ret = 0;
-
-	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
-	key = ck->key;
-
-	if (ck->journal.seq != seq ||
-	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-		six_unlock_read(&ck->c.lock);
-		goto unlock;
-	}
-
-	if (ck->seq != seq) {
-		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
-					bch2_btree_key_cache_journal_flush);
-		six_unlock_read(&ck->c.lock);
-		goto unlock;
-	}
-	six_unlock_read(&ck->c.lock);
-
-	ret = lockrestart_do(trans,
-		btree_key_cache_flush_pos(trans, key, seq,
-				BCH_TRANS_COMMIT_journal_reclaim, false));
-unlock:
-	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
-	bch2_trans_put(trans);
-	return ret;
-}
-
-bool bch2_btree_insert_key_cached(struct btree_trans *trans,
-				  unsigned flags,
-				  struct btree_insert_entry *insert_entry)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
-	struct bkey_i *insert = insert_entry->k;
-	bool kick_reclaim = false;
-
-	BUG_ON(insert->k.u64s > ck->u64s);
-
-	bkey_copy(ck->k, insert);
-
-	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
-		atomic_long_inc(&c->btree_key_cache.nr_dirty);
-
-		if (bch2_nr_btree_keys_need_flush(c))
-			kick_reclaim = true;
-	}
-
-	/*
-	 * To minimize lock contention, we only add the journal pin here and
-	 * defer pin updates to the flush callback via ->seq.
Be careful not to - * update ->seq on nojournal commits because we don't want to update the - * pin to a seq that doesn't include journal updates on disk. Otherwise - * we risk losing the update after a crash. - * - * The only exception is if the pin is not active in the first place. We - * have to add the pin because journal reclaim drives key cache - * flushing. The flush callback will not proceed unless ->seq matches - * the latest pin, so make sure it starts with a consistent value. - */ - if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || - !journal_pin_active(&ck->journal)) { - ck->seq = trans->journal_res.seq; - } - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - &ck->journal, bch2_btree_key_cache_journal_flush); - - if (kick_reclaim) - journal_reclaim_kick(&c->journal); - return true; -} - -void bch2_btree_key_cache_drop(struct btree_trans *trans, - struct btree_path *path) -{ - struct bch_fs *c = trans->c; - struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck = (void *) path->l[0].b; - - /* - * We just did an update to the btree, bypassing the key cache: the key - * cache key is now stale and must be dropped, even if dirty: - */ - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - atomic_long_dec(&c->btree_key_cache.nr_dirty); - bch2_journal_pin_drop(&c->journal, &ck->journal); - } - - bkey_cached_evict(bc, ck); - bkey_cached_free(trans, bc, ck); - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - - struct btree_path *path2; - unsigned i; - trans_for_each_path(trans, path2, i) - if (path2->l[0].b == (void *) ck) { - /* - * It's safe to clear should_be_locked here because - * we're evicting from the key cache, and we still have - * the underlying btree locked: filling into the key - * cache would require taking a write lock on the btree - * node - */ - path2->should_be_locked = false; - __bch2_btree_path_unlock(trans, path2); - path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); - btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE); - } - - bch2_trans_verify_locks(trans); -} - -static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct bch_fs *c = shrink->private_data; - struct btree_key_cache *bc = &c->btree_key_cache; - struct bucket_table *tbl; - struct bkey_cached *ck; - size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; - unsigned iter, start; - int srcu_idx; - - srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - rcu_read_lock(); - - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - - /* - * Scanning is expensive while a rehash is in progress - most elements - * will be on the new hashtable, if it's in progress - * - * A rehash could still start while we're scanning - that's ok, we'll - * still see most elements. 
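
[Editor's aside: the shrinker scan loop below implements a second-chance policy: dirty entries are skipped outright (they must be flushed first), recently used entries lose their ACCESSED bit and survive one pass, and only clean, cold, lockable entries are evicted. A minimal standalone sketch of that policy, separate from the bcachefs implementation — the entry type is hypothetical and locking is omitted:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for struct bkey_cached: just the two flag
 * bits the eviction policy cares about. */
struct entry {
	bool dirty;
	bool accessed;
	bool present;
};

/* One shrinker pass over a fixed table; returns how many entries
 * were evicted. Accessed entries get a second chance. */
static size_t scan(struct entry *tbl, size_t n)
{
	size_t freed = 0;

	for (size_t i = 0; i < n; i++) {
		struct entry *e = &tbl[i];

		if (!e->present || e->dirty)
			continue;
		if (e->accessed) {
			e->accessed = false;	/* second chance */
			continue;
		}
		e->present = false;		/* evict */
		freed++;
	}
	return freed;
}

int main(void)
{
	struct entry tbl[3] = {
		{ .dirty = true,    .present = true },
		{ .accessed = true, .present = true },
		{ .present = true },
	};

	/* Pass 1 evicts only the cold clean entry; the accessed one
	 * survives with its bit cleared and goes on pass 2. */
	printf("pass 1 freed %zu\n", scan(tbl, 3));	/* 1 */
	printf("pass 2 freed %zu\n", scan(tbl, 3));	/* 1 */
	return 0;
}

End of aside.]
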
- */ - if (unlikely(tbl->nest)) { - rcu_read_unlock(); - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - return SHRINK_STOP; - } - - iter = bc->shrink_iter; - if (iter >= tbl->size) - iter = 0; - start = iter; - - do { - struct rhash_head *pos, *next; - - pos = rht_ptr_rcu(&tbl->buckets[iter]); - - while (!rht_is_a_nulls(pos)) { - next = rht_dereference_bucket_rcu(pos->next, tbl, iter); - ck = container_of(pos, struct bkey_cached, hash); - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - bc->skipped_dirty++; - } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { - clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); - bc->skipped_accessed++; - } else if (!bkey_cached_lock_for_evict(ck)) { - bc->skipped_lock_fail++; - } else if (bkey_cached_evict(bc, ck)) { - bkey_cached_free_noassert(bc, ck); - bc->freed++; - freed++; - } else { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - } - - scanned++; - if (scanned >= nr) - goto out; - - pos = next; - } - - iter++; - if (iter >= tbl->size) - iter = 0; - } while (scanned < nr && iter != start); -out: - bc->shrink_iter = iter; - - rcu_read_unlock(); - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - - return freed; -} - -static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct bch_fs *c = shrink->private_data; - struct btree_key_cache *bc = &c->btree_key_cache; - long nr = atomic_long_read(&bc->nr_keys) - - atomic_long_read(&bc->nr_dirty); - - /* - * Avoid hammering our shrinker too much if it's nearly empty - the - * shrinker code doesn't take into account how big our cache is, if it's - * mostly empty but the system is under memory pressure it causes nasty - * lock contention: - */ - nr -= 128; - - return max(0L, nr); -} - -void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - struct bucket_table *tbl; - struct bkey_cached *ck; - struct rhash_head *pos; - LIST_HEAD(items); - unsigned i; - - shrinker_free(bc->shrink); - - /* - * The loop is needed to guard against racing with rehash: - */ - while (atomic_long_read(&bc->nr_keys)) { - rcu_read_lock(); - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) { - if (tbl->nest) { - /* wait for in progress rehash */ - rcu_read_unlock(); - mutex_lock(&bc->table.mutex); - mutex_unlock(&bc->table.mutex); - continue; - } - for (i = 0; i < tbl->size; i++) - while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { - ck = container_of(pos, struct bkey_cached, hash); - BUG_ON(!bkey_cached_evict(bc, ck)); - kfree(ck->k); - kmem_cache_free(bch2_key_cache, ck); - } - } - rcu_read_unlock(); - } - - if (atomic_long_read(&bc->nr_dirty) && - !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_was_rw, &c->flags)) - panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", - atomic_long_read(&bc->nr_dirty)); - - if (atomic_long_read(&bc->nr_keys)) - panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", - atomic_long_read(&bc->nr_keys)); - - if (bc->table_init_done) - rhashtable_destroy(&bc->table); - - rcu_pending_exit(&bc->pending[0]); - rcu_pending_exit(&bc->pending[1]); - - free_percpu(bc->nr_pending); -} - -void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) -{ -} - -int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - struct shrinker *shrink; - - bc->nr_pending = alloc_percpu(size_t); - if (!bc->nr_pending) 
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - - if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || - rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - - if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - - bc->table_init_done = true; - - shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); - if (!shrink) - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - bc->shrink = shrink; - shrink->count_objects = bch2_btree_key_cache_count; - shrink->scan_objects = bch2_btree_key_cache_scan; - shrink->batch = 1 << 14; - shrink->seeks = 0; - shrink->private_data = c; - shrinker_register(shrink); - return 0; -} - -void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) -{ - printbuf_tabstop_push(out, 24); - printbuf_tabstop_push(out, 12); - - prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); - prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size); - prt_newline(out); - prt_printf(out, "shrinker:\n"); - prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); - prt_printf(out, "freed:\t%lu\r\n", bc->freed); - prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); - prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); - prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); - prt_newline(out); - prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending)); -} - -void bch2_btree_key_cache_exit(void) -{ - kmem_cache_destroy(bch2_key_cache); -} - -int __init bch2_btree_key_cache_init(void) -{ - bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); - if (!bch2_key_cache) - return -ENOMEM; - - return 0; -} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h deleted file mode 100644 index 82d8c72512a93a..00000000000000 --- a/fs/bcachefs/btree_key_cache.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_KEY_CACHE_H -#define _BCACHEFS_BTREE_KEY_CACHE_H - -static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = 1024 + nr_keys / 2; - - return max_t(ssize_t, 0, nr_dirty - max_dirty); -} - -static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = 4096 + (nr_keys * 3) / 4; - - return nr_dirty - max_dirty; -} - -static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) -{ - return __bch2_btree_key_cache_must_wait(c) > 0; -} - -static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = 2048 + (nr_keys * 5) / 8; - - return nr_dirty <= max_dirty; -} - -int bch2_btree_key_cache_journal_flush(struct journal *, - struct journal_entry_pin *, u64); - -struct bkey_cached * -bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); - -int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned); - -bool 
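
[Editor's aside: the three inline helpers above form a hysteresis band on the dirty-key count: reclaim is kicked once nr_dirty exceeds 1024 + nr_keys/2, writers must throttle past 4096 + 3*nr_keys/4, and a throttled writer resumes once nr_dirty falls back under 2048 + 5*nr_keys/8, which always sits strictly between the other two. A standalone sketch evaluating the same integer formulas for a sample key count:

#include <stdio.h>

/* Same integer arithmetic as the deleted helpers, parameterized on
 * the current number of cached keys. */
static long need_flush_threshold(long nr_keys) { return 1024 + nr_keys / 2; }
static long must_wait_threshold(long nr_keys)  { return 4096 + (nr_keys * 3) / 4; }
static long wait_done_threshold(long nr_keys)  { return 2048 + (nr_keys * 5) / 8; }

int main(void)
{
	long nr_keys = 100000;

	/* wait_done lies strictly between the other two thresholds,
	 * giving writers hysteresis rather than a single cliff. */
	printf("kick reclaim above %ld dirty keys\n", need_flush_threshold(nr_keys)); /* 51024 */
	printf("throttle above     %ld dirty keys\n", must_wait_threshold(nr_keys));  /* 79096 */
	printf("resume below       %ld dirty keys\n", wait_done_threshold(nr_keys));  /* 64548 */
	return 0;
}

End of aside.]
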
bch2_btree_insert_key_cached(struct btree_trans *, unsigned, - struct btree_insert_entry *); -void bch2_btree_key_cache_drop(struct btree_trans *, - struct btree_path *); - -void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); -void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); -int bch2_fs_btree_key_cache_init(struct btree_key_cache *); - -void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); - -void bch2_btree_key_cache_exit(void); -int __init bch2_btree_key_cache_init(void); - -#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h deleted file mode 100644 index 722f1ed1055152..00000000000000 --- a/fs/bcachefs/btree_key_cache_types.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H -#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H - -#include "rcu_pending.h" - -struct btree_key_cache { - struct rhashtable table; - bool table_init_done; - - struct shrinker *shrink; - unsigned shrink_iter; - - /* 0: non pcpu reader locks, 1: pcpu reader locks */ - struct rcu_pending pending[2]; - size_t __percpu *nr_pending; - - atomic_long_t nr_keys; - atomic_long_t nr_dirty; - - /* shrinker stats */ - unsigned long requested_to_free; - unsigned long freed; - unsigned long skipped_dirty; - unsigned long skipped_accessed; - unsigned long skipped_lock_fail; -}; - -struct bkey_cached_key { - u32 btree_id; - struct bpos pos; -} __packed __aligned(4); - -#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c deleted file mode 100644 index bed2b4b6ffb9e0..00000000000000 --- a/fs/bcachefs/btree_locking.c +++ /dev/null @@ -1,936 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_locking.h" -#include "btree_types.h" - -static struct lock_class_key bch2_btree_node_lock_key; - -void bch2_btree_lock_init(struct btree_bkey_cached_common *b, - enum six_lock_init_flags flags, - gfp_t gfp) -{ - __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp); - lockdep_set_notrack_class(&b->lock); -} - -/* Btree node locking: */ - -struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, - struct btree_path *skip, - struct btree_bkey_cached_common *b, - unsigned level) -{ - struct btree_path *path; - struct six_lock_count ret; - unsigned i; - - memset(&ret, 0, sizeof(ret)); - - if (IS_ERR_OR_NULL(b)) - return ret; - - trans_for_each_path(trans, path, i) - if (path != skip && &path->l[level].b->c == b) { - int t = btree_node_locked_type(path, level); - - if (t != BTREE_NODE_UNLOCKED) - ret.n[t]++; - } - - return ret; -} - -/* unlock */ - -void bch2_btree_node_unlock_write(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - bch2_btree_node_unlock_write_inlined(trans, path, b); -} - -/* lock */ - -/* - * @trans wants to lock @b with type @type - */ -struct trans_waiting_for_lock { - struct btree_trans *trans; - struct btree_bkey_cached_common *node_want; - enum six_lock_type lock_want; - - /* for iterating over held locks :*/ - u8 path_idx; - u8 level; - u64 lock_start_time; -}; - -struct lock_graph { - struct trans_waiting_for_lock g[8]; - unsigned nr; -}; - -static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) -{ - struct trans_waiting_for_lock *i; - - prt_printf(out, "Found lock cycle (%u entries):\n", g->nr); - - for (i = g->g; i < g->g + 
g->nr; i++) { - struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); - if (!task) - continue; - - bch2_btree_trans_to_text(out, i->trans); - bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT); - } -} - -static noinline void print_chain(struct printbuf *out, struct lock_graph *g) -{ - struct trans_waiting_for_lock *i; - - for (i = g->g; i != g->g + g->nr; i++) { - struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); - if (i != g->g) - prt_str(out, "<- "); - prt_printf(out, "%u ", task ? task->pid : 0); - } - prt_newline(out); -} - -static void lock_graph_up(struct lock_graph *g) -{ - closure_put(&g->g[--g->nr].trans->ref); -} - -static noinline void lock_graph_pop_all(struct lock_graph *g) -{ - while (g->nr) - lock_graph_up(g); -} - -static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i) -{ - while (g->g + g->nr > i) - lock_graph_up(g); -} - -static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) -{ - g->g[g->nr++] = (struct trans_waiting_for_lock) { - .trans = trans, - .node_want = trans->locking, - .lock_want = trans->locking_wait.lock_want, - }; -} - -static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) -{ - closure_get(&trans->ref); - __lock_graph_down(g, trans); -} - -static bool lock_graph_remove_non_waiters(struct lock_graph *g, - struct trans_waiting_for_lock *from) -{ - struct trans_waiting_for_lock *i; - - if (from->trans->locking != from->node_want) { - lock_graph_pop_from(g, from); - return true; - } - - for (i = from + 1; i < g->g + g->nr; i++) - if (i->trans->locking != i->node_want || - i->trans->locking_wait.start_time != i[-1].lock_start_time) { - lock_graph_pop_from(g, i); - return true; - } - - return false; -} - -static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - - count_event(c, trans_restart_would_deadlock); - - if (trace_trans_restart_would_deadlock_enabled()) { - struct printbuf buf = PRINTBUF; - - buf.atomic++; - print_cycle(&buf, g); - - trace_trans_restart_would_deadlock(trans, buf.buf); - printbuf_exit(&buf); - } -} - -static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) -{ - if (i == g->g) { - trace_would_deadlock(g, i->trans); - return btree_trans_restart_foreign_task(i->trans, - BCH_ERR_transaction_restart_would_deadlock, - _THIS_IP_); - } else { - i->trans->lock_must_abort = true; - wake_up_process(i->trans->locking_wait.task); - return 0; - } -} - -static int btree_trans_abort_preference(struct btree_trans *trans) -{ - if (trans->lock_may_not_fail) - return 0; - if (trans->locking_wait.lock_want == SIX_LOCK_write) - return 1; - if (!trans->in_traverse_all) - return 2; - return 3; -} - -static noinline __noreturn void break_cycle_fail(struct lock_graph *g) -{ - struct printbuf buf = PRINTBUF; - buf.atomic++; - - prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); - - for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) { - struct btree_trans *trans = i->trans; - - bch2_btree_trans_to_text(&buf, trans); - - prt_printf(&buf, "backtrace:\n"); - printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - - bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - BUG(); -} - -static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, - struct trans_waiting_for_lock *from) 
-{ - struct trans_waiting_for_lock *i, *abort = NULL; - unsigned best = 0, pref; - int ret; - - if (lock_graph_remove_non_waiters(g, from)) - return 0; - - /* Only checking, for debugfs: */ - if (cycle) { - print_cycle(cycle, g); - ret = -1; - goto out; - } - - for (i = from; i < g->g + g->nr; i++) { - pref = btree_trans_abort_preference(i->trans); - if (pref > best) { - abort = i; - best = pref; - } - } - - if (unlikely(!best)) - break_cycle_fail(g); - - ret = abort_lock(g, abort); -out: - if (ret) - lock_graph_pop_all(g); - else - lock_graph_pop_from(g, abort); - return ret; -} - -static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, - struct printbuf *cycle) -{ - struct btree_trans *orig_trans = g->g->trans; - - for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) - if (i->trans == trans) { - closure_put(&trans->ref); - return break_cycle(g, cycle, i); - } - - if (unlikely(g->nr == ARRAY_SIZE(g->g))) { - closure_put(&trans->ref); - - if (orig_trans->lock_may_not_fail) - return 0; - - lock_graph_pop_all(g); - - if (cycle) - return 0; - - trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); - return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); - } - - __lock_graph_down(g, trans); - return 0; -} - -static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) -{ - return t1 + t2 > 1; -} - -int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) -{ - struct lock_graph g; - struct trans_waiting_for_lock *top; - struct btree_bkey_cached_common *b; - btree_path_idx_t path_idx; - int ret = 0; - - g.nr = 0; - - if (trans->lock_must_abort && !trans->lock_may_not_fail) { - if (cycle) - return -1; - - trace_would_deadlock(&g, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); - } - - lock_graph_down(&g, trans); - - /* trans->paths is rcu protected vs. freeing */ - guard(rcu)(); - if (cycle) - cycle->atomic++; -next: - if (!g.nr) - goto out; - - top = &g.g[g.nr - 1]; - - struct btree_path *paths = rcu_dereference(top->trans->paths); - if (!paths) - goto up; - - unsigned long *paths_allocated = trans_paths_allocated(paths); - - trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), - path_idx, top->path_idx) { - struct btree_path *path = paths + path_idx; - if (!path->nodes_locked) - continue; - - if (path_idx != top->path_idx) { - top->path_idx = path_idx; - top->level = 0; - top->lock_start_time = 0; - } - - for (; - top->level < BTREE_MAX_DEPTH; - top->level++, top->lock_start_time = 0) { - int lock_held = btree_node_locked_type(path, top->level); - - if (lock_held == BTREE_NODE_UNLOCKED) - continue; - - b = &READ_ONCE(path->l[top->level].b)->c; - - if (IS_ERR_OR_NULL(b)) { - /* - * If we get here, it means we raced with the - * other thread updating its btree_path - * structures - which means it can't be blocked - * waiting on a lock: - */ - if (!lock_graph_remove_non_waiters(&g, g.g)) { - /* - * If lock_graph_remove_non_waiters() - * didn't do anything, it must be - * because we're being called by debugfs - * checking for lock cycles, which - * invokes us on btree_transactions that - * aren't actually waiting on anything. 
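
[Editor's aside: lock_type_conflicts() above leans on the six-lock enum ordering (read = 0, intent = 1, write = 2): two lock wants conflict exactly when their sum exceeds 1, so read/read and read/intent coexist while intent/intent and anything involving write do not. A standalone check of that identity:

#include <assert.h>
#include <stdio.h>

/* Mirrors the six-lock value ordering the deleted code relies on. */
enum lock_type { LOCK_READ = 0, LOCK_INTENT = 1, LOCK_WRITE = 2 };

static int conflicts(enum lock_type t1, enum lock_type t2)
{
	return t1 + t2 > 1;
}

int main(void)
{
	assert(!conflicts(LOCK_READ,   LOCK_READ));	/* shared readers */
	assert(!conflicts(LOCK_READ,   LOCK_INTENT));	/* intent still admits readers */
	assert( conflicts(LOCK_INTENT, LOCK_INTENT));	/* only one intent holder */
	assert( conflicts(LOCK_READ,   LOCK_WRITE));	/* write excludes everything */
	assert( conflicts(LOCK_WRITE,  LOCK_WRITE));
	printf("conflict table matches t1 + t2 > 1\n");
	return 0;
}

End of aside.]
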
- * Just bail out: - */ - lock_graph_pop_all(&g); - } - - goto next; - } - - if (list_empty_careful(&b->lock.wait_list)) - continue; - - raw_spin_lock(&b->lock.wait_lock); - list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) { - BUG_ON(b != trans->locking); - - if (top->lock_start_time && - time_after_eq64(top->lock_start_time, trans->locking_wait.start_time)) - continue; - - top->lock_start_time = trans->locking_wait.start_time; - - /* Don't check for self deadlock: */ - if (trans == top->trans || - !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) - continue; - - closure_get(&trans->ref); - raw_spin_unlock(&b->lock.wait_lock); - - ret = lock_graph_descend(&g, trans, cycle); - if (ret) - goto out; - goto next; - - } - raw_spin_unlock(&b->lock.wait_lock); - } - } -up: - if (g.nr > 1 && cycle) - print_chain(cycle, &g); - lock_graph_up(&g); - goto next; -out: - if (cycle) - --cycle->atomic; - return ret; -} - -int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) -{ - struct btree_trans *trans = p; - - return bch2_check_for_deadlock(trans, NULL); -} - -int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, - struct btree_bkey_cached_common *b, - bool lock_may_not_fail) -{ - int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; - int ret; - - /* - * Must drop our read locks before calling six_lock_write() - - * six_unlock() won't do wakeups until the reader count - * goes to 0, and it's safe because we have the node intent - * locked: - */ - six_lock_readers_add(&b->lock, -readers); - ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, - lock_may_not_fail, _RET_IP_); - six_lock_readers_add(&b->lock, readers); - - if (ret) - mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED); - - return ret; -} - -void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b) -{ - int ret = __btree_node_lock_write(trans, path, b, true); - BUG_ON(ret); -} - -/* relock */ - -static int btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade, - struct get_locks_fail *f, - int restart_err) -{ - unsigned l = path->level; - - do { - if (!btree_path_node(path, l)) - break; - - if (!(upgrade - ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) - goto err; - - l++; - } while (l < path->locks_want); - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; - - return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1; -err: - if (f) { - f->l = l; - f->b = path->l[l].b; - } - - /* - * Do transaction restart before unlocking, so we don't pop - * should_be_locked asserts - */ - if (restart_err) { - btree_trans_restart(trans, restart_err); - } else if (path->should_be_locked && !trans->restarted) { - if (upgrade) - path->locks_want = l; - return -1; - } - - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - - /* - * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_path_traverse has to walk back up to - * the node that we failed to relock: - */ - do { - path->l[l].b = upgrade - ? 
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) - : ERR_PTR(-BCH_ERR_no_btree_node_relock); - } while (l--); - - return -restart_err ?: -1; -} - -bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level, - bool trace) -{ - struct btree *b = btree_path_node(path, level); - int want = __btree_lock_want(path, level); - - if (race_fault()) - goto fail; - - if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || - (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, &b->c, level, want))) { - mark_btree_node_locked(trans, path, level, want); - return true; - } -fail: - if (trace && !trans->notrace_relock_fail) - trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); - return false; -} - -/* upgrade */ - -bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = path->l[level].b; - - if (!is_btree_node(path, level)) - return false; - - switch (btree_lock_want(path, level)) { - case BTREE_NODE_UNLOCKED: - BUG_ON(btree_node_locked(path, level)); - return true; - case BTREE_NODE_READ_LOCKED: - BUG_ON(btree_node_intent_locked(path, level)); - return bch2_btree_node_relock(trans, path, level); - case BTREE_NODE_INTENT_LOCKED: - break; - case BTREE_NODE_WRITE_LOCKED: - BUG(); - } - - if (btree_node_intent_locked(path, level)) - return true; - - if (race_fault()) - return false; - - if (btree_node_locked(path, level) - ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; - - if (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(trans, path, level); - goto success; - } - - trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); - return false; -success: - mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); - return true; -} - -/* Btree path locking: */ - -/* - * Only for btree_cache.c - only relocks intent locks - */ -int bch2_btree_path_relock_intent(struct btree_trans *trans, - struct btree_path *path) -{ - unsigned l; - - for (l = path->level; - l < path->locks_want && btree_path_node(path, l); - l++) { - if (!bch2_btree_node_relock(trans, path, l)) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); - } - } - - return 0; -} - -__flatten -bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) -{ - bool ret = !btree_path_get_locks(trans, path, false, NULL, 0); - bch2_trans_verify_locks(trans); - return ret; -} - -int __bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - if (!bch2_btree_path_relock_norestart(trans, path)) { - trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); - } - - return 0; -} - -bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - path->locks_want = new_locks_want; - - /* - * If we need it locked, we can't touch it. 
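
[Editor's aside: relocking above hinges on the sequence number saved when the node was last locked — six_relock_type() only succeeds if the lock's seq still matches, i.e. no writer touched the node in between. A seqlock-style sketch of that revalidation, with a plain counter standing in for the six lock:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for a six lock: seq is bumped on every write
 * unlock, invalidating previously saved sequence numbers. */
struct node_lock {
	unsigned seq;
};

static void write_unlock(struct node_lock *l)
{
	l->seq++;	/* holders of an old seq can no longer relock */
}

/* Relock succeeds only if nothing was written since saved_seq was
 * recorded; otherwise the caller must retraverse from the root. */
static bool relock(struct node_lock *l, unsigned saved_seq)
{
	return l->seq == saved_seq;
}

int main(void)
{
	struct node_lock l = { .seq = 0 };
	unsigned saved = l.seq;

	printf("relock before write: %d\n", relock(&l, saved));	/* 1 */
	write_unlock(&l);
	printf("relock after write:  %d\n", relock(&l, saved));	/* 0 */
	return 0;
}

End of aside.]
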
Otherwise, we can return - * success - bch2_path_get() will use this path, and it'll just be - * retraversed: - */ - bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) || - !path->should_be_locked; - - bch2_btree_path_verify_locks(trans, path); - return ret; -} - -int __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - unsigned old_locks = path->nodes_locked; - unsigned old_locks_want = path->locks_want; - - path->locks_want = max_t(unsigned, path->locks_want, new_locks_want); - - struct get_locks_fail f = {}; - int ret = btree_path_get_locks(trans, path, true, &f, - BCH_ERR_transaction_restart_upgrade); - if (!ret) - goto out; - - /* - * XXX: this is ugly - we'd prefer to not be mucking with other - * iterators in the btree_trans here. - * - * On failure to upgrade the iterator, setting iter->locks_want and - * calling get_locks() is sufficient to make bch2_btree_path_traverse() - * get the locks we want on transaction restart. - * - * But if this iterator was a clone, on transaction restart what we did - * to this iterator isn't going to be preserved. - * - * Possibly we could add an iterator field for the parent iterator when - * an iterator is a copy - for now, we'll just upgrade any other - * iterators with the same btree id. - * - * The code below used to be needed to ensure ancestor nodes get locked - * before interior nodes - now that's handled by - * bch2_btree_path_traverse_all(). - */ - if (!path->cached && !trans->in_traverse_all) { - struct btree_path *linked; - unsigned i; - - trans_for_each_path(trans, linked, i) - if (linked != path && - linked->cached == path->cached && - linked->btree_id == path->btree_id && - linked->locks_want < new_locks_want) { - linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true, NULL, 0); - } - } - - count_event(trans->c, trans_restart_upgrade); - if (trace_trans_restart_upgrade_enabled()) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_); - prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id)); - bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, "locks want %u -> %u level %u\n", - old_locks_want, new_locks_want, f.l); - prt_printf(&buf, "nodes_locked %x -> %x\n", - old_locks, path->nodes_locked); - prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) : - !f.b ? "(null)" : "(node)"); - prt_printf(&buf, "path seq %u node seq %u\n", - IS_ERR_OR_NULL(f.b) ? 
0 : f.b->c.lock.seq, - path->l[f.l].lock_seq); - - trace_trans_restart_upgrade(trans->c, buf.buf); - printbuf_exit(&buf); - } -out: - bch2_trans_verify_locks(trans); - return ret; -} - -void __bch2_btree_path_downgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - unsigned l, old_locks_want = path->locks_want; - - if (trans->restarted) - return; - - EBUG_ON(path->locks_want < new_locks_want); - - path->locks_want = new_locks_want; - - while (path->nodes_locked && - (l = btree_path_highest_level_locked(path)) >= path->locks_want) { - if (l > path->level) { - btree_node_unlock(trans, path, l); - } else { - if (btree_node_intent_locked(path, l)) { - six_lock_downgrade(&path->l[l].b->c.lock); - mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED); - } - break; - } - } - - bch2_btree_path_verify_locks(trans, path); - - trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); -} - -/* Btree transaction locking: */ - -void bch2_trans_downgrade(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - if (trans->restarted) - return; - - trans_for_each_path(trans, path, i) - if (path->ref) - bch2_btree_path_downgrade(trans, path); -} - -static inline void __bch2_trans_unlock(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); -} - -static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, - struct get_locks_fail *f, bool trace, ulong ip) -{ - if (!trace) - goto out; - - if (trace_trans_restart_relock_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " %s l=%u seq=%u node seq=", - bch2_btree_id_str(path->btree_id), - f->l, path->l[f->l].lock_seq); - if (IS_ERR_OR_NULL(f->b)) { - prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); - } else { - prt_printf(&buf, "%u", f->b->c.lock.seq); - - struct six_lock_count c = - bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l); - prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - - c = six_lock_counts(&f->b->c.lock); - prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - } - - trace_trans_restart_relock(trans, ip, buf.buf); - printbuf_exit(&buf); - } - - count_event(trans->c, trans_restart_relock); -out: - __bch2_trans_unlock(trans); - bch2_trans_verify_locks(trans); -} - -static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip) -{ - bch2_trans_verify_locks(trans); - - if (unlikely(trans->restarted)) - return -((int) trans->restarted); - if (unlikely(trans->locked)) - goto out; - - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) { - struct get_locks_fail f; - int ret; - - if (path->should_be_locked && - (ret = btree_path_get_locks(trans, path, false, &f, - BCH_ERR_transaction_restart_relock))) { - bch2_trans_relock_fail(trans, path, &f, trace, ip); - return ret; - } - } - - trans_set_locked(trans, true); -out: - bch2_trans_verify_locks(trans); - return 0; -} - -int bch2_trans_relock(struct btree_trans *trans) -{ - return __bch2_trans_relock(trans, true, _RET_IP_); -} - -int bch2_trans_relock_notrace(struct btree_trans *trans) -{ - return __bch2_trans_relock(trans, false, _RET_IP_); -} - -void bch2_trans_unlock(struct btree_trans *trans) -{ - trans_set_unlocked(trans); - - __bch2_trans_unlock(trans); -} - -void bch2_trans_unlock_long(struct btree_trans *trans) -{ - bch2_trans_unlock(trans); - 
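
[Editor's aside: the btree_locking.h helpers removed below pack one of four lock states per btree level into two bits of path->nodes_locked, storing UNLOCKED as 0 so a fresh path needs no initialization and __ffs/__fls can find the lowest/highest locked level. A standalone sketch of the same packing arithmetic, assuming the read/intent/write ordering noted earlier:

#include <assert.h>
#include <stdio.h>

enum node_locked_type {
	NODE_UNLOCKED = -1,
	NODE_READ_LOCKED = 0,
	NODE_INTENT_LOCKED = 1,
	NODE_WRITE_LOCKED = 2,
};

/* Two bits per level: 0 means unlocked, otherwise type + 1. */
static unsigned mark_locked(unsigned nodes_locked, unsigned level,
			    enum node_locked_type type)
{
	nodes_locked &= ~(3U << (level << 1));
	nodes_locked |= (type + 1) << (level << 1);
	return nodes_locked;
}

static int locked_type(unsigned nodes_locked, unsigned level)
{
	return NODE_UNLOCKED + ((nodes_locked >> (level << 1)) & 3);
}

int main(void)
{
	unsigned nl = 0;	/* all levels start unlocked */

	nl = mark_locked(nl, 0, NODE_INTENT_LOCKED);
	nl = mark_locked(nl, 1, NODE_READ_LOCKED);

	assert(locked_type(nl, 0) == NODE_INTENT_LOCKED);
	assert(locked_type(nl, 1) == NODE_READ_LOCKED);
	assert(locked_type(nl, 2) == NODE_UNLOCKED);
	printf("nodes_locked = %#x\n", nl);	/* 0x6 */
	return 0;
}

End of aside.]
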
bch2_trans_srcu_unlock(trans); -} - -void bch2_trans_unlock_write(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) - if (btree_node_write_locked(path, l)) - bch2_btree_node_unlock_write(trans, path, path->l[l].b); -} - -int __bch2_trans_mutex_lock(struct btree_trans *trans, - struct mutex *lock) -{ - int ret = drop_locks_do(trans, (mutex_lock(lock), 0)); - - if (ret) - mutex_unlock(lock); - return ret; -} - -/* Debug */ - -void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path) -{ - if (!path->nodes_locked && btree_path_node(path, path->level)) { - /* - * A path may be uptodate and yet have nothing locked if and only if - * there is no node at path->level, which generally means we were - * iterating over all nodes and got to the end of the btree - */ - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); - BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); - } - - if (!path->nodes_locked) - return; - - for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { - int want = btree_lock_want(path, l); - int have = btree_node_locked_type_nowrite(path, l); - - BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); - - BUG_ON(is_btree_node(path, l) && want != have); - - BUG_ON(btree_node_locked(path, l) && - path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); - } -} - -static bool bch2_trans_locked(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (path->nodes_locked) - return true; - return false; -} - -void __bch2_trans_verify_locks(struct btree_trans *trans) -{ - if (!trans->locked) { - BUG_ON(bch2_trans_locked(trans)); - return; - } - - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - __bch2_btree_path_verify_locks(trans, path); -} diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h deleted file mode 100644 index f2173a3316f4a0..00000000000000 --- a/fs/bcachefs/btree_locking.h +++ /dev/null @@ -1,466 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_LOCKING_H -#define _BCACHEFS_BTREE_LOCKING_H - -/* - * Only for internal btree use: - * - * The btree iterator tracks what locks it wants to take, and what locks it - * currently has - here we have wrappers for locking/unlocking btree nodes and - * updating the iterator state - */ - -#include "btree_iter.h" -#include "six.h" - -void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); - -void bch2_trans_unlock_write(struct btree_trans *); - -static inline bool is_btree_node(struct btree_path *path, unsigned l) -{ - return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); -} - -static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) -{ - return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) - ? 
&trans->c->btree_transaction_stats[trans->fn_idx] - : NULL; -} - -/* matches six lock types */ -enum btree_node_locked_type { - BTREE_NODE_UNLOCKED = -1, - BTREE_NODE_READ_LOCKED = SIX_LOCK_read, - BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, - BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write, -}; - -static inline int btree_node_locked_type(struct btree_path *path, - unsigned level) -{ - return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); -} - -static inline int btree_node_locked_type_nowrite(struct btree_path *path, - unsigned level) -{ - int have = btree_node_locked_type(path, level); - return have == BTREE_NODE_WRITE_LOCKED - ? BTREE_NODE_INTENT_LOCKED - : have; -} - -static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) -{ - return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; -} - -static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l) -{ - return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED; -} - -static inline bool btree_node_read_locked(struct btree_path *path, unsigned l) -{ - return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED; -} - -static inline bool btree_node_locked(struct btree_path *path, unsigned level) -{ - return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED; -} - -static inline void mark_btree_node_locked_noreset(struct btree_path *path, - unsigned level, - enum btree_node_locked_type type) -{ - /* relying on this to avoid a branch */ - BUILD_BUG_ON(SIX_LOCK_read != 0); - BUILD_BUG_ON(SIX_LOCK_intent != 1); - - path->nodes_locked &= ~(3U << (level << 1)); - path->nodes_locked |= (type + 1) << (level << 1); -} - -static inline void mark_btree_node_locked(struct btree_trans *trans, - struct btree_path *path, - unsigned level, - enum btree_node_locked_type type) -{ - mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[level].lock_taken_time = local_clock(); -#endif -} - -static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) -{ - return level < path->locks_want - ? 
SIX_LOCK_intent - : SIX_LOCK_read; -} - -static inline enum btree_node_locked_type -btree_lock_want(struct btree_path *path, int level) -{ - if (level < path->level) - return BTREE_NODE_UNLOCKED; - if (level < path->locks_want) - return BTREE_NODE_INTENT_LOCKED; - if (level == path->level) - return BTREE_NODE_READ_LOCKED; - return BTREE_NODE_UNLOCKED; -} - -static void btree_trans_lock_hold_time_update(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times, - path->l[level].lock_taken_time, - local_clock()); -#endif -} - -/* unlock: */ - -void bch2_btree_node_unlock_write(struct btree_trans *, - struct btree_path *, struct btree *); - -static inline void btree_node_unlock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - int lock_type = btree_node_locked_type(path, level); - - EBUG_ON(level >= BTREE_MAX_DEPTH); - - if (lock_type != BTREE_NODE_UNLOCKED) { - if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) { - bch2_btree_node_unlock_write(trans, path, path->l[level].b); - lock_type = BTREE_NODE_INTENT_LOCKED; - } - six_unlock_type(&path->l[level].b->c.lock, lock_type); - btree_trans_lock_hold_time_update(trans, path, level); - mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); - } -} - -static inline int btree_path_lowest_level_locked(struct btree_path *path) -{ - return __ffs(path->nodes_locked) >> 1; -} - -static inline int btree_path_highest_level_locked(struct btree_path *path) -{ - return __fls(path->nodes_locked) >> 1; -} - -static inline void __bch2_btree_path_unlock(struct btree_trans *trans, - struct btree_path *path) -{ - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK); - - while (path->nodes_locked) - btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); -} - -/* - * Updates the saved lock sequence number, so that bch2_btree_node_relock() will - * succeed: - */ -static inline void -__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b) -{ - if (!b->c.lock.write_lock_recurse) { - struct btree_path *linked; - unsigned i; - - trans_for_each_path_with_node(trans, b, linked, i) - linked->l[b->c.level].lock_seq++; - } - - six_unlock_write(&b->c.lock); -} - -static inline void -bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, - struct btree *b) -{ - EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); - EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); - - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - __bch2_btree_node_unlock_write(trans, b); -} - -int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); - -/* lock: */ - -static inline void trans_set_locked(struct btree_trans *trans, bool try) -{ - if (!trans->locked) { - lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_); - trans->locked = true; - trans->last_unlock_ip = 0; - - trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0; - current->flags |= PF_MEMALLOC_NOFS; - } -} - -static inline void trans_set_unlocked(struct btree_trans *trans) -{ - if (trans->locked) { - lock_release(&trans->dep_map, _THIS_IP_); - trans->locked = false; - trans->last_unlock_ip = _RET_IP_; - - if (!trans->pf_memalloc_nofs) - current->flags &= ~PF_MEMALLOC_NOFS; - } -} - -static inline int __btree_node_lock_nopath(struct btree_trans *trans, - struct 
btree_bkey_cached_common *b, - enum six_lock_type type, - bool lock_may_not_fail, - unsigned long ip) -{ - trans->lock_may_not_fail = lock_may_not_fail; - trans->lock_must_abort = false; - trans->locking = b; - - int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans, ip); - WRITE_ONCE(trans->locking, NULL); - WRITE_ONCE(trans->locking_wait.start_time, 0); - - if (!ret) - trace_btree_path_lock(trans, _THIS_IP_, b); - return ret; -} - -static inline int __must_check -btree_node_lock_nopath(struct btree_trans *trans, - struct btree_bkey_cached_common *b, - enum six_lock_type type, - unsigned long ip) -{ - return __btree_node_lock_nopath(trans, b, type, false, ip); -} - -static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, - struct btree_bkey_cached_common *b, - enum six_lock_type type) -{ - int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_); - - BUG_ON(ret); -} - -/* - * Lock a btree node if we already have it locked on one of our linked - * iterators: - */ -static inline bool btree_node_lock_increment(struct btree_trans *trans, - struct btree_bkey_cached_common *b, - unsigned level, - enum btree_node_locked_type want) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (&path->l[level].b->c == b && - btree_node_locked_type(path, level) >= want) { - six_lock_increment(&b->lock, (enum six_lock_type) want); - return true; - } - - return false; -} - -static inline int btree_node_lock(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b, - unsigned level, - enum six_lock_type type, - unsigned long ip) -{ - int ret = 0; - - EBUG_ON(level >= BTREE_MAX_DEPTH); - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if (likely(six_trylock_type(&b->lock, type)) || - btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || - !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[b->level].lock_taken_time = local_clock(); -#endif - } - - return ret; -} - -int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *, - struct btree_bkey_cached_common *b, bool); - -static inline int __btree_node_lock_write(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b, - bool lock_may_not_fail) -{ - EBUG_ON(&path->l[b->level].b->c != b); - EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock)); - EBUG_ON(!btree_node_intent_locked(path, b->level)); - - /* - * six locks are unfair, and read locks block while a thread wants a - * write lock: thus, we need to tell the cycle detector we have a write - * lock _before_ taking the lock: - */ - mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED); - - return likely(six_trylock_write(&b->lock)) - ? 
0 - : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); -} - -static inline int __must_check -bch2_btree_node_lock_write(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b) -{ - return __btree_node_lock_write(trans, path, b, false); -} - -void bch2_btree_node_lock_write_nofail(struct btree_trans *, - struct btree_path *, - struct btree_bkey_cached_common *); - -/* relock: */ - -bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *); -int __bch2_btree_path_relock(struct btree_trans *, - struct btree_path *, unsigned long); - -static inline int bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - return btree_node_locked(path, path->level) - ? 0 - : __bch2_btree_path_relock(trans, path, trace_ip); -} - -bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); - -static inline bool bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - EBUG_ON(btree_node_locked(path, level) && - !btree_node_write_locked(path, level) && - btree_node_locked_type(path, level) != __btree_lock_want(path, level)); - - return likely(btree_node_locked(path, level)) || - (!IS_ERR_OR_NULL(path->l[level].b) && - __bch2_btree_node_relock(trans, path, level, true)); -} - -static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - EBUG_ON(btree_node_locked(path, level) && - btree_node_locked_type_nowrite(path, level) != - __btree_lock_want(path, level)); - - return likely(btree_node_locked(path, level)) || - (!IS_ERR_OR_NULL(path->l[level].b) && - __bch2_btree_node_relock(trans, path, level, false)); -} - -/* upgrade */ - -bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned); - -static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - return new_locks_want > path->locks_want - ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want) - : true; -} - -int __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned); - -static inline int bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - - return likely(path->locks_want >= new_locks_want && path->nodes_locked) - ? 
0 - : __bch2_btree_path_upgrade(trans, path, new_locks_want); -} - -/* misc: */ - -static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path) -{ - EBUG_ON(!btree_node_locked(path, path->level)); - EBUG_ON(path->uptodate); - - if (!path->should_be_locked) { - path->should_be_locked = true; - trace_btree_path_should_be_locked(trans, path); - } -} - -static inline void __btree_path_set_level_up(struct btree_trans *trans, - struct btree_path *path, - unsigned l) -{ - btree_node_unlock(trans, path, l); - path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up); -} - -static inline void btree_path_set_level_up(struct btree_trans *trans, - struct btree_path *path) -{ - __btree_path_set_level_up(trans, path, path->level++); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); -} - -/* debug */ - -struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, - struct btree_path *, - struct btree_bkey_cached_common *b, - unsigned); - -int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); - -void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *); -void __bch2_trans_verify_locks(struct btree_trans *); - -static inline void bch2_btree_path_verify_locks(struct btree_trans *trans, - struct btree_path *path) -{ - if (static_branch_unlikely(&bch2_debug_check_btree_locking)) - __bch2_btree_path_verify_locks(trans, path); -} - -static inline void bch2_trans_verify_locks(struct btree_trans *trans) -{ - if (static_branch_unlikely(&bch2_debug_check_btree_locking)) - __bch2_trans_verify_locks(trans); -} - -#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c deleted file mode 100644 index a3fb07c60e25f0..00000000000000 --- a/fs/bcachefs/btree_node_scan.c +++ /dev/null @@ -1,611 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_journal_iter.h" -#include "btree_node_scan.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "error.h" -#include "journal_io.h" -#include "recovery_passes.h" - -#include -#include -#include -#include - -struct find_btree_nodes_worker { - struct closure *cl; - struct find_btree_nodes *f; - struct bch_dev *ca; -}; - -static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) -{ - bch2_btree_id_level_to_text(out, n->btree_id, n->level); - prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ", - n->seq, n->journal_seq, n->cookie); - bch2_bpos_to_text(out, n->min_key); - prt_str(out, "-"); - bch2_bpos_to_text(out, n->max_key); - - if (n->range_updated) - prt_str(out, " range updated"); - - for (unsigned i = 0; i < n->nr_ptrs; i++) { - prt_char(out, ' '); - bch2_extent_ptr_to_text(out, c, n->ptrs + i); - } -} - -static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) -{ - printbuf_indent_add(out, 2); - darray_for_each(nodes, i) { - found_btree_node_to_text(out, c, i); - prt_newline(out); - } - printbuf_indent_sub(out, 2); -} - -static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) -{ - struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); - - set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); - bp->k.p = f->max_key; - bp->v.seq = cpu_to_le64(f->cookie); - bp->v.sectors_written = 0; - bp->v.flags = 0; - bp->v.sectors_written = cpu_to_le16(f->sectors_written); - 
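
[Editor's aside: the comparators below chain cmp_int() results through the GNU ?: extension — each field is compared in priority order, the first nonzero result wins, and a leading minus flips a field to descending order. A standalone sketch of the idiom, with cmp_int() as commonly defined; the ?: form without a middle operand requires GCC or Clang:

#include <stdio.h>
#include <stdlib.h>

#define cmp_int(l, r) (((l) > (r)) - ((l) < (r)))

struct node {
	int btree_id;
	int level;
	long min_key;
};

/* Sort by btree_id ascending, then level descending, then min_key
 * ascending - the first nonzero comparison decides. */
static int node_cmp(const void *_l, const void *_r)
{
	const struct node *l = _l, *r = _r;

	return cmp_int(l->btree_id, r->btree_id) ?:
	       -cmp_int(l->level, r->level) ?:
	       cmp_int(l->min_key, r->min_key);
}

int main(void)
{
	struct node v[] = {
		{ 1, 0, 10 },
		{ 1, 2, 30 },
		{ 0, 1, 99 },
	};

	qsort(v, 3, sizeof(v[0]), node_cmp);
	for (int i = 0; i < 3; i++)
		printf("id=%d level=%d min=%ld\n",
		       v[i].btree_id, v[i].level, v[i].min_key);
	/* 0/1/99 first, then 1/2/30 (higher level first), then 1/0/10 */
	return 0;
}

End of aside.]
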
bp->v.min_key = f->min_key; - SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); - memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); -} - -static inline u64 bkey_journal_seq(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode_v3: - return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq); - default: - return 0; - } -} - -static int found_btree_node_cmp_cookie(const void *_l, const void *_r) -{ - const struct found_btree_node *l = _l; - const struct found_btree_node *r = _r; - - return cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - cmp_int(l->cookie, r->cookie); -} - -/* - * Given two found btree nodes, if their sequence numbers are equal, take the - * one that's readable: - */ -static int found_btree_node_cmp_time(const struct found_btree_node *l, - const struct found_btree_node *r) -{ - return cmp_int(l->seq, r->seq) ?: - cmp_int(l->journal_seq, r->journal_seq); -} - -static int found_btree_node_cmp_pos(const void *_l, const void *_r) -{ - const struct found_btree_node *l = _l; - const struct found_btree_node *r = _r; - - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->min_key, r->min_key) ?: - -found_btree_node_cmp_time(l, r); -} - -static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) -{ - return found_btree_node_cmp_pos(l, r) < 0; -} - -static inline void found_btree_node_swap(void *_l, void *_r, void *arg) -{ - struct found_btree_node *l = _l; - struct found_btree_node *r = _r; - - swap(*l, *r); -} - -static const struct min_heap_callbacks found_btree_node_heap_cbs = { - .less = found_btree_node_cmp_pos_less, - .swp = found_btree_node_swap, -}; - -static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, - struct btree *b, struct bio *bio, u64 offset) -{ - struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); - struct btree_node *bn = b->data; - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, b->data, c->opts.block_size); - - u64 submit_time = local_clock(); - submit_bio_wait(bio); - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, - "IO error in try_read_btree_node() at %llu: %s", - offset, bch2_blk_status_to_str(bio->bi_status)); - return; - } - - if (le64_to_cpu(bn->magic) != bset_magic(c)) - return; - - if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { - if (!c->chacha20_key_set) - return; - - struct nonce nonce = btree_nonce(&bn->keys, 0); - unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - - bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); - } - - if (btree_id_is_alloc(BTREE_NODE_ID(bn))) - return; - - if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) - return; - - if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) - return; - - rcu_read_lock(); - struct found_btree_node n = { - .btree_id = BTREE_NODE_ID(bn), - .level = BTREE_NODE_LEVEL(bn), - .seq = BTREE_NODE_SEQ(bn), - .cookie = le64_to_cpu(bn->keys.seq), - .min_key = bn->min_key, - .max_key = bn->max_key, - .nr_ptrs = 1, - .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, - .ptrs[0].offset = offset, - .ptrs[0].dev = ca->dev_idx, - .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)), - }; - rcu_read_unlock(); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, b->data, c->opts.btree_node_size); - - 
-	submit_time = local_clock();
-	submit_bio_wait(bio);
-	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
-
-	found_btree_node_to_key(&b->key, &n);
-
-	CLASS(printbuf, buf)();
-	if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
-		/* read_done will swap out b->data for another buffer */
-		bn = b->data;
-		/*
-		 * Grab journal_seq here because we want the max journal_seq of
-		 * any bset; read_done sorts down to a single set and picks the
-		 * max journal_seq
-		 */
-		n.journal_seq = le64_to_cpu(bn->keys.journal_seq);
-		n.sectors_written = b->written;
-
-		mutex_lock(&f->lock);
-		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
-			bch_err(c, "try_read_btree_node() can't handle endian conversion");
-			f->ret = -EINVAL;
-			goto unlock;
-		}
-
-		if (darray_push(&f->nodes, n))
-			f->ret = -ENOMEM;
-unlock:
-		mutex_unlock(&f->lock);
-	}
-}
-
-static int read_btree_nodes_worker(void *p)
-{
-	struct find_btree_nodes_worker *w = p;
-	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
-	struct bch_dev *ca = w->ca;
-	unsigned long last_print = jiffies;
-	struct btree *b = NULL;
-	struct bio *bio = NULL;
-
-	b = __bch2_btree_node_mem_alloc(c);
-	if (!b) {
-		bch_err(c, "read_btree_nodes_worker: error allocating buf");
-		w->f->ret = -ENOMEM;
-		goto err;
-	}
-
-	bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
-	if (!bio) {
-		bch_err(c, "read_btree_nodes_worker: error allocating bio");
-		w->f->ret = -ENOMEM;
-		goto err;
-	}
-
-	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
-		for (unsigned bucket_offset = 0;
-		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
-		     bucket_offset += btree_sectors(c)) {
-			if (time_after(jiffies, last_print + HZ * 30)) {
-				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
-				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
-
-				bch_info(ca, "%s: %2u%% done", __func__,
-					 (unsigned) div64_u64(cur_sector * 100, end_sector));
-				last_print = jiffies;
-			}
-
-			u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
-
-			if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
-			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
-				continue;
-
-			try_read_btree_node(w->f, ca, b, bio, sector);
-		}
-err:
-	if (b)
-		__btree_node_data_free(b);
-	kfree(b);
-	bio_put(bio);
-	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
-	closure_put(w->cl);
-	kfree(w);
-	return 0;
-}
-
-static int read_btree_nodes(struct find_btree_nodes *f)
-{
-	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
-	struct closure cl;
-	int ret = 0;
-
-	closure_init_stack(&cl);
-
-	for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
-		if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
-			continue;
-
-		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
-		if (!w) {
-			enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
-			ret = -ENOMEM;
-			goto err;
-		}
-
-		w->cl = &cl;
-		w->f = f;
-		w->ca = ca;
-
-		struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
-		ret = PTR_ERR_OR_ZERO(t);
-		if (ret) {
-			enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
-			kfree(w);
-			bch_err_msg(c, ret, "starting kthread");
-			break;
-		}
-
-		closure_get(&cl);
-		enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
-		wake_up_process(t);
-	}
-err:
-	while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2))
-		;
-	return f->ret ?: ret;
-}
-
-static bool nodes_overlap(const struct found_btree_node *l,
-			  const struct found_btree_node *r)
-{
-	return (l->btree_id == r->btree_id &&
-		l->level == r->level &&
-		bpos_gt(l->max_key, r->min_key));
-}
-
-static int handle_overwrites(struct bch_fs *c,
-			     struct found_btree_node *l,
-			     found_btree_nodes *nodes_heap)
-{
-	struct found_btree_node *r;
-
-	while ((r = min_heap_peek(nodes_heap)) &&
-	       nodes_overlap(l, r)) {
-		int cmp = found_btree_node_cmp_time(l, r);
-
-		if (cmp > 0) {
-			if (bpos_cmp(l->max_key, r->max_key) >= 0)
-				min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
-			else {
-				r->range_updated = true;
-				r->min_key = bpos_successor(l->max_key);
-				min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
-			}
-		} else if (cmp < 0) {
-			BUG_ON(bpos_eq(l->min_key, r->min_key));
-
-			l->max_key = bpos_predecessor(r->min_key);
-			l->range_updated = true;
-		} else if (r->level) {
-			min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
-		} else {
-			if (bpos_cmp(l->max_key, r->max_key) >= 0)
-				min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
-			else {
-				r->range_updated = true;
-				r->min_key = bpos_successor(l->max_key);
-				min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
-			}
-		}
-
-		cond_resched();
-	}
-
-	return 0;
-}
-
-int bch2_scan_for_btree_nodes(struct bch_fs *c)
-{
-	struct find_btree_nodes *f = &c->found_btree_nodes;
-	struct printbuf buf = PRINTBUF;
-	found_btree_nodes nodes_heap = {};
-	size_t dst;
-	int ret = 0;
-
-	if (f->nodes.nr)
-		return 0;
-
-	mutex_init(&f->lock);
-
-	ret = read_btree_nodes(f);
-	if (ret)
-		return ret;
-
-	if (!f->nodes.nr) {
-		bch_err(c, "%s: no btree nodes found", __func__);
-		ret = -EINVAL;
-		goto err;
-	}
-
-	if (0 && c->opts.verbose) {
-		printbuf_reset(&buf);
-		prt_printf(&buf, "%s: nodes found:\n", __func__);
-		found_btree_nodes_to_text(&buf, c, f->nodes);
-		bch2_print_str(c, KERN_INFO, buf.buf);
-	}
-
-	sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
-
-	dst = 0;
-	darray_for_each(f->nodes, i) {
-		struct found_btree_node *prev = dst ?
f->nodes.data + dst - 1 : NULL; - - if (prev && - prev->cookie == i->cookie) { - if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { - bch_err(c, "%s: found too many replicas for btree node", __func__); - ret = -EINVAL; - goto err; - } - prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; - } else { - f->nodes.data[dst++] = *i; - } - } - f->nodes.nr = dst; - - sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); - - if (0 && c->opts.verbose) { - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_str(c, KERN_INFO, buf.buf); - } - - swap(nodes_heap, f->nodes); - - { - /* darray must have same layout as a heap */ - min_heap_char real_heap; - BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); - BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); - BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); - BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); - } - - min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); - - if (nodes_heap.nr) { - ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); - if (ret) - goto err; - - min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); - } - - while (true) { - ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); - if (ret) - goto err; - - if (!nodes_heap.nr) - break; - - ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); - if (ret) - goto err; - - min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); - } - - for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) - BUG_ON(nodes_overlap(n, n + 1)); - - if (0 && c->opts.verbose) { - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_str(c, KERN_INFO, buf.buf); - } else { - bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); - } - - eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); -err: - darray_exit(&nodes_heap); - printbuf_exit(&buf); - return ret; -} - -static int found_btree_node_range_start_cmp(const void *_l, const void *_r) -{ - const struct found_btree_node *l = _l; - const struct found_btree_node *r = _r; - - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->max_key, r->min_key); -} - -#define for_each_found_btree_node_in_range(_f, _search, _idx) \ - for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ - sizeof((_f)->nodes.data[0]), \ - found_btree_node_range_start_cmp, &search); \ - _idx < (_f)->nodes.nr && \ - (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ - (_f)->nodes.data[_idx].level == _search.level && \ - bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ - _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) - -bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) -{ - struct find_btree_nodes *f = &c->found_btree_nodes; - - struct found_btree_node search = { - .btree_id = b->c.btree_id, - .level = b->c.level, - .min_key = b->data->min_key, - .max_key = b->key.k.p, - }; - - for_each_found_btree_node_in_range(f, search, idx) - if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) - return true; - return false; -} - -int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) -{ - int ret = bch2_run_print_explicit_recovery_pass(c, 
BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - return ret; - - struct found_btree_node search = { - .btree_id = btree, - .level = 0, - .min_key = POS_MIN, - .max_key = SPOS_MAX, - }; - - for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) - return true; - return false; -} - -int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, - unsigned level, struct bpos node_min, struct bpos node_max) -{ - if (btree_id_is_alloc(btree)) - return 0; - - struct find_btree_nodes *f = &c->found_btree_nodes; - - int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - return ret; - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "recovery "); - bch2_btree_id_level_to_text(&buf, btree, level); - prt_str(&buf, " "); - bch2_bpos_to_text(&buf, node_min); - prt_str(&buf, " - "); - bch2_bpos_to_text(&buf, node_max); - - bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - struct found_btree_node search = { - .btree_id = btree, - .level = level, - .min_key = node_min, - .max_key = node_max, - }; - - for_each_found_btree_node_in_range(f, search, idx) { - struct found_btree_node n = f->nodes.data[idx]; - - n.range_updated |= bpos_lt(n.min_key, node_min); - n.min_key = bpos_max(n.min_key, node_min); - - n.range_updated |= bpos_gt(n.max_key, node_max); - n.max_key = bpos_min(n.max_key, node_max); - - struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; - - found_btree_node_to_key(&tmp.k, &n); - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); - bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = level + 1, - .btree = btree, - })); - - ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); - if (ret) - return ret; - } - - return 0; -} - -void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) -{ - darray_exit(&f->nodes); -} diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h deleted file mode 100644 index 66e6f9ed19d04b..00000000000000 --- a/fs/bcachefs/btree_node_scan.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_NODE_SCAN_H -#define _BCACHEFS_BTREE_NODE_SCAN_H - -int bch2_scan_for_btree_nodes(struct bch_fs *); -bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); -int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); -int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); -void bch2_find_btree_nodes_exit(struct find_btree_nodes *); - -#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */ diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h deleted file mode 100644 index 2811b6857c970d..00000000000000 --- a/fs/bcachefs/btree_node_scan_types.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H -#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H - -#include "darray.h" - -struct found_btree_node { - bool range_updated:1; - u8 btree_id; - u8 level; - unsigned sectors_written; - u32 seq; - u64 journal_seq; - u64 cookie; - - struct bpos min_key; - struct bpos max_key; - - unsigned nr_ptrs; - struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; -}; - -typedef DARRAY(struct found_btree_node) found_btree_nodes; - -struct find_btree_nodes 
{ - int ret; - struct mutex lock; - found_btree_nodes nodes; -}; - -#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c deleted file mode 100644 index 639ef75b3dbd04..00000000000000 --- a/fs/bcachefs/btree_trans_commit.c +++ /dev/null @@ -1,1121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_journal_iter.h" -#include "btree_key_cache.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "disk_accounting.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "replicas.h" -#include "snapshot.h" - -#include -#include - -static const char * const trans_commit_flags_strs[] = { -#define x(n, ...) #n, - BCH_TRANS_COMMIT_FLAGS() -#undef x - NULL -}; - -void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags) -{ - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - - prt_printf(out, "watermark=%s", bch2_watermarks[watermark]); - - flags >>= BCH_WATERMARK_BITS; - if (flags) { - prt_char(out, ' '); - bch2_prt_bitflags(out, trans_commit_flags_strs, flags); - } -} - -static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u); - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); - - if (j_k) - k = bkey_i_to_s_c(j_k); - } - - u = *k.k; - u.needs_whiteout = i->old_k.needs_whiteout; - - BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); - BUG_ON(i->old_v != k.v); -#endif -} - -static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i) -{ - return (trans->paths + i->path)->l + i->level; -} - -static inline bool same_leaf_as_prev(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i != trans->updates && - insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b; -} - -static inline bool same_leaf_as_next(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i + 1 < trans->updates + trans->nr_updates && - insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b; -} - -inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - struct bch_fs *c = trans->c; - - if (unlikely(btree_node_just_written(b)) && - bch2_btree_post_write_cleanup(c, b)) - bch2_trans_node_reinit_iter(trans, b); - - /* - * If the last bset has been written, or if it's gotten too big - start - * a new bset to insert into: - */ - if (want_new_bset(c, b)) - bch2_btree_init_next(trans, b); -} - -static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int bch2_trans_lock_write(struct btree_trans *trans) -{ - 
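-	/*
-	 * Take the write lock on every leaf this transaction updates exactly
-	 * once - updates that land in the same leaf as the previous one are
-	 * skipped - and unwind via trans_lock_write_fail() if any lock
-	 * attempt fails:
-	 */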
EBUG_ON(trans->write_locked); - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b); - } - - trans->write_locked = true; - return 0; -} - -static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans) -{ - if (likely(trans->write_locked)) { - trans_for_each_update(trans, i) - if (btree_node_locked_type(trans->paths + i->path, i->level) == - BTREE_NODE_WRITE_LOCKED) - bch2_btree_node_unlock_write_inlined(trans, - trans->paths + i->path, insert_l(trans, i)->b); - trans->write_locked = false; - } -} - -/* Inserting into a given leaf node (last stage of insert): */ - -/* Handle overwrites and do insert, for non extents: */ -bool bch2_btree_bset_insert_key(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_i *insert) -{ - struct bkey_packed *k; - unsigned clobber_u64s = 0, new_u64s = 0; - - EBUG_ON(btree_node_just_written(b)); - EBUG_ON(bset_written(b, btree_bset_last(b))); - EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); - EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); - EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); - kmsan_check_memory(insert, bkey_bytes(&insert->k)); - - k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) - k = NULL; - - /* @k is the key being overwritten/deleted, if any: */ - EBUG_ON(k && bkey_deleted(k)); - - /* Deleting, but not found? 
nothing to do: */ - if (bkey_deleted(&insert->k) && !k) - return false; - - if (bkey_deleted(&insert->k)) { - /* Deleting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - if (k->needs_whiteout) - push_whiteout(b, insert->k.p); - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - bch2_bset_delete(b, k, clobber_u64s); - goto fix_iter; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - - return true; - } - - if (k) { - /* Overwriting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - insert->k.needs_whiteout = k->needs_whiteout; - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - goto overwrite; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - } - - k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -overwrite: - bch2_bset_insert(b, k, insert, clobber_u64s); - new_u64s = k->u64s; -fix_iter: - if (clobber_u64s != new_u64s) - bch2_btree_node_iter_fix(trans, path, b, node_iter, k, - clobber_u64s, new_u64s); - return true; -} - -static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, - unsigned i, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_write *w = container_of(pin, struct btree_write, journal); - struct btree *b = container_of(w, struct btree, writes[i]); - struct btree_trans *trans = bch2_trans_get(c); - unsigned long old, new; - unsigned idx = w - b->writes; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - - old = READ_ONCE(b->flags); - do { - new = old; - - if (!(old & (1 << BTREE_NODE_dirty)) || - !!(old & (1 << BTREE_NODE_write_idx)) != idx || - w->journal.seq != seq) - break; - - new &= ~BTREE_WRITE_TYPE_MASK; - new |= BTREE_WRITE_journal_reclaim; - new |= 1 << BTREE_NODE_need_write; - } while (!try_cmpxchg(&b->flags, &old, new)); - - btree_node_write_if_need(trans, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - - bch2_trans_put(trans); - return 0; -} - -int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 0, seq); -} - -int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 1, seq); -} - -inline void bch2_btree_add_journal_pin(struct bch_fs *c, - struct btree *b, u64 seq) -{ - struct btree_write *w = btree_current_write(b); - - bch2_journal_pin_add(&c->journal, seq, &w->journal, - btree_node_write_idx(b) == 0 - ? 
bch2_btree_node_flush0
-			     : bch2_btree_node_flush1);
-}
-
-/**
- * bch2_btree_insert_key_leaf() - insert a key into a leaf node
- * @trans:		btree transaction object
- * @path:		path pointing to @insert's pos
- * @insert:		key to insert
- * @journal_seq:	sequence number of journal reservation
- */
-inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
-				       struct btree_path *path,
-				       struct bkey_i *insert,
-				       u64 journal_seq)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *b = path_l(path)->b;
-	struct bset_tree *t = bset_tree_last(b);
-	struct bset *i = bset(b, t);
-	int old_u64s = bset_u64s(t);
-	int old_live_u64s = b->nr.live_u64s;
-	int live_u64s_added, u64s_added;
-
-	if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
-					&path_l(path)->iter, insert)))
-		return;
-
-	i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
-
-	bch2_btree_add_journal_pin(c, b, journal_seq);
-
-	if (unlikely(!btree_node_dirty(b))) {
-		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-		set_btree_node_dirty_acct(c, b);
-	}
-
-	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
-	u64s_added = (int) bset_u64s(t) - old_u64s;
-
-	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
-		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
-	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
-		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
-	if (u64s_added > live_u64s_added &&
-	    bch2_maybe_compact_whiteouts(c, b))
-		bch2_trans_node_reinit_iter(trans, b);
-}
-
-/* Cached btree updates: */
-
-/* Normal update interface: */
-
-static inline void btree_insert_entry_checks(struct btree_trans *trans,
-					     struct btree_insert_entry *i)
-{
-	struct btree_path *path = trans->paths + i->path;
-
-	BUG_ON(!bpos_eq(i->k->k.p, path->pos));
-	BUG_ON(i->cached != path->cached);
-	BUG_ON(i->level != path->level);
-	BUG_ON(i->btree_id != path->btree_id);
-	BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id));
-	EBUG_ON(!i->level &&
-		btree_type_has_snapshots(i->btree_id) &&
-		!(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
-		test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
-		i->k->k.p.snapshot &&
-		bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
-}
-
-static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
-						      unsigned flags)
-{
-	return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
-				    trans->journal_u64s, flags, trans);
-}
-
-#define JSET_ENTRY_LOG_U64s		4
-
-static noinline void journal_transaction_name(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	struct journal *j = &c->journal;
-	struct jset_entry *entry =
-		bch2_journal_add_entry(j, &trans->journal_res,
-				       BCH_JSET_ENTRY_log, 0, 0,
-				       JSET_ENTRY_LOG_U64s);
-	struct jset_entry_log *l =
-		container_of(entry, struct jset_entry_log, entry);
-
-	memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64),
-		       trans->fn, strlen(trans->fn), 0);
-}
-
-static inline int btree_key_can_insert(struct btree_trans *trans,
-				       struct btree *b, unsigned u64s)
-{
-	if (!bch2_btree_node_insert_fits(b, u64s))
-		return bch_err_throw(trans->c, btree_insert_btree_node_full);
-
-	return 0;
-}
-
-noinline static int
-btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
-				     struct btree_path *path, unsigned new_u64s)
-{
-	struct bkey_cached *ck = (void *) path->l[0].b;
-	struct bkey_i *new_k;
-	int ret;
-
-	bch2_trans_unlock_updates_write(trans);
-	bch2_trans_unlock(trans);
-
-	new_k = kmalloc(new_u64s *
sizeof(u64), GFP_KERNEL); - if (!new_k) { - struct bch_fs *c = trans->c; - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(path->btree_id), new_u64s); - return bch_err_throw(c, ENOMEM_btree_key_cache_insert); - } - - ret = bch2_trans_relock(trans) ?: - bch2_trans_lock_write(trans); - if (unlikely(ret)) { - kfree(new_k); - return ret; - } - - memcpy(new_k, ck->k, ck->u64s * sizeof(u64)); - - trans_for_each_update(trans, i) - if (i->old_v == &ck->k->v) - i->old_v = &new_k->v; - - kfree(ck->k); - ck->u64s = new_u64s; - ck->k = new_k; - return 0; -} - -static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, - struct btree_path *path, unsigned u64s) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) path->l[0].b; - unsigned new_u64s; - struct bkey_i *new_k; - unsigned watermark = flags & BCH_WATERMARK_MASK; - - EBUG_ON(path->level); - - if (watermark < BCH_WATERMARK_reclaim && - !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(c)) - return bch_err_throw(c, btree_insert_need_journal_reclaim); - - /* - * bch2_varint_decode can read past the end of the buffer by at most 7 - * bytes (it won't be used): - */ - u64s += 1; - - if (u64s <= ck->u64s) - return 0; - - new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_k)) - return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); - - trans_for_each_update(trans, i) - if (i->old_v == &ck->k->v) - i->old_v = &new_k->v; - - ck->u64s = new_u64s; - ck->k = new_k; - return 0; -} - -/* Triggers: */ - -static int run_one_mem_trigger(struct btree_trans *trans, - struct btree_insert_entry *i, - unsigned flags) -{ - verify_update_old_key(trans, i); - - if (unlikely(flags & BTREE_TRIGGER_norun)) - return 0; - - struct bkey_s_c old = { &i->old_k, i->old_v }; - struct bkey_i *new = i->k; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - - if (old_ops->trigger == new_ops->trigger) - return bch2_key_trigger(trans, i->btree_id, i->level, - old, bkey_i_to_s(new), - BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags); - else - return bch2_key_trigger_new(trans, i->btree_id, i->level, - bkey_i_to_s(new), flags) ?: - bch2_key_trigger_old(trans, i->btree_id, i->level, - old, flags); -} - -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) -{ - verify_update_old_key(trans, i); - - if ((i->flags & BTREE_TRIGGER_norun) || - !btree_node_type_has_trans_triggers(i->bkey_type)) - return 0; - - /* - * Transactional triggers create new btree_insert_entries, so we can't - * pass them a pointer to a btree_insert_entry, that memory is going to - * move: - */ - struct bkey old_k = i->old_k; - struct bkey_s_c old = { &old_k, i->old_v }; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - unsigned flags = i->flags|BTREE_TRIGGER_transactional; - - if (!i->insert_trigger_run && - !i->overwrite_trigger_run && - old_ops->trigger == new_ops->trigger) { - i->overwrite_trigger_run = true; - i->insert_trigger_run = true; - return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), - BTREE_TRIGGER_insert| - BTREE_TRIGGER_overwrite|flags) ?: 1; - } else if (!i->overwrite_trigger_run) { - i->overwrite_trigger_run = true; - return bch2_key_trigger_old(trans, 
i->btree_id, i->level, old, flags) ?: 1; - } else if (!i->insert_trigger_run) { - i->insert_trigger_run = true; - return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; - } else { - return 0; - } -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - unsigned sort_id_start = 0; - - while (sort_id_start < trans->nr_updates) { - unsigned i, sort_id = trans->updates[sort_id_start].sort_order; - bool trans_trigger_run; - - /* - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being - * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop - * references before they are re-added. - * - * Running triggers will append more updates to the list of - * updates as we're walking it: - */ - do { - trans_trigger_run = false; - - for (i = sort_id_start; - i < trans->nr_updates && trans->updates[i].sort_order <= sort_id; - i++) { - if (trans->updates[i].sort_order < sort_id) { - sort_id_start = i; - continue; - } - - int ret = run_one_trans_trigger(trans, trans->updates + i); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - - sort_id_start = i; - } - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && - btree_node_type_has_trans_triggers(i->bkey_type) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); -#endif - return 0; -} - -static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) -{ - trans_for_each_update(trans, i) - if (btree_node_type_has_triggers(i->bkey_type) && - gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) { - int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); - if (ret) - return ret; - } - - return 0; -} - -static inline int -bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_trans_commit_hook *h; - unsigned u64s = 0; - int ret = 0; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); -#if 0 - /* todo: bring back dynamic fault injection */ - if (race_fault()) { - trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); - } -#endif - /* - * Check if the insert will fit in the leaf node with the write lock - * held, otherwise another thread could write the node changing the - * amount of space available: - */ - - prefetch(&trans->c->journal.flags); - - trans_for_each_update(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - u64s += i->k->k.u64s; - ret = !i->cached - ? 
btree_key_can_insert(trans, insert_l(trans, i)->b, u64s) - : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s); - if (ret) { - *stopped_at = i; - return ret; - } - - i->k->k.needs_whiteout = false; - } - - /* - * Don't get journal reservation until after we know insert will - * succeed: - */ - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { - ret = bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_NONBLOCK); - if (ret) - return ret; - - if (unlikely(trans->journal_transaction_names)) - journal_transaction_name(trans); - } - - /* - * Not allowed to fail after we've gotten our journal reservation - we - * have to use it: - */ - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(flags & BCH_TRANS_COMMIT_no_journal_res)) { - if (static_branch_unlikely(&bch2_journal_seq_verify)) - trans_for_each_update(trans, i) - i->k->k.bversion.lo = trans->journal_res.seq; - else if (static_branch_unlikely(&bch2_inject_invalid_keys)) - trans_for_each_update(trans, i) - i->k->k.bversion = MAX_VERSION; - } - - h = trans->hooks; - while (h) { - ret = h->fn(trans, h); - if (ret) - return ret; - h = h->next; - } - - struct bkey_i *accounting; - - percpu_down_read(&c->mark_lock); - for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); - accounting != btree_trans_subbuf_top(trans, &trans->accounting); - accounting = bkey_next(accounting)) { - ret = bch2_accounting_trans_commit_hook(trans, - bkey_i_to_accounting(accounting), flags); - if (ret) - goto revert_fs_usage; - } - percpu_up_read(&c->mark_lock); - - /* XXX: we only want to run this if deltas are nonzero */ - bch2_trans_account_disk_usage_change(trans); - - trans_for_each_update(trans, i) - if (btree_node_type_has_atomic_triggers(i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); - if (ret) - goto fatal_err; - } - - if (unlikely(c->gc_pos.phase)) { - ret = bch2_trans_commit_run_gc_triggers(trans); - if (ret) - goto fatal_err; - } - - struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; - - for (struct jset_entry *i = btree_trans_journal_entries_start(trans); - i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) { - ret = bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, validate_context); - if (unlikely(ret)) { - bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", - trans->fn); - goto fatal_err; - } - } - - trans_for_each_update(trans, i) { - validate_context.level = i->level; - validate_context.btree = i->btree_id; - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); - if (unlikely(ret)){ - bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - goto fatal_err; - } - btree_insert_entry_checks(trans, i); - } - - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { - struct journal *j = &c->journal; - struct jset_entry *entry; - - trans_for_each_update(trans, i) { - if (i->key_cache_already_flushed) - continue; - - if (i->flags & BTREE_UPDATE_nojournal) - continue; - - verify_update_old_key(trans, i); - - if (trans->journal_transaction_names) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_overwrite, - i->btree_id, i->level, - i->old_k.u64s); - bkey_reassemble((struct bkey_i *) entry->start, - (struct 
bkey_s_c) { &i->old_k, i->old_v }); - } - - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - i->btree_id, i->level, - i->k->k.u64s); - bkey_copy((struct bkey_i *) entry->start, i->k); - } - - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - btree_trans_journal_entries_start(trans), - trans->journal_entries.u64s); - - EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s); - - trans->journal_res.offset += trans->journal_entries.u64s; - trans->journal_res.u64s -= trans->journal_entries.u64s; - - memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_write_buffer_keys, - BTREE_ID_accounting, 0, - trans->accounting.u64s)->_data, - btree_trans_subbuf_base(trans, &trans->accounting), - trans->accounting.u64s); - - if (trans->journal_seq) - *trans->journal_seq = trans->journal_res.seq; - } - - trans_for_each_update(trans, i) { - struct btree_path *path = trans->paths + i->path; - - if (!i->cached) - bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); - else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, flags, i); - else - bch2_btree_key_cache_drop(trans, path); - } - - return 0; -fatal_err: - bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); - percpu_down_read(&c->mark_lock); -revert_fs_usage: - for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); - i != accounting; - i = bkey_next(i)) - bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags); - percpu_up_read(&c->mark_lock); - return ret; -} - -static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) -{ - /* - * Accounting keys aren't deduped in the journal: we have to compare - * each individual update against what's in the btree to see if it has - * been applied yet, and accounting updates also don't overwrite, - * they're deltas that accumulate. - */ - trans_for_each_update(trans, i) - if (i->k->k.type != KEY_TYPE_accounting) - bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); -} - -static int bch2_trans_commit_journal_pin_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - return 0; -} - -/* - * Get journal reservation, take write locks, and attempt to do btree update(s): - */ -static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - int ret = 0, u64s_delta = 0; - - for (unsigned idx = 0; idx < trans->nr_updates; idx++) { - struct btree_insert_entry *i = trans->updates + idx; - if (i->cached) - continue; - - u64s_delta += !bkey_deleted(&i->k->k) ? 
i->k->k.u64s : 0;
-		u64s_delta -= i->old_btree_u64s;
-
-		if (!same_leaf_as_next(trans, i)) {
-			if (u64s_delta <= 0) {
-				ret = bch2_foreground_maybe_merge(trans, i->path,
-								  i->level, flags);
-				if (unlikely(ret))
-					return ret;
-			}
-
-			u64s_delta = 0;
-		}
-	}
-
-	ret = bch2_trans_lock_write(trans);
-	if (unlikely(ret))
-		return ret;
-
-	ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
-
-	if (!ret && unlikely(trans->journal_replay_not_finished))
-		bch2_drop_overwrites_from_journal(trans);
-
-	bch2_trans_unlock_updates_write(trans);
-
-	if (!ret && trans->journal_pin)
-		bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
-				     trans->journal_pin,
-				     bch2_trans_commit_journal_pin_flush);
-
-	/*
-	 * Drop journal reservation after dropping write locks, since dropping
-	 * the journal reservation may kick off a journal write:
-	 */
-	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
-		bch2_journal_res_put(&c->journal, &trans->journal_res);
-
-	return ret;
-}
-
-static int journal_reclaim_wait_done(struct bch_fs *c)
-{
-	int ret = bch2_journal_error(&c->journal) ?:
-		bch2_btree_key_cache_wait_done(c);
-
-	if (!ret)
-		journal_reclaim_kick(&c->journal);
-	return ret;
-}
-
-static noinline
-int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
-			    struct btree_insert_entry *i,
-			    int ret, unsigned long trace_ip)
-{
-	struct bch_fs *c = trans->c;
-	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
-	if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) {
-		/*
-		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
-		 * flag
-		 */
-		if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
-		    watermark < BCH_WATERMARK_reclaim) {
-			ret = bch_err_throw(c, journal_reclaim_would_deadlock);
-			goto out;
-		}
-
-		ret = drop_locks_do(trans,
-			bch2_trans_journal_res_get(trans,
-					(flags & BCH_WATERMARK_MASK)|
-					JOURNAL_RES_GET_CHECK));
-		goto out;
-	}
-
-	switch (ret) {
-	case -BCH_ERR_btree_insert_btree_node_full:
-		ret = bch2_btree_split_leaf(trans, i->path, flags);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			trace_and_count(c, trans_restart_btree_node_split, trans,
-					trace_ip, trans->paths + i->path);
-		break;
-	case -BCH_ERR_btree_insert_need_mark_replicas:
-		ret = drop_locks_do(trans,
-			bch2_accounting_update_sb(trans));
-		break;
-	case -BCH_ERR_btree_insert_need_journal_reclaim:
-		bch2_trans_unlock(trans);
-
-		trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
-		track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true);
-
-		wait_event_freezable(c->journal.reclaim_wait,
-				     (ret = journal_reclaim_wait_done(c)));
-
-		track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false);
-
-		if (ret < 0)
-			break;
-
-		ret = bch2_trans_relock(trans);
-		break;
-	default:
-		BUG_ON(ret >= 0);
-		break;
-	}
-out:
-	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
-
-	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
-				(flags & BCH_TRANS_COMMIT_no_enospc), c,
-		"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
-
-	return ret;
-}
-
-/*
- * This is for updates done in the early part of fsck - btree_gc - before we've
- * gone RW. We only add the new key to the list of keys for journal replay to
- * do.
- */ -static noinline int -do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - - BUG_ON(current != c->recovery_task); - - trans_for_each_update(trans, i) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); - if (ret) - return ret; - } - - for (struct jset_entry *i = btree_trans_journal_entries_start(trans); - i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) { - if (i->type == BCH_JSET_ENTRY_btree_keys || - i->type == BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(i, k) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k); - if (ret) - return ret; - } - } - - if (i->type == BCH_JSET_ENTRY_btree_root) { - guard(mutex)(&c->btree_root_lock); - - struct btree_root *r = bch2_btree_id_root(c, i->btree_id); - - bkey_copy(&r->key, i->start); - r->level = i->level; - r->alive = true; - } - } - - for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); - i != btree_trans_subbuf_top(trans, &trans->accounting); - i = bkey_next(i)) { - int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i); - if (ret) - return ret; - } - - return 0; -} - -int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) -{ - struct btree_insert_entry *errored_at = NULL; - struct bch_fs *c = trans->c; - unsigned journal_u64s = 0; - int ret = 0; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - ret = trans_maybe_inject_restart(trans, _RET_IP_); - if (unlikely(ret)) - goto out_reset; - - if (!trans->nr_updates && - !trans->journal_entries.u64s && - !trans->accounting.u64s) - goto out_reset; - - ret = bch2_trans_commit_run_triggers(trans); - if (ret) - goto out_reset; - - if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && - unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) { - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) - ret = do_bch2_trans_commit_to_journal_replay(trans); - else - ret = bch_err_throw(c, erofs_trans_commit); - goto out_reset; - } - - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - - journal_u64s = jset_u64s(trans->accounting.u64s); - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - if (trans->journal_transaction_names) - journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); - - trans_for_each_update(trans, i) { - struct btree_path *path = trans->paths + i->path; - - EBUG_ON(!path->should_be_locked); - - ret = bch2_btree_path_upgrade(trans, path, i->level + 1); - if (unlikely(ret)) - goto out; - - EBUG_ON(!btree_node_intent_locked(path, i->level)); - - if (i->key_cache_already_flushed) - continue; - - if (i->flags & BTREE_UPDATE_nojournal) - continue; - - /* we're going to journal the key being updated: */ - journal_u64s += jset_u64s(i->k->k.u64s); - - /* and we're also going to log the overwrite: */ - if (trans->journal_transaction_names) - journal_u64s += jset_u64s(i->old_k.u64s); - } - - if (trans->extra_disk_res) { - ret = bch2_disk_reservation_add(c, trans->disk_res, - trans->extra_disk_res, - (flags & BCH_TRANS_COMMIT_no_enospc) - ? 
BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto err; - } -retry: - errored_at = NULL; - bch2_trans_verify_not_unlocked_or_in_restart(trans); - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); - - trans->journal_u64s = journal_u64s + trans->journal_entries.u64s; - - ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); - - /* make sure we didn't drop or screw up locks: */ - bch2_trans_verify_locks(trans); - - if (ret) - goto err; - - trace_and_count(c, transaction_commit, trans, _RET_IP_); -out: - if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans); -out_reset: - if (!ret) - bch2_trans_downgrade(trans); - bch2_trans_reset_updates(trans); - - return ret; -err: - ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_); - if (ret) - goto out; - - /* - * We might have done another transaction commit in the error path - - * i.e. btree write buffer flush - which will have made use of - * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is - * how the journal sequence number to pin is passed in - so we must - * restart: - */ - if (flags & BCH_TRANS_COMMIT_no_journal_res) { - ret = bch_err_throw(c, transaction_restart_nested); - goto out; - } - - goto retry; -} diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h deleted file mode 100644 index 112170fd9c8fb5..00000000000000 --- a/fs/bcachefs/btree_types.h +++ /dev/null @@ -1,937 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_TYPES_H -#define _BCACHEFS_BTREE_TYPES_H - -#include -#include - -#include "bbpos_types.h" -#include "btree_key_cache_types.h" -#include "buckets_types.h" -#include "darray.h" -#include "errcode.h" -#include "journal_types.h" -#include "replicas_types.h" -#include "six.h" - -struct open_bucket; -struct btree_update; -struct btree_trans; - -#define MAX_BSETS 3U - -struct btree_nr_keys { - - /* - * Amount of live metadata (i.e. size of node after a compaction) in - * units of u64s - */ - u16 live_u64s; - u16 bset_u64s[MAX_BSETS]; - - /* live keys only: */ - u16 packed_keys; - u16 unpacked_keys; -}; - -struct bset_tree { - /* - * We construct a binary tree in an array as if the array - * started at 1, so that things line up on the same cachelines - * better: see comments in bset.c at cacheline_to_bkey() for - * details - */ - - /* size of the binary tree and prev array */ - u16 size; - - /* function of size - precalculated for to_inorder() */ - u16 extra; - - u16 data_offset; - u16 aux_data_offset; - u16 end_offset; -}; - -struct btree_write { - struct journal_entry_pin journal; -}; - -struct btree_alloc { - struct open_buckets ob; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); -}; - -struct btree_bkey_cached_common { - struct six_lock lock; - u8 level; - u8 btree_id; - bool cached; -}; - -struct btree { - struct btree_bkey_cached_common c; - - struct rhash_head hash; - u64 hash_val; - - unsigned long flags; - u16 written; - u8 nsets; - u8 nr_key_bits; - u16 version_ondisk; - - struct bkey_format format; - - struct btree_node *data; - void *aux_data; - - /* - * Sets of sorted keys - the real btree node - plus a binary search tree - * - * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point - * to the memory we have allocated for this btree node. Additionally, - * set[0]->data points to the entire btree node as it exists on disk. 
- */ - struct bset_tree set[MAX_BSETS]; - - struct btree_nr_keys nr; - u16 sib_u64s[2]; - u16 whiteout_u64s; - u8 byte_order; - u8 unpack_fn_len; - - struct btree_write writes[2]; - - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - - /* - * XXX: add a delete sequence number, so when bch2_btree_node_relock() - * fails because the lock sequence number has changed - i.e. the - * contents were modified - we can still relock the node if it's still - * the one we want, without redoing the traversal - */ - - /* - * For asynchronous splits/interior node updates: - * When we do a split, we allocate new child nodes and update the parent - * node to point to them: we update the parent in memory immediately, - * but then we must wait until the children have been written out before - * the update to the parent can be written - this is a list of the - * btree_updates that are blocking this node from being - * written: - */ - struct list_head write_blocked; - - /* - * Also for asynchronous splits/interior node updates: - * If a btree node isn't reachable yet, we don't want to kick off - * another write - because that write also won't yet be reachable and - * marking it as completed before it's reachable would be incorrect: - */ - unsigned long will_make_reachable; - - struct open_buckets ob; - - /* lru list */ - struct list_head list; -}; - -#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ - x(cache_reserve) \ - x(lock_intent) \ - x(lock_write) \ - x(dirty) \ - x(read_in_flight) \ - x(write_in_flight) \ - x(noevict) \ - x(write_blocked) \ - x(will_make_reachable) \ - x(access_bit) - -enum bch_btree_cache_not_freed_reasons { -#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n, - BCH_BTREE_CACHE_NOT_FREED_REASONS() -#undef x - BCH_BTREE_CACHE_NOT_FREED_REASONS_NR, -}; - -struct btree_cache_list { - unsigned idx; - struct shrinker *shrink; - struct list_head list; - size_t nr; -}; - -struct btree_cache { - struct rhashtable table; - bool table_init_done; - /* - * We never free a struct btree, except on shutdown - we just put it on - * the btree_cache_freed list and reuse it later. This simplifies the - * code, and it doesn't cost us much memory as the memory usage is - * dominated by buffers that hold the actual btree node data and those - * can be freed - and the number of struct btrees allocated is - * effectively bounded. - * - * btree_cache_freeable effectively is a small cache - we use it because - * high order page allocations can be rather expensive, and it's quite - * common to delete and allocate btree nodes in quick succession. It - * should never grow past ~2-3 nodes in practice. 
- */ - struct mutex lock; - struct list_head freeable; - struct list_head freed_pcpu; - struct list_head freed_nonpcpu; - struct btree_cache_list live[2]; - - size_t nr_freeable; - size_t nr_reserve; - size_t nr_by_btree[BTREE_ID_NR]; - atomic_long_t nr_dirty; - - /* shrinker stats */ - size_t nr_freed; - u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR]; - - /* - * If we need to allocate memory for a new btree node and that - * allocation fails, we can cannibalize another node in the btree cache - * to satisfy the allocation - lock to guarantee only one thread does - * this at a time: - */ - struct task_struct *alloc_lock; - struct closure_waitlist alloc_wait; - - struct bbpos pinned_nodes_start; - struct bbpos pinned_nodes_end; - /* btree id mask: 0 for leaves, 1 for interior */ - u64 pinned_nodes_mask[2]; -}; - -struct btree_node_iter { - struct btree_node_iter_set { - u16 k, end; - } data[MAX_BSETS]; -}; - -#define BTREE_ITER_FLAGS() \ - x(slots) \ - x(intent) \ - x(prefetch) \ - x(is_extents) \ - x(not_extents) \ - x(cached) \ - x(with_key_cache) \ - x(with_updates) \ - x(with_journal) \ - x(snapshot_field) \ - x(all_snapshots) \ - x(filter_snapshots) \ - x(nopreserve) \ - x(cached_nofill) \ - x(key_cache_fill) \ - -#define STR_HASH_FLAGS() \ - x(must_create) \ - x(must_replace) - -#define BTREE_UPDATE_FLAGS() \ - x(internal_snapshot_node) \ - x(nojournal) \ - x(key_cache_reclaim) - - -/* - * BTREE_TRIGGER_norun - don't run triggers at all - * - * BTREE_TRIGGER_transactional - we're running transactional triggers as part of - * a transaction commit: triggers may generate new updates - * - * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction - * commit: we have our journal reservation, we're holding btree node write - * locks, and we know the transaction is going to commit (returning an error - * here is a fatal error, causing us to go emergency read-only) - * - * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. 
disk usage - * - * BTREE_TRIGGER_insert - @new is entering the btree - * BTREE_TRIGGER_overwrite - @old is leaving the btree - */ -#define BTREE_TRIGGER_FLAGS() \ - x(norun) \ - x(transactional) \ - x(atomic) \ - x(check_repair) \ - x(gc) \ - x(insert) \ - x(overwrite) \ - x(is_root) - -enum { -#define x(n) BTREE_ITER_FLAG_BIT_##n, - BTREE_ITER_FLAGS() - STR_HASH_FLAGS() - BTREE_UPDATE_FLAGS() - BTREE_TRIGGER_FLAGS() -#undef x -}; - -/* iter flags must fit in a u16: */ -//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15); - -enum btree_iter_update_trigger_flags { -#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - BTREE_ITER_FLAGS() -#undef x -#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - STR_HASH_FLAGS() -#undef x -#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - BTREE_UPDATE_FLAGS() -#undef x -#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - BTREE_TRIGGER_FLAGS() -#undef x -}; - -enum btree_path_uptodate { - BTREE_ITER_UPTODATE = 0, - BTREE_ITER_NEED_RELOCK = 1, - BTREE_ITER_NEED_TRAVERSE = 2, -}; - -#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG) -#define TRACK_PATH_ALLOCATED -#endif - -typedef u16 btree_path_idx_t; - -struct btree_path { - btree_path_idx_t sorted_idx; - u8 ref; - u8 intent_ref; - - /* btree_iter_copy starts here: */ - struct bpos pos; - - enum btree_id btree_id:5; - bool cached:1; - bool preserve:1; - enum btree_path_uptodate uptodate:2; - /* - * When true, failing to relock this path will cause the transaction to - * restart: - */ - bool should_be_locked:1; - unsigned level:3, - locks_want:3; - u8 nodes_locked; - - struct btree_path_level { - struct btree *b; - struct btree_node_iter iter; - u32 lock_seq; -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - u64 lock_taken_time; -#endif - } l[BTREE_MAX_DEPTH]; -#ifdef TRACK_PATH_ALLOCATED - unsigned long ip_allocated; -#endif -}; - -static inline struct btree_path_level *path_l(struct btree_path *path) -{ - return path->l + path->level; -} - -static inline unsigned long btree_path_ip_allocated(struct btree_path *path) -{ -#ifdef TRACK_PATH_ALLOCATED - return path->ip_allocated; -#else - return _THIS_IP_; -#endif -} - -/* - * @pos - iterator's current position - * @level - current btree depth - * @locks_want - btree level below which we start taking intent locks - * @nodes_locked - bitmask indicating which nodes in @nodes are locked - * @nodes_intent_locked - bitmask indicating which locks are intent locks - */ -struct btree_iter { - btree_path_idx_t path; - btree_path_idx_t update_path; - btree_path_idx_t key_cache_path; - - enum btree_id btree_id:8; - u8 min_depth; - - /* btree_iter_copy starts here: */ - u16 flags; - - /* When we're filtering by snapshot, the snapshot ID we're looking for: */ - unsigned snapshot; - - struct bpos pos; - /* - * Current unpacked key - so that bch2_btree_iter_next()/ - * bch2_btree_iter_next_slot() can correctly advance pos. - */ - struct bkey k; - - /* BTREE_ITER_with_journal: */ - size_t journal_idx; -#ifdef TRACK_PATH_ALLOCATED - unsigned long ip_allocated; -#endif -}; - -#define BKEY_CACHED_ACCESSED 0 -#define BKEY_CACHED_DIRTY 1 - -struct bkey_cached { - struct btree_bkey_cached_common c; - - unsigned long flags; - u16 u64s; - struct bkey_cached_key key; - - struct rhash_head hash; - - struct journal_entry_pin journal; - u64 seq; - - struct bkey_i *k; - struct rcu_head rcu; -}; - -static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) -{ - return !b->cached - ? 
container_of(b, struct btree, c)->key.k.p
-		: container_of(b, struct bkey_cached, c)->key.pos;
-}
-
-struct btree_insert_entry {
-	unsigned		flags;
-	u8			sort_order;
-	u8			bkey_type;
-	enum btree_id		btree_id:8;
-	u8			level:4;
-	bool			cached:1;
-	bool			insert_trigger_run:1;
-	bool			overwrite_trigger_run:1;
-	bool			key_cache_already_flushed:1;
-	/*
-	 * @old_k may be a key from the journal; @old_btree_u64s always refers
-	 * to the size of the key being overwritten in the btree:
-	 */
-	u8			old_btree_u64s;
-	btree_path_idx_t	path;
-	struct bkey_i		*k;
-	/* key being overwritten: */
-	struct bkey		old_k;
-	const struct bch_val	*old_v;
-	unsigned long		ip_allocated;
-};
-
-/* Number of btree paths we preallocate, usually enough */
-#define BTREE_ITER_INITIAL		64
-/*
- * Limit for btree_trans_too_many_iters(); this is enough that almost all code
- * paths should run inside this limit, and if they don't it usually indicates a
- * bug (leaking/duplicated btree paths).
- *
- * exception: some fsck paths
- *
- * bugs with excessive path usage seem to have possibly been eliminated now, so
- * we might consider eliminating this (and btree_trans_too_many_iter()) at some
- * point.
- */
-#define BTREE_ITER_NORMAL_LIMIT		256
-/* never exceed limit */
-#define BTREE_ITER_MAX			(1U << 10)
-
-struct btree_trans_commit_hook;
-typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
-
-struct btree_trans_commit_hook {
-	btree_trans_commit_hook_fn	*fn;
-	struct btree_trans_commit_hook	*next;
-};
-
-#define BTREE_TRANS_MEM_MAX		(1U << 16)
-
-#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS	10000
-
-struct btree_trans_paths {
-	unsigned long		nr_paths;
-	struct btree_path	paths[];
-};
-
-struct trans_kmalloc_trace {
-	unsigned long		ip;
-	size_t			bytes;
-};
-typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace;
-
-struct btree_trans_subbuf {
-	u16			base;
-	u16			u64s;
-	u16			size;
-};
-
-struct btree_trans {
-	struct bch_fs		*c;
-
-	unsigned long		*paths_allocated;
-	struct btree_path	*paths;
-	btree_path_idx_t	*sorted;
-	struct btree_insert_entry *updates;
-
-	void			*mem;
-	unsigned		mem_top;
-	unsigned		mem_bytes;
-	unsigned		realloc_bytes_required;
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-	darray_trans_kmalloc_trace trans_kmalloc_trace;
-#endif
-
-	btree_path_idx_t	nr_sorted;
-	btree_path_idx_t	nr_paths;
-	btree_path_idx_t	nr_paths_max;
-	btree_path_idx_t	nr_updates;
-	u8			fn_idx;
-	u8			lock_must_abort;
-	bool			lock_may_not_fail:1;
-	bool			srcu_held:1;
-	bool			locked:1;
-	bool			pf_memalloc_nofs:1;
-	bool			write_locked:1;
-	bool			used_mempool:1;
-	bool			in_traverse_all:1;
-	bool			paths_sorted:1;
-	bool			memory_allocation_failure:1;
-	bool			journal_transaction_names:1;
-	bool			journal_replay_not_finished:1;
-	bool			notrace_relock_fail:1;
-	enum bch_errcode	restarted:16;
-	u32			restart_count;
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
-	u32			restart_count_this_trans;
-#endif
-
-	u64			last_begin_time;
-	unsigned long		last_begin_ip;
-	unsigned long		last_restarted_ip;
-#ifdef CONFIG_BCACHEFS_DEBUG
-	bch_stacktrace		last_restarted_trace;
-#endif
-	unsigned long		last_unlock_ip;
-	unsigned long		srcu_lock_time;
-
-	const char		*fn;
-	struct btree_bkey_cached_common *locking;
-	struct six_lock_waiter	locking_wait;
-	int			srcu_idx;
-
-	/* update path: */
-	struct btree_trans_subbuf journal_entries;
-	struct btree_trans_subbuf accounting;
-
-	struct btree_trans_commit_hook *hooks;
-	struct journal_entry_pin *journal_pin;
-
-	struct journal_res	journal_res;
-	u64			*journal_seq;
-	struct disk_reservation
*disk_res; - - struct bch_fs_usage_base fs_usage_delta; - - unsigned journal_u64s; - unsigned extra_disk_res; /* XXX kill */ - - __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif - /* Entries before this are zeroed out on every bch2_trans_get() call */ - - struct list_head list; - struct closure ref; - - unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)]; - struct btree_trans_paths trans_paths; - struct btree_path _paths[BTREE_ITER_INITIAL]; - btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4]; - struct btree_insert_entry _updates[BTREE_ITER_INITIAL]; -}; - -static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter) -{ - return trans->paths + iter->path; -} - -static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter) -{ - return iter->key_cache_path - ? trans->paths + iter->key_cache_path - : NULL; -} - -#define BCH_BTREE_WRITE_TYPES() \ - x(initial, 0) \ - x(init_next_bset, 1) \ - x(cache_reclaim, 2) \ - x(journal_reclaim, 3) \ - x(interior, 4) - -enum btree_write_type { -#define x(t, n) BTREE_WRITE_##t, - BCH_BTREE_WRITE_TYPES() -#undef x - BTREE_WRITE_TYPE_NR, -}; - -#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) -#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR)) - -#define BTREE_FLAGS() \ - x(read_in_flight) \ - x(read_error) \ - x(dirty) \ - x(need_write) \ - x(write_blocked) \ - x(will_make_reachable) \ - x(noevict) \ - x(write_idx) \ - x(accessed) \ - x(write_in_flight) \ - x(write_in_flight_inner) \ - x(just_written) \ - x(dying) \ - x(fake) \ - x(need_rewrite) \ - x(need_rewrite_error) \ - x(need_rewrite_degraded) \ - x(need_rewrite_ptr_written_zero) \ - x(never_write) \ - x(pinned) - -enum btree_flags { - /* First bits for btree node write type */ - BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1, -#define x(flag) BTREE_NODE_##flag, - BTREE_FLAGS() -#undef x -}; - -#define x(flag) \ -static inline bool btree_node_ ## flag(struct btree *b) \ -{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ - \ -static inline void set_btree_node_ ## flag(struct btree *b) \ -{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ - \ -static inline void clear_btree_node_ ## flag(struct btree *b) \ -{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } - -BTREE_FLAGS() -#undef x - -#define BTREE_NODE_REWRITE_REASON() \ - x(none) \ - x(unknown) \ - x(error) \ - x(degraded) \ - x(ptr_written_zero) - -enum btree_node_rewrite_reason { -#define x(n) BTREE_NODE_REWRITE_##n, - BTREE_NODE_REWRITE_REASON() -#undef x -}; - -static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b) -{ - if (btree_node_need_rewrite_ptr_written_zero(b)) - return BTREE_NODE_REWRITE_ptr_written_zero; - if (btree_node_need_rewrite_degraded(b)) - return BTREE_NODE_REWRITE_degraded; - if (btree_node_need_rewrite_error(b)) - return BTREE_NODE_REWRITE_error; - if (btree_node_need_rewrite(b)) - return BTREE_NODE_REWRITE_unknown; - return BTREE_NODE_REWRITE_none; -} - -static inline struct btree_write *btree_current_write(struct btree *b) -{ - return b->writes + btree_node_write_idx(b); -} - -static inline struct btree_write *btree_prev_write(struct btree *b) -{ - return b->writes + (btree_node_write_idx(b) ^ 1); -} - -static inline struct bset_tree *bset_tree_last(struct btree *b) -{ - EBUG_ON(!b->nsets); - return b->set + b->nsets - 1; -} - -static inline void * 
-__btree_node_offset_to_ptr(const struct btree *b, u16 offset) -{ - return (void *) ((u64 *) b->data + offset); -} - -static inline u16 -__btree_node_ptr_to_offset(const struct btree *b, const void *p) -{ - u16 ret = (u64 *) p - (u64 *) b->data; - - EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); - return ret; -} - -static inline struct bset *bset(const struct btree *b, - const struct bset_tree *t) -{ - return __btree_node_offset_to_ptr(b, t->data_offset); -} - -static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) -{ - t->end_offset = - __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); -} - -static inline void set_btree_bset(struct btree *b, struct bset_tree *t, - const struct bset *i) -{ - t->data_offset = __btree_node_ptr_to_offset(b, i); - set_btree_bset_end(b, t); -} - -static inline struct bset *btree_bset_first(struct btree *b) -{ - return bset(b, b->set); -} - -static inline struct bset *btree_bset_last(struct btree *b) -{ - return bset(b, bset_tree_last(b)); -} - -static inline u16 -__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) -{ - return __btree_node_ptr_to_offset(b, k); -} - -static inline struct bkey_packed * -__btree_node_offset_to_key(const struct btree *b, u16 k) -{ - return __btree_node_offset_to_ptr(b, k); -} - -static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) -{ - return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); -} - -#define btree_bkey_first(_b, _t) \ -({ \ - EBUG_ON(bset(_b, _t)->start != \ - __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ - \ - bset(_b, _t)->start; \ -}) - -#define btree_bkey_last(_b, _t) \ -({ \ - EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ - vstruct_last(bset(_b, _t))); \ - \ - __btree_node_offset_to_key(_b, (_t)->end_offset); \ -}) - -static inline unsigned bset_u64s(struct bset_tree *t) -{ - return t->end_offset - t->data_offset - - sizeof(struct bset) / sizeof(u64); -} - -static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) -{ - return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; -} - -static inline unsigned bset_byte_offset(struct btree *b, void *i) -{ - return i - (void *) b->data; -} - -enum btree_node_type { - BKEY_TYPE_btree, -#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1, - BCH_BTREE_IDS() -#undef x - BKEY_TYPE_NR -}; - -/* Type of a key in btree @id at level @level: */ -static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) -{ - return level ? 
BKEY_TYPE_btree : (unsigned) id + 1; -} - -/* Type of keys @b contains: */ -static inline enum btree_node_type btree_node_type(struct btree *b) -{ - return __btree_node_type(b->c.level, b->c.btree_id); -} - -const char *bch2_btree_node_type_str(enum btree_node_type); - -#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - (BIT_ULL(BKEY_TYPE_extents)| \ - BIT_ULL(BKEY_TYPE_alloc)| \ - BIT_ULL(BKEY_TYPE_inodes)| \ - BIT_ULL(BKEY_TYPE_stripes)| \ - BIT_ULL(BKEY_TYPE_reflink)| \ - BIT_ULL(BKEY_TYPE_subvolumes)| \ - BIT_ULL(BKEY_TYPE_btree)) - -#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ - (BIT_ULL(BKEY_TYPE_alloc)| \ - BIT_ULL(BKEY_TYPE_inodes)| \ - BIT_ULL(BKEY_TYPE_stripes)| \ - BIT_ULL(BKEY_TYPE_snapshots)) - -#define BTREE_NODE_TYPE_HAS_TRIGGERS \ - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ - BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) - -static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type) -{ - return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS; -} - -static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type) -{ - return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS; -} - -static inline bool btree_node_type_has_triggers(enum btree_node_type type) -{ - return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; -} - -static inline bool btree_id_is_extents(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_node_type_is_extents(enum btree_node_type type) -{ - return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1); -} - -static inline bool btree_type_has_snapshots(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_type_has_snapshot_field(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_type_has_ptrs(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_type_uses_write_buffer(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_IS_write_buffer)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline u8 btree_trigger_order(enum btree_id btree) -{ - switch (btree) { - case BTREE_ID_alloc: - return U8_MAX; - case BTREE_ID_stripes: - return U8_MAX - 1; - default: - return btree; - } -} - -struct btree_root { - struct btree *b; - - /* On disk root - see async splits: */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - u8 level; - u8 alive; - s16 error; -}; - -enum btree_gc_coalesce_fail_reason { - BTREE_GC_COALESCE_FAIL_RESERVE_GET, - BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, - BTREE_GC_COALESCE_FAIL_FORMAT_FITS, -}; - -enum btree_node_sibling { - btree_prev_sib, - btree_next_sib, -}; - -struct get_locks_fail { - unsigned l; - struct btree *b; -}; - -#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c deleted file mode 100644 index ee657b9f4b968d..00000000000000 --- a/fs/bcachefs/btree_update.c +++ /dev/null @@ -1,916 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_update.h" -#include "btree_iter.h" -#include "btree_journal_iter.h" -#include "btree_locking.h" -#include "buckets.h" -#include "debug.h" -#include "errcode.h" -#include "error.h" -#include "extents.h" -#include "keylist.h" -#include "snapshot.h" -#include "trace.h" - -#include - -static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, - const struct btree_insert_entry *r) -{ - return cmp_int(l->sort_order, r->sort_order) ?: - cmp_int(l->cached, r->cached) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p); -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t, - struct bkey_i *, enum btree_iter_update_trigger_flags, - unsigned long ip); - -static noinline int extent_front_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bkey_i **insert, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_i *update; - int ret; - - if (unlikely(trans->journal_replay_not_finished)) - return 0; - - update = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - return ret; - - if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) - return 0; - - ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?: - bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p); - if (ret < 0) - return ret; - if (ret) - return 0; - - ret = bch2_btree_delete_at(trans, iter, flags); - if (ret) - return ret; - - *insert = update; - return 0; -} - -static noinline int extent_back_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - int ret; - - if (unlikely(trans->journal_replay_not_finished)) - return 0; - - ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: - bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); - if (ret < 0) - return ret; - if (ret) - return 0; - - bch2_bkey_merge(c, bkey_i_to_s(insert), k); - return 0; -} - -/* - * When deleting, check if we need to emit a whiteout (because we're overwriting - * something in an ancestor snapshot) - */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - u32 snapshot = pos.snapshot; - int ret; - - if 
(!bch2_snapshot_parent(trans->c, pos.snapshot)) - return 0; - - pos.snapshot++; - - for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_all_snapshots| - BTREE_ITER_nopreserve, k, ret) { - if (!bkey_eq(k.k->p, pos)) - break; - - if (bch2_snapshot_is_ancestor(trans->c, snapshot, - k.k->p.snapshot)) { - ret = !bkey_whiteout(k.k); - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - snapshot_id_list *s) -{ - int ret = 0; - - darray_for_each(*s, id) { - pos.snapshot = *id; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos, - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bkey_err(k); - if (ret) - break; - - if (k.k->type == KEY_TYPE_deleted) { - struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - ret = PTR_ERR_OR_ZERO(update); - if (ret) { - bch2_trans_iter_exit(trans, &iter); - break; - } - - bkey_init(&update->k); - update->k.p = pos; - update->k.type = KEY_TYPE_whiteout; - - ret = bch2_trans_update(trans, &iter, update, - BTREE_UPDATE_internal_snapshot_node); - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - break; - } - - darray_exit(s); - return ret; -} - -int bch2_trans_update_extent_overwrite(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_iter_update_trigger_flags flags, - struct bkey_s_c old, - struct bkey_s_c new) -{ - enum btree_id btree_id = iter->btree_id; - struct bkey_i *update; - struct bpos new_start = bkey_start_pos(new.k); - unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start); - unsigned back_split = bkey_gt(old.k->p, new.k->p); - unsigned middle_split = (front_split || back_split) && - old.k->p.snapshot != new.k->p.snapshot; - unsigned nr_splits = front_split + back_split + middle_split; - int ret = 0, compressed_sectors; - - /* - * If we're going to be splitting a compressed extent, note it - * so that __bch2_trans_commit() can increase our disk - * reservation: - */ - if (nr_splits > 1 && - (compressed_sectors = bch2_bkey_sectors_compressed(old))) - trans->extra_disk_res += compressed_sectors * (nr_splits - 1); - - if (front_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_back(new_start, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_internal_snapshot_node|flags); - if (ret) - return ret; - } - - /* If we're overwriting in a different snapshot - middle split: */ - if (middle_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_front(new_start, update); - bch2_cut_back(new.k->p, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_internal_snapshot_node|flags); - if (ret) - return ret; - } - - if (bkey_le(old.k->p, new.k->p)) { - update = bch2_trans_kmalloc(trans, sizeof(*update)); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bkey_init(&update->k); - update->k.p = old.k->p; - update->k.p.snapshot = new.k->p.snapshot; - - if (new.k->p.snapshot != old.k->p.snapshot) { - update->k.type = KEY_TYPE_whiteout; - } else if (btree_type_has_snapshots(btree_id)) { - ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); - if (ret < 
0) - return ret; - if (ret) - update->k.type = KEY_TYPE_whiteout; - } - - ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_internal_snapshot_node|flags); - if (ret) - return ret; - } - - if (back_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_front(new.k->p, update); - - ret = bch2_trans_update_by_path(trans, iter->path, update, - BTREE_UPDATE_internal_snapshot_node| - flags, _RET_IP_); - if (ret) - return ret; - } - - return 0; -} - -static int bch2_trans_update_extent(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k; - enum btree_id btree_id = orig_iter->btree_id; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_intent| - BTREE_ITER_with_updates| - BTREE_ITER_not_extents); - k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - - if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { - if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { - ret = extent_front_merge(trans, &iter, k, &insert, flags); - if (ret) - goto err; - } - - goto next; - } - - while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { - bool done = bkey_lt(insert->k.p, k.k->p); - - ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); - if (ret) - goto err; - - if (done) - goto out; -next: - bch2_btree_iter_advance(trans, &iter); - k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - } - - if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { - ret = extent_back_merge(trans, &iter, insert, k); - if (ret) - goto err; - } -out: - if (!bkey_deleted(&insert->k)) - ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); -err: - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_insert_entry *i, - enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - struct bkey k; - int ret; - - btree_path_idx_t path_idx = - bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, - BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, path_idx, 0); - if (ret) - goto out; - - struct btree_path *btree_path = trans->paths + path_idx; - - /* - * The old key in the insert entry might actually refer to an existing - * key in the btree that has been deleted from cache and not yet - * flushed. Check for this and skip the flush so we don't run triggers - * against a stale key. 
- */ - bch2_btree_path_peek_slot_exact(btree_path, &k); - if (!bkey_deleted(&k)) - goto out; - - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_norun; - - btree_path_set_should_be_locked(trans, btree_path); - ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); -out: - bch2_path_put(trans, path_idx, true); - return ret; -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i, n; - int cmp; - - struct btree_path *path = trans->paths + path_idx; - EBUG_ON(!path->should_be_locked); - EBUG_ON(trans->nr_updates >= trans->nr_paths); - EBUG_ON(!bpos_eq(k->k.p, path->pos)); - - n = (struct btree_insert_entry) { - .flags = flags, - .sort_order = btree_trigger_order(path->btree_id), - .bkey_type = __btree_node_type(path->level, path->btree_id), - .btree_id = path->btree_id, - .level = path->level, - .cached = path->cached, - .path = path_idx, - .k = k, - .ip_allocated = ip, - }; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(i != trans->updates && - btree_insert_entry_cmp(i - 1, i) >= 0); -#endif - - /* - * Pending updates are kept sorted: first, find position of new update, - * then delete/trim any updates the new update overwrites: - */ - for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) { - cmp = btree_insert_entry_cmp(&n, i); - if (cmp <= 0) - break; - } - - bool overwrite = !cmp && i < trans->updates + trans->nr_updates; - - if (overwrite) { - EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); - - bch2_path_put(trans, i->path, true); - i->flags = n.flags; - i->cached = n.cached; - i->k = n.k; - i->path = n.path; - i->ip_allocated = n.ip_allocated; - } else { - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); - - i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; - i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); - - if (j_k) { - i->old_k = j_k->k; - i->old_v = &j_k->v; - } - } - } - - __btree_path_get(trans, trans->paths + i->path, true); - - trace_update_by_path(trans, path, i, overwrite); - - /* - * If a key is present in the key cache, it must also exist in the - * btree - this is necessary for cache coherency. 
When iterating over - * a btree that's cached in the key cache, the btree iter code checks - * the key cache - but the key has to exist in the btree for that to - * work: - */ - if (path->cached && !i->old_btree_u64s) - return flush_new_cached_update(trans, i, flags, ip); - - return 0; -} - -static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, - struct btree_iter *iter, - struct btree_path *path) -{ - struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter); - - if (!key_cache_path || - !key_cache_path->should_be_locked || - !bpos_eq(key_cache_path->pos, iter->pos)) { - struct bkey_cached *ck; - int ret; - - if (!iter->key_cache_path) - iter->key_cache_path = - bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_intent| - BTREE_ITER_cached, _THIS_IP_); - - iter->key_cache_path = - bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_intent, - _THIS_IP_); - - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached); - if (unlikely(ret)) - return ret; - - ck = (void *) trans->paths[iter->key_cache_path].l[0].b; - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); - } - - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); - } - - return 0; -} - -int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - kmsan_check_memory(k, bkey_bytes(&k->k)); - - btree_path_idx_t path_idx = iter->update_path ?: iter->path; - int ret; - - if (iter->flags & BTREE_ITER_is_extents) - return bch2_trans_update_extent(trans, iter, k, flags); - - if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_key_cache_reclaim) && - (iter->flags & BTREE_ITER_filter_snapshots)) { - ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); - if (unlikely(ret < 0)) - return ret; - - if (ret) - k->k.type = KEY_TYPE_whiteout; - } - - /* - * Ensure that updates to cached btrees go to the key cache: - */ - struct btree_path *path = trans->paths + path_idx; - if (!(flags & BTREE_UPDATE_key_cache_reclaim) && - !path->cached && - !path->level && - btree_id_cached(trans->c, path->btree_id)) { - ret = bch2_trans_update_get_key_cache(trans, iter, path); - if (ret) - return ret; - - path_idx = iter->key_cache_path; - } - - return bch2_trans_update_by_path(trans, path_idx, k, flags, ip); -} - -int bch2_btree_insert_clone_trans(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) -{ - struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k)); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - bkey_copy(n, k); - return bch2_btree_insert_trans(trans, btree, n, 0); -} - -void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, - struct btree_trans_subbuf *buf, - unsigned u64s) -{ - unsigned new_top = buf->u64s + u64s; - unsigned new_size = buf->size; - - BUG_ON(roundup_pow_of_two(new_top) > U16_MAX); - - if (new_top > new_size) - new_size = roundup_pow_of_two(new_top); - - void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64)); - if (IS_ERR(n)) - return n; - - unsigned offset = (u64 *) n - (u64 *) trans->mem; - BUG_ON(offset > U16_MAX); - - if (buf->u64s) - memcpy(n, - btree_trans_subbuf_base(trans, buf), - buf->size * sizeof(u64)); - buf->base = (u64 *) n - (u64 *) 
trans->mem; - buf->size = new_size; - - void *p = btree_trans_subbuf_top(trans, buf); - buf->u64s = new_top; - return p; -} - -int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos end) -{ - bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter); - int ret = bkey_err(k); - if (ret) - goto err; - - bch2_btree_iter_advance(trans, iter); - k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) - goto err; - - BUG_ON(k.k->type != KEY_TYPE_deleted); - - if (bkey_gt(k.k->p, end)) { - ret = bch_err_throw(trans->c, ENOSPC_btree_slot); - goto err; - } - - return 0; -err: - bch2_trans_iter_exit(trans, iter); - return ret; -} - -void bch2_trans_commit_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) -{ - h->next = trans->hooks; - trans->hooks = h; -} - -int bch2_btree_insert_nonextent(struct btree_trans *trans, - enum btree_id btree, struct bkey_i *k, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_cached| - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_intent|flags); - int ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/** - * bch2_btree_insert - insert keys into the extent btree - * @c: pointer to struct bch_fs - * @id: btree to insert into - * @k: key to insert - * @disk_res: must be non-NULL whenever inserting or potentially - * splitting data extents - * @flags: transaction commit flags - * @iter_flags: btree iter update trigger flags - * - * Returns: 0 on success, error code on failure - */ -int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, - struct disk_reservation *disk_res, int flags, - enum btree_iter_update_trigger_flags iter_flags) -{ - return bch2_trans_commit_do(c, disk_res, NULL, flags, - bch2_btree_insert_trans(trans, id, k, iter_flags)); -} - -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) -{ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); - int ret = PTR_ERR_OR_ZERO(k); - if (ret) - return ret; - - bkey_init(&k->k); - k->k.p = iter->pos; - return bch2_trans_update(trans, iter, k, update_flags); -} - -int bch2_btree_delete(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - unsigned update_flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, pos, - BTREE_ITER_cached| - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, update_flags); - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - u32 restart_count = trans->restart_count; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, id, start, 
BTREE_ITER_intent); - while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(trans->c, 0); - struct bkey_i delete; - - ret = bkey_err(k); - if (ret) - goto err; - - bkey_init(&delete.k); - - /* - * This could probably be more efficient for extents: - */ - - /* - * For extents, iter.pos won't necessarily be the same as - * bkey_start_pos(k.k) (for non extents they always will be the - * same). It's important that we delete starting from iter.pos - * because the range we want to delete could start in the middle - * of k. - * - * (bch2_btree_iter_peek() does guarantee that iter.pos >= - * bkey_start_pos(k.k)). - */ - delete.k.p = iter.pos; - - if (iter.flags & BTREE_ITER_is_extents) - bch2_key_resize(&delete.k, - bpos_min(end, k.k->p).offset - - iter.pos.offset); - - ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: - bch2_trans_commit(trans, &disk_res, journal_seq, - BCH_TRANS_COMMIT_no_enospc); - bch2_disk_reservation_put(trans->c, &disk_res); -err: - /* - * the bch2_trans_begin() call is in a weird place because we - * need to call it after every transaction commit, to avoid path - * overflow, but don't want to call it if the delete operation - * is a no-op and we have no work to do: - */ - bch2_trans_begin(trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret ?: trans_was_restarted(trans, restart_count); -} - -/* - * bch2_btree_delete_range - delete everything within a given range - * - * Range is a half-open interval - [start, end) - */ -int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - int ret = bch2_trans_run(c, - bch2_btree_delete_range_trans(trans, id, start, end, - update_flags, journal_seq)); - if (ret == -BCH_ERR_transaction_restart_nested) - ret = 0; - return ret; -} - -int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set) -{ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); - int ret = PTR_ERR_OR_ZERO(k); - if (ret) - return ret; - - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = iter->pos; - if (iter->flags & BTREE_ITER_is_extents) - bch2_key_resize(&k->k, 1); - - return bch2_trans_update(trans, iter, k, 0); -} - -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) -{ - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - - int ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_bit_mod_iter(trans, &iter, set); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) -{ - struct bkey_i k; - - bkey_init(&k.k); - k.k.type = set ?
KEY_TYPE_set : KEY_TYPE_deleted; - k.k.p = pos; - - return bch2_trans_update_buffered(trans, btree, &k); -} - -static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len) -{ - unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); - - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); - journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0); - return 0; -} - -int bch2_trans_log_str(struct btree_trans *trans, const char *str) -{ - return __bch2_trans_log_str(trans, str, strlen(str)); -} - -int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) -{ - int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - return ret; - - return __bch2_trans_log_str(trans, buf->buf, buf->pos); -} - -int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree, - unsigned level, struct bkey_i *k) -{ - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s); - bkey_copy(e->start, k); - return 0; -} - -__printf(3, 0) -static int -__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, - va_list args) -{ - struct printbuf buf = PRINTBUF; - prt_vprintf(&buf, fmt, args); - - unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - - int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - goto err; - - if (!test_bit(JOURNAL_running, &c->journal.flags)) { - ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); - if (ret) - goto err; - - struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); - journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); - c->journal.early_journal_entries.nr += jset_u64s(u64s); - } else { - ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, - bch2_trans_log_msg(trans, &buf)); - } -err: - printbuf_exit(&buf); - return ret; -} - -__printf(2, 3) -int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_fs_log_msg(c, 0, fmt, args); - va_end(args); - return ret; -} - -/* - * Use for logging messages during recovery to enable reserved space and avoid - * blocking. - */ -__printf(2, 3) -int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
-{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); - va_end(args); - return ret; -} diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h deleted file mode 100644 index 0b98ab959719ac..00000000000000 --- a/fs/bcachefs/btree_update.h +++ /dev/null @@ -1,429 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_UPDATE_H -#define _BCACHEFS_BTREE_UPDATE_H - -#include "btree_iter.h" -#include "journal.h" -#include "snapshot.h" - -struct bch_fs; -struct btree; - -void bch2_btree_node_prep_for_write(struct btree_trans *, - struct btree_path *, struct btree *); -bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, - struct btree *, struct btree_node_iter *, - struct bkey_i *); - -int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64); -int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64); -void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); - -void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, - struct bkey_i *, u64); - -#define BCH_TRANS_COMMIT_FLAGS() \ - x(no_enospc, "don't check for enospc") \ - x(no_check_rw, "don't attempt to take a ref on c->writes") \ - x(no_journal_res, "don't take a journal reservation, instead " \ - "pin journal entry referred to by trans->journal_res.seq") \ - x(journal_reclaim, "operation required for journal reclaim; may return error " \ - "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ - x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied") - -enum __bch_trans_commit_flags { - /* First bits for bch_watermark: */ - __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS, -#define x(n, ...) __BCH_TRANS_COMMIT_##n, - BCH_TRANS_COMMIT_FLAGS() -#undef x -}; - -enum bch_trans_commit_flags { -#define x(n, ...)
BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n), - BCH_TRANS_COMMIT_FLAGS() -#undef x -}; - -void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); - -int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); - -int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, - struct bkey_i *, enum btree_iter_update_trigger_flags); - -int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, - enum btree_iter_update_trigger_flags); -int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct - disk_reservation *, int flags, enum - btree_iter_update_trigger_flags iter_flags); - -int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, - struct bpos, struct bpos, unsigned, u64 *); -int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, unsigned, u64 *); - -int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool); -int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); -int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); - -static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, - enum btree_id btree, struct bpos pos) -{ - return bch2_btree_bit_mod_buffered(trans, btree, pos, false); -} - -int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, - struct bpos, snapshot_id_list *); - -/* - * For use when splitting extents in existing snapshots: - * - * If @old_pos is an interior snapshot node, iterate over descendent snapshot - * nodes: for every descendent snapshot in which @old_pos is overwritten and - * not visible, emit a whiteout at @new_pos. - */ -static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id btree, - struct bpos old_pos, - struct bpos new_pos) -{ - BUG_ON(old_pos.snapshot != new_pos.snapshot); - - if (!btree_type_has_snapshots(btree) || - bkey_eq(old_pos, new_pos)) - return 0; - - snapshot_id_list s; - int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s); - if (ret) - return ret; - - return s.nr - ?
__bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s) - : 0; -} - -int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, - enum btree_iter_update_trigger_flags, - struct bkey_s_c, struct bkey_s_c); - -int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos); - -int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_iter_update_trigger_flags, - unsigned long); - -static inline int __must_check -bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags) -{ - return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_); -} - -static inline void *btree_trans_subbuf_base(struct btree_trans *trans, - struct btree_trans_subbuf *buf) -{ - return (u64 *) trans->mem + buf->base; -} - -static inline void *btree_trans_subbuf_top(struct btree_trans *trans, - struct btree_trans_subbuf *buf) -{ - return (u64 *) trans->mem + buf->base + buf->u64s; -} - -void *__bch2_trans_subbuf_alloc(struct btree_trans *, - struct btree_trans_subbuf *, - unsigned); - -static inline void * -bch2_trans_subbuf_alloc(struct btree_trans *trans, - struct btree_trans_subbuf *buf, - unsigned u64s) -{ - if (buf->u64s + u64s > buf->size) - return __bch2_trans_subbuf_alloc(trans, buf, u64s); - - void *p = btree_trans_subbuf_top(trans, buf); - buf->u64s += u64s; - return p; -} - -static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans) -{ - return btree_trans_subbuf_base(trans, &trans->journal_entries); -} - -static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) -{ - return btree_trans_subbuf_top(trans, &trans->journal_entries); -} - -static inline struct jset_entry * -bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) -{ - return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s); -} - -int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); - -int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *); - -static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) -{ - kmsan_check_memory(k, bkey_bytes(&k->k)); - - EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - - if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k); - dump_stack(); - return ret; - } - /* - * Most updates skip the btree write buffer until journal replay is - * finished because synchronization with journal replay relies on having - * a btree node locked - if we're overwriting a key in the journal that - * journal replay hasn't yet replayed, we have to mark it as - * overwritten. 
- * - * But accounting updates don't overwrite, they're deltas, and they have - * to be flushed to the btree strictly in order for journal replay to be - * able to tell which updates need to be applied: - */ - if (k->k.type != KEY_TYPE_accounting && - unlikely(trans->journal_replay_not_finished)) - return bch2_btree_insert_clone_trans(trans, btree, k); - - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s); - bkey_copy(e->start, k); - return 0; -} - -void bch2_trans_commit_hook(struct btree_trans *, - struct btree_trans_commit_hook *); -int __bch2_trans_commit(struct btree_trans *, unsigned); - -int bch2_trans_log_str(struct btree_trans *, const char *); -int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); -int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *); - -__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); -__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); - -/** - * bch2_trans_commit - insert keys at given iterator positions - * - * This is the main entry point for btree updates. - * - * Return values: - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -static inline int bch2_trans_commit(struct btree_trans *trans, - struct disk_reservation *disk_res, - u64 *journal_seq, - unsigned flags) -{ - trans->disk_res = disk_res; - trans->journal_seq = journal_seq; - - return __bch2_trans_commit(trans, flags); -} - -#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ - lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_flags))) - -#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ - nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_flags))) - -#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \ - bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) - -#define trans_for_each_update(_trans, _i) \ - for (struct btree_insert_entry *_i = (_trans)->updates; \ - (_i) < (_trans)->updates + (_trans)->nr_updates; \ - (_i)++) - -static inline void bch2_trans_reset_updates(struct btree_trans *trans) -{ - trans_for_each_update(trans, i) - bch2_path_put(trans, i->path, true); - - trans->nr_updates = 0; - trans->journal_entries.u64s = 0; - trans->journal_entries.size = 0; - trans->accounting.u64s = 0; - trans->accounting.size = 0; - trans->hooks = NULL; - trans->extra_disk_res = 0; -} - -static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, - unsigned type, unsigned min_bytes) -{ - unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); - struct bkey_i *mut; - - if (type && k.k->type != type) - return ERR_PTR(-ENOENT); - - /* extra padding for varint_decode_fast...
*/ - mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8); - if (!IS_ERR(mut)) { - bkey_reassemble(mut, k); - - if (unlikely(bytes > bkey_bytes(k.k))) { - memset((void *) mut + bkey_bytes(k.k), 0, - bytes - bkey_bytes(k.k)); - mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64)); - } - } - return mut; -} - -static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) -{ - return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); -} - -#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \ - bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \ - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - -static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned min_bytes) -{ - struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); - int ret; - - if (IS_ERR(mut)) - return mut; - - ret = bch2_trans_update(trans, iter, mut, flags); - if (ret) - return ERR_PTR(ret); - - *k = bkey_i_to_s_c(mut); - return mut; -} - -static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c *k, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); -} - -#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \ - bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\ - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - -static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned min_bytes) -{ - struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, - btree_id, pos, flags|BTREE_ITER_intent, type); - struct bkey_i *ret = IS_ERR(k.k) - ? 
ERR_CAST(k.k) - : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); - if (IS_ERR(ret)) - bch2_trans_iter_exit(trans, iter); - return ret; -} - -static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); -} - -static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned min_bytes) -{ - struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, - btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); - int ret; - - if (IS_ERR(mut)) - return mut; - - ret = bch2_trans_update(trans, iter, mut, flags); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret); - } - - return mut; -} - -static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags, - unsigned min_bytes) -{ - return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); -} - -static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); -} - -#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ - bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \ - _btree_id, _pos, _flags, \ - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - -static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned val_size) -{ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); - int ret; - - if (IS_ERR(k)) - return k; - - bkey_init(&k->k); - k->k.p = iter->pos; - k->k.type = type; - set_bkey_val_bytes(&k->k, val_size); - - ret = bch2_trans_update(trans, iter, k, flags); - if (unlikely(ret)) - return ERR_PTR(ret); - return k; -} - -#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \ - bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \ - KEY_TYPE_##_type, sizeof(struct bch_##_type))) - -#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c deleted file mode 100644 index 553059b33bfd62..00000000000000 --- a/fs/bcachefs/btree_update_interior.c +++ /dev/null @@ -1,2854 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_journal_iter.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "buckets.h" -#include "clock.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-members.h" -#include "super-io.h" -#include "trace.h" - -#include - -static const char * const bch2_btree_update_modes[] = { -#define x(t) #t, - BTREE_UPDATE_MODES() -#undef x - NULL -}; - 
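The deleted files generate enums and matching string tables from single x-macro lists (BTREE_UPDATE_MODES() just above, BCH_TRANS_COMMIT_FLAGS() and BTREE_FLAGS() earlier), so the two can never drift apart. A minimal standalone sketch of the same technique, with hypothetical names, not bcachefs code:

#include <stdio.h>

#define EXAMPLE_MODES()	\
	x(none)		\
	x(split)	\
	x(update)	\
	x(rewrite)

enum example_mode {
#define x(t)	MODE_##t,
	EXAMPLE_MODES()
#undef x
	MODE_NR,
};

static const char * const example_mode_strs[] = {
#define x(t)	#t,
	EXAMPLE_MODES()
#undef x
	NULL
};

int main(void)
{
	/* The enum and the string table stay in sync by construction. */
	for (unsigned i = 0; i < MODE_NR; i++)
		printf("%u: %s\n", i, example_mode_strs[i]);
	return 0;
}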
-static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *); - -static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, struct keylist *); -static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); - -/* - * Verify that child nodes correctly span parent node's range: - */ -int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) -{ - struct bch_fs *c = trans->c; - struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 - ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key - : b->data->min_key; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - struct bkey_buf prev; - int ret = 0; - - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, - b->data->min_key)); - - bch2_bkey_buf_init(&prev); - bkey_init(&prev.k->k); - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - - if (b == btree_node_root(c, b)) { - if (!bpos_eq(b->data->min_key, POS_MIN)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "btree root with incorrect min_key: "); - bch2_bpos_to_text(&buf, b->data->min_key); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_root_bad_min_key, &buf); - goto err; - } - - if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "btree root with incorrect max_key: "); - bch2_bpos_to_text(&buf, b->data->max_key); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_root_bad_max_key, &buf); - goto err; - } - } - - if (!b->c.level) - goto out; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - if (k.k->type != KEY_TYPE_btree_ptr_v2) - goto out; - - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - - struct bpos expected_min = bkey_deleted(&prev.k->k) - ? 
node_min - : bpos_successor(prev.k->k.p); - - if (!bpos_eq(expected_min, bp.v->min_key)) { - prt_str(&buf, "end of prev node doesn't match start of next node"); - prt_str(&buf, "\nprev "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - prt_str(&buf, "\nnext "); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf); - goto err; - } - - bch2_bkey_buf_reassemble(&prev, c, k); - bch2_btree_and_journal_iter_advance(&iter); - } - - if (bkey_deleted(&prev.k->k)) { - prt_printf(&buf, "empty interior node\n"); - bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf); - goto err; - } - - if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - prt_str(&buf, "last child node doesn't end at end of parent node\nchild: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf); - goto err; - } -out: - bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev, c); - printbuf_exit(&buf); - return ret; -err: - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_newline(&buf); - - ret = __bch2_topology_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - BUG_ON(!ret); - goto out; -} - -/* Calculate ideal packed bkey format for new btree nodes: */ - -static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) -{ - struct bkey_packed *k; - struct bkey uk; - - for_each_bset(b, t) - bset_tree_for_each_key(b, t, k) - if (!bkey_deleted(k)) { - uk = bkey_unpack_key(b, k); - bch2_bkey_format_add_key(s, &uk); - } -} - -static struct bkey_format bch2_btree_calc_format(struct btree *b) -{ - struct bkey_format_state s; - - bch2_bkey_format_init(&s); - bch2_bkey_format_add_pos(&s, b->data->min_key); - bch2_bkey_format_add_pos(&s, b->data->max_key); - __bch2_btree_calc_format(&s, b); - - return bch2_bkey_format_done(&s); -} - -static size_t btree_node_u64s_with_format(struct btree_nr_keys nr, - struct bkey_format *old_f, - struct bkey_format *new_f) -{ - /* stupid integer promotion rules */ - ssize_t delta = - (((int) new_f->key_u64s - old_f->key_u64s) * - (int) nr.packed_keys) + - (((int) new_f->key_u64s - BKEY_U64s) * - (int) nr.unpacked_keys); - - BUG_ON(delta + nr.live_u64s < 0); - - return nr.live_u64s + delta; -} - -/** - * bch2_btree_node_format_fits - check if we could rewrite node with a new format - * - * @c: filesystem handle - * @b: btree node to rewrite - * @nr: number of keys for new node (i.e. b->nr) - * @new_f: bkey format to translate keys to - * - * Returns: true if all re-packed keys will be able to fit in a new node. - * - * Assumes all keys will successfully pack with the new format. 
- */ -static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, - struct btree_nr_keys nr, - struct bkey_format *new_f) -{ - size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); - - return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b); -} - -/* Btree node freeing/allocation: */ - -static void __btree_node_free(struct btree_trans *trans, struct btree *b) -{ - struct bch_fs *c = trans->c; - - trace_and_count(c, btree_node_free, trans, b); - - BUG_ON(btree_node_write_blocked(b)); - BUG_ON(btree_node_dirty(b)); - BUG_ON(btree_node_need_write(b)); - BUG_ON(b == btree_node_root(c, b)); - BUG_ON(b->ob.nr); - BUG_ON(!list_empty(&b->write_blocked)); - BUG_ON(b->will_make_reachable); - - clear_btree_node_noevict(b); -} - -static void bch2_btree_node_free_inmem(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - struct bch_fs *c = trans->c; - - bch2_btree_node_lock_write_nofail(trans, path, &b->c); - - __btree_node_free(trans, b); - - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&b->c.lock); - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - - bch2_trans_node_drop(trans, b); -} - -static void bch2_btree_node_free_never_used(struct btree_update *as, - struct btree_trans *trans, - struct btree *b) -{ - struct bch_fs *c = as->c; - struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; - - BUG_ON(!list_empty(&b->write_blocked)); - BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); - - b->will_make_reachable = 0; - closure_put(&as->cl); - - clear_btree_node_will_make_reachable(b); - clear_btree_node_accessed(b); - clear_btree_node_dirty_acct(c, b); - clear_btree_node_need_write(b); - - mutex_lock(&c->btree_cache.lock); - __bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - BUG_ON(p->nr >= ARRAY_SIZE(p->b)); - p->b[p->nr++] = b; - - six_unlock_intent(&b->c.lock); - - bch2_trans_node_drop(trans, b); -} - -static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, - struct disk_reservation *res, - struct closure *cl, - bool interior_node, - unsigned target, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct write_point *wp; - struct btree *b; - BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct open_buckets obs = { .nr = 0 }; - struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim - ? BTREE_NODE_RESERVE - : 0; - int ret; - - b = bch2_btree_node_mem_alloc(trans, interior_node); - if (IS_ERR(b)) - return b; - - BUG_ON(b->ob.nr); - - mutex_lock(&c->btree_reserve_cache_lock); - if (c->btree_reserve_cache_nr > nr_reserve) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - obs = a->ob; - bkey_copy(&tmp.k, &a->k); - mutex_unlock(&c->btree_reserve_cache_lock); - goto out; - } - mutex_unlock(&c->btree_reserve_cache_lock); -retry: - ret = bch2_alloc_sectors_start_trans(trans, - target ?: - c->opts.metadata_target ?: - c->opts.foreground_target, - 0, - writepoint_ptr(&c->btree_write_point), - &devs_have, - res->nr_replicas, - min(res->nr_replicas, - c->opts.metadata_replicas_required), - watermark, - target ? 
BCH_WRITE_only_specified_devs : 0, - cl, &wp); - if (unlikely(ret)) - goto err; - - if (wp->sectors_free < btree_sectors(c)) { - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ob->sectors_free < btree_sectors(c)) - ob->sectors_free = 0; - - bch2_alloc_sectors_done(c, wp); - goto retry; - } - - bkey_btree_ptr_v2_init(&tmp.k); - bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); - - bch2_open_bucket_get(c, wp, &obs); - bch2_alloc_sectors_done(c, wp); -out: - bkey_copy(&b->key, &tmp.k); - b->ob = obs; - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - return b; -err: - bch2_btree_node_to_freelist(c, b); - return ERR_PTR(ret); -} - -static struct btree *bch2_btree_node_alloc(struct btree_update *as, - struct btree_trans *trans, - unsigned level) -{ - struct bch_fs *c = as->c; - struct btree *b; - struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; - int ret; - - BUG_ON(level >= BTREE_MAX_DEPTH); - BUG_ON(!p->nr); - - b = p->b[--p->nr]; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - - set_btree_node_accessed(b); - set_btree_node_dirty_acct(c, b); - set_btree_node_need_write(b); - - bch2_bset_init_first(b, &b->data->keys); - b->c.level = level; - b->c.btree_id = as->btree_id; - b->version_ondisk = c->sb.version; - - memset(&b->nr, 0, sizeof(b->nr)); - b->data->magic = cpu_to_le64(bset_magic(c)); - memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); - b->data->flags = 0; - SET_BTREE_NODE_ID(b->data, as->btree_id); - SET_BTREE_NODE_LEVEL(b->data, level); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); - - bp->v.mem_ptr = 0; - bp->v.seq = b->data->keys.seq; - bp->v.sectors_written = 0; - } - - SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); - - bch2_btree_build_aux_trees(b); - - ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); - BUG_ON(ret); - - trace_and_count(c, btree_node_alloc, trans, b); - bch2_increment_clock(c, btree_sectors(c), WRITE); - return b; -} - -static void btree_set_min(struct btree *b, struct bpos pos) -{ - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; - b->data->min_key = pos; -} - -static void btree_set_max(struct btree *b, struct bpos pos) -{ - b->key.k.p = pos; - b->data->max_key = pos; -} - -static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, - struct btree_trans *trans, - struct btree *b) -{ - struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level); - struct bkey_format format = bch2_btree_calc_format(b); - - /* - * The keys might expand with the new format - if they wouldn't fit in - * the btree node anymore, use the old format for now: - */ - if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format)) - format = b->format; - - SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); - - btree_set_min(n, b->data->min_key); - btree_set_max(n, b->data->max_key); - - n->data->format = format; - btree_node_set_format(n, format); - - bch2_btree_sort_into(as->c, n, b); - - btree_node_reset_sib_u64s(n); - return n; -} - -static struct btree *__btree_root_alloc(struct btree_update *as, - struct btree_trans *trans, unsigned level) -{ - struct btree *b = bch2_btree_node_alloc(as, trans, level); - - btree_set_min(b, POS_MIN); - btree_set_max(b, SPOS_MAX); - b->data->format = bch2_btree_calc_format(b); - - btree_node_set_format(b, 
b->data->format); - bch2_btree_build_aux_trees(b); - - return b; -} - -static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans) -{ - struct bch_fs *c = as->c; - struct prealloc_nodes *p; - - for (p = as->prealloc_nodes; - p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); - p++) { - while (p->nr) { - struct btree *b = p->b[--p->nr]; - - mutex_lock(&c->btree_reserve_cache_lock); - - if (c->btree_reserve_cache_nr < - ARRAY_SIZE(c->btree_reserve_cache)) { - struct btree_alloc *a = - &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; - - a->ob = b->ob; - b->ob.nr = 0; - bkey_copy(&a->k, &b->key); - } else { - bch2_open_buckets_put(c, &b->ob); - } - - mutex_unlock(&c->btree_reserve_cache_lock); - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - __btree_node_free(trans, b); - bch2_btree_node_to_freelist(c, b); - } - } -} - -static int bch2_btree_reserve_get(struct btree_trans *trans, - struct btree_update *as, - unsigned nr_nodes[2], - unsigned target, - unsigned flags, - struct closure *cl) -{ - struct btree *b; - unsigned interior; - int ret = 0; - - BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); - - /* - * Protects reaping from the btree node cache and using the btree node - * open bucket reserve: - */ - ret = bch2_btree_cache_cannibalize_lock(trans, cl); - if (ret) - return ret; - - for (interior = 0; interior < 2; interior++) { - struct prealloc_nodes *p = as->prealloc_nodes + interior; - - while (p->nr < nr_nodes[interior]) { - b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, - interior, target, flags); - if (IS_ERR(b)) { - ret = PTR_ERR(b); - goto err; - } - - p->b[p->nr++] = b; - } - } -err: - bch2_btree_cache_cannibalize_unlock(trans); - return ret; -} - -/* Asynchronous interior node update machinery */ - -static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans) -{ - struct bch_fs *c = as->c; - - if (as->took_gc_lock) - up_read(&c->gc_lock); - as->took_gc_lock = false; - - bch2_journal_pin_drop(&c->journal, &as->journal); - bch2_journal_pin_flush(&c->journal, &as->journal); - bch2_disk_reservation_put(c, &as->disk_res); - bch2_btree_reserve_put(as, trans); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], - as->start_time); - - mutex_lock(&c->btree_interior_update_lock); - list_del(&as->unwritten_list); - list_del(&as->list); - - closure_debug_destroy(&as->cl); - mempool_free(as, &c->btree_interior_update_pool); - - /* - * Have to do the wakeup with btree_interior_update_lock still held, - * since being on btree_interior_update_list is our ref on @c: - */ - closure_wake_up(&c->btree_interior_update_wait); - - mutex_unlock(&c->btree_interior_update_lock); -} - -static void btree_update_add_key(struct btree_update *as, - struct keylist *keys, struct btree *b) -{ - struct bkey_i *k = &b->key; - - BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > - ARRAY_SIZE(as->_old_keys)); - - bkey_copy(keys->top, k); - bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; - - bch2_keylist_push(keys); -} - -static bool btree_update_new_nodes_marked_sb(struct btree_update *as) -{ - for_each_keylist_key(&as->new_keys, k) - if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k))) - return false; - return true; -} - -static void btree_update_new_nodes_mark_sb(struct btree_update *as) -{ - struct bch_fs *c = as->c; - - mutex_lock(&c->sb_lock); - for_each_keylist_key(&as->new_keys, k) - bch2_dev_btree_bitmap_mark(c, 
bkey_i_to_s_c(k)); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); -} - -/* - * The transactional part of an interior btree node update, where we journal the - * update we did to the interior node and update alloc info: - */ -static int btree_update_nodes_written_trans(struct btree_trans *trans, - struct btree_update *as) -{ - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64)); - - trans->journal_pin = &as->journal; - - for_each_keylist_key(&as->old_keys, k) { - unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - - ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), - BTREE_TRIGGER_transactional); - if (ret) - return ret; - } - - for_each_keylist_key(&as->new_keys, k) { - unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - - ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), - BTREE_TRIGGER_transactional); - if (ret) - return ret; - } - - return 0; -} - -/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ -static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) -{ - struct btree_node *b_data = READ_ONCE(b->data); - - return (b_data ? b_data->keys.seq : 0) == seq; -} - -static void btree_update_nodes_written(struct btree_update *as) -{ - struct bch_fs *c = as->c; - struct btree *b; - struct btree_trans *trans = bch2_trans_get(c); - u64 journal_seq = 0; - unsigned i; - int ret; - - /* - * If we're already in an error state, it might be because a btree node - * was never written, and we might be trying to free that same btree - * node here, but it won't have been marked as allocated and we'll see - * spurious disk usage inconsistencies in the transactional part below - * if we don't skip it: - */ - ret = bch2_journal_error(&c->journal); - if (ret) - goto err; - - if (!btree_update_new_nodes_marked_sb(as)) - btree_update_new_nodes_mark_sb(as); - - /* - * Wait for any in flight writes to finish before we free the old nodes - * on disk. But we haven't pinned those old nodes in the btree cache, - * they might have already been evicted. - * - * The update we're completing deleted references to those nodes from the - * btree, so we know if they've been evicted they can't be pulled back in. - * We just have to check if the nodes we have pointers to are still those - * old nodes, and haven't been reused. - * - * This can't be done locklessly because the data buffer might have been - * vmalloc allocated, and they're not RCU freed. We also need the - * __no_kmsan_checks annotation because even with the btree node read - * lock, nothing tells us that the data buffer has been initialized (if - * the btree node has been reused for a different node, and the data - * buffer swapped for a new data buffer). - */ - for (i = 0; i < as->nr_old_nodes; i++) { - b = as->old_nodes[i]; - - bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]); - six_unlock_read(&b->c.lock); - bch2_trans_unlock_long(trans); - - if (seq_matches) - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, - TASK_UNINTERRUPTIBLE); - } - - /* - * We did an update to a parent node where the pointers we added pointed - * to child nodes that weren't written yet: now, the child nodes have - * been written so we can write out the update to the interior node. 
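The reuse check above works because a btree node's data buffer carries a sequence number fixed when the node is created: if the buffer has been recycled for a different node, the stored seq can no longer match, so even a read of reused memory fails safe. A minimal standalone sketch of that idea, using hypothetical cache_node/node_data types rather than the bcachefs structs:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct node_data {
	uint64_t seq;	/* assigned once, when the buffer is first used */
};

/* Hypothetical cached object whose data buffer may be swapped or recycled. */
struct cache_node {
	_Atomic(struct node_data *) data;
};

/*
 * True only if @n still carries the generation we remembered. A stale or
 * recycled buffer simply fails the comparison, mirroring
 * btree_node_seq_matches() above.
 */
static bool node_seq_matches(struct cache_node *n, uint64_t seq)
{
	struct node_data *d = atomic_load_explicit(&n->data, memory_order_relaxed);

	return (d ? d->seq : 0) == seq;
}
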
- */ - - /* - * We can't call into journal reclaim here: we'd block on the journal - * reclaim lock, but we may need to release the open buckets we have - * pinned in order for other btree updates to make forward progress, and - * journal reclaim does btree updates when flushing bkey_cached entries, - * which may require allocations as well. - */ - ret = commit_do(trans, &as->disk_res, &journal_seq, - BCH_WATERMARK_interior_updates| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_journal_reclaim, - btree_update_nodes_written_trans(trans, as)); - bch2_trans_unlock(trans); - - bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, - "%s", bch2_err_str(ret)); -err: - /* - * Ensure transaction is unlocked before using btree_node_lock_nopath() - * (the use of which is always suspect, we need to work on removing this - * in the future) - * - * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() - * calls bch2_path_upgrade(), before we call path_make_mut(), so we may - * rarely end up with a locked path besides the one we have here: - */ - bch2_trans_unlock(trans); - bch2_trans_begin(trans); - - /* - * We have to be careful because another thread might be getting ready - * to free as->b and calling btree_update_reparent() on us - we'll - * recheck under btree_update_lock below: - */ - b = READ_ONCE(as->b); - if (b) { - /* - * @b is the node we did the final insert into: - * - * On failure to get a journal reservation, we still have to - * unblock the write and allow most of the write path to happen - * so that shutdown works, but the i->journal_seq mechanism - * won't work to prevent the btree write from being visible (we - * didn't get a journal sequence number) - instead - * __bch2_btree_node_write() doesn't do the actual write if - * we're in journal error state: - */ - - btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans, - as->btree_id, b->c.level, b->key.k.p); - struct btree_path *path = trans->paths + path_idx; - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); - path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); - path->l[b->c.level].b = b; - - bch2_btree_node_lock_write_nofail(trans, path, &b->c); - - mutex_lock(&c->btree_interior_update_lock); - - list_del(&as->write_blocked_list); - if (list_empty(&b->write_blocked)) - clear_btree_node_write_blocked(b); - - /* - * Node might have been freed, recheck under - * btree_interior_update_lock: - */ - if (as->b == b) { - BUG_ON(!b->c.level); - BUG_ON(!btree_node_dirty(b)); - - if (!ret) { - struct bset *last = btree_bset_last(b); - - last->journal_seq = cpu_to_le64( - max(journal_seq, - le64_to_cpu(last->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); - } else { - /* - * If we didn't get a journal sequence number we - * can't write this btree node, because recovery - * won't know to ignore this write: - */ - set_btree_node_never_write(b); - } - } - - mutex_unlock(&c->btree_interior_update_lock); - - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - six_unlock_write(&b->c.lock); - - btree_node_write_if_need(trans, b, SIX_LOCK_intent); - btree_node_unlock(trans, path, b->c.level); - bch2_path_put(trans, path_idx, true); - } - - bch2_journal_pin_drop(&c->journal, &as->journal); - - mutex_lock(&c->btree_interior_update_lock); - for (i = 0; i < as->nr_new_nodes; i++) { - b = as->new_nodes[i]; - - BUG_ON(b->will_make_reachable != (unsigned 
long) as); - b->will_make_reachable = 0; - clear_btree_node_will_make_reachable(b); - } - mutex_unlock(&c->btree_interior_update_lock); - - for (i = 0; i < as->nr_new_nodes; i++) { - b = as->new_nodes[i]; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - btree_node_write_if_need(trans, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - } - - for (i = 0; i < as->nr_open_buckets; i++) - bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); - - bch2_btree_update_free(as, trans); - bch2_trans_put(trans); -} - -static void btree_interior_update_work(struct work_struct *work) -{ - struct bch_fs *c = - container_of(work, struct bch_fs, btree_interior_update_work); - struct btree_update *as; - - while (1) { - mutex_lock(&c->btree_interior_update_lock); - as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, - struct btree_update, unwritten_list); - if (as && !as->nodes_written) - as = NULL; - mutex_unlock(&c->btree_interior_update_lock); - - if (!as) - break; - - btree_update_nodes_written(as); - } -} - -static CLOSURE_CALLBACK(btree_update_set_nodes_written) -{ - closure_type(as, struct btree_update, cl); - struct bch_fs *c = as->c; - - mutex_lock(&c->btree_interior_update_lock); - as->nodes_written = true; - mutex_unlock(&c->btree_interior_update_lock); - - queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); -} - -/* - * We're updating @b with pointers to nodes that haven't finished writing yet: - * block @b from being written until @as completes - */ -static void btree_update_updated_node(struct btree_update *as, struct btree *b) -{ - struct bch_fs *c = as->c; - - BUG_ON(as->mode != BTREE_UPDATE_none); - BUG_ON(as->update_level_end < b->c.level); - BUG_ON(!btree_node_dirty(b)); - BUG_ON(!b->c.level); - - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - - as->mode = BTREE_UPDATE_node; - as->b = b; - as->update_level_end = b->c.level; - - set_btree_node_write_blocked(b); - list_add(&as->write_blocked_list, &b->write_blocked); - - mutex_unlock(&c->btree_interior_update_lock); -} - -static int bch2_update_reparent_journal_pin_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - return 0; -} - -static void btree_update_reparent(struct btree_update *as, - struct btree_update *child) -{ - struct bch_fs *c = as->c; - - lockdep_assert_held(&c->btree_interior_update_lock); - - child->b = NULL; - child->mode = BTREE_UPDATE_update; - - bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, - bch2_update_reparent_journal_pin_flush); -} - -static void btree_update_updated_root(struct btree_update *as, struct btree *b) -{ - struct bkey_i *insert = &b->key; - struct bch_fs *c = as->c; - - BUG_ON(as->mode != BTREE_UPDATE_none); - - BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > - ARRAY_SIZE(as->journal_entries)); - - as->journal_u64s += - journal_entry_set((void *) &as->journal_entries[as->journal_u64s], - BCH_JSET_ENTRY_btree_root, - b->c.btree_id, b->c.level, - insert, insert->k.u64s); - - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - - as->mode = BTREE_UPDATE_root; - mutex_unlock(&c->btree_interior_update_lock); -} - -/* - * bch2_btree_update_add_new_node: - * - * This causes @as to wait on @b to be written, before it gets to - * bch2_btree_update_nodes_written - * - * Additionally, it sets b->will_make_reachable to prevent any additional writes - * to @b from 
happening besides the first until @b is reachable on disk
- *
- * And it adds @b to the list of @as's new nodes, so that we can update sector
- * counts in bch2_btree_update_nodes_written:
- */
-static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
-{
-	struct bch_fs *c = as->c;
-
-	closure_get(&as->cl);
-
-	mutex_lock(&c->btree_interior_update_lock);
-	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
-	BUG_ON(b->will_make_reachable);
-
-	as->new_nodes[as->nr_new_nodes++] = b;
-	b->will_make_reachable = 1UL|(unsigned long) as;
-	set_btree_node_will_make_reachable(b);
-
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	btree_update_add_key(as, &as->new_keys, b);
-
-	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-		unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
-		unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
-
-		bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
-			cpu_to_le16(sectors);
-	}
-}
-
-/*
- * Drop @b from the btree_update that was going to make it reachable, if any
- */
-static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
-{
-	struct btree_update *as;
-	unsigned long v;
-	unsigned i;
-
-	mutex_lock(&c->btree_interior_update_lock);
-	/*
-	 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
-	 * dropped when it gets written by bch2_btree_complete_write - the
-	 * xchg() is for synchronization with bch2_btree_complete_write:
-	 */
-	v = xchg(&b->will_make_reachable, 0);
-	clear_btree_node_will_make_reachable(b);
-	as = (struct btree_update *) (v & ~1UL);
-
-	if (!as) {
-		mutex_unlock(&c->btree_interior_update_lock);
-		return;
-	}
-
-	for (i = 0; i < as->nr_new_nodes; i++)
-		if (as->new_nodes[i] == b)
-			goto found;
-
-	BUG();
-found:
-	array_remove_item(as->new_nodes, as->nr_new_nodes, i);
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	if (v & 1)
-		closure_put(&as->cl);
-}
-
-static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
-{
-	while (b->ob.nr)
-		as->open_buckets[as->nr_open_buckets++] =
-			b->ob.v[--b->ob.nr];
-}
-
-static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
-				struct journal_entry_pin *_pin, u64 seq)
-{
-	return 0;
-}
-
-/*
- * @b is being split/rewritten: it may have pointers to not-yet-written btree
- * nodes and thus outstanding btree_updates - redirect @b's
- * btree_updates to point to this btree_update:
- */
-static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
-						      struct btree *b)
-{
-	struct bch_fs *c = as->c;
-	struct btree_update *p, *n;
-	struct btree_write *w;
-
-	set_btree_node_dying(b);
-
-	if (btree_node_fake(b))
-		return;
-
-	mutex_lock(&c->btree_interior_update_lock);
-
-	/*
-	 * Does this node have any btree_update operations preventing
-	 * it from being written?
-	 *
-	 * If so, redirect them to point to this btree_update: we can
-	 * write out our new nodes, but we won't make them visible until those
-	 * operations complete
-	 */
-	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
-		list_del_init(&p->write_blocked_list);
-		btree_update_reparent(as, p);
-
-		/*
-		 * for flush_held_btree_writes() waiting on updates to flush or
-		 * nodes to be writeable:
-		 */
-		closure_wake_up(&c->btree_interior_update_wait);
-	}
-
-	clear_btree_node_dirty_acct(c, b);
-	clear_btree_node_need_write(b);
-	clear_btree_node_write_blocked(b);
-
-	/*
-	 * Does this node have unwritten data that has a pin on the journal?
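The code that follows transfers any such pin to the btree_update and, as the rest of this comment explains, only the oldest pin of the nodes being freed needs to be kept, since journal sequence numbers only grow. A sketch of that oldest-pin rule under the assumption that a pin can be represented by its sequence number (hypothetical update_pin type, not the bch2_journal_pin API):

#include <stdint.h>

struct update_pin {
	uint64_t seq;	/* 0 == nothing pinned yet */
};

/*
 * Adopt a node's pin: across several frees, holding the smallest
 * (oldest) sequence number is enough to protect all of them.
 */
static void update_adopt_pin(struct update_pin *as, uint64_t node_pin_seq)
{
	if (node_pin_seq &&
	    (!as->seq || node_pin_seq < as->seq))
		as->seq = node_pin_seq;
}
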
- * - * If so, transfer that pin to the btree_update operation - - * note that if we're freeing multiple nodes, we only need to keep the - * oldest pin of any of the nodes we're freeing. We'll release the pin - * when the new nodes are persistent and reachable on disk: - */ - w = btree_current_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, - bch2_btree_update_will_free_node_journal_pin_flush); - bch2_journal_pin_drop(&c->journal, &w->journal); - - w = btree_prev_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, - bch2_btree_update_will_free_node_journal_pin_flush); - bch2_journal_pin_drop(&c->journal, &w->journal); - - mutex_unlock(&c->btree_interior_update_lock); - - /* - * Is this a node that isn't reachable on disk yet? - * - * Nodes that aren't reachable yet have writes blocked until they're - * reachable - now that we've cancelled any pending writes and moved - * things waiting on that write to wait on this update, we can drop this - * node from the list of nodes that the other update is making - * reachable, prior to freeing it: - */ - btree_update_drop_new_node(c, b); - - btree_update_add_key(as, &as->old_keys, b); - - as->old_nodes[as->nr_old_nodes] = b; - as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; - as->nr_old_nodes++; -} - -static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans) -{ - struct bch_fs *c = as->c; - u64 start_time = as->start_time; - - BUG_ON(as->mode == BTREE_UPDATE_none); - - if (as->took_gc_lock) - up_read(&as->c->gc_lock); - as->took_gc_lock = false; - - bch2_btree_reserve_put(as, trans); - - continue_at(&as->cl, btree_update_set_nodes_written, - as->c->btree_interior_update_worker); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], - start_time); -} - -static const char * const btree_node_reawrite_reason_strs[] = { -#define x(n) #n, - BTREE_NODE_REWRITE_REASON() -#undef x - NULL, -}; - -static struct btree_update * -bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level_start, bool split, - unsigned target, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_update *as; - u64 start_time = local_clock(); - int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) - ? BCH_DISK_RESERVATION_NOFAIL : 0; - unsigned nr_nodes[2] = { 0, 0 }; - unsigned level_end = level_start; - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - int ret = 0; - u32 restart_count = trans->restart_count; - - BUG_ON(!path->should_be_locked); - - if (watermark == BCH_WATERMARK_copygc) - watermark = BCH_WATERMARK_btree_copygc; - if (watermark < BCH_WATERMARK_btree) - watermark = BCH_WATERMARK_btree; - - flags &= ~BCH_WATERMARK_MASK; - flags |= watermark; - - if (watermark < BCH_WATERMARK_reclaim && - test_bit(JOURNAL_space_low, &c->journal.flags)) { - if (flags & BCH_TRANS_COMMIT_journal_reclaim) - return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - - ret = drop_locks_do(trans, - ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; })); - if (ret) - return ERR_PTR(ret); - } - - while (1) { - nr_nodes[!!level_end] += 1 + split; - level_end++; - - ret = bch2_btree_path_upgrade(trans, path, level_end + 1); - if (ret) - return ERR_PTR(ret); - - if (!btree_path_node(path, level_end)) { - /* Allocating new root? 
*/ - nr_nodes[1] += split; - level_end = BTREE_MAX_DEPTH; - break; - } - - /* - * Always check for space for two keys, even if we won't have to - * split at prior level - it might have been a merge instead: - */ - if (bch2_btree_node_insert_fits(path->l[level_end].b, - BKEY_BTREE_PTR_U64s_MAX * 2)) - break; - - split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); - } - - if (!down_read_trylock(&c->gc_lock)) { - ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); - if (ret) { - up_read(&c->gc_lock); - return ERR_PTR(ret); - } - } - - as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); - memset(as, 0, sizeof(*as)); - closure_init(&as->cl, NULL); - as->c = c; - as->start_time = start_time; - as->ip_started = _RET_IP_; - as->mode = BTREE_UPDATE_none; - as->flags = flags; - as->took_gc_lock = true; - as->btree_id = path->btree_id; - as->update_level_start = level_start; - as->update_level_end = level_end; - INIT_LIST_HEAD(&as->list); - INIT_LIST_HEAD(&as->unwritten_list); - INIT_LIST_HEAD(&as->write_blocked_list); - bch2_keylist_init(&as->old_keys, as->_old_keys); - bch2_keylist_init(&as->new_keys, as->_new_keys); - bch2_keylist_init(&as->parent_keys, as->inline_keys); - - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->list, &c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - - struct btree *b = btree_path_node(path, path->level); - as->node_start = b->data->min_key; - as->node_end = b->data->max_key; - as->node_needed_rewrite = btree_node_rewrite_reason(b); - as->node_written = b->written; - as->node_sectors = btree_buf_bytes(b) >> 9; - as->node_remaining = __bch2_btree_u64s_remaining(b, - btree_bkey_last(b, bset_tree_last(b))); - - /* - * We don't want to allocate if we're in an error state, that can cause - * deadlock on emergency shutdown due to open buckets getting stuck in - * the btree_reserve_cache after allocator shutdown has cleared it out. - * This check needs to come after adding us to the btree_interior_update - * list but before calling bch2_btree_reserve_get, to synchronize with - * __bch2_fs_read_only(). 
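That ordering, publish the update on the global list first and only then check for a journal error, is what lets teardown be sure it sees every in-flight update. A self-contained sketch of the pattern, with hypothetical ctx/op types and POSIX locking standing in for the bcachefs structures:

#include <pthread.h>
#include <stdbool.h>
#include <sys/queue.h>

struct op {
	LIST_ENTRY(op) entry;
};

struct ctx {
	pthread_mutex_t lock;
	LIST_HEAD(, op) ops;	/* teardown walks and waits on this list */
	bool shutting_down;
};

/*
 * Publish the operation before checking for shutdown: a concurrent
 * teardown either already set the flag (and we return an error, so the
 * caller unwinds through its normal free path, which delists @op) or
 * it will see us on the list and wait for us to finish.
 */
static int op_start(struct ctx *c, struct op *op)
{
	pthread_mutex_lock(&c->lock);
	LIST_INSERT_HEAD(&c->ops, op, entry);
	bool dead = c->shutting_down;
	pthread_mutex_unlock(&c->lock);

	return dead ? -1 : 0;
}
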
- */ - ret = bch2_journal_error(&c->journal); - if (ret) - goto err; - - ret = bch2_disk_reservation_get(c, &as->disk_res, - (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), - READ_ONCE(c->opts.metadata_replicas), - disk_res_flags); - if (ret) - goto err; - - ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); - if (bch2_err_matches(ret, ENOSPC) || - bch2_err_matches(ret, ENOMEM)) { - struct closure cl; - - /* - * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK - * flag - */ - if (bch2_err_matches(ret, ENOSPC) && - (flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) { - ret = bch_err_throw(c, journal_reclaim_would_deadlock); - goto err; - } - - closure_init_stack(&cl); - - do { - ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); - if (!bch2_err_matches(ret, BCH_ERR_operation_blocked)) - break; - bch2_trans_unlock(trans); - bch2_wait_on_allocator(c, &cl); - } while (1); - } - - if (ret) { - trace_and_count(c, btree_reserve_get_fail, trans->fn, - _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); - goto err; - } - - ret = bch2_trans_relock(trans); - if (ret) - goto err; - - bch2_trans_verify_not_restarted(trans, restart_count); - return as; -err: - bch2_btree_update_free(as, trans); - if (!bch2_err_matches(ret, ENOSPC) && - !bch2_err_matches(ret, EROFS) && - ret != -BCH_ERR_journal_reclaim_would_deadlock && - ret != -BCH_ERR_journal_shutdown) - bch_err_fn_ratelimited(c, ret); - return ERR_PTR(ret); -} - -/* Btree root updates: */ - -static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) -{ - /* Root nodes cannot be reaped */ - mutex_lock(&c->btree_cache.lock); - list_del_init(&b->list); - mutex_unlock(&c->btree_cache.lock); - - mutex_lock(&c->btree_root_lock); - bch2_btree_id_root(c, b->c.btree_id)->b = b; - mutex_unlock(&c->btree_root_lock); - - bch2_recalc_btree_reserve(c); -} - -static int bch2_btree_set_root(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - bool nofail) -{ - struct bch_fs *c = as->c; - - trace_and_count(c, btree_node_set_root, trans, b); - - struct btree *old = btree_node_root(c, b); - - /* - * Ensure no one is using the old root while we switch to the - * new root: - */ - if (nofail) { - bch2_btree_node_lock_write_nofail(trans, path, &old->c); - } else { - int ret = bch2_btree_node_lock_write(trans, path, &old->c); - if (ret) - return ret; - } - - bch2_btree_set_root_inmem(c, b); - - btree_update_updated_root(as, b); - - /* - * Unlock old root after new root is visible: - * - * The new root isn't persistent, but that's ok: we still have - * an intent lock on the new root, and any updates that would - * depend on the new root would have to update the new root. 
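The same lock ordering in miniature, with POSIX rwlocks standing in for the six locks and hypothetical tree/node types; the caller is assumed to already hold its own reference on the new root:

#include <pthread.h>

struct node { pthread_rwlock_t lock; };
struct tree { struct node *root; };

/*
 * Write-lock the old root so nothing is reading through it, make the
 * new root visible, and only then unlock: a reader of the old root
 * either ran entirely before the swap or blocks until the new root is
 * already installed.
 */
static void set_root(struct tree *t, struct node *new_root)
{
	struct node *old = t->root;

	pthread_rwlock_wrlock(&old->lock);
	t->root = new_root;
	pthread_rwlock_unlock(&old->lock);
}
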
- */ - bch2_btree_node_unlock_write(trans, path, old); - return 0; -} - -/* Interior node updates: */ - -static void bch2_insert_fixup_btree_ptr(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_i *insert) -{ - struct bch_fs *c = as->c; - struct bkey_packed *k; - struct printbuf buf = PRINTBUF; - unsigned long old, new; - - BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && - !btree_ptr_sectors_written(bkey_i_to_s_c(insert))); - - if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) - bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - - struct bkey_validate_context from = (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = BCH_VALIDATE_commit, - }; - if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?: - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) { - bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); - dump_stack(); - } - - BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > - ARRAY_SIZE(as->journal_entries)); - - as->journal_u64s += - journal_entry_set((void *) &as->journal_entries[as->journal_u64s], - BCH_JSET_ENTRY_btree_keys, - b->c.btree_id, b->c.level, - insert, insert->k.u64s); - - while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && - bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) - bch2_btree_node_iter_advance(node_iter, b); - - bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); - set_btree_node_dirty_acct(c, b); - - old = READ_ONCE(b->flags); - do { - new = old; - - new &= ~BTREE_WRITE_TYPE_MASK; - new |= BTREE_WRITE_interior; - new |= 1 << BTREE_NODE_need_write; - } while (!try_cmpxchg(&b->flags, &old, new)); - - printbuf_exit(&buf); -} - -static int -bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter node_iter, - struct keylist *keys) -{ - struct bkey_i *insert = bch2_keylist_front(keys); - struct bkey_packed *k; - - BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); - - while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && - (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) - ; - - for (; - insert != keys->top && bpos_le(insert->k.p, b->key.k.p); - insert = bkey_next(insert)) - bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - - int ret = bch2_btree_node_check_topology(trans, b); - if (ret) { - struct printbuf buf = PRINTBUF; - - for (struct bkey_i *k = keys->keys; - k != insert; - k = bkey_next(k)) { - bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); - prt_newline(&buf); - } - - bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s", - (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf); - dump_stack(); - return ret; - } - - memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); - keys->top_p -= insert->_data - keys->keys_p; - return 0; -} - -static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) -{ - if (insert_keys) - for_each_keylist_key(insert_keys, k) - if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos)) - return true; - return false; -} - -/* - * Move keys from n1 (original replacement node, now lower node) to n2 (higher - * node) - */ -static void __btree_split_node(struct btree_update *as, - struct btree_trans *trans, - struct btree *b, - struct btree *n[2], - struct keylist *insert_keys) -{ - struct 
bkey_packed *k; - struct bpos n1_pos = POS_MIN; - struct btree_node_iter iter; - struct bset *bsets[2]; - struct bkey_format_state format[2]; - struct bkey_packed *out[2]; - struct bkey uk; - unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5; - struct { unsigned nr_keys, val_u64s; } nr_keys[2]; - int i; - - memset(&nr_keys, 0, sizeof(nr_keys)); - - for (i = 0; i < 2; i++) { - BUG_ON(n[i]->nsets != 1); - - bsets[i] = btree_bset_first(n[i]); - out[i] = bsets[i]->start; - - SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1); - bch2_bkey_format_init(&format[i]); - } - - u64s = 0; - for_each_btree_node_key(b, k, &iter) { - if (bkey_deleted(k)) - continue; - - uk = bkey_unpack_key(b, k); - - if (b->c.level && - u64s < n1_u64s && - u64s + k->u64s >= n1_u64s && - (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) || - key_deleted_in_insert(insert_keys, uk.p))) - n1_u64s += k->u64s; - - i = u64s >= n1_u64s; - u64s += k->u64s; - if (!i) - n1_pos = uk.p; - bch2_bkey_format_add_key(&format[i], &uk); - - nr_keys[i].nr_keys++; - nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k); - } - - btree_set_min(n[0], b->data->min_key); - btree_set_max(n[0], n1_pos); - btree_set_min(n[1], bpos_successor(n1_pos)); - btree_set_max(n[1], b->data->max_key); - - for (i = 0; i < 2; i++) { - bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key); - bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key); - - n[i]->data->format = bch2_bkey_format_done(&format[i]); - - unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + - nr_keys[i].val_u64s; - if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b)) - n[i]->data->format = b->format; - - btree_node_set_format(n[i], n[i]->data->format); - } - - u64s = 0; - for_each_btree_node_key(b, k, &iter) { - if (bkey_deleted(k)) - continue; - - i = u64s >= n1_u64s; - u64s += k->u64s; - - if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k) - ? &b->format: &bch2_bkey_format_current, k)) - out[i]->format = KEY_FORMAT_LOCAL_BTREE; - else - bch2_bkey_unpack(b, (void *) out[i], k); - - out[i]->needs_whiteout = false; - - btree_keys_account_key_add(&n[i]->nr, 0, out[i]); - out[i] = bkey_p_next(out[i]); - } - - for (i = 0; i < 2; i++) { - bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data); - - BUG_ON(!bsets[i]->u64s); - - set_btree_bset_end(n[i], n[i]->set); - - btree_node_reset_sib_u64s(n[i]); - - bch2_verify_btree_nr_keys(n[i]); - - BUG_ON(bch2_btree_node_check_topology(trans, n[i])); - } -} - -/* - * For updates to interior nodes, we've got to do the insert before we split - * because the stuff we're inserting has to be inserted atomically. Post split, - * the keys might have to go in different nodes and the split would no longer be - * atomic. 
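The pivot scan in __btree_split_node() above aims roughly 60% of the live u64s at the lower node. A standalone sketch of just that scan, with a hypothetical pkey type; the real code additionally builds packed key formats and steers journal-deleted keys, which is omitted here:

#include <stddef.h>
#include <stdint.h>

struct pkey { uint32_t u64s; };	/* size of one packed key, in u64s */

/*
 * Index of the first key that goes to the higher node, per the
 * n1_u64s = live_u64s * 3 / 5 rule above. E.g. ten 10-u64 keys
 * (live_u64s = 100) give n1_u64s = 60, so keys 0..5 (60 u64s) stay
 * in the lower node and keys 6..9 move to the higher one.
 */
static size_t split_pivot(const struct pkey *k, size_t nr, uint64_t live_u64s)
{
	uint64_t n1_u64s = live_u64s * 3 / 5;
	uint64_t u64s = 0;
	size_t i;

	for (i = 0; i < nr; i++) {
		if (u64s >= n1_u64s)
			break;
		u64s += k[i].u64s;
	}

	return i;
}
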
- * - * Worse, if the insert is from btree node coalescing, if we do the insert after - * we do the split (and pick the pivot) - the pivot we pick might be between - * nodes that were coalesced, and thus in the middle of a child node post - * coalescing: - */ -static int btree_split_insert_keys(struct btree_update *as, - struct btree_trans *trans, - btree_path_idx_t path_idx, - struct btree *b, - struct keylist *keys) -{ - struct btree_path *path = trans->paths + path_idx; - - if (!bch2_keylist_empty(keys) && - bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { - struct btree_node_iter node_iter; - - bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - - int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - if (ret) - return ret; - } - - return 0; -} - -static int btree_split(struct btree_update *as, struct btree_trans *trans, - btree_path_idx_t path, struct btree *b, - struct keylist *keys) -{ - struct bch_fs *c = as->c; - struct btree *parent = btree_node_parent(trans->paths + path, b); - struct btree *n1, *n2 = NULL, *n3 = NULL; - btree_path_idx_t path1 = 0, path2 = 0; - u64 start_time = local_clock(); - int ret = 0; - - bch2_verify_btree_nr_keys(b); - BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); - - ret = bch2_btree_node_check_topology(trans, b); - if (ret) - return ret; - - if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { - struct btree *n[2]; - - trace_and_count(c, btree_node_split, trans, b); - - n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); - n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); - - __btree_split_node(as, trans, b, n, keys); - - if (keys) { - ret = btree_split_insert_keys(as, trans, path, n1, keys) ?: - btree_split_insert_keys(as, trans, path, n2, keys); - if (ret) - goto err; - BUG_ON(!bch2_keylist_empty(keys)); - } - - bch2_btree_build_aux_trees(n2); - bch2_btree_build_aux_trees(n1); - - bch2_btree_update_add_new_node(as, n1); - bch2_btree_update_add_new_node(as, n2); - six_unlock_write(&n2->c.lock); - six_unlock_write(&n1->c.lock); - - path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); - six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + path1, n1); - - path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p); - six_lock_increment(&n2->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + path2, n2); - - /* - * Note that on recursive parent_keys == keys, so we - * can't start adding new keys to parent_keys before emptying it - * out (which we did with btree_split_insert_keys() above) - */ - bch2_keylist_add(&as->parent_keys, &n1->key); - bch2_keylist_add(&as->parent_keys, &n2->key); - - if (!parent) { - /* Depth increases, make a new root */ - n3 = __btree_root_alloc(as, trans, b->c.level + 1); - - bch2_btree_update_add_new_node(as, n3); - six_unlock_write(&n3->c.lock); - - trans->paths[path2].locks_want++; - BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level)); - six_lock_increment(&n3->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + path2, n3); - - 
n3->sib_u64s[0] = U16_MAX; - n3->sib_u64s[1] = U16_MAX; - - ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); - if (ret) - goto err; - } - } else { - trace_and_count(c, btree_node_compact, trans, b); - - n1 = bch2_btree_node_alloc_replacement(as, trans, b); - - if (keys) { - ret = btree_split_insert_keys(as, trans, path, n1, keys); - if (ret) - goto err; - BUG_ON(!bch2_keylist_empty(keys)); - } - - bch2_btree_build_aux_trees(n1); - bch2_btree_update_add_new_node(as, n1); - six_unlock_write(&n1->c.lock); - - path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); - six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + path1, n1); - - if (parent) - bch2_keylist_add(&as->parent_keys, &n1->key); - } - - /* New nodes all written, now make them visible: */ - - if (parent) { - /* Split a non root node */ - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); - } else if (n3) { - ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false); - } else { - /* Root filled up but didn't need to be split */ - ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false); - } - - if (ret) - goto err; - - bch2_btree_interior_update_will_free_node(as, b); - - if (n3) { - bch2_btree_update_get_open_buckets(as, n3); - bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0); - } - if (n2) { - bch2_btree_update_get_open_buckets(as, n2); - bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0); - } - bch2_btree_update_get_open_buckets(as, n1); - bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0); - - /* - * The old node must be freed (in memory) _before_ unlocking the new - * nodes - else another thread could re-acquire a read lock on the old - * node after another thread has locked and updated the new node, thus - * seeing stale data: - */ - bch2_btree_node_free_inmem(trans, trans->paths + path, b); - - if (n3) - bch2_trans_node_add(trans, trans->paths + path, n3); - if (n2) - bch2_trans_node_add(trans, trans->paths + path2, n2); - bch2_trans_node_add(trans, trans->paths + path1, n1); - - if (n3) - six_unlock_intent(&n3->c.lock); - if (n2) - six_unlock_intent(&n2->c.lock); - six_unlock_intent(&n1->c.lock); -out: - if (path2) { - __bch2_btree_path_unlock(trans, trans->paths + path2); - bch2_path_put(trans, path2, true); - } - if (path1) { - __bch2_btree_path_unlock(trans, trans->paths + path1); - bch2_path_put(trans, path1, true); - } - - bch2_trans_verify_locks(trans); - - bch2_time_stats_update(&c->times[n2 - ? BCH_TIME_btree_node_split - : BCH_TIME_btree_node_compact], - start_time); - return ret; -err: - if (n3) - bch2_btree_node_free_never_used(as, trans, n3); - if (n2) - bch2_btree_node_free_never_used(as, trans, n2); - bch2_btree_node_free_never_used(as, trans, n1); - goto out; -} - -/** - * bch2_btree_insert_node - insert bkeys into a given btree node - * - * @as: btree_update object - * @trans: btree_trans object - * @path_idx: path that points to current node - * @b: node to insert keys into - * @keys: list of keys to insert - * - * Returns: 0 on success, typically transaction restart error on failure - * - * Inserts as many keys as it can into a given btree node, splitting it if full. - * If a split occurred, this function will return early. This can only happen - * for leaf nodes -- inserts into interior nodes have to be atomic. 
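Callers absorb that transaction restart error with the usual bcachefs retry shape. A sketch assuming the bcachefs trans/errcode headers, with do_interior_update() as a hypothetical stand-in for the operation being retried:

/* Hypothetical operation being retried; stands in for a split/merge. */
static int do_interior_update(struct btree_trans *trans);

static int run_interior_update(struct btree_trans *trans)
{
	int ret;

	do {
		bch2_trans_begin(trans);
		ret = do_interior_update(trans);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

	return ret;
}
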
- */ -static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, - btree_path_idx_t path_idx, struct btree *b, - struct keylist *keys) -{ - struct bch_fs *c = as->c; - struct btree_path *path = trans->paths + path_idx, *linked; - unsigned i; - int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); - int old_live_u64s = b->nr.live_u64s; - int live_u64s_added, u64s_added; - int ret; - - lockdep_assert_held(&c->gc_lock); - BUG_ON(!b->c.level); - BUG_ON(!as || as->b); - bch2_verify_keylist_sorted(keys); - - if (!btree_node_intent_locked(path, b->c.level)) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "%s(): node not locked at level %u\n", - __func__, b->c.level); - bch2_btree_update_to_text(&buf, as); - bch2_btree_path_to_text(&buf, trans, path_idx); - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return -EIO; - } - - ret = bch2_btree_node_lock_write(trans, path, &b->c); - if (ret) - return ret; - - bch2_btree_node_prep_for_write(trans, path, b); - - if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) { - bch2_btree_node_unlock_write(trans, path, b); - goto split; - } - - - ret = bch2_btree_node_check_topology(trans, b) ?: - bch2_btree_insert_keys_interior(as, trans, path, b, - path->l[b->c.level].iter, keys); - if (ret) { - bch2_btree_node_unlock_write(trans, path, b); - return ret; - } - - trans_for_each_path_with_node(trans, b, linked, i) - bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - - bch2_trans_verify_paths(trans); - - live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; - - if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); - if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); - - if (u64s_added > live_u64s_added && - bch2_maybe_compact_whiteouts(c, b)) - bch2_trans_node_reinit_iter(trans, b); - - btree_update_updated_node(as, b); - bch2_btree_node_unlock_write(trans, path, b); - return 0; -split: - /* - * We could attempt to avoid the transaction restart, by calling - * bch2_btree_path_upgrade() and allocating more nodes: - */ - if (b->c.level >= as->update_level_end) { - trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); - } - - return btree_split(as, trans, path_idx, b, keys); -} - -int bch2_btree_split_leaf(struct btree_trans *trans, - btree_path_idx_t path, - unsigned flags) -{ - /* btree_split & merge may both cause paths array to be reallocated */ - struct btree *b = path_l(trans->paths + path)->b; - struct btree_update *as; - unsigned l; - int ret = 0; - - as = bch2_btree_update_start(trans, trans->paths + path, - trans->paths[path].level, - true, 0, flags); - if (IS_ERR(as)) - return PTR_ERR(as); - - ret = btree_split(as, trans, path, b, NULL); - if (ret) { - bch2_btree_update_free(as, trans); - return ret; - } - - bch2_btree_update_done(as, trans); - - for (l = trans->paths[path].level + 1; - btree_node_intent_locked(&trans->paths[path], l) && !ret; - l++) - ret = bch2_foreground_maybe_merge(trans, path, l, flags); - - return ret; -} - -static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans, - btree_path_idx_t path_idx) -{ - struct bch_fs *c = as->c; - struct btree_path *path = trans->paths + 
path_idx; - struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b; - - BUG_ON(!btree_node_locked(path, b->c.level)); - - n = __btree_root_alloc(as, trans, b->c.level + 1); - - bch2_btree_update_add_new_node(as, n); - six_unlock_write(&n->c.lock); - - path->locks_want++; - BUG_ON(btree_node_locked(path, n->c.level)); - six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path, n); - - n->sib_u64s[0] = U16_MAX; - n->sib_u64s[1] = U16_MAX; - - bch2_keylist_add(&as->parent_keys, &b->key); - btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); - - int ret = bch2_btree_set_root(as, trans, path, n, true); - BUG_ON(ret); - - bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); - bch2_trans_node_add(trans, path, n); - six_unlock_intent(&n->c.lock); - - mutex_lock(&c->btree_cache.lock); - list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list); - mutex_unlock(&c->btree_cache.lock); - - bch2_trans_verify_locks(trans); -} - -int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; - - if (btree_node_fake(b)) - return bch2_btree_split_leaf(trans, path, flags); - - struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, b->c.level, - true, 0, flags); - if (IS_ERR(as)) - return PTR_ERR(as); - - __btree_increase_depth(as, trans, path); - bch2_btree_update_done(as, trans); - return 0; -} - -int __bch2_foreground_maybe_merge(struct btree_trans *trans, - btree_path_idx_t path, - unsigned level, - unsigned flags, - enum btree_node_sibling sib) -{ - struct bch_fs *c = trans->c; - struct btree_update *as; - struct bkey_format_state new_s; - struct bkey_format new_f; - struct bkey_i delete; - struct btree *b, *m, *n, *prev, *next, *parent; - struct bpos sib_pos; - size_t sib_u64s; - enum btree_id btree = trans->paths[path].btree_id; - btree_path_idx_t sib_path = 0, new_path = 0; - u64 start_time = local_clock(); - int ret = 0; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - BUG_ON(!trans->paths[path].should_be_locked); - BUG_ON(!btree_node_locked(&trans->paths[path], level)); - - /* - * Work around a deadlock caused by the btree write buffer not doing - * merges and leaving tons of merges for us to do - we really don't need - * to be doing merges at all from the interior update path, and if the - * interior update path is generating too many new interior updates we - * deadlock: - */ - if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates) - return 0; - - if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) { - flags &= ~BCH_WATERMARK_MASK; - flags |= BCH_WATERMARK_btree; - flags |= BCH_TRANS_COMMIT_journal_reclaim; - } - - b = trans->paths[path].l[level].b; - - if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || - (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { - b->sib_u64s[sib] = U16_MAX; - return 0; - } - - sib_pos = sib == btree_prev_sib - ? 
bpos_predecessor(b->data->min_key) - : bpos_successor(b->data->max_key); - - sib_path = bch2_path_get(trans, btree, sib_pos, - U8_MAX, level, BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, sib_path, false); - if (ret) - goto err; - - btree_path_set_should_be_locked(trans, trans->paths + sib_path); - - m = trans->paths[sib_path].l[level].b; - - if (btree_node_parent(trans->paths + path, b) != - btree_node_parent(trans->paths + sib_path, m)) { - b->sib_u64s[sib] = U16_MAX; - goto out; - } - - if (sib == btree_prev_sib) { - prev = m; - next = b; - } else { - prev = b; - next = m; - } - - if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { - struct printbuf buf = PRINTBUF; - - printbuf_indent_add_nextline(&buf, 2); - prt_printf(&buf, "%s(): ", __func__); - ret = __bch2_topology_error(c, &buf); - prt_newline(&buf); - - prt_printf(&buf, "prev ends at "); - bch2_bpos_to_text(&buf, prev->data->max_key); - prt_newline(&buf); - - prt_printf(&buf, "next starts at "); - bch2_bpos_to_text(&buf, next->data->min_key); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - goto err; - } - - bch2_bkey_format_init(&new_s); - bch2_bkey_format_add_pos(&new_s, prev->data->min_key); - __bch2_btree_calc_format(&new_s, prev); - __bch2_btree_calc_format(&new_s, next); - bch2_bkey_format_add_pos(&new_s, next->data->max_key); - new_f = bch2_bkey_format_done(&new_s); - - sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) + - btree_node_u64s_with_format(m->nr, &m->format, &new_f); - - if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { - sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); - sib_u64s /= 2; - sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); - } - - sib_u64s = min(sib_u64s, btree_max_u64s(c)); - sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); - b->sib_u64s[sib] = sib_u64s; - - if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) - goto out; - - parent = btree_node_parent(trans->paths + path, b); - as = bch2_btree_update_start(trans, trans->paths + path, level, false, - 0, BCH_TRANS_COMMIT_no_enospc|flags); - ret = PTR_ERR_OR_ZERO(as); - if (ret) - goto err; - - as->node_start = prev->data->min_key; - as->node_end = next->data->max_key; - - trace_and_count(c, btree_node_merge, trans, b); - - n = bch2_btree_node_alloc(as, trans, b->c.level); - - SET_BTREE_NODE_SEQ(n->data, - max(BTREE_NODE_SEQ(b->data), - BTREE_NODE_SEQ(m->data)) + 1); - - btree_set_min(n, prev->data->min_key); - btree_set_max(n, next->data->max_key); - - n->data->format = new_f; - btree_node_set_format(n, new_f); - - bch2_btree_sort_into(c, n, prev); - bch2_btree_sort_into(c, n, next); - - bch2_btree_build_aux_trees(n); - bch2_btree_update_add_new_node(as, n); - six_unlock_write(&n->c.lock); - - new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p); - six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + new_path, n); - - bkey_init(&delete.k); - delete.k.p = prev->key.k.p; - bch2_keylist_add(&as->parent_keys, &delete); - bch2_keylist_add(&as->parent_keys, &n->key); - - bch2_trans_verify_paths(trans); - - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); - if (ret) - goto err_free_update; - - bch2_btree_interior_update_will_free_node(as, b); - bch2_btree_interior_update_will_free_node(as, m); - - bch2_trans_verify_paths(trans); - - bch2_btree_update_get_open_buckets(as, n); - 
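The damping arithmetic above, isolated: merged-size estimates over the hysteresis threshold only count half, which keeps nodes hovering near the merge threshold from flip-flopping between merging and splitting. For example, with a threshold of 1000, a raw estimate of 3000 is recorded as (3000 - 1000) / 2 + 1000 = 2000:

#include <stddef.h>

static size_t damp_sib_u64s(size_t sib_u64s, size_t h)
{
	/* halve only the excess over the hysteresis threshold @h */
	if (sib_u64s > h)
		sib_u64s = (sib_u64s - h) / 2 + h;

	return sib_u64s;
}
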
bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); - - bch2_btree_node_free_inmem(trans, trans->paths + path, b); - bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); - - bch2_trans_node_add(trans, trans->paths + path, n); - - bch2_trans_verify_paths(trans); - - six_unlock_intent(&n->c.lock); - - bch2_btree_update_done(as, trans); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); -out: -err: - if (new_path) - bch2_path_put(trans, new_path, true); - bch2_path_put(trans, sib_path, true); - bch2_trans_verify_locks(trans); - if (ret == -BCH_ERR_journal_reclaim_would_deadlock) - ret = 0; - if (!ret) - ret = bch2_trans_relock(trans); - return ret; -err_free_update: - bch2_btree_node_free_never_used(as, trans, n); - bch2_btree_update_free(as, trans); - goto out; -} - -static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, - struct btree *b) -{ - bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(trans, iter); - if (ret) - goto err; - - /* has node been freed? */ - if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { - /* node has been freed: */ - BUG_ON(!btree_node_dying(b)); - ret = bch_err_throw(trans->c, btree_node_dying); - goto err; - } - - BUG_ON(!btree_node_hashed(b)); - return 0; -err: - bch2_trans_iter_exit(trans, iter); - return ret; -} - -int bch2_btree_node_rewrite(struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b, - unsigned target, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree *n, *parent; - struct btree_update *as; - btree_path_idx_t new_path = 0; - int ret; - - flags |= BCH_TRANS_COMMIT_no_enospc; - - struct btree_path *path = btree_iter_path(trans, iter); - parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, b->c.level, - false, target, flags); - ret = PTR_ERR_OR_ZERO(as); - if (ret) - goto out; - - n = bch2_btree_node_alloc_replacement(as, trans, b); - - bch2_btree_build_aux_trees(n); - bch2_btree_update_add_new_node(as, n); - six_unlock_write(&n->c.lock); - - new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p); - six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + new_path, n); - - trace_and_count(c, btree_node_rewrite, trans, b); - - if (parent) { - bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); - } else { - ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false); - } - - if (ret) - goto err; - - bch2_btree_interior_update_will_free_node(as, b); - - bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); - - bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); - - bch2_trans_node_add(trans, trans->paths + iter->path, n); - six_unlock_intent(&n->c.lock); - - bch2_btree_update_done(as, trans); -out: - if (new_path) - bch2_path_put(trans, new_path, true); - bch2_trans_downgrade(trans); - return ret; -err: - bch2_btree_node_free_never_used(as, trans, n); - bch2_btree_update_free(as, trans); - goto out; -} - -int bch2_btree_node_rewrite_key(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_i *k, unsigned flags) -{ - struct btree_iter iter; - 
bch2_trans_node_iter_init(trans, &iter, - btree, k->k.p, - BTREE_MAX_DEPTH, level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto out; - - bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); - ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags) - : -ENOENT; -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_btree_node_rewrite_pos(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bpos pos, - unsigned target, - unsigned flags) -{ - BUG_ON(!level); - - /* Traverse one depth lower to get a pointer to the node itself: */ - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err; - - ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, - struct btree *b, unsigned flags) -{ - struct btree_iter iter; - int ret = get_iter_to_node(trans, &iter, b); - if (ret) - return ret == -BCH_ERR_btree_node_dying ? 0 : ret; - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -struct async_btree_rewrite { - struct bch_fs *c; - struct work_struct work; - struct list_head list; - enum btree_id btree_id; - unsigned level; - struct bkey_buf key; -}; - -static void async_btree_node_rewrite_work(struct work_struct *work) -{ - struct async_btree_rewrite *a = - container_of(work, struct async_btree_rewrite, work); - struct bch_fs *c = a->c; - - int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, - a->btree_id, a->level, a->key.k, 0)); - if (!bch2_err_matches(ret, ENOENT) && - !bch2_err_matches(ret, EROFS)) - bch_err_fn_ratelimited(c, ret); - - spin_lock(&c->btree_node_rewrites_lock); - list_del(&a->list); - spin_unlock(&c->btree_node_rewrites_lock); - - closure_wake_up(&c->btree_node_rewrites_wait); - - bch2_bkey_buf_exit(&a->key, c); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite); - kfree(a); -} - -void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) -{ - struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS); - if (!a) - return; - - a->c = c; - a->btree_id = b->c.btree_id; - a->level = b->c.level; - INIT_WORK(&a->work, async_btree_node_rewrite_work); - - bch2_bkey_buf_init(&a->key); - bch2_bkey_buf_copy(&a->key, c, &b->key); - - bool now = false, pending = false; - - spin_lock(&c->btree_node_rewrites_lock); - if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) && - enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { - list_add(&a->list, &c->btree_node_rewrites); - now = true; - } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { - list_add(&a->list, &c->btree_node_rewrites_pending); - pending = true; - } - spin_unlock(&c->btree_node_rewrites_lock); - - if (now) { - queue_work(c->btree_node_rewrite_worker, &a->work); - } else if (pending) { - /* bch2_do_pending_node_rewrites will execute */ - } else { - bch2_bkey_buf_exit(&a->key, c); - kfree(a); - } -} - -void bch2_async_btree_node_rewrites_flush(struct bch_fs *c) -{ - closure_wait_event(&c->btree_node_rewrites_wait, - list_empty(&c->btree_node_rewrites)); -} - -void bch2_do_pending_node_rewrites(struct bch_fs *c) -{ - while (1) { - spin_lock(&c->btree_node_rewrites_lock); - 
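bch2_do_pending_node_rewrites() drains its pending list by popping one entry at a time under the lock and doing the work with the lock dropped, the way each rewrite is handed to queue_work(). A generic sketch of that shape, with POSIX locking and the BSD sys/queue macros as stand-ins:

#include <pthread.h>
#include <stddef.h>
#include <sys/queue.h>

struct item {
	STAILQ_ENTRY(item) entry;
};

static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
static STAILQ_HEAD(, item) pending = STAILQ_HEAD_INITIALIZER(pending);

static void drain_pending(void (*run)(struct item *))
{
	for (;;) {
		pthread_mutex_lock(&pending_lock);
		struct item *it = STAILQ_FIRST(&pending);
		if (it)
			STAILQ_REMOVE_HEAD(&pending, entry);
		pthread_mutex_unlock(&pending_lock);

		if (!it)
			break;

		run(it);	/* outside the lock */
	}
}
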
struct async_btree_rewrite *a = - list_pop_entry(&c->btree_node_rewrites_pending, - struct async_btree_rewrite, list); - if (a) - list_add(&a->list, &c->btree_node_rewrites); - spin_unlock(&c->btree_node_rewrites_lock); - - if (!a) - break; - - enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite); - queue_work(c->btree_node_rewrite_worker, &a->work); - } -} - -void bch2_free_pending_node_rewrites(struct bch_fs *c) -{ - while (1) { - spin_lock(&c->btree_node_rewrites_lock); - struct async_btree_rewrite *a = - list_pop_entry(&c->btree_node_rewrites_pending, - struct async_btree_rewrite, list); - spin_unlock(&c->btree_node_rewrites_lock); - - if (!a) - break; - - bch2_bkey_buf_exit(&a->key, c); - kfree(a); - } -} - -static int __bch2_btree_node_update_key(struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b, struct btree *new_hash, - struct bkey_i *new_key, - unsigned commit_flags, - bool skip_triggers) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter2 = {}; - struct btree *parent; - int ret; - - if (!skip_triggers) { - ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1, - bkey_i_to_s_c(&b->key), - BTREE_TRIGGER_transactional) ?: - bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1, - bkey_i_to_s(new_key), - BTREE_TRIGGER_transactional); - if (ret) - return ret; - } - - if (new_hash) { - bkey_copy(&new_hash->key, new_key); - ret = bch2_btree_node_hash_insert(&c->btree_cache, - new_hash, b->c.level, b->c.btree_id); - BUG_ON(ret); - } - - parent = btree_node_parent(btree_iter_path(trans, iter), b); - if (parent) { - bch2_trans_copy_iter(trans, &iter2, iter); - - iter2.path = bch2_btree_path_make_mut(trans, iter2.path, - iter2.flags & BTREE_ITER_intent, - _THIS_IP_); - - struct btree_path *path2 = btree_iter_path(trans, &iter2); - BUG_ON(path2->level != b->c.level); - BUG_ON(!bpos_eq(path2->pos, new_key->k.p)); - - btree_path_set_level_up(trans, path2); - - trans->paths_sorted = false; - - ret = bch2_btree_iter_traverse(trans, &iter2) ?: - bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); - if (ret) - goto err; - } else { - BUG_ON(btree_node_root(c, b) != b); - - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, - jset_u64s(new_key->k.u64s)); - ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - journal_entry_set(e, - BCH_JSET_ENTRY_btree_root, - b->c.btree_id, b->c.level, - new_key, new_key->k.u64s); - } - - ret = bch2_trans_commit(trans, NULL, NULL, commit_flags); - if (ret) - goto err; - - bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c); - - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, new_hash); - - __bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, new_key); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); - } else { - bkey_copy(&b->key, new_key); - } - - bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b); -out: - bch2_trans_iter_exit(trans, &iter2); - return ret; -err: - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - } - goto out; -} - -int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, - struct btree *b, struct bkey_i *new_key, - unsigned commit_flags, bool skip_triggers) -{ - struct bch_fs *c = trans->c; - struct btree *new_hash = NULL; - struct btree_path *path = btree_iter_path(trans, iter); - 
struct closure cl; - int ret = 0; - - ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1); - if (ret) - return ret; - - closure_init_stack(&cl); - - /* - * check btree_ptr_hash_val() after @b is locked by - * btree_iter_traverse(): - */ - if (btree_ptr_hash_val(new_key) != b->hash_val) { - ret = bch2_btree_cache_cannibalize_lock(trans, &cl); - if (ret) { - ret = drop_locks_do(trans, (closure_sync(&cl), 0)); - if (ret) - return ret; - } - - new_hash = bch2_btree_node_mem_alloc(trans, false); - ret = PTR_ERR_OR_ZERO(new_hash); - if (ret) - goto err; - } - - path->intent_ref++; - ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key, - commit_flags, skip_triggers); - --path->intent_ref; - - if (new_hash) - bch2_btree_node_to_freelist(c, new_hash); -err: - closure_sync(&cl); - bch2_btree_cache_cannibalize_unlock(trans); - return ret; -} - -int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, - struct btree *b, struct bkey_i *new_key, - unsigned commit_flags, bool skip_triggers) -{ - struct btree_iter iter; - int ret = get_iter_to_node(trans, &iter, b); - if (ret) - return ret == -BCH_ERR_btree_node_dying ? 0 : ret; - - bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, - !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); - - ret = bch2_btree_node_update_key(trans, &iter, b, new_key, - commit_flags, skip_triggers); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* Init code: */ - -/* - * Only for filesystem bringup, when first reading the btree roots or allocating - * btree roots when initializing a new filesystem: - */ -void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) -{ - BUG_ON(btree_node_root(c, b)); - - bch2_btree_set_root_inmem(c, b); -} - -int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level) -{ - struct bch_fs *c = trans->c; - struct closure cl; - struct btree *b; - int ret; - - closure_init_stack(&cl); - - do { - ret = bch2_btree_cache_cannibalize_lock(trans, &cl); - closure_sync(&cl); - } while (ret); - - b = bch2_btree_node_mem_alloc(trans, false); - bch2_btree_cache_cannibalize_unlock(trans); - - ret = PTR_ERR_OR_ZERO(b); - if (ret) - return ret; - - set_btree_node_fake(b); - set_btree_node_need_rewrite(b); - b->c.level = level; - b->c.btree_id = id; - - bkey_btree_ptr_init(&b->key); - b->key.k.p = SPOS_MAX; - *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; - - bch2_bset_init_first(b, &b->data->keys); - bch2_btree_build_aux_trees(b); - - b->data->flags = 0; - btree_set_min(b, POS_MIN); - btree_set_max(b, SPOS_MAX); - b->data->format = bch2_btree_calc_format(b); - btree_node_set_format(b, b->data->format); - - ret = bch2_btree_node_hash_insert(&c->btree_cache, b, - b->c.level, b->c.btree_id); - BUG_ON(ret); - - bch2_btree_set_root_inmem(c, b); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - return 0; -} - -void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) -{ - bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level))); -} - -static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) -{ - prt_printf(out, "%ps: ", (void *) as->ip_started); - bch2_trans_commit_flags_to_text(out, as->flags); - - prt_str(out, " "); - bch2_btree_id_to_text(out, as->btree_id); - prt_printf(out, " l=%u-%u ", - as->update_level_start, - as->update_level_end); - bch2_bpos_to_text(out, as->node_start); - prt_char(out, ' '); - bch2_bpos_to_text(out, as->node_end); - 
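bch2_btree_node_update_key() above shows the standard cannibalize-lock dance for allocating a btree cache slot under memory pressure: if the lock can't be taken, drop every btree lock, wait on the closure, then allocate. The control flow, distilled into one helper (a sketch built only from the calls already used above, not a proposed refactor):

	static int node_alloc_cannibalize(struct btree_trans *trans,
					  struct closure *cl, struct btree **b)
	{
		int ret = bch2_btree_cache_cannibalize_lock(trans, cl);
		if (ret) {
			/* drop_locks_do(): unlock, run the expression, relock */
			ret = drop_locks_do(trans, (closure_sync(cl), 0));
			if (ret)
				return ret;
		}

		*b = bch2_btree_node_mem_alloc(trans, false);
		return PTR_ERR_OR_ZERO(*b);
	}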
prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s", - as->node_written, - as->node_sectors, - as->node_remaining, - btree_node_reawrite_reason_strs[as->node_needed_rewrite]); - - prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - bch2_btree_update_modes[as->mode], - as->nodes_written, - closure_nr_remaining(&as->cl), - as->journal.seq); -} - -void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct btree_update *as; - - mutex_lock(&c->btree_interior_update_lock); - list_for_each_entry(as, &c->btree_interior_update_list, list) - bch2_btree_update_to_text(out, as); - mutex_unlock(&c->btree_interior_update_lock); -} - -static bool bch2_btree_interior_updates_pending(struct bch_fs *c) -{ - bool ret; - - mutex_lock(&c->btree_interior_update_lock); - ret = !list_empty(&c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - - return ret; -} - -bool bch2_btree_interior_updates_flush(struct bch_fs *c) -{ - bool ret = bch2_btree_interior_updates_pending(c); - - if (ret) - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_pending(c)); - return ret; -} - -void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry) -{ - struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - - mutex_lock(&c->btree_root_lock); - - r->level = entry->level; - r->alive = true; - bkey_copy(&r->key, (struct bkey_i *) entry->start); - - mutex_unlock(&c->btree_root_lock); -} - -struct jset_entry * -bch2_btree_roots_to_journal_entries(struct bch_fs *c, - struct jset_entry *end, - unsigned long skip) -{ - unsigned i; - - mutex_lock(&c->btree_root_lock); - - for (i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (r->alive && !test_bit(i, &skip)) { - journal_entry_set(end, BCH_JSET_ENTRY_btree_root, - i, r->level, &r->key, r->key.k.u64s); - end = vstruct_next(end); - } - } - - mutex_unlock(&c->btree_root_lock); - - return end; -} - -static void bch2_btree_alloc_to_text(struct printbuf *out, - struct bch_fs *c, - struct btree_alloc *a) -{ - printbuf_indent_add(out, 2); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k)); - prt_newline(out); - - struct open_bucket *ob; - unsigned i; - open_bucket_for_each(c, &a->ob, ob, i) - bch2_open_bucket_to_text(out, c, ob); - - printbuf_indent_sub(out, 2); -} - -void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c) -{ - for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++) - bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]); -} - -void bch2_fs_btree_interior_update_exit(struct bch_fs *c) -{ - WARN_ON(!list_empty(&c->btree_node_rewrites)); - WARN_ON(!list_empty(&c->btree_node_rewrites_pending)); - - if (c->btree_node_rewrite_worker) - destroy_workqueue(c->btree_node_rewrite_worker); - if (c->btree_interior_update_worker) - destroy_workqueue(c->btree_interior_update_worker); - mempool_exit(&c->btree_interior_update_pool); -} - -void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) -{ - mutex_init(&c->btree_reserve_cache_lock); - INIT_LIST_HEAD(&c->btree_interior_update_list); - INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); - mutex_init(&c->btree_interior_update_lock); - INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); - - INIT_LIST_HEAD(&c->btree_node_rewrites); - INIT_LIST_HEAD(&c->btree_node_rewrites_pending); - spin_lock_init(&c->btree_node_rewrites_lock); -} - -int 
bch2_fs_btree_interior_update_init(struct bch_fs *c) -{ - c->btree_interior_update_worker = - alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); - if (!c->btree_interior_update_worker) - return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); - - c->btree_node_rewrite_worker = - alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); - if (!c->btree_node_rewrite_worker) - return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); - - if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, - sizeof(struct btree_update))) - return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init); - - return 0; -} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h deleted file mode 100644 index ac04e45a851594..00000000000000 --- a/fs/bcachefs/btree_update_interior.h +++ /dev/null @@ -1,364 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H -#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H - -#include "btree_cache.h" -#include "btree_locking.h" -#include "btree_update.h" - -#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) - -#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) - -int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); - -#define BTREE_UPDATE_MODES() \ - x(none) \ - x(node) \ - x(root) \ - x(update) - -enum btree_update_mode { -#define x(n) BTREE_UPDATE_##n, - BTREE_UPDATE_MODES() -#undef x -}; - -/* - * Tracks an in progress split/rewrite of a btree node and the update to the - * parent node: - * - * When we split/rewrite a node, we do all the updates in memory without - * waiting for any writes to complete - we allocate the new node(s) and update - * the parent node, possibly recursively up to the root. - * - * The end result is that we have one or more new nodes being written - - * possibly several, if there were multiple splits - and then a write (updating - * an interior node) which will make all these new nodes visible. - * - * Additionally, as we split/rewrite nodes we free the old nodes - but the old - * nodes can't be freed (their space on disk can't be reclaimed) until the - * update to the interior node that makes the new node visible completes - - * until then, the old nodes are still reachable on disk. - * - */ -struct btree_update { - struct closure cl; - struct bch_fs *c; - u64 start_time; - unsigned long ip_started; - - struct list_head list; - struct list_head unwritten_list; - - enum btree_update_mode mode; - enum bch_trans_commit_flags flags; - unsigned nodes_written:1; - unsigned took_gc_lock:1; - - enum btree_id btree_id; - struct bpos node_start; - struct bpos node_end; - enum btree_node_rewrite_reason node_needed_rewrite; - u16 node_written; - u16 node_sectors; - u16 node_remaining; - - unsigned update_level_start; - unsigned update_level_end; - - struct disk_reservation disk_res; - - /* - * BTREE_UPDATE_node: - * The update that made the new nodes visible was a regular update to an - * existing interior node - @b. 
We can't write out the update to @b
-	 * until the new nodes we created are finished writing, so we block @b
-	 * from writing by putting this btree_interior update on the
-	 * @b->write_blocked list with @write_blocked_list:
-	 */
-	struct btree		*b;
-	struct list_head	write_blocked_list;
-
-	/*
-	 * We may be freeing nodes that were dirty, and thus had journal entries
-	 * pinned: we need to transfer the oldest of those pins to the
-	 * btree_update operation, and release it when the new node(s)
-	 * are all persistent and reachable:
-	 */
-	struct journal_entry_pin	journal;
-
-	/* Preallocated nodes we reserve when we start the update: */
-	struct prealloc_nodes {
-		struct btree	*b[BTREE_UPDATE_NODES_MAX];
-		unsigned	nr;
-	}			prealloc_nodes[2];
-
-	/* Nodes being freed: */
-	struct keylist		old_keys;
-	u64			_old_keys[BTREE_UPDATE_NODES_MAX *
-					  BKEY_BTREE_PTR_U64s_MAX];
-
-	/* Nodes being added: */
-	struct keylist		new_keys;
-	u64			_new_keys[BTREE_UPDATE_NODES_MAX *
-					  BKEY_BTREE_PTR_U64s_MAX];
-
-	/* New nodes, that will be made reachable by this update: */
-	struct btree		*new_nodes[BTREE_UPDATE_NODES_MAX];
-	unsigned		nr_new_nodes;
-
-	struct btree		*old_nodes[BTREE_UPDATE_NODES_MAX];
-	__le64			old_nodes_seq[BTREE_UPDATE_NODES_MAX];
-	unsigned		nr_old_nodes;
-
-	open_bucket_idx_t	open_buckets[BTREE_UPDATE_NODES_MAX *
-					     BCH_REPLICAS_MAX];
-	open_bucket_idx_t	nr_open_buckets;
-
-	unsigned		journal_u64s;
-	u64			journal_entries[BTREE_UPDATE_JOURNAL_RES];
-
-	/* Only here to reduce stack usage on recursive splits: */
-	struct keylist		parent_keys;
-	/*
-	 * Enough room for btree_split's keys without realloc - btree node
-	 * pointers never have crc/compression info, so we only need to account
-	 * for the pointers for three keys
-	 */
-	u64			inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
-};
-
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
-						  struct btree_trans *,
-						  struct btree *,
-						  struct bkey_format);
-
-int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
-				  unsigned, unsigned, enum btree_node_sibling);
-
-static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
-					btree_path_idx_t path_idx,
-					unsigned level, unsigned flags,
-					enum btree_node_sibling sib)
-{
-	struct btree_path *path = trans->paths + path_idx;
-	struct btree *b;
-
-	EBUG_ON(!btree_node_locked(path, level));
-
-	if (static_branch_unlikely(&bch2_btree_node_merging_disabled))
-		return 0;
-
-	b = path->l[level].b;
-	if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
-		return 0;
-
-	return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
-}
-
-static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
-					      btree_path_idx_t path,
-					      unsigned level,
-					      unsigned flags)
-{
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-	return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
-						   btree_prev_sib) ?:
-		bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
-						   btree_next_sib);
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
-			    struct btree *, unsigned, unsigned);
-int bch2_btree_node_rewrite_key(struct btree_trans *,
-				enum btree_id, unsigned,
-				struct bkey_i *, unsigned);
-int bch2_btree_node_rewrite_pos(struct btree_trans *,
-				enum btree_id, unsigned,
-				struct bpos, unsigned, unsigned);
-int bch2_btree_node_rewrite_key_get_iter(struct
btree_trans *, - struct btree *, unsigned); - -void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); - -int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, - struct btree *, struct bkey_i *, - unsigned, bool); -int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, - struct bkey_i *, unsigned, bool); - -void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); - -int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned); -void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); - -static inline unsigned btree_update_reserve_required(struct bch_fs *c, - struct btree *b) -{ - unsigned depth = btree_node_root(c, b)->c.level + 1; - - /* - * Number of nodes we might have to allocate in a worst case btree - * split operation - we split all the way up to the root, then allocate - * a new root, unless we're already at max depth: - */ - if (depth < BTREE_MAX_DEPTH) - return (depth - b->c.level) * 2 + 1; - else - return (depth - b->c.level) * 2 - 1; -} - -static inline void btree_node_reset_sib_u64s(struct btree *b) -{ - b->sib_u64s[0] = b->nr.live_u64s; - b->sib_u64s[1] = b->nr.live_u64s; -} - -static inline void *btree_data_end(struct btree *b) -{ - return (void *) b->data + btree_buf_bytes(b); -} - -static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b) -{ - return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s); -} - -static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b) -{ - return btree_data_end(b); -} - -static inline void *write_block(struct btree *b) -{ - return (void *) b->data + (b->written << 9); -} - -static inline bool __btree_addr_written(struct btree *b, void *p) -{ - return p < write_block(b); -} - -static inline bool bset_written(struct btree *b, struct bset *i) -{ - return __btree_addr_written(b, i); -} - -static inline bool bkey_written(struct btree *b, struct bkey_packed *k) -{ - return __btree_addr_written(b, k); -} - -static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end) -{ - ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + - b->whiteout_u64s; - ssize_t total = btree_buf_bytes(b) >> 3; - - /* Always leave one extra u64 for bch2_varint_decode: */ - used++; - - return total - used; -} - -static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b) -{ - ssize_t remaining = __bch2_btree_u64s_remaining(b, - btree_bkey_last(b, bset_tree_last(b))); - - BUG_ON(remaining < 0); - - if (bset_written(b, btree_bset_last(b))) - return 0; - - return remaining; -} - -#define BTREE_WRITE_SET_U64s_BITS 9 - -static inline unsigned btree_write_set_buffer(struct btree *b) -{ - /* - * Could buffer up larger amounts of keys for btrees with larger keys, - * pending benchmarking: - */ - return 8 << BTREE_WRITE_SET_U64s_BITS; -} - -static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b) -{ - struct bset_tree *t = bset_tree_last(b); - struct btree_node_entry *bne = max(write_block(b), - (void *) btree_bkey_last(b, t)); - ssize_t remaining_space = - __bch2_btree_u64s_remaining(b, bne->keys.start); - - if (unlikely(bset_written(b, bset(b, t)))) { - if (b->written + block_sectors(c) <= btree_sectors(c)) - return bne; - } else { - if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && - remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) - return bne; - } - - return NULL; -} - -static inline void push_whiteout(struct btree *b, struct bpos pos) -{ - 
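The arithmetic in btree_update_reserve_required() above is worth one worked example; the concrete numbers below assume BTREE_MAX_DEPTH == 4, which is an assumption of this illustration rather than something visible in this header:

	/*
	 * Leaf at level 0 under a root at level 2, so depth = 3 < 4. A worst
	 * case split cascades all the way up and then adds a new root:
	 *
	 *	level 0: 2 nodes  (leaf splits in two)
	 *	level 1: 2 nodes  (parent splits)
	 *	level 2: 2 nodes  (root splits)
	 *	level 3: 1 node   (brand new root)
	 *
	 *	(depth - level) * 2 + 1 = (3 - 0) * 2 + 1 = 7
	 *
	 * At maximum depth the tree cannot grow a new root, so the second
	 * branch reserves one node fewer: (depth - level) * 2 - 1.
	 */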
struct bkey_packed k; - - BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s); - EBUG_ON(btree_node_just_written(b)); - - if (!bkey_pack_pos(&k, pos, b)) { - struct bkey *u = (void *) &k; - - bkey_init(u); - u->p = pos; - } - - k.needs_whiteout = true; - - b->whiteout_u64s += k.u64s; - bkey_p_copy(unwritten_whiteouts_start(b), &k); -} - -/* - * write lock must be held on @b (else the dirty bset that we were going to - * insert into could be written out from under us) - */ -static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s) -{ - if (unlikely(btree_node_need_rewrite(b))) - return false; - - return u64s <= bch2_btree_keys_u64s_remaining(b); -} - -void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); - -bool bch2_btree_interior_updates_flush(struct bch_fs *); - -void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); -struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, - struct jset_entry *, unsigned long); - -void bch2_async_btree_node_rewrites_flush(struct bch_fs *); -void bch2_do_pending_node_rewrites(struct bch_fs *); -void bch2_free_pending_node_rewrites(struct bch_fs *); - -void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_btree_interior_update_exit(struct bch_fs *); -void bch2_fs_btree_interior_update_init_early(struct bch_fs *); -int bch2_fs_btree_interior_update_init(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c deleted file mode 100644 index 0afb44ce1a8562..00000000000000 --- a/fs/bcachefs/btree_write_buffer.c +++ /dev/null @@ -1,893 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "disk_accounting.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" - -#include -#include - -static int bch2_btree_write_buffer_journal_flush(struct journal *, - struct journal_entry_pin *, u64); - -static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) -{ - return (cmp_int(l->hi, r->hi) ?: - cmp_int(l->mi, r->mi) ?: - cmp_int(l->lo, r->lo)) >= 0; -} - -static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) -{ -#ifdef CONFIG_X86_64 - int cmp; - - asm("mov (%[l]), %%rax;" - "sub (%[r]), %%rax;" - "mov 8(%[l]), %%rax;" - "sbb 8(%[r]), %%rax;" - "mov 16(%[l]), %%rax;" - "sbb 16(%[r]), %%rax;" - : "=@ccae" (cmp) - : [l] "r" (l), [r] "r" (r) - : "rax", "cc"); - - EBUG_ON(cmp != __wb_key_ref_cmp(l, r)); - return cmp; -#else - return __wb_key_ref_cmp(l, r); -#endif -} - -static int wb_key_seq_cmp(const void *_l, const void *_r) -{ - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; - - return cmp_int(l->journal_seq, r->journal_seq); -} - -/* Compare excluding idx, the low 24 bits: */ -static inline bool wb_key_eq(const void *_l, const void *_r) -{ - const struct wb_key_ref *l = _l; - const struct wb_key_ref *r = _r; - - return !((l->hi ^ r->hi)| - (l->mi ^ r->mi)| - ((l->lo >> 24) ^ (r->lo >> 24))); -} - -static noinline void wb_sort(struct wb_key_ref *base, size_t num) -{ - size_t n = num, a = num / 2; - - if (!a) /* num < 2 || size == 0 */ - return; - - for (;;) { - size_t b, c, d; - - if (a) /* 
Building heap: sift down --a */ - --a; - else if (--n) /* Sorting: Extract root to --n */ - swap(base[0], base[n]); - else /* Sort complete */ - break; - - /* - * Sift element at "a" down into heap. This is the - * "bottom-up" variant, which significantly reduces - * calls to cmp_func(): we find the sift-down path all - * the way to the leaves (one compare per level), then - * backtrack to find where to insert the target element. - * - * Because elements tend to sift down close to the leaves, - * this uses fewer compares than doing two per level - * on the way down. (A bit more than half as many on - * average, 3/4 worst-case.) - */ - for (b = a; c = 2*b + 1, (d = c + 1) < n;) - b = wb_key_ref_cmp(base + c, base + d) ? c : d; - if (d == n) /* Special case last leaf with no sibling */ - b = c; - - /* Now backtrack from "b" to the correct location for "a" */ - while (b != a && wb_key_ref_cmp(base + a, base + b)) - b = (b - 1) / 2; - c = b; /* Where "a" belongs */ - while (b != a) { /* Shift it into place */ - b = (b - 1) / 2; - swap(base[b], base[c]); - } - } -} - -static noinline int wb_flush_one_slowpath(struct btree_trans *trans, - struct btree_iter *iter, - struct btree_write_buffered_key *wb) -{ - struct btree_path *path = btree_iter_path(trans, iter); - - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - - trans->journal_res.seq = wb->journal_seq; - - return bch2_trans_update(trans, iter, &wb->k, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_journal_res| - BCH_TRANS_COMMIT_journal_reclaim); -} - -static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter, - struct btree_write_buffered_key *wb, - bool *write_locked, - bool *accounting_accumulated, - size_t *fast) -{ - struct btree_path *path; - int ret; - - EBUG_ON(!wb->journal_seq); - EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); - EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); - - ret = bch2_btree_iter_traverse(trans, iter); - if (ret) - return ret; - - if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) { - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u); - - if (k.k->type == KEY_TYPE_accounting) - bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k), - bkey_s_c_to_accounting(k)); - } - *accounting_accumulated = true; - - /* - * We can't clone a path that has write locks: unshare it now, before - * set_pos and traverse(): - */ - if (btree_iter_path(trans, iter)->ref > 1) - iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_); - - path = btree_iter_path(trans, iter); - - if (!*write_locked) { - ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); - if (ret) - return ret; - - bch2_btree_node_prep_for_write(trans, path, path->l[0].b); - *write_locked = true; - } - - if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) { - *write_locked = false; - return wb_flush_one_slowpath(trans, iter, wb); - } - - EBUG_ON(!bpos_eq(wb->k.k.p, path->pos)); - - bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); - (*fast)++; - return 0; -} - -/* - * Update a btree with a write buffered key using the journal seq of the - * original write buffer insert. - * - * It is not safe to rejournal the key once it has been inserted into the write - * buffer because that may break recovery ordering. 
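The x86-64 asm in wb_key_ref_cmp() above evaluates "l >= r" over {lo, mi, hi} as a single 192-bit little-endian integer, using one sub followed by two dependent sbb's to propagate the borrow; the EBUG_ON() already cross-checks it against __wb_key_ref_cmp(). In portable C the same predicate is just a most-significant-word-first compare. A sketch of what the asm computes, not a proposed replacement:

	static inline bool wb_key_ref_ge(const struct wb_key_ref *l,
					 const struct wb_key_ref *r)
	{
		if (l->hi != r->hi)	/* top 64 bits decide first */
			return l->hi > r->hi;
		if (l->mi != r->mi)
			return l->mi > r->mi;
		return l->lo >= r->lo;	/* all higher words equal */
	}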
For example, the key may - * have already been modified in the active write buffer in a seq that comes - * before the current transaction. If we were to journal this key again and - * crash, recovery would process updates in the wrong order. - */ -static int -btree_write_buffered_insert(struct btree_trans *trans, - struct btree_write_buffered_key *wb) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), - BTREE_ITER_cached|BTREE_ITER_intent); - - trans->journal_res.seq = wb->journal_seq; - - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &wb->k, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) -{ - struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); - struct journal *j = &c->journal; - - if (!wb->inc.keys.nr) - return; - - bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, - bch2_btree_write_buffer_journal_flush); - - darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); - darray_resize(&wb->sorted, wb->flushing.keys.size); - - if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { - swap(wb->flushing.keys, wb->inc.keys); - goto out; - } - - size_t nr = min(darray_room(wb->flushing.keys), - wb->sorted.size - wb->flushing.keys.nr); - nr = min(nr, wb->inc.keys.nr); - - memcpy(&darray_top(wb->flushing.keys), - wb->inc.keys.data, - sizeof(wb->inc.keys.data[0]) * nr); - - memmove(wb->inc.keys.data, - wb->inc.keys.data + nr, - sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); - - wb->flushing.keys.nr += nr; - wb->inc.keys.nr -= nr; -out: - if (!wb->inc.keys.nr) - bch2_journal_pin_drop(j, &wb->inc.pin); - else - bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, - bch2_btree_write_buffer_journal_flush); - - if (j->watermark) { - spin_lock(&j->lock); - bch2_journal_set_watermark(j); - spin_unlock(&j->lock); - } - - BUG_ON(wb->sorted.size < wb->flushing.keys.nr); -} - -int bch2_btree_write_buffer_insert_err(struct bch_fs *c, - enum btree_id btree, struct bkey_i *k) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "attempting to do write buffer update on non wb btree="); - bch2_btree_id_to_text(&buf, btree); - prt_str(&buf, "\n"); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - - bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); - return -EROFS; -} - -static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_iter iter = {}; - size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; - bool write_locked = false; - bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); - int ret = 0; - - ret = bch2_journal_error(&c->journal); - if (ret) - return ret; - - bch2_trans_unlock(trans); - bch2_trans_begin(trans); - - mutex_lock(&wb->inc.lock); - move_keys_from_inc_to_flushing(wb); - mutex_unlock(&wb->inc.lock); - - for (size_t i = 0; i < wb->flushing.keys.nr; i++) { - wb->sorted.data[i].idx = i; - wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; - memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos)); - } - wb->sorted.nr = wb->flushing.keys.nr; - - /* - * We first sort so that we can detect and 
skip redundant updates, and - * then we attempt to flush in sorted btree order, as this is most - * efficient. - * - * However, since we're not flushing in the order they appear in the - * journal we won't be able to drop our journal pin until everything is - * flushed - which means this could deadlock the journal if we weren't - * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail - * if it would block taking a journal reservation. - * - * If that happens, simply skip the key so we can optimistically insert - * as many keys as possible in the fast path. - */ - wb_sort(wb->sorted.data, wb->sorted.nr); - - darray_for_each(wb->sorted, i) { - struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; - - if (unlikely(!btree_type_uses_write_buffer(k->btree))) { - ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k); - goto err; - } - - for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) - prefetch(&wb->flushing.keys.data[n->idx]); - - BUG_ON(!k->journal_seq); - - if (!accounting_replay_done && - k->k.k.type == KEY_TYPE_accounting) { - slowpath++; - continue; - } - - if (i + 1 < &darray_top(wb->sorted) && - wb_key_eq(i, i + 1)) { - struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; - - if (k->k.k.type == KEY_TYPE_accounting && - n->k.k.type == KEY_TYPE_accounting) - bch2_accounting_accumulate(bkey_i_to_accounting(&n->k), - bkey_i_to_s_c_accounting(&k->k)); - - overwritten++; - n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); - k->journal_seq = 0; - continue; - } - - if (write_locked) { - struct btree_path *path = btree_iter_path(trans, &iter); - - if (path->btree_id != i->btree || - bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - write_locked = false; - - ret = lockrestart_do(trans, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_foreground_maybe_merge(trans, iter.path, 0, - BCH_WATERMARK_reclaim| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc)); - if (ret) - goto err; - } - } - - if (!iter.path || iter.btree_id != k->btree) { - bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - } - - bch2_btree_iter_set_pos(trans, &iter, k->k.k.p); - btree_iter_path(trans, &iter)->preserve = false; - - bool accounting_accumulated = false; - do { - if (race_fault()) { - ret = bch_err_throw(c, journal_reclaim_would_deadlock); - break; - } - - ret = wb_flush_one(trans, &iter, k, &write_locked, - &accounting_accumulated, &fast); - if (!write_locked) - bch2_trans_begin(trans); - } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); - - if (!ret) { - k->journal_seq = 0; - } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { - slowpath++; - ret = 0; - } else - break; - } - - if (write_locked) { - struct btree_path *path = btree_iter_path(trans, &iter); - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - goto err; - - if (slowpath) { - /* - * Flush in the order they were present in the journal, so that - * we can release journal pins: - * The fastpath zapped the seq of keys that were successfully flushed so - * we can skip those here. 
-	 */
-		trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
-
-		sort_nonatomic(wb->flushing.keys.data,
-			       wb->flushing.keys.nr,
-			       sizeof(wb->flushing.keys.data[0]),
-			       wb_key_seq_cmp, NULL);
-
-		darray_for_each(wb->flushing.keys, i) {
-			if (!i->journal_seq)
-				continue;
-
-			if (!accounting_replay_done &&
-			    i->k.k.type == KEY_TYPE_accounting) {
-				could_not_insert++;
-				continue;
-			}
-
-			if (!could_not_insert)
-				bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
-							bch2_btree_write_buffer_journal_flush);
-
-			bch2_trans_begin(trans);
-
-			ret = commit_do(trans, NULL, NULL,
-					BCH_WATERMARK_reclaim|
-					BCH_TRANS_COMMIT_journal_reclaim|
-					BCH_TRANS_COMMIT_no_check_rw|
-					BCH_TRANS_COMMIT_no_enospc|
-					BCH_TRANS_COMMIT_no_journal_res,
-					btree_write_buffered_insert(trans, i));
-			if (ret)
-				goto err;
-
-			i->journal_seq = 0;
-		}
-
-		/*
-		 * If journal replay hasn't finished with accounting keys we
-		 * can't flush accounting keys at all - condense them and leave
-		 * them for next time.
-		 *
-		 * Q: Can the write buffer overflow?
-		 * A: Shouldn't be any actual risk. It's just new accounting
-		 * updates that the write buffer can't flush, and those are only
-		 * going to be generated by interior btree node updates as
-		 * journal replay has to split/rewrite nodes to make room for
-		 * its updates.
-		 *
-		 * And for those new accounting updates, updates to the same
-		 * counters get accumulated as they're flushed from the journal
-		 * to the write buffer - see the patch for eytzinger tree
-		 * accumulation. So we could only overflow if the number of
-		 * distinct counters touched somehow was very large.
-		 */
-		if (could_not_insert) {
-			struct btree_write_buffered_key *dst = wb->flushing.keys.data;
-
-			darray_for_each(wb->flushing.keys, i)
-				if (i->journal_seq)
-					*dst++ = *i;
-			wb->flushing.keys.nr = dst - wb->flushing.keys.data;
-		}
-	}
-err:
-	if (ret || !could_not_insert) {
-		bch2_journal_pin_drop(j, &wb->flushing.pin);
-		wb->flushing.keys.nr = 0;
-	}
-
-	bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
-	trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
-	return ret;
-}
-
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
-{
-	struct journal_keys_to_wb dst;
-	int ret = 0;
-
-	bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
-
-	for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
-		jset_entry_for_each_key(entry, k) {
-			ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
-			if (ret)
-				goto out;
-		}
-
-		entry->type = BCH_JSET_ENTRY_btree_keys;
-	}
-out:
-	ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
-	return ret;
-}
-
-static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq)
-{
-	struct journal *j = &c->journal;
-	struct journal_buf *buf;
-	bool blocked;
-	int ret = 0;
-
-	while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) {
-		ret = bch2_journal_keys_to_write_buffer(c, buf);
-
-		if (!blocked && !ret) {
-			spin_lock(&j->lock);
-			buf->need_flush_to_write_buffer = false;
-			spin_unlock(&j->lock);
-		}
-
-		mutex_unlock(&j->buf_lock);
-
-		if (blocked) {
-			bch2_journal_unblock(j);
-			break;
-		}
-	}
-
-	return ret;
-}
-
-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq,
-					bool *did_work)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-	int ret = 0, fetch_from_journal_err;
-
-	do {
-		bch2_trans_unlock(trans);
-
-
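The could_not_insert branch above condenses the flushing array with the usual in-place filter idiom: survivors are copied forward over dead entries, order is preserved, and nothing is allocated. The same idiom in isolation, over a plain u64 array for clarity (illustrative, not bcachefs API):

	/* Keep entries with a nonzero seq, compacting in place; returns the
	 * new count. Relative order of survivors is preserved. */
	static size_t condense_nonzero(u64 *v, size_t nr)
	{
		u64 *dst = v;

		for (u64 *src = v; src < v + nr; src++)
			if (*src)		/* still needs flushing */
				*dst++ = *src;

		return dst - v;
	}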
fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq); - - *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; - - /* - * On memory allocation failure, bch2_btree_write_buffer_flush_locked() - * is not guaranteed to empty wb->inc: - */ - mutex_lock(&wb->flushing.lock); - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flushing.lock); - } while (!ret && - (fetch_from_journal_err || - (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) || - (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq))); - - return ret; -} - -static int bch2_btree_write_buffer_journal_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool did_work = false; - - return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work)); -} - -int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - bool did_work = false; - - trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); - - return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work); -} - -/* - * The write buffer requires flushing when going RO: keys in the journal for the - * write buffer don't have a journal pin yet - */ -bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c) -{ - if (bch2_journal_error(&c->journal)) - return false; - - bool did_work = false; - bch2_trans_run(c, btree_write_buffer_flush_seq(trans, - journal_cur_seq(&c->journal), &did_work)); - return did_work; -} - -int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret = 0; - - if (mutex_trylock(&wb->flushing.lock)) { - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flushing.lock); - } - - return ret; -} - -int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) - return bch_err_throw(c, erofs_no_writes); - - int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); - return ret; -} - -/* - * In check and repair code, when checking references to write buffer btrees we - * need to issue a flush before we have a definitive error: this issues a flush - * if this is a key we haven't yet checked. - */ -int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, - struct bkey_s_c referring_k, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bkey_buf tmp; - int ret = 0; - - bch2_bkey_buf_init(&tmp); - - if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { - if (trace_write_buffer_maybe_flush_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, referring_k); - trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); - printbuf_exit(&buf); - } - - bch2_bkey_buf_reassemble(&tmp, c, referring_k); - - if (bkey_is_btree_ptr(referring_k.k)) { - bch2_trans_unlock(trans); - bch2_btree_interior_updates_flush(c); - } - - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - bch2_bkey_buf_copy(last_flushed, c, tmp.k); - - /* can we avoid the unconditional restart? 
*/ - trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_); - ret = bch_err_throw(c, transaction_restart_write_buffer_flush); - } -err: - bch2_bkey_buf_exit(&tmp, c); - return ret; -} - -static void bch2_btree_write_buffer_flush_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret; - - mutex_lock(&wb->flushing.lock); - do { - ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); - } while (!ret && bch2_btree_write_buffer_should_flush(c)); - mutex_unlock(&wb->flushing.lock); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); -} - -static void wb_accounting_sort(struct btree_write_buffer *wb) -{ - eytzinger0_sort(wb->accounting.data, wb->accounting.nr, - sizeof(wb->accounting.data[0]), - wb_key_cmp, NULL); -} - -int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree, - struct bkey_i_accounting *k) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key new = { .btree = btree }; - - bkey_copy(&new.k, &k->k_i); - - int ret = darray_push(&wb->accounting, new); - if (ret) - return ret; - - wb_accounting_sort(wb); - return 0; -} - -int bch2_journal_key_to_wb_slowpath(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret; -retry: - ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); - if (!ret && dst->wb == &wb->flushing) - ret = darray_resize(&wb->sorted, wb->flushing.keys.size); - - if (unlikely(ret)) { - if (dst->wb == &c->btree_write_buffer.flushing) { - mutex_unlock(&dst->wb->lock); - dst->wb = &c->btree_write_buffer.inc; - bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, - bch2_btree_write_buffer_journal_flush); - goto retry; - } - - return ret; - } - - dst->room = darray_room(dst->wb->keys); - if (dst->wb == &wb->flushing) - dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); - BUG_ON(!dst->room); - BUG_ON(!dst->seq); - - struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); - wb_k->journal_seq = dst->seq; - wb_k->btree = btree; - bkey_copy(&wb_k->k, k); - dst->wb->keys.nr++; - dst->room--; - return 0; -} - -void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - if (mutex_trylock(&wb->flushing.lock)) { - mutex_lock(&wb->inc.lock); - move_keys_from_inc_to_flushing(wb); - - /* - * Attempt to skip wb->inc, and add keys directly to - * wb->flushing, saving us a copy later: - */ - - if (!wb->inc.keys.nr) { - dst->wb = &wb->flushing; - } else { - mutex_unlock(&wb->flushing.lock); - dst->wb = &wb->inc; - } - } else { - mutex_lock(&wb->inc.lock); - dst->wb = &wb->inc; - } - - dst->room = darray_room(dst->wb->keys); - if (dst->wb == &wb->flushing) - dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); - dst->seq = seq; - - bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, - bch2_btree_write_buffer_journal_flush); - - darray_for_each(wb->accounting, i) - memset(&i->k.v, 0, bkey_val_bytes(&i->k.k)); -} - -int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - unsigned live_accounting_keys = 0; - int ret = 0; - - darray_for_each(wb->accounting, i) - if 
(!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) { - i->journal_seq = dst->seq; - live_accounting_keys++; - ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k); - if (ret) - break; - } - - if (live_accounting_keys * 2 < wb->accounting.nr) { - struct btree_write_buffered_key *dst = wb->accounting.data; - - darray_for_each(wb->accounting, src) - if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k))) - *dst++ = *src; - wb->accounting.nr = dst - wb->accounting.data; - wb_accounting_sort(wb); - } - - if (!dst->wb->keys.nr) - bch2_journal_pin_drop(&c->journal, &dst->wb->pin); - - if (bch2_btree_write_buffer_should_flush(c) && - __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && - !queue_work(system_dfl_wq, &c->btree_write_buffer.flush_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); - - if (dst->wb == &wb->flushing) - mutex_unlock(&wb->flushing.lock); - mutex_unlock(&wb->inc.lock); - - return ret; -} - -static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) -{ - if (wb->keys.size >= new_size) - return 0; - - if (!mutex_trylock(&wb->lock)) - return -EINTR; - - int ret = darray_resize(&wb->keys, new_size); - mutex_unlock(&wb->lock); - return ret; -} - -int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb_keys_resize(&wb->flushing, new_size) ?: - wb_keys_resize(&wb->inc, new_size); -} - -void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && - !bch2_journal_error(&c->journal)); - - darray_exit(&wb->accounting); - darray_exit(&wb->sorted); - darray_exit(&wb->flushing.keys); - darray_exit(&wb->inc.keys); -} - -void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - mutex_init(&wb->inc.lock); - mutex_init(&wb->flushing.lock); - INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); -} - -int bch2_fs_btree_write_buffer_init(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - /* Will be resized by journal as needed: */ - unsigned initial_size = 1 << 16; - - return darray_make_room(&wb->inc.keys, initial_size) ?: - darray_make_room(&wb->flushing.keys, initial_size) ?: - darray_make_room(&wb->sorted, initial_size); -} diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h deleted file mode 100644 index c351d21aca0b88..00000000000000 --- a/fs/bcachefs/btree_write_buffer.h +++ /dev/null @@ -1,113 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H -#define _BCACHEFS_BTREE_WRITE_BUFFER_H - -#include "bkey.h" -#include "disk_accounting.h" - -static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4; -} - -static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4; -} - -struct btree_trans; -int bch2_btree_write_buffer_flush_sync(struct btree_trans *); -bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *); -int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); -int bch2_btree_write_buffer_tryflush(struct btree_trans *); - -struct 
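To make the flush thresholds at the top of this header concrete: with the buffers still at the initial size of 1 << 16 keys set up in bch2_fs_btree_write_buffer_init() (the journal may resize them later, which shifts both numbers):

	/*
	 *	should_flush:	inc.nr + flushing.nr > 16384	(size / 4)
	 *	must_wait:	inc.nr > 49152			(size * 3 / 4)
	 *
	 * i.e. background flushing kicks in when the buffers are a quarter
	 * full, and writers are only throttled once the incoming buffer
	 * reaches three quarters.
	 */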
bkey_buf; -int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *); - -struct journal_keys_to_wb { - struct btree_write_buffer_keys *wb; - size_t room; - u64 seq; -}; - -static inline int wb_key_cmp(const void *_l, const void *_r) -{ - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; - - return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p); -} - -int bch2_accounting_key_to_wb_slowpath(struct bch_fs *, - enum btree_id, struct bkey_i_accounting *); - -static inline int bch2_accounting_key_to_wb(struct bch_fs *c, - enum btree_id btree, struct bkey_i_accounting *k) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key search; - search.btree = btree; - search.k.k.p = k->k.p; - - unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr, - sizeof(wb->accounting.data[0]), - wb_key_cmp, &search); - - if (idx >= wb->accounting.nr) - return bch2_accounting_key_to_wb_slowpath(c, btree, k); - - struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k); - bch2_accounting_accumulate(dst, accounting_i_to_s_c(k)); - return 0; -} - -int bch2_journal_key_to_wb_slowpath(struct bch_fs *, - struct journal_keys_to_wb *, - enum btree_id, struct bkey_i *); - -static inline int __bch2_journal_key_to_wb(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) -{ - if (unlikely(!dst->room)) - return bch2_journal_key_to_wb_slowpath(c, dst, btree, k); - - struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); - wb_k->journal_seq = dst->seq; - wb_k->btree = btree; - bkey_copy(&wb_k->k, k); - dst->wb->keys.nr++; - dst->room--; - return 0; -} - -static inline int bch2_journal_key_to_wb(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) -{ - if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(c, btree, k); - dump_stack(); - return ret; - } - - EBUG_ON(!dst->seq); - - return k->k.type == KEY_TYPE_accounting - ? 
bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k)) - : __bch2_journal_key_to_wb(c, dst, btree, k); -} - -void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); -int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); - -int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); -void bch2_fs_btree_write_buffer_exit(struct bch_fs *); -void bch2_fs_btree_write_buffer_init_early(struct bch_fs *); -int bch2_fs_btree_write_buffer_init(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h deleted file mode 100644 index e9e76e20f43b0b..00000000000000 --- a/fs/bcachefs/btree_write_buffer_types.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H -#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H - -#include "darray.h" -#include "journal_types.h" - -#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 -#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) - -struct wb_key_ref { -union { - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned idx:24; - u8 pos[sizeof(struct bpos)]; - enum btree_id btree:8; -#else - enum btree_id btree:8; - u8 pos[sizeof(struct bpos)]; - unsigned idx:24; -#endif - } __packed; - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - u64 lo; - u64 mi; - u64 hi; -#else - u64 hi; - u64 mi; - u64 lo; -#endif - }; -}; -}; - -struct btree_write_buffered_key { - enum btree_id btree:8; - u64 journal_seq:56; - __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); -}; - -struct btree_write_buffer_keys { - DARRAY(struct btree_write_buffered_key) keys; - struct journal_entry_pin pin; - struct mutex lock; -}; - -struct btree_write_buffer { - DARRAY(struct wb_key_ref) sorted; - struct btree_write_buffer_keys inc; - struct btree_write_buffer_keys flushing; - struct work_struct flush_work; - - DARRAY(struct btree_write_buffered_key) accounting; -}; - -#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c deleted file mode 100644 index f25903c10e8a6f..00000000000000 --- a/fs/bcachefs/buckets.c +++ /dev/null @@ -1,1395 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Code for manipulating bucket marks for garbage collection. - * - * Copyright 2014 Datera, Inc. 
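The wb_key_ref union above is the layout that the three-word compare and wb_key_eq() rely on: on little-endian, idx occupies the low 24 bits of lo, the raw bpos bytes fill the middle, and the btree id lands in the top byte of hi, so ordering the {hi, mi, lo} integer orders keys by (btree, pos) with idx as a tiebreaker. Equality on (btree, pos) alone only has to shift the idx bits out, as in this restatement of wb_key_eq() (a sketch under the layout assumptions just stated):

	static inline bool wb_key_same_pos(const struct wb_key_ref *l,
					   const struct wb_key_ref *r)
	{
		return l->hi == r->hi &&
		       l->mi == r->mi &&
		       (l->lo >> 24) == (r->lo >> 24);	/* ignore idx */
	}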
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "backpointers.h" -#include "bset.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "buckets.h" -#include "buckets_waiting_for_journal.h" -#include "disk_accounting.h" -#include "ec.h" -#include "error.h" -#include "inode.h" -#include "movinggc.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "reflink.h" -#include "replicas.h" -#include "subvolume.h" -#include "trace.h" - -#include - -void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) -{ - for (unsigned i = 0; i < BCH_DATA_NR; i++) - usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets); -} - -void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage) -{ - memset(usage, 0, sizeof(*usage)); - acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, - sizeof(struct bch_dev_usage_full) / sizeof(u64)); -} - -static u64 reserve_factor(u64 r) -{ - return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); -} - -static struct bch_fs_usage_short -__bch2_fs_usage_read_short(struct bch_fs *c) -{ - struct bch_fs_usage_short ret; - u64 data, reserved; - - ret.capacity = c->capacity - - percpu_u64_get(&c->usage->hidden); - - data = percpu_u64_get(&c->usage->data) + - percpu_u64_get(&c->usage->btree); - reserved = percpu_u64_get(&c->usage->reserved) + - percpu_u64_get(c->online_reserved); - - ret.used = min(ret.capacity, data + reserve_factor(reserved)); - ret.free = ret.capacity - ret.used; - - ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes); - - return ret; -} - -struct bch_fs_usage_short -bch2_fs_usage_read_short(struct bch_fs *c) -{ - struct bch_fs_usage_short ret; - - percpu_down_read(&c->mark_lock); - ret = __bch2_fs_usage_read_short(c); - percpu_up_read(&c->mark_lock); - - return ret; -} - -void bch2_dev_usage_to_text(struct printbuf *out, - struct bch_dev *ca, - struct bch_dev_usage_full *usage) -{ - if (out->nr_tabstops < 5) { - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 12); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - } - - prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); - - for (unsigned i = 0; i < BCH_DATA_NR; i++) { - bch2_prt_data_type(out, i); - prt_printf(out, "\t%llu\r%llu\r%llu\r\n", - usage->d[i].buckets, - usage->d[i].sectors, - usage->d[i].fragmented); - } - - prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets); -} - -static int bch2_check_fix_ptr(struct btree_trans *trans, - struct bkey_s_c k, - struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - bool *do_update) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); - if (!ca) { - if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID, - trans, ptr_to_invalid_device, - "pointer to missing device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - return 0; - } - - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - if (!g) { - if (fsck_err(trans, ptr_to_invalid_device, - "pointer to invalid bucket on device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - goto out; - } - - enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - - if (fsck_err_on(!g->gen_valid, - trans, 
ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - } else { - /* this pointer will be dropped */ - *do_update = true; - goto out; - } - } - - /* g->gen_valid == true */ - - if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached && - (g->data_type != BCH_DATA_btree || - data_type == BCH_DATA_btree)) { - g->data_type = data_type; - g->stripe_sectors = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } - - *do_update = true; - } - - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - - if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - trans, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - - if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - goto out; - - if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), - trans, ptr_bucket_data_type_mismatch, - "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached && - data_type == BCH_DATA_btree) { - switch (g->data_type) { - case BCH_DATA_sb: - bch_err(c, "btree and superblock in the same bucket - cannot repair"); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto out; - case BCH_DATA_journal: - ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr)); - bch_err_msg(c, ret, "error deleting journal bucket %zu", - PTR_BUCKET_NR(ca, &p.ptr)); - if (ret) - goto out; - break; - } - - g->data_type = data_type; - g->stripe_sectors = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } else { - *do_update = true; - } - } - - if (p.has_ec) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - - if (fsck_err_on(!m || !m->alive, - trans, ptr_to_missing_stripe, - "pointer to nonexistent stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), - trans, ptr_to_incorrect_stripe, - "pointer does not match stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - } -out: -fsck_err: 
- bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_fix_ptrs(struct btree_trans *trans, - enum btree_id btree, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry_c; - struct extent_ptr_decoded p = { 0 }; - bool do_update = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - /* We don't yet do btree key updates correctly for when we're RW */ - BUG_ON(test_bit(BCH_FS_rw, &c->flags)); - - bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { - ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); - if (ret) - goto err; - } - - if (do_update) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - scoped_guard(rcu) - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); - - if (level) { - /* - * We don't want to drop btree node pointers - if the - * btree node isn't there anymore, the read path will - * sort it out: - */ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - scoped_guard(rcu) - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen; - } - } else { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - - rcu_read_lock(); -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); - - if ((p.ptr.cached && - (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || - (!p.ptr.cached && - gen_cmp(p.ptr.gen, g->gen) < 0) || - gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type)) { - bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); - goto restart_drop_ptrs; - } - } - rcu_read_unlock(); -again: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_extent_entry_for_each(ptrs, entry) { - if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, - entry->stripe_ptr.idx); - union bch_extent_entry *next_ptr; - - bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) - if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) - goto found; - next_ptr = NULL; -found: - if (!next_ptr) { - bch_err(c, "aieee, found stripe ptr with no data ptr"); - continue; - } - - if (!m || !m->alive || - !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], - &next_ptr->ptr, - m->sectors)) { - bch2_bkey_extent_entry_drop(new, entry); - goto again; - } - } - } - } - - if (0) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, k); - bch_info(c, "updated %s", buf.buf); - - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf.buf); - } - - if (!(flags & BTREE_TRIGGER_is_root)) { - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun); - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; - - if (level) - bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); - } else { - struct jset_entry *e = 
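The restart_drop_ptrs loop in bch2_check_fix_ptrs() above is the classic mutate-and-restart idiom: dropping a pointer invalidates the in-flight iteration state, so the scan starts over after every drop. Generic form over a flat array (a sketch; quadratic in the worst case, but n here is the handful of pointers in one key):

	static void drop_matching(u64 *v, size_t *nr, bool (*pred)(u64))
	{
	again:
		for (size_t i = 0; i < *nr; i++)
			if (pred(v[i])) {
				/* close the gap; iteration state is now stale */
				memmove(&v[i], &v[i + 1],
					(*nr - i - 1) * sizeof(v[0]));
				--*nr;
				goto again;
			}
	}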
bch2_trans_jset_entry_alloc(trans, - jset_u64s(new->k.u64s)); - ret = PTR_ERR_OR_ZERO(e); - if (ret) - goto err; - - journal_entry_set(e, - BCH_JSET_ENTRY_btree_root, - btree, level - 1, - new, new->k.u64s); - - /* - * no locking, we're single threaded and not rw yet, see - * the big assertino above that we repeat here: - */ - BUG_ON(test_bit(BCH_FS_rw, &c->flags)); - - struct btree *b = bch2_btree_id_root(c, btree)->b; - bkey_copy(&b->key, new); - } - } -err: - printbuf_exit(&buf); - return ret; -} - -static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf, - struct bkey_s_c k, bool insert, enum bch_sb_error_id id) -{ - struct bch_fs *c = trans->c; - - prt_printf(buf, "\nwhile marking "); - bch2_bkey_val_to_text(buf, c, k); - prt_newline(buf); - - bool print = __bch2_count_fsck_err(c, id, buf); - - int ret = bch2_run_explicit_recovery_pass(c, buf, - BCH_RECOVERY_PASS_check_allocations, 0); - - if (insert) { - bch2_trans_updates_to_text(buf, trans); - __bch2_inconsistent_error(c, buf); - /* - * If we're in recovery, run_explicit_recovery_pass might give - * us an error code for rewinding recovery - */ - if (!ret) - ret = bch_err_throw(c, bucket_ref_update); - } else { - /* Always ignore overwrite errors, so that deletion works */ - ret = 0; - } - - if (print || insert) - bch2_print_str(c, KERN_ERR, buf->buf); - return ret; -} - -int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 b_gen, u8 bucket_data_type, - u32 *bucket_sectors) -{ - struct bch_fs *c = trans->c; - size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); - struct printbuf buf = PRINTBUF; - bool inserting = sectors > 0; - int ret = 0; - - BUG_ON(!sectors); - - if (unlikely(gen_after(ptr->gen, b_gen))) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); - goto out; - } - - if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_too_stale); - goto out; - } - - if (b_gen != ptr->gen && ptr->cached) { - ret = 1; - goto out; - } - - if (unlikely(b_gen != ptr->gen)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)", - ptr->dev, bucket_nr, b_gen, - bucket_gen_get(ca, bucket_nr), - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_stale_dirty_ptr); - goto out; - } - - if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type), - bch2_data_type_str(ptr_data_type)); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_bucket_data_type_mismatch); - goto out; - } - - if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) { - 
bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - *bucket_sectors, sectors); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_bucket_sector_count_overflow); - sectors = -*bucket_sectors; - goto out; - } - - *bucket_sectors += sectors; -out: - printbuf_exit(&buf); - return ret; -} - -void bch2_trans_account_disk_usage_change(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - static int warned_disk_usage = 0; - bool warn = false; - - percpu_down_read(&c->mark_lock); - struct bch_fs_usage_base *src = &trans->fs_usage_delta; - - s64 added = src->btree + src->data + src->reserved; - - /* - * Not allowed to reduce sectors_available except by getting a - * reservation: - */ - s64 should_not_have_added = added - (s64) disk_res_sectors; - if (unlikely(should_not_have_added > 0)) { - u64 old, new; - - old = atomic64_read(&c->sectors_available); - do { - new = max_t(s64, 0, old - should_not_have_added); - } while (!atomic64_try_cmpxchg(&c->sectors_available, - &old, new)); - - added -= should_not_have_added; - warn = true; - } - - if (added > 0) { - trans->disk_res->sectors -= added; - this_cpu_sub(*c->online_reserved, added); - } - - preempt_disable(); - struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); - acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); - preempt_enable(); - percpu_up_read(&c->mark_lock); - - if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) - bch2_trans_inconsistent(trans, - "disk usage increased %lli more than %llu sectors reserved)", - should_not_have_added, disk_res_sectors); -} - -/* KEY_TYPE_extent: */ - -static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, - struct bkey_s_c k, - const struct extent_ptr_decoded *p, - s64 sectors, enum bch_data_type ptr_data_type, - struct bch_alloc_v4 *a, - bool insert) -{ - u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : - !p->ptr.cached ? &a->dirty_sectors : - &a->cached_sectors; - int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type, - a->gen, a->data_type, dst_sectors); - - if (ret) - return ret; - if (insert) - alloc_data_type_set(a, ptr_data_type); - return 0; -} - -static int bch2_trigger_pointer(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - s64 *sectors, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); - - *sectors = insert ? 
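
bch2_trans_account_disk_usage_change() above lowers
c->sectors_available with a compare-exchange loop that clamps at zero
instead of underflowing. A user-space rendering of the same pattern
in C11 atomics (the kernel code uses atomic64_try_cmpxchg() and
max_t(); clamp_sub() here is an illustrative name):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Atomically subtract delta, clamping the counter at zero.
 * Returns the amount actually subtracted. */
static uint64_t clamp_sub(_Atomic uint64_t *counter, uint64_t delta)
{
        uint64_t old = atomic_load(counter);
        uint64_t new;

        do {
                new = old > delta ? old - delta : 0;
        } while (!atomic_compare_exchange_weak(counter, &old, new));

        return old - new;
}

int main(void)
{
        _Atomic uint64_t avail = 100;

        printf("took %llu\n", (unsigned long long) clamp_sub(&avail, 130));
        printf("left %llu\n", (unsigned long long) atomic_load(&avail));
        return 0;
}
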
bp.v.bucket_len : -(s64) bp.v.bucket_len; - - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); - if (unlikely(!ca)) { - if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) - ret = bch_err_throw(c, trigger_pointer); - goto err; - } - - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - if (!bucket_valid(ca, bucket.offset)) { - if (insert) { - bch2_dev_bucket_missing(ca, bucket.offset); - ret = bch_err_throw(c, trigger_pointer); - } - goto err; - } - - if (flags & BTREE_TRIGGER_transactional) { - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); - if (ret) - goto err; - - ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); - if (ret) - goto err; - } - - if (flags & BTREE_TRIGGER_gc) { - struct bucket *g = gc_bucket(ca, bucket.offset); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", - p.ptr.dev, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch_err_throw(c, trigger_pointer); - goto err; - } - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); - alloc_to_bucket(g, new); - bucket_unlock(g); - - if (!ret) - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - } -err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -static int bch2_trigger_stripe_ptr(struct btree_trans *trans, - struct bkey_s_c k, - struct extent_ptr_decoded p, - enum bch_data_type data_type, - s64 sectors, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - - if (flags & BTREE_TRIGGER_transactional) { - struct btree_iter iter; - struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_with_updates, stripe); - int ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, - "pointer to nonexistent stripe %llu", - (u64) p.ec.idx); - goto err; - } - - if (!bch2_ptr_matches_stripe(&s->v, p)) { - bch2_trans_inconsistent(trans, - "stripe pointer doesn't match stripe %llu", - (u64) p.ec.idx); - ret = bch_err_throw(c, trigger_stripe_pointer); - goto err; - } - - stripe_blockcount_set(&s->v, p.ec.block, - stripe_blockcount_get(&s->v, p.ec.block) + - sectors); - - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); - acc.replicas.data_type = data_type; - ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; - } - - if (flags & BTREE_TRIGGER_gc) { - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - (u64) p.ec.idx); - return bch_err_throw(c, ENOMEM_mark_stripe_ptr); - } - - gc_stripe_lock(m); - - if (!m || !m->alive) { - gc_stripe_unlock(m); - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ", - (u64) p.ec.idx); - bch2_bkey_val_to_text(&buf, c, k); - __bch2_inconsistent_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, trigger_stripe_pointer); - } - - m->block_sectors[p.ec.block] += sectors; - - struct disk_accounting_pos acc; - memset(&acc, 0, 
sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA"); - gc_stripe_unlock(m); - - acc.replicas.data_type = data_type; - int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, true); - if (ret) - return ret; - } - - return 0; -} - -static int __trigger_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - bool gc = flags & BTREE_TRIGGER_gc; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - enum bch_data_type data_type = bkey_is_btree_ptr(k.k) - ? BCH_DATA_btree - : BCH_DATA_user; - int ret = 0; - - s64 replicas_sectors = 0; - - struct disk_accounting_pos acc_replicas_key; - memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); - acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; - acc_replicas_key.replicas.data_type = data_type; - acc_replicas_key.replicas.nr_devs = 0; - acc_replicas_key.replicas.nr_required = 1; - - unsigned cur_compression_type = 0; - u64 compression_acct[3] = { 1, 0, 0 }; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = 0; - ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); - if (ret < 0) - return ret; - - bool stale = ret > 0; - - if (p.ptr.cached && stale) - continue; - - if (p.ptr.cached) { - ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc); - if (ret) - return ret; - } else if (!p.has_ec) { - replicas_sectors += disk_sectors; - replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); - } else { - ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); - if (ret) - return ret; - - /* - * There may be other dirty pointers in this extent, but - * if so they're not required for mounting if we have an - * erasure coded pointer in this extent: - */ - acc_replicas_key.replicas.nr_required = 0; - } - - if (cur_compression_type && - cur_compression_type != p.crc.compression_type) { - if (flags & BTREE_TRIGGER_overwrite) - bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - - ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, - compression, cur_compression_type); - if (ret) - return ret; - - compression_acct[0] = 1; - compression_acct[1] = 0; - compression_acct[2] = 0; - } - - cur_compression_type = p.crc.compression_type; - if (p.crc.compression_type) { - compression_acct[1] += p.crc.uncompressed_size; - compression_acct[2] += p.crc.compressed_size; - } - } - - if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); - if (ret) - return ret; - } - - if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot); - if (ret) - return ret; - } - - if (cur_compression_type) { - if (flags & BTREE_TRIGGER_overwrite) - bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - - ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, - compression, cur_compression_type); - if (ret) - return ret; - } - - if (level) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id); - if (ret) - return ret; - } else { - bool insert = !(flags & BTREE_TRIGGER_overwrite); - - s64 v[3] = { - insert ? 1 : -1, - insert ? 
k.k->size : -((s64) k.k->size), - replicas_sectors, - }; - ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); - if (ret) - return ret; - } - - return 0; -} - -int bch2_trigger_extent(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); - struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); - unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; - unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; - - if (unlikely(flags & BTREE_TRIGGER_check_repair)) - return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags); - - /* if pointers aren't changing - nothing to do: */ - if (new_ptrs_bytes == old_ptrs_bytes && - !memcmp(new_ptrs.start, - old_ptrs.start, - new_ptrs_bytes)) - return 0; - - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - if (old.k->type) { - int ret = __trigger_extent(trans, btree, level, old, - flags & ~BTREE_TRIGGER_insert); - if (ret) - return ret; - } - - if (new.k->type) { - int ret = __trigger_extent(trans, btree, level, new.s_c, - flags & ~BTREE_TRIGGER_overwrite); - if (ret) - return ret; - } - - int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta[1] = { 0 }; - - s64 s = bch2_bkey_sectors_need_rebalance(c, old); - need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta[0] -= s; - - s = bch2_bkey_sectors_need_rebalance(c, new.s_c); - need_rebalance_delta += s != 0; - need_rebalance_sectors_delta[0] += s; - - if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, need_rebalance_delta > 0); - if (ret) - return ret; - } - - if (need_rebalance_sectors_delta[0]) { - int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - need_rebalance_sectors_delta, rebalance_work); - if (ret) - return ret; - } - } - - return 0; -} - -/* KEY_TYPE_reservation */ - -static int __trigger_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 sectors[1] = { k.k->size }; - - if (flags & BTREE_TRIGGER_overwrite) - sectors[0] = -sectors[0]; - - return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, - persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); - } - - return 0; -} - -int bch2_trigger_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); -} - -/* Mark superblocks: */ - -static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - enum bch_data_type type, - unsigned sectors) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - int ret = 0; - - struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b)); - if (IS_ERR(a)) - return PTR_ERR(a); - - if (a->v.data_type && type && a->v.data_type != type) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s\n", - 
iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - bch2_data_type_str(type), - bch2_data_type_str(type)); - - bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); - - ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_allocations, 0); - - /* Always print, this is always fatal */ - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - if (!ret) - ret = bch_err_throw(c, metadata_bucket_inconsistency); - goto err; - } - - if (a->v.data_type != type || - a->v.dirty_sectors != sectors) { - a->v.data_type = type; - a->v.dirty_sectors = sectors; - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, - u64 b, enum bch_data_type data_type, unsigned sectors, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - struct bucket *g = gc_bucket(ca, b); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", - ca->dev_idx, bch2_data_type_str(data_type))) - goto err; - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g); - - if (bch2_fs_inconsistent_on(g->data_type && - g->data_type != data_type, c, - "different types of data in same bucket: %s, %s", - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type))) - goto err_unlock; - - if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, - "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", - ca->dev_idx, b, g->gen, - bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) - goto err_unlock; - - g->data_type = data_type; - g->dirty_sectors += sectors; - struct bch_alloc_v4 new = bucket_m_to_alloc(*g); - bucket_unlock(g); - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - return ret; -err_unlock: - bucket_unlock(g); -err: - return bch_err_throw(c, metadata_bucket_inconsistency); -} - -int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - enum bch_data_type type, unsigned sectors, - enum btree_iter_update_trigger_flags flags) -{ - BUG_ON(type != BCH_DATA_free && - type != BCH_DATA_sb && - type != BCH_DATA_journal); - - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return 0; - - if (flags & BTREE_TRIGGER_gc) - return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags); - else if (flags & BTREE_TRIGGER_transactional) - return commit_do(trans, NULL, NULL, 0, - __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); - else - BUG(); -} - -static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, - struct bch_dev *ca, u64 start, u64 end, - enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors, - enum btree_iter_update_trigger_flags flags) -{ - do { - u64 b = sector_to_bucket(ca, start); - unsigned sectors = - min_t(u64, bucket_to_sector(ca, b + 1), end) - start; - - if (b != *bucket && *bucket_sectors) { - int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, - type, *bucket_sectors, flags); - if (ret) - return ret; - - *bucket_sectors = 0; - } - - *bucket = b; - *bucket_sectors += sectors; - start += sectors; - } while (start < end); - - return 0; -} - -static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, - enum btree_iter_update_trigger_flags 
flags) -{ - struct bch_fs *c = trans->c; - - mutex_lock(&c->sb_lock); - struct bch_sb_layout layout = ca->disk_sb.sb->layout; - mutex_unlock(&c->sb_lock); - - u64 bucket = 0; - unsigned i, bucket_sectors = 0; - int ret; - - for (i = 0; i < layout.nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout.sb_offset[i]); - - if (offset == BCH_SB_SECTOR) { - ret = bch2_trans_mark_metadata_sectors(trans, ca, - 0, BCH_SB_SECTOR, - BCH_DATA_sb, &bucket, &bucket_sectors, flags); - if (ret) - return ret; - } - - ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, - offset + (1 << layout.sb_max_size_bits), - BCH_DATA_sb, &bucket, &bucket_sectors, flags); - if (ret) - return ret; - } - - if (bucket_sectors) { - ret = bch2_trans_mark_metadata_bucket(trans, ca, - bucket, BCH_DATA_sb, bucket_sectors, flags); - if (ret) - return ret; - } - - for (i = 0; i < ca->journal.nr; i++) { - ret = bch2_trans_mark_metadata_bucket(trans, ca, - ca->journal.buckets[i], - BCH_DATA_journal, ca->mi.bucket_size, flags); - if (ret) - return ret; - } - - return 0; -} - -int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, - enum btree_iter_update_trigger_flags flags) -{ - int ret = bch2_trans_run(c, - __bch2_trans_mark_dev_sb(trans, ca, flags)); - bch_err_fn(c, ret); - return ret; -} - -int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, - enum btree_iter_update_trigger_flags flags) -{ - for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) { - int ret = bch2_trans_mark_dev_sb(c, ca, flags); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs); - return ret; - } - } - - return 0; -} - -int bch2_trans_mark_dev_sbs(struct bch_fs *c) -{ - return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); -} - -bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - for (i = 0; i < ca->journal.nr; i++) - if (b == ca->journal.buckets[i]) - return true; - - return false; -} - -/* Disk reservations: */ - -#define SECTORS_CACHE 1024 - -int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, enum bch_reservation_flags flags) -{ - struct bch_fs_pcpu *pcpu; - u64 old, get; - u64 sectors_available; - int ret; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - pcpu = this_cpu_ptr(c->pcpu); - - if (sectors <= pcpu->sectors_available) - goto out; - - old = atomic64_read(&c->sectors_available); - do { - get = min((u64) sectors + SECTORS_CACHE, old); - - if (get < sectors) { - preempt_enable(); - goto recalculate; - } - } while (!atomic64_try_cmpxchg(&c->sectors_available, - &old, old - get)); - - pcpu->sectors_available += get; - -out: - pcpu->sectors_available -= sectors; - this_cpu_add(*c->online_reserved, sectors); - res->sectors += sectors; - - preempt_enable(); - percpu_up_read(&c->mark_lock); - return 0; - -recalculate: - mutex_lock(&c->sectors_available_lock); - - percpu_u64_set(&c->pcpu->sectors_available, 0); - sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); - - if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) - sectors = min(sectors, sectors_available); - - if (sectors 
<= sectors_available || - (flags & BCH_DISK_RESERVATION_NOFAIL)) { - atomic64_set(&c->sectors_available, - max_t(s64, 0, sectors_available - sectors)); - this_cpu_add(*c->online_reserved, sectors); - res->sectors += sectors; - ret = 0; - } else { - atomic64_set(&c->sectors_available, sectors_available); - ret = bch_err_throw(c, ENOSPC_disk_reservation); - } - - mutex_unlock(&c->sectors_available_lock); - percpu_up_read(&c->mark_lock); - - return ret; -} - -/* Startup/shutdown: */ - -void bch2_buckets_nouse_free(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - kvfree_rcu_mightsleep(ca->buckets_nouse); - ca->buckets_nouse = NULL; - } -} - -int bch2_buckets_nouse_alloc(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - BUG_ON(ca->buckets_nouse); - - ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO); - if (!ca->buckets_nouse) { - bch2_dev_put(ca); - return bch_err_throw(c, ENOMEM_buckets_nouse); - } - } - - return 0; -} - -static void bucket_gens_free_rcu(struct rcu_head *rcu) -{ - struct bucket_gens *buckets = - container_of(rcu, struct bucket_gens, rcu); - - kvfree(buckets); -} - -int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -{ - struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; - bool resize = ca->bucket_gens != NULL; - int ret; - - if (resize) - lockdep_assert_held(&c->state_lock); - - if (resize && ca->buckets_nouse) - return bch_err_throw(c, no_resize_with_buckets_nouse); - - bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), - GFP_KERNEL|__GFP_ZERO); - if (!bucket_gens) { - ret = bch_err_throw(c, ENOMEM_bucket_gens); - goto err; - } - - bucket_gens->first_bucket = ca->mi.first_bucket; - bucket_gens->nbuckets = nbuckets; - bucket_gens->nbuckets_minus_first = - bucket_gens->nbuckets - bucket_gens->first_bucket; - - old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); - - if (resize) { - u64 copy = min(bucket_gens->nbuckets, - old_bucket_gens->nbuckets); - memcpy(bucket_gens->b, - old_bucket_gens->b, - sizeof(bucket_gens->b[0]) * copy); - } - - ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch, - ca->mi.nbuckets, nbuckets) ?: - bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty, - ca->mi.nbuckets, nbuckets); - - rcu_assign_pointer(ca->bucket_gens, bucket_gens); - bucket_gens = old_bucket_gens; - - nbuckets = ca->mi.nbuckets; - - ret = 0; -err: - if (bucket_gens) - call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); - - return ret; -} - -void bch2_dev_buckets_free(struct bch_dev *ca) -{ - kvfree(ca->buckets_nouse); - kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); - free_percpu(ca->usage); -} - -int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) -{ - ca->usage = alloc_percpu(struct bch_dev_usage_full); - if (!ca->usage) - return bch_err_throw(c, ENOMEM_usage_init); - - return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); -} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h deleted file mode 100644 index 49a3807a5eabff..00000000000000 --- a/fs/bcachefs/buckets.h +++ /dev/null @@ -1,369 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Code for manipulating bucket marks for garbage collection. - * - * Copyright 2014 Datera, Inc. 
- */ - -#ifndef _BUCKETS_H -#define _BUCKETS_H - -#include "buckets_types.h" -#include "extents.h" -#include "sb-members.h" - -static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s) -{ - return div_u64(s, ca->mi.bucket_size); -} - -static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -{ - return ((sector_t) b) * ca->mi.bucket_size; -} - -static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -{ - u32 remainder; - - div_u64_rem(s, ca->mi.bucket_size, &remainder); - return remainder; -} - -static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset) -{ - return div_u64_rem(s, ca->mi.bucket_size, offset); -} - -#define for_each_bucket(_b, _buckets) \ - for (_b = (_buckets)->b + (_buckets)->first_bucket; \ - _b < (_buckets)->b + (_buckets)->nbuckets; _b++) - -static inline void bucket_unlock(struct bucket *b) -{ - BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); - - clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); - smp_mb__after_atomic(); - wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR); -} - -static inline void bucket_lock(struct bucket *b) -{ - wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR, - TASK_UNINTERRUPTIBLE); -} - -static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) -{ - return bucket_valid(ca, b) - ? genradix_ptr(&ca->buckets_gc, b) - : NULL; -} - -static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) -{ - return rcu_dereference_check(ca->bucket_gens, - lockdep_is_held(&ca->fs->state_lock)); -} - -static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) -{ - struct bucket_gens *gens = bucket_gens(ca); - - if (b - gens->first_bucket >= gens->nbuckets_minus_first) - return NULL; - return gens->b + b; -} - -static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) -{ - u8 *gen = bucket_gen(ca, b); - return gen ? *gen : -1; -} - -static inline int bucket_gen_get(struct bch_dev *ca, size_t b) -{ - guard(rcu)(); - return bucket_gen_get_rcu(ca, b); -} - -static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return sector_to_bucket(ca, ptr->offset); -} - -static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); -} - -static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr, - u32 *bucket_offset) -{ - return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); -} - -static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); -} - -static inline enum bch_data_type ptr_data_type(const struct bkey *k, - const struct bch_extent_ptr *ptr) -{ - if (bkey_is_btree_ptr(k)) - return BCH_DATA_btree; - - return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; -} - -static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) -{ - EBUG_ON(sectors < 0); - - return crc_is_compressed(p.crc) - ? 
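
bucket_gen() above rejects an out-of-range bucket with a single
comparison, b - gens->first_bucket >= gens->nbuckets_minus_first.
Because the subtraction is unsigned, an index below first_bucket
wraps to a huge value and fails the same test that catches indices
past the end. A small demonstration of that idiom (bucket_in_range()
is an illustrative helper, not from the source):

#include <assert.h>
#include <stddef.h>

/* One unsigned comparison covers both range ends: for b < first the
 * subtraction wraps to a huge value and fails, just as it does for
 * b >= nbuckets. */
static int bucket_in_range(size_t b, size_t first, size_t nbuckets)
{
        return b - first < nbuckets - first;
}

int main(void)
{
        assert(!bucket_in_range(3, 16, 1024));    /* below first: wraps huge */
        assert(bucket_in_range(16, 16, 1024));    /* first valid bucket */
        assert(bucket_in_range(1023, 16, 1024));  /* last valid bucket */
        assert(!bucket_in_range(1024, 16, 1024)); /* one past the end */
        return 0;
}
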
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, - p.crc.uncompressed_size) - : sectors; -} - -static inline int gen_cmp(u8 a, u8 b) -{ - return (s8) (a - b); -} - -static inline int gen_after(u8 a, u8 b) -{ - return max(0, gen_cmp(a, b)); -} - -static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) -{ - int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr)); - return gen < 0 ? gen : gen_after(gen, ptr->gen); -} - -/** - * dev_ptr_stale() - check if a pointer points into a bucket that has been - * invalidated. - */ -static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) -{ - guard(rcu)(); - return dev_ptr_stale_rcu(ca, ptr); -} - -/* Device usage: */ - -void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *); -static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) -{ - struct bch_dev_usage ret; - - bch2_dev_usage_read_fast(ca, &ret); - return ret; -} - -void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *); -static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca) -{ - struct bch_dev_usage_full ret; - - bch2_dev_usage_full_read_fast(ca, &ret); - return ret; -} - -void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *); - -static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) -{ - s64 reserved = 0; - - switch (watermark) { - case BCH_WATERMARK_NR: - BUG(); - case BCH_WATERMARK_stripe: - reserved += ca->mi.nbuckets >> 6; - fallthrough; - case BCH_WATERMARK_normal: - reserved += ca->mi.nbuckets >> 6; - fallthrough; - case BCH_WATERMARK_copygc: - reserved += ca->nr_btree_reserve; - fallthrough; - case BCH_WATERMARK_btree: - reserved += ca->nr_btree_reserve; - fallthrough; - case BCH_WATERMARK_btree_copygc: - case BCH_WATERMARK_reclaim: - case BCH_WATERMARK_interior_updates: - break; - } - - return reserved; -} - -static inline u64 dev_buckets_free(struct bch_dev *ca, - struct bch_dev_usage usage, - enum bch_watermark watermark) -{ - return max_t(s64, 0, - usage.buckets[BCH_DATA_free]- - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, watermark)); -} - -static inline u64 __dev_buckets_available(struct bch_dev *ca, - struct bch_dev_usage usage, - enum bch_watermark watermark) -{ - return max_t(s64, 0, - usage.buckets[BCH_DATA_free] - + usage.buckets[BCH_DATA_cached] - + usage.buckets[BCH_DATA_need_gc_gens] - + usage.buckets[BCH_DATA_need_discard] - - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, watermark)); -} - -static inline u64 dev_buckets_available(struct bch_dev *ca, - enum bch_watermark watermark) -{ - return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark); -} - -/* Filesystem usage: */ - -struct bch_fs_usage_short -bch2_fs_usage_read_short(struct bch_fs *); - -int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *, - struct bkey_s_c, const struct bch_extent_ptr *, - s64, enum bch_data_type, u8, u8, u32 *); - -int bch2_check_fix_ptrs(struct btree_trans *, - enum btree_id, unsigned, struct bkey_s_c, - enum btree_iter_update_trigger_flags); - -int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); -int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, 
_old, _new, _flags)\ -({ \ - int ret = 0; \ - \ - if (_old.k->type) \ - ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \ - if (!ret && _new.k->type) \ - ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\ - ret; \ -}) - -void bch2_trans_account_disk_usage_change(struct btree_trans *); - -int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64, - enum bch_data_type, unsigned, - enum btree_iter_update_trigger_flags); -int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *, - enum btree_iter_update_trigger_flags); -int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, - enum btree_iter_update_trigger_flags); -int bch2_trans_mark_dev_sbs(struct bch_fs *); - -bool bch2_is_superblock_bucket(struct bch_dev *, u64); - -static inline const char *bch2_data_type_str(enum bch_data_type type) -{ - return type < BCH_DATA_NR - ? __bch2_data_types[type] - : "(invalid data type)"; -} - -/* disk reservations: */ - -static inline void bch2_disk_reservation_put(struct bch_fs *c, - struct disk_reservation *res) -{ - if (res->sectors) { - this_cpu_sub(*c->online_reserved, res->sectors); - res->sectors = 0; - } -} - -enum bch_reservation_flags { - BCH_DISK_RESERVATION_NOFAIL = 1 << 0, - BCH_DISK_RESERVATION_PARTIAL = 1 << 1, -}; - -int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *, - u64, enum bch_reservation_flags); - -static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, enum bch_reservation_flags flags) -{ -#ifdef __KERNEL__ - u64 old, new; - - old = this_cpu_read(c->pcpu->sectors_available); - do { - if (sectors > old) - return __bch2_disk_reservation_add(c, res, sectors, flags); - - new = old - sectors; - } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new)); - - this_cpu_add(*c->online_reserved, sectors); - res->sectors += sectors; - return 0; -#else - return __bch2_disk_reservation_add(c, res, sectors, flags); -#endif -} - -static inline struct disk_reservation -bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) -{ - return (struct disk_reservation) { - .sectors = 0, -#if 0 - /* not used yet: */ - .gen = c->capacity_gen, -#endif - .nr_replicas = nr_replicas, - }; -} - -static inline int bch2_disk_reservation_get(struct bch_fs *c, - struct disk_reservation *res, - u64 sectors, unsigned nr_replicas, - int flags) -{ - *res = bch2_disk_reservation_init(c, nr_replicas); - - return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); -} - -#define RESERVE_FACTOR 6 - -static inline u64 avail_factor(u64 r) -{ - return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); -} - -void bch2_buckets_nouse_free(struct bch_fs *); -int bch2_buckets_nouse_alloc(struct bch_fs *); - -int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); -void bch2_dev_buckets_free(struct bch_dev *); -int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); - -#endif /* _BUCKETS_H */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h deleted file mode 100644 index 0aed2500ade32f..00000000000000 --- a/fs/bcachefs/buckets_types.h +++ /dev/null @@ -1,100 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BUCKETS_TYPES_H -#define _BUCKETS_TYPES_H - -#include "bcachefs_format.h" -#include "util.h" - -#define BUCKET_JOURNAL_SEQ_BITS 16 - -/* - * Ugly hack alert: - * - * We need to cram a spinlock in a single byte, because that's what we have left - * in struct bucket, and we care about the 
size of these - during fsck, we need - * in memory state for every single bucket on every device. - * - * We used to do - * while (xchg(&b->lock, 1) cpu_relax(); - * but, it turns out not all architectures support xchg on a single byte. - * - * So now we use bit_spin_lock(), with fun games since we can't burn a whole - * ulong for this - we just need to make sure the lock bit always ends up in the - * first byte. - */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define BUCKET_LOCK_BITNR 0 -#else -#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) -#endif - -union ulong_byte_assert { - ulong ulong; - u8 byte; -}; - -struct bucket { - u8 lock; - u8 gen_valid:1; - u8 data_type:7; - u8 gen; - u8 stripe_redundancy; - u32 stripe; - u32 dirty_sectors; - u32 cached_sectors; - u32 stripe_sectors; -} __aligned(sizeof(long)); - -struct bucket_gens { - struct rcu_head rcu; - u16 first_bucket; - size_t nbuckets; - size_t nbuckets_minus_first; - u8 b[] __counted_by(nbuckets); -}; - -/* Only info on bucket countns: */ -struct bch_dev_usage { - u64 buckets[BCH_DATA_NR]; -}; - -struct bch_dev_usage_full { - struct bch_dev_usage_type { - u64 buckets; - u64 sectors; /* _compressed_ sectors: */ - /* - * XXX - * Why do we have this? Isn't it just buckets * bucket_size - - * sectors? - */ - u64 fragmented; - } d[BCH_DATA_NR]; -}; - -struct bch_fs_usage_base { - u64 hidden; - u64 btree; - u64 data; - u64 cached; - u64 reserved; - u64 nr_inodes; -}; - -struct bch_fs_usage_short { - u64 capacity; - u64 used; - u64 free; - u64 nr_inodes; -}; - -/* - * A reservation for space on disk: - */ -struct disk_reservation { - u64 sectors; - u32 gen; - unsigned nr_replicas; -}; - -#endif /* _BUCKETS_TYPES_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c deleted file mode 100644 index 832eff93acb667..00000000000000 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "buckets_waiting_for_journal.h" -#include -#include - -static inline struct bucket_hashed * -bucket_hash(struct buckets_waiting_for_journal_table *t, - unsigned hash_seed_idx, u64 dev_bucket) -{ - return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits); -} - -static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits) -{ - unsigned i; - - t->bits = bits; - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) - get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); - memset(t->d, 0, sizeof(t->d[0]) << t->bits); -} - -u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, - unsigned dev, u64 bucket) -{ - struct buckets_waiting_for_journal_table *t; - u64 dev_bucket = (u64) dev << 56 | bucket; - u64 ret = 0; - - mutex_lock(&b->lock); - t = b->t; - - for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { - struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); - - if (h->dev_bucket == dev_bucket) { - ret = h->journal_seq; - break; - } - } - - mutex_unlock(&b->lock); - - return ret; -} - -static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, - struct bucket_hashed *new, - u64 flushed_seq) -{ - struct bucket_hashed *last_evicted = NULL; - unsigned tries, i; - - for (tries = 0; tries < 10; tries++) { - struct bucket_hashed *old, *victim = NULL; - - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { - old = bucket_hash(t, i, new->dev_bucket); - - if (old->dev_bucket == new->dev_bucket || - old->journal_seq <= flushed_seq) { - 
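
The "ugly hack" comment above explains why the bucket lock has to
live in a single byte while bit_spin_lock() operates on a whole
unsigned long: BUCKET_LOCK_BITNR is chosen per endianness so the lock
bit always lands in the first byte of the word. bucket_unlock() in
buckets.h asserts this with a BUILD_BUG_ON on a ulong/byte union; a
user-space rendering of the same check (BITS_PER_LONG approximated as
8 * sizeof(long)):

#include <stdio.h>

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define BUCKET_LOCK_BITNR 0
#else
#define BUCKET_LOCK_BITNR (8 * sizeof(long) - 1)
#endif

union ulong_byte_assert {
        unsigned long ulong;
        unsigned char byte;
};

int main(void)
{
        /* Nonzero iff the lock bit falls in byte 0 of the word, i.e.
         * in struct bucket's u8 lock field: */
        union ulong_byte_assert u = { .ulong = 1UL << BUCKET_LOCK_BITNR };

        printf("lock bit in byte 0: %s\n", u.byte ? "yes" : "no");
        return 0;
}
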
*old = *new; - return true; - } - - if (last_evicted != old) - victim = old; - } - - /* hashed to same slot 3 times: */ - if (!victim) - break; - - /* Failed to find an empty slot: */ - swap(*new, *victim); - last_evicted = victim; - } - - return false; -} - -int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, - u64 flushed_seq, - unsigned dev, u64 bucket, - u64 journal_seq) -{ - struct buckets_waiting_for_journal_table *t, *n; - struct bucket_hashed tmp, new = { - .dev_bucket = (u64) dev << 56 | bucket, - .journal_seq = journal_seq, - }; - size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0; - int ret = 0; - - mutex_lock(&b->lock); - - if (likely(bucket_table_insert(b->t, &new, flushed_seq))) - goto out; - - t = b->t; - size = 1UL << t->bits; - for (i = 0; i < size; i++) - nr_elements += t->d[i].journal_seq > flushed_seq; - - new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); -realloc: - n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); - if (!n) { - struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal); - ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set); - goto out; - } - -retry_rehash: - if (nr_rehashes_this_size == 3) { - new_bits++; - nr_rehashes_this_size = 0; - kvfree(n); - goto realloc; - } - - nr_rehashes++; - nr_rehashes_this_size++; - - bucket_table_init(n, new_bits); - - tmp = new; - BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); - - for (i = 0; i < 1UL << t->bits; i++) { - if (t->d[i].journal_seq <= flushed_seq) - continue; - - tmp = t->d[i]; - if (!bucket_table_insert(n, &tmp, flushed_seq)) - goto retry_rehash; - } - - b->t = n; - kvfree(t); - - pr_debug("took %zu rehashes, table at %zu/%lu elements", - nr_rehashes, nr_elements, 1UL << b->t->bits); -out: - mutex_unlock(&b->lock); - - return ret; -} - -void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) -{ - struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; - - kvfree(b->t); -} - -#define INITIAL_TABLE_BITS 3 - -int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) -{ - struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; - - mutex_init(&b->lock); - - b->t = kvmalloc(sizeof(*b->t) + - (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL); - if (!b->t) - return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init; - - bucket_table_init(b->t, INITIAL_TABLE_BITS); - return 0; -} diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h deleted file mode 100644 index 365619ca44c87e..00000000000000 --- a/fs/bcachefs/buckets_waiting_for_journal.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H -#define _BUCKETS_WAITING_FOR_JOURNAL_H - -#include "buckets_waiting_for_journal_types.h" - -u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *, - unsigned, u64); -int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, - u64, unsigned, u64, u64); - -void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); -int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); - -#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h deleted file mode 100644 index e593db061d81b2..00000000000000 --- a/fs/bcachefs/buckets_waiting_for_journal_types.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 
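
The buckets-waiting-for-journal table above keys each entry on a
single u64 that packs the device index into the top 8 bits and the
bucket number into the low 56: (u64) dev << 56 | bucket. A tiny
sketch of the packing and how to take it apart again
(pack_dev_bucket() is an illustrative name):

#include <stdint.h>
#include <stdio.h>

static uint64_t pack_dev_bucket(unsigned dev, uint64_t bucket)
{
        /* dev in bits 56..63, bucket in bits 0..55 */
        return (uint64_t) dev << 56 | bucket;
}

int main(void)
{
        uint64_t k = pack_dev_bucket(3, 123456);

        printf("dev=%u bucket=%llu\n",
               (unsigned) (k >> 56),
               (unsigned long long) (k & ((1ULL << 56) - 1)));
        return 0;
}
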
*/ -#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H -#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H - -#include - -struct bucket_hashed { - u64 dev_bucket; - u64 journal_seq; -}; - -struct buckets_waiting_for_journal_table { - unsigned bits; - u64 hash_seeds[3]; - struct bucket_hashed d[]; -}; - -struct buckets_waiting_for_journal { - struct mutex lock; - struct buckets_waiting_for_journal_table *t; -}; - -#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c deleted file mode 100644 index 5ea89aa2b0c42a..00000000000000 --- a/fs/bcachefs/chardev.c +++ /dev/null @@ -1,843 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_CHARDEV - -#include "bcachefs.h" -#include "bcachefs_ioctl.h" -#include "buckets.h" -#include "chardev.h" -#include "disk_accounting.h" -#include "fsck.h" -#include "journal.h" -#include "move.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-counters.h" -#include "super-io.h" -#include "thread_with_file.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -/* returns with ref on ca->ref */ -static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, - unsigned flags) -{ - struct bch_dev *ca; - - if (flags & BCH_BY_INDEX) { - if (dev >= c->sb.nr_devices) - return ERR_PTR(-EINVAL); - - ca = bch2_dev_tryget_noerror(c, dev); - if (!ca) - return ERR_PTR(-EINVAL); - } else { - char *path; - - path = strndup_user((const char __user *) - (unsigned long) dev, PATH_MAX); - if (IS_ERR(path)) - return ERR_CAST(path); - - ca = bch2_dev_lookup(c, path); - kfree(path); - } - - return ca; -} - -#if 0 -static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) -{ - struct bch_ioctl_assemble arg; - struct bch_fs *c; - u64 *user_devs = NULL; - char **devs = NULL; - unsigned i; - int ret = -EFAULT; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); - if (!user_devs) - return -ENOMEM; - - devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); - - if (copy_from_user(user_devs, user_arg->devs, - sizeof(u64) * arg.nr_devs)) - goto err; - - for (i = 0; i < arg.nr_devs; i++) { - devs[i] = strndup_user((const char __user *)(unsigned long) - user_devs[i], - PATH_MAX); - ret= PTR_ERR_OR_ZERO(devs[i]); - if (ret) - goto err; - } - - c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); - ret = PTR_ERR_OR_ZERO(c); - if (!ret) - closure_put(&c->cl); -err: - if (devs) - for (i = 0; i < arg.nr_devs; i++) - kfree(devs[i]); - kfree(devs); - return ret; -} - -static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) -{ - struct bch_ioctl_incremental arg; - const char *err; - char *path; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - ret = PTR_ERR_OR_ZERO(path); - if (ret) - return ret; - - err = bch2_fs_open_incremental(path); - kfree(path); - - if (err) { - pr_err("Could not register bcachefs devices: %s", err); - return -EINVAL; - } - - return 0; -} -#endif - -static long bch2_global_ioctl(unsigned cmd, void __user *arg) -{ - long ret; - - switch (cmd) { -#if 0 - case BCH_IOCTL_ASSEMBLE: - return bch2_ioctl_assemble(arg); - case BCH_IOCTL_INCREMENTAL: - return bch2_ioctl_incremental(arg); -#endif - case BCH_IOCTL_FSCK_OFFLINE: { - ret = 
bch2_ioctl_fsck_offline(arg); - break; - } - default: - ret = -ENOTTY; - break; - } - - if (ret < 0) - ret = bch2_err_class(ret); - return ret; -} - -static long bch2_ioctl_query_uuid(struct bch_fs *c, - struct bch_ioctl_query_uuid __user *user_arg) -{ - return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid, - sizeof(c->sb.user_uuid)); -} - -#if 0 -static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (arg.flags || arg.pad) - return -EINVAL; - - return bch2_fs_start(c); -} - -static long bch2_ioctl_stop(struct bch_fs *c) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - bch2_fs_stop(c); - return 0; -} -#endif - -static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - char *path; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - ret = PTR_ERR_OR_ZERO(path); - if (ret) - return ret; - - ret = bch2_dev_add(c, path); - if (!IS_ERR(path)) - kfree(path); - - return ret; -} - -static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - struct bch_dev *ca; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| - BCH_FORCE_IF_METADATA_LOST| - BCH_FORCE_IF_DEGRADED| - BCH_BY_INDEX)) || - arg.pad) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - return bch2_dev_remove(c, ca, arg.flags); -} - -static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - char *path; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - ret = PTR_ERR_OR_ZERO(path); - if (ret) - return ret; - - ret = bch2_dev_online(c, path); - kfree(path); - return ret; -} - -static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| - BCH_FORCE_IF_METADATA_LOST| - BCH_FORCE_IF_DEGRADED| - BCH_BY_INDEX)) || - arg.pad) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_dev_offline(c, ca, arg.flags); - bch2_dev_put(ca); - return ret; -} - -static long bch2_ioctl_disk_set_state(struct bch_fs *c, - struct bch_ioctl_disk_set_state arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| - BCH_FORCE_IF_METADATA_LOST| - BCH_FORCE_IF_DEGRADED| - BCH_BY_INDEX)) || - arg.pad[0] || arg.pad[1] || arg.pad[2] || - arg.new_state >= BCH_MEMBER_STATE_NR) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); - if (ret) - bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); - - bch2_dev_put(ca); - return ret; -} - -struct bch_data_ctx { - struct thread_with_file thr; - - struct bch_fs *c; - struct bch_ioctl_data arg; - struct bch_move_stats stats; -}; - -static int bch2_data_thread(void *arg) -{ - struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); - - ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); - if (ctx->thr.ret == -BCH_ERR_device_offline) - 
ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; - else { - ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; - ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; - } - enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data); - return 0; -} - -static int bch2_data_job_release(struct inode *inode, struct file *file) -{ - struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - - bch2_thread_with_file_exit(&ctx->thr); - kfree(ctx); - return 0; -} - -static ssize_t bch2_data_job_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - struct bch_fs *c = ctx->c; - struct bch_ioctl_data_event e = { - .type = BCH_DATA_EVENT_PROGRESS, - .ret = ctx->stats.ret, - .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.pos.btree, - .p.pos = ctx->stats.pos.pos, - .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), - .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), - }; - - if (ctx->arg.op == BCH_DATA_OP_scrub) { - struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); - if (ca) { - struct bch_dev_usage_full u; - bch2_dev_usage_full_read_fast(ca, &u); - for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) - if (ctx->arg.scrub.data_types & BIT(i)) - e.p.sectors_total += u.d[i].sectors; - bch2_dev_put(ca); - } - } else { - e.p.sectors_total = bch2_fs_usage_read_short(c).used; - } - - if (len < sizeof(e)) - return -EINVAL; - - return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e); -} - -static const struct file_operations bcachefs_data_ops = { - .release = bch2_data_job_release, - .read = bch2_data_job_read, -}; - -static long bch2_ioctl_data(struct bch_fs *c, - struct bch_ioctl_data arg) -{ - struct bch_data_ctx *ctx; - int ret; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data)) - return -EROFS; - - if (!capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto put_ref; - } - - if (arg.op >= BCH_DATA_OP_NR || arg.flags) { - ret = -EINVAL; - goto put_ref; - } - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { - ret = -ENOMEM; - goto put_ref; - } - - ctx->c = c; - ctx->arg = arg; - - ret = bch2_run_thread_with_file(&ctx->thr, - &bcachefs_data_ops, - bch2_data_thread); - if (ret < 0) - goto cleanup; - return ret; -cleanup: - kfree(ctx); -put_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data); - return ret; -} - -static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c, - struct bch_ioctl_fs_usage __user *user_arg) -{ - struct bch_ioctl_fs_usage arg = {}; - darray_char replicas = {}; - u32 replica_entries_bytes; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) - return -EFAULT; - - ret = bch2_fs_replicas_usage_read(c, &replicas) ?: - (replica_entries_bytes < replicas.nr ? 
-ERANGE : 0) ?: - copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr); - if (ret) - goto err; - - struct bch_fs_usage_short u = bch2_fs_usage_read_short(c); - arg.capacity = c->capacity; - arg.used = u.used; - arg.online_reserved = percpu_u64_get(c->online_reserved); - arg.replica_entries_bytes = replicas.nr; - - for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { - struct disk_accounting_pos k; - disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i); - - bch2_accounting_mem_read(c, - disk_accounting_pos_to_bpos(&k), - &arg.persistent_reserved[i], 1); - } - - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -err: - darray_exit(&replicas); - return ret; -} - -static long bch2_ioctl_query_accounting(struct bch_fs *c, - struct bch_ioctl_query_accounting __user *user_arg) -{ - struct bch_ioctl_query_accounting arg; - darray_char accounting = {}; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?: - bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?: - (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?: - copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr); - if (ret) - goto err; - - arg.capacity = c->capacity; - arg.used = bch2_fs_usage_read_short(c).used; - arg.online_reserved = percpu_u64_get(c->online_reserved); - arg.accounting_u64s = accounting.nr / sizeof(u64); - - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -err: - darray_exit(&accounting); - return ret; -} - -/* obsolete, didn't allow for new data types: */ -static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c, - struct bch_ioctl_dev_usage __user *user_arg) -{ - struct bch_ioctl_dev_usage arg; - struct bch_dev_usage_full src; - struct bch_dev *ca; - unsigned i; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad[0] || - arg.pad[1] || - arg.pad[2]) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - src = bch2_dev_usage_full_read(ca); - - arg.state = ca->mi.state; - arg.bucket_size = ca->mi.bucket_size; - arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - - for (i = 0; i < ARRAY_SIZE(arg.d); i++) { - arg.d[i].buckets = src.d[i].buckets; - arg.d[i].sectors = src.d[i].sectors; - arg.d[i].fragmented = src.d[i].fragmented; - } - - bch2_dev_put(ca); - - return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -} - -static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, - struct bch_ioctl_dev_usage_v2 __user *user_arg) -{ - struct bch_ioctl_dev_usage_v2 arg; - struct bch_dev_usage_full src; - struct bch_dev *ca; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad[0] || - arg.pad[1] || - arg.pad[2]) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - src = bch2_dev_usage_full_read(ca); - - arg.state = ca->mi.state; - arg.bucket_size = ca->mi.bucket_size; - arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR); - arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); - if (ret) - goto err; - - for (unsigned i = 0; i < arg.nr_data_types; i++) { - struct 
bch_ioctl_dev_usage_type t = { - .buckets = src.d[i].buckets, - .sectors = src.d[i].sectors, - .fragmented = src.d[i].fragmented, - }; - - ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t)); - if (ret) - goto err; - } -err: - bch2_dev_put(ca); - return ret; -} - -static long bch2_ioctl_read_super(struct bch_fs *c, - struct bch_ioctl_read_super arg) -{ - struct bch_dev *ca = NULL; - struct bch_sb *sb; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || - arg.pad) - return -EINVAL; - - mutex_lock(&c->sb_lock); - - if (arg.flags & BCH_READ_DEV) { - ca = bch2_device_lookup(c, arg.dev, arg.flags); - ret = PTR_ERR_OR_ZERO(ca); - if (ret) - goto err_unlock; - - sb = ca->disk_sb.sb; - } else { - sb = c->disk_sb.sb; - } - - if (vstruct_bytes(sb) > arg.size) { - ret = -ERANGE; - goto err; - } - - ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb, - vstruct_bytes(sb)); -err: - bch2_dev_put(ca); -err_unlock: - mutex_unlock(&c->sb_lock); - return ret; -} - -static long bch2_ioctl_disk_get_idx(struct bch_fs *c, - struct bch_ioctl_disk_get_idx arg) -{ - dev_t dev = huge_decode_dev(arg.dev); - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!dev) - return -EINVAL; - - guard(rcu)(); - for_each_online_member_rcu(c, ca) - if (ca->dev == dev) - return ca->dev_idx; - - return bch_err_throw(c, ENOENT_dev_idx_not_found); -} - -static long bch2_ioctl_disk_resize(struct bch_fs *c, - struct bch_ioctl_disk_resize arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_dev_resize(c, ca, arg.nbuckets); - - bch2_dev_put(ca); - return ret; -} - -static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, - struct bch_ioctl_disk_resize_journal arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad) - return -EINVAL; - - if (arg.nbuckets > U32_MAX) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); - - bch2_dev_put(ca); - return ret; -} - -#define BCH_IOCTL(_name, _argtype) \ -do { \ - _argtype i; \ - \ - if (copy_from_user(&i, arg, sizeof(i))) \ - return -EFAULT; \ - ret = bch2_ioctl_##_name(c, i); \ - goto out; \ -} while (0) - -long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) -{ - long ret; - - switch (cmd) { - case BCH_IOCTL_QUERY_UUID: - return bch2_ioctl_query_uuid(c, arg); - case BCH_IOCTL_FS_USAGE: - return bch2_ioctl_fs_usage(c, arg); - case BCH_IOCTL_DEV_USAGE: - return bch2_ioctl_dev_usage(c, arg); - case BCH_IOCTL_DEV_USAGE_V2: - return bch2_ioctl_dev_usage_v2(c, arg); -#if 0 - case BCH_IOCTL_START: - BCH_IOCTL(start, struct bch_ioctl_start); - case BCH_IOCTL_STOP: - return bch2_ioctl_stop(c); -#endif - case BCH_IOCTL_READ_SUPER: - BCH_IOCTL(read_super, struct bch_ioctl_read_super); - case BCH_IOCTL_DISK_GET_IDX: - BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); - } - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - switch (cmd) { - case BCH_IOCTL_DISK_ADD: - BCH_IOCTL(disk_add, struct bch_ioctl_disk); - case BCH_IOCTL_DISK_REMOVE: - BCH_IOCTL(disk_remove, struct bch_ioctl_disk); - case BCH_IOCTL_DISK_ONLINE: - BCH_IOCTL(disk_online, struct bch_ioctl_disk); - 
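- /*
-  * For reference: each BCH_IOCTL() case arm expands per the macro
-  * defined above, so BCH_IOCTL(disk_offline, struct bch_ioctl_disk)
-  * becomes roughly:
-  *
-  *	do {
-  *		struct bch_ioctl_disk i;
-  *
-  *		if (copy_from_user(&i, arg, sizeof(i)))
-  *			return -EFAULT;
-  *		ret = bch2_ioctl_disk_offline(c, i);
-  *		goto out;
-  *	} while (0);
-  */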
case BCH_IOCTL_DISK_OFFLINE: - BCH_IOCTL(disk_offline, struct bch_ioctl_disk); - case BCH_IOCTL_DISK_SET_STATE: - BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); - case BCH_IOCTL_DATA: - BCH_IOCTL(data, struct bch_ioctl_data); - case BCH_IOCTL_DISK_RESIZE: - BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); - case BCH_IOCTL_DISK_RESIZE_JOURNAL: - BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); - case BCH_IOCTL_FSCK_ONLINE: - BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); - case BCH_IOCTL_QUERY_ACCOUNTING: - return bch2_ioctl_query_accounting(c, arg); - case BCH_IOCTL_QUERY_COUNTERS: - return bch2_ioctl_query_counters(c, arg); - default: - return -ENOTTY; - } -out: - if (ret < 0) - ret = bch2_err_class(ret); - return ret; -} - -static DEFINE_IDR(bch_chardev_minor); - -static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) -{ - unsigned minor = iminor(file_inode(filp)); - struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; - void __user *arg = (void __user *) v; - - return c - ? bch2_fs_ioctl(c, cmd, arg) - : bch2_global_ioctl(cmd, arg); -} - -static const struct file_operations bch_chardev_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = bch2_chardev_ioctl, - .open = nonseekable_open, -}; - -static int bch_chardev_major; -static const struct class bch_chardev_class = { - .name = "bcachefs", -}; -static struct device *bch_chardev; - -void bch2_fs_chardev_exit(struct bch_fs *c) -{ - if (!IS_ERR_OR_NULL(c->chardev)) - device_unregister(c->chardev); - if (c->minor >= 0) - idr_remove(&bch_chardev_minor, c->minor); -} - -int bch2_fs_chardev_init(struct bch_fs *c) -{ - c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); - if (c->minor < 0) - return c->minor; - - c->chardev = device_create(&bch_chardev_class, NULL, - MKDEV(bch_chardev_major, c->minor), c, - "bcachefs%u-ctl", c->minor); - if (IS_ERR(c->chardev)) - return PTR_ERR(c->chardev); - - return 0; -} - -void bch2_chardev_exit(void) -{ - device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX)); - class_unregister(&bch_chardev_class); - if (bch_chardev_major > 0) - unregister_chrdev(bch_chardev_major, "bcachefs"); -} - -int __init bch2_chardev_init(void) -{ - int ret; - - bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); - if (bch_chardev_major < 0) - return bch_chardev_major; - - ret = class_register(&bch_chardev_class); - if (ret) - goto major_out; - - bch_chardev = device_create(&bch_chardev_class, NULL, - MKDEV(bch_chardev_major, U8_MAX), - NULL, "bcachefs-ctl"); - if (IS_ERR(bch_chardev)) { - ret = PTR_ERR(bch_chardev); - goto class_out; - } - - return 0; - -class_out: - class_unregister(&bch_chardev_class); -major_out: - unregister_chrdev(bch_chardev_major, "bcachefs-ctl"); - return ret; -} - -#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h deleted file mode 100644 index 0f563ca53c36e7..00000000000000 --- a/fs/bcachefs/chardev.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CHARDEV_H -#define _BCACHEFS_CHARDEV_H - -#ifndef NO_BCACHEFS_FS - -long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); - -void bch2_fs_chardev_exit(struct bch_fs *); -int bch2_fs_chardev_init(struct bch_fs *); - -void bch2_chardev_exit(void); -int __init bch2_chardev_init(void); - -#else - -static inline long bch2_fs_ioctl(struct bch_fs *c, - unsigned cmd, void __user * arg) -{ - return -ENOTTY; -} - -static inline void 
bch2_fs_chardev_exit(struct bch_fs *c) {} -static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } - -static inline void bch2_chardev_exit(void) {} -static inline int __init bch2_chardev_init(void) { return 0; } - -#endif /* NO_BCACHEFS_FS */ - -#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c deleted file mode 100644 index a6795e73f0b93f..00000000000000 --- a/fs/bcachefs/checksum.c +++ /dev/null @@ -1,698 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "checksum.h" -#include "errcode.h" -#include "error.h" -#include "super.h" -#include "super-io.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * bch2_checksum_state is an abstraction of the checksum state calculated over different pages. - * It supports page merging without the checksum algorithm losing its state. - * For native checksum algorithms (like crc), a default seed value will do. - * For hash-like algorithms, a state needs to be stored. - */ - -struct bch2_checksum_state { - union { - u64 seed; - struct xxh64_state h64state; - }; - unsigned int type; -}; - -static void bch2_checksum_init(struct bch2_checksum_state *state) -{ - switch (state->type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c: - case BCH_CSUM_crc64: - state->seed = 0; - break; - case BCH_CSUM_crc32c_nonzero: - state->seed = U32_MAX; - break; - case BCH_CSUM_crc64_nonzero: - state->seed = U64_MAX; - break; - case BCH_CSUM_xxhash: - xxh64_reset(&state->h64state, 0); - break; - default: - BUG(); - } -} - -static u64 bch2_checksum_final(const struct bch2_checksum_state *state) -{ - switch (state->type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c: - case BCH_CSUM_crc64: - return state->seed; - case BCH_CSUM_crc32c_nonzero: - return state->seed ^ U32_MAX; - case BCH_CSUM_crc64_nonzero: - return state->seed ^ U64_MAX; - case BCH_CSUM_xxhash: - return xxh64_digest(&state->h64state); - default: - BUG(); - } -} - -static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) -{ - switch (state->type) { - case BCH_CSUM_none: - return; - case BCH_CSUM_crc32c_nonzero: - case BCH_CSUM_crc32c: - state->seed = crc32c(state->seed, data, len); - break; - case BCH_CSUM_crc64_nonzero: - case BCH_CSUM_crc64: - state->seed = crc64_be(state->seed, data, len); - break; - case BCH_CSUM_xxhash: - xxh64_update(&state->h64state, data, len); - break; - default: - BUG(); - } -} - -static void bch2_chacha20_init(struct chacha_state *state, - const struct bch_key *key, struct nonce nonce) -{ - u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)]; - - BUILD_BUG_ON(sizeof(key_words) != sizeof(*key)); - memcpy(key_words, key, sizeof(key_words)); - le32_to_cpu_array(key_words, ARRAY_SIZE(key_words)); - - BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE); - chacha_init(state, key_words, (const u8 *)nonce.d); - - memzero_explicit(key_words, sizeof(key_words)); -} - -void bch2_chacha20(const struct bch_key *key, struct nonce nonce, - void *data, size_t len) -{ - struct chacha_state state; - - bch2_chacha20_init(&state, key, nonce); - chacha20_crypt(&state, data, data, len); - chacha_zeroize_state(&state); -} - -static void bch2_poly1305_init(struct poly1305_desc_ctx *desc, - struct bch_fs *c, struct nonce nonce) -{ - u8 key[POLY1305_KEY_SIZE] = { 0 }; - - nonce.d[3] ^= BCH_NONCE_POLY; - - bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key)); - poly1305_init(desc, key); -} - -struct bch_csum bch2_checksum(struct bch_fs *c, unsigned
type, - struct nonce nonce, const void *data, size_t len) -{ - switch (type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c_nonzero: - case BCH_CSUM_crc64_nonzero: - case BCH_CSUM_crc32c: - case BCH_CSUM_xxhash: - case BCH_CSUM_crc64: { - struct bch2_checksum_state state; - - state.type = type; - - bch2_checksum_init(&state); - bch2_checksum_update(&state, data, len); - - return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; - } - - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: { - struct poly1305_desc_ctx dctx; - u8 digest[POLY1305_DIGEST_SIZE]; - struct bch_csum ret = { 0 }; - - bch2_poly1305_init(&dctx, c, nonce); - poly1305_update(&dctx, data, len); - poly1305_final(&dctx, digest); - - memcpy(&ret, digest, bch_crc_bytes[type]); - return ret; - } - default: - return (struct bch_csum) {}; - } -} - -int bch2_encrypt(struct bch_fs *c, unsigned type, - struct nonce nonce, void *data, size_t len) -{ - if (!bch2_csum_type_is_encryption(type)) - return 0; - - if (bch2_fs_inconsistent_on(!c->chacha20_key_set, - c, "attempting to encrypt without encryption key")) - return bch_err_throw(c, no_encryption_key); - - bch2_chacha20(&c->chacha20_key, nonce, data, len); - return 0; -} - -static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv; - - switch (type) { - case BCH_CSUM_none: - return (struct bch_csum) { 0 }; - case BCH_CSUM_crc32c_nonzero: - case BCH_CSUM_crc64_nonzero: - case BCH_CSUM_crc32c: - case BCH_CSUM_xxhash: - case BCH_CSUM_crc64: { - struct bch2_checksum_state state; - - state.type = type; - bch2_checksum_init(&state); - -#ifdef CONFIG_HIGHMEM - __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; - - bch2_checksum_update(&state, p, bv.bv_len); - kunmap_local(p); - } -#else - __bio_for_each_bvec(bv, bio, *iter, *iter) - bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, - bv.bv_len); -#endif - return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; - } - - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: { - struct poly1305_desc_ctx dctx; - u8 digest[POLY1305_DIGEST_SIZE]; - struct bch_csum ret = { 0 }; - - bch2_poly1305_init(&dctx, c, nonce); - -#ifdef CONFIG_HIGHMEM - __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; - - poly1305_update(&dctx, p, bv.bv_len); - kunmap_local(p); - } -#else - __bio_for_each_bvec(bv, bio, *iter, *iter) - poly1305_update(&dctx, - page_address(bv.bv_page) + bv.bv_offset, - bv.bv_len); -#endif - poly1305_final(&dctx, digest); - - memcpy(&ret, digest, bch_crc_bytes[type]); - return ret; - } - default: - return (struct bch_csum) {}; - } -} - -struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - struct bvec_iter iter = bio->bi_iter; - - return __bch2_checksum_bio(c, type, nonce, bio, &iter); -} - -int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - struct bio_vec bv; - struct bvec_iter iter; - struct chacha_state chacha_state; - int ret = 0; - - if (bch2_fs_inconsistent_on(!c->chacha20_key_set, - c, "attempting to encrypt without encryption key")) - return bch_err_throw(c, no_encryption_key); - - bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce); - - bio_for_each_segment(bv, bio, iter) { - void *p; - - /* - * 
chacha_crypt() assumes that the length is a multiple of - * CHACHA_BLOCK_SIZE on any non-final call. - */ - if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) { - bch_err_ratelimited(c, "bio not aligned for encryption"); - ret = -EIO; - break; - } - - p = bvec_kmap_local(&bv); - chacha20_crypt(&chacha_state, p, p, bv.bv_len); - kunmap_local(p); - } - chacha_zeroize_state(&chacha_state); - return ret; -} - -struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, - struct bch_csum b, size_t b_len) -{ - struct bch2_checksum_state state; - - state.type = type; - bch2_checksum_init(&state); - state.seed = le64_to_cpu(a.lo); - - BUG_ON(!bch2_checksum_mergeable(type)); - - while (b_len) { - unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE); - - bch2_checksum_update(&state, - page_address(ZERO_PAGE(0)), page_len); - b_len -= page_len; - } - a.lo = cpu_to_le64(bch2_checksum_final(&state)); - a.lo ^= b.lo; - a.hi ^= b.hi; - return a; -} - -int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, - struct bversion version, - struct bch_extent_crc_unpacked crc_old, - struct bch_extent_crc_unpacked *crc_a, - struct bch_extent_crc_unpacked *crc_b, - unsigned len_a, unsigned len_b, - unsigned new_csum_type) -{ - struct bvec_iter iter = bio->bi_iter; - struct nonce nonce = extent_nonce(version, crc_old); - struct bch_csum merged = { 0 }; - struct crc_split { - struct bch_extent_crc_unpacked *crc; - unsigned len; - unsigned csum_type; - struct bch_csum csum; - } splits[3] = { - { crc_a, len_a, new_csum_type, { 0 }}, - { crc_b, len_b, new_csum_type, { 0 } }, - { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } }, - }, *i; - bool mergeable = crc_old.csum_type == new_csum_type && - bch2_checksum_mergeable(new_csum_type); - unsigned crc_nonce = crc_old.nonce; - - BUG_ON(len_a + len_b > bio_sectors(bio)); - BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); - BUG_ON(crc_is_compressed(crc_old)); - BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != - bch2_csum_type_is_encryption(new_csum_type)); - - for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { - iter.bi_size = i->len << 9; - if (mergeable || i->crc) - i->csum = __bch2_checksum_bio(c, i->csum_type, - nonce, bio, &iter); - else - bio_advance_iter(bio, &iter, i->len << 9); - nonce = nonce_add(nonce, i->len << 9); - } - - if (mergeable) - for (i = splits; i < splits + ARRAY_SIZE(splits); i++) - merged = bch2_checksum_merge(new_csum_type, merged, - i->csum, i->len << 9); - else - merged = bch2_checksum_bio(c, crc_old.csum_type, - extent_nonce(version, crc_old), bio); - - if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" - " expected %0llx:%0llx got %0llx:%0llx (old type ", - __func__, - crc_old.csum.hi, - crc_old.csum.lo, - merged.hi, - merged.lo); - bch2_prt_csum_type(&buf, crc_old.csum_type); - prt_str(&buf, " new type "); - bch2_prt_csum_type(&buf, new_csum_type); - prt_str(&buf, ")"); - WARN_RATELIMIT(1, "%s", buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, recompute_checksum); - } - - for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { - if (i->crc) - *i->crc = (struct bch_extent_crc_unpacked) { - .csum_type = i->csum_type, - .compression_type = crc_old.compression_type, - .compressed_size = i->len, - .uncompressed_size = i->len, - .offset = 0, - .live_size = i->len, - .nonce = crc_nonce, - .csum = i->csum, - }; - - if (bch2_csum_type_is_encryption(new_csum_type)) - crc_nonce 
+= i->len; - } - - return 0; -} - -/* BCH_SB_FIELD_crypt: */ - -static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&crypt->field), sizeof(*crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - if (BCH_CRYPT_KDF_TYPE(crypt)) { - prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - return 0; -} - -static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - prt_printf(out, "KDF: %llu\n", BCH_CRYPT_KDF_TYPE(crypt)); - prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt)); - prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt)); - prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt)); -} - -const struct bch_sb_field_ops bch_sb_field_ops_crypt = { - .validate = bch2_sb_crypt_validate, - .to_text = bch2_sb_crypt_to_text, -}; - -#ifdef __KERNEL__ -static int __bch2_request_key(char *key_description, struct bch_key *key) -{ - struct key *keyring_key; - const struct user_key_payload *ukp; - int ret; - - keyring_key = request_key(&key_type_user, key_description, NULL); - if (IS_ERR(keyring_key)) - return PTR_ERR(keyring_key); - - down_read(&keyring_key->sem); - ukp = dereference_key_locked(keyring_key); - if (ukp->datalen == sizeof(*key)) { - memcpy(key, ukp->data, ukp->datalen); - ret = 0; - } else { - ret = -EINVAL; - } - up_read(&keyring_key->sem); - key_put(keyring_key); - - return ret; -} -#else -#include - -static int __bch2_request_key(char *key_description, struct bch_key *key) -{ - key_serial_t key_id; - - key_id = request_key("user", key_description, NULL, - KEY_SPEC_SESSION_KEYRING); - if (key_id >= 0) - goto got_key; - - key_id = request_key("user", key_description, NULL, - KEY_SPEC_USER_KEYRING); - if (key_id >= 0) - goto got_key; - - key_id = request_key("user", key_description, NULL, - KEY_SPEC_USER_SESSION_KEYRING); - if (key_id >= 0) - goto got_key; - - return -errno; got_key: - - if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) - return -1; - - return 0; -} - -#include "crypto.h" -#endif - -int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -{ - struct printbuf key_description = PRINTBUF; - int ret; - - prt_printf(&key_description, "bcachefs:"); - pr_uuid(&key_description, sb->user_uuid.b); - - ret = __bch2_request_key(key_description.buf, key); - printbuf_exit(&key_description); - -#ifndef __KERNEL__ - if (ret) { - char *passphrase = read_passphrase("Enter passphrase: "); - struct bch_encrypted_key sb_key; - - bch2_passphrase_check(sb, passphrase, - key, &sb_key); - ret = 0; - } -#endif - - /* stash with memfd, pass memfd fd to mount */ - - return ret; -} - -#ifndef __KERNEL__ -int bch2_revoke_key(struct bch_sb *sb) -{ - key_serial_t key_id; - struct printbuf key_description = PRINTBUF; - - prt_printf(&key_description, "bcachefs:"); - pr_uuid(&key_description, sb->user_uuid.b); - - key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING); - printbuf_exit(&key_description); - if (key_id < 0) - return errno; - - keyctl_revoke(key_id); - - return 0; -} -#endif - -int bch2_decrypt_sb_key(struct bch_fs *c, - struct bch_sb_field_crypt *crypt, - struct bch_key *key) -{ - struct
bch_encrypted_key sb_key = crypt->key; - struct bch_key user_key; - int ret = 0; - - /* is key encrypted? */ - if (!bch2_key_is_encrypted(&sb_key)) - goto out; - - ret = bch2_request_key(c->disk_sb.sb, &user_key); - if (ret) { - bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); - goto err; - } - - /* decrypt real key: */ - bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key)); - - if (bch2_key_is_encrypted(&sb_key)) { - bch_err(c, "incorrect encryption key"); - ret = -EINVAL; - goto err; - } -out: - *key = sb_key.key; -err: - memzero_explicit(&sb_key, sizeof(sb_key)); - memzero_explicit(&user_key, sizeof(user_key)); - return ret; -} - -#if 0 - -/* - * This seems to be duplicating code in cmd_remove_passphrase() in - * bcachefs-tools, but we might want to switch userspace to use this - and - * perhaps add an ioctl for calling this at runtime, so we can take the - * passphrase off of a mounted filesystem (which has come up). - */ -int bch2_disable_encryption(struct bch_fs *c) -{ - struct bch_sb_field_crypt *crypt; - struct bch_key key; - int ret = -EINVAL; - - mutex_lock(&c->sb_lock); - - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); - if (!crypt) - goto out; - - /* is key encrypted? */ - ret = 0; - if (bch2_key_is_encrypted(&crypt->key)) - goto out; - - ret = bch2_decrypt_sb_key(c, crypt, &key); - if (ret) - goto out; - - crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); - crypt->key.key = key; - - SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); - bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); - - return ret; -} - -/* - * For enabling encryption on an existing filesystem: not hooked up yet, but it - * should be - */ -int bch2_enable_encryption(struct bch_fs *c, bool keyed) -{ - struct bch_encrypted_key key; - struct bch_key user_key; - struct bch_sb_field_crypt *crypt; - int ret = -EINVAL; - - mutex_lock(&c->sb_lock); - - /* Do we already have an encryption key? 
*/ - if (bch2_sb_field_get(c->disk_sb.sb, crypt)) - goto err; - - ret = bch2_alloc_ciphers(c); - if (ret) - goto err; - - key.magic = cpu_to_le64(BCH_KEY_MAGIC); - get_random_bytes(&key.key, sizeof(key.key)); - - if (keyed) { - ret = bch2_request_key(c->disk_sb.sb, &user_key); - if (ret) { - bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); - goto err; - } - - ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), - &key, sizeof(key)); - if (ret) - goto err; - } - - ret = crypto_skcipher_setkey(&c->chacha20->base, - (void *) &key.key, sizeof(key.key)); - if (ret) - goto err; - - crypt = bch2_sb_field_resize(&c->disk_sb, crypt, - sizeof(*crypt) / sizeof(u64)); - if (!crypt) { - ret = bch_err_throw(c, ENOSPC_sb_crypt); - goto err; - } - - crypt->key = key; - - /* write superblock */ - SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); - bch2_write_super(c); -err: - mutex_unlock(&c->sb_lock); - memzero_explicit(&user_key, sizeof(user_key)); - memzero_explicit(&key, sizeof(key)); - return ret; -} -#endif - -void bch2_fs_encryption_exit(struct bch_fs *c) -{ - memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key)); -} - -int bch2_fs_encryption_init(struct bch_fs *c) -{ - struct bch_sb_field_crypt *crypt; - int ret; - - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); - if (!crypt) - return 0; - - ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key); - if (ret) - return ret; - c->chacha20_key_set = true; - return 0; -} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h deleted file mode 100644 index 7bd9cf6104ca12..00000000000000 --- a/fs/bcachefs/checksum.h +++ /dev/null @@ -1,240 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CHECKSUM_H -#define _BCACHEFS_CHECKSUM_H - -#include "bcachefs.h" -#include "extents_types.h" -#include "super-io.h" - -#include -#include - -static inline bool bch2_checksum_mergeable(unsigned type) -{ - - switch (type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c: - case BCH_CSUM_crc64: - return true; - default: - return false; - } -} - -struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, - struct bch_csum, size_t); - -#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) -#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) -#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) -#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) -#define BCH_NONCE_POLY cpu_to_le32(1 << 31) - -struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, - const void *, size_t); - -/* - * This is used for various on disk data structures - bch_sb, prio_set, bset, - * jset: The checksum is _always_ the first field of these structs - */ -#define csum_vstruct(_c, _type, _nonce, _i) \ -({ \ - const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\ - \ - bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\ -}) - -static inline void bch2_csum_to_text(struct printbuf *out, - enum bch_csum_type type, - struct bch_csum csum) -{ - const u8 *p = (u8 *) &csum; - unsigned bytes = type < BCH_CSUM_NR ? 
bch_crc_bytes[type] : 16; - - for (unsigned i = 0; i < bytes; i++) - prt_hex_byte(out, p[i]); -} - -static inline void bch2_csum_err_msg(struct printbuf *out, - enum bch_csum_type type, - struct bch_csum expected, - struct bch_csum got) -{ - prt_str(out, "checksum error, type "); - bch2_prt_csum_type(out, type); - prt_str(out, ": got "); - bch2_csum_to_text(out, type, got); - prt_str(out, " should be "); - bch2_csum_to_text(out, type, expected); -} - -void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t); - -int bch2_request_key(struct bch_sb *, struct bch_key *); -#ifndef __KERNEL__ -int bch2_revoke_key(struct bch_sb *); -#endif - -int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, - void *data, size_t); - -struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); - -int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, - struct bch_extent_crc_unpacked, - struct bch_extent_crc_unpacked *, - struct bch_extent_crc_unpacked *, - unsigned, unsigned, unsigned); - -int __bch2_encrypt_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); - -static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - return bch2_csum_type_is_encryption(type) - ? __bch2_encrypt_bio(c, type, nonce, bio) - : 0; -} - -extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; - -int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, - struct bch_key *); - -#if 0 -int bch2_disable_encryption(struct bch_fs *); -int bch2_enable_encryption(struct bch_fs *, bool); -#endif - -void bch2_fs_encryption_exit(struct bch_fs *); -int bch2_fs_encryption_init(struct bch_fs *); - -static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, - bool data) -{ - switch (type) { - case BCH_CSUM_OPT_none: - return BCH_CSUM_none; - case BCH_CSUM_OPT_crc32c: - return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; - case BCH_CSUM_OPT_crc64: - return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; - case BCH_CSUM_OPT_xxhash: - return BCH_CSUM_xxhash; - default: - BUG(); - } -} - -static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, - struct bch_io_opts opts) -{ - if (opts.nocow) - return 0; - - if (c->sb.encryption_type) - return c->opts.wide_macs - ? BCH_CSUM_chacha20_poly1305_128 - : BCH_CSUM_chacha20_poly1305_80; - - return bch2_csum_opt_to_type(opts.data_checksum, true); -} - -static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) -{ - if (c->sb.encryption_type) - return BCH_CSUM_chacha20_poly1305_128; - - return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); -} - -static inline bool bch2_checksum_type_valid(const struct bch_fs *c, - unsigned type) -{ - if (type >= BCH_CSUM_NR) - return false; - - if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set) - return false; - - return true; -} - -/* returns true if not equal */ -static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) -{ - /* - * XXX: need some way of preventing the compiler from optimizing this - * into a form that isn't constant time.. 
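- *
- * One possible mitigation (a sketch only, assuming the kernel's
- * OPTIMIZER_HIDE_VAR() barrier from <linux/compiler.h> is acceptable
- * here) would be to hide the intermediates from the optimizer:
- *
- *	u64 lo = l.lo ^ r.lo, hi = l.hi ^ r.hi;
- *	OPTIMIZER_HIDE_VAR(lo);
- *	OPTIMIZER_HIDE_VAR(hi);
- *	return (lo | hi) != 0;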
- */ - return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; -} - -/* for skipping ahead and encrypting/decrypting at an offset: */ -static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) -{ - EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - - le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); - return nonce; -} - -static inline struct nonce null_nonce(void) -{ - struct nonce ret; - - memset(&ret, 0, sizeof(ret)); - return ret; -} - -static inline struct nonce extent_nonce(struct bversion version, - struct bch_extent_crc_unpacked crc) -{ - unsigned compression_type = crc_is_compressed(crc) - ? crc.compression_type - : 0; - unsigned size = compression_type ? crc.uncompressed_size : 0; - struct nonce nonce = (struct nonce) {{ - [0] = cpu_to_le32(size << 22), - [1] = cpu_to_le32(version.lo), - [2] = cpu_to_le32(version.lo >> 32), - [3] = cpu_to_le32(version.hi| - (compression_type << 24))^BCH_NONCE_EXTENT, - }}; - - return nonce_add(nonce, crc.nonce << 9); -} - -static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) -{ - return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; -} - -static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) -{ - __le64 magic = __bch2_sb_magic(sb); - - return (struct nonce) {{ - [0] = 0, - [1] = 0, - [2] = ((__le32 *) &magic)[0], - [3] = ((__le32 *) &magic)[1], - }}; -} - -static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) -{ - __le64 magic = bch2_sb_magic(c); - - return (struct nonce) {{ - [0] = 0, - [1] = 0, - [2] = ((__le32 *) &magic)[0], - [3] = ((__le32 *) &magic)[1], - }}; -} - -#endif /* _BCACHEFS_CHECKSUM_H */ diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c deleted file mode 100644 index 8e9264b5a84e3d..00000000000000 --- a/fs/bcachefs/clock.c +++ /dev/null @@ -1,181 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "clock.h" - -#include -#include -#include - -static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args) -{ - struct io_timer **_l = (struct io_timer **)l; - struct io_timer **_r = (struct io_timer **)r; - - return (*_l)->expire < (*_r)->expire; -} - -static const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = NULL, -}; - -void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) -{ - spin_lock(&clock->timer_lock); - - if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { - spin_unlock(&clock->timer_lock); - timer->fn(timer); - return; - } - - for (size_t i = 0; i < clock->timers.nr; i++) - if (clock->timers.data[i] == timer) - goto out; - - BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL)); -out: - spin_unlock(&clock->timer_lock); -} - -void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) -{ - spin_lock(&clock->timer_lock); - - for (size_t i = 0; i < clock->timers.nr; i++) - if (clock->timers.data[i] == timer) { - min_heap_del(&clock->timers, i, &callbacks, NULL); - break; - } - - spin_unlock(&clock->timer_lock); -} - -struct io_clock_wait { - struct io_timer io_timer; - struct task_struct *task; - int expired; -}; - -static void io_clock_wait_fn(struct io_timer *timer) -{ - struct io_clock_wait *wait = container_of(timer, - struct io_clock_wait, io_timer); - - wait->expired = 1; - wake_up_process(wait->task); -} - -void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) -{ - struct io_clock_wait wait = { - .io_timer.expire = until, - .io_timer.fn = io_clock_wait_fn, - .io_timer.fn2 = (void *) _RET_IP_, - .task = current, - }; - - 
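- /*
-  * Register the timer, sleep, then unconditionally deregister: if the
-  * timer already fired (io_clock_wait_fn() woke us), it was popped off
-  * the heap before its callback ran, so the del below finds nothing to
-  * remove.
-  */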
bch2_io_timer_add(clock, &wait.io_timer); - schedule(); - bch2_io_timer_del(clock, &wait.io_timer); -} - -unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - struct io_clock_wait wait = { - .io_timer.expire = io_until, - .io_timer.fn = io_clock_wait_fn, - .io_timer.fn2 = (void *) _RET_IP_, - .task = current, - }; - - bch2_io_timer_add(clock, &wait.io_timer); - - set_current_state(TASK_INTERRUPTIBLE); - if (!(kthread && kthread_should_stop())) { - cpu_timeout = schedule_timeout(cpu_timeout); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - bch2_io_timer_del(clock, &wait.io_timer); - return cpu_timeout; -} - -void bch2_kthread_io_clock_wait(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - - while (!(kthread && kthread_should_stop()) && - cpu_timeout && - atomic64_read(&clock->now) < io_until) - cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout); -} - -static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) -{ - struct io_timer *ret = NULL; - - if (clock->timers.nr && - time_after_eq64(now, clock->timers.data[0]->expire)) { - ret = *min_heap_peek(&clock->timers); - min_heap_pop(&clock->timers, &callbacks, NULL); - } - - return ret; -} - -void __bch2_increment_clock(struct io_clock *clock, u64 sectors) -{ - struct io_timer *timer; - u64 now = atomic64_add_return(sectors, &clock->now); - - spin_lock(&clock->timer_lock); - while ((timer = get_expired_timer(clock, now))) - timer->fn(timer); - spin_unlock(&clock->timer_lock); -} - -void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) -{ - out->atomic++; - spin_lock(&clock->timer_lock); - u64 now = atomic64_read(&clock->now); - - printbuf_tabstop_push(out, 40); - prt_printf(out, "current time:\t%llu\n", now); - - for (unsigned i = 0; i < clock->timers.nr; i++) - prt_printf(out, "%ps %ps:\t%llu\n", - clock->timers.data[i]->fn, - clock->timers.data[i]->fn2, - clock->timers.data[i]->expire); - spin_unlock(&clock->timer_lock); - --out->atomic; -} - -void bch2_io_clock_exit(struct io_clock *clock) -{ - free_heap(&clock->timers); - free_percpu(clock->pcpu_buf); -} - -int bch2_io_clock_init(struct io_clock *clock) -{ - atomic64_set(&clock->now, 0); - spin_lock_init(&clock->timer_lock); - - clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); - - clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); - if (!clock->pcpu_buf) - return -BCH_ERR_ENOMEM_io_clock_init; - - if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) - return -BCH_ERR_ENOMEM_io_clock_init; - - return 0; -} diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h deleted file mode 100644 index 8769be2aa21e89..00000000000000 --- a/fs/bcachefs/clock.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CLOCK_H -#define _BCACHEFS_CLOCK_H - -void bch2_io_timer_add(struct io_clock *, struct io_timer *); -void bch2_io_timer_del(struct io_clock *, struct io_timer *); -unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long); -void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); - -void __bch2_increment_clock(struct io_clock *, u64); - -static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors, - int rw) -{ - struct io_clock *clock = &c->io_clock[rw]; - - if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= - 
IO_CLOCK_PCPU_SECTORS)) - __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); -} - -void bch2_io_clock_schedule_timeout(struct io_clock *, u64); - -void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); - -void bch2_io_clock_exit(struct io_clock *); -int bch2_io_clock_init(struct io_clock *); - -#endif /* _BCACHEFS_CLOCK_H */ diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h deleted file mode 100644 index 37554e4514fe70..00000000000000 --- a/fs/bcachefs/clock_types.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CLOCK_TYPES_H -#define _BCACHEFS_CLOCK_TYPES_H - -#include "util.h" - -#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) - -/* - * Clocks/timers in units of sectors of IO: - * - * Note - they use percpu batching, so they're only approximate. - */ - -struct io_timer; -typedef void (*io_timer_fn)(struct io_timer *); - -struct io_timer { - io_timer_fn fn; - void *fn2; - u64 expire; -}; - -/* Amount to buffer up on a percpu counter */ -#define IO_CLOCK_PCPU_SECTORS 128 - -typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap; - -struct io_clock { - atomic64_t now; - u16 __percpu *pcpu_buf; - unsigned max_slop; - - spinlock_t timer_lock; - io_timer_heap timers; -}; - -#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c deleted file mode 100644 index b37b1f325f0ae5..00000000000000 --- a/fs/bcachefs/compress.c +++ /dev/null @@ -1,773 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "checksum.h" -#include "compress.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "opts.h" -#include "super-io.h" - -#include -#include -#include - -static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type) -{ - switch (type) { - case BCH_COMPRESSION_TYPE_none: - case BCH_COMPRESSION_TYPE_incompressible: - return BCH_COMPRESSION_OPT_none; - case BCH_COMPRESSION_TYPE_lz4_old: - case BCH_COMPRESSION_TYPE_lz4: - return BCH_COMPRESSION_OPT_lz4; - case BCH_COMPRESSION_TYPE_gzip: - return BCH_COMPRESSION_OPT_gzip; - case BCH_COMPRESSION_TYPE_zstd: - return BCH_COMPRESSION_OPT_zstd; - default: - BUG(); - } -} - -/* Bounce buffer: */ -struct bbuf { - void *b; - enum { - BB_NONE, - BB_VMAP, - BB_KMALLOC, - BB_MEMPOOL, - } type; - int rw; -}; - -static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) -{ - void *b; - - BUG_ON(size > c->opts.encoded_extent_max); - - b = kmalloc(size, GFP_NOFS|__GFP_NOWARN); - if (b) - return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; - - b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS); - if (b) - return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; - - BUG(); -} - -static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) -{ - struct bio_vec bv; - struct bvec_iter iter; - void *expected_start = NULL; - - __bio_for_each_bvec(bv, bio, iter, start) { - if (expected_start && - expected_start != page_address(bv.bv_page) + bv.bv_offset) - return false; - - expected_start = page_address(bv.bv_page) + - bv.bv_offset + bv.bv_len; - } - - return true; -} - -static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, - struct bvec_iter start, int rw) -{ - struct bbuf ret; - struct bio_vec bv; - struct bvec_iter iter; - unsigned nr_pages = 0; - struct page *stack_pages[16]; - struct page **pages = NULL; - void *data; - - BUG_ON(start.bi_size > c->opts.encoded_extent_max); - - if 
(!PageHighMem(bio_iter_page(bio, start)) && - bio_phys_contig(bio, start)) - return (struct bbuf) { - .b = page_address(bio_iter_page(bio, start)) + - bio_iter_offset(bio, start), - .type = BB_NONE, .rw = rw - }; - - /* check if we can map the pages contiguously: */ - __bio_for_each_segment(bv, bio, iter, start) { - if (iter.bi_size != start.bi_size && - bv.bv_offset) - goto bounce; - - if (bv.bv_len < iter.bi_size && - bv.bv_offset + bv.bv_len < PAGE_SIZE) - goto bounce; - - nr_pages++; - } - - BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); - - pages = nr_pages > ARRAY_SIZE(stack_pages) - ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS) - : stack_pages; - if (!pages) - goto bounce; - - nr_pages = 0; - __bio_for_each_segment(bv, bio, iter, start) - pages[nr_pages++] = bv.bv_page; - - data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (pages != stack_pages) - kfree(pages); - - if (data) - return (struct bbuf) { - .b = data + bio_iter_offset(bio, start), - .type = BB_VMAP, .rw = rw - }; -bounce: - ret = __bounce_alloc(c, start.bi_size, rw); - - if (rw == READ) - memcpy_from_bio(ret.b, bio, start); - - return ret; -} - -static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) -{ - return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); -} - -static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) -{ - switch (buf.type) { - case BB_NONE: - break; - case BB_VMAP: - vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); - break; - case BB_KMALLOC: - kfree(buf.b); - break; - case BB_MEMPOOL: - mempool_free(buf.b, &c->compression_bounce[buf.rw]); - break; - } -} - -static inline void zlib_set_workspace(z_stream *strm, void *workspace) -{ -#ifdef __KERNEL__ - strm->workspace = workspace; -#endif -} - -static int __bio_uncompress(struct bch_fs *c, struct bio *src, - void *dst_data, struct bch_extent_crc_unpacked crc) -{ - struct bbuf src_data = { NULL }; - size_t src_len = src->bi_iter.bi_size; - size_t dst_len = crc.uncompressed_size << 9; - void *workspace; - int ret = 0, ret2; - - enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); - mempool_t *workspace_pool = &c->compress_workspace[opt]; - if (unlikely(!mempool_initialized(workspace_pool))) { - if (fsck_err(c, compression_type_not_marked_in_sb, - "compression type %s set but not marked in superblock", - __bch2_compression_types[crc.compression_type])) - ret = bch2_check_set_has_compressed_data(c, opt); - else - ret = bch_err_throw(c, compression_workspace_not_initialized); - if (ret) - goto err; - } - - src_data = bio_map_or_bounce(c, src, READ); - - switch (crc.compression_type) { - case BCH_COMPRESSION_TYPE_lz4_old: - case BCH_COMPRESSION_TYPE_lz4: - ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, - src_len, dst_len, dst_len); - if (ret2 != dst_len) - ret = bch_err_throw(c, decompress_lz4); - break; - case BCH_COMPRESSION_TYPE_gzip: { - z_stream strm = { - .next_in = src_data.b, - .avail_in = src_len, - .next_out = dst_data, - .avail_out = dst_len, - }; - - workspace = mempool_alloc(workspace_pool, GFP_NOFS); - - zlib_set_workspace(&strm, workspace); - zlib_inflateInit2(&strm, -MAX_WBITS); - ret2 = zlib_inflate(&strm, Z_FINISH); - - mempool_free(workspace, workspace_pool); - - if (ret2 != Z_STREAM_END) - ret = bch_err_throw(c, decompress_gzip); - break; - } - case BCH_COMPRESSION_TYPE_zstd: { - ZSTD_DCtx *ctx; - size_t real_src_len = le32_to_cpup(src_data.b); - - if (real_src_len > src_len - 4) { - ret = bch_err_throw(c, 
decompress_zstd_src_len_bad); - goto err; - } - - workspace = mempool_alloc(workspace_pool, GFP_NOFS); - ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - - ret2 = zstd_decompress_dctx(ctx, - dst_data, dst_len, - src_data.b + 4, real_src_len); - - mempool_free(workspace, workspace_pool); - - if (ret2 != dst_len) - ret = bch_err_throw(c, decompress_zstd); - break; - } - default: - BUG(); - } -err: -fsck_err: - bio_unmap_or_unbounce(c, src_data); - return ret; -} - -int bch2_bio_uncompress_inplace(struct bch_write_op *op, - struct bio *bio) -{ - struct bch_fs *c = op->c; - struct bch_extent_crc_unpacked *crc = &op->crc; - struct bbuf data = { NULL }; - size_t dst_len = crc->uncompressed_size << 9; - int ret = 0; - - /* bio must own its pages: */ - BUG_ON(!bio->bi_vcnt); - BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - - if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) { - bch2_write_op_error(op, op->pos.offset, - "extent too big to decompress (%u > %u)", - crc->uncompressed_size << 9, c->opts.encoded_extent_max); - return bch_err_throw(c, decompress_exceeded_max_encoded_extent); - } - - data = __bounce_alloc(c, dst_len, WRITE); - - ret = __bio_uncompress(c, bio, data.b, *crc); - - if (c->opts.no_data_io) - ret = 0; - - if (ret) { - bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret)); - goto err; - } - - /* - * XXX: don't have a good way to assert that the bio was allocated with - * enough space, we depend on bch2_move_extent doing the right thing - */ - bio->bi_iter.bi_size = crc->live_size << 9; - - memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); - - crc->csum_type = 0; - crc->compression_type = 0; - crc->compressed_size = crc->live_size; - crc->uncompressed_size = crc->live_size; - crc->offset = 0; - crc->csum = (struct bch_csum) { 0, 0 }; -err: - bio_unmap_or_unbounce(c, data); - return ret; -} - -int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, - struct bio *dst, struct bvec_iter dst_iter, - struct bch_extent_crc_unpacked crc) -{ - struct bbuf dst_data = { NULL }; - size_t dst_len = crc.uncompressed_size << 9; - int ret; - - if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || - crc.compressed_size << 9 > c->opts.encoded_extent_max) - return bch_err_throw(c, decompress_exceeded_max_encoded_extent); - - dst_data = dst_len == dst_iter.bi_size - ? 
__bio_map_or_bounce(c, dst, dst_iter, WRITE) - : __bounce_alloc(c, dst_len, WRITE); - - ret = __bio_uncompress(c, src, dst_data.b, crc); - if (ret) - goto err; - - if (dst_data.type != BB_NONE && - dst_data.type != BB_VMAP) - memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); -err: - bio_unmap_or_unbounce(c, dst_data); - return ret; -} - -static int attempt_compress(struct bch_fs *c, - void *workspace, - void *dst, size_t dst_len, - void *src, size_t src_len, - struct bch_compression_opt compression) -{ - enum bch_compression_type compression_type = - __bch2_compression_opt_to_type[compression.type]; - - switch (compression_type) { - case BCH_COMPRESSION_TYPE_lz4: - if (compression.level < LZ4HC_MIN_CLEVEL) { - int len = src_len; - int ret = LZ4_compress_destSize( - src, dst, - &len, dst_len, - workspace); - if (len < src_len) - return -len; - - return ret; - } else { - int ret = LZ4_compress_HC( - src, dst, - src_len, dst_len, - compression.level, - workspace); - - return ret ?: -1; - } - case BCH_COMPRESSION_TYPE_gzip: { - z_stream strm = { - .next_in = src, - .avail_in = src_len, - .next_out = dst, - .avail_out = dst_len, - }; - - zlib_set_workspace(&strm, workspace); - if (zlib_deflateInit2(&strm, - compression.level - ? clamp_t(unsigned, compression.level, - Z_BEST_SPEED, Z_BEST_COMPRESSION) - : Z_DEFAULT_COMPRESSION, - Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, - Z_DEFAULT_STRATEGY) != Z_OK) - return 0; - - if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) - return 0; - - if (zlib_deflateEnd(&strm) != Z_OK) - return 0; - - return strm.total_out; - } - case BCH_COMPRESSION_TYPE_zstd: { - /* - * rescale: - * zstd max compression level is 22, our max level is 15 - */ - unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); - ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); - ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size); - - /* - * ZSTD requires that when we decompress we pass in the exact - * compressed size - rounding it up to the nearest sector - * doesn't work, so we use the first 4 bytes of the buffer for - * that. - * - * Additionally, the ZSTD code seems to have a bug where it will - * write just past the end of the buffer - so subtract a fudge - * factor (7 bytes) from the dst buffer size to account for - * that. 
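- *
- * The resulting encoded buffer layout is thus (illustrative, for a
- * compressed payload of `len' bytes):
- *
- *	bytes 0..3	__le32 holding the exact compressed length
- *	bytes 4..len+3	the zstd frame itself
- *	trailing bytes	zero padding up to the rounded-up block size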
- */ - size_t len = zstd_compress_cctx(ctx, - dst + 4, dst_len - 4 - 7, - src, src_len, - ¶ms); - if (zstd_is_error(len)) - return 0; - - *((__le32 *) dst) = cpu_to_le32(len); - return len + 4; - } - default: - BUG(); - } -} - -static unsigned __bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - struct bch_compression_opt compression) -{ - struct bbuf src_data = { NULL }, dst_data = { NULL }; - void *workspace; - enum bch_compression_type compression_type = - __bch2_compression_opt_to_type[compression.type]; - unsigned pad; - int ret = 0; - - /* bch2_compression_decode catches unknown compression types: */ - BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); - - mempool_t *workspace_pool = &c->compress_workspace[compression.type]; - if (unlikely(!mempool_initialized(workspace_pool))) { - if (fsck_err(c, compression_opt_not_marked_in_sb, - "compression opt %s set but not marked in superblock", - bch2_compression_opts[compression.type])) { - ret = bch2_check_set_has_compressed_data(c, compression.type); - if (ret) /* memory allocation failure, don't compress */ - return 0; - } else { - return 0; - } - } - - /* If it's only one block, don't bother trying to compress: */ - if (src->bi_iter.bi_size <= c->opts.block_size) - return BCH_COMPRESSION_TYPE_incompressible; - - dst_data = bio_map_or_bounce(c, dst, WRITE); - src_data = bio_map_or_bounce(c, src, READ); - - workspace = mempool_alloc(workspace_pool, GFP_NOFS); - - *src_len = src->bi_iter.bi_size; - *dst_len = dst->bi_iter.bi_size; - - /* - * XXX: this algorithm sucks when the compression code doesn't tell us - * how much would fit, like LZ4 does: - */ - while (1) { - if (*src_len <= block_bytes(c)) { - ret = -1; - break; - } - - ret = attempt_compress(c, workspace, - dst_data.b, *dst_len, - src_data.b, *src_len, - compression); - if (ret > 0) { - *dst_len = ret; - ret = 0; - break; - } - - /* Didn't fit: should we retry with a smaller amount? 
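- *
- * Illustrative numbers: with *src_len == 128k and *dst_len == 64k,
- * the fallback below retries with 128k - (128k - 64k)/2 = 96k, then
- * 80k, 72k, ... (rounded down to the block size each time), until
- * the compressed result fits or *src_len reaches *dst_len and we
- * give up.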
*/ - if (*src_len <= *dst_len) { - ret = -1; - break; - } - - /* - * If ret is negative, it's a hint as to how much data would fit - */ - BUG_ON(-ret >= *src_len); - - if (ret < 0) - *src_len = -ret; - else - *src_len -= (*src_len - *dst_len) / 2; - *src_len = round_down(*src_len, block_bytes(c)); - } - - mempool_free(workspace, workspace_pool); - - if (ret) - goto err; - - /* Didn't get smaller: */ - if (round_up(*dst_len, block_bytes(c)) >= *src_len) - goto err; - - pad = round_up(*dst_len, block_bytes(c)) - *dst_len; - - memset(dst_data.b + *dst_len, 0, pad); - *dst_len += pad; - - if (dst_data.type != BB_NONE && - dst_data.type != BB_VMAP) - memcpy_to_bio(dst, dst->bi_iter, dst_data.b); - - BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); - BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); - BUG_ON(*dst_len & (block_bytes(c) - 1)); - BUG_ON(*src_len & (block_bytes(c) - 1)); - ret = compression_type; -out: - bio_unmap_or_unbounce(c, src_data); - bio_unmap_or_unbounce(c, dst_data); - return ret; -err: - ret = BCH_COMPRESSION_TYPE_incompressible; - goto out; -fsck_err: - ret = 0; - goto out; -} - -unsigned bch2_bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - unsigned compression_opt) -{ - unsigned orig_dst = dst->bi_iter.bi_size; - unsigned orig_src = src->bi_iter.bi_size; - unsigned compression_type; - - /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ - src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, - c->opts.encoded_extent_max); - /* Don't generate a bigger output than input: */ - dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - - compression_type = - __bio_compress(c, dst, dst_len, src, src_len, - bch2_compression_decode(compression_opt)); - - dst->bi_iter.bi_size = orig_dst; - src->bi_iter.bi_size = orig_src; - return compression_type; -} - -static int __bch2_fs_compress_init(struct bch_fs *, u64); - -#define BCH_FEATURE_none 0 - -static const unsigned bch2_compression_opt_to_feature[] = { -#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, - BCH_COMPRESSION_OPTS() -#undef x -}; - -#undef BCH_FEATURE_none - -static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) -{ - int ret = 0; - - if ((c->sb.features & f) == f) - return 0; - - mutex_lock(&c->sb_lock); - - if ((c->sb.features & f) == f) { - mutex_unlock(&c->sb_lock); - return 0; - } - - ret = __bch2_fs_compress_init(c, c->sb.features|f); - if (ret) { - mutex_unlock(&c->sb_lock); - return ret; - } - - c->disk_sb.sb->features[0] |= cpu_to_le64(f); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; -} - -int bch2_check_set_has_compressed_data(struct bch_fs *c, - unsigned compression_opt) -{ - unsigned compression_type = bch2_compression_decode(compression_opt).type; - - BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); - - return compression_type - ? 
__bch2_check_set_has_compressed_data(c, - 1ULL << bch2_compression_opt_to_feature[compression_type]) - : 0; -} - -void bch2_fs_compress_exit(struct bch_fs *c) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) - mempool_exit(&c->compress_workspace[i]); - mempool_exit(&c->compression_bounce[WRITE]); - mempool_exit(&c->compression_bounce[READ]); -} - -static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) -{ - ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), - c->opts.encoded_extent_max); - - c->zstd_workspace_size = zstd_cctx_workspace_bound(¶ms.cParams); - - struct { - unsigned feature; - enum bch_compression_opts type; - size_t compress_workspace; - } compression_types[] = { - { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, - max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, - { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, - max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), - zlib_inflate_workspacesize()) }, - { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd, - max(c->zstd_workspace_size, - zstd_dctx_workspace_bound()) }, - }, *i; - bool have_compressed = false; - - for (i = compression_types; - i < compression_types + ARRAY_SIZE(compression_types); - i++) - have_compressed |= (features & (1 << i->feature)) != 0; - - if (!have_compressed) - return 0; - - if (!mempool_initialized(&c->compression_bounce[READ]) && - mempool_init_kvmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max)) - return bch_err_throw(c, ENOMEM_compression_bounce_read_init); - - if (!mempool_initialized(&c->compression_bounce[WRITE]) && - mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max)) - return bch_err_throw(c, ENOMEM_compression_bounce_write_init); - - for (i = compression_types; - i < compression_types + ARRAY_SIZE(compression_types); - i++) { - if (!(features & (1 << i->feature))) - continue; - - if (mempool_initialized(&c->compress_workspace[i->type])) - continue; - - if (mempool_init_kvmalloc_pool( - &c->compress_workspace[i->type], - 1, i->compress_workspace)) - return bch_err_throw(c, ENOMEM_compression_workspace_init); - } - - return 0; -} - -static u64 compression_opt_to_feature(unsigned v) -{ - unsigned type = bch2_compression_decode(v).type; - - return BIT_ULL(bch2_compression_opt_to_feature[type]); -} - -int bch2_fs_compress_init(struct bch_fs *c) -{ - u64 f = c->sb.features; - - f |= compression_opt_to_feature(c->opts.compression); - f |= compression_opt_to_feature(c->opts.background_compression); - - return __bch2_fs_compress_init(c, f); -} - -int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, - struct printbuf *err) -{ - char *val = kstrdup(_val, GFP_KERNEL); - char *p = val, *type_str, *level_str; - struct bch_compression_opt opt = { 0 }; - int ret; - - if (!val) - return -ENOMEM; - - type_str = strsep(&p, ":"); - level_str = p; - - ret = match_string(bch2_compression_opts, -1, type_str); - if (ret < 0 && err) - prt_printf(err, "invalid compression type\n"); - if (ret < 0) - goto err; - - opt.type = ret; - - if (level_str) { - unsigned level; - - ret = kstrtouint(level_str, 10, &level); - if (!ret && !opt.type && level) - ret = -EINVAL; - if (!ret && level > 15) - ret = -EINVAL; - if (ret < 0 && err) - prt_printf(err, "invalid compression level\n"); - if (ret < 0) - goto err; - - opt.level = level; - } - - *res = bch2_compression_encode(opt); -err: - kfree(val); - return ret; -} - -void bch2_compression_opt_to_text(struct printbuf *out, u64 v) -{ - struct 
bch_compression_opt opt = bch2_compression_decode(v); - - if (opt.type < BCH_COMPRESSION_OPT_NR) - prt_str(out, bch2_compression_opts[opt.type]); - else - prt_printf(out, "(unknown compression opt %u)", opt.type); - if (opt.level) - prt_printf(out, ":%u", opt.level); -} - -void bch2_opt_compression_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) -{ - return bch2_compression_opt_to_text(out, v); -} - -int bch2_opt_compression_validate(u64 v, struct printbuf *err) -{ - if (!bch2_compression_opt_valid(v)) { - prt_printf(err, "invalid compression opt %llu", v); - return -BCH_ERR_invalid_sb_opt_compression; - } - - return 0; -} diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h deleted file mode 100644 index bec2f05bfd52ac..00000000000000 --- a/fs/bcachefs/compress.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_COMPRESS_H -#define _BCACHEFS_COMPRESS_H - -#include "extents_types.h" - -static const unsigned __bch2_compression_opt_to_type[] = { -#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, - BCH_COMPRESSION_OPTS() -#undef x -}; - -struct bch_compression_opt { - u8 type:4, - level:4; -}; - -static inline struct bch_compression_opt __bch2_compression_decode(unsigned v) -{ - return (struct bch_compression_opt) { - .type = v & 15, - .level = v >> 4, - }; -} - -static inline bool bch2_compression_opt_valid(unsigned v) -{ - struct bch_compression_opt opt = __bch2_compression_decode(v); - - return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level); -} - -static inline struct bch_compression_opt bch2_compression_decode(unsigned v) -{ - return bch2_compression_opt_valid(v) - ? __bch2_compression_decode(v) - : (struct bch_compression_opt) { 0 }; -} - -static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) -{ - return opt.type|(opt.level << 4); -} - -static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) -{ - return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; -} - -struct bch_write_op; -int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); -int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, - struct bvec_iter, struct bch_extent_crc_unpacked); -unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, - struct bio *, size_t *, unsigned); - -int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); -void bch2_fs_compress_exit(struct bch_fs *); -int bch2_fs_compress_init(struct bch_fs *); - -void bch2_compression_opt_to_text(struct printbuf *, u64); - -int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); -void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); -int bch2_opt_compression_validate(u64, struct printbuf *); - -#define bch2_opt_compression (struct bch_opt_fn) { \ - .parse = bch2_opt_compression_parse, \ - .to_text = bch2_opt_compression_to_text, \ - .validate = bch2_opt_compression_validate, \ -} - -#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c deleted file mode 100644 index e86d36d23e9e30..00000000000000 --- a/fs/bcachefs/darray.c +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include "darray.h" - -int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) -{ - if (new_size > d->size) { - new_size = roundup_pow_of_two(new_size); 
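- /*
-  * e.g. a request for room for 100 elements allocates 128; doubling
-  * like this keeps repeated darray_push() calls amortized O(1).
-  */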
- - /* - * This is a workaround: kvmalloc() doesn't support > INT_MAX - * allocations, but vmalloc() does. - * The limit needs to be lifted from kvmalloc, and when it does - * we'll go back to just using that. - */ - size_t bytes; - if (unlikely(check_mul_overflow(new_size, element_size, &bytes))) - return -ENOMEM; - - void *data = likely(bytes < INT_MAX) - ? kvmalloc_noprof(bytes, gfp) - : vmalloc_noprof(bytes); - if (!data) - return -ENOMEM; - - if (d->size) - memcpy(data, d->data, d->size * element_size); - if (d->data != d->preallocated) - kvfree(d->data); - d->data = data; - d->size = new_size; - } - - return 0; -} diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h deleted file mode 100644 index 4080ee99aaddba..00000000000000 --- a/fs/bcachefs/darray.h +++ /dev/null @@ -1,158 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DARRAY_H -#define _BCACHEFS_DARRAY_H - -/* - * Dynamic arrays: - * - * Inspired by CCAN's darray - */ - -#include -#include - -#define DARRAY_PREALLOCATED(_type, _nr) \ -struct { \ - size_t nr, size; \ - _type *data; \ - _type preallocated[_nr]; \ -} - -#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) - -typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; -typedef DARRAY(const char *) darray_const_str; - -typedef DARRAY(u8) darray_u8; -typedef DARRAY(u16) darray_u16; -typedef DARRAY(u32) darray_u32; -typedef DARRAY(u64) darray_u64; - -typedef DARRAY(s8) darray_s8; -typedef DARRAY(s16) darray_s16; -typedef DARRAY(s32) darray_s32; -typedef DARRAY(s64) darray_s64; - -int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); - -#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__)) - -#define __darray_resize(_d, _element_size, _new_size, _gfp) \ - (unlikely((_new_size) > (_d)->size) \ - ? 
__bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\ - : 0) - -#define darray_resize_gfp(_d, _new_size, _gfp) \ - __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp) - -#define darray_resize(_d, _new_size) \ - darray_resize_gfp(_d, _new_size, GFP_KERNEL) - -#define darray_make_room_gfp(_d, _more, _gfp) \ - darray_resize_gfp((_d), (_d)->nr + (_more), _gfp) - -#define darray_make_room(_d, _more) \ - darray_make_room_gfp(_d, _more, GFP_KERNEL) - -#define darray_room(_d) ((_d).size - (_d).nr) - -#define darray_top(_d) ((_d).data[(_d).nr]) - -#define darray_push_gfp(_d, _item, _gfp) \ -({ \ - int _ret = darray_make_room_gfp((_d), 1, _gfp); \ - \ - if (!_ret) \ - (_d)->data[(_d)->nr++] = (_item); \ - _ret; \ -}) - -#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) - -#define darray_pop(_d) ((_d)->data[--(_d)->nr]) - -#define darray_first(_d) ((_d).data[0]) -#define darray_last(_d) ((_d).data[(_d).nr - 1]) - -#define darray_insert_item(_d, pos, _item) \ -({ \ - size_t _pos = (pos); \ - int _ret = darray_make_room((_d), 1); \ - \ - if (!_ret) \ - array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \ - _ret; \ -}) - -#define darray_remove_item(_d, _pos) \ - array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) - -#define darray_find_p(_d, _i, cond) \ -({ \ - typeof((_d).data) _ret = NULL; \ - \ - darray_for_each(_d, _i) \ - if (cond) { \ - _ret = _i; \ - break; \ - } \ - _ret; \ -}) - -#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item) - -/* Iteration: */ - -#define __darray_for_each(_d, _i) \ - for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) - -#define darray_for_each(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) - -#define darray_for_each_reverse(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) - -/* Init/exit */ - -#define darray_init(_d) \ -do { \ - (_d)->nr = 0; \ - (_d)->size = ARRAY_SIZE((_d)->preallocated); \ - (_d)->data = (_d)->size ? 
(_d)->preallocated : NULL; \ -} while (0) - -#define darray_exit(_d) \ -do { \ - if (!ARRAY_SIZE((_d)->preallocated) || \ - (_d)->data != (_d)->preallocated) \ - kvfree((_d)->data); \ - darray_init(_d); \ -} while (0) - -#define DEFINE_DARRAY_CLASS(_type) \ -DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void) - -#define DEFINE_DARRAY(_type) \ -typedef DARRAY(_type) darray_##_type; \ -DEFINE_DARRAY_CLASS(darray_##_type) - -#define DEFINE_DARRAY_NAMED(_name, _type) \ -typedef DARRAY(_type) _name; \ -DEFINE_DARRAY_CLASS(_name) - -DEFINE_DARRAY_CLASS(darray_char); -DEFINE_DARRAY_CLASS(darray_str) -DEFINE_DARRAY_CLASS(darray_const_str) - -DEFINE_DARRAY_CLASS(darray_u8) -DEFINE_DARRAY_CLASS(darray_u16) -DEFINE_DARRAY_CLASS(darray_u32) -DEFINE_DARRAY_CLASS(darray_u64) - -DEFINE_DARRAY_CLASS(darray_s8) -DEFINE_DARRAY_CLASS(darray_s16) -DEFINE_DARRAY_CLASS(darray_s32) -DEFINE_DARRAY_CLASS(darray_s64) - -#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c deleted file mode 100644 index e848e210a9bf76..00000000000000 --- a/fs/bcachefs/data_update.c +++ /dev/null @@ -1,1021 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "compress.h" -#include "data_update.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "keylist.h" -#include "move.h" -#include "nocow_locking.h" -#include "rebalance.h" -#include "snapshot.h" -#include "subvolume.h" -#include "trace.h" - -#include - -static const char * const bch2_data_update_type_strs[] = { -#define x(t, n, ...) [n] = #t, - BCH_DATA_UPDATE_TYPES() -#undef x - NULL -}; - -static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - bch2_dev_put(bch2_dev_have_ref(c, ptr->dev)); -} - -static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) { - if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; - bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); - } - return false; - } - } - return true; -} - -static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } -} - -static noinline_for_stack -bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs, - const struct bch_extent_ptr *start) -{ - if (!ctxt) { - bkey_for_each_ptr(ptrs, ptr) { - if (ptr == start) - break; - - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } - return false; - } - - __bkey_for_each_ptr(start, ptrs.end, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - bool locked; - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || - list_empty(&ctxt->ios)); - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } - return true; -} - -static bool bkey_nocow_lock(struct bch_fs *c, struct 
moving_context *ctxt, struct bkey_ptrs_c ptrs) -{ - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) - return __bkey_nocow_lock(c, ctxt, ptrs, ptr); - } - - return true; -} - -noinline_for_stack -static void trace_io_move_finish2(struct data_update *u, - struct bkey_i *new, - struct bkey_i *insert) -{ - struct bch_fs *c = u->op.c; - struct printbuf buf = PRINTBUF; - - prt_newline(&buf); - - bch2_data_update_to_text(&buf, u); - prt_newline(&buf); - - prt_str_indented(&buf, "new replicas:\t"); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - prt_newline(&buf); - - prt_str_indented(&buf, "insert:\t"); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_newline(&buf); - - trace_io_move_finish(c, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static void trace_io_move_fail2(struct data_update *m, - struct bkey_s_c new, - struct bkey_s_c wrote, - struct bkey_i *insert, - const char *msg) -{ - struct bch_fs *c = m->op.c; - struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - struct printbuf buf = PRINTBUF; - unsigned rewrites_found = 0; - - if (!trace_io_move_fail_enabled()) - return; - - prt_str(&buf, msg); - - if (insert) { - const union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct extent_ptr_decoded p; - - unsigned ptr_bit = 1; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { - if ((ptr_bit & m->data_opts.rewrite_ptrs) && - (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && - !ptr->cached) - rewrites_found |= ptr_bit; - ptr_bit <<= 1; - } - } - - prt_str(&buf, "rewrites found:\t"); - bch2_prt_u64_base2(&buf, rewrites_found); - prt_newline(&buf); - - bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, new); - - prt_str(&buf, "\nwrote: "); - bch2_bkey_val_to_text(&buf, c, wrote); - - if (insert) { - prt_str(&buf, "\ninsert: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - } - - trace_io_move_fail(c, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static void trace_data_update2(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct bch_fs *c = m->op.c; - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_data_update(c, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static void trace_io_move_created_rebalance2(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct bch_fs *c = m->op.c; - struct printbuf buf = PRINTBUF; - - bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_io_move_created_rebalance(c, buf.buf); - printbuf_exit(&buf); - - this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); -} - -noinline_for_stack -static int data_update_invalid_bkey(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct 
bch_fs *c = m->op.c; - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_str(&buf, "about to insert invalid key in data update path"); - prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_newline(&buf); - - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - return bch_err_throw(c, invalid_bkey); -} - -static int __bch2_data_update_index_update(struct btree_trans *trans, - struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_iter iter; - struct data_update *m = container_of(op, struct data_update, op); - int ret = 0; - - bch2_trans_iter_init(trans, &iter, m->btree_id, - bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k), - BTREE_ITER_slots|BTREE_ITER_intent); - - while (1) { - struct bkey_s_c k; - struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - struct bkey_i *insert = NULL; - struct bkey_i_extent *new; - const union bch_extent_entry *entry_c; - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_extent_ptr *ptr; - const struct bch_extent_ptr *ptr_c; - struct bpos next_pos; - bool should_check_enospc; - s64 i_sectors_delta = 0, disk_sectors_delta = 0; - unsigned rewrites_found = 0, durability, ptr_bit; - - bch2_trans_begin(trans); - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys)); - - if (!bch2_extents_match(k, old)) { - trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), - NULL, "no match:"); - goto nowork; - } - - insert = bch2_trans_kmalloc(trans, - bkey_bytes(k.k) + - bkey_val_bytes(&new->k) + - sizeof(struct bch_extent_rebalance)); - ret = PTR_ERR_OR_ZERO(insert); - if (ret) - goto err; - - bkey_reassemble(insert, k); - - new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys)); - bch2_cut_front(iter.pos, &new->k_i); - - bch2_cut_front(iter.pos, insert); - bch2_cut_back(new->k.p, insert); - bch2_cut_back(insert->k.p, &new->k_i); - - /* - * @old: extent that we read from - * @insert: key that we're going to update, initialized from - * extent currently in btree - same as @old unless we raced with - * other updates - * @new: extent with new pointers that we'll be adding to @insert - * - * Fist, drop rewrite_ptrs from @new: - */ - ptr_bit = 1; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { - if ((ptr_bit & m->data_opts.rewrite_ptrs) && - (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && - !ptr->cached) { - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), ptr); - rewrites_found |= ptr_bit; - } - ptr_bit <<= 1; - } - - if (m->data_opts.rewrite_ptrs && - !rewrites_found && - bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { - trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); - goto nowork; - } - - /* - * A replica that we just wrote might conflict with a replica - * that we want to keep, due to racing with another move: - */ -restart_drop_conflicting_replicas: - extent_for_each_ptr(extent_i_to_s(new), ptr) - if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) && - !ptr_c->cached) { - 
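-			/*
-			 * bch2_bkey_drop_ptr_noerror() memmoves the remaining
-			 * entries, invalidating the iterator; restarting from
-			 * the top after every drop is the simple way to stay
-			 * correct here.
-			 */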
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr); - goto restart_drop_conflicting_replicas; - } - - if (!bkey_val_u64s(&new->k)) { - trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); - goto nowork; - } - - /* Now, drop pointers that conflict with what we just wrote: */ - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) - if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev))) - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); - - durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) + - bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); - - /* Now, drop excess replicas: */ - scoped_guard(rcu) { -restart_drop_extra_replicas: - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { - unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); - - if (!p.ptr.cached && - durability - ptr_durability >= m->op.opts.data_replicas) { - durability -= ptr_durability; - - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), &entry->ptr); - goto restart_drop_extra_replicas; - } - } - } - - /* Finally, add the pointers we just wrote: */ - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) - bch2_extent_ptr_decoded_append(insert, &p); - - bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert)); - - ret = bch2_sum_sector_overwrites(trans, &iter, insert, - &should_check_enospc, - &i_sectors_delta, - &disk_sectors_delta); - if (ret) - goto err; - - if (disk_sectors_delta > (s64) op->res.sectors) { - ret = bch2_disk_reservation_add(c, &op->res, - disk_sectors_delta - op->res.sectors, - !should_check_enospc - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto out; - } - - next_pos = insert->k.p; - - /* - * Check for nonce offset inconsistency: - * This is debug code - we've been seeing this bug rarely, and - * it's been hard to reproduce, so this should give us some more - * information when it does occur: - */ - int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), - (struct bkey_validate_context) { - .btree = m->btree_id, - .flags = BCH_VALIDATE_commit, - }); - if (unlikely(invalid)) { - ret = data_update_invalid_bkey(m, old, k, insert); - goto out; - } - - ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: - bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: - bch2_insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, bkey_start_pos(&insert->k)) ?: - bch2_insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: - bch2_trans_update(trans, &iter, insert, - BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto err; - - if (trace_data_update_enabled()) - trace_data_update2(m, old, k, insert); - - if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > - bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) - trace_io_move_created_rebalance2(m, old, k, insert); - - ret = bch2_trans_commit(trans, &op->res, - NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - m->data_opts.btree_insert_flags); - if (ret) - goto err; - - bch2_btree_iter_set_pos(trans, &iter, next_pos); - - this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); - if (trace_io_move_finish_enabled()) - trace_io_move_finish2(m, &new->k_i, insert); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - if (ret) - break; -next: - while (bkey_ge(iter.pos, 
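-		/* i.e. pop keys we've fully advanced past: k.p <= iter.pos */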
bch2_keylist_front(&op->insert_keys)->k.p)) { - bch2_keylist_pop_front(&op->insert_keys); - if (bch2_keylist_empty(&op->insert_keys)) - goto out; - } - continue; -nowork: - if (m->stats) { - BUG_ON(k.k->p.offset <= iter.pos.offset); - atomic64_inc(&m->stats->keys_raced); - atomic64_add(k.k->p.offset - iter.pos.offset, - &m->stats->sectors_raced); - } - - count_event(c, io_move_fail); - - bch2_btree_iter_advance(trans, &iter); - goto next; - } -out: - bch2_trans_iter_exit(trans, &iter); - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); - return ret; -} - -int bch2_data_update_index_update(struct bch_write_op *op) -{ - return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); -} - -void bch2_data_update_read_done(struct data_update *m) -{ - m->read_done = true; - - /* write bio must own pages: */ - BUG_ON(!m->op.wbio.bio.bi_vcnt); - - m->op.crc = m->rbio.pick.crc; - m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; - - this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); - - closure_call(&m->op.cl, bch2_write, NULL, NULL); -} - -void bch2_data_update_exit(struct data_update *update) -{ - struct bch_fs *c = update->op.c; - struct bkey_s_c k = bkey_i_to_s_c(update->k.k); - - bch2_bio_free_pages_pool(c, &update->op.wbio.bio); - kfree(update->bvecs); - update->bvecs = NULL; - - if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); - bkey_put_dev_refs(c, k); - bch2_disk_reservation_put(c, &update->op.res); - bch2_bkey_buf_exit(&update->k, c); -} - -static noinline_for_stack -int bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) -{ - struct bch_fs *c = update->op.c; - struct bkey_i_extent *e; - struct write_point *wp; - struct closure cl; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - closure_init_stack(&cl); - bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); - - while (bpos_lt(update->op.pos, update->k.k->k.p)) { - unsigned sectors = update->k.k->k.p.offset - - update->op.pos.offset; - - bch2_trans_begin(trans); - - bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, - BTREE_ITER_slots); - ret = lockrestart_do(trans, ({ - k = bch2_btree_iter_peek_slot(trans, &iter); - bkey_err(k); - })); - bch2_trans_iter_exit(trans, &iter); - - if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k))) - break; - - e = bkey_extent_init(update->op.insert_keys.top); - e->k.p = update->op.pos; - - ret = bch2_alloc_sectors_start_trans(trans, - update->op.target, - false, - update->op.write_point, - &update->op.devs_have, - update->op.nr_replicas, - update->op.nr_replicas, - update->op.watermark, - 0, &cl, &wp); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { - bch2_trans_unlock(trans); - closure_sync(&cl); - continue; - } - - bch_err_fn_ratelimited(c, ret); - - if (ret) - break; - - sectors = min(sectors, wp->sectors_free); - - bch2_key_resize(&e->k, sectors); - - bch2_open_bucket_get(c, wp, &update->op.open_buckets); - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - - update->op.pos.offset += sectors; - - extent_for_each_ptr(extent_i_to_s(e), ptr) - ptr->unwritten = true; - bch2_keylist_push(&update->op.insert_keys); - - ret = __bch2_data_update_index_update(trans, &update->op); - - bch2_open_buckets_put(c, &update->op.open_buckets); - - if (ret) - break; - } - - if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock(trans); - closure_sync(&cl); - } - - return ret; -} - -void 
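-/*
- * Dump a data update's options in human-readable form; used by debugfs and
- * by the io_move tracepoints above. The tabstop makes the "field:\tvalue"
- * pairs line up.
- */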
bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 20); - - prt_str_indented(out, "rewrite ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); - prt_newline(out); - - prt_str_indented(out, "kill ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->kill_ptrs); - prt_newline(out); - - prt_str_indented(out, "target:\t"); - bch2_target_to_text(out, c, data_opts->target); - prt_newline(out); - - prt_str_indented(out, "compression:\t"); - bch2_compression_opt_to_text(out, io_opts->background_compression); - prt_newline(out); - - prt_str_indented(out, "opts.replicas:\t"); - prt_u64(out, io_opts->data_replicas); - prt_newline(out); - - prt_str_indented(out, "extra replicas:\t"); - prt_u64(out, data_opts->extra_replicas); - prt_newline(out); - - prt_str_indented(out, "scrub:\t"); - prt_u64(out, data_opts->scrub); -} - -void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) -{ - prt_str(out, bch2_data_update_type_strs[m->type]); - prt_newline(out); - - bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - prt_newline(out); - - prt_str_indented(out, "old key:\t"); - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); -} - -void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) -{ - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); - prt_newline(out); - printbuf_indent_add(out, 2); - bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - - if (!m->read_done) { - prt_printf(out, "read:\n"); - printbuf_indent_add(out, 2); - bch2_read_bio_to_text(out, &m->rbio); - } else { - prt_printf(out, "write:\n"); - printbuf_indent_add(out, 2); - bch2_write_op_to_text(out, &m->op); - } - printbuf_indent_sub(out, 4); -} - -int bch2_extent_drop_ptrs(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bch_fs *c = trans->c; - struct bkey_i *n; - int ret; - - n = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - while (data_opts->kill_ptrs) { - unsigned i = 0, drop = __fls(data_opts->kill_ptrs); - - bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop); - data_opts->kill_ptrs ^= 1U << drop; - } - - /* - * If the new extent no longer has any pointers, bch2_extent_normalize() - * will do the appropriate thing with it (turning it into a - * KEY_TYPE_error key, or just a discard if it was a cached extent) - */ - bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n)); - - /* - * Since we're not inserting through an extent iterator - * (BTREE_ITER_all_snapshots iterators aren't extent iterators), - * we aren't using the extent overwrite path to delete, we're - * just using the normal key deletion path: - */ - if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents)) - n->k.size = 0; - - return bch2_trans_relock(trans) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - -static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts, - unsigned buf_bytes) -{ - unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); - - m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); - if (!m->bvecs) - return -ENOMEM; - - bio_init(&m->rbio.bio, NULL, m->bvecs, 
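-			 /*
-			  * The read bio and the write bio share this one bvec
-			  * array: pages are allocated once (for the write bio,
-			  * below) and the read fills them in place - hence,
-			  * presumably, the BCH_WRITE_pages_stable flag set by
-			  * bch2_data_update_init().
-			  */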
nr_vecs, REQ_OP_READ); - bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); - - if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { - kfree(m->bvecs); - m->bvecs = NULL; - return -ENOMEM; - } - - rbio_init(&m->rbio.bio, c, *io_opts, NULL); - m->rbio.data_update = true; - m->rbio.bio.bi_iter.bi_size = buf_bytes; - m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); - m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - return 0; -} - -int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - /* write path might have to decompress data: */ - unsigned buf_bytes = 0; - bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - - return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); -} - -static int can_write_extent(struct bch_fs *c, struct data_update *m) -{ - if ((m->op.flags & BCH_WRITE_alloc_nowait) && - unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) - return bch_err_throw(c, data_update_done_would_block); - - unsigned target = m->op.flags & BCH_WRITE_only_specified_devs - ? m->op.target - : 0; - struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); - - darray_for_each(m->op.devs_have, i) - __clear_bit(*i, devs.d); - - guard(rcu)(); - - unsigned nr_replicas = 0, i; - for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); - if (!ca) - continue; - - struct bch_dev_usage usage; - bch2_dev_usage_read_fast(ca, &usage); - - if (!dev_buckets_free(ca, usage, m->op.watermark)) - continue; - - nr_replicas += ca->mi.durability; - if (nr_replicas >= m->op.nr_replicas) - break; - } - - if (!nr_replicas) - return bch_err_throw(c, data_update_done_no_rw_devs); - if (nr_replicas < m->op.nr_replicas) - return bch_err_throw(c, insufficient_devices); - return 0; -} - -int bch2_data_update_init(struct btree_trans *trans, - struct btree_iter *iter, - struct moving_context *ctxt, - struct data_update *m, - struct write_point_specifier wp, - struct bch_io_opts *io_opts, - struct data_update_opts data_opts, - enum btree_id btree_id, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - if (k.k->p.snapshot) { - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) { - /* Can't repair yet, waiting on other recovery passes */ - return bch_err_throw(c, data_update_done_no_snapshot); - } - if (ret < 0) - return ret; - if (ret) /* key was deleted */ - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - bch_err_throw(c, data_update_done_no_snapshot); - ret = 0; - } - - bch2_bkey_buf_init(&m->k); - bch2_bkey_buf_reassemble(&m->k, c, k); - m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc - ? BCH_DATA_UPDATE_copygc - : BCH_DATA_UPDATE_rebalance; - m->btree_id = btree_id; - m->data_opts = data_opts; - m->ctxt = ctxt; - m->stats = ctxt ? 
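-		   /* no moving_context (e.g. a promote): no stats to account */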
ctxt->stats : NULL; - - bch2_write_op_init(&m->op, c, *io_opts); - m->op.pos = bkey_start_pos(k.k); - m->op.version = k.k->bversion; - m->op.target = data_opts.target; - m->op.write_point = wp; - m->op.nr_replicas = 0; - m->op.flags |= BCH_WRITE_pages_stable| - BCH_WRITE_pages_owned| - BCH_WRITE_data_encoded| - BCH_WRITE_move| - m->data_opts.write_flags; - m->op.compression_opt = io_opts->background_compression; - m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - - unsigned durability_have = 0, durability_removing = 0; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; - unsigned buf_bytes = 0; - bool unwritten = false; - - unsigned ptr_bit = 1; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (!p.ptr.cached) { - guard(rcu)(); - if (ptr_bit & m->data_opts.rewrite_ptrs) { - if (crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; - - m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!(ptr_bit & m->data_opts.kill_ptrs)) { - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); - durability_have += bch2_extent_ptr_durability(c, &p); - } - } - - /* - * op->csum_type is normally initialized from the fs/file's - * current options - but if an extent is encrypted, we require - * that it stays encrypted: - */ - if (bch2_csum_type_is_encryption(p.crc.csum_type)) { - m->op.nonce = p.crc.nonce + p.crc.offset; - m->op.csum_type = p.crc.csum_type; - } - - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) - m->op.incompressible = true; - - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - unwritten |= p.ptr.unwritten; - - ptr_bit <<= 1; - } - - unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); - - /* - * If current extent durability is less than io_opts.data_replicas, - * we're not trying to rereplicate the extent up to data_replicas here - - * unless extra_replicas was specified - * - * Increasing replication is an explicit operation triggered by - * rereplicate, currently, so that users don't get an unexpected -ENOSPC - */ - m->op.nr_replicas = min(durability_removing, durability_required) + - m->data_opts.extra_replicas; - - /* - * If device(s) were set to durability=0 after data was written to them - * we can end up with a duribilty=0 extent, and the normal algorithm - * that tries not to increase durability doesn't work: - */ - if (!(durability_have + durability_removing)) - m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); - - m->op.nr_replicas_required = m->op.nr_replicas; - - /* - * It might turn out that we don't need any new replicas, if the - * replicas or durability settings have been changed since the extent - * was written: - */ - if (!m->op.nr_replicas) { - m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; - m->data_opts.rewrite_ptrs = 0; - /* if iter == NULL, it's just a promote */ - if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); - if (!ret) - ret = bch_err_throw(c, data_update_done_no_writes_needed); - goto out_bkey_buf_exit; - } - - /* - * Check if the allocation will succeed, to avoid getting an error later - * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless - * read: - * - * This guards against - * - BCH_WRITE_alloc_nowait allocations failing (promotes) - * - 
Destination target full - * - Device(s) in destination target offline - * - Insufficient durability available in destination target - * (i.e. trying to move a durability=2 replica to a target with a - * single durability=2 device) - */ - ret = can_write_extent(c, m); - if (ret) - goto out_bkey_buf_exit; - - if (reserve_sectors) { - ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, - m->data_opts.extra_replicas - ? 0 - : BCH_DISK_RESERVATION_NOFAIL); - if (ret) - goto out_bkey_buf_exit; - } - - if (!bkey_get_dev_refs(c, k)) { - ret = bch_err_throw(c, data_update_done_no_dev_refs); - goto out_put_disk_res; - } - - if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, ptrs)) { - ret = bch_err_throw(c, nocow_lock_blocked); - goto out_put_dev_refs; - } - - if (unwritten) { - ret = bch2_update_unwritten_extent(trans, m) ?: - bch_err_throw(c, data_update_done_unwritten); - goto out_nocow_unlock; - } - - bch2_trans_unlock(trans); - - ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); - if (ret) - goto out_nocow_unlock; - - return 0; -out_nocow_unlock: - if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); -out_put_dev_refs: - bkey_put_dev_refs(c, k); -out_put_disk_res: - bch2_disk_reservation_put(c, &m->op.res); -out_bkey_buf_exit: - bch2_bkey_buf_exit(&m->k, c); - return ret; -} - -void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - - bkey_for_each_ptr(ptrs, ptr) { - if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) { - opts->kill_ptrs |= ptr_bit; - opts->rewrite_ptrs ^= ptr_bit; - } - - ptr_bit <<= 1; - } -} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h deleted file mode 100644 index 5e14d13568de8f..00000000000000 --- a/fs/bcachefs/data_update.h +++ /dev/null @@ -1,93 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _BCACHEFS_DATA_UPDATE_H -#define _BCACHEFS_DATA_UPDATE_H - -#include "bkey_buf.h" -#include "io_read.h" -#include "io_write_types.h" - -struct moving_context; - -struct data_update_opts { - unsigned rewrite_ptrs; - unsigned kill_ptrs; - u16 target; - u8 extra_replicas; - unsigned btree_insert_flags; - unsigned write_flags; - - int read_dev; - bool scrub; -}; - -void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, - struct bch_io_opts *, struct data_update_opts *); - -#define BCH_DATA_UPDATE_TYPES() \ - x(copygc, 0) \ - x(rebalance, 1) \ - x(promote, 2) - -enum bch_data_update_types { -#define x(n, id) BCH_DATA_UPDATE_##n = id, - BCH_DATA_UPDATE_TYPES() -#undef x -}; - -struct data_update { - enum bch_data_update_types type; - /* extent being updated: */ - bool read_done; - enum btree_id btree_id; - struct bkey_buf k; - struct data_update_opts data_opts; - struct moving_context *ctxt; - struct bch_move_stats *stats; - - struct bch_read_bio rbio; - struct bch_write_op op; - struct bio_vec *bvecs; -}; - -struct promote_op { - struct rcu_head rcu; - u64 start_time; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - - struct rhash_head hash; - struct bpos pos; - - struct work_struct work; - struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ -}; - -void bch2_data_update_to_text(struct printbuf *, struct data_update *); -void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); - -int bch2_data_update_index_update(struct bch_write_op *); - -void bch2_data_update_read_done(struct data_update *); - -int 
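-/*
- * Rough lifecycle of a data update (a sketch; the real callers are in the
- * move/rebalance/promote paths):
- *
- *	bch2_data_update_init()		take dev refs/nocow locks, set up bios
- *	bch2_data_update_read_done()	called when the read completes,
- *					kicks off the write
- *	bch2_data_update_index_update()	write completion updates the index
- *	bch2_data_update_exit()		drop refs, locks and reservations
- */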
bch2_extent_drop_ptrs(struct btree_trans *, - struct btree_iter *, - struct bkey_s_c, - struct bch_io_opts *, - struct data_update_opts *); - -int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, - struct bch_io_opts *); - -void bch2_data_update_exit(struct data_update *); -int bch2_data_update_init(struct btree_trans *, struct btree_iter *, - struct moving_context *, - struct data_update *, - struct write_point_specifier, - struct bch_io_opts *, struct data_update_opts, - enum btree_id, struct bkey_s_c); -void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); - -#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c deleted file mode 100644 index 07c2a0f73cc204..00000000000000 --- a/fs/bcachefs/debug.c +++ /dev/null @@ -1,996 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Assorted bcachefs debug code - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "data_update.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "fsck.h" -#include "inode.h" -#include "journal_reclaim.h" -#include "super.h" - -#include -#include -#include -#include -#include - -static struct dentry *bch_debug; - -static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, - struct extent_ptr_decoded pick) -{ - struct btree *v = c->verify_data; - struct btree_node *n_ondisk = c->verify_ondisk; - struct btree_node *n_sorted = c->verify_data->data; - struct bset *sorted, *inmemory = &b->data->keys; - struct bio *bio; - bool failed = false; - - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_btree_verify_replicas); - if (!ca) - return false; - - bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_sorted, btree_buf_bytes(b)), - REQ_OP_READ|REQ_META, - GFP_NOFS, - &c->btree_bio); - bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_sorted, btree_buf_bytes(b)); - - submit_bio_wait(bio); - - bio_put(bio); - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_verify_replicas); - - memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); - - v->written = 0; - if (bch2_btree_node_read_done(c, ca, v, NULL, NULL)) - return false; - - n_sorted = c->verify_data->data; - sorted = &n_sorted->keys; - - if (inmemory->u64s != sorted->u64s || - memcmp(inmemory->start, - sorted->start, - vstruct_end(inmemory) - (void *) inmemory->start)) { - unsigned offset = 0, sectors; - struct bset *i; - unsigned j; - - console_lock(); - - printk(KERN_ERR "*** in memory:\n"); - bch2_dump_bset(c, b, inmemory, 0); - - printk(KERN_ERR "*** read back in:\n"); - bch2_dump_bset(c, v, sorted, 0); - - while (offset < v->written) { - if (!offset) { - i = &n_ondisk->keys; - sectors = vstruct_blocks(n_ondisk, c->block_bits) << - c->block_bits; - } else { - struct btree_node_entry *bne = - (void *) n_ondisk + (offset << 9); - i = &bne->keys; - - sectors = vstruct_blocks(bne, c->block_bits) << - c->block_bits; - } - - printk(KERN_ERR "*** on disk block %u:\n", offset); - bch2_dump_bset(c, b, i, offset); - - offset += sectors; - } - - for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) - if (inmemory->_data[j] != sorted->_data[j]) - break; - - console_unlock(); - 
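-		/*
-		 * Note j counts u64s, not keys; it's just a starting point
-		 * for locating the mismatch in the dumps printed above.
-		 */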
bch_err(c, "verify failed at key %u", j); - - failed = true; - } - - if (v->written != b->written) { - bch_err(c, "written wrong: expected %u, got %u", - b->written, v->written); - failed = true; - } - - return failed; -} - -void __bch2_btree_verify(struct bch_fs *c, struct btree *b) -{ - struct bkey_ptrs_c ptrs; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; - struct btree *v; - struct bset *inmemory = &b->data->keys; - struct bkey_packed *k; - bool failed = false; - - if (c->opts.nochanges) - return; - - bch2_btree_node_io_lock(b); - mutex_lock(&c->verify_lock); - - if (!c->verify_ondisk) { - c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); - if (!c->verify_ondisk) - goto out; - } - - if (!c->verify_data) { - c->verify_data = __bch2_btree_node_mem_alloc(c); - if (!c->verify_data) - goto out; - } - - BUG_ON(b->nsets != 1); - - for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) - if (k->type == KEY_TYPE_btree_ptr_v2) - ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0; - - v = c->verify_data; - bkey_copy(&v->key, &b->key); - v->c.level = b->c.level; - v->c.btree_id = b->c.btree_id; - bch2_btree_keys_init(v); - - ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); - bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) - failed |= bch2_btree_verify_replica(c, b, p); - - if (failed) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf); - printbuf_exit(&buf); - } -out: - mutex_unlock(&c->verify_lock); - bch2_btree_node_io_unlock(b); -} - -void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, - const struct btree *b) -{ - struct btree_node *n_ondisk = NULL; - struct extent_ptr_decoded pick; - struct bch_dev *ca; - struct bio *bio = NULL; - unsigned offset = 0; - int ret; - - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { - prt_printf(out, "error getting device to read from: invalid device\n"); - return; - } - - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_btree_node_ondisk_to_text); - if (!ca) { - prt_printf(out, "error getting device to read from: not online\n"); - return; - } - - n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); - if (!n_ondisk) { - prt_printf(out, "memory allocation failure\n"); - goto out; - } - - bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_ondisk, btree_buf_bytes(b)), - REQ_OP_READ|REQ_META, - GFP_NOFS, - &c->btree_bio); - bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b)); - - ret = submit_bio_wait(bio); - if (ret) { - prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret)); - goto out; - } - - while (offset < btree_sectors(c)) { - struct bset *i; - struct nonce nonce; - struct bch_csum csum; - struct bkey_packed *k; - unsigned sectors; - - if (!offset) { - i = &n_ondisk->keys; - - if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { - prt_printf(out, "unknown checksum type at offset %u: %llu\n", - offset, BSET_CSUM_TYPE(i)); - goto out; - } - - nonce = btree_nonce(i, offset << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); - - if (bch2_crc_cmp(csum, n_ondisk->csum)) { - prt_printf(out, "invalid checksum\n"); - goto out; - } - - bset_encrypt(c, i, offset << 9); - - sectors = vstruct_sectors(n_ondisk, c->block_bits); - } else { - struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); - - i = &bne->keys; - - 
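-			/*
-			 * A seq mismatch means this bset was never written as
-			 * part of this node: we've walked past the last valid
-			 * bset.
-			 */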
if (i->seq != n_ondisk->keys.seq) - break; - - if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { - prt_printf(out, "unknown checksum type at offset %u: %llu\n", - offset, BSET_CSUM_TYPE(i)); - goto out; - } - - nonce = btree_nonce(i, offset << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - - if (bch2_crc_cmp(csum, bne->csum)) { - prt_printf(out, "invalid checksum"); - goto out; - } - - bset_encrypt(c, i, offset << 9); - - sectors = vstruct_sectors(bne, c->block_bits); - } - - prt_printf(out, " offset %u version %u, journal seq %llu\n", - offset, - le16_to_cpu(i->version), - le64_to_cpu(i->journal_seq)); - offset += sectors; - - printbuf_indent_add(out, 4); - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) { - struct bkey u; - - bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); - prt_newline(out); - } - - printbuf_indent_sub(out, 4); - } -out: - if (bio) - bio_put(bio); - kvfree(n_ondisk); - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_node_ondisk_to_text); -} - -#ifdef CONFIG_DEBUG_FS - -ssize_t bch2_debugfs_flush_buf(struct dump_iter *i) -{ - if (i->buf.pos) { - size_t bytes = min_t(size_t, i->buf.pos, i->size); - int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes); - - i->ret += copied; - i->ubuf += copied; - i->size -= copied; - i->buf.pos -= copied; - memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); - - if (i->buf.last_newline >= copied) - i->buf.last_newline -= copied; - if (i->buf.last_field >= copied) - i->buf.last_field -= copied; - - if (copied != bytes) - return -EFAULT; - } - - return i->size ? 0 : i->ret; -} - -static int bch2_dump_open(struct inode *inode, struct file *file) -{ - struct btree_debug *bd = inode->i_private; - struct dump_iter *i; - - i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) - return -ENOMEM; - - file->private_data = i; - i->from = POS_MIN; - i->iter = 0; - i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); - i->id = bd->id; - i->buf = PRINTBUF; - - return 0; -} - -int bch2_dump_release(struct inode *inode, struct file *file) -{ - struct dump_iter *i = file->private_data; - - printbuf_exit(&i->buf); - kfree(i); - return 0; -} - -static ssize_t bch2_read_btree(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - return bch2_debugfs_flush_buf(i) ?: - bch2_trans_run(i->c, - for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - bch2_bkey_val_to_text(&i->buf, i->c, k); - prt_newline(&i->buf); - bch2_trans_unlock(trans); - i->from = bpos_successor(iter.pos); - bch2_debugfs_flush_buf(i); - }))) ?: - i->ret; -} - -static const struct file_operations btree_debug_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_read_btree, -}; - -static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - ssize_t ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - - if (bpos_eq(SPOS_MAX, i->from)) - return i->ret; - - return bch2_trans_run(i->c, - for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ - bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = !bpos_eq(SPOS_MAX, b->key.k.p) - ? 
bpos_successor(b->key.k.p) - : b->key.k.p; - - drop_locks_do(trans, bch2_debugfs_flush_buf(i)); - }))) ?: i->ret; -} - -static const struct file_operations btree_format_debug_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_read_btree_formats, -}; - -static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - return bch2_debugfs_flush_buf(i) ?: - bch2_trans_run(i->c, - for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - struct btree_path_level *l = - &btree_iter_path(trans, &iter)->l[0]; - struct bkey_packed *_k = - bch2_btree_node_iter_peek(&l->iter, l->b); - - if (bpos_gt(l->b->key.k.p, i->prev_node)) { - bch2_btree_node_to_text(&i->buf, i->c, l->b); - i->prev_node = l->b->key.k.p; - } - - bch2_bfloat_to_text(&i->buf, l->b, _k); - bch2_trans_unlock(trans); - i->from = bpos_successor(iter.pos); - bch2_debugfs_flush_buf(i); - }))) ?: - i->ret; -} - -static const struct file_operations bfloat_failed_debug_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_read_bfloat_failed, -}; - -static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, - struct btree *b) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - prt_printf(out, "%px ", b); - bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); - prt_printf(out, "\n"); - - printbuf_indent_add(out, 2); - - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); - prt_newline(out); - - prt_printf(out, "flags:\t"); - prt_bitflags(out, bch2_btree_node_flags, b->flags); - prt_newline(out); - - prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL); - prt_printf(out, "written:\t%u\n", b->written); - prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked)); - prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable); - - prt_printf(out, "journal pin %px:\t%llu\n", - &b->writes[0].journal, b->writes[0].journal.seq); - prt_printf(out, "journal pin %px:\t%llu\n", - &b->writes[1].journal, b->writes[1].journal.seq); - - prt_printf(out, "ob:\t%u\n", b->ob.nr); - - printbuf_indent_sub(out, 2); -} - -static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - bool done = false; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - do { - ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - - i->buf.atomic++; - scoped_guard(rcu) { - struct bucket_table *tbl = - rht_dereference_rcu(c->btree_cache.table.tbl, - &c->btree_cache.table); - if (i->iter < tbl->size) { - struct rhash_head *pos; - struct btree *b; - - rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) - bch2_cached_btree_node_to_text(&i->buf, c, b); - i->iter++; - } else { - done = true; - } - } - --i->buf.atomic; - } while (!done); - - if (i->buf.allocation_failure) - ret = -ENOMEM; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static const struct file_operations cached_btree_nodes_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_cached_btree_nodes_read, -}; - -typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); - -static void 
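-/*
- * Simple insertion sort over a linked list; quadratic, but only used on the
- * (small) list of btree transactions. Sorting by pointer gives the iteration
- * below a stable cursor ((ulong) trans <= i->iter) that survives dropping and
- * retaking btree_trans_lock.
- */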
list_sort(struct list_head *head, list_cmp_fn cmp) -{ - struct list_head *pos; - - list_for_each(pos, head) - while (!list_is_last(pos, head) && - cmp(pos, pos->next) > 0) { - struct list_head *pos2, *next = pos->next; - - list_del(next); - list_for_each(pos2, head) - if (cmp(next, pos2) < 0) - goto pos_found; - BUG(); -pos_found: - list_add_tail(next, pos2); - } -} - -static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) -{ - return cmp_int(l, r); -} - -static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - struct btree_trans *trans; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -restart: - seqmutex_lock(&c->btree_trans_lock); - list_sort(&c->btree_trans_list, list_ptr_order_cmp); - - list_for_each_entry(trans, &c->btree_trans_list, list) { - if ((ulong) trans <= i->iter) - continue; - - i->iter = (ulong) trans; - - if (!closure_get_not_zero(&trans->ref)) - continue; - - if (!trans->srcu_held) { - closure_put(&trans->ref); - continue; - } - - u32 seq = seqmutex_unlock(&c->btree_trans_lock); - - bch2_btree_trans_to_text(&i->buf, trans); - - prt_printf(&i->buf, "backtrace:\n"); - printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); - printbuf_indent_sub(&i->buf, 2); - prt_newline(&i->buf); - - closure_put(&trans->ref); - - ret = bch2_debugfs_flush_buf(i); - if (ret) - goto unlocked; - - if (!seqmutex_relock(&c->btree_trans_lock, seq)) - goto restart; - } - seqmutex_unlock(&c->btree_trans_lock); -unlocked: - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - - if (i->buf.allocation_failure) - ret = -ENOMEM; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static const struct file_operations btree_transactions_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_btree_transactions_read, -}; - -static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - bool done = false; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - while (1) { - err = bch2_debugfs_flush_buf(i); - if (err) - return err; - - if (!i->size) - break; - - if (done) - break; - - done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); - i->iter++; - } - - if (i->buf.allocation_failure) - return -ENOMEM; - - return i->ret; -} - -static const struct file_operations journal_pins_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_journal_pins_read, -}; - -static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (!i->iter) { - bch2_btree_updates_to_text(&i->buf, c); - i->iter++; - } - - err = bch2_debugfs_flush_buf(i); - if (err) - return err; - - if (i->buf.allocation_failure) - return -ENOMEM; - - return i->ret; -} - -static const struct file_operations btree_updates_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_btree_updates_read, -}; - -static int btree_transaction_stats_open(struct inode *inode, 
struct file *file) -{ - struct bch_fs *c = inode->i_private; - struct dump_iter *i; - - i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) - return -ENOMEM; - - i->iter = 1; - i->c = c; - i->buf = PRINTBUF; - file->private_data = i; - - return 0; -} - -static int btree_transaction_stats_release(struct inode *inode, struct file *file) -{ - struct dump_iter *i = file->private_data; - - printbuf_exit(&i->buf); - kfree(i); - - return 0; -} - -static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - while (1) { - struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; - - err = bch2_debugfs_flush_buf(i); - if (err) - return err; - - if (!i->size) - break; - - if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || - !bch2_btree_transaction_fns[i->iter]) - break; - - prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); - printbuf_indent_add(&i->buf, 2); - - mutex_lock(&s->lock); - - prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - printbuf_indent_add(&i->buf, 2); - bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); - printbuf_indent_sub(&i->buf, 2); -#endif - - prt_printf(&i->buf, "Transaction duration:\n"); - - printbuf_indent_add(&i->buf, 2); - bch2_time_stats_to_text(&i->buf, &s->duration); - printbuf_indent_sub(&i->buf, 2); - - if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { - prt_printf(&i->buf, "Lock hold times:\n"); - - printbuf_indent_add(&i->buf, 2); - bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); - printbuf_indent_sub(&i->buf, 2); - } - - if (s->max_paths_text) { - prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths); - - printbuf_indent_add(&i->buf, 2); - prt_str_indented(&i->buf, s->max_paths_text); - printbuf_indent_sub(&i->buf, 2); - } - - mutex_unlock(&s->lock); - - printbuf_indent_sub(&i->buf, 2); - prt_newline(&i->buf); - i->iter++; - } - - if (i->buf.allocation_failure) - return -ENOMEM; - - return i->ret; -} - -static const struct file_operations btree_transaction_stats_op = { - .owner = THIS_MODULE, - .open = btree_transaction_stats_open, - .release = btree_transaction_stats_release, - .read = btree_transaction_stats_read, -}; - -/* walk btree transactions until we find a deadlock and print it */ -static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct btree_trans *trans; - ulong iter = 0; -restart: - seqmutex_lock(&c->btree_trans_lock); - list_sort(&c->btree_trans_list, list_ptr_order_cmp); - - list_for_each_entry(trans, &c->btree_trans_list, list) { - if ((ulong) trans <= iter) - continue; - - iter = (ulong) trans; - - if (!closure_get_not_zero(&trans->ref)) - continue; - - u32 seq = seqmutex_unlock(&c->btree_trans_lock); - - bool found = bch2_check_for_deadlock(trans, out) != 0; - - closure_put(&trans->ref); - - if (found) - return; - - if (!seqmutex_relock(&c->btree_trans_lock, seq)) - goto restart; - } - seqmutex_unlock(&c->btree_trans_lock); -} - -typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); - -static ssize_t bch2_simple_print(struct file *file, char __user *buf, - size_t size, loff_t *ppos, - fs_to_text_fn fn) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (!i->iter) { - 
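-		/* generate everything up front; later reads just drain i->buf */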
fn(&i->buf, c); - i->iter++; - } - - if (i->buf.allocation_failure) - ret = -ENOMEM; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); -} - -static const struct file_operations btree_deadlock_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_btree_deadlock_read, -}; - -static ssize_t bch2_write_points_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); -} - -static const struct file_operations write_points_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_write_points_read, -}; - -void bch2_fs_debug_exit(struct bch_fs *c) -{ - if (!IS_ERR_OR_NULL(c->fs_debug_dir)) - debugfs_remove_recursive(c->fs_debug_dir); -} - -static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd) -{ - struct dentry *d; - - d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir); - - debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops); - - debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops); - - debugfs_create_file("bfloat-failed", 0400, d, bd, - &bfloat_failed_debug_ops); -} - -void bch2_fs_debug_init(struct bch_fs *c) -{ - struct btree_debug *bd; - char name[100]; - - if (IS_ERR_OR_NULL(bch_debug)) - return; - - if (c->sb.multi_device) - snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); - else - strscpy(name, c->name, sizeof(name)); - - c->fs_debug_dir = debugfs_create_dir(name, bch_debug); - if (IS_ERR_OR_NULL(c->fs_debug_dir)) - return; - - debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, - c->btree_debug, &cached_btree_nodes_ops); - - debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, - c->btree_debug, &btree_transactions_ops); - - debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, - c->btree_debug, &journal_pins_ops); - - debugfs_create_file("btree_updates", 0400, c->fs_debug_dir, - c->btree_debug, &btree_updates_ops); - - debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, - c, &btree_transaction_stats_op); - - debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, - c->btree_debug, &btree_deadlock_ops); - - debugfs_create_file("write_points", 0400, c->fs_debug_dir, - c->btree_debug, &write_points_ops); - - bch2_fs_async_obj_debugfs_init(c); - - c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); - if (IS_ERR_OR_NULL(c->btree_debug_dir)) - return; - - for (bd = c->btree_debug; - bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); - bd++) { - bd->id = bd - c->btree_debug; - bch2_fs_debug_btree_init(c, bd); - } -} - -#endif - -void bch2_debug_exit(void) -{ - if (!IS_ERR_OR_NULL(bch_debug)) - debugfs_remove_recursive(bch_debug); -} - -int __init bch2_debug_init(void) -{ - bch_debug = debugfs_create_dir("bcachefs", NULL); - return 0; -} diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h deleted file mode 100644 index d88b1194b8acc0..00000000000000 --- a/fs/bcachefs/debug.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DEBUG_H -#define _BCACHEFS_DEBUG_H - -#include "bcachefs.h" - -struct bio; -struct btree; -struct bch_fs; - -void __bch2_btree_verify(struct bch_fs *, struct btree *); -void 
bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, - const struct btree *); - -static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) -{ - if (static_branch_unlikely(&bch2_verify_btree_ondisk)) - __bch2_btree_verify(c, b); -} - -#ifdef CONFIG_DEBUG_FS -struct dump_iter { - struct bch_fs *c; - struct async_obj_list *list; - enum btree_id id; - struct bpos from; - struct bpos prev_node; - u64 iter; - - struct printbuf buf; - - char __user *ubuf; /* destination user buffer */ - size_t size; /* size of requested read */ - ssize_t ret; /* bytes read so far */ -}; - -ssize_t bch2_debugfs_flush_buf(struct dump_iter *); -int bch2_dump_release(struct inode *, struct file *); - -void bch2_fs_debug_exit(struct bch_fs *); -void bch2_fs_debug_init(struct bch_fs *); -#else -static inline void bch2_fs_debug_exit(struct bch_fs *c) {} -static inline void bch2_fs_debug_init(struct bch_fs *c) {} -#endif - -void bch2_debug_exit(void); -int bch2_debug_init(void); - -#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c deleted file mode 100644 index 28875c5c86add7..00000000000000 --- a/fs/bcachefs/dirent.c +++ /dev/null @@ -1,766 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "extents.h" -#include "dirent.h" -#include "fs.h" -#include "keylist.h" -#include "str_hash.h" -#include "subvolume.h" - -#include - -#ifdef CONFIG_UNICODE -int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - *out_cf = (struct qstr) QSTR_INIT(NULL, 0); - - if (!bch2_fs_casefold_enabled(trans->c)) - return -EOPNOTSUPP; - - unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); - int ret = PTR_ERR_OR_ZERO(buf); - if (ret) - return ret; - - ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1); - if (ret <= 0) - return ret; - - *out_cf = (struct qstr) QSTR_INIT(buf, ret); - return 0; -} -#endif - -static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) -{ - if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) - return 0; - - unsigned bkey_u64s = bkey_val_u64s(d.k); - unsigned bkey_bytes = bkey_u64s * sizeof(u64); - u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; -#if CPU_BIG_ENDIAN - unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8; -#else - unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8; -#endif - - return bkey_bytes - - (d.v->d_casefold - ? offsetof(struct bch_dirent, d_cf_name_block.d_names) - : offsetof(struct bch_dirent, d_name)) - - trailing_nuls; -} - -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) -{ - if (d.v->d_casefold) { - unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); - return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len); - } else { - return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); - } -} - -static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d) -{ - if (d.v->d_casefold) { - unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); - unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len); - return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len); - } else { - return (struct qstr) QSTR_INIT(NULL, 0); - } -} - -static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d) -{ - return d.v->d_casefold - ? 
bch2_dirent_get_casefold_name(d) - : bch2_dirent_get_name(d); -} - -static u64 bch2_dirent_hash(const struct bch_hash_info *info, - const struct qstr *name) -{ - struct bch_str_hash_ctx ctx; - - bch2_str_hash_init(&ctx, info); - bch2_str_hash_update(&ctx, info, name->name, name->len); - - /* [0,2) reserved for dots */ - return max_t(u64, bch2_str_hash_end(&ctx, info), 2); -} - -static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) -{ - return bch2_dirent_hash(info, key); -} - -static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_lookup_name(d); - - return bch2_dirent_hash(info, &name); -} - -static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) -{ - struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - const struct qstr l_name = bch2_dirent_get_lookup_name(l); - const struct qstr *r_name = _r; - - return !qstr_eq(l_name, *r_name); -} - -static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -{ - struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); - const struct qstr l_name = bch2_dirent_get_lookup_name(l); - const struct qstr r_name = bch2_dirent_get_lookup_name(r); - - return !qstr_eq(l_name, r_name); -} - -static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - if (d.v->d_type == DT_SUBVOL) - return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; - return true; -} - -const struct bch_hash_desc bch2_dirent_hash_desc = { - .btree_id = BTREE_ID_dirents, - .key_type = KEY_TYPE_dirent, - .hash_key = dirent_hash_key, - .hash_bkey = dirent_hash_bkey, - .cmp_key = dirent_cmp_key, - .cmp_bkey = dirent_cmp_bkey, - .is_visible = dirent_is_visible, -}; - -int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - unsigned name_block_len = bch2_dirent_name_bytes(d); - struct qstr d_name = bch2_dirent_get_name(d); - struct qstr d_cf_name = bch2_dirent_get_casefold_name(d); - int ret = 0; - - bkey_fsck_err_on(!d_name.len, - c, dirent_empty_name, - "empty name"); - - bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len, - c, dirent_val_too_big, - "dirent names exceed bkey size (%d + %d > %d)", - d_name.len, d_cf_name.len, name_block_len); - - /* - * Check new keys don't exceed the max length - * (older keys may be larger.) 
- */ - bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, - c, dirent_name_too_long, - "dirent name too big (%u > %u)", - d_name.len, BCH_NAME_MAX); - - bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), - c, dirent_name_embedded_nul, - "dirent has stray data after name's NUL"); - - bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || - (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), - c, dirent_name_dot_or_dotdot, - "invalid name"); - - bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), - c, dirent_name_has_slash, - "name with /"); - - bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode, - c, dirent_to_itself, - "dirent points to own directory"); - - if (d.v->d_casefold) { - bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit && - d_cf_name.len > BCH_NAME_MAX, - c, dirent_cf_name_too_big, - "dirent w/ cf name too big (%u > %u)", - d_cf_name.len, BCH_NAME_MAX); - - bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len), - c, dirent_stray_data_after_cf_name, - "dirent has stray data after cf name's NUL"); - } -fsck_err: - return ret; -} - -void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr d_name = bch2_dirent_get_name(d); - - prt_printf(out, "%.*s", d_name.len, d_name.name); - - if (d.v->d_casefold) { - struct qstr d_name = bch2_dirent_get_lookup_name(d); - prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name); - } - - prt_str(out, " ->"); - - if (d.v->d_type != DT_SUBVOL) - prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum)); - else - prt_printf(out, " %u -> %u", - le32_to_cpu(d.v->d_parent_subvol), - le32_to_cpu(d.v->d_child_subvol)); - - prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); -} - -int bch2_dirent_init_name(struct bch_fs *c, - struct bkey_i_dirent *dirent, - const struct bch_hash_info *hash_info, - const struct qstr *name, - const struct qstr *cf_name) -{ - EBUG_ON(hash_info->cf_encoding == NULL && cf_name); - int cf_len = 0; - - if (name->len > BCH_NAME_MAX) - return -ENAMETOOLONG; - - dirent->v.d_casefold = hash_info->cf_encoding != NULL; - - if (!dirent->v.d_casefold) { - memcpy(&dirent->v.d_name[0], name->name, name->len); - memset(&dirent->v.d_name[name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); - } else { - if (!bch2_fs_casefold_enabled(c)) - return -EOPNOTSUPP; - -#ifdef CONFIG_UNICODE - memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - - char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len]; - - if (cf_name) { - cf_len = cf_name->len; - - memcpy(cf_out, cf_name->name, cf_name->len); - } else { - cf_len = utf8_casefold(hash_info->cf_encoding, name, - cf_out, - bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out); - if (cf_len <= 0) - return cf_len; - } - - memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_cf_name_block.d_names) - - name->len + cf_len); - - dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); - dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len); - - EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len); -#endif - } - - unsigned u64s = dirent_val_u64s(name->len, cf_len); - BUG_ON(u64s > bkey_val_u64s(&dirent->k)); - set_bkey_val_u64s(&dirent->k, u64s); - return 0; -} - -struct bkey_i_dirent 
*bch2_dirent_create_key(struct btree_trans *trans, - const struct bch_hash_info *hash_info, - subvol_inum dir, - u8 type, - const struct qstr *name, - const struct qstr *cf_name, - u64 dst) -{ - struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); - if (IS_ERR(dirent)) - return dirent; - - bkey_dirent_init(&dirent->k_i); - dirent->k.u64s = BKEY_U64s_MAX; - - if (type != DT_SUBVOL) { - dirent->v.d_inum = cpu_to_le64(dst); - } else { - dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); - dirent->v.d_child_subvol = cpu_to_le32(dst); - } - - dirent->v.d_type = type; - dirent->v.d_unused = 0; - - int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name); - if (ret) - return ERR_PTR(ret); - - EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); - return dirent; -} - -int bch2_dirent_create_snapshot(struct btree_trans *trans, - u32 dir_subvol, u64 dir, u32 snapshot, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *dir_offset, - enum btree_iter_update_trigger_flags flags) -{ - subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; - struct bkey_i_dirent *dirent; - int ret; - - dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); - ret = PTR_ERR_OR_ZERO(dirent); - if (ret) - return ret; - - dirent->k.p.inode = dir; - dirent->k.p.snapshot = snapshot; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, snapshot, &dirent->k_i, flags); - *dir_offset = dirent->k.p.offset; - - return ret; -} - -int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *dir_offset, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i_dirent *dirent; - int ret; - - dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); - ret = PTR_ERR_OR_ZERO(dirent); - if (ret) - return ret; - - ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, flags); - *dir_offset = dirent->k.p.offset; - - return ret; -} - -int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, - struct bkey_s_c_dirent d, subvol_inum *target) -{ - struct bch_subvolume s; - int ret = 0; - - if (d.v->d_type == DT_SUBVOL && - le32_to_cpu(d.v->d_parent_subvol) != dir.subvol) - return 1; - - if (likely(d.v->d_type != DT_SUBVOL)) { - target->subvol = dir.subvol; - target->inum = le64_to_cpu(d.v->d_inum); - } else { - target->subvol = le32_to_cpu(d.v->d_child_subvol); - - ret = bch2_subvolume_get(trans, target->subvol, true, &s); - - target->inum = le64_to_cpu(s.inode); - } - - return ret; -} - -int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, - subvol_inum dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, - const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, - enum bch_rename_mode mode) -{ - struct qstr src_name_lookup, dst_name_lookup; - struct btree_iter src_iter = {}; - struct btree_iter dst_iter = {}; - struct bkey_s_c old_src, old_dst = bkey_s_c_null; - struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; - struct bpos dst_pos = - POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); - unsigned src_update_flags = 0; - bool delete_src, delete_dst; - int ret = 0; - - memset(src_inum, 0, sizeof(*src_inum)); - memset(dst_inum, 0, 
sizeof(*dst_inum)); - - /* Lookup src: */ - ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup); - if (ret) - goto out; - old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, &src_name_lookup, - BTREE_ITER_intent); - ret = bkey_err(old_src); - if (ret) - goto out; - - ret = bch2_dirent_read_target(trans, src_dir, - bkey_s_c_to_dirent(old_src), src_inum); - if (ret) - goto out; - - /* Lookup dst: */ - ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup); - if (ret) - goto out; - if (mode == BCH_RENAME) { - /* - * Note that we're _not_ checking if the target already exists - - * we're relying on the VFS to do that check for us for - * correctness: - */ - ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, &dst_name_lookup); - if (ret) - goto out; - } else { - old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, &dst_name_lookup, - BTREE_ITER_intent); - ret = bkey_err(old_dst); - if (ret) - goto out; - - ret = bch2_dirent_read_target(trans, dst_dir, - bkey_s_c_to_dirent(old_dst), dst_inum); - if (ret) - goto out; - } - - if (mode != BCH_RENAME_EXCHANGE) - *src_offset = dst_iter.pos.offset; - - /* Create new dst key: */ - new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, - dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); - ret = PTR_ERR_OR_ZERO(new_dst); - if (ret) - goto out; - - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - new_dst->k.p = dst_iter.pos; - - /* Create new src key: */ - if (mode == BCH_RENAME_EXCHANGE) { - new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name, - src_hash->cf_encoding ? &src_name_lookup : NULL, 0); - ret = PTR_ERR_OR_ZERO(new_src); - if (ret) - goto out; - - dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); - new_src->k.p = src_iter.pos; - } else { - new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - ret = PTR_ERR_OR_ZERO(new_src); - if (ret) - goto out; - - bkey_init(&new_src->k); - new_src->k.p = src_iter.pos; - - if (bkey_le(dst_pos, src_iter.pos) && - bkey_lt(src_iter.pos, dst_iter.pos)) { - /* - * We have a hash collision for the new dst key, - * and new_src - the key we're deleting - is between - * new_dst's hashed slot and the slot we're going to be - * inserting it into - oops. 
This will break the hash
-			 * table if we don't deal with it:
-			 */
-			if (mode == BCH_RENAME) {
-				/*
-				 * If we're not overwriting, we can just insert
-				 * new_dst at the src position:
-				 */
-				new_src = new_dst;
-				new_src->k.p = src_iter.pos;
-				goto out_set_src;
-			} else {
-				/* If we're overwriting, we can't insert new_dst
-				 * at a different slot because it has to
-				 * overwrite old_dst - just make sure to use a
-				 * whiteout when deleting src:
-				 */
-				new_src->k.type = KEY_TYPE_hash_whiteout;
-			}
-		} else {
-			/* Check if we need a whiteout to delete src: */
-			ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
-						       src_hash, &src_iter);
-			if (ret < 0)
-				goto out;
-
-			if (ret)
-				new_src->k.type = KEY_TYPE_hash_whiteout;
-		}
-	}
-
-	if (new_dst->v.d_type == DT_SUBVOL)
-		new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
-
-	if ((mode == BCH_RENAME_EXCHANGE) &&
-	    new_src->v.d_type == DT_SUBVOL)
-		new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
-
-	ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
-	if (ret)
-		goto out;
-out_set_src:
-	/*
-	 * If we're deleting a subvolume we need to really delete the dirent,
-	 * not just emit a whiteout in the current snapshot - there can only be
-	 * a single dirent that points to a given subvolume.
-	 *
-	 * IOW, we don't maintain multiple versions in different snapshots of
-	 * dirents that point to subvolumes - dirents that point to subvolumes
-	 * are only visible in one particular subvolume so it's not necessary,
-	 * and it would be particularly confusing for fsck to have to deal with.
-	 */
-	delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
-		new_src->k.p.snapshot != old_src.k->p.snapshot;
-
-	delete_dst = old_dst.k &&
-		bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
-		new_dst->k.p.snapshot != old_dst.k->p.snapshot;
-
-	if (!delete_src || !bkey_deleted(&new_src->k)) {
-		ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
-		if (ret)
-			goto out;
-	}
-
-	if (delete_src) {
-		bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot);
-		ret = bch2_btree_iter_traverse(trans, &src_iter) ?:
-			bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
-		if (ret)
-			goto out;
-	}
-
-	if (delete_dst) {
-		bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot);
-		ret = bch2_btree_iter_traverse(trans, &dst_iter) ?:
-			bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
-		if (ret)
-			goto out;
-	}
-
-	if (mode == BCH_RENAME_EXCHANGE)
-		*src_offset = new_src->k.p.offset;
-	*dst_offset = new_dst->k.p.offset;
-out:
-	bch2_trans_iter_exit(trans, &src_iter);
-	bch2_trans_iter_exit(trans, &dst_iter);
-	return ret;
-}
-
-int bch2_dirent_lookup_trans(struct btree_trans *trans,
-			     struct btree_iter *iter,
-			     subvol_inum dir,
-			     const struct bch_hash_info *hash_info,
-			     const struct qstr *name, subvol_inum *inum,
-			     unsigned flags)
-{
-	struct qstr lookup_name;
-	int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
-	if (ret)
-		return ret;
-
-	struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
-					     hash_info, dir, &lookup_name, flags);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
-	if (ret > 0)
-		ret = -ENOENT;
-err:
-	if (ret)
-		bch2_trans_iter_exit(trans, iter);
-	return ret;
-}
-
-u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
-		       const struct bch_hash_info *hash_info,
-		       const struct qstr *name, subvol_inum *inum)
-{
-	struct 
btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = {}; - - int ret = lockrestart_do(trans, - bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, - SPOS(dir, 0, snapshot), - POS(dir, U64_MAX), 0, k, ret) - if (k.k->type == KEY_TYPE_dirent) { - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) - continue; - ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty); - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) -{ - u32 snapshot; - - return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: - bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); -} - -static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target) -{ - struct qstr name = bch2_dirent_get_name(d); - /* - * Although not required by the kernel code, updating ctx->pos is needed - * for the bcachefs FUSE driver. Without this update, the FUSE - * implementation will be stuck in an infinite loop when reading - * directories (via the bcachefs_fuse_readdir callback). - * In kernel space, ctx->pos is updated by the VFS code. - */ - ctx->pos = d.k->p.offset; - bool ret = dir_emit(ctx, name.name, - name.len, - target.inum, - vfs_d_type(d.v->d_type)); - if (ret) - ctx->pos = d.k->p.offset + 1; - return !ret; -} - -int bch2_readdir(struct bch_fs *c, subvol_inum inum, - struct bch_hash_info *hash_info, - struct dir_context *ctx) -{ - struct bkey_buf sk; - bch2_bkey_buf_init(&sk); - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, - POS(inum.inum, ctx->pos), - POS(inum.inum, U64_MAX), - inum.subvol, 0, k, ({ - if (k.k->type != KEY_TYPE_dirent) - continue; - - /* dir_emit() can fault and block: */ - bch2_bkey_buf_reassemble(&sk, c, k); - struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); - - subvol_inum target; - - bool need_second_pass = false; - int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc, - hash_info, &iter, k, &need_second_pass) ?: - bch2_dirent_read_target(trans, inum, dirent, &target); - if (ret2 > 0) - continue; - - ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target)); - }))); - - bch2_bkey_buf_exit(&sk, c); - - return ret < 0 ? 
ret : 0; -} - -/* fsck */ - -static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inode_nr) - break; - if (!bkey_is_inode(k.k)) - continue; - ret = bch2_inode_unpack(k, inode); - goto found; - } - ret = bch_err_throw(trans->c, ENOENT_inode); -found: - bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - int ret; - - ret = lookup_first_inode(trans, pos.inode, &dir_inode); - if (ret) - goto err; - - dir_hash_info = bch2_hash_info_init(c, &dir_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); -err: - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h deleted file mode 100644 index 0417608c18d578..00000000000000 --- a/fs/bcachefs/dirent.h +++ /dev/null @@ -1,119 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DIRENT_H -#define _BCACHEFS_DIRENT_H - -#include "str_hash.h" - -extern const struct bch_hash_desc bch2_dirent_hash_desc; - -int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_dirent ((struct bkey_ops) { \ - .key_validate = bch2_dirent_validate, \ - .val_to_text = bch2_dirent_to_text, \ - .min_val_size = 16, \ -}) - -struct qstr; -struct file; -struct dir_context; -struct bch_fs; -struct bch_hash_info; -struct bch_inode_info; - -#ifdef CONFIG_UNICODE -int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, - const struct qstr *, struct qstr *); -#else -static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - return -EOPNOTSUPP; -} -#endif - -static inline int bch2_maybe_casefold(struct btree_trans *trans, - const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - if (likely(!info->cf_encoding)) { - *out_cf = *str; - return 0; - } else { - return bch2_casefold(trans, info, str, out_cf); - } -} - -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent); - -static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) -{ - unsigned bytes = cf_len - ? 
offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len - : offsetof(struct bch_dirent, d_name) + len; - - return DIV_ROUND_UP(bytes, sizeof(u64)); -} - -int bch2_dirent_read_target(struct btree_trans *, subvol_inum, - struct bkey_s_c_dirent, subvol_inum *); - -static inline void dirent_copy_target(struct bkey_i_dirent *dst, - struct bkey_s_c_dirent src) -{ - dst->v.d_inum = src.v->d_inum; - dst->v.d_type = src.v->d_type; -} - -int bch2_dirent_init_name(struct bch_fs *, - struct bkey_i_dirent *, - const struct bch_hash_info *, - const struct qstr *, - const struct qstr *); -struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *, - const struct bch_hash_info *, subvol_inum, u8, - const struct qstr *, const struct qstr *, u64); - -int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, - const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, - enum btree_iter_update_trigger_flags); -int bch2_dirent_create(struct btree_trans *, subvol_inum, - const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, - enum btree_iter_update_trigger_flags); - -static inline unsigned vfs_d_type(unsigned type) -{ - return type == DT_SUBVOL ? DT_DIR : type; -} - -enum bch_rename_mode { - BCH_RENAME, - BCH_RENAME_OVERWRITE, - BCH_RENAME_EXCHANGE, -}; - -int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, - subvol_inum, struct bch_hash_info *, - const struct qstr *, subvol_inum *, u64 *, - const struct qstr *, subvol_inum *, u64 *, - enum bch_rename_mode); - -int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, - subvol_inum, const struct bch_hash_info *, - const struct qstr *, subvol_inum *, unsigned); -u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, - const struct bch_hash_info *, - const struct qstr *, subvol_inum *); - -int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); -int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); -int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *); - -int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); - -#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h deleted file mode 100644 index a46dbddd21aad8..00000000000000 --- a/fs/bcachefs/dirent_format.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DIRENT_FORMAT_H -#define _BCACHEFS_DIRENT_FORMAT_H - -/* - * Dirents (and xattrs) have to implement string lookups; since our b-tree - * doesn't support arbitrary length strings for the key, we instead index by a - * 64 bit hash (currently truncated sha1) of the string, stored in the offset - * field of the key - using linear probing to resolve hash collisions. This also - * provides us with the readdir cookie posix requires. 
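- *
- * For example, a dirent lookup starts from the position given by the name
- * hash (an editor's illustrative sketch, not part of the original file;
- * hash_info, dir and name are assumed from context, and bch2_dirent_hash()
- * is the static helper in dirent.c above):
- *
- *	u64 hash = bch2_dirent_hash(hash_info, name);
- *	struct bpos lookup_pos = POS(dir.inum, hash);
- *
- * and probes forward from lookup_pos until the name matches or a hole is
- * reached.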
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-struct bch_dirent {
-	struct bch_val		v;
-
-	/* Target inode number: */
-	union {
-	__le64			d_inum;
-	struct { /* DT_SUBVOL */
-	__le32			d_child_subvol;
-	__le32			d_parent_subvol;
-	};
-	};
-
-	/*
-	 * Copy of mode bits 12-15 from the target inode - so userspace can get
-	 * the filetype without having to do a stat()
-	 */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u8			d_type:5,
-				d_unused:2,
-				d_casefold:1;
-#elif defined(__BIG_ENDIAN_BITFIELD)
-	__u8			d_casefold:1,
-				d_unused:2,
-				d_type:5;
-#endif
-
-	union {
-	struct {
-		__u8		d_pad;
-		__le16		d_name_len;
-		__le16		d_cf_name_len;
-		__u8		d_names[];
-	} d_cf_name_block __packed;
-	__DECLARE_FLEX_ARRAY(__u8, d_name);
-	} __packed;
-} __packed __aligned(8);
-
-#define DT_SUBVOL	16
-#define BCH_DT_MAX	17
-
-#define BCH_NAME_MAX	512
-
-#endif /* _BCACHEFS_DIRENT_FORMAT_H */
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
deleted file mode 100644
index f7528cd69c73fe..00000000000000
--- a/fs/bcachefs/disk_accounting.c
+++ /dev/null
@@ -1,1074 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "btree_cache.h"
-#include "btree_journal_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "compress.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "journal_io.h"
-#include "replicas.h"
-
-/*
- * Notes on disk accounting:
- *
- * We have two parallel sets of counters to be concerned with, and both must be
- * kept in sync.
- *
- * - Persistent/on disk accounting, stored in the accounting btree and updated
- *   via btree write buffer updates that treat new accounting keys as deltas to
- *   apply to existing values. But reading from a write buffer btree is
- *   expensive, so we also have
- *
- * - In memory accounting, where accounting is stored as an array of percpu
- *   counters, indexed by an eytzinger array of disk accounting keys/bpos (which
- *   are the same thing, excepting byte swabbing on big endian).
- *
- *   Cheap to read, but non-persistent.
- *
- * Disk accounting updates are generated by transactional triggers; these run as
- * keys enter and leave the btree, and can compare old and new versions of keys;
- * the output of these triggers is deltas to the various counters.
- *
- * Disk accounting updates are done as btree write buffer updates, where the
- * counters in the disk accounting key are deltas that will be applied to the
- * counter in the btree when the key is flushed by the write buffer (or journal
- * replay).
- *
- * To do a disk accounting update:
- * - initialize a disk_accounting_pos, to specify which counter is being updated
- * - initialize counter deltas, as an array of 1-3 s64s
- * - call bch2_disk_accounting_mod() (see the sketch below)
- *
- * This queues up the accounting update to be done at transaction commit time.
- * Underneath, it's a normal btree write buffer update.
- *
- * The transaction commit path is responsible for propagating updates to the in
- * memory counters, with bch2_accounting_mem_mod().
- *
- * The commit path also assigns every disk accounting update a unique version
- * number, based on the journal sequence number and offset within that journal
- * buffer; this is used by journal replay to determine which updates have been
- * done.
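- *
- * For example, modifying a device's cached-data counter by some number of
- * sectors looks like this (an editor's illustrative sketch, modeled on
- * bch2_mod_dev_cached_sectors() below; trans, dev, sectors and gc are
- * assumed from context):
- *
- *	struct disk_accounting_pos acc;
- *	memset(&acc, 0, sizeof(acc));
- *	acc.type = BCH_DISK_ACCOUNTING_replicas;
- *	bch2_replicas_entry_cached(&acc.replicas, dev);
- *
- *	int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);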
- *
- * The transaction commit path also ensures that replicas entry accounting
- * updates are properly marked in the superblock (so that we know whether we can
- * mount without data being unavailable); it will update the superblock if
- * bch2_accounting_mem_mod() tells it to.
- */
-
-static const char * const disk_accounting_type_strs[] = {
-#define x(t, n, ...) [n] = #t,
-	BCH_DISK_ACCOUNTING_TYPES()
-#undef x
-	NULL
-};
-
-static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos,
-					 s64 *d, unsigned nr)
-{
-	struct bkey_i_accounting *acc = bkey_accounting_init(k);
-
-	acc->k.p = pos;
-	set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
-
-	memcpy_u64s_small(acc->v.d, d, nr);
-}
-
-static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
-				       s64 *d, unsigned nr)
-{
-	return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr);
-}
-
-static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
-
-int bch2_disk_accounting_mod(struct btree_trans *trans,
-			     struct disk_accounting_pos *k,
-			     s64 *d, unsigned nr, bool gc)
-{
-	BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
-
-	/* Normalize: */
-	switch (k->type) {
-	case BCH_DISK_ACCOUNTING_replicas:
-		bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
-		break;
-	}
-
-	struct bpos pos = disk_accounting_pos_to_bpos(k);
-
-	if (likely(!gc)) {
-		struct bkey_i_accounting *a;
-#if 0
-		for (a = btree_trans_subbuf_base(trans, &trans->accounting);
-		     a != btree_trans_subbuf_top(trans, &trans->accounting);
-		     a = (void *) bkey_next(&a->k_i))
-			if (bpos_eq(a->k.p, pos)) {
-				BUG_ON(nr != bch2_accounting_counters(&a->k));
-				acc_u64s(a->v.d, d, nr);
-
-				if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) {
-					unsigned offset = (u64 *) a -
-						(u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
-
-					trans->accounting.u64s -= a->k.u64s;
-					memmove_u64s_down(a,
-							  bkey_next(&a->k_i),
-							  trans->accounting.u64s - offset);
-				}
-				return 0;
-			}
-#endif
-		unsigned u64s = sizeof(*a) / sizeof(u64) + nr;
-		a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s);
-		int ret = PTR_ERR_OR_ZERO(a);
-		if (ret)
-			return ret;
-
-		__accounting_key_init(&a->k_i, pos, d, nr);
-		return 0;
-	} else {
-		struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
-
-		__accounting_key_init(&k_i.k, pos, d, nr);
-
-		int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
-		if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
-			ret = drop_locks_do(trans,
-				bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
-				bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
-		return ret;
-	}
-}
-
-int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
-				unsigned dev, s64 sectors,
-				bool gc)
-{
-	struct disk_accounting_pos acc;
-	memset(&acc, 0, sizeof(acc));
-	acc.type = BCH_DISK_ACCOUNTING_replicas;
-	bch2_replicas_entry_cached(&acc.replicas, dev);
-
-	return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
-}
-
-static inline bool is_zero(char *start, char *end)
-{
-	BUG_ON(start > end);
-
-	for (; start < end; start++)
-		if (*start)
-			return false;
-	return true;
-}
-
-#define field_end(p, member)	(((void *) (&p.member)) + sizeof(p.member))
-
-static const unsigned bch2_accounting_type_nr_counters[] = {
-#define x(f, id, nr)	[BCH_DISK_ACCOUNTING_##f] = nr,
-	BCH_DISK_ACCOUNTING_TYPES()
-#undef x
-};
-
-int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
-			     struct bkey_validate_context from)
-{
-	
struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - void *end = &acc_k + 1; - int ret = 0; - - bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && - bversion_zero(k.k->bversion), - c, accounting_key_version_0, - "accounting key with version=0"); - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_nr_inodes: - end = field_end(acc_k, nr_inodes); - break; - case BCH_DISK_ACCOUNTING_persistent_reserved: - end = field_end(acc_k, persistent_reserved); - break; - case BCH_DISK_ACCOUNTING_replicas: - bkey_fsck_err_on(!acc_k.replicas.nr_devs, - c, accounting_key_replicas_nr_devs_0, - "accounting key replicas entry with nr_devs=0"); - - bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || - (acc_k.replicas.nr_required > 1 && - acc_k.replicas.nr_required == acc_k.replicas.nr_devs), - c, accounting_key_replicas_nr_required_bad, - "accounting key replicas entry with bad nr_required"); - - for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) - bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], - c, accounting_key_replicas_devs_unsorted, - "accounting key replicas entry with unsorted devs"); - - end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - end = field_end(acc_k, dev_data_type); - break; - case BCH_DISK_ACCOUNTING_compression: - end = field_end(acc_k, compression); - break; - case BCH_DISK_ACCOUNTING_snapshot: - end = field_end(acc_k, snapshot); - break; - case BCH_DISK_ACCOUNTING_btree: - end = field_end(acc_k, btree); - break; - case BCH_DISK_ACCOUNTING_rebalance_work: - end = field_end(acc_k, rebalance_work); - break; - } - - bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), - c, accounting_key_junk_at_end, - "junk at end of accounting key"); - - bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type], - c, accounting_key_nr_counters_wrong, - "accounting key with %u counters, should be %u", - bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]); -fsck_err: - return ret; -} - -void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) -{ - if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { - prt_printf(out, "unknown type %u", k->type); - return; - } - - prt_str(out, disk_accounting_type_strs[k->type]); - prt_str(out, " "); - - switch (k->type) { - case BCH_DISK_ACCOUNTING_nr_inodes: - break; - case BCH_DISK_ACCOUNTING_persistent_reserved: - prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas); - break; - case BCH_DISK_ACCOUNTING_replicas: - bch2_replicas_entry_to_text(out, &k->replicas); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); - bch2_prt_data_type(out, k->dev_data_type.data_type); - break; - case BCH_DISK_ACCOUNTING_compression: - bch2_prt_compression_type(out, k->compression.type); - break; - case BCH_DISK_ACCOUNTING_snapshot: - prt_printf(out, "id=%u", k->snapshot.id); - break; - case BCH_DISK_ACCOUNTING_btree: - prt_str(out, "btree="); - bch2_btree_id_to_text(out, k->btree.id); - break; - } -} - -void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k); - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - bch2_accounting_key_to_text(out, &acc_k); - - for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) - prt_printf(out, " %lli", 
acc.v->d[i]); -} - -void bch2_accounting_swab(struct bkey_s k) -{ - for (u64 *p = (u64 *) k.v; - p < (u64 *) bkey_val_end(k); - p++) - *p = swab64(*p); -} - -static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, - struct disk_accounting_pos *acc) -{ - unsafe_memcpy(r, &acc->replicas, - replicas_entry_bytes(&acc->replicas), - "variable length struct"); -} - -static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) -{ - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, p); - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_replicas: - __accounting_to_replicas(r, &acc_k); - return true; - default: - return false; - } -} - -static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) -{ - union bch_replicas_padded r; - return accounting_to_replicas(&r.e, p) - ? bch2_mark_replicas(c, &r.e) - : 0; -} - -/* - * Ensure accounting keys being updated are present in the superblock, when - * applicable (i.e. replicas updates) - */ -int bch2_accounting_update_sb(struct btree_trans *trans) -{ - for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); - i != btree_trans_subbuf_top(trans, &trans->accounting); - i = bkey_next(i)) { - int ret = bch2_accounting_update_sb_one(trans->c, i->k.p); - if (ret) - return ret; - } - - return 0; -} - -static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a) -{ - struct bch_accounting_mem *acc = &c->accounting; - - /* raced with another insert, already present: */ - if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p) < acc->k.nr) - return 0; - - struct accounting_mem_entry n = { - .pos = a.k->p, - .bversion = a.k->bversion, - .nr_counters = bch2_accounting_counters(a.k), - .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL), - }; - - if (!n.v[0]) - goto err; - - if (acc->gc_running) { - n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!n.v[1]) - goto err; - } - - if (darray_push(&acc->k, n)) - goto err; - - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); - - if (trace_accounting_mem_insert_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_accounting_to_text(&buf, c, a.s_c); - trace_accounting_mem_insert(c, buf.buf); - printbuf_exit(&buf); - } - return 0; -err: - free_percpu(n.v[1]); - free_percpu(n.v[0]); - return bch_err_throw(c, ENOMEM_disk_accounting); -} - -int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, - enum bch_accounting_mode mode) -{ - union bch_replicas_padded r; - - if (mode != BCH_ACCOUNTING_read && - accounting_to_replicas(&r.e, a.k->p) && - !bch2_replicas_marked_locked(c, &r.e)) - return bch_err_throw(c, btree_insert_need_mark_replicas); - - percpu_up_read(&c->mark_lock); - percpu_down_write(&c->mark_lock); - int ret = __bch2_accounting_mem_insert(c, a); - percpu_up_write(&c->mark_lock); - percpu_down_read(&c->mark_lock); - return ret; -} - -int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, - enum bch_accounting_mode mode) -{ - union bch_replicas_padded r; - - if (mode != BCH_ACCOUNTING_read && - accounting_to_replicas(&r.e, a.k->p) && - !bch2_replicas_marked_locked(c, &r.e)) - return bch_err_throw(c, btree_insert_need_mark_replicas); - - return __bch2_accounting_mem_insert(c, a); -} - -static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) -{ - for (unsigned i = 
0; i < e->nr_counters; i++)
-		if (percpu_u64_get(e->v[0] + i) ||
-		    (e->v[1] &&
-		     percpu_u64_get(e->v[1] + i)))
-			return false;
-	return true;
-}
-
-void bch2_accounting_mem_gc(struct bch_fs *c)
-{
-	struct bch_accounting_mem *acc = &c->accounting;
-
-	percpu_down_write(&c->mark_lock);
-	struct accounting_mem_entry *dst = acc->k.data;
-
-	darray_for_each(acc->k, src) {
-		if (accounting_mem_entry_is_zero(src)) {
-			free_percpu(src->v[0]);
-			free_percpu(src->v[1]);
-		} else {
-			*dst++ = *src;
-		}
-	}
-
-	acc->k.nr = dst - acc->k.data;
-	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-			accounting_pos_cmp, NULL);
-	percpu_up_write(&c->mark_lock);
-}
-
-/*
- * Read out accounting keys for replicas entries, as an array of
- * bch_replicas_usage entries.
- *
- * Note: this may be deprecated/removed at some point in the future and replaced
- * with something more general; it exists to support the ioctl used by the
- * 'bcachefs fs usage' command.
- */
-int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
-{
-	struct bch_accounting_mem *acc = &c->accounting;
-	int ret = 0;
-
-	darray_init(usage);
-
-	percpu_down_read(&c->mark_lock);
-	darray_for_each(acc->k, i) {
-		union {
-			u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs,
-					       BCH_BKEY_PTRS_MAX)];
-			struct bch_replicas_usage r;
-		} u;
-		u.r.r.nr_devs = BCH_BKEY_PTRS_MAX;
-
-		if (!accounting_to_replicas(&u.r.r, i->pos))
-			continue;
-
-		u64 sectors;
-		bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
-		u.r.sectors = sectors;
-
-		ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
-		if (ret)
-			break;
-
-		memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
-		usage->nr += replicas_usage_bytes(&u.r);
-	}
-	percpu_up_read(&c->mark_lock);
-
-	if (ret)
-		darray_exit(usage);
-	return ret;
-}
-
-int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
-{
-
-	struct bch_accounting_mem *acc = &c->accounting;
-	int ret = 0;
-
-	darray_init(out_buf);
-
-	percpu_down_read(&c->mark_lock);
-	darray_for_each(acc->k, i) {
-		struct disk_accounting_pos a_p;
-		bpos_to_disk_accounting_pos(&a_p, i->pos);
-
-		if (!(accounting_types_mask & BIT(a_p.type)))
-			continue;
-
-		ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
-				       sizeof(u64) * i->nr_counters);
-		if (ret)
-			break;
-
-		struct bkey_i_accounting *a_out =
-			bkey_accounting_init((void *) &darray_top(*out_buf));
-		set_bkey_val_u64s(&a_out->k, i->nr_counters);
-		a_out->k.p = i->pos;
-		bch2_accounting_mem_read_counters(acc, i - acc->k.data,
-						  a_out->v.d, i->nr_counters, false);
-
-		if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
-			out_buf->nr += bkey_bytes(&a_out->k);
-	}
-
-	percpu_up_read(&c->mark_lock);
-
-	if (ret)
-		darray_exit(out_buf);
-	return ret;
-}
-
-static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
-{
-	darray_for_each(acc->k, e) {
-		free_percpu(e->v[gc]);
-		e->v[gc] = NULL;
-	}
-}
-
-int bch2_gc_accounting_start(struct bch_fs *c)
-{
-	struct bch_accounting_mem *acc = &c->accounting;
-	int ret = 0;
-
-	percpu_down_write(&c->mark_lock);
-	darray_for_each(acc->k, e) {
-		e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64),
-					     sizeof(u64), GFP_KERNEL);
-		if (!e->v[1]) {
-			bch2_accounting_free_counters(acc, true);
-			ret = bch_err_throw(c, ENOMEM_disk_accounting);
-			break;
-		}
-	}
-
-	acc->gc_running = !ret;
-	percpu_up_write(&c->mark_lock);
-
-	return ret;
-}
-
-int bch2_gc_accounting_done(struct bch_fs *c)
-{
-	struct 
bch_accounting_mem *acc = &c->accounting; - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - struct bpos pos = POS_MIN; - int ret = 0; - - percpu_down_write(&c->mark_lock); - while (1) { - unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &pos); - - if (idx >= acc->k.nr) - break; - - struct accounting_mem_entry *e = acc->k.data + idx; - pos = bpos_successor(e->pos); - - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, e->pos); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - continue; - - u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; - u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; - - unsigned nr = e->nr_counters; - bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); - bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); - - if (memcmp(dst_v, src_v, nr * sizeof(u64))) { - printbuf_reset(&buf); - prt_str(&buf, "accounting mismatch for "); - bch2_accounting_key_to_text(&buf, &acc_k); - - prt_str(&buf, ":\n got"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", dst_v[j]); - - prt_str(&buf, "\nshould be"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", src_v[j]); - - for (unsigned j = 0; j < nr; j++) - src_v[j] -= dst_v[j]; - - bch2_trans_unlock_long(trans); - - if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) { - percpu_up_write(&c->mark_lock); - ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); - percpu_down_write(&c->mark_lock); - if (ret) - goto err; - - if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); - struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - - accounting_key_init(&k_i.k, &acc_k, src_v, nr); - bch2_accounting_mem_mod_locked(trans, - bkey_i_to_s_c_accounting(&k_i.k), - BCH_ACCOUNTING_normal, true); - - preempt_disable(); - struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); - struct bch_fs_usage_base *src = &trans->fs_usage_delta; - acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); - preempt_enable(); - } - } - } - } -err: -fsck_err: - percpu_up_write(&c->mark_lock); - printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - - if (k.k->type != KEY_TYPE_accounting) - return 0; - - percpu_down_read(&c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), - BCH_ACCOUNTING_read, false); - percpu_up_read(&c->mark_lock); - return ret; -} - -static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - struct disk_accounting_pos *acc, - u64 *v, unsigned nr) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0, invalid_dev = -1; - - switch (acc->type) { - case BCH_DISK_ACCOUNTING_replicas: { - union bch_replicas_padded r; - __accounting_to_replicas(&r.e, acc); - - for (unsigned i = 0; i < r.e.nr_devs; i++) - if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && - !bch2_dev_exists(c, r.e.devs[i])) { - invalid_dev = r.e.devs[i]; - goto invalid_device; - } - - /* - * All replicas entry checks except for invalid device are done - * in bch2_accounting_validate - */ - BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); - - if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), - trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n%s", - 
(printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, acc), - buf.buf))) { - /* - * We're not RW yet and still single threaded, dropping - * and retaking lock is ok: - */ - percpu_up_write(&c->mark_lock); - ret = bch2_mark_replicas(c, &r.e); - if (ret) - goto fsck_err; - percpu_down_write(&c->mark_lock); - } - break; - } - - case BCH_DISK_ACCOUNTING_dev_data_type: - if (!bch2_dev_exists(c, acc->dev_data_type.dev)) { - invalid_dev = acc->dev_data_type.dev; - goto invalid_device; - } - break; - } - -fsck_err: - printbuf_exit(&buf); - return ret; -invalid_device: - if (fsck_err(trans, accounting_to_invalid_device, - "accounting entry points to invalid device %i\n%s", - invalid_dev, - (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, acc), - buf.buf))) { - for (unsigned i = 0; i < nr; i++) - v[i] = -v[i]; - - ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: - -BCH_ERR_remove_disk_accounting_entry; - } else { - ret = bch_err_throw(c, remove_disk_accounting_entry); - } - goto fsck_err; -} - -/* - * At startup time, initialize the in memory accounting from the btree (and - * journal) - */ -int bch2_accounting_read(struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - - /* - * We might run more than once if we rewind to start topology repair or - * btree node scan - and those might cause us to get different results, - * so we can't just skip if we've already run. - * - * Instead, zero out any accounting we have: - */ - percpu_down_write(&c->mark_lock); - darray_for_each(acc->k, e) - percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); - for_each_member_device(c, ca) - percpu_memset(ca->usage, 0, sizeof(*ca->usage)); - percpu_memset(c->usage, 0, sizeof(*c->usage)); - percpu_up_write(&c->mark_lock); - - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); - iter.flags &= ~BTREE_ITER_with_journal; - int ret = for_each_btree_key_continue(trans, iter, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); - - if (k.k->type != KEY_TYPE_accounting) - continue; - - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; - - if (!bch2_accounting_is_mem(&acc_k)) { - struct disk_accounting_pos next; - memset(&next, 0, sizeof(next)); - next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); - continue; - } - - accounting_read_key(trans, k); - })); - if (ret) - goto err; - - struct journal_keys *keys = &c->journal_keys; - struct journal_key *dst = keys->data; - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) { - if (i->k->k.type == KEY_TYPE_accounting) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); - - if (!bch2_accounting_is_mem(&acc_k)) - continue; - - struct bkey_s_c k = bkey_i_to_s_c(i->k); - unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, - sizeof(acc->k.data[0]), - accounting_pos_cmp, &k.k->p); - - bool applied = idx < acc->k.nr && - bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0; - - if (applied) - continue; - - if (i + 1 < &darray_top(*keys) && - i[1].k->k.type == KEY_TYPE_accounting && - !journal_key_cmp(i, i + 1)) { - 
WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); - - i[1].journal_seq = i[0].journal_seq; - - bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k), - bkey_s_c_to_accounting(k)); - continue; - } - - ret = accounting_read_key(trans, k); - if (ret) - goto err; - } - - *dst++ = *i; - } - keys->gap = keys->nr = dst - keys->data; - - percpu_down_write(&c->mark_lock); - - darray_for_each_reverse(acc->k, i) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, i->pos); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - memset(v, 0, sizeof(v)); - - for (unsigned j = 0; j < i->nr_counters; j++) - v[j] = percpu_u64_get(i->v[0] + j); - - /* - * If the entry counters are zeroed, it should be treated as - * nonexistent - it might point to an invalid device. - * - * Remove it, so that if it's re-added it gets re-marked in the - * superblock: - */ - ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) - ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); - - if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(i->v[0]); - free_percpu(i->v[1]); - darray_remove_item(&acc->k, i); - ret = 0; - continue; - } - - if (ret) - goto fsck_err; - } - - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); - - preempt_disable(); - struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); - - for (unsigned i = 0; i < acc->k.nr; i++) { - struct disk_accounting_pos k; - bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); - - switch (k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - usage->reserved += v[0] * k.persistent_reserved.nr_replicas; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); - if (ca) { - struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; - percpu_u64_set(&d->buckets, v[0]); - percpu_u64_set(&d->sectors, v[1]); - percpu_u64_set(&d->fragmented, v[2]); - - if (k.dev_data_type.data_type == BCH_DATA_sb || - k.dev_data_type.data_type == BCH_DATA_journal) - usage->hidden += v[0] * ca->mi.bucket_size; - } - break; - } - } - } - preempt_enable(); -fsck_err: - percpu_up_write(&c->mark_lock); -err: - printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) -{ - return bch2_trans_run(c, - bch2_btree_write_buffer_flush_sync(trans) ?: - for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, - BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ - struct disk_accounting_pos acc; - bpos_to_disk_accounting_pos(&acc, k.k->p); - - acc.type == BCH_DISK_ACCOUNTING_dev_data_type && - acc.dev_data_type.dev == dev - ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) - : 0; - })) ?: - bch2_btree_write_buffer_flush_sync(trans)); -} - -int bch2_dev_usage_init(struct bch_dev *ca, bool gc) -{ - struct bch_fs *c = ca->fs; - u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; - - int ret = bch2_trans_do(c, ({ - bch2_disk_accounting_mod2(trans, gc, - v, dev_data_type, - .dev = ca->dev_idx, - .data_type = BCH_DATA_free) ?: - (!gc ? 
bch2_trans_commit(trans, NULL, NULL, 0) : 0); - })); - bch_err_fn(c, ret); - return ret; -} - -void bch2_verify_accounting_clean(struct bch_fs *c) -{ - bool mismatch = false; - struct bch_fs_usage_base base = {}, base_inmem = {}; - - bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_accounting, POS_MIN, - BTREE_ITER_all_snapshots, k, ({ - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); - unsigned nr = bch2_accounting_counters(k.k); - - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; - - if (!bch2_accounting_is_mem(&acc_k)) { - struct disk_accounting_pos next; - memset(&next, 0, sizeof(next)); - next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); - continue; - } - - bch2_accounting_mem_read(c, k.k->p, v, nr); - - if (memcmp(a.v->d, v, nr * sizeof(u64))) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, " !="); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", v[j]); - - pr_err("%s", buf.buf); - printbuf_exit(&buf); - mismatch = true; - } - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - { - guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ - struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (!ca) - continue; - - v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); - v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); - v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); - } - - if (memcmp(a.v->d, v, 3 * sizeof(u64))) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, " in mem"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", v[j]); - - pr_err("dev accounting mismatch: %s", buf.buf); - printbuf_exit(&buf); - mismatch = true; - } - } - - 0; - }))); - - acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64)); - -#define check(x) \ - if (base.x != base_inmem.x) { \ - pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \ - mismatch = true; \ - } - - //check(hidden); - check(btree); - check(data); - check(cached); - check(reserved); - check(nr_inodes); - - WARN_ON(mismatch); -} - -void bch2_accounting_gc_free(struct bch_fs *c) -{ - lockdep_assert_held(&c->mark_lock); - - struct bch_accounting_mem *acc = &c->accounting; - - bch2_accounting_free_counters(acc, true); - acc->gc_running = false; -} - -void bch2_fs_accounting_exit(struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - - bch2_accounting_free_counters(acc, false); - darray_exit(&acc->k); -} diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h deleted file mode 100644 index d61abebf3e0be4..00000000000000 --- a/fs/bcachefs/disk_accounting.h +++ /dev/null @@ -1,301 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_ACCOUNTING_H -#define _BCACHEFS_DISK_ACCOUNTING_H - -#include "btree_update.h" -#include "eytzinger.h" -#include "sb-members.h" - -static inline void bch2_u64s_neg(u64 *v, unsigned nr) -{ - for (unsigned 
i = 0; i < nr; i++) - v[i] = -v[i]; -} - -static inline unsigned bch2_accounting_counters(const struct bkey *k) -{ - return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64); -} - -static inline void bch2_accounting_neg(struct bkey_s_accounting a) -{ - bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k)); -} - -static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) -{ - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - if (a.v->d[i]) - return false; - return true; -} - -static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, - struct bkey_s_c_accounting src) -{ - for (unsigned i = 0; - i < min(bch2_accounting_counters(&dst->k), - bch2_accounting_counters(src.k)); - i++) - dst->v.d[i] += src.v->d[i]; - - if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) - dst->k.bversion = src.k->bversion; -} - -static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, - enum bch_data_type data_type, - s64 sectors) -{ - switch (data_type) { - case BCH_DATA_btree: - fs_usage->btree += sectors; - break; - case BCH_DATA_user: - case BCH_DATA_parity: - fs_usage->data += sectors; - break; - case BCH_DATA_cached: - fs_usage->cached += sectors; - break; - default: - break; - } -} - -static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) -{ - BUILD_BUG_ON(sizeof(*acc) != sizeof(p)); - -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - acc->_pad = p; -#else - memcpy_swab(acc, &p, sizeof(p)); -#endif -} - -static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc) -{ - struct bpos p; -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - p = acc->_pad; -#else - memcpy_swab(&p, acc, sizeof(p)); -#endif - return p; -} - -int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, - s64 *, unsigned, bool); - -#define disk_accounting_key_init(_k, _type, ...) \ -do { \ - memset(&(_k), 0, sizeof(_k)); \ - (_k).type = BCH_DISK_ACCOUNTING_##_type; \ - (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ -} while (0) - -#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ -({ \ - struct disk_accounting_pos pos; \ - disk_accounting_key_init(pos, __VA_ARGS__); \ - bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ -}) - -#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) 
\ - bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) - -int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); - -int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); -void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_accounting_swab(struct bkey_s); - -#define bch2_bkey_ops_accounting ((struct bkey_ops) { \ - .key_validate = bch2_accounting_validate, \ - .val_to_text = bch2_accounting_to_text, \ - .swab = bch2_accounting_swab, \ - .min_val_size = 8, \ -}) - -int bch2_accounting_update_sb(struct btree_trans *); - -static inline int accounting_pos_cmp(const void *_l, const void *_r) -{ - const struct bpos *l = _l, *r = _r; - - return bpos_cmp(*l, *r); -} - -enum bch_accounting_mode { - BCH_ACCOUNTING_normal, - BCH_ACCOUNTING_gc, - BCH_ACCOUNTING_read, -}; - -int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); -int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); -void bch2_accounting_mem_gc(struct bch_fs *); - -static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc) -{ - return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR && - acc->type != BCH_DISK_ACCOUNTING_inum; -} - -/* - * Update in memory counters so they match the btree update we're doing; called - * from transaction commit path - */ -static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, - struct bkey_s_c_accounting a, - enum bch_accounting_mode mode, - bool write_locked) -{ - struct bch_fs *c = trans->c; - struct bch_accounting_mem *acc = &c->accounting; - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - bool gc = mode == BCH_ACCOUNTING_gc; - - if (gc && !acc->gc_running) - return 0; - - if (!bch2_accounting_is_mem(&acc_k)) - return 0; - - if (mode == BCH_ACCOUNTING_normal) { - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (ca) { - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); - } - break; - } - } - } - - unsigned idx; - - while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { - int ret = 0; - if (unlikely(write_locked)) - ret = bch2_accounting_mem_insert_locked(c, a, mode); - else - ret = bch2_accounting_mem_insert(c, a, mode); - if (ret) - return ret; - } - - struct accounting_mem_entry *e = &acc->k.data[idx]; - - EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); - - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - this_cpu_add(e->v[gc][i], a.v->d[i]); - return 0; -} - -static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) -{ - percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, 
gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
- percpu_up_read(&trans->c->mark_lock);
- return ret;
-}
-
-static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc,
- unsigned idx, u64 *v, unsigned nr, bool gc)
-{
- memset(v, 0, sizeof(*v) * nr);
-
- if (unlikely(idx >= acc->k.nr))
- return;
-
- struct accounting_mem_entry *e = &acc->k.data[idx];
-
- nr = min_t(unsigned, nr, e->nr_counters);
-
- for (unsigned i = 0; i < nr; i++)
- v[i] = percpu_u64_get(e->v[gc] + i);
-}
-
-static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
- u64 *v, unsigned nr)
-{
- percpu_down_read(&c->mark_lock);
- struct bch_accounting_mem *acc = &c->accounting;
- unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &p);
-
- bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
- percpu_up_read(&c->mark_lock);
-}
-
-static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
-{
- EBUG_ON(!res->ref);
-
- return (struct bversion) {
- .hi = res->seq >> 32,
- .lo = (res->seq << 32) | (res->offset + offset),
- };
-}
-
-static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
- struct bkey_i_accounting *a,
- unsigned commit_flags)
-{
- u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
- a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base);
-
- EBUG_ON(bversion_zero(a->k.bversion));
-
- return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
- ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
- : 0;
-}
-
-static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans,
- struct bkey_i_accounting *a_i,
- unsigned commit_flags)
-{
- if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
- struct bkey_s_accounting a = accounting_i_to_s(a_i);
-
- bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
- bch2_accounting_neg(a);
- }
-}
-
-int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
-int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
-
-int bch2_gc_accounting_start(struct bch_fs *);
-int bch2_gc_accounting_done(struct bch_fs *);
-
-int bch2_accounting_read(struct bch_fs *);
-
-int bch2_dev_usage_remove(struct bch_fs *, unsigned);
-int bch2_dev_usage_init(struct bch_dev *, bool);
-
-void bch2_verify_accounting_clean(struct bch_fs *c);
-
-void bch2_accounting_gc_free(struct bch_fs *);
-void bch2_fs_accounting_exit(struct bch_fs *);
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_H */
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
deleted file mode 100644
index 8269af1dbe2a09..00000000000000
--- a/fs/bcachefs/disk_accounting_format.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
-#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
-
-#include "replicas_format.h"
-
-/*
- * Disk accounting - KEY_TYPE_accounting - on disk format:
- *
- * Here, the key has considerably more structure than a typical key (bpos); an
- * accounting key is 'struct disk_accounting_pos', which is a union of bpos.
- *
- * More specifically: a key is just a multiword integer (where word endianness
- * matches native byte order), so we're treating bpos as an opaque 20-byte
- * integer and mapping bch_accounting_key to that.
- *
- * This is a type-tagged union of all our various subtypes; a disk accounting
- * key can be device counters, replicas counters, et cetera - it's extensible.
- *
- * The value is a list of u64s or s64s; the number of counters is specific to a
- * given accounting type.
- *
- * Unlike with other key types, updates are _deltas_, and the deltas are not
- * resolved until the update to the underlying btree, done by btree write buffer
- * flush or journal replay.
- *
- * Journal replay in particular requires special handling. The journal tracks a
- * range of entries which may not yet have been applied to the btree - it does
- * not know definitively whether individual entries are dirty and still need to
- * be applied.
- *
- * To handle this, we use the version field of struct bkey, and give every
- * accounting update a unique version number - a total ordering in time; the
- * version number is derived from the key's position in the journal. Then
- * journal replay can compare the version number of the key from the journal
- * with the version number of the key in the btree to determine if a key needs
- * to be replayed.
- *
- * For this to work, we must maintain this strict time ordering of updates as
- * they are flushed to the btree, both via write buffer flush and via journal
- * replay. This has complications for the write buffer code while journal replay
- * is still in progress; the write buffer cannot flush any accounting keys to
- * the btree until journal replay has finished replaying its accounting keys, or
- * the (newer) version number of the keys from the write buffer will cause
- * updates from journal replay to be lost.
- */
-
-struct bch_accounting {
- struct bch_val v;
- __u64 d[];
-};
-
-#define BCH_ACCOUNTING_MAX_COUNTERS 3
-
-#define BCH_DATA_TYPES() \
- x(free, 0) \
- x(sb, 1) \
- x(journal, 2) \
- x(btree, 3) \
- x(user, 4) \
- x(cached, 5) \
- x(parity, 6) \
- x(stripe, 7) \
- x(need_gc_gens, 8) \
- x(need_discard, 9) \
- x(unstriped, 10)
-
-enum bch_data_type {
-#define x(t, n) BCH_DATA_##t,
- BCH_DATA_TYPES()
-#undef x
- BCH_DATA_NR
-};
-
-static inline bool data_type_is_empty(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_free:
- case BCH_DATA_need_gc_gens:
- case BCH_DATA_need_discard:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool data_type_is_hidden(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_sb:
- case BCH_DATA_journal:
- return true;
- default:
- return false;
- }
-}
-
-/*
- * field 1: name
- * field 2: id
- * field 3: number of counters (max 3)
- */
-
-#define BCH_DISK_ACCOUNTING_TYPES() \
- x(nr_inodes, 0, 1) \
- x(persistent_reserved, 1, 1) \
- x(replicas, 2, 1) \
- x(dev_data_type, 3, 3) \
- x(compression, 4, 3) \
- x(snapshot, 5, 1) \
- x(btree, 6, 1) \
- x(rebalance_work, 7, 1) \
- x(inum, 8, 3)
-
-enum disk_accounting_type {
-#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
- BCH_DISK_ACCOUNTING_TYPES()
-#undef x
- BCH_DISK_ACCOUNTING_TYPE_NR,
-};
-
-/*
- * No subtypes - number of inodes in the entire filesystem
- *
- * XXX: perhaps we could add a per-subvolume counter?
- */ -struct bch_acct_nr_inodes { -}; - -/* - * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the - * reservation: - */ -struct bch_acct_persistent_reserved { - __u8 nr_replicas; -}; - -/* - * device, data type counter fields: - * [ - * nr_buckets - * live sectors (in buckets of that data type) - * sectors of internal fragmentation - * ] - * - * XXX: live sectors should've been done differently, you can have multiple data - * types in the same bucket (user, stripe, cached) and this collapses them to - * the bucket data type, and makes the internal fragmentation counter redundant - */ -struct bch_acct_dev_data_type { - __u8 dev; - __u8 data_type; -}; - -/* - * Compression type fields: - * [ - * number of extents - * uncompressed size - * compressed size - * ] - * - * Compression ratio, average extent size (fragmentation). - */ -struct bch_acct_compression { - __u8 type; -}; - -/* - * On disk usage by snapshot id; counts same values as replicas counter, but - * aggregated differently - */ -struct bch_acct_snapshot { - __u32 id; -} __packed; - -struct bch_acct_btree { - __u32 id; -} __packed; - -/* - * inum counter fields: - * [ - * number of extents - * sum of extent sizes - bkey size - * this field is similar to inode.bi_sectors, except here extents in - * different snapshots but the same inode number are all collapsed to the - * same counter - * sum of on disk size - same values tracked by replicas counters - * ] - * - * This tracks on disk fragmentation. - */ -struct bch_acct_inum { - __u64 inum; -} __packed; - -/* - * Simple counter of the amount of data (on disk sectors) rebalance needs to - * move, extents counted here are also in the rebalance_work btree. - */ -struct bch_acct_rebalance_work { -}; - -struct disk_accounting_pos { - union { - struct { - __u8 type; - union { - struct bch_acct_nr_inodes nr_inodes; - struct bch_acct_persistent_reserved persistent_reserved; - struct bch_replicas_entry_v1 replicas; - struct bch_acct_dev_data_type dev_data_type; - struct bch_acct_compression compression; - struct bch_acct_snapshot snapshot; - struct bch_acct_btree btree; - struct bch_acct_rebalance_work rebalance_work; - struct bch_acct_inum inum; - } __packed; - } __packed; - struct bpos _pad; - }; -}; - -#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */ diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h deleted file mode 100644 index b1982131b20666..00000000000000 --- a/fs/bcachefs/disk_accounting_types.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H -#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H - -#include "darray.h" - -struct accounting_mem_entry { - struct bpos pos; - struct bversion bversion; - unsigned nr_counters; - u64 __percpu *v[2]; -}; - -struct bch_accounting_mem { - DARRAY(struct accounting_mem_entry) k; - bool gc_running; -}; - -#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c deleted file mode 100644 index cde842ac188632..00000000000000 --- a/fs/bcachefs/disk_groups.c +++ /dev/null @@ -1,591 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "disk_groups.h" -#include "sb-members.h" -#include "super-io.h" - -#include - -static int group_cmp(const void *_l, const void *_r) -{ - const struct bch_disk_group *l = _l; - const struct bch_disk_group *r = _r; - - return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - - (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: - 
((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - - (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: - strncmp(l->label, r->label, sizeof(l->label)); -} - -static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_disk_groups *groups = - field_to_type(f, disk_groups); - struct bch_disk_group *g, *sorted = NULL; - unsigned nr_groups = disk_groups_nr(groups); - unsigned i, len; - int ret = 0; - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member m = bch2_sb_member_get(sb, i); - unsigned group_id; - - if (!BCH_MEMBER_GROUP(&m)) - continue; - - group_id = BCH_MEMBER_GROUP(&m) - 1; - - if (group_id >= nr_groups) { - prt_printf(err, "disk %u has invalid label %u (have %u)", - i, group_id, nr_groups); - return -BCH_ERR_invalid_sb_disk_groups; - } - - if (BCH_GROUP_DELETED(&groups->entries[group_id])) { - prt_printf(err, "disk %u has deleted label %u", i, group_id); - return -BCH_ERR_invalid_sb_disk_groups; - } - } - - if (!nr_groups) - return 0; - - for (i = 0; i < nr_groups; i++) { - g = groups->entries + i; - - if (BCH_GROUP_DELETED(g)) - continue; - - len = strnlen(g->label, sizeof(g->label)); - if (!len) { - prt_printf(err, "label %u empty", i); - return -BCH_ERR_invalid_sb_disk_groups; - } - } - - sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); - if (!sorted) - return -BCH_ERR_ENOMEM_disk_groups_validate; - - memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); - sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); - - for (g = sorted; g + 1 < sorted + nr_groups; g++) - if (!BCH_GROUP_DELETED(g) && - !group_cmp(&g[0], &g[1])) { - prt_printf(err, "duplicate label %llu.%.*s", - BCH_GROUP_PARENT(g), - (int) sizeof(g->label), g->label); - ret = -BCH_ERR_invalid_sb_disk_groups; - goto err; - } -err: - kfree(sorted); - return ret; -} - -static void bch2_sb_disk_groups_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_disk_groups *groups = - field_to_type(f, disk_groups); - struct bch_disk_group *g; - unsigned nr_groups = disk_groups_nr(groups); - - for (g = groups->entries; - g < groups->entries + nr_groups; - g++) { - if (g != groups->entries) - prt_printf(out, " "); - - if (BCH_GROUP_DELETED(g)) - prt_printf(out, "[deleted]"); - else - prt_printf(out, "[parent %llu name %s]", - BCH_GROUP_PARENT(g), g->label); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { - .validate = bch2_sb_disk_groups_validate, - .to_text = bch2_sb_disk_groups_to_text -}; - -int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) -{ - struct bch_sb_field_disk_groups *groups; - struct bch_disk_groups_cpu *cpu_g, *old_g; - unsigned i, g, nr_groups; - - lockdep_assert_held(&c->sb_lock); - - groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups); - nr_groups = disk_groups_nr(groups); - - if (!groups) - return 0; - - cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); - if (!cpu_g) - return bch_err_throw(c, ENOMEM_disk_groups_to_cpu); - - cpu_g->nr = nr_groups; - - for (i = 0; i < nr_groups; i++) { - struct bch_disk_group *src = &groups->entries[i]; - struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; - - dst->deleted = BCH_GROUP_DELETED(src); - dst->parent = BCH_GROUP_PARENT(src); - memcpy(dst->label, src->label, sizeof(dst->label)); - } - - for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); - struct bch_disk_group_cpu *dst; - - if 
(!bch2_member_alive(&m)) - continue; - - g = BCH_MEMBER_GROUP(&m); - while (g) { - dst = &cpu_g->entries[g - 1]; - __set_bit(i, dst->devs.d); - g = dst->parent; - } - } - - old_g = rcu_dereference_protected(c->disk_groups, - lockdep_is_held(&c->sb_lock)); - rcu_assign_pointer(c->disk_groups, cpu_g); - if (old_g) - kfree_rcu(old_g, rcu); - - return 0; -} - -const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) -{ - struct target t = target_decode(target); - - guard(rcu)(); - - switch (t.type) { - case TARGET_NULL: - return NULL; - case TARGET_DEV: { - struct bch_dev *ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - return ca ? &ca->self : NULL; - } - case TARGET_GROUP: { - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - - return g && t.group < g->nr && !g->entries[t.group].deleted - ? &g->entries[t.group].devs - : NULL; - } - default: - BUG(); - } -} - -bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) -{ - struct target t = target_decode(target); - - switch (t.type) { - case TARGET_NULL: - return false; - case TARGET_DEV: - return dev == t.dev; - case TARGET_GROUP: { - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - const struct bch_devs_mask *m = - g && t.group < g->nr && !g->entries[t.group].deleted - ? &g->entries[t.group].devs - : NULL; - - return m ? test_bit(dev, m->d) : false; - } - default: - BUG(); - } -} - -static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, - unsigned parent, - const char *name, unsigned namelen) -{ - unsigned i, nr_groups = disk_groups_nr(groups); - - if (!namelen || namelen > BCH_SB_LABEL_SIZE) - return -EINVAL; - - for (i = 0; i < nr_groups; i++) { - struct bch_disk_group *g = groups->entries + i; - - if (BCH_GROUP_DELETED(g)) - continue; - - if (!BCH_GROUP_DELETED(g) && - BCH_GROUP_PARENT(g) == parent && - strnlen(g->label, sizeof(g->label)) == namelen && - !memcmp(name, g->label, namelen)) - return i; - } - - return -1; -} - -static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, - const char *name, unsigned namelen) -{ - struct bch_sb_field_disk_groups *groups = - bch2_sb_field_get(sb->sb, disk_groups); - unsigned i, nr_groups = disk_groups_nr(groups); - struct bch_disk_group *g; - - if (!namelen || namelen > BCH_SB_LABEL_SIZE) - return -EINVAL; - - for (i = 0; - i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); - i++) - ; - - if (i == nr_groups) { - unsigned u64s = - (sizeof(struct bch_sb_field_disk_groups) + - sizeof(struct bch_disk_group) * (nr_groups + 1)) / - sizeof(u64); - - groups = bch2_sb_field_resize(sb, disk_groups, u64s); - if (!groups) - return -BCH_ERR_ENOSPC_disk_label_add; - - nr_groups = disk_groups_nr(groups); - } - - BUG_ON(i >= nr_groups); - - g = &groups->entries[i]; - - memcpy(g->label, name, namelen); - if (namelen < sizeof(g->label)) - g->label[namelen] = '\0'; - SET_BCH_GROUP_DELETED(g, 0); - SET_BCH_GROUP_PARENT(g, parent); - SET_BCH_GROUP_DATA_ALLOWED(g, ~0); - - return i; -} - -int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) -{ - struct bch_sb_field_disk_groups *groups = - bch2_sb_field_get(sb->sb, disk_groups); - int v = -1; - - do { - const char *next = strchrnul(name, '.'); - unsigned len = next - name; - - if (*next == '.') - next++; - - v = __bch2_disk_group_find(groups, v + 1, name, len); - name = next; - } while (*name && v >= 0); - - return v; -} - -int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) -{ - 
struct bch_sb_field_disk_groups *groups; - unsigned parent = 0; - int v = -1; - - do { - const char *next = strchrnul(name, '.'); - unsigned len = next - name; - - if (*next == '.') - next++; - - groups = bch2_sb_field_get(sb->sb, disk_groups); - - v = __bch2_disk_group_find(groups, parent, name, len); - if (v < 0) - v = __bch2_disk_group_add(sb, parent, name, len); - if (v < 0) - return v; - - parent = v + 1; - name = next; - } while (*name && v >= 0); - - return v; -} - -static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g, - unsigned v) -{ - u16 path[32]; - unsigned nr = 0; - - while (1) { - if (nr == ARRAY_SIZE(path)) - goto invalid; - - if (v >= (g ? g->nr : 0)) - goto invalid; - - struct bch_disk_group_cpu *e = g->entries + v; - - if (e->deleted) - goto invalid; - - path[nr++] = v; - - if (!e->parent) - break; - - v = e->parent - 1; - } - - while (nr) { - struct bch_disk_group_cpu *e = g->entries + path[--nr]; - - prt_printf(out, "%.*s", (int) sizeof(e->label), e->label); - if (nr) - prt_printf(out, "."); - } - return; -invalid: - prt_printf(out, "invalid label %u", v); -} - -void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) -{ - bch2_printbuf_make_room(out, 4096); - - out->atomic++; - guard(rcu)(); - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - - for (unsigned i = 0; i < (g ? g->nr : 0); i++) { - prt_printf(out, "%2u: ", i); - - if (g->entries[i].deleted) { - prt_printf(out, "[deleted]"); - goto next; - } - - __bch2_disk_path_to_text(out, g, i); - - prt_printf(out, " devs"); - - for_each_member_device_rcu(c, ca, &g->entries[i].devs) - prt_printf(out, " %s", ca->name); -next: - prt_newline(out); - } - - out->atomic--; -} - -void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) -{ - out->atomic++; - guard(rcu)(); - __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v), - --out->atomic; -} - -void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) -{ - struct bch_sb_field_disk_groups *groups = - bch2_sb_field_get(sb, disk_groups); - struct bch_disk_group *g; - unsigned nr = 0; - u16 path[32]; - - while (1) { - if (nr == ARRAY_SIZE(path)) - goto inval; - - if (v >= disk_groups_nr(groups)) - goto inval; - - g = groups->entries + v; - - if (BCH_GROUP_DELETED(g)) - goto inval; - - path[nr++] = v; - - if (!BCH_GROUP_PARENT(g)) - break; - - v = BCH_GROUP_PARENT(g) - 1; - } - - while (nr) { - v = path[--nr]; - g = groups->entries + v; - - prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); - if (nr) - prt_printf(out, "."); - } - return; -inval: - prt_printf(out, "invalid label %u", v); -} - -int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -{ - lockdep_assert_held(&c->sb_lock); - - - if (!strlen(name) || !strcmp(name, "none")) { - struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_GROUP(mi, 0); - } else { - int v = bch2_disk_path_find_or_create(&c->disk_sb, name); - if (v < 0) - return v; - - struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_GROUP(mi, v + 1); - } - - return bch2_sb_disk_groups_to_cpu(c); -} - -int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -{ - int ret; - - mutex_lock(&c->sb_lock); - ret = __bch2_dev_group_set(c, ca, name) ?: - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, - struct 
printbuf *err) -{ - struct bch_dev *ca; - int g; - - if (!val) - return -EINVAL; - - if (!c) - return -BCH_ERR_option_needs_open_fs; - - if (!strlen(val) || !strcmp(val, "none")) { - *res = 0; - return 0; - } - - /* Is it a device? */ - ca = bch2_dev_lookup(c, val); - if (!IS_ERR(ca)) { - *res = dev_to_target(ca->dev_idx); - bch2_dev_put(ca); - return 0; - } - - mutex_lock(&c->sb_lock); - g = bch2_disk_path_find(&c->disk_sb, val); - mutex_unlock(&c->sb_lock); - - if (g >= 0) { - *res = group_to_target(g); - return 0; - } - - return -EINVAL; -} - -void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) -{ - struct target t = target_decode(v); - - switch (t.type) { - case TARGET_NULL: - prt_printf(out, "none"); - return; - case TARGET_DEV: { - out->atomic++; - guard(rcu)(); - struct bch_dev *ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - - if (ca && ca->disk_sb.bdev) - prt_printf(out, "/dev/%s", ca->name); - else if (ca) - prt_printf(out, "offline device %u", t.dev); - else - prt_printf(out, "invalid device %u", t.dev); - - out->atomic--; - return; - } - case TARGET_GROUP: - bch2_disk_path_to_text(out, c, t.group); - return; - default: - BUG(); - } -} - -static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) -{ - struct target t = target_decode(v); - - switch (t.type) { - case TARGET_NULL: - prt_printf(out, "none"); - break; - case TARGET_DEV: { - struct bch_member m = bch2_sb_member_get(sb, t.dev); - - if (bch2_member_exists(sb, t.dev)) { - prt_printf(out, "Device "); - pr_uuid(out, m.uuid.b); - prt_printf(out, " (%u)", t.dev); - } else { - prt_printf(out, "Bad device %u", t.dev); - } - break; - } - case TARGET_GROUP: - bch2_disk_path_to_text_sb(out, sb, t.group); - break; - default: - BUG(); - } -} - -void bch2_opt_target_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) -{ - if (c) - bch2_target_to_text(out, c, v); - else - bch2_target_to_text_sb(out, sb, v); -} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h deleted file mode 100644 index 441826fff22436..00000000000000 --- a/fs/bcachefs/disk_groups.h +++ /dev/null @@ -1,111 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_GROUPS_H -#define _BCACHEFS_DISK_GROUPS_H - -#include "disk_groups_types.h" - -extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; - -static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) -{ - return groups - ? 
(vstruct_end(&groups->field) - - (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) - : 0; -} - -struct target { - enum { - TARGET_NULL, - TARGET_DEV, - TARGET_GROUP, - } type; - union { - unsigned dev; - unsigned group; - }; -}; - -#define TARGET_DEV_START 1 -#define TARGET_GROUP_START (256 + TARGET_DEV_START) - -static inline u16 dev_to_target(unsigned dev) -{ - return TARGET_DEV_START + dev; -} - -static inline u16 group_to_target(unsigned group) -{ - return TARGET_GROUP_START + group; -} - -static inline struct target target_decode(unsigned target) -{ - if (target >= TARGET_GROUP_START) - return (struct target) { - .type = TARGET_GROUP, - .group = target - TARGET_GROUP_START - }; - - if (target >= TARGET_DEV_START) - return (struct target) { - .type = TARGET_DEV, - .group = target - TARGET_DEV_START - }; - - return (struct target) { .type = TARGET_NULL }; -} - -const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); - -static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, - enum bch_data_type data_type, - u16 target) -{ - struct bch_devs_mask devs = c->rw_devs[data_type]; - const struct bch_devs_mask *t = bch2_target_to_mask(c, target); - - if (t) - bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); - return devs; -} - -static inline bool bch2_target_accepts_data(struct bch_fs *c, - enum bch_data_type data_type, - u16 target) -{ - struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target); - return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX); -} - -bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); - -int bch2_disk_path_find(struct bch_sb_handle *, const char *); - -/* Exported for userspace bcachefs-tools: */ -int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); - -void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned); -void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned); - -void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned); - -int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); -void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); - -#define bch2_opt_target (struct bch_opt_fn) { \ - .parse = bch2_opt_target_parse, \ - .to_text = bch2_opt_target_to_text, \ -} - -int bch2_sb_disk_groups_to_cpu(struct bch_fs *); - -int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); -int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); - -const char *bch2_sb_validate_disk_groups(struct bch_sb *, - struct bch_sb_field *); - -void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *); - -#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h deleted file mode 100644 index 698990bbf1d206..00000000000000 --- a/fs/bcachefs/disk_groups_format.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H -#define _BCACHEFS_DISK_GROUPS_FORMAT_H - -#define BCH_SB_LABEL_SIZE 32 - -struct bch_disk_group { - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 flags[2]; -} __packed __aligned(8); - -LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) - -struct bch_sb_field_disk_groups { - struct bch_sb_field field; - struct bch_disk_group entries[]; -} __packed __aligned(8); - 
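/*
 * Illustrative sketch, not part of the original header: an open-coded
 * equivalent of the accessors that the LE64_BITMASK() lines above generate
 * for struct bch_disk_group. Layout of the first little-endian flags word:
 * bit 0 = DELETED, bits 1-5 = DATA_ALLOWED, bits 6-23 = PARENT (parent
 * group index + 1, with 0 meaning "no parent"). The helper name below is
 * made up for illustration only.
 */
static inline __u64 disk_group_field_get(const struct bch_disk_group *g,
					 unsigned offset, unsigned end)
{
	/* extract bits [offset, end) of flags[0], in CPU byte order */
	return (__le64_to_cpu(g->flags[0]) >> offset) &
		~(~0ULL << (end - offset));
}

/* e.g. disk_group_field_get(g, 6, 24) yields the same value as BCH_GROUP_PARENT(g) */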
-#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */ diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h deleted file mode 100644 index a54ef085b13d46..00000000000000 --- a/fs/bcachefs/disk_groups_types.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H -#define _BCACHEFS_DISK_GROUPS_TYPES_H - -struct bch_disk_group_cpu { - bool deleted; - u16 parent; - u8 label[BCH_SB_LABEL_SIZE]; - struct bch_devs_mask devs; -}; - -struct bch_disk_groups_cpu { - struct rcu_head rcu; - unsigned nr; - struct bch_disk_group_cpu entries[] __counted_by(nr); -}; - -#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c deleted file mode 100644 index 543dbba9b14f39..00000000000000 --- a/fs/bcachefs/ec.c +++ /dev/null @@ -1,2405 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* erasure coding */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "checksum.h" -#include "disk_accounting.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "io_read.h" -#include "io_write.h" -#include "keylist.h" -#include "lru.h" -#include "recovery.h" -#include "replicas.h" -#include "super-io.h" -#include "util.h" - -#include -#include - -#ifdef __KERNEL__ - -#include -#include - -static void raid5_recov(unsigned disks, unsigned failed_idx, - size_t size, void **data) -{ - unsigned i = 2, nr; - - BUG_ON(failed_idx >= disks); - - swap(data[0], data[failed_idx]); - memcpy(data[0], data[1], size); - - while (i < disks) { - nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); - xor_blocks(nr, size, data[0], data + i); - i += nr; - } - - swap(data[0], data[failed_idx]); -} - -static void raid_gen(int nd, int np, size_t size, void **v) -{ - if (np >= 1) - raid5_recov(nd + np, nd, size, v); - if (np >= 2) - raid6_call.gen_syndrome(nd + np, size, v); - BUG_ON(np > 2); -} - -static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) -{ - switch (nr) { - case 0: - break; - case 1: - if (ir[0] < nd + 1) - raid5_recov(nd + 1, ir[0], size, v); - else - raid6_call.gen_syndrome(nd + np, size, v); - break; - case 2: - if (ir[1] < nd) { - /* data+data failure. 
*/ - raid6_2data_recov(nd + np, size, ir[0], ir[1], v); - } else if (ir[0] < nd) { - /* data + p/q failure */ - - if (ir[1] == nd) /* data + p failure */ - raid6_datap_recov(nd + np, size, ir[0], v); - else { /* data + q failure */ - raid5_recov(nd + 1, ir[0], size, v); - raid6_call.gen_syndrome(nd + np, size, v); - } - } else { - raid_gen(nd, np, size, v); - } - break; - default: - BUG(); - } -} - -#else - -#include - -#endif - -struct ec_bio { - struct bch_dev *ca; - struct ec_stripe_buf *buf; - size_t idx; - int rw; - u64 submit_time; - struct bio bio; -}; - -/* Stripes btree keys: */ - -int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - int ret = 0; - - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || - bpos_gt(k.k->p, POS(0, U32_MAX)), - c, stripe_pos_bad, - "stripe at bad pos"); - - bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), - c, stripe_val_size_bad, - "incorrect value size (%zu < %u)", - bkey_val_u64s(k.k), stripe_val_u64s(s)); - - bkey_fsck_err_on(s->csum_granularity_bits >= 64, - c, stripe_csum_granularity_bad, - "invalid csum granularity (%u >= 64)", - s->csum_granularity_bits); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; - struct bch_stripe s = {}; - - memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); - - unsigned nr_data = s.nr_blocks - s.nr_redundant; - - prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", - s.algorithm, - le16_to_cpu(s.sectors), - nr_data, - s.nr_redundant); - bch2_prt_csum_type(out, s.csum_type); - prt_str(out, " gran "); - if (s.csum_granularity_bits < 64) - prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits); - else - prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits); - - if (s.disk_label) { - prt_str(out, " label"); - bch2_disk_path_to_text(out, c, s.disk_label - 1); - } - - for (unsigned i = 0; i < s.nr_blocks; i++) { - const struct bch_extent_ptr *ptr = sp->ptrs + i; - - if ((void *) ptr >= bkey_val_end(k)) - break; - - prt_char(out, ' '); - bch2_extent_ptr_to_text(out, c, ptr); - - if (s.csum_type < BCH_CSUM_NR && - i < nr_data && - stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) - prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); - } -} - -/* Triggers: */ - -static int __mark_stripe_bucket(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c_stripe s, - unsigned ptr_idx, bool deleting, - struct bpos bucket, - struct bch_alloc_v4 *a, - enum btree_iter_update_trigger_flags flags) -{ - const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; - unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant; - bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; - s64 sectors = parity ? 
le16_to_cpu(s.v->sectors) : 0; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_fs *c = trans->c; - if (deleting) - sectors = -sectors; - - if (!deleting) { - if (bch2_trans_inconsistent_on(a->stripe || - a->stripe_redundancy, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s", - bucket.inode, bucket.offset, a->gen, - bch2_data_type_str(a->data_type), - a->dirty_sectors, - a->stripe, s.k->p.offset, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", - bucket.inode, bucket.offset, a->gen, - bch2_data_type_str(a->data_type), - a->dirty_sectors, - a->cached_sectors, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - } else { - if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || - a->stripe_redundancy != s.v->nr_redundant, trans, - "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", - bucket.inode, bucket.offset, a->gen, - a->stripe, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, - "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", - bucket.inode, bucket.offset, a->gen, - bch2_data_type_str(a->data_type), - bch2_data_type_str(data_type), - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - if (bch2_trans_inconsistent_on(parity && - (a->dirty_sectors != -sectors || - a->cached_sectors), trans, - "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s", - bucket.inode, bucket.offset, a->gen, - a->dirty_sectors, - a->cached_sectors, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - } - - if (sectors) { - ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, - a->gen, a->data_type, &a->dirty_sectors); - if (ret) - goto err; - } - - if (!deleting) { - a->stripe = s.k->p.offset; - a->stripe_redundancy = s.v->nr_redundant; - alloc_data_type_set(a, data_type); - } else { - a->stripe = 0; - a->stripe_redundancy = 0; - alloc_data_type_set(a, BCH_DATA_user); - } -err: - printbuf_exit(&buf); - return ret; -} - -static int mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned ptr_idx, bool deleting, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); - if (unlikely(!ca)) { - if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - if (flags & BTREE_TRIGGER_transactional) { - struct extent_ptr_decoded p = { - .ptr = *ptr, - .crc = bch2_extent_crc_unpack(s.k, NULL), - }; - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p, - (const union bch_extent_entry *) ptr, &bp); - - struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update(trans, 
bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: - bch2_bucket_backpointer_mod(trans, s.s_c, &bp, - !(flags & BTREE_TRIGGER_overwrite)); - if (ret) - goto err; - } - - if (flags & BTREE_TRIGGER_gc) { - struct bucket *g = gc_bucket(ca, bucket.offset); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", - ptr->dev, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); - alloc_to_bucket(g, new); - bucket_unlock(g); - - if (!ret) - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - } -err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -static int mark_stripe_buckets(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(new).v : NULL; - - BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks); - - unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - - for (unsigned i = 0; i < nr_blocks; i++) { - if (new_s && old_s && - !memcmp(&new_s->ptrs[i], - &old_s->ptrs[i], - sizeof(new_s->ptrs[i]))) - continue; - - if (new_s) { - int ret = mark_stripe_bucket(trans, - bkey_s_c_to_stripe(new), i, false, flags); - if (ret) - return ret; - } - - if (old_s) { - int ret = mark_stripe_bucket(trans, - bkey_s_c_to_stripe(old), i, true, flags); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_trigger_stripe(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s _new, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_s_c new = _new.s_c; - struct bch_fs *c = trans->c; - u64 idx = new.k->p.offset; - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? 
bkey_s_c_to_stripe(new).v : NULL; - - if (unlikely(flags & BTREE_TRIGGER_check_repair)) - return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags); - - BUG_ON(new_s && old_s && - (new_s->nr_blocks != old_s->nr_blocks || - new_s->nr_redundant != old_s->nr_redundant)); - - if (flags & BTREE_TRIGGER_transactional) { - int ret = bch2_lru_change(trans, - BCH_LRU_STRIPE_FRAGMENTATION, - idx, - stripe_lru_pos(old_s), - stripe_lru_pos(new_s)); - if (ret) - return ret; - } - - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - /* - * If the pointers aren't changing, we don't need to do anything: - */ - if (new_s && old_s && - new_s->nr_blocks == old_s->nr_blocks && - new_s->nr_redundant == old_s->nr_redundant && - !memcmp(old_s->ptrs, new_s->ptrs, - new_s->nr_blocks * sizeof(struct bch_extent_ptr))) - return 0; - - struct gc_stripe *gc = NULL; - if (flags & BTREE_TRIGGER_gc) { - gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); - if (!gc) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); - return bch_err_throw(c, ENOMEM_mark_stripe); - } - - /* - * This will be wrong when we bring back runtime gc: we should - * be unmarking the old key and then marking the new key - * - * Also: when we bring back runtime gc, locking - */ - gc->alive = true; - gc->sectors = le16_to_cpu(new_s->sectors); - gc->nr_blocks = new_s->nr_blocks; - gc->nr_redundant = new_s->nr_redundant; - - for (unsigned i = 0; i < new_s->nr_blocks; i++) - gc->ptrs[i] = new_s->ptrs[i]; - - /* - * gc recalculates this field from stripe ptr - * references: - */ - memset(gc->block_sectors, 0, sizeof(gc->block_sectors)); - } - - if (new_s) { - s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; - - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, new); - int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); - if (ret) - return ret; - - if (gc) - unsafe_memcpy(&gc->r.e, &acc.replicas, - replicas_entry_bytes(&acc.replicas), "VLA"); - } - - if (old_s) { - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant; - - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, old); - int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); - if (ret) - return ret; - } - - int ret = mark_stripe_buckets(trans, old, new, flags); - if (ret) - return ret; - } - - return 0; -} - -/* returns blocknr in stripe that we matched: */ -static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, - struct bkey_s_c k, unsigned *block) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i, nr_data = s->nr_blocks - s->nr_redundant; - - bkey_for_each_ptr(ptrs, ptr) - for (i = 0; i < nr_data; i++) - if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, - le16_to_cpu(s->sectors))) { - *block = i; - return ptr; - } - - return NULL; -} - -static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) - if (extent_entry_type(entry) == - BCH_EXTENT_ENTRY_stripe_ptr && - entry->stripe_ptr.idx == idx) - return true; - - return false; -} - -/* Stripe bufs: */ - -static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) -{ - if (buf->key.k.type == KEY_TYPE_stripe) { - struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); - unsigned i; - - for 
(i = 0; i < s->v.nr_blocks; i++) { - kvfree(buf->data[i]); - buf->data[i] = NULL; - } - } -} - -/* XXX: this is a non-mempoolified memory allocation: */ -static int ec_stripe_buf_init(struct bch_fs *c, - struct ec_stripe_buf *buf, - unsigned offset, unsigned size) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned csum_granularity = 1U << v->csum_granularity_bits; - unsigned end = offset + size; - unsigned i; - - BUG_ON(end > le16_to_cpu(v->sectors)); - - offset = round_down(offset, csum_granularity); - end = min_t(unsigned, le16_to_cpu(v->sectors), - round_up(end, csum_granularity)); - - buf->offset = offset; - buf->size = end - offset; - - memset(buf->valid, 0xFF, sizeof(buf->valid)); - - for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); - if (!buf->data[i]) - goto err; - } - - return 0; -err: - ec_stripe_buf_exit(buf); - return bch_err_throw(c, ENOMEM_stripe_buf); -} - -/* Checksumming: */ - -static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, - unsigned block, unsigned offset) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned end = buf->offset + buf->size; - unsigned len = min(csum_granularity, end - offset); - - BUG_ON(offset >= end); - BUG_ON(offset < buf->offset); - BUG_ON(offset & (csum_granularity - 1)); - BUG_ON(offset + len != le16_to_cpu(v->sectors) && - (len & (csum_granularity - 1))); - - return bch2_checksum(NULL, v->csum_type, - null_nonce(), - buf->data[block] + ((offset - buf->offset) << 9), - len << 9); -} - -static void ec_generate_checksums(struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned i, j, csums_per_device = stripe_csums_per_device(v); - - if (!v->csum_type) - return; - - BUG_ON(buf->offset); - BUG_ON(buf->size != le16_to_cpu(v->sectors)); - - for (i = 0; i < v->nr_blocks; i++) - for (j = 0; j < csums_per_device; j++) - stripe_csum_set(v, i, j, - ec_block_checksum(buf, i, j << v->csum_granularity_bits)); -} - -static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned i; - - if (!v->csum_type) - return; - - for (i = 0; i < v->nr_blocks; i++) { - unsigned offset = buf->offset; - unsigned end = buf->offset + buf->size; - - if (!test_bit(i, buf->valid)) - continue; - - while (offset < end) { - unsigned j = offset >> v->csum_granularity_bits; - unsigned len = min(csum_granularity, end - offset); - struct bch_csum want = stripe_csum_get(v, i, j); - struct bch_csum got = ec_block_checksum(buf, i, offset); - - if (bch2_crc_cmp(want, got)) { - struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); - if (ca) { - struct printbuf err = PRINTBUF; - - prt_str(&err, "stripe "); - bch2_csum_err_msg(&err, v->csum_type, want, got); - prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); - bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); - bch_err_ratelimited(ca, "%s", err.buf); - printbuf_exit(&err); - - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - } - - clear_bit(i, buf->valid); - break; - } - - offset += len; - } - } -} - -/* Erasure coding: */ - -static void ec_generate_ec(struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned nr_data = v->nr_blocks - v->nr_redundant; - unsigned bytes = le16_to_cpu(v->sectors) << 9; - - raid_gen(nr_data, 
v->nr_redundant, bytes, buf->data);
-}
-
-static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-
- return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
-}
-
-static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
- unsigned bytes = buf->size << 9;
-
- if (ec_nr_failed(buf) > v->nr_redundant) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: unable to read enough blocks");
- return -1;
- }
-
- for (i = 0; i < nr_data; i++)
- if (!test_bit(i, buf->valid))
- failed[nr_failed++] = i;
-
- raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
- return 0;
-}
-
-/* IO: */
-
-static void ec_block_endio(struct bio *bio)
-{
- struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
- struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
- struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
- struct bch_dev *ca = ec_bio->ca;
- struct closure *cl = bio->bi_private;
- int rw = ec_bio->rw;
- unsigned ref = rw == READ
- ? BCH_DEV_READ_REF_ec_block
- : BCH_DEV_WRITE_REF_ec_block;
-
- bch2_account_io_completion(ca, bio_data_dir(bio),
- ec_bio->submit_time, !bio->bi_status);
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
- str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status));
- clear_bit(ec_bio->idx, ec_bio->buf->valid);
- }
-
- int stale = dev_ptr_stale(ca, ptr);
- if (stale) {
- bch_err_ratelimited(ca->fs,
- "error %s stripe: stale/invalid pointer (%i) after io",
- bio_data_dir(bio) == READ ? "reading from" : "writing to",
- stale);
- clear_bit(ec_bio->idx, ec_bio->buf->valid);
- }
-
- bio_put(&ec_bio->bio);
- enumerated_ref_put(&ca->io_ref[rw], ref);
- closure_put(cl);
-}
-
-static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
- blk_opf_t opf, unsigned idx, struct closure *cl)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned offset = 0, bytes = buf->size << 9;
- struct bch_extent_ptr *ptr = &v->ptrs[idx];
- enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
- ? BCH_DATA_user
- : BCH_DATA_parity;
- int rw = op_is_write(opf);
- unsigned ref = rw == READ
- ? BCH_DEV_READ_REF_ec_block
- : BCH_DEV_WRITE_REF_ec_block;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref);
- if (!ca) {
- clear_bit(idx, buf->valid);
- return;
- }
-
- int stale = dev_ptr_stale(ca, ptr);
- if (stale) {
- bch_err_ratelimited(c,
- "error %s stripe: stale pointer (%i)",
- rw == READ ? "reading from" : "writing to",
- stale);
- clear_bit(idx, buf->valid);
- return;
- }
-
-
- this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
-
- while (offset < bytes) {
- unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
- DIV_ROUND_UP(bytes, PAGE_SIZE));
- unsigned b = min_t(size_t, bytes - offset,
- nr_iovecs << PAGE_SHIFT);
- struct ec_bio *ec_bio;
-
- ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
- nr_iovecs,
- opf,
- GFP_KERNEL,
- &c->ec_bioset),
- struct ec_bio, bio);
-
- ec_bio->ca = ca;
- ec_bio->buf = buf;
- ec_bio->idx = idx;
- ec_bio->rw = rw;
- ec_bio->submit_time = local_clock();
-
- ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
- ec_bio->bio.bi_end_io = ec_block_endio;
- ec_bio->bio.bi_private = cl;
-
- bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
-
- closure_get(cl);
- enumerated_ref_get(&ca->io_ref[rw], ref);
-
- submit_bio(&ec_bio->bio);
-
- offset += b;
- }
-
- enumerated_ref_put(&ca->io_ref[rw], ref);
-}
-
-static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
- struct ec_stripe_buf *stripe)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- POS(0, idx), BTREE_ITER_slots);
- ret = bkey_err(k);
- if (ret)
- goto err;
- if (k.k->type != KEY_TYPE_stripe) {
- ret = -ENOENT;
- goto err;
- }
- bkey_reassemble(&stripe->key, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* recovery read path: */
-int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
- struct bkey_s_c orig_k)
-{
- struct bch_fs *c = trans->c;
- struct ec_stripe_buf *buf = NULL;
- struct closure cl;
- struct bch_stripe *v;
- unsigned i, offset;
- const char *msg = NULL;
- struct printbuf msgbuf = PRINTBUF;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- BUG_ON(!rbio->pick.has_ec);
-
- buf = kzalloc(sizeof(*buf), GFP_NOFS);
- if (!buf)
- return bch_err_throw(c, ENOMEM_ec_read_extent);
-
- ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
- if (ret) {
- msg = "stripe not found";
- goto err;
- }
-
- v = &bkey_i_to_stripe(&buf->key)->v;
-
- if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
- msg = "pointer doesn't match stripe";
- goto err;
- }
-
- offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
- if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
- msg = "read is bigger than stripe";
- goto err;
- }
-
- ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio));
- if (ret) {
- msg = "-ENOMEM";
- goto err;
- }
-
- for (i = 0; i < v->nr_blocks; i++)
- ec_block_io(c, buf, REQ_OP_READ, i, &cl);
-
- closure_sync(&cl);
-
- if (ec_nr_failed(buf) > v->nr_redundant) {
- msg = "unable to read enough blocks";
- goto err;
- }
-
- ec_validate_checksums(c, buf);
-
- ret = ec_do_recov(c, buf);
- if (ret)
- goto err;
-
- memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
- buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
-out:
- ec_stripe_buf_exit(buf);
- kfree(buf);
- return ret;
-err:
- bch2_bkey_val_to_text(&msgbuf, c, orig_k);
- bch_err_ratelimited(c,
- "error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
- printbuf_exit(&msgbuf);
- ret = bch_err_throw(c, stripe_reconstruct);
- goto out;
-}
-
-/* stripe bucket accounting: */
-
-static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
-{
- if (c->gc_pos.phase != GC_PHASE_not_running &&
- !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
- return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc);
-
- return 0;
-}
-
-static int ec_stripe_mem_alloc(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- return allocate_dropping_locks_errcode(trans,
- __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
-}
-
-/*
- * Hash table of open stripes:
- * Stripes that are being created or modified are kept in a hash table, so that
- * stripe deletion can skip them.
- */
-
-static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
- unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
- struct ec_stripe_new *s;
-
- hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
- if (s->idx == idx)
- return true;
- return false;
-}
-
-static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
- bool ret = false;
-
- spin_lock(&c->ec_stripes_new_lock);
- ret = __bch2_stripe_is_open(c, idx);
- spin_unlock(&c->ec_stripes_new_lock);
-
- return ret;
-}
-
-static bool bch2_try_open_stripe(struct bch_fs *c,
- struct ec_stripe_new *s,
- u64 idx)
-{
- bool ret;
-
- spin_lock(&c->ec_stripes_new_lock);
- ret = !__bch2_stripe_is_open(c, idx);
- if (ret) {
- unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
-
- s->idx = idx;
- hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
- }
- spin_unlock(&c->ec_stripes_new_lock);
-
- return ret;
-}
-
-static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
-{
- BUG_ON(!s->idx);
-
- spin_lock(&c->ec_stripes_new_lock);
- hlist_del_init(&s->hash);
- spin_unlock(&c->ec_stripes_new_lock);
-
- s->idx = 0;
-}
-
-/* stripe deletion */
-
-static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- /*
- * We expect write buffer races here
- * Important: check stripe_is_open with stripe key locked:
- */
- if (k.k->type == KEY_TYPE_stripe &&
- !bch2_stripe_is_open(trans->c, idx) &&
- stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * XXX
- * can we kill this and delete stripes from the trigger?
- */
-static void ec_stripe_delete_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, ec_stripe_delete_work);
-
- bch2_trans_run(c,
- bch2_btree_write_buffer_tryflush(trans) ?:
- for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
- 0, lru_k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
- ec_stripe_delete(trans, lru_k.k->p.offset);
- })));
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
-}
-
-void bch2_do_stripe_deletes(struct bch_fs *c)
-{
- if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) &&
- !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
-}
-
-/* stripe creation: */
-
-static int ec_stripe_key_update(struct btree_trans *trans,
- struct bkey_i_stripe *old,
- struct bkey_i_stripe *new)
-{
- struct bch_fs *c = trans->c;
- bool create = !old;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
- c, "error %s stripe: got existing key type %s",
- create ? "creating" : "updating",
- bch2_bkey_types[k.k->type])) {
- ret = -EINVAL;
- goto err;
- }
-
- if (k.k->type == KEY_TYPE_stripe) {
- const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
-
- BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
- BUG_ON(old->v.nr_blocks != v->nr_blocks);
-
- for (unsigned i = 0; i < new->v.nr_blocks; i++) {
- unsigned sectors = stripe_blockcount_get(v, i);
-
- if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "stripe changed nonempty block %u", i);
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -EINVAL;
- goto err;
- }
-
- /*
- * If the stripe ptr changed underneath us, it must have
- * been dev_remove_stripes() -> * invalidate_stripe_to_dev()
- */
- if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
- BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
-
- if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
- new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
- }
-
- stripe_blockcount_set(&new->v, i, sectors);
- }
- }
-
- ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int ec_stripe_update_extent(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos bucket, u8 gen,
- struct ec_stripe_buf *s,
- struct bkey_s_c_backpointer bp,
- struct bkey_buf *last_flushed)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- const struct bch_extent_ptr *ptr_c;
- struct bch_extent_ptr *ec_ptr = NULL;
- struct bch_extent_stripe_ptr stripe_ptr;
- struct bkey_i *n;
- int ret, dev, block;
-
- if (bp.v->level) {
- struct printbuf buf = PRINTBUF;
- struct btree_iter node_iter;
- struct btree *b;
-
- b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed);
- bch2_trans_iter_exit(trans, &node_iter);
-
- if (!b)
- return 0;
-
- prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
- bch2_bkey_val_to_text(&buf, c, bp.s_c);
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return bch_err_throw(c, erasure_coding_found_btree_node);
- }
-
- k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
- ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k) {
- /*
- * extent no longer exists - we could flush the btree
- * write buffer and retry to verify, but no need:
- */
- return 0;
- }
-
- if (extent_has_stripe_ptr(k, s->key.k.p.offset))
- goto out;
-
- ptr_c = bkey_matches_stripe(v, k, &block);
- /*
- * It doesn't generally make sense to erasure code cached ptrs:
- * XXX: should we be incrementing a counter?
- */
- if (!ptr_c || ptr_c->cached)
- goto out;
-
- dev = v->ptrs[block].dev;
-
- n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto out;
-
- bkey_reassemble(n, k);
-
- bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
- ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
- BUG_ON(!ec_ptr);
-
- stripe_ptr = (struct bch_extent_stripe_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
- .block = block,
- .redundancy = v->nr_redundant,
- .idx = s->key.k.p.offset,
- };
-
- __extent_entry_insert(n,
- (union bch_extent_entry *) ec_ptr,
- (union bch_extent_entry *) &stripe_ptr);
-
- ret = bch2_trans_update(trans, &iter, n, 0);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
- unsigned block)
-{
- struct bch_fs *c = trans->c;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_extent_ptr ptr = v->ptrs[block];
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
- if (!ca)
- return bch_err_throw(c, ENOENT_dev_not_found);
-
- struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, bucket_pos),
- bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc, ({
- if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
- break;
-
- if (bp_k.k->type != KEY_TYPE_backpointer)
- continue;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- if (bp.v->btree_id == BTREE_ID_stripes)
- continue;
-
- ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
- bp, &last_flushed);
- }));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch2_dev_put(ca);
- return ret;
-}
-
-static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
-
- int ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- for (unsigned i = 0; i < nr_data; i++) {
- ret = ec_stripe_update_bucket(trans, s, i);
- if (ret)
- break;
- }
-err:
- bch2_trans_put(trans);
- return ret;
-}
-
-static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
- struct ec_stripe_new *s,
- unsigned block,
- struct open_bucket *ob)
-{
- struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE,
- BCH_DEV_WRITE_REF_ec_bucket_zero);
- if (!ca) {
- s->err = bch_err_throw(c, erofs_no_writes);
- return;
- }
-
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
- memset(s->new_stripe.data[block] + (offset << 9),
- 0,
- ob->sectors_free << 9);
-
- int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
- ob->bucket * ca->mi.bucket_size + offset,
- ob->sectors_free,
- GFP_KERNEL, 0);
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero);
-
- if (ret)
- s->err = ret;
-}
-
-void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
-{
- if (s->idx)
- bch2_stripe_close(c, s);
- kfree(s);
-}
-
-/*
- * data buckets of new stripe all written: create the stripe
- */
-static void ec_stripe_create(struct ec_stripe_new *s)
-{
- struct bch_fs *c = s->c;
- struct open_bucket *ob;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
- int ret;
-
- BUG_ON(s->h->s == s);
-
- closure_sync(&s->iodone);
-
- if (!s->err) {
- for (i = 0; i < nr_data; i++)
- if (s->blocks[i]) {
- ob = c->open_buckets + s->blocks[i];
-
- if (ob->sectors_free)
- zero_out_rest_of_ec_bucket(c, s, i, ob);
- }
- }
-
- if (s->err) {
- if (!bch2_err_matches(s->err, EROFS))
- bch_err(c, "error creating stripe: error writing data buckets");
- ret = s->err;
- goto err;
- }
-
- if (s->have_existing_stripe) {
- ec_validate_checksums(c, &s->existing_stripe);
-
- if (ec_do_recov(c, &s->existing_stripe)) {
- bch_err(c, "error creating stripe: error reading existing stripe");
- ret = bch_err_throw(c, ec_block_read);
- goto err;
- }
-
- for (i = 0; i < nr_data; i++)
- if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
- swap(s->new_stripe.data[i],
- s->existing_stripe.data[i]);
-
- ec_stripe_buf_exit(&s->existing_stripe);
- }
-
- BUG_ON(!s->allocated);
- BUG_ON(!s->idx);
-
- ec_generate_ec(&s->new_stripe);
-
- ec_generate_checksums(&s->new_stripe);
-
- /* write p/q: */
- for (i = nr_data; i < v->nr_blocks; i++)
- ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
- closure_sync(&s->iodone);
-
- if (ec_nr_failed(&s->new_stripe)) {
- bch_err(c, "error creating stripe: error writing redundancy buckets");
- ret = bch_err_throw(c, ec_block_write);
- goto err;
- }
-
- ret = bch2_trans_commit_do(c, &s->res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_key_update(trans,
- s->have_existing_stripe
- ? bkey_i_to_stripe(&s->existing_stripe.key)
- : NULL,
- bkey_i_to_stripe(&s->new_stripe.key)));
- bch_err_msg(c, ret, "creating stripe key");
- if (ret) {
- goto err;
- }
-
- ret = ec_stripe_update_extents(c, &s->new_stripe);
- bch_err_msg(c, ret, "error updating extents");
- if (ret)
- goto err;
-err:
- trace_stripe_create(c, s->idx, ret);
-
- bch2_disk_reservation_put(c, &s->res);
-
- for (i = 0; i < v->nr_blocks; i++)
- if (s->blocks[i]) {
- ob = c->open_buckets + s->blocks[i];
-
- if (i < nr_data) {
- ob->ec = NULL;
- __bch2_open_bucket_put(c, ob);
- } else {
- bch2_open_bucket_put(c, ob);
- }
- }
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_del(&s->list);
- mutex_unlock(&c->ec_stripe_new_lock);
- wake_up(&c->ec_stripe_new_wait);
-
- ec_stripe_buf_exit(&s->existing_stripe);
- ec_stripe_buf_exit(&s->new_stripe);
- closure_debug_destroy(&s->iodone);
-
- ec_stripe_new_put(c, s, STRIPE_REF_stripe);
-}
-
-static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
-{
- struct ec_stripe_new *s;
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list)
- if (!atomic_read(&s->ref[STRIPE_REF_io]))
- goto out;
- s = NULL;
-out:
- mutex_unlock(&c->ec_stripe_new_lock);
-
- return s;
-}
-
-static void ec_stripe_create_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work,
- struct bch_fs, ec_stripe_create_work);
- struct ec_stripe_new *s;
-
- while ((s = get_pending_stripe(c)))
- ec_stripe_create(s);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
-}
-
-void bch2_ec_do_stripe_creates(struct bch_fs *c)
-{
- enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create);
-
- if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
-}
-
-static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct ec_stripe_new *s = h->s;
-
- lockdep_assert_held(&h->lock);
-
- BUG_ON(!s->allocated && !s->err);
-
- h->s = NULL;
- s->pending = true;
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_add(&s->list, &c->ec_stripe_new_list);
- mutex_unlock(&c->ec_stripe_new_lock);
-
- ec_stripe_new_put(c, s, STRIPE_REF_io);
-}
-
-static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
-{
- h->s->err = err;
- ec_stripe_new_set_pending(c, h);
-}
-
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
-{
- struct ec_stripe_new *s = ob->ec;
-
- s->err = err;
-}
-
-void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
-{
- struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
- if (!ob)
- return NULL;
-
- BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
-
- struct bch_dev *ca = ob_dev(c, ob);
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
-
- return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
-}
-
-static int unsigned_cmp(const void *_l, const void *_r)
-{
- unsigned l = *((const unsigned *) _l);
- unsigned r = *((const unsigned *) _r);
-
- return cmp_int(l, r);
-}
-
-/* pick most common bucket size: */
-static unsigned pick_blocksize(struct bch_fs *c,
- struct bch_devs_mask *devs)
-{
- unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
- struct {
- unsigned nr, size;
- } cur = { 0, 0 }, best = { 0, 0 };
-
- for_each_member_device_rcu(c, ca, devs)
- sizes[nr++] = ca->mi.bucket_size;
-
- sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
-
- for (unsigned i = 0; i < nr; i++) {
- if (sizes[i] != cur.size) {
- if (cur.nr > best.nr)
- best = cur;
-
- cur.nr = 0;
- cur.size = sizes[i];
- }
-
- cur.nr++;
- }
-
- if (cur.nr > best.nr)
- best = cur;
-
- return best.size;
-}
-
-static bool may_create_new_stripe(struct bch_fs *c)
-{
- return false;
-}
-
-static void ec_stripe_key_init(struct bch_fs *c,
- struct bkey_i *k,
- unsigned nr_data,
- unsigned nr_parity,
- unsigned stripe_size,
- unsigned disk_label)
-{
- struct bkey_i_stripe *s = bkey_stripe_init(k);
- unsigned u64s;
-
- s->v.sectors = cpu_to_le16(stripe_size);
- s->v.algorithm = 0;
- s->v.nr_blocks = nr_data + nr_parity;
- s->v.nr_redundant = nr_parity;
- s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
- s->v.csum_type = BCH_CSUM_crc32c;
- s->v.disk_label = disk_label;
-
- while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
- BUG_ON(1 << s->v.csum_granularity_bits >=
- le16_to_cpu(s->v.sectors) ||
- s->v.csum_granularity_bits == U8_MAX);
- s->v.csum_granularity_bits++;
- }
-
- set_bkey_val_u64s(&s->k, u64s);
-}
-
-static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct ec_stripe_new *s;
-
- lockdep_assert_held(&h->lock);
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return NULL;
-
- mutex_init(&s->lock);
- closure_init(&s->iodone, NULL);
- atomic_set(&s->ref[STRIPE_REF_stripe], 1);
- atomic_set(&s->ref[STRIPE_REF_io], 1);
- s->c = c;
- s->h = h;
- s->nr_data = min_t(unsigned, h->nr_active_devs,
- BCH_BKEY_PTRS_MAX) - h->redundancy;
- s->nr_parity = h->redundancy;
-
- ec_stripe_key_init(c, &s->new_stripe.key,
- s->nr_data, s->nr_parity,
- h->blocksize, h->disk_label);
- return s;
-}
-
-static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct bch_devs_mask devs = h->devs;
- unsigned nr_devs, nr_devs_with_durability;
-
- scoped_guard(rcu) {
- h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
- ? group_to_target(h->disk_label - 1)
- : 0);
- nr_devs = dev_mask_nr(&h->devs);
-
- for_each_member_device_rcu(c, ca, &h->devs)
- if (!ca->mi.durability)
- __clear_bit(ca->dev_idx, h->devs.d);
- nr_devs_with_durability = dev_mask_nr(&h->devs);
-
- h->blocksize = pick_blocksize(c, &h->devs);
-
- h->nr_active_devs = 0;
- for_each_member_device_rcu(c, ca, &h->devs)
- if (ca->mi.bucket_size == h->blocksize)
- h->nr_active_devs++;
- }
-
- /*
- * If we only have redundancy + 1 devices, we're better off with just
- * replication:
- */
- h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
-
- if (h->insufficient_devs) {
- const char *err;
-
- if (nr_devs < h->redundancy + 2)
- err = NULL;
- else if (nr_devs_with_durability < h->redundancy + 2)
- err = "cannot use durability=0 devices";
- else
- err = "mismatched bucket sizes";
-
- if (err)
- bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
- h->nr_active_devs, h->redundancy + 2, err);
- }
-
- struct bch_devs_mask devs_leaving;
- bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
-
- if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
- ec_stripe_new_cancel(c, h, -EINTR);
-
- h->rw_devs_change_count = c->rw_devs_change_count;
-}
-
-static struct ec_stripe_head *
-ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
- unsigned algo, unsigned redundancy,
- enum bch_watermark watermark)
-{
- struct ec_stripe_head *h;
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return NULL;
-
- mutex_init(&h->lock);
- BUG_ON(!mutex_trylock(&h->lock));
-
- h->disk_label = disk_label;
- h->algo = algo;
- h->redundancy = redundancy;
- h->watermark = watermark;
-
- list_add(&h->list, &c->ec_stripe_head_list);
- return h;
-}
-
-void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
-{
- if (h->s &&
- h->s->allocated &&
- bitmap_weight(h->s->blocks_allocated,
- h->s->nr_data) == h->s->nr_data)
- ec_stripe_new_set_pending(c, h);
-
- mutex_unlock(&h->lock);
-}
-
-static struct ec_stripe_head *
-__bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned disk_label,
- unsigned algo,
- unsigned redundancy,
- enum bch_watermark watermark)
-{
- struct bch_fs *c = trans->c;
- struct ec_stripe_head *h;
- int ret;
-
- if (!redundancy)
- return NULL;
-
- ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
- if (ret)
- return ERR_PTR(ret);
-
- if (test_bit(BCH_FS_going_ro, &c->flags)) {
- h = ERR_PTR(-BCH_ERR_erofs_no_writes);
- goto err;
- }
-
- list_for_each_entry(h, &c->ec_stripe_head_list, list)
- if (h->disk_label == disk_label &&
- h->algo == algo &&
- h->redundancy == redundancy &&
- h->watermark == watermark) {
- ret = bch2_trans_mutex_lock(trans, &h->lock);
- if (ret) {
- h = ERR_PTR(ret);
- goto err;
- }
- goto found;
- }
-
- h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
- if (!h) {
- h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
- goto err;
- }
-found:
- if (h->rw_devs_change_count != c->rw_devs_change_count)
- ec_stripe_head_devs_update(c, h);
-
- if (h->insufficient_devs) {
- mutex_unlock(&h->lock);
- h = NULL;
- }
-err:
- mutex_unlock(&c->ec_stripe_head_lock);
- return h;
-}
-
-static int new_stripe_alloc_buckets(struct btree_trans *trans,
- struct alloc_request *req,
- struct ec_stripe_head *h, struct ec_stripe_new *s,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct open_bucket *ob;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
- int ret = 0;
-
- req->scratch_data_type = req->data_type;
- req->scratch_ptrs = req->ptrs;
- req->scratch_nr_replicas = req->nr_replicas;
- req->scratch_nr_effective = req->nr_effective;
- req->scratch_have_cache = req->have_cache;
- req->scratch_devs_may_alloc = req->devs_may_alloc;
-
- req->devs_may_alloc = h->devs;
- req->have_cache = true;
-
- BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
- BUG_ON(v->nr_redundant != s->nr_parity);
-
- /* * We bypass the sector allocator which normally does this: */
- bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d,
- c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
-
- for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
- /*
- * Note: we don't yet repair invalid blocks (failed/removed
- * devices) when reusing stripes - we still need a codepath to
- * walk backpointers and update all extents that point to that
- * block when updating the stripe
- */
- if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
- __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d);
-
- if (i < s->nr_data)
- nr_have_data++;
- else
- nr_have_parity++;
- }
-
- BUG_ON(nr_have_data > s->nr_data);
- BUG_ON(nr_have_parity > s->nr_parity);
-
- req->ptrs.nr = 0;
- if (nr_have_parity < s->nr_parity) {
- req->nr_replicas = s->nr_parity;
- req->nr_effective = nr_have_parity;
- req->data_type = BCH_DATA_parity;
-
- ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
-
- open_bucket_for_each(c, &req->ptrs, ob, i) {
- j = find_next_zero_bit(s->blocks_gotten,
- s->nr_data + s->nr_parity,
- s->nr_data);
- BUG_ON(j >= s->nr_data + s->nr_parity);
-
- s->blocks[j] = req->ptrs.v[i];
- v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, s->blocks_gotten);
- }
-
- if (ret)
- goto err;
- }
-
- req->ptrs.nr = 0;
- if (nr_have_data < s->nr_data) {
- req->nr_replicas = s->nr_data;
- req->nr_effective = nr_have_data;
- req->data_type = BCH_DATA_user;
-
- ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
-
- open_bucket_for_each(c, &req->ptrs, ob, i) {
- j = find_next_zero_bit(s->blocks_gotten,
- s->nr_data, 0);
- BUG_ON(j >= s->nr_data);
-
- s->blocks[j] = req->ptrs.v[i];
- v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, s->blocks_gotten);
- }
-
- if (ret)
- goto err;
- }
-err:
- req->data_type = req->scratch_data_type;
- req->ptrs = req->scratch_ptrs;
- req->nr_replicas = req->scratch_nr_replicas;
- req->nr_effective = req->scratch_nr_effective;
- req->have_cache = req->scratch_have_cache;
- req->devs_may_alloc = req->scratch_devs_may_alloc;
- return ret;
-}
-
-static int __get_existing_stripe(struct btree_trans *trans,
- struct ec_stripe_head *head,
- struct ec_stripe_buf *stripe,
- u64 idx)
-{
- struct bch_fs *c = trans->c;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_stripes, POS(0, idx), 0);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- /* We expect write buffer races here */
- if (k.k->type != KEY_TYPE_stripe)
- goto out;
-
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- if (stripe_lru_pos(s.v) <= 1)
- goto out;
-
- if (s.v->disk_label == head->disk_label &&
- s.v->algorithm == head->algo &&
- s.v->nr_redundant == head->redundancy &&
- le16_to_cpu(s.v->sectors) == head->blocksize &&
- bch2_try_open_stripe(c, head->s, idx)) {
- bkey_reassemble(&stripe->key, k);
- ret = 1;
- }
-out:
- bch2_set_btree_iter_dontneed(trans, &iter);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
-{
- struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
- unsigned i;
-
- BUG_ON(existing_v->nr_redundant != s->nr_parity);
- s->nr_data = existing_v->nr_blocks -
- existing_v->nr_redundant;
-
- int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
- if (ret) {
- bch2_stripe_close(c, s);
- return ret;
- }
-
- BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
-
- /*
- * Free buckets we initially allocated - they might conflict with
- * blocks from the stripe we're reusing:
- */
- for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
- bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
- s->blocks[i] = 0;
- }
- memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
- memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
-
- for (unsigned i = 0; i < existing_v->nr_blocks; i++) {
- if (stripe_blockcount_get(existing_v, i)) {
- __set_bit(i, s->blocks_gotten);
- __set_bit(i, s->blocks_allocated);
- }
-
- ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
- }
-
- bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
- s->have_existing_stripe = true;
-
- return 0;
-}
-
-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h,
- struct ec_stripe_new *s)
-{
- struct bch_fs *c = trans->c;
-
- /*
- * If we can't allocate a new stripe, and there's no stripes with empty
- * blocks for us to reuse, that means we have to wait on copygc:
- */
- if (may_create_new_stripe(c))
- return -1;
-
- struct btree_iter lru_iter;
- struct bkey_s_c lru_k;
- int ret = 0;
-
- for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
- 0, lru_k, ret) {
- ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &lru_iter);
- if (!ret)
- ret = bch_err_throw(c, stripe_alloc_blocked);
- if (ret == 1)
- ret = 0;
- if (ret)
- return ret;
-
- return init_new_stripe_from_existing(c, s);
-}
-
-static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h,
- struct ec_stripe_new *s)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bpos min_pos = POS(0, 1);
- struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
- int ret;
-
- if (!s->res.sectors) {
- ret = bch2_disk_reservation_get(c, &s->res,
- h->blocksize,
- s->nr_parity,
- BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- return ret;
- }
-
- /*
- * Allocate stripe slot
- * XXX: we're going to need a bitrange btree of free stripes
- */
- for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
- BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
- if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
- if (start_pos.offset) {
- start_pos = min_pos;
- bch2_btree_iter_set_pos(trans, &iter, start_pos);
- continue;
- }
-
- ret = bch_err_throw(c, ENOSPC_stripe_create);
- break;
- }
-
- if (bkey_deleted(k.k) &&
- bch2_try_open_stripe(c, s, k.k->p.offset))
- break;
- }
-
- c->ec_stripe_hint = iter.pos.offset;
-
- if (ret)
- goto err;
-
- ret = ec_stripe_mem_alloc(trans, &iter);
- if (ret) {
- bch2_stripe_close(c, s);
- goto err;
- }
-
- s->new_stripe.key.k.p = iter.pos;
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-err:
- bch2_disk_reservation_put(c, &s->res);
- goto out;
-}
-
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
- struct alloc_request *req,
- unsigned algo,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- unsigned redundancy = req->nr_replicas - 1;
- unsigned disk_label = 0;
- struct target t = target_decode(req->target);
- bool waiting = false;
- int ret;
-
- if (t.type == TARGET_GROUP) {
- if (t.group > U8_MAX) {
- bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
- return NULL;
- }
- disk_label = t.group + 1; /* 0 == no label */
- }
-
- struct ec_stripe_head *h =
- __bch2_ec_stripe_head_get(trans, disk_label, algo,
- redundancy, req->watermark);
- if (IS_ERR_OR_NULL(h))
- return h;
-
- if (!h->s) {
- h->s = ec_new_stripe_alloc(c, h);
- if (!h->s) {
- ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc);
- bch_err(c, "failed to allocate new stripe");
- goto err;
- }
-
- h->nr_created++;
- }
-
- struct ec_stripe_new *s = h->s;
-
- if (s->allocated)
- goto allocated;
-
- if (s->have_existing_stripe)
- goto alloc_existing;
-
- /* First, try to allocate a full stripe: */
- enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
- swap(req->watermark, saved_watermark);
- ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h, s);
- swap(req->watermark, saved_watermark);
-
- if (!ret)
- goto allocate_buf;
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, ENOMEM))
- goto err;
-
- /*
- * Not enough buckets available for a full stripe: we must reuse an
- * existing stripe:
- */
- while (1) {
- ret = __bch2_ec_stripe_head_reuse(trans, h, s);
- if (!ret)
- break;
- if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
- goto err;
-
- if (req->watermark == BCH_WATERMARK_copygc) {
- ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h, s);
- if (ret)
- goto err;
- goto allocate_buf;
- }
-
- /* XXX freelist_wait? */
- closure_wait(&c->freelist_wait, cl);
- waiting = true;
- }
-
- if (waiting)
- closure_wake_up(&c->freelist_wait);
-alloc_existing:
- /*
- * Retry allocating buckets, with the watermark for this
- * particular write:
- */
- ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
- if (ret)
- goto err;
-
-allocate_buf:
- ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize);
- if (ret)
- goto err;
-
- s->allocated = true;
-allocated:
- BUG_ON(!s->idx);
- BUG_ON(!s->new_stripe.data[0]);
- BUG_ON(trans->restarted);
- return h;
-err:
- bch2_ec_stripe_head_put(c, h);
- return ERR_PTR(ret);
-}
-
-/* device removal */
-
-int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- unsigned dev_idx,
- unsigned flags)
-{
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
-
- struct bch_fs *c = trans->c;
- struct bkey_i_stripe *s =
- bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe);
- int ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- struct disk_accounting_pos acc;
-
- s64 sectors = 0;
- for (unsigned i = 0; i < s->v.nr_blocks; i++)
- sectors -= stripe_blockcount_get(&s->v, i);
-
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = BCH_DATA_user;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
- if (ret)
- return ret;
-
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
-
- /* XXX: how much redundancy do we still have? check degraded flags */
-
- unsigned nr_good = 0;
-
- scoped_guard(rcu)
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr->dev == dev_idx)
- ptr->dev = BCH_SB_MEMBER_INVALID;
-
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
- }
-
- if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
- return bch_err_throw(c, remove_would_lose_data);
-
- unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant;
-
- if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
- return bch_err_throw(c, remove_would_lose_data);
-
- sectors = -sectors;
-
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = BCH_DATA_user;
- return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
-}
-
-static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a,
- unsigned flags)
-{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
-
- if (!a->stripe)
- return 0;
-
- if (a->stripe_sectors) {
- struct bch_fs *c = trans->c;
- bch_err(c, "trying to invalidate device in stripe when bucket has stripe data");
- return bch_err_throw(c, invalidate_stripe_to_dev);
- }
-
- struct btree_iter iter;
- struct bkey_s_c_stripe s =
- bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
- BTREE_ITER_slots, stripe);
- int ret = bkey_err(s);
- if (ret)
- return ret;
-
- ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_max_commit(trans, iter,
- BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
- BTREE_ITER_intent, k,
- NULL, NULL, 0, ({
- bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags);
- })));
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* startup/shutdown */
-
-static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
-{
- struct ec_stripe_head *h;
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&c->ec_stripe_head_lock);
- list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- mutex_lock(&h->lock);
- if (!h->s)
- goto unlock;
-
- if (!ca)
- goto found;
-
- for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
- if (!h->s->blocks[i])
- continue;
-
- ob = c->open_buckets + h->s->blocks[i];
- if (ob->dev == ca->dev_idx)
- goto found;
- }
- goto unlock;
-found:
- ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
-unlock:
- mutex_unlock(&h->lock);
- }
- mutex_unlock(&c->ec_stripe_head_lock);
-}
-
-void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
-{
- __bch2_ec_stop(c, ca);
-}
-
-void bch2_fs_ec_stop(struct bch_fs *c)
-{
- __bch2_ec_stop(c, NULL);
-}
-
-static bool bch2_fs_ec_flush_done(struct bch_fs *c)
-{
- sched_annotate_sleep();
-
- mutex_lock(&c->ec_stripe_new_lock);
- bool ret = list_empty(&c->ec_stripe_new_list);
- mutex_unlock(&c->ec_stripe_new_lock);
-
- return ret;
-}
-
-void bch2_fs_ec_flush(struct bch_fs *c)
-{
- wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
-}
-
-int bch2_stripes_read(struct bch_fs *c)
-{
- return 0;
-}
-
-static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
- struct ec_stripe_new *s)
-{
- prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
- s->idx, s->nr_data, s->nr_parity,
- bitmap_weight(s->blocks_allocated, s->nr_data),
- atomic_read(&s->ref[STRIPE_REF_io]),
- atomic_read(&s->ref[STRIPE_REF_stripe]),
- bch2_watermarks[s->h->watermark]);
-
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i;
- for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
- prt_printf(out, " %u", s->blocks[i]);
- prt_newline(out);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
- prt_newline(out);
-}
-
-void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct ec_stripe_head *h;
- struct ec_stripe_new *s;
-
- mutex_lock(&c->ec_stripe_head_lock);
- list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
- h->disk_label, h->algo, h->redundancy,
- bch2_watermarks[h->watermark],
- h->nr_created);
-
- if (h->s)
- bch2_new_stripe_to_text(out, c, h->s);
- }
- mutex_unlock(&c->ec_stripe_head_lock);
-
- prt_printf(out, "in flight:\n");
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list)
- bch2_new_stripe_to_text(out, c, s);
- mutex_unlock(&c->ec_stripe_new_lock);
-}
-
-void bch2_fs_ec_exit(struct bch_fs *c)
-{
- struct ec_stripe_head *h;
- unsigned i;
-
- while (1) {
- mutex_lock(&c->ec_stripe_head_lock);
- h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list);
- mutex_unlock(&c->ec_stripe_head_lock);
-
- if (!h)
- break;
-
- if (h->s) {
- for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
- BUG_ON(h->s->blocks[i]);
-
- kfree(h->s);
- }
- kfree(h);
- }
-
- BUG_ON(!list_empty(&c->ec_stripe_new_list));
-
- bioset_exit(&c->ec_bioset);
-}
-
-void bch2_fs_ec_init_early(struct bch_fs *c)
-{
- spin_lock_init(&c->ec_stripes_new_lock);
-
- INIT_LIST_HEAD(&c->ec_stripe_head_list);
- mutex_init(&c->ec_stripe_head_lock);
-
- INIT_LIST_HEAD(&c->ec_stripe_new_list);
- mutex_init(&c->ec_stripe_new_lock);
- init_waitqueue_head(&c->ec_stripe_new_wait);
-
- INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
- INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
-}
-
-int bch2_fs_ec_init(struct bch_fs *c)
-{
- return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
- BIOSET_NEED_BVECS);
-}
-
-static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
- struct bkey_s_c k,
- struct bkey_buf *last_flushed)
-{
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
-
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- u64 lru_idx = stripe_lru_pos(s.v);
- if (lru_idx) {
- int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
- k.k->p.offset, lru_idx, k, last_flushed);
- if (ret)
- return ret;
- }
- return 0;
-}
-
-int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
- POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
deleted file mode 100644
index 548048adf0d573..00000000000000
--- a/fs/bcachefs/ec.h
+++ /dev/null
@@ -1,309 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_H
-#define _BCACHEFS_EC_H
-
-#include "ec_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-
-int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
- .key_validate = bch2_stripe_validate, \
- .val_to_text = bch2_stripe_to_text, \
- .swab = bch2_ptr_swab, \
- .trigger = bch2_trigger_stripe, \
- .min_val_size = 8, \
-})
-
-static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
-{
- return DIV_ROUND_UP(le16_to_cpu(s->sectors),
- 1 << s->csum_granularity_bits);
-}
-
-static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
- unsigned dev, unsigned csum_idx)
-{
- EBUG_ON(s->csum_type >= BCH_CSUM_NR);
-
- unsigned csum_bytes = bch_crc_bytes[s->csum_type];
-
- return sizeof(struct bch_stripe) +
- sizeof(struct bch_extent_ptr) * s->nr_blocks +
- (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
-}
-
-static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
- unsigned idx)
-{
- return stripe_csum_offset(s, s->nr_blocks, 0) +
- sizeof(u16) * idx;
-}
-
-static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
- unsigned idx)
-{
- return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
-}
-
-static inline void stripe_blockcount_set(struct bch_stripe *s,
- unsigned idx, unsigned v)
-{
- __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
-
- *p = cpu_to_le16(v);
-}
-
-static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
-{
- return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
- sizeof(u64));
-}
-
-static inline void *stripe_csum(struct bch_stripe *s,
- unsigned block, unsigned csum_idx)
-{
- EBUG_ON(block >= s->nr_blocks);
- EBUG_ON(csum_idx >= stripe_csums_per_device(s));
-
- return (void *) s + stripe_csum_offset(s, block, csum_idx);
-}
-
-static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
- unsigned block, unsigned csum_idx)
-{
- struct bch_csum csum = { 0 };
-
- memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
- return csum;
-}
-
-static inline void stripe_csum_set(struct bch_stripe *s,
- unsigned block, unsigned csum_idx,
- struct bch_csum csum)
-{
- memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
-}
-
-#define STRIPE_LRU_POS_EMPTY 1
-
-static inline u64 stripe_lru_pos(const struct bch_stripe *s)
-{
- if (!s)
- return 0;
-
- unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
-
- for (unsigned i = 0; i < nr_data; i++)
- blocks_empty += !stripe_blockcount_get(s, i);
-
- /* Will be picked up by the stripe_delete worker */
- if (blocks_empty == nr_data)
- return STRIPE_LRU_POS_EMPTY;
-
- if (!blocks_empty)
- return 0;
-
- /* invert: more blocks empty = reuse first */
- return LRU_TIME_MAX - blocks_empty;
-}
-
-static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
- const struct bch_extent_ptr *data_ptr,
- unsigned sectors)
-{
- return (data_ptr->dev == stripe_ptr->dev ||
- data_ptr->dev == BCH_SB_MEMBER_INVALID ||
- stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
- data_ptr->gen == stripe_ptr->gen &&
- data_ptr->offset >= stripe_ptr->offset &&
- data_ptr->offset < stripe_ptr->offset + sectors;
-}
-
-static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
- struct extent_ptr_decoded p)
-{
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
-
- BUG_ON(!p.has_ec);
-
- if (p.ec.block >= nr_data)
- return false;
-
- return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
- le16_to_cpu(s->sectors));
-}
-
-static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
- struct extent_ptr_decoded p)
-{
- unsigned nr_data = m->nr_blocks - m->nr_redundant;
-
- BUG_ON(!p.has_ec);
-
- if (p.ec.block >= nr_data)
- return false;
-
- return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
- m->sectors);
-}
-
-static inline void gc_stripe_unlock(struct gc_stripe *s)
-{
- BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
-
- clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock);
- smp_mb__after_atomic();
- wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);
-}
-
-static inline void gc_stripe_lock(struct gc_stripe *s)
-{
- wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR,
- TASK_UNINTERRUPTIBLE);
-}
-
-struct bch_read_bio;
-
-struct ec_stripe_buf {
- /* might not be buffering the entire stripe: */
- unsigned offset;
- unsigned size;
- unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-
- void *data[BCH_BKEY_PTRS_MAX];
-
- __BKEY_PADDED(key, 255);
-};
-
-struct ec_stripe_head;
-
-enum ec_stripe_ref {
- STRIPE_REF_io,
- STRIPE_REF_stripe,
- STRIPE_REF_NR
-};
-
-struct ec_stripe_new {
- struct bch_fs *c;
- struct ec_stripe_head *h;
- struct mutex lock;
- struct list_head list;
-
- struct hlist_node hash;
- u64 idx;
-
- struct closure iodone;
-
- atomic_t ref[STRIPE_REF_NR];
-
- int err;
-
- u8 nr_data;
- u8 nr_parity;
- bool allocated;
- bool pending;
- bool have_existing_stripe;
-
- unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
- unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
- open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
- struct disk_reservation res;
-
- struct ec_stripe_buf new_stripe;
- struct ec_stripe_buf existing_stripe;
-};
-
-struct ec_stripe_head {
- struct list_head list;
- struct mutex lock;
-
- unsigned disk_label;
- unsigned algo;
- unsigned redundancy;
- enum bch_watermark watermark;
- bool insufficient_devs;
-
- unsigned long rw_devs_change_count;
-
- u64 nr_created;
-
- struct bch_devs_mask devs;
- unsigned nr_active_devs;
-
- unsigned blocksize;
-
- struct dev_stripe_state block_stripe;
- struct dev_stripe_state parity_stripe;
-
- struct ec_stripe_new *s;
-};
-
-int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
-
-void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-
-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
-
-int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
-
-void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-
-struct alloc_request;
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
- struct alloc_request *, unsigned, struct closure *);
-
-void bch2_do_stripe_deletes(struct bch_fs *);
-void bch2_ec_do_stripe_creates(struct bch_fs *);
-void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
-
-static inline void ec_stripe_new_get(struct ec_stripe_new *s,
- enum ec_stripe_ref ref)
-{
- atomic_inc(&s->ref[ref]);
-}
-
-static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
- enum ec_stripe_ref ref)
-{
- BUG_ON(atomic_read(&s->ref[ref]) <= 0);
-
- if (atomic_dec_and_test(&s->ref[ref]))
- switch (ref) {
- case STRIPE_REF_stripe:
- bch2_ec_stripe_new_free(c, s);
- break;
- case STRIPE_REF_io:
- bch2_ec_do_stripe_creates(c);
- break;
- default:
- BUG();
- }
-}
-
-int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *,
- struct bkey_s_c, unsigned, unsigned);
-int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned);
-
-void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
-void bch2_fs_ec_stop(struct bch_fs *);
-void bch2_fs_ec_flush(struct bch_fs *);
-
-int bch2_stripes_read(struct bch_fs *);
-
-void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_ec_exit(struct bch_fs *);
-void bch2_fs_ec_init_early(struct bch_fs *);
-int bch2_fs_ec_init(struct bch_fs *);
-
-int bch2_check_stripe_to_lru_refs(struct bch_fs *);
-
-#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
deleted file mode 100644
index b9770f24f213c1..00000000000000
--- a/fs/bcachefs/ec_format.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_FORMAT_H
-#define _BCACHEFS_EC_FORMAT_H
-
-struct bch_stripe {
- struct bch_val v;
- __le16 sectors;
- __u8 algorithm;
- __u8 nr_blocks;
- __u8 nr_redundant;
-
- __u8 csum_granularity_bits;
- __u8 csum_type;
-
- /*
- * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
- *
- * we can manage with this because this only needs to point to a
- * disk label, not a target:
- */
- __u8 disk_label;
-
- /*
- * Variable length sections:
- * - Pointers
- * - Checksums
- * 2D array of [stripe block/device][csum block], with checksum block
- * size given by csum_granularity_bits
- * - Block sector counts: per-block array of u16s
- *
- * XXX:
- * Either checksums should have come last, or we should have included a
- * checksum_size field (the size in bytes of the checksum itself, not
- * the blocksize the checksum covers).
- *
- * Currently we aren't able to access the block sector counts if the
- * checksum type is unknown.
- */
-
- struct bch_extent_ptr ptrs[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
deleted file mode 100644
index 809446c789518b..00000000000000
--- a/fs/bcachefs/ec_types.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_TYPES_H
-#define _BCACHEFS_EC_TYPES_H
-
-#include "bcachefs_format.h"
-
-union bch_replicas_padded {
- u8 bytes[struct_size_t(struct bch_replicas_entry_v1,
- devs, BCH_BKEY_PTRS_MAX)];
- struct bch_replicas_entry_v1 e;
-};
-
-struct stripe {
- size_t heap_idx;
- u16 sectors;
- u8 algorithm;
- u8 nr_blocks;
- u8 nr_redundant;
- u8 blocks_nonempty;
- u8 disk_label;
-};
-
-struct gc_stripe {
- u8 lock;
- unsigned alive:1; /* does a corresponding key exist in stripes btree? */
- u16 sectors;
- u8 nr_blocks;
- u8 nr_redundant;
- u16 block_sectors[BCH_BKEY_PTRS_MAX];
- struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
-
- union bch_replicas_padded r;
-};
-
-#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c
deleted file mode 100644
index 56ab430f209f5e..00000000000000
--- a/fs/bcachefs/enumerated_ref.c
+++ /dev/null
@@ -1,144 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "enumerated_ref.h"
-#include "util.h"
-
-#include <linux/completion.h>
-
-#ifdef ENUMERATED_REF_DEBUG
-void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- atomic_long_inc(&ref->refs[idx]);
-}
-
-bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- return atomic_long_inc_not_zero(&ref->refs[idx]);
-}
-
-bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- return !ref->dying &&
- atomic_long_inc_not_zero(&ref->refs[idx]);
-}
-
-void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- long v = atomic_long_dec_return(&ref->refs[idx]);
-
- BUG_ON(v < 0);
- if (v)
- return;
-
- for (unsigned i = 0; i < ref->nr; i++)
- if (atomic_long_read(&ref->refs[i]))
- return;
-
- if (ref->stop_fn)
- ref->stop_fn(ref);
- complete(&ref->stop_complete);
-}
-#endif
-
-#ifndef ENUMERATED_REF_DEBUG
-static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref)
-{
- struct enumerated_ref *ref =
- container_of(percpu_ref, struct enumerated_ref, ref);
-
- if (ref->stop_fn)
- ref->stop_fn(ref);
- complete(&ref->stop_complete);
-}
-#endif
-
-void enumerated_ref_stop_async(struct enumerated_ref *ref)
-{
- reinit_completion(&ref->stop_complete);
-
-#ifndef ENUMERATED_REF_DEBUG
- percpu_ref_kill(&ref->ref);
-#else
- ref->dying = true;
- for (unsigned i = 0; i < ref->nr; i++)
- enumerated_ref_put(ref, i);
-#endif
-}
-
-void enumerated_ref_stop(struct enumerated_ref *ref,
- const char * const names[])
-{
- enumerated_ref_stop_async(ref);
- while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n");
- prt_str(&buf, "Outstanding refs:\n");
- enumerated_ref_to_text(&buf, ref, names);
- printk(KERN_ERR "%s", buf.buf);
- printbuf_exit(&buf);
- }
-}
-
-void enumerated_ref_start(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
- percpu_ref_reinit(&ref->ref);
-#else
- ref->dying = false;
- for (unsigned i = 0; i < ref->nr; i++) {
- BUG_ON(atomic_long_read(&ref->refs[i]));
- atomic_long_inc(&ref->refs[i]);
- }
-#endif
-}
-
-void enumerated_ref_exit(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
- percpu_ref_exit(&ref->ref);
-#else
- kfree(ref->refs);
- ref->refs = NULL;
- ref->nr = 0;
-#endif
-}
-
-int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr,
- void (*stop_fn)(struct enumerated_ref *))
-{
- init_completion(&ref->stop_complete);
- ref->stop_fn = stop_fn;
-
-#ifndef ENUMERATED_REF_DEBUG
- return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL);
-#else
- ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL);
- if (!ref->refs)
- return -ENOMEM;
-
- ref->nr = nr;
- return 0;
-#endif
-}
-
-void enumerated_ref_to_text(struct printbuf *out,
- struct enumerated_ref *ref,
- const char * const names[])
-{
-#ifdef ENUMERATED_REF_DEBUG
- bch2_printbuf_tabstop_push(out, 32);
-
- for (unsigned i = 0; i < ref->nr; i++)
- prt_printf(out, "%s\t%li\n", names[i],
- atomic_long_read(&ref->refs[i]));
-#else
- prt_str(out, "(not in debug mode)\n");
-#endif
-}
diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h
deleted file mode 100644
index ec01cf59ef8069..00000000000000
--- a/fs/bcachefs/enumerated_ref.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ENUMERATED_REF_H
-#define _BCACHEFS_ENUMERATED_REF_H
-
-#include "enumerated_ref_types.h"
-
-/*
- * A refcount where the users are enumerated: in debug mode, we create sepate
- * refcounts for each user, to make leaks and refcount errors easy to track
- * down:
- */
-
-#ifdef ENUMERATED_REF_DEBUG
-void enumerated_ref_get(struct enumerated_ref *, unsigned);
-bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned);
-bool enumerated_ref_tryget(struct enumerated_ref *, unsigned);
-void enumerated_ref_put(struct enumerated_ref *, unsigned);
-#else
-
-static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
-{
- percpu_ref_get(&ref->ref);
-}
-
-static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- return percpu_ref_tryget(&ref->ref);
-}
-
-static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- return percpu_ref_tryget_live(&ref->ref);
-}
-
-static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
-{
- percpu_ref_put(&ref->ref);
-}
-#endif
-
-static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
- return percpu_ref_is_zero(&ref->ref);
-#else
- for (unsigned i = 0; i < ref->nr; i++)
- if (atomic_long_read(&ref->refs[i]))
- return false;
- return true;
-#endif
-}
-
-void enumerated_ref_stop_async(struct enumerated_ref *);
-void enumerated_ref_stop(struct enumerated_ref *, const char * const[]);
-void enumerated_ref_start(struct enumerated_ref *);
-
-void enumerated_ref_exit(struct enumerated_ref *);
-int enumerated_ref_init(struct enumerated_ref *, unsigned,
- void (*stop_fn)(struct enumerated_ref *));
-
-struct printbuf;
-void enumerated_ref_to_text(struct printbuf *,
- struct enumerated_ref *,
- const char * const[]);
-
-#endif /* _BCACHEFS_ENUMERATED_REF_H */
diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h
deleted file mode 100644
index 0e6076f466d3d8..00000000000000
--- a/fs/bcachefs/enumerated_ref_types.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H
-#define _BCACHEFS_ENUMERATED_REF_TYPES_H
-
-#include <linux/percpu-refcount.h>
-
-struct enumerated_ref {
-#ifdef ENUMERATED_REF_DEBUG
- unsigned nr;
- bool dying;
- atomic_long_t *refs;
-#else
- struct percpu_ref ref;
-#endif
- void (*stop_fn)(struct enumerated_ref *);
- struct completion stop_complete;
-};
-
-#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
deleted file mode 100644
index c39cf304c68102..00000000000000
--- a/fs/bcachefs/errcode.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "errcode.h"
-#include "trace.h"
-
-#include <linux/errname.h>
-
-static const char * const bch2_errcode_strs[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
- BCH_ERRCODES()
-#undef x
- NULL
-};
-
-static const unsigned bch2_errcode_parents[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
- BCH_ERRCODES()
-#undef x
-};
-
-__attribute__((const))
-const char *bch2_err_str(int err)
-{
- const char *errstr;
-
- err = abs(err);
-
- BUG_ON(err >= BCH_ERR_MAX);
-
- if (err >= BCH_ERR_START)
- errstr = bch2_errcode_strs[err - BCH_ERR_START];
- else if (err)
- errstr = errname(err);
- else
- errstr = "(No error)";
- return errstr ?: "(Invalid error)";
-}
-
-__attribute__((const))
-bool __bch2_err_matches(int err, int class)
-{
- err = abs(err);
- class = abs(class);
-
- BUG_ON(err >= BCH_ERR_MAX);
- BUG_ON(class >= BCH_ERR_MAX);
-
- while (err >= BCH_ERR_START && err != class)
- err = bch2_errcode_parents[err - BCH_ERR_START];
-
- return err == class;
-}
-
-int __bch2_err_class(int bch_err)
-{
- int std_err = -bch_err;
- BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
-
- while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
- std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
-
- trace_error_downcast(bch_err, std_err, _RET_IP_);
-
- return -std_err;
-}
-
-const char *bch2_blk_status_to_str(blk_status_t status)
-{
- if (status == BLK_STS_REMOVED)
- return "device removed";
- return blk_status_to_str(status);
-}
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
deleted file mode 100644
index acc3b7b677041d..00000000000000
--- a/fs/bcachefs/errcode.h
+++ /dev/null
@@ -1,387 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERRCODE_H
-#define _BCACHEFS_ERRCODE_H
-
-#define BCH_ERRCODES() \
- x(ERANGE, ERANGE_option_too_small) \
- x(ERANGE, ERANGE_option_too_big) \
- x(EINVAL, injected) \
- x(BCH_ERR_injected, injected_fs_start) \
- x(EINVAL, mount_option) \
- x(BCH_ERR_mount_option, option_name) \
- x(BCH_ERR_mount_option, option_value) \
- x(BCH_ERR_mount_option, option_not_bool) \
- x(ENOMEM, ENOMEM_stripe_buf) \
- x(ENOMEM, ENOMEM_replicas_table) \
- x(ENOMEM, ENOMEM_cpu_replicas) \
- x(ENOMEM, ENOMEM_replicas_gc) \
- x(ENOMEM, ENOMEM_disk_groups_validate) \
- x(ENOMEM, ENOMEM_disk_groups_to_cpu) \
- x(ENOMEM, ENOMEM_mark_snapshot) \
- x(ENOMEM, ENOMEM_mark_stripe) \
- x(ENOMEM, ENOMEM_mark_stripe_ptr) \
- x(ENOMEM, ENOMEM_btree_key_cache_create) \
- x(ENOMEM, ENOMEM_btree_key_cache_fill) \
- x(ENOMEM, ENOMEM_btree_key_cache_insert) \
- x(ENOMEM, ENOMEM_trans_kmalloc) \
- x(ENOMEM, ENOMEM_trans_log_msg) \
- x(ENOMEM, ENOMEM_do_encrypt) \
- x(ENOMEM, ENOMEM_ec_read_extent) \
- x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \
- x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \
- x(ENOMEM, ENOMEM_fs_btree_cache_init) \
- x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \
- x(ENOMEM, ENOMEM_fs_counters_init) \
- x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \
- x(ENOMEM, ENOMEM_io_clock_init) \
- x(ENOMEM, ENOMEM_blacklist_table_init) \
- x(ENOMEM, ENOMEM_sb_realloc_injected) \
- x(ENOMEM, ENOMEM_sb_bio_realloc) \
- x(ENOMEM, ENOMEM_sb_buf_realloc) \
- x(ENOMEM, ENOMEM_sb_journal_validate) \
- x(ENOMEM, ENOMEM_sb_journal_v2_validate) \
- x(ENOMEM, ENOMEM_journal_entry_add) \
- x(ENOMEM, ENOMEM_journal_read_buf_realloc) \
- x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\
- x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \
- x(ENOMEM, ENOMEM_bio_read_init) \
- x(ENOMEM, ENOMEM_bio_read_split_init) \
- x(ENOMEM, ENOMEM_bio_write_init) \
- x(ENOMEM, ENOMEM_bio_bounce_pages_init) \
- x(ENOMEM, ENOMEM_writepage_bioset_init) \
- x(ENOMEM, ENOMEM_dio_read_bioset_init) \
- x(ENOMEM, ENOMEM_dio_write_bioset_init) \
- x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
- x(ENOMEM, ENOMEM_promote_table_init) \
- x(ENOMEM, ENOMEM_async_obj_init) \
- x(ENOMEM, ENOMEM_compression_bounce_read_init) \
- x(ENOMEM, ENOMEM_compression_bounce_write_init) \
- x(ENOMEM, ENOMEM_compression_workspace_init) \
- x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \
- x(EIO, compression_workspace_not_initialized) \
- x(ENOMEM, ENOMEM_bucket_gens) \
- x(ENOMEM, ENOMEM_buckets_nouse) \
- x(ENOMEM, ENOMEM_usage_init) \
- x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \
- x(ENOMEM, ENOMEM_btree_node_reclaim) \
- x(ENOMEM, ENOMEM_btree_node_mem_alloc) \
- x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \
- x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\
- x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \
- x(ENOMEM, ENOMEM_set_nr_journal_buckets) \
- x(ENOMEM, ENOMEM_dev_journal_init) \
- x(ENOMEM, ENOMEM_journal_pin_fifo) \
- x(ENOMEM, ENOMEM_journal_buf) \
- x(ENOMEM, ENOMEM_gc_start) \
- x(ENOMEM, ENOMEM_gc_alloc_start) \
- x(ENOMEM, ENOMEM_gc_reflink_start) \
- x(ENOMEM, ENOMEM_gc_gens) \
- x(ENOMEM, ENOMEM_gc_repair_key) \
- x(ENOMEM, ENOMEM_fsck_extent_ends_at) \
- x(ENOMEM, ENOMEM_fsck_add_nlink) \
- x(ENOMEM, ENOMEM_journal_key_insert) \
- x(ENOMEM, ENOMEM_journal_keys_sort) \
- x(ENOMEM, ENOMEM_read_superblock_clean) \
- x(ENOMEM, ENOMEM_fs_alloc) \
- x(ENOMEM, ENOMEM_fs_name_alloc) \
- x(ENOMEM, ENOMEM_fs_other_alloc) \
- x(ENOMEM, ENOMEM_dev_alloc) \
- x(ENOMEM, ENOMEM_disk_accounting) \
- x(ENOMEM, ENOMEM_stripe_head_alloc) \
- x(ENOMEM, ENOMEM_journal_read_bucket) \
- x(ENOSPC, ENOSPC_disk_reservation) \
- x(ENOSPC, ENOSPC_bucket_alloc) \
- x(ENOSPC, ENOSPC_disk_label_add) \
- x(ENOSPC, ENOSPC_stripe_create) \
- x(ENOSPC, ENOSPC_inode_create) \
- x(ENOSPC, ENOSPC_str_hash_create) \
- x(ENOSPC, ENOSPC_snapshot_create) \
- x(ENOSPC, ENOSPC_subvolume_create) \
- x(ENOSPC, ENOSPC_sb) \
- x(ENOSPC, ENOSPC_sb_journal) \
- x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \
- x(ENOSPC, ENOSPC_sb_quota) \
- x(ENOSPC, ENOSPC_sb_replicas) \
- x(ENOSPC, ENOSPC_sb_members) \
- x(ENOSPC, ENOSPC_sb_members_v2) \
- x(ENOSPC, ENOSPC_sb_crypt) \
- x(ENOSPC, ENOSPC_sb_downgrade) \
- x(ENOSPC, ENOSPC_btree_slot) \
- x(ENOSPC, ENOSPC_snapshot_tree) \
- x(ENOENT, ENOENT_bkey_type_mismatch) \
- x(ENOENT, ENOENT_str_hash_lookup) \
- x(ENOENT, ENOENT_str_hash_set_must_replace) \
- x(ENOENT, ENOENT_inode) \
- x(ENOENT, ENOENT_not_subvol) \
- x(ENOENT, ENOENT_not_directory) \
- x(ENOENT, ENOENT_directory_dead) \
- x(ENOENT, ENOENT_subvolume) \
- x(ENOENT, ENOENT_snapshot_tree) \
- x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
- x(ENOENT, ENOENT_dev_not_found) \
- x(ENOENT, ENOENT_dev_bucket_not_found) \
- x(ENOENT, ENOENT_dev_idx_not_found) \
- x(ENOENT, ENOENT_inode_no_backpointer) \
- x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
- x(ENOENT, btree_node_dying) \
- x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
- x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
- x(EEXIST, EEXIST_str_hash_set) \
- x(EEXIST, EEXIST_discard_in_flight_add) \
- x(EEXIST, EEXIST_subvolume_create) \
- x(ENOSPC, open_buckets_empty) \
- x(ENOSPC, freelist_empty) \
- x(BCH_ERR_freelist_empty, no_buckets_found) \
- x(0, transaction_restart) \
- x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
- x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
- x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
- x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
- x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\
- x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \
- x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
- x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
- x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
- x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
- x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
- x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
- x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
- x(BCH_ERR_transaction_restart, transaction_restart_nested) \
- x(BCH_ERR_transaction_restart, transaction_restart_commit) \
- x(0, no_btree_node) \
- x(BCH_ERR_no_btree_node, no_btree_node_relock) \
- x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
- x(BCH_ERR_no_btree_node, no_btree_node_drop) \
- x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \
- x(BCH_ERR_no_btree_node, no_btree_node_up) \
- x(BCH_ERR_no_btree_node, no_btree_node_down) \
- x(BCH_ERR_no_btree_node, no_btree_node_init) \
- x(BCH_ERR_no_btree_node, no_btree_node_cached) \
- x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \
- x(0, btree_insert_fail) \
- x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
- x(0, backpointer_to_overwritten_btree_node) \
- x(0, journal_reclaim_would_deadlock) \
- x(EINVAL, fsck) \
- x(BCH_ERR_fsck, fsck_ask) \
- x(BCH_ERR_fsck, fsck_fix) \
- x(BCH_ERR_fsck, fsck_delete_bkey) \
- x(BCH_ERR_fsck, fsck_ignore) \
- x(BCH_ERR_fsck, fsck_errors_not_fixed) \
- x(BCH_ERR_fsck, fsck_repair_unimplemented) \
- x(BCH_ERR_fsck, fsck_repair_impossible) \
- x(EINVAL, recovery_will_run) \
- x(BCH_ERR_recovery_will_run, restart_recovery) \
- x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \
- x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \
- x(0, data_update_done) \
- x(0, bkey_was_deleted) \
- x(BCH_ERR_data_update_done, data_update_done_would_block) \
- x(BCH_ERR_data_update_done, data_update_done_unwritten) \
- x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
- x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \
- x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \
- x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \
- x(EINVAL, device_state_not_allowed) \
- x(EINVAL, member_info_missing) \
- x(EINVAL, mismatched_block_size) \
- x(EINVAL, block_size_too_small) \
- x(EINVAL, bucket_size_too_small) \
- x(EINVAL, device_size_too_small) \
- x(EINVAL, device_size_too_big) \
- x(EINVAL, device_not_a_member_of_filesystem) \
- x(EINVAL, device_has_been_removed) \
- x(EINVAL, device_splitbrain) \
- x(EINVAL, device_already_online) \
- x(EINVAL, filesystem_uuid_already_open) \
- x(EINVAL, insufficient_devices_to_start) \
- x(EINVAL, invalid) \
- x(EINVAL, internal_fsck_err) \
- x(EINVAL, opt_parse_error) \
- x(EINVAL, remove_with_metadata_missing_unimplemented)\
- x(EINVAL, remove_would_lose_data) \
- x(EINVAL, no_resize_with_buckets_nouse) \
- x(EINVAL, inode_unpack_error) \
- x(EINVAL, inode_not_unlinked) \
- x(EINVAL, inode_has_child_snapshot) \
- x(EINVAL, varint_decode_error) \
- x(EINVAL, erasure_coding_found_btree_node) \
- x(EINVAL, 
option_negative) \ - x(EOPNOTSUPP, may_not_use_incompat_feature) \ - x(EROFS, erofs_trans_commit) \ - x(EROFS, erofs_no_writes) \ - x(EROFS, erofs_journal_err) \ - x(EROFS, erofs_sb_err) \ - x(EROFS, erofs_unfixed_errors) \ - x(EROFS, erofs_norecovery) \ - x(EROFS, erofs_nochanges) \ - x(EROFS, erofs_no_alloc_info) \ - x(EROFS, erofs_filesystem_full) \ - x(EROFS, insufficient_devices) \ - x(0, operation_blocked) \ - x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ - x(BCH_ERR_operation_blocked, journal_res_blocked) \ - x(BCH_ERR_journal_res_blocked, journal_blocked) \ - x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \ - x(BCH_ERR_journal_res_blocked, journal_max_open) \ - x(BCH_ERR_journal_res_blocked, journal_full) \ - x(BCH_ERR_journal_res_blocked, journal_pin_full) \ - x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ - x(BCH_ERR_journal_res_blocked, journal_stuck) \ - x(BCH_ERR_journal_res_blocked, journal_retry_open) \ - x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ - x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ - x(BCH_ERR_invalid, invalid_sb) \ - x(BCH_ERR_invalid_sb, invalid_sb_magic) \ - x(BCH_ERR_invalid_sb, invalid_sb_version) \ - x(BCH_ERR_invalid_sb, invalid_sb_features) \ - x(BCH_ERR_invalid_sb, invalid_sb_too_big) \ - x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \ - x(BCH_ERR_invalid_sb, invalid_sb_csum) \ - x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ - x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ - x(BCH_ERR_invalid_sb, invalid_sb_offset) \ - x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ - x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ - x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ - x(BCH_ERR_invalid_sb, invalid_sb_field_size) \ - x(BCH_ERR_invalid_sb, invalid_sb_layout) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \ - x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ - x(BCH_ERR_invalid_sb, invalid_sb_members) \ - x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ - x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ - x(BCH_ERR_invalid_sb, invalid_replicas_entry) \ - x(BCH_ERR_invalid_sb, invalid_sb_journal) \ - x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ - x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ - x(BCH_ERR_invalid_sb, invalid_sb_clean) \ - x(BCH_ERR_invalid_sb, invalid_sb_quota) \ - x(BCH_ERR_invalid_sb, invalid_sb_errors) \ - x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \ - x(BCH_ERR_invalid_sb, invalid_sb_ext) \ - x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ - x(BCH_ERR_invalid, invalid_bkey) \ - x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ - x(EROFS, journal_shutdown) \ - x(EIO, journal_flush_err) \ - x(EIO, journal_write_err) \ - x(EIO, btree_node_read_err) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ - x(EIO, sb_not_downgraded) \ - x(EIO, btree_node_write_all_failed) \ - x(EIO, btree_node_read_error) \ - x(EIO, btree_need_topology_repair) \ - x(EIO, bucket_ref_update) \ - x(EIO, trigger_alloc) \ - x(EIO, trigger_pointer) \ - x(EIO, trigger_stripe_pointer) \ - x(EIO, metadata_bucket_inconsistency) \ - x(EIO, mark_stripe) \ - x(EIO, stripe_reconstruct) \ - x(EIO, key_type_error) \ - x(EIO, extent_poisoned) \ - x(EIO, missing_indirect_extent) \ - x(EIO, invalidate_stripe_to_dev) \ - x(EIO, no_encryption_key) \ - x(EIO, insufficient_journal_devices) \ - 
x(EIO, device_offline) \ - x(EIO, EIO_fault_injected) \ - x(EIO, ec_block_read) \ - x(EIO, ec_block_write) \ - x(EIO, recompute_checksum) \ - x(EIO, decompress) \ - x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \ - x(BCH_ERR_decompress, decompress_lz4) \ - x(BCH_ERR_decompress, decompress_gzip) \ - x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \ - x(BCH_ERR_decompress, decompress_zstd) \ - x(EIO, data_write) \ - x(BCH_ERR_data_write, data_write_io) \ - x(BCH_ERR_data_write, data_write_csum) \ - x(BCH_ERR_data_write, data_write_invalid_ptr) \ - x(BCH_ERR_data_write, data_write_misaligned) \ - x(BCH_ERR_decompress, data_read) \ - x(BCH_ERR_data_read, no_device_to_read_from) \ - x(BCH_ERR_data_read, no_devices_valid) \ - x(BCH_ERR_data_read, data_read_io_err) \ - x(BCH_ERR_data_read, data_read_csum_err) \ - x(BCH_ERR_data_read, data_read_retry) \ - x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \ - x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\ - x(BCH_ERR_data_read, data_read_decompress_err) \ - x(BCH_ERR_data_read, data_read_decrypt_err) \ - x(BCH_ERR_data_read, data_read_ptr_stale_race) \ - x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ - x(BCH_ERR_data_read, data_read_no_encryption_key) \ - x(BCH_ERR_data_read, data_read_buffer_too_small) \ - x(BCH_ERR_data_read, data_read_key_overwritten) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \ - x(0, nopromote) \ - x(BCH_ERR_nopromote, nopromote_may_not) \ - x(BCH_ERR_nopromote, nopromote_already_promoted) \ - x(BCH_ERR_nopromote, nopromote_unwritten) \ - x(BCH_ERR_nopromote, nopromote_congested) \ - x(BCH_ERR_nopromote, nopromote_in_flight) \ - x(BCH_ERR_nopromote, nopromote_no_writes) \ - x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, invalid_snapshot_node) \ - x(0, option_needs_open_fs) \ - x(0, remove_disk_accounting_entry) - -enum bch_errcode { - BCH_ERR_START = 2048, -#define x(class, err) BCH_ERR_##err, - BCH_ERRCODES() -#undef x - BCH_ERR_MAX -}; - -__attribute__((const)) const char *bch2_err_str(int); - -__attribute__((const)) bool __bch2_err_matches(int, int); - -__attribute__((const)) -static inline bool _bch2_err_matches(int err, int class) -{ - return err < 0 && __bch2_err_matches(err, class); -} - -#define bch2_err_matches(_err, _class) \ -({ \ - BUILD_BUG_ON(!__builtin_constant_p(_class)); \ - unlikely(_bch2_err_matches(_err, _class)); \ -}) - -int __bch2_err_class(int); - -static inline long bch2_err_class(long err) -{ - return err < 0 ? 
__bch2_err_class(err) : err;
-}
-
-#define BLK_STS_REMOVED		((__force blk_status_t)128)
-
-#include <linux/blk_types.h>
-const char *bch2_blk_status_to_str(blk_status_t);
-
-#endif /* _BCACHEFS_ERRCODE_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
deleted file mode 100644
index 267e73d9d7e6ee..00000000000000
--- a/fs/bcachefs/error.c
+++ /dev/null
@@ -1,771 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "error.h"
-#include "journal.h"
-#include "namei.h"
-#include "recovery_passes.h"
-#include "super.h"
-#include "thread_with_file.h"
-
-#define FSCK_ERR_RATELIMIT_NR	10
-
-void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out)
-{
-	printbuf_indent_add_nextline(out, 2);
-
-#ifdef BCACHEFS_LOG_PREFIX
-	prt_printf(out, "bcachefs (%s): ", fs_or_dev_name);
-#endif
-}
-
-bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
-{
-	set_bit(BCH_FS_error, &c->flags);
-
-	switch (c->opts.errors) {
-	case BCH_ON_ERROR_continue:
-		return false;
-	case BCH_ON_ERROR_fix_safe:
-	case BCH_ON_ERROR_ro:
-		bch2_fs_emergency_read_only2(c, out);
-		return true;
-	case BCH_ON_ERROR_panic:
-		bch2_print_str(c, KERN_ERR, out->buf);
-		panic(bch2_fmt(c, "panic after error"));
-		return true;
-	default:
-		BUG();
-	}
-}
-
-bool bch2_inconsistent_error(struct bch_fs *c)
-{
-	struct printbuf buf = PRINTBUF;
-	buf.atomic++;
-
-	printbuf_indent_add_nextline(&buf, 2);
-
-	bool ret = __bch2_inconsistent_error(c, &buf);
-	if (ret)
-		bch_err(c, "%s", buf.buf);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-__printf(3, 0)
-static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans,
-				       const char *fmt, va_list args)
-{
-	struct printbuf buf = PRINTBUF;
-	buf.atomic++;
-
-	bch2_log_msg_start(c, &buf);
-
-	prt_vprintf(&buf, fmt, args);
-	prt_newline(&buf);
-
-	if (trans)
-		bch2_trans_updates_to_text(&buf, trans);
-	bool ret = __bch2_inconsistent_error(c, &buf);
-	bch2_print_str(c, KERN_ERR, buf.buf);
-
-	printbuf_exit(&buf);
-	return ret;
-}
-
-bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...)
-{
-	va_list args;
-	va_start(args, fmt);
-	bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args);
-	va_end(args);
-	return ret;
-}
-
-bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...)
-{
-	va_list args;
-	va_start(args, fmt);
-	bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args);
-	va_end(args);
-	return ret;
-}
-
-int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
-{
-	prt_printf(out, "btree topology error: ");
-
-	set_bit(BCH_FS_topology_error, &c->flags);
-	if (!test_bit(BCH_FS_in_recovery, &c->flags)) {
-		__bch2_inconsistent_error(c, out);
-		return bch_err_throw(c, btree_need_topology_repair);
-	} else {
-		return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
-			bch_err_throw(c, btree_need_topology_repair);
-	}
-}
-
-int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) 
-{ - struct printbuf buf = PRINTBUF; - - bch2_log_msg_start(c, &buf); - - va_list args; - va_start(args, fmt); - prt_vprintf(&buf, fmt, args); - va_end(args); - - int ret = __bch2_topology_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - - printbuf_exit(&buf); - return ret; -} - -void bch2_fatal_error(struct bch_fs *c) -{ - if (bch2_fs_emergency_read_only(c)) - bch_err(c, "fatal error - emergency read only"); -} - -void bch2_io_error_work(struct work_struct *work) -{ - struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); - struct bch_fs *c = ca->fs; - - /* XXX: if it's reads or checksums that are failing, set it to failed */ - - down_write(&c->state_lock); - unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); - - if (write_errors_start && - time_after(jiffies, - write_errors_start + c->opts.write_error_timeout * HZ)) { - if (ca->mi.state >= BCH_MEMBER_STATE_ro) - goto out; - - bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED); - struct printbuf buf = PRINTBUF; - __bch2_log_msg_start(ca->name, &buf); - - prt_printf(&buf, "writes erroring for %u seconds, setting %s ro", - c->opts.write_error_timeout, - dev ? "device" : "filesystem"); - if (!dev) - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } -out: - up_write(&c->state_lock); -} - -void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) -{ - atomic64_inc(&ca->errors[type]); - - if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) - ca->write_errors_start = jiffies; - - queue_work(system_long_wq, &ca->io_error_work); -} - -enum ask_yn { - YN_NO, - YN_YES, - YN_ALLNO, - YN_ALLYES, -}; - -static enum ask_yn parse_yn_response(char *buf) -{ - buf = strim(buf); - - if (strlen(buf) == 1) - switch (buf[0]) { - case 'n': - return YN_NO; - case 'y': - return YN_YES; - case 'N': - return YN_ALLNO; - case 'Y': - return YN_ALLYES; - } - return -1; -} - -#ifdef __KERNEL__ -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) -{ - struct stdio_redirect *stdio = c->stdio; - - if (c->stdio_filter && c->stdio_filter != current) - stdio = NULL; - - if (!stdio) - return YN_NO; - - if (trans) - bch2_trans_unlock(trans); - - unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0; - darray_char line = {}; - int ret; - - do { - unsigned long t; - bch2_print(c, " (y,n, or Y,N for all errors of this type) "); -rewait: - t = unlock_long_at - ? 
max_t(long, unlock_long_at - jiffies, 0) - : MAX_SCHEDULE_TIMEOUT; - - int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t); - if (r == -ETIME) { - bch2_trans_unlock_long(trans); - unlock_long_at = 0; - goto rewait; - } - - if (r < 0) { - ret = YN_NO; - break; - } - - darray_last(line) = '\0'; - } while ((ret = parse_yn_response(line.data)) < 0); - - darray_exit(&line); - return ret; -} -#else - -#include "tools-util.h" - -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) -{ - char *buf = NULL; - size_t buflen = 0; - int ret; - - do { - fputs(" (y,n, or Y,N for all errors of this type) ", stdout); - fflush(stdout); - - if (getline(&buf, &buflen, stdin) < 0) - die("error reading from standard input"); - } while ((ret = parse_yn_response(buf)) < 0); - - free(buf); - return ret; -} - -#endif - -static struct fsck_err_state *fsck_err_get(struct bch_fs *c, - enum bch_sb_error_id id) -{ - struct fsck_err_state *s; - - list_for_each_entry(s, &c->fsck_error_msgs, list) - if (s->id == id) { - /* - * move it to the head of the list: repeated fsck errors - * are common - */ - list_move(&s->list, &c->fsck_error_msgs); - return s; - } - - s = kzalloc(sizeof(*s), GFP_NOFS); - if (!s) { - if (!c->fsck_alloc_msgs_err) - bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); - c->fsck_alloc_msgs_err = true; - return NULL; - } - - INIT_LIST_HEAD(&s->list); - s->id = id; - list_add(&s->list, &c->fsck_error_msgs); - return s; -} - -/* s/fix?/fixing/ s/recreate?/recreating/ */ -static void prt_actioning(struct printbuf *out, const char *action) -{ - unsigned len = strlen(action); - - BUG_ON(action[len - 1] != '?'); - --len; - - if (action[len - 1] == 'e') - --len; - - prt_bytes(out, action, len); - prt_str(out, "ing"); -} - -static const u8 fsck_flags_extra[] = { -#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags, - BCH_SB_ERRS() -#undef x -}; - -static int do_fsck_ask_yn(struct bch_fs *c, - struct btree_trans *trans, - struct printbuf *question, - const char *action) -{ - prt_str(question, ", "); - prt_str(question, action); - - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", question->buf); - else - bch2_print_str(c, KERN_ERR, question->buf); - - int ask = bch2_fsck_ask_yn(c, trans); - - if (trans) { - int ret = bch2_trans_relock(trans); - if (ret) - return ret; - } - - return ask; -} - -static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, - enum bch_sb_error_id id, const char *msg, - bool *repeat, bool *print, bool *suppress) -{ - bch2_sb_error_count(c, id); - - struct fsck_err_state *s = fsck_err_get(c, id); - if (s) { - /* - * We may be called multiple times for the same error on - * transaction restart - this memoizes instead of asking the user - * multiple times for the same error: - */ - if (s->last_msg && !strcmp(msg, s->last_msg)) { - *repeat = true; - *print = false; - return s; - } - - kfree(s->last_msg); - s->last_msg = kstrdup(msg, GFP_KERNEL); - - if (c->opts.ratelimit_errors && - s->nr >= FSCK_ERR_RATELIMIT_NR) { - if (s->nr == FSCK_ERR_RATELIMIT_NR) - *suppress = true; - else - *print = false; - } - - s->nr++; - } - return s; -} - -bool __bch2_count_fsck_err(struct bch_fs *c, - enum bch_sb_error_id id, struct printbuf *msg) -{ - bch2_sb_error_count(c, id); - - mutex_lock(&c->fsck_error_msgs_lock); - bool print = true, repeat = false, suppress = false; - - count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress); - mutex_unlock(&c->fsck_error_msgs_lock); - - if (suppress) - prt_printf(msg, "Ratelimiting new instances of 
previous error\n"); - - return print && !repeat; -} - -int bch2_fsck_err_opt(struct bch_fs *c, - enum bch_fsck_flags flags, - enum bch_sb_error_id err) -{ - if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) - flags |= fsck_flags_extra[err]; - - if (test_bit(BCH_FS_in_fsck, &c->flags)) { - if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) - return bch_err_throw(c, fsck_repair_unimplemented); - - switch (c->opts.fix_errors) { - case FSCK_FIX_exit: - return bch_err_throw(c, fsck_errors_not_fixed); - case FSCK_FIX_yes: - if (flags & FSCK_CAN_FIX) - return bch_err_throw(c, fsck_fix); - fallthrough; - case FSCK_FIX_no: - if (flags & FSCK_CAN_IGNORE) - return bch_err_throw(c, fsck_ignore); - return bch_err_throw(c, fsck_errors_not_fixed); - case FSCK_FIX_ask: - if (flags & FSCK_AUTOFIX) - return bch_err_throw(c, fsck_fix); - return bch_err_throw(c, fsck_ask); - default: - BUG(); - } - } else { - if ((flags & FSCK_AUTOFIX) && - (c->opts.errors == BCH_ON_ERROR_continue || - c->opts.errors == BCH_ON_ERROR_fix_safe)) - return bch_err_throw(c, fsck_fix); - - if (c->opts.errors == BCH_ON_ERROR_continue && - (flags & FSCK_CAN_IGNORE)) - return bch_err_throw(c, fsck_ignore); - return bch_err_throw(c, fsck_errors_not_fixed); - } -} - -int __bch2_fsck_err(struct bch_fs *c, - struct btree_trans *trans, - enum bch_fsck_flags flags, - enum bch_sb_error_id err, - const char *fmt, ...) -{ - va_list args; - struct printbuf buf = PRINTBUF, *out = &buf; - int ret = 0; - const char *action_orig = "fix?", *action = action_orig; - - might_sleep(); - - if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) - flags |= fsck_flags_extra[err]; - - if (!c) - c = trans->c; - - /* - * Ugly: if there's a transaction in the current task it has to be - * passed in to unlock if we prompt for user input. - * - * But, plumbing a transaction and transaction restarts into - * bkey_validate() is problematic. - * - * So: - * - make all bkey errors AUTOFIX, they're simple anyways (we just - * delete the key) - * - and we don't need to warn if we're not prompting - */ - WARN_ON((flags & FSCK_CAN_FIX) && - !(flags & FSCK_AUTOFIX) && - !trans && - bch2_current_has_btree_trans(c)); - - if (test_bit(err, c->sb.errors_silent)) - return flags & FSCK_CAN_FIX - ? bch_err_throw(c, fsck_fix) - : bch_err_throw(c, fsck_ignore); - - printbuf_indent_add_nextline(out, 2); - -#ifdef BCACHEFS_LOG_PREFIX - if (strncmp(fmt, "bcachefs", 8)) - prt_printf(out, bch2_log_msg(c, "")); -#endif - - va_start(args, fmt); - prt_vprintf(out, fmt, args); - va_end(args); - - /* Custom fix/continue/recreate/etc.? 
*/ - if (out->buf[out->pos - 1] == '?') { - const char *p = strrchr(out->buf, ','); - if (p) { - out->pos = p - out->buf; - action = kstrdup(p + 2, GFP_KERNEL); - if (!action) { - ret = -ENOMEM; - goto err; - } - } - } - - mutex_lock(&c->fsck_error_msgs_lock); - bool repeat = false, print = true, suppress = false; - bool inconsistent = false, exiting = false; - struct fsck_err_state *s = - count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress); - if (repeat) { - ret = s->ret; - goto err_unlock; - } - - if ((flags & FSCK_AUTOFIX) && - (c->opts.errors == BCH_ON_ERROR_continue || - c->opts.errors == BCH_ON_ERROR_fix_safe)) { - prt_str(out, ", "); - if (flags & FSCK_CAN_FIX) { - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_fix); - } else { - prt_str(out, ", continuing"); - ret = bch_err_throw(c, fsck_ignore); - } - - goto print; - } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) { - if (c->opts.errors != BCH_ON_ERROR_continue || - !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { - prt_str_indented(out, ", shutting down\n" - "error not marked as autofix and not in fsck\n" - "run fsck, and forward to devs so error can be marked for self-healing"); - inconsistent = true; - print = true; - ret = bch_err_throw(c, fsck_errors_not_fixed); - } else if (flags & FSCK_CAN_FIX) { - prt_str(out, ", "); - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_fix); - } else { - prt_str(out, ", continuing"); - ret = bch_err_throw(c, fsck_ignore); - } - } else if (c->opts.fix_errors == FSCK_FIX_exit) { - prt_str(out, ", exiting"); - ret = bch_err_throw(c, fsck_errors_not_fixed); - } else if (flags & FSCK_CAN_FIX) { - int fix = s && s->fix - ? s->fix - : c->opts.fix_errors; - - if (fix == FSCK_FIX_ask) { - print = false; - - ret = do_fsck_ask_yn(c, trans, out, action); - if (ret < 0) - goto err_unlock; - - if (ret >= YN_ALLNO && s) - s->fix = ret == YN_ALLNO - ? FSCK_FIX_no - : FSCK_FIX_yes; - - ret = ret & 1 - ? 
bch_err_throw(c, fsck_fix) - : bch_err_throw(c, fsck_ignore); - } else if (fix == FSCK_FIX_yes || - (c->opts.nochanges && - !(flags & FSCK_CAN_IGNORE))) { - prt_str(out, ", "); - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_fix); - } else { - prt_str(out, ", not "); - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_ignore); - } - } else { - if (flags & FSCK_CAN_IGNORE) { - prt_str(out, ", continuing"); - ret = bch_err_throw(c, fsck_ignore); - } else { - prt_str(out, " (repair unimplemented)"); - ret = bch_err_throw(c, fsck_repair_unimplemented); - } - } - - if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) && - (c->opts.fix_errors == FSCK_FIX_exit || - !(flags & FSCK_CAN_IGNORE))) - ret = bch_err_throw(c, fsck_errors_not_fixed); - - if (test_bit(BCH_FS_in_fsck, &c->flags) && - (!bch2_err_matches(ret, BCH_ERR_fsck_fix) && - !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) { - exiting = true; - print = true; - } -print: - prt_newline(out); - - if (inconsistent) - __bch2_inconsistent_error(c, out); - else if (exiting) - prt_printf(out, "Unable to continue, halting\n"); - else if (suppress) - prt_printf(out, "Ratelimiting new instances of previous error\n"); - - if (print) { - /* possibly strip an empty line, from printbuf_indent_add */ - while (out->pos && out->buf[out->pos - 1] == ' ') - --out->pos; - printbuf_nul_terminate(out); - - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", out->buf); - else - bch2_print_str(c, KERN_ERR, out->buf); - } - - if (s) - s->ret = ret; - - if (trans && - !(flags & FSCK_ERR_NO_LOG) && - ret == -BCH_ERR_fsck_fix) - ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; -err_unlock: - mutex_unlock(&c->fsck_error_msgs_lock); -err: - /* - * We don't yet track whether the filesystem currently has errors, for - * log_fsck_err()s: that would require us to track for every error type - * which recovery pass corrects it, to get the fsck exit status correct: - */ - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - /* nothing */ - } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); - } - - if (action != action_orig) - kfree(action); - printbuf_exit(&buf); - - BUG_ON(!ret); - return ret; -} - -static const char * const bch2_bkey_validate_contexts[] = { -#define x(n) #n, - BKEY_VALIDATE_CONTEXTS() -#undef x - NULL -}; - -int __bch2_bkey_fsck_err(struct bch_fs *c, - struct bkey_s_c k, - struct bkey_validate_context from, - enum bch_sb_error_id err, - const char *fmt, ...) 
-{
-	if (from.flags & BCH_VALIDATE_silent)
-		return bch_err_throw(c, fsck_delete_bkey);
-
-	unsigned fsck_flags = 0;
-	if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
-		if (test_bit(err, c->sb.errors_silent))
-			return bch_err_throw(c, fsck_delete_bkey);
-
-		fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
-	}
-	if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
-		fsck_flags |= fsck_flags_extra[err];
-
-	struct printbuf buf = PRINTBUF;
-	prt_printf(&buf, "invalid bkey in %s",
-		   bch2_bkey_validate_contexts[from.from]);
-
-	if (from.from == BKEY_VALIDATE_journal)
-		prt_printf(&buf, " journal seq=%llu offset=%u",
-			   from.journal_seq, from.journal_offset);
-
-	prt_str(&buf, " btree=");
-	bch2_btree_id_to_text(&buf, from.btree);
-	prt_printf(&buf, " level=%u: ", from.level);
-
-	bch2_bkey_val_to_text(&buf, c, k);
-	prt_newline(&buf);
-
-	va_list args;
-	va_start(args, fmt);
-	prt_vprintf(&buf, fmt, args);
-	va_end(args);
-
-	int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print)
-{
-	struct fsck_err_state *s, *n;
-
-	mutex_lock(&c->fsck_error_msgs_lock);
-
-	list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
-		if (print && s->ratelimited && s->last_msg)
-			bch_err(c, "Saw %llu errors like:\n  %s", s->nr, s->last_msg);
-
-		list_del(&s->list);
-		kfree(s->last_msg);
-		kfree(s);
-	}
-
-	mutex_unlock(&c->fsck_error_msgs_lock);
-}
-
-void bch2_flush_fsck_errs(struct bch_fs *c)
-{
-	__bch2_flush_fsck_errs(c, true);
-}
-
-void bch2_free_fsck_errs(struct bch_fs *c)
-{
-	__bch2_flush_fsck_errs(c, false);
-}
-
-int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
-				   subvol_inum inum, u64 offset)
-{
-	u32 restart_count = trans->restart_count;
-	int ret = 0;
-
-	if (inum.subvol) {
-		ret = bch2_inum_to_path(trans, inum, out);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			return ret;
-	}
-	if (!inum.subvol || ret)
-		prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
-	prt_printf(out, " offset %llu: ", offset);
-
-	return trans_was_restarted(trans, restart_count);
-}
-
-void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
-			      subvol_inum inum, u64 offset)
-{
-	bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
-}
-
-int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
-					struct bpos pos)
-{
-	int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out);
-	if (ret)
-		return ret;
-
-	prt_printf(out, " offset %llu: ", pos.offset << 8);
-	return 0;
-}
-
-void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
-				   struct bpos pos)
-{
-	bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
-}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
deleted file mode 100644
index 0c3c3a24fc6f6d..00000000000000
--- a/fs/bcachefs/error.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERROR_H
-#define _BCACHEFS_ERROR_H
-
-#include <linux/list.h>
-#include <linux/printk.h>
-#include "bkey_types.h"
-#include "sb-errors.h"
-
-struct bch_dev;
-struct bch_fs;
-struct work_struct;
-
-/*
- * XXX: separate out errors that indicate on disk data is inconsistent, and flag
- * superblock as such
- */
-
-/* Error messages: */
-
-void __bch2_log_msg_start(const char *, struct printbuf *);
-
-static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out)
-{
-	__bch2_log_msg_start(c->name, 
out); -} - -/* - * Inconsistency errors: The on disk data is inconsistent. If these occur during - * initial recovery, they don't indicate a bug in the running code - we walk all - * the metadata before modifying anything. If they occur at runtime, they - * indicate either a bug in the running code or (less likely) data is being - * silently corrupted under us. - * - * XXX: audit all inconsistent errors and make sure they're all recoverable, in - * BCH_ON_ERROR_CONTINUE mode - */ - -bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *); -bool bch2_inconsistent_error(struct bch_fs *); -__printf(2, 3) -bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...); - -#define bch2_fs_inconsistent_on(cond, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - if (_ret) \ - bch2_fs_inconsistent(__VA_ARGS__); \ - _ret; \ -}) - -__printf(2, 3) -bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...); - -#define bch2_trans_inconsistent_on(cond, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - if (_ret) \ - bch2_trans_inconsistent(__VA_ARGS__); \ - _ret; \ -}) - -int __bch2_topology_error(struct bch_fs *, struct printbuf *); -__printf(2, 3) -int bch2_fs_topology_error(struct bch_fs *, const char *, ...); - -/* - * Fsck errors: inconsistency errors we detect at mount time, and should ideally - * be able to repair: - */ - -struct fsck_err_state { - struct list_head list; - enum bch_sb_error_id id; - u64 nr; - bool ratelimited; - int ret; - int fix; - char *last_msg; -}; - -#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) - -bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *); -#define bch2_count_fsck_err(_c, _err, ...) \ - __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) - -int bch2_fsck_err_opt(struct bch_fs *, - enum bch_fsck_flags, - enum bch_sb_error_id); - -__printf(5, 6) __cold -int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, - enum bch_fsck_flags, - enum bch_sb_error_id, - const char *, ...); -#define bch2_fsck_err(c, _flags, _err_type, ...) \ - __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\ - type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\ - _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__) - -void bch2_flush_fsck_errs(struct bch_fs *); -void bch2_free_fsck_errs(struct bch_fs *); - -#define fsck_err_wrap(_do) \ -({ \ - int _ret = _do; \ - if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ - !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \ - ret = _ret; \ - goto fsck_err; \ - } \ - \ - bch2_err_matches(_ret, BCH_ERR_fsck_fix); \ -}) - -#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) - -/* These macros return true if error should be fixed: */ - -/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ - -#define __fsck_err_on(cond, c, _flags, _err_type, ...) \ -({ \ - might_sleep(); \ - \ - if (type_is(c, struct bch_fs *)) \ - WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\ - \ - (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ -}) - -#define mustfix_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) - -#define mustfix_fsck_err_on(cond, c, _err_type, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) - -#define fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) - -#define fsck_err_on(cond, c, _err_type, ...) 
\ - __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) - -#define log_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) - -#define log_fsck_err_on(cond, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - if (_ret) \ - log_fsck_err(__VA_ARGS__); \ - _ret; \ -}) - -enum bch_validate_flags; -__printf(5, 6) -int __bch2_bkey_fsck_err(struct bch_fs *, - struct bkey_s_c, - struct bkey_validate_context from, - enum bch_sb_error_id, - const char *, ...); - -/* - * for now, bkey fsck errors are always handled by deleting the entire key - - * this will change at some point - */ -#define bkey_fsck_err(c, _err_type, _err_msg, ...) \ -do { \ - int _ret = __bch2_bkey_fsck_err(c, k, from, \ - BCH_FSCK_ERR_##_err_type, \ - _err_msg, ##__VA_ARGS__); \ - if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ - !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \ - ret = _ret; \ - ret = bch_err_throw(c, fsck_delete_bkey); \ - goto fsck_err; \ -} while (0) - -#define bkey_fsck_err_on(cond, ...) \ -do { \ - if (unlikely(cond)) \ - bkey_fsck_err(__VA_ARGS__); \ -} while (0) - -/* - * Fatal errors: these don't indicate a bug, but we can't continue running in RW - * mode - pretty much just due to metadata IO errors: - */ - -void bch2_fatal_error(struct bch_fs *); - -#define bch2_fs_fatal_error(c, _msg, ...) \ -do { \ - bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \ - bch2_fatal_error(c); \ -} while (0) - -#define bch2_fs_fatal_err_on(cond, c, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - \ - if (_ret) \ - bch2_fs_fatal_error(c, __VA_ARGS__); \ - _ret; \ -}) - -/* - * IO errors: either recoverable metadata IO (because we have replicas), or data - * IO - we need to log it and print out a message, but we don't (necessarily) - * want to shut down the fs: - */ - -void bch2_io_error_work(struct work_struct *); - -/* Does the error handling without logging a message */ -void bch2_io_error(struct bch_dev *, enum bch_member_error_type); - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - -static inline void bch2_account_io_success_fail(struct bch_dev *ca, - enum bch_member_error_type type, - bool success) -{ - if (likely(success)) { - if (type == BCH_MEMBER_ERROR_write && - ca->write_errors_start) - ca->write_errors_start = 0; - } else { - bch2_io_error(ca, type); - } -} - -static inline void bch2_account_io_completion(struct bch_dev *ca, - enum bch_member_error_type type, - u64 submit_time, bool success) -{ - if (unlikely(!ca)) - return; - - if (type != BCH_MEMBER_ERROR_checksum) - bch2_latency_acct(ca, submit_time, type); - - bch2_account_io_success_fail(ca, type, success); -} - -int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); - -void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); - -int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); -void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos); - -#endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c deleted file mode 100644 index e76e58a568bffc..00000000000000 --- a/fs/bcachefs/extent_update.c +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include 
"buckets.h" -#include "debug.h" -#include "extents.h" -#include "extent_update.h" - -/* - * This counts the number of iterators to the alloc & ec btrees we'll need - * inserting/removing this extent: - */ -static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - unsigned ret = 0, lru = 0; - - bkey_extent_entry_for_each(ptrs, entry) { - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - /* Might also be updating LRU btree */ - if (entry->ptr.cached) - lru++; - - fallthrough; - case BCH_EXTENT_ENTRY_stripe_ptr: - ret++; - } - } - - /* - * Updating keys in the alloc btree may also update keys in the - * freespace or discard btrees: - */ - return lru + ret * 2; -} - -#define EXTENT_ITERS_MAX 64 - -static int count_iters_for_insert(struct btree_trans *trans, - struct bkey_s_c k, - unsigned offset, - struct bpos *end, - unsigned *nr_iters) -{ - int ret = 0, ret2 = 0; - - if (*nr_iters >= EXTENT_ITERS_MAX) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } - - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - - if (*nr_iters >= EXTENT_ITERS_MAX) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } - - break; - case KEY_TYPE_reflink_p: { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = REFLINK_P_IDX(p.v); - unsigned sectors = bpos_min(*end, p.k->p).offset - - bkey_start_offset(p.k); - struct btree_iter iter; - struct bkey_s_c r_k; - - for_each_btree_key_norestart(trans, iter, - BTREE_ID_reflink, POS(0, idx + offset), - BTREE_ITER_slots, r_k, ret2) { - if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) - break; - - /* extent_update_to_keys(), for the reflink_v update */ - *nr_iters += 1; - - *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - - if (*nr_iters >= EXTENT_ITERS_MAX) { - struct bpos pos = bkey_start_pos(k.k); - pos.offset += min_t(u64, k.k->size, - r_k.k->p.offset - idx); - - *end = bpos_min(*end, pos); - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - break; - } - } - - return ret2 ?: ret; -} - -int bch2_extent_atomic_end(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos *end) -{ - unsigned nr_iters = 0; - - struct btree_iter copy; - bch2_trans_copy_iter(trans, ©, iter); - - int ret = bch2_btree_iter_traverse(trans, ©); - if (ret) - goto err; - - struct bkey_s_c k; - for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) { - unsigned offset = 0; - - if (bkey_gt(iter->pos, bkey_start_pos(k.k))) - offset = iter->pos.offset - bkey_start_offset(k.k); - - ret = count_iters_for_insert(trans, k, offset, end, &nr_iters); - if (ret) - break; - } -err: - bch2_trans_iter_exit(trans, ©); - return ret < 0 ? 
ret : 0; -} - -int bch2_extent_trim_atomic(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *k) -{ - struct bpos end = k->k.p; - int ret = bch2_extent_atomic_end(trans, iter, &end); - if (ret) - return ret; - - /* tracepoint */ - - if (bpos_lt(end, k->k.p)) { - if (trace_extent_trim_atomic_enabled()) { - CLASS(printbuf, buf)(); - bch2_bpos_to_text(&buf, end); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); - trace_extent_trim_atomic(trans->c, buf.buf); - } - bch2_cut_back(end, k); - } - return 0; -} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h deleted file mode 100644 index 34467db53f4575..00000000000000 --- a/fs/bcachefs/extent_update.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENT_UPDATE_H -#define _BCACHEFS_EXTENT_UPDATE_H - -#include "bcachefs.h" - -int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, - struct bpos *); -int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, - struct bkey_i *); - -#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c deleted file mode 100644 index 83cbd77dcb9cce..00000000000000 --- a/fs/bcachefs/extents.c +++ /dev/null @@ -1,1735 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2010 Kent Overstreet - * - * Code for managing the extent btree and dynamically updating the writeback - * dirty sector count. - */ - -#include "bcachefs.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "buckets.h" -#include "checksum.h" -#include "compress.h" -#include "debug.h" -#include "disk_groups.h" -#include "error.h" -#include "extents.h" -#include "inode.h" -#include "journal.h" -#include "rebalance.h" -#include "replicas.h" -#include "super.h" -#include "super-io.h" -#include "trace.h" -#include "util.h" - -static const char * const bch2_extent_flags_strs[] = { -#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, - BCH_EXTENT_FLAGS() -#undef x - NULL, -}; - -static unsigned bch2_crc_field_size_max[] = { - [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -}; - -static void bch2_extent_crc_pack(union bch_extent_crc *, - struct bch_extent_crc_unpacked, - enum bch_extent_entry_type); - -void bch2_io_failures_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_io_failures *failed) -{ - static const char * const error_types[] = { - "btree validate", "io", "checksum", "ec reconstruct", NULL - }; - - for (struct bch_dev_io_failures *f = failed->devs; - f < failed->devs + failed->nr; - f++) { - unsigned errflags = - ((!!f->failed_btree_validate) << 0) | - ((!!f->failed_io) << 1) | - ((!!f->failed_csum_nr) << 2) | - ((!!f->failed_ec) << 3); - - bch2_printbuf_make_room(out, 1024); - out->atomic++; - scoped_guard(rcu) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); - if (ca) - prt_str(out, ca->name); - else - prt_printf(out, "(invalid device %u)", f->dev); - } - --out->atomic; - - prt_char(out, ' '); - - if (!errflags) { - prt_str(out, "no error - confused"); - } else if (is_power_of_2(errflags)) { - prt_bitflags(out, error_types, errflags); - prt_str(out, " error"); - } else { - prt_str(out, "errors: "); - prt_bitflags(out, error_types, errflags); - } - prt_newline(out); - } -} - -struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, - unsigned dev) -{ - 
struct bch_dev_io_failures *i;
-
-	for (i = f->devs; i < f->devs + f->nr; i++)
-		if (i->dev == dev)
-			return i;
-
-	return NULL;
-}
-
-void bch2_mark_io_failure(struct bch_io_failures *failed,
-			  struct extent_ptr_decoded *p,
-			  bool csum_error)
-{
-	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
-
-	if (!f) {
-		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
-		f = &failed->devs[failed->nr++];
-		memset(f, 0, sizeof(*f));
-		f->dev = p->ptr.dev;
-	}
-
-	if (p->do_ec_reconstruct)
-		f->failed_ec = true;
-	else if (!csum_error)
-		f->failed_io = true;
-	else
-		f->failed_csum_nr++;
-}
-
-void bch2_mark_btree_validate_failure(struct bch_io_failures *failed,
-				      unsigned dev)
-{
-	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev);
-
-	if (!f) {
-		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
-		f = &failed->devs[failed->nr++];
-		memset(f, 0, sizeof(*f));
-		f->dev = dev;
-	}
-
-	f->failed_btree_validate = true;
-}
-
-static inline u64 dev_latency(struct bch_dev *ca)
-{
-	return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
-}
-
-static inline int dev_failed(struct bch_dev *ca)
-{
-	return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
-}
-
-/*
- * returns true if p1 is better than p2:
- */
-static inline bool ptr_better(struct bch_fs *c,
-			      const struct extent_ptr_decoded p1,
-			      u64 p1_latency,
-			      struct bch_dev *ca1,
-			      const struct extent_ptr_decoded p2,
-			      u64 p2_latency)
-{
-	struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
-
-	int failed_delta = dev_failed(ca1) - dev_failed(ca2);
-	if (unlikely(failed_delta))
-		return failed_delta < 0;
-
-	if (static_branch_unlikely(&bch2_force_reconstruct_read))
-		return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
-
-	if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
-		return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
-
-	int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
-	if (unlikely(crc_retry_delta))
-		return crc_retry_delta < 0;
-
-	/* Pick at random, biased in favor of the faster device: */
-
-	return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
-}
-
-/*
- * This picks a non-stale pointer to read from. If @dev is nonnegative, only
- * pointers on that device are considered; otherwise the best pointer is
- * chosen, biased towards faster and non-failed devices.
- */
-int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
-			       struct bch_io_failures *failed,
-			       struct extent_ptr_decoded *pick,
-			       int dev)
-{
-	bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
-	bool have_dirty_ptrs = false, have_pick = false;
-
-	if (k.k->type == KEY_TYPE_error)
-		return bch_err_throw(c, key_type_error);
-
-	rcu_read_lock();
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	u64 pick_latency;
-
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		have_dirty_ptrs |= !p.ptr.cached;
-
-		/*
-		 * Unwritten extent: no need to actually read, treat it as a
-		 * hole and return 0s:
-		 */
-		if (p.ptr.unwritten) {
-			rcu_read_unlock();
-			return 0;
-		}
-
-		/* Are we being asked to read from a specific device? 
*/ - if (dev >= 0 && p.ptr.dev != dev) - continue; - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - - if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) { - rcu_read_unlock(); - int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); - if (ret) - return ret; - rcu_read_lock(); - } - - if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) - continue; - - struct bch_dev_io_failures *f = - unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; - if (unlikely(f)) { - p.crc_retry_nr = f->failed_csum_nr; - p.has_ec &= ~f->failed_ec; - - if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { - have_io_errors |= f->failed_io; - have_io_errors |= f->failed_btree_validate; - have_io_errors |= f->failed_ec; - } - have_csum_errors |= !!f->failed_csum_nr; - - if (p.has_ec && (f->failed_io || f->failed_csum_nr)) - p.do_ec_reconstruct = true; - else if (f->failed_io || - f->failed_btree_validate || - f->failed_csum_nr > c->opts.checksum_err_retry_nr) - continue; - } - - have_missing_devs |= ca && !bch2_dev_is_online(ca); - - if (!ca || !bch2_dev_is_online(ca)) { - if (!p.has_ec) - continue; - p.do_ec_reconstruct = true; - } - - if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec) - p.do_ec_reconstruct = true; - - u64 p_latency = dev_latency(ca); - /* - * Square the latencies, to bias more in favor of the faster - * device - we never want to stop issuing reads to the slower - * device altogether, so that we can update our latency numbers: - */ - p_latency *= p_latency; - - if (!have_pick || - ptr_better(c, - p, p_latency, ca, - *pick, pick_latency)) { - *pick = p; - pick_latency = p_latency; - have_pick = true; - } - } - rcu_read_unlock(); - - if (have_pick) - return 1; - if (!have_dirty_ptrs) - return 0; - if (have_missing_devs) - return bch_err_throw(c, no_device_to_read_from); - if (have_csum_errors) - return bch_err_throw(c, data_read_csum_err); - if (have_io_errors) - return bch_err_throw(c, data_read_io_err); - - /* - * If we get here, we have pointers (bkey_ptrs_validate() ensures that), - * but they don't point to valid devices: - */ - return bch_err_throw(c, no_devices_valid); -} - -/* KEY_TYPE_btree_ptr: */ - -int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, - c, btree_ptr_val_too_big, - "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} - -int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - int ret = 0; - - bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, - c, btree_ptr_v2_val_too_big, - "value too big (%zu > %zu)", - bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - - bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), - c, btree_ptr_v2_min_key_bad, - "min_key > key"); - - if ((from.flags & BCH_VALIDATE_write) && - c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) - bkey_fsck_err_on(!bp.v->sectors_written, - c, btree_ptr_v2_written_0, - "sectors_written == 0"); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - 
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - - prt_printf(out, "seq %llx written %u min_key %s", - le64_to_cpu(bp.v->seq), - le16_to_cpu(bp.v->sectors_written), - BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); - - bch2_bpos_to_text(out, bp.v->min_key); - prt_printf(out, " "); - bch2_bkey_ptrs_to_text(out, c, k); -} - -void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, - unsigned big_endian, int write, - struct bkey_s k) -{ - struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); - - compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); - - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id_is_extents(btree_id) && - !bkey_eq(bp.v->min_key, POS_MIN)) - bp.v->min_key = write - ? bpos_nosnap_predecessor(bp.v->min_key) - : bpos_nosnap_successor(bp.v->min_key); -} - -/* KEY_TYPE_extent: */ - -bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -{ - struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); - struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); - union bch_extent_entry *en_l; - const union bch_extent_entry *en_r; - struct extent_ptr_decoded lp, rp; - bool use_right_ptr; - - en_l = l_ptrs.start; - en_r = r_ptrs.start; - while (en_l < l_ptrs.end && en_r < r_ptrs.end) { - if (extent_entry_type(en_l) != extent_entry_type(en_r)) - return false; - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - if (en_l < l_ptrs.end || en_r < r_ptrs.end) - return false; - - en_l = l_ptrs.start; - en_r = r_ptrs.start; - lp.crc = bch2_extent_crc_unpack(l.k, NULL); - rp.crc = bch2_extent_crc_unpack(r.k, NULL); - - guard(rcu)(); - - while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && - __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { - if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != - rp.ptr.offset + rp.crc.offset || - lp.ptr.dev != rp.ptr.dev || - lp.ptr.gen != rp.ptr.gen || - lp.ptr.unwritten != rp.ptr.unwritten || - lp.has_ec != rp.has_ec) - return false; - - /* Extents may not straddle buckets: */ - struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); - bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); - - if (!same_bucket) - return false; - - if (lp.has_ec != rp.has_ec || - (lp.has_ec && - (lp.ec.block != rp.ec.block || - lp.ec.redundancy != rp.ec.redundancy || - lp.ec.idx != rp.ec.idx))) - return false; - - if (lp.crc.compression_type != rp.crc.compression_type || - lp.crc.nonce != rp.crc.nonce) - return false; - - if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= - lp.crc.uncompressed_size) { - /* can use left extent's crc entry */ - } else if (lp.crc.live_size <= rp.crc.offset) { - /* can use right extent's crc entry */ - } else { - /* check if checksums can be merged: */ - if (lp.crc.csum_type != rp.crc.csum_type || - lp.crc.nonce != rp.crc.nonce || - crc_is_compressed(lp.crc) || - !bch2_checksum_mergeable(lp.crc.csum_type)) - return false; - - if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || - rp.crc.offset) - return false; - - if (lp.crc.csum_type && - lp.crc.uncompressed_size + - rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) - return false; - } - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - en_l = l_ptrs.start; - en_r = r_ptrs.start; - while (en_l < l_ptrs.end && en_r < r_ptrs.end) { - if (extent_entry_is_crc(en_l)) { - struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - struct bch_extent_crc_unpacked crc_r = 
bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - if (crc_l.uncompressed_size + crc_r.uncompressed_size > - bch2_crc_field_size_max[extent_entry_type(en_l)]) - return false; - } - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - use_right_ptr = false; - en_l = l_ptrs.start; - en_r = r_ptrs.start; - while (en_l < l_ptrs.end) { - if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && - use_right_ptr) - en_l->ptr = en_r->ptr; - - if (extent_entry_is_crc(en_l)) { - struct bch_extent_crc_unpacked crc_l = - bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - struct bch_extent_crc_unpacked crc_r = - bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - use_right_ptr = false; - - if (crc_l.offset + crc_l.live_size + crc_r.live_size <= - crc_l.uncompressed_size) { - /* can use left extent's crc entry */ - } else if (crc_l.live_size <= crc_r.offset) { - /* can use right extent's crc entry */ - crc_r.offset -= crc_l.live_size; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, - extent_entry_type(en_l)); - use_right_ptr = true; - } else { - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); - - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); - } - } - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; -} - -/* KEY_TYPE_reservation: */ - -int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - int ret = 0; - - bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, - c, reservation_key_nr_replicas_invalid, - "invalid nr_replicas (%u)", r.v->nr_replicas); -fsck_err: - return ret; -} - -void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - prt_printf(out, "generation %u replicas %u", - le32_to_cpu(r.v->generation), - r.v->nr_replicas); -} - -bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); - - if (l.v->generation != r.v->generation || - l.v->nr_replicas != r.v->nr_replicas) - return false; - - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; -} - -/* Extent checksum entries: */ - -/* returns true if not equal */ -static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, - struct bch_extent_crc_unpacked r) -{ - return (l.csum_type != r.csum_type || - l.compression_type != r.compression_type || - l.compressed_size != r.compressed_size || - l.uncompressed_size != r.uncompressed_size || - l.offset != r.offset || - l.live_size != r.live_size || - l.nonce != r.nonce || - bch2_crc_cmp(l.csum, r.csum)); -} - -static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, - struct bch_extent_crc_unpacked n) -{ - return !crc_is_compressed(u) && - u.csum_type && - u.uncompressed_size > u.live_size && - bch2_csum_type_is_encryption(u.csum_type) == - bch2_csum_type_is_encryption(n.csum_type); -} - -bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, - struct bch_extent_crc_unpacked n) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union 
bch_extent_entry *i; - - if (!n.csum_type) - return false; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (can_narrow_crc(crc, n)) - return true; - - return false; -} - -/* - * We're writing another replica for this extent, so while we've got the data in - * memory we'll be computing a new checksum for the currently live data. - * - * If there are other replicas we aren't moving, and they are checksummed but - * not compressed, we can modify them to point to only the data that is - * currently live (so that readers won't have to bounce) while we've got the - * checksum we need: - */ -bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - struct bch_extent_crc_unpacked u; - struct extent_ptr_decoded p; - union bch_extent_entry *i; - bool ret = false; - - /* Find a checksum entry that covers only live data: */ - if (!n.csum_type) { - bkey_for_each_crc(&k->k, ptrs, u, i) - if (!crc_is_compressed(u) && - u.csum_type && - u.live_size == u.uncompressed_size) { - n = u; - goto found; - } - return false; - } -found: - BUG_ON(crc_is_compressed(n)); - BUG_ON(n.offset); - BUG_ON(n.live_size != k->k.size); - -restart_narrow_pointers: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - - bkey_for_each_ptr_decode(&k->k, ptrs, p, i) - if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); - p.ptr.offset += p.crc.offset; - p.crc = n; - bch2_extent_ptr_decoded_append(k, &p); - ret = true; - goto restart_narrow_pointers; - } - - return ret; -} - -static void bch2_extent_crc_pack(union bch_extent_crc *dst, - struct bch_extent_crc_unpacked src, - enum bch_extent_entry_type type) -{ -#define common_fields(_src) \ - .type = BIT(type), \ - .csum_type = _src.csum_type, \ - .compression_type = _src.compression_type, \ - ._compressed_size = _src.compressed_size - 1, \ - ._uncompressed_size = _src.uncompressed_size - 1, \ - .offset = _src.offset - - switch (type) { - case BCH_EXTENT_ENTRY_crc32: - dst->crc32 = (struct bch_extent_crc32) { - common_fields(src), - .csum = (u32 __force) *((__le32 *) &src.csum.lo), - }; - break; - case BCH_EXTENT_ENTRY_crc64: - dst->crc64 = (struct bch_extent_crc64) { - common_fields(src), - .nonce = src.nonce, - .csum_lo = (u64 __force) src.csum.lo, - .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), - }; - break; - case BCH_EXTENT_ENTRY_crc128: - dst->crc128 = (struct bch_extent_crc128) { - common_fields(src), - .nonce = src.nonce, - .csum = src.csum, - }; - break; - default: - BUG(); - } -#undef set_common_fields -} - -void bch2_extent_crc_append(struct bkey_i *k, - struct bch_extent_crc_unpacked new) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - union bch_extent_crc *crc = (void *) ptrs.end; - enum bch_extent_entry_type type; - - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc32; - else if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc64; - else if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc128; - else - BUG(); - - bch2_extent_crc_pack(crc, new, type); - - k->k.u64s += extent_entry_u64s(ptrs.end); - - EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); -} - -/* Generic code for keys with pointers: */ - -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -{ - 
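/*
 * bch2_extent_crc_append() above picks the smallest on-disk checksum
 * entry format that can hold the checksum width, extent size and nonce,
 * using the CRC32/64/128 limits from extents_format.h. A standalone
 * sketch of that cascade; the enum and TOY_* macros are local stand-ins
 * carrying the same limit values:
 */
#include <stdint.h>

#define TOY_CRC32_SIZE_MAX	(1U << 7)
#define TOY_CRC32_NONCE_MAX	0
#define TOY_CRC64_SIZE_MAX	(1U << 9)
#define TOY_CRC64_NONCE_MAX	((1U << 10) - 1)
#define TOY_CRC128_SIZE_MAX	(1U << 13)
#define TOY_CRC128_NONCE_MAX	((1U << 13) - 1)

enum toy_crc_entry { TOY_CRC32, TOY_CRC64, TOY_CRC128, TOY_CRC_INVALID };

static enum toy_crc_entry smallest_crc_entry(unsigned csum_bytes,
					     uint32_t sectors, uint32_t nonce)
{
	if (csum_bytes <= 4 && sectors <= TOY_CRC32_SIZE_MAX &&
	    nonce <= TOY_CRC32_NONCE_MAX)
		return TOY_CRC32;
	if (csum_bytes <= 10 && sectors <= TOY_CRC64_SIZE_MAX &&
	    nonce <= TOY_CRC64_NONCE_MAX)
		return TOY_CRC64;
	if (csum_bytes <= 16 && sectors <= TOY_CRC128_SIZE_MAX &&
	    nonce <= TOY_CRC128_NONCE_MAX)
		return TOY_CRC128;
	return TOY_CRC_INVALID;		/* the kernel BUG()s in this case */
}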
return bch2_bkey_devs(k).nr; -} - -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -{ - return k.k->type == KEY_TYPE_reservation - ? bkey_s_c_to_reservation(k).v->nr_replicas - : bch2_bkey_dirty_devs(k).nr; -} - -unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) -{ - unsigned ret = 0; - - if (k.k->type == KEY_TYPE_reservation) { - ret = bkey_s_c_to_reservation(k).v->nr_replicas; - } else { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - ret += !p.ptr.cached && !crc_is_compressed(p.crc); - } - - return ret; -} - -unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ret = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && crc_is_compressed(p.crc)) - ret += p.crc.compressed_size; - - return ret; -} - -bool bch2_bkey_is_incompressible(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - - bkey_for_each_crc(k.k, ptrs, crc, entry) - if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) - return true; - return false; -} - -unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p = { 0 }; - unsigned replicas = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.cached) - continue; - - if (p.has_ec) - replicas += p.ec.redundancy; - - replicas++; - - } - - return replicas; -} - -static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p) -{ - if (p->ptr.cached) - return 0; - - return p->has_ec - ? p->ec.redundancy + 1 - : ca->mi.durability; -} - -unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) -{ - struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - - return ca ? 
__extent_ptr_durability(ca, p) : 0; -} - -unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) -{ - struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - - if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) - return 0; - - return __extent_ptr_durability(ca, p); -} - -unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, &p); - return durability; -} - -static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) - durability += bch2_extent_ptr_durability(c, &p); - return durability; -} - -void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - union bch_extent_entry *next = extent_entry_next(entry); - - memmove_u64s(entry, next, (u64 *) end - (u64 *) next); - k->k.u64s -= extent_entry_u64s(entry); -} - -void bch2_extent_ptr_decoded_append(struct bkey_i *k, - struct extent_ptr_decoded *p) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - struct bch_extent_crc_unpacked crc = - bch2_extent_crc_unpack(&k->k, NULL); - union bch_extent_entry *pos; - - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = ptrs.start; - goto found; - } - - bkey_for_each_crc(&k->k, ptrs, crc, pos) - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = extent_entry_next(pos); - goto found; - } - - bch2_extent_crc_append(k, p->crc); - pos = bkey_val_end(bkey_i_to_s(k)); -found: - p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ptr)); - - if (p->has_ec) { - p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ec)); - } -} - -static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, - union bch_extent_entry *entry) -{ - union bch_extent_entry *i = ptrs.start; - - if (i == entry) - return NULL; - - while (extent_entry_next(i) != entry) - i = extent_entry_next(i); - return i; -} - -/* - * Returns pointer to the next entry after the one being dropped: - */ -void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry = to_entry(ptr), *next; - bool drop_crc = true; - - if (k.k->type == KEY_TYPE_stripe) { - ptr->dev = BCH_SB_MEMBER_INVALID; - return; - } - - EBUG_ON(ptr < &ptrs.start->ptr || - ptr >= &ptrs.end->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - - for (next = extent_entry_next(entry); - next != ptrs.end; - next = extent_entry_next(next)) { - if (extent_entry_is_crc(next)) { - break; - } else if (extent_entry_is_ptr(next)) { - drop_crc = false; - break; - } - } - - extent_entry_drop(k, entry); - - while ((entry = extent_entry_prev(ptrs, entry))) { - if (extent_entry_is_ptr(entry)) - break; - - if ((extent_entry_is_crc(entry) && drop_crc) || - extent_entry_is_stripe_ptr(entry)) - extent_entry_drop(k, entry); - } -} - -void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) -{ - if (k.k->type != KEY_TYPE_stripe) { - struct 
bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == ptr->dev && p.has_ec) { - ptr->dev = BCH_SB_MEMBER_INVALID; - return; - } - } - - bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; - - bch2_bkey_drop_ptr_noerror(k, ptr); - - /* - * If we deleted all the dirty pointers and there's still cached - * pointers, we could set the cached pointers to dirty if they're not - * stale - but to do that correctly we'd need to grab an open_bucket - * reference so that we don't race with bucket reuse: - */ - if (have_dirty && - !bch2_bkey_dirty_devs(k.s_c).nr) { - k.k->type = KEY_TYPE_error; - set_bkey_val_u64s(k.k, 0); - } else if (!bch2_bkey_nr_ptrs(k.s_c)) { - k.k->type = KEY_TYPE_deleted; - set_bkey_val_u64s(k.k, 0); - } -} - -void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) -{ - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); -} - -void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) -{ - bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev); -} - -const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) - return ptr; - - return NULL; -} - -bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_dev *ca; - - guard(rcu)(); - bkey_for_each_ptr(ptrs, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (ca = bch2_dev_rcu(c, ptr->dev)) && - (!ptr->cached || - !dev_ptr_stale_rcu(ca, ptr))) - return true; - - return false; -} - -bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_extent_ptr m, u64 offset) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == m.dev && - p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == - (s64) m.offset - offset) - return true; - - return false; -} - -/* - * Returns true if two extents refer to the same data: - */ -bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) -{ - if (k1.k->type != k2.k->type) - return false; - - if (bkey_extent_is_direct_data(k1.k)) { - struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); - struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); - const union bch_extent_entry *entry1, *entry2; - struct extent_ptr_decoded p1, p2; - - if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) - return false; - - bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) - bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) - if (p1.ptr.dev == p2.ptr.dev && - p1.ptr.gen == p2.ptr.gen && - - /* - * This checks that the two pointers point - * to the same region on disk - adjusting - * for the difference in where the extents - * start, since one may have been trimmed: - */ - (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && - - /* - * This additionally checks that the - * extents overlap on disk, since the - * previous check may trigger spuriously - * when one extent is immediately partially - * overwritten with another extent (so that - * on disk they are adjacent) and - * compression is in use: - */ - ((p1.ptr.offset >= p2.ptr.offset && - p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || - 
(p2.ptr.offset >= p1.ptr.offset && - p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) - return true; - - return false; - } else { - /* KEY_TYPE_deleted, etc. */ - return true; - } -} - -struct bch_extent_ptr * -bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) -{ - struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); - union bch_extent_entry *entry2; - struct extent_ptr_decoded p2; - - bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) - if (p1.ptr.dev == p2.ptr.dev && - p1.ptr.gen == p2.ptr.gen && - (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) - return &entry2->ptr; - - return NULL; -} - -static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, - struct bch_extent_ptr *ptr) -{ - unsigned target = opts->promote_target ?: opts->foreground_target; - - if (target && !bch2_dev_in_target(c, ptr->dev, target)) - return false; - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - - return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); -} - -void bch2_extent_ptr_set_cached(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s k, - struct bch_extent_ptr *ptr) -{ - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bool have_cached_ptr; - unsigned drop_dev = ptr->dev; - - guard(rcu)(); -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(k); - have_cached_ptr = false; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - /* - * Check if it's erasure coded - stripes can't contain cached - * data. Possibly something we can fix in the future? - */ - if (&entry->ptr == ptr && p.has_ec) - goto drop; - - if (p.ptr.cached) { - if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { - bch2_bkey_drop_ptr_noerror(k, &entry->ptr); - ptr = NULL; - goto restart_drop_ptrs; - } - - have_cached_ptr = true; - } - } - - if (!ptr) - bkey_for_each_ptr(ptrs, ptr2) - if (ptr2->dev == drop_dev) - ptr = ptr2; - - if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) - goto drop; - - ptr->cached = true; - return; -drop: - bch2_bkey_drop_ptr_noerror(k, ptr); -} - -/* - * bch2_extent_normalize - clean up an extent, dropping stale pointers etc. - * - * Returns true if @k should be dropped entirely - * - * For existing keys, only called when btree nodes are being rewritten, not when - * they're merely being compacted/resorted in memory. - */ -bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) -{ - struct bch_dev *ca; - - guard(rcu)(); - bch2_bkey_drop_ptrs(k, ptr, - ptr->cached && - (!(ca = bch2_dev_rcu(c, ptr->dev)) || - dev_ptr_stale_rcu(ca, ptr) > 0)); - - return bkey_deleted(k.k); -} - -/* - * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc. - * - * Like bch2_extent_normalize(), but also only keeps a single cached pointer on - * the promote target. 
- */ -bool bch2_extent_normalize_by_opts(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s k) -{ - struct bkey_ptrs ptrs; - bool have_cached_ptr; - - guard(rcu)(); -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(k); - have_cached_ptr = false; - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->cached) { - if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) { - bch2_bkey_drop_ptr(k, ptr); - goto restart_drop_ptrs; - } - have_cached_ptr = true; - } - - return bkey_deleted(k.k); -} - -void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) -{ - out->atomic++; - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - if (!ca) { - prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : ""); - } else { - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, "ptr: %u:%llu:%u gen %u", - ptr->dev, b, offset, ptr->gen); - if (ca->mi.durability != 1) - prt_printf(out, " d=%u", ca->mi.durability); - if (ptr->cached) - prt_str(out, " cached"); - if (ptr->unwritten) - prt_str(out, " unwritten"); - int stale = dev_ptr_stale_rcu(ca, ptr); - if (stale > 0) - prt_printf(out, " stale"); - else if (stale) - prt_printf(out, " invalid"); - } - --out->atomic; -} - -void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) -{ - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", - crc->compressed_size, - crc->uncompressed_size, - crc->offset, crc->nonce); - bch2_prt_csum_type(out, crc->csum_type); - prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo); - prt_str(out, " compress "); - bch2_prt_compression_type(out, crc->compression_type); -} - -static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, - const struct bch_extent_rebalance *r) -{ - prt_str(out, "rebalance:"); - - prt_printf(out, " replicas=%u", r->data_replicas); - if (r->data_replicas_from_inode) - prt_str(out, " (inode)"); - - prt_str(out, " checksum="); - bch2_prt_csum_opt(out, r->data_checksum); - if (r->data_checksum_from_inode) - prt_str(out, " (inode)"); - - if (r->background_compression || r->background_compression_from_inode) { - prt_str(out, " background_compression="); - bch2_compression_opt_to_text(out, r->background_compression); - - if (r->background_compression_from_inode) - prt_str(out, " (inode)"); - } - - if (r->background_target || r->background_target_from_inode) { - prt_str(out, " background_target="); - if (c) - bch2_target_to_text(out, c, r->background_target); - else - prt_printf(out, "%u", r->background_target); - - if (r->background_target_from_inode) - prt_str(out, " (inode)"); - } - - if (r->promote_target || r->promote_target_from_inode) { - prt_str(out, " promote_target="); - if (c) - bch2_target_to_text(out, c, r->promote_target); - else - prt_printf(out, "%u", r->promote_target); - - if (r->promote_target_from_inode) - prt_str(out, " (inode)"); - } - - if (r->erasure_code || r->erasure_code_from_inode) { - prt_printf(out, " ec=%u", r->erasure_code); - if (r->erasure_code_from_inode) - prt_str(out, " (inode)"); - } -} - -void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - bool first = true; - - if (c) - prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); - - bkey_extent_entry_for_each(ptrs, entry) { - if (!first) - 
prt_printf(out, " "); - - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); - break; - - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: { - struct bch_extent_crc_unpacked crc = - bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - - bch2_extent_crc_unpacked_to_text(out, &crc); - break; - } - case BCH_EXTENT_ENTRY_stripe_ptr: { - const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; - - prt_printf(out, "ec: idx %llu block %u", - (u64) ec->idx, ec->block); - break; - } - case BCH_EXTENT_ENTRY_rebalance: - bch2_extent_rebalance_to_text(out, c, &entry->rebalance); - break; - - case BCH_EXTENT_ENTRY_flags: - prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); - break; - - default: - prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); - return; - } - - first = false; - } -} - -static int extent_ptr_validate(struct bch_fs *c, - struct bkey_s_c k, - struct bkey_validate_context from, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) -{ - int ret = 0; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, - c, ptr_to_duplicate_device, - "multiple pointers to same device (%u)", ptr->dev); - - /* bad pointers are repaired by check_fix_ptrs(): */ - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - if (!ca) { - rcu_read_unlock(); - return 0; - } - u32 bucket_offset; - u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - unsigned first_bucket = ca->mi.first_bucket; - u64 nbuckets = ca->mi.nbuckets; - unsigned bucket_size = ca->mi.bucket_size; - rcu_read_unlock(); - - bkey_fsck_err_on(bucket >= nbuckets, - c, ptr_after_last_bucket, - "pointer past last bucket (%llu > %llu)", bucket, nbuckets); - bkey_fsck_err_on(bucket < first_bucket, - c, ptr_before_first_bucket, - "pointer before first bucket (%llu < %u)", bucket, first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, - c, ptr_spans_multiple_buckets, - "pointer spans multiple buckets (%u + %u > %u)", - bucket_offset, size_ondisk, bucket_size); -fsck_err: - return ret; -} - -int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - unsigned size_ondisk = k.k->size; - unsigned nonce = UINT_MAX; - unsigned nr_ptrs = 0; - bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; - int ret = 0; - - if (bkey_is_btree_ptr(k.k)) - size_ondisk = btree_sectors(c); - - bkey_extent_entry_for_each(ptrs, entry) { - bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, - c, extent_ptrs_invalid_entry, - "invalid extent entry type (got %u, max %u)", - __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); - - bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry), - c, btree_ptr_has_non_ptr, - "has non ptr field"); - - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); - if (ret) - return ret; - - bkey_fsck_err_on(entry->ptr.cached && have_ec, - c, ptr_cached_and_erasure_coded, - "cached, erasure coded ptr"); - - if (!entry->ptr.unwritten) - have_written = true; - else - have_unwritten = true; - - have_ec = false; 
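/*
 * extent_ptr_validate() above reduces a pointer's sector offset to a
 * (bucket, offset-within-bucket) pair and applies three checks: the
 * bucket lies below nbuckets, at or above first_bucket, and the data
 * does not spill past the end of its bucket. A userspace sketch with a
 * plain div/mod standing in for sector_to_bucket_and_offset(); the
 * struct is an illustrative stand-in for the member info:
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_member {
	uint64_t first_bucket, nbuckets;
	uint32_t bucket_size;			/* in sectors */
};

static bool toy_ptr_in_bounds(const struct toy_member *mi,
			      uint64_t sector, uint32_t size_ondisk)
{
	uint64_t bucket = sector / mi->bucket_size;
	uint32_t offset = sector % mi->bucket_size;

	return bucket <  mi->nbuckets &&	/* ptr_after_last_bucket */
	       bucket >= mi->first_bucket &&	/* ptr_before_first_bucket */
	       offset + size_ondisk <= mi->bucket_size; /* no bucket straddle */
}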
- crc_since_last_ptr = false; - nr_ptrs++; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - - bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), - c, ptr_crc_csum_type_unknown, - "invalid checksum type"); - bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, - c, ptr_crc_compression_type_unknown, - "invalid compression type"); - - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, - c, ptr_crc_uncompressed_size_too_small, - "checksum offset + key size > uncompressed size"); - bkey_fsck_err_on(crc_is_encoded(crc) && - (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), - c, ptr_crc_uncompressed_size_too_big, - "too large encoded extent"); - bkey_fsck_err_on(!crc_is_compressed(crc) && - crc.compressed_size != crc.uncompressed_size, - c, ptr_crc_uncompressed_size_mismatch, - "not compressed but compressed != uncompressed size"); - - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - bkey_fsck_err(c, ptr_crc_nonce_mismatch, - "incorrect nonce"); - } - - bkey_fsck_err_on(crc_since_last_ptr, - c, ptr_crc_redundant, - "redundant crc entry"); - crc_since_last_ptr = true; - - size_ondisk = crc.compressed_size; - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - bkey_fsck_err_on(have_ec, - c, ptr_stripe_redundant, - "redundant stripe entry"); - have_ec = true; - break; - case BCH_EXTENT_ENTRY_rebalance: { - /* - * this shouldn't be a fsck error, for forward - * compatibility; the rebalance code should just refetch - * the compression opt if it's unknown - */ -#if 0 - const struct bch_extent_rebalance *r = &entry->rebalance; - - if (!bch2_compression_opt_valid(r->compression)) { - struct bch_compression_opt opt = __bch2_compression_decode(r->compression); - prt_printf(err, "invalid compression opt %u:%u", - opt.type, opt.level); - return bch_err_throw(c, invalid_bkey); - } -#endif - break; - } - case BCH_EXTENT_ENTRY_flags: - bkey_fsck_err_on(entry != ptrs.start, - c, extent_flags_not_at_start, - "extent flags entry not at start"); - break; - } - } - - bkey_fsck_err_on(!nr_ptrs, - c, extent_ptrs_no_ptrs, - "no ptrs"); - bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, - c, extent_ptrs_too_many_ptrs, - "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); - bkey_fsck_err_on(have_written && have_unwritten, - c, extent_ptrs_written_and_unwritten, - "extent with unwritten and written ptrs"); - bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, - c, extent_ptrs_unwritten, - "has unwritten ptrs"); - bkey_fsck_err_on(crc_since_last_ptr, - c, extent_ptrs_redundant_crc, - "redundant crc entry"); - bkey_fsck_err_on(have_ec, - c, extent_ptrs_redundant_stripe, - "redundant stripe entry"); -fsck_err: - return ret; -} - -void bch2_ptr_swab(struct bkey_s k) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - u64 *d; - - for (d = (u64 *) ptrs.start; - d != (u64 *) ptrs.end; - d++) - *d = swab64(*d); - - for (entry = ptrs.start; - entry < ptrs.end; - entry = extent_entry_next(entry)) { - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo 
= swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - case BCH_EXTENT_ENTRY_rebalance: - break; - default: - /* Bad entry type: will be caught by validate() */ - return; - } - } -} - -int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) -{ - int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); - if (ret) - return ret; - - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - - if (ptrs.start != ptrs.end && - extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) { - ptrs.start->flags.flags = flags; - } else { - struct bch_extent_flags f = { - .type = BIT(BCH_EXTENT_ENTRY_flags), - .flags = flags, - }; - __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); - } - - return 0; -} - -/* Generic extent code: */ - -int bch2_cut_front_s(struct bpos where, struct bkey_s k) -{ - unsigned new_val_u64s = bkey_val_u64s(k.k); - int val_u64s_delta; - u64 sub; - - if (bkey_le(where, bkey_start_pos(k.k))) - return 0; - - EBUG_ON(bkey_gt(where, k.k->p)); - - sub = where.offset - bkey_start_offset(k.k); - - k.k->size -= sub; - - if (!k.k->size) { - k.k->type = KEY_TYPE_deleted; - new_val_u64s = 0; - } - - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - bool seen_crc = false; - - bkey_extent_entry_for_each(ptrs, entry) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - if (!seen_crc) - entry->ptr.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.offset += sub; - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - case BCH_EXTENT_ENTRY_rebalance: - case BCH_EXTENT_ENTRY_flags: - break; - } - - if (extent_entry_is_crc(entry)) - seen_crc = true; - } - - break; - } - case KEY_TYPE_reflink_p: { - struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - - SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); - break; - } - case KEY_TYPE_inline_data: - case KEY_TYPE_indirect_inline_data: { - void *p = bkey_inline_data_p(k); - unsigned bytes = bkey_inline_data_bytes(k.k); - - sub = min_t(u64, sub << 9, bytes); - - memmove(p, p + sub, bytes - sub); - - new_val_u64s -= sub >> 3; - break; - } - } - - val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; - BUG_ON(val_u64s_delta < 0); - - set_bkey_val_u64s(k.k, new_val_u64s); - memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); - return -val_u64s_delta; -} - -int bch2_cut_back_s(struct bpos where, struct bkey_s k) -{ - unsigned new_val_u64s = bkey_val_u64s(k.k); - int val_u64s_delta; - u64 len = 0; - - if (bkey_ge(where, k.k->p)) - return 0; - - EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); - - len = where.offset - bkey_start_offset(k.k); - - k.k->p.offset = where.offset; - k.k->size = len; - - if (!len) { - k.k->type = KEY_TYPE_deleted; - new_val_u64s = 0; - } - - switch (k.k->type) { - case KEY_TYPE_inline_data: - case KEY_TYPE_indirect_inline_data: - new_val_u64s = (bkey_inline_data_offset(k.k) + - min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; - break; - } - - val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; - BUG_ON(val_u64s_delta < 0); - - 
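/*
 * The trim helpers above implement the scheme extents_format.h
 * describes: cutting the front of a plain extent just advances
 * ptr->offset, while a checksummed/compressed extent keeps pointing at
 * the whole written region and instead advances the crc entry's offset
 * into it; cutting the back only shrinks the size. A toy model with a
 * single pointer, assuming where stays within the extent:
 */
#include <stdint.h>

struct toy_extent {
	uint64_t start, size;	/* logical position, in sectors */
	uint64_t disk_offset;	/* start of the written data on disk */
	uint32_t crc_offset;	/* offset of live data, if checksummed */
	int	 checksummed;
};

static void toy_cut_front(uint64_t where, struct toy_extent *e)
{
	uint64_t sub = where - e->start;

	e->start += sub;
	e->size	 -= sub;
	if (e->checksummed)
		e->crc_offset  += sub;	/* still read the whole region */
	else
		e->disk_offset += sub;	/* point straight at live data */
}

static void toy_cut_back(uint64_t where, struct toy_extent *e)
{
	e->size = where - e->start;	/* on-disk layout untouched */
}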
set_bkey_val_u64s(k.k, new_val_u64s); - memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); - return -val_u64s_delta; -} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h deleted file mode 100644 index b8590e51b76e62..00000000000000 --- a/fs/bcachefs/extents.h +++ /dev/null @@ -1,768 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENTS_H -#define _BCACHEFS_EXTENTS_H - -#include "bcachefs.h" -#include "bkey.h" -#include "extents_types.h" - -struct bch_fs; -struct btree_trans; - -/* extent entries: */ - -#define extent_entry_last(_e) \ - ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) - -#define entry_to_ptr(_entry) \ -({ \ - EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ - \ - __builtin_choose_expr( \ - type_is_exact(_entry, const union bch_extent_entry *), \ - (const struct bch_extent_ptr *) (_entry), \ - (struct bch_extent_ptr *) (_entry)); \ -}) - -/* downcast, preserves const */ -#define to_entry(_entry) \ -({ \ - BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ - !type_is(_entry, struct bch_extent_ptr *) && \ - !type_is(_entry, struct bch_extent_stripe_ptr *)); \ - \ - __builtin_choose_expr( \ - (type_is_exact(_entry, const union bch_extent_crc *) || \ - type_is_exact(_entry, const struct bch_extent_ptr *) ||\ - type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ - (const union bch_extent_entry *) (_entry), \ - (union bch_extent_entry *) (_entry)); \ -}) - -#define extent_entry_next(_entry) \ - ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) - -#define extent_entry_next_safe(_entry, _end) \ - (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \ - ? extent_entry_next(_entry) \ - : _end) - -static inline unsigned -__extent_entry_type(const union bch_extent_entry *e) -{ - return e->type ? 
__ffs(e->type) : BCH_EXTENT_ENTRY_MAX; -} - -static inline enum bch_extent_entry_type -extent_entry_type(const union bch_extent_entry *e) -{ - int ret = __ffs(e->type); - - EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); - - return ret; -} - -static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) -{ - switch (extent_entry_type(entry)) { -#define x(f, n) \ - case BCH_EXTENT_ENTRY_##f: \ - return sizeof(struct bch_extent_##f); - BCH_EXTENT_ENTRY_TYPES() -#undef x - default: - BUG(); - } -} - -static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) -{ - return extent_entry_bytes(entry) / sizeof(u64); -} - -static inline void __extent_entry_insert(struct bkey_i *k, - union bch_extent_entry *dst, - union bch_extent_entry *new) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - - memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - k->k.u64s += extent_entry_u64s(new); - memcpy_u64s_small(dst, new, extent_entry_u64s(new)); -} - -static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) -{ - union bch_extent_entry *next = extent_entry_next(entry); - - /* stripes have ptrs, but their layout doesn't work with this code */ - BUG_ON(k.k->type == KEY_TYPE_stripe); - - memmove_u64s_down(entry, next, - (u64 *) bkey_val_end(k) - (u64 *) next); - k.k->u64s -= (u64 *) next - (u64 *) entry; -} - -static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) -{ - return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; -} - -static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) -{ - return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; -} - -static inline bool extent_entry_is_crc(const union bch_extent_entry *e) -{ - switch (__extent_entry_type(e)) { - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - return true; - default: - return false; - } -} - -union bch_extent_crc { - u8 type; - struct bch_extent_crc32 crc32; - struct bch_extent_crc64 crc64; - struct bch_extent_crc128 crc128; -}; - -#define __entry_to_crc(_entry) \ - __builtin_choose_expr( \ - type_is_exact(_entry, const union bch_extent_entry *), \ - (const union bch_extent_crc *) (_entry), \ - (union bch_extent_crc *) (_entry)) - -#define entry_to_crc(_entry) \ -({ \ - EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ - \ - __entry_to_crc(_entry); \ -}) - -static inline struct bch_extent_crc_unpacked -bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) -{ -#define common_fields(_crc) \ - .csum_type = _crc.csum_type, \ - .compression_type = _crc.compression_type, \ - .compressed_size = _crc._compressed_size + 1, \ - .uncompressed_size = _crc._uncompressed_size + 1, \ - .offset = _crc.offset, \ - .live_size = k->size - - if (!crc) - return (struct bch_extent_crc_unpacked) { - .compressed_size = k->size, - .uncompressed_size = k->size, - .live_size = k->size, - }; - - switch (extent_entry_type(to_entry(crc))) { - case BCH_EXTENT_ENTRY_crc32: { - struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { - common_fields(crc->crc32), - }; - - *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; - return ret; - } - case BCH_EXTENT_ENTRY_crc64: { - struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { - common_fields(crc->crc64), - .nonce = crc->crc64.nonce, - .csum.lo = (__force __le64) crc->crc64.csum_lo, - }; - - *((__le16 *) &ret.csum.hi) = (__le16 __force) 
crc->crc64.csum_hi; - - return ret; - } - case BCH_EXTENT_ENTRY_crc128: { - struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { - common_fields(crc->crc128), - .nonce = crc->crc128.nonce, - .csum = crc->crc128.csum, - }; - - return ret; - } - default: - BUG(); - } -#undef common_fields -} - -static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) -{ - return (crc.compression_type != BCH_COMPRESSION_TYPE_none && - crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); -} - -static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) -{ - return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); -} - -void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *); - -/* bkey_ptrs: generically over any key type that has ptrs */ - -struct bkey_ptrs_c { - const union bch_extent_entry *start; - const union bch_extent_entry *end; -}; - -struct bkey_ptrs { - union bch_extent_entry *start; - union bch_extent_entry *end; -}; - -static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: { - struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); - - return (struct bkey_ptrs_c) { - to_entry(&e.v->start[0]), - to_entry(extent_entry_last(e)) - }; - } - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - - return (struct bkey_ptrs_c) { - e.v->start, - extent_entry_last(e) - }; - } - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - return (struct bkey_ptrs_c) { - to_entry(&s.v->ptrs[0]), - to_entry(&s.v->ptrs[s.v->nr_blocks]), - }; - } - case KEY_TYPE_reflink_v: { - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - return (struct bkey_ptrs_c) { - r.v->start, - bkey_val_end(r), - }; - } - case KEY_TYPE_btree_ptr_v2: { - struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); - - return (struct bkey_ptrs_c) { - to_entry(&e.v->start[0]), - to_entry(extent_entry_last(e)) - }; - } - default: - return (struct bkey_ptrs_c) { NULL, NULL }; - } -} - -static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); - - return (struct bkey_ptrs) { - (void *) p.start, - (void *) p.end - }; -} - -#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ - for ((_entry) = (_start); \ - (_entry) < (_end); \ - (_entry) = extent_entry_next_safe(_entry, _end)) - -#define __bkey_ptr_next(_ptr, _end) \ -({ \ - typeof(_end) _entry; \ - \ - __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ - if (extent_entry_is_ptr(_entry)) \ - break; \ - \ - _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ -}) - -#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ - __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) - -#define bkey_extent_entry_for_each(_p, _entry) \ - bkey_extent_entry_for_each_from(_p, _entry, _p.start) - -#define __bkey_for_each_ptr(_start, _end, _ptr) \ - for (typeof(_start) (_ptr) = (_start); \ - ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ - (_ptr)++) - -#define bkey_ptr_next(_p, _ptr) \ - __bkey_ptr_next(_ptr, (_p).end) - -#define bkey_for_each_ptr(_p, _ptr) \ - __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) - -#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ -({ \ - __label__ out; \ - \ - (_ptr).has_ec = false; \ - (_ptr).do_ec_reconstruct = false; \ - (_ptr).crc_retry_nr = 0; \ - \ - __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ - switch (__extent_entry_type(_entry)) { \ - case BCH_EXTENT_ENTRY_ptr: \ - (_ptr).ptr = _entry->ptr; \ - goto out; \ - case BCH_EXTENT_ENTRY_crc32: \ - case BCH_EXTENT_ENTRY_crc64: \ - case BCH_EXTENT_ENTRY_crc128: \ - (_ptr).crc = bch2_extent_crc_unpack(_k, \ - entry_to_crc(_entry)); \ - break; \ - case BCH_EXTENT_ENTRY_stripe_ptr: \ - (_ptr).ec = _entry->stripe_ptr; \ - (_ptr).has_ec = true; \ - break; \ - default: \ - /* nothing */ \ - break; \ - } \ -out: \ - _entry < (_end); \ -}) - -#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ - for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ - (_entry) = _start; \ - __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ - (_entry) = extent_entry_next_safe(_entry, _end)) - -#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ - __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ - _ptr, _entry) - -#define bkey_crc_next(_k, _end, _crc, _iter) \ -({ \ - __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ - if (extent_entry_is_crc(_iter)) { \ - (_crc) = bch2_extent_crc_unpack(_k, \ - entry_to_crc(_iter)); \ - break; \ - } \ - \ - (_iter) < (_end); \ -}) - -#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ - for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ - (_iter) = (_start); \ - bkey_crc_next(_k, _end, _crc, _iter); \ - (_iter) = extent_entry_next(_iter)) - -#define bkey_for_each_crc(_k, _p, _crc, _iter) \ - __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) - -/* Iterate over pointers in KEY_TYPE_extent: */ - -#define extent_ptr_next(_e, _ptr) \ - __bkey_ptr_next(_ptr, extent_entry_last(_e)) - -#define extent_for_each_ptr(_e, _ptr) \ - __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) - -#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ - __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ - extent_entry_last(_e), _ptr, _entry) - -/* utility code common to all keys with pointers: */ - -void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *, - struct bch_io_failures *); -struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, - unsigned); -void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *, bool); -void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned); -int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, - struct bch_io_failures *, - struct extent_ptr_decoded *, int); - -/* KEY_TYPE_btree_ptr: */ - -int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - -int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, - struct 
bkey_validate_context); -void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, - int, struct bkey_s); - -#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ - .key_validate = bch2_btree_ptr_validate, \ - .val_to_text = bch2_btree_ptr_to_text, \ - .swab = bch2_ptr_swab, \ - .trigger = bch2_trigger_extent, \ -}) - -#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ - .key_validate = bch2_btree_ptr_v2_validate, \ - .val_to_text = bch2_btree_ptr_v2_to_text, \ - .swab = bch2_ptr_swab, \ - .compat = bch2_btree_ptr_v2_compat, \ - .trigger = bch2_trigger_extent, \ - .min_val_size = 40, \ -}) - -/* KEY_TYPE_extent: */ - -bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - -#define bch2_bkey_ops_extent ((struct bkey_ops) { \ - .key_validate = bch2_bkey_ptrs_validate, \ - .val_to_text = bch2_bkey_ptrs_to_text, \ - .swab = bch2_ptr_swab, \ - .key_normalize = bch2_extent_normalize, \ - .key_merge = bch2_extent_merge, \ - .trigger = bch2_trigger_extent, \ -}) - -/* KEY_TYPE_reservation: */ - -int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - -#define bch2_bkey_ops_reservation ((struct bkey_ops) { \ - .key_validate = bch2_reservation_validate, \ - .val_to_text = bch2_reservation_to_text, \ - .key_merge = bch2_reservation_merge, \ - .trigger = bch2_trigger_reservation, \ - .min_val_size = 8, \ -}) - -/* Extent checksum entries: */ - -bool bch2_can_narrow_extent_crcs(struct bkey_s_c, - struct bch_extent_crc_unpacked); -bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); -void bch2_extent_crc_append(struct bkey_i *, - struct bch_extent_crc_unpacked); - -/* Generic code for keys with pointers: */ - -static inline bool bkey_is_btree_ptr(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_direct_data(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_inline_data(const struct bkey *k) -{ - return k->type == KEY_TYPE_inline_data || - k->type == KEY_TYPE_indirect_inline_data; -} - -static inline unsigned bkey_inline_data_offset(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_inline_data: - return sizeof(struct bch_inline_data); - case KEY_TYPE_indirect_inline_data: - return sizeof(struct bch_indirect_inline_data); - default: - BUG(); - } -} - -static inline unsigned bkey_inline_data_bytes(const struct bkey *k) -{ - return bkey_val_bytes(k) - bkey_inline_data_offset(k); -} - -#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) - -static inline bool bkey_extent_is_data(const struct bkey *k) -{ - return bkey_extent_is_direct_data(k) || - bkey_extent_is_inline_data(k) || - k->type == KEY_TYPE_reflink_p; -} - -/* - * Should extent be counted under inode->i_sectors? 
- */ -static inline bool bkey_extent_is_allocation(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reservation: - case KEY_TYPE_reflink_p: - case KEY_TYPE_reflink_v: - case KEY_TYPE_inline_data: - case KEY_TYPE_indirect_inline_data: - case KEY_TYPE_error: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->unwritten) - return true; - return false; -} - -static inline bool bkey_extent_is_reservation(struct bkey_s_c k) -{ - return k.k->type == KEY_TYPE_reservation || - bkey_extent_is_unwritten(k); -} - -static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(p, ptr) - ret.data[ret.nr++] = ptr->dev; - - return ret; -} - -static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(p, ptr) - if (!ptr->cached) - ret.data[ret.nr++] = ptr->dev; - - return ret; -} - -static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(p, ptr) - if (ptr->cached) - ret.data[ret.nr++] = ptr->dev; - - return ret; -} - -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); -unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); -bool bch2_bkey_is_incompressible(struct bkey_s_c); -unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); - -unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); -unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *); -unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); -unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); - -const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); - -static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) -{ - return (void *) bch2_bkey_has_device_c(k.s_c, dev); -} - -bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); - -void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); - -static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) -{ - struct bch_extent_ptr *dest; - - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); - - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); - - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k)); - *dest = ptr; - k->k.u64s++; - break; - default: - BUG(); - } -} - -void bch2_extent_ptr_decoded_append(struct bkey_i *, - struct extent_ptr_decoded *); -void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *); -void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); - -void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); -void bch2_bkey_drop_device(struct bkey_s, unsigned); - -#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \ -do { \ - __label__ _again; \ - struct bkey_ptrs _ptrs; \ -_again: \ - _ptrs = bch2_bkey_ptrs(_k); \ - \ - 
bkey_for_each_ptr(_ptrs, _ptr) \ - if (_cond) { \ - bch2_bkey_drop_ptr_noerror(_k, _ptr); \ - goto _again; \ - } \ -} while (0) - -#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ -do { \ - __label__ _again; \ - struct bkey_ptrs _ptrs; \ -_again: \ - _ptrs = bch2_bkey_ptrs(_k); \ - \ - bkey_for_each_ptr(_ptrs, _ptr) \ - if (_cond) { \ - bch2_bkey_drop_ptr(_k, _ptr); \ - goto _again; \ - } \ -} while (0) - -bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_extent_ptr, u64); -bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); -struct bch_extent_ptr * -bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); - -void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *, - struct bkey_s, struct bch_extent_ptr *); - -bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s); -bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); - -void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); -void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); - -static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, - struct bch_extent_ptr ptr2) -{ - return (ptr1.cached == ptr2.cached && - ptr1.unwritten == ptr2.unwritten && - ptr1.offset == ptr2.offset && - ptr1.dev == ptr2.dev && - ptr1.gen == ptr2.gen); -} - -void bch2_ptr_swab(struct bkey_s); - -/* Generic extent code: */ - -enum bch_extent_overlap { - BCH_EXTENT_OVERLAP_ALL = 0, - BCH_EXTENT_OVERLAP_BACK = 1, - BCH_EXTENT_OVERLAP_FRONT = 2, - BCH_EXTENT_OVERLAP_MIDDLE = 3, -}; - -/* Returns how k overlaps with m */ -static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, - const struct bkey *m) -{ - int cmp1 = bkey_lt(k->p, m->p); - int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m)); - - return (cmp1 << 1) + cmp2; -} - -int bch2_cut_front_s(struct bpos, struct bkey_s); -int bch2_cut_back_s(struct bpos, struct bkey_s); - -static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) -{ - bch2_cut_front_s(where, bkey_i_to_s(k)); -} - -static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) -{ - bch2_cut_back_s(where, bkey_i_to_s(k)); -} - -/** - * bch_key_resize - adjust size of @k - * - * bkey_start_offset(k) will be preserved, modifies where the extent ends - */ -static inline void bch2_key_resize(struct bkey *k, unsigned new_size) -{ - k->p.offset -= k->size; - k->p.offset += new_size; - k->size = new_size; -} - -static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs) -{ - if (ptrs.start != ptrs.end && - extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) - return ptrs.start->flags.flags; - return 0; -} - -static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) -{ - return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k)); -} - -int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64); - -#endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h deleted file mode 100644 index 74c0252cbd984d..00000000000000 --- a/fs/bcachefs/extents_format.h +++ /dev/null @@ -1,304 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENTS_FORMAT_H -#define _BCACHEFS_EXTENTS_FORMAT_H - -/* - * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally - * preceded by checksum/compression information (bch_extent_crc32 or - * 
bch_extent_crc64). - * - * One major determining factor in the format of extents is how we handle and - * represent extents that have been partially overwritten and thus trimmed: - * - * If an extent is not checksummed or compressed, when the extent is trimmed we - * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the data that is currently - * live. The size field in struct bkey records the current (live) size of the - * extent, and is also used to mean "size of region on disk that we point to" in - * this case. - * - * Thus an extent that is not checksummed or compressed will consist only of a - * list of bch_extent_ptrs, with none of the fields in - * bch_extent_crc32/bch_extent_crc64. - * - * When an extent is checksummed or compressed, it's not possible to read only - * the data that is currently live: we have to read the entire extent that was - * originally written, and then return only the part of the extent that is - * currently live. - * - * Thus, in addition to the current size of the extent in struct bkey, we need - * to store the size of the originally allocated space - this is the - * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, - * when the extent is trimmed, instead of modifying the offset field of the - * pointer, we keep a second smaller offset field - "offset into the original - * extent of the currently live region". - * - * The other major determining factor is replication and data migration: - * - * Each pointer may have its own bch_extent_crc32/64. When doing a replicated - * write, we will initially write all the replicas in the same format, with the - * same checksum type and compression format - however, when copygc runs later (or - * tiering/cache promotion, anything that moves data), it is not in general - * going to rewrite all the pointers at once - one of the replicas may be in a - * bucket on one device that has very little fragmentation while another lives - * in a bucket that has become heavily fragmented, and thus is being rewritten - * sooner than the rest. - * - * Thus it will only move a subset of the pointers (or in the case of - * tiering/cache promotion perhaps add a single pointer without dropping any - * current pointers), and if the extent has been partially overwritten it must - * write only the currently live portion (or copygc would not be able to reduce - * fragmentation!) - which necessitates a different bch_extent_crc format for - * the new pointer. - * - * But in the interests of space efficiency, we don't want to store one - * bch_extent_crc for each pointer if we don't have to. - * - * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and - * bch_extent_ptrs appended arbitrarily one after the other. We determine the - * type of a given entry with a scheme similar to utf8 (except we're encoding a - * type, not a size), encoding the type in the position of the first set bit: - * - * bch_extent_crc32 - 0b1 - * bch_extent_ptr - 0b10 - * bch_extent_crc64 - 0b100 - * - * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and - * bch_extent_crc64 is the least constrained). - * - * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, - * until the next bch_extent_crc32/64. - * - * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer - * is neither checksummed nor compressed. 
- */ - -#define BCH_EXTENT_ENTRY_TYPES() \ - x(ptr, 0) \ - x(crc32, 1) \ - x(crc64, 2) \ - x(crc128, 3) \ - x(stripe_ptr, 4) \ - x(rebalance, 5) \ - x(flags, 6) -#define BCH_EXTENT_ENTRY_MAX 7 - -enum bch_extent_entry_type { -#define x(f, n) BCH_EXTENT_ENTRY_##f = n, - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -/* Compressed/uncompressed size are stored biased by 1: */ -struct bch_extent_crc32 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:2, - _compressed_size:7, - _uncompressed_size:7, - offset:7, - _unused:1, - csum_type:4, - compression_type:4; - __u32 csum; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u32 csum; - __u32 compression_type:4, - csum_type:4, - _unused:1, - offset:7, - _uncompressed_size:7, - _compressed_size:7, - type:2; -#endif -} __packed __aligned(8); - -#define CRC32_SIZE_MAX (1U << 7) -#define CRC32_NONCE_MAX 0 - -struct bch_extent_crc64 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:3, - _compressed_size:9, - _uncompressed_size:9, - offset:9, - nonce:10, - csum_type:4, - compression_type:4, - csum_hi:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 csum_hi:16, - compression_type:4, - csum_type:4, - nonce:10, - offset:9, - _uncompressed_size:9, - _compressed_size:9, - type:3; -#endif - __u64 csum_lo; -} __packed __aligned(8); - -#define CRC64_SIZE_MAX (1U << 9) -#define CRC64_NONCE_MAX ((1U << 10) - 1) - -struct bch_extent_crc128 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:4, - _compressed_size:13, - _uncompressed_size:13, - offset:13, - nonce:13, - csum_type:4, - compression_type:4; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 compression_type:4, - csum_type:4, - nonce:13, - offset:13, - _uncompressed_size:13, - _compressed_size:13, - type:4; -#endif - struct bch_csum csum; -} __packed __aligned(8); - -#define CRC128_SIZE_MAX (1U << 13) -#define CRC128_NONCE_MAX ((1U << 13) - 1) - -/* - * @reservation - pointer hasn't been written to, just reserved - */ -struct bch_extent_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:1, - cached:1, - unused:1, - unwritten:1, - offset:44, /* 8 petabytes */ - dev:8, - gen:8; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 gen:8, - dev:8, - offset:44, - unwritten:1, - unused:1, - cached:1, - type:1; -#endif -} __packed __aligned(8); - -struct bch_extent_stripe_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:5, - block:8, - redundancy:4, - idx:47; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:47, - redundancy:4, - block:8, - type:5; -#endif -}; - -#define BCH_EXTENT_FLAGS() \ - x(poisoned, 0) - -enum bch_extent_flags_e { -#define x(n, v) BCH_EXTENT_FLAG_##n = v, - BCH_EXTENT_FLAGS() -#undef x -}; - -struct bch_extent_flags { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:7, - flags:57; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 flags:57, - type:7; -#endif -}; - -/* bch_extent_rebalance: */ -#include "rebalance_format.h" - -union bch_extent_entry { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 - unsigned long type; -#elif __BITS_PER_LONG == 32 - struct { - unsigned long pad; - unsigned long type; - }; -#else -#error edit for your odd byteorder. 
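The "biased by 1" note above the crc structs means a stored field value of 0 denotes one sector; a minimal sketch of the unpacking (hypothetical helper name, modeled on what the kernel's crc unpack code does):

#include <assert.h>

/* a 7-bit stored value 0..127 represents 1..128 sectors, which is why
 * CRC32_SIZE_MAX above is 1 << 7 */
static unsigned unpack_crc32_size(unsigned stored)
{
	return stored + 1;	/* undo the bias */
}

int main(void)
{
	assert(unpack_crc32_size(0) == 1);
	assert(unpack_crc32_size(127) == 128);	/* CRC32_SIZE_MAX */
	return 0;
}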
-#endif - -#define x(f, n) struct bch_extent_##f f; - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -struct bch_btree_ptr { - struct bch_val v; - - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -struct bch_btree_ptr_v2 { - struct bch_val v; - - __u64 mem_ptr; - __le64 seq; - __le16 sectors_written; - __le16 flags; - struct bpos min_key; - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); - -struct bch_extent { - struct bch_val v; - - __u64 _data[0]; - union bch_extent_entry start[]; -} __packed __aligned(8); - -/* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(__u64)) - -/* Maximum possible size of an entire extent value: */ -#define BKEY_EXTENT_VAL_U64s_MAX \ - (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) - -/* * Maximum possible size of an entire extent, key + value: */ -#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) - -/* Btree pointers don't carry around checksums: */ -#define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) -#define BKEY_BTREE_PTR_U64s_MAX \ - (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - -struct bch_reservation { - struct bch_val v; - - __le32 generation; - __u8 nr_replicas; - __u8 pad[3]; -} __packed __aligned(8); - -struct bch_inline_data { - struct bch_val v; - u8 data[]; -}; - -#endif /* _BCACHEFS_EXTENTS_FORMAT_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h deleted file mode 100644 index b23ce4a373c024..00000000000000 --- a/fs/bcachefs/extents_types.h +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENTS_TYPES_H -#define _BCACHEFS_EXTENTS_TYPES_H - -#include "bcachefs_format.h" - -struct bch_extent_crc_unpacked { - u32 compressed_size; - u32 uncompressed_size; - u32 live_size; - - u8 csum_type; - u8 compression_type; - - u16 offset; - - u16 nonce; - - struct bch_csum csum; -}; - -struct extent_ptr_decoded { - bool has_ec; - bool do_ec_reconstruct; - u8 crc_retry_nr; - struct bch_extent_crc_unpacked crc; - struct bch_extent_ptr ptr; - struct bch_extent_stripe_ptr ec; -}; - -struct bch_io_failures { - u8 nr; - struct bch_dev_io_failures { - u8 dev; - unsigned failed_csum_nr:6, - failed_io:1, - failed_btree_validate:1, - failed_ec:1; - } devs[BCH_REPLICAS_MAX + 1]; -}; - -#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c deleted file mode 100644 index 0e742555cb0af9..00000000000000 --- a/fs/bcachefs/eytzinger.c +++ /dev/null @@ -1,315 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "eytzinger.h" - -/** - * is_aligned - is this pointer & size okay for word-wide copying? - * @base: pointer to data - * @size: size of each element - * @align: required alignment (typically 4 or 8) - * - * Returns true if elements can be copied using word loads and stores. - * The size must be a multiple of the alignment, and the base address must - * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. - * - * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" - * to "if ((a | b) & mask)", so we do that by hand. 
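The "(a | b) & mask" rewrite mentioned at the end of the comment above is a strict equivalence, not an approximation; a quick standalone check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t mask = 7;	/* e.g. 8-byte alignment */

	for (uint32_t a = 0; a < 64; a++)
		for (uint32_t b = 0; b < 64; b++) {
			int two_tests = (a & mask) || (b & mask);
			int one_test  = ((a | b) & mask) != 0;

			assert(two_tests == one_test);
		}
	return 0;
}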
- */ -__attribute_const__ __always_inline -static bool is_aligned(const void *base, size_t size, unsigned char align) -{ - unsigned char lsbits = (unsigned char)size; - - (void)base; -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - lsbits |= (unsigned char)(uintptr_t)base; -#endif - return (lsbits & (align - 1)) == 0; -} - -/** - * swap_words_32 - swap two elements in 32-bit chunks - * @a: pointer to the first element to swap - * @b: pointer to the second element to swap - * @n: element size (must be a multiple of 4) - * - * Exchange the two objects in memory. This exploits base+index addressing, - * which basically all CPUs have, to minimize loop overhead computations. - * - * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the - * bottom of the loop, even though the zero flag is still valid from the - * subtract (since the intervening mov instructions don't alter the flags). - * Gcc 8.1.0 doesn't have that problem. - */ -static void swap_words_32(void *a, void *b, size_t n) -{ - do { - u32 t = *(u32 *)(a + (n -= 4)); - *(u32 *)(a + n) = *(u32 *)(b + n); - *(u32 *)(b + n) = t; - } while (n); -} - -/** - * swap_words_64 - swap two elements in 64-bit chunks - * @a: pointer to the first element to swap - * @b: pointer to the second element to swap - * @n: element size (must be a multiple of 8) - * - * Exchange the two objects in memory. This exploits base+index - * addressing, which basically all CPUs have, to minimize loop overhead - * computations. - * - * We'd like to use 64-bit loads if possible. If they're not, emulating - * one requires base+index+4 addressing which x86 has but most other - * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, - * but it's possible to have 64-bit loads without 64-bit pointers (e.g. - * x32 ABI). Are there any cases the kernel needs to worry about? - */ -static void swap_words_64(void *a, void *b, size_t n) -{ - do { -#ifdef CONFIG_64BIT - u64 t = *(u64 *)(a + (n -= 8)); - *(u64 *)(a + n) = *(u64 *)(b + n); - *(u64 *)(b + n) = t; -#else - /* Use two 32-bit transfers to avoid base+index+4 addressing */ - u32 t = *(u32 *)(a + (n -= 4)); - *(u32 *)(a + n) = *(u32 *)(b + n); - *(u32 *)(b + n) = t; - - t = *(u32 *)(a + (n -= 4)); - *(u32 *)(a + n) = *(u32 *)(b + n); - *(u32 *)(b + n) = t; -#endif - } while (n); -} - -/** - * swap_bytes - swap two elements a byte at a time - * @a: pointer to the first element to swap - * @b: pointer to the second element to swap - * @n: element size - * - * This is the fallback if alignment doesn't allow using larger chunks. - */ -static void swap_bytes(void *a, void *b, size_t n) -{ - do { - char t = ((char *)a)[--n]; - ((char *)a)[n] = ((char *)b)[n]; - ((char *)b)[n] = t; - } while (n); -} - -/* - * The values are arbitrary as long as they can't be confused with - * a pointer, but small integers make for the smallest compare - * instructions. - */ -#define SWAP_WORDS_64 (swap_r_func_t)0 -#define SWAP_WORDS_32 (swap_r_func_t)1 -#define SWAP_BYTES (swap_r_func_t)2 -#define SWAP_WRAPPER (swap_r_func_t)3 - -struct wrapper { - cmp_func_t cmp; - swap_func_t swap_func; -}; - -/* - * The function pointer is last to make tail calls most efficient if the - * compiler decides not to inline this function. 
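A userspace rendering of the descending-index word swap defined above; the char casts replace the kernel's void-pointer arithmetic (a GNU extension), and n must be a multiple of 8 as documented:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void demo_swap_words_64(void *a, void *b, size_t n)
{
	do {
		uint64_t t = *(uint64_t *)((char *)a + (n -= 8));

		*(uint64_t *)((char *)a + n) = *(uint64_t *)((char *)b + n);
		*(uint64_t *)((char *)b + n) = t;
	} while (n);
}

int main(void)
{
	uint64_t x[2] = { 1, 2 }, y[2] = { 3, 4 };

	demo_swap_words_64(x, y, sizeof(x));
	assert(x[0] == 3 && x[1] == 4 && y[0] == 1 && y[1] == 2);
	return 0;
}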
- */ -static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) -{ - if (swap_func == SWAP_WRAPPER) { - ((const struct wrapper *)priv)->swap_func(a, b, (int)size); - return; - } - - if (swap_func == SWAP_WORDS_64) - swap_words_64(a, b, size); - else if (swap_func == SWAP_WORDS_32) - swap_words_32(a, b, size); - else if (swap_func == SWAP_BYTES) - swap_bytes(a, b, size); - else - swap_func(a, b, (int)size, priv); -} - -#define _CMP_WRAPPER ((cmp_r_func_t)0L) - -static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) -{ - if (cmp == _CMP_WRAPPER) - return ((const struct wrapper *)priv)->cmp(a, b); - return cmp(a, b, priv); -} - -static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, - cmp_r_func_t cmp_func, const void *priv, - size_t l, size_t r) -{ - return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, - base1 + inorder_to_eytzinger1(r, n) * size, - cmp_func, priv); -} - -static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, - swap_r_func_t swap_func, const void *priv, - size_t l, size_t r) -{ - do_swap(base1 + inorder_to_eytzinger1(l, n) * size, - base1 + inorder_to_eytzinger1(r, n) * size, - size, swap_func, priv); -} - -static void eytzinger1_sort_r(void *base1, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) -{ - unsigned i, j, k; - - /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) - swap_func = NULL; - - if (!swap_func) { - if (is_aligned(base1, size, 8)) - swap_func = SWAP_WORDS_64; - else if (is_aligned(base1, size, 4)) - swap_func = SWAP_WORDS_32; - else - swap_func = SWAP_BYTES; - } - - /* heapify */ - for (i = n / 2; i >= 1; --i) { - /* Find the sift-down path all the way to the leaves. */ - for (j = i; k = j * 2, k < n;) - j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - - /* Special case for the last leaf with no sibling. */ - if (j * 2 == n) - j *= 2; - - /* Backtrack to the correct location. */ - while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) - j /= 2; - - /* Shift the element into its correct place. */ - for (k = j; j != i;) { - j /= 2; - eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); - } - } - - /* sort */ - for (i = n; i > 1; --i) { - eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); - - /* Find the sift-down path all the way to the leaves. */ - for (j = 1; k = j * 2, k + 1 < i;) - j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - - /* Special case for the last leaf with no sibling. */ - if (j * 2 + 1 == i) - j *= 2; - - /* Backtrack to the correct location. */ - while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) - j /= 2; - - /* Shift the element into its correct place. 
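The heapify loop above is the bottom-up heapsort variant: follow the larger-child path all the way to a leaf (one comparison per level instead of two), backtrack to where the displaced element belongs, then rotate it into place. A standalone sketch of the same structure on a plain one-based int heap:

#include <assert.h>
#include <stddef.h>

static void sift_down_bottom_up(int *h, size_t i, size_t n)
{
	size_t j = i, k;

	/* descend toward the larger child, all the way down */
	while ((k = 2 * j) < n)
		j = h[k] > h[k + 1] ? k : k + 1;
	if (2 * j == n)			/* last leaf with no sibling */
		j = 2 * j;

	/* backtrack to where h[i] actually belongs */
	while (j > i && h[i] >= h[j])
		j /= 2;

	/* rotate h[i] down to j, shifting its new ancestors up a level */
	for (k = j; j > i;) {
		j /= 2;
		int t = h[j]; h[j] = h[k]; h[k] = t;
	}
}

int main(void)
{
	int h[8] = { 0, 1, 6, 5, 4, 3, 2, 0 };	/* h[1..7], root displaced */

	sift_down_bottom_up(h, 1, 7);
	assert(h[1] == 6);	/* maximum restored at the root */
	return 0;
}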
*/ - for (k = j; j > 1;) { - j /= 2; - eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); - } - } -} - -void eytzinger0_sort_r(void *base, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) -{ - void *base1 = base - size; - - return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); -} - -void eytzinger0_sort(void *base, size_t n, size_t size, - cmp_func_t cmp_func, - swap_func_t swap_func) -{ - struct wrapper w = { - .cmp = cmp_func, - .swap_func = swap_func, - }; - - return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); -} - -#if 0 -#include -#include -#include - -static u64 cmp_count; - -static int mycmp(const void *a, const void *b) -{ - u32 _a = *(u32 *)a; - u32 _b = *(u32 *)b; - - cmp_count++; - if (_a < _b) - return -1; - else if (_a > _b) - return 1; - else - return 0; -} - -static int test(void) -{ - size_t N, i; - ktime_t start, end; - s64 delta; - u32 *arr; - - for (N = 10000; N <= 100000; N += 10000) { - arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL); - cmp_count = 0; - - for (i = 0; i < N; i++) - arr[i] = get_random_u32(); - - start = ktime_get(); - eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL); - end = ktime_get(); - - delta = ktime_us_delta(end, start); - printk(KERN_INFO "time: %lld\n", delta); - printk(KERN_INFO "comparisons: %lld\n", cmp_count); - - u32 prev = 0; - - eytzinger0_for_each(i, N) { - if (prev > arr[i]) - goto err; - prev = arr[i]; - } - - kfree(arr); - } - return 0; - -err: - kfree(arr); - return -1; -} -#endif diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h deleted file mode 100644 index 643c1f7160615d..00000000000000 --- a/fs/bcachefs/eytzinger.h +++ /dev/null @@ -1,300 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _EYTZINGER_H -#define _EYTZINGER_H - -#include -#include - -#ifdef EYTZINGER_DEBUG -#include -#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) -#else -#define EYTZINGER_BUG_ON(cond) -#endif - -/* - * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array. - * - * Consider using an eytzinger tree any time you would otherwise be doing binary - * search over an array. Binary search is a worst case scenario for branch - * prediction and prefetching, but in an eytzinger tree every node's children - * are adjacent in memory, thus we can prefetch children before knowing the - * result of the comparison, assuming multiple nodes fit on a cacheline. - * - * Two variants are provided, for one based indexing and zero based indexing. - * - * Zero based indexing is more convenient, but one based indexing has better - * alignment and thus better performance because each new level of the tree - * starts at a power of two, and thus if element 0 was cacheline aligned, each - * new level will be as well. - */ - -static inline unsigned eytzinger1_child(unsigned i, unsigned child) -{ - EYTZINGER_BUG_ON(child > 1); - - return (i << 1) + child; -} - -static inline unsigned eytzinger1_left_child(unsigned i) -{ - return eytzinger1_child(i, 0); -} - -static inline unsigned eytzinger1_right_child(unsigned i) -{ - return eytzinger1_child(i, 1); -} - -static inline unsigned eytzinger1_first(unsigned size) -{ - return size ? 
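Putting the layout comment above to work: a branchless lower-bound lookup over a one-based eytzinger array, the classic formulation that the eytzinger0_find_* helpers further down build on (simplified here; the kernel versions take cmp_func_t and use -1-style sentinels):

#include <assert.h>

/* returns the 1-based index of the first element >= key, 0 if none */
static unsigned long eytzinger1_lower_bound(const int *tree,
					    unsigned long n, int key)
{
	unsigned long i = 1;

	while (i <= n)
		i = 2 * i + (tree[i] < key);	/* children live at 2i, 2i+1 */

	/* the trailing right-turns all landed on elements < key;
	 * stripping them with the ffs trick recovers the answer */
	return i >> __builtin_ffsl(~i);
}

int main(void)
{
	const int tree[8] = { 0, 4, 2, 6, 1, 3, 5, 7 }; /* 1..7, [0] unused */

	assert(eytzinger1_lower_bound(tree, 7, 5) == 6);	/* tree[6] == 5 */
	assert(eytzinger1_lower_bound(tree, 7, 8) == 0);	/* nothing >= 8 */
	return 0;
}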
rounddown_pow_of_two(size) : 0; -} - -static inline unsigned eytzinger1_last(unsigned size) -{ - return rounddown_pow_of_two(size + 1) - 1; -} - -static inline unsigned eytzinger1_next(unsigned i, unsigned size) -{ - EYTZINGER_BUG_ON(i == 0 || i > size); - - if (eytzinger1_right_child(i) <= size) { - i = eytzinger1_right_child(i); - - i <<= __fls(size) - __fls(i); - i >>= i > size; - } else { - i >>= ffz(i) + 1; - } - - return i; -} - -static inline unsigned eytzinger1_prev(unsigned i, unsigned size) -{ - EYTZINGER_BUG_ON(i == 0 || i > size); - - if (eytzinger1_left_child(i) <= size) { - i = eytzinger1_left_child(i) + 1; - - i <<= __fls(size) - __fls(i); - i -= 1; - i >>= i > size; - } else { - i >>= __ffs(i) + 1; - } - - return i; -} - -static inline unsigned eytzinger1_extra(unsigned size) -{ - return size - ? (size + 1 - rounddown_pow_of_two(size)) << 1 - : 0; -} - -static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, - unsigned extra) -{ - unsigned b = __fls(i); - unsigned shift = __fls(size) - b; - int s; - - EYTZINGER_BUG_ON(!i || i > size); - - i ^= 1U << b; - i <<= 1; - i |= 1; - i <<= shift; - - /* - * sign bit trick: - * - * if (i > extra) - * i -= (i - extra) >> 1; - */ - s = extra - i; - i += (s >> 1) & (s >> 31); - - return i; -} - -static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, - unsigned extra) -{ - unsigned shift; - int s; - - EYTZINGER_BUG_ON(!i || i > size); - - /* - * sign bit trick: - * - * if (i > extra) - * i += i - extra; - */ - s = extra - i; - i -= s & (s >> 31); - - shift = __ffs(i); - - i >>= shift + 1; - i |= 1U << (__fls(size) - shift); - - return i; -} - -static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) -{ - return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); -} - -static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) -{ - return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); -} - -#define eytzinger1_for_each(_i, _size) \ - for (unsigned (_i) = eytzinger1_first((_size)); \ - (_i) != 0; \ - (_i) = eytzinger1_next((_i), (_size))) - -/* Zero based indexing version: */ - -static inline unsigned eytzinger0_child(unsigned i, unsigned child) -{ - EYTZINGER_BUG_ON(child > 1); - - return (i << 1) + 1 + child; -} - -static inline unsigned eytzinger0_left_child(unsigned i) -{ - return eytzinger0_child(i, 0); -} - -static inline unsigned eytzinger0_right_child(unsigned i) -{ - return eytzinger0_child(i, 1); -} - -static inline unsigned eytzinger0_first(unsigned size) -{ - return eytzinger1_first(size) - 1; -} - -static inline unsigned eytzinger0_last(unsigned size) -{ - return eytzinger1_last(size) - 1; -} - -static inline unsigned eytzinger0_next(unsigned i, unsigned size) -{ - return eytzinger1_next(i + 1, size) - 1; -} - -static inline unsigned eytzinger0_prev(unsigned i, unsigned size) -{ - return eytzinger1_prev(i + 1, size) - 1; -} - -static inline unsigned eytzinger0_extra(unsigned size) -{ - return eytzinger1_extra(size); -} - -static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, - unsigned extra) -{ - return __eytzinger1_to_inorder(i + 1, size, extra) - 1; -} - -static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, - unsigned extra) -{ - return __inorder_to_eytzinger1(i + 1, size, extra) - 1; -} - -static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) -{ - return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); -} - -static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned 
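The "sign bit trick" comments in the conversions above rely on (s >> 31) being all-ones exactly when s is negative (arithmetic right shift on signed ints, which the kernel assumes), so ANDing with it applies the s >> 1 adjustment only in that case. A self-contained check of the select:

#include <assert.h>

int main(void)
{
	for (int i = 1; i < 64; i++)
		for (int extra = 0; extra < 64; extra++) {
			int s = extra - i;
			int branchy    = i + (s < 0 ? s >> 1 : 0);
			int branchless = i + ((s >> 1) & (s >> 31));

			assert(branchy == branchless);
		}
	return 0;
}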
size) -{ - return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); -} - -#define eytzinger0_for_each(_i, _size) \ - for (unsigned (_i) = eytzinger0_first((_size)); \ - (_i) != -1; \ - (_i) = eytzinger0_next((_i), (_size))) - -#define eytzinger0_for_each_prev(_i, _size) \ - for (unsigned (_i) = eytzinger0_last((_size)); \ - (_i) != -1; \ - (_i) = eytzinger0_prev((_i), (_size))) - -/* return greatest node <= @search, or -1 if not found */ -static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) -{ - void *base1 = base - size; - unsigned n = 1; - - while (n <= nr) - n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); - n >>= __ffs(n) + 1; - return n - 1; -} - -/* return smallest node > @search, or -1 if not found */ -static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) -{ - void *base1 = base - size; - unsigned n = 1; - - while (n <= nr) - n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); - n >>= __ffs(n + 1) + 1; - return n - 1; -} - -/* return smallest node >= @search, or -1 if not found */ -static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) -{ - void *base1 = base - size; - unsigned n = 1; - - while (n <= nr) - n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); - n >>= __ffs(n + 1) + 1; - return n - 1; -} - -#define eytzinger0_find(base, nr, size, _cmp, search) \ -({ \ - size_t _size = (size); \ - void *_base1 = (void *)(base) - _size; \ - const void *_search = (search); \ - size_t _nr = (nr); \ - size_t _i = 1; \ - int _res; \ - \ - while (_i <= _nr && \ - (_res = _cmp(_search, _base1 + _i * _size))) \ - _i = eytzinger1_child(_i, _res > 0); \ - _i - 1; \ -}) - -void eytzinger0_sort_r(void *, size_t, size_t, - cmp_r_func_t, swap_r_func_t, const void *); -void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); - -#endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c deleted file mode 100644 index 2faec143eb31c0..00000000000000 --- a/fs/bcachefs/fast_list.c +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Fast, unordered lists - * - * Supports add, remove, and iterate - * - * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot - * allocation and freeing. - * - * This means that adding, removing, and iterating over items is lockless, - * except when refilling/emptying the percpu slot buffers. - */ - -#include "fast_list.h" - -struct fast_list_pcpu { - u32 nr; - u32 entries[31]; -}; - -static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp) -{ - int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp); - if (unlikely(idx < 0)) - return 0; - - if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) { - ida_free(&l->slots_allocated, idx); - return 0; - } - - return idx; -} - -/** - * fast_list_get_idx - get a slot in a fast_list - * @l: list to get slot in - * - * This allocates a slot in the radix tree without storing to it, so that we can - * take the potential memory allocation failure early and do the list add later - * when we can't take an allocation failure. 
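A hedged usage sketch for the fast_list API implemented in this file (hypothetical caller, error handling abbreviated); the index returned by fast_list_add() is the handle that must be passed back to fast_list_remove():

/* hypothetical caller: track an object for lockless iteration */
static int track_item(struct fast_list *l, void *item)
{
	int idx = fast_list_add(l, item);	/* > 0 on success */

	if (idx < 0)
		return idx;			/* -ENOMEM */

	/* ... item is now visible to fast_list_for_each() ... */

	fast_list_remove(l, idx);		/* recycles the slot */
	return 0;
}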
- * - * Returns: positive integer on success, -ENOMEM on failure - */ -int fast_list_get_idx(struct fast_list *l) -{ - unsigned long flags; - int idx; -retry: - local_irq_save(flags); - struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); - - if (unlikely(!lp->nr)) { - u32 entries[16], nr = 0; - - local_irq_restore(flags); - while (nr < ARRAY_SIZE(entries) && - (idx = fast_list_alloc_idx(l, GFP_KERNEL))) - entries[nr++] = idx; - local_irq_save(flags); - - lp = this_cpu_ptr(l->buffer); - - while (nr && lp->nr < ARRAY_SIZE(lp->entries)) - lp->entries[lp->nr++] = entries[--nr]; - - if (unlikely(nr)) { - local_irq_restore(flags); - while (nr) - ida_free(&l->slots_allocated, entries[--nr]); - goto retry; - } - - if (unlikely(!lp->nr)) { - local_irq_restore(flags); - return -ENOMEM; - } - } - - idx = lp->entries[--lp->nr]; - local_irq_restore(flags); - - return idx; -} - -/** - * fast_list_add - add an item to a fast_list - * @l: list - * @item: item to add - * - * Allocates a slot in the radix tree and stores to it and then returns the - * slot index, which must be passed to fast_list_remove(). - * - * Returns: positive integer on success, -ENOMEM on failure - */ -int fast_list_add(struct fast_list *l, void *item) -{ - int idx = fast_list_get_idx(l); - if (idx < 0) - return idx; - - *genradix_ptr_inlined(&l->items, idx) = item; - return idx; -} - -/** - * fast_list_remove - remove an item from a fast_list - * @l: list - * @idx: item's slot index - * - * Zeroes out the slot in the radix tree and frees the slot for future - * fast_list_add() operations. - */ -void fast_list_remove(struct fast_list *l, unsigned idx) -{ - u32 entries[16], nr = 0; - unsigned long flags; - - if (!idx) - return; - - *genradix_ptr_inlined(&l->items, idx) = NULL; - - local_irq_save(flags); - struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); - - if (unlikely(lp->nr == ARRAY_SIZE(lp->entries))) - while (nr < ARRAY_SIZE(entries)) - entries[nr++] = lp->entries[--lp->nr]; - - lp->entries[lp->nr++] = idx; - local_irq_restore(flags); - - if (unlikely(nr)) - while (nr) - ida_free(&l->slots_allocated, entries[--nr]); -} - -void fast_list_exit(struct fast_list *l) -{ - /* XXX: warn if list isn't empty */ - free_percpu(l->buffer); - ida_destroy(&l->slots_allocated); - genradix_free(&l->items); -} - -int fast_list_init(struct fast_list *l) -{ - genradix_init(&l->items); - ida_init(&l->slots_allocated); - l->buffer = alloc_percpu(*l->buffer); - if (!l->buffer) - return -ENOMEM; - return 0; -} diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h deleted file mode 100644 index 73c9bf591fd6ef..00000000000000 --- a/fs/bcachefs/fast_list.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef _LINUX_FAST_LIST_H -#define _LINUX_FAST_LIST_H - -#include -#include -#include - -struct fast_list_pcpu; - -struct fast_list { - GENRADIX(void *) items; - struct ida slots_allocated;; - struct fast_list_pcpu __percpu - *buffer; -}; - -static inline void *fast_list_iter_peek(struct genradix_iter *iter, - struct fast_list *list) -{ - void **p; - while ((p = genradix_iter_peek(iter, &list->items)) && !*p) - genradix_iter_advance(iter, &list->items); - - return p ? 
*p : NULL; -} - -#define fast_list_for_each_from(_list, _iter, _i, _start) \ - for (_iter = genradix_iter_init(&(_list)->items, _start); \ - (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \ - genradix_iter_advance(&(_iter), &(_list)->items)) - -#define fast_list_for_each(_list, _iter, _i) \ - fast_list_for_each_from(_list, _iter, _i, 0) - -int fast_list_get_idx(struct fast_list *l); -int fast_list_add(struct fast_list *l, void *item); -void fast_list_remove(struct fast_list *l, unsigned idx); -void fast_list_exit(struct fast_list *l); -int fast_list_init(struct fast_list *l); - -#endif /* _LINUX_FAST_LIST_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h deleted file mode 100644 index d8153fe27037ef..00000000000000 --- a/fs/bcachefs/fifo.h +++ /dev/null @@ -1,127 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FIFO_H -#define _BCACHEFS_FIFO_H - -#include "util.h" - -#define FIFO(type) \ -struct { \ - size_t front, back, size, mask; \ - type *data; \ -} - -#define DECLARE_FIFO(type, name) FIFO(type) name - -#define fifo_buf_size(fifo) \ - ((fifo)->size \ - ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ - : 0) - -#define init_fifo(fifo, _size, _gfp) \ -({ \ - (fifo)->front = (fifo)->back = 0; \ - (fifo)->size = (_size); \ - (fifo)->mask = (fifo)->size \ - ? roundup_pow_of_two((fifo)->size) - 1 \ - : 0; \ - (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \ -}) - -#define free_fifo(fifo) \ -do { \ - kvfree((fifo)->data); \ - (fifo)->data = NULL; \ -} while (0) - -#define fifo_swap(l, r) \ -do { \ - swap((l)->front, (r)->front); \ - swap((l)->back, (r)->back); \ - swap((l)->size, (r)->size); \ - swap((l)->mask, (r)->mask); \ - swap((l)->data, (r)->data); \ -} while (0) - -#define fifo_move(dest, src) \ -do { \ - typeof(*((dest)->data)) _t; \ - while (!fifo_full(dest) && \ - fifo_pop(src, _t)) \ - fifo_push(dest, _t); \ -} while (0) - -#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) -#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) - -#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) -#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) - -#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) -#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) - -#define fifo_entry_idx_abs(fifo, p) \ - ((((p) >= &fifo_peek_front(fifo) \ - ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ - (((p) - (fifo)->data))) - -#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) -#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask]) - -#define fifo_push_back_ref(f) \ - (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) - -#define fifo_push_front_ref(f) \ - (fifo_full((f)) ? 
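A usage sketch for the FIFO macros in this header (kernel-style, names hypothetical): init_fifo() rounds the backing allocation up to a power of two so front and back can wrap with a simple mask, and it returns the allocation itself, hence the NULL check:

static int demo_fifo(void)
{
	DECLARE_FIFO(u64, seqs);
	u64 seq;

	if (!init_fifo(&seqs, 8, GFP_KERNEL))
		return -ENOMEM;

	fifo_push(&seqs, 1);
	fifo_push(&seqs, 2);

	while (fifo_pop(&seqs, seq))
		;	/* pops 1, then 2: strict FIFO order */

	free_fifo(&seqs);
	return 0;
}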
NULL : &(f)->data[--(f)->front & (f)->mask]) - -#define fifo_push_back(fifo, new) \ -({ \ - typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ - if (_r) \ - *_r = (new); \ - _r != NULL; \ -}) - -#define fifo_push_front(fifo, new) \ -({ \ - typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ - if (_r) \ - *_r = (new); \ - _r != NULL; \ -}) - -#define fifo_pop_front(fifo, i) \ -({ \ - bool _r = !fifo_empty((fifo)); \ - if (_r) \ - (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ - _r; \ -}) - -#define fifo_pop_back(fifo, i) \ -({ \ - bool _r = !fifo_empty((fifo)); \ - if (_r) \ - (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ - _r; \ -}) - -#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) -#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) -#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) -#define fifo_peek(fifo) fifo_peek_front(fifo) - -#define fifo_for_each_entry(_entry, _fifo, _iter) \ - for (typecheck(typeof((_fifo)->front), _iter), \ - (_iter) = (_fifo)->front; \ - ((_iter != (_fifo)->back) && \ - (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - (_iter)++) - -#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ - for (typecheck(typeof((_fifo)->front), _iter), \ - (_iter) = (_fifo)->front; \ - ((_iter != (_fifo)->back) && \ - (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - (_iter)++) - -#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c deleted file mode 100644 index 1c54b9b5bd6953..00000000000000 --- a/fs/bcachefs/fs-io-buffered.c +++ /dev/null @@ -1,1109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "fs-io.h" -#include "fs-io-buffered.h" -#include "fs-io-direct.h" -#include "fs-io-pagecache.h" -#include "io_read.h" -#include "io_write.h" - -#include -#include -#include - -static inline bool bio_full(struct bio *bio, unsigned len) -{ - if (bio->bi_vcnt >= bio->bi_max_vecs) - return true; - if (bio->bi_iter.bi_size > UINT_MAX - len) - return true; - return false; -} - -/* readpage(s): */ - -static void bch2_readpages_end_io(struct bio *bio) -{ - struct folio_iter fi; - - bio_for_each_folio_all(fi, bio) - folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK); - - bio_put(bio); -} - -struct readpages_iter { - struct address_space *mapping; - unsigned idx; - folios folios; -}; - -static int readpages_iter_init(struct readpages_iter *iter, - struct readahead_control *ractl) -{ - struct folio *folio; - - *iter = (struct readpages_iter) { ractl->mapping }; - - while ((folio = __readahead_folio(ractl))) { - if (!bch2_folio_create(folio, GFP_KERNEL) || - darray_push(&iter->folios, folio)) { - bch2_folio_release(folio); - ractl->_nr_pages += folio_nr_pages(folio); - ractl->_index -= folio_nr_pages(folio); - return iter->folios.nr ? 
0 : -ENOMEM; - } - - folio_put(folio); - } - - return 0; -} - -static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) -{ - if (iter->idx >= iter->folios.nr) - return NULL; - return iter->folios.data[iter->idx]; -} - -static inline void readpage_iter_advance(struct readpages_iter *iter) -{ - iter->idx++; -} - -static bool extent_partial_reads_expensive(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (crc.csum_type || crc.compression_type) - return true; - return false; -} - -static int readpage_bio_extend(struct btree_trans *trans, - struct readpages_iter *iter, - struct bio *bio, - unsigned sectors_this_extent, - bool get_more) -{ - /* Don't hold btree locks while allocating memory: */ - bch2_trans_unlock(trans); - - while (bio_sectors(bio) < sectors_this_extent && - bio->bi_vcnt < bio->bi_max_vecs) { - struct folio *folio = readpage_iter_peek(iter); - int ret; - - if (folio) { - readpage_iter_advance(iter); - } else { - pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; - - if (!get_more) - break; - - unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); - - if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) - break; - - unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); - - /* ensure proper alignment */ - order = min(order, __ffs(folio_offset|BIT(31))); - - folio = xa_load(&iter->mapping->i_pages, folio_offset); - if (folio && !xa_is_value(folio)) - break; - - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); - if (!folio) - break; - - if (!__bch2_folio_create(folio, GFP_KERNEL)) { - folio_put(folio); - break; - } - - ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); - if (ret) { - __bch2_folio_release(folio); - folio_put(folio); - break; - } - - folio_put(folio); - } - - BUG_ON(folio_sector(folio) != bio_end_sector(bio)); - - BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); - } - - return bch2_trans_relock(trans); -} - -static void bchfs_read(struct btree_trans *trans, - struct bch_read_bio *rbio, - subvol_inum inum, - struct readpages_iter *readpages_iter) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; - int flags = BCH_READ_retry_if_stale| - BCH_READ_may_promote; - int ret = 0; - - rbio->subvol = inum.subvol; - - bch2_bkey_buf_init(&sk); - bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_slots); - while (1) { - struct bkey_s_c k; - unsigned bytes, sectors; - s64 offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - bch2_trans_begin(trans); - - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - bch2_btree_iter_set_pos(trans, &iter, - POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - goto err; - - k = bkey_i_to_s_c(sk.k); - - sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - - if 
(readpages_iter) { - ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, - extent_partial_reads_expensive(k)); - if (ret) - goto err; - } - - bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; - swap(rbio->bio.bi_iter.bi_size, bytes); - - if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_last_fragment; - - bch2_bio_page_state_set(&rbio->bio, k); - - bch2_read_extent(trans, rbio, iter.pos, - data_btree, k, offset_into_extent, flags); - /* - * Careful there's a landmine here if bch2_read_extent() ever - * starts returning transaction restarts here. - * - * We've changed rbio->bi_iter.bi_size to be "bytes we can read - * from this extent" with the swap call, and we restore it - * below. That restore needs to come before checking for - * errors. - * - * But unlike __bch2_read(), we use the rbio bvec iter, not one - * on the stack, so we can't do the restore right after the - * bch2_read_extent() call: we don't own that iterator anymore - * if BCH_READ_last_fragment is set, since we may have submitted - * that rbio instead of cloning it. - */ - - if (flags & BCH_READ_last_fragment) - break; - - swap(rbio->bio.bi_iter.bi_size, bytes); - bio_advance(&rbio->bio, bytes); -err: - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) { - struct printbuf buf = PRINTBUF; - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); - prt_printf(&buf, "read error %i from btree lookup", ret); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - } - - bch2_bkey_buf_exit(&sk, c); -} - -void bch2_readahead(struct readahead_control *ractl) -{ - struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct folio *folio; - struct readpages_iter readpages_iter; - struct blk_plug plug; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - int ret = readpages_iter_init(&readpages_iter, ractl); - if (ret) - return; - - /* - * Besides being a general performance optimization, plugging helps with - * avoiding btree transaction srcu warnings - submitting a bio can - * block, and we don't want todo that with the transaction locked. - * - * However, plugged bios are submitted when we schedule; we ideally - * would have our own scheduler hook to call unlock_long() before - * scheduling. 
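The "landmine" note in bchfs_read() above condenses to this timeline (illustrative only, field names taken from the code above):

	bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
	swap(rbio->bio.bi_iter.bi_size, bytes);	/* clamp the bio to this extent */
	bch2_read_extent(...);			/* may submit the bio as-is */
	swap(rbio->bio.bi_iter.bi_size, bytes);	/* restore: only safe when
						   BCH_READ_last_fragment is unset */
	bio_advance(&rbio->bio, bytes);		/* step past what was issued */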
- */ - blk_start_plug(&plug); - bch2_pagecache_add_get(inode); - - struct btree_trans *trans = bch2_trans_get(c); - while ((folio = readpage_iter_peek(&readpages_iter))) { - unsigned n = min_t(unsigned, - readpages_iter.folios.nr - - readpages_iter.idx, - BIO_MAX_VECS); - struct bch_read_bio *rbio = - rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, - GFP_KERNEL, &c->bio_read), - c, - opts, - bch2_readpages_end_io); - - readpage_iter_advance(&readpages_iter); - - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bchfs_read(trans, rbio, inode_inum(inode), - &readpages_iter); - bch2_trans_unlock(trans); - } - bch2_trans_put(trans); - - bch2_pagecache_add_put(inode); - blk_finish_plug(&plug); - darray_exit(&readpages_iter.folios); -} - -static void bch2_read_single_folio_end_io(struct bio *bio) -{ - complete(bio->bi_private); -} - -int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_read_bio *rbio; - struct bch_io_opts opts; - struct blk_plug plug; - int ret; - DECLARE_COMPLETION_ONSTACK(done); - - BUG_ON(folio_test_uptodate(folio)); - BUG_ON(folio_test_dirty(folio)); - - if (!bch2_folio_create(folio, GFP_KERNEL)) - return -ENOMEM; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - c, - opts, - bch2_read_single_folio_end_io); - rbio->bio.bi_private = &done; - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - blk_start_plug(&plug); - bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); - blk_finish_plug(&plug); - wait_for_completion(&done); - - ret = blk_status_to_errno(rbio->bio.bi_status); - bio_put(&rbio->bio); - - if (ret < 0) - return ret; - - folio_mark_uptodate(folio); - return 0; -} - -int bch2_read_folio(struct file *file, struct folio *folio) -{ - int ret; - - ret = bch2_read_single_folio(folio, folio->mapping); - folio_unlock(folio); - return bch2_err_class(ret); -} - -/* writepages: */ - -struct bch_writepage_io { - struct bch_inode_info *inode; - - /* must be last: */ - struct bch_write_op op; -}; - -struct bch_writepage_state { - struct bch_writepage_io *io; - struct bch_io_opts opts; - struct bch_folio_sector *tmp; - unsigned tmp_sectors; - struct blk_plug plug; -}; - -/* - * Determine when a writepage io is full. We have to limit writepage bios to a - * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to - * what the bounce path in bch2_write_extent() can handle. In theory we could - * loosen this restriction for non-bounce I/O, but we don't have that context - * here. Ideally, we can up this limit and make it configurable in the future - * when the bounce path can be enhanced to accommodate larger source bios. 
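For concreteness, the cap described above works out as follows (BIO_MAX_VECS is 256 in current kernels, stated here as an assumption):

/* one page per bvec: 256 vecs * 4096-byte pages == 1 MiB per writepage bio */
_Static_assert(256 * 4096 == 1 << 20, "writepage bio cap is 1MiB with 4k pages");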
- */ -static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len) -{ - struct bio *bio = &io->op.wbio.bio; - return bio_full(bio, len) || - (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE); -} - -static void bch2_writepage_io_done(struct bch_write_op *op) -{ - struct bch_writepage_io *io = - container_of(op, struct bch_writepage_io, op); - struct bch_fs *c = io->op.c; - struct bio *bio = &io->op.wbio.bio; - struct folio_iter fi; - unsigned i; - - if (io->op.error) { - set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - mapping_set_error(fi.folio->mapping, -EIO); - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - if (io->op.flags & BCH_WRITE_wrote_data_inline) { - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - /* - * racing with fallocate can cause us to add fewer sectors than - * expected - but we shouldn't add more sectors than expected: - */ - WARN_ON_ONCE(io->op.i_sectors_delta > 0); - - /* - * (error (due to going RO) halfway through a page can screw that up - * slightly) - * XXX wtf? - BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); - */ - - /* - * The writeback flag is effectively our ref on the inode - - * fixup i_blocks before calling folio_end_writeback: - */ - bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s = __bch2_folio(fi.folio); - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(fi.folio); - } - - bio_put(&io->op.wbio.bio); -} - -static void bch2_writepage_do_io(struct bch_writepage_state *w) -{ - struct bch_writepage_io *io = w->io; - - w->io = NULL; - closure_call(&io->op.cl, bch2_write, NULL, NULL); -} - -/* - * Get a bch_writepage_io and add @page to it - appending to an existing one if - * possible, else allocating a new one: - */ -static void bch2_writepage_io_alloc(struct bch_fs *c, - struct writeback_control *wbc, - struct bch_writepage_state *w, - struct bch_inode_info *inode, - u64 sector, - unsigned nr_replicas) -{ - struct bch_write_op *op; - - w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, - REQ_OP_WRITE, - GFP_KERNEL, - &c->writepage_bioset), - struct bch_writepage_io, op.wbio.bio); - - w->io->inode = inode; - op = &w->io->op; - bch2_write_op_init(op, c, w->opts); - op->target = w->opts.foreground_target; - op->nr_replicas = nr_replicas; - op->res.nr_replicas = nr_replicas; - op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_inum.subvol; - op->pos = POS(inode->v.i_ino, sector); - op->end_io = bch2_writepage_io_done; - op->devs_need_flush = &inode->ei_devs_need_flush; - op->wbio.bio.bi_iter.bi_sector = sector; - op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); -} - -static int __bch2_writepage(struct folio *folio, - struct writeback_control *wbc, - void *data) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_writepage_state *w = data; - struct bch_folio *s; - unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; - loff_t i_size = i_size_read(&inode->v); - int ret; - - EBUG_ON(!folio_test_uptodate(folio)); - - /* Is the folio fully inside i_size? 
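In sketch form, the three ranges that the checks just below distinguish (folio byte range vs. the inode's i_size):

 *   folio_end_pos(folio) <= i_size    write the whole folio (do_io)
 *   folio_pos(folio)     >= i_size    skip it: truncate is in progress
 *   otherwise (straddles i_size)      zero the tail past i_size, then write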
*/ - if (folio_end_pos(folio) <= i_size) - goto do_io; - - /* Is the folio fully outside i_size? (truncate in progress) */ - if (folio_pos(folio) >= i_size) { - folio_unlock(folio); - return 0; - } - - /* - * The folio straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the folio size. For a file that is not a multiple of - * the folio size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - folio_zero_segment(folio, - i_size - folio_pos(folio), - folio_size(folio)); -do_io: - f_sectors = folio_sectors(folio); - s = bch2_folio(folio); - - if (f_sectors > w->tmp_sectors) { - kfree(w->tmp); - w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL); - w->tmp_sectors = f_sectors; - } - - /* - * Things get really hairy with errors during writeback: - */ - ret = bch2_get_folio_disk_reservation(c, inode, folio, false); - BUG_ON(ret); - - /* Before unlocking the page, get copy of reservations: */ - spin_lock(&s->lock); - memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - nr_replicas_this_write = - min_t(unsigned, nr_replicas_this_write, - s->s[i].nr_replicas + - s->s[i].replicas_reserved); - } - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - s->s[i].nr_replicas = w->opts.compression - ? 0 : nr_replicas_this_write; - - s->s[i].replicas_reserved = 0; - bch2_folio_sector_set(folio, s, i, SECTOR_allocated); - } - spin_unlock(&s->lock); - - BUG_ON(atomic_read(&s->write_count)); - atomic_set(&s->write_count, 1); - - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - - folio_unlock(folio); - - offset = 0; - while (1) { - unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; - u64 sector; - - while (offset < f_sectors && - w->tmp[offset].state < SECTOR_dirty) - offset++; - - if (offset == f_sectors) - break; - - while (offset + sectors < f_sectors && - w->tmp[offset + sectors].state >= SECTOR_dirty) { - reserved_sectors += w->tmp[offset + sectors].replicas_reserved; - dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; - sectors++; - } - BUG_ON(!sectors); - - sector = folio_sector(folio) + offset; - - if (w->io && - (w->io->op.res.nr_replicas != nr_replicas_this_write || - bch_io_full(w->io, sectors << 9) || - bio_end_sector(&w->io->op.wbio.bio) != sector)) - bch2_writepage_do_io(w); - - if (!w->io) - bch2_writepage_io_alloc(c, wbc, w, inode, sector, - nr_replicas_this_write); - - atomic_inc(&s->write_count); - - BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, - sectors << 9, offset << 9)); - - w->io->op.res.sectors += reserved_sectors; - w->io->op.i_sectors_delta -= dirty_sectors; - w->io->op.new_i_size = i_size; - - offset += sectors; - } - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(folio); - - return 0; -} - -int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); - - bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); - - blk_start_plug(&w->plug); - int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w); - if (w->io) - bch2_writepage_do_io(w); - blk_finish_plug(&w->plug); - kfree(w->tmp); - 
kfree(w); - return bch2_err_class(ret); -} - -/* buffered writes: */ - -int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res; - struct folio *folio; - unsigned offset; - int ret = -ENOMEM; - - res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) - return -ENOMEM; - - bch2_folio_reservation_init(c, inode, res); - *fsdata = res; - - bch2_pagecache_add_get(inode); - - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_WRITEBEGIN | fgf_set_order(len), - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - goto err_unlock; - - offset = pos - folio_pos(folio); - len = min_t(size_t, len, folio_end_pos(folio) - pos); - - if (folio_test_uptodate(folio)) - goto out; - - /* If we're writing entire folio, don't need to read it in first: */ - if (!offset && len == folio_size(folio)) - goto out; - - if (!offset && pos + len >= inode->v.i_size) { - folio_zero_segment(folio, len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } - - if (folio_pos(folio) >= inode->v.i_size) { - folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } -readpage: - ret = bch2_read_single_folio(folio, mapping); - if (ret) - goto err; -out: - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto err; - - ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); - if (ret) { - if (!folio_test_uptodate(folio)) { - /* - * If the folio hasn't been read in, we won't know if we - * actually need a reservation - we don't actually need - * to read here, we just need to check if the folio is - * fully backed by uncompressed data: - */ - goto readpage; - } - - goto err; - } - - *foliop = folio; - return 0; -err: - folio_unlock(folio); - folio_put(folio); -err_unlock: - bch2_pagecache_add_put(inode); - kfree(res); - *fsdata = NULL; - return bch2_err_class(ret); -} - -int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res = fsdata; - unsigned offset = pos - folio_pos(folio); - - lockdep_assert_held(&inode->v.i_rwsem); - BUG_ON(offset + copied > folio_size(folio)); - - if (unlikely(copied < len && !folio_test_uptodate(folio))) { - /* - * The folio needs to be read in, but that would destroy - * our partial write - simplest thing is to just force - * userspace to redo the write: - */ - folio_zero_range(folio, 0, folio_size(folio)); - flush_dcache_folio(folio); - copied = 0; - } - - spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); - spin_unlock(&inode->v.i_lock); - - if (copied) { - if (!folio_test_uptodate(folio)) - folio_mark_uptodate(folio); - - bch2_set_folio_dirty(c, inode, folio, res, offset, copied); - - inode->ei_last_dirtied = (unsigned long) current; - } - - folio_unlock(folio); - folio_put(folio); - bch2_pagecache_add_put(inode); - - bch2_folio_reservation_put(c, inode, res); - kfree(res); - - return copied; -} - -static noinline void folios_trunc(folios *fs, struct folio **fi) -{ - while (fs->data + fs->nr > fi) { - struct folio *f = darray_pop(fs); - - folio_unlock(f); - 
folio_put(f); - } -} - -static int __bch2_buffered_write(struct bch_inode_info *inode, - struct address_space *mapping, - struct iov_iter *iter, - loff_t pos, unsigned len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - folios fs; - struct folio *f; - unsigned copied = 0, f_offset, f_copied; - u64 end = pos + len, f_pos, f_len; - loff_t last_folio_pos = inode->v.i_size; - int ret = 0; - - BUG_ON(!len); - - bch2_folio_reservation_init(c, inode, &res); - darray_init(&fs); - - ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, - FGP_WRITEBEGIN | fgf_set_order(len), - mapping_gfp_mask(mapping), &fs); - if (ret) - goto out; - - BUG_ON(!fs.nr); - - f = darray_first(fs); - if (pos != folio_pos(f) && !folio_test_uptodate(f)) { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - - f = darray_last(fs); - end = min(end, folio_end_pos(f)); - last_folio_pos = folio_pos(f); - if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { - if (end >= inode->v.i_size) { - folio_zero_range(f, 0, folio_size(f)); - } else { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - } - - ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr); - if (ret) - goto out; - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(fs)); - darray_for_each(fs, fi) { - ssize_t f_reserved; - - f = *fi; - f_len = min(end, folio_end_pos(f)) - f_pos; - f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len); - - if (unlikely(f_reserved != f_len)) { - if (f_reserved < 0) { - if (f == darray_first(fs)) { - ret = f_reserved; - goto out; - } - - folios_trunc(&fs, fi); - end = min(end, folio_end_pos(darray_last(fs))); - } else { - if (!folio_test_uptodate(f)) { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - - folios_trunc(&fs, fi + 1); - end = f_pos + f_reserved; - } - - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (mapping_writably_mapped(mapping)) - darray_for_each(fs, fi) - flush_dcache_folio(*fi); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(fs)); - darray_for_each(fs, fi) { - f = *fi; - f_len = min(end, folio_end_pos(f)) - f_pos; - f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); - if (!f_copied) { - folios_trunc(&fs, fi); - break; - } - - if (!folio_test_uptodate(f) && - f_copied != folio_size(f) && - pos + copied + f_copied < inode->v.i_size) { - iov_iter_revert(iter, f_copied); - folio_zero_range(f, 0, folio_size(f)); - folios_trunc(&fs, fi); - break; - } - - flush_dcache_folio(f); - copied += f_copied; - - if (f_copied != f_len) { - folios_trunc(&fs, fi + 1); - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (!copied) - goto out; - - end = pos + copied; - - spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) - i_size_write(&inode->v, end); - spin_unlock(&inode->v.i_lock); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(fs)); - darray_for_each(fs, fi) { - f = *fi; - f_len = min(end, folio_end_pos(f)) - f_pos; - - if (!folio_test_uptodate(f)) - folio_mark_uptodate(f); - - bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - inode->ei_last_dirtied = (unsigned long) current; -out: - darray_for_each(fs, fi) { - folio_unlock(*fi); - folio_put(*fi); - } - - /* - * If the last folio added to the mapping starts beyond current EOF, we - * performed a short write but left around at least one post-EOF folio. 
- * Clean up the mapping before we return. - */ - if (last_folio_pos >= inode->v.i_size) - truncate_pagecache(&inode->v, inode->v.i_size); - - darray_exit(&fs); - bch2_folio_reservation_put(c, inode, &res); - - return copied ?: ret; -} - -static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos = iocb->ki_pos; - ssize_t written = 0; - int ret = 0; - - bch2_pagecache_add_get(inode); - - do { - unsigned offset = pos & (PAGE_SIZE - 1); - unsigned bytes = iov_iter_count(iter); -again: - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - bytes = min_t(unsigned long, iov_iter_count(iter), - PAGE_SIZE - offset); - - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - ret = -EFAULT; - break; - } - } - - if (unlikely(fatal_signal_pending(current))) { - ret = -EINTR; - break; - } - - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); - if (unlikely(ret < 0)) - break; - - cond_resched(); - - if (unlikely(ret == 0)) { - /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. - */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(iter)); - goto again; - } - pos += ret; - written += ret; - ret = 0; - - balance_dirty_pages_ratelimited(mapping); - } while (iov_iter_count(iter)); - - bch2_pagecache_add_put(inode); - - return written ? 
written : ret; -} - -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; - - if (iocb->ki_flags & IOCB_DIRECT) { - ret = bch2_direct_write(iocb, from); - goto out; - } - - inode_lock(&inode->v); - - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs(file); - if (ret) - goto unlock; - - ret = file_update_time(file); - if (ret) - goto unlock; - - ret = bch2_buffered_write(iocb, from); - if (likely(ret > 0)) - iocb->ki_pos += ret; -unlock: - inode_unlock(&inode->v); - - if (ret > 0) - ret = generic_write_sync(iocb, ret); -out: - return bch2_err_class(ret); -} - -void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) -{ - bioset_exit(&c->writepage_bioset); -} - -int bch2_fs_fs_io_buffered_init(struct bch_fs *c) -{ - if (bioset_init(&c->writepage_bioset, - 4, offsetof(struct bch_writepage_io, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_writepage_bioset_init; - - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h deleted file mode 100644 index 14de91c2765607..00000000000000 --- a/fs/bcachefs/fs-io-buffered.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_BUFFERED_H -#define _BCACHEFS_FS_IO_BUFFERED_H - -#ifndef NO_BCACHEFS_FS - -int bch2_read_single_folio(struct folio *, struct address_space *); -int bch2_read_folio(struct file *, struct folio *); - -int bch2_writepages(struct address_space *, struct writeback_control *); -void bch2_readahead(struct readahead_control *); - -int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos, - unsigned len, struct folio **, void **); -int bch2_write_end(const struct kiocb *, struct address_space *, loff_t, - unsigned len, unsigned copied, struct folio *, void *); - -ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); - -void bch2_fs_fs_io_buffered_exit(struct bch_fs *); -int bch2_fs_fs_io_buffered_init(struct bch_fs *); -#else -static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} -static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } -#endif - -#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c deleted file mode 100644 index 1f5154d9676bda..00000000000000 --- a/fs/bcachefs/fs-io-direct.c +++ /dev/null @@ -1,704 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "enumerated_ref.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-io-direct.h" -#include "fs-io-pagecache.h" -#include "io_read.h" -#include "io_write.h" - -#include -#include -#include -#include - -/* O_DIRECT reads */ - -struct dio_read { - struct closure cl; - struct kiocb *req; - long ret; - bool should_dirty; - struct bch_read_bio rbio; -}; - -static void bio_check_or_release(struct bio *bio, bool check_dirty) -{ - if (check_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static CLOSURE_CALLBACK(bch2_dio_read_complete) -{ - closure_type(dio, struct dio_read, cl); - - dio->req->ki_complete(dio->req, dio->ret); - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); -} - -static void bch2_direct_IO_read_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - - if (bio->bi_status) - dio->ret = 
blk_status_to_errno(bio->bi_status); - - closure_put(&dio->cl); -} - -static void bch2_direct_IO_read_split_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; - - bch2_direct_IO_read_endio(bio); - bio_check_or_release(bio, should_dirty); -} - -static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct dio_read *dio; - struct bio *bio; - struct blk_plug plug; - loff_t offset = req->ki_pos; - bool sync = is_sync_kiocb(req); - bool split = false; - size_t shorten; - ssize_t ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - /* bios must be 512 byte aligned: */ - if ((offset|iter->count) & (SECTOR_SIZE - 1)) - return -EINVAL; - - ret = min_t(loff_t, iter->count, - max_t(loff_t, 0, i_size_read(&inode->v) - offset)); - - if (!ret) - return ret; - - shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); - if (shorten >= iter->count) - shorten = 0; - iter->count -= shorten; - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->dio_read_bioset); - - dio = container_of(bio, struct dio_read, rbio.bio); - closure_init(&dio->cl, NULL); - - /* - * this is a _really_ horrible hack just to avoid an atomic sub at the - * end: - */ - if (!sync) { - set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER - - CLOSURE_RUNNING + - CLOSURE_DESTRUCTOR); - } else { - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER + 1); - dio->cl.closure_get_happened = true; - } - - dio->req = req; - dio->ret = ret; - /* - * This is one of the sketchier things I've encountered: we have to skip - * the dirtying of requests that are internal from the kernel (i.e. from - * loopback), because we'll deadlock on page_lock. - */ - dio->should_dirty = iter_is_iovec(iter); - - blk_start_plug(&plug); - - goto start; - while (iter->count) { - split = true; - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->bio_read); -start: - bio->bi_opf = REQ_OP_READ|REQ_SYNC; - bio->bi_iter.bi_sector = offset >> 9; - bio->bi_private = dio; - - ret = bio_iov_iter_get_pages(bio, iter); - if (ret < 0) { - /* XXX: fault inject this path */ - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - break; - } - - offset += bio->bi_iter.bi_size; - - if (dio->should_dirty) - bio_set_pages_dirty(bio); - - if (iter->count) - closure_get(&dio->cl); - - struct bch_read_bio *rbio = - rbio_init(bio, - c, - opts, - split - ? 
bch2_direct_IO_read_split_endio - : bch2_direct_IO_read_endio); - - bch2_read(c, rbio, inode_inum(inode)); - } - - blk_finish_plug(&plug); - - iter->count += shorten; - - if (sync) { - closure_sync(&dio->cl); - closure_debug_destroy(&dio->cl); - ret = dio->ret; - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); - return ret; - } else { - return -EIOCBQUEUED; - } -} - -ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - size_t count = iov_iter_count(iter); - ssize_t ret = 0; - - if (!count) - return 0; /* skip atime */ - - if (iocb->ki_flags & IOCB_DIRECT) { - struct blk_plug plug; - - if (unlikely(mapping->nrpages)) { - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret < 0) - goto out; - } - - file_accessed(file); - - blk_start_plug(&plug); - ret = bch2_direct_IO_read(iocb, iter); - blk_finish_plug(&plug); - - if (ret >= 0) - iocb->ki_pos += ret; - } else { - bch2_pagecache_add_get(inode); - ret = filemap_read(iocb, iter, ret); - bch2_pagecache_add_put(inode); - } -out: - return bch2_err_class(ret); -} - -/* O_DIRECT writes */ - -struct dio_write { - struct kiocb *req; - struct address_space *mapping; - struct bch_inode_info *inode; - struct mm_struct *mm; - const struct iovec *iov; - unsigned loop:1, - extending:1, - sync:1, - flush:1; - struct quota_res quota_res; - u64 written; - - struct iov_iter iter; - struct iovec inline_vecs[2]; - - /* must be last: */ - struct bch_write_op op; -}; - -static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, - u64 offset, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - u64 end = offset + size; - u32 snapshot; - bool ret = true; - int err; -retry: - bch2_trans_begin(trans); - - err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (err) - goto err; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_slots, k, err) { - if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) - break; - - if (k.k->p.snapshot != snapshot || - nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(err, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_put(trans); - - return err ? 
false : ret; -} - -static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - return bch2_check_range_allocated(c, inode_inum(inode), - dio->op.pos.offset, bio_sectors(bio), - dio->op.opts.data_replicas, - dio->op.opts.compression != 0); -} - -static void bch2_dio_write_loop_async(struct bch_write_op *); -static __always_inline long bch2_dio_write_done(struct dio_write *dio); - -/* - * We're going to return -EIOCBQUEUED, but we haven't finished consuming the - * iov_iter yet, so we need to stash a copy of the iovec: it might be on the - * caller's stack, we're not guaranteed that it will live for the duration of - * the IO: - */ -static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) -{ - struct iovec *iov = dio->inline_vecs; - - /* - * iov_iter has a single embedded iovec - nothing to do: - */ - if (iter_is_ubuf(&dio->iter)) - return 0; - - /* - * We don't currently handle non-iovec iov_iters here - return an error, - * and we'll fall back to doing the IO synchronously: - */ - if (!iter_is_iovec(&dio->iter)) - return -1; - - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), - GFP_KERNEL); - if (unlikely(!iov)) - return -ENOMEM; - } - - memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); - dio->iter.__iov = iov; - return 0; -} - -static CLOSURE_CALLBACK(bch2_dio_write_flush_done) -{ - closure_type(dio, struct dio_write, op.cl); - struct bch_fs *c = dio->op.c; - - closure_debug_destroy(cl); - - dio->op.error = bch2_journal_error(&c->journal); - - bch2_dio_write_done(dio); -} - -static noinline void bch2_dio_write_flush(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct bch_inode_unpacked inode; - int ret; - - dio->flush = 0; - - closure_init(&dio->op.cl, NULL); - - if (!dio->op.error) { - ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); - if (ret) { - dio->op.error = ret; - } else { - bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, - &dio->op.cl); - bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); - } - } - - if (dio->sync) { - closure_sync(&dio->op.cl); - closure_debug_destroy(&dio->op.cl); - } else { - continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); - } -} - -static __always_inline long bch2_dio_write_done(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - bool sync = dio->sync; - long ret; - - if (unlikely(dio->flush)) { - bch2_dio_write_flush(dio); - if (!sync) - return -EIOCBQUEUED; - } - - bch2_pagecache_block_put(inode); - - kfree(dio->iov); - - ret = dio->op.error ?: ((long) dio->written << 9); - bio_put(&dio->op.wbio.bio); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); - - /* inode->i_dio_count is our ref on inode and thus bch_fs */ - inode_dio_end(&inode->v); - - if (ret < 0) - ret = bch2_err_class(ret); - - if (!sync) { - req->ki_complete(req, ret); - ret = -EIOCBQUEUED; - } - return ret; -} - -static __always_inline void bch2_dio_write_end(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - req->ki_pos += (u64) dio->op.written << 9; - dio->written += dio->op.written; - - if (dio->extending) { - spin_lock(&inode->v.i_lock); - if (req->ki_pos > 
inode->v.i_size) - i_size_write(&inode->v, req->ki_pos); - spin_unlock(&inode->v.i_lock); - } - - if (dio->op.i_sectors_delta || dio->quota_res.sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); - __bch2_quota_reservation_put(c, inode, &dio->quota_res); - mutex_unlock(&inode->ei_quota_lock); - } - - bio_release_pages(bio, false); - - if (unlikely(dio->op.error)) - set_bit(EI_INODE_ERROR, &inode->ei_flags); -} - -static __always_inline long bch2_dio_write_loop(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct address_space *mapping = dio->mapping; - struct bch_inode_info *inode = dio->inode; - struct bch_io_opts opts; - struct bio *bio = &dio->op.wbio.bio; - unsigned unaligned, iter_count; - bool sync = dio->sync, dropped_locks; - long ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - while (1) { - iter_count = dio->iter.count; - - EBUG_ON(current->faults_disabled_mapping); - current->faults_disabled_mapping = mapping; - - ret = bio_iov_iter_get_pages(bio, &dio->iter); - - dropped_locks = fdm_dropped_locks(); - - current->faults_disabled_mapping = NULL; - - /* - * If the fault handler returned an error but also signalled - * that it dropped & retook ei_pagecache_lock, we just need to - * re-shoot down the page cache and retry: - */ - if (dropped_locks && ret) - ret = 0; - - if (unlikely(ret < 0)) - goto err; - - if (unlikely(dropped_locks)) { - ret = bch2_write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter_count - 1); - if (unlikely(ret)) - goto err; - - if (!bio->bi_iter.bi_size) - continue; - } - - unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); - bio->bi_iter.bi_size -= unaligned; - iov_iter_revert(&dio->iter, unaligned); - - if (!bio->bi_iter.bi_size) { - /* - * bio_iov_iter_get_pages was only able to get < - * blocksize worth of pages: - */ - ret = -EFAULT; - goto err; - } - - bch2_write_op_init(&dio->op, c, opts); - dio->op.end_io = sync - ? 
NULL - : bch2_dio_write_loop_async; - dio->op.target = dio->op.opts.foreground_target; - dio->op.write_point = writepoint_hashed((unsigned long) current); - dio->op.nr_replicas = dio->op.opts.data_replicas; - dio->op.subvol = inode->ei_inum.subvol; - dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); - dio->op.devs_need_flush = &inode->ei_devs_need_flush; - - if (sync) - dio->op.flags |= BCH_WRITE_sync; - dio->op.flags |= BCH_WRITE_check_enospc; - - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, - bio_sectors(bio), true); - if (unlikely(ret)) - goto err; - - ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), - dio->op.opts.data_replicas, 0); - if (unlikely(ret) && - !bch2_dio_write_check_allocated(dio)) - goto err; - - task_io_account_write(bio->bi_iter.bi_size); - - if (unlikely(dio->iter.count) && - !dio->sync && - !dio->loop && - bch2_dio_write_copy_iov(dio)) - dio->sync = sync = true; - - dio->loop = true; - closure_call(&dio->op.cl, bch2_write, NULL, NULL); - - if (!sync) - return -EIOCBQUEUED; - - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) - break; - - bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); - } -out: - return bch2_dio_write_done(dio); -err: - dio->op.error = ret; - - bio_release_pages(bio, false); - - bch2_quota_reservation_put(c, inode, &dio->quota_res); - goto out; -} - -static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) -{ - struct mm_struct *mm = dio->mm; - - bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); - - if (mm) - kthread_use_mm(mm); - bch2_dio_write_loop(dio); - if (mm) - kthread_unuse_mm(mm); -} - -static void bch2_dio_write_loop_async(struct bch_write_op *op) -{ - struct dio_write *dio = container_of(op, struct dio_write, op); - - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) - bch2_dio_write_done(dio); - else - bch2_dio_write_continue(dio); -} - -ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct dio_write *dio; - struct bio *bio; - bool locked = true, extending; - ssize_t ret; - - prefetch(&c->opts); - prefetch((void *) &c->opts + 64); - prefetch(&inode->ei_inode); - prefetch((void *) &inode->ei_inode + 64); - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write)) - return -EROFS; - - inode_lock(&inode->v); - - ret = generic_write_checks(req, iter); - if (unlikely(ret <= 0)) - goto err_put_write_ref; - - ret = file_remove_privs(file); - if (unlikely(ret)) - goto err_put_write_ref; - - ret = file_update_time(file); - if (unlikely(ret)) - goto err_put_write_ref; - - if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) { - ret = -EINVAL; - goto err_put_write_ref; - } - - inode_dio_begin(&inode->v); - bch2_pagecache_block_get(inode); - - extending = req->ki_pos + iter->count > inode->v.i_size; - if (!extending) { - inode_unlock(&inode->v); - locked = false; - } - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_WRITE | REQ_SYNC | REQ_IDLE, - GFP_KERNEL, - &c->dio_write_bioset); - dio = container_of(bio, struct dio_write, op.wbio.bio); - dio->req = req; - dio->mapping = mapping; - dio->inode = inode; - dio->mm = current->mm; - dio->iov = NULL; - dio->loop = false; - dio->extending = extending; - dio->sync = is_sync_kiocb(req) || extending; - dio->flush = 
iocb_is_dsync(req) && !c->opts.journal_flush_disabled; - dio->quota_res.sectors = 0; - dio->written = 0; - dio->iter = *iter; - dio->op.c = c; - - if (unlikely(mapping->nrpages)) { - ret = bch2_write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter->count - 1); - if (unlikely(ret)) - goto err_put_bio; - } - - ret = bch2_dio_write_loop(dio); -out: - if (locked) - inode_unlock(&inode->v); - return ret; -err_put_bio: - bch2_pagecache_block_put(inode); - bio_put(bio); - inode_dio_end(&inode->v); -err_put_write_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); - goto out; -} - -void bch2_fs_fs_io_direct_exit(struct bch_fs *c) -{ - bioset_exit(&c->dio_write_bioset); - bioset_exit(&c->dio_read_bioset); -} - -int bch2_fs_fs_io_direct_init(struct bch_fs *c) -{ - if (bioset_init(&c->dio_read_bioset, - 4, offsetof(struct dio_read, rbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_read_bioset_init; - - if (bioset_init(&c->dio_write_bioset, - 4, offsetof(struct dio_write, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_write_bioset_init; - - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h deleted file mode 100644 index 814621ec7f81dc..00000000000000 --- a/fs/bcachefs/fs-io-direct.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_DIRECT_H -#define _BCACHEFS_FS_IO_DIRECT_H - -#ifndef NO_BCACHEFS_FS -ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *); -ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); - -void bch2_fs_fs_io_direct_exit(struct bch_fs *); -int bch2_fs_fs_io_direct_init(struct bch_fs *); -#else -static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {} -static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; } -#endif - -#endif /* _BCACHEFS_FS_IO_DIRECT_H */ diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c deleted file mode 100644 index c2cc405822f2b2..00000000000000 --- a/fs/bcachefs/fs-io-pagecache.c +++ /dev/null @@ -1,827 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "btree_iter.h" -#include "extents.h" -#include "fs-io.h" -#include "fs-io-pagecache.h" -#include "subvolume.h" - -#include -#include - -int bch2_filemap_get_contig_folios_d(struct address_space *mapping, - loff_t start, u64 end, - fgf_t fgp_flags, gfp_t gfp, - folios *fs) -{ - struct folio *f; - u64 pos = start; - int ret = 0; - - while (pos < end) { - if ((u64) pos >= (u64) start + (1ULL << 20)) - fgp_flags &= ~FGP_CREAT; - - ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL); - if (ret) - break; - - f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (IS_ERR(f)) - break; - - BUG_ON(fs->nr && folio_pos(f) != pos); - - pos = folio_end_pos(f); - darray_push(fs, f); - } - - if (!fs->nr && !ret && (fgp_flags & FGP_CREAT)) - ret = -ENOMEM; - - return fs->nr ? 
0 : ret; -} - -/* pagecache_block must be held */ -int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, - loff_t start, loff_t end) -{ - int ret; - - /* - * XXX: the way this is currently implemented, we can spin if a process - * is continually redirtying a specific page - */ - do { - if (!mapping->nrpages) - return 0; - - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) - break; - - if (!mapping->nrpages) - return 0; - - ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } while (ret == -EBUSY); - - return ret; -} - -#if 0 -/* Useful for debug tracing: */ -static const char * const bch2_folio_sector_states[] = { -#define x(n) #n, - BCH_FOLIO_SECTOR_STATE() -#undef x - NULL -}; -#endif - -static inline enum bch_folio_sector_state -folio_sector_dirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_dirty; - case SECTOR_reserved: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_undirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_dirty: - return SECTOR_unallocated; - case SECTOR_dirty_reserved: - return SECTOR_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_reserve(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_reserved; - case SECTOR_dirty: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -/* for newly allocated folios: */ -struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - struct bch_folio *s; - - s = kzalloc(sizeof(*s) + - sizeof(struct bch_folio_sector) * - folio_sectors(folio), gfp); - if (!s) - return NULL; - - spin_lock_init(&s->lock); - folio_attach_private(folio, s); - return s; -} - -struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); -} - -static unsigned bkey_to_sector_state(struct bkey_s_c k) -{ - if (bkey_extent_is_reservation(k)) - return SECTOR_reserved; - if (bkey_extent_is_allocation(k.k)) - return SECTOR_allocated; - return SECTOR_unallocated; -} - -static void __bch2_folio_set(struct folio *folio, - unsigned pg_offset, unsigned pg_len, - unsigned nr_ptrs, unsigned state) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - BUG_ON(pg_offset >= sectors); - BUG_ON(pg_offset + pg_len > sectors); - - spin_lock(&s->lock); - - for (i = pg_offset; i < pg_offset + pg_len; i++) { - s->s[i].nr_replicas = nr_ptrs; - bch2_folio_sector_set(folio, s, i, state); - } - - if (i == sectors) - s->uptodate = true; - - spin_unlock(&s->lock); -} - -/* - * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the - * extents btree: - */ -int bch2_folio_set(struct bch_fs *c, subvol_inum inum, - struct folio **fs, unsigned nr_folios) -{ - u64 offset = folio_sector(fs[0]); - bool need_set = false; - - for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); - if (!s) - return -ENOMEM; - - need_set |= !s->uptodate; - } - - if (!need_set) - return 0; - - unsigned folio_idx = 0; - - return bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inum.inum, offset), - POS(inum.inum, U64_MAX), - inum.subvol, BTREE_ITER_slots, k, ({ - unsigned nr_ptrs = 
bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - while (folio_idx < nr_folios) { - struct folio *folio = fs[folio_idx]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - - folio_start; - unsigned folio_len = min(k.k->p.offset, folio_end) - - folio_offset - folio_start; - - BUG_ON(k.k->p.offset < folio_start); - BUG_ON(bkey_start_offset(k.k) > folio_end); - - if (!bch2_folio(folio)->uptodate) - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - - if (k.k->p.offset < folio_end) - break; - folio_idx++; - } - - if (folio_idx == nr_folios) - break; - 0; - }))); -} - -void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct folio_vec fv; - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - bio_for_each_folio(fv, bio, iter) - __bch2_folio_set(fv.fv_folio, - fv.fv_offset >> 9, - fv.fv_len >> 9, - nr_ptrs, state); -} - -void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, - u64 start, u64 end) -{ - pgoff_t index = start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - unsigned i, j; - - if (end <= start) - return; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; - - BUG_ON(end <= folio_start); - - folio_lock(folio); - s = bch2_folio(folio); - - if (s) { - spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) - s->s[j].nr_replicas = 0; - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } -} - -int bch2_mark_pagecache_reserved(struct bch_inode_info *inode, - u64 *start, u64 end, - bool nonblocking) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = *start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - s64 i_sectors_delta = 0; - int ret = 0; - - if (end <= *start) - return 0; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - - if (!nonblocking) - folio_lock(folio); - else if (!folio_trylock(folio)) { - folio_batch_release(&fbatch); - ret = -EAGAIN; - break; - } - - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - - BUG_ON(end <= folio_start); - - *start = min(end, folio_end); - - struct bch_folio *s = bch2_folio(folio); - if (s) { - unsigned folio_offset = max(*start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - - spin_lock(&s->lock); - for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) { - i_sectors_delta -= s->s[j].state == SECTOR_dirty; - bch2_folio_sector_set(folio, s, j, - folio_sector_reserve(s->s[j].state)); - } - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - 
folio_batch_release(&fbatch); - cond_resched(); - } - - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - return ret; -} - -static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, - unsigned nr_replicas) -{ - return max(0, (int) nr_replicas - - s->nr_replicas - - s->replicas_reserved); -} - -int bch2_get_folio_disk_reservation(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, bool check_enospc) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned nr_replicas = inode_nr_replicas(c, inode); - struct disk_reservation disk_res = { 0 }; - unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; - int ret; - - if (!s) - return -ENOMEM; - - for (i = 0; i < sectors; i++) - disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); - - if (!disk_res_sectors) - return 0; - - ret = bch2_disk_reservation_get(c, &disk_res, - disk_res_sectors, 1, - !check_enospc - ? BCH_DISK_RESERVATION_NOFAIL - : 0); - if (unlikely(ret)) - return ret; - - for (i = 0; i < sectors; i++) - s->s[i].replicas_reserved += - sectors_to_reserve(&s->s[i], nr_replicas); - - return 0; -} - -void bch2_folio_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - bch2_disk_reservation_put(c, &res->disk); - bch2_quota_reservation_put(c, inode, &res->quota); -} - -static int __bch2_folio_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - size_t offset, size_t len, - bool partial) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned i, disk_sectors = 0, quota_sectors = 0; - struct disk_reservation disk_res = {}; - size_t reserved = len; - int ret; - - if (!s) - return -ENOMEM; - - BUG_ON(!s->uptodate); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - quota_sectors += s->s[i].state == SECTOR_unallocated; - } - - if (disk_sectors) { - ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors, - partial ? BCH_DISK_RESERVATION_PARTIAL : 0); - if (unlikely(ret)) - return ret; - - if (unlikely(disk_res.sectors != disk_sectors)) { - disk_sectors = quota_sectors = 0; - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - if (disk_sectors > disk_res.sectors) { - /* - * Make sure to get a reservation that's - * aligned to the filesystem blocksize: - */ - unsigned reserved_offset = round_down(i << 9, block_bytes(c)); - reserved = clamp(reserved_offset, offset, offset + len) - offset; - - if (!reserved) { - bch2_disk_reservation_put(c, &disk_res); - return bch_err_throw(c, ENOSPC_disk_reservation); - } - break; - } - quota_sectors += s->s[i].state == SECTOR_unallocated; - } - } - } - - if (quota_sectors) { - ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true); - if (unlikely(ret)) { - bch2_disk_reservation_put(c, &disk_res); - return ret; - } - } - - res->disk.sectors += disk_res.sectors; - return partial ? 
reserved : 0; -} - -int bch2_folio_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - size_t offset, size_t len) -{ - return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false); -} - -ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - size_t offset, size_t len) -{ - return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true); -} - -static void bch2_clear_folio_bits(struct folio *folio) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_folio *s = bch2_folio(folio); - struct disk_reservation disk_res = { 0 }; - int i, sectors = folio_sectors(folio), dirty_sectors = 0; - - if (!s) - return; - - EBUG_ON(!folio_test_locked(folio)); - EBUG_ON(folio_test_writeback(folio)); - - for (i = 0; i < sectors; i++) { - disk_res.sectors += s->s[i].replicas_reserved; - s->s[i].replicas_reserved = 0; - - dirty_sectors -= s->s[i].state == SECTOR_dirty; - bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); - } - - bch2_disk_reservation_put(c, &disk_res); - - bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); - - bch2_folio_release(folio); -} - -void bch2_set_folio_dirty(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - unsigned offset, unsigned len) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, dirty_sectors = 0; - - WARN_ON((u64) folio_pos(folio) + offset + len > - round_up((u64) i_size_read(&inode->v), block_bytes(c))); - - BUG_ON(!s->uptodate); - - spin_lock(&s->lock); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - unsigned sectors = sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); - - /* - * This can happen if we race with the error path in - * bch2_writepage_io_done(): - */ - sectors = min_t(unsigned, sectors, res->disk.sectors); - - s->s[i].replicas_reserved += sectors; - res->disk.sectors -= sectors; - - dirty_sectors += s->s[i].state == SECTOR_unallocated; - - bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); - } - - spin_unlock(&s->lock); - - bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); - - if (!folio_test_dirty(folio)) - filemap_dirty_folio(inode->v.i_mapping, folio); -} - -vm_fault_t bch2_page_fault(struct vm_fault *vmf) -{ - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct address_space *fdm = faults_disabled_mapping(); - struct bch_inode_info *inode = file_bch_inode(file); - vm_fault_t ret; - - if (fdm == mapping) - return VM_FAULT_SIGBUS; - - /* Lock ordering: */ - if (fdm > mapping) { - struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); - - if (bch2_pagecache_add_tryget(inode)) - goto got_lock; - - bch2_pagecache_block_put(fdm_host); - - bch2_pagecache_add_get(inode); - bch2_pagecache_add_put(inode); - - bch2_pagecache_block_get(fdm_host); - - /* Signal that lock has been dropped: */ - set_fdm_dropped_locks(); - return VM_FAULT_SIGBUS; - } - - bch2_pagecache_add_get(inode); -got_lock: - ret = filemap_fault(vmf); - bch2_pagecache_add_put(inode); - - return ret; -} - -vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -{ - struct folio *folio = page_folio(vmf->page); - struct file *file = vmf->vma->vm_file; - struct bch_inode_info *inode = 
file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - vm_fault_t ret; - - loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c)); - unsigned offset = file_offset - folio_pos(folio); - unsigned len = max(PAGE_SIZE, block_bytes(c)); - - BUG_ON(offset + len > folio_size(folio)); - - bch2_folio_reservation_init(c, inode, &res); - - sb_start_pagefault(inode->v.i_sb); - file_update_time(file); - - /* - * Not strictly necessary, but helps avoid dio writes livelocking in - * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get - * a bch2_write_invalidate_inode_pages_range() that works without dropping - * page lock before invalidating page - */ - bch2_pagecache_add_get(inode); - - folio_lock(folio); - u64 isize = i_size_read(&inode->v); - - if (folio->mapping != mapping || file_offset >= isize) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - - len = min_t(unsigned, len, isize - file_offset); - - if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: - bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) { - folio_unlock(folio); - ret = VM_FAULT_SIGBUS; - goto out; - } - - bch2_set_folio_dirty(c, inode, folio, &res, offset, len); - bch2_folio_reservation_put(c, inode, &res); - - folio_wait_stable(folio); - ret = VM_FAULT_LOCKED; -out: - bch2_pagecache_add_put(inode); - sb_end_pagefault(inode->v.i_sb); - - return ret; -} - -void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) -{ - if (offset || length < folio_size(folio)) - return; - - bch2_clear_folio_bits(folio); -} - -bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) -{ - if (folio_test_dirty(folio) || folio_test_writeback(folio)) - return false; - - bch2_clear_folio_bits(folio); - return true; -} - -/* fseek: */ - -static int folio_data_offset(struct folio *folio, loff_t pos, - unsigned min_replicas) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - if (s) - for (i = folio_pos_to_s(folio, pos); i < sectors; i++) - if (s->s[i].state >= SECTOR_dirty && - s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) - return i << SECTOR_SHIFT; - - return -1; -} - -loff_t bch2_seek_pagecache_data(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct folio_batch fbatch; - pgoff_t start_index = start_offset >> PAGE_SHIFT; - pgoff_t end_index = end_offset >> PAGE_SHIFT; - pgoff_t index = start_index; - unsigned i; - loff_t ret; - int offset; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(vinode->i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - - if (!nonblock) { - folio_lock(folio); - } else if (!folio_trylock(folio)) { - folio_batch_release(&fbatch); - return -EAGAIN; - } - - offset = folio_data_offset(folio, - max(folio_pos(folio), start_offset), - min_replicas); - if (offset >= 0) { - ret = clamp(folio_pos(folio) + offset, - start_offset, end_offset); - folio_unlock(folio); - folio_batch_release(&fbatch); - return ret; - } - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } - - return end_offset; -} - -/* - * Search for a hole in a folio. - * - * The filemap layer returns -ENOENT if no folio exists, so reuse the same error - * code to indicate a pagecache hole exists at the returned offset. 
Otherwise - * return 0 if the folio is filled with data, or an error code. This function - * can return -EAGAIN if nonblock is specified. - */ -static int folio_hole_offset(struct address_space *mapping, loff_t *offset, - unsigned min_replicas, bool nonblock) -{ - struct folio *folio; - struct bch_folio *s; - unsigned i, sectors; - int ret = -ENOENT; - - folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, - FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0); - if (IS_ERR(folio)) - return PTR_ERR(folio); - - s = bch2_folio(folio); - if (!s) - goto unlock; - - sectors = folio_sectors(folio); - for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) - if (s->s[i].state < SECTOR_dirty || - s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { - *offset = max(*offset, - folio_pos(folio) + (i << SECTOR_SHIFT)); - goto unlock; - } - - *offset = folio_end_pos(folio); - ret = 0; -unlock: - folio_unlock(folio); - folio_put(folio); - return ret; -} - -loff_t bch2_seek_pagecache_hole(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct address_space *mapping = vinode->i_mapping; - loff_t offset = start_offset; - loff_t ret = 0; - - while (!ret && offset < end_offset) - ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock); - - if (ret && ret != -ENOENT) - return ret; - return min(offset, end_offset); -} - -int bch2_clamp_data_hole(struct inode *inode, - u64 *hole_start, - u64 *hole_end, - unsigned min_replicas, - bool nonblock) -{ - loff_t ret; - - ret = bch2_seek_pagecache_hole(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_start = ret; - - if (*hole_start == *hole_end) - return 0; - - ret = bch2_seek_pagecache_data(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_end = ret; - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h deleted file mode 100644 index fad911cf506801..00000000000000 --- a/fs/bcachefs/fs-io-pagecache.h +++ /dev/null @@ -1,176 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_PAGECACHE_H -#define _BCACHEFS_FS_IO_PAGECACHE_H - -#include - -typedef DARRAY(struct folio *) folios; - -int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, - u64, fgf_t, gfp_t, folios *); -int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); - -/* - * Use u64 for the end pos and sector helpers because if the folio covers the - * max supported range of the mapping, the start offset of the next folio - * overflows loff_t. This breaks much of the range based processing in the - * buffered write path. 
- */ -static inline u64 folio_end_pos(struct folio *folio) -{ - return folio_pos(folio) + folio_size(folio); -} - -static inline size_t folio_sectors(struct folio *folio) -{ - return PAGE_SECTORS << folio_order(folio); -} - -static inline loff_t folio_sector(struct folio *folio) -{ - return folio_pos(folio) >> 9; -} - -static inline u64 folio_end_sector(struct folio *folio) -{ - return folio_end_pos(folio) >> 9; -} - -#define BCH_FOLIO_SECTOR_STATE() \ - x(unallocated) \ - x(reserved) \ - x(dirty) \ - x(dirty_reserved) \ - x(allocated) - -enum bch_folio_sector_state { -#define x(n) SECTOR_##n, - BCH_FOLIO_SECTOR_STATE() -#undef x -}; - -struct bch_folio_sector { - /* Uncompressed, fully allocated replicas (or on disk reservation): */ - u8 nr_replicas:4, - /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - replicas_reserved:4; - u8 state; -}; - -struct bch_folio { - spinlock_t lock; - atomic_t write_count; - /* - * Is the sector state up to date with the btree? - * (Not the data itself) - */ - bool uptodate; - struct bch_folio_sector s[]; -}; - -/* Helper for when we need to add debug instrumentation: */ -static inline void bch2_folio_sector_set(struct folio *folio, - struct bch_folio *s, - unsigned i, unsigned n) -{ - s->s[i].state = n; -} - -/* file offset (to folio offset) to bch_folio_sector index */ -static inline int folio_pos_to_s(struct folio *folio, loff_t pos) -{ - u64 f_offset = pos - folio_pos(folio); - - BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); - return f_offset >> SECTOR_SHIFT; -} - -/* for newly allocated folios: */ -static inline void __bch2_folio_release(struct folio *folio) -{ - kfree(folio_detach_private(folio)); -} - -static inline void bch2_folio_release(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - __bch2_folio_release(folio); -} - -static inline struct bch_folio *__bch2_folio(struct folio *folio) -{ - return folio_get_private(folio); -} - -static inline struct bch_folio *bch2_folio(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - - return __bch2_folio(folio); -} - -struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); -struct bch_folio *bch2_folio_create(struct folio *, gfp_t); - -struct bch2_folio_reservation { - struct disk_reservation disk; - struct quota_res quota; -}; - -static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -{ - /* XXX: this should not be open coded */ - return inode->ei_inode.bi_data_replicas - ? 
inode->ei_inode.bi_data_replicas - 1 - : c->opts.data_replicas; -} - -static inline void bch2_folio_reservation_init(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - memset(res, 0, sizeof(*res)); - - res->disk.nr_replicas = inode_nr_replicas(c, inode); -} - -int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); -void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); - -void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); -int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool); - -int bch2_get_folio_disk_reservation(struct bch_fs *, - struct bch_inode_info *, - struct folio *, bool); - -void bch2_folio_reservation_put(struct bch_fs *, - struct bch_inode_info *, - struct bch2_folio_reservation *); -int bch2_folio_reservation_get(struct bch_fs *, - struct bch_inode_info *, - struct folio *, - struct bch2_folio_reservation *, - size_t, size_t); -ssize_t bch2_folio_reservation_get_partial(struct bch_fs *, - struct bch_inode_info *, - struct folio *, - struct bch2_folio_reservation *, - size_t, size_t); - -void bch2_set_folio_dirty(struct bch_fs *, - struct bch_inode_info *, - struct folio *, - struct bch2_folio_reservation *, - unsigned, unsigned); - -vm_fault_t bch2_page_fault(struct vm_fault *); -vm_fault_t bch2_page_mkwrite(struct vm_fault *); -void bch2_invalidate_folio(struct folio *, size_t, size_t); -bool bch2_release_folio(struct folio *, gfp_t); - -loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); -loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); -int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); - -#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c deleted file mode 100644 index a233f45875e966..00000000000000 --- a/fs/bcachefs/fs-io.c +++ /dev/null @@ -1,1102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "clock.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "extent_update.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-io-buffered.h" -#include "fs-io-pagecache.h" -#include "fsck.h" -#include "inode.h" -#include "journal.h" -#include "io_misc.h" -#include "keylist.h" -#include "quota.h" -#include "reflink.h" -#include "trace.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -struct nocow_flush { - struct closure *cl; - struct bch_dev *ca; - struct bio bio; -}; - -static void nocow_flush_endio(struct bio *_bio) -{ - - struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); - - closure_put(bio->cl); - enumerated_ref_put(&bio->ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_nocow_flush); - bio_put(&bio->bio); -} - -void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, - struct bch_inode_info *inode, - struct closure *cl) -{ - struct nocow_flush *bio; - struct bch_dev *ca; - struct bch_devs_mask devs; - unsigned dev; - - dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); - if (dev == BCH_SB_MEMBERS_MAX) - return; - - devs = inode->ei_devs_need_flush; - memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - - for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { - scoped_guard(rcu) { - ca = rcu_dereference(c->devs[dev]); - if (ca && 
!enumerated_ref_tryget(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_nocow_flush)) - ca = NULL; - } - - if (!ca) - continue; - - bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, - REQ_OP_WRITE|REQ_PREFLUSH, - GFP_KERNEL, - &c->nocow_flush_bioset), - struct nocow_flush, bio); - bio->cl = cl; - bio->ca = ca; - bio->bio.bi_end_io = nocow_flush_endio; - closure_bio_submit(&bio->bio, cl); - } -} - -static int bch2_inode_flush_nocow_writes(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct closure cl; - - closure_init_stack(&cl); - bch2_inode_flush_nocow_writes_async(c, inode, &cl); - closure_sync(&cl); - - return 0; -} - -/* i_size updates: */ - -struct inode_new_size { - loff_t new_size; - u64 now; - unsigned fields; -}; - -static int inode_set_size(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct inode_new_size *s = p; - - bi->bi_size = s->new_size; - if (s->fields & ATTR_ATIME) - bi->bi_atime = s->now; - if (s->fields & ATTR_MTIME) - bi->bi_mtime = s->now; - if (s->fields & ATTR_CTIME) - bi->bi_ctime = s->now; - - return 0; -} - -int __must_check bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) -{ - struct inode_new_size s = { - .new_size = new_size, - .now = bch2_current_time(c), - .fields = fields, - }; - - return bch2_write_inode(c, inode, inode_set_size, &s, fields); -} - -void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - if (unlikely((s64) inode->v.i_blocks + sectors < 0)) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, sectors, - inode->ei_inode.bi_sectors); - - bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - if (sectors < 0) - sectors = -inode->v.i_blocks; - else - sectors = 0; - } - - inode->v.i_blocks += sectors; - -#ifdef CONFIG_BCACHEFS_QUOTA - if (quota_res && - !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && - sectors > 0) { - BUG_ON(sectors > quota_res->sectors); - BUG_ON(sectors > inode->ei_quota_reserved); - - quota_res->sectors -= sectors; - inode->ei_quota_reserved -= sectors; - } else { - bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); - } -#endif -} - -/* fsync: */ - -static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, - u64 *seq) -{ - struct printbuf buf = PRINTBUF; - struct bch_inode_unpacked u; - struct btree_iter iter; - int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); - if (ret) - return ret; - - u64 cur_seq = journal_cur_seq(&trans->c->journal); - *seq = min(cur_seq, u.bi_journal_seq); - - if (fsck_err_on(u.bi_journal_seq > cur_seq, - trans, inode_journal_seq_in_future, - "inode journal seq in future (currently at %llu)\n%s", - cur_seq, - (bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - u.bi_journal_seq = cur_seq; - ret = bch2_inode_write(trans, &iter, &u); - } -fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -} - -/* - * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an - * insert trigger: look up the btree inode instead - */ -static int bch2_flush_inode(struct bch_fs *c, - struct bch_inode_info *inode) -{ - if (c->opts.journal_flush_disabled) - return 0; - - if 
(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync)) - return -EROFS; - - u64 seq; - int ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: - bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: - bch2_inode_flush_nocow_writes(c, inode); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync); - return ret; -} - -int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, err; - - trace_bch2_fsync(file, datasync); - - ret = file_write_and_wait_range(file, start, end); - if (ret) - goto out; - ret = sync_inode_metadata(&inode->v, 1); - if (ret) - goto out; - ret = bch2_flush_inode(c, inode); -out: - ret = bch2_err_class(ret); - if (ret == -EROFS) - ret = -EIO; - - err = file_check_and_advance_wb_err(file); - if (!ret) - ret = err; - - return ret; -} - -/* truncate: */ - -static inline int range_has_data(struct bch_fs *c, u32 subvol, - struct bpos start, - struct bpos end) -{ - return bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, - subvol, 0, k, ({ - bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); - }))); -} - -static int __bch2_truncate_folio(struct bch_inode_info *inode, - pgoff_t index, loff_t start, loff_t end) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = inode->v.i_mapping; - struct bch_folio *s; - unsigned start_offset; - unsigned end_offset; - unsigned i; - struct folio *folio; - s64 i_sectors_delta = 0; - int ret = 0; - u64 end_pos; - - folio = filemap_lock_folio(mapping, index); - if (IS_ERR_OR_NULL(folio)) { - /* - * XXX: we're doing two index lookups when we end up reading the - * folio - */ - ret = range_has_data(c, inode->ei_inum.subvol, - POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), - POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); - if (ret <= 0) - return ret; - - folio = __filemap_get_folio(mapping, index, - FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (IS_ERR(folio)) { - ret = -ENOMEM; - goto out; - } - } - - BUG_ON(start >= folio_end_pos(folio)); - BUG_ON(end <= folio_pos(folio)); - - start_offset = max(start, folio_pos(folio)) - folio_pos(folio); - end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); - - /* Folio boundary? Nothing to do */ - if (start_offset == 0 && - end_offset == folio_size(folio)) { - ret = 0; - goto unlock; - } - - s = bch2_folio_create(folio, 0); - if (!s) { - ret = -ENOMEM; - goto unlock; - } - - if (!folio_test_uptodate(folio)) { - ret = bch2_read_single_folio(folio, mapping); - if (ret) - goto unlock; - } - - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto unlock; - - for (i = round_up(start_offset, block_bytes(c)) >> 9; - i < round_down(end_offset, block_bytes(c)) >> 9; - i++) { - s->s[i].nr_replicas = 0; - - i_sectors_delta -= s->s[i].state == SECTOR_dirty; - bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); - } - - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - - /* - * Caller needs to know whether this folio will be written out by - * writeback - doing an i_size update if necessary - or whether it will - * be responsible for the i_size update. - * - * Note that we shouldn't ever see a folio beyond EOF, but check and - * warn if so. 
This has been observed by failure to clean up folios
- * after a short write and there's still a chance reclaim will fix
- * things up.
- */
-	WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
-	end_pos = folio_end_pos(folio);
-	if (inode->v.i_size > folio_pos(folio))
-		end_pos = min_t(u64, inode->v.i_size, end_pos);
-	ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
-
-	folio_zero_segment(folio, start_offset, end_offset);
-
-	/*
-	 * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
-	 *
-	 * XXX: because we aren't currently tracking whether the folio has actual
-	 * data in it (vs. just 0s, or only partially written) this is wrong. ick.
-	 */
-	BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
-
-	/*
-	 * This removes any writeable userspace mappings; we need to force
-	 * .page_mkwrite to be called again before any mmapped writes, to
-	 * redirty the full page:
-	 */
-	folio_mkclean(folio);
-	filemap_dirty_folio(mapping, folio);
-unlock:
-	folio_unlock(folio);
-	folio_put(folio);
-out:
-	return ret;
-}
-
-static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
-{
-	return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
-				     from, ANYSINT_MAX(loff_t));
-}
-
-static int bch2_truncate_folios(struct bch_inode_info *inode,
-				loff_t start, loff_t end)
-{
-	int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
-					start, end);
-
-	if (ret >= 0 &&
-	    start >> PAGE_SHIFT != end >> PAGE_SHIFT)
-		ret = __bch2_truncate_folio(inode,
-					    (end - 1) >> PAGE_SHIFT,
-					    start, end);
-	return ret;
-}
-
-static int bch2_extend(struct mnt_idmap *idmap,
-		       struct bch_inode_info *inode,
-		       struct bch_inode_unpacked *inode_u,
-		       struct iattr *iattr)
-{
-	struct address_space *mapping = inode->v.i_mapping;
-	int ret;
-
-	/*
-	 * sync appends:
-	 *
-	 * this has to be done _before_ extending i_size:
-	 */
-	ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
-	if (ret)
-		return ret;
-
-	truncate_setsize(&inode->v, iattr->ia_size);
-
-	return bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-int bchfs_truncate(struct mnt_idmap *idmap,
-		   struct bch_inode_info *inode, struct iattr *iattr)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct address_space *mapping = inode->v.i_mapping;
-	struct bch_inode_unpacked inode_u;
-	s64 i_sectors_delta = 0;
-	int ret = 0;
-
-	/*
-	 * If the truncate call will change the size of the file, the
-	 * cmtimes should be updated. If the size will not change, we
-	 * do not need to update the cmtimes.
-	 */
-	if (iattr->ia_size != inode->v.i_size) {
-		if (!(iattr->ia_valid & ATTR_MTIME))
-			ktime_get_coarse_real_ts64(&iattr->ia_mtime);
-		if (!(iattr->ia_valid & ATTR_CTIME))
-			ktime_get_coarse_real_ts64(&iattr->ia_ctime);
-		iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
-	}
-
-	inode_dio_wait(&inode->v);
-	bch2_pagecache_block_get(inode);
-
-	ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
-	if (ret)
-		goto err;
-
-	/*
-	 * check this before the next assertion; on filesystem error our normal
-	 * invariants are a bit broken (truncate has to truncate the page cache
-	 * before the inode).
-	 */
-	ret = bch2_journal_error(&c->journal);
-	if (ret)
-		goto err;
-
-	WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
-		  inode->v.i_size < inode_u.bi_size,
-		  "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
-		  (u64) inode->v.i_size, inode_u.bi_size);
-
-	if (iattr->ia_size > inode->v.i_size) {
-		ret = bch2_extend(idmap, inode, &inode_u, iattr);
-		goto err;
-	}
-
-	iattr->ia_valid &= ~ATTR_SIZE;
-
-	ret = bch2_truncate_folio(inode, iattr->ia_size);
-	if (unlikely(ret < 0))
-		goto err;
-	ret = 0;
-
-	truncate_setsize(&inode->v, iattr->ia_size);
-
-	/*
-	 * When extending, we're going to write the new i_size to disk
-	 * immediately so we need to flush anything above the current on disk
-	 * i_size first:
-	 *
-	 * Also, when extending we need to flush the page that i_size currently
-	 * straddles - if it's mapped to userspace, we need to ensure that
-	 * userspace has to redirty it and call .mkwrite -> set_page_dirty
-	 * again to allocate the part of the page that was extended.
-	 */
-	if (iattr->ia_size > inode_u.bi_size)
-		ret = filemap_write_and_wait_range(mapping,
-				inode_u.bi_size,
-				iattr->ia_size - 1);
-	else if (iattr->ia_size & (PAGE_SIZE - 1))
-		ret = filemap_write_and_wait_range(mapping,
-				round_down(iattr->ia_size, PAGE_SIZE),
-				iattr->ia_size - 1);
-	if (ret)
-		goto err;
-
-	ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
-	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
-	if (unlikely(ret)) {
-		/*
-		 * If we error here, VFS caches are now inconsistent with btree
-		 */
-		set_bit(EI_INODE_ERROR, &inode->ei_flags);
-		goto err;
-	}
-
-	if (unlikely(!inode->v.i_size && inode->v.i_blocks &&
-		     !bch2_journal_error(&c->journal))) {
-		struct printbuf buf = PRINTBUF;
-		bch2_log_msg_start(c, &buf);
-		prt_printf(&buf,
-			   "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
-			   inode->v.i_ino, (u64) inode->v.i_blocks,
-			   inode->ei_inode.bi_sectors);
-
-		bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
-		if (print)
-			bch2_print_str(c, KERN_ERR, buf.buf);
-		printbuf_exit(&buf);
-	}
-
-	ret = bch2_setattr_nonsize(idmap, inode, iattr);
-err:
-	bch2_pagecache_block_put(inode);
-	return bch2_err_class(ret);
-}
-
-/* fallocate: */
-
-static int inode_update_times_fn(struct btree_trans *trans,
-				 struct bch_inode_info *inode,
-				 struct bch_inode_unpacked *bi, void *p)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
-	bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
-	return 0;
-}
-
-static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	u64 end = offset + len;
-	u64 block_start = round_up(offset, block_bytes(c));
-	u64 block_end = round_down(end, block_bytes(c));
-	bool truncated_last_page;
-	int ret = 0;
-
-	ret = bch2_truncate_folios(inode, offset, end);
-	if (unlikely(ret < 0))
-		goto err;
-
-	truncated_last_page = ret;
-
-	truncate_pagecache_range(&inode->v, offset, end - 1);
-
-	if (block_start < block_end) {
-		s64 i_sectors_delta = 0;
-
-		ret = bch2_fpunch(c, inode_inum(inode),
-				  block_start >> 9, block_end >> 9,
-				  &i_sectors_delta);
-		bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-	}
-
-	mutex_lock(&inode->ei_update_lock);
-	if (end >= inode->v.i_size && !truncated_last_page) {
-		ret = bch2_write_inode_size(c, inode, inode->v.i_size,
-					    ATTR_MTIME|ATTR_CTIME);
-	} else {
-		ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-				       ATTR_MTIME|ATTR_CTIME);
-	}
-	mutex_unlock(&inode->ei_update_lock);
-err: - return ret; -} - -static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode, - loff_t offset, loff_t len, - bool insert) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = inode->v.i_mapping; - s64 i_sectors_delta = 0; - int ret = 0; - - if ((offset | len) & (block_bytes(c) - 1)) - return -EINVAL; - - if (insert) { - if (offset >= inode->v.i_size) - return -EINVAL; - } else { - if (offset + len >= inode->v.i_size) - return -EINVAL; - } - - ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); - if (ret) - return ret; - - if (insert) - i_size_write(&inode->v, inode->v.i_size + len); - - ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, - insert, &i_sectors_delta); - if (!ret && !insert) - i_size_write(&inode->v, inode->v.i_size - len); - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - - return ret; -} - -static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, - u64 start_sector, u64 end_sector) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bpos end_pos = POS(inode->v.i_ino, end_sector); - struct bch_io_opts opts; - int ret = 0; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inode->v.i_ino, start_sector), - BTREE_ITER_slots|BTREE_ITER_intent); - - while (!ret) { - s64 i_sectors_delta = 0; - struct quota_res quota_res = { 0 }; - struct bkey_s_c k; - unsigned sectors; - bool is_allocation; - u64 hole_start, hole_end; - u32 snapshot; - - bch2_trans_begin(trans); - - if (bkey_ge(iter.pos, end_pos)) - break; - - ret = bch2_subvolume_get_snapshot(trans, - inode->ei_inum.subvol, &snapshot); - if (ret) - goto bkey_err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - k = bch2_btree_iter_peek_slot(trans, &iter); - if ((ret = bkey_err(k))) - goto bkey_err; - - hole_start = iter.pos.offset; - hole_end = bpos_min(k.k->p, end_pos).offset; - is_allocation = bkey_extent_is_allocation(k.k); - - /* already reserved */ - if (bkey_extent_is_reservation(k) && - bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { - bch2_btree_iter_advance(trans, &iter); - continue; - } - - if (bkey_extent_is_data(k.k) && - !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_advance(trans, &iter); - continue; - } - - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - /* - * Lock ordering - can't be holding btree locks while - * blocking on a folio lock: - */ - if (bch2_clamp_data_hole(&inode->v, - &hole_start, - &hole_end, - opts.data_replicas, true)) { - ret = drop_locks_do(trans, - (bch2_clamp_data_hole(&inode->v, - &hole_start, - &hole_end, - opts.data_replicas, false), 0)); - if (ret) - goto bkey_err; - } - bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start)); - - if (ret) - goto bkey_err; - - if (hole_start == hole_end) - continue; - } - - sectors = hole_end - hole_start; - - if (!is_allocation) { - ret = bch2_quota_reservation_add(c, inode, - &quota_res, sectors, true); - if (unlikely(ret)) - goto bkey_err; - } - - ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, - sectors, opts, &i_sectors_delta, - writepoint_hashed((unsigned long) current)); - if (ret) - goto bkey_err; - - bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); - - if (bch2_mark_pagecache_reserved(inode, &hole_start, - iter.pos.offset, true)) { - ret = drop_locks_do(trans, - bch2_mark_pagecache_reserved(inode, &hole_start, -
iter.pos.offset, false)); - if (ret) - goto bkey_err; - } -bkey_err: - bch2_quota_reservation_put(c, inode, &quota_res); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - } - - if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { - struct quota_res quota_res = { 0 }; - s64 i_sectors_delta = 0; - - bch2_fpunch_at(trans, &iter, inode_inum(inode), - end_sector, &i_sectors_delta); - bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); - bch2_quota_reservation_put(c, inode, &quota_res); - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode, - loff_t offset, loff_t len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 end = offset + len; - u64 block_start = round_down(offset, block_bytes(c)); - u64 block_end = round_up(end, block_bytes(c)); - bool truncated_last_page = false; - int ret, ret2 = 0; - - if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { - ret = inode_newsize_ok(&inode->v, end); - if (ret) - return ret; - } - - if (mode & FALLOC_FL_ZERO_RANGE) { - ret = bch2_truncate_folios(inode, offset, end); - if (unlikely(ret < 0)) - return ret; - - truncated_last_page = ret; - - truncate_pagecache_range(&inode->v, offset, end - 1); - - block_start = round_up(offset, block_bytes(c)); - block_end = round_down(end, block_bytes(c)); - } - - ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); - - /* - * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, - * so that the VFS cache i_size is consistent with the btree i_size: - */ - if (ret && - !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) - return ret; - - if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) - end = inode->v.i_size; - - if (end >= inode->v.i_size && - (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || - !(mode & FALLOC_FL_KEEP_SIZE))) { - spin_lock(&inode->v.i_lock); - i_size_write(&inode->v, end); - spin_unlock(&inode->v.i_lock); - - mutex_lock(&inode->ei_update_lock); - ret2 = bch2_write_inode_size(c, inode, end, 0); - mutex_unlock(&inode->ei_update_lock); - } - - return ret ?: ret2; -} - -long bch2_fallocate_dispatch(struct file *file, int mode, - loff_t offset, loff_t len) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - long ret; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate)) - return -EROFS; - - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(inode); - - ret = file_modified(file); - if (ret) - goto err; - - if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) - ret = bchfs_fallocate(inode, mode, offset, len); - else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) - ret = bchfs_fpunch(inode, offset, len); - else if (mode == FALLOC_FL_INSERT_RANGE) - ret = bchfs_fcollapse_finsert(inode, offset, len, true); - else if (mode == FALLOC_FL_COLLAPSE_RANGE) - ret = bchfs_fcollapse_finsert(inode, offset, len, false); - else - ret = -EOPNOTSUPP; -err: - bch2_pagecache_block_put(inode); - inode_unlock(&inode->v); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate); - - return bch2_err_class(ret); -} - -/* - * Take a quota reservation for unallocated blocks in a given file range - * Does not check pagecache - */ -static int quota_reserve_range(struct bch_inode_info *inode, - struct quota_res *res, - u64 start, u64 end) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 sectors =
end - start; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, - BTREE_ID_extents, - POS(inode->v.i_ino, start), - POS(inode->v.i_ino, end - 1), - inode->ei_inum.subvol, 0, k, ({ - if (bkey_extent_is_allocation(k.k)) { - u64 s = min(end, k.k->p.offset) - - max(start, bkey_start_offset(k.k)); - BUG_ON(s > sectors); - sectors -= s; - } - - 0; - }))); - - return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); -} - -loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - loff_t len, unsigned remap_flags) -{ - struct bch_inode_info *src = file_bch_inode(file_src); - struct bch_inode_info *dst = file_bch_inode(file_dst); - struct bch_fs *c = src->v.i_sb->s_fs_info; - struct quota_res quota_res = { 0 }; - s64 i_sectors_delta = 0; - u64 aligned_len; - loff_t ret = 0; - - if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) - return -EINVAL; - - if ((pos_src & (block_bytes(c) - 1)) || - (pos_dst & (block_bytes(c) - 1))) - return -EINVAL; - - if (src == dst && - abs(pos_src - pos_dst) < len) - return -EINVAL; - - lock_two_nondirectories(&src->v, &dst->v); - bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); - - inode_dio_wait(&src->v); - inode_dio_wait(&dst->v); - - ret = generic_remap_file_range_prep(file_src, pos_src, - file_dst, pos_dst, - &len, remap_flags); - if (ret < 0 || len == 0) - goto err; - - aligned_len = round_up((u64) len, block_bytes(c)); - - ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, - pos_dst, pos_dst + len - 1); - if (ret) - goto err; - - ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9, - (pos_dst + aligned_len) >> 9); - if (ret) - goto err; - - if (!(remap_flags & REMAP_FILE_DEDUP)) - file_update_time(file_dst); - - bch2_mark_pagecache_unallocated(src, pos_src >> 9, - (pos_src + aligned_len) >> 9); - - /* - * XXX: we'd like to be telling bch2_remap_range() if we have - * permission to write to the source file, and thus if io path option - * changes should be propagated through the copy, but we need mnt_idmap - * from the pathwalk, awkward - */ - ret = bch2_remap_range(c, - inode_inum(dst), pos_dst >> 9, - inode_inum(src), pos_src >> 9, - aligned_len >> 9, - pos_dst + len, &i_sectors_delta, - false); - if (ret < 0) - goto err; - - /* - * due to alignment, we might have remapped slightly more than requested - */ - ret = min((u64) ret << 9, (u64) len); - - bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta); - - spin_lock(&dst->v.i_lock); - if (pos_dst + ret > dst->v.i_size) - i_size_write(&dst->v, pos_dst + ret); - spin_unlock(&dst->v.i_lock); - - if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || - IS_SYNC(file_inode(file_dst))) - ret = bch2_flush_inode(c, dst); -err: - bch2_quota_reservation_put(c, dst, &quota_res); - bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); - unlock_two_nondirectories(&src->v, &dst->v); - - return bch2_err_class(ret); -} - -/* fseek: */ - -static loff_t bch2_seek_data(struct file *file, u64 offset) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - subvol_inum inum = inode_inum(inode); - u64 isize, next_data = MAX_LFS_FILESIZE; - - isize = i_size_read(&inode->v); - if (offset >= isize) - return -ENXIO; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), - POS(inode->v.i_ino, U64_MAX), - inum.subvol, 0, k, ({ - if (bkey_extent_is_data(k.k)) { - next_data = max(offset,
bkey_start_offset(k.k) << 9); - break; - } else if (k.k->p.offset >> 9 > isize) - break; - 0; - }))); - if (ret) - return ret; - - if (next_data > offset) - next_data = bch2_seek_pagecache_data(&inode->v, - offset, next_data, 0, false); - - if (next_data >= isize) - return -ENXIO; - - return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); -} - -static loff_t bch2_seek_hole(struct file *file, u64 offset) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - subvol_inum inum = inode_inum(inode); - u64 isize, next_hole = MAX_LFS_FILESIZE; - - isize = i_size_read(&inode->v); - if (offset >= isize) - return -ENXIO; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), - POS(inode->v.i_ino, U64_MAX), - inum.subvol, BTREE_ITER_slots, k, ({ - if (k.k->p.inode != inode->v.i_ino || - !bkey_extent_is_data(k.k)) { - loff_t start_offset = k.k->p.inode == inode->v.i_ino - ? max(offset, bkey_start_offset(k.k) << 9) - : offset; - loff_t end_offset = k.k->p.inode == inode->v.i_ino - ? MAX_LFS_FILESIZE - : k.k->p.offset << 9; - - /* - * Found a hole in the btree, now make sure it's - * a hole in the pagecache. We might have to - * keep searching if this hole is entirely dirty - * in the page cache: - */ - bch2_trans_unlock(trans); - loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v, - start_offset, end_offset, 0, false); - if (pagecache_hole < end_offset) { - next_hole = pagecache_hole; - break; - } - } else { - offset = max(offset, bkey_start_offset(k.k) << 9); - } - 0; - }))); - if (ret) - return ret; - - if (next_hole > isize) - next_hole = isize; - - return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); -} - -loff_t bch2_llseek(struct file *file, loff_t offset, int whence) -{ - loff_t ret; - - switch (whence) { - case SEEK_SET: - case SEEK_CUR: - case SEEK_END: - ret = generic_file_llseek(file, offset, whence); - break; - case SEEK_DATA: - ret = bch2_seek_data(file, offset); - break; - case SEEK_HOLE: - ret = bch2_seek_hole(file, offset); - break; - default: - ret = -EINVAL; - break; - } - - return bch2_err_class(ret); -} - -void bch2_fs_fsio_exit(struct bch_fs *c) -{ - bioset_exit(&c->nocow_flush_bioset); -} - -int bch2_fs_fsio_init(struct bch_fs *c) -{ - if (bioset_init(&c->nocow_flush_bioset, - 1, offsetof(struct nocow_flush, bio), 0)) - return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; - - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h deleted file mode 100644 index ca70346e68dc3d..00000000000000 --- a/fs/bcachefs/fs-io.h +++ /dev/null @@ -1,184 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_H -#define _BCACHEFS_FS_IO_H - -#ifndef NO_BCACHEFS_FS - -#include "buckets.h" -#include "fs.h" -#include "io_write_types.h" -#include "quota.h" - -#include - -struct folio_vec { - struct folio *fv_folio; - size_t fv_offset; - size_t fv_len; -}; - -static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) -{ - - struct folio *folio = page_folio(bv.bv_page); - size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + - bv.bv_offset; - size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); - - return (struct folio_vec) { - .fv_folio = folio, - .fv_offset = offset, - .fv_len = len, - }; -} - -static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, - struct bvec_iter iter) -{ - return biovec_to_foliovec(bio_iter_iovec(bio, iter)); -} - -#define 
__bio_for_each_folio(bvl, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ - bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) - -/** - * bio_for_each_folio - iterate over folios within a bio - * - * Like other non-_all versions, this iterates over what bio->bi_iter currently - * points to. This version is for drivers, where the bio may have previously - * been split or cloned. - */ -#define bio_for_each_folio(bvl, bio, iter) \ - __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) - -struct quota_res { - u64 sectors; -}; - -#ifdef CONFIG_BCACHEFS_QUOTA - -static inline void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - BUG_ON(res->sectors > inode->ei_quota_reserved); - - bch2_quota_acct(c, inode->ei_qid, Q_SPC, - -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); - inode->ei_quota_reserved -= res->sectors; - res->sectors = 0; -} - -static inline void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - if (res->sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_quota_reservation_put(c, inode, res); - mutex_unlock(&inode->ei_quota_lock); - } -} - -static inline int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - u64 sectors, - bool check_enospc) -{ - int ret; - - if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) - return 0; - - mutex_lock(&inode->ei_quota_lock); - ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, - check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); - if (likely(!ret)) { - inode->ei_quota_reserved += sectors; - res->sectors += sectors; - } - mutex_unlock(&inode->ei_quota_lock); - - return ret; -} - -#else - -static inline void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static inline void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static inline int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - unsigned sectors, - bool check_enospc) -{ - return 0; -} - -#endif - -void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, - struct quota_res *, s64); - -static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - if (sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_i_sectors_acct(c, inode, quota_res, sectors); - mutex_unlock(&inode->ei_quota_lock); - } -} - -static inline struct address_space *faults_disabled_mapping(void) -{ - return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); -} - -static inline void set_fdm_dropped_locks(void) -{ - current->faults_disabled_mapping = - (void *) (((unsigned long) current->faults_disabled_mapping)|1); -} - -static inline bool fdm_dropped_locks(void) -{ - return ((unsigned long) current->faults_disabled_mapping) & 1; -} - -void bch2_inode_flush_nocow_writes_async(struct bch_fs *, - struct bch_inode_info *, struct closure *); - -int __must_check bch2_write_inode_size(struct bch_fs *, - struct bch_inode_info *, - loff_t, unsigned); - -int bch2_fsync(struct file *, loff_t, loff_t, int); - -int bchfs_truncate(struct mnt_idmap *, - struct bch_inode_info *, struct iattr *); -long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); - -loff_t bch2_remap_file_range(struct 
file *, loff_t, struct file *, - loff_t, loff_t, unsigned); - -loff_t bch2_llseek(struct file *, loff_t, int); - -void bch2_fs_fsio_exit(struct bch_fs *); -int bch2_fs_fsio_init(struct bch_fs *); -#else -static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} -static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } -#endif - -#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c deleted file mode 100644 index 43510da5e73440..00000000000000 --- a/fs/bcachefs/fs-ioctl.c +++ /dev/null @@ -1,440 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "chardev.h" -#include "dirent.h" -#include "fs.h" -#include "fs-ioctl.h" -#include "namei.h" -#include "quota.h" - -#include -#include -#include -#include -#include -#include - -#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ -#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ -#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ - -static int bch2_reinherit_attrs_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_inode_info *dir = p; - - return !bch2_reinherit_attrs(bi, &dir->ei_inode); -} - -static int bch2_ioc_reinherit_attrs(struct bch_fs *c, - struct file *file, - struct bch_inode_info *src, - const char __user *name) -{ - struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); - struct bch_inode_info *dst; - struct inode *vinode = NULL; - char *kname = NULL; - struct qstr qstr; - int ret = 0; - subvol_inum inum; - - kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL); - if (!kname) - return -ENOMEM; - - ret = strncpy_from_user(kname, name, BCH_NAME_MAX); - if (unlikely(ret < 0)) - goto err1; - - qstr.len = ret; - qstr.name = kname; - - ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); - if (ret) - goto err1; - - vinode = bch2_vfs_inode_get(c, inum); - ret = PTR_ERR_OR_ZERO(vinode); - if (ret) - goto err1; - - dst = to_bch_ei(vinode); - - ret = mnt_want_write_file(file); - if (ret) - goto err2; - - bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); - - if (inode_attr_changing(src, dst, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, dst, - src->ei_qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err3; - } - - ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); -err3: - bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); - - /* return true if we did work */ - if (ret >= 0) - ret = !ret; - - mnt_drop_write_file(file); -err2: - iput(vinode); -err1: - kfree(kname); - - return ret; -} - -static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg) -{ - return put_user(inode->v.i_generation, arg); -} - -static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label) -{ - int ret; - size_t len; - char label[BCH_SB_LABEL_SIZE]; - - BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX); - - mutex_lock(&c->sb_lock); - memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE); - mutex_unlock(&c->sb_lock); - - len = strnlen(label, BCH_SB_LABEL_SIZE); - if (len == BCH_SB_LABEL_SIZE) { - bch_warn(c, - "label is too long, return the first %zu bytes", - --len); - } - - ret = copy_to_user(user_label, label, len); - - return ret ? 
-EFAULT : 0; -} - -static int bch2_ioc_setlabel(struct bch_fs *c, - struct file *file, - struct bch_inode_info *inode, - const char __user *user_label) -{ - int ret; - char label[BCH_SB_LABEL_SIZE]; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(label, user_label, sizeof(label))) - return -EFAULT; - - if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) { - bch_err(c, - "unable to set label with more than %d bytes", - BCH_SB_LABEL_SIZE - 1); - return -EINVAL; - } - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - mutex_lock(&c->sb_lock); - strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); - ret = bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - mnt_drop_write_file(file); - return ret; -} - -static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) -{ - u32 flags; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (get_user(flags, arg)) - return -EFAULT; - - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "shutdown by ioctl type %u", flags); - - switch (flags) { - case FSOP_GOING_FLAGS_DEFAULT: - ret = bdev_freeze(c->vfs_sb->s_bdev); - if (ret) - break; - bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only2(c, &buf); - bdev_thaw(c->vfs_sb->s_bdev); - break; - case FSOP_GOING_FLAGS_LOGFLUSH: - bch2_journal_flush(&c->journal); - fallthrough; - case FSOP_GOING_FLAGS_NOLOGFLUSH: - bch2_fs_emergency_read_only2(c, &buf); - break; - default: - ret = -EINVAL; - goto noprint; - } - - bch2_print_str(c, KERN_ERR, buf.buf); -noprint: - printbuf_exit(&buf); - return ret; -} - -static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - struct inode *dir; - struct bch_inode_info *inode; - struct user_namespace *s_user_ns; - struct dentry *dst_dentry; - struct path src_path, dst_path; - int how = LOOKUP_FOLLOW; - int error; - subvol_inum snapshot_src = { 0 }; - unsigned lookup_flags = 0; - unsigned create_flags = BCH_CREATE_SUBVOL; - - if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| - BCH_SUBVOL_SNAPSHOT_RO)) - return -EINVAL; - - if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && - (arg.src_ptr || - (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) - return -EINVAL; - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) - create_flags |= BCH_CREATE_SNAPSHOT; - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) - create_flags |= BCH_CREATE_SNAPSHOT_RO; - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) { - /* sync_inodes_sb enforce s_umount is locked */ - down_read(&c->vfs_sb->s_umount); - sync_inodes_sb(c->vfs_sb); - up_read(&c->vfs_sb->s_umount); - } - - if (arg.src_ptr) { - error = user_path_at(arg.dirfd, - (const char __user *)(unsigned long)arg.src_ptr, - how, &src_path); - if (error) - goto err1; - - if (src_path.dentry->d_sb->s_fs_info != c) { - path_put(&src_path); - error = -EXDEV; - goto err1; - } - - snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); - } - - dst_dentry = start_creating_user_path(arg.dirfd, - (const char __user *)(unsigned long)arg.dst_ptr, - &dst_path, lookup_flags); - error = PTR_ERR_OR_ZERO(dst_dentry); - if (error) - goto err2; - - if (dst_dentry->d_sb->s_fs_info != c) { - error = -EXDEV; - goto err3; - } - - if (dst_dentry->d_inode) { - error = bch_err_throw(c, EEXIST_subvolume_create); - goto err3; - } - - dir = dst_path.dentry->d_inode; - if (IS_DEADDIR(dir)) { - error = bch_err_throw(c, ENOENT_directory_dead); - goto err3; - } - - s_user_ns = dir->i_sb->s_user_ns; - if (!kuid_has_mapping(s_user_ns, 
current_fsuid()) || - !kgid_has_mapping(s_user_ns, current_fsgid())) { - error = -EOVERFLOW; - goto err3; - } - - error = inode_permission(file_mnt_idmap(filp), - dir, MAY_WRITE | MAY_EXEC); - if (error) - goto err3; - - if (!IS_POSIXACL(dir)) - arg.mode &= ~current_umask(); - - error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); - if (error) - goto err3; - - if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && - !arg.src_ptr) - snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; - - down_write(&c->snapshot_create_lock); - inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), - dst_dentry, arg.mode|S_IFDIR, - 0, snapshot_src, create_flags); - up_write(&c->snapshot_create_lock); - - error = PTR_ERR_OR_ZERO(inode); - if (error) - goto err3; - - d_instantiate(dst_dentry, &inode->v); - fsnotify_mkdir(dir, dst_dentry); -err3: - end_creating_path(&dst_path, dst_dentry); -err2: - if (arg.src_ptr) - path_put(&src_path); -err1: - return error; -} - -static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - const char __user *name = (void __user *)(unsigned long)arg.dst_ptr; - struct path path; - struct inode *dir; - struct dentry *victim; - int ret = 0; - - if (arg.flags) - return -EINVAL; - - victim = start_removing_user_path_at(arg.dirfd, name, &path); - if (IS_ERR(victim)) - return PTR_ERR(victim); - - dir = d_inode(path.dentry); - if (victim->d_sb->s_fs_info != c) { - ret = -EXDEV; - goto err; - } - - ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?: - __bch2_unlink(dir, victim, true); - if (!ret) { - fsnotify_rmdir(dir, victim); - d_invalidate(victim); - } -err: - end_removing_path(&path, victim); - return ret; -} - -long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - long ret; - - switch (cmd) { - case BCHFS_IOC_REINHERIT_ATTRS: - ret = bch2_ioc_reinherit_attrs(c, file, inode, - (void __user *) arg); - break; - - case FS_IOC_GETVERSION: - ret = bch2_ioc_getversion(inode, (u32 __user *) arg); - break; - - case FS_IOC_SETVERSION: - ret = -ENOTTY; - break; - - case FS_IOC_GETFSLABEL: - ret = bch2_ioc_getlabel(c, (void __user *) arg); - break; - - case FS_IOC_SETFSLABEL: - ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg); - break; - - case FS_IOC_GOINGDOWN: - ret = bch2_ioc_goingdown(c, (u32 __user *) arg); - break; - - case BCH_IOCTL_SUBVOLUME_CREATE: { - struct bch_ioctl_subvolume i; - - ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) - ? -EFAULT - : bch2_ioctl_subvolume_create(c, file, i); - break; - } - - case BCH_IOCTL_SUBVOLUME_DESTROY: { - struct bch_ioctl_subvolume i; - - ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) - ? 
-EFAULT - : bch2_ioctl_subvolume_destroy(c, file, i); - break; - } - - default: - ret = bch2_fs_ioctl(c, cmd, (void __user *) arg); - break; - } - - return bch2_err_class(ret); -} - -#ifdef CONFIG_COMPAT -long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - /* These are just misnamed, they actually get/put from/to user an int */ - switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; - case FS_IOC32_GETVERSION: - cmd = FS_IOC_GETVERSION; - break; - case FS_IOC_GETFSLABEL: - case FS_IOC_SETFSLABEL: - break; - default: - return -ENOIOCTLCMD; - } - return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -} -#endif - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h deleted file mode 100644 index a657e4994b7153..00000000000000 --- a/fs/bcachefs/fs-ioctl.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IOCTL_H -#define _BCACHEFS_FS_IOCTL_H - -long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); -long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); - -#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c deleted file mode 100644 index 687af0eea0c2b4..00000000000000 --- a/fs/bcachefs/fs.c +++ /dev/null @@ -1,2768 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "acl.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "chardev.h" -#include "dirent.h" -#include "errcode.h" -#include "extents.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-ioctl.h" -#include "fs-io-buffered.h" -#include "fs-io-direct.h" -#include "fs-io-pagecache.h" -#include "fsck.h" -#include "inode.h" -#include "io_read.h" -#include "journal.h" -#include "keylist.h" -#include "namei.h" -#include "quota.h" -#include "rebalance.h" -#include "snapshot.h" -#include "super.h" -#include "xattr.h" -#include "trace.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct kmem_cache *bch2_inode_cache; - -static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, - struct bch_inode_info *, - struct bch_inode_unpacked *, - struct bch_subvolume *); - -/* Set VFS inode flags from bcachefs inode: */ -static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode) -{ - static const __maybe_unused unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_sync] = S_SYNC, - [__BCH_INODE_immutable] = S_IMMUTABLE, - [__BCH_INODE_append] = S_APPEND, - [__BCH_INODE_noatime] = S_NOATIME, - }; - - set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); - - if (bch2_inode_casefold(c, &inode->ei_inode)) - inode->v.i_flags |= S_CASEFOLD; - else - inode->v.i_flags &= ~S_CASEFOLD; -} - -void bch2_inode_update_after_write(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - unsigned fields) -{ - struct bch_fs *c = trans->c; - - BUG_ON(bi->bi_inum != inode->v.i_ino); - - bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); - - set_nlink(&inode->v, bch2_inode_nlink_get(bi)); - i_uid_write(&inode->v, bi->bi_uid); - i_gid_write(&inode->v, bi->bi_gid); - inode->v.i_mode = bi->bi_mode; - - if (fields & ATTR_SIZE) - i_size_write(&inode->v, bi->bi_size); - - if (fields & ATTR_ATIME) - 
inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); - if (fields & ATTR_MTIME) - inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); - if (fields & ATTR_CTIME) - inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); - - inode->ei_inode = *bi; - - bch2_inode_flags_to_vfs(c, inode); -} - -int __must_check bch2_write_inode(struct bch_fs *c, - struct bch_inode_info *inode, - inode_set_fn set, - void *p, unsigned fields) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = {}; - struct bch_inode_unpacked inode_u; - int ret; -retry: - bch2_trans_begin(trans); - - ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); - if (ret) - goto err; - - struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); - - ret = (set ? set(trans, inode, &inode_u, p) : 0); - if (ret) - goto err; - - struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); - bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); - - if (rebalance_changed) { - ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); - if (ret) - goto err; - } - - ret = bch2_inode_write(trans, &iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - - /* - * the btree node lock protects inode->ei_inode, not ei_update_lock; - * this is important for inode updates via bchfs_write_index_update - */ - if (!ret) - bch2_inode_update_after_write(trans, inode, &inode_u, fields); -err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (rebalance_changed) - bch2_rebalance_wakeup(c); - - bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, - "%s: inode %llu:%llu not found when updating", - bch2_err_str(ret), - inode_inum(inode).subvol, - inode_inum(inode).inum); - - bch2_trans_put(trans); - return ret < 0 ? 
ret : 0; -} - -int bch2_fs_quota_transfer(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch_qid new_qid, - unsigned qtypes, - enum quota_acct_mode mode) -{ - unsigned i; - int ret; - - qtypes &= enabled_qtypes(c); - - for (i = 0; i < QTYP_NR; i++) - if (new_qid.q[i] == inode->ei_qid.q[i]) - qtypes &= ~(1U << i); - - if (!qtypes) - return 0; - - mutex_lock(&inode->ei_quota_lock); - - ret = bch2_quota_transfer(c, qtypes, new_qid, - inode->ei_qid, - inode->v.i_blocks + - inode->ei_quota_reserved, - mode); - if (!ret) - for (i = 0; i < QTYP_NR; i++) - if (qtypes & (1 << i)) - inode->ei_qid.q[i] = new_qid.q[i]; - - mutex_unlock(&inode->ei_quota_lock); - - return ret; -} - -static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) -{ - const subvol_inum *inum = data; - siphash_key_t k = { .key[0] = seed }; - - return siphash_2u64(inum->subvol, inum->inum, &k); -} - -static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) -{ - const struct bch_inode_info *inode = data; - - return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); -} - -static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct bch_inode_info *inode = obj; - const subvol_inum *v = arg->key; - - return !subvol_inum_eq(inode->ei_inum, *v); -} - -static const struct rhashtable_params bch2_vfs_inodes_params = { - .head_offset = offsetof(struct bch_inode_info, hash), - .key_offset = offsetof(struct bch_inode_info, ei_inum), - .key_len = sizeof(subvol_inum), - .hashfn = bch2_vfs_inode_hash_fn, - .obj_hashfn = bch2_vfs_inode_obj_hash_fn, - .obj_cmpfn = bch2_vfs_inode_cmp_fn, - .automatic_shrinking = true, -}; - -static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { - .head_offset = offsetof(struct bch_inode_info, by_inum_hash), - .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), - .key_len = sizeof(u64), - .automatic_shrinking = true, -}; - -int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) -{ - struct bch_fs *c = trans->c; - struct rhltable *ht = &c->vfs_inodes_by_inum_table; - u64 inum = p.offset; - DARRAY(u32) subvols; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return false; - - darray_init(&subvols); -restart_from_top: - - /* - * Tweaked version of __rhashtable_lookup(); we need to get a list of - * subvolumes in which the given inode number is open. - * - * For this to work, we don't include the subvolume ID in the key that - * we hash - all inodes with the same inode number regardless of - * subvolume will hash to the same slot. - * - * This will be less than ideal if the same file is ever open - * simultaneously in many different snapshots: - */ - rcu_read_lock(); - struct rhash_lock_head __rcu *const *bkt; - struct rhash_head *he; - unsigned int hash; - struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); -restart: - hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); - bkt = rht_bucket(tbl, hash); - do { - struct bch_inode_info *inode; - - rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { - if (inode->ei_inum.inum == inum) { - ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, - GFP_NOWAIT|__GFP_NOWARN); - if (ret) { - rcu_read_unlock(); - ret = darray_make_room(&subvols, 1); - if (ret) - goto err; - subvols.nr = 0; - goto restart_from_top; - } - } - } - /* An object might have been moved to a different hash chain, - * while we walk along it - better check and retry. 
- */ - } while (he != RHT_NULLS_MARKER(bkt)); - - /* Ensure we see any new tables. */ - smp_rmb(); - - tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); - if (unlikely(tbl)) - goto restart; - rcu_read_unlock(); - - darray_for_each(subvols, i) { - u32 snap; - ret = bch2_subvolume_get_snapshot(trans, *i, &snap); - if (ret) - goto err; - - ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); - if (ret) - break; - } -err: - darray_exit(&subvols); - return ret; -} - -static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) -{ - return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); -} - -static void __wait_on_freeing_inode(struct bch_fs *c, - struct bch_inode_info *inode, - subvol_inum inum) -{ - wait_queue_head_t *wq; - struct wait_bit_queue_entry wait; - - wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->v.i_lock); - - if (__bch2_inode_hash_find(c, inum) == inode) - schedule_timeout(HZ * 10); - finish_wait(wq, &wait.wq_entry); -} - -static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, - subvol_inum inum) -{ - struct bch_inode_info *inode; -repeat: - inode = __bch2_inode_hash_find(c, inum); - if (inode) { - spin_lock(&inode->v.i_lock); - if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { - spin_unlock(&inode->v.i_lock); - return NULL; - } - if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { - if (!trans) { - __wait_on_freeing_inode(c, inode, inum); - } else { - int ret = drop_locks_do(trans, - (__wait_on_freeing_inode(c, inode, inum), 0)); - if (ret) - return ERR_PTR(ret); - } - goto repeat; - } - __iget(&inode->v); - spin_unlock(&inode->v.i_lock); - } - - return inode; -} - -static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) -{ - spin_lock(&inode->v.i_lock); - bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); - spin_unlock(&inode->v.i_lock); - - if (remove) { - int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, - &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); - BUG_ON(ret); - - ret = rhashtable_remove_fast(&c->vfs_inodes_table, - &inode->hash, bch2_vfs_inodes_params); - BUG_ON(ret); - inode->v.i_hash.pprev = NULL; - /* - * This pairs with the bch2_inode_hash_find() -> - * __wait_on_freeing_inode() path - */ - inode_wake_up_bit(&inode->v, __I_NEW); - } -} - -static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, - struct btree_trans *trans, - struct bch_inode_info *inode) -{ - struct bch_inode_info *old = inode; - - set_bit(EI_INODE_HASHED, &inode->ei_flags); -retry: - if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, - &inode->ei_inum, - &inode->hash, - bch2_vfs_inodes_params))) { - old = bch2_inode_hash_find(c, trans, inode->ei_inum); - if (!old) - goto retry; - - clear_bit(EI_INODE_HASHED, &inode->ei_flags); - - /* - * bcachefs doesn't use I_NEW; we have no use for it since we - * only insert fully created inodes in the inode hash table. But - * discard_new_inode() expects it to be set... - */ - inode->v.i_state |= I_NEW; - /* - * We don't want bch2_evict_inode() to delete the inode on disk, - * we just raced and had another inode in cache. Normally new - * inodes don't have nlink == 0 - except tmpfiles do... 
- */ - set_nlink(&inode->v, 1); - discard_new_inode(&inode->v); - return old; - } else { - int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, - &inode->by_inum_hash, - bch2_vfs_inodes_by_inum_params); - BUG_ON(ret); - - inode_fake_hash(&inode->v); - - inode_sb_list_add(&inode->v); - - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); - return inode; - } -} - -#define memalloc_flags_do(_flags, _do) \ -({ \ - unsigned _saved_flags = memalloc_flags_save(_flags); \ - typeof(_do) _ret = _do; \ - memalloc_noreclaim_restore(_saved_flags); \ - _ret; \ -}) - -static struct inode *bch2_alloc_inode(struct super_block *sb) -{ - BUG(); -} - -static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp) -{ - struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, - bch2_inode_cache, gfp); - if (!inode) - return NULL; - - inode_init_once(&inode->v); - mutex_init(&inode->ei_update_lock); - two_state_lock_init(&inode->ei_pagecache_lock); - INIT_LIST_HEAD(&inode->ei_vfs_inode_list); - inode->ei_flags = 0; - mutex_init(&inode->ei_quota_lock); - memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - - if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) { - kmem_cache_free(bch2_inode_cache, inode); - return NULL; - } - - return inode; -} - -/* - * Allocate a new inode, dropping/retaking btree locks if necessary: - */ -static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) -{ - struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT); - - if (unlikely(!inode)) { - int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM); - if (ret && inode) { - __destroy_inode(&inode->v); - kmem_cache_free(bch2_inode_cache, inode); - } - if (ret) - return ERR_PTR(ret); - } - - return inode; -} - -static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *bi, - struct bch_subvolume *subvol) -{ - struct bch_inode_info *inode = bch2_new_inode(trans); - if (IS_ERR(inode)) - return inode; - - bch2_vfs_inode_init(trans, inum, inode, bi, subvol); - - return bch2_inode_hash_insert(trans->c, trans, inode); - -} - -struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) -{ - struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); - if (inode) - return &inode->v; - - struct btree_trans *trans = bch2_trans_get(c); - - struct bch_inode_unpacked inode_u; - struct bch_subvolume subvol; - int ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: - PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); - bch2_trans_put(trans); - - return ret ? 
ERR_PTR(ret) : &inode->v; -} - -struct bch_inode_info * -__bch2_create(struct mnt_idmap *idmap, - struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, subvol_inum snapshot_src, - unsigned flags) -{ - struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct bch_inode_unpacked dir_u; - struct bch_inode_info *inode; - struct bch_inode_unpacked inode_u; - struct posix_acl *default_acl = NULL, *acl = NULL; - subvol_inum inum; - struct bch_subvolume subvol; - u64 journal_seq = 0; - kuid_t kuid; - kgid_t kgid; - int ret; - - /* - * preallocate acls + vfs inode before btree transaction, so that - * nothing can fail after the transaction succeeds: - */ -#ifdef CONFIG_BCACHEFS_POSIX_ACL - ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); - if (ret) - return ERR_PTR(ret); -#endif - inode = __bch2_new_inode(c, GFP_NOFS); - if (unlikely(!inode)) { - inode = ERR_PTR(-ENOMEM); - goto err; - } - - bch2_inode_init_early(c, &inode_u); - - if (!(flags & BCH_CREATE_TMPFILE)) - mutex_lock(&dir->ei_update_lock); - - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); - kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); - ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: - bch2_create_trans(trans, - inode_inum(dir), &dir_u, &inode_u, - !(flags & BCH_CREATE_TMPFILE) - ? &dentry->d_name : NULL, - from_kuid(i_user_ns(&dir->v), kuid), - from_kgid(i_user_ns(&dir->v), kgid), - mode, rdev, - default_acl, acl, snapshot_src, flags) ?: - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, - KEY_TYPE_QUOTA_PREALLOC); - if (unlikely(ret)) - goto err_before_quota; - - inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; - inum.inum = inode_u.bi_inum; - - ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_trans_commit(trans, NULL, &journal_seq, 0); - if (unlikely(ret)) { - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, - KEY_TYPE_QUOTA_WARN); -err_before_quota: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - goto err_trans; - } - - if (!(flags & BCH_CREATE_TMPFILE)) { - bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - mutex_unlock(&dir->ei_update_lock); - } - - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - - set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); - set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); - - /* - * we must insert the new inode into the inode cache before calling - * bch2_trans_exit() and dropping locks, else we could race with another - * thread pulling the inode in and modifying it: - * - * also, calling bch2_inode_hash_insert() without passing in the - * transaction object is sketchy - if we could ever end up in - * __wait_on_freeing_inode(), we'd risk deadlock. - * - * But that shouldn't be possible, since we still have the inode locked - * that we just created, and we _really_ can't take a transaction - * restart here. 
- */ - inode = bch2_inode_hash_insert(c, NULL, inode); - bch2_trans_put(trans); -err: - posix_acl_release(default_acl); - posix_acl_release(acl); - return inode; -err_trans: - if (!(flags & BCH_CREATE_TMPFILE)) - mutex_unlock(&dir->ei_update_lock); - - bch2_trans_put(trans); - make_bad_inode(&inode->v); - iput(&inode->v); - inode = ERR_PTR(ret); - goto err; -} - -/* methods */ - -static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, - subvol_inum dir, struct bch_hash_info *dir_hash_info, - const struct qstr *name) -{ - struct bch_fs *c = trans->c; - subvol_inum inum = {}; - struct printbuf buf = PRINTBUF; - - struct qstr lookup_name; - int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name); - if (ret) - return ERR_PTR(ret); - - struct btree_iter dirent_iter = {}; - struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, - dir_hash_info, dir, &lookup_name, 0); - ret = bkey_err(k); - if (ret) - return ERR_PTR(ret); - - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - ret = bch2_dirent_read_target(trans, dir, d, &inum); - if (ret > 0) - ret = -ENOENT; - if (ret) - goto err; - - struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); - if (inode) - goto out; - - /* - * Note: if check/repair needs it, we commit before - * bch2_inode_hash_init_insert(), as after that point we can't take a - * restart - not in the top level loop with a commit_do(), like we - * usually do: - */ - - struct bch_subvolume subvol; - struct bch_inode_unpacked inode_u; - ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: - bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); - - /* - * don't remove it: check_inodes might find another inode that points - * back to this dirent - */ - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - c, "dirent to missing inode:\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); - if (ret) - goto err; -out: - bch2_trans_iter_exit(trans, &dirent_iter); - printbuf_exit(&buf); - return inode; -err: - inode = ERR_PTR(ret); - goto out; -} - -static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, - unsigned int flags) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); - - struct bch_inode_info *inode; - bch2_trans_do(c, - PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), - &hash, &dentry->d_name))); - if (IS_ERR(inode)) - inode = NULL; - - if (!inode && IS_CASEFOLDED(vdir)) { - /* - * Do not cache a negative dentry in casefolded directories - * as it would need to be invalidated in the following situation: - * - Lookup file "blAH" in a casefolded directory - * - Creation of file "BLAH" in a casefolded directory - * - Lookup file "blAH" in a casefolded directory - * which would fail if we had a negative dentry. - * - * We should come back to this when VFS has a method to handle - * this edgecase. 
- */ - return NULL; - } - - return d_splice_alias(&inode->v, dentry); -} - -static int bch2_mknod(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, - (subvol_inum) { 0 }, 0); - - if (IS_ERR(inode)) - return bch2_err_class(PTR_ERR(inode)); - - d_instantiate(dentry, &inode->v); - return 0; -} - -static int bch2_create(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - umode_t mode, bool excl) -{ - return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); -} - -static int __bch2_link(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch_inode_info *dir, - struct dentry *dentry) -{ - struct bch_inode_unpacked dir_u, inode_u; - int ret; - - mutex_lock(&inode->ei_update_lock); - struct btree_trans *trans = bch2_trans_get(c); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_link_trans(trans, - inode_inum(dir), &dir_u, - inode_inum(inode), &inode_u, - &dentry->d_name)); - - if (likely(!ret)) { - bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); - } - - bch2_trans_put(trans); - mutex_unlock(&inode->ei_update_lock); - return ret; -} - -static int bch2_link(struct dentry *old_dentry, struct inode *vdir, - struct dentry *dentry) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); - int ret; - - lockdep_assert_held(&inode->v.i_rwsem); - - ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: - bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - __bch2_link(c, inode, dir, dentry); - if (unlikely(ret)) - return bch2_err_class(ret); - - ihold(&inode->v); - d_instantiate(dentry, &inode->v); - return 0; -} - -int __bch2_unlink(struct inode *vdir, struct dentry *dentry, - bool deleting_snapshot) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_inode_unpacked dir_u, inode_u; - int ret; - - bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); - - struct btree_trans *trans = bch2_trans_get(c); - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_unlink_trans(trans, - inode_inum(dir), &dir_u, - &inode_u, &dentry->d_name, - deleting_snapshot)); - if (unlikely(ret)) - goto err; - - bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - bch2_inode_update_after_write(trans, inode, &inode_u, - ATTR_MTIME); - - if (inode_u.bi_subvol) { - /* - * Subvolume deletion is asynchronous, but we still want to tell - * the VFS that it's been deleted here: - */ - set_nlink(&inode->v, 0); - } - - if (IS_CASEFOLDED(vdir)) - d_invalidate(dentry); -err: - bch2_trans_put(trans); - bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); - - return ret; -} - -static int bch2_unlink(struct inode *vdir, struct dentry *dentry) -{ - struct bch_inode_info *dir= to_bch_ei(vdir); - struct bch_fs *c = dir->v.i_sb->s_fs_info; - - int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: - __bch2_unlink(vdir, dentry, false); - return bch2_err_class(ret); -} - -static int bch2_symlink(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - const char *symname) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir), *inode; - int ret; - - inode = 
__bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, - (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); - if (IS_ERR(inode)) - return bch2_err_class(PTR_ERR(inode)); - - inode_lock(&inode->v); - ret = page_symlink(&inode->v, symname, strlen(symname) + 1); - inode_unlock(&inode->v); - - if (unlikely(ret)) - goto err; - - ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); - if (unlikely(ret)) - goto err; - - ret = __bch2_link(c, inode, dir, dentry); - if (unlikely(ret)) - goto err; - - d_instantiate(dentry, &inode->v); - return 0; -err: - iput(&inode->v); - return bch2_err_class(ret); -} - -static struct dentry *bch2_mkdir(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, umode_t mode) -{ - return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0)); -} - -static int bch2_rename2(struct mnt_idmap *idmap, - struct inode *src_vdir, struct dentry *src_dentry, - struct inode *dst_vdir, struct dentry *dst_dentry, - unsigned flags) -{ - struct bch_fs *c = src_vdir->i_sb->s_fs_info; - struct bch_inode_info *src_dir = to_bch_ei(src_vdir); - struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); - struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); - struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); - struct bch_inode_unpacked dst_dir_u, src_dir_u; - struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; - struct btree_trans *trans; - enum bch_rename_mode mode = flags & RENAME_EXCHANGE - ? BCH_RENAME_EXCHANGE - : dst_dentry->d_inode - ? BCH_RENAME_OVERWRITE : BCH_RENAME; - bool whiteout = !!(flags & RENAME_WHITEOUT); - int ret; - - if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) - return -EINVAL; - - if (mode == BCH_RENAME_OVERWRITE) { - ret = filemap_write_and_wait_range(src_inode->v.i_mapping, - 0, LLONG_MAX); - if (ret) - return ret; - } - - bch2_lock_inodes(INODE_UPDATE_LOCK, - src_dir, - dst_dir, - src_inode, - dst_inode); - - trans = bch2_trans_get(c); - - ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: - bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); - if (ret) - goto err_tx_restart; - - if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, src_inode, - dst_dir->ei_qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err; - } - - if (mode == BCH_RENAME_EXCHANGE && - inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, dst_inode, - src_dir->ei_qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err; - } -retry: - bch2_trans_begin(trans); - - ret = bch2_rename_trans(trans, - inode_inum(src_dir), &src_dir_u, - inode_inum(dst_dir), &dst_dir_u, - &src_inode_u, - &dst_inode_u, - &src_dentry->d_name, - &dst_dentry->d_name, - mode); - if (unlikely(ret)) - goto err_tx_restart; - - if (whiteout) { - whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); - ret = PTR_ERR_OR_ZERO(whiteout_inode_u); - if (unlikely(ret)) - goto err_tx_restart; - bch2_inode_init_early(c, whiteout_inode_u); - - ret = bch2_create_trans(trans, - inode_inum(src_dir), &src_dir_u, - whiteout_inode_u, - &src_dentry->d_name, - from_kuid(i_user_ns(&src_dir->v), current_fsuid()), - from_kgid(i_user_ns(&src_dir->v), current_fsgid()), - S_IFCHR|WHITEOUT_MODE, 0, - NULL, NULL, (subvol_inum) { 0 }, 0) ?: - bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, - KEY_TYPE_QUOTA_PREALLOC); - if (unlikely(ret)) - goto err_tx_restart; - } - - ret = 
bch2_trans_commit(trans, NULL, NULL, 0); - if (unlikely(ret)) { -err_tx_restart: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - goto err; - } - - BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); - BUG_ON(dst_inode && - dst_inode->v.i_ino != dst_inode_u.bi_inum); - - bch2_inode_update_after_write(trans, src_dir, &src_dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - - if (src_dir != dst_dir) - bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - - bch2_inode_update_after_write(trans, src_inode, &src_inode_u, - ATTR_CTIME); - - if (dst_inode) - bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, - ATTR_CTIME); -err: - bch2_trans_put(trans); - - bch2_fs_quota_transfer(c, src_inode, - bch_qid(&src_inode->ei_inode), - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_NOCHECK); - if (dst_inode) - bch2_fs_quota_transfer(c, dst_inode, - bch_qid(&dst_inode->ei_inode), - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_NOCHECK); - - bch2_unlock_inodes(INODE_UPDATE_LOCK, - src_dir, - dst_dir, - src_inode, - dst_inode); - - return bch2_err_class(ret); -} - -static void bch2_setattr_copy(struct mnt_idmap *idmap, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - struct iattr *attr) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - unsigned int ia_valid = attr->ia_valid; - kuid_t kuid; - kgid_t kgid; - - if (ia_valid & ATTR_UID) { - kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); - bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); - } - if (ia_valid & ATTR_GID) { - kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); - bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); - } - - if (ia_valid & ATTR_SIZE) - bi->bi_size = attr->ia_size; - - if (ia_valid & ATTR_ATIME) - bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); - if (ia_valid & ATTR_MTIME) - bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) - bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); - - if (ia_valid & ATTR_MODE) { - umode_t mode = attr->ia_mode; - kgid_t gid = ia_valid & ATTR_GID - ? 
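
The rename path above accepts RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT and maps them onto bch_rename_mode. From userspace these flags reach the filesystem through renameat2(); a minimal sketch, assuming glibc 2.28 or newer and two hypothetical names that both already exist (RENAME_EXCHANGE requires that):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>

  int main(void)
  {
  	/* atomically swap the two names */
  	if (renameat2(AT_FDCWD, "a.txt", AT_FDCWD, "b.txt",
  		      RENAME_EXCHANGE)) {
  		perror("renameat2");
  		return 1;
  	}
  	return 0;
  }
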
kgid - : inode->v.i_gid; - - if (!in_group_or_capable(idmap, &inode->v, - make_vfsgid(idmap, i_user_ns(&inode->v), gid))) - mode &= ~S_ISGID; - bi->bi_mode = mode; - } -} - -int bch2_setattr_nonsize(struct mnt_idmap *idmap, - struct bch_inode_info *inode, - struct iattr *attr) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_qid qid; - struct btree_trans *trans; - struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode_u; - struct posix_acl *acl = NULL; - kuid_t kuid; - kgid_t kgid; - int ret; - - mutex_lock(&inode->ei_update_lock); - - qid = inode->ei_qid; - - if (attr->ia_valid & ATTR_UID) { - kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); - } - - if (attr->ia_valid & ATTR_GID) { - kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); - qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); - } - - ret = bch2_fs_quota_transfer(c, inode, qid, ~0, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err; - - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - kfree(acl); - acl = NULL; - - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_intent); - if (ret) - goto btree_err; - - bch2_setattr_copy(idmap, inode, &inode_u, attr); - - if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, - inode_u.bi_mode, &acl); - if (ret) - goto btree_err; - } - - ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -btree_err: - bch2_trans_iter_exit(trans, &inode_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (unlikely(ret)) - goto err_trans; - - bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); - - if (acl) - set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -err_trans: - bch2_trans_put(trans); -err: - mutex_unlock(&inode->ei_update_lock); - - return bch2_err_class(ret); -} - -static int bch2_getattr(struct mnt_idmap *idmap, - const struct path *path, struct kstat *stat, - u32 request_mask, unsigned query_flags) -{ - struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); - vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); - - stat->dev = inode->v.i_sb->s_dev; - stat->ino = inode->v.i_ino; - stat->mode = inode->v.i_mode; - stat->nlink = inode->v.i_nlink; - stat->uid = vfsuid_into_kuid(vfsuid); - stat->gid = vfsgid_into_kgid(vfsgid); - stat->rdev = inode->v.i_rdev; - stat->size = i_size_read(&inode->v); - stat->atime = inode_get_atime(&inode->v); - stat->mtime = inode_get_mtime(&inode->v); - stat->ctime = inode_get_ctime(&inode->v); - stat->blksize = block_bytes(c); - stat->blocks = inode->v.i_blocks; - - stat->subvol = inode->ei_inum.subvol; - stat->result_mask |= STATX_SUBVOL; - - if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { - stat->result_mask |= STATX_DIOALIGN; - /* - * this is incorrect; we should be tracking this in superblock, - * and checking the alignment of open devices - */ - stat->dio_mem_align = SECTOR_SIZE; - stat->dio_offset_align = block_bytes(c); - } - - if (request_mask & STATX_BTIME) { - stat->result_mask |= STATX_BTIME; - stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); - } - - if (inode->ei_inode.bi_flags & BCH_INODE_immutable) - stat->attributes |= STATX_ATTR_IMMUTABLE; - stat->attributes_mask |= 
STATX_ATTR_IMMUTABLE; - - if (inode->ei_inode.bi_flags & BCH_INODE_append) - stat->attributes |= STATX_ATTR_APPEND; - stat->attributes_mask |= STATX_ATTR_APPEND; - - if (inode->ei_inode.bi_flags & BCH_INODE_nodump) - stat->attributes |= STATX_ATTR_NODUMP; - stat->attributes_mask |= STATX_ATTR_NODUMP; - - return 0; -} - -static int bch2_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *iattr) -{ - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; - - lockdep_assert_held(&inode->v.i_rwsem); - - ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - setattr_prepare(idmap, dentry, iattr); - if (ret) - return ret; - - return iattr->ia_valid & ATTR_SIZE - ? bchfs_truncate(idmap, inode, iattr) - : bch2_setattr_nonsize(idmap, inode, iattr); -} - -static int bch2_tmpfile(struct mnt_idmap *idmap, - struct inode *vdir, struct file *file, umode_t mode) -{ - struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), - file->f_path.dentry, mode, 0, - (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); - - if (IS_ERR(inode)) - return bch2_err_class(PTR_ERR(inode)); - - d_mark_tmpfile(file, &inode->v); - d_instantiate(file->f_path.dentry, &inode->v); - return finish_open_simple(file, 0); -} - -struct bch_fiemap_extent { - struct bkey_buf kbuf; - unsigned flags; -}; - -static int bch2_fill_extent(struct bch_fs *c, - struct fiemap_extent_info *info, - struct bch_fiemap_extent *fe) -{ - struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k); - unsigned flags = fe->flags; - - BUG_ON(!k.k->size); - - if (bkey_extent_is_direct_data(k.k)) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int ret; - - if (k.k->type == KEY_TYPE_reflink_v) - flags |= FIEMAP_EXTENT_SHARED; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int flags2 = 0; - u64 offset = p.ptr.offset; - - if (p.ptr.unwritten) - flags2 |= FIEMAP_EXTENT_UNWRITTEN; - - if (p.crc.compression_type) - flags2 |= FIEMAP_EXTENT_ENCODED; - else - offset += p.crc.offset; - - if ((offset & (block_sectors(c) - 1)) || - (k.k->size & (block_sectors(c) - 1))) - flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; - - ret = fiemap_fill_next_extent(info, - bkey_start_offset(k.k) << 9, - offset << 9, - k.k->size << 9, flags|flags2); - if (ret) - return ret; - } - - return 0; - } else if (bkey_extent_is_inline_data(k.k)) { - return fiemap_fill_next_extent(info, - bkey_start_offset(k.k) << 9, - 0, k.k->size << 9, - flags| - FIEMAP_EXTENT_DATA_INLINE); - } else if (k.k->type == KEY_TYPE_reservation) { - return fiemap_fill_next_extent(info, - bkey_start_offset(k.k) << 9, - 0, k.k->size << 9, - flags| - FIEMAP_EXTENT_DELALLOC| - FIEMAP_EXTENT_UNWRITTEN); - } else { - BUG(); - } -} - -/* - * Scan a range of an inode for data in pagecache. - * - * Intended to be retryable, so don't modify the output params until success is - * imminent. - */ -static int -bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end, - bool nonblock) -{ - loff_t dstart, dend; - - dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock); - if (dstart < 0) - return dstart; - - if (dstart == *end) { - *start = dstart; - return 0; - } - - dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock); - if (dend < 0) - return dend; - - /* race */ - BUG_ON(dstart == dend); - - *start = dstart; - *end = dend; - return 0; -} - -/* - * Scan a range of pagecache that corresponds to a file mapping hole in the - * extent btree. 
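
For reference on the getattr path earlier in this hunk: bch2_getattr() fills stat->btime and sets STATX_BTIME in result_mask, and userspace opts into those extended fields via statx(2). A minimal sketch, assuming glibc 2.28 or newer and a hypothetical "a.txt":

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/stat.h>

  int main(void)
  {
  	struct statx stx;

  	if (statx(AT_FDCWD, "a.txt", 0, STATX_BTIME, &stx))
  		return 1;
  	/* only trust fields the filesystem actually reported */
  	if (stx.stx_mask & STATX_BTIME)
  		printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
  	return 0;
  }
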
If data is found, fake up an extent key so it looks like a - * delalloc extent to the rest of the fiemap processing code. - */ -static int -bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode, - u64 start, u64 end, struct bch_fiemap_extent *cur) -{ - struct bch_fs *c = trans->c; - struct bkey_i_extent *delextent; - struct bch_extent_ptr ptr = {}; - loff_t dstart = start << 9, dend = end << 9; - int ret; - - /* - * We hold btree locks here so we cannot block on folio locks without - * dropping trans locks first. Run a nonblocking scan for the common - * case of no folios over holes and fall back on failure. - * - * Note that dropping locks like this is technically racy against - * writeback inserting to the extent tree, but a non-sync fiemap scan is - * fundamentally racy with writeback anyways. Therefore, just report the - * range as delalloc regardless of whether we have to cycle trans locks. - */ - ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true); - if (ret == -EAGAIN) - ret = drop_locks_do(trans, - bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false)); - if (ret < 0) - return ret; - - /* - * Create a fake extent key in the buffer. We have to add a dummy extent - * pointer for the fill code to add an extent entry. It's explicitly - * zeroed to reflect delayed allocation (i.e. phys offset 0). - */ - bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64)); - delextent = bkey_extent_init(cur->kbuf.k); - delextent->k.p = POS(inode->ei_inum.inum, dend >> 9); - delextent->k.size = (dend - dstart) >> 9; - bch2_bkey_append_ptr(&delextent->k_i, ptr); - - cur->flags = FIEMAP_EXTENT_DELALLOC; - - return 0; -} - -static int bch2_next_fiemap_extent(struct btree_trans *trans, - struct bch_inode_info *inode, - u64 start, u64 end, - struct bch_fiemap_extent *cur) -{ - u32 snapshot; - int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); - if (ret) - return ret; - - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inode->ei_inum.inum, start, snapshot), 0); - - struct bkey_s_c k = - bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end)); - ret = bkey_err(k); - if (ret) - goto err; - - u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end; - - ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur); - if (ret) - goto err; - - struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k); - - /* - * Does the pagecache or the btree take precedence? - * - * It _should_ be the pagecache, so that we correctly report delalloc - * extents when dirty in the pagecache (we're COW, after all). - * - * But we'd have to add per-sector writeback tracking to - * bch_folio_state, otherwise we report delalloc extents for clean - * cached data in the pagecache. - * - * We should do this, but even then fiemap won't report stable mappings: - * on bcachefs data moves around in the background (copygc, rebalance) - * and we don't provide a way for userspace to lock that out. 
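
As context for bch2_fiemap_hole_pagecache() above: the data/hole scan it reuses is the same machinery behind llseek, whose userspace-visible form is lseek(2) with SEEK_DATA/SEEK_HOLE. A small sketch, with "a.txt" hypothetical and _GNU_SOURCE required for the whence values:

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
  	int fd = open("a.txt", O_RDONLY);
  	off_t data, hole;

  	if (fd < 0)
  		return 1;
  	data = lseek(fd, 0, SEEK_DATA);	/* first data byte at/after 0 */
  	if (data < 0)
  		return 1;	/* all hole past offset 0, or unsupported */
  	hole = lseek(fd, data, SEEK_HOLE); /* end of that data region */
  	printf("data at %lld, hole at %lld\n",
  	       (long long)data, (long long)hole);
  	close(fd);
  	return 0;
  }
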
- */ - if (k.k && - bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)), - pagecache_start)) { - bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k); - bch2_cut_front(iter.pos, cur->kbuf.k); - bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k); - cur->flags = 0; - } else if (k.k) { - bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k); - } - - if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) { - unsigned sectors = cur->kbuf.k->k.size; - s64 offset_into_extent = 0; - enum btree_id data_btree = BTREE_ID_extents; - ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, - &cur->kbuf); - if (ret) - goto err; - - struct bkey_i *k = cur->kbuf.k; - sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent); - - bch2_cut_front(POS(k->k.p.inode, - bkey_start_offset(&k->k) + offset_into_extent), - k); - bch2_key_resize(&k->k, sectors); - k->k.p = iter.pos; - k->k.p.offset += k->k.size; - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, - u64 start, u64 len) -{ - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *ei = to_bch_ei(vinode); - struct btree_trans *trans; - struct bch_fiemap_extent cur, prev; - int ret = 0; - - ret = fiemap_prep(&ei->v, info, start, &len, 0); - if (ret) - return ret; - - if (start + len < start) - return -EINVAL; - - start >>= 9; - u64 end = (start + len) >> 9; - - bch2_bkey_buf_init(&cur.kbuf); - bch2_bkey_buf_init(&prev.kbuf); - bkey_init(&prev.kbuf.k->k); - - trans = bch2_trans_get(c); - - while (start < end) { - ret = lockrestart_do(trans, - bch2_next_fiemap_extent(trans, ei, start, end, &cur)); - if (ret) - goto err; - - BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start); - BUG_ON(cur.kbuf.k->k.p.offset > end); - - if (bkey_start_offset(&cur.kbuf.k->k) == end) - break; - - start = cur.kbuf.k->k.p.offset; - - if (!bkey_deleted(&prev.kbuf.k->k)) { - bch2_trans_unlock(trans); - ret = bch2_fill_extent(c, info, &prev); - if (ret) - goto err; - } - - bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k); - prev.flags = cur.flags; - } - - if (!bkey_deleted(&prev.kbuf.k->k)) { - bch2_trans_unlock(trans); - prev.flags |= FIEMAP_EXTENT_LAST; - ret = bch2_fill_extent(c, info, &prev); - } -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&cur.kbuf, c); - bch2_bkey_buf_exit(&prev.kbuf, c); - - return bch2_err_class(ret < 0 ? 
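
All of the fiemap code above ultimately serves the FS_IOC_FIEMAP ioctl. For orientation, this is roughly how a userspace consumer drives it; the file name and the fixed 32-extent buffer are illustrative choices, and no FIEMAP_FLAG_SYNC is requested:

  #include <fcntl.h>
  #include <linux/fiemap.h>
  #include <linux/fs.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <unistd.h>

  int main(void)
  {
  	unsigned int i, n = 32;
  	int fd = open("a.txt", O_RDONLY);
  	struct fiemap *fm;

  	if (fd < 0)
  		return 1;
  	fm = calloc(1, sizeof(*fm) + n * sizeof(struct fiemap_extent));
  	if (!fm)
  		return 1;
  	fm->fm_length = FIEMAP_MAX_OFFSET;	/* whole file */
  	fm->fm_extent_count = n;
  	if (ioctl(fd, FS_IOC_FIEMAP, fm))
  		return 1;
  	for (i = 0; i < fm->fm_mapped_extents; i++)
  		printf("logical %llu physical %llu len %llu flags %x\n",
  		       (unsigned long long)fm->fm_extents[i].fe_logical,
  		       (unsigned long long)fm->fm_extents[i].fe_physical,
  		       (unsigned long long)fm->fm_extents[i].fe_length,
  		       (unsigned)fm->fm_extents[i].fe_flags);
  	free(fm);
  	close(fd);
  	return 0;
  }
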
ret : 0); -} - -static const struct vm_operations_struct bch_vm_ops = { - .fault = bch2_page_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = bch2_page_mkwrite, -}; - -static int bch2_mmap_prepare(struct vm_area_desc *desc) -{ - file_accessed(desc->file); - - desc->vm_ops = &bch_vm_ops; - return 0; -} - -/* Directories: */ - -static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) -{ - return generic_file_llseek_size(file, offset, whence, - S64_MAX, S64_MAX); -} - -static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - - if (!dir_emit_dots(file, ctx)) - return 0; - - int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx); - - bch_err_fn(c, ret); - return bch2_err_class(ret); -} - -static int bch2_open(struct inode *vinode, struct file *file) -{ - if (file->f_flags & (O_WRONLY|O_RDWR)) { - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); - if (ret) - return ret; - } - - file->f_mode |= FMODE_CAN_ODIRECT; - - return generic_file_open(vinode, file); -} - -/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const __maybe_unused unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_sync] = FS_SYNC_FL, - [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, - [__BCH_INODE_append] = FS_APPEND_FL, - [__BCH_INODE_nodump] = FS_NODUMP_FL, - [__BCH_INODE_noatime] = FS_NOATIME_FL, -}; - -/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const __maybe_unused unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_sync] = FS_XFLAG_SYNC, - [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_append] = FS_XFLAG_APPEND, - [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, - [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, -}; - -static int bch2_fileattr_get(struct dentry *dentry, - struct file_kattr *fa) -{ - struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags)); - - if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) - fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; - - if (bch2_inode_casefold(c, &inode->ei_inode)) - fa->flags |= FS_CASEFOLD_FL; - - fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ]; - return 0; -} - -struct flags_set { - unsigned mask; - unsigned flags; - unsigned projid; - bool set_project; - bool set_casefold; - bool casefold; -}; - -static int fssetxattr_inode_update_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = trans->c; - struct flags_set *s = p; - - /* - * We're relying on btree locking here for exclusion with other ioctl - * calls - use the flags in the btree (@bi), not inode->i_flags: - */ - if (!S_ISREG(bi->bi_mode) && - !S_ISDIR(bi->bi_mode) && - (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags) - return -EINVAL; - - if (s->casefold != bch2_inode_casefold(c, bi)) { - int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold); - if (ret) - return ret; - } - - if (s->set_project) { - bi->bi_project = s->projid; - bi->bi_fields_set |= BIT(Inode_opt_project); - } - - bi->bi_flags &= ~s->mask; - bi->bi_flags |= s->flags; - - bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); - return 0; -} - -static int 
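
The bch_flags_to_uflags table above is what FS_IOC_GETFLAGS ultimately reports through bch2_fileattr_get(). A minimal userspace reader, with the file name hypothetical; note the ioctl's argument is a long:

  #include <fcntl.h>
  #include <linux/fs.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <unistd.h>

  int main(void)
  {
  	int fd = open("a.txt", O_RDONLY);
  	long flags;

  	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags))
  		return 1;
  	printf("immutable: %s\n",
  	       (flags & FS_IMMUTABLE_FL) ? "yes" : "no");
  	close(fd);
  	return 0;
  }
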
bch2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, - struct file_kattr *fa) -{ - struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct flags_set s = {}; - int ret; - - if (fa->fsx_valid) { - fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT; - - s.mask = map_defined(bch_flags_to_xflags); - s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags); - if (fa->fsx_xflags) - return -EOPNOTSUPP; - - if (fa->fsx_projid >= U32_MAX) - return -EINVAL; - - /* - * inode fields accessible via the xattr interface are stored with a +1 - * bias, so that 0 means unset: - */ - if ((inode->ei_inode.bi_project || - fa->fsx_projid) && - inode->ei_inode.bi_project != fa->fsx_projid + 1) { - s.projid = fa->fsx_projid + 1; - s.set_project = true; - } - } - - if (fa->flags_valid) { - s.mask = map_defined(bch_flags_to_uflags); - - s.set_casefold = true; - s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0; - fa->flags &= ~FS_CASEFOLD_FL; - - s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); - if (fa->flags) - return -EOPNOTSUPP; - } - - mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - (s.set_project - ? bch2_set_projid(c, inode, fa->fsx_projid) - : 0) ?: - bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, - ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - - return bch2_err_class(ret); -} - -static const struct file_operations bch_file_operations = { - .open = bch2_open, - .llseek = bch2_llseek, - .read_iter = bch2_read_iter, - .write_iter = bch2_write_iter, - .mmap_prepare = bch2_mmap_prepare, - .get_unmapped_area = thp_get_unmapped_area, - .fsync = bch2_fsync, - .splice_read = filemap_splice_read, - .splice_write = iter_file_splice_write, - .fallocate = bch2_fallocate_dispatch, - .unlocked_ioctl = bch2_fs_file_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = bch2_compat_fs_ioctl, -#endif - .remap_file_range = bch2_remap_file_range, -}; - -static const struct inode_operations bch_file_inode_operations = { - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .fiemap = bch2_fiemap, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - .fileattr_set = bch2_fileattr_set, -}; - -static const struct inode_operations bch_dir_inode_operations = { - .lookup = bch2_lookup, - .create = bch2_create, - .link = bch2_link, - .unlink = bch2_unlink, - .symlink = bch2_symlink, - .mkdir = bch2_mkdir, - .rmdir = bch2_unlink, - .mknod = bch2_mknod, - .rename = bch2_rename2, - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .tmpfile = bch2_tmpfile, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - .fileattr_set = bch2_fileattr_set, -}; - -static const struct file_operations bch_dir_file_operations = { - .llseek = bch2_dir_llseek, - .read = generic_read_dir, - .iterate_shared = bch2_vfs_readdir, - .fsync = bch2_fsync, - .unlocked_ioctl = bch2_fs_file_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = bch2_compat_fs_ioctl, -#endif -}; - -static const struct inode_operations bch_symlink_inode_operations = { - .get_link = page_get_link, - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - 
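
The "+1 bias" described in bch2_fileattr_set() above exists so that a stored value of 0 can mean "unset/inherit" while project ID 0 remains expressible. A toy illustration of the encoding, with all names invented for this sketch:

  #include <assert.h>

  /* stored == 0 means "unset"; otherwise stored == user value + 1 */
  static unsigned int project_id_store(unsigned int user)  { return user + 1; }
  static unsigned int project_id_load(unsigned int stored) { return stored - 1; }
  static int project_id_is_set(unsigned int stored)        { return stored != 0; }

  int main(void)
  {
  	unsigned int stored = project_id_store(0); /* project 0 is still "set" */

  	assert(project_id_is_set(stored));
  	assert(project_id_load(stored) == 0);
  	return 0;
  }
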
.fileattr_set = bch2_fileattr_set, -}; - -static const struct inode_operations bch_special_inode_operations = { - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - .fileattr_set = bch2_fileattr_set, -}; - -static const struct address_space_operations bch_address_space_operations = { - .read_folio = bch2_read_folio, - .writepages = bch2_writepages, - .readahead = bch2_readahead, - .dirty_folio = filemap_dirty_folio, - .write_begin = bch2_write_begin, - .write_end = bch2_write_end, - .invalidate_folio = bch2_invalidate_folio, - .release_folio = bch2_release_folio, -#ifdef CONFIG_MIGRATION - .migrate_folio = filemap_migrate_folio, -#endif - .error_remove_folio = generic_error_remove_folio, -}; - -struct bcachefs_fid { - u64 inum; - u32 subvol; - u32 gen; -} __packed; - -struct bcachefs_fid_with_parent { - struct bcachefs_fid fid; - struct bcachefs_fid dir; -} __packed; - -static int bcachefs_fid_valid(int fh_len, int fh_type) -{ - switch (fh_type) { - case FILEID_BCACHEFS_WITHOUT_PARENT: - return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); - case FILEID_BCACHEFS_WITH_PARENT: - return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); - default: - return false; - } -} - -static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) -{ - return (struct bcachefs_fid) { - .inum = inode->ei_inum.inum, - .subvol = inode->ei_inum.subvol, - .gen = inode->ei_inode.bi_generation, - }; -} - -static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, - struct inode *vdir) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_inode_info *dir = to_bch_ei(vdir); - int min_len; - - if (!S_ISDIR(inode->v.i_mode) && dir) { - struct bcachefs_fid_with_parent *fid = (void *) fh; - - min_len = sizeof(*fid) / sizeof(u32); - if (*len < min_len) { - *len = min_len; - return FILEID_INVALID; - } - - fid->fid = bch2_inode_to_fid(inode); - fid->dir = bch2_inode_to_fid(dir); - - *len = min_len; - return FILEID_BCACHEFS_WITH_PARENT; - } else { - struct bcachefs_fid *fid = (void *) fh; - - min_len = sizeof(*fid) / sizeof(u32); - if (*len < min_len) { - *len = min_len; - return FILEID_INVALID; - } - *fid = bch2_inode_to_fid(inode); - - *len = min_len; - return FILEID_BCACHEFS_WITHOUT_PARENT; - } -} - -static struct inode *bch2_nfs_get_inode(struct super_block *sb, - struct bcachefs_fid fid) -{ - struct bch_fs *c = sb->s_fs_info; - struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { - .subvol = fid.subvol, - .inum = fid.inum, - }); - if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { - iput(vinode); - vinode = ERR_PTR(-ESTALE); - } - return vinode; -} - -static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, - int fh_len, int fh_type) -{ - struct bcachefs_fid *fid = (void *) _fid; - - if (!bcachefs_fid_valid(fh_len, fh_type)) - return NULL; - - return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); -} - -static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, - int fh_len, int fh_type) -{ - struct bcachefs_fid_with_parent *fid = (void *) _fid; - - if (!bcachefs_fid_valid(fh_len, fh_type) || - fh_type != FILEID_BCACHEFS_WITH_PARENT) - return NULL; - - return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); -} - -static struct dentry *bch2_get_parent(struct dentry *child) -{ - struct bch_inode_info *inode = 
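
A note on the export path above: bch2_encode_fh() reports handle lengths in 32-bit words and returns a FILEID_* type. Userspace sees the same handles through name_to_handle_at(2), where the length is counted in bytes instead. A sketch, with the path hypothetical and glibc assumed:

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>

  int main(void)
  {
  	struct file_handle *fh;
  	int mount_id;

  	fh = calloc(1, sizeof(*fh) + MAX_HANDLE_SZ);
  	if (!fh)
  		return 1;
  	fh->handle_bytes = MAX_HANDLE_SZ;
  	if (name_to_handle_at(AT_FDCWD, "a.txt", fh, &mount_id, 0)) {
  		perror("name_to_handle_at");
  		return 1;
  	}
  	/* handle_type corresponds to the FILEID_* value encode_fh returned */
  	printf("handle type %d, %u bytes\n", fh->handle_type, fh->handle_bytes);
  	free(fh);
  	return 0;
  }
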
to_bch_ei(child->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - subvol_inum parent_inum = { - .subvol = inode->ei_inode.bi_parent_subvol ?: - inode->ei_inum.subvol, - .inum = inode->ei_inode.bi_dir, - }; - - return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); -} - -static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) -{ - struct bch_inode_info *inode = to_bch_ei(child->d_inode); - struct bch_inode_info *dir = to_bch_ei(parent->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter1; - struct btree_iter iter2; - struct bkey_s_c k; - struct bkey_s_c_dirent d; - struct bch_inode_unpacked inode_u; - subvol_inum target; - u32 snapshot; - struct qstr dirent_name; - unsigned name_len = 0; - int ret; - - if (!S_ISDIR(dir->v.i_mode)) - return -EINVAL; - - trans = bch2_trans_get(c); - - bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, - POS(dir->ei_inode.bi_inum, 0), 0); - bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, - POS(dir->ei_inode.bi_inum, 0), 0); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter1, snapshot); - bch2_btree_iter_set_snapshot(trans, &iter2, snapshot); - - ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); - if (ret) - goto err; - - if (inode_u.bi_dir == dir->ei_inode.bi_inum) { - bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); - - k = bch2_btree_iter_peek_slot(trans, &iter1); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_dirent) { - ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); - goto err; - } - - d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); - if (ret > 0) - ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); - if (ret) - goto err; - - if (subvol_inum_eq(target, inode->ei_inum)) - goto found; - } else { - /* - * File with multiple hardlinks and our backref is to the wrong - * directory - linear search: - */ - for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) { - if (k.k->p.inode > dir->ei_inode.bi_inum) - break; - - if (k.k->type != KEY_TYPE_dirent) - continue; - - d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); - if (ret < 0) - break; - if (ret) - continue; - - if (subvol_inum_eq(target, inode->ei_inum)) - goto found; - } - } - - ret = -ENOENT; - goto err; -found: - dirent_name = bch2_dirent_get_name(d); - - name_len = min_t(unsigned, dirent_name.len, NAME_MAX); - memcpy(name, dirent_name.name, name_len); - name[name_len] = '\0'; -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_iter_exit(trans, &iter1); - bch2_trans_iter_exit(trans, &iter2); - bch2_trans_put(trans); - - return ret; -} - -static const struct export_operations bch_export_ops = { - .encode_fh = bch2_encode_fh, - .fh_to_dentry = bch2_fh_to_dentry, - .fh_to_parent = bch2_fh_to_parent, - .get_parent = bch2_get_parent, - .get_name = bch2_get_name, -}; - -static void bch2_vfs_inode_init(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - struct bch_subvolume *subvol) -{ - inode->v.i_ino = inum.inum; - inode->ei_inum = inum; - inode->ei_inode.bi_inum = inum.inum; - bch2_inode_update_after_write(trans, inode, bi, ~0); - - inode->v.i_blocks = bi->bi_sectors; - 
inode->v.i_rdev = bi->bi_dev; - inode->v.i_generation = bi->bi_generation; - inode->v.i_size = bi->bi_size; - - inode->ei_flags = 0; - inode->ei_quota_reserved = 0; - inode->ei_qid = bch_qid(bi); - - if (BCH_SUBVOLUME_SNAP(subvol)) - set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - - inode->v.i_mapping->a_ops = &bch_address_space_operations; - - switch (inode->v.i_mode & S_IFMT) { - case S_IFREG: - inode->v.i_op = &bch_file_inode_operations; - inode->v.i_fop = &bch_file_operations; - break; - case S_IFDIR: - inode->v.i_op = &bch_dir_inode_operations; - inode->v.i_fop = &bch_dir_file_operations; - break; - case S_IFLNK: - inode_nohighmem(&inode->v); - inode->v.i_op = &bch_symlink_inode_operations; - break; - default: - init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); - inode->v.i_op = &bch_special_inode_operations; - break; - } - - mapping_set_folio_min_order(inode->v.i_mapping, - get_order(trans->c->opts.block_size)); -} - -static void bch2_free_inode(struct inode *vinode) -{ - kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); -} - -static int inode_update_times_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); - bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); - bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); - - return 0; -} - -static int bch2_vfs_write_inode(struct inode *vinode, - struct writeback_control *wbc) -{ - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *inode = to_bch_ei(vinode); - int ret; - - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - - return bch2_err_class(ret); -} - -static void bch2_evict_inode(struct inode *vinode) -{ - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *inode = to_bch_ei(vinode); - bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v); - - /* - * evict() has waited for outstanding writeback, we'll do no more IO - * through this inode: it's safe to remove from VFS inode hashtable here - * - * Do that now so that other threads aren't blocked from pulling it back - * in, there's no reason for them to be: - */ - if (!delete) - bch2_inode_hash_remove(c, inode); - - truncate_inode_pages_final(&inode->v.i_data); - - clear_inode(&inode->v); - - BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); - - if (delete) { - bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), - KEY_TYPE_QUOTA_WARN); - bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, - KEY_TYPE_QUOTA_WARN); - int ret = bch2_inode_rm(c, inode_inum(inode)); - if (ret && !bch2_err_matches(ret, EROFS)) { - bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu", - inode->ei_inum.subvol, - inode->ei_inum.inum); - bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm); - } - - /* - * If we are deleting, we need it present in the vfs hash table - * so that fsck can check if unlinked inodes are still open: - */ - bch2_inode_hash_remove(c, inode); - } - - mutex_lock(&c->vfs_inodes_lock); - list_del_init(&inode->ei_vfs_inode_list); - mutex_unlock(&c->vfs_inodes_lock); -} - -void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) -{ - struct bch_inode_info *inode; - DARRAY(struct bch_inode_info *) grabbed; - bool clean_pass = false, this_pass_clean; - 
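
The switch in bch2_vfs_inode_init() above dispatches on the S_IFMT bits of i_mode to pick the inode and file operations. The same classification, phrased as a plain userspace sketch over lstat(2) with a hypothetical path:

  #include <stdio.h>
  #include <sys/stat.h>

  static const char *classify(mode_t mode)
  {
  	switch (mode & S_IFMT) {
  	case S_IFREG:	return "regular file";
  	case S_IFDIR:	return "directory";
  	case S_IFLNK:	return "symlink";
  	default:	return "special (device, fifo, socket, ...)";
  	}
  }

  int main(void)
  {
  	struct stat st;

  	if (lstat("a.txt", &st))
  		return 1;
  	printf("%s\n", classify(st.st_mode));
  	return 0;
  }
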
- /* - * Initially, we scan for inodes without I_DONTCACHE, then mark them to - * be pruned with d_mark_dontcache(). - * - * Once we've had a clean pass where we didn't find any inodes without - * I_DONTCACHE, we wait for them to be freed: - */ - - darray_init(&grabbed); - darray_make_room(&grabbed, 1024); -again: - cond_resched(); - this_pass_clean = true; - - mutex_lock(&c->vfs_inodes_lock); - list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { - if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) - continue; - - if (!(inode->v.i_state & I_DONTCACHE) && - !(inode->v.i_state & I_FREEING) && - igrab(&inode->v)) { - this_pass_clean = false; - - if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { - iput(&inode->v); - break; - } - } else if (clean_pass && this_pass_clean) { - struct wait_bit_queue_entry wqe; - struct wait_queue_head *wq_head; - - wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); - prepare_to_wait_event(wq_head, &wqe.wq_entry, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&c->vfs_inodes_lock); - - schedule(); - finish_wait(wq_head, &wqe.wq_entry); - goto again; - } - } - mutex_unlock(&c->vfs_inodes_lock); - - darray_for_each(grabbed, i) { - inode = *i; - d_mark_dontcache(&inode->v); - d_prune_aliases(&inode->v); - iput(&inode->v); - } - grabbed.nr = 0; - - if (!clean_pass || !this_pass_clean) { - clean_pass = this_pass_clean; - goto again; - } - - darray_exit(&grabbed); -} - -static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct bch_fs *c = sb->s_fs_info; - struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); - unsigned shift = sb->s_blocksize_bits - 9; - /* - * this assumes inodes take up 64 bytes, which is a decent average - * number: - */ - u64 avail_inodes = ((usage.capacity - usage.used) << 3); - - buf->f_type = BCACHEFS_STATFS_MAGIC; - buf->f_bsize = sb->s_blocksize; - buf->f_blocks = usage.capacity >> shift; - buf->f_bfree = usage.free >> shift; - buf->f_bavail = avail_factor(usage.free) >> shift; - - buf->f_files = usage.nr_inodes + avail_inodes; - buf->f_ffree = avail_inodes; - - buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); - buf->f_namelen = BCH_NAME_MAX; - - return 0; -} - -static int bch2_sync_fs(struct super_block *sb, int wait) -{ - struct bch_fs *c = sb->s_fs_info; - int ret; - - trace_bch2_sync_fs(sb, wait); - - if (c->opts.journal_flush_disabled) - return 0; - - if (!wait) { - bch2_journal_flush_async(&c->journal, NULL); - return 0; - } - - ret = bch2_journal_flush(&c->journal); - return bch2_err_class(ret); -} - -static struct bch_fs *bch2_path_to_fs(const char *path) -{ - struct bch_fs *c; - dev_t dev; - int ret; - - ret = lookup_bdev(path, &dev); - if (ret) - return ERR_PTR(ret); - - c = bch2_dev_to_fs(dev); - if (c) - closure_put(&c->cl); - return c ?: ERR_PTR(-ENOENT); -} - -static int bch2_show_devname(struct seq_file *seq, struct dentry *root) -{ - struct bch_fs *c = root->d_sb->s_fs_info; - bool first = true; - - guard(rcu)(); - for_each_online_member_rcu(c, ca) { - if (!first) - seq_putc(seq, ':'); - first = false; - seq_puts(seq, ca->disk_sb.sb_name); - } - - return 0; -} - -static int bch2_show_options(struct seq_file *seq, struct dentry *root) -{ - struct bch_fs *c = root->d_sb->s_fs_info; - struct printbuf buf = PRINTBUF; - - bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, - OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); - printbuf_nul_terminate(&buf); - seq_printf(seq, ",%s", buf.buf); - - int ret = buf.allocation_failure ? 
-ENOMEM : 0; - printbuf_exit(&buf); - return ret; -} - -static void bch2_put_super(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - - __bch2_fs_stop(c); -} - -/* - * bcachefs doesn't currently integrate intwrite freeze protection but the - * internal write references serve the same purpose. Therefore reuse the - * read-only transition code to perform the quiesce. The caveat is that we don't - * currently have the ability to block tasks that want a write reference while - * the superblock is frozen. This is fine for now, but we should either add - * blocking support or find a way to integrate sb_start_intwrite() and friends. - */ -static int bch2_freeze(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - - down_write(&c->state_lock); - bch2_fs_read_only(c); - up_write(&c->state_lock); - return 0; -} - -static int bch2_unfreeze(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - int ret; - - if (test_bit(BCH_FS_emergency_ro, &c->flags)) - return 0; - - down_write(&c->state_lock); - ret = bch2_fs_read_write(c); - up_write(&c->state_lock); - return ret; -} - -static const struct super_operations bch_super_operations = { - .alloc_inode = bch2_alloc_inode, - .free_inode = bch2_free_inode, - .write_inode = bch2_vfs_write_inode, - .evict_inode = bch2_evict_inode, - .sync_fs = bch2_sync_fs, - .statfs = bch2_statfs, - .show_devname = bch2_show_devname, - .show_options = bch2_show_options, - .put_super = bch2_put_super, - .freeze_fs = bch2_freeze, - .unfreeze_fs = bch2_unfreeze, -}; - -static int bch2_set_super(struct super_block *s, void *data) -{ - s->s_fs_info = data; - return 0; -} - -static int bch2_noset_super(struct super_block *s, void *data) -{ - return -EBUSY; -} - -typedef DARRAY(struct bch_fs *) darray_fs; - -static int bch2_test_super(struct super_block *s, void *data) -{ - struct bch_fs *c = s->s_fs_info; - darray_fs *d = data; - - if (!c) - return false; - - darray_for_each(*d, i) - if (c != *i) - return false; - return true; -} - -static int bch2_fs_get_tree(struct fs_context *fc) -{ - struct bch_fs *c; - struct super_block *sb; - struct inode *vinode; - struct bch2_opts_parse *opts_parse = fc->fs_private; - struct bch_opts opts = opts_parse->opts; - darray_const_str devs; - darray_fs devs_to_fs = {}; - int ret; - - opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); - opt_set(opts, nostart, true); - - if (!fc->source || strlen(fc->source) == 0) - return -EINVAL; - - ret = bch2_split_devs(fc->source, &devs); - if (ret) - return ret; - - darray_for_each(devs, i) { - ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); - if (ret) - goto err; - } - - sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); - if (!IS_ERR(sb)) - goto got_sb; - - c = bch2_fs_open(&devs, &opts); - ret = PTR_ERR_OR_ZERO(c); - if (ret) - goto err; - - if (opt_defined(opts, discard)) - set_bit(BCH_FS_discard_mount_opt_set, &c->flags); - - /* Some options can't be parsed until after the fs is started: */ - opts = bch2_opts_empty(); - ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false); - if (ret) - goto err_stop_fs; - - bch2_opts_apply(&c->opts, opts); - - ret = bch2_fs_start(c); - if (ret) - goto err_stop_fs; - - /* - * We might be doing a RO mount because other options required it, or we - * have no alloc info and it's a small image with no room to regenerate - * it - */ - if (c->opts.read_only) - fc->sb_flags |= SB_RDONLY; - - sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, 
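
As bch2_show_devname() above shows, a multi-device filesystem's mount source is a colon-separated device list, and bch2_split_devs() parses the same convention on the way in. A rough userspace illustration of that convention; the device names are hypothetical and strtok is used for brevity:

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
  	char src[] = "/dev/sda:/dev/sdb";	/* must be writable for strtok */
  	char *tok;

  	for (tok = strtok(src, ":"); tok; tok = strtok(NULL, ":"))
  		printf("device: %s\n", tok);
  	return 0;
  }
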
c); - ret = PTR_ERR_OR_ZERO(sb); - if (ret) - goto err_stop_fs; -got_sb: - c = sb->s_fs_info; - - if (sb->s_root) { - if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) { - ret = -EBUSY; - goto err_put_super; - } - goto out; - } - - sb->s_blocksize = block_bytes(c); - sb->s_blocksize_bits = ilog2(block_bytes(c)); - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_op = &bch_super_operations; - sb->s_export_op = &bch_export_ops; -#ifdef CONFIG_BCACHEFS_QUOTA - sb->s_qcop = &bch2_quotactl_operations; - sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; -#endif - sb->s_xattr = bch2_xattr_handlers; - sb->s_magic = BCACHEFS_STATFS_MAGIC; - sb->s_time_gran = c->sb.nsec_per_time_unit; - sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; - sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); - super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); - - if (c->sb.multi_device) - super_set_sysfs_name_uuid(sb); - else - strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name)); - - sb->s_shrink->seeks = 0; - c->vfs_sb = sb; - strscpy(sb->s_id, c->name, sizeof(sb->s_id)); - - ret = super_setup_bdi(sb); - if (ret) - goto err_put_super; - - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - - scoped_guard(rcu) { - for_each_online_member_rcu(c, ca) { - struct block_device *bdev = ca->disk_sb.bdev; - - /* XXX: create an anonymous device for multi device filesystems */ - sb->s_bdev = bdev; - sb->s_dev = bdev->bd_dev; - break; - } - } - - c->dev = sb->s_dev; - -#ifdef CONFIG_BCACHEFS_POSIX_ACL - if (c->opts.acl) - sb->s_flags |= SB_POSIXACL; -#endif - - sb->s_shrink->seeks = 0; - -#ifdef CONFIG_UNICODE - if (bch2_fs_casefold_enabled(c)) - sb->s_encoding = c->cf_encoding; - generic_set_sb_d_ops(sb); -#endif - - vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); - ret = PTR_ERR_OR_ZERO(vinode); - bch_err_msg(c, ret, "mounting: error getting root inode"); - if (ret) - goto err_put_super; - - sb->s_root = d_make_root(vinode); - if (!sb->s_root) { - bch_err(c, "error mounting: error allocating root dentry"); - ret = -ENOMEM; - goto err_put_super; - } - - sb->s_flags |= SB_ACTIVE; -out: - fc->root = dget(sb->s_root); -err: - darray_exit(&devs_to_fs); - bch2_darray_str_exit(&devs); - if (ret) - pr_err("error: %s", bch2_err_str(ret)); - /* - * On an inconsistency error in recovery we might see an -EROFS derived - * errorcode (from the journal), but we don't want to return that to - * userspace as that causes util-linux to retry the mount RO - which is - * confusing: - */ - if (bch2_err_matches(ret, EROFS) && ret != -EROFS) - ret = -EIO; - return bch2_err_class(ret); - -err_stop_fs: - bch2_fs_stop(c); - goto err; - -err_put_super: - if (!sb->s_root) - __bch2_fs_stop(c); - deactivate_locked_super(sb); - goto err; -} - -static void bch2_kill_sb(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - - generic_shutdown_super(sb); - bch2_fs_free(c); -} - -static void bch2_fs_context_free(struct fs_context *fc) -{ - struct bch2_opts_parse *opts = fc->fs_private; - - if (opts) { - printbuf_exit(&opts->parse_later); - kfree(opts); - } -} - -static int bch2_fs_parse_param(struct fs_context *fc, - struct fs_parameter *param) -{ - /* - * the "source" param, i.e., the name of the device(s) to mount, - * is handled by the VFS layer. 
- */ - if (!strcmp(param->key, "source")) - return -ENOPARAM; - - struct bch2_opts_parse *opts = fc->fs_private; - struct bch_fs *c = NULL; - - /* for reconfigure, we already have a struct bch_fs */ - if (fc->root) - c = fc->root->d_sb->s_fs_info; - - int ret = bch2_parse_one_mount_opt(c, &opts->opts, - &opts->parse_later, param->key, - param->string); - if (ret) - pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret)); - - return bch2_err_class(ret); -} - -static int bch2_fs_reconfigure(struct fs_context *fc) -{ - struct super_block *sb = fc->root->d_sb; - struct bch2_opts_parse *opts = fc->fs_private; - struct bch_fs *c = sb->s_fs_info; - int ret = 0; - - opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); - - if (opts->opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); - - if (opts->opts.read_only) { - bch2_fs_read_only(c); - - sb->s_flags |= SB_RDONLY; - } else { - ret = bch2_fs_read_write(c); - if (ret) { - bch_err(c, "error going rw: %i", ret); - up_write(&c->state_lock); - ret = -EINVAL; - goto err; - } - - sb->s_flags &= ~SB_RDONLY; - } - - c->opts.read_only = opts->opts.read_only; - - up_write(&c->state_lock); - } - - if (opt_defined(opts->opts, errors)) - c->opts.errors = opts->opts.errors; -err: - return bch2_err_class(ret); -} - -static const struct fs_context_operations bch2_context_ops = { - .free = bch2_fs_context_free, - .parse_param = bch2_fs_parse_param, - .get_tree = bch2_fs_get_tree, - .reconfigure = bch2_fs_reconfigure, -}; - -static int bch2_init_fs_context(struct fs_context *fc) -{ - struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); - - if (!opts) - return -ENOMEM; - - opts->parse_later = PRINTBUF; - - fc->ops = &bch2_context_ops; - fc->fs_private = opts; - - return 0; -} - -void bch2_fs_vfs_exit(struct bch_fs *c) -{ - if (c->vfs_inodes_by_inum_table.ht.tbl) - rhltable_destroy(&c->vfs_inodes_by_inum_table); - if (c->vfs_inodes_table.tbl) - rhashtable_destroy(&c->vfs_inodes_table); -} - -int bch2_fs_vfs_init(struct bch_fs *c) -{ - return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: - rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); -} - -static struct file_system_type bcache_fs_type = { - .owner = THIS_MODULE, - .name = "bcachefs", - .init_fs_context = bch2_init_fs_context, - .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS, -}; - -MODULE_ALIAS_FS("bcachefs"); - -void bch2_vfs_exit(void) -{ - unregister_filesystem(&bcache_fs_type); - kmem_cache_destroy(bch2_inode_cache); -} - -int __init bch2_vfs_init(void) -{ - int ret = -ENOMEM; - - bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | - SLAB_ACCOUNT); - if (!bch2_inode_cache) - goto err; - - ret = register_filesystem(&bcache_fs_type); - if (ret) - goto err; - - return 0; -err: - bch2_vfs_exit(); - return ret; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h deleted file mode 100644 index dd2198541455b2..00000000000000 --- a/fs/bcachefs/fs.h +++ /dev/null @@ -1,215 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_H -#define _BCACHEFS_FS_H - -#include "inode.h" -#include "opts.h" -#include "str_hash.h" -#include "quota_types.h" -#include "two_state_shared_lock.h" - -#include -#include - -struct bch_inode_info { - struct inode v; - struct rhash_head hash; - struct rhlist_head by_inum_hash; - subvol_inum ei_inum; - - struct list_head ei_vfs_inode_list; - unsigned long ei_flags; - - struct mutex 
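
The -ENOPARAM return in bch2_fs_parse_param() above is the fs_context contract for "not my option, VFS handles it". A toy sketch of that contract; note ENOPARAM is a kernel-internal errno with no userspace definition, so its value (519, per include/linux/errno.h) is hardcoded here purely for illustration:

  #include <string.h>

  #define ENOPARAM 519	/* kernel-internal errno, hardcoded for this sketch */

  /* return 0 when an option is consumed, -ENOPARAM to hand it back */
  static int parse_one(const char *key)
  {
  	if (!strcmp(key, "source"))
  		return -ENOPARAM;	/* the device list is the VFS's job */
  	/* filesystem-specific keys would be matched here */
  	return 0;
  }

  int main(void)
  {
  	return parse_one("source") == -ENOPARAM ? 0 : 1;
  }
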
ei_update_lock;
-	u64			ei_quota_reserved;
-	unsigned long		ei_last_dirtied;
-	two_state_lock_t	ei_pagecache_lock;
-
-	struct mutex		ei_quota_lock;
-	struct bch_qid		ei_qid;
-
-	/*
-	 * When we've been doing nocow writes we'll need to issue flushes to the
-	 * underlying block devices
-	 *
-	 * XXX: a device may have had a flush issued by some other codepath. It
-	 * would be better to keep for each device a sequence number that's
-	 * incremented when we issue a cache flush, and track here the sequence
-	 * number that needs flushing.
-	 */
-	struct bch_devs_mask	ei_devs_need_flush;
-
-	/* copy of inode in btree: */
-	struct bch_inode_unpacked ei_inode;
-};
-
-#define bch2_pagecache_add_put(i)	bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_tryget(i)	bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_get(i)	bch2_two_state_lock(&i->ei_pagecache_lock, 0)
-
-#define bch2_pagecache_block_put(i)	bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
-#define bch2_pagecache_block_get(i)	bch2_two_state_lock(&i->ei_pagecache_lock, 1)
-
-static inline subvol_inum inode_inum(struct bch_inode_info *inode)
-{
-	return inode->ei_inum;
-}
-
-/*
- * Set if we've gotten a btree error for this inode, and thus the vfs inode and
- * btree inode may be inconsistent:
- */
-#define EI_INODE_ERROR			0
-
-/*
- * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
- * those:
- */
-#define EI_INODE_SNAPSHOT		1
-#define EI_INODE_HASHED			2
-
-#define to_bch_ei(_inode)					\
-	container_of_or_null(_inode, struct bch_inode_info, v)
-
-static inline int ptrcmp(void *l, void *r)
-{
-	return cmp_int(l, r);
-}
-
-enum bch_inode_lock_op {
-	INODE_PAGECACHE_BLOCK	= (1U << 0),
-	INODE_UPDATE_LOCK	= (1U << 1),
-};
-
-#define bch2_lock_inodes(_locks, ...)					\
-do {									\
-	struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };		\
-	unsigned i;							\
-									\
-	bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp);			\
-									\
-	for (i = 1; i < ARRAY_SIZE(a); i++)				\
-		if (a[i] != a[i - 1]) {					\
-			if ((_locks) & INODE_PAGECACHE_BLOCK)		\
-				bch2_pagecache_block_get(a[i]);\
-			if ((_locks) & INODE_UPDATE_LOCK)		\
-				mutex_lock_nested(&a[i]->ei_update_lock, i);\
-		} \
-} while (0)
-
-#define bch2_unlock_inodes(_locks, ...)
\ -do { \ - struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ - unsigned i; \ - \ - bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ - \ - for (i = 1; i < ARRAY_SIZE(a); i++) \ - if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_PAGECACHE_BLOCK) \ - bch2_pagecache_block_put(a[i]);\ - if ((_locks) & INODE_UPDATE_LOCK) \ - mutex_unlock(&a[i]->ei_update_lock); \ - } \ -} while (0) - -static inline struct bch_inode_info *file_bch_inode(struct file *file) -{ - return to_bch_ei(file_inode(file)); -} - -static inline bool inode_attr_changing(struct bch_inode_info *dir, - struct bch_inode_info *inode, - enum inode_opt_id id) -{ - return !(inode->ei_inode.bi_fields_set & (1 << id)) && - bch2_inode_opt_get(&dir->ei_inode, id) != - bch2_inode_opt_get(&inode->ei_inode, id); -} - -static inline bool inode_attrs_changing(struct bch_inode_info *dir, - struct bch_inode_info *inode) -{ - unsigned id; - - for (id = 0; id < Inode_opt_nr; id++) - if (inode_attr_changing(dir, inode, id)) - return true; - - return false; -} - -struct bch_inode_unpacked; - -#ifndef NO_BCACHEFS_FS - -struct bch_inode_info * -__bch2_create(struct mnt_idmap *, struct bch_inode_info *, - struct dentry *, umode_t, dev_t, subvol_inum, unsigned); - -int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p); - -int bch2_fs_quota_transfer(struct bch_fs *, - struct bch_inode_info *, - struct bch_qid, - unsigned, - enum quota_acct_mode); - -static inline int bch2_set_projid(struct bch_fs *c, - struct bch_inode_info *inode, - u32 projid) -{ - struct bch_qid qid = inode->ei_qid; - - qid.q[QTYP_PRJ] = projid; - - return bch2_fs_quota_transfer(c, inode, qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); -} - -struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); - -/* returns 0 if we want to do the update, or error is passed up */ -typedef int (*inode_set_fn)(struct btree_trans *, - struct bch_inode_info *, - struct bch_inode_unpacked *, void *); - -void bch2_inode_update_after_write(struct btree_trans *, - struct bch_inode_info *, - struct bch_inode_unpacked *, - unsigned); -int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, - inode_set_fn, void *, unsigned); - -int bch2_setattr_nonsize(struct mnt_idmap *, - struct bch_inode_info *, - struct iattr *); -int __bch2_unlink(struct inode *, struct dentry *, bool); - -void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); - -void bch2_fs_vfs_exit(struct bch_fs *); -int bch2_fs_vfs_init(struct bch_fs *); - -void bch2_vfs_exit(void); -int bch2_vfs_init(void); - -#else - -#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) - -static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; } - -static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, - snapshot_id_list *s) {} - -static inline void bch2_fs_vfs_exit(struct bch_fs *c) {} -static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; } - -static inline void bch2_vfs_exit(void) {} -static inline int bch2_vfs_init(void) { return 0; } - -#endif /* NO_BCACHEFS_FS */ - -#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c deleted file mode 100644 index 15c1e890d299b2..00000000000000 --- a/fs/bcachefs/fsck.c +++ /dev/null @@ -1,3363 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bcachefs_ioctl.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_update.h" -#include "buckets.h" -#include "darray.h" 
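
The bch2_lock_inodes()/bch2_unlock_inodes() macros in fs.h above sort the inode pointers by address before locking, so any two tasks acquiring overlapping sets of locks always do so in the same order. A self-contained pthread sketch of the same ABBA-deadlock-avoidance idea (build with -lpthread):

  #include <pthread.h>

  /* always lock the lower-addressed mutex first */
  static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
  {
  	if (a > b) {
  		pthread_mutex_t *t = a;
  		a = b;
  		b = t;
  	}
  	pthread_mutex_lock(a);
  	if (a != b)
  		pthread_mutex_lock(b);
  }

  static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
  {
  	pthread_mutex_unlock(a);
  	if (a != b)
  		pthread_mutex_unlock(b);
  }

  int main(void)
  {
  	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
  	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

  	lock_pair(&m2, &m1);	/* same order regardless of argument order */
  	unlock_pair(&m2, &m1);
  	return 0;
  }
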
-#include "dirent.h" -#include "error.h" -#include "fs.h" -#include "fsck.h" -#include "inode.h" -#include "io_misc.h" -#include "keylist.h" -#include "namei.h" -#include "recovery_passes.h" -#include "snapshot.h" -#include "super.h" -#include "thread_with_file.h" -#include "xattr.h" - -#include -#include /* struct qstr */ - -static int dirent_points_to_inode_nowarn(struct bch_fs *c, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - if (d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum) - return 0; - return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); -} - -static void dirent_inode_mismatch_msg(struct printbuf *out, - struct bch_fs *c, - struct bkey_s_c_dirent dirent, - struct bch_inode_unpacked *inode) -{ - prt_str(out, "inode points to dirent that does not point back:"); - prt_newline(out); - bch2_bkey_val_to_text(out, c, dirent.s_c); - prt_newline(out); - bch2_inode_unpacked_to_text(out, inode); -} - -static int dirent_points_to_inode(struct bch_fs *c, - struct bkey_s_c_dirent dirent, - struct bch_inode_unpacked *inode) -{ - int ret = dirent_points_to_inode_nowarn(c, dirent, inode); - if (ret) { - struct printbuf buf = PRINTBUF; - dirent_inode_mismatch_msg(&buf, c, dirent, inode); - bch_warn(c, "%s", buf.buf); - printbuf_exit(&buf); - } - return ret; -} - -/* - * XXX: this is handling transaction restarts without returning - * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: - */ -static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, - u32 snapshot) -{ - u64 sectors = 0; - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ({ - if (bkey_extent_is_allocation(k.k)) - sectors += k.k->size; - 0; - })); - - return ret ?: sectors; -} - -static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, - u32 snapshot) -{ - u64 subdirs = 0; - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ({ - if (k.k->type == KEY_TYPE_dirent && - bkey_s_c_to_dirent(k).v->d_type == DT_DIR) - subdirs++; - 0; - })); - - return ret ?: subdirs; -} - -static int subvol_lookup(struct btree_trans *trans, u32 subvol, - u32 *snapshot, u64 *inum) -{ - struct bch_subvolume s; - int ret = bch2_subvolume_get(trans, subvol, false, &s); - - *snapshot = le32_to_cpu(s.snapshot); - *inum = le64_to_cpu(s.inode); - return ret; -} - -static int lookup_dirent_in_snapshot(struct btree_trans *trans, - struct bch_hash_info hash_info, - subvol_inum dir, struct qstr *name, - u64 *target, unsigned *type, u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0, snapshot); - int ret = bkey_err(k); - if (ret) - return ret; - - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - *target = le64_to_cpu(d.v->d_inum); - *type = d.v->d_type; - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -/* - * Find any subvolume associated with a tree of snapshots - * We can't rely on master_subvol - it might have been deleted. 
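
For orientation on bch2_count_subdirs() above, which counts dirents whose d_type is DT_DIR: the userspace cousin of that walk is readdir(3). A sketch; note it also counts "." and "..", and d_type may be DT_UNKNOWN on some filesystems:

  #include <dirent.h>
  #include <stdio.h>

  int main(int argc, char **argv)
  {
  	DIR *d = opendir(argc > 1 ? argv[1] : ".");
  	struct dirent *de;
  	unsigned long subdirs = 0;

  	if (!d)
  		return 1;
  	while ((de = readdir(d)))
  		if (de->d_type == DT_DIR)
  			subdirs++;
  	printf("%lu\n", subdirs);
  	closedir(d);
  	return 0;
  }
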
- */ -static int find_snapshot_tree_subvol(struct btree_trans *trans, - u32 tree_id, u32 *subvol) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; - - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - if (le32_to_cpu(s.v->tree) != tree_id) - continue; - - if (s.v->subvol) { - *subvol = le32_to_cpu(s.v->subvol); - goto found; - } - } - ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol); -found: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* Get lost+found, create if it doesn't exist: */ -static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - struct bch_inode_unpacked *lostfound, - u64 reattaching_inum) -{ - struct bch_fs *c = trans->c; - struct qstr lostfound_str = QSTR("lost+found"); - struct btree_iter lostfound_iter = {}; - u64 inum = 0; - unsigned d_type = 0; - int ret; - - struct bch_snapshot_tree st; - ret = bch2_snapshot_tree_lookup(trans, - bch2_snapshot_tree(c, snapshot), &st); - if (ret) - return ret; - - u32 subvolid; - ret = find_snapshot_tree_subvol(trans, - bch2_snapshot_tree(c, snapshot), &subvolid); - bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", - bch2_snapshot_tree(c, snapshot)); - if (ret) - return ret; - - struct bch_subvolume subvol; - ret = bch2_subvolume_get(trans, subvolid, false, &subvol); - bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); - if (ret) - return ret; - - if (!subvol.inode) { - struct btree_iter iter; - struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvolid), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(subvol); - if (ret) - return ret; - - subvol->v.inode = cpu_to_le64(reattaching_inum); - bch2_trans_iter_exit(trans, &iter); - } - - subvol_inum root_inum = { - .subvol = subvolid, - .inum = le64_to_cpu(subvol.inode) - }; - - struct bch_inode_unpacked root_inode; - struct bch_hash_info root_hash_info; - ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0); - bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", - root_inum.inum, subvolid); - if (ret) - return ret; - - root_hash_info = bch2_hash_info_init(c, &root_inode); - - ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, - &lostfound_str, &inum, &d_type, snapshot); - if (bch2_err_matches(ret, ENOENT)) - goto create_lostfound; - - bch_err_fn(c, ret); - if (ret) - return ret; - - if (d_type != DT_DIR) { - bch_err(c, "error looking up lost+found: not a directory"); - return bch_err_throw(c, ENOENT_not_directory); - } - - /* - * The bch2_check_dirents pass has already run, dangling dirents - * shouldn't exist here: - */ - ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0); - bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", - inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); - return ret; - -create_lostfound: - /* - * we always create lost+found in the root snapshot; we don't want - * different branches of the snapshot tree to have different lost+found - */ - snapshot = le32_to_cpu(st.root_snapshot); - /* - * XXX: we could have a nicer log message here if we had a nice way to - * walk backpointers to print a path - */ - struct printbuf path = PRINTBUF; - ret = bch2_inum_to_path(trans, root_inum, &path); - if (ret) - goto err; - - bch_notice(c, "creating 
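
lookup_lostfound() above follows the classic lookup-or-create shape: attempt the lookup, treat "missing" as the signal to create, and reject an existing entry of the wrong type (ENOENT_not_directory). The same shape in plain userspace C, with the directory name hypothetical:

  #include <errno.h>
  #include <sys/stat.h>

  /* returns 0 on success, negative errno on failure */
  static int ensure_dir(const char *path)
  {
  	struct stat st;

  	if (mkdir(path, 0700) == 0)
  		return 0;		/* created it */
  	if (errno != EEXIST)
  		return -errno;
  	if (stat(path, &st))
  		return -errno;
  	/* it exists; make sure it really is a directory */
  	return S_ISDIR(st.st_mode) ? 0 : -ENOTDIR;
  }

  int main(void)
  {
  	return ensure_dir("lost+found") ? 1 : 0;
  }
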
%s/lost+found in subvol %llu snapshot %u", - path.buf, root_inum.subvol, snapshot); - printbuf_exit(&path); - - u64 now = bch2_current_time(c); - u64 cpu = raw_smp_processor_id(); - - bch2_inode_init_early(c, lostfound); - bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); - lostfound->bi_dir = root_inode.bi_inum; - lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); - - root_inode.bi_nlink++; - - ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot); - ret = bch2_btree_iter_traverse(trans, &lostfound_iter); - if (ret) - goto err; - - ret = bch2_dirent_create_snapshot(trans, - 0, root_inode.bi_inum, snapshot, &root_hash_info, - mode_to_type(lostfound->bi_mode), - &lostfound_str, - lostfound->bi_inum, - &lostfound->bi_dir_offset, - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create) ?: - bch2_inode_write_flags(trans, &lostfound_iter, lostfound, - BTREE_UPDATE_internal_snapshot_node); -err: - bch_err_msg(c, ret, "creating lost+found"); - bch2_trans_iter_exit(trans, &lostfound_iter); - return ret; -} - -static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) -{ - if (inode->bi_inum == BCACHEFS_ROOT_INO && - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) - return false; - - /* - * Subvolume roots are special: older versions of subvolume roots may be - * disconnected; it's only the newest version that matters. - * - * We only keep a single dirent pointing to a subvolume root, i.e. - * older versions of snapshots will not have a different dirent pointing - * to the same subvolume root. - * - * This is because dirents that point to subvolumes are only visible in - * the parent subvolume - versioning is not needed - and keeping them - * around would break fsck, because when we're crossing subvolumes we - * don't have a consistent snapshot ID to check the inode <-> dirent - * relationships. - * - * Thus, a subvolume root that's been renamed after a snapshot will have - * a disconnected older version - that's expected. - * - * Note that taking a snapshot always updates the root inode (to update - * the dirent backpointer), so a subvolume root inode with - * BCH_INODE_has_child_snapshot is never visible.
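- * - * A concrete (hedged, numbers invented) illustration: take subvolume 5, - * snapshot it, then rename its root. The new version gets a fresh dirent - * and an updated backpointer; the old version keeps stale backpointer - * fields and has BCH_INODE_has_child_snapshot set, so the check below - * deliberately leaves it disconnected rather than reattaching it.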
- */ - if (inode->bi_subvol && - (inode->bi_flags & BCH_INODE_has_child_snapshot)) - return false; - - return !bch2_inode_has_backpointer(inode) && - !(inode->bi_flags & BCH_INODE_unlinked); -} - -static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, - SPOS(d_pos.inode, d_pos.offset, snapshot), - BTREE_ITER_intent| - BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - - if (bpos_eq(k.k->p, d_pos)) { - /* - * bch2_btree_delete_at() doesn't work because the update path doesn't - * internally use BTREE_ITER_with_updates yet - */ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (ret) - goto err; - - bkey_init(&k->k); - k->k.type = KEY_TYPE_whiteout; - k->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked lostfound; - char name_buf[20]; - int ret; - - u32 dirent_snapshot = inode->bi_snapshot; - if (inode->bi_subvol) { - inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; - - struct btree_iter subvol_iter; - struct bkey_i_subvolume *subvol = - bch2_bkey_get_mut_typed(trans, &subvol_iter, - BTREE_ID_subvolumes, POS(0, inode->bi_subvol), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(subvol); - if (ret) - return ret; - - subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL; - bch2_trans_iter_exit(trans, &subvol_iter); - - u64 root_inum; - ret = subvol_lookup(trans, inode->bi_parent_subvol, - &dirent_snapshot, &root_inum); - if (ret) - return ret; - - snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); - } else { - snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); - } - - ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); - if (ret) - return ret; - - bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum); - - lostfound.bi_nlink += S_ISDIR(inode->bi_mode); - - /* ensure lost+found inode is also present in inode snapshot */ - if (!inode->bi_subvol) { - BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot)); - lostfound.bi_snapshot = inode->bi_snapshot; - } - - ret = __bch2_fsck_write_inode(trans, &lostfound); - if (ret) - return ret; - - struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); - struct qstr name = QSTR(name_buf); - - inode->bi_dir = lostfound.bi_inum; - - ret = bch2_dirent_create_snapshot(trans, - inode->bi_parent_subvol, lostfound.bi_inum, - dirent_snapshot, - &dir_hash, - inode_d_type(inode), - &name, - inode->bi_subvol ?: inode->bi_inum, - &inode->bi_dir_offset, - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create); - if (ret) { - bch_err_msg(c, ret, "error creating dirent"); - return ret; - } - - ret = __bch2_fsck_write_inode(trans, inode); - if (ret) - return ret; - - { - CLASS(printbuf, buf)(); - ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, - inode->bi_snapshot, NULL, &buf); - if (ret) - return ret; - - bch_info(c, "reattached at %s", buf.buf); - } - - /* - * Fix up inodes in child snapshots: if they should also be reattached, - * update the backpointer field; if they should not be, we need to emit - * whiteouts for the dirent we just created.
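- * - * E.g. (a hedged illustration, numbers invented): after reattaching - * inode 4096 at snapshot 10 as lost+found/4096, a version of 4096 in - * child snapshot 12 that still has its own dirent must not see the new - * name, so the new dirent gets a whiteout at snapshot 12 via - * maybe_delete_dirent(); a version that is also disconnected instead - * gets bi_dir/bi_dir_offset pointed at the dirent we just created.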
- */ - if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { - snapshot_id_list whiteouts_done; - struct btree_iter iter; - struct bkey_s_c k; - - darray_init(&whiteouts_done); - - for_each_btree_key_reverse_norestart(trans, iter, - BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), - BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { - if (k.k->p.offset != inode->bi_inum) - break; - - if (!bkey_is_inode(k.k) || - !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || - snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) - continue; - - struct bch_inode_unpacked child_inode; - ret = bch2_inode_unpack(k, &child_inode); - if (ret) - break; - - if (!inode_should_reattach(&child_inode)) { - ret = maybe_delete_dirent(trans, - SPOS(lostfound.bi_inum, inode->bi_dir_offset, - dirent_snapshot), - k.k->p.snapshot); - if (ret) - break; - - ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); - if (ret) - break; - } else { - iter.snapshot = k.k->p.snapshot; - child_inode.bi_dir = inode->bi_dir; - child_inode.bi_dir_offset = inode->bi_dir_offset; - - ret = bch2_inode_write_flags(trans, &iter, &child_inode, - BTREE_UPDATE_internal_snapshot_node); - if (ret) - break; - } - } - darray_exit(&whiteouts_done); - bch2_trans_iter_exit(trans, &iter); - } - - return ret; -} - -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - -static int remove_backpointer(struct btree_trans *trans, - struct bch_inode_unpacked *inode) -{ - if (!bch2_inode_has_backpointer(inode)) - return 0; - - u32 snapshot = inode->bi_snapshot; - - if (inode->bi_parent_subvol) { - int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot); - if (ret) - return ret; - } - - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); - int ret = bkey_err(d) ?: - dirent_points_to_inode(c, d, inode) ?: - bch2_fsck_remove_dirent(trans, d.k->p); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) -{ - struct bch_fs *c = trans->c; - - struct bch_inode_unpacked inode; - int ret = bch2_inode_find_by_inum_trans(trans, - (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, - &inode); - if (ret) - return ret; - - ret = remove_backpointer(trans, &inode); - if (!bch2_err_matches(ret, ENOENT)) - bch_err_msg(c, ret, "removing dirent"); - if (ret) - return ret; - - ret = reattach_inode(trans, &inode); - bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); - return ret; -} - -static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) -{ - struct bch_fs *c = trans->c; - - if (!bch2_snapshot_is_leaf(c, snapshotid)) { - bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); - return bch_err_throw(c, fsck_repair_unimplemented); - } - - /* - * If inum isn't set, that means we're being called from check_dirents, - * not check_inodes - the root of this subvolume doesn't exist or we - * would have found it there: - */ - if (!inum) { - struct btree_iter inode_iter = {}; - struct bch_inode_unpacked new_inode; - u64 cpu = raw_smp_processor_id(); - - bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 
0, S_IFDIR|0755, 0, NULL); - - new_inode.bi_subvol = subvolid; - - int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: - bch2_btree_iter_traverse(trans, &inode_iter) ?: - bch2_inode_write(trans, &inode_iter, &new_inode); - bch2_trans_iter_exit(trans, &inode_iter); - if (ret) - return ret; - - inum = new_inode.bi_inum; - } - - bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); - - struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); - int ret = PTR_ERR_OR_ZERO(new_subvol); - if (ret) - return ret; - - bkey_subvolume_init(&new_subvol->k_i); - new_subvol->k.p.offset = subvolid; - new_subvol->v.snapshot = cpu_to_le32(snapshotid); - new_subvol->v.inode = cpu_to_le64(inum); - ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); - if (ret) - return ret; - - struct btree_iter iter; - struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, snapshotid), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(s); - bch_err_msg(c, ret, "getting snapshot %u", snapshotid); - if (ret) - return ret; - - u32 snapshot_tree = le32_to_cpu(s->v.tree); - - s->v.subvol = cpu_to_le32(subvolid); - SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); - bch2_trans_iter_exit(trans, &iter); - - struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshot_trees, POS(0, snapshot_tree), - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(st); - bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); - if (ret) - return ret; - - if (!st->v.master_subvol) - st->v.master_subvol = cpu_to_le32(subvolid); - - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum) -{ - struct bch_fs *c = trans->c; - unsigned i_mode = S_IFREG; - u64 i_size = 0; - - switch (btree) { - case BTREE_ID_extents: { - struct btree_iter iter = {}; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0)); - bch2_trans_iter_exit(trans, &iter); - int ret = bkey_err(k); - if (ret) - return ret; - - i_size = k.k->p.offset << 9; - break; - } - case BTREE_ID_dirents: - i_mode = S_IFDIR; - break; - case BTREE_ID_xattrs: - break; - default: - BUG(); - } - - struct bch_inode_unpacked new_inode; - bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); - new_inode.bi_size = i_size; - new_inode.bi_inum = inum; - new_inode.bi_snapshot = snapshot; - - return __bch2_fsck_write_inode(trans, &new_inode); -} - -static inline void snapshots_seen_exit(struct snapshots_seen *s) -{ - darray_exit(&s->ids); -} - -static inline void snapshots_seen_init(struct snapshots_seen *s) -{ - memset(s, 0, sizeof(*s)); -} - -static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) -{ - u32 *i; - __darray_for_each(s->ids, i) { - if (*i == id) - return 0; - if (*i > id) - break; - } - - int ret = darray_insert_item(&s->ids, i - s->ids.data, id); - if (ret) - bch_err(c, "error reallocating snapshots_seen table (size %zu)", - s->ids.size); - return ret; -} - -static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, - enum btree_id btree_id, struct bpos pos) -{ - if (!bkey_eq(s->pos, pos)) - s->ids.nr = 0; - s->pos = pos; - - return snapshot_list_add_nodup(c, &s->ids, pos.snapshot); -} - -/** - * 
key_visible_in_snapshot - returns true if @id is a descendant of @ancestor, - * and @ancestor hasn't been overwritten in @seen - * - * @c: filesystem handle - * @seen: list of snapshot ids already seen at current position - * @id: descendant snapshot id - * @ancestor: ancestor snapshot id - * - * Returns: whether key in @ancestor snapshot is visible in @id snapshot - */ -static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, - u32 id, u32 ancestor) -{ - EBUG_ON(id > ancestor); - - if (id == ancestor) - return true; - - if (!bch2_snapshot_is_ancestor(c, id, ancestor)) - return false; - - /* - * We know that @id is a descendant of @ancestor; we're checking whether - * we've seen a key that overwrote @ancestor - i.e. also a descendant of - * @ancestor and with @id as a descendant. - * - * But we already know that we're scanning IDs between @id and @ancestor - * numerically, since snapshot ID lists are kept sorted, so if we find - * an id that's an ancestor of @id we're done: - */ - darray_for_each_reverse(seen->ids, i) - if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i)) - return false; - - return true; -} - -/** - * ref_visible - given a key with snapshot id @src that points to a key with - * snapshot id @dst, test whether there is some snapshot in which @dst is - * visible. - * - * @c: filesystem handle - * @s: list of snapshot IDs already seen at @src - * @src: snapshot ID of src key - * @dst: snapshot ID of dst key - * Returns: true if there is some snapshot in which @dst is visible - * - * Assumes we're visiting @src keys in natural key order - */ -static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s, - u32 src, u32 dst) -{ - return dst <= src - ? key_visible_in_snapshot(c, s, dst, src) - : bch2_snapshot_is_ancestor(c, src, dst); -} - -static int ref_visible2(struct bch_fs *c, - u32 src, struct snapshots_seen *src_seen, - u32 dst, struct snapshots_seen *dst_seen) -{ - if (dst > src) { - swap(dst, src); - swap(dst_seen, src_seen); - } - return key_visible_in_snapshot(c, src_seen, dst, src); -} - -#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ - for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ - (_i)->inode.bi_snapshot <= (_snapshot); _i++) \ - if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot)) - -struct inode_walker_entry { - struct bch_inode_unpacked inode; - bool whiteout; - u64 count; - u64 i_size; -}; - -struct inode_walker { - bool first_this_inode; - bool have_inodes; - bool recalculate_sums; - struct bpos last_pos; - - DARRAY(struct inode_walker_entry) inodes; - snapshot_id_list deletes; -}; - -static void inode_walker_exit(struct inode_walker *w) -{ - darray_exit(&w->inodes); - darray_exit(&w->deletes); -} - -static struct inode_walker inode_walker_init(void) -{ - return (struct inode_walker) { 0, }; -} - -static int add_inode(struct bch_fs *c, struct inode_walker *w, - struct bkey_s_c inode) -{ - int ret = darray_push(&w->inodes, ((struct inode_walker_entry) { - .whiteout = !bkey_is_inode(inode.k), - })); - if (ret) - return ret; - - struct inode_walker_entry *n = &darray_last(w->inodes); - if (!n->whiteout) { - return bch2_inode_unpack(inode, &n->inode); - } else { - n->inode.bi_inum = inode.k->p.offset; - n->inode.bi_snapshot = inode.k->p.snapshot; - return 0; - } -} - -static int get_inodes_all_snapshots(struct btree_trans *trans, - struct inode_walker *w, u64 inum) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - /* - * We no
longer have inodes for w->last_pos; clear this to avoid - * screwing up check_i_sectors/check_subdir_count if we take a - * transaction restart here: - */ - w->have_inodes = false; - w->recalculate_sums = false; - w->inodes.nr = 0; - - for_each_btree_key_max_norestart(trans, iter, - BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - ret = add_inode(c, w, k); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; - - w->first_this_inode = true; - w->have_inodes = true; - return 0; -} - -static int get_visible_inodes(struct btree_trans *trans, - struct inode_walker *w, - struct snapshots_seen *s, - u64 inum) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - w->inodes.nr = 0; - w->deletes.nr = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - - if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) - continue; - - if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) - continue; - - ret = bkey_is_inode(k.k) - ? add_inode(c, w, k) - : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static struct inode_walker_entry * -lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - - struct inode_walker_entry *i = darray_find_p(w->inodes, i, - bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)); - - if (!i) - return NULL; - - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot, - trans, snapshot_key_missing_inode_snapshot, - "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" - "unexpected because we should always update the inode when we update a key in that inode\n" - "%s", - w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, - (bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - if (!i->whiteout) { - struct bch_inode_unpacked new = i->inode; - new.bi_snapshot = k.k->p.snapshot; - ret = __bch2_fsck_write_inode(trans, &new); - } else { - struct bkey_i whiteout; - bkey_init(&whiteout.k); - whiteout.k.type = KEY_TYPE_whiteout; - whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot); - ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &whiteout, - BTREE_UPDATE_internal_snapshot_node); - } - - if (ret) - goto fsck_err; - - ret = bch2_trans_commit(trans, NULL, NULL, 0); - if (ret) - goto fsck_err; - - struct inode_walker_entry new_entry = *i; - - new_entry.inode.bi_snapshot = k.k->p.snapshot; - new_entry.count = 0; - new_entry.i_size = 0; - - while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot) - --i; - - size_t pos = i - w->inodes.data; - ret = darray_insert_item(&w->inodes, pos, new_entry); - if (ret) - goto fsck_err; - - ret = bch_err_throw(c, transaction_restart_nested); - goto fsck_err; - } - - printbuf_exit(&buf); - return i; -fsck_err: - printbuf_exit(&buf); - return ERR_PTR(ret); -} - -static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct inode_walker *w, - struct bkey_s_c k) -{ - if (w->last_pos.inode != k.k->p.inode) { - int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); - if (ret) - return ERR_PTR(ret); - } - - w->last_pos = k.k->p; - - return lookup_inode_for_snapshot(trans, w, k); -} - -/* - * Prefer to 
delete the first one, since that will be the one at the wrong - * offset: - * return value: 0 -> delete k1, 1 -> delete k2 - */ -int bch2_fsck_update_backpointers(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_i *new) -{ - if (new->k.type != KEY_TYPE_dirent) - return 0; - - struct bkey_i_dirent *d = bkey_i_to_dirent(new); - struct inode_walker target = inode_walker_init(); - int ret = 0; - - if (d->v.d_type == DT_SUBVOL) { - bch_err(trans->c, "%s does not support DT_SUBVOL", __func__); - ret = -BCH_ERR_fsck_repair_unimplemented; - } else { - ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); - if (ret) - goto err; - - darray_for_each(target.inodes, i) { - i->inode.bi_dir_offset = d->k.p.offset; - ret = __bch2_fsck_write_inode(trans, &i->inode); - if (ret) - goto err; - } - } -err: - inode_walker_exit(&target); - return ret; -} - -static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - u32 *snapshot) -{ - if (inode->bi_subvol) { - u64 inum; - int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); - if (ret) - return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); - } - - return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); -} - -static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); - int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int check_inode_dirent_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - bool *write_inode) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - u32 inode_snapshot = inode->bi_snapshot; - struct btree_iter dirent_iter = {}; - struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); - int ret = bkey_err(d); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) && - inode->bi_subvol && - (inode->bi_flags & BCH_INODE_has_child_snapshot)) { - /* Older version of a renamed subvolume root: we won't have a - * correct dirent for it. That's expected, see - * inode_should_reattach(). - * - * We don't clear the backpointer field when doing the rename - * because there might be arbitrarily many versions in older - * snapshots. - */ - inode->bi_dir = 0; - inode->bi_dir_offset = 0; - *write_inode = true; - goto out; - } - - if (fsck_err_on(ret, - trans, inode_points_to_missing_dirent, - "inode points to missing dirent\n%s", - (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || - fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode), - trans, inode_points_to_wrong_dirent, - "%s", - (printbuf_reset(&buf), - dirent_inode_mismatch_msg(&buf, c, d, inode), - buf.buf))) { - /* - * We just clear the backpointer fields for now. If we find a - * dirent that points to this inode in check_dirents(), we'll - * update it then; later, when we get to check_path(), we'll - * reattach it if the backpointer is still 0.
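- * - * E.g.: if this inode claims bi_dir 64, bi_dir_offset 7 but that - * dirent disagrees, both fields go to 0 here; if check_dirents() - * later finds a dirent that does point at this inode it rewrites - * them, and otherwise the inode ends up reattached under - * lost+found.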
- */ - inode->bi_dir = 0; - inode->bi_dir_offset = 0; - *write_inode = true; - } -out: - ret = 0; -fsck_err: - bch2_trans_iter_exit(trans, &dirent_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int check_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_inode_unpacked *snapshot_root, - struct snapshots_seen *s) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct bch_inode_unpacked u; - bool do_update = false; - int ret; - - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret < 0) - goto err; - if (ret) - return 0; - - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) - return 0; - - ret = bch2_inode_unpack(k, &u); - if (ret) - goto err; - - if (snapshot_root->bi_inum != u.bi_inum) { - ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); - if (ret) - goto err; - } - - if (u.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { - ret = bch2_repair_inode_hash_info(trans, snapshot_root); - BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); - if (ret) - goto err; - } - - ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); - if (ret) - goto err; - - if (bch2_inode_has_backpointer(&u)) { - ret = check_inode_dirent_inode(trans, &u, &do_update); - if (ret) - goto err; - } - - if (fsck_err_on(bch2_inode_has_backpointer(&u) && - (u.bi_flags & BCH_INODE_unlinked), - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - u.bi_flags &= ~BCH_INODE_unlinked; - do_update = true; - } - - if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) { - /* Check for this early so that check_unreachable_inode() will reattach it */ - - ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot); - if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty) - goto err; - - fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty, - "dir unlinked but not empty\n%s", - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf)); - u.bi_flags &= ~BCH_INODE_unlinked; - do_update = true; - ret = 0; - } - - if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, - trans, inode_dir_has_nonzero_i_size, - "directory %llu:%u with nonzero i_size %lli", - u.bi_inum, u.bi_snapshot, u.bi_size)) { - u.bi_size = 0; - do_update = true; - } - - ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret < 0) - goto err; - - if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), - trans, inode_has_child_snapshots_wrong, - "inode has_child_snapshots flag wrong (should be %u)\n%s", - ret, - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - if (ret) - u.bi_flags |= BCH_INODE_has_child_snapshot; - else - u.bi_flags &= ~BCH_INODE_has_child_snapshot; - do_update = true; - } - ret = 0; - - if ((u.bi_flags & BCH_INODE_unlinked) && - !(u.bi_flags & BCH_INODE_has_child_snapshot)) { - if (!test_bit(BCH_FS_started, &c->flags)) { - /* - * If we're not in online fsck, don't delete unlinked - * inodes, just make sure they're on the deleted list. - * - * They might be referred to by a logged operation - - * i.e. we might have crashed in the middle of a - * truncate on an unlinked but open file - so we want to - * let the delete_dead_inodes kill it after resuming - * logged ops. 
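- * - * (So the offline branch below only verifies/inserts the - * BTREE_ID_deleted_inodes entry; only the online branch, where - * logged ops are already running, may delete the inode directly - * via bch2_inode_rm_snapshot().)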
- */ - ret = check_inode_deleted_list(trans, k.k->p); - if (ret < 0) - goto err_noprint; - - fsck_err_on(!ret, - trans, unlinked_inode_not_on_deleted_list, - "inode %llu:%u unlinked, but not on deleted list", - u.bi_inum, k.k->p.snapshot); - - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); - if (ret) - goto err; - } else { - ret = bch2_inode_or_descendents_is_open(trans, k.k->p); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, - trans, inode_unlinked_and_not_open, - "inode %llu:%u unlinked and not open", - u.bi_inum, u.bi_snapshot)) { - ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); - bch_err_msg(c, ret, "in fsck deleting inode"); - goto err_noprint; - } - ret = 0; - } - } - - if (fsck_err_on(u.bi_parent_subvol && - (u.bi_subvol == 0 || - u.bi_subvol == BCACHEFS_ROOT_SUBVOL), - trans, inode_bi_parent_nonzero, - "inode %llu:%u has subvol %u but nonzero parent subvol %u", - u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { - u.bi_parent_subvol = 0; - do_update = true; - } - - if (u.bi_subvol) { - struct bch_subvolume s; - - ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { - ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); - goto do_update; - } - - if (fsck_err_on(ret, - trans, inode_bi_subvol_missing, - "inode %llu:%u bi_subvol points to missing subvolume %u", - u.bi_inum, k.k->p.snapshot, u.bi_subvol) || - fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || - !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), - k.k->p.snapshot), - trans, inode_bi_subvol_wrong, - "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", - u.bi_inum, k.k->p.snapshot, u.bi_subvol, - le64_to_cpu(s.inode), - le32_to_cpu(s.snapshot))) { - u.bi_subvol = 0; - u.bi_parent_subvol = 0; - do_update = true; - } - } - - if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), - trans, inode_journal_seq_in_future, - "inode journal seq in future (currently at %llu)\n%s", - journal_cur_seq(&c->journal), - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - u.bi_journal_seq = journal_cur_seq(&c->journal); - do_update = true; - } -do_update: - if (do_update) { - ret = __bch2_fsck_write_inode(trans, &u); - bch_err_msg(c, ret, "in fsck updating inode"); - if (ret) - goto err_noprint; - } -err: -fsck_err: - bch_err_fn(c, ret); -err_noprint: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_inodes(struct bch_fs *c) -{ - struct bch_inode_unpacked snapshot_root = {}; - struct snapshots_seen s; - - snapshots_seen_init(&s); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_inode(trans, &iter, k, &snapshot_root, &s))); - - snapshots_seen_exit(&s); - bch_err_fn(c, ret); - return ret; -} - -static int find_oldest_inode_needs_reattach(struct btree_trans *trans, - struct bch_inode_unpacked *inode) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - /* - * We look for inodes to reattach in natural key order, leaves first, - * but we should do the reattach at the oldest version that needs to be - * reattached: - */ - for_each_btree_key_norestart(trans, iter, - BTREE_ID_inodes, - SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), - BTREE_ITER_all_snapshots, k, 
ret) { - if (k.k->p.offset != inode->bi_inum) - break; - - if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) - continue; - - if (!bkey_is_inode(k.k)) - break; - - struct bch_inode_unpacked parent_inode; - ret = bch2_inode_unpack(k, &parent_inode); - if (ret) - break; - - if (!inode_should_reattach(&parent_inode)) - break; - - *inode = parent_inode; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static int check_unreachable_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (!bkey_is_inode(k.k)) - return 0; - - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); - if (ret) - return ret; - - if (!inode_should_reattach(&inode)) - return 0; - - ret = find_oldest_inode_needs_reattach(trans, &inode); - if (ret) - return ret; - - if (fsck_err(trans, inode_unreachable, - "unreachable inode:\n%s", - (bch2_inode_unpacked_to_text(&buf, &inode), - buf.buf))) - ret = reattach_inode(trans, &inode); -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * Reattach unreachable (but not unlinked) inodes - * - * Run after check_inodes() and check_dirents(), so we know that inode - * backpointer fields point to valid dirents, and every inode that has a dirent - * that points to it has its backpointer field set - so we're just looking for - * non-unlinked inodes without backpointers: - * - * XXX: this is racy w.r.t. hardlink removal in online fsck - */ -int bch2_check_unreachable_inodes(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_unreachable_inode(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) -{ - switch (btree) { - case BTREE_ID_extents: - return S_ISREG(mode) || S_ISLNK(mode); - case BTREE_ID_dirents: - return S_ISDIR(mode); - case BTREE_ID_xattrs: - return true; - default: - BUG(); - } -} - -static int check_key_has_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct inode_walker *inode, - struct inode_walker_entry *i, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter iter2 = {}; - int ret = PTR_ERR_OR_ZERO(i); - if (ret) - return ret; - - if (k.k->type == KEY_TYPE_whiteout) - goto out; - - bool have_inode = i && !i->whiteout; - - if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) - goto reconstruct; - - if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode)) - goto out; - - prt_printf(&buf, ", "); - - bool have_old_inode = false; - darray_for_each(inode->inodes, i2) - if (!i2->whiteout && - bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) && - btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) { - prt_printf(&buf, "but found good inode in older snapshot\n"); - bch2_inode_unpacked_to_text(&buf, &i2->inode); - prt_newline(&buf); - have_old_inode = true; - break; - } - - struct bkey_s_c k2; - unsigned nr_keys = 0; - - prt_printf(&buf, "found keys:\n"); - - for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, - SPOS(k.k->p.inode, 0, k.k->p.snapshot), - POS(k.k->p.inode, U64_MAX), - 0, k2, ret) { - nr_keys++; - if (nr_keys <= 10) { - bch2_bkey_val_to_text(&buf, c, k2); - prt_newline(&buf); - } - if (nr_keys >= 100) - break; - } - - if (ret) - goto 
err; - - if (nr_keys > 100) - prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); - else if (nr_keys > 10) - prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); - - if (!have_inode) { - if (fsck_err_on(!have_inode, - trans, key_in_missing_inode, - "key in missing inode%s", buf.buf)) { - /* - * Maybe a deletion that raced with data move, or something - * weird like that? But if we know the inode was deleted, or - * it's just a few keys, we can safely delete them. - * - * If it's many keys, we should probably recreate the inode - */ - if (have_old_inode || nr_keys <= 2) - goto delete; - else - goto reconstruct; - } - } else { - /* - * not autofix, this one would be a giant wtf - bit error in the - * inode corrupting i_mode? - * - * may want to try repairing inode instead of deleting - */ - if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), - trans, key_in_wrong_inode_type, - "key for wrong inode mode %o%s", - i->inode.bi_mode, buf.buf)) - goto delete; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &iter2); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -delete: - /* - * XXX: print out more info - * count up extents for this inode, check if we have different inode in - * an older snapshot version, perhaps decide if we want to reconstitute - */ - ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - goto out; -reconstruct: - ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - inode->last_pos.inode--; - ret = bch_err_throw(c, transaction_restart_nested); - goto out; -} - -static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) -{ - struct bch_fs *c = trans->c; - int ret = 0; - s64 count2; - - darray_for_each(w->inodes, i) { - if (i->inode.bi_sectors == i->count) - continue; - - count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); - - if (w->recalculate_sums) - i->count = count2; - - if (i->count != count2) { - bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); - i->count = count2; - } - - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), - trans, inode_i_sectors_wrong, - "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, - i->inode.bi_sectors, i->count)) { - i->inode.bi_sectors = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; - } - } -fsck_err: - bch_err_fn(c, ret); - return ret; -} - -static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) -{ - u32 restart_count = trans->restart_count; - return check_i_sectors_notnested(trans, w) ?: - trans_was_restarted(trans, restart_count); -} - -struct extent_end { - u32 snapshot; - u64 offset; - struct snapshots_seen seen; -}; - -struct extent_ends { - struct bpos last_pos; - DARRAY(struct extent_end) e; -}; - -static void extent_ends_reset(struct extent_ends *extent_ends) -{ - darray_for_each(extent_ends->e, i) - snapshots_seen_exit(&i->seen); - extent_ends->e.nr = 0; -} - -static void extent_ends_exit(struct extent_ends *extent_ends) -{ - extent_ends_reset(extent_ends); - darray_exit(&extent_ends->e); -} - -static void extent_ends_init(struct extent_ends *extent_ends) -{ - memset(extent_ends, 0, sizeof(*extent_ends)); -} - -static 
int extent_ends_at(struct bch_fs *c, - struct extent_ends *extent_ends, - struct snapshots_seen *seen, - struct bkey_s_c k) -{ - struct extent_end *i, n = (struct extent_end) { - .offset = k.k->p.offset, - .snapshot = k.k->p.snapshot, - .seen = *seen, - }; - - n.seen.ids.data = kmemdup(seen->ids.data, - sizeof(seen->ids.data[0]) * seen->ids.size, - GFP_KERNEL); - if (!n.seen.ids.data) - return bch_err_throw(c, ENOMEM_fsck_extent_ends_at); - - __darray_for_each(extent_ends->e, i) { - if (i->snapshot == k.k->p.snapshot) { - snapshots_seen_exit(&i->seen); - *i = n; - return 0; - } - - if (i->snapshot >= k.k->p.snapshot) - break; - } - - return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); -} - -static int overlapping_extents_found(struct btree_trans *trans, - enum btree_id btree, - struct bpos pos1, struct snapshots_seen *pos1_seen, - struct bkey pos2, - bool *fixed, - struct extent_end *extent_end) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter iter1, iter2 = {}; - struct bkey_s_c k1, k2; - int ret; - - BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); - - bch2_trans_iter_init(trans, &iter1, btree, pos1, - BTREE_ITER_all_snapshots| - BTREE_ITER_not_extents); - k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k1); - if (ret) - goto err; - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k1); - - if (!bpos_eq(pos1, k1.k->p)) { - prt_str(&buf, "\nwanted\n "); - bch2_bpos_to_text(&buf, pos1); - prt_str(&buf, "\n"); - bch2_bkey_to_text(&buf, &pos2); - - bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", - __func__, buf.buf); - ret = bch_err_throw(c, internal_fsck_err); - goto err; - } - - bch2_trans_copy_iter(trans, &iter2, &iter1); - - while (1) { - bch2_btree_iter_advance(trans, &iter2); - - k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k2); - if (ret) - goto err; - - if (bpos_ge(k2.k->p, pos2.p)) - break; - } - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k2); - - if (bpos_gt(k2.k->p, pos2.p) || - pos2.size != k2.k->size) { - bch_err(c, "%s: error finding second overlapping extent when repairing%s", - __func__, buf.buf); - ret = bch_err_throw(c, internal_fsck_err); - goto err; - } - - prt_printf(&buf, "\noverwriting %s extent", - pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); - - if (fsck_err(trans, extent_overlapping, - "overlapping extents%s", buf.buf)) { - struct btree_iter *old_iter = &iter1; - struct disk_reservation res = { 0 }; - - if (pos1.snapshot < pos2.p.snapshot) { - old_iter = &iter2; - swap(k1, k2); - } - - trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); - - ret = bch2_trans_update_extent_overwrite(trans, old_iter, - BTREE_UPDATE_internal_snapshot_node, - k1, k2) ?: - bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); - bch2_disk_reservation_put(c, &res); - - bch_info(c, "repair ret %s", bch2_err_str(ret)); - - if (ret) - goto err; - - *fixed = true; - - if (pos1.snapshot == pos2.p.snapshot) { - /* - * We overwrote the first extent, and did the overwrite - * in the same snapshot: - */ - extent_end->offset = bkey_start_offset(&pos2); - } else if (pos1.snapshot > pos2.p.snapshot) { - /* - * We overwrote the first extent in pos2's snapshot: - */ - ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); - } else { - /* - * We overwrote the second extent - restart - * check_extent() from the top: - */ - ret = bch_err_throw(c, transaction_restart_nested); - } - } -fsck_err: -err: - bch2_trans_iter_exit(trans, &iter2); - bch2_trans_iter_exit(trans, &iter1); - printbuf_exit(&buf); - return ret; -} - -static int check_overlapping_extents(struct btree_trans *trans, - struct snapshots_seen *seen, - struct extent_ends *extent_ends, - struct bkey_s_c k, - struct btree_iter *iter, - bool *fixed) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - /* transaction restart, running again */ - if (bpos_eq(extent_ends->last_pos, k.k->p)) - return 0; - - if (extent_ends->last_pos.inode != k.k->p.inode) - extent_ends_reset(extent_ends); - - darray_for_each(extent_ends->e, i) { - if (i->offset <= bkey_start_offset(k.k)) - continue; - - if (!ref_visible2(c, - k.k->p.snapshot, seen, - i->snapshot, &i->seen)) - continue; - - ret = overlapping_extents_found(trans, iter->btree_id, - SPOS(iter->pos.inode, - i->offset, - i->snapshot), - &i->seen, - *k.k, fixed, i); - if (ret) - goto err; - } - - extent_ends->last_pos = k.k->p; -err: - return ret; -} - -static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (crc_is_encoded(crc) && - crc.uncompressed_size > encoded_extent_max_sectors) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); - printbuf_exit(&buf); - } - - return 0; -} - -static int check_extent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct inode_walker *inode, - struct snapshots_seen *s, - struct extent_ends *extent_ends, - struct disk_reservation *res) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) { - ret = ret < 0 ? 
ret : 0; - goto out; - } - - if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { - ret = check_i_sectors(trans, inode); - if (ret) - goto err; - } - - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); - if (ret) - goto err; - - struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); - ret = PTR_ERR_OR_ZERO(extent_i); - if (ret) - goto err; - - ret = check_key_has_inode(trans, iter, inode, extent_i, k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_whiteout) { - ret = check_overlapping_extents(trans, s, extent_ends, k, iter, - &inode->recalculate_sums); - if (ret) - goto err; - - /* - * Check inodes in reverse order, from oldest snapshots to - * newest, starting from the inode that matches this extent's - * snapshot. If we didn't have one, iterate over all inodes: - */ - for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); - inode->inodes.data && i >= inode->inodes.data; - --i) { - if (i->inode.bi_snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) - continue; - - u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9; - - if (fsck_err_on(k.k->p.offset > last_block && - !bkey_extent_is_reservation(k), - trans, extent_past_end_of_inode, - "extent type past end of inode %llu:%u, i_size %llu\n%s", - i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?: - bch2_fpunch_snapshot(trans, - SPOS(i->inode.bi_inum, - last_block, - i->inode.bi_snapshot), - POS(i->inode.bi_inum, U64_MAX)); - if (ret) - goto err; - - iter->k.type = KEY_TYPE_whiteout; - break; - } - } - } - - ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - if (bkey_extent_is_allocation(k.k)) { - for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); - inode->inodes.data && i >= inode->inodes.data; - --i) { - if (i->whiteout || - i->inode.bi_snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) - continue; - - i->count += k.k->size; - } - } - - if (k.k->type != KEY_TYPE_whiteout) { - ret = extent_ends_at(c, extent_ends, s, k); - if (ret) - goto err; - } -out: -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -/* - * Walk extents: verify that extents have a corresponding S_ISREG inode, and - * that i_size and i_sectors are consistent - */ -int bch2_check_extents(struct bch_fs *c) -{ - struct inode_walker w = inode_walker_init(); - struct snapshots_seen s; - struct extent_ends extent_ends; - struct disk_reservation res = { 0 }; - - snapshots_seen_init(&s); - extent_ends_init(&extent_ends); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ - bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: - check_extent_overbig(trans, &iter, k); - })) ?: - check_i_sectors_notnested(trans, &w)); - - bch2_disk_reservation_put(c, &res); - extent_ends_exit(&extent_ends); - inode_walker_exit(&w); - snapshots_seen_exit(&s); - - bch_err_fn(c, ret); - return ret; -} - -int bch2_check_indirect_extents(struct bch_fs *c) -{ - struct disk_reservation res = { 0 }; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, - POS_MIN, - BTREE_ITER_prefetch, k, - &res, NULL, - 
BCH_TRANS_COMMIT_no_enospc, ({ - bch2_disk_reservation_put(c, &res); - check_extent_overbig(trans, &iter, k); - }))); - - bch2_disk_reservation_put(c, &res); - bch_err_fn(c, ret); - return ret; -} - -static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) -{ - struct bch_fs *c = trans->c; - int ret = 0; - s64 count2; - - darray_for_each(w->inodes, i) { - if (i->inode.bi_nlink == i->count) - continue; - - count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); - if (count2 < 0) - return count2; - - if (i->count != count2) { - bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); - i->count = count2; - if (i->inode.bi_nlink == i->count) - continue; - } - - if (i->inode.bi_nlink != i->count) { - CLASS(printbuf, buf)(); - - lockrestart_do(trans, - bch2_inum_snapshot_to_path(trans, w->last_pos.inode, - i->inode.bi_snapshot, NULL, &buf)); - - if (fsck_err_on(i->inode.bi_nlink != i->count, - trans, inode_dir_wrong_nlink, - "directory with wrong i_nlink: got %u, should be %llu\n%s", - i->inode.bi_nlink, i->count, buf.buf)) { - i->inode.bi_nlink = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; - } - } - } -fsck_err: - bch_err_fn(c, ret); - return ret; -} - -static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) -{ - u32 restart_count = trans->restart_count; - return check_subdir_count_notnested(trans, w) ?: - trans_was_restarted(trans, restart_count); -} - -/* find a subvolume that's a descendent of @snapshot: */ -static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { - bch2_trans_iter_exit(trans, &iter); - *subvolid = k.k->p.offset; - goto found; - } - } - if (!ret) - ret = -ENOENT; -found: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -noinline_for_stack -static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_dirent d) -{ - struct bch_fs *c = trans->c; - struct btree_iter subvol_iter = {}; - struct bch_inode_unpacked subvol_root; - u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); - u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 parent_snapshot; - u32 new_parent_subvol = 0; - u64 parent_inum; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (ret || - (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { - int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); - if (ret2 && !bch2_err_matches(ret, ENOENT)) - return ret2; - } - - if (ret && - !new_parent_subvol && - (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { - /* - * Couldn't find a subvol for dirent's snapshot - but we lost - * subvols, so we need to reconstruct: - */ - ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); - if (ret) - return ret; - - parent_snapshot = d.k->p.snapshot; - } - - if (fsck_err_on(ret, - trans, dirent_to_missing_parent_subvol, - 
"dirent parent_subvol points to missing subvolume\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || - fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), - trans, dirent_not_visible_in_parent_subvol, - "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", - parent_snapshot, - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - if (!new_parent_subvol) { - bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); - return bch_err_throw(c, fsck_repair_unimplemented); - } - - struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); - ret = PTR_ERR_OR_ZERO(new_dirent); - if (ret) - goto err; - - new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); - } - - struct bkey_s_c_subvolume s = - bch2_bkey_get_iter_typed(trans, &subvol_iter, - BTREE_ID_subvolumes, POS(0, target_subvol), - 0, subvolume); - ret = bkey_err(s.s_c); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (ret) { - if (fsck_err(trans, dirent_to_missing_subvol, - "dirent points to missing subvolume\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return bch2_fsck_remove_dirent(trans, d.k->p); - ret = 0; - goto out; - } - - if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { - printbuf_reset(&buf); - - prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", - parent_subvol); - - ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, - le64_to_cpu(s.v->inode) }, &buf); - if (ret) - goto err; - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, s.s_c); - - if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.fs_path_parent = cpu_to_le32(parent_subvol); - } - } - - u64 target_inum = le64_to_cpu(s.v->inode); - u32 target_snapshot = le32_to_cpu(s.v->snapshot); - - ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, - &subvol_root, 0); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (ret) { - bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; - } - - if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, - trans, inode_bi_parent_wrong, - "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", - target_inum, - subvol_root.bi_parent_subvol, parent_subvol)) { - subvol_root.bi_parent_subvol = parent_subvol; - subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot); - ret = __bch2_fsck_write_inode(trans, &subvol_root); - if (ret) - goto err; - } - - ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); - if (ret) - goto err; -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &subvol_iter); - printbuf_exit(&buf); - return ret; -} - -static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct bch_hash_info *hash_info, - struct inode_walker *dir, - struct inode_walker *target, - struct snapshots_seen *s, - bool *need_second_pass) -{ - struct bch_fs *c = trans->c; - struct inode_walker_entry *i; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) { - ret = ret < 0 ? 
ret : 0; - goto out; - } - - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); - if (ret) - goto err; - - if (k.k->type == KEY_TYPE_whiteout) - goto out; - - if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { - ret = check_subdir_dirents_count(trans, dir); - if (ret) - goto err; - } - - i = walk_inode(trans, dir, k); - ret = PTR_ERR_OR_ZERO(i); - if (ret < 0) - goto err; - - ret = check_key_has_inode(trans, iter, dir, i, k); - if (ret) - goto err; - - if (!i || i->whiteout) - goto out; - - if (dir->first_this_inode) - *hash_info = bch2_hash_info_init(c, &i->inode); - dir->first_this_inode = false; - - hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL; - - ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, - iter, k, need_second_pass); - if (ret < 0) - goto err; - if (ret) { - /* dirent has been deleted */ - ret = 0; - goto out; - } - - if (k.k->type != KEY_TYPE_dirent) - goto out; - - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - /* check casefold */ - if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding, - trans, dirent_casefold_mismatch, - "dirent casefold does not match dir casefold\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_parent_subvol) - : 0, - }; - u64 target = d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) - : le64_to_cpu(d.v->d_inum); - struct qstr name = bch2_dirent_get_name(d); - - struct bkey_i_dirent *new_d = - bch2_dirent_create_key(trans, hash_info, dir_inum, - d.v->d_type, &name, NULL, target); - ret = PTR_ERR_OR_ZERO(new_d); - if (ret) - goto out; - - new_d->k.p.inode = d.k->p.inode; - new_d->k.p.snapshot = d.k->p.snapshot; - - struct btree_iter dup_iter = {}; - ret = bch2_hash_delete_at(trans, - bch2_dirent_hash_desc, hash_info, iter, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_str_hash_repair_key(trans, s, - &bch2_dirent_hash_desc, hash_info, - iter, bkey_i_to_s_c(&new_d->k_i), - &dup_iter, bkey_s_c_null, - need_second_pass); - goto out; - } - - if (d.v->d_type == DT_SUBVOL) { - ret = check_dirent_to_subvol(trans, iter, d); - if (ret) - goto err; - } else { - ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); - if (ret) - goto err; - - if (fsck_err_on(!target->inodes.nr, - trans, dirent_to_missing_inode, - "dirent points to missing inode:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - ret = bch2_fsck_remove_dirent(trans, d.k->p); - if (ret) - goto err; - } - - darray_for_each(target->inodes, i) { - ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); - if (ret) - goto err; - } - - darray_for_each(target->deletes, i) - if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i), - trans, dirent_to_overwritten_inode, - "dirent points to inode overwritten in snapshot %u:\n%s", - *i, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - struct btree_iter delete_iter; - bch2_trans_iter_init(trans, &delete_iter, - BTREE_ID_dirents, - SPOS(k.k->p.inode, k.k->p.offset, *i), - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - hash_info, - &delete_iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &delete_iter); - if (ret) - goto err; - - } - } - - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - 
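-/* - * A hedged worked example of the accounting below: if this dirent is - * visible in, say, snapshots 4 and 7, each visible version of the - * directory inode gets i_size bumped by bkey_bytes(d.k) and, for DT_DIR - * entries, count++; check_subdir_count_notnested() later compares those - * counts against bi_nlink. - */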
for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { - if (d.v->d_type == DT_DIR) - i->count++; - i->i_size += bkey_bytes(d.k); - } -out: -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * Walk dirents: verify that they all have a corresponding S_ISDIR inode, - * validate d_type - */ -int bch2_check_dirents(struct bch_fs *c) -{ - struct inode_walker dir = inode_walker_init(); - struct inode_walker target = inode_walker_init(); - struct snapshots_seen s; - struct bch_hash_info hash_info; - bool need_second_pass = false, did_second_pass = false; - int ret; - - snapshots_seen_init(&s); -again: - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, - &need_second_pass)) ?: - check_subdir_count_notnested(trans, &dir)); - - if (!ret && need_second_pass && !did_second_pass) { - bch_info(c, "check_dirents requires second pass"); - swap(did_second_pass, need_second_pass); - goto again; - } - - if (!ret && need_second_pass) { - bch_err(c, "dirents not repairing"); - ret = -EINVAL; - } - - snapshots_seen_exit(&s); - inode_walker_exit(&dir); - inode_walker_exit(&target); - bch_err_fn(c, ret); - return ret; -} - -static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct bch_hash_info *hash_info, - struct inode_walker *inode) -{ - struct bch_fs *c = trans->c; - - int ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret < 0) - return ret; - if (ret) - return 0; - - struct inode_walker_entry *i = walk_inode(trans, inode, k); - ret = PTR_ERR_OR_ZERO(i); - if (ret) - return ret; - - ret = check_key_has_inode(trans, iter, inode, i, k); - if (ret) - return ret; - - if (!i || i->whiteout) - return 0; - - if (inode->first_this_inode) - *hash_info = bch2_hash_info_init(c, &i->inode); - inode->first_this_inode = false; - - bool need_second_pass = false; - return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, - iter, k, &need_second_pass); -} - -/* - * Walk xattrs: verify that they all have a corresponding inode - */ -int bch2_check_xattrs(struct bch_fs *c) -{ - struct inode_walker inode = inode_walker_init(); - struct bch_hash_info hash_info; - int ret = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - check_xattr(trans, &iter, k, &hash_info, &inode))); - - inode_walker_exit(&inode); - bch_err_fn(c, ret); - return ret; -} - -static int check_root_trans(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked root_inode; - u32 snapshot; - u64 inum; - int ret; - - ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, - "root subvol missing")) { - struct bkey_i_subvolume *root_subvol = - bch2_trans_kmalloc(trans, sizeof(*root_subvol)); - ret = PTR_ERR_OR_ZERO(root_subvol); - if (ret) - goto err; - - snapshot = U32_MAX; - inum = BCACHEFS_ROOT_INO; - - bkey_subvolume_init(&root_subvol->k_i); - root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_subvol->v.flags = 0; - root_subvol->v.snapshot = cpu_to_le32(snapshot); - root_subvol->v.inode = cpu_to_le64(inum); - ret = 
bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); - bch_err_msg(c, ret, "writing root subvol"); - if (ret) - goto err; - } - - ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, - &root_inode, 0); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (mustfix_fsck_err_on(ret, - trans, root_dir_missing, - "root directory missing") || - mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), - trans, root_inode_not_dir, - "root inode not a directory")) { - bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, - 0, NULL); - root_inode.bi_inum = inum; - root_inode.bi_snapshot = snapshot; - - ret = __bch2_fsck_write_inode(trans, &root_inode); - bch_err_msg(c, ret, "writing root inode"); - } -err: -fsck_err: - return ret; -} - -/* Get root directory, create if it doesn't exist: */ -int bch2_check_root(struct bch_fs *c) -{ - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_root_trans(trans)); - bch_err_fn(c, ret); - return ret; -} - -static bool darray_u32_has(darray_u32 *d, u32 v) -{ - darray_for_each(*d, i) - if (*i == v) - return true; - return false; -} - -static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct btree_iter parent_iter = {}; - darray_u32 subvol_path = {}; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - subvol_inum start = { - .subvol = k.k->p.offset, - .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode), - }; - - while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { - ret = darray_push(&subvol_path, k.k->p.offset); - if (ret) - goto err; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - - struct bch_inode_unpacked subvol_root; - ret = bch2_inode_find_by_inum_trans(trans, - (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, - &subvol_root); - if (ret) - break; - - u32 parent = le32_to_cpu(s.v->fs_path_parent); - - if (darray_u32_has(&subvol_path, parent)) { - printbuf_reset(&buf); - prt_printf(&buf, "subvolume loop: "); - - ret = bch2_inum_to_path(trans, start, &buf); - if (ret) - goto err; - - if (fsck_err(trans, subvol_loop, "%s", buf.buf)) - ret = reattach_subvol(trans, s); - break; - } - - bch2_trans_iter_exit(trans, &parent_iter); - bch2_trans_iter_init(trans, &parent_iter, - BTREE_ID_subvolumes, POS(0, parent), 0); - k = bch2_btree_iter_peek_slot(trans, &parent_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, - trans, subvol_unreachable, - "unreachable subvolume %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, s.s_c), - buf.buf))) { - ret = reattach_subvol(trans, s); - break; - } - } -fsck_err: -err: - printbuf_exit(&buf); - darray_exit(&subvol_path); - bch2_trans_iter_exit(trans, &parent_iter); - return ret; -} - -int bch2_check_subvolume_structure(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol_path(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_bi_depth_renumber_one(struct btree_trans *trans, - u64 inum, u32 snapshot, - u32 new_depth) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), 0); - - struct bch_inode_unpacked inode; - int ret = bkey_err(k) ?: - !bkey_is_inode(k.k) ? 
-BCH_ERR_ENOENT_inode - : bch2_inode_unpack(k, &inode); - if (ret) - goto err; - - if (inode.bi_depth != new_depth) { - inode.bi_depth = new_depth; - ret = __bch2_fsck_write_inode(trans, &inode) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path, - u32 snapshot, u32 new_bi_depth) -{ - u32 restart_count = trans->restart_count; - int ret = 0; - - darray_for_each_reverse(*path, i) { - ret = nested_lockrestart_do(trans, - bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth)); - bch_err_fn(trans->c, ret); - if (ret) - break; - - new_bi_depth++; - } - - return ret ?: trans_was_restarted(trans, restart_count); -} - -static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) -{ - struct bch_fs *c = trans->c; - struct btree_iter inode_iter = {}; - darray_u64 path = {}; - struct printbuf buf = PRINTBUF; - u32 snapshot = inode_k.k->p.snapshot; - bool redo_bi_depth = false; - u32 min_bi_depth = U32_MAX; - int ret = 0; - - struct bpos start = inode_k.k->p; - - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(inode_k, &inode); - if (ret) - return ret; - - /* - * If we're running full fsck, check_dirents() will have already run, - * and we shouldn't see any missing backpointers here - otherwise that's - * handled separately, by check_unreachable_inodes - */ - while (!inode.bi_subvol && - bch2_inode_has_backpointer(&inode)) { - struct btree_iter dirent_iter; - struct bkey_s_c_dirent d; - - d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot)); - ret = bkey_err(d.s_c); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto out; - - if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) - bch2_trans_iter_exit(trans, &dirent_iter); - - if (bch2_err_matches(ret, ENOENT)) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, inode_k); - bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", - bch2_err_str(ret), buf.buf); - goto out; - } - - bch2_trans_iter_exit(trans, &dirent_iter); - - ret = darray_push(&path, inode.bi_inum); - if (ret) - return ret; - - bch2_trans_iter_exit(trans, &inode_iter); - inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, inode.bi_dir, snapshot), 0); - - struct bch_inode_unpacked parent_inode; - ret = bkey_err(inode_k) ?: - !bkey_is_inode(inode_k.k) ?
-BCH_ERR_ENOENT_inode - : bch2_inode_unpack(inode_k, &parent_inode); - if (ret) { - /* Should have been caught in dirents pass */ - bch_err_msg(c, ret, "error looking up parent directory"); - goto out; - } - - min_bi_depth = parent_inode.bi_depth; - - if (parent_inode.bi_depth < inode.bi_depth && - min_bi_depth < U16_MAX) - break; - - inode = parent_inode; - redo_bi_depth = true; - - if (darray_find(path, inode.bi_inum)) { - printbuf_reset(&buf); - prt_printf(&buf, "directory structure loop in snapshot %u: ", - snapshot); - - ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf); - if (ret) - goto out; - - if (c->opts.verbose) { - prt_newline(&buf); - darray_for_each(path, i) - prt_printf(&buf, "%llu ", *i); - } - - if (fsck_err(trans, dir_loop, "%s", buf.buf)) { - ret = remove_backpointer(trans, &inode); - bch_err_msg(c, ret, "removing dirent"); - if (ret) - goto out; - - ret = reattach_inode(trans, &inode); - bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); - } - - goto out; - } - } - - if (inode.bi_subvol) - min_bi_depth = 0; - - if (redo_bi_depth) - ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth); -out: -fsck_err: - bch2_trans_iter_exit(trans, &inode_iter); - darray_exit(&path); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -/* - * Check for loops in the directory structure: all other connectivity issues - * have been fixed by prior passes - */ -int bch2_check_directory_structure(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_intent| - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - if (!S_ISDIR(bkey_inode_mode(k))) - continue; - - if (bch2_inode_flags(k) & BCH_INODE_unlinked) - continue; - - check_path_loop(trans, k); - }))); - - bch_err_fn(c, ret); - return ret; -} - -struct nlink_table { - size_t nr; - size_t size; - - struct nlink { - u64 inum; - u32 snapshot; - u32 count; - } *d; -}; - -static int add_nlink(struct bch_fs *c, struct nlink_table *t, - u64 inum, u32 snapshot) -{ - if (t->nr == t->size) { - size_t new_size = max_t(size_t, 128UL, t->size * 2); - void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); - - if (!d) { - bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", - new_size); - return bch_err_throw(c, ENOMEM_fsck_add_nlink); - } - - if (t->d) - memcpy(d, t->d, t->size * sizeof(t->d[0])); - kvfree(t->d); - - t->d = d; - t->size = new_size; - } - - - t->d[t->nr++] = (struct nlink) { - .inum = inum, - .snapshot = snapshot, - }; - - return 0; -} - -static int nlink_cmp(const void *_l, const void *_r) -{ - const struct nlink *l = _l; - const struct nlink *r = _r; - - return cmp_int(l->inum, r->inum); -} - -static void inc_link(struct bch_fs *c, struct snapshots_seen *s, - struct nlink_table *links, - u64 range_start, u64 range_end, u64 inum, u32 snapshot) -{ - struct nlink *link, key = { - .inum = inum, .snapshot = U32_MAX, - }; - - if (inum < range_start || inum >= range_end) - return; - - link = __inline_bsearch(&key, links->d, links->nr, - sizeof(links->d[0]), nlink_cmp); - if (!link) - return; - - while (link > links->d && link[0].inum == link[-1].inum) - --link; - - for (; link < links->d + links->nr && link->inum == inum; link++) - if (ref_visible(c, s, snapshot, link->snapshot)) { - link->count++; - if (link->snapshot >= snapshot) - break; - } -} - -noinline_for_stack -static int check_nlinks_find_hardlinks(struct 
bch_fs *c, - struct nlink_table *t, - u64 start, u64 *end) -{ - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_inodes, - POS(0, start), - BTREE_ITER_intent| - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - if (!bkey_is_inode(k.k)) - continue; - - /* Should never fail, checked by bch2_inode_invalid: */ - struct bch_inode_unpacked u; - _ret3 = bch2_inode_unpack(k, &u); - if (_ret3) - break; - - /* - * Backpointer and directory structure checks are sufficient for - * directories, since they can't have hardlinks: - */ - if (S_ISDIR(u.bi_mode)) - continue; - - /* - * Previous passes ensured that bi_nlink is nonzero if - * it had multiple hardlinks: - */ - if (!u.bi_nlink) - continue; - - ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); - if (ret) { - *end = k.k->p.offset; - ret = 0; - break; - } - 0; - }))); - - bch_err_fn(c, ret); - return ret; -} - -noinline_for_stack -static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, - u64 range_start, u64 range_end) -{ - struct snapshots_seen s; - - snapshots_seen_init(&s); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_intent| - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); - if (ret) - break; - - if (k.k->type == KEY_TYPE_dirent) { - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - if (d.v->d_type != DT_DIR && - d.v->d_type != DT_SUBVOL) - inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), d.k->p.snapshot); - } - 0; - }))); - - snapshots_seen_exit(&s); - - bch_err_fn(c, ret); - return ret; -} - -static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct nlink_table *links, - size_t *idx, u64 range_end) -{ - struct bch_inode_unpacked u; - struct nlink *link = &links->d[*idx]; - int ret = 0; - - if (k.k->p.offset >= range_end) - return 1; - - if (!bkey_is_inode(k.k)) - return 0; - - ret = bch2_inode_unpack(k, &u); - if (ret) - return ret; - - if (S_ISDIR(u.bi_mode)) - return 0; - - if (!u.bi_nlink) - return 0; - - while ((cmp_int(link->inum, k.k->p.offset) ?: - cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { - BUG_ON(*idx == links->nr); - link = &links->d[++*idx]; - } - - if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, - trans, inode_wrong_nlink, - "inode %llu type %s has wrong i_nlink (%u, should be %u)", - u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], - bch2_inode_nlink_get(&u), link->count)) { - bch2_inode_nlink_set(&u, link->count); - ret = __bch2_fsck_write_inode(trans, &u); - } -fsck_err: - return ret; -} - -noinline_for_stack -static int check_nlinks_update_hardlinks(struct bch_fs *c, - struct nlink_table *links, - u64 range_start, u64 range_end) -{ - size_t idx = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS(0, range_start), - BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); - if (ret < 0) { - bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); - return ret; - } - - return 0; -} - -int bch2_check_nlinks(struct bch_fs *c) -{ - struct nlink_table links = { 0 }; - u64 this_iter_range_start, next_iter_range_start = 0; - int ret = 0; - - do { - this_iter_range_start = next_iter_range_start; - next_iter_range_start = U64_MAX; - - ret = 
check_nlinks_find_hardlinks(c, &links, - this_iter_range_start, - &next_iter_range_start); - - ret = check_nlinks_walk_dirents(c, &links, - this_iter_range_start, - next_iter_range_start); - if (ret) - break; - - ret = check_nlinks_update_hardlinks(c, &links, - this_iter_range_start, - next_iter_range_start); - if (ret) - break; - - links.nr = 0; - } while (next_iter_range_start != U64_MAX); - - kvfree(links.d); - bch_err_fn(c, ret); - return ret; -} - -static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_p p; - struct bkey_i_reflink_p *u; - - if (k.k->type != KEY_TYPE_reflink_p) - return 0; - - p = bkey_s_c_to_reflink_p(k); - - if (!p.v->front_pad && !p.v->back_pad) - return 0; - - u = bch2_trans_kmalloc(trans, sizeof(*u)); - int ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - bkey_reassemble(&u->k_i, k); - u->v.front_pad = 0; - u->v.back_pad = 0; - - return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); -} - -int bch2_fix_reflink_p(struct bch_fs *c) -{ - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) - return 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_extents, POS_MIN, - BTREE_ITER_intent|BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - fix_reflink_p_key(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -#ifndef NO_BCACHEFS_CHARDEV - -struct fsck_thread { - struct thread_with_stdio thr; - struct bch_fs *c; - struct bch_opts opts; -}; - -static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) -{ - struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - kfree(thr); -} - -static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - int ret = PTR_ERR_OR_ZERO(c); - if (ret) - return ret; - - ret = bch2_fs_start(thr->c); - if (ret) - goto err; - - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); - ret |= 1; - } - if (test_bit(BCH_FS_error, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); - ret |= 4; - } -err: - bch2_fs_stop(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_offline_thread_fn, -}; - -long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) -{ - struct bch_ioctl_fsck_offline arg; - struct fsck_thread *thr = NULL; - darray_const_str devs = {}; - long ret = 0; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - for (size_t i = 0; i < arg.nr_devs; i++) { - u64 dev_u64; - ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); - if (ret) - goto err; - - char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); - ret = PTR_ERR_OR_ZERO(dev_str); - if (ret) - goto err; - - ret = darray_push(&devs, dev_str); - if (ret) { - kfree(dev_str); - goto err; - } - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - ret = PTR_ERR_OR_ZERO(optstr) ?: - 
bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - opt_set(thr->opts, read_only, 1); - opt_set(thr->opts, ratelimit_errors, 0); - - /* We need request_key() to be called before we punt to kthread: */ - opt_set(thr->opts, nostart, true); - - bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - - thr->c = bch2_fs_open(&devs, &thr->opts); - - if (!IS_ERR(thr->c) && - thr->c->opts.errors == BCH_ON_ERROR_panic) - thr->c->opts.errors = BCH_ON_ERROR_ro; - - ret = __bch2_run_thread_with_stdio(&thr->thr); -out: - darray_for_each(devs, i) - kfree(*i); - darray_exit(&devs); - return ret; -err: - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - goto out; -} - -static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - c->stdio_filter = current; - c->stdio = &thr->thr.stdio; - - /* - * XXX: can we figure out a way to do this without mucking with c->opts? - */ - unsigned old_fix_errors = c->opts.fix_errors; - if (opt_defined(thr->opts, fix_errors)) - c->opts.fix_errors = thr->opts.fix_errors; - else - c->opts.fix_errors = FSCK_FIX_ask; - - c->opts.fsck = true; - set_bit(BCH_FS_in_fsck, &c->flags); - - int ret = bch2_run_online_recovery_passes(c, ~0ULL); - - clear_bit(BCH_FS_in_fsck, &c->flags); - bch_err_fn(c, ret); - - c->stdio = NULL; - c->stdio_filter = NULL; - c->opts.fix_errors = old_fix_errors; - - up(&c->recovery.run_lock); - bch2_ro_ref_put(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_online_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_online_thread_fn, -}; - -long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) -{ - struct fsck_thread *thr = NULL; - long ret = 0; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!bch2_ro_ref_tryget(c)) - return -EROFS; - - if (down_trylock(&c->recovery.run_lock)) { - bch2_ro_ref_put(c); - return -EAGAIN; - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->c = c; - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); -err: - if (ret < 0) { - bch_err_fn(c, ret); - if (thr) - bch2_fsck_thread_exit(&thr->thr); - up(&c->recovery.run_lock); - bch2_ro_ref_put(c); - } - return ret; -} - -#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h deleted file mode 100644 index e5fe7cf7b25141..00000000000000 --- a/fs/bcachefs/fsck.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FSCK_H -#define _BCACHEFS_FSCK_H - -#include "str_hash.h" - -/* records snapshot IDs of overwrites at @pos */ -struct snapshots_seen { - struct bpos pos; - snapshot_id_list ids; -}; - -int bch2_fsck_update_backpointers(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc, - struct bch_hash_info *, - struct bkey_i *); - -int bch2_check_inodes(struct bch_fs *); -int bch2_check_extents(struct bch_fs *); -int
bch2_check_indirect_extents(struct bch_fs *); -int bch2_check_dirents(struct bch_fs *); -int bch2_check_xattrs(struct bch_fs *); -int bch2_check_root(struct bch_fs *); -int bch2_check_subvolume_structure(struct bch_fs *); -int bch2_check_unreachable_inodes(struct bch_fs *); -int bch2_check_directory_structure(struct bch_fs *); -int bch2_check_nlinks(struct bch_fs *); -int bch2_fix_reflink_p(struct bch_fs *); - -long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *); -long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online); - -#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c deleted file mode 100644 index ef4cc7395b86b2..00000000000000 --- a/fs/bcachefs/inode.c +++ /dev/null @@ -1,1566 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_key_cache.h" -#include "btree_write_buffer.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "buckets.h" -#include "compress.h" -#include "dirent.h" -#include "disk_accounting.h" -#include "error.h" -#include "extents.h" -#include "extent_update.h" -#include "fs.h" -#include "inode.h" -#include "namei.h" -#include "opts.h" -#include "str_hash.h" -#include "snapshot.h" -#include "subvolume.h" -#include "varint.h" - -#include <linux/random.h> - -#include <linux/unaligned.h> - -#define x(name, ...) #name, -const char * const bch2_inode_opts[] = { - BCH_INODE_OPTS() - NULL, -}; - -static const char * const bch2_inode_flag_strs[] = { - BCH_INODE_FLAGS() - NULL -}; -#undef x - -static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); -static int may_delete_deleted_inum(struct btree_trans *, subvol_inum); - -static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; - -static int inode_decode_field(const u8 *in, const u8 *end, - u64 out[2], unsigned *out_bits) -{ - __be64 be[2] = { 0, 0 }; - unsigned bytes, shift; - u8 *p; - - if (in >= end) - return -BCH_ERR_inode_unpack_error; - - if (!*in) - return -BCH_ERR_inode_unpack_error; - - /* - * position of highest set bit indicates number of bytes: - * shift = number of bits to remove in high byte: - */ - shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ - bytes = byte_table[shift - 1]; - - if (in + bytes > end) - return -BCH_ERR_inode_unpack_error; - - p = (u8 *) be + 16 - bytes; - memcpy(p, in, bytes); - *p ^= (1 << 8) >> shift; - - out[0] = be64_to_cpu(be[0]); - out[1] = be64_to_cpu(be[1]); - *out_bits = out[0] ?
64 + fls64(out[0]) : fls64(out[1]); - - return bytes; -} - -static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - struct bkey_i_inode_v3 *k = &packed->inode; - u8 *out = k->v.fields; - u8 *end = (void *) &packed[1]; - u8 *last_nonzero_field = out; - unsigned nr_fields = 0, last_nonzero_fieldnr = 0; - unsigned bytes; - int ret; - - bkey_inode_v3_init(&packed->inode.k_i); - packed->inode.k.p.offset = inode->bi_inum; - packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); - packed->inode.v.bi_hash_seed = inode->bi_hash_seed; - packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); - packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); - packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); - packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); - SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); - SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); - - -#define x(_name, _bits) \ - nr_fields++; \ - \ - if (inode->_name) { \ - ret = bch2_varint_encode_fast(out, inode->_name); \ - out += ret; \ - \ - if (_bits > 64) \ - *out++ = 0; \ - \ - last_nonzero_field = out; \ - last_nonzero_fieldnr = nr_fields; \ - } else { \ - *out++ = 0; \ - \ - if (_bits > 64) \ - *out++ = 0; \ - } - - BCH_INODE_FIELDS_v3() -#undef x - BUG_ON(out > end); - - out = last_nonzero_field; - nr_fields = last_nonzero_fieldnr; - - bytes = out - (u8 *) &packed->inode.v; - set_bkey_val_bytes(&packed->inode.k, bytes); - memset_u64s_tail(&packed->inode.v, 0, bytes); - - SET_INODEv3_NR_FIELDS(&k->v, nr_fields); - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - struct bch_inode_unpacked unpacked; - - ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); - BUG_ON(ret); - BUG_ON(unpacked.bi_inum != inode->bi_inum); - BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); - BUG_ON(unpacked.bi_sectors != inode->bi_sectors); - BUG_ON(unpacked.bi_size != inode->bi_size); - BUG_ON(unpacked.bi_version != inode->bi_version); - BUG_ON(unpacked.bi_mode != inode->bi_mode); - -#define x(_name, _bits) if (unpacked._name != inode->_name) \ - panic("unpacked %llu should be %llu", \ - (u64) unpacked._name, (u64) inode->_name); - BCH_INODE_FIELDS_v3() -#undef x - } -} - -void bch2_inode_pack(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - bch2_inode_pack_inlined(packed, inode); -} - -static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) -{ - const u8 *in = inode.v->fields; - const u8 *end = bkey_val_end(inode); - u64 field[2]; - unsigned fieldnr = 0, field_bits; - int ret; - -#define x(_name, _bits) \ - if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \ - unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ - memset((void *) unpacked + offset, 0, \ - sizeof(*unpacked) - offset); \ - return 0; \ - } \ - \ - ret = inode_decode_field(in, end, field, &field_bits); \ - if (ret < 0) \ - return ret; \ - \ - if (field_bits > sizeof(unpacked->_name) * 8) \ - return -BCH_ERR_inode_unpack_error; \ - \ - unpacked->_name = field[1]; \ - in += ret; - - BCH_INODE_FIELDS_v2() -#undef x - - /* XXX: signal if there were more fields than expected? 
*/ - return 0; -} - -static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, - const u8 *in, const u8 *end, - unsigned nr_fields) -{ - unsigned fieldnr = 0; - int ret; - u64 v[2]; - -#define x(_name, _bits) \ - if (fieldnr < nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v[0]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - \ - if (_bits > 64) { \ - ret = bch2_varint_decode_fast(in, end, &v[1]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v[1] = 0; \ - } \ - } else { \ - v[0] = v[1] = 0; \ - } \ - \ - unpacked->_name = v[0]; \ - if (v[1] || v[0] != unpacked->_name) \ - return -BCH_ERR_inode_unpack_error; \ - fieldnr++; - - BCH_INODE_FIELDS_v2() -#undef x - - /* XXX: signal if there were more fields than expected? */ - return 0; -} - -static int bch2_inode_unpack_v3(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) -{ - struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); - const u8 *in = inode.v->fields; - const u8 *end = bkey_val_end(inode); - unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); - unsigned fieldnr = 0; - int ret; - u64 v[2]; - - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); - unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); - unpacked->bi_size = le64_to_cpu(inode.v->bi_size); - unpacked->bi_version = le64_to_cpu(inode.v->bi_version); - unpacked->bi_mode = INODEv3_MODE(inode.v); - -#define x(_name, _bits) \ - if (fieldnr < nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v[0]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - \ - if (_bits > 64) { \ - ret = bch2_varint_decode_fast(in, end, &v[1]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v[1] = 0; \ - } \ - } else { \ - v[0] = v[1] = 0; \ - } \ - \ - unpacked->_name = v[0]; \ - if (v[1] || v[0] != unpacked->_name) \ - return -BCH_ERR_inode_unpack_error; \ - fieldnr++; - - BCH_INODE_FIELDS_v3() -#undef x - - /* XXX: signal if there were more fields than expected? 
*/ - return 0; -} - -static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) -{ - memset(unpacked, 0, sizeof(*unpacked)); - - switch (k.k->type) { - case KEY_TYPE_inode: { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= 0; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - - if (INODEv1_NEW_VARINT(inode.v)) { - return bch2_inode_unpack_v2(unpacked, inode.v->fields, - bkey_val_end(inode), - INODEv1_NR_FIELDS(inode.v)); - } else { - return bch2_inode_unpack_v1(inode, unpacked); - } - break; - } - case KEY_TYPE_inode_v2: { - struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - - return bch2_inode_unpack_v2(unpacked, inode.v->fields, - bkey_val_end(inode), - INODEv2_NR_FIELDS(inode.v)); - } - default: - BUG(); - } -} - -int bch2_inode_unpack(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) -{ - return likely(k.k->type == KEY_TYPE_inode_v3) - ? bch2_inode_unpack_v3(k, unpacked) - : bch2_inode_unpack_slowpath(k, unpacked); -} - -int __bch2_inode_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags, - bool warn) -{ - u32 snapshot; - int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn); - if (ret) - return ret; - - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - flags|BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; - if (ret) - goto err; - - ret = bch2_inode_unpack(k, inode); - if (ret) - goto err; - - return 0; -err: - if (warn) - bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum); - bch2_trans_iter_exit(trans, iter); - return ret; -} - -int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, - u64 inode_nr, u32 snapshot, - struct bch_inode_unpacked *inode, - unsigned flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode_nr, snapshot), flags); - int ret = bkey_err(k); - if (ret) - goto err; - - ret = bkey_is_inode(k.k) - ? 
bch2_inode_unpack(k, inode) - : -BCH_ERR_ENOENT_inode; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); -} - -int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, - struct bch_inode_unpacked *root) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) { - ret = bch2_inode_unpack(k, root); - goto out; - } - } - /* We're only called when we know we have an inode for @inum */ - BUG_ON(!ret); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_write_flags(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_inode_buf *inode_p; - - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack_inlined(inode_p, inode); - inode_p->inode.k.p.snapshot = iter->snapshot; - return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); -} - -int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) -{ - struct bkey_inode_buf *inode_p = - bch2_trans_kmalloc(trans, sizeof(*inode_p)); - - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode); - inode_p->inode.k.p.snapshot = inode->bi_snapshot; - - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &inode_p->inode.k_i, - BTREE_UPDATE_internal_snapshot_node); -} - -int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_fsck_write_inode(trans, inode)); - bch_err_fn(trans->c, ret); - return ret; -} - -struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) -{ - struct bch_inode_unpacked u; - struct bkey_inode_buf *inode_p; - int ret; - - if (!bkey_is_inode(&k->k)) - return ERR_PTR(-ENOENT); - - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return ERR_CAST(inode_p); - - ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); - if (ret) - return ERR_PTR(ret); - - bch2_inode_pack(inode_p, &u); - return &inode_p->inode.k_i; -} - -static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bch_inode_unpacked unpacked; - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode, - c, inode_pos_inode_nonzero, - "nonzero k.p.inode"); - - bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, - c, inode_pos_blockdev_range, - "fs inode in blockdev range"); - - bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), - c, 
inode_unpack_error, - "invalid variable length fields"); - - bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, - c, inode_checksum_type_invalid, - "invalid data checksum type (%u >= %u)", - unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); - - bkey_fsck_err_on(unpacked.bi_compression && - !bch2_compression_opt_valid(unpacked.bi_compression - 1), - c, inode_compression_type_invalid, - "invalid compression opt %u", unpacked.bi_compression - 1); - - bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && - unpacked.bi_nlink != 0, - c, inode_unlinked_but_nlink_nonzero, - "flagged as unlinked but bi_nlink != 0"); - - bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), - c, inode_subvol_root_but_not_dir, - "subvolume root but not a directory"); -fsck_err: - return ret; -} - -int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - int ret = 0; - - bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR, - c, inode_str_hash_invalid, - "invalid str hash type (%llu >= %u)", - INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); - - ret = __bch2_inode_validate(c, k, from); -fsck_err: - return ret; -} - -int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - int ret = 0; - - bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, - c, inode_str_hash_invalid, - "invalid str hash type (%llu >= %u)", - INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - - ret = __bch2_inode_validate(c, k, from); -fsck_err: - return ret; -} - -int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); - int ret = 0; - - bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || - INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), - c, inode_v3_fields_start_bad, - "invalid fields_start (got %llu, min %u max %zu)", - INODEv3_FIELDS_START(inode.v), - INODEv3_FIELDS_START_INITIAL, - bkey_val_u64s(inode.k)); - - bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, - c, inode_str_hash_invalid, - "invalid str hash type (%llu >= %u)", - INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - - ret = __bch2_inode_validate(c, k, from); -fsck_err: - return ret; -} - -static void __bch2_inode_unpacked_to_text(struct printbuf *out, - struct bch_inode_unpacked *inode) -{ - prt_printf(out, "\n"); - printbuf_indent_add(out, 2); - prt_printf(out, "mode=%o\n", inode->bi_mode); - - prt_str(out, "flags="); - prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); - prt_printf(out, "(%x)\n", inode->bi_flags); - - prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); - prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed); - prt_printf(out, "hash_type="); - bch2_prt_str_hash_type(out, INODE_STR_HASH(inode)); - prt_newline(out); - prt_printf(out, "bi_size=%llu\n", inode->bi_size); - prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); - prt_printf(out, "bi_version=%llu\n", inode->bi_version); - -#define x(_name, _bits) \ - prt_printf(out, #_name "=%llu\n", (u64) inode->_name); - BCH_INODE_FIELDS_v3() -#undef x - - bch2_printbuf_strip_trailing_newline(out); - printbuf_indent_sub(out, 2); -} - -void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) -{ - prt_printf(out, "inum: %llu:%u ", inode->bi_inum,
inode->bi_snapshot); - __bch2_inode_unpacked_to_text(out, inode); -} - -void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bch_inode_unpacked inode; - - if (bch2_inode_unpack(k, &inode)) { - prt_printf(out, "(unpack error)"); - return; - } - - __bch2_inode_unpacked_to_text(out, &inode); -} - -static inline u64 bkey_inode_flags(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); - case KEY_TYPE_inode_v2: - return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); - case KEY_TYPE_inode_v3: - return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); - default: - return 0; - } -} - -static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); - return; - case KEY_TYPE_inode_v2: - bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); - return; - case KEY_TYPE_inode_v3: - bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); - return; - default: - BUG(); - } -} - -static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) -{ - unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; - - return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); -} - -static struct bkey_s_c -bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos pos, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_max_norestart(trans, *iter, btree, - bpos_successor(pos), - SPOS(pos.inode, pos.offset, U32_MAX), - flags|BTREE_ITER_all_snapshots, k, ret) - if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) - return k; - - bch2_trans_iter_exit(trans, iter); - return ret ? bkey_s_c_err(ret) : bkey_s_c_null; -} - -static struct bkey_s_c -bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos, unsigned flags) -{ - struct bkey_s_c k; -again: - k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); - if (!k.k || - bkey_err(k) || - bkey_is_inode(k.k)) - return k; - - bch2_trans_iter_exit(trans, iter); - pos = k.k->p; - goto again; -} - -int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_max_norestart(trans, iter, - BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), - BTREE_ITER_all_snapshots| - BTREE_ITER_with_updates, k, ret) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && - bkey_is_inode(k.k)) { - ret = 1; - break; - } - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int update_inode_has_children(struct btree_trans *trans, - struct bkey_s k, - bool have_child) -{ - if (!have_child) { - int ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret) - return ret < 0 ? 
ret : 0; - } - - u64 f = bkey_inode_flags(k.s_c); - if (have_child != !!(f & BCH_INODE_has_child_snapshot)) - bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); - - return 0; -} - -static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, - bool have_child) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, - &iter, pos, BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - if (!k.k) - return 0; - - if (!have_child) { - ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret) { - ret = ret < 0 ? ret : 0; - goto err; - } - } - - u64 f = bkey_inode_flags(k); - if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { - struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, - BTREE_UPDATE_internal_snapshot_node); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_trigger_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - - if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { - BUG_ON(!trans->journal_res.seq); - bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); - } - - s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; - if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { - int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); - if (ret) - return ret; - } - - if (flags & BTREE_TRIGGER_transactional) { - int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - - (int) bkey_is_unlinked_inode(old); - if (unlinked_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, - new.k->p, unlinked_delta > 0); - if (ret) - return ret; - } - - /* - * If we're creating or deleting an inode at this snapshot ID, - * and there might be an inode in a parent snapshot ID, we might - * need to set or clear the has_child_snapshot flag on the - * parent. 
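* (Concretely: update_parent_inode_has_children() below looks up the nearest ancestor snapshot's inode and toggles BCH_INODE_has_child_snapshot on it, while update_inode_has_children() re-checks and clears the flag on the inode itself once a new snapshot version exists.)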
- */ - int deleted_delta = (int) bkey_is_inode(new.k) - - (int) bkey_is_inode(old.k); - if (deleted_delta && - bch2_snapshot_parent(c, new.k->p.snapshot)) { - int ret = update_parent_inode_has_children(trans, new.k->p, - deleted_delta > 0); - if (ret) - return ret; - } - - /* - * When an inode is first updated in a new snapshot, we may need - * to clear has_child_snapshot - */ - if (deleted_delta > 0) { - int ret = update_inode_has_children(trans, new, false); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode, - c, inode_pos_inode_nonzero, - "nonzero k.p.inode"); -fsck_err: - return ret; -} - -void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); - - prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); -} - -int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, - c, inode_alloc_cursor_inode_bad, - "k.p.inode bad"); -fsck_err: - return ret; -} - -void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); - - prt_printf(out, "idx %llu generation %llu", - le64_to_cpu(i.v->idx), - le64_to_cpu(i.v->gen)); -} - -void bch2_inode_init_early(struct bch_fs *c, - struct bch_inode_unpacked *inode_u) -{ - enum bch_str_hash_type str_hash = - bch2_str_hash_opt_to_type(c, c->opts.str_hash); - - memset(inode_u, 0, sizeof(*inode_u)); - - SET_INODE_STR_HASH(inode_u, str_hash); - get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); -} - -void bch2_inode_init_late(struct bch_fs *c, - struct bch_inode_unpacked *inode_u, u64 now, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct bch_inode_unpacked *parent) -{ - inode_u->bi_mode = mode; - inode_u->bi_uid = uid; - inode_u->bi_gid = gid; - inode_u->bi_dev = rdev; - inode_u->bi_atime = now; - inode_u->bi_mtime = now; - inode_u->bi_ctime = now; - inode_u->bi_otime = now; - - if (parent && parent->bi_mode & S_ISGID) { - inode_u->bi_gid = parent->bi_gid; - if (S_ISDIR(mode)) - inode_u->bi_mode |= S_ISGID; - } - - if (parent) { -#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; - BCH_INODE_OPTS() -#undef x - } - - if (!S_ISDIR(mode)) - inode_u->bi_casefold = 0; - - if (bch2_inode_casefold(c, inode_u)) - inode_u->bi_flags |= BCH_INODE_has_case_insensitive; -} - -void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct bch_inode_unpacked *parent) -{ - bch2_inode_init_early(c, inode_u); - bch2_inode_init_late(c, inode_u, bch2_current_time(c), - uid, gid, mode, rdev, parent); -} - -static struct bkey_i_inode_alloc_cursor * -bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) -{ - struct bch_fs *c = trans->c; - - u64 cursor_idx = c->opts.inodes_32bit ? 
0 : cpu + 1; - - cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), - BTREE_ITER_cached); - int ret = bkey_err(k); - if (ret) - return ERR_PTR(ret); - - struct bkey_i_inode_alloc_cursor *cursor = - k.k->type == KEY_TYPE_inode_alloc_cursor - ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) - : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); - ret = PTR_ERR_OR_ZERO(cursor); - if (ret) - goto err; - - if (c->opts.inodes_32bit) { - *min = BLOCKDEV_INODE_MAX; - *max = INT_MAX; - } else { - cursor->v.bits = c->opts.shard_inode_numbers_bits; - - unsigned bits = 63 - c->opts.shard_inode_numbers_bits; - - *min = max(cpu << bits, (u64) INT_MAX + 1); - *max = (cpu << bits) | ~(ULLONG_MAX << bits); - } - - if (le64_to_cpu(cursor->v.idx) < *min) - cursor->v.idx = cpu_to_le64(*min); - - if (le64_to_cpu(cursor->v.idx) >= *max) { - cursor->v.idx = cpu_to_le64(*min); - le32_add_cpu(&cursor->v.gen, 1); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret ? ERR_PTR(ret) : cursor; -} - -/* - * This just finds an empty slot: - */ -int bch2_inode_create(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) -{ - u64 min, max; - struct bkey_i_inode_alloc_cursor *cursor = - bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); - int ret = PTR_ERR_OR_ZERO(cursor); - if (ret) - return ret; - - u64 start = le64_to_cpu(cursor->v.idx); - u64 pos = start; - - bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_all_snapshots| - BTREE_ITER_intent); - struct bkey_s_c k; -again: - while ((k = bch2_btree_iter_peek(trans, iter)).k && - !(ret = bkey_err(k)) && - bkey_lt(k.k->p, POS(0, max))) { - if (pos < iter->pos.offset) - goto found_slot; - - /* - * We don't need to iterate over keys in every snapshot once - * we've found just one: - */ - pos = iter->pos.offset + 1; - bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); - } - - if (!ret && pos < max) - goto found_slot; - - if (!ret && start == min) - ret = bch_err_throw(trans->c, ENOSPC_inode_create); - - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ret; - } - - /* Retry from start */ - pos = start = min; - bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); - le32_add_cpu(&cursor->v.gen, 1); - goto again; -found_slot: - bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot)); - k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ret; - } - - inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = le64_to_cpu(cursor->v.gen); - cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); - return 0; -} - -static int bch2_inode_delete_keys(struct btree_trans *trans, - subvol_inum inum, enum btree_id id) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i delete; - struct bpos end = POS(inum.inum, U64_MAX); - u32 snapshot; - int ret = 0; - - /* - * We're never going to be deleting partial extents, no need to use an - * extent iterator: - */ - bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_intent); - - while (1) { - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - k = bch2_btree_iter_peek_max(trans, &iter, end); - ret = bkey_err(k); - if (ret) - goto err; 
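This deletion loop is one more instance of the transaction-restart idiom used throughout these files: each pass re-begins the transaction, refreshes the snapshot, does one unit of work, and commits, and a transaction-restart error from any step simply goes around the loop again instead of being returned. Reduced to its per-attempt shape, with hypothetical stubs standing in for the bcachefs API (the real check is bch2_err_matches(ret, BCH_ERR_transaction_restart)):

	struct trans;				/* opaque transaction handle */

	void trans_begin(struct trans *);	/* resets per-attempt state */
	int one_attempt(struct trans *);	/* e.g. peek key + queue delete */
	int trans_commit(struct trans *);

	enum { RESTART = 1 };			/* hypothetical restart code */

	/* Retry the whole attempt on a transaction restart; propagate
	 * any other error (0 on success, negative errno otherwise). */
	static int do_restartable(struct trans *trans)
	{
		int ret;

		do {
			trans_begin(trans);
			ret = one_attempt(trans) ?: trans_commit(trans);
		} while (ret == RESTART);

		return ret;
	}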
- - if (!k.k) - break; - - bkey_init(&delete.k); - delete.k.p = iter.pos; - - if (iter.flags & BTREE_ITER_is_extents) - bch2_key_resize(&delete.k, - bpos_min(end, k.k->p).offset - - iter.pos.offset); - - ret = bch2_trans_update(trans, &iter, &delete, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = {}; - struct bkey_s_c k; - u32 snapshot; - int ret; - - ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum)); - if (ret) - goto err2; - - /* - * If this was a directory, there shouldn't be any real dirents left - - * but there could be whiteouts (from hash collisions) that we should - * delete: - * - * XXX: the dirent code ideally would delete whiteouts when they're no - * longer needed - */ - ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: - bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: - bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); - if (ret) - goto err2; -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - BTREE_ITER_intent|BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(c, - "inode %llu:%u not found when deleting", - inum.inum, snapshot); - ret = bch_err_throw(c, ENOENT_inode); - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -err: - bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) - goto err2; - - ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); -err2: - bch2_trans_put(trans); - return ret; -} - -int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) -{ - if (bi->bi_flags & BCH_INODE_unlinked) - bi->bi_flags &= ~BCH_INODE_unlinked; - else { - if (bi->bi_nlink == U32_MAX) - return -EINVAL; - - bi->bi_nlink++; - } - - return 0; -} - -void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) -{ - if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) { - bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", - bi->bi_inum); - return; - } - - if (bi->bi_flags & BCH_INODE_unlinked) { - bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); - return; - } - - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_unlinked; -} - -struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) -{ - struct bch_opts ret = { 0 }; -#define x(_name, _bits) \ - if (inode->bi_##_name) \ - opt_set(ret, _name, inode->bi_##_name - 1); - BCH_INODE_OPTS() -#undef x - return ret; -} - -void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, - struct bch_inode_unpacked *inode) -{ -#define x(_name, _bits) \ - if ((inode)->bi_##_name) { \ - opts->_name = inode->bi_##_name - 1; \ - opts->_name##_from_inode = true; \ - } else { \ - opts->_name = c->opts._name; \ - opts->_name##_from_inode = false; \ - } - BCH_INODE_OPTS() -#undef x - - bch2_io_opts_fixups(opts); -} - -int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct 
bch_io_opts *opts) -{ - struct bch_inode_unpacked inode; - int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode)); - - if (ret) - return ret; - - bch2_inode_opts_get(opts, trans->c, &inode); - return 0; -} - -int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *bi, unsigned v) -{ - struct bch_fs *c = trans->c; - -#ifndef CONFIG_UNICODE - bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); - return -EOPNOTSUPP; -#endif - - if (c->opts.casefold_disabled) - return -EOPNOTSUPP; - - int ret = 0; - /* Not supported on individual files. */ - if (!S_ISDIR(bi->bi_mode)) - return -EOPNOTSUPP; - - /* - * Make sure the dir is empty, as otherwise we'd need to - * rehash everything and update the dirent keys. - */ - ret = bch2_empty_dir_trans(trans, inum); - if (ret < 0) - return ret; - - ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); - if (ret) - return ret; - - bch2_check_set_feature(c, BCH_FEATURE_casefolding); - - bi->bi_casefold = v + 1; - bi->bi_fields_set |= BIT(Inode_opt_casefold); - - return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); -} - -static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = {}; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; - struct bkey_s_c k; - int ret; - - do { - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL); - } while (ret == -BCH_ERR_transaction_restart_nested); - if (ret) - goto err; -retry: - bch2_trans_begin(trans); - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_intent); - ret = bkey_err(k); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(c, - "inode %llu:%u not found when deleting", - inum, snapshot); - ret = bch_err_throw(c, ENOENT_inode); - goto err; - } - - bch2_inode_unpack(k, &inode_u); - - /* Subvolume root? 
*/ - if (inode_u.bi_subvol) - bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -err: - bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - return ret ?: -BCH_ERR_transaction_restart_nested; -} - -/* - * After deleting an inode, there may be versions in older snapshots that should - * also be deleted - if they're not referenced by sibling snapshots and not open - * in other subvolumes: - */ -static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; -next_parent: - ret = lockrestart_do(trans, - bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0))); - if (ret || !k.k) - return ret; - - bool unlinked = bkey_is_unlinked_inode(k); - pos = k.k->p; - bch2_trans_iter_exit(trans, &iter); - - if (!unlinked) - return 0; - - ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos)); - if (ret) - return ret < 0 ? ret : 0; - - ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot); - if (ret) - return ret; - goto next_parent; -} - -int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) -{ - return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: - delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); -} - -static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, - bool from_deleted_inodes) -{ - struct bch_fs *c = trans->c; - struct btree_iter inode_iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - struct printbuf buf = PRINTBUF; - int ret; - - k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode); - if (fsck_err_on(from_deleted_inodes && ret, - trans, deleted_inode_missing, - "nonexistent inode %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - - ret = bch2_inode_unpack(k, &inode); - if (ret) - goto out; - - if (S_ISDIR(inode.bi_mode)) { - ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); - if (fsck_err_on(from_deleted_inodes && - bch2_err_matches(ret, ENOTEMPTY), - trans, deleted_inode_is_dir, - "non empty directory %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - } - - ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked); - if (fsck_err_on(from_deleted_inodes && ret, - trans, deleted_inode_not_unlinked, - "non-deleted inode %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - - ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot) - ? 
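/*
 * Every invariant check in may_delete_deleted_inode() follows the same
 * pattern: compute the failure as an error code, then decide how severe it
 * is based on the caller.  Schematically:
 *
 *	ret = invariant_holds ? 0 : error_code;
 *	if (fsck_err_on(from_deleted_inodes && ret, ...))
 *		goto delete;	// scan found a bogus entry: repair by
 *				// dropping it from the deleted_inodes btree
 *	if (ret)
 *		goto out;	// normal caller: propagate the hard error
 *
 * So the same check is a repairable inconsistency when reached from the
 * deleted_inodes scan, and a fatal error otherwise.
 */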
0 : bch_err_throw(c, inode_has_child_snapshot);
-
-	if (fsck_err_on(from_deleted_inodes && ret,
-			trans, deleted_inode_has_child_snapshots,
-			"inode with child snapshots %llu:%u in deleted_inodes btree",
-			pos.offset, pos.snapshot))
-		goto delete;
-	if (ret)
-		goto out;
-
-	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
-	if (ret < 0)
-		goto out;
-
-	if (ret) {
-		if (fsck_err(trans, inode_has_child_snapshots_wrong,
-			     "inode has_child_snapshots flag wrong (should be set)\n%s",
-			     (printbuf_reset(&buf),
-			      bch2_inode_unpacked_to_text(&buf, &inode),
-			      buf.buf))) {
-			inode.bi_flags |= BCH_INODE_has_child_snapshot;
-			ret = __bch2_fsck_write_inode(trans, &inode);
-			if (ret)
-				goto out;
-		}
-
-		if (!from_deleted_inodes) {
-			ret =   bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-				bch_err_throw(c, inode_has_child_snapshot);
-			goto out;
-		}
-
-		goto delete;
-
-	}
-
-	if (from_deleted_inodes) {
-		if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
-		    !fsck_err(trans, deleted_inode_but_clean,
-			      "filesystem marked as clean but have deleted inode %llu:%u",
-			      pos.offset, pos.snapshot)) {
-			ret = 0;
-			goto out;
-		}
-
-		ret = 1;
-	}
-out:
-fsck_err:
-	bch2_trans_iter_exit(trans, &inode_iter);
-	printbuf_exit(&buf);
-	return ret;
-delete:
-	ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
-	goto out;
-}
-
-static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum)
-{
-	u32 snapshot;
-
-	return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
-		may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false);
-}
-
-int bch2_delete_dead_inodes(struct bch_fs *c)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	int ret;
-
-	/*
-	 * if we ran check_inodes() unlinked inodes will have already been
-	 * cleaned up but the write buffer will be out of sync; therefore we
-	 * always need a write buffer flush
-	 */
-	ret = bch2_btree_write_buffer_flush_sync(trans);
-	if (ret)
-		goto err;
-
-	/*
-	 * Weird transaction restart handling here because on successful delete,
-	 * bch2_inode_rm_snapshot() will return a nested transaction restart,
-	 * but we can't retry because the btree write buffer won't have been
-	 * flushed and we'd spin:
-	 */
-	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
-					BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-					NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-		ret = may_delete_deleted_inode(trans, k.k->p, true);
-		if (ret > 0) {
-			bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
-						k.k->p.offset, k.k->p.snapshot);
-
-			ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
-			/*
-			 * We don't want to loop here: a transaction restart
-			 * error here means we handled a transaction restart and
-			 * we're actually done, but if we loop we'll retry the
-			 * same key because the write buffer hasn't been flushed
-			 * yet
-			 */
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-				ret = 0;
-				continue;
-			}
-		}
-
-		ret;
-	}));
-err:
-	bch2_trans_put(trans);
-	bch_err_fn(c, ret);
-	return ret;
-}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
deleted file mode 100644
index b8ec3e628d9055..00000000000000
--- a/fs/bcachefs/inode.h
+++ /dev/null
@@ -1,319 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_H
-#define _BCACHEFS_INODE_H
-
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "opts.h"
-#include "snapshot.h"
-
-extern const char * const bch2_inode_opts[];
-
-int bch2_inode_validate(struct bch_fs *, struct bkey_s_c,
-			struct 
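/*
 * The transaction restart handling in bch2_delete_dead_inodes() above is
 * intentionally asymmetric: a restart returned by bch2_inode_rm_snapshot()
 * after a successful delete is swallowed rather than retried, because the
 * write buffer still holds the stale deleted_inodes entry and a retry would
 * spin on the same key.  Reduced to a sketch:
 *
 *	ret = bch2_inode_rm_snapshot(trans, inum, snapshot);
 *	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 *		ret = 0;	// delete committed; just move to the next key
 */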
bkey_validate_context); -int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); - -static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) -{ - return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0 - ? __bch2_inode_has_child_snapshots(trans, pos) - : 0; -} - -int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_inode ((struct bkey_ops) { \ - .key_validate = bch2_inode_validate, \ - .val_to_text = bch2_inode_to_text, \ - .trigger = bch2_trigger_inode, \ - .min_val_size = 16, \ -}) - -#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ - .key_validate = bch2_inode_v2_validate, \ - .val_to_text = bch2_inode_to_text, \ - .trigger = bch2_trigger_inode, \ - .min_val_size = 32, \ -}) - -#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ - .key_validate = bch2_inode_v3_validate, \ - .val_to_text = bch2_inode_to_text, \ - .trigger = bch2_trigger_inode, \ - .min_val_size = 48, \ -}) - -static inline bool bkey_is_inode(const struct bkey *k) -{ - return k->type == KEY_TYPE_inode || - k->type == KEY_TYPE_inode_v2 || - k->type == KEY_TYPE_inode_v3; -} - -int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ - .key_validate = bch2_inode_generation_validate, \ - .val_to_text = bch2_inode_generation_to_text, \ - .min_val_size = 8, \ -}) - -int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ - .key_validate = bch2_inode_alloc_cursor_validate, \ - .val_to_text = bch2_inode_alloc_cursor_to_text, \ - .min_val_size = 16, \ -}) - -#if 0 -typedef struct { - u64 lo; - u32 hi; -} __packed __aligned(4) u96; -#endif -typedef u64 u96; - -struct bch_inode_unpacked { - u64 bi_inum; - u32 bi_snapshot; - u64 bi_journal_seq; - __le64 bi_hash_seed; - u64 bi_size; - u64 bi_sectors; - u64 bi_version; - u32 bi_flags; - u16 bi_mode; - -#define x(_name, _bits) u##_bits _name; - BCH_INODE_FIELDS_v3() -#undef x -}; -BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24); - -struct bkey_inode_buf { - struct bkey_i_inode_v3 inode; - -#define x(_name, _bits) + 8 + _bits / 8 - u8 _pad[0 + BCH_INODE_FIELDS_v3()]; -#undef x -}; - -void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); -int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); -struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); - -void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); - -int __bch2_inode_peek(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, subvol_inum, unsigned, bool); - -static inline int bch2_inode_peek_nowarn(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags) -{ - return __bch2_inode_peek(trans, iter, 
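/*
 * The bch2_bkey_ops_inode* initializers above are entries in a per-key-type
 * dispatch table: each on-disk key type supplies its own validate/to_text/
 * trigger callbacks plus a minimum value size that generic code can check
 * before calling in.  A simplified model of the pattern (not the real
 * struct bkey_ops layout):
 *
 *	struct demo_key_ops {
 *		int	(*validate)(const void *val, unsigned bytes);
 *		void	(*to_text)(struct printbuf *out, const void *val);
 *		unsigned min_val_size;
 *	};
 *
 *	static int demo_validate(const struct demo_key_ops *tbl,
 *				 unsigned type, const void *val, unsigned bytes)
 *	{
 *		const struct demo_key_ops *ops = &tbl[type];
 *
 *		if (bytes < ops->min_val_size)
 *			return -1;
 *		return ops->validate ? ops->validate(val, bytes) : 0;
 *	}
 *
 * Note how the three inode versions share to_text and trigger but differ in
 * validate and min_val_size (16/32/48 bytes).
 */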
inode, inum, flags, false); -} - -static inline int bch2_inode_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags) -{ - return __bch2_inode_peek(trans, iter, inode, inum, flags, true); -} - -int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32, - struct bch_inode_unpacked *, unsigned); -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, - subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, - struct bch_inode_unpacked *); - -int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, - struct bch_inode_unpacked *root); - -int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); - -static inline int bch2_inode_write(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode) -{ - return bch2_inode_write_flags(trans, iter, inode, 0); -} - -int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); -int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); - -void bch2_inode_init_early(struct bch_fs *, - struct bch_inode_unpacked *); -void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64, - uid_t, gid_t, umode_t, dev_t, - struct bch_inode_unpacked *); -void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, - uid_t, gid_t, umode_t, dev_t, - struct bch_inode_unpacked *); - -int bch2_inode_create(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, u32, u64); - -int bch2_inode_rm(struct bch_fs *, subvol_inum); - -#define inode_opt_get(_c, _inode, _name) \ - ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) - -static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, - enum inode_opt_id id, u64 v) -{ - switch (id) { -#define x(_name, ...) \ - case Inode_opt_##_name: \ - inode->bi_##_name = v; \ - break; - BCH_INODE_OPTS() -#undef x - default: - BUG(); - } -} - -static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, - enum inode_opt_id id) -{ - switch (id) { -#define x(_name, ...) \ - case Inode_opt_##_name: \ - return inode->bi_##_name; - BCH_INODE_OPTS() -#undef x - default: - BUG(); - } -} - -static inline u8 mode_to_type(umode_t mode) -{ - return (mode >> 12) & 15; -} - -static inline u8 inode_d_type(struct bch_inode_unpacked *inode) -{ - return inode->bi_subvol ? 
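/*
 * bch2_inode_opt_set()/get() above are generated with the x-macro idiom:
 * the single BCH_INODE_OPTS() list expands once into the enum and again
 * into each switch, so adding an option in one place updates all of them
 * consistently.  A tiny self-contained illustration:
 *
 *	#define DEMO_OPTS()	x(alpha) x(beta)
 *
 *	enum demo_opt {
 *	#define x(n)	demo_opt_##n,
 *		DEMO_OPTS()
 *	#undef x
 *		demo_opt_nr,
 *	};
 *
 *	static const char *demo_opt_name(enum demo_opt id)
 *	{
 *		switch (id) {
 *	#define x(n)	case demo_opt_##n: return #n;
 *		DEMO_OPTS()
 *	#undef x
 *		default: return "unknown";
 *		}
 *	}
 */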
DT_SUBVOL : mode_to_type(inode->bi_mode); -} - -static inline u32 bch2_inode_flags(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); - case KEY_TYPE_inode_v2: - return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); - case KEY_TYPE_inode_v3: - return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); - default: - return 0; - } -} - -static inline unsigned bkey_inode_mode(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode); - case KEY_TYPE_inode_v2: - return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode); - case KEY_TYPE_inode_v3: - return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v); - default: - return 0; - } -} - -static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) -{ - /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */ - return bi->bi_casefold - ? bi->bi_casefold - 1 - : c->opts.casefold; -} - -static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi) -{ - return bi->bi_dir || bi->bi_dir_offset; -} - -/* i_nlink: */ - -static inline unsigned nlink_bias(umode_t mode) -{ - return S_ISDIR(mode) ? 2 : 1; -} - -static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) -{ - return bi->bi_flags & BCH_INODE_unlinked - ? 0 - : bi->bi_nlink + nlink_bias(bi->bi_mode); -} - -static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, - unsigned nlink) -{ - if (nlink) { - bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); - bi->bi_flags &= ~BCH_INODE_unlinked; - } else { - bi->bi_nlink = 0; - bi->bi_flags |= BCH_INODE_unlinked; - } -} - -int bch2_inode_nlink_inc(struct bch_inode_unpacked *); -void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); - -struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); -void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, - struct bch_inode_unpacked *); -int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *); -int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, unsigned); - -#include "rebalance.h" - -static inline struct bch_extent_rebalance -bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) -{ - struct bch_io_opts io_opts; - bch2_inode_opts_get(&io_opts, c, inode); - return io_opts_to_rebalance_opts(c, &io_opts); -} - -#define BCACHEFS_ROOT_SUBVOL_INUM \ - ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) - -static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b) -{ - return a.subvol == b.subvol && a.inum == b.inum; -} - -int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); -int bch2_delete_dead_inodes(struct bch_fs *); - -#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h deleted file mode 100644 index 1f00938b1bdc8a..00000000000000 --- a/fs/bcachefs/inode_format.h +++ /dev/null @@ -1,185 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_INODE_FORMAT_H -#define _BCACHEFS_INODE_FORMAT_H - -#define BLOCKDEV_INODE_MAX 4096 -#define BCACHEFS_ROOT_INO 4096 - -struct bch_inode { - struct bch_val v; - - __le64 bi_hash_seed; - __le32 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v2 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct 
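/*
 * The i_nlink helpers above store the link count relative to the type's
 * natural minimum: directories are biased by 2 ("." and ".."), everything
 * else by 1, and a count of zero is represented by the BCH_INODE_unlinked
 * flag instead.  A worked round-trip, using the helpers as declared:
 *
 *	struct bch_inode_unpacked bi = { .bi_mode = S_IFDIR | 0755 };
 *
 *	bch2_inode_nlink_set(&bi, 2);	// empty dir: bi_nlink stored as 0
 *	bch2_inode_nlink_get(&bi);	// == 2, bias added back
 *	bch2_inode_nlink_set(&bi, 0);	// bi_nlink = 0, unlinked flag set
 *	bch2_inode_nlink_get(&bi);	// == 0
 */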
bch_inode_v3 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le64 bi_sectors; - __le64 bi_size; - __le64 bi_version; - __u8 fields[]; -} __packed __aligned(8); - -#define INODEv3_FIELDS_START_INITIAL 6 -#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) - -struct bch_inode_generation { - struct bch_val v; - - __le32 bi_generation; - __le32 pad; -} __packed __aligned(8); - -/* - * bi_subvol and bi_parent_subvol are only set for subvolume roots: - */ - -#define BCH_INODE_FIELDS_v2() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_size, 64) \ - x(bi_sectors, 64) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) - -#define BCH_INODE_FIELDS_v3() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) \ - x(bi_depth, 32) \ - x(bi_inodes_32bit, 8) \ - x(bi_casefold, 8) - -/* subset of BCH_INODE_FIELDS */ -#define BCH_INODE_OPTS() \ - x(data_checksum, 8) \ - x(compression, 8) \ - x(project, 32) \ - x(background_compression, 8) \ - x(data_replicas, 8) \ - x(promote_target, 16) \ - x(foreground_target, 16) \ - x(background_target, 16) \ - x(erasure_code, 16) \ - x(nocow, 8) \ - x(inodes_32bit, 8) \ - x(casefold, 8) - -enum inode_opt_id { -#define x(name, ...) 
\
-	Inode_opt_##name,
-	BCH_INODE_OPTS()
-#undef x
-	Inode_opt_nr,
-};
-
-/*
- * BCH_INODE_has_case_insensitive is set if any descendant is case insensitive -
- * for overlayfs
- */
-#define BCH_INODE_FLAGS()		\
-	x(sync,				0)	\
-	x(immutable,			1)	\
-	x(append,			2)	\
-	x(nodump,			3)	\
-	x(noatime,			4)	\
-	x(i_size_dirty,			5)	\
-	x(i_sectors_dirty,		6)	\
-	x(unlinked,			7)	\
-	x(backptr_untrusted,		8)	\
-	x(has_child_snapshot,		9)	\
-	x(has_case_insensitive,		10)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n)	BCH_INODE_##t = 1U << n,
-	BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n)	__BCH_INODE_##t = n,
-	BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODEv1_STR_HASH,	struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODEv1_NR_FIELDS,	struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH,	struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS,	struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH,	struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS,	struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
-				struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE,	struct bch_inode_v3, bi_flags, 36, 52);
-
-struct bch_inode_alloc_cursor {
-	struct bch_val	v;
-	__u8		bits;
-	__u8		pad;
-	__le32		gen;
-	__le64		idx;
-};
-
-#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
deleted file mode 100644
index 07023667a475f6..00000000000000
--- a/fs/bcachefs/io_misc.c
+++ /dev/null
@@ -1,570 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * io_misc.c - fallocate, fpunch, truncate:
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "logged_ops.h"
-#include "rebalance.h"
-#include "subvolume.h"
-
-/* Overwrites whatever was present with zeroes: */
-int bch2_extent_fallocate(struct btree_trans *trans,
-			  subvol_inum inum,
-			  struct btree_iter *iter,
-			  u64 sectors,
-			  struct bch_io_opts opts,
-			  s64 *i_sectors_delta,
-			  struct write_point_specifier write_point)
-{
-	struct bch_fs *c = trans->c;
-	struct disk_reservation disk_res = { 0 };
-	struct closure cl;
-	struct open_buckets open_buckets = { 0 };
-	struct bkey_s_c k;
-	struct bkey_buf old, new;
-	unsigned sectors_allocated = 0, new_replicas;
-	bool unwritten = opts.nocow &&
-	    c->sb.version >= bcachefs_metadata_version_unwritten_extents;
-	int ret;
-
-	bch2_bkey_buf_init(&old);
-	bch2_bkey_buf_init(&new);
-	closure_init_stack(&cl);
-
-	k = bch2_btree_iter_peek_slot(trans, iter);
-	ret = bkey_err(k);
-	if (ret)
-		return ret;
-
-	sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
-	new_replicas = max(0, (int) opts.data_replicas -
-			   (int) bch2_bkey_nr_ptrs_fully_allocated(k));
-
-	/*
-	 * Get a disk reservation before (in the nocow case) calling
-	 * into the allocator:
-	 */
-	ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
-	if (unlikely(ret))
-		goto err_noprint;
-
-	bch2_bkey_buf_reassemble(&old, c, k);
-
-	if (!unwritten) {
-		struct bkey_i_reservation *reservation;
-
-		bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
-		reservation = bkey_reservation_init(new.k);
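/*
 * Two strategies from here, depending on the "unwritten" flag computed
 * above: this branch (no unwritten-extent support, or !nocow) just inserts
 * a KEY_TYPE_reservation - space is accounted but nothing is allocated
 * until real data is written.  The else branch below allocates buckets
 * immediately and marks every pointer unwritten, so reads of the range
 * return zeroes until it is actually written.  Schematically:
 *
 *	if (!unwritten)
 *		insert reservation key covering [pos, pos + sectors)
 *	else
 *		allocate sectors, build an extent, set ptr->unwritten on
 *		each pointer
 */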
-		reservation->k.p = iter->pos;
-		bch2_key_resize(&reservation->k, sectors);
-		reservation->v.nr_replicas = opts.data_replicas;
-	} else {
-		struct bkey_i_extent *e;
-		struct bch_devs_list devs_have;
-		struct write_point *wp;
-
-		devs_have.nr = 0;
-
-		bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
-
-		e = bkey_extent_init(new.k);
-		e->k.p = iter->pos;
-
-		ret = bch2_alloc_sectors_start_trans(trans,
-				opts.foreground_target,
-				false,
-				write_point,
-				&devs_have,
-				opts.data_replicas,
-				opts.data_replicas,
-				BCH_WATERMARK_normal, 0, &cl, &wp);
-		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
-			ret = bch_err_throw(c, transaction_restart_nested);
-		if (ret)
-			goto err;
-
-		sectors = min_t(u64, sectors, wp->sectors_free);
-		sectors_allocated = sectors;
-
-		bch2_key_resize(&e->k, sectors);
-
-		bch2_open_bucket_get(c, wp, &open_buckets);
-		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
-		bch2_alloc_sectors_done(c, wp);
-
-		extent_for_each_ptr(extent_i_to_s(e), ptr)
-			ptr->unwritten = true;
-	}
-
-	ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
-				 0, i_sectors_delta, true);
-err:
-	if (!ret && sectors_allocated)
-		bch2_increment_clock(c, sectors_allocated, WRITE);
-	if (should_print_err(ret)) {
-		struct printbuf buf = PRINTBUF;
-		lockrestart_do(trans,
-			bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9));
-		prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
-		bch_err_ratelimited(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-	}
-err_noprint:
-	bch2_open_buckets_put(c, &open_buckets);
-	bch2_disk_reservation_put(c, &disk_res);
-	bch2_bkey_buf_exit(&new, c);
-	bch2_bkey_buf_exit(&old, c);
-
-	if (closure_nr_remaining(&cl) != 1) {
-		bch2_trans_unlock_long(trans);
-		bch2_wait_on_allocator(c, &cl);
-	}
-
-	return ret;
-}
-
-/* For fsck */
-int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end)
-{
-	u32 restart_count = trans->restart_count;
-	struct bch_fs *c = trans->c;
-	struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0);
-	unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
-	struct bkey_i delete;
-
-	int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
-			start, end, 0, k,
-			&disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-		bkey_init(&delete.k);
-		delete.k.p = iter.pos;
-
-		/* create the biggest key we can */
-		bch2_key_resize(&delete.k, max_sectors);
-		bch2_cut_back(end, &delete);
-
-		bch2_extent_trim_atomic(trans, &iter, &delete) ?:
-		bch2_trans_update(trans, &iter, &delete, 0);
-	}));
-
-	bch2_disk_reservation_put(c, &disk_res);
-	return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-/*
- * Returns -BCH_ERR_transaction_restart if we had to drop locks:
- */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
-		   subvol_inum inum, u64 end,
-		   s64 *i_sectors_delta)
-{
-	struct bch_fs *c = trans->c;
-	unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
-	struct bpos end_pos = POS(inum.inum, end);
-	struct bkey_s_c k;
-	int ret = 0, ret2 = 0;
-	u32 snapshot;
-
-	while (!ret ||
-	       bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-		struct disk_reservation disk_res =
-			bch2_disk_reservation_init(c, 0);
-		struct bkey_i delete;
-
-		if (ret)
-			ret2 = ret;
-
-		bch2_trans_begin(trans);
-
-		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-		if (ret)
-			continue;
-
-		bch2_btree_iter_set_snapshot(trans, iter, snapshot);
-
-		/*
-		 * peek_max() doesn't have ideal semantics for extents:
-		 */
-		k = 
bch2_btree_iter_peek_max(trans, iter, end_pos); - if (!k.k) - break; - - ret = bkey_err(k); - if (ret) - continue; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end_pos, &delete); - - ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, 0, i_sectors_delta, false); - bch2_disk_reservation_put(c, &disk_res); - } - - return ret ?: ret2; -} - -int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, - s64 *i_sectors_delta) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, start), - BTREE_ITER_intent); - - ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - - return ret; -} - -/* truncate: */ - -void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k); - - prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); - prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); - prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size)); -} - -static int truncate_set_isize(struct btree_trans *trans, - subvol_inum inum, - u64 new_i_size, - bool warn) -{ - struct btree_iter iter = {}; - struct bch_inode_unpacked inode_u; - int ret; - - ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?: - (inode_u.bi_size = new_i_size, 0) ?: - bch2_inode_write(trans, &iter, &inode_u); - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, - struct bkey_i *op_k, - u64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter fpunch_iter; - struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k); - subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; - u64 new_i_size = le64_to_cpu(op->v.new_i_size); - bool warn_errors = i_sectors_delta != NULL; - int ret; - - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL)); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, - POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), - BTREE_ITER_intent); - ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); - bch2_trans_iter_exit(trans, &fpunch_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; -err: - if (warn_errors) - bch_err_fn(c, ret); - return ret; -} - -int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k) -{ - return __bch2_resume_logged_op_truncate(trans, op_k, NULL); -} - -int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta) -{ - struct bkey_i_logged_op_truncate op; - - bkey_logged_op_truncate_init(&op.k_i); - op.v.subvol = cpu_to_le32(inum.subvol); - op.v.inum = cpu_to_le64(inum.inum); - op.v.new_i_size = cpu_to_le64(new_i_size); - - /* - * Logged ops aren't atomic w.r.t. 
snapshot creation: creating a - * snapshot while they're in progress, then crashing, will result in the - * resume only proceeding in one of the snapshots - */ - down_read(&c->snapshot_create_lock); - struct btree_trans *trans = bch2_trans_get(c); - int ret = bch2_logged_op_start(trans, &op.k_i); - if (ret) - goto out; - ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta); - ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; -out: - bch2_trans_put(trans); - up_read(&c->snapshot_create_lock); - - return ret; -} - -/* finsert/fcollapse: */ - -void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k); - - prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); - prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); - prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset)); - prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset)); -} - -static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, - u64 offset, s64 len, bool warn) -{ - struct btree_iter iter; - struct bch_inode_unpacked inode_u; - int ret; - - offset <<= 9; - len <<= 9; - - ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn); - if (ret) - return ret; - - if (len > 0) { - if (MAX_LFS_FILESIZE - inode_u.bi_size < len) { - ret = -EFBIG; - goto err; - } - - if (offset >= inode_u.bi_size) { - ret = -EINVAL; - goto err; - } - } - - inode_u.bi_size += len; - inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c); - - ret = bch2_inode_write(trans, &iter, &inode_u); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, - struct bkey_i *op_k, - u64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); - subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; - struct bch_io_opts opts; - u64 dst_offset = le64_to_cpu(op->v.dst_offset); - u64 src_offset = le64_to_cpu(op->v.src_offset); - s64 shift = dst_offset - src_offset; - u64 len = abs(shift); - u64 pos = le64_to_cpu(op->v.pos); - bool insert = shift > 0; - u32 snapshot; - bool warn_errors = i_sectors_delta != NULL; - int ret = 0; - - ret = bch2_inum_opts_get(trans, inum, &opts); - if (ret) - return ret; - - /* - * check for missing subvolume before fpunch, as in resume we don't want - * it to be a fatal error - */ - ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors)); - if (ret) - return ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, 0), - BTREE_ITER_intent); - - switch (op->v.state) { -case LOGGED_OP_FINSERT_start: - op->v.state = LOGGED_OP_FINSERT_shift_extents; - - if (insert) { - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, src_offset, len, warn_errors) ?: - bch2_logged_op_update(trans, &op->k_i)); - if (ret) - goto err; - } else { - bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset)); - - ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_logged_op_update(trans, &op->k_i)); - } - - fallthrough; -case LOGGED_OP_FINSERT_shift_extents: - while (1) { - struct 
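/*
 * Crash safety of the state machine above comes from the logged-op
 * protocol: the op key is recorded before any work starts, op->v.state and
 * op->v.pos are updated in the same transaction as each batch of work, and
 * the key is removed when the op finishes - so after a crash, recovery
 * re-enters __bch2_resume_logged_op_finsert() at the recorded state and
 * position.  Lifecycle, roughly:
 *
 *	bch2_logged_op_start(trans, &op.k_i);
 *	...work, with bch2_logged_op_update() at every durable step...
 *	bch2_logged_op_finish(trans, &op.k_i);
 */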
disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete, *copy; - struct bkey_s_c k; - struct bpos src_pos = POS(inum.inum, src_offset); - - bch2_trans_begin(trans); - - ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, - warn_errors); - if (ret) - goto btree_err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot)); - - k = insert - ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0)) - : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX)); - if ((ret = bkey_err(k))) - goto btree_err; - - if (!k.k || - k.k->p.inode != inum.inum || - bkey_le(k.k->p, POS(inum.inum, src_offset))) - break; - - copy = bch2_bkey_make_mut_noupdate(trans, k); - if ((ret = PTR_ERR_OR_ZERO(copy))) - goto btree_err; - - if (insert && - bkey_lt(bkey_start_pos(k.k), src_pos)) { - bch2_cut_front(src_pos, copy); - - /* Splitting compressed extent? */ - bch2_disk_reservation_add(c, &disk_res, - copy->k.size * - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)), - BCH_DISK_RESERVATION_NOFAIL); - } - - bkey_init(&delete.k); - delete.k.p = copy->k.p; - delete.k.p.snapshot = snapshot; - delete.k.size = copy->k.size; - - copy->k.p.offset += shift; - copy->k.p.snapshot = snapshot; - - op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - - ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: - bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: - bch2_logged_op_update(trans, &op->k_i) ?: - bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc); -btree_err: - bch2_disk_reservation_put(c, &disk_res); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - - pos = le64_to_cpu(op->v.pos); - } - - op->v.state = LOGGED_OP_FINSERT_finish; - - if (!insert) { - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?: - bch2_logged_op_update(trans, &op->k_i)); - } else { - /* We need an inode update to update bi_journal_seq for fsync: */ - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, 0, 0, warn_errors) ?: - bch2_logged_op_update(trans, &op->k_i)); - } - - break; -case LOGGED_OP_FINSERT_finish: - break; - } -err: - bch2_trans_iter_exit(trans, &iter); - if (warn_errors) - bch_err_fn(c, ret); - return ret; -} - -int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k) -{ - return __bch2_resume_logged_op_finsert(trans, op_k, NULL); -} - -int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, - u64 offset, u64 len, bool insert, - s64 *i_sectors_delta) -{ - struct bkey_i_logged_op_finsert op; - s64 shift = insert ? len : -len; - - bkey_logged_op_finsert_init(&op.k_i); - op.v.subvol = cpu_to_le32(inum.subvol); - op.v.inum = cpu_to_le64(inum.inum); - op.v.dst_offset = cpu_to_le64(offset + shift); - op.v.src_offset = cpu_to_le64(offset); - op.v.pos = cpu_to_le64(insert ? U64_MAX : offset); - - /* - * Logged ops aren't atomic w.r.t. 
snapshot creation: creating a - * snapshot while they're in progress, then crashing, will result in the - * resume only proceeding in one of the snapshots - */ - down_read(&c->snapshot_create_lock); - struct btree_trans *trans = bch2_trans_get(c); - int ret = bch2_logged_op_start(trans, &op.k_i); - if (ret) - goto out; - ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta); - ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; -out: - bch2_trans_put(trans); - up_read(&c->snapshot_create_lock); - - return ret; -} diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h deleted file mode 100644 index b93e4d4b3c0c50..00000000000000 --- a/fs/bcachefs/io_misc.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_MISC_H -#define _BCACHEFS_IO_MISC_H - -int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, - u64, struct bch_io_opts, s64 *, - struct write_point_specifier); - -int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); -int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - subvol_inum, u64, s64 *); -int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); - -void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \ - .val_to_text = bch2_logged_op_truncate_to_text, \ - .min_val_size = 24, \ -}) - -int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *); - -int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *); - -void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \ - .val_to_text = bch2_logged_op_finsert_to_text, \ - .min_val_size = 24, \ -}) - -int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *); - -int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *); - -#endif /* _BCACHEFS_IO_MISC_H */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c deleted file mode 100644 index 460e2e6341f1ce..00000000000000 --- a/fs/bcachefs/io_read.c +++ /dev/null @@ -1,1543 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Some low level IO code, and hacks for various block layer limitations - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "btree_update.h" -#include "buckets.h" -#include "checksum.h" -#include "clock.h" -#include "compress.h" -#include "data_update.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "io_read.h" -#include "io_misc.h" -#include "io_write.h" -#include "reflink.h" -#include "subvolume.h" -#include "trace.h" - -#include -#include -#include - -#ifdef CONFIG_BCACHEFS_DEBUG -static unsigned bch2_read_corrupt_ratio; -module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); -MODULE_PARM_DESC(read_corrupt_ratio, ""); -#endif - -static bool bch2_poison_extents_on_checksum_error; -module_param_named(poison_extents_on_checksum_error, - bch2_poison_extents_on_checksum_error, bool, 0644); -MODULE_PARM_DESC(poison_extents_on_checksum_error, - "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - const struct bch_devs_mask *devs; - unsigned d, nr = 0, total = 0; - u64 now = local_clock(), last; - s64 congested; - struct bch_dev *ca; - - if (!target) - return false; - - guard(rcu)(); - devs = bch2_target_to_mask(c, target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { - ca = rcu_dereference(c->devs[d]); - if (!ca) - continue; - - congested = atomic_read(&ca->congested); - last = READ_ONCE(ca->congested_last); - if (time_after64(now, last)) - congested -= (now - last) >> 12; - - total += max(congested, 0LL); - nr++; - } - - return get_random_u32_below(nr * CONGESTED_MAX) < total; -} - -#else - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - return false; -} - -#endif - -/* Cache promotion on read */ - -static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), - .automatic_shrinking = true, -}; - -static inline bool have_io_error(struct bch_io_failures *failed) -{ - return failed && failed->nr; -} - -static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) -{ - EBUG_ON(rbio->split); - - return rbio->data_update - ? 
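/*
 * bch2_target_congested() above turns per-device congestion into a
 * probabilistic answer: each device's score decays with time since it was
 * last updated (the ">> 12" ages it), negative scores are clamped to zero,
 * the scores are summed, and a uniform random draw decides the result - so
 * the probability of reporting "congested" is total / (nr_devices *
 * CONGESTED_MAX).  Standalone sketch of the same idea (names hypothetical):
 *
 *	static bool demo_congested(const s64 *score, unsigned nr, u32 max)
 *	{
 *		u64 total = 0;
 *
 *		for (unsigned i = 0; i < nr; i++)
 *			total += max_t(s64, score[i], 0);
 *
 *		return get_random_u32_below(nr * max) < total;
 *	}
 */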
container_of(rbio, struct data_update, rbio) - : NULL; -} - -static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) -{ - struct data_update *u = rbio_data_update(orig); - if (!u) - return false; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); - unsigned i = 0; - bkey_for_each_ptr(ptrs, ptr) { - if (ptr->dev == dev && - u->data_opts.rewrite_ptrs & BIT(i)) - return true; - i++; - } - - return false; -} - -static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, - struct bpos pos, - struct bch_io_opts opts, - unsigned flags, - struct bch_io_failures *failed) -{ - if (!have_io_error(failed)) { - BUG_ON(!opts.promote_target); - - if (!(flags & BCH_READ_may_promote)) - return bch_err_throw(c, nopromote_may_not); - - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return bch_err_throw(c, nopromote_already_promoted); - - if (bkey_extent_is_unwritten(k)) - return bch_err_throw(c, nopromote_unwritten); - - if (bch2_target_congested(c, opts.promote_target)) - return bch_err_throw(c, nopromote_congested); - } - - if (rhashtable_lookup_fast(&c->promote_table, &pos, - bch_promote_params)) - return bch_err_throw(c, nopromote_in_flight); - - return 0; -} - -static noinline void promote_free(struct bch_read_bio *rbio) -{ - struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - struct bch_fs *c = rbio->c; - - int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - - async_object_list_del(c, promote, op->list_idx); - async_object_list_del(c, rbio, rbio->list_idx); - - bch2_data_update_exit(&op->write); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); - kfree_rcu(op, rcu); -} - -static void promote_done(struct bch_write_op *wop) -{ - struct promote_op *op = container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.rbio.c; - - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); - promote_free(&op->write.rbio); -} - -static void promote_start_work(struct work_struct *work) -{ - struct promote_op *op = container_of(work, struct promote_op, work); - - bch2_data_update_read_done(&op->write); -} - -static noinline void promote_start(struct bch_read_bio *rbio) -{ - struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - - trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); - - INIT_WORK(&op->work, promote_start_work); - queue_work(rbio->c->write_ref_wq, &op->work); -} - -static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - unsigned sectors, - struct bch_read_bio *orig, - struct bch_io_failures *failed) -{ - struct bch_fs *c = trans->c; - int ret; - - struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; - - if (!have_io_error(failed)) { - update_opts.target = orig->opts.promote_target; - update_opts.extra_replicas = 1; - update_opts.write_flags |= BCH_WRITE_cached; - update_opts.write_flags |= BCH_WRITE_only_specified_devs; - } else { - update_opts.target = orig->opts.foreground_target; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - bkey_for_each_ptr(ptrs, ptr) { - if (bch2_dev_io_failures(failed, ptr->dev) && - !ptr_being_rewritten(orig, ptr->dev)) - update_opts.rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - if (!update_opts.rewrite_ptrs) - return NULL; - } - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote)) - 
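/*
 * Note the two modes of __promote_alloc() above: a plain promote writes a
 * cached copy to the promote target, while the have_io_error() path reuses
 * the same machinery for self-healing - it builds data_update_opts with a
 * rewrite_ptrs bitmask, one bit per pointer in the order bkey_for_each_ptr()
 * visits them, marking exactly the pointers whose devices reported failures
 * (and that aren't already being rewritten by the parent data update).
 */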
return ERR_PTR(-BCH_ERR_nopromote_no_writes); - - struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) { - ret = bch_err_throw(c, nopromote_enomem); - goto err_put; - } - - op->start_time = local_clock(); - op->pos = pos; - - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) { - ret = bch_err_throw(c, nopromote_in_flight); - goto err; - } - - ret = async_object_list_add(c, promote, op, &op->list_idx); - if (ret < 0) - goto err_remove_hash; - - ret = bch2_data_update_init(trans, NULL, NULL, &op->write, - writepoint_hashed((unsigned long) current), - &orig->opts, - update_opts, - btree_id, k); - op->write.type = BCH_DATA_UPDATE_promote; - /* - * possible errors: -BCH_ERR_nocow_lock_blocked, - * -BCH_ERR_ENOSPC_disk_reservation: - */ - if (ret) - goto err_remove_list; - - rbio_init_fragment(&op->write.rbio.bio, orig); - op->write.rbio.bounce = true; - op->write.rbio.promote = true; - op->write.op.end_io = promote_done; - - return &op->write.rbio; -err_remove_list: - async_object_list_del(c, promote, op->list_idx); -err_remove_hash: - BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params)); -err: - bio_free_pages(&op->write.op.wbio.bio); - /* We may have added to the rhashtable and thus need rcu freeing: */ - kfree_rcu(op, rcu); -err_put: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); - return ERR_PTR(ret); -} - -noinline -static struct bch_read_bio *promote_alloc(struct btree_trans *trans, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - unsigned flags, - struct bch_read_bio *orig, - bool *bounce, - bool *read_full, - struct bch_io_failures *failed) -{ - /* - * We're in the retry path, but we don't know what to repair yet, and we - * don't want to do a promote here: - */ - if (failed && !failed->nr) - return NULL; - - struct bch_fs *c = trans->c; - /* - * if failed != NULL we're not actually doing a promote, we're - * recovering from an io/checksum error - */ - bool promote_full = (have_io_error(failed) || - *read_full || - READ_ONCE(c->opts.promote_whole_extents)); - /* data might have to be decompressed in the write path: */ - unsigned sectors = promote_full - ? max(pick->crc.compressed_size, pick->crc.live_size) - : bvec_iter_sectors(iter); - struct bpos pos = promote_full - ? bkey_start_pos(k.k) - : POS(k.k->p.inode, iter.bi_sector); - int ret; - - ret = should_promote(c, k, pos, orig->opts, flags, failed); - if (ret) - goto nopromote; - - struct bch_read_bio *promote = - __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? 
BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, sectors, orig, failed); - if (!promote) - return NULL; - - ret = PTR_ERR_OR_ZERO(promote); - if (ret) - goto nopromote; - - *bounce = true; - *read_full = promote_full; - - if (have_io_error(failed)) - orig->self_healing = true; - - return promote; -nopromote: - trace_io_read_nopromote(c, ret); - return NULL; -} - -void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op) -{ - if (!op->write.read_done) { - prt_printf(out, "parent read: %px\n", op->write.rbio.parent); - printbuf_indent_add(out, 2); - bch2_read_bio_to_text(out, op->write.rbio.parent); - printbuf_indent_sub(out, 2); - } - - bch2_data_update_to_text(out, &op->write); -} - -/* Read */ - -static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_read_bio *rbio, struct bpos read_pos) -{ - int ret = lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { rbio->subvol, read_pos.inode }, - read_pos.offset << 9)); - if (ret) - return ret; - - if (rbio->data_update) - prt_str(out, "(internal move) "); - - return 0; -} - -static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, - struct bch_read_bio *rbio, struct bpos read_pos) -{ - bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); -} - -enum rbio_context { - RBIO_CONTEXT_NULL, - RBIO_CONTEXT_HIGHPRI, - RBIO_CONTEXT_UNBOUND, -}; - -static inline struct bch_read_bio * -bch2_rbio_parent(struct bch_read_bio *rbio) -{ - return rbio->split ? rbio->parent : rbio; -} - -__always_inline -static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, - enum rbio_context context, - struct workqueue_struct *wq) -{ - if (context <= rbio->context) { - fn(&rbio->work); - } else { - rbio->work.func = fn; - rbio->context = context; - queue_work(wq, &rbio->work); - } -} - -static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -{ - BUG_ON(rbio->bounce && !rbio->split); - - if (rbio->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); - } - - if (rbio->split) { - struct bch_read_bio *parent = rbio->parent; - - if (unlikely(rbio->promote)) { - if (!rbio->bio.bi_status) - promote_start(rbio); - else - promote_free(rbio); - } else { - async_object_list_del(rbio->c, rbio, rbio->list_idx); - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - - bio_put(&rbio->bio); - } - - rbio = parent; - } - - return rbio; -} - -/* - * Only called on a top level bch_read_bio to complete an entire read request, - * not a split: - */ -static void bch2_rbio_done(struct bch_read_bio *rbio) -{ - if (rbio->start_time) - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], - rbio->start_time); -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - if (rbio->list_idx) - async_object_list_del(rbio->c, rbio, rbio->list_idx); -#endif - bio_endio(&rbio->bio); -} - -static void get_rbio_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, - struct bkey_buf *sk) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = lockrestart_do(trans, - bkey_err(k = bch2_bkey_get_iter(trans, &iter, - rbio->data_btree, rbio->data_pos, 0))); - if (ret) - return; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) - if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { - bch2_bkey_buf_reassemble(sk, trans->c, k); - break; - } - - bch2_trans_iter_exit(trans, &iter); -} - -static noinline int 
maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, - enum btree_id btree, struct bkey_s_c read_k) -{ - if (!bch2_poison_extents_on_checksum_error) - return 0; - - struct bch_fs *c = trans->c; - - struct data_update *u = rbio_data_update(rbio); - if (u) - read_k = bkey_i_to_s_c(u->k.k); - - u64 flags = bch2_bkey_extent_flags(read_k); - if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return 0; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k), - BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_and_val_eq(k, read_k)) - goto out; - - struct bkey_i *new = bch2_trans_kmalloc(trans, - bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); - ret = PTR_ERR_OR_ZERO(new) ?: - (bkey_reassemble(new, k), 0) ?: - bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: - bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - - /* - * Propagate key change back to data update path, in particular so it - * knows the extent has been poisoned and it's safe to change the - * checksum - */ - if (u && !ret) - bch2_bkey_buf_copy(&u->k, c, new); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, - struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) -{ - struct data_update *u = container_of(rbio, struct data_update, rbio); -retry: - bch2_trans_begin(trans); - - struct btree_iter iter; - struct bkey_s_c k; - int ret = lockrestart_do(trans, - bkey_err(k = bch2_bkey_get_iter(trans, &iter, - u->btree_id, bkey_start_pos(&u->k.k->k), - 0))); - if (ret) - goto err; - - if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { - /* extent we wanted to read no longer exists: */ - rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten); - goto err; - } - - ret = __bch2_read_extent(trans, rbio, bvec_iter, - bkey_start_pos(&u->k.k->k), - u->btree_id, - bkey_i_to_s_c(u->k.k), - 0, failed, flags, -1); -err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_data_read_retry)) - goto retry; - - if (ret) { - rbio->bio.bi_status = BLK_STS_IOERR; - rbio->ret = ret; - } - - BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); - return ret; -} - -static void bch2_rbio_retry(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - subvol_inum inum = { - .subvol = rbio->subvol, - .inum = rbio->read_pos.inode, - }; - struct bch_io_failures failed = { .nr = 0 }; - - struct btree_trans *trans = bch2_trans_get(c); - - struct bkey_buf sk; - bch2_bkey_buf_init(&sk); - bkey_init(&sk.k->k); - - trace_io_read_retry(&rbio->bio); - this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], - bvec_iter_sectors(rbio->bvec_iter)); - - get_rbio_extent(trans, rbio, &sk); - - if (!bkey_deleted(&sk.k->k) && - bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) - bch2_mark_io_failure(&failed, &rbio->pick, - rbio->ret == -BCH_ERR_data_read_retry_csum_err); - - if (!rbio->split) { - rbio->bio.bi_status = 0; - rbio->ret = 0; - } - - unsigned subvol = rbio->subvol; - struct bpos read_pos = rbio->read_pos; - - rbio = bch2_rbio_free(rbio); - - flags |= BCH_READ_in_retry; - flags &= 
~BCH_READ_may_promote; - flags &= ~BCH_READ_last_fragment; - flags |= BCH_READ_must_clone; - - int ret = rbio->data_update - ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) - : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); - - if (ret) { - rbio->ret = ret; - rbio->bio.bi_status = BLK_STS_IOERR; - } - - if (failed.nr || ret) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, - (subvol_inum) { subvol, read_pos.inode }, - read_pos.offset << 9)); - if (rbio->data_update) - prt_str(&buf, "(internal move) "); - - prt_str(&buf, "data read error, "); - if (!ret) { - prt_str(&buf, "successful retry"); - if (rbio->self_healing) - prt_str(&buf, ", self healing"); - } else - prt_str(&buf, bch2_err_str(ret)); - prt_newline(&buf); - - - if (!bkey_deleted(&sk.k->k)) { - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); - prt_newline(&buf); - } - - bch2_io_failures_to_text(&buf, c, &failed); - - bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - - bch2_rbio_done(rbio); - bch2_bkey_buf_exit(&sk, c); - bch2_trans_put(trans); -} - -static void bch2_rbio_error(struct bch_read_bio *rbio, - int ret, blk_status_t blk_error) -{ - BUG_ON(ret >= 0); - - rbio->ret = ret; - rbio->bio.bi_status = blk_error; - - bch2_rbio_parent(rbio)->saw_error = true; - - if (rbio->flags & BCH_READ_in_retry) - return; - - if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { - bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_dfl_wq); - } else { - rbio = bch2_rbio_free(rbio); - - rbio->ret = ret; - rbio->bio.bi_status = blk_error; - - bch2_rbio_done(rbio); - } -} - -static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, - struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; - struct bch_extent_crc_unpacked new_crc; - struct btree_iter iter; - struct bkey_i *new; - struct bkey_s_c k; - int ret = 0; - - if (crc_is_compressed(rbio->pick.crc)) - return 0; - - k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_slots|BTREE_ITER_intent); - if ((ret = bkey_err(k))) - goto out; - - if (bversion_cmp(k.k->bversion, rbio->version) || - !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) - goto out; - - /* Extent was merged? 
*/ - if (bkey_start_offset(k.k) < data_offset || - k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) - goto out; - - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(k.k) - data_offset, k.k->size, - rbio->pick.crc.csum_type)) { - bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); - ret = 0; - goto out; - } - - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - sizeof(struct bch_extent_crc128)); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - - if (!bch2_bkey_narrow_crcs(new, new_crc)) - goto out; - - ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -{ - bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_rbio_narrow_crcs(trans, rbio)); -} - -static void bch2_read_decompress_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_str(&buf, "decompression error"); - - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); - printbuf_exit(&buf); -} - -static void bch2_read_decrypt_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_str(&buf, "decrypt error"); - - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); - printbuf_exit(&buf); -} - -/* Inner part that may run in process context */ -static void __bch2_read_endio(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
-        struct bch_read_bio *parent = bch2_rbio_parent(rbio);
-        struct bio *src = &rbio->bio;
-        struct bio *dst = &parent->bio;
-        struct bvec_iter dst_iter = rbio->bvec_iter;
-        struct bch_extent_crc_unpacked crc = rbio->pick.crc;
-        struct nonce nonce = extent_nonce(rbio->version, crc);
-        unsigned nofs_flags;
-        struct bch_csum csum;
-        int ret;
-
-        nofs_flags = memalloc_nofs_save();
-
-        /* Reset iterator for checksumming and copying bounced data: */
-        if (rbio->bounce) {
-                src->bi_iter.bi_size = crc.compressed_size << 9;
-                src->bi_iter.bi_idx = 0;
-                src->bi_iter.bi_bvec_done = 0;
-        } else {
-                src->bi_iter = rbio->bvec_iter;
-        }
-
-        bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
-
-        csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
-        bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
-
-        /*
-         * Checksum error: if the bio wasn't bounced, we may have been
-         * reading into buffers owned by userspace (that userspace can
-         * scribble over) - retry the read, bouncing it this time:
-         */
-        if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
-                rbio->flags |= BCH_READ_must_bounce;
-                bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
-                                BLK_STS_IOERR);
-                goto out;
-        }
-
-        bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
-
-        if (!csum_good)
-                goto csum_err;
-
-        /*
-         * XXX
-         * We need to rework the narrow_crcs path to deliver the read completion
-         * first, and then punt to a different workqueue, otherwise we're
-         * holding up reads while doing btree updates which is bad for memory
-         * reclaim.
-         */
-        if (unlikely(rbio->narrow_crcs))
-                bch2_rbio_narrow_crcs(rbio);
-
-        if (likely(!parent->data_update)) {
-                /* Adjust crc to point to subset of data we want: */
-                crc.offset += rbio->offset_into_extent;
-                crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
-
-                if (crc_is_compressed(crc)) {
-                        ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-                        if (ret)
-                                goto decrypt_err;
-
-                        if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
-                            !c->opts.no_data_io)
-                                goto decompression_err;
-                } else {
-                        /* don't need to decrypt the entire bio: */
-                        nonce = nonce_add(nonce, crc.offset << 9);
-                        bio_advance(src, crc.offset << 9);
-
-                        BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
-                        src->bi_iter.bi_size = dst_iter.bi_size;
-
-                        ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-                        if (ret)
-                                goto decrypt_err;
-
-                        if (rbio->bounce) {
-                                struct bvec_iter src_iter = src->bi_iter;
-
-                                bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
-                        }
-                }
-        } else {
-                if (rbio->split)
-                        rbio->parent->pick = rbio->pick;
-
-                if (rbio->bounce) {
-                        struct bvec_iter src_iter = src->bi_iter;
-
-                        bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
-                }
-        }
-
-        if (rbio->promote) {
-                /*
-                 * Re-encrypt data we decrypted, so it's consistent with
-                 * rbio->crc:
-                 */
-                ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-                if (ret)
-                        goto decrypt_err;
-        }
-
-        if (likely(!(rbio->flags & BCH_READ_in_retry))) {
-                rbio = bch2_rbio_free(rbio);
-                bch2_rbio_done(rbio);
-        }
-out:
-        memalloc_nofs_restore(nofs_flags);
-        return;
-csum_err:
-        bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
-        goto out;
-decompression_err:
-        bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_dfl_wq);
-        goto out;
-decrypt_err:
-        bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_dfl_wq);
-        goto out;
-}
-
-static void bch2_read_endio(struct bio *bio)
-{
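        /*
         * Illustrative summary, not in the original source - the dispatch
         * below chooses where to run __bch2_read_endio() based on how much
         * work the completion needs, roughly:
         *
         *   btree update, decompression or decryption needed
         *     -> RBIO_CONTEXT_UNBOUND on system_dfl_wq
         *   only a checksum to verify
         *     -> RBIO_CONTEXT_HIGHPRI on system_highpri_wq
         *   nothing to do
         *     -> run directly in the endio context
         */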
struct bch_read_bio *rbio = - container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct workqueue_struct *wq = NULL; - enum rbio_context context = RBIO_CONTEXT_NULL; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, - rbio->submit_time, !bio->bi_status); - - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - - if (unlikely(bio->bi_status)) { - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); - return; - } - - if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || - (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { - trace_and_count(c, io_read_reuse_race, &rbio->bio); - - if (rbio->flags & BCH_READ_retry_if_stale) - bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); - else - bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); - return; - } - - if (rbio->narrow_crcs || - rbio->promote || - crc_is_compressed(rbio->pick.crc) || - bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - context = RBIO_CONTEXT_UNBOUND, wq = system_dfl_wq; - else if (rbio->pick.crc.csum_type) - context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; - - bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -} - -static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c k, - struct bch_extent_ptr ptr) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(ca, &ptr), - BTREE_ITER_cached); - - int gen = bucket_gen_get(ca, iter.pos.offset); - if (gen >= 0) { - prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); - printbuf_indent_add(&buf, 2); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - prt_printf(&buf, "memory gen: %u", gen); - - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter))); - if (!ret) { - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - } - } else { - prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", - iter.pos.inode, iter.pos.offset); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "first bucket %u nbuckets %llu\n", - ca->mi.first_bucket, ca->mi.nbuckets); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - } - - bch2_fs_inconsistent(c, "%s", buf.buf); - - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); -} - -int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags, int dev) -{ - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct bch_read_bio *rbio = NULL; - bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos data_pos = bkey_start_pos(k.k); - struct data_update *u = rbio_data_update(orig); - int ret = 0; - - if (bkey_extent_is_inline_data(k.k)) { - unsigned bytes = min_t(unsigned, iter.bi_size, - bkey_inline_data_bytes(k.k)); - - swap(iter.bi_size, bytes); - memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); - swap(iter.bi_size, bytes); - bio_advance_iter(&orig->bio, &iter, bytes); - zero_fill_bio_iter(&orig->bio, iter); - this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], - bvec_iter_sectors(iter)); - goto out_read_done; - } - - if ((bch2_bkey_extent_flags(k) & 
BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && - !orig->data_update) - return bch_err_throw(c, extent_poisoned); -retry_pick: - ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); - - /* hole or reservation - just zero fill: */ - if (!ret) - goto hole; - - if (unlikely(ret < 0)) { - if (ret == -BCH_ERR_data_read_csum_err) { - int ret2 = maybe_poison_extent(trans, orig, data_btree, k); - if (ret2) { - ret = ret2; - goto err; - } - - trace_and_count(c, io_read_fail_and_poison, &orig->bio); - } - - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "%s\n ", bch2_err_str(ret)); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - goto err; - } - - if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && - !c->chacha20_key_set) { - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, data_read_no_encryption_key); - goto err; - } - - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_io_read); - - /* - * Stale dirty pointers are treated as IO errors, but @failed isn't - * allocated unless we're in the retry path - so if we're not in the - * retry path, don't check here, it'll be caught in bch2_read_endio() - * and we'll end up in the retry path: - */ - if ((flags & BCH_READ_in_retry) && - !pick.ptr.cached && - ca && - unlikely(dev_ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); - bch2_mark_io_failure(failed, &pick, false); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); - goto retry_pick; - } - - if (likely(!u)) { - if (!(flags & BCH_READ_last_fragment) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_must_clone; - - narrow_crcs = !(flags & BCH_READ_in_retry) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_user_mapped)) - flags |= BCH_READ_must_bounce; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_user_mapped)) || - (flags & BCH_READ_must_bounce)))) { - read_full = true; - bounce = true; - } - } else { - /* - * can happen if we retry, and the extent we were going to read - * has been merged in the meantime: - */ - if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { - if (ca) - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_io_read); - rbio->ret = bch_err_throw(c, data_read_buffer_too_small); - goto out_read_done; - } - - iter.bi_size = pick.crc.compressed_size << 9; - read_full = true; - } - - if (orig->opts.promote_target || have_io_error(failed)) - rbio = promote_alloc(trans, iter, k, &pick, flags, orig, - &bounce, &read_full, failed); - - if (!read_full) { - EBUG_ON(crc_is_compressed(pick.crc)); - EBUG_ON(pick.crc.csum_type && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - bvec_iter_sectors(iter) != pick.crc.live_size || - pick.crc.offset || - offset_into_extent)); - - data_pos.offset += offset_into_extent; - pick.ptr.offset += pick.crc.offset + - offset_into_extent; - offset_into_extent = 0; - pick.crc.compressed_size = 
bvec_iter_sectors(iter); - pick.crc.uncompressed_size = bvec_iter_sectors(iter); - pick.crc.offset = 0; - pick.crc.live_size = bvec_iter_sectors(iter); - } - - if (rbio) { - /* - * promote already allocated bounce rbio: - * promote needs to allocate a bio big enough for uncompressing - * data in the write path, but we're not going to use it all - * here: - */ - EBUG_ON(rbio->bio.bi_iter.bi_size < - pick.crc.compressed_size << 9); - rbio->bio.bi_iter.bi_size = - pick.crc.compressed_size << 9; - } else if (bounce) { - unsigned sectors = pick.crc.compressed_size; - - rbio = rbio_init_fragment(bio_alloc_bioset(NULL, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - 0, - GFP_NOFS, - &c->bio_read_split), - orig); - - bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - rbio->bounce = true; - } else if (flags & BCH_READ_must_clone) { - /* - * Have to clone if there were any splits, due to error - * reporting issues (if a split errored, and retrying didn't - * work, when it reports the error to its parent (us) we don't - * know if the error was from our bio, and we should retry, or - * from the whole bio, in which case we don't want to retry and - * lose the error) - */ - rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, - &c->bio_read_split), - orig); - rbio->bio.bi_iter = iter; - } else { - rbio = orig; - rbio->bio.bi_iter = iter; - EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); - } - - EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - - rbio->submit_time = local_clock(); - if (!rbio->split) - rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; - rbio->offset_into_extent= offset_into_extent; - rbio->flags = flags; - rbio->have_ioref = ca != NULL; - rbio->narrow_crcs = narrow_crcs; - rbio->ret = 0; - rbio->context = 0; - rbio->pick = pick; - rbio->subvol = orig->subvol; - rbio->read_pos = read_pos; - rbio->data_btree = data_btree; - rbio->data_pos = data_pos; - rbio->version = k.k->bversion; - INIT_WORK(&rbio->work, NULL); - - rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick.ptr.offset; - rbio->bio.bi_end_io = bch2_read_endio; - - async_object_list_add(c, rbio, rbio, &rbio->list_idx); - - if (rbio->bounce) - trace_and_count(c, io_read_bounce, &rbio->bio); - - if (!u) - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); - else - this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); - bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - - /* - * If it's being moved internally, we don't want to flag it as a cache - * hit: - */ - if (ca && pick.ptr.cached && !u) - bch2_bucket_io_time_reset(trans, pick.ptr.dev, - PTR_BUCKET_NR(ca, &pick.ptr), READ); - - if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { - bio_inc_remaining(&orig->bio); - trace_and_count(c, io_read_split, &orig->bio); - } - - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - if (!(flags & BCH_READ_in_retry)) - bch2_trans_unlock(trans); - else - bch2_trans_unlock_long(trans); - - if (likely(!rbio->pick.do_ec_reconstruct)) { - if (unlikely(!rbio->have_ioref)) { - bch2_rbio_error(rbio, - -BCH_ERR_data_read_retry_device_offline, - BLK_STS_IOERR); - goto out; - } - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], - bio_sectors(&rbio->bio)); - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - - if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_in_retry))) - bio_endio(&rbio->bio); - } else { - if (likely(!(flags & BCH_READ_in_retry))) - 
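                        /*
                         * Illustrative note, not in the original source:
                         * outside the retry path the bio completes
                         * asynchronously via bch2_read_endio(); under
                         * BCH_READ_in_retry it is submitted with
                         * submit_bio_wait() and the endio handler is run
                         * synchronously below, so the retry loop sees the
                         * result immediately.
                         */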
submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); - } - - /* - * We just submitted IO which may block, we expect relock fail - * events and shouldn't count them: - */ - trans->notrace_relock_fail = true; - } else { - /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(trans, rbio, k)) { - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err, - BLK_STS_IOERR); - goto out; - } - - if (likely(!(flags & BCH_READ_in_retry))) - bio_endio(&rbio->bio); - } -out: - if (likely(!(flags & BCH_READ_in_retry))) { - return 0; - } else { - bch2_trans_unlock(trans); - - int ret; - - rbio->context = RBIO_CONTEXT_UNBOUND; - bch2_read_endio(&rbio->bio); - - ret = rbio->ret; - rbio = bch2_rbio_free(rbio); - - if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) - bch2_mark_io_failure(failed, &pick, - ret == -BCH_ERR_data_read_retry_csum_err); - - return ret; - } - -err: - if (flags & BCH_READ_in_retry) - return ret; - - orig->bio.bi_status = BLK_STS_IOERR; - orig->ret = ret; - goto out_read_done; - -hole: - this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], - bvec_iter_sectors(iter)); - /* - * won't normally happen in the data update (bch2_move_extent()) path, - * but if we retry and the extent we wanted to read no longer exists we - * have to signal that: - */ - if (u) - orig->ret = bch_err_throw(c, data_read_key_overwritten); - - zero_fill_bio_iter(&orig->bio, iter); -out_read_done: - if ((flags & BCH_READ_last_fragment) && - !(flags & BCH_READ_in_retry)) - bch2_rbio_done(orig); - return 0; -} - -int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, - struct bkey_buf *prev_read, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - enum btree_id data_btree; - int ret; - - EBUG_ON(rbio->data_update); - - bch2_bkey_buf_init(&sk); - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, bvec_iter.bi_sector), - BTREE_ITER_slots); - - while (1) { - data_btree = BTREE_ID_extents; - - bch2_trans_begin(trans); - - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - bch2_btree_iter_set_pos(trans, &iter, - POS(inum.inum, bvec_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - s64 offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - unsigned sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - goto err; - - k = bkey_i_to_s_c(sk.k); - - if (unlikely(flags & BCH_READ_in_retry)) { - if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) - failed->nr = 0; - bch2_bkey_buf_copy(prev_read, c, sk.k); - } - - /* - * With indirect extents, the amount of data to read is the min - * of the original extent and the indirect extent: - */ - sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - - unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_last_fragment; - - ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, - data_btree, k, - offset_into_extent, failed, flags, -1); - swap(bvec_iter.bi_size, bytes); - - if (ret) - goto err; - - if (flags & BCH_READ_last_fragment) - 
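                /*
                 * Illustrative note, not in the original source: each pass of
                 * this loop reads one extent's worth of data - bytes is
                 * clamped to min(extent size, remaining bio) above - and
                 * BCH_READ_last_fragment is set once the whole bio is
                 * covered, ending the loop here.
                 */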
break; - - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); -err: - if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) - flags |= BCH_READ_must_bounce; - - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_data_read_retry)) - break; - } - - if (unlikely(ret)) { - if (ret != -BCH_ERR_extent_poisoned) { - struct printbuf buf = PRINTBUF; - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, inum, - bvec_iter.bi_sector << 9)); - prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - rbio->bio.bi_status = BLK_STS_IOERR; - rbio->ret = ret; - - if (!(flags & BCH_READ_in_retry)) - bch2_rbio_done(rbio); - } - - bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&sk, c); - return ret; -} - -static const char * const bch2_read_bio_flags[] = { -#define x(n) #n, - BCH_READ_FLAGS() -#undef x - NULL -}; - -void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio) -{ - u64 now = local_clock(); - prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0); - prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0); - - if (!rbio->split) - prt_printf(out, "end_io:\t%ps\n", rbio->end_io); - else - prt_printf(out, "parent:\t%px\n", rbio->parent); - - prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io); - - prt_printf(out, "promote:\t%u\n", rbio->promote); - prt_printf(out, "bounce:\t%u\n", rbio->bounce); - prt_printf(out, "split:\t%u\n", rbio->split); - prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); - prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); - prt_printf(out, "context:\t%u\n", rbio->context); - - int ret = READ_ONCE(rbio->ret); - if (ret < 0) - prt_printf(out, "ret:\t%s\n", bch2_err_str(ret)); - else - prt_printf(out, "ret:\t%i\n", ret); - - prt_printf(out, "flags:\t"); - bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); - prt_newline(out); - - bch2_bio_to_text(out, &rbio->bio); -} - -void bch2_fs_io_read_exit(struct bch_fs *c) -{ - if (c->promote_table.tbl) - rhashtable_destroy(&c->promote_table); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); - mempool_exit(&c->bio_bounce_pages); -} - -int bch2_fs_io_read_init(struct bch_fs *c) -{ - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return bch_err_throw(c, ENOMEM_bio_bounce_pages_init); - - if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return bch_err_throw(c, ENOMEM_bio_read_init); - - if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return bch_err_throw(c, ENOMEM_bio_read_split_init); - - if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return bch_err_throw(c, ENOMEM_promote_table_init); - - return 0; -} diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h deleted file mode 100644 index 9c5ddbf861b39c..00000000000000 --- a/fs/bcachefs/io_read.h +++ /dev/null @@ -1,216 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_READ_H -#define _BCACHEFS_IO_READ_H - -#include "bkey_buf.h" -#include "btree_iter.h" -#include "extents_types.h" -#include "reflink.h" - -struct bch_read_bio { - struct bch_fs *c; - u64 start_time; - u64 submit_time; - - /* - * Reads will often have to be split, and if the extent being read from - * was checksummed or 
compressed we'll also have to allocate bounce - * buffers and copy the data back into the original bio. - * - * If we didn't have to split, we have to save and restore the original - * bi_end_io - @split below indicates which: - */ - union { - struct bch_read_bio *parent; - bio_end_io_t *end_io; - }; - - /* - * Saved copy of bio->bi_iter, from submission time - allows us to - * resubmit on IO error, and also to copy data back to the original bio - * when we're bouncing: - */ - struct bvec_iter bvec_iter; - - unsigned offset_into_extent; - - u16 flags; - union { - struct { - u16 data_update:1, - promote:1, - bounce:1, - split:1, - have_ioref:1, - narrow_crcs:1, - saw_error:1, - self_healing:1, - context:2; - }; - u16 _state; - }; - s16 ret; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - - struct extent_ptr_decoded pick; - - /* - * pos we read from - different from data_pos for indirect extents: - */ - u32 subvol; - struct bpos read_pos; - - /* - * start pos of data we read (may not be pos of data we want) - for - * promote, narrow extents paths: - */ - enum btree_id data_btree; - struct bpos data_pos; - struct bversion version; - - struct bch_io_opts opts; - - struct work_struct work; - - struct bio bio; -}; - -#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio) - -struct bch_devs_mask; -struct cache_promote_op; -struct extent_ptr_decoded; - -static inline int bch2_read_indirect_extent(struct btree_trans *trans, - enum btree_id *data_btree, - s64 *offset_into_extent, - struct bkey_buf *extent) -{ - if (extent->k->k.type != KEY_TYPE_reflink_p) - return 0; - - *data_btree = BTREE_ID_reflink; - - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, - offset_into_extent, - bkey_i_to_s_c_reflink_p(extent->k), - true, 0); - int ret = bkey_err(k); - if (ret) - return ret; - - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); - return bch_err_throw(c, missing_indirect_extent); - } - - bch2_bkey_buf_reassemble(extent, c, k); - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -#define BCH_READ_FLAGS() \ - x(retry_if_stale) \ - x(may_promote) \ - x(user_mapped) \ - x(last_fragment) \ - x(must_bounce) \ - x(must_clone) \ - x(in_retry) - -enum __bch_read_flags { -#define x(n) __BCH_READ_##n, - BCH_READ_FLAGS() -#undef x -}; - -enum bch_read_flags { -#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), - BCH_READ_FLAGS() -#undef x -}; - -int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bpos, enum btree_id, - struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned, int); - -static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, unsigned flags) -{ - int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags, -1); - /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ - WARN(ret, "unhandled error from __bch2_read_extent()"); -} - -int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, - struct bch_io_failures *, struct bkey_buf *, unsigned flags); - -static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum) -{ - BUG_ON(rbio->_state); - - rbio->subvol = inum.subvol; - - bch2_trans_run(c, - __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, 
NULL, - BCH_READ_retry_if_stale| - BCH_READ_may_promote| - BCH_READ_user_mapped)); -} - -static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, - struct bch_read_bio *orig) -{ - struct bch_read_bio *rbio = to_rbio(bio); - - rbio->c = orig->c; - rbio->_state = 0; - rbio->flags = 0; - rbio->ret = 0; - rbio->split = true; - rbio->parent = orig; - rbio->opts = orig->opts; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - rbio->list_idx = 0; -#endif - return rbio; -} - -static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_fs *c, - struct bch_io_opts opts, - bio_end_io_t end_io) -{ - struct bch_read_bio *rbio = to_rbio(bio); - - rbio->start_time = local_clock(); - rbio->c = c; - rbio->_state = 0; - rbio->flags = 0; - rbio->ret = 0; - rbio->opts = opts; - rbio->bio.bi_end_io = end_io; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - rbio->list_idx = 0; -#endif - return rbio; -} - -struct promote_op; -void bch2_promote_op_to_text(struct printbuf *, struct promote_op *); -void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *); - -void bch2_fs_io_read_exit(struct bch_fs *); -int bch2_fs_io_read_init(struct bch_fs *); - -#endif /* _BCACHEFS_IO_READ_H */ diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c deleted file mode 100644 index 88b1eec8eff319..00000000000000 --- a/fs/bcachefs/io_write.c +++ /dev/null @@ -1,1780 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_update.h" -#include "buckets.h" -#include "checksum.h" -#include "clock.h" -#include "compress.h" -#include "debug.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extent_update.h" -#include "inode.h" -#include "io_write.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "nocow_locking.h" -#include "rebalance.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" -#include "trace.h" - -#include -#include -#include -#include - -#ifdef CONFIG_BCACHEFS_DEBUG -static unsigned bch2_write_corrupt_ratio; -module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); -MODULE_PARM_DESC(write_corrupt_ratio, ""); -#endif - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - -static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, - u64 now, int rw) -{ - u64 latency_capable = - ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; - /* ideally we'd be taking into account the device's variance here: */ - u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); - s64 latency_over = io_latency - latency_threshold; - - if (latency_threshold && latency_over > 0) { - /* - * bump up congested by approximately latency_over * 4 / - * latency_threshold - we don't need much accuracy here so don't - * bother with the divide: - */ - if (atomic_read(&ca->congested) < CONGESTED_MAX) - atomic_add(latency_over >> - max_t(int, ilog2(latency_threshold) - 2, 0), - &ca->congested); - - ca->congested_last = now; - } else if (atomic_read(&ca->congested) > 0) { - atomic_dec(&ca->congested); - } -} - -void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) -{ - atomic64_t *latency = &ca->cur_latency[rw]; - u64 now = local_clock(); - u64 io_latency = time_after64(now, submit_time) - ? 
now - submit_time - : 0; - u64 old, new; - - old = atomic64_read(latency); - do { - /* - * If the io latency was reasonably close to the current - * latency, skip doing the update and atomic operation - most of - * the time: - */ - if (abs((int) (old - io_latency)) < (old >> 1) && - now & ~(~0U << 5)) - break; - - new = ewma_add(old, io_latency, 5); - } while (!atomic64_try_cmpxchg(latency, &old, new)); - - bch2_congested_acct(ca, io_latency, now, rw); - - __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); -} - -#endif - -/* Allocate, free from mempool: */ - -void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) -{ - struct bvec_iter_all iter; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, iter) - if (bv->bv_page != ZERO_PAGE(0)) - mempool_free(bv->bv_page, &c->bio_bounce_pages); - bio->bi_vcnt = 0; -} - -static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) -{ - struct page *page; - - if (likely(!*using_mempool)) { - page = alloc_page(GFP_NOFS); - if (unlikely(!page)) { - mutex_lock(&c->bio_bounce_pages_lock); - *using_mempool = true; - goto pool_alloc; - - } - } else { -pool_alloc: - page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); - } - - return page; -} - -void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t size) -{ - bool using_mempool = false; - - while (size) { - struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min_t(size_t, PAGE_SIZE, size); - - BUG_ON(!bio_add_page(bio, page, len, 0)); - size -= len; - } - - if (using_mempool) - mutex_unlock(&c->bio_bounce_pages_lock); -} - -/* Extent update path: */ - -int bch2_sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool *usage_increasing, - s64 *i_sectors_delta, - s64 *disk_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c old; - unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); - bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); - int ret = 0; - - *usage_increasing = false; - *i_sectors_delta = 0; - *disk_sectors_delta = 0; - - bch2_trans_copy_iter(trans, &iter, extent_iter); - - for_each_btree_key_max_continue_norestart(trans, iter, - new->k.p, BTREE_ITER_slots, old, ret) { - s64 sectors = min(new->k.p.offset, old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k)); - - *i_sectors_delta += sectors * - (bkey_extent_is_allocation(&new->k) - - bkey_extent_is_allocation(old.k)); - - *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); - *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot - ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) - : 0; - - if (!*usage_increasing && - (new->k.p.snapshot != old.k->p.snapshot || - new_replicas > bch2_bkey_replicas(c, old) || - (!new_compressed && bch2_bkey_sectors_compressed(old)))) - *usage_increasing = true; - - if (bkey_ge(old.k->p, new->k.p)) - break; - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, - struct btree_iter *extent_iter, - u64 new_i_size, - s64 i_sectors_delta) -{ - /* - * Crazy performance optimization: - * Every extent update needs to also update the inode: the inode trigger - * will set bi->journal_seq to the journal sequence number of this - * transaction - for fsync. 
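 * (Illustrative example, not in the original comment: an overwrite of
 * already-allocated space that changes neither bi_size nor bi_sectors
 * only needs the bi_journal_seq bump.)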
- *
- * But if that's the only reason we're updating the inode (we're not
- * updating bi_size or bi_sectors), then we don't need the inode update
- * to be journalled - if we crash, the bi_journal_seq update will be
- * lost, but that's fine.
- */
-        unsigned inode_update_flags = BTREE_UPDATE_nojournal;
-
-        struct btree_iter iter;
-        struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-                              SPOS(0,
-                                   extent_iter->pos.inode,
-                                   extent_iter->snapshot),
-                              BTREE_ITER_intent|
-                              BTREE_ITER_cached);
-        int ret = bkey_err(k);
-        if (unlikely(ret))
-                return ret;
-
-        /*
-         * varint_decode_fast(), in the inode .invalid method, reads up to 7
-         * bytes past the end of the buffer:
-         */
-        struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
-        ret = PTR_ERR_OR_ZERO(k_mut);
-        if (unlikely(ret))
-                goto err;
-
-        bkey_reassemble(k_mut, k);
-
-        if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
-                k_mut = bch2_inode_to_v3(trans, k_mut);
-                ret = PTR_ERR_OR_ZERO(k_mut);
-                if (unlikely(ret))
-                        goto err;
-        }
-
-        struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
-
-        if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
-            new_i_size > le64_to_cpu(inode->v.bi_size)) {
-                inode->v.bi_size = cpu_to_le64(new_i_size);
-                inode_update_flags = 0;
-        }
-
-        if (i_sectors_delta) {
-                s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors);
-                if (unlikely(bi_sectors + i_sectors_delta < 0)) {
-                        struct bch_fs *c = trans->c;
-                        struct printbuf buf = PRINTBUF;
-                        bch2_log_msg_start(c, &buf);
-                        prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
-                                   extent_iter->pos.inode, bi_sectors, i_sectors_delta);
-
-                        bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
-                        if (print)
-                                bch2_print_str(c, KERN_ERR, buf.buf);
-                        printbuf_exit(&buf);
-
-                        if (i_sectors_delta < 0)
-                                i_sectors_delta = -bi_sectors;
-                        else
-                                i_sectors_delta = 0;
-                }
-
-                le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
-                inode_update_flags = 0;
-        }
-
-        /*
-         * extent, dirent and xattr updates require that an inode update also
-         * happens - to ensure that if a key exists in one of those btrees with
-         * a given snapshot ID an inode is also present - so we may have to skip
-         * the nojournal optimization:
-         */
-        if (inode->k.p.snapshot != iter.snapshot) {
-                inode->k.p.snapshot = iter.snapshot;
-                inode_update_flags = 0;
-        }
-
-        ret = bch2_trans_update(trans, &iter, &inode->k_i,
-                                BTREE_UPDATE_internal_snapshot_node|
-                                inode_update_flags);
-err:
-        bch2_trans_iter_exit(trans, &iter);
-        return ret;
-}
-
-int bch2_extent_update(struct btree_trans *trans,
-                       subvol_inum inum,
-                       struct btree_iter *iter,
-                       struct bkey_i *k,
-                       struct disk_reservation *disk_res,
-                       u64 new_i_size,
-                       s64 *i_sectors_delta_total,
-                       bool check_enospc)
-{
-        struct bpos next_pos;
-        bool usage_increasing;
-        s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-        int ret;
-
-        /*
-         * This traverses the iterator for us without changing iter->path->pos
-         * to search_key() (which is pos + 1 for extents): we want there to be
-         * a path already traversed at iter->pos because
-         * bch2_trans_extent_update() will use it to attempt extent merging
-         */
-        ret = __bch2_btree_iter_traverse(trans, iter);
-        if (ret)
-                return ret;
-
-        ret = bch2_extent_trim_atomic(trans, iter, k);
-        if (ret)
-                return ret;
-
-        next_pos = k->k.p;
-
-        ret = bch2_sum_sector_overwrites(trans, iter, k,
-                                         &usage_increasing,
-                                         &i_sectors_delta,
-                                         &disk_sectors_delta);
-        if (ret)
-                return ret;
-
-        if (disk_res &&
-            disk_sectors_delta > (s64) disk_res->sectors) {
-                ret =
bch2_disk_reservation_add(trans->c, disk_res, - disk_sectors_delta - disk_res->sectors, - !check_enospc || !usage_increasing - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - return ret; - } - - /* - * Note: - * We always have to do an inode update - even when i_size/i_sectors - * aren't changing - for fsync to work properly; fsync relies on - * inode->bi_journal_seq which is updated by the trigger code: - */ - ret = bch2_extent_update_i_size_sectors(trans, iter, - min(k->k.p.offset << 9, new_i_size), - i_sectors_delta) ?: - bch2_trans_update(trans, iter, k, 0) ?: - bch2_trans_commit(trans, disk_res, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc); - if (unlikely(ret)) - return ret; - - if (i_sectors_delta_total) - *i_sectors_delta_total += i_sectors_delta; - bch2_btree_iter_set_pos(trans, iter, next_pos); - return 0; -} - -static int bch2_write_index_default(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct bkey_buf sk; - struct keylist *keys = &op->insert_keys; - struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - subvol_inum inum = { - .subvol = op->subvol, - .inum = k->k.p.inode, - }; - int ret; - - BUG_ON(!inum.subvol); - - bch2_bkey_buf_init(&sk); - - do { - bch2_trans_begin(trans); - - k = bch2_keylist_front(keys); - bch2_bkey_buf_copy(&sk, c, k); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, - &sk.k->k.p.snapshot); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - bkey_start_pos(&sk.k->k), - BTREE_ITER_slots|BTREE_ITER_intent); - - ret = bch2_extent_update(trans, inum, &iter, sk.k, - &op->res, - op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_check_enospc); - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (bkey_ge(iter.pos, k->k.p)) - bch2_keylist_pop_front(&op->insert_keys); - else - bch2_cut_front(iter.pos, k); - } while (!bch2_keylist_empty(keys)); - - bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); - - return ret; -} - -/* Writes */ - -void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) -{ - struct printbuf buf = PRINTBUF; - - if (op->subvol) { - bch2_inum_offset_err_msg(op->c, &buf, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - } else { - struct bpos pos = op->pos; - pos.offset = offset; - bch2_inum_snap_offset_err_msg(op->c, &buf, pos); - } - - prt_str(&buf, "write error: "); - - va_list args; - va_start(args, fmt); - prt_vprintf(&buf, fmt, args); - va_end(args); - - if (op->flags & BCH_WRITE_move) { - struct data_update *u = container_of(op, struct data_update, op); - - prt_printf(&buf, "\n from internal move "); - bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); - } - - bch_err_ratelimited(op->c, "%s", buf.buf); - printbuf_exit(&buf); -} - -void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, - enum bch_data_type type, - const struct bkey_i *k, - bool nocow) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - struct bch_write_bio *n; - unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE; - unsigned ref_idx = type == BCH_DATA_btree - ? 
BCH_DEV_READ_REF_btree_node_write - : BCH_DEV_WRITE_REF_io_write; - - BUG_ON(c->opts.nochanges); - - const struct bch_extent_ptr *last = NULL; - bkey_for_each_ptr(ptrs, ptr) - last = ptr; - - bkey_for_each_ptr(ptrs, ptr) { - /* - * XXX: btree writes should be using io_ref[WRITE], but we - * aren't retrying failed btree writes yet (due to device - * removal/ro): - */ - struct bch_dev *ca = nocow - ? bch2_dev_have_ref(c, ptr->dev) - : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx); - - if (ptr != last) { - n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); - - n->bio.bi_end_io = wbio->bio.bi_end_io; - n->bio.bi_private = wbio->bio.bi_private; - n->parent = wbio; - n->split = true; - n->bounce = false; - n->put_bio = true; - n->bio.bi_opf = wbio->bio.bi_opf; - bio_inc_remaining(&wbio->bio); - } else { - n = wbio; - n->split = false; - } - - n->c = c; - n->dev = ptr->dev; - n->have_ioref = ca != NULL; - n->nocow = nocow; - n->submit_time = local_clock(); - n->inode_offset = bkey_start_offset(&k->k); - if (nocow) - n->nocow_bucket = PTR_BUCKET_NR(ca, ptr); - n->bio.bi_iter.bi_sector = ptr->offset; - - if (likely(n->have_ioref)) { - this_cpu_add(ca->io_done->sectors[WRITE][type], - bio_sectors(&n->bio)); - - bio_set_dev(&n->bio, ca->disk_sb.bdev); - - if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { - bio_endio(&n->bio); - continue; - } - - submit_bio(&n->bio); - } else { - n->bio.bi_status = BLK_STS_REMOVED; - bio_endio(&n->bio); - } - } -} - -static void __bch2_write(struct bch_write_op *); - -static void bch2_write_done(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; - - EBUG_ON(op->open_buckets.nr); - - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - bch2_disk_reservation_put(c, &op->res); - - if (!(op->flags & BCH_WRITE_move)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_write); - bch2_keylist_free(&op->insert_keys, op->inline_keys); - - EBUG_ON(cl->parent); - closure_debug_destroy(cl); - async_object_list_del(c, write_op, op->list_idx); - if (op->end_io) - op->end_io(op); -} - -static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct keylist *keys = &op->insert_keys; - struct bkey_i *src, *dst = keys->keys, *n; - - for (src = keys->keys; src != keys->top; src = n) { - n = bkey_next(src); - - if (bkey_extent_is_direct_data(&src->k)) { - bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, - test_bit(ptr->dev, op->failed.d)); - - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return bch_err_throw(c, data_write_io); - } - - if (dst != src) - memmove_u64s_down(dst, src, src->k.u64s); - dst = bkey_next(dst); - } - - keys->top = dst; - return 0; -} - -/** - * __bch2_write_index - after a write, update index to point to new data - * @op: bch_write_op to process - */ -static void __bch2_write_index(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct keylist *keys = &op->insert_keys; - unsigned dev; - int ret = 0; - - if (unlikely(op->flags & BCH_WRITE_io_error)) { - ret = bch2_write_drop_io_error_ptrs(op); - if (ret) - goto err; - } - - if (!bch2_keylist_empty(keys)) { - u64 sectors_start = keylist_sectors(keys); - - ret = !(op->flags & BCH_WRITE_move) - ? 
bch2_write_index_default(op)
-                        : bch2_data_update_index_update(op);
-
-                BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
-                BUG_ON(keylist_sectors(keys) && !ret);
-
-                op->written += sectors_start - keylist_sectors(keys);
-
-                if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
-                        struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
-                        bch2_write_op_error(op, bkey_start_offset(&insert->k),
-                                            "btree update error: %s", bch2_err_str(ret));
-                }
-
-                if (ret)
-                        goto err;
-        }
-out:
-        /* If a bucket wasn't written, we can't erasure code it: */
-        for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
-                bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);
-
-        bch2_open_buckets_put(c, &op->open_buckets);
-        return;
-err:
-        keys->top = keys->keys;
-        op->error = ret;
-        op->flags |= BCH_WRITE_submitted;
-        goto out;
-}
-
-static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
-{
-        if (state != wp->state) {
-                struct task_struct *p = current;
-                u64 now = ktime_get_ns();
-                u64 runtime = p->se.sum_exec_runtime +
-                        (now - p->se.exec_start);
-
-                if (state == WRITE_POINT_runnable)
-                        wp->last_runtime = runtime;
-                else if (wp->state == WRITE_POINT_runnable)
-                        wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;
-
-                if (wp->last_state_change &&
-                    time_after64(now, wp->last_state_change))
-                        wp->time[wp->state] += now - wp->last_state_change;
-                wp->state = state;
-                wp->last_state_change = now;
-        }
-}
-
-static inline void wp_update_state(struct write_point *wp, bool running)
-{
-        enum write_point_state state;
-
-        state = running ? WRITE_POINT_runnable:
-                !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
-                                         : WRITE_POINT_stopped;
-
-        __wp_update_state(wp, state);
-}
-
-static CLOSURE_CALLBACK(bch2_write_index)
-{
-        closure_type(op, struct bch_write_op, cl);
-        struct write_point *wp = op->wp;
-        struct workqueue_struct *wq = index_update_wq(op);
-        unsigned long flags;
-
-        if ((op->flags & BCH_WRITE_submitted) &&
-            (op->flags & BCH_WRITE_move))
-                bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
-
-        spin_lock_irqsave(&wp->writes_lock, flags);
-        if (wp->state == WRITE_POINT_waiting_io)
-                __wp_update_state(wp, WRITE_POINT_waiting_work);
-        list_add_tail(&op->wp_list, &wp->writes);
-        spin_unlock_irqrestore(&wp->writes_lock, flags);
-
-        queue_work(wq, &wp->index_update_work);
-}
-
-static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
-{
-        op->wp = wp;
-
-        if (wp->state == WRITE_POINT_stopped) {
-                spin_lock_irq(&wp->writes_lock);
-                __wp_update_state(wp, WRITE_POINT_waiting_io);
-                spin_unlock_irq(&wp->writes_lock);
-        }
-}
-
-void bch2_write_point_do_index_updates(struct work_struct *work)
-{
-        struct write_point *wp =
-                container_of(work, struct write_point, index_update_work);
-        struct bch_write_op *op;
-
-        while (1) {
-                spin_lock_irq(&wp->writes_lock);
-                op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
-                wp_update_state(wp, op != NULL);
-                spin_unlock_irq(&wp->writes_lock);
-
-                if (!op)
-                        break;
-
-                op->flags |= BCH_WRITE_in_worker;
-
-                __bch2_write_index(op);
-
-                if (!(op->flags & BCH_WRITE_submitted))
-                        __bch2_write(op);
-                else
-                        bch2_write_done(&op->cl);
-        }
-}
-
-static void bch2_write_endio(struct bio *bio)
-{
-        struct closure *cl = bio->bi_private;
-        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-        struct bch_write_bio *wbio = to_wbio(bio);
-        struct bch_write_bio *parent = wbio->split ?
wbio->parent : NULL; - struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->have_ioref - ? bch2_dev_have_ref(c, wbio->dev) - : NULL; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, - wbio->submit_time, !bio->bi_status); - - if (unlikely(bio->bi_status)) { - if (ca) - bch_err_inum_offset_ratelimited(ca, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status)); - else - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status)); - set_bit(wbio->dev, op->failed.d); - op->flags |= BCH_WRITE_io_error; - } - - if (wbio->nocow) { - bch2_bucket_nocow_unlock(&c->nocow_locks, - POS(ca->dev_idx, wbio->nocow_bucket), - BUCKET_NOCOW_LOCK_UPDATE); - set_bit(wbio->dev, op->devs_need_flush->d); - } - - if (wbio->have_ioref) - enumerated_ref_put(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_io_write); - - if (wbio->bounce) - bch2_bio_free_pages_pool(c, bio); - - if (wbio->put_bio) - bio_put(bio); - - if (parent) - bio_endio(&parent->bio); - else - closure_put(cl); -} - -static void init_append_extent(struct bch_write_op *op, - struct write_point *wp, - struct bversion version, - struct bch_extent_crc_unpacked crc) -{ - struct bkey_i_extent *e; - - op->pos.offset += crc.uncompressed_size; - - e = bkey_extent_init(op->insert_keys.top); - e->k.p = op->pos; - e->k.size = crc.uncompressed_size; - e->k.bversion = version; - - if (crc.csum_type || - crc.compression_type || - crc.nonce) - bch2_extent_crc_append(&e->k_i, crc); - - bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, - op->flags & BCH_WRITE_cached); - - if (!(op->flags & BCH_WRITE_move)) - bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); - - bch2_keylist_push(&op->insert_keys); -} - -static struct bio *bch2_write_bio_alloc(struct bch_fs *c, - struct write_point *wp, - struct bio *src, - bool *page_alloc_failed, - void *buf) -{ - struct bch_write_bio *wbio; - struct bio *bio; - unsigned output_available = - min(wp->sectors_free << 9, src->bi_iter.bi_size); - unsigned pages = DIV_ROUND_UP(output_available + - (buf - ? 
((unsigned long) buf & (PAGE_SIZE - 1)) - : 0), PAGE_SIZE); - - pages = min(pages, BIO_MAX_VECS); - - bio = bio_alloc_bioset(NULL, pages, 0, - GFP_NOFS, &c->bio_write); - wbio = wbio_init(bio); - wbio->put_bio = true; - /* copy WRITE_SYNC flag */ - wbio->bio.bi_opf = src->bi_opf; - - if (buf) { - bch2_bio_map(bio, buf, output_available); - return bio; - } - - wbio->bounce = true; - - /* - * We can't use mempool for more than c->sb.encoded_extent_max - * worth of pages, but we'd like to allocate more if we can: - */ - bch2_bio_alloc_pages_pool(c, bio, - min_t(unsigned, output_available, - c->opts.encoded_extent_max)); - - if (bio->bi_iter.bi_size < output_available) - *page_alloc_failed = - bch2_bio_alloc_pages(bio, - output_available - - bio->bi_iter.bi_size, - GFP_NOFS) != 0; - - return bio; -} - -static int bch2_write_rechecksum(struct bch_fs *c, - struct bch_write_op *op, - unsigned new_csum_type) -{ - struct bio *bio = &op->wbio.bio; - struct bch_extent_crc_unpacked new_crc; - - /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ - - if (bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(new_csum_type)) - new_csum_type = op->crc.csum_type; - - int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, - NULL, &new_crc, - op->crc.offset, op->crc.live_size, - new_csum_type); - if (ret) - return ret; - - bio_advance(bio, op->crc.offset << 9); - bio->bi_iter.bi_size = op->crc.live_size << 9; - op->crc = new_crc; - return 0; -} - -static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) -{ - struct bch_fs *c = op->c; - struct bio *bio = &op->wbio.bio; - struct bch_csum csum; - int ret = 0; - - BUG_ON(bio_sectors(bio) != op->crc.compressed_size); - - /* Can we just write the entire extent as is? 
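 * (Illustrative paraphrase of the condition below, not in the original
 * comment: yes if the extent isn't partially overwritten, is within
 * encoded_extent_max, fits in the write point, and is already in the
 * compression format we want - or is incompressible.)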
*/ - if (op->crc.uncompressed_size == op->crc.live_size && - op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 && - op->crc.compressed_size <= wp->sectors_free && - (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || - op->incompressible)) { - if (!crc_is_compressed(op->crc) && - op->csum_type != op->crc.csum_type) { - ret = bch2_write_rechecksum(c, op, op->csum_type); - if (ret) - return ret; - } - - return 1; - } - - /* - * If the data is compressed and we couldn't write the entire extent as - * is, we have to decompress it: - */ - if (crc_is_compressed(op->crc)) { - /* Last point we can still verify checksum: */ - struct nonce nonce = extent_nonce(op->version, op->crc); - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - goto csum_err; - - if (bch2_csum_type_is_encryption(op->crc.csum_type)) { - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); - if (ret) - return ret; - - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - } - - ret = bch2_bio_uncompress_inplace(op, bio); - if (ret) - return ret; - } - - /* - * No longer have compressed data after this point - data might be - * encrypted: - */ - - /* - * If the data is checksummed and we're only writing a subset, - * rechecksum and adjust bio to point to currently live data: - */ - if (op->crc.live_size != op->crc.uncompressed_size || - op->crc.csum_type != op->csum_type) { - ret = bch2_write_rechecksum(c, op, op->csum_type); - if (ret) - return ret; - } - - /* - * If we want to compress the data, it has to be decrypted: - */ - if (bch2_csum_type_is_encryption(op->crc.csum_type) && - (op->compression_opt || op->crc.csum_type != op->csum_type)) { - struct nonce nonce = extent_nonce(op->version, op->crc); - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - goto csum_err; - - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); - if (ret) - return ret; - - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - } - - return 0; -csum_err: - bch2_write_op_error(op, op->pos.offset, - "error verifying existing checksum while moving existing data (memory corruption?)\n" - " expected %0llx:%0llx got %0llx:%0llx type %s", - op->crc.csum.hi, - op->crc.csum.lo, - csum.hi, - csum.lo, - op->crc.csum_type < BCH_CSUM_NR - ? 
__bch2_csum_types[op->crc.csum_type] - : "(unknown)"); - return bch_err_throw(c, data_write_csum); -} - -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - struct bio **_dst) -{ - struct bch_fs *c = op->c; - struct bio *src = &op->wbio.bio, *dst = src; - struct bvec_iter saved_iter; - void *ec_buf; - unsigned total_output = 0, total_input = 0; - bool bounce = false; - bool page_alloc_failed = false; - int ret, more = 0; - - if (op->incompressible) - op->compression_opt = 0; - - BUG_ON(!bio_sectors(src)); - - ec_buf = bch2_writepoint_ec_buf(c, wp); - - if (unlikely(op->flags & BCH_WRITE_data_encoded)) { - ret = bch2_write_prep_encoded_data(op, wp); - if (ret < 0) - goto err; - if (ret) { - if (ec_buf) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bio_copy_data(dst, src); - bounce = true; - } - init_append_extent(op, wp, op->version, op->crc); - goto do_write; - } - } - - if (ec_buf || - op->compression_opt || - (op->csum_type && - !(op->flags & BCH_WRITE_pages_stable)) || - (bch2_csum_type_is_encryption(op->csum_type) && - !(op->flags & BCH_WRITE_pages_owned))) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bounce = true; - } - -#ifdef CONFIG_BCACHEFS_DEBUG - unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); - if (!bounce && write_corrupt_ratio) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bounce = true; - } -#endif - saved_iter = dst->bi_iter; - - do { - struct bch_extent_crc_unpacked crc = { 0 }; - struct bversion version = op->version; - size_t dst_len = 0, src_len = 0; - - if (page_alloc_failed && - dst->bi_iter.bi_size < (wp->sectors_free << 9) && - dst->bi_iter.bi_size < c->opts.encoded_extent_max) - break; - - BUG_ON(op->compression_opt && - (op->flags & BCH_WRITE_data_encoded) && - bch2_csum_type_is_encryption(op->crc.csum_type)); - BUG_ON(op->compression_opt && !bounce); - - crc.compression_type = op->incompressible - ? BCH_COMPRESSION_TYPE_incompressible - : op->compression_opt - ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_opt) - : 0; - if (!crc_is_compressed(crc)) { - dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); - - if (op->csum_type) - dst_len = min_t(unsigned, dst_len, - c->opts.encoded_extent_max); - - if (bounce) { - swap(dst->bi_iter.bi_size, dst_len); - bio_copy_data(dst, src); - swap(dst->bi_iter.bi_size, dst_len); - } - - src_len = dst_len; - } - - BUG_ON(!src_len || !dst_len); - - if (bch2_csum_type_is_encryption(op->csum_type)) { - if (bversion_zero(version)) { - version.lo = atomic64_inc_return(&c->key_version); - } else { - crc.nonce = op->nonce; - op->nonce += src_len >> 9; - } - } - - if ((op->flags & BCH_WRITE_data_encoded) && - !crc_is_compressed(crc) && - bch2_csum_type_is_encryption(op->crc.csum_type) == - bch2_csum_type_is_encryption(op->csum_type)) { - u8 compression_type = crc.compression_type; - u16 nonce = crc.nonce; - /* - * Note: when we're using rechecksum(), we need to be - * checksumming @src because it has all the data our - * existing checksum covers - if we bounced (because we - * were trying to compress), @dst will only have the - * part of the data the new checksum will cover. - * - * But normally we want to be checksumming post bounce, - * because part of the reason for bouncing is so the - * data can't be modified (by userspace) while it's in - * flight. 
- */ - ret = bch2_rechecksum_bio(c, src, version, op->crc, - &crc, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), - op->csum_type); - if (ret) - goto err; - /* - * rchecksum_bio sets compression_type on crc from op->crc, - * this isn't always correct as sometimes we're changing - * an extent from uncompressed to incompressible. - */ - crc.compression_type = compression_type; - crc.nonce = nonce; - } else { - if ((op->flags & BCH_WRITE_data_encoded) && - (ret = bch2_rechecksum_bio(c, src, version, op->crc, - NULL, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), - op->crc.csum_type))) - goto err; - - crc.compressed_size = dst_len >> 9; - crc.uncompressed_size = src_len >> 9; - crc.live_size = src_len >> 9; - - swap(dst->bi_iter.bi_size, dst_len); - ret = bch2_encrypt_bio(c, op->csum_type, - extent_nonce(version, crc), dst); - if (ret) - goto err; - - crc.csum = bch2_checksum_bio(c, op->csum_type, - extent_nonce(version, crc), dst); - crc.csum_type = op->csum_type; - swap(dst->bi_iter.bi_size, dst_len); - } - - init_append_extent(op, wp, version, crc); - -#ifdef CONFIG_BCACHEFS_DEBUG - if (write_corrupt_ratio) { - swap(dst->bi_iter.bi_size, dst_len); - bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); - swap(dst->bi_iter.bi_size, dst_len); - } -#endif - - if (dst != src) - bio_advance(dst, dst_len); - bio_advance(src, src_len); - total_output += dst_len; - total_input += src_len; - } while (dst->bi_iter.bi_size && - src->bi_iter.bi_size && - wp->sectors_free && - !bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)); - - more = src->bi_iter.bi_size != 0; - - dst->bi_iter = saved_iter; - - if (dst == src && more) { - BUG_ON(total_output != total_input); - - dst = bio_split(src, total_input >> 9, - GFP_NOFS, &c->bio_write); - wbio_init(dst)->put_bio = true; - /* copy WRITE_SYNC flag */ - dst->bi_opf = src->bi_opf; - } - - dst->bi_iter.bi_size = total_output; -do_write: - *_dst = dst; - return more; -err: - if (to_wbio(dst)->bounce) - bch2_bio_free_pages_pool(c, dst); - if (to_wbio(dst)->put_bio) - bio_put(dst); - - return ret; -} - -static bool bch2_extent_is_writeable(struct bch_write_op *op, - struct bkey_s_c k) -{ - struct bch_fs *c = op->c; - struct bkey_s_c_extent e; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; - unsigned replicas = 0; - - if (k.k->type != KEY_TYPE_extent) - return false; - - e = bkey_s_c_to_extent(k); - - guard(rcu)(); - extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) - return false; - - replicas += bch2_extent_ptr_durability(c, &p); - } - - return replicas >= op->opts.data_replicas; -} - -static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *orig, - struct bkey_s_c k, - u64 new_i_size) -{ - if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { - /* trace this */ - return 0; - } - - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bch2_cut_front(bkey_start_pos(&orig->k), new); - bch2_cut_back(orig->k.p, new); - - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr(ptrs, ptr) - ptr->unwritten = 0; - - /* - * Note that we're not calling bch2_subvol_get_snapshot() in this path - - * that was done when we kicked off the write, and here it's important - * that we update the extent that we wrote to - even if a snapshot has - * since been created. 
The write is still outstanding, so we're ok - * w.r.t. snapshot atomicity: - */ - return bch2_extent_update_i_size_sectors(trans, iter, - min(new->k.p.offset << 9, new_i_size), 0) ?: - bch2_trans_update(trans, iter, new, - BTREE_UPDATE_internal_snapshot_node); -} - -static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - - for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, - bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); - })); - if (ret) - break; - } - - bch2_trans_put(trans); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - bch2_write_op_error(op, bkey_start_offset(&insert->k), - "btree update error: %s", bch2_err_str(ret)); - } - - if (ret) - op->error = ret; -} - -static void __bch2_nocow_write_done(struct bch_write_op *op) -{ - if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = bch_err_throw(op->c, data_write_io); - } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) - bch2_nocow_write_convert_unwritten(op); -} - -static CLOSURE_CALLBACK(bch2_nocow_write_done) -{ - closure_type(op, struct bch_write_op, cl); - - __bch2_nocow_write_done(op); - bch2_write_done(cl); -} - -struct bucket_to_lock { - struct bpos b; - unsigned gen; - struct nocow_lock_bucket *l; -}; - -static void bch2_nocow_write(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; - u32 snapshot; - struct bucket_to_lock *stale_at; - int stale, ret; - - if (op->flags & BCH_WRITE_move) - return; - - darray_init(&buckets); - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot); - if (unlikely(ret)) - goto err; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(op->pos.inode, op->pos.offset, snapshot), - BTREE_ITER_slots); - while (1) { - struct bio *bio = &op->wbio.bio; - - buckets.nr = 0; - - ret = bch2_trans_relock(trans); - if (ret) - break; - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - break; - - /* fall back to normal cow write path? 
*/ - if (unlikely(k.k->p.snapshot != snapshot || - !bch2_extent_is_writeable(op, k))) - break; - - if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - k.k->u64s)) - break; - - /* Get iorefs before dropping btree locks: */ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, - BCH_DEV_WRITE_REF_io_write); - if (unlikely(!ca)) - goto err_get_ioref; - - struct bpos b = PTR_BUCKET_POS(ca, ptr); - struct nocow_lock_bucket *l = - bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); - prefetch(l); - - /* XXX allocating memory with btree locks held - rare */ - darray_push_gfp(&buckets, ((struct bucket_to_lock) { - .b = b, .gen = ptr->gen, .l = l, - }), GFP_KERNEL|__GFP_NOFAIL); - - if (ptr->unwritten) - op->flags |= BCH_WRITE_convert_unwritten; - } - - /* Unlock before taking nocow locks, doing IO: */ - bkey_reassemble(op->insert_keys.top, k); - bch2_trans_unlock(trans); - - bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_convert_unwritten) - bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); - - darray_for_each(buckets, i) { - struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode); - - __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, - bucket_to_u64(i->b), - BUCKET_NOCOW_LOCK_UPDATE); - - int gen = bucket_gen_get(ca, i->b.offset); - stale = gen < 0 ? gen : gen_after(gen, i->gen); - if (unlikely(stale)) { - stale_at = i; - goto err_bucket_stale; - } - } - - bio = &op->wbio.bio; - if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { - bio = bio_split(bio, k.k->p.offset - op->pos.offset, - GFP_KERNEL, &c->bio_write); - wbio_init(bio)->put_bio = true; - bio->bi_opf = op->wbio.bio.bi_opf; - } else { - op->flags |= BCH_WRITE_submitted; - } - - op->pos.offset += bio_sectors(bio); - op->written += bio_sectors(bio); - - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - closure_get(&op->cl); - - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - op->insert_keys.top, true); - - bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_submitted) - break; - bch2_btree_iter_advance(trans, &iter); - } -out: - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - darray_exit(&buckets); - - if (ret) { - bch2_write_op_error(op, op->pos.offset, - "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); - op->error = ret; - op->flags |= BCH_WRITE_submitted; - } - - /* fallback to cow write path? 
*/ - if (!(op->flags & BCH_WRITE_submitted)) { - closure_sync(&op->cl); - __bch2_nocow_write_done(op); - op->insert_keys.top = op->insert_keys.keys; - } else if (op->flags & BCH_WRITE_sync) { - closure_sync(&op->cl); - bch2_nocow_write_done(&op->cl.work); - } else { - /* - * XXX - * needs to run out of process context because ei_quota_lock is - * a mutex - */ - continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); - } - return; -err_get_ioref: - darray_for_each(buckets, i) - enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE], - BCH_DEV_WRITE_REF_io_write); - - /* Fall back to COW path: */ - goto out; -err_bucket_stale: - darray_for_each(buckets, i) { - bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE); - if (i == stale_at) - break; - } - - struct printbuf buf = PRINTBUF; - if (bch2_fs_inconsistent_on(stale < 0, c, - "pointer to invalid bucket in nocow path on device %llu\n %s", - stale_at->b.inode, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch_err_throw(c, data_write_invalid_ptr); - } else { - /* We can retry this: */ - ret = bch_err_throw(c, transaction_restart); - } - printbuf_exit(&buf); - - goto err_get_ioref; -} - -static void __bch2_write(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct write_point *wp = NULL; - struct bio *bio = NULL; - unsigned nofs_flags; - int ret; - - nofs_flags = memalloc_nofs_save(); - - if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { - bch2_nocow_write(op); - if (op->flags & BCH_WRITE_submitted) - goto out_nofs_restore; - } -again: - memset(&op->failed, 0, sizeof(op->failed)); - - do { - struct bkey_i *key_to_write; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; - - /* +1 for possible cache device: */ - if (op->open_buckets.nr + op->nr_replicas + 1 > - ARRAY_SIZE(op->open_buckets.v)) - break; - - if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)) - break; - - /* - * The copygc thread is now global, which means it's no longer - * freeing up space on specific disks, which means that - * allocations for specific disks may hang arbitrarily long: - */ - ret = bch2_trans_run(c, lockrestart_do(trans, - bch2_alloc_sectors_start_trans(trans, - op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), - op->write_point, - &op->devs_have, - op->nr_replicas, - op->nr_replicas_required, - op->watermark, - op->flags, - &op->cl, &wp))); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - break; - - goto err; - } - - EBUG_ON(!wp); - - bch2_open_bucket_get(c, wp, &op->open_buckets); - ret = bch2_write_extent(op, wp, &bio); - - bch2_alloc_sectors_done_inlined(c, wp); -err: - if (ret <= 0) { - op->flags |= BCH_WRITE_submitted; - - if (unlikely(ret < 0)) { - if (!(op->flags & BCH_WRITE_alloc_nowait)) - bch2_write_op_error(op, op->pos.offset, - "%s(): %s", __func__, bch2_err_str(ret)); - op->error = ret; - break; - } - } - - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - - closure_get(bio->bi_private); - - key_to_write = (void *) (op->insert_keys.keys_p + - key_to_write_offset); - - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - key_to_write, false); - } while (ret); - - /* - * Sync or no? - * - * If we're running asynchronously, wne may still want to block - * synchronously here if we weren't able to submit all of the IO at - * once, as that signals backpressure to the caller. 
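
As a reading aid, the policy in this comment reduces to a small predicate, which is exactly the condition tested just below. A toy model with stand-in flag bits (the real ones are the BCH_WRITE_* enum bits from io_write_types.h):

enum {
	TOY_WRITE_sync		= 1 << 0,	/* caller wants synchronous IO */
	TOY_WRITE_submitted	= 1 << 1,	/* all IO was submitted */
	TOY_WRITE_in_worker	= 1 << 2,	/* already in process context */
};

/*
 * Block here if the write is synchronous, or if submission was
 * incomplete and we are not already in a worker: an incomplete
 * submission is how backpressure propagates to the caller.
 */
static int toy_must_block(unsigned flags)
{
	return (flags & TOY_WRITE_sync) ||
	       (!(flags & TOY_WRITE_submitted) &&
		!(flags & TOY_WRITE_in_worker));
}
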
- */ - if ((op->flags & BCH_WRITE_sync) || - (!(op->flags & BCH_WRITE_submitted) && - !(op->flags & BCH_WRITE_in_worker))) { - bch2_wait_on_allocator(c, &op->cl); - - __bch2_write_index(op); - - if (!(op->flags & BCH_WRITE_submitted)) - goto again; - bch2_write_done(&op->cl); - } else { - bch2_write_queue(op, wp); - continue_at(&op->cl, bch2_write_index, NULL); - } -out_nofs_restore: - memalloc_nofs_restore(nofs_flags); -} - -static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) -{ - struct bio *bio = &op->wbio.bio; - struct bvec_iter iter; - struct bkey_i_inline_data *id; - unsigned sectors; - int ret; - - memset(&op->failed, 0, sizeof(op->failed)); - - op->flags |= BCH_WRITE_wrote_data_inline; - op->flags |= BCH_WRITE_submitted; - - bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); - - ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_U64s + DIV_ROUND_UP(data_len, 8)); - if (ret) { - op->error = ret; - goto err; - } - - sectors = bio_sectors(bio); - op->pos.offset += sectors; - - id = bkey_inline_data_init(op->insert_keys.top); - id->k.p = op->pos; - id->k.bversion = op->version; - id->k.size = sectors; - - iter = bio->bi_iter; - iter.bi_size = data_len; - memcpy_from_bio(id->v.data, bio, iter); - - while (data_len & 7) - id->v.data[data_len++] = '\0'; - set_bkey_val_bytes(&id->k, data_len); - bch2_keylist_push(&op->insert_keys); - - __bch2_write_index(op); -err: - bch2_write_done(&op->cl); -} - -/** - * bch2_write() - handle a write to a cache device or flash only volume - * @cl: &bch_write_op->cl - * - * This is the starting point for any data to end up in a cache device; it could - * be from a normal write, or a writeback write, or a write to a flash only - * volume - it's also used by the moving garbage collector to compact data in - * mostly empty buckets. - * - * It first writes the data to the cache, creating a list of keys to be inserted - * (if the data won't fit in a single open bucket, there will be multiple keys); - * after the data is written it calls bch_journal, and after the keys have been - * added to the next journal write they're inserted into the btree. - * - * If op->discard is true, instead of inserting the data it invalidates the - * region of the cache represented by op->bio and op->inode. 
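
A rough sketch of how a caller might drive this entry point, based on the helpers declared in io_write.h. This is illustrative only, not compilable outside the tree; it assumes the surrounding context supplies c, opts, and a populated bio, and writepoint_hashed() is an assumption here rather than something defined in this file.

static void example_write(struct bch_fs *c, struct bch_io_opts opts,
			  struct bch_write_op *op, subvol_inum inum,
			  struct bpos pos, unsigned nr_replicas)
{
	bch2_write_op_init(op, c, opts);	/* defaults from io_write.h */
	op->nr_replicas	= nr_replicas;
	op->subvol	= inum.subvol;
	op->pos		= pos;
	op->write_point	= writepoint_hashed((unsigned long) current);

	/* op->wbio.bio must already carry the pages being written */
	closure_call(&op->cl, bch2_write, NULL, NULL);
}
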
- */ -CLOSURE_CALLBACK(bch2_write) -{ - closure_type(op, struct bch_write_op, cl); - struct bio *bio = &op->wbio.bio; - struct bch_fs *c = op->c; - unsigned data_len; - - EBUG_ON(op->cl.parent); - BUG_ON(!op->nr_replicas); - BUG_ON(!op->write_point.v); - BUG_ON(bkey_eq(op->pos, POS_MAX)); - - async_object_list_add(c, write_op, op, &op->list_idx); - - if (op->flags & BCH_WRITE_only_specified_devs) - op->flags |= BCH_WRITE_alloc_nowait; - - op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); - op->start_time = local_clock(); - bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(bio)->put_bio = false; - - if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { - bch2_write_op_error(op, op->pos.offset, "misaligned write"); - op->error = bch_err_throw(c, data_write_misaligned); - goto err; - } - - if (c->opts.nochanges) { - op->error = bch_err_throw(c, erofs_no_writes); - goto err; - } - - if (!(op->flags & BCH_WRITE_move) && - !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { - op->error = bch_err_throw(c, erofs_no_writes); - goto err; - } - - if (!(op->flags & BCH_WRITE_move)) - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); - bch2_increment_clock(c, bio_sectors(bio), WRITE); - - data_len = min_t(u64, bio->bi_iter.bi_size, - op->new_i_size - (op->pos.offset << 9)); - - if (c->opts.inline_data && - data_len <= min(block_bytes(c) / 2, 1024U)) { - bch2_write_data_inline(op, data_len); - return; - } - - __bch2_write(op); - return; -err: - bch2_disk_reservation_put(c, &op->res); - - closure_debug_destroy(&op->cl); - async_object_list_del(c, write_op, op->list_idx); - if (op->end_io) - op->end_io(op); -} - -static const char * const bch2_write_flags[] = { -#define x(f) #f, - BCH_WRITE_FLAGS() -#undef x - NULL -}; - -void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - prt_printf(out, "pos:\t"); - bch2_bpos_to_text(out, op->pos); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "started:\t"); - bch2_pr_time_units(out, local_clock() - op->start_time); - prt_newline(out); - - prt_printf(out, "flags:\t"); - prt_bitflags(out, bch2_write_flags, op->flags); - prt_newline(out); - - prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); - prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); - - prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); - prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); - - printbuf_indent_sub(out, 2); -} - -void bch2_fs_io_write_exit(struct bch_fs *c) -{ - bioset_exit(&c->replica_set); - bioset_exit(&c->bio_write); -} - -int bch2_fs_io_write_init(struct bch_fs *c) -{ - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || - bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) - return bch_err_throw(c, ENOMEM_bio_write_init); - - return 0; -} diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h deleted file mode 100644 index 2c0a8f35ee1feb..00000000000000 --- a/fs/bcachefs/io_write.h +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_WRITE_H -#define _BCACHEFS_IO_WRITE_H - -#include "checksum.h" -#include "io_write_types.h" - -#define to_wbio(_bio) \ - container_of((_bio), struct bch_write_bio, bio) - -void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); - -void 
bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *, bool); - -__printf(3, 4) -void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); - -static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -{ - return op->watermark == BCH_WATERMARK_copygc - ? op->c->copygc_wq - : op->c->btree_update_wq; -} - -int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, subvol_inum, - struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64, s64 *, bool); - -static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_io_opts opts) -{ - op->c = c; - op->end_io = NULL; - op->flags = 0; - op->written = 0; - op->error = 0; - op->csum_type = bch2_data_checksum_type(c, opts); - op->compression_opt = opts.compression; - op->nr_replicas = 0; - op->nr_replicas_required = c->opts.data_replicas_required; - op->watermark = BCH_WATERMARK_normal; - op->incompressible = 0; - op->open_buckets.nr = 0; - op->devs_have.nr = 0; - op->target = 0; - op->opts = opts; - op->subvol = 0; - op->pos = POS_MAX; - op->version = ZERO_VERSION; - op->write_point = (struct write_point_specifier) { 0 }; - op->res = (struct disk_reservation) { 0 }; - op->new_i_size = U64_MAX; - op->i_sectors_delta = 0; - op->devs_need_flush = NULL; -} - -CLOSURE_CALLBACK(bch2_write); -void bch2_write_point_do_index_updates(struct work_struct *); - -static inline struct bch_write_bio *wbio_init(struct bio *bio) -{ - struct bch_write_bio *wbio = to_wbio(bio); - - memset(&wbio->wbio, 0, sizeof(wbio->wbio)); - return wbio; -} - -void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); - -void bch2_fs_io_write_exit(struct bch_fs *); -int bch2_fs_io_write_init(struct bch_fs *); - -#endif /* _BCACHEFS_IO_WRITE_H */ diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h deleted file mode 100644 index 5da4eb8bb6f6d8..00000000000000 --- a/fs/bcachefs/io_write_types.h +++ /dev/null @@ -1,129 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_WRITE_TYPES_H -#define _BCACHEFS_IO_WRITE_TYPES_H - -#include "alloc_types.h" -#include "btree_types.h" -#include "buckets_types.h" -#include "extents_types.h" -#include "keylist_types.h" -#include "opts.h" -#include "super_types.h" - -#include -#include - -#define BCH_WRITE_FLAGS() \ - x(alloc_nowait) \ - x(cached) \ - x(data_encoded) \ - x(pages_stable) \ - x(pages_owned) \ - x(only_specified_devs) \ - x(wrote_data_inline) \ - x(check_enospc) \ - x(sync) \ - x(move) \ - x(in_worker) \ - x(submitted) \ - x(io_error) \ - x(convert_unwritten) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; - -struct bch_write_bio { - struct_group(wbio, - struct bch_fs *c; - struct bch_write_bio *parent; - - u64 submit_time; - u64 inode_offset; - u64 nocow_bucket; - - struct bch_devs_list failed; - u8 dev; - - unsigned split:1, - bounce:1, - put_bio:1, - have_ioref:1, - nocow:1, - used_mempool:1, - first_btree_write:1; - ); - - struct bio bio; -}; - -struct bch_write_op { - struct closure cl; - struct bch_fs *c; - void (*end_io)(struct bch_write_op *); - u64 start_time; - -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - - unsigned written; /* sectors */ - u16 flags; - s16 
error; /* dio write path expects it to hold -ERESTARTSYS... */ - - unsigned compression_opt:8; - unsigned csum_type:4; - unsigned nr_replicas:4; - unsigned nr_replicas_required:4; - unsigned watermark:3; - unsigned incompressible:1; - unsigned stripe_waited:1; - - struct bch_devs_list devs_have; - u16 target; - u16 nonce; - struct bch_io_opts opts; - - u32 subvol; - struct bpos pos; - struct bversion version; - - /* For BCH_WRITE_data_encoded: */ - struct bch_extent_crc_unpacked crc; - - struct write_point_specifier write_point; - - struct write_point *wp; - struct list_head wp_list; - - struct disk_reservation res; - - struct open_buckets open_buckets; - - u64 new_i_size; - s64 i_sectors_delta; - - struct bch_devs_mask failed; - - struct keylist insert_keys; - u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; - - /* - * Bitmask of devices that have had nocow writes issued to them since - * last flush: - */ - struct bch_devs_mask *devs_need_flush; - - /* Must be last: */ - struct bch_write_bio wbio; -}; - -#endif /* _BCACHEFS_IO_WRITE_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c deleted file mode 100644 index ddfeb0dafc9d84..00000000000000 --- a/fs/bcachefs/journal.c +++ /dev/null @@ -1,1832 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bcachefs journalling code, for btree insertions - * - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_methods.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "enumerated_ref.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "journal_sb.h" -#include "journal_seq_blacklist.h" -#include "trace.h" - -static inline bool journal_seq_unwritten(struct journal *j, u64 seq) -{ - return seq > j->seq_ondisk; -} - -static bool __journal_entry_is_open(union journal_res_state state) -{ - return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -} - -static inline unsigned nr_unwritten_journal_entries(struct journal *j) -{ - return atomic64_read(&j->seq) - j->seq_ondisk; -} - -static bool journal_entry_is_open(struct journal *j) -{ - return __journal_entry_is_open(j->reservations); -} - -static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) -{ - union journal_res_state s = READ_ONCE(j->reservations); - unsigned i = seq & JOURNAL_BUF_MASK; - struct journal_buf *buf = j->buf + i; - - prt_printf(out, "seq:\t%llu\n", seq); - printbuf_indent_add(out, 2); - - if (!buf->write_started) - prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); - - struct closure *cl = &buf->io; - int r = atomic_read(&cl->remaining); - prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); - - if (buf->data) { - prt_printf(out, "size:\t"); - prt_human_readable_u64(out, vstruct_bytes(buf->data)); - prt_newline(out); - } - - prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies); - - prt_printf(out, "flags:\t"); - if (buf->noflush) - prt_str(out, "noflush "); - if (buf->must_flush) - prt_str(out, "must_flush "); - if (buf->separate_flush) - prt_str(out, "separate_flush "); - if (buf->need_flush_to_write_buffer) - prt_str(out, "need_flush_to_write_buffer "); - if (buf->write_started) - prt_str(out, "write_started "); - if (buf->write_allocated) - prt_str(out, "write_allocated "); - if (buf->write_done) - prt_str(out, "write_done"); - prt_newline(out); - - printbuf_indent_sub(out, 2); -} - -static 
void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) -{ - lockdep_assert_held(&j->lock); - out->atomic++; - - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 24); - - for (u64 seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) - bch2_journal_buf_to_text(out, j, seq); - prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); - - --out->atomic; -} - -static inline struct journal_buf * -journal_seq_to_buf(struct journal *j, u64 seq) -{ - struct journal_buf *buf = NULL; - - EBUG_ON(seq > journal_cur_seq(j)); - - if (journal_seq_unwritten(j, seq)) - buf = j->buf + (seq & JOURNAL_BUF_MASK); - return buf; -} - -static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) -{ - for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) - INIT_LIST_HEAD(&p->unflushed[i]); - for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) - INIT_LIST_HEAD(&p->flushed[i]); - atomic_set(&p->count, count); - p->devs.nr = 0; -} - -/* - * Detect stuck journal conditions and trigger shutdown. Technically the journal - * can end up stuck for a variety of reasons, such as a blocked I/O, journal - * reservation lockup, etc. Since this is a fatal error with potentially - * unpredictable characteristics, we want to be fairly conservative before we - * decide to shut things down. - * - * Consider the journal stuck when it appears full with no ability to commit - * btree transactions, to discard journal buckets, nor acquire priority - * (reserved watermark) reservation. - */ -static inline bool -journal_error_check_stuck(struct journal *j, int error, unsigned flags) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool stuck = false; - struct printbuf buf = PRINTBUF; - - buf.atomic++; - - if (!(error == -BCH_ERR_journal_full || - error == -BCH_ERR_journal_pin_full) || - nr_unwritten_journal_entries(j) || - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) - return stuck; - - spin_lock(&j->lock); - - if (j->can_discard) { - spin_unlock(&j->lock); - return stuck; - } - - stuck = true; - - /* - * The journal shutdown path will set ->err_seq, but do it here first to - * serialize against concurrent failures and avoid duplicate error - * reports. - */ - if (j->err_seq) { - spin_unlock(&j->lock); - return stuck; - } - j->err_seq = journal_cur_seq(j); - - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); - prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"), - bch2_err_str(error)); - bch2_print_str(c, KERN_ERR, buf.buf); - - printbuf_reset(&buf); - bch2_journal_pins_to_text(&buf, j); - bch_err(c, "Journal pins:\n%s", buf.buf); - printbuf_exit(&buf); - - bch2_fatal_error(c); - dump_stack(); - - return stuck; -} - -void bch2_journal_do_writes(struct journal *j) -{ - for (u64 seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) { - unsigned idx = seq & JOURNAL_BUF_MASK; - struct journal_buf *w = j->buf + idx; - - if (w->write_started && !w->write_allocated) - break; - if (w->write_started) - continue; - - if (!journal_state_seq_count(j, j->reservations, seq)) { - j->seq_write_started = seq; - w->write_started = true; - closure_call(&w->io, bch2_journal_write, j->wq, NULL); - } - - break; - } -} - -/* - * Final processing when the last reference of a journal buffer has been - * dropped. Drop the pin list reference acquired at journal entry open and write - * the buffer, if requested. 
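
The ownership rule described here, where whoever drops the last reference performs the final processing, is the usual atomic refcount idiom. The real journal packs the count into journal_res_state and takes j->lock, but the shape is the same; a self-contained toy model with hypothetical names:

#include <stdatomic.h>

struct toy_buf {
	atomic_uint refs;
};

/*
 * fetch_sub returns the previous value: seeing 1 means we just
 * dropped the last reference and must do the final processing
 * (here: drop the pin and kick off the write).
 */
static void toy_buf_put(struct toy_buf *buf, void (*final)(struct toy_buf *))
{
	if (atomic_fetch_sub_explicit(&buf->refs, 1,
				      memory_order_acq_rel) == 1)
		final(buf);
}
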
- */ -void bch2_journal_buf_put_final(struct journal *j, u64 seq) -{ - lockdep_assert_held(&j->lock); - - if (__bch2_journal_pin_put(j, seq)) - bch2_journal_reclaim_fast(j); - bch2_journal_do_writes(j); - - /* - * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an - * open journal entry - */ - wake_up(&j->wait); -} - -/* - * Returns true if journal entry is now closed: - * - * We don't close a journal_buf until the next journal_buf is finished writing, - * and can be opened again - this also initializes the next journal_buf: - */ -static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf = journal_cur_buf(j); - union journal_res_state old, new; - unsigned sectors; - - BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && - closed_val != JOURNAL_ENTRY_ERROR_VAL); - - lockdep_assert_held(&j->lock); - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - new.cur_entry_offset = closed_val; - - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || - old.cur_entry_offset == new.cur_entry_offset) - return; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - - if (!__journal_entry_is_open(old)) - return; - - if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) - old.cur_entry_offset = j->cur_entry_offset_if_blocked; - - /* Close out old buffer: */ - buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - - if (trace_journal_entry_close_enabled() && trace) { - struct printbuf pbuf = PRINTBUF; - pbuf.atomic++; - - prt_str(&pbuf, "entry size: "); - prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); - prt_newline(&pbuf); - bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT); - trace_journal_entry_close(c, pbuf.buf); - printbuf_exit(&pbuf); - } - - sectors = vstruct_blocks_plus(buf->data, c->block_bits, - buf->u64s_reserved) << c->block_bits; - if (unlikely(sectors > buf->sectors)) { - struct printbuf err = PRINTBUF; - err.atomic++; - - prt_printf(&err, "journal entry overran reserved space: %u > %u\n", - sectors, buf->sectors); - prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n", - le32_to_cpu(buf->data->u64s), buf->u64s_reserved, - j->cur_entry_u64s, - c->block_bits); - prt_printf(&err, "fatal error - emergency read only"); - bch2_journal_halt_locked(j); - - bch_err(c, "%s", err.buf); - printbuf_exit(&err); - return; - } - - buf->sectors = sectors; - - /* - * We have to set last_seq here, _before_ opening a new journal entry: - * - * A threads may replace an old pin with a new pin on their current - * journal reservation - the expectation being that the journal will - * contain either what the old pin protected or what the new pin - * protects. - * - * After the old pin is dropped journal_last_seq() won't include the old - * pin, so we can only write the updated last_seq on the entry that - * contains whatever the new pin protects. - * - * Restated, we can _not_ update last_seq for a given entry if there - * could be a newer entry open with reservations/pins that have been - * taken against it. 
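
Stepping back to the lockless close at the top of __journal_entry_close(): the state transition there is a classic 64-bit cmpxchg loop. A compilable toy model (names hypothetical; the real union journal_res_state carries more fields than this):

#include <stdatomic.h>
#include <stdint.h>

#define TOY_CLOSED_VAL	0xffffffffu

union toy_res_state {
	uint64_t v;
	struct {
		uint32_t cur_entry_offset;
		uint32_t idx;
	};
};

/* Swing cur_entry_offset to CLOSED; returns the offset we closed at. */
static uint32_t toy_entry_close(_Atomic uint64_t *state)
{
	union toy_res_state old, new;

	old.v = atomic_load(state);
	do {
		if (old.cur_entry_offset == TOY_CLOSED_VAL)
			break;			/* already closed */
		new.v = old.v;
		new.cur_entry_offset = TOY_CLOSED_VAL;
	} while (!atomic_compare_exchange_weak(state, &old.v, new.v));

	return old.cur_entry_offset;
}
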
- * - * Hence, we want update/set last_seq on the current journal entry right - * before we open a new one: - */ - buf->last_seq = journal_last_seq(j); - buf->data->last_seq = cpu_to_le64(buf->last_seq); - BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); - - cancel_delayed_work(&j->write_work); - - bch2_journal_space_available(j); - - __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); -} - -void bch2_journal_halt_locked(struct journal *j) -{ - lockdep_assert_held(&j->lock); - - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); - if (!j->err_seq) - j->err_seq = journal_cur_seq(j); - journal_wake(j); -} - -void bch2_journal_halt(struct journal *j) -{ - spin_lock(&j->lock); - bch2_journal_halt_locked(j); - spin_unlock(&j->lock); -} - -static bool journal_entry_want_write(struct journal *j) -{ - bool ret = !journal_entry_is_open(j) || - journal_cur_seq(j) == journal_last_unwritten_seq(j); - - /* Don't close it yet if we already have a write in flight: */ - if (ret) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - else if (nr_unwritten_journal_entries(j)) { - struct journal_buf *buf = journal_cur_buf(j); - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } - } - - return ret; -} - -bool bch2_journal_entry_close(struct journal *j) -{ - bool ret; - - spin_lock(&j->lock); - ret = journal_entry_want_write(j); - spin_unlock(&j->lock); - - return ret; -} - -/* - * should _only_ called from journal_res_get() - when we actually want a - * journal reservation - journal entry is open means journal is dirty: - */ -static int journal_entry_open(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf = j->buf + - ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); - union journal_res_state old, new; - int u64s; - - lockdep_assert_held(&j->lock); - BUG_ON(journal_entry_is_open(j)); - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - - if (j->blocked) - return bch_err_throw(c, journal_blocked); - - if (j->cur_entry_error) - return j->cur_entry_error; - - int ret = bch2_journal_error(j); - if (unlikely(ret)) - return ret; - - if (!fifo_free(&j->pin)) - return bch_err_throw(c, journal_pin_full); - - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return bch_err_throw(c, journal_max_in_flight); - - if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) - return bch_err_throw(c, journal_max_open); - - if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { - bch_err(c, "cannot start: journal seq overflow"); - if (bch2_fs_emergency_read_only_locked(c)) - bch_err(c, "fatal error - emergency read only"); - return bch_err_throw(c, journal_shutdown); - } - - if (!j->free_buf && !buf->data) - return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */ - - BUG_ON(!j->cur_entry_sectors); - - if (!buf->data) { - swap(buf->data, j->free_buf); - swap(buf->buf_size, j->free_buf_size); - } - - buf->expires = - (journal_cur_seq(j) == j->flushed_seq_ondisk - ? 
jiffies - : j->last_flush_write) + - msecs_to_jiffies(c->opts.journal_flush_delay); - - buf->u64s_reserved = j->entry_u64s_reserved; - buf->disk_sectors = j->cur_entry_sectors; - buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); - - u64s = (int) (buf->sectors << 9) / sizeof(u64) - - journal_entry_overhead(j); - u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); - - if (u64s <= (ssize_t) j->early_journal_entries.nr) - return bch_err_throw(c, journal_full); - - if (fifo_empty(&j->pin) && j->reclaim_thread) - wake_up_process(j->reclaim_thread); - - /* - * The fifo_push() needs to happen at the same time as j->seq is - * incremented for journal_last_seq() to be calculated correctly - */ - atomic64_inc(&j->seq); - journal_pin_list_init(fifo_push_ref(&j->pin), 1); - - if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) { - bch_err(c, "attempting to open blacklisted journal seq %llu", - journal_cur_seq(j)); - if (bch2_fs_emergency_read_only_locked(c)) - bch_err(c, "fatal error - emergency read only"); - return bch_err_throw(c, journal_shutdown); - } - - BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); - - BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); - - bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - buf->flush_time = 0; - buf->need_flush_to_write_buffer = true; - buf->write_started = false; - buf->write_allocated = false; - buf->write_done = false; - - memset(buf->data, 0, sizeof(*buf->data)); - buf->data->seq = cpu_to_le64(journal_cur_seq(j)); - buf->data->u64s = 0; - - if (j->early_journal_entries.nr) { - memcpy(buf->data->_data, j->early_journal_entries.data, - j->early_journal_entries.nr * sizeof(u64)); - le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr); - } - - /* - * Must be set before marking the journal entry as open: - */ - j->cur_entry_u64s = u64s; - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - - BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); - - new.idx++; - BUG_ON(journal_state_count(new, new.idx)); - BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); - - journal_state_inc(&new); - - /* Handle any already added entries */ - new.cur_entry_offset = le32_to_cpu(buf->data->u64s); - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - - if (nr_unwritten_journal_entries(j) == 1) - mod_delayed_work(j->wq, - &j->write_work, - msecs_to_jiffies(c->opts.journal_flush_delay)); - journal_wake(j); - - if (j->early_journal_entries.nr) - darray_exit(&j->early_journal_entries); - return 0; -} - -static bool journal_quiesced(struct journal *j) -{ - bool ret = atomic64_read(&j->seq) == j->seq_ondisk; - - if (!ret) - bch2_journal_entry_close(j); - return ret; -} - -static void journal_quiesce(struct journal *j) -{ - wait_event(j->wait, journal_quiesced(j)); -} - -static void journal_write_work(struct work_struct *work) -{ - struct journal *j = container_of(work, struct journal, write_work.work); - - spin_lock(&j->lock); - if (__journal_entry_is_open(j->reservations)) { - long delta = journal_cur_buf(j)->expires - jiffies; - - if (delta > 0) - mod_delayed_work(j->wq, &j->write_work, delta); - else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - } - spin_unlock(&j->lock); -} - -static void journal_buf_prealloc(struct journal *j) -{ - if (j->free_buf && - j->free_buf_size >= j->buf_size_want) - return; - - unsigned buf_size = j->buf_size_want; - - spin_unlock(&j->lock); - 
void *buf = kvmalloc(buf_size, GFP_NOFS); - spin_lock(&j->lock); - - if (buf && - (!j->free_buf || - buf_size > j->free_buf_size)) { - swap(buf, j->free_buf); - swap(buf_size, j->free_buf_size); - } - - if (unlikely(buf)) { - spin_unlock(&j->lock); - /* kvfree can sleep */ - kvfree(buf); - spin_lock(&j->lock); - } -} - -static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned flags) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; - bool can_discard; - int ret; -retry: - if (journal_res_get_fast(j, res, flags)) - return 0; - - ret = bch2_journal_error(j); - if (unlikely(ret)) - return ret; - - if (j->blocked) - return bch_err_throw(c, journal_blocked); - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = bch_err_throw(c, journal_full); - can_discard = j->can_discard; - goto out; - } - - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = bch_err_throw(c, journal_max_in_flight); - goto out; - } - - spin_lock(&j->lock); - - journal_buf_prealloc(j); - - /* - * Recheck after taking the lock, so we don't race with another thread - * that just did journal_entry_open() and call bch2_journal_entry_close() - * unnecessarily - */ - if (journal_res_get_fast(j, res, flags)) { - ret = 0; - goto unlock; - } - - /* - * If we couldn't get a reservation because the current buf filled up, - * and we had room for a bigger entry on disk, signal that we want to - * realloc the journal bufs: - */ - buf = journal_cur_buf(j); - if (journal_entry_is_open(j) && - buf->buf_size >> 9 < buf->disk_sectors && - buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) - j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open; -unlock: - can_discard = j->can_discard; - spin_unlock(&j->lock); -out: - if (likely(!ret)) - return 0; - if (ret == -BCH_ERR_journal_retry_open) - goto retry; - - if (journal_error_check_stuck(j, ret, flags)) - ret = bch_err_throw(c, journal_stuck); - - if (ret == -BCH_ERR_journal_max_in_flight && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && - trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_printbuf_make_room(&buf, 4096); - - spin_lock(&j->lock); - prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); - bch2_journal_bufs_to_text(&buf, j); - spin_unlock(&j->lock); - - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - count_event(c, journal_entry_full); - } - - if (ret == -BCH_ERR_journal_max_open && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && - trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_printbuf_make_room(&buf, 4096); - - spin_lock(&j->lock); - prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); - bch2_journal_bufs_to_text(&buf, j); - spin_unlock(&j->lock); - - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - count_event(c, journal_entry_full); - } - - /* - * Journal is full - can't rely on reclaim from work item due to - * freezing: - */ - if ((ret == -BCH_ERR_journal_full || - ret == -BCH_ERR_journal_pin_full) && - !(flags & JOURNAL_RES_GET_NONBLOCK)) { - if (can_discard) { - bch2_journal_do_discards(j); - goto retry; - } - - if (mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } - } - - return ret; -} - -static unsigned max_dev_latency(struct bch_fs *c) 
-{ - u64 nsecs = 0; - - guard(rcu)(); - for_each_rw_member_rcu(c, ca) - nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); - - return nsecs_to_jiffies(nsecs); -} - -/* - * Essentially the entry function to the journaling code. When bcachefs is doing - * a btree insert, it calls this function to get the current journal write. - * Journal write is the structure used set up journal writes. The calling - * function will then add its keys to the structure, queuing them for the next - * write. - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. - */ -int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned flags, - struct btree_trans *trans) -{ - int ret; - - if (closure_wait_event_timeout(&j->async_wait, - !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK), - HZ)) - return ret; - - if (trans) - bch2_trans_unlock_long(trans); - - struct bch_fs *c = container_of(j, struct bch_fs, journal); - int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); - - remaining_wait = max(0, remaining_wait - HZ); - - if (closure_wait_event_timeout(&j->async_wait, - !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK), - remaining_wait)) - return ret; - - struct printbuf buf = PRINTBUF; - bch2_journal_debug_to_text(&buf, j); - bch2_print_str(c, KERN_ERR, buf.buf); - prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret)); - printbuf_exit(&buf); - - closure_wait_event(&j->async_wait, - !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK)); - return ret; -} - -/* journal_entry_res: */ - -void bch2_journal_entry_res_resize(struct journal *j, - struct journal_entry_res *res, - unsigned new_u64s) -{ - union journal_res_state state; - int d = new_u64s - res->u64s; - - spin_lock(&j->lock); - - j->entry_u64s_reserved += d; - if (d <= 0) - goto out; - - j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); - state = READ_ONCE(j->reservations); - - if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && - state.cur_entry_offset > j->cur_entry_u64s) { - j->cur_entry_u64s += d; - /* - * Not enough room in current journal entry, have to flush it: - */ - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - } else { - journal_cur_buf(j)->u64s_reserved += d; - } -out: - spin_unlock(&j->lock); - res->u64s += d; -} - -/* journal flushing: */ - -/** - * bch2_journal_flush_seq_async - wait for a journal entry to be written - * @j: journal object - * @seq: seq to flush - * @parent: closure object to wait with - * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, - * -BCH_ERR_journal_flush_err if @seq will never be flushed - * - * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if - * necessary - */ -int bch2_journal_flush_seq_async(struct journal *j, u64 seq, - struct closure *parent) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; - int ret = 0; - - if (seq <= j->flushed_seq_ondisk) - return 1; - - spin_lock(&j->lock); - - if (WARN_ONCE(seq > journal_cur_seq(j), - "requested to flush journal seq %llu, but currently at %llu", - seq, journal_cur_seq(j))) - goto out; - - /* Recheck under lock: */ - if (j->err_seq && seq >= j->err_seq) { - ret = bch_err_throw(c, journal_flush_err); - 
goto out; - } - - if (seq <= j->flushed_seq_ondisk) { - ret = 1; - goto out; - } - - /* if seq was written, but not flushed - flush a newer one instead */ - seq = max(seq, journal_last_unwritten_seq(j)); - -recheck_need_open: - if (seq > journal_cur_seq(j)) { - struct journal_res res = { 0 }; - - if (journal_entry_is_open(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - - spin_unlock(&j->lock); - - /* - * We're called from bch2_journal_flush_seq() -> wait_event(); - * but this might block. We won't usually block, so we won't - * livelock: - */ - sched_annotate_sleep(); - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); - if (ret) - return ret; - - seq = res.seq; - buf = journal_seq_to_buf(j, seq); - buf->must_flush = true; - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); - - bch2_journal_res_put(j, &res); - - spin_lock(&j->lock); - goto want_write; - } - - /* - * if write was kicked off without a flush, or if we promised it - * wouldn't be a flush, flush the next sequence number instead - */ - buf = journal_seq_to_buf(j, seq); - if (buf->noflush) { - seq++; - goto recheck_need_open; - } - - buf->must_flush = true; - j->flushing_seq = max(j->flushing_seq, seq); - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); -want_write: - if (seq == journal_cur_seq(j)) - journal_entry_want_write(j); -out: - spin_unlock(&j->lock); - return ret; -} - -int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state) -{ - u64 start_time = local_clock(); - int ret, ret2; - - /* - * Don't update time_stats when @seq is already flushed: - */ - if (seq <= j->flushed_seq_ondisk) - return 0; - - ret = wait_event_state(j->wait, - (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)), - task_state); - - if (!ret) - bch2_time_stats_update(j->flush_seq_time, start_time); - - return ret ?: ret2 < 0 ? 
ret2 : 0; -} - -/* - * bch2_journal_flush_async - if there is an open journal entry, or a journal - * still being written, write it and wait for the write to complete - */ -void bch2_journal_flush_async(struct journal *j, struct closure *parent) -{ - bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); -} - -int bch2_journal_flush(struct journal *j) -{ - return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE); -} - -/* - * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the - * range [start, end) - * @seq - */ -bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - u64 unwritten_seq; - bool ret = false; - - if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) - return false; - - if (c->journal.flushed_seq_ondisk >= start) - return false; - - spin_lock(&j->lock); - if (c->journal.flushed_seq_ondisk >= start) - goto out; - - for (unwritten_seq = journal_last_unwritten_seq(j); - unwritten_seq < end; - unwritten_seq++) { - struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); - - /* journal flush already in flight, or flush requseted */ - if (buf->must_flush) - goto out; - - buf->noflush = true; - } - - ret = true; -out: - spin_unlock(&j->lock); - return ret; -} - -static int __bch2_journal_meta(struct journal *j) -{ - struct journal_res res = {}; - int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); - if (ret) - return ret; - - struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK); - buf->must_flush = true; - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } - - bch2_journal_res_put(j, &res); - - return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); -} - -int bch2_journal_meta(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) - return bch_err_throw(c, erofs_no_writes); - - int ret = __bch2_journal_meta(j); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); - return ret; -} - -/* block/unlock the journal: */ - -void bch2_journal_unblock(struct journal *j) -{ - spin_lock(&j->lock); - if (!--j->blocked && - j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && - j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { - union journal_res_state old, new; - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - new.cur_entry_offset = j->cur_entry_offset_if_blocked; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - } - spin_unlock(&j->lock); - - journal_wake(j); -} - -static void __bch2_journal_block(struct journal *j) -{ - if (!j->blocked++) { - union journal_res_state old, new; - - old.v = atomic64_read(&j->reservations.counter); - do { - j->cur_entry_offset_if_blocked = old.cur_entry_offset; - - if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) - break; - - new.v = old.v; - new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - - if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) - journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); - } -} - -void bch2_journal_block(struct journal *j) -{ - spin_lock(&j->lock); - __bch2_journal_block(j); - spin_unlock(&j->lock); - - journal_quiesce(j); -} - -static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, - u64 
max_seq, bool *blocked) -{ - struct journal_buf *ret = NULL; - - /* We're inside wait_event(), but using mutex_lock(: */ - sched_annotate_sleep(); - mutex_lock(&j->buf_lock); - spin_lock(&j->lock); - max_seq = min(max_seq, journal_cur_seq(j)); - - for (u64 seq = journal_last_unwritten_seq(j); - seq <= max_seq; - seq++) { - unsigned idx = seq & JOURNAL_BUF_MASK; - struct journal_buf *buf = j->buf + idx; - - if (buf->need_flush_to_write_buffer) { - union journal_res_state s; - s.v = atomic64_read_acquire(&j->reservations.counter); - - unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); - - if (open && !*blocked) { - __bch2_journal_block(j); - s.v = atomic64_read_acquire(&j->reservations.counter); - *blocked = true; - } - - ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open - ? ERR_PTR(-EAGAIN) - : buf; - break; - } - } - - spin_unlock(&j->lock); - if (IS_ERR_OR_NULL(ret)) - mutex_unlock(&j->buf_lock); - return ret; -} - -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, - u64 max_seq, bool *blocked) -{ - struct journal_buf *ret; - *blocked = false; - - wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, - max_seq, blocked)) != ERR_PTR(-EAGAIN)); - if (IS_ERR_OR_NULL(ret) && *blocked) - bch2_journal_unblock(j); - - return ret; -} - -/* allocate journal on a device: */ - -static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, - bool new_fs, struct closure *cl) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - u64 *new_bucket_seq = NULL, *new_buckets = NULL; - struct open_bucket **ob = NULL; - long *bu = NULL; - unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr; - int ret = 0; - - BUG_ON(nr <= ja->nr); - - bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); - ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); - new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); - new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); - if (!bu || !ob || !new_buckets || !new_bucket_seq) { - ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); - goto err_free; - } - - for (nr_got = 0; nr_got < nr_want; nr_got++) { - enum bch_watermark watermark = new_fs - ? 
BCH_WATERMARK_btree - : BCH_WATERMARK_normal; - - ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, - BCH_DATA_journal, cl); - ret = PTR_ERR_OR_ZERO(ob[nr_got]); - if (ret) - break; - - if (!new_fs) { - ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(trans, ca, - ob[nr_got]->bucket, BCH_DATA_journal, - ca->mi.bucket_size, BTREE_TRIGGER_transactional)); - if (ret) { - bch2_open_bucket_put(c, ob[nr_got]); - bch_err_msg(c, ret, "marking new journal buckets"); - break; - } - } - - bu[nr_got] = ob[nr_got]->bucket; - } - - if (!nr_got) - goto err_free; - - /* Don't return an error if we successfully allocated some buckets: */ - ret = 0; - - if (c) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_block(&c->journal); - mutex_lock(&c->sb_lock); - } - - memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); - memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); - - BUG_ON(ja->discard_idx > ja->nr); - - pos = ja->discard_idx ?: ja->nr; - - memmove(new_buckets + pos + nr_got, - new_buckets + pos, - sizeof(new_buckets[0]) * (ja->nr - pos)); - memmove(new_bucket_seq + pos + nr_got, - new_bucket_seq + pos, - sizeof(new_bucket_seq[0]) * (ja->nr - pos)); - - for (i = 0; i < nr_got; i++) { - new_buckets[pos + i] = bu[i]; - new_bucket_seq[pos + i] = 0; - } - - nr = ja->nr + nr_got; - - ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr); - if (ret) - goto err_unblock; - - bch2_write_super(c); - - /* Commit: */ - if (c) - spin_lock(&c->journal.lock); - - swap(new_buckets, ja->buckets); - swap(new_bucket_seq, ja->bucket_seq); - ja->nr = nr; - - if (pos <= ja->discard_idx) - ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr; - if (pos <= ja->dirty_idx_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr; - if (pos <= ja->dirty_idx) - ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr; - if (pos <= ja->cur_idx) - ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr; - - if (c) - spin_unlock(&c->journal.lock); -err_unblock: - if (c) { - bch2_journal_unblock(&c->journal); - mutex_unlock(&c->sb_lock); - } - - if (ret && !new_fs) - for (i = 0; i < nr_got; i++) - bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(trans, ca, - bu[i], BCH_DATA_free, 0, - BTREE_TRIGGER_transactional)); -err_free: - for (i = 0; i < nr_got; i++) - bch2_open_bucket_put(c, ob[i]); - - kfree(new_bucket_seq); - kfree(new_buckets); - kfree(ob); - kfree(bu); - return ret; -} - -static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca, - unsigned nr, bool new_fs) -{ - struct journal_device *ja = &ca->journal; - int ret = 0; - - struct closure cl; - closure_init_stack(&cl); - - /* don't handle reducing nr of buckets yet: */ - if (nr < ja->nr) - return 0; - - while (!ret && ja->nr < nr) { - struct disk_reservation disk_res = { 0, 0, 0 }; - - /* - * note: journal buckets aren't really counted as _sectors_ used yet, so - * we don't need the disk reservation to avoid the BUG_ON() in buckets.c - * when space used goes up without a reservation - but we do need the - * reservation to ensure we'll actually be able to allocate: - * - * XXX: that's not right, disk reservations only ensure a - * filesystem-wide allocation will succeed, this is a device - * specific allocation - we can hang here: - */ - if (!new_fs) { - ret = bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0); - if (ret) - break; - } - - ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl); - - if (ret == -BCH_ERR_bucket_alloc_blocked || - ret == -BCH_ERR_open_buckets_empty) - 
ret = 0; /* wait and retry */ - - bch2_disk_reservation_put(c, &disk_res); - bch2_wait_on_allocator(c, &cl); - } - - return ret; -} - -/* - * Allocate more journal space at runtime - not currently making use of it, but - * the code works: - */ -int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - unsigned nr) -{ - down_write(&c->state_lock); - int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); - up_write(&c->state_lock); - - bch_err_fn(c, ret); - return ret; -} - -int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b) -{ - struct bch_fs *c = ca->fs; - struct journal *j = &c->journal; - struct journal_device *ja = &ca->journal; - - guard(mutex)(&c->sb_lock); - unsigned pos; - for (pos = 0; pos < ja->nr; pos++) - if (ja->buckets[pos] == b) - break; - - if (pos == ja->nr) { - bch_err(ca, "journal bucket %llu not found when deleting", b); - return -EINVAL; - } - - u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); - if (!new_buckets) - return bch_err_throw(c, ENOMEM_set_nr_journal_buckets); - - memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); - memmove(&new_buckets[pos], - &new_buckets[pos + 1], - (ja->nr - 1 - pos) * sizeof(new_buckets[0])); - - int ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, ja->nr - 1) ?: - bch2_write_super(c); - if (ret) { - kfree(new_buckets); - return ret; - } - - scoped_guard(spinlock, &j->lock) { - if (pos < ja->discard_idx) - --ja->discard_idx; - if (pos < ja->dirty_idx_ondisk) - --ja->dirty_idx_ondisk; - if (pos < ja->dirty_idx) - --ja->dirty_idx; - if (pos < ja->cur_idx) - --ja->cur_idx; - - ja->nr--; - - memmove(&ja->buckets[pos], - &ja->buckets[pos + 1], - (ja->nr - pos) * sizeof(ja->buckets[0])); - - memmove(&ja->bucket_seq[pos], - &ja->bucket_seq[pos + 1], - (ja->nr - pos) * sizeof(ja->bucket_seq[0])); - - bch2_journal_space_available(j); - } - - kfree(new_buckets); - return 0; -} - -int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) -{ - struct bch_fs *c = ca->fs; - - if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) - return 0; - - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); - return bch_err_throw(c, erofs_filesystem_full); - } - - unsigned nr; - int ret; - - if (dynamic_fault("bcachefs:add:journal_alloc")) { - ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); - goto err; - } - - /* 1/128th of the device by default: */ - nr = ca->mi.nbuckets >> 7; - - /* - * clamp journal size to 8192 buckets or 8GB (in sectors), whichever - * is smaller: - */ - nr = clamp_t(unsigned, nr, - BCH_JOURNAL_BUCKETS_MIN, - min(1 << 13, - (1 << 24) / ca->mi.bucket_size)); - - ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); -err: - bch_err_fn(ca, ret); - return ret; -} - -int bch2_fs_journal_alloc(struct bch_fs *c) -{ - for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) { - if (ca->journal.nr) - continue; - - int ret = bch2_dev_journal_alloc(ca, true); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_fs_journal_alloc); - return ret; - } - } - - return 0; -} - -/* startup/shutdown: */ - -static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) -{ - bool ret = false; - u64 seq; - - spin_lock(&j->lock); - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j) && !ret; - seq++) { - struct journal_buf *buf = journal_seq_to_buf(j, seq); - - if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) - ret = true; - } -
spin_unlock(&j->lock); - - return ret; -} - -void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) -{ - wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); -} - -void bch2_fs_journal_stop(struct journal *j) -{ - if (!test_bit(JOURNAL_running, &j->flags)) - return; - - bch2_journal_reclaim_stop(j); - bch2_journal_flush_all_pins(j); - - wait_event(j->wait, bch2_journal_entry_close(j)); - - /* - * Always write a new journal entry, to make sure the clock hands are up - * to date (and match the superblock) - */ - __bch2_journal_meta(j); - - journal_quiesce(j); - cancel_delayed_work_sync(&j->write_work); - - WARN(!bch2_journal_error(j) && - test_bit(JOURNAL_replay_done, &j->flags) && - j->last_empty_seq != journal_cur_seq(j), - "journal shutdown error: cur seq %llu but last empty seq %llu", - journal_cur_seq(j), j->last_empty_seq); - - if (!bch2_journal_error(j)) - clear_bit(JOURNAL_running, &j->flags); -} - -int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin_list *p; - struct journal_replay *i, **_i; - struct genradix_iter iter; - bool had_entries = false; - - /* - * - * XXX pick most recent non blacklisted sequence number - */ - - cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); - - if (cur_seq >= JOURNAL_SEQ_MAX) { - bch_err(c, "cannot start: journal seq overflow"); - return -EINVAL; - } - - /* Clean filesystem? */ - if (!last_seq) - last_seq = cur_seq; - - u64 nr = cur_seq - last_seq; - - /* - * Extra fudge factor, in case we crashed when the journal pin fifo was - * nearly or completely full. We'll need to be able to open additional - * journal entries (at least a few) in order for journal replay to get - * going: - */ - nr += nr / 4; - - nr = max(nr, JOURNAL_PIN); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return bch_err_throw(c, ENOMEM_journal_pin_fifo); - } - - j->replay_journal_seq = last_seq; - j->replay_journal_seq_end = cur_seq; - j->last_seq_ondisk = last_seq; - j->flushed_seq_ondisk = cur_seq - 1; - j->seq_write_started = cur_seq - 1; - j->seq_ondisk = cur_seq - 1; - j->pin.front = last_seq; - j->pin.back = cur_seq; - atomic64_set(&j->seq, cur_seq - 1); - - u64 seq; - fifo_for_each_entry_ptr(p, &j->pin, seq) - journal_pin_list_init(p, 1); - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - seq = le64_to_cpu(i->j.seq); - BUG_ON(seq >= cur_seq); - - if (seq < last_seq) - continue; - - if (journal_entry_empty(&i->j)) - j->last_empty_seq = le64_to_cpu(i->j.seq); - - p = journal_seq_pin(j, seq); - - p->devs.nr = 0; - darray_for_each(i->ptrs, ptr) - bch2_dev_list_add_dev(&p->devs, ptr->dev); - - had_entries = true; - } - - if (!had_entries) - j->last_empty_seq = cur_seq - 1; /* to match j->seq */ - - spin_lock(&j->lock); - j->last_flush_write = jiffies; - - j->reservations.idx = journal_cur_seq(j); - - c->last_bucket_seq_cleanup = journal_cur_seq(j); - spin_unlock(&j->lock); - - return 0; -} - -void bch2_journal_set_replay_done(struct journal *j) -{ - /* - * journal_space_available must happen before setting JOURNAL_running - * JOURNAL_running must happen before JOURNAL_replay_done - */ - spin_lock(&j->lock); - bch2_journal_space_available(j); - - set_bit(JOURNAL_need_flush_write, &j->flags); - set_bit(JOURNAL_running, &j->flags); - set_bit(JOURNAL_replay_done, 
&j->flags); - spin_unlock(&j->lock); -} - -/* init/exit: */ - -void bch2_dev_journal_exit(struct bch_dev *ca) -{ - struct journal_device *ja = &ca->journal; - - for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - kfree(ja->bio[i]); - ja->bio[i] = NULL; - } - - kfree(ja->buckets); - kfree(ja->bucket_seq); - ja->buckets = NULL; - ja->bucket_seq = NULL; -} - -int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets = - bch2_sb_field_get(sb, journal); - struct bch_sb_field_journal_v2 *journal_buckets_v2 = - bch2_sb_field_get(sb, journal_v2); - - ja->nr = 0; - - if (journal_buckets_v2) { - unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - - for (unsigned i = 0; i < nr; i++) - ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); - } else if (journal_buckets) { - ja->nr = bch2_nr_journal_buckets(journal_buckets); - } - - ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); - if (!ja->bucket_seq) - return bch_err_throw(c, ENOMEM_dev_journal_init); - - unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); - - for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, - nr_bvecs), GFP_KERNEL); - if (!ja->bio[i]) - return bch_err_throw(c, ENOMEM_dev_journal_init); - - ja->bio[i]->ca = ca; - ja->bio[i]->buf_idx = i; - bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); - } - - ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); - if (!ja->buckets) - return bch_err_throw(c, ENOMEM_dev_journal_init); - - if (journal_buckets_v2) { - unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - unsigned dst = 0; - - for (unsigned i = 0; i < nr; i++) - for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) - ja->buckets[dst++] = - le64_to_cpu(journal_buckets_v2->d[i].start) + j; - } else if (journal_buckets) { - for (unsigned i = 0; i < ja->nr; i++) - ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); - } - - return 0; -} - -void bch2_fs_journal_exit(struct journal *j) -{ - if (j->wq) - destroy_workqueue(j->wq); - - darray_exit(&j->early_journal_entries); - - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - kvfree(j->buf[i].data); - kvfree(j->free_buf); - free_fifo(&j->pin); -} - -void bch2_fs_journal_init_early(struct journal *j) -{ - static struct lock_class_key res_key; - - mutex_init(&j->buf_lock); - spin_lock_init(&j->lock); - spin_lock_init(&j->err_lock); - init_waitqueue_head(&j->wait); - INIT_DELAYED_WORK(&j->write_work, journal_write_work); - init_waitqueue_head(&j->reclaim_wait); - init_waitqueue_head(&j->pin_flush_wait); - mutex_init(&j->reclaim_lock); - mutex_init(&j->discard_lock); - - lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - - atomic64_set(&j->reservations.counter, - ((union journal_res_state) - { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); -} - -int bch2_fs_journal_init(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; - j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); - if (!j->free_buf) - return bch_err_throw(c, ENOMEM_journal_buf); - - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - j->buf[i].idx = i; - - j->wq = alloc_workqueue("bcachefs_journal", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); - if (!j->wq) - return bch_err_throw(c, 
ENOMEM_fs_other_alloc); - return 0; -} - -/* debug: */ - -static const char * const bch2_journal_flags_strs[] = { -#define x(n) #n, - JOURNAL_FLAGS() -#undef x - NULL -}; - -void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union journal_res_state s; - unsigned long now = jiffies; - u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 28); - out->atomic++; - - guard(rcu)(); - s = READ_ONCE(j->reservations); - - prt_printf(out, "flags:\t"); - prt_bitflags(out, bch2_journal_flags_strs, j->flags); - prt_newline(out); - prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); - prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j)); - prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j)); - prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); - prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); - prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); - prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); - prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); - prt_printf(out, "average write size:\t"); - prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); - prt_newline(out); - prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0); - prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); - prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); - prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) - ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "blocked:\t%u\n", j->blocked); - prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error)); - prt_printf(out, "current entry:\t"); - - switch (s.cur_entry_offset) { - case JOURNAL_ENTRY_ERROR_VAL: - prt_printf(out, "error\n"); - break; - case JOURNAL_ENTRY_CLOSED_VAL: - prt_printf(out, "closed\n"); - break; - case JOURNAL_ENTRY_BLOCKED_VAL: - prt_printf(out, "blocked\n"); - break; - default: - prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); - break; - } - - prt_printf(out, "unwritten entries:\n"); - bch2_journal_bufs_to_text(out, j); - - prt_printf(out, "space:\n"); - printbuf_indent_add(out, 2); - prt_printf(out, "discarded\t%u:%u\n", - j->space[journal_space_discarded].next_entry, - j->space[journal_space_discarded].total); - prt_printf(out, "clean ondisk\t%u:%u\n", - j->space[journal_space_clean_ondisk].next_entry, - j->space[journal_space_clean_ondisk].total); - prt_printf(out, "clean\t%u:%u\n", - j->space[journal_space_clean].next_entry, - j->space[journal_space_clean].total); - prt_printf(out, "total\t%u:%u\n", - j->space[journal_space_total].next_entry, - j->space[journal_space_total].total); - printbuf_indent_sub(out, 2); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->mi.durability) - continue; - - struct journal_device *ja = &ca->journal; - - if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) - continue; - - if (!ja->nr) - continue; - - prt_printf(out, "dev %u:\n", ca->dev_idx); - prt_printf(out, "durability %u:\n", ca->mi.durability); - printbuf_indent_add(out, 2); - prt_printf(out, "nr\t%u\n", ja->nr); - prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); - prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); - prt_printf(out, "discard_idx\t%u\n", ja->discard_idx); - prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); - prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); - prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); - printbuf_indent_sub(out, 2); - } - - prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); - - --out->atomic; -} - -void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) -{ - spin_lock(&j->lock); - __bch2_journal_debug_to_text(out, j); - spin_unlock(&j->lock); -} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h deleted file mode 100644 index 977907038d98d0..00000000000000 --- a/fs/bcachefs/journal.h +++ /dev/null @@ -1,465 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_H -#define _BCACHEFS_JOURNAL_H - -/* - * THE JOURNAL: - * - * The primary purpose of the journal is to log updates (insertions) to the - * b-tree, to avoid having to do synchronous updates to the b-tree on disk. - * - * Without the journal, the b-tree is always internally consistent on - * disk - and in fact, in the earliest incarnations bcache didn't have a journal - * but did handle unclean shutdowns by doing all index updates synchronously - * (with coalescing). 
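- * - * As a rough sketch of the model described above - the helper names here are - * purely illustrative, not the real call chain - an index update conceptually - * becomes: - * - *	journal_add_keys(j, keys);		// 1) log the keys in the journal - *	btree_update_in_memory(b, keys);	// 2) update the cached b-tree - *	// 3) the on-disk b-tree node write can now be deferred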
- * - * Updates to interior nodes still happen synchronously and without the journal - * (for simplicity) - this may change eventually but updates to interior nodes - * are rare enough it's not a huge priority. - * - * This means the journal is relatively separate from the b-tree; it consists of - * just a list of keys, and journal replay consists of just redoing those - * insertions in the same order that they appear in the journal. - * - * PERSISTENCE: - * - * For synchronous updates (where we're waiting on the index update to hit - * disk), the journal entry will be written out immediately (or as soon as - * possible, if the write for the previous journal entry was still in flight). - * - * Synchronous updates are specified by passing a closure (@flush_cl) to - * bch2_btree_insert() or bch2_btree_insert_node(), which then passes that parameter - * down to the journalling code. That closure will wait on the journal write to - * complete (via closure_wait()). - * - * If the index update wasn't synchronous, the journal entry will be - * written out after 10 ms have elapsed, by default (the delay_ms field - * in struct journal). - * - * JOURNAL ENTRIES: - * - * A journal entry is variable size (struct jset); it's got a fixed length - * header and then a variable number of struct jset_entry entries. - * - * Journal entries are identified by monotonically increasing 64 bit sequence - * numbers - jset->seq; other places in the code refer to this sequence number. - * - * A jset_entry entry contains one or more bkeys (which is what gets inserted - * into the b-tree). We need a container to indicate which b-tree the key is - * for; also, the roots of the various b-trees are stored in jset_entry entries - * (one for each b-tree) - this lets us add new b-tree types without changing - * the on disk format. - * - * We also keep some things in the journal header that are logically part of the - * superblock - all the things that are frequently updated. This is for future - * bcache on raw flash support; the superblock (which will become another - * journal) can't be moved or wear leveled, so it contains just enough - * information to find the main journal, and the superblock only has to be - * rewritten when we want to move/wear level the main journal. - * - * JOURNAL LAYOUT ON DISK: - * - * The journal is written to a ringbuffer of buckets (which is kept in the - * superblock); the individual buckets are not necessarily contiguous on disk, - * which means that journal entries are not allowed to span buckets, but also - * that we can resize the journal at runtime if desired (unimplemented). - * - * The journal buckets exist in the same pool as all the other buckets that are - * managed by the allocator and garbage collection - garbage collection marks - * the journal buckets as metadata buckets. - * - * OPEN/DIRTY JOURNAL ENTRIES: - * - * Open/dirty journal entries are journal entries that contain b-tree updates - * that have not yet been written out to the b-tree on disk. We have to track - * which journal entries are dirty, and we also have to avoid wrapping around - * the journal and overwriting old but still dirty journal entries with new - * journal entries. - * - * On disk, this is represented with the "last_seq" field of struct jset; - * last_seq is the first sequence number that journal replay has to replay. - * - * To avoid overwriting dirty journal entries on disk, we keep a mapping (in - * journal_device->bucket_seq) of, for each journal bucket, the highest sequence - * number of any journal entry it contains.
Then, by comparing that against last_seq we - * can determine whether that journal bucket contains dirty journal entries or - * not. - * - * To track which journal entries are dirty, we maintain a fifo of refcounts - * (where each entry corresponds to a specific sequence number) - when a ref - * goes to 0, that journal entry is no longer dirty. - * - * Journalling of index updates is done at the same time as the b-tree itself is - * being modified (see btree_insert_key()); when we add the key to the journal - * the pending b-tree write takes a ref on the journal entry the key was added - * to. If a pending b-tree write would need to take refs on multiple dirty - * journal entries, it only keeps the ref on the oldest one (since a newer - * journal entry will still be replayed if an older entry was dirty). - * - * JOURNAL FILLING UP: - * - * There are two ways the journal could fill up; either we could run out of - * space to write to, or we could have too many open journal entries and run out - * of room in the fifo of refcounts. Since those refcounts are decremented - * without any locking we can't safely resize that fifo, so we handle both - * cases the same way. - * - * If the journal fills up, we start flushing dirty btree nodes until we can - * allocate space for a journal write again - preferentially flushing btree - * nodes that are pinning the oldest journal entries first. - */ - -#include - -#include "journal_types.h" - -struct bch_fs; - -static inline void journal_wake(struct journal *j) -{ - wake_up(&j->wait); - closure_wake_up(&j->async_wait); -} - -/* Sequence number of oldest dirty journal entry */ - -static inline u64 journal_last_seq(struct journal *j) -{ - return j->pin.front; -} - -static inline u64 journal_cur_seq(struct journal *j) -{ - return atomic64_read(&j->seq); -} - -static inline u64 journal_last_unwritten_seq(struct journal *j) -{ - return j->seq_ondisk + 1; -} - -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - unsigned idx = (journal_cur_seq(j) & - JOURNAL_BUF_MASK & - ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx; - - return j->buf + idx; -} - -static inline int journal_state_count(union journal_res_state s, int idx) -{ - switch (idx) { - case 0: return s.buf0_count; - case 1: return s.buf1_count; - case 2: return s.buf2_count; - case 3: return s.buf3_count; - } - BUG(); -} - -static inline int journal_state_seq_count(struct journal *j, - union journal_res_state s, u64 seq) -{ - if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR) - return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); - else - return 0; -} - -static inline void journal_state_inc(union journal_res_state *s) -{ - s->buf0_count += s->idx == 0; - s->buf1_count += s->idx == 1; - s->buf2_count += s->idx == 2; - s->buf3_count += s->idx == 3; -} - -/* - * Amount of space that will be taken up by some keys in the journal (i.e. 
- * including the jset header) - */ -static inline unsigned jset_u64s(unsigned u64s) -{ - return u64s + sizeof(struct jset_entry) / sizeof(u64); -} - -static inline int journal_entry_overhead(struct journal *j) -{ - return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; -} - -static inline struct jset_entry * -bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) -{ - struct jset *jset = buf->data; - struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); - - memset(entry, 0, sizeof(*entry)); - entry->u64s = cpu_to_le16(u64s); - - le32_add_cpu(&jset->u64s, jset_u64s(u64s)); - - return entry; -} - -static inline struct jset_entry * -journal_res_entry(struct journal *j, struct journal_res *res) -{ - return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); -} - -static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, - enum btree_id id, unsigned level, - unsigned u64s) -{ - entry->u64s = cpu_to_le16(u64s); - entry->btree_id = id; - entry->level = level; - entry->type = type; - entry->pad[0] = 0; - entry->pad[1] = 0; - entry->pad[2] = 0; - return jset_u64s(u64s); -} - -static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, - enum btree_id id, unsigned level, - const void *data, unsigned u64s) -{ - unsigned ret = journal_entry_init(entry, type, id, level, u64s); - - memcpy_u64s_small(entry->_data, data, u64s); - return ret; -} - -static inline struct jset_entry * -bch2_journal_add_entry(struct journal *j, struct journal_res *res, - unsigned type, enum btree_id id, - unsigned level, unsigned u64s) -{ - struct jset_entry *entry = journal_res_entry(j, res); - unsigned actual = journal_entry_init(entry, type, id, level, u64s); - - EBUG_ON(!res->ref); - EBUG_ON(actual > res->u64s); - - res->offset += actual; - res->u64s -= actual; - return entry; -} - -static inline bool journal_entry_empty(struct jset *j) -{ - if (j->seq != j->last_seq) - return false; - - vstruct_for_each(j, i) - if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) - return false; - return true; -} - -/* - * Drop a reference on a buffer index and return the new reservation state, so - * the caller can check whether the count hit zero. - */ -static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx) -{ - union journal_res_state s; - - s.v = atomic64_sub_return(((union journal_res_state) { - .buf0_count = idx == 0, - .buf1_count = idx == 1, - .buf2_count = idx == 2, - .buf3_count = idx == 3, - }).v, &j->reservations.counter); - return s; -} - -bool bch2_journal_entry_close(struct journal *); -void bch2_journal_do_writes(struct journal *); -void bch2_journal_buf_put_final(struct journal *, u64); - -static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) -{ - unsigned idx = seq & JOURNAL_STATE_BUF_MASK; - union journal_res_state s; - - s = journal_state_buf_put(j, idx); - if (!journal_state_count(s, idx)) - bch2_journal_buf_put_final(j, seq); -} - -static inline void bch2_journal_buf_put(struct journal *j, u64 seq) -{ - unsigned idx = seq & JOURNAL_STATE_BUF_MASK; - union journal_res_state s; - - s = journal_state_buf_put(j, idx); - if (!journal_state_count(s, idx)) { - spin_lock(&j->lock); - bch2_journal_buf_put_final(j, seq); - spin_unlock(&j->lock); - } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) - wake_up(&j->wait); -} - -/* - * This function releases the journal write structure so other threads can - * then proceed to add their keys as well. 
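- * - * A minimal sketch of a caller (error handling elided; k, btree_id, level and - * trans stand in for the caller's key, position and transaction): - * - *	struct journal_res res = {}; - * - *	if (!bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s), 0, trans)) { - *		struct jset_entry *entry = - *			bch2_journal_add_entry(j, &res, BCH_JSET_ENTRY_btree_keys, - *					       btree_id, level, k->k.u64s); - *		memcpy_u64s_small(entry->_data, k, k->k.u64s);	// copy key in - *		bch2_journal_res_put(j, &res);	// pads any unused reservation - *	}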
- */ -static inline void bch2_journal_res_put(struct journal *j, - struct journal_res *res) -{ - if (!res->ref) - return; - - lock_release(&j->res_map, _THIS_IP_); - - while (res->u64s) - bch2_journal_add_entry(j, res, - BCH_JSET_ENTRY_btree_keys, - 0, 0, 0); - - bch2_journal_buf_put(j, res->seq); - - res->ref = 0; -} - -int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned, struct btree_trans *); - -/* First bits for BCH_WATERMARK: */ -enum journal_res_flags { - __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS, - __JOURNAL_RES_GET_CHECK, -}; - -#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK) -#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK) - -static inline int journal_res_get_fast(struct journal *j, - struct journal_res *res, - unsigned flags) -{ - union journal_res_state old, new; - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - - /* - * Check if there is still room in the current journal - * entry, smp_rmb() guarantees that reads from reservations.counter - * occur before accessing cur_entry_u64s: - */ - smp_rmb(); - if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) - return 0; - - EBUG_ON(!journal_state_count(new, new.idx)); - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) - return 0; - - new.cur_entry_offset += res->u64s; - journal_state_inc(&new); - - /* - * If the refcount would overflow, we have to wait: - * XXX - tracepoint this: - */ - if (!journal_state_count(new, new.idx)) - return 0; - - if (flags & JOURNAL_RES_GET_CHECK) - return 1; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - - res->ref = true; - res->offset = old.cur_entry_offset; - res->seq = journal_cur_seq(j); - res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK; - return 1; -} - -static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s, unsigned flags, - struct btree_trans *trans) -{ - int ret; - - EBUG_ON(res->ref); - EBUG_ON(!test_bit(JOURNAL_running, &j->flags)); - - res->u64s = u64s; - - if (journal_res_get_fast(j, res, flags)) - goto out; - - ret = bch2_journal_res_get_slowpath(j, res, flags, trans); - if (ret) - return ret; -out: - if (!(flags & JOURNAL_RES_GET_CHECK)) { - lock_acquire_shared(&j->res_map, 0, - (flags & JOURNAL_RES_GET_NONBLOCK) != 0, - NULL, _THIS_IP_); - EBUG_ON(!res->ref); - BUG_ON(!res->seq); - } - return 0; -} - -/* journal_entry_res: */ - -void bch2_journal_entry_res_resize(struct journal *, - struct journal_entry_res *, - unsigned); - -int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); -void bch2_journal_flush_async(struct journal *, struct closure *); - -int bch2_journal_flush_seq(struct journal *, u64, unsigned); -int bch2_journal_flush(struct journal *); -bool bch2_journal_noflush_seq(struct journal *, u64, u64); -int bch2_journal_meta(struct journal *); - -void bch2_journal_halt_locked(struct journal *); -void bch2_journal_halt(struct journal *); - -static inline int bch2_journal_error(struct journal *j) -{ - return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL - ? 
-BCH_ERR_journal_shutdown : 0; -} - -struct bch_dev; - -void bch2_journal_unblock(struct journal *); -void bch2_journal_block(struct journal *); -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); - -void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); -void bch2_journal_debug_to_text(struct printbuf *, struct journal *); - -int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned); -int bch2_dev_journal_bucket_delete(struct bch_dev *, u64); - -int bch2_dev_journal_alloc(struct bch_dev *, bool); -int bch2_fs_journal_alloc(struct bch_fs *); - -void bch2_dev_journal_stop(struct journal *, struct bch_dev *); - -void bch2_fs_journal_stop(struct journal *); -int bch2_fs_journal_start(struct journal *, u64, u64); -void bch2_journal_set_replay_done(struct journal *); - -void bch2_dev_journal_exit(struct bch_dev *); -int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); -void bch2_fs_journal_exit(struct journal *); -void bch2_fs_journal_init_early(struct journal *); -int bch2_fs_journal_init(struct journal *); - -#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c deleted file mode 100644 index 29bea8e0e49541..00000000000000 --- a/fs/bcachefs/journal_io.c +++ /dev/null @@ -1,2242 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "btree_io.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "checksum.h" -#include "disk_groups.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "replicas.h" -#include "sb-clean.h" -#include "trace.h" - -#include -#include -#include - -void bch2_journal_pos_from_member_info_set(struct bch_fs *c) -{ - lockdep_assert_held(&c->sb_lock); - - for_each_member_device(c, ca) { - struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - - m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); - m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); - } -} - -void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - for_each_member_device(c, ca) { - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); - - unsigned idx = le32_to_cpu(m.last_journal_bucket); - if (idx < ca->journal.nr) - ca->journal.cur_idx = idx; - unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); - if (offset <= ca->mi.bucket_size) - ca->journal.sectors_free = ca->mi.bucket_size - offset; - } - mutex_unlock(&c->sb_lock); -} - -static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p) -{ - struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev); - prt_printf(out, "%s %u:%u:%u (sector %llu)", - ca ? 
ca->name : "(invalid dev)", - p->dev, p->bucket, p->bucket_offset, p->sector); - bch2_dev_put(ca); -} - -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) -{ - darray_for_each(j->ptrs, i) { - if (i != j->ptrs.data) - prt_printf(out, " "); - bch2_journal_ptr_to_text(out, c, i); - } -} - -static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j) -{ - for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) { - struct jset_entry_datetime *datetime = - container_of(entry, struct jset_entry_datetime, entry); - bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); - break; - } -} - -static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) -{ - prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); - bch2_journal_datetime_to_text(out, &j->j); - prt_char(out, ' '); - bch2_journal_ptrs_to_text(out, c, j); -} - -static struct nonce journal_nonce(const struct jset *jset) -{ - return (struct nonce) {{ - [0] = 0, - [1] = ((__le32 *) &jset->seq)[0], - [2] = ((__le32 *) &jset->seq)[1], - [3] = BCH_NONCE_JOURNAL, - }}; -} - -static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) -{ - if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { - *csum = (struct bch_csum) {}; - return false; - } - - *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); - return !bch2_crc_cmp(j->csum, *csum); -} - -static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) -{ - return (seq - c->journal_entries_base_seq) & (~0U >> 1); -} - -static void __journal_replay_free(struct bch_fs *c, - struct journal_replay *i) -{ - struct journal_replay **p = - genradix_ptr(&c->journal_entries, - journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); - - BUG_ON(*p != i); - *p = NULL; - kvfree(i); -} - -static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) -{ - if (blacklisted) - i->ignore_blacklisted = true; - else - i->ignore_not_dirty = true; - - if (!c->opts.read_entire_journal) - __journal_replay_free(c, i); -} - -struct journal_list { - struct closure cl; - u64 last_seq; - struct mutex lock; - int ret; -}; - -#define JOURNAL_ENTRY_ADD_OK 0 -#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 - -/* - * Given a journal entry we just read, add it to the list of journal entries to - * be replayed: - */ -static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - struct journal_ptr entry_ptr, - struct journal_list *jlist, struct jset *j) -{ - struct genradix_iter iter; - struct journal_replay **_i, *i, *dup; - size_t bytes = vstruct_bytes(j); - u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; - struct printbuf buf = PRINTBUF; - int ret = JOURNAL_ENTRY_ADD_OK; - - if (last_seq && c->opts.journal_rewind) - last_seq = min(last_seq, c->opts.journal_rewind); - - if (!c->journal.oldest_seq_found_ondisk || - le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) - c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); - - /* Is this entry older than the range we need? */ - if (!c->opts.read_entire_journal && - le64_to_cpu(j->seq) < jlist->last_seq) - return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - - /* - * genradixes are indexed by a ulong, not a u64, so we can't index them - * by sequence number directly: Assume instead that they will all fall - * within the range of +-2billion of the filrst one we find. 
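- * - * For example, restating journal_entry_radix_idx() above: with - * journal_entries_base_seq set (below) to max(1, first_seq - S32_MAX), a - * sequence number seq maps to slot (seq - base_seq) & (~0U >> 1), which is a - * valid ulong index for any seq within roughly 2^31 of the first entry found.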
- */ - if (!c->journal_entries_base_seq) - c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); - - /* Drop entries we don't need anymore */ - if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { - genradix_for_each_from(&c->journal_entries, iter, _i, - journal_entry_radix_idx(c, jlist->last_seq)) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - if (le64_to_cpu(i->j.seq) >= last_seq) - break; - - journal_replay_free(c, i, false); - } - } - - jlist->last_seq = max(jlist->last_seq, last_seq); - - _i = genradix_ptr_alloc(&c->journal_entries, - journal_entry_radix_idx(c, le64_to_cpu(j->seq)), - GFP_KERNEL); - if (!_i) - return bch_err_throw(c, ENOMEM_journal_entry_add); - - /* - * Duplicate journal entries? If so we want the one that didn't have a - * checksum error: - */ - dup = *_i; - if (dup) { - bool identical = bytes == vstruct_bytes(&dup->j) && - !memcmp(j, &dup->j, bytes); - bool not_identical = !identical && - entry_ptr.csum_good && - dup->csum_good; - - bool same_device = false; - darray_for_each(dup->ptrs, ptr) - if (ptr->dev == ca->dev_idx) - same_device = true; - - ret = darray_push(&dup->ptrs, entry_ptr); - if (ret) - goto out; - - bch2_journal_replay_to_text(&buf, c, dup); - - fsck_err_on(same_device, - c, journal_entry_dup_same_device, - "duplicate journal entry on same device\n%s", - buf.buf); - - fsck_err_on(not_identical, - c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries\n%s", - buf.buf); - - if (entry_ptr.csum_good && !identical) - goto replace; - - goto out; - } -replace: - i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); - if (!i) - return bch_err_throw(c, ENOMEM_journal_entry_add); - - darray_init(&i->ptrs); - i->csum_good = entry_ptr.csum_good; - i->ignore_blacklisted = false; - i->ignore_not_dirty = false; - unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - - if (dup) { - /* The first ptr should represent the jset we kept: */ - darray_for_each(dup->ptrs, ptr) - darray_push(&i->ptrs, *ptr); - __journal_replay_free(c, dup); - } else { - darray_push(&i->ptrs, entry_ptr); - } - - *_i = i; -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* this fills in a range with empty jset_entries: */ -static void journal_entry_null_range(void *start, void *end) -{ - struct jset_entry *entry; - - for (entry = start; entry != end; entry = vstruct_next(entry)) - memset(entry, 0, sizeof(*entry)); -} - -#define JOURNAL_ENTRY_REREAD 5 -#define JOURNAL_ENTRY_NONE 6 -#define JOURNAL_ENTRY_BAD 7 - -static void journal_entry_err_msg(struct printbuf *out, - u32 version, - struct jset *jset, - struct jset_entry *entry) -{ - prt_str(out, "invalid journal entry, version="); - bch2_version_to_text(out, version); - - if (entry) { - prt_str(out, " type="); - bch2_prt_jset_entry_type(out, entry->type); - } - - if (!jset) { - prt_printf(out, " in superblock"); - } else { - - prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); - - if (entry) - prt_printf(out, " offset=%zi/%u", - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s)); - } - - prt_str(out, ": "); -} - -#define journal_entry_err(c, version, jset, entry, _err, msg, ...) 
\ -({ \ - struct printbuf _buf = PRINTBUF; \ - \ - journal_entry_err_msg(&_buf, version, jset, entry); \ - prt_printf(&_buf, msg, ##__VA_ARGS__); \ - \ - switch (from.flags & BCH_VALIDATE_write) { \ - case READ: \ - mustfix_fsck_err(c, _err, "%s", _buf.buf); \ - break; \ - case WRITE: \ - bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ - if (bch2_fs_inconsistent(c, \ - "corrupt metadata before write: %s\n", _buf.buf)) {\ - ret = bch_err_throw(c, fsck_errors_not_fixed); \ - goto fsck_err; \ - } \ - break; \ - } \ - \ - printbuf_exit(&_buf); \ - true; \ -}) - -#define journal_entry_err_on(cond, ...) \ - ((cond) ? journal_entry_err(__VA_ARGS__) : false) - -#define FSCK_DELETED_KEY 5 - -static int journal_validate_key(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - struct bkey_i *k, - struct bkey_validate_context from, - unsigned version, int big_endian) -{ - enum bch_validate_flags flags = from.flags; - int write = flags & BCH_VALIDATE_write; - void *next = vstruct_next(entry); - int ret = 0; - - if (journal_entry_err_on(!k->k.u64s, - c, version, jset, entry, - journal_entry_bkey_u64s_0, - "k->u64s 0")) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - - if (journal_entry_err_on((void *) bkey_next(k) > - (void *) vstruct_next(entry), - c, version, jset, entry, - journal_entry_bkey_past_end, - "extends past end of journal entry")) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - - if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, - c, version, jset, entry, - journal_entry_bkey_bad_format, - "bad format %u", k->k.format)) { - le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - - if (!write) - bch2_bkey_compat(from.level, from.btree, version, big_endian, - write, NULL, bkey_to_packed(k)); - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); - if (ret == -BCH_ERR_fsck_delete_bkey) { - le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - if (ret) - goto fsck_err; - - if (write) - bch2_bkey_compat(from.level, from.btree, version, big_endian, - write, NULL, bkey_to_packed(k)); -fsck_err: - return ret; -} - -static int journal_entry_btree_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct bkey_i *k = entry->start; - - from.level = entry->level; - from.btree = entry->btree_id; - - while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); - if (ret == FSCK_DELETED_KEY) - continue; - else if (ret) - return ret; - - k = bkey_next(k); - } - - return 0; -} - -static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - bool first = true; - - jset_entry_for_each_key(entry, k) { - /* We may be called on entries that haven't been validated: */ - if (!k->k.u64s) - break; - - if (!first) { - prt_newline(out); - bch2_prt_jset_entry_type(out, entry->type); - prt_str(out, ": "); - } - bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); - prt_char(out, ' '); - 
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); - first = false; - } -} - -static int journal_entry_btree_root_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct bkey_i *k = entry->start; - int ret = 0; - - from.root = true; - from.level = entry->level + 1; - from.btree = entry->btree_id; - - if (journal_entry_err_on(!entry->u64s || - le16_to_cpu(entry->u64s) != k->k.u64s, - c, version, jset, entry, - journal_entry_btree_root_bad_size, - "invalid btree root journal entry: wrong number of keys")) { - void *next = vstruct_next(entry); - /* - * we don't want to null out this jset_entry, - * just the contents, so that later we can tell - * we were _supposed_ to have a btree root - */ - entry->u64s = 0; - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); - if (ret == FSCK_DELETED_KEY) - ret = 0; -fsck_err: - return ret; -} - -static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_prio_ptrs_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - /* obsolete, don't care: */ - return 0; -} - -static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ -} - -static int journal_entry_blacklist_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - int ret = 0; - - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, - c, version, jset, entry, - journal_entry_blacklist_bad_size, - "invalid journal seq blacklist entry: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - } -fsck_err: - return ret; -} - -static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_blacklist *bl = - container_of(entry, struct jset_entry_blacklist, entry); - - prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); -} - -static int journal_entry_blacklist_v2_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_blacklist_v2 *bl_entry; - int ret = 0; - - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, - c, version, jset, entry, - journal_entry_blacklist_v2_bad_size, - "invalid journal seq blacklist entry: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - goto out; - } - - bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); - - if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > - le64_to_cpu(bl_entry->end), - c, version, jset, entry, - journal_entry_blacklist_v2_start_past_end, - "invalid journal seq blacklist entry: start > end")) { - journal_entry_null_range(entry, vstruct_next(entry)); - } -out: -fsck_err: - return ret; -} - -static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_blacklist_v2 *bl = - container_of(entry, struct jset_entry_blacklist_v2, entry); - - prt_printf(out, "start=%llu end=%llu", - le64_to_cpu(bl->start), - le64_to_cpu(bl->end)); -} - -static int 
journal_entry_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - int ret = 0; - - if (journal_entry_err_on(bytes < sizeof(*u), - c, version, jset, entry, - journal_entry_usage_bad_size, - "invalid journal entry usage: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - -fsck_err: - return ret; -} - -static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); - - prt_str(out, "type="); - bch2_prt_fs_usage_type(out, u->entry.btree_id); - prt_printf(out, " v=%llu", le64_to_cpu(u->v)); -} - -static int journal_entry_data_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - struct printbuf err = PRINTBUF; - int ret = 0; - - if (journal_entry_err_on(bytes < sizeof(*u) || - bytes < sizeof(*u) + u->r.nr_devs, - c, version, jset, entry, - journal_entry_data_usage_bad_size, - "invalid journal entry usage: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - goto out; - } - - if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), - c, version, jset, entry, - journal_entry_data_usage_bad_size, - "invalid journal entry usage: %s", err.buf)) { - journal_entry_null_range(entry, vstruct_next(entry)); - goto out; - } -out: -fsck_err: - printbuf_exit(&err); - return ret; -} - -static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); - - bch2_replicas_entry_to_text(out, &u->r); - prt_printf(out, "=%llu", le64_to_cpu(u->v)); -} - -static int journal_entry_clock_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - int ret = 0; - - if (journal_entry_err_on(bytes != sizeof(*clock), - c, version, jset, entry, - journal_entry_clock_bad_size, - "bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - - if (journal_entry_err_on(clock->rw > 1, - c, version, jset, entry, - journal_entry_clock_bad_rw, - "bad rw")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - -fsck_err: - return ret; -} - -static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); - - prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); -} - -static int journal_entry_dev_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_dev_usage *u = - 
container_of(entry, struct jset_entry_dev_usage, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - unsigned expected = sizeof(*u); - int ret = 0; - - if (journal_entry_err_on(bytes < expected, - c, version, jset, entry, - journal_entry_dev_usage_bad_size, - "bad size (%u < %u)", - bytes, expected)) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - - if (journal_entry_err_on(u->pad, - c, version, jset, entry, - journal_entry_dev_usage_bad_pad, - "bad pad")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - -fsck_err: - return ret; -} - -static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_dev_usage *u = - container_of(entry, struct jset_entry_dev_usage, entry); - unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - - if (vstruct_bytes(entry) < sizeof(*u)) - return; - - prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); - - printbuf_indent_add(out, 2); - for (i = 0; i < nr_types; i++) { - prt_newline(out); - bch2_prt_data_type(out, i); - prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", - le64_to_cpu(u->d[i].buckets), - le64_to_cpu(u->d[i].sectors), - le64_to_cpu(u->d[i].fragmented)); - } - printbuf_indent_sub(out, 2); -} - -static int journal_entry_log_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - return 0; -} - -static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - - prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); -} - -static int journal_entry_overwrite_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - from.flags = 0; - return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, from); -} - -static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_log_bkey_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - from.flags = 0; - return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, from); -} - -static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, from); -} - -static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_datetime_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - unsigned bytes = vstruct_bytes(entry); - unsigned expected = 16; - int ret = 0; - - if (journal_entry_err_on(vstruct_bytes(entry) < expected, - c, version, jset, entry, - 
journal_entry_dev_usage_bad_size, - "bad size (%u < %u)", - bytes, expected)) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } -fsck_err: - return ret; -} - -static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_datetime *datetime = - container_of(entry, struct jset_entry_datetime, entry); - - bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); -} - -struct jset_entry_ops { - int (*validate)(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, - struct bkey_validate_context); - void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); -}; - -static const struct jset_entry_ops bch2_jset_entry_ops[] = { -#define x(f, nr) \ - [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ - .validate = journal_entry_##f##_validate, \ - .to_text = journal_entry_##f##_to_text, \ - }, - BCH_JSET_ENTRY_TYPES() -#undef x -}; - -int bch2_journal_entry_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - return entry->type < BCH_JSET_ENTRY_NR - ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, from) - : 0; -} - -void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - bch2_prt_jset_entry_type(out, entry->type); - - if (entry->type < BCH_JSET_ENTRY_NR) { - prt_str(out, ": "); - bch2_jset_entry_ops[entry->type].to_text(out, c, entry); - } -} - -static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - enum bch_validate_flags flags) -{ - struct bkey_validate_context from = { - .flags = flags, - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(jset->seq), - }; - - unsigned version = le32_to_cpu(jset->version); - int ret = 0; - - vstruct_for_each(jset, entry) { - from.journal_offset = (u64 *) entry - jset->_data; - - if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), - c, version, jset, entry, - journal_entry_past_jset_end, - "journal entry extends past end of jset")) { - jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); - break; - } - - ret = bch2_journal_entry_validate(c, jset, entry, version, - JSET_BIG_ENDIAN(jset), from); - if (ret) - break; - } -fsck_err: - return ret; -} - -static int jset_validate(struct bch_fs *c, - struct bch_dev *ca, - struct jset *jset, u64 sector, - enum bch_validate_flags flags) -{ - struct bkey_validate_context from = { - .flags = flags, - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(jset->seq), - }; - int ret = 0; - - if (le64_to_cpu(jset->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - - unsigned version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), - c, version, jset, NULL, - jset_unsupported_version, - "%s sector %llu seq %llu: incompatible journal entry version %u.%u", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), - BCH_VERSION_MAJOR(version), - BCH_VERSION_MINOR(version))) { - /* don't try to continue: */ - return -EINVAL; - } - - if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), - c, version, jset, NULL, - jset_unknown_csum, - "%s sector %llu seq %llu: journal entry with unknown csum type %llu", - ca ? 
ca->name : c->name, - sector, le64_to_cpu(jset->seq), - JSET_CSUM_TYPE(jset))) - ret = JOURNAL_ENTRY_BAD; - - /* last_seq is ignored when JSET_NO_FLUSH is true */ - if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && - le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), - c, version, jset, NULL, - jset_last_seq_newer_than_seq, - "invalid journal entry: last_seq > seq (%llu > %llu)", - le64_to_cpu(jset->last_seq), - le64_to_cpu(jset->seq))) { - jset->last_seq = jset->seq; - return JOURNAL_ENTRY_BAD; - } - - ret = jset_validate_entries(c, jset, flags); -fsck_err: - return ret; -} - -static int jset_validate_early(struct bch_fs *c, - struct bch_dev *ca, - struct jset *jset, u64 sector, - unsigned bucket_sectors_left, - unsigned sectors_read) -{ - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(jset->seq), - }; - int ret = 0; - - if (le64_to_cpu(jset->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - - unsigned version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), - c, version, jset, NULL, - jset_unsupported_version, - "%s sector %llu seq %llu: unknown journal entry version %u.%u", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), - BCH_VERSION_MAJOR(version), - BCH_VERSION_MINOR(version))) { - /* don't try to continue: */ - return -EINVAL; - } - - size_t bytes = vstruct_bytes(jset); - if (bytes > (sectors_read << 9) && - sectors_read < bucket_sectors_left) - return JOURNAL_ENTRY_REREAD; - - if (journal_entry_err_on(bytes > bucket_sectors_left << 9, - c, version, jset, NULL, - jset_past_bucket_end, - "%s sector %llu seq %llu: journal entry too big (%zu bytes)", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), bytes)) - le32_add_cpu(&jset->u64s, - -((bytes - (bucket_sectors_left << 9)) / 8)); -fsck_err: - return ret; -} - -struct journal_read_buf { - void *data; - size_t size; -}; - -static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b, - size_t new_size) -{ - void *n; - - /* the bios are sized for this many pages, max: */ - if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); - - new_size = roundup_pow_of_two(new_size); - n = kvmalloc(new_size, GFP_KERNEL); - if (!n) - return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); - - kvfree(b->data); - b->data = n; - b->size = new_size; - return 0; -} - -static int journal_read_bucket(struct bch_dev *ca, - struct journal_read_buf *buf, - struct journal_list *jlist, - unsigned bucket) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - struct jset *j = NULL; - unsigned sectors, sectors_read = 0; - u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), - end = offset + ca->mi.bucket_size; - bool saw_bad = false, csum_good; - int ret = 0; - - pr_debug("reading %u", bucket); - - while (offset < end) { - if (!sectors_read) { - struct bio *bio; - unsigned nr_bvecs; -reread: - sectors_read = min_t(unsigned, - end - offset, buf->size >> 9); - nr_bvecs = buf_pages(buf->data, sectors_read << 9); - - bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!bio) - return bch_err_throw(c, ENOMEM_journal_read_bucket); - bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); - - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, buf->data, sectors_read << 9); - - u64 submit_time = local_clock(); - ret = submit_bio_wait(bio); - kfree(bio); - - if (!ret && bch2_meta_read_fault("journal")) - ret = bch_err_throw(c, 
EIO_fault_injected); - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, - submit_time, !ret); - - if (ret) { - bch_err_dev_ratelimited(ca, - "journal read error: sector %llu", offset); - /* - * We don't error out of the recovery process - * here, since the relevant journal entry may be - * found on a different device, and missing or - * no journal entries will be handled later - */ - return 0; - } - - j = buf->data; - } - - ret = jset_validate_early(c, ca, j, offset, - end - offset, sectors_read); - switch (ret) { - case 0: - sectors = vstruct_sectors(j, c->block_bits); - break; - case JOURNAL_ENTRY_REREAD: - if (vstruct_bytes(j) > buf->size) { - ret = journal_read_buf_realloc(c, buf, - vstruct_bytes(j)); - if (ret) - return ret; - } - goto reread; - case JOURNAL_ENTRY_NONE: - if (!saw_bad) - return 0; - /* - * On checksum error we don't really trust the size - * field of the journal entry we read, so try reading - * again at next block boundary: - */ - sectors = block_sectors(c); - goto next_block; - default: - return ret; - } - - if (le64_to_cpu(j->seq) > ja->highest_seq_found) { - ja->highest_seq_found = le64_to_cpu(j->seq); - ja->cur_idx = bucket; - ja->sectors_free = ca->mi.bucket_size - - bucket_remainder(ca, offset) - sectors; - } - - /* - * This happens sometimes if we don't have discards on - - * when we've partially overwritten a bucket with new - * journal entries. We don't need the rest of the - * bucket: - */ - if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - return 0; - - ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - - struct bch_csum csum; - csum_good = jset_csum_good(c, j, &csum); - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); - - if (!csum_good) { - /* - * Don't print an error here, we'll print the error - * later if we need this journal entry - */ - saw_bad = true; - } - - ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), - j->encrypted_start, - vstruct_end(j) - (void *) j->encrypted_start); - bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); - - mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, (struct journal_ptr) { - .csum_good = csum_good, - .csum = csum, - .dev = ca->dev_idx, - .bucket = bucket, - .bucket_offset = offset - - bucket_to_sector(ca, ja->buckets[bucket]), - .sector = offset, - }, jlist, j); - mutex_unlock(&jlist->lock); - - switch (ret) { - case JOURNAL_ENTRY_ADD_OK: - break; - case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: - break; - default: - return ret; - } -next_block: - pr_debug("next"); - offset += sectors; - sectors_read -= sectors; - j = ((void *) j) + (sectors << 9); - } - - return 0; -} - -static CLOSURE_CALLBACK(bch2_journal_read_device) -{ - closure_type(ja, struct journal_device, read); - struct bch_dev *ca = container_of(ja, struct bch_dev, journal); - struct bch_fs *c = ca->fs; - struct journal_list *jlist = - container_of(cl->parent, struct journal_list, cl); - struct journal_read_buf buf = { NULL, 0 }; - unsigned i; - int ret = 0; - - if (!ja->nr) - goto out; - - ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE); - if (ret) - goto err; - - pr_debug("%u journal buckets", ja->nr); - - for (i = 0; i < ja->nr; i++) { - ret = journal_read_bucket(ca, &buf, jlist, i); - if (ret) - goto err; - } - - /* - * Set dirty_idx to indicate the entire journal is full and needs to be - * reclaimed - journal reclaim will immediately reclaim whatever isn't - * pinned when it first runs: - */ - ja->discard_idx = ja->dirty_idx_ondisk = - ja->dirty_idx = (ja->cur_idx + 1) % 
ja->nr; -out: - bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); - kvfree(buf.data); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read); - closure_return(cl); - return; -err: - mutex_lock(&jlist->lock); - jlist->ret = ret; - mutex_unlock(&jlist->lock); - goto out; -} - -noinline_for_stack -static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j) -{ - struct printbuf buf = PRINTBUF; - enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j); - bool have_good = false; - - prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq)); - bch2_journal_datetime_to_text(&buf, &j->j); - prt_newline(&buf); - - darray_for_each(j->ptrs, ptr) - if (!ptr->csum_good) { - bch2_journal_ptr_to_text(&buf, c, ptr); - prt_char(&buf, ' '); - bch2_csum_to_text(&buf, csum_type, ptr->csum); - prt_newline(&buf); - } else { - have_good = true; - } - - prt_printf(&buf, "should be "); - bch2_csum_to_text(&buf, csum_type, j->j.csum); - - if (have_good) - prt_printf(&buf, "\n(had good copy on another device)"); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) -{ - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct genradix_iter radix_iter; - struct journal_replay *i, **_i, *prev = NULL; - u64 seq = start_seq; - - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - BUG_ON(seq > le64_to_cpu(i->j.seq)); - - while (seq < le64_to_cpu(i->j.seq)) { - while (seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (seq == le64_to_cpu(i->j.seq)) - break; - - u64 missing_start = seq; - - while (seq < le64_to_cpu(i->j.seq) && - !bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - u64 missing_end = seq - 1; - - printbuf_reset(&buf); - prt_printf(&buf, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - missing_start, missing_end, - start_seq, end_seq); - - prt_printf(&buf, "\nprev at "); - if (prev) { - bch2_journal_ptrs_to_text(&buf, c, prev); - prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); - } else - prt_printf(&buf, "(none)"); - - prt_printf(&buf, "\nnext at "); - bch2_journal_ptrs_to_text(&buf, c, i); - prt_printf(&buf, ", continue?"); - - fsck_err(c, journal_entries_missing, "%s", buf.buf); - } - - prev = i; - seq++; - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_journal_read(struct bch_fs *c, - u64 *last_seq, - u64 *blacklist_seq, - u64 *start_seq) -{ - struct journal_list jlist; - struct journal_replay *i, **_i; - struct genradix_iter radix_iter; - struct printbuf buf = PRINTBUF; - bool degraded = false, last_write_torn = false; - u64 seq; - int ret = 0; - - closure_init_stack(&jlist.cl); - mutex_init(&jlist.lock); - jlist.last_seq = 0; - jlist.ret = 0; - - for_each_member_device(c, ca) { - if (!c->opts.fsck && - !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) - continue; - - if ((ca->mi.state == BCH_MEMBER_STATE_rw || - ca->mi.state == BCH_MEMBER_STATE_ro) && - enumerated_ref_tryget(&ca->io_ref[READ], - BCH_DEV_READ_REF_journal_read)) - closure_call(&ca->journal.read, - bch2_journal_read_device, - system_dfl_wq, - &jlist.cl); - else - degraded = true; - } - - while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) - ; - - if (jlist.ret) - return jlist.ret; - - *last_seq = 0; - *start_seq = 0; - *blacklist_seq = 0; - - /* - * Find most recent flush entry, and ignore newer non flush entries - - * those entries will be blacklisted: - */ - genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - if (!*start_seq) - *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; - - if (JSET_NO_FLUSH(&i->j)) { - i->ignore_blacklisted = true; - continue; - } - - if (!last_write_torn && !i->csum_good) { - last_write_torn = true; - i->ignore_blacklisted = true; - continue; - } - - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(i->j.seq), - }; - if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), - c, le32_to_cpu(i->j.version), &i->j, NULL, - jset_last_seq_newer_than_seq, - "invalid journal entry: last_seq > seq (%llu > %llu)", - le64_to_cpu(i->j.last_seq), - le64_to_cpu(i->j.seq))) - i->j.last_seq = i->j.seq; - - *last_seq = le64_to_cpu(i->j.last_seq); - *blacklist_seq = le64_to_cpu(i->j.seq) + 1; - break; - } - - if (!*start_seq) { - bch_info(c, "journal read done, but no entries found"); - return 0; - } - - if (!*last_seq) { - fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, - "journal read done, but no entries found after dropping non-flushes"); - return 0; - } - - printbuf_reset(&buf); - prt_printf(&buf, "journal read done, replaying entries %llu-%llu", - *last_seq, *blacklist_seq - 1); - - /* - * Drop blacklisted entries and entries older than last_seq (or start of - * journal rewind: - */ - u64 drop_before = *last_seq; - if (c->opts.journal_rewind) { - drop_before = min(drop_before, c->opts.journal_rewind); - prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind); - } - - *last_seq = drop_before; - if (*start_seq != *blacklist_seq) - prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); - bch_info(c, "%s", buf.buf); - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if 
(journal_replay_ignore(i)) - continue; - - seq = le64_to_cpu(i->j.seq); - if (seq < drop_before) { - journal_replay_free(c, i, false); - continue; - } - - if (bch2_journal_seq_is_blacklisted(c, seq, true)) { - fsck_err_on(!JSET_NO_FLUSH(&i->j), c, - jset_seq_blacklisted, - "found blacklisted journal entry %llu", seq); - i->ignore_blacklisted = true; - } - } - - ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1); - if (ret) - goto err; - - genradix_for_each(&c->journal_entries, radix_iter, _i) { - union bch_replicas_padded replicas = { - .e.data_type = BCH_DATA_journal, - .e.nr_devs = 0, - .e.nr_required = 1, - }; - - i = *_i; - if (journal_replay_ignore(i)) - continue; - - /* - * Don't print checksum errors until we know we're going to use - * a given journal entry: - */ - darray_for_each(i->ptrs, ptr) - if (!ptr->csum_good) { - bch2_journal_print_checksum_error(c, i); - break; - } - - ret = jset_validate(c, - bch2_dev_have_ref(c, i->ptrs.data[0].dev), - &i->j, - i->ptrs.data[0].sector, - READ); - if (ret) - goto err; - - darray_for_each(i->ptrs, ptr) - replicas_entry_add_dev(&replicas.e, ptr->dev); - - bch2_replicas_entry_sort(&replicas.e); - - printbuf_reset(&buf); - bch2_replicas_entry_to_text(&buf, &replicas.e); - - if (!degraded && - !bch2_replicas_marked(c, &replicas.e) && - (le64_to_cpu(i->j.seq) == *last_seq || - fsck_err(c, journal_entry_replicas_not_marked, - "superblock not marked as containing replicas for journal entry %llu\n%s", - le64_to_cpu(i->j.seq), buf.buf))) { - ret = bch2_mark_replicas(c, &replicas.e); - if (ret) - goto err; - } - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* journal write: */ - -static void journal_advance_devs_to_next_bucket(struct journal *j, - struct dev_alloc_list *devs, - unsigned sectors, __le64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - guard(rcu)(); - darray_for_each(*devs, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); - if (!ca) - continue; - - struct journal_device *ja = &ca->journal; - - if (sectors > ja->sectors_free && - sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja, - journal_space_discarded)) { - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; - - /* - * ja->bucket_seq[ja->cur_idx] must always have - * something sensible: - */ - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); - } - } -} - -static void __journal_write_alloc(struct journal *j, - struct journal_buf *w, - struct dev_alloc_list *devs, - unsigned sectors, - unsigned *replicas, - unsigned replicas_want) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - darray_for_each(*devs, i) { - struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, - BCH_DEV_WRITE_REF_journal_write); - if (!ca) - continue; - - struct journal_device *ja = &ca->journal; - - /* - * Check that we can use this device, and aren't already using - * it: - */ - if (!ca->mi.durability || - ca->mi.state != BCH_MEMBER_STATE_rw || - !ja->nr || - bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || - sectors > ja->sectors_free) { - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); - continue; - } - - bch2_dev_stripe_increment(ca, &j->wp.stripe); - - bch2_bkey_append_ptr(&w->key, - (struct bch_extent_ptr) { - .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]) + - ca->mi.bucket_size - - ja->sectors_free, - .dev = ca->dev_idx, - }); - - ja->sectors_free -= sectors; - ja->bucket_seq[ja->cur_idx] = 
le64_to_cpu(w->data->seq); - - *replicas += ca->mi.durability; - - if (*replicas >= replicas_want) - break; - } -} - -static int journal_write_alloc(struct journal *j, struct journal_buf *w, - unsigned *replicas) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_devs_mask devs; - struct dev_alloc_list devs_sorted; - unsigned sectors = vstruct_sectors(w->data, c->block_bits); - unsigned target = c->opts.metadata_target ?: - c->opts.foreground_target; - unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); - unsigned replicas_need = min_t(unsigned, replicas_want, - READ_ONCE(c->opts.metadata_replicas_required)); - bool advance_done = false; - -retry_target: - devs = target_rw_devs(c, BCH_DATA_journal, target); - bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted); -retry_alloc: - __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); - - if (likely(*replicas >= replicas_want)) - goto done; - - if (!advance_done) { - journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); - advance_done = true; - goto retry_alloc; - } - - if (*replicas < replicas_want && target) { - /* Retry from all devices: */ - target = 0; - advance_done = false; - goto retry_target; - } -done: - BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - -#if 0 - /* - * XXX: we need a way to alert the user when we go degraded for any - * reason - */ - if (*replicas < min(replicas_want, - dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) { - } -#endif - - return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; -} - -static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - /* we aren't holding j->lock: */ - unsigned new_size = READ_ONCE(j->buf_size_want); - void *new_buf; - - if (buf->buf_size >= new_size) - return; - - size_t btree_write_buffer_size = new_size / 64; - - if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) - return; - - new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); - if (!new_buf) - return; - - memcpy(new_buf, buf->data, buf->buf_size); - - spin_lock(&j->lock); - swap(buf->data, new_buf); - swap(buf->buf_size, new_size); - spin_unlock(&j->lock); - - kvfree(new_buf); -} - -static CLOSURE_CALLBACK(journal_write_done) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union bch_replicas_padded replicas; - u64 seq = le64_to_cpu(w->data->seq); - int err = 0; - - bch2_time_stats_update(!JSET_NO_FLUSH(w->data) - ? 
j->flush_write_time - : j->noflush_write_time, j->write_start_time); - - if (!w->devs_written.nr) { - err = bch_err_throw(c, journal_write_err); - } else { - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - w->devs_written); - err = bch2_mark_replicas(c, &replicas.e); - } - - if (err && !bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - if (err == -BCH_ERR_journal_write_err) - prt_printf(&buf, "unable to write journal to sufficient devices\n"); - else - prt_printf(&buf, "journal write error marking replicas: %s\n", - bch2_err_str(err)); - - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - - closure_debug_destroy(cl); - - spin_lock(&j->lock); - if (seq >= j->pin.front) - journal_seq_pin(j, seq)->devs = w->devs_written; - if (err && (!j->err_seq || seq < j->err_seq)) - j->err_seq = seq; - w->write_done = true; - - if (!j->free_buf || j->free_buf_size < w->buf_size) { - swap(j->free_buf, w->data); - swap(j->free_buf_size, w->buf_size); - } - - if (w->data) { - void *buf = w->data; - w->data = NULL; - w->buf_size = 0; - - spin_unlock(&j->lock); - kvfree(buf); - spin_lock(&j->lock); - } - - bool completed = false; - bool do_discards = false; - - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) { - w = j->buf + (seq & JOURNAL_BUF_MASK); - if (!w->write_done) - break; - - if (!j->err_seq && !w->noflush) { - j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = w->last_seq; - - closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); - do_discards = true; - } - - j->seq_ondisk = seq; - - /* - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard - * more buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); - - closure_wake_up(&w->wait); - completed = true; - } - - if (completed) { - bch2_journal_reclaim_fast(j); - bch2_journal_space_available(j); - - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); - - journal_wake(j); - } - - if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && - j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { - struct journal_buf *buf = journal_cur_buf(j); - long delta = buf->expires - jiffies; - - /* - * We don't close a journal entry to write it while there are - * previous entries still in flight - the current journal entry - * might want to be written now: - */ - mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); - } - - /* - * We don't typically trigger journal writes from here - the next journal - * write will be triggered immediately after the previous one is - * allocated, in bch2_journal_write() - but the journal write error path - * is special: - */ - bch2_journal_do_writes(j); - spin_unlock(&j->lock); - - if (do_discards) - bch2_do_discards(c); -} - -static void journal_write_endio(struct bio *bio) -{ - struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); - struct bch_dev *ca = jbio->ca; - struct journal *j = &ca->fs->journal; - struct journal_buf *w = j->buf + jbio->buf_idx; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, - jbio->submit_time, !bio->bi_status); - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, - "error writing journal entry %llu: %s", - le64_to_cpu(w->data->seq), - bch2_blk_status_to_str(bio->bi_status)); - - unsigned long flags; - spin_lock_irqsave(&j->err_lock, flags); -
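Note how journal_write_done() above only advances seq_ondisk across a contiguous run of completed buffers: a write that finishes out of order is not acknowledged until all of its predecessors have also finished. A minimal userspace sketch of that pattern, with made-up names (RING_SIZE, struct ring) standing in for the journal buffer ring:

#include <stdbool.h>
#include <stdint.h>

#define RING_SIZE 4	/* power of two, like the journal buffer ring */

struct ring {
	bool done[RING_SIZE];
	uint64_t seq_ondisk;	/* sequences below this are known complete */
	uint64_t seq_end;	/* one past the newest submitted sequence */
};

/* I/O completion: mark @seq done, then advance over contiguous completions. */
static void write_done(struct ring *r, uint64_t seq)
{
	r->done[seq & (RING_SIZE - 1)] = true;

	while (r->seq_ondisk < r->seq_end &&
	       r->done[r->seq_ondisk & (RING_SIZE - 1)]) {
		r->done[r->seq_ondisk & (RING_SIZE - 1)] = false; /* slot reusable */
		r->seq_ondisk++;
	}
}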
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); - spin_unlock_irqrestore(&j->err_lock, flags); - } - - closure_put(&w->io); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); -} - -static CLOSURE_CALLBACK(journal_write_submit) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned sectors = vstruct_sectors(w->data, c->block_bits); - - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], - sectors); - - struct journal_device *ja = &ca->journal; - struct journal_bio *jbio = ja->bio[w->idx]; - struct bio *bio = &jbio->bio; - - jbio->submit_time = local_clock(); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); - - BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); - ca->prev_journal_sector = bio->bi_iter.bi_sector; - - if (!JSET_NO_FLUSH(w->data)) - bio->bi_opf |= REQ_FUA; - if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) - bio->bi_opf |= REQ_PREFLUSH; - - bch2_bio_map(bio, w->data, sectors << 9); - - trace_and_count(c, journal_write, bio); - closure_bio_submit(bio, cl); - - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - } - - continue_at(cl, journal_write_done, j->wq); -} - -static CLOSURE_CALLBACK(journal_write_preflush) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - /* - * Wait for previous journal writes to complete; they won't necessarily - * be flushed if they're still in flight - */ - if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { - spin_lock(&j->lock); - if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { - closure_wait(&j->async_wait, cl); - spin_unlock(&j->lock); - continue_at(cl, journal_write_preflush, j->wq); - return; - } - spin_unlock(&j->lock); - } - - if (w->separate_flush) { - for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) { - enumerated_ref_get(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_journal_write); - - struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->bio[w->idx]->bio; - bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - - continue_at(cl, journal_write_submit, j->wq); - } else { - /* - * no need to punt to another work item if we're not waiting on - * preflushes - */ - journal_write_submit(&cl->work); - } -} - -static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *start, *end; - struct jset *jset = w->data; - struct journal_keys_to_wb wb = { NULL }; - unsigned u64s; - unsigned long btree_roots_have = 0; - u64 seq = le64_to_cpu(jset->seq); - int ret; - - /* - * Simple compaction, dropping empty jset_entries (from journal - * reservations that weren't fully used) and merging jset_entries that - * can be. 
- * - * If we wanted to be really fancy here, we could sort all the keys in - * the jset and drop keys that were overwritten - probably not worth it: - */ - vstruct_for_each(jset, i) { - unsigned u64s = le16_to_cpu(i->u64s); - - /* Empty entry: */ - if (!u64s) - continue; - - /* - * New btree roots are set by journalling them; when the journal - * entry gets written we have to propagate them to - * c->btree_roots - * - * But, every journal entry we write has to contain all the - * btree roots (at least for now); so after we copy btree roots - * to c->btree_roots we have to get any missing btree roots and - * add them to this journal entry: - */ - switch (i->type) { - case BCH_JSET_ENTRY_btree_root: - bch2_journal_entry_to_btree_root(c, i); - __set_bit(i->btree_id, &btree_roots_have); - break; - case BCH_JSET_ENTRY_write_buffer_keys: - EBUG_ON(!w->need_flush_to_write_buffer); - - if (!wb.wb) - bch2_journal_keys_to_write_buffer_start(c, &wb, seq); - - jset_entry_for_each_key(i, k) { - ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); - if (ret) { - bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", - bch2_err_str(ret)); - bch2_journal_keys_to_write_buffer_end(c, &wb); - return ret; - } - } - i->type = BCH_JSET_ENTRY_btree_keys; - break; - } - } - - if (wb.wb) { - ret = bch2_journal_keys_to_write_buffer_end(c, &wb); - if (ret) { - bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", - bch2_err_str(ret)); - return ret; - } - } - - spin_lock(&c->journal.lock); - w->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); - - start = end = vstruct_last(jset); - - end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); - - struct jset_entry_datetime *d = - container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); - d->entry.type = BCH_JSET_ENTRY_datetime; - d->seconds = cpu_to_le64(ktime_get_real_seconds()); - - bch2_journal_super_entries_add_common(c, &end, seq); - u64s = (u64 *) end - (u64 *) start; - - WARN_ON(u64s > j->entry_u64s_reserved); - - le32_add_cpu(&jset->u64s, u64s); - - unsigned sectors = vstruct_sectors(jset, c->block_bits); - - if (sectors > w->sectors) { - bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", - vstruct_bytes(jset), w->sectors << 9, - u64s, w->u64s_reserved, j->entry_u64s_reserved); - return -EINVAL; - } - - return 0; -} - -static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset *jset = w->data; - u64 seq = le64_to_cpu(jset->seq); - bool validate_before_checksum = false; - int ret = 0; - - jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = cpu_to_le32(c->sb.version); - - SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); - SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); - - if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) - j->last_empty_seq = seq; - - if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) - validate_before_checksum = true; - - if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) - validate_before_checksum = true; - - if (validate_before_checksum && - (ret = jset_validate(c, NULL, jset, 0, WRITE))) - return ret; - - ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); - if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret))) - return ret; - - 
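The compaction loop in bch2_journal_write_prep() above walks the jset as a sequence of variable-length entries, each carrying its own length in u64s; nothing is indexed, iteration just steps from one entry to the next. A rough standalone model of that layout (struct entry and its fields are illustrative, not the real jset_entry):

#include <stdint.h>
#include <stdio.h>

/* Variable-length record: u64s counts the payload words that follow. */
struct entry {
	uint16_t u64s;
	uint16_t type;
	uint32_t pad;
	uint64_t data[];
};

static struct entry *entry_next(struct entry *e)
{
	/* step over the header and u64s payload words */
	return (struct entry *)(e->data + e->u64s);
}

static void walk(struct entry *first, struct entry *end)
{
	for (struct entry *e = first; e < end; e = entry_next(e)) {
		if (!e->u64s)
			continue;	/* empty entry from an unused reservation */
		printf("entry type %u, %u u64s\n", e->type, e->u64s);
	}
}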
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), - journal_nonce(jset), jset); - - if (!validate_before_checksum && - (ret = jset_validate(c, NULL, jset, 0, WRITE))) - return ret; - - unsigned sectors = vstruct_sectors(jset, c->block_bits); - unsigned bytes = vstruct_bytes(jset); - memset((void *) jset + bytes, 0, (sectors << 9) - bytes); - return 0; -} - -static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - int error = bch2_journal_error(j); - - /* - * If the journal is in an error state - we did an emergency shutdown - - * we prefer to continue doing journal writes. We just mark them as - * noflush so they'll never be used, but they'll still be visible by the - * list_journal tool - this helps in debugging. - * - * There's a caveat: the first journal write after marking the - * superblock dirty must always be a flush write, because on startup - * from a clean shutdown we didn't necessarily read the journal and the - * new journal write might overwrite whatever was in the journal - * previously - we can't leave the journal without any flush writes in - * it. - * - * So if we're in an error state, and we're still starting up, we don't - * write anything at all. - */ - if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) - return error; - - if (error || - w->noflush || - (!w->must_flush && - time_before(jiffies, j->last_flush_write + - msecs_to_jiffies(c->opts.journal_flush_delay)) && - test_bit(JOURNAL_may_skip_flush, &j->flags))) { - w->noflush = true; - SET_JSET_NO_FLUSH(w->data, true); - w->data->last_seq = 0; - w->last_seq = 0; - - j->nr_noflush_writes++; - } else { - w->must_flush = true; - j->last_flush_write = jiffies; - j->nr_flush_writes++; - clear_bit(JOURNAL_need_flush_write, &j->flags); - } - - return 0; -} - -CLOSURE_CALLBACK(bch2_journal_write) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union bch_replicas_padded replicas; - unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); - int ret; - - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - BUG_ON(!w->write_started); - BUG_ON(w->write_allocated); - BUG_ON(w->write_done); - - j->write_start_time = local_clock(); - - spin_lock(&j->lock); - if (nr_rw_members > 1) - w->separate_flush = true; - - ret = bch2_journal_write_pick_flush(j, w); - spin_unlock(&j->lock); - - if (unlikely(ret)) - goto err; - - mutex_lock(&j->buf_lock); - journal_buf_realloc(j, w); - - ret = bch2_journal_write_prep(j, w); - mutex_unlock(&j->buf_lock); - - if (unlikely(ret)) - goto err; - - unsigned replicas_allocated = 0; - while (1) { - ret = journal_write_alloc(j, w, &replicas_allocated); - if (!ret || !j->can_discard) - break; - - bch2_journal_do_discards(j); - } - - if (unlikely(ret)) - goto err_allocate_write; - - ret = bch2_journal_write_checksum(j, w); - if (unlikely(ret)) - goto err; - - spin_lock(&j->lock); - /* - * write is allocated, no longer need to account for it in - * bch2_journal_space_available(): - */ - w->sectors = 0; - w->write_allocated = true; - j->entry_bytes_written += vstruct_bytes(w->data); - - /* - * journal entry has been compacted and allocated, recalculate space - * available: - */ - bch2_journal_space_available(j); - bch2_journal_do_writes(j); - spin_unlock(&j->lock); - - w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - - /* - * Mark journal replicas before we submit the write 
to guarantee - * recovery will find the journal entries after a crash. - */ - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - w->devs_written); - ret = bch2_mark_replicas(c, &replicas.e); - if (ret) - goto err; - - if (c->opts.nochanges) - goto no_io; - - if (!JSET_NO_FLUSH(w->data)) - continue_at(cl, journal_write_preflush, j->wq); - else - continue_at(cl, journal_write_submit, j->wq); - return; -err_allocate_write: - if (!bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - - bch2_journal_debug_to_text(&buf, j); - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), - le64_to_cpu(w->data->seq), - vstruct_sectors(w->data, c->block_bits), - bch2_err_str(ret)); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } -err: - bch2_fatal_error(c); -no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); - } - - continue_at(cl, journal_write_done, j->wq); -} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h deleted file mode 100644 index 6fa82c4050fea1..00000000000000 --- a/fs/bcachefs/journal_io.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_IO_H -#define _BCACHEFS_JOURNAL_IO_H - -#include "darray.h" - -void bch2_journal_pos_from_member_info_set(struct bch_fs *); -void bch2_journal_pos_from_member_info_resume(struct bch_fs *); - -struct journal_ptr { - bool csum_good; - struct bch_csum csum; - u8 dev; - u32 bucket; - u32 bucket_offset; - u64 sector; -}; - -/* - * Only used for holding the journal entries we read in btree_journal_read() - * during cache_registration - */ -struct journal_replay { - DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; - - bool csum_good; - bool ignore_blacklisted; - bool ignore_not_dirty; - /* must be last: */ - struct jset j; -}; - -static inline bool journal_replay_ignore(struct journal_replay *i) -{ - return !i || i->ignore_blacklisted || i->ignore_not_dirty; -} - -static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, - struct jset_entry *entry, unsigned type) -{ - while (entry < vstruct_last(jset)) { - if (entry->type == type) - return entry; - - entry = vstruct_next(entry); - } - - return NULL; -} - -#define for_each_jset_entry_type(entry, jset, type) \ - for (struct jset_entry *entry = (jset)->start; \ - (entry = __jset_entry_type_next(jset, entry, type)); \ - entry = vstruct_next(entry)) - -#define jset_entry_for_each_key(_e, _k) \ - for (struct bkey_i *_k = (_e)->start; \ - _k < vstruct_last(_e); \ - _k = bkey_next(_k)) - -#define for_each_jset_key(k, entry, jset) \ - for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\ - jset_entry_for_each_key(entry, k) - -int bch2_journal_entry_validate(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, - struct bkey_validate_context); -void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, - struct jset_entry *); - -void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, - struct journal_replay *); - -int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); - -CLOSURE_CALLBACK(bch2_journal_write); - -static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of 
data, ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - -#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c deleted file mode 100644 index 0042d43b8e57f8..00000000000000 --- a/fs/bcachefs/journal_reclaim.c +++ /dev/null @@ -1,1037 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "replicas.h" -#include "sb-members.h" -#include "trace.h" - -#include -#include - -static bool __should_discard_bucket(struct journal *, struct journal_device *); - -/* Free space calculations: */ - -static unsigned journal_space_from(struct journal_device *ja, - enum journal_space_from from) -{ - switch (from) { - case journal_space_discarded: - return ja->discard_idx; - case journal_space_clean_ondisk: - return ja->dirty_idx_ondisk; - case journal_space_clean: - return ja->dirty_idx; - default: - BUG(); - } -} - -unsigned bch2_journal_dev_buckets_available(struct journal *j, - struct journal_device *ja, - enum journal_space_from from) -{ - if (!ja->nr) - return 0; - - unsigned available = (journal_space_from(ja, from) - - ja->cur_idx - 1 + ja->nr) % ja->nr; - - /* - * Don't use the last bucket unless writing the new last_seq - * will make another bucket available: - */ - if (available && ja->dirty_idx_ondisk == ja->dirty_idx) - --available; - - return available; -} - -void bch2_journal_set_watermark(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool low_on_space = j->space[journal_space_clean].total * 4 <= - j->space[journal_space_total].total; - bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; - bool low_on_wb = bch2_btree_write_buffer_must_wait(c); - unsigned watermark = low_on_space || low_on_pin || low_on_wb - ? 
BCH_WATERMARK_reclaim - : BCH_WATERMARK_stripe; - - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) || - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) || - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) - trace_and_count(c, journal_full, c); - - mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin); - - swap(watermark, j->watermark); - if (watermark > j->watermark) - journal_wake(j); -} - -static struct journal_space -journal_dev_space_available(struct journal *j, struct bch_dev *ca, - enum journal_space_from from) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_device *ja = &ca->journal; - unsigned sectors, buckets, unwritten; - unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c)); - u64 seq; - - if (from == journal_space_total) - return (struct journal_space) { - .next_entry = bucket_size_aligned, - .total = bucket_size_aligned * ja->nr, - }; - - buckets = bch2_journal_dev_buckets_available(j, ja, from); - sectors = round_down(ja->sectors_free, block_sectors(c)); - - /* - * Note that we don't allocate the space for a journal entry - * until we write it out - thus, account for it here: - */ - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) { - unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; - - if (!unwritten) - continue; - - /* entry won't fit on this device, skip: */ - if (unwritten > bucket_size_aligned) - continue; - - if (unwritten >= sectors) { - if (!buckets) { - sectors = 0; - break; - } - - buckets--; - sectors = bucket_size_aligned; - } - - sectors -= unwritten; - } - - if (sectors < ca->mi.bucket_size && buckets) { - buckets--; - sectors = bucket_size_aligned; - } - - return (struct journal_space) { - .next_entry = sectors, - .total = sectors + buckets * bucket_size_aligned, - }; -} - -static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, - enum journal_space_from from) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned pos, nr_devs = 0; - struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; - unsigned min_bucket_size = U32_MAX; - - BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->journal.nr || - !ca->mi.durability) - continue; - - min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); - - space = journal_dev_space_available(j, ca, from); - if (!space.next_entry) - continue; - - for (pos = 0; pos < nr_devs; pos++) - if (space.total > dev_space[pos].total) - break; - - array_insert_item(dev_space, nr_devs, pos, space); - } - - if (nr_devs < nr_devs_want) - return (struct journal_space) { 0, 0 }; - - /* - * It's possible for bucket size to be misaligned w.r.t. 
the filesystem - * block size: - */ - min_bucket_size = round_down(min_bucket_size, block_sectors(c)); - - /* - * We sorted largest to smallest, and we want the smallest out of the - * @nr_devs_want largest devices: - */ - space = dev_space[nr_devs_want - 1]; - space.next_entry = min(space.next_entry, min_bucket_size); - return space; -} - -void bch2_journal_space_available(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned clean, clean_ondisk, total; - unsigned max_entry_size = min(j->buf[0].buf_size >> 9, - j->buf[1].buf_size >> 9); - unsigned nr_online = 0, nr_devs_want; - bool can_discard = false; - int ret = 0; - - lockdep_assert_held(&j->lock); - guard(rcu)(); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - struct journal_device *ja = &ca->journal; - - if (!ja->nr) - continue; - - while (ja->dirty_idx != ja->cur_idx && - ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) - ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; - - while (ja->dirty_idx_ondisk != ja->dirty_idx && - ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - - can_discard |= __should_discard_bucket(j, ja); - - max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); - nr_online++; - } - - j->can_discard = can_discard; - - if (nr_online < metadata_replicas_required(c)) { - if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" - "rw journal devs:", nr_online, metadata_replicas_required(c)); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) - prt_printf(&buf, " %s", ca->name); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - } - ret = bch_err_throw(c, insufficient_journal_devices); - goto out; - } - - nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); - - for (unsigned i = 0; i < journal_space_nr; i++) - j->space[i] = __journal_space_available(j, nr_devs_want, i); - - clean_ondisk = j->space[journal_space_clean_ondisk].total; - clean = j->space[journal_space_clean].total; - total = j->space[journal_space_total].total; - - if (!j->space[journal_space_discarded].next_entry) - ret = bch_err_throw(c, journal_full); - - if ((j->space[journal_space_clean_ondisk].next_entry < - j->space[journal_space_clean_ondisk].total) && - (clean - clean_ondisk <= total / 8) && - (clean_ondisk * 2 > clean)) - set_bit(JOURNAL_may_skip_flush, &j->flags); - else - clear_bit(JOURNAL_may_skip_flush, &j->flags); - - bch2_journal_set_watermark(j); -out: - j->cur_entry_sectors = !ret - ? 
j->space[journal_space_discarded].next_entry - : 0; - j->cur_entry_error = ret; - - if (!ret) - journal_wake(j); -} - -/* Discards - last part of journal reclaim: */ - -static bool __should_discard_bucket(struct journal *j, struct journal_device *ja) -{ - unsigned min_free = max(4, ja->nr / 8); - - return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < - min_free && - ja->discard_idx != ja->dirty_idx_ondisk; -} - -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -{ - spin_lock(&j->lock); - bool ret = __should_discard_bucket(j, ja); - spin_unlock(&j->lock); - - return ret; -} - -/* - * Advance ja->discard_idx as long as it points to buckets that are no longer - * dirty, issuing discards if necessary: - */ -void bch2_journal_do_discards(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - mutex_lock(&j->discard_lock); - - for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) { - struct journal_device *ja = &ca->journal; - - while (should_discard_bucket(j, ja)) { - if (!c->opts.nochanges && - bch2_discard_opt_enabled(c, ca) && - bdev_max_discard_sectors(ca->disk_sb.bdev)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, - ja->buckets[ja->discard_idx]), - ca->mi.bucket_size, GFP_NOFS); - - spin_lock(&j->lock); - ja->discard_idx = (ja->discard_idx + 1) % ja->nr; - - bch2_journal_space_available(j); - spin_unlock(&j->lock); - } - } - - mutex_unlock(&j->discard_lock); -} - -/* - * Journal entry pinning - machinery for holding a reference on a given journal - * entry, holding it open to ensure it gets replayed during recovery: - */ - -void bch2_journal_reclaim_fast(struct journal *j) -{ - bool popped = false; - - lockdep_assert_held(&j->lock); - - /* - * Unpin journal entries whose reference counts reached zero, meaning - * all btree nodes got written out - */ - while (!fifo_empty(&j->pin) && - j->pin.front <= j->seq_ondisk && - !atomic_read(&fifo_peek_front(&j->pin).count)) { - j->pin.front++; - popped = true; - } - - if (popped) { - bch2_journal_space_available(j); - __closure_wake_up(&j->reclaim_flush_wait); - } -} - -bool __bch2_journal_pin_put(struct journal *j, u64 seq) -{ - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - return atomic_dec_and_test(&pin_list->count); -} - -void bch2_journal_pin_put(struct journal *j, u64 seq) -{ - if (__bch2_journal_pin_put(j, seq)) { - spin_lock(&j->lock); - bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); - } -} - -static inline bool __journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - struct journal_entry_pin_list *pin_list; - - if (!journal_pin_active(pin)) - return false; - - if (j->flush_in_progress == pin) - j->flush_in_progress_dropped = true; - - pin_list = journal_seq_pin(j, pin->seq); - pin->seq = 0; - list_del_init(&pin->list); - - if (j->reclaim_flush_wait.list.first) - __closure_wake_up(&j->reclaim_flush_wait); - - /* - * Unpinning a journal entry may make journal_next_bucket() succeed, if - * writing a new last_seq will now make another bucket available: - */ - return atomic_dec_and_test(&pin_list->count) && - pin_list == &fifo_peek_front(&j->pin); -} - -void bch2_journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - spin_lock(&j->lock); - if (__journal_pin_drop(j, pin)) - bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); -} - -static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, - journal_pin_flush_fn fn) -{ - if (fn == 
bch2_btree_node_flush0 || - fn == bch2_btree_node_flush1) { - unsigned idx = fn == bch2_btree_node_flush1; - struct btree *b = container_of(pin, struct btree, writes[idx].journal); - - return JOURNAL_PIN_TYPE_btree0 - b->c.level; - } else if (fn == bch2_btree_key_cache_journal_flush) - return JOURNAL_PIN_TYPE_key_cache; - else - return JOURNAL_PIN_TYPE_other; -} - -static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn, - enum journal_pin_type type) -{ - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - /* - * flush_fn is how we identify journal pins in debugfs, so must always - * exist, even if it doesn't do anything: - */ - BUG_ON(!flush_fn); - - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; - - if (list_empty(&pin_list->unflushed[type]) && - j->reclaim_flush_wait.list.first) - __closure_wake_up(&j->reclaim_flush_wait); - - list_add(&pin->list, &pin_list->unflushed[type]); -} - -void bch2_journal_pin_copy(struct journal *j, - struct journal_entry_pin *dst, - struct journal_entry_pin *src, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - u64 seq = READ_ONCE(src->seq); - - if (seq < journal_last_seq(j)) { - /* - * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on - * the src pin - with the pin dropped, the entry to pin might no - * longer exist, but that means there's no longer anything to - * copy and we can bail out here: - */ - spin_unlock(&j->lock); - return; - } - - bool reclaim = __journal_pin_drop(j, dst); - - bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); - - if (reclaim) - bch2_journal_reclaim_fast(j); - - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - if (seq == journal_last_seq(j)) - journal_wake(j); - spin_unlock(&j->lock); -} - -void bch2_journal_pin_set(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - BUG_ON(seq < journal_last_seq(j)); - - bool reclaim = __journal_pin_drop(j, pin); - - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); - - if (reclaim) - bch2_journal_reclaim_fast(j); - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - if (seq == journal_last_seq(j)) - journal_wake(j); - - spin_unlock(&j->lock); -} - -/** - * bch2_journal_pin_flush: ensure journal pin callback is no longer running - * @j: journal object - * @pin: pin to flush - */ -void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) -{ - BUG_ON(journal_pin_active(pin)); - - wait_event(j->pin_flush_wait, j->flush_in_progress != pin); -} - -/* - * Journal reclaim: flush references to open journal entries to reclaim space in - * the journal - * - * May be done by the journal code in the background as needed to free up space - * for more journal entries, or as part of doing a clean shutdown, or to migrate - * data off of a specific device: - */ - -static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, - u64 seq_to_flush, - unsigned allowed_below_seq, - unsigned allowed_above_seq, - u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret = NULL; - - fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { - if (*seq > seq_to_flush && !allowed_above_seq) - break; - - for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) - if (((BIT(i) & 
allowed_below_seq) && *seq <= seq_to_flush) || - (BIT(i) & allowed_above_seq)) { - ret = list_first_entry_or_null(&pin_list->unflushed[i], - struct journal_entry_pin, list); - if (ret) - return ret; - } - } - - return NULL; -} - -/* returns true if we did work */ -static size_t journal_flush_pins(struct journal *j, - u64 seq_to_flush, - unsigned allowed_below_seq, - unsigned allowed_above_seq, - unsigned min_any, - unsigned min_key_cache) -{ - struct journal_entry_pin *pin; - size_t nr_flushed = 0; - journal_pin_flush_fn flush_fn; - u64 seq; - int err; - - lockdep_assert_held(&j->reclaim_lock); - - while (1) { - unsigned allowed_above = allowed_above_seq; - unsigned allowed_below = allowed_below_seq; - - if (min_any) { - allowed_above |= ~0; - allowed_below |= ~0; - } - - if (min_key_cache) { - allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); - allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); - } - - cond_resched(); - - j->last_flushed = jiffies; - - spin_lock(&j->lock); - pin = journal_get_next_pin(j, seq_to_flush, - allowed_below, - allowed_above, &seq); - if (pin) { - BUG_ON(j->flush_in_progress); - j->flush_in_progress = pin; - j->flush_in_progress_dropped = false; - flush_fn = pin->flush; - } - spin_unlock(&j->lock); - - if (!pin) - break; - - if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) - min_key_cache--; - - if (min_any) - min_any--; - - err = flush_fn(j, pin, seq); - - spin_lock(&j->lock); - /* Pin might have been dropped or rearmed: */ - if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); - j->flush_in_progress = NULL; - j->flush_in_progress_dropped = false; - spin_unlock(&j->lock); - - wake_up(&j->pin_flush_wait); - - if (err) - break; - - nr_flushed++; - } - - return nr_flushed; -} - -static u64 journal_seq_to_flush(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - u64 seq_to_flush = 0; - - guard(spinlock)(&j->lock); - guard(rcu)(); - - for_each_rw_member_rcu(c, ca) { - struct journal_device *ja = &ca->journal; - unsigned nr_buckets, bucket_to_flush; - - if (!ja->nr) - continue; - - /* Try to keep the journal at most half full: */ - nr_buckets = ja->nr / 2; - - bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; - seq_to_flush = max(seq_to_flush, - ja->bucket_seq[bucket_to_flush]); - } - - /* Also flush if the pin fifo is more than half full */ - return max_t(s64, seq_to_flush, - (s64) journal_cur_seq(j) - - (j->pin.size >> 1)); -} - -/** - * __bch2_journal_reclaim - free up journal buckets - * @j: journal object - * @direct: direct or background reclaim? - * @kicked: requested to run since we last ran? - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. - * - * High watermarks for triggering background reclaim: - * - FIFO has fewer than 512 entries left - * - fewer than 25% journal buckets free - * - * Background reclaim runs until low watermarks are reached: - * - FIFO has more than 1024 entries left - * - more than 50% journal buckets free - * - * As long as a reclaim can complete in the time it takes to fill up - * 512 journal entries or 25% of all journal buckets, then - * journal_next_bucket() should not stall. 
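The high/low watermark split described in this comment is a classic hysteresis loop: reclaim starts once a tight threshold is crossed and keeps running until a more generous one is met again, which keeps it from thrashing around a single cutoff. A simplified sketch using the 25%/50% bucket figures from the comment above (the struct and helper names are made up for illustration):

#include <stdbool.h>

struct space {
	unsigned free_buckets, total_buckets;	/* total_buckets must be nonzero */
	bool reclaiming;
};

/* Start reclaim below 25% free; stop only once back above 50% free. */
static bool should_reclaim(struct space *s)
{
	unsigned pct = s->free_buckets * 100 / s->total_buckets;

	if (!s->reclaiming && pct < 25)
		s->reclaiming = true;
	else if (s->reclaiming && pct > 50)
		s->reclaiming = false;

	return s->reclaiming;
}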
- */ -static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_cache *bc = &c->btree_cache; - bool kthread = (current->flags & PF_KTHREAD) != 0; - u64 seq_to_flush; - size_t min_nr, min_key_cache, nr_flushed; - unsigned flags; - int ret = 0; - - /* - * We can't invoke memory reclaim while holding the reclaim_lock - - * journal reclaim is required to make progress for memory reclaim - * (cleaning the caches), so we can't get stuck in memory reclaim while - * we're holding the reclaim lock: - */ - lockdep_assert_held(&j->reclaim_lock); - flags = memalloc_noreclaim_save(); - - do { - if (kthread && kthread_should_stop()) - break; - - ret = bch2_journal_error(j); - if (ret) - break; - - /* XXX shove journal discards off to another thread */ - bch2_journal_do_discards(j); - - seq_to_flush = journal_seq_to_flush(j); - min_nr = 0; - - /* - * If it's been longer than j->reclaim_delay_ms since we last flushed, - * make sure to flush at least one journal pin: - */ - if (time_after(jiffies, j->last_flushed + - msecs_to_jiffies(c->opts.journal_reclaim_delay))) - min_nr = 1; - - if (j->watermark != BCH_WATERMARK_stripe) - min_nr = 1; - - size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr; - if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live) - min_nr = 1; - - min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); - - trace_and_count(c, journal_reclaim_start, c, - direct, kicked, - min_nr, min_key_cache, - atomic_long_read(&bc->nr_dirty), btree_cache_live, - atomic_long_read(&c->btree_key_cache.nr_dirty), - atomic_long_read(&c->btree_key_cache.nr_keys)); - - nr_flushed = journal_flush_pins(j, seq_to_flush, - ~0, 0, - min_nr, min_key_cache); - - if (direct) - j->nr_direct_reclaim += nr_flushed; - else - j->nr_background_reclaim += nr_flushed; - trace_and_count(c, journal_reclaim_finish, c, nr_flushed); - - if (nr_flushed) - wake_up(&j->reclaim_wait); - } while ((min_nr || min_key_cache) && nr_flushed && !direct); - - memalloc_noreclaim_restore(flags); - - return ret; -} - -int bch2_journal_reclaim(struct journal *j) -{ - return __bch2_journal_reclaim(j, true, true); -} - -static int bch2_journal_reclaim_thread(void *arg) -{ - struct journal *j = arg; - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned long delay, now; - bool journal_empty; - int ret = 0; - - set_freezable(); - - j->last_flushed = jiffies; - - while (!ret && !kthread_should_stop()) { - bool kicked = j->reclaim_kicked; - - j->reclaim_kicked = false; - - mutex_lock(&j->reclaim_lock); - ret = __bch2_journal_reclaim(j, false, kicked); - mutex_unlock(&j->reclaim_lock); - - now = jiffies; - delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); - j->next_reclaim = j->last_flushed + delay; - - if (!time_in_range(j->next_reclaim, now, now + delay)) - j->next_reclaim = now + delay; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - if (kthread_should_stop()) - break; - if (j->reclaim_kicked) - break; - - spin_lock(&j->lock); - journal_empty = fifo_empty(&j->pin); - spin_unlock(&j->lock); - - long timeout = j->next_reclaim - jiffies; - - if (journal_empty) - schedule(); - else if (timeout > 0) - schedule_timeout(timeout); - else - break; - } - __set_current_state(TASK_RUNNING); - } - - return 0; -} - -void bch2_journal_reclaim_stop(struct journal *j) -{ - struct task_struct *p = j->reclaim_thread; - - j->reclaim_thread = NULL; - - if (p) { - kthread_stop(p); - 
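A detail worth noting in bch2_journal_reclaim_thread() above: j->next_reclaim is clamped into the window [now, now + delay], so a stale last_flushed timestamp can never schedule the next run in the past or more than one full delay out. A small sketch of that clamp (in_range here is a rough stand-in for the kernel's time_in_range(), not the macro itself):

#include <stdbool.h>

typedef unsigned long jiffies_t;

/* Inclusive range check; wrap-safe unsigned comparison. */
static bool in_range(jiffies_t t, jiffies_t lo, jiffies_t hi)
{
	return t - lo <= hi - lo;
}

static jiffies_t next_wakeup(jiffies_t last_flushed, jiffies_t now, jiffies_t delay)
{
	jiffies_t next = last_flushed + delay;

	if (!in_range(next, now, now + delay))
		next = now + delay;	/* clamp a stale/overdue schedule */
	return next;
}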
put_task_struct(p); - } -} - -int bch2_journal_reclaim_start(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct task_struct *p; - int ret; - - if (j->reclaim_thread) - return 0; - - p = kthread_create(bch2_journal_reclaim_thread, j, - "bch-reclaim/%s", c->name); - ret = PTR_ERR_OR_ZERO(p); - bch_err_msg(c, ret, "creating journal reclaim thread"); - if (ret) - return ret; - - get_task_struct(p); - j->reclaim_thread = p; - wake_up_process(p); - return 0; -} - -static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, - unsigned types) -{ - struct journal_entry_pin_list *pin_list; - u64 seq; - - spin_lock(&j->lock); - fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { - if (seq > seq_to_flush) - break; - - for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) - if ((BIT(i) & types) && - (!list_empty(&pin_list->unflushed[i]) || - !list_empty(&pin_list->flushed[i]))) { - spin_unlock(&j->lock); - return true; - } - } - spin_unlock(&j->lock); - - return false; -} - -static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, - unsigned types) -{ - return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || - journal_pins_still_flushing(j, seq_to_flush, types); -} - -static int journal_flush_done(struct journal *j, u64 seq_to_flush, - bool *did_work) -{ - int ret = 0; - - ret = bch2_journal_error(j); - if (ret) - return ret; - - mutex_lock(&j->reclaim_lock); - - for (int type = JOURNAL_PIN_TYPE_NR - 1; - type >= 0; - --type) - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { - *did_work = true; - goto unlock; - } - - if (seq_to_flush > journal_cur_seq(j)) - bch2_journal_entry_close(j); - - spin_lock(&j->lock); - /* - * If journal replay hasn't completed, the unreplayed journal entries - * hold refs on their corresponding sequence numbers - */ - ret = !test_bit(JOURNAL_replay_done, &j->flags) || - journal_last_seq(j) > seq_to_flush || - !fifo_used(&j->pin); - - spin_unlock(&j->lock); -unlock: - mutex_unlock(&j->reclaim_lock); - - return ret; -} - -bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) -{ - /* time_stats this */ - bool did_work = false; - - if (!test_bit(JOURNAL_running, &j->flags)) - return false; - - closure_wait_event(&j->reclaim_flush_wait, - journal_flush_done(j, seq_to_flush, &did_work)); - - return did_work; -} - -int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin_list *p; - u64 iter, seq = 0; - int ret = 0; - - spin_lock(&j->lock); - fifo_for_each_entry_ptr(p, &j->pin, iter) - if (dev_idx >= 0 - ? bch2_dev_list_has_dev(p->devs, dev_idx) - : p->devs.nr < c->opts.metadata_replicas) - seq = iter; - spin_unlock(&j->lock); - - bch2_journal_flush_pins(j, seq); - - ret = bch2_journal_error(j); - if (ret) - return ret; - - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); - - /* - * Now that we've populated replicas_gc, write to the journal to mark - * active journal devices. This handles the case where the journal might - * be empty. Otherwise we could clear all journal replicas and - * temporarily put the fs into an unrecoverable state. Journal recovery - * expects to find devices marked for journal data on unclean mount. 
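For the scan at the top of bch2_journal_flush_device_pins() above, the idea is simply to find the newest pinned sequence that still references the device (or is under-replicated) and flush up to it. A toy version of just that scan, with a plain array standing in for the pin FIFO (all names here are illustrative):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct pin_list {
	uint8_t devs[4];
	unsigned nr_devs;
};

static bool has_dev(const struct pin_list *p, int dev)
{
	for (unsigned i = 0; i < p->nr_devs; i++)
		if (p->devs[i] == dev)
			return true;
	return false;
}

/* Newest sequence (index) whose pin list still references @dev, or 0. */
static uint64_t last_seq_with_dev(const struct pin_list *pins, size_t nr, int dev)
{
	uint64_t seq = 0;

	for (size_t i = 0; i < nr; i++)
		if (has_dev(&pins[i], dev))
			seq = i;
	return seq;
}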
- */ - ret = bch2_journal_meta(&c->journal); - if (ret) - goto err; - - seq = 0; - spin_lock(&j->lock); - while (!ret) { - union bch_replicas_padded replicas; - - seq = max(seq, journal_last_seq(j)); - if (seq >= j->pin.back) - break; - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - journal_seq_pin(j, seq)->devs); - seq++; - - if (replicas.e.nr_devs) { - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, &replicas.e); - spin_lock(&j->lock); - } - } - spin_unlock(&j->lock); -err: - ret = bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); - - return ret; -} - -bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *pin; - - spin_lock(&j->lock); - if (!test_bit(JOURNAL_running, &j->flags)) { - spin_unlock(&j->lock); - return true; - } - - *seq = max(*seq, j->pin.front); - - if (*seq >= j->pin.back) { - spin_unlock(&j->lock); - return true; - } - - out->atomic++; - - pin_list = journal_seq_pin(j, *seq); - - prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); - printbuf_indent_add(out, 2); - - prt_printf(out, "unflushed:\n"); - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) - list_for_each_entry(pin, &pin_list->unflushed[i], list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - prt_printf(out, "flushed:\n"); - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) - list_for_each_entry(pin, &pin_list->flushed[i], list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - printbuf_indent_sub(out, 2); - - --out->atomic; - spin_unlock(&j->lock); - - return false; -} - -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -{ - u64 seq = 0; - - while (!bch2_journal_seq_pins_to_text(out, j, &seq)) - seq++; -} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h deleted file mode 100644 index 0a73d7134e1cc6..00000000000000 --- a/fs/bcachefs/journal_reclaim.h +++ /dev/null @@ -1,84 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_RECLAIM_H -#define _BCACHEFS_JOURNAL_RECLAIM_H - -#define JOURNAL_PIN (32 * 1024) - -static inline void journal_reclaim_kick(struct journal *j) -{ - struct task_struct *p = READ_ONCE(j->reclaim_thread); - - j->reclaim_kicked = true; - if (p) - wake_up_process(p); -} - -unsigned bch2_journal_dev_buckets_available(struct journal *, - struct journal_device *, - enum journal_space_from); -void bch2_journal_set_watermark(struct journal *); -void bch2_journal_space_available(struct journal *); - -static inline bool journal_pin_active(struct journal_entry_pin *pin) -{ - return pin->seq != 0; -} - -static inline struct journal_entry_pin_list * -journal_seq_pin(struct journal *j, u64 seq) -{ - EBUG_ON(seq < j->pin.front || seq >= j->pin.back); - - return &j->pin.data[seq & j->pin.mask]; -} - -void bch2_journal_reclaim_fast(struct journal *); -bool __bch2_journal_pin_put(struct journal *, u64); -void bch2_journal_pin_put(struct journal *, u64); -void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); - -void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, - journal_pin_flush_fn); - -static inline void bch2_journal_pin_add(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) - bch2_journal_pin_set(j, seq, pin, flush_fn); -} - -void bch2_journal_pin_copy(struct journal *, - struct 
journal_entry_pin *, - struct journal_entry_pin *, - journal_pin_flush_fn); - -static inline void bch2_journal_pin_update(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) - bch2_journal_pin_set(j, seq, pin, flush_fn); -} - -void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); - -void bch2_journal_do_discards(struct journal *); -int bch2_journal_reclaim(struct journal *); - -void bch2_journal_reclaim_stop(struct journal *); -int bch2_journal_reclaim_start(struct journal *); - -bool bch2_journal_flush_pins(struct journal *, u64); - -static inline bool bch2_journal_flush_all_pins(struct journal *j) -{ - return bch2_journal_flush_pins(j, U64_MAX); -} - -int bch2_journal_flush_device_pins(struct journal *, int); - -void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); - -#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c deleted file mode 100644 index 0cb9b93f13e79f..00000000000000 --- a/fs/bcachefs/journal_sb.c +++ /dev/null @@ -1,232 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "journal_sb.h" -#include "darray.h" - -#include - -/* BCH_SB_FIELD_journal: */ - -static int u64_cmp(const void *_l, const void *_r) -{ - const u64 *l = _l; - const u64 *r = _r; - - return cmp_int(*l, *r); -} - -static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); - int ret = -BCH_ERR_invalid_sb_journal; - unsigned nr; - unsigned i; - u64 *b; - - nr = bch2_nr_journal_buckets(journal); - if (!nr) - return 0; - - b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); - if (!b) - return -BCH_ERR_ENOMEM_sb_journal_validate; - - for (i = 0; i < nr; i++) - b[i] = le64_to_cpu(journal->buckets[i]); - - sort(b, nr, sizeof(u64), u64_cmp, NULL); - - if (!b[0]) { - prt_printf(err, "journal bucket at sector 0"); - goto err; - } - - if (b[0] < le16_to_cpu(m.first_bucket)) { - prt_printf(err, "journal bucket %llu before first bucket %u", - b[0], le16_to_cpu(m.first_bucket)); - goto err; - } - - if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) { - prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1], le64_to_cpu(m.nbuckets)); - goto err; - } - - for (i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) { - prt_printf(err, "duplicate journal buckets %llu", b[i]); - goto err; - } - - ret = 0; -err: - kfree(b); - return ret; -} - -static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - unsigned i, nr = bch2_nr_journal_buckets(journal); - - prt_printf(out, "Buckets: "); - for (i = 0; i < nr; i++) - prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i])); - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_journal = { - .validate = bch2_sb_journal_validate, - .to_text = bch2_sb_journal_to_text, -}; - -struct u64_range { - u64 start; - u64 end; -}; - -static int u64_range_cmp(const void *_l, const void *_r) -{ - const struct u64_range *l = _l; - const struct u64_range *r = _r; - - return cmp_int(l->start, r->start); -} - -static int bch2_sb_journal_v2_validate(struct 
bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); - struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); - int ret = -BCH_ERR_invalid_sb_journal; - u64 sum = 0; - unsigned nr; - unsigned i; - struct u64_range *b; - - nr = bch2_sb_field_journal_v2_nr_entries(journal); - if (!nr) - return 0; - - b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); - if (!b) - return -BCH_ERR_ENOMEM_sb_journal_v2_validate; - - for (i = 0; i < nr; i++) { - b[i].start = le64_to_cpu(journal->d[i].start); - b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); - - if (b[i].end <= b[i].start) { - prt_printf(err, "journal buckets entry with bad nr: %llu+%llu", - le64_to_cpu(journal->d[i].start), - le64_to_cpu(journal->d[i].nr)); - goto err; - } - - sum += le64_to_cpu(journal->d[i].nr); - } - - sort(b, nr, sizeof(*b), u64_range_cmp, NULL); - - if (!b[0].start) { - prt_printf(err, "journal bucket at sector 0"); - goto err; - } - - if (b[0].start < le16_to_cpu(m.first_bucket)) { - prt_printf(err, "journal bucket %llu before first bucket %u", - b[0].start, le16_to_cpu(m.first_bucket)); - goto err; - } - - if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) { - prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1].end - 1, le64_to_cpu(m.nbuckets)); - goto err; - } - - for (i = 0; i + 1 < nr; i++) { - if (b[i].end > b[i + 1].start) { - prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", - b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); - goto err; - } - } - - if (sum > UINT_MAX) { - prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX); - goto err; - } - - ret = 0; -err: - kfree(b); - return ret; -} - -static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); - unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); - - prt_printf(out, "Buckets: "); - for (i = 0; i < nr; i++) - prt_printf(out, " %llu-%llu", - le64_to_cpu(journal->d[i].start), - le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { - .validate = bch2_sb_journal_v2_validate, - .to_text = bch2_sb_journal_v2_to_text, -}; - -int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, - u64 *buckets, unsigned nr) -{ - struct bch_sb_field_journal_v2 *j; - unsigned i, dst = 0, nr_compacted = 1; - - if (c) - lockdep_assert_held(&c->sb_lock); - - if (!nr) { - bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); - bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); - return 0; - } - - for (i = 0; i + 1 < nr; i++) - if (buckets[i] + 1 != buckets[i + 1]) - nr_compacted++; - - j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, - (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); - if (!j) - return bch_err_throw(c, ENOSPC_sb_journal); - - bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); - - j->d[dst].start = cpu_to_le64(buckets[0]); - j->d[dst].nr = cpu_to_le64(1); - - for (i = 1; i < nr; i++) { - if (buckets[i] == buckets[i - 1] + 1) { - le64_add_cpu(&j->d[dst].nr, 1); - } else { - dst++; - j->d[dst].start = cpu_to_le64(buckets[i]); - j->d[dst].nr = cpu_to_le64(1); - } - } - - BUG_ON(dst + 1 != nr_compacted); - return 0; -} diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h deleted file mode 100644 
index ba40a7e8d90a32..00000000000000 --- a/fs/bcachefs/journal_sb.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#include "super-io.h" -#include "vstructs.h" - -static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -{ - return j - ? (__le64 *) vstruct_end(&j->field) - j->buckets - : 0; -} - -static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) -{ - if (!j) - return 0; - - return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; -} - -extern const struct bch_sb_field_ops bch_sb_field_ops_journal; -extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; - -int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c deleted file mode 100644 index af4fe416d9ecac..00000000000000 --- a/fs/bcachefs/journal_seq_blacklist.c +++ /dev/null @@ -1,264 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "eytzinger.h" -#include "journal.h" -#include "journal_seq_blacklist.h" -#include "super-io.h" - -/* - * journal_seq_blacklist machinery: - * - * To guarantee order of btree updates after a crash, we need to detect when a - * btree node entry (bset) is newer than the newest journal entry that was - * successfully written, and ignore it - effectively ignoring any btree updates - * that didn't make it into the journal. - * - * If we didn't do this, we might have two btree nodes, a and b, both with - * updates that weren't written to the journal yet: if b was updated after a, - * but b was flushed and not a - oops; on recovery we'll find that the updates - * to b happened, but not the updates to a that happened before it. - * - * Ignoring bsets that are newer than the newest journal entry is always safe, - * because everything they contain will also have been journalled - and must - * still be present in the journal on disk until a journal entry has been - * written _after_ that bset was written. - * - * To accomplish this, bsets record the newest journal sequence number they - * contain updates for; then, on startup, the btree code queries the journal - * code to ask "Is this sequence number newer than the newest journal entry? If - * so, ignore it." - * - * When this happens, we must blacklist that journal sequence number: the - * journal must not write any entries with that sequence number, and it must - * record that it was blacklisted so that a) on recovery we don't think we have - * missing journal entries and b) so that the btree code continues to ignore - * that bset, until that btree node is rewritten. 
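- * - * For example: if the newest journal entry that made it to disk had - * sequence number 100, and on recovery we find a bset whose newest - * journal_seq is 101, we ignore that bset and blacklist seq 101.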
- */ - -static unsigned sb_blacklist_u64s(unsigned nr) -{ - struct bch_sb_field_journal_seq_blacklist *bl; - - return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); -} - -int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) -{ - struct bch_sb_field_journal_seq_blacklist *bl; - unsigned i = 0, nr; - int ret = 0; - - mutex_lock(&c->sb_lock); - bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); - nr = blacklist_nr_entries(bl); - - while (i < nr) { - struct journal_seq_blacklist_entry *e = - bl->start + i; - - if (end < le64_to_cpu(e->start)) - break; - - if (start > le64_to_cpu(e->end)) { - i++; - continue; - } - - /* - * Entry is contiguous or overlapping with new entry: merge it - * with new entry, and delete: - */ - - start = min(start, le64_to_cpu(e->start)); - end = max(end, le64_to_cpu(e->end)); - array_remove_item(bl->start, nr, i); - } - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - sb_blacklist_u64s(nr + 1)); - if (!bl) { - ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist); - goto out; - } - - array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { - .start = cpu_to_le64(start), - .end = cpu_to_le64(end), - })); - c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); - - ret = bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); - - return ret ?: bch2_blacklist_table_initialize(c); -} - -static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) -{ - const struct journal_seq_blacklist_table_entry *l = _l; - const struct journal_seq_blacklist_table_entry *r = _r; - - return cmp_int(l->start, r->start); -} - -bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, - bool dirty) -{ - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - struct journal_seq_blacklist_table_entry search = { .start = seq }; - int idx; - - if (!t) - return false; - - idx = eytzinger0_find_le(t->entries, t->nr, - sizeof(t->entries[0]), - journal_seq_blacklist_table_cmp, - &search); - if (idx < 0) - return false; - - BUG_ON(t->entries[idx].start > seq); - - if (seq >= t->entries[idx].end) - return false; - - if (dirty) - t->entries[idx].dirty = true; - return true; -} - -u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c) -{ - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - - if (!t || !t->nr) - return 0; - - return t->entries[eytzinger0_last(t->nr)].end - 1; -} - -int bch2_blacklist_table_initialize(struct bch_fs *c) -{ - struct bch_sb_field_journal_seq_blacklist *bl = - bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); - struct journal_seq_blacklist_table *t; - unsigned i, nr = blacklist_nr_entries(bl); - - if (!bl) - return 0; - - t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); - if (!t) - return bch_err_throw(c, ENOMEM_blacklist_table_init); - - t->nr = nr; - - for (i = 0; i < nr; i++) { - t->entries[i].start = le64_to_cpu(bl->start[i].start); - t->entries[i].end = le64_to_cpu(bl->start[i].end); - } - - eytzinger0_sort(t->entries, - t->nr, - sizeof(t->entries[0]), - journal_seq_blacklist_table_cmp, - NULL); - - kfree(c->journal_seq_blacklist_table); - c->journal_seq_blacklist_table = t; - return 0; -} - -static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_journal_seq_blacklist *bl = - field_to_type(f, journal_seq_blacklist); - unsigned i, nr = blacklist_nr_entries(bl); - - for (i 
= 0; i < nr; i++) { - struct journal_seq_blacklist_entry *e = bl->start + i; - - if (le64_to_cpu(e->start) >= - le64_to_cpu(e->end)) { - prt_printf(err, "entry %u start >= end (%llu >= %llu)", - i, le64_to_cpu(e->start), le64_to_cpu(e->end)); - return -BCH_ERR_invalid_sb_journal_seq_blacklist; - } - - if (i + 1 < nr && - le64_to_cpu(e[0].end) > - le64_to_cpu(e[1].start)) { - prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", - i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); - return -BCH_ERR_invalid_sb_journal_seq_blacklist; - } - } - - return 0; -} - -static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal_seq_blacklist *bl = - field_to_type(f, journal_seq_blacklist); - struct journal_seq_blacklist_entry *i; - unsigned nr = blacklist_nr_entries(bl); - - for (i = bl->start; i < bl->start + nr; i++) { - if (i != bl->start) - prt_printf(out, " "); - - prt_printf(out, "%llu-%llu", - le64_to_cpu(i->start), - le64_to_cpu(i->end)); - } - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { - .validate = bch2_sb_journal_seq_blacklist_validate, - .to_text = bch2_sb_journal_seq_blacklist_to_text -}; - -bool bch2_blacklist_entries_gc(struct bch_fs *c) -{ - struct journal_seq_blacklist_entry *src, *dst; - - struct bch_sb_field_journal_seq_blacklist *bl = - bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); - if (!bl) - return false; - - unsigned nr = blacklist_nr_entries(bl); - dst = bl->start; - - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - BUG_ON(nr != t->nr); - - src = bl->start; - eytzinger0_for_each(i, nr) { - BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); - BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - - if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) - *dst++ = *src; - src++; - } - - unsigned new_nr = dst - bl->start; - if (new_nr == nr) - return false; - - bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr); - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - new_nr ? sb_blacklist_u64s(new_nr) : 0); - BUG_ON(new_nr && !bl); - return true; -} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h deleted file mode 100644 index f06942ccfcddaa..00000000000000 --- a/fs/bcachefs/journal_seq_blacklist.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H - -static inline unsigned -blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) -{ - return bl - ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / - sizeof(struct journal_seq_blacklist_entry)) - : 0; -} - -bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); -u64 bch2_journal_last_blacklisted_seq(struct bch_fs *); -int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); -int bch2_blacklist_table_initialize(struct bch_fs *); - -extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; - -bool bch2_blacklist_entries_gc(struct bch_fs *); - -#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h deleted file mode 100644 index 2566b12dbc045e..00000000000000 --- a/fs/bcachefs/journal_seq_blacklist_format.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H -#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H - -struct journal_seq_blacklist_entry { - __le64 start; - __le64 end; -}; - -struct bch_sb_field_journal_seq_blacklist { - struct bch_sb_field field; - struct journal_seq_blacklist_entry start[]; -}; - -#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h deleted file mode 100644 index 51104bbb99dae7..00000000000000 --- a/fs/bcachefs/journal_types.h +++ /dev/null @@ -1,342 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_TYPES_H -#define _BCACHEFS_JOURNAL_TYPES_H - -#include -#include - -#include "alloc_types.h" -#include "super_types.h" -#include "fifo.h" - -/* btree write buffer steals 8 bits for its own purposes: */ -#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) - -#define JOURNAL_STATE_BUF_BITS 2 -#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS) -#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1) - -#define JOURNAL_BUF_BITS 4 -#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) -#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) - -/* - * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to - * the journal that are being staged or in flight. - */ -struct journal_buf { - struct closure io; - struct jset *data; - - __BKEY_PADDED(key, BCH_REPLICAS_MAX); - struct bch_devs_list devs_written; - - struct closure_waitlist wait; - u64 last_seq; /* copy of data->last_seq */ - long expires; - u64 flush_time; - - unsigned buf_size; /* size in bytes of @data */ - unsigned sectors; /* maximum size for current entry */ - unsigned disk_sectors; /* maximum size entry could have been, if - buf_size was bigger */ - unsigned u64s_reserved; - bool noflush:1; /* write has already been kicked off, and was noflush */ - bool must_flush:1; /* something wants a flush */ - bool separate_flush:1; - bool need_flush_to_write_buffer:1; - bool write_started:1; - bool write_allocated:1; - bool write_done:1; - u8 idx; -}; - -/* - * Something that makes a journal entry dirty - i.e.
a btree node that has to be - flushed: - */ - -enum journal_pin_type { - JOURNAL_PIN_TYPE_btree3, - JOURNAL_PIN_TYPE_btree2, - JOURNAL_PIN_TYPE_btree1, - JOURNAL_PIN_TYPE_btree0, - JOURNAL_PIN_TYPE_key_cache, - JOURNAL_PIN_TYPE_other, - JOURNAL_PIN_TYPE_NR, -}; - -struct journal_entry_pin_list { - struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; - struct list_head flushed[JOURNAL_PIN_TYPE_NR]; - atomic_t count; - struct bch_devs_list devs; -}; - -struct journal; -struct journal_entry_pin; -typedef int (*journal_pin_flush_fn)(struct journal *j, - struct journal_entry_pin *, u64); - -struct journal_entry_pin { - struct list_head list; - journal_pin_flush_fn flush; - u64 seq; -}; - -struct journal_res { - bool ref; - u16 u64s; - u32 offset; - u64 seq; -}; - -union journal_res_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - u64 cur_entry_offset:22, - idx:2, - buf0_count:10, - buf1_count:10, - buf2_count:10, - buf3_count:10; - }; -}; - -/* bytes: */ -#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */ - -/* - * We stash some journal state as sentinel values in cur_entry_offset: - * note - cur_entry_offset is in units of u64s - */ -#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1) - -#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) -#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) -#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) - -struct journal_space { - /* Units of 512-byte sectors: */ - unsigned next_entry; /* How big the next journal entry can be */ - unsigned total; -}; - -enum journal_space_from { - journal_space_discarded, - journal_space_clean_ondisk, - journal_space_clean, - journal_space_total, - journal_space_nr, -}; - -#define JOURNAL_FLAGS() \ - x(replay_done) \ - x(running) \ - x(may_skip_flush) \ - x(need_flush_write) \ - x(space_low) - -enum journal_flags { -#define x(n) JOURNAL_##n, - JOURNAL_FLAGS() -#undef x -}; - -struct journal_bio { - struct bch_dev *ca; - unsigned buf_idx; - u64 submit_time; - - struct bio bio; -}; - -/* Embedded in struct bch_fs */ -struct journal { - /* Fastpath stuff up front: */ - struct { - - union journal_res_state reservations; - enum bch_watermark watermark; - - } __aligned(SMP_CACHE_BYTES); - - unsigned long flags; - - /* Max size of current journal entry */ - unsigned cur_entry_u64s; - unsigned cur_entry_sectors; - - /* Reserved space in journal entry to be used just prior to write */ - unsigned entry_u64s_reserved; - - - /* - * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if - * insufficient devices: - */ - int cur_entry_error; - unsigned cur_entry_offset_if_blocked; - - unsigned buf_size_want; - /* - * We may queue up some things to be journalled (log messages) before - * the journal has actually started - stash them here: - */ - darray_u64 early_journal_entries; - - /* - * Protects journal_buf->data, when accessing without a journal - * reservation: for synchronization between the btree write buffer code - * and the journal write path: - */ - struct mutex buf_lock; - /* - * Two journal entries -- one is currently open for new entries, the - * other is possibly being written out.
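- * (The buf[] array below actually holds JOURNAL_BUF_NR buffers, so - * several writes can be staged or in flight at once.)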
- */ - struct journal_buf buf[JOURNAL_BUF_NR]; - void *free_buf; - unsigned free_buf_size; - - spinlock_t lock; - - /* if nonzero, we may not open a new journal entry: */ - unsigned blocked; - - /* Used when waiting because the journal was full */ - wait_queue_head_t wait; - struct closure_waitlist async_wait; - struct closure_waitlist reclaim_flush_wait; - - struct delayed_work write_work; - struct workqueue_struct *wq; - - /* Sequence number of most recent journal entry (last entry in @pin) */ - atomic64_t seq; - - u64 seq_write_started; - /* seq, last_seq from the most recent journal entry successfully written */ - u64 seq_ondisk; - u64 flushed_seq_ondisk; - u64 flushing_seq; - u64 last_seq_ondisk; - u64 err_seq; - u64 last_empty_seq; - u64 oldest_seq_found_ondisk; - - /* - * FIFO of journal entries whose btree updates have not yet been - * written out. - * - * Each entry is a reference count. The position in the FIFO is the - * entry's sequence number relative to @seq. - * - * The journal entry itself holds a reference count, put when the - * journal entry is written out. Each btree node modified by the journal - * entry also holds a reference count, put when the btree node is - * written. - * - * When a reference count reaches zero, the journal entry is no longer - * needed. When all journal entries in the oldest journal bucket are no - * longer needed, the bucket can be discarded and reused. - */ - struct { - u64 front, back, size, mask; - struct journal_entry_pin_list *data; - } pin; - - struct journal_space space[journal_space_nr]; - - u64 replay_journal_seq; - u64 replay_journal_seq_end; - - struct write_point wp; - spinlock_t err_lock; - - struct mutex reclaim_lock; - /* - * Used for waiting until journal reclaim has freed up space in the - * journal: - */ - wait_queue_head_t reclaim_wait; - struct task_struct *reclaim_thread; - bool reclaim_kicked; - unsigned long next_reclaim; - u64 nr_direct_reclaim; - u64 nr_background_reclaim; - - unsigned long last_flushed; - struct journal_entry_pin *flush_in_progress; - bool flush_in_progress_dropped; - wait_queue_head_t pin_flush_wait; - - /* protects advancing ja->discard_idx: */ - struct mutex discard_lock; - bool can_discard; - - unsigned long last_flush_write; - - u64 write_start_time; - - u64 nr_flush_writes; - u64 nr_noflush_writes; - u64 entry_bytes_written; - - struct bch2_time_stats *flush_write_time; - struct bch2_time_stats *noflush_write_time; - struct bch2_time_stats *flush_seq_time; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map res_map; -#endif -} __aligned(SMP_CACHE_BYTES); - -/* - * Embedded in struct bch_dev. First three fields refer to the array of journal - * buckets, in bch_sb. - */ -struct journal_device { - /* - * For each journal bucket, contains the max sequence number of the - * journal writes it contains - so we know when a bucket can be reused. 
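- * (A bucket can be reused once that max sequence number is older than - * the journal's last_seq, i.e. none of the writes it contains are - * needed anymore.)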
- */ - u64 *bucket_seq; - - unsigned sectors_free; - - /* - * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: - */ - unsigned discard_idx; /* Next bucket to discard */ - unsigned dirty_idx_ondisk; - unsigned dirty_idx; - unsigned cur_idx; /* Journal bucket we're currently writing to */ - unsigned nr; - - u64 *buckets; - - /* Bio for journal reads/writes to this device */ - struct journal_bio *bio[JOURNAL_BUF_NR]; - - /* for bch_journal_read_device */ - struct closure read; - u64 highest_seq_found; -}; - -/* - * journal_entry_res - reserve space in every journal entry: - */ -struct journal_entry_res { - unsigned u64s; -}; - -#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c deleted file mode 100644 index 1b828bddd11bf1..00000000000000 --- a/fs/bcachefs/keylist.c +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey.h" -#include "keylist.h" - -int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, - size_t nr_inline_u64s, size_t new_u64s) -{ - size_t oldsize = bch2_keylist_u64s(l); - size_t newsize = oldsize + new_u64s; - u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; - u64 *new_keys; - - newsize = roundup_pow_of_two(newsize); - - if (newsize <= nr_inline_u64s || - (old_buf && roundup_pow_of_two(oldsize) == newsize)) - return 0; - - new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS); - if (!new_keys) - return -ENOMEM; - - if (!old_buf) - memcpy_u64s(new_keys, inline_u64s, oldsize); - - l->keys_p = new_keys; - l->top_p = new_keys + oldsize; - - return 0; -} - -void bch2_keylist_pop_front(struct keylist *l) -{ - l->top_p -= bch2_keylist_front(l)->k.u64s; - - memmove_u64s_down(l->keys, - bkey_next(l->keys), - bch2_keylist_u64s(l)); -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_verify_keylist_sorted(struct keylist *l) -{ - for_each_keylist_key(l, k) - BUG_ON(bkey_next(k) != l->top && - bpos_ge(k->k.p, bkey_next(k)->k.p)); -} -#endif diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h deleted file mode 100644 index e687e0e9aede1c..00000000000000 --- a/fs/bcachefs/keylist.h +++ /dev/null @@ -1,72 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_KEYLIST_H -#define _BCACHEFS_KEYLIST_H - -#include "keylist_types.h" - -int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -void bch2_keylist_pop_front(struct keylist *); - -static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) -{ - l->top_p = l->keys_p = inline_keys; -} - -static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) -{ - if (l->keys_p != inline_keys) - kfree(l->keys_p); -} - -static inline void bch2_keylist_push(struct keylist *l) -{ - l->top = bkey_next(l->top); -} - -static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) -{ - bkey_copy(l->top, k); - bch2_keylist_push(l); -} - -static inline bool bch2_keylist_empty(struct keylist *l) -{ - return l->top == l->keys; -} - -static inline size_t bch2_keylist_u64s(struct keylist *l) -{ - return l->top_p - l->keys_p; -} - -static inline size_t bch2_keylist_bytes(struct keylist *l) -{ - return bch2_keylist_u64s(l) * sizeof(u64); -} - -static inline struct bkey_i *bch2_keylist_front(struct keylist *l) -{ - return l->keys; -} - -#define for_each_keylist_key(_keylist, _k) \ - for (struct bkey_i *_k = (_keylist)->keys; \ - _k != (_keylist)->top; \ - _k = bkey_next(_k)) - -static inline u64 keylist_sectors(struct keylist *keys) -{ - u64 ret = 0; - - 
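- /* sum the sizes, in sectors, of all keys on the list: */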
for_each_keylist_key(keys, k) - ret += k->k.size; - return ret; -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_verify_keylist_sorted(struct keylist *); -#else -static inline void bch2_verify_keylist_sorted(struct keylist *l) {} -#endif - -#endif /* _BCACHEFS_KEYLIST_H */ diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h deleted file mode 100644 index 4b3ff7d8a87560..00000000000000 --- a/fs/bcachefs/keylist_types.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_KEYLIST_TYPES_H -#define _BCACHEFS_KEYLIST_TYPES_H - -struct keylist { - union { - struct bkey_i *keys; - u64 *keys_p; - }; - union { - struct bkey_i *top; - u64 *top_p; - }; -}; - -#endif /* _BCACHEFS_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c deleted file mode 100644 index 75f27ec26f85f8..00000000000000 --- a/fs/bcachefs/logged_ops.c +++ /dev/null @@ -1,119 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "error.h" -#include "io_misc.h" -#include "logged_ops.h" -#include "super.h" - -struct bch_logged_op_fn { - u8 type; - int (*resume)(struct btree_trans *, struct bkey_i *); -}; - -static const struct bch_logged_op_fn logged_op_fns[] = { -#define x(n) { \ - .type = KEY_TYPE_logged_op_##n, \ - .resume = bch2_resume_logged_op_##n, \ -}, - BCH_LOGGED_OPS() -#undef x -}; - -static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type) -{ - for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++) - if (logged_op_fns[i].type == type) - return logged_op_fns + i; - return NULL; -} - -static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; - struct printbuf buf = PRINTBUF; - int ret = 0; - - fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags), - trans, logged_op_but_clean, - "filesystem marked as clean but have logged op\n%s", - (bch2_bkey_val_to_text(&buf, c, k), - buf.buf)); - - struct bkey_buf sk; - bch2_bkey_buf_init(&sk); - bch2_bkey_buf_reassemble(&sk, c, k); - - const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type); - if (fn) - fn->resume(trans, sk.k); - - ret = bch2_logged_op_finish(trans, sk.k); - - bch2_bkey_buf_exit(&sk, c); -fsck_err: - printbuf_exit(&buf); - return ret ?: trans_was_restarted(trans, restart_count); -} - -int bch2_resume_logged_ops(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, - BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM_logged_ops, 0), - POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), - BTREE_ITER_prefetch, k, - resume_logged_op(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) -{ - struct btree_iter iter; - int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); - if (ret) - return ret; - - k->k.p = iter.pos; - - ret = bch2_trans_update(trans, &iter, k, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) -{ - return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_logged_op_start(trans, k)); -} - -int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); - /* - 
* This needs to be a fatal error because we've left an unfinished - * operation in the logged ops btree. - * - * We should only ever see an error here if the filesystem has already - * been shut down, but make sure of that here: - */ - if (ret) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - bch2_fs_fatal_error(c, "deleting logged operation %s: %s", - buf.buf, bch2_err_str(ret)); - printbuf_exit(&buf); - } - - return ret; -} diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h deleted file mode 100644 index 30ae9ef737dd95..00000000000000 --- a/fs/bcachefs/logged_ops.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LOGGED_OPS_H -#define _BCACHEFS_LOGGED_OPS_H - -#include "bkey.h" - -#define BCH_LOGGED_OPS() \ - x(truncate) \ - x(finsert) - -static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) -{ - return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0); -} - -int bch2_resume_logged_ops(struct bch_fs *); -int bch2_logged_op_start(struct btree_trans *, struct bkey_i *); -int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *); - -#endif /* _BCACHEFS_LOGGED_OPS_H */ diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h deleted file mode 100644 index cfb67c95d4c8a0..00000000000000 --- a/fs/bcachefs/logged_ops_format.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H -#define _BCACHEFS_LOGGED_OPS_FORMAT_H - -enum logged_ops_inums { - LOGGED_OPS_INUM_logged_ops, - LOGGED_OPS_INUM_inode_cursors, -}; - -struct bch_logged_op_truncate { - struct bch_val v; - __le32 subvol; - __le32 pad; - __le64 inum; - __le64 new_i_size; -}; - -enum logged_op_finsert_state { - LOGGED_OP_FINSERT_start, - LOGGED_OP_FINSERT_shift_extents, - LOGGED_OP_FINSERT_finish, -}; - -struct bch_logged_op_finsert { - struct bch_val v; - __u8 state; - __u8 pad[3]; - __le32 subvol; - __le64 inum; - __le64 dst_offset; - __le64 src_offset; - __le64 pos; -}; - -#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c deleted file mode 100644 index 57b5b3263b083a..00000000000000 --- a/fs/bcachefs/lru.c +++ /dev/null @@ -1,223 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "bkey_buf.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "ec.h" -#include "error.h" -#include "lru.h" -#include "recovery.h" - -/* KEY_TYPE_lru is obsolete: */ -int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(!lru_pos_time(k.k->p), - c, lru_entry_at_time_0, - "lru entry at time=0"); -fsck_err: - return ret; -} - -void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct bch_lru *lru = bkey_s_c_to_lru(k).v; - - prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); -} - -void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) -{ - prt_printf(out, "%llu:%llu -> %llu:%llu", - lru_pos_id(lru), - lru_pos_time(lru), - u64_to_bucket(lru.offset).inode, - u64_to_bucket(lru.offset).offset); -} - -static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, - u64 dev_bucket, u64 time, bool set) -{ - return time - ? 
bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), set) - : 0; -} - -int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) -{ - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); -} - -int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) -{ - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); -} - -int __bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) -{ - if (old_time == new_time) - return 0; - - return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: - bch2_lru_set(trans, lru_id, dev_bucket, new_time); -} - -static const char * const bch2_lru_types[] = { -#define x(n) #n, - BCH_LRU_TYPES() -#undef x - NULL -}; - -int bch2_lru_check_set(struct btree_trans *trans, - u16 lru_id, - u64 dev_bucket, - u64 time, - struct bkey_s_c referring_k, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter lru_iter; - struct bkey_s_c lru_k = - bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), 0); - int ret = bkey_err(lru_k); - if (ret) - return ret; - - if (lru_k.k->type != KEY_TYPE_set) { - ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed); - if (ret) - goto err; - - if (fsck_err(trans, alloc_key_to_missing_lru_entry, - "missing %s lru entry\n%s", - bch2_lru_types[lru_type(lru_k)], - (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { - ret = bch2_lru_set(trans, lru_id, dev_bucket, time); - if (ret) - goto err; - } - } -err: -fsck_err: - bch2_trans_iter_exit(trans, &lru_iter); - printbuf_exit(&buf); - return ret; -} - -static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) -{ - enum bch_lru_type type = lru_type(lru_k); - - switch (type) { - case BCH_LRU_read: - case BCH_LRU_fragmentation: - return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset)); - case BCH_LRU_stripes: - return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset)); - default: - BUG(); - } -} - -static u64 bkey_lru_type_idx(struct bch_fs *c, - enum bch_lru_type type, - struct bkey_s_c k) -{ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - - switch (type) { - case BCH_LRU_read: - a = bch2_alloc_to_v4(k, &a_convert); - return alloc_lru_idx_read(*a); - case BCH_LRU_fragmentation: { - a = bch2_alloc_to_v4(k, &a_convert); - - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); - return ca - ? alloc_lru_idx_fragmentation(*a, ca) - : 0; - } - case BCH_LRU_stripes: - return k.k->type == KEY_TYPE_stripe - ? 
stripe_lru_pos(bkey_s_c_to_stripe(k).v) - : 0; - default: - BUG(); - } -} - -static int bch2_check_lru_key(struct btree_trans *trans, - struct btree_iter *lru_iter, - struct bkey_s_c lru_k, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - struct bbpos bp = lru_pos_to_bp(lru_k); - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); - int ret = bkey_err(k); - if (ret) - goto err; - - enum bch_lru_type type = lru_type(lru_k); - u64 idx = bkey_lru_type_idx(c, type, k); - - if (lru_pos_time(lru_k.k->p) != idx) { - ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); - if (ret) - goto err; - - if (fsck_err(trans, lru_entry_bad, - "incorrect lru entry: lru %s time %llu\n" - "%s\n" - "for %s", - bch2_lru_types[type], - lru_pos_time(lru_k.k->p), - (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), - (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); - } -err: -fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -int bch2_check_lrus(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_lru_key(trans, &iter, k, &last_flushed))); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; - -} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h deleted file mode 100644 index 8abd0aa2083aea..00000000000000 --- a/fs/bcachefs/lru.h +++ /dev/null @@ -1,70 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LRU_H -#define _BCACHEFS_LRU_H - -static inline u64 lru_pos_id(struct bpos pos) -{ - return pos.inode >> LRU_TIME_BITS; -} - -static inline u64 lru_pos_time(struct bpos pos) -{ - return pos.inode & ~(~0ULL << LRU_TIME_BITS); -} - -static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) -{ - struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); - - EBUG_ON(time > LRU_TIME_MAX); - EBUG_ON(lru_pos_id(pos) != lru_id); - EBUG_ON(lru_pos_time(pos) != time); - EBUG_ON(pos.offset != dev_bucket); - - return pos; -} - -static inline enum bch_lru_type lru_type(struct bkey_s_c l) -{ - u16 lru_id = l.k->p.inode >> 48; - - switch (lru_id) { - case BCH_LRU_BUCKET_FRAGMENTATION: - return BCH_LRU_fragmentation; - case BCH_LRU_STRIPE_FRAGMENTATION: - return BCH_LRU_stripes; - default: - return BCH_LRU_read; - } -} - -int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); -void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -void bch2_lru_pos_to_text(struct printbuf *, struct bpos); - -#define bch2_bkey_ops_lru ((struct bkey_ops) { \ - .key_validate = bch2_lru_validate, \ - .val_to_text = bch2_lru_to_text, \ - .min_val_size = 8, \ -}) - -int bch2_lru_del(struct btree_trans *, u16, u64, u64); -int bch2_lru_set(struct btree_trans *, u16, u64, u64); -int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); - -static inline int bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) -{ - return old_time != new_time - ? 
__bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time) - : 0; -} - -struct bkey_buf; -int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); - -int bch2_check_lrus(struct bch_fs *); - -#endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h deleted file mode 100644 index b7392ad8e41f0a..00000000000000 --- a/fs/bcachefs/lru_format.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LRU_FORMAT_H -#define _BCACHEFS_LRU_FORMAT_H - -struct bch_lru { - struct bch_val v; - __le64 idx; -} __packed __aligned(8); - -#define BCH_LRU_TYPES() \ - x(read) \ - x(fragmentation) \ - x(stripes) - -enum bch_lru_type { -#define x(n) BCH_LRU_##n, - BCH_LRU_TYPES() -#undef x -}; - -#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1) -#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2) - -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - -#endif /* _BCACHEFS_LRU_FORMAT_H */ diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c deleted file mode 100644 index 0ea9f30803a2b3..00000000000000 --- a/fs/bcachefs/mean_and_variance.c +++ /dev/null @@ -1,173 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions for incremental mean and variance. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published by - * the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * Copyright © 2022 Daniel B. Hill - * - * Author: Daniel B. Hill - * - * Description: - * - * This includes some incremental algorithms for mean and variance calculation - * - * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf - * - * Create a struct; for the weighted variant, choose a weight (weight = 2^k) - * and pass the same weight to every call. - * - * Use mean_and_variance[_weighted]_update() on the struct to update its state. - * - * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean - * and variance; some computation is deferred to these functions for - * performance reasons. - * - * see lib/math/mean_and_variance_test.c for examples of usage. - * - * DO NOT access the mean and variance fields of the weighted variants directly. - * DO NOT change the weight after calling update. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "mean_and_variance.h" - -u128_u u128_div(u128_u n, u64 d) -{ - u128_u r; - u64 rem; - u64 hi = u128_hi(n); - u64 lo = u128_lo(n); - u64 h = hi & ((u64) U32_MAX << 32); - u64 l = (hi & (u64) U32_MAX) << 32; - - r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); - r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); - r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); - return r; -} -EXPORT_SYMBOL_GPL(u128_div); - -/** - * mean_and_variance_get_mean() - get mean from @s - * @s: mean and variance number of samples and their sums - */ -s64 mean_and_variance_get_mean(struct mean_and_variance s) -{ - return s.n ?
div64_u64(s.sum, s.n) : 0; -} -EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); - -/** - * mean_and_variance_get_variance() - get variance from @s1 - * @s1: mean and variance number of samples and sums - * - * see linked pdf equation 12. - */ -u64 mean_and_variance_get_variance(struct mean_and_variance s1) -{ - if (s1.n) { - u128_u s2 = u128_div(s1.sum_squares, s1.n); - u64 s3 = abs(mean_and_variance_get_mean(s1)); - - return u128_lo(u128_sub(s2, u128_square(s3))); - } else { - return 0; - } -} -EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); - -/** - * mean_and_variance_get_stddev() - get standard deviation from @s - * @s: mean and variance number of samples and their sums - */ -u32 mean_and_variance_get_stddev(struct mean_and_variance s) -{ - return int_sqrt64(mean_and_variance_get_variance(s)); -} -EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); - -/** - * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() - * @s: mean and variance number of samples and their sums - * @x: new value to include in the &mean_and_variance_weighted - * @initted: caller must track whether this is the first use or not - * @weight: ewma weight - * - * see linked pdf: function derived from equations 140-143 where alpha = 2^w. - * values are stored bitshifted for performance and added precision. - */ -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, - s64 x, bool initted, u8 weight) -{ - // previous weighted variance. - u8 w = weight; - u64 var_w0 = s->variance; - // new value weighted. - s64 x_w = x << w; - s64 diff_w = x_w - s->mean; - s64 diff = fast_divpow2(diff_w, w); - // new mean weighted. - s64 u_w1 = s->mean + diff; - - if (!initted) { - s->mean = x_w; - s->variance = 0; - } else { - s->mean = u_w1; - s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; - } -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); - -/** - * mean_and_variance_weighted_get_mean() - get mean from @s - * @s: mean and variance number of samples and their sums - * @weight: ewma weight - */ -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, - u8 weight) -{ - return fast_divpow2(s.mean, weight); -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); - -/** - * mean_and_variance_weighted_get_variance() -- get variance from @s - * @s: mean and variance number of samples and their sums - * @weight: ewma weight - */ -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, - u8 weight) -{ - // always positive don't need fast divpow2 - return s.variance >> weight; -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); - -/** - * mean_and_variance_weighted_get_stddev() - get standard deviation from @s - * @s: mean and variance number of samples and their sums - * @weight: ewma weight - */ -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, - u8 weight) -{ - return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight)); -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); - -MODULE_AUTHOR("Daniel B. 
Hill"); -MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h deleted file mode 100644 index 47e4a3c3d26e73..00000000000000 --- a/fs/bcachefs/mean_and_variance.h +++ /dev/null @@ -1,203 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef MEAN_AND_VARIANCE_H_ -#define MEAN_AND_VARIANCE_H_ - -#include -#include -#include -#include - -#define SQRT_U64_MAX 4294967295ULL - -/* - * u128_u: u128 user mode, because not all architectures support a real int128 - * type - * - * We don't use this version in userspace, because in userspace we link with - * Rust and rustc has issues with u128. - */ - -#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC) - -typedef struct { - unsigned __int128 v; -} __aligned(16) u128_u; - -static inline u128_u u64_to_u128(u64 a) -{ - return (u128_u) { .v = a }; -} - -static inline u64 u128_lo(u128_u a) -{ - return a.v; -} - -static inline u64 u128_hi(u128_u a) -{ - return a.v >> 64; -} - -static inline u128_u u128_add(u128_u a, u128_u b) -{ - a.v += b.v; - return a; -} - -static inline u128_u u128_sub(u128_u a, u128_u b) -{ - a.v -= b.v; - return a; -} - -static inline u128_u u128_shl(u128_u a, s8 shift) -{ - a.v <<= shift; - return a; -} - -static inline u128_u u128_square(u64 a) -{ - u128_u b = u64_to_u128(a); - - b.v *= b.v; - return b; -} - -#else - -typedef struct { - u64 hi, lo; -} __aligned(16) u128_u; - -/* conversions */ - -static inline u128_u u64_to_u128(u64 a) -{ - return (u128_u) { .lo = a }; -} - -static inline u64 u128_lo(u128_u a) -{ - return a.lo; -} - -static inline u64 u128_hi(u128_u a) -{ - return a.hi; -} - -/* arithmetic */ - -static inline u128_u u128_add(u128_u a, u128_u b) -{ - u128_u c; - - c.lo = a.lo + b.lo; - c.hi = a.hi + b.hi + (c.lo < a.lo); - return c; -} - -static inline u128_u u128_sub(u128_u a, u128_u b) -{ - u128_u c; - - c.lo = a.lo - b.lo; - c.hi = a.hi - b.hi - (c.lo > a.lo); - return c; -} - -static inline u128_u u128_shl(u128_u i, s8 shift) -{ - u128_u r; - - r.lo = i.lo << (shift & 63); - if (shift < 64) - r.hi = (i.hi << (shift & 63)) | (i.lo >> (-shift & 63)); - else { - r.hi = i.lo << (-shift & 63); - r.lo = 0; - } - return r; -} - -static inline u128_u u128_square(u64 i) -{ - u128_u r; - u64 h = i >> 32, l = i & U32_MAX; - - r = u128_shl(u64_to_u128(h*h), 64); - r = u128_add(r, u128_shl(u64_to_u128(h*l), 32)); - r = u128_add(r, u128_shl(u64_to_u128(l*h), 32)); - r = u128_add(r, u64_to_u128(l*l)); - return r; -} - -#endif - -static inline u128_u u64s_to_u128(u64 hi, u64 lo) -{ - u128_u c = u64_to_u128(hi); - - c = u128_shl(c, 64); - c = u128_add(c, u64_to_u128(lo)); - return c; -} - -u128_u u128_div(u128_u n, u64 d); - -struct mean_and_variance { - s64 n; - s64 sum; - u128_u sum_squares; -}; - -/* expontentially weighted variant */ -struct mean_and_variance_weighted { - s64 mean; - u64 variance; -}; - -/** - * fast_divpow2() - fast approximation for n / (1 << d) - * @n: numerator - * @d: the power of 2 denominator. - * - * note: this rounds towards 0. - */ -static inline s64 fast_divpow2(s64 n, u8 d) -{ - return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; -} - -/** - * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 - * and return it. - * @s1: the mean_and_variance to update. - * @v1: the new sample. - * - * see linked pdf equation 12. 
- */ -static inline void -mean_and_variance_update(struct mean_and_variance *s, s64 v) -{ - s->n++; - s->sum += v; - s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v))); -} - -s64 mean_and_variance_get_mean(struct mean_and_variance s); -u64 mean_and_variance_get_variance(struct mean_and_variance s1); -u32 mean_and_variance_get_stddev(struct mean_and_variance s); - -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, - s64 v, bool initted, u8 weight); - -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, - u8 weight); -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, - u8 weight); -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, - u8 weight); - -#endif // MEAN_AND_VARIANCE_H_ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c deleted file mode 100644 index e9d9c0212e44b1..00000000000000 --- a/fs/bcachefs/mean_and_variance_test.c +++ /dev/null @@ -1,221 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include - -#include "mean_and_variance.h" - -#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX) - -static void mean_and_variance_basic_test(struct kunit *test) -{ - struct mean_and_variance s = {}; - - mean_and_variance_update(&s, 2); - mean_and_variance_update(&s, 2); - - KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0); - KUNIT_EXPECT_EQ(test, s.n, 2); - - mean_and_variance_update(&s, 4); - mean_and_variance_update(&s, 4); - - KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3); - KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1); - KUNIT_EXPECT_EQ(test, s.n, 4); -} - -/* - * Test values computed using a spreadsheet from the pseudocode at the bottom: - * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf - */ - -static void mean_and_variance_weighted_test(struct kunit *test) -{ - struct mean_and_variance_weighted s = { }; - - mean_and_variance_weighted_update(&s, 10, false, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - - mean_and_variance_weighted_update(&s, 20, true, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - - mean_and_variance_weighted_update(&s, 30, true, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); - - s = (struct mean_and_variance_weighted) { }; - - mean_and_variance_weighted_update(&s, -10, false, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - - mean_and_variance_weighted_update(&s, -20, true, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - - mean_and_variance_weighted_update(&s, -30, true, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); -} - -static void mean_and_variance_weighted_advanced_test(struct kunit *test) -{ - struct mean_and_variance_weighted s = { }; - bool initted = false; - s64 i; - - for (i = 10; i <= 100; i += 10) { - mean_and_variance_weighted_update(&s, i, initted, 8); - initted = true; - } -
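- /* with weight 8 (i.e. alpha = 2^8) the EWMA adapts slowly: after - * samples 10..100 the mean has only reached 11: */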
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); - - s = (struct mean_and_variance_weighted) { }; - initted = false; - - for (i = -10; i >= -100; i -= 10) { - mean_and_variance_weighted_update(&s, i, initted, 8); - initted = true; - } - - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); -} - -static void do_mean_and_variance_test(struct kunit *test, - s64 initial_value, - s64 initial_n, - s64 n, - unsigned weight, - s64 *data, - s64 *mean, - s64 *stddev, - s64 *weighted_mean, - s64 *weighted_stddev) -{ - struct mean_and_variance mv = {}; - struct mean_and_variance_weighted vw = { }; - - for (unsigned i = 0; i < initial_n; i++) { - mean_and_variance_update(&mv, initial_value); - mean_and_variance_weighted_update(&vw, initial_value, false, weight); - - KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0); - } - - for (unsigned i = 0; i < n; i++) { - mean_and_variance_update(&mv, data[i]); - mean_and_variance_weighted_update(&vw, data[i], true, weight); - - KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]); - } - - KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); -} - -/* Test behaviour with a single outlier, then back to steady state: */ -static void mean_and_variance_test_1(struct kunit *test) -{ - s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; - s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 }; - s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 }; - s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; - s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - -/* Test behaviour where we switch from one steady state to another: */ -static void mean_and_variance_test_2(struct kunit *test) -{ - s64 d[] = { 100, 100, 100, 100, 100 }; - s64 mean[] = { 22, 32, 40, 46, 50 }; - s64 stddev[] = { 32, 39, 42, 44, 45 }; - s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; - s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - -static void mean_and_variance_fast_divpow2(struct kunit *test) -{ - s64 i; - u8 d; - - for (i = 0; i < 100; i++) { - d = 0; - KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d)); - KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d)); - for (d = 1; d < 32; d++) { - KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)), - div_u64(i, 1 << d), "%lld %u", i, d); - KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)), - div_u64(i, 1 << d), "%lld %u", -i, d); - } - } -} - -static void mean_and_variance_u128_basic_test(struct kunit *test) -{ - u128_u a = u64s_to_u128(0, U64_MAX); - u128_u a1 = u64s_to_u128(0, 1); - u128_u b = u64s_to_u128(1, 0); - u128_u c = u64s_to_u128(0, 1LLU << 63); - u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX); - - KUNIT_EXPECT_EQ(test, 
u128_hi(u128_add(a, a1)), 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0); - KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0); - - KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX); - KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1); - - KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31); -} - -static struct kunit_case mean_and_variance_test_cases[] = { - KUNIT_CASE(mean_and_variance_fast_divpow2), - KUNIT_CASE(mean_and_variance_u128_basic_test), - KUNIT_CASE(mean_and_variance_basic_test), - KUNIT_CASE(mean_and_variance_weighted_test), - KUNIT_CASE(mean_and_variance_weighted_advanced_test), - KUNIT_CASE(mean_and_variance_test_1), - KUNIT_CASE(mean_and_variance_test_2), - {} -}; - -static struct kunit_suite mean_and_variance_test_suite = { - .name = "mean and variance tests", - .test_cases = mean_and_variance_test_cases -}; - -kunit_test_suite(mean_and_variance_test_suite); - -MODULE_AUTHOR("Daniel B. Hill"); -MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests"); -MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c deleted file mode 100644 index f296cce95338ce..00000000000000 --- a/fs/bcachefs/migrate.c +++ /dev/null @@ -1,277 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Code for moving data off a device. - */ - -#include "bcachefs.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "ec.h" -#include "errcode.h" -#include "extents.h" -#include "io_write.h" -#include "journal.h" -#include "keylist.h" -#include "migrate.h" -#include "move.h" -#include "progress.h" -#include "replicas.h" -#include "super-io.h" - -static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, - unsigned dev_idx, unsigned flags, bool metadata) -{ - unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; - unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; - unsigned degraded = metadata ? 
BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; - unsigned nr_good; - - bch2_bkey_drop_device(k, dev_idx); - - nr_good = bch2_bkey_durability(c, k.s_c); - if ((!nr_good && !(flags & lost)) || - (nr_good < replicas && !(flags & degraded))) - return bch_err_throw(c, remove_would_lose_data); - - return 0; -} - -static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, - struct btree *b, unsigned dev_idx, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_buf k; - - bch2_bkey_buf_init(&k); - bch2_bkey_buf_copy(&k, c, &b->key); - - int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?: - bch2_btree_node_update_key(trans, iter, b, k.k, 0, false); - - bch_err_fn(c, ret); - bch2_bkey_buf_exit(&k, c); - return ret; -} - -static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - unsigned dev_idx, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_i *n; - int ret; - - if (!bch2_bkey_has_device_c(k, dev_idx)) - return 0; - - n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); - if (ret) - return ret; - - /* - * If the new extent no longer has any pointers, bch2_extent_normalize() - * will do the appropriate thing with it (turning it into a - * KEY_TYPE_error key, or just a discard if it was a cached extent) - */ - bch2_extent_normalize(c, bkey_i_to_s(n)); - - /* - * Since we're not inserting through an extent iterator - * (BTREE_ITER_all_snapshots iterators aren't extent iterators), - * we aren't using the extent overwrite path to delete, we're - * just using the normal key deletion path: - */ - if (bkey_deleted(&n->k)) - n->k.size = 0; - return 0; -} - -static int bch2_dev_btree_drop_key(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - unsigned dev_idx, - struct bkey_buf *last_flushed, - unsigned flags) -{ - struct btree_iter iter; - struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 
0 : ret; - - ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_dev_usrdata_drop(struct bch_fs *c, - struct progress_indicator_state *progress, - unsigned dev_idx, unsigned flags) -{ - struct btree_trans *trans = bch2_trans_get(c); - enum btree_id id; - int ret = 0; - - for (id = 0; id < BTREE_ID_NR; id++) { - if (!btree_type_has_ptrs(id)) - continue; - - ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); - bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); - })); - if (ret) - break; - } - - bch2_trans_put(trans); - - return ret; -} - -static int bch2_dev_metadata_drop(struct bch_fs *c, - struct progress_indicator_state *progress, - unsigned dev_idx, unsigned flags) -{ - struct btree_trans *trans; - struct btree_iter iter; - struct closure cl; - struct btree *b; - struct bkey_buf k; - unsigned id; - int ret; - - /* don't handle this yet: */ - if (flags & BCH_FORCE_IF_METADATA_LOST) - return bch_err_throw(c, remove_with_metadata_missing_unimplemented); - - trans = bch2_trans_get(c); - bch2_bkey_buf_init(&k); - closure_init_stack(&cl); - - for (id = 0; id < BTREE_ID_NR; id++) { - bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, - BTREE_ITER_prefetch); -retry: - ret = 0; - while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(trans, &iter)) && - !(ret = PTR_ERR_OR_ZERO(b))) { - bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); - - if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) - goto next; - - ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - - if (ret) - break; -next: - bch2_btree_iter_next_node(trans, &iter); - } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_iter_exit(trans, &iter); - - if (ret) - goto err; - } - - bch2_btree_interior_updates_flush(c); - ret = 0; -err: - bch2_bkey_buf_exit(&k, c); - bch2_trans_put(trans); - - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); - - return ret; -} - -static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, - struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, - unsigned flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, - last_flushed); - int ret = bkey_err(k); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - return 0; - if (ret) - return ret; - - if (!k.k || !bch2_bkey_has_device_c(k, dev_idx)) - goto out; - - /* - * XXX: pass flags arg to invalidate_stripe_to_dev and handle it - * properly - */ - - if (bkey_is_btree_ptr(k.k)) - ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags); - else if (k.k->type == KEY_TYPE_stripe) - ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags); - else - ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags) -{ - struct btree_trans *trans = bch2_trans_get(c); - - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_btree_write_buffer_flush_sync(trans) ?: - 
for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, - POS(dev_idx, 0), - POS(dev_idx, U64_MAX), 0, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - if (k.k->type != KEY_TYPE_backpointer) - continue; - - data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), - &last_flushed, flags); - - })); - - bch2_bkey_buf_exit(&last_flushed, trans->c); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags) -{ - struct progress_indicator_state progress; - bch2_progress_init(&progress, c, - BIT_ULL(BTREE_ID_extents)| - BIT_ULL(BTREE_ID_reflink)); - - return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, &progress, dev_idx, flags); -} diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h deleted file mode 100644 index 30018140711b7d..00000000000000 --- a/fs/bcachefs/migrate.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_MIGRATE_H -#define _BCACHEFS_MIGRATE_H - -int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned); -int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned); - -#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c deleted file mode 100644 index eec591e947bdad..00000000000000 --- a/fs/bcachefs/move.c +++ /dev/null @@ -1,1494 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "compress.h" -#include "disk_groups.h" -#include "ec.h" -#include "errcode.h" -#include "error.h" -#include "inode.h" -#include "io_read.h" -#include "io_write.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "move.h" -#include "rebalance.h" -#include "reflink.h" -#include "replicas.h" -#include "snapshot.h" -#include "super-io.h" -#include "trace.h" - -#include <linux/ioprio.h> -#include <linux/kthread.h> - -const char * const bch2_data_ops_strs[] = { -#define x(t, n, ...) 
[n] = #t, - BCH_DATA_OPS() -#undef x - NULL -}; - -struct evacuate_bucket_arg { - struct bpos bucket; - int gen; - struct data_update_opts data_opts; -}; - -static bool evacuate_bucket_pred(struct bch_fs *, void *, - enum btree_id, struct bkey_s_c, - struct bch_io_opts *, - struct data_update_opts *); - -static noinline void -trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_io_move(c, buf.buf); - printbuf_exit(&buf); -} - -static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) -{ - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - trace_io_move_read(c, buf.buf); - printbuf_exit(&buf); -} - -static noinline void -trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts, - move_pred_fn pred, void *_arg, bool p) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "%ps: %u", pred, p); - - if (pred == evacuate_bucket_pred) { - struct evacuate_bucket_arg *arg = _arg; - prt_printf(&buf, " gen=%u", arg->gen); - } - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_io_move_pred(c, buf.buf); - printbuf_exit(&buf); -} - -static noinline void -trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "bucket: "); - bch2_bpos_to_text(&buf, bucket); - prt_printf(&buf, " gen: %i\n", gen); - - trace_io_move_evacuate_bucket(c, buf.buf); - printbuf_exit(&buf); -} - -struct moving_io { - struct list_head read_list; - struct list_head io_list; - struct move_bucket *b; - struct closure cl; - bool read_completed; - - unsigned read_sectors; - unsigned write_sectors; - - struct data_update write; -}; - -static void move_free(struct moving_io *io) -{ - struct moving_context *ctxt = io->write.ctxt; - - if (io->b) - atomic_dec(&io->b->count); - - mutex_lock(&ctxt->lock); - list_del(&io->io_list); - wake_up(&ctxt->wait); - mutex_unlock(&ctxt->lock); - - if (!io->write.data_opts.scrub) { - bch2_data_update_exit(&io->write); - } else { - bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); - kfree(io->write.bvecs); - } - kfree(io); -} - -static void move_write_done(struct bch_write_op *op) -{ - struct moving_io *io = container_of(op, struct moving_io, write.op); - struct bch_fs *c = op->c; - struct moving_context *ctxt = io->write.ctxt; - - if (op->error) { - if (trace_io_move_write_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_write_op_to_text(&buf, op); - trace_io_move_write_fail(c, buf.buf); - printbuf_exit(&buf); - } - this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); - - ctxt->write_error = true; - } - - atomic_sub(io->write_sectors, &ctxt->write_sectors); - atomic_dec(&ctxt->write_ios); - move_free(io); - closure_put(&ctxt->cl); -} - -static void move_write(struct moving_io *io) -{ - struct bch_fs *c = io->write.op.c; - struct moving_context *ctxt = io->write.ctxt; - struct bch_read_bio *rbio = &io->write.rbio; - - if (ctxt->stats) { - if (rbio->bio.bi_status) - atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, - &ctxt->stats->sectors_error_uncorrected); - else if (rbio->saw_error) - atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, - 
&ctxt->stats->sectors_error_corrected); - } - - /* - * If the extent has been bitrotted, we're going to have to give it a - * new checksum in order to move it - but the poison bit will ensure - * that userspace still gets the appropriate error. - */ - if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && - (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - - rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, - nonce, &rbio->bio); - rbio->ret = 0; - } - - if (unlikely(rbio->ret || io->write.data_opts.scrub)) { - move_free(io); - return; - } - - if (trace_io_move_write_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); - trace_io_move_write(c, buf.buf); - printbuf_exit(&buf); - } - - closure_get(&io->write.ctxt->cl); - atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - atomic_inc(&io->write.ctxt->write_ios); - - bch2_data_update_read_done(&io->write); -} - -struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) -{ - struct moving_io *io = - list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); - - return io && io->read_completed ? io : NULL; -} - -static void move_read_endio(struct bio *bio) -{ - struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); - struct moving_context *ctxt = io->write.ctxt; - - atomic_sub(io->read_sectors, &ctxt->read_sectors); - atomic_dec(&ctxt->read_ios); - io->read_completed = true; - - wake_up(&ctxt->wait); - closure_put(&ctxt->cl); -} - -void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt) -{ - struct moving_io *io; - - while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { - bch2_trans_unlock_long(ctxt->trans); - list_del(&io->read_list); - move_write(io); - } -} - -void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) -{ - unsigned sectors_pending = atomic_read(&ctxt->write_sectors); - - move_ctxt_wait_event(ctxt, - !atomic_read(&ctxt->write_sectors) || - atomic_read(&ctxt->write_sectors) != sectors_pending); -} - -void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) -{ - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); - bch2_trans_unlock_long(ctxt->trans); - closure_sync(&ctxt->cl); -} - -void bch2_moving_ctxt_exit(struct moving_context *ctxt) -{ - struct bch_fs *c = ctxt->trans->c; - - bch2_moving_ctxt_flush_all(ctxt); - - EBUG_ON(atomic_read(&ctxt->write_sectors)); - EBUG_ON(atomic_read(&ctxt->write_ios)); - EBUG_ON(atomic_read(&ctxt->read_sectors)); - EBUG_ON(atomic_read(&ctxt->read_ios)); - - mutex_lock(&c->moving_context_lock); - list_del(&ctxt->list); - mutex_unlock(&c->moving_context_lock); - - /* - * Generally, releasing a transaction within a transaction restart means - * an unhandled transaction restart: but this can happen legitimately - * within the move code, e.g. 
when bch2_move_ratelimit() tells us to - * exit before we've retried - */ - bch2_trans_begin(ctxt->trans); - bch2_trans_put(ctxt->trans); - memset(ctxt, 0, sizeof(*ctxt)); -} - -void bch2_moving_ctxt_init(struct moving_context *ctxt, - struct bch_fs *c, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc) -{ - memset(ctxt, 0, sizeof(*ctxt)); - - ctxt->trans = bch2_trans_get(c); - ctxt->fn = (void *) _RET_IP_; - ctxt->rate = rate; - ctxt->stats = stats; - ctxt->wp = wp; - ctxt->wait_on_copygc = wait_on_copygc; - - closure_init_stack(&ctxt->cl); - - mutex_init(&ctxt->lock); - INIT_LIST_HEAD(&ctxt->reads); - INIT_LIST_HEAD(&ctxt->ios); - init_waitqueue_head(&ctxt->wait); - - mutex_lock(&c->moving_context_lock); - list_add(&ctxt->list, &c->moving_context_list); - mutex_unlock(&c->moving_context_lock); -} - -void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) -{ - trace_move_data(c, stats); -} - -void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) -{ - memset(stats, 0, sizeof(*stats)); - stats->data_type = BCH_DATA_user; - scnprintf(stats->name, sizeof(stats->name), "%s", name); -} - -int bch2_move_extent(struct moving_context *ctxt, - struct move_bucket *bucket_in_flight, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_io_opts io_opts, - struct data_update_opts data_opts) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - int ret = -ENOMEM; - - if (trace_io_move_enabled()) - trace_io_move2(c, k, &io_opts, &data_opts); - this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - - if (ctxt->stats) - ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); - - bch2_data_update_opts_normalize(k, &data_opts); - - if (!data_opts.rewrite_ptrs && - !data_opts.extra_replicas && - !data_opts.scrub) { - if (data_opts.kill_ptrs) - return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); - return 0; - } - - struct moving_io *io = allocate_dropping_locks(trans, ret, - kzalloc(sizeof(struct moving_io), _gfp)); - if (!io) - goto err; - - if (ret) - goto err_free; - - INIT_LIST_HEAD(&io->io_list); - io->write.ctxt = ctxt; - io->read_sectors = k.k->size; - io->write_sectors = k.k->size; - - if (!data_opts.scrub) { - ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, - &io_opts, data_opts, iter->btree_id, k); - if (ret) - goto err_free; - - io->write.op.end_io = move_write_done; - } else { - bch2_bkey_buf_init(&io->write.k); - bch2_bkey_buf_reassemble(&io->write.k, c, k); - - io->write.op.c = c; - io->write.data_opts = data_opts; - - bch2_trans_unlock(trans); - - ret = bch2_data_update_bios_init(&io->write, c, &io_opts); - if (ret) - goto err_free; - } - - io->write.rbio.bio.bi_end_io = move_read_endio; - io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); - - if (ctxt->stats) { - atomic64_inc(&ctxt->stats->keys_moved); - atomic64_add(k.k->size, &ctxt->stats->sectors_moved); - } - - if (bucket_in_flight) { - io->b = bucket_in_flight; - atomic_inc(&io->b->count); - } - - if (trace_io_move_read_enabled()) - trace_io_move_read2(c, k); - - mutex_lock(&ctxt->lock); - atomic_add(io->read_sectors, &ctxt->read_sectors); - atomic_inc(&ctxt->read_ios); - - list_add_tail(&io->read_list, &ctxt->reads); - list_add_tail(&io->io_list, &ctxt->ios); - mutex_unlock(&ctxt->lock); - - /* - * dropped by move_read_endio() - guards against use after free of - * ctxt 
when doing wakeup - */ - closure_get(&ctxt->cl); - __bch2_read_extent(trans, &io->write.rbio, - io->write.rbio.bio.bi_iter, - bkey_start_pos(k.k), - iter->btree_id, k, 0, - NULL, - BCH_READ_last_fragment, - data_opts.scrub ? data_opts.read_dev : -1); - return 0; -err_free: - kfree(io); -err: - if (bch2_err_matches(ret, EROFS) || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - - count_event(c, io_move_start_fail); - - if (trace_io_move_start_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, ": "); - prt_str(&buf, bch2_err_str(ret)); - trace_io_move_start_fail(c, buf.buf); - printbuf_exit(&buf); - } - - if (bch2_err_matches(ret, BCH_ERR_data_update_done)) - return 0; - return ret; -} - -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, - struct per_snapshot_io_opts *io_opts, - struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ - struct btree_iter *extent_iter, - struct bkey_s_c extent_k) -{ - struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; - struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; - int ret = 0; - - if (extent_iter->min_depth) - return opts_ret; - - if (extent_k.k->type == KEY_TYPE_reflink_v) - goto out; - - if (io_opts->cur_inum != extent_pos.inode) { - io_opts->d.nr = 0; - - ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), - BTREE_ITER_all_snapshots, k, ({ - if (k.k->p.offset != extent_pos.inode) - break; - - if (!bkey_is_inode(k.k)) - continue; - - struct bch_inode_unpacked inode; - _ret3 = bch2_inode_unpack(k, &inode); - if (_ret3) - break; - - struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; - bch2_inode_opts_get(&e.io_opts, trans->c, &inode); - - darray_push(&io_opts->d, e); - })); - io_opts->cur_inum = extent_pos.inode; - } - - ret = ret ?: trans_was_restarted(trans, restart_count); - if (ret) - return ERR_PTR(ret); - - if (extent_k.k->p.snapshot) - darray_for_each(io_opts->d, i) - if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { - opts_ret = &i->io_opts; - break; - } -out: - ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); - if (ret) - return ERR_PTR(ret); - return opts_ret; -} - -int bch2_move_get_io_opts_one(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *extent_iter, - struct bkey_s_c extent_k) -{ - struct bch_fs *c = trans->c; - - *io_opts = bch2_opts_to_inode_opts(c->opts); - - /* reflink btree? */ - if (!extent_k.k->p.inode) - goto out; - - struct btree_iter inode_iter; - struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_cached); - int ret = bkey_err(inode_k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - - if (!ret && bkey_is_inode(inode_k.k)) { - struct bch_inode_unpacked inode; - bch2_inode_unpack(inode_k, &inode); - bch2_inode_opts_get(io_opts, c, &inode); - } - bch2_trans_iter_exit(trans, &inode_iter); - /* seem to be spinning here? 
*/ -out: - return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); -} - -int bch2_move_ratelimit(struct moving_context *ctxt) -{ - struct bch_fs *c = ctxt->trans->c; - bool is_kthread = current->flags & PF_KTHREAD; - u64 delay; - - if (ctxt->wait_on_copygc && c->copygc_running) { - bch2_moving_ctxt_flush_all(ctxt); - wait_event_killable(c->copygc_running_wq, - !c->copygc_running || - (is_kthread && kthread_should_stop())); - } - - do { - delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; - - if (is_kthread && kthread_should_stop()) - return 1; - - if (delay) - move_ctxt_wait_event_timeout(ctxt, - freezing(current) || - (is_kthread && kthread_should_stop()), - delay); - - if (unlikely(freezing(current))) { - bch2_moving_ctxt_flush_all(ctxt); - try_to_freeze(); - } - } while (delay); - - /* - * XXX: these limits really ought to be per device, SSDs and hard drives - * will want different limits - */ - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && - atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && - atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && - atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); - - return 0; -} - -/* - * Move requires non extents iterators, and there's also no need for it to - * signal indirect_extent_missing_error: - */ -static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_reflink_p p) -{ - if (unlikely(REFLINK_P_ERROR(p.v))) - return bkey_s_c_null; - - struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v)); - - bch2_trans_iter_init(trans, iter, - BTREE_ID_reflink, reflink_pos, - BTREE_ITER_not_extents); - - struct bkey_s_c k = bch2_btree_iter_peek(trans, iter); - if (!k.k || bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { - bch2_trans_iter_exit(trans, iter); - return bkey_s_c_null; - } - - return k; -} - -int bch2_move_data_btree(struct moving_context *ctxt, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - enum btree_id btree_id, unsigned level) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct per_snapshot_io_opts snapshot_io_opts; - struct bch_io_opts *io_opts; - struct bkey_buf sk; - struct btree_iter iter, reflink_iter = {}; - struct bkey_s_c k; - struct data_update_opts data_opts; - /* - * If we're moving a single file, also process reflinked data it points - * to (this includes propagating changed io_opts from the inode to the - * extent): - */ - bool walk_indirect = start.inode == end.inode; - int ret = 0, ret2; - - per_snapshot_io_opts_init(&snapshot_io_opts, c); - bch2_bkey_buf_init(&sk); - - if (ctxt->stats) { - ctxt->stats->data_type = BCH_DATA_user; - ctxt->stats->pos = BBPOS(btree_id, start); - } - -retry_root: - bch2_trans_begin(trans); - - if (level == bch2_btree_id_root(c, btree_id)->level + 1) { - bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1, - BTREE_ITER_prefetch| - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto root_err; - - if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); - goto retry_root; - } - - k = bkey_i_to_s_c(&b->key); - - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, - iter.pos, &iter, k); - ret = PTR_ERR_OR_ZERO(io_opts); - if (ret) - 
goto root_err; - - memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) - goto out; - - - if (!data_opts.scrub) - ret = bch2_btree_node_rewrite_pos(trans, btree_id, level, - k.k->p, data_opts.target, 0); - else - ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); - -root_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_iter_exit(trans, &iter); - goto retry_root; - } - - goto out; - } - - bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, - BTREE_ITER_prefetch| - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - - if (ctxt->rate) - bch2_ratelimit_reset(ctxt->rate); - - while (!bch2_move_ratelimit(ctxt)) { - struct btree_iter *extent_iter = &iter; - - bch2_trans_begin(trans); - - k = bch2_btree_iter_peek(trans, &iter); - if (!k.k) - break; - - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (bkey_gt(bkey_start_pos(k.k), end)) - break; - - if (ctxt->stats) - ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - - if (walk_indirect && - k.k->type == KEY_TYPE_reflink_p && - REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - - bch2_trans_iter_exit(trans, &reflink_iter); - k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (!k.k) - goto next_nondata; - - /* - * XXX: reflink pointers may point to multiple indirect - * extents, so don't advance past the entire reflink - * pointer - need to fixup iter->k - */ - extent_iter = &reflink_iter; - } - - if (!bkey_extent_is_direct_data(k.k)) - goto next_nondata; - - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, - iter.pos, extent_iter, k); - ret = PTR_ERR_OR_ZERO(io_opts); - if (ret) - continue; - - memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) - goto next; - - /* - * The iterator gets unlocked by __bch2_read_extent - need to - * save a copy of @k elsewhere: - */ - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - if (!level) - ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); - else if (!data_opts.scrub) - ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, - k.k->p, data_opts.target, 0); - else - ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); - - if (ret2) { - if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) - continue; - - if (bch2_err_matches(ret2, ENOMEM)) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - - /* XXX signal failure */ - goto next; - } -next: - if (ctxt->stats) - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); -next_nondata: - if (!bch2_btree_iter_advance(trans, &iter)) - break; - } -out: - bch2_trans_iter_exit(trans, &reflink_iter); - bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&sk, c); - per_snapshot_io_opts_exit(&snapshot_io_opts); - - return ret; -} - -int __bch2_move_data(struct moving_context *ctxt, - struct bbpos start, - struct bbpos end, - move_pred_fn pred, void *arg) -{ - struct bch_fs *c = ctxt->trans->c; - enum btree_id id; - int ret = 0; - - for (id = start.btree; - id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); - id++) { - ctxt->stats->pos = BBPOS(id, POS_MIN); - - if 
(!btree_type_has_ptrs(id) || - !bch2_btree_id_root(c, id)->b) - continue; - - ret = bch2_move_data_btree(ctxt, - id == start.btree ? start.pos : POS_MIN, - id == end.btree ? end.pos : POS_MAX, - pred, arg, id, 0); - if (ret) - break; - } - - return ret; -} - -int bch2_move_data(struct bch_fs *c, - struct bbpos start, - struct bbpos end, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) -{ - struct moving_context ctxt; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - int ret = __bch2_move_data(&ctxt, start, end, pred, arg); - bch2_moving_ctxt_exit(&ctxt); - - return ret; -} - -static int __bch2_move_data_phys(struct moving_context *ctxt, - struct move_bucket *bucket_in_flight, - unsigned dev, - u64 bucket_start, - u64 bucket_end, - unsigned data_types, - bool copygc, - move_pred_fn pred, void *arg) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - bool is_kthread = current->flags & PF_KTHREAD; - struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_iter iter = {}, bp_iter = {}; - struct bkey_buf sk; - struct bkey_s_c k; - struct bkey_buf last_flushed; - u64 check_mismatch_done = bucket_start; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, dev); - if (!ca) - return 0; - - bucket_end = min(bucket_end, ca->mi.nbuckets); - - struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); - struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - bch2_bkey_buf_init(&sk); - - /* - * We're not run in a context that handles transaction restarts: - */ - bch2_trans_begin(trans); - - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); - - ret = bch2_btree_write_buffer_tryflush(trans); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "flushing btree write buffer"); - if (ret) - goto err; - - while (!(ret = bch2_move_ratelimit(ctxt))) { - if (is_kthread && kthread_should_stop()) - break; - - bch2_trans_begin(trans); - - k = bch2_btree_iter_peek(trans, &bp_iter); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - - if (!k.k || bkey_gt(k.k->p, bp_end)) - break; - - if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { - while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { - bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, - copygc, &last_flushed); - } - continue; - } - - if (k.k->type != KEY_TYPE_backpointer) - goto next; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - - if (ctxt->stats) - ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - - if (!(data_types & BIT(bp.v->data_type))) - goto next; - - if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes) - goto next; - - k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!k.k) - goto next; - - if (!bp.v->level) { - ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); - if (ret) { - bch2_trans_iter_exit(trans, &iter); - continue; - } - } - - struct data_update_opts data_opts = {}; - bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts); - - if (trace_io_move_pred_enabled()) - trace_io_move_pred2(c, k, &io_opts, &data_opts, - pred, arg, p); 
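- - /* - * The predicate decides whether this extent gets moved and, if so, - * fills in data_opts (rewrite/kill pointer masks, target, scrub); - * everything below just acts on that decision. - */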
- - if (!p) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } - - if (data_opts.scrub && - !bch2_dev_idx_is_online(c, data_opts.read_dev)) { - bch2_trans_iter_exit(trans, &iter); - ret = bch_err_throw(c, device_offline); - break; - } - - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - /* move_extent will drop locks */ - unsigned sectors = bp.v->bucket_len; - - if (!bp.v->level) - ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); - else if (!data_opts.scrub) - ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, - k.k->p, data_opts.target, 0); - else - ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); - - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret == -ENOMEM) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - if (ret) - goto err; - - if (ctxt->stats) - atomic64_add(sectors, &ctxt->stats->sectors_seen); -next: - bch2_btree_iter_advance(trans, &bp_iter); - } - - while (check_mismatch_done < bucket_end) - bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, - copygc, &last_flushed); -err: - bch2_trans_iter_exit(trans, &bp_iter); - bch2_bkey_buf_exit(&sk, c); - bch2_bkey_buf_exit(&last_flushed, c); - bch2_dev_put(ca); - return ret; -} - -int bch2_move_data_phys(struct bch_fs *c, - unsigned dev, - u64 start, - u64 end, - unsigned data_types, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) -{ - struct moving_context ctxt; - - bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - if (ctxt.stats) { - ctxt.stats->phys = true; - ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; - } - - int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, - data_types, false, pred, arg); - bch2_moving_ctxt_exit(&ctxt); - - return ret; -} - -static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct evacuate_bucket_arg *arg = _arg; - - *data_opts = arg->data_opts; - - unsigned i = 0; - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (ptr->dev == arg->bucket.inode && - (arg->gen < 0 || arg->gen == ptr->gen) && - !ptr->cached) - data_opts->rewrite_ptrs |= BIT(i); - i++; - } - - return data_opts->rewrite_ptrs != 0; -} - -int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts data_opts) -{ - struct bch_fs *c = ctxt->trans->c; - struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; - - count_event(c, io_move_evacuate_bucket); - if (trace_io_move_evacuate_bucket_enabled()) - trace_io_move_evacuate_bucket2(c, bucket, gen); - - return __bch2_move_data_phys(ctxt, bucket_in_flight, - bucket.inode, - bucket.offset, - bucket.offset + 1, - ~0, - true, - evacuate_bucket_pred, &arg); -} - -typedef bool (*move_btree_pred)(struct bch_fs *, void *, - struct btree *, struct bch_io_opts *, - struct data_update_opts *); - -static int bch2_move_btree(struct bch_fs *c, - struct bbpos start, - struct bbpos end, - move_btree_pred pred, void *arg, - struct bch_move_stats *stats) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - struct 
bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct moving_context ctxt; - struct btree_trans *trans; - struct btree_iter iter; - struct btree *b; - enum btree_id btree; - struct data_update_opts data_opts; - int ret = 0; - - bch2_moving_ctxt_init(&ctxt, c, NULL, stats, - writepoint_ptr(&c->btree_write_point), - true); - trans = ctxt.trans; - - stats->data_type = BCH_DATA_btree; - - for (btree = start.btree; - btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); - btree ++) { - stats->pos = BBPOS(btree, POS_MIN); - - if (!bch2_btree_id_root(c, btree)->b) - continue; - - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, - BTREE_ITER_prefetch); -retry: - ret = 0; - while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(trans, &iter)) && - !(ret = PTR_ERR_OR_ZERO(b))) { - if (kthread && kthread_should_stop()) - break; - - if ((cmp_int(btree, end.btree) ?: - bpos_cmp(b->key.k.p, end.pos)) > 0) - break; - - stats->pos = BBPOS(iter.btree_id, iter.pos); - - if (!pred(c, arg, b, &io_opts, &data_opts)) - goto next; - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; -next: - bch2_btree_iter_next_node(trans, &iter); - } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_iter_exit(trans, &iter); - - if (kthread && kthread_should_stop()) - break; - } - - bch_err_fn(c, ret); - bch2_moving_ctxt_exit(&ctxt); - bch2_btree_interior_updates_flush(c); - - return ret; -} - -static bool rereplicate_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - unsigned nr_good = bch2_bkey_durability(c, k); - unsigned replicas = bkey_is_btree_ptr(k.k) - ? c->opts.metadata_replicas - : io_opts->data_replicas; - - guard(rcu)(); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i = 0; - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ptr->cached && - (!ca || !ca->mi.durability)) - data_opts->kill_ptrs |= BIT(i); - i++; - } - - if (!data_opts->kill_ptrs && - (!nr_good || nr_good >= replicas)) - return false; - - data_opts->target = 0; - data_opts->extra_replicas = replicas - nr_good; - data_opts->btree_insert_flags = 0; - return true; -} - -static bool migrate_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_ioctl_data *op = arg; - unsigned i = 0; - - data_opts->rewrite_ptrs = 0; - data_opts->target = 0; - data_opts->extra_replicas = 0; - data_opts->btree_insert_flags = 0; - - bkey_for_each_ptr(ptrs, ptr) { - if (ptr->dev == op->migrate.dev) - data_opts->rewrite_ptrs |= 1U << i; - i++; - } - - return data_opts->rewrite_ptrs != 0; -} - -static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts); -} - -/* - * Ancient versions of bcachefs produced packed formats which could represent - * keys that the in memory format cannot represent; this checks for those - * formats so we can get rid of them. 
- */ -static bool bformat_needs_redo(struct bkey_format *f) -{ - for (unsigned i = 0; i < f->nr_fields; i++) - if (bch2_bkey_format_field_overflows(f, i)) - return true; - - return false; -} - -static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - if (b->version_ondisk != c->sb.version || - btree_node_need_rewrite(b) || - bformat_needs_redo(&b->format)) { - data_opts->target = 0; - data_opts->extra_replicas = 0; - data_opts->btree_insert_flags = 0; - return true; - } - - return false; -} - -int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) -{ - int ret; - - ret = bch2_move_btree(c, - BBPOS_MIN, - BBPOS_MAX, - rewrite_old_nodes_pred, c, stats); - if (!ret) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - c->disk_sb.sb->version_min = c->disk_sb.sb->version; - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - - bch_err_fn(c, ret); - return ret; -} - -static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - unsigned durability = bch2_bkey_durability(c, k); - unsigned replicas = bkey_is_btree_ptr(k.k) - ? c->opts.metadata_replicas - : io_opts->data_replicas; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned i = 0; - - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { - unsigned d = bch2_extent_ptr_durability(c, &p); - - if (d && durability - d >= replicas) { - data_opts->kill_ptrs |= BIT(i); - durability -= d; - } - - i++; - } - - return data_opts->kill_ptrs != 0; -} - -static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), - io_opts, data_opts); -} - -static bool scrub_pred(struct bch_fs *c, void *_arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bch_ioctl_data *arg = _arg; - - if (k.k->type != KEY_TYPE_btree_ptr_v2) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == arg->migrate.dev) { - if (!p.crc.csum_type) - return false; - break; - } - } - - data_opts->scrub = true; - data_opts->read_dev = arg->migrate.dev; - return true; -} - -int bch2_data_job(struct bch_fs *c, - struct bch_move_stats *stats, - struct bch_ioctl_data op) -{ - struct bbpos start = BBPOS(op.start_btree, op.start_pos); - struct bbpos end = BBPOS(op.end_btree, op.end_pos); - int ret = 0; - - if (op.op >= BCH_DATA_OP_NR) - return -EINVAL; - - bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); - - switch (op.op) { - case BCH_DATA_OP_scrub: - /* - * prevent tests from spuriously failing, make sure we see all - * btree nodes that need to be repaired - */ - bch2_btree_interior_updates_flush(c); - - ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, - op.scrub.data_types, - NULL, - stats, - writepoint_hashed((unsigned long) current), - false, - scrub_pred, &op) ?: ret; - break; - - case BCH_DATA_OP_rereplicate: - stats->data_type = BCH_DATA_journal; 
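- /* - * Flush journal pins first so nothing we're about to move is - * pinned only by the journal; -1 here appears to mean "all - * devices" rather than a specific member. - */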
- ret = bch2_journal_flush_device_pins(&c->journal, -1); - ret = bch2_move_btree(c, start, end, - rereplicate_btree_pred, c, stats) ?: ret; - ret = bch2_move_data(c, start, end, - NULL, - stats, - writepoint_hashed((unsigned long) current), - true, - rereplicate_pred, c) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - break; - case BCH_DATA_OP_migrate: - if (op.migrate.dev >= c->sb.nr_devices) - return -EINVAL; - - stats->data_type = BCH_DATA_journal; - ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, - ~0, - NULL, - stats, - writepoint_hashed((unsigned long) current), - true, - migrate_pred, &op) ?: ret; - bch2_btree_interior_updates_flush(c); - ret = bch2_replicas_gc2(c) ?: ret; - break; - case BCH_DATA_OP_rewrite_old_nodes: - ret = bch2_scan_old_btree_nodes(c, stats); - break; - case BCH_DATA_OP_drop_extra_replicas: - ret = bch2_move_btree(c, start, end, - drop_extra_replicas_btree_pred, c, stats) ?: ret; - ret = bch2_move_data(c, start, end, NULL, stats, - writepoint_hashed((unsigned long) current), - true, - drop_extra_replicas_pred, c) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - break; - default: - ret = -EINVAL; - } - - bch2_move_stats_exit(stats, c); - return ret; -} - -void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) -{ - prt_printf(out, "%s: data type=", stats->name); - bch2_prt_data_type(out, stats->data_type); - prt_str(out, " pos="); - bch2_bbpos_to_text(out, stats->pos); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); - prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); - prt_printf(out, "bytes seen:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); - prt_newline(out); - - prt_printf(out, "bytes moved:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); - prt_newline(out); - - prt_printf(out, "bytes raced:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); - prt_newline(out); - - printbuf_indent_sub(out, 2); -} - -static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - bch2_move_stats_to_text(out, ctxt->stats); - printbuf_indent_add(out, 2); - - prt_printf(out, "reads: ios %u/%u sectors %u/%u\n", - atomic_read(&ctxt->read_ios), - c->opts.move_ios_in_flight, - atomic_read(&ctxt->read_sectors), - c->opts.move_bytes_in_flight >> 9); - - prt_printf(out, "writes: ios %u/%u sectors %u/%u\n", - atomic_read(&ctxt->write_ios), - c->opts.move_ios_in_flight, - atomic_read(&ctxt->write_sectors), - c->opts.move_bytes_in_flight >> 9); - - printbuf_indent_add(out, 2); - - mutex_lock(&ctxt->lock); - struct moving_io *io; - list_for_each_entry(io, &ctxt->ios, io_list) - bch2_data_update_inflight_to_text(out, &io->write); - mutex_unlock(&ctxt->lock); - - printbuf_indent_sub(out, 4); -} - -void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct moving_context *ctxt; - - mutex_lock(&c->moving_context_lock); - list_for_each_entry(ctxt, &c->moving_context_list, list) - bch2_moving_ctxt_to_text(out, c, ctxt); - mutex_unlock(&c->moving_context_lock); -} - -void bch2_fs_move_init(struct bch_fs *c) -{ - INIT_LIST_HEAD(&c->moving_context_list); - mutex_init(&c->moving_context_lock); -} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h deleted file mode 100644 
index 86b80499ac55f2..00000000000000 --- a/fs/bcachefs/move.h +++ /dev/null @@ -1,165 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_MOVE_H -#define _BCACHEFS_MOVE_H - -#include "bbpos.h" -#include "bcachefs_ioctl.h" -#include "btree_iter.h" -#include "buckets.h" -#include "data_update.h" -#include "move_types.h" - -struct bch_read_bio; - -struct moving_context { - struct btree_trans *trans; - struct list_head list; - void *fn; - - struct bch_ratelimit *rate; - struct bch_move_stats *stats; - struct write_point_specifier wp; - bool wait_on_copygc; - bool write_error; - - /* For waiting on outstanding reads and writes: */ - struct closure cl; - - struct mutex lock; - struct list_head reads; - struct list_head ios; - - /* in flight sectors: */ - atomic_t read_sectors; - atomic_t write_sectors; - atomic_t read_ios; - atomic_t write_ios; - - wait_queue_head_t wait; -}; - -#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \ -({ \ - int _ret = 0; \ - while (true) { \ - bool cond_finished = false; \ - bch2_moving_ctxt_do_pending_writes(_ctxt); \ - \ - if (_cond) \ - break; \ - bch2_trans_unlock_long((_ctxt)->trans); \ - _ret = __wait_event_timeout((_ctxt)->wait, \ - bch2_moving_ctxt_next_pending_write(_ctxt) || \ - (cond_finished = (_cond)), _timeout); \ - if (_ret || ( cond_finished)) \ - break; \ - } \ - _ret; \ -}) - -#define move_ctxt_wait_event(_ctxt, _cond) \ -do { \ - bool cond_finished = false; \ - bch2_moving_ctxt_do_pending_writes(_ctxt); \ - \ - if (_cond) \ - break; \ - bch2_trans_unlock_long((_ctxt)->trans); \ - __wait_event((_ctxt)->wait, \ - bch2_moving_ctxt_next_pending_write(_ctxt) || \ - (cond_finished = (_cond))); \ - if (cond_finished) \ - break; \ -} while (1) - -typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, - struct bch_io_opts *, struct data_update_opts *); - -extern const char * const bch2_data_ops_strs[]; - -void bch2_moving_ctxt_exit(struct moving_context *); -void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, - struct bch_ratelimit *, struct bch_move_stats *, - struct write_point_specifier, bool); -struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); -void bch2_moving_ctxt_do_pending_writes(struct moving_context *); -void bch2_moving_ctxt_flush_all(struct moving_context *); -void bch2_move_ctxt_wait_for_io(struct moving_context *); -int bch2_move_ratelimit(struct moving_context *); - -/* Inodes in different snapshots may have different IO options: */ -struct snapshot_io_opts_entry { - u32 snapshot; - struct bch_io_opts io_opts; -}; - -struct per_snapshot_io_opts { - u64 cur_inum; - struct bch_io_opts fs_io_opts; - DARRAY(struct snapshot_io_opts_entry) d; -}; - -static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c) -{ - memset(io_opts, 0, sizeof(*io_opts)); - io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts); -} - -static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts) -{ - darray_exit(&io_opts->d); -} - -int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, - struct btree_iter *, struct bkey_s_c); - -int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); - -int bch2_move_extent(struct moving_context *, - struct move_bucket *, - struct btree_iter *, - struct bkey_s_c, - struct bch_io_opts, - struct data_update_opts); - -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, - struct per_snapshot_io_opts *, struct bpos, - struct 
btree_iter *, struct bkey_s_c); - -int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, - move_pred_fn, void *, enum btree_id, unsigned); -int __bch2_move_data(struct moving_context *, - struct bbpos, - struct bbpos, - move_pred_fn, void *); -int bch2_move_data(struct bch_fs *, - struct bbpos start, - struct bbpos end, - struct bch_ratelimit *, - struct bch_move_stats *, - struct write_point_specifier, - bool, - move_pred_fn, void *); - -int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned, - struct bch_ratelimit *, struct bch_move_stats *, - struct write_point_specifier, bool, - move_pred_fn, void *); - -int bch2_evacuate_bucket(struct moving_context *, - struct move_bucket *, - struct bpos, int, - struct data_update_opts); -int bch2_data_job(struct bch_fs *, - struct bch_move_stats *, - struct bch_ioctl_data); - -void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); -void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); -void bch2_move_stats_init(struct bch_move_stats *, const char *); - -void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_move_init(struct bch_fs *); - -#endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h deleted file mode 100644 index c5c62cd600de1c..00000000000000 --- a/fs/bcachefs/move_types.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_MOVE_TYPES_H -#define _BCACHEFS_MOVE_TYPES_H - -#include "bbpos_types.h" -#include "bcachefs_ioctl.h" - -struct bch_move_stats { - char name[32]; - bool phys; - enum bch_ioctl_data_event_ret ret; - - union { - struct { - enum bch_data_type data_type; - struct bbpos pos; - }; - struct { - unsigned dev; - u64 offset; - }; - }; - - atomic64_t keys_moved; - atomic64_t keys_raced; - atomic64_t sectors_seen; - atomic64_t sectors_moved; - atomic64_t sectors_raced; - atomic64_t sectors_error_corrected; - atomic64_t sectors_error_uncorrected; -}; - -struct move_bucket_key { - struct bpos bucket; - unsigned gen; -}; - -struct move_bucket { - struct move_bucket *next; - struct rhash_head hash; - struct move_bucket_key k; - unsigned sectors; - atomic_t count; -}; - -#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c deleted file mode 100644 index 5e6de91a87630d..00000000000000 --- a/fs/bcachefs/movinggc.c +++ /dev/null @@ -1,476 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Moving/copying garbage collector - * - * Copyright 2012 Google, Inc. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "clock.h" -#include "errcode.h" -#include "error.h" -#include "lru.h" -#include "move.h" -#include "movinggc.h" -#include "trace.h" - -#include -#include -#include -#include -#include - -struct buckets_in_flight { - struct rhashtable *table; - struct move_bucket *first; - struct move_bucket *last; - size_t nr; - size_t sectors; - - DARRAY(struct move_bucket *) to_evacuate; -}; - -static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket, hash), - .key_offset = offsetof(struct move_bucket, k), - .key_len = sizeof(struct move_bucket_key), - .automatic_shrinking = true, -}; - -static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b) -{ - if (!list->first) - list->first = b; - else - list->last->next = b; - - list->last = b; - list->nr++; - list->sectors += b->sectors; -} - -static int bch2_bucket_is_movable(struct btree_trans *trans, - struct move_bucket *b, u64 time) -{ - struct bch_fs *c = trans->c; - - if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) - return 0; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_cached); - int ret = bkey_err(k); - if (ret) - return ret; - - struct bch_dev *ca = bch2_dev_bucket_tryget(c, k.k->p); - if (!ca) - goto out; - - if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) - goto out; - - if (ca->mi.state != BCH_MEMBER_STATE_rw || - !bch2_dev_is_online(ca)) - goto out; - - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - b->k.gen = a->gen; - b->sectors = bch2_bucket_sectors_dirty(*a); - u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - - ret = lru_idx && lru_idx <= time; -out: - bch2_dev_put(ca); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static void move_bucket_free(struct buckets_in_flight *list, - struct move_bucket *b) -{ - int ret = rhashtable_remove_fast(list->table, &b->hash, - bch_move_bucket_params); - BUG_ON(ret); - kfree(b); -} - -static void move_buckets_wait(struct moving_context *ctxt, - struct buckets_in_flight *list, - bool flush) -{ - struct move_bucket *i; - - while ((i = list->first)) { - if (flush) - move_ctxt_wait_event(ctxt, !atomic_read(&i->count)); - - if (atomic_read(&i->count)) - break; - - list->first = i->next; - if (!list->first) - list->last = NULL; - - list->nr--; - list->sectors -= i->sectors; - - move_bucket_free(list, i); - } - - bch2_trans_unlock_long(ctxt->trans); -} - -static bool bucket_in_flight(struct buckets_in_flight *list, - struct move_bucket_key k) -{ - return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params); -} - -static int bch2_copygc_get_buckets(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); - size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; - int ret; - - move_buckets_wait(ctxt, buckets_in_flight, false); - - ret = bch2_btree_write_buffer_tryflush(trans); - if (bch2_err_matches(ret, EROFS)) - return ret; - - if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) - return 
ret; - - ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), - lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), - 0, k, ({ - struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; - int ret2 = 0; - - saw++; - - ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)); - if (ret2 < 0) - goto err; - - if (!ret2) - not_movable++; - else if (bucket_in_flight(buckets_in_flight, b.k)) - in_flight++; - else { - struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); - ret2 = b_i ? 0 : -ENOMEM; - if (ret2) - goto err; - - *b_i = b; - - ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); - if (ret2) { - kfree(b_i); - goto err; - } - - ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, - bch_move_bucket_params); - BUG_ON(ret2); - - sectors += b.sectors; - } - - ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; -err: - ret2; - })); - - pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", - buckets_in_flight->nr, buckets_in_flight->sectors, - saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); - - return ret < 0 ? ret : 0; -} - -noinline -static int bch2_copygc(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight, - bool *did_work) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct data_update_opts data_opts = { - .btree_insert_flags = BCH_WATERMARK_copygc, - }; - u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); - u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); - int ret = 0; - - ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); - if (ret) - goto err; - - darray_for_each(buckets_in_flight->to_evacuate, i) { - if (kthread_should_stop() || freezing(current)) - break; - - struct move_bucket *b = *i; - *i = NULL; - - move_bucket_in_flight_add(buckets_in_flight, b); - - ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts); - if (ret) - goto err; - - *did_work = true; - } -err: - /* no entries in LRU btree found, or got to end: */ - if (bch2_err_matches(ret, ENOENT)) - ret = 0; - - if (ret < 0 && !bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "from bch2_move_data()"); - - sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; - sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; - trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved); - - darray_for_each(buckets_in_flight->to_evacuate, i) - if (*i) - move_bucket_free(buckets_in_flight, *i); - darray_exit(&buckets_in_flight->to_evacuate); - return ret; -} - -static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) -{ - struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); - struct bch_dev_usage usage; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - usage.buckets[i] = usage_full.d[i].buckets; - - s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * - ca->mi.bucket_size) >> 1); - s64 fragmented = 0; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (data_type_movable(i)) - fragmented += usage_full.d[i].fragmented; - - return max(0LL, fragmented_allowed - fragmented); -} - -/* - * Copygc runs when the amount of fragmented data is above some arbitrary - * threshold: - * - * The threshold at the limit - when the device is full - is the amount of space - * we reserved in bch2_recalc_capacity; we can't have more 
than that amount of - * disk space stranded due to fragmentation and store everything we have - * promised to store. - * - * But we don't want to be running copygc unnecessarily when the device still - * has plenty of free space - rather, we want copygc to smoothly run every so - * often and continually reduce the amount of fragmented space as the device - * fills up. So, we increase the threshold by half the current free space. - */ -u64 bch2_copygc_wait_amount(struct bch_fs *c) -{ - u64 wait = U64_MAX; - - guard(rcu)(); - for_each_rw_member_rcu(c, ca) - wait = min(wait, bch2_copygc_dev_wait_amount(ca)); - return wait; -} - -void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) -{ - printbuf_tabstop_push(out, 32); - prt_printf(out, "running:\t%u\n", c->copygc_running); - prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait); - prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at); - - prt_printf(out, "Currently waiting for:\t"); - prt_human_readable_u64(out, max(0LL, c->copygc_wait - - atomic64_read(&c->io_clock[WRITE].now)) << 9); - prt_newline(out); - - prt_printf(out, "Currently waiting since:\t"); - prt_human_readable_u64(out, max(0LL, - atomic64_read(&c->io_clock[WRITE].now) - - c->copygc_wait_at) << 9); - prt_newline(out); - - bch2_printbuf_make_room(out, 4096); - - struct task_struct *t; - out->atomic++; - scoped_guard(rcu) { - prt_printf(out, "Currently calculated wait:\n"); - for_each_rw_member_rcu(c, ca) { - prt_printf(out, " %s:\t", ca->name); - prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); - prt_newline(out); - } - - t = rcu_dereference(c->copygc_thread); - if (t) - get_task_struct(t); - } - --out->atomic; - - if (t) { - bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); - put_task_struct(t); - } -} - -static int bch2_copygc_thread(void *arg) -{ - struct bch_fs *c = arg; - struct moving_context ctxt; - struct bch_move_stats move_stats; - struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight buckets = {}; - u64 last, wait; - - buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL); - int ret = !buckets.table - ? -ENOMEM - : rhashtable_init(buckets.table, &bch_move_bucket_params); - bch_err_msg(c, ret, "allocating copygc buckets in flight"); - if (ret) - goto err; - - set_freezable(); - - /* - * Data move operations can't run until after check_snapshots has - * completed, and bch2_snapshot_is_ancestor() is available. 
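- * The wait below also completes if the thread is asked to stop, so
- * shutdown during recovery doesn't hang here.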
- */ - kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || - kthread_should_stop()); - - bch2_move_stats_init(&move_stats, "copygc"); - bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, - writepoint_ptr(&c->copygc_write_point), - false); - - while (!ret && !kthread_should_stop()) { - bool did_work = false; - - bch2_trans_unlock_long(ctxt.trans); - cond_resched(); - - if (!c->opts.copygc_enabled) { - move_buckets_wait(&ctxt, &buckets, true); - kthread_wait_freezable(c->opts.copygc_enabled || - kthread_should_stop()); - } - - if (unlikely(freezing(current))) { - move_buckets_wait(&ctxt, &buckets, true); - __refrigerator(false); - continue; - } - - last = atomic64_read(&clock->now); - wait = bch2_copygc_wait_amount(c); - - if (wait > clock->max_slop) { - c->copygc_wait_at = last; - c->copygc_wait = last + wait; - move_buckets_wait(&ctxt, &buckets, true); - trace_and_count(c, copygc_wait, c, wait, last + wait); - bch2_kthread_io_clock_wait(clock, last + wait, - MAX_SCHEDULE_TIMEOUT); - continue; - } - - c->copygc_wait = 0; - - c->copygc_running = true; - ret = bch2_copygc(&ctxt, &buckets, &did_work); - c->copygc_running = false; - - wake_up(&c->copygc_running_wq); - - if (!wait && !did_work) { - u64 min_member_capacity = bch2_min_rw_member_capacity(c); - - if (min_member_capacity == U64_MAX) - min_member_capacity = 128 * 2048; - - move_buckets_wait(&ctxt, &buckets, true); - bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), - MAX_SCHEDULE_TIMEOUT); - } - } - - move_buckets_wait(&ctxt, &buckets, true); - rhashtable_destroy(buckets.table); - bch2_moving_ctxt_exit(&ctxt); - bch2_move_stats_exit(&move_stats, c); -err: - kfree(buckets.table); - return ret; -} - -void bch2_copygc_stop(struct bch_fs *c) -{ - if (c->copygc_thread) { - kthread_stop(c->copygc_thread); - put_task_struct(c->copygc_thread); - } - c->copygc_thread = NULL; -} - -int bch2_copygc_start(struct bch_fs *c) -{ - struct task_struct *t; - int ret; - - if (c->copygc_thread) - return 0; - - if (c->opts.nochanges) - return 0; - - if (bch2_fs_init_fault("copygc_start")) - return -ENOMEM; - - t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); - ret = PTR_ERR_OR_ZERO(t); - bch_err_msg(c, ret, "creating copygc thread"); - if (ret) - return ret; - - get_task_struct(t); - - c->copygc_thread = t; - wake_up_process(c->copygc_thread); - - return 0; -} - -void bch2_fs_copygc_init(struct bch_fs *c) -{ - init_waitqueue_head(&c->copygc_running_wq); - c->copygc_running = false; -} diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h deleted file mode 100644 index f615910d6f9836..00000000000000 --- a/fs/bcachefs/movinggc.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_MOVINGGC_H -#define _BCACHEFS_MOVINGGC_H - -u64 bch2_copygc_wait_amount(struct bch_fs *); -void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); - -static inline void bch2_copygc_wakeup(struct bch_fs *c) -{ - guard(rcu)(); - struct task_struct *p = rcu_dereference(c->copygc_thread); - if (p) - wake_up_process(p); -} - -void bch2_copygc_stop(struct bch_fs *); -int bch2_copygc_start(struct bch_fs *); -void bch2_fs_copygc_init(struct bch_fs *); - -#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c deleted file mode 100644 index c3f87c59922d1a..00000000000000 --- a/fs/bcachefs/namei.c +++ /dev/null @@ -1,1034 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "acl.h" -#include "btree_update.h" 
-#include "dirent.h" -#include "inode.h" -#include "namei.h" -#include "subvolume.h" -#include "xattr.h" - -#include - -static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode) -{ - return (subvol_inum) { - .subvol = inode->bi_parent_subvol ?: inum.subvol, - .inum = inode->bi_dir, - }; -} - -static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) -{ - return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; -} - -int bch2_create_trans(struct btree_trans *trans, - subvol_inum dir, - struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *new_inode, - const struct qstr *name, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct posix_acl *default_acl, - struct posix_acl *acl, - subvol_inum snapshot_src, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter inode_iter = {}; - subvol_inum new_inum = dir; - u64 now = bch2_current_time(c); - u64 cpu = raw_smp_processor_id(); - u64 dir_target; - u32 snapshot; - unsigned dir_type = mode_to_type(mode); - int ret; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, - BTREE_ITER_intent|BTREE_ITER_with_updates); - if (ret) - goto err; - - if (!(flags & BCH_CREATE_SNAPSHOT)) { - /* Normal create path - allocate a new inode: */ - bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u); - - if (flags & BCH_CREATE_TMPFILE) - new_inode->bi_flags |= BCH_INODE_unlinked; - - ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); - if (ret) - goto err; - - snapshot_src = (subvol_inum) { 0 }; - } else { - /* - * Creating a snapshot - we're not allocating a new inode, but - * we do have to lookup the root inode of the subvolume we're - * snapshotting and update it (in the new snapshot): - */ - - if (!snapshot_src.inum) { - /* Inode wasn't specified, just snapshot: */ - struct bch_subvolume s; - ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s); - if (ret) - goto err; - - snapshot_src.inum = le64_to_cpu(s.inode); - } - - ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, - BTREE_ITER_intent); - if (ret) - goto err; - - if (new_inode->bi_subvol != snapshot_src.subvol) { - /* Not a subvolume root: */ - ret = -EINVAL; - goto err; - } - - /* - * If we're not root, we have to own the subvolume being - * snapshotted: - */ - if (uid && new_inode->bi_uid != uid) { - ret = -EPERM; - goto err; - } - - flags |= BCH_CREATE_SUBVOL; - } - - new_inum.inum = new_inode->bi_inum; - dir_target = new_inode->bi_inum; - - if (flags & BCH_CREATE_SUBVOL) { - u32 new_subvol, dir_snapshot; - - ret = bch2_subvolume_create(trans, new_inode->bi_inum, - dir.subvol, - snapshot_src.subvol, - &new_subvol, &snapshot, - (flags & BCH_CREATE_SNAPSHOT_RO) != 0); - if (ret) - goto err; - - new_inode->bi_parent_subvol = dir.subvol; - new_inode->bi_subvol = new_subvol; - new_inum.subvol = new_subvol; - dir_target = new_subvol; - dir_type = DT_SUBVOL; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot); - ret = bch2_btree_iter_traverse(trans, &dir_iter); - if (ret) - goto err; - } - - if (!(flags & BCH_CREATE_SNAPSHOT)) { - if (default_acl) { - ret = bch2_set_acl_trans(trans, new_inum, new_inode, - default_acl, ACL_TYPE_DEFAULT); - if (ret) - goto err; - } - - if (acl) { - ret = bch2_set_acl_trans(trans, new_inum, new_inode, - acl, 
ACL_TYPE_ACCESS); - if (ret) - goto err; - } - } - - if (!(flags & BCH_CREATE_TMPFILE)) { - struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); - u64 dir_offset; - - if (is_subdir_for_nlink(new_inode)) - dir_u->bi_nlink++; - dir_u->bi_mtime = dir_u->bi_ctime = now; - - ret = bch2_dirent_create(trans, dir, &dir_hash, - dir_type, - name, - dir_target, - &dir_offset, - STR_HASH_must_create|BTREE_ITER_with_updates) ?: - bch2_inode_write(trans, &dir_iter, dir_u); - if (ret) - goto err; - - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; - } - - if (S_ISDIR(mode)) { - ret = bch2_maybe_propagate_has_case_insensitive(trans, - (subvol_inum) { - new_inode->bi_subvol ?: dir.subvol, - new_inode->bi_inum }, - new_inode); - if (ret) - goto err; - } - - if (S_ISDIR(mode) && - !new_inode->bi_subvol) - new_inode->bi_depth = dir_u->bi_depth + 1; - - inode_iter.flags &= ~BTREE_ITER_all_snapshots; - bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot); - - ret = bch2_btree_iter_traverse(trans, &inode_iter) ?: - bch2_inode_write(trans, &inode_iter, new_inode); -err: - bch2_trans_iter_exit(trans, &inode_iter); - bch2_trans_iter_exit(trans, &dir_iter); - return ret; -} - -int bch2_link_trans(struct btree_trans *trans, - subvol_inum dir, struct bch_inode_unpacked *dir_u, - subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct qstr *name) -{ - struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter inode_iter = {}; - struct bch_hash_info dir_hash; - u64 now = bch2_current_time(c); - u64 dir_offset = 0; - int ret; - - if (dir.subvol != inum.subvol) - return -EXDEV; - - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); - if (ret) - return ret; - - inode_u->bi_ctime = now; - ret = bch2_inode_nlink_inc(inode_u); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); - if (ret) - goto err; - - if (bch2_reinherit_attrs(inode_u, dir_u)) { - ret = -EXDEV; - goto err; - } - - dir_u->bi_mtime = dir_u->bi_ctime = now; - - dir_hash = bch2_hash_info_init(c, dir_u); - - ret = bch2_dirent_create(trans, dir, &dir_hash, - mode_to_type(inode_u->bi_mode), - name, inum.inum, - &dir_offset, - STR_HASH_must_create); - if (ret) - goto err; - - inode_u->bi_dir = dir.inum; - inode_u->bi_dir_offset = dir_offset; - - ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: - bch2_inode_write(trans, &inode_iter, inode_u); -err: - bch2_trans_iter_exit(trans, &dir_iter); - bch2_trans_iter_exit(trans, &inode_iter); - return ret; -} - -int bch2_unlink_trans(struct btree_trans *trans, - subvol_inum dir, - struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *inode_u, - const struct qstr *name, - bool deleting_subvol) -{ - struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter dirent_iter = {}; - struct btree_iter inode_iter = {}; - struct bch_hash_info dir_hash; - subvol_inum inum; - u64 now = bch2_current_time(c); - struct bkey_s_c k; - int ret; - - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); - if (ret) - goto err; - - dir_hash = bch2_hash_info_init(c, dir_u); - - ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_intent); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, - BTREE_ITER_intent); - if (ret) - goto err; - - if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) { - ret = bch2_empty_dir_trans(trans, inum); - if (ret) - goto err; - } - - if 
(deleting_subvol && !inode_u->bi_subvol) { - ret = bch_err_throw(c, ENOENT_not_subvol); - goto err; - } - - if (inode_u->bi_subvol) { - /* Recursive subvolume destroy not allowed (yet?) */ - ret = bch2_subvol_has_children(trans, inode_u->bi_subvol); - if (ret) - goto err; - } - - if (deleting_subvol || inode_u->bi_subvol) { - ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); - if (ret) - goto err; - - k = bch2_btree_iter_peek_slot(trans, &dirent_iter); - ret = bkey_err(k); - if (ret) - goto err; - - /* - * If we're deleting a subvolume, we need to really delete the - * dirent, not just emit a whiteout in the current snapshot: - */ - bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot); - ret = bch2_btree_iter_traverse(trans, &dirent_iter); - if (ret) - goto err; - } else { - bch2_inode_nlink_dec(trans, inode_u); - } - - if (inode_u->bi_dir == dirent_iter.pos.inode && - inode_u->bi_dir_offset == dirent_iter.pos.offset) { - inode_u->bi_dir = 0; - inode_u->bi_dir_offset = 0; - } - - dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; - dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); - - ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash, &dirent_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_inode_write(trans, &dir_iter, dir_u) ?: - bch2_inode_write(trans, &inode_iter, inode_u); -err: - bch2_trans_iter_exit(trans, &inode_iter); - bch2_trans_iter_exit(trans, &dirent_iter); - bch2_trans_iter_exit(trans, &dir_iter); - return ret; -} - -bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, - struct bch_inode_unpacked *src_u) -{ - u64 src, dst; - unsigned id; - bool ret = false; - - for (id = 0; id < Inode_opt_nr; id++) { - if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold) - continue; - - /* Skip attributes that were explicitly set on this inode */ - if (dst_u->bi_fields_set & (1 << id)) - continue; - - src = bch2_inode_opt_get(src_u, id); - dst = bch2_inode_opt_get(dst_u, id); - - if (src == dst) - continue; - - bch2_inode_opt_set(dst_u, id, src); - ret = true; - } - - return ret; -} - -static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) -{ - struct btree_iter iter; - struct bkey_i_subvolume *s = - bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_cached, subvolume); - int ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - s->v.fs_path_parent = cpu_to_le32(new_parent); - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -int bch2_rename_trans(struct btree_trans *trans, - subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, - subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, - struct bch_inode_unpacked *src_inode_u, - struct bch_inode_unpacked *dst_inode_u, - const struct qstr *src_name, - const struct qstr *dst_name, - enum bch_rename_mode mode) -{ - struct bch_fs *c = trans->c; - struct btree_iter src_dir_iter = {}; - struct btree_iter dst_dir_iter = {}; - struct btree_iter src_inode_iter = {}; - struct btree_iter dst_inode_iter = {}; - struct bch_hash_info src_hash, dst_hash; - subvol_inum src_inum, dst_inum; - u64 src_offset, dst_offset; - u64 now = bch2_current_time(c); - int ret; - - ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, - BTREE_ITER_intent); - if (ret) - goto err; - - src_hash = bch2_hash_info_init(c, src_dir_u); - - if (!subvol_inum_eq(dst_dir, src_dir)) { - ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, - BTREE_ITER_intent); - if (ret) - goto err; - - dst_hash = 
bch2_hash_info_init(c, dst_dir_u); - } else { - dst_dir_u = src_dir_u; - dst_hash = src_hash; - } - - ret = bch2_dirent_rename(trans, - src_dir, &src_hash, - dst_dir, &dst_hash, - src_name, &src_inum, &src_offset, - dst_name, &dst_inum, &dst_offset, - mode); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, - BTREE_ITER_intent); - if (ret) - goto err; - - if (dst_inum.inum) { - ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, - BTREE_ITER_intent); - if (ret) - goto err; - } - - if (src_inode_u->bi_subvol && - dst_dir.subvol != src_inode_u->bi_parent_subvol) { - ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol); - if (ret) - goto err; - } - - if (mode == BCH_RENAME_EXCHANGE && - dst_inode_u->bi_subvol && - src_dir.subvol != dst_inode_u->bi_parent_subvol) { - ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol); - if (ret) - goto err; - } - - /* Can't move across subvolumes, unless it's a subvolume root: */ - if (src_dir.subvol != dst_dir.subvol && - (!src_inode_u->bi_subvol || - (dst_inum.inum && !dst_inode_u->bi_subvol))) { - ret = -EXDEV; - goto err; - } - - if (src_inode_u->bi_parent_subvol) - src_inode_u->bi_parent_subvol = dst_dir.subvol; - - if ((mode == BCH_RENAME_EXCHANGE) && - dst_inode_u->bi_parent_subvol) - dst_inode_u->bi_parent_subvol = src_dir.subvol; - - src_inode_u->bi_dir = dst_dir_u->bi_inum; - src_inode_u->bi_dir_offset = dst_offset; - - if (mode == BCH_RENAME_EXCHANGE) { - dst_inode_u->bi_dir = src_dir_u->bi_inum; - dst_inode_u->bi_dir_offset = src_offset; - } - - if (mode == BCH_RENAME_OVERWRITE && - dst_inode_u->bi_dir == dst_dir_u->bi_inum && - dst_inode_u->bi_dir_offset == src_offset) { - dst_inode_u->bi_dir = 0; - dst_inode_u->bi_dir_offset = 0; - } - - if (mode == BCH_RENAME_OVERWRITE) { - if (S_ISDIR(src_inode_u->bi_mode) != - S_ISDIR(dst_inode_u->bi_mode)) { - ret = -ENOTDIR; - goto err; - } - - if (S_ISDIR(dst_inode_u->bi_mode)) { - ret = bch2_empty_dir_trans(trans, dst_inum); - if (ret) - goto err; - } - } - - if (!subvol_inum_eq(dst_dir, src_dir)) { - if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && - S_ISDIR(src_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } - - if (mode == BCH_RENAME_EXCHANGE && - bch2_reinherit_attrs(dst_inode_u, src_dir_u) && - S_ISDIR(dst_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } - - ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?: - (mode == BCH_RENAME_EXCHANGE - ? 
bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u) - : 0); - if (ret) - goto err; - - if (is_subdir_for_nlink(src_inode_u)) { - src_dir_u->bi_nlink--; - dst_dir_u->bi_nlink++; - } - - if (S_ISDIR(src_inode_u->bi_mode) && - !src_inode_u->bi_subvol) - src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; - - if (mode == BCH_RENAME_EXCHANGE && - S_ISDIR(dst_inode_u->bi_mode) && - !dst_inode_u->bi_subvol) - dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; - } - - if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { - dst_dir_u->bi_nlink--; - src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; - } - - if (mode == BCH_RENAME_OVERWRITE) - bch2_inode_nlink_dec(trans, dst_inode_u); - - src_dir_u->bi_mtime = now; - src_dir_u->bi_ctime = now; - - if (src_dir.inum != dst_dir.inum) { - dst_dir_u->bi_mtime = now; - dst_dir_u->bi_ctime = now; - } - - src_inode_u->bi_ctime = now; - - if (dst_inum.inum) - dst_inode_u->bi_ctime = now; - - ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: - (src_dir.inum != dst_dir.inum - ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) - : 0) ?: - bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: - (dst_inum.inum - ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) - : 0); -err: - bch2_trans_iter_exit(trans, &dst_inode_iter); - bch2_trans_iter_exit(trans, &src_inode_iter); - bch2_trans_iter_exit(trans, &dst_dir_iter); - bch2_trans_iter_exit(trans, &src_dir_iter); - return ret; -} - -/* inum_to_path */ - -static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) -{ - bch2_printbuf_make_room(out, n); - - unsigned can_print = min(n, printbuf_remaining(out)); - - b += n; - - for (unsigned i = 0; i < can_print; i++) - out->buf[out->pos++] = *((char *) --b); - - printbuf_nul_terminate(out); -} - -static inline void prt_str_reversed(struct printbuf *out, const char *s) -{ - prt_bytes_reversed(out, s, strlen(s)); -} - -static inline void reverse_bytes(void *b, size_t n) -{ - char *e = b + n, *s = b; - - while (s < e) { - --e; - swap(*s, *e); - s++; - } -} - -static int __bch2_inum_to_path(struct btree_trans *trans, - u32 subvol, u64 inum, u32 snapshot, - struct printbuf *path) -{ - unsigned orig_pos = path->pos; - int ret = 0; - DARRAY(subvol_inum) inums = {}; - - if (!snapshot) { - ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); - if (ret) - goto disconnected; - } - - while (true) { - subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum }; - - if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) { - prt_str_reversed(path, "(loop)"); - break; - } - - ret = darray_push(&inums, n); - if (ret) - goto err; - - struct bch_inode_unpacked inode; - ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); - if (ret) - goto disconnected; - - if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL && - inode.bi_inum == BCACHEFS_ROOT_INO) - break; - - if (!inode.bi_dir && !inode.bi_dir_offset) { - ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer); - goto disconnected; - } - - inum = inode.bi_dir; - if (inode.bi_parent_subvol) { - subvol = inode.bi_parent_subvol; - ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot); - if (ret) - goto disconnected; - } - - struct btree_iter d_iter; - struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, - BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), - 0, dirent); - ret = bkey_err(d.s_c); - if (ret) - goto disconnected; - - struct qstr dirent_name = bch2_dirent_get_name(d); - - 
prt_bytes_reversed(path, dirent_name.name, dirent_name.len); - - prt_char(path, '/'); - - bch2_trans_iter_exit(trans, &d_iter); - } - - if (orig_pos == path->pos) - prt_char(path, '/'); -out: - ret = path->allocation_failure ? -ENOMEM : 0; - if (ret) - goto err; - - reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); - darray_exit(&inums); - return 0; -err: - darray_exit(&inums); - return ret; -disconnected: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - prt_str_reversed(path, "(disconnected)"); - goto out; -} - -int bch2_inum_to_path(struct btree_trans *trans, - subvol_inum inum, - struct printbuf *path) -{ - return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path); -} - -int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot, - snapshot_id_list *snapshot_overwrites, - struct printbuf *path) -{ - return __bch2_inum_to_path(trans, 0, inum, snapshot, path); -} - -/* fsck */ - -static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - bool in_fsck) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = {}; - int ret = 0; - - if (inode_points_to_dirent(target, d)) - return 0; - - if (!bch2_inode_has_backpointer(target)) { - fsck_err_on(S_ISDIR(target->bi_mode), - trans, inode_dir_missing_backpointer, - "directory with missing backpointer\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - fsck_err_on(target->bi_flags & BCH_INODE_unlinked, - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - target->bi_flags &= ~BCH_INODE_unlinked; - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - return __bch2_fsck_write_inode(trans, target); - } - - struct bkey_s_c_dirent bp_dirent = - bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, - SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), - 0, dirent); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; - - if (!backpointer_exists) { - if (fsck_err(trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - } - } else { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - - if (S_ISDIR(target->bi_mode) || target->bi_subvol) { - /* - * XXX: verify connectivity of the other dirent - * up to the root before removing this one - * - * Additionally, bch2_lookup would need to cope with the - * dirent it found being removed - or should we remove - * the other one, even though the inode points to it? - */ - if (in_fsck) { - if (fsck_err(trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) - ret = bch2_fsck_remove_dirent(trans, d.k->p); - } else { - bch2_fs_inconsistent(c, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? "directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf); - } - - goto out; - } else { - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(!target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; - } - } - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -int __bch2_check_dirent_target(struct btree_trans *trans, - struct btree_iter *dirent_iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - bool in_fsck) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); - if (ret) - goto err; - - if (fsck_err_on(d.v->d_type != inode_d_type(target), - trans, dirent_d_type_wrong, - "incorrect d_type: got %s, should be %s:\n%s", - bch2_d_type_str(d.v->d_type), - bch2_d_type_str(inode_d_type(target)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = inode_d_type(target); - if (n->v.d_type == DT_SUBVOL) { - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); - } else { - n->v.d_inum = cpu_to_le64(target->bi_inum); - } - - ret = bch2_trans_update(trans, dirent_iter, &n->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto err; - } -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -/* - * BCH_INODE_has_case_insensitive: - * We have to track whether directories have any descendent directory that is - * casefolded - for overlayfs: - */ - -static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum) -{ - struct btree_iter iter = {}; - int ret = 0; - - while (true) { - struct bch_inode_unpacked inode; - ret = bch2_inode_peek(trans, &iter, &inode, inum, - BTREE_ITER_intent|BTREE_ITER_with_updates); - if (ret) - break; - - if (inode.bi_flags & BCH_INODE_has_case_insensitive) - break; - - inode.bi_flags |= BCH_INODE_has_case_insensitive; - ret = bch2_inode_write(trans, &iter, &inode); - if (ret) - break; - - bch2_trans_iter_exit(trans, &iter); - if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) - break; - - inum = parent_inum(inum, &inode); - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - if (!bch2_inode_casefold(trans->c, inode)) - return 0; - - inode->bi_flags |= BCH_INODE_has_case_insensitive; - - return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode)); -} - -int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - 
snapshot_id_list *snapshot_overwrites, - bool *do_update) -{ - struct printbuf buf = PRINTBUF; - bool repairing_parents = false; - int ret = 0; - - if (!S_ISDIR(inode->bi_mode)) { - /* - * Old versions set bi_casefold for non dirs, but that's - * unnecessary and wasteful - */ - if (inode->bi_casefold) { - inode->bi_casefold = 0; - *do_update = true; - } - return 0; - } - - if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive) - return 0; - - if (bch2_inode_casefold(trans->c, inode) && - !(inode->bi_flags & BCH_INODE_has_case_insensitive)) { - prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ", - inode->bi_inum, inode->bi_snapshot); - - ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, - snapshot_overwrites, &buf); - if (ret) - goto err; - - if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) { - inode->bi_flags |= BCH_INODE_has_case_insensitive; - *do_update = true; - } - } - - if (!(inode->bi_flags & BCH_INODE_has_case_insensitive)) - goto out; - - struct bch_inode_unpacked dir = *inode; - u32 snapshot = dir.bi_snapshot; - - while (!(dir.bi_inum == BCACHEFS_ROOT_INO && - dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { - if (dir.bi_parent_subvol) { - ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot); - if (ret) - goto err; - - snapshot_overwrites = NULL; - } - - ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0); - if (ret) - goto err; - - if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) { - prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n"); - - ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, - snapshot_overwrites, &buf); - if (ret) - goto err; - - if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) { - dir.bi_flags |= BCH_INODE_has_case_insensitive; - ret = __bch2_fsck_write_inode(trans, &dir); - if (ret) - goto err; - } - } - - /* - * We only need to check the first parent, unless we find an - * inconsistency - */ - if (!repairing_parents) - break; - } -out: -err: -fsck_err: - printbuf_exit(&buf); - if (ret) - return ret; - - if (repairing_parents) { - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - } - - return 0; -} diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h deleted file mode 100644 index ae6ebc2d078504..00000000000000 --- a/fs/bcachefs/namei.h +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_NAMEI_H -#define _BCACHEFS_NAMEI_H - -#include "dirent.h" - -struct posix_acl; - -#define BCH_CREATE_TMPFILE (1U << 0) -#define BCH_CREATE_SUBVOL (1U << 1) -#define BCH_CREATE_SNAPSHOT (1U << 2) -#define BCH_CREATE_SNAPSHOT_RO (1U << 3) - -int bch2_create_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - struct bch_inode_unpacked *, - const struct qstr *, - uid_t, gid_t, umode_t, dev_t, - struct posix_acl *, - struct posix_acl *, - subvol_inum, unsigned); - -int bch2_link_trans(struct btree_trans *, - subvol_inum, struct bch_inode_unpacked *, - subvol_inum, struct bch_inode_unpacked *, - const struct qstr *); - -int bch2_unlink_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - struct bch_inode_unpacked *, - const struct qstr *, bool); - -int bch2_rename_trans(struct btree_trans *, - subvol_inum, struct bch_inode_unpacked *, - subvol_inum, struct bch_inode_unpacked *, - struct bch_inode_unpacked *, - 
struct bch_inode_unpacked *, - const struct qstr *, - const struct qstr *, - enum bch_rename_mode); - -bool bch2_reinherit_attrs(struct bch_inode_unpacked *, - struct bch_inode_unpacked *); - -int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32, - snapshot_id_list *, struct printbuf *); - -int __bch2_check_dirent_target(struct btree_trans *, - struct btree_iter *, - struct bkey_s_c_dirent, - struct bch_inode_unpacked *, bool); - -static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - -static inline int bch2_check_dirent_target(struct btree_trans *trans, - struct btree_iter *dirent_iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - bool in_fsck) -{ - if (likely(inode_points_to_dirent(target, d) && - d.v->d_type == inode_d_type(target))) - return 0; - - return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); -} - -int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *); -int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *, - snapshot_id_list *, bool *); - -#endif /* _BCACHEFS_NAMEI_H */ diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c deleted file mode 100644 index 962218fa68ec01..00000000000000 --- a/fs/bcachefs/nocow_locking.c +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_methods.h" -#include "nocow_locking.h" -#include "util.h" - -#include - -bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - unsigned i; - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (l->b[i] == dev_bucket && atomic_read(&l->l[i])) - return true; - return false; -} - -#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0) - -void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - int lock_val = flags ? 1 : -1; - unsigned i; - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (l->b[i] == dev_bucket) { - int v = atomic_sub_return(lock_val, &l->l[i]); - - BUG_ON(v && sign(v) != lock_val); - if (!v) - closure_wake_up(&l->wait); - return; - } - - BUG(); -} - -bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, - u64 dev_bucket, int flags) -{ - int v, lock_val = flags ? 1 : -1; - unsigned i; - - spin_lock(&l->lock); - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (l->b[i] == dev_bucket) - goto got_entry; - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (!atomic_read(&l->l[i])) { - l->b[i] = dev_bucket; - goto take_lock; - } -fail: - spin_unlock(&l->lock); - return false; -got_entry: - v = atomic_read(&l->l[i]); - if (lock_val > 0 ? v < 0 : v > 0) - goto fail; -take_lock: - v = atomic_read(&l->l[i]); - /* Overflow? 
*/ - if (v && sign(v + lock_val) != sign(v)) - goto fail; - - atomic_add(lock_val, &l->l[i]); - spin_unlock(&l->lock); - return true; -} - -void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct nocow_lock_bucket *l, - u64 dev_bucket, int flags) -{ - if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { - struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); - u64 start_time = local_clock(); - - __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); - bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); - } -} - -void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) - -{ - unsigned i, nr_zero = 0; - struct nocow_lock_bucket *l; - - for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) { - unsigned v = 0; - - for (i = 0; i < ARRAY_SIZE(l->l); i++) - v |= atomic_read(&l->l[i]); - - if (!v) { - nr_zero++; - continue; - } - - if (nr_zero) - prt_printf(out, "(%u empty entries)\n", nr_zero); - nr_zero = 0; - - for (i = 0; i < ARRAY_SIZE(l->l); i++) { - int v = atomic_read(&l->l[i]); - if (v) { - bch2_bpos_to_text(out, u64_to_bucket(l->b[i])); - prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v)); - } - } - prt_newline(out); - } - - if (nr_zero) - prt_printf(out, "(%u empty entries)\n", nr_zero); -} - -void bch2_fs_nocow_locking_exit(struct bch_fs *c) -{ - struct bucket_nocow_lock_table *t = &c->nocow_locks; - - for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) - for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++) - BUG_ON(atomic_read(&l->l[j])); -} - -void bch2_fs_nocow_locking_init_early(struct bch_fs *c) -{ - struct bucket_nocow_lock_table *t = &c->nocow_locks; - - for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) - spin_lock_init(&l->lock); -} diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h deleted file mode 100644 index 48b8a003c0d25a..00000000000000 --- a/fs/bcachefs/nocow_locking.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_NOCOW_LOCKING_H -#define _BCACHEFS_NOCOW_LOCKING_H - -#include "bcachefs.h" -#include "alloc_background.h" -#include "nocow_locking_types.h" - -#include - -static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t, - u64 dev_bucket) -{ - unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); - - return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); -} - -#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) - -bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos); -void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int); -bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int); -void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, - struct nocow_lock_bucket *, u64, int); - -static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - - __bch2_bucket_nocow_lock(t, l, dev_bucket, flags); -} - -static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - - return __bch2_bucket_nocow_trylock(l, dev_bucket, flags); -} - -void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); - -void 
bch2_fs_nocow_locking_exit(struct bch_fs *); -void bch2_fs_nocow_locking_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h deleted file mode 100644 index bd12bf6779241f..00000000000000 --- a/fs/bcachefs/nocow_locking_types.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H -#define _BCACHEFS_NOCOW_LOCKING_TYPES_H - -#define BUCKET_NOCOW_LOCKS_BITS 10 -#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) - -struct nocow_lock_bucket { - struct closure_waitlist wait; - spinlock_t lock; - u64 b[4]; - atomic_t l[4]; -} __aligned(SMP_CACHE_BYTES); - -struct bucket_nocow_lock_table { - struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS]; -}; - -#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */ - diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c deleted file mode 100644 index b1cf88905b816e..00000000000000 --- a/fs/bcachefs/opts.c +++ /dev/null @@ -1,844 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include - -#include "bcachefs.h" -#include "compress.h" -#include "disk_groups.h" -#include "error.h" -#include "movinggc.h" -#include "opts.h" -#include "rebalance.h" -#include "recovery_passes.h" -#include "super-io.h" -#include "util.h" - -#define x(t, n, ...) [n] = #t, - -const char * const bch2_error_actions[] = { - BCH_ERROR_ACTIONS() - NULL -}; - -const char * const bch2_degraded_actions[] = { - BCH_DEGRADED_ACTIONS() - NULL -}; - -const char * const bch2_fsck_fix_opts[] = { - BCH_FIX_ERRORS_OPTS() - NULL -}; - -const char * const bch2_version_upgrade_opts[] = { - BCH_VERSION_UPGRADE_OPTS() - NULL -}; - -const char * const bch2_sb_features[] = { - BCH_SB_FEATURES() - NULL -}; - -const char * const bch2_sb_compat[] = { - BCH_SB_COMPAT() - NULL -}; - -const char * const __bch2_btree_ids[] = { - BCH_BTREE_IDS() - NULL -}; - -const char * const __bch2_csum_types[] = { - BCH_CSUM_TYPES() - NULL -}; - -const char * const __bch2_csum_opts[] = { - BCH_CSUM_OPTS() - NULL -}; - -const char * const __bch2_compression_types[] = { - BCH_COMPRESSION_TYPES() - NULL -}; - -const char * const bch2_compression_opts[] = { - BCH_COMPRESSION_OPTS() - NULL -}; - -const char * const __bch2_str_hash_types[] = { - BCH_STR_HASH_TYPES() - NULL -}; - -const char * const bch2_str_hash_opts[] = { - BCH_STR_HASH_OPTS() - NULL -}; - -const char * const __bch2_data_types[] = { - BCH_DATA_TYPES() - NULL -}; - -const char * const bch2_member_states[] = { - BCH_MEMBER_STATES() - NULL -}; - -static const char * const __bch2_jset_entry_types[] = { - BCH_JSET_ENTRY_TYPES() - NULL -}; - -static const char * const __bch2_fs_usage_types[] = { - BCH_FS_USAGE_TYPES() - NULL -}; - -#undef x - -static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], - unsigned nr, const char *type, unsigned idx) -{ - if (idx < nr) - prt_str(out, opts[idx]); - else - prt_printf(out, "(unknown %s %u)", type, idx); -} - -#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \ -void bch2_prt_##name(struct printbuf *out, type t) \ -{ \ - prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\ -} - -PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); -PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); -PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); -PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); -PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); 
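-/*
- * Each invocation expands to a bounds-checked printer; e.g. the
- * csum_type line above becomes roughly:
- *
- *	void bch2_prt_csum_type(struct printbuf *out, enum bch_csum_type t)
- *	{
- *		prt_str_opt_boundscheck(out, __bch2_csum_types,
- *					ARRAY_SIZE(__bch2_csum_types) - 1,
- *					"csum_type", t);
- *	}
- */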
-PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); -PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); - -static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, - struct printbuf *err) -{ - if (!val) { - *res = FSCK_FIX_yes; - } else { - int ret = match_string(bch2_fsck_fix_opts, -1, val); - - if (ret < 0 && err) - prt_str(err, "fix_errors: invalid selection"); - if (ret < 0) - return ret; - *res = ret; - } - - return 0; -} - -static void bch2_opt_fix_errors_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) -{ - prt_str(out, bch2_fsck_fix_opts[v]); -} - -#define bch2_opt_fix_errors (struct bch_opt_fn) { \ - .parse = bch2_opt_fix_errors_parse, \ - .to_text = bch2_opt_fix_errors_to_text, \ -} - -const char * const bch2_d_types[BCH_DT_MAX] = { - [DT_UNKNOWN] = "unknown", - [DT_FIFO] = "fifo", - [DT_CHR] = "chr", - [DT_DIR] = "dir", - [DT_BLK] = "blk", - [DT_REG] = "reg", - [DT_LNK] = "lnk", - [DT_SOCK] = "sock", - [DT_WHT] = "whiteout", - [DT_SUBVOL] = "subvol", -}; - -void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) -{ -#define x(_name, ...) \ - if (opt_defined(src, _name)) \ - opt_set(*dst, _name, src._name); - - BCH_OPTS() -#undef x -} - -bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) -{ - switch (id) { -#define x(_name, ...) \ - case Opt_##_name: \ - return opt_defined(*opts, _name); - BCH_OPTS() -#undef x - default: - BUG(); - } -} - -u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) -{ - switch (id) { -#define x(_name, ...) \ - case Opt_##_name: \ - return opts->_name; - BCH_OPTS() -#undef x - default: - BUG(); - } -} - -void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) -{ - switch (id) { -#define x(_name, ...) \ - case Opt_##_name: \ - opt_set(*opts, _name, v); \ - break; - BCH_OPTS() -#undef x - default: - BUG(); - } -} - -/* dummy option, for options that aren't stored in the superblock */ -typedef u64 (*sb_opt_get_fn)(const struct bch_sb *); -typedef void (*sb_opt_set_fn)(struct bch_sb *, u64); -typedef u64 (*member_opt_get_fn)(const struct bch_member *); -typedef void (*member_opt_set_fn)(struct bch_member *, u64); - -__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL; -__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL; -__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL; -__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL; - -#define type_compatible_or_null(_p, _type) \ - __builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL) - -const struct bch_option bch2_opt_table[] = { -#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 -#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ - .min = _min, .max = _max -#define OPT_STR(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = ARRAY_SIZE(_choices) - 1, \ - .choices = _choices -#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = U64_MAX, \ - .choices = _choices -#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \ - .choices = _choices -#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn - -#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ - [Opt_##_name] = { \ - .attr.name = #_name, \ - .attr.mode = (_flags) & OPT_RUNTIME ? 
0644 : 0444, \ - .flags = _flags, \ - .hint = _hint, \ - .help = _help, \ - .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \ - .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \ - .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \ - .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\ - _type \ - }, - - BCH_OPTS() -#undef x -}; - -int bch2_opt_lookup(const char *name) -{ - const struct bch_option *i; - - for (i = bch2_opt_table; - i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); - i++) - if (!strcmp(name, i->attr.name)) - return i - bch2_opt_table; - - return -1; -} - -struct opt_synonym { - const char *s1, *s2; -}; - -static const struct opt_synonym bch2_opt_synonyms[] = { - { "quota", "usrquota" }, -}; - -static int bch2_mount_opt_lookup(const char *name) -{ - const struct opt_synonym *i; - - for (i = bch2_opt_synonyms; - i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms); - i++) - if (!strcmp(name, i->s1)) - name = i->s2; - - return bch2_opt_lookup(name); -} - -struct opt_val_synonym { - const char *opt, *v1, *v2; -}; - -static const struct opt_val_synonym bch2_opt_val_synonyms[] = { - { "degraded", "true", "yes" }, - { "degraded", "false", "no" }, - { "degraded", "1", "yes" }, - { "degraded", "0", "no" }, -}; - -static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val) -{ - const struct opt_val_synonym *i; - - for (i = bch2_opt_val_synonyms; - i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms); - i++) - if (!strcmp(opt, i->opt) && !strcmp(val, i->v1)) - return i->v2; - - return val; -} - -int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) -{ - if (v < opt->min) { - if (err) - prt_printf(err, "%s: too small (min %llu)", - opt->attr.name, opt->min); - return -BCH_ERR_ERANGE_option_too_small; - } - - if (opt->max && v >= opt->max) { - if (err) - prt_printf(err, "%s: too big (max %llu)", - opt->attr.name, opt->max); - return -BCH_ERR_ERANGE_option_too_big; - } - - if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { - if (err) - prt_printf(err, "%s: not a multiple of 512", - opt->attr.name); - return -BCH_ERR_opt_parse_error; - } - - if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { - if (err) - prt_printf(err, "%s: must be a power of two", - opt->attr.name); - return -BCH_ERR_opt_parse_error; - } - - if (opt->fn.validate) - return opt->fn.validate(v, err); - - return 0; -} - -int bch2_opt_parse(struct bch_fs *c, - const struct bch_option *opt, - const char *val, u64 *res, - struct printbuf *err) -{ - ssize_t ret; - - if (err) - printbuf_indent_add_nextline(err, 2); - - switch (opt->type) { - case BCH_OPT_BOOL: - if (!val) - val = "1"; - - ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); - if (ret != -BCH_ERR_option_not_bool) { - *res = ret; - } else { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; - } - break; - case BCH_OPT_UINT: - if (!val) { - prt_printf(err, "%s: required value", - opt->attr.name); - return -EINVAL; - } - - if (*val != '-') { - ret = opt->flags & OPT_HUMAN_READABLE - ? 
bch2_strtou64_h(val, res) - : kstrtou64(val, 10, res); - } else { - prt_printf(err, "%s: must be a non-negative number", opt->attr.name); - return -BCH_ERR_option_negative; - } - - if (ret < 0) { - if (err) - prt_printf(err, "%s: must be a number", - opt->attr.name); - return ret; - } - break; - case BCH_OPT_STR: - if (!val) { - prt_printf(err, "%s: required value", - opt->attr.name); - return -EINVAL; - } - - ret = match_string(opt->choices, -1, val); - if (ret < 0) { - if (err) - prt_printf(err, "%s: invalid selection", - opt->attr.name); - return ret; - } - - *res = ret; - break; - case BCH_OPT_BITFIELD: { - s64 v = bch2_read_flag_list(val, opt->choices); - if (v < 0) - return v; - *res = v; - break; - } - case BCH_OPT_FN: - ret = opt->fn.parse(c, val, res, err); - - if (ret == -BCH_ERR_option_needs_open_fs) - return ret; - - if (ret < 0) { - if (err) - prt_printf(err, "%s: parse error", - opt->attr.name); - return ret; - } - } - - return bch2_opt_validate(opt, *res, err); -} - -void bch2_opt_to_text(struct printbuf *out, - struct bch_fs *c, struct bch_sb *sb, - const struct bch_option *opt, u64 v, - unsigned flags) -{ - if (flags & OPT_SHOW_MOUNT_STYLE) { - if (opt->type == BCH_OPT_BOOL) { - prt_printf(out, "%s%s", - v ? "" : "no", - opt->attr.name); - return; - } - - prt_printf(out, "%s=", opt->attr.name); - } - - switch (opt->type) { - case BCH_OPT_BOOL: - case BCH_OPT_UINT: - if (opt->flags & OPT_HUMAN_READABLE) - prt_human_readable_u64(out, v); - else - prt_printf(out, "%lli", v); - break; - case BCH_OPT_STR: - if (v < opt->min || v >= opt->max) - prt_printf(out, "(invalid option %lli)", v); - else if (flags & OPT_SHOW_FULL_LIST) - prt_string_option(out, opt->choices, v); - else - prt_str(out, opt->choices[v]); - break; - case BCH_OPT_BITFIELD: - prt_bitflags(out, opt->choices, v); - break; - case BCH_OPT_FN: - opt->fn.to_text(out, c, sb, v); - break; - default: - BUG(); - } -} - -void bch2_opts_to_text(struct printbuf *out, - struct bch_opts opts, - struct bch_fs *c, struct bch_sb *sb, - unsigned show_mask, unsigned hide_mask, - unsigned flags) -{ - bool first = true; - - for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - - if ((opt->flags & hide_mask) || !(opt->flags & show_mask)) - continue; - - u64 v = bch2_opt_get_by_id(&opts, i); - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; - - if (!first) - prt_char(out, ','); - first = false; - - bch2_opt_to_text(out, c, sb, opt, v, flags); - } -} - -int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) -{ - int ret = 0; - - switch (id) { - case Opt_state: - if (ca) - return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED); - break; - - case Opt_compression: - case Opt_background_compression: - ret = bch2_check_set_has_compressed_data(c, v); - break; - case Opt_erasure_code: - if (v) - bch2_check_set_feature(c, BCH_FEATURE_ec); - break; - default: - break; - } - - return ret; -} - -int bch2_opts_hooks_pre_set(struct bch_fs *c) -{ - for (unsigned i = 0; i < bch2_opts_nr; i++) { - int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); - if (ret) - return ret; - } - - return 0; -} - -void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, - struct bch_opts *new_opts, enum bch_opt_id id) -{ - switch (id) { - case Opt_foreground_target: - if (new_opts->foreground_target && - !new_opts->background_target) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_compression: - if 
(new_opts->compression && - !new_opts->background_compression) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_background_target: - if (new_opts->background_target) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_background_compression: - if (new_opts->background_compression) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_rebalance_enabled: - bch2_rebalance_wakeup(c); - break; - case Opt_copygc_enabled: - bch2_copygc_wakeup(c); - break; - case Opt_discard: - if (!ca) { - mutex_lock(&c->sb_lock); - for_each_member_device(c, ca) { - struct bch_member *m = - bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_DISCARD(m, c->opts.discard); - } - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - break; - case Opt_version_upgrade: - /* - * XXX: in the future we'll likely want to do compatible - * upgrades at runtime as well, but right now there's nothing - * that does that: - */ - if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) - bch2_sb_upgrade_incompat(c); - break; - default: - break; - } -} - -int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, - struct printbuf *parse_later, - const char *name, const char *val) -{ - struct printbuf err = PRINTBUF; - u64 v; - int ret, id; - - id = bch2_mount_opt_lookup(name); - - /* Check for the form "noopt", negation of a boolean opt: */ - if (id < 0 && - !val && - !strncmp("no", name, 2)) { - id = bch2_mount_opt_lookup(name + 2); - val = "0"; - } - - /* Unknown options are ignored: */ - if (id < 0) - return 0; - - /* must have a value for synonym lookup - but OPT_FN is weird */ - if (!val && bch2_opt_table[id].type != BCH_OPT_FN) - val = "1"; - - val = bch2_opt_val_synonym_lookup(name, val); - - if (!(bch2_opt_table[id].flags & OPT_MOUNT)) - goto bad_opt; - - if (id == Opt_acl && - !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) - goto bad_opt; - - if ((id == Opt_usrquota || - id == Opt_grpquota) && - !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) - goto bad_opt; - - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); - if (ret == -BCH_ERR_option_needs_open_fs) { - ret = 0; - - if (parse_later) { - prt_printf(parse_later, "%s=%s,", name, val); - if (parse_later->allocation_failure) - ret = -ENOMEM; - } - - goto out; - } - - if (ret < 0) - goto bad_val; - - if (opts) - bch2_opt_set_by_id(opts, id, v); - - ret = 0; -out: - printbuf_exit(&err); - return ret; -bad_opt: - ret = -BCH_ERR_option_name; - goto out; -bad_val: - ret = -BCH_ERR_option_value; - goto out; -} - -int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, - struct printbuf *parse_later, char *options, - bool ignore_unknown) -{ - char *copied_opts, *copied_opts_start; - char *opt, *name, *val; - int ret = 0; - - if (!options) - return 0; - - /* - * sys_fsconfig() is now occasionally providing us with option lists - * starting with a comma - weird. 
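The "no" prefix handling in bch2_parse_one_mount_opt() above is easy to miss: a failed lookup of a bare "noX" is retried as "X=0". A minimal userspace sketch of the same convention - the option table here is hypothetical, standing in for the generated bch2_opt_table:

#include <stdio.h>
#include <string.h>

/* Hypothetical option names, standing in for the generated bch2_opt_table. */
static const char * const opt_names[] = { "acl", "usrquota", "verbose", NULL };

static int opt_lookup(const char *name)
{
	for (int i = 0; opt_names[i]; i++)
		if (!strcmp(name, opt_names[i]))
			return i;
	return -1;
}

/* Same convention as bch2_parse_one_mount_opt(): bare "noacl" becomes acl=0. */
static void parse_one(const char *name, const char *val)
{
	int id = opt_lookup(name);

	if (id < 0 && !val && !strncmp(name, "no", 2)) {
		id = opt_lookup(name + 2);
		val = "0";
	}

	if (id < 0)
		return;			/* unknown options are ignored */

	if (!val)
		val = "1";		/* a bare boolean option means "enable" */

	printf("%s=%s\n", opt_names[id], val);
}

int main(void)
{
	parse_one("noacl", NULL);	/* acl=0 */
	parse_one("verbose", NULL);	/* verbose=1 */
	parse_one("bogus", NULL);	/* silently ignored */
	return 0;
}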
- */ - if (*options == ',') - options++; - - copied_opts = kstrdup(options, GFP_KERNEL); - if (!copied_opts) - return -ENOMEM; - copied_opts_start = copied_opts; - - while ((opt = strsep(&copied_opts, ",")) != NULL) { - if (!*opt) - continue; - - name = strsep(&opt, "="); - val = opt; - - ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); - if (ret == -BCH_ERR_option_name && ignore_unknown) - ret = 0; - if (ret) { - pr_err("Error parsing option %s: %s", name, bch2_err_str(ret)); - break; - } - } - - kfree(copied_opts_start); - return ret; -} - -u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx) -{ - const struct bch_option *opt = bch2_opt_table + id; - u64 v; - - if (dev_idx < 0) { - v = opt->get_sb(sb); - } else { - if (WARN(!bch2_member_exists(sb, dev_idx), - "tried to set device option %s on nonexistent device %i", - opt->attr.name, dev_idx)) - return 0; - - struct bch_member m = bch2_sb_member_get(sb, dev_idx); - v = opt->get_member(&m); - } - - if (opt->flags & OPT_SB_FIELD_ONE_BIAS) - --v; - - if (opt->flags & OPT_SB_FIELD_ILOG2) - v = 1ULL << v; - - if (opt->flags & OPT_SB_FIELD_SECTORS) - v <<= 9; - - return v; -} - -/* - * Initial options from superblock - here we don't want any options undefined, - * any options the superblock doesn't specify are set to 0: - */ -int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) -{ - for (unsigned id = 0; id < bch2_opts_nr; id++) { - const struct bch_option *opt = bch2_opt_table + id; - - if (opt->get_sb) - bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1)); - } - - return 0; -} - -bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, - const struct bch_option *opt, u64 v) -{ - bool changed = false; - - if (opt->flags & OPT_SB_FIELD_SECTORS) - v >>= 9; - - if (opt->flags & OPT_SB_FIELD_ILOG2) - v = ilog2(v); - - if (opt->flags & OPT_SB_FIELD_ONE_BIAS) - v++; - - if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) { - changed = v != opt->get_sb(sb); - - opt->set_sb(sb, v); - } - - if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { - if (WARN(!bch2_member_exists(sb, dev_idx), - "tried to set device option %s on nonexistent device %i", - opt->attr.name, dev_idx)) - return false; - - struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); - changed = v != opt->get_member(m); - opt->set_member(m, v); - } - - return changed; -} - -bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, - const struct bch_option *opt, u64 v) -{ - mutex_lock(&c->sb_lock); - bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? 
ca->dev_idx : -1, opt, v); - if (changed) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - return changed; -} - -/* io opts: */ - -struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) -{ - struct bch_io_opts opts = { -#define x(_name, _bits) ._name = src._name, - BCH_INODE_OPTS() -#undef x - }; - - bch2_io_opts_fixups(&opts); - return opts; -} - -bool bch2_opt_is_inode_opt(enum bch_opt_id id) -{ - static const enum bch_opt_id inode_opt_list[] = { -#define x(_name, _bits) Opt_##_name, - BCH_INODE_OPTS() -#undef x - }; - unsigned i; - - for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) - if (inode_opt_list[i] == id) - return true; - - return false; -} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h deleted file mode 100644 index 63f8e254495cbd..00000000000000 --- a/fs/bcachefs/opts.h +++ /dev/null @@ -1,693 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_OPTS_H -#define _BCACHEFS_OPTS_H - -#include -#include -#include -#include -#include "bcachefs_format.h" - -struct bch_fs; - -extern const char * const bch2_error_actions[]; -extern const char * const bch2_degraded_actions[]; -extern const char * const bch2_fsck_fix_opts[]; -extern const char * const bch2_version_upgrade_opts[]; -extern const char * const bch2_sb_features[]; -extern const char * const bch2_sb_compat[]; -extern const char * const __bch2_btree_ids[]; -extern const char * const __bch2_csum_types[]; -extern const char * const __bch2_csum_opts[]; -extern const char * const __bch2_compression_types[]; -extern const char * const bch2_compression_opts[]; -extern const char * const __bch2_str_hash_types[]; -extern const char * const bch2_str_hash_opts[]; -extern const char * const __bch2_data_types[]; -extern const char * const bch2_member_states[]; -extern const char * const bch2_d_types[]; - -void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); -void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); -void bch2_prt_data_type(struct printbuf *, enum bch_data_type); -void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); -void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); -void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); -void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); - -static inline const char *bch2_d_type_str(unsigned d_type) -{ - return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; -} - -/* - * Mount options; we also store defaults in the superblock. - * - * Also exposed via sysfs: if an option is writeable, and it's also stored in - * the superblock, changing it via sysfs (currently? might change this) also - * updates the superblock. - * - * We store options as signed integers, where -1 means undefined. This means we - * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only - * apply the options from that struct that are defined. 
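In the struct this header declares, "undefined" is tracked as a separate one-bit flag per option rather than a -1 sentinel. A compressed sketch of that pattern, with two hypothetical options standing in for the fields generated by BCH_OPTS():

#include <stdbool.h>
#include <stdio.h>

/* Two-option stand-in for struct bch_opts. */
struct opts {
	unsigned	verbose_defined:1;
	unsigned	degraded_defined:1;
	unsigned char	verbose;
	unsigned char	degraded;
};

#define opt_defined(_opts, _name)	((_opts)._name##_defined)

#define opt_set(_opts, _name, _v)				\
do {								\
	(_opts)._name##_defined = true;				\
	(_opts)._name = (_v);					\
} while (0)

/* The shape of bch2_opts_apply(): only defined options overwrite dst. */
static void opts_apply(struct opts *dst, struct opts src)
{
	if (opt_defined(src, verbose))
		opt_set(*dst, verbose, src.verbose);
	if (opt_defined(src, degraded))
		opt_set(*dst, degraded, src.degraded);
}

int main(void)
{
	struct opts fs = {0}, mount = {0};

	opt_set(fs, verbose, 1);
	opt_set(mount, degraded, 1);	/* says nothing about verbose */

	opts_apply(&fs, mount);
	printf("verbose=%d degraded=%d\n", fs.verbose, fs.degraded);	/* 1 1 */
	return 0;
}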
- */ - -/* When can be set: */ -enum opt_flags { - OPT_FS = BIT(0), /* Filesystem option */ - OPT_DEVICE = BIT(1), /* Device option */ - OPT_INODE = BIT(2), /* Inode option */ - OPT_FORMAT = BIT(3), /* May be specified at format time */ - OPT_MOUNT = BIT(4), /* May be specified at mount time */ - OPT_RUNTIME = BIT(5), /* May be specified at runtime */ - OPT_HUMAN_READABLE = BIT(6), - OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */ - OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */ - OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */ - OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */ - OPT_HIDDEN = BIT(11), -}; - -enum opt_type { - BCH_OPT_BOOL, - BCH_OPT_UINT, - BCH_OPT_STR, - BCH_OPT_BITFIELD, - BCH_OPT_FN, -}; - -struct bch_opt_fn { - int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); - void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); - int (*validate)(u64, struct printbuf *); -}; - -/** - * x(name, shortopt, type, in mem type, mode, sb_opt) - * - * @name - name of mount option, sysfs attribute, and struct bch_opts - * member - * - * @mode - when opt may be set - * - * @sb_option - name of corresponding superblock option - * - * @type - one of OPT_BOOL, OPT_UINT, OPT_STR - */ - -/* - * XXX: add fields for - * - default value - * - helptext - */ - -#ifdef __KERNEL__ -#define RATELIMIT_ERRORS_DEFAULT true -#else -#define RATELIMIT_ERRORS_DEFAULT false -#endif - -#ifdef CONFIG_BCACHEFS_DEBUG -#define BCACHEFS_VERBOSE_DEFAULT true -#else -#define BCACHEFS_VERBOSE_DEFAULT false -#endif - -#define BCH_FIX_ERRORS_OPTS() \ - x(exit, 0) \ - x(yes, 1) \ - x(no, 2) \ - x(ask, 3) - -enum fsck_err_opts { -#define x(t, n) FSCK_FIX_##t, - BCH_FIX_ERRORS_OPTS() -#undef x -}; - -#define BCH_OPTS() \ - x(block_size, u16, \ - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(512, 1U << 16), \ - BCH_SB_BLOCK_SIZE, 4 << 10, \ - "size", NULL) \ - x(btree_node_size, u32, \ - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(512, 1U << 20), \ - BCH_SB_BTREE_NODE_SIZE, 256 << 10, \ - "size", "Btree node size, default 256k") \ - x(errors, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ - NULL, "Action to take on filesystem error") \ - x(write_error_timeout, u16, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, 300), \ - BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ - NULL, "Number of consecutive write errors allowed before kicking out a device")\ - x(metadata_replicas, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_META_REPLICAS_WANT, 1, \ - "#", "Number of metadata replicas") \ - x(data_replicas, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_DATA_REPLICAS_WANT, 1, \ - "#", "Number of data replicas") \ - x(metadata_replicas_required, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_META_REPLICAS_REQ, 1, \ - "#", NULL) \ - x(data_replicas_required, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_DATA_REPLICAS_REQ, 1, \ - "#", NULL) \ - x(encoded_extent_max, u32, \ - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ - OPT_UINT(4096, 2U << 20), \ - BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ - "size", 
"Maximum size of checksummed/compressed extents")\ - x(metadata_checksum, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(__bch2_csum_opts), \ - BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ - NULL, NULL) \ - x(data_checksum, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(__bch2_csum_opts), \ - BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ - NULL, NULL) \ - x(checksum_err_retry_nr, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, 32), \ - BCH_SB_CSUM_ERR_RETRY_NR, 3, \ - NULL, NULL) \ - x(compression, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_compression), \ - BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ - NULL, NULL) \ - x(background_compression, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_compression), \ - BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ - NULL, NULL) \ - x(str_hash, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_str_hash_opts), \ - BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ - NULL, "Hash function for directory entries and xattrs")\ - x(metadata_target, u16, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_METADATA_TARGET, 0, \ - "(target)", "Device or label for metadata writes") \ - x(foreground_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_FOREGROUND_TARGET, 0, \ - "(target)", "Device or label for foreground writes") \ - x(background_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_BACKGROUND_TARGET, 0, \ - "(target)", "Device or label to move data to in the background")\ - x(promote_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_PROMOTE_TARGET, 0, \ - "(target)", "Device or label to promote data to on read") \ - x(erasure_code, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_ERASURE_CODE, false, \ - NULL, "Enable erasure coding (DO NOT USE YET)") \ - x(casefold, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT, \ - OPT_BOOL(), \ - BCH_SB_CASEFOLD, false, \ - NULL, "Dirent lookups are casefolded") \ - x(casefold_disabled, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Disable casefolding filesystem wide") \ - x(inodes_32bit, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_INODE_32BIT, true, \ - NULL, "Constrain inode numbers to 32 bits") \ - x(shard_inode_numbers_bits, u8, \ - OPT_FS|OPT_FORMAT, \ - OPT_UINT(0, 8), \ - BCH_SB_SHARD_INUMS_NBITS, 0, \ - NULL, "Shard new inode numbers by CPU id") \ - x(inodes_use_key_cache, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_INODES_USE_KEY_CACHE, true, \ - NULL, "Use the btree key cache for the inodes btree") \ - x(btree_node_mem_ptr_optimization, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Stash pointer to in memory btree node in btree ptr")\ - x(gc_reserve_percent, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(5, 21), \ - BCH_SB_GC_RESERVE, 8, \ - "%", "Percentage of disk space to reserve for copygc")\ - x(gc_reserve_bytes, u64, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ - OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(0, U64_MAX), \ - BCH_SB_GC_RESERVE_BYTES, 0, \ - "%", "Amount of disk space to reserve for copygc\n" \ - "Takes precedence over 
gc_reserve_percent if set")\ - x(root_reserve_percent, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_UINT(0, 100), \ - BCH_SB_ROOT_RESERVE, 0, \ - "%", "Percentage of disk space to reserve for superuser")\ - x(wide_macs, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_128_BIT_MACS, false, \ - NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ - x(inline_data, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable inline data extents") \ - x(promote_whole_extents, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \ - NULL, "Promote whole extents, instead of just part being read")\ - x(acl, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_POSIX_ACL, true, \ - NULL, "Enable POSIX acls") \ - x(usrquota, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_USRQUOTA, false, \ - NULL, "Enable user quotas") \ - x(grpquota, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_GRPQUOTA, false, \ - NULL, "Enable group quotas") \ - x(prjquota, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_PRJQUOTA, false, \ - NULL, "Enable project quotas") \ - x(degraded, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_STR(bch2_degraded_actions), \ - BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \ - NULL, "Allow mounting in degraded mode") \ - x(no_splitbrain_check, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don't kick drives out when splitbrain detected")\ - x(verbose, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \ - NULL, "Extra debugging information during mount/recovery")\ - x(journal_flush_delay, u32, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, U32_MAX), \ - BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ - NULL, "Delay in milliseconds before automatic journal commits")\ - x(journal_flush_disabled, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ - NULL, "Disable journal flush on sync/fsync\n" \ - "If enabled, writes can be lost, but only since the\n"\ - "last journal write (default 1 second)") \ - x(journal_reclaim_delay, u32, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, U32_MAX), \ - BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ - NULL, "Delay in milliseconds before automatic journal reclaim")\ - x(move_bytes_in_flight, u32, \ - OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1024, U32_MAX), \ - BCH2_NO_SB_OPT, 1U << 20, \ - NULL, "Maximum Amount of IO to keep in flight by the move path")\ - x(move_ios_in_flight, u32, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, 1024), \ - BCH2_NO_SB_OPT, 32, \ - NULL, "Maximum number of IOs to keep in flight by the move path")\ - x(fsck, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Run fsck on mount") \ - x(fsck_memory_usage_percent, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(20, 70), \ - BCH2_NO_SB_OPT, 50, \ - NULL, "Maximum percentage of system ram fsck is allowed to pin")\ - x(fix_errors, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_FN(bch2_opt_fix_errors), \ - BCH2_NO_SB_OPT, FSCK_FIX_exit, \ - NULL, "Fix errors during fsck without asking") \ - x(ratelimit_errors, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ - NULL, "Ratelimit error messages during fsck") \ - x(nochanges, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Super read only mode - no writes at all will be 
issued,\n"\ - "even if we have to replay the journal") \ - x(norecovery, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Exit recovery immediately prior to journal replay")\ - x(journal_rewind, u64, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(0, U64_MAX), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Rewind journal") \ - x(recovery_passes, u64, \ - OPT_FS|OPT_MOUNT, \ - OPT_BITFIELD(bch2_recovery_passes), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Recovery passes to run explicitly") \ - x(recovery_passes_exclude, u64, \ - OPT_FS|OPT_MOUNT, \ - OPT_BITFIELD(bch2_recovery_passes), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Recovery passes to exclude") \ - x(recovery_pass_last, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_STR_NOLIMIT(bch2_recovery_passes), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Exit recovery after specified pass") \ - x(retain_recovery_info, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\ - x(read_entire_journal, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Read all journal entries, not just dirty ones")\ - x(read_journal_only, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Only read the journal, skip the rest of recovery")\ - x(journal_transaction_names, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ - NULL, "Log transaction function names in journal") \ - x(allocator_stuck_timeout, u16, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, U16_MAX), \ - BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \ - NULL, "Default timeout in seconds for stuck allocator messages")\ - x(noexcl, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don't open device in exclusive mode") \ - x(direct_io, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Use O_DIRECT (userspace only)") \ - x(sb, u64, \ - OPT_MOUNT, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ - "offset", "Sector offset of superblock") \ - x(read_only, u8, \ - OPT_FS|OPT_MOUNT|OPT_HIDDEN, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, NULL) \ - x(nostart, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don\'t start filesystem, only open devices") \ - x(reconstruct_alloc, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Reconstruct alloc btree") \ - x(version_upgrade, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_version_upgrade_opts), \ - BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ - NULL, "Set superblock to latest version,\n" \ - "allowing any new features to be used") \ - x(stdio, u64, \ - 0, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Pointer to a struct stdio_redirect") \ - x(project, u8, \ - OPT_INODE, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, NULL) \ - x(nocow, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ - OPT_BOOL(), \ - BCH_SB_NOCOW, false, \ - NULL, "Nocow mode: Writes will be done in place when possible.\n"\ - "Snapshots and reflink will still caused writes to be COW\n"\ - "Implicitly disables data checksumming, compression and encryption")\ - x(nocow_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable nocow mode: enables runtime locking in\n"\ - "data move path needed if nocow will ever be in use\n")\ - x(copygc_enabled, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable 
copygc: disable for debugging, or to\n"\ - "quiet the system when doing performance testing\n")\ - x(rebalance_enabled, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable rebalance: disable for debugging, or to\n"\ - "quiet the system when doing performance testing\n")\ - x(rebalance_on_ac_only, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_REBALANCE_AC_ONLY, false, \ - NULL, "Enable rebalance while on mains power only\n") \ - x(auto_snapshot_deletion, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\ - "quiet the system when doing performance testing\n")\ - x(no_data_io, u8, \ - OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Skip submit_bio() for data reads and writes, " \ - "for performance testing purposes") \ - x(state, u64, \ - OPT_DEVICE|OPT_RUNTIME, \ - OPT_STR(bch2_member_states), \ - BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ - "state", "rw,ro,failed,spare") \ - x(bucket_size, u32, \ - OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(0, S64_MAX), \ - BCH_MEMBER_BUCKET_SIZE, 0, \ - "size", "Specifies the bucket size; must be greater than the btree node size")\ - x(durability, u8, \ - OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ - OPT_UINT(0, BCH_REPLICAS_MAX), \ - BCH_MEMBER_DURABILITY, 1, \ - "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") \ - x(data_allowed, u8, \ - OPT_DEVICE, \ - OPT_BITFIELD(__bch2_data_types), \ - BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ - "types", "Allowed data types for this device: journal, btree, and/or user")\ - x(discard, u8, \ - OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_MEMBER_DISCARD, true, \ - NULL, "Enable discard/TRIM support") \ - x(btree_node_prefetch, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ - " prefetched sequentially") - -struct bch_opts { -#define x(_name, _bits, ...) unsigned _name##_defined:1; - BCH_OPTS() -#undef x - -#define x(_name, _bits, ...) _bits _name; - BCH_OPTS() -#undef x -}; - -struct bch2_opts_parse { - struct bch_opts opts; - - /* to save opts that can't be parsed before the FS is opened: */ - struct printbuf parse_later; -}; - -static const __maybe_unused struct bch_opts bch2_opts_default = { -#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ - ._name##_defined = true, \ - ._name = _default, \ - - BCH_OPTS() -#undef x -}; - -#define opt_defined(_opts, _name) ((_opts)._name##_defined) - -#define opt_get(_opts, _name) \ - (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) - -#define opt_set(_opts, _name, _v) \ -do { \ - (_opts)._name##_defined = true; \ - (_opts)._name = _v; \ -} while (0) - -static inline struct bch_opts bch2_opts_empty(void) -{ - return (struct bch_opts) { 0 }; -} - -void bch2_opts_apply(struct bch_opts *, struct bch_opts); - -enum bch_opt_id { -#define x(_name, ...) 
Opt_##_name, - BCH_OPTS() -#undef x - bch2_opts_nr -}; - -struct bch_fs; -struct printbuf; - -struct bch_option { - struct attribute attr; - enum opt_type type; - enum opt_flags flags; - u64 min, max; - - const char * const *choices; - - struct bch_opt_fn fn; - - const char *hint; - const char *help; - - u64 (*get_sb)(const struct bch_sb *); - void (*set_sb)(struct bch_sb *, u64); - - u64 (*get_member)(const struct bch_member *); - void (*set_member)(struct bch_member *, u64); - -}; - -extern const struct bch_option bch2_opt_table[]; - -bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); -u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); -void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); - -u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); -int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); - -struct bch_dev; -bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); - -int bch2_opt_lookup(const char *); -int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); -int bch2_opt_parse(struct bch_fs *, const struct bch_option *, - const char *, u64 *, struct printbuf *); - -#define OPT_SHOW_FULL_LIST (1 << 0) -#define OPT_SHOW_MOUNT_STYLE (1 << 1) - -void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, - const struct bch_option *, u64, unsigned); -void bch2_opts_to_text(struct printbuf *, - struct bch_opts, - struct bch_fs *, struct bch_sb *, - unsigned, unsigned, unsigned); - -int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); -int bch2_opts_hooks_pre_set(struct bch_fs *); -void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, - struct bch_opts *, enum bch_opt_id); - -int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, - struct printbuf *, const char *, const char *); -int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, - char *, bool); - -/* inode opts: */ - -struct bch_io_opts { -#define x(_name, _bits) u##_bits _name; - BCH_INODE_OPTS() -#undef x -#define x(_name, _bits) u64 _name##_from_inode:1; - BCH_INODE_OPTS() -#undef x -}; - -static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) -{ - if (!opts->background_target) - opts->background_target = opts->foreground_target; - if (!opts->background_compression) - opts->background_compression = opts->compression; - if (opts->nocow) { - opts->compression = opts->background_compression = 0; - opts->data_checksum = 0; - opts->erasure_code = 0; - } -} - -struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -bool bch2_opt_is_inode_opt(enum bch_opt_id); - -#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c deleted file mode 100644 index 3302bbc78a09ba..00000000000000 --- a/fs/bcachefs/printbuf.c +++ /dev/null @@ -1,528 +0,0 @@ -// SPDX-License-Identifier: LGPL-2.1+ -/* Copyright (C) 2022 Kent Overstreet */ - -#include -#include -#include -#include -#include -#include - -#include "printbuf.h" - -static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos) -{ - return pos - buf->last_newline; -} - -static inline unsigned printbuf_linelen(struct printbuf *buf) -{ - return __printbuf_linelen(buf, buf->pos); -} - -/* - * Returns spaces from start of line, if set, or 0 if unset: - */ -static inline unsigned cur_tabstop(struct printbuf *buf) -{ - return buf->cur_tabstop < 
buf->nr_tabstops - ? buf->_tabstops[buf->cur_tabstop] - : 0; -} - -int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) -{ - /* Reserved space for terminating nul: */ - extra += 1; - - if (out->pos + extra <= out->size) - return 0; - - if (!out->heap_allocated) { - out->overflow = true; - return 0; - } - - unsigned new_size = roundup_pow_of_two(out->size + extra); - - /* Sanity check... */ - if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) { - out->allocation_failure = true; - out->overflow = true; - return -ENOMEM; - } - - /* - * Note: output buffer must be freeable with kfree(), it's not required - * that the user use printbuf_exit(). - */ - char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); - - if (!buf) { - out->allocation_failure = true; - out->overflow = true; - return -ENOMEM; - } - - out->buf = buf; - out->size = new_size; - return 0; -} - -static void printbuf_advance_pos(struct printbuf *out, unsigned len) -{ - out->pos += min(len, printbuf_remaining(out)); -} - -static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr) -{ - unsigned move = out->pos - pos; - - bch2_printbuf_make_room(out, nr); - - if (pos + nr < out->size) - memmove(out->buf + pos + nr, - out->buf + pos, - min(move, out->size - 1 - pos - nr)); - - if (pos < out->size) - memset(out->buf + pos, ' ', min(nr, out->size - pos)); - - printbuf_advance_pos(out, nr); - printbuf_nul_terminate_reserved(out); -} - -static void __printbuf_do_indent(struct printbuf *out, unsigned pos) -{ - while (true) { - int pad; - unsigned len = out->pos - pos; - char *p = out->buf + pos; - char *n = memscan(p, '\n', len); - if (cur_tabstop(out)) { - n = min(n, (char *) memscan(p, '\r', len)); - n = min(n, (char *) memscan(p, '\t', len)); - } - - pos = n - out->buf; - if (pos == out->pos) - break; - - switch (*n) { - case '\n': - pos++; - out->last_newline = pos; - - printbuf_insert_spaces(out, pos, out->indent); - - pos = min(pos + out->indent, out->pos); - out->last_field = pos; - out->cur_tabstop = 0; - break; - case '\r': - memmove(n, n + 1, out->pos - pos); - --out->pos; - pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos); - if (pad > 0) { - printbuf_insert_spaces(out, out->last_field, pad); - pos += pad; - } - - out->last_field = pos; - out->cur_tabstop++; - break; - case '\t': - pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1; - if (pad > 0) { - *n = ' '; - printbuf_insert_spaces(out, pos, pad - 1); - pos += pad; - } else { - memmove(n, n + 1, out->pos - pos); - --out->pos; - } - - out->last_field = pos; - out->cur_tabstop++; - break; - } - } -} - -static inline void printbuf_do_indent(struct printbuf *out, unsigned pos) -{ - if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling) - __printbuf_do_indent(out, pos); -} - -void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) -{ - int len; - - do { - va_list args2; - - va_copy(args2, args); - len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2); - va_end(args2); - } while (len > printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len)); - - unsigned indent_pos = out->pos; - printbuf_advance_pos(out, len); - printbuf_do_indent(out, indent_pos); -} - -void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) 
-{ - va_list args; - int len; - - do { - va_start(args, fmt); - len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args); - va_end(args); - } while (len > printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len)); - - unsigned indent_pos = out->pos; - printbuf_advance_pos(out, len); - printbuf_do_indent(out, indent_pos); -} - -/** - * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be - * null terminated - * @buf: printbuf to terminate - * Returns: Printbuf contents, as a nul terminated C string - */ -const char *bch2_printbuf_str(const struct printbuf *buf) -{ - /* - * If we've written to a printbuf then it's guaranteed to be a null - * terminated string - but if we haven't, then we might not have - * allocated a buffer at all: - */ - return buf->pos - ? buf->buf - : ""; -} - -/** - * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it - * against accidental use. - * @buf: printbuf to exit - */ -void bch2_printbuf_exit(struct printbuf *buf) -{ - if (buf->heap_allocated) { - kfree(buf->buf); - buf->buf = ERR_PTR(-EINTR); /* poison value */ - } -} - -void bch2_printbuf_tabstops_reset(struct printbuf *buf) -{ - buf->nr_tabstops = 0; -} - -void bch2_printbuf_tabstop_pop(struct printbuf *buf) -{ - if (buf->nr_tabstops) - --buf->nr_tabstops; -} - -/* - * bch2_printbuf_tabstop_push() - add a tabstop, n spaces from the previous tabstop - * - * @buf: printbuf to control - * @spaces: number of spaces from previous tabstop - * - * In the future this function may allocate memory if setting more than - * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start - * of line. - */ -int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) -{ - unsigned prev_tabstop = buf->nr_tabstops - ? buf->_tabstops[buf->nr_tabstops - 1] - : 0; - - if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops))) - return -EINVAL; - - buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces; - buf->has_indent_or_tabstops = true; - return 0; -} - -/** - * bch2_printbuf_indent_add() - add to the current indent level - * - * @buf: printbuf to control - * @spaces: number of spaces to add to the current indent level - * - * Subsequent lines, and the current line if the output position is at the start - * of the current line, will be indented by @spaces more spaces. - */ -void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) -{ - if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) - spaces = 0; - - buf->indent += spaces; - prt_chars(buf, ' ', spaces); - - buf->has_indent_or_tabstops = true; -} - -/** - * bch2_printbuf_indent_add_nextline() - add to the current indent level for - * subsequent lines - * - * @buf: printbuf to control - * @spaces: number of spaces to add to the current indent level - * - * Subsequent lines - not the current line - will be indented by @spaces more - * spaces. - */ -void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces) -{ - if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) - spaces = 0; - - buf->indent += spaces; - buf->has_indent_or_tabstops = true; -} - -/** - * bch2_printbuf_indent_sub() - subtract from the current indent level - * - * @buf: printbuf to control - * @spaces: number of spaces to subtract from the current indent level - * - * Subsequent lines, and the current line if the output position is at the start - * of the current line, will be indented by @spaces fewer spaces. 
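Taken together, the indent helpers act as a per-printbuf left margin. A short usage sketch (kernel context; names as declared in printbuf.h, output values illustrative):

	struct printbuf buf = PRINTBUF;

	bch2_prt_printf(&buf, "superblock:");
	bch2_printbuf_indent_add_nextline(&buf, 2);	/* indent lines after this one */
	bch2_prt_newline(&buf);
	bch2_prt_printf(&buf, "version: %u", 42);
	bch2_prt_newline(&buf);
	bch2_prt_printf(&buf, "devices: %u", 1);
	bch2_printbuf_indent_sub(&buf, 2);

	/*
	 * buf.buf now contains:
	 *
	 *	superblock:
	 *	  version: 42
	 *	  devices: 1
	 */
	bch2_printbuf_exit(&buf);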
- */ -void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) -{ - if (WARN_ON_ONCE(spaces > buf->indent)) - spaces = buf->indent; - - if (buf->last_newline + buf->indent == buf->pos) { - buf->pos -= spaces; - printbuf_nul_terminate(buf); - } - buf->indent -= spaces; - - if (!buf->indent && !buf->nr_tabstops) - buf->has_indent_or_tabstops = false; -} - -void bch2_prt_newline(struct printbuf *buf) -{ - bch2_printbuf_make_room(buf, 1 + buf->indent); - - __prt_char_reserved(buf, '\n'); - - buf->last_newline = buf->pos; - - __prt_chars_reserved(buf, ' ', buf->indent); - - printbuf_nul_terminate_reserved(buf); - - buf->last_field = buf->pos; - buf->cur_tabstop = 0; -} - -void bch2_printbuf_strip_trailing_newline(struct printbuf *out) -{ - for (int p = out->pos - 1; p >= 0; --p) { - if (out->buf[p] == '\n') { - out->pos = p; - break; - } - if (out->buf[p] != ' ') - break; - } - - printbuf_nul_terminate_reserved(out); -} - -static void __prt_tab(struct printbuf *out) -{ - int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); - - prt_chars(out, ' ', spaces); - - out->last_field = out->pos; - out->cur_tabstop++; -} - -/** - * bch2_prt_tab() - Advance printbuf to the next tabstop - * @out: printbuf to control - * - * Advance output to the next tabstop by printing spaces. - */ -void bch2_prt_tab(struct printbuf *out) -{ - if (WARN_ON(!cur_tabstop(out))) - return; - - __prt_tab(out); -} - -static void __prt_tab_rjust(struct printbuf *buf) -{ - int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); - if (pad > 0) - printbuf_insert_spaces(buf, buf->last_field, pad); - - buf->last_field = buf->pos; - buf->cur_tabstop++; -} - -/** - * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying - * previous output - * - * @buf: printbuf to control - * - * Advance output to the next tabstop by inserting spaces immediately after the - * previous tabstop, right justifying previously outputted text. 
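Concretely: prt_tab() left justifies a field by padding after the cursor, while prt_tab_rjust() right justifies whatever was written since the previous tabstop. A hedged sketch (column widths illustrative):

	struct printbuf buf = PRINTBUF;

	bch2_printbuf_tabstop_push(&buf, 12);	/* first field ends at column 12 */
	bch2_printbuf_tabstop_push(&buf, 10);	/* second field ends at column 22 */

	bch2_prt_printf(&buf, "capacity:");
	bch2_prt_tab(&buf);		/* pad with spaces out to column 12 */
	bch2_prt_printf(&buf, "100");
	bch2_prt_tab_rjust(&buf);	/* shift "100" right so it ends at column 22 */

	/* buf.buf: "capacity:          100" - the value right justified */
	bch2_printbuf_exit(&buf);

The same layout can also be written inline as bch2_prt_printf(&buf, "capacity:\t%u\r", 100): __printbuf_do_indent() above translates \t and \r in formatted output into these same two operations.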
- */ -void bch2_prt_tab_rjust(struct printbuf *buf) -{ - if (WARN_ON(!cur_tabstop(buf))) - return; - - __prt_tab_rjust(buf); -} - -/** - * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters - * - * @out: output printbuf - * @str: string to print - * @count: number of bytes to print - * - * The following control characters are handled like so: - * \n: prt_newline newline that obeys current indent level - * \t: prt_tab advance to next tabstop - * \r: prt_tab_rjust advance to next tabstop, with right justification - */ -void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) -{ - unsigned indent_pos = out->pos; - prt_bytes(out, str, count); - printbuf_do_indent(out, indent_pos); -} - -/** - * bch2_prt_human_readable_u64() - Print out a u64 in human readable units - * @out: output printbuf - * @v: integer to print - * - * Units of 2^10 (default) or 10^3 are controlled via @out->si_units - */ -void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) -{ - bch2_printbuf_make_room(out, 10); - unsigned len = string_get_size(v, 1, !out->si_units, - out->buf + out->pos, - printbuf_remaining_size(out)); - printbuf_advance_pos(out, len); -} - -/** - * bch2_prt_human_readable_s64() - Print out a s64 in human readable units - * @out: output printbuf - * @v: integer to print - * - * Units of 2^10 (default) or 10^3 are controlled via @out->si_units - */ -void bch2_prt_human_readable_s64(struct printbuf *out, s64 v) -{ - if (v < 0) - prt_char(out, '-'); - bch2_prt_human_readable_u64(out, abs(v)); -} - -/** - * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options - * @out: output printbuf - * @v: integer to print - * - * Units are either raw (default), or human readable units (controlled via - * @buf->human_readable_units) - */ -void bch2_prt_units_u64(struct printbuf *out, u64 v) -{ - if (out->human_readable_units) - bch2_prt_human_readable_u64(out, v); - else - bch2_prt_printf(out, "%llu", v); -} - -/** - * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options - * @out: output printbuf - * @v: integer to print - * - * Units are either raw (default), or human readable units (controlled via - * @buf->human_readable_units) - */ -void bch2_prt_units_s64(struct printbuf *out, s64 v) -{ - if (v < 0) - prt_char(out, '-'); - bch2_prt_units_u64(out, abs(v)); -} - -void bch2_prt_string_option(struct printbuf *out, - const char * const list[], - size_t selected) -{ - for (size_t i = 0; list[i]; i++) - bch2_prt_printf(out, i == selected ? 
"[%s] " : "%s ", list[i]); -} - -void bch2_prt_bitflags(struct printbuf *out, - const char * const list[], u64 flags) -{ - unsigned bit, nr = 0; - bool first = true; - - while (list[nr]) - nr++; - - while (flags && (bit = __ffs64(flags)) < nr) { - if (!first) - bch2_prt_printf(out, ","); - first = false; - bch2_prt_printf(out, "%s", list[bit]); - flags ^= BIT_ULL(bit); - } -} - -void bch2_prt_bitflags_vector(struct printbuf *out, - const char * const list[], - unsigned long *v, unsigned nr) -{ - bool first = true; - unsigned i; - - for (i = 0; i < nr; i++) - if (!list[i]) { - nr = i - 1; - break; - } - - for_each_set_bit(i, v, nr) { - if (!first) - bch2_prt_printf(out, ","); - first = false; - bch2_prt_printf(out, "%s", list[i]); - } -} diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h deleted file mode 100644 index 8f4e28d440ac3f..00000000000000 --- a/fs/bcachefs/printbuf.h +++ /dev/null @@ -1,298 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1+ */ -/* Copyright (C) 2022 Kent Overstreet */ - -#ifndef _BCACHEFS_PRINTBUF_H -#define _BCACHEFS_PRINTBUF_H - -/* - * Printbufs: Simple strings for printing to, with optional heap allocation - * - * This code has provisions for use in userspace, to aid in making other code - * portable between kernelspace and userspace. - * - * Basic example: - * struct printbuf buf = PRINTBUF; - * - * prt_printf(&buf, "foo="); - * foo_to_text(&buf, foo); - * printk("%s", buf.buf); - * printbuf_exit(&buf); - * - * Or - * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size) - * - * We can now write pretty printers instead of writing code that dumps - * everything to the kernel log buffer, and then those pretty-printers can be - * used by other code that outputs to kernel log, sysfs, debugfs, etc. - * - * Memory allocation: Outputing to a printbuf may allocate memory. This - * allocation is done with GFP_KERNEL, by default: use the newer - * memalloc_*_(save|restore) functions as needed. - * - * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations - * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. - * - * It's allowed to grab the output buffer and free it later with kfree() instead - * of using printbuf_exit(), if the user just needs a heap allocated string at - * the end. - * - * Memory allocation failures: We don't return errors directly, because on - * memory allocation failure we usually don't want to bail out and unwind - we - * want to print what we've got, on a best-effort basis. But code that does want - * to return -ENOMEM may check printbuf.allocation_failure. - * - * Indenting, tabstops: - * - * To aid is writing multi-line pretty printers spread across multiple - * functions, printbufs track the current indent level. - * - * printbuf_indent_push() and printbuf_indent_pop() increase and decrease the current indent - * level, respectively. - * - * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from - * start of line. Once set, prt_tab() will output spaces up to the next tabstop. - * prt_tab_rjust() will also advance the current line of text up to the next - * tabstop, but it does so by shifting text since the previous tabstop up to the - * next tabstop - right justifying it. - * - * Make sure you use prt_newline() instead of \n in the format string for indent - * level and tabstops to work corretly. - * - * Output units: printbuf->units exists to tell pretty-printers how to output - * numbers: a raw value (e.g. 
directly from a superblock field), as bytes, or as - * human readable bytes. prt_units() obeys it. - */ - -#include -#include - -enum printbuf_si { - PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ - PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ -}; - -#define PRINTBUF_INLINE_TABSTOPS 6 - -struct printbuf { - char *buf; - unsigned size; - unsigned pos; - unsigned last_newline; - unsigned last_field; - unsigned indent; - /* - * If nonzero, allocations will be done with GFP_ATOMIC: - */ - u8 atomic; - bool allocation_failure:1; - bool heap_allocated:1; - bool overflow:1; - enum printbuf_si si_units:1; - bool human_readable_units:1; - bool has_indent_or_tabstops:1; - bool suppress_indent_tabstop_handling:1; - u8 nr_tabstops; - - /* - * Do not modify directly: use printbuf_tabstop_add(), - * printbuf_tabstop_get() - */ - u8 cur_tabstop; - u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; -}; - -int bch2_printbuf_make_room(struct printbuf *, unsigned); -__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...); -__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list); -const char *bch2_printbuf_str(const struct printbuf *); -void bch2_printbuf_exit(struct printbuf *); - -void bch2_printbuf_tabstops_reset(struct printbuf *); -void bch2_printbuf_tabstop_pop(struct printbuf *); -int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); - -void bch2_printbuf_indent_add(struct printbuf *, unsigned); -void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned); -void bch2_printbuf_indent_sub(struct printbuf *, unsigned); - -void bch2_prt_newline(struct printbuf *); -void bch2_printbuf_strip_trailing_newline(struct printbuf *); -void bch2_prt_tab(struct printbuf *); -void bch2_prt_tab_rjust(struct printbuf *); - -void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned); -void bch2_prt_human_readable_u64(struct printbuf *, u64); -void bch2_prt_human_readable_s64(struct printbuf *, s64); -void bch2_prt_units_u64(struct printbuf *, u64); -void bch2_prt_units_s64(struct printbuf *, s64); -void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); -void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); -void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], - unsigned long *, unsigned); - -/* Initializer for a heap allocated printbuf: */ -#define PRINTBUF ((struct printbuf) { .heap_allocated = true }) - -/* Initializer a printbuf that points to an external buffer: */ -#define PRINTBUF_EXTERN(_buf, _size) \ -((struct printbuf) { \ - .buf = _buf, \ - .size = _size, \ -}) - -static inline struct printbuf bch2_printbuf_init(void) -{ - return PRINTBUF; -} - -DEFINE_CLASS(printbuf, struct printbuf, - bch2_printbuf_exit(&_T), bch2_printbuf_init(), void) - -/* - * Returns size remaining of output buffer: - */ -static inline unsigned printbuf_remaining_size(struct printbuf *out) -{ - if (WARN_ON(out->size && out->pos >= out->size)) - out->pos = out->size - 1; - return out->size - out->pos; -} - -/* - * Returns number of characters we can print to the output buffer - i.e. - * excluding the terminating nul: - */ -static inline unsigned printbuf_remaining(struct printbuf *out) -{ - return out->size ? printbuf_remaining_size(out) - 1 : 0; -} - -static inline unsigned printbuf_written(struct printbuf *out) -{ - return out->size ? 
min(out->pos, out->size - 1) : 0; -} - -static inline void printbuf_nul_terminate_reserved(struct printbuf *out) -{ - if (WARN_ON(out->size && out->pos >= out->size)) - out->pos = out->size - 1; - if (out->size) - out->buf[out->pos] = 0; -} - -static inline void printbuf_nul_terminate(struct printbuf *out) -{ - bch2_printbuf_make_room(out, 1); - printbuf_nul_terminate_reserved(out); -} - -/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ -static inline void __prt_char_reserved(struct printbuf *out, char c) -{ - if (printbuf_remaining(out)) - out->buf[out->pos++] = c; -} - -/* Doesn't nul terminate: */ -static inline void __prt_char(struct printbuf *out, char c) -{ - bch2_printbuf_make_room(out, 1); - __prt_char_reserved(out, c); -} - -static inline void prt_char(struct printbuf *out, char c) -{ - bch2_printbuf_make_room(out, 2); - __prt_char_reserved(out, c); - printbuf_nul_terminate_reserved(out); -} - -static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) -{ - unsigned can_print = min(n, printbuf_remaining(out)); - - for (unsigned i = 0; i < can_print; i++) - out->buf[out->pos++] = c; -} - -static inline void prt_chars(struct printbuf *out, char c, unsigned n) -{ - bch2_printbuf_make_room(out, n); - __prt_chars_reserved(out, c, n); - printbuf_nul_terminate_reserved(out); -} - -static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) -{ - bch2_printbuf_make_room(out, n); - - unsigned can_print = min(n, printbuf_remaining(out)); - - for (unsigned i = 0; i < can_print; i++) - out->buf[out->pos++] = ((char *) b)[i]; - - printbuf_nul_terminate(out); -} - -static inline void prt_str(struct printbuf *out, const char *str) -{ - prt_bytes(out, str, strlen(str)); -} - -static inline void prt_str_indented(struct printbuf *out, const char *str) -{ - bch2_prt_bytes_indented(out, str, strlen(str)); -} - -static inline void prt_hex_byte(struct printbuf *out, u8 byte) -{ - bch2_printbuf_make_room(out, 3); - __prt_char_reserved(out, hex_asc_hi(byte)); - __prt_char_reserved(out, hex_asc_lo(byte)); - printbuf_nul_terminate_reserved(out); -} - -static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) -{ - bch2_printbuf_make_room(out, 3); - __prt_char_reserved(out, hex_asc_upper_hi(byte)); - __prt_char_reserved(out, hex_asc_upper_lo(byte)); - printbuf_nul_terminate_reserved(out); -} - -static inline void printbuf_reset_keep_tabstops(struct printbuf *buf) -{ - buf->pos = 0; - buf->allocation_failure = 0; - buf->last_newline = 0; - buf->last_field = 0; - buf->indent = 0; - buf->cur_tabstop = 0; -} - -/** - * printbuf_reset - re-use a printbuf without freeing and re-initializing it: - */ -static inline void printbuf_reset(struct printbuf *buf) -{ - printbuf_reset_keep_tabstops(buf); - buf->nr_tabstops = 0; -} - -/** - * printbuf_atomic_inc - mark as entering an atomic section - */ -static inline void printbuf_atomic_inc(struct printbuf *buf) -{ - buf->atomic++; -} - -/** - * printbuf_atomic_inc - mark as leaving an atomic section - */ -static inline void printbuf_atomic_dec(struct printbuf *buf) -{ - buf->atomic--; -} - -#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c deleted file mode 100644 index d09898566abea9..00000000000000 --- a/fs/bcachefs/progress.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bbpos.h" -#include "disk_accounting.h" -#include "progress.h" - -void bch2_progress_init(struct progress_indicator_state *s, - 
struct bch_fs *c, - u64 btree_id_mask) -{ - memset(s, 0, sizeof(*s)); - - s->next_print = jiffies + HZ * 10; - - for (unsigned i = 0; i < BTREE_ID_NR; i++) { - if (!(btree_id_mask & BIT_ULL(i))) - continue; - - struct disk_accounting_pos acc; - disk_accounting_key_init(acc, btree, .id = i); - - u64 v; - bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); - s->nodes_total += div64_ul(v, btree_sectors(c)); - } -} - -static inline bool progress_update_p(struct progress_indicator_state *s) -{ - bool ret = time_after_eq(jiffies, s->next_print); - - if (ret) - s->next_print = jiffies + HZ * 10; - return ret; -} - -void bch2_progress_update_iter(struct btree_trans *trans, - struct progress_indicator_state *s, - struct btree_iter *iter, - const char *msg) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(btree_iter_path(trans, iter))->b; - - s->nodes_seen += b != s->last_node; - s->last_node = b; - - if (progress_update_p(s)) { - struct printbuf buf = PRINTBUF; - unsigned percent = s->nodes_total - ? div64_u64(s->nodes_seen * 100, s->nodes_total) - : 0; - - prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", - msg, percent, s->nodes_seen, s->nodes_total); - bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } -} diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h deleted file mode 100644 index 23fb1811f9436f..00000000000000 --- a/fs/bcachefs/progress.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_PROGRESS_H -#define _BCACHEFS_PROGRESS_H - -/* - * Lame progress indicators - * - * We don't like to use these because they print to the dmesg console, which is - * spammy - we much prefer to be wired up to a userspace program (e.g. via - * thread_with_file) and have it print the progress indicator. - * - * But some code is old and doesn't support that, or runs in a context where - * that's not yet practical (mount). 
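Typical usage is just two calls; roughly the following, assuming a filesystem c, a btree_trans trans and a btree_iter iter are already in scope (the btree mask is illustrative):

	struct progress_indicator_state progress;

	bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents));

	/*
	 * Then once per visited key inside the btree walk; output is rate
	 * limited to one line every 10 seconds by progress_update_p():
	 */
	bch2_progress_update_iter(trans, &progress, &iter, "checking extents");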
- */ - -struct progress_indicator_state { - unsigned long next_print; - u64 nodes_seen; - u64 nodes_total; - struct btree *last_node; -}; - -void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64); -void bch2_progress_update_iter(struct btree_trans *, - struct progress_indicator_state *, - struct btree_iter *, - const char *); - -#endif /* _BCACHEFS_PROGRESS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c deleted file mode 100644 index f241efb1fb5070..00000000000000 --- a/fs/bcachefs/quota.c +++ /dev/null @@ -1,892 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "btree_update.h" -#include "errcode.h" -#include "error.h" -#include "inode.h" -#include "quota.h" -#include "snapshot.h" -#include "super-io.h" - -static const char * const bch2_quota_types[] = { - "user", - "group", - "project", -}; - -static const char * const bch2_quota_counters[] = { - "space", - "inodes", -}; - -static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_quota *q = field_to_type(f, quota); - - if (vstruct_bytes(&q->field) < sizeof(*q)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&q->field), sizeof(*q)); - return -BCH_ERR_invalid_sb_quota; - } - - return 0; -} - -static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_quota *q = field_to_type(f, quota); - unsigned qtyp, counter; - - for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { - prt_printf(out, "%s: flags %llx", - bch2_quota_types[qtyp], - le64_to_cpu(q->q[qtyp].flags)); - - for (counter = 0; counter < Q_COUNTERS; counter++) - prt_printf(out, " %s timelimit %u warnlimit %u", - bch2_quota_counters[counter], - le32_to_cpu(q->q[qtyp].c[counter].timelimit), - le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); - - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_quota = { - .validate = bch2_sb_quota_validate, - .to_text = bch2_sb_quota_to_text, -}; - -int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, - c, quota_type_invalid, - "invalid quota type (%llu >= %u)", - k.k->p.inode, QTYP_NR); -fsck_err: - return ret; -} - -void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); - unsigned i; - - for (i = 0; i < Q_COUNTERS; i++) - prt_printf(out, "%s hardlimit %llu softlimit %llu", - bch2_quota_counters[i], - le64_to_cpu(dq.v->c[i].hardlimit), - le64_to_cpu(dq.v->c[i].softlimit)); -} - -#ifdef CONFIG_BCACHEFS_QUOTA - -#include -#include -#include - -static void qc_info_to_text(struct printbuf *out, struct qc_info *i) -{ - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 20); - - prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask); - prt_printf(out, "i_flags\t%u\n", i->i_flags); - prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit); - prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit); - prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit); - prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit); - prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit); - prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit); -} - -static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) -{ - printbuf_tabstops_reset(out); - 
printbuf_tabstop_push(out, 20);
-
-	prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask);
-	prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit);
-	prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit);
-	prt_printf(out, "d_ino_hardlimit\t%llu\n", q->d_ino_hardlimit);
-	prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit);
-	prt_printf(out, "d_space\t%llu\n", q->d_space);
-	prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count);
-	prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer);
-	prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer);
-	prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns);
-	prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns);
-}
-
-static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
-{
-	qtypes >>= i;
-	return qtypes ? i + __ffs(qtypes) : QTYP_NR;
-}
-
-#define for_each_set_qtype(_c, _i, _q, _qtypes)				\
-	for (_i = 0;							\
-	     (_i = __next_qtype(_i, _qtypes),				\
-	      _q = &(_c)->quotas[_i],					\
-	      _i < QTYP_NR);						\
-	     _i++)
-
-static bool ignore_hardlimit(struct bch_memquota_type *q)
-{
-	if (capable(CAP_SYS_RESOURCE))
-		return true;
-#if 0
-	struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
-
-	return capable(CAP_SYS_RESOURCE) &&
-		(info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
-		 !(info->dqi_flags & DQF_ROOT_SQUASH));
-#endif
-	return false;
-}
-
-enum quota_msg {
-	SOFTWARN,	/* Softlimit reached */
-	SOFTLONGWARN,	/* Grace time expired */
-	HARDWARN,	/* Hardlimit reached */
-
-	HARDBELOW,	/* Usage got below inode hardlimit */
-	SOFTBELOW,	/* Usage got below inode softlimit */
-};
-
-static int quota_nl[][Q_COUNTERS] = {
-	[HARDWARN][Q_SPC]	= QUOTA_NL_BHARDWARN,
-	[SOFTLONGWARN][Q_SPC]	= QUOTA_NL_BSOFTLONGWARN,
-	[SOFTWARN][Q_SPC]	= QUOTA_NL_BSOFTWARN,
-	[HARDBELOW][Q_SPC]	= QUOTA_NL_BHARDBELOW,
-	[SOFTBELOW][Q_SPC]	= QUOTA_NL_BSOFTBELOW,
-
-	[HARDWARN][Q_INO]	= QUOTA_NL_IHARDWARN,
-	[SOFTLONGWARN][Q_INO]	= QUOTA_NL_ISOFTLONGWARN,
-	[SOFTWARN][Q_INO]	= QUOTA_NL_ISOFTWARN,
-	[HARDBELOW][Q_INO]	= QUOTA_NL_IHARDBELOW,
-	[SOFTBELOW][Q_INO]	= QUOTA_NL_ISOFTBELOW,
-};
-
-struct quota_msgs {
-	u8 nr;
-	struct {
-		u8 qtype;
-		u8 msg;
-	} m[QTYP_NR * Q_COUNTERS];
-};
-
-static void prepare_msg(unsigned qtype,
-			enum quota_counters counter,
-			struct quota_msgs *msgs,
-			enum quota_msg msg_type)
-{
-	BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
-
-	msgs->m[msgs->nr].qtype = qtype;
-	msgs->m[msgs->nr].msg = quota_nl[msg_type][counter];
-	msgs->nr++;
-}
-
-static void prepare_warning(struct memquota_counter *qc,
-			    unsigned qtype,
-			    enum quota_counters counter,
-			    struct quota_msgs *msgs,
-			    enum quota_msg msg_type)
-{
-	if (qc->warning_issued & (1 << msg_type))
-		return;
-
-	prepare_msg(qtype, counter, msgs, msg_type);
-}
-
-static void flush_warnings(struct bch_qid qid,
-			   struct super_block *sb,
-			   struct quota_msgs *msgs)
-{
-	unsigned i;
-
-	for (i = 0; i < msgs->nr; i++)
-		quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
-				   sb->s_dev, msgs->m[i].msg);
-}
-
-static int bch2_quota_check_limit(struct bch_fs *c,
-				  unsigned qtype,
-				  struct bch_memquota *mq,
-				  struct quota_msgs *msgs,
-				  enum quota_counters counter,
-				  s64 v,
-				  enum quota_acct_mode mode)
-{
-	struct bch_memquota_type *q = &c->quotas[qtype];
-	struct memquota_counter *qc = &mq->c[counter];
-	u64 n = qc->v + v;
-
-	BUG_ON((s64) n < 0);
-
-	if (mode == KEY_TYPE_QUOTA_NOCHECK)
-		return 0;
-
-	if (v <= 0) {
-		if (n < qc->hardlimit &&
-		    (qc->warning_issued & (1 << HARDWARN))) {
-			qc->warning_issued &= ~(1 << HARDWARN);
-			prepare_msg(qtype, counter,
msgs, HARDBELOW); - } - - if (n < qc->softlimit && - (qc->warning_issued & (1 << SOFTWARN))) { - qc->warning_issued &= ~(1 << SOFTWARN); - prepare_msg(qtype, counter, msgs, SOFTBELOW); - } - - qc->warning_issued = 0; - return 0; - } - - if (qc->hardlimit && - qc->hardlimit < n && - !ignore_hardlimit(q)) { - prepare_warning(qc, qtype, counter, msgs, HARDWARN); - return -EDQUOT; - } - - if (qc->softlimit && - qc->softlimit < n) { - if (qc->timer == 0) { - qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit; - prepare_warning(qc, qtype, counter, msgs, SOFTWARN); - } else if (ktime_get_real_seconds() >= qc->timer && - !ignore_hardlimit(q)) { - prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); - return -EDQUOT; - } - } - - return 0; -} - -int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, - enum quota_counters counter, s64 v, - enum quota_acct_mode mode) -{ - unsigned qtypes = enabled_qtypes(c); - struct bch_memquota_type *q; - struct bch_memquota *mq[QTYP_NR]; - struct quota_msgs msgs; - unsigned i; - int ret = 0; - - memset(&msgs, 0, sizeof(msgs)); - - for_each_set_qtype(c, i, q, qtypes) { - mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL); - if (!mq[i]) - return -ENOMEM; - } - - for_each_set_qtype(c, i, q, qtypes) - mutex_lock_nested(&q->lock, i); - - for_each_set_qtype(c, i, q, qtypes) { - ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); - if (ret) - goto err; - } - - for_each_set_qtype(c, i, q, qtypes) - mq[i]->c[counter].v += v; -err: - for_each_set_qtype(c, i, q, qtypes) - mutex_unlock(&q->lock); - - flush_warnings(qid, c->vfs_sb, &msgs); - - return ret; -} - -static void __bch2_quota_transfer(struct bch_memquota *src_q, - struct bch_memquota *dst_q, - enum quota_counters counter, s64 v) -{ - BUG_ON(v > src_q->c[counter].v); - BUG_ON(v + dst_q->c[counter].v < v); - - src_q->c[counter].v -= v; - dst_q->c[counter].v += v; -} - -int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, - struct bch_qid dst, - struct bch_qid src, u64 space, - enum quota_acct_mode mode) -{ - struct bch_memquota_type *q; - struct bch_memquota *src_q[3], *dst_q[3]; - struct quota_msgs msgs; - unsigned i; - int ret = 0; - - qtypes &= enabled_qtypes(c); - - memset(&msgs, 0, sizeof(msgs)); - - for_each_set_qtype(c, i, q, qtypes) { - src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL); - dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL); - if (!src_q[i] || !dst_q[i]) - return -ENOMEM; - } - - for_each_set_qtype(c, i, q, qtypes) - mutex_lock_nested(&q->lock, i); - - for_each_set_qtype(c, i, q, qtypes) { - ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, - dst_q[i]->c[Q_SPC].v + space, - mode); - if (ret) - goto err; - - ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, - dst_q[i]->c[Q_INO].v + 1, - mode); - if (ret) - goto err; - } - - for_each_set_qtype(c, i, q, qtypes) { - __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); - __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); - } - -err: - for_each_set_qtype(c, i, q, qtypes) - mutex_unlock(&q->lock); - - flush_warnings(dst, c->vfs_sb, &msgs); - - return ret; -} - -static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, - struct qc_dqblk *qdq) -{ - struct bkey_s_c_quota dq; - struct bch_memquota_type *q; - struct bch_memquota *mq; - unsigned i; - - BUG_ON(k.k->p.inode >= QTYP_NR); - - if (!((1U << k.k->p.inode) & enabled_qtypes(c))) - return 0; - - switch (k.k->type) { - case KEY_TYPE_quota: - dq = bkey_s_c_to_quota(k); - q = 
&c->quotas[k.k->p.inode]; - - mutex_lock(&q->lock); - mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); - if (!mq) { - mutex_unlock(&q->lock); - return -ENOMEM; - } - - for (i = 0; i < Q_COUNTERS; i++) { - mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); - mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); - } - - if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) - mq->c[Q_SPC].timer = qdq->d_spc_timer; - if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) - mq->c[Q_SPC].warns = qdq->d_spc_warns; - if (qdq && qdq->d_fieldmask & QC_INO_TIMER) - mq->c[Q_INO].timer = qdq->d_ino_timer; - if (qdq && qdq->d_fieldmask & QC_INO_WARNS) - mq->c[Q_INO].warns = qdq->d_ino_warns; - - mutex_unlock(&q->lock); - } - - return 0; -} - -void bch2_fs_quota_exit(struct bch_fs *c) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->quotas); i++) - genradix_free(&c->quotas[i].table); -} - -void bch2_fs_quota_init(struct bch_fs *c) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->quotas); i++) - mutex_init(&c->quotas[i].lock); -} - -static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) -{ - struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota); - - if (sb_quota) - return sb_quota; - - sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64)); - if (sb_quota) { - unsigned qtype, qc; - - for (qtype = 0; qtype < QTYP_NR; qtype++) - for (qc = 0; qc < Q_COUNTERS; qc++) - sb_quota->q[qtype].c[qc].timelimit = - cpu_to_le32(7 * 24 * 60 * 60); - } - - return sb_quota; -} - -static void bch2_sb_quota_read(struct bch_fs *c) -{ - struct bch_sb_field_quota *sb_quota; - unsigned i, j; - - sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota); - if (!sb_quota) - return; - - for (i = 0; i < QTYP_NR; i++) { - struct bch_memquota_type *q = &c->quotas[i]; - - for (j = 0; j < Q_COUNTERS; j++) { - q->limits[j].timelimit = - le32_to_cpu(sb_quota->q[i].c[j].timelimit); - q->limits[j].warnlimit = - le32_to_cpu(sb_quota->q[i].c[j].warnlimit); - } - } -} - -static int bch2_fs_quota_read_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked u; - struct bch_snapshot_tree s_t; - u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot); - - int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "%s: snapshot tree %u not found", __func__, tree); - if (ret) - return ret; - - if (!s_t.master_subvol) - goto advance; - - ret = bch2_inode_find_by_inum_nowarn_trans(trans, - (subvol_inum) { - le32_to_cpu(s_t.master_subvol), - k.k->p.offset, - }, &u); - /* - * Inode might be deleted in this snapshot - the easiest way to handle - * that is to just skip it here: - */ - if (bch2_err_matches(ret, ENOENT)) - goto advance; - - if (ret) - return ret; - - bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, - KEY_TYPE_QUOTA_NOCHECK); - bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, - KEY_TYPE_QUOTA_NOCHECK); -advance: - bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); - return 0; -} - -int bch2_fs_quota_read(struct bch_fs *c) -{ - - mutex_lock(&c->sb_lock); - struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); - if (!sb_quota) { - mutex_unlock(&c->sb_lock); - return bch_err_throw(c, ENOSPC_sb_quota); - } - - bch2_sb_quota_read(c); - mutex_unlock(&c->sb_lock); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, - BTREE_ITER_prefetch, k, - 
__bch2_quota_set(c, k, NULL)) ?: - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - bch2_fs_quota_read_inode(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -/* Enable/disable/delete quotas for an entire filesystem: */ - -static int bch2_quota_enable(struct super_block *sb, unsigned uflags) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_sb_field_quota *sb_quota; - int ret = 0; - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - /* Accounting must be enabled at mount time: */ - if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) - return -EINVAL; - - /* Can't enable enforcement without accounting: */ - if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) - return -EINVAL; - - if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) - return -EINVAL; - - if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) - return -EINVAL; - - mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); - if (!sb_quota) { - ret = bch_err_throw(c, ENOSPC_sb_quota); - goto unlock; - } - - if (uflags & FS_QUOTA_UDQ_ENFD) - SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); - - if (uflags & FS_QUOTA_GDQ_ENFD) - SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); - - if (uflags & FS_QUOTA_PDQ_ENFD) - SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); - - bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); - - return bch2_err_class(ret); -} - -static int bch2_quota_disable(struct super_block *sb, unsigned uflags) -{ - struct bch_fs *c = sb->s_fs_info; - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - mutex_lock(&c->sb_lock); - if (uflags & FS_QUOTA_UDQ_ENFD) - SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); - - if (uflags & FS_QUOTA_GDQ_ENFD) - SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); - - if (uflags & FS_QUOTA_PDQ_ENFD) - SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; -} - -static int bch2_quota_remove(struct super_block *sb, unsigned uflags) -{ - struct bch_fs *c = sb->s_fs_info; - int ret; - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - if (uflags & FS_USER_QUOTA) { - if (c->opts.usrquota) - return -EINVAL; - - ret = bch2_btree_delete_range(c, BTREE_ID_quotas, - POS(QTYP_USR, 0), - POS(QTYP_USR, U64_MAX), - 0, NULL); - if (ret) - return ret; - } - - if (uflags & FS_GROUP_QUOTA) { - if (c->opts.grpquota) - return -EINVAL; - - ret = bch2_btree_delete_range(c, BTREE_ID_quotas, - POS(QTYP_GRP, 0), - POS(QTYP_GRP, U64_MAX), - 0, NULL); - if (ret) - return ret; - } - - if (uflags & FS_PROJ_QUOTA) { - if (c->opts.prjquota) - return -EINVAL; - - ret = bch2_btree_delete_range(c, BTREE_ID_quotas, - POS(QTYP_PRJ, 0), - POS(QTYP_PRJ, U64_MAX), - 0, NULL); - if (ret) - return ret; - } - - return 0; -} - -/* - * Return quota status information, such as enforcements, quota file inode - * numbers etc. 
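
bch2_quota_check_limit() earlier in this file implements the classic quota
state machine: exceeding the hard limit fails immediately, while exceeding the
soft limit arms a grace timer and only fails once that timer expires. The same
logic reduced to a standalone sketch (illustrative names, not the kernel API):

/*
 * Minimal soft/hard limit check: hard limits fail at once, soft limits
 * start a grace timer and only fail after it has run out.
 */
static int check_limit(u64 usage, u64 soft, u64 hard,
		       s64 *grace_timer, s64 now, s64 grace_period)
{
	if (hard && usage > hard)
		return -EDQUOT;

	if (soft && usage > soft) {
		if (!*grace_timer)
			*grace_timer = now + grace_period;	/* first warning */
		else if (now >= *grace_timer)
			return -EDQUOT;				/* grace expired */
	} else {
		*grace_timer = 0;				/* back under the limit */
	}

	return 0;
}
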
- */ -static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) -{ - struct bch_fs *c = sb->s_fs_info; - unsigned qtypes = enabled_qtypes(c); - unsigned i; - - memset(state, 0, sizeof(*state)); - - for (i = 0; i < QTYP_NR; i++) { - state->s_state[i].flags |= QCI_SYSFILE; - - if (!(qtypes & (1 << i))) - continue; - - state->s_state[i].flags |= QCI_ACCT_ENABLED; - - state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; - state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; - - state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; - state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; - } - - return 0; -} - -/* - * Adjust quota timers & warnings - */ -static int bch2_quota_set_info(struct super_block *sb, int type, - struct qc_info *info) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_sb_field_quota *sb_quota; - int ret = 0; - - if (0) { - struct printbuf buf = PRINTBUF; - - qc_info_to_text(&buf, info); - pr_info("setting:\n%s", buf.buf); - printbuf_exit(&buf); - } - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - if (type >= QTYP_NR) - return -EINVAL; - - if (!((1 << type) & enabled_qtypes(c))) - return -ESRCH; - - if (info->i_fieldmask & - ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) - return -EINVAL; - - mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); - if (!sb_quota) { - ret = bch_err_throw(c, ENOSPC_sb_quota); - goto unlock; - } - - if (info->i_fieldmask & QC_SPC_TIMER) - sb_quota->q[type].c[Q_SPC].timelimit = - cpu_to_le32(info->i_spc_timelimit); - - if (info->i_fieldmask & QC_SPC_WARNS) - sb_quota->q[type].c[Q_SPC].warnlimit = - cpu_to_le32(info->i_spc_warnlimit); - - if (info->i_fieldmask & QC_INO_TIMER) - sb_quota->q[type].c[Q_INO].timelimit = - cpu_to_le32(info->i_ino_timelimit); - - if (info->i_fieldmask & QC_INO_WARNS) - sb_quota->q[type].c[Q_INO].warnlimit = - cpu_to_le32(info->i_ino_warnlimit); - - bch2_sb_quota_read(c); - - bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); - - return bch2_err_class(ret); -} - -/* Get/set individual quotas: */ - -static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) -{ - dst->d_space = src->c[Q_SPC].v << 9; - dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; - dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; - dst->d_spc_timer = src->c[Q_SPC].timer; - dst->d_spc_warns = src->c[Q_SPC].warns; - - dst->d_ino_count = src->c[Q_INO].v; - dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; - dst->d_ino_softlimit = src->c[Q_INO].softlimit; - dst->d_ino_timer = src->c[Q_INO].timer; - dst->d_ino_warns = src->c[Q_INO].warns; -} - -static int bch2_get_quota(struct super_block *sb, struct kqid kqid, - struct qc_dqblk *qdq) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_memquota_type *q = &c->quotas[kqid.type]; - qid_t qid = from_kqid(&init_user_ns, kqid); - struct bch_memquota *mq; - - memset(qdq, 0, sizeof(*qdq)); - - mutex_lock(&q->lock); - mq = genradix_ptr(&q->table, qid); - if (mq) - __bch2_quota_get(qdq, mq); - mutex_unlock(&q->lock); - - return 0; -} - -static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, - struct qc_dqblk *qdq) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_memquota_type *q = &c->quotas[kqid->type]; - qid_t qid = from_kqid(&init_user_ns, *kqid); - struct genradix_iter iter; - struct bch_memquota *mq; - int ret = 0; - - mutex_lock(&q->lock); - - genradix_for_each_from(&q->table, iter, mq, qid) - if (memcmp(mq, 
page_address(ZERO_PAGE(0)), sizeof(*mq))) { - __bch2_quota_get(qdq, mq); - *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); - goto found; - } - - ret = -ENOENT; -found: - mutex_unlock(&q->lock); - return bch2_err_class(ret); -} - -static int bch2_set_quota_trans(struct btree_trans *trans, - struct bkey_i_quota *new_quota, - struct qc_dqblk *qdq) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_slots|BTREE_ITER_intent); - ret = bkey_err(k); - if (unlikely(ret)) - return ret; - - if (k.k->type == KEY_TYPE_quota) - new_quota->v = *bkey_s_c_to_quota(k).v; - - if (qdq->d_fieldmask & QC_SPC_SOFT) - new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); - if (qdq->d_fieldmask & QC_SPC_HARD) - new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); - - if (qdq->d_fieldmask & QC_INO_SOFT) - new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); - if (qdq->d_fieldmask & QC_INO_HARD) - new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - - ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_set_quota(struct super_block *sb, struct kqid qid, - struct qc_dqblk *qdq) -{ - struct bch_fs *c = sb->s_fs_info; - struct bkey_i_quota new_quota; - int ret; - - if (0) { - struct printbuf buf = PRINTBUF; - - qc_dqblk_to_text(&buf, qdq); - pr_info("setting:\n%s", buf.buf); - printbuf_exit(&buf); - } - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - bkey_quota_init(&new_quota.k_i); - new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_set_quota_trans(trans, &new_quota, qdq)) ?: - __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); - - return bch2_err_class(ret); -} - -const struct quotactl_ops bch2_quotactl_operations = { - .quota_enable = bch2_quota_enable, - .quota_disable = bch2_quota_disable, - .rm_xquota = bch2_quota_remove, - - .get_state = bch2_quota_get_state, - .set_info = bch2_quota_set_info, - - .get_dqblk = bch2_get_quota, - .get_nextdqblk = bch2_get_next_quota, - .set_dqblk = bch2_set_quota, -}; - -#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h deleted file mode 100644 index 1551800ff44c91..00000000000000 --- a/fs/bcachefs/quota.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_QUOTA_H -#define _BCACHEFS_QUOTA_H - -#include "inode.h" -#include "quota_types.h" - -extern const struct bch_sb_field_ops bch_sb_field_ops_quota; - -int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_quota ((struct bkey_ops) { \ - .key_validate = bch2_quota_validate, \ - .val_to_text = bch2_quota_to_text, \ - .min_val_size = 32, \ -}) - -static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) -{ - return (struct bch_qid) { - .q[QTYP_USR] = u->bi_uid, - .q[QTYP_GRP] = u->bi_gid, - .q[QTYP_PRJ] = u->bi_project ? 
u->bi_project - 1 : 0, - }; -} - -static inline unsigned enabled_qtypes(struct bch_fs *c) -{ - return ((c->opts.usrquota << QTYP_USR)| - (c->opts.grpquota << QTYP_GRP)| - (c->opts.prjquota << QTYP_PRJ)); -} - -#ifdef CONFIG_BCACHEFS_QUOTA - -int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, - s64, enum quota_acct_mode); - -int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, - struct bch_qid, u64, enum quota_acct_mode); - -void bch2_fs_quota_exit(struct bch_fs *); -void bch2_fs_quota_init(struct bch_fs *); -int bch2_fs_quota_read(struct bch_fs *); - -extern const struct quotactl_ops bch2_quotactl_operations; - -#else - -static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, - enum quota_counters counter, s64 v, - enum quota_acct_mode mode) -{ - return 0; -} - -static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, - struct bch_qid dst, - struct bch_qid src, u64 space, - enum quota_acct_mode mode) -{ - return 0; -} - -static inline void bch2_fs_quota_exit(struct bch_fs *c) {} -static inline void bch2_fs_quota_init(struct bch_fs *c) {} -static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } - -#endif - -#endif /* _BCACHEFS_QUOTA_H */ diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h deleted file mode 100644 index dc34347ef6c74a..00000000000000 --- a/fs/bcachefs/quota_format.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_QUOTA_FORMAT_H -#define _BCACHEFS_QUOTA_FORMAT_H - -/* KEY_TYPE_quota: */ - -enum quota_types { - QTYP_USR = 0, - QTYP_GRP = 1, - QTYP_PRJ = 2, - QTYP_NR = 3, -}; - -enum quota_counters { - Q_SPC = 0, - Q_INO = 1, - Q_COUNTERS = 2, -}; - -struct bch_quota_counter { - __le64 hardlimit; - __le64 softlimit; -}; - -struct bch_quota { - struct bch_val v; - struct bch_quota_counter c[Q_COUNTERS]; -} __packed __aligned(8); - -/* BCH_SB_FIELD_quota: */ - -struct bch_sb_quota_counter { - __le32 timelimit; - __le32 warnlimit; -}; - -struct bch_sb_quota_type { - __le64 flags; - struct bch_sb_quota_counter c[Q_COUNTERS]; -}; - -struct bch_sb_field_quota { - struct bch_sb_field field; - struct bch_sb_quota_type q[QTYP_NR]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_QUOTA_FORMAT_H */ diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h deleted file mode 100644 index 6a136083d3899d..00000000000000 --- a/fs/bcachefs/quota_types.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_QUOTA_TYPES_H -#define _BCACHEFS_QUOTA_TYPES_H - -#include - -struct bch_qid { - u32 q[QTYP_NR]; -}; - -enum quota_acct_mode { - KEY_TYPE_QUOTA_PREALLOC, - KEY_TYPE_QUOTA_WARN, - KEY_TYPE_QUOTA_NOCHECK, -}; - -struct memquota_counter { - u64 v; - u64 hardlimit; - u64 softlimit; - s64 timer; - int warns; - int warning_issued; -}; - -struct bch_memquota { - struct memquota_counter c[Q_COUNTERS]; -}; - -typedef GENRADIX(struct bch_memquota) bch_memquota_table; - -struct quota_limit { - u32 timelimit; - u32 warnlimit; -}; - -struct bch_memquota_type { - struct quota_limit limits[Q_COUNTERS]; - bch_memquota_table table; - struct mutex lock; -}; - -#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c deleted file mode 100644 index b1438be9d69088..00000000000000 --- a/fs/bcachefs/rcu_pending.c +++ /dev/null @@ -1,666 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define pr_fmt(fmt) "%s() " fmt "\n", __func__ - -#include -#include -#include -#include -#include 
-#include - -#include "rcu_pending.h" -#include "darray.h" -#include "util.h" - -#define static_array_for_each(_a, _i) \ - for (typeof(&(_a)[0]) _i = _a; \ - _i < (_a) + ARRAY_SIZE(_a); \ - _i++) - -enum rcu_pending_special { - RCU_PENDING_KVFREE = 1, - RCU_PENDING_CALL_RCU = 2, -}; - -#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) -#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) - -#ifdef __KERNEL__ -typedef unsigned long rcu_gp_poll_state_t; - -static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) -{ - return l == r; -} -#else -typedef struct urcu_gp_poll_state rcu_gp_poll_state_t; - -static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) -{ - return l.grace_period_id == r.grace_period_id; -} -#endif - -static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp) -{ - return ssp - ? get_state_synchronize_srcu(ssp) - : get_state_synchronize_rcu(); -} - -static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp) -{ - return ssp - ? start_poll_synchronize_srcu(ssp) - : start_poll_synchronize_rcu(); -} - -static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie) -{ - return ssp - ? poll_state_synchronize_srcu(ssp, cookie) - : poll_state_synchronize_rcu(cookie); -} - -static inline void __rcu_barrier(struct srcu_struct *ssp) -{ - return ssp - ? srcu_barrier(ssp) - : rcu_barrier(); -} - -static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp, - rcu_callback_t func) -{ - if (ssp) - call_srcu(ssp, rhp, func); - else - call_rcu(rhp, func); -} - -struct rcu_pending_seq { - /* - * We're using a radix tree like a vector - we're just pushing elements - * onto the end; we're using a radix tree instead of an actual vector to - * avoid reallocation overhead - */ - GENRADIX(struct rcu_head *) objs; - size_t nr; - struct rcu_head **cursor; - rcu_gp_poll_state_t seq; -}; - -struct rcu_pending_list { - struct rcu_head *head; - struct rcu_head *tail; - rcu_gp_poll_state_t seq; -}; - -struct rcu_pending_pcpu { - struct rcu_pending *parent; - spinlock_t lock; - int cpu; - - /* - * We can't bound the number of unprocessed gp sequence numbers, and we - * can't efficiently merge radix trees for expired grace periods, so we - * need darray/vector: - */ - DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs; - - /* Third entry is for expired objects: */ - struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1]; - - struct rcu_head cb; - bool cb_armed; - struct work_struct work; -}; - -static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p) -{ - if (p->objs.nr) - return true; - - static_array_for_each(p->lists, i) - if (i->head) - return true; - - return false; -} - -static void rcu_pending_list_merge(struct rcu_pending_list *l1, - struct rcu_pending_list *l2) -{ -#ifdef __KERNEL__ - if (!l1->head) - l1->head = l2->head; - else - l1->tail->next = l2->head; -#else - if (!l1->head) - l1->head = l2->head; - else - l1->tail->next.next = (void *) l2->head; -#endif - - l1->tail = l2->tail; - l2->head = l2->tail = NULL; -} - -static void rcu_pending_list_add(struct rcu_pending_list *l, - struct rcu_head *n) -{ -#ifdef __KERNEL__ - if (!l->head) - l->head = n; - else - l->tail->next = n; - l->tail = n; - n->next = NULL; -#else - if (!l->head) - l->head = n; - else - l->tail->next.next = (void *) n; - l->tail = n; - n->next.next = NULL; -#endif -} - 
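
The __get_state/__start_poll/__poll_state wrappers above exist so that one
implementation can sit on either plain RCU or SRCU. The underlying polled
grace-period API works as in this minimal plain-RCU sketch (the deferred_free
structure and helpers are illustrative only):

struct deferred_free {
	unsigned long	gp_cookie;	/* from get_state_synchronize_rcu() */
	void		*ptr;
};

static void defer_free(struct deferred_free *d, void *ptr)
{
	d->ptr = ptr;
	/* cheap: records the current grace-period state, starts nothing */
	d->gp_cookie = get_state_synchronize_rcu();
}

static bool try_free(struct deferred_free *d)
{
	/* true once a full grace period has elapsed since the cookie was taken */
	if (!poll_state_synchronize_rcu(d->gp_cookie))
		return false;

	kvfree(d->ptr);
	return true;
}
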
-static void merge_expired_lists(struct rcu_pending_pcpu *p) -{ - struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; - - for (struct rcu_pending_list *i = p->lists; i < expired; i++) - if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq)) - rcu_pending_list_merge(expired, i); -} - -#ifndef __KERNEL__ -static inline void kfree_bulk(size_t nr, void ** p) -{ - while (nr--) - kfree(*p); -} -#endif - -static noinline void __process_finished_items(struct rcu_pending *pending, - struct rcu_pending_pcpu *p, - unsigned long flags) -{ - struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; - struct rcu_pending_seq objs = {}; - struct rcu_head *list = NULL; - - if (p->objs.nr && - __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) { - objs = p->objs.data[0]; - darray_remove_item(&p->objs, p->objs.data); - } - - merge_expired_lists(p); - - list = expired->head; - expired->head = expired->tail = NULL; - - spin_unlock_irqrestore(&p->lock, flags); - - switch ((ulong) pending->process) { - case RCU_PENDING_KVFREE: - for (size_t i = 0; i < objs.nr; ) { - size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i); - - kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i)); - i += nr_this_node; - } - genradix_free(&objs.objs); - - while (list) { - struct rcu_head *obj = list; -#ifdef __KERNEL__ - list = obj->next; -#else - list = (void *) obj->next.next; -#endif - - /* - * low bit of pointer indicates whether rcu_head needs - * to be freed - kvfree_rcu_mightsleep() - */ - BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0); - - void *ptr = (void *)(((unsigned long) obj->func) & ~1UL); - bool free_head = ((unsigned long) obj->func) & 1UL; - - kvfree(ptr); - if (free_head) - kfree(obj); - } - - break; - - case RCU_PENDING_CALL_RCU: - for (size_t i = 0; i < objs.nr; i++) { - struct rcu_head *obj = *genradix_ptr(&objs.objs, i); - obj->func(obj); - } - genradix_free(&objs.objs); - - while (list) { - struct rcu_head *obj = list; -#ifdef __KERNEL__ - list = obj->next; -#else - list = (void *) obj->next.next; -#endif - obj->func(obj); - } - break; - - default: - for (size_t i = 0; i < objs.nr; i++) - pending->process(pending, *genradix_ptr(&objs.objs, i)); - genradix_free(&objs.objs); - - while (list) { - struct rcu_head *obj = list; -#ifdef __KERNEL__ - list = obj->next; -#else - list = (void *) obj->next.next; -#endif - pending->process(pending, obj); - } - break; - } -} - -static bool process_finished_items(struct rcu_pending *pending, - struct rcu_pending_pcpu *p, - unsigned long flags) -{ - /* - * XXX: we should grab the gp seq once and avoid multiple function - * calls, this is called from __rcu_pending_enqueue() fastpath in - * may_sleep==true mode - */ - if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) || - (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) || - (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) || - p->lists[2].head) { - __process_finished_items(pending, p, flags); - return true; - } - - return false; -} - -static void rcu_pending_work(struct work_struct *work) -{ - struct rcu_pending_pcpu *p = - container_of(work, struct rcu_pending_pcpu, work); - struct rcu_pending *pending = p->parent; - unsigned long flags; - - do { - spin_lock_irqsave(&p->lock, flags); - } while (process_finished_items(pending, p, flags)); - - spin_unlock_irqrestore(&p->lock, flags); -} - -static void rcu_pending_rcu_cb(struct 
rcu_head *rcu)
-{
-	struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
-
-	schedule_work_on(p->cpu, &p->work);
-
-	unsigned long flags;
-	spin_lock_irqsave(&p->lock, flags);
-	if (__rcu_pending_has_pending(p)) {
-		spin_unlock_irqrestore(&p->lock, flags);
-		__call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
-	} else {
-		p->cb_armed = false;
-		spin_unlock_irqrestore(&p->lock, flags);
-	}
-}
-
-static __always_inline struct rcu_pending_seq *
-get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq)
-{
-	darray_for_each_reverse(p->objs, objs)
-		if (rcu_gp_poll_cookie_eq(objs->seq, seq))
-			return objs;
-
-	if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
-		return NULL;
-
-	return &darray_last(p->objs);
-}
-
-static noinline bool
-rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq,
-			 struct rcu_head *head, void *ptr,
-			 unsigned long *flags)
-{
-	if (ptr) {
-		if (!head) {
-			/*
-			 * kvfree_rcu_mightsleep(): we weren't passed an
-			 * rcu_head, but we need one: use the low bit of the
-			 * pointer to free to flag that the head needs to be
-			 * freed as well:
-			 */
-			ptr = (void *)(((unsigned long) ptr)|1UL);
-			head = kmalloc(sizeof(*head), __GFP_NOWARN);
-			if (!head) {
-				spin_unlock_irqrestore(&p->lock, *flags);
-				head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
-				/*
-				 * dropped lock, did GFP_KERNEL allocation,
-				 * check for gp expiration
-				 */
-				if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
-					kvfree(--ptr);
-					kfree(head);
-					spin_lock_irqsave(&p->lock, *flags);
-					return false;
-				}
-			}
-		}
-
-		head->func = ptr;
-	}
-again:
-	for (struct rcu_pending_list *i = p->lists;
-	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
-		if (rcu_gp_poll_cookie_eq(i->seq, seq)) {
-			rcu_pending_list_add(i, head);
-			return false;
-		}
-	}
-
-	for (struct rcu_pending_list *i = p->lists;
-	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
-		if (!i->head) {
-			i->seq = seq;
-			rcu_pending_list_add(i, head);
-			return true;
-		}
-	}
-
-	merge_expired_lists(p);
-	goto again;
-}
-
-/*
- * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
- * pending->process) once a grace period elapses.
- *
- * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
- * back to a linked list.
- *
- * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
- *   process callback
- *
- * - If @ptr and @head are both not NULL, we're kvfree_rcu()
- *
- * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
- *
- * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
- *   expired items.
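
Spelled out as code, the argument combinations enumerated above look roughly
like this. A sketch only: the two kvfree-style wrappers assume a @pending that
was initialized in kvfree mode, and only the first form corresponds to a
public entry point in this file:

/* generic item: pending->process() will run on it after a grace period */
static void example_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
{
	__rcu_pending_enqueue(pending, obj, NULL, true);
}

/* kvfree_rcu() shape: caller supplies the allocation and its rcu_head */
static void example_kvfree(struct rcu_pending *pending,
			   struct rcu_head *head, void *p)
{
	__rcu_pending_enqueue(pending, head, p, true);
}

/* kvfree_rcu_mightsleep() shape: no rcu_head, one is allocated on the fly */
static void example_kvfree_mightsleep(struct rcu_pending *pending, void *p)
{
	__rcu_pending_enqueue(pending, NULL, p, true);
}
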
- */ -static __always_inline void -__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, - void *ptr, bool may_sleep) -{ - - struct rcu_pending_pcpu *p; - struct rcu_pending_seq *objs; - struct genradix_node *new_node = NULL; - unsigned long flags; - bool start_gp = false; - - BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); - - /* We could technically be scheduled before taking the lock and end up - * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock - * anyways - * - * And we have to do it this way to avoid breaking PREEMPT_RT, which - * redefines how spinlocks work: - */ - p = raw_cpu_ptr(pending->p); - spin_lock_irqsave(&p->lock, flags); - rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); -restart: - if (may_sleep && - unlikely(process_finished_items(pending, p, flags))) - goto check_expired; - - /* - * In kvfree_rcu() mode, the radix tree is only for slab pointers so - * that we can do kfree_bulk() - vmalloc pointers always use the linked - * list: - */ - if (ptr && unlikely(is_vmalloc_addr(ptr))) - goto list_add; - - objs = get_object_radix(p, seq); - if (unlikely(!objs)) - goto list_add; - - if (unlikely(!objs->cursor)) { - /* - * New radix tree nodes must be added under @p->lock because the - * tree root is in a darray that can be resized (typically, - * genradix supports concurrent unlocked allocation of new - * nodes) - hence preallocation and the retry loop: - */ - objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs, - objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN); - if (unlikely(!objs->cursor)) { - if (may_sleep) { - spin_unlock_irqrestore(&p->lock, flags); - - gfp_t gfp = GFP_KERNEL; - if (!head) - gfp |= __GFP_NOFAIL; - - new_node = genradix_alloc_node(gfp); - if (!new_node) - may_sleep = false; - goto check_expired; - } -list_add: - start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags); - goto start_gp; - } - } - - *objs->cursor++ = ptr ?: head; - /* zero cursor if we hit the end of a radix tree node: */ - if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1))) - objs->cursor = NULL; - start_gp = !objs->nr; - objs->nr++; -start_gp: - if (unlikely(start_gp)) { - /* - * We only have one callback (ideally, we would have one for - * every outstanding graceperiod) - so if our callback is - * already in flight, we may still have to start a grace period - * (since we used get_state() above, not start_poll()) - */ - if (!p->cb_armed) { - p->cb_armed = true; - __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); - } else { - __start_poll_synchronize_rcu(pending->srcu); - } - } - spin_unlock_irqrestore(&p->lock, flags); -free_node: - if (new_node) - genradix_free_node(new_node); - return; -check_expired: - if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) { - switch ((ulong) pending->process) { - case RCU_PENDING_KVFREE: - kvfree(ptr); - break; - case RCU_PENDING_CALL_RCU: - head->func(head); - break; - default: - pending->process(pending, head); - break; - } - goto free_node; - } - - p = raw_cpu_ptr(pending->p); - spin_lock_irqsave(&p->lock, flags); - goto restart; -} - -void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj) -{ - __rcu_pending_enqueue(pending, obj, NULL, true); -} - -static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p) -{ - struct rcu_head *ret = NULL; - - spin_lock_irq(&p->lock); - darray_for_each(p->objs, objs) - if (objs->nr) { - ret = *genradix_ptr(&objs->objs, --objs->nr); - objs->cursor = NULL; - 
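
Both the enqueue path above and the dequeue path below treat the genradix
purely as a growable array: push at index nr, pop from the end. The pattern in
isolation (a sketch; struct obj_vec and its helpers are illustrative):

struct obj_vec {
	GENRADIX(struct rcu_head *)	objs;
	size_t				nr;
};

static bool obj_vec_push(struct obj_vec *v, struct rcu_head *obj)
{
	/* allocates intermediate nodes as needed; NULL on allocation failure */
	struct rcu_head **slot = genradix_ptr_alloc(&v->objs, v->nr, GFP_ATOMIC);

	if (!slot)
		return false;
	*slot = obj;
	v->nr++;
	return true;
}

static struct rcu_head *obj_vec_pop(struct obj_vec *v)
{
	return v->nr ? *genradix_ptr(&v->objs, --v->nr) : NULL;
}
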
if (!objs->nr) - genradix_free(&objs->objs); - goto out; - } - - static_array_for_each(p->lists, i) - if (i->head) { - ret = i->head; -#ifdef __KERNEL__ - i->head = ret->next; -#else - i->head = (void *) ret->next.next; -#endif - if (!i->head) - i->tail = NULL; - goto out; - } -out: - spin_unlock_irq(&p->lock); - - return ret; -} - -struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending) -{ - return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p)); -} - -struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending) -{ - struct rcu_head *ret = rcu_pending_dequeue(pending); - - if (ret) - return ret; - - int cpu; - for_each_possible_cpu(cpu) { - ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu)); - if (ret) - break; - } - return ret; -} - -static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending) -{ - int cpu; - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - spin_lock_irq(&p->lock); - if (__rcu_pending_has_pending(p) || p->cb_armed) { - spin_unlock_irq(&p->lock); - return true; - } - spin_unlock_irq(&p->lock); - } - - return false; -} - -void rcu_pending_exit(struct rcu_pending *pending) -{ - int cpu; - - if (!pending->p) - return; - - while (rcu_pending_has_pending_or_armed(pending)) { - __rcu_barrier(pending->srcu); - - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - flush_work(&p->work); - } - } - - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - flush_work(&p->work); - } - - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - - static_array_for_each(p->lists, i) - WARN_ON(i->head); - WARN_ON(p->objs.nr); - darray_exit(&p->objs); - } - free_percpu(pending->p); -} - -/** - * rcu_pending_init: - initialize a rcu_pending - * - * @pending: Object to init - * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal - * RCU flavor - * @process: Callback function invoked on objects once their RCU barriers - * have completed; if NULL, kvfree() is used. 
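
Putting the API together, a user looks roughly like this (a sketch; struct
widget and its callback are illustrative, not part of this file):

struct widget {
	struct rcu_head	rcu;
	int		payload;
};

static void widget_process(struct rcu_pending *pending, struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct widget, rcu));
}

static int widget_store_init(struct rcu_pending *pending)
{
	/* NULL @srcu selects plain RCU rather than SRCU */
	return rcu_pending_init(pending, NULL, widget_process);
}

static void widget_retire(struct rcu_pending *pending, struct widget *w)
{
	/* widget_process() runs once a grace period has elapsed */
	rcu_pending_enqueue(pending, &w->rcu);
}
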
- */ -int rcu_pending_init(struct rcu_pending *pending, - struct srcu_struct *srcu, - rcu_pending_process_fn process) -{ - pending->p = alloc_percpu(struct rcu_pending_pcpu); - if (!pending->p) - return -ENOMEM; - - int cpu; - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - p->parent = pending; - p->cpu = cpu; - spin_lock_init(&p->lock); - darray_init(&p->objs); - INIT_WORK(&p->work, rcu_pending_work); - } - - pending->srcu = srcu; - pending->process = process; - - return 0; -} diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h deleted file mode 100644 index 71a2f4ddaade48..00000000000000 --- a/fs/bcachefs/rcu_pending.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_RCU_PENDING_H -#define _LINUX_RCU_PENDING_H - -#include - -struct rcu_pending; -typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *); - -struct rcu_pending_pcpu; - -struct rcu_pending { - struct rcu_pending_pcpu __percpu *p; - struct srcu_struct *srcu; - rcu_pending_process_fn process; -}; - -void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj); -struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending); -struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending); - -void rcu_pending_exit(struct rcu_pending *pending); -int rcu_pending_init(struct rcu_pending *pending, - struct srcu_struct *srcu, - rcu_pending_process_fn process); - -#endif /* _LINUX_RCU_PENDING_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c deleted file mode 100644 index 1c345b86b1c007..00000000000000 --- a/fs/bcachefs/rebalance.c +++ /dev/null @@ -1,889 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "clock.h" -#include "compress.h" -#include "disk_groups.h" -#include "errcode.h" -#include "error.h" -#include "inode.h" -#include "io_write.h" -#include "move.h" -#include "rebalance.h" -#include "subvolume.h" -#include "super-io.h" -#include "trace.h" - -#include -#include -#include - -/* bch_extent_rebalance: */ - -static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) -{ - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) - if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) - return &entry->rebalance; - - return NULL; -} - -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) -{ - return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); -} - -static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s_c k, - struct bkey_ptrs_c ptrs) -{ - if (!opts->background_compression) - return 0; - - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) - return 0; - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - return rewrite_ptrs; -} - -static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, - struct bch_io_opts *opts, - struct 
bkey_ptrs_c ptrs) -{ - if (!opts->background_target || - !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) - return 0; - - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - - guard(rcu)(); - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - return rewrite_ptrs; -} - -static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return 0; - - return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | - bch2_bkey_ptrs_need_move(c, opts, ptrs); -} - -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); - if (!opts) - return 0; - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return 0; - - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - u64 sectors = 0; - - if (opts->background_compression) { - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) { - sectors = 0; - goto incompressible; - } - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - sectors += p.crc.compressed_size; - } - } -incompressible: - if (opts->background_target) { - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) - sectors += p.crc.compressed_size; - } - - return sectors; -} - -static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_s_c k) -{ - if (!bkey_extent_is_direct_data(k.k)) - return 0; - - const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - - if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts); - return old == NULL || memcmp(old, &new, sizeof(new)); - } else { - return old != NULL; - } -} - -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_i *_k) -{ - if (!bkey_extent_is_direct_data(&_k->k)) - return 0; - - struct bkey_s k = bkey_i_to_s(_k); - struct bch_extent_rebalance *old = - (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - - if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { - if (!old) { - old = bkey_val_end(k); - k.k->u64s += sizeof(*old) / sizeof(u64); - } - - *old = io_opts_to_rebalance_opts(c, opts); - } else { - if (old) - extent_entry_drop(k, (union bch_extent_entry *) old); - } - - return 0; -} - -int bch2_get_update_rebalance_opts(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *iter, - struct bkey_s_c k) -{ - BUG_ON(iter->flags & BTREE_ITER_is_extents); - BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); - - const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v - ? 
bch2_bkey_rebalance_opts(k) : NULL;
-	if (r) {
-#define x(_name)							\
-		if (r->_name##_from_inode) {				\
-			io_opts->_name = r->_name;			\
-			io_opts->_name##_from_inode = true;		\
-		}
-		BCH_REBALANCE_OPTS()
-#undef x
-	}
-
-	if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
-		return 0;
-
-	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
-	int ret = PTR_ERR_OR_ZERO(n);
-	if (ret)
-		return ret;
-
-	bkey_reassemble(n, k);
-
-	/* On successful transaction commit, @k was invalidated: */
-
-	return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
-		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
-		bch2_trans_commit(trans, NULL, NULL, 0) ?:
-		-BCH_ERR_transaction_restart_nested;
-}
-
-#define REBALANCE_WORK_SCAN_OFFSET	(U64_MAX - 1)
-
-static const char * const bch2_rebalance_state_strs[] = {
-#define x(t) #t,
-	BCH_REBALANCE_STATES()
-	NULL
-#undef x
-};
-
-int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bkey_i_cookie *cookie;
-	u64 v;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
-			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
-			     BTREE_ITER_intent);
-	k = bch2_btree_iter_peek_slot(trans, &iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	v = k.k->type == KEY_TYPE_cookie
-		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
-		: 0;
-
-	cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
-	ret = PTR_ERR_OR_ZERO(cookie);
-	if (ret)
-		goto err;
-
-	bkey_cookie_init(&cookie->k_i);
-	cookie->k.p = iter.pos;
-	cookie->v.cookie = cpu_to_le64(v + 1);
-
-	ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
-{
-	int ret = bch2_trans_commit_do(c, NULL, NULL,
-				       BCH_TRANS_COMMIT_no_enospc,
-				       bch2_set_rebalance_needs_scan_trans(trans, inum));
-	bch2_rebalance_wakeup(c);
-	return ret;
-}
-
-int bch2_set_fs_needs_rebalance(struct bch_fs *c)
-{
-	return bch2_set_rebalance_needs_scan(c, 0);
-}
-
-static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	u64 v;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
-			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
-			     BTREE_ITER_intent);
-	k = bch2_btree_iter_peek_slot(trans, &iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	v = k.k->type == KEY_TYPE_cookie
-		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
-		: 0;
-
-	if (v == cookie)
-		ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
-					    struct btree_iter *work_iter)
-{
-	return !kthread_should_stop()
-		?
bch2_btree_iter_peek(trans, work_iter) - : bkey_s_c_null; -} - -static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) - return 0; - - struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - extent_entry_drop(bkey_i_to_s(n), - (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - -static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, - struct bpos work_pos, - struct btree_iter *extent_iter, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bch_fs *c = trans->c; - - bch2_trans_iter_exit(trans, extent_iter); - bch2_trans_iter_init(trans, extent_iter, - work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, - work_pos, - BTREE_ITER_all_snapshots); - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter); - if (bkey_err(k)) - return k; - - int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); - if (ret) - return bkey_s_c_err(ret); - - memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_only_specified_devs; - - if (!data_opts->rewrite_ptrs) { - /* - * device we would want to write to offline? devices in target - * changed? - * - * We'll now need a full scan before this extent is picked up - * again: - */ - int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); - if (ret) - return bkey_s_c_err(ret); - return bkey_s_c_null; - } - - if (trace_rebalance_extent_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); - if (p) { - prt_str(&buf, "compression="); - bch2_compression_opt_to_text(&buf, io_opts->background_compression); - prt_str(&buf, " "); - bch2_prt_u64_base2(&buf, p); - prt_newline(&buf); - } - - p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); - if (p) { - prt_str(&buf, "move="); - bch2_target_to_text(&buf, c, io_opts->background_target); - prt_str(&buf, " "); - bch2_prt_u64_base2(&buf, p); - prt_newline(&buf); - } - - trace_rebalance_extent(c, buf.buf); - printbuf_exit(&buf); - } - - return k; -} - -noinline_for_stack -static int do_rebalance_extent(struct moving_context *ctxt, - struct bpos work_pos, - struct btree_iter *extent_iter) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &trans->c->rebalance; - struct data_update_opts data_opts; - struct bch_io_opts io_opts; - struct bkey_s_c k; - struct bkey_buf sk; - int ret; - - ctxt->stats = &r->work_stats; - r->state = BCH_REBALANCE_working; - - bch2_bkey_buf_init(&sk); - - ret = bkey_err(k = next_rebalance_extent(trans, work_pos, - extent_iter, &io_opts, &data_opts)); - if (ret || !k.k) - goto out; - - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); - - /* - * The iterator gets unlocked by __bch2_read_extent - need to - * save a copy of @k elsewhere: - */ - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); - if (ret) { - if (bch2_err_matches(ret, ENOMEM)) { - /* memory allocation failure, wait for some IO to 
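
data_opts->rewrite_ptrs, filled in above, is a bitmask with one bit per
pointer in the extent; a set bit marks a replica that should be rewritten. A
simplified sketch of how bch2_bkey_ptrs_need_move() builds such a mask,
flattened to plain arrays for illustration (the real code walks the extent's
pointers under rcu and skips cached replicas):

static unsigned rewrite_mask(struct bch_fs *c, unsigned target,
			     const unsigned *ptr_devs, unsigned nr_ptrs)
{
	unsigned ptr_bit = 1, mask = 0;

	for (unsigned i = 0; i < nr_ptrs; i++) {
		/* mark pointers that sit outside the background target */
		if (!bch2_dev_in_target(c, ptr_devs[i], target))
			mask |= ptr_bit;
		ptr_bit <<= 1;
	}

	return mask;
}
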
finish */ - bch2_move_ctxt_wait_for_io(ctxt); - ret = bch_err_throw(c, transaction_restart_nested); - } - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto out; - - /* skip it and continue, XXX signal failure */ - ret = 0; - } -out: - bch2_bkey_buf_exit(&sk, c); - return ret; -} - -static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &trans->c->rebalance; - - bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - ctxt->stats = &r->scan_stats; - - if (!inum) { - r->scan_start = BBPOS_MIN; - r->scan_end = BBPOS_MAX; - } else { - r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); - r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); - } - - r->state = BCH_REBALANCE_scanning; - - struct per_snapshot_io_opts snapshot_io_opts; - per_snapshot_io_opts_init(&snapshot_io_opts, c); - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, - r->scan_start.pos, r->scan_end.pos, - BTREE_ITER_all_snapshots| - BTREE_ITER_not_extents| - BTREE_ITER_prefetch, k, ({ - ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - - struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, - &snapshot_io_opts, iter.pos, &iter, k); - PTR_ERR_OR_ZERO(io_opts); - })) ?: - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_clear_rebalance_needs_scan(trans, inum, cookie)); - - per_snapshot_io_opts_exit(&snapshot_io_opts); - bch2_move_stats_exit(&r->scan_stats, trans->c); - - /* - * Ensure that the rebalance_work entries we created are seen by the - * next iteration of do_rebalance(), so we don't end up stuck in - * rebalance_wait(): - */ - atomic64_inc(&r->scan_stats.sectors_seen); - bch2_btree_write_buffer_flush_sync(trans); - - return ret; -} - -static void rebalance_wait(struct bch_fs *c) -{ - struct bch_fs_rebalance *r = &c->rebalance; - struct io_clock *clock = &c->io_clock[WRITE]; - u64 now = atomic64_read(&clock->now); - u64 min_member_capacity = bch2_min_rw_member_capacity(c); - - if (min_member_capacity == U64_MAX) - min_member_capacity = 128 * 2048; - - r->wait_iotime_end = now + (min_member_capacity >> 6); - - if (r->state != BCH_REBALANCE_waiting) { - r->wait_iotime_start = now; - r->wait_wallclock_start = ktime_get_real_ns(); - r->state = BCH_REBALANCE_waiting; - } - - bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); -} - -static bool bch2_rebalance_enabled(struct bch_fs *c) -{ - return c->opts.rebalance_enabled && - !(c->opts.rebalance_on_ac_only && - c->rebalance.on_battery); -} - -static int do_rebalance(struct moving_context *ctxt) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &c->rebalance; - struct btree_iter rebalance_work_iter, extent_iter = {}; - struct bkey_s_c k; - u32 kick = r->kick; - int ret = 0; - - bch2_trans_begin(trans); - - bch2_move_stats_init(&r->work_stats, "rebalance_work"); - bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - - bch2_trans_iter_init(trans, &rebalance_work_iter, - BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_all_snapshots); - - while (!bch2_move_ratelimit(ctxt)) { - if (!bch2_rebalance_enabled(c)) { - bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(bch2_rebalance_enabled(c) || - kthread_should_stop()); - } - - if (kthread_should_stop()) - break; - - bch2_trans_begin(trans); - - ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); - if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart))
-			continue;
-		if (ret || !k.k)
-			break;
-
-		ret = k.k->type == KEY_TYPE_cookie
-			? do_rebalance_scan(ctxt, k.k->p.inode,
-					    le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
-			: do_rebalance_extent(ctxt, k.k->p, &extent_iter);
-
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			continue;
-		if (ret)
-			break;
-
-		bch2_btree_iter_advance(trans, &rebalance_work_iter);
-	}
-
-	bch2_trans_iter_exit(trans, &extent_iter);
-	bch2_trans_iter_exit(trans, &rebalance_work_iter);
-	bch2_move_stats_exit(&r->scan_stats, c);
-
-	if (!ret &&
-	    !kthread_should_stop() &&
-	    !atomic64_read(&r->work_stats.sectors_seen) &&
-	    !atomic64_read(&r->scan_stats.sectors_seen) &&
-	    kick == r->kick) {
-		bch2_moving_ctxt_flush_all(ctxt);
-		bch2_trans_unlock_long(trans);
-		rebalance_wait(c);
-	}
-
-	if (!bch2_err_matches(ret, EROFS))
-		bch_err_fn(c, ret);
-	return ret;
-}
-
-static int bch2_rebalance_thread(void *arg)
-{
-	struct bch_fs *c = arg;
-	struct bch_fs_rebalance *r = &c->rebalance;
-	struct moving_context ctxt;
-
-	set_freezable();
-
-	/*
-	 * Data move operations can't run until after check_snapshots has
-	 * completed, and bch2_snapshot_is_ancestor() is available.
-	 */
-	kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
-			       kthread_should_stop());
-
-	bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
-			      writepoint_ptr(&c->rebalance_write_point),
-			      true);
-
-	while (!kthread_should_stop() && !do_rebalance(&ctxt))
-		;
-
-	bch2_moving_ctxt_exit(&ctxt);
-
-	return 0;
-}
-
-void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	printbuf_tabstop_push(out, 32);
-
-	struct bch_fs_rebalance *r = &c->rebalance;
-
-	/* print pending work */
-	struct disk_accounting_pos acc;
-	disk_accounting_key_init(acc, rebalance_work);
-	u64 v;
-	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
-
-	prt_printf(out, "pending work:\t");
-	prt_human_readable_u64(out, v << 9);
-	prt_printf(out, "\n\n");
-
-	prt_str(out, bch2_rebalance_state_strs[r->state]);
-	prt_newline(out);
-	printbuf_indent_add(out, 2);
-
-	switch (r->state) {
-	case BCH_REBALANCE_waiting: {
-		u64 now = atomic64_read(&c->io_clock[WRITE].now);
-
-		prt_printf(out, "io wait duration:\t");
-		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
-		prt_newline(out);
-
-		prt_printf(out, "io wait remaining:\t");
-		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
-		prt_newline(out);
-
-		prt_printf(out, "duration waited:\t");
-		bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
-		prt_newline(out);
-		break;
-	}
-	case BCH_REBALANCE_working:
-		bch2_move_stats_to_text(out, &r->work_stats);
-		break;
-	case BCH_REBALANCE_scanning:
-		bch2_move_stats_to_text(out, &r->scan_stats);
-		break;
-	}
-	prt_newline(out);
-
-	struct task_struct *t;
-	scoped_guard(rcu) {
-		t = rcu_dereference(c->rebalance.thread);
-		if (t)
-			get_task_struct(t);
-	}
-
-	if (t) {
-		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
-		put_task_struct(t);
-	}
-
-	printbuf_indent_sub(out, 2);
-}
-
-void bch2_rebalance_stop(struct bch_fs *c)
-{
-	struct task_struct *p;
-
-	c->rebalance.pd.rate.rate = UINT_MAX;
-	bch2_ratelimit_reset(&c->rebalance.pd.rate);
-
-	p = rcu_dereference_protected(c->rebalance.thread, 1);
-	c->rebalance.thread = NULL;
-
-	if (p) {
-		/* for synchronizing with bch2_rebalance_wakeup() */
-		synchronize_rcu();
-
-		kthread_stop(p);
-		put_task_struct(p);
-	}
-}
-
-int bch2_rebalance_start(struct bch_fs *c)
-{
-	struct
task_struct *p; - int ret; - - if (c->rebalance.thread) - return 0; - - if (c->opts.nochanges) - return 0; - - p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); - ret = PTR_ERR_OR_ZERO(p); - bch_err_msg(c, ret, "creating rebalance thread"); - if (ret) - return ret; - - get_task_struct(p); - rcu_assign_pointer(c->rebalance.thread, p); - wake_up_process(p); - return 0; -} - -#ifdef CONFIG_POWER_SUPPLY -#include - -static int bch2_rebalance_power_notifier(struct notifier_block *nb, - unsigned long event, void *data) -{ - struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier); - - c->rebalance.on_battery = !power_supply_is_system_supplied(); - bch2_rebalance_wakeup(c); - return NOTIFY_OK; -} -#endif - -void bch2_fs_rebalance_exit(struct bch_fs *c) -{ -#ifdef CONFIG_POWER_SUPPLY - power_supply_unreg_notifier(&c->rebalance.power_notifier); -#endif -} - -int bch2_fs_rebalance_init(struct bch_fs *c) -{ - struct bch_fs_rebalance *r = &c->rebalance; - - bch2_pd_controller_init(&r->pd); - -#ifdef CONFIG_POWER_SUPPLY - r->power_notifier.notifier_call = bch2_rebalance_power_notifier; - int ret = power_supply_reg_notifier(&r->power_notifier); - if (ret) - return ret; - - r->on_battery = !power_supply_is_system_supplied(); -#endif - return 0; -} - -static int check_rebalance_work_one(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct btree_iter *rebalance_iter, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c extent_k, rebalance_k; - struct printbuf buf = PRINTBUF; - - int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?: - bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter)); - if (ret) - return ret; - - if (!extent_k.k && - extent_iter->btree_id == BTREE_ID_reflink && - (!rebalance_k.k || - rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { - bch2_trans_iter_exit(trans, extent_iter); - bch2_trans_iter_init(trans, extent_iter, - BTREE_ID_extents, POS_MIN, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots); - return bch_err_throw(c, transaction_restart_nested); - } - - if (!extent_k.k && !rebalance_k.k) - return 1; - - int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, - rebalance_k.k ? 
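
bch2_rebalance_start() above funnels kthread_create()'s return through PTR_ERR_OR_ZERO(), relying on the kernel convention that errors ride in the topmost 4095 pointer values. A self-contained userspace re-implementation of that convention, for illustration only:

	#include <stdint.h>
	#include <stdio.h>

	#define MAX_ERRNO 4095

	static void *ERR_PTR(intptr_t err) { return (void *)err; }
	static intptr_t PTR_ERR(const void *ptr) { return (intptr_t)ptr; }
	static int IS_ERR(const void *ptr)
	{
		/* error pointers occupy the last MAX_ERRNO addresses */
		return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
	}
	static intptr_t PTR_ERR_OR_ZERO(const void *ptr)
	{
		return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
	}

	int main(void)
	{
		void *p = ERR_PTR(-12); /* e.g. -ENOMEM */
		printf("err=%ld\n", (long)PTR_ERR_OR_ZERO(p)); /* prints -12 */
		return 0;
	}
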
rebalance_k.k->p : SPOS_MAX); - - struct bkey deleted; - bkey_init(&deleted); - - if (cmp < 0) { - deleted.p = extent_k.k->p; - rebalance_k.k = &deleted; - } else if (cmp > 0) { - deleted.p = rebalance_k.k->p; - extent_k.k = &deleted; - } - - bool should_have_rebalance = - bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; - bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; - - if (should_have_rebalance != have_rebalance) { - ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed); - if (ret) - return ret; - - bch2_bkey_val_to_text(&buf, c, extent_k); - } - - if (fsck_err_on(!should_have_rebalance && have_rebalance, - trans, rebalance_work_incorrectly_set, - "rebalance work incorrectly set\n%s", buf.buf)) { - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - extent_k.k->p, false); - if (ret) - goto err; - } - - if (fsck_err_on(should_have_rebalance && !have_rebalance, - trans, rebalance_work_incorrectly_unset, - "rebalance work incorrectly unset\n%s", buf.buf)) { - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - extent_k.k->p, true); - if (ret) - goto err; - } - - if (cmp <= 0) - bch2_btree_iter_advance(trans, extent_iter); - if (cmp >= 0) - bch2_btree_iter_advance(trans, rebalance_iter); -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_rebalance_work(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter rebalance_iter, extent_iter; - int ret = 0; - - bch2_trans_iter_init(trans, &extent_iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &rebalance_iter, - BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_prefetch); - - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - while (!ret) { - bch2_trans_begin(trans); - - ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - } - - bch2_bkey_buf_exit(&last_flushed, c); - bch2_trans_iter_exit(trans, &extent_iter); - bch2_trans_iter_exit(trans, &rebalance_iter); - bch2_trans_put(trans); - return ret < 0 ? 
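
check_rebalance_work_one() above is a merge-join: it peeks both btrees, compares positions, and substitutes a synthesized deleted key for whichever side is missing, so the two cursors advance in lockstep. The same shape over two sorted arrays, with illustrative names:

	#include <stdint.h>
	#include <stdio.h>

	/* advance whichever side has the smaller position, treating the
	 * other side as a missing (deleted) key at that position */
	static void reconcile(const uint64_t *a, size_t na,
			      const uint64_t *b, size_t nb)
	{
		size_t i = 0, j = 0;

		while (i < na || j < nb) {
			uint64_t ka = i < na ? a[i] : UINT64_MAX;
			uint64_t kb = j < nb ? b[j] : UINT64_MAX;

			if (ka < kb)
				printf("%llu: only in extents\n", (unsigned long long)ka);
			else if (kb < ka)
				printf("%llu: only in rebalance_work\n", (unsigned long long)kb);
			else
				printf("%llu: in both\n", (unsigned long long)ka);

			/* mirrors: if (cmp <= 0) advance one, if (cmp >= 0) the other */
			if (ka <= kb) i++;
			if (kb <= ka) j++;
		}
	}

	int main(void)
	{
		uint64_t extents[] = { 1, 3, 5 };
		uint64_t work[]    = { 3, 4 };
		reconcile(extents, 3, work, 2);
		return 0;
	}
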
ret : 0; -} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h deleted file mode 100644 index 7a565ea7dbfcc5..00000000000000 --- a/fs/bcachefs/rebalance.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REBALANCE_H -#define _BCACHEFS_REBALANCE_H - -#include "compress.h" -#include "disk_groups.h" -#include "opts.h" -#include "rebalance_types.h" - -static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, - struct bch_io_opts *opts) -{ - struct bch_extent_rebalance r = { - .type = BIT(BCH_EXTENT_ENTRY_rebalance), -#define x(_name) \ - ._name = opts->_name, \ - ._name##_from_inode = opts->_name##_from_inode, - BCH_REBALANCE_OPTS() -#undef x - }; - - if (r.background_target && - !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) - r.background_target = 0; - - return r; -}; - -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); -int bch2_get_update_rebalance_opts(struct btree_trans *, - struct bch_io_opts *, - struct btree_iter *, - struct bkey_s_c); - -int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); -int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); -int bch2_set_fs_needs_rebalance(struct bch_fs *); - -static inline void bch2_rebalance_wakeup(struct bch_fs *c) -{ - c->rebalance.kick++; - guard(rcu)(); - struct task_struct *p = rcu_dereference(c->rebalance.thread); - if (p) - wake_up_process(p); -} - -void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); - -void bch2_rebalance_stop(struct bch_fs *); -int bch2_rebalance_start(struct bch_fs *); - -void bch2_fs_rebalance_exit(struct bch_fs *); -int bch2_fs_rebalance_init(struct bch_fs *); - -int bch2_check_rebalance_work(struct bch_fs *); - -#endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h deleted file mode 100644 index ff9a1342a22b4c..00000000000000 --- a/fs/bcachefs/rebalance_format.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REBALANCE_FORMAT_H -#define _BCACHEFS_REBALANCE_FORMAT_H - -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:3, - - promote_target_from_inode:1, - erasure_code_from_inode:1, - data_checksum_from_inode:1, - background_compression_from_inode:1, - data_replicas_from_inode:1, - background_target_from_inode:1, - - promote_target:16, - erasure_code:1, - data_checksum:4, - data_replicas:4, - background_compression:8, /* enum bch_compression_opt */ - background_target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 background_target:16, - background_compression:8, - data_replicas:4, - data_checksum:4, - erasure_code:1, - promote_target:16, - - background_target_from_inode:1, - data_replicas_from_inode:1, - background_compression_from_inode:1, - data_checksum_from_inode:1, - erasure_code_from_inode:1, - promote_target_from_inode:1, - - unused:3, - type:6; -#endif -}; - -/* subset of BCH_INODE_OPTS */ -#define BCH_REBALANCE_OPTS() \ - x(data_checksum) \ - x(background_compression) \ - x(data_replicas) \ - x(promote_target) \ - x(background_target) \ - x(erasure_code) - -#endif /* _BCACHEFS_REBALANCE_FORMAT_H */ - diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h deleted file mode 100644 index c659da149fa3c4..00000000000000 --- a/fs/bcachefs/rebalance_types.h +++ /dev/null @@ -1,41 +0,0 @@ -/* 
SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REBALANCE_TYPES_H -#define _BCACHEFS_REBALANCE_TYPES_H - -#include "bbpos_types.h" -#include "move_types.h" - -#define BCH_REBALANCE_STATES() \ - x(waiting) \ - x(working) \ - x(scanning) - -enum bch_rebalance_states { -#define x(t) BCH_REBALANCE_##t, - BCH_REBALANCE_STATES() -#undef x -}; - -struct bch_fs_rebalance { - struct task_struct __rcu *thread; - u32 kick; - struct bch_pd_controller pd; - - enum bch_rebalance_states state; - u64 wait_iotime_start; - u64 wait_iotime_end; - u64 wait_wallclock_start; - - struct bch_move_stats work_stats; - - struct bbpos scan_start; - struct bbpos scan_end; - struct bch_move_stats scan_stats; - - bool on_battery; -#ifdef CONFIG_POWER_SUPPLY - struct notifier_block power_notifier; -#endif -}; - -#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c deleted file mode 100644 index c94debb12d2fee..00000000000000 --- a/fs/bcachefs/recovery.c +++ /dev/null @@ -1,1306 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "bkey_buf.h" -#include "btree_journal_iter.h" -#include "btree_node_scan.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "buckets.h" -#include "dirent.h" -#include "disk_accounting.h" -#include "errcode.h" -#include "error.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "logged_ops.h" -#include "move.h" -#include "movinggc.h" -#include "namei.h" -#include "quota.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-clean.h" -#include "sb-downgrade.h" -#include "snapshot.h" -#include "super-io.h" - -#include -#include - -int bch2_btree_lost_data(struct bch_fs *c, - struct printbuf *msg, - enum btree_id btree) -{ - u64 b = BIT_ULL(btree); - int ret = 0; - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - if (!(c->sb.btrees_lost_data & b)) { - prt_printf(msg, "flagging btree "); - bch2_btree_id_to_text(msg, btree); - prt_printf(msg, " lost data\n"); - - ext->btrees_lost_data |= cpu_to_le64(b); - } - - /* Once we have runtime self healing for topology errors we won't need this: */ - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - - /* Btree node accounting will be off: */ - __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; - -#ifdef CONFIG_BCACHEFS_DEBUG - /* - * These are much more minor, and don't need to be corrected right away, - * but in debug mode we want the next fsck run to be clean: - */ - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret; -#endif - - switch (btree) { - case BTREE_ID_alloc: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, 
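
The BCH_REBALANCE_STATES() and BCH_REBALANCE_OPTS() lists above are x-macros: one list expands once into an enum and again into a matching string table, so the two can never drift apart. A minimal standalone example of the pattern:

	#include <stdio.h>

	#define STATES() \
		x(waiting)  \
		x(working)  \
		x(scanning)

	enum state {
	#define x(t) STATE_##t,
		STATES()
	#undef x
		STATE_NR
	};

	static const char * const state_strs[] = {
	#define x(t) #t,	/* stringize each entry */
		STATES()
	#undef x
	};

	int main(void)
	{
		for (int i = 0; i < STATE_NR; i++)
			printf("%d = %s\n", i, state_strs[i]);
		return 0;
	}
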
ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); - goto out; - case BTREE_ID_backpointers: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret; - goto out; - case BTREE_ID_need_discard: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_freespace: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_bucket_gens: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_lru: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_accounting: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; - goto out; - case BTREE_ID_snapshots: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; - goto out; - default: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; - goto out; - } -out: - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -static void kill_btree(struct bch_fs *c, enum btree_id btree) -{ - bch2_btree_id_root(c, btree)->alive = false; - bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); -} - -/* for -o reconstruct_alloc: */ -void bch2_reconstruct_alloc(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required); - - __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, 
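
bch2_btree_lost_data() above chains passes with `ret = __bch2_run_explicit_recovery_pass(...) ?: ret;` - the GNU C binary `?:` yields its left operand when nonzero, so every pass is still attempted: a later failure overrides an earlier one, and a success preserves any earlier error. A compilable illustration (GCC/Clang extension, stand-in functions):

	#include <stdio.h>

	static int pass_ok(void)   { return 0; }
	static int pass_fail(void) { return -5; /* stand-in errno */ }

	int main(void)
	{
		int ret = 0;

		ret = pass_fail() ?: ret;	/* ret = -5: new failure wins */
		ret = pass_ok()   ?: ret;	/* ret = -5: success keeps old error */
		printf("ret=%d\n", ret);
		return 0;
	}
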
ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - - c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info)); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) - if (btree_id_is_alloc(i)) - kill_btree(c, i); -} - -/* - * Btree node pointers have a field to stack a pointer to the in memory btree - * node; we need to zero out this field when reading in btree nodes, or when - * reading in keys from the journal: - */ -static void zero_out_btree_mem_ptr(struct journal_keys *keys) -{ - darray_for_each(*keys, i) - if (i->k->k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; -} - -/* journal replay: */ - -static void replay_now_at(struct journal *j, u64 seq) -{ - BUG_ON(seq < j->replay_journal_seq); - - seq = min(seq, j->replay_journal_seq_end); - - while (j->replay_journal_seq < seq) - bch2_journal_pin_put(j, j->replay_journal_seq++); -} - -static int bch2_journal_replay_accounting_key(struct btree_trans *trans, - struct journal_key *k) -{ - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, k->level, - BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(trans, &iter); - if (ret) - goto out; - - struct bkey u; - struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); - - /* Has this delta already been applied to the btree? */ - if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) { - ret = 0; - goto out; - } - - struct bkey_i *new = k->k; - if (old.k->type == KEY_TYPE_accounting) { - new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - bch2_accounting_accumulate(bkey_i_to_accounting(new), - bkey_s_c_to_accounting(old)); - } - - trans->journal_res.seq = k->journal_seq; - - ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_journal_replay_key(struct btree_trans *trans, - struct journal_key *k) -{ - struct btree_iter iter; - unsigned iter_flags = - BTREE_ITER_intent| - BTREE_ITER_not_extents; - unsigned update_flags = BTREE_TRIGGER_norun; - int ret; - - if (k->overwritten) - return 0; - - trans->journal_res.seq = k->journal_seq; - - /* - * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to - * keep the key cache coherent with the underlying btree. 
Nothing - * besides the allocator is doing updates yet so we don't need key cache - * coherency for non-alloc btrees, and key cache fills for snapshots - * btrees use BTREE_ITER_filter_snapshots, which isn't available until - * the snapshots recovery pass runs. - */ - if (!k->level && k->btree_id == BTREE_ID_alloc) - iter_flags |= BTREE_ITER_cached; - else - update_flags |= BTREE_UPDATE_key_cache_reclaim; - - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, k->level, - iter_flags); - ret = bch2_btree_iter_traverse(trans, &iter); - if (ret) - goto out; - - struct btree_path *path = btree_iter_path(trans, &iter); - if (unlikely(!btree_path_node(path, k->level))) { - struct bch_fs *c = trans->c; - - CLASS(printbuf, buf)(); - prt_str(&buf, "btree="); - bch2_btree_id_to_text(&buf, k->btree_id); - prt_printf(&buf, " level=%u ", k->level); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k)); - - if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| - BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { - bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s", - buf.buf); - ret = -EINVAL; - } - - if (!k->allocated) { - bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s", - buf.buf); - k->overwritten = true; - goto out; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, 0, iter_flags); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_increase_depth(trans, iter.path, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } - - /* Must be checked with btree locked: */ - if (k->overwritten) - goto out; - - if (k->k->k.type == KEY_TYPE_accounting) { - struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto out; - - bkey_copy(n, k->k); - goto out; - } - - ret = bch2_trans_update(trans, &iter, k->k, update_flags); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int journal_sort_seq_cmp(const void *_l, const void *_r) -{ - const struct journal_key *l = *((const struct journal_key **)_l); - const struct journal_key *r = *((const struct journal_key **)_r); - - /* - * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last - * - * journal_seq == 0 means that the key comes from early repair, and - * should be inserted last so as to avoid overflowing the journal - */ - return cmp_int(l->journal_seq - 1, r->journal_seq - 1); -} - -int bch2_journal_replay(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - DARRAY(struct journal_key *) keys_sorted = { 0 }; - struct journal *j = &c->journal; - u64 start_seq = c->journal_replay_seq_start; - u64 end_seq = c->journal_replay_seq_end; - struct btree_trans *trans = NULL; - bool immediate_flush = false; - int ret = 0; - - if (keys->nr) { - ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", - keys->nr, start_seq, end_seq); - if (ret) - goto err; - } - - BUG_ON(!atomic_read(&keys->ref)); - - move_gap(keys, keys->nr); - trans = bch2_trans_get(c); - - /* - * Replay accounting keys first: we can't allow the write buffer to - * flush accounting keys until we're done - */ - darray_for_each(*keys, k) { - if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated)) - continue; - - cond_resched(); - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| -
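
journal_sort_seq_cmp() above leans on unsigned wraparound: subtracting 1 maps journal_seq == 0 (early-repair keys) to U64_MAX, so those keys compare greater than every real sequence number and sort last. Demonstrated standalone:

	#include <stdint.h>
	#include <stdio.h>

	static int cmp_seq(uint64_t l, uint64_t r)
	{
		/* 0 - 1 wraps to UINT64_MAX, pushing seq 0 to the end */
		uint64_t lk = l - 1, rk = r - 1;

		return lk < rk ? -1 : lk > rk ? 1 : 0;
	}

	int main(void)
	{
		printf("%d\n", cmp_seq(0, 12345)); /* prints 1: seq 0 sorts last */
		printf("%d\n", cmp_seq(7, 12345)); /* prints -1 */
		return 0;
	}
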
BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_skip_accounting_apply| - BCH_TRANS_COMMIT_no_journal_res| - BCH_WATERMARK_reclaim, - bch2_journal_replay_accounting_key(trans, k)); - if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) - goto err; - - k->overwritten = true; - } - - set_bit(BCH_FS_accounting_replay_done, &c->flags); - - /* - * First, attempt to replay keys in sorted order. This is more - * efficient - better locality of btree access - but some might fail if - * that would cause a journal deadlock. - */ - darray_for_each(*keys, k) { - cond_resched(); - - /* - * k->allocated means the key wasn't read in from the journal, - * rather it was from early repair code - */ - if (k->allocated) - immediate_flush = true; - - /* Skip fastpath if we're low on space in the journal */ - ret = c->journal.watermark ? -1 : - commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_skip_accounting_apply| - (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), - bch2_journal_replay_key(trans, k)); - BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting); - if (ret) { - ret = darray_push(&keys_sorted, k); - if (ret) - goto err; - } - } - - bch2_trans_unlock_long(trans); - /* - * Now, replay any remaining keys in the order in which they appear in - * the journal, unpinning those journal entries as we go: - */ - sort_nonatomic(keys_sorted.data, keys_sorted.nr, - sizeof(keys_sorted.data[0]), - journal_sort_seq_cmp, NULL); - - darray_for_each(keys_sorted, kp) { - cond_resched(); - - struct journal_key *k = *kp; - - if (k->journal_seq) - replay_now_at(j, k->journal_seq); - else - replay_now_at(j, j->replay_journal_seq_end); - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_skip_accounting_apply| - (!k->allocated - ? 
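
bch2_journal_replay() above is a two-phase loop: every key is first attempted in sorted order (better btree locality, but some commits may fail rather than risk a journal deadlock), failures are collected, and the leftovers are retried in journal-sequence order. A toy rendition of that structure; try_replay and the key layout are invented for illustration:

	#include <stddef.h>
	#include <stdio.h>

	struct key { int seq; int hard; };

	static int try_replay(struct key *k, int allow_blocking)
	{
		return (k->hard && !allow_blocking) ? -1 : 0;
	}

	int main(void)
	{
		struct key keys[] = { {3, 0}, {1, 1}, {2, 0} };
		struct key *retry[3];
		size_t nr_retry = 0;

		/* phase one: cheap path in sorted order, queue failures */
		for (size_t i = 0; i < 3; i++)
			if (try_replay(&keys[i], 0))
				retry[nr_retry++] = &keys[i];

		/* (a real implementation sorts `retry` by seq here) */

		/* phase two: stragglers in journal order, blocking allowed */
		for (size_t i = 0; i < nr_retry; i++)
			if (try_replay(retry[i], 1))
				return 1;

		printf("%zu keys needed the slow path\n", nr_retry);
		return 0;
	}
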
BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim - : 0), - bch2_journal_replay_key(trans, k)); - if (ret) { - struct printbuf buf = PRINTBUF; - bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); - bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); - printbuf_exit(&buf); - goto err; - } - - BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); - } - - /* - * We need to put our btree_trans before calling flush_all_pins(), since - * that will use a btree_trans internally - */ - bch2_trans_put(trans); - trans = NULL; - - if (!c->opts.retain_recovery_info && - c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) - bch2_journal_keys_put_initial(c); - - replay_now_at(j, j->replay_journal_seq_end); - j->replay_journal_seq = 0; - - bch2_journal_set_replay_done(j); - - /* if we did any repair, flush it immediately */ - if (immediate_flush) { - bch2_journal_flush_all_pins(&c->journal); - ret = bch2_journal_meta(&c->journal); - } - - if (keys->nr) - bch2_journal_log_msg(c, "journal replay finished"); -err: - if (trans) - bch2_trans_put(trans); - darray_exit(&keys_sorted); - bch_err_fn(c, ret); - return ret; -} - -/* journal replay early: */ - -static int journal_replay_entry_early(struct bch_fs *c, - struct jset_entry *entry) -{ - int ret = 0; - - switch (entry->type) { - case BCH_JSET_ENTRY_btree_root: { - - if (unlikely(!entry->u64s)) - return 0; - - if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, - c, invalid_btree_id, - "invalid btree id %u (max %u)", - entry->btree_id, BTREE_ID_NR_MAX)) - return 0; - - while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { - ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); - if (ret) - return ret; - } - - struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - - r->level = entry->level; - bkey_copy(&r->key, (struct bkey_i *) entry->start); - r->error = 0; - r->alive = true; - break; - } - case BCH_JSET_ENTRY_usage: { - struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); - - switch (entry->btree_id) { - case BCH_FS_USAGE_key_version: - atomic64_set(&c->key_version, le64_to_cpu(u->v)); - break; - } - break; - } - case BCH_JSET_ENTRY_blacklist: { - struct jset_entry_blacklist *bl_entry = - container_of(entry, struct jset_entry_blacklist, entry); - - ret = bch2_journal_seq_blacklist_add(c, - le64_to_cpu(bl_entry->seq), - le64_to_cpu(bl_entry->seq) + 1); - break; - } - case BCH_JSET_ENTRY_blacklist_v2: { - struct jset_entry_blacklist_v2 *bl_entry = - container_of(entry, struct jset_entry_blacklist_v2, entry); - - ret = bch2_journal_seq_blacklist_add(c, - le64_to_cpu(bl_entry->start), - le64_to_cpu(bl_entry->end) + 1); - break; - } - case BCH_JSET_ENTRY_clock: { - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); - - atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); - } - } -fsck_err: - return ret; -} - -static int journal_replay_early(struct bch_fs *c, - struct bch_sb_field_clean *clean) -{ - if (clean) { - for (struct jset_entry *entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - int ret = journal_replay_entry_early(c, entry); - if (ret) - return ret; - } - } else { - struct genradix_iter iter; - struct journal_replay *i, **_i; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - vstruct_for_each(&i->j, entry) { - int ret = journal_replay_entry_early(c, entry); - if (ret) - return ret; - } - 
} - } - - return 0; -} - -/* sb clean section: */ - -static int read_btree_roots(struct bch_fs *c) -{ - struct printbuf buf = PRINTBUF; - int ret = 0; - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (!r->alive) - continue; - - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, i, r->level); - - if (mustfix_fsck_err_on((ret = r->error), - c, btree_root_bkey_invalid, - "invalid btree root %s", - buf.buf) || - mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), - c, btree_root_read_error, - "error reading btree root %s: %s", - buf.buf, bch2_err_str(ret))) { - if (btree_id_is_alloc(i)) - r->error = 0; - ret = 0; - } - } - - for (unsigned i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (!r->b && !r->error) { - r->alive = false; - r->level = 0; - bch2_btree_root_alloc_fake(c, i, 0); - } - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static bool check_version_upgrade(struct bch_fs *c) -{ - unsigned latest_version = bcachefs_metadata_version_current; - unsigned latest_compatible = min(latest_version, - bch2_latest_compatible_version(c->sb.version)); - unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; - unsigned new_version = 0; - bool ret = false; - - if (old_version < bcachefs_metadata_required_upgrade_below) { - if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || - latest_compatible < bcachefs_metadata_required_upgrade_below) - new_version = latest_version; - else - new_version = latest_compatible; - } else { - switch (c->opts.version_upgrade) { - case BCH_VERSION_UPGRADE_compatible: - new_version = latest_compatible; - break; - case BCH_VERSION_UPGRADE_incompatible: - new_version = latest_version; - break; - case BCH_VERSION_UPGRADE_none: - new_version = min(old_version, latest_version); - break; - } - } - - if (new_version > old_version) { - struct printbuf buf = PRINTBUF; - - if (old_version < bcachefs_metadata_required_upgrade_below) - prt_str(&buf, "Version upgrade required:\n"); - - if (old_version != c->sb.version) { - prt_str(&buf, "Version upgrade from "); - bch2_version_to_text(&buf, c->sb.version_upgrade_complete); - prt_str(&buf, " to "); - bch2_version_to_text(&buf, c->sb.version); - prt_str(&buf, " incomplete\n"); - } - - prt_printf(&buf, "Doing %s version upgrade from ", - BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) - ? 
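
The new_version selection in check_version_upgrade() above boils down to a small policy function: a forced upgrade when below the required floor, otherwise governed by the version_upgrade option. A sketch with made-up version numbers (the names here are illustrative, not the bcachefs constants):

	#include <stdio.h>

	enum policy { UPGRADE_compatible, UPGRADE_incompatible, UPGRADE_none };

	static unsigned pick_version(enum policy p, unsigned old, unsigned latest,
				     unsigned latest_compat, unsigned required_below)
	{
		/* below the floor: upgrade is mandatory */
		if (old < required_below)
			return (p == UPGRADE_incompatible ||
				latest_compat < required_below) ? latest : latest_compat;

		switch (p) {
		case UPGRADE_compatible:   return latest_compat;
		case UPGRADE_incompatible: return latest;
		default:                   return old < latest ? old : latest;
		}
	}

	int main(void)
	{
		/* old=10 is below the floor of 20: compatible policy picks 37 */
		printf("%u\n", pick_version(UPGRADE_compatible, 10, 42, 37, 20));
		return 0;
	}
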
"incompatible" : "compatible"); - bch2_version_to_text(&buf, old_version); - prt_str(&buf, " to "); - bch2_version_to_text(&buf, new_version); - prt_newline(&buf); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_upgrade(c, old_version, new_version); - passes = ext->recovery_passes_required[0] & ~passes; - - if (passes) { - prt_str(&buf, " running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - ret = true; - } - - if (new_version > c->sb.version_incompat_allowed && - c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Now allowing incompatible features up to "); - bch2_version_to_text(&buf, new_version); - prt_str(&buf, ", previously allowed up to "); - bch2_version_to_text(&buf, c->sb.version_incompat_allowed); - prt_newline(&buf); - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - ret = true; - } - - if (ret) - bch2_sb_upgrade(c, new_version, - c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); - - return ret; -} - -int bch2_fs_recovery(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean = NULL; - struct jset *last_journal_entry = NULL; - u64 last_seq = 0, blacklist_seq, journal_seq; - int ret = 0; - - if (c->sb.clean) { - clean = bch2_read_superblock_clean(c); - ret = PTR_ERR_OR_ZERO(clean); - if (ret) - goto err; - - bch_info(c, "recovering from clean shutdown, journal seq %llu", - le64_to_cpu(clean->journal_seq)); - } else { - bch_info(c, "recovering from unclean shutdown"); - } - - if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { - bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); - ret = -EINVAL; - goto err; - } - - if (!c->sb.clean && - !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { - bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); - ret = -EINVAL; - goto err; - } - - if (c->opts.norecovery) { - c->opts.recovery_pass_last = c->opts.recovery_pass_last - ? 
min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) - : BCH_RECOVERY_PASS_snapshots_read; - c->opts.nochanges = true; - } - - if (c->opts.nochanges) - c->opts.read_only = true; - - if (c->opts.journal_rewind) { - bch_info(c, "rewinding journal, fsck required"); - c->opts.fsck = true; - } - - if (go_rw_in_recovery(c)) { - /* - * start workqueues/kworkers early - kthread creation checks for - * pending signals, which is _very_ annoying - */ - ret = bch2_fs_init_rw(c); - if (ret) - goto err; - } - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - bool write_sb = false; - - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); - write_sb = true; - } - - u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - if (sb_passes) { - struct printbuf buf = PRINTBUF; - prt_str(&buf, "superblock requires following recovery passes to be run:\n "); - prt_bitflags(&buf, bch2_recovery_passes, sb_passes); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (bch2_check_version_downgrade(c)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Version downgrade required:"); - - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_downgrade(c, - BCH_VERSION_MINOR(bcachefs_metadata_version_current), - BCH_VERSION_MINOR(c->sb.version)); - passes = ext->recovery_passes_required[0] & ~passes; - if (passes) { - prt_str(&buf, "\n running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - write_sb = true; - } - - if (check_version_upgrade(c)) - write_sb = true; - - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - - if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { - SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); - write_sb = true; - } - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (c->sb.clean) - set_bit(BCH_FS_clean_recovery, &c->flags); - if (c->opts.fsck) - set_bit(BCH_FS_in_fsck, &c->flags); - set_bit(BCH_FS_in_recovery, &c->flags); - - ret = bch2_blacklist_table_initialize(c); - if (ret) { - bch_err(c, "error initializing blacklist table"); - goto err; - } - - bch2_journal_pos_from_member_info_resume(c); - - if (!c->sb.clean || c->opts.retain_recovery_info) { - struct genradix_iter iter; - struct journal_replay **i; - - bch_verbose(c, "starting journal read"); - ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); - if (ret) - goto err; - - /* - * note: cmd_list_journal needs the blacklist table fully up to date so - * it can asterisk ignored journal entries: - */ - if (c->opts.read_journal_only) - goto out; - - genradix_for_each_reverse(&c->journal_entries, iter, i) - if (!journal_replay_ignore(*i)) { - last_journal_entry = &(*i)->j; - break; - } - - if (mustfix_fsck_err_on(c->sb.clean && - last_journal_entry && - !journal_entry_empty(last_journal_entry), c, - clean_but_journal_not_empty, - "filesystem marked clean but journal not empty")) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - } - - if (!last_journal_entry) { - fsck_err_on(!c->sb.clean, c, - dirty_but_no_journal_entries, - "no journal entries 
found"); - if (clean) - goto use_clean; - - genradix_for_each_reverse(&c->journal_entries, iter, i) - if (*i) { - last_journal_entry = &(*i)->j; - (*i)->ignore_blacklisted = false; - (*i)->ignore_not_dirty= false; - /* - * This was probably a NO_FLUSH entry, - * so last_seq was garbage - but we know - * we're only using a single journal - * entry, set it here: - */ - (*i)->j.last_seq = (*i)->j.seq; - break; - } - } - - ret = bch2_journal_keys_sort(c); - if (ret) - goto err; - - if (c->sb.clean && last_journal_entry) { - ret = bch2_verify_superblock_clean(c, &clean, - last_journal_entry); - if (ret) - goto err; - } - } else { -use_clean: - if (!clean) { - bch_err(c, "no superblock clean section found"); - ret = bch_err_throw(c, fsck_repair_impossible); - goto err; - - } - blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; - } - - c->journal_replay_seq_start = last_seq; - c->journal_replay_seq_end = blacklist_seq - 1; - - zero_out_btree_mem_ptr(&c->journal_keys); - - ret = journal_replay_early(c, clean); - if (ret) - goto err; - - ret = bch2_fs_resize_on_mount(c); - if (ret) { - up_write(&c->state_lock); - goto err; - } - - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_info(c, "filesystem is an unresized image file, mounting ro"); - c->opts.read_only = true; - } - - if (!c->opts.read_only && - (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { - bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); - - bch2_reconstruct_alloc(c); - } else if (c->opts.reconstruct_alloc) { - bch2_journal_log_msg(c, "dropping alloc info"); - bch_info(c, "dropping and reconstructing all alloc info"); - - bch2_reconstruct_alloc(c); - } - - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { - /* We can't go RW to fix errors without alloc info */ - if (c->opts.fix_errors == FSCK_FIX_yes || - c->opts.fix_errors == FSCK_FIX_ask) - c->opts.fix_errors = FSCK_FIX_no; - if (c->opts.errors == BCH_ON_ERROR_fix_safe) - c->opts.errors = BCH_ON_ERROR_continue; - } - - /* - * After an unclean shutdown, skip then next few journal sequence - * numbers as they may have been referenced by btree writes that - * happened before their corresponding journal writes - those btree - * writes need to be ignored, by skipping and blacklisting the next few - * journal sequence numbers: - */ - if (!c->sb.clean) - journal_seq += JOURNAL_BUF_NR * 4; - - if (blacklist_seq != journal_seq) { - ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", - blacklist_seq, journal_seq) ?: - bch2_journal_seq_blacklist_add(c, - blacklist_seq, journal_seq); - if (ret) { - bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); - goto err; - } - } - - ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", - journal_seq, last_seq, blacklist_seq - 1) ?: - bch2_fs_journal_start(&c->journal, last_seq, journal_seq); - if (ret) - goto err; - - /* - * Skip past versions that might have possibly been used (as nonces), - * but hadn't had their pointers written: - */ - if (c->sb.encryption_type && !c->sb.clean) - atomic64_add(1 << 16, &c->key_version); - - ret = read_btree_roots(c); - if (ret) - goto err; - - set_bit(BCH_FS_btree_running, &c->flags); - - ret = bch2_sb_set_upgrade_extra(c); - if (ret) - goto err; - - ret = bch2_run_recovery_passes(c, 0); - if (ret) - goto err; - - /* - * Normally set by the appropriate recovery pass: when cleared, this - * indicates we're in early recovery and btree updates should be done by - * being applied 
to the journal replay keys. _Must_ be cleared before - * multithreaded use: - */ - set_bit(BCH_FS_may_go_rw, &c->flags); - clear_bit(BCH_FS_in_fsck, &c->flags); - - /* in case we don't run journal replay, i.e. norecovery mode */ - set_bit(BCH_FS_accounting_replay_done, &c->flags); - - bch2_async_btree_node_rewrites_flush(c); - - /* fsync if we fixed errors */ - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_meta(&c->journal); - } - - /* If we fixed errors, verify that fs is actually clean now: */ - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - test_bit(BCH_FS_errors_fixed, &c->flags) && - !test_bit(BCH_FS_errors_not_fixed, &c->flags) && - !test_bit(BCH_FS_error, &c->flags)) { - bch2_flush_fsck_errs(c); - - bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); - clear_bit(BCH_FS_errors_fixed, &c->flags); - - ret = bch2_run_recovery_passes(c, - BCH_RECOVERY_PASS_check_alloc_info); - if (ret) - goto err; - - if (test_bit(BCH_FS_errors_fixed, &c->flags) || - test_bit(BCH_FS_errors_not_fixed, &c->flags)) { - bch_err(c, "Second fsck run was not clean"); - set_bit(BCH_FS_errors_not_fixed, &c->flags); - } - - set_bit(BCH_FS_errors_fixed, &c->flags); - } - - if (enabled_qtypes(c)) { - bch_verbose(c, "reading quotas"); - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - bch_verbose(c, "quotas done"); - } - - mutex_lock(&c->sb_lock); - ext = bch2_sb_field_get(c->disk_sb.sb, ext); - write_sb = false; - - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); - write_sb = true; - } - - if (!test_bit(BCH_FS_error, &c->flags) && - !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) { - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); - write_sb = true; - } - - if (!test_bit(BCH_FS_error, &c->flags) && - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { - memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); - write_sb = true; - } - - if (c->opts.fsck && - !test_bit(BCH_FS_error, &c->flags) && - c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 && - ext->btrees_lost_data) { - ext->btrees_lost_data = 0; - write_sb = true; - } - - if (c->opts.fsck && - !test_bit(BCH_FS_error, &c->flags) && - !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { - SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); - SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); - write_sb = true; - } - - if (bch2_blacklist_entries_gc(c)) - write_sb = true; - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { - struct bch_move_stats stats; - - bch2_move_stats_init(&stats, "recovery"); - - struct printbuf buf = PRINTBUF; - bch2_version_to_text(&buf, c->sb.version_min); - bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); - printbuf_exit(&buf); - - ret = bch2_fs_read_write_early(c) ?: - bch2_scan_old_btree_nodes(c, &stats); - if (ret) - goto err; - bch_info(c, "scanning for old btree nodes done"); - } - - ret = 0; -out: - bch2_flush_fsck_errs(c); - - if (!ret && - test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && - !c->opts.nochanges) { - bch2_fs_read_write_early(c); - bch2_delete_dead_snapshots_async(c); - } - - bch_err_fn(c, ret); -final_out: - if (!IS_ERR(clean)) - 
kfree(clean); - return ret; -err: -fsck_err: - { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret)); - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - goto final_out; -} - -int bch2_fs_initialize(struct bch_fs *c) -{ - struct bch_inode_unpacked root_inode, lostfound_inode; - struct bkey_inode_buf packed_inode; - struct qstr lostfound = QSTR("lost+found"); - struct bch_member *m; - int ret; - - bch_notice(c, "initializing new filesystem"); - set_bit(BCH_FS_new_fs, &c->flags); - - mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - - bch2_check_version_downgrade(c); - - if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { - bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - bch2_write_super(c); - } - - for_each_member_device(c, ca) { - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); - ca->mi = bch2_mi_to_cpu(m); - } - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - set_bit(BCH_FS_btree_running, &c->flags); - set_bit(BCH_FS_may_go_rw, &c->flags); - - for (unsigned i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc_fake(c, i, 0); - - ret = bch2_fs_journal_alloc(c); - if (ret) - goto err; - - /* - * journal_res_get() will crash if called before this has - * set up the journal.pin FIFO and journal.cur pointer: - */ - ret = bch2_fs_journal_start(&c->journal, 1, 1); - if (ret) - goto err; - - ret = bch2_fs_read_write_early(c); - if (ret) - goto err; - - set_bit(BCH_FS_accounting_replay_done, &c->flags); - bch2_journal_set_replay_done(&c->journal); - - for_each_member_device(c, ca) { - ret = bch2_dev_usage_init(ca, false); - if (ret) { - bch2_dev_put(ca); - goto err; - } - } - - /* - * Write out the superblock and journal buckets, now that we can do - * btree updates - */ - bch_verbose(c, "marking superblocks"); - ret = bch2_trans_mark_dev_sbs(c); - bch_err_msg(c, ret, "marking superblocks"); - if (ret) - goto err; - - ret = bch2_fs_freespace_init(c); - if (ret) - goto err; - - ret = bch2_initialize_subvolumes(c); - if (ret) - goto err; - - bch_verbose(c, "reading snapshots table"); - ret = bch2_snapshots_read(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - - bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); - root_inode.bi_inum = BCACHEFS_ROOT_INO; - root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - bch2_inode_pack(&packed_inode, &root_inode); - packed_inode.inode.k.p.snapshot = U32_MAX; - - ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "creating root directory"); - if (ret) - goto err; - - bch2_inode_init_early(c, &lostfound_inode); - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_create_trans(trans, - BCACHEFS_ROOT_SUBVOL_INUM, - &root_inode, &lostfound_inode, - &lostfound, - 0, 0, S_IFDIR|0700, 0, - NULL, NULL, (subvol_inum) { 0 }, 0)); - bch_err_msg(c, ret, "creating lost+found"); - if (ret) - goto err; - - c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1; - - bch2_copygc_wakeup(c); - bch2_rebalance_wakeup(c); - - if (enabled_qtypes(c)) { - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - } - - ret = 
bch2_journal_flush(&c->journal); - bch_err_msg(c, ret, "writing first journal entry"); - if (ret) - goto err; - - mutex_lock(&c->sb_lock); - SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - c->recovery.curr_pass = BCH_RECOVERY_PASS_NR; - return 0; -err: - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h deleted file mode 100644 index c023f52fc2d6dc..00000000000000 --- a/fs/bcachefs/recovery.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_H -#define _BCACHEFS_RECOVERY_H - -int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id); -void bch2_reconstruct_alloc(struct bch_fs *); - -int bch2_journal_replay(struct bch_fs *); - -int bch2_fs_recovery(struct bch_fs *); -int bch2_fs_initialize(struct bch_fs *); - -#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c deleted file mode 100644 index 6a039e0110643f..00000000000000 --- a/fs/bcachefs/recovery_passes.c +++ /dev/null @@ -1,646 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "backpointers.h" -#include "btree_gc.h" -#include "btree_node_scan.h" -#include "disk_accounting.h" -#include "ec.h" -#include "fsck.h" -#include "inode.h" -#include "journal.h" -#include "lru.h" -#include "logged_ops.h" -#include "movinggc.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "snapshot.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" - -const char * const bch2_recovery_passes[] = { -#define x(_fn, ...) #_fn, - BCH_RECOVERY_PASSES() -#undef x - NULL -}; - -static const u8 passes_to_stable_map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x -}; - -static const u8 passes_from_stable_map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x -}; - -static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) -{ - return passes_to_stable_map[pass]; -} - -u64 bch2_recovery_passes_to_stable(u64 v) -{ - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(passes_to_stable_map[i]); - return ret; -} - -static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) -{ - return pass < ARRAY_SIZE(passes_from_stable_map) - ? 
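
passes_to_stable_map and passes_from_stable_map above translate between in-memory pass numbering and the stable on-disk numbering; the u64 helpers just push a bitmask through those byte tables bit by bit. A standalone sketch of that remapping (the table contents here are invented):

	#include <stdint.h>
	#include <stdio.h>

	static const uint8_t to_stable[] = { 0, 3, 1, 2 };

	static uint64_t remap_mask(uint64_t v)
	{
		uint64_t ret = 0;

		/* move each set bit to its remapped position */
		for (unsigned i = 0; i < sizeof(to_stable); i++)
			if (v & (1ULL << i))
				ret |= 1ULL << to_stable[i];
		return ret;
	}

	int main(void)
	{
		/* bits {1,2} become stable bits {3,1}: 0x6 -> 0xa */
		printf("%#llx\n", (unsigned long long)remap_mask(0x6));
		return 0;
	}
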
passes_from_stable_map[pass] - : 0; -} - -u64 bch2_recovery_passes_from_stable(u64 v) -{ - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(passes_from_stable_map[i]); - return ret; -} - -static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - return 0; -} - -static void bch2_sb_recovery_passes_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_recovery_passes *r = - field_to_type(f, recovery_passes); - unsigned nr = recovery_passes_nr_entries(r); - - if (out->nr_tabstops < 1) - printbuf_tabstop_push(out, 32); - if (out->nr_tabstops < 2) - printbuf_tabstop_push(out, 16); - - prt_printf(out, "Pass\tLast run\tLast runtime\n"); - - for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { - if (!i->last_run) - continue; - - unsigned idx = i - r->start; - - prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); - - bch2_prt_datetime(out, le64_to_cpu(i->last_run)); - prt_tab(out); - - bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); - - if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) - prt_str(out, " (no ratelimit)"); - - prt_newline(out); - } -} - -static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); - - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_recovery_passes *r = - bch2_sb_field_get(c->disk_sb.sb, recovery_passes); - - if (stable >= recovery_passes_nr_entries(r)) { - unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); - - r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); - if (!r) { - bch_err(c, "error creating recovery_passes sb section"); - return NULL; - } - } - - return r->start + stable; -} - -static void bch2_sb_recovery_pass_complete(struct bch_fs *c, - enum bch_recovery_pass pass, - s64 start_time) -{ - guard(mutex)(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __clear_bit_le64(bch2_recovery_pass_to_stable(pass), - ext->recovery_passes_required); - - struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); - if (e) { - s64 end_time = ktime_get_real_seconds(); - e->last_run = cpu_to_le64(end_time); - e->last_runtime = cpu_to_le32(max(0, end_time - start_time)); - SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); - } - - bch2_write_super(c); -} - -void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - guard(mutex)(&c->sb_lock); - - struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); - if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) { - SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true); - bch2_write_super(c); - } -} - -static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) -{ - enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); - bool ret = false; - - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_recovery_passes *r = - bch2_sb_field_get(c->disk_sb.sb, recovery_passes); - - if (stable < recovery_passes_nr_entries(r)) { - struct recovery_pass_entry *i = r->start + stable; - - /* - * Ratelimit if the last runtime was more than 1% of the time - * since we last ran - */ - ret = (u64) le32_to_cpu(i->last_runtime) * 100 > - ktime_get_real_seconds() - le64_to_cpu(i->last_run); - -
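
The ratelimit test just above fires when the pass's last runtime exceeded 1% of the wall-clock time elapsed since it last ran, so a pass that took 60 seconds stays ratelimited until at least 6000 seconds have gone by. A worked standalone version of that check:

	#include <stdint.h>
	#include <stdio.h>

	static int want_ratelimit(uint64_t last_runtime_s, uint64_t now_s,
				  uint64_t last_run_s)
	{
		/* ratelimit when runtime * 100 > seconds since last run */
		return last_runtime_s * 100 > now_s - last_run_s;
	}

	int main(void)
	{
		/* 60s pass, last run an hour ago: 6000 > 3600, ratelimit */
		printf("%d\n", want_ratelimit(60, 10000, 10000 - 3600));
		/* 60s pass, last run two hours ago: 6000 <= 7200, run it */
		printf("%d\n", want_ratelimit(60, 10000, 10000 - 7200));
		return 0;
	}
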
if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) - ret = false; - } - - return ret; -} - -const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { - .validate = bch2_sb_recovery_passes_validate, - .to_text = bch2_sb_recovery_passes_to_text -}; - -/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ -static int bch2_recovery_pass_empty(struct bch_fs *c) -{ - return 0; -} - -static int bch2_set_may_go_rw(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - /* - * After we go RW, the journal keys buffer can't be modified (except for - * setting journal_key->overwritten: it will be accessed by multiple - * threads - */ - move_gap(keys, keys->nr); - - set_bit(BCH_FS_may_go_rw, &c->flags); - - if (go_rw_in_recovery(c)) { - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { - bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); - bch2_reconstruct_alloc(c); - } - - return bch2_fs_read_write_early(c); - } - return 0; -} - -/* - * Make sure root inode is readable while we're still in recovery and can rewind - * for repair: - */ -static int bch2_lookup_root_inode(struct bch_fs *c) -{ - subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; - struct bch_inode_unpacked inode_u; - struct bch_subvolume subvol; - - return bch2_trans_do(c, - bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); -} - -struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - unsigned when; -}; - -static struct recovery_pass_fn recovery_pass_fns[] = { -#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - -static u64 bch2_recovery_passes_match(unsigned flags) -{ - u64 ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & flags) - ret |= BIT_ULL(i); - return ret; -} - -u64 bch2_fsck_recovery_passes(void) -{ - return bch2_recovery_passes_match(PASS_FSCK); -} - -static void bch2_run_async_recovery_passes(struct bch_fs *c) -{ - if (!down_trylock(&c->recovery.run_lock)) - return; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes)) - goto unlock; - - if (queue_work(system_long_wq, &c->recovery.work)) - return; - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); -unlock: - up(&c->recovery.run_lock); -} - -static bool recovery_pass_needs_set(struct bch_fs *c, - enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags *flags) -{ - struct bch_fs_recovery *r = &c->recovery; - - /* - * Never run scan_for_btree_nodes persistently: check_topology will run - * it if required - */ - if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) - *flags |= RUN_RECOVERY_PASS_nopersistent; - - if ((*flags & RUN_RECOVERY_PASS_ratelimit) && - !bch2_recovery_pass_want_ratelimit(c, pass)) - *flags &= ~RUN_RECOVERY_PASS_ratelimit; - - /* - * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do - * anything if the pass has already run: these mean we need a prior pass - * to run before we continue to repair, we don't expect that pass to fix - * the damage we encountered. - * - * Otherwise, we run run_explicit_recovery_pass when we find damage, so - * it should run again even if it's already run: - */ - bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); - bool rewind = in_recovery && - r->curr_pass > pass && - !(r->passes_complete & BIT_ULL(pass)); - - if (persistent - ? 
!(c->sb.recovery_passes_required & BIT_ULL(pass)) - : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) - return true; - - if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && - (r->passes_ratelimiting & BIT_ULL(pass))) - return true; - - if (rewind) - return true; - - return false; -} - -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -int __bch2_run_explicit_recovery_pass(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags flags) -{ - struct bch_fs_recovery *r = &c->recovery; - int ret = 0; - - lockdep_assert_held(&c->sb_lock); - - bch2_printbuf_make_room(out, 1024); - out->atomic++; - - unsigned long lockflags; - spin_lock_irqsave(&r->lock, lockflags); - - if (!recovery_pass_needs_set(c, pass, &flags)) - goto out; - - bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool rewind = in_recovery && - r->curr_pass > pass && - !(r->passes_complete & BIT_ULL(pass)); - bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; - - if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); - } - - if (pass < BCH_RECOVERY_PASS_set_may_go_rw && - (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { - prt_printf(out, "need recovery pass %s (%u), but already rw\n", - bch2_recovery_passes[pass], pass); - ret = bch_err_throw(c, cannot_rewind_recovery); - goto out; - } - - if (ratelimit) - r->passes_ratelimiting |= BIT_ULL(pass); - else - r->passes_ratelimiting &= ~BIT_ULL(pass); - - if (in_recovery && !ratelimit) { - prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[r->curr_pass], r->curr_pass, - rewind ? " - rewinding" : ""); - - r->passes_to_run |= BIT_ULL(pass); - - if (rewind) { - r->next_pass = pass; - r->passes_complete &= (1ULL << pass) >> 1; - ret = bch_err_throw(c, restart_recovery); - } - } else { - prt_printf(out, "scheduling recovery pass %s (%u)%s\n", - bch2_recovery_passes[pass], pass, - ratelimit ? 
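
__bch2_run_recovery_passes() below drains passes_to_run lowest-bit-first with __ffs64. The same loop in plain C, with __builtin_ctzll standing in for __ffs64:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t to_run = (1ULL << 5) | (1ULL << 2) | (1ULL << 9);

		while (to_run) {
			/* index of the lowest set bit */
			unsigned pass = __builtin_ctzll(to_run);

			to_run &= ~(1ULL << pass);
			printf("running pass %u\n", pass); /* 2, then 5, then 9 */
		}
		return 0;
	}
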
" - ratelimiting" : ""); - - struct recovery_pass_fn *p = recovery_pass_fns + pass; - if (p->when & PASS_ONLINE) - bch2_run_async_recovery_passes(c); - } -out: - spin_unlock_irqrestore(&r->lock, lockflags); - --out->atomic; - return ret; -} - -int bch2_run_explicit_recovery_pass(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags flags) -{ - int ret = 0; - - if (recovery_pass_needs_set(c, pass, &flags)) { - guard(mutex)(&c->sb_lock); - ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); - bch2_write_super(c); - } - - return ret; -} - -/* - * Returns 0 if @pass has run recently, otherwise one of - * -BCH_ERR_restart_recovery - * -BCH_ERR_recovery_pass_will_run - */ -int bch2_require_recovery_pass(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass) -{ - if (test_bit(BCH_FS_in_recovery, &c->flags) && - c->recovery.passes_complete & BIT_ULL(pass)) - return 0; - - guard(mutex)(&c->sb_lock); - - if (bch2_recovery_pass_want_ratelimit(c, pass)) - return 0; - - enum bch_run_recovery_pass_flags flags = 0; - int ret = 0; - - if (recovery_pass_needs_set(c, pass, &flags)) { - ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); - bch2_write_super(c); - } - - return ret ?: bch_err_throw(c, recovery_pass_will_run); -} - -int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - enum bch_run_recovery_pass_flags flags = 0; - - if (!recovery_pass_needs_set(c, pass, &flags)) - return 0; - - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - mutex_lock(&c->sb_lock); - int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, - RUN_RECOVERY_PASS_nopersistent); - mutex_unlock(&c->sb_lock); - - bch2_print_str(c, KERN_NOTICE, buf.buf); - printbuf_exit(&buf); - return ret; -} - -static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct bch_fs_recovery *r = &c->recovery; - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - - s64 start_time = ktime_get_real_seconds(); - int ret = p->fn(c); - - r->passes_to_run &= ~BIT_ULL(pass); - - if (ret) { - r->passes_failing |= BIT_ULL(pass); - return ret; - } - - r->passes_failing = 0; - - if (!test_bit(BCH_FS_error, &c->flags)) - bch2_sb_recovery_pass_complete(c, pass, start_time); - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_CONT " done\n"); - - return 0; -} - -static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, - bool online) -{ - struct bch_fs_recovery *r = &c->recovery; - int ret = 0; - - spin_lock_irq(&r->lock); - - if (online) - orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); - - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) - orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); - - /* - * A failed recovery pass will be retried after another pass succeeds - - * but not this iteration. - * - * This is because some passes depend on repair done by other passes: we - * may want to retry, but we don't want to loop on failing passes. 
- */ - - orig_passes_to_run &= ~r->passes_failing; - - r->passes_to_run = orig_passes_to_run; - - while (r->passes_to_run) { - unsigned prev_done = r->pass_done; - unsigned pass = __ffs64(r->passes_to_run); - r->curr_pass = pass; - r->next_pass = r->curr_pass + 1; - r->passes_to_run &= ~BIT_ULL(pass); - - spin_unlock_irq(&r->lock); - - int ret2 = bch2_run_recovery_pass(c, pass) ?: - bch2_journal_flush(&c->journal); - - spin_lock_irq(&r->lock); - - if (r->next_pass < r->curr_pass) { - /* Rewind: */ - r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); - } else if (!ret2) { - r->pass_done = max(r->pass_done, pass); - r->passes_complete |= BIT_ULL(pass); - } else { - ret = ret2; - } - - if (ret && !online) - break; - - if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && - r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { - bch2_copygc_wakeup(c); - bch2_rebalance_wakeup(c); - } - } - - clear_bit(BCH_FS_in_recovery, &c->flags); - spin_unlock_irq(&r->lock); - - return ret; -} - -static void bch2_async_recovery_passes_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, recovery.work); - struct bch_fs_recovery *r = &c->recovery; - - __bch2_run_recovery_passes(c, - c->sb.recovery_passes_required & ~r->passes_ratelimiting, - true); - - up(&r->run_lock); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); -} - -int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) -{ - return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); -} - -int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) -{ - u64 passes = - bch2_recovery_passes_match(PASS_ALWAYS) | - (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | - (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | - c->opts.recovery_passes | - c->sb.recovery_passes_required; - - if (c->opts.recovery_pass_last) - passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1; - - /* - * We can't allow set_may_go_rw to be excluded; that would cause us to - * use the journal replay keys for updates where it's not expected. 
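- *
- * Concretely (inferred from bch2_set_may_go_rw() earlier in this
- * file, not stated here): that pass is what calls move_gap() on the
- * journal keys and sets BCH_FS_may_go_rw, so excluding it would let
- * updates keep using the journal replay keys where that is no longer
- * expected.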
- */ - c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - passes &= ~c->opts.recovery_passes_exclude; - - passes &= ~(BIT_ULL(from) - 1); - - down(&c->recovery.run_lock); - int ret = __bch2_run_recovery_passes(c, passes, false); - up(&c->recovery.run_lock); - - return ret; -} - -static void prt_passes(struct printbuf *out, const char *msg, u64 passes) -{ - prt_printf(out, "%s:\t", msg); - prt_bitflags(out, bch2_recovery_passes, passes); - prt_newline(out); -} - -void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_fs_recovery *r = &c->recovery; - - printbuf_tabstop_push(out, 32); - prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); - prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & - bch2_recovery_passes_match(PASS_ONLINE)); - prt_passes(out, "Complete passes", r->passes_complete); - prt_passes(out, "Failing passes", r->passes_failing); - - if (r->curr_pass) { - prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); - prt_passes(out, "Current passes", r->passes_to_run); - } -} - -void bch2_fs_recovery_passes_init(struct bch_fs *c) -{ - spin_lock_init(&c->recovery.lock); - sema_init(&c->recovery.run_lock, 1); - - INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work); -} diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h deleted file mode 100644 index 2117f0ce19229a..00000000000000 --- a/fs/bcachefs/recovery_passes.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef _BCACHEFS_RECOVERY_PASSES_H -#define _BCACHEFS_RECOVERY_PASSES_H - -extern const char * const bch2_recovery_passes[]; - -extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes; - -u64 bch2_recovery_passes_to_stable(u64 v); -u64 bch2_recovery_passes_from_stable(u64 v); - -u64 bch2_fsck_recovery_passes(void); - -void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass); - -enum bch_run_recovery_pass_flags { - RUN_RECOVERY_PASS_nopersistent = BIT(0), - RUN_RECOVERY_PASS_ratelimit = BIT(1), -}; - -static inline bool go_rw_in_recovery(struct bch_fs *c) -{ - return (c->journal_keys.nr || - !c->opts.read_only || - !c->sb.clean || - c->opts.recovery_passes || - (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))); -} - -int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); - -int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass, - enum bch_run_recovery_pass_flags); -int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass, - enum bch_run_recovery_pass_flags); - -int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass); - -int bch2_run_online_recovery_passes(struct bch_fs *, u64); -int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); - -void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_recovery_passes_init(struct bch_fs *); - -#endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h deleted file mode 100644 index b63c20558d3d42..00000000000000 --- a/fs/bcachefs/recovery_passes_format.h +++ /dev/null @@ -1,106 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H -#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H - -#define PASS_SILENT BIT(0) -#define PASS_FSCK BIT(1) -#define PASS_UNCLEAN BIT(2) -#define PASS_ALWAYS BIT(3) 
-#define PASS_ONLINE BIT(4) -#define PASS_ALLOC BIT(5) -#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC) - -#ifdef CONFIG_BCACHEFS_DEBUG -#define PASS_FSCK_DEBUG BIT(1) -#else -#define PASS_FSCK_DEBUG 0 -#endif - -/* - * Passes may be reordered, but the second field is a persistent identifier and - * must never change: - */ -#define BCH_RECOVERY_PASSES() \ - x(recovery_pass_empty, 41, PASS_SILENT) \ - x(scan_for_btree_nodes, 37, 0) \ - x(check_topology, 4, 0) \ - x(accounting_read, 39, PASS_ALWAYS) \ - x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, 0) \ - x(initialize_subvolumes, 2, 0) \ - x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_allocations, 5, PASS_FSCK_ALLOC) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ - x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 17, 0) \ - x(reconstruct_snapshots, 38, 0) \ - x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ - x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ - x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ - x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 22, 0) \ - x(check_inodes, 24, PASS_FSCK) \ - x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ - x(check_dirents, 27, PASS_FSCK) \ - x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ - x(check_unreachable_inodes, 40, PASS_FSCK) \ - x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ - x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ - x(check_nlinks, 31, PASS_FSCK) \ - x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(delete_dead_inodes, 32, PASS_ALWAYS) \ - x(fix_reflink_p, 33, 0) \ - x(set_fs_needs_rebalance, 34, 0) \ - x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT) - -/* We normally enumerate recovery passes in the order we run them: */ -enum bch_recovery_pass { -#define x(n, id, when) BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - BCH_RECOVERY_PASS_NR -}; - -/* But we also need stable identifiers that can be used in the superblock */ -enum bch_recovery_pass_stable { -#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, - BCH_RECOVERY_PASSES() -#undef x -}; - -struct recovery_pass_entry { - __le64 last_run; - __le32 last_runtime; - __le32 flags; -}; - -LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1) - -struct bch_sb_field_recovery_passes { - struct bch_sb_field field; - struct recovery_pass_entry start[]; -}; - -static inline unsigned -recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r) -{ - return r - ? 
((vstruct_end(&r->field) - (void *) &r->start[0]) / - sizeof(struct recovery_pass_entry)) - : 0; -} - -#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */ diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h deleted file mode 100644 index aa9526938cc35d..00000000000000 --- a/fs/bcachefs/recovery_passes_types.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H -#define _BCACHEFS_RECOVERY_PASSES_TYPES_H - -struct bch_fs_recovery { - /* - * Two different uses: - * "Has this fsck pass?" - i.e. should this type of error be an - * emergency read-only - * And, in certain situations fsck will rewind to an earlier pass: used - * for signaling to the toplevel code which pass we want to run now. - */ - enum bch_recovery_pass curr_pass; - enum bch_recovery_pass next_pass; - /* never rewinds version of curr_pass */ - enum bch_recovery_pass pass_done; - u64 passes_to_run; - /* bitmask of recovery passes that we actually ran */ - u64 passes_complete; - u64 passes_failing; - u64 passes_ratelimiting; - spinlock_t lock; - struct semaphore run_lock; - struct work_struct work; -}; - -#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c deleted file mode 100644 index 92b90cfe622b96..00000000000000 --- a/fs/bcachefs/reflink.c +++ /dev/null @@ -1,865 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "inode.h" -#include "io_misc.h" -#include "io_write.h" -#include "rebalance.h" -#include "reflink.h" -#include "subvolume.h" -#include "super-io.h" - -#include - -static inline bool bkey_extent_is_reflink_data(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_reflink_v: - case KEY_TYPE_indirect_inline_data: - return true; - default: - return false; - } -} - -static inline unsigned bkey_type_to_indirect(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_extent: - return KEY_TYPE_reflink_v; - case KEY_TYPE_inline_data: - return KEY_TYPE_indirect_inline_data; - default: - return 0; - } -} - -/* reflink pointers */ - -int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - int ret = 0; - - bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad), - c, reflink_p_front_pad_bad, - "idx < front_pad (%llu < %u)", - REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad)); -fsck_err: - return ret; -} - -void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - - prt_printf(out, "idx %llu front_pad %u back_pad %u", - REFLINK_P_IDX(p.v), - le32_to_cpu(p.v->front_pad), - le32_to_cpu(p.v->back_pad)); - - if (REFLINK_P_ERROR(p.v)) - prt_str(out, " error"); -} - -bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); - struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); - - /* - * Disabled for now, the triggers code needs to be reworked for merging - * of reflink pointers to work: - */ - return false; - - if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) - return false; - - if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v)) - return false; - - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; 
-} - -/* indirect extents */ - -int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), - c, reflink_v_pos_bad, - "indirect extent above maximum position 0:%llu", - REFLINK_P_IDX_MAX); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); - - bch2_bkey_ptrs_to_text(out, c, k); -} - -#if 0 -Currently disabled, needs to be debugged: - -bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); - - return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); -} -#endif - -/* indirect inline data */ - -int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -void bch2_indirect_inline_data_to_text(struct printbuf *out, - struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); - unsigned datalen = bkey_inline_data_bytes(k.k); - - prt_printf(out, "refcount %llu datalen %u: %*phN", - le64_to_cpu(d.v->refcount), datalen, - min(datalen, 32U), d.v->data); -} - -/* lookup */ - -static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p, - bool should_commit) -{ - struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - SET_REFLINK_P_ERROR(&new->v, false); - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); - if (ret) - return ret; - - if (!should_commit) - return 0; - - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; -} - -static int bch2_indirect_extent_missing_error(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 missing_start, u64 missing_end, - bool should_commit) -{ - if (REFLINK_P_ERROR(p.v)) - return 0; - - struct bch_fs *c = trans->c; - u64 live_start = REFLINK_P_IDX(p.v); - u64 live_end = REFLINK_P_IDX(p.v) + p.k->size; - u64 refd_start = live_start - le32_to_cpu(p.v->front_pad); - u64 refd_end = live_end + le32_to_cpu(p.v->back_pad); - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(missing_start < refd_start); - BUG_ON(missing_end > refd_end); - - struct bpos missing_pos = bkey_start_pos(p.k); - missing_pos.offset += missing_start - live_start; - - prt_printf(&buf, "pointer to missing indirect extent in "); - ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); - if (ret) - goto err; - - prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9); - bch2_bkey_val_to_text(&buf, c, p.s_c); - - prt_printf(&buf, "\nmissing reflink btree range %llu-%llu", - missing_start, missing_end); - - if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) { - struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - /* - * Is the missing range not actually needed? 
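- *
- * A sketch of the ranges involved (illustrative, using the variables
- * computed at the top of this function):
- *
- *   refd_start      live_start            live_end         refd_end
- *       |--front_pad--|------live data-------|---back_pad-----|
- *
- * A hole entirely inside front_pad or back_pad can be dropped by
- * shrinking that pad; only a hole overlapping the live range needs
- * REFLINK_P_ERROR, as the branches below show.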
- * - * p.v->idx refers to the data that we actually want, but if the - * indirect extent we point to was bigger, front_pad and back_pad - * indicate the range we took a reference on. - */ - - if (missing_end <= live_start) { - new->v.front_pad = cpu_to_le32(live_start - missing_end); - } else if (missing_start >= live_end) { - new->v.back_pad = cpu_to_le32(missing_start - live_end); - } else { - struct bpos new_start = bkey_start_pos(&new->k); - struct bpos new_end = new->k.p; - - if (missing_start > live_start) - new_start.offset += missing_start - live_start; - if (missing_end < live_end) - new_end.offset -= live_end - missing_end; - - bch2_cut_front(new_start, &new->k_i); - bch2_cut_back(new_end, &new->k_i); - - SET_REFLINK_P_ERROR(&new->v, true); - } - - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); - if (ret) - goto err; - - if (should_commit) - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * This is used from the read path, which doesn't expect to have to do a - * transaction commit, and from triggers, which should not be doing a commit: - */ -struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, - struct btree_iter *iter, - s64 *offset_into_extent, - struct bkey_s_c_reflink_p p, - bool should_commit, - unsigned iter_flags) -{ - BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad))); - BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad)); - - u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent; - - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink, - POS(0, reflink_offset), iter_flags); - if (bkey_err(k)) - return k; - - if (unlikely(!bkey_extent_is_reflink_data(k.k))) { - u64 missing_end = min(k.k->p.offset, - REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad)); - BUG_ON(reflink_offset == missing_end); - - int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, - missing_end, should_commit); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return bkey_s_c_err(ret); - } - } else if (unlikely(REFLINK_P_ERROR(p.v))) { - int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return bkey_s_c_err(ret); - } - } - - *offset_into_extent = reflink_offset - bkey_start_offset(k.k); - return k; -} - -/* reflink pointer trigger */ - -static int trans_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, u64 *idx, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v); - struct btree_iter iter; - struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false, - BTREE_ITER_intent| - BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_refcount_c(k)) { - if (!(flags & BTREE_TRIGGER_overwrite)) - ret = bch_err_throw(c, missing_indirect_extent); - goto next; - } - - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); - if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - log_fsck_err(trans, reflink_refcount_underflow, - "indirect extent 
refcount underflow while marking\n%s", - buf.buf); - goto next; - } - - if (flags & BTREE_TRIGGER_insert) { - struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - u64 pad; - - pad = max_t(s64, le32_to_cpu(v->front_pad), - REFLINK_P_IDX(v) - bkey_start_offset(&new->k)); - BUG_ON(pad > U32_MAX); - v->front_pad = cpu_to_le32(pad); - - pad = max_t(s64, le32_to_cpu(v->back_pad), - new->k.p.offset - p.k->size - REFLINK_P_IDX(v)); - BUG_ON(pad > U32_MAX); - v->back_pad = cpu_to_le32(pad); - } - - le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1); - - bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, new, 0); - if (ret) - goto err; -next: - *idx = k.k->p.offset; -err: -fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -} - -static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, u64 *idx, - enum btree_iter_update_trigger_flags flags, - size_t r_idx) -{ - struct bch_fs *c = trans->c; - struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; - u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); - s64 ret = 0; - struct printbuf buf = PRINTBUF; - - if (r_idx >= c->reflink_gc_nr) - goto not_found; - - r = genradix_ptr(&c->reflink_gc_table, r_idx); - next_idx = min(next_idx, r->offset - r->size); - if (*idx < next_idx) - goto not_found; - - BUG_ON((s64) r->refcount + add < 0); - - if (flags & BTREE_TRIGGER_gc) - r->refcount += add; - *idx = r->offset; - return 0; -not_found: - if (flags & BTREE_TRIGGER_check_repair) { - ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); - if (ret) - goto err; - } - - *idx = next_idx; -err: - printbuf_exit(&buf); - return ret; -} - -static int __trigger_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - int ret = 0; - - u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); - u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); - - if (flags & BTREE_TRIGGER_transactional) { - while (idx < end && !ret) - ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags); - } - - if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) { - size_t l = 0, r = c->reflink_gc_nr; - - while (l < r) { - size_t m = l + (r - l) / 2; - struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m); - if (ref->offset <= idx) - l = m + 1; - else - r = m; - } - - while (idx < end && !ret) - ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++); - } - - return ret; -} - -int bch2_trigger_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - if ((flags & BTREE_TRIGGER_transactional) && - (flags & BTREE_TRIGGER_insert)) { - struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v; - - v->front_pad = v->back_pad = 0; - } - - return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); -} - -/* indirect extent trigger */ - -static inline void -check_indirect_extent_deleting(struct bkey_s new, - enum btree_iter_update_trigger_flags *flags) -{ - if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) { - new.k->type = KEY_TYPE_deleted; - new.k->size = 0; - set_bkey_val_u64s(new.k, 0); - *flags &= 
~BTREE_TRIGGER_insert; - } -} - -int bch2_trigger_reflink_v(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - if ((flags & BTREE_TRIGGER_transactional) && - (flags & BTREE_TRIGGER_insert)) - check_indirect_extent_deleting(new, &flags); - - return bch2_trigger_extent(trans, btree_id, level, old, new, flags); -} - -int bch2_trigger_indirect_inline_data(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - check_indirect_extent_deleting(new, &flags); - - return 0; -} - -/* create */ - -static int bch2_make_extent_indirect(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *orig, - bool reflink_p_may_update_opts_field) -{ - struct bch_fs *c = trans->c; - struct btree_iter reflink_iter = {}; - struct bkey_s_c k; - struct bkey_i *r_v; - struct bkey_i_reflink_p *r_p; - __le64 *refcount; - int ret; - - if (orig->k.type == KEY_TYPE_inline_data) - bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); - - bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, - BTREE_ITER_intent); - k = bch2_btree_iter_peek_prev(trans, &reflink_iter); - ret = bkey_err(k); - if (ret) - goto err; - - /* - * XXX: we're assuming that 56 bits will be enough for the life of the - * filesystem: we need to implement wraparound, with a cursor in the - * logged ops btree: - */ - if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) - return -ENOSPC; - - r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); - ret = PTR_ERR_OR_ZERO(r_v); - if (ret) - goto err; - - bkey_init(&r_v->k); - r_v->k.type = bkey_type_to_indirect(&orig->k); - r_v->k.p = reflink_iter.pos; - bch2_key_resize(&r_v->k, orig->k.size); - r_v->k.bversion = orig->k.bversion; - - set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); - - refcount = bkey_refcount(bkey_i_to_s(r_v)); - *refcount = 0; - memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); - - ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); - if (ret) - goto err; - - /* - * orig is in a bkey_buf which statically allocates 5 64s for the val, - * so we know it will be big enough: - */ - orig->k.type = KEY_TYPE_reflink_p; - r_p = bkey_i_to_reflink_p(orig); - set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); - - /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */ -#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) - __underlying_memset(&r_p->v, 0, sizeof(r_p->v)); -#else - memset(&r_p->v, 0, sizeof(r_p->v)); -#endif - - SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); - - if (reflink_p_may_update_opts_field) - SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true); - - ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, - BTREE_UPDATE_internal_snapshot_node); -err: - bch2_trans_iter_exit(trans, &reflink_iter); - - return ret; -} - -static struct bkey_s_c get_next_src(struct btree_trans *trans, - struct btree_iter *iter, struct bpos end) -{ - struct bkey_s_c k; - int ret; - - for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) { - if (bkey_extent_is_unwritten(k)) - continue; - - if (bkey_extent_is_data(k.k)) - return k; - } - - if (bkey_ge(iter->pos, end)) - bch2_btree_iter_set_pos(trans, iter, end); - return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; -} - -s64 bch2_remap_range(struct bch_fs *c, - subvol_inum dst_inum, u64 dst_offset, - subvol_inum src_inum, u64 src_offset, - u64 remap_sectors, - u64 new_i_size, s64 *i_sectors_delta, - bool may_change_src_io_path_opts) -{ - struct btree_trans *trans; - struct btree_iter dst_iter, src_iter; - struct bkey_s_c src_k; - struct bkey_buf new_dst, new_src; - struct bpos dst_start = POS(dst_inum.inum, dst_offset); - struct bpos src_start = POS(src_inum.inum, src_offset); - struct bpos dst_end = dst_start, src_end = src_start; - struct bch_io_opts opts; - struct bpos src_want; - u64 dst_done = 0; - u32 dst_snapshot, src_snapshot; - bool reflink_p_may_update_opts_field = - !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); - int ret = 0, ret2 = 0; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink)) - return bch_err_throw(c, erofs_no_writes); - - bch2_check_set_feature(c, BCH_FEATURE_reflink); - - dst_end.offset += remap_sectors; - src_end.offset += remap_sectors; - - bch2_bkey_buf_init(&new_dst); - bch2_bkey_buf_init(&new_src); - trans = bch2_trans_get(c); - - ret = bch2_inum_opts_get(trans, src_inum, &opts); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, - BTREE_ITER_intent); - bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, - BTREE_ITER_intent); - - while ((ret == 0 || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) && - bkey_lt(dst_iter.pos, dst_end)) { - struct disk_reservation disk_res = { 0 }; - - bch2_trans_begin(trans); - - if (fatal_signal_pending(current)) { - ret = -EINTR; - break; - } - - ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol, - &src_snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot); - - ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, - &dst_snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot); - - if (dst_inum.inum < src_inum.inum) { - /* Avoid some lock cycle transaction restarts */ - ret = bch2_btree_iter_traverse(trans, &dst_iter); - if (ret) - continue; - } - - dst_done = dst_iter.pos.offset - dst_start.offset; - src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(trans, &src_iter, src_want); - - src_k = get_next_src(trans, &src_iter, src_end); - ret = bkey_err(src_k); - if (ret) - continue; - - if (bkey_lt(src_want, src_iter.pos)) { - ret = bch2_fpunch_at(trans, &dst_iter, dst_inum, - min(dst_end.offset, - dst_iter.pos.offset + - src_iter.pos.offset - src_want.offset), - i_sectors_delta); - continue; - } - - if (src_k.k->type != KEY_TYPE_reflink_p) { - bch2_btree_iter_set_pos_to_extent_start(&src_iter); - - bch2_bkey_buf_reassemble(&new_src, c, src_k); - src_k = bkey_i_to_s_c(new_src.k); - - ret = bch2_make_extent_indirect(trans, &src_iter, - new_src.k, - reflink_p_may_update_opts_field); - if (ret) - continue; - - BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); - } - - if (src_k.k->type == KEY_TYPE_reflink_p) { - struct bkey_s_c_reflink_p src_p = - bkey_s_c_to_reflink_p(src_k); - struct bkey_i_reflink_p *dst_p = - bkey_reflink_p_init(new_dst.k); - - u64 offset = REFLINK_P_IDX(src_p.v) + - (src_want.offset - - bkey_start_offset(src_k.k)); - - SET_REFLINK_P_IDX(&dst_p->v, offset); - - if (reflink_p_may_update_opts_field && - may_change_src_io_path_opts && - REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v)) - SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); - } else { - BUG(); 
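- /*
- * Unreachable: src_k was either a reflink_p to begin with or
- * was just converted by bch2_make_extent_indirect() above.
- */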
- } - - new_dst.k->k.p = dst_iter.pos; - bch2_key_resize(&new_dst.k->k, - min(src_k.k->p.offset - src_want.offset, - dst_end.offset - dst_iter.pos.offset)); - - ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: - bch2_extent_update(trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, - new_i_size, i_sectors_delta, - true); - bch2_disk_reservation_put(c, &disk_res); - } - bch2_trans_iter_exit(trans, &dst_iter); - bch2_trans_iter_exit(trans, &src_iter); - - BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); - BUG_ON(bkey_gt(dst_iter.pos, dst_end)); - - dst_done = dst_iter.pos.offset - dst_start.offset; - new_i_size = min(dst_iter.pos.offset << 9, new_i_size); - - do { - struct bch_inode_unpacked inode_u; - struct btree_iter inode_iter = {}; - - bch2_trans_begin(trans); - - ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u, - dst_inum, BTREE_ITER_intent); - - if (!ret2 && - inode_u.bi_size < new_i_size) { - inode_u.bi_size = new_i_size; - ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - } - - bch2_trans_iter_exit(trans, &inode_iter); - } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&new_src, c); - bch2_bkey_buf_exit(&new_dst, c); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink); - - return dst_done ?: ret ?: ret2; -} - -/* fsck */ - -static int bch2_gc_write_reflink_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - size_t *idx) -{ - struct bch_fs *c = trans->c; - const __le64 *refcount = bkey_refcount_c(k); - struct printbuf buf = PRINTBUF; - struct reflink_gc *r; - int ret = 0; - - if (!refcount) - return 0; - - while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && - r->offset < k.k->p.offset) - ++*idx; - - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; - } - - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), - trans, reflink_v_refcount_wrong, - "reflink key has wrong refcount:\n" - "%s\n" - "should be %u", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf), - r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - if (!r->refcount) - new->k.type = KEY_TYPE_deleted; - else - *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); - ret = bch2_trans_update(trans, iter, new, 0); - } -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_gc_reflink_done(struct bch_fs *c) -{ - size_t idx = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_reflink_key(trans, &iter, k, &idx))); - c->reflink_gc_nr = 0; - return ret; -} - -int bch2_gc_reflink_start(struct bch_fs *c) -{ - c->reflink_gc_nr = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, ({ - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - continue; - - struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, - c->reflink_gc_nr++, GFP_KERNEL); - if (!r) { - ret = bch_err_throw(c, ENOMEM_gc_reflink_start); - break; - } - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - 0; - }))); - - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h deleted 
file mode 100644 index 1632780bdf181f..00000000000000 --- a/fs/bcachefs/reflink.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REFLINK_H -#define _BCACHEFS_REFLINK_H - -int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ - .key_validate = bch2_reflink_p_validate, \ - .val_to_text = bch2_reflink_p_to_text, \ - .key_merge = bch2_reflink_p_merge, \ - .trigger = bch2_trigger_reflink_p, \ - .min_val_size = 16, \ -}) - -int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ - .key_validate = bch2_reflink_v_validate, \ - .val_to_text = bch2_reflink_v_to_text, \ - .swab = bch2_ptr_swab, \ - .trigger = bch2_trigger_reflink_v, \ - .min_val_size = 8, \ -}) - -int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_indirect_inline_data_to_text(struct printbuf *, - struct bch_fs *, struct bkey_s_c); -int bch2_trigger_indirect_inline_data(struct btree_trans *, - enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ - .key_validate = bch2_indirect_inline_data_validate, \ - .val_to_text = bch2_indirect_inline_data_to_text, \ - .trigger = bch2_trigger_indirect_inline_data, \ - .min_val_size = 8, \ -}) - -static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_reflink_v: - return &bkey_s_c_to_reflink_v(k).v->refcount; - case KEY_TYPE_indirect_inline_data: - return &bkey_s_c_to_indirect_inline_data(k).v->refcount; - default: - return NULL; - } -} - -static inline __le64 *bkey_refcount(struct bkey_s k) -{ - switch (k.k->type) { - case KEY_TYPE_reflink_v: - return &bkey_s_to_reflink_v(k).v->refcount; - case KEY_TYPE_indirect_inline_data: - return &bkey_s_to_indirect_inline_data(k).v->refcount; - default: - return NULL; - } -} - -struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *, - s64 *, struct bkey_s_c_reflink_p, - bool, unsigned); - -s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, - subvol_inum, u64, u64, u64, s64 *, - bool); - -int bch2_gc_reflink_done(struct bch_fs *); -int bch2_gc_reflink_start(struct bch_fs *); - -#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h deleted file mode 100644 index 92995e4f898e27..00000000000000 --- a/fs/bcachefs/reflink_format.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REFLINK_FORMAT_H -#define _BCACHEFS_REFLINK_FORMAT_H - -struct bch_reflink_p { - struct bch_val v; - __le64 idx_flags; - /* - * A reflink pointer might point to an indirect extent which is then - * later split (by copygc or rebalance). 
If we only pointed to part of - * the original indirect extent, and then one of the fragments is - * outside the range we point to, we'd leak a refcount: so when creating - * reflink pointers, we need to store pad values to remember the full - * range we were taking a reference on. - */ - __le32 front_pad; - __le32 back_pad; -} __packed __aligned(8); - -LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); -LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); -LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS, - struct bch_reflink_p, idx_flags, 57, 58); - -struct bch_reflink_v { - struct bch_val v; - __le64 refcount; - union bch_extent_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -struct bch_indirect_inline_data { - struct bch_val v; - __le64 refcount; - u8 data[]; -}; - -#endif /* _BCACHEFS_REFLINK_FORMAT_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c deleted file mode 100644 index 8383bd7fdb3fee..00000000000000 --- a/fs/bcachefs/replicas.c +++ /dev/null @@ -1,918 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "buckets.h" -#include "disk_accounting.h" -#include "journal.h" -#include "replicas.h" -#include "super-io.h" - -#include - -static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, - struct bch_replicas_cpu *); - -/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, const void *priv) -{ - size_t size = (size_t) priv; - return memcmp(l, r, size); -} - -/* Replicas tracking - in memory: */ - -static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(!e->nr_devs); - BUG_ON(e->nr_required > 1 && - e->nr_required >= e->nr_devs); - - for (unsigned i = 0; i + 1 < e->nr_devs; i++) - BUG_ON(e->devs[i] >= e->devs[i + 1]); -#endif -} - -void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) -{ - bubble_sort(e->devs, e->nr_devs, u8_cmp); -} - -static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) -{ - eytzinger0_sort_r(r->entries, r->nr, r->entry_size, - bch2_memcmp, NULL, (void *)(size_t)r->entry_size); -} - -static void bch2_replicas_entry_v0_to_text(struct printbuf *out, - struct bch_replicas_entry_v0 *e) -{ - bch2_prt_data_type(out, e->data_type); - - prt_printf(out, ": %u [", e->nr_devs); - for (unsigned i = 0; i < e->nr_devs; i++) - prt_printf(out, i ? " %u" : "%u", e->devs[i]); - prt_printf(out, "]"); -} - -void bch2_replicas_entry_to_text(struct printbuf *out, - struct bch_replicas_entry_v1 *e) -{ - bch2_prt_data_type(out, e->data_type); - - prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); - for (unsigned i = 0; i < e->nr_devs; i++) - prt_printf(out, i ? 
" %u" : "%u", e->devs[i]); - prt_printf(out, "]"); -} - -static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, - struct bch_sb *sb, - struct printbuf *err) -{ - if (!r->nr_devs) { - prt_printf(err, "no devices in entry "); - goto bad; - } - - if (r->nr_required > 1 && - r->nr_required >= r->nr_devs) { - prt_printf(err, "bad nr_required in entry "); - goto bad; - } - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] != BCH_SB_MEMBER_INVALID && - !bch2_member_exists(sb, r->devs[i])) { - prt_printf(err, "invalid device %u in entry ", r->devs[i]); - goto bad; - } - - return 0; -bad: - bch2_replicas_entry_to_text(err, r); - return -BCH_ERR_invalid_replicas_entry; -} - -int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, - struct bch_fs *c, - struct printbuf *err) -{ - if (!r->nr_devs) { - prt_printf(err, "no devices in entry "); - goto bad; - } - - if (r->nr_required > 1 && - r->nr_required >= r->nr_devs) { - prt_printf(err, "bad nr_required in entry "); - goto bad; - } - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] != BCH_SB_MEMBER_INVALID && - !bch2_dev_exists(c, r->devs[i])) { - prt_printf(err, "invalid device %u in entry ", r->devs[i]); - goto bad; - } - - return 0; -bad: - bch2_replicas_entry_to_text(err, r); - return bch_err_throw(c, invalid_replicas_entry); -} - -void bch2_cpu_replicas_to_text(struct printbuf *out, - struct bch_replicas_cpu *r) -{ - struct bch_replicas_entry_v1 *e; - bool first = true; - - for_each_cpu_replicas_entry(r, e) { - if (!first) - prt_printf(out, " "); - first = false; - - bch2_replicas_entry_to_text(out, e); - } -} - -static void extent_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry_v1 *r) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - r->nr_required = 1; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.cached) - continue; - - if (!p.has_ec) - replicas_entry_add_dev(r, p.ptr.dev); - else - r->nr_required = 0; - } -} - -static void stripe_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry_v1 *r) -{ - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - const struct bch_extent_ptr *ptr; - - r->nr_required = s.v->nr_blocks - s.v->nr_redundant; - - for (ptr = s.v->ptrs; - ptr < s.v->ptrs + s.v->nr_blocks; - ptr++) - replicas_entry_add_dev(r, ptr->dev); -} - -void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, - struct bkey_s_c k) -{ - e->nr_devs = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - e->data_type = BCH_DATA_btree; - extent_to_replicas(k, e); - break; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - e->data_type = BCH_DATA_user; - extent_to_replicas(k, e); - break; - case KEY_TYPE_stripe: - e->data_type = BCH_DATA_parity; - stripe_to_replicas(k, e); - break; - } - - bch2_replicas_entry_sort(e); -} - -void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, - enum bch_data_type data_type, - struct bch_devs_list devs) -{ - BUG_ON(!data_type || - data_type == BCH_DATA_sb || - data_type >= BCH_DATA_NR); - - e->data_type = data_type; - e->nr_devs = 0; - e->nr_required = 1; - - darray_for_each(devs, i) - replicas_entry_add_dev(e, *i); - - bch2_replicas_entry_sort(e); -} - -static struct bch_replicas_cpu -cpu_replicas_add_entry(struct bch_fs *c, - struct bch_replicas_cpu *old, - struct bch_replicas_entry_v1 *new_entry) -{ - struct bch_replicas_cpu new = { - .nr = old->nr + 1, - .entry_size = max_t(unsigned, old->entry_size, - 
replicas_entry_bytes(new_entry)), - }; - - new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); - if (!new.entries) - return new; - - for (unsigned i = 0; i < old->nr; i++) - memcpy(cpu_replicas_entry(&new, i), - cpu_replicas_entry(old, i), - old->entry_size); - - memcpy(cpu_replicas_entry(&new, old->nr), - new_entry, - replicas_entry_bytes(new_entry)); - - bch2_cpu_replicas_sort(&new); - return new; -} - -static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) -{ - int idx, entry_size = replicas_entry_bytes(search); - - if (unlikely(entry_size > r->entry_size)) - return -1; - -#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) - idx = eytzinger0_find(r->entries, r->nr, r->entry_size, - entry_cmp, search); -#undef entry_cmp - - return idx < r->nr ? idx : -1; -} - -int bch2_replicas_entry_idx(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - bch2_replicas_entry_sort(search); - - return __replicas_entry_idx(&c->replicas, search); -} - -static bool __replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) -{ - return __replicas_entry_idx(r, search) >= 0; -} - -bool bch2_replicas_marked_locked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - verify_replicas_entry(search); - - return !search->nr_devs || - (__replicas_has_entry(&c->replicas, search) && - (likely((!c->replicas_gc.entries)) || - __replicas_has_entry(&c->replicas_gc, search))); -} - -bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - percpu_down_read(&c->mark_lock); - bool ret = bch2_replicas_marked_locked(c, search); - percpu_up_read(&c->mark_lock); - - return ret; -} - -noinline -static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_entry_v1 *new_entry) -{ - struct bch_replicas_cpu new_r, new_gc; - int ret = 0; - - verify_replicas_entry(new_entry); - - memset(&new_r, 0, sizeof(new_r)); - memset(&new_gc, 0, sizeof(new_gc)); - - mutex_lock(&c->sb_lock); - - if (c->replicas_gc.entries && - !__replicas_has_entry(&c->replicas_gc, new_entry)) { - new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); - if (!new_gc.entries) { - ret = bch_err_throw(c, ENOMEM_cpu_replicas); - goto err; - } - } - - if (!__replicas_has_entry(&c->replicas, new_entry)) { - new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); - if (!new_r.entries) { - ret = bch_err_throw(c, ENOMEM_cpu_replicas); - goto err; - } - - ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); - if (ret) - goto err; - } - - if (!new_r.entries && - !new_gc.entries) - goto out; - - /* allocations done, now commit: */ - - if (new_r.entries) - bch2_write_super(c); - - /* don't update in memory replicas until changes are persistent */ - percpu_down_write(&c->mark_lock); - if (new_r.entries) - swap(c->replicas, new_r); - if (new_gc.entries) - swap(new_gc, c->replicas_gc); - percpu_up_write(&c->mark_lock); -out: - mutex_unlock(&c->sb_lock); - - kfree(new_r.entries); - kfree(new_gc.entries); - - return ret; -err: - bch_err_msg(c, ret, "adding replicas entry"); - goto out; -} - -int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) -{ - return likely(bch2_replicas_marked(c, r)) - ? 
0 : bch2_mark_replicas_slowpath(c, r); -} - -/* - * Old replicas_gc mechanism: only used for journal replicas entries now, should - * die at some point: - */ - -int bch2_replicas_gc_end(struct bch_fs *c, int ret) -{ - lockdep_assert_held(&c->replicas_gc_lock); - - mutex_lock(&c->sb_lock); - percpu_down_write(&c->mark_lock); - - ret = ret ?: - bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); - if (!ret) - swap(c->replicas, c->replicas_gc); - - kfree(c->replicas_gc.entries); - c->replicas_gc.entries = NULL; - - percpu_up_write(&c->mark_lock); - - if (!ret) - bch2_write_super(c); - - mutex_unlock(&c->sb_lock); - - return ret; -} - -int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) -{ - struct bch_replicas_entry_v1 *e; - unsigned i = 0; - - lockdep_assert_held(&c->replicas_gc_lock); - - mutex_lock(&c->sb_lock); - BUG_ON(c->replicas_gc.entries); - - c->replicas_gc.nr = 0; - c->replicas_gc.entry_size = 0; - - for_each_cpu_replicas_entry(&c->replicas, e) { - /* Preserve unknown data types */ - if (e->data_type >= BCH_DATA_NR || - !((1 << e->data_type) & typemask)) { - c->replicas_gc.nr++; - c->replicas_gc.entry_size = - max_t(unsigned, c->replicas_gc.entry_size, - replicas_entry_bytes(e)); - } - } - - c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, - c->replicas_gc.entry_size, - GFP_KERNEL); - if (!c->replicas_gc.entries) { - mutex_unlock(&c->sb_lock); - bch_err(c, "error allocating c->replicas_gc"); - return bch_err_throw(c, ENOMEM_replicas_gc); - } - - for_each_cpu_replicas_entry(&c->replicas, e) - if (e->data_type >= BCH_DATA_NR || - !((1 << e->data_type) & typemask)) - memcpy(cpu_replicas_entry(&c->replicas_gc, i++), - e, c->replicas_gc.entry_size); - - bch2_cpu_replicas_sort(&c->replicas_gc); - mutex_unlock(&c->sb_lock); - - return 0; -} - -/* - * New much simpler mechanism for clearing out unneeded replicas entries - drop - * replicas entries that have 0 sectors used. - * - * However, we don't track sector counts for journal usage, so this doesn't drop - * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism - * is retained for that. 
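- *
- * In pseudo-code, the keep/drop rule the loop below implements (a
- * sketch, not a literal quote):
- *
- *	for each entry e in c->replicas:
- *		keep e if e->data_type == BCH_DATA_journal
- *			|| the in-memory accounting table still has
- *			   a matching BCH_DISK_ACCOUNTING_replicas key
- *		otherwise drop e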
- */ -int bch2_replicas_gc2(struct bch_fs *c) -{ - struct bch_replicas_cpu new = { 0 }; - unsigned nr; - int ret = 0; - - bch2_accounting_mem_gc(c); -retry: - nr = READ_ONCE(c->replicas.nr); - new.entry_size = READ_ONCE(c->replicas.entry_size); - new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); - if (!new.entries) { - bch_err(c, "error allocating c->replicas_gc"); - return bch_err_throw(c, ENOMEM_replicas_gc); - } - - mutex_lock(&c->sb_lock); - percpu_down_write(&c->mark_lock); - - if (nr != c->replicas.nr || - new.entry_size != c->replicas.entry_size) { - percpu_up_write(&c->mark_lock); - mutex_unlock(&c->sb_lock); - kfree(new.entries); - goto retry; - } - - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - struct disk_accounting_pos k = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; - - unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), - "embedded variable length struct"); - - struct bpos p = disk_accounting_pos_to_bpos(&k); - - struct bch_accounting_mem *acc = &c->accounting; - bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &p) >= acc->k.nr; - - if (e->data_type == BCH_DATA_journal || !kill) - memcpy(cpu_replicas_entry(&new, new.nr++), - e, new.entry_size); - } - - bch2_cpu_replicas_sort(&new); - - ret = bch2_cpu_replicas_to_sb_replicas(c, &new); - - if (!ret) - swap(c->replicas, new); - - kfree(new.entries); - - percpu_up_write(&c->mark_lock); - - if (!ret) - bch2_write_super(c); - - mutex_unlock(&c->sb_lock); - - return ret; -} - -/* Replicas tracking - superblock: */ - -static int -__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, - struct bch_replicas_cpu *cpu_r) -{ - struct bch_replicas_entry_v1 *e, *dst; - unsigned nr = 0, entry_size = 0, idx = 0; - - for_each_replicas_entry(sb_r, e) { - entry_size = max_t(unsigned, entry_size, - replicas_entry_bytes(e)); - nr++; - } - - cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); - if (!cpu_r->entries) - return -BCH_ERR_ENOMEM_cpu_replicas; - - cpu_r->nr = nr; - cpu_r->entry_size = entry_size; - - for_each_replicas_entry(sb_r, e) { - dst = cpu_replicas_entry(cpu_r, idx++); - memcpy(dst, e, replicas_entry_bytes(e)); - bch2_replicas_entry_sort(dst); - } - - return 0; -} - -static int -__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, - struct bch_replicas_cpu *cpu_r) -{ - struct bch_replicas_entry_v0 *e; - unsigned nr = 0, entry_size = 0, idx = 0; - - for_each_replicas_entry(sb_r, e) { - entry_size = max_t(unsigned, entry_size, - replicas_entry_bytes(e)); - nr++; - } - - entry_size += sizeof(struct bch_replicas_entry_v1) - - sizeof(struct bch_replicas_entry_v0); - - cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); - if (!cpu_r->entries) - return -BCH_ERR_ENOMEM_cpu_replicas; - - cpu_r->nr = nr; - cpu_r->entry_size = entry_size; - - for_each_replicas_entry(sb_r, e) { - struct bch_replicas_entry_v1 *dst = - cpu_replicas_entry(cpu_r, idx++); - - dst->data_type = e->data_type; - dst->nr_devs = e->nr_devs; - dst->nr_required = 1; - memcpy(dst->devs, e->devs, e->nr_devs); - bch2_replicas_entry_sort(dst); - } - - return 0; -} - -int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) -{ - struct bch_sb_field_replicas *sb_v1; - struct bch_sb_field_replicas_v0 *sb_v0; - struct bch_replicas_cpu new_r = { 0, 0, NULL }; - int ret = 0; - - if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas))) - ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); - else if 
((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0))) - ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); - if (ret) - return ret; - - bch2_cpu_replicas_sort(&new_r); - - percpu_down_write(&c->mark_lock); - swap(c->replicas, new_r); - percpu_up_write(&c->mark_lock); - - kfree(new_r.entries); - - return 0; -} - -static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, - struct bch_replicas_cpu *r) -{ - struct bch_sb_field_replicas_v0 *sb_r; - struct bch_replicas_entry_v0 *dst; - struct bch_replicas_entry_v1 *src; - size_t bytes; - - bytes = sizeof(struct bch_sb_field_replicas); - - for_each_cpu_replicas_entry(r, src) - bytes += replicas_entry_bytes(src) - 1; - - sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, - DIV_ROUND_UP(bytes, sizeof(u64))); - if (!sb_r) - return bch_err_throw(c, ENOSPC_sb_replicas); - - bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); - sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); - - memset(&sb_r->entries, 0, - vstruct_end(&sb_r->field) - - (void *) &sb_r->entries); - - dst = sb_r->entries; - for_each_cpu_replicas_entry(r, src) { - dst->data_type = src->data_type; - dst->nr_devs = src->nr_devs; - memcpy(dst->devs, src->devs, src->nr_devs); - - dst = replicas_entry_next(dst); - - BUG_ON((void *) dst > vstruct_end(&sb_r->field)); - } - - return 0; -} - -static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, - struct bch_replicas_cpu *r) -{ - struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry_v1 *dst, *src; - bool need_v1 = false; - size_t bytes; - - bytes = sizeof(struct bch_sb_field_replicas); - - for_each_cpu_replicas_entry(r, src) { - bytes += replicas_entry_bytes(src); - if (src->nr_required != 1) - need_v1 = true; - } - - if (!need_v1) - return bch2_cpu_replicas_to_sb_replicas_v0(c, r); - - sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, - DIV_ROUND_UP(bytes, sizeof(u64))); - if (!sb_r) - return bch_err_throw(c, ENOSPC_sb_replicas); - - bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); - sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); - - memset(&sb_r->entries, 0, - vstruct_end(&sb_r->field) - - (void *) &sb_r->entries); - - dst = sb_r->entries; - for_each_cpu_replicas_entry(r, src) { - memcpy(dst, src, replicas_entry_bytes(src)); - - dst = replicas_entry_next(dst); - - BUG_ON((void *) dst > vstruct_end(&sb_r->field)); - } - - return 0; -} - -static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, - struct bch_sb *sb, - struct printbuf *err) -{ - unsigned i; - - sort_r(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL, - (void *)(size_t)cpu_r->entry_size); - - for (i = 0; i < cpu_r->nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(cpu_r, i); - - int ret = bch2_replicas_entry_sb_validate(e, sb, err); - if (ret) - return ret; - - if (i + 1 < cpu_r->nr) { - struct bch_replicas_entry_v1 *n = - cpu_replicas_entry(cpu_r, i + 1); - - BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); - - if (!memcmp(e, n, cpu_r->entry_size)) { - prt_printf(err, "duplicate replicas entry "); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } - } - } - - return 0; -} - -static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); - struct bch_replicas_cpu cpu_r; - int ret; - - ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r); - if (ret) - return ret; - - ret = 
bch2_cpu_replicas_validate(&cpu_r, sb, err); - kfree(cpu_r.entries); - return ret; -} - -static void bch2_sb_replicas_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_replicas *r = field_to_type(f, replicas); - struct bch_replicas_entry_v1 *e; - bool first = true; - - for_each_replicas_entry(r, e) { - if (!first) - prt_printf(out, " "); - first = false; - - bch2_replicas_entry_to_text(out, e); - } - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_replicas = { - .validate = bch2_sb_replicas_validate, - .to_text = bch2_sb_replicas_to_text, -}; - -static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_replicas_cpu cpu_r; - int ret; - - ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r); - if (ret) - return ret; - - ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); - kfree(cpu_r.entries); - return ret; -} - -static void bch2_sb_replicas_v0_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_replicas_entry_v0 *e; - bool first = true; - - for_each_replicas_entry(sb_r, e) { - if (!first) - prt_printf(out, " "); - first = false; - - bch2_replicas_entry_v0_to_text(out, e); - } - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { - .validate = bch2_sb_replicas_v0_validate, - .to_text = bch2_sb_replicas_v0_to_text, -}; - -/* Query replicas: */ - -bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, - unsigned flags, bool print) -{ - struct bch_replicas_entry_v1 *e; - bool ret = true; - - percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) { - unsigned nr_online = 0, nr_failed = 0, dflags = 0; - bool metadata = e->data_type < BCH_DATA_user; - - if (e->data_type == BCH_DATA_cached) - continue; - - scoped_guard(rcu) - for (unsigned i = 0; i < e->nr_devs; i++) { - if (e->devs[i] == BCH_SB_MEMBER_INVALID) { - nr_failed++; - continue; - } - - nr_online += test_bit(e->devs[i], devs.d); - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); - nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; - } - - if (nr_online + nr_failed == e->nr_devs) - continue; - - if (nr_online < e->nr_required) - dflags |= metadata - ? BCH_FORCE_IF_METADATA_LOST - : BCH_FORCE_IF_DATA_LOST; - - if (nr_online < e->nr_devs) - dflags |= metadata - ? 
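
[Aside: the loop body of bch2_have_enough_devs() reduces to two thresholds per replicas entry: fewer online devices than nr_required means data would be lost, fewer than nr_devs means it is merely degraded, and the caller may proceed only if its flags cover every degradation actually present. A condensed sketch of that decision, with hypothetical flag names mirroring the BCH_FORCE_IF_* ones:]

	enum {
		FORCE_IF_LOST		= 1 << 0,
		FORCE_IF_DEGRADED	= 1 << 1,
	};

	/*
	 * Flags the caller must have passed for the filesystem to proceed
	 * with only @nr_online of an entry's @nr_devs devices usable.
	 */
	static unsigned degradation_flags(unsigned nr_online, unsigned nr_devs,
					  unsigned nr_required)
	{
		unsigned dflags = 0;

		if (nr_online < nr_required)
			dflags |= FORCE_IF_LOST;	/* below the replication floor */
		if (nr_online < nr_devs)
			dflags |= FORCE_IF_DEGRADED;	/* some replicas unavailable */

		return dflags;
	}

	/* The caller proceeds only if (dflags & ~allowed_flags) == 0. */
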
BCH_FORCE_IF_METADATA_DEGRADED - : BCH_FORCE_IF_DATA_DEGRADED; - - if (dflags & ~flags) { - if (print) { - struct printbuf buf = PRINTBUF; - - bch2_replicas_entry_to_text(&buf, e); - bch_err(c, "insufficient devices online (%u) for replicas entry %s", - nr_online, buf.buf); - printbuf_exit(&buf); - } - ret = false; - break; - } - - } - percpu_up_read(&c->mark_lock); - - return ret; -} - -unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) -{ - struct bch_sb_field_replicas *replicas; - struct bch_sb_field_replicas_v0 *replicas_v0; - unsigned data_has = 0; - - replicas = bch2_sb_field_get(sb, replicas); - replicas_v0 = bch2_sb_field_get(sb, replicas_v0); - - if (replicas) { - struct bch_replicas_entry_v1 *r; - - for_each_replicas_entry(replicas, r) { - if (r->data_type >= sizeof(data_has) * 8) - continue; - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] == dev) - data_has |= 1 << r->data_type; - } - - } else if (replicas_v0) { - struct bch_replicas_entry_v0 *r; - - for_each_replicas_entry_v0(replicas_v0, r) { - if (r->data_type >= sizeof(data_has) * 8) - continue; - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] == dev) - data_has |= 1 << r->data_type; - } - } - - - return data_has; -} - -unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) -{ - mutex_lock(&c->sb_lock); - unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); - mutex_unlock(&c->sb_lock); - - return ret; -} - -void bch2_fs_replicas_exit(struct bch_fs *c) -{ - kfree(c->replicas.entries); - kfree(c->replicas_gc.entries); -} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h deleted file mode 100644 index 5aba2c1ce1331a..00000000000000 --- a/fs/bcachefs/replicas.h +++ /dev/null @@ -1,83 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REPLICAS_H -#define _BCACHEFS_REPLICAS_H - -#include "bkey.h" -#include "eytzinger.h" -#include "replicas_types.h" - -void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); -void bch2_replicas_entry_to_text(struct printbuf *, - struct bch_replicas_entry_v1 *); -int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, - struct bch_fs *, struct printbuf *); -void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); - -static inline struct bch_replicas_entry_v1 * -cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -{ - return (void *) r->entries + r->entry_size * i; -} - -int bch2_replicas_entry_idx(struct bch_fs *, - struct bch_replicas_entry_v1 *); - -void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, - enum bch_data_type, - struct bch_devs_list); - -bool bch2_replicas_marked_locked(struct bch_fs *, - struct bch_replicas_entry_v1 *); -bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *); -int bch2_mark_replicas(struct bch_fs *, - struct bch_replicas_entry_v1 *); - -void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c); - -static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, - unsigned dev) -{ - e->data_type = BCH_DATA_cached; - e->nr_devs = 1; - e->nr_required = 1; - e->devs[0] = dev; -} - -bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, - unsigned, bool); - -unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); -unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); - -int bch2_replicas_gc_end(struct bch_fs *, int); -int bch2_replicas_gc_start(struct bch_fs *, unsigned); -int bch2_replicas_gc2(struct bch_fs *); - -#define for_each_cpu_replicas_entry(_r, _i) \ 
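
[Aside: bch2_sb_dev_has_data() above folds every data type that references a device into one bitmask, skipping any type too large for the mask. The idiom in isolation, with hypothetical flat arrays standing in for the variable-length superblock entries:]

	#include <stddef.h>

	/* Bitmask of which data types reference device @dev. */
	static unsigned device_data_types(const unsigned char *types,
					  const unsigned char *devs,
					  size_t nr, unsigned dev)
	{
		unsigned mask = 0;

		for (size_t i = 0; i < nr; i++) {
			if (types[i] >= sizeof(mask) * 8)
				continue;	/* type would overflow the mask */
			if (devs[i] == dev)
				mask |= 1U << types[i];
		}

		return mask;
	}
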
- for (_i = (_r)->entries; \ - (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ - _i = (void *) (_i) + (_r)->entry_size) - -/* iterate over superblock replicas - used by userspace tools: */ - -#define replicas_entry_next(_i) \ - ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) - -#define for_each_replicas_entry(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ - (_i) = replicas_entry_next(_i)) - -#define for_each_replicas_entry_v0(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ - (_i) = replicas_entry_next(_i)) - -int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); - -extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; -extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; - -void bch2_fs_replicas_exit(struct bch_fs *); - -#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h deleted file mode 100644 index b7eff904acdb71..00000000000000 --- a/fs/bcachefs/replicas_format.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REPLICAS_FORMAT_H -#define _BCACHEFS_REPLICAS_FORMAT_H - -struct bch_replicas_entry_v0 { - __u8 data_type; - __u8 nr_devs; - __u8 devs[] __counted_by(nr_devs); -} __packed; - -struct bch_sb_field_replicas_v0 { - struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[]; -} __packed __aligned(8); - -struct bch_replicas_entry_v1 { - __u8 data_type; - __u8 nr_devs; - __u8 nr_required; - __u8 devs[] __counted_by(nr_devs); -} __packed; - -struct bch_sb_field_replicas { - struct bch_sb_field field; - struct bch_replicas_entry_v1 entries[]; -} __packed __aligned(8); - -#define replicas_entry_bytes(_i) \ - (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) - -#define replicas_entry_add_dev(e, d) ({ \ - (e)->nr_devs++; \ - (e)->devs[(e)->nr_devs - 1] = (d); \ -}) - -#endif /* _BCACHEFS_REPLICAS_FORMAT_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h deleted file mode 100644 index fed71c861fe76f..00000000000000 --- a/fs/bcachefs/replicas_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REPLICAS_TYPES_H -#define _BCACHEFS_REPLICAS_TYPES_H - -struct bch_replicas_cpu { - unsigned nr; - unsigned entry_size; - struct bch_replicas_entry_v1 *entries; -}; - -#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c deleted file mode 100644 index 59c8770e4a0e94..00000000000000 --- a/fs/bcachefs/sb-clean.c +++ /dev/null @@ -1,340 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "error.h" -#include "journal_io.h" -#include "replicas.h" -#include "sb-clean.h" -#include "super-io.h" - -/* - * BCH_SB_FIELD_clean: - * - * Btree roots, and a few other things, are recovered from the journal after an - * unclean shutdown - but after a clean shutdown, to avoid having to read the - * journal, we can store them in the superblock. 
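
[Aside: the for_each_replicas_entry() macros above walk records whose size comes from their own header (replicas_entry_bytes() is offsetof(devs) plus nr_devs), bounded by the section end and terminated by a zeroed entry. A self-contained sketch of that traversal over inline variable-length records:]

	#include <stddef.h>
	#include <stdio.h>

	struct entry {
		unsigned char	data_type;
		unsigned char	nr_devs;
		unsigned char	devs[];		/* nr_devs bytes follow inline */
	};

	#define entry_bytes(e)	(offsetof(struct entry, devs) + (e)->nr_devs)

	static void walk(const void *buf, const void *end)
	{
		const struct entry *e = buf;

		/* a zero data_type terminates the list, as in the superblock */
		while ((const void *) e < end && e->data_type) {
			printf("type %u: %u devs\n", e->data_type, e->nr_devs);
			e = (const void *) ((const char *) e + entry_bytes(e));
		}
	}
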
- * - * bch_sb_field_clean simply contains a list of journal entries, stored exactly - * as they would be in the journal: - */ - -int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, - int write) -{ - struct bkey_validate_context from = { - .flags = write, - .from = BKEY_VALIDATE_superblock, - }; - struct jset_entry *entry; - int ret; - - for (entry = clean->start; - entry < (struct jset_entry *) vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if (vstruct_end(entry) > vstruct_end(&clean->field)) { - bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu", - le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s), - (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field)); - bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun); - return -BCH_ERR_fsck_repair_unimplemented; - } - - ret = bch2_journal_entry_validate(c, NULL, entry, - le16_to_cpu(c->disk_sb.sb->version), - BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - from); - if (ret) - return ret; - } - - return 0; -} - -static struct bkey_i *btree_root_find(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry, *start, *end; - - if (clean) { - start = clean->start; - end = vstruct_end(&clean->field); - } else { - start = j->start; - end = vstruct_last(j); - } - - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root && - entry->btree_id == id) - goto found; - - return NULL; -found: - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - return k; -} - -int bch2_verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean **cleanp, - struct jset *j) -{ - unsigned i; - struct bch_sb_field_clean *clean = *cleanp; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - int ret = 0; - - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, - sb_clean_journal_seq_mismatch, - "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", - le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) { - kfree(clean); - *cleanp = NULL; - return 0; - } - - for (i = 0; i < BTREE_ID_NR; i++) { - struct bkey_i *k1, *k2; - unsigned l1 = 0, l2 = 0; - - k1 = btree_root_find(c, clean, NULL, i, &l1); - k2 = btree_root_find(c, NULL, j, i, &l2); - - if (!k1 && !k2) - continue; - - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - if (k1) - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); - else - prt_printf(&buf1, "(none)"); - - if (k2) - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); - else - prt_printf(&buf2, "(none)"); - - mustfix_fsck_err_on(!k1 || !k2 || - IS_ERR(k1) || - IS_ERR(k2) || - k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(&k1->k)) || - l1 != l2, c, - sb_clean_btree_root_mismatch, - "superblock btree root %u doesn't match journal after clean shutdown\n" - "sb: l=%u %s\n" - "journal: l=%u %s\n", i, - l1, buf1.buf, - l2, buf2.buf); - } -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean, *sb_clean; - int ret; - - mutex_lock(&c->sb_lock); - sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); - - if (fsck_err_on(!sb_clean, c, - sb_clean_missing, - "superblock marked clean but clean section not present")) { - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = 
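
[Aside: bch2_sb_clean_validate_late() above treats each journal entry as a nested variable-length object and rejects any whose computed end overruns the containing superblock section, so on-disk lengths are never trusted blindly. A reduced sketch of the same bounds check over length-prefixed records (hypothetical struct rec):]

	#include <stdbool.h>
	#include <stdint.h>

	struct rec {
		uint16_t	len;		/* payload bytes that follow */
		unsigned char	data[];
	};

	/* Check that every length-prefixed record stays inside [buf, end). */
	static bool records_in_bounds(const void *buf, const void *end)
	{
		const unsigned char *p = buf;

		while ((const void *) (p + sizeof(struct rec)) <= end) {
			const struct rec *r = (const void *) p;
			const unsigned char *next = p + sizeof(*r) + r->len;

			if ((const void *) next > end)
				return false;	/* record overruns its container */
			p = next;
		}

		return (const void *) p == end;	/* no trailing partial header */
	}
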
false; - mutex_unlock(&c->sb_lock); - return ERR_PTR(-BCH_ERR_invalid_sb_clean); - } - - clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), - GFP_KERNEL); - if (!clean) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); - } - - ret = bch2_sb_clean_validate_late(c, clean, READ); - if (ret) { - kfree(clean); - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); - } - - mutex_unlock(&c->sb_lock); - - return clean; -fsck_err: - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); -} - -void bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry **end, - u64 journal_seq) -{ - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_key_version; - u->v = cpu_to_le64(atomic64_read(&c->key_version)); - } - - for (unsigned i = 0; i < 2; i++) { - struct jset_entry_clock *clock = - container_of(jset_entry_init(end, sizeof(*clock)), - struct jset_entry_clock, entry); - - clock->entry.type = BCH_JSET_ENTRY_clock; - clock->rw = i; - clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); - } -} - -static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - - if (vstruct_bytes(&clean->field) < sizeof(*clean)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&clean->field), sizeof(*clean)); - return -BCH_ERR_invalid_sb_clean; - } - - for (struct jset_entry *entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) { - prt_str(err, "entry type "); - bch2_prt_jset_entry_type(err, entry->type); - prt_str(err, " overruns end of section"); - return -BCH_ERR_invalid_sb_clean; - } - } - - return 0; -} - -static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - struct jset_entry *entry; - - prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags)); - prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq)); - - for (entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) - break; - - if (entry->type == BCH_JSET_ENTRY_btree_keys && - !entry->u64s) - continue; - - bch2_journal_entry_to_text(out, NULL, entry); - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_clean = { - .validate = bch2_sb_clean_validate, - .to_text = bch2_sb_clean_to_text, -}; - -int bch2_fs_mark_dirty(struct bch_fs *c) -{ - int ret; - - /* - * Unconditionally write superblock, to verify it hasn't changed before - * we go rw: - */ - - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - - ret = bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -void bch2_fs_mark_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *sb_clean; - struct jset_entry *entry; - unsigned u64s; - int ret; - - mutex_lock(&c->sb_lock); - if (BCH_SB_CLEAN(c->disk_sb.sb)) - goto out; - - SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << 
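
[Aside: bch2_journal_super_entries_add_common() above builds each typed entry around an embedded generic struct jset_entry and uses container_of() to move between the generic header and the typed wrapper. The idiom reduced to essentials, with a hypothetical usage_entry and a local container_of definition for userspace:]

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	struct hdr {
		uint16_t type;
		uint16_t u64s;
	};

	struct usage_entry {
		struct hdr	entry;	/* generic header embedded first */
		uint64_t	v;
	};

	#define container_of(ptr, type, member) \
		((type *) ((char *) (ptr) - offsetof(type, member)))

	int main(void)
	{
		struct usage_entry u = { .entry = { .type = 1 }, .v = 42 };
		struct hdr *h = &u.entry;

		/* recover the typed wrapper from the generic header pointer */
		struct usage_entry *back = container_of(h, struct usage_entry, entry);

		printf("%llu\n", (unsigned long long) back->v);
		return 0;
	}
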
BCH_COMPAT_alloc_metadata); - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); - - u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; - - sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s); - if (!sb_clean) { - bch_err(c, "error resizing superblock while setting filesystem clean"); - goto out; - } - - sb_clean->flags = 0; - sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); - - /* Trying to catch outstanding bug: */ - BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); - - entry = sb_clean->start; - bch2_journal_super_entries_add_common(c, &entry, 0); - entry = bch2_btree_roots_to_journal_entries(c, entry, 0); - BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); - - memset(entry, 0, - vstruct_end(&sb_clean->field) - (void *) entry); - - /* - * this should be in the write path, and we should be validating every - * superblock section: - */ - ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); - if (ret) { - bch_err(c, "error writing marking filesystem clean: validate error"); - goto out; - } - - bch2_journal_pos_from_member_info_set(c); - - bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); -} diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h deleted file mode 100644 index 71caef2812398c..00000000000000 --- a/fs/bcachefs/sb-clean.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_CLEAN_H -#define _BCACHEFS_SB_CLEAN_H - -int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); -int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **, - struct jset *); -struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *); -void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); - -extern const struct bch_sb_field_ops bch_sb_field_ops_clean; - -int bch2_fs_mark_dirty(struct bch_fs *); -void bch2_fs_mark_clean(struct bch_fs *); - -#endif /* _BCACHEFS_SB_CLEAN_H */ diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c deleted file mode 100644 index 2b4b8445d418b3..00000000000000 --- a/fs/bcachefs/sb-counters.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "super-io.h" -#include "sb-counters.h" - -/* BCH_SB_FIELD_counters */ - -static const u8 counters_to_stable_map[] = { -#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, - BCH_PERSISTENT_COUNTERS() -#undef x -}; - -const char * const bch2_counter_names[] = { -#define x(t, n, ...) 
(#t), - BCH_PERSISTENT_COUNTERS() -#undef x - NULL -}; - -static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) -{ - if (!ctrs) - return 0; - - return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; -} - -static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - return 0; -} - -static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_counters *ctrs = field_to_type(f, counters); - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - if (stable < nr) - prt_printf(out, "%s \t%llu\n", - bch2_counter_names[i], - le64_to_cpu(ctrs->d[stable])); - } -} - -int bch2_sb_counters_to_cpu(struct bch_fs *c) -{ - struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) - c->counters_on_mount[i] = 0; - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - if (stable < nr) { - u64 v = le64_to_cpu(ctrs->d[stable]); - percpu_u64_set(&c->counters[i], v); - c->counters_on_mount[i] = v; - } - } - - return 0; -} - -int bch2_sb_counters_from_cpu(struct bch_fs *c) -{ - struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - struct bch_sb_field_counters *ret; - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - if (nr < BCH_COUNTER_NR) { - ret = bch2_sb_field_resize(&c->disk_sb, counters, - sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); - if (ret) { - ctrs = ret; - nr = bch2_sb_counter_nr_entries(ctrs); - } - } - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - if (stable < nr) - ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); - } - - return 0; -} - -void bch2_fs_counters_exit(struct bch_fs *c) -{ - free_percpu(c->counters); -} - -int bch2_fs_counters_init(struct bch_fs *c) -{ - c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); - if (!c->counters) - return -BCH_ERR_ENOMEM_fs_counters_init; - - return bch2_sb_counters_to_cpu(c); -} - -const struct bch_sb_field_ops bch_sb_field_ops_counters = { - .validate = bch2_sb_counters_validate, - .to_text = bch2_sb_counters_to_text, -}; - -#ifndef NO_BCACHEFS_CHARDEV -long bch2_ioctl_query_counters(struct bch_fs *c, - struct bch_ioctl_query_counters __user *user_arg) -{ - struct bch_ioctl_query_counters arg; - int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); - if (ret) - return ret; - - if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || - arg.pad) - return -EINVAL; - - arg.nr = min(arg.nr, BCH_COUNTER_NR); - ret = put_user(arg.nr, &user_arg->nr); - if (ret) - return ret; - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - - if (stable < arg.nr) { - u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) - ? 
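
[Aside: the counters code above maintains two numbering schemes: a dense in-memory enum whose order follows the x-macro list, and a stable on-disk slot (the explicit number in each x() entry), bridged by counters_to_stable_map[]. A self-contained sketch of that double-expansion pattern with three hypothetical counters:]

	#include <stdio.h>

	/* x(name, stable_id): list order may change, stable_id may not. */
	#define COUNTERS()		\
		x(reads,  0)		\
		x(writes, 1)		\
		x(moves,  7)

	enum counter {
	#define x(n, id) COUNTER_##n,
		COUNTERS()
	#undef x
		COUNTER_NR
	};

	static const unsigned char to_stable[] = {
	#define x(n, id) [COUNTER_##n] = (id),
		COUNTERS()
	#undef x
	};

	static const char * const names[] = {
	#define x(n, id) #n,
		COUNTERS()
	#undef x
	};

	int main(void)
	{
		for (int i = 0; i < COUNTER_NR; i++)
			printf("%-8s -> on-disk slot %u\n", names[i], to_stable[i]);
		return 0;
	}

[Keeping the on-disk number explicit lets list entries be reordered or added without breaking superblocks written by older code.]
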
percpu_u64_get(&c->counters[i]) - : c->counters_on_mount[i]; - - ret = put_user(v, &user_arg->d[stable]); - if (ret) - return ret; - } - } - - return 0; -} -#endif diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h deleted file mode 100644 index a4329ad8dd1baa..00000000000000 --- a/fs/bcachefs/sb-counters.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_COUNTERS_H -#define _BCACHEFS_SB_COUNTERS_H - -#include "bcachefs.h" -#include "super-io.h" - -int bch2_sb_counters_to_cpu(struct bch_fs *); -int bch2_sb_counters_from_cpu(struct bch_fs *); - -void bch2_fs_counters_exit(struct bch_fs *); -int bch2_fs_counters_init(struct bch_fs *); - -extern const char * const bch2_counter_names[]; -extern const struct bch_sb_field_ops bch_sb_field_ops_counters; - -long bch2_ioctl_query_counters(struct bch_fs *, - struct bch_ioctl_query_counters __user *); - -#endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h deleted file mode 100644 index b868702a431a5e..00000000000000 --- a/fs/bcachefs/sb-counters_format.h +++ /dev/null @@ -1,117 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H -#define _BCACHEFS_SB_COUNTERS_FORMAT_H - -enum counters_flags { - TYPE_COUNTER = BIT(0), /* event counters */ - TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */ -}; - -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0, TYPE_SECTORS) \ - x(io_read_inline, 80, TYPE_SECTORS) \ - x(io_read_hole, 81, TYPE_SECTORS) \ - x(io_read_promote, 30, TYPE_COUNTER) \ - x(io_read_bounce, 31, TYPE_COUNTER) \ - x(io_read_split, 33, TYPE_COUNTER) \ - x(io_read_reuse_race, 34, TYPE_COUNTER) \ - x(io_read_retry, 32, TYPE_COUNTER) \ - x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ - x(io_write, 1, TYPE_SECTORS) \ - x(io_move, 2, TYPE_SECTORS) \ - x(io_move_read, 35, TYPE_SECTORS) \ - x(io_move_write, 36, TYPE_SECTORS) \ - x(io_move_finish, 37, TYPE_SECTORS) \ - x(io_move_fail, 38, TYPE_COUNTER) \ - x(io_move_write_fail, 82, TYPE_COUNTER) \ - x(io_move_start_fail, 39, TYPE_COUNTER) \ - x(io_move_created_rebalance, 83, TYPE_COUNTER) \ - x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \ - x(bucket_invalidate, 3, TYPE_COUNTER) \ - x(bucket_discard, 4, TYPE_COUNTER) \ - x(bucket_discard_fast, 79, TYPE_COUNTER) \ - x(bucket_alloc, 5, TYPE_COUNTER) \ - x(bucket_alloc_fail, 6, TYPE_COUNTER) \ - x(btree_cache_scan, 7, TYPE_COUNTER) \ - x(btree_cache_reap, 8, TYPE_COUNTER) \ - x(btree_cache_cannibalize, 9, TYPE_COUNTER) \ - x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \ - x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \ - x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \ - x(btree_node_write, 13, TYPE_COUNTER) \ - x(btree_node_read, 14, TYPE_COUNTER) \ - x(btree_node_compact, 15, TYPE_COUNTER) \ - x(btree_node_merge, 16, TYPE_COUNTER) \ - x(btree_node_split, 17, TYPE_COUNTER) \ - x(btree_node_rewrite, 18, TYPE_COUNTER) \ - x(btree_node_alloc, 19, TYPE_COUNTER) \ - x(btree_node_free, 20, TYPE_COUNTER) \ - x(btree_node_set_root, 21, TYPE_COUNTER) \ - x(btree_path_relock_fail, 22, TYPE_COUNTER) \ - x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \ - x(btree_reserve_get_fail, 24, TYPE_COUNTER) \ - x(journal_entry_full, 25, TYPE_COUNTER) \ - x(journal_full, 26, TYPE_COUNTER) \ - x(journal_reclaim_finish, 27, TYPE_COUNTER) \ - x(journal_reclaim_start, 28, TYPE_COUNTER) \ - x(journal_write, 29, TYPE_COUNTER) \ - x(copygc, 40, TYPE_COUNTER) \ - x(copygc_wait, 41, TYPE_COUNTER) \ - 
x(gc_gens_end, 42, TYPE_COUNTER) \ - x(gc_gens_start, 43, TYPE_COUNTER) \ - x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \ - x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \ - x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \ - x(trans_restart_fault_inject, 47, TYPE_COUNTER) \ - x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \ - x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \ - x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \ - x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \ - x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \ - x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \ - x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \ - x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \ - x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \ - x(trans_restart_relock, 57, TYPE_COUNTER) \ - x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \ - x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \ - x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \ - x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \ - x(trans_restart_relock_path, 62, TYPE_COUNTER) \ - x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \ - x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \ - x(trans_restart_traverse, 65, TYPE_COUNTER) \ - x(trans_restart_upgrade, 66, TYPE_COUNTER) \ - x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \ - x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \ - x(trans_restart_injected, 69, TYPE_COUNTER) \ - x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \ - x(trans_traverse_all, 71, TYPE_COUNTER) \ - x(transaction_commit, 72, TYPE_COUNTER) \ - x(write_super, 73, TYPE_COUNTER) \ - x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \ - x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \ - x(trans_restart_split_race, 76, TYPE_COUNTER) \ - x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \ - x(write_buffer_flush_sync, 78, TYPE_COUNTER) - -enum bch_persistent_counters { -#define x(t, n, ...) BCH_COUNTER_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_NR -}; - -enum bch_persistent_counters_stable { -#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_STABLE_NR -}; - -struct bch_sb_field_counters { - struct bch_sb_field field; - __le64 d[]; -}; - -#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c deleted file mode 100644 index 1506d05e06654b..00000000000000 --- a/fs/bcachefs/sb-downgrade.c +++ /dev/null @@ -1,457 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Superblock section that contains a list of recovery passes to run when - * downgrading past a given version - */ - -#include "bcachefs.h" -#include "darray.h" -#include "recovery_passes.h" -#include "sb-downgrade.h" -#include "sb-errors.h" -#include "super-io.h" - -#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63) - -/* - * Upgrade, downgrade tables - run certain recovery passes, fix certain errors - * - * x(version, recovery_passes, errors...) 
- */ -#define UPGRADE_TABLE() \ - x(snapshot_2, \ - RECOVERY_PASS_ALL_FSCK, \ - BCH_FSCK_ERR_subvol_root_wrong_bi_subvol, \ - BCH_FSCK_ERR_subvol_not_master_and_not_snapshot) \ - x(backpointers, \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_v3, \ - RECOVERY_PASS_ALL_FSCK) \ - x(unwritten_extents, \ - RECOVERY_PASS_ALL_FSCK) \ - x(bucket_gens, \ - BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ - RECOVERY_PASS_ALL_FSCK) \ - x(lru_v2, \ - RECOVERY_PASS_ALL_FSCK) \ - x(fragmentation_lru, \ - RECOVERY_PASS_ALL_FSCK) \ - x(no_bps_in_alloc_keys, \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_trees, \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_skiplists, \ - BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \ - BCH_FSCK_ERR_snapshot_bad_depth, \ - BCH_FSCK_ERR_snapshot_bad_skiplist) \ - x(deleted_inodes, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ - x(rebalance_work, \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ - x(subvolume_fs_parent, \ - BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ - BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ - x(btree_subvolume_children, \ - BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ - BCH_FSCK_ERR_subvol_children_not_set) \ - x(mi_btree_bitmap, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_btree_bitmap_not_marked) \ - x(disk_accounting_v2, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_bkey_version_in_future, \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_accounting_mismatch) \ - x(disk_accounting_v3, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_bkey_version_in_future, \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \ - BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(disk_accounting_inum, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) \ - x(rebalance_work_acct_fix, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) \ - x(inode_has_child_snapshots, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \ - x(backpointer_bucket_gen, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_backpointer_to_missing_ptr, \ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(disk_accounting_big_endian, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(cached_backpointers, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(stripe_backpointers, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(inode_has_case_insensitive, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \ - BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set) - -#define DOWNGRADE_TABLE() \ - x(bucket_stripe_sectors, \ - 0) \ - x(disk_accounting_v2, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - 
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_fs_usage_hidden_wrong, \ - BCH_FSCK_ERR_fs_usage_btree_wrong, \ - BCH_FSCK_ERR_fs_usage_data_wrong, \ - BCH_FSCK_ERR_fs_usage_cached_wrong, \ - BCH_FSCK_ERR_fs_usage_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ - BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_replicas_wrong, \ - BCH_FSCK_ERR_bkey_version_in_future) \ - x(disk_accounting_v3, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_fs_usage_hidden_wrong, \ - BCH_FSCK_ERR_fs_usage_btree_wrong, \ - BCH_FSCK_ERR_fs_usage_data_wrong, \ - BCH_FSCK_ERR_fs_usage_cached_wrong, \ - BCH_FSCK_ERR_fs_usage_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ - BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_replicas_wrong, \ - BCH_FSCK_ERR_accounting_replicas_not_marked, \ - BCH_FSCK_ERR_bkey_version_in_future) \ - x(rebalance_work_acct_fix, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(backpointer_bucket_gen, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ - BCH_FSCK_ERR_backpointer_to_missing_ptr, \ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(disk_accounting_big_endian, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) - -struct upgrade_downgrade_entry { - u64 recovery_passes; - u16 version; - u16 nr_errors; - const u16 *errors; -}; - -#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ }; -UPGRADE_TABLE() -#undef x - -static const struct upgrade_downgrade_entry upgrade_table[] = { -#define x(ver, passes, ...) { \ - .recovery_passes = passes, \ - .version = bcachefs_metadata_version_##ver,\ - .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \ - .errors = upgrade_##ver##_errors, \ -}, -UPGRADE_TABLE() -#undef x -}; - -static int have_stripes(struct bch_fs *c) -{ - if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b)) - return 0; - - return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b); -} - -int bch2_sb_set_upgrade_extra(struct bch_fs *c) -{ - unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; - unsigned new_version = c->sb.version; - bool write_sb = false; - int ret = 0; - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - if (old_version < bcachefs_metadata_version_bucket_stripe_sectors && - new_version >= bcachefs_metadata_version_bucket_stripe_sectors && - (ret = have_stripes(c) > 0)) { - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent); - write_sb = true; - } - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret < 0 ? 
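
[Aside: the UPGRADE_TABLE()/DOWNGRADE_TABLE() lists above are expanded twice: once into a named u16 array of error codes per version (via __VA_ARGS__), then into table rows referencing those arrays with ARRAY_SIZE(). The same two-pass expansion in miniature, with hypothetical versions and codes:]

	#include <stdio.h>

	#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

	/* x(version, recovery_pass_mask, error codes...) */
	#define TABLE()			\
		x(v1, 0x1, 10, 11)	\
		x(v2, 0x3, 12)

	/* pass 1: one error-code array per version */
	#define x(ver, passes, ...) \
		static const unsigned short ver##_errors[] = { __VA_ARGS__ };
	TABLE()
	#undef x

	struct row {
		unsigned long		passes;
		unsigned		nr_errors;
		const unsigned short	*errors;
	};

	/* pass 2: the table itself, referencing the arrays from pass 1 */
	static const struct row rows[] = {
	#define x(ver, passes_, ...) {				\
		.passes		= passes_,			\
		.nr_errors	= ARRAY_SIZE(ver##_errors),	\
		.errors		= ver##_errors,			\
	},
		TABLE()
	#undef x
	};

	int main(void)
	{
		for (unsigned i = 0; i < ARRAY_SIZE(rows); i++)
			printf("row %u: mask %#lx, %u errors\n",
			       i, rows[i].passes, rows[i].nr_errors);
		return 0;
	}
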
ret : 0; -} - -void bch2_sb_set_upgrade(struct bch_fs *c, - unsigned old_version, - unsigned new_version) -{ - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - for (const struct upgrade_downgrade_entry *i = upgrade_table; - i < upgrade_table + ARRAY_SIZE(upgrade_table); - i++) - if (i->version > old_version && i->version <= new_version) { - u64 passes = i->recovery_passes; - - if (passes & RECOVERY_PASS_ALL_FSCK) - passes |= bch2_fsck_recovery_passes(); - passes &= ~RECOVERY_PASS_ALL_FSCK; - - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(passes)); - - for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++) - __set_bit_le64(*e, ext->errors_silent); - } -} - -#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ }; -DOWNGRADE_TABLE() -#undef x - -static const struct upgrade_downgrade_entry downgrade_table[] = { -#define x(ver, passes, ...) { \ - .recovery_passes = passes, \ - .version = bcachefs_metadata_version_##ver,\ - .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \ - .errors = downgrade_##ver##_errors, \ -}, -DOWNGRADE_TABLE() -#undef x -}; - -static int downgrade_table_extra(struct bch_fs *c, darray_char *table) -{ - unsigned dst_offset = table->nr; - struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table); - unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); - int ret = 0; - - unsigned nr_errors = le16_to_cpu(dst->nr_errors); - - switch (le16_to_cpu(dst->version)) { - case bcachefs_metadata_version_bucket_stripe_sectors: - if (have_stripes(c)) { - bytes += sizeof(dst->errors[0]) * 2; - - ret = darray_make_room(table, bytes); - if (ret) - return ret; - - dst = (void *) &table->data[dst_offset]; - dst->nr_errors = cpu_to_le16(nr_errors + 1); - - /* open coded __set_bit_le64, as dst is packed and - * dst->recovery_passes is misaligned */ - unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations; - dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64)); - - dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong); - } - break; - } - - return ret; -} - -static inline const struct bch_sb_field_downgrade_entry * -downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) -{ - return (void *) &e->errors[le16_to_cpu(e->nr_errors)]; -} - -#define for_each_downgrade_entry(_d, _i) \ - for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \ - (void *) _i < vstruct_end(&(_d)->field) && \ - (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \ - (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \ - _i = downgrade_entry_next_c(_i)) - -static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); - - for (const struct bch_sb_field_downgrade_entry *i = e->entries; - (void *) i < vstruct_end(&e->field); - i = downgrade_entry_next_c(i)) { - /* - * Careful: sb_field_downgrade_entry is only 2 byte aligned, but - * section sizes are 8 byte aligned - an empty entry spanning - * the end of the section is allowed (and ignored): - */ - if ((void *) &i->errors[0] > vstruct_end(&e->field)) - break; - - if (flags & BCH_VALIDATE_write && - (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) { - prt_printf(err, "downgrade entry overruns end of superblock section"); - return 
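
[Aside: downgrade_table_extra() above cannot use __set_bit_le64() because the packed entry leaves recovery_passes[] only 2-byte aligned, so it open-codes the bit set as a whole-word OR on the containing __le64. A portable userspace sketch of the same operation, using memcpy() for the unaligned access and glibc's endian.h conversions in place of the kernel's __le64 helpers:]

	#include <endian.h>
	#include <stdint.h>
	#include <string.h>

	/* Set bit @nr in a little-endian u64 array that may be misaligned. */
	static void set_bit_le64_unaligned(void *words, unsigned nr)
	{
		unsigned char *p = (unsigned char *) words + (nr / 64) * 8;
		uint64_t w;

		memcpy(&w, p, sizeof(w));	/* alignment-safe load */
		w = htole64(le64toh(w) | ((uint64_t) 1 << (nr % 64)));
		memcpy(p, &w, sizeof(w));	/* alignment-safe store */
	}
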
-BCH_ERR_invalid_sb_downgrade; - } - - if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) != - BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) { - prt_printf(err, "downgrade entry with mismatched major version (%u != %u)", - BCH_VERSION_MAJOR(le16_to_cpu(i->version)), - BCH_VERSION_MAJOR(le16_to_cpu(sb->version))); - return -BCH_ERR_invalid_sb_downgrade; - } - } - - return 0; -} - -static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); - - if (out->nr_tabstops <= 1) - printbuf_tabstop_push(out, 16); - - for_each_downgrade_entry(e, i) { - prt_str(out, "version:\t"); - bch2_version_to_text(out, le16_to_cpu(i->version)); - prt_newline(out); - - prt_str(out, "recovery passes:\t"); - prt_bitflags(out, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0]))); - prt_newline(out); - - prt_str(out, "errors:\t"); - bool first = true; - for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { - if (!first) - prt_char(out, ','); - first = false; - bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j])); - } - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_downgrade = { - .validate = bch2_sb_downgrade_validate, - .to_text = bch2_sb_downgrade_to_text, -}; - -int bch2_sb_downgrade_update(struct bch_fs *c) -{ - if (!test_bit(BCH_FS_btree_running, &c->flags)) - return 0; - - darray_char table = {}; - int ret = 0; - - for (const struct upgrade_downgrade_entry *src = downgrade_table; - src < downgrade_table + ARRAY_SIZE(downgrade_table); - src++) { - if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) - continue; - - if (src->version < c->sb.version_incompat) - continue; - - struct bch_sb_field_downgrade_entry *dst; - unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors; - - ret = darray_make_room(&table, bytes); - if (ret) - goto out; - - dst = (void *) &darray_top(table); - dst->version = cpu_to_le16(src->version); - dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes)); - dst->recovery_passes[1] = 0; - dst->nr_errors = cpu_to_le16(src->nr_errors); - for (unsigned i = 0; i < src->nr_errors; i++) - dst->errors[i] = cpu_to_le16(src->errors[i]); - - ret = downgrade_table_extra(c, &table); - if (ret) - goto out; - - if (!dst->recovery_passes[0] && - !dst->recovery_passes[1] && - !dst->nr_errors) - continue; - - table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); - } - - struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); - - unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64)); - - if (d && le32_to_cpu(d->field.u64s) > sb_u64s) - goto out; - - d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s); - if (!d) { - ret = bch_err_throw(c, ENOSPC_sb_downgrade); - goto out; - } - - memcpy(d->entries, table.data, table.nr); - memset_u64s_tail(d->entries, 0, table.nr); -out: - darray_exit(&table); - return ret; -} - -void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor) -{ - struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); - if (!d) - return; - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - for_each_downgrade_entry(d, i) { - unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version)); - if (new_minor < minor && minor <= old_minor) { - ext->recovery_passes_required[0] |= 
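
[Aside: bch2_sb_downgrade_update() above copies the assembled table into the superblock section and then zeroes the remainder with memset_u64s_tail(), since section sizes are u64-granular and stale bytes must not leak to disk. A small standalone equivalent of that copy-and-pad step:]

	#include <stdint.h>
	#include <string.h>

	#define ROUND_UP(x, a)	((((x) + (a) - 1) / (a)) * (a))

	/*
	 * Copy @len bytes into @dst, then zero the rest of the containing
	 * u64-granular region so no stale bytes remain past the table.
	 */
	static void copy_and_pad_u64s(void *dst, const void *src, size_t len)
	{
		size_t total = ROUND_UP(len, sizeof(uint64_t));

		memcpy(dst, src, len);
		memset((char *) dst + len, 0, total - len);
	}
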
i->recovery_passes[0]; - ext->recovery_passes_required[1] |= i->recovery_passes[1]; - - for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { - unsigned e = le16_to_cpu(i->errors[j]); - if (e < BCH_FSCK_ERR_MAX) - __set_bit(e, c->sb.errors_silent); - if (e < sizeof(ext->errors_silent) * 8) - __set_bit_le64(e, ext->errors_silent); - } - } - } -} diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h deleted file mode 100644 index 095b7cc9bb4735..00000000000000 --- a/fs/bcachefs/sb-downgrade.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_DOWNGRADE_H -#define _BCACHEFS_SB_DOWNGRADE_H - -extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; - -int bch2_sb_downgrade_update(struct bch_fs *); -void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); -int bch2_sb_set_upgrade_extra(struct bch_fs *); -void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); - -#endif /* _BCACHEFS_SB_DOWNGRADE_H */ diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h deleted file mode 100644 index cffd932be3eca0..00000000000000 --- a/fs/bcachefs/sb-downgrade_format.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H -#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H - -struct bch_sb_field_downgrade_entry { - __le16 version; - __le64 recovery_passes[2]; - __le16 nr_errors; - __le16 errors[] __counted_by(nr_errors); -} __packed __aligned(2); - -struct bch_sb_field_downgrade { - struct bch_sb_field field; - struct bch_sb_field_downgrade_entry entries[]; -}; - -#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c deleted file mode 100644 index 48853efdc105d5..00000000000000 --- a/fs/bcachefs/sb-errors.c +++ /dev/null @@ -1,198 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "sb-errors.h" -#include "super-io.h" - -const char * const bch2_sb_error_strs[] = { -#define x(t, n, ...) 
[n] = #t, - BCH_SB_ERRS() -#undef x -}; - -void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) -{ - if (id < BCH_FSCK_ERR_MAX) - prt_str(out, bch2_sb_error_strs[id]); - else - prt_printf(out, "(unknown error %u)", id); -} - -static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e) -{ - return bch2_sb_field_nr_entries(e); -} - -static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) -{ - return (sizeof(struct bch_sb_field_errors) + - sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64); -} - -static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_errors *e = field_to_type(f, errors); - unsigned i, nr = bch2_sb_field_errors_nr_entries(e); - - for (i = 0; i < nr; i++) { - if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) { - prt_printf(err, "entry with count 0 (id "); - bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); - prt_printf(err, ")"); - return -BCH_ERR_invalid_sb_errors; - } - - if (i + 1 < nr && - BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >= - BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) { - prt_printf(err, "entries out of order"); - return -BCH_ERR_invalid_sb_errors; - } - } - - return 0; -} - -static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_errors *e = field_to_type(f, errors); - unsigned i, nr = bch2_sb_field_errors_nr_entries(e); - - if (out->nr_tabstops <= 1) - printbuf_tabstop_push(out, 16); - - for (i = 0; i < nr; i++) { - bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); - prt_tab(out); - prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); - prt_tab(out); - bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time)); - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_errors = { - .validate = bch2_sb_errors_validate, - .to_text = bch2_sb_errors_to_text, -}; - -void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c) -{ - if (out->nr_tabstops < 1) - printbuf_tabstop_push(out, 48); - if (out->nr_tabstops < 2) - printbuf_tabstop_push(out, 8); - if (out->nr_tabstops < 3) - printbuf_tabstop_push(out, 16); - - guard(mutex)(&c->fsck_error_counts_lock); - - bch_sb_errors_cpu *e = &c->fsck_error_counts; - darray_for_each(*e, i) { - bch2_sb_error_id_to_text(out, i->id); - prt_tab(out); - prt_u64(out, i->nr); - prt_tab(out); - bch2_prt_datetime(out, i->last_error_time); - prt_newline(out); - } -} - -void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) -{ - bch_sb_errors_cpu *e = &c->fsck_error_counts; - struct bch_sb_error_entry_cpu n = { - .id = err, - .nr = 1, - .last_error_time = ktime_get_real_seconds() - }; - unsigned i; - - mutex_lock(&c->fsck_error_counts_lock); - for (i = 0; i < e->nr; i++) { - if (err == e->data[i].id) { - e->data[i].nr++; - e->data[i].last_error_time = n.last_error_time; - goto out; - } - if (err < e->data[i].id) - break; - } - - if (darray_make_room(e, 1)) - goto out; - - darray_insert_item(e, i, n); -out: - mutex_unlock(&c->fsck_error_counts_lock); -} - -void bch2_sb_errors_from_cpu(struct bch_fs *c) -{ - bch_sb_errors_cpu *src = &c->fsck_error_counts; - struct bch_sb_field_errors *dst; - unsigned i; - - mutex_lock(&c->fsck_error_counts_lock); - - dst = bch2_sb_field_resize(&c->disk_sb, errors, - bch2_sb_field_errors_u64s(src->nr)); - - if (!dst) - goto err; - - for (i = 0; i < src->nr; i++) { - 
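
[Aside: bch2_sb_error_count() above keeps the per-error counters sorted by ID: one linear scan either bumps an existing entry or yields the insertion point for a new one, which is what lets bch2_sb_errors_validate() reject out-of-order entries. The core of that insert-or-increment, with a plain bounded array standing in for the darray:]

	#include <stddef.h>
	#include <string.h>

	struct err_count {
		unsigned	id;
		unsigned long	nr;
	};

	/* Bump @id's counter, inserting it at its sorted position if missing. */
	static void count_error(struct err_count *e, size_t *nr, size_t cap,
				unsigned id)
	{
		size_t i;

		for (i = 0; i < *nr; i++) {
			if (e[i].id == id) {
				e[i].nr++;
				return;
			}
			if (id < e[i].id)
				break;		/* sorted insertion point found */
		}

		if (*nr == cap)
			return;			/* out of room: drop this count */

		memmove(&e[i + 1], &e[i], (*nr - i) * sizeof(*e));
		e[i] = (struct err_count) { .id = id, .nr = 1 };
		(*nr)++;
	}
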
SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); - SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); - dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); - } - -err: - mutex_unlock(&c->fsck_error_counts_lock); -} - -static int bch2_sb_errors_to_cpu(struct bch_fs *c) -{ - struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors); - bch_sb_errors_cpu *dst = &c->fsck_error_counts; - unsigned i, nr = bch2_sb_field_errors_nr_entries(src); - int ret; - - if (!nr) - return 0; - - mutex_lock(&c->fsck_error_counts_lock); - ret = darray_make_room(dst, nr); - if (ret) - goto err; - - dst->nr = nr; - - for (i = 0; i < nr; i++) { - dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]); - dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]); - dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time); - } -err: - mutex_unlock(&c->fsck_error_counts_lock); - - return ret; -} - -void bch2_fs_sb_errors_exit(struct bch_fs *c) -{ - darray_exit(&c->fsck_error_counts); -} - -void bch2_fs_sb_errors_init_early(struct bch_fs *c) -{ - mutex_init(&c->fsck_error_counts_lock); - darray_init(&c->fsck_error_counts); -} - -int bch2_fs_sb_errors_init(struct bch_fs *c) -{ - return bch2_sb_errors_to_cpu(c); -} diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h deleted file mode 100644 index e86267264692d1..00000000000000 --- a/fs/bcachefs/sb-errors.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_ERRORS_H -#define _BCACHEFS_SB_ERRORS_H - -#include "sb-errors_types.h" - -extern const char * const bch2_sb_error_strs[]; - -void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id); -void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *); - -extern const struct bch_sb_field_ops bch_sb_field_ops_errors; - -void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); - -void bch2_sb_errors_from_cpu(struct bch_fs *); - -void bch2_fs_sb_errors_exit(struct bch_fs *); -void bch2_fs_sb_errors_init_early(struct bch_fs *); -int bch2_fs_sb_errors_init(struct bch_fs *); - -#endif /* _BCACHEFS_SB_ERRORS_H */ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h deleted file mode 100644 index d154b7651d2875..00000000000000 --- a/fs/bcachefs/sb-errors_format.h +++ /dev/null @@ -1,353 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H -#define _BCACHEFS_SB_ERRORS_FORMAT_H - -enum bch_fsck_flags { - FSCK_CAN_FIX = BIT(0), - FSCK_CAN_IGNORE = BIT(1), - FSCK_AUTOFIX = BIT(2), - FSCK_ERR_NO_LOG = BIT(3), -}; - -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0, 0) \ - x(dirty_but_no_journal_entries, 1, 0) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \ - x(sb_clean_journal_seq_mismatch, 3, 0) \ - x(sb_clean_btree_root_mismatch, 4, 0) \ - x(sb_clean_missing, 5, 0) \ - x(jset_unsupported_version, 6, 0) \ - x(jset_unknown_csum, 7, 0) \ - x(jset_last_seq_newer_than_seq, 8, 0) \ - x(jset_past_bucket_end, 9, 0) \ - x(jset_seq_blacklisted, 10, 0) \ - x(journal_entries_missing, 11, 0) \ - x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ - x(journal_entry_past_jset_end, 13, 0) \ - x(journal_entry_replicas_data_mismatch, 14, 0) \ - x(journal_entry_bkey_u64s_0, 15, 0) \ - x(journal_entry_bkey_past_end, 16, 0) \ - x(journal_entry_bkey_bad_format, 17, 0) \ - x(journal_entry_bkey_invalid, 18, 0) \ - x(journal_entry_btree_root_bad_size, 19, 0) \ - x(journal_entry_blacklist_bad_size, 20, 0) \ - 
x(journal_entry_blacklist_v2_bad_size, 21, 0) \ - x(journal_entry_blacklist_v2_start_past_end, 22, 0) \ - x(journal_entry_usage_bad_size, 23, 0) \ - x(journal_entry_data_usage_bad_size, 24, 0) \ - x(journal_entry_clock_bad_size, 25, 0) \ - x(journal_entry_clock_bad_rw, 26, 0) \ - x(journal_entry_dev_usage_bad_size, 27, 0) \ - x(journal_entry_dev_usage_bad_dev, 28, 0) \ - x(journal_entry_dev_usage_bad_pad, 29, 0) \ - x(btree_node_unreadable, 30, 0) \ - x(btree_node_fault_injected, 31, 0) \ - x(btree_node_bad_magic, 32, 0) \ - x(btree_node_bad_seq, 33, 0) \ - x(btree_node_unsupported_version, 34, 0) \ - x(btree_node_bset_older_than_sb_min, 35, 0) \ - x(btree_node_bset_newer_than_sb, 36, 0) \ - x(btree_node_data_missing, 37, FSCK_AUTOFIX) \ - x(btree_node_bset_after_end, 38, 0) \ - x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ - x(btree_node_replicas_data_mismatch, 40, 0) \ - x(bset_unknown_csum, 41, 0) \ - x(bset_bad_csum, 42, 0) \ - x(bset_past_end_of_btree_node, 43, 0) \ - x(bset_wrong_sector_offset, 44, 0) \ - x(bset_empty, 45, 0) \ - x(bset_bad_seq, 46, 0) \ - x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ - x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ - x(btree_node_bad_btree, 49, 0) \ - x(btree_node_bad_level, 50, 0) \ - x(btree_node_bad_min_key, 51, 0) \ - x(btree_node_bad_max_key, 52, 0) \ - x(btree_node_bad_format, 53, 0) \ - x(btree_node_bkey_past_bset_end, 54, 0) \ - x(btree_node_bkey_bad_format, 55, 0) \ - x(btree_node_bad_bkey, 56, 0) \ - x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \ - x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ - x(btree_root_read_error, 59, FSCK_AUTOFIX) \ - x(btree_root_bad_min_key, 60, 0) \ - x(btree_root_bad_max_key, 61, 0) \ - x(btree_node_read_error, 62, FSCK_AUTOFIX) \ - x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ - x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ - x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \ - x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \ - x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \ - x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ - x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ - x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ - x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \ - x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \ - x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \ - x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \ - x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \ - x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \ - x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \ - x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \ - x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \ - x(bkey_version_in_future, 80, 0) \ - x(bkey_u64s_too_small, 81, 0) \ - x(bkey_invalid_type_for_btree, 82, 0) \ - x(bkey_extent_size_zero, 83, 0) \ - x(bkey_extent_size_greater_than_offset, 84, 0) \ - x(bkey_size_nonzero, 85, 0) \ - x(bkey_snapshot_nonzero, 86, 0) \ - x(bkey_snapshot_zero, 87, 0) \ - x(bkey_at_pos_max, 88, 0) \ - x(bkey_before_start_of_btree_node, 89, 0) \ - x(bkey_after_end_of_btree_node, 90, 0) \ - x(bkey_val_size_nonzero, 91, 0) \ - x(bkey_val_size_too_small, 92, 0) \ - x(alloc_v1_val_size_bad, 93, 0) \ - x(alloc_v2_unpack_error, 94, 0) \ - x(alloc_v3_unpack_error, 95, 0) \ - x(alloc_v4_val_size_bad, 96, 0) \ - x(alloc_v4_backpointers_start_bad, 97, 0) \ - x(alloc_key_data_type_bad, 98, 0) \ - x(alloc_key_empty_but_have_data, 99, 0) \ - x(alloc_key_dirty_sectors_0, 100, 0) \ - x(alloc_key_data_type_inconsistency, 101, 0) \ - 
x(alloc_key_to_missing_dev_bucket, 102, 0) \ - x(alloc_key_cached_inconsistency, 103, 0) \ - x(alloc_key_cached_but_read_time_zero, 104, FSCK_AUTOFIX) \ - x(alloc_key_to_missing_lru_entry, 105, FSCK_AUTOFIX) \ - x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \ - x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \ - x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \ - x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ - x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ - x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ - x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \ - x(bucket_sector_count_overflow, 112, 0) \ - x(bucket_metadata_type_mismatch, 113, 0) \ - x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ - x(freespace_key_wrong, 115, FSCK_AUTOFIX) \ - x(freespace_hole_missing, 116, FSCK_AUTOFIX) \ - x(bucket_gens_val_size_bad, 117, 0) \ - x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \ - x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \ - x(bucket_gens_to_invalid_dev, 120, FSCK_AUTOFIX) \ - x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ - x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \ - x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ - x(backpointer_bucket_offset_wrong, 125, 0) \ - x(backpointer_level_bad, 294, 0) \ - x(backpointer_dev_bad, 297, 0) \ - x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \ - x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \ - x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ - x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ - x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ - x(lru_entry_bad, 131, FSCK_AUTOFIX) \ - x(btree_ptr_val_too_big, 132, 0) \ - x(btree_ptr_v2_val_too_big, 133, 0) \ - x(btree_ptr_has_non_ptr, 134, 0) \ - x(extent_ptrs_invalid_entry, 135, 0) \ - x(extent_ptrs_no_ptrs, 136, 0) \ - x(extent_ptrs_too_many_ptrs, 137, 0) \ - x(extent_ptrs_redundant_crc, 138, 0) \ - x(extent_ptrs_redundant_stripe, 139, 0) \ - x(extent_ptrs_unwritten, 140, 0) \ - x(extent_ptrs_written_and_unwritten, 141, 0) \ - x(ptr_to_invalid_device, 142, 0) \ - x(ptr_to_duplicate_device, 143, 0) \ - x(ptr_after_last_bucket, 144, 0) \ - x(ptr_before_first_bucket, 145, 0) \ - x(ptr_spans_multiple_buckets, 146, 0) \ - x(ptr_to_missing_backpointer, 147, FSCK_AUTOFIX) \ - x(ptr_to_missing_alloc_key, 148, FSCK_AUTOFIX) \ - x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \ - x(ptr_to_missing_stripe, 150, 0) \ - x(ptr_to_incorrect_stripe, 151, 0) \ - x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \ - x(ptr_too_stale, 153, 0) \ - x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ - x(ptr_bucket_data_type_mismatch, 155, 0) \ - x(ptr_cached_and_erasure_coded, 156, 0) \ - x(ptr_crc_uncompressed_size_too_small, 157, 0) \ - x(ptr_crc_uncompressed_size_too_big, 161, 0) \ - x(ptr_crc_uncompressed_size_mismatch, 300, 0) \ - x(ptr_crc_csum_type_unknown, 158, 0) \ - x(ptr_crc_compression_type_unknown, 159, 0) \ - x(ptr_crc_redundant, 160, 0) \ - x(ptr_crc_nonce_mismatch, 162, 0) \ - x(ptr_stripe_redundant, 163, 0) \ - x(extent_flags_not_at_start, 306, 0) \ - x(reservation_key_nr_replicas_invalid, 164, 0) \ - x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ - x(reflink_v_pos_bad, 292, 0) \ - x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \ - x(reflink_refcount_underflow, 293, 0) \ - x(stripe_pos_bad, 167, 0) \ - x(stripe_val_size_bad, 168, 0) \ - x(stripe_csum_granularity_bad, 290, 0) \ - x(stripe_sector_count_wrong, 169, 0) \ 
- x(snapshot_tree_pos_bad, 170, 0) \ - x(snapshot_tree_to_missing_snapshot, 171, 0) \ - x(snapshot_tree_to_missing_subvol, 172, 0) \ - x(snapshot_tree_to_wrong_subvol, 173, 0) \ - x(snapshot_tree_to_snapshot_subvol, 174, 0) \ - x(snapshot_pos_bad, 175, 0) \ - x(snapshot_parent_bad, 176, 0) \ - x(snapshot_children_not_normalized, 177, 0) \ - x(snapshot_child_duplicate, 178, 0) \ - x(snapshot_child_bad, 179, 0) \ - x(snapshot_skiplist_not_normalized, 180, 0) \ - x(snapshot_skiplist_bad, 181, 0) \ - x(snapshot_should_not_have_subvol, 182, 0) \ - x(snapshot_to_bad_snapshot_tree, 183, FSCK_AUTOFIX) \ - x(snapshot_bad_depth, 184, 0) \ - x(snapshot_bad_skiplist, 185, 0) \ - x(subvol_pos_bad, 186, 0) \ - x(subvol_not_master_and_not_snapshot, 187, FSCK_AUTOFIX) \ - x(subvol_to_missing_root, 188, 0) \ - x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \ - x(bkey_in_missing_snapshot, 190, 0) \ - x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \ - x(inode_pos_inode_nonzero, 191, 0) \ - x(inode_pos_blockdev_range, 192, 0) \ - x(inode_alloc_cursor_inode_bad, 301, 0) \ - x(inode_unpack_error, 193, 0) \ - x(inode_str_hash_invalid, 194, 0) \ - x(inode_v3_fields_start_bad, 195, 0) \ - x(inode_snapshot_mismatch, 196, 0) \ - x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \ - x(inode_unlinked_but_clean, 197, 0) \ - x(inode_unlinked_but_nlink_nonzero, 198, 0) \ - x(inode_unlinked_and_not_open, 281, 0) \ - x(inode_unlinked_but_has_dirent, 285, 0) \ - x(inode_checksum_type_invalid, 199, 0) \ - x(inode_compression_type_invalid, 200, 0) \ - x(inode_subvol_root_but_not_dir, 201, 0) \ - x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \ - x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \ - x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \ - x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \ - x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ - x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \ - x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \ - x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \ - x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ - x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ - x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ - x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \ - x(inode_unreachable, 210, FSCK_AUTOFIX) \ - x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ - x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ - x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \ - x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ - x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ - x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ - x(vfs_bad_inode_rm, 320, 0) \ - x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ - x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ - x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ - x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ - x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ - x(extent_overlapping, 215, 0) \ - x(key_in_missing_inode, 216, FSCK_AUTOFIX) \ - x(key_in_wrong_inode_type, 217, 0) \ - x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \ - x(dirent_empty_name, 219, 0) \ - x(dirent_val_too_big, 220, 0) \ - x(dirent_name_too_long, 221, 0) \ - x(dirent_name_embedded_nul, 222, 0) \ - x(dirent_name_dot_or_dotdot, 223, 0) \ - x(dirent_name_has_slash, 224, 0) \ - x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \ - x(inode_bi_parent_wrong, 226, 0) \ - x(dirent_in_missing_dir_inode, 227, 0) \ - x(dirent_in_non_dir_inode, 228, 0) \ - x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \ - 
x(dirent_to_overwritten_inode, 302, 0) \ - x(dirent_to_missing_subvol, 230, 0) \ - x(dirent_to_itself, 231, 0) \ - x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \ - x(quota_type_invalid, 232, 0) \ - x(xattr_val_size_too_small, 233, 0) \ - x(xattr_val_size_too_big, 234, 0) \ - x(xattr_invalid_type, 235, 0) \ - x(xattr_name_invalid_chars, 236, 0) \ - x(xattr_in_missing_inode, 237, 0) \ - x(root_subvol_missing, 238, 0) \ - x(root_dir_missing, 239, 0) \ - x(root_inode_not_dir, 240, 0) \ - x(dir_loop, 241, 0) \ - x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \ - x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \ - x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \ - x(reflink_p_front_pad_bad, 245, 0) \ - x(journal_entry_dup_same_device, 246, 0) \ - x(inode_bi_subvol_missing, 247, 0) \ - x(inode_bi_subvol_wrong, 248, 0) \ - x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \ - x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \ - x(inode_bi_parent_nonzero, 251, 0) \ - x(dirent_to_missing_parent_subvol, 252, 0) \ - x(dirent_not_visible_in_parent_subvol, 253, 0) \ - x(subvol_fs_path_parent_wrong, 254, 0) \ - x(subvol_root_fs_path_parent_nonzero, 255, 0) \ - x(subvol_children_not_set, 256, 0) \ - x(subvol_children_bad, 257, 0) \ - x(subvol_loop, 258, 0) \ - x(subvol_unreachable, 259, FSCK_AUTOFIX) \ - x(btree_node_bkey_bad_u64s, 260, 0) \ - x(btree_node_topology_empty_interior_node, 261, 0) \ - x(btree_ptr_v2_min_key_bad, 262, 0) \ - x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ - x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ - x(dup_backpointer_to_bad_csum_extent, 265, 0) \ - x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ - x(sb_clean_entry_overrun, 267, 0) \ - x(btree_ptr_v2_written_0, 268, 0) \ - x(subvol_snapshot_bad, 269, 0) \ - x(subvol_inode_bad, 270, 0) \ - x(subvol_missing, 308, FSCK_AUTOFIX) \ - x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ - x(accounting_mismatch, 272, FSCK_AUTOFIX) \ - x(accounting_replicas_not_marked, 273, 0) \ - x(accounting_to_invalid_device, 289, 0) \ - x(invalid_btree_id, 274, FSCK_AUTOFIX) \ - x(alloc_key_io_time_bad, 275, 0) \ - x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ - x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ - x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ - x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ - x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ - x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ - x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \ - x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ - x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ - x(dirent_cf_name_too_big, 304, 0) \ - x(dirent_stray_data_after_cf_name, 305, 0) \ - x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ - x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 321, 0) - -enum bch_sb_error_id { -#define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, - BCH_SB_ERRS() -#undef x -}; - -struct bch_sb_field_errors { - struct bch_sb_field field; - struct bch_sb_field_error_entry { - __le64 v; - __le64 last_error_time; - } entries[]; -}; - -LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); -LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); - -#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h deleted file mode 100644 index 40325239c3b0f2..00000000000000 --- a/fs/bcachefs/sb-errors_types.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_ERRORS_TYPES_H -#define _BCACHEFS_SB_ERRORS_TYPES_H - -#include "darray.h" - -struct bch_sb_error_entry_cpu { - u64 id:16, - nr:48; - u64 last_error_time; -}; - -typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; - -#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c deleted file mode 100644 index 6245e342a8a851..00000000000000 --- a/fs/bcachefs/sb-members.c +++ /dev/null @@ -1,606 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "disk_groups.h" -#include "error.h" -#include "opts.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-members.h" -#include "super-io.h" - -int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) -{ - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev); - bch2_bkey_val_to_text(&buf, c, k); - - bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); - - int ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_allocations, 0); - - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return ret; -} - -void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) -{ - if (dev != BCH_SB_MEMBER_INVALID) - bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); -} - -void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket) -{ - bch2_fs_inconsistent(ca->fs, - "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)", - bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets); -} - -#define x(t, n, ...) 
[n] = #t, -static const char * const bch2_iops_measurements[] = { - BCH_IOPS_MEASUREMENTS() - NULL -}; - -char * const bch2_member_error_strs[] = { - BCH_MEMBER_ERROR_TYPES() - NULL -}; -#undef x - -/* Code for bch_sb_field_members_v1: */ - -struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) -{ - return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); -} - -static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) -{ - struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); - memset(&ret, 0, sizeof(ret)); - memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); - return ret; -} - -static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i) -{ - return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); -} - -static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i) -{ - struct bch_member ret, *p = members_v1_get_mut(mi, i); - memset(&ret, 0, sizeof(ret)); - memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); - return ret; -} - -struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) -{ - struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2); - if (mi2) - return members_v2_get(mi2, i); - struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1); - return members_v1_get(mi1, i); -} - -static int sb_members_v2_resize_entries(struct bch_fs *c) -{ - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - - if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) { - unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) * - c->disk_sb.sb->nr_devices), 8); - - mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); - if (!mi) - return bch_err_throw(c, ENOSPC_sb_members_v2); - - for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { - void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); - memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); - memset(dst + le16_to_cpu(mi->member_bytes), - 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes))); - } - mi->member_bytes = cpu_to_le16(sizeof(struct bch_member)); - } - return 0; -} - -int bch2_sb_members_v2_init(struct bch_fs *c) -{ - struct bch_sb_field_members_v1 *mi1; - struct bch_sb_field_members_v2 *mi2; - - if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) { - mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2, - DIV_ROUND_UP(sizeof(*mi2) + - sizeof(struct bch_member) * c->sb.nr_devices, - sizeof(u64))); - mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1); - memcpy(&mi2->_members[0], &mi1->_members[0], - BCH_MEMBER_V1_BYTES * c->sb.nr_devices); - memset(&mi2->pad[0], 0, sizeof(mi2->pad)); - mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES); - } - - return sb_members_v2_resize_entries(c); -} - -int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) -{ - struct bch_sb_field_members_v1 *mi1; - struct bch_sb_field_members_v2 *mi2; - - if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { - bch2_sb_field_resize(disk_sb, members_v1, 0); - return 0; - } - - mi1 = bch2_sb_field_resize(disk_sb, members_v1, - DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * - disk_sb->sb->nr_devices, sizeof(u64))); - if (!mi1) - return -BCH_ERR_ENOSPC_sb_members; - - mi2 = bch2_sb_field_get(disk_sb->sb, members_v2); - - for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++) - memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, 
i), BCH_MEMBER_V1_BYTES); - - return 0; -} - -static int validate_member(struct printbuf *err, - struct bch_member m, - struct bch_sb *sb, - int i) -{ - if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) { - prt_printf(err, "device %u: too many buckets (got %llu, max %u)", - i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX); - return -BCH_ERR_invalid_sb_members; - } - - if (le64_to_cpu(m.nbuckets) - - le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) { - prt_printf(err, "device %u: not enough buckets (got %llu, min %u)", - i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m.bucket_size) < - le16_to_cpu(sb->block_size)) { - prt_printf(err, "device %u: bucket size %u smaller than block size %u", - i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size)); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m.bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) { - prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", - i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); - return -BCH_ERR_invalid_sb_members; - } - - if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) { - prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); - return -BCH_ERR_invalid_sb_members; - } - - if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) && - sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) { - prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i); - return -BCH_ERR_invalid_sb_members; - } - - return 0; -} - -static void member_to_text(struct printbuf *out, - struct bch_member m, - struct bch_sb_field_disk_groups *gi, - struct bch_sb *sb, - int i) -{ - unsigned data_have = bch2_sb_dev_has_data(sb, i); - u64 bucket_size = le16_to_cpu(m.bucket_size); - u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; - - if (!bch2_member_alive(&m)) - return; - - prt_printf(out, "Device:\t%u\n", i); - - printbuf_indent_add(out, 2); - - prt_printf(out, "Label:\t"); - if (BCH_MEMBER_GROUP(&m)) - bch2_disk_path_to_text_sb(out, sb, - BCH_MEMBER_GROUP(&m) - 1); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "UUID:\t"); - pr_uuid(out, m.uuid.b); - prt_newline(out); - - prt_printf(out, "Size:\t"); - prt_units_u64(out, device_size << 9); - prt_newline(out); - - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i])); - - for (unsigned i = 0; i < BCH_IOPS_NR; i++) - prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i])); - - prt_printf(out, "Bucket size:\t"); - prt_units_u64(out, bucket_size << 9); - prt_newline(out); - - prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket)); - prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets)); - - prt_printf(out, "Last mount:\t"); - if (m.last_mount) - bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); - else - prt_printf(out, "(never)"); - prt_newline(out); - - prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq)); - - prt_printf(out, "State:\t%s\n", - BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR - ?
bch2_member_states[BCH_MEMBER_STATE(&m)] - : "unknown"); - - prt_printf(out, "Data allowed:\t"); - if (BCH_MEMBER_DATA_ALLOWED(&m)) - prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Has data:\t"); - if (data_have) - prt_bitflags(out, __bch2_data_types, data_have); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Btree allocated bitmap blocksize:\t"); - if (m.btree_bitmap_shift < 64) - prt_units_u64(out, 1ULL << m.btree_bitmap_shift); - else - prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); - prt_newline(out); - - prt_printf(out, "Btree allocated bitmap:\t"); - bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64); - prt_newline(out); - - prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); - - prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); - prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); - prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m)); - - printbuf_indent_sub(out, 2); -} - -static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); - unsigned i; - - if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) { - prt_printf(err, "too many devices for section size"); - return -BCH_ERR_invalid_sb_members; - } - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member m = members_v1_get(mi, i); - - int ret = validate_member(err, m, sb, i); - if (ret) - return ret; - } - - return 0; -} - -static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); - struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - - if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { - prt_printf(out, "field ends before start of entries"); - return; - } - - unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]); - if (nr != sb->nr_devices) - prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); - - for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) - member_to_text(out, members_v1_get(mi, i), gi, sb, i); -} - -const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = { - .validate = bch2_sb_members_v1_validate, - .to_text = bch2_sb_members_v1_to_text, -}; - -static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); - struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - - if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { - prt_printf(out, "field ends before start of entries"); - return; - } - - if (!le16_to_cpu(mi->member_bytes)) { - prt_printf(out, "member_bytes 0"); - return; - } - - unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes); - if (nr != sb->nr_devices) - prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); - - /* - * We call to_text() on superblock sections that haven't passed - * validate, so we can't trust sb->nr_devices. 
- */ - - for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) - member_to_text(out, members_v2_get(mi, i), gi, sb, i); -} - -static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); - size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - - (void *) mi; - - if (mi_bytes > vstruct_bytes(&mi->field)) { - prt_printf(err, "section too small (%zu > %zu)", - mi_bytes, vstruct_bytes(&mi->field)); - return -BCH_ERR_invalid_sb_members; - } - - for (unsigned i = 0; i < sb->nr_devices; i++) { - int ret = validate_member(err, members_v2_get(mi, i), sb, i); - if (ret) - return ret; - } - - return 0; -} - -const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = { - .validate = bch2_sb_members_v2_validate, - .to_text = bch2_sb_members_v2_to_text, -}; - -void bch2_sb_members_from_cpu(struct bch_fs *c) -{ - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - - guard(rcu)(); - for_each_member_device_rcu(c, ca, NULL) { - struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); - - for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) - m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); - } -} - -void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_member m; - - mutex_lock(&ca->fs->sb_lock); - m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); - mutex_unlock(&ca->fs->sb_lock); - - printbuf_tabstop_push(out, 12); - - prt_str(out, "IO errors since filesystem creation"); - prt_newline(out); - - printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i])); - printbuf_indent_sub(out, 2); - - prt_str(out, "IO errors since "); - bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC); - prt_str(out, " ago"); - prt_newline(out); - - printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], - atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); - printbuf_indent_sub(out, 2); -} - -void bch2_dev_errors_reset(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_member *m; - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) - m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); - m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds()); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); -} - -/* - * Per member "range has btree nodes" bitmap: - * - * This is so that if we ever have to run the btree node scan to repair we don't - * have to scan full devices: - */ - -bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) -{ - guard(rcu)(); - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (ca && - !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) - return false; - } - return true; -} - -static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, - u64 start, unsigned sectors) -{ - struct bch_member *m = __bch2_members_v2_get_mut(mi, dev); - u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap); - - u64 end = start + sectors; - - int 
resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6); - if (resize > 0) { - u64 new_bitmap = 0; - - for (unsigned i = 0; i < 64; i++) - if (bitmap & BIT_ULL(i)) - new_bitmap |= BIT_ULL(i >> resize); - bitmap = new_bitmap; - m->btree_bitmap_shift += resize; - } - - BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX); - BUG_ON(end > 64ULL << m->btree_bitmap_shift); - - for (unsigned bit = start >> m->btree_bitmap_shift; - (u64) bit << m->btree_bitmap_shift < end; - bit++) - bitmap |= BIT_ULL(bit); - - m->btree_allocated_bitmap = cpu_to_le64(bitmap); -} - -void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) -{ - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (!bch2_member_exists(c->disk_sb.sb, ptr->dev)) - continue; - - __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); - } -} - -unsigned bch2_sb_nr_devices(const struct bch_sb *sb) -{ - unsigned nr = 0; - - for (unsigned i = 0; i < sb->nr_devices; i++) - nr += bch2_member_exists((struct bch_sb *) sb, i); - return nr; -} - -int bch2_sb_member_alloc(struct bch_fs *c) -{ - unsigned dev_idx = c->sb.nr_devices; - struct bch_sb_field_members_v2 *mi; - unsigned nr_devices; - unsigned u64s; - int best = -1; - u64 best_last_mount = 0; - unsigned nr_deleted = 0; - - if (dev_idx < BCH_SB_MEMBERS_MAX) - goto have_slot; - - for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { - /* eventually BCH_SB_MEMBERS_MAX will be raised */ - if (dev_idx == BCH_SB_MEMBER_INVALID) - continue; - - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - - nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); - - if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) - continue; - - u64 last_mount = le64_to_cpu(m.last_mount); - if (best < 0 || last_mount < best_last_mount) { - best = dev_idx; - best_last_mount = last_mount; - } - } - if (best >= 0) { - dev_idx = best; - goto have_slot; - } - - if (nr_deleted) - bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", - nr_deleted); - - return -BCH_ERR_ENOSPC_sb_members; -have_slot: - nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); - - mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + - le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); - - mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); - if (!mi) - return -BCH_ERR_ENOSPC_sb_members; - - c->disk_sb.sb->nr_devices = nr_devices; - return dev_idx; -} - -void bch2_sb_members_clean_deleted(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - bool write_sb = false; - - for (unsigned i = 0; i < c->sb.nr_devices; i++) { - struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); - - if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { - memset(&m->uuid, 0, sizeof(m->uuid)); - write_sb = true; - } - } - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); -} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h deleted file mode 100644 index 8d8a8a857648c5..00000000000000 --- a/fs/bcachefs/sb-members.h +++ /dev/null @@ -1,377 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_MEMBERS_H -#define _BCACHEFS_SB_MEMBERS_H - -#include "darray.h" -#include "bkey_types.h" -#include "enumerated_ref.h" - -extern char * const bch2_member_error_strs[]; - -static inline struct bch_member * 
-__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i) -{ - return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); -} - -int bch2_sb_members_v2_init(struct bch_fs *c); -int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); -struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); -struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); - -static inline bool bch2_dev_is_online(struct bch_dev *ca) -{ - return !enumerated_ref_is_zero(&ca->io_ref[READ]); -} - -static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); - -static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu(c, dev); - return ca && bch2_dev_is_online(ca); -} - -static inline bool bch2_dev_is_healthy(struct bch_dev *ca) -{ - return bch2_dev_is_online(ca) && - ca->mi.state != BCH_MEMBER_STATE_failed; -} - -static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -{ - return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -} - -static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, - unsigned dev) -{ - darray_for_each(devs, i) - if (*i == dev) - return true; - return false; -} - -static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, - unsigned dev) -{ - darray_for_each(*devs, i) - if (*i == dev) { - darray_remove_item(devs, i); - return; - } -} - -static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, - unsigned dev) -{ - if (!bch2_dev_list_has_dev(*devs, dev)) { - BUG_ON(devs->nr >= ARRAY_SIZE(devs->data)); - devs->data[devs->nr++] = dev; - } -} - -static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) -{ - return (struct bch_devs_list) { .nr = 1, .data[0] = dev }; -} - -static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx, - const struct bch_devs_mask *mask) -{ - struct bch_dev *ca = NULL; - - while ((idx = mask - ? find_next_bit(mask->d, c->sb.nr_devices, idx) - : idx) < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[idx], - lockdep_is_held(&c->state_lock)))) - idx++; - - return ca; -} - -static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca, - const struct bch_devs_mask *mask) -{ - return __bch2_next_dev_idx(c, ca ? 
ca->dev_idx + 1 : 0, mask); -} - -#define for_each_member_device_rcu(_c, _ca, _mask) \ - for (struct bch_dev *_ca = NULL; \ - (_ca = __bch2_next_dev((_c), _ca, (_mask)));) - -#define for_each_online_member_rcu(_c, _ca) \ - for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) - -#define for_each_rw_member_rcu(_c, _ca) \ - for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free]) - -static inline void bch2_dev_get(struct bch_dev *ca) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L); -#else - percpu_ref_get(&ca->ref); -#endif -} - -static inline void __bch2_dev_put(struct bch_dev *ca) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - long r = atomic_long_dec_return(&ca->ref); - if (r < (long) !ca->dying) - panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put); - ca->last_put = _THIS_IP_; - if (!r) - complete(&ca->ref_completion); -#else - percpu_ref_put(&ca->ref); -#endif -} - -static inline void bch2_dev_put(struct bch_dev *ca) -{ - if (ca) - __bch2_dev_put(ca); -} - -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) -{ - guard(rcu)(); - bch2_dev_put(ca); - if ((ca = __bch2_next_dev(c, ca, NULL))) - bch2_dev_get(ca); - return ca; -} - -/* - * If you break early, you must drop your ref on the current device - */ -#define __for_each_member_device(_c, _ca) \ - for (; (_ca = bch2_get_next_dev(_c, _ca));) - -#define for_each_member_device(_c, _ca) \ - for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_dev(_c, _ca));) - -static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, - struct bch_dev *ca, - unsigned state_mask, - int rw, unsigned ref_idx) -{ - guard(rcu)(); - if (ca) - enumerated_ref_put(&ca->io_ref[rw], ref_idx); - - while ((ca = __bch2_next_dev(c, ca, NULL)) && - (!((1 << ca->mi.state) & state_mask) || - !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) - ; - - return ca; -} - -#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \ - for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));) - -#define for_each_online_member(c, ca, ref_idx) \ - __for_each_online_member(c, ca, ~0, READ, ref_idx) - -#define for_each_rw_member(c, ca, ref_idx) \ - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx) - -#define for_each_readable_member(c, ca, ref_idx) \ - __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx) - -static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) -{ - return dev < c->sb.nr_devices && c->devs[dev]; -} - -static inline bool bucket_valid(const struct bch_dev *ca, u64 b) -{ - return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first; -} - -static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev) -{ - EBUG_ON(!bch2_dev_exists(c, dev)); - - return rcu_dereference_check(c->devs[dev], 1); -} - -static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev) -{ - EBUG_ON(!bch2_dev_exists(c, dev)); - - return rcu_dereference_protected(c->devs[dev], - lockdep_is_held(&c->sb_lock) || - lockdep_is_held(&c->state_lock)); -} - -static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev) -{ - return c && dev < c->sb.nr_devices - ? 
rcu_dereference(c->devs[dev]) - : NULL; -} - -int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned); - -void bch2_dev_missing_atomic(struct bch_fs *, unsigned); - -static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) -{ - struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); - if (unlikely(!ca)) - bch2_dev_missing_atomic(c, dev); - return ca; -} - -static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); - if (ca) - bch2_dev_get(ca); - return ca; -} - -static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) -{ - struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); - if (unlikely(!ca)) - bch2_dev_missing_atomic(c, dev); - return ca; -} - -static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) -{ - struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); - if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { - bch2_dev_put(ca); - ca = NULL; - } - return ca; -} - -void bch2_dev_bucket_missing(struct bch_dev *, u64); - -static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) -{ - struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); - if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { - bch2_dev_bucket_missing(ca, bucket.offset); - bch2_dev_put(ca); - ca = NULL; - } - return ca; -} - -static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) -{ - if (ca && ca->dev_idx == dev_idx) - return ca; - bch2_dev_put(ca); - return bch2_dev_tryget_noerror(c, dev_idx); -} - -static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) -{ - if (ca && ca->dev_idx == dev_idx) - return ca; - bch2_dev_put(ca); - return bch2_dev_tryget(c, dev_idx); -} - -static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, - int rw, unsigned ref_idx) -{ - might_sleep(); - - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) - return NULL; - - if (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) - return ca; - - enumerated_ref_put(&ca->io_ref[rw], ref_idx); - return NULL; -} - -extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; -extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; - -static inline bool bch2_member_alive(struct bch_member *m) -{ - return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) && - !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID); -} - -static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) -{ - if (dev < sb->nr_devices) { - struct bch_member m = bch2_sb_member_get(sb, dev); - return bch2_member_alive(&m); - } - return false; -} - -unsigned bch2_sb_nr_devices(const struct bch_sb *); - -static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) -{ - return (struct bch_member_cpu) { - .nbuckets = le64_to_cpu(mi->nbuckets), - .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) - - le16_to_cpu(mi->first_bucket), - .first_bucket = le16_to_cpu(mi->first_bucket), - .bucket_size = le16_to_cpu(mi->bucket_size), - .group = BCH_MEMBER_GROUP(mi), - .state = BCH_MEMBER_STATE(mi), - .discard = BCH_MEMBER_DISCARD(mi), - .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), - .durability = BCH_MEMBER_DURABILITY(mi) - ? 
BCH_MEMBER_DURABILITY(mi) - 1 - : 1, - .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), - .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi), - .valid = bch2_member_alive(mi), - .btree_bitmap_shift = mi->btree_bitmap_shift, - .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), - }; -} - -void bch2_sb_members_from_cpu(struct bch_fs *); - -void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); -void bch2_dev_errors_reset(struct bch_dev *); - -static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors) -{ - u64 end = start + sectors; - - if (end > 64ULL << ca->mi.btree_bitmap_shift) - return false; - - for (unsigned bit = start >> ca->mi.btree_bitmap_shift; - (u64) bit << ca->mi.btree_bitmap_shift < end; - bit++) - if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit))) - return false; - return true; -} - -bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); -void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); - -int bch2_sb_member_alloc(struct bch_fs *); -void bch2_sb_members_clean_deleted(struct bch_fs *); - -#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h deleted file mode 100644 index fb72ad730518f7..00000000000000 --- a/fs/bcachefs/sb-members_format.h +++ /dev/null @@ -1,128 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H -#define _BCACHEFS_SB_MEMBERS_FORMAT_H - -/* - * We refer to members with bitmasks in various places - but we need to get rid - * of this limit: - */ -#define BCH_SB_MEMBERS_MAX 64 - -/* - * Sentinel value - indicates a device that does not exist - */ -#define BCH_SB_MEMBER_INVALID 255 - -#define BCH_SB_MEMBER_DELETED_UUID \ - UUID_INIT(0xffffffff, 0xffff, 0xffff, \ - 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) - -#define BCH_MIN_NR_NBUCKETS (1 << 6) - -#define BCH_IOPS_MEASUREMENTS() \ - x(seqread, 0) \ - x(seqwrite, 1) \ - x(randread, 2) \ - x(randwrite, 3) - -enum bch_iops_measurement { -#define x(t, n) BCH_IOPS_##t = n, - BCH_IOPS_MEASUREMENTS() -#undef x - BCH_IOPS_NR -}; - -#define BCH_MEMBER_ERROR_TYPES() \ - x(read, 0) \ - x(write, 1) \ - x(checksum, 2) - -enum bch_member_error_type { -#define x(t, n) BCH_MEMBER_ERROR_##t = n, - BCH_MEMBER_ERROR_TYPES() -#undef x - BCH_MEMBER_ERROR_NR -}; - -struct bch_member { - __uuid_t uuid; - __le64 nbuckets; /* device size */ - __le16 first_bucket; /* index of first bucket used */ - __le16 bucket_size; /* sectors */ - __u8 btree_bitmap_shift; - __u8 pad[3]; - __le64 last_mount; /* time_t */ - - __le64 flags; - __le32 iops[4]; - __le64 errors[BCH_MEMBER_ERROR_NR]; - __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; - __le64 errors_reset_time; - __le64 seq; - __le64 btree_allocated_bitmap; - /* - * On recovery from a clean shutdown we don't normally read the journal, - * but we still want to resume writing from where we left off so we - * don't overwrite more than is necessary, for list journal debugging: - */ - __le32 last_journal_bucket; - __le32 last_journal_bucket_offset; -}; - -/* - * btree_allocated_bitmap must be able to represent any sector address that - * fits in a u64: the bitmap itself has 64 bits, so the maximum shift is - * 64 - ilog2(64) - */ -#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58 - -/* - * This limit comes from the bucket_gens array - it's a single allocation, and - * kernel allocations are limited to INT_MAX - */ -#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) - -#define BCH_MEMBER_V1_BYTES 56 - -LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct
bch_member, bucket_size, 0, 16) -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) -/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) -LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) -LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) -LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) -LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags, 30, 31) -LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT, - struct bch_member, flags, 31, 32) - -#if 0 -LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -#endif - -#define BCH_MEMBER_STATES() \ - x(rw, 0) \ - x(ro, 1) \ - x(failed, 2) \ - x(spare, 3) - -enum bch_member_state { -#define x(t, n) BCH_MEMBER_STATE_##t = n, - BCH_MEMBER_STATES() -#undef x - BCH_MEMBER_STATE_NR -}; - -struct bch_sb_field_members_v1 { - struct bch_sb_field field; - struct bch_member _members[]; // members are now variable size -}; - -struct bch_sb_field_members_v2 { - struct bch_sb_field field; - __le16 member_bytes; // size of a single member entry - u8 pad[6]; - struct bch_member _members[]; -}; - -#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h deleted file mode 100644 index d6443e18687299..00000000000000 --- a/fs/bcachefs/sb-members_types.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H -#define _BCACHEFS_SB_MEMBERS_TYPES_H - -struct bch_member_cpu { - u64 nbuckets; /* device size */ - u64 nbuckets_minus_first; - u16 first_bucket; /* index of first bucket used */ - u16 bucket_size; /* sectors */ - u16 group; - u8 state; - u8 discard; - u8 data_allowed; - u8 durability; - u8 freespace_initialized; - u8 resize_on_mount; - u8 valid; - u8 btree_bitmap_shift; - u64 btree_allocated_bitmap; -}; - -#endif /* _BCACHEFS_SB_MEMBERS_TYPES_H */ diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h deleted file mode 100644 index c4b3d8d3f4149c..00000000000000 --- a/fs/bcachefs/seqmutex.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SEQMUTEX_H -#define _BCACHEFS_SEQMUTEX_H - -#include <linux/mutex.h> - -struct seqmutex { - struct mutex lock; - u32 seq; -}; - -#define seqmutex_init(_lock) mutex_init(&(_lock)->lock) - -static inline bool seqmutex_trylock(struct seqmutex *lock) -{ - return mutex_trylock(&lock->lock); -} - -static inline void seqmutex_lock(struct seqmutex *lock) -{ - mutex_lock(&lock->lock); - lock->seq++; -} - -static inline u32 seqmutex_unlock(struct seqmutex *lock) -{ - u32 seq = lock->seq; - mutex_unlock(&lock->lock); - return seq; -} - -static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) -{ - if (lock->seq != seq || !mutex_trylock(&lock->lock)) - return false; - - if (lock->seq != seq) { - mutex_unlock(&lock->lock); - return false; - } - - return true; -} - -#endif /* _BCACHEFS_SEQMUTEX_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c deleted file mode 100644 index a1cc44e66c7ed5..00000000000000 --- a/fs/bcachefs/siphash.c +++ /dev/null @@ -1,173 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ - -/*- - * Copyright (c) 2013 Andre Oppermann - * All rights reserved.
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d - * are the number of compression rounds and the number of finalization rounds. - * A compression round is identical to a finalization round and this round - * function is called SipRound. Given a 128-bit key k and a (possibly empty) - * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). - * - * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, - * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, - * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa - * https://131002.net/siphash/siphash.pdf - * https://131002.net/siphash/ - */ - -#include <asm/byteorder.h> -#include <linux/unaligned.h> -#include <linux/bitops.h> -#include <linux/string.h> - -#include "siphash.h" - -static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) -{ - while (rounds--) { - ctx->v[0] += ctx->v[1]; - ctx->v[2] += ctx->v[3]; - ctx->v[1] = rol64(ctx->v[1], 13); - ctx->v[3] = rol64(ctx->v[3], 16); - - ctx->v[1] ^= ctx->v[0]; - ctx->v[3] ^= ctx->v[2]; - ctx->v[0] = rol64(ctx->v[0], 32); - - ctx->v[2] += ctx->v[1]; - ctx->v[0] += ctx->v[3]; - ctx->v[1] = rol64(ctx->v[1], 17); - ctx->v[3] = rol64(ctx->v[3], 21); - - ctx->v[1] ^= ctx->v[2]; - ctx->v[3] ^= ctx->v[0]; - ctx->v[2] = rol64(ctx->v[2], 32); - } -} - -static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) -{ - u64 m = get_unaligned_le64(ptr); - - ctx->v[3] ^= m; - SipHash_Rounds(ctx, rounds); - ctx->v[0] ^= m; -} - -void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) -{ - u64 k0, k1; - - k0 = le64_to_cpu(key->k0); - k1 = le64_to_cpu(key->k1); - - ctx->v[0] = 0x736f6d6570736575ULL ^ k0; - ctx->v[1] = 0x646f72616e646f6dULL ^ k1; - ctx->v[2] = 0x6c7967656e657261ULL ^ k0; - ctx->v[3] = 0x7465646279746573ULL ^ k1; - - memset(ctx->buf, 0, sizeof(ctx->buf)); - ctx->bytes = 0; -} - -void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, - const void *src, size_t len) -{ - const u8 *ptr = src; - size_t left, used; - - if (len == 0) - return; - - used = ctx->bytes % sizeof(ctx->buf); - ctx->bytes += len; - - if (used > 0) { - left = sizeof(ctx->buf) - used; - - if (len >= left) { - memcpy(&ctx->buf[used], ptr, left); - SipHash_CRounds(ctx, ctx->buf, rc); - len -= left; - ptr += left; - } else { - memcpy(&ctx->buf[used], ptr, len); - return; - } - } - - while (len >= sizeof(ctx->buf)) { - SipHash_CRounds(ctx, ptr, rc); - len -= sizeof(ctx->buf); - ptr += sizeof(ctx->buf); - } - - if (len > 0) - memcpy(&ctx->buf[used], ptr, len); -} - -void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) -{ - u64 r; - - r = SipHash_End(ctx, rc, rf); - - *((__le64 *) dst) = cpu_to_le64(r); -} - -u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) -{ - u64 r; - size_t left, used; - - used = ctx->bytes % sizeof(ctx->buf); - left = sizeof(ctx->buf) - used; - memset(&ctx->buf[used], 0, left - 1); - ctx->buf[7] = ctx->bytes; - - SipHash_CRounds(ctx, ctx->buf, rc); - ctx->v[2] ^= 0xff; - SipHash_Rounds(ctx, rf); - - r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); - memset(ctx, 0, sizeof(*ctx)); - return r; -} - -u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) -{ - SIPHASH_CTX ctx; - - SipHash_Init(&ctx, key); - SipHash_Update(&ctx, rc, rf, src, len); - return SipHash_End(&ctx, rc, rf); -} diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h deleted file mode 100644 index 3dfaf34a43b284..00000000000000 --- a/fs/bcachefs/siphash.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause */ -/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ -/*- - * Copyright (c) 2013 Andre Oppermann - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2.
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -/* - * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) - * optimized for speed on short messages, returning a 64-bit hash/digest value. - * - * The number of rounds is defined during the initialization: - * SipHash24_Init() for the fast and reasonably strong version - * SipHash48_Init() for the strong version (half as fast) - * - * struct SIPHASH_CTX ctx; - * SIPHASH_KEY key; (a 16-byte key) - * SipHash24_Init(&ctx, &key); - * SipHash24_Update(&ctx, pointer_to_string, length_of_string); - * SipHash24_Final(output, &ctx); - */ - -#ifndef _SIPHASH_H_ -#define _SIPHASH_H_ - -#include <linux/types.h> - -#define SIPHASH_BLOCK_LENGTH 8 -#define SIPHASH_KEY_LENGTH 16 -#define SIPHASH_DIGEST_LENGTH 8 - -typedef struct _SIPHASH_CTX { - u64 v[4]; - u8 buf[SIPHASH_BLOCK_LENGTH]; - u32 bytes; -} SIPHASH_CTX; - -typedef struct { - __le64 k0; - __le64 k1; -} SIPHASH_KEY; - -void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); -void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); -u64 SipHash_End(SIPHASH_CTX *, int, int); -void SipHash_Final(void *, SIPHASH_CTX *, int, int); -u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); - -#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) -#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) -#define SipHash24_End(_d) SipHash_End((_d), 2, 4) -#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) -#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) - -#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) -#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) -#define SipHash48_End(_d) SipHash_End((_d), 4, 8) -#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) -#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) - -#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c deleted file mode 100644 index 538c324f4765dc..00000000000000 --- a/fs/bcachefs/six.c +++ /dev/null @@ -1,878 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/export.h> -#include <linux/log2.h> -#include <linux/percpu.h> -#include <linux/preempt.h> -#include <linux/rcupdate.h> -#include <linux/sched.h> -#include <linux/sched/clock.h> -#include <linux/sched/rt.h> -#include <linux/sched/task.h> -#include <linux/slab.h> - -#include <trace/events/lock.h> - -#include "six.h" - -#ifdef DEBUG -#define EBUG_ON(cond) BUG_ON(cond) -#else -#define EBUG_ON(cond) do {} while (0) -#endif - -#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) -#define six_release(l, ip)
lock_release(l, ip) - -static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); - -#define SIX_LOCK_HELD_read_OFFSET 0 -#define SIX_LOCK_HELD_read ~(~0U << 26) -#define SIX_LOCK_HELD_intent (1U << 26) -#define SIX_LOCK_HELD_write (1U << 27) -#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) -#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) -#define SIX_LOCK_NOSPIN (1U << 31) - -struct six_lock_vals { - /* Value we add to the lock in order to take the lock: */ - u32 lock_val; - - /* If the lock has this value (used as a mask), taking the lock fails: */ - u32 lock_fail; - - /* Mask that indicates lock is held for this type: */ - u32 held_mask; - - /* Waitlist we wakeup when releasing the lock: */ - enum six_lock_type unlock_wakeup; -}; - -static const struct six_lock_vals l[] = { - [SIX_LOCK_read] = { - .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, - .lock_fail = SIX_LOCK_HELD_write, - .held_mask = SIX_LOCK_HELD_read, - .unlock_wakeup = SIX_LOCK_write, - }, - [SIX_LOCK_intent] = { - .lock_val = SIX_LOCK_HELD_intent, - .lock_fail = SIX_LOCK_HELD_intent, - .held_mask = SIX_LOCK_HELD_intent, - .unlock_wakeup = SIX_LOCK_intent, - }, - [SIX_LOCK_write] = { - .lock_val = SIX_LOCK_HELD_write, - .lock_fail = SIX_LOCK_HELD_read, - .held_mask = SIX_LOCK_HELD_write, - .unlock_wakeup = SIX_LOCK_read, - }, -}; - -static inline void six_set_bitmask(struct six_lock *lock, u32 mask) -{ - if ((atomic_read(&lock->state) & mask) != mask) - atomic_or(mask, &lock->state); -} - -static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) -{ - if (atomic_read(&lock->state) & mask) - atomic_and(~mask, &lock->state); -} - -static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, - u32 old, struct task_struct *owner) -{ - if (type != SIX_LOCK_intent) - return; - - if (!(old & SIX_LOCK_HELD_intent)) { - EBUG_ON(lock->owner); - lock->owner = owner; - } else { - EBUG_ON(lock->owner != current); - } -} - -static inline unsigned pcpu_read_count(struct six_lock *lock) -{ - unsigned read_count = 0; - int cpu; - - for_each_possible_cpu(cpu) - read_count += *per_cpu_ptr(lock->readers, cpu); - return read_count; -} - -/* - * __do_six_trylock() - main trylock routine - * - * Returns 1 on success, 0 on failure - * - * In percpu reader mode, a failed trylock may cause a spurious trylock failure - * for another thread taking the competing lock type, and we may have to do a - * wakeup: when a wakeup is required, we return -1 - wakeup_type. - */ -static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, - struct task_struct *task, bool try) -{ - int ret; - u32 old; - - EBUG_ON(type == SIX_LOCK_write && lock->owner != task); - EBUG_ON(type == SIX_LOCK_write && - (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); - - /* - * Percpu reader mode: - * - * The basic idea behind this algorithm is that you can implement a lock - * between two threads without any atomics, just memory barriers: - * - * For two threads you'll need two variables, one variable for "thread a - * has the lock" and another for "thread b has the lock". - * - * To take the lock, a thread sets its variable indicating that it holds - * the lock, then issues a full memory barrier, then reads from the - * other thread's variable to check if the other thread thinks it has - * the lock. If we raced, we back off and retry/sleep.
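/*
 * [Editor's illustration, not part of the original file: the two-variable
 * scheme the comment above describes, sketched for two threads 0 and 1
 * with the same full barrier the real code uses; all names below are
 * made up for the sketch.]
 */
static bool held[2];

static bool twovar_trylock(int me)
{
	WRITE_ONCE(held[me], true);	/* publish "I hold the lock" */
	smp_mb();			/* order our store against our read of the peer */
	if (READ_ONCE(held[!me])) {	/* peer also claims it: we raced */
		WRITE_ONCE(held[me], false);	/* back off; caller retries or sleeps */
		return false;
	}
	return true;
}
/*
 * [Note that a failed attempt still briefly published held[me] == true -
 * exactly the spurious-failure window the rest of the comment deals with.]
 */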
- * - * Failure to take the lock may cause a spurious trylock failure in - * another thread, because we temporarily set the lock to indicate that - * we held it. This would be a problem for a thread in six_lock() that - * calls trylock after adding itself to the waitlist and prior to - * sleeping. - * - * Therefore, if we fail to get the lock, and there were waiters of the - * type we conflict with, we will have to issue a wakeup. - * - * Since we may be called under wait_lock (and by the wakeup code - * itself), we return that the wakeup has to be done instead of doing it - * here. - */ - if (type == SIX_LOCK_read && lock->readers) { - preempt_disable(); - this_cpu_inc(*lock->readers); /* signal that we own lock */ - - smp_mb(); - - old = atomic_read(&lock->state); - ret = !(old & l[type].lock_fail); - - this_cpu_sub(*lock->readers, !ret); - preempt_enable(); - - if (!ret) { - smp_mb(); - if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write) - ret = -1 - SIX_LOCK_write; - } - } else if (type == SIX_LOCK_write && lock->readers) { - if (try) - atomic_add(SIX_LOCK_HELD_write, &lock->state); - - /* - * Make sure atomic_add happens before pcpu_read_count and - * six_set_bitmask in slow path happens before pcpu_read_count. - * - * Paired with the smp_mb() in read lock fast path (per-cpu mode) - * and the one before atomic_read in read unlock path. - */ - smp_mb(); - ret = !pcpu_read_count(lock); - - if (try && !ret) { - old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); - if (old & SIX_LOCK_WAITING_read) - ret = -1 - SIX_LOCK_read; - } - } else { - old = atomic_read(&lock->state); - do { - ret = !(old & l[type].lock_fail); - if (!ret || (type == SIX_LOCK_write && !try)) { - smp_mb(); - break; - } - } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); - - EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); - } - - if (ret > 0) - six_set_owner(lock, type, old, task); - - EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && - (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); - - return ret; -} - -static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) -{ - struct six_lock_waiter *w, *next; - struct task_struct *task; - bool saw_one; - int ret; -again: - ret = 0; - saw_one = false; - raw_spin_lock(&lock->wait_lock); - - list_for_each_entry_safe(w, next, &lock->wait_list, list) { - if (w->lock_want != lock_type) - continue; - - if (saw_one && lock_type != SIX_LOCK_read) - goto unlock; - saw_one = true; - - ret = __do_six_trylock(lock, lock_type, w->task, false); - if (ret <= 0) - goto unlock; - - /* - * Similar to percpu_rwsem_wake_function(), we need to guard - * against the wakee noticing w->lock_acquired, returning, and - * then exiting before we do the wakeup: - */ - task = get_task_struct(w->task); - __list_del(w->list.prev, w->list.next); - /* - * The release barrier here ensures the ordering of the - * __list_del before setting w->lock_acquired; @w is on the - * stack of the thread doing the waiting and will be reused - * after it sees w->lock_acquired with no other locking: - * pairs with smp_load_acquire() in six_lock_slowpath() - */ - smp_store_release(&w->lock_acquired, true); - wake_up_process(task); - put_task_struct(task); - } - - six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); -unlock: - raw_spin_unlock(&lock->wait_lock); - - if (ret < 0) { - lock_type = -ret - 1; - goto again; - } -} - -__always_inline -static void six_lock_wakeup(struct six_lock *lock, u32 state, - enum
six_lock_type lock_type) -{ - if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) - return; - - if (!(state & (SIX_LOCK_WAITING_read << lock_type))) - return; - - __six_lock_wakeup(lock, lock_type); -} - -__always_inline -static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) -{ - int ret; - - ret = __do_six_trylock(lock, type, current, try); - if (ret < 0) - __six_lock_wakeup(lock, -ret - 1); - - return ret > 0; -} - -/** - * six_trylock_ip - attempt to take a six lock without blocking - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * Return: true on success, false on failure. - */ -bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -{ - if (!do_six_trylock(lock, type, true)) - return false; - - if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); - return true; -} -EXPORT_SYMBOL_GPL(six_trylock_ip); - -/** - * six_relock_ip - attempt to re-take a lock that was held previously - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @seq: lock sequence number obtained from six_lock_seq() while lock was - * held previously - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * Return: true on success, false on failure. - */ -bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, - unsigned seq, unsigned long ip) -{ - if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) - return false; - - if (six_lock_seq(lock) != seq) { - six_unlock_ip(lock, type, ip); - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(six_relock_ip); - -#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN - -static inline bool six_owner_running(struct six_lock *lock) -{ - /* - * When there's no owner, we might have preempted between the owner - * acquiring the lock and setting the owner field. If we're an RT task - * that will live-lock because we won't let the owner complete. - */ - guard(rcu)(); - struct task_struct *owner = READ_ONCE(lock->owner); - return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); -} - -static inline bool six_optimistic_spin(struct six_lock *lock, - struct six_lock_waiter *wait, - enum six_lock_type type) -{ - unsigned loop = 0; - u64 end_time; - - if (type == SIX_LOCK_write) - return false; - - if (lock->wait_list.next != &wait->list) - return false; - - if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN) - return false; - - preempt_disable(); - end_time = sched_clock() + 10 * NSEC_PER_USEC; - - while (!need_resched() && six_owner_running(lock)) { - /* - * Ensures that writes to the waitlist entry happen after we see - * wait->lock_acquired: pairs with the smp_store_release in - * __six_lock_wakeup - */ - if (smp_load_acquire(&wait->lock_acquired)) { - preempt_enable(); - return true; - } - - if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { - six_set_bitmask(lock, SIX_LOCK_NOSPIN); - break; - } - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. 
- */ - cpu_relax(); - } - - preempt_enable(); - return false; -} - -#else /* CONFIG_LOCK_SPIN_ON_OWNER */ - -static inline bool six_optimistic_spin(struct six_lock *lock, - struct six_lock_waiter *wait, - enum six_lock_type type) -{ - return false; -} - -#endif - -noinline -static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - int ret = 0; - - if (type == SIX_LOCK_write) { - EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); - atomic_add(SIX_LOCK_HELD_write, &lock->state); - smp_mb__after_atomic(); - } - - trace_contention_begin(lock, 0); - lock_contended(&lock->dep_map, ip); - - wait->task = current; - wait->lock_want = type; - wait->lock_acquired = false; - - raw_spin_lock(&lock->wait_lock); - six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); - /* - * Retry taking the lock after taking waitlist lock, in case we raced - * with an unlock: - */ - ret = __do_six_trylock(lock, type, current, false); - if (ret <= 0) { - wait->start_time = local_clock(); - - if (!list_empty(&lock->wait_list)) { - struct six_lock_waiter *last = - list_last_entry(&lock->wait_list, - struct six_lock_waiter, list); - - if (time_before_eq64(wait->start_time, last->start_time)) - wait->start_time = last->start_time + 1; - } - - list_add_tail(&wait->list, &lock->wait_list); - } - raw_spin_unlock(&lock->wait_lock); - - if (unlikely(ret > 0)) { - ret = 0; - goto out; - } - - if (unlikely(ret < 0)) { - __six_lock_wakeup(lock, -ret - 1); - ret = 0; - } - - if (six_optimistic_spin(lock, wait, type)) - goto out; - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - - /* - * Ensures that writes to the waitlist entry happen after we see - * wait->lock_acquired: pairs with the smp_store_release in - * __six_lock_wakeup - */ - if (smp_load_acquire(&wait->lock_acquired)) - break; - - ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; - if (unlikely(ret)) { - bool acquired; - - /* - * If should_sleep_fn() returns an error, we are - * required to return that error even if we already - * acquired the lock - should_sleep_fn() might have - * modified external state (e.g. when the deadlock cycle - * detector in bcachefs issued a transaction restart) - */ - raw_spin_lock(&lock->wait_lock); - acquired = wait->lock_acquired; - if (!acquired) - list_del(&wait->list); - raw_spin_unlock(&lock->wait_lock); - - if (unlikely(acquired)) { - do_six_unlock_type(lock, type); - } else if (type == SIX_LOCK_write) { - six_clear_bitmask(lock, SIX_LOCK_HELD_write); - six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); - } - break; - } - - schedule(); - } - - __set_current_state(TASK_RUNNING); -out: - trace_contention_end(lock, 0); - - return ret; -} - -/** - * six_lock_ip_waiter - take a lock, with full waitlist interface - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @wait: pointer to wait object, which will be added to lock's waitlist - * @should_sleep_fn: callback run after adding to waitlist, immediately prior - * to scheduling - * @p: passed through to @should_sleep_fn - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * This is the most general six_lock() variant, with parameters to support full - * cycle detection for deadlock avoidance. 
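To make the cycle-detection hook concrete, here is a hedged sketch of a caller-supplied should_sleep_fn (struct my_graph, my_cycle_check() and my_locked_op() are invented names, not part of the six lock API):

	/* Hypothetical callback: refuse to sleep if sleeping would deadlock. */
	static int my_should_sleep(struct six_lock *lock, void *p)
	{
		struct my_graph *g = p;		/* caller-owned held-locks graph */

		/*
		 * Walk @lock's waiters and, recursively, the locks each waiter
		 * holds; returning nonzero aborts the lock attempt, and the
		 * error is passed back to the six_lock_ip_waiter() caller.
		 */
		return my_cycle_check(g, lock) ? -EDEADLK : 0;
	}

	static int my_locked_op(struct six_lock *lock, struct my_graph *g)
	{
		struct six_lock_waiter wait;

		return six_lock_ip_waiter(lock, SIX_LOCK_intent, &wait,
					  my_should_sleep, g, _THIS_IP_);
	}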
- * - * The code calling this function must implement tracking of held locks, and the - * @wait object should be embedded into the struct that tracks held locks - - * which must also be accessible in a thread-safe way. - * - * @should_sleep_fn should invoke the cycle detector; it should walk each - * lock's waiters, and for each waiter recursively walk their held locks. - * - * When this function must block, @wait will be added to @lock's waitlist before - * calling trylock, and before calling @should_sleep_fn, and @wait will not be - * removed from the lock waitlist until the lock has been successfully acquired, - * or we abort. - * - * @wait.start_time will be monotonically increasing for any given waitlist, and - * thus may be used as a loop cursor. - * - * Return: 0 on success, or the return code from @should_sleep_fn on failure. - */ -int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - int ret; - - wait->start_time = 0; - - if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); - - ret = do_six_trylock(lock, type, true) ? 0 - : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); - - if (ret && type != SIX_LOCK_write) - six_release(&lock->dep_map, ip); - if (!ret) - lock_acquired(&lock->dep_map, ip); - - return ret; -} -EXPORT_SYMBOL_GPL(six_lock_ip_waiter); - -__always_inline -static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - u32 state; - - if (type == SIX_LOCK_intent) - lock->owner = NULL; - - if (type == SIX_LOCK_read && - lock->readers) { - smp_mb(); /* unlock barrier */ - this_cpu_dec(*lock->readers); - smp_mb(); /* between unlocking and checking for waiters */ - state = atomic_read(&lock->state); - } else { - u32 v = l[type].lock_val; - - if (type != SIX_LOCK_read) - v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; - - EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); - state = atomic_sub_return_release(v, &lock->state); - } - - six_lock_wakeup(lock, state, l[type].unlock_wakeup); -} - -/** - * six_unlock_ip - drop a six lock - * @lock: lock to unlock - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * When a lock is held multiple times (because six_lock_incement()) was used), - * this decrements the 'lock held' counter by one. 
- * - * For example: - * six_lock_read(&foo->lock); read count 1 - * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 - * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 - * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 - */ -void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -{ - EBUG_ON(type == SIX_LOCK_write && - !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); - EBUG_ON((type == SIX_LOCK_write || - type == SIX_LOCK_intent) && - lock->owner != current); - - if (type != SIX_LOCK_write) - six_release(&lock->dep_map, ip); - - if (type == SIX_LOCK_intent && - lock->intent_lock_recurse) { - --lock->intent_lock_recurse; - return; - } - - if (type == SIX_LOCK_write && - lock->write_lock_recurse) { - --lock->write_lock_recurse; - return; - } - - if (type == SIX_LOCK_write) - lock->seq++; - - do_six_unlock_type(lock, type); -} -EXPORT_SYMBOL_GPL(six_unlock_ip); - -/** - * six_lock_downgrade - convert an intent lock to a read lock - * @lock: lock to dowgrade - * - * @lock will have read count incremented and intent count decremented - */ -void six_lock_downgrade(struct six_lock *lock) -{ - six_lock_increment(lock, SIX_LOCK_read); - six_unlock_intent(lock); -} -EXPORT_SYMBOL_GPL(six_lock_downgrade); - -/** - * six_lock_tryupgrade - attempt to convert read lock to an intent lock - * @lock: lock to upgrade - * - * On success, @lock will have intent count incremented and read count - * decremented - * - * Return: true on success, false on failure - */ -bool six_lock_tryupgrade(struct six_lock *lock) -{ - u32 old = atomic_read(&lock->state), new; - - do { - new = old; - - if (new & SIX_LOCK_HELD_intent) - return false; - - if (!lock->readers) { - EBUG_ON(!(new & SIX_LOCK_HELD_read)); - new -= l[SIX_LOCK_read].lock_val; - } - - new |= SIX_LOCK_HELD_intent; - } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); - - if (lock->readers) - this_cpu_dec(*lock->readers); - - six_set_owner(lock, SIX_LOCK_intent, old, current); - - return true; -} -EXPORT_SYMBOL_GPL(six_lock_tryupgrade); - -/** - * six_trylock_convert - attempt to convert a held lock from one type to another - * @lock: lock to upgrade - * @from: SIX_LOCK_read or SIX_LOCK_intent - * @to: SIX_LOCK_read or SIX_LOCK_intent - * - * On success, @lock will have intent count incremented and read count - * decremented - * - * Return: true on success, false on failure - */ -bool six_trylock_convert(struct six_lock *lock, - enum six_lock_type from, - enum six_lock_type to) -{ - EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); - - if (to == from) - return true; - - if (to == SIX_LOCK_read) { - six_lock_downgrade(lock); - return true; - } else { - return six_lock_tryupgrade(lock); - } -} -EXPORT_SYMBOL_GPL(six_trylock_convert); - -/** - * six_lock_increment - increase held lock count on a lock that is already held - * @lock: lock to increment - * @type: SIX_LOCK_read or SIX_LOCK_intent - * - * @lock must already be held, with a lock type that is greater than or equal to - * @type - * - * A corresponding six_unlock_type() call will be required for @lock to be fully - * unlocked. 
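Looking back at six_lock_tryupgrade() above: since the upgrade fails whenever another thread holds the intent lock, callers typically fall back to dropping the read lock, taking intent, and revalidating. A hedged sketch of that caller-side pattern (upgrade_or_retake() and struct foo are illustrative only):

	static int upgrade_or_retake(struct foo *foo)
	{
		if (six_lock_tryupgrade(&foo->lock))
			return 0;

		/* Remember where we were before dropping the read lock. */
		u32 seq = six_lock_seq(&foo->lock);

		six_unlock_read(&foo->lock);
		six_lock_intent(&foo->lock, NULL, NULL);

		/* A writer may have intervened; tell the caller to revalidate. */
		return six_lock_seq(&foo->lock) == seq ? 0 : -EAGAIN;
	}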
- */
-void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
-{
-	six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
-
-	/* XXX: assert already locked, and that we don't overflow: */
-
-	switch (type) {
-	case SIX_LOCK_read:
-		if (lock->readers) {
-			this_cpu_inc(*lock->readers);
-		} else {
-			EBUG_ON(!(atomic_read(&lock->state) &
-				  (SIX_LOCK_HELD_read|
-				   SIX_LOCK_HELD_intent)));
-			atomic_add(l[type].lock_val, &lock->state);
-		}
-		break;
-	case SIX_LOCK_write:
-		lock->write_lock_recurse++;
-		fallthrough;
-	case SIX_LOCK_intent:
-		EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
-		lock->intent_lock_recurse++;
-		break;
-	}
-}
-EXPORT_SYMBOL_GPL(six_lock_increment);
-
-/**
- * six_lock_wakeup_all - wake up all waiters on @lock
- * @lock: lock to wake up waiters for
- *
- * Waking up waiters will cause them to re-run should_sleep_fn, which may then
- * abort the lock operation.
- *
- * This function is never needed in a bug-free program; it's only useful in
- * debug code, e.g. to determine if a cycle detector is at fault.
- */
-void six_lock_wakeup_all(struct six_lock *lock)
-{
-	u32 state = atomic_read(&lock->state);
-	struct six_lock_waiter *w;
-
-	six_lock_wakeup(lock, state, SIX_LOCK_read);
-	six_lock_wakeup(lock, state, SIX_LOCK_intent);
-	six_lock_wakeup(lock, state, SIX_LOCK_write);
-
-	raw_spin_lock(&lock->wait_lock);
-	list_for_each_entry(w, &lock->wait_list, list)
-		wake_up_process(w->task);
-	raw_spin_unlock(&lock->wait_lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-
-/**
- * six_lock_counts - return held lock counts, for each lock type
- * @lock: lock to return counters for
- *
- * Return: the number of times a lock is held for read, intent and write.
- */
-struct six_lock_count six_lock_counts(struct six_lock *lock)
-{
-	struct six_lock_count ret;
-
-	ret.n[SIX_LOCK_read] = !lock->readers
-		? atomic_read(&lock->state) & SIX_LOCK_HELD_read
-		: pcpu_read_count(lock);
-	ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
-		lock->intent_lock_recurse;
-	ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_counts);
-
-/**
- * six_lock_readers_add - directly manipulate reader count of a lock
- * @lock: lock to add/subtract readers for
- * @nr: reader count to add/subtract
- *
- * When an upper layer is implementing lock reentrancy, we may have both read
- * and intent locks on the same lock.
- *
- * When we need to take a write lock, the read locks will cause self-deadlock,
- * because six locks themselves do not track which read locks are held by the
- * current thread and which are held by a different thread - there is no
- * per-thread tracking of held locks.
- *
- * The upper layer that is tracking held locks may however, if trylock() has
- * failed, count up its own read locks, subtract them, take the write lock, and
- * then re-add them.
- *
- * As in any other situation when taking a write lock, @lock must be held for
- * intent one (or more) times, so @lock will never be left unlocked.
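Concretely, the dance just described might look like this on the caller's side (a hedged sketch; take_write_reentrant() and the caller-maintained count @nr are invented for illustration):

	/* @lock is held for intent, plus @nr read locks held by this thread. */
	static void take_write_reentrant(struct six_lock *lock, int nr)
	{
		if (six_trylock_write(lock))
			return;

		six_lock_readers_add(lock, -nr);	/* hide our own readers */
		six_lock_write(lock, NULL, NULL);	/* now safe to block */
		six_lock_readers_add(lock, nr);		/* restore the count */
	}

Because the intent lock is held throughout, no conflicting intent or write lock can slip in while the reader count is temporarily lowered.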
- */ -void six_lock_readers_add(struct six_lock *lock, int nr) -{ - if (lock->readers) { - this_cpu_add(*lock->readers, nr); - } else { - EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); - /* reader count starts at bit 0 */ - atomic_add(nr, &lock->state); - } -} -EXPORT_SYMBOL_GPL(six_lock_readers_add); - -/** - * six_lock_exit - release resources held by a lock prior to freeing - * @lock: lock to exit - * - * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is - * required to free the percpu read counts. - */ -void six_lock_exit(struct six_lock *lock) -{ - WARN_ON(lock->readers && pcpu_read_count(lock)); - WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); - - free_percpu(lock->readers); - lock->readers = NULL; -} -EXPORT_SYMBOL_GPL(six_lock_exit); - -void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags, - gfp_t gfp) -{ - atomic_set(&lock->state, 0); - raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - debug_check_no_locks_freed((void *) lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - - /* - * Don't assume that we have real percpu variables available in - * userspace: - */ -#ifdef __KERNEL__ - if (flags & SIX_LOCK_INIT_PCPU) { - /* - * We don't return an error here on memory allocation failure - * since percpu is an optimization, and locks will work with the - * same semantics in non-percpu mode: callers can check for - * failure if they wish by checking lock->readers, but generally - * will not want to treat it as an error. - */ - lock->readers = alloc_percpu_gfp(unsigned, gfp); - } -#endif -} -EXPORT_SYMBOL_GPL(__six_lock_init); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h deleted file mode 100644 index 59b851cf8bacc4..00000000000000 --- a/fs/bcachefs/six.h +++ /dev/null @@ -1,388 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _LINUX_SIX_H -#define _LINUX_SIX_H - -/** - * DOC: SIX locks overview - * - * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores - * but with an additional state: read/shared, intent, exclusive/write - * - * The purpose of the intent state is to allow for greater concurrency on tree - * structures without deadlocking. In general, a read can't be upgraded to a - * write lock without deadlocking, so an operation that updates multiple nodes - * will have to take write locks for the full duration of the operation. - * - * But by adding an intent state, which is exclusive with other intent locks but - * not with readers, we can take intent locks at the start of the operation, - * and then take write locks only for the actual update to each individual - * nodes, without deadlocking. 
- *
- * Example usage:
- *   six_lock_read(&foo->lock);
- *   six_unlock_read(&foo->lock);
- *
- * An intent lock must be held before taking a write lock:
- *   six_lock_intent(&foo->lock);
- *   six_lock_write(&foo->lock);
- *   six_unlock_write(&foo->lock);
- *   six_unlock_intent(&foo->lock);
- *
- * Other operations:
- *   six_trylock_read()
- *   six_trylock_intent()
- *   six_trylock_write()
- *
- *   six_lock_downgrade()	convert from intent to read
- *   six_lock_tryupgrade()	attempt to convert from read to intent, may fail
- *
- * There are also interfaces that take the lock type as an enum:
- *
- *   six_lock_type(&foo->lock, SIX_LOCK_read);
- *   six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
- *   six_lock_type(&foo->lock, SIX_LOCK_write);
- *   six_unlock_type(&foo->lock, SIX_LOCK_write);
- *   six_unlock_type(&foo->lock, SIX_LOCK_intent);
- *
- * Lock sequence numbers - unlock(), relock():
- *
- *   Locks embed sequence numbers, which are incremented on write lock/unlock.
- *   This allows locks to be dropped and then retaken iff the state they protect
- *   hasn't changed; this makes it much easier to avoid holding locks while e.g.
- *   doing IO or allocating memory.
- *
- *   Example usage:
- *     six_lock_read(&foo->lock);
- *     u32 seq = six_lock_seq(&foo->lock);
- *     six_unlock_read(&foo->lock);
- *
- *     some_operation_that_may_block();
- *
- *     if (six_relock_read(&foo->lock, seq)) { ... }
- *
- *   If the relock operation succeeds, it is as if the lock was never unlocked.
- *
- * Reentrancy:
- *
- *   Six locks are not by themselves reentrant, but have counters for both the
- *   read and intent states that can be used to provide reentrancy by an upper
- *   layer that tracks held locks. If a lock is known to already be held in the
- *   read or intent state, six_lock_increment() can be used to bump the "lock
- *   held in this state" counter, increasing the number of unlock calls that
- *   will be required to fully unlock it.
- *
- *   Example usage:
- *     six_lock_read(&foo->lock);
- *     six_lock_increment(&foo->lock, SIX_LOCK_read);
- *     six_unlock_read(&foo->lock);
- *     six_unlock_read(&foo->lock);
- *   foo->lock is now fully unlocked.
- *
- *   Since the intent state supersedes read, it's legal to increment the read
- *   counter when holding an intent lock, but not the reverse.
- *
- *   A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
- *   is not legal.
- *
- * should_sleep_fn:
- *
- *   There is a six_lock() variant that takes a function pointer that is called
- *   immediately prior to schedule() when blocking, and may return an error to
- *   abort.
- *
- *   One possible use for this feature is when objects being locked are part of
- *   a cache and may be reused, and lock ordering is based on a property of the
- *   object that will change when the object is reused - i.e. logical key order.
- *
- *   If looking up an object in the cache may race with object reuse, and lock
- *   ordering is required to prevent deadlock, object reuse may change the
- *   correct lock order for that object and cause a deadlock. should_sleep_fn
- *   can be used to check if the object is still the object we want and avoid
- *   this deadlock.
- *
- * Wait list entry interface:
- *
- *   There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
- *   wait list entry. By embedding six_lock_waiter into another object, and by
- *   traversing lock waitlists, it is then possible for an upper layer to
- *   implement full cycle detection for deadlock avoidance.
- * - * should_sleep_fn should be used for invoking the cycle detector, walking the - * graph of held locks to check for a deadlock. The upper layer must track - * held locks for each thread, and each thread's held locks must be reachable - * from its six_lock_waiter object. - * - * six_lock_waiter() will add the wait object to the waitlist re-trying taking - * the lock, and before calling should_sleep_fn, and the wait object will not - * be removed from the waitlist until either the lock has been successfully - * acquired, or we aborted because should_sleep_fn returned an error. - * - * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will - * have timestamps in strictly ascending order - this is so the timestamp can - * be used as a cursor for lock graph traverse. - */ - -#include -#include -#include - -enum six_lock_type { - SIX_LOCK_read, - SIX_LOCK_intent, - SIX_LOCK_write, -}; - -struct six_lock { - atomic_t state; - u32 seq; - unsigned intent_lock_recurse; - unsigned write_lock_recurse; - struct task_struct *owner; - unsigned __percpu *readers; - raw_spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -struct six_lock_waiter { - struct list_head list; - struct task_struct *task; - enum six_lock_type lock_want; - bool lock_acquired; - u64 start_time; -}; - -typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); - -void six_lock_exit(struct six_lock *lock); - -enum six_lock_init_flags { - SIX_LOCK_INIT_PCPU = 1U << 0, -}; - -void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags, - gfp_t gfp); - -/** - * six_lock_init - initialize a six lock - * @lock: lock to initialize - * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU - */ -#define six_lock_init(lock, flags, gfp) \ -do { \ - static struct lock_class_key __key; \ - \ - __six_lock_init((lock), #lock, &__key, flags, gfp); \ -} while (0) - -/** - * six_lock_seq - obtain current lock sequence number - * @lock: six_lock to obtain sequence number for - * - * @lock should be held for read or intent, and not write - * - * By saving the lock sequence number, we can unlock @lock and then (typically - * after some blocking operation) attempt to relock it: the relock will succeed - * if the sequence number hasn't changed, meaning no write locks have been taken - * and state corresponding to what @lock protects is still valid. - */ -static inline u32 six_lock_seq(const struct six_lock *lock) -{ - return lock->seq; -} - -bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); - -/** - * six_trylock_type - attempt to take a six lock without blocking - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * - * Return: true on success, false on failure. 
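In practice the trylock variants are used for opportunistic fast paths, along these lines (foo, do_read_work() and defer_work() are invented for the sketch):

	/* Take the lock only if it is immediately available; never block. */
	if (six_trylock_type(&foo->lock, SIX_LOCK_read)) {
		do_read_work(foo);
		six_unlock_type(&foo->lock, SIX_LOCK_read);
	} else {
		defer_work(foo);	/* fall back without sleeping */
	}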
- */
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
-	return six_trylock_ip(lock, type, _THIS_IP_);
-}
-
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
-		       struct six_lock_waiter *wait,
-		       six_lock_should_sleep_fn should_sleep_fn, void *p,
-		       unsigned long ip);
-
-/**
- * six_lock_waiter - take a lock, with full waitlist interface
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait: pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * This is a convenience wrapper around six_lock_ip_waiter(), see that function
- * for full documentation.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
-				  struct six_lock_waiter *wait,
-				  six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-	return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-/**
- * six_lock_ip - take a six lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
-			      six_lock_should_sleep_fn should_sleep_fn, void *p,
-			      unsigned long ip)
-{
-	struct six_lock_waiter wait;
-
-	return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-/**
- * six_lock_type - take a six lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
-				six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-	struct six_lock_waiter wait;
-
-	return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
-		   unsigned seq, unsigned long ip);
-
-/**
- * six_relock_type - attempt to re-take a lock that was held previously
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq: lock sequence number obtained from six_lock_seq() while lock was
- * held previously
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
-				   unsigned seq)
-{
-	return six_relock_ip(lock, type, seq, _THIS_IP_);
-}
-
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_unlock_type - drop a six lock
- * @lock: lock to unlock
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- * - * For example: - * six_lock_read(&foo->lock); read count 1 - * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 - * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 - * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 - */ -static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - six_unlock_ip(lock, type, _THIS_IP_); -} - -#define __SIX_LOCK(type) \ -static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ -{ \ - return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ -} \ - \ -static inline bool six_trylock_##type(struct six_lock *lock) \ -{ \ - return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -} \ - \ -static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ - struct six_lock_waiter *wait, \ - six_lock_should_sleep_fn should_sleep_fn, void *p,\ - unsigned long ip) \ -{ \ - return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ -} \ - \ -static inline int six_lock_ip_##type(struct six_lock *lock, \ - six_lock_should_sleep_fn should_sleep_fn, void *p, \ - unsigned long ip) \ -{ \ - return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ -} \ - \ -static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ -{ \ - return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ -} \ - \ -static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ -{ \ - return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ -} \ - \ -static inline int six_lock_##type(struct six_lock *lock, \ - six_lock_should_sleep_fn fn, void *p)\ -{ \ - return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ -} \ - \ -static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ -{ \ - six_unlock_ip(lock, SIX_LOCK_##type, ip); \ -} \ - \ -static inline void six_unlock_##type(struct six_lock *lock) \ -{ \ - six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -} - -__SIX_LOCK(read) -__SIX_LOCK(intent) -__SIX_LOCK(write) -#undef __SIX_LOCK - -void six_lock_downgrade(struct six_lock *); -bool six_lock_tryupgrade(struct six_lock *); -bool six_trylock_convert(struct six_lock *, enum six_lock_type, - enum six_lock_type); - -void six_lock_increment(struct six_lock *, enum six_lock_type); - -void six_lock_wakeup_all(struct six_lock *); - -struct six_lock_count { - unsigned n[3]; -}; - -struct six_lock_count six_lock_counts(struct six_lock *); -void six_lock_readers_add(struct six_lock *, int); - -#endif /* _LINUX_SIX_H */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c deleted file mode 100644 index 4c43d2a2c1f5bb..00000000000000 --- a/fs/bcachefs/snapshot.c +++ /dev/null @@ -1,2043 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bbpos.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "buckets.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "fs.h" -#include "recovery_passes.h" -#include "snapshot.h" - -#include - -/* - * Snapshot trees: - * - * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they - * exist to provide a stable identifier for the whole lifetime of a snapshot - * tree. 
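For orientation, the two key types reference each other roughly as follows; this is a sketch using the on-disk structs named below, and the numeric values are invented:

	/* BTREE_ID_snapshot_trees, key offset 1: stable for the tree's lifetime. */
	struct bch_snapshot_tree t = {
		.root_snapshot	= cpu_to_le32(U32_MAX),	/* id of the root node */
		.master_subvol	= cpu_to_le32(1),
	};

	/* BTREE_ID_snapshots: every snapshot node points back at its tree. */
	struct bch_snapshot s = {
		.tree	= cpu_to_le32(1),	/* offset of the key above */
		.parent	= cpu_to_le32(0),	/* the root has no parent */
	};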
- */ - -void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); - - prt_printf(out, "subvol %u root snapshot %u", - le32_to_cpu(t.v->master_subvol), - le32_to_cpu(t.v->root_snapshot)); -} - -int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), - c, snapshot_tree_pos_bad, - "bad pos"); -fsck_err: - return ret; -} - -int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot_tree *s) -{ - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), - BTREE_ITER_with_updates, snapshot_tree, s); - - if (bch2_err_matches(ret, ENOENT)) - ret = bch_err_throw(trans->c, ENOENT_snapshot_tree); - return ret; -} - -struct bkey_i_snapshot_tree * -__bch2_snapshot_tree_create(struct btree_trans *trans) -{ - struct btree_iter iter; - int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_snapshot_trees, POS(0, U32_MAX)); - struct bkey_i_snapshot_tree *s_t; - - if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree); - if (ret) - return ERR_PTR(ret); - - s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - bch2_trans_iter_exit(trans, &iter); - return ret ? ERR_PTR(ret) : s_t; -} - -static int bch2_snapshot_tree_create(struct btree_trans *trans, - u32 root_id, u32 subvol_id, u32 *tree_id) -{ - struct bkey_i_snapshot_tree *n_tree = - __bch2_snapshot_tree_create(trans); - - if (IS_ERR(n_tree)) - return PTR_ERR(n_tree); - - n_tree->v.master_subvol = cpu_to_le32(subvol_id); - n_tree->v.root_snapshot = cpu_to_le32(root_id); - *tree_id = n_tree->k.p.offset; - return 0; -} - -/* Snapshot nodes: */ - -static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor) -{ - while (id && id < ancestor) { - const struct snapshot_t *s = __snapshot_t(t, id); - id = s ? s->parent : 0; - } - return id == ancestor; -} - -static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) -{ - guard(rcu)(); - return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); -} - -static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) -{ - const struct snapshot_t *s = __snapshot_t(t, id); - if (!s) - return 0; - - if (s->skip[2] <= ancestor) - return s->skip[2]; - if (s->skip[1] <= ancestor) - return s->skip[1]; - if (s->skip[0] <= ancestor) - return s->skip[0]; - return s->parent; -} - -static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) -{ - const struct snapshot_t *s = __snapshot_t(t, id); - if (!s) - return false; - - return test_bit(ancestor - id - 1, s->is_ancestor); -} - -bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - u32 orig_id = id; -#endif - - guard(rcu)(); - struct snapshot_table *t = rcu_dereference(c->snapshots); - - if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) - return __bch2_snapshot_is_ancestor_early(t, id, ancestor); - - if (likely(ancestor >= IS_ANCESTOR_BITMAP)) - while (id && id < ancestor - IS_ANCESTOR_BITMAP) - id = get_ancestor_below(t, id, ancestor); - - bool ret = id && id < ancestor - ? 
test_ancestor_bitmap(t, id, ancestor) - : id == ancestor; - - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor)); - return ret; -} - -static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) -{ - size_t idx = U32_MAX - id; - struct snapshot_table *new, *old; - - size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); - size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); - - if (unlikely(new_bytes > INT_MAX)) - return NULL; - - new = kvzalloc(new_bytes, GFP_KERNEL); - if (!new) - return NULL; - - new->nr = new_size; - - old = rcu_dereference_protected(c->snapshots, true); - if (old) - memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr); - - rcu_assign_pointer(c->snapshots, new); - kvfree_rcu(old, rcu); - - return &rcu_dereference_protected(c->snapshots, - lockdep_is_held(&c->snapshot_table_lock))->s[idx]; -} - -static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) -{ - size_t idx = U32_MAX - id; - struct snapshot_table *table = - rcu_dereference_protected(c->snapshots, - lockdep_is_held(&c->snapshot_table_lock)); - - lockdep_assert_held(&c->snapshot_table_lock); - - if (likely(table && idx < table->nr)) - return &table->s[idx]; - - return __snapshot_t_mut(c, id); -} - -void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - - if (BCH_SNAPSHOT_SUBVOL(s.v)) - prt_str(out, "subvol "); - if (BCH_SNAPSHOT_WILL_DELETE(s.v)) - prt_str(out, "will_delete "); - if (BCH_SNAPSHOT_DELETED(s.v)) - prt_str(out, "deleted "); - - prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u", - le32_to_cpu(s.v->parent), - le32_to_cpu(s.v->children[0]), - le32_to_cpu(s.v->children[1]), - le32_to_cpu(s.v->subvol), - le32_to_cpu(s.v->tree)); - - if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) - prt_printf(out, " depth %u skiplist %u %u %u", - le32_to_cpu(s.v->depth), - le32_to_cpu(s.v->skip[0]), - le32_to_cpu(s.v->skip[1]), - le32_to_cpu(s.v->skip[2])); -} - -int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_snapshot s; - u32 i, id; - int ret = 0; - - bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), - c, snapshot_pos_bad, - "bad pos"); - - s = bkey_s_c_to_snapshot(k); - - id = le32_to_cpu(s.v->parent); - bkey_fsck_err_on(id && id <= k.k->p.offset, - c, snapshot_parent_bad, - "bad parent node (%u <= %llu)", - id, k.k->p.offset); - - bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), - c, snapshot_children_not_normalized, - "children not normalized"); - - bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], - c, snapshot_child_duplicate, - "duplicate child nodes"); - - for (i = 0; i < 2; i++) { - id = le32_to_cpu(s.v->children[i]); - - bkey_fsck_err_on(id >= k.k->p.offset, - c, snapshot_child_bad, - "bad child node (%u >= %llu)", - id, k.k->p.offset); - } - - if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { - bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), - c, snapshot_skiplist_not_normalized, - "skiplist not normalized"); - - for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { - id = le32_to_cpu(s.v->skip[i]); - - bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), - c, snapshot_skiplist_bad, - "bad skiplist node %u", id); - } - } -fsck_err: - return ret; -} - 
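The ancestor check above combines a skiplist walk with a per-node bitmap: jump toward the root until the candidate ancestor is within IS_ANCESTOR_BITMAP ids, then finish with a single bit test. A self-contained userspace sketch of the same idea (lookup(), the field names and the 128 constant are illustrative assumptions):

	#include <stdbool.h>
	#include <stdint.h>

	#define ANCESTOR_RANGE 128	/* stand-in for IS_ANCESTOR_BITMAP */

	struct snap {
		uint32_t parent;			/* parents have larger ids */
		uint32_t skip[3];			/* sorted ancestor skiplist */
		uint64_t anc_bits[ANCESTOR_RANGE / 64];	/* one bit per close ancestor */
	};

	/* Assumed to map a snapshot id to its node; provided by the caller. */
	extern const struct snap *lookup(uint32_t id);

	static bool is_ancestor(uint32_t id, uint32_t ancestor)
	{
		/* Jump as far rootward as possible without overshooting. */
		while (id && id + ANCESTOR_RANGE < ancestor) {
			const struct snap *s = lookup(id);

			if (s->skip[2] && s->skip[2] <= ancestor)
				id = s->skip[2];
			else if (s->skip[1] && s->skip[1] <= ancestor)
				id = s->skip[1];
			else if (s->skip[0] && s->skip[0] <= ancestor)
				id = s->skip[0];
			else
				id = s->parent;
		}

		if (id == ancestor)
			return true;
		if (!id || id > ancestor)
			return false;

		/* Within bitmap range: answer with one bit test. */
		uint32_t off = ancestor - id - 1;
		return (lookup(id)->anc_bits[off / 64] >> (off % 64)) & 1;
	}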
-static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) -{ - mutex_lock(&c->snapshot_table_lock); - int ret = snapshot_t_mut(c, id) - ? 0 - : bch_err_throw(c, ENOMEM_mark_snapshot); - mutex_unlock(&c->snapshot_table_lock); - return ret; -} - -static int __bch2_mark_snapshot(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct snapshot_t *t; - u32 id = new.k->p.offset; - int ret = 0; - - mutex_lock(&c->snapshot_table_lock); - - t = snapshot_t_mut(c, id); - if (!t) { - ret = bch_err_throw(c, ENOMEM_mark_snapshot); - goto err; - } - - if (new.k->type == KEY_TYPE_snapshot) { - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - - t->state = !BCH_SNAPSHOT_DELETED(s.v) - ? SNAPSHOT_ID_live - : SNAPSHOT_ID_deleted; - t->parent = le32_to_cpu(s.v->parent); - t->children[0] = le32_to_cpu(s.v->children[0]); - t->children[1] = le32_to_cpu(s.v->children[1]); - t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; - t->tree = le32_to_cpu(s.v->tree); - - if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { - t->depth = le32_to_cpu(s.v->depth); - t->skip[0] = le32_to_cpu(s.v->skip[0]); - t->skip[1] = le32_to_cpu(s.v->skip[1]); - t->skip[2] = le32_to_cpu(s.v->skip[2]); - } else { - t->depth = 0; - t->skip[0] = 0; - t->skip[1] = 0; - t->skip[2] = 0; - } - - u32 parent = id; - - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); - - if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots) - bch2_delete_dead_snapshots_async(c); - } - } else { - memset(t, 0, sizeof(*t)); - } -err: - mutex_unlock(&c->snapshot_table_lock); - return ret; -} - -int bch2_mark_snapshot(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags); -} - -int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot *s) -{ - return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_with_updates, snapshot, s); -} - -/* fsck: */ - -static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) -{ - return snapshot_t(c, id)->children[child]; -} - -static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_child(c, id, 0); -} - -static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_child(c, id, 1); -} - -static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) -{ - u32 n, parent; - - n = bch2_snapshot_left_child(c, id); - if (n) - return n; - - while ((parent = bch2_snapshot_parent(c, id))) { - n = bch2_snapshot_right_child(c, parent); - if (n && n != id) - return n; - id = parent; - } - - return 0; -} - -u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, - snapshot_id_list *skip) -{ - guard(rcu)(); - u32 id, subvol = 0, s; -retry: - id = snapshot_root; - while (id && bch2_snapshot_exists(c, id)) { - if (!(skip && snapshot_list_has_id(skip, id))) { - s = snapshot_t(c, id)->subvol; - - if (s && (!subvol || s < subvol)) - subvol = s; - } - id = bch2_snapshot_tree_next(c, id); - if (id == snapshot_root) - break; - } - - if (!subvol && skip) { - skip = 
NULL; - goto retry; - } - - return subvol; -} - -static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, - u32 snapshot_root, u32 *subvol_id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - bool found = false; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, - 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) - continue; - if (!BCH_SUBVOLUME_SNAP(s.v)) { - *subvol_id = s.k->p.offset; - found = true; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && !found) { - struct bkey_i_subvolume *u; - - *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL); - - u = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, *subvol_id), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - SET_BCH_SUBVOLUME_SNAP(&u->v, false); - } - - return ret; -} - -static int check_snapshot_tree(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot_tree st; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct printbuf buf = PRINTBUF; - struct btree_iter snapshot_iter = {}; - u32 root_id; - int ret; - - if (k.k->type != KEY_TYPE_snapshot_tree) - return 0; - - st = bkey_s_c_to_snapshot_tree(k); - root_id = le32_to_cpu(st.v->root_snapshot); - - struct bkey_s_c_snapshot snapshot_k = - bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, - POS(0, root_id), 0, snapshot); - ret = bkey_err(snapshot_k); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (!ret) - bkey_val_copy(&s, snapshot_k); - - if (fsck_err_on(ret || - root_id != bch2_snapshot_root(c, root_id) || - st.k->p.offset != le32_to_cpu(s.tree), - trans, snapshot_tree_to_missing_snapshot, - "snapshot tree points to missing/incorrect snapshot:\n%s", - (bch2_bkey_val_to_text(&buf, c, st.s_c), - prt_newline(&buf), - ret - ? 
prt_printf(&buf, "(%s)", bch2_err_str(ret)) - : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), - buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto err; - } - - if (!st.v->master_subvol) - goto out; - - ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, - trans, snapshot_tree_to_missing_subvol, - "snapshot tree points to missing subvolume:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(!bch2_snapshot_is_ancestor(c, - le32_to_cpu(subvol.snapshot), - root_id), - trans, snapshot_tree_to_wrong_subvol, - "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), - trans, snapshot_tree_to_snapshot_subvol, - "snapshot tree points to snapshot subvolume:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { - struct bkey_i_snapshot_tree *u; - u32 subvol_id; - - ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); - bch_err_fn(c, ret); - - if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ - ret = 0; - goto err; - } - - if (ret) - goto err; - - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.master_subvol = cpu_to_le32(subvol_id); - st = snapshot_tree_i_to_s_c(u); - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &snapshot_iter); - printbuf_exit(&buf); - return ret; -} - -/* - * For each snapshot_tree, make sure it points to the root of a snapshot tree - * and that snapshot entry points back to it, or delete it. - * - * And, make sure it points to a subvolume within that snapshot tree, or correct - * it to point to the oldest subvolume within that snapshot tree. - */ -int bch2_check_snapshot_trees(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_snapshot_trees, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot_tree(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -/* - * Look up snapshot tree for @tree_id and find root, - * make sure @snap_id is a descendent: - */ -static int snapshot_tree_ptr_good(struct btree_trans *trans, - u32 snap_id, u32 tree_id) -{ - struct bch_snapshot_tree s_t; - int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); - - if (bch2_err_matches(ret, ENOENT)) - return 0; - if (ret) - return ret; - - return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); -} - -u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) -{ - if (!id) - return 0; - - guard(rcu)(); - const struct snapshot_t *s = snapshot_t(c, id); - return s->parent - ? 
bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)) - : id; -} - -static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) -{ - unsigned i; - - for (i = 0; i < 3; i++) - if (!s.parent) { - if (s.skip[i]) - return false; - } else { - if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i]))) - return false; - } - - return true; -} - -/* - * snapshot_tree pointer was incorrect: look up root snapshot node, make sure - * its snapshot_tree pointer is correct (allocate new one if necessary), then - * update this node's pointer to root node's pointer: - */ -static int snapshot_tree_ptr_repair(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_snapshot *s) -{ - struct bch_fs *c = trans->c; - struct btree_iter root_iter; - struct bch_snapshot_tree s_t; - struct bkey_s_c_snapshot root; - struct bkey_i_snapshot *u; - u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; - int ret; - - root = bch2_bkey_get_iter_typed(trans, &root_iter, - BTREE_ID_snapshots, POS(0, root_id), - BTREE_ITER_with_updates, snapshot); - ret = bkey_err(root); - if (ret) - goto err; - - tree_id = le32_to_cpu(root.v->tree); - - ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { - u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u) ?: - bch2_snapshot_tree_create(trans, root_id, - bch2_snapshot_oldest_subvol(c, root_id, NULL), - &tree_id); - if (ret) - goto err; - - u->v.tree = cpu_to_le32(tree_id); - if (k.k->p.offset == root_id) - *s = u->v; - } - - if (k.k->p.offset != root_id) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.tree = cpu_to_le32(tree_id); - *s = u->v; - } -err: - bch2_trans_iter_exit(trans, &root_iter); - return ret; -} - -static int check_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct bch_snapshot v; - struct bkey_i_snapshot *u; - u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); - u32 real_depth; - struct printbuf buf = PRINTBUF; - u32 i, id; - int ret = 0; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - memset(&s, 0, sizeof(s)); - memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); - - if (BCH_SNAPSHOT_DELETED(&s)) - return 0; - - id = le32_to_cpu(s.parent); - if (id) { - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot with nonexistent parent:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (ret) - goto err; - - if (le32_to_cpu(v.children[0]) != k.k->p.offset && - le32_to_cpu(v.children[1]) != k.k->p.offset) { - bch_err(c, "snapshot parent %u missing pointer to child %llu", - id, k.k->p.offset); - ret = -EINVAL; - goto err; - } - } - - for (i = 0; i < 2 && s.children[i]; i++) { - id = le32_to_cpu(s.children[i]); - - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot node %llu has nonexistent child %u", - k.k->p.offset, id); - if (ret) - goto err; - - if (le32_to_cpu(v.parent) != k.k->p.offset) { - bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", - id, le32_to_cpu(v.parent), k.k->p.offset); - ret = -EINVAL; - goto err; - } - } - - bool should_have_subvol = 
BCH_SNAPSHOT_SUBVOL(&s) && - !BCH_SNAPSHOT_WILL_DELETE(&s); - - if (should_have_subvol) { - id = le32_to_cpu(s.subvol); - ret = bch2_subvolume_get(trans, id, false, &subvol); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot points to nonexistent subvolume:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (ret) - goto err; - - if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { - bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - k.k->p.offset); - ret = -EINVAL; - goto err; - } - } else { - if (fsck_err_on(s.subvol, - trans, snapshot_should_not_have_subvol, - "snapshot should not point to subvol:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.subvol = 0; - s = u->v; - } - } - - ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, - trans, snapshot_to_bad_snapshot_tree, - "snapshot points to missing/incorrect tree:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = snapshot_tree_ptr_repair(trans, iter, k, &s); - if (ret) - goto err; - } - ret = 0; - - real_depth = bch2_snapshot_depth(c, parent_id); - - if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, - trans, snapshot_bad_depth, - "snapshot with incorrect depth field, should be %u:\n%s", - real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.depth = cpu_to_le32(real_depth); - s = u->v; - } - - ret = snapshot_skiplist_good(trans, k.k->p.offset, s); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, - trans, snapshot_bad_skiplist, - "snapshot with bad skiplist field:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) - u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id)); - - bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); - s = u->v; - } - ret = 0; -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_snapshots(struct bch_fs *c) -{ - /* - * We iterate backwards as checking/fixing the depth field requires that - * the parent's depth already be correct: - */ - int ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(trans, iter, - BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int check_snapshot_exists(struct btree_trans *trans, u32 id) -{ - struct bch_fs *c = trans->c; - - /* Do we need to reconstruct the snapshot_tree entry as well? 
*/ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - u32 tree_id = 0; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, - 0, k, ret) { - if (k.k->type == KEY_TYPE_snapshot_tree && - le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { - tree_id = k.k->p.offset; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; - - if (!tree_id) { - ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); - if (ret) - return ret; - } - - struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); - ret = PTR_ERR_OR_ZERO(snapshot); - if (ret) - return ret; - - bkey_snapshot_init(&snapshot->k_i); - snapshot->k.p = POS(0, id); - snapshot->v.tree = cpu_to_le32(tree_id); - snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, - 0, k, ret) { - if (k.k->type == KEY_TYPE_subvolume && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { - snapshot->v.subvol = cpu_to_le32(k.k->p.offset); - SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return bch2_snapshot_table_make_room(c, id) ?: - bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0); -} - -/* Figure out which snapshot nodes belong in the same tree: */ -struct snapshot_tree_reconstruct { - enum btree_id btree; - struct bpos cur_pos; - snapshot_id_list cur_ids; - DARRAY(snapshot_id_list) trees; -}; - -static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r) -{ - darray_for_each(r->trees, i) - darray_exit(i); - darray_exit(&r->trees); - darray_exit(&r->cur_ids); -} - -static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos) -{ - return r->btree == BTREE_ID_inodes - ? 
r->cur_pos.offset == pos.offset - : r->cur_pos.inode == pos.inode; -} - -static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) -{ - return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL; -} - -static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) -{ - bool first = true; - darray_for_each(*s, i) { - if (!first) - prt_char(out, ' '); - first = false; - prt_printf(out, "%u", *i); - } -} - -static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r) -{ - if (r->cur_ids.nr) { - darray_for_each(r->trees, i) - if (snapshot_id_lists_have_common(i, &r->cur_ids)) { - int ret = snapshot_list_merge(c, i, &r->cur_ids); - if (ret) - return ret; - goto out; - } - darray_push(&r->trees, r->cur_ids); - darray_init(&r->cur_ids); - } -out: - r->cur_ids.nr = 0; - return 0; -} - -static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos) -{ - if (!same_snapshot(r, pos)) - snapshot_tree_reconstruct_next(c, r); - r->cur_pos = pos; - return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot); -} - -int bch2_reconstruct_snapshots(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - struct snapshot_tree_reconstruct r = {}; - int ret = 0; - - for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { - if (btree_type_has_snapshots(btree)) { - r.btree = btree; - - ret = for_each_btree_key(trans, iter, btree, POS_MIN, - BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ - get_snapshot_trees(c, &r, k.k->p); - })); - if (ret) - goto err; - - snapshot_tree_reconstruct_next(c, &r); - } - } - - darray_for_each(r.trees, t) { - printbuf_reset(&buf); - snapshot_id_list_to_text(&buf, t); - - darray_for_each(*t, id) { - if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, - trans, snapshot_node_missing, - "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { - if (t->nr > 1) { - bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; - } - - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot_exists(trans, *id)); - if (ret) - goto err; - } - } - } -fsck_err: -err: - bch2_trans_put(trans); - snapshot_tree_reconstruct_exit(&r); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -int __bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); - - /* Snapshot was definitively deleted, this error is marked autofix */ - if (fsck_err_on(state == SNAPSHOT_ID_deleted, - trans, bkey_in_deleted_snapshot, - "key in deleted snapshot %s, delete?", - (bch2_btree_id_to_text(&buf, iter->btree_id), - prt_char(&buf, ' '), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; - - if (state == SNAPSHOT_ID_empty) { - /* - * Snapshot missing: we should have caught this with btree_lost_data and - * kicked off reconstruct_snapshots, so if we end up here we have no - * idea what happened. 
- * - * Do not delete unless we know that subvolumes and snapshots - * are consistent: - * - * XXX: - * - * We could be smarter here, and instead of using the generic - * recovery pass ratelimiting, track if there have been any - * changes to the snapshots or inodes btrees since those passes - * last ran. - */ - ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret; - ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret; - - if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)) - ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - - unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0); - - if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot, - "key in missing snapshot %s, delete?", - (bch2_btree_id_to_text(&buf, iter->btree_id), - prt_char(&buf, ' '), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; - } - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int __bch2_get_snapshot_overwrites(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - snapshot_id_list *s) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos), - BTREE_ITER_all_snapshots, k, ret) { - if (!bkey_eq(k.k->p, pos)) - break; - - if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) || - snapshot_list_has_ancestor(c, s, k.k->p.snapshot)) - continue; - - ret = snapshot_list_add(c, s, k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - if (ret) - darray_exit(s); - - return ret; -} - -/* - * Mark a snapshot as deleted, for future cleanup: - */ -int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) -{ - struct btree_iter iter; - struct bkey_i_snapshot *s = - bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, id), - 0, snapshot); - int ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - trans->c, "missing snapshot %u", id); - return ret; - } - - /* already deleted? 
*/ - if (BCH_SNAPSHOT_WILL_DELETE(&s->v)) - goto err; - - SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true); - SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); - s->v.subvol = 0; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) -{ - if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) - swap(s->children[0], s->children[1]); -} - -static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter, p_iter = {}; - struct btree_iter c_iter = {}; - struct btree_iter tree_iter = {}; - u32 parent_id, child_id; - unsigned i; - int ret = 0; - - struct bkey_i_snapshot *s = - bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_intent, snapshot); - ret = PTR_ERR_OR_ZERO(s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", id); - - if (ret) - goto err; - - BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); - BUG_ON(s->v.children[1]); - - parent_id = le32_to_cpu(s->v.parent); - child_id = le32_to_cpu(s->v.children[0]); - - if (parent_id) { - struct bkey_i_snapshot *parent; - - parent = bch2_bkey_get_mut_typed(trans, &p_iter, - BTREE_ID_snapshots, POS(0, parent_id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(parent); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", parent_id); - if (unlikely(ret)) - goto err; - - /* find entry in parent->children for node being deleted */ - for (i = 0; i < 2; i++) - if (le32_to_cpu(parent->v.children[i]) == id) - break; - - if (bch2_fs_inconsistent_on(i == 2, c, - "snapshot %u missing child pointer to %u", - parent_id, id)) - goto err; - - parent->v.children[i] = cpu_to_le32(child_id); - - normalize_snapshot_child_pointers(&parent->v); - } - - if (child_id) { - struct bkey_i_snapshot *child; - - child = bch2_bkey_get_mut_typed(trans, &c_iter, - BTREE_ID_snapshots, POS(0, child_id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(child); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", child_id); - if (unlikely(ret)) - goto err; - - child->v.parent = cpu_to_le32(parent_id); - - if (!child->v.parent) { - child->v.skip[0] = 0; - child->v.skip[1] = 0; - child->v.skip[2] = 0; - } - } - - if (!parent_id) { - /* - * We're deleting the root of a snapshot tree: update the - * snapshot_tree entry to point to the new root, or delete it if - * this is the last snapshot ID in this tree: - */ - struct bkey_i_snapshot_tree *s_t; - - BUG_ON(s->v.children[1]); - - s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - if (ret) - goto err; - - if (s->v.children[0]) { - s_t->v.root_snapshot = s->v.children[0]; - } else { - s_t->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&s_t->k, 0); - } - } - - if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { - SET_BCH_SNAPSHOT_DELETED(&s->v, true); - s->v.parent = 0; - s->v.children[0] = 0; - s->v.children[1] = 0; - s->v.subvol = 0; - s->v.tree = 0; - s->v.depth = 0; - s->v.skip[0] = 0; - s->v.skip[1] = 0; - s->v.skip[2] = 0; - } else { - s->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&s->k, 0); - } -err: - bch2_trans_iter_exit(trans, &tree_iter); - bch2_trans_iter_exit(trans, &p_iter); - bch2_trans_iter_exit(trans, &c_iter); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int create_snapids(struct btree_trans *trans, u32 parent, 
u32 tree, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_snapshot *n; - struct bkey_s_c k; - unsigned i, j; - u32 depth = bch2_snapshot_depth(c, parent); - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_intent); - k = bch2_btree_iter_peek(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - for (i = 0; i < nr_snapids; i++) { - k = bch2_btree_iter_prev_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || !k.k->p.offset) { - ret = bch_err_throw(c, ENOSPC_snapshot_create); - goto err; - } - - n = bch2_bkey_alloc(trans, &iter, 0, snapshot); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.flags = 0; - n->v.parent = cpu_to_le32(parent); - n->v.subvol = cpu_to_le32(snapshot_subvols[i]); - n->v.tree = cpu_to_le32(tree); - n->v.depth = cpu_to_le32(depth); - n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); - n->v.btime.hi = 0; - - for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) - n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); - - bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); - SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - - ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); - if (ret) - goto err; - - new_snapids[i] = iter.pos.offset; - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create new snapshot IDs as children of an existing snapshot ID: - */ -static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct btree_iter iter; - struct bkey_i_snapshot *n_parent; - int ret = 0; - - n_parent = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, parent), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(n_parent); - if (unlikely(ret)) { - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not found", parent); - return ret; - } - - if (n_parent->v.children[0] || n_parent->v.children[1]) { - bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); - ret = -EINVAL; - goto err; - } - - ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - goto err; - - n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); - n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); - n_parent->v.subvol = 0; - SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create a snapshot node that is the root of a new tree: - */ -static int bch2_snapshot_node_create_tree(struct btree_trans *trans, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bkey_i_snapshot_tree *n_tree; - int ret; - - n_tree = __bch2_snapshot_tree_create(trans); - ret = PTR_ERR_OR_ZERO(n_tree) ?: - create_snapids(trans, 0, n_tree->k.p.offset, - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - return ret; - - n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); - n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); - return 0; -} - -int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - BUG_ON((parent == 0) != (nr_snapids == 1)); - BUG_ON((parent != 0) != (nr_snapids == 2)); - - return parent - ? 
bch2_snapshot_node_create_children(trans, parent, - new_snapids, snapshot_subvols, nr_snapids) - : bch2_snapshot_node_create_tree(trans, - new_snapids, snapshot_subvols, nr_snapids); - -} - -/* - * If we have an unlinked inode in an internal snapshot node, and the inode - * really has been deleted in all child snapshots, how does this get cleaned up? - * - * first there is the problem of how keys that have been overwritten in all - * child snapshots get deleted (unimplemented?), but inodes may perhaps be - * special? - * - * also: unlinked inode in internal snapshot appears to not be getting deleted - * correctly if inode doesn't exist in leaf snapshots - * - * solution: - * - * for a key in an interior snapshot node that needs work to be done that - * requires it to be mutated: iterate over all descendent leaf nodes and copy - * that key to snapshot leaf nodes, where we can mutate it - */ - -static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) -{ - struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id); - return i ? i->live_child : 0; -} - -static unsigned __live_child(struct snapshot_table *t, u32 id, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) -{ - struct snapshot_t *s = __snapshot_t(t, id); - if (!s) - return 0; - - for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) - if (s->children[i] && - !snapshot_list_has_id(delete_leaves, s->children[i]) && - !interior_delete_has_id(delete_interior, s->children[i])) - return s->children[i]; - - for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { - u32 live_child = s->children[i] - ? __live_child(t, s->children[i], delete_leaves, delete_interior) - : 0; - if (live_child) - return live_child; - } - - return 0; -} - -static unsigned live_child(struct bch_fs *c, u32 id) -{ - struct snapshot_delete *d = &c->snapshot_delete; - - guard(rcu)(); - return __live_child(rcu_dereference(c->snapshots), id, - &d->delete_leaves, &d->delete_interior); -} - -static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) -{ - return snapshot_list_has_id(&d->delete_leaves, id) || - interior_delete_has_id(&d->delete_interior, id) != 0; -} - -static int delete_dead_snapshots_process_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct snapshot_delete *d = &trans->c->snapshot_delete; - - if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot)) - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - - u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot); - if (live_child) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - new->k.p.snapshot = live_child; - - struct btree_iter dst_iter; - struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, - iter->btree_id, new->k.p, - BTREE_ITER_all_snapshots| - BTREE_ITER_intent); - ret = bkey_err(dst_k); - if (ret) - return ret; - - ret = (bkey_deleted(dst_k.k) - ? bch2_trans_update(trans, &dst_iter, new, - BTREE_UPDATE_internal_snapshot_node) - : 0) ?: - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &dst_iter); - return ret; - } - - return 0; -} - -static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - - u64 inum = iter->btree_id != BTREE_ID_inodes - ? 
iter->pos.inode - : iter->pos.offset; - - if (*prev_inum == inum) - return false; - - *prev_inum = inum; - - bool ret = !snapshot_list_has_id(&d->deleting_from_trees, - bch2_snapshot_tree(c, iter->pos.snapshot)); - if (unlikely(ret)) { - struct bpos pos = iter->pos; - pos.snapshot = 0; - if (iter->btree_id != BTREE_ID_inodes) - pos.offset = U64_MAX; - bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos)); - } - - return ret; -} - -static int delete_dead_snapshot_keys_v1(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - - for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { - struct disk_reservation res = { 0 }; - u64 prev_inum = 0; - - d->pos.pos = POS_MIN; - - if (!btree_type_has_snapshots(d->pos.btree)) - continue; - - int ret = for_each_btree_key_commit(trans, iter, - d->pos.btree, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - delete_dead_snapshots_process_key(trans, &iter, k); - })); - - bch2_disk_reservation_put(c, &res); - - if (ret) - return ret; - } - - return 0; -} - -static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree, - struct bpos start, struct bpos end) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - struct disk_reservation res = { 0 }; - - d->pos.btree = btree; - d->pos.pos = POS_MIN; - - int ret = for_each_btree_key_max_commit(trans, iter, - btree, start, end, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.pos = iter.pos; - delete_dead_snapshots_process_key(trans, &iter, k); - })); - - bch2_disk_reservation_put(c, &res); - return ret; -} - -static int delete_dead_snapshot_keys_v2(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - struct disk_reservation res = { 0 }; - u64 prev_inum = 0; - int ret = 0; - - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); - - while (1) { - struct bkey_s_c k; - ret = lockrestart_do(trans, - bkey_err(k = bch2_btree_iter_peek(trans, &iter))); - if (ret) - break; - - if (!k.k) - break; - - d->pos.btree = iter.btree_id; - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - if (snapshot_id_dying(d, k.k->p.snapshot)) { - struct bpos start = POS(k.k->p.offset, 0); - struct bpos end = POS(k.k->p.offset, U64_MAX); - - ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?: - delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?: - delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end); - if (ret) - break; - - bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1)); - } else { - bch2_btree_iter_advance(trans, &iter); - } - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - goto err; - - prev_inum = 0; - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.btree = iter.btree_id; - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - delete_dead_snapshots_process_key(trans, &iter, k); - })); -err: - bch2_disk_reservation_put(c, &res); - 
return ret; -} - -/* - * For a given snapshot, if it doesn't have a subvolume that points to it, and - * it doesn't have child snapshot nodes - it's now redundant and we can mark it - * as deleted. - */ -static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k) -{ - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - unsigned live_children = 0; - int ret = 0; - - if (BCH_SNAPSHOT_SUBVOL(s.v)) - return 0; - - if (BCH_SNAPSHOT_DELETED(s.v)) - return 0; - - mutex_lock(&d->progress_lock); - for (unsigned i = 0; i < 2; i++) { - u32 child = le32_to_cpu(s.v->children[i]); - - live_children += child && - !snapshot_list_has_id(&d->delete_leaves, child); - } - - u32 tree = bch2_snapshot_tree(c, s.k->p.offset); - - if (live_children == 0) { - ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: - snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); - } else if (live_children == 1) { - struct snapshot_interior_delete n = { - .id = s.k->p.offset, - .live_child = live_child(c, s.k->p.offset), - }; - - if (!n.live_child) { - bch_err(c, "error finding live child of snapshot %u", n.id); - ret = -EINVAL; - } else { - ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: - darray_push(&d->delete_interior, n); - } - } - mutex_unlock(&d->progress_lock); - - return ret; -} - -static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, - interior_delete_list *skip) -{ - guard(rcu)(); - while (interior_delete_has_id(skip, id)) - id = __bch2_snapshot_parent(c, id); - - while (n--) { - do { - id = __bch2_snapshot_parent(c, id); - } while (interior_delete_has_id(skip, id)); - } - - return id; -} - -static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c k, - interior_delete_list *deleted) -{ - struct bch_fs *c = trans->c; - u32 nr_deleted_ancestors = 0; - struct bkey_i_snapshot *s; - int ret; - - if (!bch2_snapshot_exists(c, k.k->p.offset)) - return 0; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - if (interior_delete_has_id(deleted, k.k->p.offset)) - return 0; - - s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - darray_for_each(*deleted, i) - nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); - - if (!nr_deleted_ancestors) - return 0; - - le32_add_cpu(&s->v.depth, -nr_deleted_ancestors); - - if (!s->v.depth) { - s->v.skip[0] = 0; - s->v.skip[1] = 0; - s->v.skip[2] = 0; - } else { - u32 depth = le32_to_cpu(s->v.depth); - u32 parent = bch2_snapshot_parent(c, s->k.p.offset); - - for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { - u32 id = le32_to_cpu(s->v.skip[j]); - - if (interior_delete_has_id(deleted, id)) { - id = bch2_snapshot_nth_parent_skip(c, - parent, - depth > 1 - ? 
get_random_u32_below(depth - 1) - : 0, - deleted); - s->v.skip[j] = cpu_to_le32(id); - } - } - - bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); - } - - return bch2_trans_update(trans, iter, &s->k_i, 0); -} - -static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d) -{ - prt_printf(out, "deleting from trees"); - darray_for_each(d->deleting_from_trees, i) - prt_printf(out, " %u", *i); - - prt_printf(out, "deleting leaves"); - darray_for_each(d->delete_leaves, i) - prt_printf(out, " %u", *i); - prt_newline(out); - - prt_printf(out, "interior"); - darray_for_each(d->delete_interior, i) - prt_printf(out, " %u->%u", i->id, i->live_child); - prt_newline(out); -} - -int __bch2_delete_dead_snapshots(struct bch_fs *c) -{ - struct snapshot_delete *d = &c->snapshot_delete; - int ret = 0; - - if (!mutex_trylock(&d->lock)) - return 0; - - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) - goto out_unlock; - - struct btree_trans *trans = bch2_trans_get(c); - - /* - * For every snapshot node: If we have no live children and it's not - * pointed to by a subvolume, delete it: - */ - d->running = true; - d->pos = BBPOS_MIN; - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - check_should_delete_snapshot(trans, k)); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err; - - if (!d->delete_leaves.nr && !d->delete_interior.nr) - goto err; - - { - struct printbuf buf = PRINTBUF; - bch2_snapshot_delete_nodes_to_text(&buf, d); - - ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); - printbuf_exit(&buf); - if (ret) - goto err; - } - - ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2) - ? 
delete_dead_snapshot_keys_v2(trans) - : delete_dead_snapshot_keys_v1(trans); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting keys from dying snapshots"); - if (ret) - goto err; - - darray_for_each(d->delete_leaves, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting snapshot %u", *i); - if (ret) - goto err; - } - - /* - * Fixing children of deleted snapshots can't be done completely - * atomically, if we crash between here and when we delete the interior - * nodes some depth fields will be off: - */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); - if (ret) - goto err; - - darray_for_each(d->delete_interior, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, i->id)); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting snapshot %u", i->id); - if (ret) - goto err; - } -err: - mutex_lock(&d->progress_lock); - darray_exit(&d->deleting_from_trees); - darray_exit(&d->delete_interior); - darray_exit(&d->delete_leaves); - d->running = false; - mutex_unlock(&d->progress_lock); - bch2_trans_put(trans); - - bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots); -out_unlock: - mutex_unlock(&d->lock); - if (!bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); - return ret; -} - -int bch2_delete_dead_snapshots(struct bch_fs *c) -{ - if (!c->opts.auto_snapshot_deletion) - return 0; - - return __bch2_delete_dead_snapshots(c); -} - -void bch2_delete_dead_snapshots_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work); - - set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); - - bch2_delete_dead_snapshots(c); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); -} - -void bch2_delete_dead_snapshots_async(struct bch_fs *c) -{ - if (!c->opts.auto_snapshot_deletion) - return; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots)) - return; - - BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - - if (!queue_work(system_long_wq, &c->snapshot_delete.work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); -} - -void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct snapshot_delete *d = &c->snapshot_delete; - - if (!d->running) { - prt_str(out, "(not running)"); - return; - } - - mutex_lock(&d->progress_lock); - bch2_snapshot_delete_nodes_to_text(out, d); - - bch2_bbpos_to_text(out, d->pos); - mutex_unlock(&d->progress_lock); -} - -int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos), - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots, - k, ret) { - if (!bkey_eq(pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) -{ - /* If there's one child, it's redundant and keys will be moved to the child */ - return !!snap.v->children[0] + !!snap.v->children[1] == 1; -} - -static int 
bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) -{ - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_WILL_DELETE(snap.v) || - interior_snapshot_needs_delete(snap)) - set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); - - return 0; -} - -int bch2_snapshots_read(struct bch_fs *c) -{ - /* - * Initializing the is_ancestor bitmaps requires ancestors to already be - * initialized - so mark in reverse: - */ - int ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots, - POS_MAX, 0, k, - __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_check_snapshot_needs_deletion(trans, k))); - bch_err_fn(c, ret); - - /* - * It's important that we check if we need to reconstruct snapshots - * before going RW, so we mark that pass as required in the superblock - - * otherwise, we could end up deleting keys with missing snapshot nodes - * instead - */ - BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && - test_bit(BCH_FS_may_go_rw, &c->flags)); - - return ret; -} - -void bch2_fs_snapshots_exit(struct bch_fs *c) -{ - kvfree(rcu_dereference_protected(c->snapshots, true)); -} - -void bch2_fs_snapshots_init_early(struct bch_fs *c) -{ - INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); - mutex_init(&c->snapshot_delete.lock); - mutex_init(&c->snapshot_delete.progress_lock); - mutex_init(&c->snapshots_unlinked_lock); -} diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h deleted file mode 100644 index 6766bf673ed92c..00000000000000 --- a/fs/bcachefs/snapshot.h +++ /dev/null @@ -1,275 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SNAPSHOT_H -#define _BCACHEFS_SNAPSHOT_H - -void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); - -#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_validate = bch2_snapshot_tree_validate, \ - .val_to_text = bch2_snapshot_tree_to_text, \ - .min_val_size = 8, \ -}) - -struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); - -int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); - -void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ - .key_validate = bch2_snapshot_validate, \ - .val_to_text = bch2_snapshot_to_text, \ - .trigger = bch2_mark_snapshot, \ - .min_val_size = 24, \ -}) - -static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) -{ - u32 idx = U32_MAX - id; - - return likely(t && idx < t->nr) - ? &t->s[idx] - : NULL; -} - -static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) -{ - return __snapshot_t(rcu_dereference(c->snapshots), id); -} - -static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->tree : 0; -} - -static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - return s ? 
s->parent : 0; -} - -static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - return __bch2_snapshot_parent_early(c, id); -} - -static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - if (!s) - return 0; - - u32 parent = s->parent; - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - parent && - s->depth != snapshot_t(c, parent)->depth + 1) - panic("id %u depth=%u parent %u depth=%u\n", - id, snapshot_t(c, id)->depth, - parent, snapshot_t(c, parent)->depth); - - return parent; -} - -static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - return __bch2_snapshot_parent(c, id); -} - -static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) -{ - guard(rcu)(); - while (n--) - id = __bch2_snapshot_parent(c, id); - return id; -} - -u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *); -u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); - -static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - - u32 parent; - while ((parent = __bch2_snapshot_parent(c, id))) - id = parent; - return id; -} - -static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->state : SNAPSHOT_ID_empty; -} - -static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - return __bch2_snapshot_id_state(c, id); -} - -static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live; -} - -static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; -} - -static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) -{ - int ret = bch2_snapshot_is_internal_node(c, id); - if (ret < 0) - return ret; - return !ret; -} - -static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) -{ - guard(rcu)(); - return parent ? snapshot_t(c, parent)->depth + 1 : 0; -} - -bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); - -static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ - return id == ancestor - ? true - : __bch2_snapshot_is_ancestor(c, id, ancestor); -} - -static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - const struct snapshot_t *t = snapshot_t(c, id); - return t && (t->children[0]|t->children[1]) != 0; -} - -static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) -{ - return darray_find(*s, id) != NULL; -} - -static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - darray_for_each(*s, i) - if (bch2_snapshot_is_ancestor(c, id, *i)) - return true; - return false; -} - -static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - BUG_ON(snapshot_list_has_id(s, id)); - int ret = darray_push(s, id); - if (ret) - bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); - return ret; -} - -static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - int ret = snapshot_list_has_id(s, id) - ? 
0 - : darray_push(s, id); - if (ret) - bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); - return ret; -} - -static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src) -{ - darray_for_each(*src, i) { - int ret = snapshot_list_add_nodup(c, dst, *i); - if (ret) - return ret; - } - - return 0; -} - -int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot *s); -int bch2_snapshot_get_subvol(struct btree_trans *, u32, - struct bch_subvolume *); - -/* only exported for tests: */ -int bch2_snapshot_node_create(struct btree_trans *, u32, - u32 *, u32 *, unsigned); - -int bch2_check_snapshot_trees(struct bch_fs *); -int bch2_check_snapshots(struct bch_fs *); -int bch2_reconstruct_snapshots(struct bch_fs *); - -int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); - -static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot)) - ? 0 - : __bch2_check_key_has_snapshot(trans, iter, k); -} - -int __bch2_get_snapshot_overwrites(struct btree_trans *, - enum btree_id, struct bpos, - snapshot_id_list *); - -/* - * Get a list of snapshot IDs that have overwritten a given key: - */ -static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - snapshot_id_list *s) -{ - darray_init(s); - - return bch2_snapshot_has_children(trans->c, pos.snapshot) - ? __bch2_get_snapshot_overwrites(trans, btree, pos, s) - : 0; - -} - -int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); - -int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); - -static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0) - return 0; - - return __bch2_key_has_snapshot_overwrites(trans, id, pos); -} - -int __bch2_delete_dead_snapshots(struct bch_fs *); -int bch2_delete_dead_snapshots(struct bch_fs *); -void bch2_delete_dead_snapshots_work(struct work_struct *); -void bch2_delete_dead_snapshots_async(struct bch_fs *); -void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *); - -int bch2_snapshots_read(struct bch_fs *); -void bch2_fs_snapshots_exit(struct bch_fs *); -void bch2_fs_snapshots_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h deleted file mode 100644 index 9bccae1f3590ad..00000000000000 --- a/fs/bcachefs/snapshot_format.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H -#define _BCACHEFS_SNAPSHOT_FORMAT_H - -struct bch_snapshot { - struct bch_val v; - __le32 flags; - __le32 parent; - __le32 children[2]; - __le32 subvol; - /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ - __le32 tree; - __le32 depth; - __le32 skip[3]; - bch_le128 btime; -}; - -LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) -/* True if a subvolume points to this snapshot node: */ -LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) - -/* - * Snapshot trees: - * - * The snapshot_trees btree gives us a persistent identifier for each tree of - bch_snapshot nodes, and allows
us to record and easily find the root/master - * subvolume that other snapshots were created from: - */ -struct bch_snapshot_tree { - struct bch_val v; - __le32 master_subvol; - __le32 root_snapshot; -}; - -#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */ diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h deleted file mode 100644 index 0ab698f13e5c6a..00000000000000 --- a/fs/bcachefs/snapshot_types.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SNAPSHOT_TYPES_H -#define _BCACHEFS_SNAPSHOT_TYPES_H - -#include "bbpos_types.h" -#include "darray.h" -#include "subvolume_types.h" - -typedef DARRAY(u32) snapshot_id_list; - -#define IS_ANCESTOR_BITMAP 128 - -struct snapshot_t { - enum snapshot_id_state { - SNAPSHOT_ID_empty, - SNAPSHOT_ID_live, - SNAPSHOT_ID_deleted, - } state; - u32 parent; - u32 skip[3]; - u32 depth; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 tree; - unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; -}; - -struct snapshot_table { - struct rcu_head rcu; - size_t nr; -#ifndef RUST_BINDGEN - DECLARE_FLEX_ARRAY(struct snapshot_t, s); -#else - struct snapshot_t s[0]; -#endif -}; - -struct snapshot_interior_delete { - u32 id; - u32 live_child; -}; -typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; - -struct snapshot_delete { - struct mutex lock; - struct work_struct work; - - struct mutex progress_lock; - snapshot_id_list deleting_from_trees; - snapshot_id_list delete_leaves; - interior_delete_list delete_interior; - - bool running; - struct bbpos pos; -}; - -#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */ diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c deleted file mode 100644 index 3e9f59226bdf2b..00000000000000 --- a/fs/bcachefs/str_hash.c +++ /dev/null @@ -1,400 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_update.h" -#include "dirent.h" -#include "fsck.h" -#include "str_hash.h" -#include "subvolume.h" - -static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) -{ - if (d.v->d_type == DT_SUBVOL) { - struct bch_subvolume subvol; - int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol), - false, &subvol); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - return !ret; - } else { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -} - -static int bch2_fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old, - bool *updated_before_k_pos) -{ - struct bch_fs *c = trans->c; - struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bkey_dirent_init(&new->k_i); - dirent_copy_target(new, old); - new->k.p = old.k->p; - - char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20); - ret = PTR_ERR_OR_ZERO(renamed_buf); - if (ret) - return ret; - - for (unsigned i = 0; i < 1000; i++) { - new->k.u64s = BKEY_U64s_MAX; - - struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf, - sprintf(renamed_buf, "%.*s.fsck_renamed-%u", - 
old_name.len, old_name.name, i)); - - ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL); - if (ret) - return ret; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - (subvol_inum) { 0, old.k->p.inode }, - old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create); - if (ret && !bch2_err_matches(ret, EEXIST)) - break; - if (!ret) { - if (bpos_lt(new->k.p, old.k->p)) - *updated_before_k_pos = true; - break; - } - } - - ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); - bch_err_fn(c, ret); - return ret; -} - -static noinline int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) -{ - if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && - !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) - return 0; - - switch (desc.btree_id) { - case BTREE_ID_dirents: { - int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1)); - if (ret < 0) - return ret; - if (!ret) - return 0; - - ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2)); - if (ret < 0) - return ret; - if (!ret) - return 1; - return 2; - } - default: - return 0; - } -} - -/* - * str_hash lookups across snapshots break in wild ways if hash_info in - * different snapshot versions doesn't match - so if we find one mismatch, check - * them all - */ -int bch2_repair_inode_hash_info(struct btree_trans *trans, - struct bch_inode_unpacked *snapshot_root) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - bool need_commit = false; - int ret = 0; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, - POS(0, snapshot_root->bi_inum), - BTREE_ITER_all_snapshots, k, ret) { - if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot))) - break; - if (!bkey_is_inode(k.k)) - continue; - - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); - if (ret) - break; - - if (inode.bi_hash_seed == snapshot_root->bi_hash_seed && - INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root); - struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); - - BUG_ON(hash1.type != hash2.type || - memcmp(&hash1.siphash_key, - &hash2.siphash_key, - sizeof(hash1.siphash_key))); -#endif - continue; - } - - printbuf_reset(&buf); - prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n", - snapshot_root->bi_inum, - inode.bi_snapshot, - snapshot_root->bi_snapshot); - - bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode)); - prt_printf(&buf, " %llx\n", inode.bi_hash_seed); - - bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); - prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed); - - if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) { - inode.bi_hash_seed = snapshot_root->bi_hash_seed; - SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); - - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - break; - need_commit = true; - } - } - - if (ret) - goto err; - - if (!need_commit) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", - snapshot_root->bi_inum); - - prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot); - bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); - 
prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed); -#if 0 - prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); - bch2_prt_str_hash_type(&buf, hash_info->type); - prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); -#endif - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; - } - - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; -err: -fsck_err: - printbuf_exit(&buf); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * All versions of the same inode in different snapshots must have the same hash - * seed/type: verify that the hash info we're using matches the root - */ -static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, - struct bch_hash_info *hash_info) -{ - struct bch_inode_unpacked snapshot_root; - int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); - if (ret) - return ret; - - struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root); - if (hash_info->type != hash_root.type || - memcmp(&hash_info->siphash_key, - &hash_root.siphash_key, - sizeof(hash_root.siphash_key))) - ret = bch2_repair_inode_hash_info(trans, &snapshot_root); - - return ret; -} - -/* Put a str_hash key in its proper location, checking for duplicates */ -int bch2_str_hash_repair_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc *desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c k, - struct btree_iter *dup_iter, struct bkey_s_c dup_k, - bool *updated_before_k_pos) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - bool free_snapshots_seen = false; - int ret = 0; - - if (!s) { - s = bch2_trans_kmalloc(trans, sizeof(*s)); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - goto out; - - s->pos = k_iter->pos; - darray_init(&s->ids); - - ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids); - if (ret) - goto out; - - free_snapshots_seen = true; - } - - if (!dup_k.k) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info, - (subvol_inum) { 0, new->k.p.inode }, - new->k.p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(dup_k); - if (ret) - goto out; - if (dup_k.k) - goto duplicate_entries; - - if (bpos_lt(new->k.p, k.k->p)) - *updated_before_k_pos = true; - - ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id, - k_iter->pos, new->k.p) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_commit; - } else { -duplicate_entries: - ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? 
"" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, dup_k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0); - break; - case 2: - ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info, - bkey_s_c_to_dirent(k), - updated_before_k_pos) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_ITER_with_updates); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_commit; - } -out: -fsck_err: - bch2_trans_iter_exit(trans, dup_iter); - printbuf_exit(&buf); - if (free_snapshots_seen) - darray_exit(&s->ids); - return ret; -} - -int __bch2_str_hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc *desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k, - bool *updated_before_k_pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = {}; - struct printbuf buf = PRINTBUF; - struct bkey_s_c k; - int ret = 0; - - u64 hash = desc->hash_bkey(hash_info, hash_k); - if (hash_k.k->p.offset < hash) - goto bad_hash; - - for_each_btree_key_norestart(trans, iter, desc->btree_id, - SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots| - BTREE_ITER_with_updates, k, ret) { - if (bkey_eq(k.k->p, hash_k.k->p)) - break; - - if (k.k->type == desc->key_type && - !desc->cmp_bkey(k, hash_k)) { - ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, - hash_info) ?: - bch2_str_hash_repair_key(trans, s, desc, hash_info, - k_iter, hash_k, - &iter, k, updated_before_k_pos); - break; - } - - if (bkey_deleted(k.k)) - goto bad_hash; - } - bch2_trans_iter_exit(trans, &iter); -out: -fsck_err: - printbuf_exit(&buf); - return ret; -bad_hash: - bch2_trans_iter_exit(trans, &iter); - /* - * Before doing any repair, check hash_info itself: - */ - ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); - if (ret) - goto out; - - if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: should be at %llu\n%s", - hash, - (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) - ret = bch2_str_hash_repair_key(trans, s, desc, hash_info, - k_iter, hash_k, - &iter, bkey_s_c_null, - updated_before_k_pos); - goto out; -} diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h deleted file mode 100644 index 8979ac2d7a3bed..00000000000000 --- a/fs/bcachefs/str_hash.h +++ /dev/null @@ -1,431 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_STR_HASH_H -#define _BCACHEFS_STR_HASH_H - -#include "btree_iter.h" -#include "btree_update.h" -#include "checksum.h" -#include "error.h" -#include "inode.h" -#include "siphash.h" -#include "subvolume.h" -#include "super.h" - -#include -#include - -static inline enum bch_str_hash_type -bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) -{ - switch (opt) { - case BCH_STR_HASH_OPT_crc32c: - return BCH_STR_HASH_crc32c; - case BCH_STR_HASH_OPT_crc64: - return BCH_STR_HASH_crc64; - case BCH_STR_HASH_OPT_siphash: - return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) - ? 
BCH_STR_HASH_siphash - : BCH_STR_HASH_siphash_old; - default: - BUG(); - } -} - -struct bch_hash_info { - u32 inum_snapshot; - u8 type; - struct unicode_map *cf_encoding; - /* - * For crc32 or crc64 string hashes the first key value of - * the siphash_key (k0) is used as the key. - */ - SIPHASH_KEY siphash_key; -}; - -static inline struct bch_hash_info -bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) -{ - struct bch_hash_info info = { - .inum_snapshot = bi->bi_snapshot, - .type = INODE_STR_HASH(bi), - .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, - .siphash_key = { .k0 = bi->bi_hash_seed } - }; - - if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { - u8 digest[SHA256_DIGEST_SIZE]; - - sha256((const u8 *)&bi->bi_hash_seed, - sizeof(bi->bi_hash_seed), digest); - memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); - } - - return info; -} - -struct bch_str_hash_ctx { - union { - u32 crc32c; - u64 crc64; - SIPHASH_CTX siphash; - }; -}; - -static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info) -{ - switch (info->type) { - case BCH_STR_HASH_crc32c: - ctx->crc32c = crc32c(~0, &info->siphash_key.k0, - sizeof(info->siphash_key.k0)); - break; - case BCH_STR_HASH_crc64: - ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, - sizeof(info->siphash_key.k0)); - break; - case BCH_STR_HASH_siphash_old: - case BCH_STR_HASH_siphash: - SipHash24_Init(&ctx->siphash, &info->siphash_key); - break; - default: - BUG(); - } -} - -static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info, - const void *data, size_t len) -{ - switch (info->type) { - case BCH_STR_HASH_crc32c: - ctx->crc32c = crc32c(ctx->crc32c, data, len); - break; - case BCH_STR_HASH_crc64: - ctx->crc64 = crc64_be(ctx->crc64, data, len); - break; - case BCH_STR_HASH_siphash_old: - case BCH_STR_HASH_siphash: - SipHash24_Update(&ctx->siphash, data, len); - break; - default: - BUG(); - } -} - -static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info) -{ - switch (info->type) { - case BCH_STR_HASH_crc32c: - return ctx->crc32c; - case BCH_STR_HASH_crc64: - return ctx->crc64 >> 1; - case BCH_STR_HASH_siphash_old: - case BCH_STR_HASH_siphash: - return SipHash24_End(&ctx->siphash) >> 1; - default: - BUG(); - } -} - -struct bch_hash_desc { - enum btree_id btree_id; - u8 key_type; - - u64 (*hash_key)(const struct bch_hash_info *, const void *); - u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); - bool (*cmp_key)(struct bkey_s_c, const void *); - bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); - bool (*is_visible)(subvol_inum inum, struct bkey_s_c); -}; - -static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) -{ - return k.k->type == desc.key_type && - (!desc.is_visible || - !inum.inum || - desc.is_visible(inum, k)); -} - -static __always_inline struct bkey_s_c -bch2_hash_lookup_in_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key, - enum btree_iter_update_trigger_flags flags, - u32 snapshot) -{ - struct bkey_s_c k; - int ret; - - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(inum.inum, desc.hash_key(info, key), snapshot), - POS(inum.inum, U64_MAX), - BTREE_ITER_slots|flags, k, ret) { - if (is_visible_key(desc, inum, k)) { - if (!desc.cmp_key(k, key)) - return k; - 
} else if (k.k->type == KEY_TYPE_hash_whiteout) { - ; - } else { - /* hole, not found */ - break; - } - } - bch2_trans_iter_exit(trans, iter); - - return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup); -} - -static __always_inline struct bkey_s_c -bch2_hash_lookup(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key, - enum btree_iter_update_trigger_flags flags) -{ - u32 snapshot; - int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return bkey_s_c_err(ret); - - return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); -} - -static __always_inline int -bch2_hash_hole(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key) -{ - struct bkey_s_c k; - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(inum.inum, desc.hash_key(info, key), snapshot), - POS(inum.inum, U64_MAX), - BTREE_ITER_slots|BTREE_ITER_intent, k, ret) - if (!is_visible_key(desc, inum, k)) - return 0; - bch2_trans_iter_exit(trans, iter); - - return ret ?: -BCH_ERR_ENOSPC_str_hash_create; -} - -static __always_inline -int bch2_hash_needs_whiteout(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *start) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_copy_iter(trans, &iter, start); - - bch2_btree_iter_advance(trans, &iter); - - for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) { - if (k.k->type != desc.key_type && - k.k->type != KEY_TYPE_hash_whiteout) - break; - - if (k.k->type == desc.key_type && - desc.hash_bkey(info, k) <= start->pos.offset) { - ret = 1; - break; - } - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static __always_inline -struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, u32 snapshot, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter slot = {}; - struct bkey_s_c k; - bool found = false; - int ret; - - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(insert->k.p.inode, - desc.hash_bkey(info, bkey_i_to_s_c(insert)), - snapshot), - POS(insert->k.p.inode, U64_MAX), - BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) { - if (is_visible_key(desc, inum, k)) { - if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) - goto found; - - /* hash collision: */ - continue; - } - - if (!slot.path && !(flags & STR_HASH_must_replace)) - bch2_trans_copy_iter(trans, &slot, iter); - - if (k.k->type != KEY_TYPE_hash_whiteout) - goto not_found; - } - - if (!ret) - ret = bch_err_throw(c, ENOSPC_str_hash_create); -out: - bch2_trans_iter_exit(trans, &slot); - bch2_trans_iter_exit(trans, iter); - return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; -found: - found = true; -not_found: - if (found && (flags & STR_HASH_must_create)) { - bch2_trans_iter_exit(trans, &slot); - return k; - } else if (!found && (flags & STR_HASH_must_replace)) { - ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace); - } else { - if (!found && slot.path) - swap(*iter, slot); - - insert->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, insert, flags); - } - - goto out; -} - -static __always_inline -int bch2_hash_set_in_snapshot(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, u32 snapshot, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum, - snapshot, insert, flags); - int ret = bkey_err(k); - if (ret) - return ret; - if (k.k) { - bch2_trans_iter_exit(trans, &iter); - return bch_err_throw(trans->c, EEXIST_str_hash_set); - } - - return 0; -} - -static __always_inline -int bch2_hash_set(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - insert->k.p.inode = inum.inum; - - u32 snapshot; - return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: - bch2_hash_set_in_snapshot(trans, desc, info, inum, - snapshot, insert, flags); -} - -static __always_inline -int bch2_hash_delete_at(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i *delete; - int ret; - - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); - ret = PTR_ERR_OR_ZERO(delete); - if (ret) - return ret; - - ret = bch2_hash_needs_whiteout(trans, desc, info, iter); - if (ret < 0) - return ret; - - bkey_init(&delete->k); - delete->k.p = iter->pos; - delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - - return bch2_trans_update(trans, iter, delete, flags); -} - -static __always_inline -int bch2_hash_delete(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key, - BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); - -struct snapshots_seen; -int bch2_str_hash_repair_key(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc *, - struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c, - struct btree_iter *, struct bkey_s_c, - bool *); - -int __bch2_str_hash_check_key(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc *, - struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c, - bool *); - -static inline int bch2_str_hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc *desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k, - bool *updated_before_k_pos) -{ - if (hash_k.k->type != desc->key_type) - return 0; - - if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) - return 0; - - return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k, - updated_before_k_pos); -} - -#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c deleted file mode 100644 index 020587449123b1..00000000000000 --- a/fs/bcachefs/subvolume.c +++ /dev/null @@ -1,752 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "fs.h" -#include "recovery_passes.h" -#include "snapshot.h" -#include "subvolume.h" - -#include - -static int bch2_subvolume_delete(struct btree_trans *, u32); - -static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid) -{ - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "missing subvolume %u", subvolid); - bool print = bch2_count_fsck_err(c, subvol_missing, &buf); - - int ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_inodes, 0); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return ret; -} - -static struct bpos subvolume_children_pos(struct bkey_s_c k) -{ - if (k.k->type != KEY_TYPE_subvolume) - return POS_MIN; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - if (!s.v->fs_path_parent) - return POS_MIN; - return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset); -} - -static int check_subvol(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_subvolume subvol; - struct btree_iter subvol_children_iter = {}; - struct bch_snapshot snapshot; - struct printbuf buf = PRINTBUF; - unsigned snapid; - int ret = 0; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - subvol = bkey_s_c_to_subvolume(k); - snapid = le32_to_cpu(subvol.v->snapshot); - ret = bch2_snapshot_lookup(trans, snapid, &snapshot); - - if (bch2_err_matches(ret, ENOENT)) - return bch2_run_print_explicit_recovery_pass(c, - 
BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - if (ret) - return ret; - - if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { - ret = bch2_subvolume_delete(trans, iter->pos.offset); - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); - return ret ?: -BCH_ERR_transaction_restart_nested; - } - - if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && - subvol.v->fs_path_parent, - trans, subvol_root_fs_path_parent_nonzero, - "root subvolume has nonzero fs_path_parent\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.fs_path_parent = 0; - } - - if (subvol.v->fs_path_parent) { - struct bpos pos = subvolume_children_pos(k); - - struct bkey_s_c subvol_children_k = - bch2_bkey_get_iter(trans, &subvol_children_iter, - BTREE_ID_subvolume_children, pos, 0); - ret = bkey_err(subvol_children_k); - if (ret) - goto err; - - if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, - trans, subvol_children_not_set, - "subvolume not set in subvolume_children btree at %llu:%llu\n%s", - pos.inode, pos.offset, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); - if (ret) - goto err; - } - } - - struct bch_inode_unpacked inode; - ret = bch2_inode_find_by_inum_nowarn_trans(trans, - (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, - &inode); - if (!ret) { - if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, - trans, subvol_root_wrong_bi_subvol, - "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", - inode.bi_inum, inode.bi_snapshot, - inode.bi_subvol, subvol.k->p.offset)) { - inode.bi_subvol = subvol.k->p.offset; - inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - goto err; - } - } else if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(trans, subvol_to_missing_root, - "subvolume %llu points to missing subvolume root %llu:%u", - k.k->p.offset, le64_to_cpu(subvol.v->inode), - le32_to_cpu(subvol.v->snapshot))) { - /* - * Recreate - any contents that are still disconnected - * will then get reattached under lost+found - */ - bch2_inode_init_early(c, &inode); - bch2_inode_init_late(c, &inode, bch2_current_time(c), - 0, 0, S_IFDIR|0700, 0, NULL); - inode.bi_inum = le64_to_cpu(subvol.v->inode); - inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); - inode.bi_subvol = k.k->p.offset; - inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent); - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - goto err; - } - } else { - goto err; - } - - if (!BCH_SUBVOLUME_SNAP(subvol.v)) { - u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); - u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root); - - struct bch_snapshot_tree st; - ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); - - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "%s: snapshot tree %u not found", __func__, snapshot_tree); - - if (ret) - goto err; - - if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, - trans, subvol_not_master_and_not_snapshot, - "subvolume %llu is not set as snapshot but is not master subvolume", - k.k->p.offset)) { - struct bkey_i_subvolume *s = - bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - goto err; - - 
SET_BCH_SUBVOLUME_SNAP(&s->v, true); - } - } -err: -fsck_err: - bch2_trans_iter_exit(trans, &subvol_children_iter); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_subvols(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int check_subvol_child(struct btree_trans *trans, - struct btree_iter *child_iter, - struct bkey_s_c child_k) -{ - struct bch_subvolume s; - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), - 0, subvolume, &s); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (fsck_err_on(ret || - le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, - trans, subvol_children_bad, - "incorrect entry in subvolume_children btree %llu:%llu", - child_k.k->p.inode, child_k.k->p.offset)) { - ret = bch2_btree_delete_at(trans, child_iter, 0); - if (ret) - goto err; - } -err: -fsck_err: - return ret; -} - -int bch2_check_subvol_children(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol_child(trans, &iter, k))); - bch_err_fn(c, ret); - return 0; -} - -/* Subvolumes: */ - -int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); - int ret = 0; - - bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || - bkey_gt(k.k->p, SUBVOL_POS_MAX), - c, subvol_pos_bad, - "invalid pos"); - - bkey_fsck_err_on(!subvol.v->snapshot, - c, subvol_snapshot_bad, - "invalid snapshot"); - - bkey_fsck_err_on(!subvol.v->inode, - c, subvol_inode_bad, - "invalid inode"); -fsck_err: - return ret; -} - -void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - - prt_printf(out, "root %llu snapshot id %u", - le64_to_cpu(s.v->inode), - le32_to_cpu(s.v->snapshot)); - - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) { - prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); - prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); - } - - if (BCH_SUBVOLUME_RO(s.v)) - prt_printf(out, " ro"); - if (BCH_SUBVOLUME_SNAP(s.v)) - prt_printf(out, " snapshot"); - if (BCH_SUBVOLUME_UNLINKED(s.v)) - prt_printf(out, " unlinked"); -} - -static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) -{ - return !bpos_eq(pos, POS_MIN) - ? 
bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set) - : 0; -} - -int bch2_subvolume_trigger(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - if (flags & BTREE_TRIGGER_transactional) { - struct bpos children_pos_old = subvolume_children_pos(old); - struct bpos children_pos_new = subvolume_children_pos(new.s_c); - - if (!bpos_eq(children_pos_old, children_pos_new)) { - int ret = subvolume_children_mod(trans, children_pos_old, false) ?: - subvolume_children_mod(trans, children_pos_new, true); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) -{ - struct btree_iter iter; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); - struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter); - bch2_trans_iter_exit(trans, &iter); - - return bkey_err(k) ?: k.k && k.k->p.inode == subvol - ? -BCH_ERR_ENOTEMPTY_subvol_not_empty - : 0; -} - -static __always_inline int -bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, - bool inconsistent_if_not_found, - struct bch_subvolume *s) -{ - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_cached| - BTREE_ITER_with_updates, subvolume, s); - if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found) - ret = bch2_subvolume_missing(trans->c, subvol) ?: ret; - return ret; -} - -int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, - bool inconsistent_if_not_found, - struct bch_subvolume *s) -{ - return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s); -} - -int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) -{ - struct bch_subvolume s; - int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s); - if (ret) - return ret; - - if (BCH_SUBVOLUME_RO(&s)) - return -EROFS; - return 0; -} - -int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol) -{ - return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol)); -} - -int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, - struct bch_subvolume *subvol) -{ - struct bch_snapshot snap; - - return bch2_snapshot_lookup(trans, snapshot, &snap) ?: - bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol); -} - -int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, - u32 *snapid, bool warn) -{ - struct btree_iter iter; - struct bkey_s_c_subvolume subvol; - int ret; - - subvol = bch2_bkey_get_iter_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_cached|BTREE_ITER_with_updates, - subvolume); - ret = bkey_err(subvol); - - if (bch2_err_matches(ret, ENOENT)) - ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; - - if (likely(!ret)) - *snapid = le32_to_cpu(subvol.v->snapshot); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, - u32 *snapid) -{ - return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true); -} - -static int bch2_subvolume_reparent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - u32 old_parent, u32 new_parent) -{ - struct bkey_i_subvolume *s; - int ret; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent) - return 0; - - s = 
bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
-	ret = PTR_ERR_OR_ZERO(s);
-	if (ret)
-		return ret;
-
-	s->v.creation_parent = cpu_to_le32(new_parent);
-	return 0;
-}
-
-/*
- * Separate from the snapshot tree in the snapshots btree, we record the tree
- * structure of how snapshot subvolumes were created - the parent subvolume of
- * each snapshot subvolume.
- *
- * When a subvolume is deleted, we scan for child subvolumes and reparent them,
- * to avoid dangling references:
- */
-static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
-{
-	struct bch_subvolume s;
-
-	return lockrestart_do(trans,
-			bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?:
-		for_each_btree_key_commit(trans, iter,
-				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			bch2_subvolume_reparent(trans, &iter, k,
-					subvolid_to_delete, le32_to_cpu(s.creation_parent)));
-}
-
-/*
- * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
- * deletion/cleanup:
- */
-static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
-	struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {};
-
-	struct bkey_s_c_subvolume subvol =
-		bch2_bkey_get_iter_typed(trans, &subvol_iter,
-				BTREE_ID_subvolumes, POS(0, subvolid),
-				BTREE_ITER_cached|BTREE_ITER_intent,
-				subvolume);
-	int ret = bkey_err(subvol);
-	if (bch2_err_matches(ret, ENOENT))
-		ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
-	if (ret)
-		goto err;
-
-	u32 snapid = le32_to_cpu(subvol.v->snapshot);
-
-	struct bkey_s_c_snapshot snapshot =
-		bch2_bkey_get_iter_typed(trans, &snapshot_iter,
-				BTREE_ID_snapshots, POS(0, snapid),
-				0, snapshot);
-	ret = bkey_err(snapshot);
-	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
-				"missing snapshot %u", snapid);
-	if (ret)
-		goto err;
-
-	u32 treeid = le32_to_cpu(snapshot.v->tree);
-
-	struct bkey_s_c_snapshot_tree snapshot_tree =
-		bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
-				BTREE_ID_snapshot_trees, POS(0, treeid),
-				0, snapshot_tree);
-	ret = bkey_err(snapshot_tree);
-	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
-				"missing snapshot tree %u", treeid);
-	if (ret)
-		goto err;
-
-	if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
-		struct bkey_i_snapshot_tree *snapshot_tree_mut =
-			bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter,
-					&snapshot_tree.s_c,
-					0, snapshot_tree);
-		ret = PTR_ERR_OR_ZERO(snapshot_tree_mut);
-		if (ret)
-			goto err;
-
-		snapshot_tree_mut->v.master_subvol = 0;
-	}
-
-	ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
-		bch2_snapshot_node_set_deleted(trans, snapid);
-err:
-	bch2_trans_iter_exit(trans, &snapshot_tree_iter);
-	bch2_trans_iter_exit(trans, &snapshot_iter);
-	bch2_trans_iter_exit(trans, &subvol_iter);
-	return ret;
-}
-
-static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
-	int ret = bch2_subvolumes_reparent(trans, subvolid) ?:
-		commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			  __bch2_subvolume_delete(trans, subvolid));
-
-	bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols);
-	return ret;
-}
-
-static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(work, struct bch_fs,
-				snapshot_wait_for_pagecache_and_delete_work);
-	int ret = 0;
-
-	while (!ret) {
-		mutex_lock(&c->snapshots_unlinked_lock);
-		snapshot_id_list s = c->snapshots_unlinked;
-
darray_init(&c->snapshots_unlinked); - mutex_unlock(&c->snapshots_unlinked_lock); - - if (!s.nr) - break; - - bch2_evict_subvolume_inodes(c, &s); - - darray_for_each(s, id) { - ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); - bch_err_msg(c, ret, "deleting subvolume %u", *id); - if (ret) - break; - } - - darray_exit(&s); - } - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); -} - -struct subvolume_unlink_hook { - struct btree_trans_commit_hook h; - u32 subvol; -}; - -static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *_h) -{ - struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); - struct bch_fs *c = trans->c; - int ret = 0; - - mutex_lock(&c->snapshots_unlinked_lock); - if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) - ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); - mutex_unlock(&c->snapshots_unlinked_lock); - - if (ret) - return ret; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache)) - return -EROFS; - - if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); - return 0; -} - -int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) -{ - struct btree_iter iter; - struct bkey_i_subvolume *n; - struct subvolume_unlink_hook *h; - int ret = 0; - - h = bch2_trans_kmalloc(trans, sizeof(*h)); - ret = PTR_ERR_OR_ZERO(h); - if (ret) - return ret; - - h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; - h->subvol = subvolid; - bch2_trans_commit_hook(trans, &h->h); - - n = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_cached, subvolume); - ret = PTR_ERR_OR_ZERO(n); - if (bch2_err_matches(ret, ENOENT)) - ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; - if (unlikely(ret)) - return ret; - - SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); - n->v.fs_path_parent = 0; - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_subvolume_create(struct btree_trans *trans, u64 inode, - u32 parent_subvolid, - u32 src_subvolid, - u32 *new_subvolid, - u32 *new_snapshotid, - bool ro) -{ - struct bch_fs *c = trans->c; - struct btree_iter dst_iter, src_iter = {}; - struct bkey_i_subvolume *new_subvol = NULL; - struct bkey_i_subvolume *src_subvol = NULL; - u32 parent = 0, new_nodes[2], snapshot_subvols[2]; - int ret = 0; - - ret = bch2_bkey_get_empty_slot(trans, &dst_iter, - BTREE_ID_subvolumes, POS(0, U32_MAX)); - if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = bch_err_throw(c, ENOSPC_subvolume_create); - if (ret) - return ret; - - snapshot_subvols[0] = dst_iter.pos.offset; - snapshot_subvols[1] = src_subvolid; - - if (src_subvolid) { - /* Creating a snapshot: */ - - src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, - BTREE_ID_subvolumes, POS(0, src_subvolid), - BTREE_ITER_cached, subvolume); - ret = PTR_ERR_OR_ZERO(src_subvol); - if (bch2_err_matches(ret, ENOENT)) - ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret; - if (unlikely(ret)) - goto err; - - parent = le32_to_cpu(src_subvol->v.snapshot); - } - - ret = bch2_snapshot_node_create(trans, parent, new_nodes, - snapshot_subvols, - src_subvolid ? 
2 : 1); - if (ret) - goto err; - - if (src_subvolid) { - src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); - ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); - if (ret) - goto err; - } - - new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume); - ret = PTR_ERR_OR_ZERO(new_subvol); - if (ret) - goto err; - - new_subvol->v.flags = 0; - new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); - new_subvol->v.inode = cpu_to_le64(inode); - new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); - new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid); - new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); - new_subvol->v.otime.hi = 0; - - SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); - SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); - - *new_subvolid = new_subvol->k.p.offset; - *new_snapshotid = new_nodes[0]; -err: - bch2_trans_iter_exit(trans, &src_iter); - bch2_trans_iter_exit(trans, &dst_iter); - return ret; -} - -int bch2_initialize_subvolumes(struct bch_fs *c) -{ - struct bkey_i_snapshot_tree root_tree; - struct bkey_i_snapshot root_snapshot; - struct bkey_i_subvolume root_volume; - int ret; - - bkey_snapshot_tree_init(&root_tree.k_i); - root_tree.k.p.offset = 1; - root_tree.v.master_subvol = cpu_to_le32(1); - root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); - - bkey_snapshot_init(&root_snapshot.k_i); - root_snapshot.k.p.offset = U32_MAX; - root_snapshot.v.flags = 0; - root_snapshot.v.parent = 0; - root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); - root_snapshot.v.tree = cpu_to_le32(1); - SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); - - bkey_subvolume_init(&root_volume.k_i); - root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_volume.v.flags = 0; - root_volume.v.snapshot = cpu_to_le32(U32_MAX); - root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); - ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_is_inode(k.k)) { - struct bch_fs *c = trans->c; - bch_err(c, "root inode not found"); - ret = bch_err_throw(c, ENOENT_inode); - goto err; - } - - ret = bch2_inode_unpack(k, &inode); - BUG_ON(ret); - - inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - - ret = bch2_inode_write(trans, &iter, &inode); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* set bi_subvol on root inode */ -int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) -{ - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_fs_upgrade_for_subvolumes(trans)); - bch_err_fn(c, ret); - return ret; -} - -void bch2_fs_subvolumes_init_early(struct bch_fs *c) -{ - INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, - bch2_subvolume_wait_for_pagecache_and_delete); -} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h deleted file mode 100644 index 075f55e25c7048..00000000000000 --- a/fs/bcachefs/subvolume.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUBVOLUME_H -#define _BCACHEFS_SUBVOLUME_H - -#include "darray.h" -#include 
"subvolume_types.h" - -int bch2_check_subvols(struct bch_fs *); -int bch2_check_subvol_children(struct bch_fs *); - -int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ - .key_validate = bch2_subvolume_validate, \ - .val_to_text = bch2_subvolume_to_text, \ - .trigger = bch2_subvolume_trigger, \ - .min_val_size = 16, \ -}) - -int bch2_subvol_has_children(struct btree_trans *, u32); -int bch2_subvolume_get(struct btree_trans *, unsigned, - bool, struct bch_subvolume *); -int __bch2_subvolume_get_snapshot(struct btree_trans *, u32, - u32 *, bool); -int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); - -int bch2_subvol_is_ro_trans(struct btree_trans *, u32); -int bch2_subvol_is_ro(struct bch_fs *, u32); - -static inline struct bkey_s_c -bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, u32 subvolid, unsigned flags) -{ - u32 snapshot; - int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot); - if (ret) - return bkey_s_c_err(ret); - - bch2_btree_iter_set_snapshot(trans, iter, snapshot); - return bch2_btree_iter_peek_max_type(trans, iter, end, flags); -} - -#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ - _end, _subvolid, _flags, _k, _do) \ -({ \ - struct bkey_s_c _k; \ - int _ret3 = 0; \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\ - _end, _subvolid, (_flags)); \ - if (!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ - } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \ - _start, _end, _subvolid, _flags, _k, _do) \ -({ \ - struct btree_iter _iter; \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ - _end, _subvolid, _flags, _k, _do); \ -}) - -int bch2_subvolume_unlink(struct btree_trans *, u32); -int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); - -int bch2_initialize_subvolumes(struct bch_fs *); -int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); - -void bch2_fs_subvolumes_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h deleted file mode 100644 index e029df7ba89f52..00000000000000 --- a/fs/bcachefs/subvolume_format.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H -#define _BCACHEFS_SUBVOLUME_FORMAT_H - -#define SUBVOL_POS_MIN POS(0, 1) -#define SUBVOL_POS_MAX POS(0, S32_MAX) -#define BCACHEFS_ROOT_SUBVOL 1 - -struct bch_subvolume { - struct bch_val v; - __le32 flags; - __le32 snapshot; - __le64 inode; - /* - * Snapshot subvolumes form a tree, separate from the snapshot nodes - * tree - if this subvolume is a snapshot, this is the ID of the - * subvolume it was created from: - * - * This is _not_ necessarily the subvolume of the directory containing - * this subvolume: - */ - __le32 creation_parent; - __le32 fs_path_parent; - 
bch_le128 otime; -}; - -LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) -/* - * We need to know whether a subvolume is a snapshot so we can know whether we - * can delete it (or whether it should just be rm -rf'd) - */ -LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) -LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) - -#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h deleted file mode 100644 index 9d634b906dcda3..00000000000000 --- a/fs/bcachefs/subvolume_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUBVOLUME_TYPES_H -#define _BCACHEFS_SUBVOLUME_TYPES_H - -typedef struct { - /* we can't have padding in this struct: */ - u64 subvol; - u64 inum; -} subvol_inum; - -#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c deleted file mode 100644 index 6c2e1d647403f2..00000000000000 --- a/fs/bcachefs/super-io.c +++ /dev/null @@ -1,1562 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "checksum.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "journal.h" -#include "journal_sb.h" -#include "journal_seq_blacklist.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "quota.h" -#include "sb-clean.h" -#include "sb-counters.h" -#include "sb-downgrade.h" -#include "sb-errors.h" -#include "sb-members.h" -#include "super-io.h" -#include "super.h" -#include "trace.h" -#include "vstructs.h" - -#include -#include -#include - -struct bch2_metadata_version { - u16 version; - const char *name; -}; - -static const struct bch2_metadata_version bch2_metadata_versions[] = { -#define x(n, v) { \ - .version = v, \ - .name = #n, \ -}, - BCH_METADATA_VERSIONS() -#undef x -}; - -void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v) -{ - const char *str = "(unknown version)"; - - for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) - if (bch2_metadata_versions[i].version == v) { - str = bch2_metadata_versions[i].name; - break; - } - - prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); -} - -enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v) -{ - if (!BCH_VERSION_MAJOR(v)) - return v; - - for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) - if (bch2_metadata_versions[i].version > v && - BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == - BCH_VERSION_MAJOR(v)) - v = bch2_metadata_versions[i].version; - - return v; -} - -int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) -{ - int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && - version <= c->sb.version_incompat_allowed) - ? 
0 - : -BCH_ERR_may_not_use_incompat_feature; - - mutex_lock(&c->sb_lock); - if (!ret) { - SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); - bch2_write_super(c); - } else { - darray_for_each(c->incompat_versions_requested, i) - if (version == *i) - goto out; - - darray_push(&c->incompat_versions_requested, version); - struct printbuf buf = PRINTBUF; - prt_str(&buf, "requested incompat feature "); - bch2_version_to_text(&buf, version); - prt_str(&buf, " currently not enabled, allowed up to "); - bch2_version_to_text(&buf, version); - prt_printf(&buf, "\n set version_upgrade=incompat to enable"); - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - } - -out: - mutex_unlock(&c->sb_lock); - - return ret; -} - -const char * const bch2_sb_fields[] = { -#define x(name, nr) #name, - BCH_SB_FIELDS() -#undef x - NULL -}; - -static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, - enum bch_validate_flags, struct printbuf *); - -struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, - enum bch_sb_field_type type) -{ - /* XXX: need locking around superblock to access optional fields */ - - vstruct_for_each(sb, f) - if (le32_to_cpu(f->type) == type) - return f; - return NULL; -} - -static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, - struct bch_sb_field *f, - unsigned u64s) -{ - unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; - unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; - - BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); - - if (!f && !u64s) { - /* nothing to do: */ - } else if (!f) { - f = vstruct_last(sb->sb); - memset(f, 0, sizeof(u64) * u64s); - f->u64s = cpu_to_le32(u64s); - f->type = 0; - } else { - void *src, *dst; - - src = vstruct_end(f); - - if (u64s) { - f->u64s = cpu_to_le32(u64s); - dst = vstruct_end(f); - } else { - dst = f; - } - - memmove(dst, src, vstruct_end(sb->sb) - src); - - if (dst > src) - memset(src, 0, dst - src); - } - - sb->sb->u64s = cpu_to_le32(sb_u64s); - - return u64s ? 
f : NULL; -} - -void bch2_sb_field_delete(struct bch_sb_handle *sb, - enum bch_sb_field_type type) -{ - struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); - - if (f) - __bch2_sb_field_resize(sb, f, 0); -} - -/* Superblock realloc/free: */ - -void bch2_free_super(struct bch_sb_handle *sb) -{ - kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->s_bdev_file)) - bdev_fput(sb->s_bdev_file); - kfree(sb->holder); - kfree(sb->sb_name); - - kfree(sb->sb); - memset(sb, 0, sizeof(*sb)); -} - -int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) -{ - size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); - size_t new_buffer_size; - struct bch_sb *new_sb; - struct bio *bio; - - if (sb->bdev) - new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); - - new_buffer_size = roundup_pow_of_two(new_bytes); - - if (sb->sb && sb->buffer_size >= new_buffer_size) - return 0; - - if (sb->sb && sb->have_layout) { - u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; - - if (new_bytes > max_bytes) { - struct printbuf buf = PRINTBUF; - - prt_bdevname(&buf, sb->bdev); - prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes); - pr_err("%s", buf.buf); - printbuf_exit(&buf); - return -BCH_ERR_ENOSPC_sb; - } - } - - if (sb->buffer_size >= new_buffer_size && sb->sb) - return 0; - - if (dynamic_fault("bcachefs:add:super_realloc")) - return -BCH_ERR_ENOMEM_sb_realloc_injected; - - new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); - if (!new_sb) - return -BCH_ERR_ENOMEM_sb_buf_realloc; - - sb->sb = new_sb; - - if (sb->have_bio) { - unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); - - bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!bio) - return -BCH_ERR_ENOMEM_sb_bio_realloc; - - bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); - - kfree(sb->bio); - sb->bio = bio; - } - - sb->buffer_size = new_buffer_size; - - return 0; -} - -struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, - enum bch_sb_field_type type, - unsigned u64s) -{ - struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); - ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0;
-	ssize_t d = -old_u64s + u64s;
-
-	if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
-		return NULL;
-
-	if (sb->fs_sb) {
-		struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
-
-		lockdep_assert_held(&c->sb_lock);
-
-		/* XXX: we're not checking that offline devices have enough space */
-
-		for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) {
-			struct bch_sb_handle *dev_sb = &ca->disk_sb;
-
-			if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
-				enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize);
-				return NULL;
-			}
-		}
-	}
-
-	f = bch2_sb_field_get_id(sb->sb, type);
-	f = __bch2_sb_field_resize(sb, f, u64s);
-	if (f)
-		f->type = cpu_to_le32(type);
-	return f;
-}
-
-struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb,
-					    enum bch_sb_field_type type,
-					    unsigned u64s)
-{
-	struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
-
-	if (!f || le32_to_cpu(f->u64s) < u64s)
-		f = bch2_sb_field_resize_id(sb, type, u64s);
-	return f;
-}
-
-/* Superblock validate: */
-
-static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
-{
-	u64 offset, prev_offset, max_sectors;
-	unsigned i;
-
-	BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
-
-	if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) &&
-	    !uuid_equal(&layout->magic, &BCHFS_MAGIC)) {
-		prt_printf(out, "Not a bcachefs superblock layout");
-		return -BCH_ERR_invalid_sb_layout;
-	}
-
-	if (layout->layout_type != 0) {
-		prt_printf(out, "Invalid superblock layout type %u",
-			   layout->layout_type);
-		return -BCH_ERR_invalid_sb_layout_type;
-	}
-
-	if (!layout->nr_superblocks) {
-		prt_printf(out, "Invalid superblock layout: no superblocks");
-		return -BCH_ERR_invalid_sb_layout_nr_superblocks;
-	}
-
-	if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
-		prt_printf(out, "Invalid superblock layout: too many superblocks");
-		return -BCH_ERR_invalid_sb_layout_nr_superblocks;
-	}
-
-	if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) {
-		prt_printf(out, "Invalid superblock layout: max_size_bits too high");
-		return -BCH_ERR_invalid_sb_layout_sb_max_size_bits;
-	}
-
-	max_sectors = 1 << layout->sb_max_size_bits;
-
-	prev_offset = le64_to_cpu(layout->sb_offset[0]);
-
-	for (i = 1; i < layout->nr_superblocks; i++) {
-		offset = le64_to_cpu(layout->sb_offset[i]);
-
-		if (offset < prev_offset + max_sectors) {
-			prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
-				   "  (sb %u ends at %llu next starts at %llu)",
-				   i - 1, prev_offset + max_sectors, offset);
-			return -BCH_ERR_invalid_sb_layout_superblocks_overlap;
-		}
-		prev_offset = offset;
-	}
-
-	return 0;
-}
-
-static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
-{
-	u16 version = le16_to_cpu(sb->version);
-	u16 version_min = le16_to_cpu(sb->version_min);
-
-	if (!bch2_version_compatible(version)) {
-		prt_str(out, "Unsupported superblock version ");
-		bch2_version_to_text(out, version);
-		prt_str(out, " (min ");
-		bch2_version_to_text(out, bcachefs_metadata_version_min);
-		prt_str(out, ", max ");
-		bch2_version_to_text(out, bcachefs_metadata_version_current);
-		prt_str(out, ")");
-		return -BCH_ERR_invalid_sb_version;
-	}
-
-	if (!bch2_version_compatible(version_min)) {
-		prt_str(out, "Unsupported superblock version_min ");
-		bch2_version_to_text(out, version_min);
-		prt_str(out, " (min ");
-		bch2_version_to_text(out, bcachefs_metadata_version_min);
-		prt_str(out, ", max ");
-		bch2_version_to_text(out,
bcachefs_metadata_version_current);
-		prt_str(out, ")");
-		return -BCH_ERR_invalid_sb_version;
-	}
-
-	if (version_min > version) {
-		prt_str(out, "Bad minimum version ");
-		bch2_version_to_text(out, version_min);
-		prt_str(out, ", greater than version field ");
-		bch2_version_to_text(out, version);
-		return -BCH_ERR_invalid_sb_version;
-	}
-
-	return 0;
-}
-
-int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
-		     enum bch_validate_flags flags, struct printbuf *out)
-{
-	enum bch_opt_id opt_id;
-	int ret;
-
-	ret = bch2_sb_compatible(sb, out);
-	if (ret)
-		return ret;
-
-	u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
-	unsigned incompat_bit = 0;
-	if (incompat)
-		incompat_bit = __ffs64(incompat);
-	else if (sb->features[1])
-		incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
-
-	if (incompat_bit) {
-		prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
-			   incompat_bit,
-			   bch2_sb_features[BCH_FEATURE_NR - 1],
-			   BCH_FEATURE_NR - 1);
-		return -BCH_ERR_invalid_sb_features;
-	}
-
-	if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
-	    BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
-		prt_str(out, "Filesystem has incompatible version ");
-		bch2_version_to_text(out, le16_to_cpu(sb->version));
-		prt_str(out, ", current version ");
-		bch2_version_to_text(out, bcachefs_metadata_version_current);
-		return -BCH_ERR_invalid_sb_features;
-	}
-
-	if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
-		prt_printf(out, "Bad user UUID (got zeroes)");
-		return -BCH_ERR_invalid_sb_uuid;
-	}
-
-	if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
-		prt_printf(out, "Bad internal UUID (got zeroes)");
-		return -BCH_ERR_invalid_sb_uuid;
-	}
-
-	if (!(flags & BCH_VALIDATE_write) &&
-	    le64_to_cpu(sb->offset) != read_offset) {
-		prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
-			   le64_to_cpu(sb->offset), read_offset);
-		return -BCH_ERR_invalid_sb_offset;
-	}
-
-	if (!sb->nr_devices ||
-	    sb->nr_devices > BCH_SB_MEMBERS_MAX) {
-		prt_printf(out, "Bad number of member devices %u (max %u)",
-			   sb->nr_devices, BCH_SB_MEMBERS_MAX);
-		return -BCH_ERR_invalid_sb_too_many_members;
-	}
-
-	if (sb->dev_idx >= sb->nr_devices) {
-		prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
-			   sb->dev_idx, sb->nr_devices);
-		return -BCH_ERR_invalid_sb_dev_idx;
-	}
-
-	if (!sb->time_precision ||
-	    le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
-		prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
-			   le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
-		return -BCH_ERR_invalid_sb_time_precision;
-	}
-
-	/* old versions didn't know to downgrade this field */
-	if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version))
-		SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version));
-
-	if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) {
-		prt_printf(out, "Invalid version_incompat ");
-		bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
-		prt_str(out, " > incompat_allowed ");
-		bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
-		if (flags & BCH_VALIDATE_write)
-			return -BCH_ERR_invalid_sb_version;
-		else
-			SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
-	}
-
-	if (sb->nr_devices > 1)
-		SET_BCH_SB_MULTI_DEVICE(sb, true);
-
-	if (!flags) {
-		/*
-		 * Been seeing a bug where these are getting inexplicably
-		 * zeroed, so we're now validating them, but we have to be
-		 * careful not to prevent people's
filesystems from mounting: - */ - if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) - SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); - if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) - SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); - - if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 && - !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb)) - SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) - SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); - - if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) - SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags && - !BCH_SB_CSUM_ERR_RETRY_NR(sb)) - SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3); - } - -#ifdef __KERNEL__ - if (!BCH_SB_SHARD_INUMS_NBITS(sb)) - SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); -#endif - - for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { - const struct bch_option *opt = bch2_opt_table + opt_id; - - if (opt->get_sb) { - u64 v = bch2_opt_from_sb(sb, opt_id, -1); - - prt_printf(out, "Invalid option "); - ret = bch2_opt_validate(opt, v, out); - if (ret) - return ret; - - printbuf_reset(out); - } - } - - /* validate layout */ - ret = validate_sb_layout(&sb->layout, out); - if (ret) - return ret; - - vstruct_for_each(sb, f) { - if (!f->u64s) { - prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", - le32_to_cpu(f->type)); - return -BCH_ERR_invalid_sb_field_size; - } - - if (vstruct_next(f) > vstruct_last(sb)) { - prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", - le32_to_cpu(f->type)); - return -BCH_ERR_invalid_sb_field_size; - } - } - - struct bch_sb_field *mi = - bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?: - bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1); - - /* members must be validated first: */ - if (!mi) { - prt_printf(out, "Invalid superblock: member info area missing"); - return -BCH_ERR_invalid_sb_members_missing; - } - - ret = bch2_sb_field_validate(sb, mi, flags, out); - if (ret) - return ret; - - vstruct_for_each(sb, f) { - if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) - continue; - - ret = bch2_sb_field_validate(sb, f, flags, out); - if (ret) - return ret; - } - - if ((flags & BCH_VALIDATE_write) && - bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { - prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", - le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), - le64_to_cpu(sb->seq)); - return -BCH_ERR_invalid_sb_members_missing; - } - - return 0; -} - -/* device open: */ - -static unsigned long le_ulong_to_cpu(unsigned long v) -{ - return sizeof(unsigned long) == 8 - ? 
le64_to_cpu(v) - : le32_to_cpu(v); -} - -static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr) -{ - BUG_ON(nr & (BITS_PER_TYPE(long) - 1)); - - for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++) - dst[i] = le_ulong_to_cpu(src[i]); -} - -static void bch2_sb_update(struct bch_fs *c) -{ - struct bch_sb *src = c->disk_sb.sb; - - lockdep_assert_held(&c->sb_lock); - - c->sb.uuid = src->uuid; - c->sb.user_uuid = src->user_uuid; - c->sb.version = le16_to_cpu(src->version); - c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src); - c->sb.version_incompat_allowed - = BCH_SB_VERSION_INCOMPAT_ALLOWED(src); - c->sb.version_min = le16_to_cpu(src->version_min); - c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); - c->sb.nr_devices = src->nr_devices; - c->sb.clean = BCH_SB_CLEAN(src); - c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); - - c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); - c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; - - /* XXX this is wrong, we need a 96 or 128 bit integer type */ - c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), - c->sb.nsec_per_time_unit); - c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); - - c->sb.features = le64_to_cpu(src->features[0]); - c->sb.compat = le64_to_cpu(src->compat[0]); - c->sb.multi_device = BCH_SB_MULTI_DEVICE(src); - - memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); - if (ext) { - c->sb.recovery_passes_required = - bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - - le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, - sizeof(c->sb.errors_silent) * 8); - c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); - } - - for_each_member_device(c, ca) { - struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); - ca->mi = bch2_mi_to_cpu(&m); - } -} - -static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) -{ - struct bch_sb_field *src_f, *dst_f; - struct bch_sb *dst = dst_handle->sb; - unsigned i; - - dst->version = src->version; - dst->version_min = src->version_min; - dst->seq = src->seq; - dst->uuid = src->uuid; - dst->user_uuid = src->user_uuid; - memcpy(dst->label, src->label, sizeof(dst->label)); - - dst->block_size = src->block_size; - dst->nr_devices = src->nr_devices; - - dst->time_base_lo = src->time_base_lo; - dst->time_base_hi = src->time_base_hi; - dst->time_precision = src->time_precision; - dst->write_time = src->write_time; - - memcpy(dst->flags, src->flags, sizeof(dst->flags)); - memcpy(dst->features, src->features, sizeof(dst->features)); - memcpy(dst->compat, src->compat, sizeof(dst->compat)); - - for (i = 0; i < BCH_SB_FIELD_NR; i++) { - int d; - - if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) - continue; - - src_f = bch2_sb_field_get_id(src, i); - dst_f = bch2_sb_field_get_id(dst, i); - - d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - - (dst_f ? le32_to_cpu(dst_f->u64s) : 0); - if (d > 0) { - int ret = bch2_sb_realloc(dst_handle, - le32_to_cpu(dst_handle->sb->u64s) + d); - - if (ret) - return ret; - - dst = dst_handle->sb; - dst_f = bch2_sb_field_get_id(dst, i); - } - - dst_f = __bch2_sb_field_resize(dst_handle, dst_f, - src_f ? 
le32_to_cpu(src_f->u64s) : 0); - - if (src_f) - memcpy(dst_f, src_f, vstruct_bytes(src_f)); - } - - return 0; -} - -int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) -{ - int ret; - - lockdep_assert_held(&c->sb_lock); - - ret = bch2_sb_realloc(&c->disk_sb, 0) ?: - __copy_super(&c->disk_sb, src) ?: - bch2_sb_replicas_to_cpu_replicas(c) ?: - bch2_sb_disk_groups_to_cpu(c); - if (ret) - return ret; - - bch2_sb_update(c); - return 0; -} - -int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) -{ - return __copy_super(&ca->disk_sb, c->disk_sb.sb); -} - -/* read superblock: */ - -static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) -{ - size_t bytes; - int ret; -reread: - bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); - sb->bio->bi_iter.bi_sector = offset; - bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); - - ret = submit_bio_wait(sb->bio); - if (ret) { - prt_printf(err, "IO error: %i", ret); - return ret; - } - - if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && - !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { - prt_str(err, "Not a bcachefs superblock (got magic "); - pr_uuid(err, sb->sb->magic.b); - prt_str(err, ")"); - return -BCH_ERR_invalid_sb_magic; - } - - ret = bch2_sb_compatible(sb->sb, err); - if (ret) - return ret; - - bytes = vstruct_bytes(sb->sb); - - u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits); - if (bytes > sb_size) { - prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)", - bytes, sb_size); - return -BCH_ERR_invalid_sb_too_big; - } - - if (bytes > sb->buffer_size) { - ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); - if (ret) - return ret; - goto reread; - } - - enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); - if (csum_type >= BCH_CSUM_NR || - bch2_csum_type_is_encryption(csum_type)) { - prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); - return -BCH_ERR_invalid_sb_csum_type; - } - - /* XXX: verify MACs */ - struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb); - if (bch2_crc_cmp(csum, sb->sb->csum)) { - bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum); - return -BCH_ERR_invalid_sb_csum; - } - - sb->seq = le64_to_cpu(sb->sb->seq); - - return 0; -} - -static int __bch2_read_super(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb, bool ignore_notbchfs_msg) -{ - u64 offset = opt_get(*opts, sb); - struct bch_sb_layout layout; - struct printbuf err = PRINTBUF; - struct printbuf err2 = PRINTBUF; - __le64 *i; - int ret; -#ifndef __KERNEL__ -retry: -#endif - memset(sb, 0, sizeof(*sb)); - sb->mode = BLK_OPEN_READ; - sb->have_bio = true; - sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL); - if (!sb->holder) - return -ENOMEM; - - sb->sb_name = kstrdup(path, GFP_KERNEL); - if (!sb->sb_name) { - ret = -ENOMEM; - prt_printf(&err, "error allocating memory for sb_name"); - goto err; - } - -#ifndef __KERNEL__ - if (opt_get(*opts, direct_io) == false) - sb->mode |= BLK_OPEN_BUFFERED; -#endif - - if (!opt_get(*opts, noexcl)) - sb->mode |= BLK_OPEN_EXCL; - - if (!opt_get(*opts, nochanges)) - sb->mode |= BLK_OPEN_WRITE; - - sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->s_bdev_file) && - PTR_ERR(sb->s_bdev_file) == -EACCES && - opt_get(*opts, read_only)) { - sb->mode &= ~BLK_OPEN_WRITE; - - sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->s_bdev_file)) - 
opt_set(*opts, nochanges, true); - } - - if (IS_ERR(sb->s_bdev_file)) { - ret = PTR_ERR(sb->s_bdev_file); - prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret)); - goto err; - } - sb->bdev = file_bdev(sb->s_bdev_file); - - ret = bch2_sb_realloc(sb, 0); - if (ret) { - prt_printf(&err, "error allocating memory for superblock"); - goto err; - } - - if (bch2_fs_init_fault("read_super")) { - prt_printf(&err, "dynamic fault"); - ret = -EFAULT; - goto err; - } - - ret = read_one_super(sb, offset, &err); - if (!ret) - goto got_super; - - if (opt_defined(*opts, sb)) - goto err; - - prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", - path, err.buf); - if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) - bch2_print_opts(opts, KERN_INFO "%s", err2.buf); - else - bch2_print_opts(opts, KERN_ERR "%s", err2.buf); - - printbuf_exit(&err2); - printbuf_reset(&err); - - /* - * Error reading primary superblock - read location of backup - * superblocks: - */ - bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); - sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; - /* - * use sb buffer to read layout, since sb buffer is page aligned but - * layout won't be: - */ - bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); - - ret = submit_bio_wait(sb->bio); - if (ret) { - prt_printf(&err, "IO error: %i", ret); - goto err; - } - - memcpy(&layout, sb->sb, sizeof(layout)); - ret = validate_sb_layout(&layout, &err); - if (ret) - goto err; - - for (i = layout.sb_offset; - i < layout.sb_offset + layout.nr_superblocks; i++) { - offset = le64_to_cpu(*i); - - if (offset == opt_get(*opts, sb)) { - ret = -BCH_ERR_invalid; - continue; - } - - ret = read_one_super(sb, offset, &err); - if (!ret) - goto got_super; - } - - goto err; - -got_super: - if (le16_to_cpu(sb->sb->block_size) << 9 < - bdev_logical_block_size(sb->bdev) && - opt_get(*opts, direct_io)) { -#ifndef __KERNEL__ - opt_set(*opts, direct_io, false); - bch2_free_super(sb); - goto retry; -#endif - prt_printf(&err, "block size (%u) smaller than device block size (%u)", - le16_to_cpu(sb->sb->block_size) << 9, - bdev_logical_block_size(sb->bdev)); - ret = -BCH_ERR_block_size_too_small; - goto err; - } - - sb->have_layout = true; - - ret = bch2_sb_validate(sb->sb, offset, 0, &err); - if (ret) { - bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", - path, err.buf); - goto err_no_print; - } -out: - printbuf_exit(&err); - return ret; -err: - bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n", - path, err.buf); -err_no_print: - bch2_free_super(sb); - goto out; -} - -int bch2_read_super(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb) -{ - return __bch2_read_super(path, opts, sb, false); -} - -/* provide a silenced version for mount.bcachefs */ - -int bch2_read_super_silent(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb) -{ - return __bch2_read_super(path, opts, sb, true); -} - -/* write superblock: */ - -static void write_super_endio(struct bio *bio) -{ - struct bch_dev *ca = bio->bi_private; - - bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status); - - /* XXX: return errors directly */ - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, "superblock %s error: %s", - str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status)); - ca->sb_write_error = 1; - } - - closure_put(&ca->fs->sb_write); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); -} - -static void 
read_back_super(struct bch_fs *c, struct bch_dev *ca) -{ - struct bch_sb *sb = ca->disk_sb.sb; - struct bio *bio = ca->disk_sb.bio; - - memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; - bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE); - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); - - enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - closure_bio_submit(bio, &c->sb_write); -} - -static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) -{ - struct bch_sb *sb = ca->disk_sb.sb; - struct bio *bio = ca->disk_sb.bio; - - sb->offset = sb->layout.sb_offset[idx]; - - SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); - sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), - null_nonce(), sb); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; - bch2_bio_map(bio, sb, - roundup((size_t) vstruct_bytes(sb), - bdev_logical_block_size(ca->disk_sb.bdev))); - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], - bio_sectors(bio)); - - enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - closure_bio_submit(bio, &c->sb_write); -} - -int bch2_write_super(struct bch_fs *c) -{ - struct closure *cl = &c->sb_write; - struct printbuf err = PRINTBUF; - unsigned sb = 0, nr_wrote; - struct bch_devs_mask sb_written; - bool wrote, can_mount_without_written, can_mount_with_written; - unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; - DARRAY(struct bch_dev *) online_devices = {}; - int ret = 0; - - trace_and_count(c, write_super, c, _RET_IP_); - - if (c->opts.degraded == BCH_DEGRADED_very) - degraded_flags |= BCH_FORCE_IF_LOST; - - lockdep_assert_held(&c->sb_lock); - - closure_init_stack(cl); - memset(&sb_written, 0, sizeof(sb_written)); - - /* - * Note: we do writes to RO devices here, and we might want to change - * that in the future. 
- * - * For now, we expect to be able to call write_super() when we're not - * yet RW: - */ - for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) { - ret = darray_push(&online_devices, ca); - if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - goto out; - } - enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - } - - /* Make sure we're using the new magic numbers: */ - c->disk_sb.sb->magic = BCHFS_MAGIC; - c->disk_sb.sb->layout.magic = BCHFS_MAGIC; - - le64_add_cpu(&c->disk_sb.sb->seq, 1); - - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - darray_for_each(online_devices, ca) - __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq; - c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds()); - - if (test_bit(BCH_FS_error, &c->flags)) - SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); - if (test_bit(BCH_FS_topology_error, &c->flags)) - SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); - - SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); - - bch2_sb_counters_from_cpu(c); - bch2_sb_members_from_cpu(c); - bch2_sb_members_cpy_v2_v1(&c->disk_sb); - bch2_sb_errors_from_cpu(c); - bch2_sb_downgrade_update(c); - - darray_for_each(online_devices, ca) - bch2_sb_from_fs(c, (*ca)); - - darray_for_each(online_devices, ca) { - printbuf_reset(&err); - - ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err); - if (ret) { - bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); - goto out; - } - } - - if (c->opts.nochanges) - goto out; - - /* - * Defer writing the superblock until filesystem initialization is - * complete - don't write out a partly initialized superblock: - */ - if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) - goto out; - - if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "attempting to write superblock that wasn't version downgraded ("); - bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version)); - prt_str(&buf, " > "); - bch2_version_to_text(&buf, bcachefs_metadata_version_current); - prt_str(&buf, ")"); - bch2_fs_fatal_error(c, ": %s", buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, sb_not_downgraded); - goto out; - } - - darray_for_each(online_devices, ca) { - __set_bit((*ca)->dev_idx, sb_written.d); - (*ca)->sb_write_error = 0; - } - - darray_for_each(online_devices, ca) - read_back_super(c, *ca); - closure_sync(cl); - - darray_for_each(online_devices, cap) { - struct bch_dev *ca = *cap; - - if (ca->sb_write_error) - continue; - - if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { - struct printbuf buf = PRINTBUF; - prt_char(&buf, ' '); - prt_bdevname(&buf, ca->disk_sb.bdev); - prt_printf(&buf, - ": Superblock write was silently dropped! 
(seq %llu expected %llu)", - le64_to_cpu(ca->sb_read_scratch->seq), - ca->disk_sb.seq); - - if (c->opts.errors != BCH_ON_ERROR_continue && - c->opts.errors != BCH_ON_ERROR_fix_safe) { - ret = bch_err_throw(c, erofs_sb_err); - bch2_fs_fatal_error(c, "%s", buf.buf); - } else { - bch_err(c, "%s", buf.buf); - } - - printbuf_exit(&buf); - } - - if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { - struct printbuf buf = PRINTBUF; - prt_char(&buf, ' '); - prt_bdevname(&buf, ca->disk_sb.bdev); - prt_printf(&buf, - ": Superblock modified by another process (seq %llu expected %llu)", - le64_to_cpu(ca->sb_read_scratch->seq), - ca->disk_sb.seq); - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, erofs_sb_err); - } - } - - if (ret) - goto out; - - do { - wrote = false; - darray_for_each(online_devices, cap) { - struct bch_dev *ca = *cap; - if (!ca->sb_write_error && - sb < ca->disk_sb.sb->layout.nr_superblocks) { - write_one_super(c, ca, sb); - wrote = true; - } - } - closure_sync(cl); - sb++; - } while (wrote); - - darray_for_each(online_devices, cap) { - struct bch_dev *ca = *cap; - if (ca->sb_write_error) - __clear_bit(ca->dev_idx, sb_written.d); - else - ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); - } - - nr_wrote = dev_mask_nr(&sb_written); - - can_mount_with_written = - bch2_have_enough_devs(c, sb_written, degraded_flags, false); - - for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++) - sb_written.d[i] = ~sb_written.d[i]; - - can_mount_without_written = - bch2_have_enough_devs(c, sb_written, degraded_flags, false); - - /* - * If we would be able to mount _without_ the devices we successfully - * wrote superblocks to, we weren't able to write to enough devices: - * - * Exception: if we can mount without the successes because we haven't - * written anything (new filesystem), we continue if we'd be able to - * mount with the devices we did successfully write to: - */ - if (bch2_fs_fatal_err_on(!nr_wrote || - !can_mount_with_written || - (can_mount_without_written && - !can_mount_with_written), c, - ": Unable to write superblock to sufficient devices (from %ps)", - (void *) _RET_IP_)) - ret = bch_err_throw(c, erofs_sb_err); -out: - /* Make new options visible after they're persistent: */ - bch2_sb_update(c); - darray_for_each(online_devices, ca) - enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super); - darray_exit(&online_devices); - printbuf_exit(&err); - return ret; -} - -void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) -{ - mutex_lock(&c->sb_lock); - if (!(c->sb.features & (1ULL << feat))) { - c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); - - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); -} - -/* Downgrade if superblock is at a higher version than currently supported: */ -bool bch2_check_version_downgrade(struct bch_fs *c) -{ - bool ret = bcachefs_metadata_version_current < c->sb.version; - - lockdep_assert_held(&c->sb_lock); - - /* - * Downgrade, if superblock is at a higher version than currently - * supported: - * - * c->sb will be checked before we write the superblock, so update it as - * well: - */ - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current) - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current); - if (c->sb.version > 
bcachefs_metadata_version_current) - c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - if (c->sb.version_min > bcachefs_metadata_version_current) - c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); - c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); - return ret; -} - -void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) -{ - lockdep_assert_held(&c->sb_lock); - - if (BCH_VERSION_MAJOR(new_version) > - BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) - bch2_sb_field_resize(&c->disk_sb, downgrade, 0); - - c->disk_sb.sb->version = cpu_to_le16(new_version); - - if (incompat) { - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); - } -} - -void bch2_sb_upgrade_incompat(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - if (c->sb.version == c->sb.version_incompat_allowed) - goto unlock; - - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Now allowing incompatible features up to "); - bch2_version_to_text(&buf, c->sb.version); - prt_str(&buf, ", previously allowed up to "); - bch2_version_to_text(&buf, c->sb.version_incompat_allowed); - prt_newline(&buf); - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version)); - bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); -} - -static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - if (vstruct_bytes(f) < 88) { - prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88); - return -BCH_ERR_invalid_sb_ext; - } - - return 0; -} - -static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_ext *e = field_to_type(f, ext); - - prt_printf(out, "Recovery passes required:\t"); - prt_bitflags(out, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0]))); - prt_newline(out); - - unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL); - if (errors_silent) { - le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); - - prt_printf(out, "Errors to silently fix:\t"); - prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, - min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8)); - prt_newline(out); - - kfree(errors_silent); - } - - prt_printf(out, "Btrees with missing data:\t"); - prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); - prt_newline(out); -} - -static const struct bch_sb_field_ops bch_sb_field_ops_ext = { - .validate = bch2_sb_ext_validate, - .to_text = bch2_sb_ext_to_text, -}; - -static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { -#define x(f, nr) \ - [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, - BCH_SB_FIELDS() -#undef x -}; - -static const struct bch_sb_field_ops bch2_sb_field_null_ops; - -static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) -{ - return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) - ? 
bch2_sb_field_ops[type] - : &bch2_sb_field_null_ops; -} - -static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - unsigned type = le32_to_cpu(f->type); - struct printbuf field_err = PRINTBUF; - const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); - int ret; - - ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0; - if (ret) { - prt_printf(err, "Invalid superblock section %s: %s", - bch2_sb_fields[type], field_err.buf); - prt_newline(err); - bch2_sb_field_to_text(err, sb, f); - } - - printbuf_exit(&field_err); - return ret; -} - -void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - unsigned type = le32_to_cpu(f->type); - const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); - - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - if (ops->to_text) - ops->to_text(out, sb, f); -} - -void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - unsigned type = le32_to_cpu(f->type); - - if (type < BCH_SB_FIELD_NR) - prt_printf(out, "%s", bch2_sb_fields[type]); - else - prt_printf(out, "(unknown field %u)", type); - - prt_printf(out, " (size %zu):", vstruct_bytes(f)); - prt_newline(out); - - __bch2_sb_field_to_text(out, sb, f); -} - -void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) -{ - unsigned i; - - prt_printf(out, "Type: %u", l->layout_type); - prt_newline(out); - - prt_str(out, "Superblock max size: "); - prt_units_u64(out, 512 << l->sb_max_size_bits); - prt_newline(out); - - prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); - prt_newline(out); - - prt_str(out, "Offsets: "); - for (i = 0; i < l->nr_superblocks; i++) { - if (i) - prt_str(out, ", "); - prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); - } - prt_newline(out); -} - -void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, - bool print_layout, unsigned fields) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 44); - - prt_printf(out, "External UUID:\t"); - pr_uuid(out, sb->user_uuid.b); - prt_newline(out); - - prt_printf(out, "Internal UUID:\t"); - pr_uuid(out, sb->uuid.b); - prt_newline(out); - - prt_printf(out, "Magic number:\t"); - pr_uuid(out, sb->magic.b); - prt_newline(out); - - prt_printf(out, "Device index:\t%u\n", sb->dev_idx); - - prt_printf(out, "Label:\t"); - if (!strlen(sb->label)) - prt_printf(out, "(none)"); - else - prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); - prt_newline(out); - - prt_printf(out, "Version:\t"); - bch2_version_to_text(out, le16_to_cpu(sb->version)); - prt_newline(out); - - prt_printf(out, "Incompatible features allowed:\t"); - bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); - prt_newline(out); - - prt_printf(out, "Incompatible features in use:\t"); - bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); - prt_newline(out); - - prt_printf(out, "Version upgrade complete:\t"); - bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); - prt_newline(out); - - prt_printf(out, "Oldest version on disk:\t"); - bch2_version_to_text(out, le16_to_cpu(sb->version_min)); - prt_newline(out); - - prt_printf(out, "Created:\t"); - if (sb->time_base_lo) - bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); - else - prt_printf(out, "(not set)"); - prt_newline(out); - - prt_printf(out, "Sequence number:\t"); - prt_printf(out, "%llu", le64_to_cpu(sb->seq)); - prt_newline(out); - - 
prt_printf(out, "Time of last write:\t"); - bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); - prt_newline(out); - - prt_printf(out, "Superblock size:\t"); - prt_units_u64(out, vstruct_bytes(sb)); - prt_str(out, "/"); - prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); - prt_newline(out); - - prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb)); - prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb)); - - prt_printf(out, "Sections:\t"); - u64 fields_have = 0; - vstruct_for_each(sb, f) - fields_have |= 1 << le32_to_cpu(f->type); - prt_bitflags(out, bch2_sb_fields, fields_have); - prt_newline(out); - - prt_printf(out, "Features:\t"); - prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); - prt_newline(out); - - prt_printf(out, "Compat features:\t"); - prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); - prt_newline(out); - - prt_newline(out); - prt_printf(out, "Options:"); - prt_newline(out); - printbuf_indent_add(out, 2); - { - enum bch_opt_id id; - - for (id = 0; id < bch2_opts_nr; id++) { - const struct bch_option *opt = bch2_opt_table + id; - - if (opt->get_sb) { - u64 v = bch2_opt_from_sb(sb, id, -1); - - prt_printf(out, "%s:\t", opt->attr.name); - bch2_opt_to_text(out, NULL, sb, opt, v, - OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); - prt_newline(out); - } - } - } - - printbuf_indent_sub(out, 2); - - if (print_layout) { - prt_newline(out); - prt_printf(out, "layout:"); - prt_newline(out); - printbuf_indent_add(out, 2); - bch2_sb_layout_to_text(out, &sb->layout); - printbuf_indent_sub(out, 2); - } - - vstruct_for_each(sb, f) - if (fields & (1 << le32_to_cpu(f->type))) { - prt_newline(out); - bch2_sb_field_to_text(out, sb, f); - } -} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h deleted file mode 100644 index a3b7a90f2533db..00000000000000 --- a/fs/bcachefs/super-io.h +++ /dev/null @@ -1,119 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUPER_IO_H -#define _BCACHEFS_SUPER_IO_H - -#include "extents.h" -#include "eytzinger.h" -#include "super_types.h" -#include "super.h" -#include "sb-members.h" - -#include - -#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096 - -static inline bool bch2_version_compatible(u16 version) -{ - return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && - version >= bcachefs_metadata_version_min; -} - -void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); -enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); - -int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); - -static inline int bch2_request_incompat_feature(struct bch_fs *c, - enum bcachefs_metadata_version version) -{ - return likely(version <= c->sb.version_incompat) - ? 
0 - : bch2_set_version_incompat(c, version); -} - -static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) -{ - return le32_to_cpu(f->u64s) * sizeof(u64); -} - -#define field_to_type(_f, _name) \ - container_of_or_null(_f, struct bch_sb_field_##_name, field) - -struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type); -#define bch2_sb_field_get(_sb, _name) \ - field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name) - -struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *, - enum bch_sb_field_type, unsigned); -#define bch2_sb_field_resize(_sb, _name, _u64s) \ - field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) - -struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *, - enum bch_sb_field_type, unsigned); -#define bch2_sb_field_get_minsize(_sb, _name, _u64s) \ - field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) - -#define bch2_sb_field_nr_entries(_f) \ - (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) / \ - sizeof(_f->entries[0])) \ - : 0) - -void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); - -extern const char * const bch2_sb_fields[]; - -struct bch_sb_field_ops { - int (*validate)(struct bch_sb *, struct bch_sb_field *, - enum bch_validate_flags, struct printbuf *); - void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); -}; - -static inline __le64 bch2_sb_magic(struct bch_fs *c) -{ - __le64 ret; - - memcpy(&ret, &c->sb.uuid, sizeof(ret)); - return ret; -} - -static inline __u64 jset_magic(struct bch_fs *c) -{ - return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); -} - -static inline __u64 bset_magic(struct bch_fs *c) -{ - return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); -} - -int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); -int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); - -void bch2_free_super(struct bch_sb_handle *); -int bch2_sb_realloc(struct bch_sb_handle *, unsigned); - -int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *); - -int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); -int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); -int bch2_write_super(struct bch_fs *); -void __bch2_check_set_feature(struct bch_fs *, unsigned); - -static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) -{ - if (!(c->sb.features & (1ULL << feat))) - __bch2_check_set_feature(c, feat); -} - -bool bch2_check_version_downgrade(struct bch_fs *); -void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); -void bch2_sb_upgrade_incompat(struct bch_fs *); - -void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); -void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); -void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); -void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); - -#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c deleted file mode 100644 index f2417d298b844f..00000000000000 --- a/fs/bcachefs/super.c +++ /dev/null @@ -1,2547 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bcachefs setup/teardown code, and some metadata io - read a superblock and - * figure out what to do with it. - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "backpointers.h" -#include "bkey_sort.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_journal_iter.h" -#include "btree_key_cache.h" -#include "btree_node_scan.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "btree_write_buffer.h" -#include "buckets_waiting_for_journal.h" -#include "chardev.h" -#include "checksum.h" -#include "clock.h" -#include "compress.h" -#include "debug.h" -#include "disk_accounting.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-io-buffered.h" -#include "fs-io-direct.h" -#include "fsck.h" -#include "inode.h" -#include "io_read.h" -#include "io_write.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "move.h" -#include "migrate.h" -#include "movinggc.h" -#include "nocow_locking.h" -#include "quota.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-clean.h" -#include "sb-counters.h" -#include "sb-errors.h" -#include "sb-members.h" -#include "snapshot.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" -#include "sysfs.h" -#include "thread_with_file.h" -#include "trace.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kent Overstreet "); -MODULE_DESCRIPTION("bcachefs filesystem"); - -typedef DARRAY(struct bch_sb_handle) bch_sb_handles; - -#define x(n) #n, -const char * const bch2_fs_flag_strs[] = { - BCH_FS_FLAGS() - NULL -}; - -const char * const bch2_write_refs[] = { - BCH_WRITE_REFS() - NULL -}; - -const char * const bch2_dev_read_refs[] = { - BCH_DEV_READ_REFS() - NULL -}; - -const char * const bch2_dev_write_refs[] = { - BCH_DEV_WRITE_REFS() - NULL -}; -#undef x - -static void __bch2_print_str(struct bch_fs *c, const char *prefix, - const char *str) -{ -#ifdef __KERNEL__ - struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); - - if (unlikely(stdio)) { - bch2_stdio_redirect_printf(stdio, true, "%s", str); - return; - } -#endif - bch2_print_string_as_lines(KERN_ERR, str); -} - -void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) -{ - __bch2_print_str(c, prefix, str); -} - -__printf(2, 0) -static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) -{ -#ifdef __KERNEL__ - if (unlikely(stdio)) { - if (fmt[0] == KERN_SOH[0]) - fmt += 2; - - bch2_stdio_redirect_vprintf(stdio, true, fmt, args); - return; - } -#endif - vprintk(fmt, args); -} - -void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) -{ - struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; - - va_list args; - va_start(args, fmt); - bch2_print_maybe_redirect(stdio, fmt, args); - va_end(args); -} - -void __bch2_print(struct bch_fs *c, const char *fmt, ...) 
-{ - struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); - - va_list args; - va_start(args, fmt); - bch2_print_maybe_redirect(stdio, fmt, args); - va_end(args); -} - -#define KTYPE(type) \ -static const struct attribute_group type ## _group = { \ - .attrs = type ## _files \ -}; \ - \ -static const struct attribute_group *type ## _groups[] = { \ - &type ## _group, \ - NULL \ -}; \ - \ -static const struct kobj_type type ## _ktype = { \ - .release = type ## _release, \ - .sysfs_ops = &type ## _sysfs_ops, \ - .default_groups = type ## _groups \ -} - -static void bch2_fs_release(struct kobject *); -static void bch2_dev_release(struct kobject *); -static void bch2_fs_counters_release(struct kobject *k) -{ -} - -static void bch2_fs_internal_release(struct kobject *k) -{ -} - -static void bch2_fs_opts_dir_release(struct kobject *k) -{ -} - -static void bch2_fs_time_stats_release(struct kobject *k) -{ -} - -KTYPE(bch2_fs); -KTYPE(bch2_fs_counters); -KTYPE(bch2_fs_internal); -KTYPE(bch2_fs_opts_dir); -KTYPE(bch2_fs_time_stats); -KTYPE(bch2_dev); - -static struct kset *bcachefs_kset; -static LIST_HEAD(bch_fs_list); -static DEFINE_MUTEX(bch_fs_list_lock); - -DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); - -static void bch2_dev_unlink(struct bch_dev *); -static void bch2_dev_free(struct bch_dev *); -static int bch2_dev_alloc(struct bch_fs *, unsigned); -static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); -static void bch2_dev_io_ref_stop(struct bch_dev *, int); -static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); - -struct bch_fs *bch2_dev_to_fs(dev_t dev) -{ - guard(mutex)(&bch_fs_list_lock); - guard(rcu)(); - - struct bch_fs *c; - list_for_each_entry(c, &bch_fs_list, list) - for_each_member_device_rcu(c, ca, NULL) - if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { - closure_get(&c->cl); - return c; - } - return NULL; -} - -static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) -{ - struct bch_fs *c; - - lockdep_assert_held(&bch_fs_list_lock); - - list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) - return c; - - return NULL; -} - -struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) -{ - struct bch_fs *c; - - mutex_lock(&bch_fs_list_lock); - c = __bch2_uuid_to_fs(uuid); - if (c) - closure_get(&c->cl); - mutex_unlock(&bch_fs_list_lock); - - return c; -} - -/* Filesystem RO/RW: */ - -/* - * For startup/shutdown of RW stuff, the dependencies are: - * - * - foreground writes depend on copygc and rebalance (to free up space) - * - * - copygc and rebalance depend on mark and sweep gc (they actually probably - * don't because they either reserve ahead of time or don't block if - * allocations fail, but allocations can require mark and sweep gc to run - * because of generation number wraparound) - * - * - all of the above depends on the allocator threads - * - * - allocator depends on the journal (when it rewrites prios and gens) - */ - -static void __bch2_fs_read_only(struct bch_fs *c) -{ - unsigned clean_passes = 0; - u64 seq = 0; - - bch2_fs_ec_stop(c); - bch2_open_buckets_stop(c, NULL, true); - bch2_rebalance_stop(c); - bch2_copygc_stop(c); - bch2_fs_ec_flush(c); - - bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", - journal_cur_seq(&c->journal)); - - do { - clean_passes++; - - if (bch2_btree_interior_updates_flush(c) || - bch2_btree_write_buffer_flush_going_ro(c) || - bch2_journal_flush_all_pins(&c->journal) || - bch2_btree_flush_all_writes(c) || - seq != 
atomic64_read(&c->journal.seq)) { - seq = atomic64_read(&c->journal.seq); - clean_passes = 0; - } - } while (clean_passes < 2); - - bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", - journal_cur_seq(&c->journal)); - - if (test_bit(JOURNAL_replay_done, &c->journal.flags) && - !test_bit(BCH_FS_emergency_ro, &c->flags)) - set_bit(BCH_FS_clean_shutdown, &c->flags); - - bch2_fs_journal_stop(&c->journal); - - bch_info(c, "%sclean shutdown complete, journal seq %llu", - test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", - c->journal.seq_ondisk); - - /* - * After stopping journal: - */ - for_each_member_device(c, ca) { - bch2_dev_io_ref_stop(ca, WRITE); - bch2_dev_allocator_remove(c, ca); - } -} - -static void bch2_writes_disabled(struct enumerated_ref *writes) -{ - struct bch_fs *c = container_of(writes, struct bch_fs, writes); - - set_bit(BCH_FS_write_disable_complete, &c->flags); - wake_up(&bch2_read_only_wait); -} - -void bch2_fs_read_only(struct bch_fs *c) -{ - if (!test_bit(BCH_FS_rw, &c->flags)) { - bch2_journal_reclaim_stop(&c->journal); - return; - } - - BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); - - bch_verbose(c, "going read-only"); - - /* - * Block new foreground-end write operations from starting - any new - * writes will return -EROFS: - */ - set_bit(BCH_FS_going_ro, &c->flags); - enumerated_ref_stop_async(&c->writes); - - /* - * If we're not doing an emergency shutdown, we want to wait on - * outstanding writes to complete so they don't see spurious errors due - * to shutting down the allocator: - * - * If we are doing an emergency shutdown outstanding writes may - * hang until we shutdown the allocator so we don't want to wait - * on outstanding writes before shutting everything down - but - * we do need to wait on them before returning and signalling - * that going RO is complete: - */ - wait_event(bch2_read_only_wait, - test_bit(BCH_FS_write_disable_complete, &c->flags) || - test_bit(BCH_FS_emergency_ro, &c->flags)); - - bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); - if (writes_disabled) - bch_verbose(c, "finished waiting for writes to stop"); - - __bch2_fs_read_only(c); - - wait_event(bch2_read_only_wait, - test_bit(BCH_FS_write_disable_complete, &c->flags)); - - if (!writes_disabled) - bch_verbose(c, "finished waiting for writes to stop"); - - clear_bit(BCH_FS_write_disable_complete, &c->flags); - clear_bit(BCH_FS_going_ro, &c->flags); - clear_bit(BCH_FS_rw, &c->flags); - - if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_error, &c->flags) && - !test_bit(BCH_FS_emergency_ro, &c->flags) && - test_bit(BCH_FS_started, &c->flags) && - test_bit(BCH_FS_clean_shutdown, &c->flags) && - c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { - BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); - BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); - BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); - BUG_ON(c->btree_write_buffer.inc.keys.nr); - BUG_ON(c->btree_write_buffer.flushing.keys.nr); - bch2_verify_accounting_clean(c); - - bch_verbose(c, "marking filesystem clean"); - bch2_fs_mark_clean(c); - } else { - /* Make sure error counts/counters are persisted */ - mutex_lock(&c->sb_lock); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - bch_verbose(c, "done going read-only, filesystem not clean"); - } -} - -static void bch2_fs_read_only_work(struct work_struct *work) -{ - struct bch_fs *c = - container_of(work, struct bch_fs, read_only_work); - - 
down_write(&c->state_lock); - bch2_fs_read_only(c); - up_write(&c->state_lock); -} - -static void bch2_fs_read_only_async(struct bch_fs *c) -{ - queue_work(system_long_wq, &c->read_only_work); -} - -bool bch2_fs_emergency_read_only(struct bch_fs *c) -{ - bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - - bch2_journal_halt(&c->journal); - bch2_fs_read_only_async(c); - - wake_up(&bch2_read_only_wait); - return ret; -} - -static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out, - bool locked) -{ - bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - - if (!locked) - bch2_journal_halt(&c->journal); - else - bch2_journal_halt_locked(&c->journal); - bch2_fs_read_only_async(c); - wake_up(&bch2_read_only_wait); - - if (ret) - prt_printf(out, "emergency read only at seq %llu\n", - journal_cur_seq(&c->journal)); - - return ret; -} - -bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out) -{ - return __bch2_fs_emergency_read_only2(c, out, false); -} - -bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) -{ - bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - - bch2_journal_halt_locked(&c->journal); - bch2_fs_read_only_async(c); - - wake_up(&bch2_read_only_wait); - return ret; -} - -static int __bch2_fs_read_write(struct bch_fs *c, bool early) -{ - int ret; - - BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - - if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) - return bch_err_throw(c, erofs_no_alloc_info); - - if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { - bch_err(c, "cannot go rw, unfixed btree errors"); - return bch_err_throw(c, erofs_unfixed_errors); - } - - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_err(c, "cannot go rw, filesystem is an unresized image file"); - return bch_err_throw(c, erofs_filesystem_full); - } - - if (test_bit(BCH_FS_rw, &c->flags)) - return 0; - - bch_info(c, "going read-write"); - - ret = bch2_fs_init_rw(c); - if (ret) - goto err; - - ret = bch2_sb_members_v2_init(c); - if (ret) - goto err; - - clear_bit(BCH_FS_clean_shutdown, &c->flags); - - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - bch2_dev_allocator_add(c, ca); - enumerated_ref_start(&ca->io_ref[WRITE]); - } - - bch2_recalc_capacity(c); - - /* - * First journal write must be a flush write: after a clean shutdown we - * don't read the journal, so the first journal write may end up - * overwriting whatever was there previously, and there must always be - * at least one non-flush write in the journal or recovery will fail: - */ - spin_lock(&c->journal.lock); - set_bit(JOURNAL_need_flush_write, &c->journal.flags); - set_bit(JOURNAL_running, &c->journal.flags); - bch2_journal_space_available(&c->journal); - spin_unlock(&c->journal.lock); - - ret = bch2_fs_mark_dirty(c); - if (ret) - goto err; - - ret = bch2_journal_reclaim_start(&c->journal); - if (ret) - goto err; - - set_bit(BCH_FS_rw, &c->flags); - set_bit(BCH_FS_was_rw, &c->flags); - - enumerated_ref_start(&c->writes); - - ret = bch2_copygc_start(c); - if (ret) { - bch_err_msg(c, ret, "error starting copygc thread"); - goto err; - } - - ret = bch2_rebalance_start(c); - if (ret) { - bch_err_msg(c, ret, "error starting rebalance thread"); - goto err; - } - - bch2_do_discards(c); - bch2_do_invalidates(c); - bch2_do_stripe_deletes(c); - bch2_do_pending_node_rewrites(c); - return 0; -err: - if (test_bit(BCH_FS_rw, &c->flags)) - bch2_fs_read_only(c); - else - __bch2_fs_read_only(c); - 
return ret; -} - -int bch2_fs_read_write(struct bch_fs *c) -{ - if (c->opts.recovery_pass_last && - c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) - return bch_err_throw(c, erofs_norecovery); - - if (c->opts.nochanges) - return bch_err_throw(c, erofs_nochanges); - - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) - return bch_err_throw(c, erofs_no_alloc_info); - - return __bch2_fs_read_write(c, false); -} - -int bch2_fs_read_write_early(struct bch_fs *c) -{ - down_write(&c->state_lock); - int ret = __bch2_fs_read_write(c, true); - up_write(&c->state_lock); - - return ret; -} - -/* Filesystem startup/shutdown: */ - -static void __bch2_fs_free(struct bch_fs *c) -{ - for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_exit(&c->times[i]); - -#ifdef CONFIG_UNICODE - utf8_unload(c->cf_encoding); -#endif - - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - bch2_free_pending_node_rewrites(c); - bch2_free_fsck_errs(c); - bch2_fs_vfs_exit(c); - bch2_fs_snapshots_exit(c); - bch2_fs_sb_errors_exit(c); - bch2_fs_replicas_exit(c); - bch2_fs_rebalance_exit(c); - bch2_fs_quota_exit(c); - bch2_fs_nocow_locking_exit(c); - bch2_fs_journal_exit(&c->journal); - bch2_fs_fs_io_direct_exit(c); - bch2_fs_fs_io_buffered_exit(c); - bch2_fs_fsio_exit(c); - bch2_fs_io_write_exit(c); - bch2_fs_io_read_exit(c); - bch2_fs_encryption_exit(c); - bch2_fs_ec_exit(c); - bch2_fs_counters_exit(c); - bch2_fs_compress_exit(c); - bch2_io_clock_exit(&c->io_clock[WRITE]); - bch2_io_clock_exit(&c->io_clock[READ]); - bch2_fs_buckets_waiting_for_journal_exit(c); - bch2_fs_btree_write_buffer_exit(c); - bch2_fs_btree_key_cache_exit(&c->btree_key_cache); - bch2_fs_btree_iter_exit(c); - bch2_fs_btree_interior_update_exit(c); - bch2_fs_btree_cache_exit(c); - bch2_fs_accounting_exit(c); - bch2_fs_async_obj_exit(c); - bch2_journal_keys_put_initial(c); - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - - BUG_ON(atomic_read(&c->journal_keys.ref)); - percpu_free_rwsem(&c->mark_lock); - if (c->online_reserved) { - u64 v = percpu_u64_get(c->online_reserved); - WARN(v, "online_reserved not 0 at shutdown: %lli", v); - free_percpu(c->online_reserved); - } - - darray_exit(&c->incompat_versions_requested); - darray_exit(&c->btree_roots_extra); - free_percpu(c->pcpu); - free_percpu(c->usage); - mempool_exit(&c->large_bkey_pool); - mempool_exit(&c->btree_bounce_pool); - bioset_exit(&c->btree_bio); - mempool_exit(&c->fill_iter); - enumerated_ref_exit(&c->writes); - kfree(rcu_dereference_protected(c->disk_groups, 1)); - kfree(c->journal_seq_blacklist_table); - - if (c->write_ref_wq) - destroy_workqueue(c->write_ref_wq); - if (c->btree_write_submit_wq) - destroy_workqueue(c->btree_write_submit_wq); - if (c->btree_read_complete_wq) - destroy_workqueue(c->btree_read_complete_wq); - if (c->copygc_wq) - destroy_workqueue(c->copygc_wq); - if (c->btree_write_complete_wq) - destroy_workqueue(c->btree_write_complete_wq); - if (c->btree_update_wq) - destroy_workqueue(c->btree_update_wq); - - bch2_free_super(&c->disk_sb); - kvfree(c); - module_put(THIS_MODULE); -} - -static void bch2_fs_release(struct kobject *kobj) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - __bch2_fs_free(c); -} - -void __bch2_fs_stop(struct bch_fs *c) -{ - bch_verbose(c, "shutting down"); - - set_bit(BCH_FS_stopping, &c->flags); - - down_write(&c->state_lock); - bch2_fs_read_only(c); - up_write(&c->state_lock); - - for (unsigned i = 0; i < c->sb.nr_devices; i++) { - struct bch_dev *ca = rcu_dereference_protected(c->devs[i], 
true); - if (ca) - bch2_dev_io_ref_stop(ca, READ); - } - - for_each_member_device(c, ca) - bch2_dev_unlink(ca); - - if (c->kobj.state_in_sysfs) - kobject_del(&c->kobj); - - bch2_fs_debug_exit(c); - bch2_fs_chardev_exit(c); - - bch2_ro_ref_put(c); - wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref)); - - kobject_put(&c->counters_kobj); - kobject_put(&c->time_stats); - kobject_put(&c->opts_dir); - kobject_put(&c->internal); - - /* btree prefetch might have kicked off reads in the background: */ - bch2_btree_flush_all_reads(c); - - for_each_member_device(c, ca) - cancel_work_sync(&ca->io_error_work); - - cancel_work_sync(&c->read_only_work); -} - -void bch2_fs_free(struct bch_fs *c) -{ - mutex_lock(&bch_fs_list_lock); - list_del(&c->list); - mutex_unlock(&bch_fs_list_lock); - - closure_sync(&c->cl); - closure_debug_destroy(&c->cl); - - for (unsigned i = 0; i < c->sb.nr_devices; i++) { - struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); - - if (ca) { - EBUG_ON(atomic_long_read(&ca->ref) != 1); - bch2_dev_io_ref_stop(ca, READ); - bch2_free_super(&ca->disk_sb); - bch2_dev_free(ca); - } - } - - bch_verbose(c, "shutdown complete"); - - kobject_put(&c->kobj); -} - -void bch2_fs_stop(struct bch_fs *c) -{ - __bch2_fs_stop(c); - bch2_fs_free(c); -} - -static int bch2_fs_online(struct bch_fs *c) -{ - int ret = 0; - - lockdep_assert_held(&bch_fs_list_lock); - - if (c->sb.multi_device && - __bch2_uuid_to_fs(c->sb.uuid)) { - bch_err(c, "filesystem UUID already open"); - return bch_err_throw(c, filesystem_uuid_already_open); - } - - ret = bch2_fs_chardev_init(c); - if (ret) { - bch_err(c, "error creating character device"); - return ret; - } - - bch2_fs_debug_init(c); - - ret = (c->sb.multi_device - ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) - : kobject_add(&c->kobj, NULL, "%s", c->name)) ?: - kobject_add(&c->internal, &c->kobj, "internal") ?: - kobject_add(&c->opts_dir, &c->kobj, "options") ?: -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: -#endif - kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: - bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS); - if (ret) { - bch_err(c, "error creating sysfs objects"); - return ret; - } - - down_write(&c->state_lock); - - for_each_member_device(c, ca) { - ret = bch2_dev_sysfs_online(c, ca); - if (ret) { - bch_err(c, "error creating sysfs objects"); - bch2_dev_put(ca); - goto err; - } - } - - BUG_ON(!list_empty(&c->list)); - list_add(&c->list, &bch_fs_list); -err: - up_write(&c->state_lock); - return ret; -} - -int bch2_fs_init_rw(struct bch_fs *c) -{ - if (test_bit(BCH_FS_rw_init_done, &c->flags)) - return 0; - - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || - !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 1)) || - !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_PERCPU, 1)) || - !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 1)) || - !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", - WQ_FREEZABLE|WQ_PERCPU, 0))) - return bch_err_throw(c, ENOMEM_fs_other_alloc); - - int ret = bch2_fs_btree_interior_update_init(c) ?: - bch2_fs_btree_write_buffer_init(c) ?: - bch2_fs_fs_io_buffered_init(c) ?: - bch2_fs_io_write_init(c) ?: - bch2_fs_journal_init(&c->journal); - if (ret) - 
return ret; - - set_bit(BCH_FS_rw_init_done, &c->flags); - return 0; -} - -static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, - bch_sb_handles *sbs) -{ - struct bch_fs *c; - struct printbuf name = PRINTBUF; - unsigned i, iter_size; - int ret = 0; - - c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); - if (!c) { - c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); - goto out; - } - - c->stdio = (void *)(unsigned long) opts->stdio; - - __module_get(THIS_MODULE); - - closure_init(&c->cl, NULL); - - c->kobj.kset = bcachefs_kset; - kobject_init(&c->kobj, &bch2_fs_ktype); - kobject_init(&c->internal, &bch2_fs_internal_ktype); - kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); - kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); - kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); - - c->minor = -1; - c->disk_sb.fs_sb = true; - - init_rwsem(&c->state_lock); - mutex_init(&c->sb_lock); - mutex_init(&c->replicas_gc_lock); - mutex_init(&c->btree_root_lock); - INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); - - refcount_set(&c->ro_ref, 1); - init_waitqueue_head(&c->ro_ref_wait); - - for (i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_init(&c->times[i]); - - bch2_fs_allocator_background_init(c); - bch2_fs_allocator_foreground_init(c); - bch2_fs_btree_cache_init_early(&c->btree_cache); - bch2_fs_btree_gc_init_early(c); - bch2_fs_btree_interior_update_init_early(c); - bch2_fs_btree_iter_init_early(c); - bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); - bch2_fs_btree_write_buffer_init_early(c); - bch2_fs_copygc_init(c); - bch2_fs_ec_init_early(c); - bch2_fs_journal_init_early(&c->journal); - bch2_fs_journal_keys_init(c); - bch2_fs_move_init(c); - bch2_fs_nocow_locking_init_early(c); - bch2_fs_quota_init(c); - bch2_fs_recovery_passes_init(c); - bch2_fs_sb_errors_init_early(c); - bch2_fs_snapshots_init_early(c); - bch2_fs_subvolumes_init_early(c); - - INIT_LIST_HEAD(&c->list); - - mutex_init(&c->bio_bounce_pages_lock); - mutex_init(&c->snapshot_table_lock); - init_rwsem(&c->snapshot_create_lock); - - spin_lock_init(&c->btree_write_error_lock); - - INIT_LIST_HEAD(&c->journal_iters); - - INIT_LIST_HEAD(&c->fsck_error_msgs); - mutex_init(&c->fsck_error_msgs_lock); - - seqcount_init(&c->usage_lock); - - sema_init(&c->io_in_flight, 128); - - INIT_LIST_HEAD(&c->vfs_inodes_list); - mutex_init(&c->vfs_inodes_lock); - - c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; - c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; - c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; - - mutex_init(&c->sectors_available_lock); - - ret = percpu_init_rwsem(&c->mark_lock); - if (ret) - goto err; - - mutex_lock(&c->sb_lock); - ret = bch2_sb_to_fs(c, sb); - mutex_unlock(&c->sb_lock); - - if (ret) - goto err; - - /* Compat: */ - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && - !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) - SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && - !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) - SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); - - c->opts = bch2_opts_default; - ret = bch2_opts_from_sb(&c->opts, sb); - if (ret) - goto err; - - bch2_opts_apply(&c->opts, *opts); - - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - c->opts.block_size > PAGE_SIZE) { - bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); - ret = -EINVAL; - goto err; - } - - c->btree_key_cache_btrees |= 1U << 
BTREE_ID_alloc; - if (c->opts.inodes_use_key_cache) - c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; - c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops; - - c->block_bits = ilog2(block_sectors(c)); - c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); - - if (bch2_fs_init_fault("fs_alloc")) { - bch_err(c, "fs_alloc fault injected"); - ret = -EFAULT; - goto err; - } - - if (c->sb.multi_device) - pr_uuid(&name, c->sb.user_uuid.b); - else - prt_bdevname(&name, sbs->data[0].bdev); - - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; - if (ret) - goto err; - - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - - iter_size = sizeof(struct sort_iter) + - (btree_blocks(c) + 1) * 2 * - sizeof(struct sort_iter_set); - - if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 512)) || - enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, - bch2_writes_disabled) || - mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->btree_bio, 1, - max(offsetof(struct btree_read_bio, bio), - offsetof(struct btree_write_bio, wbio.bio)), - BIOSET_NEED_BVECS) || - !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || - !(c->usage = alloc_percpu(struct bch_fs_usage_base)) || - !(c->online_reserved = alloc_percpu(u64)) || - mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, - c->opts.btree_node_size) || - mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { - ret = bch_err_throw(c, ENOMEM_fs_other_alloc); - goto err; - } - - ret = - bch2_fs_async_obj_init(c) ?: - bch2_fs_btree_cache_init(c) ?: - bch2_fs_btree_iter_init(c) ?: - bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_buckets_waiting_for_journal_init(c) ?: - bch2_io_clock_init(&c->io_clock[READ]) ?: - bch2_io_clock_init(&c->io_clock[WRITE]) ?: - bch2_fs_compress_init(c) ?: - bch2_fs_counters_init(c) ?: - bch2_fs_ec_init(c) ?: - bch2_fs_encryption_init(c) ?: - bch2_fs_fsio_init(c) ?: - bch2_fs_fs_io_direct_init(c) ?: - bch2_fs_io_read_init(c) ?: - bch2_fs_rebalance_init(c) ?: - bch2_fs_sb_errors_init(c) ?: - bch2_fs_vfs_init(c); - if (ret) - goto err; - - if (go_rw_in_recovery(c)) { - /* - * start workqueues/kworkers early - kthread creation checks for - * pending signals, which is _very_ annoying - */ - ret = bch2_fs_init_rw(c); - if (ret) - goto err; - } - -#ifdef CONFIG_UNICODE - if (bch2_fs_casefold_enabled(c)) { - /* Default encoding until we can potentially have more as an option. */ - c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); - if (IS_ERR(c->cf_encoding)) { - printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. 
Version: %u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - ret = -EINVAL; - goto err; - } - } -#else - if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { - printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); - ret = -EINVAL; - goto err; - } -#endif - - for (i = 0; i < c->sb.nr_devices; i++) { - if (!bch2_member_exists(c->disk_sb.sb, i)) - continue; - ret = bch2_dev_alloc(c, i); - if (ret) - goto err; - } - - bch2_journal_entry_res_resize(&c->journal, - &c->btree_root_journal_res, - BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); - bch2_journal_entry_res_resize(&c->journal, - &c->clock_journal_res, - (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); - - mutex_lock(&bch_fs_list_lock); - ret = bch2_fs_online(c); - mutex_unlock(&bch_fs_list_lock); - - if (ret) - goto err; -out: - return c; -err: - bch2_fs_free(c); - c = ERR_PTR(ret); - goto out; -} - -noinline_for_stack -static void print_mount_opts(struct bch_fs *c) -{ - enum bch_opt_id i; - CLASS(printbuf, p)(); - bch2_log_msg_start(c, &p); - - prt_str(&p, "starting version "); - bch2_version_to_text(&p, c->sb.version); - - bool first = true; - for (i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - u64 v = bch2_opt_get_by_id(&c->opts, i); - - if (!(opt->flags & OPT_MOUNT)) - continue; - - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; - - prt_str(&p, first ? " opts=" : ","); - first = false; - bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); - } - - if (c->sb.version_incompat_allowed != c->sb.version) { - prt_printf(&p, "\nallowing incompatible features above "); - bch2_version_to_text(&p, c->sb.version_incompat_allowed); - } - - if (c->opts.verbose) { - prt_printf(&p, "\nfeatures: "); - prt_bitflags(&p, bch2_sb_features, c->sb.features); - } - - if (c->sb.multi_device) { - prt_printf(&p, "\nwith devices"); - for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { - prt_char(&p, ' '); - prt_str(&p, ca->name); - } - } - - bch2_print_str(c, KERN_INFO, p.buf); -} - -static bool bch2_fs_may_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned flags = 0; - - switch (c->opts.degraded) { - case BCH_DEGRADED_very: - flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - break; - case BCH_DEGRADED_yes: - flags |= BCH_FORCE_IF_DEGRADED; - break; - default: - mutex_lock(&c->sb_lock); - for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_member_exists(c->disk_sb.sb, i)) - continue; - - ca = bch2_dev_locked(c, i); - - if (!bch2_dev_is_online(ca) && - (ca->mi.state == BCH_MEMBER_STATE_rw || - ca->mi.state == BCH_MEMBER_STATE_ro)) { - mutex_unlock(&c->sb_lock); - return false; - } - } - mutex_unlock(&c->sb_lock); - break; - } - - return bch2_have_enough_devs(c, c->online_devs, flags, true); -} - -int bch2_fs_start(struct bch_fs *c) -{ - time64_t now = ktime_get_real_seconds(); - int ret = 0; - - print_mount_opts(c); - - if (c->cf_encoding) - bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - - if (!bch2_fs_may_start(c)) - return bch_err_throw(c, insufficient_devices_to_start); - - down_write(&c->state_lock); - mutex_lock(&c->sb_lock); - - BUG_ON(test_bit(BCH_FS_started, &c->flags)); - - if 
(!bch2_sb_field_get_minsize(&c->disk_sb, ext, - sizeof(struct bch_sb_field_ext) / sizeof(u64))) { - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - ret = bch_err_throw(c, ENOSPC_sb); - goto err; - } - - ret = bch2_sb_members_v2_init(c); - if (ret) { - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - goto err; - } - - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = - cpu_to_le64(now); - - /* - * Don't write superblock yet: recovery might have to downgrade - */ - mutex_unlock(&c->sb_lock); - - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) - if (ca->mi.state == BCH_MEMBER_STATE_rw) - bch2_dev_allocator_add(c, ca); - bch2_recalc_capacity(c); - up_write(&c->state_lock); - - c->recovery_task = current; - ret = BCH_SB_INITIALIZED(c->disk_sb.sb) - ? bch2_fs_recovery(c) - : bch2_fs_initialize(c); - c->recovery_task = NULL; - - if (ret) - goto err; - - ret = bch2_opts_hooks_pre_set(c); - if (ret) - goto err; - - if (bch2_fs_init_fault("fs_start")) { - ret = bch_err_throw(c, injected_fs_start); - goto err; - } - - set_bit(BCH_FS_started, &c->flags); - wake_up(&c->ro_ref_wait); - - down_write(&c->state_lock); - if (c->opts.read_only) - bch2_fs_read_only(c); - else if (!test_bit(BCH_FS_rw, &c->flags)) - ret = bch2_fs_read_write(c); - up_write(&c->state_lock); - -err: - if (ret) - bch_err_msg(c, ret, "starting filesystem"); - else - bch_verbose(c, "done starting filesystem"); - return ret; -} - -static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) -{ - struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); - - if (le16_to_cpu(sb->block_size) != block_sectors(c)) - return bch_err_throw(c, mismatched_block_size); - - if (le16_to_cpu(m.bucket_size) < - BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) - return bch_err_throw(c, bucket_size_too_small); - - return 0; -} - -static int bch2_dev_in_fs(struct bch_sb_handle *fs, - struct bch_sb_handle *sb, - struct bch_opts *opts) -{ - if (fs == sb) - return 0; - - if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) - return -BCH_ERR_device_not_a_member_of_filesystem; - - if (!bch2_member_exists(fs->sb, sb->sb->dev_idx)) - return -BCH_ERR_device_has_been_removed; - - if (fs->sb->block_size != sb->sb->block_size) - return -BCH_ERR_mismatched_block_size; - - if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq || - le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq) - return 0; - - if (fs->sb->seq == sb->sb->seq && - fs->sb->write_time != sb->sb->write_time) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Split brain detected between "); - prt_bdevname(&buf, sb->bdev); - prt_str(&buf, " and "); - prt_bdevname(&buf, fs->bdev); - prt_char(&buf, ':'); - prt_newline(&buf); - prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); - prt_newline(&buf); - - prt_bdevname(&buf, fs->bdev); - prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); - prt_newline(&buf); - - prt_bdevname(&buf, sb->bdev); - prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); - prt_newline(&buf); - - if (!opts->no_splitbrain_check) - prt_printf(&buf, "Not using older sb"); - - pr_err("%s", buf.buf); - printbuf_exit(&buf); - - if (!opts->no_splitbrain_check) - return -BCH_ERR_device_splitbrain; - } - - struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); - u64 seq_from_fs = le64_to_cpu(m.seq); - u64 seq_from_member = le64_to_cpu(sb->sb->seq); - - if (seq_from_fs 
&& seq_from_fs < seq_from_member) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Split brain detected between "); - prt_bdevname(&buf, sb->bdev); - prt_str(&buf, " and "); - prt_bdevname(&buf, fs->bdev); - prt_char(&buf, ':'); - prt_newline(&buf); - - prt_bdevname(&buf, fs->bdev); - prt_str(&buf, " believes seq of "); - prt_bdevname(&buf, sb->bdev); - prt_printf(&buf, " to be %llu, but ", seq_from_fs); - prt_bdevname(&buf, sb->bdev); - prt_printf(&buf, " has %llu\n", seq_from_member); - - if (!opts->no_splitbrain_check) { - prt_str(&buf, "Not using "); - prt_bdevname(&buf, sb->bdev); - } - - pr_err("%s", buf.buf); - printbuf_exit(&buf); - - if (!opts->no_splitbrain_check) - return -BCH_ERR_device_splitbrain; - } - - return 0; -} - -/* Device startup/shutdown: */ - -static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) -{ - if (rw == READ) - clear_bit(ca->dev_idx, ca->fs->online_devs.d); - - if (!enumerated_ref_is_zero(&ca->io_ref[rw])) - enumerated_ref_stop(&ca->io_ref[rw], - rw == READ - ? bch2_dev_read_refs - : bch2_dev_write_refs); -} - -static void bch2_dev_release(struct kobject *kobj) -{ - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); - - kfree(ca); -} - -static void bch2_dev_free(struct bch_dev *ca) -{ - WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); - WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); - - cancel_work_sync(&ca->io_error_work); - - bch2_dev_unlink(ca); - - if (ca->kobj.state_in_sysfs) - kobject_del(&ca->kobj); - - bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); - bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); - - bch2_free_super(&ca->disk_sb); - bch2_dev_allocator_background_exit(ca); - bch2_dev_journal_exit(ca); - - free_percpu(ca->io_done); - bch2_dev_buckets_free(ca); - kfree(ca->sb_read_scratch); - - bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); - bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); - - enumerated_ref_exit(&ca->io_ref[WRITE]); - enumerated_ref_exit(&ca->io_ref[READ]); -#ifndef CONFIG_BCACHEFS_DEBUG - percpu_ref_exit(&ca->ref); -#endif - kobject_put(&ca->kobj); -} - -static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) -{ - - lockdep_assert_held(&c->state_lock); - - if (enumerated_ref_is_zero(&ca->io_ref[READ])) - return; - - __bch2_dev_read_only(c, ca); - - bch2_dev_io_ref_stop(ca, READ); - - bch2_dev_unlink(ca); - - bch2_free_super(&ca->disk_sb); - bch2_dev_journal_exit(ca); -} - -#ifndef CONFIG_BCACHEFS_DEBUG -static void bch2_dev_ref_complete(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, ref); - - complete(&ca->ref_completion); -} -#endif - -static void bch2_dev_unlink(struct bch_dev *ca) -{ - struct kobject *b; - - /* - * This is racy w.r.t. the underlying block device being hot-removed, - * which removes it from sysfs. 
- * - * It'd be lovely if we had a way to handle this race, but the sysfs - * code doesn't appear to provide a good method and block/holder.c is - * susceptible as well: - */ - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev && - (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { - sysfs_remove_link(b, "bcachefs"); - sysfs_remove_link(&ca->kobj, "block"); - } -} - -static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) -{ - int ret; - - if (!c->kobj.state_in_sysfs) - return 0; - - if (!ca->kobj.state_in_sysfs) { - ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?: - bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE); - if (ret) - return ret; - } - - if (ca->disk_sb.bdev) { - struct kobject *block = bdev_kobj(ca->disk_sb.bdev); - - ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); - if (ret) - return ret; - - ret = sysfs_create_link(&ca->kobj, block, "block"); - if (ret) - return ret; - } - - return 0; -} - -static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, - struct bch_member *member) -{ - struct bch_dev *ca; - unsigned i; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - return NULL; - - kobject_init(&ca->kobj, &bch2_dev_ktype); - init_completion(&ca->ref_completion); - - INIT_WORK(&ca->io_error_work, bch2_io_error_work); - - bch2_time_stats_quantiles_init(&ca->io_latency[READ]); - bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); - - ca->mi = bch2_mi_to_cpu(member); - - for (i = 0; i < ARRAY_SIZE(member->errors); i++) - atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); - - ca->uuid = member->uuid; - - ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / btree_sectors(c)); - -#ifndef CONFIG_BCACHEFS_DEBUG - if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL)) - goto err; -#else - atomic_long_set(&ca->ref, 1); -#endif - - mutex_init(&ca->bucket_backpointer_mismatch.lock); - mutex_init(&ca->bucket_backpointer_empty.lock); - - bch2_dev_allocator_background_init(ca); - - if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) || - enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) || - !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || - bch2_dev_buckets_alloc(c, ca) || - !(ca->io_done = alloc_percpu(*ca->io_done))) - goto err; - - return ca; -err: - bch2_dev_free(ca); - return NULL; -} - -static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, - unsigned dev_idx) -{ - ca->dev_idx = dev_idx; - __set_bit(ca->dev_idx, ca->self.d); - - if (!ca->name[0]) - scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); - - ca->fs = c; - rcu_assign_pointer(c->devs[ca->dev_idx], ca); - - if (bch2_dev_sysfs_online(c, ca)) - pr_warn("error creating sysfs objects"); -} - -static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) -{ - struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - struct bch_dev *ca = NULL; - - if (bch2_fs_init_fault("dev_alloc")) - goto err; - - ca = __bch2_dev_alloc(c, &member); - if (!ca) - goto err; - - ca->fs = c; - - bch2_dev_attach(c, ca, dev_idx); - return 0; -err: - return bch_err_throw(c, ENOMEM_dev_alloc); -} - -static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) -{ - unsigned ret; - - if (bch2_dev_is_online(ca)) { - bch_err(ca, "already have device online in slot %u", - sb->sb->dev_idx); - return bch_err_throw(ca->fs, device_already_online); - } - - if (get_capacity(sb->bdev->bd_disk) < - ca->mi.bucket_size * ca->mi.nbuckets) { - bch_err(ca, 
"cannot online: device too small"); - return bch_err_throw(ca->fs, device_size_too_small); - } - - BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); - BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); - - ret = bch2_dev_journal_init(ca, sb->sb); - if (ret) - return ret; - - struct printbuf name = PRINTBUF; - prt_bdevname(&name, sb->bdev); - strscpy(ca->name, name.buf, sizeof(ca->name)); - printbuf_exit(&name); - - /* Commit: */ - ca->disk_sb = *sb; - memset(sb, 0, sizeof(*sb)); - - /* - * Stash pointer to the filesystem for blk_holder_ops - note that once - * attached to a filesystem, we will always close the block device - * before tearing down the filesystem object. - */ - ca->disk_sb.holder->c = ca->fs; - - ca->dev = ca->disk_sb.bdev->bd_dev; - - enumerated_ref_start(&ca->io_ref[READ]); - - return 0; -} - -static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) -{ - struct bch_dev *ca; - int ret; - - lockdep_assert_held(&c->state_lock); - - if (le64_to_cpu(sb->sb->seq) > - le64_to_cpu(c->disk_sb.sb->seq)) - bch2_sb_to_fs(c, sb->sb); - - BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); - - ca = bch2_dev_locked(c, sb->sb->dev_idx); - - ret = __bch2_dev_attach_bdev(ca, sb); - if (ret) - return ret; - - set_bit(ca->dev_idx, c->online_devs.d); - - bch2_dev_sysfs_online(c, ca); - - bch2_rebalance_wakeup(c); - return 0; -} - -/* Device management: */ - -/* - * Note: this function is also used by the error paths - when a particular - * device sees an error, we call it to determine whether we can just set the - * device RO, or - if this function returns false - we'll set the whole - * filesystem RO: - * - * XXX: maybe we should be more explicit about whether we're changing state - * because we got an error or what have you? - */ -bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - struct bch_devs_mask new_online_devs; - int nr_rw = 0, required; - - lockdep_assert_held(&c->state_lock); - - switch (new_state) { - case BCH_MEMBER_STATE_rw: - return true; - case BCH_MEMBER_STATE_ro: - if (ca->mi.state != BCH_MEMBER_STATE_rw) - return true; - - /* do we have enough devices to write to? */ - for_each_member_device(c, ca2) - if (ca2 != ca) - nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; - - required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) - ? c->opts.metadata_replicas - : metadata_replicas_required(c), - !(flags & BCH_FORCE_IF_DATA_DEGRADED) - ? c->opts.data_replicas - : data_replicas_required(c)); - - return nr_rw >= required; - case BCH_MEMBER_STATE_failed: - case BCH_MEMBER_STATE_spare: - if (ca->mi.state != BCH_MEMBER_STATE_rw && - ca->mi.state != BCH_MEMBER_STATE_ro) - return true; - - /* do we have enough devices to read from? 
*/ - new_online_devs = c->online_devs; - __clear_bit(ca->dev_idx, new_online_devs.d); - - return bch2_have_enough_devs(c, new_online_devs, flags, false); - default: - BUG(); - } -} - -static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) -{ - bch2_dev_io_ref_stop(ca, WRITE); - - /* - * The allocator thread itself allocates btree nodes, so stop it first: - */ - bch2_dev_allocator_remove(c, ca); - bch2_recalc_capacity(c); - bch2_dev_journal_stop(&c->journal, ca); -} - -static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) -{ - lockdep_assert_held(&c->state_lock); - - BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); - - bch2_dev_allocator_add(c, ca); - bch2_recalc_capacity(c); - - if (enumerated_ref_is_zero(&ca->io_ref[WRITE])) - enumerated_ref_start(&ca->io_ref[WRITE]); - - bch2_dev_do_discards(ca); -} - -int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - struct bch_member *m; - int ret = 0; - - if (ca->mi.state == new_state) - return 0; - - if (!bch2_dev_state_allowed(c, ca, new_state, flags)) - return bch_err_throw(c, device_state_not_allowed); - - if (new_state != BCH_MEMBER_STATE_rw) - __bch2_dev_read_only(c, ca); - - bch_notice(ca, "%s", bch2_member_states[new_state]); - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_STATE(m, new_state); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (new_state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - - bch2_rebalance_wakeup(c); - - return ret; -} - -int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - int ret; - - down_write(&c->state_lock); - ret = __bch2_dev_set_state(c, ca, new_state, flags); - up_write(&c->state_lock); - - return ret; -} - -/* Device add/removal: */ - -int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) -{ - struct bch_member *m; - unsigned dev_idx = ca->dev_idx, data; - bool fast_device_removal = !bch2_request_incompat_feature(c, - bcachefs_metadata_version_fast_device_removal); - int ret; - - down_write(&c->state_lock); - - /* - * We consume a reference to ca->ref, regardless of whether we succeed - * or fail: - */ - bch2_dev_put(ca); - - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { - bch_err(ca, "Cannot remove without losing data"); - ret = bch_err_throw(c, device_state_not_allowed); - goto err; - } - - __bch2_dev_read_only(c, ca); - - ret = fast_device_removal - ? 
bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) - : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: - bch2_dev_remove_stripes(c, ca->dev_idx, flags)); - if (ret) - goto err; - - /* Check if device still has data before blowing away alloc info */ - struct bch_dev_usage usage = bch2_dev_usage_read(ca); - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (!data_type_is_empty(i) && - !data_type_is_hidden(i) && - usage.buckets[i]) { - bch_err(ca, "Remove failed: still has data (%s, %llu buckets)", - __bch2_data_types[i], usage.buckets[i]); - ret = -EBUSY; - goto err; - } - - ret = bch2_dev_remove_alloc(c, ca); - bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); - if (ret) - goto err; - - /* - * We need to flush the entire journal to get rid of keys that reference - * the device being removed before removing the superblock entry - */ - bch2_journal_flush_all_pins(&c->journal); - - /* - * this is really just needed for the bch2_replicas_gc_(start|end) - * calls, and could be cleaned up: - */ - ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); - if (ret) - goto err; - - ret = bch2_journal_flush(&c->journal); - bch_err_msg(ca, ret, "bch2_journal_flush()"); - if (ret) - goto err; - - ret = bch2_replicas_gc2(c); - bch_err_msg(ca, ret, "bch2_replicas_gc2()"); - if (ret) - goto err; - - data = bch2_dev_has_data(c, ca); - if (data) { - struct printbuf data_has = PRINTBUF; - - prt_bitflags(&data_has, __bch2_data_types, data); - bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); - printbuf_exit(&data_has); - ret = -EBUSY; - goto err; - } - - __bch2_dev_offline(c, ca); - - mutex_lock(&c->sb_lock); - rcu_assign_pointer(c->devs[ca->dev_idx], NULL); - mutex_unlock(&c->sb_lock); - -#ifndef CONFIG_BCACHEFS_DEBUG - percpu_ref_kill(&ca->ref); -#else - ca->dying = true; - bch2_dev_put(ca); -#endif - wait_for_completion(&ca->ref_completion); - - bch2_dev_free(ca); - - /* - * Free this device's slot in the bch_member array - all pointers to - * this device must be gone: - */ - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); - - if (fast_device_removal) - m->uuid = BCH_SB_MEMBER_DELETED_UUID; - else - memset(&m->uuid, 0, sizeof(m->uuid)); - - bch2_write_super(c); - - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - return 0; -err: - if (test_bit(BCH_FS_rw, &c->flags) && - ca->mi.state == BCH_MEMBER_STATE_rw && - !enumerated_ref_is_zero(&ca->io_ref[READ])) - __bch2_dev_read_write(c, ca); - up_write(&c->state_lock); - return ret; -} - -/* Add new device to running filesystem: */ -int bch2_dev_add(struct bch_fs *c, const char *path) -{ - struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb = {}; - struct bch_dev *ca = NULL; - struct printbuf errbuf = PRINTBUF; - struct printbuf label = PRINTBUF; - int ret = 0; - - ret = bch2_read_super(path, &opts, &sb); - bch_err_msg(c, ret, "reading super"); - if (ret) - goto err; - - struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); - - if (BCH_MEMBER_GROUP(&dev_mi)) { - bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); - if (label.allocation_failure) { - ret = -ENOMEM; - goto err; - } - } - - if (list_empty(&c->list)) { - mutex_lock(&bch_fs_list_lock); - if (__bch2_uuid_to_fs(c->sb.uuid)) - ret = bch_err_throw(c, filesystem_uuid_already_open); - else - list_add(&c->list, &bch_fs_list); - mutex_unlock(&bch_fs_list_lock); - - if (ret) { - bch_err(c, "filesystem UUID already open"); - goto err; - } 
- } - - ret = bch2_dev_may_add(sb.sb, c); - if (ret) - goto err; - - ca = __bch2_dev_alloc(c, &dev_mi); - if (!ca) { - ret = -ENOMEM; - goto err; - } - - ret = __bch2_dev_attach_bdev(ca, &sb); - if (ret) - goto err; - - down_write(&c->state_lock); - mutex_lock(&c->sb_lock); - SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); - - ret = bch2_sb_from_fs(c, ca); - bch_err_msg(c, ret, "setting up new superblock"); - if (ret) - goto err_unlock; - - if (dynamic_fault("bcachefs:add:no_slot")) - goto err_unlock; - - ret = bch2_sb_member_alloc(c); - if (ret < 0) { - bch_err_msg(c, ret, "setting up new superblock"); - goto err_unlock; - } - unsigned dev_idx = ret; - ret = 0; - - /* success: */ - - dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); - *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; - - ca->disk_sb.sb->dev_idx = dev_idx; - bch2_dev_attach(c, ca, dev_idx); - - if (BCH_MEMBER_GROUP(&dev_mi)) { - ret = __bch2_dev_group_set(c, ca, label.buf); - bch_err_msg(c, ret, "creating new label"); - if (ret) - goto err_unlock; - } - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (test_bit(BCH_FS_started, &c->flags)) { - ret = bch2_dev_usage_init(ca, false); - if (ret) - goto err_late; - - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - bch_err_msg(ca, ret, "marking new superblock"); - if (ret) - goto err_late; - - ret = bch2_fs_freespace_init(c); - bch_err_msg(ca, ret, "initializing free space"); - if (ret) - goto err_late; - - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - - ret = bch2_dev_journal_alloc(ca, false); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err_late; - } - - /* - * We just changed the superblock UUID, invalidate cache and send a - * uevent to update /dev/disk/by-uuid - */ - invalidate_bdev(ca->disk_sb.bdev); - - char uuid_str[37]; - snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid); - - char *envp[] = { - "CHANGE=uuid", - uuid_str, - NULL, - }; - kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp); - - up_write(&c->state_lock); -out: - printbuf_exit(&label); - printbuf_exit(&errbuf); - bch_err_fn(c, ret); - return ret; - -err_unlock: - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); -err: - if (ca) - bch2_dev_free(ca); - bch2_free_super(&sb); - goto out; -err_late: - up_write(&c->state_lock); - ca = NULL; - goto err; -} - -/* Hot add existing device to running filesystem: */ -int bch2_dev_online(struct bch_fs *c, const char *path) -{ - struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb = { NULL }; - struct bch_dev *ca; - unsigned dev_idx; - int ret; - - down_write(&c->state_lock); - - ret = bch2_read_super(path, &opts, &sb); - if (ret) { - up_write(&c->state_lock); - return ret; - } - - dev_idx = sb.sb->dev_idx; - - ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); - bch_err_msg(c, ret, "bringing %s online", path); - if (ret) - goto err; - - ret = bch2_dev_attach_bdev(c, &sb); - if (ret) - goto err; - - ca = bch2_dev_locked(c, dev_idx); - - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); - if (ret) - goto err; - - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - - if (!ca->mi.freespace_initialized) { - ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); - bch_err_msg(ca, ret, "initializing free space"); - if (ret) - goto err; - } - - if (!ca->journal.nr) { - ret = bch2_dev_journal_alloc(ca, false); - 
bch_err_msg(ca, ret, "allocating journal"); - if (ret) - goto err; - } - - mutex_lock(&c->sb_lock); - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = - cpu_to_le64(ktime_get_real_seconds()); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - up_write(&c->state_lock); - return 0; -err: - up_write(&c->state_lock); - bch2_free_super(&sb); - return ret; -} - -int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) -{ - down_write(&c->state_lock); - - if (!bch2_dev_is_online(ca)) { - bch_err(ca, "Already offline"); - up_write(&c->state_lock); - return 0; - } - - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { - bch_err(ca, "Cannot offline required disk"); - up_write(&c->state_lock); - return bch_err_throw(c, device_state_not_allowed); - } - - __bch2_dev_offline(c, ca); - - up_write(&c->state_lock); - return 0; -} - -static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets) -{ - struct bch_fs *c = ca->fs; - u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 }; - - return bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod2(trans, false, v, dev_data_type, - .dev = ca->dev_idx, - .data_type = BCH_DATA_free)) ?: - bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets); -} - -int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -{ - struct bch_member *m; - u64 old_nbuckets; - int ret = 0; - - down_write(&c->state_lock); - old_nbuckets = ca->mi.nbuckets; - - if (nbuckets < ca->mi.nbuckets) { - bch_err(ca, "Cannot shrink yet"); - ret = -EINVAL; - goto err; - } - - if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { - bch_err(ca, "New device size too big (%llu greater than max %u)", - nbuckets, BCH_MEMBER_NBUCKETS_MAX); - ret = bch_err_throw(c, device_size_too_big); - goto err; - } - - if (bch2_dev_is_online(ca) && - get_capacity(ca->disk_sb.bdev->bd_disk) < - ca->mi.bucket_size * nbuckets) { - bch_err(ca, "New size larger than device"); - ret = bch_err_throw(c, device_size_too_small); - goto err; - } - - ret = bch2_dev_buckets_resize(c, ca, nbuckets); - bch_err_msg(ca, ret, "resizing buckets"); - if (ret) - goto err; - - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - if (ret) - goto err; - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - m->nbuckets = cpu_to_le64(nbuckets); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (ca->mi.freespace_initialized) { - ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); - if (ret) - goto err; - } - - bch2_recalc_capacity(c); -err: - up_write(&c->state_lock); - return ret; -} - -int bch2_fs_resize_on_mount(struct bch_fs *c) -{ - for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) { - u64 old_nbuckets = ca->mi.nbuckets; - u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), - ca->mi.bucket_size); - - if (ca->mi.resize_on_mount && - new_nbuckets > ca->mi.nbuckets) { - bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size); - int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); - bch_err_fn(ca, ret); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_fs_resize_on_mount); - up_write(&c->state_lock); - return ret; - } - - mutex_lock(&c->sb_lock); - struct bch_member *m = - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - m->nbuckets = cpu_to_le64(new_nbuckets); - SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false); - - c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image)); - 
bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (ca->mi.freespace_initialized) { - ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_fs_resize_on_mount); - up_write(&c->state_lock); - return ret; - } - } - } - } - return 0; -} - -/* return with ref on ca->ref: */ -struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) -{ - if (!strncmp(name, "/dev/", strlen("/dev/"))) - name += strlen("/dev/"); - - for_each_member_device(c, ca) - if (!strcmp(name, ca->name)) - return ca; - return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); -} - -/* blk_holder_ops: */ - -static struct bch_fs *bdev_get_fs(struct block_device *bdev) - __releases(&bdev->bd_holder_lock) -{ - struct bch_sb_handle_holder *holder = bdev->bd_holder; - struct bch_fs *c = holder->c; - - if (c && !bch2_ro_ref_tryget(c)) - c = NULL; - - mutex_unlock(&bdev->bd_holder_lock); - - if (c) - wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags)); - return c; -} - -/* returns with ref on ca->ref */ -static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) -{ - for_each_member_device(c, ca) - if (ca->disk_sb.bdev == bdev) - return ca; - return NULL; -} - -static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) -{ - struct bch_fs *c = bdev_get_fs(bdev); - if (!c) - return; - - struct super_block *sb = c->vfs_sb; - if (sb) { - /* - * Not necessary, c->ro_ref guards against the filesystem being - * unmounted - we only take this to avoid a warning in - * sync_filesystem: - */ - down_read(&sb->s_umount); - } - - down_write(&c->state_lock); - struct bch_dev *ca = bdev_to_bch_dev(c, bdev); - if (!ca) - goto unlock; - - bool dev = bch2_dev_state_allowed(c, ca, - BCH_MEMBER_STATE_failed, - BCH_FORCE_IF_DEGRADED); - - if (!dev && sb) { - if (!surprise) - sync_filesystem(sb); - shrink_dcache_sb(sb); - evict_inodes(sb); - } - - struct printbuf buf = PRINTBUF; - __bch2_log_msg_start(ca->name, &buf); - - prt_printf(&buf, "offline from block layer"); - - if (dev) { - __bch2_dev_offline(c, ca); - } else { - bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only2(c, &buf); - } - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - bch2_dev_put(ca); -unlock: - if (sb) - up_read(&sb->s_umount); - up_write(&c->state_lock); - bch2_ro_ref_put(c); -} - -static void bch2_fs_bdev_sync(struct block_device *bdev) -{ - struct bch_fs *c = bdev_get_fs(bdev); - if (!c) - return; - - struct super_block *sb = c->vfs_sb; - if (sb) { - /* - * Not necessary, c->ro_ref guards against the filesystem being - * unmounted - we only take this to avoid a warning in - * sync_filesystem: - */ - down_read(&sb->s_umount); - sync_filesystem(sb); - up_read(&sb->s_umount); - } - - bch2_ro_ref_put(c); -} - -const struct blk_holder_ops bch2_sb_handle_bdev_ops = { - .mark_dead = bch2_fs_bdev_mark_dead, - .sync = bch2_fs_bdev_sync, -}; - -/* Filesystem open: */ - -static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) -{ - return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: - cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); -} - -struct bch_fs *bch2_fs_open(darray_const_str *devices, - struct bch_opts *opts) -{ - bch_sb_handles sbs = {}; - struct bch_fs *c = NULL; - struct bch_sb_handle *best = NULL; - struct printbuf errbuf = PRINTBUF; - int ret = 0; - - if (!try_module_get(THIS_MODULE)) - return ERR_PTR(-ENODEV); - - if (!devices->nr) { - ret = -EINVAL; - goto err; - } - - ret = 
darray_make_room(&sbs, devices->nr); - if (ret) - goto err; - - darray_for_each(*devices, i) { - struct bch_sb_handle sb = { NULL }; - - ret = bch2_read_super(*i, opts, &sb); - if (ret) - goto err; - - BUG_ON(darray_push(&sbs, sb)); - } - - if (opts->nochanges && !opts->read_only) { - ret = bch_err_throw(c, erofs_nochanges); - goto err_print; - } - - darray_for_each(sbs, sb) - if (!best || sb_cmp(sb->sb, best->sb) > 0) - best = sb; - - darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb, opts); - - if (ret == -BCH_ERR_device_has_been_removed || - ret == -BCH_ERR_device_splitbrain) { - bch2_free_super(sb); - darray_remove_item(&sbs, sb); - best -= best > sb; - ret = 0; - continue; - } - - if (ret) - goto err_print; - } - - c = bch2_fs_alloc(best->sb, opts, &sbs); - ret = PTR_ERR_OR_ZERO(c); - if (ret) - goto err; - - down_write(&c->state_lock); - darray_for_each(sbs, sb) { - ret = bch2_dev_attach_bdev(c, sb); - if (ret) { - up_write(&c->state_lock); - goto err; - } - } - up_write(&c->state_lock); - - if (!c->opts.nostart) { - ret = bch2_fs_start(c); - if (ret) - goto err; - } -out: - darray_for_each(sbs, sb) - bch2_free_super(sb); - darray_exit(&sbs); - printbuf_exit(&errbuf); - module_put(THIS_MODULE); - return c; -err_print: - pr_err("bch_fs_open err opening %s: %s", - devices->data[0], bch2_err_str(ret)); -err: - if (!IS_ERR_OR_NULL(c)) - bch2_fs_stop(c); - c = ERR_PTR(ret); - goto out; -} - -/* Global interfaces/init */ - -static void bcachefs_exit(void) -{ - bch2_debug_exit(); - bch2_vfs_exit(); - bch2_chardev_exit(); - bch2_btree_key_cache_exit(); - if (bcachefs_kset) - kset_unregister(bcachefs_kset); -} - -static int __init bcachefs_init(void) -{ - bch2_bkey_pack_test(); - - if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || - bch2_btree_key_cache_init() || - bch2_chardev_init() || - bch2_vfs_init() || - bch2_debug_init()) - goto err; - - return 0; -err: - bcachefs_exit(); - return -ENOMEM; -} - -#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name); -BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - -static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp) -{ - /* Match bool exactly, by re-using it. */ - struct static_key *key = kp->arg; - struct kernel_param boolkp = *kp; - bool v; - int ret; - - boolkp.arg = &v; - - ret = param_set_bool(val, &boolkp); - if (ret) - return ret; - if (v) - static_key_enable(key); - else - static_key_disable(key); - return 0; -} - -static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp) -{ - struct static_key *key = kp->arg; - return sprintf(buffer, "%c\n", static_key_enabled(key) ? 
'Y' : 'N');
-}
-
-static const struct kernel_param_ops bch2_param_ops_static_key_t = {
- .flags = KERNEL_PARAM_OPS_FL_NOARG,
- .set = bch2_param_set_static_key_t,
- .get = bch2_param_get_static_key_t,
-};
-
-#define BCH_DEBUG_PARAM(name, description) \
- module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\
- __MODULE_PARM_TYPE(name, "static_key_t"); \
- MODULE_PARM_DESC(name, description);
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-__maybe_unused
-static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
-module_param_named(version, bch2_metadata_version, uint, 0444);
-
-module_exit(bcachefs_exit);
-module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
deleted file mode 100644
index e90bab9afe78bd..00000000000000
--- a/fs/bcachefs/super.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_H
-#define _BCACHEFS_SUPER_H
-
-#include "extents.h"
-
-#include "bcachefs_ioctl.h"
-
-#include 
-
-extern const char * const bch2_fs_flag_strs[];
-extern const char * const bch2_write_refs[];
-extern const char * const bch2_dev_read_refs[];
-extern const char * const bch2_dev_write_refs[];
-
-struct bch_fs *bch2_dev_to_fs(dev_t);
-struct bch_fs *bch2_uuid_to_fs(__uuid_t);
-
-bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-
-int bch2_dev_fail(struct bch_dev *, int);
-int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_add(struct bch_fs *, const char *);
-int bch2_dev_online(struct bch_fs *, const char *);
-int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
-struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
-
-bool bch2_fs_emergency_read_only(struct bch_fs *);
-bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *);
-
-bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
-void bch2_fs_read_only(struct bch_fs *);
-
-int bch2_fs_read_write(struct bch_fs *);
-int bch2_fs_read_write_early(struct bch_fs *);
-
-int bch2_fs_resize_on_mount(struct bch_fs *);
-
-void __bch2_fs_stop(struct bch_fs *);
-void bch2_fs_free(struct bch_fs *);
-void bch2_fs_stop(struct bch_fs *);
-
-int bch2_fs_init_rw(struct bch_fs *);
-int bch2_fs_start(struct bch_fs *);
-struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *);
-
-extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
-
-#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
deleted file mode 100644
index 3a899f799d1d18..00000000000000
--- a/fs/bcachefs/super_types.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_TYPES_H
-#define _BCACHEFS_SUPER_TYPES_H
-
-struct bch_fs;
-
-struct bch_sb_handle_holder {
- struct bch_fs *c;
-};
-
-struct bch_sb_handle {
- struct bch_sb *sb;
- struct file *s_bdev_file;
- struct block_device *bdev;
- char *sb_name;
- struct bio *bio;
- struct bch_sb_handle_holder *holder;
- size_t buffer_size;
- blk_mode_t mode;
- unsigned have_layout:1;
- unsigned have_bio:1;
- unsigned fs_sb:1;
- u64 seq;
-};
-
-struct bch_devs_mask {
- unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
-};
-
-struct bch_devs_list {
- u8 nr;
- u8 data[BCH_BKEY_PTRS_MAX];
-};
-
-#endif /* 
_BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c deleted file mode 100644 index 05848375cea2a1..00000000000000 --- a/fs/bcachefs/sysfs.c +++ /dev/null @@ -1,914 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bcache sysfs interfaces - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. - */ - -#ifndef NO_BCACHEFS_SYSFS - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "sysfs.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "buckets.h" -#include "clock.h" -#include "compress.h" -#include "disk_accounting.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "inode.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "move.h" -#include "movinggc.h" -#include "nocow_locking.h" -#include "opts.h" -#include "rebalance.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-errors.h" -#include "super-io.h" -#include "tests.h" - -#include -#include -#include - -#include "util.h" - -#define SYSFS_OPS(type) \ -const struct sysfs_ops type ## _sysfs_ops = { \ - .show = type ## _show, \ - .store = type ## _store \ -} - -#define SHOW(fn) \ -static ssize_t fn ## _to_text(struct printbuf *, \ - struct kobject *, struct attribute *); \ - \ -static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ - char *buf) \ -{ \ - struct printbuf out = PRINTBUF; \ - ssize_t ret = fn ## _to_text(&out, kobj, attr); \ - \ - if (out.pos && out.buf[out.pos - 1] != '\n') \ - prt_newline(&out); \ - \ - if (!ret && out.allocation_failure) \ - ret = -ENOMEM; \ - \ - if (!ret) { \ - ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ - memcpy(buf, out.buf, ret); \ - } \ - printbuf_exit(&out); \ - return bch2_err_class(ret); \ -} \ - \ -static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ - struct attribute *attr) - -#define STORE(fn) \ -static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\ - const char *, size_t); \ - \ -static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ - const char *buf, size_t size) \ -{ \ - return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \ -} \ - \ -static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\ - const char *buf, size_t size) - -#define __sysfs_attribute(_name, _mode) \ - static struct attribute sysfs_##_name = \ - { .name = #_name, .mode = _mode } - -#define write_attribute(n) __sysfs_attribute(n, 0200) -#define read_attribute(n) __sysfs_attribute(n, 0444) -#define rw_attribute(n) __sysfs_attribute(n, 0644) - -#define sysfs_printf(file, fmt, ...) 
\ -do { \ - if (attr == &sysfs_ ## file) \ - prt_printf(out, fmt "\n", __VA_ARGS__); \ -} while (0) - -#define sysfs_print(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - snprint(out, var); \ -} while (0) - -#define sysfs_hprint(file, val) \ -do { \ - if (attr == &sysfs_ ## file) \ - prt_human_readable_s64(out, val); \ -} while (0) - -#define sysfs_strtoul(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoul_safe(buf, var) ?: (ssize_t) size; \ -} while (0) - -#define sysfs_strtoul_clamp(file, var, min, max) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoul_safe_clamp(buf, var, min, max) \ - ?: (ssize_t) size; \ -} while (0) - -#define strtoul_or_return(cp) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (_r) \ - return _r; \ - _v; \ -}) - -write_attribute(trigger_gc); -write_attribute(trigger_discards); -write_attribute(trigger_invalidates); -write_attribute(trigger_journal_commit); -write_attribute(trigger_journal_flush); -write_attribute(trigger_journal_writes); -write_attribute(trigger_btree_cache_shrink); -write_attribute(trigger_btree_key_cache_shrink); -write_attribute(trigger_btree_updates); -write_attribute(trigger_freelist_wakeup); -write_attribute(trigger_recalc_capacity); -write_attribute(trigger_delete_dead_snapshots); -write_attribute(trigger_emergency_read_only); -read_attribute(gc_gens_pos); - -read_attribute(uuid); -read_attribute(minor); -read_attribute(flags); -read_attribute(first_bucket); -read_attribute(nbuckets); -read_attribute(io_done); -read_attribute(io_errors); -write_attribute(io_errors_reset); - -read_attribute(io_latency_read); -read_attribute(io_latency_write); -read_attribute(io_latency_stats_read); -read_attribute(io_latency_stats_write); -read_attribute(congested); - -read_attribute(btree_write_stats); - -read_attribute(btree_cache_size); -read_attribute(compression_stats); -read_attribute(errors); -read_attribute(journal_debug); -read_attribute(btree_cache); -read_attribute(btree_key_cache); -read_attribute(btree_reserve_cache); -read_attribute(open_buckets); -read_attribute(open_buckets_partial); -read_attribute(nocow_lock_table); - -read_attribute(read_refs); -read_attribute(write_refs); - -read_attribute(internal_uuid); -read_attribute(disk_groups); - -read_attribute(has_data); -read_attribute(alloc_debug); -read_attribute(usage_base); - -#define x(t, n, ...) 
read_attribute(t); -BCH_PERSISTENT_COUNTERS() -#undef x - -rw_attribute(label); - -read_attribute(copy_gc_wait); - -sysfs_pd_controller_attribute(rebalance); -read_attribute(rebalance_status); -read_attribute(snapshot_delete_status); -read_attribute(recovery_status); - -read_attribute(new_stripes); - -read_attribute(io_timers_read); -read_attribute(io_timers_write); - -read_attribute(moving_ctxts); - -#ifdef CONFIG_BCACHEFS_TESTS -write_attribute(perf_test); -#endif /* CONFIG_BCACHEFS_TESTS */ - -#define x(_name) \ - static struct attribute sysfs_time_stat_##_name = \ - { .name = #_name, .mode = 0644 }; - BCH_TIME_STATS() -#undef x - -static size_t bch2_btree_cache_size(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - size_t ret = 0; - struct btree *b; - - mutex_lock(&bc->lock); - list_for_each_entry(b, &bc->live[0].list, list) - ret += btree_buf_bytes(b); - list_for_each_entry(b, &bc->live[1].list, list) - ret += btree_buf_bytes(b); - list_for_each_entry(b, &bc->freeable, list) - ret += btree_buf_bytes(b); - mutex_unlock(&bc->lock); - return ret; -} - -static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) -{ - prt_str(out, "type"); - printbuf_tabstop_push(out, 12); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 24); - prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); - - for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) { - struct disk_accounting_pos a; - disk_accounting_key_init(a, compression, .type = i); - struct bpos p = disk_accounting_pos_to_bpos(&a); - u64 v[3]; - bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v)); - - u64 nr_extents = v[0]; - u64 sectors_uncompressed = v[1]; - u64 sectors_compressed = v[2]; - - bch2_prt_compression_type(out, i); - prt_tab(out); - - prt_human_readable_u64(out, sectors_compressed << 9); - prt_tab_rjust(out); - - prt_human_readable_u64(out, sectors_uncompressed << 9); - prt_tab_rjust(out); - - prt_human_readable_u64(out, nr_extents - ? 
div64_u64(sectors_uncompressed << 9, nr_extents) - : 0); - prt_tab_rjust(out); - prt_newline(out); - } - - return 0; -} - -static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) -{ - bch2_btree_id_to_text(out, c->gc_gens_btree); - prt_printf(out, ": "); - bch2_bpos_to_text(out, c->gc_gens_pos); - prt_printf(out, "\n"); -} - -static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_fs_usage_base b = {}; - - acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64)); - - prt_printf(out, "hidden:\t\t%llu\n", b.hidden); - prt_printf(out, "btree:\t\t%llu\n", b.btree); - prt_printf(out, "data:\t\t%llu\n", b.data); - prt_printf(out, "cached:\t%llu\n", b.cached); - prt_printf(out, "reserved:\t\t%llu\n", b.reserved); - prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); -} - -SHOW(bch2_fs) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - sysfs_print(minor, c->minor); - sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); - - if (attr == &sysfs_flags) - prt_bitflags(out, bch2_fs_flag_strs, c->flags); - - sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); - - if (attr == &sysfs_btree_write_stats) - bch2_btree_write_stats_to_text(out, c); - - if (attr == &sysfs_gc_gens_pos) - bch2_gc_gens_pos_to_text(out, c); - - sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ - - if (attr == &sysfs_copy_gc_wait) - bch2_copygc_wait_to_text(out, c); - - if (attr == &sysfs_rebalance_status) - bch2_rebalance_status_to_text(out, c); - - if (attr == &sysfs_snapshot_delete_status) - bch2_snapshot_delete_status_to_text(out, c); - - if (attr == &sysfs_recovery_status) - bch2_recovery_pass_status_to_text(out, c); - - /* Debugging: */ - - if (attr == &sysfs_journal_debug) - bch2_journal_debug_to_text(out, &c->journal); - - if (attr == &sysfs_btree_cache) - bch2_btree_cache_to_text(out, &c->btree_cache); - - if (attr == &sysfs_btree_key_cache) - bch2_btree_key_cache_to_text(out, &c->btree_key_cache); - - if (attr == &sysfs_btree_reserve_cache) - bch2_btree_reserve_cache_to_text(out, c); - - if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c, NULL); - - if (attr == &sysfs_open_buckets_partial) - bch2_open_buckets_partial_to_text(out, c); - - if (attr == &sysfs_compression_stats) - bch2_compression_stats_to_text(out, c); - - if (attr == &sysfs_errors) - bch2_fs_errors_to_text(out, c); - - if (attr == &sysfs_new_stripes) - bch2_new_stripes_to_text(out, c); - - if (attr == &sysfs_io_timers_read) - bch2_io_timers_to_text(out, &c->io_clock[READ]); - - if (attr == &sysfs_io_timers_write) - bch2_io_timers_to_text(out, &c->io_clock[WRITE]); - - if (attr == &sysfs_moving_ctxts) - bch2_fs_moving_ctxts_to_text(out, c); - - if (attr == &sysfs_write_refs) - enumerated_ref_to_text(out, &c->writes, bch2_write_refs); - - if (attr == &sysfs_nocow_lock_table) - bch2_nocow_locks_to_text(out, &c->nocow_locks); - - if (attr == &sysfs_disk_groups) - bch2_disk_groups_to_text(out, c); - - if (attr == &sysfs_alloc_debug) - bch2_fs_alloc_debug_to_text(out, c); - - if (attr == &sysfs_usage_base) - bch2_fs_usage_base_to_text(out, c); - - return 0; -} - -STORE(bch2_fs) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - sysfs_pd_controller_store(rebalance, &c->rebalance.pd); - - /* Debugging: */ - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EPERM; - - /* Debugging: */ - - if (attr == &sysfs_trigger_btree_updates) - queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); - - if 
(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)) - return -EROFS; - - if (attr == &sysfs_trigger_btree_cache_shrink) { - struct btree_cache *bc = &c->btree_cache; - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = strtoul_or_return(buf); - bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc); - } - - if (attr == &sysfs_trigger_btree_key_cache_shrink) { - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = strtoul_or_return(buf); - c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); - } - - if (attr == &sysfs_trigger_gc) - bch2_gc_gens(c); - - if (attr == &sysfs_trigger_discards) - bch2_do_discards(c); - - if (attr == &sysfs_trigger_invalidates) - bch2_do_invalidates(c); - - if (attr == &sysfs_trigger_journal_commit) - bch2_journal_flush(&c->journal); - - if (attr == &sysfs_trigger_journal_flush) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_meta(&c->journal); - } - - if (attr == &sysfs_trigger_journal_writes) - bch2_journal_do_writes(&c->journal); - - if (attr == &sysfs_trigger_freelist_wakeup) - closure_wake_up(&c->freelist_wait); - - if (attr == &sysfs_trigger_recalc_capacity) { - down_read(&c->state_lock); - bch2_recalc_capacity(c); - up_read(&c->state_lock); - } - - if (attr == &sysfs_trigger_delete_dead_snapshots) - __bch2_delete_dead_snapshots(c); - - if (attr == &sysfs_trigger_emergency_read_only) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "shutdown by sysfs\n"); - bch2_fs_emergency_read_only2(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - -#ifdef CONFIG_BCACHEFS_TESTS - if (attr == &sysfs_perf_test) { - char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; - char *test = strsep(&p, " \t\n"); - char *nr_str = strsep(&p, " \t\n"); - char *threads_str = strsep(&p, " \t\n"); - unsigned threads; - u64 nr; - int ret = -EINVAL; - - if (threads_str && - !(ret = kstrtouint(threads_str, 10, &threads)) && - !(ret = bch2_strtoull_h(nr_str, &nr))) - ret = bch2_btree_perf_test(c, test, nr, threads); - kfree(tmp); - - if (ret) - size = ret; - } -#endif - enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); - return size; -} -SYSFS_OPS(bch2_fs); - -struct attribute *bch2_fs_files[] = { - &sysfs_minor, - &sysfs_btree_cache_size, - &sysfs_btree_write_stats, - - &sysfs_rebalance_status, - &sysfs_snapshot_delete_status, - &sysfs_recovery_status, - - &sysfs_compression_stats, - &sysfs_errors, - -#ifdef CONFIG_BCACHEFS_TESTS - &sysfs_perf_test, -#endif - NULL -}; - -/* counters dir */ - -SHOW(bch2_fs_counters) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); - u64 counter = 0; - u64 counter_since_mount = 0; - - printbuf_tabstop_push(out, 32); - - #define x(t, n, f, ...) \ - if (attr == &sysfs_##t) { \ - counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ - counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ - if (f & TYPE_SECTORS) { \ - counter <<= 9; \ - counter_since_mount <<= 9; \ - } \ - \ - prt_printf(out, "since mount:\t"); \ - (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\ - prt_human_readable_u64(out, counter_since_mount); \ - prt_newline(out); \ - \ - prt_printf(out, "since filesystem creation:\t"); \ - (f & TYPE_COUNTER) ? 
prt_u64(out, counter) : \ - prt_human_readable_u64(out, counter); \ - prt_newline(out); \ - } - BCH_PERSISTENT_COUNTERS() - #undef x - return 0; -} - -STORE(bch2_fs_counters) { - return 0; -} - -SYSFS_OPS(bch2_fs_counters); - -struct attribute *bch2_fs_counters_files[] = { -#define x(t, ...) \ - &sysfs_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - NULL -}; -/* internal dir - just a wrapper */ - -SHOW(bch2_fs_internal) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - - return bch2_fs_to_text(out, &c->kobj, attr); -} - -STORE(bch2_fs_internal) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - - return bch2_fs_store(&c->kobj, attr, buf, size); -} -SYSFS_OPS(bch2_fs_internal); - -struct attribute *bch2_fs_internal_files[] = { - &sysfs_flags, - &sysfs_journal_debug, - &sysfs_btree_cache, - &sysfs_btree_key_cache, - &sysfs_btree_reserve_cache, - &sysfs_new_stripes, - &sysfs_open_buckets, - &sysfs_open_buckets_partial, - &sysfs_write_refs, - &sysfs_nocow_lock_table, - &sysfs_io_timers_read, - &sysfs_io_timers_write, - - &sysfs_trigger_gc, - &sysfs_trigger_discards, - &sysfs_trigger_invalidates, - &sysfs_trigger_journal_commit, - &sysfs_trigger_journal_flush, - &sysfs_trigger_journal_writes, - &sysfs_trigger_btree_cache_shrink, - &sysfs_trigger_btree_key_cache_shrink, - &sysfs_trigger_btree_updates, - &sysfs_trigger_freelist_wakeup, - &sysfs_trigger_recalc_capacity, - &sysfs_trigger_delete_dead_snapshots, - &sysfs_trigger_emergency_read_only, - - &sysfs_gc_gens_pos, - - &sysfs_copy_gc_wait, - - sysfs_pd_controller_files(rebalance), - - &sysfs_moving_ctxts, - - &sysfs_internal_uuid, - - &sysfs_disk_groups, - &sysfs_alloc_debug, - &sysfs_usage_base, - NULL -}; - -/* options */ - -static ssize_t sysfs_opt_show(struct bch_fs *c, - struct bch_dev *ca, - enum bch_opt_id id, - struct printbuf *out) -{ - const struct bch_option *opt = bch2_opt_table + id; - u64 v; - - if (opt->flags & OPT_FS) { - v = bch2_opt_get_by_id(&c->opts, id); - } else if ((opt->flags & OPT_DEVICE) && opt->get_member) { - v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx); - } else { - return -EINVAL; - } - - bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); - prt_char(out, '\n'); - return 0; -} - -static ssize_t sysfs_opt_store(struct bch_fs *c, - struct bch_dev *ca, - enum bch_opt_id id, - const char *buf, size_t size) -{ - const struct bch_option *opt = bch2_opt_table + id; - int ret = 0; - - /* - * We don't need to take c->writes for correctness, but it eliminates an - * unsightly error message in the dmesg log when we're RO: - */ - if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))) - return -EROFS; - - char *tmp = kstrdup(buf, GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto err; - } - - u64 v; - ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: - bch2_opt_hook_pre_set(c, ca, id, v); - kfree(tmp); - - if (ret < 0) - goto err; - - bool is_sb = opt->get_sb || opt->get_member; - bool changed = false; - - if (is_sb) { - changed = bch2_opt_set_sb(c, ca, opt, v); - } else if (!ca) { - changed = bch2_opt_get_by_id(&c->opts, id) != v; - } else { - /* device options that aren't superblock options aren't - * supported */ - BUG(); - } - - if (!ca) - bch2_opt_set_by_id(&c->opts, id, v); - - if (changed) - bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); - - ret = size; -err: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); - return ret; -} - -SHOW(bch2_fs_opts_dir) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - int id = 
bch2_opt_lookup(attr->name); - if (id < 0) - return 0; - - return sysfs_opt_show(c, NULL, id, out); -} - -STORE(bch2_fs_opts_dir) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - int id = bch2_opt_lookup(attr->name); - if (id < 0) - return 0; - - return sysfs_opt_store(c, NULL, id, buf, size); -} -SYSFS_OPS(bch2_fs_opts_dir); - -struct attribute *bch2_fs_opts_dir_files[] = { NULL }; - -int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) -{ - for (const struct bch_option *i = bch2_opt_table; - i < bch2_opt_table + bch2_opts_nr; - i++) { - if (i->flags & OPT_HIDDEN) - continue; - if (!(i->flags & type)) - continue; - - int ret = sysfs_create_file(kobj, &i->attr); - if (ret) - return ret; - } - - return 0; -} - -/* time stats */ - -SHOW(bch2_fs_time_stats) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - -#define x(name) \ - if (attr == &sysfs_time_stat_##name) \ - bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); - BCH_TIME_STATS() -#undef x - - return 0; -} - -STORE(bch2_fs_time_stats) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - -#define x(name) \ - if (attr == &sysfs_time_stat_##name) \ - bch2_time_stats_reset(&c->times[BCH_TIME_##name]); - BCH_TIME_STATS() -#undef x - return size; -} -SYSFS_OPS(bch2_fs_time_stats); - -struct attribute *bch2_fs_time_stats_files[] = { -#define x(name) \ - &sysfs_time_stat_##name, - BCH_TIME_STATS() -#undef x - NULL -}; - -static const char * const bch2_rw[] = { - "read", - "write", - NULL -}; - -static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) -{ - int rw, i; - - for (rw = 0; rw < 2; rw++) { - prt_printf(out, "%s:\n", bch2_rw[rw]); - - for (i = 1; i < BCH_DATA_NR; i++) - prt_printf(out, "%-12s:%12llu\n", - bch2_data_type_str(i), - percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); - } -} - -SHOW(bch2_dev) -{ - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); - struct bch_fs *c = ca->fs; - - sysfs_printf(uuid, "%pU\n", ca->uuid.b); - - sysfs_print(first_bucket, ca->mi.first_bucket); - sysfs_print(nbuckets, ca->mi.nbuckets); - - if (attr == &sysfs_label) { - if (ca->mi.group) - bch2_disk_path_to_text(out, c, ca->mi.group - 1); - prt_char(out, '\n'); - } - - if (attr == &sysfs_has_data) { - prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca)); - prt_char(out, '\n'); - } - - if (attr == &sysfs_io_done) - dev_io_done_to_text(out, ca); - - if (attr == &sysfs_io_errors) - bch2_dev_io_errors_to_text(out, ca); - - sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); - sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); - - if (attr == &sysfs_io_latency_stats_read) - bch2_time_stats_to_text(out, &ca->io_latency[READ].stats); - - if (attr == &sysfs_io_latency_stats_write) - bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); - - sysfs_printf(congested, "%u%%", - clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) - * 100 / CONGESTED_MAX); - - if (attr == &sysfs_alloc_debug) - bch2_dev_alloc_debug_to_text(out, ca); - - if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c, ca); - - int opt_id = bch2_opt_lookup(attr->name); - if (opt_id >= 0) - return sysfs_opt_show(c, ca, opt_id, out); - - if (attr == &sysfs_read_refs) - enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs); - - if (attr == &sysfs_write_refs) - enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs); - - return 0; -} - -STORE(bch2_dev) -{ - struct bch_dev *ca = 
container_of(kobj, struct bch_dev, kobj); - struct bch_fs *c = ca->fs; - - if (attr == &sysfs_label) { - char *tmp; - int ret; - - tmp = kstrdup(buf, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - - ret = bch2_dev_group_set(c, ca, strim(tmp)); - kfree(tmp); - if (ret) - return ret; - } - - if (attr == &sysfs_io_errors_reset) - bch2_dev_errors_reset(ca); - - int opt_id = bch2_opt_lookup(attr->name); - if (opt_id >= 0) - return sysfs_opt_store(c, ca, opt_id, buf, size); - - return size; -} -SYSFS_OPS(bch2_dev); - -struct attribute *bch2_dev_files[] = { - &sysfs_uuid, - &sysfs_first_bucket, - &sysfs_nbuckets, - - /* settings: */ - &sysfs_label, - - &sysfs_has_data, - &sysfs_io_done, - &sysfs_io_errors, - &sysfs_io_errors_reset, - - &sysfs_io_latency_read, - &sysfs_io_latency_write, - &sysfs_io_latency_stats_read, - &sysfs_io_latency_stats_write, - &sysfs_congested, - - /* debug: */ - &sysfs_alloc_debug, - &sysfs_open_buckets, - - &sysfs_read_refs, - &sysfs_write_refs, - NULL -}; - -#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h deleted file mode 100644 index 303e0433c702c6..00000000000000 --- a/fs/bcachefs/sysfs.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SYSFS_H_ -#define _BCACHEFS_SYSFS_H_ - -#include - -#ifndef NO_BCACHEFS_SYSFS - -struct attribute; -struct sysfs_ops; - -extern struct attribute *bch2_fs_files[]; -extern struct attribute *bch2_fs_counters_files[]; -extern struct attribute *bch2_fs_internal_files[]; -extern struct attribute *bch2_fs_opts_dir_files[]; -extern struct attribute *bch2_fs_time_stats_files[]; -extern struct attribute *bch2_dev_files[]; - -extern const struct sysfs_ops bch2_fs_sysfs_ops; -extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; -extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; -extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -extern const struct sysfs_ops bch2_dev_sysfs_ops; - -int bch2_opts_create_sysfs_files(struct kobject *, unsigned); - -#else - -static struct attribute *bch2_fs_files[] = {}; -static struct attribute *bch2_fs_counters_files[] = {}; -static struct attribute *bch2_fs_internal_files[] = {}; -static struct attribute *bch2_fs_opts_dir_files[] = {}; -static struct attribute *bch2_fs_time_stats_files[] = {}; -static struct attribute *bch2_dev_files[] = {}; - -static const struct sysfs_ops bch2_fs_sysfs_ops; -static const struct sysfs_ops bch2_fs_counters_sysfs_ops; -static const struct sysfs_ops bch2_fs_internal_sysfs_ops; -static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -static const struct sysfs_ops bch2_dev_sysfs_ops; - -static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) -{ return 0; } - -#endif /* NO_BCACHEFS_SYSFS */ - -#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c deleted file mode 100644 index 782a05fe7656b5..00000000000000 --- a/fs/bcachefs/tests.c +++ /dev/null @@ -1,891 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_BCACHEFS_TESTS - -#include "bcachefs.h" -#include "btree_update.h" -#include "journal_reclaim.h" -#include "snapshot.h" -#include "tests.h" - -#include "linux/kthread.h" -#include "linux/random.h" - -static void delete_test_keys(struct bch_fs *c) -{ - int ret; - - ret = bch2_btree_delete_range(c, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), - POS(0, U64_MAX), - 0, NULL); - BUG_ON(ret); - - ret = 
bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - POS(0, U64_MAX), - 0, NULL); - BUG_ON(ret); -} - -/* unit tests */ - -static int test_delete(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k.p.snapshot = U32_MAX; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_intent); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &k.k_i, 0)); - bch_err_msg(c, ret, "update error"); - if (ret) - goto err; - - pr_info("deleting once"); - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error (first)"); - if (ret) - goto err; - - pr_info("deleting twice"); - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error (second)"); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int test_delete_written(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k.p.snapshot = U32_MAX; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_intent); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &k.k_i, 0)); - bch_err_msg(c, ret, "update error"); - if (ret) - goto err; - - bch2_trans_unlock(trans); - bch2_journal_flush_all_pins(&c->journal); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error"); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int test_iterate(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test keys"); - - for (i = 0; i < nr; i++) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i; - ck.k.p.snapshot = U32_MAX; - - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i++); - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr); - - pr_info("iterating backwards"); - - ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, - SPOS(0, U64_MAX, U32_MAX), 0, k, ({ - BUG_ON(k.k->p.offset != --i); - 0; - }))); - bch_err_msg(c, ret, "error iterating backwards"); - if (ret) - return ret; - - BUG_ON(i); - return 0; -} - -static int test_iterate_extents(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test extents"); - - for (i = 0; i < nr; i += 8) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i + 8; - ck.k.p.snapshot = U32_MAX; - ck.k.size = 8; - - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - 
pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i); - i = k.k->p.offset; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr); - - pr_info("iterating backwards"); - - ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, - SPOS(0, U64_MAX, U32_MAX), 0, k, ({ - BUG_ON(k.k->p.offset != i); - i = bkey_start_offset(k.k); - 0; - }))); - bch_err_msg(c, ret, "error iterating backwards"); - if (ret) - return ret; - - BUG_ON(i); - return 0; -} - -static int test_iterate_slots(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test keys"); - - for (i = 0; i < nr; i++) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i * 2; - ck.k.p.snapshot = U32_MAX; - - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i); - i += 2; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr * 2); - - pr_info("iterating forwards by slots"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_slots, k, ({ - if (i >= nr * 2) - break; - - BUG_ON(k.k->p.offset != i); - BUG_ON(bkey_deleted(k.k) != (i & 1)); - - i++; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards by slots"); - return ret; -} - -static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test keys"); - - for (i = 0; i < nr; i += 16) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i + 16; - ck.k.p.snapshot = U32_MAX; - ck.k.size = 8; - - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i + 8); - BUG_ON(k.k->size != 8); - i += 16; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr); - - pr_info("iterating forwards by slots"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_slots, k, ({ - if (i == nr) - break; - BUG_ON(bkey_deleted(k.k) != !(i % 16)); - - BUG_ON(bkey_start_offset(k.k) != i); - BUG_ON(k.k->size != 8); - i = k.k->p.offset; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards by slots"); - return ret; -} - -/* - * XXX: we really want to make sure we've got a btree with depth > 0 for these - * tests - */ -static int test_peek_end(struct bch_fs *c, u64 nr) -{ - delete_test_keys(c); - - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, 
&iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return 0; -} - -static int test_peek_end_extents(struct bch_fs *c, u64 nr) -{ - delete_test_keys(c); - - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return 0; -} - -/* extent unit tests */ - -static u64 test_version; - -static int insert_test_extent(struct bch_fs *c, - u64 start, u64 end) -{ - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k_i.k.p.offset = end; - k.k_i.k.p.snapshot = U32_MAX; - k.k_i.k.size = end - start; - k.k_i.k.bversion.lo = test_version++; - - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __test_extent_overwrite(struct bch_fs *c, - u64 e1_start, u64 e1_end, - u64 e2_start, u64 e2_end) -{ - int ret; - - ret = insert_test_extent(c, e1_start, e1_end) ?: - insert_test_extent(c, e2_start, e2_end); - - delete_test_keys(c); - return ret; -} - -static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 0, 64, 0, 32) ?: - __test_extent_overwrite(c, 8, 64, 0, 32); -} - -static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 0, 64, 32, 64) ?: - __test_extent_overwrite(c, 0, 64, 32, 72); -} - -static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 0, 64, 32, 40); -} - -static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 32, 64, 0, 64) ?: - __test_extent_overwrite(c, 32, 64, 0, 128) ?: - __test_extent_overwrite(c, 32, 64, 32, 64) ?: - __test_extent_overwrite(c, 32, 64, 32, 128); -} - -static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) -{ - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k_i.k.p.inode = inum; - k.k_i.k.p.offset = start + len; - k.k_i.k.p.snapshot = snapid; - k.k_i.k.size = len; - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, - BTREE_UPDATE_internal_snapshot_node)); - bch_err_fn(c, ret); - return ret; -} - -static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) -{ - return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */ - insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?: - insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?: - insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */ - insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?: - insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?: - insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX); -} - -/* snapshot unit tests */ - -/* Test skipping over keys in unrelated snapshots: */ -static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) -{ - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - 
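-	/* a key written in snapid_hi must not be visible when iterating from the sibling snapshot snapid_lo */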
struct bkey_i_cookie cookie; - int ret; - - bkey_cookie_init(&cookie.k_i); - cookie.k.p.snapshot = snapid_hi; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); - if (ret) - return ret; - - trans = bch2_trans_get(c); - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, snapid_lo), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - - BUG_ON(k.k->p.snapshot != U32_MAX); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int test_snapshots(struct bch_fs *c, u64 nr) -{ - struct bkey_i_cookie cookie; - u32 snapids[2]; - u32 snapid_subvols[2] = { 1, 1 }; - int ret; - - bkey_cookie_init(&cookie.k_i); - cookie.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); - if (ret) - return ret; - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_snapshot_node_create(trans, U32_MAX, - snapids, - snapid_subvols, - 2)); - if (ret) - return ret; - - if (snapids[0] > snapids[1]) - swap(snapids[0], snapids[1]); - - ret = test_snapshot_filter(c, snapids[0], snapids[1]); - bch_err_msg(c, ret, "from test_snapshot_filter"); - return ret; -} - -/* perf tests */ - -static u64 test_rand(void) -{ - u64 v; - - get_random_bytes(&v, sizeof(v)); - return v; -} - -static int rand_insert(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bkey_i_cookie k; - int ret = 0; - u64 i; - - for (i = 0; i < nr; i++) { - bkey_cookie_init(&k.k_i); - k.k.p.offset = test_rand(); - k.k.p.snapshot = U32_MAX; - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0)); - if (ret) - break; - } - - bch2_trans_put(trans); - return ret; -} - -static int rand_insert_multi(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bkey_i_cookie k[8]; - int ret = 0; - unsigned j; - u64 i; - - for (i = 0; i < nr; i += ARRAY_SIZE(k)) { - for (j = 0; j < ARRAY_SIZE(k); j++) { - bkey_cookie_init(&k[j].k_i); - k[j].k.p.offset = test_rand(); - k[j].k.p.snapshot = U32_MAX; - } - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0)); - if (ret) - break; - } - - bch2_trans_put(trans); - return ret; -} - -static int rand_lookup(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - u64 i; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX)); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter))); - ret = bkey_err(k); - if (ret) - break; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int rand_mixed_trans(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i_cookie *cookie, - u64 i, u64 pos) -{ - struct bkey_s_c k; - int ret; - - bch2_btree_iter_set_pos(trans, iter, SPOS(0, 
pos, U32_MAX)); - - k = bch2_btree_iter_peek(trans, iter); - ret = bkey_err(k); - bch_err_msg(trans->c, ret, "lookup error"); - if (ret) - return ret; - - if (!(i & 3) && k.k) { - bkey_cookie_init(&cookie->k_i); - cookie->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); - } - - return ret; -} - -static int rand_mixed(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i_cookie cookie; - int ret = 0; - u64 i, rand; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - for (i = 0; i < nr; i++) { - rand = test_rand(); - ret = commit_do(trans, NULL, NULL, 0, - rand_mixed_trans(trans, &iter, &cookie, i, rand)); - if (ret) - break; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int __do_delete(struct btree_trans *trans, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, - BTREE_ITER_intent); - k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k) - goto err; - - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int rand_delete(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - u64 i; - - for (i = 0; i < nr; i++) { - struct bpos pos = SPOS(0, test_rand(), U32_MAX); - - ret = commit_do(trans, NULL, NULL, 0, - __do_delete(trans, pos)); - if (ret) - break; - } - - bch2_trans_put(trans); - return ret; -} - -static int seq_insert(struct bch_fs *c, u64 nr) -{ - struct bkey_i_cookie insert; - - bkey_cookie_init(&insert.k_i); - - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - BTREE_ITER_slots|BTREE_ITER_intent, k, - NULL, NULL, 0, ({ - if (iter.pos.offset >= nr) - break; - insert.k.p = iter.pos; - bch2_trans_update(trans, &iter, &insert.k_i, 0); - }))); -} - -static int seq_lookup(struct bch_fs *c, u64 nr) -{ - return bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, - 0)); -} - -static int seq_overwrite(struct bch_fs *c, u64 nr) -{ - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - BTREE_ITER_intent, k, - NULL, NULL, 0, ({ - struct bkey_i_cookie u; - - bkey_reassemble(&u.k_i, k); - bch2_trans_update(trans, &iter, &u.k_i, 0); - }))); -} - -static int seq_delete(struct bch_fs *c, u64 nr) -{ - return bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - POS(0, U64_MAX), - 0, NULL); -} - -typedef int (*perf_test_fn)(struct bch_fs *, u64); - -struct test_job { - struct bch_fs *c; - u64 nr; - unsigned nr_threads; - perf_test_fn fn; - - atomic_t ready; - wait_queue_head_t ready_wait; - - atomic_t done; - struct completion done_completion; - - u64 start; - u64 finish; - int ret; -}; - -static int btree_perf_test_thread(void *data) -{ - struct test_job *j = data; - int ret; - - if (atomic_dec_and_test(&j->ready)) { - wake_up(&j->ready_wait); - j->start = sched_clock(); - } else { - wait_event(j->ready_wait, !atomic_read(&j->ready)); - } - - ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); - if (ret) { - bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); - j->ret = ret; - } - - if (atomic_dec_and_test(&j->done)) { - j->finish = sched_clock(); - 
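-		/* last thread out: wake the main thread waiting in bch2_btree_perf_test() */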
complete(&j->done_completion); - } - - return 0; -} - -int bch2_btree_perf_test(struct bch_fs *c, const char *testname, - u64 nr, unsigned nr_threads) -{ - struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; - char name_buf[20]; - struct printbuf nr_buf = PRINTBUF; - struct printbuf per_sec_buf = PRINTBUF; - unsigned i; - u64 time; - - if (nr == 0 || nr_threads == 0) { - pr_err("nr of iterations or threads is not allowed to be 0"); - return -EINVAL; - } - - atomic_set(&j.ready, nr_threads); - init_waitqueue_head(&j.ready_wait); - - atomic_set(&j.done, nr_threads); - init_completion(&j.done_completion); - -#define perf_test(_test) \ - if (!strcmp(testname, #_test)) j.fn = _test - - perf_test(rand_insert); - perf_test(rand_insert_multi); - perf_test(rand_lookup); - perf_test(rand_mixed); - perf_test(rand_delete); - - perf_test(seq_insert); - perf_test(seq_lookup); - perf_test(seq_overwrite); - perf_test(seq_delete); - - /* a unit test, not a perf test: */ - perf_test(test_delete); - perf_test(test_delete_written); - perf_test(test_iterate); - perf_test(test_iterate_extents); - perf_test(test_iterate_slots); - perf_test(test_iterate_slots_extents); - perf_test(test_peek_end); - perf_test(test_peek_end_extents); - - perf_test(test_extent_overwrite_front); - perf_test(test_extent_overwrite_back); - perf_test(test_extent_overwrite_middle); - perf_test(test_extent_overwrite_all); - perf_test(test_extent_create_overlapping); - - perf_test(test_snapshots); - - if (!j.fn) { - pr_err("unknown test %s", testname); - return -EINVAL; - } - - //pr_info("running test %s:", testname); - - if (nr_threads == 1) - btree_perf_test_thread(&j); - else - for (i = 0; i < nr_threads; i++) - kthread_run(btree_perf_test_thread, &j, - "bcachefs perf test[%u]", i); - - while (wait_for_completion_interruptible(&j.done_completion)) - ; - - time = j.finish - j.start; - - scnprintf(name_buf, sizeof(name_buf), "%s:", testname); - prt_human_readable_u64(&nr_buf, nr); - prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); - printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", - name_buf, nr_buf.buf, nr_threads, - div_u64(time, NSEC_PER_SEC), - div_u64(time * nr_threads, nr), - per_sec_buf.buf); - printbuf_exit(&per_sec_buf); - printbuf_exit(&nr_buf); - return j.ret; -} - -#endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h deleted file mode 100644 index c73b18aea7e01d..00000000000000 --- a/fs/bcachefs/tests.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_TEST_H -#define _BCACHEFS_TEST_H - -struct bch_fs; - -#ifdef CONFIG_BCACHEFS_TESTS - -int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); - -#else - -#endif /* CONFIG_BCACHEFS_TESTS */ - -#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c deleted file mode 100644 index 314a24d15d4e7c..00000000000000 --- a/fs/bcachefs/thread_with_file.c +++ /dev/null @@ -1,494 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "thread_with_file.h" - -#include -#include -#include -#include -#include -#include - -void bch2_thread_with_file_exit(struct thread_with_file *thr) -{ - if (thr->task) { - kthread_stop(thr->task); - put_task_struct(thr->task); - } -} - -int bch2_run_thread_with_file(struct thread_with_file *thr, - const struct file_operations *fops, - int (*fn)(void *)) -{ - struct file *file = NULL; - 
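-	/* the kthread is handed to userspace as an anonymous-inode fd; per thread_with_file.h, closing that fd stops the thread */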
int ret, fd = -1; - unsigned fd_flags = O_CLOEXEC; - - if (fops->read && fops->write) - fd_flags |= O_RDWR; - else if (fops->read) - fd_flags |= O_RDONLY; - else if (fops->write) - fd_flags |= O_WRONLY; - - char name[TASK_COMM_LEN]; - get_task_comm(name, current); - - thr->ret = 0; - thr->task = kthread_create(fn, thr, "%s", name); - ret = PTR_ERR_OR_ZERO(thr->task); - if (ret) - return ret; - - ret = get_unused_fd_flags(fd_flags); - if (ret < 0) - goto err; - fd = ret; - - file = anon_inode_getfile(name, fops, thr, fd_flags); - ret = PTR_ERR_OR_ZERO(file); - if (ret) - goto err; - - get_task_struct(thr->task); - wake_up_process(thr->task); - fd_install(fd, file); - return fd; -err: - if (fd >= 0) - put_unused_fd(fd); - if (thr->task) - kthread_stop(thr->task); - return ret; -} - -/* stdio_redirect */ - -static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen) -{ - return stdio->input.buf.nr > seen || stdio->done; -} - -static bool stdio_redirect_has_input(struct stdio_redirect *stdio) -{ - return stdio_redirect_has_more_input(stdio, 0); -} - -static bool stdio_redirect_has_output(struct stdio_redirect *stdio) -{ - return stdio->output.buf.nr || stdio->done; -} - -#define STDIO_REDIRECT_BUFSIZE 4096 - -static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) -{ - return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; -} - -static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) -{ - return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; -} - -static void stdio_buf_init(struct stdio_buf *buf) -{ - spin_lock_init(&buf->lock); - init_waitqueue_head(&buf->wait); - darray_init(&buf->buf); -} - -/* thread_with_stdio */ - -static void thread_with_stdio_done(struct thread_with_stdio *thr) -{ - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input.wait); - wake_up(&thr->stdio.output.wait); -} - -static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, - size_t len, loff_t *ppos) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - struct stdio_buf *buf = &thr->stdio.output; - size_t copied = 0, b; - int ret = 0; - - if (!(file->f_flags & O_NONBLOCK)) { - ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio)); - if (ret) - return ret; - } else if (!stdio_redirect_has_output(&thr->stdio)) - return -EAGAIN; - - while (len && buf->buf.nr) { - if (fault_in_writeable(ubuf, len) == len) { - ret = -EFAULT; - break; - } - - spin_lock_irq(&buf->lock); - b = min_t(size_t, len, buf->buf.nr); - - if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) { - ubuf += b; - len -= b; - copied += b; - buf->buf.nr -= b; - memmove(buf->buf.data, - buf->buf.data + b, - buf->buf.nr); - } - spin_unlock_irq(&buf->lock); - } - - return copied ?: ret; -} - -static int thread_with_stdio_release(struct inode *inode, struct file *file) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - thread_with_stdio_done(thr); - bch2_thread_with_file_exit(&thr->thr); - darray_exit(&thr->stdio.input.buf); - darray_exit(&thr->stdio.output.buf); - thr->ops->exit(thr); - return 0; -} - -static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, - size_t len, loff_t *ppos) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - struct stdio_buf *buf = &thr->stdio.input; - size_t copied = 0; - ssize_t ret = 0; - 
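-	/* copy userspace data into the input buffer, waking the kthread as bytes arrive */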
- while (len) { - if (thr->thr.done) { - ret = -EPIPE; - break; - } - - size_t b = len - fault_in_readable(ubuf, len); - if (!b) { - ret = -EFAULT; - break; - } - - spin_lock(&buf->lock); - size_t makeroom = b; - if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr)) - makeroom = min_t(ssize_t, makeroom, - max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr, - 0)); - darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT); - - b = min(len, darray_room(buf->buf)); - - if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { - buf->buf.nr += b; - ubuf += b; - len -= b; - copied += b; - } - spin_unlock(&buf->lock); - - if (b) { - wake_up(&buf->wait); - } else { - if ((file->f_flags & O_NONBLOCK)) { - ret = -EAGAIN; - break; - } - - ret = wait_event_interruptible(buf->wait, - stdio_redirect_has_input_space(&thr->stdio)); - if (ret) - break; - } - } - - return copied ?: ret; -} - -static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - poll_wait(file, &thr->stdio.output.wait, wait); - poll_wait(file, &thr->stdio.input.wait, wait); - - __poll_t mask = 0; - - if (stdio_redirect_has_output(&thr->stdio)) - mask |= EPOLLIN; - if (stdio_redirect_has_input_space(&thr->stdio)) - mask |= EPOLLOUT; - if (thr->thr.done) - mask |= EPOLLHUP|EPOLLERR; - return mask; -} - -static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - poll_wait(file, &thr->stdio.output.wait, wait); - - __poll_t mask = 0; - - if (stdio_redirect_has_output(&thr->stdio)) - mask |= EPOLLIN; - if (thr->thr.done) - mask |= EPOLLHUP|EPOLLERR; - return mask; -} - -static int thread_with_stdio_flush(struct file *file, fl_owner_t id) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - return thr->thr.ret; -} - -static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - if (thr->ops->unlocked_ioctl) - return thr->ops->unlocked_ioctl(thr, cmd, p); - return -ENOTTY; -} - -static const struct file_operations thread_with_stdio_fops = { - .read = thread_with_stdio_read, - .write = thread_with_stdio_write, - .poll = thread_with_stdio_poll, - .flush = thread_with_stdio_flush, - .release = thread_with_stdio_release, - .unlocked_ioctl = thread_with_stdio_ioctl, -}; - -static const struct file_operations thread_with_stdout_fops = { - .read = thread_with_stdio_read, - .poll = thread_with_stdout_poll, - .flush = thread_with_stdio_flush, - .release = thread_with_stdio_release, - .unlocked_ioctl = thread_with_stdio_ioctl, -}; - -static int thread_with_stdio_fn(void *arg) -{ - struct thread_with_stdio *thr = arg; - - thr->thr.ret = thr->ops->fn(thr); - - thread_with_stdio_done(thr); - return 0; -} - -void bch2_thread_with_stdio_init(struct thread_with_stdio *thr, - const struct thread_with_stdio_ops *ops) -{ - stdio_buf_init(&thr->stdio.input); - stdio_buf_init(&thr->stdio.output); - thr->ops = ops; -} - -int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr) -{ - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); -} - -int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, - const struct 
thread_with_stdio_ops *ops) -{ - bch2_thread_with_stdio_init(thr, ops); - - return __bch2_run_thread_with_stdio(thr); -} - -int bch2_run_thread_with_stdout(struct thread_with_stdio *thr, - const struct thread_with_stdio_ops *ops) -{ - stdio_buf_init(&thr->stdio.input); - stdio_buf_init(&thr->stdio.output); - thr->ops = ops; - - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); -} -EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout); - -int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) -{ - struct stdio_buf *buf = &stdio->input; - - /* - * we're waiting on user input (or for the file descriptor to be - * closed), don't want a hung task warning: - */ - do { - wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), - sysctl_hung_task_timeout_secs * HZ / 2); - } while (!stdio_redirect_has_input(stdio)); - - if (stdio->done) - return -1; - - spin_lock(&buf->lock); - int ret = min(len, buf->buf.nr); - buf->buf.nr -= ret; - memcpy(ubuf, buf->buf.data, ret); - memmove(buf->buf.data, - buf->buf.data + ret, - buf->buf.nr); - spin_unlock(&buf->lock); - - wake_up(&buf->wait); - return ret; -} - -int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio, - darray_char *line, - unsigned long timeout) -{ - unsigned long until = jiffies + timeout, t; - struct stdio_buf *buf = &stdio->input; - size_t seen = 0; -again: - t = timeout != MAX_SCHEDULE_TIMEOUT - ? max_t(long, until - jiffies, 0) - : timeout; - - t = min(t, sysctl_hung_task_timeout_secs * HZ / 2); - - wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t); - - if (stdio->done) - return -1; - - spin_lock(&buf->lock); - seen = buf->buf.nr; - char *n = memchr(buf->buf.data, '\n', seen); - - if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) { - spin_unlock(&buf->lock); - return -ETIME; - } - - if (!n) { - buf->waiting_for_line = true; - spin_unlock(&buf->lock); - goto again; - } - - size_t b = n + 1 - buf->buf.data; - if (b > line->size) { - spin_unlock(&buf->lock); - int ret = darray_resize(line, b); - if (ret) - return ret; - seen = 0; - goto again; - } - - buf->buf.nr -= b; - memcpy(line->data, buf->buf.data, b); - memmove(buf->buf.data, - buf->buf.data + b, - buf->buf.nr); - line->nr = b; - - buf->waiting_for_line = false; - spin_unlock(&buf->lock); - - wake_up(&buf->wait); - return 0; -} - -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line) -{ - return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT); -} - -__printf(3, 0) -static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) -{ - ssize_t ret; - - do { - va_list args2; - size_t len; - - va_copy(args2, args); - len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); - va_end(args2); - - if (len + 1 <= darray_room(*out)) { - out->nr += len; - return len; - } - - ret = darray_make_room_gfp(out, len + 1, gfp); - } while (ret == 0); - - return ret; -} - -ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, - const char *fmt, va_list args) -{ - struct stdio_buf *buf = &stdio->output; - unsigned long flags; - ssize_t ret; -again: - if (stdio->done) - return -EPIPE; - - spin_lock_irqsave(&buf->lock, flags); - ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); - spin_unlock_irqrestore(&buf->lock, flags); - - if (ret < 0) { - if (nonblocking) - return -EAGAIN; - - ret = wait_event_interruptible(buf->wait, - 
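-			/* GFP_NOWAIT growth of the output buffer failed: wait for the reader to drain it, then retry */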
stdio_redirect_has_output_space(stdio)); - if (ret) - return ret; - goto again; - } - - wake_up(&buf->wait); - return ret; -} - -ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, - const char *fmt, ...) -{ - va_list args; - ssize_t ret; - - va_start(args, fmt); - ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); - va_end(args); - - return ret; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h deleted file mode 100644 index 72497b9219113b..00000000000000 --- a/fs/bcachefs/thread_with_file.h +++ /dev/null @@ -1,81 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_THREAD_WITH_FILE_H -#define _BCACHEFS_THREAD_WITH_FILE_H - -#include "thread_with_file_types.h" - -/* - * Thread with file: Run a kthread and connect it to a file descriptor, so that - * it can be interacted with via fd read/write methods and closing the file - * descriptor stops the kthread. - * - * We have two different APIs: - * - * thread_with_file, the low level version. - * You get to define the full file_operations, including your release function, - * which means that you must call bch2_thread_with_file_exit() from your - * .release method - * - * thread_with_stdio, the higher level version - * This implements full piping of input and output, including .poll. - * - * Notes on behaviour: - * - kthread shutdown behaves like writing or reading from a pipe that has been - * closed - * - Input and output buffers are 4096 bytes, although buffers may in some - * situations slightly exceed that limit so as to avoid chopping off a - * message in the middle in nonblocking mode. - * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations - - * should be fine but might change in future revisions. - * - Output buffer may grow past 4096 bytes to deal with messages that are - * bigger than 4096 bytes - * - Writing may be done blocking or nonblocking; in nonblocking mode, we only - * drop entire messages. 
- * - * To write, use stdio_redirect_printf() - * To read, use stdio_redirect_read() or stdio_redirect_readline() - */ - -struct task_struct; - -struct thread_with_file { - struct task_struct *task; - int ret; - bool done; -}; - -void bch2_thread_with_file_exit(struct thread_with_file *); -int bch2_run_thread_with_file(struct thread_with_file *, - const struct file_operations *, - int (*fn)(void *)); - -struct thread_with_stdio; - -struct thread_with_stdio_ops { - void (*exit)(struct thread_with_stdio *); - int (*fn)(struct thread_with_stdio *); - long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); -}; - -struct thread_with_stdio { - struct thread_with_file thr; - struct stdio_redirect stdio; - const struct thread_with_stdio_ops *ops; -}; - -void bch2_thread_with_stdio_init(struct thread_with_stdio *, - const struct thread_with_stdio_ops *); -int __bch2_run_thread_with_stdio(struct thread_with_stdio *); -int bch2_run_thread_with_stdio(struct thread_with_stdio *, - const struct thread_with_stdio_ops *); -int bch2_run_thread_with_stdout(struct thread_with_stdio *, - const struct thread_with_stdio_ops *); -int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); - -int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long); -int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *); - -__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); -__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); - -#endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h deleted file mode 100644 index f4d484d44f6334..00000000000000 --- a/fs/bcachefs/thread_with_file_types.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H -#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H - -#include "darray.h" - -struct stdio_buf { - spinlock_t lock; - wait_queue_head_t wait; - darray_char buf; - bool waiting_for_line; -}; - -struct stdio_redirect { - struct stdio_buf input; - struct stdio_buf output; - bool done; -}; - -#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */ diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c deleted file mode 100644 index 2c34fe4be91202..00000000000000 --- a/fs/bcachefs/time_stats.c +++ /dev/null @@ -1,191 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include - -#include "eytzinger.h" -#include "time_stats.h" - -/* disable automatic switching to percpu mode */ -#define TIME_STATS_NONPCPU ((unsigned long) 1) - -static const struct time_unit time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "d", (u64) NSEC_PER_SEC * 3600 * 24}, - { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, - { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */ - { "eon", U64_MAX }, -}; - -const struct time_unit *bch2_pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - -static void quantiles_update(struct quantiles *q, u64 v) -{ - unsigned i = 0; - - while (i < ARRAY_SIZE(q->entries)) { - struct quantile_entry *e = q->entries + i; - - if 
(unlikely(!e->step)) { - e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - -static inline void time_stats_update_one(struct bch2_time_stats *stats, - u64 start, u64 end) -{ - u64 duration, freq; - bool initted = stats->last_event != 0; - - if (time_after64(end, start)) { - struct quantiles *quantiles = time_stats_to_quantiles(stats); - - duration = end - start; - mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, - duration, initted, TIME_STATS_MV_WEIGHT); - stats->max_duration = max(stats->max_duration, duration); - stats->min_duration = min(stats->min_duration, duration); - stats->total_duration += duration; - - if (quantiles) - quantiles_update(quantiles, duration); - } - - if (stats->last_event && time_after64(end, stats->last_event)) { - freq = end - stats->last_event; - mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, - freq, initted, TIME_STATS_MV_WEIGHT); - stats->max_freq = max(stats->max_freq, freq); - stats->min_freq = min(stats->min_freq, freq); - } - - stats->last_event = end; -} - -void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct time_stat_buffer *b) -{ - for (struct time_stat_buffer_entry *i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - time_stats_update_one(stats, i->start, i->end); - b->nr = 0; -} - -static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats, - struct time_stat_buffer *b) -{ - unsigned long flags; - - spin_lock_irqsave(&stats->lock, flags); - __bch2_time_stats_clear_buffer(stats, b); - spin_unlock_irqrestore(&stats->lock, flags); -} - -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) -{ - unsigned long flags; - - if ((unsigned long) stats->buffer <= TIME_STATS_NONPCPU) { - spin_lock_irqsave(&stats->lock, flags); - time_stats_update_one(stats, start, end); - - if (!stats->buffer && - mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && - stats->duration_stats.n > 1024) - stats->buffer = - alloc_percpu_gfp(struct time_stat_buffer, - GFP_ATOMIC); - spin_unlock_irqrestore(&stats->lock, flags); - } else { - struct time_stat_buffer *b; - - preempt_disable(); - b = this_cpu_ptr(stats->buffer); - - BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct time_stat_buffer_entry) { - .start = start, - .end = end - }; - - if (unlikely(b->nr == ARRAY_SIZE(b->entries))) - time_stats_clear_buffer(stats, b); - preempt_enable(); - } -} - -void bch2_time_stats_reset(struct bch2_time_stats *stats) -{ - spin_lock_irq(&stats->lock); - unsigned offset = offsetof(struct bch2_time_stats, min_duration); - memset((void *) stats + offset, 0, sizeof(*stats) - offset); - - if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) { - int cpu; - for_each_possible_cpu(cpu) - per_cpu_ptr(stats->buffer, cpu)->nr = 0; - } - spin_unlock_irq(&stats->lock); -} - -void bch2_time_stats_exit(struct bch2_time_stats *stats) -{ - if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) - free_percpu(stats->buffer); - stats->buffer = NULL; -} - -void 
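-/* note: min_duration/min_freq start at U64_MAX so the first recorded sample always becomes the minimum */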
bch2_time_stats_init(struct bch2_time_stats *stats) -{ - memset(stats, 0, sizeof(*stats)); - stats->min_duration = U64_MAX; - stats->min_freq = U64_MAX; - spin_lock_init(&stats->lock); -} - -void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats) -{ - bch2_time_stats_init(stats); - stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU; -} diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h deleted file mode 100644 index eddb0985bab4bc..00000000000000 --- a/fs/bcachefs/time_stats.h +++ /dev/null @@ -1,161 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * bch2_time_stats - collect statistics on events that have a duration, with nicely - * formatted textual output on demand - * - * - percpu buffering of event collection: cheap enough to shotgun - * everywhere without worrying about overhead - * - * tracks: - * - number of events - * - maximum event duration ever seen - * - sum of all event durations - * - average event duration, standard and weighted - * - standard deviation of event durations, standard and weighted - * and analagous statistics for the frequency of events - * - * We provide both mean and weighted mean (exponentially weighted), and standard - * deviation and weighted standard deviation, to give an efficient-to-compute - * view of current behaviour versus. average behaviour - "did this event source - * just become wonky, or is this typical?". - * - * Particularly useful for tracking down latency issues. - */ -#ifndef _BCACHEFS_TIME_STATS_H -#define _BCACHEFS_TIME_STATS_H - -#include -#include -#include - -#include "mean_and_variance.h" - -struct time_unit { - const char *name; - u64 nsecs; -}; - -/* - * given a nanosecond value, pick the preferred time units for printing: - */ -const struct time_unit *bch2_pick_time_units(u64 ns); - -/* - * quantiles - do not use: - * - * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't - * use in new code. - */ - -#define NR_QUANTILES 15 -#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) - -struct quantiles { - struct quantile_entry { - u64 m; - u64 step; - } entries[NR_QUANTILES]; -}; - -struct time_stat_buffer { - unsigned nr; - struct time_stat_buffer_entry { - u64 start; - u64 end; - } entries[31]; -}; - -struct bch2_time_stats { - spinlock_t lock; - bool have_quantiles; - struct time_stat_buffer __percpu *buffer; - /* all fields are in nanoseconds */ - u64 min_duration; - u64 max_duration; - u64 total_duration; - u64 max_freq; - u64 min_freq; - u64 last_event; - u64 last_event_start; - - struct mean_and_variance duration_stats; - struct mean_and_variance freq_stats; - -/* default weight for weighted mean and variance calculations */ -#define TIME_STATS_MV_WEIGHT 8 - - struct mean_and_variance_weighted duration_stats_weighted; - struct mean_and_variance_weighted freq_stats_weighted; -}; - -struct bch2_time_stats_quantiles { - struct bch2_time_stats stats; - struct quantiles quantiles; -}; - -static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats) -{ - return stats->have_quantiles - ? 
&container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles - : NULL; -} - -void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *); -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); - -/** - * time_stats_update - collect a new event being tracked - * - * @stats - bch2_time_stats to update - * @start - start time of event, recorded with local_clock() - * - * The end duration of the event will be the current time - */ -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) -{ - __bch2_time_stats_update(stats, start, local_clock()); -} - -/** - * track_event_change - track state change events - * - * @stats - bch2_time_stats to update - * @v - new state, true or false - * - * Use this when tracking time stats for state changes, i.e. resource X becoming - * blocked/unblocked. - */ -static inline bool track_event_change(struct bch2_time_stats *stats, bool v) -{ - if (v != !!stats->last_event_start) { - if (!v) { - bch2_time_stats_update(stats, stats->last_event_start); - stats->last_event_start = 0; - } else { - stats->last_event_start = local_clock() ?: 1; - return true; - } - } - - return false; -} - -void bch2_time_stats_reset(struct bch2_time_stats *); -void bch2_time_stats_exit(struct bch2_time_stats *); -void bch2_time_stats_init(struct bch2_time_stats *); -void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *); - -static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) -{ - bch2_time_stats_exit(&statq->stats); -} -static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq) -{ - bch2_time_stats_init(&statq->stats); - statq->stats.have_quantiles = true; - memset(&statq->quantiles, 0, sizeof(statq->quantiles)); -} - -#endif /* _BCACHEFS_TIME_STATS_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c deleted file mode 100644 index dfad1d06633ddb..00000000000000 --- a/fs/bcachefs/trace.c +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "alloc_types.h" -#include "buckets.h" -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_update_interior.h" -#include "keylist.h" -#include "move_types.h" -#include "opts.h" -#include "six.h" - -#include - -#define CREATE_TRACE_POINTS -#include "trace.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h deleted file mode 100644 index 9c5a9c551f03d7..00000000000000 --- a/fs/bcachefs/trace.h +++ /dev/null @@ -1,1883 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM bcachefs - -#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ) - -#include - -#define TRACE_BPOS_entries(name) \ - __field(u64, name##_inode ) \ - __field(u64, name##_offset ) \ - __field(u32, name##_snapshot ) - -#define TRACE_BPOS_assign(dst, src) \ - __entry->dst##_inode = (src).inode; \ - __entry->dst##_offset = (src).offset; \ - __entry->dst##_snapshot = (src).snapshot - -DECLARE_EVENT_CLASS(bpos, - TP_PROTO(const struct bpos *p), - TP_ARGS(p), - - TP_STRUCT__entry( - TRACE_BPOS_entries(p) - ), - - TP_fast_assign( - TRACE_BPOS_assign(p, *p); - ), - - TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot) -); - -DECLARE_EVENT_CLASS(fs_str, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __string(str, str ) - ), - - TP_fast_assign( - __entry->dev 
= c->dev; - __assign_str(str); - ), - - TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) -); - -DECLARE_EVENT_CLASS(trans_str, - TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), - TP_ARGS(trans, caller_ip, str), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __string(str, str ) - ), - - TP_fast_assign( - __entry->dev = trans->c->dev; - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __assign_str(str); - ), - - TP_printk("%d,%d %s %pS %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str)) -); - -DECLARE_EVENT_CLASS(trans_str_nocaller, - TP_PROTO(struct btree_trans *trans, const char *str), - TP_ARGS(trans, str), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, trans_fn, 32 ) - __string(str, str ) - ), - - TP_fast_assign( - __entry->dev = trans->c->dev; - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __assign_str(str); - ), - - TP_printk("%d,%d %s %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->trans_fn, __get_str(str)) -); - -DECLARE_EVENT_CLASS(btree_node_nofs, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u8, level ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->level = b->c.level; - __entry->btree_id = b->c.btree_id; - TRACE_BPOS_assign(pos, b->key.k.p); - ), - - TP_printk("%d,%d %u %s %llu:%llu:%u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->level, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) -); - -DECLARE_EVENT_CLASS(btree_node, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, trans_fn, 32 ) - __field(u8, level ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->dev = trans->c->dev; - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->level = b->c.level; - __entry->btree_id = b->c.btree_id; - TRACE_BPOS_assign(pos, b->key.k.p); - ), - - TP_printk("%d,%d %s %u %s %llu:%llu:%u", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn, - __entry->level, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) -); - -DECLARE_EVENT_CLASS(bch_fs, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c), - - TP_STRUCT__entry( - __field(dev_t, dev ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - ), - - TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) -); - -DECLARE_EVENT_CLASS(btree_trans, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, trans_fn, 32 ) - ), - - TP_fast_assign( - __entry->dev = trans->c->dev; - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - ), - - TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn) -); - -DECLARE_EVENT_CLASS(bio, - TP_PROTO(struct bio *bio), - TP_ARGS(bio), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(sector_t, sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) - ), - - TP_fast_assign( - __entry->dev = bio->bi_bdev ? 
bio_dev(bio) : 0; - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf); - ), - - TP_printk("%d,%d %s %llu + %u", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, __entry->nr_sector) -); - -/* errors */ - -TRACE_EVENT(error_throw, - TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip), - TP_ARGS(c, bch_err, ip), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(int, err ) - __array(char, err_str, 32 ) - __array(char, ip, 32 ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->err = bch_err; - strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str)); - snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); - ), - - TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ip, __entry->err_str) -); - -TRACE_EVENT(error_downcast, - TP_PROTO(int bch_err, int std_err, unsigned long ip), - TP_ARGS(bch_err, std_err, ip), - - TP_STRUCT__entry( - __array(char, bch_err, 32 ) - __array(char, std_err, 32 ) - __array(char, ip, 32 ) - ), - - TP_fast_assign( - strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); - strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); - snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); - ), - - TP_printk("%s ret %s -> %s %s", __entry->ip, - __entry->bch_err, __entry->std_err, __entry->ip) -); - -/* disk_accounting.c */ - -TRACE_EVENT(accounting_mem_insert, - TP_PROTO(struct bch_fs *c, const char *acc), - TP_ARGS(c, acc), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(unsigned, new_nr ) - __string(acc, acc ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->new_nr = c->accounting.k.nr; - __assign_str(acc); - ), - - TP_printk("%d,%d entries %u added %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->new_nr, - __get_str(acc)) -); - -/* fs.c: */ -TRACE_EVENT(bch2_sync_fs, - TP_PROTO(struct super_block *sb, int wait), - - TP_ARGS(sb, wait), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( int, wait ) - - ), - - TP_fast_assign( - __entry->dev = sb->s_dev; - __entry->wait = wait; - ), - - TP_printk("dev %d,%d wait %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->wait) -); - -/* fs-io.c: */ -TRACE_EVENT(bch2_fsync, - TP_PROTO(struct file *file, int datasync), - - TP_ARGS(file, datasync), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ino_t, parent ) - __field( int, datasync ) - ), - - TP_fast_assign( - struct dentry *dentry = file->f_path.dentry; - - __entry->dev = dentry->d_sb->s_dev; - __entry->ino = d_inode(dentry)->i_ino; - __entry->parent = d_inode(dentry->d_parent)->i_ino; - __entry->datasync = datasync; - ), - - TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned long) __entry->parent, __entry->datasync) -); - -/* super-io.c: */ -TRACE_EVENT(write_super, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(unsigned long, ip ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->ip = ip; - ), - - TP_printk("%d,%d for %pS", - MAJOR(__entry->dev), MINOR(__entry->dev), - (void *) __entry->ip) -); - -/* io.c: */ - -DEFINE_EVENT(bio, io_read_promote, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -TRACE_EVENT(io_read_nopromote, - TP_PROTO(struct bch_fs *c, int ret), - 
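-	/* likely fired when a read is not promoted to faster storage; ret is the reason, formatted via bch2_err_str() */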
TP_ARGS(c, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, ret, 32 ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); - ), - - TP_printk("%d,%d ret %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ret) -); - -DEFINE_EVENT(bio, io_read_bounce, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -DEFINE_EVENT(bio, io_read_split, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -DEFINE_EVENT(bio, io_read_retry, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -DEFINE_EVENT(bio, io_read_reuse_race, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -DEFINE_EVENT(bio, io_read_fail_and_poison, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -/* ec.c */ - -TRACE_EVENT(stripe_create, - TP_PROTO(struct bch_fs *c, u64 idx, int ret), - TP_ARGS(c, idx, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, idx ) - __field(int, ret ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->idx = idx; - __entry->ret = ret; - ), - - TP_printk("%d,%d idx %llu ret %i", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->idx, - __entry->ret) -); - -/* Journal */ - -DEFINE_EVENT(bch_fs, journal_full, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(fs_str, journal_entry_full, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, journal_entry_close, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(bio, journal_write, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -TRACE_EVENT(journal_reclaim_start, - TP_PROTO(struct bch_fs *c, bool direct, bool kicked, - u64 min_nr, u64 min_key_cache, - u64 btree_cache_dirty, u64 btree_cache_total, - u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, direct, kicked, min_nr, min_key_cache, - btree_cache_dirty, btree_cache_total, - btree_key_cache_dirty, btree_key_cache_total), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(bool, direct ) - __field(bool, kicked ) - __field(u64, min_nr ) - __field(u64, min_key_cache ) - __field(u64, btree_cache_dirty ) - __field(u64, btree_cache_total ) - __field(u64, btree_key_cache_dirty ) - __field(u64, btree_key_cache_total ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->direct = direct; - __entry->kicked = kicked; - __entry->min_nr = min_nr; - __entry->min_key_cache = min_key_cache; - __entry->btree_cache_dirty = btree_cache_dirty; - __entry->btree_cache_total = btree_cache_total; - __entry->btree_key_cache_dirty = btree_key_cache_dirty; - __entry->btree_key_cache_total = btree_key_cache_total; - ), - - TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->direct, - __entry->kicked, - __entry->min_nr, - __entry->min_key_cache, - __entry->btree_cache_dirty, - __entry->btree_cache_total, - __entry->btree_key_cache_dirty, - __entry->btree_key_cache_total) -); - -TRACE_EVENT(journal_reclaim_finish, - TP_PROTO(struct bch_fs *c, u64 nr_flushed), - TP_ARGS(c, nr_flushed), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, nr_flushed ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->nr_flushed = nr_flushed; - ), - - TP_printk("%d,%d flushed %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->nr_flushed) -); - -/* bset.c: */ - -DEFINE_EVENT(bpos, bkey_pack_pos_fail, - TP_PROTO(const struct bpos *p), - TP_ARGS(p) -); - -/* Btree cache: */ - -TRACE_EVENT(btree_cache_scan, 
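-	/* likely the btree node cache shrinker: nodes scanned, freeable, and actually freed */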
- TP_PROTO(long nr_to_scan, long can_free, long ret), - TP_ARGS(nr_to_scan, can_free, ret), - - TP_STRUCT__entry( - __field(long, nr_to_scan ) - __field(long, can_free ) - __field(long, ret ) - ), - - TP_fast_assign( - __entry->nr_to_scan = nr_to_scan; - __entry->can_free = can_free; - __entry->ret = ret; - ), - - TP_printk("scanned for %li nodes, can free %li, ret %li", - __entry->nr_to_scan, __entry->can_free, __entry->ret) -); - -DEFINE_EVENT(btree_node_nofs, btree_cache_reap, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans) -); - -DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans) -); - -DEFINE_EVENT(btree_trans, btree_cache_cannibalize, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans) -); - -DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans) -); - -/* Btree */ - -DEFINE_EVENT(btree_node, btree_node_read, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -TRACE_EVENT(btree_node_write, - TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), - TP_ARGS(b, bytes, sectors), - - TP_STRUCT__entry( - __field(enum btree_node_type, type) - __field(unsigned, bytes ) - __field(unsigned, sectors ) - ), - - TP_fast_assign( - __entry->type = btree_node_type(b); - __entry->bytes = bytes; - __entry->sectors = sectors; - ), - - TP_printk("bkey type %u bytes %u sectors %u", - __entry->type , __entry->bytes, __entry->sectors) -); - -DEFINE_EVENT(btree_node, btree_node_alloc, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -DEFINE_EVENT(btree_node, btree_node_free, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -TRACE_EVENT(btree_reserve_get_fail, - TP_PROTO(const char *trans_fn, - unsigned long caller_ip, - size_t required, - int ret), - TP_ARGS(trans_fn, caller_ip, required, ret), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(size_t, required ) - __array(char, ret, 32 ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->required = required; - strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); - ), - - TP_printk("%s %pS required %zu ret %s", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->required, - __entry->ret) -); - -DEFINE_EVENT(btree_node, btree_node_compact, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -DEFINE_EVENT(btree_node, btree_node_merge, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -DEFINE_EVENT(btree_node, btree_node_split, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -DEFINE_EVENT(btree_node, btree_node_rewrite, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -DEFINE_EVENT(btree_node, btree_node_set_root, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -TRACE_EVENT(btree_path_relock_fail, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned level), - TP_ARGS(trans, caller_ip, path, level), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, level ) - 
__field(u8, path_idx) - TRACE_BPOS_entries(pos) - __array(char, node, 24 ) - __field(u8, self_read_count ) - __field(u8, self_intent_count) - __field(u8, read_count ) - __field(u8, intent_count ) - __field(u32, iter_lock_seq ) - __field(u32, node_lock_seq ) - ), - - TP_fast_assign( - struct btree *b = btree_path_node(path, level); - struct six_lock_count c; - - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = path->btree_id; - __entry->level = level; - __entry->path_idx = path - trans->paths; - TRACE_BPOS_assign(pos, path->pos); - - c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); - __entry->self_read_count = c.n[SIX_LOCK_read]; - __entry->self_intent_count = c.n[SIX_LOCK_intent]; - - if (IS_ERR(b)) { - strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); - } else { - c = six_lock_counts(&path->l[level].b->c.lock); - __entry->read_count = c.n[SIX_LOCK_read]; - __entry->intent_count = c.n[SIX_LOCK_intent]; - scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c); - } - __entry->iter_lock_seq = path->l[level].lock_seq; - __entry->node_lock_seq = is_btree_node(path, level) - ? six_lock_seq(&path->l[level].b->c.lock) - : 0; - ), - - TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->path_idx, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->level, - __entry->node, - __entry->self_read_count, - __entry->self_intent_count, - __entry->read_count, - __entry->intent_count, - __entry->iter_lock_seq, - __entry->node_lock_seq) -); - -TRACE_EVENT(btree_path_upgrade_fail, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned level), - TP_ARGS(trans, caller_ip, path, level), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, level ) - __field(u8, path_idx) - TRACE_BPOS_entries(pos) - __field(u8, locked ) - __field(u8, self_read_count ) - __field(u8, self_intent_count) - __field(u8, read_count ) - __field(u8, intent_count ) - __field(u32, iter_lock_seq ) - __field(u32, node_lock_seq ) - ), - - TP_fast_assign( - struct six_lock_count c; - - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = path->btree_id; - __entry->level = level; - __entry->path_idx = path - trans->paths; - TRACE_BPOS_assign(pos, path->pos); - __entry->locked = btree_node_locked(path, level); - - c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), - __entry->self_read_count = c.n[SIX_LOCK_read]; - __entry->self_intent_count = c.n[SIX_LOCK_intent]; - c = six_lock_counts(&path->l[level].b->c.lock); - __entry->read_count = c.n[SIX_LOCK_read]; - __entry->intent_count = c.n[SIX_LOCK_intent]; - __entry->iter_lock_seq = path->l[level].lock_seq; - __entry->node_lock_seq = is_btree_node(path, level) - ? 
six_lock_seq(&path->l[level].b->c.lock) - : 0; - ), - - TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->path_idx, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->level, - __entry->locked, - __entry->self_read_count, - __entry->self_intent_count, - __entry->read_count, - __entry->intent_count, - __entry->iter_lock_seq, - __entry->node_lock_seq) -); - -/* Garbage collection */ - -DEFINE_EVENT(bch_fs, gc_gens_start, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, gc_gens_end, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -/* Allocator */ - -DEFINE_EVENT(fs_str, bucket_alloc, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, bucket_alloc_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DECLARE_EVENT_CLASS(discard_buckets_class, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, seen ) - __field(u64, open ) - __field(u64, need_journal_commit ) - __field(u64, discarded ) - __array(char, err, 16 ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->seen = seen; - __entry->open = open; - __entry->need_journal_commit = need_journal_commit; - __entry->discarded = discarded; - strscpy(__entry->err, err, sizeof(__entry->err)); - ), - - TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->seen, - __entry->open, - __entry->need_journal_commit, - __entry->discarded, - __entry->err) -); - -DEFINE_EVENT(discard_buckets_class, discard_buckets, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err) -); - -DEFINE_EVENT(discard_buckets_class, discard_buckets_fast, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err) -); - -TRACE_EVENT(bucket_invalidate, - TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), - TP_ARGS(c, dev, bucket, sectors), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u32, dev_idx ) - __field(u32, sectors ) - __field(u64, bucket ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->dev_idx = dev; - __entry->sectors = sectors; - __entry->bucket = bucket; - ), - - TP_printk("%d:%d invalidated %u:%llu cached sectors %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->dev_idx, __entry->bucket, - __entry->sectors) -); - -/* Moving IO */ - -DEFINE_EVENT(fs_str, io_move, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_read, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_write, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_finish, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_write_fail, - TP_PROTO(struct bch_fs *c, const char *str), 
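/*
 * Many events above and below (bucket_alloc, the io_move_* family, and
 * others) reuse the fs_str event class: the call site formats everything
 * into a string, typically with a printbuf, and the tracepoint simply
 * records it, keeping complex or variable payloads out of
 * TP_STRUCT__entry. The class itself is declared earlier in this header;
 * judging by its users it must look roughly like:
 *
 *   DECLARE_EVENT_CLASS(fs_str,
 *       TP_PROTO(struct bch_fs *c, const char *str),
 *       TP_ARGS(c, str),
 *
 *       TP_STRUCT__entry(
 *           __field(dev_t, dev)
 *           __string(str, str)
 *       ),
 *
 *       TP_fast_assign(
 *           __entry->dev = c->dev;
 *           __assign_str(str);
 *       ),
 *
 *       TP_printk("%d,%d %s",
 *                 MAJOR(__entry->dev), MINOR(__entry->dev),
 *                 __get_str(str))
 *   );
 */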
- TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_start_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -TRACE_EVENT(move_data, - TP_PROTO(struct bch_fs *c, - struct bch_move_stats *stats), - TP_ARGS(c, stats), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, keys_moved ) - __field(u64, keys_raced ) - __field(u64, sectors_seen ) - __field(u64, sectors_moved ) - __field(u64, sectors_raced ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->keys_moved = atomic64_read(&stats->keys_moved); - __entry->keys_raced = atomic64_read(&stats->keys_raced); - __entry->sectors_seen = atomic64_read(&stats->sectors_seen); - __entry->sectors_moved = atomic64_read(&stats->sectors_moved); - __entry->sectors_raced = atomic64_read(&stats->sectors_raced); - ), - - TP_printk("%d,%d keys moved %llu raced %llu" - "sectors seen %llu moved %llu raced %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->keys_moved, - __entry->keys_raced, - __entry->sectors_seen, - __entry->sectors_moved, - __entry->sectors_raced) -); - -TRACE_EVENT(copygc, - TP_PROTO(struct bch_fs *c, - u64 buckets, - u64 sectors_seen, - u64 sectors_moved), - TP_ARGS(c, buckets, sectors_seen, sectors_moved), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, buckets ) - __field(u64, sectors_seen ) - __field(u64, sectors_moved ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->buckets = buckets; - __entry->sectors_seen = sectors_seen; - __entry->sectors_moved = sectors_moved; - ), - - TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->buckets, - __entry->sectors_seen, - __entry->sectors_moved) -); - -TRACE_EVENT(copygc_wait, - TP_PROTO(struct bch_fs *c, - u64 wait_amount, u64 until), - TP_ARGS(c, wait_amount, until), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, wait_amount ) - __field(u64, until ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->wait_amount = wait_amount; - __entry->until = until; - ), - - TP_printk("%d,%u waiting for %llu sectors until %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->wait_amount, __entry->until) -); - -/* btree transactions: */ - -DECLARE_EVENT_CLASS(transaction_event, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - ), - - TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) -); - -DEFINE_EVENT(transaction_event, transaction_commit, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -DEFINE_EVENT(transaction_event, trans_restart_injected, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -TRACE_EVENT(trans_restart_split_race, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree *b), - TP_ARGS(trans, caller_ip, b), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, level ) - __field(u16, written ) - __field(u16, blocks ) - __field(u16, u64s_remaining ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->level = b->c.level; - __entry->written = b->written; - __entry->blocks = 
btree_blocks(trans->c); - __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b); - ), - - TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", - __entry->trans_fn, (void *) __entry->caller_ip, - __entry->level, - __entry->written, __entry->blocks, - __entry->u64s_remaining) -); - -TRACE_EVENT(trans_blocked_journal_reclaim, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - - __field(unsigned long, key_cache_nr_keys ) - __field(unsigned long, key_cache_nr_dirty ) - __field(long, must_wait ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys); - __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty); - __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c); - ), - - TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li", - __entry->trans_fn, (void *) __entry->caller_ip, - __entry->key_cache_nr_keys, - __entry->key_cache_nr_dirty, - __entry->must_wait) -); - -#if 0 -/* todo: bring back dynamic fault injection */ -DEFINE_EVENT(transaction_event, trans_restart_fault_inject, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); -#endif - -DEFINE_EVENT(transaction_event, trans_traverse_all, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -DEFINE_EVENT(trans_str, trans_restart_too_many_iters, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - const char *paths), - TP_ARGS(trans, caller_ip, paths) -); - -DECLARE_EVENT_CLASS(transaction_restart_iter, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos) - ), - - TP_printk("%s %pS btree %s pos %llu:%llu:%u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(fs_str, trans_restart_upgrade, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(trans_str, trans_restart_relock, - TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), - TP_ARGS(trans, caller_ip, str) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - 
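/*
 * The macro pattern used heavily in this header, for reference:
 * DECLARE_EVENT_CLASS() defines the record layout, assignment and
 * formatting once, and each DEFINE_EVENT(class, name, proto, args)
 * stamps out a named tracepoint sharing all of it, which is much
 * cheaper in code size than a full TRACE_EVENT() per event. TRACE_EVENT()
 * itself is essentially just the single-event case; from
 * include/linux/tracepoint.h:
 *
 *   #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
 *       DECLARE_EVENT_CLASS(name,                                  \
 *                           PARAMS(proto), PARAMS(args),           \
 *                           PARAMS(tstruct), PARAMS(assign),       \
 *                           PARAMS(print));                        \
 *       DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args))
 */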
TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock, - TP_PROTO(struct btree_trans *trans, - const char *cycle), - TP_ARGS(trans, cycle) -); - -DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -TRACE_EVENT(trans_restart_would_deadlock_write, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - ), - - TP_printk("%s", __entry->trans_fn) -); - -TRACE_EVENT(trans_restart_mem_realloced, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - unsigned long bytes), - TP_ARGS(trans, caller_ip, bytes), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(unsigned long, bytes ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->bytes = bytes; - ), - - TP_printk("%s %pS bytes %lu", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->bytes) -); - -DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -TRACE_EVENT(path_downgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned old_locks_want), - TP_ARGS(trans, caller_ip, path, old_locks_want), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(unsigned, old_locks_want ) - __field(unsigned, new_locks_want ) - __field(unsigned, btree ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->old_locks_want = old_locks_want; - __entry->new_locks_want = path->locks_want; - __entry->btree = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - ), - - TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->old_locks_want, - __entry->new_locks_want, - bch2_btree_id_str(__entry->btree), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot) -); - -TRACE_EVENT(key_cache_fill, - TP_PROTO(struct btree_trans *trans, const char *key), - TP_ARGS(trans, key), - - 
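/*
 * The key_cache_fill entry layout that follows records a variable-length
 * string: __string() in TP_STRUCT__entry reserves dynamically sized
 * space in the ring-buffer record for the string passed as its second
 * argument, __assign_str() copies it in at TP_fast_assign time, and
 * __get_str() reads it back in TP_printk. Unlike the fixed
 * __array(char, trans_fn, 32) snapshot of the transaction name, the
 * string costs only as many bytes per event as it actually needs. So a
 * hypothetical call such as
 *
 *   trace_key_cache_fill(trans, "inodes pos 0:4096:U32_MAX");
 *
 * would render in the trace as "<trans->fn> inodes pos 0:4096:U32_MAX".
 */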
TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __string(key, key ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __assign_str(key); - ), - - TP_printk("%s %s", __entry->trans_fn, __get_str(key)) -); - -TRACE_EVENT(write_buffer_flush, - TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size), - TP_ARGS(trans, nr, skipped, fast, size), - - TP_STRUCT__entry( - __field(size_t, nr ) - __field(size_t, skipped ) - __field(size_t, fast ) - __field(size_t, size ) - ), - - TP_fast_assign( - __entry->nr = nr; - __entry->skipped = skipped; - __entry->fast = fast; - __entry->size = size; - ), - - TP_printk("%zu/%zu skipped %zu fast %zu", - __entry->nr, __entry->size, __entry->skipped, __entry->fast) -); - -TRACE_EVENT(write_buffer_flush_sync, - TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans, caller_ip), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - ), - - TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) -); - -TRACE_EVENT(write_buffer_flush_slowpath, - TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total), - TP_ARGS(trans, slowpath, total), - - TP_STRUCT__entry( - __field(size_t, slowpath ) - __field(size_t, total ) - ), - - TP_fast_assign( - __entry->slowpath = slowpath; - __entry->total = total; - ), - - TP_printk("%zu/%zu", __entry->slowpath, __entry->total) -); - -TRACE_EVENT(write_buffer_maybe_flush, - TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key), - TP_ARGS(trans, caller_ip, key), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __string(key, key ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __assign_str(key); - ), - - TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) -); - -DEFINE_EVENT(fs_str, rebalance_extent, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, data_update, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_pred, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_created_rebalance, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_evacuate_bucket, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, extent_trim_atomic, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, btree_iter_peek_slot, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, __btree_iter_peek, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, btree_iter_peek_max, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, btree_iter_peek_prev_min, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS - -TRACE_EVENT(update_by_path, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, - struct btree_insert_entry *i, bool overwrite), - TP_ARGS(trans, path, i, overwrite), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(btree_path_idx_t, path_idx ) - __field(u8, btree_id ) - 
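/*
 * Two recording idioms recur throughout the path tracepoints below:
 * "path - trans->paths" stores the path's index within the
 * transaction's paths array (likewise "i - trans->updates" for the
 * update queue), which is more readable in a trace than a raw pointer;
 * and the transaction name is snapshotted into a fixed
 * __array(char, trans_fn, 32) with strscpy(), so the recorded event
 * does not depend on trans->fn still being valid when the trace buffer
 * is eventually read.
 */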
TRACE_BPOS_entries(pos) - __field(u8, overwrite ) - __field(btree_path_idx_t, update_idx ) - __field(btree_path_idx_t, nr_updates ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->path_idx = path - trans->paths; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - __entry->overwrite = overwrite; - __entry->update_idx = i - trans->updates; - __entry->nr_updates = trans->nr_updates; - ), - - TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u", - __entry->trans_fn, - __entry->path_idx, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->overwrite, - __entry->update_idx, - __entry->nr_updates) -); - -TRACE_EVENT(btree_path_lock, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_bkey_cached_common *b), - TP_ARGS(trans, caller_ip, b), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, level ) - __array(char, node, 24 ) - __field(u32, lock_seq ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = b->btree_id; - __entry->level = b->level; - - scnprintf(__entry->node, sizeof(__entry->node), "%px", b); - __entry->lock_seq = six_lock_seq(&b->lock); - ), - - TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->level, - __entry->node, - __entry->lock_seq) -); - -DECLARE_EVENT_CLASS(btree_path_ev, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path), - - TP_STRUCT__entry( - __field(u16, idx ) - __field(u8, ref ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - ), - - TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u", - __entry->idx, __entry->ref, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot) -); - -DEFINE_EVENT(btree_path_ev, btree_path_get_ll, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path) -); - -DEFINE_EVENT(btree_path_ev, btree_path_put_ll, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path) -); - -DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path) -); - -TRACE_EVENT(btree_path_alloc, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, locks_want ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->locks_want = path->locks_want; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - ), - - TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u", - __entry->idx, - bch2_btree_id_str(__entry->btree_id), - __entry->locks_want, - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot) -); - -TRACE_EVENT(btree_path_get, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos), - TP_ARGS(trans, path, new_pos), - - TP_STRUCT__entry( - 
__field(btree_path_idx_t, idx ) - __field(u8, ref ) - __field(u8, preserve ) - __field(u8, locks_want ) - __field(u8, btree_id ) - TRACE_BPOS_entries(old_pos) - TRACE_BPOS_entries(new_pos) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - __entry->locks_want = path->locks_want; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(old_pos, path->pos); - TRACE_BPOS_assign(new_pos, *new_pos); - ), - - TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u", - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->locks_want, - __entry->old_pos_inode, - __entry->old_pos_offset, - __entry->old_pos_snapshot, - __entry->new_pos_inode, - __entry->new_pos_offset, - __entry->new_pos_snapshot) -); - -DECLARE_EVENT_CLASS(btree_path_clone, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), - TP_ARGS(trans, path, new), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, new_idx ) - __field(u8, btree_id ) - __field(u8, ref ) - __field(u8, preserve ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->new_idx = new - trans->paths; - __entry->btree_id = path->btree_id; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - TRACE_BPOS_assign(pos, path->pos); - ), - - TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u", - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->new_idx) -); - -DEFINE_EVENT(btree_path_clone, btree_path_clone, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), - TP_ARGS(trans, path, new) -); - -DEFINE_EVENT(btree_path_clone, btree_path_save_pos, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), - TP_ARGS(trans, path, new) -); - -DECLARE_EVENT_CLASS(btree_path_traverse, - TP_PROTO(struct btree_trans *trans, - struct btree_path *path), - TP_ARGS(trans, path), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(btree_path_idx_t, idx ) - __field(u8, ref ) - __field(u8, preserve ) - __field(u8, should_be_locked ) - __field(u8, btree_id ) - __field(u8, level ) - TRACE_BPOS_entries(pos) - __field(u8, locks_want ) - __field(u8, nodes_locked ) - __array(char, node0, 24 ) - __array(char, node1, 24 ) - __array(char, node2, 24 ) - __array(char, node3, 24 ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - __entry->btree_id = path->btree_id; - __entry->level = path->level; - TRACE_BPOS_assign(pos, path->pos); - - __entry->locks_want = path->locks_want; - __entry->nodes_locked = path->nodes_locked; - struct btree *b = path->l[0].b; - if (IS_ERR(b)) - strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); - b = path->l[1].b; - if (IS_ERR(b)) - strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); - b = path->l[2].b; - if (IS_ERR(b)) - strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node2, 
sizeof(__entry->node0), "%px", &b->c); - b = path->l[3].b; - if (IS_ERR(b)) - strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); - ), - - TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n" - "locks %u %u %u %u node %s %s %s %s", - __entry->trans_fn, - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->level, - __entry->locks_want, - (__entry->nodes_locked >> 6) & 3, - (__entry->nodes_locked >> 4) & 3, - (__entry->nodes_locked >> 2) & 3, - (__entry->nodes_locked >> 0) & 3, - __entry->node3, - __entry->node2, - __entry->node1, - __entry->node0) -); - -DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start, - TP_PROTO(struct btree_trans *trans, - struct btree_path *path), - TP_ARGS(trans, path) -); - -DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path) -); - -TRACE_EVENT(btree_path_set_pos, - TP_PROTO(struct btree_trans *trans, - struct btree_path *path, - struct bpos *new_pos), - TP_ARGS(trans, path, new_pos), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, ref ) - __field(u8, preserve ) - __field(u8, btree_id ) - TRACE_BPOS_entries(old_pos) - TRACE_BPOS_entries(new_pos) - __field(u8, locks_want ) - __field(u8, nodes_locked ) - __array(char, node0, 24 ) - __array(char, node1, 24 ) - __array(char, node2, 24 ) - __array(char, node3, 24 ) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(old_pos, path->pos); - TRACE_BPOS_assign(new_pos, *new_pos); - - __entry->nodes_locked = path->nodes_locked; - struct btree *b = path->l[0].b; - if (IS_ERR(b)) - strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); - b = path->l[1].b; - if (IS_ERR(b)) - strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); - b = path->l[2].b; - if (IS_ERR(b)) - strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c); - b = path->l[3].b; - if (IS_ERR(b)) - strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); - ), - - TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n" - "locks %u %u %u %u node %s %s %s %s", - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->old_pos_inode, - __entry->old_pos_offset, - __entry->old_pos_snapshot, - __entry->new_pos_inode, - __entry->new_pos_offset, - __entry->new_pos_snapshot, - (__entry->nodes_locked >> 6) & 3, - (__entry->nodes_locked >> 4) & 3, - (__entry->nodes_locked >> 2) & 3, - (__entry->nodes_locked >> 0) & 3, - __entry->node3, - __entry->node2, - __entry->node1, - __entry->node0) -); - -TRACE_EVENT(btree_path_free, - TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup), - TP_ARGS(trans, path, dup), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, preserve ) - __field(u8, 
should_be_locked) - __field(s8, dup ) - __field(u8, dup_locked ) - ), - - TP_fast_assign( - __entry->idx = path; - __entry->preserve = trans->paths[path].preserve; - __entry->should_be_locked = trans->paths[path].should_be_locked; - __entry->dup = dup ? dup - trans->paths : -1; - __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0; - ), - - TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx, - __entry->preserve ? 'P' : ' ', - __entry->should_be_locked ? 'S' : ' ', - __entry->dup, - __entry->dup_locked) -); - -#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ -#ifndef _TRACE_BCACHEFS_H - -static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct btree_insert_entry *i, bool overwrite) {} -static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {} -static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} -static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} -static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} -static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} -static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} - -#endif -#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ - -#define _TRACE_BCACHEFS_H -#endif /* _TRACE_BCACHEFS_H */ - -/* This part must be outside protection */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../fs/bcachefs - -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -#include diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c deleted file mode 100644 index 9764c2e6a91026..00000000000000 --- a/fs/bcachefs/two_state_shared_lock.c +++ /dev/null @@ -1,8 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "two_state_shared_lock.h" - -void __bch2_two_state_lock(two_state_lock_t *lock, int s) -{ - __wait_event(lock->wait, bch2_two_state_trylock(lock, s)); -} diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h deleted file mode 100644 index 7f647846b511fb..00000000000000 --- a/fs/bcachefs/two_state_shared_lock.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_TWO_STATE_LOCK_H -#define _BCACHEFS_TWO_STATE_LOCK_H - -#include -#include -#include - -#include "util.h" - -/* - * Two-state lock - can be taken for add or block - both states are shared, - * like read side of rwsem, but conflict with other state: - */ -typedef struct { - atomic_long_t v; - wait_queue_head_t wait; -} two_state_lock_t; - -static inline void two_state_lock_init(two_state_lock_t *lock) -{ - 
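/*
 * A minimal usage sketch (caller code invented for illustration). Each
 * state behaves like the read side of an rwsem: any number of tasks may
 * hold the same state concurrently, but the two states exclude each
 * other.
 *
 *   two_state_lock_t lock;
 *
 *   two_state_lock_init(&lock);
 *
 *   bch2_two_state_lock(&lock, 0);    // shared with other state-0 holders;
 *   ...                               // blocks while state 1 is held
 *   bch2_two_state_unlock(&lock, 0);
 *
 *   bch2_two_state_lock(&lock, 1);    // and vice versa for state 1
 *   ...
 *   bch2_two_state_unlock(&lock, 1);
 *
 * Internally v counts holders, negative for state 0 and positive for
 * state 1, so bch2_two_state_trylock() only needs to check the sign.
 */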
atomic_long_set(&lock->v, 0); - init_waitqueue_head(&lock->wait); -} - -static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) -{ - long i = s ? 1 : -1; - - EBUG_ON(atomic_long_read(&lock->v) == 0); - - if (atomic_long_sub_return_release(i, &lock->v) == 0) - wake_up_all(&lock->wait); -} - -static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) -{ - long i = s ? 1 : -1; - long old; - - old = atomic_long_read(&lock->v); - do { - if (i > 0 ? old < 0 : old > 0) - return false; - } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i)); - - return true; -} - -void __bch2_two_state_lock(two_state_lock_t *, int); - -static inline void bch2_two_state_lock(two_state_lock_t *lock, int s) -{ - if (!bch2_two_state_trylock(lock, s)) - __bch2_two_state_lock(lock, s); -} - -#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c deleted file mode 100644 index df9a6071fe186b..00000000000000 --- a/fs/bcachefs/util.c +++ /dev/null @@ -1,1047 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * random utility code, for bcache but in theory not specific to bcache - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eytzinger.h" -#include "mean_and_variance.h" -#include "util.h" - -static const char si_units[] = "?kMGTPEZY"; - -/* string_get_size units: */ -static const char *const units_2[] = { - "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" -}; -static const char *const units_10[] = { - "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" -}; - -static int parse_u64(const char *cp, u64 *res) -{ - const char *start = cp; - u64 v = 0; - - if (!isdigit(*cp)) - return -EINVAL; - - do { - if (v > U64_MAX / 10) - return -ERANGE; - v *= 10; - if (v > U64_MAX - (*cp - '0')) - return -ERANGE; - v += *cp - '0'; - cp++; - } while (isdigit(*cp)); - - *res = v; - return cp - start; -} - -static int bch2_pow(u64 n, u64 p, u64 *res) -{ - *res = 1; - - while (p--) { - if (*res > div64_u64(U64_MAX, n)) - return -ERANGE; - *res *= n; - } - return 0; -} - -static int parse_unit_suffix(const char *cp, u64 *res) -{ - const char *start = cp; - u64 base = 1024; - unsigned u; - int ret; - - if (*cp == ' ') - cp++; - - for (u = 1; u < strlen(si_units); u++) - if (*cp == si_units[u]) { - cp++; - goto got_unit; - } - - for (u = 0; u < ARRAY_SIZE(units_2); u++) - if (!strncmp(cp, units_2[u], strlen(units_2[u]))) { - cp += strlen(units_2[u]); - goto got_unit; - } - - for (u = 0; u < ARRAY_SIZE(units_10); u++) - if (!strncmp(cp, units_10[u], strlen(units_10[u]))) { - cp += strlen(units_10[u]); - base = 1000; - goto got_unit; - } - - *res = 1; - return 0; -got_unit: - ret = bch2_pow(base, u, res); - if (ret) - return ret; - - return cp - start; -} - -#define parse_or_ret(cp, _f) \ -do { \ - int _ret = _f; \ - if (_ret < 0) \ - return _ret; \ - cp += _ret; \ -} while (0) - -static int __bch2_strtou64_h(const char *cp, u64 *res) -{ - const char *start = cp; - u64 v = 0, b, f_n = 0, f_d = 1; - int ret; - - parse_or_ret(cp, parse_u64(cp, &v)); - - if (*cp == '.') { - cp++; - ret = parse_u64(cp, &f_n); - if (ret < 0) - return ret; - cp += ret; - - ret = bch2_pow(10, ret, &f_d); - if (ret) - return ret; - } - - parse_or_ret(cp, parse_unit_suffix(cp, &b)); - - if (v > div64_u64(U64_MAX, b)) - return -ERANGE; - v *= b; - - if (f_n > div64_u64(U64_MAX, 
b)) - return -ERANGE; - - f_n = div64_u64(f_n * b, f_d); - if (v + f_n < v) - return -ERANGE; - v += f_n; - - *res = v; - return cp - start; -} - -static int __bch2_strtoh(const char *cp, u64 *res, - u64 t_max, bool t_signed) -{ - bool positive = *cp != '-'; - u64 v = 0; - - if (*cp == '+' || *cp == '-') - cp++; - - parse_or_ret(cp, __bch2_strtou64_h(cp, &v)); - - if (*cp == '\n') - cp++; - if (*cp) - return -EINVAL; - - if (positive) { - if (v > t_max) - return -ERANGE; - } else { - if (v && !t_signed) - return -ERANGE; - - if (v > t_max + 1) - return -ERANGE; - v = -v; - } - - *res = v; - return 0; -} - -#define STRTO_H(name, type) \ -int bch2_ ## name ## _h(const char *cp, type *res) \ -{ \ - u64 v = 0; \ - int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ - ANYSINT_MAX(type) != ((type) ~0ULL)); \ - *res = v; \ - return ret; \ -} - -STRTO_H(strtoint, int) -STRTO_H(strtouint, unsigned int) -STRTO_H(strtoll, long long) -STRTO_H(strtoull, unsigned long long) -STRTO_H(strtou64, u64) - -u64 bch2_read_flag_list(const char *opt, const char * const list[]) -{ - u64 ret = 0; - char *p, *s, *d = kstrdup(opt, GFP_KERNEL); - - if (!d) - return -ENOMEM; - - s = strim(d); - - while ((p = strsep(&s, ",;"))) { - int flag = match_string(list, -1, p); - - if (flag < 0) { - ret = -1; - break; - } - - ret |= BIT_ULL(flag); - } - - kfree(d); - - return ret; -} - -bool bch2_is_zero(const void *_p, size_t n) -{ - const char *p = _p; - size_t i; - - for (i = 0; i < n; i++) - if (p[i]) - return false; - return true; -} - -void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits) -{ - while (nr_bits) - prt_char(out, '0' + ((v >> --nr_bits) & 1)); -} - -void bch2_prt_u64_base2(struct printbuf *out, u64 v) -{ - bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); -} - -static bool string_is_spaces(const char *str) -{ - while (*str) { - if (*str != ' ') - return false; - str++; - } - return true; -} - -void bch2_print_string_as_lines(const char *prefix, const char *lines) -{ - bool locked = false; - const char *p; - - if (!lines) { - printk("%s (null)\n", prefix); - return; - } - - locked = console_trylock(); - - while (*lines) { - p = strchrnul(lines, '\n'); - if (!*p && string_is_spaces(lines)) - break; - - printk("%s%.*s\n", prefix, (int) (p - lines), lines); - if (!*p) - break; - lines = p + 1; - } - if (locked) - console_unlock(); -} - -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, - gfp_t gfp) -{ -#ifdef CONFIG_STACKTRACE - unsigned nr_entries = 0; - - stack->nr = 0; - int ret = darray_make_room_gfp(stack, 32, gfp); - if (ret) - return ret; - - if (!down_read_trylock(&task->signal->exec_update_lock)) - return -1; - - do { - nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); - } while (nr_entries == stack->size && - !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp))); - - stack->nr = nr_entries; - up_read(&task->signal->exec_update_lock); - - return ret; -#else - return 0; -#endif -} - -void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) -{ - darray_for_each(*stack, i) { - prt_printf(out, "[<0>] %pB", (void *) *i); - prt_newline(out); - } -} - -int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp) -{ - bch_stacktrace stack = { 0 }; - int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp); - - bch2_prt_backtrace(out, &stack); - darray_exit(&stack); - return ret; -} - -#ifndef __KERNEL__ -#include -void bch2_prt_datetime(struct printbuf *out, 
time64_t sec) -{ - time_t t = sec; - char buf[64]; - ctime_r(&t, buf); - strim(buf); - prt_str(out, buf); -} -#else -void bch2_prt_datetime(struct printbuf *out, time64_t sec) -{ - char buf[64]; - snprintf(buf, sizeof(buf), "%ptT", &sec); - prt_u64(out, sec); -} -#endif - -void bch2_pr_time_units(struct printbuf *out, u64 ns) -{ - const struct time_unit *u = bch2_pick_time_units(ns); - - prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name); -} - -static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) -{ - const struct time_unit *u = bch2_pick_time_units(ns); - - prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name); -} - -static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) -{ - prt_printf(out, "%s\t", name); - bch2_pr_time_units_aligned(out, ns); - prt_newline(out); -} - -#define TABSTOP_SIZE 12 - -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) -{ - struct quantiles *quantiles = time_stats_to_quantiles(stats); - s64 f_mean = 0, d_mean = 0; - u64 f_stddev = 0, d_stddev = 0; - - if (stats->buffer) { - int cpu; - - spin_lock_irq(&stats->lock); - for_each_possible_cpu(cpu) - __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); - spin_unlock_irq(&stats->lock); - } - - /* - * avoid divide by zero - */ - if (stats->freq_stats.n) { - f_mean = mean_and_variance_get_mean(stats->freq_stats); - f_stddev = mean_and_variance_get_stddev(stats->freq_stats); - d_mean = mean_and_variance_get_mean(stats->duration_stats); - d_stddev = mean_and_variance_get_stddev(stats->duration_stats); - } - - printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); - prt_printf(out, "count:\t%llu\n", stats->duration_stats.n); - printbuf_tabstop_pop(out); - - printbuf_tabstops_reset(out); - - printbuf_tabstop_push(out, out->indent + 20); - printbuf_tabstop_push(out, TABSTOP_SIZE + 2); - printbuf_tabstop_push(out, 0); - printbuf_tabstop_push(out, TABSTOP_SIZE + 2); - - prt_printf(out, "\tsince mount\r\trecent\r\n"); - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, out->indent + 20); - printbuf_tabstop_push(out, TABSTOP_SIZE); - printbuf_tabstop_push(out, 2); - printbuf_tabstop_push(out, TABSTOP_SIZE); - - prt_printf(out, "duration of events\n"); - printbuf_indent_add(out, 2); - - pr_name_and_units(out, "min:", stats->min_duration); - pr_name_and_units(out, "max:", stats->max_duration); - pr_name_and_units(out, "total:", stats->total_duration); - - prt_printf(out, "mean:\t"); - bch2_pr_time_units_aligned(out, d_mean); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); - prt_newline(out); - - prt_printf(out, "stddev:\t"); - bch2_pr_time_units_aligned(out, d_stddev); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); - - printbuf_indent_sub(out, 2); - prt_newline(out); - - prt_printf(out, "time between events\n"); - printbuf_indent_add(out, 2); - - pr_name_and_units(out, "min:", stats->min_freq); - pr_name_and_units(out, "max:", stats->max_freq); - - prt_printf(out, "mean:\t"); - bch2_pr_time_units_aligned(out, f_mean); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); - prt_newline(out); - - prt_printf(out, "stddev:\t"); - bch2_pr_time_units_aligned(out, f_stddev); - prt_tab(out); - bch2_pr_time_units_aligned(out, 
mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); - - printbuf_indent_sub(out, 2); - prt_newline(out); - - printbuf_tabstops_reset(out); - - if (quantiles) { - int i = eytzinger0_first(NR_QUANTILES); - const struct time_unit *u = - bch2_pick_time_units(quantiles->entries[i].m); - u64 last_q = 0; - - prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(j, NR_QUANTILES) { - bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1; - - u64 q = max(quantiles->entries[j].m, last_q); - prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); - last_q = q; - } - } -} - -/* ratelimit: */ - -/** - * bch2_ratelimit_delay() - return how long to delay until the next time to do - * some work - * @d: the struct bch_ratelimit to update - * Returns: the amount of time to delay by, in jiffies - */ -u64 bch2_ratelimit_delay(struct bch_ratelimit *d) -{ - u64 now = local_clock(); - - return time_after64(d->next, now) - ? nsecs_to_jiffies(d->next - now) - : 0; -} - -/** - * bch2_ratelimit_increment() - increment @d by the amount of work done - * @d: the struct bch_ratelimit to update - * @done: the amount of work done, in arbitrary units - */ -void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) -{ - u64 now = local_clock(); - - d->next += div_u64(done * NSEC_PER_SEC, d->rate); - - if (time_before64(now + NSEC_PER_SEC, d->next)) - d->next = now + NSEC_PER_SEC; - - if (time_after64(now - NSEC_PER_SEC * 2, d->next)) - d->next = now - NSEC_PER_SEC * 2; -} - -/* pd controller: */ - -/* - * Updates pd_controller. Attempts to scale inputed values to units per second. - * @target: desired value - * @actual: current value - * - * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing - * it makes actual go down. 
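 *
 * In rough pseudo-math, the update rule implemented below (dt being the
 * number of whole seconds since the previous update; calls less than a
 * second apart are ignored) is:
 *
 *   P      = (actual - target) * dt / p_term_inverse
 *   D      = ewma((actual - last_actual) / dt) * d_term / p_term_inverse
 *   change = -sign * (P + D)
 *   rate   = clamp(rate + change, 1, UINT_MAX)
 *
 * with change forced to zero when it would increase the rate while
 * backpressure is enabled and the rate limiter is already running
 * behind schedule.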
- */ -void bch2_pd_controller_update(struct bch_pd_controller *pd, - s64 target, s64 actual, int sign) -{ - s64 proportional, derivative, change; - - unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; - - if (seconds_since_update == 0) - return; - - pd->last_update = jiffies; - - proportional = actual - target; - proportional *= seconds_since_update; - proportional = div_s64(proportional, pd->p_term_inverse); - - derivative = actual - pd->last_actual; - derivative = div_s64(derivative, seconds_since_update); - derivative = ewma_add(pd->smoothed_derivative, derivative, - (pd->d_term / seconds_since_update) ?: 1); - derivative = derivative * pd->d_term; - derivative = div_s64(derivative, pd->p_term_inverse); - - change = proportional + derivative; - - /* Don't increase rate if not keeping up */ - if (change > 0 && - pd->backpressure && - time_after64(local_clock(), - pd->rate.next + NSEC_PER_MSEC)) - change = 0; - - change *= (sign * -1); - - pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, - 1, UINT_MAX); - - pd->last_actual = actual; - pd->last_derivative = derivative; - pd->last_proportional = proportional; - pd->last_change = change; - pd->last_target = target; -} - -void bch2_pd_controller_init(struct bch_pd_controller *pd) -{ - pd->rate.rate = 1024; - pd->last_update = jiffies; - pd->p_term_inverse = 6000; - pd->d_term = 30; - pd->d_smooth = pd->d_term; - pd->backpressure = 1; -} - -void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 20); - - prt_printf(out, "rate:\t"); - prt_human_readable_s64(out, pd->rate.rate); - prt_newline(out); - - prt_printf(out, "target:\t"); - prt_human_readable_u64(out, pd->last_target); - prt_newline(out); - - prt_printf(out, "actual:\t"); - prt_human_readable_u64(out, pd->last_actual); - prt_newline(out); - - prt_printf(out, "proportional:\t"); - prt_human_readable_s64(out, pd->last_proportional); - prt_newline(out); - - prt_printf(out, "derivative:\t"); - prt_human_readable_s64(out, pd->last_derivative); - prt_newline(out); - - prt_printf(out, "change:\t"); - prt_human_readable_s64(out, pd->last_change); - prt_newline(out); - - prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); -} - -/* misc: */ - -void bch2_bio_map(struct bio *bio, void *base, size_t size) -{ - while (size) { - struct page *page = is_vmalloc_addr(base) - ? 
vmalloc_to_page(base) - : virt_to_page(base); - unsigned offset = offset_in_page(base); - unsigned len = min_t(size_t, PAGE_SIZE - offset, size); - - BUG_ON(!bio_add_page(bio, page, len, offset)); - size -= len; - base += len; - } -} - -int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) -{ - while (size) { - struct page *page = alloc_pages(gfp_mask, 0); - unsigned len = min_t(size_t, PAGE_SIZE, size); - - if (!page) - return -ENOMEM; - - if (unlikely(!bio_add_page(bio, page, len, 0))) { - __free_page(page); - break; - } - - size -= len; - } - - return 0; -} - -u64 bch2_get_random_u64_below(u64 ceil) -{ - if (ceil <= U32_MAX) - return __get_random_u32_below(ceil); - - /* this is the same (clever) algorithm as in __get_random_u32_below() */ - u64 rand = get_random_u64(); - u64 mult = ceil * rand; - - if (unlikely(mult < ceil)) { - u64 bound; - div64_u64_rem(-ceil, ceil, &bound); - while (unlikely(mult < bound)) { - rand = get_random_u64(); - mult = ceil * rand; - } - } - - return mul_u64_u64_shr(ceil, rand, 64); -} - -void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) -{ - struct bio_vec bv; - struct bvec_iter iter; - - __bio_for_each_segment(bv, dst, iter, dst_iter) { - void *dstp = kmap_local_page(bv.bv_page); - - memcpy(dstp + bv.bv_offset, src, bv.bv_len); - kunmap_local(dstp); - - src += bv.bv_len; - } -} - -void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) -{ - struct bio_vec bv; - struct bvec_iter iter; - - __bio_for_each_segment(bv, src, iter, src_iter) { - void *srcp = kmap_local_page(bv.bv_page); - - memcpy(dst, srcp + bv.bv_offset, bv.bv_len); - kunmap_local(srcp); - - dst += bv.bv_len; - } -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_corrupt_bio(struct bio *bio) -{ - struct bvec_iter iter; - struct bio_vec bv; - unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); - - bio_for_each_segment(bv, bio, iter) { - unsigned u64s = bv.bv_len / sizeof(u64); - - if (offset < u64s) { - u64 *segment = bvec_kmap_local(&bv); - segment[offset] = get_random_u64(); - kunmap_local(segment); - return; - } - offset -= u64s; - } -} -#endif - -void bch2_bio_to_text(struct printbuf *out, struct bio *bio) -{ - prt_printf(out, "bi_remaining:\t%u\n", - atomic_read(&bio->__bi_remaining)); - prt_printf(out, "bi_end_io:\t%ps\n", - bio->bi_end_io); - prt_printf(out, "bi_status:\t%u\n", - bio->bi_status); -} - -#if 0 -void eytzinger1_test(void) -{ - unsigned inorder, size; - - pr_info("1 based eytzinger test:\n"); - - for (size = 2; - size < 65536; - size++) { - unsigned extra = eytzinger1_extra(size); - - if (!(size % 4096)) - pr_info("tree size %u\n", size); - - inorder = 1; - eytzinger1_for_each(eytz, size) { - BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); - BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); - BUG_ON(eytz != eytzinger1_last(size) && - eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); - - inorder++; - } - BUG_ON(inorder - 1 != size); - } -} - -void eytzinger0_test(void) -{ - - unsigned inorder, size; - - pr_info("0 based eytzinger test:\n"); - - for (size = 1; - size < 65536; - size++) { - unsigned extra = eytzinger0_extra(size); - - if (!(size % 4096)) - pr_info("tree size %u\n", size); - - inorder = 0; - eytzinger0_for_each(eytz, size) { - BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); - BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); - BUG_ON(eytz != eytzinger0_last(size) && - eytzinger0_prev(eytzinger0_next(eytz, size), 
size) != eytz); - - inorder++; - } - BUG_ON(inorder != size); - - inorder = size - 1; - eytzinger0_for_each_prev(eytz, size) { - BUG_ON(eytz != eytzinger0_first(size) && - eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz); - - inorder--; - } - BUG_ON(inorder != -1); - } -} - -static inline int cmp_u16(const void *_l, const void *_r) -{ - const u16 *l = _l, *r = _r; - - return (*l > *r) - (*r > *l); -} - -static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search) -{ - int r, s; - bool bad; - - r = eytzinger0_find_le(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - if (r >= 0) { - if (test_array[r] > search) { - bad = true; - } else { - s = eytzinger0_next(r, nr); - bad = s >= 0 && test_array[s] <= search; - } - } else { - s = eytzinger0_last(nr); - bad = s >= 0 && test_array[s] <= search; - } - - if (bad) { - s = -1; - eytzinger0_for_each_prev(j, nr) { - if (test_array[j] <= search) { - s = j; - break; - } - } - - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find_le(%12u) = %3i should be %3i\n", - search, r, s); - BUG(); - } -} - -static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) -{ - int r, s; - bool bad; - - r = eytzinger0_find_gt(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - if (r >= 0) { - if (test_array[r] <= search) { - bad = true; - } else { - s = eytzinger0_prev(r, nr); - bad = s >= 0 && test_array[s] > search; - } - } else { - s = eytzinger0_first(nr); - bad = s >= 0 && test_array[s] > search; - } - - if (bad) { - s = -1; - eytzinger0_for_each(j, nr) { - if (test_array[j] > search) { - s = j; - break; - } - } - - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find_gt(%12u) = %3i should be %3i\n", - search, r, s); - BUG(); - } -} - -static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) -{ - int r, s; - bool bad; - - r = eytzinger0_find_ge(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - if (r >= 0) { - if (test_array[r] < search) { - bad = true; - } else { - s = eytzinger0_prev(r, nr); - bad = s >= 0 && test_array[s] >= search; - } - } else { - s = eytzinger0_first(nr); - bad = s >= 0 && test_array[s] >= search; - } - - if (bad) { - s = -1; - eytzinger0_for_each(j, nr) { - if (test_array[j] >= search) { - s = j; - break; - } - } - - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find_ge(%12u) = %3i should be %3i\n", - search, r, s); - BUG(); - } -} - -static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) -{ - unsigned r; - int s; - bool bad; - - r = eytzinger0_find(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - - if (r < nr) { - bad = test_array[r] != search; - } else { - s = eytzinger0_find_le(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - bad = s >= 0 && test_array[s] == search; - } - - if (bad) { - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find(%12u) = %3i is incorrect\n", - search, r); - BUG(); - } -} - -static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -{ - eytzinger0_find_test_le(test_array, nr, search); - eytzinger0_find_test_gt(test_array, nr, search); - eytzinger0_find_test_ge(test_array, nr, search); - eytzinger0_find_test_eq(test_array, nr, search); -} - -void eytzinger0_find_test(void) -{ - unsigned i, nr, allocated = 1 << 12; - u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), 
GFP_KERNEL); - - for (nr = 1; nr < allocated; nr++) { - u16 prev = 0; - - pr_info("testing %u elems\n", nr); - - get_random_bytes(test_array, nr * sizeof(test_array[0])); - eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); - - /* verify array is sorted correctly: */ - eytzinger0_for_each(j, nr) { - BUG_ON(test_array[j] < prev); - prev = test_array[j]; - } - - for (i = 0; i < U16_MAX; i += 1 << 12) - eytzinger0_find_test_val(test_array, nr, i); - - for (i = 0; i < nr; i++) { - eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); - eytzinger0_find_test_val(test_array, nr, test_array[i]); - eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); - } - } - - kfree(test_array); -} -#endif - -/* - * Accumulate percpu counters onto one cpu's copy - only valid when access - * against any percpu counter is guarded against - */ -u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) -{ - u64 *ret; - int cpu; - - /* access to pcpu vars has to be blocked by other locking */ - preempt_disable(); - ret = this_cpu_ptr(p); - preempt_enable(); - - for_each_possible_cpu(cpu) { - u64 *i = per_cpu_ptr(p, cpu); - - if (i != ret) { - acc_u64s(ret, i, nr); - memset(i, 0, nr * sizeof(u64)); - } - } - - return ret; -} - -void bch2_darray_str_exit(darray_const_str *d) -{ - darray_for_each(*d, i) - kfree(*i); - darray_exit(d); -} - -int bch2_split_devs(const char *_dev_name, darray_const_str *ret) -{ - darray_init(ret); - - char *dev_name, *s, *orig; - - dev_name = orig = kstrdup(_dev_name, GFP_KERNEL); - if (!dev_name) - return -ENOMEM; - - while ((s = strsep(&dev_name, ":"))) { - char *p = kstrdup(s, GFP_KERNEL); - if (!p) - goto err; - - if (darray_push(ret, p)) { - kfree(p); - goto err; - } - } - - kfree(orig); - return 0; -err: - bch2_darray_str_exit(ret); - kfree(orig); - return -ENOMEM; -} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h deleted file mode 100644 index 6488f098d1407e..00000000000000 --- a/fs/bcachefs/util.h +++ /dev/null @@ -1,782 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_UTIL_H -#define _BCACHEFS_UTIL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mean_and_variance.h" - -#include "darray.h" -#include "time_stats.h" - -struct closure; - -#ifdef CONFIG_BCACHEFS_DEBUG -#define EBUG_ON(cond) BUG_ON(cond) -#else -#define EBUG_ON(cond) -#endif - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define CPU_BIG_ENDIAN 0 -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define CPU_BIG_ENDIAN 1 -#endif - -/* type hackery */ - -#define type_is_exact(_val, _type) \ - __builtin_types_compatible_p(typeof(_val), _type) - -#define type_is(_val, _type) \ - (__builtin_types_compatible_p(typeof(_val), _type) || \ - __builtin_types_compatible_p(typeof(_val), const _type)) - -/* Userspace doesn't align allocations as nicely as the kernel allocators: */ -static inline size_t buf_pages(void *p, size_t len) -{ - return DIV_ROUND_UP(len + - ((unsigned long) p & (PAGE_SIZE - 1)), - PAGE_SIZE); -} - -static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) -{ - void *p = unlikely(n >= INT_MAX) - ? vmalloc_noprof(n) - : kvmalloc_noprof(n, flags & ~__GFP_ZERO); - if (p && (flags & __GFP_ZERO)) - memset(p, 0, n); - return p; -} -#define bch2_kvmalloc(...) 
alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__)) - -#define init_heap(heap, _size, gfp) \ -({ \ - (heap)->nr = 0; \ - (heap)->size = (_size); \ - (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ - (gfp)); \ -}) - -#define free_heap(heap) \ -do { \ - kvfree((heap)->data); \ - (heap)->data = NULL; \ -} while (0) - -#define ANYSINT_MAX(t) \ - ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) - -#include "printbuf.h" - -#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__) -#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) -#define printbuf_str(_buf) bch2_printbuf_str(_buf) -#define printbuf_exit(_buf) bch2_printbuf_exit(_buf) - -#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf) -#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf) -#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) - -#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) -#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n) -#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) - -#define prt_newline(_out) bch2_prt_newline(_out) -#define prt_tab(_out) bch2_prt_tab(_out) -#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out) - -#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__) -#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v)) -#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__) -#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__) -#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__) -#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__) -#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__) -#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) -#define prt_bitflags_vector(...) bch2_prt_bitflags_vector(__VA_ARGS__) - -void bch2_pr_time_units(struct printbuf *, u64); -void bch2_prt_datetime(struct printbuf *, time64_t); - -#ifdef __KERNEL__ -static inline void uuid_unparse_lower(u8 *uuid, char *out) -{ - sprintf(out, "%pUb", uuid); -} -#else -#include -#endif - -static inline void pr_uuid(struct printbuf *out, u8 *uuid) -{ - char uuid_str[40]; - - uuid_unparse_lower(uuid, uuid_str); - prt_printf(out, "%s", uuid_str); -} - -int bch2_strtoint_h(const char *, int *); -int bch2_strtouint_h(const char *, unsigned int *); -int bch2_strtoll_h(const char *, long long *); -int bch2_strtoull_h(const char *, unsigned long long *); -int bch2_strtou64_h(const char *, u64 *); - -static inline int bch2_strtol_h(const char *cp, long *res) -{ -#if BITS_PER_LONG == 32 - return bch2_strtoint_h(cp, (int *) res); -#else - return bch2_strtoll_h(cp, (long long *) res); -#endif -} - -static inline int bch2_strtoul_h(const char *cp, long *res) -{ -#if BITS_PER_LONG == 32 - return bch2_strtouint_h(cp, (unsigned int *) res); -#else - return bch2_strtoull_h(cp, (unsigned long long *) res); -#endif -} - -#define strtoi_h(cp, res) \ - ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ - : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ - : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ - : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ - : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ - : type_is(*res, unsigned long long) ? 
bch2_strtoull_h(cp, (void *) res)\ - : -EINVAL) - -#define strtoul_safe(cp, var) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r) \ - var = _v; \ - _r; \ -}) - -#define strtoul_safe_clamp(cp, var, min, max) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r) \ - var = clamp_t(typeof(var), _v, min, max); \ - _r; \ -}) - -#define strtoul_safe_restrict(cp, var, min, max) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r && _v >= min && _v <= max) \ - var = _v; \ - else \ - _r = -EINVAL; \ - _r; \ -}) - -#define snprint(out, var) \ - prt_printf(out, \ - type_is(var, int) ? "%i\n" \ - : type_is(var, unsigned) ? "%u\n" \ - : type_is(var, long) ? "%li\n" \ - : type_is(var, unsigned long) ? "%lu\n" \ - : type_is(var, s64) ? "%lli\n" \ - : type_is(var, u64) ? "%llu\n" \ - : type_is(var, char *) ? "%s\n" \ - : "%i\n", var) - -bool bch2_is_zero(const void *, size_t); - -u64 bch2_read_flag_list(const char *, const char * const[]); - -void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); -void bch2_prt_u64_base2(struct printbuf *, u64); - -void bch2_print_string_as_lines(const char *, const char *); - -typedef DARRAY(unsigned long) bch_stacktrace; -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); -void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); -int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t); - -static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) -{ -#ifdef __KERNEL__ - prt_printf(out, "%pg", bdev); -#else - prt_str(out, bdev->name); -#endif -} - -void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); - -#define ewma_add(ewma, val, weight) \ -({ \ - typeof(ewma) _ewma = (ewma); \ - typeof(weight) _weight = (weight); \ - \ - (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ -}) - -struct bch_ratelimit { - /* Next time we want to do some work, in nanoseconds */ - u64 next; - - /* - * Rate at which we want to do work, in units per nanosecond - * The units here correspond to the units passed to - * bch2_ratelimit_increment() - */ - unsigned rate; -}; - -static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) -{ - d->next = local_clock(); -} - -u64 bch2_ratelimit_delay(struct bch_ratelimit *); -void bch2_ratelimit_increment(struct bch_ratelimit *, u64); - -struct bch_pd_controller { - struct bch_ratelimit rate; - unsigned long last_update; - - s64 last_actual; - s64 smoothed_derivative; - - unsigned p_term_inverse; - unsigned d_smooth; - unsigned d_term; - - /* for exporting to sysfs (no effect on behavior) */ - s64 last_derivative; - s64 last_proportional; - s64 last_change; - s64 last_target; - - /* - * If true, the rate will not increase if bch2_ratelimit_delay() - * is not being called often enough. 
- */ - bool backpressure; -}; - -void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); -void bch2_pd_controller_init(struct bch_pd_controller *); -void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); - -#define sysfs_pd_controller_attribute(name) \ - rw_attribute(name##_rate); \ - rw_attribute(name##_rate_bytes); \ - rw_attribute(name##_rate_d_term); \ - rw_attribute(name##_rate_p_term_inverse); \ - read_attribute(name##_rate_debug) - -#define sysfs_pd_controller_files(name) \ - &sysfs_##name##_rate, \ - &sysfs_##name##_rate_bytes, \ - &sysfs_##name##_rate_d_term, \ - &sysfs_##name##_rate_p_term_inverse, \ - &sysfs_##name##_rate_debug - -#define sysfs_pd_controller_show(name, var) \ -do { \ - sysfs_hprint(name##_rate, (var)->rate.rate); \ - sysfs_print(name##_rate_bytes, (var)->rate.rate); \ - sysfs_print(name##_rate_d_term, (var)->d_term); \ - sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ - \ - if (attr == &sysfs_##name##_rate_debug) \ - bch2_pd_controller_debug_to_text(out, var); \ -} while (0) - -#define sysfs_pd_controller_store(name, var) \ -do { \ - sysfs_strtoul_clamp(name##_rate, \ - (var)->rate.rate, 1, UINT_MAX); \ - sysfs_strtoul_clamp(name##_rate_bytes, \ - (var)->rate.rate, 1, UINT_MAX); \ - sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ - sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ - (var)->p_term_inverse, 1, INT_MAX); \ -} while (0) - -#define container_of_or_null(ptr, type, member) \ -({ \ - typeof(ptr) _ptr = ptr; \ - _ptr ? container_of(_ptr, type, member) : NULL; \ -}) - -static inline struct list_head *list_pop(struct list_head *head) -{ - if (list_empty(head)) - return NULL; - - struct list_head *ret = head->next; - list_del_init(ret); - return ret; -} - -#define list_pop_entry(head, type, member) \ - container_of_or_null(list_pop(head), type, member) - -/* Does linear interpolation between powers of two */ -static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) -{ - unsigned fract = x & ~(~0 << fract_bits); - - x >>= fract_bits; - x = 1 << x; - x += (x * fract) >> fract_bits; - - return x; -} - -void bch2_bio_map(struct bio *bio, void *base, size_t); -int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); - -#define closure_bio_submit(bio, cl) \ -do { \ - closure_get(cl); \ - submit_bio(bio); \ -} while (0) - -#define kthread_wait(cond) \ -({ \ - int _ret = 0; \ - \ - while (1) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (kthread_should_stop()) { \ - _ret = -1; \ - break; \ - } \ - \ - if (cond) \ - break; \ - \ - schedule(); \ - } \ - set_current_state(TASK_RUNNING); \ - _ret; \ -}) - -#define kthread_wait_freezable(cond) \ -({ \ - int _ret = 0; \ - while (1) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (kthread_should_stop()) { \ - _ret = -1; \ - break; \ - } \ - \ - if (cond) \ - break; \ - \ - schedule(); \ - try_to_freeze(); \ - } \ - set_current_state(TASK_RUNNING); \ - _ret; \ -}) - -u64 bch2_get_random_u64_below(u64); - -void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); -void memcpy_from_bio(void *, struct bio *, struct bvec_iter); - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_corrupt_bio(struct bio *); - -static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) -{ - if (ratio && !get_random_u32_below(ratio)) - bch2_corrupt_bio(bio); -} -#else -#define bch2_maybe_corrupt_bio(...) 
do {} while (0) -#endif - -void bch2_bio_to_text(struct printbuf *, struct bio *); - -static inline void memcpy_u64s_small(void *dst, const void *src, - unsigned u64s) -{ - u64 *d = dst; - const u64 *s = src; - - while (u64s--) - *d++ = *s++; -} - -static inline void __memcpy_u64s(void *dst, const void *src, - unsigned u64s) -{ -#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) - long d0, d1, d2; - - asm volatile("rep ; movsq" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (u64s), "1" (dst), "2" (src) - : "memory"); -#else - u64 *d = dst; - const u64 *s = src; - - while (u64s--) - *d++ = *s++; -#endif -} - -static inline void memcpy_u64s(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(!(dst >= src + u64s * sizeof(u64) || - dst + u64s * sizeof(u64) <= src)); - - __memcpy_u64s(dst, src, u64s); -} - -static inline void __memmove_u64s_down(void *dst, const void *src, - unsigned u64s) -{ - __memcpy_u64s(dst, src, u64s); -} - -static inline void memmove_u64s_down(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst > src); - - __memmove_u64s_down(dst, src, u64s); -} - -static inline void __memmove_u64s_down_small(void *dst, const void *src, - unsigned u64s) -{ - memcpy_u64s_small(dst, src, u64s); -} - -static inline void memmove_u64s_down_small(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst > src); - - __memmove_u64s_down_small(dst, src, u64s); -} - -static inline void __memmove_u64s_up_small(void *_dst, const void *_src, - unsigned u64s) -{ - u64 *dst = (u64 *) _dst + u64s; - u64 *src = (u64 *) _src + u64s; - - while (u64s--) - *--dst = *--src; -} - -static inline void memmove_u64s_up_small(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst < src); - - __memmove_u64s_up_small(dst, src, u64s); -} - -static inline void __memmove_u64s_up(void *_dst, const void *_src, - unsigned u64s) -{ - u64 *dst = (u64 *) _dst + u64s - 1; - u64 *src = (u64 *) _src + u64s - 1; - -#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) - long d0, d1, d2; - - asm volatile("std ;\n" - "rep ; movsq\n" - "cld ;\n" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (u64s), "1" (dst), "2" (src) - : "memory"); -#else - while (u64s--) - *dst-- = *src--; -#endif -} - -static inline void memmove_u64s_up(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst < src); - - __memmove_u64s_up(dst, src, u64s); -} - -static inline void memmove_u64s(void *dst, const void *src, - unsigned u64s) -{ - if (dst < src) - __memmove_u64s_down(dst, src, u64s); - else - __memmove_u64s_up(dst, src, u64s); -} - -/* Set the last few bytes up to a u64 boundary given an offset into a buffer. 
*/ -static inline void memset_u64s_tail(void *s, int c, unsigned bytes) -{ - unsigned rem = round_up(bytes, sizeof(u64)) - bytes; - - memset(s + bytes, c, rem); -} - -/* just the memmove, doesn't update @_nr */ -#define __array_insert_item(_array, _nr, _pos) \ - memmove(&(_array)[(_pos) + 1], \ - &(_array)[(_pos)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))) - -#define array_insert_item(_array, _nr, _pos, _new_item) \ -do { \ - __array_insert_item(_array, _nr, _pos); \ - (_nr)++; \ - (_array)[(_pos)] = (_new_item); \ -} while (0) - -#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -do { \ - (_nr) -= (_nr_to_remove); \ - memmove(&(_array)[(_pos)], \ - &(_array)[(_pos) + (_nr_to_remove)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))); \ -} while (0) - -#define array_remove_item(_array, _nr, _pos) \ - array_remove_items(_array, _nr, _pos, 1) - -static inline void __move_gap(void *array, size_t element_size, - size_t nr, size_t size, - size_t old_gap, size_t new_gap) -{ - size_t gap_end = old_gap + size - nr; - - if (new_gap < old_gap) { - size_t move = old_gap - new_gap; - - memmove(array + element_size * (gap_end - move), - array + element_size * (old_gap - move), - element_size * move); - } else if (new_gap > old_gap) { - size_t move = new_gap - old_gap; - - memmove(array + element_size * old_gap, - array + element_size * gap_end, - element_size * move); - } -} - -/* Move the gap in a gap buffer: */ -#define move_gap(_d, _new_gap) \ -do { \ - BUG_ON(_new_gap > (_d)->nr); \ - BUG_ON((_d)->gap > (_d)->nr); \ - \ - __move_gap((_d)->data, sizeof((_d)->data[0]), \ - (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \ - (_d)->gap = _new_gap; \ -} while (0) - -#define bubble_sort(_base, _nr, _cmp) \ -do { \ - ssize_t _i, _last; \ - bool _swapped = true; \ - \ - for (_last= (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\ - _swapped = false; \ - for (_i = 0; _i < _last; _i++) \ - if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ - swap((_base)[_i], (_base)[_i + 1]); \ - _swapped = true; \ - } \ - } \ -} while (0) - -#define per_cpu_sum(_p) \ -({ \ - TYPEOF_UNQUAL(*_p) _ret = 0; \ - \ - int cpu; \ - for_each_possible_cpu(cpu) \ - _ret += *per_cpu_ptr(_p, cpu); \ - _ret; \ -}) - -static inline u64 percpu_u64_get(u64 __percpu *src) -{ - return per_cpu_sum(src); -} - -static inline void percpu_u64_set(u64 __percpu *dst, u64 src) -{ - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(dst, cpu) = 0; - this_cpu_write(*dst, src); -} - -static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) -{ - for (unsigned i = 0; i < nr; i++) - acc[i] += src[i]; -} - -static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, - unsigned nr) -{ - int cpu; - - for_each_possible_cpu(cpu) - acc_u64s(acc, per_cpu_ptr(src, cpu), nr); -} - -static inline void percpu_memset(void __percpu *p, int c, size_t bytes) -{ - int cpu; - - for_each_possible_cpu(cpu) - memset(per_cpu_ptr(p, cpu), c, bytes); -} - -u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); - -static inline int u8_cmp(u8 l, u8 r) -{ - return cmp_int(l, r); -} - -static inline int cmp_le32(__le32 l, __le32 r) -{ - return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); -} - -#include - -static inline bool qstr_eq(const struct qstr l, const struct qstr r) -{ - return l.len == r.len && !memcmp(l.name, r.name, l.len); -} - -void bch2_darray_str_exit(darray_const_str *); -int bch2_split_devs(const char *, darray_const_str *); - -#ifdef __KERNEL__ - -__must_check -static inline int copy_to_user_errcode(void __user *to, const void *from, 
unsigned long n) -{ - return copy_to_user(to, from, n) ? -EFAULT : 0; -} - -__must_check -static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n) -{ - return copy_from_user(to, from, n) ? -EFAULT : 0; -} - -#endif - -static inline void mod_bit(long nr, volatile unsigned long *addr, bool v) -{ - if (v) - set_bit(nr, addr); - else - clear_bit(nr, addr); -} - -static inline void __set_bit_le64(size_t bit, __le64 *addr) -{ - addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); -} - -static inline void __clear_bit_le64(size_t bit, __le64 *addr) -{ - addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64)); -} - -static inline bool test_bit_le64(size_t bit, __le64 *addr) -{ - return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; -} - -static inline void memcpy_swab(void *_dst, void *_src, size_t len) -{ - u8 *dst = _dst + len; - u8 *src = _src; - - while (len--) - *--dst = *src++; -} - -#define set_flags(_map, _in, _out) \ -do { \ - unsigned _i; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & (1 << _i)) \ - (_out) |= _map[_i]; \ - else \ - (_out) &= ~_map[_i]; \ -} while (0) - -#define map_flags(_map, _in) \ -({ \ - unsigned _out = 0; \ - \ - set_flags(_map, _in, _out); \ - _out; \ -}) - -#define map_flags_rev(_map, _in) \ -({ \ - unsigned _i, _out = 0; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & _map[_i]) { \ - (_out) |= 1 << _i; \ - (_in) &= ~_map[_i]; \ - } \ - (_out); \ -}) - -#define map_defined(_map) \ -({ \ - unsigned _in = ~0; \ - \ - map_flags_rev(_map, _in); \ -}) - -#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c deleted file mode 100644 index 6620ecae26af3a..00000000000000 --- a/fs/bcachefs/varint.c +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include - -#ifdef CONFIG_VALGRIND -#include -#endif - -#include "errcode.h" -#include "varint.h" - -/** - * bch2_varint_encode - encode a variable length integer - * @out: destination to encode to - * @v: unsigned integer to encode - * Returns: size in bytes of the encoded integer - at most 9 bytes - */ -int bch2_varint_encode(u8 *out, u64 v) -{ - unsigned bits = fls64(v|1); - unsigned bytes = DIV_ROUND_UP(bits, 7); - __le64 v_le; - - if (likely(bytes < 9)) { - v <<= bytes; - v |= ~(~0 << (bytes - 1)); - v_le = cpu_to_le64(v); - memcpy(out, &v_le, bytes); - } else { - *out++ = 255; - bytes = 9; - put_unaligned_le64(v, out); - } - - return bytes; -} - -/** - * bch2_varint_decode - encode a variable length integer - * @in: varint to decode - * @end: end of buffer to decode from - * @out: on success, decoded integer - * Returns: size in bytes of the decoded integer - or -1 on failure (would - * have read past the end of the buffer) - */ -int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) -{ - unsigned bytes = likely(in < end) - ? ffz(*in & 255) + 1 - : 1; - u64 v; - - if (unlikely(in + bytes > end)) - return -BCH_ERR_varint_decode_error; - - if (likely(bytes < 9)) { - __le64 v_le = 0; - - memcpy(&v_le, in, bytes); - v = le64_to_cpu(v_le); - v >>= bytes; - } else { - v = get_unaligned_le64(++in); - } - - *out = v; - return bytes; -} - -/** - * bch2_varint_encode_fast - fast version of bch2_varint_encode - * @out: destination to encode to - * @v: unsigned integer to encode - * Returns: size in bytes of the encoded integer - at most 9 bytes - * - * This version assumes it's always safe to write 8 bytes to @out, even if the - * encoded integer would be smaller. 
- */ -int bch2_varint_encode_fast(u8 *out, u64 v) -{ - unsigned bits = fls64(v|1); - unsigned bytes = DIV_ROUND_UP(bits, 7); - - if (likely(bytes < 9)) { - v <<= bytes; - v |= ~(~0U << (bytes - 1)); - } else { - *out++ = 255; - bytes = 9; - } - - put_unaligned_le64(v, out); - return bytes; -} - -/** - * bch2_varint_decode_fast - fast version of bch2_varint_decode - * @in: varint to decode - * @end: end of buffer to decode from - * @out: on success, decoded integer - * Returns: size in bytes of the decoded integer - or -1 on failure (would - * have read past the end of the buffer) - * - * This version assumes that it is safe to read at most 8 bytes past the end of - * @end (we still return an error if the varint extends past @end). - */ -int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) -{ -#ifdef CONFIG_VALGRIND - VALGRIND_MAKE_MEM_DEFINED(in, 8); -#endif - u64 v = get_unaligned_le64(in); - unsigned bytes = ffz(*in) + 1; - - if (unlikely(in + bytes > end)) - return -BCH_ERR_varint_decode_error; - - if (likely(bytes < 9)) { - v >>= bytes; - v &= ~(~0ULL << (7 * bytes)); - } else { - v = get_unaligned_le64(++in); - } - - *out = v; - return bytes; -} diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h deleted file mode 100644 index 92a182fb3d7aed..00000000000000 --- a/fs/bcachefs/varint.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_VARINT_H -#define _BCACHEFS_VARINT_H - -int bch2_varint_encode(u8 *, u64); -int bch2_varint_decode(const u8 *, const u8 *, u64 *); - -int bch2_varint_encode_fast(u8 *, u64); -int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); - -#endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h deleted file mode 100644 index 2ad338e282da82..00000000000000 --- a/fs/bcachefs/vstructs.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _VSTRUCTS_H -#define _VSTRUCTS_H - -#include "util.h" - -/* - * NOTE: we can't differentiate between __le64 and u64 with type_is - this - * assumes u64 is little endian: - */ -#define __vstruct_u64s(_s) \ -({ \ - ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ - : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ - : type_is((_s)->u64s, u16) ? 
le16_to_cpu((__force __le16) (_s)->u64s) \ - : ((__force u8) ((_s)->u64s))); \ -}) - -#define __vstruct_bytes(_type, _u64s) \ -({ \ - BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ - \ - (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ -}) - -#define vstruct_bytes(_s) \ - __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) - -#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ - (round_up(__vstruct_bytes(_type, _u64s), \ - 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) - -#define vstruct_blocks(_s, _sector_block_bits) \ - __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) - -#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ - __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ - __vstruct_u64s(_s) + (_u64s)) - -#define vstruct_sectors(_s, _sector_block_bits) \ - (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) - -#define vstruct_next(_s) \ - ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) -#define vstruct_last(_s) \ - ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) -#define vstruct_end(_s) \ - ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) - -#define vstruct_for_each(_s, _i) \ - for (typeof(&(_s)->start[0]) _i = (_s)->start; \ - _i < vstruct_last(_s); \ - _i = vstruct_next(_i)) - -#define vstruct_for_each_safe(_s, _i) \ - for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \ - _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \ - _i = _next) - -#define vstruct_idx(_s, _idx) \ - ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) - -#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c deleted file mode 100644 index 627f153798c679..00000000000000 --- a/fs/bcachefs/xattr.c +++ /dev/null @@ -1,642 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "acl.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "extents.h" -#include "fs.h" -#include "rebalance.h" -#include "str_hash.h" -#include "xattr.h" - -#include -#include -#include - -static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); - -static u64 bch2_xattr_hash(const struct bch_hash_info *info, - const struct xattr_search_key *key) -{ - struct bch_str_hash_ctx ctx; - - bch2_str_hash_init(&ctx, info); - bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); - bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); - - return bch2_str_hash_end(&ctx, info); -} - -static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) -{ - return bch2_xattr_hash(info, key); -} - -static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -{ - struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); - - return bch2_xattr_hash(info, - &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len)); -} - -static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) -{ - struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); - const struct xattr_search_key *r = _r; - - return l.v->x_type != r->type || - l.v->x_name_len != r->name.len || - memcmp(l.v->x_name_and_value, r->name.name, r->name.len); -} - -static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -{ - struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); - struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); - - return l.v->x_type != r.v->x_type || - l.v->x_name_len != r.v->x_name_len || - memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len); -} - -const struct bch_hash_desc bch2_xattr_hash_desc = { - .btree_id = 
BTREE_ID_xattrs, - .key_type = KEY_TYPE_xattr, - .hash_key = xattr_hash_key, - .hash_bkey = xattr_hash_bkey, - .cmp_key = xattr_cmp_key, - .cmp_bkey = xattr_cmp_bkey, -}; - -int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len)); - int ret = 0; - - bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, - c, xattr_val_size_too_small, - "value too small (%zu < %u)", - bkey_val_u64s(k.k), val_u64s); - - /* XXX why +4 ? */ - val_u64s = xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len) + 4); - - bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, - c, xattr_val_size_too_big, - "value too big (%zu > %u)", - bkey_val_u64s(k.k), val_u64s); - - bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), - c, xattr_invalid_type, - "invalid type (%u)", xattr.v->x_type); - - bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len), - c, xattr_name_invalid_chars, - "xattr name has invalid characters"); -fsck_err: - return ret; -} - -void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct xattr_handler *handler; - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - - handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (handler && handler->prefix) - prt_printf(out, "%s", handler->prefix); - else if (handler) - prt_printf(out, "(type %u)", xattr.v->x_type); - else - prt_printf(out, "(unknown type %u)", xattr.v->x_type); - - unsigned name_len = xattr.v->x_name_len; - unsigned val_len = le16_to_cpu(xattr.v->x_val_len); - unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - - offsetof(struct bch_xattr, x_name_and_value); - - val_len = min_t(int, val_len, max_name_val_bytes - name_len); - name_len = min(name_len, max_name_val_bytes); - - prt_printf(out, "%.*s:%.*s", - name_len, xattr.v->x_name_and_value, - val_len, (char *) xattr_val(xattr.v)); - - if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || - xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) { - prt_char(out, ' '); - bch2_acl_to_text(out, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - } -} - -static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, - const char *name, void *buffer, size_t size, int type) -{ - struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); - struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode_inum(inode), &search, 0); - int ret = bkey_err(k); - if (ret) - return ret; - - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - ret = le16_to_cpu(xattr.v->x_val_len); - if (buffer) { - if (ret > size) - ret = -ERANGE; - else - memcpy(buffer, xattr_val(xattr.v), ret); - } - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, - const char *name, const void *value, size_t size, - int type, int flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter inode_iter = {}; - int ret; - - ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: - bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); - if (ret) - return ret; - - /* - * Besides the ctime update, extents, dirents and xattrs 
updates require - * that an inode update also happens - to ensure that if a key exists in - * one of those btrees with a given snapshot ID an inode is also present - */ - inode_u->bi_ctime = bch2_current_time(c); - - ret = bch2_inode_write(trans, &inode_iter, inode_u); - bch2_trans_iter_exit(trans, &inode_iter); - - if (ret) - return ret; - - if (value) { - struct bkey_i_xattr *xattr; - unsigned namelen = strlen(name); - unsigned u64s = BKEY_U64s + - xattr_val_u64s(namelen, size); - - if (u64s > U8_MAX) - return -ERANGE; - - xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(xattr)) - return PTR_ERR(xattr); - - bkey_xattr_init(&xattr->k_i); - xattr->k.u64s = u64s; - xattr->v.x_type = type; - xattr->v.x_name_len = namelen; - xattr->v.x_val_len = cpu_to_le16(size); - memcpy(xattr->v.x_name_and_value, name, namelen); - memcpy(xattr_val(&xattr->v), value, size); - - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, - inum, &xattr->k_i, - (flags & XATTR_CREATE ? STR_HASH_must_create : 0)| - (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0)); - } else { - struct xattr_search_key search = - X_SEARCH(type, name, strlen(name)); - - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, - hash_info, inum, &search); - } - - if (bch2_err_matches(ret, ENOENT)) - ret = flags & XATTR_REPLACE ? -ENODATA : 0; - - return ret; -} - -struct xattr_buf { - char *buf; - size_t len; - size_t used; -}; - -static int __bch2_xattr_emit(const char *prefix, - const char *name, size_t name_len, - struct xattr_buf *buf) -{ - const size_t prefix_len = strlen(prefix); - const size_t total_len = prefix_len + name_len + 1; - - if (buf->buf) { - if (buf->used + total_len > buf->len) - return -ERANGE; - - memcpy(buf->buf + buf->used, prefix, prefix_len); - memcpy(buf->buf + buf->used + prefix_len, - name, name_len); - buf->buf[buf->used + prefix_len + name_len] = '\0'; - } - - buf->used += total_len; - return 0; -} - -static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry) -{ - const struct xattr_handler *handler = bch2_xattr_type_to_handler(type); - - if (!xattr_handler_can_list(handler, dentry)) - return NULL; - - return xattr_prefix(handler); -} - -static int bch2_xattr_emit(struct dentry *dentry, - const struct bch_xattr *xattr, - struct xattr_buf *buf) -{ - const char *prefix; - - prefix = bch2_xattr_prefix(xattr->x_type, dentry); - if (!prefix) - return 0; - - return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf); -} - -static int bch2_xattr_list_bcachefs(struct bch_fs *c, - struct bch_inode_unpacked *inode, - struct xattr_buf *buf, - bool all) -{ - const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; - unsigned id; - int ret = 0; - u64 v; - - for (id = 0; id < Inode_opt_nr; id++) { - v = bch2_inode_opt_get(inode, id); - if (!v) - continue; - - if (!all && - !(inode->bi_fields_set & (1 << id))) - continue; - - ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], - strlen(bch2_inode_opts[id]), buf); - if (ret) - break; - } - - return ret; -} - -ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - struct bch_fs *c = dentry->d_sb->s_fs_info; - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; - u64 offset = 0, inum = inode->ei_inode.bi_inum; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, - POS(inum, offset), - POS(inum, U64_MAX), - inode->ei_inum.subvol, 0, k, ({ - if (k.k->type != KEY_TYPE_xattr) - continue; - - bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); - }))) ?: - bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?: - bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - - return ret ? bch2_err_class(ret) : buf.used; -} - -static int bch2_xattr_get_handler(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_trans_do(c, - bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); - - if (ret < 0 && bch2_err_matches(ret, ENOENT)) - ret = -ENODATA; - - return bch2_err_class(ret); -} - -static int bch2_xattr_set_handler(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *vinode, - const char *name, const void *value, - size_t size, int flags) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - struct bch_inode_unpacked inode_u; - int ret; - - ret = bch2_trans_run(c, - commit_do(trans, NULL, NULL, 0, - bch2_xattr_set(trans, inode_inum(inode), &inode_u, - &hash, name, value, size, - handler->flags, flags)) ?: - (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0)); - - return bch2_err_class(ret); -} - -static const struct xattr_handler bch_xattr_user_handler = { - .prefix = XATTR_USER_PREFIX, - .get = bch2_xattr_get_handler, - .set = bch2_xattr_set_handler, - .flags = KEY_TYPE_XATTR_INDEX_USER, -}; - -static bool bch2_xattr_trusted_list(struct dentry *dentry) -{ - return capable(CAP_SYS_ADMIN); -} - -static const struct xattr_handler bch_xattr_trusted_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .list = bch2_xattr_trusted_list, - .get = bch2_xattr_get_handler, - .set = bch2_xattr_set_handler, - .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, -}; - -static const struct xattr_handler bch_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = bch2_xattr_get_handler, - .set = bch2_xattr_set_handler, - .flags = KEY_TYPE_XATTR_INDEX_SECURITY, -}; - -#ifndef NO_BCACHEFS_FS - -static int opt_to_inode_opt(int id) -{ - switch (id) { -#define x(name, ...) 
\ - case Opt_##name: return Inode_opt_##name; - BCH_INODE_OPTS() -#undef x - default: - return -1; - } -} - -static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size, - bool all) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_opts opts = - bch2_inode_opts_to_opts(&inode->ei_inode); - const struct bch_option *opt; - int id, inode_opt_id; - struct printbuf out = PRINTBUF; - int ret; - u64 v; - - id = bch2_opt_lookup(name); - if (id < 0 || !bch2_opt_is_inode_opt(id)) - return -EINVAL; - - inode_opt_id = opt_to_inode_opt(id); - if (inode_opt_id < 0) - return -EINVAL; - - opt = bch2_opt_table + id; - - if (!bch2_opt_defined_by_id(&opts, id)) - return -ENODATA; - - if (!all && - !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) - return -ENODATA; - - v = bch2_opt_get_by_id(&opts, id); - bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); - - ret = out.pos; - - if (out.allocation_failure) { - ret = -ENOMEM; - } else if (buffer) { - if (out.pos > size) - ret = -ERANGE; - else - memcpy(buffer, out.buf, out.pos); - } - - printbuf_exit(&out); - return ret; -} - -static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) -{ - return __bch2_xattr_bcachefs_get(handler, dentry, vinode, - name, buffer, size, false); -} - -struct inode_opt_set { - int id; - u64 v; - bool defined; -}; - -static int inode_opt_set_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct inode_opt_set *s = p; - - if (s->id == Inode_opt_casefold) { - int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v); - if (ret) - return ret; - } - - if (s->defined) - bi->bi_fields_set |= 1U << s->id; - else - bi->bi_fields_set &= ~(1U << s->id); - - bch2_inode_opt_set(bi, s->id, s->v); - - return 0; -} - -static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *vinode, - const char *name, const void *value, - size_t size, int flags) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - const struct bch_option *opt; - char *buf; - struct inode_opt_set s; - int opt_id, inode_opt_id, ret; - - opt_id = bch2_opt_lookup(name); - if (opt_id < 0) - return -EINVAL; - - opt = bch2_opt_table + opt_id; - - inode_opt_id = opt_to_inode_opt(opt_id); - if (inode_opt_id < 0) - return -EINVAL; - - s.id = inode_opt_id; - - if (value) { - u64 v = 0; - - buf = kmalloc(size + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - memcpy(buf, value, size); - buf[size] = '\0'; - - ret = bch2_opt_parse(c, opt, buf, &v, NULL); - kfree(buf); - - if (ret < 0) - goto err_class_exit; - - ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v); - if (ret < 0) - goto err_class_exit; - - s.v = v + 1; - s.defined = true; - } else { - /* - * Check if this option was set on the parent - if so, switched - * back to inheriting from the parent: - * - * rename() also has to deal with keeping inherited options up - * to date - see bch2_reinherit_attrs() - */ - spin_lock(&dentry->d_lock); - if (!IS_ROOT(dentry)) { - struct bch_inode_info *dir = - to_bch_ei(d_inode(dentry->d_parent)); - - s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); - } else { - s.v = 0; - } - spin_unlock(&dentry->d_lock); - - 
s.defined = false; - } - - mutex_lock(&inode->ei_update_lock); - if (inode_opt_id == Inode_opt_project) { - /* - * inode fields accessible via the xattr interface are stored - * with a +1 bias, so that 0 means unset: - */ - ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0); - if (ret) - goto err; - } - - ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); -err: - mutex_unlock(&inode->ei_update_lock); -err_class_exit: - return bch2_err_class(ret); -} - -static const struct xattr_handler bch_xattr_bcachefs_handler = { - .prefix = "bcachefs.", - .get = bch2_xattr_bcachefs_get, - .set = bch2_xattr_bcachefs_set, -}; - -static int bch2_xattr_bcachefs_get_effective( - const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) -{ - return __bch2_xattr_bcachefs_get(handler, dentry, vinode, - name, buffer, size, true); -} - -/* Noop - xattrs in the bcachefs_effective namespace are inherited */ -static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *vinode, - const char *name, const void *value, - size_t size, int flags) -{ - return 0; -} - -static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { - .prefix = "bcachefs_effective.", - .get = bch2_xattr_bcachefs_get_effective, - .set = bch2_xattr_bcachefs_set_effective, -}; - -#endif /* NO_BCACHEFS_FS */ - -const struct xattr_handler * const bch2_xattr_handlers[] = { - &bch_xattr_user_handler, - &bch_xattr_trusted_handler, - &bch_xattr_security_handler, -#ifndef NO_BCACHEFS_FS - &bch_xattr_bcachefs_handler, - &bch_xattr_bcachefs_effective_handler, -#endif - NULL -}; - -static const struct xattr_handler *bch_xattr_handler_map[] = { - [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, - [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = - &nop_posix_acl_access, - [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &nop_posix_acl_default, - [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, - [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, -}; - -static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) -{ - return type < ARRAY_SIZE(bch_xattr_handler_map) - ? 
bch_xattr_handler_map[type] - : NULL; -} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h deleted file mode 100644 index 1139bf345f7093..00000000000000 --- a/fs/bcachefs/xattr.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_XATTR_H -#define _BCACHEFS_XATTR_H - -#include "str_hash.h" - -extern const struct bch_hash_desc bch2_xattr_hash_desc; - -int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_xattr ((struct bkey_ops) { \ - .key_validate = bch2_xattr_validate, \ - .val_to_text = bch2_xattr_to_text, \ - .min_val_size = 8, \ -}) - -static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) -{ - return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) + - name_len + val_len, sizeof(u64)); -} - -#define xattr_val(_xattr) \ - ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len) - -struct xattr_search_key { - u8 type; - struct qstr name; -}; - -#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ - { .type = _type, .name = QSTR_INIT(_name, _len) }) - -struct dentry; -struct xattr_handler; -struct bch_hash_info; -struct bch_inode_info; - -/* Exported for cmd_migrate.c in tools: */ -int bch2_xattr_set(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, const struct bch_hash_info *, - const char *, const void *, size_t, int, int); - -ssize_t bch2_xattr_list(struct dentry *, char *, size_t); - -extern const struct xattr_handler * const bch2_xattr_handlers[]; - -#endif /* _BCACHEFS_XATTR_H */ diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h deleted file mode 100644 index 4121b78d9a92a5..00000000000000 --- a/fs/bcachefs/xattr_format.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_XATTR_FORMAT_H -#define _BCACHEFS_XATTR_FORMAT_H - -#define KEY_TYPE_XATTR_INDEX_USER 0 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -#define KEY_TYPE_XATTR_INDEX_SECURITY 4 - -struct bch_xattr { - struct bch_val v; - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; - /* - * x_name contains the name and value counted by - * x_name_len + x_val_len. The introduction of - * __counted_by(x_name_len) previously caused a false positive - * detection of an out of bounds write. - */ - __u8 x_name_and_value[]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_XATTR_FORMAT_H */ From 74662f9f92b67c0ca55139c5aa392da0f0a26c08 Mon Sep 17 00:00:00 2001 From: Steven 'Steve' Kendall Date: Mon, 29 Sep 2025 21:33:34 +0000 Subject: [PATCH 3139/3206] ALSA: hda/hdmi: Add pin fix for HP ProDesk model The HP ProDesk 400 (SSID 103c:83f3) also needs a quirk for enabling HDMI outputs. This patch adds the required quirk entry. 
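To make the matching mechanism concrete, here is a small self-contained model
of how such an SSID quirk table is consulted. The struct and helper below are
simplified stand-ins for the kernel's snd_pci_quirk / snd_pci_quirk_lookup()
(which additionally supports masked matches), not the actual hdmi.c code:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct snd_pci_quirk / SND_PCI_QUIRK(). */
struct ssid_quirk {
	uint16_t subvendor;	/* PCI subsystem vendor ID, e.g. 0x103c for HP */
	uint16_t subdevice;	/* PCI subsystem device ID (the SSID) */
	const char *name;
	int value;		/* 1 = force-enable the HDMI pins */
};

static const struct ssid_quirk force_connect[] = {
	{ 0x103c, 0x83e2, "HP EliteDesk 800 G4", 1 },
	{ 0x103c, 0x83f3, "HP ProDesk 400", 1 },
	{ 0, 0, NULL, 0 }	/* terminator */
};

/* Walk the table and return the first entry matching the device's SSID. */
static const struct ssid_quirk *quirk_lookup(uint16_t sv, uint16_t sd)
{
	const struct ssid_quirk *q;

	for (q = force_connect; q->name; q++)
		if (q->subvendor == sv && q->subdevice == sd)
			return q;
	return NULL;
}

int main(void)
{
	const struct ssid_quirk *q = quirk_lookup(0x103c, 0x83f3);

	if (q && q->value)
		printf("forcing HDMI pin connectivity on \"%s\"\n", q->name);
	return 0;
}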
Signed-off-by: Steven 'Steve' Kendall
Cc:
Signed-off-by: Takashi Iwai
---
 sound/hda/codecs/hdmi/hdmi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/hda/codecs/hdmi/hdmi.c b/sound/hda/codecs/hdmi/hdmi.c
index dc38bfd9dba598..111c9b5335afcc 100644
--- a/sound/hda/codecs/hdmi/hdmi.c
+++ b/sound/hda/codecs/hdmi/hdmi.c
@@ -1549,6 +1549,7 @@ static const struct snd_pci_quirk force_connect_list[] = {
 	SND_PCI_QUIRK(0x103c, 0x83e2, "HP EliteDesk 800 G4", 1),
 	SND_PCI_QUIRK(0x103c, 0x83ef, "HP MP9 G4 Retail System AMS", 1),
 	SND_PCI_QUIRK(0x103c, 0x845a, "HP EliteDesk 800 G4 DM 65W", 1),
+	SND_PCI_QUIRK(0x103c, 0x83f3, "HP ProDesk 400", 1),
 	SND_PCI_QUIRK(0x103c, 0x870f, "HP", 1),
 	SND_PCI_QUIRK(0x103c, 0x871a, "HP", 1),
 	SND_PCI_QUIRK(0x103c, 0x8711, "HP", 1),

From 81a2c31257411296862487aaade98b7d9e25dc72 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei
Date: Mon, 7 Jul 2025 18:31:20 +0300
Subject: [PATCH 3140/3206] mfd: simple-mfd-i2c: Add compatible strings for
 Layerscape QIXIS FPGA

The QIXIS FPGA found on Layerscape boards such as the LX2160AQDS and
LS1028AQDS handles power-on-reset timing, muxing, etc. Use simple-mfd-i2c
as its core driver by adding its compatible strings (already found in
some dt files). By using the simple-mfd-i2c driver, any child device will
have access to the i2c regmap created by it.

Signed-off-by: Ioana Ciornei
Link: https://lore.kernel.org/r/20250707153120.1371719-1-ioana.ciornei@nxp.com
Signed-off-by: Lee Jones
---
 drivers/mfd/simple-mfd-i2c.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/mfd/simple-mfd-i2c.c b/drivers/mfd/simple-mfd-i2c.c
index 22159913bea034..f7798bd922224e 100644
--- a/drivers/mfd/simple-mfd-i2c.c
+++ b/drivers/mfd/simple-mfd-i2c.c
@@ -99,6 +99,8 @@ static const struct of_device_id simple_mfd_i2c_of_match[] = {
 	{ .compatible = "maxim,max5970", .data = &maxim_max5970},
 	{ .compatible = "maxim,max5978", .data = &maxim_max5970},
 	{ .compatible = "maxim,max77705-battery", .data = &maxim_mon_max77705},
+	{ .compatible = "fsl,lx2160aqds-fpga" },
+	{ .compatible = "fsl,ls1028aqds-fpga" },
 	{}
 };
 MODULE_DEVICE_TABLE(of, simple_mfd_i2c_of_match);

From 0b9483bf7f319468bd37351c112c8f0bfe242028 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nuno=20S=C3=A1?=
Date: Thu, 17 Jul 2025 15:51:34 +0100
Subject: [PATCH 3141/3206] mfd: adp5585: Drop useless return statement
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In adp5585_reset_ev_parse(), when parsing the adi,reset-pulse-width-us
property, we were returning in case it was found and valid. No point in
doing that, as we'll be returning anyway after exiting the property
scope. And it could actually lead to bugs if new properties happen to be
added after this one.
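To see why the dropped statement is harmful rather than merely redundant,
consider this userspace mock of the parsing flow; the helper is a
hypothetical stand-in for the device-property API, not the actual adp5585
code:

#include <stdio.h>

/* Hypothetical stand-in for a device_property_read_u32()-style helper. */
static int mock_read_u32(const char *prop, unsigned int *val)
{
	*val = 100;	/* pretend the property exists and is valid */
	return 0;
}

static int parse_reset_events(void)
{
	unsigned int val;

	if (!mock_read_u32("adi,reset-pulse-width-us", &val)) {
		printf("applying pulse width %u\n", val);
		/*
		 * Returning here, as the removed statement did, is useless
		 * today and would silently skip any property parsed below
		 * as soon as one is added.
		 */
	}

	/* parsing of later-added properties continues here */
	printf("parsed remaining properties\n");
	return 0;
}

int main(void)
{
	return parse_reset_events();
}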
Reported-by: Dan Carpenter
Closes: https://lore.kernel.org/linux-gpio/c85604d9e077511b8aa6ee0786579594cc0103d4.camel@gmail.com/T/#ma25557bd06ccd2531dc9c85ba6be74af781b81aa
Signed-off-by: Nuno Sá
Link: https://lore.kernel.org/r/20250717-adp5585-drop-ret-v1-1-2ae65bd780aa@analog.com
Signed-off-by: Lee Jones
---
 drivers/mfd/adp5585.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/mfd/adp5585.c b/drivers/mfd/adp5585.c
index 58f7cebe2ea4f2..46b3ce3d7bae89 100644
--- a/drivers/mfd/adp5585.c
+++ b/drivers/mfd/adp5585.c
@@ -432,7 +432,6 @@ static int adp5585_reset_ev_parse(struct adp5585_dev *adp5585)
 				      "Invalid value(%u) for adi,reset-pulse-width-us\n",
 				      prop_val);
 		}
-		return ret;
 	}
 
 	return 0;

From ba2b3de78fe63cd3bb169f98c7d92fa09c79ca16 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Thu, 24 Jul 2025 12:14:59 +0100
Subject: [PATCH 3142/3206] mfd: Kconfig: Fix spelling mistake "infontainment"
 -> "infotainment"

There is a spelling mistake in the MFD_TIMBERDALE description. Fix it.

Signed-off-by: Colin Ian King
Link: https://lore.kernel.org/r/20250724111459.141633-1-colin.i.king@gmail.com
Signed-off-by: Lee Jones
---
 drivers/mfd/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index f8e64cfa7d5dc8..4b04adeb32071c 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -2003,7 +2003,7 @@ config MFD_TIMBERDALE
 	  multifunction device which exposes numerous platform devices.
 
 	  The timberdale FPGA can be found on the Intel Atom development board
-	  for in-vehicle infontainment, called Russellville.
+	  for in-vehicle infotainment, called Russellville.
 
 config MFD_TC3589X
 	bool "Toshiba TC35892 and variants"

From 57bf2a312ab2d0bc8ee0f4e8a447fa94a2fc877d Mon Sep 17 00:00:00 2001
From: Alexander Stein
Date: Fri, 25 Jul 2025 09:07:48 +0200
Subject: [PATCH 3143/3206] mfd: stmpe: Remove IRQ domain upon removal

The IRQ domain is (optionally) added during stmpe_probe(), but never
removed. Add the missing irq_domain_remove() call in stmpe_remove().

Signed-off-by: Alexander Stein
Link: https://lore.kernel.org/r/20250725070752.338376-1-alexander.stein@ew.tq-group.com
Signed-off-by: Lee Jones
---
 drivers/mfd/stmpe.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/mfd/stmpe.c b/drivers/mfd/stmpe.c
index 819d19dc9b4a91..e1165f63aedae2 100644
--- a/drivers/mfd/stmpe.c
+++ b/drivers/mfd/stmpe.c
@@ -1485,6 +1485,9 @@ int stmpe_probe(struct stmpe_client_info *ci, enum stmpe_partnum partnum)
 
 void stmpe_remove(struct stmpe *stmpe)
 {
+	if (stmpe->domain)
+		irq_domain_remove(stmpe->domain);
+
 	if (!IS_ERR(stmpe->vio) && regulator_is_enabled(stmpe->vio))
 		regulator_disable(stmpe->vio);
 	if (!IS_ERR(stmpe->vcc) && regulator_is_enabled(stmpe->vcc))

From 57b1fec0be8590278d81786eba11eb74252f784a Mon Sep 17 00:00:00 2001
From: Alexander Stein
Date: Fri, 25 Jul 2025 09:07:49 +0200
Subject: [PATCH 3144/3206] mfd: stmpe-spi: Use module_spi_driver to remove
 boilerplate

The driver implements the functionality of module_spi_driver() manually.
Replace the open-coded registration with that macro instead.
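For reference, module_spi_driver() generates roughly the init/exit pair that
the patch deletes; sketched from the generic module_driver() helper, the
expansion is approximately:

/* Approximate expansion of module_spi_driver(stmpe_spi_driver): */
static int __init stmpe_spi_driver_init(void)
{
	return spi_register_driver(&stmpe_spi_driver);
}
module_init(stmpe_spi_driver_init);

static void __exit stmpe_spi_driver_exit(void)
{
	spi_unregister_driver(&stmpe_spi_driver);
}
module_exit(stmpe_spi_driver_exit);

One nuance visible in the diff below: the old code registered at
subsys_initcall() time, while the macro registers through a regular
module_init() (device-level initcall when built in), which is normally fine
for a driver like this.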
Signed-off-by: Alexander Stein
Link: https://lore.kernel.org/r/20250725070752.338376-2-alexander.stein@ew.tq-group.com
Signed-off-by: Lee Jones
---
 drivers/mfd/stmpe-spi.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/drivers/mfd/stmpe-spi.c b/drivers/mfd/stmpe-spi.c
index b9cc85ea2c4019..7fee64102cae7a 100644
--- a/drivers/mfd/stmpe-spi.c
+++ b/drivers/mfd/stmpe-spi.c
@@ -141,18 +141,7 @@ static struct spi_driver stmpe_spi_driver = {
 	.remove = stmpe_spi_remove,
 	.id_table = stmpe_spi_id,
 };
-
-static int __init stmpe_init(void)
-{
-	return spi_register_driver(&stmpe_spi_driver);
-}
-subsys_initcall(stmpe_init);
-
-static void __exit stmpe_exit(void)
-{
-	spi_unregister_driver(&stmpe_spi_driver);
-}
-module_exit(stmpe_exit);
+module_spi_driver(stmpe_spi_driver);
 
 MODULE_DESCRIPTION("STMPE MFD SPI Interface Driver");
 MODULE_AUTHOR("Viresh Kumar ");

From 557b09699b06f88ad2cb64747e4e8b6fc6e7141b Mon Sep 17 00:00:00 2001
From: Alexander Stein
Date: Fri, 25 Jul 2025 09:07:50 +0200
Subject: [PATCH 3145/3206] mfd: stmpe-i2c: Use module_i2c_driver to remove
 boilerplate

The driver implements the functionality of module_i2c_driver() manually.
Replace the open-coded registration with that macro instead.

Signed-off-by: Alexander Stein
Link: https://lore.kernel.org/r/20250725070752.338376-3-alexander.stein@ew.tq-group.com
Signed-off-by: Lee Jones
---
 drivers/mfd/stmpe-i2c.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/drivers/mfd/stmpe-i2c.c b/drivers/mfd/stmpe-i2c.c
index fe018bedab9837..145836320c1707 100644
--- a/drivers/mfd/stmpe-i2c.c
+++ b/drivers/mfd/stmpe-i2c.c
@@ -122,18 +122,7 @@ static struct i2c_driver stmpe_i2c_driver = {
 	.remove = stmpe_i2c_remove,
 	.id_table = stmpe_i2c_id,
 };
-
-static int __init stmpe_init(void)
-{
-	return i2c_add_driver(&stmpe_i2c_driver);
-}
-subsys_initcall(stmpe_init);
-
-static void __exit stmpe_exit(void)
-{
-	i2c_del_driver(&stmpe_i2c_driver);
-}
-module_exit(stmpe_exit);
+module_i2c_driver(stmpe_i2c_driver);
 
 MODULE_DESCRIPTION("STMPE MFD I2C Interface Driver");
 MODULE_AUTHOR("Rabin Vincent ");

From d00c9120414c3d908c0507fe26dd3b401a02c30a Mon Sep 17 00:00:00 2001
From: Alexander Stein
Date: Fri, 25 Jul 2025 09:11:49 +0200
Subject: [PATCH 3146/3206] mfd: stmpe-spi: Add missing MODULE_LICENSE

This driver is licensed GPL-2.0-only, so add the corresponding
MODULE_LICENSE() tag.

Signed-off-by: Alexander Stein
Link: https://lore.kernel.org/r/20250725071153.338912-2-alexander.stein@ew.tq-group.com
Signed-off-by: Lee Jones
---
 drivers/mfd/stmpe-spi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/mfd/stmpe-spi.c b/drivers/mfd/stmpe-spi.c
index 7fee64102cae7a..dea31efface6ec 100644
--- a/drivers/mfd/stmpe-spi.c
+++ b/drivers/mfd/stmpe-spi.c
@@ -145,3 +145,4 @@ module_spi_driver(stmpe_spi_driver);
 
 MODULE_DESCRIPTION("STMPE MFD SPI Interface Driver");
 MODULE_AUTHOR("Viresh Kumar ");
+MODULE_LICENSE("GPL");

From 00ea54f058cd4cb082302fe598cfe148e0aadf94 Mon Sep 17 00:00:00 2001
From: Alexander Stein
Date: Fri, 25 Jul 2025 09:11:50 +0200
Subject: [PATCH 3147/3206] mfd: stmpe-i2c: Add missing MODULE_LICENSE

This driver is licensed GPL-2.0-only, so add the corresponding
MODULE_LICENSE() tag.
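The tag matters beyond documentation: without MODULE_LICENSE() the module
loader assumes a proprietary license, taints the kernel on load, and refuses
to resolve EXPORT_SYMBOL_GPL() symbols. A minimal sketch of a complete module
trailer (generic placeholder names, not the stmpe code) shows where it sits:

/* Generic module trailer; names are placeholders. */
MODULE_DESCRIPTION("Example bus interface driver");
MODULE_AUTHOR("Jane Developer");
MODULE_LICENSE("GPL");	/* per include/linux/module.h, "GPL" denotes GPL v2 */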
Signed-off-by: Alexander Stein
Link: https://lore.kernel.org/r/20250725071153.338912-3-alexander.stein@ew.tq-group.com
Signed-off-by: Lee Jones
---
 drivers/mfd/stmpe-i2c.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/mfd/stmpe-i2c.c b/drivers/mfd/stmpe-i2c.c
index 145836320c1707..943fa363efc35a 100644
--- a/drivers/mfd/stmpe-i2c.c
+++ b/drivers/mfd/stmpe-i2c.c
@@ -126,3 +126,4 @@ module_i2c_driver(stmpe_i2c_driver);
 
 MODULE_DESCRIPTION("STMPE MFD I2C Interface Driver");
 MODULE_AUTHOR("Rabin Vincent ");
+MODULE_LICENSE("GPL");

From 9d602da0c9919f36d55b051951c1fd4f3dc24593 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner
Date: Wed, 30 Jul 2025 19:34:22 +0200
Subject: [PATCH 3148/3206] dt-bindings: mfd: qnap,ts433-mcu: Add
 qnap,ts233-mcu compatible

The same MCU is used on other devices of the series with a slightly
different set of features, like the number of LEDs. Add a compatible
string for the MCU used in the TS233 variant.

Signed-off-by: Heiko Stuebner
Acked-by: "Rob Herring (Arm)"
Link: https://lore.kernel.org/r/20250730173423.1878599-2-heiko@sntech.de
Signed-off-by: Lee Jones
---
 Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml b/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml
index 877078ac172f1d..fe9e06cfb93348 100644
--- a/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml
+++ b/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml
@@ -16,6 +16,7 @@
 properties:
   compatible:
     enum:
+      - qnap,ts233-mcu
       - qnap,ts433-mcu
 
 patternProperties:

From 865417d5652d9e6de021c558e38fe94db0a778ea Mon Sep 17 00:00:00 2001
From: Heiko Stuebner
Date: Wed, 30 Jul 2025 19:34:23 +0200
Subject: [PATCH 3149/3206] mfd: qnap-mcu: Add driver data for TS233 variant

Add the TS233 compatible and affiliated driver data to qnap-mcu.
The TS233 is mostly similar to the TS433, except that it has none of
the PCIe components, so there are only 2 drives. The fan PWM limits
from the vendor configuration are also the same as for the TS433
variant.

Signed-off-by: Heiko Stuebner
Link: https://lore.kernel.org/r/20250730173423.1878599-3-heiko@sntech.de
Signed-off-by: Lee Jones
---
 drivers/mfd/qnap-mcu.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/mfd/qnap-mcu.c b/drivers/mfd/qnap-mcu.c
index 89a8a1913d42dd..47d435c93dbb88 100644
--- a/drivers/mfd/qnap-mcu.c
+++ b/drivers/mfd/qnap-mcu.c
@@ -247,6 +247,14 @@ static int qnap_mcu_power_off(struct sys_off_data *data)
 	return NOTIFY_DONE;
 }
 
+static const struct qnap_mcu_variant qnap_ts233_mcu = {
+	.baud_rate = 115200,
+	.num_drives = 2,
+	.fan_pwm_min = 51,	/* Specified in original model.conf */
+	.fan_pwm_max = 255,
+	.usb_led = true,
+};
+
 static const struct qnap_mcu_variant qnap_ts433_mcu = {
 	.baud_rate = 115200,
 	.num_drives = 4,
@@ -319,6 +327,7 @@ static int qnap_mcu_probe(struct serdev_device *serdev)
 }
 
 static const struct of_device_id qnap_mcu_dt_ids[] = {
+	{ .compatible = "qnap,ts233-mcu", .data = &qnap_ts233_mcu },
 	{ .compatible = "qnap,ts433-mcu", .data = &qnap_ts433_mcu },
 	{ /* sentinel */ }
 };

From 597b398bc27c84f3998c2169ad7ce74e8404533a Mon Sep 17 00:00:00 2001
From: Heiko Stuebner
Date: Wed, 30 Jul 2025 19:22:47 +0200
Subject: [PATCH 3150/3206] dt-bindings: mfd: qnap,ts433-mcu: Allow
 nvmem-layout child node

The MCU has an EEPROM connected internally that, for example, contains
MAC addresses for the SoC GMAC controllers. Therefore allow defining
the nvmem-layout for the EEPROM.
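On the consumer side, a hedged sketch of how such a layout-provided cell is
typically picked up (illustrative only, not code from this series): an
ethernet driver resolves the MAC through the standard OF helper, which falls
back to an nvmem cell named "mac-address" when no DT property carries one.

#include <linux/etherdevice.h>
#include <linux/of_net.h>

/* Illustrative consumer; @np would be the GMAC controller's DT node. */
static int example_get_mac(struct device_node *np, u8 addr[ETH_ALEN])
{
	int ret = of_get_mac_address(np, addr);

	if (ret)	/* may be -EPROBE_DEFER until the MCU's EEPROM is up */
		return ret;

	return is_valid_ether_addr(addr) ? 0 : -EINVAL;
}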
Signed-off-by: Heiko Stuebner
Acked-by: "Rob Herring (Arm)"
Link: https://lore.kernel.org/r/20250730172248.1875122-2-heiko@sntech.de
Signed-off-by: Lee Jones
---
 Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml b/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml
index fe9e06cfb93348..5454d9403cad79 100644
--- a/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml
+++ b/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml
@@ -19,6 +19,9 @@ properties:
       - qnap,ts233-mcu
       - qnap,ts433-mcu
 
+  nvmem-layout:
+    $ref: /schemas/nvmem/layouts/nvmem-layout.yaml
+
 patternProperties:
   "^fan-[0-9]+$":
     $ref: /schemas/hwmon/fan-common.yaml#

From 309e65d151ab9be1e7b01d822880cd8c4e611dff Mon Sep 17 00:00:00 2001
From: "Heijligen, Thomas"
Date: Thu, 31 Jul 2025 14:45:00 +0000
Subject: [PATCH 3151/3206] mfd: kempld: Switch back to earlier ->init()
 behavior

Commit 9e36775c22c7 ("mfd: kempld: Remove custom DMI matching code")
removes the ability to load the driver if no matching system DMI data
is found. Before this commit the driver could be loaded using
alternative methods such as ACPI or `force_device_id` in the absence
of a matching system DMI entry. Restore this ability while keeping the
refactored `platform_device_info` table.

Signed-off-by: Thomas Heijligen
Reviewed-by: Andy Shevchenko
Link: https://lore.kernel.org/r/7d2c7e92253d851194a781720051536cca2722b8.camel@secunet.com
Signed-off-by: Lee Jones
---
 drivers/mfd/kempld-core.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/drivers/mfd/kempld-core.c b/drivers/mfd/kempld-core.c
index c5bfb6440a930f..77980c7fc31f9f 100644
--- a/drivers/mfd/kempld-core.c
+++ b/drivers/mfd/kempld-core.c
@@ -779,22 +779,26 @@ MODULE_DEVICE_TABLE(dmi, kempld_dmi_table);
 static int __init kempld_init(void)
 {
 	const struct dmi_system_id *id;
-	int ret = -ENODEV;
 
-	for (id = dmi_first_match(kempld_dmi_table); id; id = dmi_first_match(id + 1)) {
-		/* Check, if user asked for the exact device ID match */
-		if (force_device_id[0] && !strstr(id->ident, force_device_id))
-			continue;
-
-		ret = kempld_create_platform_device(&kempld_platform_data_generic);
-		if (ret)
-			continue;
-
-		break;
+	/*
+	 * This custom DMI iteration allows the driver to be initialized in three ways:
+	 * - When a force_device_id string matches any ident in the kempld_dmi_table,
+	 *   regardless of whether the DMI device is present in the system DMI table.
+	 * - When a matching entry is present in the system DMI table.
+	 * - Through alternative mechanisms like ACPI.
+ */ + if (force_device_id[0]) { + for (id = kempld_dmi_table; id->matches[0].slot != DMI_NONE; id++) + if (strstr(id->ident, force_device_id)) + if (!kempld_create_platform_device(&kempld_platform_data_generic)) + break; + if (id->matches[0].slot == DMI_NONE) + return -ENODEV; + } else { + for (id = dmi_first_match(kempld_dmi_table); id; id = dmi_first_match(id+1)) + if (kempld_create_platform_device(&kempld_platform_data_generic)) + break; } - if (ret) - return ret; - return platform_driver_register(&kempld_driver); } From 5e1c88679174e4bfe5d152060b06d370bd85de80 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 4 Aug 2025 15:07:23 +0200 Subject: [PATCH 3152/3206] mfd: qnap-mcu: Include linux/types.h in qnap-mcu.h shared header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Relying on other components to include those basic types is unreliable and may cause compile errors like: ../include/linux/mfd/qnap-mcu.h:13:9: error: unknown type name ‘u32’ 13 | u32 baud_rate; | ^~~ ../include/linux/mfd/qnap-mcu.h:17:9: error: unknown type name ‘bool’ 17 | bool usb_led; | ^~~~ So make sure the types used in the header are available. Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250804130726.3180806-2-heiko@sntech.de Signed-off-by: Lee Jones --- include/linux/mfd/qnap-mcu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mfd/qnap-mcu.h b/include/linux/mfd/qnap-mcu.h index 8d48c212fd4446..42bf523f9a5b0c 100644 --- a/include/linux/mfd/qnap-mcu.h +++ b/include/linux/mfd/qnap-mcu.h @@ -7,6 +7,8 @@ #ifndef _LINUX_QNAP_MCU_H_ #define _LINUX_QNAP_MCU_H_ +#include <linux/types.h> + struct qnap_mcu; struct qnap_mcu_variant { From bf2de43060d528e52e372c63182a94b95c80d305 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 4 Aug 2025 15:07:24 +0200 Subject: [PATCH 3153/3206] mfd: qnap-mcu: Handle errors returned from qnap_mcu_write qnap_mcu_write() can return errors and those were not checked before. So do that now. Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250804130726.3180806-3-heiko@sntech.de Signed-off-by: Lee Jones --- drivers/mfd/qnap-mcu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/qnap-mcu.c b/drivers/mfd/qnap-mcu.c index 47d435c93dbb88..e794efb6ecb197 100644 --- a/drivers/mfd/qnap-mcu.c +++ b/drivers/mfd/qnap-mcu.c @@ -163,7 +163,11 @@ int qnap_mcu_exec(struct qnap_mcu *mcu, reply->received = 0; reinit_completion(&reply->done); - qnap_mcu_write(mcu, cmd_data, cmd_data_size); + ret = qnap_mcu_write(mcu, cmd_data, cmd_data_size); + if (ret < 0) { + mutex_unlock(&mcu->bus_lock); + return ret; + } serdev_device_wait_until_sent(mcu->serdev, msecs_to_jiffies(QNAP_MCU_TIMEOUT_MS)); From 21c5ffb4211203de0a91de8a1c4a329e508a2ffb Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 4 Aug 2025 15:07:25 +0200 Subject: [PATCH 3154/3206] mfd: qnap-mcu: Convert to guard(mutex) in qnap_mcu_exec guard() makes sure that the mutex gets unlocked when the function returns and thus removes the need for unlock gotos or similar mechanisms and therefore allows for a simpler function structure. So convert the qnap_mcu_exec function to use it.
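For reference, the resulting pattern in a minimal, self-contained sketch (the lock and helper names are made up for illustration):

    #include <linux/cleanup.h>
    #include <linux/mutex.h>

    static DEFINE_MUTEX(example_lock);

    static int example_op(bool fail)
    {
            guard(mutex)(&example_lock); /* unlocked on every return path */

            if (fail)
                    return -EINVAL; /* no unlock label or goto needed */

            return 0;
    }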
Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250804130726.3180806-4-heiko@sntech.de Signed-off-by: Lee Jones --- drivers/mfd/qnap-mcu.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/mfd/qnap-mcu.c b/drivers/mfd/qnap-mcu.c index e794efb6ecb197..6875f27151e9c4 100644 --- a/drivers/mfd/qnap-mcu.c +++ b/drivers/mfd/qnap-mcu.c @@ -156,7 +156,7 @@ int qnap_mcu_exec(struct qnap_mcu *mcu, return -EINVAL; } - mutex_lock(&mcu->bus_lock); + guard(mutex)(&mcu->bus_lock); reply->data = rx; reply->length = length; @@ -164,30 +164,27 @@ int qnap_mcu_exec(struct qnap_mcu *mcu, reinit_completion(&reply->done); ret = qnap_mcu_write(mcu, cmd_data, cmd_data_size); - if (ret < 0) { - mutex_unlock(&mcu->bus_lock); + if (ret < 0) return ret; - } serdev_device_wait_until_sent(mcu->serdev, msecs_to_jiffies(QNAP_MCU_TIMEOUT_MS)); if (!wait_for_completion_timeout(&reply->done, msecs_to_jiffies(QNAP_MCU_TIMEOUT_MS))) { dev_err(&mcu->serdev->dev, "Command timeout\n"); - ret = -ETIMEDOUT; + return -ETIMEDOUT; } else { u8 crc = qnap_mcu_csum(rx, reply_data_size); if (crc != rx[reply_data_size]) { dev_err(&mcu->serdev->dev, "Invalid Checksum received\n"); - ret = -EIO; + return -EIO; } else { memcpy(reply_data, rx, reply_data_size); } } - mutex_unlock(&mcu->bus_lock); - return ret; + return 0; } EXPORT_SYMBOL_GPL(qnap_mcu_exec); From 71f529e9fedc377b43b73ec9d28ec59d9a3a2792 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 4 Aug 2025 15:07:26 +0200 Subject: [PATCH 3155/3206] mfd: qnap-mcu: Improve structure in qnap_mcu_exec Now with guard(mutex) in place, we can make the function's structure a bit easier to read, by removing the nested if-else-clauses. Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250804130726.3180806-5-heiko@sntech.de Signed-off-by: Lee Jones --- drivers/mfd/qnap-mcu.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/mfd/qnap-mcu.c b/drivers/mfd/qnap-mcu.c index 6875f27151e9c4..4ec1f4cf902f60 100644 --- a/drivers/mfd/qnap-mcu.c +++ b/drivers/mfd/qnap-mcu.c @@ -150,6 +150,7 @@ int qnap_mcu_exec(struct qnap_mcu *mcu, size_t length = reply_data_size + QNAP_MCU_CHECKSUM_SIZE; struct qnap_mcu_reply *reply = &mcu->reply; int ret = 0; + u8 crc; if (length > sizeof(rx)) { dev_err(&mcu->serdev->dev, "expected data too big for receive buffer"); @@ -172,18 +173,16 @@ int qnap_mcu_exec(struct qnap_mcu *mcu, if (!wait_for_completion_timeout(&reply->done, msecs_to_jiffies(QNAP_MCU_TIMEOUT_MS))) { dev_err(&mcu->serdev->dev, "Command timeout\n"); return -ETIMEDOUT; - } else { - u8 crc = qnap_mcu_csum(rx, reply_data_size); - - if (crc != rx[reply_data_size]) { - dev_err(&mcu->serdev->dev, - "Invalid Checksum received\n"); - return -EIO; - } else { - memcpy(reply_data, rx, reply_data_size); - } } + crc = qnap_mcu_csum(rx, reply_data_size); + if (crc != rx[reply_data_size]) { + dev_err(&mcu->serdev->dev, "Invalid Checksum received\n"); + return -EIO; + } + + memcpy(reply_data, rx, reply_data_size); + return 0; } EXPORT_SYMBOL_GPL(qnap_mcu_exec); From 64e0d839c589f4f2ecd2e3e5bdb5cee6ba6bade9 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 4 Aug 2025 15:32:40 +0200 Subject: [PATCH 3156/3206] mfd: intel_soc_pmic_chtdc_ti: Set use_single_read regmap_config flag Testing has shown that reading multiple registers at once (for 10-bit ADC values) does not work. Set the use_single_read regmap_config flag to make regmap split these for us. 
This should fix temperature opregion accesses done by drivers/acpi/pmic/intel_pmic_chtdc_ti.c and is also necessary for the upcoming drivers for the ADC and battery MFD cells. Fixes: 6bac0606fdba ("mfd: Add support for Cherry Trail Dollar Cove TI PMIC") Cc: stable@vger.kernel.org Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20250804133240.312383-1-hansg@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/intel_soc_pmic_chtdc_ti.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mfd/intel_soc_pmic_chtdc_ti.c b/drivers/mfd/intel_soc_pmic_chtdc_ti.c index 4c1a68c9f5750f..6daf33e07ea0a8 100644 --- a/drivers/mfd/intel_soc_pmic_chtdc_ti.c +++ b/drivers/mfd/intel_soc_pmic_chtdc_ti.c @@ -82,6 +82,8 @@ static const struct regmap_config chtdc_ti_regmap_config = { .reg_bits = 8, .val_bits = 8, .max_register = 0xff, + /* The hardware does not support reading multiple registers at once */ + .use_single_read = true, }; static const struct regmap_irq chtdc_ti_irqs[] = { From 9ac4890ac39352ccea132109e32911495574c3ec Mon Sep 17 00:00:00 2001 From: Jens Kehne Date: Mon, 4 Aug 2025 15:37:54 +0200 Subject: [PATCH 3157/3206] mfd: da9063: Split chip variant reading into two bus transactions We observed the initial probe of the da9063 failing in da9063_get_device_type in about 30% of boots on a Xilinx ZynqMP-based board. The problem originates in da9063_i2c_blockreg_read, which uses a single bus transaction to turn the register page and then read a register. On the bus, this should translate to a write to register 0, followed by a read from the target register, separated by a repeated start. However, we found that after the write to register 0, the controller sometimes continues directly with the register address of the read request, without sending the chip address or a repeated start in between, which makes the read request invalid. To fix this, separate turning the page and reading the register into two separate transactions. This brings the initialization code in line with the rest of the driver, which uses register maps (which to my knowledge do not use repeated starts after turning the page). This has been included in our kernel for several months and was recently included in a shipped product. For us, it reliably fixes the issue, and we have not observed any new issues. While the underlying problem is probably with the i2c controller or its driver, I still propose a change here in the interest of robustness: First, I'm not sure this issue can be fixed on the controller side, since there are other issues related to repeated start which can't be fixed (AR# 60695, AR# 61664). Second, similar problems might exist with other controllers.
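For reference, the difference between the two approaches, sketched with the generic i2c_msg API (buffers, lengths and error handling are simplified placeholders, not the actual driver code):

    /* One transaction: page select and read joined by a repeated start */
    struct i2c_msg one_xfer[2] = {
            { .addr = client->addr, .flags = 0, .len = 2, .buf = page_sel_buf },
            { .addr = client->addr, .flags = I2C_M_RD, .len = 1, .buf = &val },
    };
    ret = i2c_transfer(client->adapter, one_xfer, 2);

    /* Two transactions: the page select ends with a STOP on the bus */
    struct i2c_msg page_sel = {
            .addr = client->addr, .flags = 0, .len = 2, .buf = page_sel_buf,
    };
    ret = i2c_transfer(client->adapter, &page_sel, 1);
    /* ...check ret, then issue the register read as a separate transfer */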
Signed-off-by: Jens Kehne Link: https://lore.kernel.org/r/20250804133754.3496718-1-jens.kehne@agilent.com Signed-off-by: Lee Jones --- drivers/mfd/da9063-i2c.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/mfd/da9063-i2c.c b/drivers/mfd/da9063-i2c.c index c6235cd0dbdc40..1ec9ab56442dfc 100644 --- a/drivers/mfd/da9063-i2c.c +++ b/drivers/mfd/da9063-i2c.c @@ -37,9 +37,13 @@ enum da9063_page_sel_buf_fmt { DA9063_PAGE_SEL_BUF_SIZE, }; +enum da9063_page_sel_msgs { + DA9063_PAGE_SEL_MSG = 0, + DA9063_PAGE_SEL_CNT, +}; + enum da9063_paged_read_msgs { - DA9063_PAGED_READ_MSG_PAGE_SEL = 0, - DA9063_PAGED_READ_MSG_REG_SEL, + DA9063_PAGED_READ_MSG_REG_SEL = 0, DA9063_PAGED_READ_MSG_DATA, DA9063_PAGED_READ_MSG_CNT, }; @@ -65,10 +69,21 @@ static int da9063_i2c_blockreg_read(struct i2c_client *client, u16 addr, (page_num << DA9063_I2C_PAGE_SEL_SHIFT) & DA9063_REG_PAGE_MASK; /* Write reg address, page selection */ - xfer[DA9063_PAGED_READ_MSG_PAGE_SEL].addr = client->addr; - xfer[DA9063_PAGED_READ_MSG_PAGE_SEL].flags = 0; - xfer[DA9063_PAGED_READ_MSG_PAGE_SEL].len = DA9063_PAGE_SEL_BUF_SIZE; - xfer[DA9063_PAGED_READ_MSG_PAGE_SEL].buf = page_sel_buf; + xfer[DA9063_PAGE_SEL_MSG].addr = client->addr; + xfer[DA9063_PAGE_SEL_MSG].flags = 0; + xfer[DA9063_PAGE_SEL_MSG].len = DA9063_PAGE_SEL_BUF_SIZE; + xfer[DA9063_PAGE_SEL_MSG].buf = page_sel_buf; + + ret = i2c_transfer(client->adapter, xfer, DA9063_PAGE_SEL_CNT); + if (ret < 0) { + dev_err(&client->dev, "Page switch failed: %d\n", ret); + return ret; + } + + if (ret != DA9063_PAGE_SEL_CNT) { + dev_err(&client->dev, "Page switch failed to complete\n"); + return -EIO; + } /* Select register address */ xfer[DA9063_PAGED_READ_MSG_REG_SEL].addr = client->addr; From 99767a0c8bb4e206f6cea37f5162073d1899168c Mon Sep 17 00:00:00 2001 From: Waqar Hameed Date: Tue, 5 Aug 2025 11:33:34 +0200 Subject: [PATCH 3158/3206] mfd: macsmc: Remove error prints for devm_add_action_or_reset() When `devm_add_action_or_reset()` fails, it is due to a failed memory allocation and will thus return `-ENOMEM`. `dev_err_probe()` doesn't do anything when error is `-ENOMEM`. Therefore, remove the useless call to `dev_err_probe()` when `devm_add_action_or_reset()` fails, and just return the value instead. 
Signed-off-by: Waqar Hameed Reviewed-by: Sven Peter Link: https://lore.kernel.org/r/pnd8qjym7td.a.out@axis.com Signed-off-by: Lee Jones --- drivers/mfd/macsmc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/macsmc.c b/drivers/mfd/macsmc.c index 870c8b2028a8fc..c8575c9c9f4be8 100644 --- a/drivers/mfd/macsmc.c +++ b/drivers/mfd/macsmc.c @@ -429,7 +429,7 @@ static int apple_smc_probe(struct platform_device *pdev) ret = devm_add_action_or_reset(dev, apple_smc_rtkit_shutdown, smc); if (ret) - return dev_err_probe(dev, ret, "Failed to register rtkit shutdown action"); + return ret; ret = apple_rtkit_start_ep(smc->rtk, SMC_ENDPOINT); if (ret) @@ -465,7 +465,7 @@ static int apple_smc_probe(struct platform_device *pdev) apple_smc_write_flag(smc, SMC_KEY(NTAP), true); ret = devm_add_action_or_reset(dev, apple_smc_disable_notifications, smc); if (ret) - return dev_err_probe(dev, ret, "Failed to register notification disable action"); + return ret; ret = devm_mfd_add_devices(smc->dev, PLATFORM_DEVID_NONE, apple_smc_devs, ARRAY_SIZE(apple_smc_devs), From 364752aa0c6ab0a06a2d5bfdb362c1ca407f1a30 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Aug 2025 09:19:28 +0200 Subject: [PATCH 3159/3206] mfd: madera: Work around false-positive -Wuninitialized warning clang-21 warns about one uninitialized variable getting dereferenced in madera_dev_init: drivers/mfd/madera-core.c:739:10: error: variable 'mfd_devs' is uninitialized when used here [-Werror,-Wuninitialized] 739 | mfd_devs, n_devs, | ^~~~~~~~ drivers/mfd/madera-core.c:459:33: note: initialize the variable 'mfd_devs' to silence this warning 459 | const struct mfd_cell *mfd_devs; | ^ | = NULL The code is actually correct here because n_devs is only nonzero when mfd_devs is a valid pointer, but this is impossible for the compiler to see reliably. Change the logic to check for the pointer as well, to make this easier for the compiler to follow. Signed-off-by: Arnd Bergmann Reviewed-by: Richard Fitzgerald Link: https://lore.kernel.org/r/20250807071932.4085458-1-arnd@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/madera-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/madera-core.c b/drivers/mfd/madera-core.c index bdbd5bfc971456..2f74a8c644a32a 100644 --- a/drivers/mfd/madera-core.c +++ b/drivers/mfd/madera-core.c @@ -456,7 +456,7 @@ int madera_dev_init(struct madera *madera) struct device *dev = madera->dev; unsigned int hwid; int (*patch_fn)(struct madera *) = NULL; - const struct mfd_cell *mfd_devs; + const struct mfd_cell *mfd_devs = NULL; int n_devs = 0; int i, ret; @@ -670,7 +670,7 @@ int madera_dev_init(struct madera *madera) goto err_reset; } - if (!n_devs) { + if (!n_devs || !mfd_devs) { dev_err(madera->dev, "Device ID 0x%x not a %s\n", hwid, madera->type_name); ret = -ENODEV; From 58091331b59eebc9bf725178be87021d20e4ec2d Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 7 Aug 2025 08:29:08 -0500 Subject: [PATCH 3160/3206] dt-bindings: mfd: aspeed-lpc: Add missing "clocks" property on lpc-snoop node The Aspeed lpc-snoop nodes have a "clocks" property which isn't documented. It looks like all the LPC child devices have the same clock source. Perhaps it is the parent device that should have the clock, but it's too late for that. The driver for lpc-snoop requires a clock to be present.
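For context, the consumer side in the snoop driver follows the usual pattern, roughly like this (a sketch of the common idiom, not the literal driver code):

    /* Probing fails when no clock is described, hence the binding update */
    struct clk *clk;

    clk = devm_clk_get_enabled(&pdev->dev, NULL);
    if (IS_ERR(clk))
            return dev_err_probe(&pdev->dev, PTR_ERR(clk),
                                 "couldn't get clock\n");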
Signed-off-by: "Rob Herring (Arm)" Acked-by: Andrew Jeffery Link: https://lore.kernel.org/r/20250807132909.3291770-1-robh@kernel.org Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/aspeed-lpc.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/aspeed-lpc.yaml b/Documentation/devicetree/bindings/mfd/aspeed-lpc.yaml index d88854e60b7f95..f329223cec071d 100644 --- a/Documentation/devicetree/bindings/mfd/aspeed-lpc.yaml +++ b/Documentation/devicetree/bindings/mfd/aspeed-lpc.yaml @@ -137,6 +137,9 @@ patternProperties: reg: maxItems: 1 + clocks: + maxItems: 1 + interrupts: maxItems: 1 From ec9b6f054d9de2bc7a713741980c84886a39cd7a Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 12 Aug 2025 17:31:04 +0800 Subject: [PATCH 3161/3206] mfd: kempld: Use PTR_ERR_OR_ZERO() to simplify code Use the standard error pointer macro to shorten and simplify the code. Signed-off-by: Xichao Zhao Link: https://lore.kernel.org/r/20250812093104.103193-1-zhao.xichao@vivo.com Signed-off-by: Lee Jones --- drivers/mfd/kempld-core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/mfd/kempld-core.c b/drivers/mfd/kempld-core.c index 77980c7fc31f9f..c2008d2dc95aa9 100644 --- a/drivers/mfd/kempld-core.c +++ b/drivers/mfd/kempld-core.c @@ -141,10 +141,8 @@ static int kempld_create_platform_device(const struct kempld_platform_data *pdat }; kempld_pdev = platform_device_register_full(&pdevinfo); - if (IS_ERR(kempld_pdev)) - return PTR_ERR(kempld_pdev); - return 0; + return PTR_ERR_OR_ZERO(kempld_pdev); } /** From c1c8ed81e200126b711ee5b012e1fa9a0da2cbe1 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 12 Aug 2025 15:33:18 -0500 Subject: [PATCH 3162/3206] dt-bindings: mfd: syscon: Add "marvell,armada-3700-usb2-host-device-misc" compatible Add the "marvell,armada-3700-usb2-host-device-misc" compatible, which is already in use. Signed-off-by: "Rob Herring (Arm)" Link: https://lore.kernel.org/r/20250812203319.729300-1-robh@kernel.org Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/syscon.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/syscon.yaml b/Documentation/devicetree/bindings/mfd/syscon.yaml index 27672adeb1fedb..1f381d159af81f 100644 --- a/Documentation/devicetree/bindings/mfd/syscon.yaml +++ b/Documentation/devicetree/bindings/mfd/syscon.yaml @@ -79,6 +79,7 @@ select: - marvell,armada-3700-cpu-misc - marvell,armada-3700-nb-pm - marvell,armada-3700-avs + - marvell,armada-3700-usb2-host-device-misc - marvell,armada-3700-usb2-host-misc - marvell,dove-global-config - mediatek,mt2701-pctl-a-syscfg @@ -185,6 +186,7 @@ properties: - marvell,armada-3700-cpu-misc - marvell,armada-3700-nb-pm - marvell,armada-3700-avs + - marvell,armada-3700-usb2-host-device-misc - marvell,armada-3700-usb2-host-misc - marvell,dove-global-config - mediatek,mt2701-pctl-a-syscfg From a5b03d81c23ef2aae8a88dc9da009a56b4bbb8f6 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Fri, 15 Aug 2025 18:06:01 +0800 Subject: [PATCH 3163/3206] mfd: max899x: Use dedicated interrupt wake setters Use enable_irq_wake() and disable_irq_wake() instead of calling low-level irq_set_irq_wake() with a parameter. No functional changes.
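The resulting suspend/resume pattern looks roughly like this (a generic sketch; the chip struct and field names are placeholders):

    static int example_suspend(struct device *dev)
    {
            struct example_chip *chip = dev_get_drvdata(dev);

            if (device_may_wakeup(dev))
                    enable_irq_wake(chip->irq); /* pairs with disable_irq_wake() */
            return 0;
    }

    static int example_resume(struct device *dev)
    {
            struct example_chip *chip = dev_get_drvdata(dev);

            if (device_may_wakeup(dev))
                    disable_irq_wake(chip->irq);
            return 0;
    }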
Signed-off-by: Xichao Zhao Link: https://lore.kernel.org/r/20250815100601.622923-1-zhao.xichao@vivo.com Signed-off-by: Lee Jones --- drivers/mfd/max8997.c | 4 ++-- drivers/mfd/max8998.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mfd/max8997.c b/drivers/mfd/max8997.c index ffe96b40368e12..7ba8ed1dfde3ea 100644 --- a/drivers/mfd/max8997.c +++ b/drivers/mfd/max8997.c @@ -438,7 +438,7 @@ static int max8997_suspend(struct device *dev) disable_irq(max8997->irq); if (device_may_wakeup(dev)) - irq_set_irq_wake(max8997->irq, 1); + enable_irq_wake(max8997->irq); return 0; } @@ -448,7 +448,7 @@ static int max8997_resume(struct device *dev) struct max8997_dev *max8997 = i2c_get_clientdata(i2c); if (device_may_wakeup(dev)) - irq_set_irq_wake(max8997->irq, 0); + disable_irq_wake(max8997->irq); enable_irq(max8997->irq); return max8997_irq_resume(max8997); } diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c index 6ba27171da28b7..eb13bbaeda5528 100644 --- a/drivers/mfd/max8998.c +++ b/drivers/mfd/max8998.c @@ -234,7 +234,7 @@ static int max8998_suspend(struct device *dev) struct max8998_dev *max8998 = i2c_get_clientdata(i2c); if (device_may_wakeup(dev)) - irq_set_irq_wake(max8998->irq, 1); + enable_irq_wake(max8998->irq); return 0; } @@ -244,7 +244,7 @@ static int max8998_resume(struct device *dev) struct max8998_dev *max8998 = i2c_get_clientdata(i2c); if (device_may_wakeup(dev)) - irq_set_irq_wake(max8998->irq, 0); + disable_irq_wake(max8998->irq); /* * In LP3974, if IRQ registers are not "read & clear" * when it's set during sleep, the interrupt becomes From 9c5ad8374b1fe0c8c9eaabc1146d96a543858c4a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Aug 2025 17:17:56 +0200 Subject: [PATCH 3164/3206] mfd: arizona: Make legacy gpiolib interface optional The only machine that still uses the old gpio number based interface is the wlf_cragg_6410 board file. In order to remove the dependency on the interfaces, add #ifdef blocks here. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250808151822.536879-13-arnd@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/arizona-irq.c | 5 ++++- include/linux/mfd/arizona/pdata.h | 6 ++++++ sound/soc/codecs/arizona-jack.c | 17 ++++++++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c index 3f8622ee0e59ca..544016d420fe2a 100644 --- a/drivers/mfd/arizona-irq.c +++ b/drivers/mfd/arizona-irq.c @@ -136,7 +136,7 @@ static irqreturn_t arizona_irq_thread(int irq, void *data) dev_err(arizona->dev, "Failed to read main IRQ status: %d\n", ret); } - +#ifdef CONFIG_GPIOLIB_LEGACY /* * Poll the IRQ pin status to see if we're really done * if the interrupt controller can't do it for us. 
@@ -150,6 +150,7 @@ static irqreturn_t arizona_irq_thread(int irq, void *data) !gpio_get_value_cansleep(arizona->pdata.irq_gpio)) { poll = true; } +#endif } while (poll); pm_runtime_put_autosuspend(arizona->dev); @@ -349,6 +350,7 @@ int arizona_irq_init(struct arizona *arizona) goto err_map_main_irq; } +#ifdef CONFIG_GPIOLIB_LEGACY /* Used to emulate edge trigger and to work around broken pinmux */ if (arizona->pdata.irq_gpio) { if (gpio_to_irq(arizona->pdata.irq_gpio) != arizona->irq) { @@ -368,6 +370,7 @@ int arizona_irq_init(struct arizona *arizona) arizona->pdata.irq_gpio = 0; } } +#endif ret = request_threaded_irq(arizona->irq, NULL, arizona_irq_thread, flags, "arizona", arizona); diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 2d13bbea4f3acb..f72e6d4b14a784 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -117,8 +117,10 @@ struct arizona_pdata { /** Check for line output with HPDET method */ bool hpdet_acc_id_line; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO used for mic isolation with HPDET */ int hpdet_id_gpio; +#endif /** Channel to use for headphone detection */ unsigned int hpdet_channel; @@ -129,8 +131,10 @@ struct arizona_pdata { /** Extra debounce timeout used during initial mic detection (ms) */ unsigned int micd_detect_debounce; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO for mic detection polarity */ int micd_pol_gpio; +#endif /** Mic detect ramp rate */ unsigned int micd_bias_start_time; @@ -184,8 +188,10 @@ struct arizona_pdata { /** Haptic actuator type */ unsigned int hap_act; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO for primary IRQ (used for edge triggered emulation) */ int irq_gpio; +#endif /** General purpose switch control */ unsigned int gpsw; diff --git a/sound/soc/codecs/arizona-jack.c b/sound/soc/codecs/arizona-jack.c index 22f9c431a0e550..6b55610ad535d6 100644 --- a/sound/soc/codecs/arizona-jack.c +++ b/sound/soc/codecs/arizona-jack.c @@ -461,7 +461,11 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, bool *mic) { struct arizona *arizona = info->arizona; +#ifdef CONFIG_GPIOLIB_LEGACY int id_gpio = arizona->pdata.hpdet_id_gpio; +#else + int id_gpio = 0; +#endif if (!arizona->pdata.hpdet_acc_id) return 0; @@ -472,6 +476,7 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, */ info->hpdet_res[info->num_hpdet_res++] = *reading; +#ifdef CONFIG_GPIOLIB_LEGACY /* Only check the mic directly if we didn't already ID it */ if (id_gpio && info->num_hpdet_res == 1) { dev_dbg(arizona->dev, "Measuring mic\n"); @@ -489,6 +494,7 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, ARIZONA_HP_POLL, ARIZONA_HP_POLL); return -EAGAIN; } +#endif /* OK, got both. Now, compare... 
*/ dev_dbg(arizona->dev, "HPDET measured %d %d\n", @@ -529,7 +535,9 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data) { struct arizona_priv *info = data; struct arizona *arizona = info->arizona; +#ifdef CONFIG_GPIOLIB_LEGACY int id_gpio = arizona->pdata.hpdet_id_gpio; +#endif int ret, reading, state, report; bool mic = false; @@ -585,8 +593,10 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data) arizona_extcon_hp_clamp(info, false); +#ifdef CONFIG_GPIOLIB_LEGACY if (id_gpio) gpio_set_value_cansleep(id_gpio, 0); +#endif /* If we have a mic then reenable MICDET */ if (state && (mic || info->mic)) @@ -1317,6 +1327,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) regmap_update_bits(arizona->regmap, ARIZONA_GP_SWITCH_1, ARIZONA_SW1_MODE_MASK, arizona->pdata.gpsw); +#ifdef CONFIG_GPIOLIB_LEGACY if (pdata->micd_pol_gpio > 0) { if (info->micd_modes[0].gpio) mode = GPIOF_OUT_INIT_HIGH; @@ -1332,7 +1343,9 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) } info->micd_pol_gpio = gpio_to_desc(pdata->micd_pol_gpio); - } else { + } else +#endif + { if (info->micd_modes[0].gpio) mode = GPIOD_OUT_HIGH; else @@ -1353,6 +1366,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) } } +#ifdef CONFIG_GPIOLIB_LEGACY if (arizona->pdata.hpdet_id_gpio > 0) { ret = devm_gpio_request_one(dev, arizona->pdata.hpdet_id_gpio, GPIOF_OUT_INIT_LOW, @@ -1364,6 +1378,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) return ret; } } +#endif return 0; } From fcbe47a83a41cfbf49eb96649acea38605aeaba6 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Aug 2025 18:14:55 +0200 Subject: [PATCH 3165/3206] mfd: Remove unneeded 'fast_io' parameter in regmap_config When using MMIO with regmap, fast_io is implied. No need to set it again. 
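A minimal MMIO regmap setup for reference (a sketch; the register widths are arbitrary and base is assumed to be an ioremapped region obtained elsewhere):

    static const struct regmap_config example_mmio_config = {
            .reg_bits = 32,
            .val_bits = 32,
            .reg_stride = 4,
            /* no .fast_io here: it is implied for MMIO regmaps */
    };

    map = devm_regmap_init_mmio(dev, base, &example_mmio_config);
    if (IS_ERR(map))
            return PTR_ERR(map);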
Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250813161517.4746-10-wsa+renesas@sang-engineering.com Signed-off-by: Lee Jones --- drivers/mfd/exynos-lpass.c | 1 - drivers/mfd/fsl-imx25-tsadc.c | 1 - drivers/mfd/stm32-lptimer.c | 1 - drivers/mfd/sun4i-gpadc.c | 1 - 4 files changed, 4 deletions(-) diff --git a/drivers/mfd/exynos-lpass.c b/drivers/mfd/exynos-lpass.c index 44797001a4322b..9bb2687c28355d 100644 --- a/drivers/mfd/exynos-lpass.c +++ b/drivers/mfd/exynos-lpass.c @@ -101,7 +101,6 @@ static const struct regmap_config exynos_lpass_reg_conf = { .reg_stride = 4, .val_bits = 32, .max_register = 0xfc, - .fast_io = true, }; static void exynos_lpass_disable_lpass(void *data) diff --git a/drivers/mfd/fsl-imx25-tsadc.c b/drivers/mfd/fsl-imx25-tsadc.c index 0aab6428e04232..467b1a23faeb24 100644 --- a/drivers/mfd/fsl-imx25-tsadc.c +++ b/drivers/mfd/fsl-imx25-tsadc.c @@ -17,7 +17,6 @@ #include static const struct regmap_config mx25_tsadc_regmap_config = { - .fast_io = true, .max_register = 8, .reg_bits = 32, .val_bits = 32, diff --git a/drivers/mfd/stm32-lptimer.c b/drivers/mfd/stm32-lptimer.c index 09073dbc9c8049..123659178cc2b6 100644 --- a/drivers/mfd/stm32-lptimer.c +++ b/drivers/mfd/stm32-lptimer.c @@ -19,7 +19,6 @@ static const struct regmap_config stm32_lptimer_regmap_cfg = { .val_bits = 32, .reg_stride = sizeof(u32), .max_register = STM32_LPTIM_MAX_REGISTER, - .fast_io = true, }; static int stm32_lptimer_detect_encoder(struct stm32_lptimer *ddata) diff --git a/drivers/mfd/sun4i-gpadc.c b/drivers/mfd/sun4i-gpadc.c index 3029d48e982cfd..bf2f6fdaf8bf9d 100644 --- a/drivers/mfd/sun4i-gpadc.c +++ b/drivers/mfd/sun4i-gpadc.c @@ -72,7 +72,6 @@ static const struct regmap_config sun4i_gpadc_regmap_config = { .reg_bits = 32, .val_bits = 32, .reg_stride = 4, - .fast_io = true, }; static const struct of_device_id sun4i_gpadc_of_match[] = { From 3d6a17fccc2832d8bc84420a634330941509d6e1 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 25 Aug 2025 10:12:02 +0200 Subject: [PATCH 3166/3206] dt-bindings: mfd: Move embedded controllers to own directory Move several embedded controller bindings (like ChromeOS EC, Gateworks System Controller and Kontron sl28cpld Board Management) to a new subdirectory, "embedded-controller", matching their purpose. An embedded controller (EC) is a discrete component that contains a microcontroller (i.e. a small CPU running small firmware without an operating system) mounted into a larger computer system running a fully fledged operating system that needs to utilize the embedded controller as part of its operation. So far the EC bindings were split between the "mfd" and "platform" directories. The MFD name comes from Linux, not from the hardware, and "platform" is a bit too generic. Rename the Gateworks GSC and Huawei Gaokun filenames to match their compatibles, as preferred for bindings.
Acked-by: Michael Walle # for sl28cpld Reviewed-by: Linus Walleij Signed-off-by: Krzysztof Kozlowski Acked-by: Mark Brown Reviewed-by: "Rob Herring (Arm)" Link: https://lore.kernel.org/r/20250825081201.9775-2-krzysztof.kozlowski@linaro.org Signed-off-by: Lee Jones --- .../{platform => embedded-controller}/acer,aspire1-ec.yaml | 2 +- .../{mfd => embedded-controller}/google,cros-ec.yaml | 2 +- .../gateworks-gsc.yaml => embedded-controller/gw,gsc.yaml} | 2 +- .../huawei,gaokun3-ec.yaml} | 2 +- .../{mfd => embedded-controller}/kontron,sl28cpld.yaml | 2 +- .../lenovo,yoga-c630-ec.yaml | 2 +- .../microsoft,surface-sam.yaml | 2 +- .../devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml | 2 +- .../devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml | 2 +- .../interrupt-controller/kontron,sl28cpld-intc.yaml | 2 +- .../devicetree/bindings/pwm/google,cros-ec-pwm.yaml | 2 +- .../devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml | 2 +- Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml | 4 ++-- .../devicetree/bindings/sound/google,cros-ec-codec.yaml | 2 +- .../devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml | 2 +- MAINTAINERS | 6 +++--- 16 files changed, 19 insertions(+), 19 deletions(-) rename Documentation/devicetree/bindings/{platform => embedded-controller}/acer,aspire1-ec.yaml (94%) rename Documentation/devicetree/bindings/{mfd => embedded-controller}/google,cros-ec.yaml (99%) rename Documentation/devicetree/bindings/{mfd/gateworks-gsc.yaml => embedded-controller/gw,gsc.yaml} (98%) rename Documentation/devicetree/bindings/{platform/huawei,gaokun-ec.yaml => embedded-controller/huawei,gaokun3-ec.yaml} (97%) rename Documentation/devicetree/bindings/{mfd => embedded-controller}/kontron,sl28cpld.yaml (97%) rename Documentation/devicetree/bindings/{platform => embedded-controller}/lenovo,yoga-c630-ec.yaml (95%) rename Documentation/devicetree/bindings/{platform => embedded-controller}/microsoft,surface-sam.yaml (92%) diff --git a/Documentation/devicetree/bindings/platform/acer,aspire1-ec.yaml b/Documentation/devicetree/bindings/embedded-controller/acer,aspire1-ec.yaml similarity index 94% rename from Documentation/devicetree/bindings/platform/acer,aspire1-ec.yaml rename to Documentation/devicetree/bindings/embedded-controller/acer,aspire1-ec.yaml index 7cb0134134ffa6..01ee61768527c8 100644 --- a/Documentation/devicetree/bindings/platform/acer,aspire1-ec.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/acer,aspire1-ec.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/platform/acer,aspire1-ec.yaml# +$id: http://devicetree.org/schemas/embedded-controller/acer,aspire1-ec.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Acer Aspire 1 Embedded Controller diff --git a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml b/Documentation/devicetree/bindings/embedded-controller/google,cros-ec.yaml similarity index 99% rename from Documentation/devicetree/bindings/mfd/google,cros-ec.yaml rename to Documentation/devicetree/bindings/embedded-controller/google,cros-ec.yaml index 50f45709006690..3ab5737c9a8f3f 100644 --- a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/google,cros-ec.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/mfd/google,cros-ec.yaml# +$id: http://devicetree.org/schemas/embedded-controller/google,cros-ec.yaml# 
$schema: http://devicetree.org/meta-schemas/core.yaml# title: ChromeOS Embedded Controller diff --git a/Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml b/Documentation/devicetree/bindings/embedded-controller/gw,gsc.yaml similarity index 98% rename from Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml rename to Documentation/devicetree/bindings/embedded-controller/gw,gsc.yaml index dc379f3ebf24ff..82d4b2dadbae4e 100644 --- a/Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/gw,gsc.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause %YAML 1.2 --- -$id: http://devicetree.org/schemas/mfd/gateworks-gsc.yaml# +$id: http://devicetree.org/schemas/embedded-controller/gw,gsc.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Gateworks System Controller diff --git a/Documentation/devicetree/bindings/platform/huawei,gaokun-ec.yaml b/Documentation/devicetree/bindings/embedded-controller/huawei,gaokun3-ec.yaml similarity index 97% rename from Documentation/devicetree/bindings/platform/huawei,gaokun-ec.yaml rename to Documentation/devicetree/bindings/embedded-controller/huawei,gaokun3-ec.yaml index 4a03b0ee314900..cd9e65b6c2ea35 100644 --- a/Documentation/devicetree/bindings/platform/huawei,gaokun-ec.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/huawei,gaokun3-ec.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/platform/huawei,gaokun-ec.yaml# +$id: http://devicetree.org/schemas/embedded-controller/huawei,gaokun3-ec.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Huawei Matebook E Go Embedded Controller diff --git a/Documentation/devicetree/bindings/mfd/kontron,sl28cpld.yaml b/Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml similarity index 97% rename from Documentation/devicetree/bindings/mfd/kontron,sl28cpld.yaml rename to Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml index 37207a97e06c69..0b752f3baaa946 100644 --- a/Documentation/devicetree/bindings/mfd/kontron,sl28cpld.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/mfd/kontron,sl28cpld.yaml# +$id: http://devicetree.org/schemas/embedded-controller/kontron,sl28cpld.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Kontron's sl28cpld board management controller diff --git a/Documentation/devicetree/bindings/platform/lenovo,yoga-c630-ec.yaml b/Documentation/devicetree/bindings/embedded-controller/lenovo,yoga-c630-ec.yaml similarity index 95% rename from Documentation/devicetree/bindings/platform/lenovo,yoga-c630-ec.yaml rename to Documentation/devicetree/bindings/embedded-controller/lenovo,yoga-c630-ec.yaml index 3180ce1a22d445..a029b38e8dc0b1 100644 --- a/Documentation/devicetree/bindings/platform/lenovo,yoga-c630-ec.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/lenovo,yoga-c630-ec.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/platform/lenovo,yoga-c630-ec.yaml# +$id: http://devicetree.org/schemas/embedded-controller/lenovo,yoga-c630-ec.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Lenovo Yoga C630 Embedded Controller. 
diff --git a/Documentation/devicetree/bindings/platform/microsoft,surface-sam.yaml b/Documentation/devicetree/bindings/embedded-controller/microsoft,surface-sam.yaml similarity index 92% rename from Documentation/devicetree/bindings/platform/microsoft,surface-sam.yaml rename to Documentation/devicetree/bindings/embedded-controller/microsoft,surface-sam.yaml index b33d26f15b2afa..9202cfca0b3518 100644 --- a/Documentation/devicetree/bindings/platform/microsoft,surface-sam.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/microsoft,surface-sam.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/platform/microsoft,surface-sam.yaml# +$id: http://devicetree.org/schemas/embedded-controller/microsoft,surface-sam.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Surface System Aggregator Module (SAM, SSAM) diff --git a/Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml b/Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml index b032471831e7c7..02663d67eac751 100644 --- a/Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml @@ -11,7 +11,7 @@ maintainers: description: | This module is part of the sl28cpld multi-function device. For more - details see ../mfd/kontron,sl28cpld.yaml. + details see ../embedded-controller/kontron,sl28cpld.yaml. There are three flavors of the GPIO controller, one full featured input/output with interrupt support (kontron,sl28cpld-gpio), one diff --git a/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml b/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml index 010333cb25c0e1..5803a1770cad68 100644 --- a/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml +++ b/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml @@ -11,7 +11,7 @@ maintainers: description: | This module is part of the sl28cpld multi-function device. For more - details see ../mfd/kontron,sl28cpld.yaml. + details see ../embedded-controller/kontron,sl28cpld.yaml. properties: compatible: diff --git a/Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml index e8dfa6507f64d3..87df07beda5926 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml @@ -11,7 +11,7 @@ maintainers: description: | This module is part of the sl28cpld multi-function device. For more - details see ../mfd/kontron,sl28cpld.yaml. + details see ../embedded-controller/kontron,sl28cpld.yaml. The following interrupts are available. All types and levels are fixed and handled by the board management controller. diff --git a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml index f7bc84b05a871b..8f5a468cfb91fb 100644 --- a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml @@ -14,7 +14,7 @@ description: | Google's ChromeOS EC PWM is a simple PWM attached to the Embedded Controller (EC) and controlled via a host-command interface. An EC PWM node should be only found as a sub-node of the EC node (see - Documentation/devicetree/bindings/mfd/google,cros-ec.yaml). 
+ Documentation/devicetree/bindings/embedded-controller/google,cros-ec.yaml). allOf: - $ref: pwm.yaml# diff --git a/Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml b/Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml index 981cfec53f3727..19a9d2e15a964f 100644 --- a/Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml @@ -11,7 +11,7 @@ maintainers: description: | This module is part of the sl28cpld multi-function device. For more - details see ../mfd/kontron,sl28cpld.yaml. + details see ../embedded-controller/kontron,sl28cpld.yaml. The controller supports one PWM channel and supports only four distinct frequencies (250Hz, 500Hz, 1kHz, 2kHz). diff --git a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml index adc6b3f36fde49..179c98b33b4d9f 100644 --- a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml @@ -58,7 +58,7 @@ properties: maxItems: 1 cros-ec-rpmsg: - $ref: /schemas/mfd/google,cros-ec.yaml + $ref: /schemas/embedded-controller/google,cros-ec.yaml description: This subnode represents the rpmsg device. The properties of this node are defined by the individual bindings for @@ -126,7 +126,7 @@ patternProperties: maxItems: 1 cros-ec-rpmsg: - $ref: /schemas/mfd/google,cros-ec.yaml + $ref: /schemas/embedded-controller/google,cros-ec.yaml description: This subnode represents the rpmsg device. The properties of this node are defined by the individual bindings for diff --git a/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml b/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml index 1434f443373892..dd51e8c5b8c233 100644 --- a/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml +++ b/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml @@ -15,7 +15,7 @@ description: | Embedded Controller (EC) and is controlled via a host-command interface. An EC codec node should only be found inside the "codecs" subnode of a cros-ec node. - (see Documentation/devicetree/bindings/mfd/google,cros-ec.yaml). + (see Documentation/devicetree/bindings/embedded-controller/google,cros-ec.yaml). allOf: - $ref: dai-common.yaml# diff --git a/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml b/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml index 179272f74de5fb..872a8471ef65fe 100644 --- a/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml @@ -11,7 +11,7 @@ maintainers: description: | This module is part of the sl28cpld multi-function device. For more - details see ../mfd/kontron,sl28cpld.yaml. + details see ../embedded-controller/kontron,sl28cpld.yaml. 
allOf: - $ref: watchdog.yaml# diff --git a/MAINTAINERS b/MAINTAINERS index cf2f09782499b2..6e3961562d29cf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10128,7 +10128,7 @@ F: drivers/media/i2c/gc2145.c GATEWORKS SYSTEM CONTROLLER (GSC) DRIVER M: Tim Harvey S: Maintained -F: Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml +F: Documentation/devicetree/bindings/embedded-controller/gw,gsc.yaml F: Documentation/hwmon/gsc-hwmon.rst F: drivers/hwmon/gsc-hwmon.c F: drivers/mfd/gateworks-gsc.c @@ -11299,7 +11299,7 @@ F: drivers/net/ethernet/huawei/hinic3/ HUAWEI MATEBOOK E GO EMBEDDED CONTROLLER DRIVER M: Pengyu Luo S: Maintained -F: Documentation/devicetree/bindings/platform/huawei,gaokun-ec.yaml +F: Documentation/devicetree/bindings/embedded-controller/huawei,gaokun3-ec.yaml F: drivers/platform/arm64/huawei-gaokun-ec.c F: drivers/power/supply/huawei-gaokun-battery.c F: drivers/usb/typec/ucsi/ucsi_huawei_gaokun.c @@ -23210,10 +23210,10 @@ F: drivers/usb/misc/sisusbvga/ SL28 CPLD MFD DRIVER M: Michael Walle S: Maintained +F: Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml F: Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml F: Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml F: Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml -F: Documentation/devicetree/bindings/mfd/kontron,sl28cpld.yaml F: Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml F: Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml F: drivers/gpio/gpio-sl28cpld.c From e399d779c9acf277488c5b306b71dcbc71e160ca Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Aug 2025 17:17:57 +0200 Subject: [PATCH 3167/3206] mfd: si476x: Add GPIOLIB_LEGACY dependency This driver uses the legacy gpiolib interfaces to get gpio numbers from platform data: drivers/mfd/si476x-i2c.c: In function 'si476x_core_start': drivers/mfd/si476x-i2c.c:133:21: error: implicit declaration of function 'gpio_is_valid'; did you mean 'uuid_is_valid'? [-Werror=implicit-function-declaration] 133 | if (gpio_is_valid(core->gpio_reset)) There are no in-tree users of this driver, so nothing defines the platform data. Add a dependency on GPIOLIB_LEGACY for the moment to avoid the build failure, and, to make sure the sound driver does not get built without the mfd portion either, pass that dependency along. Alternatively, we could remove the mfd driver along with the radio and sound portions. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507231653.UFlH2dMO-lkp@intel.com/ Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250808151822.536879-14-arnd@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 1 + sound/soc/codecs/Kconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 4b04adeb32071c..97ab36d8a4c9e0 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1426,6 +1426,7 @@ config MFD_SEC_I2C config MFD_SI476X_CORE tristate "Silicon Laboratories 4761/64/68 AM/FM radio."
depends on I2C + depends on GPIOLIB_LEGACY select MFD_CORE select REGMAP_I2C help diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 6d7e4725d89cd3..dfe907c62604c2 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -1902,6 +1902,7 @@ config SND_SOC_SGTL5000 config SND_SOC_SI476X tristate + depends on MFD_SI476X_CORE config SND_SOC_SIGMADSP tristate From eca0259e3b9c0d532051727d3d85fd8a42138f71 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Aug 2025 17:17:58 +0200 Subject: [PATCH 3168/3206] mfd: aat2870: Add GPIOLIB_LEGACY dependency This driver uses the legacy gpiolib interfaces to get gpio numbers from platform data. There are no in-tree users of this driver, so nothing defines the platform data. Add a dependency on GPIOLIB_LEGACY for the moment to avoid the build failure. Alternatively, we could remove the mfd driver along with the backlight and regulator portions. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250808151822.536879-15-arnd@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 97ab36d8a4c9e0..6dba1673b213e5 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -129,6 +129,7 @@ config MFD_AAT2870_CORE select MFD_CORE depends on I2C=y depends on GPIOLIB || COMPILE_TEST + depends on GPIOLIB_LEGACY help If you say yes here you get support for the AAT2870. This driver provides common support for accessing the device, From a598ae45f48d7d5a17f8290f2f5bd46046fd0b9b Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Fri, 22 Aug 2025 15:15:26 +0200 Subject: [PATCH 3169/3206] dt-bindings: mfd: sl28cpld: Add sa67mcu compatible The Kontron SMARC-sAM67 module features an on-board housekeeping uC. It's designed to be compatible with the older on-board CPLD used on the SMARC-sAL28 board. To be prepared for any board-specific quirks, add a specific compatible. Signed-off-by: Michael Walle Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250822131531.1366437-3-mwalle@kernel.org Signed-off-by: Lee Jones --- .../bindings/embedded-controller/kontron,sl28cpld.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml b/Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml index 0b752f3baaa946..a77e67f6cb82f9 100644 --- a/Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml @@ -16,7 +16,12 @@ properties: compatible: - const: kontron,sl28cpld + oneOf: + - items: + - enum: + - kontron,sa67mcu + - const: kontron,sl28cpld + - const: kontron,sl28cpld reg: description: From 8566de1cf6892647c034c576caec772e6cf2c952 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Aug 2025 12:20:50 -0500 Subject: [PATCH 3170/3206] dt-bindings: mfd: Add support for the SpacemiT P1 PMIC Enable the SpacemiT P1, which is an I2C-controlled PMIC. Initially only the RTC and regulators will be supported.
Signed-off-by: Alex Elder Reviewed-by: "Rob Herring (Arm)" Acked-by: Troy Mitchell Link: https://lore.kernel.org/r/20250825172057.163883-2-elder@riscstar.com Signed-off-by: Lee Jones --- .../devicetree/bindings/mfd/spacemit,p1.yaml | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 Documentation/devicetree/bindings/mfd/spacemit,p1.yaml diff --git a/Documentation/devicetree/bindings/mfd/spacemit,p1.yaml b/Documentation/devicetree/bindings/mfd/spacemit,p1.yaml new file mode 100644 index 00000000000000..c6593ac6ef6adb --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/spacemit,p1.yaml @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mfd/spacemit,p1.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: SpacemiT P1 Power Management Integrated Circuit + +maintainers: + - Troy Mitchell + +description: + P1 is an I2C-controlled PMIC produced by SpacemiT. It implements six + constant-on-time buck converters and twelve low-dropout regulators. + It also contains a load switch, watchdog timer, real-time clock, eight + 12-bit ADC channels, and six GPIOs. Additional details are available + in the "Power Stone/P1" section at the following link. + https://developer.spacemit.com/documentation + +properties: + compatible: + const: spacemit,p1 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + vin-supply: + description: Input supply phandle. + + regulators: + type: object + + patternProperties: + "^(buck[1-6]|aldo[1-4]|dldo[1-7])$": + type: object + $ref: /schemas/regulator/regulator.yaml# + unevaluatedProperties: false + + unevaluatedProperties: false + +required: + - compatible + - reg + - interrupts + +unevaluatedProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + pmic@41 { + compatible = "spacemit,p1"; + reg = <0x41>; + interrupts = <64>; + + regulators { + buck1 { + regulator-name = "buck1"; + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <3450000>; + regulator-ramp-delay = <5000>; + regulator-always-on; + }; + + aldo1 { + regulator-name = "aldo1"; + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <3400000>; + regulator-boot-on; + }; + + dldo1 { + regulator-name = "dldo1"; + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <3400000>; + regulator-boot-on; + }; + }; + }; + }; From 6fc5d415c10e98ac1b31dd1d5653443e691cdcff Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Aug 2025 12:20:51 -0500 Subject: [PATCH 3171/3206] mfd: simple-mfd-i2c: Add SpacemiT P1 support Enable support for the RTC and regulators found in the SpacemiT P1 PMIC. Support is implemented by the simple I2C MFD driver. The P1 PMIC is normally implemented with the SpacemiT K1 SoC. This PMIC provides 6 buck converters and 12 LDO regulators. It also implements a switch, watchdog timer, real-time clock, and more. Initially its RTC and regulators are supported. Signed-off-by: Alex Elder Link: https://lore.kernel.org/r/20250825172057.163883-3-elder@riscstar.com Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 13 +++++++++++++ drivers/mfd/simple-mfd-i2c.c | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 6dba1673b213e5..4a43672812da85 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1254,6 +1254,19 @@ config MFD_QCOM_RPM Say M here if you want to include support for the Qualcomm RPM as a module. 
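Each cell then binds to a child platform driver by name, roughly like this (a sketch; the probe body is a placeholder, and the actual RTC and regulator drivers are added separately):

    /* Hypothetical child driver matching the "spacemit-p1-rtc" cell */
    static int spacemit_p1_rtc_probe(struct platform_device *pdev)
    {
            /* the shared regmap is reachable via the parent device */
            struct regmap *regmap = dev_get_regmap(pdev->dev.parent, NULL);

            return regmap ? 0 : -ENODEV;
    }

    static struct platform_driver spacemit_p1_rtc_driver = {
            .probe = spacemit_p1_rtc_probe,
            .driver = {
                    .name = "spacemit-p1-rtc", /* must match the mfd_cell name */
            },
    };
    module_platform_driver(spacemit_p1_rtc_driver);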
This will build a module called "qcom_rpm". +config MFD_SPACEMIT_P1 + tristate "SpacemiT P1 PMIC" + depends on ARCH_SPACEMIT || COMPILE_TEST + depends on I2C + select I2C_K1 + select MFD_SIMPLE_MFD_I2C + help + This option supports the I2C-based SpacemiT P1 PMIC, which + contains regulators, a power switch, GPIOs, an RTC, and more. + This option is selected when any of the supported sub-devices + is configured. The basic functionality is implemented by the + simple MFD I2C driver. + config MFD_SPMI_PMIC tristate "Qualcomm SPMI PMICs" depends on ARCH_QCOM || COMPILE_TEST diff --git a/drivers/mfd/simple-mfd-i2c.c b/drivers/mfd/simple-mfd-i2c.c index f7798bd922224e..63ac2638886061 100644 --- a/drivers/mfd/simple-mfd-i2c.c +++ b/drivers/mfd/simple-mfd-i2c.c @@ -93,6 +93,22 @@ static const struct simple_mfd_data maxim_mon_max77705 = { .mfd_cell_size = ARRAY_SIZE(max77705_sensor_cells), }; +static const struct regmap_config spacemit_p1_regmap_config = { + .reg_bits = 8, + .val_bits = 8, +}; + +static const struct mfd_cell spacemit_p1_cells[] = { + { .name = "spacemit-p1-regulator", }, + { .name = "spacemit-p1-rtc", }, +}; + +static const struct simple_mfd_data spacemit_p1 = { + .regmap_config = &spacemit_p1_regmap_config, + .mfd_cell = spacemit_p1_cells, + .mfd_cell_size = ARRAY_SIZE(spacemit_p1_cells), +}; + static const struct of_device_id simple_mfd_i2c_of_match[] = { { .compatible = "kontron,sl28cpld" }, { .compatible = "silergy,sy7636a", .data = &silergy_sy7636a}, @@ -101,6 +117,7 @@ static const struct of_device_id simple_mfd_i2c_of_match[] = { { .compatible = "maxim,max77705-battery", .data = &maxim_mon_max77705}, { .compatible = "fsl,lx2160aqds-fpga" }, { .compatible = "fsl,ls1028aqds-fpga" }, + { .compatible = "spacemit,p1", .data = &spacemit_p1, }, {} }; MODULE_DEVICE_TABLE(of, simple_mfd_i2c_of_match); From 8a498184e2e8aac24db0faf295b7d0fb62dfe90d Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Mon, 1 Sep 2025 12:04:13 +0100 Subject: [PATCH 3172/3206] dt-bindings: mfd: syscon: Document the control-scb syscon on PolarFire SoC The "control-scb" region, contains the "tvs" temperature and voltage sensors and the control/status registers for the system controller's mailbox. The mailbox has a dedicated node, so there's no need for a child node describing it, looking the syscon up by compatible is sufficient. 
Acked-by: Krzysztof Kozlowski Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20250901-shorten-yahoo-223aeaecd290@spud Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/syscon.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/syscon.yaml b/Documentation/devicetree/bindings/mfd/syscon.yaml index 1f381d159af81f..657c38175fba21 100644 --- a/Documentation/devicetree/bindings/mfd/syscon.yaml +++ b/Documentation/devicetree/bindings/mfd/syscon.yaml @@ -91,6 +91,7 @@ select: - mediatek,mt8173-pctl-a-syscfg - mediatek,mt8365-syscfg - microchip,lan966x-cpu-syscon + - microchip,mpfs-control-scb - microchip,mpfs-sysreg-scb - microchip,sam9x60-sfr - microchip,sama7d65-ddr3phy @@ -199,6 +200,7 @@ properties: - mediatek,mt8365-infracfg-nao - mediatek,mt8365-syscfg - microchip,lan966x-cpu-syscon + - microchip,mpfs-control-scb - microchip,mpfs-sysreg-scb - microchip,sam9x60-sfr - microchip,sama7d65-ddr3phy From 5f4bbee069836e51ed0b6d7e565a292f070ababc Mon Sep 17 00:00:00 2001 From: Bastien Curutchet Date: Wed, 20 Aug 2025 16:21:13 +0200 Subject: [PATCH 3173/3206] mfd: core: Increment of_node's refcount before linking it to the platform device When an MFD device is added, a platform_device is allocated. If this device is linked to a DT description, the corresponding OF node is linked to the new platform device but the OF node's refcount isn't incremented. As of_node_put() is called during the platform device release, it leads to a refcount underflow. Call of_node_get() to increment the OF node's refcount when the node is linked to the newly created platform device. Signed-off-by: Bastien Curutchet Link: https://lore.kernel.org/r/20250820-mfd-refcount-v1-1-6dcb5eb41756@bootlin.com Signed-off-by: Lee Jones --- drivers/mfd/mfd-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c index 76bd316a50afc5..7d14a1e7631ee8 100644 --- a/drivers/mfd/mfd-core.c +++ b/drivers/mfd/mfd-core.c @@ -131,6 +131,7 @@ static int mfd_match_of_node_to_dev(struct platform_device *pdev, of_entry->np = np; list_add_tail(&of_entry->list, &mfd_of_node_list); + of_node_get(np); device_set_node(&pdev->dev, of_fwnode_handle(np)); #endif return 0; From 9b959e525fa7e8518e57554b6e17849942938dfc Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Thu, 28 Aug 2025 16:01:37 +0200 Subject: [PATCH 3174/3206] mfd: macsmc: Add "apple,t8103-smc" compatible After discussion with the devicetree maintainers we agreed to not extend lists with the generic compatible "apple,smc" anymore [1]. Use "apple,t8103-smc" as base compatible as it is the SoC the driver and bindings were written for. 
[1]: https://lore.kernel.org/asahi/12ab93b7-1fc2-4ce0-926e-c8141cfe81bf@kernel.org/ Signed-off-by: Janne Grunau Link: https://lore.kernel.org/r/20250828-dt-apple-t6020-v1-18-507ba4c4b98e@jannau.net Signed-off-by: Lee Jones --- drivers/mfd/macsmc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/macsmc.c b/drivers/mfd/macsmc.c index c8575c9c9f4be8..e6cdae221f1d4e 100644 --- a/drivers/mfd/macsmc.c +++ b/drivers/mfd/macsmc.c @@ -478,6 +478,7 @@ static int apple_smc_probe(struct platform_device *pdev) } static const struct of_device_id apple_smc_of_match[] = { + { .compatible = "apple,t8103-smc" }, { .compatible = "apple,smc" }, {}, }; From 1160f9f88be2a911a8d7a27a896b24360a1763c9 Mon Sep 17 00:00:00 2001 From: Alexander Kurz Date: Fri, 29 Aug 2025 20:15:15 +0000 Subject: [PATCH 3175/3206] dt-bindings: mfd: fsl,mc13xxx: Convert txt to DT schema Convert the txt mc13xxx bindings to DT schema, attempting to keep most of the information. The codec and touchscreen nodes are not part of the new schema since they were only briefly mentioned before. Following the convention, rename led-control to fsl,led-control. Signed-off-by: Alexander Kurz Reviewed-by: "Rob Herring (Arm)" Link: https://lore.kernel.org/r/20250829201517.15374-6-akurz@blala.de Signed-off-by: Lee Jones --- .../devicetree/bindings/mfd/fsl,mc13xxx.yaml | 218 ++++++++++++++++++ .../devicetree/bindings/mfd/mc13xxx.txt | 156 ------------- 2 files changed, 218 insertions(+), 156 deletions(-) create mode 100644 Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml delete mode 100644 Documentation/devicetree/bindings/mfd/mc13xxx.txt diff --git a/Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml b/Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml new file mode 100644 index 00000000000000..007c2e3eee91b6 --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml @@ -0,0 +1,218 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mfd/fsl,mc13xxx.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Freescale MC13xxx Power Management Integrated Circuits (PMIC) + +maintainers: + - Alexander Kurz + +description: > + The MC13xxx PMIC series consists of the three models MC13783, MC13892 + and MC34708 and provides regulators and other features like RTC, ADC, + LED, touchscreen, codec and input buttons. + + Link to datasheets + https://www.nxp.com/docs/en/data-sheet/MC13783.pdf + https://www.nxp.com/docs/en/data-sheet/MC13892.pdf + https://www.nxp.com/docs/en/data-sheet/MC34708.pdf + +properties: + compatible: + enum: + - fsl,mc13783 + - fsl,mc13892 + - fsl,mc34708 + + reg: + description: I2C slave address or SPI chip select number.
+ maxItems: 1 + + spi-max-frequency: true + + spi-cs-high: true + + system-power-controller: true + + interrupts: + maxItems: 1 + + leds: + type: object + $ref: /schemas/leds/common.yaml# + + properties: + reg: + description: | + One of + MC13783 LED IDs + 0: Main display + 1: AUX display + 2: Keypad + 3: Red 1 + 4: Green 1 + 5: Blue 1 + 6: Red 2 + 7: Green 2 + 8: Blue 2 + 9: Red 3 + 10: Green 3 + 11: Blue 3 + + MC13892 LED IDs + 0: Main display + 1: AUX display + 2: Keypad + 3: Red + 4: Green + 5: Blue + + MC34708 LED IDs + 0: Charger Red + 1: Charger Green + maxItems: 1 + + led-control: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + Setting for LED-Control register array length depends on model, + mc13783: 6, mc13892: 4, mc34708: 1 + + regulators: + type: object + + additionalProperties: + type: object + + description: | + List of child nodes specifying the regulators, depending on chip variant. + Each child node is defined using the standard binding for regulators and + the optional regulator properties defined below. + + fsl,mc13xxx-uses-adc: + type: boolean + description: Indicate the ADC is being used + + fsl,mc13xxx-uses-codec: + type: boolean + description: Indicate the Audio Codec is being used + + fsl,mc13xxx-uses-rtc: + type: boolean + description: Indicate the RTC is being used + + fsl,mc13xxx-uses-touch: + type: boolean + description: Indicate the touchscreen controller is being used + +required: + - compatible + - reg + +allOf: + - if: + properties: + compatible: + contains: + const: fsl,mc13783 + then: + properties: + leds: + properties: + led-control: + minItems: 6 + maxItems: 6 + regulators: + patternProperties: + "^gpo[1-4]|pwgt[12]spi|sw[12][ab]|sw3|vaudio|vcam|vdig|vesim|vgen|viohi|violo|vmmc[12]|vrf[12]|vrfbg|vrfcp|vrfdig|vrfref|vsim|vvib$": + type: object + $ref: /schemas/regulator/regulator.yaml# + + unevaluatedProperties: false + + - if: + properties: + compatible: + contains: + const: fsl,mc13892 + then: + properties: + leds: + properties: + led-control: + minItems: 4 + maxItems: 4 + regulators: + patternProperties: + "^gpo[1-4]|pwgt[12]spi|sw[1-4]|swbst|vaudio|vcam|vcoincell|vdig|vgen[1-3]|viohi|vpll|vsd|vusb|vusb2|vvideo$": + type: object + $ref: /schemas/regulator/regulator.yaml# + + unevaluatedProperties: false + + - if: + properties: + compatible: + contains: + const: fsl,mc34708 + then: + properties: + leds: + properties: + led-control: + minItems: 1 + maxItems: 1 + +additionalProperties: false + +examples: + - | + #include + #include + #include + + spi { + #address-cells = <1>; + #size-cells = <0>; + + pmic: mc13892@0 { + compatible = "fsl,mc13892"; + reg = <0>; + spi-max-frequency = <1000000>; + spi-cs-high; + interrupt-parent = <&gpio0>; + interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; + fsl,mc13xxx-uses-rtc; + fsl,mc13xxx-uses-adc; + + leds { + #address-cells = <1>; + #size-cells = <0>; + led-control = <0x000 0x000 0x0e0 0x000>; + + sysled@3 { + reg = <3>; + label = "system:red:live"; + linux,default-trigger = "heartbeat"; + }; + }; + + regulators { + sw1_reg: sw1 { + regulator-min-microvolt = <600000>; + regulator-max-microvolt = <1375000>; + regulator-boot-on; + regulator-always-on; + }; + + sw2_reg: sw2 { + regulator-min-microvolt = <900000>; + regulator-max-microvolt = <1850000>; + regulator-boot-on; + regulator-always-on; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/mfd/mc13xxx.txt b/Documentation/devicetree/bindings/mfd/mc13xxx.txt deleted file mode 100644 index 8261ea73278a6b..00000000000000 --- 
a/Documentation/devicetree/bindings/mfd/mc13xxx.txt +++ /dev/null @@ -1,156 +0,0 @@ -* Freescale MC13783/MC13892 Power Management Integrated Circuit (PMIC) - -Required properties: -- compatible : Should be "fsl,mc13783" or "fsl,mc13892" - -Optional properties: -- fsl,mc13xxx-uses-adc : Indicate the ADC is being used -- fsl,mc13xxx-uses-codec : Indicate the Audio Codec is being used -- fsl,mc13xxx-uses-rtc : Indicate the RTC is being used -- fsl,mc13xxx-uses-touch : Indicate the touchscreen controller is being used - -Sub-nodes: -- codec: Contain the Audio Codec node. - - adc-port: Contain PMIC SSI port number used for ADC. - - dac-port: Contain PMIC SSI port number used for DAC. -- leds : Contain the led nodes and initial register values in property - "led-control". Number of register depends of used IC, for MC13783 is 6, - for MC13892 is 4, for MC34708 is 1. See datasheet for bits definitions of - these registers. - - #address-cells: Must be 1. - - #size-cells: Must be 0. - Each led node should contain "reg", which used as LED ID (described below). - Optional properties "label" and "linux,default-trigger" is described in - Documentation/devicetree/bindings/leds/common.txt. -- regulators : Contain the regulator nodes. The regulators are bound using - their names as listed below with their registers and bits for enabling. - -MC13783 LED IDs: - 0 : Main display - 1 : AUX display - 2 : Keypad - 3 : Red 1 - 4 : Green 1 - 5 : Blue 1 - 6 : Red 2 - 7 : Green 2 - 8 : Blue 2 - 9 : Red 3 - 10 : Green 3 - 11 : Blue 3 - -MC13892 LED IDs: - 0 : Main display - 1 : AUX display - 2 : Keypad - 3 : Red - 4 : Green - 5 : Blue - -MC34708 LED IDs: - 0 : Charger Red - 1 : Charger Green - -MC13783 regulators: - sw1a : regulator SW1A (register 24, bit 0) - sw1b : regulator SW1B (register 25, bit 0) - sw2a : regulator SW2A (register 26, bit 0) - sw2b : regulator SW2B (register 27, bit 0) - sw3 : regulator SW3 (register 29, bit 20) - vaudio : regulator VAUDIO (register 32, bit 0) - viohi : regulator VIOHI (register 32, bit 3) - violo : regulator VIOLO (register 32, bit 6) - vdig : regulator VDIG (register 32, bit 9) - vgen : regulator VGEN (register 32, bit 12) - vrfdig : regulator VRFDIG (register 32, bit 15) - vrfref : regulator VRFREF (register 32, bit 18) - vrfcp : regulator VRFCP (register 32, bit 21) - vsim : regulator VSIM (register 33, bit 0) - vesim : regulator VESIM (register 33, bit 3) - vcam : regulator VCAM (register 33, bit 6) - vrfbg : regulator VRFBG (register 33, bit 9) - vvib : regulator VVIB (register 33, bit 11) - vrf1 : regulator VRF1 (register 33, bit 12) - vrf2 : regulator VRF2 (register 33, bit 15) - vmmc1 : regulator VMMC1 (register 33, bit 18) - vmmc2 : regulator VMMC2 (register 33, bit 21) - gpo1 : regulator GPO1 (register 34, bit 6) - gpo2 : regulator GPO2 (register 34, bit 8) - gpo3 : regulator GPO3 (register 34, bit 10) - gpo4 : regulator GPO4 (register 34, bit 12) - pwgt1spi : regulator PWGT1SPI (register 34, bit 15) - pwgt2spi : regulator PWGT2SPI (register 34, bit 16) - -MC13892 regulators: - vcoincell : regulator VCOINCELL (register 13, bit 23) - sw1 : regulator SW1 (register 24, bit 0) - sw2 : regulator SW2 (register 25, bit 0) - sw3 : regulator SW3 (register 26, bit 0) - sw4 : regulator SW4 (register 27, bit 0) - swbst : regulator SWBST (register 29, bit 20) - vgen1 : regulator VGEN1 (register 32, bit 0) - viohi : regulator VIOHI (register 32, bit 3) - vdig : regulator VDIG (register 32, bit 9) - vgen2 : regulator VGEN2 (register 32, bit 12) - vpll : regulator VPLL (register 32, 
bit 15) - vusb2 : regulator VUSB2 (register 32, bit 18) - vgen3 : regulator VGEN3 (register 33, bit 0) - vcam : regulator VCAM (register 33, bit 6) - vvideo : regulator VVIDEO (register 33, bit 12) - vaudio : regulator VAUDIO (register 33, bit 15) - vsd : regulator VSD (register 33, bit 18) - gpo1 : regulator GPO1 (register 34, bit 6) - gpo2 : regulator GPO2 (register 34, bit 8) - gpo3 : regulator GPO3 (register 34, bit 10) - gpo4 : regulator GPO4 (register 34, bit 12) - pwgt1spi : regulator PWGT1SPI (register 34, bit 15) - pwgt2spi : regulator PWGT2SPI (register 34, bit 16) - vusb : regulator VUSB (register 50, bit 3) - - The bindings details of individual regulator device can be found in: - Documentation/devicetree/bindings/regulator/regulator.txt - -Examples: - -ecspi@70010000 { /* ECSPI1 */ - cs-gpios = <&gpio4 24 0>, /* GPIO4_24 */ - <&gpio4 25 0>; /* GPIO4_25 */ - - pmic: mc13892@0 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,mc13892"; - spi-max-frequency = <6000000>; - reg = <0>; - interrupt-parent = <&gpio0>; - interrupts = <8>; - - leds { - #address-cells = <1>; - #size-cells = <0>; - led-control = <0x000 0x000 0x0e0 0x000>; - - sysled@3 { - reg = <3>; - label = "system:red:live"; - linux,default-trigger = "heartbeat"; - }; - }; - - regulators { - sw1_reg: mc13892__sw1 { - regulator-min-microvolt = <600000>; - regulator-max-microvolt = <1375000>; - regulator-boot-on; - regulator-always-on; - }; - - sw2_reg: mc13892__sw2 { - regulator-min-microvolt = <900000>; - regulator-max-microvolt = <1850000>; - regulator-boot-on; - regulator-always-on; - }; - }; - }; -}; From 5872dcccc5ade97bc27aafe13e361bb8e7c73da1 Mon Sep 17 00:00:00 2001 From: Alexander Kurz Date: Fri, 29 Aug 2025 20:15:16 +0000 Subject: [PATCH 3176/3206] dt-bindings: mfd: fsl,mc13xxx: Add buttons node Add a buttons node and properties describing the "ONOFD" (MC13783) and "PWRON" (MC13892/MC34708) buttons available in the fsl,mc13xxx PMIC ICs. Signed-off-by: Alexander Kurz Reviewed-by: "Rob Herring (Arm)" Link: https://lore.kernel.org/r/20250829201517.15374-7-akurz@blala.de Signed-off-by: Lee Jones --- .../devicetree/bindings/mfd/fsl,mc13xxx.yaml | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml b/Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml index 007c2e3eee91b6..d2886f2686a8d3 100644 --- a/Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml +++ b/Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml @@ -39,6 +39,58 @@ properties: interrupts: maxItems: 1 + buttons: + type: object + properties: + "#address-cells": + const: 1 + + "#size-cells": + const: 0 + + patternProperties: + "^onkey@[0-2]$": + $ref: /schemas/input/input.yaml# + unevaluatedProperties: false + type: object + + properties: + reg: + description: | + One of + MC13783 BUTTON IDs: + 0: ONOFD1 + 1: ONOFD2 + 2: ONOFD3 + + MC13892 BUTTON IDs: + 0: PWRON1 + 1: PWRON2 + 2: PWRON3 + + MC34708 BUTTON IDs: + 0: PWRON1 + 1: PWRON2 + maximum: 2 + + debounce-delay-ms: + enum: [0, 30, 150, 750] + default: 30 + description: + Sets the debouncing delay in milliseconds. + + active-low: + description: Set active when pin is pulled low. + + linux,code: true + + fsl,enable-reset: + description: + Setting of the global reset option. 
+ type: boolean + + unevaluatedProperties: false + leds: type: object $ref: /schemas/leds/common.yaml# @@ -159,6 +211,12 @@ allOf: const: fsl,mc34708 then: properties: + buttons: + patternProperties: + "^onkey@[0-2]$": + properties: + reg: + maximum: 1 leds: properties: led-control: @@ -187,6 +245,18 @@ examples: fsl,mc13xxx-uses-rtc; fsl,mc13xxx-uses-adc; + buttons { + #address-cells = <1>; + #size-cells = <0>; + + onkey@0 { + reg = <0>; + debounce-delay-ms = <30>; + active-low; + fsl,enable-reset; + }; + }; + leds { #address-cells = <1>; #size-cells = <0>; From 99e2f00cf418c2e12fc934ae2ba790870e98fd67 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Fri, 29 Aug 2025 18:04:49 -0500 Subject: [PATCH 3177/3206] dt-bindings: mfd: Convert aspeed,ast2400-p2a-ctrl to DT schema Convert the aspeed,ast2x00-p2a-ctrl binding to DT schema format. The schema is simple enough to just add it to the parent aspeed,ast2x00-scu binding. Signed-off-by: "Rob Herring (Arm)" Acked-by: Andrew Jeffery Link: https://lore.kernel.org/r/20250829230450.1496151-1-robh@kernel.org Signed-off-by: Lee Jones --- .../bindings/mfd/aspeed,ast2x00-scu.yaml | 33 ++++++++++++- .../bindings/misc/aspeed-p2a-ctrl.txt | 46 ------------------- 2 files changed, 32 insertions(+), 47 deletions(-) delete mode 100644 Documentation/devicetree/bindings/misc/aspeed-p2a-ctrl.txt diff --git a/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml b/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml index 5eccd10d95ce5d..5adb7f6aca4575 100644 --- a/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml +++ b/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml @@ -48,8 +48,34 @@ properties: patternProperties: '^p2a-control@[0-9a-f]+$': - description: See Documentation/devicetree/bindings/misc/aspeed-p2a-ctrl.txt + description: > + PCI-to-AHB Bridge Control + + The bridge is available on platforms with the VGA enabled on the Aspeed + device. In this case, the host has access to a 64KiB window into all of + the BMC's memory. The BMC can disable this bridge. If the bridge is + enabled, the host has read access to all the regions of memory, however + the host only has read and write access depending on a register + controlled by the BMC. type: object + additionalProperties: false + + properties: + compatible: + enum: + - aspeed,ast2400-p2a-ctrl + - aspeed,ast2500-p2a-ctrl + reg: + maxItems: 1 + + memory-region: + maxItems: 1 + description: + A reserved_memory region to be used for the PCI to AHB mapping + + required: + - compatible + - reg '^pinctrl(@[0-9a-f]+)?$': type: object @@ -123,6 +149,11 @@ examples: #size-cells = <1>; ranges = <0x0 0x1e6e2000 0x1000>; + p2a-control@2c { + compatible = "aspeed,ast2400-p2a-ctrl"; + reg = <0x2c 0x4>; + }; + silicon-id@7c { compatible = "aspeed,ast2500-silicon-id", "aspeed,silicon-id"; reg = <0x7c 0x4>, <0x150 0x8>; diff --git a/Documentation/devicetree/bindings/misc/aspeed-p2a-ctrl.txt b/Documentation/devicetree/bindings/misc/aspeed-p2a-ctrl.txt deleted file mode 100644 index f2e2e28b317ce1..00000000000000 --- a/Documentation/devicetree/bindings/misc/aspeed-p2a-ctrl.txt +++ /dev/null @@ -1,46 +0,0 @@ -====================================================================== -Device tree bindings for Aspeed AST2400/AST2500 PCI-to-AHB Bridge Control Driver -====================================================================== - -The bridge is available on platforms with the VGA enabled on the Aspeed device. 
-In this case, the host has access to a 64KiB window into all of the BMC's -memory. The BMC can disable this bridge. If the bridge is enabled, the host -has read access to all the regions of memory, however the host only has read -and write access depending on a register controlled by the BMC. - -Required properties: -=================== - - - compatible: must be one of: - - "aspeed,ast2400-p2a-ctrl" - - "aspeed,ast2500-p2a-ctrl" - -Optional properties: -=================== - -- reg: A hint for the memory regions associated with the P2A controller -- memory-region: A phandle to a reserved_memory region to be used for the PCI - to AHB mapping - -The p2a-control node should be the child of a syscon node with the required -property: - -- compatible : Should be one of the following: - "aspeed,ast2400-scu", "syscon", "simple-mfd" - "aspeed,ast2500-scu", "syscon", "simple-mfd" - -Example -=================== - -g4 Example ----------- - -syscon: scu@1e6e2000 { - compatible = "aspeed,ast2400-scu", "syscon", "simple-mfd"; - reg = <0x1e6e2000 0x1a8>; - - p2a: p2a-control { - compatible = "aspeed,ast2400-p2a-ctrl"; - memory-region = <&reserved_memory>; - }; -}; From b445c14ac7eb39447a364c142aa85b6487b30c67 Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Sun, 31 Aug 2025 10:14:36 +0800 Subject: [PATCH 3178/3206] dt-bindings: mfd: aspeed: Add AST2700 SCU compatibles Add SCU interrupt controller compatible strings for the AST2700 SoC: scu-ic0 to 3. This extends the MFD binding to support AST2700-based platforms. Signed-off-by: Ryan Chen Acked-by: "Rob Herring (Arm)" Link: https://lore.kernel.org/r/20250831021438.976893-3-ryan_chen@aspeedtech.com Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml b/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml index 5adb7f6aca4575..da1887d7a8fe55 100644 --- a/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml +++ b/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml @@ -101,6 +101,10 @@ patternProperties: - aspeed,ast2500-scu-ic - aspeed,ast2600-scu-ic0 - aspeed,ast2600-scu-ic1 + - aspeed,ast2700-scu-ic0 + - aspeed,ast2700-scu-ic1 + - aspeed,ast2700-scu-ic2 + - aspeed,ast2700-scu-ic3 '^silicon-id@[0-9a-f]+$': description: Unique hardware silicon identifiers within the SoC From 719d02a25a24601c6fb8a7b1627e1abf015b6c2a Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 21 Aug 2025 20:23:34 +0200 Subject: [PATCH 3179/3206] mfd: bd71828, bd71815: Prepare for power-supply support Add core support for ROHM BD718(15/28/78) PMIC's charger blocks. 
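As a hedged sketch of how the upcoming power-supply sub-driver is expected to consume the named IRQ resources these cells carry: platform_get_irq_byname() and the "bd71828-chg-done" resource name come from this patch, while the handler and probe function below are assumptions, not the actual driver.

#include <linux/interrupt.h>
#include <linux/platform_device.h>

/* Hypothetical handler; a real driver would update charger state here. */
static irqreturn_t bd71828_chg_done_isr(int irq, void *data)
{
	return IRQ_HANDLED;
}

/* Sketch: resolve a named IRQ routed through the MFD cell resources. */
static int bd71828_power_probe_sketch(struct platform_device *pdev)
{
	int irq = platform_get_irq_byname(pdev, "bd71828-chg-done");

	if (irq < 0)
		return irq;

	return devm_request_threaded_irq(&pdev->dev, irq, NULL,
					 bd71828_chg_done_isr, IRQF_ONESHOT,
					 "bd71828-chg-done", NULL);
}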
Signed-off-by: Matti Vaittinen Signed-off-by: Andreas Kemnade Link: https://lore.kernel.org/r/20250821-bd71828-charger-v3-1-cc74ac4e0fb9@kemnade.info Signed-off-by: Lee Jones --- drivers/mfd/rohm-bd71828.c | 44 +++++++++++++++++----- include/linux/mfd/rohm-bd71828.h | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 9 deletions(-) diff --git a/drivers/mfd/rohm-bd71828.c b/drivers/mfd/rohm-bd71828.c index a14b7aa69c3c61..84a64c3b9c9f52 100644 --- a/drivers/mfd/rohm-bd71828.c +++ b/drivers/mfd/rohm-bd71828.c @@ -45,8 +45,8 @@ static const struct resource bd71828_rtc_irqs[] = { static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_RMV, "bd71815-dcin-rmv"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_OUT, "bd71815-clps-out"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_IN, "bd71815-clps-in"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_OUT, "bd71815-dcin-clps-out"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_IN, "bd71815-dcin-clps-in"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_OVP_RES, "bd71815-dcin-ovp-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_OVP_DET, "bd71815-dcin-ovp-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_MON_RES, "bd71815-dcin-mon-res"), @@ -56,7 +56,7 @@ static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_LOW_RES, "bd71815-vsys-low-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_LOW_DET, "bd71815-vsys-low-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_RES, "bd71815-vsys-mon-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_RES, "bd71815-vsys-mon-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_DET, "bd71815-vsys-mon-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_WDG_TEMP, "bd71815-chg-wdg-temp"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_WDG_TIME, "bd71815-chg-wdg"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_RECHARGE_RES, "bd71815-rechg-res"), @@ -87,10 +87,10 @@ static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_2_DET, "bd71815-bat-oc2-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_3_RES, "bd71815-bat-oc3-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_3_DET, "bd71815-bat-oc3-det"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_RES, "bd71815-bat-low-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_DET, "bd71815-bat-low-det"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_RES, "bd71815-bat-hi-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_DET, "bd71815-bat-hi-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_RES, "bd71815-temp-bat-low-res"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_DET, "bd71815-temp-bat-low-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_RES, "bd71815-temp-bat-hi-res"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_DET, "bd71815-temp-bat-hi-det"), }; static const struct mfd_cell bd71815_mfd_cells[] = { @@ -109,7 +109,30 @@ static const struct mfd_cell bd71815_mfd_cells[] = { }, }; -static const struct mfd_cell bd71828_mfd_cells[] = { +static const struct resource bd71828_power_irqs[] = { + DEFINE_RES_IRQ_NAMED(BD71828_INT_CHG_TOPOFF_TO_DONE, + "bd71828-chg-done"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_DCIN_DET, "bd71828-pwr-dcin-in"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_DCIN_RMV, "bd71828-pwr-dcin-out"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_BAT_LOW_VOLT_RES, + "bd71828-vbat-normal"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_BAT_LOW_VOLT_DET, "bd71828-vbat-low"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_HI_DET, "bd71828-btemp-hi"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_HI_RES, "bd71828-btemp-cool"), 
+ DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_LOW_DET, "bd71828-btemp-lo"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_LOW_RES, + "bd71828-btemp-warm"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_VF_DET, + "bd71828-temp-hi"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_VF_RES, + "bd71828-temp-norm"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_125_DET, + "bd71828-temp-125-over"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_125_RES, + "bd71828-temp-125-under"), +}; + +static struct mfd_cell bd71828_mfd_cells[] = { { .name = "bd71828-pmic", }, { .name = "bd71828-gpio", }, { .name = "bd71828-led", .of_compatible = "rohm,bd71828-leds" }, @@ -118,8 +141,11 @@ static const struct mfd_cell bd71828_mfd_cells[] = { * BD70528 clock gate are the register address and mask. */ { .name = "bd71828-clk", }, - { .name = "bd71827-power", }, { + .name = "bd71828-power", + .resources = bd71828_power_irqs, + .num_resources = ARRAY_SIZE(bd71828_power_irqs), + }, { .name = "bd71828-rtc", .resources = bd71828_rtc_irqs, .num_resources = ARRAY_SIZE(bd71828_rtc_irqs), diff --git a/include/linux/mfd/rohm-bd71828.h b/include/linux/mfd/rohm-bd71828.h index ce786c96404a3d..73a71ef6915254 100644 --- a/include/linux/mfd/rohm-bd71828.h +++ b/include/linux/mfd/rohm-bd71828.h @@ -189,6 +189,69 @@ enum { /* Charger/Battey */ #define BD71828_REG_CHG_STATE 0x65 #define BD71828_REG_CHG_FULL 0xd2 +#define BD71828_REG_CHG_EN 0x6F +#define BD71828_REG_DCIN_STAT 0x68 +#define BD71828_MASK_DCIN_DET 0x01 +#define BD71828_REG_VDCIN_U 0x9c +#define BD71828_MASK_CHG_EN 0x01 +#define BD71828_CHG_MASK_DCIN_U 0x0f +#define BD71828_REG_BAT_STAT 0x67 +#define BD71828_REG_BAT_TEMP 0x6c +#define BD71828_MASK_BAT_TEMP 0x07 +#define BD71828_BAT_TEMP_OPEN 0x07 +#define BD71828_MASK_BAT_DET 0x20 +#define BD71828_MASK_BAT_DET_DONE 0x10 +#define BD71828_REG_CHG_STATE 0x65 +#define BD71828_REG_VBAT_U 0x8c +#define BD71828_MASK_VBAT_U 0x0f +#define BD71828_REG_VBAT_REX_AVG_U 0x92 + +#define BD71828_REG_OCV_PWRON_U 0x8A + +#define BD71828_REG_VBAT_MIN_AVG_U 0x8e +#define BD71828_REG_VBAT_MIN_AVG_L 0x8f + +#define BD71828_REG_CC_CNT3 0xb5 +#define BD71828_REG_CC_CNT2 0xb6 +#define BD71828_REG_CC_CNT1 0xb7 +#define BD71828_REG_CC_CNT0 0xb8 +#define BD71828_REG_CC_CURCD_AVG_U 0xb2 +#define BD71828_MASK_CC_CURCD_AVG_U 0x3f +#define BD71828_MASK_CC_CUR_DIR 0x80 +#define BD71828_REG_VM_BTMP_U 0xa1 +#define BD71828_REG_VM_BTMP_L 0xa2 +#define BD71828_MASK_VM_BTMP_U 0x0f +#define BD71828_REG_COULOMB_CTRL 0xc4 +#define BD71828_REG_COULOMB_CTRL2 0xd2 +#define BD71828_MASK_REX_CC_CLR 0x01 +#define BD71828_MASK_FULL_CC_CLR 0x10 +#define BD71828_REG_CC_CNT_FULL3 0xbd +#define BD71828_REG_CC_CNT_CHG3 0xc1 + +#define BD71828_REG_VBAT_INITIAL1_U 0x86 +#define BD71828_REG_VBAT_INITIAL1_L 0x87 + +#define BD71828_REG_VBAT_INITIAL2_U 0x88 +#define BD71828_REG_VBAT_INITIAL2_L 0x89 + +#define BD71828_REG_IBAT_U 0xb0 +#define BD71828_REG_IBAT_L 0xb1 + +#define BD71828_REG_IBAT_AVG_U 0xb2 +#define BD71828_REG_IBAT_AVG_L 0xb3 + +#define BD71828_REG_VSYS_AVG_U 0x96 +#define BD71828_REG_VSYS_AVG_L 0x97 +#define BD71828_REG_VSYS_MIN_AVG_U 0x98 +#define BD71828_REG_VSYS_MIN_AVG_L 0x99 +#define BD71828_REG_CHG_SET1 0x75 +#define BD71828_REG_ALM_VBAT_LIMIT_U 0xaa +#define BD71828_REG_BATCAP_MON_LIMIT_U 0xcc +#define BD71828_REG_CONF 0x64 + +#define BD71828_REG_DCIN_CLPS 0x71 + +#define BD71828_REG_MEAS_CLEAR 0xaf /* LEDs */ #define BD71828_REG_LED_CTRL 0x4A From 0d64f6d1ffe96f59145481f7413344b1fa3ad1ce Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Thu, 4 
Sep 2025 20:35:05 +0800 Subject: [PATCH 3180/3206] mfd: ls2kbmc: Introduce Loongson-2K BMC core driver The Loongson-2K Board Management Controller provides a PCIe interface for the host to access the features implemented in the BMC. The BMC is assembled on servers similar to server machines based on the Loongson-3 CPU. It supports multiple sub-devices like DRM and IPMI. Co-developed-by: Chong Qiao Signed-off-by: Chong Qiao Reviewed-by: Huacai Chen Acked-by: Corey Minyard Signed-off-by: Binbin Zhou Link: https://lore.kernel.org/r/0dc1fd53020ce2562453961ffed2cd9eb8f359e1.1756987761.git.zhoubinbin@loongson.cn Signed-off-by: Lee Jones --- MAINTAINERS | 6 ++ drivers/mfd/Kconfig | 13 +++ drivers/mfd/Makefile | 2 + drivers/mfd/ls2k-bmc-core.c | 189 ++++++++++++++++++++++++++++++++++++ 4 files changed, 210 insertions(+) create mode 100644 drivers/mfd/ls2k-bmc-core.c diff --git a/MAINTAINERS b/MAINTAINERS index 6e3961562d29cf..2f4e3c0b653300 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14417,6 +14417,12 @@ S: Maintained F: Documentation/devicetree/bindings/thermal/loongson,ls2k-thermal.yaml F: drivers/thermal/loongson2_thermal.c +LOONGSON-2K Board Management Controller (BMC) DRIVER +M: Binbin Zhou +M: Chong Qiao +S: Maintained +F: drivers/mfd/ls2k-bmc-core.c + LOONGSON EDAC DRIVER M: Zhao Qunqin L: linux-edac@vger.kernel.org diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 4a43672812da85..b247f3b1a3b876 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -2480,6 +2480,19 @@ config MFD_LOONGSON_SE commands to the engine and must first send them to the controller, which will forward them to the corresponding engine. +config MFD_LS2K_BMC_CORE + bool "Loongson-2K Board Management Controller Support" + depends on PCI && ACPI_GENERIC_GSI + select MFD_CORE + help + Say yes here to add support for the Loongson-2K BMC which is a Board + Management Controller connected to the PCIe bus. The device supports + multiple sub-devices like display and IPMI. This driver provides common + support for accessing the devices. + + The display is enabled by default in the driver, while the IPMI interface + is enabled independently through the IPMI_LS2K option in the IPMI section. + config MFD_QNAP_MCU tristate "QNAP microcontroller unit core driver" depends on SERIAL_DEV_BUS diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 6dd501a17b206e..865e9f12faff02 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -290,6 +290,8 @@ obj-$(CONFIG_MFD_INTEL_M10_BMC_CORE) += intel-m10-bmc-core.o obj-$(CONFIG_MFD_INTEL_M10_BMC_SPI) += intel-m10-bmc-spi.o obj-$(CONFIG_MFD_INTEL_M10_BMC_PMCI) += intel-m10-bmc-pmci.o +obj-$(CONFIG_MFD_LS2K_BMC_CORE) += ls2k-bmc-core.o + obj-$(CONFIG_MFD_ATC260X) += atc260x-core.o obj-$(CONFIG_MFD_ATC260X_I2C) += atc260x-i2c.o diff --git a/drivers/mfd/ls2k-bmc-core.c b/drivers/mfd/ls2k-bmc-core.c new file mode 100644 index 00000000000000..39cc481d9ba129 --- /dev/null +++ b/drivers/mfd/ls2k-bmc-core.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Loongson-2K Board Management Controller (BMC) Core Driver. + * + * Copyright (C) 2024-2025 Loongson Technology Corporation Limited.
+ * + * Authors: + * Chong Qiao + * Binbin Zhou + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* LS2K BMC resources */ +#define LS2K_DISPLAY_RES_START (SZ_16M + SZ_2M) +#define LS2K_IPMI_RES_SIZE 0x1C +#define LS2K_IPMI0_RES_START (SZ_16M + 0xF00000) +#define LS2K_IPMI1_RES_START (LS2K_IPMI0_RES_START + LS2K_IPMI_RES_SIZE) +#define LS2K_IPMI2_RES_START (LS2K_IPMI1_RES_START + LS2K_IPMI_RES_SIZE) +#define LS2K_IPMI3_RES_START (LS2K_IPMI2_RES_START + LS2K_IPMI_RES_SIZE) +#define LS2K_IPMI4_RES_START (LS2K_IPMI3_RES_START + LS2K_IPMI_RES_SIZE) + +enum { + LS2K_BMC_DISPLAY, + LS2K_BMC_IPMI0, + LS2K_BMC_IPMI1, + LS2K_BMC_IPMI2, + LS2K_BMC_IPMI3, + LS2K_BMC_IPMI4, +}; + +static struct resource ls2k_display_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_DISPLAY_RES_START, SZ_4M, "simpledrm-res"), +}; + +static struct resource ls2k_ipmi0_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_IPMI0_RES_START, LS2K_IPMI_RES_SIZE, "ipmi0-res"), +}; + +static struct resource ls2k_ipmi1_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_IPMI1_RES_START, LS2K_IPMI_RES_SIZE, "ipmi1-res"), +}; + +static struct resource ls2k_ipmi2_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_IPMI2_RES_START, LS2K_IPMI_RES_SIZE, "ipmi2-res"), +}; + +static struct resource ls2k_ipmi3_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_IPMI3_RES_START, LS2K_IPMI_RES_SIZE, "ipmi3-res"), +}; + +static struct resource ls2k_ipmi4_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_IPMI4_RES_START, LS2K_IPMI_RES_SIZE, "ipmi4-res"), +}; + +static struct mfd_cell ls2k_bmc_cells[] = { + [LS2K_BMC_DISPLAY] = { + .name = "simple-framebuffer", + .num_resources = ARRAY_SIZE(ls2k_display_resources), + .resources = ls2k_display_resources + }, + [LS2K_BMC_IPMI0] = { + .name = "ls2k-ipmi-si", + .num_resources = ARRAY_SIZE(ls2k_ipmi0_resources), + .resources = ls2k_ipmi0_resources + }, + [LS2K_BMC_IPMI1] = { + .name = "ls2k-ipmi-si", + .num_resources = ARRAY_SIZE(ls2k_ipmi1_resources), + .resources = ls2k_ipmi1_resources + }, + [LS2K_BMC_IPMI2] = { + .name = "ls2k-ipmi-si", + .num_resources = ARRAY_SIZE(ls2k_ipmi2_resources), + .resources = ls2k_ipmi2_resources + }, + [LS2K_BMC_IPMI3] = { + .name = "ls2k-ipmi-si", + .num_resources = ARRAY_SIZE(ls2k_ipmi3_resources), + .resources = ls2k_ipmi3_resources + }, + [LS2K_BMC_IPMI4] = { + .name = "ls2k-ipmi-si", + .num_resources = ARRAY_SIZE(ls2k_ipmi4_resources), + .resources = ls2k_ipmi4_resources + }, +}; + +/* + * Currently the Loongson-2K BMC hardware does not have an I2C interface to adapt to the + * resolution. We set the resolution by presetting "video=1280x1024-16@2M" to the BMC memory. + */ +static int ls2k_bmc_parse_mode(struct pci_dev *pdev, struct simplefb_platform_data *pd) +{ + char *mode; + int depth, ret; + + /* The last 16M of PCI BAR0 is used to store the resolution string. */ + mode = devm_ioremap(&pdev->dev, pci_resource_start(pdev, 0) + SZ_16M, SZ_16M); + if (!mode) + return -ENOMEM; + + /* The resolution field starts with the flag "video=". */ + if (!strncmp(mode, "video=", 6)) + mode = mode + 6; + + ret = kstrtoint(strsep(&mode, "x"), 10, &pd->width); + if (ret) + return ret; + + ret = kstrtoint(strsep(&mode, "-"), 10, &pd->height); + if (ret) + return ret; + + ret = kstrtoint(strsep(&mode, "@"), 10, &depth); + if (ret) + return ret; + + pd->stride = pd->width * depth / 8; + pd->format = depth == 32 ? "a8r8g8b8" : "r5g6b5"; + + return 0; +} + +static int ls2k_bmc_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + struct simplefb_platform_data pd; + resource_size_t base; + int ret; + + ret = pci_enable_device(dev); + if (ret) + return ret; + + ret = ls2k_bmc_parse_mode(dev, &pd); + if (ret) + goto disable_pci; + + ls2k_bmc_cells[LS2K_BMC_DISPLAY].platform_data = &pd; + ls2k_bmc_cells[LS2K_BMC_DISPLAY].pdata_size = sizeof(pd); + base = dev->resource[0].start + LS2K_DISPLAY_RES_START; + + /* Remove conflicting efifb device */ + ret = aperture_remove_conflicting_devices(base, SZ_4M, "simple-framebuffer"); + if (ret) { + dev_err(&dev->dev, "Failed to remove firmware framebuffers: %d\n", ret); + goto disable_pci; + } + + return devm_mfd_add_devices(&dev->dev, PLATFORM_DEVID_AUTO, + ls2k_bmc_cells, ARRAY_SIZE(ls2k_bmc_cells), + &dev->resource[0], 0, NULL); + +disable_pci: + pci_disable_device(dev); + return ret; +} + +static void ls2k_bmc_remove(struct pci_dev *dev) +{ + pci_disable_device(dev); +} + +static struct pci_device_id ls2k_bmc_devices[] = { + { PCI_DEVICE(PCI_VENDOR_ID_LOONGSON, 0x1a05) }, + { } +}; +MODULE_DEVICE_TABLE(pci, ls2k_bmc_devices); + +static struct pci_driver ls2k_bmc_driver = { + .name = "ls2k-bmc", + .id_table = ls2k_bmc_devices, + .probe = ls2k_bmc_probe, + .remove = ls2k_bmc_remove, +}; +module_pci_driver(ls2k_bmc_driver); + +MODULE_DESCRIPTION("Loongson-2K Board Management Controller (BMC) Core driver"); +MODULE_AUTHOR("Loongson Technology Corporation Limited"); +MODULE_LICENSE("GPL"); From d952bba3fbb5d8a3c961e32bae3f7ff19a18762f Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Thu, 4 Sep 2025 20:35:06 +0800 Subject: [PATCH 3181/3206] mfd: ls2kbmc: Add Loongson-2K BMC reset function support Since the display is a sub-function of the Loongson-2K BMC, when the BMC resets, the entire BMC PCIe link is disconnected and the display is interrupted. Quick overview of the entire LS2K BMC reset process: There are two types of reset methods: a soft reset (a BMC-initiated reboot via the IPMI reset command) and a BMC watchdog reset (watchdog timeout). First, regardless of the method, an interrupt is generated (PCIe interrupt for soft reset/GPIO interrupt for watchdog reset); second, during the interrupt handling, the system enters bmc_reset_work, clears the bus/IO/mem resources of the LS7A PCI-E bridge, waits for the BMC reset to begin, then restores the parent device's PCI configuration space, waits for the BMC reset to complete, and finally restores the BMC PCI configuration space. Display restoration occurs last.
Co-developed-by: Chong Qiao Signed-off-by: Chong Qiao Reviewed-by: Huacai Chen Acked-by: Corey Minyard Signed-off-by: Binbin Zhou Link: https://lore.kernel.org/r/de4e04b42ff9ee282f86f9bb9efbf959a9848205.1756987761.git.zhoubinbin@loongson.cn Signed-off-by: Lee Jones --- drivers/mfd/ls2k-bmc-core.c | 339 ++++++++++++++++++++++++++++++++++++ 1 file changed, 339 insertions(+) diff --git a/drivers/mfd/ls2k-bmc-core.c b/drivers/mfd/ls2k-bmc-core.c index 39cc481d9ba129..e162b3c7c9f821 100644 --- a/drivers/mfd/ls2k-bmc-core.c +++ b/drivers/mfd/ls2k-bmc-core.c @@ -10,8 +10,12 @@ */ #include +#include +#include #include #include +#include +#include #include #include #include @@ -19,6 +23,8 @@ #include #include #include +#include +#include /* LS2K BMC resources */ #define LS2K_DISPLAY_RES_START (SZ_16M + SZ_2M) @@ -29,6 +35,54 @@ #define LS2K_IPMI3_RES_START (LS2K_IPMI2_RES_START + LS2K_IPMI_RES_SIZE) #define LS2K_IPMI4_RES_START (LS2K_IPMI3_RES_START + LS2K_IPMI_RES_SIZE) +#define LS7A_PCI_CFG_SIZE 0x100 + +/* LS7A bridge registers */ +#define LS7A_PCIE_PORT_CTL0 0x0 +#define LS7A_PCIE_PORT_STS1 0xC +#define LS7A_GEN2_CTL 0x80C +#define LS7A_SYMBOL_TIMER 0x71C + +/* Bits of LS7A_PCIE_PORT_CTL0 */ +#define LS2K_BMC_PCIE_LTSSM_ENABLE BIT(3) + +/* Bits of LS7A_PCIE_PORT_STS1 */ +#define LS2K_BMC_PCIE_LTSSM_STS GENMASK(5, 0) +#define LS2K_BMC_PCIE_CONNECTED 0x11 + +#define LS2K_BMC_PCIE_DELAY_US 1000 +#define LS2K_BMC_PCIE_TIMEOUT_US 1000000 + +/* Bits of LS7A_GEN2_CTL */ +#define LS7A_GEN2_SPEED_CHANG BIT(17) +#define LS7A_CONF_PHY_TX BIT(18) + +/* Bits of LS7A_SYMBOL_TIMER */ +#define LS7A_MASK_LEN_MATCH BIT(26) + +/* Interval between interruptions */ +#define LS2K_BMC_INT_INTERVAL (60 * HZ) + +/* Maximum time to wait for U-Boot and DDR to be ready with ms. */ +#define LS2K_BMC_RESET_WAIT_TIME 10000 + +/* It's an experience value */ +#define LS7A_BAR0_CHECK_MAX_TIMES 2000 + +#define PCI_REG_STRIDE 0x4 + +#define LS2K_BMC_RESET_GPIO 14 +#define LOONGSON_GPIO_REG_BASE 0x1FE00500 +#define LOONGSON_GPIO_REG_SIZE 0x18 +#define LOONGSON_GPIO_OEN 0x0 +#define LOONGSON_GPIO_FUNC 0x4 +#define LOONGSON_GPIO_INTPOL 0x10 +#define LOONGSON_GPIO_INTEN 0x14 + +#define LOONGSON_IO_INT_BASE 16 +#define LS2K_BMC_RESET_GPIO_INT_VEC (LS2K_BMC_RESET_GPIO % 8) +#define LS2K_BMC_RESET_GPIO_GSI (LOONGSON_IO_INT_BASE + LS2K_BMC_RESET_GPIO_INT_VEC) + enum { LS2K_BMC_DISPLAY, LS2K_BMC_IPMI0, @@ -95,6 +149,278 @@ static struct mfd_cell ls2k_bmc_cells[] = { }, }; +/* Index of the BMC PCI configuration space to be restored at BMC reset. */ +struct ls2k_bmc_pci_data { + u32 pci_command; + u32 base_address0; + u32 interrupt_line; +}; + +/* Index of the parent PCI configuration space to be restored at BMC reset. */ +struct ls2k_bmc_bridge_pci_data { + u32 pci_command; + u32 base_address[6]; + u32 rom_addreess; + u32 interrupt_line; + u32 msi_hi; + u32 msi_lo; + u32 devctl; + u32 linkcap; + u32 linkctl_sts; + u32 symbol_timer; + u32 gen2_ctrl; +}; + +struct ls2k_bmc_ddata { + struct device *dev; + struct work_struct bmc_reset_work; + struct ls2k_bmc_pci_data bmc_pci_data; + struct ls2k_bmc_bridge_pci_data bridge_pci_data; +}; + +static bool ls2k_bmc_bar0_addr_is_set(struct pci_dev *pdev) +{ + u32 addr; + + pci_read_config_dword(pdev, PCI_BASE_ADDRESS_0, &addr); + + return addr & PCI_BASE_ADDRESS_MEM_MASK ? 
true : false; +} + +static bool ls2k_bmc_pcie_is_connected(struct pci_dev *parent, struct ls2k_bmc_ddata *ddata) +{ + void __iomem *base; + int val, ret; + + base = pci_iomap(parent, 0, LS7A_PCI_CFG_SIZE); + if (!base) + return false; + + val = readl(base + LS7A_PCIE_PORT_CTL0); + writel(val | LS2K_BMC_PCIE_LTSSM_ENABLE, base + LS7A_PCIE_PORT_CTL0); + + ret = readl_poll_timeout_atomic(base + LS7A_PCIE_PORT_STS1, val, + (val & LS2K_BMC_PCIE_LTSSM_STS) == LS2K_BMC_PCIE_CONNECTED, + LS2K_BMC_PCIE_DELAY_US, LS2K_BMC_PCIE_TIMEOUT_US); + if (ret) { + pci_iounmap(parent, base); + dev_err(ddata->dev, "PCI-E training failed status=0x%x\n", val); + return false; + } + + pci_iounmap(parent, base); + return true; +} + +static void ls2k_bmc_restore_bridge_pci_data(struct pci_dev *parent, struct ls2k_bmc_ddata *ddata) +{ + int base, i = 0; + + pci_write_config_dword(parent, PCI_COMMAND, ddata->bridge_pci_data.pci_command); + + for (base = PCI_BASE_ADDRESS_0; base <= PCI_BASE_ADDRESS_5; base += PCI_REG_STRIDE, i++) + pci_write_config_dword(parent, base, ddata->bridge_pci_data.base_address[i]); + + pci_write_config_dword(parent, PCI_ROM_ADDRESS, ddata->bridge_pci_data.rom_addreess); + pci_write_config_dword(parent, PCI_INTERRUPT_LINE, ddata->bridge_pci_data.interrupt_line); + + pci_write_config_dword(parent, parent->msi_cap + PCI_MSI_ADDRESS_LO, + ddata->bridge_pci_data.msi_lo); + pci_write_config_dword(parent, parent->msi_cap + PCI_MSI_ADDRESS_HI, + ddata->bridge_pci_data.msi_hi); + pci_write_config_dword(parent, parent->pcie_cap + PCI_EXP_DEVCTL, + ddata->bridge_pci_data.devctl); + pci_write_config_dword(parent, parent->pcie_cap + PCI_EXP_LNKCAP, + ddata->bridge_pci_data.linkcap); + pci_write_config_dword(parent, parent->pcie_cap + PCI_EXP_LNKCTL, + ddata->bridge_pci_data.linkctl_sts); + + pci_write_config_dword(parent, LS7A_GEN2_CTL, ddata->bridge_pci_data.gen2_ctrl); + pci_write_config_dword(parent, LS7A_SYMBOL_TIMER, ddata->bridge_pci_data.symbol_timer); +} + +static int ls2k_bmc_recover_pci_data(void *data) +{ + struct ls2k_bmc_ddata *ddata = data; + struct pci_dev *pdev = to_pci_dev(ddata->dev); + struct pci_dev *parent = pdev->bus->self; + u32 i; + + /* + * Clear the bus, io and mem resources of the PCI-E bridge to zero, so that + * the processor can not access the LS2K PCI-E port, to avoid crashing due to + * the lack of return signal from accessing the LS2K PCI-E port. + */ + pci_write_config_dword(parent, PCI_BASE_ADDRESS_2, 0); + pci_write_config_dword(parent, PCI_BASE_ADDRESS_3, 0); + pci_write_config_dword(parent, PCI_BASE_ADDRESS_4, 0); + + /* + * When the LS2K BMC is reset, the LS7A PCI-E port is also reset, and its PCI + * BAR0 register is cleared. Due to the time gap between the GPIO interrupt + * generation and the LS2K BMC reset, the LS7A PCI BAR0 register is read to + * determine whether the reset has begun. 
+ */ + for (i = LS7A_BAR0_CHECK_MAX_TIMES; i > 0 ; i--) { + if (!ls2k_bmc_bar0_addr_is_set(parent)) + break; + mdelay(1); + }; + + if (i == 0) + return false; + + ls2k_bmc_restore_bridge_pci_data(parent, ddata); + + /* Check if PCI-E is connected */ + if (!ls2k_bmc_pcie_is_connected(parent, ddata)) + return false; + + /* Waiting for U-Boot and DDR ready */ + mdelay(LS2K_BMC_RESET_WAIT_TIME); + if (!ls2k_bmc_bar0_addr_is_set(parent)) + return false; + + /* Restore LS2K BMC PCI-E config data */ + pci_write_config_dword(pdev, PCI_COMMAND, ddata->bmc_pci_data.pci_command); + pci_write_config_dword(pdev, PCI_BASE_ADDRESS_0, ddata->bmc_pci_data.base_address0); + pci_write_config_dword(pdev, PCI_INTERRUPT_LINE, ddata->bmc_pci_data.interrupt_line); + + return 0; +} + +static void ls2k_bmc_events_fn(struct work_struct *work) +{ + struct ls2k_bmc_ddata *ddata = container_of(work, struct ls2k_bmc_ddata, bmc_reset_work); + + /* + * The PCI-E is lost when the BMC resets, at which point access to the PCI-E + * from other CPUs is suspended to prevent a crash. + */ + stop_machine(ls2k_bmc_recover_pci_data, ddata, NULL); + + if (IS_ENABLED(CONFIG_VT)) { + /* Re-push the display due to previous PCI-E loss. */ + set_console(vt_move_to_console(MAX_NR_CONSOLES - 1, 1)); + } +} + +static irqreturn_t ls2k_bmc_interrupt(int irq, void *arg) +{ + struct ls2k_bmc_ddata *ddata = arg; + static unsigned long last_jiffies; + + if (system_state != SYSTEM_RUNNING) + return IRQ_HANDLED; + + /* Skip interrupt in LS2K_BMC_INT_INTERVAL */ + if (time_after(jiffies, last_jiffies + LS2K_BMC_INT_INTERVAL)) { + schedule_work(&ddata->bmc_reset_work); + last_jiffies = jiffies; + } + + return IRQ_HANDLED; +} + +/* + * Saves the BMC parent device (LS7A) and its own PCI configuration space registers + * that need to be restored after BMC reset. 
+ */ +static void ls2k_bmc_save_pci_data(struct pci_dev *pdev, struct ls2k_bmc_ddata *ddata) +{ + struct pci_dev *parent = pdev->bus->self; + int base, i = 0; + + pci_read_config_dword(parent, PCI_COMMAND, &ddata->bridge_pci_data.pci_command); + + for (base = PCI_BASE_ADDRESS_0; base <= PCI_BASE_ADDRESS_5; base += PCI_REG_STRIDE, i++) + pci_read_config_dword(parent, base, &ddata->bridge_pci_data.base_address[i]); + + pci_read_config_dword(parent, PCI_ROM_ADDRESS, &ddata->bridge_pci_data.rom_addreess); + pci_read_config_dword(parent, PCI_INTERRUPT_LINE, &ddata->bridge_pci_data.interrupt_line); + + pci_read_config_dword(parent, parent->msi_cap + PCI_MSI_ADDRESS_LO, + &ddata->bridge_pci_data.msi_lo); + pci_read_config_dword(parent, parent->msi_cap + PCI_MSI_ADDRESS_HI, + &ddata->bridge_pci_data.msi_hi); + + pci_read_config_dword(parent, parent->pcie_cap + PCI_EXP_DEVCTL, + &ddata->bridge_pci_data.devctl); + pci_read_config_dword(parent, parent->pcie_cap + PCI_EXP_LNKCAP, + &ddata->bridge_pci_data.linkcap); + pci_read_config_dword(parent, parent->pcie_cap + PCI_EXP_LNKCTL, + &ddata->bridge_pci_data.linkctl_sts); + + pci_read_config_dword(parent, LS7A_GEN2_CTL, &ddata->bridge_pci_data.gen2_ctrl); + ddata->bridge_pci_data.gen2_ctrl |= FIELD_PREP(LS7A_GEN2_SPEED_CHANG, 0x1) | + FIELD_PREP(LS7A_CONF_PHY_TX, 0x0); + + pci_read_config_dword(parent, LS7A_SYMBOL_TIMER, &ddata->bridge_pci_data.symbol_timer); + ddata->bridge_pci_data.symbol_timer |= LS7A_MASK_LEN_MATCH; + + pci_read_config_dword(pdev, PCI_COMMAND, &ddata->bmc_pci_data.pci_command); + pci_read_config_dword(pdev, PCI_BASE_ADDRESS_0, &ddata->bmc_pci_data.base_address0); + pci_read_config_dword(pdev, PCI_INTERRUPT_LINE, &ddata->bmc_pci_data.interrupt_line); +} + +static int ls2k_bmc_init(struct ls2k_bmc_ddata *ddata) +{ + struct pci_dev *pdev = to_pci_dev(ddata->dev); + void __iomem *gpio_base; + int gpio_irq, ret, val; + + ls2k_bmc_save_pci_data(pdev, ddata); + + INIT_WORK(&ddata->bmc_reset_work, ls2k_bmc_events_fn); + + ret = devm_request_irq(&pdev->dev, pdev->irq, ls2k_bmc_interrupt, + IRQF_SHARED | IRQF_TRIGGER_FALLING, "ls2kbmc pcie", ddata); + if (ret) { + dev_err(ddata->dev, "Failed to request LS2KBMC PCI-E IRQ %d.\n", pdev->irq); + return ret; + } + + gpio_base = ioremap(LOONGSON_GPIO_REG_BASE, LOONGSON_GPIO_REG_SIZE); + if (!gpio_base) + return -ENOMEM; + + /* Disable GPIO output */ + val = readl(gpio_base + LOONGSON_GPIO_OEN); + writel(val | BIT(LS2K_BMC_RESET_GPIO), gpio_base + LOONGSON_GPIO_OEN); + + /* Enable GPIO functionality */ + val = readl(gpio_base + LOONGSON_GPIO_FUNC); + writel(val & ~BIT(LS2K_BMC_RESET_GPIO), gpio_base + LOONGSON_GPIO_FUNC); + + /* Set GPIO interrupts to low-level active */ + val = readl(gpio_base + LOONGSON_GPIO_INTPOL); + writel(val & ~BIT(LS2K_BMC_RESET_GPIO), gpio_base + LOONGSON_GPIO_INTPOL); + + /* Enable GPIO interrupts */ + val = readl(gpio_base + LOONGSON_GPIO_INTEN); + writel(val | BIT(LS2K_BMC_RESET_GPIO), gpio_base + LOONGSON_GPIO_INTEN); + + iounmap(gpio_base); + + /* + * Since gpio_chip->to_irq is not implemented in the Loongson-3 GPIO driver, + * acpi_register_gsi() is used to obtain the GPIO IRQ. The GPIO interrupt is a + * watchdog interrupt that is triggered when the BMC resets. 
+ */ + gpio_irq = acpi_register_gsi(NULL, LS2K_BMC_RESET_GPIO_GSI, ACPI_EDGE_SENSITIVE, + ACPI_ACTIVE_LOW); + if (gpio_irq < 0) + return gpio_irq; + + ret = devm_request_irq(ddata->dev, gpio_irq, ls2k_bmc_interrupt, + IRQF_SHARED | IRQF_TRIGGER_FALLING, "ls2kbmc gpio", ddata); + if (ret) + dev_err(ddata->dev, "Failed to request LS2KBMC GPIO IRQ %d.\n", gpio_irq); + + acpi_unregister_gsi(LS2K_BMC_RESET_GPIO_GSI); + return ret; +} + /* * Currently the Loongson-2K BMC hardware does not have an I2C interface to adapt to the * resolution. We set the resolution by presetting "video=1280x1024-16@2M" to the BMC memory. @@ -134,6 +460,7 @@ static int ls2k_bmc_parse_mode(struct pci_dev *pdev, struct simplefb_platform_da static int ls2k_bmc_probe(struct pci_dev *dev, const struct pci_device_id *id) { struct simplefb_platform_data pd; + struct ls2k_bmc_ddata *ddata; resource_size_t base; int ret; @@ -141,6 +468,18 @@ static int ls2k_bmc_probe(struct pci_dev *dev, const struct pci_device_id *id) if (ret) return ret; + ddata = devm_kzalloc(&dev->dev, sizeof(*ddata), GFP_KERNEL); + if (!ddata) { + ret = -ENOMEM; + goto disable_pci; + } + + ddata->dev = &dev->dev; + + ret = ls2k_bmc_init(ddata); + if (ret) + goto disable_pci; + ret = ls2k_bmc_parse_mode(dev, &pd); if (ret) goto disable_pci; From 62aec8a0a5b61f149bbe518c636e38e484812499 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 3 Sep 2025 10:45:48 +0100 Subject: [PATCH 3182/3206] mfd: cs42l43: Move IRQ enable/disable to encompass force suspend As pm_runtime_force_suspend() will force the device state to suspend, the driver needs to ensure no IRQ handlers are currently running. If not, those handlers may find they are now running on suspended hardware despite holding a PM runtime reference. disable_irq() will sync any currently running handlers, so move the IRQ disabling to cover the whole of the forced suspend state to avoid such race conditions. Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20250903094549.271068-6-ckeepax@opensource.cirrus.com Signed-off-by: Lee Jones --- drivers/mfd/cs42l43.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mfd/cs42l43.c b/drivers/mfd/cs42l43.c index 07c8f1b8183eec..959298c8232f45 100644 --- a/drivers/mfd/cs42l43.c +++ b/drivers/mfd/cs42l43.c @@ -1151,6 +1151,8 @@ static int cs42l43_suspend(struct device *dev) return ret; } + disable_irq(cs42l43->irq); + ret = pm_runtime_force_suspend(dev); if (ret) { dev_err(cs42l43->dev, "Failed to force suspend: %d\n", ret); @@ -1164,8 +1166,6 @@ static int cs42l43_suspend(struct device *dev) if (ret) return ret; - disable_irq(cs42l43->irq); - return 0; } @@ -1196,14 +1196,14 @@ static int cs42l43_resume(struct device *dev) if (ret) return ret; - enable_irq(cs42l43->irq); - ret = pm_runtime_force_resume(dev); if (ret) { dev_err(cs42l43->dev, "Failed to force resume: %d\n", ret); return ret; } + enable_irq(cs42l43->irq); + return 0; } From afe0f94992dbe6cc6a38fdb2c8ed2f0f265e3e6e Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 3 Sep 2025 10:45:49 +0100 Subject: [PATCH 3183/3206] mfd: cs42l43: Remove IRQ masking in suspend Now that the individual child drivers mask their own IRQs, there is no need for the MFD code to do so anymore.
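A condensed sketch of the suspend/resume ordering the two cs42l43 patches establish; the calls are the ones from the diffs, while the wrapper names and parameters are illustrative only:

#include <linux/interrupt.h>
#include <linux/pm_runtime.h>

static int example_suspend(struct device *dev, int irq)
{
	disable_irq(irq);			/* waits for in-flight handlers */
	return pm_runtime_force_suspend(dev);	/* hardware is now suspended */
}

static int example_resume(struct device *dev, int irq)
{
	int ret = pm_runtime_force_resume(dev);

	if (!ret)
		enable_irq(irq);	/* only once the hardware is back */

	return ret;
}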
Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20250903094549.271068-7-ckeepax@opensource.cirrus.com Signed-off-by: Lee Jones --- drivers/mfd/cs42l43.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/drivers/mfd/cs42l43.c b/drivers/mfd/cs42l43.c index 959298c8232f45..107cfb983fec41 100644 --- a/drivers/mfd/cs42l43.c +++ b/drivers/mfd/cs42l43.c @@ -1117,24 +1117,6 @@ static int cs42l43_suspend(struct device *dev) { struct cs42l43 *cs42l43 = dev_get_drvdata(dev); - static const struct reg_sequence mask_all[] = { - { CS42L43_DECIM_MASK, 0xFFFFFFFF, }, - { CS42L43_EQ_MIX_MASK, 0xFFFFFFFF, }, - { CS42L43_ASP_MASK, 0xFFFFFFFF, }, - { CS42L43_PLL_MASK, 0xFFFFFFFF, }, - { CS42L43_SOFT_MASK, 0xFFFFFFFF, }, - { CS42L43_SWIRE_MASK, 0xFFFFFFFF, }, - { CS42L43_MSM_MASK, 0xFFFFFFFF, }, - { CS42L43_ACC_DET_MASK, 0xFFFFFFFF, }, - { CS42L43_I2C_TGT_MASK, 0xFFFFFFFF, }, - { CS42L43_SPI_MSTR_MASK, 0xFFFFFFFF, }, - { CS42L43_SW_TO_SPI_BRIDGE_MASK, 0xFFFFFFFF, }, - { CS42L43_OTP_MASK, 0xFFFFFFFF, }, - { CS42L43_CLASS_D_AMP_MASK, 0xFFFFFFFF, }, - { CS42L43_GPIO_INT_MASK, 0xFFFFFFFF, }, - { CS42L43_ASRC_MASK, 0xFFFFFFFF, }, - { CS42L43_HPOUT_MASK, 0xFFFFFFFF, }, - }; int ret; ret = pm_runtime_resume_and_get(dev); @@ -1143,14 +1125,6 @@ static int cs42l43_suspend(struct device *dev) return ret; } - /* The IRQs will be re-enabled on resume by the cache sync */ - ret = regmap_multi_reg_write_bypassed(cs42l43->regmap, - mask_all, ARRAY_SIZE(mask_all)); - if (ret) { - dev_err(cs42l43->dev, "Failed to mask IRQs: %d\n", ret); - return ret; - } - disable_irq(cs42l43->irq); ret = pm_runtime_force_suspend(dev); From 605c9820e44de2da7d67acf66484136561da63a2 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Tue, 9 Sep 2025 21:23:07 +0300 Subject: [PATCH 3184/3206] mfd: max77705: Setup the core driver as an interrupt controller The current implementation describes only the MFD's own topsys interrupts. However, max77705 has a register which indicates the interrupt source, i.e. it acts as an interrupt controller. There are four interrupt sources in max77705: topsys, charger, fuelgauge, and the USB Type-C manager. Set up the max77705 MFD parent as an interrupt controller. Delete the topsys interrupts because they are currently unused. Remove the shared interrupt flag, because we are an interrupt controller now, and subdevices should request interrupts from us.
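A sketch of the child side under this new arrangement. regmap_irq_get_virq() is the standard way to translate a source interrupt into a Linux virq, but how the parent's irq chip data reaches the child, the source index, and the handler are all assumptions here:

#include <linux/interrupt.h>
#include <linux/regmap.h>

static irqreturn_t max77705_child_isr(int irq, void *data)
{
	return IRQ_HANDLED;	/* placeholder */
}

static int max77705_child_request_irq(struct device *dev,
				      struct regmap_irq_chip_data *irq_data)
{
	/* Index into the parent's max77705_irqs[] table; 1 = topsys (assumed). */
	int virq = regmap_irq_get_virq(irq_data, 1);

	if (virq <= 0)
		return virq ? virq : -ENXIO;

	return devm_request_threaded_irq(dev, virq, NULL, max77705_child_isr,
					 IRQF_ONESHOT, "max77705-child", NULL);
}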
Fixes: c8d50f029748 ("mfd: Add new driver for MAX77705 PMIC") Signed-off-by: Dzmitry Sankouski Link: https://lore.kernel.org/r/20250909-max77705-fix_interrupt_handling-v3-1-233c5a1a20b5@gmail.com Signed-off-by: Lee Jones --- drivers/mfd/max77705.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/drivers/mfd/max77705.c b/drivers/mfd/max77705.c index 6b263bacb8c28d..62dbc63efa8d08 100644 --- a/drivers/mfd/max77705.c +++ b/drivers/mfd/max77705.c @@ -61,21 +61,21 @@ static const struct regmap_config max77705_regmap_config = { .max_register = MAX77705_PMIC_REG_USBC_RESET, }; -static const struct regmap_irq max77705_topsys_irqs[] = { - { .mask = MAX77705_SYSTEM_IRQ_BSTEN_INT, }, - { .mask = MAX77705_SYSTEM_IRQ_SYSUVLO_INT, }, - { .mask = MAX77705_SYSTEM_IRQ_SYSOVLO_INT, }, - { .mask = MAX77705_SYSTEM_IRQ_TSHDN_INT, }, - { .mask = MAX77705_SYSTEM_IRQ_TM_INT, }, +static const struct regmap_irq max77705_irqs[] = { + { .mask = MAX77705_SRC_IRQ_CHG, }, + { .mask = MAX77705_SRC_IRQ_TOP, }, + { .mask = MAX77705_SRC_IRQ_FG, }, + { .mask = MAX77705_SRC_IRQ_USBC, }, }; -static const struct regmap_irq_chip max77705_topsys_irq_chip = { - .name = "max77705-topsys", - .status_base = MAX77705_PMIC_REG_SYSTEM_INT, - .mask_base = MAX77705_PMIC_REG_SYSTEM_INT_MASK, +static const struct regmap_irq_chip max77705_irq_chip = { + .name = "max77705", + .status_base = MAX77705_PMIC_REG_INTSRC, + .ack_base = MAX77705_PMIC_REG_INTSRC, + .mask_base = MAX77705_PMIC_REG_INTSRC_MASK, .num_regs = 1, - .irqs = max77705_topsys_irqs, - .num_irqs = ARRAY_SIZE(max77705_topsys_irqs), + .irqs = max77705_irqs, + .num_irqs = ARRAY_SIZE(max77705_irqs), }; static int max77705_i2c_probe(struct i2c_client *i2c) @@ -110,19 +110,12 @@ static int max77705_i2c_probe(struct i2c_client *i2c) ret = devm_regmap_add_irq_chip(dev, max77705->regmap, i2c->irq, - IRQF_ONESHOT | IRQF_SHARED, 0, - &max77705_topsys_irq_chip, + IRQF_ONESHOT, 0, + &max77705_irq_chip, &irq_data); if (ret) return dev_err_probe(dev, ret, "Failed to add IRQ chip\n"); - /* Unmask interrupts from all blocks in interrupt source register */ - ret = regmap_update_bits(max77705->regmap, - MAX77705_PMIC_REG_INTSRC_MASK, - MAX77705_SRC_IRQ_ALL, (unsigned int)~MAX77705_SRC_IRQ_ALL); - if (ret < 0) - return dev_err_probe(dev, ret, "Could not unmask interrupts in INTSRC\n"); - domain = regmap_irq_get_domain(irq_data); ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, From da32b0e82c523b76265ba1ad25d7ea74f0ece402 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 10 Sep 2025 20:59:06 +0300 Subject: [PATCH 3185/3206] mfd: rz-mtu3: Fix MTU5 NFCR register offset The NFCR register for MTU5 is at 0x1a95 offset according to Datasheet Page 725, Table 16.4. The address of all registers is offset by 0x1200, making the proper address of MTU5 NFCR register be 0x895. 
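As a quick sanity check of that arithmetic: 0x1a95 - 0x1200 = 0x895, so the replacement value in the hunk below is consistent with the datasheet address and the stated 0x1200 base offset.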
Cc: stable@vger.kernel.org Fixes: 654c293e1687 ("mfd: Add Renesas RZ/G2L MTU3a core driver") Signed-off-by: Cosmin Tanislav Reviewed-by: Biju Das Link: https://lore.kernel.org/r/20250910175914.12956-1-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Lee Jones --- drivers/mfd/rz-mtu3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/rz-mtu3.c b/drivers/mfd/rz-mtu3.c index f3dac4a29a8324..9cdfef610398f3 100644 --- a/drivers/mfd/rz-mtu3.c +++ b/drivers/mfd/rz-mtu3.c @@ -32,7 +32,7 @@ static const unsigned long rz_mtu3_8bit_ch_reg_offs[][13] = { [RZ_MTU3_CHAN_2] = MTU_8BIT_CH_1_2(0x204, 0x092, 0x205, 0x200, 0x20c, 0x201, 0x202), [RZ_MTU3_CHAN_3] = MTU_8BIT_CH_3_4_6_7(0x008, 0x093, 0x02c, 0x000, 0x04c, 0x002, 0x004, 0x005, 0x038), [RZ_MTU3_CHAN_4] = MTU_8BIT_CH_3_4_6_7(0x009, 0x094, 0x02d, 0x001, 0x04d, 0x003, 0x006, 0x007, 0x039), - [RZ_MTU3_CHAN_5] = MTU_8BIT_CH_5(0xab2, 0x1eb, 0xab4, 0xab6, 0xa84, 0xa85, 0xa86, 0xa94, 0xa95, 0xa96, 0xaa4, 0xaa5, 0xaa6), + [RZ_MTU3_CHAN_5] = MTU_8BIT_CH_5(0xab2, 0x895, 0xab4, 0xab6, 0xa84, 0xa85, 0xa86, 0xa94, 0xa95, 0xa96, 0xaa4, 0xaa5, 0xaa6), [RZ_MTU3_CHAN_6] = MTU_8BIT_CH_3_4_6_7(0x808, 0x893, 0x82c, 0x800, 0x84c, 0x802, 0x804, 0x805, 0x838), [RZ_MTU3_CHAN_7] = MTU_8BIT_CH_3_4_6_7(0x809, 0x894, 0x82d, 0x801, 0x84d, 0x803, 0x806, 0x807, 0x839), [RZ_MTU3_CHAN_8] = MTU_8BIT_CH_8(0x404, 0x098, 0x400, 0x406, 0x401, 0x402, 0x403) From 800d2c631c2496b676b2ffa969b4cab1d59d5a9b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:25:45 +0200 Subject: [PATCH 3186/3206] mfd: vexpress-sysreg: Use more common syntax for compound literals The (typeof(foo)) construct is unusual in the kernel, use a more typical syntax by explicitly spelling out the type. Link: https://lore.kernel.org/all/20250909-gpio-mmio-gpio-conv-part4-v1-13-9f723dc3524a@linaro.org/ Suggested-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski Acked-by: Sudeep Holla Link: https://lore.kernel.org/r/20250910-make-compound-literals-normal-again-v1-1-076ee7738a0b@linaro.org Signed-off-by: Lee Jones --- drivers/mfd/vexpress-sysreg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/vexpress-sysreg.c b/drivers/mfd/vexpress-sysreg.c index 9399eb850ca29b..f49cee91f71cc2 100644 --- a/drivers/mfd/vexpress-sysreg.c +++ b/drivers/mfd/vexpress-sysreg.c @@ -120,7 +120,7 @@ static int vexpress_sysreg_probe(struct platform_device *pdev) if (!mmc_gpio_chip) return -ENOMEM; - config = (typeof(config)){ + config = (struct gpio_generic_chip_config) { .dev = &pdev->dev, .sz = 4, .dat = base + SYS_MCI, From 73ba00d86a9800d8f5ab1f2e1eb4b852da85aad4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Thu, 11 Sep 2025 14:43:46 +0200 Subject: [PATCH 3187/3206] mfd: 88pm886: Add GPADC cell MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a cell for the PMIC's onboard General Purpose ADC. 
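MFD cells bind to platform drivers purely by name, so the string added below is the whole contract. A hedged sketch of the sub-driver side that would claim this cell; the "88pm886-gpadc" name matches the cell, while the probe function and everything else are assumptions, since the actual GPADC driver lands separately:

#include <linux/module.h>
#include <linux/platform_device.h>

static int pm886_gpadc_probe_sketch(struct platform_device *pdev)
{
	/* A real driver would register an IIO ADC device here. */
	return 0;
}

static struct platform_driver pm886_gpadc_driver_sketch = {
	.driver = {
		.name = "88pm886-gpadc",	/* must match the MFD cell name */
	},
	.probe = pm886_gpadc_probe_sketch,
};
module_platform_driver(pm886_gpadc_driver_sketch);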
Acked-by: Karel Balej # for the PMIC Signed-off-by: Duje Mihanović Link: https://lore.kernel.org/r/20250911-88pm886-gpadc-v4-3-60452710d3a0@dujemihanovic.xyz Signed-off-by: Lee Jones --- drivers/mfd/88pm886.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/88pm886.c b/drivers/mfd/88pm886.c index 39dd9a818b0f0e..e411d8dee55420 100644 --- a/drivers/mfd/88pm886.c +++ b/drivers/mfd/88pm886.c @@ -35,6 +35,7 @@ static const struct resource pm886_onkey_resources[] = { }; static const struct mfd_cell pm886_devs[] = { + MFD_CELL_NAME("88pm886-gpadc"), MFD_CELL_RES("88pm886-onkey", pm886_onkey_resources), MFD_CELL_NAME("88pm886-regulator"), MFD_CELL_NAME("88pm886-rtc"), From c91a0e4e549d0457c61f2199fcd84d699400bee1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 15 Sep 2025 14:29:36 +0300 Subject: [PATCH 3188/3206] mfd: intel-lpss: Add Intel Wildcat Lake LPSS PCI IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Intel Wildcat Lake PCI IDs. Signed-off-by: Ilpo Järvinen Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250915112936.10696-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-pci.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index 1a5b8b13f8d0b2..8d92c895d3aeff 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -367,6 +367,19 @@ static const struct pci_device_id intel_lpss_pci_ids[] = { { PCI_VDEVICE(INTEL, 0x4b79), (kernel_ulong_t)&ehl_i2c_info }, { PCI_VDEVICE(INTEL, 0x4b7a), (kernel_ulong_t)&ehl_i2c_info }, { PCI_VDEVICE(INTEL, 0x4b7b), (kernel_ulong_t)&ehl_i2c_info }, + /* WCL */ + { PCI_VDEVICE(INTEL, 0x4d25), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0x4d26), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0x4d27), (kernel_ulong_t)&tgl_spi_info }, + { PCI_VDEVICE(INTEL, 0x4d30), (kernel_ulong_t)&tgl_spi_info }, + { PCI_VDEVICE(INTEL, 0x4d46), (kernel_ulong_t)&tgl_spi_info }, + { PCI_VDEVICE(INTEL, 0x4d50), (kernel_ulong_t)&ehl_i2c_info }, + { PCI_VDEVICE(INTEL, 0x4d51), (kernel_ulong_t)&ehl_i2c_info }, + { PCI_VDEVICE(INTEL, 0x4d52), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0x4d78), (kernel_ulong_t)&ehl_i2c_info }, + { PCI_VDEVICE(INTEL, 0x4d79), (kernel_ulong_t)&ehl_i2c_info }, + { PCI_VDEVICE(INTEL, 0x4d7a), (kernel_ulong_t)&ehl_i2c_info }, + { PCI_VDEVICE(INTEL, 0x4d7b), (kernel_ulong_t)&ehl_i2c_info }, /* JSL */ { PCI_VDEVICE(INTEL, 0x4da8), (kernel_ulong_t)&spt_uart_info }, { PCI_VDEVICE(INTEL, 0x4da9), (kernel_ulong_t)&spt_uart_info }, From 59863239e09f435e4dd3393637adc4d657772de5 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Fri, 12 Sep 2025 14:07:40 +0200 Subject: [PATCH 3189/3206] dt-bindings: mfd: tps6594: Allow gpio-line-names Setting the signal names in the device tree was already possible, but it will lead to a warning. Allow the gpio-line-names property to fix that. 
Signed-off-by: Michael Walle Acked-by: "Rob Herring (Arm)" Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/ti,tps6594.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/mfd/ti,tps6594.yaml b/Documentation/devicetree/bindings/mfd/ti,tps6594.yaml index a48cb00afe4381..ca17fbdea691d4 100644 --- a/Documentation/devicetree/bindings/mfd/ti,tps6594.yaml +++ b/Documentation/devicetree/bindings/mfd/ti,tps6594.yaml @@ -41,6 +41,7 @@ properties: system-power-controller: true gpio-controller: true + gpio-line-names: true '#gpio-cells': const: 2 From 354f31e9d2a3e4799d3d057a9e1c9f7ae7348bba Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Fri, 12 Sep 2025 14:07:43 +0200 Subject: [PATCH 3190/3206] dt-bindings: watchdog: Add SMARC-sAM67 support The SMARC-sAM67 board has an on-board uC which has the same register interface as the older CPLD implementation on the SMARC-sAL28 board. Although the MCU emulates the same behavior, be prepared for any quirks and add a board specific compatible. Signed-off-by: Michael Walle Acked-by: Krzysztof Kozlowski Reviewed-by: Guenter Roeck Signed-off-by: Lee Jones --- .../devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml b/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml index 872a8471ef65fe..0821ba0e84a3ca 100644 --- a/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml @@ -18,7 +18,12 @@ allOf: properties: compatible: - const: kontron,sl28cpld-wdt + oneOf: + - items: + - enum: + - kontron,sa67mcu-wdt + - const: kontron,sl28cpld-wdt + - const: kontron,sl28cpld-wdt reg: maxItems: 1 From 02dde2c4c32e8c8404a890d6092a66dbe0f36564 Mon Sep 17 00:00:00 2001 From: Jihed Chaibi Date: Sun, 14 Sep 2025 21:25:14 +0200 Subject: [PATCH 3191/3206] dt-bindings: mfd: twl: Add missing sub-nodes for TWL4030 & TWL603x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the main TI TWL-family binding to be self-contained and to fix pre-existing validation errors. To ensure future patches are bisectable, child nodes whose bindings are in other patches (audio, keypad, usb, etc.) are now defined using a flexible 'additionalProperties: true' pattern. This removes hard dependencies between the MFD and subsystem bindings. The complete dtbs_check for this binding is clean except for two warnings originating from pre-existing bugs in the OMAP DTS files, for which fixes have already been submitted separately [1][2]. 
Reviewed-by: Rob Herring Acked-by: Uwe Kleine-König Signed-off-by: Jihed Chaibi Reviewed-by: Andreas Kemnade Signed-off-by: Lee Jones --- .../devicetree/bindings/mfd/ti,twl.yaml | 319 +++++++++++++++--- .../devicetree/bindings/mfd/twl4030-power.txt | 48 --- .../devicetree/bindings/pwm/ti,twl-pwm.txt | 17 - .../devicetree/bindings/pwm/ti,twl-pwmled.txt | 17 - 4 files changed, 273 insertions(+), 128 deletions(-) delete mode 100644 Documentation/devicetree/bindings/mfd/twl4030-power.txt delete mode 100644 Documentation/devicetree/bindings/pwm/ti,twl-pwm.txt delete mode 100644 Documentation/devicetree/bindings/pwm/ti,twl-pwmled.txt diff --git a/Documentation/devicetree/bindings/mfd/ti,twl.yaml b/Documentation/devicetree/bindings/mfd/ti,twl.yaml index f162ab60c09b56..776b04e182cb2a 100644 --- a/Documentation/devicetree/bindings/mfd/ti,twl.yaml +++ b/Documentation/devicetree/bindings/mfd/ti,twl.yaml @@ -11,9 +11,9 @@ maintainers: description: | The TWLs are Integrated Power Management Chips. - Some version might contain much more analog function like + Some versions might contain much more analog functions like USB transceiver or Audio amplifier. - These chips are connected to an i2c bus. + These chips are connected to an I2C bus. allOf: - if: @@ -49,33 +49,14 @@ allOf: ti,retain-on-reset: false properties: - madc: - type: object - $ref: /schemas/iio/adc/ti,twl4030-madc.yaml - unevaluatedProperties: false - charger: - type: object $ref: /schemas/power/supply/twl4030-charger.yaml unevaluatedProperties: false - pwrbutton: - type: object - additionalProperties: false - properties: - compatible: - const: ti,twl4030-pwrbutton - interrupts: - items: - - items: - const: 8 - - watchdog: - type: object - additionalProperties: false - properties: - compatible: - const: ti,twl4030-wdt + gpadc: false + + usb-comparator: false + - if: properties: compatible: @@ -106,15 +87,30 @@ allOf: properties: charger: - type: object - properties: - compatible: - const: ti,twl6030-charger + $ref: /schemas/power/supply/ti,twl6030-charger.yaml + unevaluatedProperties: false + gpadc: - type: object properties: compatible: const: ti,twl6030-gpadc + + pwrbutton: false + + madc: false + + watchdog: false + + audio: false + + keypad: false + + twl4030-usb: false + + gpio: false + + power: false + - if: properties: compatible: @@ -142,23 +138,36 @@ allOf: properties: charger: - type: object - properties: - compatible: - items: - - const: ti,twl6032-charger - - const: ti,twl6030-charger + $ref: /schemas/power/supply/ti,twl6030-charger.yaml + unevaluatedProperties: false + gpadc: - type: object properties: compatible: const: ti,twl6032-gpadc + pwrbutton: false + + madc: false + + watchdog: false + + audio: false + + keypad: false + + twl4030-usb: false + + gpio: false + + power: false + properties: compatible: - description: - TWL4030 for integrated power-management/audio CODEC device used in OMAP3 - based boards + description: > + TWL4030 for integrated power-management/audio CODEC device used in + OMAP3 based boards. 
+ TWL6030/32 for integrated power-management used in OMAP4 based boards enum: - ti,twl4030 @@ -181,28 +190,221 @@ properties: "#clock-cells": const: 1 + clocks: + maxItems: 1 + + clock-names: + const: fck + charger: type: object - additionalProperties: true + properties: compatible: true + required: - compatible rtc: type: object additionalProperties: false + properties: compatible: const: ti,twl4030-rtc interrupts: maxItems: 1 + madc: + type: object + $ref: /schemas/iio/adc/ti,twl4030-madc.yaml + unevaluatedProperties: false + + pwrbutton: + type: object + additionalProperties: false + + properties: + compatible: + const: ti,twl4030-pwrbutton + interrupts: + items: + - items: + const: 8 + + watchdog: + type: object + additionalProperties: false + + properties: + compatible: + const: ti,twl4030-wdt + + audio: + type: object + additionalProperties: true + + properties: + compatible: + const: ti,twl4030-audio + + required: + - compatible + + keypad: + type: object + additionalProperties: true + + properties: + compatible: + const: ti,twl4030-keypad + + required: + - compatible + + twl4030-usb: + type: object + additionalProperties: true + + properties: + compatible: + const: ti,twl4030-usb + + required: + - compatible + + gpio: + type: object + additionalProperties: true + + properties: + compatible: + const: ti,twl4030-gpio + + required: + - compatible + + power: + type: object + additionalProperties: false + description: > + The power management module inside the TWL4030 provides several + facilities to control the power resources, including power scripts. + + For now, the binding only supports the complete shutdown of the + system after poweroff. + + Board-specific compatible strings may be used for platform-specific + power configurations. + + A board-specific compatible string (e.g., ti,twl4030-power-omap3-evm) + may be paired with a generic fallback (generally for power saving mode). + + properties: + compatible: + oneOf: + # Case 1: A single compatible string is provided. + - enum: + - ti,twl4030-power + - ti,twl4030-power-reset + - ti,twl4030-power-idle + - ti,twl4030-power-idle-osc-off + - ti,twl4030-power-omap3-sdp + - ti,twl4030-power-omap3-ldp + - ti,twl4030-power-omap3-evm + + # Case 2: The specific, valid fallback for 'idle-osc-off'. + - items: + - const: ti,twl4030-power-idle-osc-off + - const: ti,twl4030-power-idle + + # Case 3: The specific, valid fallback for 'omap3-evm'. + - items: + - const: ti,twl4030-power-omap3-evm + - const: ti,twl4030-power-idle + + ti,system-power-controller: + type: boolean + deprecated: true + description: > + DEPRECATED. The standard 'system-power-controller' + property on the parent node should be used instead. + + ti,use_poweroff: + type: boolean + deprecated: true + description: DEPRECATED, to be removed. + + required: + - compatible + + gpadc: + type: object + $ref: /schemas/iio/adc/ti,twl6030-gpadc.yaml + unevaluatedProperties: false + + properties: + compatible: true + + usb-comparator: + type: object + additionalProperties: true + + properties: + compatible: + const: ti,twl6030-usb + + required: + - compatible + + pwm: + type: object + $ref: /schemas/pwm/pwm.yaml# + unevaluatedProperties: false + description: + PWM controllers (PWM1 and PWM2 on TWL4030, PWM0 and PWM1 on TWL6030/32). 
+ + properties: + compatible: + enum: + - ti,twl4030-pwm + - ti,twl6030-pwm + + '#pwm-cells': + const: 2 + + required: + - compatible + - '#pwm-cells' + + pwmled: + type: object + $ref: /schemas/pwm/pwm.yaml# + unevaluatedProperties: false + description: > + PWM controllers connected to LED terminals (PWMA and PWMB on TWL4030. + + LED PWM on TWL6030/32, mainly used as charging indicator LED). + + properties: + compatible: + enum: + - ti,twl4030-pwmled + - ti,twl6030-pwmled + + '#pwm-cells': + const: 2 + + required: + - compatible + - '#pwm-cells' + patternProperties: "^regulator-": type: object unevaluatedProperties: false $ref: /schemas/regulator/regulator.yaml + properties: compatible: true regulator-initial-mode: @@ -211,12 +413,13 @@ patternProperties: # with low power consumption with low load current capability - 0x0e # Active mode, the regulator can deliver its nominal output # voltage with full-load current capability + ti,retain-on-reset: - description: - Does not turn off the supplies during warm - reset. Could be needed for VMMC, as TWL6030 - reset sequence for this signal does not comply - with the SD specification. + description: > + Does not turn off the supplies during warm reset. + + Could be needed for VMMC, as TWL6030 reset sequence for + this signal does not comply with the SD specification. type: boolean unevaluatedProperties: false @@ -271,6 +474,16 @@ examples: compatible = "ti,twl6030-vmmc"; ti,retain-on-reset; }; + + pwm { + compatible = "ti,twl6030-pwm"; + #pwm-cells = <2>; + }; + + pwmled { + compatible = "ti,twl6030-pwmled"; + #pwm-cells = <2>; + }; }; }; @@ -325,6 +538,20 @@ examples: watchdog { compatible = "ti,twl4030-wdt"; }; + + power { + compatible = "ti,twl4030-power"; + }; + + pwm { + compatible = "ti,twl4030-pwm"; + #pwm-cells = <2>; + }; + + pwmled { + compatible = "ti,twl4030-pwmled"; + #pwm-cells = <2>; + }; }; }; ... diff --git a/Documentation/devicetree/bindings/mfd/twl4030-power.txt b/Documentation/devicetree/bindings/mfd/twl4030-power.txt deleted file mode 100644 index 3d19963312ce0a..00000000000000 --- a/Documentation/devicetree/bindings/mfd/twl4030-power.txt +++ /dev/null @@ -1,48 +0,0 @@ -Texas Instruments TWL family (twl4030) reset and power management module - -The power management module inside the TWL family provides several facilities -to control the power resources, including power scripts. For now, the -binding only supports the complete shutdown of the system after poweroff. - -Required properties: -- compatible : must be one of the following - "ti,twl4030-power" - "ti,twl4030-power-reset" - "ti,twl4030-power-idle" - "ti,twl4030-power-idle-osc-off" - -The use of ti,twl4030-power-reset is recommended at least on -3530 that needs a special configuration for warm reset to work. - -When using ti,twl4030-power-idle, the TI recommended configuration -for idle modes is loaded to the tlw4030 PMIC. - -When using ti,twl4030-power-idle-osc-off, the TI recommended -configuration is used with the external oscillator being shut -down during off-idle. Note that this does not work on all boards -depending on how the external oscillator is wired. - -Optional properties: - -- ti,system-power-controller: This indicates that TWL4030 is the - power supply master of the system. With this flag, the chip will - initiate an ACTIVE-to-OFF or SLEEP-to-OFF transition when the - system poweroffs. 
- -- ti,use_poweroff: Deprecated name for ti,system-power-controller - -Example: -&i2c1 { - clock-frequency = <2600000>; - - twl: twl@48 { - reg = <0x48>; - interrupts = <7>; /* SYS_NIRQ cascaded to intc */ - interrupt-parent = <&intc>; - - twl_power: power { - compatible = "ti,twl4030-power"; - ti,use_poweroff; - }; - }; -}; diff --git a/Documentation/devicetree/bindings/pwm/ti,twl-pwm.txt b/Documentation/devicetree/bindings/pwm/ti,twl-pwm.txt deleted file mode 100644 index d97ca1964e9470..00000000000000 --- a/Documentation/devicetree/bindings/pwm/ti,twl-pwm.txt +++ /dev/null @@ -1,17 +0,0 @@ -Texas Instruments TWL series PWM drivers - -Supported PWMs: -On TWL4030 series: PWM1 and PWM2 -On TWL6030 series: PWM0 and PWM1 - -Required properties: -- compatible: "ti,twl4030-pwm" or "ti,twl6030-pwm" -- #pwm-cells: should be 2. See pwm.yaml in this directory for a description of - the cells format. - -Example: - -twl_pwm: pwm { - compatible = "ti,twl6030-pwm"; - #pwm-cells = <2>; -}; diff --git a/Documentation/devicetree/bindings/pwm/ti,twl-pwmled.txt b/Documentation/devicetree/bindings/pwm/ti,twl-pwmled.txt deleted file mode 100644 index 31ca1b032ef034..00000000000000 --- a/Documentation/devicetree/bindings/pwm/ti,twl-pwmled.txt +++ /dev/null @@ -1,17 +0,0 @@ -Texas Instruments TWL series PWM drivers connected to LED terminals - -Supported PWMs: -On TWL4030 series: PWMA and PWMB (connected to LEDA and LEDB terminals) -On TWL6030 series: LED PWM (mainly used as charging indicator LED) - -Required properties: -- compatible: "ti,twl4030-pwmled" or "ti,twl6030-pwmled" -- #pwm-cells: should be 2. See pwm.yaml in this directory for a description of - the cells format. - -Example: - -twl_pwmled: pwmled { - compatible = "ti,twl6030-pwmled"; - #pwm-cells = <2>; -}; From 3ed50d77924ff2e35918739df145dd429cee0ce4 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Mon, 22 Sep 2025 17:24:19 +0300 Subject: [PATCH 3192/3206] mfd: simple-mfd-i2c: Keep compatible strings in alphabetical order Reorder the of_device_id structures so that they are in alphabetical order. Signed-off-by: Ioana Ciornei Signed-off-by: Lee Jones --- drivers/mfd/simple-mfd-i2c.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/simple-mfd-i2c.c b/drivers/mfd/simple-mfd-i2c.c index 63ac2638886061..0cca7a9044cd4b 100644 --- a/drivers/mfd/simple-mfd-i2c.c +++ b/drivers/mfd/simple-mfd-i2c.c @@ -110,13 +110,13 @@ static const struct simple_mfd_data spacemit_p1 = { }; static const struct of_device_id simple_mfd_i2c_of_match[] = { + { .compatible = "fsl,ls1028aqds-fpga" }, + { .compatible = "fsl,lx2160aqds-fpga" }, { .compatible = "kontron,sl28cpld" }, - { .compatible = "silergy,sy7636a", .data = &silergy_sy7636a}, { .compatible = "maxim,max5970", .data = &maxim_max5970}, { .compatible = "maxim,max5978", .data = &maxim_max5970}, { .compatible = "maxim,max77705-battery", .data = &maxim_mon_max77705}, - { .compatible = "fsl,lx2160aqds-fpga" }, - { .compatible = "fsl,ls1028aqds-fpga" }, + { .compatible = "silergy,sy7636a", .data = &silergy_sy7636a}, { .compatible = "spacemit,p1", .data = &spacemit_p1, }, {} }; From b9d6cfe2ae699bbf230a6c8e0e32212b04bff661 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Mon, 22 Sep 2025 17:24:20 +0300 Subject: [PATCH 3193/3206] mfd: simple-mfd-i2c: Add compatible string for LX2160ARDB Extend the list of supported devices with the QIXIS FPGA found on the LX2160ARDB board. 
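Entries without a .data pointer, such as the three FPGA compatibles, fall back to the driver's default 8-bit register/value regmap and enumerate their children from devicetree alone. A sketch of the assumed default config (shape only; the exact name in the driver may differ):

	static const struct regmap_config regmap_config_8r_8v = {
		.reg_bits = 8,
		.val_bits = 8,
	};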
Signed-off-by: Ioana Ciornei Signed-off-by: Lee Jones --- drivers/mfd/simple-mfd-i2c.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/simple-mfd-i2c.c b/drivers/mfd/simple-mfd-i2c.c index 0cca7a9044cd4b..0a607a1e3ca1de 100644 --- a/drivers/mfd/simple-mfd-i2c.c +++ b/drivers/mfd/simple-mfd-i2c.c @@ -112,6 +112,7 @@ static const struct simple_mfd_data spacemit_p1 = { static const struct of_device_id simple_mfd_i2c_of_match[] = { { .compatible = "fsl,ls1028aqds-fpga" }, { .compatible = "fsl,lx2160aqds-fpga" }, + { .compatible = "fsl,lx2160ardb-fpga" }, { .compatible = "kontron,sl28cpld" }, { .compatible = "maxim,max5970", .data = &maxim_max5970}, { .compatible = "maxim,max5978", .data = &maxim_max5970}, From f97aef092e199c10a3da96ae79b571edd5362faa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 12:12:37 +0200 Subject: [PATCH 3194/3206] cpufreq: Make drivers using CPUFREQ_ETERNAL specify transition latency Commit a755d0e2d41b ("cpufreq: Honour transition_latency over transition_delay_us") caused platforms where cpuinfo.transition_latency is CPUFREQ_ETERNAL to get a very large transition latency whereas previously it had been capped at 10 ms (and later at 2 ms). This led to a user-observable regression between 6.6 and 6.12 as described by Shawn: "The dbs sampling_rate was 10000 us on 6.6 and suddently becomes 6442450 us (4294967295 / 1000 * 1.5) on 6.12 for these platforms because the default transition delay was dropped [...]. It slows down dbs governor's reacting to CPU loading change dramatically. Also, as transition_delay_us is used by schedutil governor as rate_limit_us, it shows a negative impact on device idle power consumption, because the device gets slightly less time in the lowest OPP." Evidently, the expectation of the drivers using CPUFREQ_ETERNAL as cpuinfo.transition_latency was that it would be capped by the core, but they may as well return a default transition latency value instead of CPUFREQ_ETERNAL and the core need not do anything with it. Accordingly, introduce CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS and make all of the drivers in question use it instead of CPUFREQ_ETERNAL. Also update the related Rust binding. Fixes: a755d0e2d41b ("cpufreq: Honour transition_latency over transition_delay_us") Closes: https://lore.kernel.org/linux-pm/20250922125929.453444-1-shawnguo2@yeah.net/ Reported-by: Shawn Guo Reviewed-by: Mario Limonciello (AMD) Reviewed-by: Jie Zhan Acked-by: Viresh Kumar Cc: 6.6+ # 6.6+ Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/2264949.irdbgypaU6@rafael.j.wysocki [ rjw: Fix typo in new symbol name, drop redundant type cast from Rust binding ] Tested-by: Shawn Guo # with cpufreq-dt driver Reviewed-by: Qais Yousef Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq-dt.c | 2 +- drivers/cpufreq/imx6q-cpufreq.c | 2 +- drivers/cpufreq/mediatek-cpufreq-hw.c | 2 +- drivers/cpufreq/rcpufreq_dt.rs | 2 +- drivers/cpufreq/scmi-cpufreq.c | 2 +- drivers/cpufreq/scpi-cpufreq.c | 2 +- drivers/cpufreq/spear-cpufreq.c | 2 +- include/linux/cpufreq.h | 3 +++ rust/kernel/cpufreq.rs | 7 ++++--- 9 files changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 506437489b4db2..7d5079fd168825 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -104,7 +104,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) transition_latency = dev_pm_opp_get_max_transition_latency(cpu_dev); if (!transition_latency) - transition_latency = CPUFREQ_ETERNAL; + transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; cpumask_copy(policy->cpus, priv->cpus); policy->driver_data = priv; diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index db1c88e9d3f9cd..e93697d3edfd9b 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -442,7 +442,7 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev) } if (of_property_read_u32(np, "clock-latency", &transition_latency)) - transition_latency = CPUFREQ_ETERNAL; + transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; /* * Calculate the ramp time for max voltage change in the diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index fce5aa5ceea033..ae4500ab48913d 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -309,7 +309,7 @@ static int mtk_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) latency = readl_relaxed(data->reg_bases[REG_FREQ_LATENCY]) * 1000; if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; policy->fast_switch_possible = true; diff --git a/drivers/cpufreq/rcpufreq_dt.rs b/drivers/cpufreq/rcpufreq_dt.rs index 7e1fbf9a091f74..3909022e1c7448 100644 --- a/drivers/cpufreq/rcpufreq_dt.rs +++ b/drivers/cpufreq/rcpufreq_dt.rs @@ -123,7 +123,7 @@ impl cpufreq::Driver for CPUFreqDTDriver { let mut transition_latency = opp_table.max_transition_latency_ns() as u32; if transition_latency == 0 { - transition_latency = cpufreq::ETERNAL_LATENCY_NS; + transition_latency = cpufreq::DEFAULT_TRANSITION_LATENCY_NS; } policy diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 38c165d526d144..d2a110079f5fd5 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -294,7 +294,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) latency = perf_ops->transition_latency_get(ph, domain); if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index dcbb0ae7dd476c..e530345baddf6a 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -157,7 +157,7 @@ static int scpi_cpufreq_init(struct cpufreq_policy *policy) latency = scpi_ops->get_transition_latency(cpu_dev); if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c index 707c71090cc322..2a1550e1aa21fc 100644 --- 
a/drivers/cpufreq/spear-cpufreq.c +++ b/drivers/cpufreq/spear-cpufreq.c @@ -182,7 +182,7 @@ static int spear_cpufreq_probe(struct platform_device *pdev) if (of_property_read_u32(np, "clock-latency", &spear_cpufreq.transition_latency)) - spear_cpufreq.transition_latency = CPUFREQ_ETERNAL; + spear_cpufreq.transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; cnt = of_property_count_u32_elems(np, "cpufreq_tbl"); if (cnt <= 0) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 40966512ea1812..bc8c083bc16a66 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -32,6 +32,9 @@ */ #define CPUFREQ_ETERNAL (-1) + +#define CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS NSEC_PER_MSEC + #define CPUFREQ_NAME_LEN 16 /* Print length for names. Extra 1 space for accommodating '\n' in prints */ #define CPUFREQ_NAME_PLEN (CPUFREQ_NAME_LEN + 1) diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index eea57ba95f241d..2ea735700ae75d 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -39,7 +39,8 @@ use macros::vtable; const CPUFREQ_NAME_LEN: usize = bindings::CPUFREQ_NAME_LEN as usize; /// Default transition latency value in nanoseconds. -pub const ETERNAL_LATENCY_NS: u32 = bindings::CPUFREQ_ETERNAL as u32; +pub const DEFAULT_TRANSITION_LATENCY_NS: u32 = + bindings::CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; /// CPU frequency driver flags. pub mod flags { @@ -400,13 +401,13 @@ impl TableBuilder { /// The following example demonstrates how to create a CPU frequency table. /// /// ``` -/// use kernel::cpufreq::{ETERNAL_LATENCY_NS, Policy}; +/// use kernel::cpufreq::{DEFAULT_TRANSITION_LATENCY_NS, Policy}; /// /// fn update_policy(policy: &mut Policy) { /// policy /// .set_dvfs_possible_from_any_cpu(true) /// .set_fast_switch_possible(true) -/// .set_transition_latency_ns(ETERNAL_LATENCY_NS); +/// .set_transition_latency_ns(DEFAULT_TRANSITION_LATENCY_NS); /// /// pr_info!("The policy details are: {:?}\n", (policy.cpu(), policy.cur())); /// } From f965d111e68f4a993cc44d487d416e3d954eea11 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 12:19:41 +0200 Subject: [PATCH 3195/3206] cpufreq: CPPC: Avoid using CPUFREQ_ETERNAL as transition delay If cppc_get_transition_latency() returns CPUFREQ_ETERNAL to indicate a failure to retrieve the transition latency value from the platform firmware, the CPPC cpufreq driver will use that value (converted to microseconds) as the policy transition delay, but it is way too large for any practical use. Address this by making the driver use the cpufreq's default transition latency value (in microseconds) as the transition delay if CPUFREQ_ETERNAL is returned by cppc_get_transition_latency(). Fixes: d4f3388afd48 ("cpufreq / CPPC: Set platform specific transition_delay_us") Cc: 5.19+ # 5.19 Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Mario Limonciello (AMD) Reviewed-by: Jie Zhan Acked-by: Viresh Kumar Reviewed-by: Qais Yousef --- drivers/cpufreq/cppc_cpufreq.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 12de0ac7bbaff0..b71946937c5263 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -308,6 +308,16 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) return 0; } +static unsigned int __cppc_cpufreq_get_transition_delay_us(unsigned int cpu) +{ + unsigned int transition_latency_ns = cppc_get_transition_latency(cpu); + + if (transition_latency_ns == CPUFREQ_ETERNAL) + return CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS / NSEC_PER_USEC; + + return transition_latency_ns / NSEC_PER_USEC; +} + /* * The PCC subspace describes the rate at which platform can accept commands * on the shared PCC channel (including READs which do not count towards freq @@ -330,12 +340,12 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) return 10000; } } - return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; + return __cppc_cpufreq_get_transition_delay_us(cpu); } #else static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) { - return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; + return __cppc_cpufreq_get_transition_delay_us(cpu); } #endif From c28a280bd465690981099cd6e43dfcfa5c28b133 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 12:29:50 +0200 Subject: [PATCH 3196/3206] ACPI: CPPC: Do not use CPUFREQ_ETERNAL as an error value Instead of using CPUFREQ_ETERNAL for signaling an error condition in cppc_get_transition_latency(), change the return value type of that function to int and make it return a proper negative error code on failures. No intentional functional impact. Reviewed-by: Mario Limonciello (AMD) Reviewed-by: Jie Zhan Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Reviewed-by: Qais Yousef --- drivers/acpi/cppc_acpi.c | 16 +++++++--------- drivers/cpufreq/amd-pstate.c | 8 ++++---- drivers/cpufreq/cppc_cpufreq.c | 4 ++-- include/acpi/cppc_acpi.h | 6 +++--- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 6b649031808f80..ab4651205e8adb 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1876,7 +1876,7 @@ EXPORT_SYMBOL_GPL(cppc_set_perf); * If desired_reg is in the SystemMemory or SystemIo ACPI address space, * then assume there is no latency. */ -unsigned int cppc_get_transition_latency(int cpu_num) +int cppc_get_transition_latency(int cpu_num) { /* * Expected transition latency is based on the PCCT timing values @@ -1889,31 +1889,29 @@ unsigned int cppc_get_transition_latency(int cpu_num) * completion of a command before issuing the next command, * in microseconds. 
*/ - unsigned int latency_ns = 0; struct cpc_desc *cpc_desc; struct cpc_register_resource *desired_reg; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu_num); struct cppc_pcc_data *pcc_ss_data; + int latency_ns = 0; cpc_desc = per_cpu(cpc_desc_ptr, cpu_num); if (!cpc_desc) - return CPUFREQ_ETERNAL; + return -ENODATA; desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; if (CPC_IN_SYSTEM_MEMORY(desired_reg) || CPC_IN_SYSTEM_IO(desired_reg)) return 0; - else if (!CPC_IN_PCC(desired_reg)) - return CPUFREQ_ETERNAL; - if (pcc_ss_id < 0) - return CPUFREQ_ETERNAL; + if (!CPC_IN_PCC(desired_reg) || pcc_ss_id < 0) + return -ENODATA; pcc_ss_data = pcc_data[pcc_ss_id]; if (pcc_ss_data->pcc_mpar) latency_ns = 60 * (1000 * 1000 * 1000 / pcc_ss_data->pcc_mpar); - latency_ns = max(latency_ns, pcc_ss_data->pcc_nominal * 1000); - latency_ns = max(latency_ns, pcc_ss_data->pcc_mrtt * 1000); + latency_ns = max_t(int, latency_ns, pcc_ss_data->pcc_nominal * 1000); + latency_ns = max_t(int, latency_ns, pcc_ss_data->pcc_mrtt * 1000); return latency_ns; } diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index b4c79fde1979b6..298e92d8cc0315 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -872,10 +872,10 @@ static void amd_pstate_update_limits(struct cpufreq_policy *policy) */ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) { - u32 transition_delay_ns; + int transition_delay_ns; transition_delay_ns = cppc_get_transition_latency(cpu); - if (transition_delay_ns == CPUFREQ_ETERNAL) { + if (transition_delay_ns < 0) { if (cpu_feature_enabled(X86_FEATURE_AMD_FAST_CPPC)) return AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY; else @@ -891,10 +891,10 @@ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) */ static u32 amd_pstate_get_transition_latency(unsigned int cpu) { - u32 transition_latency; + int transition_latency; transition_latency = cppc_get_transition_latency(cpu); - if (transition_latency == CPUFREQ_ETERNAL) + if (transition_latency < 0) return AMD_PSTATE_TRANSITION_LATENCY; return transition_latency; diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index b71946937c5263..e23d9abea13592 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -310,9 +310,9 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) static unsigned int __cppc_cpufreq_get_transition_delay_us(unsigned int cpu) { - unsigned int transition_latency_ns = cppc_get_transition_latency(cpu); + int transition_latency_ns = cppc_get_transition_latency(cpu); - if (transition_latency_ns == CPUFREQ_ETERNAL) + if (transition_latency_ns < 0) return CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS / NSEC_PER_USEC; return transition_latency_ns / NSEC_PER_USEC; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 20f3d62e7a16a3..13fa8150484456 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -160,7 +160,7 @@ extern unsigned int cppc_khz_to_perf(struct cppc_perf_caps *caps, unsigned int f extern bool acpi_cpc_valid(void); extern bool cppc_allow_fast_switch(void); extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); -extern unsigned int cppc_get_transition_latency(int cpu); +extern int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); extern bool cpc_supported_by_cpu(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); @@ -216,9 +216,9 @@ static inline bool cppc_allow_fast_switch(void) { return false; } -static inline 
unsigned int cppc_get_transition_latency(int cpu) +static inline int cppc_get_transition_latency(int cpu) { - return CPUFREQ_ETERNAL; + return -ENODATA; } static inline bool cpc_ffh_supported(void) { From 950c6451a5c38d375993c3b9da427e2e69b01c30 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 12:31:47 +0200 Subject: [PATCH 3197/3206] cpufreq: Drop unused symbol CPUFREQ_ETERNAL Drop CPUFREQ_ETERNAL that has no users any more along with all references to it in the documentation. No functional impact. Reviewed-by: Mario Limonciello (AMD) Reviewed-by: Jie Zhan Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Reviewed-by: Qais Yousef --- Documentation/admin-guide/pm/cpufreq.rst | 4 ---- Documentation/cpu-freq/cpu-drivers.rst | 3 +-- Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst | 3 +-- Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst | 3 +-- include/linux/cpufreq.h | 5 ----- 5 files changed, 3 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index cacb9f0307dd5e..738d7b4dc33af1 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -274,10 +274,6 @@ are the following: The time it takes to switch the CPUs belonging to this policy from one P-state to another, in nanoseconds. - If unknown or if known to be so high that the scaling driver does not - work with the `ondemand`_ governor, -1 (:c:macro:`CPUFREQ_ETERNAL`) - will be returned by reads from this attribute. - ``related_cpus`` List of all (online and offline) CPUs belonging to this policy. diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst index d84ededb66f92a..c5635ac3de5474 100644 --- a/Documentation/cpu-freq/cpu-drivers.rst +++ b/Documentation/cpu-freq/cpu-drivers.rst @@ -109,8 +109,7 @@ Then, the driver must fill in the following values: +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | the time it takes on this CPU to | | | switch between two frequencies in | -| | nanoseconds (if appropriate, else | -| | specify CPUFREQ_ETERNAL) | +| | nanoseconds | +-----------------------------------+--------------------------------------+ |policy->cur | The current operating frequency of | | | this CPU (if appropriate) | diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst index 2ca92042767be4..8238f4c6e4f51a 100644 --- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -112,8 +112,7 @@ CPUfreq核心层注册一个cpufreq_driver结构体。 | | | +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | CPU在两个频率之间切换所需的时间,以 | -| | 纳秒为单位(如不适用,设定为 | -| | CPUFREQ_ETERNAL) | +| | 纳秒为单位 | | | | +-----------------------------------+--------------------------------------+ |policy->cur | 该CPU当前的工作频率(如适用) | diff --git a/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst index add3de2d4523ad..5435c3928d4b3e 100644 --- a/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst @@ -112,8 +112,7 @@ CPUfreq核心層註冊一個cpufreq_driver結構體。 | | | +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | 
CPU在兩個頻率之間切換所需的時間,以 | -| | 納秒爲單位(如不適用,設定爲 | -| | CPUFREQ_ETERNAL) | +| | 納秒爲單位 | | | | +-----------------------------------+--------------------------------------+ |policy->cur | 該CPU當前的工作頻率(如適用) | diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index bc8c083bc16a66..0465d1e6f72ac0 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -26,13 +26,8 @@ *********************************************************************/ /* * Frequency values here are CPU kHz - * - * Maximum transition latency is in nanoseconds - if it's unknown, - * CPUFREQ_ETERNAL shall be used. */ -#define CPUFREQ_ETERNAL (-1) - #define CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS NSEC_PER_MSEC #define CPUFREQ_NAME_LEN 16 From d9f866b2bb3eec38b3734f1fed325ec7c55ccdfa Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Oct 2025 13:26:36 +0300 Subject: [PATCH 3198/3206] ACPI: property: Return present device nodes only on fwnode interface fwnode_graph_get_next_subnode() may return fwnode backed by ACPI device nodes and there has been no check these devices are present in the system, unlike there has been on fwnode OF backend. In order to provide consistent behaviour towards callers, add a check for device presence by introducing a new function acpi_get_next_present_subnode(), used as the get_next_child_node() fwnode operation that also checks device node presence. Signed-off-by: Sakari Ailus Reviewed-by: Laurent Pinchart Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251001102636.1272722-2-sakari.ailus@linux.intel.com [ rjw: Kerneldoc comment and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 54baa23a9e5ae3..36d29135e16416 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1358,6 +1358,28 @@ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, return NULL; } +/* + * acpi_get_next_present_subnode - Return the next present child node handle + * @fwnode: Firmware node to find the next child node for. + * @child: Handle to one of the device's child nodes or a null handle. + * + * Like acpi_get_next_subnode(), but the device nodes returned by + * acpi_get_next_present_subnode() are guaranteed to be present. + * + * Returns: The fwnode handle of the next present sub-node. 
+ */ +static struct fwnode_handle * +acpi_get_next_present_subnode(const struct fwnode_handle *fwnode, + struct fwnode_handle *child) +{ + do { + child = acpi_get_next_subnode(fwnode, child); + } while (is_acpi_device_node(child) && + !acpi_device_is_present(to_acpi_device_node(child))); + + return child; +} + /** * acpi_node_get_parent - Return parent fwnode of this fwnode * @fwnode: Firmware node whose parent to get @@ -1702,7 +1724,7 @@ static int acpi_fwnode_irq_get(const struct fwnode_handle *fwnode, .property_read_string_array = \ acpi_fwnode_property_read_string_array, \ .get_parent = acpi_node_get_parent, \ - .get_next_child_node = acpi_get_next_subnode, \ + .get_next_child_node = acpi_get_next_present_subnode, \ .get_named_child_node = acpi_fwnode_get_named_child_node, \ .get_name = acpi_fwnode_get_name, \ .get_name_prefix = acpi_fwnode_get_name_prefix, \ From e649c3662b4fd9d4e01607c91a0facc7d9005570 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 30 Sep 2025 15:47:11 +0200 Subject: [PATCH 3199/3206] thermal: renesas: Fix RZ/G3E fall-out - Restore sort order in MAINTAINERS and Kconfig, - Remove empty trailing line from Makefile. Fixes: 19d3a401a617c68e ("thermal/drivers/renesas/rzg3e: Add thermal driver for the Renesas RZ/G3E SoC") Signed-off-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 14 +++++++------- drivers/thermal/renesas/Kconfig | 14 +++++++------- drivers/thermal/renesas/Makefile | 1 - 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index caed82f4e950e9..24ad408f21ed59 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21701,13 +21701,6 @@ S: Maintained F: Documentation/devicetree/bindings/iio/potentiometer/renesas,x9250.yaml F: drivers/iio/potentiometer/x9250.c -RENESAS RZ/G3S THERMAL SENSOR UNIT DRIVER -M: Claudiu Beznea -L: linux-pm@vger.kernel.org -S: Maintained -F: Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml -F: drivers/thermal/renesas/rzg3s_thermal.c - RENESAS RZ/G3E THERMAL SENSOR UNIT DRIVER M: John Madieu L: linux-pm@vger.kernel.org @@ -21715,6 +21708,13 @@ S: Maintained F: Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml F: drivers/thermal/renesas/rzg3e_thermal.c +RENESAS RZ/G3S THERMAL SENSOR UNIT DRIVER +M: Claudiu Beznea +L: linux-pm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml +F: drivers/thermal/renesas/rzg3s_thermal.c + RESET CONTROLLER FRAMEWORK M: Philipp Zabel S: Maintained diff --git a/drivers/thermal/renesas/Kconfig b/drivers/thermal/renesas/Kconfig index c762c1c30d5a21..5735c8728a31fc 100644 --- a/drivers/thermal/renesas/Kconfig +++ b/drivers/thermal/renesas/Kconfig @@ -27,6 +27,13 @@ config RZG2L_THERMAL Enable this to plug the RZ/G2L thermal sensor driver into the Linux thermal framework. +config RZG3E_THERMAL + tristate "Renesas RZ/G3E thermal driver" + depends on ARCH_RENESAS || COMPILE_TEST + help + Enable this to plug the RZ/G3E thermal sensor driver into the Linux + thermal framework. + config RZG3S_THERMAL tristate "Renesas RZ/G3S thermal driver" depends on ARCH_R9A08G045 || COMPILE_TEST @@ -34,10 +41,3 @@ config RZG3S_THERMAL help Enable this to plug the RZ/G3S thermal sensor driver into the Linux thermal framework. - -config RZG3E_THERMAL - tristate "Renesas RZ/G3E thermal driver" - depends on ARCH_RENESAS || COMPILE_TEST - help - Enable this to plug the RZ/G3E thermal sensor driver into the Linux - thermal framework. 
diff --git a/drivers/thermal/renesas/Makefile b/drivers/thermal/renesas/Makefile index 0ea59224757226..8f5ae9af277cab 100644 --- a/drivers/thermal/renesas/Makefile +++ b/drivers/thermal/renesas/Makefile @@ -5,4 +5,3 @@ obj-$(CONFIG_RCAR_THERMAL) += rcar_thermal.o obj-$(CONFIG_RZG2L_THERMAL) += rzg2l_thermal.o obj-$(CONFIG_RZG3E_THERMAL) += rzg3e_thermal.o obj-$(CONFIG_RZG3S_THERMAL) += rzg3s_thermal.o - From 59abe7bc7e7c70e9066b3e46874d1b7e6a13de14 Mon Sep 17 00:00:00 2001 From: Ranjani Sridharan Date: Thu, 2 Oct 2025 10:31:25 +0300 Subject: [PATCH 3200/3206] ASoC: SOF: ipc3-topology: Fix multi-core and static pipelines tear down MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the case of static pipelines, freeing the widgets in the pipelines that were not suspended after freeing the scheduler widgets results in errors because the secondary cores are powered off when the scheduler widgets are freed. Fix this by tearing down the leftover pipelines before powering off the secondary cores. Cc: stable@vger.kernel.org Fixes: d7332c4a4f1a ("ASoC: SOF: ipc3-topology: Fix pipeline tear down logic") Signed-off-by: Ranjani Sridharan Reviewed-by: Péter Ujfalusi Reviewed-by: Kai Vehmanen Signed-off-by: Peter Ujfalusi Link: https://patch.msgid.link/20251002073125.32471-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc3-topology.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sound/soc/sof/ipc3-topology.c b/sound/soc/sof/ipc3-topology.c index 473d416bc91064..f449362a2905a3 100644 --- a/sound/soc/sof/ipc3-topology.c +++ b/sound/soc/sof/ipc3-topology.c @@ -2473,11 +2473,6 @@ static int sof_ipc3_tear_down_all_pipelines(struct snd_sof_dev *sdev, bool verif if (ret < 0) return ret; - /* free all the scheduler widgets now */ - ret = sof_ipc3_free_widgets_in_list(sdev, true, &dyn_widgets, verify); - if (ret < 0) - return ret; - /* * Tear down all pipelines associated with PCMs that did not get suspended * and unset the prepare flag so that they can be set up again during resume. @@ -2493,6 +2488,11 @@ static int sof_ipc3_tear_down_all_pipelines(struct snd_sof_dev *sdev, bool verif } } + /* free all the scheduler widgets now. This will also power down the secondary cores */ + ret = sof_ipc3_free_widgets_in_list(sdev, true, &dyn_widgets, verify); + if (ret < 0) + return ret; + list_for_each_entry(sroute, &sdev->route_list, list) sroute->setup = false; From bcd1383516bb5a6f72b2d1e7f7ad42c4a14837d1 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Thu, 2 Oct 2025 10:47:15 +0300 Subject: [PATCH 3201/3206] ASoC: SOF: ipc4-pcm: fix delay calculation when DSP resamples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the sampling rates going in (host) and out (dai) from the DSP are different, the IPC4 delay reporting does not work correctly. Add support for this case by scaling all the raw position values to a common timebase before calculating the real-time delay for the PCM.
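The conversion itself is a simple rate ratio; the only subtlety is ordering the multiply and divide so that large position counters cannot overflow u64. A minimal standalone sketch of the idea, mirroring the helper added in the diff below (not the driver code itself; hardware sampling rates are at most 768000 Hz):

	static u64 frames_dai_to_host(u64 frames, u64 dai_rate, u64 host_rate)
	{
		if (!dai_rate || !host_rate || dai_rate == host_rate)
			return frames;
		if (frames > U32_MAX)	/* divide first so the product stays in u64 range */
			return div64_u64(frames, dai_rate) * host_rate;
		/* small counter: multiply first to keep full precision */
		return div64_u64(frames * host_rate, dai_rate);
	}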
Cc: stable@vger.kernel.org Fixes: 0ea06680dfcb ("ASoC: SOF: ipc4-pcm: Correct the delay calculation") Signed-off-by: Kai Vehmanen Reviewed-by: Péter Ujfalusi Reviewed-by: Bard Liao Signed-off-by: Peter Ujfalusi Link: https://patch.msgid.link/20251002074719.2084-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 83 ++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 21 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index cb9a06792a47cd..769ba4fed56a95 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -19,12 +19,14 @@ * struct sof_ipc4_timestamp_info - IPC4 timestamp info * @host_copier: the host copier of the pcm stream * @dai_copier: the dai copier of the pcm stream - * @stream_start_offset: reported by fw in memory window (converted to frames) - * @stream_end_offset: reported by fw in memory window (converted to frames) + * @stream_start_offset: reported by fw in memory window (converted to + * frames at host_copier sampling rate) + * @stream_end_offset: reported by fw in memory window (converted to + * frames at host_copier sampling rate) * @llp_offset: llp offset in memory window - * @boundary: wrap boundary should be used for the LLP frame counter * @delay: Calculated and stored in pointer callback. The stored value is - * returned in the delay callback. + * returned in the delay callback. Expressed in frames at host copier + * sampling rate. */ struct sof_ipc4_timestamp_info { struct sof_ipc4_copier *host_copier; @@ -33,7 +35,6 @@ struct sof_ipc4_timestamp_info { u64 stream_end_offset; u32 llp_offset; - u64 boundary; snd_pcm_sframes_t delay; }; @@ -48,6 +49,16 @@ struct sof_ipc4_pcm_stream_priv { bool chain_dma_allocated; }; +/* + * Modulus to use to compare host and link position counters. The sampling + * rates may be different, so the raw hardware counters will wrap + * around at different times. To calculate differences, use + * DELAY_BOUNDARY as a common modulus. This value must be smaller than + * the wrap-around point of any hardware counter, and larger than any + * valid delay measurement. 
+ */ +#define DELAY_BOUNDARY U32_MAX + static inline struct sof_ipc4_timestamp_info * sof_ipc4_sps_to_time_info(struct snd_sof_pcm_stream *sps) { @@ -1049,6 +1060,35 @@ static int sof_ipc4_pcm_hw_params(struct snd_soc_component *component, return 0; } +static u64 sof_ipc4_frames_dai_to_host(struct sof_ipc4_timestamp_info *time_info, u64 value) +{ + u64 dai_rate, host_rate; + + if (!time_info->dai_copier || !time_info->host_copier) + return value; + + /* + * copiers do not change sampling rate, so we can use the + * out_format independently of stream direction + */ + dai_rate = time_info->dai_copier->data.out_format.sampling_frequency; + host_rate = time_info->host_copier->data.out_format.sampling_frequency; + + if (!dai_rate || !host_rate || dai_rate == host_rate) + return value; + + /* take care not to overflow u64, rates can be up to 768000 */ + if (value > U32_MAX) { + value = div64_u64(value, dai_rate); + value *= host_rate; + } else { + value *= host_rate; + value = div64_u64(value, dai_rate); + } + + return value; +} + static int sof_ipc4_get_stream_start_offset(struct snd_sof_dev *sdev, struct snd_pcm_substream *substream, struct snd_sof_pcm_stream *sps, @@ -1099,14 +1139,13 @@ static int sof_ipc4_get_stream_start_offset(struct snd_sof_dev *sdev, time_info->stream_end_offset = ppl_reg.stream_end_offset; do_div(time_info->stream_end_offset, dai_sample_size); + /* convert to host frame time */ + time_info->stream_start_offset = + sof_ipc4_frames_dai_to_host(time_info, time_info->stream_start_offset); + time_info->stream_end_offset = + sof_ipc4_frames_dai_to_host(time_info, time_info->stream_end_offset); + out: - /* - * Calculate the wrap boundary need to be used for delay calculation - * The host counter is in bytes, it will wrap earlier than the frames - * based link counter. 
- */ - time_info->boundary = div64_u64(~((u64)0), - frames_to_bytes(substream->runtime, 1)); /* Initialize the delay value to 0 (no delay) */ time_info->delay = 0; @@ -1149,6 +1188,8 @@ static int sof_ipc4_pcm_pointer(struct snd_soc_component *component, /* For delay calculation we need the host counter */ host_cnt = snd_sof_pcm_get_host_byte_counter(sdev, component, substream); + + /* Store the original value to host_ptr */ host_ptr = host_cnt; /* convert the host_cnt to frames */ @@ -1167,6 +1208,8 @@ static int sof_ipc4_pcm_pointer(struct snd_soc_component *component, sof_mailbox_read(sdev, time_info->llp_offset, &llp, sizeof(llp)); dai_cnt = ((u64)llp.reading.llp_u << 32) | llp.reading.llp_l; } + + dai_cnt = sof_ipc4_frames_dai_to_host(time_info, dai_cnt); dai_cnt += time_info->stream_end_offset; /* In two cases dai dma counter is not accurate @@ -1200,8 +1243,9 @@ static int sof_ipc4_pcm_pointer(struct snd_soc_component *component, dai_cnt -= time_info->stream_start_offset; } - /* Wrap the dai counter at the boundary where the host counter wraps */ - div64_u64_rem(dai_cnt, time_info->boundary, &dai_cnt); + /* Convert to a common base before comparisons */ + dai_cnt &= DELAY_BOUNDARY; + host_cnt &= DELAY_BOUNDARY; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { head_cnt = host_cnt; @@ -1211,14 +1255,11 @@ static int sof_ipc4_pcm_pointer(struct snd_soc_component *component, tail_cnt = host_cnt; } - if (head_cnt < tail_cnt) { - time_info->delay = time_info->boundary - tail_cnt + head_cnt; - goto out; - } - - time_info->delay = head_cnt - tail_cnt; + if (unlikely(head_cnt < tail_cnt)) + time_info->delay = DELAY_BOUNDARY - tail_cnt + head_cnt; + else + time_info->delay = head_cnt - tail_cnt; -out: /* * Convert the host byte counter to PCM pointer which wraps in buffer * and it is in frames From bcd1383516bb5a6f72b2d1e7f7ad42c4a14837d1 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Thu, 2 Oct 2025 10:47:16 +0300 Subject: [PATCH 3202/3206] ASoC: SOF: ipc4-pcm: fix start offset calculation for chain DMA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The assumption that the chain DMA module starts the link DMA once 1ms of data is available from the host is not correct. Instead, the firmware chain DMA module fills the link DMA with an initial buffer of zeroes, and the host and link DMAs are started at the same time. This results in a small error in the delay calculation. It can become a more severe problem if the host DMA has delays that exceed 1ms, as a negative delay is then calculated and bogus values are reported to applications. This can confuse some applications like alsa_conformance_test. Fix the issue by correctly calculating the firmware chain DMA preamble size and initializing the start offset to this value.
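With the 2ms fifo size the driver provides (SOF_IPC4_CHAIN_DMA_BUF_SIZE_MS, see the diff below), the new offset works out as follows; a worked example assuming a 48000 Hz stream (the rate is an assumption for illustration):

	int pre_ms = SOF_IPC4_CHAIN_DMA_BUF_SIZE_MS * 5 / 2 + 1;	/* 2 * 5 / 2 + 1 = 6ms */
	u64 start_offset = pre_ms * 48000 / MSEC_PER_SEC;		/* 6 * 48 = 288 frames */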
Cc: stable@vger.kernel.org Fixes: a1d203d390e0 ("ASoC: SOF: ipc4-pcm: Enable delay reporting for ChainDMA streams") Signed-off-by: Kai Vehmanen Reviewed-by: Péter Ujfalusi Reviewed-by: Bard Liao Signed-off-by: Peter Ujfalusi Link: https://patch.msgid.link/20251002074719.2084-3-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 14 ++++++++++---- sound/soc/sof/ipc4-topology.c | 1 - sound/soc/sof/ipc4-topology.h | 2 ++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 769ba4fed56a95..9d29d2e56c0086 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -1108,7 +1108,7 @@ static int sof_ipc4_get_stream_start_offset(struct snd_sof_dev *sdev, return -EINVAL; } else if (host_copier->data.gtw_cfg.node_id == SOF_IPC4_CHAIN_DMA_NODE_ID) { /* - * While the firmware does not supports time_info reporting for + * While the firmware does not support time_info reporting for * streams using ChainDMA, it is granted that ChainDMA can only * be used on Host+Link pairs where the link position is * accessible from the host side. @@ -1116,10 +1116,16 @@ static int sof_ipc4_get_stream_start_offset(struct snd_sof_dev *sdev, * Enable delay calculation in case of ChainDMA via host * accessible registers. * - * The ChainDMA uses 2x 1ms ping-pong buffer, dai side starts - * when 1ms data is available + * The ChainDMA prefills the link DMA with a preamble + * of zero samples. Set the stream start offset based + * on size of the preamble (driver provided fifo size + * multiplied by 2.5). We add 1ms of margin as the FW + * will align the buffer size to DMA hardware + * alignment that is not known to host. */ - time_info->stream_start_offset = substream->runtime->rate / MSEC_PER_SEC; + int pre_ms = SOF_IPC4_CHAIN_DMA_BUF_SIZE_MS * 5 / 2 + 1; + + time_info->stream_start_offset = pre_ms * substream->runtime->rate / MSEC_PER_SEC; goto out; } diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index b6a732d0adb4b7..36568160f16396 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -33,7 +33,6 @@ MODULE_PARM_DESC(ipc4_ignore_cpc, #define SOF_IPC4_GAIN_PARAM_ID 0 #define SOF_IPC4_TPLG_ABI_SIZE 6 -#define SOF_IPC4_CHAIN_DMA_BUF_SIZE_MS 2 static DEFINE_IDA(alh_group_ida); static DEFINE_IDA(pipeline_ida); diff --git a/sound/soc/sof/ipc4-topology.h b/sound/soc/sof/ipc4-topology.h index dfa1a6c2ffa829..6b29692dff165d 100644 --- a/sound/soc/sof/ipc4-topology.h +++ b/sound/soc/sof/ipc4-topology.h @@ -263,6 +263,8 @@ struct sof_ipc4_dma_stream_ch_map { #define SOF_IPC4_DMA_METHOD_HDA 1 #define SOF_IPC4_DMA_METHOD_GPDMA 2 /* defined for consistency but not used */ +#define SOF_IPC4_CHAIN_DMA_BUF_SIZE_MS 2 + /** * struct sof_ipc4_dma_config: DMA configuration * @dma_method: HDAudio or GPDMA From a7fe5ff832d61d9393095bc3dd5f06f4af7da3c1 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 2 Oct 2025 16:57:50 +0300 Subject: [PATCH 3203/3206] ASoC: SOF: ipc4-topology: Correct the minimum host DMA buffer size The firmware has changed the minimum host buffer size from 2 periods to 4 periods (1 period is 1ms) which was missed by the kernel side. Adjust the SOF_IPC4_MIN_DMA_BUFFER_SIZE to 4 ms to align with firmware. 
From 3dcf683bf1062d69014fe81b90d285c7eb85ca8a Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi
Date: Thu, 2 Oct 2025 16:57:51 +0300
Subject: [PATCH 3204/3206] ASoC: SOF: ipc4-topology: Account for different
 ChainDMA host buffer size

For ChainDMA the firmware allocates a 5ms host buffer instead of the
standard 4ms, which should be taken into account when setting the
constraint on the buffer size.

Fixes: 842bb8b62cc6 ("ASoC: SOF: ipc4-topology: Save the DMA maximum burst size for PCMs")
Signed-off-by: Peter Ujfalusi
Reviewed-by: Ranjani Sridharan
Reviewed-by: Kai Vehmanen
Reviewed-by: Bard Liao
Link: https://patch.msgid.link/20251002135752.2430-3-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown
---
 sound/soc/sof/ipc4-topology.c | 9 +++++++--
 sound/soc/sof/ipc4-topology.h | 3 +++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c
index b6a732d0adb4b7..60cd7955e24f1a 100644
--- a/sound/soc/sof/ipc4-topology.c
+++ b/sound/soc/sof/ipc4-topology.c
@@ -666,8 +666,13 @@ static int sof_ipc4_widget_setup_pcm(struct snd_sof_widget *swidget)
 				       swidget->tuples, swidget->num_tuples, sizeof(u32), 1);
 
 		/* Set default DMA buffer size if it is not specified in topology */
-		if (!sps->dsp_max_burst_size_in_ms)
-			sps->dsp_max_burst_size_in_ms = SOF_IPC4_MIN_DMA_BUFFER_SIZE;
+		if (!sps->dsp_max_burst_size_in_ms) {
+			struct snd_sof_widget *pipe_widget = swidget->spipe->pipe_widget;
+			struct sof_ipc4_pipeline *pipeline = pipe_widget->private;
+
+			sps->dsp_max_burst_size_in_ms = pipeline->use_chain_dma ?
+				SOF_IPC4_CHAIN_DMA_BUFFER_SIZE : SOF_IPC4_MIN_DMA_BUFFER_SIZE;
+		}
 	} else {
 		/* Capture data is copied from DSP to host in 1ms bursts */
 		spcm->stream[dir].dsp_max_burst_size_in_ms = 1;
diff --git a/sound/soc/sof/ipc4-topology.h b/sound/soc/sof/ipc4-topology.h
index d6894fdd7e1dc5..fc3b6411b9b2ce 100644
--- a/sound/soc/sof/ipc4-topology.h
+++ b/sound/soc/sof/ipc4-topology.h
@@ -73,6 +73,9 @@
 /* FW requires minimum 4ms DMA buffer size */
 #define SOF_IPC4_MIN_DMA_BUFFER_SIZE	4
 
+/* ChainDMA in fw uses 5ms DMA buffer */
+#define SOF_IPC4_CHAIN_DMA_BUFFER_SIZE	5
+
 /*
  * The base of multi-gateways. Multi-gateways addressing starts from
  * ALH_MULTI_GTW_BASE and there are ALH_MULTI_GTW_COUNT multi-sources
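Editor's note (not part of the commit above): the 5ms value is consistent
with patch 3202, where the ChainDMA preamble is computed as the 2ms fifo
size multiplied by 2.5. A condensed sketch of the default selection the
diff introduces; the helper itself is hypothetical:

	#include <stdbool.h>

	/* Default DSP burst size when the topology does not specify one */
	static unsigned int default_burst_ms(bool use_chain_dma)
	{
		return use_chain_dma ? 5 :	/* SOF_IPC4_CHAIN_DMA_BUFFER_SIZE */
				       4;	/* SOF_IPC4_MIN_DMA_BUFFER_SIZE */
	}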
From 45ad27d9a6f7c620d8bbc80be3bab1faf37dfa0a Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi
Date: Thu, 2 Oct 2025 16:57:52 +0300
Subject: [PATCH 3205/3206] ASoC: SOF: Intel: hda-pcm: Place the constraint on
 period time instead of buffer time

Instead of constraining the ALSA buffer time to be double the firmware
host buffer size, it is better to place the constraint on the period
time. This implicitly constrains the buffer time to a safe value
(num_periods is at least 2) and prevents applications from setting a
smaller period size than what the initial DMA burst will cover.

Fixes: fe76d2e75a6d ("ASoC: SOF: Intel: hda-pcm: Use dsp_max_burst_size_in_ms to place constraint")
Signed-off-by: Peter Ujfalusi
Reviewed-by: Ranjani Sridharan
Reviewed-by: Kai Vehmanen
Reviewed-by: Bard Liao
Link: https://patch.msgid.link/20251002135752.2430-4-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown
---
 sound/soc/sof/intel/hda-pcm.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/sound/soc/sof/intel/hda-pcm.c b/sound/soc/sof/intel/hda-pcm.c
index 1dd8d2092c3b4f..da6c1e7263cde1 100644
--- a/sound/soc/sof/intel/hda-pcm.c
+++ b/sound/soc/sof/intel/hda-pcm.c
@@ -29,6 +29,8 @@
 #define SDnFMT_BITS(x)	((x) << 4)
 #define SDnFMT_CHAN(x)	((x) << 0)
 
+#define HDA_MAX_PERIOD_TIME_HEADROOM	10
+
 static bool hda_always_enable_dmi_l1;
 module_param_named(always_enable_dmi_l1, hda_always_enable_dmi_l1, bool, 0444);
 MODULE_PARM_DESC(always_enable_dmi_l1, "SOF HDA always enable DMI l1");
@@ -291,19 +293,30 @@ int hda_dsp_pcm_open(struct snd_sof_dev *sdev,
 	 * On playback start the DMA will transfer dsp_max_burst_size_in_ms
 	 * amount of data in one initial burst to fill up the host DMA buffer.
 	 * Consequent DMA burst sizes are shorter and their length can vary.
-	 * To make sure that userspace allocate large enough ALSA buffer we need
-	 * to place a constraint on the buffer time.
+	 * To avoid immediate xrun by the initial burst we need to place
+	 * constraint on the period size (via PERIOD_TIME) to cover the size of
+	 * the host buffer.
+	 * We need to add headroom of max 10ms as the firmware needs time to
+	 * settle to the 1ms pacing and initially it can run faster for few
+	 * internal periods.
 	 *
 	 * On capture the DMA will transfer 1ms chunks.
-	 *
-	 * Exact dsp_max_burst_size_in_ms constraint is racy, so set the
-	 * constraint to a minimum of 2x dsp_max_burst_size_in_ms.
 	 */
-	if (spcm->stream[direction].dsp_max_burst_size_in_ms)
+	if (spcm->stream[direction].dsp_max_burst_size_in_ms) {
+		unsigned int period_time = spcm->stream[direction].dsp_max_burst_size_in_ms;
+
+		/*
+		 * add headroom over the maximum burst size to cover the time
+		 * needed for the DMA pace to settle.
+		 * Limit the headroom time to HDA_MAX_PERIOD_TIME_HEADROOM
+		 */
+		period_time += min(period_time, HDA_MAX_PERIOD_TIME_HEADROOM);
+
 		snd_pcm_hw_constraint_minmax(substream->runtime,
-					     SNDRV_PCM_HW_PARAM_BUFFER_TIME,
-					     spcm->stream[direction].dsp_max_burst_size_in_ms * USEC_PER_MSEC * 2,
+					     SNDRV_PCM_HW_PARAM_PERIOD_TIME,
+					     period_time * USEC_PER_MSEC,
 					     UINT_MAX);
+	}
 
 	/* binding pcm substream to hda stream */
 	substream->runtime->private_data = &dsp_stream->hstream;
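Editor's illustration (not part of the commit above): the resulting minimum
period time; the helper name is hypothetical and a ternary stands in for the
kernel's min():

	#define HDA_MAX_PERIOD_TIME_HEADROOM	10	/* ms, from the patch */

	/* Minimum PERIOD_TIME in microseconds for a DSP burst size in ms */
	static unsigned int min_period_time_us(unsigned int burst_ms)
	{
		unsigned int headroom_ms = burst_ms < HDA_MAX_PERIOD_TIME_HEADROOM ?
					   burst_ms : HDA_MAX_PERIOD_TIME_HEADROOM;

		return (burst_ms + headroom_ms) * 1000;
	}

For the standard 4ms host buffer this gives an 8ms minimum period, and for
the 5ms ChainDMA buffer 10ms; with num_periods at least 2, the buffer time
is implicitly constrained to at least twice these values.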
From 349a11abcf1744a0e9c8add7362dd3d0341c1814 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi
Date: Fri, 3 Oct 2025 17:11:55 +0300
Subject: [PATCH 3206/3206] Revert "regmap: Synchronize cache for the page
 selector"

This reverts commit 1fd60ed1700cf25d7ee233580c08fda755f9a61b.

The patch is no longer available upstream.

Signed-off-by: Peter Ujfalusi
---
 drivers/base/regmap/regmap.c | 28 ++++++----------------------
 1 file changed, 6 insertions(+), 22 deletions(-)

diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 16bf590a65bedd..ce9be3989a218d 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -1559,40 +1559,24 @@ static int _regmap_select_page(struct regmap *map, unsigned int *reg,
 			return -EINVAL;
 	}
 
-	/*
-	 * It is possible to have selector register inside data window.
-	 * In that case, selector register is located on every page and it
-	 * needs no page switching, when accessed alone.
-	 *
-	 * Nevertheless we should synchronize the cache values for it.
-	 */
+	/* It is possible to have selector register inside data window.
+	   In that case, selector register is located on every page and
+	   it needs no page switching, when accessed alone. */
 	if (val_num > 1 ||
 	    range->window_start + win_offset != range->selector_reg) {
-		unsigned int page_off = win_page * range->window_len;
-		unsigned int sel_offset = range->selector_reg - range->window_start;
-		unsigned int sel_register = range->range_min + page_off + sel_offset;
-		unsigned int val = win_page << range->selector_shift;
-		unsigned int mask = range->selector_mask;
-
 		/* Use separate work_buf during page switching */
 		orig_work_buf = map->work_buf;
 		map->work_buf = map->selector_work_buf;
 
-		ret = _regmap_update_bits(map, range->selector_reg, mask, val,
+		ret = _regmap_update_bits(map, range->selector_reg,
+					  range->selector_mask,
+					  win_page << range->selector_shift,
 					  &page_chg, false);
 
 		map->work_buf = orig_work_buf;
 
 		if (ret != 0)
 			return ret;
-
-		/*
-		 * If selector register has been just updated, update the respective
-		 * virtual copy as well.
-		 */
-		if (page_chg &&
-		    in_range(range->selector_reg, range->window_start, range->window_len))
-			_regmap_update_bits(map, sel_register, mask, val, NULL, false);
 	}
 
 	*reg = range->window_start + win_offset;
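Editor's illustration (not part of the revert above): the restored behaviour
of _regmap_select_page() in condensed form; the helper and its parameter
list are assumptions distilled from the diff, not regmap API:

	#include <stdbool.h>

	/* A page switch is needed unless the single register being
	 * accessed is the selector itself, which exists on every page. */
	static bool need_page_switch(unsigned int val_num,
				     unsigned int window_start,
				     unsigned int win_offset,
				     unsigned int selector_reg)
	{
		return val_num > 1 || window_start + win_offset != selector_reg;
	}

When a switch is needed, the selector register is updated to
win_page << selector_shift under selector_mask; after the revert, the cache
entry for a selector that lives inside the data window is no longer
synchronized with that write.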